diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv.cpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv.cpp new file mode 100644 index 0000000000000..fc62182ffe7a8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv.cpp @@ -0,0 +1,241 @@ +#include +#include +#include +#include + +#include +#include + +#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +#define DISPATCH_FLOAT_AND_HALF_AND_BF16(INTYPE, OUTTYPE, NAME, ...) \ + if (INTYPE == at::ScalarType::Half) { \ + using input_t = at::Half; \ + using output_t = at::Half; \ + __VA_ARGS__(); \ + } else if (INTYPE == at::ScalarType::BFloat16) { \ + using input_t = at::BFloat16; \ + using output_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if ((INTYPE == at::ScalarType::Float) && (OUTTYPE == at::ScalarType::Float)) { \ + using input_t = float; \ + using output_t = float; \ + __VA_ARGS__(); \ + } else if ((INTYPE == at::ScalarType::Float) && (OUTTYPE == at::ScalarType::Half)) { \ + using input_t = float; \ + using output_t = at::Half; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for in-type '", toString(INTYPE), "' and out-type '", toString(OUTTYPE), "'"); \ + } + +template +void fftconv_fwd_cuda_dispatch( + const input_t *u, const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, output_t *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, bool output_hbl_layout, bool fftfp16); + +template +void fftconv_bwd_cuda_dispatch( + const output_t *dout, + const input_t *u, const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, + 
input_t *du, c10::complex *dfilter, float *dD, + float *dv, input_t *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +torch::Tensor fftconv_fwd(torch::Tensor u, torch::Tensor filter, + torch::Tensor D, + c10::optional v, int head_dim, + c10::optional q, + c10::optional dropout_mask, + bool gelu, bool gelu_inp, bool gelu_q, int fft_size, + bool force_fp16_output, bool output_hbl_layout, + bool fftfp16 + ) { + CHECK_DEVICE(u); + CHECK_DEVICE(filter); + CHECK_DEVICE(D); + + TORCH_CHECK(u.stride(-1) == 1); + TORCH_CHECK(filter.is_contiguous()); + TORCH_CHECK(D.is_contiguous()); + + const int batch_size = u.size(0); + const int H = u.size(1); + const int L = u.size(2); + CHECK_SHAPE(u, batch_size, H, L); + CHECK_SHAPE(filter, H / head_dim, fft_size / 2 + 1); + CHECK_SHAPE(D, H / head_dim); + + TORCH_CHECK(u.dtype() == torch::kFloat16 || u.dtype() == torch::kFloat32 || u.dtype() == torch::kBFloat16); + // TODO: check filter.dtype is complex64 (no complex32) + TORCH_CHECK(D.dtype() == torch::kFloat32); + + if (dropout_mask.has_value()) { + auto dropout_mask_value = dropout_mask.value(); + CHECK_DEVICE(dropout_mask_value); + CHECK_SHAPE(dropout_mask_value, batch_size, H); + TORCH_CHECK(dropout_mask_value.dtype() == torch::kFloat32); + } + if (v.has_value()) { + auto v_value = v.value(); + CHECK_DEVICE(v_value); + CHECK_SHAPE(v_value, batch_size, H, L); + TORCH_CHECK(v_value.stride(-1) == 1); + TORCH_CHECK(v_value.stride(0) == u.stride(0) && v_value.stride(1) == u.stride(1)); + TORCH_CHECK(v_value.dtype() == u.dtype()); + } + if (q.has_value()) { + auto q_value = q.value(); + CHECK_DEVICE(q_value); + CHECK_SHAPE(q_value, batch_size, H, L); + TORCH_CHECK(q_value.stride(-1) == 1); + TORCH_CHECK(q_value.stride(0) == u.stride(0) && q_value.stride(1) == u.stride(1)); + TORCH_CHECK(q_value.dtype() == u.dtype()); + } + + TORCH_CHECK((!gelu_inp) && 
(!gelu_q)); + TORCH_CHECK((H % head_dim) == 0); + TORCH_CHECK(!fftfp16 || head_dim == 8); // fp16 only suported for head dim 8 + + auto opts = u.options(); + at::ScalarType u_dtype = ::detail::scalar_type(u.scalar_type()); + if (u.dtype() == at::ScalarType::BFloat16) { force_fp16_output = false; } + auto out = !output_hbl_layout + ? torch::empty({batch_size, H, L}, opts.dtype(force_fp16_output ? torch::kFloat16 : u_dtype)) + : torch::empty({H, batch_size, L}, opts.dtype(force_fp16_output ? torch::kFloat16 : u_dtype)).permute({1, 0, 2}); + TORCH_CHECK((L <= fft_size / 2) && (L % 2 == 0)); + TORCH_CHECK(fft_size >= 16 && fft_size <= 16384 && (fft_size == 1 << int(log2(float(fft_size))))); + + size_t batch_stride = u.stride(0), H_stride = u.stride(1); + DISPATCH_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), out.scalar_type(), "fftconv_fwd", [&] { + fftconv_fwd_cuda_dispatch( + static_cast(u.data_ptr()), + static_cast *>(filter.data_ptr()), + v.has_value() ? static_cast(v.value().data_ptr()) : nullptr, + head_dim, + q.has_value() ? static_cast(q.value().data_ptr()) : nullptr, + static_cast(D.data_ptr()), + dropout_mask.has_value() ? 
static_cast(dropout_mask.value().data_ptr()) : nullptr, + static_cast(out.data_ptr()), + gelu, gelu_inp, gelu_q, batch_size, H, L, batch_stride, H_stride, fft_size, + output_hbl_layout, fftfp16); + }); + return out; +} + +std::tuple +fftconv_bwd(torch::Tensor dout, + torch::Tensor u, + torch::Tensor filter, + torch::Tensor D, + c10::optional v, int head_dim, + c10::optional q, + c10::optional dropout_mask, + bool gelu, bool gelu_inp, bool gelu_q, int fft_size, + bool output_hbl_layout, bool fftfp16) { + CHECK_DEVICE(dout); + CHECK_DEVICE(u); + CHECK_DEVICE(filter); + CHECK_DEVICE(D); + + TORCH_CHECK(u.stride(-1) == 1); + TORCH_CHECK(filter.is_contiguous()); + TORCH_CHECK(D.is_contiguous()); + + const int batch_size = u.size(0); + const int H = u.size(1); + const int L = u.size(2); + CHECK_SHAPE(dout, batch_size, H, L); + CHECK_SHAPE(u, batch_size, H, L); + CHECK_SHAPE(filter, H / head_dim, fft_size / 2 + 1); + CHECK_SHAPE(D, H / head_dim); + if (!output_hbl_layout) { + TORCH_CHECK(dout.is_contiguous()); + } else { + // Previously we were checking + // TORCH_CHECK(dout.stride(1) == batch_size * L && dout.stride(0) == L) + // but this fails for the edge case of batch_size=1, where shape (H, 1, L) + // is already contiguous, and dout.stride(0) = L * H in that case. 
+ TORCH_CHECK(dout.permute({1, 0, 2}).is_contiguous()); + } + + TORCH_CHECK(dout.dtype() == torch::kFloat16 || dout.dtype() == torch::kFloat32 || dout.dtype() == torch::kBFloat16); + TORCH_CHECK(u.dtype() == torch::kFloat16 || u.dtype() == torch::kFloat32 || u.dtype() == torch::kBFloat16); + TORCH_CHECK(D.dtype() == torch::kFloat32); + + auto opts = u.options(); + + torch::Tensor dv; + torch::Tensor dq; + + if (dropout_mask.has_value()) { + auto dropout_mask_value = dropout_mask.value(); + CHECK_DEVICE(dropout_mask_value); + CHECK_SHAPE(dropout_mask_value, batch_size, H); + TORCH_CHECK(dropout_mask_value.dtype() == torch::kFloat32); + } + if (v.has_value()) { + auto v_value = v.value(); + CHECK_DEVICE(v_value); + CHECK_SHAPE(v_value, batch_size, H, L); + TORCH_CHECK(v_value.stride(-1) == 1); + TORCH_CHECK(v_value.stride(0) == u.stride(0) && v_value.stride(1) == u.stride(1)); + TORCH_CHECK(v_value.dtype() == u.dtype()); + dv = torch::zeros_like(v_value, opts.dtype(torch::kFloat)); + } + if (q.has_value()) { + auto q_value = q.value(); + CHECK_DEVICE(q_value); + CHECK_SHAPE(q_value, batch_size, H, L); + TORCH_CHECK(q_value.stride(-1) == 1); + TORCH_CHECK(q_value.stride(0) == u.stride(0) && q_value.stride(1) == u.stride(1)); + TORCH_CHECK(q_value.dtype() == u.dtype()); + dq = torch::empty_like(q_value); + } + + TORCH_CHECK((!gelu_inp) && (!gelu_q)); + TORCH_CHECK((H % head_dim) == 0); + TORCH_CHECK(!fftfp16 || head_dim == 8); // fp16 only suported for head dim 8 + + auto du = torch::empty_like(u); + auto dfilter = torch::empty({batch_size, H / head_dim, head_dim, fft_size / 2 + 1}, opts.dtype(filter.dtype())); + auto dD = torch::empty({batch_size, H / head_dim, head_dim}, opts.dtype(torch::kFloat)); + + TORCH_CHECK((L <= fft_size / 2) && (L % 2 == 0)); + TORCH_CHECK(fft_size >= 16 && fft_size <= 16384 && (fft_size == 1 << int(log2(float(fft_size))))); + + size_t batch_stride = u.stride(0), H_stride = u.stride(1); + DISPATCH_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), 
dout.scalar_type(), "fftconv_bwd", [&] { + fftconv_bwd_cuda_dispatch( + static_cast(dout.data_ptr()), + static_cast(u.data_ptr()), + static_cast *>(filter.data_ptr()), + v.has_value() ? static_cast(v.value().data_ptr()) : nullptr, + head_dim, + q.has_value() ? static_cast(q.value().data_ptr()) : nullptr, + static_cast(D.data_ptr()), + dropout_mask.has_value() ? static_cast(dropout_mask.value().data_ptr()) : nullptr, + static_cast(du.data_ptr()), + static_cast *>(dfilter.data_ptr()), + static_cast(dD.data_ptr()), + v.has_value() ? static_cast(dv.data_ptr()) : nullptr, + q.has_value() ? static_cast(dq.data_ptr()) : nullptr, + gelu, gelu_inp, gelu_q, batch_size, H, L, batch_stride, H_stride, fft_size, + output_hbl_layout, fftfp16); + }); + + return std::make_tuple(du, dfilter.sum(/*dim=*/std::vector{0, 2}), dD.sum(/*dim=*/std::vector{0, 2}), dv, dq); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("fftconv_fwd", &fftconv_fwd, "Convolution with FFT"); + m.def("fftconv_bwd", &fftconv_bwd, "Convolution with FFT, backward"); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv_cuda.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv_cuda.cu new file mode 100644 index 0000000000000..4213c4bcba32f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/fftconv_cuda.cu @@ -0,0 +1,1981 @@ +// Copyright (c) 2022 Tri Dao, Dan Fu + +#include + +#include +#include + +#include + +#include +#include +#include + +#include // For C10_CUDA_KERNEL_LAUNCH_CHECK + +#include "static_switch.h" +#include "twiddle.cuh" + +// *************** FOR ERROR CHECKING ******************* +#ifndef CUDA_RT_CALL +#define CUDA_RT_CALL( call ) \ + { \ + auto status = static_cast( call ); \ + if ( status != cudaSuccess ) \ + fprintf( stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, \ + __LINE__, \ + __FILE__, \ + cudaGetErrorString( status ), \ + status ); \ + } +#endif // 
CUDA_RT_CALL +// *************** FOR ERROR CHECKING ******************* + +template +inline __device__ void gelu(float (&output)[N], const float (&input)[N]) { + constexpr float kAlpha = M_SQRT1_2; + #pragma unroll + for (int i = 0; i < N; ++i) { + output[i] = input[i] * 0.5 * (1 + erff(input[i] * kAlpha)); + } +} + +template +inline __device__ void dgelu(float (&grad_input)[N], const float (&grad_output)[N], const float (&input)[N]) { + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + constexpr float kAlpha = M_SQRT1_2; + #pragma unroll + for (int i = 0; i < N; ++i) { + const float cdf = 0.5 * (1 + erff(input[i] * kAlpha)); + const float pdf = expf(-0.5 * input[i] * input[i]) * kBeta; + grad_input[i] = grad_output[i] * (cdf + input[i] * pdf); + } +} + +// GeLU(input0) * input1 +template +inline __device__ void geglu(float (&output)[N], const float (&input0)[N], const float (&input1)[N]) { + constexpr float kAlpha = M_SQRT1_2; + #pragma unroll + for (int i = 0; i < N; ++i) { + output[i] = input1 * (input0[i] * 0.5 * (1 + erff(input0[i] * kAlpha))); + } +} + +template +inline __device__ void dgeglu(float (&grad_input0)[N], float (&grad_input1)[N], + const float (&grad_output)[N], const float (&input0)[N], const float (&input1)[N]) { + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + constexpr float kAlpha = M_SQRT1_2; + #pragma unroll + for (int i = 0; i < N; ++i) { + const float cdf = 0.5 * (1 + erff(input0[i] * kAlpha)); + const float pdf = expf(-0.5 * input0[i] * input0[i]) * kBeta; + grad_input0[i] = grad_output[i] * input1[i] * (cdf + input0[i] * pdf); + grad_input1[i] = grad_output[i] * input0[i] * cdf; + } +} + +template +__device__ c10::complex pointwise_mul(const c10::complex a, const c10::complex b) { + return c10::complex(a.real_ * b.real_, a.imag_ * b.imag_); +} + + +inline __device__ void read_rrii(cufftdx::detail::complex<__half2> val, c10::complex result [2]) { + using cfloat_t = c10::complex; + result[0] = 
cfloat_t(__half2float(val.x.x), __half2float(val.y.x)); + result[1] = cfloat_t(__half2float(val.x.y), __half2float(val.y.y)); +} + +inline __device__ cufftdx::detail::complex<__half2> write_rrii(c10::complex val [2]) { + using complex_t = typename cufftdx::detail::complex<__half2>; + return complex_t { + __float22half2_rn(float2 {val[0].real(), val[1].real()}), + __float22half2_rn(float2 {val[0].imag(), val[1].imag()}), + }; +} + +// Implement a real FFT of size 2 * N by calling a complex FFT of size N. +// http://www.robinscheibler.org/2013/02/13/real-fft.html +template +inline __device__ void rfft(c10::complex (&thread_data)[FFT::elements_per_thread], + c10::complex *shared_mem){ + using cfloat_t = typename c10::complex; + using complex_t = typename cufftdx::detail::complex; + constexpr int N = cufftdx::size_of::value; + constexpr int EPT = FFT::elements_per_thread; + + complex_t *smem_c = reinterpret_cast(shared_mem); + complex_t (&thread_data_fft)[EPT] = reinterpret_cast(thread_data); + FFT().execute(thread_data_fft, smem_c); + __syncthreads(); + #pragma unroll + for (int i = 0; i < EPT; ++i) { + smem_c[threadIdx.x + FFT::stride * i] = thread_data_fft[i]; + } + __syncthreads(); + #pragma unroll + for (int i = 0; i < EPT; ++i) { + if ((threadIdx.x == 0) && (i == 0)) { + cfloat_t smem_val = shared_mem[0]; + thread_data[i] = cfloat_t(smem_val.real_ + smem_val.imag_, smem_val.real_ - smem_val.imag_); + } else { + int index = threadIdx.x + FFT::stride * i; + cfloat_t smem_val_0 = shared_mem[index], smem_val_1 = shared_mem[N - index]; + cfloat_t X_even = smem_val_0 + std::conj(smem_val_1); + // constexpr cfloat_t j = cfloat_t(0.f, 1.f); + // cfloat_t X_odd = -j * (smem_val_0 - std::conj(smem_val_1)); + // Algebraic simplification + cfloat_t X_odd = cfloat_t(smem_val_0.imag_ + smem_val_1.imag_, -smem_val_0.real_ + smem_val_1.real_); + // cfloat_t twiddle; + // sincospif(-float(index) / N, reinterpret_cast(&twiddle) + 1, + // reinterpret_cast(&twiddle)); + // Reading 
from lookup table is faster than computing the twiddle + int quadrant = i / (EPT / 4); + cfloat_t twiddle = twiddle_from_lut(quadrant, index); + thread_data[i] = (X_even + X_odd * twiddle) / 2; + } + } +} + +// Implement a conjugate symmetric inverse FFT of size 2 * N by calling a complex iFFT of size N. +// http://www.robinscheibler.org/2013/02/13/real-fft.html +template +inline __device__ void irfft(c10::complex (&thread_data)[IFFT::elements_per_thread], + c10::complex *shared_mem){ + using cfloat_t = typename c10::complex; + using complex_t = typename cufftdx::detail::complex; + constexpr int N = cufftdx::size_of::value; + constexpr int EPT = IFFT::elements_per_thread; + + #pragma unroll + for (int i = 0; i < EPT; ++i) { + shared_mem[threadIdx.x + IFFT::stride * i] = thread_data[i]; + } + __syncthreads(); + #pragma unroll + for (int i = 0; i < EPT; ++i) { + if ((threadIdx.x == 0) && (i == 0)) { + cfloat_t smem_val = shared_mem[0]; + thread_data[i] = cfloat_t(smem_val.real_ + smem_val.imag_, smem_val.real_ - smem_val.imag_); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("%f.4f+%.4fi, ", thread_data[i].real_, thread_data[i].imag_); + // } + } else { + int index = threadIdx.x + IFFT::stride * i; + cfloat_t smem_val_0 = shared_mem[index], smem_val_1 = shared_mem[N - index]; + cfloat_t X_even = smem_val_0 + std::conj(smem_val_1); + // cfloat_t twiddle; + // sincospif(float(index) / N, reinterpret_cast(&twiddle) + 1, + // reinterpret_cast(&twiddle));; + // Reading from lookup table is faster than computing the twiddle + int quadrant = i / (EPT / 4); + cfloat_t twiddle = std::conj(twiddle_from_lut(quadrant, index)); + // cfloat_t X_odd = (smem_val_0 - std::conj(smem_val_1)) * twiddle; + // constexpr cfloat_t j = cfloat_t(0.f, 1.f); + // thread_data[i] = (X_even + j * X_odd) / 2; + // Algebraic simplification + cfloat_t X_odd_j = cfloat_t(-smem_val_0.imag_ - smem_val_1.imag_, smem_val_0.real_ - smem_val_1.real_) * twiddle; + 
thread_data[i] = X_even + X_odd_j; + } + } + __syncthreads(); + IFFT().execute(reinterpret_cast(thread_data), + reinterpret_cast(shared_mem)); +} + +// // Implement a real FFT of size 2 * N by calling a complex FFT of size N. +// // http://www.robinscheibler.org/2013/02/13/real-fft.html +// template +// inline __device__ void rfftfp16(cufftdx::detail::complex<__half2> (&thread_data)[FFT::elements_per_thread], cufftdx::detail::complex<__half2> *shared_mem){ +// using cfloat_t = typename c10::complex; +// using complex_t = typename cufftdx::detail::complex<__half2>; +// constexpr int N = cufftdx::size_of::value; +// constexpr int EPT = FFT::elements_per_thread; + +// // complex_t *smem_c = reinterpret_cast(shared_mem); +// // complex_t (&thread_data_fft)[EPT] = reinterpret_cast(thread_data); +// FFT().execute(thread_data, shared_mem); +// __syncthreads(); +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// shared_mem[threadIdx.x + FFT::stride * i] = thread_data[i]; +// } +// __syncthreads(); +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// if ((threadIdx.x == 0) && (i == 0)) { +// complex_t smem_val_half = shared_mem[0]; +// cfloat_t smem_val[2]; +// read_rrii(smem_val_half, smem_val); +// // thread_data[i] = cfloat_t(smem_val.real_ + smem_val.imag_, smem_val.real_ - smem_val.imag_); +// cfloat_t res[2] = { +// cfloat_t(smem_val[0].real() + smem_val[0].imag(), smem_val[0].real() - smem_val[0].imag()), +// cfloat_t(smem_val[1].real() + smem_val[1].imag(), smem_val[1].real() - smem_val[1].imag()) +// }; +// thread_data[i] = write_rrii(res); +// } else { +// int index = threadIdx.x + FFT::stride * i; +// // cfloat_t smem_val_0 = shared_mem[index], smem_val_1 = shared_mem[N - index]; +// complex_t smem_val_0_half = shared_mem[index], smem_val_1_half = shared_mem[N - index]; +// cfloat_t smem_val_0[2], smem_val_1[2]; +// read_rrii(smem_val_0_half, smem_val_0); +// read_rrii(smem_val_1_half, smem_val_1); + +// // cfloat_t X_even = smem_val_0 + 
std::conj(smem_val_1); +// cfloat_t X_even[2] = { +// smem_val_0[0] + std::conj(smem_val_1[0]), +// smem_val_0[1] + std::conj(smem_val_1[1]) +// }; + +// // constexpr cfloat_t j = cfloat_t(0.f, 1.f); +// // cfloat_t X_odd = -j * (smem_val_0 - std::conj(smem_val_1)); +// // Algebraic simplification +// // cfloat_t X_odd = cfloat_t(smem_val_0.imag_ + smem_val_1.imag_, -smem_val_0.real_ + smem_val_1.real_); +// cfloat_t X_odd[2] = { +// cfloat_t(smem_val_0[0].imag() + smem_val_1[0].imag(), -smem_val_0[0].real() + smem_val_1[0].real()), +// cfloat_t(smem_val_0[1].imag() + smem_val_1[1].imag(), -smem_val_0[1].real() + smem_val_1[1].real()) +// }; + +// // cfloat_t twiddle; +// // sincospif(-float(index) / N, reinterpret_cast(&twiddle) + 1, +// // reinterpret_cast(&twiddle)); +// // Reading from lookup table is faster than computing the twiddle +// int quadrant = i / (EPT / 4); +// cfloat_t twiddle = twiddle_from_lut(quadrant, index); + +// // thread_data[i] = (X_even + X_odd * twiddle) / 2; +// cfloat_t result[2] = { +// (X_even[0] + X_odd[0] * twiddle) / 2, +// (X_even[1] + X_odd[1] * twiddle) / 2 +// }; +// thread_data[i] = write_rrii(result); +// } +// } +// } + +// // Implement a conjugate symmetric inverse FFT of size 2 * N by calling a complex iFFT of size N. 
+// // http://www.robinscheibler.org/2013/02/13/real-fft.html +// template +// inline __device__ void irfftfp16(cufftdx::detail::complex<__half2> (&thread_data)[IFFT::elements_per_thread], +// cufftdx::detail::complex<__half2> *shared_mem){ +// using cfloat_t = typename c10::complex; +// using complex_t = typename cufftdx::detail::complex<__half2>; +// constexpr int N = cufftdx::size_of::value; +// constexpr int EPT = IFFT::elements_per_thread; + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// shared_mem[threadIdx.x + IFFT::stride * i] = thread_data[i]; +// } +// __syncthreads(); +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// if ((threadIdx.x == 0) && (i == 0)) { +// complex_t smem_val_half = shared_mem[0]; +// cfloat_t smem_val[2]; +// read_rrii(smem_val_half, smem_val); +// // thread_data[i] = cfloat_t(smem_val.real_ + smem_val.imag_, smem_val.real_ - smem_val.imag_); +// cfloat_t res[2] = { +// cfloat_t(smem_val[0].real() + smem_val[0].imag(), smem_val[0].real() - smem_val[0].imag()), +// cfloat_t(smem_val[1].real() + smem_val[1].imag(), smem_val[1].real() - smem_val[1].imag()) +// }; +// thread_data[i] = write_rrii(res); +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// // printf("%f.4f+%.4fi, ", thread_data[i].real_, thread_data[i].imag_); +// // } +// } else { +// int index = threadIdx.x + IFFT::stride * i; +// // cfloat_t smem_val_0 = shared_mem[index], smem_val_1 = shared_mem[N - index]; +// complex_t smem_val_0_half = shared_mem[index], smem_val_1_half = shared_mem[N - index]; +// cfloat_t smem_val_0[2], smem_val_1[2]; +// read_rrii(smem_val_0_half, smem_val_0); +// read_rrii(smem_val_1_half, smem_val_1); + +// // cfloat_t X_even = smem_val_0 + std::conj(smem_val_1); +// cfloat_t X_even[2] = { +// smem_val_0[0] + std::conj(smem_val_1[0]), +// smem_val_0[1] + std::conj(smem_val_1[1]) +// }; + +// // cfloat_t twiddle; +// // sincospif(float(index) / N, reinterpret_cast(&twiddle) + 1, +// // 
reinterpret_cast(&twiddle));; +// // Reading from lookup table is faster than computing the twiddle +// int quadrant = i / (EPT / 4); +// cfloat_t twiddle = std::conj(twiddle_from_lut(quadrant, index)); + +// // cfloat_t X_odd = (smem_val_0 - std::conj(smem_val_1)) * twiddle; +// // constexpr cfloat_t j = cfloat_t(0.f, 1.f); +// // thread_data[i] = (X_even + j * X_odd) / 2; +// // Algebraic simplification +// // cfloat_t X_odd_j = cfloat_t(-smem_val_0.imag_ - smem_val_1.imag_, smem_val_0.real_ - smem_val_1.real_) * twiddle; +// cfloat_t X_odd_j[2] = { +// cfloat_t(-smem_val_0[0].imag() - smem_val_1[0].imag(), smem_val_0[0].real() - smem_val_1[0].real()) * twiddle, +// cfloat_t(-smem_val_0[1].imag() - smem_val_1[1].imag(), smem_val_0[1].real() - smem_val_1[1].real()) * twiddle +// }; + +// // thread_data[i] = X_even + X_odd_j; +// cfloat_t res[2] = { +// (X_even[0] + X_odd_j[0]), +// (X_even[1] + X_odd_j[1]) +// }; +// thread_data[i] = write_rrii(res); +// } +// } +// __syncthreads(); +// IFFT().execute(reinterpret_cast(thread_data), +// reinterpret_cast(shared_mem)); +// } + +template +__launch_bounds__( FFT::max_threads_per_block ) +__global__ void fftconv_fwd_kernel(const input_t *__restrict__ inputData, + const c10::complex *__restrict__ filterData, + const input_t *__restrict__ inputMulVData, + const input_t *__restrict__ inputMulQData, + const float *__restrict__ DData, + const float *__restrict__ dropmaskData, + output_t *__restrict__ outputData, + int batch_size, + int H, + int signal_size, + size_t batch_stride, size_t H_stride, + bool output_hbl_layout) { + + using complex_t = typename cufftdx::detail::complex; + using cfloat_t = typename c10::complex; + constexpr int N = cufftdx::size_of::value; + constexpr int EPT = FFT::elements_per_thread; + static_assert(FFT::storage_size == EPT); + static_assert(IFFT::storage_size == EPT); + + using BlockLoad_input = cub::BlockLoad; + using BlockLoad_filter = cub::BlockLoad; + using BlockStore_output = 
cub::BlockStore, FFT::block_dim.x, EPT / 2, cub::BLOCK_STORE_STRIPED>; + + extern __shared__ cfloat_t shared_mem[]; + + float result_data[EPT] = { 0 }; + + cfloat_t filter_data[EPT]; + // Adjust for head dim + unsigned int filter_id = blockIdx.y; + BlockLoad_filter().Load(filterData + filter_id * (N + 1), filter_data); + // CHECK THIS!!! + if (threadIdx.x == 0) { + filter_data[0].imag_ = *(reinterpret_cast(filterData + filter_id * (N + 1) + N)); + } + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { filter_data[i] /= 2 * N; } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // for (int i = 0; i < FFT::storage_size / 2; i++) { + // printf("%.4f+%.4fi, ", filter_data[i].real_, filter_data[i].imag_); + // } + // printf("\n"); + // } + + // CHECK THIS!!! + float D_val = DData[filter_id]; + unsigned int dropmask_id = blockIdx.x * H + blockIdx.y; + float dropmask_val = dropmaskData == nullptr ? 1.f : dropmaskData[dropmask_id]; + + float v_data[EPT]; + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + // Used for inputMulVData only + size_t v_offset = blockIdx.x * batch_stride + (blockIdx.y * HEADDIM + blockIdx.z) * H_stride; + if (QV) { + BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + v_offset), + reinterpret_cast(v_data), + signal_size / 2, cfloat_t(0.f)); + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("v_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", v_data[i]); + // } + // printf("\n"); + // } + } + + // Doesn't seem to matter if we put #pragma unroll + // #pragma unroll + for (int head_i = 0; head_i < HEADDIM; head_i++) { + // Local array and copy data into it + float u_og_data[EPT]; + cfloat_t thread_data[EPT]; + + // Id for inputData and inputMulQData + size_t u_offset = blockIdx.x * batch_stride + (blockIdx.y * HEADDIM + head_i) * H_stride; + + BlockLoad_input().Load(reinterpret_cast *>(inputData + u_offset), + reinterpret_cast(u_og_data), + 
signal_size / 2, cfloat_t(0.f)); + // TODO: what if signal_size is odd + if (GELU_INPUT) { gelu(u_og_data, u_og_data); } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("u_og_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", u_og_data[i]); + // } + // printf("\n"); + // } + + if (QV) { + #pragma unroll + for (int i = 0; i < EPT; ++i) { + u_og_data[i] *= v_data[i]; + } + } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("u_og_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", u_og_data[i]); + // } + // printf("\n"); + // } + + #pragma unroll + for (int i = 0; i < EPT; ++i) { + thread_data[i] = i < EPT / 2 ? cfloat_t(u_og_data[i * 2], u_og_data[i * 2 + 1]) : cfloat_t(0.f); + } + + if (head_i > 0) { __syncthreads(); } + // Execute FFT + rfft(thread_data, shared_mem); + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + thread_data[i] = (threadIdx.x == 0) && (i == 0) ? + pointwise_mul(thread_data[i], filter_data[i]) : thread_data[i] * filter_data[i]; + } + + // Execute FFT + __syncthreads(); + irfft(thread_data, shared_mem); + + float out_data[EPT] {}; + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + out_data[i] = reinterpret_cast(thread_data)[i] + u_og_data[i] * D_val; + } + + // GELU_OUTPUT and dropout + // https://github.com/pytorch/pytorch/blob/dc169d53aa266560750ea25ee0cf31c7e614550d/aten/src/ATen/native/cuda/Activation.cu#L395 + if (GELU_OUTPUT) { gelu(out_data, out_data); } + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + out_data[i] *= dropmask_val; + } + + float q_data[EPT]; + + if (QV) { + BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + u_offset), + reinterpret_cast(q_data), + signal_size / 2, cfloat_t(0.f)); + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("q_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", q_data[i]); + // } + // printf("\n"); + // } + + if 
(GELU_Q) { gelu(q_data, q_data); } + #pragma unroll + for (int i = 0; i < EPT; ++i) { + out_data[i] *= q_data[i]; + } + } + + #pragma unroll + for (int i = 0; i < EPT; ++i) { + result_data[i] += out_data[i]; + } + } + + // Save results + c10::complex write_data[EPT / 2]; + #pragma unroll + for (int i = 0; i < EPT / 2; ++i) { + write_data[i] = c10::complex(output_t(result_data[i * 2]), output_t(result_data[i * 2 + 1])); + } + unsigned int output_fft_id = !output_hbl_layout ? blockIdx.x * H + blockIdx.y * HEADDIM + blockIdx.z : blockIdx.x + (blockIdx.y * HEADDIM + blockIdx.z) * batch_size; + BlockStore_output().Store(reinterpret_cast *>(outputData + output_fft_id * signal_size), + write_data, signal_size / 2); + // TODO: what if signal_size is odd? +} + + +// template +// __launch_bounds__( FFT::max_threads_per_block ) +// __global__ void fftconv_fwd_kernelfp16(const input_t *__restrict__ inputData, +// const c10::complex *__restrict__ filterData, +// const input_t *__restrict__ inputMulVData, +// const input_t *__restrict__ inputMulQData, +// const float *__restrict__ DData, +// const float *__restrict__ dropmaskData, +// output_t *__restrict__ outputData, +// int batch_size, +// int H, +// int signal_size, +// bool output_hbl_layout) { + +// using complex_t = typename cufftdx::detail::complex<__half2>; +// using cfloat_t = typename c10::complex; +// constexpr int N = cufftdx::size_of::value; +// constexpr int EPT = FFT::elements_per_thread; +// static_assert(FFT::storage_size == EPT); +// static_assert(IFFT::storage_size == EPT); + +// using BlockLoad_input = cub::BlockLoad; +// using BlockLoad_filter = cub::BlockLoad; +// using BlockStore_output = cub::BlockStore, FFT::block_dim.x, EPT / 2, cub::BLOCK_STORE_STRIPED>; + +// extern __shared__ cfloat_t shared_mem[]; + +// float result_data[2][EPT] = { 0 }; + +// cfloat_t filter_data[EPT]; +// unsigned int filter_id = blockIdx.y; +// BlockLoad_filter().Load(filterData + filter_id * (N + 1), filter_data); +// if 
(threadIdx.x == 0) { +// filter_data[0].imag_ = *(reinterpret_cast(filterData + filter_id * (N + 1) + N)); +// } +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { filter_data[i] /= 2 * N; } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0)) { +// // for (int i = 0; i < FFT::storage_size / 2; i++) { +// // printf("%.4f+%.4fi, ", filter_data[i].real_, filter_data[i].imag_); +// // } +// // printf("\n"); +// // } + +// float D_val = DData[filter_id]; +// unsigned int dropmask_id = blockIdx.x * H + blockIdx.y; +// float dropmask_val = dropmaskData == nullptr ? 1.f : dropmaskData[dropmask_id]; + +// // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) +// // Used for inputMulVData only +// unsigned int global_fft_id = blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + blockIdx.z; + +// // do not pragma unroll this!! +// for (int head_i = 0; head_i < HEADDIM; head_i++) { +// // Local array and copy data into it +// float u_og_data[2][EPT]; +// float v_data[2][EPT]; +// complex_t thread_data[EPT]; + +// // Id for inputData and inputMulQData +// unsigned int head_fft_id = blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + head_i; + +// BlockLoad_input().Load(reinterpret_cast *>(inputData + head_fft_id * signal_size), +// reinterpret_cast(u_og_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputData + head_fft_id * signal_size + H * signal_size), +// reinterpret_cast(u_og_data[1]), +// signal_size / 2, cfloat_t(0.f)); +// // TODO: what if signal_size is odd +// if (GELU_INPUT) { +// gelu(u_og_data[0], u_og_data[0]); +// gelu(u_og_data[1], u_og_data[1]); +// } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("u_og_data[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data[0][i]); +// // } +// // printf("\n"); +// // printf("u_og_data[1]: "); 
+// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data[1][i]); +// // } +// // printf("\n"); +// // } + +// BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + global_fft_id * signal_size), +// reinterpret_cast(v_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + global_fft_id * signal_size + H * signal_size), +// reinterpret_cast(v_data[1]), +// signal_size / 2, cfloat_t(0.f)); + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// u_og_data[0][i] = u_og_data[0][i] * v_data[0][i]; +// u_og_data[1][i] = u_og_data[1][i] * v_data[1][i]; +// } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("u_og_data[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data[0][i]); +// // } +// // printf("\n"); +// // printf("u_og_data[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data[1][i]); +// // } +// // printf("\n"); +// // } + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// thread_data[i] = i < EPT / 2 ? 
complex_t { +// __float22half2_rn({u_og_data[0][i * 2], u_og_data[1][i * 2]}), +// __float22half2_rn({u_og_data[0][i * 2 + 1], u_og_data[1][i * 2 + 1]}) +// } : complex_t { __float22half2_rn({0.f, 0.f}), __float22half2_rn({0.f, 0.f}) }; +// } + +// if (head_i > 0) { __syncthreads(); } +// // Execute FFT +// rfftfp16(thread_data, reinterpret_cast *>(shared_mem)); + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // cfloat_t thread_floats [2]; +// // printf("fft(u)[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // read_rrii(thread_data[i], thread_floats); +// // printf("%.4f+%.4fi, ", thread_floats[0].real_, thread_floats[0].imag_); +// // } +// // printf("\n"); +// // printf("fft(u)[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // read_rrii(thread_data[i], thread_floats); +// // printf("%.4f+%.4fi, ", thread_floats[1].real_, thread_floats[1].imag_); +// // } +// // printf("\n"); +// // } + +// // here, do a pointwise mul converting from rr fp16 to fp32 +// cfloat_t thread_floats [2]; +// cfloat_t res [2]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// read_rrii(thread_data[i], thread_floats); +// for ( int j = 0; j < 2; j++ ) { +// res[j] = (threadIdx.x == 0) && (i == 0) ? +// pointwise_mul(thread_floats[j], filter_data[i]) : thread_floats[j] * filter_data[i]; +// } +// thread_data[i] = write_rrii(res); +// // thread_data[i] = (threadIdx.x == 0) && (i == 0) ? 
+// // pointwise_mul(thread_data[i], filter_data[i]) : thread_data[i] * filter_data[i]; +// } + +// // Execute FFT +// __syncthreads(); +// irfftfp16(thread_data, reinterpret_cast *>(shared_mem)); + + +// float out_data[2][EPT] {}; + +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// out_data[0][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(thread_data)[i].x) + u_og_data[0][i] * D_val; +// out_data[1][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(thread_data)[i].y) + u_og_data[1][i] * D_val; +// } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("out[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", out_data[0][i]); +// // } +// // printf("\n"); +// // printf("out[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", out_data[1][i]); +// // } +// // printf("\n"); +// // } + +// // GELU and dropout +// // https://github.com/pytorch/pytorch/blob/dc169d53aa266560750ea25ee0cf31c7e614550d/aten/src/ATen/native/cuda/Activation.cu#L395 + +// if (GELU_OUTPUT) { +// gelu(out_data[0], out_data[0]); +// gelu(out_data[1], out_data[1]); +// } +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// out_data[0][i] = out_data[0][i] * dropmask_val; +// out_data[1][i] = out_data[1][i] * dropmask_val; +// } + +// float q_data[2][EPT]; + +// BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + head_fft_id * signal_size), +// reinterpret_cast(q_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + head_fft_id * signal_size + H * signal_size), +// reinterpret_cast(q_data[1]), +// signal_size / 2, cfloat_t(0.f)); + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("q[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", q_data[0][i]); +// // } +// // printf("\n"); +// // 
printf("q[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", q_data[1][i]); +// // } +// // printf("\n"); +// // } + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// out_data[0][i] = q_data[0][i] * out_data[0][i]; +// out_data[1][i] = q_data[1][i] * out_data[1][i]; +// } + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// result_data[0][i] += out_data[0][i]; +// result_data[1][i] += out_data[1][i]; +// } +// } + +// // Save results +// c10::complex write_data[2][EPT / 2]; +// #pragma unroll +// for (int i = 0; i < EPT / 2; ++i) { +// write_data[0][i] = c10::complex(output_t(result_data[0][i * 2]), output_t(result_data[0][i * 2 + 1])); +// write_data[1][i] = c10::complex(output_t(result_data[1][i * 2]), output_t(result_data[1][i * 2 + 1])); +// } + +// unsigned int output_fft_id = !output_hbl_layout ? blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + blockIdx.z : blockIdx.x * FFT::ffts_per_block + (blockIdx.y * HEADDIM + blockIdx.z) * batch_size; +// BlockStore_output().Store(reinterpret_cast *>(outputData + output_fft_id * signal_size), +// write_data[0], signal_size / 2); +// BlockStore_output().Store(reinterpret_cast *>(outputData + output_fft_id * signal_size + (!output_hbl_layout ? H * signal_size : signal_size)), +// write_data[1], signal_size / 2); +// // TODO: what if signal_size is odd? 
// (The leading "// }" on the next line closes the commented-out fp16 forward kernel.)
//
// fftconv_fwd_cuda: host-side launcher for fftconv_fwd_kernel.
//   u, filter, v, q, D, dropout_mask, out - pointers forwarded unchanged to the kernel.
//   head_dim   - only 1 and 8 are handled; any other value raises AT_ERROR.
//   gelu_inp, gelu_q - accepted but ignored (both are cast to void below).
//   batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout -
//                forwarded to the kernel; batch_size, H and head_dim also size the dim3.
//   fftfp16    - only consulted when head_dim == 8.
// For each supported head_dim the function composes cuFFTDx descriptor types (FFT / IFFT),
// takes max(FFT::shared_memory_size, IFFT::shared_memory_size, 8 * FFT_SIZE), passes that value as
// cudaFuncAttributeMaxDynamicSharedMemorySize for the chosen kernel instantiation, and launches it.
// On the head_dim == 1 path QV comes from BOOL_SWITCH(v != nullptr, QV, ...); the macro is
// defined elsewhere - presumably it binds QV to that condition as a compile-time constant.
// NOTE(review): in this copy the template parameter lists, cast target types and the launch
// configuration are empty ("template", "static_cast(", "kernel<<>>") - they look stripped and
// must be restored from upstream before this can compile.
// NOTE(review): std::ceil(H / FPB) is applied to an integer quotient, so the ceil cannot round
// anything up; harmless while FPB == 1 - confirm intent.
+// } + +template +void fftconv_fwd_cuda(const input_t *u, const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, output_t *out, + bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, bool output_hbl_layout, bool fftfp16) { +#if defined(__CUDA_ARCH__) + constexpr uint ARCH = __CUDA_ARCH__; +#else + constexpr uint ARCH = 700; +#endif + + (void) gelu_inp; // these options are not supported right now + (void) gelu_q; // these options are not supported right now + + switch (head_dim) { + case 1: + { + constexpr uint FPB = 1; + // FFT is defined, its: size, type, direction, precision. Block() operator + // informs that FFT will be executed on block level. Shared memory is + // required for co-operation between threads. + + using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision() + + cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + + cufftdx::Type()); + + using FFT = decltype(FFT_base() + cufftdx::Direction()); + using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // By default the shared memory size is 4 * FFT_SIZE (idk how). + // So it wouldn't work for our rfft and irfft functions. + const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + 8 * FFT_SIZE}); + // printf("shared_memory_size = %d\n", shared_memory_size); + + // unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + dim3 block(batch_size, H_per_grid / head_dim, head_dim); + BOOL_SWITCH(v != nullptr, QV, [&] { + auto kernel = &fftconv_fwd_kernel; + // Increase dynamic memory limit if required. 
// head_dim == 1: set the shared-memory attribute on the selected kernel, then launch it.
// The head_dim == 8 case begins further along this line.
// NOTE(review): the entire `if (fftfp16)` body is commented out, so with head_dim == 8 and
// fftfp16 == true this function launches nothing and never writes `out` - confirm that callers
// cannot request that combination, or make it an explicit error.
+ CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size )); + kernel<<>>(u, filter, v, q, D, dropout_mask, out, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout); + }); + break; + } + case 8: + { + if (fftfp16) { + // constexpr uint FPB = 2; + + // using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision<__half>() + + // cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + // + cufftdx::Type()); + + // using FFT = decltype(FFT_base() + cufftdx::Direction()); + // using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // // By default the shared memory size is 4 * FFT_SIZE (idk how). + // // So it wouldn't work for our rfft and irfft functions. + // const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + // 8 * FFT_SIZE}); + // // printf("shared_memory_size = %d\n", shared_memory_size); + + // unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + // // unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + // dim3 block(blocks_per_grid, H / head_dim, head_dim); + // constexpr bool QV = true; // Multi-head requires QV + + // auto kernel = &fftconv_fwd_kernelfp16; + + // // Increase dynamic memory limit if required. 
// End of the disabled fp16 launch, followed by the active path for head_dim == 8 with
// fftfp16 == false: FPB = 1, the dim3 is (blocks_per_grid, H / head_dim, head_dim) and QV is
// fixed to true. std::ceil(batch_size / FPB) is again applied to an integer quotient.
+ // CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + // cudaFuncAttributeMaxDynamicSharedMemorySize, + // shared_memory_size )); + // kernel<<>>(u, filter, v, q, D, dropout_mask, out, batch_size, H, signal_size, output_hbl_layout); + } + else { + // uncomment this and the kernel line below to go back to fp32 + constexpr uint FPB = 1; + + using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision() + + cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + + cufftdx::Type()); + + using FFT = decltype(FFT_base() + cufftdx::Direction()); + using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // By default the shared memory size is 4 * FFT_SIZE (idk how). + // So it wouldn't work for our rfft and irfft functions. + const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + 8 * FFT_SIZE}); + // printf("shared_memory_size = %d\n", shared_memory_size); + + unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + // unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + dim3 block(blocks_per_grid, H / head_dim, head_dim); + constexpr bool QV = true; // Multi-head requires QV + + // change this line to go back to fp32 + auto kernel = &fftconv_fwd_kernel; + + // Increase dynamic memory limit if required. 
// Tail of fftconv_fwd_cuda: set the shared-memory attribute and launch the kernel for the
// head_dim == 8 / fftfp16 == false path; any other head_dim raises AT_ERROR; the function ends
// with C10_CUDA_KERNEL_LAUNCH_CHECK() (followed by a redundant ';' after the closing brace).
//
// fftconv_fwd_cuda_dispatch (begins on this line): turns run-time options into a choice of
// fftconv_fwd_cuda call.
//   gelu      - passed to BOOL_SWITCH as the condition for GELU_OUTPUT (macro defined elsewhere).
//   fft_size  - selects the case; 16, 32, 64, ..., 16384 are handled, any other value raises
//               AT_ERROR. It is consumed here and not forwarded.
//   all other parameters are forwarded unchanged to fftconv_fwd_cuda.
// NOTE(review): the template arguments of every fftconv_fwd_cuda call are missing in this copy,
// so the per-size instantiations are indistinguishable here - restore them from upstream.
+ CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size )); + kernel<<>>(u, filter, v, q, D, dropout_mask, out, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout); + } + break; + } + default: + AT_ERROR("fftconv forward not implemented for this head_dim"); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); +}; + +template +void fftconv_fwd_cuda_dispatch(const input_t *u, const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, output_t *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16) { + BOOL_SWITCH(gelu, GELU_OUTPUT, [&] { + switch(fft_size) { + case 16: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 32: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 64: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 128: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 256: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 512: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 1024: + fftconv_fwd_cuda( + u, 
// (Continues the fft_size == 1024 call; cases 2048 ... 16384 and the unsupported-size AT_ERROR
// follow and close fftconv_fwd_cuda_dispatch.)
//
// fftconv_bwd_kernel (defined later on this line): backward pass of the FFT convolution.
// Addressing visible in the body:
//   u, q, du, dq : blockIdx.x * batch_stride + (blockIdx.y * HEADDIM + blockIdx.z) * H_stride
//   v, dv        : same formula with the loop index head_i in place of blockIdx.z
//   filter, D    : row blockIdx.y (filter rows are N + 1 complex values apart)
//   dout         : output_fft_id * signal_size, i.e. indexed densely, with the index formula
//                  chosen by output_hbl_layout
//   dD, dfilter  : blockIdx.x * H + blockIdx.y * HEADDIM + blockIdx.z
// Steps: load u (optionally GELU), load and scale the filter by 1 / (2 * N), load q when QV;
// then for each head_i recompute the forward result, load dout, accumulate dq, dD and dfilter,
// back-propagate through the filter to obtain du, and atomicAdd dv; finally block-reduce dD and
// store dD, dfilter, du and (when QV) dq.
// NOTE(review): dropmask_id is blockIdx.x * H + blockIdx.y, without HEADDIM or blockIdx.z,
// unlike dfilter_id - confirm this matches the (batch_size, H) mask checked on the host side.
// NOTE(review): template parameter lists and cast target types are missing in this copy
// ("template", "reinterpret_cast(", "cub::BlockLoad;") - restore from upstream.
filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 2048: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 4096: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 8192: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 16384: + fftconv_fwd_cuda( + u, filter, v, head_dim, q, D, dropout_mask, out, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + default: + AT_ERROR("fftconv forward not implemented for this fft_size"); + } + }); +} + +template +__launch_bounds__( FFT::max_threads_per_block ) +__global__ void fftconv_bwd_kernel(const output_t *__restrict__ doutData, + const input_t *__restrict__ inputData, + const c10::complex *__restrict__ filterData, + const input_t *__restrict__ inputMulVData, + const input_t *__restrict__ inputMulQData, + const float *__restrict__ DData, + const float *__restrict__ dropmaskData, + input_t *__restrict__ duData, + c10::complex *__restrict__ dfilterData, + float *__restrict__ dDData, + float *__restrict__ dvData, + input_t *__restrict__ dqData, + int batch_size, + int H, + int signal_size, + size_t batch_stride, size_t H_stride, + bool output_hbl_layout) { + + using complex_t = typename cufftdx::detail::complex; + using cfloat_t = typename c10::complex; + constexpr int N = cufftdx::size_of::value; + constexpr int EPT = FFT::elements_per_thread; + static_assert(FFT::storage_size == EPT); + static_assert(IFFT::storage_size == EPT); + + using BlockLoad_input = 
// Setup: CUB load/store aliases, the dynamic shared buffer and the per-thread accumulators
// (du_data, dq_data, dfilter_data, dD_val). u is loaded as signal_size / 2 complex pairs with
// cfloat_t(0.f) as the out-of-range default. Thread 0 keeps the filter row's element N in
// filter_data[0].imag_; every filter entry is divided by 2 * N.
cub::BlockLoad; + using BlockLoad_filter = cub::BlockLoad; + using BlockLoad_dout = cub::BlockLoad; + using BlockStore_dinput = cub::BlockStore, FFT::block_dim.x, EPT / 2, cub::BLOCK_STORE_STRIPED>; + using BlockStore_dv = cub::BlockStore; + using BlockStore_dfilter = cub::BlockStore; + + extern __shared__ cfloat_t shared_mem[]; + + float du_data[EPT] = { 0 }; + float dq_data[EPT] = { 0 }; + cfloat_t dfilter_data[EPT] = { 0 }; + float dD_val = 0.f; + + // #pragma unroll + // for ( int i = 0; i < EPT; i++ ) { + // dfilter_data[i] = cfloat_t(0, 0); + // } + + // Local array and copy data into it + float u_og_data_before_gelu[EPT]; + float u_og_data[EPT]; + float q_data[EPT]; + + // Id for inputData and inputMulQData + size_t u_offset = blockIdx.x * batch_stride + (blockIdx.y * HEADDIM + blockIdx.z) * H_stride; + BlockLoad_input().Load(reinterpret_cast *>(inputData + u_offset), + reinterpret_cast(u_og_data_before_gelu), + signal_size / 2, cfloat_t(0.f)); + // TODO: what if signal_size is odd + if (GELU_INPUT) { + gelu(u_og_data, u_og_data_before_gelu); + } else { + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { u_og_data[i] = u_og_data_before_gelu[i]; } + } + + cfloat_t filter_data[EPT]; + + unsigned int filter_id = blockIdx.y; + BlockLoad_filter().Load(filterData + filter_id * (N + 1), filter_data); + if (threadIdx.x == 0) { + filter_data[0].imag_ = *(reinterpret_cast(filterData + filter_id * (N + 1) + N)); + } + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { filter_data[i] /= 2 * N; } + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // for (int i = 0; i < FFT::storage_size / 2; i++) { + // printf("%.4f+%.4fi, ", filter_data[i].real_, filter_data[i].imag_); + // } + // printf("\n"); + // } + + float D_val = DData[filter_id]; + unsigned int dropmask_id = blockIdx.x * H + blockIdx.y; + float dropmask_val = dropmaskData == nullptr ? 
// Dropout scale is 1.f when no mask pointer is given. q is loaded once (QV only). The per-head
// loop then recomputes the forward pass: k = u (times v when QV), rfft, spectrum saved in u_f,
// pointwise product with the filter (thread 0 / element 0 goes through pointwise_mul).
1.f : dropmaskData[dropmask_id]; + + if (QV) { + // Will need to change this if head_dim is not 1 + BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + u_offset), + reinterpret_cast(q_data), + signal_size / 2, cfloat_t(0.f)); + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("q_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", q_data[i]); + // } + // printf("\n"); + // } + } + + // do not pragma unroll this!! + for (int head_i = 0; head_i < HEADDIM; head_i++) { + float k_data[EPT]; + float v_data[EPT]; + cfloat_t thread_data[EPT]; + float grad_data[EPT]; + + #pragma unroll + for (int i = 0; i < EPT; ++i) { k_data[i] = u_og_data[i]; } + + size_t v_offset = blockIdx.x * batch_stride + (blockIdx.y * HEADDIM + head_i) * H_stride; + if (QV) { + BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + v_offset), + reinterpret_cast(v_data), + signal_size / 2, cfloat_t(0.f)); + + // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // printf("v_data: "); + // for (int i = 0; i < EPT; i++) { + // printf("%.4f, ", v_data[i]); + // } + // printf("\n"); + // } + + #pragma unroll + for (int i = 0; i < EPT; ++i) { + k_data[i] *= v_data[i]; + } + } + + #pragma unroll + for (int i = 0; i < EPT; ++i) { + thread_data[i] = i < EPT / 2 ? cfloat_t(k_data[i * 2], k_data[i * 2 + 1]) : cfloat_t(0.f); + } + + __syncthreads(); + // Execute FFT + rfft(thread_data, shared_mem); + + cfloat_t u_f[EPT]; + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { u_f[i] = thread_data[i]; } + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + thread_data[i] = (threadIdx.x == 0) && (i == 0) ? 
// Inverse transform, then out = irfft result + k * D_val. dout is loaded into grad_data, the
// pre-GELU output is kept for dgelu, dq is accumulated (QV only), and the gradient is scaled by
// the dropout value and passed through dgelu before being accumulated into dD_val.
+ pointwise_mul(thread_data[i], filter_data[i]) : thread_data[i] * filter_data[i]; + } + + // Execute FFT + __syncthreads(); + irfft(thread_data, shared_mem); + + + float out_data[EPT] {}; + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + out_data[i] = reinterpret_cast(thread_data)[i] + k_data[i] * D_val; + } + + unsigned int output_fft_id = !output_hbl_layout ? blockIdx.x * H + blockIdx.y * HEADDIM + head_i : blockIdx.x + (blockIdx.y * HEADDIM + head_i) * batch_size; + BlockLoad_dout().Load(reinterpret_cast *>(doutData + output_fft_id * signal_size), + reinterpret_cast(grad_data), + signal_size / 2, cfloat_t(0.f)); + + float out_data_before_gelu[EPT]; + #pragma unroll + for (int i = 0; i < EPT; ++i) { out_data_before_gelu[i] = out_data[i]; }; + if (GELU_OUTPUT) { gelu(out_data, out_data); } + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + out_data[i] *= dropmask_val; + } + + // dQ + if (QV) { + #pragma unroll + for (int i = 0; i < EPT; ++i) { + if (GELU_Q) { + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + constexpr float kAlpha = M_SQRT1_2; + const float cdf = 0.5 * (1 + erff(q_data[i] * kAlpha)); + const float pdf = expf(-0.5 * q_data[i] * q_data[i]) * kBeta; + dq_data[i] += (cdf + q_data[i] * pdf) * grad_data[i] * out_data[i]; + grad_data[i] *= q_data[i] * cdf; + } else { + dq_data[i] += grad_data[i] * out_data[i]; + grad_data[i] *= q_data[i]; + } + } + } + + // dGELU and dropout + // https://github.com/pytorch/pytorch/blob/dc169d53aa266560750ea25ee0cf31c7e614550d/aten/src/ATen/native/cuda/Activation.cu#L418 + #pragma unroll + for ( int i = 0; i < EPT; ++i) { grad_data[i] *= dropmask_val; } + if (GELU_OUTPUT) { dgelu(grad_data, grad_data, out_data_before_gelu); } + + // CHANGE THIS!!! + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + dD_val += grad_data[i] * k_data[i]; + } + + cfloat_t grad_data_c[EPT]; + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + grad_data_c[i] = i < EPT / 2 ? 
// Gradient spectrum: dfilter accumulates grad * conj(u_f) / (2 * N); the gradient is multiplied
// by conj(filter) and inverse-transformed, giving du_local = result + grad * D_val. With QV, dv
// is derived from du_local and u, du_local is scaled by v (and by the GELU derivative when
// GELU_INPUT), and dv is added into dvData with atomicAdd, one float at a time.
cfloat_t(grad_data[i * 2], grad_data[i * 2 + 1]) : cfloat_t(0.f); + } + + __syncthreads(); + rfft(grad_data_c, shared_mem); + + // CHANGE THIS!!! + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + dfilter_data[i] += ((threadIdx.x == 0) && (i == 0) ? + pointwise_mul(grad_data_c[i], u_f[i]) : grad_data_c[i] * std::conj(u_f[i])) / (2 * N); + } + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + grad_data_c[i] = (threadIdx.x == 0) && (i == 0) ? + pointwise_mul(grad_data_c[i], filter_data[i]) : grad_data_c[i] * std::conj(filter_data[i]); + } + + __syncthreads(); + irfft(grad_data_c, shared_mem); + + float du_data_local[EPT]; + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + du_data_local[i] = reinterpret_cast(grad_data_c)[i] + grad_data[i] * D_val; + } + + float dv_data[EPT]; + // compute dv, and update du + if (QV) { + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + // dv + constexpr float kAlpha = M_SQRT1_2; + dv_data[i] = du_data_local[i] * (GELU_INPUT ? (u_og_data_before_gelu[i] * 0.5 * (1 + erff(u_og_data_before_gelu[i] * kAlpha))) : u_og_data_before_gelu[i]); + + // update du + du_data_local[i] = du_data_local[i] * v_data[i]; + if (GELU_INPUT) { + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + const float cdf = 0.5 * (1 + erff(u_og_data_before_gelu[i] * kAlpha)); + const float pdf = expf(-0.5 * u_og_data_before_gelu[i] * u_og_data_before_gelu[i]) * kBeta; + du_data_local[i] = (cdf + u_og_data_before_gelu[i] * pdf) * du_data_local[i]; + } + } + } + + #pragma unroll + for ( int i = 0; i < EPT; i++ ) { + du_data[i] += du_data_local[i]; + } + + // store dv using atomic add + if (QV) { + unsigned int dv_data_idx; + unsigned int thread_id = threadIdx.x; + #pragma unroll + for (int i = 0; i < EPT / 2; ++i) { + // compute index based on thread idx, i, and head_i + dv_data_idx = FFT::block_dim.x * i + thread_id; + if (dv_data_idx < signal_size / 2) { + // add the real and imaginary parts separately + cfloat_t *loc = &reinterpret_cast(dvData + 
// After the head loop: dD_val is summed across the block with cub::BlockReduce, reusing
// shared_mem as its temp storage, and thread 0 writes the sum to dDData[dfilter_id]. For
// dfilter, thread 0 moves dfilter_data[0].imag_ into element N of the row and zeroes it before
// the block store. du and (when QV) dq are converted to input_t pairs and stored at u_offset.
// The commented-out fp16 backward kernel begins at the end of this line.
v_offset)[dv_data_idx]; + atomicAdd(reinterpret_cast(loc), dv_data[i * 2]); + atomicAdd(reinterpret_cast(loc) + 1, dv_data[i * 2 + 1]); + } + } + } + // TODO: what if signal_size is odd? + } + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + unsigned int dfilter_id = blockIdx.x * H + blockIdx.y * HEADDIM + blockIdx.z; + + // There may be something wrong here?? + // Save dD + using BlockReduceT = cub::BlockReduce; + using TempStorageT = typename BlockReduceT::TempStorage; + __syncthreads(); + dD_val = BlockReduceT(reinterpret_cast(shared_mem)).Sum(dD_val); + if (threadIdx.x == 0) { *(dDData + dfilter_id) = dD_val; } + + // Save dfilter + float dfilter_extra = 0.f; + if (threadIdx.x == 0) { + dfilter_extra = dfilter_data[0].imag_; + dfilter_data[0].imag_ = 0.f; + } + + BlockStore_dfilter().Store(dfilterData + dfilter_id * (N + 1), dfilter_data); + if (threadIdx.x == 0) { + *(dfilterData + dfilter_id * (N + 1) + N) = cfloat_t(dfilter_extra, 0.f); + } + + // Save results + c10::complex du_data_c[EPT / 2]; + #pragma unroll + for (int i = 0; i < EPT / 2; ++i) { + du_data_c[i] = c10::complex(input_t(du_data[i * 2]), input_t(du_data[i * 2 + 1])); + } + BlockStore_dinput().Store(reinterpret_cast *>(duData + u_offset), + du_data_c, signal_size / 2); + if (QV) { + c10::complex dq_data_c[EPT / 2]; + #pragma unroll + for (int i = 0; i < EPT / 2; ++i) { + dq_data_c[i] = c10::complex(input_t(dq_data[i * 2]), input_t(dq_data[i * 2 + 1])); + } + + BlockStore_dinput().Store(reinterpret_cast *>(dqData + u_offset), // check this pointer arithmetic + dq_data_c, signal_size / 2); + } +} + +// template +// __launch_bounds__( FFT::max_threads_per_block ) +// __global__ void fftconv_bwd_kernelfp16(const output_t *__restrict__ doutData, +// const input_t *__restrict__ inputData, +// const c10::complex *__restrict__ filterData, +// const input_t *__restrict__ inputMulVData, +// const input_t *__restrict__ inputMulQData, +// const float *__restrict__ DData, +// const float 
*__restrict__ dropmaskData, +// input_t *__restrict__ duData, +// c10::complex *__restrict__ dfilterData, +// float *__restrict__ dDData, +// float *__restrict__ dvData, +// input_t *__restrict__ dqData, +// int batch_size, +// int H, +// int signal_size, +// bool output_hbl_layout) { + +// using complex_t = typename cufftdx::detail::complex<__half2>; +// using cfloat_t = typename c10::complex; +// constexpr int N = cufftdx::size_of::value; +// constexpr int EPT = FFT::elements_per_thread; +// static_assert(FFT::storage_size == EPT); +// static_assert(IFFT::storage_size == EPT); + +// using BlockLoad_input = cub::BlockLoad; +// using BlockLoad_filter = cub::BlockLoad; +// using BlockLoad_dout = cub::BlockLoad; +// using BlockStore_dinput = cub::BlockStore, FFT::block_dim.x, EPT / 2, cub::BLOCK_STORE_STRIPED>; +// using BlockStore_dv = cub::BlockStore; +// using BlockStore_dfilter = cub::BlockStore; + +// extern __shared__ cfloat_t shared_mem[]; + +// float du_data[2][EPT] = { 0 }; +// float dq_data[2][EPT] = { 0 }; +// cfloat_t dfilter_data[2][EPT] = { 0 }; +// float dD_val [2] = { 0.f, 0.f }; + +// // Local array and copy data into it +// float u_og_data_before_gelu[2][EPT]; +// float u_og_data[2][EPT]; +// float q_data[2][EPT]; + +// // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) +// unsigned int global_fft_id = blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + blockIdx.z; + +// BlockLoad_input().Load(reinterpret_cast *>(inputData + global_fft_id * signal_size), +// reinterpret_cast(u_og_data_before_gelu[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputData + global_fft_id * signal_size + H * signal_size), +// reinterpret_cast(u_og_data_before_gelu[1]), +// signal_size / 2, cfloat_t(0.f)); +// // TODO: what if signal_size is odd +// if (GELU_INPUT) { +// gelu(u_og_data[0], u_og_data_before_gelu[0]); +// gelu(u_og_data[1], u_og_data_before_gelu[1]); +// } else { +// #pragma unroll +// for ( 
int i = 0; i < EPT; i++ ) { +// u_og_data[0][i] = u_og_data_before_gelu[0][i]; +// u_og_data[1][i] = u_og_data_before_gelu[1][i]; +// } +// } + +// cfloat_t filter_data[EPT]; + +// unsigned int filter_id = blockIdx.y; +// BlockLoad_filter().Load(filterData + filter_id * (N + 1), filter_data); +// if (threadIdx.x == 0) { +// filter_data[0].imag_ = *(reinterpret_cast(filterData + filter_id * (N + 1) + N)); +// } +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { filter_data[i] /= 2 * N; } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// // for (int i = 0; i < FFT::storage_size / 2; i++) { +// // printf("%.4f+%.4fi, ", filter_data[i].real_, filter_data[i].imag_); +// // } +// // printf("\n"); +// // } + +// float D_val = DData[filter_id]; +// unsigned int dropmask_id = blockIdx.x * H + blockIdx.y; +// float dropmask_val = dropmaskData == nullptr ? 1.f : dropmaskData[dropmask_id]; + +// if (QV) { +// // Will need to change this if head_dim is not 1 +// BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + global_fft_id * signal_size), +// reinterpret_cast(q_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputMulQData + global_fft_id * signal_size + H * signal_size), +// reinterpret_cast(q_data[1]), +// signal_size / 2, cfloat_t(0.f)); + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// // printf("q_data: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", q_data[i]); +// // } +// // printf("\n"); +// // } +// } + +// // do not pragma unroll this!! 
+// for (int head_i = 0; head_i < HEADDIM; head_i++) { +// float k_data[2][EPT]; +// float v_data[2][EPT]; +// complex_t thread_data[EPT]; +// float grad_data[2][EPT]; + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// k_data[0][i] = u_og_data[0][i]; +// k_data[1][i] = u_og_data[1][i]; +// } + +// unsigned int head_fft_id = blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + head_i; + +// if (QV) { +// BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + head_fft_id * signal_size), +// reinterpret_cast(v_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_input().Load(reinterpret_cast *>(inputMulVData + head_fft_id * signal_size + H * signal_size), +// reinterpret_cast(v_data[1]), +// signal_size / 2, cfloat_t(0.f)); + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { +// // printf("v_data: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", v_data[i]); +// // } +// // printf("\n"); +// // } + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// k_data[0][i] *= v_data[0][i]; +// k_data[1][i] *= v_data[1][i]; +// } +// } + +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// thread_data[i] = i < EPT / 2 ? 
complex_t { +// __float22half2_rn({k_data[0][i * 2], k_data[1][i * 2]}), +// __float22half2_rn({k_data[0][i * 2 + 1], k_data[1][i * 2 + 1]}) +// } : complex_t { __float22half2_rn({0.f, 0.f}), __float22half2_rn({0.f, 0.f}) }; +// } + +// if (head_i > 0) { __syncthreads(); } +// // Execute FFT +// rfftfp16(thread_data, reinterpret_cast *>(shared_mem)); + +// cfloat_t u_f[2][EPT]; +// cfloat_t thread_floats[2]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// read_rrii(thread_data[i], thread_floats); +// u_f[0][i] = thread_floats[0]; +// u_f[1][i] = thread_floats[1]; +// } + +// cfloat_t res [2]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// read_rrii(thread_data[i], thread_floats); +// for ( int j = 0; j < 2; j++ ) { +// res[j] = (threadIdx.x == 0) && (i == 0) ? +// pointwise_mul(thread_floats[j], filter_data[i]) : thread_floats[j] * filter_data[i]; +// } +// thread_data[i] = write_rrii(res); +// // thread_data[i] = (threadIdx.x == 0) && (i == 0) ? +// // pointwise_mul(thread_data[i], filter_data[i]) : thread_data[i] * filter_data[i]; +// } + +// // Execute FFT +// __syncthreads(); +// irfftfp16(thread_data, reinterpret_cast *>(shared_mem)); + + +// float out_data[2][EPT] {}; + +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// out_data[0][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(thread_data)[i].x) + k_data[0][i] * D_val; +// out_data[1][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(thread_data)[i].y) + k_data[1][i] * D_val; +// } + +// unsigned int output_fft_id = !output_hbl_layout ? blockIdx.x * H * FFT::ffts_per_block + blockIdx.y * HEADDIM + head_i : blockIdx.x * FFT::ffts_per_block + (blockIdx.y * HEADDIM + head_i) * batch_size; +// BlockLoad_dout().Load(reinterpret_cast *>(doutData + output_fft_id * signal_size), +// reinterpret_cast(grad_data[0]), +// signal_size / 2, cfloat_t(0.f)); +// BlockLoad_dout().Load(reinterpret_cast *>(doutData + output_fft_id * signal_size + (!output_hbl_layout ? 
H * signal_size : signal_size)), +// reinterpret_cast(grad_data[1]), +// signal_size / 2, cfloat_t(0.f)); + +// float out_data_before_gelu[2][EPT]; +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// out_data_before_gelu[0][i] = out_data[0][i]; +// out_data_before_gelu[1][i] = out_data[1][i]; +// }; +// if (GELU_OUTPUT) { +// gelu(out_data[0], out_data[0]); +// gelu(out_data[1], out_data[1]); +// } +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// out_data[0][i] *= dropmask_val; +// out_data[1][i] *= dropmask_val; +// } + +// // dQ +// if (QV) { +// #pragma unroll +// for (int i = 0; i < EPT; ++i) { +// for (int j = 0; j < 2; ++j) { +// if (GELU_Q) { +// constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; +// constexpr float kAlpha = M_SQRT1_2; +// const float cdf = 0.5 * (1 + erff(q_data[j][i] * kAlpha)); +// const float pdf = expf(-0.5 * q_data[j][i] * q_data[j][i]) * kBeta; +// dq_data[j][i] += (cdf + q_data[j][i] * pdf) * grad_data[j][i] * out_data[j][i]; +// grad_data[j][i] *= q_data[j][i] * cdf; +// } else { +// dq_data[j][i] += grad_data[j][i] * out_data[j][i]; +// grad_data[j][i] *= q_data[j][i]; +// } +// } +// } +// } + +// // dGELU and dropout +// // https://github.com/pytorch/pytorch/blob/dc169d53aa266560750ea25ee0cf31c7e614550d/aten/src/ATen/native/cuda/Activation.cu#L418 +// #pragma unroll +// for ( int i = 0; i < EPT; ++i) { +// grad_data[0][i] *= dropmask_val; +// grad_data[1][i] *= dropmask_val; +// } +// if (GELU_OUTPUT) { +// dgelu(grad_data[0], grad_data[0], out_data_before_gelu[0]); +// dgelu(grad_data[1], grad_data[1], out_data_before_gelu[1]); +// } + +// // CHANGE THIS!!! +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// dD_val[0] += grad_data[0][i] * k_data[0][i]; +// dD_val[1] += grad_data[1][i] * k_data[1][i]; +// } + +// complex_t grad_data_c[EPT]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// // grad_data_c[i] = i < EPT / 2 ? 
cfloat_t(grad_data[i * 2], grad_data[i * 2 + 1]) : cfloat_t(0.f); +// grad_data_c[i] = i < EPT / 2 ? complex_t { +// __float22half2_rn({grad_data[0][i * 2], grad_data[1][i * 2]}), +// __float22half2_rn({grad_data[0][i * 2 + 1], grad_data[1][i * 2 + 1]}) +// } : complex_t { __float22half2_rn({0.f, 0.f}), __float22half2_rn({0.f, 0.f}) }; +// } + +// __syncthreads(); +// rfftfp16(grad_data_c, reinterpret_cast *>(shared_mem)); + +// cfloat_t grad_floats [2]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// read_rrii(grad_data_c[i], grad_floats); +// for ( int j = 0; j < 2; j++ ) { +// dfilter_data[j][i] += ((threadIdx.x == 0) && (i == 0) ? +// pointwise_mul(grad_floats[j], u_f[j][i]) : grad_floats[j] * std::conj(u_f[j][i])) / (2 * N); +// } +// // dfilter_data[i] += ((threadIdx.x == 0) && (i == 0) ? +// // pointwise_mul(grad_data_c[i], u_f[i]) : grad_data_c[i] * std::conj(u_f[i])) / (2 * N); +// } + +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// read_rrii(grad_data_c[i], grad_floats); +// for ( int j = 0; j < 2; j++ ) { +// res[j] = (threadIdx.x == 0) && (i == 0) ? +// pointwise_mul(grad_floats[j], filter_data[i]) : grad_floats[j] * std::conj(filter_data[i]); +// } +// grad_data_c[i] = write_rrii(res); +// // grad_data_c[i] = (threadIdx.x == 0) && (i == 0) ? 
+// // pointwise_mul(grad_data_c[i], filter_data[i]) : grad_data_c[i] * std::conj(filter_data[i]); +// } + +// __syncthreads(); +// irfftfp16(grad_data_c, reinterpret_cast *>(shared_mem)); + +// float du_data_local[2][EPT]; +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// // du_data_local[i] = reinterpret_cast(grad_data_c)[i] + grad_data[i] * D_val; +// du_data_local[0][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(grad_data_c)[i].x) + grad_data[0][i] * D_val; +// du_data_local[1][i] = __half2float(reinterpret_cast<__half2 (&)[EPT * 2]>(grad_data_c)[i].y) + grad_data[1][i] * D_val; +// } + +// float dv_data[2][EPT]; +// // compute dv, and update du +// if (QV) { +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// // dv +// dv_data[0][i] = du_data_local[0][i] * u_og_data_before_gelu[0][i]; +// dv_data[1][i] = du_data_local[1][i] * u_og_data_before_gelu[1][i]; + +// // update du +// du_data_local[0][i] = du_data_local[0][i] * v_data[0][i]; +// du_data_local[1][i] = du_data_local[1][i] * v_data[1][i]; +// } +// } + +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("u_og_data_before_gelu[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data_before_gelu[0][i]); +// // } +// // printf("\n"); +// // printf("u_og_data_before_gelu[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", u_og_data_before_gelu[1][i]); +// // } +// // printf("\n"); +// // } +// // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0) && (blockIdx.z == 0) && (head_i == 0)) { +// // printf("dv_data[0]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", dv_data[0][i]); +// // } +// // printf("\n"); +// // printf("dv_data[1]: "); +// // for (int i = 0; i < EPT; i++) { +// // printf("%.4f, ", dv_data[1][i]); +// // } +// // printf("\n"); +// // } + +// #pragma unroll +// for ( int i = 0; i < EPT; i++ ) { +// du_data[0][i] += 
du_data_local[0][i]; +// du_data[1][i] += du_data_local[1][i]; +// } + +// // store dv using atomic add +// if (QV) { +// unsigned int dv_data_idx; +// unsigned int thread_id = threadIdx.x; +// #pragma unroll +// for (int i = 0; i < EPT / 2; ++i) { +// // compute index based on thread idx, i, and head_i +// dv_data_idx = FFT::block_dim.x * i + thread_id; +// if (dv_data_idx < signal_size / 2) { +// // add the real and imaginary parts separately +// cfloat_t *loc = &reinterpret_cast(dvData + head_fft_id * signal_size)[dv_data_idx]; +// atomicAdd(reinterpret_cast(loc), dv_data[0][i * 2]); +// atomicAdd(reinterpret_cast(loc) + 1, dv_data[0][i * 2 + 1]); + +// // add the real and imaginary parts separately +// loc = &reinterpret_cast(dvData + head_fft_id * signal_size + H * signal_size)[dv_data_idx]; +// atomicAdd(reinterpret_cast(loc), dv_data[1][i * 2]); +// atomicAdd(reinterpret_cast(loc) + 1, dv_data[1][i * 2 + 1]); +// } +// } +// } +// // TODO: what if signal_size is odd? +// } + +// unsigned int dfilter_id = global_fft_id; + +// // There may be something wrong here?? 
+// // Save dD +// using BlockReduceT = cub::BlockReduce; +// using TempStorageT = typename BlockReduceT::TempStorage; +// __syncthreads(); +// dD_val[0] = BlockReduceT(reinterpret_cast(shared_mem)).Sum(dD_val[0]); +// dD_val[1] = BlockReduceT(reinterpret_cast(shared_mem)).Sum(dD_val[1]); +// if (threadIdx.x == 0) { +// *(dDData + dfilter_id) = dD_val[0]; +// *(dDData + dfilter_id + H) = dD_val[1]; +// } + +// // Save dfilter +// float dfilter_extra [2] = { 0.f, 0.f }; +// if (threadIdx.x == 0) { +// dfilter_extra[0] = dfilter_data[0][0].imag_; +// dfilter_data[0][0].imag_ = 0.f; +// dfilter_extra[1] = dfilter_data[1][0].imag_; +// dfilter_data[1][0].imag_ = 0.f; +// } + +// BlockStore_dfilter().Store(dfilterData + dfilter_id * (N + 1), dfilter_data[0]); +// BlockStore_dfilter().Store(dfilterData + (dfilter_id + H) * (N + 1), dfilter_data[1]); +// if (threadIdx.x == 0) { +// *(dfilterData + dfilter_id * (N + 1) + N) = cfloat_t(dfilter_extra[0], 0.f); +// *(dfilterData + (dfilter_id + H) * (N + 1) + N) = cfloat_t(dfilter_extra[1], 0.f); +// } + +// // Save results +// c10::complex du_data_c[2][EPT / 2]; +// #pragma unroll +// for (int i = 0; i < EPT / 2; ++i) { +// du_data_c[0][i] = c10::complex(input_t(du_data[0][i * 2]), input_t(du_data[0][i * 2 + 1])); +// du_data_c[1][i] = c10::complex(input_t(du_data[1][i * 2]), input_t(du_data[1][i * 2 + 1])); +// } +// BlockStore_dinput().Store(reinterpret_cast *>(duData + global_fft_id * signal_size), +// du_data_c[0], signal_size / 2); +// BlockStore_dinput().Store(reinterpret_cast *>(duData + global_fft_id * signal_size + H * signal_size), +// du_data_c[1], signal_size / 2); +// if (QV) { +// c10::complex dq_data_c[2][EPT / 2]; +// #pragma unroll +// for (int i = 0; i < EPT / 2; ++i) { +// dq_data_c[0][i] = c10::complex(input_t(dq_data[0][i * 2]), input_t(dq_data[0][i * 2 + 1])); +// dq_data_c[1][i] = c10::complex(input_t(dq_data[1][i * 2]), input_t(dq_data[1][i * 2 + 1])); +// } + +// 
BlockStore_dinput().Store(reinterpret_cast *>(dqData + global_fft_id * signal_size), // check this pointer arithmetic +// dq_data_c[0], signal_size / 2); +// BlockStore_dinput().Store(reinterpret_cast *>(dqData + global_fft_id * signal_size + H * signal_size), // check this pointer arithmetic +// dq_data_c[1], signal_size / 2); +// } +// } + +template +void fftconv_bwd_cuda( + const output_t *dout, const input_t *u, + const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, + input_t *du, c10::complex *dfilter, + float *dD, + float *dv, input_t *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, bool output_hbl_layout, bool fftfp16 +) { +#if defined(__CUDA_ARCH__) + constexpr uint ARCH = __CUDA_ARCH__; +#else + constexpr uint ARCH = 700; +#endif + + (void) gelu_inp; + (void) gelu_q; + + switch (head_dim) { + case 1: + { + constexpr uint FPB = 1; + + // FFT is defined, its: size, type, direction, precision. Block() operator + // informs that FFT will be executed on block level. Shared memory is + // required for co-operation between threads. + using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision() + + cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + + cufftdx::Type()); + + using FFT = decltype(FFT_base() + cufftdx::Direction()); + using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // By default the shared memory size is 4 * FFT_SIZE (idk how). + // So it wouldn't work for our rfft and irfft functions. 
+ const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + 8 * FFT_SIZE}); + // printf("shared_memory_size = %d\n", shared_memory_size); + + // unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + dim3 block(batch_size, H_per_grid / head_dim, head_dim); + + BOOL_SWITCH(v != nullptr, QV, [&] { + auto kernel = &fftconv_bwd_kernel; + // Increase dynamic memory limit if required. + CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size )); + kernel<<>>( + dout, u, filter, v, q, D, dropout_mask, du, dfilter, dD, dv, dq, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout); + }); + break; + } + case 8: + { + if (fftfp16) { + // constexpr uint FPB = 2; + + // using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision<__half>() + + // cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + // + cufftdx::Type()); + + // using FFT = decltype(FFT_base() + cufftdx::Direction()); + // using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // // By default the shared memory size is 4 * FFT_SIZE (idk how). + // // So it wouldn't work for our rfft and irfft functions. + // const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + // 8 * FFT_SIZE}); + // // printf("shared_memory_size = %d\n", shared_memory_size); + + // unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + // // unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + // dim3 block(blocks_per_grid, H / head_dim, head_dim); + + // constexpr bool QV = true; // Multi-head requires QV + // auto kernel = &fftconv_bwd_kernelfp16; + // // Increase dynamic memory limit if required. 
+ // CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + // cudaFuncAttributeMaxDynamicSharedMemorySize, + // shared_memory_size )); + // kernel<<>>( + // dout, u, filter, v, q, D, dropout_mask, du, dfilter, dD, dv, dq, batch_size, H, signal_size, output_hbl_layout); + } + else { + // to go back to fp32 + constexpr uint FPB = 1; + + using FFT_base = decltype(cufftdx::Block() + cufftdx::Size() + cufftdx::Precision() + + cufftdx::ElementsPerThread() + cufftdx::FFTsPerBlock() + cufftdx::SM() + + cufftdx::Type()); + + using FFT = decltype(FFT_base() + cufftdx::Direction()); + using IFFT = decltype(FFT_base() + cufftdx::Direction()); + + // By default the shared memory size is 4 * FFT_SIZE (idk how). + // So it wouldn't work for our rfft and irfft functions. + const auto shared_memory_size = std::max({FFT::shared_memory_size, IFFT::shared_memory_size, + 8 * FFT_SIZE}); + // printf("shared_memory_size = %d\n", shared_memory_size); + + unsigned int blocks_per_grid { static_cast( std::ceil( batch_size / FPB ) ) }; + // unsigned int H_per_grid { static_cast( std::ceil( H / FPB ) ) }; + dim3 block(blocks_per_grid, H / head_dim, head_dim); + + constexpr bool QV = true; // Multi-head requires QV + auto kernel = &fftconv_bwd_kernel; + // Increase dynamic memory limit if required. 
+ CUDA_RT_CALL( cudaFuncSetAttribute(kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size )); + kernel<<>>( + dout, u, filter, v, q, D, dropout_mask, du, dfilter, dD, dv, dq, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout); + } + break; + } + default: + AT_ERROR("fftconv backward not implemented for this head_dim"); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); +}; + +template +void fftconv_bwd_cuda_dispatch( + const output_t *dout, + const input_t *u, const c10::complex *filter, + const input_t *v, int head_dim, const input_t *q, + const float *D, const float *dropout_mask, + input_t *du, c10::complex *dfilter, float *dD, + float *dv, input_t *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16 +) { + BOOL_SWITCH(gelu, GELU_OUTPUT, [&] { + switch(fft_size) { + case 16: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 32: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 64: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 128: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 256: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, 
signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 512: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 1024: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 2048: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 4096: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 8192: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + case 16384: + fftconv_bwd_cuda( + dout, u, filter, v, head_dim, q, D, dropout_mask, + du, dfilter, dD, dv, dq, + gelu, gelu_inp, gelu_q, batch_size, H, signal_size, batch_stride, H_stride, output_hbl_layout, fftfp16); + break; + default: + AT_ERROR("fftconv backward not implemented for this fft_size"); + } + }); +}; + +template void fftconv_fwd_cuda_dispatch( + const float *u, const c10::complex *filter, + const float *v, int head_dim, const float *q, + const float *D, const float *dropout_mask, float *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_fwd_cuda_dispatch( + const float *u, const c10::complex *filter, + 
const float *v, int head_dim, const float *q, + const float *D, const float *dropout_mask, at::Half *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_fwd_cuda_dispatch( + const at::Half *u, const c10::complex *filter, + const at::Half *v, int head_dim, const at::Half *q, + const float *D, const float *dropout_mask, at::Half *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_fwd_cuda_dispatch( + const at::BFloat16 *u, const c10::complex *filter, + const at::BFloat16 *v, int head_dim, const at::BFloat16 *q, + const float *D, const float *dropout_mask, at::BFloat16 *out, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_bwd_cuda_dispatch( + const float *dout, + const float *u, const c10::complex *filter, + const float *v, int head_dim, const float *q, + const float *D, const float *dropout_mask, + float *du, c10::complex *dfilter, float *dD, + float *dv, float *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_bwd_cuda_dispatch( + const at::Half *dout, + const float *u, const c10::complex *filter, + const float *v, int head_dim, const float *q, + const float *D, const float *dropout_mask, + float *du, c10::complex *dfilter, float *dD, + float *dv, float *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void 
fftconv_bwd_cuda_dispatch( + const at::Half *dout, + const at::Half *u, const c10::complex *filter, + const at::Half *v, int head_dim, const at::Half *q, + const float *D, const float *dropout_mask, + at::Half *du, c10::complex *dfilter, float *dD, + float *dv, at::Half *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); + +template void fftconv_bwd_cuda_dispatch( + const at::BFloat16 *dout, + const at::BFloat16 *u, const c10::complex *filter, + const at::BFloat16 *v, int head_dim, const at::BFloat16 *q, + const float *D, const float *dropout_mask, + at::BFloat16 *du, c10::complex *dfilter, float *dD, + float *dv, at::BFloat16 *dq, + bool gelu, bool gelu_inp, bool gelu_q, int batch_size, int H, int signal_size, + size_t batch_stride, size_t H_stride, int fft_size, + bool output_hbl_layout, bool fftfp16); \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/launch_fftconv.py b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/launch_fftconv.py new file mode 100644 index 0000000000000..6a54352ad57e2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/launch_fftconv.py @@ -0,0 +1,56 @@ +import torch +import torch.nn.functional as F + +from einops import rearrange + +from fftconv import fftconv_fwd, fftconv_bwd + + +def fftconv_ref(u, k, D, dropout_mask): + seqlen = u.shape[-1] + fft_size = 2 * seqlen + k_f = torch.fft.rfft(k, n=fft_size) / fft_size + u_f = torch.fft.rfft(u.to(dtype=k.dtype), n=fft_size) + y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen] + out = y + u * D.unsqueeze(-1) + return (F.gelu(out) * rearrange(dropout_mask, 'b H -> b H 1')).to(dtype=u.dtype) + + +def fftconv_fast(u, k, D, dropout_mask): + """Fuse padding + rfft + pointwise mult + ifft + multiply with D + gelu + dropout + """ + seqlen = u.shape[-1] + fft_size = 2 * seqlen + 
k_f = torch.fft.rfft(k, n=fft_size) + out = fftconv_fwd(u, k_f, D, dropout_mask, fft_size) + return out + + +def fftconv_fast_bwd(dout, u, k, D, dropout_mask=None): + seqlen = u.shape[-1] + fft_size = 2 * seqlen + k_f = torch.fft.rfft(k, n=fft_size) + dx, dk_f, dD = fftconv_bwd(dout, u, k_f, D, dropout_mask, fft_size) + dk = torch.fft.irfft(dk_f, n=fft_size, norm='forward')[..., :seqlen] + return dx, dk, dD + + +device = 'cuda' +dtype = torch.float32 +# dtype = torch.float16 +batch_size = 64 +H = 256 +fft_size = 2048 +seqlen = 1024 +dropout_prob = 0.37 + +torch.manual_seed(0) +u = torch.randn(batch_size, H, seqlen, device=device, dtype=dtype, requires_grad=True) +k = torch.randn(H, seqlen, device=device, requires_grad=True) +D = torch.randn(H, device=device, requires_grad=True) +dropout_mask = F.dropout(torch.ones(batch_size, H, device=device), dropout_prob) + +out = fftconv_ref(u, k, D, dropout_mask) +out = fftconv_fast(u, k, D, dropout_mask) +g = torch.randn_like(out) +fftconv_fast_bwd(g, u, k, D, dropout_mask) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut.h b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut.h new file mode 100644 index 0000000000000..e73454f644adb --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut.h @@ -0,0 +1,240 @@ +// Generated by lut_code_gen.py + +#pragma once + +#include + +namespace cufftdx { +namespace database { +namespace detail { + +#define T_16384_1029 {0.9231440482496219290808880941767711192369,-0.3844542445874408187478366016875952482224} +#define T_16384_1031 {0.9228489040350941197132783599954564124346,-0.3851621740529898585414514400326879695058} +#define T_16384_1033 {0.9225532169323328313126353350526187568903,-0.3858698769375553117022548121894942596555} +#define T_16384_1039 {0.9216629000356947321037637266272213310003,-0.3879916219427848589340612761589000001550} +#define T_16384_1041 
{0.9213650431226423354047483371687121689320,-0.3886984143415191939041619662020821124315} +#define T_16384_1047 {0.9204682209940671100412146188318729400635,-0.3908174179076684962019783142750384286046} +#define T_16384_1049 {0.9201681970742663363438396117999218404293,-0.3915232931679724082130178430816158652306} +#define T_16384_1051 {0.9198676318432229548349710057664196938276,-0.3922289381052103118818763505259994417429} +#define T_16384_1057 {0.9189626900523756303229561126499902456999,-0.3943444868280795989612386165390489622951} +#define T_16384_1059 {0.9186599613476919001797682540200185030699,-0.3950492063232847739229214312217663973570} +#define T_16384_1065 {0.9177485334036612485419937002006918191910,-0.3971619687676916088392431447573471814394} +#define T_16384_1067 {0.9174436440747352206059872514742892235518,-0.3978657561877757542490030573389958590269} +#define T_16384_1075 {0.9162186914727942221858825178060214966536,-0.4006785611882432429631251125101698562503} +#define T_16384_1077 {0.9159111054015098840608288810472004115582,-0.4013811742000167881450067852711072191596} +#define T_16384_1083 {0.9149851150715893055576088954694569110870,-0.4034875948494953123990569565648911520839} +#define T_16384_1085 {0.9146753748615223944540275624603964388371,-0.4041892608938706943355612111190566793084} +#define T_16384_1093 {0.9134310350485547180809930978284683078527,-0.4069935432044665124529103650274919345975} +#define T_16384_1095 {0.9131186062671542424951098837482277303934,-0.4076940162532801115169434069684939458966} +#define T_16384_1101 {0.9121780976748071756432523216062691062689,-0.4097939947368311464082069051073631271720} +#define T_16384_1111 {0.9105998536115589292450067659956403076649,-0.4132891319676909591684932365751592442393} +#define T_16384_1113 {0.9102825970072817574063606116396840661764,-0.4139874316759853956071424363472033292055} +#define T_16384_1119 {0.9093276149677672615112555831728968769312,-0.4160808679295792122942998503276612609625} +#define T_16384_1129 
{0.9077252820676764422103133256314322352409,-0.4195650274929468848128522040497045964003} +#define T_16384_1131 {0.9074032127578081086127781418326776474714,-0.4202611205867228805210800146596739068627} +#define T_16384_1141 {0.9057848629797865536161793897917959839106,-0.4237378694389838384992685860197525471449} +#define T_16384_1147 {0.9048074573903165385146962762519251555204,-0.4258209307336482396522114868275821208954} +#define T_16384_1149 {0.9044805907214682472172739835514221340418,-0.4265147840440515203397353616310283541679} +#define T_16384_1159 {0.9028382799945028347465836304763797670603,-0.4299802788228406225101707605062983930111} +#define T_16384_1165 {0.9018465186139017486510738308425061404705,-0.4320565435958415023698364620940992608666} +#define T_16384_1167 {0.9015148701612787363046663813292980194092,-0.4327481240607436996370438464509788900614} +#define T_16384_1175 {0.9001829749267568070436595917271915823221,-0.4355118961084920026216593669232679530978} +#define T_16384_1177 {0.8998486767415185827445611721486784517765,-0.4362021996351439501182767344289459288120} +#define T_16384_1183 {0.8988426068272423741234433691715821623802,-0.4382715689524104285546002301998669281602} +#define T_16384_1185 {0.8985061923939019479234957543667405843735,-0.4389608436179843198310379648319212719798} +#define T_16384_1193 {0.8971552509638085481924463238101452589035,-0.4417153559341873148014201433397829532623} +#define T_16384_1195 {0.8968161956755072994340594050299841910601,-0.4424033354012040786251702684239717200398} +#define T_16384_1201 {0.8957958651668135319212638023600447922945,-0.4444657106572340032890622296690708026290} +#define T_16384_1203 {0.8954547007829124494193706595979165285826,-0.4451526466795236447460126782971201464534} +#define T_16384_1209 {0.8944280477979738019911337687517516314983,-0.4472118818997383171698345449840417131782} +#define T_16384_1211 {0.8940847775296419941426506738935131579638,-0.4478977680115973081242941589152906090021} +#define T_16384_1213 
{0.8937409812942710418681713235855568200350,-0.4485833906367392431846496947400737553835} +#define T_16384_1219 {0.8927064388099353875460906238004099577665,-0.4506386735592975978370589018595637753606} +#define T_16384_1221 {0.8923605407319653570752393534348811954260,-0.4513232382057835168076564968941966071725} +#define T_16384_1227 {0.8913196975972413893529733286413829773664,-0.4533753375241777461290837436536094173789} +#define T_16384_1229 {0.8909717009323968595069231923844199627638,-0.4540588377486244331393550055508967489004} +#define T_16384_1231 {0.8906231801318559293534349308174569159746,-0.4547420708619554496898729212261969223619} +#define T_16384_1237 {0.8895744749678545781890193211438599973917,-0.4567901635167571638973527115012984722853} +#define T_16384_1245 {0.8881688759895617346629137500713113695383,-0.4595171898019034806281979399500414729118} +#define T_16384_1247 {0.8878161695102544381796860761824063956738,-0.4601982715701343207292950410192133858800} +#define T_16384_1249 {0.8874629407515688406249410036252811551094,-0.4608790826155786946038972473616013303399} +#define T_16384_1255 {0.8864001228787306008172208748874254524708,-0.4629198874109550776445587416674243286252} +#define T_16384_1263 {0.8849757331116666625447919614089187234640,-0.4656371460734936573722109187656315043569} +#define T_16384_1265 {0.8846183336243699235623694221430923789740,-0.4663157769319444256872486676002154126763} +#define T_16384_1267 {0.8842604137388991869528354072826914489269,-0.4669941334688380019102282858511898666620} +#define T_16384_1281 {0.8817404211168983207969063187192659825087,-0.4717349147228714345558842069294769316912} +#define T_16384_1283 {0.8813783456517069181401780042506288737059,-0.4724110623347640425251370288606267422438} +#define T_16384_1285 {0.8810157516943428746003519336227327585220,-0.4730869320394000543394952273956732824445} +#define T_16384_1299 {0.8784630941679578697289798583369702100754,-0.4778102051912009873291253825300373136997} +#define T_16384_1301 
{0.8780963599777771300125550624215975403786,-0.4784838373380839726678459555841982364655} +#define T_16384_1303 {0.8777291092261315652578446133702527731657,-0.4791571880052533094485056608391460031271} +#define T_16384_1311 {0.8762549449303385085130457810009829699993,-0.4818477679569860283592674932151567190886} +#define T_16384_1317 {0.8751439084295603576535427237104158848524,-0.4838627279907322664698199332633521407843} +#define T_16384_1319 {0.8747725329892841461543184777838177978992,-0.4845338125740161761001445483998395502567} +#define T_16384_1321 {0.8744006429428647919621653272770345211029,-0.4852046121185418225962848737253807485104} +#define T_16384_1329 {0.8729079410757610846260945436370093375444,-0.4878849520193010436130975904234219342470} +#define T_16384_1335 {0.8717830220609931179964746661426033824682,-0.4898921947185951863978914389008423313498} +#define T_16384_1337 {0.8714070230666709493405619468830991536379,-0.4905606997610820196875636156619293615222} +#define T_16384_1339 {0.8710305114460482611704605915292631834745,-0.4912289162193482772167385519423987716436} +#define T_16384_1345 {0.8698979030428063419222439733857754617929,-0.4932318301587279019138065905281109735370} +#define T_16384_1347 {0.8695193431349168555755113629857078194618,-0.4938988883508674820888018075493164360523} +#define T_16384_1353 {0.8683805952085797974504544072260614484549,-0.4958983180705421878187166839779820293188} +#define T_16384_1355 {0.8679999905765735102036728676466736942530,-0.4965642117179492887046876603562850505114} +#define T_16384_1357 {0.8676188753225362315646407296299003064632,-0.4972298132494242239864945531735429540277} +#define T_16384_1363 {0.8664724680717430516097010695375502109528,-0.4992248612335550839169684422813588753343} +#define T_16384_1365 {0.8660893125745867671128053189022466540337,-0.4998892903874613269366022905160207301378} +#define T_16384_1371 {0.8649367899980490159705937003309372812510,-0.5018808118546382868174760005786083638668} +#define T_16384_1373 
{0.8645515978641793441994423119467683136463,-0.5025440623771156856136599344608839601278} +#define T_16384_1381 {0.8630057456648703162471747418749146163464,-0.5051941042306622442481511825462803244591} +#define T_16384_1383 {0.8626180128358167387148114357842132449150,-0.5058558726862688592618155780655797570944} +#define T_16384_1389 {0.8614517705268093239467930288810748606920,-0.5078393910048977222260191410896368324757} +#define T_16384_1391 {0.8610620092454914775714769348269328474998,-0.5084999667985409255166473485587630420923} +#define T_16384_1399 {0.8594979010116017281717404330265708267689,-0.5111392747154643867446566218859516084194} +#define T_16384_1401 {0.8591056093261304482666673720814287662506,-0.5117983509394868901765107693790923804045} +#define T_16384_1407 {0.8579257028561297904190041663241572678089,-0.5137737715948680339295151497935876250267} +#define T_16384_1417 {0.8559491012608269056016752074356190860271,-0.5170600894004319103558486858673859387636} +#define T_16384_1419 {0.8555522694116468596092772713745944201946,-0.5177164419878711454359176968864630907774} +#define T_16384_1425 {0.8543587550032274435807266854681074619293,-0.5196836708511584079772660516027826815844} +#define T_16384_1435 {0.8523595155129470857247042658855207264423,-0.5229562661585901439664780809835065156221} +#define T_16384_1437 {0.8519581624091063787318489630706608295441,-0.5236098638342279176782767535769380629063} +#define T_16384_1447 {0.8499438838467822110445126781996805220842,-0.5268732241359845858141852659173309803009} +#define T_16384_1453 {0.8487293148118171259852715593297034502029,-0.5288275240369618712676924587867688387632} +#define T_16384_1455 {0.8483234595778016418066158621513750404119,-0.5294783356568519838702968627330847084522} +#define T_16384_1465 {0.8462867024905597057582440356782171875238,-0.5327277139286588081290574336890131235123} +#define T_16384_1471 {0.8450586721365954678830689772439654916525,-0.5346735832699555102109911786101292818785} +#define T_16384_1473 
{0.8446483341114178200470519186637829989195,-0.5353215778229071242222403270716313272715} +#define T_16384_1481 {0.8430020155804729409254605343448929488659,-0.5379104030665888824813691826420836150646} +#define T_16384_1483 {0.8425891955507867070451766267069615423679,-0.5385568192318040958710412269283551722765} +#define T_16384_1489 {0.8413477623935019522605216479860246181488,-0.5404941652926952277979921746009495109320} +#define T_16384_1491 {0.8409329611297797768543205165769904851913,-0.5411393119017507968848690325103234499693} +#define T_16384_1499 {0.8392688115274752336247843231831211596727,-0.5437167111624022775018261199875269085169} +#define T_16384_1501 {0.8388515392137657622484425701259169727564,-0.5443602622884003983116940617037471383810} +#define T_16384_1507 {0.8375967624074830419900195010995958000422,-0.5462889927542952106520601773809175938368} +#define T_16384_1509 {0.8371775176705072984972844096773769706488,-0.5469312606782021912721347689512185752392} +#define T_16384_1515 {0.8359168295077663568548587136319838464260,-0.5488561324661352935905256344994995743036} +#define T_16384_1517 {0.8354956162936153507558856290415860712528,-0.5494971111426809606825827358989045023918} +#define T_16384_1519 {0.8350739115789194144667817454319447278976,-0.5501377665642336323159611310984473675489} +#define T_16384_1525 {0.8338058509137863394400369543291162699461,-0.5520577895310749827473273398936726152897} +#define T_16384_1527 {0.8333821826805797305937062446901109069586,-0.5526971481657497742290274800325278192759} +#define T_16384_1533 {0.8321082374357355870841956857475452125072,-0.5546132717413040369436316723295021802187} +#define T_16384_1535 {0.8316826096717451211048910408862866461277,-0.5552513275712139817485990533896256238222} +#define T_16384_1537 {0.8312564926503032136650972461211495101452,-0.5558890567610738075998710883141029626131} +#define T_16384_1543 {0.8299752085494439546309308752825018018484,-0.5578002807397169915404333551123272627592} +#define T_16384_1551 
{0.8282599953843856610546936281025409698486,-0.5603439836795408579561694750736933201551} +#define T_16384_1553 {0.8278299733517299197060879123455379158258,-0.5609790862594381533057230626582168042660} +#define T_16384_1555 {0.8273994643280294658538309704454150050879,-0.5616138588297924227887847337115090340376} +#define T_16384_1561 {0.8261050178446646130581143552262801676989,-0.5635161927503647971704481278720777481794} +#define T_16384_1569 {0.8243722867225512507260987149493303149939,-0.5660479952122714486506538378307595849037} +#define T_16384_1571 {0.8239378909117913707405023160390555858612,-0.5666801142795016010822450880368705838919} +#define T_16384_1573 {0.8235030103995985006903879366291221231222,-0.5673118999834207976107336435234174132347} +#define T_16384_1587 {0.8204452966996520490994271312956698238850,-0.5717250345431971192411424453894142061472} +#define T_16384_1589 {0.8200065478097596782802725101646501570940,-0.5723541399772699156400790343468543142080} +#define T_16384_1591 {0.8195673165311422314616152107191737741232,-0.5729829087101485640687315026298165321350} +#define T_16384_1605 {0.8164792124368653869481704532518051564693,-0.5773748311612448835816735481785144656897} +#define T_16384_1607 {0.8160361313742368061241450050147250294685,-0.5780008929852699095519596994563471525908} +#define T_16384_1609 {0.8155925702585767878005640341143589466810,-0.5786266147862614284136384412704501301050} +#define T_16384_1617 {0.8138135304885671938990299167926423251629,-0.5811260944009776219232321636809501796961} +#define T_16384_1623 {0.8124742229182104757967408659169450402260,-0.5829971158534577035936763422796502709389} +#define T_16384_1625 {0.8120268307956697295679759918130002915859,-0.5836201042355727564014955532911699265242} +#define T_16384_1627 {0.8115789609786658864720720885088667273521,-0.5842427492890169826722512880223803222179} +#define T_16384_1635 {0.8097827100396365329615377959271427243948,-0.5867298888934003864292776597721967846155} +#define T_16384_1641 
{0.8084305189815427228339217435859609395266,-0.5885916207178228942709097282204311341047} +#define T_16384_1643 {0.8079788371173363126231947717315051704645,-0.5892115059726149572938425080792512744665} +#define T_16384_1645 {0.8075266799399971606732151485630311071873,-0.5898310446094587877752246640739031136036} +#define T_16384_1651 {0.8061673591905044178318462400056887418032,-0.5916875771687354346184406495012808591127} +#define T_16384_1653 {0.8057133034233522339562227898568380624056,-0.5923057256912422907646487146848812699318} +#define T_16384_1659 {0.8043482933094607822965826926520094275475,-0.5941580791760368018827875857823528349400} +#define T_16384_1661 {0.8038923432262412571702725472277961671352,-0.5947748317659575789662085298914462327957} +#define T_16384_1663 {0.8034359202338681171795542468316853046417,-0.5953912344651687282848229187948163598776} +#define T_16384_1669 {0.8020638164882354370632810969254933297634,-0.5972383395934374172853154050244484096766} +#define T_16384_1671 {0.8016055045470461548617890912282746285200,-0.5978533391057339052565566817065700888634} +#define T_16384_1677 {0.8002277404201247890114245819859206676483,-0.5996962259862083088890472026832867413759} +#define T_16384_1679 {0.7997675438439256767608753762033302336931,-0.6003098165229804328291152160090859979391} +#define T_16384_1687 {0.7979220554240930018963240399898495525122,-0.6027606435956072150617046645493246614933} +#define T_16384_1689 {0.7974595091474424579658375478175003081560,-0.6033724647929502582499594609544146806002} +#define T_16384_1695 {0.7960690566579879945408038111054338514805,-0.6052057972554965026290574314771220088005} +#define T_16384_1697 {0.7956046355171880746226520386699121445417,-0.6058161965015149696967000636504963040352} +#define T_16384_1705 {0.7937422733531002139173438081343192607164,-0.6082542260373144937801725973258726298809} +#define T_16384_1707 {0.7932755147813306262349897224339656531811,-0.6088628397664082037010757630923762917519} +#define T_16384_1713 
{0.7918724401844404736650062659464310854673,-0.6106865304526862825440503002027980983257} +#define T_16384_1723 {0.7895246694419821853472285511088557541370,-0.6137188251492117219143551665183622390032} +#define T_16384_1725 {0.7890537208161518822890911906142719089985,-0.6143242024095959541440947759838309139013} +#define T_16384_1731 {0.7876380909683674547139276000962127000093,-0.6161381644206969099286652635782957077026} +#define T_16384_1741 {0.7852694446596758526268899913702625781298,-0.6191541805430084144390434630622621625662} +#define T_16384_1743 {0.7847943284204992320240990011370740830898,-0.6197562924884406632131117476092185825109} +#define T_16384_1753 {0.7824118277098365270560975659464020282030,-0.6227613763390863477198422515357378870249} +#define T_16384_1759 {0.7809768017677537477183591363427694886923,-0.6245600332238772089965550549095496535301} +#define T_16384_1761 {0.7804975405545319100397705369687173515558,-0.6251588511637076184257466593408025801182} +#define T_16384_1771 {0.7780943529377027934046395785117056220770,-0.6281474173523740045510521667893044650555} +#define T_16384_1777 {0.7766469453107620601883809285936877131462,-0.6299361256027964373060967773199081420898} +#define T_16384_1779 {0.7761635619603043378944562391552608460188,-0.6305316210033345969421247900754678994417} +#define T_16384_1787 {0.7742254654358606824615662844735197722912,-0.6329098898505417514215309893188532441854} +#define T_16384_1789 {0.7737398019492618406189876623102463781834,-0.6335035271247643207104260909545700997114} +#define T_16384_1795 {0.7722800816064743223421373841119930148125,-0.6352822015088234186563909133838023990393} +#define T_16384_1797 {0.7717925991520101502985085062391590327024,-0.6358743459946977205632379082089755684137} +#define T_16384_1805 {0.7698381319488798446570854139281436800957,-0.6382391797731152838224488732521422207355} +#define T_16384_1807 {0.7693483822389822757159549837524536997080,-0.6388294504374862903262055624509230256081} +#define T_16384_1813 
{0.7678764187360606063847967561741825193167,-0.6405980062013010289945214026374742388725} +#define T_16384_1815 {0.7673848604061417333355166192632168531418,-0.6411867715568112524593402667960617691278} +#define T_16384_1821 {0.7659074779779443398197713577246759086847,-0.6429508030770820781185648229438811540604} +#define T_16384_1823 {0.7654141156547382696118120293249376118183,-0.6435380575820477400128538647550158202648} +#define T_16384_1825 {0.7649203030581284146194320783251896500587,-0.6441249335101545403503564557468052953482} +#define T_16384_1831 {0.7634361665341720115662837997660972177982,-0.6458832863819963243656729900976642966270} +#define T_16384_1833 {0.7629405557515657188005775424244347959757,-0.6464686445524577829147006013954523950815} +#define T_16384_1839 {0.7614510316616536211853372151381336152554,-0.6482224358824704157910900903516449034214} +#define T_16384_1841 {0.7609536273579281528967044323508162051439,-0.6488062707856725452870705339591950178146} +#define T_16384_1843 {0.7604557754047892581539258571865502744913,-0.6493897240128616576981812613666988909245} +#define T_16384_1849 {0.7589595365789424397107154618424829095602,-0.6511377902071703305253436155908275395632} +#define T_16384_1857 {0.7569583021837504865914070251164957880974,-0.6534631808718023293636179005261510610580} +#define T_16384_1859 {0.7564568796008337425718082158709876239300,-0.6540435683534926436522027870523743331432} +#define T_16384_1861 {0.7559550120138244233558566520514432340860,-0.6546235710782026817611267688334919512272} +#define T_16384_1867 {0.7544467421819064378851749097520951181650,-0.6563612672995779995233078807359561324120} +#define T_16384_1875 {0.7524295036229123878612767839513253420591,-0.6586727883234418934321752203686628490686} +#define T_16384_1877 {0.7519240866536035516887181984202470630407,-0.6592497007281414855839329902664758265018} +#define T_16384_1879 {0.7514182273467274741918231484305579215288,-0.6598262253132273214006886519200634211302} +#define T_16384_1893 
{0.7478648517765094094755795595119707286358,-0.6638510099994573421255950052000116556883} +#define T_16384_1895 {0.7473554645039401922446131720789708197117,-0.6644244198372751819547943341603968292475} +#define T_16384_1897 {0.7468456375814065406615327447070740163326,-0.6649974388113253365162336194771341979504} +#define T_16384_1911 {0.7432645641503216049628122163994703441858,-0.6689975991574502733882923166675027459860} +#define T_16384_1913 {0.7427512308468090518331905514060053974390,-0.6695674791053924934658425627276301383972} +#define T_16384_1915 {0.7422374606018840026422367373015731573105,-0.6701369651640376456924741432885639369488} +#define T_16384_1923 {0.7401780162566662379930448878440074622631,-0.6724109638088497931107667682226747274399} +#define T_16384_1929 {0.7386288599481748429198546546103898435831,-0.6741123105623123556995324179297313094139} +#define T_16384_1931 {0.7381116050740642586802664482092950493097,-0.6746786334655845429608689300948753952980} +#define T_16384_1933 {0.7375939159879135731401333941903430968523,-0.6752445594727992661532312013150658458471} +#define T_16384_1941 {0.7355188236175989047183065849822014570236,-0.6775042878861974315896077314391732215881} +#define T_16384_1947 {0.7339579600614959398185987993201706558466,-0.6791949004979112025637277838541194796562} +#define T_16384_1949 {0.7334368082639957098223248976864852011204,-0.6797576393712120301771051344985608011484} +#define T_16384_1951 {0.7329152250045177785509054046997334808111,-0.6803199783606072026387323603557888418436} +#define T_16384_1957 {0.7313478895238255672595073519914876669645,-0.6820045927184408274257521043182350695133} +#define T_16384_1959 {0.7308245834873121626884540091850794851780,-0.6825653288664732531998424747143872082233} +#define T_16384_1965 {0.7292520870587869685763848792703356593847,-0.6842451267787030833034123133984394371510} +#define T_16384_1967 {0.7287270631707938317589423604658804833889,-0.6848042548075106150662350046331994235516} +#define T_16384_1969 
{0.7282016105914446146840646179043687880039,-0.6853629799836187252992658613948151469231} +#define T_16384_1975 {0.7266226837976228480897589179221540689468,-0.6870367351100956643250583510962314903736} +#define T_16384_1977 {0.7260955195464710021369114656408783048391,-0.6875938455909421653799995510780718177557} +#define T_16384_1983 {0.7245114651750196310686646938847843557596,-0.6892627487612734693556149068172089755535} +#define T_16384_1985 {0.7239825942139355152704638385330326855183,-0.6898182393031224712842686130898073315620} +#define T_16384_1993 {0.7218628544814963410303221280628349632025,-0.6920361401833187153798121471481863409281} +#define T_16384_1995 {0.7213318571350962882249291396874468773603,-0.6925895984506503788580289437959436327219} +#define T_16384_2001 {0.7197363203009510268515214193030260503292,-0.6942475273558033066478856198955327272415} +#define T_16384_2003 {0.7192036274674912244009306050429586321115,-0.6947993539415548980286985170096158981323} +#define T_16384_2011 {0.7170686283814374784029155307507608085871,-0.6970025697163274580603342656104359775782} +#define T_16384_2013 {0.7165338232418266839474085827532690018415,-0.6975523493978431632811521012627054005861} +#define T_16384_2019 {0.7149268799723594858264164031425025314093,-0.6991992250374621242769990203669294714928} +#define T_16384_2029 {0.7122402339424455108840561479155439883471,-0.7019357870586243608457266418554354459047} +#define T_16384_2031 {0.7117016464931029684493068998563103377819,-0.7024818619573079958584571613755542784929} +#define T_16384_2037 {0.7100833733592027963155146608187351375818,-0.7041176058577253149550756461394485086203} +#define T_16384_2047 {0.7073779012376421038155172027472872287035,-0.7068355571422737515518974760198034346104} + +static const __device__ float2 lut_mine_sp_8_8192[1025] = { + 
T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_1024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_19
9,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_32_1,T_8192_257,T_4096_129,T_8192_259,T_2048_65,T_8192_261,T_4096_131,T_8192_263,T_1024_33,T_8192_265,T_4096_133,T_8192_267,T_2048_67,T_8192_269,T_4096_135,T_8192_271,T_512_17,T_8192_273,T_4096_137,T_8192_275,T_2048_69,T_8192_277,T_4096_139,T_8192_279,T_1024_35,T_8192_281,T_4096_141,T_8192_283,T_2048_71,T_8192_285,T_4096_143,T_8192_287,T_256_9,T_8192_289,T_4096_145,T_8192_291,T_2048_73,T_8192_293,T_4096_147,T_8192_295,T_1024_37,T_8192_297,T_4096_149,T_8192_299,T_2048_75,T_8192_301,T_4096_151,T_8192_303,T_512_19,T_8192_305,T_4096_153,T_8192_307,T_2048_77,T_8192_309,T_4096_155,T_8192_311,T_1024_39,T_8192_313,T_4096_157,T_8192_315,T_2048_79,T_8192_317,T_4096_159,T_8192_319,T_128_5,T_8192_321,T_4096_161,T_8192_323,T_2048_81,T_8192_325,T_4096_163,T_8192_327,T_1024_41,T_8192_329,T_4096_165,T_8192_331,T_2048_83,T_8192_333,T_4096_167,T_8192_335,T_512_21,T_8192_337,T_4096_169,T_8192_339,T_2048_85,T_8192_341,T_4096_171,T_8192_343,T_1024_43,T_8192_345,T_4096_173,T_8192_347,T_2048_87,T_8192_349,T_4096_175,T_8192_351,T_256_11,T_8192_353,T_4096_177,T_8192_355,T_2048_89,T_8192_357,T_4096_179,T_8192_359,T_1024_45,T_8192_361,T_4096_181,T_8192_363,T_2048_91,T_8192_365,T_4096_183,T_8192_367,T_512_23,T_8192_369,T_4096_185,T_8192_371,T_2048_93,T_8192_373,T_4096_187,T_8192_375,T_1024_47,T_8192_377,T_4096_189,T_8192_379,T_2048_95,T_8192_381,T_4096_191,T_8192_383,T_64_3,T_8192_385,T_4096_193,T_8192_
387,T_2048_97,T_8192_389,T_4096_195,T_8192_391,T_1024_49,T_8192_393,T_4096_197,T_8192_395,T_2048_99,T_8192_397,T_4096_199,T_8192_399,T_512_25,T_8192_401,T_4096_201,T_8192_403,T_2048_101,T_8192_405,T_4096_203,T_8192_407,T_1024_51,T_8192_409,T_4096_205,T_8192_411,T_2048_103,T_8192_413,T_4096_207,T_8192_415,T_256_13,T_8192_417,T_4096_209,T_8192_419,T_2048_105,T_8192_421,T_4096_211,T_8192_423,T_1024_53,T_8192_425,T_4096_213,T_8192_427,T_2048_107,T_8192_429,T_4096_215,T_8192_431,T_512_27,T_8192_433,T_4096_217,T_8192_435,T_2048_109,T_8192_437,T_4096_219,T_8192_439,T_1024_55,T_8192_441,T_4096_221,T_8192_443,T_2048_111,T_8192_445,T_4096_223,T_8192_447,T_128_7,T_8192_449,T_4096_225,T_8192_451,T_2048_113,T_8192_453,T_4096_227,T_8192_455,T_1024_57,T_8192_457,T_4096_229,T_8192_459,T_2048_115,T_8192_461,T_4096_231,T_8192_463,T_512_29,T_8192_465,T_4096_233,T_8192_467,T_2048_117,T_8192_469,T_4096_235,T_8192_471,T_1024_59,T_8192_473,T_4096_237,T_8192_475,T_2048_119,T_8192_477,T_4096_239,T_8192_479,T_256_15,T_8192_481,T_4096_241,T_8192_483,T_2048_121,T_8192_485,T_4096_243,T_8192_487,T_1024_61,T_8192_489,T_4096_245,T_8192_491,T_2048_123,T_8192_493,T_4096_247,T_8192_495,T_512_31,T_8192_497,T_4096_249,T_8192_499,T_2048_125,T_8192_501,T_4096_251,T_8192_503,T_1024_63,T_8192_505,T_4096_253,T_8192_507,T_2048_127,T_8192_509,T_4096_255,T_8192_511,T_16_1,T_8192_513,T_4096_257,T_8192_515,T_2048_129,T_8192_517,T_4096_259,T_8192_519,T_1024_65,T_8192_521,T_4096_261,T_8192_523,T_2048_131,T_8192_525,T_4096_263,T_8192_527,T_512_33,T_8192_529,T_4096_265,T_8192_531,T_2048_133,T_8192_533,T_4096_267,T_8192_535,T_1024_67,T_8192_537,T_4096_269,T_8192_539,T_2048_135,T_8192_541,T_4096_271,T_8192_543,T_256_17,T_8192_545,T_4096_273,T_8192_547,T_2048_137,T_8192_549,T_4096_275,T_8192_551,T_1024_69,T_8192_553,T_4096_277,T_8192_555,T_2048_139,T_8192_557,T_4096_279,T_8192_559,T_512_35,T_8192_561,T_4096_281,T_8192_563,T_2048_141,T_8192_565,T_4096_283,T_8192_567,T_1024_71,T_8192_569,T_4096_285,T_8192_571,T_2048_143,
T_8192_573,T_4096_287,T_8192_575,T_128_9,T_8192_577,T_4096_289,T_8192_579,T_2048_145,T_8192_581,T_4096_291,T_8192_583,T_1024_73,T_8192_585,T_4096_293,T_8192_587,T_2048_147,T_8192_589,T_4096_295,T_8192_591,T_512_37,T_8192_593,T_4096_297,T_8192_595,T_2048_149,T_8192_597,T_4096_299,T_8192_599,T_1024_75,T_8192_601,T_4096_301,T_8192_603,T_2048_151,T_8192_605,T_4096_303,T_8192_607,T_256_19,T_8192_609,T_4096_305,T_8192_611,T_2048_153,T_8192_613,T_4096_307,T_8192_615,T_1024_77,T_8192_617,T_4096_309,T_8192_619,T_2048_155,T_8192_621,T_4096_311,T_8192_623,T_512_39,T_8192_625,T_4096_313,T_8192_627,T_2048_157,T_8192_629,T_4096_315,T_8192_631,T_1024_79,T_8192_633,T_4096_317,T_8192_635,T_2048_159,T_8192_637,T_4096_319,T_8192_639,T_64_5,T_8192_641,T_4096_321,T_8192_643,T_2048_161,T_8192_645,T_4096_323,T_8192_647,T_1024_81,T_8192_649,T_4096_325,T_8192_651,T_2048_163,T_8192_653,T_4096_327,T_8192_655,T_512_41,T_8192_657,T_4096_329,T_8192_659,T_2048_165,T_8192_661,T_4096_331,T_8192_663,T_1024_83,T_8192_665,T_4096_333,T_8192_667,T_2048_167,T_8192_669,T_4096_335,T_8192_671,T_256_21,T_8192_673,T_4096_337,T_8192_675,T_2048_169,T_8192_677,T_4096_339,T_8192_679,T_1024_85,T_8192_681,T_4096_341,T_8192_683,T_2048_171,T_8192_685,T_4096_343,T_8192_687,T_512_43,T_8192_689,T_4096_345,T_8192_691,T_2048_173,T_8192_693,T_4096_347,T_8192_695,T_1024_87,T_8192_697,T_4096_349,T_8192_699,T_2048_175,T_8192_701,T_4096_351,T_8192_703,T_128_11,T_8192_705,T_4096_353,T_8192_707,T_2048_177,T_8192_709,T_4096_355,T_8192_711,T_1024_89,T_8192_713,T_4096_357,T_8192_715,T_2048_179,T_8192_717,T_4096_359,T_8192_719,T_512_45,T_8192_721,T_4096_361,T_8192_723,T_2048_181,T_8192_725,T_4096_363,T_8192_727,T_1024_91,T_8192_729,T_4096_365,T_8192_731,T_2048_183,T_8192_733,T_4096_367,T_8192_735,T_256_23,T_8192_737,T_4096_369,T_8192_739,T_2048_185,T_8192_741,T_4096_371,T_8192_743,T_1024_93,T_8192_745,T_4096_373,T_8192_747,T_2048_187,T_8192_749,T_4096_375,T_8192_751,T_512_47,T_8192_753,T_4096_377,T_8192_755,T_2048_189,T_8192_757,T_4
096_379,T_8192_759,T_1024_95,T_8192_761,T_4096_381,T_8192_763,T_2048_191,T_8192_765,T_4096_383,T_8192_767,T_32_3,T_8192_769,T_4096_385,T_8192_771,T_2048_193,T_8192_773,T_4096_387,T_8192_775,T_1024_97,T_8192_777,T_4096_389,T_8192_779,T_2048_195,T_8192_781,T_4096_391,T_8192_783,T_512_49,T_8192_785,T_4096_393,T_8192_787,T_2048_197,T_8192_789,T_4096_395,T_8192_791,T_1024_99,T_8192_793,T_4096_397,T_8192_795,T_2048_199,T_8192_797,T_4096_399,T_8192_799,T_256_25,T_8192_801,T_4096_401,T_8192_803,T_2048_201,T_8192_805,T_4096_403,T_8192_807,T_1024_101,T_8192_809,T_4096_405,T_8192_811,T_2048_203,T_8192_813,T_4096_407,T_8192_815,T_512_51,T_8192_817,T_4096_409,T_8192_819,T_2048_205,T_8192_821,T_4096_411,T_8192_823,T_1024_103,T_8192_825,T_4096_413,T_8192_827,T_2048_207,T_8192_829,T_4096_415,T_8192_831,T_128_13,T_8192_833,T_4096_417,T_8192_835,T_2048_209,T_8192_837,T_4096_419,T_8192_839,T_1024_105,T_8192_841,T_4096_421,T_8192_843,T_2048_211,T_8192_845,T_4096_423,T_8192_847,T_512_53,T_8192_849,T_4096_425,T_8192_851,T_2048_213,T_8192_853,T_4096_427,T_8192_855,T_1024_107,T_8192_857,T_4096_429,T_8192_859,T_2048_215,T_8192_861,T_4096_431,T_8192_863,T_256_27,T_8192_865,T_4096_433,T_8192_867,T_2048_217,T_8192_869,T_4096_435,T_8192_871,T_1024_109,T_8192_873,T_4096_437,T_8192_875,T_2048_219,T_8192_877,T_4096_439,T_8192_879,T_512_55,T_8192_881,T_4096_441,T_8192_883,T_2048_221,T_8192_885,T_4096_443,T_8192_887,T_1024_111,T_8192_889,T_4096_445,T_8192_891,T_2048_223,T_8192_893,T_4096_447,T_8192_895,T_64_7,T_8192_897,T_4096_449,T_8192_899,T_2048_225,T_8192_901,T_4096_451,T_8192_903,T_1024_113,T_8192_905,T_4096_453,T_8192_907,T_2048_227,T_8192_909,T_4096_455,T_8192_911,T_512_57,T_8192_913,T_4096_457,T_8192_915,T_2048_229,T_8192_917,T_4096_459,T_8192_919,T_1024_115,T_8192_921,T_4096_461,T_8192_923,T_2048_231,T_8192_925,T_4096_463,T_8192_927,T_256_29,T_8192_929,T_4096_465,T_8192_931,T_2048_233,T_8192_933,T_4096_467,T_8192_935,T_1024_117,T_8192_937,T_4096_469,T_8192_939,T_2048_235,T_8192_941,T_4096_4
71,T_8192_943,T_512_59,T_8192_945,T_4096_473,T_8192_947,T_2048_237,T_8192_949,T_4096_475,T_8192_951,T_1024_119,T_8192_953,T_4096_477,T_8192_955,T_2048_239,T_8192_957,T_4096_479,T_8192_959,T_128_15,T_8192_961,T_4096_481,T_8192_963,T_2048_241,T_8192_965,T_4096_483,T_8192_967,T_1024_121,T_8192_969,T_4096_485,T_8192_971,T_2048_243,T_8192_973,T_4096_487,T_8192_975,T_512_61,T_8192_977,T_4096_489,T_8192_979,T_2048_245,T_8192_981,T_4096_491,T_8192_983,T_1024_123,T_8192_985,T_4096_493,T_8192_987,T_2048_247,T_8192_989,T_4096_495,T_8192_991,T_256_31,T_8192_993,T_4096_497,T_8192_995,T_2048_249,T_8192_997,T_4096_499,T_8192_999,T_1024_125,T_8192_1001,T_4096_501,T_8192_1003,T_2048_251,T_8192_1005,T_4096_503,T_8192_1007,T_512_63,T_8192_1009,T_4096_505,T_8192_1011,T_2048_253,T_8192_1013,T_4096_507,T_8192_1015,T_1024_127,T_8192_1017,T_4096_509,T_8192_1019,T_2048_255,T_8192_1021,T_4096_511,T_8192_1023,T_8_1 +}; + +static const __device__ float2 lut_mine_sp_8_16384[2049] = { + T_2_0,T_16384_1,T_8192_1,T_16384_3,T_4096_1,T_16384_5,T_8192_3,T_16384_7,T_2048_1,T_16384_9,T_8192_5,T_16384_11,T_4096_3,T_16384_13,T_8192_7,T_16384_15,T_1024_1,T_16384_17,T_8192_9,T_16384_19,T_4096_5,T_16384_21,T_8192_11,T_16384_23,T_2048_3,T_16384_25,T_8192_13,T_16384_27,T_4096_7,T_16384_29,T_8192_15,T_16384_31,T_512_1,T_16384_33,T_8192_17,T_16384_35,T_4096_9,T_16384_37,T_8192_19,T_16384_39,T_2048_5,T_16384_41,T_8192_21,T_16384_43,T_4096_11,T_16384_45,T_8192_23,T_16384_47,T_1024_3,T_16384_49,T_8192_25,T_16384_51,T_4096_13,T_16384_53,T_8192_27,T_16384_55,T_2048_7,T_16384_57,T_8192_29,T_16384_59,T_4096_15,T_16384_61,T_8192_31,T_16384_63,T_256_1,T_16384_65,T_8192_33,T_16384_67,T_4096_17,T_16384_69,T_8192_35,T_16384_71,T_2048_9,T_16384_73,T_8192_37,T_16384_75,T_4096_19,T_16384_77,T_8192_39,T_16384_79,T_1024_5,T_16384_81,T_8192_41,T_16384_83,T_4096_21,T_16384_85,T_8192_43,T_16384_87,T_2048_11,T_16384_89,T_8192_45,T_16384_91,T_4096_23,T_16384_93,T_8192_47,T_16384_95,T_512_3,T_16384_97,T_8192_49,T_16384_99,T_4096_25,T
_16384_101,T_8192_51,T_16384_103,T_2048_13,T_16384_105,T_8192_53,T_16384_107,T_4096_27,T_16384_109,T_8192_55,T_16384_111,T_1024_7,T_16384_113,T_8192_57,T_16384_115,T_4096_29,T_16384_117,T_8192_59,T_16384_119,T_2048_15,T_16384_121,T_8192_61,T_16384_123,T_4096_31,T_16384_125,T_8192_63,T_16384_127,T_128_1,T_16384_129,T_8192_65,T_16384_131,T_4096_33,T_16384_133,T_8192_67,T_16384_135,T_2048_17,T_16384_137,T_8192_69,T_16384_139,T_4096_35,T_16384_141,T_8192_71,T_16384_143,T_1024_9,T_16384_145,T_8192_73,T_16384_147,T_4096_37,T_16384_149,T_8192_75,T_16384_151,T_2048_19,T_16384_153,T_8192_77,T_16384_155,T_4096_39,T_16384_157,T_8192_79,T_16384_159,T_512_5,T_16384_161,T_8192_81,T_16384_163,T_4096_41,T_16384_165,T_8192_83,T_16384_167,T_2048_21,T_16384_169,T_8192_85,T_16384_171,T_4096_43,T_16384_173,T_8192_87,T_16384_175,T_1024_11,T_16384_177,T_8192_89,T_16384_179,T_4096_45,T_16384_181,T_8192_91,T_16384_183,T_2048_23,T_16384_185,T_8192_93,T_16384_187,T_4096_47,T_16384_189,T_8192_95,T_16384_191,T_256_3,T_16384_193,T_8192_97,T_16384_195,T_4096_49,T_16384_197,T_8192_99,T_16384_199,T_2048_25,T_16384_201,T_8192_101,T_16384_203,T_4096_51,T_16384_205,T_8192_103,T_16384_207,T_1024_13,T_16384_209,T_8192_105,T_16384_211,T_4096_53,T_16384_213,T_8192_107,T_16384_215,T_2048_27,T_16384_217,T_8192_109,T_16384_219,T_4096_55,T_16384_221,T_8192_111,T_16384_223,T_512_7,T_16384_225,T_8192_113,T_16384_227,T_4096_57,T_16384_229,T_8192_115,T_16384_231,T_2048_29,T_16384_233,T_8192_117,T_16384_235,T_4096_59,T_16384_237,T_8192_119,T_16384_239,T_1024_15,T_16384_241,T_8192_121,T_16384_243,T_4096_61,T_16384_245,T_8192_123,T_16384_247,T_2048_31,T_16384_249,T_8192_125,T_16384_251,T_4096_63,T_16384_253,T_8192_127,T_16384_255,T_64_1,T_16384_257,T_8192_129,T_16384_259,T_4096_65,T_16384_261,T_8192_131,T_16384_263,T_2048_33,T_16384_265,T_8192_133,T_16384_267,T_4096_67,T_16384_269,T_8192_135,T_16384_271,T_1024_17,T_16384_273,T_8192_137,T_16384_275,T_4096_69,T_16384_277,T_8192_139,T_16384_279,T_2048_35,T_16384_281,T_
8192_141,T_16384_283,T_4096_71,T_16384_285,T_8192_143,T_16384_287,T_512_9,T_16384_289,T_8192_145,T_16384_291,T_4096_73,T_16384_293,T_8192_147,T_16384_295,T_2048_37,T_16384_297,T_8192_149,T_16384_299,T_4096_75,T_16384_301,T_8192_151,T_16384_303,T_1024_19,T_16384_305,T_8192_153,T_16384_307,T_4096_77,T_16384_309,T_8192_155,T_16384_311,T_2048_39,T_16384_313,T_8192_157,T_16384_315,T_4096_79,T_16384_317,T_8192_159,T_16384_319,T_256_5,T_16384_321,T_8192_161,T_16384_323,T_4096_81,T_16384_325,T_8192_163,T_16384_327,T_2048_41,T_16384_329,T_8192_165,T_16384_331,T_4096_83,T_16384_333,T_8192_167,T_16384_335,T_1024_21,T_16384_337,T_8192_169,T_16384_339,T_4096_85,T_16384_341,T_8192_171,T_16384_343,T_2048_43,T_16384_345,T_8192_173,T_16384_347,T_4096_87,T_16384_349,T_8192_175,T_16384_351,T_512_11,T_16384_353,T_8192_177,T_16384_355,T_4096_89,T_16384_357,T_8192_179,T_16384_359,T_2048_45,T_16384_361,T_8192_181,T_16384_363,T_4096_91,T_16384_365,T_8192_183,T_16384_367,T_1024_23,T_16384_369,T_8192_185,T_16384_371,T_4096_93,T_16384_373,T_8192_187,T_16384_375,T_2048_47,T_16384_377,T_8192_189,T_16384_379,T_4096_95,T_16384_381,T_8192_191,T_16384_383,T_128_3,T_16384_385,T_8192_193,T_16384_387,T_4096_97,T_16384_389,T_8192_195,T_16384_391,T_2048_49,T_16384_393,T_8192_197,T_16384_395,T_4096_99,T_16384_397,T_8192_199,T_16384_399,T_1024_25,T_16384_401,T_8192_201,T_16384_403,T_4096_101,T_16384_405,T_8192_203,T_16384_407,T_2048_51,T_16384_409,T_8192_205,T_16384_411,T_4096_103,T_16384_413,T_8192_207,T_16384_415,T_512_13,T_16384_417,T_8192_209,T_16384_419,T_4096_105,T_16384_421,T_8192_211,T_16384_423,T_2048_53,T_16384_425,T_8192_213,T_16384_427,T_4096_107,T_16384_429,T_8192_215,T_16384_431,T_1024_27,T_16384_433,T_8192_217,T_16384_435,T_4096_109,T_16384_437,T_8192_219,T_16384_439,T_2048_55,T_16384_441,T_8192_221,T_16384_443,T_4096_111,T_16384_445,T_8192_223,T_16384_447,T_256_7,T_16384_449,T_8192_225,T_16384_451,T_4096_113,T_16384_453,T_8192_227,T_16384_455,T_2048_57,T_16384_457,T_8192_229,T_16384_459,T_
4096_115,T_16384_461,T_8192_231,T_16384_463,T_1024_29,T_16384_465,T_8192_233,T_16384_467,T_4096_117,T_16384_469,T_8192_235,T_16384_471,T_2048_59,T_16384_473,T_8192_237,T_16384_475,T_4096_119,T_16384_477,T_8192_239,T_16384_479,T_512_15,T_16384_481,T_8192_241,T_16384_483,T_4096_121,T_16384_485,T_8192_243,T_16384_487,T_2048_61,T_16384_489,T_8192_245,T_16384_491,T_4096_123,T_16384_493,T_8192_247,T_16384_495,T_1024_31,T_16384_497,T_8192_249,T_16384_499,T_4096_125,T_16384_501,T_8192_251,T_16384_503,T_2048_63,T_16384_505,T_8192_253,T_16384_507,T_4096_127,T_16384_509,T_8192_255,T_16384_511,T_32_1,T_16384_513,T_8192_257,T_16384_515,T_4096_129,T_16384_517,T_8192_259,T_16384_519,T_2048_65,T_16384_521,T_8192_261,T_16384_523,T_4096_131,T_16384_525,T_8192_263,T_16384_527,T_1024_33,T_16384_529,T_8192_265,T_16384_531,T_4096_133,T_16384_533,T_8192_267,T_16384_535,T_2048_67,T_16384_537,T_8192_269,T_16384_539,T_4096_135,T_16384_541,T_8192_271,T_16384_543,T_512_17,T_16384_545,T_8192_273,T_16384_547,T_4096_137,T_16384_549,T_8192_275,T_16384_551,T_2048_69,T_16384_553,T_8192_277,T_16384_555,T_4096_139,T_16384_557,T_8192_279,T_16384_559,T_1024_35,T_16384_561,T_8192_281,T_16384_563,T_4096_141,T_16384_565,T_8192_283,T_16384_567,T_2048_71,T_16384_569,T_8192_285,T_16384_571,T_4096_143,T_16384_573,T_8192_287,T_16384_575,T_256_9,T_16384_577,T_8192_289,T_16384_579,T_4096_145,T_16384_581,T_8192_291,T_16384_583,T_2048_73,T_16384_585,T_8192_293,T_16384_587,T_4096_147,T_16384_589,T_8192_295,T_16384_591,T_1024_37,T_16384_593,T_8192_297,T_16384_595,T_4096_149,T_16384_597,T_8192_299,T_16384_599,T_2048_75,T_16384_601,T_8192_301,T_16384_603,T_4096_151,T_16384_605,T_8192_303,T_16384_607,T_512_19,T_16384_609,T_8192_305,T_16384_611,T_4096_153,T_16384_613,T_8192_307,T_16384_615,T_2048_77,T_16384_617,T_8192_309,T_16384_619,T_4096_155,T_16384_621,T_8192_311,T_16384_623,T_1024_39,T_16384_625,T_8192_313,T_16384_627,T_4096_157,T_16384_629,T_8192_315,T_16384_631,T_2048_79,T_16384_633,T_8192_317,T_16384_635,T_4096_1
59,T_16384_637,T_8192_319,T_16384_639,T_128_5,T_16384_641,T_8192_321,T_16384_643,T_4096_161,T_16384_645,T_8192_323,T_16384_647,T_2048_81,T_16384_649,T_8192_325,T_16384_651,T_4096_163,T_16384_653,T_8192_327,T_16384_655,T_1024_41,T_16384_657,T_8192_329,T_16384_659,T_4096_165,T_16384_661,T_8192_331,T_16384_663,T_2048_83,T_16384_665,T_8192_333,T_16384_667,T_4096_167,T_16384_669,T_8192_335,T_16384_671,T_512_21,T_16384_673,T_8192_337,T_16384_675,T_4096_169,T_16384_677,T_8192_339,T_16384_679,T_2048_85,T_16384_681,T_8192_341,T_16384_683,T_4096_171,T_16384_685,T_8192_343,T_16384_687,T_1024_43,T_16384_689,T_8192_345,T_16384_691,T_4096_173,T_16384_693,T_8192_347,T_16384_695,T_2048_87,T_16384_697,T_8192_349,T_16384_699,T_4096_175,T_16384_701,T_8192_351,T_16384_703,T_256_11,T_16384_705,T_8192_353,T_16384_707,T_4096_177,T_16384_709,T_8192_355,T_16384_711,T_2048_89,T_16384_713,T_8192_357,T_16384_715,T_4096_179,T_16384_717,T_8192_359,T_16384_719,T_1024_45,T_16384_721,T_8192_361,T_16384_723,T_4096_181,T_16384_725,T_8192_363,T_16384_727,T_2048_91,T_16384_729,T_8192_365,T_16384_731,T_4096_183,T_16384_733,T_8192_367,T_16384_735,T_512_23,T_16384_737,T_8192_369,T_16384_739,T_4096_185,T_16384_741,T_8192_371,T_16384_743,T_2048_93,T_16384_745,T_8192_373,T_16384_747,T_4096_187,T_16384_749,T_8192_375,T_16384_751,T_1024_47,T_16384_753,T_8192_377,T_16384_755,T_4096_189,T_16384_757,T_8192_379,T_16384_759,T_2048_95,T_16384_761,T_8192_381,T_16384_763,T_4096_191,T_16384_765,T_8192_383,T_16384_767,T_64_3,T_16384_769,T_8192_385,T_16384_771,T_4096_193,T_16384_773,T_8192_387,T_16384_775,T_2048_97,T_16384_777,T_8192_389,T_16384_779,T_4096_195,T_16384_781,T_8192_391,T_16384_783,T_1024_49,T_16384_785,T_8192_393,T_16384_787,T_4096_197,T_16384_789,T_8192_395,T_16384_791,T_2048_99,T_16384_793,T_8192_397,T_16384_795,T_4096_199,T_16384_797,T_8192_399,T_16384_799,T_512_25,T_16384_801,T_8192_401,T_16384_803,T_4096_201,T_16384_805,T_8192_403,T_16384_807,T_2048_101,T_16384_809,T_8192_405,T_16384_811,T_4096_203,T_1
6384_813,T_8192_407,T_16384_815,T_1024_51,T_16384_817,T_8192_409,T_16384_819,T_4096_205,T_16384_821,T_8192_411,T_16384_823,T_2048_103,T_16384_825,T_8192_413,T_16384_827,T_4096_207,T_16384_829,T_8192_415,T_16384_831,T_256_13,T_16384_833,T_8192_417,T_16384_835,T_4096_209,T_16384_837,T_8192_419,T_16384_839,T_2048_105,T_16384_841,T_8192_421,T_16384_843,T_4096_211,T_16384_845,T_8192_423,T_16384_847,T_1024_53,T_16384_849,T_8192_425,T_16384_851,T_4096_213,T_16384_853,T_8192_427,T_16384_855,T_2048_107,T_16384_857,T_8192_429,T_16384_859,T_4096_215,T_16384_861,T_8192_431,T_16384_863,T_512_27,T_16384_865,T_8192_433,T_16384_867,T_4096_217,T_16384_869,T_8192_435,T_16384_871,T_2048_109,T_16384_873,T_8192_437,T_16384_875,T_4096_219,T_16384_877,T_8192_439,T_16384_879,T_1024_55,T_16384_881,T_8192_441,T_16384_883,T_4096_221,T_16384_885,T_8192_443,T_16384_887,T_2048_111,T_16384_889,T_8192_445,T_16384_891,T_4096_223,T_16384_893,T_8192_447,T_16384_895,T_128_7,T_16384_897,T_8192_449,T_16384_899,T_4096_225,T_16384_901,T_8192_451,T_16384_903,T_2048_113,T_16384_905,T_8192_453,T_16384_907,T_4096_227,T_16384_909,T_8192_455,T_16384_911,T_1024_57,T_16384_913,T_8192_457,T_16384_915,T_4096_229,T_16384_917,T_8192_459,T_16384_919,T_2048_115,T_16384_921,T_8192_461,T_16384_923,T_4096_231,T_16384_925,T_8192_463,T_16384_927,T_512_29,T_16384_929,T_8192_465,T_16384_931,T_4096_233,T_16384_933,T_8192_467,T_16384_935,T_2048_117,T_16384_937,T_8192_469,T_16384_939,T_4096_235,T_16384_941,T_8192_471,T_16384_943,T_1024_59,T_16384_945,T_8192_473,T_16384_947,T_4096_237,T_16384_949,T_8192_475,T_16384_951,T_2048_119,T_16384_953,T_8192_477,T_16384_955,T_4096_239,T_16384_957,T_8192_479,T_16384_959,T_256_15,T_16384_961,T_8192_481,T_16384_963,T_4096_241,T_16384_965,T_8192_483,T_16384_967,T_2048_121,T_16384_969,T_8192_485,T_16384_971,T_4096_243,T_16384_973,T_8192_487,T_16384_975,T_1024_61,T_16384_977,T_8192_489,T_16384_979,T_4096_245,T_16384_981,T_8192_491,T_16384_983,T_2048_123,T_16384_985,T_8192_493,T_16384_987,T_4096_
247,T_16384_989,T_8192_495,T_16384_991,T_512_31,T_16384_993,T_8192_497,T_16384_995,T_4096_249,T_16384_997,T_8192_499,T_16384_999,T_2048_125,T_16384_1001,T_8192_501,T_16384_1003,T_4096_251,T_16384_1005,T_8192_503,T_16384_1007,T_1024_63,T_16384_1009,T_8192_505,T_16384_1011,T_4096_253,T_16384_1013,T_8192_507,T_16384_1015,T_2048_127,T_16384_1017,T_8192_509,T_16384_1019,T_4096_255,T_16384_1021,T_8192_511,T_16384_1023,T_16_1,T_16384_1025,T_8192_513,T_16384_1027,T_4096_257,T_16384_1029,T_8192_515,T_16384_1031,T_2048_129,T_16384_1033,T_8192_517,T_16384_1035,T_4096_259,T_16384_1037,T_8192_519,T_16384_1039,T_1024_65,T_16384_1041,T_8192_521,T_16384_1043,T_4096_261,T_16384_1045,T_8192_523,T_16384_1047,T_2048_131,T_16384_1049,T_8192_525,T_16384_1051,T_4096_263,T_16384_1053,T_8192_527,T_16384_1055,T_512_33,T_16384_1057,T_8192_529,T_16384_1059,T_4096_265,T_16384_1061,T_8192_531,T_16384_1063,T_2048_133,T_16384_1065,T_8192_533,T_16384_1067,T_4096_267,T_16384_1069,T_8192_535,T_16384_1071,T_1024_67,T_16384_1073,T_8192_537,T_16384_1075,T_4096_269,T_16384_1077,T_8192_539,T_16384_1079,T_2048_135,T_16384_1081,T_8192_541,T_16384_1083,T_4096_271,T_16384_1085,T_8192_543,T_16384_1087,T_256_17,T_16384_1089,T_8192_545,T_16384_1091,T_4096_273,T_16384_1093,T_8192_547,T_16384_1095,T_2048_137,T_16384_1097,T_8192_549,T_16384_1099,T_4096_275,T_16384_1101,T_8192_551,T_16384_1103,T_1024_69,T_16384_1105,T_8192_553,T_16384_1107,T_4096_277,T_16384_1109,T_8192_555,T_16384_1111,T_2048_139,T_16384_1113,T_8192_557,T_16384_1115,T_4096_279,T_16384_1117,T_8192_559,T_16384_1119,T_512_35,T_16384_1121,T_8192_561,T_16384_1123,T_4096_281,T_16384_1125,T_8192_563,T_16384_1127,T_2048_141,T_16384_1129,T_8192_565,T_16384_1131,T_4096_283,T_16384_1133,T_8192_567,T_16384_1135,T_1024_71,T_16384_1137,T_8192_569,T_16384_1139,T_4096_285,T_16384_1141,T_8192_571,T_16384_1143,T_2048_143,T_16384_1145,T_8192_573,T_16384_1147,T_4096_287,T_16384_1149,T_8192_575,T_16384_1151,T_128_9,T_16384_1153,T_8192_577,T_16384_1155,T_4096_289,T_1638
4_1157,T_8192_579,T_16384_1159,T_2048_145,T_16384_1161,T_8192_581,T_16384_1163,T_4096_291,T_16384_1165,T_8192_583,T_16384_1167,T_1024_73,T_16384_1169,T_8192_585,T_16384_1171,T_4096_293,T_16384_1173,T_8192_587,T_16384_1175,T_2048_147,T_16384_1177,T_8192_589,T_16384_1179,T_4096_295,T_16384_1181,T_8192_591,T_16384_1183,T_512_37,T_16384_1185,T_8192_593,T_16384_1187,T_4096_297,T_16384_1189,T_8192_595,T_16384_1191,T_2048_149,T_16384_1193,T_8192_597,T_16384_1195,T_4096_299,T_16384_1197,T_8192_599,T_16384_1199,T_1024_75,T_16384_1201,T_8192_601,T_16384_1203,T_4096_301,T_16384_1205,T_8192_603,T_16384_1207,T_2048_151,T_16384_1209,T_8192_605,T_16384_1211,T_4096_303,T_16384_1213,T_8192_607,T_16384_1215,T_256_19,T_16384_1217,T_8192_609,T_16384_1219,T_4096_305,T_16384_1221,T_8192_611,T_16384_1223,T_2048_153,T_16384_1225,T_8192_613,T_16384_1227,T_4096_307,T_16384_1229,T_8192_615,T_16384_1231,T_1024_77,T_16384_1233,T_8192_617,T_16384_1235,T_4096_309,T_16384_1237,T_8192_619,T_16384_1239,T_2048_155,T_16384_1241,T_8192_621,T_16384_1243,T_4096_311,T_16384_1245,T_8192_623,T_16384_1247,T_512_39,T_16384_1249,T_8192_625,T_16384_1251,T_4096_313,T_16384_1253,T_8192_627,T_16384_1255,T_2048_157,T_16384_1257,T_8192_629,T_16384_1259,T_4096_315,T_16384_1261,T_8192_631,T_16384_1263,T_1024_79,T_16384_1265,T_8192_633,T_16384_1267,T_4096_317,T_16384_1269,T_8192_635,T_16384_1271,T_2048_159,T_16384_1273,T_8192_637,T_16384_1275,T_4096_319,T_16384_1277,T_8192_639,T_16384_1279,T_64_5,T_16384_1281,T_8192_641,T_16384_1283,T_4096_321,T_16384_1285,T_8192_643,T_16384_1287,T_2048_161,T_16384_1289,T_8192_645,T_16384_1291,T_4096_323,T_16384_1293,T_8192_647,T_16384_1295,T_1024_81,T_16384_1297,T_8192_649,T_16384_1299,T_4096_325,T_16384_1301,T_8192_651,T_16384_1303,T_2048_163,T_16384_1305,T_8192_653,T_16384_1307,T_4096_327,T_16384_1309,T_8192_655,T_16384_1311,T_512_41,T_16384_1313,T_8192_657,T_16384_1315,T_4096_329,T_16384_1317,T_8192_659,T_16384_1319,T_2048_165,T_16384_1321,T_8192_661,T_16384_1323,T_4096_331,T_16384
_1325,T_8192_663,T_16384_1327,T_1024_83,T_16384_1329,T_8192_665,T_16384_1331,T_4096_333,T_16384_1333,T_8192_667,T_16384_1335,T_2048_167,T_16384_1337,T_8192_669,T_16384_1339,T_4096_335,T_16384_1341,T_8192_671,T_16384_1343,T_256_21,T_16384_1345,T_8192_673,T_16384_1347,T_4096_337,T_16384_1349,T_8192_675,T_16384_1351,T_2048_169,T_16384_1353,T_8192_677,T_16384_1355,T_4096_339,T_16384_1357,T_8192_679,T_16384_1359,T_1024_85,T_16384_1361,T_8192_681,T_16384_1363,T_4096_341,T_16384_1365,T_8192_683,T_16384_1367,T_2048_171,T_16384_1369,T_8192_685,T_16384_1371,T_4096_343,T_16384_1373,T_8192_687,T_16384_1375,T_512_43,T_16384_1377,T_8192_689,T_16384_1379,T_4096_345,T_16384_1381,T_8192_691,T_16384_1383,T_2048_173,T_16384_1385,T_8192_693,T_16384_1387,T_4096_347,T_16384_1389,T_8192_695,T_16384_1391,T_1024_87,T_16384_1393,T_8192_697,T_16384_1395,T_4096_349,T_16384_1397,T_8192_699,T_16384_1399,T_2048_175,T_16384_1401,T_8192_701,T_16384_1403,T_4096_351,T_16384_1405,T_8192_703,T_16384_1407,T_128_11,T_16384_1409,T_8192_705,T_16384_1411,T_4096_353,T_16384_1413,T_8192_707,T_16384_1415,T_2048_177,T_16384_1417,T_8192_709,T_16384_1419,T_4096_355,T_16384_1421,T_8192_711,T_16384_1423,T_1024_89,T_16384_1425,T_8192_713,T_16384_1427,T_4096_357,T_16384_1429,T_8192_715,T_16384_1431,T_2048_179,T_16384_1433,T_8192_717,T_16384_1435,T_4096_359,T_16384_1437,T_8192_719,T_16384_1439,T_512_45,T_16384_1441,T_8192_721,T_16384_1443,T_4096_361,T_16384_1445,T_8192_723,T_16384_1447,T_2048_181,T_16384_1449,T_8192_725,T_16384_1451,T_4096_363,T_16384_1453,T_8192_727,T_16384_1455,T_1024_91,T_16384_1457,T_8192_729,T_16384_1459,T_4096_365,T_16384_1461,T_8192_731,T_16384_1463,T_2048_183,T_16384_1465,T_8192_733,T_16384_1467,T_4096_367,T_16384_1469,T_8192_735,T_16384_1471,T_256_23,T_16384_1473,T_8192_737,T_16384_1475,T_4096_369,T_16384_1477,T_8192_739,T_16384_1479,T_2048_185,T_16384_1481,T_8192_741,T_16384_1483,T_4096_371,T_16384_1485,T_8192_743,T_16384_1487,T_1024_93,T_16384_1489,T_8192_745,T_16384_1491,T_4096_373,T_16384
_1493,T_8192_747,T_16384_1495,T_2048_187,T_16384_1497,T_8192_749,T_16384_1499,T_4096_375,T_16384_1501,T_8192_751,T_16384_1503,T_512_47,T_16384_1505,T_8192_753,T_16384_1507,T_4096_377,T_16384_1509,T_8192_755,T_16384_1511,T_2048_189,T_16384_1513,T_8192_757,T_16384_1515,T_4096_379,T_16384_1517,T_8192_759,T_16384_1519,T_1024_95,T_16384_1521,T_8192_761,T_16384_1523,T_4096_381,T_16384_1525,T_8192_763,T_16384_1527,T_2048_191,T_16384_1529,T_8192_765,T_16384_1531,T_4096_383,T_16384_1533,T_8192_767,T_16384_1535,T_32_3,T_16384_1537,T_8192_769,T_16384_1539,T_4096_385,T_16384_1541,T_8192_771,T_16384_1543,T_2048_193,T_16384_1545,T_8192_773,T_16384_1547,T_4096_387,T_16384_1549,T_8192_775,T_16384_1551,T_1024_97,T_16384_1553,T_8192_777,T_16384_1555,T_4096_389,T_16384_1557,T_8192_779,T_16384_1559,T_2048_195,T_16384_1561,T_8192_781,T_16384_1563,T_4096_391,T_16384_1565,T_8192_783,T_16384_1567,T_512_49,T_16384_1569,T_8192_785,T_16384_1571,T_4096_393,T_16384_1573,T_8192_787,T_16384_1575,T_2048_197,T_16384_1577,T_8192_789,T_16384_1579,T_4096_395,T_16384_1581,T_8192_791,T_16384_1583,T_1024_99,T_16384_1585,T_8192_793,T_16384_1587,T_4096_397,T_16384_1589,T_8192_795,T_16384_1591,T_2048_199,T_16384_1593,T_8192_797,T_16384_1595,T_4096_399,T_16384_1597,T_8192_799,T_16384_1599,T_256_25,T_16384_1601,T_8192_801,T_16384_1603,T_4096_401,T_16384_1605,T_8192_803,T_16384_1607,T_2048_201,T_16384_1609,T_8192_805,T_16384_1611,T_4096_403,T_16384_1613,T_8192_807,T_16384_1615,T_1024_101,T_16384_1617,T_8192_809,T_16384_1619,T_4096_405,T_16384_1621,T_8192_811,T_16384_1623,T_2048_203,T_16384_1625,T_8192_813,T_16384_1627,T_4096_407,T_16384_1629,T_8192_815,T_16384_1631,T_512_51,T_16384_1633,T_8192_817,T_16384_1635,T_4096_409,T_16384_1637,T_8192_819,T_16384_1639,T_2048_205,T_16384_1641,T_8192_821,T_16384_1643,T_4096_411,T_16384_1645,T_8192_823,T_16384_1647,T_1024_103,T_16384_1649,T_8192_825,T_16384_1651,T_4096_413,T_16384_1653,T_8192_827,T_16384_1655,T_2048_207,T_16384_1657,T_8192_829,T_16384_1659,T_4096_415,T_1638
4_1661,T_8192_831,T_16384_1663,T_128_13,T_16384_1665,T_8192_833,T_16384_1667,T_4096_417,T_16384_1669,T_8192_835,T_16384_1671,T_2048_209,T_16384_1673,T_8192_837,T_16384_1675,T_4096_419,T_16384_1677,T_8192_839,T_16384_1679,T_1024_105,T_16384_1681,T_8192_841,T_16384_1683,T_4096_421,T_16384_1685,T_8192_843,T_16384_1687,T_2048_211,T_16384_1689,T_8192_845,T_16384_1691,T_4096_423,T_16384_1693,T_8192_847,T_16384_1695,T_512_53,T_16384_1697,T_8192_849,T_16384_1699,T_4096_425,T_16384_1701,T_8192_851,T_16384_1703,T_2048_213,T_16384_1705,T_8192_853,T_16384_1707,T_4096_427,T_16384_1709,T_8192_855,T_16384_1711,T_1024_107,T_16384_1713,T_8192_857,T_16384_1715,T_4096_429,T_16384_1717,T_8192_859,T_16384_1719,T_2048_215,T_16384_1721,T_8192_861,T_16384_1723,T_4096_431,T_16384_1725,T_8192_863,T_16384_1727,T_256_27,T_16384_1729,T_8192_865,T_16384_1731,T_4096_433,T_16384_1733,T_8192_867,T_16384_1735,T_2048_217,T_16384_1737,T_8192_869,T_16384_1739,T_4096_435,T_16384_1741,T_8192_871,T_16384_1743,T_1024_109,T_16384_1745,T_8192_873,T_16384_1747,T_4096_437,T_16384_1749,T_8192_875,T_16384_1751,T_2048_219,T_16384_1753,T_8192_877,T_16384_1755,T_4096_439,T_16384_1757,T_8192_879,T_16384_1759,T_512_55,T_16384_1761,T_8192_881,T_16384_1763,T_4096_441,T_16384_1765,T_8192_883,T_16384_1767,T_2048_221,T_16384_1769,T_8192_885,T_16384_1771,T_4096_443,T_16384_1773,T_8192_887,T_16384_1775,T_1024_111,T_16384_1777,T_8192_889,T_16384_1779,T_4096_445,T_16384_1781,T_8192_891,T_16384_1783,T_2048_223,T_16384_1785,T_8192_893,T_16384_1787,T_4096_447,T_16384_1789,T_8192_895,T_16384_1791,T_64_7,T_16384_1793,T_8192_897,T_16384_1795,T_4096_449,T_16384_1797,T_8192_899,T_16384_1799,T_2048_225,T_16384_1801,T_8192_901,T_16384_1803,T_4096_451,T_16384_1805,T_8192_903,T_16384_1807,T_1024_113,T_16384_1809,T_8192_905,T_16384_1811,T_4096_453,T_16384_1813,T_8192_907,T_16384_1815,T_2048_227,T_16384_1817,T_8192_909,T_16384_1819,T_4096_455,T_16384_1821,T_8192_911,T_16384_1823,T_512_57,T_16384_1825,T_8192_913,T_16384_1827,T_4096_457,T_16
384_1829,T_8192_915,T_16384_1831,T_2048_229,T_16384_1833,T_8192_917,T_16384_1835,T_4096_459,T_16384_1837,T_8192_919,T_16384_1839,T_1024_115,T_16384_1841,T_8192_921,T_16384_1843,T_4096_461,T_16384_1845,T_8192_923,T_16384_1847,T_2048_231,T_16384_1849,T_8192_925,T_16384_1851,T_4096_463,T_16384_1853,T_8192_927,T_16384_1855,T_256_29,T_16384_1857,T_8192_929,T_16384_1859,T_4096_465,T_16384_1861,T_8192_931,T_16384_1863,T_2048_233,T_16384_1865,T_8192_933,T_16384_1867,T_4096_467,T_16384_1869,T_8192_935,T_16384_1871,T_1024_117,T_16384_1873,T_8192_937,T_16384_1875,T_4096_469,T_16384_1877,T_8192_939,T_16384_1879,T_2048_235,T_16384_1881,T_8192_941,T_16384_1883,T_4096_471,T_16384_1885,T_8192_943,T_16384_1887,T_512_59,T_16384_1889,T_8192_945,T_16384_1891,T_4096_473,T_16384_1893,T_8192_947,T_16384_1895,T_2048_237,T_16384_1897,T_8192_949,T_16384_1899,T_4096_475,T_16384_1901,T_8192_951,T_16384_1903,T_1024_119,T_16384_1905,T_8192_953,T_16384_1907,T_4096_477,T_16384_1909,T_8192_955,T_16384_1911,T_2048_239,T_16384_1913,T_8192_957,T_16384_1915,T_4096_479,T_16384_1917,T_8192_959,T_16384_1919,T_128_15,T_16384_1921,T_8192_961,T_16384_1923,T_4096_481,T_16384_1925,T_8192_963,T_16384_1927,T_2048_241,T_16384_1929,T_8192_965,T_16384_1931,T_4096_483,T_16384_1933,T_8192_967,T_16384_1935,T_1024_121,T_16384_1937,T_8192_969,T_16384_1939,T_4096_485,T_16384_1941,T_8192_971,T_16384_1943,T_2048_243,T_16384_1945,T_8192_973,T_16384_1947,T_4096_487,T_16384_1949,T_8192_975,T_16384_1951,T_512_61,T_16384_1953,T_8192_977,T_16384_1955,T_4096_489,T_16384_1957,T_8192_979,T_16384_1959,T_2048_245,T_16384_1961,T_8192_981,T_16384_1963,T_4096_491,T_16384_1965,T_8192_983,T_16384_1967,T_1024_123,T_16384_1969,T_8192_985,T_16384_1971,T_4096_493,T_16384_1973,T_8192_987,T_16384_1975,T_2048_247,T_16384_1977,T_8192_989,T_16384_1979,T_4096_495,T_16384_1981,T_8192_991,T_16384_1983,T_256_31,T_16384_1985,T_8192_993,T_16384_1987,T_4096_497,T_16384_1989,T_8192_995,T_16384_1991,T_2048_249,T_16384_1993,T_8192_997,T_16384_1995,T_4096_49
9,T_16384_1997,T_8192_999,T_16384_1999,T_1024_125,T_16384_2001,T_8192_1001,T_16384_2003,T_4096_501,T_16384_2005,T_8192_1003,T_16384_2007,T_2048_251,T_16384_2009,T_8192_1005,T_16384_2011,T_4096_503,T_16384_2013,T_8192_1007,T_16384_2015,T_512_63,T_16384_2017,T_8192_1009,T_16384_2019,T_4096_505,T_16384_2021,T_8192_1011,T_16384_2023,T_2048_253,T_16384_2025,T_8192_1013,T_16384_2027,T_4096_507,T_16384_2029,T_8192_1015,T_16384_2031,T_1024_127,T_16384_2033,T_8192_1017,T_16384_2035,T_4096_509,T_16384_2037,T_8192_1019,T_16384_2039,T_2048_255,T_16384_2041,T_8192_1021,T_16384_2043,T_4096_511,T_16384_2045,T_8192_1023,T_16384_2047,T_8_1 +}; + +} // namespace detail +} // namespace database +} // namespace cufftdx diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut_code_gen.py b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut_code_gen.py new file mode 100644 index 0000000000000..4d68074ad6b32 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/lut_code_gen.py @@ -0,0 +1,32 @@ +import math + +import re +import numpy as np + + +# N = 8192 +N = 16384 +# The case of 0 / N is special, we want to simplify it to 0 / 2 instead of 0 / 1 +numerator = np.arange(1, N // 8 + 1) +gcd = np.gcd(numerator, N) +num = numerator // gcd +denom = N // gcd +lut_vals = ['T_2_0'] + [f'T_{d}_{n}' for n, d in zip(num, denom)] +lut_string = f"static const __device__ float2 lut_mine_sp_8_{N}[{N // 8 + 1}] = {{\n {','.join(lut_vals)}\n}};" +print(lut_string) + +# Only define new values if it's not already in the cuFFTDx lookup table +cufftdx_lut_filename = 'mathdx/22.02/include/cufftdx/include/database/lut_defines_0.hpp.inc' +matches = set() +reg = re.compile(f'^#define T_{N}_([0-9]+) ') +with open(cufftdx_lut_filename, 'r') as f: + for line in f: + if (match := reg.match(line)) is not None: + matches.add(int(match[1])) + +numerator = np.arange(1, N // 8 + 1, 2) +angle = -2 * math.pi * numerator.astype(np.float64) / N +cos, sin = np.cos(angle), 
np.sin(angle) +defs = [f'#define T_{N}_{n} {{{c:.40f},{s:.40f}}}' for n, c, s in zip(numerator, cos, sin) if n not in matches] +def_string = '\n'.join(defs) +print(def_string) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/map.h b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/map.h new file mode 100644 index 0000000000000..f2dcdb33a71dd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/map.h @@ -0,0 +1,72 @@ +// Downloaded from https://github.com/swansontec/map-macro + +/* + * Copyright (C) 2012 William Swanson + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Except as contained in this notice, the names of the authors or + * their institutions shall not be used in advertising or otherwise to + * promote the sale, use or other dealings in this Software without + * prior written authorization from the authors. + */ + +#ifndef MAP_H_INCLUDED +#define MAP_H_INCLUDED + +#define EVAL0(...) 
__VA_ARGS__ +#define EVAL1(...) EVAL0(EVAL0(EVAL0(__VA_ARGS__))) +#define EVAL2(...) EVAL1(EVAL1(EVAL1(__VA_ARGS__))) +#define EVAL3(...) EVAL2(EVAL2(EVAL2(__VA_ARGS__))) +#define EVAL4(...) EVAL3(EVAL3(EVAL3(__VA_ARGS__))) +#define EVAL(...) EVAL4(EVAL4(EVAL4(__VA_ARGS__))) + +#define MAP_END(...) +#define MAP_OUT +#define MAP_COMMA , + +#define MAP_GET_END2() 0, MAP_END +#define MAP_GET_END1(...) MAP_GET_END2 +#define MAP_GET_END(...) MAP_GET_END1 +#define MAP_NEXT0(test, next, ...) next MAP_OUT +#define MAP_NEXT1(test, next) MAP_NEXT0(test, next, 0) +#define MAP_NEXT(test, next) MAP_NEXT1(MAP_GET_END test, next) + +#define MAP0(f, x, peek, ...) f(x) MAP_NEXT(peek, MAP1)(f, peek, __VA_ARGS__) +#define MAP1(f, x, peek, ...) f(x) MAP_NEXT(peek, MAP0)(f, peek, __VA_ARGS__) + +#define MAP_LIST_NEXT1(test, next) MAP_NEXT0(test, MAP_COMMA next, 0) +#define MAP_LIST_NEXT(test, next) MAP_LIST_NEXT1(MAP_GET_END test, next) + +#define MAP_LIST0(f, x, peek, ...) f(x) MAP_LIST_NEXT(peek, MAP_LIST1)(f, peek, __VA_ARGS__) +#define MAP_LIST1(f, x, peek, ...) f(x) MAP_LIST_NEXT(peek, MAP_LIST0)(f, peek, __VA_ARGS__) + +/** + * Applies the function macro `f` to each of the remaining parameters. + */ +#define MAP(f, ...) EVAL(MAP1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) + +/** + * Applies the function macro `f` to each of the remaining parameters and + * inserts commas between the results. + */ +#define MAP_LIST(f, ...) 
EVAL(MAP_LIST1(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/LICENSE.txt b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/LICENSE.txt new file mode 100644 index 0000000000000..57ebdf2c6245c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/LICENSE.txt @@ -0,0 +1,137 @@ +LICENSE AGREEMENT FOR NVIDIA MATH LIBRARIES SOFTWARE DEVELOPMENT KITS + +This license agreement("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA math libraries software development kit as available at NVIDIA’s discretion (each, a “SDK”). + +Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation. + +This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used. + +If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent. + +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK. + +You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + +1. License. 
+ +1.1 Grant + +Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to: + +(i) Install and use the SDK, and + +(ii) Distribute the binary files, files identified as samples, and headers as incorporated into a software application that meets the distribution requirements indicated in this Agreement. + +1.2 Distribution Requirements + +These are the distribution requirements for you to exercise the distribution grant: + +(i) Your application must have material additional functionality, beyond the included portions of the SDK. + +(ii) The distributable portions of the SDK shall only be accessed by your application. + +(iii) The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + +(iv) Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + +(v) The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + +(vi) You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK. + +1.3 Authorized Users + +You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf. 
+ +If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network. + +You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences. + +1.4 Pre-Release SDK +The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss. +You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems. +NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability. +1.5 Updates + +NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement. + +You agree that the form and content of the SDK that NVIDIA provides may change without prior notice to you. While NVIDIA generally maintains compatibility between versions, NVIDIA may in some cases make changes that introduce incompatibilities in future versions of the SDK. + +1.6 Components Under Other Licenses. 
+ +The SDK may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SDK, such as components governed by open source software licenses. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +1.7 Reservation of Rights + +NVIDIA reserves all rights, title and interest in and to the SDK not expressly granted to you under this Agreement. + +2. Limitations. + +The following license limitations apply to your use of the SDK: + +2.1 The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs. + +2.2 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK. + +2.3 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. + +2.4 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA. + +2.5 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK. + +2.6 You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. 
+ +2.7 You acknowledge that the SDK as delivered is not tested or certified by NVIDIA for use in connection with the design, construction, maintenance, and/or operation of any system where the use or failure of such system could result in a situation that threatens the safety of human life or results in catastrophic damages (each, a “Critical Application”). Examples of Critical Applications include use in avionics, navigation, autonomous vehicle applications, ai solutions for automotive products, military, medical, life support or other life critical applications. NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. You are solely responsible for ensuring that any product or service developed with the SDK as a whole includes sufficient features to comply with all applicable legal and regulatory standards and requirements. + +2.8 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to products or services that use the SDK in or for Critical Applications, and for use of the SDK outside of the scope of this Agreement, or not in compliance with its terms. + +3. Ownership. + +3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications, including their respective intellectual property rights. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights. 
+ +3.2 You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. + +4. No Warranties. + +THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE. + +5. Limitations of Liability. + +TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. 
+ +These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different. + +6. Termination. + +6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below. + +6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK. + +6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. + +6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the licenses granted to you. + +7. General. + +If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. 
Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified. + +You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement. + +This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. + +The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative. + +Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement. + +The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. 
Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + +The SDK is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SDK into any country, or use the SDK in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this Agreement, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SDK. + +Any notice delivered by NVIDIA to you under this Agreement will be delivered via mail, email or fax. You agree that any notices that NVIDIA sends you electronically will satisfy any legal communication requirements. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + +This Agreement constitutes the entire agreement of the parties with respect to the subject matter of this Agreement and supersedes all prior negotiations or documentation exchanged between the parties relating to this subject matter. Any additional and/or conflicting terms on documents issued by you are null, void, and invalid. Any amendment or waiver under this Agreement shall be in writing and signed by representatives of both parties. 
+ +If the distribution terms in this Agreement are not suitable for your organization, or for any questions regarding this Agreement, please contact NVIDIA at nvidia-compute-license-questions@nvidia.com. + +(v. February 10, 2022) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/basic.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/basic.css new file mode 100644 index 0000000000000..bf18350b65c61 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/basic.css @@ -0,0 +1,906 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { 
+ border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; 
+} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 450px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a.brackets:before, +span.brackets > a:before{ + content: "["; +} + +a.brackets:after, +span.brackets > a:after { + content: "]"; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} 
+ +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +table.footnote 
td, table.footnote th { + border: 0 !important; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: 
#0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +dl.footnote > dt, +dl.citation > dt { + float: left; + margin-right: 0.5em; +} + +dl.footnote > dd, +dl.citation > dd { + margin-bottom: 0em; +} + +dl.footnote > dd:after, +dl.citation > dd:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dt:after { + content: ":"; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; 
+} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; 
/* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/badge_only.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/badge_only.css new file mode 100644 index 0000000000000..e380325bc6e27 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/badge_only.css @@ -0,0 +1 @@ 
+.fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li .fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions 
.rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 0000000000000..6cb60000181db Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff2 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 0000000000000..7059e23142aae Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 0000000000000..f815f63f99da8 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 0000000000000..f2c76e5bda18a Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.eot b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.eot new file mode 100644 index 0000000000000..e9f60ca953f93 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.eot differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.svg 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.svg new file mode 100644 index 0000000000000..855c845e538b6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserved. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.ttf b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 0000000000000..35acda2fa1196 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.ttf differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff new file mode 100644 index 0000000000000..400014a4b06ee Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 0000000000000..4d13fc60404b9 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/fontawesome-webfont.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff new file mode 100644 index 0000000000000..88ad05b9ff413 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff differ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 0000000000000..c4e3d804b57b6 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold-italic.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff new file mode 100644 index 0000000000000..c6dff51f063cc Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff2 new file mode 100644 index 0000000000000..bb195043cfc07 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-bold.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff new file mode 100644 index 0000000000000..76114bc033622 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff2 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 0000000000000..3404f37e2e312 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal-italic.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff new file mode 100644 index 0000000000000..ae1307ff5f4c4 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff2 new file mode 100644 index 0000000000000..3bf9843328a63 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/fonts/lato-normal.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/theme.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/theme.css new file mode 100644 index 0000000000000..0d9ae7e1a45b8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/css/theme.css @@ -0,0 +1,4 @@ 
+html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 
0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content 
tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,.wy-nav-top a,.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! 
+ * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 
FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content 
h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a 
button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content 
.wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso 
.admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content .wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso 
.wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container 
li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = 
false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type=search]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 
.3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group 
.wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input 
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs li{display:inline-block}.wy-breadcrumbs li.wy-breadcrumbs-aside{float:right}.wy-breadcrumbs li a{display:inline-block;padding:5px}.wy-breadcrumbs li a:first-child{padding-left:0}.rst-content .wy-breadcrumbs li tt,.wy-breadcrumbs li .rst-content tt,.wy-breadcrumbs li code{padding:5px;border:none;background:none}.rst-content .wy-breadcrumbs li tt.literal,.wy-breadcrumbs li .rst-content tt.literal,.wy-breadcrumbs li code.literal{color:#404040}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media 
print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul 
li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 
.headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content 
.citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.field-list>dt:after,html.writer-html5 .rst-content dl.footnote>dt:after{content:":"}html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.footnote>dt>span.brackets{margin-right:.5rem}html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref{font-style:italic}html.writer-html5 
.rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.footnote>dd p,html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{font-size:inherit;line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt 
em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.field-list)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) 
dl:not(.field-list)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content 
dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 .rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/cufftdx_override.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/cufftdx_override.css new file mode 100644 index 0000000000000..8355a4ea4aa2e --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/cufftdx_override.css @@ -0,0 +1,3 @@ +.wy-nav-content { +max-width: 1240px !important; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/doctools.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/doctools.js new file mode 100644 index 0000000000000..e509e48349c55 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/doctools.js @@ -0,0 +1,326 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for all documentation. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + +/** + * make the code below compatible with browsers without + * an installed firebug like debugger +if (!window.console || !console.firebug) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", + "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", + "profile", "profileEnd"]; + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +} + */ + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. 
+ */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. + */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var 
result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} + +/** + * Small JavaScript module for the documentation. + */ +var Documentation = { + + init : function() { + this.fixFirefoxAnchorBug(); + this.highlightSearchWords(); + this.initIndexTable(); + if (DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) { + this.initOnKeyListeners(); + } + }, + + /** + * i18n support + */ + TRANSLATIONS : {}, + PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, + LOCALE : 'unknown', + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext : function(string) { + var translated = Documentation.TRANSLATIONS[string]; + if (typeof translated === 'undefined') + return string; + return (typeof translated === 'string') ? translated : translated[0]; + }, + + ngettext : function(singular, plural, n) { + var translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated === 'undefined') + return (n == 1) ? 
singular : plural; + return translated[Documentation.PLURALEXPR(n)]; + }, + + addTranslations : function(catalog) { + for (var key in catalog.messages) + this.TRANSLATIONS[key] = catalog.messages[key]; + this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); + this.LOCALE = catalog.locale; + }, + + /** + * add context elements like header anchor links + */ + addContextElements : function() { + $('div[id] > :header:first').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this headline')). + appendTo(this); + }); + $('dt[id]').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this definition')). + appendTo(this); + }); + }, + + /** + * workaround a firefox stupidity + * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + */ + fixFirefoxAnchorBug : function() { + if (document.location.hash && $.browser.mozilla) + window.setTimeout(function() { + document.location.href += ''; + }, 10); + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords : function() { + var params = $.getQueryParameters(); + var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; + if (terms.length) { + var body = $('div.body'); + if (!body.length) { + body = $('body'); + } + window.setTimeout(function() { + $.each(terms, function() { + body.highlightText(this.toLowerCase(), 'highlighted'); + }); + }, 10); + $('') + .appendTo($('#searchbox')); + } + }, + + /** + * init the domain index toggle buttons + */ + initIndexTable : function() { + var togglers = $('img.toggler').click(function() { + var src = $(this).attr('src'); + var idnum = $(this).attr('id').substr(7); + $('tr.cg-' + idnum).toggle(); + if (src.substr(-9) === 'minus.png') + $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); + else + $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); + }).css('display', ''); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { + togglers.click(); + } + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords : function() { + $('#searchbox .highlight-link').fadeOut(300); + $('span.highlighted').removeClass('highlighted'); + var url = new URL(window.location); + url.searchParams.delete('highlight'); + window.history.replaceState({}, '', url); + }, + + /** + * make the url absolute + */ + makeURL : function(relativeURL) { + return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; + }, + + /** + * get the current relative url + */ + getCurrentURL : function() { + var path = document.location.pathname; + var parts = path.split(/\//); + $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { + if (this === '..') + parts.pop(); + }); + var url = parts.join('/'); + return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + }, + + initOnKeyListeners: function() { + $(document).keydown(function(event) { + var activeElementType = document.activeElement.tagName; + // don't navigate when in search box, textarea, dropdown or button + if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT' + && activeElementType 
!== 'BUTTON' && !event.altKey && !event.ctrlKey && !event.metaKey + && !event.shiftKey) { + switch (event.keyCode) { + case 37: // left + var prevHref = $('link[rel="prev"]').prop('href'); + if (prevHref) { + window.location.href = prevHref; + return false; + } + break; + case 39: // right + var nextHref = $('link[rel="next"]').prop('href'); + if (nextHref) { + window.location.href = nextHref; + return false; + } + break; + } + } + }); + } +}; + +// quick alias for translations +_ = Documentation.gettext; + +$(document).ready(function() { + Documentation.init(); +}); diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/documentation_options.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/documentation_options.js new file mode 100644 index 0000000000000..9af54603b9c00 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/documentation_options.js @@ -0,0 +1,12 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '1.0.0', + LANGUAGE: 'None', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: false, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false +}; \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/file.png b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/file.png new file mode 100644 index 0000000000000..a858a410e4faa Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/file.png differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/jquery-3.5.1.js 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/jquery-3.5.1.js new file mode 100644 index 0000000000000..50937333b99a5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/_static/jquery-3.5.1.js @@ -0,0 +1,10872 @@ +/*! + * jQuery JavaScript Library v3.5.1 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2020-05-04T22:49Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? 
function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML elements + // (i.e., `typeof document.createElement( "object" ) === "function"`). + // We don't want to classify *any* DOM node as a function. + return typeof obj === "function" && typeof obj.nodeType !== "number"; + }; + + +var isWindow = function isWindow( obj ) { + return obj != null && obj === obj.window; + }; + + +var document = window.document; + + + + var preservedScriptAttributes = { + type: true, + src: true, + nonce: true, + noModule: true + }; + + function DOMEval( code, node, doc ) { + doc = doc || document; + + var i, val, + script = doc.createElement( "script" ); + + script.text = code; + if ( node ) { + for ( i in preservedScriptAttributes ) { + + // Support: Firefox 64+, Edge 18+ + // Some browsers don't support the "nonce" property on scripts. + // On the other hand, just using `getAttribute` is not enough as + // the `nonce` attribute is reset to an empty string whenever it + // becomes browsing-context connected. + // See https://github.com/whatwg/html/issues/2369 + // See https://html.spec.whatwg.org/#nonce-attributes + // The `node.getAttribute` check was added for the sake of + // `jQuery.globalEval` so that it can fake a nonce-containing node + // via an object. 
+ val = node[ i ] || node.getAttribute && node.getAttribute( i ); + if ( val ) { + script.setAttribute( i, val ); + } + } + } + doc.head.appendChild( script ).parentNode.removeChild( script ); + } + + +function toType( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? + class2type[ toString.call( obj ) ] || "object" : + typeof obj; +} +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.5.1", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + } + + // Return just the one element from the set + return num < 0 ? 
this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. + each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + even: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return ( i + 1 ) % 2; + } ) ); + }, + + odd: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return i % 2; + } ) ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. 
+ push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + copy = options[ name ]; + + // Prevent Object.prototype pollution + // Prevent never-ending loop + if ( name === "__proto__" || target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + src = target[ name ]; + + // Ensure proper type for the source value + if ( copyIsArray && !Array.isArray( src ) ) { + clone = []; + } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { + clone = {}; + } else { + clone = src; + } + copyIsArray = false; + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new 
Error( msg ); + }, + + noop: function() {}, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + // Evaluates a script in a provided context; falls back to the global one + // if not specified. + globalEval: function( code, options, doc ) { + DOMEval( code, { nonce: options && options.nonce }, doc ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? 
-1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return flat( ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. 
+ support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), +function( _i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); +} ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = toType( obj ); + + if ( isFunction( obj ) || isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! + * Sizzle CSS Selector Engine v2.3.5 + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://js.foundation/ + * + * Date: 2020-03-14 + */ +( function( window ) { +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + nonnativeSelectorCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ( {} ).hasOwnProperty, + arr = [], + pop = arr.pop, + pushNative = arr.push, + push = arr.push, + slice = arr.slice, + + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + 
indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[ i ] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + + "ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram + identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + + "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + + // "Attribute values must be CSS identifiers [capture 5] + // or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + + whitespace + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + + // 3. 
anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + + "*" ), + rdescend = new RegExp( whitespace + "|>" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + + whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + + whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rhtml = /HTML$/i, + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), + funescape = function( escape, nonHex ) { + var high = "0x" + escape.slice( 1 ) - 0x10000; + + return nonHex ? 
+ + // Strip the backslash prefix from a non-hex escape sequence + nonHex : + + // Replace a hexadecimal escape sequence with the encoded Unicode code point + // Support: IE <=11+ + // For values outside the Basic Multilingual Plane (BMP), manually construct a + // surrogate pair + high < 0 ? + String.fromCharCode( high + 0x10000 ) : + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + inDisabledFieldset = addCombinator( + function( elem ) { + return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + ( arr = slice.call( preferredDoc.childNodes ) ), + preferredDoc.childNodes + ); + + // Support: Android<4.0 + // Detect silently failing push.apply + // eslint-disable-next-line no-unused-expressions + arr[ preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? 
+ + // Leverage slice if possible + function( target, els ) { + pushNative.apply( target, slice.call( els ) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + + // Can't trust NodeList.length + while ( ( target[ j++ ] = els[ i++ ] ) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + setDocument( context ); + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { + + // ID selector + if ( ( m = match[ 1 ] ) ) { + + // Document context + if ( nodeType === 9 ) { + if ( ( elem = context.getElementById( m ) ) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && ( elem = newContext.getElementById( m ) ) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[ 2 
] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !nonnativeSelectorCache[ selector + " " ] && + ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && + + // Support: IE 8 only + // Exclude object elements + ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { + + newSelector = selector; + newContext = context; + + // qSA considers elements outside a scoping root when evaluating child or + // descendant combinators, which is not what we want. + // In such cases, we work around the behavior by prefixing every selector in the + // list with an ID selector referencing the scope context. + // The technique has to be used as well when a leading combinator is used + // as such selectors are not recognized by querySelectorAll. + // Thanks to Andrew Dupont for this technique. + if ( nodeType === 1 && + ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + + // We can use :scope instead of the ID hack if the browser + // supports it & if we're not changing the context. + if ( newContext !== context || !support.scope ) { + + // Capture the context ID, setting it first if necessary + if ( ( nid = context.getAttribute( "id" ) ) ) { + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", ( nid = expando ) ); + } + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + + toSelector( groups[ i ] ); + } + newSelector = groups.join( "," ); + } + + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + nonnativeSelectorCache( selector, true ); + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return ( cache[ key + " " ] = value ); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement( "fieldset" ); + + try { + return !!fn( el ); + } catch ( e ) { + return false; + } finally { + + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); + } + + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split( "|" ), + i = 
arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[ i ] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( ( cur = cur.nextSibling ) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return ( name === "input" || name === "button" ) && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // 
https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + inDisabledFieldset( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction( function( argument ) { + argument = +argument; + return markFunction( function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ ( j = matchIndexes[ i ] ) ] ) { + seed[ j ] = !( matches[ j ] = seed[ j ] ); + } + } + } ); + } ); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + var namespace = elem.namespaceURI, + docElem = ( elem.ownerDocument || elem ).documentElement; + + // Support: IE <=8 + // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes + // https://bugs.jquery.com/ticket/4833 + return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? 
node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9 - 11+, Edge 12 - 18+ + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( preferredDoc != document && + ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, + // Safari 4 - 5 only, Opera <=11.6 - 12.x only + // IE/Edge & older browsers don't support the :scope pseudo-class. + // Support: Safari 6.0 only + // Safari 6.0 supports :scope but it's an alias of :root there. 
+ support.scope = assert( function( el ) { + docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); + return typeof el.querySelectorAll !== "undefined" && + !el.querySelectorAll( ":scope fieldset div" ).length; + } ); + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert( function( el ) { + el.className = "i"; + return !el.getAttribute( "className" ); + } ); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert( function( el ) { + el.appendChild( document.createComment( "" ) ); + return !el.getElementsByTagName( "*" ).length; + } ); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert( function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + } ); + + // ID filter and find + if ( support.getById ) { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute( "id" ) === attrId; + }; + }; + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? 
[ elem ] : []; + } + }; + } else { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode( "id" ); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( ( elem = elems[ i++ ] ) ) { + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find[ "TAG" ] = support.getElementsByTagName ? 
+ function( tag, context ) { + if ( typeof context.getElementsByTagName !== "undefined" ) { + return context.getElementsByTagName( tag ); + + // DocumentFragment nodes don't have gEBTN + } else if ( support.qsa ) { + return context.querySelectorAll( tag ); + } + } : + + function( tag, context ) { + var elem, + tmp = [], + i = 0, + + // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too + results = context.getElementsByTagName( tag ); + + // Filter out possible comments + if ( tag === "*" ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem.nodeType === 1 ) { + tmp.push( elem ); + } + } + + return tmp; + } + return results; + }; + + // Class + Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { + if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { + return context.getElementsByClassName( className ); + } + }; + + /* QSA/matchesSelector + ---------------------------------------------------------------------- */ + + // QSA and matchesSelector support + + // matchesSelector(:active) reports false when true (IE9/Opera 11.5) + rbuggyMatches = []; + + // qSa(:focus) reports false when true (Chrome 21) + // We allow this because of a bug in IE8/9 that throws an error + // whenever `document.activeElement` is accessed on an iframe + // So, we allow :focus to pass through QSA all the time to avoid the IE error + // See https://bugs.jquery.com/ticket/13378 + rbuggyQSA = []; + + if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { + + // Build QSA regex + // Regex strategy adopted from Diego Perini + assert( function( el ) { + + var input; + + // Select is set to empty string on purpose + // This is to test IE's treatment of not explicitly + // setting a boolean content attribute, + // since its presence should be enough + // https://bugs.jquery.com/ticket/12359 + docElem.appendChild( el ).innerHTML = "" + + ""; + + // Support: IE8, Opera 11-12.16 + // Nothing should 
be selected when empty strings follow ^= or $= or *= + // The test attribute must be unknown in Opera but "safe" for WinRT + // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section + if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { + rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); + } + + // Support: IE8 + // Boolean attributes and "value" are not treated correctly + if ( !el.querySelectorAll( "[selected]" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); + } + + // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ + if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { + rbuggyQSA.push( "~=" ); + } + + // Support: IE 11+, Edge 15 - 18+ + // IE 11/Edge don't find elements on a `[name='']` query in some cases. + // Adding a temporary attribute to the document before the selection works + // around the issue. + // Interestingly, IE 10 & older don't seem to have the issue. + input = document.createElement( "input" ); + input.setAttribute( "name", "" ); + el.appendChild( input ); + if ( !el.querySelectorAll( "[name='']" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + + whitespace + "*(?:''|\"\")" ); + } + + // Webkit/Opera - :checked should return selected option elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + // IE8 throws error here and will not see later tests + if ( !el.querySelectorAll( ":checked" ).length ) { + rbuggyQSA.push( ":checked" ); + } + + // Support: Safari 8+, iOS 8+ + // https://bugs.webkit.org/show_bug.cgi?id=136851 + // In-page `selector#id sibling-combinator selector` fails + if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { + rbuggyQSA.push( ".#.+[+~]" ); + } + + // Support: Firefox <=3.6 - 5 only + // Old Firefox doesn't throw on a badly-escaped identifier. 
+ el.querySelectorAll( "\\\f" ); + rbuggyQSA.push( "[\\r\\n\\f]" ); + } ); + + assert( function( el ) { + el.innerHTML = "" + + ""; + + // Support: Windows 8 Native Apps + // The type and name attributes are restricted during .innerHTML assignment + var input = document.createElement( "input" ); + input.setAttribute( "type", "hidden" ); + el.appendChild( input ).setAttribute( "name", "D" ); + + // Support: IE8 + // Enforce case-sensitivity of name attribute + if ( el.querySelectorAll( "[name=d]" ).length ) { + rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); + } + + // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) + // IE8 throws error here and will not see later tests + if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: IE9-11+ + // IE's :disabled selector does not pick up the children of disabled fieldsets + docElem.appendChild( el ).disabled = true; + if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: Opera 10 - 11 only + // Opera 10-11 does not throw on post-comma invalid pseudos + el.querySelectorAll( "*,:x" ); + rbuggyQSA.push( ",.*:" ); + } ); + } + + if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || + docElem.webkitMatchesSelector || + docElem.mozMatchesSelector || + docElem.oMatchesSelector || + docElem.msMatchesSelector ) ) ) ) { + + assert( function( el ) { + + // Check to see if it's possible to do matchesSelector + // on a disconnected node (IE 9) + support.disconnectedMatch = matches.call( el, "*" ); + + // This should fail with an exception + // Gecko does not error, returns false instead + matches.call( el, "[s!='']:x" ); + rbuggyMatches.push( "!=", pseudos ); + } ); + } + + rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); + rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); + + /* Contains 
+ ---------------------------------------------------------------------- */ + hasCompare = rnative.test( docElem.compareDocumentPosition ); + + // Element contains another + // Purposefully self-exclusive + // As in, an element does not contain itself + contains = hasCompare || rnative.test( docElem.contains ) ? + function( a, b ) { + var adown = a.nodeType === 9 ? a.documentElement : a, + bup = b && b.parentNode; + return a === bup || !!( bup && bup.nodeType === 1 && ( + adown.contains ? + adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + ) ); + } : + function( a, b ) { + if ( b ) { + while ( ( b = b.parentNode ) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? + function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { + + // Choose the first element that is related to our preferred document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( a == document || a.ownerDocument == preferredDoc && + contains( preferredDoc, a ) ) { + return -1; + } + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( b == document || b.ownerDocument == preferredDoc && + contains( preferredDoc, b ) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + return a == document ? -1 : + b == document ? 1 : + /* eslint-enable eqeqeq */ + aup ? -1 : + bup ? 1 : + sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( ( cur = cur.parentNode ) ) { + ap.unshift( cur ); + } + cur = b; + while ( ( cur = cur.parentNode ) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[ i ] === bp[ i ] ) { + i++; + } + + return i ? 
+ + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[ i ], bp[ i ] ) : + + // Otherwise nodes in our document sort first + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + ap[ i ] == preferredDoc ? -1 : + bp[ i ] == preferredDoc ? 1 : + /* eslint-enable eqeqeq */ + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + setDocument( elem ); + + if ( support.matchesSelector && documentIsHTML && + !nonnativeSelectorCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch ( e ) { + nonnativeSelectorCache( expr, true ); + } + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( context.ownerDocument || context ) != document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( ( elem.ownerDocument || elem ) != document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + ( val = elem.getAttributeNode( name ) ) && val.specified ? + val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return ( sel + "" ).replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + + // If no nodeType, this is expected to be an array + while ( ( node = elem[ i++ ] ) ) { + + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + + // Use textContent 
for elements + // innerText usage removed for consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[ 1 ] = match[ 1 ].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[ 3 ] = ( match[ 3 ] || match[ 4 ] || + match[ 5 ] || "" ).replace( runescape, funescape ); + + if ( match[ 2 ] === "~=" ) { + match[ 3 ] = " " + match[ 3 ] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[ 1 ] = match[ 1 ].toLowerCase(); + + if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { + + // nth-* requires argument + if ( !match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[ 4 ] = +( match[ 4 ] ? 
+ match[ 5 ] + ( match[ 6 ] || 1 ) : + 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); + match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); + + // other types prohibit arguments + } else if ( match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[ 6 ] && match[ 2 ]; + + if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[ 3 ] ) { + match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + + // Get excess from tokenize (recursively) + ( excess = tokenize( unquoted, true ) ) && + + // advance to the next closing parenthesis + ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { + + // excess is a negative index + match[ 0 ] = match[ 0 ].slice( 0, excess ); + match[ 2 ] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? 
+ function() { + return true; + } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + ( pattern = new RegExp( "(^|" + whitespace + + ")" + className + "(" + whitespace + "|$)" ) ) && classCache( + className, function( elem ) { + return pattern.test( + typeof elem.className === "string" && elem.className || + typeof elem.getAttribute !== "undefined" && + elem.getAttribute( "class" ) || + "" + ); + } ); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + /* eslint-disable max-len */ + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + /* eslint-enable max-len */ + + }; + }, + + "CHILD": function( type, what, _argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, _context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? 
"nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( ( node = node[ dir ] ) ) { + if ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( ( node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + + // Use previously-cached element index if available + if ( useCache ) { + + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 
] === dirruns && cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + + // Use the same loop as above to seek `elem` from the start + while ( ( node = ++nodeIndex && node && node[ dir ] || + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || + ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction( function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[ i ] ); + seed[ idx ] = !( matches[ idx ] = matched[ i ] ); + } + } ) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + + // Potentially complex pseudos + "not": markFunction( function( selector ) { + + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction( function( seed, matches, _context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( ( elem = unmatched[ i ] ) ) { + seed[ i ] = !( matches[ i ] = elem ); + } + } + } ) : + function( elem, _context, xml ) { + input[ 0 ] = elem; + matcher( input, null, xml, results ); + + // Don't keep the element (issue #299) + input[ 0 ] = null; + return !results.pop(); + }; + } ), + + "has": markFunction( function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + } ), + + "contains": markFunction( function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; + }; + } ), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." 
+ // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + + // lang value must be a valid identifier + if ( !ridentifier.test( lang || "" ) ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( ( elemLang = documentIsHTML ? + elem.lang : + elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); + return false; + }; + } ), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && + ( !document.hasFocus || document.hasFocus() ) && + !!( elem.type || elem.href || ~elem.tabIndex ); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return ( nodeName === "input" && !!elem.checked ) || + ( nodeName === "option" && !!elem.selected ); + }, + + "selected": function( elem ) { + + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + // eslint-disable-next-line no-unused-expressions + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but 
not by others (comment: 8; processing instruction: 7; etc.) + // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos[ "empty" ]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( ( attr = elem.getAttribute( "type" ) ) == null || + attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo( function() { + return [ 0 ]; + } ), + + "last": createPositionalPseudo( function( _matchIndexes, length ) { + return [ length - 1 ]; + } ), + + "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + } ), + + "even": createPositionalPseudo( function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "odd": createPositionalPseudo( function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? + argument + length : + argument > length ? 
+ length : + argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ) + } +}; + +Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 
0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || ( match = rcomma.exec( soFar ) ) ) { + if ( match ) { + + // Don't consume trailing commas as valid + soFar = soFar.slice( match[ 0 ].length ) || soFar; + } + groups.push( ( tokens = [] ) ); + } + + matched = false; + + // Combinators + if ( ( match = rcombinators.exec( soFar ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + + // Cast descendant combinators to space + type: match[ 0 ].replace( rtrim, " " ) + } ); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || + ( match = preFilters[ type ]( match ) ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + type: type, + matches: match + } ); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? + Sizzle.error( selector ) : + + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[ i ].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? 
+ + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || ( elem[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || + ( outerCache[ elem.uniqueID ] = {} ); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( ( oldCache = uniqueCache[ key ] ) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return ( newCache[ 2 ] = oldCache[ 2 ] ); + } else { + + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? 
+ function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[ i ]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[ 0 ]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[ i ], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( ( elem = unmatched[ i ] ) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction( function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( + selector || "*", + context.nodeType ? [ context ] : context, + [] + ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( ( elem = temp[ i ] ) ) { + matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) ) { + + // Restore matcherIn since elem is not yet a final match + temp.push( ( matcherIn[ i ] = elem ) ); + } + } + postFinder( null, ( matcherOut = [] ), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) && + ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { + + seed[ temp ] = !( results[ temp ] = elem ); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + } ); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[ 0 ].type ], + implicitRelative = leadingRelative || Expr.relative[ " " ], + i = leadingRelative ? 
1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + ( checkContext = context ).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { + matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; + } else { + matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[ j ].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens + .slice( 0, i - 1 ) + .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), + + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), + len = elems.length; + + if ( outermost ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + outermostContext = context == document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( !context && elem.ownerDocument != document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( ( matcher = elementMatchers[ j++ ] ) ) { + if ( matcher( elem, context || document, xml ) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + + // They will have gone through all possible matchers + if ( ( elem = !matcher && elem ) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. 
+ if ( bySet && i !== matchedCount ) { + j = 0; + while ( ( matcher = setMatchers[ j++ ] ) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !( unmatched[ i ] || setMatched[ i ] ) ) { + setMatched[ i ] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? + markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[ i ] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( + selector, + matcherFromGroupMatchers( elementMatchers, setMatchers ) + ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built 
with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( ( selector = compiled.selector || selector ) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[ 0 ] = match[ 0 ].slice( 0 ); + if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { + + context = ( Expr.find[ "ID" ]( token.matches[ 0 ] + .replace( runescape, funescape ), context ) || [] )[ 0 ]; + if ( !context ) { + return results; + + // Precompiled matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; + while ( i-- ) { + token = tokens[ i ]; + + // Abort if we hit a combinator + if ( Expr.relative[ ( type = token.type ) ] ) { + break; + } + if ( ( find = Expr.find[ type ] ) ) { + + // Search, expanding context for leading sibling combinators + if ( ( seed = find( + token.matches[ 0 ].replace( runescape, funescape ), + rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || + context + ) ) ) { + + // If seed is empty or no tokens remain, we can return early + tokens.splice( i, 1 ); + selector = seed.length && toSelector( tokens ); + if ( !selector ) { + push.apply( results, seed ); + return results; + } + + break; + } + } + } + } + + // Compile and execute a filtering function if one is not provided + // Provide `match` to avoid retokenization if we modified the selector above + ( compiled || compile( selector, match ) )( + seed, + context, + !documentIsHTML, + results, + !context || rsibling.test( selector ) && testContext( context.parentNode ) || context + ); + return results; +}; + +// One-time assignments + +// Sort stability +support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; + +// Support: Chrome 14-35+ +// Always assume duplicates if they aren't passed to the comparison function +support.detectDuplicates = !!hasDuplicate; + +// Initialize against the default document +setDocument(); + +// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) +// Detached nodes confoundingly follow *each other* +support.sortDetached = assert( function( el ) { + + // Should return 1, but returns 4 (following) + return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; +} ); + +// Support: IE<8 +// Prevent attribute/property "interpolation" +// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx +if ( !assert( function( el ) { + el.innerHTML = ""; + return el.firstChild.getAttribute( "href" ) === "#"; +} ) ) { + addHandle( "type|href|height|width", function( 
elem, name, isXML ) { + if ( !isXML ) { + return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); + } + } ); +} + +// Support: IE<9 +// Use defaultValue in place of getAttribute("value") +if ( !support.attributes || !assert( function( el ) { + el.innerHTML = ""; + el.firstChild.setAttribute( "value", "" ); + return el.firstChild.getAttribute( "value" ) === ""; +} ) ) { + addHandle( "value", function( elem, _name, isXML ) { + if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { + return elem.defaultValue; + } + } ); +} + +// Support: IE<9 +// Use getAttributeNode to fetch booleans when getAttribute lies +if ( !assert( function( el ) { + return el.getAttribute( "disabled" ) == null; +} ) ) { + addHandle( booleans, function( elem, name, isXML ) { + var val; + if ( !isXML ) { + return elem[ name ] === true ? name.toLowerCase() : + ( val = elem.getAttributeNode( name ) ) && val.specified ? + val.value : + null; + } + } ); +} + +return Sizzle; + +} )( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + 
+}; +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Filtered directly for both simple and complex selectors + return jQuery.filter( qualifier, elements, not ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? 
jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? + jQuery( selector ) : + selector || [], + false + ).length; + } +} ); + + +// Initialize a jQuery object + + +// A central reference to the root jQuery(document) +var rootjQuery, + + // A simple way to check for HTML strings + // Prioritize #id over to avoid XSS via location.hash (#9521) + // Strict HTML recognition (#11290: must start with <) + // Shortcut simple #id case for speed + rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, + + init = jQuery.fn.init = function( selector, context, root ) { + var match, elem; + + // HANDLE: $(""), $(null), $(undefined), $(false) + if ( !selector ) { + return this; + } + + // Method init() accepts an alternate rootjQuery + // so migrate can support jQuery.sub (gh-2101) + root = root || rootjQuery; + + // Handle HTML strings + if ( typeof selector === "string" ) { + if ( selector[ 0 ] === "<" && + selector[ selector.length - 1 ] === ">" && + selector.length >= 3 ) { + + // Assume that strings that start and end with <> are HTML and skip the regex check + match = [ null, selector, null ]; + + } else { + match = rquickExpr.exec( selector ); + } + + // Match html or make sure no context is specified for #id + if ( match && ( match[ 1 ] || !context ) ) { + + // HANDLE: $(html) -> $(array) + if ( match[ 1 ] ) { + context = context instanceof jQuery ? 
context[ 0 ] : context; + + // Option to run scripts is true for back-compat + // Intentionally let the error be thrown if parseHTML is not present + jQuery.merge( this, jQuery.parseHTML( + match[ 1 ], + context && context.nodeType ? context.ownerDocument || context : document, + true + ) ); + + // HANDLE: $(html, props) + if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { + for ( match in context ) { + + // Properties of context are called as methods if possible + if ( isFunction( this[ match ] ) ) { + this[ match ]( context[ match ] ); + + // ...and otherwise set as attributes + } else { + this.attr( match, context[ match ] ); + } + } + } + + return this; + + // HANDLE: $(#id) + } else { + elem = document.getElementById( match[ 2 ] ); + + if ( elem ) { + + // Inject the element directly into the jQuery object + this[ 0 ] = elem; + this.length = 1; + } + return this; + } + + // HANDLE: $(expr, $(...)) + } else if ( !context || context.jquery ) { + return ( context || root ).find( selector ); + + // HANDLE: $(expr, context) + // (which is just equivalent to: $(context).find(expr) + } else { + return this.constructor( context ).find( selector ); + } + + // HANDLE: $(DOMElement) + } else if ( selector.nodeType ) { + this[ 0 ] = selector; + this.length = 1; + return this; + + // HANDLE: $(function) + // Shortcut for document ready + } else if ( isFunction( selector ) ) { + return root.ready !== undefined ? 
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? 
this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? parent : null; + }, + parents: function( elem ) { + return dir( elem, "parentNode" ); + }, + parentsUntil: function( elem, _i, until ) { + return dir( elem, "parentNode", until ); + }, + next: function( elem ) { + return sibling( elem, "nextSibling" ); + }, + prev: function( elem ) { + return sibling( elem, "previousSibling" ); + }, + nextAll: function( elem ) { + return dir( elem, "nextSibling" ); + }, + prevAll: function( elem ) { + return dir( elem, "previousSibling" ); + }, + nextUntil: function( elem, _i, until ) { + return dir( elem, "nextSibling", until ); + }, + prevUntil: function( elem, _i, until ) { + return dir( elem, "previousSibling", until ); + }, + siblings: function( elem ) { + return siblings( ( elem.parentNode || {} ).firstChild, elem ); + }, + children: function( elem ) { + return siblings( elem.firstChild ); + }, + contents: function( elem ) { + if ( elem.contentDocument != null && + + // Support: IE 11+ + // elements with no `data` attribute has an object + // `contentDocument` with a `null` prototype. 
+ getProto( elem.contentDocument ) ) { + + return elem.contentDocument; + } + + // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only + // Treat the template element as a regular one in browsers that + // don't support it. + if ( nodeName( elem, "template" ) ) { + elem = elem.content || elem; + } + + return jQuery.merge( [], elem.childNodes ); + } +}, function( name, fn ) { + jQuery.fn[ name ] = function( until, selector ) { + var matched = jQuery.map( this, fn, until ); + + if ( name.slice( -5 ) !== "Until" ) { + selector = until; + } + + if ( selector && typeof selector === "string" ) { + matched = jQuery.filter( selector, matched ); + } + + if ( this.length > 1 ) { + + // Remove duplicates + if ( !guaranteedUnique[ name ] ) { + jQuery.uniqueSort( matched ); + } + + // Reverse order for parents* and prev-derivatives + if ( rparentsprev.test( name ) ) { + matched.reverse(); + } + } + + return this.pushStack( matched ); + }; +} ); +var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); + + + +// Convert String-formatted options into Object-formatted ones +function createOptions( options ) { + var object = {}; + jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { + object[ flag ] = true; + } ); + return object; +} + +/* + * Create a callback list using the following parameters: + * + * options: an optional list of space-separated options that will change how + * the callback list behaves or a more traditional option object + * + * By default a callback list will act like an event callback list and can be + * "fired" multiple times. 
+ * + * Possible options: + * + * once: will ensure the callback list can only be fired once (like a Deferred) + * + * memory: will keep track of previous values and will call any callback added + * after the list has been fired right away with the latest "memorized" + * values (like a Deferred) + * + * unique: will ensure a callback can only be added once (no duplicate in the list) + * + * stopOnFalse: interrupt callings when a callback returns false + * + */ +jQuery.Callbacks = function( options ) { + + // Convert options from String-formatted to Object-formatted if needed + // (we check in cache first) + options = typeof options === "string" ? + createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future 
add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && toType( arg ) !== "string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? 
+ jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + 
resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... .then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( _i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } 
else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? [ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = 
undefined; + args = [ returned ]; + } + + // Process the value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... ) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onRejected ) ? 
+ onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // rejected_handlers.disable + // fulfilled_handlers.disable + tuples[ 3 - i ][ 3 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock, + + // progress_handlers.lock + tuples[ 0 ][ 3 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! 
+ return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the master Deferred + master = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + master.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( master.state() === "pending" || + isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return master.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); + } + + return master.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? 
--jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( toType( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing function 
values + } else { + bulk = fn; + fn = function( elem, _key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? + value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; + + +// Matches dashed string for camelizing +var rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g; + +// Used by camelCase as callback to replace() +function fcamelCase( _all, letter ) { + return letter.toUpperCase(); +} + +// Convert dashed to camelCase; used by the css and data modules +// Support: IE <=9 - 11, Edge 12 - 15 +// Microsoft forgot to hump their vendor prefix (#9572) +function camelCase( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); +} +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. 
+ if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. 
A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( camelCase ); + } else { + key = camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. 
expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( 
arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? + this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var documentElement = document.documentElement; + + + + var isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ); + }, + composed = { composed: true }; + + // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only + // Check attachment across shadow DOM boundaries when possible (gh-3504) + // Support: iOS 10.0-10.2 only + // Early iOS 10 versions support `attachShadow` but not `getRootNode`, + // leading to errors. We need to check for `getRootNode`. 
+ if ( documentElement.getRootNode ) { + isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ) || + elem.getRootNode( composed ) === elem.ownerDocument; + }; + } +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. + isAttached( elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, scale, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = elem.nodeType && + ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Support: Firefox <=54 + // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) + initial = initial / 2; + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + while ( maxIterations-- ) { + + // Evaluate and update our best guess (doubling guesses that zero out). + // Finish if the scale equals or crosses 1 (making the old*new product non-positive). 
+ jQuery.style( elem, prop, initialInUnit + unit ); + if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { + maxIterations = 0; + } + initialInUnit = initialInUnit / scale; + + } + + initialInUnit = initialInUnit * 2; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && 
isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? this.show() : this.hide(); + } + + return this.each( function() { + if ( isHiddenWithinTree( this ) ) { + jQuery( this ).show(); + } else { + jQuery( this ).hide(); + } + } ); + } +} ); +var rcheckableType = ( /^(?:checkbox|radio)$/i ); + +var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); + +var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); + + + +( function() { + var fragment = document.createDocumentFragment(), + div = fragment.appendChild( document.createElement( "div" ) ), + input = document.createElement( "input" ); + + // Support: Android 4.0 - 4.3 only + // Check state lost if the name is set (#11217) + // Support: Windows Web Apps (WWA) + // `name` and `type` must use .setAttribute for WWA (#14901) + input.setAttribute( "type", "radio" ); + input.setAttribute( "checked", "checked" ); + input.setAttribute( "name", "t" ); + + div.appendChild( input ); + + // Support: Android <=4.1 only + // Older WebKit doesn't clone checked state correctly in fragments + support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; + + // Support: IE <=11 only + // Make sure textarea (and checkbox) defaultValue is properly cloned + div.innerHTML = ""; + support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; + + // Support: IE 
<=9 only + // IE <=9 replaces "; + support.option = !!div.lastChild; +} )(); + + +// We have to close these tags to support XHTML (#13200) +var wrapMap = { + + // XHTML parsers do not magically insert elements in the + // same way that tag soup parsers do. So we cannot shorten + // this by omitting or other required elements. + thead: [ 1, "", "
" ], + col: [ 2, "", "
" ], + tr: [ 2, "", "
" ], + td: [ 3, "", "
" ], + + _default: [ 0, "", "" ] +}; + +wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; +wrapMap.th = wrapMap.td; + +// Support: IE <=9 only +if ( !support.option ) { + wrapMap.optgroup = wrapMap.option = [ 1, "" ]; +} + + +function getAll( context, tag ) { + + // Support: IE <=9 - 11 only + // Use typeof to avoid zero-argument method invocation on host objects (#15151) + var ret; + + if ( typeof context.getElementsByTagName !== "undefined" ) { + ret = context.getElementsByTagName( tag || "*" ); + + } else if ( typeof context.querySelectorAll !== "undefined" ) { + ret = context.querySelectorAll( tag || "*" ); + + } else { + ret = []; + } + + if ( tag === undefined || tag && nodeName( context, tag ) ) { + return jQuery.merge( [ context ], ret ); + } + + return ret; +} + + +// Mark scripts as having already been evaluated +function setGlobalEval( elems, refElements ) { + var i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + dataPriv.set( + elems[ i ], + "globalEval", + !refElements || dataPriv.get( refElements[ i ], "globalEval" ) + ); + } +} + + +var rhtml = /<|&#?\w+;/; + +function buildFragment( elems, context, scripts, selection, ignored ) { + var elem, tmp, tag, wrap, attached, j, + fragment = context.createDocumentFragment(), + nodes = [], + i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + elem = elems[ i ]; + + if ( elem || elem === 0 ) { + + // Add nodes directly + if ( toType( elem ) === "object" ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, elem.nodeType ? 
[ elem ] : elem ); + + // Convert non-html into a text node + } else if ( !rhtml.test( elem ) ) { + nodes.push( context.createTextNode( elem ) ); + + // Convert html into DOM nodes + } else { + tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); + + // Deserialize a standard representation + tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); + wrap = wrapMap[ tag ] || wrapMap._default; + tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; + + // Descend through wrappers to the right content + j = wrap[ 0 ]; + while ( j-- ) { + tmp = tmp.lastChild; + } + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, tmp.childNodes ); + + // Remember the top-level container + tmp = fragment.firstChild; + + // Ensure the created nodes are orphaned (#12392) + tmp.textContent = ""; + } + } + } + + // Remove wrapper from fragment + fragment.textContent = ""; + + i = 0; + while ( ( elem = nodes[ i++ ] ) ) { + + // Skip elements already in the context collection (trac-4087) + if ( selection && jQuery.inArray( elem, selection ) > -1 ) { + if ( ignored ) { + ignored.push( elem ); + } + continue; + } + + attached = isAttached( elem ); + + // Append to fragment + tmp = getAll( fragment.appendChild( elem ), "script" ); + + // Preserve script evaluation history + if ( attached ) { + setGlobalEval( tmp ); + } + + // Capture executables + if ( scripts ) { + j = 0; + while ( ( elem = tmp[ j++ ] ) ) { + if ( rscriptType.test( elem.type || "" ) ) { + scripts.push( elem ); + } + } + } + } + + return fragment; +} + + +var + rkeyEvent = /^key/, + rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, + rtypenamespace = /^([^.]*)(?:\.(.+)|)/; + +function returnTrue() { + return true; +} + +function returnFalse() { + return false; +} + +// Support: IE <=9 - 11+ +// focus() and blur() are asynchronous, except when they are no-op. 
+// So expect focus to be synchronous when the element is already active, +// and blur to be synchronous when the element is not already active. +// (focus and blur are always synchronous in other supported browsers, +// this just defines when we can count on it). +function expectSync( elem, type ) { + return ( elem === safeActiveElement() ) === ( type === "focus" ); +} + +// Support: IE <=9 only +// Accessing document.activeElement can throw unexpectedly +// https://bugs.jquery.com/ticket/13393 +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. 
 * Props to Dean Edwards' addEvent library for many of the ideas.
 */
jQuery.event = {

	// Event types that have had a handler attached at least once (set in add)
	global: {},

	// Attach `handler` to `elem` for every whitespace-separated
	// "type.namespace" entry in `types`.
	//   handler  - a function, or an object with a `handler` property (its
	//              `selector` then replaces the `selector` argument and its
	//              other properties are merged into the stored handleObj)
	//   data     - stored on the handleObj and exposed as event.data in dispatch
	//   selector - when set, the handler is stored as a delegated handler
	// Handler objects live in the element's private data under `events[ type ]`,
	// delegated ones first (counted by `delegateCount`).
	add: function( elem, types, handler, data, selector ) {

		var handleObjIn, eventHandle, tmp,
			events, t, handleObj,
			special, handlers, type, namespaces, origType,
			elemData = dataPriv.get( elem );

		// Only attach events to objects that accept data
		if ( !acceptData( elem ) ) {
			return;
		}

		// Caller can pass in an object of custom data in lieu of the handler
		if ( handler.handler ) {
			handleObjIn = handler;
			handler = handleObjIn.handler;
			selector = handleObjIn.selector;
		}

		// Ensure that invalid selectors throw exceptions at attach time
		// Evaluate against documentElement in case elem is a non-element node (e.g., document)
		if ( selector ) {
			jQuery.find.matchesSelector( documentElement, selector );
		}

		// Make sure that the handler has a unique ID, used to find/remove it later
		if ( !handler.guid ) {
			handler.guid = jQuery.guid++;
		}

		// Init the element's event structure and main handler, if this is the first
		if ( !( events = elemData.events ) ) {
			events = elemData.events = Object.create( null );
		}
		if ( !( eventHandle = elemData.handle ) ) {
			eventHandle = elemData.handle = function( e ) {

				// Discard the second event of a jQuery.event.trigger() and
				// when an event is called after a page has unloaded
				return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ?
					jQuery.event.dispatch.apply( elem, arguments ) : undefined;
			};
		}

		// Handle multiple events separated by a space
		types = ( types || "" ).match( rnothtmlwhite ) || [ "" ];
		t = types.length;
		while ( t-- ) {
			tmp = rtypenamespace.exec( types[ t ] ) || [];
			type = origType = tmp[ 1 ];
			namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort();

			// There *must* be a type, no attaching namespace-only handlers
			if ( !type ) {
				continue;
			}

			// If event changes its type, use the special event handlers for the changed type
			special = jQuery.event.special[ type ] || {};

			// If selector defined, determine special event api type, otherwise given type
			type = ( selector ? special.delegateType : special.bindType ) || type;

			// Update special based on newly reset type
			special = jQuery.event.special[ type ] || {};

			// handleObj is passed to all event handlers
			handleObj = jQuery.extend( {
				type: type,
				origType: origType,
				data: data,
				handler: handler,
				guid: handler.guid,
				selector: selector,
				needsContext: selector && jQuery.expr.match.needsContext.test( selector ),
				namespace: namespaces.join( "." )
			}, handleObjIn );

			// Init the event handler queue if we're the first
			if ( !( handlers = events[ type ] ) ) {
				handlers = events[ type ] = [];
				handlers.delegateCount = 0;

				// Only use addEventListener if the special events handler returns false
				if ( !special.setup ||
					special.setup.call( elem, data, namespaces, eventHandle ) === false ) {

					if ( elem.addEventListener ) {
						elem.addEventListener( type, eventHandle );
					}
				}
			}

			if ( special.add ) {
				special.add.call( elem, handleObj );

				// special.add may have swapped in a handler that has no guid yet
				if ( !handleObj.handler.guid ) {
					handleObj.handler.guid = handler.guid;
				}
			}

			// Add to the element's handler list, delegates in front
			if ( selector ) {
				handlers.splice( handlers.delegateCount++, 0, handleObj );
			} else {
				handlers.push( handleObj );
			}

			// Keep track of which events have ever been used, for event optimization
			jQuery.event.global[ type ] = true;
		}

	},

	// Detach an event or set of events from an element
	//   types       - whitespace-separated "type.namespace" entries; an entry
	//                 without a type removes matching handlers of every type
	//   handler     - when given, only handlers with the same guid are removed
	//   selector    - when given, only handlers with that selector are removed;
	//                 "**" matches any delegated handler
	//   mappedTypes - truthy on the internal recursive call; skips the origType
	//                 comparison
	remove: function( elem, types, handler, selector, mappedTypes ) {

		var j, origCount, tmp,
			events, t, handleObj,
			special, handlers, type, namespaces, origType,
			elemData = dataPriv.hasData( elem ) && dataPriv.get( elem );

		if ( !elemData || !( events = elemData.events ) ) {
			return;
		}

		// Once for each type.namespace in types; type may be omitted
		types = ( types || "" ).match( rnothtmlwhite ) || [ "" ];
		t = types.length;
		while ( t-- ) {
			tmp = rtypenamespace.exec( types[ t ] ) || [];
			type = origType = tmp[ 1 ];
			namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort();

			// Unbind all events (on this namespace, if provided) for the element
			if ( !type ) {
				for ( type in events ) {
					jQuery.event.remove( elem, type + types[ t ], handler, selector, true );
				}
				continue;
			}

			special = jQuery.event.special[ type ] || {};
			type = ( selector ? special.delegateType : special.bindType ) || type;
			handlers = events[ type ] || [];

			// From here on tmp is either falsy (no namespace given) or a RegExp
			// tested against each handler's sorted, dot-joined namespace string
			tmp = tmp[ 2 ] &&
				new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" );

			// Remove matching events
			origCount = j = handlers.length;
			while ( j-- ) {
				handleObj = handlers[ j ];

				if ( ( mappedTypes || origType === handleObj.origType ) &&
					( !handler || handler.guid === handleObj.guid ) &&
					( !tmp || tmp.test( handleObj.namespace ) ) &&
					( !selector || selector === handleObj.selector ||
						selector === "**" && handleObj.selector ) ) {
					handlers.splice( j, 1 );

					if ( handleObj.selector ) {
						handlers.delegateCount--;
					}
					if ( special.remove ) {
						special.remove.call( elem, handleObj );
					}
				}
			}

			// Remove generic event handler if we removed something and no more handlers exist
			// (avoids potential for endless recursion during removal of special event handlers)
			if ( origCount && !handlers.length ) {
				if ( !special.teardown ||
					special.teardown.call( elem, namespaces, elemData.handle ) === false ) {

					jQuery.removeEvent( elem, type, elemData.handle );
				}

				delete events[ type ];
			}
		}

		// Remove data and the expando if it's no longer used
		if ( jQuery.isEmptyObject( events ) ) {
			dataPriv.remove( elem, "handle events" );
		}
	},

	// Run the handlers stored for `this` element for an incoming event.
	// Called with the element as `this`; any extra arguments are forwarded to
	// the handlers after the event object. Returns event.result, or undefined
	// when a preDispatch hook returns false.
	dispatch: function( nativeEvent ) {

		var i, j, ret, matched, handleObj, handlerQueue,
			args = new Array( arguments.length ),

			// Make a writable jQuery.Event from the native event object
			event = jQuery.event.fix( nativeEvent ),

			handlers = (
					dataPriv.get( this, "events" ) || Object.create( null )
				)[ event.type ] || [],
			special = jQuery.event.special[ event.type ] || {};

		// Use the fix-ed jQuery.Event rather than the (read-only) native event
		args[ 0 ] = event;

		for ( i = 1; i < arguments.length; i++ ) {
			args[ i ] = arguments[ i ];
		}

		event.delegateTarget = this;

		// Call the preDispatch hook for the mapped type, and let it bail if desired
		if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) {
			return;
		}

		// Determine handlers
		handlerQueue = jQuery.event.handlers.call( this, event, handlers );

		// Run delegates first; they may want to stop propagation beneath us
		i = 0;
		while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) {
			event.currentTarget = matched.elem;

			j = 0;
			while ( ( handleObj = matched.handlers[ j++ ] ) &&
				!event.isImmediatePropagationStopped() ) {

				// If the event is namespaced, then each handler is only invoked if it is
				// specially universal or its namespaces are a superset of the event's.
				if ( !event.rnamespace || handleObj.namespace === false ||
					event.rnamespace.test( handleObj.namespace ) ) {

					event.handleObj = handleObj;
					event.data = handleObj.data;

					// A special `handle` hook for the original type takes
					// precedence over the bound handler itself
					ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle ||
						handleObj.handler ).apply( matched.elem, args );

					// A handler returning false both prevents the default and
					// stops propagation
					if ( ret !== undefined ) {
						if ( ( event.result = ret ) === false ) {
							event.preventDefault();
							event.stopPropagation();
						}
					}
				}
			}
		}

		// Call the postDispatch hook for the mapped type
		if ( special.postDispatch ) {
			special.postDispatch.call( this, event );
		}

		return event.result;
	},

	// Build the ordered queue of { elem, handlers } entries for `event`.
	// Called with the bound element as `this`. Delegated handlers are matched
	// against each node from event.target up to (excluding) `this`; directly
	// bound handlers are appended last with `this` as their elem.
	handlers: function( event, handlers ) {
		var i, handleObj, sel, matchedHandlers, matchedSelectors,
			handlerQueue = [],
			delegateCount = handlers.delegateCount,
			cur = event.target;

		// Find delegate handlers
		if ( delegateCount &&

			// Support: IE <=9
			// Black-hole SVG <use> instance trees (trac-13180)
			cur.nodeType &&

			// Support: Firefox <=42
			// Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861)
			// https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click
			// Support: IE 11 only
			// ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343)
			!( event.type === "click" && event.button >= 1 ) ) {

			for ( ; cur !== this; cur = cur.parentNode || this ) {

				// Don't check non-elements (#13208)
				// Don't process clicks on disabled elements (#6911, #8165, #11382, #11764)
				if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) {
					matchedHandlers = [];

					// Per-node cache so each distinct selector is evaluated once
					matchedSelectors = {};
					for ( i = 0; i < delegateCount; i++ ) {
						handleObj = handlers[ i ];

						// Don't conflict with Object.prototype properties (#13203)
						sel = handleObj.selector + " ";

						if ( matchedSelectors[ sel ] === undefined ) {
							matchedSelectors[ sel ] = handleObj.needsContext ?
								jQuery( sel, this ).index( cur ) > -1 :
								jQuery.find( sel, this, null, [ cur ] ).length;
						}
						if ( matchedSelectors[ sel ] ) {
							matchedHandlers.push( handleObj );
						}
					}
					if ( matchedHandlers.length ) {
						handlerQueue.push( { elem: cur, handlers: matchedHandlers } );
					}
				}
			}
		}

		// Add the remaining (directly-bound) handlers
		cur = this;
		if ( delegateCount < handlers.length ) {
			handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } );
		}

		return handlerQueue;
	},

	// Define accessor `name` on jQuery.Event.prototype. The getter reads from
	// this.originalEvent (through `hook( originalEvent )` when `hook` is a
	// function, otherwise `originalEvent[ name ]`) and yields undefined when
	// there is no originalEvent. Assigning to the property replaces the
	// accessor on that instance with a plain writable value.
	addProp: function( name, hook ) {
		Object.defineProperty( jQuery.Event.prototype, name, {
			enumerable: true,
			configurable: true,

			get: isFunction( hook ) ?
				function() {
					if ( this.originalEvent ) {
							return hook( this.originalEvent );
					}
				} :
				function() {
					if ( this.originalEvent ) {
							return this.originalEvent[ name ];
					}
				},

			set: function( value ) {
				Object.defineProperty( this, name, {
					enumerable: true,
					configurable: true,
					writable: true,
					value: value
				} );
			}
		} );
	},

	// Return `originalEvent` unchanged when it already carries the
	// jQuery.expando marker (set by the jQuery.Event constructor); otherwise
	// wrap it in a new jQuery.Event
	fix: function( originalEvent ) {
		return originalEvent[ jQuery.expando ] ?
			originalEvent :
			new jQuery.Event( originalEvent );
	},

	// Per-type hooks consulted by add, remove and dispatch above
	special: {
		load: {

			// Prevent triggered image.load events from bubbling to window.load
			noBubble: true
		},
		click: {

			// Utilize native event to ensure correct state for checkable inputs
			setup: function( data ) {

				// For mutual compressibility with _default, replace `this` access with a local var.
				// `|| data` is dead code meant only to preserve the variable through minification.
				var el = this || data;

				// Claim the first handler
				if ( rcheckableType.test( el.type ) &&
					el.click && nodeName( el, "input" ) ) {

					// dataPriv.set( el, "click", ... )
					leverageNative( el, "click", returnTrue );
				}

				// Return false to allow normal processing in the caller
				return false;
			},
			trigger: function( data ) {

				// For mutual compressibility with _default, replace `this` access with a local var.
				// `|| data` is dead code meant only to preserve the variable through minification.
				var el = this || data;

				// Force setup before triggering a click
				if ( rcheckableType.test( el.type ) &&
					el.click && nodeName( el, "input" ) ) {

					leverageNative( el, "click" );
				}

				// Return non-false to allow normal event-path propagation
				return true;
			},

			// For cross-browser consistency, suppress native .click() on links
			// Also prevent it if we're currently inside a leveraged native-event stack
			_default: function( event ) {
				var target = event.target;
				return rcheckableType.test( target.type ) &&
					target.click && nodeName( target, "input" ) &&
					dataPriv.get( target, "click" ) ||
					nodeName( target, "a" );
			}
		},

		beforeunload: {
			postDispatch: function( event ) {

				// Support: Firefox 20+
				// Firefox doesn't alert if the returnValue field is not set.
				if ( event.result !== undefined && event.originalEvent ) {
					event.originalEvent.returnValue = event.result;
				}
			}
		}
	}
};

// Ensure the presence of an event listener that handles manually-triggered
// synthetic events by interrupting progress until reinvoked in response to
// *native* events that it fires directly, ensuring that state changes have
// already occurred before other listeners are invoked.
// Parameters:
//   el         - element whose native `type` method (el[ type ]()) is leveraged
//   type       - event type; also the private-data key holding controller state
//   expectSync - predicate ( elem, type ) telling whether the native call is
//                expected to complete synchronously; omit it for a trigger call
// State under dataPriv[ type ] as written by this function: `false` while
// idle, the saved arguments array while a synthetic trigger is waiting on the
// native event, and a `{ value: ... }` object once the inner trigger has run.
function leverageNative( el, type, expectSync ) {

	// Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add
	if ( !expectSync ) {
		if ( dataPriv.get( el, type ) === undefined ) {
			jQuery.event.add( el, type, returnTrue );
		}
		return;
	}

	// Register the controller as a special universal handler for all event namespaces
	dataPriv.set( el, type, false );
	jQuery.event.add( el, type, {
		namespace: false,
		handler: function( event ) {
			var notAsync, result,
				saved = dataPriv.get( this, type );

			if ( ( event.isTrigger & 1 ) && this[ type ] ) {

				// Interrupt processing of the outer synthetic .trigger()ed event
				// Saved data should be false in such cases, but might be a leftover capture object
				// from an async native handler (gh-4350)
				if ( !saved.length ) {

					// Store arguments for use when handling the inner native event
					// There will always be at least one argument (an event object), so this array
					// will not be confused with a leftover capture object.
					saved = slice.call( arguments );
					dataPriv.set( this, type, saved );

					// Trigger the native event and capture its result
					// Support: IE <=9 - 11+
					// focus() and blur() are asynchronous
					notAsync = expectSync( this, type );
					this[ type ]();
					result = dataPriv.get( this, type );

					// Reset to idle if the native handler ran (stored data changed)
					// or the call was expected to be synchronous; otherwise leave
					// the saved arguments in place and use an empty result
					if ( saved !== result || notAsync ) {
						dataPriv.set( this, type, false );
					} else {
						result = {};
					}
					if ( saved !== result ) {

						// Cancel the outer synthetic event
						event.stopImmediatePropagation();
						event.preventDefault();
						return result.value;
					}

				// If this is an inner synthetic event for an event with a bubbling surrogate
				// (focus or blur), assume that the surrogate already propagated from triggering the
				// native event and prevent that from happening again here.
				// This technically gets the ordering wrong w.r.t. to `.trigger()` (in which the
				// bubbling surrogate propagates *after* the non-bubbling base), but that seems
				// less bad than duplication.
				} else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) {
					event.stopPropagation();
				}

			// If this is a native event triggered above, everything is now in order
			// Fire an inner synthetic event with the original arguments
			} else if ( saved.length ) {

				// ...and capture the result
				dataPriv.set( this, type, {
					value: jQuery.event.trigger(

						// Support: IE <=9 - 11+
						// Extend with the prototype to reset the above stopImmediatePropagation()
						jQuery.extend( saved[ 0 ], jQuery.Event.prototype ),
						saved.slice( 1 ),
						this
					)
				} );

				// Abort handling of the native event
				event.stopImmediatePropagation();
			}
		}
	} );
}

// Remove the native listener `handle` for `type` from `elem`, when `elem`
// supports removeEventListener
jQuery.removeEvent = function( elem, type, handle ) {

	// This "if" is needed for plain objects
	if ( elem.removeEventListener ) {
		elem.removeEventListener( type, handle );
	}
};

// jQuery.Event constructor.
//   src   - a native event (any object with a `type` property), or an event
//           type used directly as `this.type`
//   props - optional properties copied onto the new event
jQuery.Event = function( src, props ) {

	// Allow instantiation without the 'new' keyword
	if ( !( this instanceof jQuery.Event ) ) {
		return new jQuery.Event( src, props );
	}

	// Event object
	if ( src && src.type ) {
		this.originalEvent = src;
		this.type = src.type;

		// Events bubbling up the document may have been marked as prevented
		// by a handler lower down the tree; reflect the correct value.
		this.isDefaultPrevented = src.defaultPrevented ||
				src.defaultPrevented === undefined &&

				// Support: Android <=2.3 only
				src.returnValue === false ?
			returnTrue :
			returnFalse;

		// Create target properties
		// Support: Safari <=6 - 7 only
		// Target should not be a text node (#504, #13143)
		this.target = ( src.target && src.target.nodeType === 3 ) ?
			src.target.parentNode :
			src.target;

		this.currentTarget = src.currentTarget;
		this.relatedTarget = src.relatedTarget;

	// Event type
	} else {
		this.type = src;
	}

	// Put explicitly provided properties onto the event object
	if ( props ) {
		jQuery.extend( this, props );
	}

	// Create a timestamp if incoming event doesn't have one
	this.timeStamp = src && src.timeStamp || Date.now();

	// Mark it as fixed
	this[ jQuery.expando ] = true;
};

// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding
// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html
jQuery.Event.prototype = {
	constructor: jQuery.Event,

	// State flags are functions: each method below swaps in returnTrue
	isDefaultPrevented: returnFalse,
	isPropagationStopped: returnFalse,
	isImmediatePropagationStopped: returnFalse,
	isSimulated: false,

	// Each method records the state on the jQuery.Event and forwards the call
	// to the original event, unless there is none or the event is simulated
	preventDefault: function() {
		var e = this.originalEvent;

		this.isDefaultPrevented = returnTrue;

		if ( e && !this.isSimulated ) {
			e.preventDefault();
		}
	},
	stopPropagation: function() {
		var e = this.originalEvent;

		this.isPropagationStopped = returnTrue;

		if ( e && !this.isSimulated ) {
			e.stopPropagation();
		}
	},
	stopImmediatePropagation: function() {
		var e = this.originalEvent;

		this.isImmediatePropagationStopped = returnTrue;

		if ( e && !this.isSimulated ) {
			e.stopImmediatePropagation();
		}

		// Stopping immediate propagation also stops regular propagation
		this.stopPropagation();
	}
};

// Includes all common event props including KeyEvent and MouseEvent specific props
// Every key is registered through jQuery.event.addProp; a value of `true`
// is not a function, so addProp falls back to reading originalEvent[ name ]
jQuery.each( {
	altKey: true,
	bubbles: true,
	cancelable: true,
	changedTouches: true,
	ctrlKey: true,
	detail: true,
	eventPhase: true,
	metaKey: true,
	pageX: true,
	pageY: true,
	shiftKey: true,
	view: true,
	"char": true,
	code: true,
	charCode: true,
	key: true,
	keyCode: true,
	button: true,
	buttons: true,
	clientX: true,
	clientY: true,
	offsetX: true,
	offsetY: true,
	pointerId: true,
	pointerType: true,
	screenX: true,
	screenY: true,
	targetTouches: true,
	toElement: true,
	touches: true,

	which: function( event ) {
		var button = event.button;

		// Add which for key events
		if ( event.which == null && rkeyEvent.test( event.type ) ) {
			return event.charCode != null ? event.charCode : event.keyCode;
		}

		// Add which for click: 1 === left; 2 === middle; 3 === right
		if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) {
			if ( button & 1 ) {
				return 1;
			}

			if ( button & 2 ) {
				return 3;
			}

			if ( button & 4 ) {
				return 2;
			}

			return 0;
		}

		return event.which;
	}
}, jQuery.event.addProp );

// Register focus/blur special events; delegated bindings use the
// focusin/focusout types given by the map below
jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) {
	jQuery.event.special[ type ] = {

		// Utilize native event if possible so blur/focus sequence is correct
		setup: function() {

			// Claim the first handler
			// dataPriv.set( this, "focus", ... )
			// dataPriv.set( this, "blur", ... )
			leverageNative( this, type, expectSync );

			// Return false to allow normal processing in the caller
			return false;
		},
		trigger: function() {

			// Force setup before trigger
			leverageNative( this, type );

			// Return non-false to allow normal event-path propagation
			return true;
		},

		delegateType: delegateType
	};
} );

// Create mouseenter/leave events using mouseover/out and event-time checks
// so that event delegation works in jQuery.
// Do the same for pointerenter/pointerleave and pointerover/pointerout
//
// Support: Safari 7 only
// Safari sends mouseenter too often; see:
// https://bugs.chromium.org/p/chromium/issues/detail?id=470258
// for the description of the bug (it existed in older Chrome versions as well).
+jQuery.each( { + mouseenter: "mouseover", + mouseleave: "mouseout", + pointerenter: "pointerover", + pointerleave: "pointerout" +}, function( orig, fix ) { + jQuery.event.special[ orig ] = { + delegateType: fix, + bindType: fix, + + handle: function( event ) { + var ret, + target = this, + related = event.relatedTarget, + handleObj = event.handleObj; + + // For mouseenter/leave call the handler if related is outside the target. + // NB: No relatedTarget if the mouse left/entered the browser window + if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { + event.type = handleObj.origType; + ret = handleObj.handler.apply( this, arguments ); + event.type = fix; + } + return ret; + } + }; +} ); + +jQuery.fn.extend( { + + on: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn ); + }, + one: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn, 1 ); + }, + off: function( types, selector, fn ) { + var handleObj, type; + if ( types && types.preventDefault && types.handleObj ) { + + // ( event ) dispatched jQuery.Event + handleObj = types.handleObj; + jQuery( types.delegateTarget ).off( + handleObj.namespace ? + handleObj.origType + "." + handleObj.namespace : + handleObj.origType, + handleObj.selector, + handleObj.handler + ); + return this; + } + if ( typeof types === "object" ) { + + // ( types-object [, selector] ) + for ( type in types ) { + this.off( type, selector, types[ type ] ); + } + return this; + } + if ( selector === false || typeof selector === "function" ) { + + // ( types [, fn] ) + fn = selector; + selector = undefined; + } + if ( fn === false ) { + fn = returnFalse; + } + return this.each( function() { + jQuery.event.remove( this, types, fn, selector ); + } ); + } +} ); + + +var + + // Support: IE <=10 - 11, Edge 12 - 13 only + // In IE/Edge using regex groups here causes severe slowdowns. 
+ // See https://connect.microsoft.com/IE/feedback/details/1736512/ + rnoInnerhtml = /\s*$/g; + +// Prefer a tbody over its parent table for containing new rows +function manipulationTarget( elem, content ) { + if ( nodeName( elem, "table" ) && + nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { + + return jQuery( elem ).children( "tbody" )[ 0 ] || elem; + } + + return elem; +} + +// Replace/restore the type attribute of script elements for safe DOM manipulation +function disableScript( elem ) { + elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; + return elem; +} +function restoreScript( elem ) { + if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { + elem.type = elem.type.slice( 5 ); + } else { + elem.removeAttribute( "type" ); + } + + return elem; +} + +function cloneCopyEvent( src, dest ) { + var i, l, type, pdataOld, udataOld, udataCur, events; + + if ( dest.nodeType !== 1 ) { + return; + } + + // 1. Copy private data: events, handlers, etc. + if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.get( src ); + events = pdataOld.events; + + if ( events ) { + dataPriv.remove( dest, "handle events" ); + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2. Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. 
+ if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = flat( args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + valueIsFunction = isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( valueIsFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( valueIsFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). 
+ for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl && !node.noModule ) { + jQuery._evalUrl( node.src, { + nonce: node.nonce || node.getAttribute( "nonce" ) + }, doc ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? 
jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && isAttached( node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html; + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = isAttached( elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( 
elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? + jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // 
Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = 
function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + +var swap = function( elem, options, callback ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.call( elem ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + +var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. 
+ function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + container.style.cssText = "position:absolute;left:-11111px;width:60px;" + + "margin-top:1px;padding:0;border:0"; + div.style.cssText = + "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + + "margin:auto;border:1px;padding:1px;" + + "width:60%;top:1%"; + documentElement.appendChild( container ).appendChild( div ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; + + // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 + // Some styles come back with percentage values, even though they shouldn't + div.style.right = "60%"; + pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; + + // Support: IE 9 - 11 only + // Detect misreporting of content dimensions for box-sizing:border-box elements + boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; + + // Support: IE 9 only + // Detect overflow:scroll screwiness (gh-3699) + // Support: Chrome <=64 + // Don't get tricked when zoom affects offsetWidth (gh-4029) + div.style.position = "absolute"; + scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + function roundPixelMeasures( measure ) { + return Math.round( parseFloat( measure ) ); + } + + var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, + reliableTrDimensionsVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + 
// Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + jQuery.extend( support, { + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelBoxStyles: function() { + computeStyleTests(); + return pixelBoxStylesVal; + }, + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + }, + scrollboxSize: function() { + computeStyleTests(); + return scrollboxSizeVal; + }, + + // Support: IE 9 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Behavior in IE 9 is more subtle than in newer versions & it passes + // some versions of this test; make sure not to make it pass there! 
+ reliableTrDimensions: function() { + var table, tr, trChild, trStyle; + if ( reliableTrDimensionsVal == null ) { + table = document.createElement( "table" ); + tr = document.createElement( "tr" ); + trChild = document.createElement( "div" ); + + table.style.cssText = "position:absolute;left:-11111px"; + tr.style.height = "1px"; + trChild.style.height = "9px"; + + documentElement + .appendChild( table ) + .appendChild( tr ) + .appendChild( trChild ); + + trStyle = window.getComputedStyle( tr ); + reliableTrDimensionsVal = parseInt( trStyle.height ) > 3; + + documentElement.removeChild( table ); + } + return reliableTrDimensionsVal; + } + } ); +} )(); + + +function curCSS( elem, name, computed ) { + var width, minWidth, maxWidth, ret, + + // Support: Firefox 51+ + // Retrieving style before computed somehow + // fixes an issue with getting wrong values + // on detached elements + style = elem.style; + + computed = computed || getStyles( elem ); + + // getPropertyValue is needed for: + // .css('filter') (IE 9 only, #12537) + // .css('--customProperty) (#3144) + if ( computed ) { + ret = computed.getPropertyValue( name ) || computed[ name ]; + + if ( ret === "" && !isAttached( elem ) ) { + ret = jQuery.style( elem, name ); + } + + // A tribute to the "awesome hack by Dean Edwards" + // Android Browser returns percentage for some values, + // but width seems to be reliably pixels. + // This is against the CSSOM draft spec: + // https://drafts.csswg.org/cssom/#resolved-values + if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { + + // Remember the original values + width = style.width; + minWidth = style.minWidth; + maxWidth = style.maxWidth; + + // Put in the new values to get a computed value out + style.minWidth = style.maxWidth = style.width = ret; + ret = computed.width; + + // Revert the changed values + style.width = width; + style.minWidth = minWidth; + style.maxWidth = maxWidth; + } + } + + return ret !== undefined ? 
+ + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. + return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style, + vendorProps = {}; + +// Return a vendor-prefixed property or undefined +function vendorPropName( name ) { + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a potentially-mapped jQuery.cssProps or vendor prefixed property +function finalPropName( name ) { + var final = jQuery.cssProps[ name ] || vendorProps[ name ]; + + if ( final ) { + return final; + } + if ( name in emptyStyle ) { + return name; + } + return vendorProps[ name ] = vendorPropName( name ) || name; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }; + +function setPositiveNumber( _elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? 
+ + // Guard against undefined "subtract", e.g., when used as in cssHooks + Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : + value; +} + +function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { + var i = dimension === "width" ? 1 : 0, + extra = 0, + delta = 0; + + // Adjustment may not be necessary + if ( box === ( isBorderBox ? "border" : "content" ) ) { + return 0; + } + + for ( ; i < 4; i += 2 ) { + + // Both box models exclude margin + if ( box === "margin" ) { + delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); + } + + // If we get here with a content-box, we're seeking "padding" or "border" or "margin" + if ( !isBorderBox ) { + + // Add padding + delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + + // For "border" or "margin", add border + if ( box !== "padding" ) { + delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + + // But still keep track of it otherwise + } else { + extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + + // If we get here with a border-box (content + padding + border), we're seeking "content" or + // "padding" or "margin" + } else { + + // For "content", subtract padding + if ( box === "content" ) { + delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + } + + // For "content" or "padding", subtract border + if ( box !== "margin" ) { + delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } + } + + // Account for positive content-box scroll gutter when requested by providing computedVal + if ( !isBorderBox && computedVal >= 0 ) { + + // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border + // Assuming integer scroll gutter, subtract the rest and round down + delta += Math.max( 0, Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + computedVal - + delta - + 
extra - + 0.5 + + // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter + // Use an explicit zero to avoid NaN (gh-3964) + ) ) || 0; + } + + return delta; +} + +function getWidthOrHeight( elem, dimension, extra ) { + + // Start with computed style + var styles = getStyles( elem ), + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). + // Fake content-box until we know it's needed to know the true value. + boxSizingNeeded = !support.boxSizingReliable() || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + valueIsBorderBox = isBorderBox, + + val = curCSS( elem, dimension, styles ), + offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); + + // Support: Firefox <=54 + // Return a confounding non-pixel value or feign ignorance, as appropriate. + if ( rnumnonpx.test( val ) ) { + if ( !extra ) { + return val; + } + val = "auto"; + } + + + // Support: IE 9 - 11 only + // Use offsetWidth/offsetHeight for when box sizing is unreliable. + // In those cases, the computed value can be trusted to be border-box. + if ( ( !support.boxSizingReliable() && isBorderBox || + + // Support: IE 10 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
+ !support.reliableTrDimensions() && nodeName( elem, "tr" ) || + + // Fall back to offsetWidth/offsetHeight when value is "auto" + // This happens for inline elements with no explicit setting (gh-3571) + val === "auto" || + + // Support: Android <=4.1 - 4.3 only + // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) + !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && + + // Make sure the element is visible & connected + elem.getClientRects().length ) { + + isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; + + // Where available, offsetWidth/offsetHeight approximate border box dimensions. + // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the + // retrieved value as a content box dimension. + valueIsBorderBox = offsetProp in elem; + if ( valueIsBorderBox ) { + val = elem[ offsetProp ]; + } + } + + // Normalize "" and auto + val = parseFloat( val ) || 0; + + // Adjust for the element's box model + return ( val + + boxModelAdjustment( + elem, + dimension, + extra || ( isBorderBox ? "border" : "content" ), + valueIsBorderBox, + styles, + + // Provide the current computed size to request scroll gutter calculation (gh-3589) + val + ) + ) + "px"; +} + +jQuery.extend( { + + // Add in style property hooks for overriding the default + // behavior of getting and setting a style property + cssHooks: { + opacity: { + get: function( elem, computed ) { + if ( computed ) { + + // We should always get a number back from opacity + var ret = curCSS( elem, "opacity" ); + return ret === "" ? 
"1" : ret; + } + } + } + }, + + // Don't automatically add "px" to these possibly-unitless properties + cssNumber: { + "animationIterationCount": true, + "columnCount": true, + "fillOpacity": true, + "flexGrow": true, + "flexShrink": true, + "fontWeight": true, + "gridArea": true, + "gridColumn": true, + "gridColumnEnd": true, + "gridColumnStart": true, + "gridRow": true, + "gridRowEnd": true, + "gridRowStart": true, + "lineHeight": true, + "opacity": true, + "order": true, + "orphans": true, + "widows": true, + "zIndex": true, + "zoom": true + }, + + // Add in properties whose names you wish to fix before + // setting or getting the value + cssProps: {}, + + // Get and set the style property on a DOM Node + style: function( elem, name, value, extra ) { + + // Don't set styles on text and comment nodes + if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { + return; + } + + // Make sure that we're working with the right name + var ret, type, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ), + style = elem.style; + + // Make sure that we're working with the right name. We don't + // want to query the value if it is a CSS custom property + // since they are user-defined. 
+ if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Gets hook for the prefixed version, then unprefixed version + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // Check if we're setting a value + if ( value !== undefined ) { + type = typeof value; + + // Convert "+=" or "-=" to relative numbers (#7345) + if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { + value = adjustCSS( elem, name, ret ); + + // Fixes bug #9237 + type = "number"; + } + + // Make sure that null and NaN values aren't set (#7116) + if ( value == null || value !== value ) { + return; + } + + // If a number was passed in, add the unit (except for certain CSS properties) + // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append + // "px" to a few hardcoded values. + if ( type === "number" && !isCustomProp ) { + value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); + } + + // background-* props affect original clone's values + if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { + style[ name ] = "inherit"; + } + + // If a hook was provided, use that value, otherwise just set the specified value + if ( !hooks || !( "set" in hooks ) || + ( value = hooks.set( elem, value, extra ) ) !== undefined ) { + + if ( isCustomProp ) { + style.setProperty( name, value ); + } else { + style[ name ] = value; + } + } + + } else { + + // If a hook was provided get the non-computed value from there + if ( hooks && "get" in hooks && + ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { + + return ret; + } + + // Otherwise just get the value from the style object + return style[ name ]; + } + }, + + css: function( elem, name, extra, styles ) { + var val, num, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ); + + // Make sure that we're working with the right name. 
We don't + // want to modify the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Try prefixed name followed by the unprefixed name + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // If a hook was provided get the computed value from there + if ( hooks && "get" in hooks ) { + val = hooks.get( elem, true, extra ); + } + + // Otherwise, if a way to get the computed value exists, use that + if ( val === undefined ) { + val = curCSS( elem, name, styles ); + } + + // Convert "normal" to computed value + if ( val === "normal" && name in cssNormalTransform ) { + val = cssNormalTransform[ name ]; + } + + // Make numeric if forced or a qualifier was provided and val looks numeric + if ( extra === "" || extra ) { + num = parseFloat( val ); + return extra === true || isFinite( num ) ? num || 0 : val; + } + + return val; + } +} ); + +jQuery.each( [ "height", "width" ], function( _i, dimension ) { + jQuery.cssHooks[ dimension ] = { + get: function( elem, computed, extra ) { + if ( computed ) { + + // Certain elements can have dimension info if we invisibly show them + // but it must have a current display style that would benefit + return rdisplayswap.test( jQuery.css( elem, "display" ) ) && + + // Support: Safari 8+ + // Table columns in Safari have non-zero offsetWidth & zero + // getBoundingClientRect().width unless display is changed. + // Support: IE <=11 only + // Running getBoundingClientRect on a disconnected node + // in IE throws an error. + ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? + swap( elem, cssShow, function() { + return getWidthOrHeight( elem, dimension, extra ); + } ) : + getWidthOrHeight( elem, dimension, extra ); + } + }, + + set: function( elem, value, extra ) { + var matches, + styles = getStyles( elem ), + + // Only read styles.position if the test has a chance to fail + // to avoid forcing a reflow. 
+ scrollboxSizeBuggy = !support.scrollboxSize() && + styles.position === "absolute", + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) + boxSizingNeeded = scrollboxSizeBuggy || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + subtract = extra ? + boxModelAdjustment( + elem, + dimension, + extra, + isBorderBox, + styles + ) : + 0; + + // Account for unreliable border-box dimensions by comparing offset* to computed and + // faking a content-box to get border and padding (gh-3699) + if ( isBorderBox && scrollboxSizeBuggy ) { + subtract -= Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + parseFloat( styles[ dimension ] ) - + boxModelAdjustment( elem, dimension, "border", false, styles ) - + 0.5 + ); + } + + // Convert to pixels if value adjustment is needed + if ( subtract && ( matches = rcssNum.exec( value ) ) && + ( matches[ 3 ] || "px" ) !== "px" ) { + + elem.style[ dimension ] = value; + value = jQuery.css( elem, dimension ); + } + + return setPositiveNumber( elem, value, subtract ); + } + }; +} ); + +jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, + function( elem, computed ) { + if ( computed ) { + return ( parseFloat( curCSS( elem, "marginLeft" ) ) || + elem.getBoundingClientRect().left - + swap( elem, { marginLeft: 0 }, function() { + return elem.getBoundingClientRect().left; + } ) + ) + "px"; + } + } +); + +// These hooks are used by animate to expand properties +jQuery.each( { + margin: "", + padding: "", + border: "Width" +}, function( prefix, suffix ) { + jQuery.cssHooks[ prefix + suffix ] = { + expand: function( value ) { + var i = 0, + expanded = {}, + + // Assumes a single number if not a string + parts = typeof value === "string" ? 
value.split( " " ) : [ value ]; + + for ( ; i < 4; i++ ) { + expanded[ prefix + cssExpand[ i ] + suffix ] = + parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; + } + + return expanded; + } + }; + + if ( prefix !== "margin" ) { + jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; + } +} ); + +jQuery.fn.extend( { + css: function( name, value ) { + return access( this, function( elem, name, value ) { + var styles, len, + map = {}, + i = 0; + + if ( Array.isArray( name ) ) { + styles = getStyles( elem ); + len = name.length; + + for ( ; i < len; i++ ) { + map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); + } + + return map; + } + + return value !== undefined ? + jQuery.style( elem, name, value ) : + jQuery.css( elem, name ); + }, name, value, arguments.length > 1 ); + } +} ); + + +function Tween( elem, options, prop, end, easing ) { + return new Tween.prototype.init( elem, options, prop, end, easing ); +} +jQuery.Tween = Tween; + +Tween.prototype = { + constructor: Tween, + init: function( elem, options, prop, end, easing, unit ) { + this.elem = elem; + this.prop = prop; + this.easing = easing || jQuery.easing._default; + this.options = options; + this.start = this.now = this.cur(); + this.end = end; + this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" ); + }, + cur: function() { + var hooks = Tween.propHooks[ this.prop ]; + + return hooks && hooks.get ? 
+ hooks.get( this ) : + Tween.propHooks._default.get( this ); + }, + run: function( percent ) { + var eased, + hooks = Tween.propHooks[ this.prop ]; + + if ( this.options.duration ) { + this.pos = eased = jQuery.easing[ this.easing ]( + percent, this.options.duration * percent, 0, 1, this.options.duration + ); + } else { + this.pos = eased = percent; + } + this.now = ( this.end - this.start ) * eased + this.start; + + if ( this.options.step ) { + this.options.step.call( this.elem, this.now, this ); + } + + if ( hooks && hooks.set ) { + hooks.set( this ); + } else { + Tween.propHooks._default.set( this ); + } + return this; + } +}; + +Tween.prototype.init.prototype = Tween.prototype; + +Tween.propHooks = { + _default: { + get: function( tween ) { + var result; + + // Use a property on the element directly when it is not a DOM element, + // or when there is no matching style property that exists. + if ( tween.elem.nodeType !== 1 || + tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { + return tween.elem[ tween.prop ]; + } + + // Passing an empty string as a 3rd parameter to .css will automatically + // attempt a parseFloat and fallback to a string if the parse fails. + // Simple values such as "10px" are parsed to Float; + // complex values such as "rotate(1rad)" are returned as-is. + result = jQuery.css( tween.elem, tween.prop, "" ); + + // Empty strings, null, undefined and "auto" are converted to 0. + return !result || result === "auto" ? 0 : result; + }, + set: function( tween ) { + + // Use step hook for back compat. + // Use cssHook if its there. + // Use .style if available and use plain properties where available. 
+ if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && ( + jQuery.cssHooks[ tween.prop ] || + tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = Date.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
1 : 0; + for ( ; i < 4; i += 2 - includeWidth ) { + which = cssExpand[ i ]; + attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; + } + + if ( includeWidth ) { + attrs.opacity = attrs.width = type; + } + + return attrs; +} + +function createTween( value, prop, animation ) { + var tween, + collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), + index = 0, + length = collection.length; + for ( ; index < length; index++ ) { + if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { + + // We're done with this property + return tween; + } + } +} + +function defaultPrefilter( elem, props, opts ) { + var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, + isBox = "width" in props || "height" in props, + anim = this, + orig = {}, + style = elem.style, + hidden = elem.nodeType && isHiddenWithinTree( elem ), + dataShow = dataPriv.get( elem, "fxshow" ); + + // Queue-skipping animations hijack the fx hooks + if ( !opts.queue ) { + hooks = jQuery._queueHooks( elem, "fx" ); + if ( hooks.unqueued == null ) { + hooks.unqueued = 0; + oldfire = hooks.empty.fire; + hooks.empty.fire = function() { + if ( !hooks.unqueued ) { + oldfire(); + } + }; + } + hooks.unqueued++; + + anim.always( function() { + + // Ensure the complete handler is called before this completes + anim.always( function() { + hooks.unqueued--; + if ( !jQuery.queue( elem, "fx" ).length ) { + hooks.empty.fire(); + } + } ); + } ); + } + + // Detect show/hide animations + for ( prop in props ) { + value = props[ prop ]; + if ( rfxtypes.test( value ) ) { + delete props[ prop ]; + toggle = toggle || value === "toggle"; + if ( value === ( hidden ? 
"hide" : "show" ) ) { + + // Pretend to be hidden if this is a "show" and + // there is still data from a stopped show/hide + if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { + hidden = true; + + // Ignore all other no-op show/hide data + } else { + continue; + } + } + orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); + } + } + + // Bail out if this is a no-op like .hide().hide() + propTween = !jQuery.isEmptyObject( props ); + if ( !propTween && jQuery.isEmptyObject( orig ) ) { + return; + } + + // Restrict "overflow" and "display" styles during box animations + if ( isBox && elem.nodeType === 1 ) { + + // Support: IE <=9 - 11, Edge 12 - 15 + // Record all 3 overflow attributes because IE does not infer the shorthand + // from identically-valued overflowX and overflowY and Edge just mirrors + // the overflowX value there. + opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; + + // Identify a display type, preferring old show/hide data over the CSS cascade + restoreDisplay = dataShow && dataShow.display; + if ( restoreDisplay == null ) { + restoreDisplay = dataPriv.get( elem, "display" ); + } + display = jQuery.css( elem, "display" ); + if ( display === "none" ) { + if ( restoreDisplay ) { + display = restoreDisplay; + } else { + + // Get nonempty value(s) by temporarily forcing visibility + showHide( [ elem ], true ); + restoreDisplay = elem.style.display || restoreDisplay; + display = jQuery.css( elem, "display" ); + showHide( [ elem ] ); + } + } + + // Animate inline elements as inline-block + if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { + if ( jQuery.css( elem, "float" ) === "none" ) { + + // Restore the original display value at the end of pure show/hide animations + if ( !propTween ) { + anim.done( function() { + style.display = restoreDisplay; + } ); + if ( restoreDisplay == null ) { + display = style.display; + restoreDisplay = display === "none" ? 
"" : display; + } + } + style.display = "inline-block"; + } + } + } + + if ( opts.overflow ) { + style.overflow = "hidden"; + anim.always( function() { + style.overflow = opts.overflow[ 0 ]; + style.overflowX = opts.overflow[ 1 ]; + style.overflowY = opts.overflow[ 2 ]; + } ); + } + + // Implement show/hide animations + propTween = false; + for ( prop in orig ) { + + // General show/hide setup for this element animation + if ( !propTween ) { + if ( dataShow ) { + if ( "hidden" in dataShow ) { + hidden = dataShow.hidden; + } + } else { + dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); + } + + // Store hidden/visible for toggle so `.stop().toggle()` "reverses" + if ( toggle ) { + dataShow.hidden = !hidden; + } + + // Show elements before animating them + if ( hidden ) { + showHide( [ elem ], true ); + } + + /* eslint-disable no-loop-func */ + + anim.done( function() { + + /* eslint-enable no-loop-func */ + + // The final step of a "hide" animation is actually hiding the element + if ( !hidden ) { + showHide( [ elem ] ); + } + dataPriv.remove( elem, "fxshow" ); + for ( prop in orig ) { + jQuery.style( elem, prop, orig[ prop ] ); + } + } ); + } + + // Per-property setup + propTween = createTween( hidden ? 
dataShow[ prop ] : 0, prop, anim ); + if ( !( prop in dataShow ) ) { + dataShow[ prop ] = propTween.start; + if ( hidden ) { + propTween.end = propTween.start; + propTween.start = 0; + } + } + } +} + +function propFilter( props, specialEasing ) { + var index, name, easing, value, hooks; + + // camelCase, specialEasing and expand cssHook pass + for ( index in props ) { + name = camelCase( index ); + easing = specialEasing[ name ]; + value = props[ index ]; + if ( Array.isArray( value ) ) { + easing = value[ 1 ]; + value = props[ index ] = value[ 0 ]; + } + + if ( index !== name ) { + props[ name ] = value; + delete props[ index ]; + } + + hooks = jQuery.cssHooks[ name ]; + if ( hooks && "expand" in hooks ) { + value = hooks.expand( value ); + delete props[ name ]; + + // Not quite $.extend, this won't overwrite existing keys. + // Reusing 'index' because we have the correct "name" + for ( index in value ) { + if ( !( index in props ) ) { + props[ index ] = value[ index ]; + specialEasing[ index ] = easing; + } + } + } else { + specialEasing[ name ] = easing; + } + } +} + +function Animation( elem, properties, options ) { + var result, + stopped, + index = 0, + length = Animation.prefilters.length, + deferred = jQuery.Deferred().always( function() { + + // Don't match elem in the :animated selector + delete tick.elem; + } ), + tick = function() { + if ( stopped ) { + return false; + } + var currentTime = fxNow || createFxNow(), + remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), + + // Support: Android 2.3 only + // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) + temp = remaining / animation.duration || 0, + percent = 1 - temp, + index = 0, + length = animation.tweens.length; + + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( percent ); + } + + deferred.notifyWith( elem, [ animation, percent, remaining ] ); + + // If there's more to do, yield + if ( percent < 1 && length ) { + return 
remaining; + } + + // If this was an empty animation, synthesize a final progress notification + if ( !length ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + } + + // Resolve the animation and report its conclusion + deferred.resolveWith( elem, [ animation ] ); + return false; + }, + animation = deferred.promise( { + elem: elem, + props: jQuery.extend( {}, properties ), + opts: jQuery.extend( true, { + specialEasing: {}, + easing: jQuery.easing._default + }, options ), + originalProperties: properties, + originalOptions: options, + startTime: fxNow || createFxNow(), + duration: options.duration, + tweens: [], + createTween: function( prop, end ) { + var tween = jQuery.Tween( elem, animation.opts, prop, end, + animation.opts.specialEasing[ prop ] || animation.opts.easing ); + animation.tweens.push( tween ); + return tween; + }, + stop: function( gotoEnd ) { + var index = 0, + + // If we are going to the end, we want to run all the tweens + // otherwise we skip this part + length = gotoEnd ? 
animation.tweens.length : 0; + if ( stopped ) { + return this; + } + stopped = true; + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( 1 ); + } + + // Resolve when we played the last frame; otherwise, reject + if ( gotoEnd ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + deferred.resolveWith( elem, [ animation, gotoEnd ] ); + } else { + deferred.rejectWith( elem, [ animation, gotoEnd ] ); + } + return this; + } + } ), + props = animation.props; + + propFilter( props, animation.opts.specialEasing ); + + for ( ; index < length; index++ ) { + result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); + if ( result ) { + if ( isFunction( result.stop ) ) { + jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = + result.stop.bind( result ); + } + return result; + } + } + + jQuery.map( props, createTween, animation ); + + if ( isFunction( animation.opts.start ) ) { + animation.opts.start.call( elem, animation ); + } + + // Attach callbacks from options + animation + .progress( animation.opts.progress ) + .done( animation.opts.done, animation.opts.complete ) + .fail( animation.opts.fail ) + .always( animation.opts.always ); + + jQuery.fx.timer( + jQuery.extend( tick, { + elem: elem, + anim: animation, + queue: animation.opts.queue + } ) + ); + + return animation; +} + +jQuery.Animation = jQuery.extend( Animation, { + + tweeners: { + "*": [ function( prop, value ) { + var tween = this.createTween( prop, value ); + adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); + return tween; + } ] + }, + + tweener: function( props, callback ) { + if ( isFunction( props ) ) { + callback = props; + props = [ "*" ]; + } else { + props = props.match( rnothtmlwhite ); + } + + var prop, + index = 0, + length = props.length; + + for ( ; index < length; index++ ) { + prop = props[ index ]; + Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; + Animation.tweeners[ prop ].unshift( callback ); + } + }, + 
+ prefilters: [ defaultPrefilter ], + + prefilter: function( callback, prepend ) { + if ( prepend ) { + Animation.prefilters.unshift( callback ); + } else { + Animation.prefilters.push( callback ); + } + } +} ); + +jQuery.speed = function( speed, easing, fn ) { + var opt = speed && typeof speed === "object" ? jQuery.extend( {}, speed ) : { + complete: fn || !fn && easing || + isFunction( speed ) && speed, + duration: speed, + easing: fn && easing || easing && !isFunction( easing ) && easing + }; + + // Go to the end state if fx are off + if ( jQuery.fx.off ) { + opt.duration = 0; + + } else { + if ( typeof opt.duration !== "number" ) { + if ( opt.duration in jQuery.fx.speeds ) { + opt.duration = jQuery.fx.speeds[ opt.duration ]; + + } else { + opt.duration = jQuery.fx.speeds._default; + } + } + } + + // Normalize opt.queue - true/undefined/null -> "fx" + if ( opt.queue == null || opt.queue === true ) { + opt.queue = "fx"; + } + + // Queueing + opt.old = opt.complete; + + opt.complete = function() { + if ( isFunction( opt.old ) ) { + opt.old.call( this ); + } + + if ( opt.queue ) { + jQuery.dequeue( this, opt.queue ); + } + }; + + return opt; +}; + +jQuery.fn.extend( { + fadeTo: function( speed, to, easing, callback ) { + + // Show any hidden elements after setting opacity to 0 + return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() + + // Animate to the value specified + .end().animate( { opacity: to }, speed, easing, callback ); + }, + animate: function( prop, speed, easing, callback ) { + var empty = jQuery.isEmptyObject( prop ), + optall = jQuery.speed( speed, easing, callback ), + doAnimation = function() { + + // Operate on a copy of prop so per-property easing won't be lost + var anim = Animation( this, jQuery.extend( {}, prop ), optall ); + + // Empty animations, or finishing resolves immediately + if ( empty || dataPriv.get( this, "finish" ) ) { + anim.stop( true ); + } + }; + doAnimation.finish = doAnimation; + + return empty || optall.queue 
=== false ? + this.each( doAnimation ) : + this.queue( optall.queue, doAnimation ); + }, + stop: function( type, clearQueue, gotoEnd ) { + var stopQueue = function( hooks ) { + var stop = hooks.stop; + delete hooks.stop; + stop( gotoEnd ); + }; + + if ( typeof type !== "string" ) { + gotoEnd = clearQueue; + clearQueue = type; + type = undefined; + } + if ( clearQueue ) { + this.queue( type || "fx", [] ); + } + + return this.each( function() { + var dequeue = true, + index = type != null && type + "queueHooks", + timers = jQuery.timers, + data = dataPriv.get( this ); + + if ( index ) { + if ( data[ index ] && data[ index ].stop ) { + stopQueue( data[ index ] ); + } + } else { + for ( index in data ) { + if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { + stopQueue( data[ index ] ); + } + } + } + + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && + ( type == null || timers[ index ].queue === type ) ) { + + timers[ index ].anim.stop( gotoEnd ); + dequeue = false; + timers.splice( index, 1 ); + } + } + + // Start the next in the queue if the last step wasn't forced. + // Timers currently will call their complete callbacks, which + // will dequeue but only if they were gotoEnd. + if ( dequeue || !gotoEnd ) { + jQuery.dequeue( this, type ); + } + } ); + }, + finish: function( type ) { + if ( type !== false ) { + type = type || "fx"; + } + return this.each( function() { + var index, + data = dataPriv.get( this ), + queue = data[ type + "queue" ], + hooks = data[ type + "queueHooks" ], + timers = jQuery.timers, + length = queue ? 
queue.length : 0; + + // Enable finishing flag on private data + data.finish = true; + + // Empty the queue first + jQuery.queue( this, type, [] ); + + if ( hooks && hooks.stop ) { + hooks.stop.call( this, true ); + } + + // Look for any active animations, and finish them + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && timers[ index ].queue === type ) { + timers[ index ].anim.stop( true ); + timers.splice( index, 1 ); + } + } + + // Look for any animations in the old queue and finish them + for ( index = 0; index < length; index++ ) { + if ( queue[ index ] && queue[ index ].finish ) { + queue[ index ].finish.call( this ); + } + } + + // Turn off finishing flag + delete data.finish; + } ); + } +} ); + +jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { + var cssFn = jQuery.fn[ name ]; + jQuery.fn[ name ] = function( speed, easing, callback ) { + return speed == null || typeof speed === "boolean" ? + cssFn.apply( this, arguments ) : + this.animate( genFx( name, true ), speed, easing, callback ); + }; +} ); + +// Generate shortcuts for custom animations +jQuery.each( { + slideDown: genFx( "show" ), + slideUp: genFx( "hide" ), + slideToggle: genFx( "toggle" ), + fadeIn: { opacity: "show" }, + fadeOut: { opacity: "hide" }, + fadeToggle: { opacity: "toggle" } +}, function( name, props ) { + jQuery.fn[ name ] = function( speed, easing, callback ) { + return this.animate( props, speed, easing, callback ); + }; +} ); + +jQuery.timers = []; +jQuery.fx.tick = function() { + var timer, + i = 0, + timers = jQuery.timers; + + fxNow = Date.now(); + + for ( ; i < timers.length; i++ ) { + timer = timers[ i ]; + + // Run the timer and safely remove it when done (allowing for external removal) + if ( !timer() && timers[ i ] === timer ) { + timers.splice( i--, 1 ); + } + } + + if ( !timers.length ) { + jQuery.fx.stop(); + } + fxNow = undefined; +}; + +jQuery.fx.timer = function( timer ) { + jQuery.timers.push( timer ); + 
jQuery.fx.start(); +}; + +jQuery.fx.interval = 13; +jQuery.fx.start = function() { + if ( inProgress ) { + return; + } + + inProgress = true; + schedule(); +}; + +jQuery.fx.stop = function() { + inProgress = null; +}; + +jQuery.fx.speeds = { + slow: 600, + fast: 200, + + // Default speed + _default: 400 +}; + + +// Based off of the plugin by Clint Helfers, with permission. +// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ +jQuery.fn.delay = function( time, type ) { + time = jQuery.fx ? jQuery.fx.speeds[ time ] || time : time; + type = type || "fx"; + + return this.queue( type, function( next, hooks ) { + var timeout = window.setTimeout( next, time ); + hooks.stop = function() { + window.clearTimeout( timeout ); + }; + } ); +}; + + +( function() { + var input = document.createElement( "input" ), + select = document.createElement( "select" ), + opt = select.appendChild( document.createElement( "option" ) ); + + input.type = "checkbox"; + + // Support: Android <=4.3 only + // Default value for a checkbox should be "on" + support.checkOn = input.value !== ""; + + // Support: IE <=11 only + // Must access selectedIndex to make default options select + support.optSelected = opt.selected; + + // Support: IE <=11 only + // An input loses its value after becoming a radio + input = document.createElement( "input" ); + input.value = "t"; + input.type = "radio"; + support.radioValue = input.value === "t"; +} )(); + + +var boolHook, + attrHandle = jQuery.expr.attrHandle; + +jQuery.fn.extend( { + attr: function( name, value ) { + return access( this, jQuery.attr, name, value, arguments.length > 1 ); + }, + + removeAttr: function( name ) { + return this.each( function() { + jQuery.removeAttr( this, name ); + } ); + } +} ); + +jQuery.extend( { + attr: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set attributes on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || 
nType === 2 ) { + return; + } + + // Fallback to prop when attributes are not supported + if ( typeof elem.getAttribute === "undefined" ) { + return jQuery.prop( elem, name, value ); + } + + // Attribute hooks are determined by the lowercase version + // Grab necessary hook if one is defined + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + hooks = jQuery.attrHooks[ name.toLowerCase() ] || + ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); + } + + if ( value !== undefined ) { + if ( value === null ) { + jQuery.removeAttr( elem, name ); + return; + } + + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + elem.setAttribute( name, value + "" ); + return value; + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + ret = jQuery.find.attr( elem, name ); + + // Non-existent attributes return null, we normalize to undefined + return ret == null ? undefined : ret; + }, + + attrHooks: { + type: { + set: function( elem, value ) { + if ( !support.radioValue && value === "radio" && + nodeName( elem, "input" ) ) { + var val = elem.value; + elem.setAttribute( "type", value ); + if ( val ) { + elem.value = val; + } + return value; + } + } + } + }, + + removeAttr: function( elem, value ) { + var name, + i = 0, + + // Attribute names can contain non-HTML whitespace characters + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + attrNames = value && value.match( rnothtmlwhite ); + + if ( attrNames && elem.nodeType === 1 ) { + while ( ( name = attrNames[ i++ ] ) ) { + elem.removeAttribute( name ); + } + } + } +} ); + +// Hooks for boolean attributes +boolHook = { + set: function( elem, value, name ) { + if ( value === false ) { + + // Remove boolean attributes when set to false + jQuery.removeAttr( elem, name ); + } else { + elem.setAttribute( name, name ); + } + return name; + } +}; + +jQuery.each( 
jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { + var getter = attrHandle[ name ] || jQuery.find.attr; + + attrHandle[ name ] = function( elem, name, isXML ) { + var ret, handle, + lowercaseName = name.toLowerCase(); + + if ( !isXML ) { + + // Avoid an infinite loop by temporarily removing this function from the getter + handle = attrHandle[ lowercaseName ]; + attrHandle[ lowercaseName ] = ret; + ret = getter( elem, name, isXML ) != null ? + lowercaseName : + null; + attrHandle[ lowercaseName ] = handle; + } + return ret; + }; +} ); + + + + +var rfocusable = /^(?:input|select|textarea|button)$/i, + rclickable = /^(?:a|area)$/i; + +jQuery.fn.extend( { + prop: function( name, value ) { + return access( this, jQuery.prop, name, value, arguments.length > 1 ); + }, + + removeProp: function( name ) { + return this.each( function() { + delete this[ jQuery.propFix[ name ] || name ]; + } ); + } +} ); + +jQuery.extend( { + prop: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set properties on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + + // Fix name and attach hooks + name = jQuery.propFix[ name ] || name; + hooks = jQuery.propHooks[ name ]; + } + + if ( value !== undefined ) { + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + return ( elem[ name ] = value ); + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + return elem[ name ]; + }, + + propHooks: { + tabIndex: { + get: function( elem ) { + + // Support: IE <=9 - 11 only + // elem.tabIndex doesn't always return the + // correct value when it hasn't been explicitly set + // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ + // Use 
proper attribute retrieval(#12072) + var tabindex = jQuery.find.attr( elem, "tabindex" ); + + if ( tabindex ) { + return parseInt( tabindex, 10 ); + } + + if ( + rfocusable.test( elem.nodeName ) || + rclickable.test( elem.nodeName ) && + elem.href + ) { + return 0; + } + + return -1; + } + } + }, + + propFix: { + "for": "htmlFor", + "class": "className" + } +} ); + +// Support: IE <=11 only +// Accessing the selectedIndex property +// forces the browser to respect setting selected +// on the option +// The getter ensures a default option is selected +// when in an optgroup +// eslint rule "no-unused-expressions" is disabled for this code +// since it considers such accessions noop +if ( !support.optSelected ) { + jQuery.propHooks.selected = { + get: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent && parent.parentNode ) { + parent.parentNode.selectedIndex; + } + return null; + }, + set: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent ) { + parent.selectedIndex; + + if ( parent.parentNode ) { + parent.parentNode.selectedIndex; + } + } + } + }; +} + +jQuery.each( [ + "tabIndex", + "readOnly", + "maxLength", + "cellSpacing", + "cellPadding", + "rowSpan", + "colSpan", + "useMap", + "frameBorder", + "contentEditable" +], function() { + jQuery.propFix[ this.toLowerCase() ] = this; +} ); + + + + + // Strip and collapse whitespace according to HTML spec + // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace + function stripAndCollapse( value ) { + var tokens = value.match( rnothtmlwhite ) || []; + return tokens.join( " " ); + } + + +function getClass( elem ) { + return elem.getAttribute && elem.getAttribute( "class" ) || ""; +} + +function classesToArray( value ) { + if ( Array.isArray( value ) ) { + return value; + } + if ( typeof value === "string" ) { + return value.match( rnothtmlwhite ) || []; + } + return []; +} + 
+jQuery.fn.extend( { + addClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + if ( cur.indexOf( " " + clazz + " " ) < 0 ) { + cur += clazz + " "; + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + removeClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( !arguments.length ) { + return this.attr( "class", "" ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + + // This expression is here for better compressibility (see addClass) + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + + // Remove *all* instances + while ( cur.indexOf( " " + clazz + " " ) > -1 ) { + cur = cur.replace( " " + clazz + " ", " " ); + } + } + + // Only assign if different to avoid unneeded rendering. 
+ finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + toggleClass: function( value, stateVal ) { + var type = typeof value, + isValidValue = type === "string" || Array.isArray( value ); + + if ( typeof stateVal === "boolean" && isValidValue ) { + return stateVal ? this.addClass( value ) : this.removeClass( value ); + } + + if ( isFunction( value ) ) { + return this.each( function( i ) { + jQuery( this ).toggleClass( + value.call( this, i, getClass( this ), stateVal ), + stateVal + ); + } ); + } + + return this.each( function() { + var className, i, self, classNames; + + if ( isValidValue ) { + + // Toggle individual class names + i = 0; + self = jQuery( this ); + classNames = classesToArray( value ); + + while ( ( className = classNames[ i++ ] ) ) { + + // Check each className given, space separated list + if ( self.hasClass( className ) ) { + self.removeClass( className ); + } else { + self.addClass( className ); + } + } + + // Toggle whole class name + } else if ( value === undefined || type === "boolean" ) { + className = getClass( this ); + if ( className ) { + + // Store className if set + dataPriv.set( this, "__className__", className ); + } + + // If the element has a class name or if we're passed `false`, + // then remove the whole classname (if there was one, the above saved it). + // Otherwise bring back whatever was previously saved (if anything), + // falling back to the empty string if nothing was stored. + if ( this.setAttribute ) { + this.setAttribute( "class", + className || value === false ? 
+ "" : + dataPriv.get( this, "__className__" ) || "" + ); + } + } + } ); + }, + + hasClass: function( selector ) { + var className, elem, + i = 0; + + className = " " + selector + " "; + while ( ( elem = this[ i++ ] ) ) { + if ( elem.nodeType === 1 && + ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { + return true; + } + } + + return false; + } +} ); + + + + +var rreturn = /\r/g; + +jQuery.fn.extend( { + val: function( value ) { + var hooks, ret, valueIsFunction, + elem = this[ 0 ]; + + if ( !arguments.length ) { + if ( elem ) { + hooks = jQuery.valHooks[ elem.type ] || + jQuery.valHooks[ elem.nodeName.toLowerCase() ]; + + if ( hooks && + "get" in hooks && + ( ret = hooks.get( elem, "value" ) ) !== undefined + ) { + return ret; + } + + ret = elem.value; + + // Handle most common string cases + if ( typeof ret === "string" ) { + return ret.replace( rreturn, "" ); + } + + // Handle cases where value is null/undef or number + return ret == null ? "" : ret; + } + + return; + } + + valueIsFunction = isFunction( value ); + + return this.each( function( i ) { + var val; + + if ( this.nodeType !== 1 ) { + return; + } + + if ( valueIsFunction ) { + val = value.call( this, i, jQuery( this ).val() ); + } else { + val = value; + } + + // Treat null/undefined as ""; convert numbers to string + if ( val == null ) { + val = ""; + + } else if ( typeof val === "number" ) { + val += ""; + + } else if ( Array.isArray( val ) ) { + val = jQuery.map( val, function( value ) { + return value == null ? "" : value + ""; + } ); + } + + hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; + + // If set returns undefined, fall back to normal setting + if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { + this.value = val; + } + } ); + } +} ); + +jQuery.extend( { + valHooks: { + option: { + get: function( elem ) { + + var val = jQuery.find.attr( elem, "value" ); + return val != null ? 
+ val : + + // Support: IE <=10 - 11 only + // option.text throws exceptions (#14686, #14858) + // Strip and collapse whitespace + // https://html.spec.whatwg.org/#strip-and-collapse-whitespace + stripAndCollapse( jQuery.text( elem ) ); + } + }, + select: { + get: function( elem ) { + var value, option, i, + options = elem.options, + index = elem.selectedIndex, + one = elem.type === "select-one", + values = one ? null : [], + max = one ? index + 1 : options.length; + + if ( index < 0 ) { + i = max; + + } else { + i = one ? index : 0; + } + + // Loop through all the selected options + for ( ; i < max; i++ ) { + option = options[ i ]; + + // Support: IE <=9 only + // IE8-9 doesn't update selected after form reset (#2551) + if ( ( option.selected || i === index ) && + + // Don't return options that are disabled or in a disabled optgroup + !option.disabled && + ( !option.parentNode.disabled || + !nodeName( option.parentNode, "optgroup" ) ) ) { + + // Get the specific value for the option + value = jQuery( option ).val(); + + // We don't need an array for one selects + if ( one ) { + return value; + } + + // Multi-Selects return an array + values.push( value ); + } + } + + return values; + }, + + set: function( elem, value ) { + var optionSet, option, + options = elem.options, + values = jQuery.makeArray( value ), + i = options.length; + + while ( i-- ) { + option = options[ i ]; + + /* eslint-disable no-cond-assign */ + + if ( option.selected = + jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 + ) { + optionSet = true; + } + + /* eslint-enable no-cond-assign */ + } + + // Force browsers to behave consistently when non-matching value is set + if ( !optionSet ) { + elem.selectedIndex = -1; + } + return values; + } + } + } +} ); + +// Radios and checkboxes getter/setter +jQuery.each( [ "radio", "checkbox" ], function() { + jQuery.valHooks[ this ] = { + set: function( elem, value ) { + if ( Array.isArray( value ) ) { + return ( elem.checked = 
jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); + } + } + }; + if ( !support.checkOn ) { + jQuery.valHooks[ this ].get = function( elem ) { + return elem.getAttribute( "value" ) === null ? "on" : elem.value; + }; + } +} ); + + + + +// Return jQuery for attributes-only inclusion + + +support.focusin = "onfocusin" in window; + + +var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, + stopPropagationCallback = function( e ) { + e.stopPropagation(); + }; + +jQuery.extend( jQuery.event, { + + trigger: function( event, data, elem, onlyHandlers ) { + + var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, + eventPath = [ elem || document ], + type = hasOwn.call( event, "type" ) ? event.type : event, + namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; + + cur = lastElement = tmp = elem = elem || document; + + // Don't do events on text and comment nodes + if ( elem.nodeType === 3 || elem.nodeType === 8 ) { + return; + } + + // focus/blur morphs to focusin/out; ensure we're not firing them right now + if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { + return; + } + + if ( type.indexOf( "." ) > -1 ) { + + // Namespaced trigger; create a regexp to match event type in handle() + namespaces = type.split( "." ); + type = namespaces.shift(); + namespaces.sort(); + } + ontype = type.indexOf( ":" ) < 0 && "on" + type; + + // Caller can pass in a jQuery.Event object, Object, or just an event type string + event = event[ jQuery.expando ] ? + event : + new jQuery.Event( type, typeof event === "object" && event ); + + // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) + event.isTrigger = onlyHandlers ? 2 : 3; + event.namespace = namespaces.join( "." ); + event.rnamespace = event.namespace ? 
+ new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : + null; + + // Clean up the event in case it is being reused + event.result = undefined; + if ( !event.target ) { + event.target = elem; + } + + // Clone any incoming data and prepend the event, creating the handler arg list + data = data == null ? + [ event ] : + jQuery.makeArray( data, [ event ] ); + + // Allow special events to draw outside the lines + special = jQuery.event.special[ type ] || {}; + if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { + return; + } + + // Determine event propagation path in advance, per W3C events spec (#9951) + // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) + if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { + + bubbleType = special.delegateType || type; + if ( !rfocusMorph.test( bubbleType + type ) ) { + cur = cur.parentNode; + } + for ( ; cur; cur = cur.parentNode ) { + eventPath.push( cur ); + tmp = cur; + } + + // Only add window if we got to document (e.g., not plain obj or detached DOM) + if ( tmp === ( elem.ownerDocument || document ) ) { + eventPath.push( tmp.defaultView || tmp.parentWindow || window ); + } + } + + // Fire handlers on the event path + i = 0; + while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { + lastElement = cur; + event.type = i > 1 ? 
+ bubbleType : + special.bindType || type; + + // jQuery handler + handle = ( + dataPriv.get( cur, "events" ) || Object.create( null ) + )[ event.type ] && + dataPriv.get( cur, "handle" ); + if ( handle ) { + handle.apply( cur, data ); + } + + // Native handler + handle = ontype && cur[ ontype ]; + if ( handle && handle.apply && acceptData( cur ) ) { + event.result = handle.apply( cur, data ); + if ( event.result === false ) { + event.preventDefault(); + } + } + } + event.type = type; + + // If nobody prevented the default action, do it now + if ( !onlyHandlers && !event.isDefaultPrevented() ) { + + if ( ( !special._default || + special._default.apply( eventPath.pop(), data ) === false ) && + acceptData( elem ) ) { + + // Call a native DOM method on the target with the same name as the event. + // Don't do default actions on window, that's where global variables be (#6170) + if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { + + // Don't re-trigger an onFOO event when we call its FOO() method + tmp = elem[ ontype ]; + + if ( tmp ) { + elem[ ontype ] = null; + } + + // Prevent re-triggering of the same event, since we already bubbled it above + jQuery.event.triggered = type; + + if ( event.isPropagationStopped() ) { + lastElement.addEventListener( type, stopPropagationCallback ); + } + + elem[ type ](); + + if ( event.isPropagationStopped() ) { + lastElement.removeEventListener( type, stopPropagationCallback ); + } + + jQuery.event.triggered = undefined; + + if ( tmp ) { + elem[ ontype ] = tmp; + } + } + } + } + + return event.result; + }, + + // Piggyback on a donor event to simulate a different one + // Used only for `focus(in | out)` events + simulate: function( type, elem, event ) { + var e = jQuery.extend( + new jQuery.Event(), + event, + { + type: type, + isSimulated: true + } + ); + + jQuery.event.trigger( e, null, elem ); + } + +} ); + +jQuery.fn.extend( { + + trigger: function( type, data ) { + return this.each( function() { + 
jQuery.event.trigger( type, data, this ); + } ); + }, + triggerHandler: function( type, data ) { + var elem = this[ 0 ]; + if ( elem ) { + return jQuery.event.trigger( type, data, elem, true ); + } + } +} ); + + +// Support: Firefox <=44 +// Firefox doesn't have focus(in | out) events +// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 +// +// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 +// focus(in | out) events fire after focus & blur events, +// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order +// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 +if ( !support.focusin ) { + jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { + + // Attach a single capturing handler on the document while someone wants focusin/focusout + var handler = function( event ) { + jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); + }; + + jQuery.event.special[ fix ] = { + setup: function() { + + // Handle: regular nodes (via `this.ownerDocument`), window + // (via `this.document`) & document (via `this`). 
+ var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ); + + if ( !attaches ) { + doc.addEventListener( orig, handler, true ); + } + dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); + }, + teardown: function() { + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ) - 1; + + if ( !attaches ) { + doc.removeEventListener( orig, handler, true ); + dataPriv.remove( doc, fix ); + + } else { + dataPriv.access( doc, fix, attaches ); + } + } + }; + } ); +} +var location = window.location; + +var nonce = { guid: Date.now() }; + +var rquery = ( /\?/ ); + + + +// Cross-browser xml parsing +jQuery.parseXML = function( data ) { + var xml; + if ( !data || typeof data !== "string" ) { + return null; + } + + // Support: IE 9 - 11 only + // IE throws on parseFromString with invalid input. + try { + xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); + } catch ( e ) { + xml = undefined; + } + + if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { + jQuery.error( "Invalid XML: " + data ); + } + return xml; +}; + + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && toType( obj ) === "object" ) { + + // Serialize object item. 
+ for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. + add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = isFunction( valueOrFunction ) ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + if ( a == null ) { + return ""; + } + + // If an array was passed in, assume that it is an array of form elements. + if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? 
jQuery.makeArray( elements ) : this; + } ) + .filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ) + .map( function( _i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + + +var + r20 = /%20/g, + rhash = /#.*$/, + rantiCache = /([?&])_=[^&]*/, + rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, + + // #7653, #8125, #8152: local protocol detection + rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, + rnoContent = /^(?:GET|HEAD)$/, + rprotocol = /^\/\//, + + /* Prefilters + * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) + * 2) These are called: + * - BEFORE asking for a transport + * - AFTER param serialization (s.data is a string if s.processData is true) + * 3) key is the dataType + * 4) the catchall symbol "*" can be used + * 5) execution will start with transport dataType and THEN continue down to "*" if needed + */ + prefilters = {}, + + /* Transports bindings + * 1) key is the dataType + * 2) the catchall symbol "*" can be used + * 3) selection will start with transport dataType and THEN go to "*" if needed + */ + transports = {}, + + // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression + allTypes = "*/".concat( "*" ), + + // Anchor tag for parsing the document origin + originAnchor = document.createElement( "a" ); + originAnchor.href = location.href; + +// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport +function 
addToPrefiltersOrTransports( structure ) { + + // dataTypeExpression is optional and defaults to "*" + return function( dataTypeExpression, func ) { + + if ( typeof dataTypeExpression !== "string" ) { + func = dataTypeExpression; + dataTypeExpression = "*"; + } + + var dataType, + i = 0, + dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; + + if ( isFunction( func ) ) { + + // For each dataType in the dataTypeExpression + while ( ( dataType = dataTypes[ i++ ] ) ) { + + // Prepend if requested + if ( dataType[ 0 ] === "+" ) { + dataType = dataType.slice( 1 ) || "*"; + ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); + + // Otherwise append + } else { + ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); + } + } + } + }; +} + +// Base inspection function for prefilters and transports +function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { + + var inspected = {}, + seekingTransport = ( structure === transports ); + + function inspect( dataType ) { + var selected; + inspected[ dataType ] = true; + jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { + var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); + if ( typeof dataTypeOrTransport === "string" && + !seekingTransport && !inspected[ dataTypeOrTransport ] ) { + + options.dataTypes.unshift( dataTypeOrTransport ); + inspect( dataTypeOrTransport ); + return false; + } else if ( seekingTransport ) { + return !( selected = dataTypeOrTransport ); + } + } ); + return selected; + } + + return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); +} + +// A special extend for ajax options +// that takes "flat" options (not to be deep extended) +// Fixes #9887 +function ajaxExtend( target, src ) { + var key, deep, + flatOptions = jQuery.ajaxSettings.flatOptions || {}; + + for ( key in src ) { + if ( src[ key ] !== undefined ) { + ( flatOptions[ key ] ? 
target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = 
s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If prev can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] || + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? 
e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? 
+ + // Building a settings object + ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : + + // Extending ajaxSettings + ajaxExtend( jQuery.ajaxSettings, target ); + }, + + ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), + ajaxTransport: addToPrefiltersOrTransports( transports ), + + // Main method + ajax: function( url, options ) { + + // If url is an object, simulate pre-1.5 signature + if ( typeof url === "object" ) { + options = url; + url = undefined; + } + + // Force options to be an object + options = options || {}; + + var transport, + + // URL without anti-cache param + cacheURL, + + // Response headers + responseHeadersString, + responseHeaders, + + // timeout handle + timeoutTimer, + + // Url cleanup var + urlAnchor, + + // Request state (becomes false upon send and true upon completion) + completed, + + // To know if global events are to be dispatched + fireGlobals, + + // Loop variable + i, + + // uncached part of the url + uncached, + + // Create the final options object + s = jQuery.ajaxSetup( {}, options ), + + // Callbacks context + callbackContext = s.context || s, + + // Context for global events is callbackContext if it is a DOM node or jQuery collection + globalEventContext = s.context && + ( callbackContext.nodeType || callbackContext.jquery ) ? 
+ jQuery( callbackContext ) : + jQuery.event, + + // Deferreds + deferred = jQuery.Deferred(), + completeDeferred = jQuery.Callbacks( "once memory" ), + + // Status-dependent callbacks + statusCode = s.statusCode || {}, + + // Headers (they are sent all at once) + requestHeaders = {}, + requestHeadersNames = {}, + + // Default abort message + strAbort = "canceled", + + // Fake xhr + jqXHR = { + readyState: 0, + + // Builds headers hashtable if needed + getResponseHeader: function( key ) { + var match; + if ( completed ) { + if ( !responseHeaders ) { + responseHeaders = {}; + while ( ( match = rheaders.exec( responseHeadersString ) ) ) { + responseHeaders[ match[ 1 ].toLowerCase() + " " ] = + ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) + .concat( match[ 2 ] ); + } + } + match = responseHeaders[ key.toLowerCase() + " " ]; + } + return match == null ? null : match.join( ", " ); + }, + + // Raw string + getAllResponseHeaders: function() { + return completed ? responseHeadersString : null; + }, + + // Caches the header + setRequestHeader: function( name, value ) { + if ( completed == null ) { + name = requestHeadersNames[ name.toLowerCase() ] = + requestHeadersNames[ name.toLowerCase() ] || name; + requestHeaders[ name ] = value; + } + return this; + }, + + // Overrides response content-type header + overrideMimeType: function( type ) { + if ( completed == null ) { + s.mimeType = type; + } + return this; + }, + + // Status-dependent callbacks + statusCode: function( map ) { + var code; + if ( map ) { + if ( completed ) { + + // Execute the appropriate callbacks + jqXHR.always( map[ jqXHR.status ] ); + } else { + + // Lazy-add the new callbacks in a way that preserves old ones + for ( code in map ) { + statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; + } + } + } + return this; + }, + + // Cancel the request + abort: function( statusText ) { + var finalText = statusText || strAbort; + if ( transport ) { + transport.abort( finalText ); + } + done( 
0, finalText ); + return this; + } + }; + + // Attach deferreds + deferred.promise( jqXHR ); + + // Add protocol if not provided (prefilters might expect it) + // Handle falsy url in the settings object (#10093: consistency with old signature) + // We also use the url parameter if available + s.url = ( ( url || s.url || location.href ) + "" ) + .replace( rprotocol, location.protocol + "//" ); + + // Alias method option to type as per ticket #12004 + s.type = options.method || options.type || s.method || s.type; + + // Extract dataTypes list + s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; + + // A cross-domain request is in order when the origin doesn't match the current origin. + if ( s.crossDomain == null ) { + urlAnchor = document.createElement( "a" ); + + // Support: IE <=8 - 11, Edge 12 - 15 + // IE throws exception on accessing the href property if url is malformed, + // e.g. http://example.com:80x/ + try { + urlAnchor.href = s.url; + + // Support: IE <=8 - 11 only + // Anchor's host property isn't correctly set when s.url is relative + urlAnchor.href = urlAnchor.href; + s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== + urlAnchor.protocol + "//" + urlAnchor.host; + } catch ( e ) { + + // If there is an error parsing the URL, assume it is crossDomain, + // it can be rejected by the transport if it is invalid + s.crossDomain = true; + } + } + + // Convert data if not already a string + if ( s.data && s.processData && typeof s.data !== "string" ) { + s.data = jQuery.param( s.data, s.traditional ); + } + + // Apply prefilters + inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); + + // If request was aborted inside a prefilter, stop there + if ( completed ) { + return jqXHR; + } + + // We can fire global events as of now if asked to + // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) + fireGlobals = jQuery.event && s.global; + + // Watch for a new set of 
requests + if ( fireGlobals && jQuery.active++ === 0 ) { + jQuery.event.trigger( "ajaxStart" ); + } + + // Uppercase the type + s.type = s.type.toUpperCase(); + + // Determine if request has content + s.hasContent = !rnoContent.test( s.type ); + + // Save the URL in case we're toying with the If-Modified-Since + // and/or If-None-Match header later on + // Remove hash to simplify url manipulation + cacheURL = s.url.replace( rhash, "" ); + + // More options handling for requests with no content + if ( !s.hasContent ) { + + // Remember the hash so we can put it back + uncached = s.url.slice( cacheURL.length ); + + // If data is available and should be processed, append data to url + if ( s.data && ( s.processData || typeof s.data === "string" ) ) { + cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; + + // #9682: remove data so that it's not used in an eventual retry + delete s.data; + } + + // Add or update anti-cache param if needed + if ( s.cache === false ) { + cacheURL = cacheURL.replace( rantiCache, "$1" ); + uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + + uncached; + } + + // Put hash and anti-cache on the URL that will be requested (gh-1732) + s.url = cacheURL + uncached; + + // Change '%20' to '+' if this is encoded form body content (gh-2658) + } else if ( s.data && s.processData && + ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { + s.data = s.data.replace( r20, "+" ); + } + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + if ( jQuery.lastModified[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); + } + if ( jQuery.etag[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); + } + } + + // Set the correct header, if data is being sent + if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { + jqXHR.setRequestHeader( "Content-Type", s.contentType ); + } + + // Set the Accepts header for the server, depending on the dataType + jqXHR.setRequestHeader( + "Accept", + s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? + s.accepts[ s.dataTypes[ 0 ] ] + + ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) : + s.accepts[ "*" ] + ); + + // Check for headers option + for ( i in s.headers ) { + jqXHR.setRequestHeader( i, s.headers[ i ] ); + } + + // Allow custom headers/mimetypes and early abort + if ( s.beforeSend && + ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { + + // Abort if not done already and return + return jqXHR.abort(); + } + + // Aborting is no longer a cancellation + strAbort = "abort"; + + // Install callbacks on deferreds + completeDeferred.add( s.complete ); + jqXHR.done( s.success ); + jqXHR.fail( s.error ); + + // Get transport + transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); + + // If no transport, we auto-abort + if ( !transport ) { + done( -1, "No Transport" ); + } else { + jqXHR.readyState = 1; + + // Send global event + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); + } + + // If request was aborted inside ajaxSend, stop there + if ( completed ) { + return jqXHR; + } + + // Timeout + if ( s.async && s.timeout > 0 ) { + timeoutTimer = window.setTimeout( function() { + jqXHR.abort( "timeout" ); + }, s.timeout ); + } + + try { + completed = false; + transport.send( requestHeaders, done ); + } catch ( e ) { + + // Rethrow post-completion 
exceptions + if ( completed ) { + throw e; + } + + // Propagate others as results + done( -1, e ); + } + } + + // Callback for when everything is done + function done( status, nativeStatusText, responses, headers ) { + var isSuccess, success, error, response, modified, + statusText = nativeStatusText; + + // Ignore repeat invocations + if ( completed ) { + return; + } + + completed = true; + + // Clear timeout if it exists + if ( timeoutTimer ) { + window.clearTimeout( timeoutTimer ); + } + + // Dereference transport for early garbage collection + // (no matter how long the jqXHR object will be used) + transport = undefined; + + // Cache response headers + responseHeadersString = headers || ""; + + // Set readyState + jqXHR.readyState = status > 0 ? 4 : 0; + + // Determine if successful + isSuccess = status >= 200 && status < 300 || status === 304; + + // Get response data + if ( responses ) { + response = ajaxHandleResponses( s, jqXHR, responses ); + } + + // Use a noop converter for missing script + if ( !isSuccess && jQuery.inArray( "script", s.dataTypes ) > -1 ) { + s.converters[ "text script" ] = function() {}; + } + + // Convert no matter what (that way responseXXX fields are always set) + response = ajaxConvert( s, response, jqXHR, isSuccess ); + + // If successful, handle type chaining + if ( isSuccess ) { + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + modified = jqXHR.getResponseHeader( "Last-Modified" ); + if ( modified ) { + jQuery.lastModified[ cacheURL ] = modified; + } + modified = jqXHR.getResponseHeader( "etag" ); + if ( modified ) { + jQuery.etag[ cacheURL ] = modified; + } + } + + // if no content + if ( status === 204 || s.type === "HEAD" ) { + statusText = "nocontent"; + + // if not modified + } else if ( status === 304 ) { + statusText = "notmodified"; + + // If we have data, let's convert it + } else { + statusText = response.state; + success = response.data; + error = response.error; + isSuccess = !error; + } + } else { + + // Extract error from statusText and normalize for non-aborts + error = statusText; + if ( status || !statusText ) { + statusText = "error"; + if ( status < 0 ) { + status = 0; + } + } + } + + // Set data for the fake xhr object + jqXHR.status = status; + jqXHR.statusText = ( nativeStatusText || statusText ) + ""; + + // Success/Error + if ( isSuccess ) { + deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); + } else { + deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); + } + + // Status-dependent callbacks + jqXHR.statusCode( statusCode ); + statusCode = undefined; + + if ( fireGlobals ) { + globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", + [ jqXHR, s, isSuccess ? 
success : error ] ); + } + + // Complete + completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); + + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); + + // Handle the global AJAX counter + if ( !( --jQuery.active ) ) { + jQuery.event.trigger( "ajaxStop" ); + } + } + } + + return jqXHR; + }, + + getJSON: function( url, data, callback ) { + return jQuery.get( url, data, callback, "json" ); + }, + + getScript: function( url, callback ) { + return jQuery.get( url, undefined, callback, "script" ); + } +} ); + +jQuery.each( [ "get", "post" ], function( _i, method ) { + jQuery[ method ] = function( url, data, callback, type ) { + + // Shift arguments if data argument was omitted + if ( isFunction( data ) ) { + type = type || callback; + callback = data; + data = undefined; + } + + // The url can be an options object (which then must have .url) + return jQuery.ajax( jQuery.extend( { + url: url, + type: method, + dataType: type, + data: data, + success: callback + }, jQuery.isPlainObject( url ) && url ) ); + }; +} ); + +jQuery.ajaxPrefilter( function( s ) { + var i; + for ( i in s.headers ) { + if ( i.toLowerCase() === "content-type" ) { + s.contentType = s.headers[ i ] || ""; + } + } +} ); + + +jQuery._evalUrl = function( url, options, doc ) { + return jQuery.ajax( { + url: url, + + // Make this explicit, since user can override this through ajaxSetup (#11264) + type: "GET", + dataType: "script", + cache: true, + async: false, + global: false, + + // Only evaluate the response if it is successful (gh-4126) + // dataFilter is not invoked for failure responses, so using it instead + // of the default converter is kludgy but it works. 
+ converters: { + "text script": function() {} + }, + dataFilter: function( response ) { + jQuery.globalEval( response, options, doc ); + } + } ); +}; + + +jQuery.fn.extend( { + wrapAll: function( html ) { + var wrap; + + if ( this[ 0 ] ) { + if ( isFunction( html ) ) { + html = html.call( this[ 0 ] ); + } + + // The elements to wrap the target around + wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); + + if ( this[ 0 ].parentNode ) { + wrap.insertBefore( this[ 0 ] ); + } + + wrap.map( function() { + var elem = this; + + while ( elem.firstElementChild ) { + elem = elem.firstElementChild; + } + + return elem; + } ).append( this ); + } + + return this; + }, + + wrapInner: function( html ) { + if ( isFunction( html ) ) { + return this.each( function( i ) { + jQuery( this ).wrapInner( html.call( this, i ) ); + } ); + } + + return this.each( function() { + var self = jQuery( this ), + contents = self.contents(); + + if ( contents.length ) { + contents.wrapAll( html ); + + } else { + self.append( html ); + } + } ); + }, + + wrap: function( html ) { + var htmlIsFunction = isFunction( html ); + + return this.each( function( i ) { + jQuery( this ).wrapAll( htmlIsFunction ? 
html.call( this, i ) : html ); + } ); + }, + + unwrap: function( selector ) { + this.parent( selector ).not( "body" ).each( function() { + jQuery( this ).replaceWith( this.childNodes ); + } ); + return this; + } +} ); + + +jQuery.expr.pseudos.hidden = function( elem ) { + return !jQuery.expr.pseudos.visible( elem ); +}; +jQuery.expr.pseudos.visible = function( elem ) { + return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); +}; + + + + +jQuery.ajaxSettings.xhr = function() { + try { + return new window.XMLHttpRequest(); + } catch ( e ) {} +}; + +var xhrSuccessStatus = { + + // File protocol always yields status code 0, assume 200 + 0: 200, + + // Support: IE <=9 only + // #1450: sometimes IE returns 1223 when it should be 204 + 1223: 204 + }, + xhrSupported = jQuery.ajaxSettings.xhr(); + +support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); +support.ajax = xhrSupported = !!xhrSupported; + +jQuery.ajaxTransport( function( options ) { + var callback, errorCallback; + + // Cross domain only allowed if supported through XMLHttpRequest + if ( support.cors || xhrSupported && !options.crossDomain ) { + return { + send: function( headers, complete ) { + var i, + xhr = options.xhr(); + + xhr.open( + options.type, + options.url, + options.async, + options.username, + options.password + ); + + // Apply custom fields if provided + if ( options.xhrFields ) { + for ( i in options.xhrFields ) { + xhr[ i ] = options.xhrFields[ i ]; + } + } + + // Override mime type if needed + if ( options.mimeType && xhr.overrideMimeType ) { + xhr.overrideMimeType( options.mimeType ); + } + + // X-Requested-With header + // For cross-domain requests, seeing as conditions for a preflight are + // akin to a jigsaw puzzle, we simply never set it to be sure. + // (it can always be set on a per-request basis or even using ajaxSetup) + // For same-domain requests, won't change header if already provided. 
+ if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { + headers[ "X-Requested-With" ] = "XMLHttpRequest"; + } + + // Set headers + for ( i in headers ) { + xhr.setRequestHeader( i, headers[ i ] ); + } + + // Callback + callback = function( type ) { + return function() { + if ( callback ) { + callback = errorCallback = xhr.onload = + xhr.onerror = xhr.onabort = xhr.ontimeout = + xhr.onreadystatechange = null; + + if ( type === "abort" ) { + xhr.abort(); + } else if ( type === "error" ) { + + // Support: IE <=9 only + // On a manual native abort, IE9 throws + // errors on any property access that is not readyState + if ( typeof xhr.status !== "number" ) { + complete( 0, "error" ); + } else { + complete( + + // File: protocol always yields status 0; see #8605, #14207 + xhr.status, + xhr.statusText + ); + } + } else { + complete( + xhrSuccessStatus[ xhr.status ] || xhr.status, + xhr.statusText, + + // Support: IE <=9 only + // IE9 has no XHR2 but throws on binary (trac-11426) + // For XHR2 non-text, let the caller handle it (gh-2498) + ( xhr.responseType || "text" ) !== "text" || + typeof xhr.responseText !== "string" ? 
+ { binary: xhr.response } : + { text: xhr.responseText }, + xhr.getAllResponseHeaders() + ); + } + } + }; + }; + + // Listen to events + xhr.onload = callback(); + errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); + + // Support: IE 9 only + // Use onreadystatechange to replace onabort + // to handle uncaught aborts + if ( xhr.onabort !== undefined ) { + xhr.onabort = errorCallback; + } else { + xhr.onreadystatechange = function() { + + // Check readyState before timeout as it changes + if ( xhr.readyState === 4 ) { + + // Allow onerror to be called first, + // but that will not handle a native abort + // Also, save errorCallback to a variable + // as xhr.onerror cannot be accessed + window.setTimeout( function() { + if ( callback ) { + errorCallback(); + } + } ); + } + }; + } + + // Create the abort callback + callback = callback( "abort" ); + + try { + + // Do send the request (this may raise an exception) + xhr.send( options.hasContent && options.data || null ); + } catch ( e ) { + + // #14683: Only rethrow if this hasn't been notified as an error yet + if ( callback ) { + throw e; + } + } + }, + + abort: function() { + if ( callback ) { + callback(); + } + } + }; + } +} ); + + + + +// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) +jQuery.ajaxPrefilter( function( s ) { + if ( s.crossDomain ) { + s.contents.script = false; + } +} ); + +// Install script dataType +jQuery.ajaxSetup( { + accepts: { + script: "text/javascript, application/javascript, " + + "application/ecmascript, application/x-ecmascript" + }, + contents: { + script: /\b(?:java|ecma)script\b/ + }, + converters: { + "text script": function( text ) { + jQuery.globalEval( text ); + return text; + } + } +} ); + +// Handle cache's special case and crossDomain +jQuery.ajaxPrefilter( "script", function( s ) { + if ( s.cache === undefined ) { + s.cache = false; + } + if ( s.crossDomain ) { + s.type = "GET"; + } +} ); + +// Bind script tag hack 
transport +jQuery.ajaxTransport( "script", function( s ) { + + // This transport only deals with cross domain or forced-by-attrs requests + if ( s.crossDomain || s.scriptAttrs ) { + var script, callback; + return { + send: function( _, complete ) { + script = jQuery( " + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Workspace is not required for FFTs of the following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
In the future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
  • FFT configurations that do not require workspace will continue to do so.

  • +
+
+
+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/index.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/index.html new file mode 100644 index 0000000000000..b3ed34489ccdb --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/index.html @@ -0,0 +1,277 @@ + + + + + + cuFFTDx API Reference — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • cuFFTDx API Reference
  • +
  • +
  • +
+
+
+
+
+ +
+

cuFFTDx API Reference

+

Here you can find a description of the main components of the cuFFTDx library, with usage examples.

+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/methods.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/methods.html new file mode 100644 index 0000000000000..c84574ae26969 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/methods.html @@ -0,0 +1,450 @@ + + + + + + Execution Methods — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Execution Methods

+

These methods are used to run the FFT operation.

+

A code example:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block() );
+
+using complex_type = typename FFT::value_type;
+
+__global__ void kernel(... /* arguments */) {
+
+  // Shared memory pointer
+  extern __shared__ complex_type shared_mem[];
+
+  // Register data
+  complex_type thread_data[FFT::storage_size];
+
+  // Load data into registers (thread_data)
+  // ...
+
+  FFT().execute(thread_data, shared_mem);
+
+  // Store results (thread_data) into global memory
+}
+
+
+
+

Thread Execute Method

+
void FFT().execute<typename T>(T* input)
+
+
+

Runs the FFT operation defined by the FFT descriptor. T can be any type (such as float2 or double2), +as long as its alignment and element size are the same as those of FFT::value_type.

+

This method is available if the descriptor has been constructed using the Thread Operator and +cufftdx::is_complete_fft_execution is true.

+

input array should be in the per-thread local memory (registers). input must fit FFT::storage_size +elements of type FFT::value_type.

+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+
+

Block Execute Method

+
// #1
+void FFT().execute<typename T>(T* input, void* shared_memory, FFT::workspace_type& workspace)
+
+// #2: Version of #1 for FFTs which don't require workspace
+void FFT().execute<typename T>(T* input, void* shared_memory)
+
+// #3: Execute with input data in shared memory
+void FFT().execute<typename T>(T* shared_memory_input, FFT::workspace_type& workspace)
+
+// #4: Version of #3 for FFTs which don't require workspace
+void FFT().execute<typename T>(T* shared_memory_input)
+
+
+

Runs the FFT operation defined by the FFT descriptor. T can be any type (such as float2 or double2), +as long as its alignment and element size are the same as those of FFT::value_type. +Pointers input, shared_memory, shared_memory_input should be aligned to alignof(FFT::value_type).

+

This method is available if the descriptor has been constructed using the Block Operator +and cufftdx::is_complete_fft_execution is true.

+

When FFT::requires_workspace is false, overloads #2 and #4 can be used. Otherwise, user has to use +methods #1 or #3 and pass a reference to a workspace.

+

In methods #1 and #2 input is in local memory (registers), and shared_memory is a pointer to a shared memory of size +FFT::shared_memory_size bytes. The operation is in-place meaning the results are stored in input. input must +fit FFT::storage_size elements of type FFT::value_type.

+

In methods #3 and #4 the input data is passed in shared memory (shared_memory_input). The operation is in-place, meaning +the results are stored back to shared_memory_input. These methods don’t require an additional shared_memory pointer +to be passed, as shared_memory_input will be used for the required communication between threads. Thus, shared_memory_input +must fit all input and output values, and can’t be smaller than FFT::shared_memory_size bytes +(i.e., the required shared memory size in bytes is the maximum of FFT::shared_memory_size, FFT::ffts_per_block * <FFT_input_size_in_bytes>, and +FFT::ffts_per_block * <FFT_output_size_in_bytes>).

+
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

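+
  • elements per thread (ElementsPerThread),
  • number of FFTs calculated per CUDA block (FFTsPerBlock), or
  • block dimension (BlockDim),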

will produce bit-identical results.

+
+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+

Value Format

+

For complex numbers of single and double precision, the first value in a complex number is the real part and the second is +the imaginary part.

+

Processing of half (fp16) precision FFTs in cuFFTDx is implicitly batched, that is, a single computation processes two FFT +batches. cuFFTDx expects that a complex number of half precision has 2 real parts and 2 imaginary parts in that order +(i.e., real_1, real_2, imaginary_1, imaginary_2). Real values of half precision (for R2C and C2R FFTs) follow the same logic and +each should contain two real values. See also FFT::implicit_type_batching trait.

+
+
+

Input/Output Data Format

+

This section describes the input and output data format.

+
+

Data In Registers

+

N-th thread (indexing from 0) participating in the FFT should include the following values of FFT in its input +values: n + FFT::stride * i where i is an index in input. Results are later stored in input following the same rule.

+

See also FFT::stride.

+
+

Example

+

0-th thread of 8-point FFT with FFT::stride equal to 2 should have values 0, 2, 4, and 6 in its input.

+
+
+
+

Data In Shared Memory

+

The input values of the FFT should be stored in shared_memory_input in natural order. Results are stored in shared_memory_input +following the same rule.

+
+
+
+
+

Make Workspace Function

+
template<class FFT>
+auto cufftdx::make_workspace<FFT>(cudaError_t& error)
+
+
+

cufftdx::make_workspace<FFT>(cudaError_t&) is a helper function for creating the workspace required for the block execute(...) method +when FFT::requires_workspace is true. FFT is the type of the FFT descriptor. +If, after calling the function, error is not cudaSuccess, the workspace was not created correctly and is invalid.

+
    +
  • If FFT::requires_workspace trait is false, user doesn’t have to create workspace.

  • +
  • Workspace can be created for FFT with FFT::requires_workspace equal to false: such workspace is an empty workspace with no global memory allocation.

  • +
  • Workspace object is valid only for FFT it was created for.

  • +
  • Workspace object can allocate global memory, however never more than FFT::workspace_size, +and it’s responsible for freeing it.

  • +
  • Workspace can’t be used concurrently since all copies share the same underlying global memory allocation. Using workspace concurrently will result in memory races.

  • +
  • Allocated global memory is freed upon destruction of the last copy of created workspace object.

  • +
  • Workspace object can be implicitly cast to FFT::workspace_type.

  • +
+
+

Note

+
+

Workspace is not required for FFTs of the following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
In the future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
  • FFT configurations that do not require workspace will continue to do so.

  • +
+
+
+
+
+
+

Warning

+

FFT::workspace_type object doesn’t track the lifetime of the underlying memory, and +is only valid within the lifetime of the workspace object it was cast from.

+
+
+

Warning

+

Type returned by cufftdx::make_workspace<FFT>(cudaError_t&) can be different for different FFT descriptions, +and is not the same as FFT::workspace_type. User should use auto when +creating a workspace object. Example:

+
// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    // ...
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+}
+
+// Create workspace
+cudaError_t error = cudaSuccess;
+auto workspace = cufftdx::make_workspace<FFT>(error);
+
+// ...
+
+// Run kernel with FFT
+block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>(data, workspace);
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/operators.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/operators.html new file mode 100644 index 0000000000000..94e4541fd0264 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/operators.html @@ -0,0 +1,572 @@ + + + + + + Operators — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Operators

+

Operators are used to describe the FFT operation to solve, and to configure the execution. They are divided into +Description Operators and Execution Operators.

+
+
+

Description Operators

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Operator

Default value

Description

Size<unsigned int S>

Not set.

Size S of the FFT to calculate.

Direction<fft_direction>

Not set.

Direction of the FFT, either fft_direction::inverse or fft_direction::forward.

Type<fft_type>

fft_type::c2c

Types of input and output data (C2C, R2C, C2R).

Precision<P>

float

Precision P of the floating-point values used to compute the FFT: double, float or __half.

SM<unsigned int CC>

Not set.

Target CUDA architecture for which the FFT function should be generated.

+

Description operators define the FFT operation to be solved. Combined with Execution Operators, they form a +complete FFT descriptor that can be executed on a GPU.

+

Operators are added to construct the FFT descriptor type. For example, for a forward FFT operation consisting of a FFT with 8 double elements per thread:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<double>() + cufftdx::Thread() );
+
+
+
+
For an FFT descriptor to be complete, the following is required:
+
+
+
+

Size Operator

+
cufftdx::Size<unsigned int S>()
+
+
+

Sets the size S of the FFT operation to compute.

+

There is no default size.

+
+
Restrictions:
    +
  • S must be greater than 1.

  • +
+
+
+
+
+

Direction Operator

+
cufftdx::Direction<cufftdx::fft_direction>()
+
+
+

Sets the direction of the FFT, either fft_direction::inverse or fft_direction::forward.

+

There is no default direction.

+

If the FFT is constructed with the Type<R2C> operator, direction is assumed to be forward and +a direction operator is not necessary.

+

If the FFT is constructed with the Type<C2R> operator, direction is assumed to be inverse and +a direction operator is not necessary.

+
+
Restrictions:
    +
  • fft_direction::forward requires Type<C2C> or Type<R2C>.

  • +
  • fft_direction::inverse requires Type<C2C> or Type<C2R>.

  • +
+
+
+
+
+

Type Operator

+
cufftdx::Type<cufftdx::fft_type>()
+
+
+

Sets the type of the FFT to compute, either fft_type::c2c for complex-to-complex; fft_type::r2c for +real-to-complex; or fft_type::c2r for complex-to-real.

+

The default type is fft_type::c2c.

+
+
Restrictions:
    +
  • fft_type::r2c requires fft_direction::forward. If no direction is specified, it is assumed to be fft_direction::forward.

  • +
  • fft_type::c2r requires fft_direction::inverse. If no direction is specified, it is assumed to be fft_direction::inverse.

  • +
  • cuFFTDx performs unnormalized Fast Fourier Transform calculations.

  • +
+
+
+
+
+

Precision Operator

+
cufftdx::Precision<__half>()
+
+cufftdx::Precision<float>()
+
+cufftdx::Precision<double>()
+
+
+

Sets the floating-point precision used to compute the FFT. This is the type of the values used for input and output, as well as the +underlying type of the values used to compute the FFT.

+

The default precision is float.

+
+
+

SM Operator

+
cufftdx::SM<unsigned int CC>()
+
+
+

Sets the target architecture CC for the underlying FFT function to use. Supported architectures are:

+
    +
  • Volta: 700 and 720 (sm_70, sm_72),

  • +
  • Turing: 750 (sm_75), and

  • +
  • Ampere: 800, 860 (sm_80, sm_86).

  • +
+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+
+
+
+

Execution Operators

+ +++++ + + + + + + + + + + + + + + + + +

Operator

Default value

Description

Thread

Not set.

Creates FFT thread execution object.

Block

Not set.

Creates FFT block execution object. See Block Configuration Operators.

+

Execution operators configure how the FFT operation will run on the GPU. Combined with Description Operators, they form a +complete FFT descriptor that can be executed on a GPU.

+

Operators are added to construct the FFT descriptor type. For example, for a forward FFT operation consisting of two FFTs with 128 float elements each, running simultaneously in one CUDA block:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block()
+                    + cufftdx::ElementsPerThread<8>() + cufftdx::FFTsPerBlock<2>() );
+
+
+
+

Thread Operator

+
cufftdx::Thread()
+
+
+

Sets the FFT operation to run in a thread context. The FFT operation will simultaneously run a single, independent FFT (described using Description Operators) per thread.

+

Each thread will compute one FFT of the size defined by the Size Operator.

+
+
Restrictions:
    +
  • Is mutually exclusive with Block operator

  • +
  • Compilation will fail when used with block-only operators: FFTsPerBlock, ElementsPerThread, BlockDim.

  • +
  • With Precision<__half> restricts Size to range \([2, 32]\).

  • +
  • With Precision<float> restricts Size to range \([2, 32]\).

  • +
  • With Precision<double> restricts Size to range \([2, 16]\).

  • +
+
+
+
+
+

Block Operator

+
cufftdx::Block()
+
+
+

Generates a collective FFT operation to run in a single CUDA block. One or more threads will cooperate to compute the +collective FFT operation.

+

The number of FFTs to compute, as well as the number of threads used to calculate each FFT, can be configured using +Block Configuration Operators.

+
+
Restrictions:
    +
  • Is mutually exclusive with Thread operator

  • +
  • Unless a BlockDim Operator is used, the collective FFT operation can only be executed +inside a 2D block of sizes:

    +
    +
      +
    • blockDim.x = size_of<Description>::value/Description::elements_per_thread.

    • +
    • blockDim.y = Description::ffts_per_block.

    • +
    • blockDim.z = 1.

    • +
    +
    +
  • +
  • BlockDim Operator is not implemented yet.

  • +
  • Operator cufftdx::Precision<__half>() restricts cufftdx::Size<U>() to range \([2, 32768]\).

  • +
  • Operator cufftdx::Precision<float>() restricts cufftdx::Size<U>() to range \([2, 32768]\).

  • +
  • Operator cufftdx::Precision<double>() restricts cufftdx::Size<U>() to range \([2, 16384]\).

  • +
+
+
+
+
+

Block Configuration Operators

+ +++++ + + + + + + + + + + + + + + + + + + + + +

Operators

Default value

Description

FFTsPerBlock<unsigned int F>

1

Number F of FFTs calculated per CUDA block.

ElementsPerThread<unsigned int E>

Heuristic.

Number E of FFT values per CUDA thread.

BlockDim<unsigned int X, Y, Z>

Not set.

Required for executing block FFT within block +with custom dimensions.

+

Block-configuration operators allow the user to tune how the collective FFT operation will run on a single CUDA block.

+
+

Note

+

Block configuration operators can only be used with Block Operator.

+
+
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

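+
  • elements per thread (ElementsPerThread),
  • number of FFTs calculated per CUDA block (FFTsPerBlock), or
  • block dimension (BlockDim),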

will produce bit-identical results.

+
+
+

FFTs Per Block Operator

+
cufftdx::FFTsPerBlock<unsigned int>()
+
+
+

Sets the number of FFTs to compute in parallel within a single CUDA block. Each FFT is computed concurrently by a +separate group of threads.

+

The default is one FFT per block.

+
+
+

Elements Per Thread Operator

+
cufftdx::ElementsPerThread<unsigned int>()
+
+
+

Sets the number of FFT elements to be computed by each thread.

+

The default is determined heuristically to target performance.

+

Restrictions:

+
    +
  • If FFT::requires_workspace is false, it must be a divisor of the requested FFT size.

  • +
  • If FFT::requires_workspace is true, it must be a power of two smaller than the size of the FFT.

  • +
  • Must be in range \([2, 32]\) for cufftdx::Precision<float>() and cufftdx::Precision<__half>().

  • +
  • Must be in range \([2, 16]\) for cufftdx::Precision<double>().

  • +
+
+
+

BlockDim Operator

+
struct cufftdx::BlockDim<unsigned int X, unsigned int Y, unsigned int Z>()
+
+
+

Sets the CUDA block size to (X, Y, Z), to configure the execution.

+

Using this operator, the user can run the collective FFT operation with 2D or 3D CUDA blocks.

+

Default BlockDim size:

+
    +
  • blockDim.x = size_of<Description>::value/Description::elements_per_thread.

  • +
  • blockDim.y = Description::ffts_per_block.

  • +
  • blockDim.z = 1.

  • +
+

See FFT::block_dim.

+
+

Note

+

BlockDim operator is not implemented yet.

+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/traits.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/traits.html new file mode 100644 index 0000000000000..3ce82fbecb1d5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/api/traits.html @@ -0,0 +1,776 @@ + + + + + + Traits — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Traits

+

Traits provide the user with information about the FFT description constructed using Operators. They are divided into +Description Traits and Execution Traits.

+
+
+

Description Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

size_of<Description>::value

None.

Size of the FFT to compute.

type_of<Description>::value

fft_type::c2c

Type of the FFT operation, either fft_type::c2c, fft_type::r2c or fft_type::c2r.

direction_of<Description>::value

See Direction Trait.

Direction of the FFT operation, either fft_direction::inverse or fft_direction::forward.

precision_of<Description>

float

Type of the underlying floating-point values used to compute the FFT: double, float or __half.

is_fft<Description>

None.

true if Description is an FFT description, formed with Description Operators.

is_fft_execution<Description>

None.

true if Description is an FFT description, configured with Execution Operators.

is_complete_fft<Description>

None.

true if Description is a valid FFT description, formed with Description Operators.

is_complete_fft_execution<Description>

None.

true if is_complete_fft<Description> is true and is_fft_execution<Description> is true.

+

Description traits can be retrieved from an FFT descriptor using the helper functions provided. For example:

+
#include <iostream>
+#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                      + cufftdx::Direction<fft_direction::forward>()
+                      + cufftdx::Precision<double>() + cufftdx::Thread() );
+
+if(cufftdx::is_complete_fft<FFT>::value)
+  std::cout << "Size of the FFT operation: " << cufftdx::size_of<FFT>::value << std::endl;
+
+
+
+

Size Trait

+
cufftdx::size_of<FFT>::value
+
+
+

Size of the FFT to compute, as set by Size Operator.

+

There is no default size. If the descriptor was not created using a Size Operator, compilation will fail with an error message.

+
+
+

Type Trait

+
cufftdx::type_of<FFT>::value
+
+
+

Type of the FFT operation, as set by Type Operator.

+

The default type is complex-to-complex, fft_type::c2c.

+
+
+

Direction Trait

+
cufftdx::direction_of<FFT>::value
+
+
+

Direction of the FFT operation, as set by Direction Operator.

+

Default direction:

+
+
    +
  • If the FFT type is fft_type::r2c, the default direction is fft_direction::forward.

  • +
  • If the FFT type is fft_type::c2r, the default direction is fft_direction::inverse.

  • +
  • For any other type, there is no default direction. If the descriptor was not created using a Direction Operator, compilation will fail with an error message.

  • +
+
+
+
+

Precision Trait

+
cufftdx::precision_of<FFT>::type
+
+
+

Floating-point precision of the FFT operation, as set by Precision Operator.

+

The default precision is float.

+
+
+

Is FFT? Trait

+
cufftdx::is_fft<FFT>::value
+
+
+

Trait is true if the descriptor is an FFT description, formed with Description Operators.

+

There is no default value. The descriptor either is or is not an FFT description.

+
+
+

Is FFT Execution? Trait

+
cufftdx::is_fft_execution<FFT>::value
+
+
+

Trait is true if the descriptor is an FFT description, formed with Description Operators and an Execution Operator.

+

There is no default value. The descriptor either is or is not an FFT description including an Execution Operator.

+
+
+

Is FFT-complete? Trait

+
cufftdx::is_complete_fft<FFT>::value
+
+
+

Trait is true if the descriptor is a complete FFT description, formed with Description Operators.

+
+

Note

+

Complete in this context means that the descriptor has been formed with all the necessary Description Operators and it is only missing an Execution Operator to be able to run.

+
+

For an FFT descriptor to be complete, the following is required:

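+
  • One, and only one, Size Operator.
  • One, and only one, Direction Operator, unless the FFT type is fft_type::r2c or fft_type::c2r (the direction is then implied).
  • One, and only one, SM Operator.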

There is no default value. The descriptor either is or is not an FFT-complete description.

+
+
+

Is FFT-complete Execution? Trait

+
cufftdx::is_complete_fft_execution<FFT>::value
+
+
+

Trait is true if both cufftdx::is_fft_execution and cufftdx::is_complete_fft are true.

+
+

Note

+

If cufftdx::is_complete_fft_execution trait is true for a descriptor FFT, then we can use the Execution Methods +to compute the FFT.

+
+

There is no default value.

+
+
+
+
+

Execution Traits

+

Execution traits can be retrieved directly from an FFT descriptor that has been configured with Execution Operators. +The available execution traits depend on the operator used to build the descriptor; either a Thread Operator or a Block Operator.

+
+

Thread Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

Description::value_type

detail::complex<float>

Complex type of the underlying data used to compute the FFT.

Description::input_type

Description::value_type

Type of the underlying data used as input for the FFT.

Description::output_type

Description::value_type

Type of the underlying data used as output for the FFT.

Description::implicit_type_batching

2 if cufftdx::precision_of<FFT>::type is __half, otherwise 1

Number of values from different FFTs batched into one element of type Description::value_type.

Description::elements_per_thread

size_of<Description>::value

Number of FFT elements to be computed per thread.

Description::storage_size

Description::elements_per_thread

Number of Description::value_type elements that each thread must allocate to compute the FFT.

Description::stride

Always 1

Stride between elements of the thread FFT held by each thread in its input

+

Thread traits can be retrieved from descriptors built with Thread Operator.

+

For example:

+
#include <cufftdx.hpp>
+
+using FFT          = decltype(cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                            + cufftdx::Direction<fft_direction::forward>()
+                            + cufftdx::Precision<double>() + Thread());
+
+
+// Retrieve the FFT data type
+using complex_type = typename FFT::value_type;
+
+// Retrieve the number of elements per thread
+auto elements_per_thread = FFT::elements_per_thread;
+
+
+
+

Value Type Trait

+
FFT::value_type
+
+
+

Complex type of the underlying data used for FFT computation.

+

The default type is cufftdx::detail::complex<float>, as defined in the types.hpp header file.

+
+
+

Input Type Trait

+
FFT::input_type
+
+
+

Complex type of the underlying data used as input of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Output Type Trait

+
FFT::output_type
+
+
+

Complex type of the underlying data used as output of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Implicit Type Batching Trait

+
FFT::implicit_type_batching
+
+
+

Number of values from different FFTs batched into one element of type Description::value_type used in FFT computation. If +it’s higher than one it means that Thread FFT object calculates multiple FFTs in one go.

+

The value is 2 if cufftdx::precision_of<FFT>::type is __half, and 1 otherwise.

+
+

Note

+

Please note that in future releases of cuFFTDx FFT::implicit_type_batching may be replaced, and/or extended.

+
+
+
+

Elements Per Thread Trait

+
FFT::elements_per_thread
+
+
+

Number of FFT elements of the type returned by Value Type Trait that each thread will compute.

+

The default value is the same as Size Trait.

+
+
+

Storage Size Trait

+
FFT::storage_size
+
+
+

Number of Description::value_type elements that each thread must allocate to compute the FFT.

+

The default value is the same as Elements Per Thread Trait.

+
+
+

Stride Size Trait

+
FFT::stride
+
+
+

Stride between elements of the FFT held by each thread in input.

+

For thread FFT FFT::stride is always 1.

+
+
+
+

Block Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

Description::value_type

detail::complex<float>

Complex type of the underlying data used to compute the FFT.

Description::input_type

Description::value_type

Type of the underlying data used as input for the FFT.

Description::output_type

Description::value_type

Type of the underlying data used as output for the FFT.

Description::workspace_type

Description::workspace_type

Device-side type of workspace required for FFT computation.

Description::implicit_type_batching

2 if cufftdx::precision_of<FFT>::type is __half; otherwise 1

Number of values from different FFTs batched into one element of type Description::value_type.

Description::elements_per_thread

Heuristic.

Number of FFT elements to be computed per thread.

Description::storage_size

Determined by Description::elements_per_thread

Number of Description::value_type elements that each thread must allocate to compute the FFT.

Description::stride

Determined by Description::elements_per_thread and size of the FFT

Stride between elements of the block FFT held by each thread in its input

Description::ffts_per_block

1

Number of FFTs to compute by a CUDA block in this FFT operation.

Description::suggested_ffts_per_block

Heuristic.

Suggested number of FFTs to compute by a CUDA block to target maximum performance.

Description::shared_memory_size

Determined from Description::ffts_per_block and Description::elements_per_thread

Size of the shared memory in bytes.

Description::block_dim

See Block Dim Trait.

dim3 of the CUDA block to compute the FFT operation.

Description::max_threads_per_block

Determined from Description::block_dim

Total number of threads in the CUDA block.

Description::requires_workspace

True if FFT implementation requires extra workspace; otherwise - false.

Determines if it’s required to allocate extra workspace in global memory using cufftdx::make_workspace(cudaError_t&).

Description::workspace_size

0 if Description::requires_workspace is false, otherwise > 0.

Size of global memory required for workspace (in bytes).

+

Block traits can be retrieved from descriptors built with Block Operator.

+

For example:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block()
+                    + cufftdx::ElementsPerThread<8>() + cufftdx::FFTsPerBlock<2>() );
+
+// Retrieve the FFT data type
+using complex_type = typename FFT::value_type;
+
+// Allocate managed memory for input/output
+complex_type* data;
+auto          size       = FFT::ffts_per_block * cufftdx::size_of<FFT>::value;
+auto          size_bytes = size * sizeof(complex_type);
+
+cudaMallocManaged(&data, size_bytes);
+
+
+
+

Value Type Trait

+
FFT::value_type
+
+
+

Complex type of the underlying data used for FFT computation.

+

The default type is cufftdx::detail::complex<float>, as defined in the types.hpp header file.

+
+
+

Input Type Trait

+
FFT::input_type
+
+
+

Complex type of the underlying data used as input of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Output Type Trait

+
FFT::output_type
+
+
+

Complex type of the underlying data used as output of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Workspace Type Trait

+
FFT::workspace_type
+
+
+

Type of a workspace required by execute(...) function of FFT. User should check if FFT requires a workspace +using Description::requires_workspace trait, and create one with cufftdx::make_workspace<FFT>(cudaError_t&).

+

See Make Workspace Function for more details about workspace.

+
+

Warning

+

FFT::workspace_type object doesn’t track the lifetime of the underlying memory, and is only valid within the lifetime of the +workspace object it was cast from.

+
+
+

Warning

+

Type returned by cufftdx::make_workspace<FFT>(cudaError_t&) can be different for different FFT descriptions, +and is not the same as FFT::workspace_type. User should use auto when creating a workspace object.

+
+
+
+

Implicit Type Batching Trait

+
FFT::implicit_type_batching
+
+
+

Number of values from different FFTs batched into one element of type Description::value_type used in FFT computation. If +it’s higher than one it means that Block FFT object calculates multiple FFTs in one go.

+

The value is 2 if cufftdx::precision_of<FFT>::type is __half, and 1 otherwise.

+
+

Note

+

Please note that in future releases of cuFFTDx FFT::implicit_type_batching may be replaced, and/or extended.

+
+
+
+

Elements Per Thread Trait

+
FFT::elements_per_thread
+
+
+

Number of FFT elements of the type returned by Value Type Trait that each thread will compute.

+

The default value is determined heuristically by the library.

+
+
+

Storage Size Trait

+
FFT::storage_size
+
+
+

Number of Description::value_type elements that each thread must allocate to compute the FFT.

+

The default value is the same as Elements Per Thread Trait.

+
+
+

Stride Size Trait

+
FFT::stride
+
+
+

Stride between elements of the block FFT held by each thread in its input.

+

See also expected input data format.

+
+

Example

+

0-th thread of 8-point FFT with FFT::stride equal to 2 should have values 0, 2, 4, and 6 in its input.

+
+
+
+

FFTs Per Block Trait

+
FFT::ffts_per_block
+
+
+

Number of FFTs to compute in parallel within a CUDA block, as part of the collective FFT operation.

+

The default value is 1.

+
+
+

Suggested FFTs Per Block Trait

+
FFT::suggested_ffts_per_block
+
+
+

Suggested number of FFTs to compute in parallel within a CUDA block, as part of the collective FFT operation, to maximize performance.

+

The default value is heuristic, and depends on the size of the FFT, the number of elements per thread, and other parameters.

+
+
+

Shared Memory Size Trait

+
FFT::shared_memory_size
+
+
+

Size of the required shared memory for the FFT operation to execute, in bytes.

+

The default value is determined from FFTs Per Block Trait and Elements Per Thread Trait.

+
+
+

Block Dim Trait

+
FFT::block_dim
+
+
+

BlockDim<unsigned int X, Y, Z> with x = size_of<Description>::value/Description::elements_per_thread, y = Description::ffts_per_block, z = 1

+
+
+

Max Threads Per Block Trait

+
FFT::max_threads_per_block
+
+
+

Maximum number of threads for the FFT in the CUDA block.

+

The default value is determined from FFTs Per Block Trait and Elements Per Thread Trait.

+
+
+

Requires Workspace Trait

+
FFT::requires_workspace
+
+
+

Boolean value. If true, a workspace must be created and passed to the FFT::execute(...) method (see block execute methods). +Otherwise, it’s not necessary to create and pass a workspace. Workspace can be created using +cufftdx::make_workspace<FFT>(cudaError_t&) function. Workspace created for FFT which does not require one +will be empty and won’t allocate any global memory.

+
+
+

Workspace Size Trait

+
FFT::workspace_size
+
+
+

Informs how much global memory (in bytes) will be allocated by the required workspace. If Description::requires_workspace is false +it’s 0; otherwise it’s greater than zero.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/genindex.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/genindex.html new file mode 100644 index 0000000000000..9b7a8823bcc26 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/genindex.html @@ -0,0 +1,254 @@ + + + + + + Index — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Index
  • +
  • +
  • +
+
+
+
+
+ + +

Index

+ +
+ +
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/index.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/index.html new file mode 100644 index 0000000000000..547847a24ac4c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/index.html @@ -0,0 +1,315 @@ + + + + + + NVIDIA cuFFTDx — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • NVIDIA cuFFTDx
  • +
  • +
  • +
+
+
+
+
+ +
+

NVIDIA cuFFTDx

+

The cuFFT Device Extensions (cuFFTDx) library enables you to perform Fast Fourier Transform (FFT) calculations +inside your CUDA kernel. Fusing FFT with other operations can decrease the latency and improve the performance of +your application.

+
+
The documentation consists of two main components:
+
+
+
+
+

Highlights

+
    +
  • Fast Fourier Transform (FFT) embeddable into a CUDA kernel

  • +
  • High performance, no unnecessary data movement from and to global memory

  • +
  • Customizability, options to adjust selection of FFT routine for different needs (size, precision, batches etc.)

  • +
  • Ability to fuse FFT kernels with other operations saving global memory trips

  • +
  • Compatibility with future versions of the CUDA Toolkit

  • +
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/introduction.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/introduction.html new file mode 100644 index 0000000000000..f22270ed5f69b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/introduction.html @@ -0,0 +1,634 @@ + + + + + + First FFT using cuFFTDx — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • First FFT using cuFFTDx
  • +
  • +
  • +
+
+
+
+
+ +
+

First FFT using cuFFTDx

+

In the following example, we will calculate an FFT of size 128 using a standalone +kernel. We start with an empty CUDA kernel:

+
// Empty kernel to compute an FFT of size 128 using float
+__global__ void fft_128_float(float2* data) {
+
+}
+
+
+

First, we have to provide an FFT description to the cuFFTDx library. A cuFFTDx transform description +is built using C++ constructs that are evaluated at compile time. A correctly-defined FFT must include +the problem size, the precision used (float, double, etc.), the type of operation (complex-to-complex, +real-to-complex, etc.), and its direction (forward, or inverse). We add the following lines:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>());
+}
+
+
+

In order to encode the FFT properties, cuFFTDx provides operators Size Operator, +Precision Operator, Type Operator, and Direction Operator. +Listed operators can be combined by using the addition operator (+).

+

To obtain a fully usable CUDA FFT kernel, we need to provide three additional +pieces of information. The first one is how many FFTs we would like to compute, +the second one is how to map the calculations into a CUDA block, and the +last one is what CUDA architecture we are targeting.

+

In cuFFTDx, we specify how many FFTs we want to compute using the FFTs Per Block Operator. +It defines how many FFTs to compute in parallel inside of a single CUDA block. Let us +add that operator:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>());
+}
+
+
+

To map the computing of the FFT to the CUDA block, we use the Elements Per Thread Operator. +This operator determines the number of registers required per thread and the exact implementation +to be used. It also influences the required CUDA block size. We add that operator to the description:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block with 8 elements per thread
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>());
+}
+
+
+

Finally, we use the SM Operator to indicate the target CUDA architecture +on which we want to build the FFT descriptor. Each GPU architecture can use different +parameters. Therefore, the choice of architecture potentially affects the configuration +to maximize performance. For this example, we target Volta GPUs (SM<700>()):

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block with 8 elements per thread
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>());
+}
+
+
+

Once the FFT description is fully formed, we can finalize it by adding the +Block Operator. It indicates that we are asking for the +collective FFT operation to be performed by a single CUDA block. The operator +verifies correctness of the description, and it is one of the Execution Operators +(the other being the Thread Operator).

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an
+// FFT of size 128 using float and one FFT per block
+// with 8 elements per thread, targeting Volta arch
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+}
+
+
+
+

What next?

+

FFT descriptions can be instantiated into objects. Forming the object has +no computational cost, and should be seen as a handle. The FFT descriptor object +provides a compute method, execute(...) that performs the requested FFT.

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  // Execute FFT
+  FFT().execute(/*What are the arguments?*/);
+}
+
+
+

cuFFTDx operations require registers and shared memory to operate. Users can query the FFT descriptor +for needed resources.

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution, where each thread allocates data in registers
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                      + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                      + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  complex_type thread_data[FFT::storage_size];
+
+  extern __shared__ complex_type shared_mem[];
+
+  // Execute FFT
+  FFT().execute(thread_data, shared_mem);
+}
+
+
+

Some FFTs, depending on the selected size, may also require additional global memory workspace, +which needs to be allocated on the host and passed to the kernel. You can check if you have to create a workspace +using the FFT::requires_workspace trait.

+
#include <cufftdx.hpp>
+
+using namespace cufftdx;
+
+using FFT = decltype(Size<151>() + Precision<double>() + Type<fft_type::c2c>()
+                    + Direction<fft_direction::inverse>() + FFTsPerBlock<2>()
+                    + ElementsPerThread<16>() + SM<700>() + Block());
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution, where each thread allocates data in registers
+__global__ void fft_128_float(float2* data, typename FFT::workspace_type workspace) {
+  using complex_type = typename FFT::value_type;
+
+  complex_type thread_data[FFT::storage_size];
+
+  extern __shared__ complex_type shared_mem[];
+
+  // Execute FFT
+  FFT().execute(thread_data, shared_mem, workspace);
+}
+
+
+

To launch a kernel we need to know the block size and required amount of shared memory needed to perform the FFT +operation. Both are fixed and determined by the FFT description.

+

Since we defined the FFT description in device code, information about the +block size needs to be propagated to the host. When all parameters are fully specified, +all GPU architectures use the same block size, so the kernel can be launched in +the same manner for all architectures.

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using complex_type = typename FFT::value_type;
+
+    complex_type thread_data[FFT::storage_size];
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+}
+
+// Host function, data is a managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  cudaError_t error_code = cudaSuccess;
+  auto workspace = make_workspace<FFT>(error_code);
+
+  block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>((complex_type*)data, workspace);
+}
+
+
+

If we also add input/output operations to global memory, we obtain a kernel that is +equivalent to the cuFFT kernel for size 128.

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using namespace cufftdx;
+
+    using complex_type = typename FFT::value_type;
+
+    // Local array and copy data into it
+    complex_type thread_data[FFT::storage_size];
+
+    const int stride = size_of<FFT>::value / FFT::elements_per_thread;
+
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      thread_data[i].x = data[threadIdx.x + i * stride].x;
+      thread_data[i].y = data[threadIdx.x + i * stride].y;
+    };
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+
+    // Save results
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      data[threadIdx.x + i * stride].x = thread_data[i].x;
+      data[threadIdx.x + i * stride].y = thread_data[i].y;
+    };
+}
+
+// Host function, data is a managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  cudaError_t error_code = cudaSuccess;
+  auto workspace = make_workspace<FFT>(error_code);
+
+  block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>((complex_type*)data, workspace);
+}
+
+
+

Unlike cuFFT, cuFFTDx does not require moving data back to global memory after +executing an FFT operation. This is a major performance advantage.

+
+
+

Compilation

+

In order to compile we only need to pass the location of the cuFFTDx library (the directory with the cufftdx.hpp file).

+
nvcc -std=c++11 -arch sm_70 -O3 -I<path_to_cuFFTDx_location> my_fft_kernel_128.cu -o my_fft_kernel_128
+
+
+
+

Note

+

Since version 0.3.0 cuFFTDx has experimental support for compilation with NVRTC.

+
+
+
+
+

Your next custom FFT kernels

+

For real world use cases, it is likely we will need more than a single kernel. +A single use case, aiming at obtaining the maximum performance on multiple architectures, +may require a number of different implementations. cuFFTDx was designed +to handle this burden automatically, while offering users full control over +the implementation details.

+

cuFFTDx allows the user to defer the definition of certain details of the implementation +(such as the number of FFT elements computed per thread, or the number of FFTs per block) +to the library. Let us apply this to our previous kernel:

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using namespace cufftdx;
+
+    using complex_type = typename FFT::value_type;
+
+    // Local array and copy data into it
+    complex_type thread_data[FFT::storage_size];
+
+    const int stride = size_of<FFT>::value / FFT::elements_per_thread;
+
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      thread_data[i].x = data[threadIdx.x + i * stride].x;
+      thread_data[i].y = data[threadIdx.x + i * stride].y;
+    };
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+
+    // Save results
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      data[threadIdx.x + i * stride].x = thread_data[i].x;
+      data[threadIdx.x + i * stride].y = thread_data[i].y;
+    };
+}
+
+// Host function, data is managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  // Create a complete descriptor
+  using FFTComplete = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                             + Direction<fft_direction::forward>() + SM<700>());
+
+  if(is_complete_fft<FFTComplete>::value == true) {
+
+    // Retrieve suggested elements per thread and FFTs per block and use them
+    // to create a complete descriptor
+    using FFTExecution = decltype(FFTComplete()
+                                + ElementsPerThread<FFTComplete::elements_per_thread>()
+                                + FFTsPerBlock<FFTComplete::suggested_ffts_per_block>()
+                                + Block());
+
+    using complex_type = typename FFTExecution::value_type;
+
+    cudaError_t error_code = cudaSuccess;
+    auto workspace = make_workspace<FFTExecution>(error_code);
+
+    block_fft_kernel<FFTExecution><<<1, FFTExecution::block_dim, FFTExecution::shared_memory_size>>>(
+        (complex_type*)data, workspace
+    );
+  }
+}
+
+
+

To retrieve the optimal parameters, we require a complete descriptor (as indicated by +cufftdx::is_complete_fft). This is because some of the details are only available +after the FFT operation has been fully described, and the target architecture has been +identified. SM Operator compiled on the host allows the user to query +launch parameters for a particular architecture.

+
+

What happens under the hood?

+
+
Expression templates

The cuFFTDx API is using a variation of a C++ technique called expression templates. +We use expression templates to allow the user to construct compile-time objects that +describe the FFT calculation to compute. Compile-time C++ mechanisms allow cuFFTDx to +attach optimized FFT routines to the object, and expose them as a compute method +that can be called by the user.

+
+
Header only

cuFFTDx FFT routines are shipped as optimized inline PTX.

+
+
+
+
+

Why?

+

For a library to be useful, it needs to abstract functionality in a future-proof manner. +By future-proof we mean that an existing user code should not need to be modified +in the future, and new functionality should consist of simple extensions to the +existing code. On the CUDA platform, this requires adapting to quickly evolving +GPU hardware.

+

cuFFTDx approaches future-proofing in two ways. On one hand, the API is a +source-level abstraction which decouples the library from ABI changes. +Along with the PTX code in headers, cuFFTDx is forward-compatible with any CUDA +toolkit, driver and compiler that supports hardware that cuFFTDx was released for. +PTX can be recompiled by the CUDA compiler to run on future GPU architectures.

+

On the other hand, the API organization allows preserving operators describing +what gets computed and how. New features depending on type can either be picked up +automatically if code defers implementation choices to the library, or require +adding operators to an existing expression.

+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/objects.inv b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/objects.inv new file mode 100644 index 0000000000000..7f7f67075acbf Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/objects.inv differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/performance.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/performance.html new file mode 100644 index 0000000000000..e0628f1eb3ec8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/performance.html @@ -0,0 +1,335 @@ + + + + + + Achieving high performance — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Achieving high performance
  • +
  • +
  • +
+
+
+
+
+ +
+

Achieving high performance

+

In High-Performance Computing, the ability to write customized code enables +users to target better performance. In the case of cuFFTDx, the potential for +performance improvement of existing FFT applications is high, but it greatly +depends on how the library is used. Taking the regular cuFFT library as +baseline, the performance may be up to one order of magnitude better or worse. +For this reason porting existing sources to cuFFTDx should always be done in +parallel with performance analysis. Below we list general advice that +may help in this process.

+
+

General advice

+
    +
  • Try library-provided default settings to start with best compute performance

  • +
  • Best parameters for compute bound and memory bound kernels might not be identical

  • +
  • Ensure FFT kernel runs enough blocks to fill a GPU for peak performance

  • +
  • Merge adjacent memory bound kernels (pre- and post-processing) with an FFT kernel to save global memory trips

  • +
+
+
+

Memory management

+
    +
  • Avoid reading/writing data from global memory

  • +
  • Ensure global memory reads/writes are coalesced (increase the value of FFTs Per Block Operator if needed)

  • +
  • Use shared memory or extra registers to store the temporary data

  • +
+
+
+

Kernel fusion

+
    +
  • For complex kernels consider adjusting the FFT operation to match the user kernel +(i.e., tweaking Elements Per Thread Operator will change required CUDA block size). Upcoming versions of +cuFFTDx will offer more customization options.

  • +
  • For simple operations consider merging operations into FFT kernel optimized +for FFT performance.

  • +
+
+
+

Advanced

+ +
+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/release_notes.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/release_notes.html new file mode 100644 index 0000000000000..1c86b9d57e70e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/release_notes.html @@ -0,0 +1,290 @@ + + + + + + Release Notes — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Release Notes
  • +
  • +
  • +
+
+
+
+
+ +
+

Release Notes

+

This section includes significant changes, new features, performance improvements, and various issues. Unless noted, +listed issues should not impact functionality. When functionality is impacted, we offer a work-around to avoid the issue (if available).

+
+

1.0.0

+

The first general availability (GA) release of cuFFTDx library.

+
+

New Features

+ +
+
+

Resolved Issues

+
    +
  • ptxas warning program uses 32-bit address on line XXX which is conflicting with .address_size 64 shouldn’t appear anymore.

  • +
+
+
+
+

0.3.1

+

The last early access (EA) release of cuFFTDx library.

+
+

Known Issues

+
    +
  • ptxas warning about pointer size conflict:

    +
    ptxas warning : Program uses 32-bit address on line 'XXX' which is conflicting with .address_size 64
    +
    +
    +

    This warning may appear when compiling, but it does not impact functionality or performance.

    +
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/requirements_func.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/requirements_func.html new file mode 100644 index 0000000000000..e8a5ca86f54fc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/requirements_func.html @@ -0,0 +1,396 @@ + + + + + + Requirements and Functionality — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Requirements and Functionality
  • +
  • +
  • +
+
+
+
+
+ +
+

Requirements and Functionality

+
+
+

Requirements

+

The cuFFTDx library is a CUDA C++ header-only library. Therefore, the list of required software to use the library is relatively small. The user needs:

+
    +
  • CUDA Toolkit 11.0 or newer

  • +
  • Supported CUDA compiler

  • +
  • Supported host compiler (C++17 required)

  • +
  • (Optionally) CMake (version 3.18 or greater)

  • +
+
+

Supported Compilers

+

CUDA Compilers:

+
    +
  • NVCC 11.0.194+ (CUDA Toolkit 11.0 or newer)

  • +
  • (Experimental support) NVRTC 11.0.194+ (CUDA Toolkit 11.0 or newer)

  • +
+

Host / C++ Compilers:

+
    +
  • GCC 7+

  • +
  • Clang 9+ (only on Linux/WSL2)

  • +
  • Compiling with MSVC (Windows) is not supported

  • +
+
+

Note

+

cuFFTDx emits errors for unsupported versions of compilers, which can be silenced by defining CUFFTDX_IGNORE_DEPRECATED_COMPILER +during compilation. cuFFTDx is not guaranteed to work with versions of compilers that are not supported in cuFFTDx.

+
+
+

Note

+

cuFFTDx emits errors for unsupported versions of C++ standard, which can be silenced by defining CUFFTDX_IGNORE_DEPRECATED_DIALECT +during compilation. cuFFTDx is not guaranteed to work with versions of C++ standard that are not supported in cuFFTDx.

+
+
+
+
+

Supported Functionality

+
+
Supported functions include:
    +
  • Create block descriptors that run collective FFT operations (with one or more threads collaborating to compute one or more FFTs) in a single CUDA block. See Block Operator.

  • +
  • Create thread descriptors that run a single FFT operation per thread. This function might require more expertise with cuFFTDx in order to obtain correct results with higher performance. See Thread Operator.

  • +
  • Bi-directional information flow, from the user to the descriptor via Operators and from the descriptor to the user via Traits.

  • +
  • Target specific GPU architectures using the SM Operator. This enables users to configure the descriptor with suggested parameters to target performance.

  • +
+
+
+

cuFFTDx supports selected FFT sizes in the range [0; max_size] and all sizes in the range [0; max_size/2], where max_size depends on precision, type, +and CUDA architecture. However, not every combination of size, precision, elements per thread, and FFTs per block is correct and available. The following +table summarizes the available configurations:

+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Type

Precision

Thread FFT Sizes

Block FFT Sizes

Architecture

Size Range

    +
  • Complex-to-complex

  • +
  • Real-to-complex

  • +
  • Complex-to-real

  • +
+

half

All sizes in range: [2; 32]

75

[2; 4096]

70;72;86

[2; 16384]

80

[2; 32768]

float

All sizes in range: [2; 32]

75

[2; 4096]

70;72;86

[2; 16384]

80

[2; 32768]

double

All sizes in range: [2; 16]

75

[2; 2048]

70;72;86

[2; 8192]

80

[2; 16384]

+
+

Note

+

cuFFTDx 0.3.0 added preliminary support for all sizes in the range [0; max_size/2]. Most sizes will require you to create an additional workspace with a global memory allocation. See Make Workspace Function +for more details about workspace. You can check whether a given FFT requires a workspace with the FFT::requires_workspace trait.

+
+
+

Workspace is not required for FFTs of the following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
+
In future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
  • FFT configurations that do not require workspace will continue to do so.

  • +
+
+
+
+
+
+
Functionality not yet supported includes:
    +
  • Input/output stored in global memory. Input data must be in registers (local memory) or shared memory.

  • +
  • The BlockDim Operator, which enables fine-grain customization of the CUDA block dimensions.

  • +
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/search.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/search.html new file mode 100644 index 0000000000000..469152f361622 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/search.html @@ -0,0 +1,269 @@ + + + + + + Search — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Search
  • +
  • +
  • +
+
+
+
+
+ + + + +
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/searchindex.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/searchindex.js new file mode 100644 index 0000000000000..354e51f853d56 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({docnames:["api/empty_workspace_list","api/index","api/methods","api/operators","api/traits","index","introduction","performance","release_notes","requirements_func","warnings/bit_identical","warnings/bit_identical_sm"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,sphinx:56},filenames:["api/empty_workspace_list.rst","api/index.rst","api/methods.rst","api/operators.rst","api/traits.rst","index.rst","introduction.rst","performance.rst","release_notes.rst","requirements_func.rst","warnings/bit_identical.rst","warnings/bit_identical_sm.rst"],objects:{},objnames:{},objtypes:{},terms:{"0":[2,4,5,6,9],"1":[2,3,4,5,6,7],"10":[0,2,9],"10000":[0,2,9],"11":[0,2,6,9],"12":[0,2,9],"128":[2,3,4,6],"1296":[0,2,9],"1331":[0,2,9],"151":6,"15625":[0,2,9],"16":[3,6,9],"16384":[3,9],"17":9,"1728":[0,2,9],"18":9,"194":9,"19683":[0,2,9],"2":[0,2,3,4,6,7,9],"2048":9,"2401":[0,2,9],"2d":3,"3":[0,2,5,6,7,9],"32":[3,8,9],"32768":[0,2,3,9],"3d":3,"4":[2,4,7],"4096":9,"5":[0,2,7,9],"6":[0,2,4,7,9],"64":8,"7":[0,2,7,9],"70":9,"700":[3,6],"72":9,"720":3,"75":9,"750":3,"8":[2,3,4,6],"80":9,"800":3,"8192":9,"86":9,"860":3,"9":9,"abstract":6,"boolean":4,"byte":[2,4],"case":[6,7],"class":[2,6],"const":6,"default":[3,4,7,8],"do":[0,2,6,9],"final":6,"float":[2,3,4,6,9],"function":[1,3,4,5,6,7,8],"int":[3,4,6],"long":2,"new":6,"return
":[2,4],"true":[2,3,4,6],"try":7,"void":[2,6],"while":6,A:[2,5,6],By:6,For:[2,3,4,6,7],If:[2,3,4,6],In:[0,6,7,9],Is:3,It:[2,3,6,10,11],Not:3,On:6,One:[3,4],The:[2,3,4,5,6,8,9],There:[3,4],These:2,To:6,With:3,__global__:[2,6],__half:[3,4],__launch_bounds__:[2,6],__shared__:[2,6],_valuetyp:4,a100:8,abi:6,abil:[5,7],abl:4,about:[4,6,8,9],access:8,achiev:5,ad:[3,4,6,8,9],adapt:6,add:6,addit:[2,6,9],address:8,address_s:8,adjac:7,adjust:[5,7],advanc:5,advantag:6,advic:5,affect:6,after:[2,6],aim:6,align:2,alignof:2,all:[2,4,6,9],alloc:[2,4,6,9],allow:[3,6],along:6,also:[2,4,6],alwai:[4,7],amount:6,amper:[3,7],an:[2,3,4,5,6,7],analysi:7,ani:[2,4,6],anymor:8,api:[5,6,7,8],appear:8,appli:6,applic:[5,7],approach:6,ar:[2,3,4,6,7,9],arch:6,architectur:[2,3,6,9,11],architecur:[3,6],argument:[2,6],around:8,arrai:[2,6],ask:6,assum:3,attach:6,auto:[2,4,6],automat:6,avail:[2,4,6,7,8,9],avoid:[7,8],back:[2,6],baselin:7,batch:[2,5],becaus:6,been:[2,4,6],being:6,below:7,best:7,better:7,between:[2,4],bi:9,bit:[2,3,8,10,11],block:[1,6,7,8,9,10],block_dim:[2,3,4,6],block_fft_kernel:[2,6],blockdim:[2,4,9,10],both:[4,6],bound:7,build:[4,6],built:[4,6],burden:6,c2c:[2,3,4,6],c2r:[2,3,4],c:[6,7,9],calcul:[2,3,4,5,6,7,10],call:[2,6],can:[1,2,3,4,5,6,9],cast:[2,4],cc:3,certain:6,chang:[6,7,8],check:[4,6,9],choic:6,clang:9,cmake:9,coalesc:7,code:[2,6,7],collabor:9,collect:[3,4,6,9],com:7,combin:[3,6,9],commun:2,compat:[5,6],compil:[3,4,5,8],complet:[3,6],complex:[2,3,4,6,7,9],complex_typ:[2,4,6],compon:[1,5],comprehens:5,comput:[2,3,4,6,7,9],concurr:[2,3],configur:[0,2,4,6,9],conflict:8,consid:7,consist:[3,5,6],construct:[2,3,4,6],contain:[2,6],context:[3,4],continu:[0,2,9],control:6,cooper:3,copi:[2,6],correct:[6,9],correctli:[2,6],cost:6,cout:4,creat:[2,3,4,6,9],cu:6,cuda:[2,3,4,5,6,7,9,10,11],cudaerror_t:[2,4,6],cudamallocmanag:4,cudaoccupancymaxactiveblockspermultiprocessor:7,cudasuccess:[2,6],cuffdx:[3,6],cufft:[5,6,7],cufftdx:[0,2,3,4,7,8,9],cufftdx_ignore_deprecated_compil:9,cufftdx_ignore
_deprecated_dialect:9,cufttdx:9,custom:[3,5,7,9],customiz:5,data:[3,4,5,6,7,9],decltyp:[2,3,4,6],decoupl:6,decreas:5,defer:6,defin:[2,3,4,6,9],definit:6,depend:[4,6,7,9],describ:[2,3,6],descript:[1,2,6],descriptor:[2,3,4,6,9],design:6,destruct:2,detail:[4,6,9],determin:[3,4,6,7],devic:[4,5,6],differ:[2,3,4,5,6,10,11],dim3:4,dimens:[2,3,9,10],direct:[2,6,9,10],direction_of:4,directli:4,directori:6,divid:[3,4],divisor:3,doc:7,document:[5,8],doe:[4,6,8],doesn:[2,4],don:2,done:7,doubl:[2,3,4,6,9],double2:2,driver:6,dure:9,e:[2,3],ea:8,each:[2,3,4,6],earli:8,either:[3,4,6],element:[2,6,7,9,10],elements_per_thread:[3,4,6],elementsperthread:[2,3,4,6,8,10],embedd:5,emit:9,empti:[2,4,6],enabl:[5,7,9],encod:6,endl:4,enough:7,ensur:7,entir:7,equal:[2,4],equival:6,error:[2,4,9],error_cod:6,etc:[5,6],evalu:6,everi:9,evolv:6,exact:6,exactli:[2,3,11],exampl:[1,3,6],exclus:3,execut:[1,5,6,8,10,11],exist:[6,7],expect:[2,4],experiment:[6,9],expertis:9,expos:6,express:6,extend:4,extens:[5,6],extern:[2,6],extra:[4,7],f:3,fail:[3,4],fals:[2,3,4],fast:[3,5],featur:6,fft:[0,2,5,7,8,9,10,11],fft_128_float:6,fft_direct:[2,3,4,6],fft_input_size_in_byt:2,fft_output_size_in_byt:2,fft_type:[2,3,4,6],fftcomplet:6,fftexecut:6,ffts_per_block:[2,3,4],fftsperblock:[2,3,4,6,8,10],file:[4,6],fill:7,find:1,fine:9,first:[2,5,8],fit:2,fix:6,float2:[2,6],flow:9,follow:[0,2,3,4,6,9],form:[3,4,6],format:4,forward:[2,3,4,6],fourier:[3,5],fp16:2,free:2,freed:2,from:[2,4,5,6,7,9],full:[6,8],fulli:6,further:5,fuse:5,fusion:5,futur:[0,2,4,5,6,9],ga:8,gcc:9,gener:[3,5,8],get:6,given:9,global:[2,4,5,6,7,9],go:4,gpu:[2,3,6,7,9,11],grain:9,greater:[3,4,9],greatli:7,group:3,group__cudart__occup:7,guarante:[2,3,9,10,11],guid:7,ha:[2,4,6],half:[2,9],hand:6,handl:6,happen:5,hardwar:6,have:[2,4,6],header:[4,6,9],held:4,help:7,helper:[2,4],here:1,heurist:[3,4],high:5,higher:[4,9],hood:5,host:[6,9],how:[3,4,6,7],howev:[2,9],hpp:[2,3,4,6],html:7,http:7,i:[2,6],ident:[2,3,7,10,11],identifi:6,ie:7,imaginari:2,imaginary_1:2,im
aginary_2:2,impact:8,implement:[3,4,6],implicit_type_batch:[2,4],implicitli:2,improv:[5,7,8],includ:[2,3,4,6,8,9],increas:7,independ:3,index:[2,7],indic:6,influenc:6,inform:[4,6,9],inlin:6,input:[3,6,9],input_typ:4,insid:[3,5,6],instanti:6,invalid:2,invers:[3,4,6],iostream:4,is_complet:4,is_complete_fft:[4,6],is_complete_fft_execut:[2,4],is_fft:4,is_fft_execut:4,its:[2,4,6],kernel:[2,5,8],know:6,label:[4,6],last:[2,6,8],latenc:5,later:2,launch:[6,7],let:6,level:6,librari:[1,5,6,7,8,9],lifetim:[2,4],like:6,line:[6,8],linux:9,list:[6,7,8,9],load:[2,7],local:[2,6,9],locat:6,logic:2,lose:7,magnitud:7,mai:[0,2,4,6,7,8,9],main:[1,5],major:6,make:[1,4,9],make_workspac:[2,4,6],manag:[4,5,6],mani:6,manner:6,map:6,match:7,max_siz:9,max_threads_per_block:[2,4,6],maxim:[4,6],maximum:[2,4,6],mean:[2,4,6],mechan:6,memori:[5,6,9],merg:7,messag:4,method:[1,4,5,6,8],might:[7,9],miss:4,modifi:6,more:[2,3,4,6,7,9],most:9,move:6,movement:5,msvc:9,much:4,multipl:[4,6],must:[2,3,4,6,9],mutual:3,my_fft_kernel_128:6,n:2,namespac:6,natur:2,necessari:[3,4],need:[5,6,7,9],never:2,newer:9,next:5,none:4,note:[4,5],nsight:7,nsightcomput:7,number:[2,3,4,6,10],nvcc:[6,9],nvidia:7,nvrtc:[6,9],o3:6,o:6,object:[2,3,4,6],obtain:[6,9],occup:7,offer:[6,7,8],onc:6,one:[3,4,6,7,9],onli:[2,3,4,6,9],oper:[1,2,4,5,6,7,9],optim:[6,7,8],optimum:7,option:[5,7,9],order:[2,6,7,9],organ:6,other:[0,2,4,5,6,9],otherwis:[2,4],our:6,output:[3,6,9],output_typ:4,over:6,overload:2,overview:5,p:3,parallel:[3,4,6,7],paramet:[4,6,7,9],part:[2,4],particip:2,particular:6,pass:[2,4,6],path_to_cufftdx_loc:6,peak:7,per:[2,6,7,9,10],perform:[3,4,5,6,8,9],pick:6,piec:6,place:2,platform:6,pleas:4,point:[2,3,4],pointer:[2,6,8],port:7,post:7,potenti:[6,7],power:[0,2,3,8,9],practic:7,pre:7,precis:[2,5,6,9,10],precision_of:4,preliminari:9,preserv:6,previou:6,problem:6,process:[2,7],produc:[2,3,10,11],program:8,proof:6,propag:6,properti:6,provid:[4,5,6,7],ptx:6,ptxa:8,queri:6,quick:5,quickli:6,r2c:[2,3,4],race:2,rang:[3,9],read:5,real:[
2,3,6,9],real_1:2,real_2:2,reason:7,recompil:6,refer:[2,5],regist:[6,7,9],regular:7,rel:9,releas:[4,5,6],remov:[0,2,9],replac:4,request:[3,6],requir:[0,2,3,5,6,7],requires_workspac:[2,3,4,6,9],requiresworkspac:6,resourc:[6,7],respons:2,restor:8,restrict:3,result:[2,3,6,9,10,11],retriev:[4,6],routin:[5,6],rule:2,run:[2,3,4,6,7,9],runtim:7,s:[2,3,4],same:[2,3,4,6,10,11],save:[5,6,7],second:[2,6],section:[2,8],see:[2,3,4,8,9],seen:6,select:[5,6,9],separ:[3,7],set:[3,4,7],share:[6,7,8,9],shared_mem:[2,6],shared_memori:2,shared_memory_input:2,shared_memory_s:[2,4,6],ship:6,should:[2,3,4,6,7,8],shouldn:8,side:4,signific:8,silenc:9,simpl:[6,7],simultan:3,sinc:[2,6],singl:[2,3,6,9],size:[0,2,5,6,7,8,9,10],size_byt:4,size_of:[3,4,6],sizeof:4,sm70:8,sm80:8,sm:[4,6,9],sm_70:[3,6],sm_72:3,sm_75:3,sm_80:3,sm_86:3,small:9,smaller:[2,3],so:[0,2,6,9],softwar:9,solv:3,some:6,sourc:[6,7],specif:9,specifi:[3,6],standalon:6,standard:9,start:[5,6,7],std:[4,6],storage_s:[2,4,6],store:[2,7,9],stream:7,stride:[2,6,8],struct:3,suggest:[6,9],suggested_ffts_per_block:[4,6],summar:9,support:[3,5,6],t:[2,4,8],tabl:9,take:7,target:[3,4,6,7,8,9],techniqu:6,templat:[2,6],temporari:7,th:[2,4],than:[2,3,4,6],thei:[3,4],them:6,therefor:[6,9],thi:[2,3,4,6,7,8,9],those:2,thread:[1,6,7,9,10],thread_data:[2,6],threadidx:6,three:6,thu:2,time:6,toolkit:[5,6,9],total:4,track:[2,4],trait:[1,2,5,6,9],transform:[3,5,6],trip:[5,7],tune:[3,7],ture:[3,7],tweak:7,two:[2,3,5,6,8],type:[2,6,9,10],type_of:4,typenam:[2,4,6],u:3,under:5,underli:[2,3,4],unless:[3,4,8],unlik:6,unnecessari:5,unnorm:3,unsign:[3,4],unsupport:9,up:[0,2,6,7,9],upcom:7,upon:2,us:[2,3,4,5,7,8,9],usabl:6,usag:1,user:[2,3,4,6,7,9],v100:8,valid:[2,4],valu:[3,6,7,8],value_typ:[2,4,6],variat:6,variou:8,verifi:6,version:[0,2,5,6,7,9],via:9,volta:[3,6,7],wa:[2,4,6],wai:6,want:6,warn:8,we:[4,6,7,8],well:3,what:[5,7],when:[2,3,4,6,8],where:[2,6,9],which:[2,3,4,6,8,9],why:5,window:9,within:[2,3,4],without:7,won:4,work:[8,9],workspac:[0,1,6,9],workspace_s
:[2,4],workspace_typ:[2,4,6],world:6,wors:7,would:6,write:7,wsl2:9,x:[3,4,6],xxx:8,y:[3,4,6],yet:[3,9],you:[1,5,6,9],your:5,z:[3,4],zero:4},titles:["<no title>","cuFFTDx API Reference","Execution Methods","Operators","Traits","NVIDIA cuFFTDx","First FFT using cuFFTDx","Achieving high performance","Release Notes","Requirements and Functionality","<no title>","<no title>"],titleterms:{"0":8,"1":8,"3":8,"function":[2,9],"new":8,In:2,Is:4,achiev:7,advanc:7,advic:7,api:1,batch:4,block:[2,3,4],blockdim:3,compil:[6,9],complet:4,configur:3,cufftdx:[1,5,6],custom:6,data:2,descript:[3,4],dim:4,direct:[3,4],element:[3,4],exampl:[2,4],execut:[2,3,4],featur:8,fft:[3,4,6],first:6,format:2,further:7,fusion:7,gener:7,guid:5,happen:6,high:7,highlight:5,hood:6,implicit:4,input:[2,4],issu:8,kernel:[6,7],known:8,make:2,manag:7,max:4,memori:[2,4,7],method:2,next:6,note:8,nvidia:5,oper:3,output:[2,4],per:[3,4],perform:7,precis:[3,4],read:7,refer:[1,7],regist:2,releas:8,requir:[4,9],resolv:8,share:[2,4],size:[3,4],sm:3,storag:4,stride:4,suggest:4,support:9,thread:[2,3,4],trait:4,type:[3,4],under:6,us:6,user:5,valu:[2,4],what:6,why:6,workspac:[2,4],your:6}}) \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical.html new file mode 100644 index 0000000000000..9b02de0cfa38d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical.html @@ -0,0 +1,258 @@ + + + + + + <no title> — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

+ +

will produce bit-identical results.

+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical_sm.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical_sm.html new file mode 100644 index 0000000000000..96837921c736c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/docs/cufftdx/warnings/bit_identical_sm.html @@ -0,0 +1,253 @@ + + + + + + <no title> — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/CMakeLists.txt b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/CMakeLists.txt new file mode 100644 index 0000000000000..8b857323eefff --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/CMakeLists.txt @@ -0,0 +1,267 @@ +cmake_minimum_required(VERSION 3.18.0) + +# cuFFTDxExamples project +project(cuFFTDxExamples LANGUAGES CXX CUDA) + +# Find CUDA Toolkit packaged, required for NVRTC sample +find_package(CUDAToolkit) + +# Project options +option(USE_MATHDX_PACKAGE "Use mathDx package to find cuFFTDx" ON) +option(USE_CUFFTDX_PACKAGE "Use cuFFTDx package to find cuFFTDx" OFF) + +if(DEFINED cufftdx_ROOT OR DEFINED ENV{cufftdx_ROOT}) + SET(USE_CUFFTDX_PACKAGE ON CACHE BOOL "Use cuFFTDx package to find cuFFTDx" FORCE) + SET(USE_MATHDX_PACKAGE OFF CACHE BOOL "Use mathDx package to find cuFFTDx" FORCE) +endif() + +if(DEFINED mathdx_ROOT OR DEFINED ENV{mathdx_ROOT}) + SET(USE_CUFFTDX_PACKAGE OFF CACHE BOOL "Use cuFFTDx package to find cuFFTDx" FORCE) + SET(USE_MATHDX_PACKAGE ON CACHE BOOL "Use mathDx package to find cuFFTDx" FORCE) +endif() + +if(NOT TARGET cufftdx) + if(USE_MATHDX_PACKAGE) + message(STATUS "Using mathDx package to find cuFFTDx") + # Find mathDx and cuFFTDx (mathDx's component) + # Default path: "/opt/nvidia/mathdx/22.2", path to mathDx can be passed cmake in mathdx_ROOT variable + find_package(mathdx REQUIRED COMPONENTS cufftdx CONFIG + PATHS + "${PROJECT_SOURCE_DIR}/../.." # example/cufftdx + "${PROJECT_SOURCE_DIR}/../../.." 
# include/cufftdx/example + "/opt/nvidia/mathdx/22.2" + ) + elseif(USE_CUFFTDX_PACKAGE) + message(STATUS "Using cuFFTDx package to find cuFFTDx") + # Find cuFFTDx + # Default path: "/opt/nvidia/mathdx/22.2/include/cufftdx", path to cuFFTDx can be passed cmake in cufftdx_ROOT variable + find_package(cufftdx REQUIRED CONFIG PATHS "/opt/nvidia/mathdx/22.2/include/cufftdx" "${PROJECT_SOURCE_DIR}/../../cufftdx") + else() + message(FATAL_ERROR "No cuFFTDx package found") + endif() +endif() + +if((NOT TARGET cufftdx) AND (NOT CUFFTDX_TEST_RELEASED_PACKAGE) AND (NOT MATHDX_TEST_RELEASED_PACKAGE)) + # Targeted CUDA Architectures, see https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES + if(CUFFTDX_TARGET_ARCHS) + set(CUFFTDX_TARGET_ARCHS 70;75;80 CACHE + STRING "[LEGACY] List of targeted cuFFTDx Example CUDA architectures (compute capabilities), for example \"70;75\". Can't be older than 70." + ) + list(SORT CUFFTDX_TARGET_ARCHS) + # Remove unsupported architectures + list(REMOVE_ITEM CUFFTDX_TARGET_ARCHS 30;32;35;37;50;52;53;60;61;62) + + # Translate legacy option CUFFTDX_TARGET_ARCHS into CUFFTDX_CUDA_ARCHITECTURES + set(CUFFTDX_TARGET_ARCHS_TRANSLATED) + foreach(ARCH ${CUFFTDX_TARGET_ARCHS}) + list(APPEND CUFFTDX_TARGET_ARCHS_TRANSLATED ${ARCH}-real) + endforeach() + set(CUFFTDX_CUDA_ARCHITECTURES ${CUFFTDX_TARGET_ARCHS_TRANSLATED} CACHE + STRING "List of targeted cuFFTDx CUDA architectures, for example \"70-real;75-real;80\"" + ) + else() + set(CUFFTDX_CUDA_ARCHITECTURES 70-real;75-real;80-real CACHE + STRING "List of targeted cuFFTDX CUDA architectures, for example \"70-real;75-real;80\"" + ) + # Remove unsupported architectures + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 30;32;35;37;50;52;53;60;61;62) + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 30-real;32-real;35-real;37-real;50-real;52-real;53-real;60-real;61-real;62-real) + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 
30-virtual;32-virtual;35-virtual;37-virtual;50-virtual;52-virtual;53-virtual;60-virtual;61-virtual;62-virtual) + endif() + message(STATUS "Targeted cuFFTDx Examples CUDA Architectures: ${CUFFTDX_CUDA_ARCHITECTURES}") + + # Global CXX/CUDA flags + if(NOT MSVC) + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFTDX_CUDA_CXX_FLAGS} -Wall -Wextra") + else() + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_CRT_NONSTDC_NO_WARNINGS) + add_definitions(-D_SCL_SECURE_NO_WARNINGS) + add_definitions(-DNOMINMAX) + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFT_CUDA_CXX_FLAGS} /W3") # Warning level + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFT_CUDA_CXX_FLAGS} /WX") # All warnings are errors + endif() + + # Global CXX flags/options + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CUFFTDX_CUDA_CXX_FLAGS}") + + # Global CUDA CXX flags/options + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_EXTENSIONS OFF) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${CUFFTDX_CUDA_CXX_FLAGS}\"") + + # Clang + if(BUILD_CUFFTDX) + if(CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*") + # clang complains about unused function in CUDA system headers + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-unused-function") + endif() + endif() + + # CUDA Architectures + set(CMAKE_CUDA_ARCHITECTURES OFF) + + # Enable testing (ctest) + enable_testing() +endif() + +# ############################################################### +# add_cufftdx_example +# ############################################################### +function(add_cufftdx_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + 
target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + ) + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + target_compile_options(${EXAMPLE_TARGET} + PRIVATE + "$<$:SHELL:-Xfatbin -compress-all>" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# add_cufft_and_cufftdx_example +# ############################################################### +function(add_cufft_and_cufftdx_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + ) + if(CUFFTDX_EXAMPLES_CUFFT_CALLBACK) + if(TARGET cufft) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE cufft_static) + else() + target_link_libraries(${EXAMPLE_TARGET} PRIVATE CUDA::cufft_static) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) + target_compile_definitions(${EXAMPLE_TARGET} PRIVATE CUFFTDX_EXAMPLES_CUFFT_CALLBACK) + else() + if(TARGET cufft) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE cufft) + else() + target_link_libraries(${EXAMPLE_TARGET} PRIVATE CUDA::cufft) + endif() + endif() + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + endif() + 
set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + target_compile_options(${EXAMPLE_TARGET} + PRIVATE + "$<$:SHELL:-Xfatbin -compress-all>" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# add_cufftdx_nvrtc_example +# ############################################################### +function(add_cufftdx_nvrtc_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + CUDA::nvrtc + ) + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + target_compile_definitions(${EXAMPLE_TARGET} + PRIVATE + CUDA_INCLUDE_DIR="${CUDAToolkit_INCLUDE_DIRS}" + CUFFTDX_INCLUDE_DIRS="${cufftdx_INCLUDE_DIRS}" + ) + else() + target_compile_definitions(${EXAMPLE_TARGET} + PRIVATE + CUDA_INCLUDE_DIR="${CUDAToolkit_INCLUDE_DIRS}" + CUFFTDX_INCLUDE_DIRS="${CMAKE_SOURCE_DIR}/libcufftdx/include\\\;${CMAKE_BINARY_DIR}/libcufftdx/include" + ) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# cuFFTDx Examples +# 
############################################################### + +add_custom_target(cufftdx_examples) + +# CUFFTDX_EXAMPLES_CUFFT_CALLBACK +option(CUFFTDX_EXAMPLES_CUFFT_CALLBACK "Build cuFFTDx convolution_performance example with cuFFT callback" OFF) + +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_thread" simple_fft_thread.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_thread_fp16" simple_fft_thread_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block" simple_fft_block.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_half2" simple_fft_block_half2.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_fp16" simple_fft_block_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_r2c" simple_fft_block_r2c.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_r2c_fp16" simple_fft_block_r2c_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_c2r" simple_fft_block_c2r.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_c2r_fp16" simple_fft_block_c2r_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_shared" simple_fft_block_shared.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_std_complex" simple_fft_block_std_complex.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_cub_io" simple_fft_block_cub_io.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.convolution" convolution.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.convolution_r2c_c2r" convolution_r2c_c2r.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.block_fft_performance" block_fft_performance.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.block_fft_performance_many" block_fft_performance_many.cu) +add_cufft_and_cufftdx_example(cufftdx_examples 
"cuFFTDx.example.convolution_performance" convolution_performance.cu) +add_cufftdx_nvrtc_example(cufftdx_examples "cuFFTDx.example.nvrtc_fft_thread" nvrtc_fft_thread.cu) \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/Makefile b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/Makefile new file mode 100644 index 0000000000000..4481f8532f920 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/Makefile @@ -0,0 +1,35 @@ +NVCC=nvcc +NVCC_FLAGS=-std=c++17 -O3 --generate-code arch=compute_70,code=sm_70 --generate-code arch=compute_75,code=sm_75 --generate-code arch=compute_80,code=sm_80 --generate-code arch=compute_86,code=sm_86 + +CUFFTDX_INCLUDE_DIR=../include/ +CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`) +CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include +NVRTC_DEFINES=-DCUDA_INCLUDE_DIR="\"$(CUDA_INCLUDE_DIR)\"" -DCUFFTDX_INCLUDE_DIRS="\"$(CUFFTDX_INCLUDE_DIR)\"" + +SRCS=$(filter-out nvrtc_fft_thread.cu convolution_performance.cu, $(wildcard *.cu)) +TARGETS=$(patsubst %.cu,%,$(SRCS)) + +NVRTC_SRCS=$(wildcard nvrtc_*.cu) +NVRTC_TARGETS=$(patsubst %.cu,%,$(NVRTC_SRCS)) + +CUFFT_SRCS=convolution_performance.cu +CUFFT_TARGETS=convolution_performance + +$(TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) + +$(NVRTC_TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(NVRTC_DEFINES) -lnvrtc -lcuda + +$(CUFFT_TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(NVRTC_DEFINES) -lcuda -lcufft + +.PHONY: all clean + +all: $(TARGETS) $(NVRTC_TARGETS) $(CUFFT_TARGETS) + $(echo $(NVRTC_TARGETS)) + +clean: + rm -f $(TARGETS) $(NVRTC_TARGETS) $(CUFFT_TARGETS) + +.DEFAULT_GOAL := all diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.cu new file mode 100644 index 0000000000000..2236983fce1ff --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.cu @@ -0,0 +1,32 @@ +#include +#include +#include +#include + +#include "block_fft_performance.hpp" + +template +void block_fft_performance() { + using namespace cufftdx; + + using fft_base = decltype(Block() + Type() + Direction() + + Precision() + SM()); + + static constexpr unsigned int elements_per_thread = 8; + static constexpr unsigned int fft_size = 512; + static constexpr unsigned int ffts_per_block = 1; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)) + benchmark_block_fft(stream, true); + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); +} + +template +struct block_fft_performance_functor { + void operator()() { return block_fft_performance(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.hpp new file mode 100644 index 0000000000000..af3d40258d0cf --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance.hpp @@ -0,0 +1,185 @@ +#ifndef CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ +#define CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ + +#include +#include +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "random.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data, + unsigned int repeats, + typename FFT::workspace_type workspace) { + using complex_type = typename FFT::value_type; + extern __shared__ complex_type 
shared_mem[]; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + +// Execute FFT +#pragma unroll 1 + for (unsigned int i = 0; i < repeats; i++) { + FFT().execute(thread_data, shared_mem, workspace); + } + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +template +struct measure { + // Returns execution time in ms + template + static float execution(Kernel&& kernel, cudaStream_t stream) { + cudaEvent_t startEvent, stopEvent; + CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + for (size_t i = 0; i < WarmUpRuns; i++) { + kernel(); + } + + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream)); + kernel(); + CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + float time; + CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent)); + return time; + } +}; + +template +void benchmark_block_fft(const cudaStream_t& stream, bool verbose = false) { + using namespace cufftdx; + + // Create complete FFT description, only now we can query EPT and suggested FFTs per block + using FFT_complete = decltype(FFTBase() + Size()); + + static constexpr unsigned int inside_repeats = 4000; + static constexpr unsigned int kernel_repeats = 1; + static constexpr unsigned int warm_up_runs = 1; + + static constexpr unsigned int fft_size = S; + static constexpr unsigned int elements_per_thread = UseSuggested ? 
FFT_complete::elements_per_thread : EPT; + static constexpr unsigned int ffts_per_block = UseSuggested ? FFT_complete::suggested_ffts_per_block : FPB; + + using FFT = decltype(FFT_complete() + ElementsPerThread() + FFTsPerBlock()); + using complex_type = typename FFT::value_type; + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, FFT::shared_memory_size)); + + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + block_fft_kernel, + FFT::block_dim.x * FFT::block_dim.y * FFT::block_dim.z, + FFT::shared_memory_size)); + + unsigned int multiprocessor_count = example::get_multiprocessor_count(); + unsigned int cuda_blocks = blocks_per_multiprocessor * multiprocessor_count; + + // The memory required to run fft (number of complex_type values that must be allocated). + // For r2c, the input consists of fft_size real numbers and the output consists of (fft_size / 2 + 1) complex numbers. + // One memory block will be used to store input and output, so the memory block must fit + // max((fft_size + 1) / 2, fft_size / 2 + 1) = (fft_size / 2 + 1) complex numbers. + // For c2r, the input consists of (fft_size / 2 + 1) complex numbers and the output consists of fft_size real numbers, + // so the minimal required memory size is the same. + unsigned int input_size = + ffts_per_block * cuda_blocks * (type_of::value == fft_type::c2c ? 
fft_size : (fft_size / 2 + 1)); + + // Host data + std::vector input = + example::get_random_complex_data(input_size, -10, 10); + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + CUDA_CHECK_AND_EXIT(error_code); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + + // Measure performance of N trials + double ms_n = measure<>::execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + block_fft_kernel<<>>( + device_buffer, inside_repeats, workspace); + } + }, + stream); + + // Check kernel error + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Measure performance of 2*N trials + double ms_n2 = measure<>::execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + block_fft_kernel<<>>( + device_buffer, 2 * inside_repeats, workspace); + } + }, + stream); + + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + // Time for N repeats without overhead + auto time_n = ms_n2 - ms_n; + double gflops = 1.0 * kernel_repeats * inside_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_n / 1000000.0; + + static const std::string fft_type_name = type_of::value == fft_type::c2c ? "c2c" : + (type_of::value == fft_type::c2r ? 
"c2r" : + "r2c"); + if (verbose) { + std::cout << "FFT type: " << fft_type_name << std::endl; + std::cout << "FFT size: " << fft_size << std::endl; + std::cout << "FFTs elements per thread: " << FFT::elements_per_thread << std::endl; + std::cout << "FFTs per block: " << ffts_per_block << std::endl; + std::cout << "CUDA blocks: " << cuda_blocks << std::endl; + std::cout << "Blocks per multiprocessor: " << blocks_per_multiprocessor << std::endl; + std::cout << "FFTs run: " << ffts_per_block * cuda_blocks << std::endl; + std::cout << "Shared memory: " << FFT::shared_memory_size << std::endl; + std::cout << "Avg Time [ms_n]: " << time_n / (inside_repeats * kernel_repeats) << std::endl; + std::cout << "Time (all) [ms_n]: " << time_n << std::endl; + std::cout << "Performance [GFLOPS]: " << gflops << std::endl; + } else { + std::cout << fft_type_name << ", " << fft_size << ", " << gflops << ", " + << time_n / (inside_repeats * kernel_repeats) << ", " << std::endl; + } +} + +#endif // CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance_many.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance_many.cu new file mode 100644 index 0000000000000..6eadfcd2acda7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_fft_performance_many.cu @@ -0,0 +1,82 @@ +#include +#include +#include +#include + +#include "block_fft_performance.hpp" + +template +void block_fft_performance(const cudaStream_t& stream, bool verbose) { + using namespace cufftdx; + + using FFT_base = decltype(Block() + Type() + Precision() + SM()); + + using FFT_with_direction = typename std:: + conditional()), FFT_base>::type; + + benchmark_block_fft(stream, verbose); + + if (verbose) + std::cout << std::endl; +} + +template +struct block_fft_performance_functor { + void operator()() { + using 
namespace cufftdx; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)) + + bool default_verbose = false; + + + // To specify EPT and FPB values, set UseSuggested to false. + // FFTDirection is used if and only if FFTType is C2C. + // Below is an example of a test run with specified EPT and FPB values. + + block_fft_performance(stream, + default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); + } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_io.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_io.hpp new file mode 100644 index 0000000000000..9c82a24985564 --- /dev/null +++
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/block_io.hpp @@ -0,0 +1,394 @@ + +#ifndef CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ +#define CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ + +#include "fp16_common.hpp" + +namespace example { + namespace __io { + template + inline __device__ cufftdx::complex<__half2> convert_to_rrii(const cufftdx::complex<__half2>& value) { + return to_rrii(value); + } + template<> + inline __device__ cufftdx::complex<__half2> convert_to_rrii(const cufftdx::complex<__half2>& value) { + return value; + } + template + inline __device__ cufftdx::complex<__half2> convert_to_riri(const cufftdx::complex<__half2>& value) { + return to_riri(value); + } + template<> + inline __device__ cufftdx::complex<__half2> convert_to_riri(const cufftdx::complex<__half2>& value) { + return value; + } + } // namespace __io + + template + struct io { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static inline __device__ unsigned int stride_size() { + return FFT::stride; + } + + static inline __device__ unsigned int batch_offset(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? 
blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return cufftdx::size_of::value * global_fft_id; + } + + template + static inline __device__ void copy(const DataType* source, DataType* target, unsigned int n) { + unsigned int stride = blockDim.x * blockDim.y; + unsigned int index = threadIdx.y * blockDim.x + threadIdx.x; + for (int step = 0; step < FFT::elements_per_thread; step++) { + if (index < n) { + target[index] = source[index]; + } + index += stride; + } + } + + template + static inline __device__ void load_to_smem(const DataType* global, unsigned char* shared) { + if (cufftdx::type_of::value == cufftdx::fft_type::c2c) { + unsigned int input_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::c2r) { + unsigned int input_length = blockDim.y * ((cufftdx::size_of::value / 2) + 1); + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::r2c) { + unsigned int input_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } + __syncthreads(); + } + + template + static inline __device__ void store_from_smem(const unsigned char* shared, DataType* global) { + __syncthreads(); + if (cufftdx::type_of::value == cufftdx::fft_type::c2c) { + unsigned int output_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(shared), + reinterpret_cast(global), + output_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::c2r) { + unsigned int output_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(shared), + reinterpret_cast(global), + output_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::r2c) { + unsigned int output_length = blockDim.y * ((cufftdx::size_of::value / 2) + 1); + copy(reinterpret_cast(shared), + 
reinterpret_cast(global), + output_length); + } + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_c2c((ComplexType*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_c2r((ComplexType*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_r2c((scalar_type*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_c2c(thread_data, (ComplexType*)output, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_c2r(thread_data, (scalar_type*)output, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_r2c(thread_data, (ComplexType*)output, local_fft_id); + } + + // input - global input with all FFTs + // thread_data - local thread array to load values from input to + // local_fft_id - ID of FFT batch in CUDA block + template + static inline __device__ void load_c2c(const ComplexType* input, + 
ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = input[index]; + index += stride; + } + } + + // If InputInRRIILayout is false, then function assumes that values in input are in RIRI + // layout, and before loading them to thread_data they are converted to RRII layout. + // Otherwise, if InputInRRIILayout is true, then function assumes values in input are in RRII + // layout, and don't need to be converted before loading to thread_data. + template + static inline __device__ void load(const cufftdx::complex<__half2>* input, + cufftdx::complex<__half2>* thread_data, + unsigned int local_fft_id) { + static_assert(std::is_same>::value, + "This can be only used with half precision FFTs"); + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = __io::convert_to_rrii(input[index]); + index += stride; + } + } + + template + static inline __device__ void store_c2c(const ComplexType* thread_data, + ComplexType* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = thread_data[i]; + index += stride; + } + } + + // Function assumes that values in thread_data are in RRII layout. 
+ // If OutputInRRIILayout is false, values are saved into output in RIRI layout; otherwise - in RRII. + template + static inline __device__ void store(const cufftdx::complex<__half2>* thread_data, + cufftdx::complex<__half2>* output, + unsigned int local_fft_id) { + static_assert(std::is_same>::value, + "This can be only used with half precision FFTs"); + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = __io::convert_to_riri(thread_data[i]); + index += stride; + } + } + + static inline __device__ unsigned int batch_offset_r2c(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return ((cufftdx::size_of::value / 2) + 1) * global_fft_id; + } + + template + static inline __device__ void load_r2c(const scalar_type* input, + ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + reinterpret_cast(thread_data)[i] = input[index]; + index += stride; + } + } + + template + static inline __device__ void store_r2c(const ComplexType* thread_data, + ComplexType* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset_r2c(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + output[index] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; 
+ constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element + constexpr unsigned int values_left_to_store = + threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if (threadIdx.x < values_left_to_store) { + output[index] = thread_data[FFT::elements_per_thread / 2]; + } + } + + // Function assumes that values in thread_data are in RRII layout. + // If OutputInRRIILayout is false, values are saved into output in RIRI layout; otherwise - in RRII. + template + static inline __device__ void store_r2c(const cufftdx::complex<__half2>* thread_data, + cufftdx::complex<__half2>* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset_r2c(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + output[index] = __io::convert_to_riri(thread_data[i]); + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element + constexpr unsigned int values_left_to_store = + threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if (threadIdx.x < values_left_to_store) { + output[index] = __io::convert_to_riri(thread_data[FFT::elements_per_thread / 2]); + } + } + + static inline __device__ unsigned int batch_offset_c2r(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? 
blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return ((cufftdx::size_of::value / 2) + 1) * global_fft_id; + } + + template + static inline __device__ void load_c2r(const ComplexType* input, + ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset_c2r(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + thread_data[i] = input[index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = + threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if (threadIdx.x < values_left_to_load) { + thread_data[FFT::elements_per_thread / 2] = input[index]; + } + } + + // If InputInRRIILayout is false, then function assumes that values in input are in RIRI + // layout, and before loading them to thread_data they are converted to RRII layout. + // Otherwise, if InputInRRIILayout is true, then function assumes values in input are in RRII + // layout, and don't need to be converted before loading to thread_data. 
+ template + static inline __device__ void load_c2r(const cufftdx::complex<__half2>* input, + cufftdx::complex<__half2>* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset_c2r(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + thread_data[i] = __io::convert_to_rrii(input[index]); + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = + threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if (threadIdx.x < values_left_to_load) { + thread_data[FFT::elements_per_thread / 2] = __io::convert_to_rrii(input[index]); + } + } + + template + static inline __device__ void store_c2r(const ComplexType* thread_data, + scalar_type* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = reinterpret_cast(thread_data)[i]; + index += stride; + } + } + }; + + template + struct io_fp16 { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static_assert(std::is_same::value, "This IO class is only for half precision FFTs"); + + static inline __device__ unsigned int stride_size() { + return cufftdx::size_of::value / FFT::elements_per_thread; + } + + static inline __device__ unsigned int batch_offset(unsigned int local_fft_id) { + unsigned int 
global_fft_id = + FFT::ffts_per_block == 1 ? blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return cufftdx::size_of::value * global_fft_id; + } + + static inline __device__ void load(const __half2* input, complex_type* thread_data, unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + const unsigned int batch_stride = FFT::ffts_per_block * cufftdx::size_of::value * blockDim.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = to_rrii(input[index], input[index + batch_stride]); + index += stride; + } + } + + static inline __device__ void store(const complex_type* thread_data, + __half2* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + const unsigned int batch_stride = FFT::ffts_per_block * cufftdx::size_of::value * blockDim.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = to_ri1(thread_data[i]); + output[index + batch_stride] = to_ri2(thread_data[i]); + index += stride; + } + } + }; +} // namespace example + +#endif // CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/common.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/common.hpp new file mode 100644 index 0000000000000..f6e96ea221c17 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/common.hpp @@ -0,0 +1,76 @@ +#ifndef CUFFTDX_EXAMPLE_COMMON_HPP_ +#define CUFFTDX_EXAMPLE_COMMON_HPP_ + +#include + +#ifndef CUDA_CHECK_AND_EXIT +# define CUDA_CHECK_AND_EXIT(error) \ + { \ + 
auto status = static_cast(error); \ + if (status != cudaSuccess) { \ + std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif // CUDA_CHECK_AND_EXIT + +#ifndef CUFFT_CHECK_AND_EXIT +# define CUFFT_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != CUFFT_SUCCESS) { \ + std::cout << status << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif // CUFFT_CHECK_AND_EXIT + +namespace example { + inline unsigned int get_cuda_device_arch() { + int device; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + + int major = 0; + int minor = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + + return static_cast(major) * 100 + static_cast(minor) * 10; + } + + inline unsigned int get_multiprocessor_count(int device) { + int multiprocessor_count = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&multiprocessor_count, cudaDevAttrMultiProcessorCount, device)); + return multiprocessor_count; + } + + inline unsigned int get_multiprocessor_count() { + int device = 0; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + return get_multiprocessor_count(device); + } + + template class Functor> + inline int sm_runner() { + // Get CUDA device compute capability + const auto cuda_device_arch = get_cuda_device_arch(); + + switch (cuda_device_arch) { + // All SM supported by cuFFTDx + case 700: Functor<700>()(); return 0; + case 720: Functor<720>()(); return 0; + case 750: Functor<750>()(); return 0; + case 800: Functor<800>()(); return 0; + case 860: Functor<860>()(); return 0; + default: { + if (cuda_device_arch > 800) { + Functor<800>()(); + return 0; + } + } + } + return 1; + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_COMMON_HPP_ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution.cu new file mode 100644 index 0000000000000..099836bafa11a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution.cu @@ -0,0 +1,101 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void convolution_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + IFFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// This example demonstrates how to use cuFFTDx t operform a convolution using one-dimensional FFTs. +// +// One block is run, it calculates two 128-point convolutions by first doing forward FFT, then +// applying pointwise operation, and ending with inverse FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. 
+template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int ffts_per_block = 2; + static constexpr unsigned int fft_size = 128; + // FFT_base defined common options for FFT and IFFT. FFT_base is not a complete FFT description. + // In order to complete FFT description directions are specified: forward for FFT, inverse for IFFT. + using FFT_base = decltype(Block() + Size() + Type() + Precision() + + ElementsPerThread<8>() + FFTsPerBlock() + SM()); + using FFT = decltype(FFT_base() + Direction()); + using IFFT = decltype(FFT_base() + Direction()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = ffts_per_block * fft_size; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes convolution kernel with FFT::block_dim threads in CUDA block + convolution_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct convolution_functor { + void operator()() { return convolution(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_performance.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_performance.cu new file mode 100644 index 0000000000000..aeb0e71403700 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_performance.cu @@ -0,0 +1,384 @@ +#include +#include +#include +#include + +#include +#include +#include +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +#include +#endif + +#include "block_io.hpp" +#include "common.hpp" +#include "random.hpp" + +// Returns execution time in ms +template +float measure_execution(Kernel&& kernel, cudaStream_t stream) { + cudaEvent_t startEvent, stopEvent; + CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + for (size_t i = 0; i < WarmUpRuns; i++) { + kernel(); + } + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream)); + kernel(); + CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + float time; + CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent)); + return time; +} + + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void convolution_kernel(typename FFT::value_type* data, + typename FFT::workspace_type workspace, + typename IFFT::workspace_type workspace_inverse) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + + // Load data from global memory to 
registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem, workspace); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + IFFT().execute(thread_data, shared_mem, workspace_inverse); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// Scaling kernel; transforms data between cuFFTs. +template +__global__ void scaling_kernel(cufftComplex* data, + const unsigned int input_size, + const unsigned int ept) { + + static constexpr float scale = 1.0 / fft_size; + + cufftComplex temp; + unsigned int index = blockDim.x * blockIdx.x + threadIdx.x; + + for (int i = 0; i < ept; i++) { + if (index < input_size) { + temp = data[index]; + temp.x *= scale; + temp.y *= scale; + data[index] = temp; + index += blockDim.x * gridDim.x; + } + } +} + +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +template +__device__ cufftComplex scaling_callback(void *dataIn, + size_t offset, + void *callerInfo, + void *sharedPtr) { + static constexpr float scale = 1.0 / fft_size; + + cufftComplex value = static_cast(dataIn)[offset]; + value.x *= scale; + value.y *= scale; + return value; +} + +__device__ __managed__ cufftCallbackLoadC scaling_callback_ptr = scaling_callback<128>; +#endif + +template +double measure_cufftdx(const unsigned int& kernel_repeats, + const unsigned int& cuda_blocks, + typename FFT::value_type* device_buffer, + cudaStream_t stream) { + + using namespace cufftdx; + using complex_type = typename FFT::value_type; + + // create workspaces for FFT and IFFT + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + CUDA_CHECK_AND_EXIT(error_code); + auto workspace_inverse = make_workspace(error_code); + CUDA_CHECK_AND_EXIT(error_code); + + // run 
cuFFTDx + double time = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + // There are (ffts_per_block * fft_size * cuda_blocks) elements + convolution_kernel<<>>( + device_buffer, workspace, workspace_inverse); + } + }, + stream); + + return time; +} + +template +double measure_cufft(const unsigned int& kernel_repeats, + const unsigned int& batch_size, + cufftComplex* device_buffer, + cudaStream_t stream) { + + static constexpr unsigned int block_dim_scaling_kernel = 1024; + + // Calculating parameters for scaling_kernel execution. + // Get maximum number of running CUDA blocks per multiprocessor. + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + scaling_kernel, + block_dim_scaling_kernel, + 0)); + + // Get maximum number of CUDA blocks running on all multiprocessors. + // This many CUDA blocks will be run for simple_kernel. + const unsigned int cuda_blocks = blocks_per_multiprocessor * example::get_multiprocessor_count(); + + const unsigned int input_length = fft_size * batch_size; + const unsigned int elements_per_block = (input_length + cuda_blocks - 1) / cuda_blocks; + const unsigned int elements_per_thread = (elements_per_block + block_dim_scaling_kernel - 1) / block_dim_scaling_kernel; + + // prepare cuFFT runs + cufftHandle plan; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan, stream)); + + // run convolution + double time_cufft = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + + if (cufftExecC2C(plan, device_buffer, device_buffer, CUFFT_FORWARD) != CUFFT_SUCCESS) { + fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); + return; + } + + scaling_kernel + <<>>(device_buffer, input_length, elements_per_thread); + + if (cufftExecC2C(plan, device_buffer, device_buffer, CUFFT_INVERSE) != CUFFT_SUCCESS) { + fprintf(stderr, "CUFFT 
error: ExecC2C Inverse failed"); + return; + } + } + }, + stream); + + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan)); + return time_cufft; +} + +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +template +double measure_cufft_callback(const unsigned int& kernel_repeats, + const unsigned int& batch_size, + cufftComplex* device_buffer, + cudaStream_t stream) { + + static constexpr unsigned int block_dim_scaling_kernel = 1024; + + // Calculating parameters for scaling_kernel execution. + // Get maximum number of running CUDA blocks per multiprocessor. + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + scaling_kernel, + block_dim_scaling_kernel, + 0)); + // prepare cuFFT runs + cufftHandle plan_in; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan_in, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan_in, stream)); + cufftHandle plan_out; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan_out, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan_out, stream)); + + // Set input callback + CUFFT_CHECK_AND_EXIT(cufftXtSetCallback(plan_in, + reinterpret_cast(&scaling_callback_ptr), + CUFFT_CB_LD_COMPLEX, + nullptr)); + + // run convolution + double time_cufft = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + CUFFT_CHECK_AND_EXIT(cufftExecC2C(plan_in, device_buffer, device_buffer, CUFFT_FORWARD)); + CUFFT_CHECK_AND_EXIT(cufftExecC2C(plan_out, device_buffer, device_buffer, CUFFT_INVERSE)); + } + }, + stream); + + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan_in)); + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan_out)); + return time_cufft; +} +#endif // CUFFTDX_EXAMPLES_CUFFT_CALLBACK + +// This example compares performance of cuFFT and cuFFTDx when performing C2C convolution. +// Data is generated on host, copied to device buffer and processed by FFTs. +// Each cuFFTDx execution runs one kernel, each cuFFT execution - three kernels. 
+// The experiment runs with the following principles: +// - at least 1GB of data is allocated in GPU and transformed by both convolutions, +// - for cuFFTDx kernel run, number of CUDA blocks is divisible +// by maximum number of CUDA blocks that can run simultaneously on the GPU. +template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int minimum_input_size_bytes = (1 << 30); // At least one GB of data will be processed by FFTs. + static constexpr unsigned int fft_size = 512; + static constexpr unsigned int kernel_repeats = 10; + static constexpr unsigned int warm_up_runs = 1; + static constexpr bool verbose = true; + + static constexpr bool use_suggested = true; // Whether to use suggested FPB and EPT values or custom. + static constexpr unsigned int custom_ffts_per_block = 2; + static constexpr unsigned int custom_elements_per_thread = 8; + + // To determine the total input length (number of fft batches to run), the maximum number of + // simultanously running cuFFTDx CUDA blocks is calculated. + + // Declaration of cuFFTDx run + using fft_incomplete = decltype(Block() + Size() + Type() + Precision() + SM()); + using fft_base = decltype(fft_incomplete() + Direction()); + using ifft_base = decltype(fft_incomplete() + Direction()); + + static constexpr unsigned int elements_per_thread = use_suggested ? fft_base::elements_per_thread : custom_elements_per_thread; + static constexpr unsigned int ffts_per_block = use_suggested ? 
fft_base::suggested_ffts_per_block : custom_ffts_per_block; + + using fft = decltype(fft_base() + ElementsPerThread() + FFTsPerBlock()); + using ifft = decltype(ifft_base() + ElementsPerThread() + FFTsPerBlock()); + using complex_type = typename fft::value_type; + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, fft::shared_memory_size)); + + // Get maximum number of running CUDA blocks per multiprocessor + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + convolution_kernel, + fft::block_dim.x * fft::block_dim.y * fft::block_dim.z, + fft::shared_memory_size)); + + // Get maximum number of CUDA blocks running on all multiprocessors + const unsigned int device_blocks = blocks_per_multiprocessor * example::get_multiprocessor_count(); + + // Input size in bytes if device_blocks CUDA blocks were run. + const unsigned int data_size_device_blocks_bytes = device_blocks * ffts_per_block * fft_size * sizeof(complex_type); + + // cuda_blocks = minimal number of CUDA blocks to run, such that: + // - cuda_blocks is divisible by device_blocks, + // - total input size is not less than minimum_input_size_bytes. 
+ // executed_blocks_multiplyer = cuda_blocks / device_blocks + const unsigned int executed_blocks_multiplyer = + (minimum_input_size_bytes + data_size_device_blocks_bytes - 1) / data_size_device_blocks_bytes; + const unsigned int cuda_blocks = device_blocks * executed_blocks_multiplyer; + const unsigned int input_length = ffts_per_block * cuda_blocks * fft_size; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + // Host data + std::vector input = + example::get_random_complex_data(input_length, -10, 10); + + // Device data + complex_type* device_buffer; + auto input_size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, input_size_bytes)); + + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), input_size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Measure performance + double time_cufftdx = measure_cufftdx(kernel_repeats, cuda_blocks, device_buffer, stream); + double time_cufft = measure_cufft( + kernel_repeats, cuda_blocks * ffts_per_block, (cufftComplex*)device_buffer, stream); + #ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK + double time_cufft_cb = measure_cufft_callback( + kernel_repeats, cuda_blocks * ffts_per_block, (cufftComplex*)device_buffer, stream); + #endif + + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + // Report results. 
+ auto report_time_and_performance = [&](std::string name, double time) -> void { + double gflops = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time / 1000000.0; + + std::cout << std::endl; + std::cout << name << std::endl; + std::cout << "Avg Time [ms_n]: " << time / kernel_repeats << std::endl; + std::cout << "Time (all) [ms_n]: " << time << std::endl; + std::cout << "Performance [GFLOPS]: " << gflops << std::endl; + }; + + if (verbose) { + std::cout << "FFT size: " << fft_size << std::endl; + std::cout << "FFTs run: " << ffts_per_block * cuda_blocks << std::endl; + report_time_and_performance("cuFFTDx", time_cufftdx); + std::cout << "FFTs elements per thread: " << fft::elements_per_thread << std::endl; + std::cout << "FFTs per block: " << ffts_per_block << std::endl; + std::cout << "CUDA blocks: " << cuda_blocks << std::endl; + std::cout << "Blocks per multiprocessor: " << blocks_per_multiprocessor << std::endl; + + report_time_and_performance("cuFFT", time_cufft); + #ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK + report_time_and_performance("cuFFT Callback", time_cufft_cb); + #endif + } else { + double gflops_cufftdx = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_cufftdx / 1000000.0; + double gflops_cufft = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_cufft / 1000000.0; + std::cout << fft_size << ": " << std::endl + << gflops_cufftdx << ", " << time_cufftdx / kernel_repeats << ", " << std::endl + << gflops_cufft << ", " << time_cufft / kernel_repeats; + } +} + +template +struct convolution_functor { + void operator()() { + return convolution(); + } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_r2c_c2r.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_r2c_c2r.cu new file mode 100644 index 0000000000000..3e0052b954975 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/convolution_r2c_c2r.cu @@ -0,0 +1,102 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFTR2C::max_threads_per_block) __global__ void convolution_kernel(cufftdx::precision_of_t* data) { + using complex_type = typename FFTR2C::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFTR2C::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFTR2C::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFTR2C().execute(thread_data, shared_mem); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFTR2C::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + FFTC2R().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, data, local_fft_id); +} + +// This example demonstrates how to use cuFFTDx t operform a convolution using one-dimensional FFTs. +// +// One block is run, it calculates two 128-point convolutions by first doing forward FFT, then +// applying pointwise operation, and ending with inverse FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int ffts_per_block = 2; + static constexpr unsigned int fft_size = 128; + // FFT_base defined common options for FFT and IFFT. 
FFT_base is not a complete FFT description. + // In order to complete FFT description directions are specified: forward for FFT, inverse for IFFT. + using FFT_base = decltype(Block() + Size() + Precision() + + ElementsPerThread<8>() + FFTsPerBlock() + SM()); + using FFTR2C = decltype(FFT_base() + Type()); + using FFTC2R = decltype(FFT_base() + Type()); + using real_type = precision_of_t; + + // Allocate managed memory for input/output + real_type* data; + auto size = ffts_per_block * fft_size; + auto size_bytes = size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = float(i); + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i] << std::endl; + } + + const auto shared_memory_size = std::max(FFTR2C::shared_memory_size, FFTC2R::shared_memory_size); + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size)); + + // Invokes convolution kernel with FFT::block_dim threads in CUDA block + convolution_kernel<<<1, FFTR2C::block_dim, shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i] << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct convolution_functor { + void operator()() { return convolution(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/fp16_common.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/fp16_common.hpp new file mode 100644 index 0000000000000..afa6eb28d2b20 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/fp16_common.hpp @@ -0,0 +1,71 @@ + +#ifndef CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ +#define CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ + +namespace example { + // Changes layout of complex<__half2> value from ((Real, Imag), (Real, Imag)) layout to + // ((Real, Real), (Imag, Imag)) layout. + __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_rrii( + cufftdx::complex<__half2> riri) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> rrii(__lows2half2(riri.x, riri.y), + __highs2half2(riri.x, riri.y)); +#else + cufftdx::complex<__half2> rrii(__half2 {riri.x.x, riri.y.x}, + __half2 {riri.x.y, riri.y.y}); +#endif + return rrii; + } + + // Converts to __half complex values to complex<__half2> in ((Real, Real), (Imag, Imag)) layout. + __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_rrii( + __half2 ri1, + __half2 ri2) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> rrii(__lows2half2(ri1, ri2), + __highs2half2(ri1, ri2)); +#else + cufftdx::complex<__half2> rrii(__half2 {ri1.x, ri2.x}, + __half2 {ri1.y, ri2.y}); +#endif + return rrii; + } + + // Changes layout of complex<__half2> value from ((Real, Real), (Imag, Imag)) layout to + // ((Real, Imag), (Real, Imag)) layout. + __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_riri( + cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> riri(__lows2half2(rrii.x, rrii.y), + __highs2half2(rrii.x, rrii.y)); +#else + cufftdx::complex<__half2> riri(__half2 {rrii.x.x, rrii.y.x}, + __half2 {rrii.x.y, rrii.y.y}); +#endif + return riri; + } + + // Return the first half complex number (as __half2) from complex<__half2> value with + // ((Real, Real), (Imag, Imag)) layout. + // Example: for rrii equal to ((1,2), (3,4)), it return __half2 (1, 3). 
+ __device__ __host__ __forceinline__ __half2 to_ri1(cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + return __lows2half2(rrii.x, rrii.y); +#else + return __half2 {rrii.x.x, rrii.y.x}; +#endif + } + + // Return the second half complex number (as __half2) from complex<__half2> value with + // ((Real, Real), (Imag, Imag)) layout. + // Example: for rrii equal to ((1,2), (3,4)), it return __half2 (2, 4). + __device__ __host__ __forceinline__ __half2 to_ri2(cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + return __highs2half2(rrii.x, rrii.y); +#else + return __half2 {rrii.x.y, rrii.y.y}; +#endif + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/nvrtc_fft_thread.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/nvrtc_fft_thread.cu new file mode 100644 index 0000000000000..cda77c8f201b8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/nvrtc_fft_thread.cu @@ -0,0 +1,198 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "common.hpp" + +#define NVRTC_SAFE_CALL(x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " << nvrtcGetErrorString(result) << '\n'; \ + exit(1); \ + } \ + } while (0) + +const char* thread_fft_kernel = R"kernel( +#include + +using namespace cufftdx; + +// FFT +using size_desc = Size; +using dir_desc = Direction; +using type_c2c = Type; +using FFT = decltype(size_desc() + dir_desc() + type_c2c() + Thread() + Precision()); + +extern "C" __global__ void thread_fft_kernel(typename FFT::value_type *data) +{ + // Local array for thread + typename FFT::value_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. 
+ // thread_data should have all input data in order. + unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = data[index + i]; + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + data[index + i] = thread_data[i]; + } +} +)kernel"; + +int main(int, char**) { + // Define FFT + using namespace cufftdx; + + static constexpr unsigned int fft_size = 16; + + // FFT Operators + using size_desc = Size; + using dir_desc = Direction; + using type_c2c = Type; + using FFT = decltype(size_desc() + dir_desc() + type_c2c() + Thread() + Precision()); + using value_type = typename FFT::value_type; + + std::string fft_size_definition = "-DFFT_SIZE=" + std::to_string(fft_size); + // Parse cuFFTDx include dirs + std::vector cufftdx_include_dirs_array; + { + std::string cufftdx_include_dirs = CUFFTDX_INCLUDE_DIRS; + std::string delim = ";"; + auto start = 0U; + auto end = cufftdx_include_dirs.find(delim); + while (end != std::string::npos) { + cufftdx_include_dirs_array.push_back("--include-path=" + cufftdx_include_dirs.substr(start, end - start)); + start = end + delim.length(); + end = cufftdx_include_dirs.find(delim, start); + } + cufftdx_include_dirs_array.push_back("--include-path=" + cufftdx_include_dirs.substr(start, end - start)); + } + + // Get architecture of current device + int device; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + int major = 0; + int minor = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + std::string gpu_architecture_option = + "--gpu-architecture=compute_" + std::to_string(major * 10 + minor); + + // Create a program + nvrtcProgram program; + NVRTC_SAFE_CALL(nvrtcCreateProgram(&program, // program + thread_fft_kernel, // buffer + 
"thread_fft_kernel.cu", // name + 0, // numHeaders + NULL, // headers + NULL)); // includeNames + + // Prepare compilation options + std::vector opts = { + "--std=c++17", + "--device-as-default-execution-space", + "--include-path=" CUDA_INCLUDE_DIR // Add path to CUDA include directory + }; + // Include cuFFTDx dir in opts + for (auto& d : cufftdx_include_dirs_array) { + opts.push_back(d.c_str()); + } + // Add FFT_SIZE definition + opts.push_back(fft_size_definition.c_str()); + // Add gpu-architecture flag + opts.push_back(gpu_architecture_option.c_str()); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // program + opts.size(), // numOptions + opts.data()); // options + + // Obtain compilation log from the program + if (compileResult != NVRTC_SUCCESS) { + size_t log_size; + NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(program, &log_size)); + char* log = new char[log_size]; + NVRTC_SAFE_CALL(nvrtcGetProgramLog(program, log)); + std::cout << log << '\n'; + delete[] log; + std::exit(1); + } + + // Obtain PTX from the program. + size_t ptx_size; + NVRTC_SAFE_CALL(nvrtcGetPTXSize(program, &ptx_size)); + char* ptx = new char[ptx_size]; + NVRTC_SAFE_CALL(nvrtcGetPTX(program, ptx)); + + // Destroy the program. 
+ NVRTC_SAFE_CALL(nvrtcDestroyProgram(&program)); + + // Load the generated PTX and get a handle to the thread_fft_kernel + CUcontext context; + CUmodule module; + CUfunction kernel; + CUDA_CHECK_AND_EXIT(cudaFree(0)); // Initialize CUDA context + CUDA_CHECK_AND_EXIT(cuCtxGetCurrent(&context)); // Get current context + CUDA_CHECK_AND_EXIT(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); + CUDA_CHECK_AND_EXIT(cuModuleGetFunction(&kernel, module, "thread_fft_kernel")); + + // Generate input for execution + std::vector host_input(cufftdx::size_of::value); + float i = 0.0f; + for (auto& v : host_input) { + v.x = i++; + v.y = 0; + } + + size_t fft_buffer_size = cufftdx::size_of::value * sizeof(value_type); + void* device_values; + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_values, fft_buffer_size)); + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_values, host_input.data(), fft_buffer_size, cudaMemcpyHostToDevice)); + + // Execute thread_fft_kernel + void* args[] = {&device_values}; + CUDA_CHECK_AND_EXIT(cuLaunchKernel(kernel, + 1, // number of blocks + 1, + 1, + 1, // number of threads + 1, + 1, + 0, // no shared memory + NULL, // NULL stream + args, + 0)); + CUDA_CHECK_AND_EXIT(cuCtxSynchronize()); + + // Retrieve and print output. + std::vector host_output(cufftdx::size_of::value); + CUDA_CHECK_AND_EXIT(cudaMemcpy(host_output.data(), device_values, fft_buffer_size, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < cufftdx::size_of::value; ++i) { + std::cout << i << ": (" << host_output[i].x << ", " << host_output[i].y << ")" << std::endl; + } + + // Release resources. 
+ CUDA_CHECK_AND_EXIT(cudaFree(device_values)); + CUDA_CHECK_AND_EXIT(cuModuleUnload(module)); + + double expected_value = (fft_size * (fft_size + 1)) / 2; + if ((host_output[0].x - expected_value) > 0.01) { + std::cout << "Failed" << std::endl; + return 1; + } + std::cout << "Success" << std::endl; + return 0; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/random.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/random.hpp new file mode 100644 index 0000000000000..c05f75c960697 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/random.hpp @@ -0,0 +1,51 @@ + +#ifndef CUFFTDX_EXAMPLE_RANDOM_HPP_ +#define CUFFTDX_EXAMPLE_RANDOM_HPP_ + +#include +#include +#include +#include + +#include +#include + +namespace example { + template + inline auto get_random_complex_data(size_t size, T min, T max) -> + typename std::enable_if::value, + std::vector>>::type { + using complex_type = cufftdx::make_complex_type_t; + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector output(size); + std::generate(output.begin(), output.end(), [&]() { + return complex_type {distribution(gen), distribution(gen)}; + }); + return output; + } + + template + inline auto get_random_complex_data(size_t size, T min, T max) -> + typename std::enable_if::value, + std::vector>>::type { + using complex_type = cufftdx::make_complex_type_t<__half2>; + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector output(size); + std::generate(output.begin(), output.end(), [&]() { + auto xx = __float2half(distribution(gen)); + auto xy = __float2half(distribution(gen)); + auto yx = __float2half(distribution(gen)); + auto yy = __float2half(distribution(gen)); + auto x = __half2 {xx, xy}; + auto y = __half2 {yx, yy}; + 
return complex_type {x, y}; + }); + return output; + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_RANDOM_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block.cu new file mode 100644 index 0000000000000..facaf08e09a24 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block.cu @@ -0,0 +1,86 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r.cu new file mode 100644 index 0000000000000..7d91ae8c20a9e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r.cu @@ -0,0 +1,94 @@ +#include +#include + +#include +#include + 
+#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_c2r(typename FFT::value_type* input_data, cufftdx::precision_of_t* output_data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_c2r(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional complex-to-real transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2R float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// Notice different sizes of input and output buffer, and C2R load and store operations in the kernel. +template +void simple_block_fft_c2r() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + using real_type = typename complex_type::value_type; + + // Allocate managed memory for input/output + complex_type* input_data; + auto input_size = FFT::ffts_per_block * (cufftdx::size_of::value / 2 + 1); + auto input_size_bytes = input_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = complex_type {float(i), -float(i)}; + } + real_type* output_data; + auto output_size = FFT::ffts_per_block * cufftdx::size_of::value; + auto output_size_bytes = output_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << input_data[i].x << " " << input_data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_c2r, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_c2r<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << output_data[i] << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_c2r_functor { + void operator()() { return simple_block_fft_c2r(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r_fp16.cu new file mode 100644 index 0000000000000..3d4de92a31277 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_c2r_fp16.cu @@ -0,0 +1,105 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_c2r_fp16(ComplexType* input_data, ScalarType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_c2r(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional complex-to-real transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2R half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft_c2r_fp16() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; // complex<__half2> + using real_type = typename complex_type::value_type; // __half2 + + // Allocate managed memory for input/output + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + complex_type* input_data; + auto input_size = FFT::ffts_per_block / implicit_batching * (cufftdx::size_of::value / 2 + 1); + auto input_size_bytes = input_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + float v1 = i; + float v2 = i + input_size; + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + input_data[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + real_type* output_data; + auto output_size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto output_size_bytes = output_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << __half2float(input_data[i].x.x) << " " << __half2float(input_data[i].x.y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_c2r_fp16, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_c2r_fp16<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(output_data[i].x) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_c2r_fp16_functor { + void operator()() { return simple_block_fft_c2r_fp16(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_cub_io.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_cub_io.cu new file mode 100644 index 0000000000000..0ff802239b3ee --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_cub_io.cu @@ -0,0 +1,114 @@ +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +#if CUB_VERSION < 101300 +int main(int, char**) { + std::cout << "Example disabled, BLOCK_LOAD_STRIPED/BLOCK_STORE_STRIPED is only supported since CUB 1.13 (CUDA 11.5)" << std::endl; + return 0; +} +#else + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // FFT::stride == FFT::block_dim.x in most cases + using BlockLoad = cub::BlockLoad ; + using BlockStore = cub::BlockStore; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // ID of FFT in CUDA grid + unsigned int global_fft_id = + (FFT::ffts_per_block == 1) ? local_fft_id : ( blockIdx.x * FFT::ffts_per_block + local_fft_id); + + // Load data from global memory to registers + auto fft_data = data + (global_fft_id * cufftdx::size_of::value); + BlockLoad().Load(fft_data, thread_data, cufftdx::size_of::value, complex_type { 0.0, 0.0 }); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + BlockStore().Store(fft_data, thread_data, cufftdx::size_of::value); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. CUB +// library is used for IO in kernel. 
+// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. + // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + auto sum = data[0].x; + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + if(std::abs(sum - ((cufftdx::size_of::value-1) * cufftdx::size_of::value / 2)) > 0.1) { + std::cout << "Failed" << std::endl; + return; + } + std::cout << "Success" << std::endl; +} + +template +struct 
simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} +#endif // CUB_VERSION < 101300 diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_fp16.cu new file mode 100644 index 0000000000000..6f8e031242cb8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_fp16.cu @@ -0,0 +1,102 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "fp16_common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Here, we're using complex with ((Real, Imag), (Real, Imag)) layout as the type of the input/output +// data passed to kernel, and later on the device layout is changed into RRII when values are being loaded. 
+template +void simple_block_fft_complex_half2() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. + // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + auto size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + float v1 = i; + float v2 = i + size; + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + data[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x.x) << " " << __half2float(data[i].x.y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x.x) << " " << __half2float(data[i].x.y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_complex_half2_functor { + void operator()() { return simple_block_fft_complex_half2(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_half2.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_half2.cu new file mode 100644 index 0000000000000..f01b0d2ac6365 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_half2.cu @@ -0,0 +1,90 @@ 
+#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "fp16_common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(__half2* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io_fp16::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io_fp16::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Here, we're using __half2 as the type of the input/output data passed to kernel, and later on +// the device we use special example::io_fp16 struct template to load values from two batches +// into an array of complex with ((Real, Real), (Imag, Imag)) layout. +template +void simple_block_fft_half2() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + + // Allocate managed memory for input/output + __half2* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(__half2); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = __half2 {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x) << " " << __half2float(data[i].y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x) << " " << __half2float(data[i].y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_half2_functor { + void operator()() { return simple_block_fft_half2(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c.cu new file mode 100644 index 0000000000000..bfa795f47c451 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c.cu @@ -0,0 +1,93 @@ +#include +#include + +#include 
+#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_r2c(ScalarType* input_data, ComplexType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_r2c(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional real-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point R2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// Notice different sizes of input and output buffer, and R2C load and store operations in the kernel. +template +void simple_block_fft_r2c() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + using real_type = typename complex_type::value_type; + + // Allocate managed memory for input/output + real_type* input_data; + auto input_size = FFT::ffts_per_block * cufftdx::size_of::value; + auto input_size_bytes = input_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = float(i); + } + complex_type* output_data; + auto output_size = FFT::ffts_per_block * (cufftdx::size_of::value / 2 + 1); + auto output_size_bytes = output_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << input_data[i] << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_r2c, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_r2c<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << output_data[i].x << " " << output_data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_r2c_functor { + void operator()() { return simple_block_fft_r2c(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c_fp16.cu new file mode 100644 index 0000000000000..17545c63d41a1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_r2c_fp16.cu @@ -0,0 +1,101 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_r2c_fp16(ScalarType* input_data, ComplexType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_r2c(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional real-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point R2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft_r2c_fp16() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<16>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; // complex<__half2> + using real_type = typename complex_type::value_type; // __half2 + + // Allocate managed memory for input/output + real_type* input_data; + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + auto input_size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto input_size_bytes = input_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = __half2 {float(i), float(i + input_size)}; + } + complex_type* output_data; + auto output_size = FFT::ffts_per_block / implicit_batching * (cufftdx::size_of::value / 2 + 1); + auto output_size_bytes = output_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(input_data[i].x) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_r2c_fp16, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_r2c_fp16<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << __half2float(output_data[i].x.x) << " " << __half2float(output_data[i].x.y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_r2c_fp16_functor { + void operator()() { return simple_block_fft_r2c_fp16(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_shared.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_shared.cu new file mode 100644 index 0000000000000..00dda003cbb07 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_shared.cu @@ -0,0 +1,85 @@ +#include +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + extern __shared__ unsigned char shared_mem[]; + + auto this_block_data = data + cufftdx::size_of::value * FFT::ffts_per_block * blockIdx.x; + + example::io::load_to_smem(this_block_data, shared_mem); + + FFT().execute(reinterpret_cast(shared_mem)); + + example::io::store_from_smem(shared_mem, this_block_data); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + // Shared memory must fit input data and must be big enough to run FFT + auto shared_memory_size = std::max((unsigned int)FFT::shared_memory_size, (unsigned int)size_bytes); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_std_complex.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_std_complex.cu new file mode 100644 index 0000000000000..2b69581eec3cc --- 
/dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_block_std_complex.cu @@ -0,0 +1,100 @@ +#include +#include + +// Check if used version of libcu++ supports cuda::std::complex +#include +#if _LIBCUDACXX_CUDA_API_VERSION < 001004000 +int main(int, char**) { + std::cout << "Example disabled, cuda::std::complex is only supported since libcu++ 1.4.0 (CUDA 11.3)" << std::endl; + return 0; +} +#else + +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(ComplexType* data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + // Use cuda::std::complex instead of FFT::value_type + using complex_type = cuda::std::complex::type>; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].real() << " " << data[i].imag() << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].real() << " " << data[i].imag() << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} + +#endif // (_LIBCUDACXX_CUDA_API_VERSION < 001004000) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread.cu new file mode 100644 index 0000000000000..8ad7f5558b31d --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread.cu @@ -0,0 +1,81 @@ +#include +#include + +#include +#include + +#include "common.hpp" + +template +__global__ void thread_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. + // thread_data should have all input data in order. + unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = data[index + i]; + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + data[index + i] = thread_data[i]; + } +} + +// In this example a one-dimensional complex-to-complex transform is perform by a CUDA thread. +// +// Four (threads_count) threads are run, and each thread calculates 8-point (fft_size) C2C double precision FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +int main(int, char**) { + using namespace cufftdx; + + // Number of threads to execute + static constexpr unsigned int threads_count = 4; + + // FFT is defined, its: size, type, direction, precision. Thread() operator informs that FFT will be executed on thread level. 
+ using FFT = decltype(Thread() + Size<8>() + Type() + Direction() + + Precision()); + using complex_type = typename FFT::value_type; + + // Host data + std::vector input(cufftdx::size_of::value * threads_count); + for (size_t i = 0; i < input.size(); i++) { + input[i] = complex_type {double(i), -double(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << input[i].x << " " << input[i].y << std::endl; + } + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + + // Invokes kernel with 'threads_count' threads in block, each thread calculates one FFT of size + thread_fft_kernel<<<1, threads_count>>>(device_buffer); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Copy device to host + std::vector output(input.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(output.data(), device_buffer, size_bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << output[i].x << " " << output[i].y << std::endl; + } + + std::cout << "Success" << std::endl; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread_fp16.cu new file mode 100644 index 0000000000000..bee19442fcbfa --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/example/cufftdx/simple_fft_thread_fp16.cu @@ -0,0 +1,93 @@ +#include +#include + +#include +#include + +#include "common.hpp" +#include "fp16_common.hpp" + +template +__global__ 
void thread_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. + // thread_data should have all input data in order. + unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + // complex values are processed with assumtion that they are in RRII layout, + // but data has them in RIRI layout. example::to_rrii converts RIRI to RRII. + thread_data[i] = example::to_rrii(data[index + i]); + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + // converting back form RRII to RIRI layout + data[index + i] = example::to_riri(thread_data[i]); + } +} + +// In this example a one-dimensional half-precision complex-to-complex transform is perform by each CUDA thread. +// +// Three (threads_count) threads are run, and each thread calculates two 8-point (fft_size) C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Note: In half precision cuFFTDx uses complex type and processes values in implicit batches of two FFTs, ie. +// each thread processes two FFTs. +int main(int, char**) { + using namespace cufftdx; + + // Number of threads to execute + // In case of half precision each thread caluclates two FFTs + static constexpr unsigned int threads_count = 3; + + // FFT is defined, its: size, type, direction, precision. + // Thread() operator informs that FFT will be executed on a thread level. 
+ using FFT = decltype(Thread() + Size<8>() + Type() + Direction() + + Precision<__half>()); + using complex_type = typename FFT::value_type; + + // Host data + std::vector input(cufftdx::size_of::value); + for (size_t i = 0; i < input.size(); i++) { + float v1 = i; + float v2 = i + input.size(); + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + input[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(input[i].x.x) << " " << __half2float(input[i].x.y) << std::endl; + } + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + + // Invokes kernel with 'threads_count' threads in block, each thread calculates two FFTs of size 8 + thread_fft_kernel<<<1, threads_count>>>(device_buffer); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Copy device to host + std::vector output(input.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(output.data(), device_buffer, size_bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(output[i].x.x) << " " << __half2float(output[i].x.y) << std::endl; + } + + std::cout << "Success" << std::endl; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx.hpp new file mode 100644 index 0000000000000..6b7a4ab246161 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx.hpp @@ 
-0,0 +1,11 @@ +// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. +#ifndef MATHDX_CUFFTDX_HPP_ +#define MATHDX_CUFFTDX_HPP_ +#include "cufftdx/include/cufftdx.hpp" +#endif // MATHDX_CUFFTDX_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/basic.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/basic.css new file mode 100644 index 0000000000000..bf18350b65c61 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/basic.css @@ -0,0 +1,906 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li 
p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 450px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a.brackets:before, +span.brackets > a:before{ + content: "["; +} + +a.brackets:after, +span.brackets > a:after { + content: "]"; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, 
+h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + 
+div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +table.footnote td, table.footnote th { + border: 0 !important; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles 
----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + 
margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +dl.footnote > dt, +dl.citation > dt { + float: left; + margin-right: 0.5em; +} + +dl.footnote > dd, +dl.citation > dd { + margin-bottom: 0em; +} + +dl.footnote > dd:after, +dl.citation > dd:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dt:after { + content: ":"; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ 
+ +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display 
---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/badge_only.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/badge_only.css new file mode 100644 index 0000000000000..e380325bc6e27 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/badge_only.css @@ -0,0 +1 @@ +.fa:before{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}@font-face{font-family:FontAwesome;font-style:normal;font-weight:400;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#FontAwesome) format("svg")}.fa:before{font-family:FontAwesome;font-style:normal;font-weight:400;line-height:1}.fa:before,a .fa{text-decoration:inherit}.fa:before,a .fa,li .fa{display:inline-block}li 
.fa-large:before{width:1.875em}ul.fas{list-style-type:none;margin-left:2em;text-indent:-.8em}ul.fas li .fa{width:.8em}ul.fas li .fa-large:before{vertical-align:baseline}.fa-book:before,.icon-book:before{content:"\f02d"}.fa-caret-down:before,.icon-caret-down:before{content:"\f0d7"}.fa-caret-up:before,.icon-caret-up:before{content:"\f0d8"}.fa-caret-left:before,.icon-caret-left:before{content:"\f0d9"}.fa-caret-right:before,.icon-caret-right:before{content:"\f0da"}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60}.rst-versions .rst-current-version:after{clear:both;content:"";display:block}.rst-versions .rst-current-version .fa{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge 
.icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 0000000000000..6cb60000181db Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 0000000000000..7059e23142aae Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 0000000000000..f815f63f99da8 Binary files /dev/null and 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 0000000000000..f2c76e5bda18a Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.eot b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.eot new file mode 100644 index 0000000000000..e9f60ca953f93 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.eot differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.svg b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.svg new file mode 100644 index 0000000000000..855c845e538b6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. All rights reserved. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.ttf b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 0000000000000..35acda2fa1196 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.ttf differ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff new file mode 100644 index 0000000000000..400014a4b06ee Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 0000000000000..4d13fc60404b9 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/fontawesome-webfont.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff new file mode 100644 index 0000000000000..88ad05b9ff413 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 0000000000000..c4e3d804b57b6 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold-italic.woff2 differ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff new file mode 100644 index 0000000000000..c6dff51f063cc Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff2 new file mode 100644 index 0000000000000..bb195043cfc07 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-bold.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff new file mode 100644 index 0000000000000..76114bc033622 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 0000000000000..3404f37e2e312 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal-italic.woff2 differ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff new file mode 100644 index 0000000000000..ae1307ff5f4c4 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff2 b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff2 new file mode 100644 index 0000000000000..3bf9843328a63 Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/fonts/lato-normal.woff2 differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/theme.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/theme.css new file mode 100644 index 0000000000000..0d9ae7e1a45b8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/css/theme.css @@ -0,0 +1,4 @@ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px 
dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media 
print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,.wy-nav-top 
a,.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 
FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content 
h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a 
button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content 
.wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso 
.admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content .wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso 
.wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container 
li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = 
false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type=search]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 
.3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group 
.wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input 
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs li{display:inline-block}.wy-breadcrumbs li.wy-breadcrumbs-aside{float:right}.wy-breadcrumbs li a{display:inline-block;padding:5px}.wy-breadcrumbs li a:first-child{padding-left:0}.rst-content .wy-breadcrumbs li tt,.wy-breadcrumbs li .rst-content tt,.wy-breadcrumbs li code{padding:5px;border:none;background:none}.rst-content .wy-breadcrumbs li tt.literal,.wy-breadcrumbs li .rst-content tt.literal,.wy-breadcrumbs li code.literal{color:#404040}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media 
print{.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul 
li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 
.headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content 
.citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.field-list>dt:after,html.writer-html5 .rst-content dl.footnote>dt:after{content:":"}html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.footnote>dt>span.brackets{margin-right:.5rem}html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref{font-style:italic}html.writer-html5 
.rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.footnote>dd p,html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{font-size:inherit;line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt 
em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.field-list)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) 
dl:not(.field-list)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dl:not(.field-list)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content 
dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 .rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/cufftdx_override.css b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/cufftdx_override.css new file mode 100644 index 0000000000000..8355a4ea4aa2e --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/cufftdx_override.css @@ -0,0 +1,3 @@ +.wy-nav-content { +max-width: 1240px !important; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/doctools.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/doctools.js new file mode 100644 index 0000000000000..e509e48349c55 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/doctools.js @@ -0,0 +1,326 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for all documentation. + * + * :copyright: Copyright 2007-2022 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + +/** + * make the code below compatible with browsers without + * an installed firebug like debugger +if (!window.console || !console.firebug) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", + "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", + "profile", "profileEnd"]; + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +} + */ + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. 
+ */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. + */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var 
result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} + +/** + * Small JavaScript module for the documentation. + */ +var Documentation = { + + init : function() { + this.fixFirefoxAnchorBug(); + this.highlightSearchWords(); + this.initIndexTable(); + if (DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) { + this.initOnKeyListeners(); + } + }, + + /** + * i18n support + */ + TRANSLATIONS : {}, + PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, + LOCALE : 'unknown', + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext : function(string) { + var translated = Documentation.TRANSLATIONS[string]; + if (typeof translated === 'undefined') + return string; + return (typeof translated === 'string') ? translated : translated[0]; + }, + + ngettext : function(singular, plural, n) { + var translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated === 'undefined') + return (n == 1) ? 
singular : plural; + return translated[Documentation.PLURALEXPR(n)]; + }, + + addTranslations : function(catalog) { + for (var key in catalog.messages) + this.TRANSLATIONS[key] = catalog.messages[key]; + this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); + this.LOCALE = catalog.locale; + }, + + /** + * add context elements like header anchor links + */ + addContextElements : function() { + $('div[id] > :header:first').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this headline')). + appendTo(this); + }); + $('dt[id]').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this definition')). + appendTo(this); + }); + }, + + /** + * workaround a firefox stupidity + * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + */ + fixFirefoxAnchorBug : function() { + if (document.location.hash && $.browser.mozilla) + window.setTimeout(function() { + document.location.href += ''; + }, 10); + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords : function() { + var params = $.getQueryParameters(); + var terms = (params.highlight) ? 
params.highlight[0].split(/\s+/) : []; + if (terms.length) { + var body = $('div.body'); + if (!body.length) { + body = $('body'); + } + window.setTimeout(function() { + $.each(terms, function() { + body.highlightText(this.toLowerCase(), 'highlighted'); + }); + }, 10); + $('') + .appendTo($('#searchbox')); + } + }, + + /** + * init the domain index toggle buttons + */ + initIndexTable : function() { + var togglers = $('img.toggler').click(function() { + var src = $(this).attr('src'); + var idnum = $(this).attr('id').substr(7); + $('tr.cg-' + idnum).toggle(); + if (src.substr(-9) === 'minus.png') + $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); + else + $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); + }).css('display', ''); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { + togglers.click(); + } + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords : function() { + $('#searchbox .highlight-link').fadeOut(300); + $('span.highlighted').removeClass('highlighted'); + var url = new URL(window.location); + url.searchParams.delete('highlight'); + window.history.replaceState({}, '', url); + }, + + /** + * make the url absolute + */ + makeURL : function(relativeURL) { + return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; + }, + + /** + * get the current relative url + */ + getCurrentURL : function() { + var path = document.location.pathname; + var parts = path.split(/\//); + $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { + if (this === '..') + parts.pop(); + }); + var url = parts.join('/'); + return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + }, + + initOnKeyListeners: function() { + $(document).keydown(function(event) { + var activeElementType = document.activeElement.tagName; + // don't navigate when in search box, textarea, dropdown or button + if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT' + && activeElementType 
!== 'BUTTON' && !event.altKey && !event.ctrlKey && !event.metaKey + && !event.shiftKey) { + switch (event.keyCode) { + case 37: // left + var prevHref = $('link[rel="prev"]').prop('href'); + if (prevHref) { + window.location.href = prevHref; + return false; + } + break; + case 39: // right + var nextHref = $('link[rel="next"]').prop('href'); + if (nextHref) { + window.location.href = nextHref; + return false; + } + break; + } + } + }); + } +}; + +// quick alias for translations +_ = Documentation.gettext; + +$(document).ready(function() { + Documentation.init(); +}); diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/documentation_options.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/documentation_options.js new file mode 100644 index 0000000000000..9af54603b9c00 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/documentation_options.js @@ -0,0 +1,12 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '1.0.0', + LANGUAGE: 'None', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: false, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false +}; \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/file.png b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/file.png new file mode 100644 index 0000000000000..a858a410e4faa Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/file.png differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/jquery-3.5.1.js 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/jquery-3.5.1.js new file mode 100644 index 0000000000000..50937333b99a5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/_static/jquery-3.5.1.js @@ -0,0 +1,10872 @@ +/*! + * jQuery JavaScript Library v3.5.1 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2020-05-04T22:49Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? 
function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML elements + // (i.e., `typeof document.createElement( "object" ) === "function"`). + // We don't want to classify *any* DOM node as a function. + return typeof obj === "function" && typeof obj.nodeType !== "number"; + }; + + +var isWindow = function isWindow( obj ) { + return obj != null && obj === obj.window; + }; + + +var document = window.document; + + + + var preservedScriptAttributes = { + type: true, + src: true, + nonce: true, + noModule: true + }; + + function DOMEval( code, node, doc ) { + doc = doc || document; + + var i, val, + script = doc.createElement( "script" ); + + script.text = code; + if ( node ) { + for ( i in preservedScriptAttributes ) { + + // Support: Firefox 64+, Edge 18+ + // Some browsers don't support the "nonce" property on scripts. + // On the other hand, just using `getAttribute` is not enough as + // the `nonce` attribute is reset to an empty string whenever it + // becomes browsing-context connected. + // See https://github.com/whatwg/html/issues/2369 + // See https://html.spec.whatwg.org/#nonce-attributes + // The `node.getAttribute` check was added for the sake of + // `jQuery.globalEval` so that it can fake a nonce-containing node + // via an object. 
+ val = node[ i ] || node.getAttribute && node.getAttribute( i ); + if ( val ) { + script.setAttribute( i, val ); + } + } + } + doc.head.appendChild( script ).parentNode.removeChild( script ); + } + + +function toType( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? + class2type[ toString.call( obj ) ] || "object" : + typeof obj; +} +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.5.1", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + } + + // Return just the one element from the set + return num < 0 ? 
this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. + each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + even: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return ( i + 1 ) % 2; + } ) ); + }, + + odd: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return i % 2; + } ) ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. 
+ push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + copy = options[ name ]; + + // Prevent Object.prototype pollution + // Prevent never-ending loop + if ( name === "__proto__" || target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + src = target[ name ]; + + // Ensure proper type for the source value + if ( copyIsArray && !Array.isArray( src ) ) { + clone = []; + } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { + clone = {}; + } else { + clone = src; + } + copyIsArray = false; + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new 
Error( msg ); + }, + + noop: function() {}, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + // Evaluates a script in a provided context; falls back to the global one + // if not specified. + globalEval: function( code, options, doc ) { + DOMEval( code, { nonce: options && options.nonce }, doc ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? 
-1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return flat( ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. 
+ support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), +function( _i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); +} ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = toType( obj ); + + if ( isFunction( obj ) || isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! + * Sizzle CSS Selector Engine v2.3.5 + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://js.foundation/ + * + * Date: 2020-03-14 + */ +( function( window ) { +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + nonnativeSelectorCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ( {} ).hasOwnProperty, + arr = [], + pop = arr.pop, + pushNative = arr.push, + push = arr.push, + slice = arr.slice, + + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + 
indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[ i ] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + + "ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram + identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + + "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + + // "Attribute values must be CSS identifiers [capture 5] + // or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + + whitespace + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + + // 3. 
anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + + "*" ), + rdescend = new RegExp( whitespace + "|>" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + + whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + + whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rhtml = /HTML$/i, + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), + funescape = function( escape, nonHex ) { + var high = "0x" + escape.slice( 1 ) - 0x10000; + + return nonHex ? 
+ + // Strip the backslash prefix from a non-hex escape sequence + nonHex : + + // Replace a hexadecimal escape sequence with the encoded Unicode code point + // Support: IE <=11+ + // For values outside the Basic Multilingual Plane (BMP), manually construct a + // surrogate pair + high < 0 ? + String.fromCharCode( high + 0x10000 ) : + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + inDisabledFieldset = addCombinator( + function( elem ) { + return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + ( arr = slice.call( preferredDoc.childNodes ) ), + preferredDoc.childNodes + ); + + // Support: Android<4.0 + // Detect silently failing push.apply + // eslint-disable-next-line no-unused-expressions + arr[ preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? 
+ + // Leverage slice if possible + function( target, els ) { + pushNative.apply( target, slice.call( els ) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + + // Can't trust NodeList.length + while ( ( target[ j++ ] = els[ i++ ] ) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + setDocument( context ); + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { + + // ID selector + if ( ( m = match[ 1 ] ) ) { + + // Document context + if ( nodeType === 9 ) { + if ( ( elem = context.getElementById( m ) ) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && ( elem = newContext.getElementById( m ) ) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[ 2 
] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !nonnativeSelectorCache[ selector + " " ] && + ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && + + // Support: IE 8 only + // Exclude object elements + ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { + + newSelector = selector; + newContext = context; + + // qSA considers elements outside a scoping root when evaluating child or + // descendant combinators, which is not what we want. + // In such cases, we work around the behavior by prefixing every selector in the + // list with an ID selector referencing the scope context. + // The technique has to be used as well when a leading combinator is used + // as such selectors are not recognized by querySelectorAll. + // Thanks to Andrew Dupont for this technique. + if ( nodeType === 1 && + ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + + // We can use :scope instead of the ID hack if the browser + // supports it & if we're not changing the context. + if ( newContext !== context || !support.scope ) { + + // Capture the context ID, setting it first if necessary + if ( ( nid = context.getAttribute( "id" ) ) ) { + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", ( nid = expando ) ); + } + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + + toSelector( groups[ i ] ); + } + newSelector = groups.join( "," ); + } + + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + nonnativeSelectorCache( selector, true ); + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return ( cache[ key + " " ] = value ); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement( "fieldset" ); + + try { + return !!fn( el ); + } catch ( e ) { + return false; + } finally { + + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); + } + + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split( "|" ), + i = 
arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[ i ] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( ( cur = cur.nextSibling ) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return ( name === "input" || name === "button" ) && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // 
https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + inDisabledFieldset( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction( function( argument ) { + argument = +argument; + return markFunction( function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ ( j = matchIndexes[ i ] ) ] ) { + seed[ j ] = !( matches[ j ] = seed[ j ] ); + } + } + } ); + } ); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + var namespace = elem.namespaceURI, + docElem = ( elem.ownerDocument || elem ).documentElement; + + // Support: IE <=8 + // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes + // https://bugs.jquery.com/ticket/4833 + return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? 
node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9 - 11+, Edge 12 - 18+ + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( preferredDoc != document && + ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, + // Safari 4 - 5 only, Opera <=11.6 - 12.x only + // IE/Edge & older browsers don't support the :scope pseudo-class. + // Support: Safari 6.0 only + // Safari 6.0 supports :scope but it's an alias of :root there. 
+ support.scope = assert( function( el ) { + docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); + return typeof el.querySelectorAll !== "undefined" && + !el.querySelectorAll( ":scope fieldset div" ).length; + } ); + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert( function( el ) { + el.className = "i"; + return !el.getAttribute( "className" ); + } ); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert( function( el ) { + el.appendChild( document.createComment( "" ) ); + return !el.getElementsByTagName( "*" ).length; + } ); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert( function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + } ); + + // ID filter and find + if ( support.getById ) { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute( "id" ) === attrId; + }; + }; + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? 
[ elem ] : []; + } + }; + } else { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode( "id" ); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( ( elem = elems[ i++ ] ) ) { + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find[ "TAG" ] = support.getElementsByTagName ? 
+ function( tag, context ) { + if ( typeof context.getElementsByTagName !== "undefined" ) { + return context.getElementsByTagName( tag ); + + // DocumentFragment nodes don't have gEBTN + } else if ( support.qsa ) { + return context.querySelectorAll( tag ); + } + } : + + function( tag, context ) { + var elem, + tmp = [], + i = 0, + + // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too + results = context.getElementsByTagName( tag ); + + // Filter out possible comments + if ( tag === "*" ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem.nodeType === 1 ) { + tmp.push( elem ); + } + } + + return tmp; + } + return results; + }; + + // Class + Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { + if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { + return context.getElementsByClassName( className ); + } + }; + + /* QSA/matchesSelector + ---------------------------------------------------------------------- */ + + // QSA and matchesSelector support + + // matchesSelector(:active) reports false when true (IE9/Opera 11.5) + rbuggyMatches = []; + + // qSa(:focus) reports false when true (Chrome 21) + // We allow this because of a bug in IE8/9 that throws an error + // whenever `document.activeElement` is accessed on an iframe + // So, we allow :focus to pass through QSA all the time to avoid the IE error + // See https://bugs.jquery.com/ticket/13378 + rbuggyQSA = []; + + if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { + + // Build QSA regex + // Regex strategy adopted from Diego Perini + assert( function( el ) { + + var input; + + // Select is set to empty string on purpose + // This is to test IE's treatment of not explicitly + // setting a boolean content attribute, + // since its presence should be enough + // https://bugs.jquery.com/ticket/12359 + docElem.appendChild( el ).innerHTML = "" + + ""; + + // Support: IE8, Opera 11-12.16 + // Nothing should 
be selected when empty strings follow ^= or $= or *= + // The test attribute must be unknown in Opera but "safe" for WinRT + // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section + if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { + rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); + } + + // Support: IE8 + // Boolean attributes and "value" are not treated correctly + if ( !el.querySelectorAll( "[selected]" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); + } + + // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ + if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { + rbuggyQSA.push( "~=" ); + } + + // Support: IE 11+, Edge 15 - 18+ + // IE 11/Edge don't find elements on a `[name='']` query in some cases. + // Adding a temporary attribute to the document before the selection works + // around the issue. + // Interestingly, IE 10 & older don't seem to have the issue. + input = document.createElement( "input" ); + input.setAttribute( "name", "" ); + el.appendChild( input ); + if ( !el.querySelectorAll( "[name='']" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + + whitespace + "*(?:''|\"\")" ); + } + + // Webkit/Opera - :checked should return selected option elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + // IE8 throws error here and will not see later tests + if ( !el.querySelectorAll( ":checked" ).length ) { + rbuggyQSA.push( ":checked" ); + } + + // Support: Safari 8+, iOS 8+ + // https://bugs.webkit.org/show_bug.cgi?id=136851 + // In-page `selector#id sibling-combinator selector` fails + if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { + rbuggyQSA.push( ".#.+[+~]" ); + } + + // Support: Firefox <=3.6 - 5 only + // Old Firefox doesn't throw on a badly-escaped identifier. 
+ el.querySelectorAll( "\\\f" ); + rbuggyQSA.push( "[\\r\\n\\f]" ); + } ); + + assert( function( el ) { + el.innerHTML = "" + + ""; + + // Support: Windows 8 Native Apps + // The type and name attributes are restricted during .innerHTML assignment + var input = document.createElement( "input" ); + input.setAttribute( "type", "hidden" ); + el.appendChild( input ).setAttribute( "name", "D" ); + + // Support: IE8 + // Enforce case-sensitivity of name attribute + if ( el.querySelectorAll( "[name=d]" ).length ) { + rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); + } + + // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) + // IE8 throws error here and will not see later tests + if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: IE9-11+ + // IE's :disabled selector does not pick up the children of disabled fieldsets + docElem.appendChild( el ).disabled = true; + if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: Opera 10 - 11 only + // Opera 10-11 does not throw on post-comma invalid pseudos + el.querySelectorAll( "*,:x" ); + rbuggyQSA.push( ",.*:" ); + } ); + } + + if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || + docElem.webkitMatchesSelector || + docElem.mozMatchesSelector || + docElem.oMatchesSelector || + docElem.msMatchesSelector ) ) ) ) { + + assert( function( el ) { + + // Check to see if it's possible to do matchesSelector + // on a disconnected node (IE 9) + support.disconnectedMatch = matches.call( el, "*" ); + + // This should fail with an exception + // Gecko does not error, returns false instead + matches.call( el, "[s!='']:x" ); + rbuggyMatches.push( "!=", pseudos ); + } ); + } + + rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); + rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); + + /* Contains 
+ ---------------------------------------------------------------------- */ + hasCompare = rnative.test( docElem.compareDocumentPosition ); + + // Element contains another + // Purposefully self-exclusive + // As in, an element does not contain itself + contains = hasCompare || rnative.test( docElem.contains ) ? + function( a, b ) { + var adown = a.nodeType === 9 ? a.documentElement : a, + bup = b && b.parentNode; + return a === bup || !!( bup && bup.nodeType === 1 && ( + adown.contains ? + adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + ) ); + } : + function( a, b ) { + if ( b ) { + while ( ( b = b.parentNode ) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? + function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { + + // Choose the first element that is related to our preferred document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( a == document || a.ownerDocument == preferredDoc && + contains( preferredDoc, a ) ) { + return -1; + } + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( b == document || b.ownerDocument == preferredDoc && + contains( preferredDoc, b ) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + return a == document ? -1 : + b == document ? 1 : + /* eslint-enable eqeqeq */ + aup ? -1 : + bup ? 1 : + sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( ( cur = cur.parentNode ) ) { + ap.unshift( cur ); + } + cur = b; + while ( ( cur = cur.parentNode ) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[ i ] === bp[ i ] ) { + i++; + } + + return i ? 
+ + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[ i ], bp[ i ] ) : + + // Otherwise nodes in our document sort first + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + ap[ i ] == preferredDoc ? -1 : + bp[ i ] == preferredDoc ? 1 : + /* eslint-enable eqeqeq */ + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + setDocument( elem ); + + if ( support.matchesSelector && documentIsHTML && + !nonnativeSelectorCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch ( e ) { + nonnativeSelectorCache( expr, true ); + } + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( context.ownerDocument || context ) != document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( ( elem.ownerDocument || elem ) != document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + ( val = elem.getAttributeNode( name ) ) && val.specified ? + val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return ( sel + "" ).replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + + // If no nodeType, this is expected to be an array + while ( ( node = elem[ i++ ] ) ) { + + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + + // Use textContent 
for elements + // innerText usage removed for consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[ 1 ] = match[ 1 ].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[ 3 ] = ( match[ 3 ] || match[ 4 ] || + match[ 5 ] || "" ).replace( runescape, funescape ); + + if ( match[ 2 ] === "~=" ) { + match[ 3 ] = " " + match[ 3 ] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[ 1 ] = match[ 1 ].toLowerCase(); + + if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { + + // nth-* requires argument + if ( !match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[ 4 ] = +( match[ 4 ] ? 
+ match[ 5 ] + ( match[ 6 ] || 1 ) : + 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); + match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); + + // other types prohibit arguments + } else if ( match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[ 6 ] && match[ 2 ]; + + if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[ 3 ] ) { + match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + + // Get excess from tokenize (recursively) + ( excess = tokenize( unquoted, true ) ) && + + // advance to the next closing parenthesis + ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { + + // excess is a negative index + match[ 0 ] = match[ 0 ].slice( 0, excess ); + match[ 2 ] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? 
+ function() { + return true; + } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + ( pattern = new RegExp( "(^|" + whitespace + + ")" + className + "(" + whitespace + "|$)" ) ) && classCache( + className, function( elem ) { + return pattern.test( + typeof elem.className === "string" && elem.className || + typeof elem.getAttribute !== "undefined" && + elem.getAttribute( "class" ) || + "" + ); + } ); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + /* eslint-disable max-len */ + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + /* eslint-enable max-len */ + + }; + }, + + "CHILD": function( type, what, _argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, _context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? 
"nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( ( node = node[ dir ] ) ) { + if ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( ( node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + + // Use previously-cached element index if available + if ( useCache ) { + + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 
] === dirruns && cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + + // Use the same loop as above to seek `elem` from the start + while ( ( node = ++nodeIndex && node && node[ dir ] || + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || + ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction( function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[ i ] ); + seed[ idx ] = !( matches[ idx ] = matched[ i ] ); + } + } ) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + + // Potentially complex pseudos + "not": markFunction( function( selector ) { + + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction( function( seed, matches, _context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( ( elem = unmatched[ i ] ) ) { + seed[ i ] = !( matches[ i ] = elem ); + } + } + } ) : + function( elem, _context, xml ) { + input[ 0 ] = elem; + matcher( input, null, xml, results ); + + // Don't keep the element (issue #299) + input[ 0 ] = null; + return !results.pop(); + }; + } ), + + "has": markFunction( function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + } ), + + "contains": markFunction( function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; + }; + } ), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." 
+ // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + + // lang value must be a valid identifier + if ( !ridentifier.test( lang || "" ) ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( ( elemLang = documentIsHTML ? + elem.lang : + elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); + return false; + }; + } ), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && + ( !document.hasFocus || document.hasFocus() ) && + !!( elem.type || elem.href || ~elem.tabIndex ); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return ( nodeName === "input" && !!elem.checked ) || + ( nodeName === "option" && !!elem.selected ); + }, + + "selected": function( elem ) { + + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + // eslint-disable-next-line no-unused-expressions + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but 
not by others (comment: 8; processing instruction: 7; etc.) + // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos[ "empty" ]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( ( attr = elem.getAttribute( "type" ) ) == null || + attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo( function() { + return [ 0 ]; + } ), + + "last": createPositionalPseudo( function( _matchIndexes, length ) { + return [ length - 1 ]; + } ), + + "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + } ), + + "even": createPositionalPseudo( function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "odd": createPositionalPseudo( function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? + argument + length : + argument > length ? 
+ length : + argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ) + } +}; + +Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 
0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || ( match = rcomma.exec( soFar ) ) ) { + if ( match ) { + + // Don't consume trailing commas as valid + soFar = soFar.slice( match[ 0 ].length ) || soFar; + } + groups.push( ( tokens = [] ) ); + } + + matched = false; + + // Combinators + if ( ( match = rcombinators.exec( soFar ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + + // Cast descendant combinators to space + type: match[ 0 ].replace( rtrim, " " ) + } ); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || + ( match = preFilters[ type ]( match ) ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + type: type, + matches: match + } ); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? + Sizzle.error( selector ) : + + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[ i ].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? 
+ + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || ( elem[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || + ( outerCache[ elem.uniqueID ] = {} ); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( ( oldCache = uniqueCache[ key ] ) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return ( newCache[ 2 ] = oldCache[ 2 ] ); + } else { + + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? 
+ function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[ i ]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[ 0 ]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[ i ], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( ( elem = unmatched[ i ] ) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction( function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( + selector || "*", + context.nodeType ? [ context ] : context, + [] + ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( ( elem = temp[ i ] ) ) { + matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) ) { + + // Restore matcherIn since elem is not yet a final match + temp.push( ( matcherIn[ i ] = elem ) ); + } + } + postFinder( null, ( matcherOut = [] ), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) && + ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { + + seed[ temp ] = !( results[ temp ] = elem ); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + } ); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[ 0 ].type ], + implicitRelative = leadingRelative || Expr.relative[ " " ], + i = leadingRelative ? 
1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + ( checkContext = context ).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { + matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; + } else { + matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[ j ].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens + .slice( 0, i - 1 ) + .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), + + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), + len = elems.length; + + if ( outermost ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + outermostContext = context == document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ // eslint-disable-next-line eqeqeq + if ( !context && elem.ownerDocument != document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( ( matcher = elementMatchers[ j++ ] ) ) { + if ( matcher( elem, context || document, xml ) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + + // They will have gone through all possible matchers + if ( ( elem = !matcher && elem ) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. 
+ if ( bySet && i !== matchedCount ) { + j = 0; + while ( ( matcher = setMatchers[ j++ ] ) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !( unmatched[ i ] || setMatched[ i ] ) ) { + setMatched[ i ] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? + markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[ i ] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( + selector, + matcherFromGroupMatchers( elementMatchers, setMatchers ) + ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built 
with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( ( selector = compiled.selector || selector ) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[ 0 ] = match[ 0 ].slice( 0 ); + if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { + + context = ( Expr.find[ "ID" ]( token.matches[ 0 ] + .replace( runescape, funescape ), context ) || [] )[ 0 ]; + if ( !context ) { + return results; + + // Precompiled matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; + while ( i-- ) { + token = tokens[ i ]; + + // Abort if we hit a combinator + if ( Expr.relative[ ( type = token.type ) ] ) { + break; + } + if ( ( find = Expr.find[ type ] ) ) { + + // Search, expanding context for leading sibling combinators + if ( ( seed = find( + token.matches[ 0 ].replace( runescape, funescape ), + rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || + context + ) ) ) { + + // If seed is empty or no tokens remain, we can return early + tokens.splice( i, 1 ); + selector = seed.length && toSelector( tokens ); + if ( !selector ) { + push.apply( results, seed ); + return results; + } + + break; + } + } + } + } + + // Compile and execute a filtering function if one is not provided + // Provide `match` to avoid retokenization if we modified the selector above + ( compiled || compile( selector, match ) )( + seed, + context, + !documentIsHTML, + results, + !context || rsibling.test( selector ) && testContext( context.parentNode ) || context + ); + return results; +}; + +// One-time assignments + +// Sort stability +support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; + +// Support: Chrome 14-35+ +// Always assume duplicates if they aren't passed to the comparison function +support.detectDuplicates = !!hasDuplicate; + +// Initialize against the default document +setDocument(); + +// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) +// Detached nodes confoundingly follow *each other* +support.sortDetached = assert( function( el ) { + + // Should return 1, but returns 4 (following) + return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; +} ); + +// Support: IE<8 +// Prevent attribute/property "interpolation" +// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx +if ( !assert( function( el ) { + el.innerHTML = ""; + return el.firstChild.getAttribute( "href" ) === "#"; +} ) ) { + addHandle( "type|href|height|width", function( 
elem, name, isXML ) { + if ( !isXML ) { + return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); + } + } ); +} + +// Support: IE<9 +// Use defaultValue in place of getAttribute("value") +if ( !support.attributes || !assert( function( el ) { + el.innerHTML = ""; + el.firstChild.setAttribute( "value", "" ); + return el.firstChild.getAttribute( "value" ) === ""; +} ) ) { + addHandle( "value", function( elem, _name, isXML ) { + if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { + return elem.defaultValue; + } + } ); +} + +// Support: IE<9 +// Use getAttributeNode to fetch booleans when getAttribute lies +if ( !assert( function( el ) { + return el.getAttribute( "disabled" ) == null; +} ) ) { + addHandle( booleans, function( elem, name, isXML ) { + var val; + if ( !isXML ) { + return elem[ name ] === true ? name.toLowerCase() : + ( val = elem.getAttributeNode( name ) ) && val.specified ? + val.value : + null; + } + } ); +} + +return Sizzle; + +} )( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + 
+}; +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Filtered directly for both simple and complex selectors + return jQuery.filter( qualifier, elements, not ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? 
jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? + jQuery( selector ) : + selector || [], + false + ).length; + } +} ); + + +// Initialize a jQuery object + + +// A central reference to the root jQuery(document) +var rootjQuery, + + // A simple way to check for HTML strings + // Prioritize #id over to avoid XSS via location.hash (#9521) + // Strict HTML recognition (#11290: must start with <) + // Shortcut simple #id case for speed + rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, + + init = jQuery.fn.init = function( selector, context, root ) { + var match, elem; + + // HANDLE: $(""), $(null), $(undefined), $(false) + if ( !selector ) { + return this; + } + + // Method init() accepts an alternate rootjQuery + // so migrate can support jQuery.sub (gh-2101) + root = root || rootjQuery; + + // Handle HTML strings + if ( typeof selector === "string" ) { + if ( selector[ 0 ] === "<" && + selector[ selector.length - 1 ] === ">" && + selector.length >= 3 ) { + + // Assume that strings that start and end with <> are HTML and skip the regex check + match = [ null, selector, null ]; + + } else { + match = rquickExpr.exec( selector ); + } + + // Match html or make sure no context is specified for #id + if ( match && ( match[ 1 ] || !context ) ) { + + // HANDLE: $(html) -> $(array) + if ( match[ 1 ] ) { + context = context instanceof jQuery ? 
context[ 0 ] : context; + + // Option to run scripts is true for back-compat + // Intentionally let the error be thrown if parseHTML is not present + jQuery.merge( this, jQuery.parseHTML( + match[ 1 ], + context && context.nodeType ? context.ownerDocument || context : document, + true + ) ); + + // HANDLE: $(html, props) + if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { + for ( match in context ) { + + // Properties of context are called as methods if possible + if ( isFunction( this[ match ] ) ) { + this[ match ]( context[ match ] ); + + // ...and otherwise set as attributes + } else { + this.attr( match, context[ match ] ); + } + } + } + + return this; + + // HANDLE: $(#id) + } else { + elem = document.getElementById( match[ 2 ] ); + + if ( elem ) { + + // Inject the element directly into the jQuery object + this[ 0 ] = elem; + this.length = 1; + } + return this; + } + + // HANDLE: $(expr, $(...)) + } else if ( !context || context.jquery ) { + return ( context || root ).find( selector ); + + // HANDLE: $(expr, context) + // (which is just equivalent to: $(context).find(expr) + } else { + return this.constructor( context ).find( selector ); + } + + // HANDLE: $(DOMElement) + } else if ( selector.nodeType ) { + this[ 0 ] = selector; + this.length = 1; + return this; + + // HANDLE: $(function) + // Shortcut for document ready + } else if ( isFunction( selector ) ) { + return root.ready !== undefined ? 
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? 
this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? parent : null; + }, + parents: function( elem ) { + return dir( elem, "parentNode" ); + }, + parentsUntil: function( elem, _i, until ) { + return dir( elem, "parentNode", until ); + }, + next: function( elem ) { + return sibling( elem, "nextSibling" ); + }, + prev: function( elem ) { + return sibling( elem, "previousSibling" ); + }, + nextAll: function( elem ) { + return dir( elem, "nextSibling" ); + }, + prevAll: function( elem ) { + return dir( elem, "previousSibling" ); + }, + nextUntil: function( elem, _i, until ) { + return dir( elem, "nextSibling", until ); + }, + prevUntil: function( elem, _i, until ) { + return dir( elem, "previousSibling", until ); + }, + siblings: function( elem ) { + return siblings( ( elem.parentNode || {} ).firstChild, elem ); + }, + children: function( elem ) { + return siblings( elem.firstChild ); + }, + contents: function( elem ) { + if ( elem.contentDocument != null && + + // Support: IE 11+ + // elements with no `data` attribute has an object + // `contentDocument` with a `null` prototype. 
+ getProto( elem.contentDocument ) ) { + + return elem.contentDocument; + } + + // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only + // Treat the template element as a regular one in browsers that + // don't support it. + if ( nodeName( elem, "template" ) ) { + elem = elem.content || elem; + } + + return jQuery.merge( [], elem.childNodes ); + } +}, function( name, fn ) { + jQuery.fn[ name ] = function( until, selector ) { + var matched = jQuery.map( this, fn, until ); + + if ( name.slice( -5 ) !== "Until" ) { + selector = until; + } + + if ( selector && typeof selector === "string" ) { + matched = jQuery.filter( selector, matched ); + } + + if ( this.length > 1 ) { + + // Remove duplicates + if ( !guaranteedUnique[ name ] ) { + jQuery.uniqueSort( matched ); + } + + // Reverse order for parents* and prev-derivatives + if ( rparentsprev.test( name ) ) { + matched.reverse(); + } + } + + return this.pushStack( matched ); + }; +} ); +var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); + + + +// Convert String-formatted options into Object-formatted ones +function createOptions( options ) { + var object = {}; + jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { + object[ flag ] = true; + } ); + return object; +} + +/* + * Create a callback list using the following parameters: + * + * options: an optional list of space-separated options that will change how + * the callback list behaves or a more traditional option object + * + * By default a callback list will act like an event callback list and can be + * "fired" multiple times. 
+ * + * Possible options: + * + * once: will ensure the callback list can only be fired once (like a Deferred) + * + * memory: will keep track of previous values and will call any callback added + * after the list has been fired right away with the latest "memorized" + * values (like a Deferred) + * + * unique: will ensure a callback can only be added once (no duplicate in the list) + * + * stopOnFalse: interrupt callings when a callback returns false + * + */ +jQuery.Callbacks = function( options ) { + + // Convert options from String-formatted to Object-formatted if needed + // (we check in cache first) + options = typeof options === "string" ? + createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future 
add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && toType( arg ) !== "string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? 
+ jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + 
resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... .then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( _i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } 
else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? [ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = 
undefined; + args = [ returned ]; + } + + // Process the value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... ) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onRejected ) ? 
+ onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // rejected_handlers.disable + // fulfilled_handlers.disable + tuples[ 3 - i ][ 3 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock, + + // progress_handlers.lock + tuples[ 0 ][ 3 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! 
+ return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the master Deferred + master = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + master.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( master.state() === "pending" || + isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return master.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); + } + + return master.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? 
--jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( toType( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing function 
values + } else { + bulk = fn; + fn = function( elem, _key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? + value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; + + +// Matches dashed string for camelizing +var rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g; + +// Used by camelCase as callback to replace() +function fcamelCase( _all, letter ) { + return letter.toUpperCase(); +} + +// Convert dashed to camelCase; used by the css and data modules +// Support: IE <=9 - 11, Edge 12 - 15 +// Microsoft forgot to hump their vendor prefix (#9572) +function camelCase( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); +} +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. 
+ if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. 
A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( camelCase ); + } else { + key = camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. 
expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( 
arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? + this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var documentElement = document.documentElement; + + + + var isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ); + }, + composed = { composed: true }; + + // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only + // Check attachment across shadow DOM boundaries when possible (gh-3504) + // Support: iOS 10.0-10.2 only + // Early iOS 10 versions support `attachShadow` but not `getRootNode`, + // leading to errors. We need to check for `getRootNode`. 
+ if ( documentElement.getRootNode ) { + isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ) || + elem.getRootNode( composed ) === elem.ownerDocument; + }; + } +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. + isAttached( elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, scale, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = elem.nodeType && + ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Support: Firefox <=54 + // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) + initial = initial / 2; + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + while ( maxIterations-- ) { + + // Evaluate and update our best guess (doubling guesses that zero out). + // Finish if the scale equals or crosses 1 (making the old*new product non-positive). 
+ jQuery.style( elem, prop, initialInUnit + unit ); + if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { + maxIterations = 0; + } + initialInUnit = initialInUnit / scale; + + } + + initialInUnit = initialInUnit * 2; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && 
isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? this.show() : this.hide(); + } + + return this.each( function() { + if ( isHiddenWithinTree( this ) ) { + jQuery( this ).show(); + } else { + jQuery( this ).hide(); + } + } ); + } +} ); +var rcheckableType = ( /^(?:checkbox|radio)$/i ); + +var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); + +var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); + + + +( function() { + var fragment = document.createDocumentFragment(), + div = fragment.appendChild( document.createElement( "div" ) ), + input = document.createElement( "input" ); + + // Support: Android 4.0 - 4.3 only + // Check state lost if the name is set (#11217) + // Support: Windows Web Apps (WWA) + // `name` and `type` must use .setAttribute for WWA (#14901) + input.setAttribute( "type", "radio" ); + input.setAttribute( "checked", "checked" ); + input.setAttribute( "name", "t" ); + + div.appendChild( input ); + + // Support: Android <=4.1 only + // Older WebKit doesn't clone checked state correctly in fragments + support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; + + // Support: IE <=11 only + // Make sure textarea (and checkbox) defaultValue is properly cloned + div.innerHTML = ""; + support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; + + // Support: IE 
<=9 only + // IE <=9 replaces "; + support.option = !!div.lastChild; +} )(); + + +// We have to close these tags to support XHTML (#13200) +var wrapMap = { + + // XHTML parsers do not magically insert elements in the + // same way that tag soup parsers do. So we cannot shorten + // this by omitting or other required elements. + thead: [ 1, "", "
" ], + col: [ 2, "", "
" ], + tr: [ 2, "", "
" ], + td: [ 3, "", "
" ], + + _default: [ 0, "", "" ] +}; + +wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; +wrapMap.th = wrapMap.td; + +// Support: IE <=9 only +if ( !support.option ) { + wrapMap.optgroup = wrapMap.option = [ 1, "" ]; +} + + +function getAll( context, tag ) { + + // Support: IE <=9 - 11 only + // Use typeof to avoid zero-argument method invocation on host objects (#15151) + var ret; + + if ( typeof context.getElementsByTagName !== "undefined" ) { + ret = context.getElementsByTagName( tag || "*" ); + + } else if ( typeof context.querySelectorAll !== "undefined" ) { + ret = context.querySelectorAll( tag || "*" ); + + } else { + ret = []; + } + + if ( tag === undefined || tag && nodeName( context, tag ) ) { + return jQuery.merge( [ context ], ret ); + } + + return ret; +} + + +// Mark scripts as having already been evaluated +function setGlobalEval( elems, refElements ) { + var i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + dataPriv.set( + elems[ i ], + "globalEval", + !refElements || dataPriv.get( refElements[ i ], "globalEval" ) + ); + } +} + + +var rhtml = /<|&#?\w+;/; + +function buildFragment( elems, context, scripts, selection, ignored ) { + var elem, tmp, tag, wrap, attached, j, + fragment = context.createDocumentFragment(), + nodes = [], + i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + elem = elems[ i ]; + + if ( elem || elem === 0 ) { + + // Add nodes directly + if ( toType( elem ) === "object" ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, elem.nodeType ? 
[ elem ] : elem ); + + // Convert non-html into a text node + } else if ( !rhtml.test( elem ) ) { + nodes.push( context.createTextNode( elem ) ); + + // Convert html into DOM nodes + } else { + tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); + + // Deserialize a standard representation + tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); + wrap = wrapMap[ tag ] || wrapMap._default; + tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; + + // Descend through wrappers to the right content + j = wrap[ 0 ]; + while ( j-- ) { + tmp = tmp.lastChild; + } + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, tmp.childNodes ); + + // Remember the top-level container + tmp = fragment.firstChild; + + // Ensure the created nodes are orphaned (#12392) + tmp.textContent = ""; + } + } + } + + // Remove wrapper from fragment + fragment.textContent = ""; + + i = 0; + while ( ( elem = nodes[ i++ ] ) ) { + + // Skip elements already in the context collection (trac-4087) + if ( selection && jQuery.inArray( elem, selection ) > -1 ) { + if ( ignored ) { + ignored.push( elem ); + } + continue; + } + + attached = isAttached( elem ); + + // Append to fragment + tmp = getAll( fragment.appendChild( elem ), "script" ); + + // Preserve script evaluation history + if ( attached ) { + setGlobalEval( tmp ); + } + + // Capture executables + if ( scripts ) { + j = 0; + while ( ( elem = tmp[ j++ ] ) ) { + if ( rscriptType.test( elem.type || "" ) ) { + scripts.push( elem ); + } + } + } + } + + return fragment; +} + + +var + rkeyEvent = /^key/, + rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, + rtypenamespace = /^([^.]*)(?:\.(.+)|)/; + +function returnTrue() { + return true; +} + +function returnFalse() { + return false; +} + +// Support: IE <=9 - 11+ +// focus() and blur() are asynchronous, except when they are no-op. 
+// So expect focus to be synchronous when the element is already active, +// and blur to be synchronous when the element is not already active. +// (focus and blur are always synchronous in other supported browsers, +// this just defines when we can count on it). +function expectSync( elem, type ) { + return ( elem === safeActiveElement() ) === ( type === "focus" ); +} + +// Support: IE <=9 only +// Accessing document.activeElement can throw unexpectedly +// https://bugs.jquery.com/ticket/13393 +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. 
 * Props to Dean Edwards' addEvent library for many of the ideas.
 */
jQuery.event = {

	// Set of event types that have ever been bound (type -> true); written by add()
	global: {},

	// Bind `handler` on `elem` for each space-separated entry in `types`
	// (each entry may carry ".namespace" suffixes). `handler` may also be a
	// handleObj-like object carrying `handler` and `selector` properties.
	add: function( elem, types, handler, data, selector ) {

		var handleObjIn, eventHandle, tmp,
			events, t, handleObj,
			special, handlers, type, namespaces, origType,
			elemData = dataPriv.get( elem );

		// Only attach events to objects that accept data
		if ( !acceptData( elem ) ) {
			return;
		}

		// Caller can pass in an object of custom data in lieu of the handler
		if ( handler.handler ) {
			handleObjIn = handler;
			handler = handleObjIn.handler;
			selector = handleObjIn.selector;
		}

		// Ensure that invalid selectors throw exceptions at attach time
		// Evaluate against documentElement in case elem is a non-element node (e.g., document)
		if ( selector ) {
			jQuery.find.matchesSelector( documentElement, selector );
		}

		// Make sure that the handler has a unique ID, used to find/remove it later
		if ( !handler.guid ) {
			handler.guid = jQuery.guid++;
		}

		// Init the element's event structure and main handler, if this is the first
		if ( !( events = elemData.events ) ) {
			events = elemData.events = Object.create( null );
		}
		if ( !( eventHandle = elemData.handle ) ) {
			eventHandle = elemData.handle = function( e ) {

				// Discard the second event of a jQuery.event.trigger() and
				// when an event is called after a page has unloaded
				return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ?
					jQuery.event.dispatch.apply( elem, arguments ) : undefined;
			};
		}

		// Handle multiple events separated by a space
		types = ( types || "" ).match( rnothtmlwhite ) || [ "" ];
		t = types.length;
		while ( t-- ) {

			// tmp[ 1 ] is the bare type, tmp[ 2 ] the dot-separated namespaces (if any)
			tmp = rtypenamespace.exec( types[ t ] ) || [];
			type = origType = tmp[ 1 ];
			namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort();

			// There *must* be a type, no attaching namespace-only handlers
			if ( !type ) {
				continue;
			}

			// If event changes its type, use the special event handlers for the changed type
			special = jQuery.event.special[ type ] || {};

			// If selector defined, determine special event api type, otherwise given type
			type = ( selector ? special.delegateType : special.bindType ) || type;

			// Update special based on newly reset type
			special = jQuery.event.special[ type ] || {};

			// handleObj is passed to all event handlers
			handleObj = jQuery.extend( {
				type: type,
				origType: origType,
				data: data,
				handler: handler,
				guid: handler.guid,
				selector: selector,
				needsContext: selector && jQuery.expr.match.needsContext.test( selector ),
				namespace: namespaces.join( "." )
			}, handleObjIn );

			// Init the event handler queue if we're the first
			if ( !( handlers = events[ type ] ) ) {
				handlers = events[ type ] = [];
				handlers.delegateCount = 0;

				// Only use addEventListener if the special events handler returns false
				if ( !special.setup ||
					special.setup.call( elem, data, namespaces, eventHandle ) === false ) {

					if ( elem.addEventListener ) {
						elem.addEventListener( type, eventHandle );
					}
				}
			}

			if ( special.add ) {
				special.add.call( elem, handleObj );

				// special.add may have swapped in a different handler; give it the same guid
				if ( !handleObj.handler.guid ) {
					handleObj.handler.guid = handler.guid;
				}
			}

			// Add to the element's handler list, delegates in front
			if ( selector ) {
				handlers.splice( handlers.delegateCount++, 0, handleObj );
			} else {
				handlers.push( handleObj );
			}

			// Keep track of which events have ever been used, for event optimization
			jQuery.event.global[ type ] = true;
		}

	},

	// Detach an event or set of events from an element
	// `mappedTypes` (truthy) skips the origType comparison; it is passed as `true`
	// by the recursive "remove everything" call below.
	remove: function( elem, types, handler, selector, mappedTypes ) {

		var j, origCount, tmp,
			events, t, handleObj,
			special, handlers, type, namespaces, origType,
			elemData = dataPriv.hasData( elem ) && dataPriv.get( elem );

		if ( !elemData || !( events = elemData.events ) ) {
			return;
		}

		// Once for each type.namespace in types; type may be omitted
		types = ( types || "" ).match( rnothtmlwhite ) || [ "" ];
		t = types.length;
		while ( t-- ) {
			tmp = rtypenamespace.exec( types[ t ] ) || [];
			type = origType = tmp[ 1 ];
			namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort();

			// Unbind all events (on this namespace, if provided) for the element
			if ( !type ) {
				for ( type in events ) {
					jQuery.event.remove( elem, type + types[ t ], handler, selector, true );
				}
				continue;
			}

			special = jQuery.event.special[ type ] || {};
			type = ( selector ? special.delegateType : special.bindType ) || type;
			handlers = events[ type ] || [];

			// From here on tmp is either falsy (no namespaces given) or a RegExp
			// that a handler's namespace string must match
			tmp = tmp[ 2 ] &&
				new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" );

			// Remove matching events
			origCount = j = handlers.length;
			while ( j-- ) {
				handleObj = handlers[ j ];

				if ( ( mappedTypes || origType === handleObj.origType ) &&
					( !handler || handler.guid === handleObj.guid ) &&
					( !tmp || tmp.test( handleObj.namespace ) ) &&
					( !selector || selector === handleObj.selector ||
						selector === "**" && handleObj.selector ) ) {
					handlers.splice( j, 1 );

					if ( handleObj.selector ) {
						handlers.delegateCount--;
					}
					if ( special.remove ) {
						special.remove.call( elem, handleObj );
					}
				}
			}

			// Remove generic event handler if we removed something and no more handlers exist
			// (avoids potential for endless recursion during removal of special event handlers)
			if ( origCount && !handlers.length ) {
				if ( !special.teardown ||
					special.teardown.call( elem, namespaces, elemData.handle ) === false ) {

					jQuery.removeEvent( elem, type, elemData.handle );
				}

				delete events[ type ];
			}
		}

		// Remove data and the expando if it's no longer used
		if ( jQuery.isEmptyObject( events ) ) {
			dataPriv.remove( elem, "handle events" );
		}
	},

	// Run the handlers stored for `this` element for the given native event.
	// Extra call arguments are forwarded to each handler after the event object.
	// Returns event.result, or undefined when a preDispatch hook returns false.
	dispatch: function( nativeEvent ) {

		var i, j, ret, matched, handleObj, handlerQueue,
			args = new Array( arguments.length ),

			// Make a writable jQuery.Event from the native event object
			event = jQuery.event.fix( nativeEvent ),

			handlers = (
				dataPriv.get( this, "events" ) || Object.create( null )
			)[ event.type ] || [],
			special = jQuery.event.special[ event.type ] || {};

		// Use the fix-ed jQuery.Event rather than the (read-only) native event
		args[ 0 ] = event;

		for ( i = 1; i < arguments.length; i++ ) {
			args[ i ] = arguments[ i ];
		}

		event.delegateTarget = this;

		// Call the preDispatch hook for the mapped type, and let it bail if desired
		if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) {
			return;
		}

		// Determine handlers
		handlerQueue = jQuery.event.handlers.call( this, event, handlers );

		// Run delegates first; they may want to stop propagation beneath us
		i = 0;
		while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) {
			event.currentTarget = matched.elem;

			j = 0;
			while ( ( handleObj = matched.handlers[ j++ ] ) &&
				!event.isImmediatePropagationStopped() ) {

				// If the event is namespaced, then each handler is only invoked if it is
				// specially universal or its namespaces are a superset of the event's.
				if ( !event.rnamespace || handleObj.namespace === false ||
					event.rnamespace.test( handleObj.namespace ) ) {

					event.handleObj = handleObj;
					event.data = handleObj.data;

					// A special-event `handle` hook for the original type, if present,
					// is called instead of the bound handler
					ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle ||
						handleObj.handler ).apply( matched.elem, args );

					// A handler returning false both prevents default and stops propagation
					if ( ret !== undefined ) {
						if ( ( event.result = ret ) === false ) {
							event.preventDefault();
							event.stopPropagation();
						}
					}
				}
			}
		}

		// Call the postDispatch hook for the mapped type
		if ( special.postDispatch ) {
			special.postDispatch.call( this, event );
		}

		return event.result;
	},

	// Build the ordered queue of { elem, handlers } entries for `event`:
	// delegated matches walking up from event.target towards `this`, followed by
	// the handlers bound directly on `this`. Called with `this` set to the element
	// the handlers are stored on.
	handlers: function( event, handlers ) {
		var i, handleObj, sel, matchedHandlers, matchedSelectors,
			handlerQueue = [],
			delegateCount = handlers.delegateCount,
			cur = event.target;

		// Find delegate handlers
		if ( delegateCount &&

			// Support: IE <=9
			// Black-hole SVG <use> instance trees (trac-13180)
			cur.nodeType &&

			// Support: Firefox <=42
			// Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861)
			// https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click
			// Support: IE 11 only
			// ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343)
			!( event.type === "click" && event.button >= 1 ) ) {

			for ( ; cur !== this; cur = cur.parentNode || this ) {

				// Don't check non-elements (#13208)
				// Don't process clicks on disabled elements (#6911, #8165, #11382, #11764)
				if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) {
					matchedHandlers = [];

					// Per-element cache so each distinct selector is evaluated only once
					matchedSelectors = {};
					for ( i = 0; i < delegateCount; i++ ) {
						handleObj = handlers[ i ];

						// Don't conflict with Object.prototype properties (#13203)
						sel = handleObj.selector + " ";

						if ( matchedSelectors[ sel ] === undefined ) {
							matchedSelectors[ sel ] = handleObj.needsContext ?
								jQuery( sel, this ).index( cur ) > -1 :
								jQuery.find( sel, this, null, [ cur ] ).length;
						}
						if ( matchedSelectors[ sel ] ) {
							matchedHandlers.push( handleObj );
						}
					}
					if ( matchedHandlers.length ) {
						handlerQueue.push( { elem: cur, handlers: matchedHandlers } );
					}
				}
			}
		}

		// Add the remaining (directly-bound) handlers
		cur = this;
		if ( delegateCount < handlers.length ) {
			handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } );
		}

		return handlerQueue;
	},

	// Define an accessor `name` on jQuery.Event.prototype. Reads go to
	// this.originalEvent (through `hook` when it is a function, otherwise the
	// same-named property); a write replaces the accessor on that instance with
	// a plain writable data property holding the assigned value.
	addProp: function( name, hook ) {
		Object.defineProperty( jQuery.Event.prototype, name, {
			enumerable: true,
			configurable: true,

			get: isFunction( hook ) ?
				function() {
					if ( this.originalEvent ) {
						return hook( this.originalEvent );
					}
				} :
				function() {
					if ( this.originalEvent ) {
						return this.originalEvent[ name ];
					}
				},

			set: function( value ) {
				Object.defineProperty( this, name, {
					enumerable: true,
					configurable: true,
					writable: true,
					value: value
				} );
			}
		} );
	},

	// Return the argument itself when it already carries the jQuery.expando mark
	// (set by the jQuery.Event constructor), otherwise wrap it in a new jQuery.Event
	fix: function( originalEvent ) {
		return originalEvent[ jQuery.expando ] ?
			originalEvent :
			new jQuery.Event( originalEvent );
	},

	// Per-type hooks consulted by add/remove/dispatch above
	special: {
		load: {

			// Prevent triggered image.load events from bubbling to window.load
			noBubble: true
		},
		click: {

			// Utilize native event to ensure correct state for checkable inputs
			setup: function( data ) {

				// For mutual compressibility with _default, replace `this` access with a local var.
				// `|| data` is dead code meant only to preserve the variable through minification.
				var el = this || data;

				// Claim the first handler
				if ( rcheckableType.test( el.type ) &&
					el.click && nodeName( el, "input" ) ) {

					// dataPriv.set( el, "click", ... )
					leverageNative( el, "click", returnTrue );
				}

				// Return false to allow normal processing in the caller
				return false;
			},
			trigger: function( data ) {

				// For mutual compressibility with _default, replace `this` access with a local var.
				// `|| data` is dead code meant only to preserve the variable through minification.
				var el = this || data;

				// Force setup before triggering a click
				if ( rcheckableType.test( el.type ) &&
					el.click && nodeName( el, "input" ) ) {

					leverageNative( el, "click" );
				}

				// Return non-false to allow normal event-path propagation
				return true;
			},

			// For cross-browser consistency, suppress native .click() on links
			// Also prevent it if we're currently inside a leveraged native-event stack
			_default: function( event ) {
				var target = event.target;
				return rcheckableType.test( target.type ) &&
					target.click && nodeName( target, "input" ) &&
					dataPriv.get( target, "click" ) ||
					nodeName( target, "a" );
			}
		},

		beforeunload: {
			postDispatch: function( event ) {

				// Support: Firefox 20+
				// Firefox doesn't alert if the returnValue field is not set.
				if ( event.result !== undefined && event.originalEvent ) {
					event.originalEvent.returnValue = event.result;
				}
			}
		}
	}
};

// Ensure the presence of an event listener that handles manually-triggered
// synthetic events by interrupting progress until reinvoked in response to
// *native* events that it fires directly, ensuring that state changes have
// already occurred before other listeners are invoked.
function leverageNative( el, type, expectSync ) {

	// Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add
	if ( !expectSync ) {
		if ( dataPriv.get( el, type ) === undefined ) {
			jQuery.event.add( el, type, returnTrue );
		}
		return;
	}

	// Register the controller as a special universal handler for all event namespaces
	// The private data slot `type` holds: false (idle), an arguments array (a
	// synthetic trigger is in flight), or a { value } capture object (result of
	// the inner synthetic event).
	dataPriv.set( el, type, false );
	jQuery.event.add( el, type, {
		namespace: false,
		handler: function( event ) {
			var notAsync, result,
				saved = dataPriv.get( this, type );

			if ( ( event.isTrigger & 1 ) && this[ type ] ) {

				// Interrupt processing of the outer synthetic .trigger()ed event
				// Saved data should be false in such cases, but might be a leftover capture object
				// from an async native handler (gh-4350)
				if ( !saved.length ) {

					// Store arguments for use when handling the inner native event
					// There will always be at least one argument (an event object), so this array
					// will not be confused with a leftover capture object.
					saved = slice.call( arguments );
					dataPriv.set( this, type, saved );

					// Trigger the native event and capture its result
					// Support: IE <=9 - 11+
					// focus() and blur() are asynchronous
					notAsync = expectSync( this, type );
					this[ type ]();
					result = dataPriv.get( this, type );
					if ( saved !== result || notAsync ) {
						dataPriv.set( this, type, false );
					} else {
						result = {};
					}
					if ( saved !== result ) {

						// Cancel the outer synthetic event
						event.stopImmediatePropagation();
						event.preventDefault();
						return result.value;
					}

				// If this is an inner synthetic event for an event with a bubbling surrogate
				// (focus or blur), assume that the surrogate already propagated from triggering the
				// native event and prevent that from happening again here.
				// This technically gets the ordering wrong w.r.t. `.trigger()` (in which the
				// bubbling surrogate propagates *after* the non-bubbling base), but that seems
				// less bad than duplication.
				} else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) {
					event.stopPropagation();
				}

			// If this is a native event triggered above, everything is now in order
			// Fire an inner synthetic event with the original arguments
			} else if ( saved.length ) {

				// ...and capture the result
				dataPriv.set( this, type, {
					value: jQuery.event.trigger(

						// Support: IE <=9 - 11+
						// Extend with the prototype to reset the above stopImmediatePropagation()
						jQuery.extend( saved[ 0 ], jQuery.Event.prototype ),
						saved.slice( 1 ),
						this
					)
				} );

				// Abort handling of the native event
				event.stopImmediatePropagation();
			}
		}
	} );
}

// Detach the native listener `handle` for `type` from `elem`, when `elem`
// supports removeEventListener at all.
jQuery.removeEvent = function( elem, type, handle ) {

	// This "if" is needed for plain objects
	if ( elem.removeEventListener ) {
		elem.removeEventListener( type, handle );
	}
};

// Writable event wrapper. `src` is either an event-like object with a `type`
// property (kept as this.originalEvent) or the event type itself; `props`,
// when given, is copied onto the new object.
jQuery.Event = function( src, props ) {

	// Allow instantiation without the 'new' keyword
	if ( !( this instanceof jQuery.Event ) ) {
		return new jQuery.Event( src, props );
	}

	// Event object
	if ( src && src.type ) {
		this.originalEvent = src;
		this.type = src.type;

		// Events bubbling up the document may have been marked as prevented
		// by a handler lower down the tree; reflect the correct value.
		this.isDefaultPrevented = src.defaultPrevented ||
				src.defaultPrevented === undefined &&

				// Support: Android <=2.3 only
				src.returnValue === false ?
			returnTrue :
			returnFalse;

		// Create target properties
		// Support: Safari <=6 - 7 only
		// Target should not be a text node (#504, #13143)
		this.target = ( src.target && src.target.nodeType === 3 ) ?
			src.target.parentNode :
			src.target;

		this.currentTarget = src.currentTarget;
		this.relatedTarget = src.relatedTarget;

	// Event type
	} else {
		this.type = src;
	}

	// Put explicitly provided properties onto the event object
	if ( props ) {
		jQuery.extend( this, props );
	}

	// Create a timestamp if incoming event doesn't have one
	this.timeStamp = src && src.timeStamp || Date.now();

	// Mark it as fixed
	this[ jQuery.expando ] = true;
};

// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding
// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html
jQuery.Event.prototype = {
	constructor: jQuery.Event,

	// State predicates start out as returnFalse and are swapped for returnTrue
	// by the corresponding methods below
	isDefaultPrevented: returnFalse,
	isPropagationStopped: returnFalse,
	isImmediatePropagationStopped: returnFalse,
	isSimulated: false,

	// Each method records the state on the wrapper and, unless the event is
	// flagged isSimulated, forwards the call to the original event
	preventDefault: function() {
		var e = this.originalEvent;

		this.isDefaultPrevented = returnTrue;

		if ( e && !this.isSimulated ) {
			e.preventDefault();
		}
	},
	stopPropagation: function() {
		var e = this.originalEvent;

		this.isPropagationStopped = returnTrue;

		if ( e && !this.isSimulated ) {
			e.stopPropagation();
		}
	},
	stopImmediatePropagation: function() {
		var e = this.originalEvent;

		this.isImmediatePropagationStopped = returnTrue;

		if ( e && !this.isSimulated ) {
			e.stopImmediatePropagation();
		}

		// Stopping immediate propagation also stops ordinary propagation
		this.stopPropagation();
	}
};

// Includes all common event props including KeyEvent and MouseEvent specific props
// Each entry is passed to jQuery.event.addProp as ( name, value ): `true` means
// a plain pass-through to originalEvent, a function is used as the read hook.
jQuery.each( {
	altKey: true,
	bubbles: true,
	cancelable: true,
	changedTouches: true,
	ctrlKey: true,
	detail: true,
	eventPhase: true,
	metaKey: true,
	pageX: true,
	pageY: true,
	shiftKey: true,
	view: true,
	"char": true,
	code: true,
	charCode: true,
	key: true,
	keyCode: true,
	button: true,
	buttons: true,
	clientX: true,
	clientY: true,
	offsetX: true,
	offsetY: true,
	pointerId: true,
	pointerType: true,
	screenX: true,
	screenY: true,
	targetTouches: true,
	toElement: true,
	touches: true,

	which: function( event ) {
		var button = event.button;

		// Add which for key events
		if ( event.which == null && rkeyEvent.test( event.type ) ) {
			return event.charCode != null ? event.charCode : event.keyCode;
		}

		// Add which for click: 1 === left; 2 === middle; 3 === right
		if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) {
			if ( button & 1 ) {
				return 1;
			}

			if ( button & 2 ) {
				return 3;
			}

			if ( button & 4 ) {
				return 2;
			}

			return 0;
		}

		return event.which;
	}
}, jQuery.event.addProp );

// Register focus/blur as special events that route through leverageNative and
// delegate via focusin/focusout respectively
jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) {
	jQuery.event.special[ type ] = {

		// Utilize native event if possible so blur/focus sequence is correct
		setup: function() {

			// Claim the first handler
			// dataPriv.set( this, "focus", ... )
			// dataPriv.set( this, "blur", ... )
			leverageNative( this, type, expectSync );

			// Return false to allow normal processing in the caller
			return false;
		},
		trigger: function() {

			// Force setup before trigger
			leverageNative( this, type );

			// Return non-false to allow normal event-path propagation
			return true;
		},

		delegateType: delegateType
	};
} );

// Create mouseenter/leave events using mouseover/out and event-time checks
// so that event delegation works in jQuery.
// Do the same for pointerenter/pointerleave and pointerover/pointerout
//
// Support: Safari 7 only
// Safari sends mouseenter too often; see:
// https://bugs.chromium.org/p/chromium/issues/detail?id=470258
// for the description of the bug (it existed in older Chrome versions as well).
+jQuery.each( { + mouseenter: "mouseover", + mouseleave: "mouseout", + pointerenter: "pointerover", + pointerleave: "pointerout" +}, function( orig, fix ) { + jQuery.event.special[ orig ] = { + delegateType: fix, + bindType: fix, + + handle: function( event ) { + var ret, + target = this, + related = event.relatedTarget, + handleObj = event.handleObj; + + // For mouseenter/leave call the handler if related is outside the target. + // NB: No relatedTarget if the mouse left/entered the browser window + if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { + event.type = handleObj.origType; + ret = handleObj.handler.apply( this, arguments ); + event.type = fix; + } + return ret; + } + }; +} ); + +jQuery.fn.extend( { + + on: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn ); + }, + one: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn, 1 ); + }, + off: function( types, selector, fn ) { + var handleObj, type; + if ( types && types.preventDefault && types.handleObj ) { + + // ( event ) dispatched jQuery.Event + handleObj = types.handleObj; + jQuery( types.delegateTarget ).off( + handleObj.namespace ? + handleObj.origType + "." + handleObj.namespace : + handleObj.origType, + handleObj.selector, + handleObj.handler + ); + return this; + } + if ( typeof types === "object" ) { + + // ( types-object [, selector] ) + for ( type in types ) { + this.off( type, selector, types[ type ] ); + } + return this; + } + if ( selector === false || typeof selector === "function" ) { + + // ( types [, fn] ) + fn = selector; + selector = undefined; + } + if ( fn === false ) { + fn = returnFalse; + } + return this.each( function() { + jQuery.event.remove( this, types, fn, selector ); + } ); + } +} ); + + +var + + // Support: IE <=10 - 11, Edge 12 - 13 only + // In IE/Edge using regex groups here causes severe slowdowns. 
+ // See https://connect.microsoft.com/IE/feedback/details/1736512/ + rnoInnerhtml = /\s*$/g; + +// Prefer a tbody over its parent table for containing new rows +function manipulationTarget( elem, content ) { + if ( nodeName( elem, "table" ) && + nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { + + return jQuery( elem ).children( "tbody" )[ 0 ] || elem; + } + + return elem; +} + +// Replace/restore the type attribute of script elements for safe DOM manipulation +function disableScript( elem ) { + elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; + return elem; +} +function restoreScript( elem ) { + if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { + elem.type = elem.type.slice( 5 ); + } else { + elem.removeAttribute( "type" ); + } + + return elem; +} + +function cloneCopyEvent( src, dest ) { + var i, l, type, pdataOld, udataOld, udataCur, events; + + if ( dest.nodeType !== 1 ) { + return; + } + + // 1. Copy private data: events, handlers, etc. + if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.get( src ); + events = pdataOld.events; + + if ( events ) { + dataPriv.remove( dest, "handle events" ); + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2. Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. 
+ if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = flat( args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + valueIsFunction = isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( valueIsFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( valueIsFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). 
+ for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl && !node.noModule ) { + jQuery._evalUrl( node.src, { + nonce: node.nonce || node.getAttribute( "nonce" ) + }, doc ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? 
jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && isAttached( node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html; + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = isAttached( elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( 
elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? + jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // 
Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = 
function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + +var swap = function( elem, options, callback ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.call( elem ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + +var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. 
+ function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + container.style.cssText = "position:absolute;left:-11111px;width:60px;" + + "margin-top:1px;padding:0;border:0"; + div.style.cssText = + "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + + "margin:auto;border:1px;padding:1px;" + + "width:60%;top:1%"; + documentElement.appendChild( container ).appendChild( div ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; + + // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 + // Some styles come back with percentage values, even though they shouldn't + div.style.right = "60%"; + pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; + + // Support: IE 9 - 11 only + // Detect misreporting of content dimensions for box-sizing:border-box elements + boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; + + // Support: IE 9 only + // Detect overflow:scroll screwiness (gh-3699) + // Support: Chrome <=64 + // Don't get tricked when zoom affects offsetWidth (gh-4029) + div.style.position = "absolute"; + scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + function roundPixelMeasures( measure ) { + return Math.round( parseFloat( measure ) ); + } + + var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, + reliableTrDimensionsVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + 
// Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + jQuery.extend( support, { + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelBoxStyles: function() { + computeStyleTests(); + return pixelBoxStylesVal; + }, + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + }, + scrollboxSize: function() { + computeStyleTests(); + return scrollboxSizeVal; + }, + + // Support: IE 9 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Behavior in IE 9 is more subtle than in newer versions & it passes + // some versions of this test; make sure not to make it pass there! 
// NOTE: `reliableTrDimensions` is the last property of the object literal passed to
// jQuery.extend( support, { ... } ), which opens on the previous line.
// Computed on first call only: builds table > tr > div (tr height 1px, child height 9px),
// checks whether the computed height of the row is greater than 3, and caches the
// answer in reliableTrDimensionsVal.
reliableTrDimensions: function() {
	var table, tr, trChild, trStyle;
	if ( reliableTrDimensionsVal == null ) {
		table = document.createElement( "table" );
		tr = document.createElement( "tr" );
		trChild = document.createElement( "div" );

		table.style.cssText = "position:absolute;left:-11111px";
		tr.style.height = "1px";
		trChild.style.height = "9px";

		documentElement
			.appendChild( table )
			.appendChild( tr )
			.appendChild( trChild );

		trStyle = window.getComputedStyle( tr );
		reliableTrDimensionsVal = parseInt( trStyle.height ) > 3;

		documentElement.removeChild( table );
	}
	return reliableTrDimensionsVal;
}

// Closes the jQuery.extend( support, { ... } ) call
} );

// Closes and invokes the function wrapper around the support tests
} )();


// Read the computed value of style property `name` for `elem`.
// `computed` may be passed in by the caller; otherwise getStyles( elem ) is used.
// NOTE: the final `return` expression is completed on the next line.
function curCSS( elem, name, computed ) {
	var width, minWidth, maxWidth, ret,

		// Support: Firefox 51+
		// Retrieving style before computed somehow
		// fixes an issue with getting wrong values
		// on detached elements
		style = elem.style;

	computed = computed || getStyles( elem );

	// getPropertyValue is needed for:
	//   .css('filter') (IE 9 only, #12537)
	//   .css('--customProperty') (#3144)
	if ( computed ) {
		ret = computed.getPropertyValue( name ) || computed[ name ];

		// Empty computed value on an element that is not attached: fall back to jQuery.style
		if ( ret === "" && !isAttached( elem ) ) {
			ret = jQuery.style( elem, name );
		}

		// A tribute to the "awesome hack by Dean Edwards"
		// Android Browser returns percentage for some values,
		// but width seems to be reliably pixels.
		// This is against the CSSOM draft spec:
		// https://drafts.csswg.org/cssom/#resolved-values
		if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) {

			// Remember the original values
			width = style.width;
			minWidth = style.minWidth;
			maxWidth = style.maxWidth;

			// Put in the new values to get a computed value out
			style.minWidth = style.maxWidth = style.width = ret;
			ret = computed.width;

			// Revert the changed values
			style.width = width;
			style.minWidth = minWidth;
			style.maxWidth = maxWidth;
		}
	}

	return ret !== undefined ?
+ + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. + return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style, + vendorProps = {}; + +// Return a vendor-prefixed property or undefined +function vendorPropName( name ) { + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a potentially-mapped jQuery.cssProps or vendor prefixed property +function finalPropName( name ) { + var final = jQuery.cssProps[ name ] || vendorProps[ name ]; + + if ( final ) { + return final; + } + if ( name in emptyStyle ) { + return name; + } + return vendorProps[ name ] = vendorPropName( name ) || name; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }; + +function setPositiveNumber( _elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? 
// NOTE: the next lines finish setPositiveNumber's `return matches ?` expression, whose
// head ends the previous line. The matched number is clamped at 0 after subtraction
// and the matched unit (default "px") is appended.

		// Guard against undefined "subtract", e.g., when used as in cssHooks
		Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) :
		value;
}

// Sum the padding/border/margin amounts that separate the box described by
// `isBorderBox` from the requested `box` ("content", "padding", "border" or "margin").
// The loop starts at index 1 for "width" and 0 for "height" and steps by 2 through
// cssExpand, so it visits one pair of opposite sides (presumably Right/Left vs
// Top/Bottom; cssExpand is defined outside this chunk).
// NOTE: the function continues on the next line.
function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) {
	var i = dimension === "width" ? 1 : 0,
		extra = 0,
		delta = 0;

	// Adjustment may not be necessary
	if ( box === ( isBorderBox ? "border" : "content" ) ) {
		return 0;
	}

	for ( ; i < 4; i += 2 ) {

		// Both box models exclude margin
		if ( box === "margin" ) {
			delta += jQuery.css( elem, box + cssExpand[ i ], true, styles );
		}

		// If we get here with a content-box, we're seeking "padding" or "border" or "margin"
		if ( !isBorderBox ) {

			// Add padding
			delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles );

			// For "border" or "margin", add border
			if ( box !== "padding" ) {
				delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles );

			// But still keep track of it otherwise
			} else {
				extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles );
			}

		// If we get here with a border-box (content + padding + border), we're seeking "content" or
		// "padding" or "margin"
		} else {

			// For "content", subtract padding
			if ( box === "content" ) {
				delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles );
			}

			// For "content" or "padding", subtract border
			if ( box !== "margin" ) {
				delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles );
			}
		}
	}

	// Account for positive content-box scroll gutter when requested by providing computedVal
	if ( !isBorderBox && computedVal >= 0 ) {

		// offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border
		// Assuming integer scroll gutter, subtract the rest and round down
		delta += Math.max( 0, Math.ceil(
			elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] -
			computedVal -
			delta -
// NOTE: resumes the Math.ceil( ... ) operand list inside boxModelAdjustment's
// scroll-gutter calculation, which begins on the previous line.
			extra -
			0.5

		// If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter
		// Use an explicit zero to avoid NaN (gh-3964)
		) ) || 0;
	}

	return delta;
}

// Compute the "width" or "height" of `elem`. `extra` is forwarded as the target box
// to boxModelAdjustment further down.
// NOTE: the function continues on the next line, in the middle of an `if` condition.
function getWidthOrHeight( elem, dimension, extra ) {

	// Start with computed style
	var styles = getStyles( elem ),

		// To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322).
		// Fake content-box until we know it's needed to know the true value.
		boxSizingNeeded = !support.boxSizingReliable() || extra,
		isBorderBox = boxSizingNeeded &&
			jQuery.css( elem, "boxSizing", false, styles ) === "border-box",
		valueIsBorderBox = isBorderBox,

		val = curCSS( elem, dimension, styles ),

		// "offsetWidth" or "offsetHeight"
		offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 );

	// Support: Firefox <=54
	// Return a confounding non-pixel value or feign ignorance, as appropriate.
	if ( rnumnonpx.test( val ) ) {
		if ( !extra ) {
			return val;
		}
		val = "auto";
	}


	// Support: IE 9 - 11 only
	// Use offsetWidth/offsetHeight for when box sizing is unreliable.
	// In those cases, the computed value can be trusted to be border-box.
	if ( ( !support.boxSizingReliable() && isBorderBox ||

		// Support: IE 10 - 11+, Edge 15 - 18+
		// IE/Edge misreport `getComputedStyle` of table rows with width/height
		// set in CSS while `offset*` properties report correct values.
		// Interestingly, in some cases IE 9 doesn't suffer from this issue.
// NOTE: resumes the `if ( ( ... ) && ... )` condition of getWidthOrHeight that opens on
// the previous line.
		!support.reliableTrDimensions() && nodeName( elem, "tr" ) ||

		// Fall back to offsetWidth/offsetHeight when value is "auto"
		// This happens for inline elements with no explicit setting (gh-3571)
		val === "auto" ||

		// Support: Android <=4.1 - 4.3 only
		// Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602)
		!parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) &&

		// Make sure the element is visible & connected
		elem.getClientRects().length ) {

		isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box";

		// Where available, offsetWidth/offsetHeight approximate border box dimensions.
		// Where not available (e.g., SVG), assume unreliable box-sizing and interpret the
		// retrieved value as a content box dimension.
		valueIsBorderBox = offsetProp in elem;
		if ( valueIsBorderBox ) {
			val = elem[ offsetProp ];
		}
	}

	// Normalize "" and auto
	val = parseFloat( val ) || 0;

	// Adjust for the element's box model
	return ( val +
		boxModelAdjustment(
			elem,
			dimension,
			extra || ( isBorderBox ? "border" : "content" ),
			valueIsBorderBox,
			styles,

			// Provide the current computed size to request scroll gutter calculation (gh-3589)
			val
		)
	) + "px";
}

jQuery.extend( {

	// Add in style property hooks for overriding the default
	// behavior of getting and setting a style property
	cssHooks: {
		opacity: {
			get: function( elem, computed ) {
				if ( computed ) {

					// We should always get a number back from opacity
					// (an empty computed value is reported as "1")
					var ret = curCSS( elem, "opacity" );
					return ret === "" ? "1" : ret;
				}
			}
		}
	},

	// Don't automatically add "px" to these possibly-unitless properties
	cssNumber: {
		"animationIterationCount": true,
		"columnCount": true,
		"fillOpacity": true,
		"flexGrow": true,
		"flexShrink": true,
		"fontWeight": true,
		"gridArea": true,
		"gridColumn": true,
		"gridColumnEnd": true,
		"gridColumnStart": true,
		"gridRow": true,
		"gridRowEnd": true,
		"gridRowStart": true,
		"lineHeight": true,
		"opacity": true,
		"order": true,
		"orphans": true,
		"widows": true,
		"zIndex": true,
		"zoom": true
	},

	// Add in properties whose names you wish to fix before
	// setting or getting the value
	cssProps: {},

	// Get and set the style property on a DOM Node
	// NOTE: the body of `style` continues on the next line.
	style: function( elem, name, value, extra ) {

		// Don't set styles on text and comment nodes
		if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) {
			return;
		}

		// Make sure that we're working with the right name
		var ret, type, hooks,
			origName = camelCase( name ),
			isCustomProp = rcustomProp.test( name ),
			style = elem.style;

		// Make sure that we're working with the right name. We don't
		// want to query the value if it is a CSS custom property
		// since they are user-defined.
// NOTE: resumes inside jQuery.style( elem, name, value, extra ), whose head is on the
// previous line. With a defined `value` it sets the inline style; otherwise it returns
// the hook's non-computed value or the raw inline style value.
		if ( !isCustomProp ) {
			name = finalPropName( origName );
		}

		// Gets hook for the prefixed version, then unprefixed version
		hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ];

		// Check if we're setting a value
		if ( value !== undefined ) {
			type = typeof value;

			// Convert "+=" or "-=" to relative numbers (#7345)
			if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) {
				value = adjustCSS( elem, name, ret );

				// Fixes bug #9237
				type = "number";
			}

			// Make sure that null and NaN values aren't set (#7116)
			// (`value !== value` is true only for NaN)
			if ( value == null || value !== value ) {
				return;
			}

			// If a number was passed in, add the unit (except for certain CSS properties)
			// The isCustomProp check can be removed in jQuery 4.0 when we only auto-append
			// "px" to a few hardcoded values.
			if ( type === "number" && !isCustomProp ) {
				value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" );
			}

			// background-* props affect original clone's values
			if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) {
				style[ name ] = "inherit";
			}

			// If a hook was provided, use that value, otherwise just set the specified value
			// (a `set` hook that returns undefined suppresses the assignment)
			if ( !hooks || !( "set" in hooks ) ||
				( value = hooks.set( elem, value, extra ) ) !== undefined ) {

				if ( isCustomProp ) {
					style.setProperty( name, value );
				} else {
					style[ name ] = value;
				}
			}

		} else {

			// If a hook was provided get the non-computed value from there
			if ( hooks && "get" in hooks &&
				( ret = hooks.get( elem, false, extra ) ) !== undefined ) {

				return ret;
			}

			// Otherwise just get the value from the style object
			return style[ name ];
		}
	},

	// Return the computed value of `name` for `elem`, optionally coerced to a number
	// depending on `extra`.
	// NOTE: the body of `css` continues on the next line.
	css: function( elem, name, extra, styles ) {
		var val, num, hooks,
			origName = camelCase( name ),
			isCustomProp = rcustomProp.test( name );

		// Make sure that we're working with the right name.
// NOTE: resumes inside jQuery.css( elem, name, extra, styles ). The first words below
// finish a comment that starts at the end of the previous line.
		// We don't
		// want to modify the value if it is a CSS custom property
		// since they are user-defined.
		if ( !isCustomProp ) {
			name = finalPropName( origName );
		}

		// Try prefixed name followed by the unprefixed name
		hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ];

		// If a hook was provided get the computed value from there
		if ( hooks && "get" in hooks ) {
			val = hooks.get( elem, true, extra );
		}

		// Otherwise, if a way to get the computed value exists, use that
		if ( val === undefined ) {
			val = curCSS( elem, name, styles );
		}

		// Convert "normal" to computed value
		if ( val === "normal" && name in cssNormalTransform ) {
			val = cssNormalTransform[ name ];
		}

		// Make numeric if forced or a qualifier was provided and val looks numeric
		// (extra === true always yields a number; any other truthy extra, or "", yields a
		// number only when the parsed value is finite)
		if ( extra === "" || extra ) {
			num = parseFloat( val );
			return extra === true || isFinite( num ) ? num || 0 : val;
		}

		return val;
	}
} );

// Install get/set hooks for "height" and "width".
jQuery.each( [ "height", "width" ], function( _i, dimension ) {
	jQuery.cssHooks[ dimension ] = {
		get: function( elem, computed, extra ) {
			if ( computed ) {

				// Certain elements can have dimension info if we invisibly show them
				// but it must have a current display style that would benefit
				return rdisplayswap.test( jQuery.css( elem, "display" ) ) &&

					// Support: Safari 8+
					// Table columns in Safari have non-zero offsetWidth & zero
					// getBoundingClientRect().width unless display is changed.
					// Support: IE <=11 only
					// Running getBoundingClientRect on a disconnected node
					// in IE throws an error.
					( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ?
					swap( elem, cssShow, function() {
						return getWidthOrHeight( elem, dimension, extra );
					} ) :
					getWidthOrHeight( elem, dimension, extra );
			}
		},

		// NOTE: the `set` hook continues on the next line, inside its var list.
		set: function( elem, value, extra ) {
			var matches,
				styles = getStyles( elem ),

				// Only read styles.position if the test has a chance to fail
				// to avoid forcing a reflow.
// NOTE: resumes the `var` list of the height/width `set` hook that opens on the
// previous line.
				scrollboxSizeBuggy = !support.scrollboxSize() &&
					styles.position === "absolute",

				// To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991)
				boxSizingNeeded = scrollboxSizeBuggy || extra,
				isBorderBox = boxSizingNeeded &&
					jQuery.css( elem, "boxSizing", false, styles ) === "border-box",
				subtract = extra ?
					boxModelAdjustment(
						elem,
						dimension,
						extra,
						isBorderBox,
						styles
					) :
					0;

			// Account for unreliable border-box dimensions by comparing offset* to computed and
			// faking a content-box to get border and padding (gh-3699)
			if ( isBorderBox && scrollboxSizeBuggy ) {
				subtract -= Math.ceil(
					elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] -
					parseFloat( styles[ dimension ] ) -
					boxModelAdjustment( elem, dimension, "border", false, styles ) -
					0.5
				);
			}

			// Convert to pixels if value adjustment is needed
			// (the value is applied to the element and read back through jQuery.css)
			if ( subtract && ( matches = rcssNum.exec( value ) ) &&
				( matches[ 3 ] || "px" ) !== "px" ) {

				elem.style[ dimension ] = value;
				value = jQuery.css( elem, dimension );
			}

			return setPositiveNumber( elem, value, subtract );
		}
	};
} );

// marginLeft getter, installed through addGetHookIf with support.reliableMarginLeft as
// the condition. When the parsed computed marginLeft is falsy, the value is derived from
// the difference between the element's bounding-rect left and its left with marginLeft
// temporarily swapped to 0.
jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft,
	function( elem, computed ) {
		if ( computed ) {
			return ( parseFloat( curCSS( elem, "marginLeft" ) ) ||
				elem.getBoundingClientRect().left -
					swap( elem, { marginLeft: 0 }, function() {
						return elem.getBoundingClientRect().left;
					} )
			) + "px";
		}
	}
);

// These hooks are used by animate to expand properties
// NOTE: the `expand` function continues on the next line.
jQuery.each( {
	margin: "",
	padding: "",
	border: "Width"
}, function( prefix, suffix ) {
	jQuery.cssHooks[ prefix + suffix ] = {
		expand: function( value ) {
			var i = 0,
				expanded = {},

				// Assumes a single number if not a string
				parts = typeof value === "string" ?
// NOTE: completes the `parts = typeof value === "string" ?` expression of the `expand`
// hook, whose head ends the previous line.
					value.split( " " ) : [ value ];

			// Fill all four sides; a missing part falls back to the part two positions
			// earlier and then to the first part.
			for ( ; i < 4; i++ ) {
				expanded[ prefix + cssExpand[ i ] + suffix ] =
					parts[ i ] || parts[ i - 2 ] || parts[ 0 ];
			}

			return expanded;
		}
	};

	// padding and borderWidth additionally get setPositiveNumber as their `set` hook
	if ( prefix !== "margin" ) {
		jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber;
	}
} );

jQuery.fn.extend( {

	// Getter/setter routed through `access`. An array of names returns a
	// name -> value map; a defined value goes to jQuery.style, otherwise jQuery.css.
	css: function( name, value ) {
		return access( this, function( elem, name, value ) {
			var styles, len,
				map = {},
				i = 0;

			if ( Array.isArray( name ) ) {
				styles = getStyles( elem );
				len = name.length;

				for ( ; i < len; i++ ) {
					map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles );
				}

				return map;
			}

			return value !== undefined ?
				jQuery.style( elem, name, value ) :
				jQuery.css( elem, name );
		}, name, value, arguments.length > 1 );
	}
} );


// Factory: callable without `new`; the instance is built by Tween.prototype.init.
function Tween( elem, options, prop, end, easing ) {
	return new Tween.prototype.init( elem, options, prop, end, easing );
}
jQuery.Tween = Tween;

Tween.prototype = {
	constructor: Tween,

	// Record the target, start from the current value, and choose the unit
	// ("" for properties listed in jQuery.cssNumber, "px" otherwise, unless given).
	init: function( elem, options, prop, end, easing, unit ) {
		this.elem = elem;
		this.prop = prop;
		this.easing = easing || jQuery.easing._default;
		this.options = options;
		this.start = this.now = this.cur();
		this.end = end;
		this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" );
	},

	// NOTE: the return expression of `cur` is completed on the next line.
	cur: function() {
		var hooks = Tween.propHooks[ this.prop ];

		return hooks && hooks.get ?
// NOTE: completes the `return hooks && hooks.get ?` expression of Tween.prototype.cur,
// whose head ends the previous line.
			hooks.get( this ) :
			Tween.propHooks._default.get( this );
	},

	// Advance the tween to `percent`: apply the easing (when a duration is set),
	// compute `now`, call the optional step callback, then write the value through the
	// property's set hook or the default one. Returns the tween.
	run: function( percent ) {
		var eased,
			hooks = Tween.propHooks[ this.prop ];

		if ( this.options.duration ) {
			this.pos = eased = jQuery.easing[ this.easing ](
				percent, this.options.duration * percent, 0, 1, this.options.duration
			);
		} else {
			this.pos = eased = percent;
		}
		this.now = ( this.end - this.start ) * eased + this.start;

		if ( this.options.step ) {
			this.options.step.call( this.elem, this.now, this );
		}

		if ( hooks && hooks.set ) {
			hooks.set( this );
		} else {
			Tween.propHooks._default.set( this );
		}
		return this;
	}
};

// Make objects created by `new Tween.prototype.init( ... )` share Tween.prototype
Tween.prototype.init.prototype = Tween.prototype;

Tween.propHooks = {
	_default: {
		get: function( tween ) {
			var result;

			// Use a property on the element directly when it is not a DOM element,
			// or when there is no matching style property that exists.
			if ( tween.elem.nodeType !== 1 ||
				tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) {
				return tween.elem[ tween.prop ];
			}

			// Passing an empty string as a 3rd parameter to .css will automatically
			// attempt a parseFloat and fallback to a string if the parse fails.
			// Simple values such as "10px" are parsed to Float;
			// complex values such as "rotate(1rad)" are returned as-is.
			result = jQuery.css( tween.elem, tween.prop, "" );

			// Empty strings, null, undefined and "auto" are converted to 0.
			return !result || result === "auto" ? 0 : result;
		},

		// NOTE: the body of `set` continues on the next line.
		set: function( tween ) {

			// Use step hook for back compat.
			// Use cssHook if it's there.
			// Use .style if available and use plain properties where available.
+ if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && ( + jQuery.cssHooks[ tween.prop ] || + tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = Date.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
// NOTE: completes the `includeWidth = includeWidth ?` assignment of genFx, whose head
// ends the previous line. genFx returns a property map in which height plus the
// margin/padding sides visited by the loop (and, with includeWidth, opacity and width)
// are all set to `type`.
		1 : 0;
	for ( ; i < 4; i += 2 - includeWidth ) {
		which = cssExpand[ i ];
		attrs[ "margin" + which ] = attrs[ "padding" + which ] = type;
	}

	if ( includeWidth ) {
		attrs.opacity = attrs.width = type;
	}

	return attrs;
}

// Ask the property-specific tweeners, then the "*" tweeners, for a tween and return
// the first truthy result (undefined when none produces one).
function createTween( value, prop, animation ) {
	var tween,
		collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ),
		index = 0,
		length = collection.length;
	for ( ; index < length; index++ ) {
		if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) {

			// We're done with this property
			return tween;
		}
	}
}

// Default animation prefilter, called with the animation as `this`. Handles queue-less
// animations, "show"/"hide"/"toggle" values, and overflow/display handling while
// width or height is animated.
// NOTE: the function continues beyond the end of this block.
function defaultPrefilter( elem, props, opts ) {
	var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display,
		isBox = "width" in props || "height" in props,
		anim = this,
		orig = {},
		style = elem.style,
		hidden = elem.nodeType && isHiddenWithinTree( elem ),
		dataShow = dataPriv.get( elem, "fxshow" );

	// Queue-skipping animations hijack the fx hooks
	if ( !opts.queue ) {
		hooks = jQuery._queueHooks( elem, "fx" );
		if ( hooks.unqueued == null ) {
			hooks.unqueued = 0;
			oldfire = hooks.empty.fire;

			// Only fire the original "empty" callback once no unqueued animation remains
			hooks.empty.fire = function() {
				if ( !hooks.unqueued ) {
					oldfire();
				}
			};
		}
		hooks.unqueued++;

		anim.always( function() {

			// Ensure the complete handler is called before this completes
			anim.always( function() {
				hooks.unqueued--;
				if ( !jQuery.queue( elem, "fx" ).length ) {
					hooks.empty.fire();
				}
			} );
		} );
	}

	// Detect show/hide animations
	for ( prop in props ) {
		value = props[ prop ];
		if ( rfxtypes.test( value ) ) {
			delete props[ prop ];
			toggle = toggle || value === "toggle";
			if ( value === ( hidden ? "hide" : "show" ) ) {

				// Pretend to be hidden if this is a "show" and
				// there is still data from a stopped show/hide
				if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) {
					hidden = true;

				// Ignore all other no-op show/hide data
				} else {
					continue;
				}
			}
			orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop );
		}
	}

	// Bail out if this is a no-op like .hide().hide()
	propTween = !jQuery.isEmptyObject( props );
	if ( !propTween && jQuery.isEmptyObject( orig ) ) {
		return;
	}

	// Restrict "overflow" and "display" styles during box animations
	if ( isBox && elem.nodeType === 1 ) {

		// Support: IE <=9 - 11, Edge 12 - 15
		// Record all 3 overflow attributes because IE does not infer the shorthand
		// from identically-valued overflowX and overflowY and Edge just mirrors
		// the overflowX value there.
		opts.overflow = [ style.overflow, style.overflowX, style.overflowY ];

		// Identify a display type, preferring old show/hide data over the CSS cascade
		restoreDisplay = dataShow && dataShow.display;
		if ( restoreDisplay == null ) {
			restoreDisplay = dataPriv.get( elem, "display" );
		}
		display = jQuery.css( elem, "display" );
		if ( display === "none" ) {
			if ( restoreDisplay ) {
				display = restoreDisplay;
			} else {

				// Get nonempty value(s) by temporarily forcing visibility
				showHide( [ elem ], true );
				restoreDisplay = elem.style.display || restoreDisplay;
				display = jQuery.css( elem, "display" );
				showHide( [ elem ] );
			}
		}

		// Animate inline elements as inline-block
		if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) {
			if ( jQuery.css( elem, "float" ) === "none" ) {

				// Restore the original display value at the end of pure show/hide animations
				if ( !propTween ) {
					anim.done( function() {
						style.display = restoreDisplay;
					} );
					if ( restoreDisplay == null ) {
						display = style.display;
						restoreDisplay = display === "none" ? "" : display;
					}
				}
				style.display = "inline-block";
			}
		}
	}

	// Hide overflow for the duration of the animation and restore the three recorded
	// values whenever it ends
	if ( opts.overflow ) {
		style.overflow = "hidden";
		anim.always( function() {
			style.overflow = opts.overflow[ 0 ];
			style.overflowX = opts.overflow[ 1 ];
			style.overflowY = opts.overflow[ 2 ];
		} );
	}

	// Implement show/hide animations
	propTween = false;
	for ( prop in orig ) {

		// General show/hide setup for this element animation
		// (runs on the first iteration only: propTween is falsy until a tween is created)
		if ( !propTween ) {
			if ( dataShow ) {
				if ( "hidden" in dataShow ) {
					hidden = dataShow.hidden;
				}
			} else {
				dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } );
			}

			// Store hidden/visible for toggle so `.stop().toggle()` "reverses"
			if ( toggle ) {
				dataShow.hidden = !hidden;
			}

			// Show elements before animating them
			if ( hidden ) {
				showHide( [ elem ], true );
			}

			/* eslint-disable no-loop-func */

			anim.done( function() {

			/* eslint-enable no-loop-func */

				// The final step of a "hide" animation is actually hiding the element
				if ( !hidden ) {
					showHide( [ elem ] );
				}
				dataPriv.remove( elem, "fxshow" );
				for ( prop in orig ) {
					jQuery.style( elem, prop, orig[ prop ] );
				}
			} );
		}

		// Per-property setup
		// NOTE: the createTween( ... ) call is completed on the next line.
		propTween = createTween( hidden ?
// NOTE: completes the `propTween = createTween( hidden ?` call at the end of
// defaultPrefilter, whose head ends the previous line.
			dataShow[ prop ] : 0, prop, anim );
		if ( !( prop in dataShow ) ) {
			dataShow[ prop ] = propTween.start;

			// Showing: animate from 0 up to the value the tween started with
			if ( hidden ) {
				propTween.end = propTween.start;
				propTween.start = 0;
			}
		}
	}
}

// Normalize an animation property map in place: camelCase the keys, split
// [ value, easing ] pairs, and expand shorthand properties through cssHooks `expand`.
// Per-property easings are recorded in `specialEasing`.
function propFilter( props, specialEasing ) {
	var index, name, easing, value, hooks;

	// camelCase, specialEasing and expand cssHook pass
	for ( index in props ) {
		name = camelCase( index );
		easing = specialEasing[ name ];
		value = props[ index ];
		if ( Array.isArray( value ) ) {
			easing = value[ 1 ];
			value = props[ index ] = value[ 0 ];
		}

		if ( index !== name ) {
			props[ name ] = value;
			delete props[ index ];
		}

		hooks = jQuery.cssHooks[ name ];
		if ( hooks && "expand" in hooks ) {
			value = hooks.expand( value );
			delete props[ name ];

			// Not quite $.extend, this won't overwrite existing keys.
			// Reusing 'index' because we have the correct "name"
			for ( index in value ) {
				if ( !( index in props ) ) {
					props[ index ] = value[ index ];
					specialEasing[ index ] = easing;
				}
			}
		} else {
			specialEasing[ name ] = easing;
		}
	}
}

// Create and start an animation of `properties` on `elem`.
// NOTE: the function continues on the following lines.
function Animation( elem, properties, options ) {
	var result,
		stopped,
		index = 0,
		length = Animation.prefilters.length,
		deferred = jQuery.Deferred().always( function() {

			// Don't match elem in the :animated selector
			delete tick.elem;
		} ),

		// Timer callback: runs every tween at the current progress and returns the
		// remaining time while work is left, or false once finished or stopped.
		tick = function() {
			if ( stopped ) {
				return false;
			}
			var currentTime = fxNow || createFxNow(),
				remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ),

				// Support: Android 2.3 only
				// Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497)
				temp = remaining / animation.duration || 0,
				percent = 1 - temp,
				index = 0,
				length = animation.tweens.length;

			for ( ; index < length; index++ ) {
				animation.tweens[ index ].run( percent );
			}

			deferred.notifyWith( elem, [ animation, percent, remaining ] );

			// If there's more to do, yield
			// NOTE(review): the operand of this `return` (`remaining;`) opens the next line.
			// That break is an artifact of how this text was extracted; in real JavaScript
			// a newline directly after `return` triggers automatic semicolon insertion, so
			// the two must be rejoined on one line if this code is ever executed.
			if ( percent < 1 && length ) {
				return
// NOTE(review): `remaining;` is the operand of the `return` that ends the previous line,
// inside Animation's `tick` function. The split is an extraction artifact; the two
// belong on one line (`return remaining;`).
					remaining;
			}

			// If this was an empty animation, synthesize a final progress notification
			if ( !length ) {
				deferred.notifyWith( elem, [ animation, 1, 0 ] );
			}

			// Resolve the animation and report its conclusion
			deferred.resolveWith( elem, [ animation ] );
			return false;
		},

		// The animation object: the deferred's promise extended with the animation state
		// and the createTween/stop methods
		animation = deferred.promise( {
			elem: elem,
			props: jQuery.extend( {}, properties ),
			opts: jQuery.extend( true, {
				specialEasing: {},
				easing: jQuery.easing._default
			}, options ),
			originalProperties: properties,
			originalOptions: options,
			startTime: fxNow || createFxNow(),
			duration: options.duration,
			tweens: [],

			// Create a tween for `prop`, using its special easing when one is recorded,
			// and register it on the animation
			createTween: function( prop, end ) {
				var tween = jQuery.Tween( elem, animation.opts, prop, end,
					animation.opts.specialEasing[ prop ] || animation.opts.easing );
				animation.tweens.push( tween );
				return tween;
			},

			// NOTE: the body of `stop` continues on the next line.
			stop: function( gotoEnd ) {
				var index = 0,

					// If we are going to the end, we want to run all the tweens
					// otherwise we skip this part
					length = gotoEnd ?
// NOTE: completes the `length = gotoEnd ?` declaration inside the animation's `stop`
// method, whose head ends the previous line.
						animation.tweens.length : 0;

				// Stopping twice is a no-op
				if ( stopped ) {
					return this;
				}
				stopped = true;
				for ( ; index < length; index++ ) {
					animation.tweens[ index ].run( 1 );
				}

				// Resolve when we played the last frame; otherwise, reject
				if ( gotoEnd ) {
					deferred.notifyWith( elem, [ animation, 1, 0 ] );
					deferred.resolveWith( elem, [ animation, gotoEnd ] );
				} else {
					deferred.rejectWith( elem, [ animation, gotoEnd ] );
				}
				return this;
			}
		} ),
		props = animation.props;

	propFilter( props, animation.opts.specialEasing );

	// Run the prefilters; the first one that returns a truthy result takes over the
	// animation and its result is returned instead
	for ( ; index < length; index++ ) {
		result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts );
		if ( result ) {
			if ( isFunction( result.stop ) ) {
				jQuery._queueHooks( animation.elem, animation.opts.queue ).stop =
					result.stop.bind( result );
			}
			return result;
		}
	}

	// Create a tween for every remaining property
	jQuery.map( props, createTween, animation );

	if ( isFunction( animation.opts.start ) ) {
		animation.opts.start.call( elem, animation );
	}

	// Attach callbacks from options
	animation
		.progress( animation.opts.progress )
		.done( animation.opts.done, animation.opts.complete )
		.fail( animation.opts.fail )
		.always( animation.opts.always );

	// Register `tick`, tagged with the element, animation and queue, as a timer
	jQuery.fx.timer(
		jQuery.extend( tick, {
			elem: elem,
			anim: animation,
			queue: animation.opts.queue
		} )
	);

	return animation;
}

// NOTE: this object literal continues on the next line (prefilters, prefilter).
jQuery.Animation = jQuery.extend( Animation, {

	tweeners: {

		// Default tweener: create the tween and let adjustCSS process the parsed value
		"*": [ function( prop, value ) {
			var tween = this.createTween( prop, value );
			adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween );
			return tween;
		} ]
	},

	// Register `callback` at the front of the tweener list of each property named in
	// `props`; when only a function is passed it is registered for "*"
	tweener: function( props, callback ) {
		if ( isFunction( props ) ) {
			callback = props;
			props = [ "*" ];
		} else {
			props = props.match( rnothtmlwhite );
		}

		var prop,
			index = 0,
			length = props.length;

		for ( ; index < length; index++ ) {
			prop = props[ index ];
			Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || [];
			Animation.tweeners[ prop ].unshift( callback );
		}
	},

// NOTE: resumes the object literal passed to jQuery.extend( Animation, { ... } ), which
// opens on the previous line.
	prefilters: [ defaultPrefilter ],

	// Add a prefilter at the front (`prepend` truthy) or at the end of the list
	prefilter: function( callback, prepend ) {
		if ( prepend ) {
			Animation.prefilters.unshift( callback );
		} else {
			Animation.prefilters.push( callback );
		}
	}
} );

// Normalize the ( speed, easing, fn ) argument forms into an options object with
// `duration`, `easing`, `queue` and a `complete` callback that also dequeues.
jQuery.speed = function( speed, easing, fn ) {

	// An options object is copied; otherwise the options are derived from the
	// positional arguments
	var opt = speed && typeof speed === "object" ? jQuery.extend( {}, speed ) : {
		complete: fn || !fn && easing ||
			isFunction( speed ) && speed,
		duration: speed,
		easing: fn && easing || easing && !isFunction( easing ) && easing
	};

	// Go to the end state if fx are off
	if ( jQuery.fx.off ) {
		opt.duration = 0;

	} else {

		// Non-numeric durations are looked up in jQuery.fx.speeds, with _default as fallback
		if ( typeof opt.duration !== "number" ) {
			if ( opt.duration in jQuery.fx.speeds ) {
				opt.duration = jQuery.fx.speeds[ opt.duration ];

			} else {
				opt.duration = jQuery.fx.speeds._default;
			}
		}
	}

	// Normalize opt.queue - true/undefined/null -> "fx"
	if ( opt.queue == null || opt.queue === true ) {
		opt.queue = "fx";
	}

	// Queueing
	opt.old = opt.complete;

	// Wrap the original callback so the next queued function runs afterwards
	opt.complete = function() {
		if ( isFunction( opt.old ) ) {
			opt.old.call( this );
		}

		if ( opt.queue ) {
			jQuery.dequeue( this, opt.queue );
		}
	};

	return opt;
};

jQuery.fn.extend( {
	fadeTo: function( speed, to, easing, callback ) {

		// Show any hidden elements after setting opacity to 0
		return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show()

			// Animate to the value specified
			.end().animate( { opacity: to }, speed, easing, callback );
	},

	// NOTE: the return expression of `animate` is completed on the next line.
	animate: function( prop, speed, easing, callback ) {
		var empty = jQuery.isEmptyObject( prop ),
			optall = jQuery.speed( speed, easing, callback ),
			doAnimation = function() {

				// Operate on a copy of prop so per-property easing won't be lost
				var anim = Animation( this, jQuery.extend( {}, prop ), optall );

				// Empty animations, or finishing resolves immediately
				if ( empty || dataPriv.get( this, "finish" ) ) {
					anim.stop( true );
				}
			};
		doAnimation.finish = doAnimation;

		return empty || optall.queue
=== false ?
			this.each( doAnimation ) :
			this.queue( optall.queue, doAnimation );
	},

	// Stop running animations on the matched elements.
	// Signature is ( [type], [clearQueue], [gotoEnd] ); when `type` is not a
	// string the arguments are shifted one position to the left.
	stop: function( type, clearQueue, gotoEnd ) {

		// Invoke a queue hook's stop callback once, removing it first
		var stopQueue = function( hooks ) {
			var stop = hooks.stop;
			delete hooks.stop;
			stop( gotoEnd );
		};

		if ( typeof type !== "string" ) {
			gotoEnd = clearQueue;
			clearQueue = type;
			type = undefined;
		}
		if ( clearQueue ) {
			this.queue( type || "fx", [] );
		}

		return this.each( function() {
			var dequeue = true,
				index = type != null && type + "queueHooks",
				timers = jQuery.timers,
				data = dataPriv.get( this );

			// With an explicit type only that queue's hooks are stopped;
			// otherwise every private-data key matched by rrun is considered
			if ( index ) {
				if ( data[ index ] && data[ index ].stop ) {
					stopQueue( data[ index ] );
				}
			} else {
				for ( index in data ) {
					if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) {
						stopQueue( data[ index ] );
					}
				}
			}

			// Walk backwards so splice() does not disturb unvisited entries
			for ( index = timers.length; index--; ) {
				if ( timers[ index ].elem === this &&
					( type == null || timers[ index ].queue === type ) ) {

					timers[ index ].anim.stop( gotoEnd );
					dequeue = false;
					timers.splice( index, 1 );
				}
			}

			// Start the next in the queue if the last step wasn't forced.
			// Timers currently will call their complete callbacks, which
			// will dequeue but only if they were gotoEnd.
			if ( dequeue || !gotoEnd ) {
				jQuery.dequeue( this, type );
			}
		} );
	},

	// Jump every animation of the given queue (default "fx") to its end:
	// empties the queue, stops active timers with gotoEnd, then calls the
	// `finish` hook of each function that was still queued.
	finish: function( type ) {
		if ( type !== false ) {
			type = type || "fx";
		}
		return this.each( function() {
			var index,
				data = dataPriv.get( this ),
				queue = data[ type + "queue" ],
				hooks = data[ type + "queueHooks" ],
				timers = jQuery.timers,
				length = queue ?
queue.length : 0;

			// Enable finishing flag on private data
			data.finish = true;

			// Empty the queue first
			jQuery.queue( this, type, [] );

			if ( hooks && hooks.stop ) {
				hooks.stop.call( this, true );
			}

			// Look for any active animations, and finish them
			for ( index = timers.length; index--; ) {
				if ( timers[ index ].elem === this && timers[ index ].queue === type ) {
					timers[ index ].anim.stop( true );
					timers.splice( index, 1 );
				}
			}

			// Look for any animations in the old queue and finish them
			for ( index = 0; index < length; index++ ) {
				if ( queue[ index ] && queue[ index ].finish ) {
					queue[ index ].finish.call( this );
				}
			}

			// Turn off finishing flag
			delete data.finish;
		} );
	}
} );

// Wrap the existing toggle/show/hide methods: with no speed (or a boolean)
// the original CSS-only method runs, otherwise the call becomes an animation.
jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) {
	var cssFn = jQuery.fn[ name ];
	jQuery.fn[ name ] = function( speed, easing, callback ) {
		return speed == null || typeof speed === "boolean" ?
			cssFn.apply( this, arguments ) :
			this.animate( genFx( name, true ), speed, easing, callback );
	};
} );

// Generate shortcuts for custom animations
jQuery.each( {
	slideDown: genFx( "show" ),
	slideUp: genFx( "hide" ),
	slideToggle: genFx( "toggle" ),
	fadeIn: { opacity: "show" },
	fadeOut: { opacity: "hide" },
	fadeToggle: { opacity: "toggle" }
}, function( name, props ) {
	jQuery.fn[ name ] = function( speed, easing, callback ) {
		return this.animate( props, speed, easing, callback );
	};
} );

// Active timer functions; each is called once per tick
jQuery.timers = [];

// Run every registered timer once. A timer returning a falsy value is
// removed; the scheduler is stopped when no timers remain.
jQuery.fx.tick = function() {
	var timer,
		i = 0,
		timers = jQuery.timers;

	// Shared timestamp for all timers run during this tick
	fxNow = Date.now();

	for ( ; i < timers.length; i++ ) {
		timer = timers[ i ];

		// Run the timer and safely remove it when done (allowing for external removal)
		if ( !timer() && timers[ i ] === timer ) {
			timers.splice( i--, 1 );
		}
	}

	if ( !timers.length ) {
		jQuery.fx.stop();
	}
	fxNow = undefined;
};

// Register a timer function and make sure the tick loop is running
jQuery.fx.timer = function( timer ) {
	jQuery.timers.push( timer );
jQuery.fx.start();
};

jQuery.fx.interval = 13;

// Start the tick loop unless it is already in progress
jQuery.fx.start = function() {
	if ( inProgress ) {
		return;
	}

	inProgress = true;
	schedule();
};

// Clear the in-progress flag
jQuery.fx.stop = function() {
	inProgress = null;
};

// Named durations accepted wherever a speed is expected (milliseconds)
jQuery.fx.speeds = {
	slow: 600,
	fast: 200,

	// Default speed
	_default: 400
};


// Based off of the plugin by Clint Helfers, with permission.
// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/
// Queue a pause of `time` (a number, or a jQuery.fx.speeds name) on the
// given queue (default "fx"); the queue hook's stop cancels the timeout.
jQuery.fn.delay = function( time, type ) {
	time = jQuery.fx ? jQuery.fx.speeds[ time ] || time : time;
	type = type || "fx";

	return this.queue( type, function( next, hooks ) {
		var timeout = window.setTimeout( next, time );
		hooks.stop = function() {
			window.clearTimeout( timeout );
		};
	} );
};


// Feature detection for form-control quirks; results are stored on `support`
( function() {
	var input = document.createElement( "input" ),
		select = document.createElement( "select" ),
		opt = select.appendChild( document.createElement( "option" ) );

	input.type = "checkbox";

	// Support: Android <=4.3 only
	// Default value for a checkbox should be "on"
	support.checkOn = input.value !== "";

	// Support: IE <=11 only
	// Must access selectedIndex to make default options select
	support.optSelected = opt.selected;

	// Support: IE <=11 only
	// An input loses its value after becoming a radio
	input = document.createElement( "input" );
	input.value = "t";
	input.type = "radio";
	support.radioValue = input.value === "t";
} )();


var boolHook,
	attrHandle = jQuery.expr.attrHandle;

jQuery.fn.extend( {

	// Get or set an attribute on the matched set; delegates to jQuery.attr via access()
	attr: function( name, value ) {
		return access( this, jQuery.attr, name, value, arguments.length > 1 );
	},

	// Remove one or more attributes from every matched element
	removeAttr: function( name ) {
		return this.each( function() {
			jQuery.removeAttr( this, name );
		} );
	}
} );

jQuery.extend( {

	// Get (value === undefined), remove (value === null) or set a single
	// attribute on one element, consulting jQuery.attrHooks / boolHook first.
	attr: function( elem, name, value ) {
		var ret, hooks,
			nType = elem.nodeType;

		// Don't get/set attributes on text, comment and attribute nodes
		if ( nType === 3 || nType === 8 ||
nType === 2 ) {
			return;
		}

		// Fallback to prop when attributes are not supported
		if ( typeof elem.getAttribute === "undefined" ) {
			return jQuery.prop( elem, name, value );
		}

		// Attribute hooks are determined by the lowercase version
		// Grab necessary hook if one is defined
		if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) {
			hooks = jQuery.attrHooks[ name.toLowerCase() ] ||
				( jQuery.expr.match.bool.test( name ) ? boolHook : undefined );
		}

		if ( value !== undefined ) {

			// Setting null removes the attribute
			if ( value === null ) {
				jQuery.removeAttr( elem, name );
				return;
			}

			// A hook's set() returning undefined falls through to setAttribute
			if ( hooks && "set" in hooks &&
				( ret = hooks.set( elem, value, name ) ) !== undefined ) {
				return ret;
			}

			elem.setAttribute( name, value + "" );
			return value;
		}

		// A hook's get() returning null falls through to the default lookup
		if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) {
			return ret;
		}

		ret = jQuery.find.attr( elem, name );

		// Non-existent attributes return null, we normalize to undefined
		return ret == null ? undefined : ret;
	},

	attrHooks: {
		type: {

			// When support.radioValue is false, switching an input to "radio"
			// would drop its value, so the value is saved and restored here.
			set: function( elem, value ) {
				if ( !support.radioValue && value === "radio" &&
					nodeName( elem, "input" ) ) {
					var val = elem.value;
					elem.setAttribute( "type", value );
					if ( val ) {
						elem.value = val;
					}
					return value;
				}
			}
		}
	},

	// Remove each whitespace-separated attribute name in `value` from an element node
	removeAttr: function( elem, value ) {
		var name,
			i = 0,

			// Attribute names can contain non-HTML whitespace characters
			// https://html.spec.whatwg.org/multipage/syntax.html#attributes-2
			attrNames = value && value.match( rnothtmlwhite );

		if ( attrNames && elem.nodeType === 1 ) {
			while ( ( name = attrNames[ i++ ] ) ) {
				elem.removeAttribute( name );
			}
		}
	}
} );

// Hooks for boolean attributes
boolHook = {

	// false removes the attribute; any other value sets it to its own name
	set: function( elem, value, name ) {
		if ( value === false ) {

			// Remove boolean attributes when set to false
			jQuery.removeAttr( elem, name );
		} else {
			elem.setAttribute( name, name );
		}
		return name;
	}
};

jQuery.each(
jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) {

	// Previously registered getter for this boolean attribute, if any
	var getter = attrHandle[ name ] || jQuery.find.attr;

	// Boolean attribute getter: yields the lowercased attribute name when the
	// attribute is present and null otherwise (undefined for XML documents).
	attrHandle[ name ] = function( elem, name, isXML ) {
		var ret, handle,
			lowercaseName = name.toLowerCase();

		if ( !isXML ) {

			// Avoid an infinite loop by temporarily removing this function from the getter
			handle = attrHandle[ lowercaseName ];
			attrHandle[ lowercaseName ] = ret;
			ret = getter( elem, name, isXML ) != null ?
				lowercaseName :
				null;
			attrHandle[ lowercaseName ] = handle;
		}
		return ret;
	};
} );




var rfocusable = /^(?:input|select|textarea|button)$/i,
	rclickable = /^(?:a|area)$/i;

jQuery.fn.extend( {

	// Get or set a property on the matched set; delegates to jQuery.prop via access()
	prop: function( name, value ) {
		return access( this, jQuery.prop, name, value, arguments.length > 1 );
	},

	// Delete a property (after propFix name mapping) from every matched element
	removeProp: function( name ) {
		return this.each( function() {
			delete this[ jQuery.propFix[ name ] || name ];
		} );
	}
} );

jQuery.extend( {

	// Get (value === undefined) or set a single property on one element,
	// applying jQuery.propFix name mapping and jQuery.propHooks.
	prop: function( elem, name, value ) {
		var ret, hooks,
			nType = elem.nodeType;

		// Don't get/set properties on text, comment and attribute nodes
		if ( nType === 3 || nType === 8 || nType === 2 ) {
			return;
		}

		if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) {

			// Fix name and attach hooks
			name = jQuery.propFix[ name ] || name;
			hooks = jQuery.propHooks[ name ];
		}

		if ( value !== undefined ) {

			// A hook's set() returning undefined falls through to plain assignment
			if ( hooks && "set" in hooks &&
				( ret = hooks.set( elem, value, name ) ) !== undefined ) {
				return ret;
			}

			return ( elem[ name ] = value );
		}

		// A hook's get() returning null falls through to the plain property read
		if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) {
			return ret;
		}

		return elem[ name ];
	},

	propHooks: {
		tabIndex: {
			get: function( elem ) {

				// Support: IE <=9 - 11 only
				// elem.tabIndex doesn't always return the
				// correct value when it hasn't been explicitly set
				// https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/
				// Use
// proper attribute retrieval(#12072)
				var tabindex = jQuery.find.attr( elem, "tabindex" );

				// An explicit tabindex attribute wins
				if ( tabindex ) {
					return parseInt( tabindex, 10 );
				}

				// Otherwise 0 for form controls and for a/area elements with an href
				if (
					rfocusable.test( elem.nodeName ) ||
					rclickable.test( elem.nodeName ) &&
					elem.href
				) {
					return 0;
				}

				return -1;
			}
		}
	},

	// Attribute-style names mapped to their DOM property names
	propFix: {
		"for": "htmlFor",
		"class": "className"
	}
} );

// Support: IE <=11 only
// Accessing the selectedIndex property
// forces the browser to respect setting selected
// on the option
// The getter ensures a default option is selected
// when in an optgroup
// eslint rule "no-unused-expressions" is disabled for this code
// since it considers such accessions noop
if ( !support.optSelected ) {
	jQuery.propHooks.selected = {
		get: function( elem ) {

			/* eslint no-unused-expressions: "off" */

			var parent = elem.parentNode;
			if ( parent && parent.parentNode ) {
				parent.parentNode.selectedIndex;
			}

			// null makes jQuery.prop fall back to reading elem.selected
			return null;
		},
		set: function( elem ) {

			/* eslint no-unused-expressions: "off" */

			var parent = elem.parentNode;
			if ( parent ) {
				parent.selectedIndex;

				if ( parent.parentNode ) {
					parent.parentNode.selectedIndex;
				}
			}
		}
	};
}

// Register lowercase aliases (e.g. "tabindex" -> "tabIndex") in propFix
jQuery.each( [
	"tabIndex",
	"readOnly",
	"maxLength",
	"cellSpacing",
	"cellPadding",
	"rowSpan",
	"colSpan",
	"useMap",
	"frameBorder",
	"contentEditable"
], function() {
	jQuery.propFix[ this.toLowerCase() ] = this;
} );




	// Strip and collapse whitespace according to HTML spec
	// https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace
	// Returns the rnothtmlwhite tokens of `value` joined by single spaces.
	function stripAndCollapse( value ) {
		var tokens = value.match( rnothtmlwhite ) || [];
		return tokens.join( " " );
	}


// Return the element's "class" attribute, or "" when absent or unsupported
function getClass( elem ) {
	return elem.getAttribute && elem.getAttribute( "class" ) || "";
}

// Normalize a class argument to an array of class names:
// arrays pass through unchanged, strings are split on rnothtmlwhite tokens,
// anything else yields an empty array.
function classesToArray( value ) {
	if ( Array.isArray( value ) ) {
		return value;
	}
	if ( typeof value === "string" ) {
		return value.match( rnothtmlwhite ) || [];
	}
	return [];
}

jQuery.fn.extend( {

	// Add the given class name(s) to every matched element.
	// `value` may be a string, an array, or a function( index, currentClass )
	// whose return value is then added.
	addClass: function( value ) {
		var classes, elem, cur, curValue, clazz, j, finalValue,
			i = 0;

		if ( isFunction( value ) ) {
			return this.each( function( j ) {
				jQuery( this ).addClass( value.call( this, j, getClass( this ) ) );
			} );
		}

		classes = classesToArray( value );

		if ( classes.length ) {
			while ( ( elem = this[ i++ ] ) ) {
				curValue = getClass( elem );

				// Pad with spaces so " name " matches whole class names only;
				// `cur` is false for non-element nodes, which are skipped
				cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " );

				if ( cur ) {
					j = 0;
					while ( ( clazz = classes[ j++ ] ) ) {
						if ( cur.indexOf( " " + clazz + " " ) < 0 ) {
							cur += clazz + " ";
						}
					}

					// Only assign if different to avoid unneeded rendering.
					finalValue = stripAndCollapse( cur );
					if ( curValue !== finalValue ) {
						elem.setAttribute( "class", finalValue );
					}
				}
			}
		}

		return this;
	},

	// Remove the given class name(s) from every matched element; called with
	// no arguments it sets the class attribute to "".
	removeClass: function( value ) {
		var classes, elem, cur, curValue, clazz, j, finalValue,
			i = 0;

		if ( isFunction( value ) ) {
			return this.each( function( j ) {
				jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) );
			} );
		}

		if ( !arguments.length ) {
			return this.attr( "class", "" );
		}

		classes = classesToArray( value );

		if ( classes.length ) {
			while ( ( elem = this[ i++ ] ) ) {
				curValue = getClass( elem );

				// This expression is here for better compressibility (see addClass)
				cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " );

				if ( cur ) {
					j = 0;
					while ( ( clazz = classes[ j++ ] ) ) {

						// Remove *all* instances
						while ( cur.indexOf( " " + clazz + " " ) > -1 ) {
							cur = cur.replace( " " + clazz + " ", " " );
						}
					}

					// Only assign if different to avoid unneeded rendering.
finalValue = stripAndCollapse( cur );
					if ( curValue !== finalValue ) {
						elem.setAttribute( "class", finalValue );
					}
				}
			}
		}

		return this;
	},

	// Toggle class name(s) on every matched element.
	// - ( names, boolean ) forces add (true) or remove (false)
	// - ( function ) toggles the names returned per element
	// - ( names ) toggles each name individually
	// - () or ( boolean ) toggles the whole class attribute, remembering the
	//   previous value in private data under "__className__"
	toggleClass: function( value, stateVal ) {
		var type = typeof value,
			isValidValue = type === "string" || Array.isArray( value );

		if ( typeof stateVal === "boolean" && isValidValue ) {
			return stateVal ? this.addClass( value ) : this.removeClass( value );
		}

		if ( isFunction( value ) ) {
			return this.each( function( i ) {
				jQuery( this ).toggleClass(
					value.call( this, i, getClass( this ), stateVal ),
					stateVal
				);
			} );
		}

		return this.each( function() {
			var className, i, self, classNames;

			if ( isValidValue ) {

				// Toggle individual class names
				i = 0;
				self = jQuery( this );
				classNames = classesToArray( value );

				while ( ( className = classNames[ i++ ] ) ) {

					// Check each className given, space separated list
					if ( self.hasClass( className ) ) {
						self.removeClass( className );
					} else {
						self.addClass( className );
					}
				}

			// Toggle whole class name
			} else if ( value === undefined || type === "boolean" ) {
				className = getClass( this );
				if ( className ) {

					// Store className if set
					dataPriv.set( this, "__className__", className );
				}

				// If the element has a class name or if we're passed `false`,
				// then remove the whole classname (if there was one, the above saved it).
				// Otherwise bring back whatever was previously saved (if anything),
				// falling back to the empty string if nothing was stored.
				if ( this.setAttribute ) {
					this.setAttribute( "class",
						className || value === false ?
"" :
							dataPriv.get( this, "__className__" ) || ""
					);
				}
			}
		} );
	},

	// True when any matched element node carries the class `selector`
	hasClass: function( selector ) {
		var className, elem,
			i = 0;

		// Space padding restricts the indexOf match to whole class names
		className = " " + selector + " ";
		while ( ( elem = this[ i++ ] ) ) {
			if ( elem.nodeType === 1 &&
				( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) {
				return true;
			}
		}

		return false;
	}
} );




var rreturn = /\r/g;

jQuery.fn.extend( {

	// Getter (no arguments): value of the first matched element, via
	// jQuery.valHooks when a hook applies; strings have "\r" removed and
	// null/undefined becomes "".
	// Setter: assigns `value` (or the result of value( index, current )) to
	// every matched element node, via jQuery.valHooks when a hook applies.
	val: function( value ) {
		var hooks, ret, valueIsFunction,
			elem = this[ 0 ];

		if ( !arguments.length ) {
			if ( elem ) {

				// Hooks are looked up by input type first, then by node name
				hooks = jQuery.valHooks[ elem.type ] ||
					jQuery.valHooks[ elem.nodeName.toLowerCase() ];

				if ( hooks &&
					"get" in hooks &&
					( ret = hooks.get( elem, "value" ) ) !== undefined
				) {
					return ret;
				}

				ret = elem.value;

				// Handle most common string cases
				if ( typeof ret === "string" ) {
					return ret.replace( rreturn, "" );
				}

				// Handle cases where value is null/undef or number
				return ret == null ? "" : ret;
			}

			return;
		}

		valueIsFunction = isFunction( value );

		return this.each( function( i ) {
			var val;

			if ( this.nodeType !== 1 ) {
				return;
			}

			if ( valueIsFunction ) {
				val = value.call( this, i, jQuery( this ).val() );
			} else {
				val = value;
			}

			// Treat null/undefined as ""; convert numbers to string
			if ( val == null ) {
				val = "";

			} else if ( typeof val === "number" ) {
				val += "";

			} else if ( Array.isArray( val ) ) {
				val = jQuery.map( val, function( value ) {
					return value == null ? "" : value + "";
				} );
			}

			hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ];

			// If set returns undefined, fall back to normal setting
			if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) {
				this.value = val;
			}
		} );
	}
} );

jQuery.extend( {
	valHooks: {
		option: {

			// Value attribute when present, otherwise the whitespace-collapsed text
			get: function( elem ) {

				var val = jQuery.find.attr( elem, "value" );
				return val != null ?
val :

					// Support: IE <=10 - 11 only
					// option.text throws exceptions (#14686, #14858)
					// Strip and collapse whitespace
					// https://html.spec.whatwg.org/#strip-and-collapse-whitespace
					stripAndCollapse( jQuery.text( elem ) );
			}
		},
		select: {

			// For "select-one" returns the selected option's value (or null);
			// otherwise returns an array of the selected, enabled options' values.
			get: function( elem ) {
				var value, option, i,
					options = elem.options,
					index = elem.selectedIndex,
					one = elem.type === "select-one",
					values = one ? null : [],
					max = one ? index + 1 : options.length;

				// No selection: start at max so the loop body never runs
				if ( index < 0 ) {
					i = max;

				} else {
					i = one ? index : 0;
				}

				// Loop through all the selected options
				for ( ; i < max; i++ ) {
					option = options[ i ];

					// Support: IE <=9 only
					// IE8-9 doesn't update selected after form reset (#2551)
					if ( ( option.selected || i === index ) &&

							// Don't return options that are disabled or in a disabled optgroup
							!option.disabled &&
							( !option.parentNode.disabled ||
								!nodeName( option.parentNode, "optgroup" ) ) ) {

						// Get the specific value for the option
						value = jQuery( option ).val();

						// We don't need an array for one selects
						if ( one ) {
							return value;
						}

						// Multi-Selects return an array
						values.push( value );
					}
				}

				return values;
			},

			// Select exactly those options whose value appears in `value`
			// (scalar or array); returns the normalized array of values.
			set: function( elem, value ) {
				var optionSet, option,
					options = elem.options,
					values = jQuery.makeArray( value ),
					i = options.length;

				while ( i-- ) {
					option = options[ i ];

					/* eslint-disable no-cond-assign */

					// Assignment is intentional: sets option.selected and tests the result
					if ( option.selected =
						jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1
					) {
						optionSet = true;
					}

					/* eslint-enable no-cond-assign */
				}

				// Force browsers to behave consistently when non-matching value is set
				if ( !optionSet ) {
					elem.selectedIndex = -1;
				}
				return values;
			}
		}
	}
} );

// Radios and checkboxes getter/setter
jQuery.each( [ "radio", "checkbox" ], function() {
	jQuery.valHooks[ this ] = {

		// With an array value, check the element iff its own value is in the array
		set: function( elem, value ) {
			if ( Array.isArray( value ) ) {
				return ( elem.checked =
jQuery.inArray( jQuery( elem ).val(), value ) > -1 );
			}
		}
	};

	// When support.checkOn is false, report "on" for elements without a value attribute
	if ( !support.checkOn ) {
		jQuery.valHooks[ this ].get = function( elem ) {
			return elem.getAttribute( "value" ) === null ? "on" : elem.value;
		};
	}
} );




// Return jQuery for attributes-only inclusion


support.focusin = "onfocusin" in window;


var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/,
	stopPropagationCallback = function( e ) {
		e.stopPropagation();
	};

jQuery.extend( jQuery.event, {

	// Trigger `event` (a type string, plain object or jQuery.Event) on `elem`
	// (default: document), passing `data` to handlers. With `onlyHandlers`
	// the special.trigger hook, bubbling and the default action are skipped.
	// Returns the result stored on the event by the last native handler.
	trigger: function( event, data, elem, onlyHandlers ) {

		var i, cur, tmp, bubbleType, ontype, handle, special, lastElement,
			eventPath = [ elem || document ],
			type = hasOwn.call( event, "type" ) ? event.type : event,
			namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : [];

		cur = lastElement = tmp = elem = elem || document;

		// Don't do events on text and comment nodes
		if ( elem.nodeType === 3 || elem.nodeType === 8 ) {
			return;
		}

		// focus/blur morphs to focusin/out; ensure we're not firing them right now
		if ( rfocusMorph.test( type + jQuery.event.triggered ) ) {
			return;
		}

		if ( type.indexOf( "." ) > -1 ) {

			// Namespaced trigger; create a regexp to match event type in handle()
			namespaces = type.split( "." );
			type = namespaces.shift();
			namespaces.sort();
		}

		// Name of the inline handler property ("onclick", ...); false when the
		// type contains ":"
		ontype = type.indexOf( ":" ) < 0 && "on" + type;

		// Caller can pass in a jQuery.Event object, Object, or just an event type string
		event = event[ jQuery.expando ] ?
			event :
			new jQuery.Event( type, typeof event === "object" && event );

		// Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true)
		event.isTrigger = onlyHandlers ? 2 : 3;
		event.namespace = namespaces.join( "." );
		event.rnamespace = event.namespace ?
new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) :
			null;

		// Clean up the event in case it is being reused
		event.result = undefined;
		if ( !event.target ) {
			event.target = elem;
		}

		// Clone any incoming data and prepend the event, creating the handler arg list
		data = data == null ?
			[ event ] :
			jQuery.makeArray( data, [ event ] );

		// Allow special events to draw outside the lines
		special = jQuery.event.special[ type ] || {};
		if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) {
			return;
		}

		// Determine event propagation path in advance, per W3C events spec (#9951)
		// Bubble up to document, then to window; watch for a global ownerDocument var (#9724)
		if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) {

			bubbleType = special.delegateType || type;
			if ( !rfocusMorph.test( bubbleType + type ) ) {
				cur = cur.parentNode;
			}
			for ( ; cur; cur = cur.parentNode ) {
				eventPath.push( cur );
				tmp = cur;
			}

			// Only add window if we got to document (e.g., not plain obj or detached DOM)
			if ( tmp === ( elem.ownerDocument || document ) ) {
				eventPath.push( tmp.defaultView || tmp.parentWindow || window );
			}
		}

		// Fire handlers on the event path
		i = 0;
		while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) {
			lastElement = cur;

			// The target itself (i === 1 after the increment) uses the bind type,
			// every ancestor uses the bubble type
			event.type = i > 1 ?
bubbleType :
				special.bindType || type;

			// jQuery handler
			handle = (
					dataPriv.get( cur, "events" ) || Object.create( null )
				)[ event.type ] &&
				dataPriv.get( cur, "handle" );
			if ( handle ) {
				handle.apply( cur, data );
			}

			// Native handler
			handle = ontype && cur[ ontype ];
			if ( handle && handle.apply && acceptData( cur ) ) {
				event.result = handle.apply( cur, data );
				if ( event.result === false ) {
					event.preventDefault();
				}
			}
		}
		event.type = type;

		// If nobody prevented the default action, do it now
		if ( !onlyHandlers && !event.isDefaultPrevented() ) {

			if ( ( !special._default ||
				special._default.apply( eventPath.pop(), data ) === false ) &&
				acceptData( elem ) ) {

				// Call a native DOM method on the target with the same name as the event.
				// Don't do default actions on window, that's where global variables be (#6170)
				if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) {

					// Don't re-trigger an onFOO event when we call its FOO() method
					tmp = elem[ ontype ];

					if ( tmp ) {
						elem[ ontype ] = null;
					}

					// Prevent re-triggering of the same event, since we already bubbled it above
					jQuery.event.triggered = type;

					// Temporary listener that stops the native event on the last
					// element visited while firing handlers
					if ( event.isPropagationStopped() ) {
						lastElement.addEventListener( type, stopPropagationCallback );
					}

					elem[ type ]();

					if ( event.isPropagationStopped() ) {
						lastElement.removeEventListener( type, stopPropagationCallback );
					}

					jQuery.event.triggered = undefined;

					// Restore the inline handler removed above
					if ( tmp ) {
						elem[ ontype ] = tmp;
					}
				}
			}
		}

		return event.result;
	},

	// Piggyback on a donor event to simulate a different one
	// Used only for `focus(in | out)` events
	simulate: function( type, elem, event ) {
		var e = jQuery.extend(
			new jQuery.Event(),
			event,
			{
				type: type,
				isSimulated: true
			}
		);

		jQuery.event.trigger( e, null, elem );
	}

} );

jQuery.fn.extend( {

	// Trigger an event on every matched element
	trigger: function( type, data ) {
		return this.each( function() {
jQuery.event.trigger( type, data, this );
		} );
	},

	// Run handlers on the first matched element only (onlyHandlers = true)
	// and return the trigger result; undefined for an empty set.
	triggerHandler: function( type, data ) {
		var elem = this[ 0 ];
		if ( elem ) {
			return jQuery.event.trigger( type, data, elem, true );
		}
	}
} );


// Support: Firefox <=44
// Firefox doesn't have focus(in | out) events
// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787
//
// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1
// focus(in | out) events fire after focus & blur events,
// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order
// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857
if ( !support.focusin ) {
	jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) {

		// Attach a single capturing handler on the document while someone wants focusin/focusout
		var handler = function( event ) {
			jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) );
		};

		jQuery.event.special[ fix ] = {

			// Reference-counted per document: the capturing listener is added
			// only when the stored count is still falsy
			setup: function() {

				// Handle: regular nodes (via `this.ownerDocument`), window
				// (via `this.document`) & document (via `this`).
var doc = this.ownerDocument || this.document || this,
					attaches = dataPriv.access( doc, fix );

				if ( !attaches ) {
					doc.addEventListener( orig, handler, true );
				}
				dataPriv.access( doc, fix, ( attaches || 0 ) + 1 );
			},

			// Decrement the per-document count; remove the listener at zero
			teardown: function() {
				var doc = this.ownerDocument || this.document || this,
					attaches = dataPriv.access( doc, fix ) - 1;

				if ( !attaches ) {
					doc.removeEventListener( orig, handler, true );
					dataPriv.remove( doc, fix );

				} else {
					dataPriv.access( doc, fix, attaches );
				}
			}
		};
	} );
}
var location = window.location;

var nonce = { guid: Date.now() };

var rquery = ( /\?/ );



// Cross-browser xml parsing
// Returns null for empty or non-string input; calls jQuery.error when the
// parser throws or the result contains a <parsererror> element.
jQuery.parseXML = function( data ) {
	var xml;
	if ( !data || typeof data !== "string" ) {
		return null;
	}

	// Support: IE 9 - 11 only
	// IE throws on parseFromString with invalid input.
	try {
		xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" );
	} catch ( e ) {
		xml = undefined;
	}

	if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) {
		jQuery.error( "Invalid XML: " + data );
	}
	return xml;
};


var
	rbracket = /\[\]$/,
	rCRLF = /\r?\n/g,
	rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i,
	rsubmittable = /^(?:input|select|textarea|keygen)/i;

// Recursively serialize `obj` under the key `prefix`, calling
// add( key, value ) for every scalar reached. With `traditional` set,
// nested arrays/objects are not expanded into bracketed keys.
function buildParams( prefix, obj, traditional, add ) {
	var name;

	if ( Array.isArray( obj ) ) {

		// Serialize array item.
		jQuery.each( obj, function( i, v ) {
			if ( traditional || rbracket.test( prefix ) ) {

				// Treat each array item as a scalar.
				add( prefix, v );

			} else {

				// Item is non-scalar (array or object), encode its numeric index.
				buildParams(
					prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]",
					v,
					traditional,
					add
				);
			}
		} );

	} else if ( !traditional && toType( obj ) === "object" ) {

		// Serialize object item.
for ( name in obj ) {
			buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add );
		}

	} else {

		// Serialize scalar item.
		add( prefix, obj );
	}
}

// Serialize an array of form elements or a set of
// key/values into a query string
// Returns "" for null/undefined input; pairs are URI-encoded and joined by "&".
jQuery.param = function( a, traditional ) {
	var prefix,
		s = [],
		add = function( key, valueOrFunction ) {

			// If value is a function, invoke it and use its return value
			var value = isFunction( valueOrFunction ) ?
				valueOrFunction() :
				valueOrFunction;

			// null/undefined values serialize as an empty string
			s[ s.length ] = encodeURIComponent( key ) + "=" +
				encodeURIComponent( value == null ? "" : value );
		};

	if ( a == null ) {
		return "";
	}

	// If an array was passed in, assume that it is an array of form elements.
	if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) {

		// Serialize the form elements
		jQuery.each( a, function() {
			add( this.name, this.value );
		} );

	} else {

		// If traditional, encode the "old" way (the way 1.3.2 or older
		// did it), otherwise encode params recursively.
		for ( prefix in a ) {
			buildParams( prefix, a[ prefix ], traditional, add );
		}
	}

	// Return the resulting serialization
	return s.join( "&" );
};

jQuery.fn.extend( {

	// Query-string form of serializeArray()
	serialize: function() {
		return jQuery.param( this.serializeArray() );
	},

	// Collect { name, value } pairs for the submittable controls in the set
	serializeArray: function() {
		return this.map( function() {

			// Can add propHook for "elements" to filter or add form elements
			var elements = jQuery.prop( this, "elements" );
			return elements ?
jQuery.makeArray( elements ) : this;
		} )
		.filter( function() {
			var type = this.type;

			// Use .is( ":disabled" ) so that fieldset[disabled] works
			return this.name && !jQuery( this ).is( ":disabled" ) &&
				rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) &&
				( this.checked || !rcheckableType.test( type ) );
		} )
		.map( function( _i, elem ) {
			var val = jQuery( this ).val();

			// Returning null omits the element from the result
			if ( val == null ) {
				return null;
			}

			// Array values produce one pair per entry
			if ( Array.isArray( val ) ) {
				return jQuery.map( val, function( val ) {
					return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) };
				} );
			}

			// Line breaks in values are normalized to CRLF
			return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) };
		} ).get();
	}
} );


var
	r20 = /%20/g,
	rhash = /#.*$/,
	rantiCache = /([?&])_=[^&]*/,
	rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg,

	// #7653, #8125, #8152: local protocol detection
	rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/,
	rnoContent = /^(?:GET|HEAD)$/,
	rprotocol = /^\/\//,

	/* Prefilters
	 * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example)
	 * 2) These are called:
	 *    - BEFORE asking for a transport
	 *    - AFTER param serialization (s.data is a string if s.processData is true)
	 * 3) key is the dataType
	 * 4) the catchall symbol "*" can be used
	 * 5) execution will start with transport dataType and THEN continue down to "*" if needed
	 */
	prefilters = {},

	/* Transports bindings
	 * 1) key is the dataType
	 * 2) the catchall symbol "*" can be used
	 * 3) selection will start with transport dataType and THEN go to "*" if needed
	 */
	transports = {},

	// Avoid comment-prolog char sequence (#10098); must appease lint and evade compression
	allTypes = "*/".concat( "*" ),

	// Anchor tag for parsing the document origin
	originAnchor = document.createElement( "a" );
	originAnchor.href = location.href;

// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport
function
addToPrefiltersOrTransports( structure ) {

	// dataTypeExpression is optional and defaults to "*"
	// The returned function registers `func` in `structure` under every
	// dataType listed in the (whitespace-separated) expression.
	return function( dataTypeExpression, func ) {

		if ( typeof dataTypeExpression !== "string" ) {
			func = dataTypeExpression;
			dataTypeExpression = "*";
		}

		var dataType,
			i = 0,
			dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || [];

		if ( isFunction( func ) ) {

			// For each dataType in the dataTypeExpression
			while ( ( dataType = dataTypes[ i++ ] ) ) {

				// Prepend if requested
				// (a leading "+" is stripped; a bare "+" means "*")
				if ( dataType[ 0 ] === "+" ) {
					dataType = dataType.slice( 1 ) || "*";
					( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func );

				// Otherwise append
				} else {
					( structure[ dataType ] = structure[ dataType ] || [] ).push( func );
				}
			}
		}
	};
}

// Base inspection function for prefilters and transports
// Runs the functions registered for options.dataTypes[ 0 ], then those for
// "*" if that list has not been inspected yet. For transports it returns the
// first truthy value produced by a factory.
function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) {

	var inspected = {},
		seekingTransport = ( structure === transports );

	function inspect( dataType ) {
		var selected;
		inspected[ dataType ] = true;
		jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) {
			var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR );

			// A prefilter returning a not-yet-inspected dataType string
			// redirects inspection to that dataType
			if ( typeof dataTypeOrTransport === "string" &&
				!seekingTransport && !inspected[ dataTypeOrTransport ] ) {

				options.dataTypes.unshift( dataTypeOrTransport );
				inspect( dataTypeOrTransport );
				return false;
			} else if ( seekingTransport ) {

				// Returning false stops jQuery.each at the first truthy transport
				return !( selected = dataTypeOrTransport );
			}
		} );
		return selected;
	}

	return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" );
}

// A special extend for ajax options
// that takes "flat" options (not to be deep extended)
// Fixes #9887
function ajaxExtend( target, src ) {
	var key, deep,
		flatOptions = jQuery.ajaxSettings.flatOptions || {};

	for ( key in src ) {
		if ( src[ key ] !== undefined ) {
			( flatOptions[ key ] ?
target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = 
s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If prev can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] || + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? 
e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? 
+ + // Building a settings object + ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : + + // Extending ajaxSettings + ajaxExtend( jQuery.ajaxSettings, target ); + }, + + ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), + ajaxTransport: addToPrefiltersOrTransports( transports ), + + // Main method + ajax: function( url, options ) { + + // If url is an object, simulate pre-1.5 signature + if ( typeof url === "object" ) { + options = url; + url = undefined; + } + + // Force options to be an object + options = options || {}; + + var transport, + + // URL without anti-cache param + cacheURL, + + // Response headers + responseHeadersString, + responseHeaders, + + // timeout handle + timeoutTimer, + + // Url cleanup var + urlAnchor, + + // Request state (becomes false upon send and true upon completion) + completed, + + // To know if global events are to be dispatched + fireGlobals, + + // Loop variable + i, + + // uncached part of the url + uncached, + + // Create the final options object + s = jQuery.ajaxSetup( {}, options ), + + // Callbacks context + callbackContext = s.context || s, + + // Context for global events is callbackContext if it is a DOM node or jQuery collection + globalEventContext = s.context && + ( callbackContext.nodeType || callbackContext.jquery ) ? 
+ jQuery( callbackContext ) : + jQuery.event, + + // Deferreds + deferred = jQuery.Deferred(), + completeDeferred = jQuery.Callbacks( "once memory" ), + + // Status-dependent callbacks + statusCode = s.statusCode || {}, + + // Headers (they are sent all at once) + requestHeaders = {}, + requestHeadersNames = {}, + + // Default abort message + strAbort = "canceled", + + // Fake xhr + jqXHR = { + readyState: 0, + + // Builds headers hashtable if needed + getResponseHeader: function( key ) { + var match; + if ( completed ) { + if ( !responseHeaders ) { + responseHeaders = {}; + while ( ( match = rheaders.exec( responseHeadersString ) ) ) { + responseHeaders[ match[ 1 ].toLowerCase() + " " ] = + ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) + .concat( match[ 2 ] ); + } + } + match = responseHeaders[ key.toLowerCase() + " " ]; + } + return match == null ? null : match.join( ", " ); + }, + + // Raw string + getAllResponseHeaders: function() { + return completed ? responseHeadersString : null; + }, + + // Caches the header + setRequestHeader: function( name, value ) { + if ( completed == null ) { + name = requestHeadersNames[ name.toLowerCase() ] = + requestHeadersNames[ name.toLowerCase() ] || name; + requestHeaders[ name ] = value; + } + return this; + }, + + // Overrides response content-type header + overrideMimeType: function( type ) { + if ( completed == null ) { + s.mimeType = type; + } + return this; + }, + + // Status-dependent callbacks + statusCode: function( map ) { + var code; + if ( map ) { + if ( completed ) { + + // Execute the appropriate callbacks + jqXHR.always( map[ jqXHR.status ] ); + } else { + + // Lazy-add the new callbacks in a way that preserves old ones + for ( code in map ) { + statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; + } + } + } + return this; + }, + + // Cancel the request + abort: function( statusText ) { + var finalText = statusText || strAbort; + if ( transport ) { + transport.abort( finalText ); + } + done( 
0, finalText ); + return this; + } + }; + + // Attach deferreds + deferred.promise( jqXHR ); + + // Add protocol if not provided (prefilters might expect it) + // Handle falsy url in the settings object (#10093: consistency with old signature) + // We also use the url parameter if available + s.url = ( ( url || s.url || location.href ) + "" ) + .replace( rprotocol, location.protocol + "//" ); + + // Alias method option to type as per ticket #12004 + s.type = options.method || options.type || s.method || s.type; + + // Extract dataTypes list + s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; + + // A cross-domain request is in order when the origin doesn't match the current origin. + if ( s.crossDomain == null ) { + urlAnchor = document.createElement( "a" ); + + // Support: IE <=8 - 11, Edge 12 - 15 + // IE throws exception on accessing the href property if url is malformed, + // e.g. http://example.com:80x/ + try { + urlAnchor.href = s.url; + + // Support: IE <=8 - 11 only + // Anchor's host property isn't correctly set when s.url is relative + urlAnchor.href = urlAnchor.href; + s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== + urlAnchor.protocol + "//" + urlAnchor.host; + } catch ( e ) { + + // If there is an error parsing the URL, assume it is crossDomain, + // it can be rejected by the transport if it is invalid + s.crossDomain = true; + } + } + + // Convert data if not already a string + if ( s.data && s.processData && typeof s.data !== "string" ) { + s.data = jQuery.param( s.data, s.traditional ); + } + + // Apply prefilters + inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); + + // If request was aborted inside a prefilter, stop there + if ( completed ) { + return jqXHR; + } + + // We can fire global events as of now if asked to + // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) + fireGlobals = jQuery.event && s.global; + + // Watch for a new set of 
requests + if ( fireGlobals && jQuery.active++ === 0 ) { + jQuery.event.trigger( "ajaxStart" ); + } + + // Uppercase the type + s.type = s.type.toUpperCase(); + + // Determine if request has content + s.hasContent = !rnoContent.test( s.type ); + + // Save the URL in case we're toying with the If-Modified-Since + // and/or If-None-Match header later on + // Remove hash to simplify url manipulation + cacheURL = s.url.replace( rhash, "" ); + + // More options handling for requests with no content + if ( !s.hasContent ) { + + // Remember the hash so we can put it back + uncached = s.url.slice( cacheURL.length ); + + // If data is available and should be processed, append data to url + if ( s.data && ( s.processData || typeof s.data === "string" ) ) { + cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; + + // #9682: remove data so that it's not used in an eventual retry + delete s.data; + } + + // Add or update anti-cache param if needed + if ( s.cache === false ) { + cacheURL = cacheURL.replace( rantiCache, "$1" ); + uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + + uncached; + } + + // Put hash and anti-cache on the URL that will be requested (gh-1732) + s.url = cacheURL + uncached; + + // Change '%20' to '+' if this is encoded form body content (gh-2658) + } else if ( s.data && s.processData && + ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { + s.data = s.data.replace( r20, "+" ); + } + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + if ( jQuery.lastModified[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); + } + if ( jQuery.etag[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); + } + } + + // Set the correct header, if data is being sent + if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { + jqXHR.setRequestHeader( "Content-Type", s.contentType ); + } + + // Set the Accepts header for the server, depending on the dataType + jqXHR.setRequestHeader( + "Accept", + s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? + s.accepts[ s.dataTypes[ 0 ] ] + + ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) : + s.accepts[ "*" ] + ); + + // Check for headers option + for ( i in s.headers ) { + jqXHR.setRequestHeader( i, s.headers[ i ] ); + } + + // Allow custom headers/mimetypes and early abort + if ( s.beforeSend && + ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { + + // Abort if not done already and return + return jqXHR.abort(); + } + + // Aborting is no longer a cancellation + strAbort = "abort"; + + // Install callbacks on deferreds + completeDeferred.add( s.complete ); + jqXHR.done( s.success ); + jqXHR.fail( s.error ); + + // Get transport + transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); + + // If no transport, we auto-abort + if ( !transport ) { + done( -1, "No Transport" ); + } else { + jqXHR.readyState = 1; + + // Send global event + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); + } + + // If request was aborted inside ajaxSend, stop there + if ( completed ) { + return jqXHR; + } + + // Timeout + if ( s.async && s.timeout > 0 ) { + timeoutTimer = window.setTimeout( function() { + jqXHR.abort( "timeout" ); + }, s.timeout ); + } + + try { + completed = false; + transport.send( requestHeaders, done ); + } catch ( e ) { + + // Rethrow post-completion 
exceptions + if ( completed ) { + throw e; + } + + // Propagate others as results + done( -1, e ); + } + } + + // Callback for when everything is done + function done( status, nativeStatusText, responses, headers ) { + var isSuccess, success, error, response, modified, + statusText = nativeStatusText; + + // Ignore repeat invocations + if ( completed ) { + return; + } + + completed = true; + + // Clear timeout if it exists + if ( timeoutTimer ) { + window.clearTimeout( timeoutTimer ); + } + + // Dereference transport for early garbage collection + // (no matter how long the jqXHR object will be used) + transport = undefined; + + // Cache response headers + responseHeadersString = headers || ""; + + // Set readyState + jqXHR.readyState = status > 0 ? 4 : 0; + + // Determine if successful + isSuccess = status >= 200 && status < 300 || status === 304; + + // Get response data + if ( responses ) { + response = ajaxHandleResponses( s, jqXHR, responses ); + } + + // Use a noop converter for missing script + if ( !isSuccess && jQuery.inArray( "script", s.dataTypes ) > -1 ) { + s.converters[ "text script" ] = function() {}; + } + + // Convert no matter what (that way responseXXX fields are always set) + response = ajaxConvert( s, response, jqXHR, isSuccess ); + + // If successful, handle type chaining + if ( isSuccess ) { + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + modified = jqXHR.getResponseHeader( "Last-Modified" ); + if ( modified ) { + jQuery.lastModified[ cacheURL ] = modified; + } + modified = jqXHR.getResponseHeader( "etag" ); + if ( modified ) { + jQuery.etag[ cacheURL ] = modified; + } + } + + // if no content + if ( status === 204 || s.type === "HEAD" ) { + statusText = "nocontent"; + + // if not modified + } else if ( status === 304 ) { + statusText = "notmodified"; + + // If we have data, let's convert it + } else { + statusText = response.state; + success = response.data; + error = response.error; + isSuccess = !error; + } + } else { + + // Extract error from statusText and normalize for non-aborts + error = statusText; + if ( status || !statusText ) { + statusText = "error"; + if ( status < 0 ) { + status = 0; + } + } + } + + // Set data for the fake xhr object + jqXHR.status = status; + jqXHR.statusText = ( nativeStatusText || statusText ) + ""; + + // Success/Error + if ( isSuccess ) { + deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); + } else { + deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); + } + + // Status-dependent callbacks + jqXHR.statusCode( statusCode ); + statusCode = undefined; + + if ( fireGlobals ) { + globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", + [ jqXHR, s, isSuccess ? 
success : error ] ); + } + + // Complete + completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); + + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); + + // Handle the global AJAX counter + if ( !( --jQuery.active ) ) { + jQuery.event.trigger( "ajaxStop" ); + } + } + } + + return jqXHR; + }, + + getJSON: function( url, data, callback ) { + return jQuery.get( url, data, callback, "json" ); + }, + + getScript: function( url, callback ) { + return jQuery.get( url, undefined, callback, "script" ); + } +} ); + +jQuery.each( [ "get", "post" ], function( _i, method ) { + jQuery[ method ] = function( url, data, callback, type ) { + + // Shift arguments if data argument was omitted + if ( isFunction( data ) ) { + type = type || callback; + callback = data; + data = undefined; + } + + // The url can be an options object (which then must have .url) + return jQuery.ajax( jQuery.extend( { + url: url, + type: method, + dataType: type, + data: data, + success: callback + }, jQuery.isPlainObject( url ) && url ) ); + }; +} ); + +jQuery.ajaxPrefilter( function( s ) { + var i; + for ( i in s.headers ) { + if ( i.toLowerCase() === "content-type" ) { + s.contentType = s.headers[ i ] || ""; + } + } +} ); + + +jQuery._evalUrl = function( url, options, doc ) { + return jQuery.ajax( { + url: url, + + // Make this explicit, since user can override this through ajaxSetup (#11264) + type: "GET", + dataType: "script", + cache: true, + async: false, + global: false, + + // Only evaluate the response if it is successful (gh-4126) + // dataFilter is not invoked for failure responses, so using it instead + // of the default converter is kludgy but it works. 
+ converters: { + "text script": function() {} + }, + dataFilter: function( response ) { + jQuery.globalEval( response, options, doc ); + } + } ); +}; + + +jQuery.fn.extend( { + wrapAll: function( html ) { + var wrap; + + if ( this[ 0 ] ) { + if ( isFunction( html ) ) { + html = html.call( this[ 0 ] ); + } + + // The elements to wrap the target around + wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); + + if ( this[ 0 ].parentNode ) { + wrap.insertBefore( this[ 0 ] ); + } + + wrap.map( function() { + var elem = this; + + while ( elem.firstElementChild ) { + elem = elem.firstElementChild; + } + + return elem; + } ).append( this ); + } + + return this; + }, + + wrapInner: function( html ) { + if ( isFunction( html ) ) { + return this.each( function( i ) { + jQuery( this ).wrapInner( html.call( this, i ) ); + } ); + } + + return this.each( function() { + var self = jQuery( this ), + contents = self.contents(); + + if ( contents.length ) { + contents.wrapAll( html ); + + } else { + self.append( html ); + } + } ); + }, + + wrap: function( html ) { + var htmlIsFunction = isFunction( html ); + + return this.each( function( i ) { + jQuery( this ).wrapAll( htmlIsFunction ? 
html.call( this, i ) : html ); + } ); + }, + + unwrap: function( selector ) { + this.parent( selector ).not( "body" ).each( function() { + jQuery( this ).replaceWith( this.childNodes ); + } ); + return this; + } +} ); + + +jQuery.expr.pseudos.hidden = function( elem ) { + return !jQuery.expr.pseudos.visible( elem ); +}; +jQuery.expr.pseudos.visible = function( elem ) { + return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); +}; + + + + +jQuery.ajaxSettings.xhr = function() { + try { + return new window.XMLHttpRequest(); + } catch ( e ) {} +}; + +var xhrSuccessStatus = { + + // File protocol always yields status code 0, assume 200 + 0: 200, + + // Support: IE <=9 only + // #1450: sometimes IE returns 1223 when it should be 204 + 1223: 204 + }, + xhrSupported = jQuery.ajaxSettings.xhr(); + +support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); +support.ajax = xhrSupported = !!xhrSupported; + +jQuery.ajaxTransport( function( options ) { + var callback, errorCallback; + + // Cross domain only allowed if supported through XMLHttpRequest + if ( support.cors || xhrSupported && !options.crossDomain ) { + return { + send: function( headers, complete ) { + var i, + xhr = options.xhr(); + + xhr.open( + options.type, + options.url, + options.async, + options.username, + options.password + ); + + // Apply custom fields if provided + if ( options.xhrFields ) { + for ( i in options.xhrFields ) { + xhr[ i ] = options.xhrFields[ i ]; + } + } + + // Override mime type if needed + if ( options.mimeType && xhr.overrideMimeType ) { + xhr.overrideMimeType( options.mimeType ); + } + + // X-Requested-With header + // For cross-domain requests, seeing as conditions for a preflight are + // akin to a jigsaw puzzle, we simply never set it to be sure. + // (it can always be set on a per-request basis or even using ajaxSetup) + // For same-domain requests, won't change header if already provided. 
+ if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { + headers[ "X-Requested-With" ] = "XMLHttpRequest"; + } + + // Set headers + for ( i in headers ) { + xhr.setRequestHeader( i, headers[ i ] ); + } + + // Callback + callback = function( type ) { + return function() { + if ( callback ) { + callback = errorCallback = xhr.onload = + xhr.onerror = xhr.onabort = xhr.ontimeout = + xhr.onreadystatechange = null; + + if ( type === "abort" ) { + xhr.abort(); + } else if ( type === "error" ) { + + // Support: IE <=9 only + // On a manual native abort, IE9 throws + // errors on any property access that is not readyState + if ( typeof xhr.status !== "number" ) { + complete( 0, "error" ); + } else { + complete( + + // File: protocol always yields status 0; see #8605, #14207 + xhr.status, + xhr.statusText + ); + } + } else { + complete( + xhrSuccessStatus[ xhr.status ] || xhr.status, + xhr.statusText, + + // Support: IE <=9 only + // IE9 has no XHR2 but throws on binary (trac-11426) + // For XHR2 non-text, let the caller handle it (gh-2498) + ( xhr.responseType || "text" ) !== "text" || + typeof xhr.responseText !== "string" ? 
+ { binary: xhr.response } : + { text: xhr.responseText }, + xhr.getAllResponseHeaders() + ); + } + } + }; + }; + + // Listen to events + xhr.onload = callback(); + errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); + + // Support: IE 9 only + // Use onreadystatechange to replace onabort + // to handle uncaught aborts + if ( xhr.onabort !== undefined ) { + xhr.onabort = errorCallback; + } else { + xhr.onreadystatechange = function() { + + // Check readyState before timeout as it changes + if ( xhr.readyState === 4 ) { + + // Allow onerror to be called first, + // but that will not handle a native abort + // Also, save errorCallback to a variable + // as xhr.onerror cannot be accessed + window.setTimeout( function() { + if ( callback ) { + errorCallback(); + } + } ); + } + }; + } + + // Create the abort callback + callback = callback( "abort" ); + + try { + + // Do send the request (this may raise an exception) + xhr.send( options.hasContent && options.data || null ); + } catch ( e ) { + + // #14683: Only rethrow if this hasn't been notified as an error yet + if ( callback ) { + throw e; + } + } + }, + + abort: function() { + if ( callback ) { + callback(); + } + } + }; + } +} ); + + + + +// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) +jQuery.ajaxPrefilter( function( s ) { + if ( s.crossDomain ) { + s.contents.script = false; + } +} ); + +// Install script dataType +jQuery.ajaxSetup( { + accepts: { + script: "text/javascript, application/javascript, " + + "application/ecmascript, application/x-ecmascript" + }, + contents: { + script: /\b(?:java|ecma)script\b/ + }, + converters: { + "text script": function( text ) { + jQuery.globalEval( text ); + return text; + } + } +} ); + +// Handle cache's special case and crossDomain +jQuery.ajaxPrefilter( "script", function( s ) { + if ( s.cache === undefined ) { + s.cache = false; + } + if ( s.crossDomain ) { + s.type = "GET"; + } +} ); + +// Bind script tag hack 
transport +jQuery.ajaxTransport( "script", function( s ) { + + // This transport only deals with cross domain or forced-by-attrs requests + if ( s.crossDomain || s.scriptAttrs ) { + var script, callback; + return { + send: function( _, complete ) { + script = jQuery( " + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Workspace is not required for FFTs of the following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
In future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
FFT configurations that do not require a workspace will continue not to require one.

  • +
+
+
+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/index.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/index.html new file mode 100644 index 0000000000000..b3ed34489ccdb --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/index.html @@ -0,0 +1,277 @@ + + + + + + cuFFTDx API Reference — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • cuFFTDx API Reference
  • +
  • +
  • +
+
+
+
+
+ +
+

cuFFTDx API Reference

+

Here you can find a description of the main components of the cuFFTDx library, with usage examples.

+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/methods.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/methods.html new file mode 100644 index 0000000000000..c84574ae26969 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/methods.html @@ -0,0 +1,450 @@ + + + + + + Execution Methods — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Execution Methods

+

These methods are used to run the FFT operation.

+

A code example:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block() );
+
+using complex_type = typename FFT::value_type;
+
+__global__ kernel(... /* arguments */) {
+
+  // Shared memory pointer
+  extern __shared__ complex_type shared_mem[];
+
+  // Register data
+  complex_type thread_data[FFT::storage_size];
+
+  // Load data into registers (thread_data)
+  // ...
+
+  FFT().execute(thread_data, shared_mem);
+
+  // Store results (thread_data) into global memory
+}
+
+
+
+

Thread Execute Method

+
void FFT().execute<typename T>(T* input)
+
+
+

Runs the FFT operation defined by the FFT descriptor. T can be any type (such as float2 or double2), +as long as its alignment and element size are the same as those of FFT::value_type.

+

This method is available if the descriptor has been constructed using the Thread Operator and +cufftdx::is_complete_fft_execution is true.

+

input array should be in the per-thread local memory (registers). input must fit FFT::storage_size +elements of type FFT::value_type.

+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+
+

Block Execute Method

+
// #1
+void FFT().execute<typename T>(T* input, void* shared_memory, FFT::workspace_type& workspace)
+
+// #2: Version of #1 for FFTs which don't require workspace
+void FFT().execute<typename T>(T* input, void* shared_memory)
+
+// #3: Execute with input data in shared memory
+void FFT().execute<typename T>(T* shared_memory_input, FFT::workspace_type& workspace)
+
+// #4: Version of #3 for FFTs which don't require workspace
+void FFT().execute<typename T>(T* shared_memory_input)
+
+
+

Runs the FFT operation defined by the FFT descriptor. T can be any type (such as float2 or double2), +as long as its alignment and element size are the same as those of FFT::value_type. +Pointers input, shared_memory, shared_memory_input should be aligned to alignof(FFT::value_type).

+

This method is available if the descriptor has been constructed using the Block Operator +and cufftdx::is_complete_fft_execution is true.

+

When FFT::requires_workspace is false, overloads #2 and #4 can be used. Otherwise, the user has to use +overloads #1 or #3 and pass a reference to a workspace.

+

In methods #1 and #2 input is in local memory (registers), and shared_memory is a pointer to a shared memory of size +FFT::shared_memory_size bytes. The operation is in-place meaning the results are stored in input. input must +fit FFT::storage_size elements of type FFT::value_type.

+

In methods #3 and #4 the input data is passed in shared memory (shared_memory_input). The operation is in-place, meaning +the results are stored back to shared_memory_input. These methods don’t require an additional shared_memory pointer +to be passed, as shared_memory_input will be used for the required communication between threads. Thus, shared_memory_input +must fit all input and output values, and can’t be smaller than FFT::shared_memory_size bytes +(i.e. the required shared memory size in bytes is the maximum of FFT::shared_memory_size, FFT::ffts_per_block * <FFT_input_size_in_bytes>, and +FFT::ffts_per_block * <FFT_output_size_in_bytes>).

+
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

+ +

will produce bit-identical results.

+
+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+

Value Format

+

For complex numbers of single and double precision, the first value in a complex number is the real part and the second is +the imaginary part.

+

Processing of half (fp16) precision FFTs in cuFFTDx is implicitly batched, that is, a single computation processes two FFT +batches. cuFFTDx expects that a complex number of half precision has 2 real parts and 2 imaginary parts in that order +(i.e. real_1, real_2, imaginary_1, imaginary_2). Real values of half precision (for R2C and C2R FFTs) follow the same logic and +each should contain two real values. See also the FFT::implicit_type_batching trait.

+
+
+

Input/Output Data Format

+

This section describes the input and output data format.

+
+

Data In Registers

+

The n-th thread (indexing from 0) participating in the FFT should hold the following elements of the FFT in its input +values: n + FFT::stride * i, where i is an index in input. Results are later stored in input following the same rule.

+

See also FFT::stride.

+
+

Example

+

0-th thread of 8-point FFT with FFT::stride equal to 2 should have values 0, 2, 4, and 6 in its input.

+
+
+
+

Data In Shared Memory

+

The input values of the FFT should be stored in shared_memory_input in natural order. Results are stored in shared_memory_input +following the same rule.

+
+
+
+
+

Make Workspace Function

+
template<class FFT>
+auto cufftdx::make_workspace<FFT>(cudaError_t& error)
+
+
+

cufftdx::make_workspace<FFT>(cudaError_t&) is a helper function for creating workspace required for block execute(...) method +when FFT::requires_workspace is true. FFT is type of FFT descriptor. +If after calling the function error is not cudaSuccess the workspace was not created correctly and is invalid.

+
    +
  • If FFT::requires_workspace trait is false, user doesn’t have to create workspace.

  • +
  • Workspace can be created for FFT with FFT::requires_workspace equal to false: such workspace is an empty workspace with no global memory allocation.

  • +
  • Workspace object is valid only for FFT it was created for.

  • +
  • Workspace object can allocate global memory, however never more than FFT::workspace_size, +and it’s responsible for freeing it.

  • +
  • Workspace can’t be used concurrently since all copies share the same underlying global memory allocation. Using workspace concurrently will result in memory races.

  • +
  • Allocated global memory is freed upon destruction of the last copy of created workspace object.

  • +
  • Workspace object can be implicitly cast to FFT::workspace_type.

  • +
+
+

Note

+
+

Workspace is not required for FFTs of following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
In the future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
  • FFT configurations that do not require workspace will continue to do so.

  • +
+
+
+
+
+
+

Warning

+

An FFT::workspace_type object doesn’t track the lifetime of the underlying memory, and +is only valid within the lifetime of the workspace object it was cast from.

+
+
+

Warning

+

Type returned by cufftdx::make_workspace<FFT>(cudaError_t&) can be different for different FFT descriptions, +and is not the same as FFT::workspace_type. User should use auto when +creating a workspace object. Example:

+
// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    // ...
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+}
+
+// Create workspace
+cudaError_t error = 0;
+auto workspace = cufftdx::make_workspace<FFT>(error);
+
+// ...
+
+// Run kernel with FFT
+block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>(data, workspace);
+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/operators.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/operators.html new file mode 100644 index 0000000000000..94e4541fd0264 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/operators.html @@ -0,0 +1,572 @@ + + + + + + Operators — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Operators

+

Operators are used to describe the FFT operation to solve, and to configure the execution. They are divided into +Description Operators and Execution Operators.

+
+
+

Description Operators

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Operator

Default value

Description

Size<unsigned int S>

Not set.

Size S of the FFT to calculate.

Direction<fft_direction>

Not set.

Direction of the FFT, either fft_direction::inverse or fft_direction::forward.

Type<fft_type>

fft_type::c2c

Types of input and output data (C2C, R2C, C2R).

Precision<P>

float

Precision P of the floating-point values used to compute the FFT: double, float or __half.

SM<unsigned int CC>

Not set.

Target CUDA architecture for which the FFT function should be generated.

+

Description operators define the FFT operation to be solved. Combined with Execution Operators, they form a +complete FFT descriptor that can be executed on a GPU.

+

Operators are added to construct the FFT descriptor type. For example, for a forward FFT operation consisting of an FFT with 8 double elements per thread:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<double>() + cufftdx::Thread() );
+
+
+
+
For an FFT descriptor to be complete, the following is required:
+
+
+
+

Size Operator

+
cufftdx::Size<unsigned int S>()
+
+
+

Sets the size S of the FFT operation to compute.

+

There is no default size.

+
+
Restrictions:
    +
  • S must be greater than 1.

  • +
+
+
+
+
+

Direction Operator

+
cufftdx::Direction<cufftdx::fft_direction>()
+
+
+

Sets the direction of the FFT, either fft_direction::inverse or fft_direction::forward.

+

There is no default direction.

+

If the FFT is constructed with the Type<R2C> operator, direction is assumed to be forward and +a direction operator is not necessary.

+

If the FFT is constructed with the Type<C2R> operator, direction is assumed to be inverse and +a direction operator is not necessary.

+
+
Restrictions:
    +
  • fft_direction::forward requires Type<C2C> or Type<R2C>.

  • +
  • fft_direction::inverse requires Type<C2C> or Type<C2R>.

  • +
+
+
+
+
+

Type Operator

+
cufftdx::Type<cufftdx::fft_type>()
+
+
+

Sets the type of the FFT to compute, either fft_type::c2c for complex-to-complex; fft_type::r2c for +real-to-complex; or fft_type::c2r for complex-to-real.

+

The default type is fft_type::c2c.

+
+
Restrictions:
    +
  • fft_type::r2c requires fft_direction::forward. If no direction is specified, it is assumed to be fft_direction::forward.

  • +
  • fft_type::c2r requires fft_direction::inverse. If no direction is specified, it is assumed to be fft_direction::inverse.

  • +
  • cuFFTDx performs unnormalized Fast Fourier Transform calculations.

  • +
+
+
+
+
+

Precision Operator

+
cufftdx::Precision<__half>()
+
+cufftdx::Precision<float>()
+
+cufftdx::Precision<double>()
+
+
+

Sets the floating-point precision used to compute the FFT. This is the type of the values used for input and output, as well as the +underlying type of the values used to compute the FFT.

+

The default precision is float.

+
+
+

SM Operator

+
cufftdx::SM<unsigned int CC>()
+
+
+

Sets the target architecture CC for the underlying FFT function to use. Supported architectures are:

+
    +
  • Volta: 700 and 720 (sm_70, sm_72),

  • +
  • Turing: 750 (sm_75), and

  • +
  • Ampere: 800, 860 (sm_80, sm_86).

  • +
+
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+
+
+
+
+

Execution Operators

+ +++++ + + + + + + + + + + + + + + + + +

Operator

Default value

Description

Thread

Not set.

Creates FFT thread execution object.

Block

Not set.

Creates FFT block execution object. See Block Configuration Operators.

+

Execution operators configure how the FFT operation will run on the GPU. Combined with Description Operators, they form a +complete FFT descriptor that can be executed on a GPU.

+

Operators are added to construct the FFT descriptor type. For example, for a forward FFT operation consisting of two FFTs with 128 float elements each, running simultaneously in one CUDA block:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block()
+                    + cufftdx::ElementsPerThread<8>() + cufftdx::FFTsPerBlock<2>() );
+
+
+
+

Thread Operator

+
cufftdx::Thread()
+
+
+

Sets the FFT operation to run in a thread context. The FFT operation will simultaneously run a single, independent FFT (described using Description Operators) per thread.

+

Each thread will compute one FFT of the size defined by the Size Operator.

+
+
Restrictions:
    +
  • Is mutually exclusive with Block operator

  • +
  • Compilation will fail when used with block-only operators: FFTsPerBlock, ElementsPerThread, BlockDim.

  • +
  • With Precision<__half> restricts Size to range \([2, 32]\).

  • +
  • With Precision<float> restricts Size to range \([2, 32]\).

  • +
  • With Precision<double> restricts Size to range \([2, 16]\).

  • +
+
+
+
+
+

Block Operator

+
cufftdx::Block()
+
+
+

Generates a collective FFT operation to run in a single CUDA block. One or more threads will cooperate to compute the +collective FFT operation.

+

The number of FFTs to compute, as well as the number of threads used to calculate each FFT, can be configured using +Block Configuration Operators.

+
+
Restrictions:
    +
  • Is mutually exclusive with Thread operator

  • +
  • Unless a BlockDim Operator is used, the collective FFT operation can only be executed +inside a 2D block of sizes:

    +
    +
      +
    • blockDim.x = size_of<Description>::value/Description::elements_per_thread.

    • +
    • blockDim.y = Description::ffts_per_block.

    • +
    • blockDim.z = 1.

    • +
    +
    +
  • +
  • BlockDim Operator is not implemented yet.

  • +
  • Operator cufftdx::Precision<__half>() restricts cufftdx::Size<U>() to range \([2, 32768]\).

  • +
  • Operator cufftdx::Precision<float>() restricts cufftdx::Size<U>() to range \([2, 32768]\).

  • +
  • Operator cufftdx::Precision<double>() restricts cufftdx::Size<U>() to range \([2, 16384]\).

  • +
+
+
+
+
+

Block Configuration Operators

+ +++++ + + + + + + + + + + + + + + + + + + + + +

Operators

Default value

Description

FFTsPerBlock<unsigned int F>

1

Number F of FFTs calculated per CUDA block.

ElementsPerThread<unsigned int E>

Heuristic.

Number E of FFT values per CUDA thread.

BlockDim<unsigned int X, Y, Z>

Not set.

Required for executing block FFT within block +with custom dimensions.

+

Block-configuration operators allow the user to tune how the collective FFT operation will run on a single CUDA block.

+
+

Note

+

Block configuration operators can only be used with Block Operator.

+
+
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

+ +

will produce bit-identical results.

+
+
+

FFTs Per Block Operator

+
cufftdx::FFTsPerBlock<unsigned int>()
+
+
+

Sets the number of FFTs to compute in parallel within a single CUDA block. Each FFT is computed concurrently by a +separate group of threads.

+

The default is one FFT per block.

+
+
+

Elements Per Thread Operator

+
cufftdx::ElementsPerThread<unsigned int>()
+
+
+

Sets the number of FFT elements to be computed by each thread.

+

The default is determined heuristically to target performance.

+

Restrictions:

+
    +
  • If FFT::requires_workspace is false, it must be a divisor of the requested FFT size.

  • +
  • If FFT::requires_workspace is true, it must be a power of two smaller than the size of the FFT.

  • +
  • Must be in range \([2; 32]\) for cufftdx::Precision<float>() and cufftdx::Precision<__half>().

  • +
  • Must be in range \([2; 16]\) for cufftdx::Precision<double>().

  • +
+
+
+

BlockDim Operator

+
struct cufftdx::BlockDim<unsigned int X, unsigned int Y, unsigned int Z>()
+
+
+

Sets the CUDA block size to (X, Y, Z), to configure the execution.

+

Using this operator, the user can run the collective FFT operation with 2D or 3D CUDA blocks.

+

Default BlockDim size:

+
    +
  • blockDim.x = size_of<Description>::value/Description::elements_per_thread.

  • +
  • blockDim.y = Description::ffts_per_block.

  • +
  • blockDim.z = 1.

  • +
+

See FFT::block_dim.

+
+

Note

+

BlockDim operator is not implemented yet.

+
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/traits.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/traits.html new file mode 100644 index 0000000000000..3ce82fbecb1d5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/api/traits.html @@ -0,0 +1,776 @@ + + + + + + Traits — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +
+

Traits

+

Traits provide the user with information about the FFT description constructed using Operators. They are divided into +Description Traits and Execution Traits.

+
+
+

Description Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

size_of<Description>::value

None.

Size of the FFT to compute.

type_of<Description>::value

fft_type::c2c

Type of the FFT operation, either fft_type::c2c, fft_type::r2c or fft_type::c2r.

direction_of<Description>::value

See Direction Trait.

Direction of the FFT operation, either fft_direction::inverse or fft_direction::forward.

precision_of<Description>

float

Type of the underlying floating-point values used to compute the FFT: double, float or __half.

is_fft<Description>

None.

true if Description is an FFT description, formed with Description Operators.

is_fft_execution<Description>

None.

true if Description is an FFT description, configured with Execution Operators.

is_complete_fft<Description>

None.

true if Description is a valid FFT description, formed with Description Operators.

is_complete_fft_execution<Description>

None.

true if is_complete_fft<Description> is true and is_fft_execution<Description> is true.

+

Description traits can be retrieved from an FFT descriptor using the helper functions provided. For example:

+
#include <iostream>
+#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                      + cufftdx::Direction<fft_direction::forward>()
+                      + cufftdx::Precision<double>() + cufftdx::Thread() );
+
+if(cufftdx::is_complete<FFT>::value)
+  std::cout << "Size of the FFT operation: " << cufftdx::size_of<FFT>::value << std::endl;
+
+
+
+

Size Trait

+
cufftdx::size_of<FFT>::value
+
+
+

Size of the FFT to compute, as set by Size Operator.

+

There is no default size. If the descriptor was not created using a Size Operator, compilation will fail with an error message.

+
+
+

Type Trait

+
cufftdx::type_of<FFT>::value
+
+
+

Type of the FFT operation, as set by Type Operator.

+

The default type is complex-to-complex, fft_type::c2c.

+
+
+

Direction Trait

+
cufftdx::direction_of<FFT>::value
+
+
+

Direction of the FFT operation, as set by Direction Operator.

+

Default direction:

+
+
    +
  • If the FFT type is fft_type::r2c, the default direction is fft_direction::forward.

  • +
  • If the FFT type is fft_type::c2r, the default direction is fft_direction::inverse.

  • +
  • For any other type, there is no default direction. If the descriptor was not created using a Direction Operator, compilation will fail with an error message.

  • +
+
+
+
+

Precision Trait

+
cufftdx::precision_of<FFT>::type
+
+
+

Floating-point precision of the FFT operation, as set by Precision Operator.

+

The default precision is float.

+
+
+

Is FFT? Trait

+
cufftdx::is_fft<FFT>::value
+
+
+

Trait is true if the descriptor is an FFT description, formed with Description Operators.

+

There is no default value. The descriptor either is or is not an FFT description.

+
+
+

Is FFT Execution? Trait

+
cufftdx::is_fft_execution<FFT>::value
+
+
+

Trait is true if the descriptor is an FFT description, formed with Description Operators and Execution Operators.

+

There is no default value. The descriptor either is or is not an FFT description that includes Execution Operators.

+
+
+

Is FFT-complete? Trait

+
cufftdx::is_complete_fft<FFT>::value
+
+
+

Trait is true if the descriptor is a complete FFT description, formed with Description Operators.

+
+

Note

+

Complete in this context means that the descriptor has been formed with all the necessary Description Operators and is only missing an Execution Operator to be able to run.

+
+

For an FFT descriptor to be complete, the following is required:

+ +

There is no default value. The descriptor either is or is not an FFT-complete description.

+
+
+

Is FFT-complete Execution? Trait

+
cufftdx::is_complete_fft_execution<FFT>::value
+
+
+

Trait is true if both cufftdx::is_fft_execution and cufftdx::is_complete_fft are true.

+
+

Note

+

If cufftdx::is_complete_fft_execution trait is true for a descriptor FFT, then we can use the Execution Methods +to compute the FFT.

+
+

There is no default value.

+
+
+
+
+

Execution Traits

+

Execution traits can be retrieved directly from an FFT descriptor that has been configured with Execution Operators. +The available execution traits depend on the operator used to build the descriptor; either a Thread Operator or a Block Operator.

+
+

Thread Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

Description::value_type

detail::complex<float>

Complex type of the underlying data used to compute the FFT.

Description::input_type

Description::value_type

Type of the underlying data used as input for the FFT.

Description::output_type

Description::value_type

Type of the underlying data used as output for the FFT.

Description::implicit_type_batching

2 if cufftdx::precision_of<FFT>::type is __half, otherwise 1

Number of values from different FFTs batched into one element of type Description::value_type.

Description::elements_per_thread

size_of<Description>::value

Number of FFT elements to be computed per thread.

Description::storage_size

Description::elements_per_thread

Number of Description::value_type elements that each thread must allocate to compute the FFT.

Description::stride

Always 1

Stride between elements of the thread FFT held by each thread in its input

+

Thread traits can be retrieved from descriptors built with Thread Operator.

+

For example:

+
#include <cufftdx.hpp>
+
+using FFT          = decltype(cufftdx::Size<8>() + cufftdx::Type<fft_type::c2c>()
+                            + cufftdx::Direction<fft_direction::forward>()
+                            + cufftdx::Precision<double>() + Thread());
+
+
+// Retrieve the FFT data type
+using complex_type = typename FFT::value_type;
+
+// Retrieve the number of elements per thread
+auto elements_per_thread = FFT::elements_per_thread;
+
+
+
+

Value Type Trait

+
FFT::value_type
+
+
+

Complex type of the underlying data used for FFT computation.

+

The default type is cufftdx::detail::complex<float>, as defined in the types.hpp header file.

+
+
+

Input Type Trait

+
FFT::input_type
+
+
+

Complex type of the underlying data used as input of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Output Type Trait

+
FFT::output_type
+
+
+

Complex type of the underlying data used as output of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Implicit Type Batching Trait

+
FFT::implicit_type_batching
+
+
+

Number of values from different FFTs batched into one element of type Description::value_type used in FFT computation. If +it’s higher than one it means that Thread FFT object calculates multiple FFTs in one go.

+

The value is 2 if cufftdx::precision_of<FFT>::type is __half, and 1 otherwise.

+
+

Note

+

Please note that in future releases of cuFFTDx FFT::implicit_type_batching may be replaced, and/or extended.

+
+
+
+

Elements Per Thread Trait

+
FFT::elements_per_thread
+
+
+

Number of FFT elements of the type returned by Value Type Trait that each thread will compute.

+

The default value is the same as Size Trait.

+
+
+

Storage Size Trait

+
FFT::storage_size
+
+
+

Number of Description::value_type elements that each thread must allocate to compute the FFT.

+

The default value is the same as Elements Per Thread Trait.

+
+
+

Stride Size Trait

+
FFT::stride
+
+
+

Stride between elements of the FFT held by each thread in input.

+

For thread FFT FFT::stride is always 1.

+
+
+
+

Block Traits

+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Trait

Default value

Description

Description::value_type

detail::complex<float>

Complex type of the underlying data used to compute the FFT.

Description::input_type

Description::value_type

Type of the underlying data used as input for the FFT.

Description::output_type

Description::value_type

Type of the underlying data used as output for the FFT.

Description::workspace_type

Description::workspace_type

Device-side type of workspace required for FFT computation.

Description::implicit_type_batching

2 if cufftdx::precision_of<FFT>::type is __half, otherwise 1

Number of values from different FFTs batched into one element of type Description::value_type.

Description::elements_per_thread

Heuristic.

Number of FFT elements to be computed per thread.

Description::storage_size

Determined by Description::elements_per_thread

Number of Description::value_type elements that each thread must allocate to compute the FFT.

Description::stride

Determined by Description::elements_per_thread and size of the FFT

Stride between elements of the block FFT held by each thread in its input

Description::ffts_per_block

1

Number of FFTs to compute by a CUDA block in this FFT operation.

Description::suggested_ffts_per_block

Heuristic.

Suggested number of FFTs to compute by a CUDA block to target maximum performance.

Description::shared_memory_size

Determined from Description::ffts_per_block and Description::elements_per_thread

Size of the shared memory in bytes.

Description::block_dim

See Block Dim Trait.

dim3 of the CUDA block to compute the FFT operation.

Description::max_threads_per_block

Determined from Description::block_dim

Total number of threads in the CUDA block.

Description::requires_workspace

True if FFT implementation requires extra workspace; otherwise - false.

Determines if it’s required to allocate extra workspace in global memory using cufftdx::make_workspace(cudaError_t&).

Description::workspace_size

0 if Description::requires_workspace is false, otherwise > 0.

Size of global memory required for workspace (in bytes).

+

Block traits can be retrieved from descriptors built with Block Operator.

+

For example:

+
#include <cufftdx.hpp>
+
+using FFT = decltype( cufftdx::Size<128>() + cufftdx::Type<fft_type::c2c>()
+                    + cufftdx::Direction<fft_direction::forward>()
+                    + cufftdx::Precision<float>() + cufftdx::Block()
+                    + cufftdx::ElementsPerThread<8>() + cufftdx::FFTsPerBlock<2>() );
+
+// Retrieve the FFT data type
+using complex_type = typename FFT::value_type;
+
+// Allocate managed memory for input/output
+complex_type* data;
+auto          size       = FFT::ffts_per_block * cufftdx::size_of<FFT>::value;
+auto          size_bytes = size * sizeof(complex_type);
+
+cudaMallocManaged(&data, size_bytes);
+
+
+
+

Value Type Trait

+
FFT::value_type
+
+
+

Complex type of the underlying data used for FFT computation.

+

The default type is cufftdx::detail::complex<float>, as defined in the types.hpp header file.

+
+
+

Input Type Trait

+
FFT::input_type
+
+
+

Complex type of the underlying data used as input of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Output Type Trait

+
FFT::output_type
+
+
+

Complex type of the underlying data used as output of the FFT computation.

+

The default type is the same as Value Type Trait.

+
+
+

Workspace Type Trait

+
FFT::workspace_type
+
+
+

Type of a workspace required by execute(...) function of FFT. User should check if FFT requires a workspace +using Description::requires_workspace trait, and create one with cufftdx::make_workspace<FFT>(cudaError_t&).

+

See Make Workspace Function for more details about workspace.

+
+

Warning

+

An FFT::workspace_type object doesn’t track the lifetime of the underlying memory, and is only valid within the lifetime of the +workspace object it was cast from.

+
+
+

Warning

+

Type returned by cufftdx::make_workspace<FFT>(cudaError_t&) can be different for different FFT descriptions, +and is not the same as FFT::workspace_type. User should use auto when creating a workspace object.

+
+
+
+

Implicit Type Batching Trait

+
FFT::implicit_type_batching
+
+
+

Number of values from different FFTs batched into one element of type Description::value_type used in FFT computation. If +it’s higher than one it means that Block FFT object calculates multiple FFTs in one go.

+

The value is 2 if cufftdx::precision_of<FFT>::type is __half, and 1 otherwise.

+
+

Note

+

Please note that in future releases of cuFFTDx FFT::implicit_type_batching may be replaced, and/or extended.

+
+
+
+

Elements Per Thread Trait

+
FFT::elements_per_thread
+
+
+

Number of FFT elements of the type returned by Value Type Trait that each thread will compute.

+

The default value is determined heuristically to target performance (see Elements Per Thread Operator).

+
+
+

Storage Size Trait

+
FFT::storage_size
+
+
+

Number of Description::value_type elements that each thread must allocate to compute the FFT.

+

The default value is the same as Elements Per Thread Trait.

+
+
+

Stride Size Trait

+
FFT::stride
+
+
+

Stride between elements of the block FFT held by each thread in its input.

+

See also expected input data format.

+
+

Example

+

0-th thread of 8-point FFT with FFT::stride equal to 2 should have values 0, 2, 4, and 6 in its input.

+
+
+
+

FFTs Per Block Trait

+
FFT::ffts_per_block
+
+
+

Number of FFTs to compute in parallel within a CUDA block, as part of the collective FFT operation.

+

The default value is 1.

+
+
+

Suggested FFTs Per Block Trait

+
FFT::suggested_ffts_per_block
+
+
+

Suggested number of FFTs to compute in parallel within a CUDA block, as part of the collective FFT operation, to maximize performance.

+

The default value is heuristic, and depends on the size of the FFT, the number of elements per thread, and other parameters.

+
+
+

Shared Memory Size Trait

+
FFT::shared_memory_size
+
+
+

Size of the required shared memory for the FFT operation to execute, in bytes.

+

The default value is determined from FFTs Per Block Trait and Elements Per Thread Trait.

+
+
+

Block Dim Trait

+
FFT::block_dim
+
+
+

BlockDim<unsigned int X, Y, Z> with x = size_of<Description>::value/Description::elements_per_thread, y = Description::ffts_per_block, z = 1

+
+
+

Max Threads Per Block Trait

+
FFT::max_threads_per_block
+
+
+

Maximum number of threads for the FFT in the CUDA block.

+

The default value is determined from FFTs Per Block Trait and Elements Per Thread Trait.

+
+
+

Requires Workspace Trait

+
FFT::requires_workspace
+
+
+

Boolean value. If true, a workspace must be created and passed to the FFT::execute(...) method (see block execute methods). +Otherwise, it’s not necessary to create and pass a workspace. Workspace can be created using +cufftdx::make_workspace<FFT>(cudaError_t&) function. Workspace created for FFT which does not require one +will be empty and won’t allocate any global memory.

+
+
+

Workspace Size Trait

+
FFT::workspace_size
+
+
+

Informs how much global memory will be allocated by the required workspace. If Description::requires_workspace is false +it’s 0; otherwise it’s greater than zero.

+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/genindex.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/genindex.html new file mode 100644 index 0000000000000..9b7a8823bcc26 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/genindex.html @@ -0,0 +1,254 @@ + + + + + + Index — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Index
  • +
  • +
  • +
+
+
+
+
+ + +

Index

+ +
+ +
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/index.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/index.html new file mode 100644 index 0000000000000..547847a24ac4c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/index.html @@ -0,0 +1,315 @@ + + + + + + NVIDIA cuFFTDx — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • NVIDIA cuFFTDx
  • +
  • +
  • +
+
+
+
+
+ +
+

NVIDIA cuFFTDx

+

The cuFFT Device Extensions (cuFFTDx) library enables you to perform Fast Fourier Transform (FFT) calculations +inside your CUDA kernel. Fusing FFT with other operations can decrease the latency and improve the performance of +your application.

+
+
The documentation consists of two main components:
+
+
+
+
+

Highlights

+
    +
  • Fast Fourier Transform (FFT) embeddable into a CUDA kernel

  • +
  • High performance, no unnecessary data movement from and to global memory

  • +
  • Customizability, options to adjust selection of FFT routine for different needs (size, precision, batches etc.)

  • +
  • Ability to fuse FFT kernels with other operations saving global memory trips

  • +
  • Compatibility with future versions of the CUDA Toolkit

  • +
+
+
+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/introduction.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/introduction.html new file mode 100644 index 0000000000000..f22270ed5f69b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/introduction.html @@ -0,0 +1,634 @@ + + + + + + First FFT using cuFFTDx — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • First FFT using cuFFTDx
  • +
  • +
  • +
+
+
+
+
+ +
+

First FFT using cuFFTDx

+

In the following example, we will calculate an FFT of size 128 using a standalone +kernel. We start with an empty CUDA kernel:

+
// Empty kernel to compute an FFT of size 128 using float
+__global__ void fft_128_float(float2* data) {
+
+}
+
+
+

First, we have to provide an FFT description to the cuFFTDx library. A cuFFTDx transform description +is built using C++ constructs that are evaluated at compile time. A correctly-defined FFT must include +the problem size, the precision used (float, double, etc.), the type of operation (complex-to-complex, +real-to-complex, etc.), and its direction (forward, or inverse). We add the following lines:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>());
+}
+
+
+

In order to encode the FFT properties, cuFFTDx provides operators Size Operator, +Precision Operator, Type Operator, and Direction Operator. +Listed operators can be combined by using the addition operator (+).

+

To obtain a fully usable CUDA FFT kernel, we need to provide three additional +pieces of information. The first one is how many FFTs we would like to compute, +the second one is how to map the calculations into a CUDA block, and the +last one is what CUDA architecture we are targeting.

+

In cuFFTDx, we specify how many FFTs we want to compute using the FFTs Per Block Operator. +It defines how many FFTs to compute in parallel inside a single CUDA block. Let us +add that operator:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>());
+}
+
+
+

To map the computing of the FFT to the CUDA block, we use the Elements Per Thread Operator. +This operator determines the number of registers required per thread and the exact implementation +to be used. It also influences the required CUDA block size. We add that operator to the description:

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block with 8 elements per thread
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>());
+}
+
+
+

Finally, we use the SM Operator to indicate the target CUDA architecture +on which we want to build the FFT descriptor. Each GPU architecture can use different +parameters. Therefore, the choice of architecture potentially affects the configuration +to maximize performance. For this example, we target Volta GPUs (SM<700>()):

+
#include <cufftdx.hpp>
+
+// Kernel containing a descriptor of an FFT of size 128 using float
+// and one FFT per block with 8 elements per thread
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>());
+}
+
+
+

Once the FFT description is fully formed, we can finalize it by adding the +Block Operator. It indicates that we are asking for the +collective FFT operation to be performed by a single CUDA block. The operator +verifies correctness of the description, and it is a type of Execution Operators, +(the other being the Thread Operator).

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an
+// FFT of size 128 using float and one FFT per block
+// with 8 elements per thread, targeting Volta arch
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+}
+
+
+
+

What next?

+

FFT descriptions can be instantiated into objects. Forming the object has +no computational cost, and should be seen as a handle. The FFT descriptor object +provides a compute method, execute(...) that performs the requested FFT.

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  // Execute FFT
+  FFT().execute(/*What are the arguments?*/);
+}
+
+
+

cuFFTDx operations require registers and shared memory to operate. Users can query the FFT descriptor +for needed resources.

+
#include <cufftdx.hpp>
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution, where each thread allocates data in registers
+__global__ void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                      + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                      + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  complex_type thread_data[FFT::storage_size];
+
+  extern __shared__ complex_type shared_mem[];
+
+  // Execute FFT
+  FFT().execute(thread_data, shared_mem);
+}
+
+
+

Some FFTs, depending on the selected size, may also require additional global memory workspace, +which needs to be allocated on host and passed to the kernel. You can check if you have to create workspace +using the FFT::requires_workspace trait.

+
#include <cufftdx.hpp>
+
+using namespace cufftdx;
+
+using FFT = decltype(Size<151>() + Precision<double>() + Type<fft_type::c2c>()
+                    + Direction<fft_direction::inverse>() + FFTsPerBlock<2>()
+                    + ElementsPerThread<16>() + SM<700>() + Block());
+
+// Kernel containing a fully-formed descriptor of an FFT and its
+// execution, where each thread allocates data in registers
+__global__ void fft_128_float(float2* data, typename FFT::workspace_type workspace) {
+  using complex_type = typename FFT::value_type;
+
+  complex_type thread_data[FFT::storage_size];
+
+  extern __shared__ complex_type shared_mem[];
+
+  // Execute FFT
+  FFT().execute(thread_data, shared_mem, workspace);
+}
+
+
+

To launch a kernel we need to know the block size and required amount of shared memory needed to perform the FFT +operation. Both are fixed and determined by the FFT description.

+

Since we defined the FFT description in device code, information about the +block size needs to be propagated to the host. When all parameters are fully specified, +all GPU architectures use the same block size, so the kernel can be launched in +the same manner for all architectures.

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using complex_type = typename FFT::value_type;
+
+    complex_type thread_data[FFT::storage_size];
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+}
+
+// Host function, data is a managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  cudaError_t error_code = cudaSuccess;
+  auto workspace = make_workspace<FFT>(error_code);
+
+  block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>((complex_type*)data, workspace);
+}
+
+
+

If we also add input/output operations to global memory, we obtain a kernel that is +equivalent to the cuFFT kernel for size 128.

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using namespace cufftdx;
+
+    using complex_type = typename FFT::value_type;
+
+    // Local array and copy data into it
+    complex_type thread_data[FFT::storage_size];
+
+    const int stride = size_of<FFT>::value / FFT::elements_per_thread;
+
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      thread_data[i].x = data[threadIdx.x + i * stride].x;
+      thread_data[i].y = data[threadIdx.x + i * stride].y;
+    };
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+
+    // Save results
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      data[threadIdx.x + i * stride].x = thread_data[i].x;
+      data[threadIdx.x + i * stride].y = thread_data[i].y;
+    };
+}
+
+// Host function, data is a managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  using FFT = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                     + Direction<fft_direction::forward>() + FFTsPerBlock<1>()
+                     + ElementsPerThread<8>() + SM<700>() + Block());
+
+  using complex_type = typename FFT::value_type;
+
+  cudaError_t error_code = cudaSuccess;
+  auto workspace = make_workspace<FFT>(error_code);
+
+  block_fft_kernel<FFT><<<1, FFT::block_dim, FFT::shared_memory_size>>>((complex_type*)data, workspace);
+}
+
+
+

Unlike cuFFT, cuFFTDx does not require moving data back to global memory after +executing an FFT operation. This is a major performance advantage.

+
+
+

Compilation

+

In order to compile we only need to pass the location of the cuFFTDx library (the directory with the cufftdx.hpp file).

+
nvcc -std=c++11 -arch sm_70 -O3 -I<path_to_cuFFTDx_location> my_fft_kernel_128.cu -o my_fft_kernel_128
+
+
+
+

Note

+

Since version 0.3.0, cuFFTDx has experimental support for compilation with NVRTC.

+
+
+
+
+

Your next custom FFT kernels

+

For real world use cases, it is likely we will need more than a single kernel. +A single use case, aiming at obtaining the maximum performance on multiple architectures, +may require a number of different implementations. cuFFTDx was designed +to handle this burden automatically, while offering users full control over +the implementation details.

+

cuFFTDx allows the user to defer the definition of certain details of the implementation +(such as the number of FFT elements computed per thread, or the number of FFTs per block) +to the library. Let us apply this to our previous kernel:

+
#include <cufftdx.hpp>
+
+// Kernel
+template<class FFT>
+__launch_bounds__(FFT::max_threads_per_block)
+__global__ void block_fft_kernel(typename FFT::value_type* data, typename FFT::workspace_type workspace) {
+    using namespace cufftdx;
+
+    using complex_type = typename FFT::value_type;
+
+    // Local array and copy data into it
+    complex_type thread_data[FFT::storage_size];
+
+    const int stride = size_of<FFT>::value / FFT::elements_per_thread;
+
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      thread_data[i].x = data[threadIdx.x + i * stride].x;
+      thread_data[i].y = data[threadIdx.x + i * stride].y;
+    };
+
+    extern __shared__ complex_type shared_mem[];
+
+    // Execute FFT
+    FFT().execute(thread_data, shared_mem, workspace);
+
+    // Save results
+    for (int i = 0; i < FFT::elements_per_thread; ++i){
+      data[threadIdx.x + i * stride].x = thread_data[i].x;
+      data[threadIdx.x + i * stride].y = thread_data[i].y;
+    };
+}
+
+// Host function, data is managed memory pointer
+void fft_128_float(float2* data) {
+  using namespace cufftdx;
+
+  // Create a complete descriptor
+  using FFTComplete = decltype(Size<128>() + Precision<float>() + Type<fft_type::c2c>()
+                             + Direction<fft_direction::forward>() + SM<700>());
+
+  if(is_complete_fft<FFTComplete>::value == true) {
+
+    // Retrieve suggested elements per thread and FFTs per block and use them
+    // to create a complete descriptor
+    using FFTExecution = decltype(FFTComplete()
+                                + ElementsPerThread<FFTComplete::elements_per_thread>()
+                                + FFTsPerBlock<FFTComplete::suggested_ffts_per_block>()
+                                + Block());
+
+    using complex_type = typename FFTExecution::value_type;
+
+    cudaError_t error_code = cudaSuccess;
+    auto workspace = make_workspace<FFTExecution>(error_code);
+
+    block_fft_kernel<FFTExecution><<<1, FFTExecution::block_dim, FFTExecution::shared_memory_size>>>(
+        (complex_type*)data, workspace
+    );
+  }
+}
+
+
+

To retrieve the optimal parameters, we require a complete descriptor (as indicated by +cufftdx::is_complete_fft). This is because some of the details are only available +after the FFT operation has been fully described, and the target architecture has been +identified. SM Operator compiled on the host allows the user to query +launch parameters for a particular architecture.

+
+

What happens under the hood?

+
+
Expression templates

The cuFFTDx API is using a variation of a C++ technique called expression templates. +We use expression templates to allow the user to construct compile-time objects that +describe the FFT calculation to compute. Compile-time C++ mechanisms allow cuFFTDx to +attach optimized FFT routines to the object, and expose them as a compute method +that can be called by the user.

+
+
Header only

cuFFTDx FFT routines are shipped as optimized inline PTX.

+
+
+
+
+

Why?

+

For a library to be useful, it needs to abstract functionality in a future-proof manner. +By future-proof we mean that an existing user code should not need to be modified +in the future, and new functionality should consist of simple extensions to the +existing code. On the CUDA platform, this requires adapting to quickly evolving +GPU hardware.

+

cuFFTDx approaches future-proofing in two ways. On one hand, the API is a +source-level abstraction which decouples the library from ABI changes. +Along with the PTX code in headers, cuFFTDx is forward-compatible with any CUDA +toolkit, driver and compiler that supports hardware that cuFFTDx was released for. +PTX can be recompiled by the CUDA compiler to run on future GPU architectures.

+

On the other hand, the API organization allows preserving operators describing +what gets computed and how. New features depending on type can either be picked up +automatically if code defers implementation choices to the library, or require +adding operators to an existing expression.

+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/objects.inv b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/objects.inv new file mode 100644 index 0000000000000..7f7f67075acbf Binary files /dev/null and b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/objects.inv differ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/performance.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/performance.html new file mode 100644 index 0000000000000..e0628f1eb3ec8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/performance.html @@ -0,0 +1,335 @@ + + + + + + Achieving high performance — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Achieving high performance
  • +
  • +
  • +
+
+
+
+
+ +
+

Achieving high performance

+

In High-Performance Computing, the ability to write customized code enables +users to target better performance. In the case of cuFFTDx, the potential for +performance improvement of existing FFT applications is high, but it greatly +depends on how the library is used. Taking the regular cuFFT library as +baseline, the performance may be up to one order of magnitude better or worse. +For this reason porting existing sources to cuFFTDx should always be done in +parallel with performance analysis. Below we list general advice that +may help in this process.

+
+

General advice

+
    +
  • Try library-provided default settings to start with best compute performance

  • +
  • Best parameters for compute bound and memory bound kernels might not be identical

  • +
  • Ensure FFT kernel runs enough blocks to fill a GPU for peak performance

  • +
  • Merge adjacent memory bound kernels (pre- and post-processing) with an FFT kernel to save global memory trips

  • +
+
+
+

Memory management

+
    +
  • Avoid reading/writing data from global memory

  • +
  • Ensure global memory reads/writes are coalesced (increase the value of FFTs Per Block Operator if needed)

  • +
  • Use shared memory or extra registers to store the temporary data

  • +
+
+
+

Kernel fusion

+
    +
  • For complex kernels consider adjusting FFT operation to match user kernel +(ie. tweaking Elements Per Thread Operator will change required CUDA block size). Upcoming versions of +cuFFTDx will offer more customization options.

  • +
  • For simple operations consider merging operations into FFT kernel optimized +for FFT performance.

  • +
+
+
+

Advanced

+ +
+ +
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/release_notes.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/release_notes.html new file mode 100644 index 0000000000000..1c86b9d57e70e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/release_notes.html @@ -0,0 +1,290 @@ + + + + + + Release Notes — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Release Notes
  • +
  • +
  • +
+
+
+
+
+ +
+

Release Notes

+

This section includes significant changes, new features, performance improvements, and various issues. Unless noted, +listed issues should not impact functionality. When functionality is impacted, we offer a work-around to avoid the issue (if available).

+
+

1.0.0

+

The first general availability (GA) release of cuFFTDx library.

+
+

New Features

+ +
+
+

Resolved Issues

+
    +
  • ptxas warning program uses 32-bit address on line XXX which is conflicting with .address_size 64 shouldn’t appear anymore.

  • +
+
+
+
+

0.3.1

+

The last early access (EA) release of cuFFTDx library.

+
+

Known Issues

+
    +
  • ptxas warning about pointer size conflict:

    +
    ptxas warning : Program uses 32-bit address on line 'XXX' which is conflicting with .address_size 64
    +
    +
    +

    This warning may appear when compiling, but it does not impact functionality or performance.

    +
  • +
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/requirements_func.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/requirements_func.html new file mode 100644 index 0000000000000..e8a5ca86f54fc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/requirements_func.html @@ -0,0 +1,396 @@ + + + + + + Requirements and Functionality — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Requirements and Functionality
  • +
  • +
  • +
+
+
+
+
+ +
+

Requirements and Functionality

+
+
+

Requirements

+

The cuFFTDx library is a CUDA C++ header-only library. Therefore, the list of required software to use the library is relatively small. The user needs:

+
    +
  • CUDA Toolkit 11.0 or newer

  • +
  • Supported CUDA compiler

  • +
  • Supported host compiler (C++17 required)

  • +
  • (Optionally) CMake (version 3.18 or greater)

  • +
+
+

Supported Compilers

+

CUDA Compilers:

+
    +
  • NVCC 11.0.194+ (CUDA Toolkit 11.0 or newer)

  • +
  • (Experimental support) NVRTC 11.0.194+ (CUDA Toolkit 11.0 or newer)

  • +
+

Host / C++ Compilers:

+
    +
  • GCC 7+

  • +
  • Clang 9+ (only on Linux/WSL2)

  • +
  • Compiling with MSVC (Windows) is not supported

  • +
+
+

Note

+

cuFFTDx emits errors for unsupported versions of compilers, which can be silenced by defining CUFFTDX_IGNORE_DEPRECATED_COMPILER +during compilation. cuFFTDx is not guaranteed to work with versions of compilers that are not supported in cuFFTDx.

+
+
+

Note

+

cuFFTDx emits errors for unsupported versions of C++ standard, which can be silenced by defining CUFFTDX_IGNORE_DEPRECATED_DIALECT +during compilation. cuFFTDx is not guaranteed to work with versions of C++ standard that are not supported in cuFFTDx.

+
+
+
+
+

Supported Functionality

+
+
Supported functions include:
    +
  • Create block descriptors that run collective FFT operations (with one or more threads collaborating to compute one or more FFTs) in a single CUDA block. See Block Operator.

  • +
  • Create thread descriptors that run a single FFT operation per thread. This function might require more expertise with cuFFTDx in order to obtain correct results with higher performance. See Thread Operator.

  • +
  • Bi-directional information flow, from the user to the descriptor via Operators and from the descriptor to the user via Traits.

  • +
  • Target specific GPU architectures using the SM Operator. This enables users to configure the descriptor with suggested parameters to target performance.

  • +
+
+
+

cuFFTDx supports selected FFT sizes in the range [0; max_size] and all sizes in the range [0; max_size/2], where max_size depends on precision, type, +and CUDA architecture. However, not every combination of size, precision, elements per thread, and FFTs per block is correct and available. The following +table summarizes the available configurations:

+ +++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Type

Precision

Thread FFT Sizes

Block FFT Sizes

Architecture

Size Range

    +
  • Complex-to-complex

  • +
  • Real-to-complex

  • +
  • Complex-to-real

  • +
+

half

All sizes in range: [2; 32]

75

[2; 4096]

70;72;86

[2; 16384]

80

[2; 32768]

float

All sizes in range: [2; 32]

75

[2; 4096]

70;72;86

[2; 16384]

80

[2; 32768]

double

All sizes in range: [2; 16]

75

[2; 2048]

70;72;86

[2; 8192]

80

[2; 16384]

+
+

Note

+

cuFFTDx 0.3.0 added preliminary support for all sizes in range of [0; max_size/2]. Most sizes will require you to create additional workspace with global memory allocation. See Make Workspace Function +for more details about workspace. You can check if a given FFT requires workspace with the FFT::requires_workspace trait.

+
+
+

Workspace is not required for FFTs of following sizes:

+
    +
  • Powers of 2 up to 32768

  • +
  • Powers of 3 up to 19683

  • +
  • Powers of 5 up to 15625

  • +
  • Powers of 6 up to 1296

  • +
  • Powers of 7 up to 2401

  • +
  • Powers of 10 up to 10000

  • +
  • Powers of 11 up to 1331

  • +
  • Powers of 12 up to 1728

  • +
+
+
In the future versions of cuFFTDx:
    +
  • Workspace requirement may be removed for other configurations.

  • +
  • FFT configurations that do not require workspace will continue to do so.

  • +
+
+
+
+
+
Functionality not yet supported includes:
    +
  • Input/output stored in global memory. Input data must be in registers (local memory) or shared memory.

  • +
  • The BlockDim Operator, which enables fine-grain customization of the CUDA block dimensions.

  • +
+
+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/search.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/search.html new file mode 100644 index 0000000000000..469152f361622 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/search.html @@ -0,0 +1,269 @@ + + + + + + Search — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • Search
  • +
  • +
  • +
+
+
+
+
+ + + + +
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/searchindex.js b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/searchindex.js new file mode 100644 index 0000000000000..354e51f853d56 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({docnames:["api/empty_workspace_list","api/index","api/methods","api/operators","api/traits","index","introduction","performance","release_notes","requirements_func","warnings/bit_identical","warnings/bit_identical_sm"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,sphinx:56},filenames:["api/empty_workspace_list.rst","api/index.rst","api/methods.rst","api/operators.rst","api/traits.rst","index.rst","introduction.rst","performance.rst","release_notes.rst","requirements_func.rst","warnings/bit_identical.rst","warnings/bit_identical_sm.rst"],objects:{},objnames:{},objtypes:{},terms:{"0":[2,4,5,6,9],"1":[2,3,4,5,6,7],"10":[0,2,9],"10000":[0,2,9],"11":[0,2,6,9],"12":[0,2,9],"128":[2,3,4,6],"1296":[0,2,9],"1331":[0,2,9],"151":6,"15625":[0,2,9],"16":[3,6,9],"16384":[3,9],"17":9,"1728":[0,2,9],"18":9,"194":9,"19683":[0,2,9],"2":[0,2,3,4,6,7,9],"2048":9,"2401":[0,2,9],"2d":3,"3":[0,2,5,6,7,9],"32":[3,8,9],"32768":[0,2,3,9],"3d":3,"4":[2,4,7],"4096":9,"5":[0,2,7,9],"6":[0,2,4,7,9],"64":8,"7":[0,2,7,9],"70":9,"700":[3,6],"72":9,"720":3,"75":9,"750":3,"8":[2,3,4,6],"80":9,"800":3,"8192":9,"86":9,"860":3,"9":9,"abstract":6,"boolean":4,"byte":[2,4],"case":[6,7],"class":[2,6],"const":6,"default":[3,4,7,8],"do":[0,2,6,9],"final":6,"float":[2,3,4,6,9],"function":[1,3,4,5,6,7,8],"int":[3,4,6],
"long":2,"new":6,"return":[2,4],"true":[2,3,4,6],"try":7,"void":[2,6],"while":6,A:[2,5,6],By:6,For:[2,3,4,6,7],If:[2,3,4,6],In:[0,6,7,9],Is:3,It:[2,3,6,10,11],Not:3,On:6,One:[3,4],The:[2,3,4,5,6,8,9],There:[3,4],These:2,To:6,With:3,__global__:[2,6],__half:[3,4],__launch_bounds__:[2,6],__shared__:[2,6],_valuetyp:4,a100:8,abi:6,abil:[5,7],abl:4,about:[4,6,8,9],access:8,achiev:5,ad:[3,4,6,8,9],adapt:6,add:6,addit:[2,6,9],address:8,address_s:8,adjac:7,adjust:[5,7],advanc:5,advantag:6,advic:5,affect:6,after:[2,6],aim:6,align:2,alignof:2,all:[2,4,6,9],alloc:[2,4,6,9],allow:[3,6],along:6,also:[2,4,6],alwai:[4,7],amount:6,amper:[3,7],an:[2,3,4,5,6,7],analysi:7,ani:[2,4,6],anymor:8,api:[5,6,7,8],appear:8,appli:6,applic:[5,7],approach:6,ar:[2,3,4,6,7,9],arch:6,architectur:[2,3,6,9,11],architecur:[3,6],argument:[2,6],around:8,arrai:[2,6],ask:6,assum:3,attach:6,auto:[2,4,6],automat:6,avail:[2,4,6,7,8,9],avoid:[7,8],back:[2,6],baselin:7,batch:[2,5],becaus:6,been:[2,4,6],being:6,below:7,best:7,better:7,between:[2,4],bi:9,bit:[2,3,8,10,11],block:[1,6,7,8,9,10],block_dim:[2,3,4,6],block_fft_kernel:[2,6],blockdim:[2,4,9,10],both:[4,6],bound:7,build:[4,6],built:[4,6],burden:6,c2c:[2,3,4,6],c2r:[2,3,4],c:[6,7,9],calcul:[2,3,4,5,6,7,10],call:[2,6],can:[1,2,3,4,5,6,9],cast:[2,4],cc:3,certain:6,chang:[6,7,8],check:[4,6,9],choic:6,clang:9,cmake:9,coalesc:7,code:[2,6,7],collabor:9,collect:[3,4,6,9],com:7,combin:[3,6,9],commun:2,compat:[5,6],compil:[3,4,5,8],complet:[3,6],complex:[2,3,4,6,7,9],complex_typ:[2,4,6],compon:[1,5],comprehens:5,comput:[2,3,4,6,7,9],concurr:[2,3],configur:[0,2,4,6,9],conflict:8,consid:7,consist:[3,5,6],construct:[2,3,4,6],contain:[2,6],context:[3,4],continu:[0,2,9],control:6,cooper:3,copi:[2,6],correct:[6,9],correctli:[2,6],cost:6,cout:4,creat:[2,3,4,6,9],cu:6,cuda:[2,3,4,5,6,7,9,10,11],cudaerror_t:[2,4,6],cudamallocmanag:4,cudaoccupancymaxactiveblockspermultiprocessor:7,cudasuccess:[2,6],cuffdx:[3,6],cufft:[5,6,7],cufftdx:[0,2,3,4,7,8,9],cufftdx_ignore_deprecated
_compil:9,cufftdx_ignore_deprecated_dialect:9,cufttdx:9,custom:[3,5,7,9],customiz:5,data:[3,4,5,6,7,9],decltyp:[2,3,4,6],decoupl:6,decreas:5,defer:6,defin:[2,3,4,6,9],definit:6,depend:[4,6,7,9],describ:[2,3,6],descript:[1,2,6],descriptor:[2,3,4,6,9],design:6,destruct:2,detail:[4,6,9],determin:[3,4,6,7],devic:[4,5,6],differ:[2,3,4,5,6,10,11],dim3:4,dimens:[2,3,9,10],direct:[2,6,9,10],direction_of:4,directli:4,directori:6,divid:[3,4],divisor:3,doc:7,document:[5,8],doe:[4,6,8],doesn:[2,4],don:2,done:7,doubl:[2,3,4,6,9],double2:2,driver:6,dure:9,e:[2,3],ea:8,each:[2,3,4,6],earli:8,either:[3,4,6],element:[2,6,7,9,10],elements_per_thread:[3,4,6],elementsperthread:[2,3,4,6,8,10],embedd:5,emit:9,empti:[2,4,6],enabl:[5,7,9],encod:6,endl:4,enough:7,ensur:7,entir:7,equal:[2,4],equival:6,error:[2,4,9],error_cod:6,etc:[5,6],evalu:6,everi:9,evolv:6,exact:6,exactli:[2,3,11],exampl:[1,3,6],exclus:3,execut:[1,5,6,8,10,11],exist:[6,7],expect:[2,4],experiment:[6,9],expertis:9,expos:6,express:6,extend:4,extens:[5,6],extern:[2,6],extra:[4,7],f:3,fail:[3,4],fals:[2,3,4],fast:[3,5],featur:6,fft:[0,2,5,7,8,9,10,11],fft_128_float:6,fft_direct:[2,3,4,6],fft_input_size_in_byt:2,fft_output_size_in_byt:2,fft_type:[2,3,4,6],fftcomplet:6,fftexecut:6,ffts_per_block:[2,3,4],fftsperblock:[2,3,4,6,8,10],file:[4,6],fill:7,find:1,fine:9,first:[2,5,8],fit:2,fix:6,float2:[2,6],flow:9,follow:[0,2,3,4,6,9],form:[3,4,6],format:4,forward:[2,3,4,6],fourier:[3,5],fp16:2,free:2,freed:2,from:[2,4,5,6,7,9],full:[6,8],fulli:6,further:5,fuse:5,fusion:5,futur:[0,2,4,5,6,9],ga:8,gcc:9,gener:[3,5,8],get:6,given:9,global:[2,4,5,6,7,9],go:4,gpu:[2,3,6,7,9,11],grain:9,greater:[3,4,9],greatli:7,group:3,group__cudart__occup:7,guarante:[2,3,9,10,11],guid:7,ha:[2,4,6],half:[2,9],hand:6,handl:6,happen:5,hardwar:6,have:[2,4,6],header:[4,6,9],held:4,help:7,helper:[2,4],here:1,heurist:[3,4],high:5,higher:[4,9],hood:5,host:[6,9],how:[3,4,6,7],howev:[2,9],hpp:[2,3,4,6],html:7,http:7,i:[2,6],ident:[2,3,7,10,11],identifi:6,ie:7,imag
inari:2,imaginary_1:2,imaginary_2:2,impact:8,implement:[3,4,6],implicit_type_batch:[2,4],implicitli:2,improv:[5,7,8],includ:[2,3,4,6,8,9],increas:7,independ:3,index:[2,7],indic:6,influenc:6,inform:[4,6,9],inlin:6,input:[3,6,9],input_typ:4,insid:[3,5,6],instanti:6,invalid:2,invers:[3,4,6],iostream:4,is_complet:4,is_complete_fft:[4,6],is_complete_fft_execut:[2,4],is_fft:4,is_fft_execut:4,its:[2,4,6],kernel:[2,5,8],know:6,label:[4,6],last:[2,6,8],latenc:5,later:2,launch:[6,7],let:6,level:6,librari:[1,5,6,7,8,9],lifetim:[2,4],like:6,line:[6,8],linux:9,list:[6,7,8,9],load:[2,7],local:[2,6,9],locat:6,logic:2,lose:7,magnitud:7,mai:[0,2,4,6,7,8,9],main:[1,5],major:6,make:[1,4,9],make_workspac:[2,4,6],manag:[4,5,6],mani:6,manner:6,map:6,match:7,max_siz:9,max_threads_per_block:[2,4,6],maxim:[4,6],maximum:[2,4,6],mean:[2,4,6],mechan:6,memori:[5,6,9],merg:7,messag:4,method:[1,4,5,6,8],might:[7,9],miss:4,modifi:6,more:[2,3,4,6,7,9],most:9,move:6,movement:5,msvc:9,much:4,multipl:[4,6],must:[2,3,4,6,9],mutual:3,my_fft_kernel_128:6,n:2,namespac:6,natur:2,necessari:[3,4],need:[5,6,7,9],never:2,newer:9,next:5,none:4,note:[4,5],nsight:7,nsightcomput:7,number:[2,3,4,6,10],nvcc:[6,9],nvidia:7,nvrtc:[6,9],o3:6,o:6,object:[2,3,4,6],obtain:[6,9],occup:7,offer:[6,7,8],onc:6,one:[3,4,6,7,9],onli:[2,3,4,6,9],oper:[1,2,4,5,6,7,9],optim:[6,7,8],optimum:7,option:[5,7,9],order:[2,6,7,9],organ:6,other:[0,2,4,5,6,9],otherwis:[2,4],our:6,output:[3,6,9],output_typ:4,over:6,overload:2,overview:5,p:3,parallel:[3,4,6,7],paramet:[4,6,7,9],part:[2,4],particip:2,particular:6,pass:[2,4,6],path_to_cufftdx_loc:6,peak:7,per:[2,6,7,9,10],perform:[3,4,5,6,8,9],pick:6,piec:6,place:2,platform:6,pleas:4,point:[2,3,4],pointer:[2,6,8],port:7,post:7,potenti:[6,7],power:[0,2,3,8,9],practic:7,pre:7,precis:[2,5,6,9,10],precision_of:4,preliminari:9,preserv:6,previou:6,problem:6,process:[2,7],produc:[2,3,10,11],program:8,proof:6,propag:6,properti:6,provid:[4,5,6,7],ptx:6,ptxa:8,queri:6,quick:5,quickli:6,r2c:[2,3,4],race:2,
rang:[3,9],read:5,real:[2,3,6,9],real_1:2,real_2:2,reason:7,recompil:6,refer:[2,5],regist:[6,7,9],regular:7,rel:9,releas:[4,5,6],remov:[0,2,9],replac:4,request:[3,6],requir:[0,2,3,5,6,7],requires_workspac:[2,3,4,6,9],requiresworkspac:6,resourc:[6,7],respons:2,restor:8,restrict:3,result:[2,3,6,9,10,11],retriev:[4,6],routin:[5,6],rule:2,run:[2,3,4,6,7,9],runtim:7,s:[2,3,4],same:[2,3,4,6,10,11],save:[5,6,7],second:[2,6],section:[2,8],see:[2,3,4,8,9],seen:6,select:[5,6,9],separ:[3,7],set:[3,4,7],share:[6,7,8,9],shared_mem:[2,6],shared_memori:2,shared_memory_input:2,shared_memory_s:[2,4,6],ship:6,should:[2,3,4,6,7,8],shouldn:8,side:4,signific:8,silenc:9,simpl:[6,7],simultan:3,sinc:[2,6],singl:[2,3,6,9],size:[0,2,5,6,7,8,9,10],size_byt:4,size_of:[3,4,6],sizeof:4,sm70:8,sm80:8,sm:[4,6,9],sm_70:[3,6],sm_72:3,sm_75:3,sm_80:3,sm_86:3,small:9,smaller:[2,3],so:[0,2,6,9],softwar:9,solv:3,some:6,sourc:[6,7],specif:9,specifi:[3,6],standalon:6,standard:9,start:[5,6,7],std:[4,6],storage_s:[2,4,6],store:[2,7,9],stream:7,stride:[2,6,8],struct:3,suggest:[6,9],suggested_ffts_per_block:[4,6],summar:9,support:[3,5,6],t:[2,4,8],tabl:9,take:7,target:[3,4,6,7,8,9],techniqu:6,templat:[2,6],temporari:7,th:[2,4],than:[2,3,4,6],thei:[3,4],them:6,therefor:[6,9],thi:[2,3,4,6,7,8,9],those:2,thread:[1,6,7,9,10],thread_data:[2,6],threadidx:6,three:6,thu:2,time:6,toolkit:[5,6,9],total:4,track:[2,4],trait:[1,2,5,6,9],transform:[3,5,6],trip:[5,7],tune:[3,7],ture:[3,7],tweak:7,two:[2,3,5,6,8],type:[2,6,9,10],type_of:4,typenam:[2,4,6],u:3,under:5,underli:[2,3,4],unless:[3,4,8],unlik:6,unnecessari:5,unnorm:3,unsign:[3,4],unsupport:9,up:[0,2,6,7,9],upcom:7,upon:2,us:[2,3,4,5,7,8,9],usabl:6,usag:1,user:[2,3,4,6,7,9],v100:8,valid:[2,4],valu:[3,6,7,8],value_typ:[2,4,6],variat:6,variou:8,verifi:6,version:[0,2,5,6,7,9],via:9,volta:[3,6,7],wa:[2,4,6],wai:6,want:6,warn:8,we:[4,6,7,8],well:3,what:[5,7],when:[2,3,4,6,8],where:[2,6,9],which:[2,3,4,6,8,9],why:5,window:9,within:[2,3,4],without:7,won:4,work:[8,9],worksp
ac:[0,1,6,9],workspace_s:[2,4],workspace_typ:[2,4,6],world:6,wors:7,would:6,write:7,wsl2:9,x:[3,4,6],xxx:8,y:[3,4,6],yet:[3,9],you:[1,5,6,9],your:5,z:[3,4],zero:4},titles:["<no title>","cuFFTDx API Reference","Execution Methods","Operators","Traits","NVIDIA cuFFTDx","First FFT using cuFFTDx","Achieving high performance","Release Notes","Requirements and Functionality","<no title>","<no title>"],titleterms:{"0":8,"1":8,"3":8,"function":[2,9],"new":8,In:2,Is:4,achiev:7,advanc:7,advic:7,api:1,batch:4,block:[2,3,4],blockdim:3,compil:[6,9],complet:4,configur:3,cufftdx:[1,5,6],custom:6,data:2,descript:[3,4],dim:4,direct:[3,4],element:[3,4],exampl:[2,4],execut:[2,3,4],featur:8,fft:[3,4,6],first:6,format:2,further:7,fusion:7,gener:7,guid:5,happen:6,high:7,highlight:5,hood:6,implicit:4,input:[2,4],issu:8,kernel:[6,7],known:8,make:2,manag:7,max:4,memori:[2,4,7],method:2,next:6,note:8,nvidia:5,oper:3,output:[2,4],per:[3,4],perform:7,precis:[3,4],read:7,refer:[1,7],regist:2,releas:8,requir:[4,9],resolv:8,share:[2,4],size:[3,4],sm:3,storag:4,stride:4,suggest:4,support:9,thread:[2,3,4],trait:4,type:[3,4],under:6,us:6,user:5,valu:[2,4],what:6,why:6,workspac:[2,4],your:6}}) \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical.html new file mode 100644 index 0000000000000..9b02de0cfa38d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical.html @@ -0,0 +1,258 @@ + + + + + + <no title> — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Warning

+

It is not guaranteed that executions of the same FFTs (size, direction, type, precision) but with different

+ +

will produce bit-identical results.

+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical_sm.html b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical_sm.html new file mode 100644 index 0000000000000..96837921c736c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/docs/warnings/bit_identical_sm.html @@ -0,0 +1,253 @@ + + + + + + <no title> — cuFFTDx 1.0.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • »
  • +
  • <no title>
  • +
  • +
  • +
+
+
+
+
+ +
+

Warning

+

It is not guaranteed that executions of exactly the same FFTs on GPUs of different CUDA architectures will produce +bit-identical results.

+
+ + +
+
+
+ +
+ +
+

© Copyright 2022, NVIDIA Corporation.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/CMakeLists.txt b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/CMakeLists.txt new file mode 100644 index 0000000000000..8b857323eefff --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/CMakeLists.txt @@ -0,0 +1,267 @@ +cmake_minimum_required(VERSION 3.18.0) + +# cuFFTDxExamples project +project(cuFFTDxExamples LANGUAGES CXX CUDA) + +# Find CUDA Toolkit packaged, required for NVRTC sample +find_package(CUDAToolkit) + +# Project options +option(USE_MATHDX_PACKAGE "Use mathDx package to find cuFFTDx" ON) +option(USE_CUFFTDX_PACKAGE "Use cuFFTDx package to find cuFFTDx" OFF) + +if(DEFINED cufftdx_ROOT OR DEFINED ENV{cufftdx_ROOT}) + SET(USE_CUFFTDX_PACKAGE ON CACHE BOOL "Use cuFFTDx package to find cuFFTDx" FORCE) + SET(USE_MATHDX_PACKAGE OFF CACHE BOOL "Use mathDx package to find cuFFTDx" FORCE) +endif() + +if(DEFINED mathdx_ROOT OR DEFINED ENV{mathdx_ROOT}) + SET(USE_CUFFTDX_PACKAGE OFF CACHE BOOL "Use cuFFTDx package to find cuFFTDx" FORCE) + SET(USE_MATHDX_PACKAGE ON CACHE BOOL "Use mathDx package to find cuFFTDx" FORCE) +endif() + +if(NOT TARGET cufftdx) + if(USE_MATHDX_PACKAGE) + message(STATUS "Using mathDx package to find cuFFTDx") + # Find mathDx and cuFFTDx (mathDx's component) + # Default path: "/opt/nvidia/mathdx/22.2", path to mathDx can be passed cmake in mathdx_ROOT variable + find_package(mathdx REQUIRED COMPONENTS cufftdx CONFIG + PATHS + "${PROJECT_SOURCE_DIR}/../.." # example/cufftdx + "${PROJECT_SOURCE_DIR}/../../.." 
# include/cufftdx/example + "/opt/nvidia/mathdx/22.2" + ) + elseif(USE_CUFFTDX_PACKAGE) + message(STATUS "Using cuFFTDx package to find cuFFTDx") + # Find cuFFTDx + # Default path: "/opt/nvidia/mathdx/22.2/include/cufftdx", path to cuFFTDx can be passed cmake in cufftdx_ROOT variable + find_package(cufftdx REQUIRED CONFIG PATHS "/opt/nvidia/mathdx/22.2/include/cufftdx" "${PROJECT_SOURCE_DIR}/../../cufftdx") + else() + message(FATAL_ERROR "No cuFFTDx package found") + endif() +endif() + +if((NOT TARGET cufftdx) AND (NOT CUFFTDX_TEST_RELEASED_PACKAGE) AND (NOT MATHDX_TEST_RELEASED_PACKAGE)) + # Targeted CUDA Architectures, see https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES + if(CUFFTDX_TARGET_ARCHS) + set(CUFFTDX_TARGET_ARCHS 70;75;80 CACHE + STRING "[LEGACY] List of targeted cuFFTDx Example CUDA architectures (compute capabilities), for example \"70;75\". Can't be older than 70." + ) + list(SORT CUFFTDX_TARGET_ARCHS) + # Remove unsupported architectures + list(REMOVE_ITEM CUFFTDX_TARGET_ARCHS 30;32;35;37;50;52;53;60;61;62) + + # Translate legacy option CUFFTDX_TARGET_ARCHS into CUFFTDX_CUDA_ARCHITECTURES + set(CUFFTDX_TARGET_ARCHS_TRANSLATED) + foreach(ARCH ${CUFFTDX_TARGET_ARCHS}) + list(APPEND CUFFTDX_TARGET_ARCHS_TRANSLATED ${ARCH}-real) + endforeach() + set(CUFFTDX_CUDA_ARCHITECTURES ${CUFFTDX_TARGET_ARCHS_TRANSLATED} CACHE + STRING "List of targeted cuFFTDx CUDA architectures, for example \"70-real;75-real;80\"" + ) + else() + set(CUFFTDX_CUDA_ARCHITECTURES 70-real;75-real;80-real CACHE + STRING "List of targeted cuFFTDX CUDA architectures, for example \"70-real;75-real;80\"" + ) + # Remove unsupported architectures + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 30;32;35;37;50;52;53;60;61;62) + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 30-real;32-real;35-real;37-real;50-real;52-real;53-real;60-real;61-real;62-real) + list(REMOVE_ITEM CUFFTDX_CUDA_ARCHITECTURES 
30-virtual;32-virtual;35-virtual;37-virtual;50-virtual;52-virtual;53-virtual;60-virtual;61-virtual;62-virtual) + endif() + message(STATUS "Targeted cuFFTDx Examples CUDA Architectures: ${CUFFTDX_CUDA_ARCHITECTURES}") + + # Global CXX/CUDA flags + if(NOT MSVC) + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFTDX_CUDA_CXX_FLAGS} -Wall -Wextra") + else() + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + add_definitions(-D_CRT_NONSTDC_NO_WARNINGS) + add_definitions(-D_SCL_SECURE_NO_WARNINGS) + add_definitions(-DNOMINMAX) + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFT_CUDA_CXX_FLAGS} /W3") # Warning level + set(CUFFTDX_CUDA_CXX_FLAGS "${CUFFT_CUDA_CXX_FLAGS} /WX") # All warnings are errors + endif() + + # Global CXX flags/options + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CUFFTDX_CUDA_CXX_FLAGS}") + + # Global CUDA CXX flags/options + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_EXTENSIONS OFF) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${CUFFTDX_CUDA_CXX_FLAGS}\"") + + # Clang + if(BUILD_CUFFTDX) + if(CMAKE_CUDA_HOST_COMPILER MATCHES ".*clang.*") + # clang complains about unused function in CUDA system headers + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-unused-function") + endif() + endif() + + # CUDA Architectures + set(CMAKE_CUDA_ARCHITECTURES OFF) + + # Enable testing (ctest) + enable_testing() +endif() + +# ############################################################### +# add_cufftdx_example +# ############################################################### +function(add_cufftdx_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + 
target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + ) + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + target_compile_options(${EXAMPLE_TARGET} + PRIVATE + "$<$:SHELL:-Xfatbin -compress-all>" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# add_cufft_and_cufftdx_example +# ############################################################### +function(add_cufft_and_cufftdx_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + ) + if(CUFFTDX_EXAMPLES_CUFFT_CALLBACK) + if(TARGET cufft) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE cufft_static) + else() + target_link_libraries(${EXAMPLE_TARGET} PRIVATE CUDA::cufft_static) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) + target_compile_definitions(${EXAMPLE_TARGET} PRIVATE CUFFTDX_EXAMPLES_CUFFT_CALLBACK) + else() + if(TARGET cufft) + target_link_libraries(${EXAMPLE_TARGET} PRIVATE cufft) + else() + target_link_libraries(${EXAMPLE_TARGET} PRIVATE CUDA::cufft) + endif() + endif() + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + endif() + 
set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + target_compile_options(${EXAMPLE_TARGET} + PRIVATE + "$<$:SHELL:-Xfatbin -compress-all>" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# add_cufftdx_nvrtc_example +# ############################################################### +function(add_cufftdx_nvrtc_example GROUP_TARGET EXAMPLE_NAME EXAMPLE_SOURCES) + list(GET EXAMPLE_SOURCES 0 EXAMPLE_MAIN_SOURCE) + get_filename_component(EXAMPLE_TARGET ${EXAMPLE_MAIN_SOURCE} NAME_WE) + set_source_files_properties(${EXAMPLE_SOURCES} PROPERTIES LANGUAGE CUDA) + add_executable(${EXAMPLE_TARGET} ${EXAMPLE_SOURCES}) + target_link_libraries(${EXAMPLE_TARGET} + PRIVATE + $,mathdx::cufftdx,cufftdx::cufftdx> + CUDA::nvrtc + ) + if(NOT TARGET cufftdx) + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/example/cufftdx" + ) + target_compile_definitions(${EXAMPLE_TARGET} + PRIVATE + CUDA_INCLUDE_DIR="${CUDAToolkit_INCLUDE_DIRS}" + CUFFTDX_INCLUDE_DIRS="${cufftdx_INCLUDE_DIRS}" + ) + else() + target_compile_definitions(${EXAMPLE_TARGET} + PRIVATE + CUDA_INCLUDE_DIR="${CUDAToolkit_INCLUDE_DIRS}" + CUFFTDX_INCLUDE_DIRS="${CMAKE_SOURCE_DIR}/libcufftdx/include\\\;${CMAKE_BINARY_DIR}/libcufftdx/include" + ) + endif() + set_target_properties(${EXAMPLE_TARGET} + PROPERTIES + CUDA_ARCHITECTURES "${CUFFTDX_CUDA_ARCHITECTURES}" + ) + add_test(NAME ${EXAMPLE_NAME} COMMAND ${EXAMPLE_TARGET}) + set_tests_properties(${EXAMPLE_NAME} + PROPERTIES + LABELS "CUFFTDX_EXAMPLE" + ) + add_dependencies(${GROUP_TARGET} ${EXAMPLE_TARGET}) +endfunction() + +# ############################################################### +# cuFFTDx Examples +# 
############################################################### + +add_custom_target(cufftdx_examples) + +# CUFFTDX_EXAMPLES_CUFFT_CALLBACK +option(CUFFTDX_EXAMPLES_CUFFT_CALLBACK "Build cuFFTDx convolution_performance example with cuFFT callback" OFF) + +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_thread" simple_fft_thread.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_thread_fp16" simple_fft_thread_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block" simple_fft_block.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_half2" simple_fft_block_half2.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_fp16" simple_fft_block_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_r2c" simple_fft_block_r2c.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_r2c_fp16" simple_fft_block_r2c_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_c2r" simple_fft_block_c2r.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_c2r_fp16" simple_fft_block_c2r_fp16.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_shared" simple_fft_block_shared.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_std_complex" simple_fft_block_std_complex.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.simple_fft_block_cub_io" simple_fft_block_cub_io.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.convolution" convolution.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.convolution_r2c_c2r" convolution_r2c_c2r.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.block_fft_performance" block_fft_performance.cu) +add_cufftdx_example(cufftdx_examples "cuFFTDx.example.block_fft_performance_many" block_fft_performance_many.cu) +add_cufft_and_cufftdx_example(cufftdx_examples 
"cuFFTDx.example.convolution_performance" convolution_performance.cu) +add_cufftdx_nvrtc_example(cufftdx_examples "cuFFTDx.example.nvrtc_fft_thread" nvrtc_fft_thread.cu) \ No newline at end of file diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/Makefile b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/Makefile new file mode 100644 index 0000000000000..4481f8532f920 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/Makefile @@ -0,0 +1,35 @@ +NVCC=nvcc +NVCC_FLAGS=-std=c++17 -O3 --generate-code arch=compute_70,code=sm_70 --generate-code arch=compute_75,code=sm_75 --generate-code arch=compute_80,code=sm_80 --generate-code arch=compute_86,code=sm_86 + +CUFFTDX_INCLUDE_DIR=../include/ +CUDA_BIN_DIR=$(shell dirname `which $(NVCC)`) +CUDA_INCLUDE_DIR=$(CUDA_BIN_DIR)/../include +NVRTC_DEFINES=-DCUDA_INCLUDE_DIR="\"$(CUDA_INCLUDE_DIR)\"" -DCUFFTDX_INCLUDE_DIRS="\"$(CUFFTDX_INCLUDE_DIR)\"" + +SRCS=$(filter-out nvrtc_fft_thread.cu convolution_performance.cu, $(wildcard *.cu)) +TARGETS=$(patsubst %.cu,%,$(SRCS)) + +NVRTC_SRCS=$(wildcard nvrtc_*.cu) +NVRTC_TARGETS=$(patsubst %.cu,%,$(NVRTC_SRCS)) + +CUFFT_SRCS=convolution_performance.cu +CUFFT_TARGETS=convolution_performance + +$(TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) + +$(NVRTC_TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(NVRTC_DEFINES) -lnvrtc -lcuda + +$(CUFFT_TARGETS): %: %.cu + $(NVCC) -o $@ $< $(NVCC_FLAGS) -I$(CUFFTDX_INCLUDE_DIR) $(NVRTC_DEFINES) -lcuda -lcufft + +.PHONY: all clean + +all: $(TARGETS) $(NVRTC_TARGETS) $(CUFFT_TARGETS) + $(echo $(NVRTC_TARGETS)) + +clean: + rm -f $(TARGETS) $(NVRTC_TARGETS) $(CUFFT_TARGETS) + +.DEFAULT_GOAL := all diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.cu new file mode 100644 index 0000000000000..2236983fce1ff --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.cu @@ -0,0 +1,32 @@ +#include +#include +#include +#include + +#include "block_fft_performance.hpp" + +template +void block_fft_performance() { + using namespace cufftdx; + + using fft_base = decltype(Block() + Type() + Direction() + + Precision() + SM()); + + static constexpr unsigned int elements_per_thread = 8; + static constexpr unsigned int fft_size = 512; + static constexpr unsigned int ffts_per_block = 1; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)) + benchmark_block_fft(stream, true); + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); +} + +template +struct block_fft_performance_functor { + void operator()() { return block_fft_performance(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.hpp new file mode 100644 index 0000000000000..af3d40258d0cf --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance.hpp @@ -0,0 +1,185 @@ +#ifndef CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ +#define CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ + +#include +#include +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "random.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data, + unsigned int repeats, + typename FFT::workspace_type workspace) { + using complex_type = typename 
FFT::value_type; + extern __shared__ complex_type shared_mem[]; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + +// Execute FFT +#pragma unroll 1 + for (unsigned int i = 0; i < repeats; i++) { + FFT().execute(thread_data, shared_mem, workspace); + } + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +template +struct measure { + // Returns execution time in ms + template + static float execution(Kernel&& kernel, cudaStream_t stream) { + cudaEvent_t startEvent, stopEvent; + CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + for (size_t i = 0; i < WarmUpRuns; i++) { + kernel(); + } + + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream)); + kernel(); + CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + float time; + CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent)); + return time; + } +}; + +template +void benchmark_block_fft(const cudaStream_t& stream, bool verbose = false) { + using namespace cufftdx; + + // Create complete FFT description, only now we can query EPT and suggested FFTs per block + using FFT_complete = decltype(FFTBase() + Size()); + + static constexpr unsigned int inside_repeats = 4000; + static constexpr unsigned int kernel_repeats = 1; + static constexpr unsigned int warm_up_runs = 1; + + static constexpr unsigned int fft_size = S; + static constexpr unsigned int elements_per_thread = UseSuggested 
? FFT_complete::elements_per_thread : EPT; + static constexpr unsigned int ffts_per_block = UseSuggested ? FFT_complete::suggested_ffts_per_block : FPB; + + using FFT = decltype(FFT_complete() + ElementsPerThread() + FFTsPerBlock()); + using complex_type = typename FFT::value_type; + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, FFT::shared_memory_size)); + + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + block_fft_kernel, + FFT::block_dim.x * FFT::block_dim.y * FFT::block_dim.z, + FFT::shared_memory_size)); + + unsigned int multiprocessor_count = example::get_multiprocessor_count(); + unsigned int cuda_blocks = blocks_per_multiprocessor * multiprocessor_count; + + // The memory required to run fft (number of complex_type values that must be allocated). + // For r2c, the input consists of fft_size real numbers and the output consists of (fft_size / 2 + 1) complex numbers. + // One memory block will be used to store input and output, so the memory block must fit + // max((fft_size + 1) / 2, fft_size / 2 + 1) = (fft_size / 2 + 1) complex numbers. + // For c2r, the input consists of (fft_size / 2 + 1) complex numbers and the output consists of fft_size real numbers, + // so the minimal required memory size is the same. + unsigned int input_size = + ffts_per_block * cuda_blocks * (type_of::value == fft_type::c2c ? 
fft_size : (fft_size / 2 + 1)); + + // Host data + std::vector input = + example::get_random_complex_data(input_size, -10, 10); + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + CUDA_CHECK_AND_EXIT(error_code); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + + // Measure performance of N trials + double ms_n = measure<>::execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + block_fft_kernel<<>>( + device_buffer, inside_repeats, workspace); + } + }, + stream); + + // Check kernel error + CUDA_CHECK_AND_EXIT(cudaGetLastError()); + + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Measure performance of 2*N trials + double ms_n2 = measure<>::execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + block_fft_kernel<<>>( + device_buffer, 2 * inside_repeats, workspace); + } + }, + stream); + + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + // Time for N repeats without overhead + auto time_n = ms_n2 - ms_n; + double gflops = 1.0 * kernel_repeats * inside_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_n / 1000000.0; + + static const std::string fft_type_name = type_of::value == fft_type::c2c ? "c2c" : + (type_of::value == fft_type::c2r ? 
"c2r" : + "r2c"); + if (verbose) { + std::cout << "FFT type: " << fft_type_name << std::endl; + std::cout << "FFT size: " << fft_size << std::endl; + std::cout << "FFTs elements per thread: " << FFT::elements_per_thread << std::endl; + std::cout << "FFTs per block: " << ffts_per_block << std::endl; + std::cout << "CUDA blocks: " << cuda_blocks << std::endl; + std::cout << "Blocks per multiprocessor: " << blocks_per_multiprocessor << std::endl; + std::cout << "FFTs run: " << ffts_per_block * cuda_blocks << std::endl; + std::cout << "Shared memory: " << FFT::shared_memory_size << std::endl; + std::cout << "Avg Time [ms_n]: " << time_n / (inside_repeats * kernel_repeats) << std::endl; + std::cout << "Time (all) [ms_n]: " << time_n << std::endl; + std::cout << "Performance [GFLOPS]: " << gflops << std::endl; + } else { + std::cout << fft_type_name << ", " << fft_size << ", " << gflops << ", " + << time_n / (inside_repeats * kernel_repeats) << ", " << std::endl; + } +} + +#endif // CUFFTDX_EXAMPLE_BLOCK_FFT_PERFORMANCE_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance_many.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance_many.cu new file mode 100644 index 0000000000000..6eadfcd2acda7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_fft_performance_many.cu @@ -0,0 +1,82 @@ +#include +#include +#include +#include + +#include "block_fft_performance.hpp" + +template +void block_fft_performance(const cudaStream_t& stream, bool verbose) { + using namespace cufftdx; + + using FFT_base = decltype(Block() + Type() + Precision() + SM()); + + using FFT_with_direction = typename std:: + conditional()), FFT_base>::type; + + benchmark_block_fft(stream, verbose); + + if (verbose) + std::cout << std::endl; +} + +template +struct block_fft_performance_functor { + void 
operator()() { + using namespace cufftdx; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)) + + bool default_verbose = false; + + + // To specify EPT and FPB values, set UsedSuggested to false. + // FFTDirection is used if and only if FFTType is C2C. + // Below is an example of a test run with specified EPT and FPB values. + + block_fft_performance(stream, + default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + block_fft_performance(stream, default_verbose); + + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); + } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_io.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_io.hpp new file mode 100644 index 
0000000000000..9c82a24985564 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/block_io.hpp @@ -0,0 +1,394 @@ + +#ifndef CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ +#define CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ + +#include "fp16_common.hpp" + +namespace example { + namespace __io { + template + inline __device__ cufftdx::complex<__half2> convert_to_rrii(const cufftdx::complex<__half2>& value) { + return to_rrii(value); + } + template<> + inline __device__ cufftdx::complex<__half2> convert_to_rrii(const cufftdx::complex<__half2>& value) { + return value; + } + template + inline __device__ cufftdx::complex<__half2> convert_to_riri(const cufftdx::complex<__half2>& value) { + return to_riri(value); + } + template<> + inline __device__ cufftdx::complex<__half2> convert_to_riri(const cufftdx::complex<__half2>& value) { + return value; + } + } // namespace __io + + template + struct io { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static inline __device__ unsigned int stride_size() { + return FFT::stride; + } + + static inline __device__ unsigned int batch_offset(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? 
blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return cufftdx::size_of::value * global_fft_id; + } + + template + static inline __device__ void copy(const DataType* source, DataType* target, unsigned int n) { + unsigned int stride = blockDim.x * blockDim.y; + unsigned int index = threadIdx.y * blockDim.x + threadIdx.x; + for (int step = 0; step < FFT::elements_per_thread; step++) { + if (index < n) { + target[index] = source[index]; + } + index += stride; + } + } + + template + static inline __device__ void load_to_smem(const DataType* global, unsigned char* shared) { + if (cufftdx::type_of::value == cufftdx::fft_type::c2c) { + unsigned int input_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::c2r) { + unsigned int input_length = blockDim.y * ((cufftdx::size_of::value / 2) + 1); + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::r2c) { + unsigned int input_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(global), + reinterpret_cast(shared), + input_length); + } + __syncthreads(); + } + + template + static inline __device__ void store_from_smem(const unsigned char* shared, DataType* global) { + __syncthreads(); + if (cufftdx::type_of::value == cufftdx::fft_type::c2c) { + unsigned int output_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(shared), + reinterpret_cast(global), + output_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::c2r) { + unsigned int output_length = blockDim.y * cufftdx::size_of::value; + copy(reinterpret_cast(shared), + reinterpret_cast(global), + output_length); + } else if (cufftdx::type_of::value == cufftdx::fft_type::r2c) { + unsigned int output_length = blockDim.y * ((cufftdx::size_of::value / 2) + 1); + copy(reinterpret_cast(shared), + 
reinterpret_cast(global), + output_length); + } + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_c2c((ComplexType*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_c2r((ComplexType*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto load(const void* input, + ComplexType* thread_data, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return load_r2c((scalar_type*)input, thread_data, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_c2c(thread_data, (ComplexType*)output, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_c2r(thread_data, (scalar_type*)output, local_fft_id); + } + + template::value, class ComplexType = complex_type> + static inline __device__ auto store(const ComplexType* thread_data, + void* output, + const unsigned int local_fft_id) -> + typename std::enable_if::type { + return store_r2c(thread_data, (ComplexType*)output, local_fft_id); + } + + // input - global input with all FFTs + // thread_data - local thread array to load values from input to + // local_fft_id - ID of FFT batch in CUDA block + template + static inline __device__ void load_c2c(const ComplexType* input, + 
ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = input[index]; + index += stride; + } + } + + // If InputInRRIILayout is false, then function assumes that values in input are in RIRI + // layout, and before loading them to thread_data they are converted to RRII layout. + // Otherwise, if InputInRRIILayout is true, then function assumes values in input are in RRII + // layout, and don't need to be converted before loading to thread_data. + template + static inline __device__ void load(const cufftdx::complex<__half2>* input, + cufftdx::complex<__half2>* thread_data, + unsigned int local_fft_id) { + static_assert(std::is_same>::value, + "This can be only used with half precision FFTs"); + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = __io::convert_to_rrii(input[index]); + index += stride; + } + } + + template + static inline __device__ void store_c2c(const ComplexType* thread_data, + ComplexType* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = thread_data[i]; + index += stride; + } + } + + // Function assumes that values in thread_data are in RRII layout. 
+ // If OutputInRRIILayout is false, values are saved into output in RIRI layout; otherwise - in RRII. + template + static inline __device__ void store(const cufftdx::complex<__half2>* thread_data, + cufftdx::complex<__half2>* output, + unsigned int local_fft_id) { + static_assert(std::is_same>::value, + "This can be only used with half precision FFTs"); + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = __io::convert_to_riri(thread_data[i]); + index += stride; + } + } + + static inline __device__ unsigned int batch_offset_r2c(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return ((cufftdx::size_of::value / 2) + 1) * global_fft_id; + } + + template + static inline __device__ void load_r2c(const scalar_type* input, + ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + reinterpret_cast(thread_data)[i] = input[index]; + index += stride; + } + } + + template + static inline __device__ void store_r2c(const ComplexType* thread_data, + ComplexType* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset_r2c(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + output[index] = thread_data[i]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; 
+ constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element + constexpr unsigned int values_left_to_store = + threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if (threadIdx.x < values_left_to_store) { + output[index] = thread_data[FFT::elements_per_thread / 2]; + } + } + + // Function assumes that values in thread_data are in RRII layout. + // If OutputInRRIILayout is false, values are saved into output in RIRI layout; otherwise - in RRII. + template + static inline __device__ void store_r2c(const cufftdx::complex<__half2>* thread_data, + cufftdx::complex<__half2>* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset_r2c(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + output[index] = __io::convert_to_riri(thread_data[i]); + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_store = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to store one more element + constexpr unsigned int values_left_to_store = + threads_per_fft == 1 ? 1 : (output_values_to_store % threads_per_fft); + if (threadIdx.x < values_left_to_store) { + output[index] = __io::convert_to_riri(thread_data[FFT::elements_per_thread / 2]); + } + } + + static inline __device__ unsigned int batch_offset_c2r(unsigned int local_fft_id) { + unsigned int global_fft_id = + FFT::ffts_per_block == 1 ? 
blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return ((cufftdx::size_of::value / 2) + 1) * global_fft_id; + } + + template + static inline __device__ void load_c2r(const ComplexType* input, + ComplexType* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset_c2r(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + thread_data[i] = input[index]; + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = + threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if (threadIdx.x < values_left_to_load) { + thread_data[FFT::elements_per_thread / 2] = input[index]; + } + } + + // If InputInRRIILayout is false, then function assumes that values in input are in RIRI + // layout, and before loading them to thread_data they are converted to RRII layout. + // Otherwise, if InputInRRIILayout is true, then function assumes values in input are in RRII + // layout, and don't need to be converted before loading to thread_data. 
+ template + static inline __device__ void load_c2r(const cufftdx::complex<__half2>* input, + cufftdx::complex<__half2>* thread_data, + unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset_c2r(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread / 2; i++) { + thread_data[i] = __io::convert_to_rrii(input[index]); + index += stride; + } + constexpr unsigned int threads_per_fft = cufftdx::size_of::value / FFT::elements_per_thread; + constexpr unsigned int output_values_to_load = (cufftdx::size_of::value / 2) + 1; + // threads_per_fft == 1 means that EPT == SIZE, so we need to load one more element + constexpr unsigned int values_left_to_load = + threads_per_fft == 1 ? 1 : (output_values_to_load % threads_per_fft); + if (threadIdx.x < values_left_to_load) { + thread_data[FFT::elements_per_thread / 2] = __io::convert_to_rrii(input[index]); + } + } + + template + static inline __device__ void store_c2r(const ComplexType* thread_data, + scalar_type* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = reinterpret_cast(thread_data)[i]; + index += stride; + } + } + }; + + template + struct io_fp16 { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + static_assert(std::is_same::value, "This IO class is only for half precision FFTs"); + + static inline __device__ unsigned int stride_size() { + return cufftdx::size_of::value / FFT::elements_per_thread; + } + + static inline __device__ unsigned int batch_offset(unsigned int local_fft_id) { + unsigned int 
global_fft_id = + FFT::ffts_per_block == 1 ? blockIdx.x : (blockIdx.x * FFT::ffts_per_block + local_fft_id); + return cufftdx::size_of::value * global_fft_id; + } + + static inline __device__ void load(const __half2* input, complex_type* thread_data, unsigned int local_fft_id) { + // Calculate global offset of FFT batch + const unsigned int offset = batch_offset(local_fft_id); + // Get stride, this shows how elements from batch should be split between threads + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + const unsigned int batch_stride = FFT::ffts_per_block * cufftdx::size_of::value * blockDim.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = to_rrii(input[index], input[index + batch_stride]); + index += stride; + } + } + + static inline __device__ void store(const complex_type* thread_data, + __half2* output, + unsigned int local_fft_id) { + const unsigned int offset = batch_offset(local_fft_id); + const unsigned int stride = stride_size(); + unsigned int index = offset + threadIdx.x; + const unsigned int batch_stride = FFT::ffts_per_block * cufftdx::size_of::value * blockDim.x; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + output[index] = to_ri1(thread_data[i]); + output[index + batch_stride] = to_ri2(thread_data[i]); + index += stride; + } + } + }; +} // namespace example + +#endif // CUFFTDX_EXAMPLE_BLOCK_IO_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/common.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/common.hpp new file mode 100644 index 0000000000000..f6e96ea221c17 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/common.hpp @@ -0,0 +1,76 @@ +#ifndef CUFFTDX_EXAMPLE_COMMON_HPP_ +#define CUFFTDX_EXAMPLE_COMMON_HPP_ + +#include + +#ifndef CUDA_CHECK_AND_EXIT +# define 
CUDA_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != cudaSuccess) { \ + std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif // CUDA_CHECK_AND_EXIT + +#ifndef CUFFT_CHECK_AND_EXIT +# define CUFFT_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != CUFFT_SUCCESS) { \ + std::cout << status << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#endif // CUFFT_CHECK_AND_EXIT + +namespace example { + inline unsigned int get_cuda_device_arch() { + int device; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + + int major = 0; + int minor = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + + return static_cast(major) * 100 + static_cast(minor) * 10; + } + + inline unsigned int get_multiprocessor_count(int device) { + int multiprocessor_count = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&multiprocessor_count, cudaDevAttrMultiProcessorCount, device)); + return multiprocessor_count; + } + + inline unsigned int get_multiprocessor_count() { + int device = 0; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + return get_multiprocessor_count(device); + } + + template class Functor> + inline int sm_runner() { + // Get CUDA device compute capability + const auto cuda_device_arch = get_cuda_device_arch(); + + switch (cuda_device_arch) { + // All SM supported by cuFFTDx + case 700: Functor<700>()(); return 0; + case 720: Functor<720>()(); return 0; + case 750: Functor<750>()(); return 0; + case 800: Functor<800>()(); return 0; + case 860: Functor<860>()(); return 0; + default: { + if (cuda_device_arch > 800) { + Functor<800>()(); + return 0; + } + } + } + return 1; + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_COMMON_HPP_ diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution.cu new file mode 100644 index 0000000000000..099836bafa11a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution.cu @@ -0,0 +1,101 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void convolution_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + IFFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// This example demonstrates how to use cuFFTDx t operform a convolution using one-dimensional FFTs. +// +// One block is run, it calculates two 128-point convolutions by first doing forward FFT, then +// applying pointwise operation, and ending with inverse FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. 
+template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int ffts_per_block = 2; + static constexpr unsigned int fft_size = 128; + // FFT_base defined common options for FFT and IFFT. FFT_base is not a complete FFT description. + // In order to complete FFT description directions are specified: forward for FFT, inverse for IFFT. + using FFT_base = decltype(Block() + Size() + Type() + Precision() + + ElementsPerThread<8>() + FFTsPerBlock() + SM()); + using FFT = decltype(FFT_base() + Direction()); + using IFFT = decltype(FFT_base() + Direction()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = ffts_per_block * fft_size; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes convolution kernel with FFT::block_dim threads in CUDA block + convolution_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct convolution_functor { + void operator()() { return convolution(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_performance.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_performance.cu new file mode 100644 index 0000000000000..aeb0e71403700 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_performance.cu @@ -0,0 +1,384 @@ +#include +#include +#include +#include + +#include +#include +#include +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +#include +#endif + +#include "block_io.hpp" +#include "common.hpp" +#include "random.hpp" + +// Returns execution time in ms +template +float measure_execution(Kernel&& kernel, cudaStream_t stream) { + cudaEvent_t startEvent, stopEvent; + CUDA_CHECK_AND_EXIT(cudaEventCreate(&startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventCreate(&stopEvent)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + for (size_t i = 0; i < WarmUpRuns; i++) { + kernel(); + } + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + CUDA_CHECK_AND_EXIT(cudaEventRecord(startEvent, stream)); + kernel(); + CUDA_CHECK_AND_EXIT(cudaEventRecord(stopEvent, stream)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + float time; + CUDA_CHECK_AND_EXIT(cudaEventElapsedTime(&time, startEvent, stopEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(startEvent)); + CUDA_CHECK_AND_EXIT(cudaEventDestroy(stopEvent)); + return time; +} + + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void convolution_kernel(typename FFT::value_type* data, + typename FFT::workspace_type workspace, + typename IFFT::workspace_type workspace_inverse) { + using complex_type = typename FFT::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + + // Load 
data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem, workspace); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + IFFT().execute(thread_data, shared_mem, workspace_inverse); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// Scaling kernel; transforms data between cuFFTs. +template +__global__ void scaling_kernel(cufftComplex* data, + const unsigned int input_size, + const unsigned int ept) { + + static constexpr float scale = 1.0 / fft_size; + + cufftComplex temp; + unsigned int index = blockDim.x * blockIdx.x + threadIdx.x; + + for (int i = 0; i < ept; i++) { + if (index < input_size) { + temp = data[index]; + temp.x *= scale; + temp.y *= scale; + data[index] = temp; + index += blockDim.x * gridDim.x; + } + } +} + +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +template +__device__ cufftComplex scaling_callback(void *dataIn, + size_t offset, + void *callerInfo, + void *sharedPtr) { + static constexpr float scale = 1.0 / fft_size; + + cufftComplex value = static_cast(dataIn)[offset]; + value.x *= scale; + value.y *= scale; + return value; +} + +__device__ __managed__ cufftCallbackLoadC scaling_callback_ptr = scaling_callback<128>; +#endif + +template +double measure_cufftdx(const unsigned int& kernel_repeats, + const unsigned int& cuda_blocks, + typename FFT::value_type* device_buffer, + cudaStream_t stream) { + + using namespace cufftdx; + using complex_type = typename FFT::value_type; + + // create workspaces for FFT and IFFT + cudaError_t error_code = cudaSuccess; + auto workspace = make_workspace(error_code); + CUDA_CHECK_AND_EXIT(error_code); + auto workspace_inverse = make_workspace(error_code); + 
CUDA_CHECK_AND_EXIT(error_code); + + // run cuFFTDx + double time = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + // There are (ffts_per_block * fft_size * cuda_blocks) elements + convolution_kernel<<>>( + device_buffer, workspace, workspace_inverse); + } + }, + stream); + + return time; +} + +template +double measure_cufft(const unsigned int& kernel_repeats, + const unsigned int& batch_size, + cufftComplex* device_buffer, + cudaStream_t stream) { + + static constexpr unsigned int block_dim_scaling_kernel = 1024; + + // Calculating parameters for scaling_kernel execution. + // Get maximum number of running CUDA blocks per multiprocessor. + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + scaling_kernel, + block_dim_scaling_kernel, + 0)); + + // Get maximum number of CUDA blocks running on all multiprocessors. + // This many CUDA blocks will be run for simple_kernel. 
+ const unsigned int cuda_blocks = blocks_per_multiprocessor * example::get_multiprocessor_count(); + + const unsigned int input_length = fft_size * batch_size; + const unsigned int elements_per_block = (input_length + cuda_blocks - 1) / cuda_blocks; + const unsigned int elements_per_thread = (elements_per_block + block_dim_scaling_kernel - 1) / block_dim_scaling_kernel; + + // prepare cuFFT runs + cufftHandle plan; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan, stream)); + + // run convolution + double time_cufft = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + + if (cufftExecC2C(plan, device_buffer, device_buffer, CUFFT_FORWARD) != CUFFT_SUCCESS) { + fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); + return; + } + + scaling_kernel + <<>>(device_buffer, input_length, elements_per_thread); + + if (cufftExecC2C(plan, device_buffer, device_buffer, CUFFT_INVERSE) != CUFFT_SUCCESS) { + fprintf(stderr, "CUFFT error: ExecC2C Inverse failed"); + return; + } + } + }, + stream); + + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan)); + return time_cufft; +} + +#ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK +template +double measure_cufft_callback(const unsigned int& kernel_repeats, + const unsigned int& batch_size, + cufftComplex* device_buffer, + cudaStream_t stream) { + + static constexpr unsigned int block_dim_scaling_kernel = 1024; + + // Calculating parameters for scaling_kernel execution. + // Get maximum number of running CUDA blocks per multiprocessor. 
+ int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + scaling_kernel, + block_dim_scaling_kernel, + 0)); + // prepare cuFFT runs + cufftHandle plan_in; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan_in, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan_in, stream)); + cufftHandle plan_out; + CUFFT_CHECK_AND_EXIT(cufftPlan1d(&plan_out, fft_size, CUFFT_C2C, batch_size)); + CUFFT_CHECK_AND_EXIT(cufftSetStream(plan_out, stream)); + + // Set input callback + CUFFT_CHECK_AND_EXIT(cufftXtSetCallback(plan_in, + reinterpret_cast(&scaling_callback_ptr), + CUFFT_CB_LD_COMPLEX, + nullptr)); + + // run convolution + double time_cufft = measure_execution( + [&]() { + for (unsigned int i = 0; i < kernel_repeats; i++) { + CUFFT_CHECK_AND_EXIT(cufftExecC2C(plan_in, device_buffer, device_buffer, CUFFT_FORWARD)); + CUFFT_CHECK_AND_EXIT(cufftExecC2C(plan_out, device_buffer, device_buffer, CUFFT_INVERSE)); + } + }, + stream); + + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan_in)); + CUFFT_CHECK_AND_EXIT(cufftDestroy(plan_out)); + return time_cufft; +} +#endif // CUFFTDX_EXAMPLES_CUFFT_CALLBACK + +// This example compares performance of cuFFT and cuFFTDx when performing C2C convolution. +// Data is generated on host, copied to device buffer and processed by FFTs. +// Each cuFFTDx execution runs one kernel, each cuFFT execution - three kernels. +// The experiment runs with the following principles: +// - at least 1GB of data is allocated in GPU and transformed by both convolutions, +// - for cuFFTDx kernel run, number of CUDA blocks is divisible +// by maximum number of CUDA blocks that can run simultaneously on the GPU. +template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int minimum_input_size_bytes = (1 << 30); // At least one GB of data will be processed by FFTs. 
+ static constexpr unsigned int fft_size = 512; + static constexpr unsigned int kernel_repeats = 10; + static constexpr unsigned int warm_up_runs = 1; + static constexpr bool verbose = true; + + static constexpr bool use_suggested = true; // Whether to use suggested FPB and EPT values or custom. + static constexpr unsigned int custom_ffts_per_block = 2; + static constexpr unsigned int custom_elements_per_thread = 8; + + // To determine the total input length (number of fft batches to run), the maximum number of + // simultanously running cuFFTDx CUDA blocks is calculated. + + // Declaration of cuFFTDx run + using fft_incomplete = decltype(Block() + Size() + Type() + Precision() + SM()); + using fft_base = decltype(fft_incomplete() + Direction()); + using ifft_base = decltype(fft_incomplete() + Direction()); + + static constexpr unsigned int elements_per_thread = use_suggested ? fft_base::elements_per_thread : custom_elements_per_thread; + static constexpr unsigned int ffts_per_block = use_suggested ? 
fft_base::suggested_ffts_per_block : custom_ffts_per_block; + + using fft = decltype(fft_base() + ElementsPerThread() + FFTsPerBlock()); + using ifft = decltype(ifft_base() + ElementsPerThread() + FFTsPerBlock()); + using complex_type = typename fft::value_type; + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, fft::shared_memory_size)); + + // Get maximum number of running CUDA blocks per multiprocessor + int blocks_per_multiprocessor = 0; + CUDA_CHECK_AND_EXIT( + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_multiprocessor, + convolution_kernel, + fft::block_dim.x * fft::block_dim.y * fft::block_dim.z, + fft::shared_memory_size)); + + // Get maximum number of CUDA blocks running on all multiprocessors + const unsigned int device_blocks = blocks_per_multiprocessor * example::get_multiprocessor_count(); + + // Input size in bytes if device_blocks CUDA blocks were run. + const unsigned int data_size_device_blocks_bytes = device_blocks * ffts_per_block * fft_size * sizeof(complex_type); + + // cuda_blocks = minimal number of CUDA blocks to run, such that: + // - cuda_blocks is divisible by device_blocks, + // - total input size is not less than minimum_input_size_bytes. 
+ // executed_blocks_multiplyer = cuda_blocks / device_blocks + const unsigned int executed_blocks_multiplyer = + (minimum_input_size_bytes + data_size_device_blocks_bytes - 1) / data_size_device_blocks_bytes; + const unsigned int cuda_blocks = device_blocks * executed_blocks_multiplyer; + const unsigned int input_length = ffts_per_block * cuda_blocks * fft_size; + + cudaStream_t stream; + CUDA_CHECK_AND_EXIT(cudaStreamCreate(&stream)); + + // Host data + std::vector input = + example::get_random_complex_data(input_length, -10, 10); + + // Device data + complex_type* device_buffer; + auto input_size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, input_size_bytes)); + + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), input_size_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Measure performance + double time_cufftdx = measure_cufftdx(kernel_repeats, cuda_blocks, device_buffer, stream); + double time_cufft = measure_cufft( + kernel_repeats, cuda_blocks * ffts_per_block, (cufftComplex*)device_buffer, stream); + #ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK + double time_cufft_cb = measure_cufft_callback( + kernel_repeats, cuda_blocks * ffts_per_block, (cufftComplex*)device_buffer, stream); + #endif + + CUDA_CHECK_AND_EXIT(cudaStreamDestroy(stream)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + // Report results. 
+ auto report_time_and_performance = [&](std::string name, double time) -> void { + double gflops = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time / 1000000.0; + + std::cout << std::endl; + std::cout << name << std::endl; + std::cout << "Avg Time [ms_n]: " << time / kernel_repeats << std::endl; + std::cout << "Time (all) [ms_n]: " << time << std::endl; + std::cout << "Performance [GFLOPS]: " << gflops << std::endl; + }; + + if (verbose) { + std::cout << "FFT size: " << fft_size << std::endl; + std::cout << "FFTs run: " << ffts_per_block * cuda_blocks << std::endl; + report_time_and_performance("cuFFTDx", time_cufftdx); + std::cout << "FFTs elements per thread: " << fft::elements_per_thread << std::endl; + std::cout << "FFTs per block: " << ffts_per_block << std::endl; + std::cout << "CUDA blocks: " << cuda_blocks << std::endl; + std::cout << "Blocks per multiprocessor: " << blocks_per_multiprocessor << std::endl; + + report_time_and_performance("cuFFT", time_cufft); + #ifdef CUFFTDX_EXAMPLES_CUFFT_CALLBACK + report_time_and_performance("cuFFT Callback", time_cufft_cb); + #endif + } else { + double gflops_cufftdx = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_cufftdx / 1000000.0; + double gflops_cufft = 1.0 * kernel_repeats * ffts_per_block * cuda_blocks * 5.0 * fft_size * + (std::log(fft_size) / std::log(2)) / time_cufft / 1000000.0; + std::cout << fft_size << ": " << std::endl + << gflops_cufftdx << ", " << time_cufftdx / kernel_repeats << ", " << std::endl + << gflops_cufft << ", " << time_cufft / kernel_repeats; + } +} + +template +struct convolution_functor { + void operator()() { + return convolution(); + } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_r2c_c2r.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_r2c_c2r.cu new file mode 100644 index 0000000000000..3e0052b954975 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/convolution_r2c_c2r.cu @@ -0,0 +1,102 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFTR2C::max_threads_per_block) __global__ void convolution_kernel(cufftdx::precision_of_t* data) { + using complex_type = typename FFTR2C::value_type; + using scalar_type = typename complex_type::value_type; + + // Local array for thread + complex_type thread_data[FFTR2C::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFTR2C::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFTR2C().execute(thread_data, shared_mem); + + // Scale values + scalar_type scale = 1.0 / cufftdx::size_of::value; + for (unsigned int i = 0; i < FFTR2C::elements_per_thread; i++) { + thread_data[i].x *= scale; + thread_data[i].y *= scale; + } + + // Execute inverse FFT + FFTC2R().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, data, local_fft_id); +} + +// This example demonstrates how to use cuFFTDx to perform a convolution using one-dimensional FFTs. +// +// One block is run, it calculates two 128-point convolutions by first doing forward FFT, then +// applying pointwise operation, and ending with inverse FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. 
+template +void convolution() { + using namespace cufftdx; + + static constexpr unsigned int ffts_per_block = 2; + static constexpr unsigned int fft_size = 128; + // FFT_base defined common options for FFT and IFFT. FFT_base is not a complete FFT description. + // In order to complete FFT description directions are specified: forward for FFT, inverse for IFFT. + using FFT_base = decltype(Block() + Size() + Precision() + + ElementsPerThread<8>() + FFTsPerBlock() + SM()); + using FFTR2C = decltype(FFT_base() + Type()); + using FFTC2R = decltype(FFT_base() + Type()); + using real_type = precision_of_t; + + // Allocate managed memory for input/output + real_type* data; + auto size = ffts_per_block * fft_size; + auto size_bytes = size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = float(i); + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i] << std::endl; + } + + const auto shared_memory_size = std::max(FFTR2C::shared_memory_size, FFTC2R::shared_memory_size); + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + convolution_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size)); + + // Invokes convolution kernel with FFT::block_dim threads in CUDA block + convolution_kernel<<<1, FFTR2C::block_dim, shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < fft_size; i++) { + std::cout << data[i] << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct convolution_functor { + void operator()() { return convolution(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/fp16_common.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/fp16_common.hpp new file mode 100644 index 0000000000000..afa6eb28d2b20 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/fp16_common.hpp @@ -0,0 +1,71 @@ + +#ifndef CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ +#define CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ + +namespace example { + // Changes layout of complex<__half2> value from ((Real, Imag), (Real, Imag)) layout to + // ((Real, Real), (Imag, Imag)) layout. + __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_rrii( + cufftdx::complex<__half2> riri) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> rrii(__lows2half2(riri.x, riri.y), + __highs2half2(riri.x, riri.y)); +#else + cufftdx::complex<__half2> rrii(__half2 {riri.x.x, riri.y.x}, + __half2 {riri.x.y, riri.y.y}); +#endif + return rrii; + } + + // Converts two __half complex values to complex<__half2> in ((Real, Real), (Imag, Imag)) layout. + __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_rrii( + __half2 ri1, + __half2 ri2) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> rrii(__lows2half2(ri1, ri2), + __highs2half2(ri1, ri2)); +#else + cufftdx::complex<__half2> rrii(__half2 {ri1.x, ri2.x}, + __half2 {ri1.y, ri2.y}); +#endif + return rrii; + } + + // Changes layout of complex<__half2> value from ((Real, Real), (Imag, Imag)) layout to + // ((Real, Imag), (Real, Imag)) layout. 
+ __device__ __host__ __forceinline__ cufftdx::complex<__half2> to_riri( + cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + cufftdx::complex<__half2> riri(__lows2half2(rrii.x, rrii.y), + __highs2half2(rrii.x, rrii.y)); +#else + cufftdx::complex<__half2> riri(__half2 {rrii.x.x, rrii.y.x}, + __half2 {rrii.x.y, rrii.y.y}); +#endif + return riri; + } + + // Return the first half complex number (as __half2) from complex<__half2> value with + // ((Real, Real), (Imag, Imag)) layout. + // Example: for rrii equal to ((1,2), (3,4)), it returns __half2 (1, 3). + __device__ __host__ __forceinline__ __half2 to_ri1(cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + return __lows2half2(rrii.x, rrii.y); +#else + return __half2 {rrii.x.x, rrii.y.x}; +#endif + } + + // Return the second half complex number (as __half2) from complex<__half2> value with + // ((Real, Real), (Imag, Imag)) layout. + // Example: for rrii equal to ((1,2), (3,4)), it returns __half2 (2, 4). 
+ __device__ __host__ __forceinline__ __half2 to_ri2(cufftdx::complex<__half2> rrii) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ > 0) + return __highs2half2(rrii.x, rrii.y); +#else + return __half2 {rrii.x.y, rrii.y.y}; +#endif + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_FP16_COMMON_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/nvrtc_fft_thread.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/nvrtc_fft_thread.cu new file mode 100644 index 0000000000000..cda77c8f201b8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/nvrtc_fft_thread.cu @@ -0,0 +1,198 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "common.hpp" + +#define NVRTC_SAFE_CALL(x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " #x " failed with error " << nvrtcGetErrorString(result) << '\n'; \ + exit(1); \ + } \ + } while (0) + +const char* thread_fft_kernel = R"kernel( +#include + +using namespace cufftdx; + +// FFT +using size_desc = Size; +using dir_desc = Direction; +using type_c2c = Type; +using FFT = decltype(size_desc() + dir_desc() + type_c2c() + Thread() + Precision()); + +extern "C" __global__ void thread_fft_kernel(typename FFT::value_type *data) +{ + // Local array for thread + typename FFT::value_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. + // thread_data should have all input data in order. 
+ unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = data[index + i]; + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + data[index + i] = thread_data[i]; + } +} +)kernel"; + +int main(int, char**) { + // Define FFT + using namespace cufftdx; + + static constexpr unsigned int fft_size = 16; + + // FFT Operators + using size_desc = Size; + using dir_desc = Direction; + using type_c2c = Type; + using FFT = decltype(size_desc() + dir_desc() + type_c2c() + Thread() + Precision()); + using value_type = typename FFT::value_type; + + std::string fft_size_definition = "-DFFT_SIZE=" + std::to_string(fft_size); + // Parse cuFFTDx include dirs + std::vector cufftdx_include_dirs_array; + { + std::string cufftdx_include_dirs = CUFFTDX_INCLUDE_DIRS; + std::string delim = ";"; + auto start = 0U; + auto end = cufftdx_include_dirs.find(delim); + while (end != std::string::npos) { + cufftdx_include_dirs_array.push_back("--include-path=" + cufftdx_include_dirs.substr(start, end - start)); + start = end + delim.length(); + end = cufftdx_include_dirs.find(delim, start); + } + cufftdx_include_dirs_array.push_back("--include-path=" + cufftdx_include_dirs.substr(start, end - start)); + } + + // Get architecture of current device + int device; + CUDA_CHECK_AND_EXIT(cudaGetDevice(&device)); + int major = 0; + int minor = 0; + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device)); + CUDA_CHECK_AND_EXIT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device)); + std::string gpu_architecture_option = + "--gpu-architecture=compute_" + std::to_string(major * 10 + minor); + + // Create a program + nvrtcProgram program; + NVRTC_SAFE_CALL(nvrtcCreateProgram(&program, // program + thread_fft_kernel, // buffer + "thread_fft_kernel.cu", // name + 0, // numHeaders + NULL, // 
headers + NULL)); // includeNames + + // Prepare compilation options + std::vector opts = { + "--std=c++17", + "--device-as-default-execution-space", + "--include-path=" CUDA_INCLUDE_DIR // Add path to CUDA include directory + }; + // Include cuFFTDx dir in opts + for (auto& d : cufftdx_include_dirs_array) { + opts.push_back(d.c_str()); + } + // Add FFT_SIZE definition + opts.push_back(fft_size_definition.c_str()); + // Add gpu-architecture flag + opts.push_back(gpu_architecture_option.c_str()); + + nvrtcResult compileResult = nvrtcCompileProgram(program, // program + opts.size(), // numOptions + opts.data()); // options + + // Obtain compilation log from the program + if (compileResult != NVRTC_SUCCESS) { + size_t log_size; + NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(program, &log_size)); + char* log = new char[log_size]; + NVRTC_SAFE_CALL(nvrtcGetProgramLog(program, log)); + std::cout << log << '\n'; + delete[] log; + std::exit(1); + } + + // Obtain PTX from the program. + size_t ptx_size; + NVRTC_SAFE_CALL(nvrtcGetPTXSize(program, &ptx_size)); + char* ptx = new char[ptx_size]; + NVRTC_SAFE_CALL(nvrtcGetPTX(program, ptx)); + + // Destroy the program. 
+ NVRTC_SAFE_CALL(nvrtcDestroyProgram(&program)); + + // Load the generated PTX and get a handle to the thread_fft_kernel + CUcontext context; + CUmodule module; + CUfunction kernel; + CUDA_CHECK_AND_EXIT(cudaFree(0)); // Initialize CUDA context + CUDA_CHECK_AND_EXIT(cuCtxGetCurrent(&context)); // Get current context + CUDA_CHECK_AND_EXIT(cuModuleLoadDataEx(&module, ptx, 0, 0, 0)); + CUDA_CHECK_AND_EXIT(cuModuleGetFunction(&kernel, module, "thread_fft_kernel")); + + // Generate input for execution + std::vector host_input(cufftdx::size_of::value); + float i = 0.0f; + for (auto& v : host_input) { + v.x = i++; + v.y = 0; + } + + size_t fft_buffer_size = cufftdx::size_of::value * sizeof(value_type); + void* device_values; + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_values, fft_buffer_size)); + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_values, host_input.data(), fft_buffer_size, cudaMemcpyHostToDevice)); + + // Execute thread_fft_kernel + void* args[] = {&device_values}; + CUDA_CHECK_AND_EXIT(cuLaunchKernel(kernel, + 1, // number of blocks + 1, + 1, + 1, // number of threads + 1, + 1, + 0, // no shared memory + NULL, // NULL stream + args, + 0)); + CUDA_CHECK_AND_EXIT(cuCtxSynchronize()); + + // Retrieve and print output. + std::vector host_output(cufftdx::size_of::value); + CUDA_CHECK_AND_EXIT(cudaMemcpy(host_output.data(), device_values, fft_buffer_size, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < cufftdx::size_of::value; ++i) { + std::cout << i << ": (" << host_output[i].x << ", " << host_output[i].y << ")" << std::endl; + } + + // Release resources. 
+ CUDA_CHECK_AND_EXIT(cudaFree(device_values)); + CUDA_CHECK_AND_EXIT(cuModuleUnload(module)); + + double expected_value = (fft_size * (fft_size + 1)) / 2; + if ((host_output[0].x - expected_value) > 0.01) { + std::cout << "Failed" << std::endl; + return 1; + } + std::cout << "Success" << std::endl; + return 0; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/random.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/random.hpp new file mode 100644 index 0000000000000..c05f75c960697 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/random.hpp @@ -0,0 +1,51 @@ + +#ifndef CUFFTDX_EXAMPLE_RANDOM_HPP_ +#define CUFFTDX_EXAMPLE_RANDOM_HPP_ + +#include +#include +#include +#include + +#include +#include + +namespace example { + template + inline auto get_random_complex_data(size_t size, T min, T max) -> + typename std::enable_if::value, + std::vector>>::type { + using complex_type = cufftdx::make_complex_type_t; + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector output(size); + std::generate(output.begin(), output.end(), [&]() { + return complex_type {distribution(gen), distribution(gen)}; + }); + return output; + } + + template + inline auto get_random_complex_data(size_t size, T min, T max) -> + typename std::enable_if::value, + std::vector>>::type { + using complex_type = cufftdx::make_complex_type_t<__half2>; + std::random_device rd; + std::default_random_engine gen(rd()); + std::uniform_real_distribution distribution(min, max); + std::vector output(size); + std::generate(output.begin(), output.end(), [&]() { + auto xx = __float2half(distribution(gen)); + auto xy = __float2half(distribution(gen)); + auto yx = __float2half(distribution(gen)); + auto yy = __float2half(distribution(gen)); + auto x = __half2 {xx, xy}; + auto 
y = __half2 {yx, yy}; + return complex_type {x, y}; + }); + return output; + } +} // namespace example + +#endif // CUFFTDX_EXAMPLE_RANDOM_HPP_ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block.cu new file mode 100644 index 0000000000000..facaf08e09a24 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block.cu @@ -0,0 +1,86 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r.cu new file mode 100644 index 0000000000000..7d91ae8c20a9e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r.cu @@ -0,0 +1,94 @@ +#include +#include + 
+#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_c2r(typename FFT::value_type* input_data, cufftdx::precision_of_t* output_data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_c2r(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional complex-to-real transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2R float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// Notice different sizes of input and output buffer, and C2R load and store operations in the kernel. +template +void simple_block_fft_c2r() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + using real_type = typename complex_type::value_type; + + // Allocate managed memory for input/output + complex_type* input_data; + auto input_size = FFT::ffts_per_block * (cufftdx::size_of::value / 2 + 1); + auto input_size_bytes = input_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = complex_type {float(i), -float(i)}; + } + real_type* output_data; + auto output_size = FFT::ffts_per_block * cufftdx::size_of::value; + auto output_size_bytes = output_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << input_data[i].x << " " << input_data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_c2r, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_c2r<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << output_data[i] << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_c2r_functor { + void operator()() { return simple_block_fft_c2r(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r_fp16.cu new file mode 100644 index 0000000000000..3d4de92a31277 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_c2r_fp16.cu @@ -0,0 +1,105 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_c2r_fp16(ComplexType* input_data, ScalarType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_c2r(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_c2r(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional complex-to-real transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2R half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft_c2r_fp16() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; // complex<__half2> + using real_type = typename complex_type::value_type; // __half2 + + // Allocate managed memory for input/output + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + complex_type* input_data; + auto input_size = FFT::ffts_per_block / implicit_batching * (cufftdx::size_of::value / 2 + 1); + auto input_size_bytes = input_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + float v1 = i; + float v2 = i + input_size; + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + input_data[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + real_type* output_data; + auto output_size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto output_size_bytes = output_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << __half2float(input_data[i].x.x) << " " << __half2float(input_data[i].x.y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_c2r_fp16, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_c2r_fp16<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(output_data[i].x) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_c2r_fp16_functor { + void operator()() { return simple_block_fft_c2r_fp16(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_cub_io.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_cub_io.cu new file mode 100644 index 0000000000000..0ff802239b3ee --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_cub_io.cu @@ -0,0 +1,114 @@ +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +#if CUB_VERSION < 101300 +int main(int, char**) { + std::cout << "Example disabled, BLOCK_LOAD_STRIPED/BLOCK_STORE_STRIPED is only supported since CUB 1.13 (CUDA 11.5)" << std::endl; + return 0; +} +#else + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // FFT::stride == FFT::block_dim.x in most cases + using BlockLoad = cub::BlockLoad ; + using BlockStore = cub::BlockStore; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // ID of FFT in CUDA grid + unsigned int global_fft_id = + (FFT::ffts_per_block == 1) ? local_fft_id : ( blockIdx.x * FFT::ffts_per_block + local_fft_id); + + // Load data from global memory to registers + auto fft_data = data + (global_fft_id * cufftdx::size_of::value); + BlockLoad().Load(fft_data, thread_data, cufftdx::size_of::value, complex_type { 0.0, 0.0 }); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + BlockStore().Store(fft_data, thread_data, cufftdx::size_of::value); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. 
CUB +// library is used for IO in kernel. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. + // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + auto sum = data[0].x; + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + if(std::abs(sum - ((cufftdx::size_of::value-1) * cufftdx::size_of::value / 2)) > 0.1) { + std::cout << "Failed" << std::endl; + return; + } + std::cout << "Success" << std::endl; +} 
+ +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} +#endif // CUB_VERSION < 101300 diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_fp16.cu new file mode 100644 index 0000000000000..6f8e031242cb8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_fp16.cu @@ -0,0 +1,102 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "fp16_common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Here, we're using complex with ((Real, Imag), (Real, Imag)) layout as the type of the input/output +// data passed to kernel, and later on the device layout is changed into RRII when values are being loaded. 
+template +void simple_block_fft_complex_half2() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. + // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + auto size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + float v1 = i; + float v2 = i + size; + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + data[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x.x) << " " << __half2float(data[i].x.y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x.x) << " " << __half2float(data[i].x.y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_complex_half2_functor { + void operator()() { return simple_block_fft_complex_half2(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_half2.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_half2.cu new file mode 100644 index 0000000000000..f01b0d2ac6365 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_half2.cu @@ -0,0 +1,90 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" +#include "fp16_common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(__half2* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io_fp16::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io_fp16::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Here, we're using __half2 as the type of the input/output data passed to kernel, and later on +// the device we use special example::io_fp16 struct template to load values from two batches +// into an array of complex with ((Real, Real), (Imag, Imag)) layout. +template +void simple_block_fft_half2() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<8>() + FFTsPerBlock<4>() + SM()); + + // Allocate managed memory for input/output + __half2* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(__half2); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = __half2 {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x) << " " << __half2float(data[i].y) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(data[i].x) << " " << __half2float(data[i].y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_half2_functor { + void operator()() { return simple_block_fft_half2(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c.cu new file mode 100644 index 0000000000000..bfa795f47c451 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c.cu @@ -0,0 +1,93 @@ +#include 
+#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_r2c(ScalarType* input_data, ComplexType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_r2c(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional real-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point R2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// Notice different sizes of input and output buffer, and R2C load and store operations in the kernel. +template +void simple_block_fft_r2c() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + using real_type = typename complex_type::value_type; + + // Allocate managed memory for input/output + real_type* input_data; + auto input_size = FFT::ffts_per_block * cufftdx::size_of::value; + auto input_size_bytes = input_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = float(i); + } + complex_type* output_data; + auto output_size = FFT::ffts_per_block * (cufftdx::size_of::value / 2 + 1); + auto output_size_bytes = output_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << input_data[i] << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_r2c, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_r2c<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << output_data[i].x << " " << output_data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_r2c_functor { + void operator()() { return simple_block_fft_r2c(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c_fp16.cu new file mode 100644 index 0000000000000..17545c63d41a1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_r2c_fp16.cu @@ -0,0 +1,101 @@ +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ + void block_fft_kernel_r2c_fp16(ScalarType* input_data, ComplexType* output_data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load_r2c(input_data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store_r2c(thread_data, output_data, local_fft_id); +} + +// In this example a one-dimensional real-to-complex transform is performed by a CUDA block. +// +// One block is run, and it calculates four 128-point R2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft_r2c_fp16() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision<__half>() + ElementsPerThread<16>() + FFTsPerBlock<4>() + SM()); + using complex_type = typename FFT::value_type; // complex<__half2> + using real_type = typename complex_type::value_type; // __half2 + + // Allocate managed memory for input/output + real_type* input_data; + // For performance reasons half precision cuFFTDx FFTs have an implicit batching of 2 FFTs. This means that: + // * Used complex type is complex<__half2>, and real type is __half2. + // * Every thread processes values from two batches simultaneously using __half2 as the base type. + // * Number of FFTs per block must be a multiple of 2. + // * Complex data is processed in ((Real1, Real2), (Imag1, Imag2)) layout, where (Real1, Imag1) is a value from + // one batch, and (Real2, Imag2) is from a different batch. + // * Real data is processed using __half2 in (Real1, Real2) layout, where Real1 is a value from one batch, and + // Real2 is from a different batch. 
+ constexpr size_t implicit_batching = FFT::implicit_type_batching; + auto input_size = FFT::ffts_per_block / implicit_batching * cufftdx::size_of::value; + auto input_size_bytes = input_size * sizeof(real_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&input_data, input_size_bytes)); + for (size_t i = 0; i < input_size; i++) { + input_data[i] = __half2 {float(i), float(i + input_size)}; + } + complex_type* output_data; + auto output_size = FFT::ffts_per_block / implicit_batching * (cufftdx::size_of::value / 2 + 1); + auto output_size_bytes = output_size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&output_data, output_size_bytes)); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(input_data[i].x) << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel_r2c_fp16, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel_r2c_fp16<<<1, FFT::block_dim, FFT::shared_memory_size>>>(input_data, output_data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < (cufftdx::size_of::value / 2 + 1); i++) { + std::cout << __half2float(output_data[i].x.x) << " " << __half2float(output_data[i].x.y) << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(input_data)); + CUDA_CHECK_AND_EXIT(cudaFree(output_data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_r2c_fp16_functor { + void operator()() { return simple_block_fft_r2c_fp16(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_shared.cu 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_shared.cu new file mode 100644 index 0000000000000..00dda003cbb07 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_shared.cu @@ -0,0 +1,85 @@ +#include +#include +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + extern __shared__ unsigned char shared_mem[]; + + auto this_block_data = data + cufftdx::size_of::value * FFT::ffts_per_block * blockIdx.x; + + example::io::load_to_smem(this_block_data, shared_mem); + + FFT().execute(reinterpret_cast(shared_mem)); + + example::io::store_from_smem(shared_mem, this_block_data); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + using complex_type = typename FFT::value_type; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + // Shared memory must fit input data and must be big enough to run FFT + auto shared_memory_size = std::max((unsigned int)FFT::shared_memory_size, (unsigned int)size_bytes); + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].x << " " << data[i].y << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_std_complex.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_std_complex.cu new file mode 100644 index 
0000000000000..2b69581eec3cc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_block_std_complex.cu @@ -0,0 +1,100 @@ +#include +#include + +// Check if used version of libcu++ supports cuda::std::complex +#include +#if _LIBCUDACXX_CUDA_API_VERSION < 001004000 +int main(int, char**) { + std::cout << "Example disabled, cuda::std::complex is only supported since libcu++ 1.4.0 (CUDA 11.3)" << std::endl; + return 0; +} +#else + +#include + +#include +#include + +#include "block_io.hpp" +#include "common.hpp" + +template +__launch_bounds__(FFT::max_threads_per_block) __global__ void block_fft_kernel(ComplexType* data) { + using complex_type = ComplexType; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // ID of FFT in CUDA block, in range [0; FFT::ffts_per_block) + const unsigned int local_fft_id = threadIdx.y; + // Load data from global memory to registers + example::io::load(data, thread_data, local_fft_id); + + // Execute FFT + extern __shared__ complex_type shared_mem[]; + FFT().execute(thread_data, shared_mem); + + // Save results + example::io::store(thread_data, data, local_fft_id); +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA block. +// +// One block is run, it calculates two 128-point C2C float precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +template +void simple_block_fft() { + using namespace cufftdx; + + // FFT is defined, its: size, type, direction, precision. Block() operator informs that FFT + // will be executed on block level. Shared memory is required for co-operation between threads. 
+ // Additionally, + using FFT = decltype(Block() + Size<128>() + Type() + Direction() + + Precision() + ElementsPerThread<8>() + FFTsPerBlock<2>() + SM()); + // Use cuda::std::complex instead of FFT::value_type + using complex_type = cuda::std::complex::type>; + + // Allocate managed memory for input/output + complex_type* data; + auto size = FFT::ffts_per_block * cufftdx::size_of::value; + auto size_bytes = size * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMallocManaged(&data, size_bytes)); + for (size_t i = 0; i < size; i++) { + data[i] = complex_type {float(i), -float(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].real() << " " << data[i].imag() << std::endl; + } + + // Increase max shared memory if needed + CUDA_CHECK_AND_EXIT(cudaFuncSetAttribute( + block_fft_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + FFT::shared_memory_size)); + + // Invokes kernel with FFT::block_dim threads in CUDA block + block_fft_kernel<<<1, FFT::block_dim, FFT::shared_memory_size>>>(data); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << data[i].real() << " " << data[i].imag() << std::endl; + } + + CUDA_CHECK_AND_EXIT(cudaFree(data)); + std::cout << "Success" << std::endl; +} + +template +struct simple_block_fft_functor { + void operator()() { return simple_block_fft(); } +}; + +int main(int, char**) { + return example::sm_runner(); +} + +#endif // (_LIBCUDACXX_CUDA_API_VERSION < 001004000) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread.cu new file mode 100644 index 0000000000000..8ad7f5558b31d --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread.cu @@ -0,0 +1,81 @@ +#include +#include + +#include +#include + +#include "common.hpp" + +template +__global__ void thread_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. + // thread_data should have all input data in order. + unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + thread_data[i] = data[index + i]; + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + data[index + i] = thread_data[i]; + } +} + +// In this example a one-dimensional complex-to-complex transform is performed by a CUDA thread. +// +// Four (threads_count) threads are run, and each thread calculates 8-point (fft_size) C2C double precision FFT. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +int main(int, char**) { + using namespace cufftdx; + + // Number of threads to execute + static constexpr unsigned int threads_count = 4; + + // FFT is defined, its: size, type, direction, precision. Thread() operator informs that FFT will be executed on thread level. 
+ using FFT = decltype(Thread() + Size<8>() + Type() + Direction() + + Precision()); + using complex_type = typename FFT::value_type; + + // Host data + std::vector input(cufftdx::size_of::value * threads_count); + for (size_t i = 0; i < input.size(); i++) { + input[i] = complex_type {double(i), -double(i)}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << input[i].x << " " << input[i].y << std::endl; + } + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + + // Invokes kernel with 'threads_count' threads in block, each thread calculates one FFT of size 8 + thread_fft_kernel<<<1, threads_count>>>(device_buffer); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Copy device to host + std::vector output(input.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(output.data(), device_buffer, size_bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << output[i].x << " " << output[i].y << std::endl; + } + + std::cout << "Success" << std::endl; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread_fp16.cu b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread_fp16.cu new file mode 100644 index 0000000000000..bee19442fcbfa --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/example/simple_fft_thread_fp16.cu @@ -0,0 +1,93 @@ +#include +#include + +#include +#include + +#include "common.hpp" +#include "fp16_common.hpp" 
+ +template +__global__ void thread_fft_kernel(typename FFT::value_type* data) { + using complex_type = typename FFT::value_type; + + // Local array for thread + complex_type thread_data[FFT::storage_size]; + + // Load data from global memory to registers. + // thread_data should have all input data in order. + unsigned int index = threadIdx.x * FFT::elements_per_thread; + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + // complex values are processed with assumption that they are in RRII layout, + // but data has them in RIRI layout. example::to_rrii converts RIRI to RRII. + thread_data[i] = example::to_rrii(data[index + i]); + } + + // Execute FFT + FFT().execute(thread_data); + + // Save results + for (size_t i = 0; i < FFT::elements_per_thread; i++) { + // converting back from RRII to RIRI layout + data[index + i] = example::to_riri(thread_data[i]); + } +} + +// In this example a one-dimensional half-precision complex-to-complex transform is performed by each CUDA thread. +// +// Three (threads_count) threads are run, and each thread calculates two 8-point (fft_size) C2C half precision FFTs. +// Data is generated on host, copied to device buffer, and then results are copied back to host. +// +// Note: In half precision cuFFTDx uses complex type and processes values in implicit batches of two FFTs, i.e. +// each thread processes two FFTs. +int main(int, char**) { + using namespace cufftdx; + + // Number of threads to execute + // In case of half precision each thread calculates two FFTs + static constexpr unsigned int threads_count = 3; + + // FFT is defined, its: size, type, direction, precision. + // Thread() operator informs that FFT will be executed on a thread level. 
+ using FFT = decltype(Thread() + Size<8>() + Type() + Direction() + + Precision<__half>()); + using complex_type = typename FFT::value_type; + + // Host data: one FFT-sized chunk per thread (thread t accesses data[t * FFT::elements_per_thread + i]) + std::vector input(cufftdx::size_of::value * threads_count); + for (size_t i = 0; i < input.size(); i++) { + float v1 = i; + float v2 = i + input.size(); + // Populate input with complex values in ((Real, Imag), (Real, Imag)) layout + input[i] = complex_type {__half2 {v1, -v1}, __half2 {v2, -v2}}; + } + + std::cout << "input [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(input[i].x.x) << " " << __half2float(input[i].x.y) << std::endl; + } + + // Device data + complex_type* device_buffer; + auto size_bytes = input.size() * sizeof(complex_type); + CUDA_CHECK_AND_EXIT(cudaMalloc(&device_buffer, size_bytes)); + // Copy host to device + CUDA_CHECK_AND_EXIT(cudaMemcpy(device_buffer, input.data(), size_bytes, cudaMemcpyHostToDevice)); + + // Invokes kernel with 'threads_count' threads in block, each thread calculates two FFTs of size 8 + thread_fft_kernel<<<1, threads_count>>>(device_buffer); + CUDA_CHECK_AND_EXIT(cudaPeekAtLastError()); + CUDA_CHECK_AND_EXIT(cudaDeviceSynchronize()); + + // Copy device to host + std::vector output(input.size()); + CUDA_CHECK_AND_EXIT(cudaMemcpy(output.data(), device_buffer, size_bytes, cudaMemcpyDeviceToHost)); + CUDA_CHECK_AND_EXIT(cudaFree(device_buffer)); + + std::cout << "output [1st FFT]:\n"; + for (size_t i = 0; i < cufftdx::size_of::value; i++) { + std::cout << __half2float(output[i].x.x) << " " << __half2float(output[i].x.y) << std::endl; + } + + std::cout << "Success" << std::endl; +} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx.hpp new file mode 100644 index 0000000000000..65226c2996ecb --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx.hpp @@ -0,0 +1,27 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_HPP__ +#define CUFFTDX_HPP__ + +#include "cufftdx_version.hpp" + +#include "detail/config.hpp" +#include "detail/fft_description.hpp" +#include "detail/fft_execution.hpp" +#include "detail/system_checks.hpp" +#include "detail/workspace.hpp" +#include "operators.hpp" +#include "traits.hpp" +#include "types.hpp" + +/// \file +/// +/// cuFFTDx header + +#endif // CUFFTDX_HPP__ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx_version.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx_version.hpp new file mode 100644 index 0000000000000..b48620fc5b5c1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/cufftdx_version.hpp @@ -0,0 +1,30 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_VERSION_HPP__ +#define CUFFTDX_VERSION_HPP__ + +/// \def CUFFTDX_VERSION +/// \brief cuFFTDx library version +/// +/// +/// @note +/// CUFFTDX_VERSION / 1000 - major version
+/// CUFFTDX_VERSION / 100 % 10 - minor version
+/// CUFFTDX_VERSION % 100 - patch level
+#define CUFFTDX_VERSION 1000 + +#ifndef DOXYGEN_SHOULD_SKIP_THIS + +#define CUFFTDX_VERSION_MAJOR 1 +#define CUFFTDX_VERSION_MINOR 0 +#define CUFFTDX_VERSION_PATCH 0 + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +#endif // CUFFTDX_VERSION_HPP__ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/database.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/database.hpp new file mode 100644 index 0000000000000..b9fc759bc6577 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/database.hpp @@ -0,0 +1,55 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_DATABASE_DATABASE_HPP +#define CUFFTDX_DATABASE_DATABASE_HPP + +#include "detail/block_fft.hpp" + +namespace cufftdx { + namespace database { + namespace detail { + #include "lut_fp32.hpp.inc" + #include "lut_fp64.hpp.inc" + + + #include "records/700/database_fp16_fwd.hpp.inc" + #include "records/700/database_fp16_inv.hpp.inc" + #include "records/700/database_fp32_fwd.hpp.inc" + #include "records/700/database_fp32_inv.hpp.inc" + #include "records/700/database_fp64_fwd.hpp.inc" + #include "records/700/database_fp64_inv.hpp.inc" + + #include "records/800/database_fp16_fwd.hpp.inc" + #include "records/800/database_fp16_inv.hpp.inc" + #include "records/800/database_fp32_fwd.hpp.inc" + #include "records/800/database_fp32_inv.hpp.inc" + #include "records/800/database_fp64_fwd.hpp.inc" + #include "records/800/database_fp64_inv.hpp.inc" + +#ifndef __HALF2_TO_UI +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#endif + + #include "records/definitions_fp16_fwd.hpp.inc" + #include "records/definitions_fp16_inv.hpp.inc" + #include "records/definitions_fp32_fwd.hpp.inc" + #include "records/definitions_fp32_inv.hpp.inc" + #include "records/definitions_fp64_fwd.hpp.inc" + #include "records/definitions_fp64_inv.hpp.inc" + +#ifdef __HALF2_TO_UI +#undef __HALF2_TO_UI +#endif + + } // namespace detail + } // namespace database +} // namespace cufftdx + +#endif // CUFFTDX_DATABASE_DATABASE_HPP + diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/block_fft.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/block_fft.hpp new file mode 100644 index 0000000000000..3b9946d7a3c2c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/block_fft.hpp @@ -0,0 +1,145 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DATABASE_DETAIL_BLOCK_FFT_HPP +#define CUFFTDX_DATABASE_DETAIL_BLOCK_FFT_HPP + +#include "cuda_fp16.h" + +#include "../../operators.hpp" +#include "../../traits/detail/make_complex_type.hpp" +#include "type_list.hpp" +// Compile-time helpers for the block-FFT database: record lookup (block_fft_record), per-implementation constants (block_fft_implementation), selection by elements per thread (search_by_ept) and the declaration of cufftdx_private_function together with its wrapper. +namespace cufftdx { + namespace database { + namespace detail { + template + struct block_fft_record { + static constexpr bool defined = false; // default when no specialisation matches: no record is defined + }; + + // Forward SM72 records to SM70 records + template + struct block_fft_record: + public block_fft_record {}; + + // Forward SM75 records to SM70 records + template + struct block_fft_record: + public block_fft_record {}; + + // Forward SM86 records to SM70 records + template + struct block_fft_record: + public block_fft_record {}; + + // Forward R2C records to C2C records + template + struct block_fft_record: + public block_fft_record {}; + template + struct block_fft_record: + public block_fft_record {}; + + // Forward C2R records to C2C records + template + struct block_fft_record: + public block_fft_record {}; + template + struct block_fft_record: + public block_fft_record {}; + // Exposes the template arguments that describe one FFT implementation as named compile-time constants. + template + struct block_fft_implementation { + static constexpr unsigned int elements_per_thread = ElementsPerThread; + static constexpr unsigned int storage_size = StorageSize; + static constexpr unsigned int threads_per_fft = ThreadsPerFFT; + static constexpr unsigned int ffts_per_block = FFTsPerBlock; + static constexpr unsigned int shared_memory_size = SharedMemorySize; + static constexpr unsigned int function_id = FunctionId; + }; + // enforce_trp: compile-time predicate on an implementation's shared_memory_size. The primary template always matches; the two specialisations that follow match only when shared_memory_size equals sizeof(PrecisionType) * elements_per_thread * threads_per_fft, or twice that product. + template + struct enforce_trp { + static constexpr bool matches = true; + }; + + template + struct
enforce_trp { + static constexpr bool matches = + sizeof(PrecisionType) * Implementation::elements_per_thread * Implementation::threads_per_fft == + Implementation::shared_memory_size; + }; + + template + struct enforce_trp { + static constexpr bool matches = + sizeof(PrecisionType) * 2 * Implementation::elements_per_thread * Implementation::threads_per_fft == + Implementation::shared_memory_size; + }; + + // Selects block_fft_implementation from type_list based on ElementsPerThread, + // if there is no such implementation in list search_by_ept::type is set to void. A candidate qualifies only if its elements_per_thread equals ElementsPerThread and either threads_per_fft == 1 or enforce_trp::matches is true. + template + struct search_by_ept; + // This specialisation yields Implementation when it qualifies and void otherwise. + template + struct search_by_ept> { + using type = typename CUFFTDX_STD::conditional< + (Implementation::elements_per_thread == ElementsPerThread && + (Implementation::threads_per_fft == 1 || + enforce_trp::matches)), + Implementation, + void>::type; + }; + // This specialisation yields Head when it qualifies; otherwise it takes the type produced by a nested search_by_ept. + template + struct search_by_ept> { + using type = typename CUFFTDX_STD::conditional< + (Head::elements_per_thread == ElementsPerThread && + (Head::threads_per_fft == 1 || enforce_trp::matches)), + Head, + typename search_by_ept>::type>:: + type; + }; + // Declared here without a body; takes a pointer to cufftdx complex values (rmem) and an unsigned smem value. + template + __device__ void cufftdx_private_function(typename cufftdx::detail::make_complex_type::cufftdx_type* rmem, + unsigned smem); + + // Wrapper that accepts smem as a generic void*: passes it through __cvta_generic_to_shared, stores the result in an unsigned and forwards rmem plus that value to cufftdx_private_function. + template + __device__ void cufftdx_private_function_wrapper(typename cufftdx::detail::make_complex_type::cufftdx_type* rmem, + void* smem) { + unsigned smem32 = static_cast(__cvta_generic_to_shared(smem)); + cufftdx_private_function(rmem, smem32); + } + } // namespace detail + } // namespace database +} // namespace cufftdx + +#endif // CUFFTDX_DATABASE_DETAIL_BLOCK_FFT_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/type_list.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/type_list.hpp new file mode 100644 index 0000000000000..965d5d4ecdec9 --- /dev/null +++
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/detail/type_list.hpp @@ -0,0 +1,33 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DATABASE_DETAIL_TYPE_LIST_HPP +#define CUFFTDX_DATABASE_DETAIL_TYPE_LIST_HPP + +namespace cufftdx { + namespace database { + namespace detail { + template + struct type_list_element; + + template + struct type_list {}; + + template + struct type_list_element>: + type_list_element> {}; + + template + struct type_list_element<0, type_list> { + using type = Head; + }; + } // namespace detail + } // namespace database +} // namespace cufftdx + +#endif // CUFFTDX_DATABASE_DETAIL_TYPE_LIST_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..7cc4b4e24f5b5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_fwd.hpp.inc @@ -0,0 +1,10160 @@ +#ifndef CUFFTDX_FFT_10000_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_10000_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<941, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<4162>; +.reg .b64 rd<9>; +mov.u32 r4137, %tid.y; +shl.b32 r4138, r4137, 1; +mov.u32 r4139, %20; +mad.lo.s32 r4140, r4138, 
40000, r4139; +mov.u32 r4141, %tid.x; +mov.f32 f276, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1, {low, high}; +} +mov.f32 f282, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2, {low, high}; +} +mov.f32 f284, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3, {low, high}; +} +mov.f32 f286, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, %37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, 
r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ 
+mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 
r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, 
%35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f272, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r644, {low, high}; +} +mov.f32 f280, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r648, {low, high}; +} +mov.f32 f243, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ +fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; 
+} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ +sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r4141, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r4142, rd3; +mul.lo.s32 r4143, r4142, 1000; +sub.s32 r4144, r4141, r4143; +shr.u64 rd4, rd2, 37; +cvt.u32.u64 r4145, rd4; +and.b32 r4146, r4145, 134217726; +mad.lo.s32 r4147, r4146, 40000, r4140; +cvt.rn.f32.u32 f307, r4144; +mul.f32 f308, f307, 0f3A24B5BE; +cos.approx.f32 f61, f308; +sin.approx.f32 f309, f308; +neg.f32 f62, f309; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f244, 0f3F800000; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} +{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r995, {low, high}; +} +{ +mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ +mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 
r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r4148, r4144, 80, r4147; +st.shared.v2.f32 [r4148], {r723, r726}; +st.shared.v2.f32 [r4148+8], {r795, r802}; +st.shared.v2.f32 [r4148+16], {r832, r839}; +st.shared.v2.f32 [r4148+24], {r869, r876}; +st.shared.v2.f32 [r4148+32], {r906, r913}; +st.shared.v2.f32 [r4148+40], {r943, r950}; +st.shared.v2.f32 [r4148+48], {r980, r987}; +st.shared.v2.f32 [r4148+56], {r1017, r1024}; +st.shared.v2.f32 [r4148+64], {r1054, r1061}; +st.shared.v2.f32 [r4148+72], {r1091, r1098}; +barrier.sync 0; +mad.lo.s32 r4149, r4144, -72, r4148; +ld.shared.u32 r1131, [r4149]; +ld.shared.u32 r1143, [r4149+4]; +ld.shared.u32 r1451, [r4149+8000]; +ld.shared.u32 r1463, [r4149+8004]; +ld.shared.u32 r1128, [r4149+16000]; +ld.shared.u32 r1140, [r4149+16004]; +ld.shared.u32 r1448, [r4149+24000]; +ld.shared.u32 r1460, [r4149+24004]; +ld.shared.u32 r1134, [r4149+32000]; +ld.shared.u32 r1146, [r4149+32004]; +ld.shared.u32 r1454, [r4149+40000]; +ld.shared.u32 r1466, [r4149+40004]; +ld.shared.u32 r1135, [r4149+48000]; +ld.shared.u32 r1147, [r4149+48004]; +ld.shared.u32 r1455, [r4149+56000]; +ld.shared.u32 r1467, [r4149+56004]; +ld.shared.u32 r1129, [r4149+64000]; +ld.shared.u32 r1141, [r4149+64004]; +ld.shared.u32 r1449, [r4149+72000]; +ld.shared.u32 r1461, [r4149+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ 
+sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ +add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} +{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, 
r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ +add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, 
r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ +sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} +{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ +sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ 
+add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ +mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1761, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 r1841, r1136, r1456; +} +{ +add.f16x2 r1844, r1148, r1468; +} +{ +sub.f16x2 r1847, r1136, r1456; +} +{ +sub.f16x2 r1850, r1148, r1468; +} +{ +add.f16x2 r1853, r1184, r1783; +} +{ +add.f16x2 r1856, r1328, r1789; +} +{ +sub.f16x2 r1859, r1184, r1783; +} +{ +sub.f16x2 r1862, r1328, r1789; +} +{ +add.f16x2 r1865, r1256, r1799; +} +{ +add.f16x2 r1868, r1400, r1805; +} +{ +sub.f16x2 r1871, r1256, r1799; +} +{ +sub.f16x2 r1874, r1400, r1805; +} +{ +add.f16x2 r1877, r1292, r1815; +} +{ +add.f16x2 r1880, r1436, r1821; +} +{ +sub.f16x2 r1883, r1292, r1815; +} +{ +sub.f16x2 r1886, r1436, r1821; +} +{ +add.f16x2 r1889, r1220, r1831; +} +{ +add.f16x2 r1892, r1364, r1837; +} +{ +sub.f16x2 r1895, r1220, r1831; +} +{ +sub.f16x2 r1898, r1364, r1837; +} +mul.wide.u32 rd5, r4144, -858993459; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r4150, rd6; +cvt.rn.f32.u32 f310, r4150; +mul.f32 f311, f310, 0f3BCDE32E; +cos.approx.f32 f143, f311; +sin.approx.f32 f312, f311; +neg.f32 f144, f312; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1901, {low, high}; +} +mul.lo.s32 r4151, r4150, 10; 
+sub.s32 r4152, r4144, r4151; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1904, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1906, {high, high}; +} +{ +mul.f16x2 r1908, r1856, r1906; +} +{ +neg.f16x2 r1911, r1908; +} +{ +fma.rn.f16x2 r1913, r1853, r1904, r1911; +} +{ +mul.f16x2 r1917, r1853, r1906; +} +{ +fma.rn.f16x2 r1920, r1856, r1904, r1917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1924, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1926, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1928, {low, high}; +} +{ +mul.f16x2 r1929, r1926, r1928; +} +{ +mul.f16x2 r1932, r1901, r1924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1935, {high, low}; +} +{ +fma.rn.f16x2 r1937, r1929, r1935, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1941, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1943, {high, high}; +} +{ +mul.f16x2 r1945, r1868, r1943; +} +{ +neg.f16x2 r1948, r1945; +} +{ +fma.rn.f16x2 r1950, r1865, r1941, r1948; +} +{ +mul.f16x2 r1954, r1865, r1943; +} +{ +fma.rn.f16x2 r1957, r1868, r1941, r1954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1961, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1963, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1965, {low, high}; +} +{ +mul.f16x2 r1966, r1963, r1965; +} +{ +mul.f16x2 r1969, r1937, r1961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1972, {high, low}; +} +{ +fma.rn.f16x2 r1974, r1966, r1972, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1978, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1980, {high, high}; +} +{ +mul.f16x2 r1982, r1880, r1980; +} +{ +neg.f16x2 r1985, 
r1982; +} +{ +fma.rn.f16x2 r1987, r1877, r1978, r1985; +} +{ +mul.f16x2 r1991, r1877, r1980; +} +{ +fma.rn.f16x2 r1994, r1880, r1978, r1991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1998, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2000, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2002, {low, high}; +} +{ +mul.f16x2 r2003, r2000, r2002; +} +{ +mul.f16x2 r2006, r1974, r1998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r2009, {high, low}; +} +{ +fma.rn.f16x2 r2011, r2003, r2009, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2015, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2017, {high, high}; +} +{ +mul.f16x2 r2019, r1892, r2017; +} +{ +neg.f16x2 r2022, r2019; +} +{ +fma.rn.f16x2 r2024, r1889, r2015, r2022; +} +{ +mul.f16x2 r2028, r1889, r2017; +} +{ +fma.rn.f16x2 r2031, r1892, r2015, r2028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2035, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2037, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2039, {low, high}; +} +{ +mul.f16x2 r2040, r2037, r2039; +} +{ +mul.f16x2 r2043, r2011, r2035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2046, {high, low}; +} +{ +fma.rn.f16x2 r2048, r2040, r2046, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2052, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2054, {high, high}; +} +{ +mul.f16x2 r2056, r1850, r2054; +} +{ +neg.f16x2 r2059, r2056; +} +{ +fma.rn.f16x2 r2061, r1847, r2052, r2059; +} +{ +mul.f16x2 r2065, r1847, r2054; +} +{ +fma.rn.f16x2 r2068, r1850, r2052, r2065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2072, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1901; +mov.b32 r2074, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2076, {low, high}; +} +{ +mul.f16x2 r2077, r2074, r2076; +} +{ +mul.f16x2 r2080, r2048, r2072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2083, {high, low}; +} +{ +fma.rn.f16x2 r2085, r2077, r2083, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2089, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2091, {high, high}; +} +{ +mul.f16x2 r2093, r1862, r2091; +} +{ +neg.f16x2 r2096, r2093; +} +{ +fma.rn.f16x2 r2098, r1859, r2089, r2096; +} +{ +mul.f16x2 r2102, r1859, r2091; +} +{ +fma.rn.f16x2 r2105, r1862, r2089, r2102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2109, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2111, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2113, {low, high}; +} +{ +mul.f16x2 r2114, r2111, r2113; +} +{ +mul.f16x2 r2117, r2085, r2109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2120, {high, low}; +} +{ +fma.rn.f16x2 r2122, r2114, r2120, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2128, {high, high}; +} +{ +mul.f16x2 r2130, r1874, r2128; +} +{ +neg.f16x2 r2133, r2130; +} +{ +fma.rn.f16x2 r2135, r1871, r2126, r2133; +} +{ +mul.f16x2 r2139, r1871, r2128; +} +{ +fma.rn.f16x2 r2142, r1874, r2126, r2139; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2146, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2148, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2150, {low, high}; +} +{ +mul.f16x2 r2151, r2148, r2150; +} +{ +mul.f16x2 r2154, r2122, r2146; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2157, {high, low}; +} +{ +fma.rn.f16x2 r2159, r2151, r2157, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2163, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2165, {high, high}; +} +{ +mul.f16x2 r2167, r1886, r2165; +} +{ +neg.f16x2 r2170, r2167; +} +{ +fma.rn.f16x2 r2172, r1883, r2163, r2170; +} +{ +mul.f16x2 r2176, r1883, r2165; +} +{ +fma.rn.f16x2 r2179, r1886, r2163, r2176; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2183, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2185, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2187, {low, high}; +} +{ +mul.f16x2 r2188, r2185, r2187; +} +{ +mul.f16x2 r2191, r2159, r2183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2194, {high, low}; +} +{ +fma.rn.f16x2 r2196, r2188, r2194, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2202, {high, high}; +} +{ +mul.f16x2 r2204, r1898, r2202; +} +{ +neg.f16x2 r2207, r2204; +} +{ +fma.rn.f16x2 r2209, r1895, r2200, r2207; +} +{ +mul.f16x2 r2213, r1895, r2202; +} +{ +fma.rn.f16x2 r2216, r1898, r2200, r2213; +} +shl.b32 r4153, r4152, 3; +add.s32 r4154, r4147, r4153; +barrier.sync 0; +mad.lo.s32 r4155, r4150, 800, r4154; +st.shared.u32 [r4155], r1841; +st.shared.u32 [r4155+4], r1844; +st.shared.u32 [r4155+80], r1913; +st.shared.u32 [r4155+84], r1920; +st.shared.u32 [r4155+160], r1950; +st.shared.u32 [r4155+164], r1957; +st.shared.u32 [r4155+240], r1987; +st.shared.u32 [r4155+244], r1994; +st.shared.u32 [r4155+320], r2024; +st.shared.u32 [r4155+324], r2031; +st.shared.u32 [r4155+400], r2061; +st.shared.u32 [r4155+404], r2068; +st.shared.u32 [r4155+480], r2098; +st.shared.u32 [r4155+484], r2105; +st.shared.u32 [r4155+560], r2135; 
+st.shared.u32 [r4155+564], r2142; +st.shared.u32 [r4155+640], r2172; +st.shared.u32 [r4155+644], r2179; +st.shared.u32 [r4155+720], r2209; +st.shared.u32 [r4155+724], r2216; +barrier.sync 0; +ld.shared.u32 r2249, [r4149]; +ld.shared.u32 r2261, [r4149+4]; +ld.shared.u32 r2569, [r4149+8000]; +ld.shared.u32 r2581, [r4149+8004]; +ld.shared.u32 r2246, [r4149+16000]; +ld.shared.u32 r2258, [r4149+16004]; +ld.shared.u32 r2566, [r4149+24000]; +ld.shared.u32 r2578, [r4149+24004]; +ld.shared.u32 r2252, [r4149+32000]; +ld.shared.u32 r2264, [r4149+32004]; +ld.shared.u32 r2572, [r4149+40000]; +ld.shared.u32 r2584, [r4149+40004]; +ld.shared.u32 r2253, [r4149+48000]; +ld.shared.u32 r2265, [r4149+48004]; +ld.shared.u32 r2573, [r4149+56000]; +ld.shared.u32 r2585, [r4149+56004]; +ld.shared.u32 r2247, [r4149+64000]; +ld.shared.u32 r2259, [r4149+64004]; +ld.shared.u32 r2567, [r4149+72000]; +ld.shared.u32 r2579, [r4149+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +add.f16x2 r2245, r2246, r2247; +} +{ +add.f16x2 r2248, r2249, r2245; +} +{ +add.f16x2 r2251, r2252, r2253; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r2258, r2259; +} +{ +add.f16x2 r2260, r2261, r2257; +} +{ +add.f16x2 r2263, r2264, r2265; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, r2246, r2247; +} +{ +mul.f16x2 r2272, r2269, r2237; 
+} +{ +add.f16x2 r2275, r2249, r2272; +} +{ +add.f16x2 r2278, r2252, r2253; +} +{ +mul.f16x2 r2281, r2278, r2239; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +sub.f16x2 r2287, r2258, r2259; +} +{ +mul.f16x2 r2290, r2287, r2238; +} +{ +sub.f16x2 r2293, r2264, r2265; +} +{ +mul.f16x2 r2296, r2293, r2240; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r2284, r2299; +} +{ +add.f16x2 r2305, r2246, r2247; +} +{ +mul.f16x2 r2308, r2305, r2237; +} +{ +add.f16x2 r2311, r2249, r2308; +} +{ +add.f16x2 r2314, r2252, r2253; +} +{ +mul.f16x2 r2317, r2314, r2239; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r2258, r2259; +} +{ +mul.f16x2 r2326, r2323, r2238; +} +{ +sub.f16x2 r2329, r2264, r2265; +} +{ +mul.f16x2 r2332, r2329, r2240; +} +{ +add.f16x2 r2335, r2326, r2332; +} +{ +add.f16x2 r2338, r2320, r2335; +} +{ +add.f16x2 r2341, r2246, r2247; +} +{ +mul.f16x2 r2344, r2341, r2239; +} +{ +add.f16x2 r2347, r2249, r2344; +} +{ +add.f16x2 r2350, r2252, r2253; +} +{ +mul.f16x2 r2353, r2350, r2241; +} +{ +add.f16x2 r2356, r2347, r2353; +} +{ +sub.f16x2 r2359, r2258, r2259; +} +{ +mul.f16x2 r2362, r2359, r2240; +} +{ +sub.f16x2 r2365, r2264, r2265; +} +{ +mul.f16x2 r2368, r2365, r2243; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +sub.f16x2 r2374, r2356, r2371; +} +{ +add.f16x2 r2377, r2246, r2247; +} +{ +mul.f16x2 r2380, r2377, r2239; +} +{ +add.f16x2 r2383, r2249, r2380; +} +{ +add.f16x2 r2386, r2252, r2253; +} +{ +mul.f16x2 r2389, r2386, r2241; +} +{ +add.f16x2 r2392, r2383, r2389; +} +{ +sub.f16x2 r2395, r2258, r2259; +} +{ +mul.f16x2 r2398, r2395, r2240; +} +{ +sub.f16x2 r2401, r2264, r2265; +} +{ +mul.f16x2 r2404, r2401, r2243; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +add.f16x2 r2410, r2392, r2407; +} +{ +add.f16x2 r2413, r2258, r2259; +} +{ +mul.f16x2 r2416, r2413, r2237; +} +{ +add.f16x2 r2419, r2261, r2416; +} +{ +add.f16x2 r2422, r2264, r2265; +} +{ +mul.f16x2 r2425, r2422, r2239; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, 
r2246, r2247; +} +{ +mul.f16x2 r2434, r2431, r2238; +} +{ +sub.f16x2 r2437, r2252, r2253; +} +{ +mul.f16x2 r2440, r2437, r2240; +} +{ +add.f16x2 r2443, r2434, r2440; +} +{ +add.f16x2 r2446, r2428, r2443; +} +{ +add.f16x2 r2449, r2258, r2259; +} +{ +mul.f16x2 r2452, r2449, r2237; +} +{ +add.f16x2 r2455, r2261, r2452; +} +{ +add.f16x2 r2458, r2264, r2265; +} +{ +mul.f16x2 r2461, r2458, r2239; +} +{ +add.f16x2 r2464, r2455, r2461; +} +{ +sub.f16x2 r2467, r2246, r2247; +} +{ +mul.f16x2 r2470, r2467, r2238; +} +{ +sub.f16x2 r2473, r2252, r2253; +} +{ +mul.f16x2 r2476, r2473, r2240; +} +{ +add.f16x2 r2479, r2470, r2476; +} +{ +sub.f16x2 r2482, r2464, r2479; +} +{ +add.f16x2 r2485, r2258, r2259; +} +{ +mul.f16x2 r2488, r2485, r2239; +} +{ +add.f16x2 r2491, r2261, r2488; +} +{ +add.f16x2 r2494, r2264, r2265; +} +{ +mul.f16x2 r2497, r2494, r2241; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +sub.f16x2 r2503, r2246, r2247; +} +{ +mul.f16x2 r2506, r2503, r2240; +} +{ +sub.f16x2 r2509, r2252, r2253; +} +{ +mul.f16x2 r2512, r2509, r2243; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +add.f16x2 r2518, r2500, r2515; +} +{ +add.f16x2 r2521, r2258, r2259; +} +{ +mul.f16x2 r2524, r2521, r2239; +} +{ +add.f16x2 r2527, r2261, r2524; +} +{ +add.f16x2 r2530, r2264, r2265; +} +{ +mul.f16x2 r2533, r2530, r2241; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r2246, r2247; +} +{ +mul.f16x2 r2542, r2539, r2240; +} +{ +sub.f16x2 r2545, r2252, r2253; +} +{ +mul.f16x2 r2548, r2545, r2243; +} +{ +add.f16x2 r2551, r2542, r2548; +} +{ +sub.f16x2 r2554, r2536, r2551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2557, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2558, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 
r2560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2562, {low, high}; +} +{ +neg.f16x2 r2563, r2562; +} +{ +add.f16x2 r2565, r2566, r2567; +} +{ +add.f16x2 r2568, r2569, r2565; +} +{ +add.f16x2 r2571, r2572, r2573; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2566, r2567; +} +{ +mul.f16x2 r2592, r2589, r2557; +} +{ +add.f16x2 r2595, r2569, r2592; +} +{ +add.f16x2 r2598, r2572, r2573; +} +{ +mul.f16x2 r2601, r2598, r2559; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +sub.f16x2 r2607, r2578, r2579; +} +{ +mul.f16x2 r2610, r2607, r2558; +} +{ +sub.f16x2 r2613, r2584, r2585; +} +{ +mul.f16x2 r2616, r2613, r2560; +} +{ +add.f16x2 r2619, r2610, r2616; +} +{ +sub.f16x2 r2622, r2604, r2619; +} +{ +add.f16x2 r2625, r2566, r2567; +} +{ +mul.f16x2 r2628, r2625, r2557; +} +{ +add.f16x2 r2631, r2569, r2628; +} +{ +add.f16x2 r2634, r2572, r2573; +} +{ +mul.f16x2 r2637, r2634, r2559; +} +{ +add.f16x2 r2640, r2631, r2637; +} +{ +sub.f16x2 r2643, r2578, r2579; +} +{ +mul.f16x2 r2646, r2643, r2558; +} +{ +sub.f16x2 r2649, r2584, r2585; +} +{ +mul.f16x2 r2652, r2649, r2560; +} +{ +add.f16x2 r2655, r2646, r2652; +} +{ +add.f16x2 r2658, r2640, r2655; +} +{ +add.f16x2 r2661, r2566, r2567; +} +{ +mul.f16x2 r2664, r2661, r2559; +} +{ +add.f16x2 r2667, r2569, r2664; +} +{ +add.f16x2 r2670, r2572, r2573; +} +{ +mul.f16x2 r2673, r2670, r2561; +} +{ +add.f16x2 r2676, r2667, r2673; +} +{ +sub.f16x2 r2679, r2578, r2579; +} +{ +mul.f16x2 r2682, r2679, r2560; +} +{ +sub.f16x2 r2685, r2584, r2585; +} +{ +mul.f16x2 r2688, r2685, r2563; +} +{ +add.f16x2 r2691, r2682, r2688; +} +{ +sub.f16x2 r2694, r2676, r2691; +} +{ +add.f16x2 r2697, r2566, r2567; +} +{ 
+mul.f16x2 r2700, r2697, r2559; +} +{ +add.f16x2 r2703, r2569, r2700; +} +{ +add.f16x2 r2706, r2572, r2573; +} +{ +mul.f16x2 r2709, r2706, r2561; +} +{ +add.f16x2 r2712, r2703, r2709; +} +{ +sub.f16x2 r2715, r2578, r2579; +} +{ +mul.f16x2 r2718, r2715, r2560; +} +{ +sub.f16x2 r2721, r2584, r2585; +} +{ +mul.f16x2 r2724, r2721, r2563; +} +{ +add.f16x2 r2727, r2718, r2724; +} +{ +add.f16x2 r2730, r2712, r2727; +} +{ +add.f16x2 r2733, r2578, r2579; +} +{ +mul.f16x2 r2736, r2733, r2557; +} +{ +add.f16x2 r2739, r2581, r2736; +} +{ +add.f16x2 r2742, r2584, r2585; +} +{ +mul.f16x2 r2745, r2742, r2559; +} +{ +add.f16x2 r2748, r2739, r2745; +} +{ +sub.f16x2 r2751, r2566, r2567; +} +{ +mul.f16x2 r2754, r2751, r2558; +} +{ +sub.f16x2 r2757, r2572, r2573; +} +{ +mul.f16x2 r2760, r2757, r2560; +} +{ +add.f16x2 r2763, r2754, r2760; +} +{ +add.f16x2 r2766, r2748, r2763; +} +{ +add.f16x2 r2769, r2578, r2579; +} +{ +mul.f16x2 r2772, r2769, r2557; +} +{ +add.f16x2 r2775, r2581, r2772; +} +{ +add.f16x2 r2778, r2584, r2585; +} +{ +mul.f16x2 r2781, r2778, r2559; +} +{ +add.f16x2 r2784, r2775, r2781; +} +{ +sub.f16x2 r2787, r2566, r2567; +} +{ +mul.f16x2 r2790, r2787, r2558; +} +{ +sub.f16x2 r2793, r2572, r2573; +} +{ +mul.f16x2 r2796, r2793, r2560; +} +{ +add.f16x2 r2799, r2790, r2796; +} +{ +sub.f16x2 r2802, r2784, r2799; +} +{ +add.f16x2 r2805, r2578, r2579; +} +{ +mul.f16x2 r2808, r2805, r2559; +} +{ +add.f16x2 r2811, r2581, r2808; +} +{ +add.f16x2 r2814, r2584, r2585; +} +{ +mul.f16x2 r2817, r2814, r2561; +} +{ +add.f16x2 r2820, r2811, r2817; +} +{ +sub.f16x2 r2823, r2566, r2567; +} +{ +mul.f16x2 r2826, r2823, r2560; +} +{ +sub.f16x2 r2829, r2572, r2573; +} +{ +mul.f16x2 r2832, r2829, r2563; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2820, r2835; +} +{ +add.f16x2 r2841, r2578, r2579; +} +{ +mul.f16x2 r2844, r2841, r2559; +} +{ +add.f16x2 r2847, r2581, r2844; +} +{ +add.f16x2 r2850, r2584, r2585; +} +{ +mul.f16x2 r2853, r2850, r2561; +} +{ +add.f16x2 r2856, 
r2847, r2853; +} +{ +sub.f16x2 r2859, r2566, r2567; +} +{ +mul.f16x2 r2862, r2859, r2560; +} +{ +sub.f16x2 r2865, r2572, r2573; +} +{ +mul.f16x2 r2868, r2865, r2563; +} +{ +add.f16x2 r2871, r2862, r2868; +} +{ +sub.f16x2 r2874, r2856, r2871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r2877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2878, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2884, {low, high}; +} +{ +mul.f16x2 r2895, r2622, r2877; +} +{ +mul.f16x2 r2898, r2766, r2878; +} +{ +sub.f16x2 r2901, r2895, r2898; +} +{ +mul.f16x2 r2904, r2622, r2878; +} +{ +fma.rn.f16x2 r2907, r2766, r2877, r2904; +} +{ +mul.f16x2 r2911, r2694, r2879; +} +{ +mul.f16x2 r2914, r2838, r2880; +} +{ +sub.f16x2 r2917, r2911, r2914; +} +{ +mul.f16x2 r2920, r2694, r2880; +} +{ +fma.rn.f16x2 r2923, r2838, r2879, r2920; +} +{ +mul.f16x2 r2927, r2730, r2881; +} +{ +mul.f16x2 r2930, r2874, r2882; +} +{ +sub.f16x2 r2933, r2927, r2930; +} +{ +mul.f16x2 r2936, r2730, r2882; +} +{ +fma.rn.f16x2 r2939, r2874, r2881, r2936; +} +{ +mul.f16x2 r2943, r2658, r2883; +} +{ +mul.f16x2 r2946, r2802, r2884; +} +{ +sub.f16x2 r2949, r2943, r2946; +} +{ +mul.f16x2 r2952, r2658, r2884; +} +{ +fma.rn.f16x2 r2955, r2802, r2883, r2952; +} +{ +add.f16x2 r2959, r2254, r2574; +} +{ +add.f16x2 r2962, 
r2266, r2586; +} +{ +sub.f16x2 r2965, r2254, r2574; +} +{ +sub.f16x2 r2968, r2266, r2586; +} +{ +add.f16x2 r2971, r2302, r2901; +} +{ +add.f16x2 r2974, r2446, r2907; +} +{ +sub.f16x2 r2977, r2302, r2901; +} +{ +sub.f16x2 r2980, r2446, r2907; +} +{ +add.f16x2 r2983, r2374, r2917; +} +{ +add.f16x2 r2986, r2518, r2923; +} +{ +sub.f16x2 r2989, r2374, r2917; +} +{ +sub.f16x2 r2992, r2518, r2923; +} +{ +add.f16x2 r2995, r2410, r2933; +} +{ +add.f16x2 r2998, r2554, r2939; +} +{ +sub.f16x2 r3001, r2410, r2933; +} +{ +sub.f16x2 r3004, r2554, r2939; +} +{ +add.f16x2 r3007, r2338, r2949; +} +{ +add.f16x2 r3010, r2482, r2955; +} +{ +sub.f16x2 r3013, r2338, r2949; +} +{ +sub.f16x2 r3016, r2482, r2955; +} +mul.wide.u32 rd7, r4144, 1374389535; +shr.u64 rd8, rd7, 37; +cvt.u32.u64 r4156, rd8; +cvt.rn.f32.u32 f313, r4156; +mul.f32 f314, f313, 0f3D80ADFD; +cos.approx.f32 f225, f314; +sin.approx.f32 f315, f314; +neg.f32 f226, f315; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r3019, {low, high}; +} +mul.lo.s32 r4157, r4156, 100; +sub.s32 r4158, r4144, r4157; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2974, r3024; +} +{ +neg.f16x2 r3029, r3026; +} +{ +fma.rn.f16x2 r3031, r2971, r3022, r3029; +} +{ +mul.f16x2 r3035, r2971, r3024; +} +{ +fma.rn.f16x2 r3038, r2974, r3022, r3035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3019, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2986, r3061; +} +{ +neg.f16x2 r3066, r3063; +} +{ +fma.rn.f16x2 r3068, r2983, r3059, r3066; +} +{ +mul.f16x2 r3072, r2983, r3061; +} +{ +fma.rn.f16x2 r3075, r2986, r3059, r3072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2998, r3098; +} +{ +neg.f16x2 r3103, r3100; +} +{ +fma.rn.f16x2 r3105, r2995, r3096, r3103; +} +{ +mul.f16x2 r3109, r2995, r3098; +} +{ +fma.rn.f16x2 r3112, r2998, r3096, r3109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r3010, r3135; +} +{ +neg.f16x2 r3140, r3137; +} +{ +fma.rn.f16x2 r3142, r3007, r3133, r3140; 
+} +{ +mul.f16x2 r3146, r3007, r3135; +} +{ +fma.rn.f16x2 r3149, r3010, r3133, r3146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2968, r3172; +} +{ +neg.f16x2 r3177, r3174; +} +{ +fma.rn.f16x2 r3179, r2965, r3170, r3177; +} +{ +mul.f16x2 r3183, r2965, r3172; +} +{ +fma.rn.f16x2 r3186, r2968, r3170, r3183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2980, r3209; +} +{ +neg.f16x2 r3214, r3211; +} +{ +fma.rn.f16x2 r3216, r2977, r3207, r3214; +} +{ +mul.f16x2 r3220, r2977, r3209; +} +{ +fma.rn.f16x2 r3223, r2980, r3207, r3220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3229, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2992, r3246; +} +{ +neg.f16x2 r3251, r3248; +} +{ +fma.rn.f16x2 r3253, r2989, r3244, r3251; +} +{ +mul.f16x2 r3257, r2989, r3246; +} +{ +fma.rn.f16x2 r3260, r2992, r3244, r3257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r3004, r3283; +} +{ +neg.f16x2 r3288, r3285; +} +{ +fma.rn.f16x2 r3290, r3001, r3281, r3288; +} +{ +mul.f16x2 r3294, r3001, r3283; +} +{ +fma.rn.f16x2 r3297, r3004, r3281, r3294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, 
{high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r3016, r3320; +} +{ +neg.f16x2 r3325, r3322; +} +{ +fma.rn.f16x2 r3327, r3013, r3318, r3325; +} +{ +mul.f16x2 r3331, r3013, r3320; +} +{ +fma.rn.f16x2 r3334, r3016, r3318, r3331; +} +shl.b32 r4159, r4158, 3; +add.s32 r4160, r4147, r4159; +barrier.sync 0; +mad.lo.s32 r4161, r4156, 8000, r4160; +st.shared.u32 [r4161], r2959; +st.shared.u32 [r4161+4], r2962; +st.shared.u32 [r4161+800], r3031; +st.shared.u32 [r4161+804], r3038; +st.shared.u32 [r4161+1600], r3068; +st.shared.u32 [r4161+1604], r3075; +st.shared.u32 [r4161+2400], r3105; +st.shared.u32 [r4161+2404], r3112; +st.shared.u32 [r4161+3200], r3142; +st.shared.u32 [r4161+3204], r3149; +st.shared.u32 [r4161+4000], r3179; +st.shared.u32 [r4161+4004], r3186; +st.shared.u32 [r4161+4800], r3216; +st.shared.u32 [r4161+4804], r3223; +st.shared.u32 [r4161+5600], r3253; +st.shared.u32 [r4161+5604], r3260; +st.shared.u32 [r4161+6400], r3290; +st.shared.u32 [r4161+6404], r3297; +st.shared.u32 [r4161+7200], r3327; +st.shared.u32 [r4161+7204], r3334; +barrier.sync 0; +ld.shared.u32 r3367, [r4149]; +ld.shared.u32 r3379, [r4149+4]; +ld.shared.u32 r3687, [r4149+8000]; +ld.shared.u32 r3699, [r4149+8004]; +ld.shared.u32 r3364, [r4149+16000]; +ld.shared.u32 r3376, [r4149+16004]; +ld.shared.u32 r3684, [r4149+24000]; +ld.shared.u32 r3696, [r4149+24004]; +ld.shared.u32 r3370, [r4149+32000]; +ld.shared.u32 r3382, [r4149+32004]; +ld.shared.u32 r3690, [r4149+40000]; +ld.shared.u32 r3702, [r4149+40004]; +ld.shared.u32 r3371, [r4149+48000]; +ld.shared.u32 r3383, [r4149+48004]; +ld.shared.u32 r3691, [r4149+56000]; +ld.shared.u32 r3703, [r4149+56004]; +ld.shared.u32 r3365, [r4149+64000]; +ld.shared.u32 r3377, [r4149+64004]; +ld.shared.u32 r3685, [r4149+72000]; +ld.shared.u32 r3697, 
[r4149+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3360, {low, high}; +} +{ +neg.f16x2 r3361, r3360; +} +{ +add.f16x2 r3363, r3364, r3365; +} +{ +add.f16x2 r3366, r3367, r3363; +} +{ +add.f16x2 r3369, r3370, r3371; +} +{ +add.f16x2 r3372, r3366, r3369; +} +{ +add.f16x2 r3375, r3376, r3377; +} +{ +add.f16x2 r3378, r3379, r3375; +} +{ +add.f16x2 r3381, r3382, r3383; +} +{ +add.f16x2 r3384, r3378, r3381; +} +{ +add.f16x2 r3387, r3364, r3365; +} +{ +mul.f16x2 r3390, r3387, r3355; +} +{ +add.f16x2 r3393, r3367, r3390; +} +{ +add.f16x2 r3396, r3370, r3371; +} +{ +mul.f16x2 r3399, r3396, r3357; +} +{ +add.f16x2 r3402, r3393, r3399; +} +{ +sub.f16x2 r3405, r3376, r3377; +} +{ +mul.f16x2 r3408, r3405, r3356; +} +{ +sub.f16x2 r3411, r3382, r3383; +} +{ +mul.f16x2 r3414, r3411, r3358; +} +{ +add.f16x2 r3417, r3408, r3414; +} +{ +sub.f16x2 r3420, r3402, r3417; +} +{ +add.f16x2 r3423, r3364, r3365; +} +{ +mul.f16x2 r3426, r3423, r3355; +} +{ +add.f16x2 r3429, r3367, r3426; +} +{ +add.f16x2 r3432, r3370, r3371; +} +{ +mul.f16x2 r3435, r3432, r3357; +} +{ +add.f16x2 r3438, r3429, r3435; +} +{ +sub.f16x2 r3441, r3376, r3377; +} +{ +mul.f16x2 r3444, r3441, r3356; +} +{ +sub.f16x2 r3447, r3382, r3383; +} +{ +mul.f16x2 r3450, r3447, r3358; +} +{ +add.f16x2 r3453, r3444, r3450; +} +{ +add.f16x2 r3456, r3438, r3453; +} +{ +add.f16x2 r3459, r3364, r3365; +} +{ +mul.f16x2 r3462, r3459, 
r3357; +} +{ +add.f16x2 r3465, r3367, r3462; +} +{ +add.f16x2 r3468, r3370, r3371; +} +{ +mul.f16x2 r3471, r3468, r3359; +} +{ +add.f16x2 r3474, r3465, r3471; +} +{ +sub.f16x2 r3477, r3376, r3377; +} +{ +mul.f16x2 r3480, r3477, r3358; +} +{ +sub.f16x2 r3483, r3382, r3383; +} +{ +mul.f16x2 r3486, r3483, r3361; +} +{ +add.f16x2 r3489, r3480, r3486; +} +{ +sub.f16x2 r3492, r3474, r3489; +} +{ +add.f16x2 r3495, r3364, r3365; +} +{ +mul.f16x2 r3498, r3495, r3357; +} +{ +add.f16x2 r3501, r3367, r3498; +} +{ +add.f16x2 r3504, r3370, r3371; +} +{ +mul.f16x2 r3507, r3504, r3359; +} +{ +add.f16x2 r3510, r3501, r3507; +} +{ +sub.f16x2 r3513, r3376, r3377; +} +{ +mul.f16x2 r3516, r3513, r3358; +} +{ +sub.f16x2 r3519, r3382, r3383; +} +{ +mul.f16x2 r3522, r3519, r3361; +} +{ +add.f16x2 r3525, r3516, r3522; +} +{ +add.f16x2 r3528, r3510, r3525; +} +{ +add.f16x2 r3531, r3376, r3377; +} +{ +mul.f16x2 r3534, r3531, r3355; +} +{ +add.f16x2 r3537, r3379, r3534; +} +{ +add.f16x2 r3540, r3382, r3383; +} +{ +mul.f16x2 r3543, r3540, r3357; +} +{ +add.f16x2 r3546, r3537, r3543; +} +{ +sub.f16x2 r3549, r3364, r3365; +} +{ +mul.f16x2 r3552, r3549, r3356; +} +{ +sub.f16x2 r3555, r3370, r3371; +} +{ +mul.f16x2 r3558, r3555, r3358; +} +{ +add.f16x2 r3561, r3552, r3558; +} +{ +add.f16x2 r3564, r3546, r3561; +} +{ +add.f16x2 r3567, r3376, r3377; +} +{ +mul.f16x2 r3570, r3567, r3355; +} +{ +add.f16x2 r3573, r3379, r3570; +} +{ +add.f16x2 r3576, r3382, r3383; +} +{ +mul.f16x2 r3579, r3576, r3357; +} +{ +add.f16x2 r3582, r3573, r3579; +} +{ +sub.f16x2 r3585, r3364, r3365; +} +{ +mul.f16x2 r3588, r3585, r3356; +} +{ +sub.f16x2 r3591, r3370, r3371; +} +{ +mul.f16x2 r3594, r3591, r3358; +} +{ +add.f16x2 r3597, r3588, r3594; +} +{ +sub.f16x2 r3600, r3582, r3597; +} +{ +add.f16x2 r3603, r3376, r3377; +} +{ +mul.f16x2 r3606, r3603, r3357; +} +{ +add.f16x2 r3609, r3379, r3606; +} +{ +add.f16x2 r3612, r3382, r3383; +} +{ +mul.f16x2 r3615, r3612, r3359; +} +{ +add.f16x2 r3618, r3609, r3615; +} +{ +sub.f16x2 
r3621, r3364, r3365; +} +{ +mul.f16x2 r3624, r3621, r3358; +} +{ +sub.f16x2 r3627, r3370, r3371; +} +{ +mul.f16x2 r3630, r3627, r3361; +} +{ +add.f16x2 r3633, r3624, r3630; +} +{ +add.f16x2 r3636, r3618, r3633; +} +{ +add.f16x2 r3639, r3376, r3377; +} +{ +mul.f16x2 r3642, r3639, r3357; +} +{ +add.f16x2 r3645, r3379, r3642; +} +{ +add.f16x2 r3648, r3382, r3383; +} +{ +mul.f16x2 r3651, r3648, r3359; +} +{ +add.f16x2 r3654, r3645, r3651; +} +{ +sub.f16x2 r3657, r3364, r3365; +} +{ +mul.f16x2 r3660, r3657, r3358; +} +{ +sub.f16x2 r3663, r3370, r3371; +} +{ +mul.f16x2 r3666, r3663, r3361; +} +{ +add.f16x2 r3669, r3660, r3666; +} +{ +sub.f16x2 r3672, r3654, r3669; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3675, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3680, {low, high}; +} +{ +neg.f16x2 r3681, r3680; +} +{ +add.f16x2 r3683, r3684, r3685; +} +{ +add.f16x2 r3686, r3687, r3683; +} +{ +add.f16x2 r3689, r3690, r3691; +} +{ +add.f16x2 r3692, r3686, r3689; +} +{ +add.f16x2 r3695, r3696, r3697; +} +{ +add.f16x2 r3698, r3699, r3695; +} +{ +add.f16x2 r3701, r3702, r3703; +} +{ +add.f16x2 r3704, r3698, r3701; +} +{ +add.f16x2 r3707, r3684, r3685; +} +{ +mul.f16x2 r3710, r3707, r3675; +} +{ +add.f16x2 r3713, r3687, r3710; +} +{ +add.f16x2 r3716, r3690, r3691; +} +{ +mul.f16x2 r3719, r3716, r3677; +} +{ +add.f16x2 r3722, r3713, r3719; +} +{ +sub.f16x2 r3725, r3696, r3697; +} +{ +mul.f16x2 r3728, r3725, r3676; +} +{ +sub.f16x2 
r3731, r3702, r3703; +} +{ +mul.f16x2 r3734, r3731, r3678; +} +{ +add.f16x2 r3737, r3728, r3734; +} +{ +sub.f16x2 r3740, r3722, r3737; +} +{ +add.f16x2 r3743, r3684, r3685; +} +{ +mul.f16x2 r3746, r3743, r3675; +} +{ +add.f16x2 r3749, r3687, r3746; +} +{ +add.f16x2 r3752, r3690, r3691; +} +{ +mul.f16x2 r3755, r3752, r3677; +} +{ +add.f16x2 r3758, r3749, r3755; +} +{ +sub.f16x2 r3761, r3696, r3697; +} +{ +mul.f16x2 r3764, r3761, r3676; +} +{ +sub.f16x2 r3767, r3702, r3703; +} +{ +mul.f16x2 r3770, r3767, r3678; +} +{ +add.f16x2 r3773, r3764, r3770; +} +{ +add.f16x2 r3776, r3758, r3773; +} +{ +add.f16x2 r3779, r3684, r3685; +} +{ +mul.f16x2 r3782, r3779, r3677; +} +{ +add.f16x2 r3785, r3687, r3782; +} +{ +add.f16x2 r3788, r3690, r3691; +} +{ +mul.f16x2 r3791, r3788, r3679; +} +{ +add.f16x2 r3794, r3785, r3791; +} +{ +sub.f16x2 r3797, r3696, r3697; +} +{ +mul.f16x2 r3800, r3797, r3678; +} +{ +sub.f16x2 r3803, r3702, r3703; +} +{ +mul.f16x2 r3806, r3803, r3681; +} +{ +add.f16x2 r3809, r3800, r3806; +} +{ +sub.f16x2 r3812, r3794, r3809; +} +{ +add.f16x2 r3815, r3684, r3685; +} +{ +mul.f16x2 r3818, r3815, r3677; +} +{ +add.f16x2 r3821, r3687, r3818; +} +{ +add.f16x2 r3824, r3690, r3691; +} +{ +mul.f16x2 r3827, r3824, r3679; +} +{ +add.f16x2 r3830, r3821, r3827; +} +{ +sub.f16x2 r3833, r3696, r3697; +} +{ +mul.f16x2 r3836, r3833, r3678; +} +{ +sub.f16x2 r3839, r3702, r3703; +} +{ +mul.f16x2 r3842, r3839, r3681; +} +{ +add.f16x2 r3845, r3836, r3842; +} +{ +add.f16x2 r3848, r3830, r3845; +} +{ +add.f16x2 r3851, r3696, r3697; +} +{ +mul.f16x2 r3854, r3851, r3675; +} +{ +add.f16x2 r3857, r3699, r3854; +} +{ +add.f16x2 r3860, r3702, r3703; +} +{ +mul.f16x2 r3863, r3860, r3677; +} +{ +add.f16x2 r3866, r3857, r3863; +} +{ +sub.f16x2 r3869, r3684, r3685; +} +{ +mul.f16x2 r3872, r3869, r3676; +} +{ +sub.f16x2 r3875, r3690, r3691; +} +{ +mul.f16x2 r3878, r3875, r3678; +} +{ +add.f16x2 r3881, r3872, r3878; +} +{ +add.f16x2 r3884, r3866, r3881; +} +{ +add.f16x2 r3887, r3696, r3697; +} 
+{ +mul.f16x2 r3890, r3887, r3675; +} +{ +add.f16x2 r3893, r3699, r3890; +} +{ +add.f16x2 r3896, r3702, r3703; +} +{ +mul.f16x2 r3899, r3896, r3677; +} +{ +add.f16x2 r3902, r3893, r3899; +} +{ +sub.f16x2 r3905, r3684, r3685; +} +{ +mul.f16x2 r3908, r3905, r3676; +} +{ +sub.f16x2 r3911, r3690, r3691; +} +{ +mul.f16x2 r3914, r3911, r3678; +} +{ +add.f16x2 r3917, r3908, r3914; +} +{ +sub.f16x2 r3920, r3902, r3917; +} +{ +add.f16x2 r3923, r3696, r3697; +} +{ +mul.f16x2 r3926, r3923, r3677; +} +{ +add.f16x2 r3929, r3699, r3926; +} +{ +add.f16x2 r3932, r3702, r3703; +} +{ +mul.f16x2 r3935, r3932, r3679; +} +{ +add.f16x2 r3938, r3929, r3935; +} +{ +sub.f16x2 r3941, r3684, r3685; +} +{ +mul.f16x2 r3944, r3941, r3678; +} +{ +sub.f16x2 r3947, r3690, r3691; +} +{ +mul.f16x2 r3950, r3947, r3681; +} +{ +add.f16x2 r3953, r3944, r3950; +} +{ +add.f16x2 r3956, r3938, r3953; +} +{ +add.f16x2 r3959, r3696, r3697; +} +{ +mul.f16x2 r3962, r3959, r3677; +} +{ +add.f16x2 r3965, r3699, r3962; +} +{ +add.f16x2 r3968, r3702, r3703; +} +{ +mul.f16x2 r3971, r3968, r3679; +} +{ +add.f16x2 r3974, r3965, r3971; +} +{ +sub.f16x2 r3977, r3684, r3685; +} +{ +mul.f16x2 r3980, r3977, r3678; +} +{ +sub.f16x2 r3983, r3690, r3691; +} +{ +mul.f16x2 r3986, r3983, r3681; +} +{ +add.f16x2 r3989, r3980, r3986; +} +{ +sub.f16x2 r3992, r3974, r3989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r3995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r3999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4000, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r4001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4002, {low, high}; +} +{ +mul.f16x2 r4013, r3740, r3995; +} +{ +mul.f16x2 r4016, r3884, r3996; +} +{ +sub.f16x2 r4019, r4013, r4016; +} +{ +mul.f16x2 r4022, r3740, r3996; +} +{ +fma.rn.f16x2 r4025, r3884, r3995, r4022; +} +{ +mul.f16x2 r4029, r3812, r3997; +} +{ +mul.f16x2 r4032, r3956, r3998; +} +{ +sub.f16x2 r4035, r4029, r4032; +} +{ +mul.f16x2 r4038, r3812, r3998; +} +{ +fma.rn.f16x2 r4041, r3956, r3997, r4038; +} +{ +mul.f16x2 r4045, r3848, r3999; +} +{ +mul.f16x2 r4048, r3992, r4000; +} +{ +sub.f16x2 r4051, r4045, r4048; +} +{ +mul.f16x2 r4054, r3848, r4000; +} +{ +fma.rn.f16x2 r4057, r3992, r3999, r4054; +} +{ +mul.f16x2 r4061, r3776, r4001; +} +{ +mul.f16x2 r4064, r3920, r4002; +} +{ +sub.f16x2 r4067, r4061, r4064; +} +{ +mul.f16x2 r4070, r3776, r4002; +} +{ +fma.rn.f16x2 r4073, r3920, r4001, r4070; +} +{ +add.f16x2 %0, r3372, r3692; +} +{ +add.f16x2 %1, r3384, r3704; +} +{ +sub.f16x2 %10, r3372, r3692; +} +{ +sub.f16x2 %11, r3384, r3704; +} +{ +add.f16x2 %2, r3420, r4019; +} +{ +add.f16x2 %3, r3564, r4025; +} +{ +sub.f16x2 %12, r3420, r4019; +} +{ +sub.f16x2 %13, r3564, r4025; +} +{ +add.f16x2 %4, r3492, r4035; +} +{ +add.f16x2 %5, r3636, r4041; +} +{ +sub.f16x2 %14, r3492, r4035; +} +{ +sub.f16x2 %15, r3636, r4041; +} +{ +add.f16x2 %6, r3528, r4051; +} +{ +add.f16x2 %7, r3672, r4057; +} +{ +sub.f16x2 %16, r3528, r4051; +} +{ +sub.f16x2 %17, r3672, r4057; +} +{ +add.f16x2 %8, r3456, r4067; +} +{ +add.f16x2 %9, r3600, r4073; +} +{ +sub.f16x2 %18, r3456, r4067; +} +{ +sub.f16x2 %19, r3600, r4073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), 
"=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<942, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<4159>; +.reg .b64 rd<8>; +mov.u32 r4137, %tid.y; +mov.u32 r4138, %20; +mad.lo.s32 r4139, r4137, 40000, r4138; +mov.u32 r4140, %tid.x; +mov.f32 f276, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1, {low, high}; +} +mov.f32 f282, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2, {low, high}; +} +mov.f32 f284, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3, {low, high}; +} +mov.f32 f286, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; 
+cvt.rn.f16.f32 high, f276; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, %37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, 
r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, 
r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, %35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f272, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 
high, f272; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r644, {low, high}; +} +mov.f32 f280, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r648, {low, high}; +} +mov.f32 f243, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ +fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; +} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ 
+sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r4140, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r4141, rd3; +mul.lo.s32 r4142, r4141, 1000; +sub.s32 r4143, r4140, r4142; +mad.lo.s32 r4144, r4141, 40000, r4139; +cvt.rn.f32.u32 f307, r4143; +mul.f32 f308, f307, 0f3A24B5BE; +cos.approx.f32 f61, f308; +sin.approx.f32 f309, f308; +neg.f32 f62, f309; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f244, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} +{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r995, {low, high}; +} +{ +mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, 
r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ +mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r4145, r4143, 40, r4144; +st.shared.v2.f32 [r4145], {r723, r795}; +st.shared.v2.f32 [r4145+8], {r832, 
r869}; +st.shared.v2.f32 [r4145+16], {r906, r943}; +st.shared.v2.f32 [r4145+24], {r980, r1017}; +st.shared.v2.f32 [r4145+32], {r1054, r1091}; +barrier.sync 0; +mad.lo.s32 r4146, r4143, -36, r4145; +ld.shared.u32 r1131, [r4146]; +ld.shared.u32 r1451, [r4146+4000]; +ld.shared.u32 r1128, [r4146+8000]; +ld.shared.u32 r1448, [r4146+12000]; +ld.shared.u32 r1134, [r4146+16000]; +ld.shared.u32 r1454, [r4146+20000]; +ld.shared.u32 r1135, [r4146+24000]; +ld.shared.u32 r1455, [r4146+28000]; +ld.shared.u32 r1129, [r4146+32000]; +ld.shared.u32 r1449, [r4146+36000]; +barrier.sync 0; +st.shared.v2.f32 [r4145], {r726, r802}; +st.shared.v2.f32 [r4145+8], {r839, r876}; +st.shared.v2.f32 [r4145+16], {r913, r950}; +st.shared.v2.f32 [r4145+24], {r987, r1024}; +st.shared.v2.f32 [r4145+32], {r1061, r1098}; +barrier.sync 0; +ld.shared.u32 r1143, [r4146]; +ld.shared.u32 r1463, [r4146+4000]; +ld.shared.u32 r1140, [r4146+8000]; +ld.shared.u32 r1460, [r4146+12000]; +ld.shared.u32 r1146, [r4146+16000]; +ld.shared.u32 r1466, [r4146+20000]; +ld.shared.u32 r1147, [r4146+24000]; +ld.shared.u32 r1467, [r4146+28000]; +ld.shared.u32 r1141, [r4146+32000]; +ld.shared.u32 r1461, [r4146+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ 
+add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ +add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, 
r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} +{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ +add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ 
+sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} +{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ +sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ +add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ +mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, 
r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1761, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, 
r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 r1841, r1136, r1456; +} +{ +add.f16x2 r1844, r1148, r1468; +} +{ +sub.f16x2 r1847, r1136, r1456; +} +{ +sub.f16x2 r1850, r1148, r1468; +} +{ +add.f16x2 r1853, r1184, r1783; +} +{ +add.f16x2 r1856, r1328, r1789; +} +{ +sub.f16x2 r1859, r1184, r1783; +} +{ +sub.f16x2 r1862, r1328, r1789; +} +{ +add.f16x2 r1865, r1256, r1799; +} +{ +add.f16x2 r1868, r1400, r1805; +} +{ +sub.f16x2 r1871, r1256, r1799; +} +{ +sub.f16x2 r1874, r1400, r1805; +} +{ +add.f16x2 r1877, r1292, r1815; +} +{ +add.f16x2 r1880, r1436, r1821; +} +{ +sub.f16x2 r1883, r1292, r1815; +} +{ +sub.f16x2 r1886, r1436, r1821; +} +{ +add.f16x2 r1889, r1220, r1831; +} +{ +add.f16x2 r1892, r1364, r1837; +} +{ +sub.f16x2 r1895, r1220, r1831; +} +{ +sub.f16x2 r1898, r1364, r1837; +} +mul.wide.u32 rd4, r4143, -858993459; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r4147, rd5; +mul.lo.s32 r4148, r4147, 10; +sub.s32 r4149, r4143, r4148; +shl.b32 r4150, r4149, 2; +add.s32 r4151, r4144, r4150; +cvt.rn.f32.u32 f310, r4147; +mul.f32 f311, f310, 0f3BCDE32E; +cos.approx.f32 f143, f311; +sin.approx.f32 f312, f311; +neg.f32 f144, f312; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1901, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1904, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1906, {high, high}; +} +{ +mul.f16x2 r1908, r1856, r1906; +} +{ +neg.f16x2 r1911, r1908; +} +{ +fma.rn.f16x2 r1913, r1853, r1904, r1911; +} +{ +mul.f16x2 r1917, r1853, r1906; +} +{ +fma.rn.f16x2 r1920, r1856, r1904, r1917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1924, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1926, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1928, {low, high}; +} +{ +mul.f16x2 r1929, r1926, r1928; +} +{ +mul.f16x2 r1932, r1901, r1924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1935, {high, low}; +} +{ +fma.rn.f16x2 r1937, r1929, r1935, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1941, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1943, {high, high}; +} +{ +mul.f16x2 r1945, r1868, r1943; +} +{ +neg.f16x2 r1948, r1945; +} +{ +fma.rn.f16x2 r1950, r1865, r1941, r1948; +} +{ +mul.f16x2 r1954, r1865, r1943; +} +{ +fma.rn.f16x2 r1957, r1868, r1941, r1954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1961, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1963, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1965, {low, high}; +} +{ +mul.f16x2 r1966, r1963, r1965; +} +{ +mul.f16x2 r1969, r1937, r1961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1972, {high, low}; +} +{ +fma.rn.f16x2 r1974, r1966, r1972, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1978, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1980, {high, high}; +} +{ +mul.f16x2 r1982, r1880, r1980; +} +{ +neg.f16x2 r1985, r1982; +} +{ +fma.rn.f16x2 r1987, r1877, r1978, r1985; +} +{ +mul.f16x2 r1991, r1877, r1980; +} +{ +fma.rn.f16x2 r1994, r1880, r1978, r1991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1998, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2000, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2002, {low, high}; +} +{ +mul.f16x2 r2003, r2000, r2002; +} +{ +mul.f16x2 r2006, r1974, r1998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r2009, {high, low}; +} +{ 
+fma.rn.f16x2 r2011, r2003, r2009, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2015, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2017, {high, high}; +} +{ +mul.f16x2 r2019, r1892, r2017; +} +{ +neg.f16x2 r2022, r2019; +} +{ +fma.rn.f16x2 r2024, r1889, r2015, r2022; +} +{ +mul.f16x2 r2028, r1889, r2017; +} +{ +fma.rn.f16x2 r2031, r1892, r2015, r2028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2035, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2037, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2039, {low, high}; +} +{ +mul.f16x2 r2040, r2037, r2039; +} +{ +mul.f16x2 r2043, r2011, r2035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2046, {high, low}; +} +{ +fma.rn.f16x2 r2048, r2040, r2046, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2052, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2054, {high, high}; +} +{ +mul.f16x2 r2056, r1850, r2054; +} +{ +neg.f16x2 r2059, r2056; +} +{ +fma.rn.f16x2 r2061, r1847, r2052, r2059; +} +{ +mul.f16x2 r2065, r1847, r2054; +} +{ +fma.rn.f16x2 r2068, r1850, r2052, r2065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2072, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2074, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2076, {low, high}; +} +{ +mul.f16x2 r2077, r2074, r2076; +} +{ +mul.f16x2 r2080, r2048, r2072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2083, {high, low}; +} +{ +fma.rn.f16x2 r2085, r2077, r2083, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2089, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2091, {high, high}; +} +{ +mul.f16x2 r2093, r1862, r2091; +} +{ 
+neg.f16x2 r2096, r2093; +} +{ +fma.rn.f16x2 r2098, r1859, r2089, r2096; +} +{ +mul.f16x2 r2102, r1859, r2091; +} +{ +fma.rn.f16x2 r2105, r1862, r2089, r2102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2109, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2111, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2113, {low, high}; +} +{ +mul.f16x2 r2114, r2111, r2113; +} +{ +mul.f16x2 r2117, r2085, r2109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2120, {high, low}; +} +{ +fma.rn.f16x2 r2122, r2114, r2120, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2128, {high, high}; +} +{ +mul.f16x2 r2130, r1874, r2128; +} +{ +neg.f16x2 r2133, r2130; +} +{ +fma.rn.f16x2 r2135, r1871, r2126, r2133; +} +{ +mul.f16x2 r2139, r1871, r2128; +} +{ +fma.rn.f16x2 r2142, r1874, r2126, r2139; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2146, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2148, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2150, {low, high}; +} +{ +mul.f16x2 r2151, r2148, r2150; +} +{ +mul.f16x2 r2154, r2122, r2146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2157, {high, low}; +} +{ +fma.rn.f16x2 r2159, r2151, r2157, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2163, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2165, {high, high}; +} +{ +mul.f16x2 r2167, r1886, r2165; +} +{ +neg.f16x2 r2170, r2167; +} +{ +fma.rn.f16x2 r2172, r1883, r2163, r2170; +} +{ +mul.f16x2 r2176, r1883, r2165; +} +{ +fma.rn.f16x2 r2179, r1886, r2163, r2176; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2183, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2185, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2187, {low, high}; +} +{ +mul.f16x2 r2188, r2185, r2187; +} +{ +mul.f16x2 r2191, r2159, r2183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2194, {high, low}; +} +{ +fma.rn.f16x2 r2196, r2188, r2194, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2202, {high, high}; +} +{ +mul.f16x2 r2204, r1898, r2202; +} +{ +neg.f16x2 r2207, r2204; +} +{ +fma.rn.f16x2 r2209, r1895, r2200, r2207; +} +{ +mul.f16x2 r2213, r1895, r2202; +} +{ +fma.rn.f16x2 r2216, r1898, r2200, r2213; +} +barrier.sync 0; +mad.lo.s32 r4152, r4147, 400, r4151; +st.shared.u32 [r4152], r1841; +st.shared.u32 [r4152+40], r1913; +st.shared.u32 [r4152+80], r1950; +st.shared.u32 [r4152+120], r1987; +st.shared.u32 [r4152+160], r2024; +st.shared.u32 [r4152+200], r2061; +st.shared.u32 [r4152+240], r2098; +st.shared.u32 [r4152+280], r2135; +st.shared.u32 [r4152+320], r2172; +st.shared.u32 [r4152+360], r2209; +barrier.sync 0; +ld.shared.u32 r2249, [r4146]; +ld.shared.u32 r2569, [r4146+4000]; +ld.shared.u32 r2246, [r4146+8000]; +ld.shared.u32 r2566, [r4146+12000]; +ld.shared.u32 r2252, [r4146+16000]; +ld.shared.u32 r2572, [r4146+20000]; +ld.shared.u32 r2253, [r4146+24000]; +ld.shared.u32 r2573, [r4146+28000]; +ld.shared.u32 r2247, [r4146+32000]; +ld.shared.u32 r2567, [r4146+36000]; +barrier.sync 0; +st.shared.u32 [r4152], r1844; +st.shared.u32 [r4152+40], r1920; +st.shared.u32 [r4152+80], r1957; +st.shared.u32 [r4152+120], r1994; +st.shared.u32 [r4152+160], r2031; +st.shared.u32 [r4152+200], r2068; +st.shared.u32 [r4152+240], r2105; +st.shared.u32 [r4152+280], r2142; +st.shared.u32 [r4152+320], r2179; +st.shared.u32 [r4152+360], r2216; +barrier.sync 0; +ld.shared.u32 r2261, [r4146]; +ld.shared.u32 r2581, [r4146+4000]; 
+ld.shared.u32 r2258, [r4146+8000]; +ld.shared.u32 r2578, [r4146+12000]; +ld.shared.u32 r2264, [r4146+16000]; +ld.shared.u32 r2584, [r4146+20000]; +ld.shared.u32 r2265, [r4146+24000]; +ld.shared.u32 r2585, [r4146+28000]; +ld.shared.u32 r2259, [r4146+32000]; +ld.shared.u32 r2579, [r4146+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +add.f16x2 r2245, r2246, r2247; +} +{ +add.f16x2 r2248, r2249, r2245; +} +{ +add.f16x2 r2251, r2252, r2253; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r2258, r2259; +} +{ +add.f16x2 r2260, r2261, r2257; +} +{ +add.f16x2 r2263, r2264, r2265; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, r2246, r2247; +} +{ +mul.f16x2 r2272, r2269, r2237; +} +{ +add.f16x2 r2275, r2249, r2272; +} +{ +add.f16x2 r2278, r2252, r2253; +} +{ +mul.f16x2 r2281, r2278, r2239; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +sub.f16x2 r2287, r2258, r2259; +} +{ +mul.f16x2 r2290, r2287, r2238; +} +{ +sub.f16x2 r2293, r2264, r2265; +} +{ +mul.f16x2 r2296, r2293, r2240; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r2284, r2299; +} +{ +add.f16x2 r2305, r2246, r2247; +} +{ +mul.f16x2 r2308, r2305, r2237; +} +{ +add.f16x2 r2311, r2249, r2308; +} +{ +add.f16x2 r2314, r2252, r2253; +} +{ +mul.f16x2 r2317, r2314, r2239; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 
r2323, r2258, r2259; +} +{ +mul.f16x2 r2326, r2323, r2238; +} +{ +sub.f16x2 r2329, r2264, r2265; +} +{ +mul.f16x2 r2332, r2329, r2240; +} +{ +add.f16x2 r2335, r2326, r2332; +} +{ +add.f16x2 r2338, r2320, r2335; +} +{ +add.f16x2 r2341, r2246, r2247; +} +{ +mul.f16x2 r2344, r2341, r2239; +} +{ +add.f16x2 r2347, r2249, r2344; +} +{ +add.f16x2 r2350, r2252, r2253; +} +{ +mul.f16x2 r2353, r2350, r2241; +} +{ +add.f16x2 r2356, r2347, r2353; +} +{ +sub.f16x2 r2359, r2258, r2259; +} +{ +mul.f16x2 r2362, r2359, r2240; +} +{ +sub.f16x2 r2365, r2264, r2265; +} +{ +mul.f16x2 r2368, r2365, r2243; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +sub.f16x2 r2374, r2356, r2371; +} +{ +add.f16x2 r2377, r2246, r2247; +} +{ +mul.f16x2 r2380, r2377, r2239; +} +{ +add.f16x2 r2383, r2249, r2380; +} +{ +add.f16x2 r2386, r2252, r2253; +} +{ +mul.f16x2 r2389, r2386, r2241; +} +{ +add.f16x2 r2392, r2383, r2389; +} +{ +sub.f16x2 r2395, r2258, r2259; +} +{ +mul.f16x2 r2398, r2395, r2240; +} +{ +sub.f16x2 r2401, r2264, r2265; +} +{ +mul.f16x2 r2404, r2401, r2243; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +add.f16x2 r2410, r2392, r2407; +} +{ +add.f16x2 r2413, r2258, r2259; +} +{ +mul.f16x2 r2416, r2413, r2237; +} +{ +add.f16x2 r2419, r2261, r2416; +} +{ +add.f16x2 r2422, r2264, r2265; +} +{ +mul.f16x2 r2425, r2422, r2239; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r2246, r2247; +} +{ +mul.f16x2 r2434, r2431, r2238; +} +{ +sub.f16x2 r2437, r2252, r2253; +} +{ +mul.f16x2 r2440, r2437, r2240; +} +{ +add.f16x2 r2443, r2434, r2440; +} +{ +add.f16x2 r2446, r2428, r2443; +} +{ +add.f16x2 r2449, r2258, r2259; +} +{ +mul.f16x2 r2452, r2449, r2237; +} +{ +add.f16x2 r2455, r2261, r2452; +} +{ +add.f16x2 r2458, r2264, r2265; +} +{ +mul.f16x2 r2461, r2458, r2239; +} +{ +add.f16x2 r2464, r2455, r2461; +} +{ +sub.f16x2 r2467, r2246, r2247; +} +{ +mul.f16x2 r2470, r2467, r2238; +} +{ +sub.f16x2 r2473, r2252, r2253; +} +{ +mul.f16x2 r2476, r2473, r2240; +} +{ +add.f16x2 r2479, r2470, r2476; +} 
+{ +sub.f16x2 r2482, r2464, r2479; +} +{ +add.f16x2 r2485, r2258, r2259; +} +{ +mul.f16x2 r2488, r2485, r2239; +} +{ +add.f16x2 r2491, r2261, r2488; +} +{ +add.f16x2 r2494, r2264, r2265; +} +{ +mul.f16x2 r2497, r2494, r2241; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +sub.f16x2 r2503, r2246, r2247; +} +{ +mul.f16x2 r2506, r2503, r2240; +} +{ +sub.f16x2 r2509, r2252, r2253; +} +{ +mul.f16x2 r2512, r2509, r2243; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +add.f16x2 r2518, r2500, r2515; +} +{ +add.f16x2 r2521, r2258, r2259; +} +{ +mul.f16x2 r2524, r2521, r2239; +} +{ +add.f16x2 r2527, r2261, r2524; +} +{ +add.f16x2 r2530, r2264, r2265; +} +{ +mul.f16x2 r2533, r2530, r2241; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r2246, r2247; +} +{ +mul.f16x2 r2542, r2539, r2240; +} +{ +sub.f16x2 r2545, r2252, r2253; +} +{ +mul.f16x2 r2548, r2545, r2243; +} +{ +add.f16x2 r2551, r2542, r2548; +} +{ +sub.f16x2 r2554, r2536, r2551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2557, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2558, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2562, {low, high}; +} +{ +neg.f16x2 r2563, r2562; +} +{ +add.f16x2 r2565, r2566, r2567; +} +{ +add.f16x2 r2568, r2569, r2565; +} +{ +add.f16x2 r2571, r2572, r2573; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2566, r2567; +} +{ 
+mul.f16x2 r2592, r2589, r2557; +} +{ +add.f16x2 r2595, r2569, r2592; +} +{ +add.f16x2 r2598, r2572, r2573; +} +{ +mul.f16x2 r2601, r2598, r2559; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +sub.f16x2 r2607, r2578, r2579; +} +{ +mul.f16x2 r2610, r2607, r2558; +} +{ +sub.f16x2 r2613, r2584, r2585; +} +{ +mul.f16x2 r2616, r2613, r2560; +} +{ +add.f16x2 r2619, r2610, r2616; +} +{ +sub.f16x2 r2622, r2604, r2619; +} +{ +add.f16x2 r2625, r2566, r2567; +} +{ +mul.f16x2 r2628, r2625, r2557; +} +{ +add.f16x2 r2631, r2569, r2628; +} +{ +add.f16x2 r2634, r2572, r2573; +} +{ +mul.f16x2 r2637, r2634, r2559; +} +{ +add.f16x2 r2640, r2631, r2637; +} +{ +sub.f16x2 r2643, r2578, r2579; +} +{ +mul.f16x2 r2646, r2643, r2558; +} +{ +sub.f16x2 r2649, r2584, r2585; +} +{ +mul.f16x2 r2652, r2649, r2560; +} +{ +add.f16x2 r2655, r2646, r2652; +} +{ +add.f16x2 r2658, r2640, r2655; +} +{ +add.f16x2 r2661, r2566, r2567; +} +{ +mul.f16x2 r2664, r2661, r2559; +} +{ +add.f16x2 r2667, r2569, r2664; +} +{ +add.f16x2 r2670, r2572, r2573; +} +{ +mul.f16x2 r2673, r2670, r2561; +} +{ +add.f16x2 r2676, r2667, r2673; +} +{ +sub.f16x2 r2679, r2578, r2579; +} +{ +mul.f16x2 r2682, r2679, r2560; +} +{ +sub.f16x2 r2685, r2584, r2585; +} +{ +mul.f16x2 r2688, r2685, r2563; +} +{ +add.f16x2 r2691, r2682, r2688; +} +{ +sub.f16x2 r2694, r2676, r2691; +} +{ +add.f16x2 r2697, r2566, r2567; +} +{ +mul.f16x2 r2700, r2697, r2559; +} +{ +add.f16x2 r2703, r2569, r2700; +} +{ +add.f16x2 r2706, r2572, r2573; +} +{ +mul.f16x2 r2709, r2706, r2561; +} +{ +add.f16x2 r2712, r2703, r2709; +} +{ +sub.f16x2 r2715, r2578, r2579; +} +{ +mul.f16x2 r2718, r2715, r2560; +} +{ +sub.f16x2 r2721, r2584, r2585; +} +{ +mul.f16x2 r2724, r2721, r2563; +} +{ +add.f16x2 r2727, r2718, r2724; +} +{ +add.f16x2 r2730, r2712, r2727; +} +{ +add.f16x2 r2733, r2578, r2579; +} +{ +mul.f16x2 r2736, r2733, r2557; +} +{ +add.f16x2 r2739, r2581, r2736; +} +{ +add.f16x2 r2742, r2584, r2585; +} +{ +mul.f16x2 r2745, r2742, r2559; +} +{ +add.f16x2 r2748, 
r2739, r2745; +} +{ +sub.f16x2 r2751, r2566, r2567; +} +{ +mul.f16x2 r2754, r2751, r2558; +} +{ +sub.f16x2 r2757, r2572, r2573; +} +{ +mul.f16x2 r2760, r2757, r2560; +} +{ +add.f16x2 r2763, r2754, r2760; +} +{ +add.f16x2 r2766, r2748, r2763; +} +{ +add.f16x2 r2769, r2578, r2579; +} +{ +mul.f16x2 r2772, r2769, r2557; +} +{ +add.f16x2 r2775, r2581, r2772; +} +{ +add.f16x2 r2778, r2584, r2585; +} +{ +mul.f16x2 r2781, r2778, r2559; +} +{ +add.f16x2 r2784, r2775, r2781; +} +{ +sub.f16x2 r2787, r2566, r2567; +} +{ +mul.f16x2 r2790, r2787, r2558; +} +{ +sub.f16x2 r2793, r2572, r2573; +} +{ +mul.f16x2 r2796, r2793, r2560; +} +{ +add.f16x2 r2799, r2790, r2796; +} +{ +sub.f16x2 r2802, r2784, r2799; +} +{ +add.f16x2 r2805, r2578, r2579; +} +{ +mul.f16x2 r2808, r2805, r2559; +} +{ +add.f16x2 r2811, r2581, r2808; +} +{ +add.f16x2 r2814, r2584, r2585; +} +{ +mul.f16x2 r2817, r2814, r2561; +} +{ +add.f16x2 r2820, r2811, r2817; +} +{ +sub.f16x2 r2823, r2566, r2567; +} +{ +mul.f16x2 r2826, r2823, r2560; +} +{ +sub.f16x2 r2829, r2572, r2573; +} +{ +mul.f16x2 r2832, r2829, r2563; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2820, r2835; +} +{ +add.f16x2 r2841, r2578, r2579; +} +{ +mul.f16x2 r2844, r2841, r2559; +} +{ +add.f16x2 r2847, r2581, r2844; +} +{ +add.f16x2 r2850, r2584, r2585; +} +{ +mul.f16x2 r2853, r2850, r2561; +} +{ +add.f16x2 r2856, r2847, r2853; +} +{ +sub.f16x2 r2859, r2566, r2567; +} +{ +mul.f16x2 r2862, r2859, r2560; +} +{ +sub.f16x2 r2865, r2572, r2573; +} +{ +mul.f16x2 r2868, r2865, r2563; +} +{ +add.f16x2 r2871, r2862, r2868; +} +{ +sub.f16x2 r2874, r2856, r2871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r2877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2878, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2884, {low, high}; +} +{ +mul.f16x2 r2895, r2622, r2877; +} +{ +mul.f16x2 r2898, r2766, r2878; +} +{ +sub.f16x2 r2901, r2895, r2898; +} +{ +mul.f16x2 r2904, r2622, r2878; +} +{ +fma.rn.f16x2 r2907, r2766, r2877, r2904; +} +{ +mul.f16x2 r2911, r2694, r2879; +} +{ +mul.f16x2 r2914, r2838, r2880; +} +{ +sub.f16x2 r2917, r2911, r2914; +} +{ +mul.f16x2 r2920, r2694, r2880; +} +{ +fma.rn.f16x2 r2923, r2838, r2879, r2920; +} +{ +mul.f16x2 r2927, r2730, r2881; +} +{ +mul.f16x2 r2930, r2874, r2882; +} +{ +sub.f16x2 r2933, r2927, r2930; +} +{ +mul.f16x2 r2936, r2730, r2882; +} +{ +fma.rn.f16x2 r2939, r2874, r2881, r2936; +} +{ +mul.f16x2 r2943, r2658, r2883; +} +{ +mul.f16x2 r2946, r2802, r2884; +} +{ +sub.f16x2 r2949, r2943, r2946; +} +{ +mul.f16x2 r2952, r2658, r2884; +} +{ +fma.rn.f16x2 r2955, r2802, r2883, r2952; +} +{ +add.f16x2 r2959, r2254, r2574; +} +{ +add.f16x2 r2962, r2266, r2586; +} +{ +sub.f16x2 r2965, r2254, r2574; +} +{ +sub.f16x2 r2968, r2266, r2586; +} +{ +add.f16x2 r2971, r2302, r2901; +} +{ +add.f16x2 r2974, r2446, r2907; +} +{ +sub.f16x2 r2977, r2302, r2901; +} +{ +sub.f16x2 r2980, r2446, r2907; +} +{ +add.f16x2 r2983, r2374, r2917; +} +{ +add.f16x2 r2986, r2518, r2923; +} +{ +sub.f16x2 r2989, r2374, r2917; +} +{ +sub.f16x2 r2992, r2518, r2923; +} +{ +add.f16x2 r2995, r2410, r2933; +} +{ +add.f16x2 r2998, r2554, r2939; +} +{ +sub.f16x2 r3001, r2410, r2933; +} +{ +sub.f16x2 r3004, r2554, r2939; +} +{ +add.f16x2 r3007, r2338, r2949; +} +{ +add.f16x2 r3010, r2482, r2955; +} 
+{ +sub.f16x2 r3013, r2338, r2949; +} +{ +sub.f16x2 r3016, r2482, r2955; +} +mul.wide.u32 rd6, r4143, 1374389535; +shr.u64 rd7, rd6, 37; +cvt.u32.u64 r4153, rd7; +mul.lo.s32 r4154, r4153, 100; +sub.s32 r4155, r4143, r4154; +shl.b32 r4156, r4155, 2; +add.s32 r4157, r4144, r4156; +cvt.rn.f32.u32 f313, r4153; +mul.f32 f314, f313, 0f3D80ADFD; +cos.approx.f32 f225, f314; +sin.approx.f32 f315, f314; +neg.f32 f226, f315; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r3019, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2974, r3024; +} +{ +neg.f16x2 r3029, r3026; +} +{ +fma.rn.f16x2 r3031, r2971, r3022, r3029; +} +{ +mul.f16x2 r3035, r2971, r3024; +} +{ +fma.rn.f16x2 r3038, r2974, r3022, r3035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3019, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2986, r3061; +} +{ +neg.f16x2 r3066, r3063; +} +{ +fma.rn.f16x2 r3068, r2983, r3059, r3066; +} +{ +mul.f16x2 r3072, r2983, r3061; +} +{ +fma.rn.f16x2 r3075, r2986, r3059, r3072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; 
+cvt.rn.f16.f32 high, f244; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2998, r3098; +} +{ +neg.f16x2 r3103, r3100; +} +{ +fma.rn.f16x2 r3105, r2995, r3096, r3103; +} +{ +mul.f16x2 r3109, r2995, r3098; +} +{ +fma.rn.f16x2 r3112, r2998, r3096, r3109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r3010, r3135; +} +{ +neg.f16x2 r3140, r3137; +} +{ +fma.rn.f16x2 r3142, r3007, r3133, r3140; +} +{ +mul.f16x2 r3146, r3007, r3135; +} +{ +fma.rn.f16x2 r3149, r3010, r3133, r3146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, 
r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2968, r3172; +} +{ +neg.f16x2 r3177, r3174; +} +{ +fma.rn.f16x2 r3179, r2965, r3170, r3177; +} +{ +mul.f16x2 r3183, r2965, r3172; +} +{ +fma.rn.f16x2 r3186, r2968, r3170, r3183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2980, r3209; +} +{ +neg.f16x2 r3214, r3211; +} +{ +fma.rn.f16x2 r3216, r2977, r3207, r3214; +} +{ +mul.f16x2 r3220, r2977, r3209; +} +{ +fma.rn.f16x2 r3223, r2980, r3207, r3220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2992, r3246; +} +{ +neg.f16x2 r3251, r3248; +} +{ 
+fma.rn.f16x2 r3253, r2989, r3244, r3251; +} +{ +mul.f16x2 r3257, r2989, r3246; +} +{ +fma.rn.f16x2 r3260, r2992, r3244, r3257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r3004, r3283; +} +{ +neg.f16x2 r3288, r3285; +} +{ +fma.rn.f16x2 r3290, r3001, r3281, r3288; +} +{ +mul.f16x2 r3294, r3001, r3283; +} +{ +fma.rn.f16x2 r3297, r3004, r3281, r3294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3019; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r3016, r3320; +} +{ +neg.f16x2 r3325, r3322; +} +{ +fma.rn.f16x2 r3327, r3013, r3318, r3325; +} +{ +mul.f16x2 r3331, r3013, r3320; +} +{ +fma.rn.f16x2 r3334, r3016, r3318, r3331; +} +barrier.sync 0; +mad.lo.s32 r4158, r4153, 4000, r4157; +st.shared.u32 [r4158], r2959; +st.shared.u32 [r4158+400], r3031; 
+st.shared.u32 [r4158+800], r3068; +st.shared.u32 [r4158+1200], r3105; +st.shared.u32 [r4158+1600], r3142; +st.shared.u32 [r4158+2000], r3179; +st.shared.u32 [r4158+2400], r3216; +st.shared.u32 [r4158+2800], r3253; +st.shared.u32 [r4158+3200], r3290; +st.shared.u32 [r4158+3600], r3327; +barrier.sync 0; +ld.shared.u32 r3367, [r4146]; +ld.shared.u32 r3687, [r4146+4000]; +ld.shared.u32 r3364, [r4146+8000]; +ld.shared.u32 r3684, [r4146+12000]; +ld.shared.u32 r3370, [r4146+16000]; +ld.shared.u32 r3690, [r4146+20000]; +ld.shared.u32 r3371, [r4146+24000]; +ld.shared.u32 r3691, [r4146+28000]; +ld.shared.u32 r3365, [r4146+32000]; +ld.shared.u32 r3685, [r4146+36000]; +barrier.sync 0; +st.shared.u32 [r4158], r2962; +st.shared.u32 [r4158+400], r3038; +st.shared.u32 [r4158+800], r3075; +st.shared.u32 [r4158+1200], r3112; +st.shared.u32 [r4158+1600], r3149; +st.shared.u32 [r4158+2000], r3186; +st.shared.u32 [r4158+2400], r3223; +st.shared.u32 [r4158+2800], r3260; +st.shared.u32 [r4158+3200], r3297; +st.shared.u32 [r4158+3600], r3334; +barrier.sync 0; +ld.shared.u32 r3379, [r4146]; +ld.shared.u32 r3699, [r4146+4000]; +ld.shared.u32 r3376, [r4146+8000]; +ld.shared.u32 r3696, [r4146+12000]; +ld.shared.u32 r3382, [r4146+16000]; +ld.shared.u32 r3702, [r4146+20000]; +ld.shared.u32 r3383, [r4146+24000]; +ld.shared.u32 r3703, [r4146+28000]; +ld.shared.u32 r3377, [r4146+32000]; +ld.shared.u32 r3697, [r4146+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3360, {low, high}; +} +{ +neg.f16x2 r3361, r3360; +} +{ +add.f16x2 r3363, r3364, r3365; +} +{ +add.f16x2 r3366, r3367, r3363; +} +{ +add.f16x2 r3369, r3370, r3371; +} +{ +add.f16x2 r3372, r3366, r3369; +} +{ +add.f16x2 r3375, r3376, r3377; +} +{ +add.f16x2 r3378, r3379, r3375; +} +{ +add.f16x2 r3381, r3382, r3383; +} +{ +add.f16x2 r3384, r3378, r3381; +} +{ +add.f16x2 r3387, r3364, r3365; +} +{ +mul.f16x2 r3390, r3387, r3355; +} +{ +add.f16x2 r3393, r3367, r3390; +} +{ +add.f16x2 r3396, r3370, r3371; +} +{ +mul.f16x2 r3399, r3396, r3357; +} +{ +add.f16x2 r3402, r3393, r3399; +} +{ +sub.f16x2 r3405, r3376, r3377; +} +{ +mul.f16x2 r3408, r3405, r3356; +} +{ +sub.f16x2 r3411, r3382, r3383; +} +{ +mul.f16x2 r3414, r3411, r3358; +} +{ +add.f16x2 r3417, r3408, r3414; +} +{ +sub.f16x2 r3420, r3402, r3417; +} +{ +add.f16x2 r3423, r3364, r3365; +} +{ +mul.f16x2 r3426, r3423, r3355; +} +{ +add.f16x2 r3429, r3367, r3426; +} +{ +add.f16x2 r3432, r3370, r3371; +} +{ +mul.f16x2 r3435, r3432, r3357; +} +{ +add.f16x2 r3438, r3429, r3435; +} +{ +sub.f16x2 r3441, r3376, r3377; +} +{ +mul.f16x2 r3444, r3441, r3356; +} +{ +sub.f16x2 r3447, r3382, r3383; +} +{ +mul.f16x2 r3450, r3447, r3358; +} +{ +add.f16x2 r3453, r3444, r3450; +} +{ +add.f16x2 r3456, r3438, r3453; +} +{ +add.f16x2 r3459, r3364, r3365; +} +{ +mul.f16x2 r3462, r3459, r3357; +} +{ +add.f16x2 r3465, r3367, r3462; +} +{ +add.f16x2 r3468, r3370, r3371; +} +{ +mul.f16x2 r3471, r3468, r3359; +} +{ +add.f16x2 r3474, r3465, r3471; +} +{ +sub.f16x2 r3477, r3376, r3377; +} +{ +mul.f16x2 r3480, r3477, r3358; +} +{ +sub.f16x2 r3483, r3382, r3383; +} +{ +mul.f16x2 r3486, r3483, r3361; +} +{ +add.f16x2 r3489, r3480, r3486; +} +{ +sub.f16x2 r3492, r3474, r3489; +} +{ +add.f16x2 r3495, r3364, r3365; +} +{ +mul.f16x2 r3498, r3495, r3357; +} +{ +add.f16x2 r3501, r3367, r3498; +} +{ +add.f16x2 r3504, r3370, r3371; +} +{ +mul.f16x2 r3507, r3504, r3359; +} +{ +add.f16x2 
r3510, r3501, r3507; +} +{ +sub.f16x2 r3513, r3376, r3377; +} +{ +mul.f16x2 r3516, r3513, r3358; +} +{ +sub.f16x2 r3519, r3382, r3383; +} +{ +mul.f16x2 r3522, r3519, r3361; +} +{ +add.f16x2 r3525, r3516, r3522; +} +{ +add.f16x2 r3528, r3510, r3525; +} +{ +add.f16x2 r3531, r3376, r3377; +} +{ +mul.f16x2 r3534, r3531, r3355; +} +{ +add.f16x2 r3537, r3379, r3534; +} +{ +add.f16x2 r3540, r3382, r3383; +} +{ +mul.f16x2 r3543, r3540, r3357; +} +{ +add.f16x2 r3546, r3537, r3543; +} +{ +sub.f16x2 r3549, r3364, r3365; +} +{ +mul.f16x2 r3552, r3549, r3356; +} +{ +sub.f16x2 r3555, r3370, r3371; +} +{ +mul.f16x2 r3558, r3555, r3358; +} +{ +add.f16x2 r3561, r3552, r3558; +} +{ +add.f16x2 r3564, r3546, r3561; +} +{ +add.f16x2 r3567, r3376, r3377; +} +{ +mul.f16x2 r3570, r3567, r3355; +} +{ +add.f16x2 r3573, r3379, r3570; +} +{ +add.f16x2 r3576, r3382, r3383; +} +{ +mul.f16x2 r3579, r3576, r3357; +} +{ +add.f16x2 r3582, r3573, r3579; +} +{ +sub.f16x2 r3585, r3364, r3365; +} +{ +mul.f16x2 r3588, r3585, r3356; +} +{ +sub.f16x2 r3591, r3370, r3371; +} +{ +mul.f16x2 r3594, r3591, r3358; +} +{ +add.f16x2 r3597, r3588, r3594; +} +{ +sub.f16x2 r3600, r3582, r3597; +} +{ +add.f16x2 r3603, r3376, r3377; +} +{ +mul.f16x2 r3606, r3603, r3357; +} +{ +add.f16x2 r3609, r3379, r3606; +} +{ +add.f16x2 r3612, r3382, r3383; +} +{ +mul.f16x2 r3615, r3612, r3359; +} +{ +add.f16x2 r3618, r3609, r3615; +} +{ +sub.f16x2 r3621, r3364, r3365; +} +{ +mul.f16x2 r3624, r3621, r3358; +} +{ +sub.f16x2 r3627, r3370, r3371; +} +{ +mul.f16x2 r3630, r3627, r3361; +} +{ +add.f16x2 r3633, r3624, r3630; +} +{ +add.f16x2 r3636, r3618, r3633; +} +{ +add.f16x2 r3639, r3376, r3377; +} +{ +mul.f16x2 r3642, r3639, r3357; +} +{ +add.f16x2 r3645, r3379, r3642; +} +{ +add.f16x2 r3648, r3382, r3383; +} +{ +mul.f16x2 r3651, r3648, r3359; +} +{ +add.f16x2 r3654, r3645, r3651; +} +{ +sub.f16x2 r3657, r3364, r3365; +} +{ +mul.f16x2 r3660, r3657, r3358; +} +{ +sub.f16x2 r3663, r3370, r3371; +} +{ +mul.f16x2 r3666, r3663, r3361; +} 
+{ +add.f16x2 r3669, r3660, r3666; +} +{ +sub.f16x2 r3672, r3654, r3669; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3675, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3680, {low, high}; +} +{ +neg.f16x2 r3681, r3680; +} +{ +add.f16x2 r3683, r3684, r3685; +} +{ +add.f16x2 r3686, r3687, r3683; +} +{ +add.f16x2 r3689, r3690, r3691; +} +{ +add.f16x2 r3692, r3686, r3689; +} +{ +add.f16x2 r3695, r3696, r3697; +} +{ +add.f16x2 r3698, r3699, r3695; +} +{ +add.f16x2 r3701, r3702, r3703; +} +{ +add.f16x2 r3704, r3698, r3701; +} +{ +add.f16x2 r3707, r3684, r3685; +} +{ +mul.f16x2 r3710, r3707, r3675; +} +{ +add.f16x2 r3713, r3687, r3710; +} +{ +add.f16x2 r3716, r3690, r3691; +} +{ +mul.f16x2 r3719, r3716, r3677; +} +{ +add.f16x2 r3722, r3713, r3719; +} +{ +sub.f16x2 r3725, r3696, r3697; +} +{ +mul.f16x2 r3728, r3725, r3676; +} +{ +sub.f16x2 r3731, r3702, r3703; +} +{ +mul.f16x2 r3734, r3731, r3678; +} +{ +add.f16x2 r3737, r3728, r3734; +} +{ +sub.f16x2 r3740, r3722, r3737; +} +{ +add.f16x2 r3743, r3684, r3685; +} +{ +mul.f16x2 r3746, r3743, r3675; +} +{ +add.f16x2 r3749, r3687, r3746; +} +{ +add.f16x2 r3752, r3690, r3691; +} +{ +mul.f16x2 r3755, r3752, r3677; +} +{ +add.f16x2 r3758, r3749, r3755; +} +{ +sub.f16x2 r3761, r3696, r3697; +} +{ +mul.f16x2 r3764, r3761, r3676; +} +{ +sub.f16x2 r3767, r3702, r3703; +} +{ +mul.f16x2 r3770, r3767, r3678; +} +{ +add.f16x2 r3773, r3764, r3770; +} +{ +add.f16x2 r3776, r3758, r3773; +} +{ 
+add.f16x2 r3779, r3684, r3685; +} +{ +mul.f16x2 r3782, r3779, r3677; +} +{ +add.f16x2 r3785, r3687, r3782; +} +{ +add.f16x2 r3788, r3690, r3691; +} +{ +mul.f16x2 r3791, r3788, r3679; +} +{ +add.f16x2 r3794, r3785, r3791; +} +{ +sub.f16x2 r3797, r3696, r3697; +} +{ +mul.f16x2 r3800, r3797, r3678; +} +{ +sub.f16x2 r3803, r3702, r3703; +} +{ +mul.f16x2 r3806, r3803, r3681; +} +{ +add.f16x2 r3809, r3800, r3806; +} +{ +sub.f16x2 r3812, r3794, r3809; +} +{ +add.f16x2 r3815, r3684, r3685; +} +{ +mul.f16x2 r3818, r3815, r3677; +} +{ +add.f16x2 r3821, r3687, r3818; +} +{ +add.f16x2 r3824, r3690, r3691; +} +{ +mul.f16x2 r3827, r3824, r3679; +} +{ +add.f16x2 r3830, r3821, r3827; +} +{ +sub.f16x2 r3833, r3696, r3697; +} +{ +mul.f16x2 r3836, r3833, r3678; +} +{ +sub.f16x2 r3839, r3702, r3703; +} +{ +mul.f16x2 r3842, r3839, r3681; +} +{ +add.f16x2 r3845, r3836, r3842; +} +{ +add.f16x2 r3848, r3830, r3845; +} +{ +add.f16x2 r3851, r3696, r3697; +} +{ +mul.f16x2 r3854, r3851, r3675; +} +{ +add.f16x2 r3857, r3699, r3854; +} +{ +add.f16x2 r3860, r3702, r3703; +} +{ +mul.f16x2 r3863, r3860, r3677; +} +{ +add.f16x2 r3866, r3857, r3863; +} +{ +sub.f16x2 r3869, r3684, r3685; +} +{ +mul.f16x2 r3872, r3869, r3676; +} +{ +sub.f16x2 r3875, r3690, r3691; +} +{ +mul.f16x2 r3878, r3875, r3678; +} +{ +add.f16x2 r3881, r3872, r3878; +} +{ +add.f16x2 r3884, r3866, r3881; +} +{ +add.f16x2 r3887, r3696, r3697; +} +{ +mul.f16x2 r3890, r3887, r3675; +} +{ +add.f16x2 r3893, r3699, r3890; +} +{ +add.f16x2 r3896, r3702, r3703; +} +{ +mul.f16x2 r3899, r3896, r3677; +} +{ +add.f16x2 r3902, r3893, r3899; +} +{ +sub.f16x2 r3905, r3684, r3685; +} +{ +mul.f16x2 r3908, r3905, r3676; +} +{ +sub.f16x2 r3911, r3690, r3691; +} +{ +mul.f16x2 r3914, r3911, r3678; +} +{ +add.f16x2 r3917, r3908, r3914; +} +{ +sub.f16x2 r3920, r3902, r3917; +} +{ +add.f16x2 r3923, r3696, r3697; +} +{ +mul.f16x2 r3926, r3923, r3677; +} +{ +add.f16x2 r3929, r3699, r3926; +} +{ +add.f16x2 r3932, r3702, r3703; +} +{ +mul.f16x2 r3935, 
r3932, r3679; +} +{ +add.f16x2 r3938, r3929, r3935; +} +{ +sub.f16x2 r3941, r3684, r3685; +} +{ +mul.f16x2 r3944, r3941, r3678; +} +{ +sub.f16x2 r3947, r3690, r3691; +} +{ +mul.f16x2 r3950, r3947, r3681; +} +{ +add.f16x2 r3953, r3944, r3950; +} +{ +add.f16x2 r3956, r3938, r3953; +} +{ +add.f16x2 r3959, r3696, r3697; +} +{ +mul.f16x2 r3962, r3959, r3677; +} +{ +add.f16x2 r3965, r3699, r3962; +} +{ +add.f16x2 r3968, r3702, r3703; +} +{ +mul.f16x2 r3971, r3968, r3679; +} +{ +add.f16x2 r3974, r3965, r3971; +} +{ +sub.f16x2 r3977, r3684, r3685; +} +{ +mul.f16x2 r3980, r3977, r3678; +} +{ +sub.f16x2 r3983, r3690, r3691; +} +{ +mul.f16x2 r3986, r3983, r3681; +} +{ +add.f16x2 r3989, r3980, r3986; +} +{ +sub.f16x2 r3992, r3974, r3989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r3995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r3996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r3999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4000, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r4001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4002, {low, high}; +} +{ +mul.f16x2 r4013, r3740, r3995; +} +{ +mul.f16x2 r4016, r3884, r3996; +} +{ +sub.f16x2 r4019, r4013, r4016; +} +{ +mul.f16x2 r4022, r3740, r3996; +} +{ +fma.rn.f16x2 r4025, r3884, r3995, r4022; +} +{ +mul.f16x2 r4029, r3812, r3997; +} +{ +mul.f16x2 r4032, r3956, r3998; +} +{ +sub.f16x2 r4035, r4029, r4032; +} +{ +mul.f16x2 r4038, r3812, r3998; +} +{ 
+fma.rn.f16x2 r4041, r3956, r3997, r4038; +} +{ +mul.f16x2 r4045, r3848, r3999; +} +{ +mul.f16x2 r4048, r3992, r4000; +} +{ +sub.f16x2 r4051, r4045, r4048; +} +{ +mul.f16x2 r4054, r3848, r4000; +} +{ +fma.rn.f16x2 r4057, r3992, r3999, r4054; +} +{ +mul.f16x2 r4061, r3776, r4001; +} +{ +mul.f16x2 r4064, r3920, r4002; +} +{ +sub.f16x2 r4067, r4061, r4064; +} +{ +mul.f16x2 r4070, r3776, r4002; +} +{ +fma.rn.f16x2 r4073, r3920, r4001, r4070; +} +{ +add.f16x2 %0, r3372, r3692; +} +{ +add.f16x2 %1, r3384, r3704; +} +{ +sub.f16x2 %10, r3372, r3692; +} +{ +sub.f16x2 %11, r3384, r3704; +} +{ +add.f16x2 %2, r3420, r4019; +} +{ +add.f16x2 %3, r3564, r4025; +} +{ +sub.f16x2 %12, r3420, r4019; +} +{ +sub.f16x2 %13, r3564, r4025; +} +{ +add.f16x2 %4, r3492, r4035; +} +{ +add.f16x2 %5, r3636, r4041; +} +{ +sub.f16x2 %14, r3492, r4035; +} +{ +sub.f16x2 %15, r3636, r4041; +} +{ +add.f16x2 %6, r3528, r4051; +} +{ +add.f16x2 %7, r3672, r4057; +} +{ +sub.f16x2 %16, r3528, r4051; +} +{ +sub.f16x2 %17, r3672, r4057; +} +{ +add.f16x2 %8, r3456, r4067; +} +{ +add.f16x2 %9, r3600, r4073; +} +{ +sub.f16x2 %18, r3456, r4067; +} +{ +sub.f16x2 %19, r3600, r4073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..a303734b06ee8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp16_inv.hpp.inc @@ -0,0 +1,10212 @@ +#ifndef CUFFTDX_FFT_10000_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_10000_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1143, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<4178>; +.reg .b64 rd<9>; +mov.u32 r4153, %tid.y; +shl.b32 r4154, r4153, 1; +mov.u32 r4155, %20; +mad.lo.s32 r4156, r4154, 40000, r4155; +mov.u32 r4157, %tid.x; +mov.f32 f276, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1, {low, high}; +} +mov.f32 f270, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f284, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r5, {low, high}; +} +mov.f32 f266, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r6, {low, high}; +} 
+{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; 
+} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; 
+cvt.rn.f16.f32 high, f284; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, %39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ 
+add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, %40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f272, 
0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r645, {low, high}; +} +mov.f32 f286, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r647, {low, high}; +} +mov.f32 f282, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r648, {low, high}; +} +mov.f32 f280, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r652, {low, high}; +} +mov.f32 f243, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 
r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r4157, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r4158, rd3; +mul.lo.s32 r4159, r4158, 1000; +sub.s32 r4160, r4157, r4159; +shr.u64 rd4, rd2, 37; +cvt.u32.u64 r4161, rd4; +and.b32 r4162, r4161, 134217726; +mad.lo.s32 r4163, r4162, 40000, r4156; +cvt.rn.f32.u32 f307, r4160; +mul.f32 f308, f307, 0f3A24B5BE; +cos.approx.f32 f61, f308; +sin.approx.f32 f309, f308; +neg.f32 f62, f309; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f244, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 
r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r851, {low, high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, 
r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r4164, r4160, 80, r4163; +st.shared.v2.f32 [r4164], {r727, r730}; +st.shared.v2.f32 [r4164+8], {r797, r806}; +st.shared.v2.f32 [r4164+16], {r834, r843}; +st.shared.v2.f32 [r4164+24], {r871, r880}; +st.shared.v2.f32 [r4164+32], {r908, r917}; +st.shared.v2.f32 [r4164+40], {r945, r954}; +st.shared.v2.f32 [r4164+48], {r982, r991}; +st.shared.v2.f32 [r4164+56], {r1019, r1028}; +st.shared.v2.f32 [r4164+64], {r1056, r1065}; +st.shared.v2.f32 [r4164+72], {r1093, r1102}; +barrier.sync 0; +mad.lo.s32 r4165, r4160, -72, r4164; +ld.shared.u32 r1137, [r4165]; +ld.shared.u32 r1149, [r4165+4]; +ld.shared.u32 r1459, [r4165+8000]; +ld.shared.u32 r1471, [r4165+8004]; +ld.shared.u32 r1134, [r4165+16000]; +ld.shared.u32 r1146, [r4165+16004]; +ld.shared.u32 r1456, [r4165+24000]; +ld.shared.u32 r1468, [r4165+24004]; +ld.shared.u32 r1140, [r4165+32000]; +ld.shared.u32 r1152, [r4165+32004]; +ld.shared.u32 r1462, [r4165+40000]; +ld.shared.u32 r1474, [r4165+40004]; +ld.shared.u32 r1141, [r4165+48000]; +ld.shared.u32 r1153, [r4165+48004]; +ld.shared.u32 r1463, [r4165+56000]; +ld.shared.u32 r1475, [r4165+56004]; +ld.shared.u32 r1135, [r4165+64000]; +ld.shared.u32 r1147, [r4165+64004]; +ld.shared.u32 r1457, [r4165+72000]; +ld.shared.u32 r1469, [r4165+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1131, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ +add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1129; +} +{ +sub.f16x2 r1253, r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; 
+} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 r1409, r1146, r1147; +} +{ +mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, 
r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, r1515, r1445; +} +{ +add.f16x2 r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, 
r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ +sub.f16x2 r1677, r1456, r1457; +} +{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 
r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} +{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ 
+fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ +fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 r1849, r1142, r1464; +} +{ +add.f16x2 r1852, r1154, r1476; +} +{ +sub.f16x2 r1855, r1142, r1464; +} +{ +sub.f16x2 r1858, r1154, r1476; +} +{ +add.f16x2 r1861, r1190, r1791; +} +{ +add.f16x2 r1864, r1334, r1797; +} +{ +sub.f16x2 r1867, r1190, r1791; +} +{ +sub.f16x2 r1870, r1334, r1797; +} +{ +add.f16x2 r1873, r1262, r1807; +} +{ +add.f16x2 r1876, r1406, r1813; +} +{ +sub.f16x2 r1879, r1262, r1807; +} +{ +sub.f16x2 r1882, r1406, r1813; +} +{ +add.f16x2 r1885, r1298, r1823; +} +{ +add.f16x2 r1888, r1442, r1829; +} +{ +sub.f16x2 r1891, r1298, r1823; +} +{ +sub.f16x2 r1894, r1442, r1829; +} +{ +add.f16x2 r1897, r1226, r1839; +} +{ +add.f16x2 r1900, r1370, r1845; +} +{ +sub.f16x2 r1903, r1226, r1839; +} +{ +sub.f16x2 r1906, r1370, r1845; +} +mul.wide.u32 rd5, r4160, -858993459; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r4166, rd6; +cvt.rn.f32.u32 f310, r4166; +mul.f32 f311, f310, 0f3BCDE32E; +cos.approx.f32 f143, f311; +sin.approx.f32 f312, f311; +neg.f32 f144, f312; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1909, {low, high}; +} +mul.lo.s32 r4167, r4166, 10; +sub.s32 r4168, r4160, r4167; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1912, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1914, {high, high}; +} +{ +mul.f16x2 r1916, r1864, r1914; +} +{ +fma.rn.f16x2 r1919, r1861, r1912, r1916; +} +{ +mul.f16x2 r1923, r1861, r1914; +} +{ +neg.f16x2 r1926, r1923; +} +{ +fma.rn.f16x2 r1928, r1864, r1912, r1926; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1909; +mov.b32 r1932, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1934, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1936, {low, high}; +} +{ +mul.f16x2 r1937, r1934, r1936; +} +{ +mul.f16x2 r1940, r1909, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1943, {high, low}; +} +{ +fma.rn.f16x2 r1945, r1937, r1943, r1940; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1949, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1951, {high, high}; +} +{ +mul.f16x2 r1953, r1876, r1951; +} +{ +fma.rn.f16x2 r1956, r1873, r1949, r1953; +} +{ +mul.f16x2 r1960, r1873, r1951; +} +{ +neg.f16x2 r1963, r1960; +} +{ +fma.rn.f16x2 r1965, r1876, r1949, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1969, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1971, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1973, {low, high}; +} +{ +mul.f16x2 r1974, r1971, r1973; +} +{ +mul.f16x2 r1977, r1945, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1980, {high, low}; +} +{ +fma.rn.f16x2 r1982, r1974, r1980, r1977; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1986, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1988, {high, high}; +} +{ +mul.f16x2 r1990, r1888, r1988; +} +{ +fma.rn.f16x2 r1993, r1885, r1986, r1990; +} +{ +mul.f16x2 r1997, r1885, r1988; +} +{ +neg.f16x2 r2000, r1997; +} +{ +fma.rn.f16x2 r2002, r1888, r1986, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2006, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2008, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2010, {low, high}; 
+} +{ +mul.f16x2 r2011, r2008, r2010; +} +{ +mul.f16x2 r2014, r1982, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r2017, {high, low}; +} +{ +fma.rn.f16x2 r2019, r2011, r2017, r2014; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2023, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2025, {high, high}; +} +{ +mul.f16x2 r2027, r1900, r2025; +} +{ +fma.rn.f16x2 r2030, r1897, r2023, r2027; +} +{ +mul.f16x2 r2034, r1897, r2025; +} +{ +neg.f16x2 r2037, r2034; +} +{ +fma.rn.f16x2 r2039, r1900, r2023, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2043, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2045, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r2045, r2047; +} +{ +mul.f16x2 r2051, r2019, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2054, {high, low}; +} +{ +fma.rn.f16x2 r2056, r2048, r2054, r2051; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2060, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2062, {high, high}; +} +{ +mul.f16x2 r2064, r1858, r2062; +} +{ +fma.rn.f16x2 r2067, r1855, r2060, r2064; +} +{ +mul.f16x2 r2071, r1855, r2062; +} +{ +neg.f16x2 r2074, r2071; +} +{ +fma.rn.f16x2 r2076, r1858, r2060, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2080, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2082, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r2082, r2084; +} +{ +mul.f16x2 r2088, r2056, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2091, {high, low}; +} +{ +fma.rn.f16x2 r2093, r2085, r2091, r2088; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2093; +mov.b32 r2097, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2099, {high, high}; +} +{ +mul.f16x2 r2101, r1870, r2099; +} +{ +fma.rn.f16x2 r2104, r1867, r2097, r2101; +} +{ +mul.f16x2 r2108, r1867, r2099; +} +{ +neg.f16x2 r2111, r2108; +} +{ +fma.rn.f16x2 r2113, r1870, r2097, r2111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2117, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2119, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2121, {low, high}; +} +{ +mul.f16x2 r2122, r2119, r2121; +} +{ +mul.f16x2 r2125, r2093, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2128, {high, low}; +} +{ +fma.rn.f16x2 r2130, r2122, r2128, r2125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2134, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2136, {high, high}; +} +{ +mul.f16x2 r2138, r1882, r2136; +} +{ +fma.rn.f16x2 r2141, r1879, r2134, r2138; +} +{ +mul.f16x2 r2145, r1879, r2136; +} +{ +neg.f16x2 r2148, r2145; +} +{ +fma.rn.f16x2 r2150, r1882, r2134, r2148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2154, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2156, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2158, {low, high}; +} +{ +mul.f16x2 r2159, r2156, r2158; +} +{ +mul.f16x2 r2162, r2130, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2165, {high, low}; +} +{ +fma.rn.f16x2 r2167, r2159, r2165, r2162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2171, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2173, {high, high}; +} +{ +mul.f16x2 r2175, r1894, r2173; +} +{ +fma.rn.f16x2 r2178, r1891, r2171, r2175; +} +{ +mul.f16x2 r2182, r1891, r2173; +} +{ 
+neg.f16x2 r2185, r2182; +} +{ +fma.rn.f16x2 r2187, r1894, r2171, r2185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2191, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2193, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2195, {low, high}; +} +{ +mul.f16x2 r2196, r2193, r2195; +} +{ +mul.f16x2 r2199, r2167, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2202, {high, low}; +} +{ +fma.rn.f16x2 r2204, r2196, r2202, r2199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2208, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2210, {high, high}; +} +{ +mul.f16x2 r2212, r1906, r2210; +} +{ +fma.rn.f16x2 r2215, r1903, r2208, r2212; +} +{ +mul.f16x2 r2219, r1903, r2210; +} +{ +neg.f16x2 r2222, r2219; +} +{ +fma.rn.f16x2 r2224, r1906, r2208, r2222; +} +shl.b32 r4169, r4168, 3; +add.s32 r4170, r4163, r4169; +barrier.sync 0; +mad.lo.s32 r4171, r4166, 800, r4170; +st.shared.u32 [r4171], r1849; +st.shared.u32 [r4171+4], r1852; +st.shared.u32 [r4171+80], r1919; +st.shared.u32 [r4171+84], r1928; +st.shared.u32 [r4171+160], r1956; +st.shared.u32 [r4171+164], r1965; +st.shared.u32 [r4171+240], r1993; +st.shared.u32 [r4171+244], r2002; +st.shared.u32 [r4171+320], r2030; +st.shared.u32 [r4171+324], r2039; +st.shared.u32 [r4171+400], r2067; +st.shared.u32 [r4171+404], r2076; +st.shared.u32 [r4171+480], r2104; +st.shared.u32 [r4171+484], r2113; +st.shared.u32 [r4171+560], r2141; +st.shared.u32 [r4171+564], r2150; +st.shared.u32 [r4171+640], r2178; +st.shared.u32 [r4171+644], r2187; +st.shared.u32 [r4171+720], r2215; +st.shared.u32 [r4171+724], r2224; +barrier.sync 0; +ld.shared.u32 r2259, [r4165]; +ld.shared.u32 r2271, [r4165+4]; +ld.shared.u32 r2581, [r4165+8000]; +ld.shared.u32 r2593, [r4165+8004]; +ld.shared.u32 r2256, [r4165+16000]; +ld.shared.u32 r2268, [r4165+16004]; +ld.shared.u32 r2578, 
[r4165+24000]; +ld.shared.u32 r2590, [r4165+24004]; +ld.shared.u32 r2262, [r4165+32000]; +ld.shared.u32 r2274, [r4165+32004]; +ld.shared.u32 r2584, [r4165+40000]; +ld.shared.u32 r2596, [r4165+40004]; +ld.shared.u32 r2263, [r4165+48000]; +ld.shared.u32 r2275, [r4165+48004]; +ld.shared.u32 r2585, [r4165+56000]; +ld.shared.u32 r2597, [r4165+56004]; +ld.shared.u32 r2257, [r4165+64000]; +ld.shared.u32 r2269, [r4165+64004]; +ld.shared.u32 r2579, [r4165+72000]; +ld.shared.u32 r2591, [r4165+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r2250, {low, high}; +} +{ +neg.f16x2 r2251, r2250; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2254, {low, high}; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2262, r2263; +} +{ +add.f16x2 r2264, r2258, r2261; +} +{ +add.f16x2 r2267, r2268, r2269; +} +{ +add.f16x2 r2270, r2271, r2267; +} +{ +add.f16x2 r2273, r2274, r2275; +} +{ +add.f16x2 r2276, r2270, r2273; +} +{ +add.f16x2 r2279, r2256, r2257; +} +{ +mul.f16x2 r2282, r2279, r2245; +} +{ +add.f16x2 r2285, r2259, r2282; +} +{ +add.f16x2 r2288, r2262, r2263; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r2268, r2269; +} +{ +mul.f16x2 r2300, r2297, r2247; +} +{ +sub.f16x2 r2303, r2274, r2275; +} +{ +mul.f16x2 r2306, r2303, r2251; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +{ 
+add.f16x2 r2315, r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +add.f16x2 r2324, r2262, r2263; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r2268, r2269; +} +{ +mul.f16x2 r2336, r2333, r2247; +} +{ +sub.f16x2 r2339, r2274, r2275; +} +{ +mul.f16x2 r2342, r2339, r2251; +} +{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +{ +add.f16x2 r2351, r2256, r2257; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r2259, r2354; +} +{ +add.f16x2 r2360, r2262, r2263; +} +{ +mul.f16x2 r2363, r2360, r2253; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r2268, r2269; +} +{ +mul.f16x2 r2372, r2369, r2251; +} +{ +sub.f16x2 r2375, r2274, r2275; +} +{ +mul.f16x2 r2378, r2375, r2254; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +{ +add.f16x2 r2387, r2256, r2257; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r2259, r2390; +} +{ +add.f16x2 r2396, r2262, r2263; +} +{ +mul.f16x2 r2399, r2396, r2253; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r2268, r2269; +} +{ +mul.f16x2 r2408, r2405, r2251; +} +{ +sub.f16x2 r2411, r2274, r2275; +} +{ +mul.f16x2 r2414, r2411, r2254; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +{ +add.f16x2 r2423, r2268, r2269; +} +{ +mul.f16x2 r2426, r2423, r2245; +} +{ +add.f16x2 r2429, r2271, r2426; +} +{ +add.f16x2 r2432, r2274, r2275; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 r2441, r2256, r2257; +} +{ +mul.f16x2 r2444, r2441, r2247; +} +{ +sub.f16x2 r2447, r2262, r2263; +} +{ +mul.f16x2 r2450, r2447, r2251; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +{ +add.f16x2 r2459, r2268, r2269; +} +{ +mul.f16x2 r2462, r2459, r2245; +} +{ +add.f16x2 r2465, r2271, r2462; +} +{ +add.f16x2 r2468, r2274, r2275; +} +{ +mul.f16x2 r2471, 
r2468, r2249; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r2256, r2257; +} +{ +mul.f16x2 r2480, r2477, r2247; +} +{ +sub.f16x2 r2483, r2262, r2263; +} +{ +mul.f16x2 r2486, r2483, r2251; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +{ +add.f16x2 r2495, r2268, r2269; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, r2271, r2498; +} +{ +add.f16x2 r2504, r2274, r2275; +} +{ +mul.f16x2 r2507, r2504, r2253; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r2256, r2257; +} +{ +mul.f16x2 r2516, r2513, r2251; +} +{ +sub.f16x2 r2519, r2262, r2263; +} +{ +mul.f16x2 r2522, r2519, r2254; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +{ +add.f16x2 r2531, r2268, r2269; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r2271, r2534; +} +{ +add.f16x2 r2540, r2274, r2275; +} +{ +mul.f16x2 r2543, r2540, r2253; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r2256, r2257; +} +{ +mul.f16x2 r2552, r2549, r2251; +} +{ +sub.f16x2 r2555, r2262, r2263; +} +{ +mul.f16x2 r2558, r2555, r2254; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2568, {low, high}; +} +{ +neg.f16x2 r2569, r2568; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2576, {low, high}; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, 
r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2590, r2591; +} +{ +add.f16x2 r2592, r2593, r2589; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2592, r2595; +} +{ +add.f16x2 r2601, r2578, r2579; +} +{ +mul.f16x2 r2604, r2601, r2567; +} +{ +add.f16x2 r2607, r2581, r2604; +} +{ +add.f16x2 r2610, r2584, r2585; +} +{ +mul.f16x2 r2613, r2610, r2571; +} +{ +add.f16x2 r2616, r2607, r2613; +} +{ +sub.f16x2 r2619, r2590, r2591; +} +{ +mul.f16x2 r2622, r2619, r2569; +} +{ +sub.f16x2 r2625, r2596, r2597; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r2622, r2628; +} +{ +sub.f16x2 r2634, r2616, r2631; +} +{ +add.f16x2 r2637, r2578, r2579; +} +{ +mul.f16x2 r2640, r2637, r2567; +} +{ +add.f16x2 r2643, r2581, r2640; +} +{ +add.f16x2 r2646, r2584, r2585; +} +{ +mul.f16x2 r2649, r2646, r2571; +} +{ +add.f16x2 r2652, r2643, r2649; +} +{ +sub.f16x2 r2655, r2590, r2591; +} +{ +mul.f16x2 r2658, r2655, r2569; +} +{ +sub.f16x2 r2661, r2596, r2597; +} +{ +mul.f16x2 r2664, r2661, r2573; +} +{ +add.f16x2 r2667, r2658, r2664; +} +{ +add.f16x2 r2670, r2652, r2667; +} +{ +add.f16x2 r2673, r2578, r2579; +} +{ +mul.f16x2 r2676, r2673, r2571; +} +{ +add.f16x2 r2679, r2581, r2676; +} +{ +add.f16x2 r2682, r2584, r2585; +} +{ +mul.f16x2 r2685, r2682, r2575; +} +{ +add.f16x2 r2688, r2679, r2685; +} +{ +sub.f16x2 r2691, r2590, r2591; +} +{ +mul.f16x2 r2694, r2691, r2573; +} +{ +sub.f16x2 r2697, r2596, r2597; +} +{ +mul.f16x2 r2700, r2697, r2576; +} +{ +add.f16x2 r2703, r2694, r2700; +} +{ +sub.f16x2 r2706, r2688, r2703; +} +{ +add.f16x2 r2709, r2578, r2579; +} +{ +mul.f16x2 r2712, r2709, r2571; +} +{ +add.f16x2 r2715, r2581, r2712; +} +{ +add.f16x2 r2718, r2584, r2585; +} +{ +mul.f16x2 r2721, r2718, r2575; +} +{ +add.f16x2 r2724, r2715, r2721; +} +{ +sub.f16x2 r2727, r2590, r2591; +} +{ +mul.f16x2 r2730, r2727, r2573; +} +{ +sub.f16x2 r2733, r2596, r2597; +} +{ +mul.f16x2 r2736, r2733, r2576; +} +{ +add.f16x2 
r2739, r2730, r2736; +} +{ +add.f16x2 r2742, r2724, r2739; +} +{ +add.f16x2 r2745, r2590, r2591; +} +{ +mul.f16x2 r2748, r2745, r2567; +} +{ +add.f16x2 r2751, r2593, r2748; +} +{ +add.f16x2 r2754, r2596, r2597; +} +{ +mul.f16x2 r2757, r2754, r2571; +} +{ +add.f16x2 r2760, r2751, r2757; +} +{ +sub.f16x2 r2763, r2578, r2579; +} +{ +mul.f16x2 r2766, r2763, r2569; +} +{ +sub.f16x2 r2769, r2584, r2585; +} +{ +mul.f16x2 r2772, r2769, r2573; +} +{ +add.f16x2 r2775, r2766, r2772; +} +{ +add.f16x2 r2778, r2760, r2775; +} +{ +add.f16x2 r2781, r2590, r2591; +} +{ +mul.f16x2 r2784, r2781, r2567; +} +{ +add.f16x2 r2787, r2593, r2784; +} +{ +add.f16x2 r2790, r2596, r2597; +} +{ +mul.f16x2 r2793, r2790, r2571; +} +{ +add.f16x2 r2796, r2787, r2793; +} +{ +sub.f16x2 r2799, r2578, r2579; +} +{ +mul.f16x2 r2802, r2799, r2569; +} +{ +sub.f16x2 r2805, r2584, r2585; +} +{ +mul.f16x2 r2808, r2805, r2573; +} +{ +add.f16x2 r2811, r2802, r2808; +} +{ +sub.f16x2 r2814, r2796, r2811; +} +{ +add.f16x2 r2817, r2590, r2591; +} +{ +mul.f16x2 r2820, r2817, r2571; +} +{ +add.f16x2 r2823, r2593, r2820; +} +{ +add.f16x2 r2826, r2596, r2597; +} +{ +mul.f16x2 r2829, r2826, r2575; +} +{ +add.f16x2 r2832, r2823, r2829; +} +{ +sub.f16x2 r2835, r2578, r2579; +} +{ +mul.f16x2 r2838, r2835, r2573; +} +{ +sub.f16x2 r2841, r2584, r2585; +} +{ +mul.f16x2 r2844, r2841, r2576; +} +{ +add.f16x2 r2847, r2838, r2844; +} +{ +add.f16x2 r2850, r2832, r2847; +} +{ +add.f16x2 r2853, r2590, r2591; +} +{ +mul.f16x2 r2856, r2853, r2571; +} +{ +add.f16x2 r2859, r2593, r2856; +} +{ +add.f16x2 r2862, r2596, r2597; +} +{ +mul.f16x2 r2865, r2862, r2575; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 r2871, r2578, r2579; +} +{ +mul.f16x2 r2874, r2871, r2573; +} +{ +sub.f16x2 r2877, r2584, r2585; +} +{ +mul.f16x2 r2880, r2877, r2576; +} +{ +add.f16x2 r2883, r2874, r2880; +} +{ +sub.f16x2 r2886, r2868, r2883; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r2889, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2892, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r2893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2894, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2896, {low, high}; +} +{ +mul.f16x2 r2907, r2634, r2889; +} +{ +mul.f16x2 r2910, r2778, r2890; +} +{ +sub.f16x2 r2913, r2907, r2910; +} +{ +mul.f16x2 r2916, r2634, r2890; +} +{ +fma.rn.f16x2 r2919, r2778, r2889, r2916; +} +{ +mul.f16x2 r2923, r2706, r2891; +} +{ +mul.f16x2 r2926, r2850, r2892; +} +{ +sub.f16x2 r2929, r2923, r2926; +} +{ +mul.f16x2 r2932, r2706, r2892; +} +{ +fma.rn.f16x2 r2935, r2850, r2891, r2932; +} +{ +mul.f16x2 r2939, r2742, r2893; +} +{ +mul.f16x2 r2942, r2886, r2894; +} +{ +sub.f16x2 r2945, r2939, r2942; +} +{ +mul.f16x2 r2948, r2742, r2894; +} +{ +fma.rn.f16x2 r2951, r2886, r2893, r2948; +} +{ +mul.f16x2 r2955, r2670, r2895; +} +{ +mul.f16x2 r2958, r2814, r2896; +} +{ +sub.f16x2 r2961, r2955, r2958; +} +{ +mul.f16x2 r2964, r2670, r2896; +} +{ +fma.rn.f16x2 r2967, r2814, r2895, r2964; +} +{ +add.f16x2 r2971, r2264, r2586; +} +{ +add.f16x2 r2974, r2276, r2598; +} +{ +sub.f16x2 r2977, r2264, r2586; +} +{ +sub.f16x2 r2980, r2276, r2598; +} +{ +add.f16x2 r2983, r2312, r2913; +} +{ +add.f16x2 r2986, r2456, r2919; +} +{ +sub.f16x2 r2989, r2312, r2913; +} +{ +sub.f16x2 r2992, r2456, r2919; +} +{ +add.f16x2 r2995, r2384, r2929; +} +{ +add.f16x2 r2998, r2528, r2935; +} +{ +sub.f16x2 r3001, r2384, r2929; 
+} +{ +sub.f16x2 r3004, r2528, r2935; +} +{ +add.f16x2 r3007, r2420, r2945; +} +{ +add.f16x2 r3010, r2564, r2951; +} +{ +sub.f16x2 r3013, r2420, r2945; +} +{ +sub.f16x2 r3016, r2564, r2951; +} +{ +add.f16x2 r3019, r2348, r2961; +} +{ +add.f16x2 r3022, r2492, r2967; +} +{ +sub.f16x2 r3025, r2348, r2961; +} +{ +sub.f16x2 r3028, r2492, r2967; +} +mul.wide.u32 rd7, r4160, 1374389535; +shr.u64 rd8, rd7, 37; +cvt.u32.u64 r4172, rd8; +cvt.rn.f32.u32 f313, r4172; +mul.f32 f314, f313, 0f3D80ADFD; +cos.approx.f32 f225, f314; +sin.approx.f32 f315, f314; +neg.f32 f226, f315; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r3031, {low, high}; +} +mul.lo.s32 r4173, r4172, 100; +sub.s32 r4174, r4160, r4173; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3036, {high, high}; +} +{ +mul.f16x2 r3038, r2986, r3036; +} +{ +fma.rn.f16x2 r3041, r2983, r3034, r3038; +} +{ +mul.f16x2 r3045, r2983, r3036; +} +{ +neg.f16x2 r3048, r3045; +} +{ +fma.rn.f16x2 r3050, r2986, r3034, r3048; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3054, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3056, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r3056, r3058; +} +{ +mul.f16x2 r3062, r3031, r3054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3065, {high, low}; +} +{ +fma.rn.f16x2 r3067, r3059, r3065, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3071, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3073, {high, high}; +} +{ +mul.f16x2 r3075, r2998, r3073; +} +{ +fma.rn.f16x2 r3078, r2995, r3071, r3075; +} +{ +mul.f16x2 r3082, r2995, r3073; +} +{ +neg.f16x2 r3085, r3082; +} +{ +fma.rn.f16x2 r3087, r2998, r3071, r3085; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3091, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3093, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3095, {low, high}; +} +{ +mul.f16x2 r3096, r3093, r3095; +} +{ +mul.f16x2 r3099, r3067, r3091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3102, {high, low}; +} +{ +fma.rn.f16x2 r3104, r3096, r3102, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3108, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3110, {high, high}; +} +{ +mul.f16x2 r3112, r3010, r3110; +} +{ +fma.rn.f16x2 r3115, r3007, r3108, r3112; +} +{ +mul.f16x2 r3119, r3007, r3110; +} +{ +neg.f16x2 r3122, r3119; +} +{ +fma.rn.f16x2 r3124, r3010, r3108, r3122; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3130, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3132, {low, high}; +} +{ +mul.f16x2 r3133, r3130, r3132; +} +{ +mul.f16x2 r3136, r3104, r3128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3139, {high, low}; +} +{ +fma.rn.f16x2 r3141, r3133, r3139, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3145, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3147, {high, high}; +} +{ +mul.f16x2 r3149, r3022, r3147; +} +{ +fma.rn.f16x2 r3152, r3019, r3145, r3149; +} +{ +mul.f16x2 r3156, r3019, r3147; +} +{ +neg.f16x2 r3159, r3156; +} +{ +fma.rn.f16x2 r3161, r3022, r3145, r3159; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3165, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3167, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3169, 
{low, high}; +} +{ +mul.f16x2 r3170, r3167, r3169; +} +{ +mul.f16x2 r3173, r3141, r3165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3176, {high, low}; +} +{ +fma.rn.f16x2 r3178, r3170, r3176, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3178; +mov.b32 r3182, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3178; +mov.b32 r3184, {high, high}; +} +{ +mul.f16x2 r3186, r2980, r3184; +} +{ +fma.rn.f16x2 r3189, r2977, r3182, r3186; +} +{ +mul.f16x2 r3193, r2977, r3184; +} +{ +neg.f16x2 r3196, r3193; +} +{ +fma.rn.f16x2 r3198, r2980, r3182, r3196; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3202, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3204, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3206, {low, high}; +} +{ +mul.f16x2 r3207, r3204, r3206; +} +{ +mul.f16x2 r3210, r3178, r3202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3178; +mov.b32 r3213, {high, low}; +} +{ +fma.rn.f16x2 r3215, r3207, r3213, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3219, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3221, {high, high}; +} +{ +mul.f16x2 r3223, r2992, r3221; +} +{ +fma.rn.f16x2 r3226, r2989, r3219, r3223; +} +{ +mul.f16x2 r3230, r2989, r3221; +} +{ +neg.f16x2 r3233, r3230; +} +{ +fma.rn.f16x2 r3235, r2992, r3219, r3233; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3239, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3241, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3243, {low, high}; +} +{ +mul.f16x2 r3244, r3241, r3243; +} +{ +mul.f16x2 r3247, r3215, r3239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3250, {high, low}; +} +{ +fma.rn.f16x2 r3252, r3244, r3250, r3247; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3252; +mov.b32 r3256, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3252; +mov.b32 r3258, {high, high}; +} +{ +mul.f16x2 r3260, r3004, r3258; +} +{ +fma.rn.f16x2 r3263, r3001, r3256, r3260; +} +{ +mul.f16x2 r3267, r3001, r3258; +} +{ +neg.f16x2 r3270, r3267; +} +{ +fma.rn.f16x2 r3272, r3004, r3256, r3270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3278, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3280, {low, high}; +} +{ +mul.f16x2 r3281, r3278, r3280; +} +{ +mul.f16x2 r3284, r3252, r3276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3252; +mov.b32 r3287, {high, low}; +} +{ +fma.rn.f16x2 r3289, r3281, r3287, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3293, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3295, {high, high}; +} +{ +mul.f16x2 r3297, r3016, r3295; +} +{ +fma.rn.f16x2 r3300, r3013, r3293, r3297; +} +{ +mul.f16x2 r3304, r3013, r3295; +} +{ +neg.f16x2 r3307, r3304; +} +{ +fma.rn.f16x2 r3309, r3016, r3293, r3307; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3313, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3315, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r3315, r3317; +} +{ +mul.f16x2 r3321, r3289, r3313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3324, {high, low}; +} +{ +fma.rn.f16x2 r3326, r3318, r3324, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3326; +mov.b32 r3330, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3326; +mov.b32 r3332, {high, high}; +} +{ +mul.f16x2 r3334, r3028, r3332; +} +{ +fma.rn.f16x2 r3337, r3025, r3330, r3334; +} +{ +mul.f16x2 r3341, r3025, r3332; +} 
+{ +neg.f16x2 r3344, r3341; +} +{ +fma.rn.f16x2 r3346, r3028, r3330, r3344; +} +shl.b32 r4175, r4174, 3; +add.s32 r4176, r4163, r4175; +barrier.sync 0; +mad.lo.s32 r4177, r4172, 8000, r4176; +st.shared.u32 [r4177], r2971; +st.shared.u32 [r4177+4], r2974; +st.shared.u32 [r4177+800], r3041; +st.shared.u32 [r4177+804], r3050; +st.shared.u32 [r4177+1600], r3078; +st.shared.u32 [r4177+1604], r3087; +st.shared.u32 [r4177+2400], r3115; +st.shared.u32 [r4177+2404], r3124; +st.shared.u32 [r4177+3200], r3152; +st.shared.u32 [r4177+3204], r3161; +st.shared.u32 [r4177+4000], r3189; +st.shared.u32 [r4177+4004], r3198; +st.shared.u32 [r4177+4800], r3226; +st.shared.u32 [r4177+4804], r3235; +st.shared.u32 [r4177+5600], r3263; +st.shared.u32 [r4177+5604], r3272; +st.shared.u32 [r4177+6400], r3300; +st.shared.u32 [r4177+6404], r3309; +st.shared.u32 [r4177+7200], r3337; +st.shared.u32 [r4177+7204], r3346; +barrier.sync 0; +ld.shared.u32 r3381, [r4165]; +ld.shared.u32 r3393, [r4165+4]; +ld.shared.u32 r3703, [r4165+8000]; +ld.shared.u32 r3715, [r4165+8004]; +ld.shared.u32 r3378, [r4165+16000]; +ld.shared.u32 r3390, [r4165+16004]; +ld.shared.u32 r3700, [r4165+24000]; +ld.shared.u32 r3712, [r4165+24004]; +ld.shared.u32 r3384, [r4165+32000]; +ld.shared.u32 r3396, [r4165+32004]; +ld.shared.u32 r3706, [r4165+40000]; +ld.shared.u32 r3718, [r4165+40004]; +ld.shared.u32 r3385, [r4165+48000]; +ld.shared.u32 r3397, [r4165+48004]; +ld.shared.u32 r3707, [r4165+56000]; +ld.shared.u32 r3719, [r4165+56004]; +ld.shared.u32 r3379, [r4165+64000]; +ld.shared.u32 r3391, [r4165+64004]; +ld.shared.u32 r3701, [r4165+72000]; +ld.shared.u32 r3713, [r4165+72004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3367, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3368, {low, high}; +} +{ +neg.f16x2 r3369, r3368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3371, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r3372, {low, high}; +} +{ +neg.f16x2 r3373, r3372; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3375, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3376, {low, high}; +} +{ +add.f16x2 r3377, r3378, r3379; +} +{ +add.f16x2 r3380, r3381, r3377; +} +{ +add.f16x2 r3383, r3384, r3385; +} +{ +add.f16x2 r3386, r3380, r3383; +} +{ +add.f16x2 r3389, r3390, r3391; +} +{ +add.f16x2 r3392, r3393, r3389; +} +{ +add.f16x2 r3395, r3396, r3397; +} +{ +add.f16x2 r3398, r3392, r3395; +} +{ +add.f16x2 r3401, r3378, r3379; +} +{ +mul.f16x2 r3404, r3401, r3367; +} +{ +add.f16x2 r3407, r3381, r3404; +} +{ +add.f16x2 r3410, r3384, r3385; +} +{ +mul.f16x2 r3413, r3410, r3371; +} +{ +add.f16x2 r3416, r3407, r3413; +} +{ +sub.f16x2 r3419, r3390, r3391; +} +{ +mul.f16x2 r3422, r3419, r3369; +} +{ +sub.f16x2 r3425, r3396, r3397; +} +{ +mul.f16x2 r3428, r3425, r3373; +} +{ +add.f16x2 r3431, r3422, r3428; +} +{ +sub.f16x2 r3434, r3416, r3431; +} +{ +add.f16x2 r3437, r3378, r3379; +} +{ +mul.f16x2 r3440, r3437, r3367; +} +{ +add.f16x2 r3443, r3381, r3440; +} +{ +add.f16x2 r3446, r3384, r3385; +} +{ +mul.f16x2 r3449, r3446, r3371; +} +{ +add.f16x2 r3452, r3443, r3449; +} +{ +sub.f16x2 r3455, r3390, r3391; +} +{ +mul.f16x2 r3458, r3455, r3369; +} +{ +sub.f16x2 r3461, r3396, r3397; +} +{ +mul.f16x2 r3464, r3461, r3373; +} +{ +add.f16x2 r3467, r3458, r3464; +} +{ +add.f16x2 r3470, r3452, r3467; +} +{ +add.f16x2 r3473, r3378, r3379; +} +{ +mul.f16x2 r3476, r3473, r3371; +} +{ +add.f16x2 r3479, r3381, r3476; +} +{ +add.f16x2 r3482, r3384, r3385; +} +{ +mul.f16x2 r3485, r3482, r3375; +} +{ +add.f16x2 r3488, r3479, r3485; +} +{ +sub.f16x2 r3491, r3390, r3391; +} +{ +mul.f16x2 r3494, r3491, r3373; +} +{ +sub.f16x2 r3497, r3396, r3397; +} +{ +mul.f16x2 r3500, r3497, r3376; +} +{ +add.f16x2 r3503, 
r3494, r3500; +} +{ +sub.f16x2 r3506, r3488, r3503; +} +{ +add.f16x2 r3509, r3378, r3379; +} +{ +mul.f16x2 r3512, r3509, r3371; +} +{ +add.f16x2 r3515, r3381, r3512; +} +{ +add.f16x2 r3518, r3384, r3385; +} +{ +mul.f16x2 r3521, r3518, r3375; +} +{ +add.f16x2 r3524, r3515, r3521; +} +{ +sub.f16x2 r3527, r3390, r3391; +} +{ +mul.f16x2 r3530, r3527, r3373; +} +{ +sub.f16x2 r3533, r3396, r3397; +} +{ +mul.f16x2 r3536, r3533, r3376; +} +{ +add.f16x2 r3539, r3530, r3536; +} +{ +add.f16x2 r3542, r3524, r3539; +} +{ +add.f16x2 r3545, r3390, r3391; +} +{ +mul.f16x2 r3548, r3545, r3367; +} +{ +add.f16x2 r3551, r3393, r3548; +} +{ +add.f16x2 r3554, r3396, r3397; +} +{ +mul.f16x2 r3557, r3554, r3371; +} +{ +add.f16x2 r3560, r3551, r3557; +} +{ +sub.f16x2 r3563, r3378, r3379; +} +{ +mul.f16x2 r3566, r3563, r3369; +} +{ +sub.f16x2 r3569, r3384, r3385; +} +{ +mul.f16x2 r3572, r3569, r3373; +} +{ +add.f16x2 r3575, r3566, r3572; +} +{ +add.f16x2 r3578, r3560, r3575; +} +{ +add.f16x2 r3581, r3390, r3391; +} +{ +mul.f16x2 r3584, r3581, r3367; +} +{ +add.f16x2 r3587, r3393, r3584; +} +{ +add.f16x2 r3590, r3396, r3397; +} +{ +mul.f16x2 r3593, r3590, r3371; +} +{ +add.f16x2 r3596, r3587, r3593; +} +{ +sub.f16x2 r3599, r3378, r3379; +} +{ +mul.f16x2 r3602, r3599, r3369; +} +{ +sub.f16x2 r3605, r3384, r3385; +} +{ +mul.f16x2 r3608, r3605, r3373; +} +{ +add.f16x2 r3611, r3602, r3608; +} +{ +sub.f16x2 r3614, r3596, r3611; +} +{ +add.f16x2 r3617, r3390, r3391; +} +{ +mul.f16x2 r3620, r3617, r3371; +} +{ +add.f16x2 r3623, r3393, r3620; +} +{ +add.f16x2 r3626, r3396, r3397; +} +{ +mul.f16x2 r3629, r3626, r3375; +} +{ +add.f16x2 r3632, r3623, r3629; +} +{ +sub.f16x2 r3635, r3378, r3379; +} +{ +mul.f16x2 r3638, r3635, r3373; +} +{ +sub.f16x2 r3641, r3384, r3385; +} +{ +mul.f16x2 r3644, r3641, r3376; +} +{ +add.f16x2 r3647, r3638, r3644; +} +{ +add.f16x2 r3650, r3632, r3647; +} +{ +add.f16x2 r3653, r3390, r3391; +} +{ +mul.f16x2 r3656, r3653, r3371; +} +{ +add.f16x2 r3659, r3393, r3656; +} +{ 
+add.f16x2 r3662, r3396, r3397; +} +{ +mul.f16x2 r3665, r3662, r3375; +} +{ +add.f16x2 r3668, r3659, r3665; +} +{ +sub.f16x2 r3671, r3378, r3379; +} +{ +mul.f16x2 r3674, r3671, r3373; +} +{ +sub.f16x2 r3677, r3384, r3385; +} +{ +mul.f16x2 r3680, r3677, r3376; +} +{ +add.f16x2 r3683, r3674, r3680; +} +{ +sub.f16x2 r3686, r3668, r3683; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3689, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3690, {low, high}; +} +{ +neg.f16x2 r3691, r3690; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r3694, {low, high}; +} +{ +neg.f16x2 r3695, r3694; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3698, {low, high}; +} +{ +add.f16x2 r3699, r3700, r3701; +} +{ +add.f16x2 r3702, r3703, r3699; +} +{ +add.f16x2 r3705, r3706, r3707; +} +{ +add.f16x2 r3708, r3702, r3705; +} +{ +add.f16x2 r3711, r3712, r3713; +} +{ +add.f16x2 r3714, r3715, r3711; +} +{ +add.f16x2 r3717, r3718, r3719; +} +{ +add.f16x2 r3720, r3714, r3717; +} +{ +add.f16x2 r3723, r3700, r3701; +} +{ +mul.f16x2 r3726, r3723, r3689; +} +{ +add.f16x2 r3729, r3703, r3726; +} +{ +add.f16x2 r3732, r3706, r3707; +} +{ +mul.f16x2 r3735, r3732, r3693; +} +{ +add.f16x2 r3738, r3729, r3735; +} +{ +sub.f16x2 r3741, r3712, r3713; +} +{ +mul.f16x2 r3744, r3741, r3691; +} +{ +sub.f16x2 r3747, r3718, r3719; +} +{ +mul.f16x2 r3750, r3747, r3695; +} +{ +add.f16x2 r3753, r3744, r3750; +} +{ +sub.f16x2 r3756, r3738, r3753; +} +{ +add.f16x2 r3759, r3700, r3701; +} +{ +mul.f16x2 r3762, r3759, r3689; +} +{ +add.f16x2 r3765, r3703, r3762; +} +{ +add.f16x2 r3768, r3706, r3707; +} +{ +mul.f16x2 
r3771, r3768, r3693; +} +{ +add.f16x2 r3774, r3765, r3771; +} +{ +sub.f16x2 r3777, r3712, r3713; +} +{ +mul.f16x2 r3780, r3777, r3691; +} +{ +sub.f16x2 r3783, r3718, r3719; +} +{ +mul.f16x2 r3786, r3783, r3695; +} +{ +add.f16x2 r3789, r3780, r3786; +} +{ +add.f16x2 r3792, r3774, r3789; +} +{ +add.f16x2 r3795, r3700, r3701; +} +{ +mul.f16x2 r3798, r3795, r3693; +} +{ +add.f16x2 r3801, r3703, r3798; +} +{ +add.f16x2 r3804, r3706, r3707; +} +{ +mul.f16x2 r3807, r3804, r3697; +} +{ +add.f16x2 r3810, r3801, r3807; +} +{ +sub.f16x2 r3813, r3712, r3713; +} +{ +mul.f16x2 r3816, r3813, r3695; +} +{ +sub.f16x2 r3819, r3718, r3719; +} +{ +mul.f16x2 r3822, r3819, r3698; +} +{ +add.f16x2 r3825, r3816, r3822; +} +{ +sub.f16x2 r3828, r3810, r3825; +} +{ +add.f16x2 r3831, r3700, r3701; +} +{ +mul.f16x2 r3834, r3831, r3693; +} +{ +add.f16x2 r3837, r3703, r3834; +} +{ +add.f16x2 r3840, r3706, r3707; +} +{ +mul.f16x2 r3843, r3840, r3697; +} +{ +add.f16x2 r3846, r3837, r3843; +} +{ +sub.f16x2 r3849, r3712, r3713; +} +{ +mul.f16x2 r3852, r3849, r3695; +} +{ +sub.f16x2 r3855, r3718, r3719; +} +{ +mul.f16x2 r3858, r3855, r3698; +} +{ +add.f16x2 r3861, r3852, r3858; +} +{ +add.f16x2 r3864, r3846, r3861; +} +{ +add.f16x2 r3867, r3712, r3713; +} +{ +mul.f16x2 r3870, r3867, r3689; +} +{ +add.f16x2 r3873, r3715, r3870; +} +{ +add.f16x2 r3876, r3718, r3719; +} +{ +mul.f16x2 r3879, r3876, r3693; +} +{ +add.f16x2 r3882, r3873, r3879; +} +{ +sub.f16x2 r3885, r3700, r3701; +} +{ +mul.f16x2 r3888, r3885, r3691; +} +{ +sub.f16x2 r3891, r3706, r3707; +} +{ +mul.f16x2 r3894, r3891, r3695; +} +{ +add.f16x2 r3897, r3888, r3894; +} +{ +add.f16x2 r3900, r3882, r3897; +} +{ +add.f16x2 r3903, r3712, r3713; +} +{ +mul.f16x2 r3906, r3903, r3689; +} +{ +add.f16x2 r3909, r3715, r3906; +} +{ +add.f16x2 r3912, r3718, r3719; +} +{ +mul.f16x2 r3915, r3912, r3693; +} +{ +add.f16x2 r3918, r3909, r3915; +} +{ +sub.f16x2 r3921, r3700, r3701; +} +{ +mul.f16x2 r3924, r3921, r3691; +} +{ +sub.f16x2 r3927, r3706, r3707; +} 
+{ +mul.f16x2 r3930, r3927, r3695; +} +{ +add.f16x2 r3933, r3924, r3930; +} +{ +sub.f16x2 r3936, r3918, r3933; +} +{ +add.f16x2 r3939, r3712, r3713; +} +{ +mul.f16x2 r3942, r3939, r3693; +} +{ +add.f16x2 r3945, r3715, r3942; +} +{ +add.f16x2 r3948, r3718, r3719; +} +{ +mul.f16x2 r3951, r3948, r3697; +} +{ +add.f16x2 r3954, r3945, r3951; +} +{ +sub.f16x2 r3957, r3700, r3701; +} +{ +mul.f16x2 r3960, r3957, r3695; +} +{ +sub.f16x2 r3963, r3706, r3707; +} +{ +mul.f16x2 r3966, r3963, r3698; +} +{ +add.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3954, r3969; +} +{ +add.f16x2 r3975, r3712, r3713; +} +{ +mul.f16x2 r3978, r3975, r3693; +} +{ +add.f16x2 r3981, r3715, r3978; +} +{ +add.f16x2 r3984, r3718, r3719; +} +{ +mul.f16x2 r3987, r3984, r3697; +} +{ +add.f16x2 r3990, r3981, r3987; +} +{ +sub.f16x2 r3993, r3700, r3701; +} +{ +mul.f16x2 r3996, r3993, r3695; +} +{ +sub.f16x2 r3999, r3706, r3707; +} +{ +mul.f16x2 r4002, r3999, r3698; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +sub.f16x2 r4008, r3990, r4005; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r4011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r4013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r4015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4016, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r4017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4018, {low, high}; +} +{ +mul.f16x2 r4029, r3756, r4011; +} +{ +mul.f16x2 r4032, r3900, r4012; +} 
+{ +sub.f16x2 r4035, r4029, r4032; +} +{ +mul.f16x2 r4038, r3756, r4012; +} +{ +fma.rn.f16x2 r4041, r3900, r4011, r4038; +} +{ +mul.f16x2 r4045, r3828, r4013; +} +{ +mul.f16x2 r4048, r3972, r4014; +} +{ +sub.f16x2 r4051, r4045, r4048; +} +{ +mul.f16x2 r4054, r3828, r4014; +} +{ +fma.rn.f16x2 r4057, r3972, r4013, r4054; +} +{ +mul.f16x2 r4061, r3864, r4015; +} +{ +mul.f16x2 r4064, r4008, r4016; +} +{ +sub.f16x2 r4067, r4061, r4064; +} +{ +mul.f16x2 r4070, r3864, r4016; +} +{ +fma.rn.f16x2 r4073, r4008, r4015, r4070; +} +{ +mul.f16x2 r4077, r3792, r4017; +} +{ +mul.f16x2 r4080, r3936, r4018; +} +{ +sub.f16x2 r4083, r4077, r4080; +} +{ +mul.f16x2 r4086, r3792, r4018; +} +{ +fma.rn.f16x2 r4089, r3936, r4017, r4086; +} +{ +add.f16x2 %0, r3386, r3708; +} +{ +add.f16x2 %1, r3398, r3720; +} +{ +sub.f16x2 %10, r3386, r3708; +} +{ +sub.f16x2 %11, r3398, r3720; +} +{ +add.f16x2 %2, r3434, r4035; +} +{ +add.f16x2 %3, r3578, r4041; +} +{ +sub.f16x2 %12, r3434, r4035; +} +{ +sub.f16x2 %13, r3578, r4041; +} +{ +add.f16x2 %4, r3506, r4051; +} +{ +add.f16x2 %5, r3650, r4057; +} +{ +sub.f16x2 %14, r3506, r4051; +} +{ +sub.f16x2 %15, r3650, r4057; +} +{ +add.f16x2 %6, r3542, r4067; +} +{ +add.f16x2 %7, r3686, r4073; +} +{ +sub.f16x2 %16, r3542, r4067; +} +{ +sub.f16x2 %17, r3686, r4073; +} +{ +add.f16x2 %8, r3470, r4083; +} +{ +add.f16x2 %9, r3614, r4089; +} +{ +sub.f16x2 %18, r3470, r4083; +} +{ +sub.f16x2 %19, r3614, r4089; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), 
"=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1144, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<4175>; +.reg .b64 rd<8>; +mov.u32 r4153, %tid.y; +mov.u32 r4154, %20; +mad.lo.s32 r4155, r4153, 40000, r4154; +mov.u32 r4156, %tid.x; +mov.f32 f276, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1, {low, high}; +} +mov.f32 f270, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f284, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r5, {low, high}; +} +mov.f32 f266, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, 
r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, 
%29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; 
+cvt.rn.f16.f32 high, f270; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, %39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, 
%40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f272, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r645, {low, high}; +} +mov.f32 f286, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r647, {low, 
high}; +} +mov.f32 f282, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r648, {low, high}; +} +mov.f32 f280, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r652, {low, high}; +} +mov.f32 f243, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, 
r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r4156, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r4157, rd3; +mul.lo.s32 r4158, r4157, 1000; +sub.s32 r4159, r4156, r4158; +mad.lo.s32 r4160, r4157, 40000, r4155; +cvt.rn.f32.u32 f307, r4159; +mul.f32 f308, f307, 0f3A24B5BE; +cos.approx.f32 f61, f308; +sin.approx.f32 f309, f308; +neg.f32 f62, f309; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f244, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r851, {low, 
high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r4161, r4159, 40, r4160; +st.shared.v2.f32 [r4161], {r727, r797}; +st.shared.v2.f32 [r4161+8], {r834, r871}; +st.shared.v2.f32 [r4161+16], {r908, r945}; +st.shared.v2.f32 [r4161+24], {r982, r1019}; +st.shared.v2.f32 [r4161+32], {r1056, r1093}; +barrier.sync 0; +mad.lo.s32 r4162, r4159, -36, r4161; +ld.shared.u32 r1137, 
[r4162]; +ld.shared.u32 r1459, [r4162+4000]; +ld.shared.u32 r1134, [r4162+8000]; +ld.shared.u32 r1456, [r4162+12000]; +ld.shared.u32 r1140, [r4162+16000]; +ld.shared.u32 r1462, [r4162+20000]; +ld.shared.u32 r1141, [r4162+24000]; +ld.shared.u32 r1463, [r4162+28000]; +ld.shared.u32 r1135, [r4162+32000]; +ld.shared.u32 r1457, [r4162+36000]; +barrier.sync 0; +st.shared.v2.f32 [r4161], {r730, r806}; +st.shared.v2.f32 [r4161+8], {r843, r880}; +st.shared.v2.f32 [r4161+16], {r917, r954}; +st.shared.v2.f32 [r4161+24], {r991, r1028}; +st.shared.v2.f32 [r4161+32], {r1065, r1102}; +barrier.sync 0; +ld.shared.u32 r1149, [r4162]; +ld.shared.u32 r1471, [r4162+4000]; +ld.shared.u32 r1146, [r4162+8000]; +ld.shared.u32 r1468, [r4162+12000]; +ld.shared.u32 r1152, [r4162+16000]; +ld.shared.u32 r1474, [r4162+20000]; +ld.shared.u32 r1153, [r4162+24000]; +ld.shared.u32 r1475, [r4162+28000]; +ld.shared.u32 r1147, [r4162+32000]; +ld.shared.u32 r1469, [r4162+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ 
+add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1129; +} +{ +sub.f16x2 r1253, r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, 
r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 r1409, r1146, r1147; +} +{ +mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 
r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, r1515, r1445; +} +{ +add.f16x2 r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 
r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ +sub.f16x2 r1677, r1456, r1457; +} +{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} 
+{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ +fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ 
+fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 r1849, r1142, r1464; +} +{ +add.f16x2 r1852, r1154, r1476; +} +{ +sub.f16x2 r1855, r1142, r1464; +} +{ +sub.f16x2 r1858, r1154, r1476; +} +{ +add.f16x2 r1861, r1190, r1791; +} +{ +add.f16x2 r1864, r1334, r1797; +} +{ +sub.f16x2 r1867, r1190, r1791; +} +{ +sub.f16x2 r1870, r1334, r1797; +} +{ +add.f16x2 r1873, r1262, r1807; +} +{ +add.f16x2 r1876, r1406, r1813; +} +{ +sub.f16x2 r1879, r1262, r1807; +} +{ +sub.f16x2 r1882, r1406, r1813; +} +{ +add.f16x2 r1885, r1298, r1823; +} +{ +add.f16x2 r1888, r1442, r1829; +} +{ +sub.f16x2 r1891, r1298, r1823; +} +{ +sub.f16x2 r1894, r1442, r1829; +} +{ +add.f16x2 r1897, r1226, r1839; +} +{ +add.f16x2 r1900, r1370, r1845; +} +{ +sub.f16x2 r1903, r1226, r1839; +} +{ +sub.f16x2 r1906, r1370, r1845; +} +mul.wide.u32 rd4, r4159, -858993459; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r4163, rd5; +mul.lo.s32 r4164, r4163, 10; +sub.s32 r4165, r4159, r4164; +shl.b32 r4166, r4165, 2; +add.s32 r4167, r4160, r4166; +cvt.rn.f32.u32 f310, r4163; +mul.f32 f311, f310, 0f3BCDE32E; +cos.approx.f32 f143, f311; +sin.approx.f32 f312, f311; +neg.f32 f144, f312; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1912, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1914, {high, high}; +} +{ +mul.f16x2 r1916, r1864, r1914; +} +{ +fma.rn.f16x2 r1919, r1861, r1912, r1916; +} +{ +mul.f16x2 r1923, r1861, r1914; +} +{ +neg.f16x2 r1926, r1923; +} +{ +fma.rn.f16x2 r1928, r1864, r1912, r1926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1932, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1934, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1936, {low, high}; +} +{ +mul.f16x2 r1937, r1934, r1936; +} +{ +mul.f16x2 r1940, r1909, r1932; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1943, {high, low}; +} +{ +fma.rn.f16x2 r1945, r1937, r1943, r1940; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1949, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1951, {high, high}; +} +{ +mul.f16x2 r1953, r1876, r1951; +} +{ +fma.rn.f16x2 r1956, r1873, r1949, r1953; +} +{ +mul.f16x2 r1960, r1873, r1951; +} +{ +neg.f16x2 r1963, r1960; +} +{ +fma.rn.f16x2 r1965, r1876, r1949, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1969, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1971, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r1973, {low, high}; +} +{ +mul.f16x2 r1974, r1971, r1973; +} +{ +mul.f16x2 r1977, r1945, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1980, {high, low}; +} +{ +fma.rn.f16x2 r1982, r1974, r1980, r1977; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1986, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1988, {high, high}; +} +{ +mul.f16x2 r1990, r1888, r1988; +} +{ +fma.rn.f16x2 r1993, r1885, r1986, r1990; +} +{ +mul.f16x2 r1997, r1885, r1988; +} +{ +neg.f16x2 r2000, r1997; +} +{ +fma.rn.f16x2 r2002, r1888, r1986, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2006, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2008, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2010, {low, high}; +} +{ +mul.f16x2 r2011, r2008, r2010; +} +{ +mul.f16x2 r2014, r1982, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r2017, {high, low}; +} +{ +fma.rn.f16x2 r2019, r2011, r2017, r2014; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2023, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2019; +mov.b32 r2025, {high, high}; +} +{ +mul.f16x2 r2027, r1900, r2025; +} +{ +fma.rn.f16x2 r2030, r1897, r2023, r2027; +} +{ +mul.f16x2 r2034, r1897, r2025; +} +{ +neg.f16x2 r2037, r2034; +} +{ +fma.rn.f16x2 r2039, r1900, r2023, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2043, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2045, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r2045, r2047; +} +{ +mul.f16x2 r2051, r2019, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2054, {high, low}; +} +{ +fma.rn.f16x2 r2056, r2048, r2054, r2051; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2060, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2062, {high, high}; +} +{ +mul.f16x2 r2064, r1858, r2062; +} +{ +fma.rn.f16x2 r2067, r1855, r2060, r2064; +} +{ +mul.f16x2 r2071, r1855, r2062; +} +{ +neg.f16x2 r2074, r2071; +} +{ +fma.rn.f16x2 r2076, r1858, r2060, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2080, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2082, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r2082, r2084; +} +{ +mul.f16x2 r2088, r2056, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2091, {high, low}; +} +{ +fma.rn.f16x2 r2093, r2085, r2091, r2088; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2097, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2099, {high, high}; +} +{ +mul.f16x2 r2101, r1870, r2099; +} +{ +fma.rn.f16x2 r2104, r1867, r2097, r2101; +} +{ +mul.f16x2 r2108, r1867, r2099; +} +{ +neg.f16x2 r2111, r2108; +} +{ +fma.rn.f16x2 r2113, r1870, r2097, r2111; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2117, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2119, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2121, {low, high}; +} +{ +mul.f16x2 r2122, r2119, r2121; +} +{ +mul.f16x2 r2125, r2093, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2128, {high, low}; +} +{ +fma.rn.f16x2 r2130, r2122, r2128, r2125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2134, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2136, {high, high}; +} +{ +mul.f16x2 r2138, r1882, r2136; +} +{ +fma.rn.f16x2 r2141, r1879, r2134, r2138; +} +{ +mul.f16x2 r2145, r1879, r2136; +} +{ +neg.f16x2 r2148, r2145; +} +{ +fma.rn.f16x2 r2150, r1882, r2134, r2148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2154, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2156, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2158, {low, high}; +} +{ +mul.f16x2 r2159, r2156, r2158; +} +{ +mul.f16x2 r2162, r2130, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2165, {high, low}; +} +{ +fma.rn.f16x2 r2167, r2159, r2165, r2162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2171, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2173, {high, high}; +} +{ +mul.f16x2 r2175, r1894, r2173; +} +{ +fma.rn.f16x2 r2178, r1891, r2171, r2175; +} +{ +mul.f16x2 r2182, r1891, r2173; +} +{ +neg.f16x2 r2185, r2182; +} +{ +fma.rn.f16x2 r2187, r1894, r2171, r2185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2191, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2193, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r2195, 
{low, high}; +} +{ +mul.f16x2 r2196, r2193, r2195; +} +{ +mul.f16x2 r2199, r2167, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2202, {high, low}; +} +{ +fma.rn.f16x2 r2204, r2196, r2202, r2199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2208, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2210, {high, high}; +} +{ +mul.f16x2 r2212, r1906, r2210; +} +{ +fma.rn.f16x2 r2215, r1903, r2208, r2212; +} +{ +mul.f16x2 r2219, r1903, r2210; +} +{ +neg.f16x2 r2222, r2219; +} +{ +fma.rn.f16x2 r2224, r1906, r2208, r2222; +} +barrier.sync 0; +mad.lo.s32 r4168, r4163, 400, r4167; +st.shared.u32 [r4168], r1849; +st.shared.u32 [r4168+40], r1919; +st.shared.u32 [r4168+80], r1956; +st.shared.u32 [r4168+120], r1993; +st.shared.u32 [r4168+160], r2030; +st.shared.u32 [r4168+200], r2067; +st.shared.u32 [r4168+240], r2104; +st.shared.u32 [r4168+280], r2141; +st.shared.u32 [r4168+320], r2178; +st.shared.u32 [r4168+360], r2215; +barrier.sync 0; +ld.shared.u32 r2259, [r4162]; +ld.shared.u32 r2581, [r4162+4000]; +ld.shared.u32 r2256, [r4162+8000]; +ld.shared.u32 r2578, [r4162+12000]; +ld.shared.u32 r2262, [r4162+16000]; +ld.shared.u32 r2584, [r4162+20000]; +ld.shared.u32 r2263, [r4162+24000]; +ld.shared.u32 r2585, [r4162+28000]; +ld.shared.u32 r2257, [r4162+32000]; +ld.shared.u32 r2579, [r4162+36000]; +barrier.sync 0; +st.shared.u32 [r4168], r1852; +st.shared.u32 [r4168+40], r1928; +st.shared.u32 [r4168+80], r1965; +st.shared.u32 [r4168+120], r2002; +st.shared.u32 [r4168+160], r2039; +st.shared.u32 [r4168+200], r2076; +st.shared.u32 [r4168+240], r2113; +st.shared.u32 [r4168+280], r2150; +st.shared.u32 [r4168+320], r2187; +st.shared.u32 [r4168+360], r2224; +barrier.sync 0; +ld.shared.u32 r2271, [r4162]; +ld.shared.u32 r2593, [r4162+4000]; +ld.shared.u32 r2268, [r4162+8000]; +ld.shared.u32 r2590, [r4162+12000]; +ld.shared.u32 r2274, [r4162+16000]; +ld.shared.u32 r2596, [r4162+20000]; +ld.shared.u32 r2275, 
[r4162+24000]; +ld.shared.u32 r2597, [r4162+28000]; +ld.shared.u32 r2269, [r4162+32000]; +ld.shared.u32 r2591, [r4162+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r2250, {low, high}; +} +{ +neg.f16x2 r2251, r2250; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2254, {low, high}; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2262, r2263; +} +{ +add.f16x2 r2264, r2258, r2261; +} +{ +add.f16x2 r2267, r2268, r2269; +} +{ +add.f16x2 r2270, r2271, r2267; +} +{ +add.f16x2 r2273, r2274, r2275; +} +{ +add.f16x2 r2276, r2270, r2273; +} +{ +add.f16x2 r2279, r2256, r2257; +} +{ +mul.f16x2 r2282, r2279, r2245; +} +{ +add.f16x2 r2285, r2259, r2282; +} +{ +add.f16x2 r2288, r2262, r2263; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r2268, r2269; +} +{ +mul.f16x2 r2300, r2297, r2247; +} +{ +sub.f16x2 r2303, r2274, r2275; +} +{ +mul.f16x2 r2306, r2303, r2251; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +{ +add.f16x2 r2315, r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +add.f16x2 r2324, r2262, r2263; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r2268, r2269; +} +{ +mul.f16x2 r2336, r2333, r2247; +} +{ +sub.f16x2 r2339, r2274, r2275; +} +{ +mul.f16x2 r2342, r2339, r2251; +} 
+{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +{ +add.f16x2 r2351, r2256, r2257; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r2259, r2354; +} +{ +add.f16x2 r2360, r2262, r2263; +} +{ +mul.f16x2 r2363, r2360, r2253; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r2268, r2269; +} +{ +mul.f16x2 r2372, r2369, r2251; +} +{ +sub.f16x2 r2375, r2274, r2275; +} +{ +mul.f16x2 r2378, r2375, r2254; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +{ +add.f16x2 r2387, r2256, r2257; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r2259, r2390; +} +{ +add.f16x2 r2396, r2262, r2263; +} +{ +mul.f16x2 r2399, r2396, r2253; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r2268, r2269; +} +{ +mul.f16x2 r2408, r2405, r2251; +} +{ +sub.f16x2 r2411, r2274, r2275; +} +{ +mul.f16x2 r2414, r2411, r2254; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +{ +add.f16x2 r2423, r2268, r2269; +} +{ +mul.f16x2 r2426, r2423, r2245; +} +{ +add.f16x2 r2429, r2271, r2426; +} +{ +add.f16x2 r2432, r2274, r2275; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 r2441, r2256, r2257; +} +{ +mul.f16x2 r2444, r2441, r2247; +} +{ +sub.f16x2 r2447, r2262, r2263; +} +{ +mul.f16x2 r2450, r2447, r2251; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +{ +add.f16x2 r2459, r2268, r2269; +} +{ +mul.f16x2 r2462, r2459, r2245; +} +{ +add.f16x2 r2465, r2271, r2462; +} +{ +add.f16x2 r2468, r2274, r2275; +} +{ +mul.f16x2 r2471, r2468, r2249; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r2256, r2257; +} +{ +mul.f16x2 r2480, r2477, r2247; +} +{ +sub.f16x2 r2483, r2262, r2263; +} +{ +mul.f16x2 r2486, r2483, r2251; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +{ +add.f16x2 r2495, r2268, r2269; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, 
r2271, r2498; +} +{ +add.f16x2 r2504, r2274, r2275; +} +{ +mul.f16x2 r2507, r2504, r2253; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r2256, r2257; +} +{ +mul.f16x2 r2516, r2513, r2251; +} +{ +sub.f16x2 r2519, r2262, r2263; +} +{ +mul.f16x2 r2522, r2519, r2254; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +{ +add.f16x2 r2531, r2268, r2269; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r2271, r2534; +} +{ +add.f16x2 r2540, r2274, r2275; +} +{ +mul.f16x2 r2543, r2540, r2253; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r2256, r2257; +} +{ +mul.f16x2 r2552, r2549, r2251; +} +{ +sub.f16x2 r2555, r2262, r2263; +} +{ +mul.f16x2 r2558, r2555, r2254; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2568, {low, high}; +} +{ +neg.f16x2 r2569, r2568; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r2576, {low, high}; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2590, r2591; +} +{ +add.f16x2 r2592, r2593, r2589; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2592, r2595; +} +{ +add.f16x2 r2601, r2578, r2579; +} +{ +mul.f16x2 r2604, r2601, r2567; +} +{ +add.f16x2 r2607, r2581, r2604; +} +{ +add.f16x2 r2610, r2584, 
r2585; +} +{ +mul.f16x2 r2613, r2610, r2571; +} +{ +add.f16x2 r2616, r2607, r2613; +} +{ +sub.f16x2 r2619, r2590, r2591; +} +{ +mul.f16x2 r2622, r2619, r2569; +} +{ +sub.f16x2 r2625, r2596, r2597; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r2622, r2628; +} +{ +sub.f16x2 r2634, r2616, r2631; +} +{ +add.f16x2 r2637, r2578, r2579; +} +{ +mul.f16x2 r2640, r2637, r2567; +} +{ +add.f16x2 r2643, r2581, r2640; +} +{ +add.f16x2 r2646, r2584, r2585; +} +{ +mul.f16x2 r2649, r2646, r2571; +} +{ +add.f16x2 r2652, r2643, r2649; +} +{ +sub.f16x2 r2655, r2590, r2591; +} +{ +mul.f16x2 r2658, r2655, r2569; +} +{ +sub.f16x2 r2661, r2596, r2597; +} +{ +mul.f16x2 r2664, r2661, r2573; +} +{ +add.f16x2 r2667, r2658, r2664; +} +{ +add.f16x2 r2670, r2652, r2667; +} +{ +add.f16x2 r2673, r2578, r2579; +} +{ +mul.f16x2 r2676, r2673, r2571; +} +{ +add.f16x2 r2679, r2581, r2676; +} +{ +add.f16x2 r2682, r2584, r2585; +} +{ +mul.f16x2 r2685, r2682, r2575; +} +{ +add.f16x2 r2688, r2679, r2685; +} +{ +sub.f16x2 r2691, r2590, r2591; +} +{ +mul.f16x2 r2694, r2691, r2573; +} +{ +sub.f16x2 r2697, r2596, r2597; +} +{ +mul.f16x2 r2700, r2697, r2576; +} +{ +add.f16x2 r2703, r2694, r2700; +} +{ +sub.f16x2 r2706, r2688, r2703; +} +{ +add.f16x2 r2709, r2578, r2579; +} +{ +mul.f16x2 r2712, r2709, r2571; +} +{ +add.f16x2 r2715, r2581, r2712; +} +{ +add.f16x2 r2718, r2584, r2585; +} +{ +mul.f16x2 r2721, r2718, r2575; +} +{ +add.f16x2 r2724, r2715, r2721; +} +{ +sub.f16x2 r2727, r2590, r2591; +} +{ +mul.f16x2 r2730, r2727, r2573; +} +{ +sub.f16x2 r2733, r2596, r2597; +} +{ +mul.f16x2 r2736, r2733, r2576; +} +{ +add.f16x2 r2739, r2730, r2736; +} +{ +add.f16x2 r2742, r2724, r2739; +} +{ +add.f16x2 r2745, r2590, r2591; +} +{ +mul.f16x2 r2748, r2745, r2567; +} +{ +add.f16x2 r2751, r2593, r2748; +} +{ +add.f16x2 r2754, r2596, r2597; +} +{ +mul.f16x2 r2757, r2754, r2571; +} +{ +add.f16x2 r2760, r2751, r2757; +} +{ +sub.f16x2 r2763, r2578, r2579; +} +{ +mul.f16x2 r2766, r2763, r2569; +} +{ +sub.f16x2 
r2769, r2584, r2585; +} +{ +mul.f16x2 r2772, r2769, r2573; +} +{ +add.f16x2 r2775, r2766, r2772; +} +{ +add.f16x2 r2778, r2760, r2775; +} +{ +add.f16x2 r2781, r2590, r2591; +} +{ +mul.f16x2 r2784, r2781, r2567; +} +{ +add.f16x2 r2787, r2593, r2784; +} +{ +add.f16x2 r2790, r2596, r2597; +} +{ +mul.f16x2 r2793, r2790, r2571; +} +{ +add.f16x2 r2796, r2787, r2793; +} +{ +sub.f16x2 r2799, r2578, r2579; +} +{ +mul.f16x2 r2802, r2799, r2569; +} +{ +sub.f16x2 r2805, r2584, r2585; +} +{ +mul.f16x2 r2808, r2805, r2573; +} +{ +add.f16x2 r2811, r2802, r2808; +} +{ +sub.f16x2 r2814, r2796, r2811; +} +{ +add.f16x2 r2817, r2590, r2591; +} +{ +mul.f16x2 r2820, r2817, r2571; +} +{ +add.f16x2 r2823, r2593, r2820; +} +{ +add.f16x2 r2826, r2596, r2597; +} +{ +mul.f16x2 r2829, r2826, r2575; +} +{ +add.f16x2 r2832, r2823, r2829; +} +{ +sub.f16x2 r2835, r2578, r2579; +} +{ +mul.f16x2 r2838, r2835, r2573; +} +{ +sub.f16x2 r2841, r2584, r2585; +} +{ +mul.f16x2 r2844, r2841, r2576; +} +{ +add.f16x2 r2847, r2838, r2844; +} +{ +add.f16x2 r2850, r2832, r2847; +} +{ +add.f16x2 r2853, r2590, r2591; +} +{ +mul.f16x2 r2856, r2853, r2571; +} +{ +add.f16x2 r2859, r2593, r2856; +} +{ +add.f16x2 r2862, r2596, r2597; +} +{ +mul.f16x2 r2865, r2862, r2575; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 r2871, r2578, r2579; +} +{ +mul.f16x2 r2874, r2871, r2573; +} +{ +sub.f16x2 r2877, r2584, r2585; +} +{ +mul.f16x2 r2880, r2877, r2576; +} +{ +add.f16x2 r2883, r2874, r2880; +} +{ +sub.f16x2 r2886, r2868, r2883; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2892, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r2893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2894, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r2896, {low, high}; +} +{ +mul.f16x2 r2907, r2634, r2889; +} +{ +mul.f16x2 r2910, r2778, r2890; +} +{ +sub.f16x2 r2913, r2907, r2910; +} +{ +mul.f16x2 r2916, r2634, r2890; +} +{ +fma.rn.f16x2 r2919, r2778, r2889, r2916; +} +{ +mul.f16x2 r2923, r2706, r2891; +} +{ +mul.f16x2 r2926, r2850, r2892; +} +{ +sub.f16x2 r2929, r2923, r2926; +} +{ +mul.f16x2 r2932, r2706, r2892; +} +{ +fma.rn.f16x2 r2935, r2850, r2891, r2932; +} +{ +mul.f16x2 r2939, r2742, r2893; +} +{ +mul.f16x2 r2942, r2886, r2894; +} +{ +sub.f16x2 r2945, r2939, r2942; +} +{ +mul.f16x2 r2948, r2742, r2894; +} +{ +fma.rn.f16x2 r2951, r2886, r2893, r2948; +} +{ +mul.f16x2 r2955, r2670, r2895; +} +{ +mul.f16x2 r2958, r2814, r2896; +} +{ +sub.f16x2 r2961, r2955, r2958; +} +{ +mul.f16x2 r2964, r2670, r2896; +} +{ +fma.rn.f16x2 r2967, r2814, r2895, r2964; +} +{ +add.f16x2 r2971, r2264, r2586; +} +{ +add.f16x2 r2974, r2276, r2598; +} +{ +sub.f16x2 r2977, r2264, r2586; +} +{ +sub.f16x2 r2980, r2276, r2598; +} +{ +add.f16x2 r2983, r2312, r2913; +} +{ +add.f16x2 r2986, r2456, r2919; +} +{ +sub.f16x2 r2989, r2312, r2913; +} +{ +sub.f16x2 r2992, r2456, r2919; +} +{ +add.f16x2 r2995, r2384, r2929; +} +{ +add.f16x2 r2998, r2528, r2935; +} +{ +sub.f16x2 r3001, r2384, r2929; +} +{ +sub.f16x2 r3004, r2528, r2935; +} +{ +add.f16x2 r3007, r2420, r2945; +} +{ +add.f16x2 r3010, r2564, r2951; +} +{ +sub.f16x2 r3013, r2420, r2945; +} +{ +sub.f16x2 r3016, r2564, r2951; +} +{ +add.f16x2 r3019, r2348, r2961; +} +{ +add.f16x2 r3022, r2492, r2967; +} +{ +sub.f16x2 r3025, r2348, r2961; +} +{ +sub.f16x2 r3028, r2492, r2967; +} +mul.wide.u32 rd6, r4159, 1374389535; 
+shr.u64 rd7, rd6, 37; +cvt.u32.u64 r4169, rd7; +mul.lo.s32 r4170, r4169, 100; +sub.s32 r4171, r4159, r4170; +shl.b32 r4172, r4171, 2; +add.s32 r4173, r4160, r4172; +cvt.rn.f32.u32 f313, r4169; +mul.f32 f314, f313, 0f3D80ADFD; +cos.approx.f32 f225, f314; +sin.approx.f32 f315, f314; +neg.f32 f226, f315; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r3031, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3036, {high, high}; +} +{ +mul.f16x2 r3038, r2986, r3036; +} +{ +fma.rn.f16x2 r3041, r2983, r3034, r3038; +} +{ +mul.f16x2 r3045, r2983, r3036; +} +{ +neg.f16x2 r3048, r3045; +} +{ +fma.rn.f16x2 r3050, r2986, r3034, r3048; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3054, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3056, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r3056, r3058; +} +{ +mul.f16x2 r3062, r3031, r3054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3065, {high, low}; +} +{ +fma.rn.f16x2 r3067, r3059, r3065, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3071, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3073, {high, high}; +} +{ +mul.f16x2 r3075, r2998, r3073; +} +{ +fma.rn.f16x2 r3078, r2995, r3071, r3075; +} +{ +mul.f16x2 r3082, r2995, r3073; +} +{ +neg.f16x2 r3085, r3082; +} +{ +fma.rn.f16x2 r3087, r2998, r3071, r3085; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3091, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3093, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3095, {low, high}; +} +{ +mul.f16x2 r3096, r3093, r3095; +} +{ +mul.f16x2 
r3099, r3067, r3091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3067; +mov.b32 r3102, {high, low}; +} +{ +fma.rn.f16x2 r3104, r3096, r3102, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3108, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3110, {high, high}; +} +{ +mul.f16x2 r3112, r3010, r3110; +} +{ +fma.rn.f16x2 r3115, r3007, r3108, r3112; +} +{ +mul.f16x2 r3119, r3007, r3110; +} +{ +neg.f16x2 r3122, r3119; +} +{ +fma.rn.f16x2 r3124, r3010, r3108, r3122; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3130, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3132, {low, high}; +} +{ +mul.f16x2 r3133, r3130, r3132; +} +{ +mul.f16x2 r3136, r3104, r3128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3104; +mov.b32 r3139, {high, low}; +} +{ +fma.rn.f16x2 r3141, r3133, r3139, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3145, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3147, {high, high}; +} +{ +mul.f16x2 r3149, r3022, r3147; +} +{ +fma.rn.f16x2 r3152, r3019, r3145, r3149; +} +{ +mul.f16x2 r3156, r3019, r3147; +} +{ +neg.f16x2 r3159, r3156; +} +{ +fma.rn.f16x2 r3161, r3022, r3145, r3159; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3165, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3167, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3169, {low, high}; +} +{ +mul.f16x2 r3170, r3167, r3169; +} +{ +mul.f16x2 r3173, r3141, r3165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3141; +mov.b32 r3176, {high, low}; +} +{ +fma.rn.f16x2 r3178, r3170, r3176, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3178; +mov.b32 r3182, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3178; +mov.b32 r3184, {high, high}; +} +{ +mul.f16x2 r3186, r2980, r3184; +} +{ +fma.rn.f16x2 r3189, r2977, r3182, r3186; +} +{ +mul.f16x2 r3193, r2977, r3184; +} +{ +neg.f16x2 r3196, r3193; +} +{ +fma.rn.f16x2 r3198, r2980, r3182, r3196; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3202, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3204, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3206, {low, high}; +} +{ +mul.f16x2 r3207, r3204, r3206; +} +{ +mul.f16x2 r3210, r3178, r3202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3178; +mov.b32 r3213, {high, low}; +} +{ +fma.rn.f16x2 r3215, r3207, r3213, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3219, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3221, {high, high}; +} +{ +mul.f16x2 r3223, r2992, r3221; +} +{ +fma.rn.f16x2 r3226, r2989, r3219, r3223; +} +{ +mul.f16x2 r3230, r2989, r3221; +} +{ +neg.f16x2 r3233, r3230; +} +{ +fma.rn.f16x2 r3235, r2992, r3219, r3233; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3239, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3241, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3243, {low, high}; +} +{ +mul.f16x2 r3244, r3241, r3243; +} +{ +mul.f16x2 r3247, r3215, r3239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3215; +mov.b32 r3250, {high, low}; +} +{ +fma.rn.f16x2 r3252, r3244, r3250, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3252; +mov.b32 r3256, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3252; +mov.b32 r3258, {high, high}; +} +{ +mul.f16x2 r3260, r3004, r3258; +} +{ +fma.rn.f16x2 r3263, r3001, r3256, r3260; +} +{ +mul.f16x2 r3267, r3001, r3258; +} +{ +neg.f16x2 r3270, r3267; +} +{ +fma.rn.f16x2 r3272, r3004, r3256, 
r3270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3278, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3280, {low, high}; +} +{ +mul.f16x2 r3281, r3278, r3280; +} +{ +mul.f16x2 r3284, r3252, r3276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3252; +mov.b32 r3287, {high, low}; +} +{ +fma.rn.f16x2 r3289, r3281, r3287, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3293, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3295, {high, high}; +} +{ +mul.f16x2 r3297, r3016, r3295; +} +{ +fma.rn.f16x2 r3300, r3013, r3293, r3297; +} +{ +mul.f16x2 r3304, r3013, r3295; +} +{ +neg.f16x2 r3307, r3304; +} +{ +fma.rn.f16x2 r3309, r3016, r3293, r3307; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3313, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3031; +mov.b32 r3315, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f243; +cvt.rn.f16.f32 high, f244; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r3315, r3317; +} +{ +mul.f16x2 r3321, r3289, r3313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3289; +mov.b32 r3324, {high, low}; +} +{ +fma.rn.f16x2 r3326, r3318, r3324, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3326; +mov.b32 r3330, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3326; +mov.b32 r3332, {high, high}; +} +{ +mul.f16x2 r3334, r3028, r3332; +} +{ +fma.rn.f16x2 r3337, r3025, r3330, r3334; +} +{ +mul.f16x2 r3341, r3025, r3332; +} +{ +neg.f16x2 r3344, r3341; +} +{ +fma.rn.f16x2 r3346, r3028, r3330, r3344; +} +barrier.sync 0; +mad.lo.s32 r4174, r4169, 4000, r4173; +st.shared.u32 [r4174], r2971; +st.shared.u32 [r4174+400], r3041; +st.shared.u32 [r4174+800], r3078; +st.shared.u32 [r4174+1200], r3115; +st.shared.u32 [r4174+1600], r3152; +st.shared.u32 
[r4174+2000], r3189; +st.shared.u32 [r4174+2400], r3226; +st.shared.u32 [r4174+2800], r3263; +st.shared.u32 [r4174+3200], r3300; +st.shared.u32 [r4174+3600], r3337; +barrier.sync 0; +ld.shared.u32 r3381, [r4162]; +ld.shared.u32 r3703, [r4162+4000]; +ld.shared.u32 r3378, [r4162+8000]; +ld.shared.u32 r3700, [r4162+12000]; +ld.shared.u32 r3384, [r4162+16000]; +ld.shared.u32 r3706, [r4162+20000]; +ld.shared.u32 r3385, [r4162+24000]; +ld.shared.u32 r3707, [r4162+28000]; +ld.shared.u32 r3379, [r4162+32000]; +ld.shared.u32 r3701, [r4162+36000]; +barrier.sync 0; +st.shared.u32 [r4174], r2974; +st.shared.u32 [r4174+400], r3050; +st.shared.u32 [r4174+800], r3087; +st.shared.u32 [r4174+1200], r3124; +st.shared.u32 [r4174+1600], r3161; +st.shared.u32 [r4174+2000], r3198; +st.shared.u32 [r4174+2400], r3235; +st.shared.u32 [r4174+2800], r3272; +st.shared.u32 [r4174+3200], r3309; +st.shared.u32 [r4174+3600], r3346; +barrier.sync 0; +ld.shared.u32 r3393, [r4162]; +ld.shared.u32 r3715, [r4162+4000]; +ld.shared.u32 r3390, [r4162+8000]; +ld.shared.u32 r3712, [r4162+12000]; +ld.shared.u32 r3396, [r4162+16000]; +ld.shared.u32 r3718, [r4162+20000]; +ld.shared.u32 r3397, [r4162+24000]; +ld.shared.u32 r3719, [r4162+28000]; +ld.shared.u32 r3391, [r4162+32000]; +ld.shared.u32 r3713, [r4162+36000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3367, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3368, {low, high}; +} +{ +neg.f16x2 r3369, r3368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r3372, {low, high}; +} +{ +neg.f16x2 r3373, r3372; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3375, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; 
+mov.b32 r3376, {low, high}; +} +{ +add.f16x2 r3377, r3378, r3379; +} +{ +add.f16x2 r3380, r3381, r3377; +} +{ +add.f16x2 r3383, r3384, r3385; +} +{ +add.f16x2 r3386, r3380, r3383; +} +{ +add.f16x2 r3389, r3390, r3391; +} +{ +add.f16x2 r3392, r3393, r3389; +} +{ +add.f16x2 r3395, r3396, r3397; +} +{ +add.f16x2 r3398, r3392, r3395; +} +{ +add.f16x2 r3401, r3378, r3379; +} +{ +mul.f16x2 r3404, r3401, r3367; +} +{ +add.f16x2 r3407, r3381, r3404; +} +{ +add.f16x2 r3410, r3384, r3385; +} +{ +mul.f16x2 r3413, r3410, r3371; +} +{ +add.f16x2 r3416, r3407, r3413; +} +{ +sub.f16x2 r3419, r3390, r3391; +} +{ +mul.f16x2 r3422, r3419, r3369; +} +{ +sub.f16x2 r3425, r3396, r3397; +} +{ +mul.f16x2 r3428, r3425, r3373; +} +{ +add.f16x2 r3431, r3422, r3428; +} +{ +sub.f16x2 r3434, r3416, r3431; +} +{ +add.f16x2 r3437, r3378, r3379; +} +{ +mul.f16x2 r3440, r3437, r3367; +} +{ +add.f16x2 r3443, r3381, r3440; +} +{ +add.f16x2 r3446, r3384, r3385; +} +{ +mul.f16x2 r3449, r3446, r3371; +} +{ +add.f16x2 r3452, r3443, r3449; +} +{ +sub.f16x2 r3455, r3390, r3391; +} +{ +mul.f16x2 r3458, r3455, r3369; +} +{ +sub.f16x2 r3461, r3396, r3397; +} +{ +mul.f16x2 r3464, r3461, r3373; +} +{ +add.f16x2 r3467, r3458, r3464; +} +{ +add.f16x2 r3470, r3452, r3467; +} +{ +add.f16x2 r3473, r3378, r3379; +} +{ +mul.f16x2 r3476, r3473, r3371; +} +{ +add.f16x2 r3479, r3381, r3476; +} +{ +add.f16x2 r3482, r3384, r3385; +} +{ +mul.f16x2 r3485, r3482, r3375; +} +{ +add.f16x2 r3488, r3479, r3485; +} +{ +sub.f16x2 r3491, r3390, r3391; +} +{ +mul.f16x2 r3494, r3491, r3373; +} +{ +sub.f16x2 r3497, r3396, r3397; +} +{ +mul.f16x2 r3500, r3497, r3376; +} +{ +add.f16x2 r3503, r3494, r3500; +} +{ +sub.f16x2 r3506, r3488, r3503; +} +{ +add.f16x2 r3509, r3378, r3379; +} +{ +mul.f16x2 r3512, r3509, r3371; +} +{ +add.f16x2 r3515, r3381, r3512; +} +{ +add.f16x2 r3518, r3384, r3385; +} +{ +mul.f16x2 r3521, r3518, r3375; +} +{ +add.f16x2 r3524, r3515, r3521; +} +{ +sub.f16x2 r3527, r3390, r3391; +} +{ +mul.f16x2 r3530, r3527, 
r3373; +} +{ +sub.f16x2 r3533, r3396, r3397; +} +{ +mul.f16x2 r3536, r3533, r3376; +} +{ +add.f16x2 r3539, r3530, r3536; +} +{ +add.f16x2 r3542, r3524, r3539; +} +{ +add.f16x2 r3545, r3390, r3391; +} +{ +mul.f16x2 r3548, r3545, r3367; +} +{ +add.f16x2 r3551, r3393, r3548; +} +{ +add.f16x2 r3554, r3396, r3397; +} +{ +mul.f16x2 r3557, r3554, r3371; +} +{ +add.f16x2 r3560, r3551, r3557; +} +{ +sub.f16x2 r3563, r3378, r3379; +} +{ +mul.f16x2 r3566, r3563, r3369; +} +{ +sub.f16x2 r3569, r3384, r3385; +} +{ +mul.f16x2 r3572, r3569, r3373; +} +{ +add.f16x2 r3575, r3566, r3572; +} +{ +add.f16x2 r3578, r3560, r3575; +} +{ +add.f16x2 r3581, r3390, r3391; +} +{ +mul.f16x2 r3584, r3581, r3367; +} +{ +add.f16x2 r3587, r3393, r3584; +} +{ +add.f16x2 r3590, r3396, r3397; +} +{ +mul.f16x2 r3593, r3590, r3371; +} +{ +add.f16x2 r3596, r3587, r3593; +} +{ +sub.f16x2 r3599, r3378, r3379; +} +{ +mul.f16x2 r3602, r3599, r3369; +} +{ +sub.f16x2 r3605, r3384, r3385; +} +{ +mul.f16x2 r3608, r3605, r3373; +} +{ +add.f16x2 r3611, r3602, r3608; +} +{ +sub.f16x2 r3614, r3596, r3611; +} +{ +add.f16x2 r3617, r3390, r3391; +} +{ +mul.f16x2 r3620, r3617, r3371; +} +{ +add.f16x2 r3623, r3393, r3620; +} +{ +add.f16x2 r3626, r3396, r3397; +} +{ +mul.f16x2 r3629, r3626, r3375; +} +{ +add.f16x2 r3632, r3623, r3629; +} +{ +sub.f16x2 r3635, r3378, r3379; +} +{ +mul.f16x2 r3638, r3635, r3373; +} +{ +sub.f16x2 r3641, r3384, r3385; +} +{ +mul.f16x2 r3644, r3641, r3376; +} +{ +add.f16x2 r3647, r3638, r3644; +} +{ +add.f16x2 r3650, r3632, r3647; +} +{ +add.f16x2 r3653, r3390, r3391; +} +{ +mul.f16x2 r3656, r3653, r3371; +} +{ +add.f16x2 r3659, r3393, r3656; +} +{ +add.f16x2 r3662, r3396, r3397; +} +{ +mul.f16x2 r3665, r3662, r3375; +} +{ +add.f16x2 r3668, r3659, r3665; +} +{ +sub.f16x2 r3671, r3378, r3379; +} +{ +mul.f16x2 r3674, r3671, r3373; +} +{ +sub.f16x2 r3677, r3384, r3385; +} +{ +mul.f16x2 r3680, r3677, r3376; +} +{ +add.f16x2 r3683, r3674, r3680; +} +{ +sub.f16x2 r3686, r3668, r3683; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3689, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3690, {low, high}; +} +{ +neg.f16x2 r3691, r3690; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r3694, {low, high}; +} +{ +neg.f16x2 r3695, r3694; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r3697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r3698, {low, high}; +} +{ +add.f16x2 r3699, r3700, r3701; +} +{ +add.f16x2 r3702, r3703, r3699; +} +{ +add.f16x2 r3705, r3706, r3707; +} +{ +add.f16x2 r3708, r3702, r3705; +} +{ +add.f16x2 r3711, r3712, r3713; +} +{ +add.f16x2 r3714, r3715, r3711; +} +{ +add.f16x2 r3717, r3718, r3719; +} +{ +add.f16x2 r3720, r3714, r3717; +} +{ +add.f16x2 r3723, r3700, r3701; +} +{ +mul.f16x2 r3726, r3723, r3689; +} +{ +add.f16x2 r3729, r3703, r3726; +} +{ +add.f16x2 r3732, r3706, r3707; +} +{ +mul.f16x2 r3735, r3732, r3693; +} +{ +add.f16x2 r3738, r3729, r3735; +} +{ +sub.f16x2 r3741, r3712, r3713; +} +{ +mul.f16x2 r3744, r3741, r3691; +} +{ +sub.f16x2 r3747, r3718, r3719; +} +{ +mul.f16x2 r3750, r3747, r3695; +} +{ +add.f16x2 r3753, r3744, r3750; +} +{ +sub.f16x2 r3756, r3738, r3753; +} +{ +add.f16x2 r3759, r3700, r3701; +} +{ +mul.f16x2 r3762, r3759, r3689; +} +{ +add.f16x2 r3765, r3703, r3762; +} +{ +add.f16x2 r3768, r3706, r3707; +} +{ +mul.f16x2 r3771, r3768, r3693; +} +{ +add.f16x2 r3774, r3765, r3771; +} +{ +sub.f16x2 r3777, r3712, r3713; +} +{ +mul.f16x2 r3780, r3777, r3691; +} +{ +sub.f16x2 r3783, r3718, r3719; +} +{ +mul.f16x2 r3786, r3783, r3695; +} +{ +add.f16x2 r3789, r3780, r3786; +} +{ +add.f16x2 r3792, r3774, r3789; +} +{ +add.f16x2 r3795, r3700, r3701; +} +{ +mul.f16x2 r3798, 
r3795, r3693; +} +{ +add.f16x2 r3801, r3703, r3798; +} +{ +add.f16x2 r3804, r3706, r3707; +} +{ +mul.f16x2 r3807, r3804, r3697; +} +{ +add.f16x2 r3810, r3801, r3807; +} +{ +sub.f16x2 r3813, r3712, r3713; +} +{ +mul.f16x2 r3816, r3813, r3695; +} +{ +sub.f16x2 r3819, r3718, r3719; +} +{ +mul.f16x2 r3822, r3819, r3698; +} +{ +add.f16x2 r3825, r3816, r3822; +} +{ +sub.f16x2 r3828, r3810, r3825; +} +{ +add.f16x2 r3831, r3700, r3701; +} +{ +mul.f16x2 r3834, r3831, r3693; +} +{ +add.f16x2 r3837, r3703, r3834; +} +{ +add.f16x2 r3840, r3706, r3707; +} +{ +mul.f16x2 r3843, r3840, r3697; +} +{ +add.f16x2 r3846, r3837, r3843; +} +{ +sub.f16x2 r3849, r3712, r3713; +} +{ +mul.f16x2 r3852, r3849, r3695; +} +{ +sub.f16x2 r3855, r3718, r3719; +} +{ +mul.f16x2 r3858, r3855, r3698; +} +{ +add.f16x2 r3861, r3852, r3858; +} +{ +add.f16x2 r3864, r3846, r3861; +} +{ +add.f16x2 r3867, r3712, r3713; +} +{ +mul.f16x2 r3870, r3867, r3689; +} +{ +add.f16x2 r3873, r3715, r3870; +} +{ +add.f16x2 r3876, r3718, r3719; +} +{ +mul.f16x2 r3879, r3876, r3693; +} +{ +add.f16x2 r3882, r3873, r3879; +} +{ +sub.f16x2 r3885, r3700, r3701; +} +{ +mul.f16x2 r3888, r3885, r3691; +} +{ +sub.f16x2 r3891, r3706, r3707; +} +{ +mul.f16x2 r3894, r3891, r3695; +} +{ +add.f16x2 r3897, r3888, r3894; +} +{ +add.f16x2 r3900, r3882, r3897; +} +{ +add.f16x2 r3903, r3712, r3713; +} +{ +mul.f16x2 r3906, r3903, r3689; +} +{ +add.f16x2 r3909, r3715, r3906; +} +{ +add.f16x2 r3912, r3718, r3719; +} +{ +mul.f16x2 r3915, r3912, r3693; +} +{ +add.f16x2 r3918, r3909, r3915; +} +{ +sub.f16x2 r3921, r3700, r3701; +} +{ +mul.f16x2 r3924, r3921, r3691; +} +{ +sub.f16x2 r3927, r3706, r3707; +} +{ +mul.f16x2 r3930, r3927, r3695; +} +{ +add.f16x2 r3933, r3924, r3930; +} +{ +sub.f16x2 r3936, r3918, r3933; +} +{ +add.f16x2 r3939, r3712, r3713; +} +{ +mul.f16x2 r3942, r3939, r3693; +} +{ +add.f16x2 r3945, r3715, r3942; +} +{ +add.f16x2 r3948, r3718, r3719; +} +{ +mul.f16x2 r3951, r3948, r3697; +} +{ +add.f16x2 r3954, r3945, r3951; +} +{ 
+sub.f16x2 r3957, r3700, r3701; +} +{ +mul.f16x2 r3960, r3957, r3695; +} +{ +sub.f16x2 r3963, r3706, r3707; +} +{ +mul.f16x2 r3966, r3963, r3698; +} +{ +add.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3954, r3969; +} +{ +add.f16x2 r3975, r3712, r3713; +} +{ +mul.f16x2 r3978, r3975, r3693; +} +{ +add.f16x2 r3981, r3715, r3978; +} +{ +add.f16x2 r3984, r3718, r3719; +} +{ +mul.f16x2 r3987, r3984, r3697; +} +{ +add.f16x2 r3990, r3981, r3987; +} +{ +sub.f16x2 r3993, r3700, r3701; +} +{ +mul.f16x2 r3996, r3993, r3695; +} +{ +sub.f16x2 r3999, r3706, r3707; +} +{ +mul.f16x2 r4002, r3999, r3698; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +sub.f16x2 r4008, r3990, r4005; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r4011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r4013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r4015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r4016, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r4017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r4018, {low, high}; +} +{ +mul.f16x2 r4029, r3756, r4011; +} +{ +mul.f16x2 r4032, r3900, r4012; +} +{ +sub.f16x2 r4035, r4029, r4032; +} +{ +mul.f16x2 r4038, r3756, r4012; +} +{ +fma.rn.f16x2 r4041, r3900, r4011, r4038; +} +{ +mul.f16x2 r4045, r3828, r4013; +} +{ +mul.f16x2 r4048, r3972, r4014; +} +{ +sub.f16x2 r4051, r4045, r4048; +} +{ +mul.f16x2 r4054, r3828, r4014; +} +{ +fma.rn.f16x2 r4057, r3972, r4013, r4054; +} +{ +mul.f16x2 r4061, 
r3864, r4015; +} +{ +mul.f16x2 r4064, r4008, r4016; +} +{ +sub.f16x2 r4067, r4061, r4064; +} +{ +mul.f16x2 r4070, r3864, r4016; +} +{ +fma.rn.f16x2 r4073, r4008, r4015, r4070; +} +{ +mul.f16x2 r4077, r3792, r4017; +} +{ +mul.f16x2 r4080, r3936, r4018; +} +{ +sub.f16x2 r4083, r4077, r4080; +} +{ +mul.f16x2 r4086, r3792, r4018; +} +{ +fma.rn.f16x2 r4089, r3936, r4017, r4086; +} +{ +add.f16x2 %0, r3386, r3708; +} +{ +add.f16x2 %1, r3398, r3720; +} +{ +sub.f16x2 %10, r3386, r3708; +} +{ +sub.f16x2 %11, r3398, r3720; +} +{ +add.f16x2 %2, r3434, r4035; +} +{ +add.f16x2 %3, r3578, r4041; +} +{ +sub.f16x2 %12, r3434, r4035; +} +{ +sub.f16x2 %13, r3578, r4041; +} +{ +add.f16x2 %4, r3506, r4051; +} +{ +add.f16x2 %5, r3650, r4057; +} +{ +sub.f16x2 %14, r3506, r4051; +} +{ +sub.f16x2 %15, r3650, r4057; +} +{ +add.f16x2 %6, r3542, r4067; +} +{ +add.f16x2 %7, r3686, r4073; +} +{ +sub.f16x2 %16, r3542, r4067; +} +{ +sub.f16x2 %17, r3686, r4073; +} +{ +add.f16x2 %8, r3470, r4083; +} +{ +add.f16x2 %9, r3614, r4089; +} +{ +sub.f16x2 %18, r3470, r4083; +} +{ +sub.f16x2 %19, r3614, r4089; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), 
"r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..4c44a5c794b2b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_fwd.hpp.inc @@ -0,0 +1,1832 @@ +#ifndef CUFFTDX_FFT_10000_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_10000_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<195, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<940>; +.reg .b32 r<23>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 80000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %29, %45; +add.f32 f42, %24, f41; +add.f32 f43, %34, %40; +add.f32 f44, f43, f42; +add.f32 f45, %31, %47; +add.f32 f46, %25, f45; +add.f32 f47, %36, %41; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %24; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %31, %47; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %36, %41; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %24, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 
0f3E9E377A, %25; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %29, %45; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %34, %40; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %25, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %32, %48; +add.f32 f86, %26, f85; +add.f32 f87, %37, %42; +add.f32 f88, f87, f86; +add.f32 f89, %33, %49; +add.f32 f90, %28, f89; +add.f32 f91, %39, %44; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %26; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %33, %49; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %39, %44; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %26, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %28; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %32, %48; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %37, %42; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %28, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, 
f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +sub.f32 f149, f44, f88; +sub.f32 f150, f48, f92; +add.f32 f151, f57, f131; +add.f32 f152, f75, f133; +sub.f32 f153, f57, f131; +sub.f32 f154, f75, f133; +add.f32 f155, f65, f136; +add.f32 f156, f83, f138; +sub.f32 f157, f65, f136; +sub.f32 f158, f83, f138; +add.f32 f159, f66, f141; +add.f32 f160, f84, f143; +sub.f32 f161, f66, f141; +sub.f32 f162, f84, f143; +add.f32 f163, f58, f146; +add.f32 f164, f76, f148; +sub.f32 f165, f58, f146; +sub.f32 f166, f76, f148; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 80000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f167, f168}, [rd6]; +mul.f32 f171, f167, f151; +mul.f32 f172, f168, f152; +mul.f32 f173, f167, f152; +mul.f32 f174, f167, f167; +mul.f32 f175, f168, f168; +sub.f32 f176, f174, f175; +mul.f32 f177, f168, f167; +fma.rn.f32 f178, f168, f167, f177; +mul.f32 f179, f176, f155; +mul.f32 f180, f178, f156; +mul.f32 f181, f176, f156; +mul.f32 f182, f167, f176; +mul.f32 f183, f168, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f167, f178; +fma.rn.f32 f186, f168, f176, f185; +mul.f32 f187, f184, f159; +mul.f32 f188, f186, f160; +mul.f32 f189, f184, f160; +mul.f32 f190, f167, f184; +mul.f32 f191, f168, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f167, f186; +fma.rn.f32 f194, f168, f184, f193; +mul.f32 f195, f192, f163; +mul.f32 f196, f194, f164; +mul.f32 f197, f192, f164; +mul.f32 f198, f167, f192; +mul.f32 f199, f168, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, 
f167, f194; +fma.rn.f32 f202, f168, f192, f201; +mul.f32 f203, f200, f149; +mul.f32 f204, f202, f150; +mul.f32 f205, f200, f150; +mul.f32 f206, f167, f200; +mul.f32 f207, f168, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f167, f202; +fma.rn.f32 f210, f168, f200, f209; +mul.f32 f211, f208, f153; +mul.f32 f212, f210, f154; +mul.f32 f213, f208, f154; +mul.f32 f214, f167, f208; +mul.f32 f215, f168, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f167, f210; +fma.rn.f32 f218, f168, f208, f217; +mul.f32 f219, f216, f157; +mul.f32 f220, f218, f158; +mul.f32 f221, f216, f158; +mul.f32 f222, f167, f216; +mul.f32 f223, f168, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f167, f218; +fma.rn.f32 f226, f168, f216, f225; +mul.f32 f227, f224, f161; +mul.f32 f228, f226, f162; +mul.f32 f229, f224, f162; +mul.f32 f230, f167, f224; +mul.f32 f231, f168, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f167, f226; +fma.rn.f32 f234, f168, f224, f233; +mul.f32 f235, f232, f165; +mul.f32 f236, f234, f166; +mul.f32 f237, f232, f166; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f238, f48, f92; +add.f32 f239, f44, f88; +st.shared.v2.f32 [r9], {f239, f238}; +fma.rn.f32 f240, f168, f151, f173; +sub.f32 f241, f171, f172; +st.shared.v2.f32 [r9+8], {f241, f240}; +fma.rn.f32 f242, f178, f155, f181; +sub.f32 f243, f179, f180; +st.shared.v2.f32 [r9+16], {f243, f242}; +fma.rn.f32 f244, f186, f159, f189; +sub.f32 f245, f187, f188; +st.shared.v2.f32 [r9+24], {f245, f244}; +sub.f32 f246, f195, f196; +fma.rn.f32 f247, f194, f163, f197; +st.shared.v2.f32 [r9+32], {f246, f247}; +fma.rn.f32 f248, f202, f149, f205; +sub.f32 f249, f203, f204; +st.shared.v2.f32 [r9+40], {f249, f248}; +fma.rn.f32 f250, f210, f153, f213; +sub.f32 f251, f211, f212; +st.shared.v2.f32 [r9+48], {f251, f250}; +fma.rn.f32 f252, f218, f157, f221; +sub.f32 f253, f219, f220; +st.shared.v2.f32 [r9+56], {f253, f252}; +fma.rn.f32 f254, f226, f161, f229; +sub.f32 f255, f227, f228; +st.shared.v2.f32 [r9+64], {f255, f254}; 
+fma.rn.f32 f256, f234, f165, f237; +sub.f32 f257, f235, f236; +st.shared.v2.f32 [r9+72], {f257, f256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f258, f259}, [r10]; +ld.shared.v2.f32 {f262, f263}, [r10+8000]; +ld.shared.v2.f32 {f266, f267}, [r10+16000]; +ld.shared.v2.f32 {f270, f271}, [r10+24000]; +ld.shared.v2.f32 {f274, f275}, [r10+32000]; +ld.shared.v2.f32 {f278, f279}, [r10+40000]; +ld.shared.v2.f32 {f282, f283}, [r10+48000]; +ld.shared.v2.f32 {f286, f287}, [r10+56000]; +ld.shared.v2.f32 {f290, f291}, [r10+64000]; +ld.shared.v2.f32 {f294, f295}, [r10+72000]; +add.f32 f298, f266, f290; +add.f32 f299, f258, f298; +add.f32 f300, f274, f282; +add.f32 f301, f300, f299; +add.f32 f302, f267, f291; +add.f32 f303, f259, f302; +add.f32 f304, f275, f283; +add.f32 f305, f304, f303; +fma.rn.f32 f306, f298, 0f3E9E377A, f258; +mul.f32 f307, f300, 0f3F4F1BBD; +sub.f32 f308, f306, f307; +sub.f32 f309, f267, f291; +mul.f32 f310, f309, 0f3F737871; +sub.f32 f311, f275, f283; +mul.f32 f312, f311, 0fBF167918; +sub.f32 f313, f312, f310; +sub.f32 f314, f308, f313; +add.f32 f315, f313, f308; +mul.f32 f316, f298, 0f3F4F1BBD; +sub.f32 f317, f258, f316; +fma.rn.f32 f318, f300, 0f3E9E377A, f317; +mul.f32 f319, f309, 0f3F167918; +mul.f32 f320, f311, 0f3F737871; +sub.f32 f321, f320, f319; +sub.f32 f322, f318, f321; +add.f32 f323, f321, f318; +fma.rn.f32 f324, f302, 0f3E9E377A, f259; +mul.f32 f325, f304, 0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f266, f290; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f274, f282; +mul.f32 f330, f329, 0fBF167918; +sub.f32 f331, f330, f328; +add.f32 f332, f331, f326; +sub.f32 f333, f326, f331; +mul.f32 f334, f302, 0f3F4F1BBD; +sub.f32 f335, f259, f334; +fma.rn.f32 f336, f304, 0f3E9E377A, f335; +mul.f32 f337, f327, 0f3F167918; +mul.f32 f338, f329, 0f3F737871; +sub.f32 f339, f338, f337; +add.f32 f340, f339, f336; +sub.f32 f341, f336, f339; +add.f32 f342, f270, f294; +add.f32 f343, f262, f342; +add.f32 f344, f278, f286; 
+add.f32 f345, f344, f343; +add.f32 f346, f271, f295; +add.f32 f347, f263, f346; +add.f32 f348, f279, f287; +add.f32 f349, f348, f347; +fma.rn.f32 f350, f342, 0f3E9E377A, f262; +mul.f32 f351, f344, 0f3F4F1BBD; +sub.f32 f352, f350, f351; +sub.f32 f353, f271, f295; +mul.f32 f354, f353, 0f3F737871; +sub.f32 f355, f279, f287; +mul.f32 f356, f355, 0fBF167918; +sub.f32 f357, f356, f354; +sub.f32 f358, f352, f357; +add.f32 f359, f357, f352; +mul.f32 f360, f342, 0f3F4F1BBD; +sub.f32 f361, f262, f360; +fma.rn.f32 f362, f344, 0f3E9E377A, f361; +mul.f32 f363, f353, 0f3F167918; +mul.f32 f364, f355, 0f3F737871; +sub.f32 f365, f364, f363; +sub.f32 f366, f362, f365; +add.f32 f367, f365, f362; +fma.rn.f32 f368, f346, 0f3E9E377A, f263; +mul.f32 f369, f348, 0f3F4F1BBD; +sub.f32 f370, f368, f369; +sub.f32 f371, f270, f294; +mul.f32 f372, f371, 0f3F737871; +sub.f32 f373, f278, f286; +mul.f32 f374, f373, 0fBF167918; +sub.f32 f375, f374, f372; +add.f32 f376, f375, f370; +sub.f32 f377, f370, f375; +mul.f32 f378, f346, 0f3F4F1BBD; +sub.f32 f379, f263, f378; +fma.rn.f32 f380, f348, 0f3E9E377A, f379; +mul.f32 f381, f371, 0f3F167918; +mul.f32 f382, f373, 0f3F737871; +sub.f32 f383, f382, f381; +add.f32 f384, f383, f380; +sub.f32 f385, f380, f383; +mul.f32 f386, f358, 0f3F4F1BBD; +mul.f32 f387, f376, 0fBF167918; +sub.f32 f388, f386, f387; +mul.f32 f389, f376, 0f3F4F1BBD; +fma.rn.f32 f390, f358, 0fBF167918, f389; +mul.f32 f391, f366, 0f3E9E377A; +mul.f32 f392, f384, 0fBF737871; +sub.f32 f393, f391, f392; +mul.f32 f394, f384, 0f3E9E377A; +fma.rn.f32 f395, f366, 0fBF737871, f394; +mul.f32 f396, f367, 0fBE9E377A; +mul.f32 f397, f385, 0fBF737871; +sub.f32 f398, f396, f397; +mul.f32 f399, f385, 0fBE9E377A; +fma.rn.f32 f400, f367, 0fBF737871, f399; +mul.f32 f401, f359, 0fBF4F1BBD; +mul.f32 f402, f377, 0fBF167918; +sub.f32 f403, f401, f402; +mul.f32 f404, f377, 0fBF4F1BBD; +fma.rn.f32 f405, f359, 0fBF167918, f404; +sub.f32 f406, f301, f345; +sub.f32 f407, f305, f349; +add.f32 f408, f314, f388; 
+add.f32 f409, f332, f390; +sub.f32 f410, f314, f388; +sub.f32 f411, f332, f390; +add.f32 f412, f322, f393; +add.f32 f413, f340, f395; +sub.f32 f414, f322, f393; +sub.f32 f415, f340, f395; +add.f32 f416, f323, f398; +add.f32 f417, f341, f400; +sub.f32 f418, f323, f398; +sub.f32 f419, f341, f400; +add.f32 f420, f315, f403; +add.f32 f421, f333, f405; +sub.f32 f422, f315, f403; +sub.f32 f423, f333, f405; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f424, f425}, [rd11]; +mul.f32 f428, f424, f408; +mul.f32 f429, f425, f409; +mul.f32 f430, f424, f409; +mul.f32 f431, f424, f424; +mul.f32 f432, f425, f425; +sub.f32 f433, f431, f432; +mul.f32 f434, f425, f424; +fma.rn.f32 f435, f425, f424, f434; +mul.f32 f436, f433, f412; +mul.f32 f437, f435, f413; +mul.f32 f438, f433, f413; +mul.f32 f439, f424, f433; +mul.f32 f440, f425, f435; +sub.f32 f441, f439, f440; +mul.f32 f442, f424, f435; +fma.rn.f32 f443, f425, f433, f442; +mul.f32 f444, f441, f416; +mul.f32 f445, f443, f417; +mul.f32 f446, f441, f417; +mul.f32 f447, f424, f441; +mul.f32 f448, f425, f443; +sub.f32 f449, f447, f448; +mul.f32 f450, f424, f443; +fma.rn.f32 f451, f425, f441, f450; +mul.f32 f452, f449, f420; +mul.f32 f453, f451, f421; +mul.f32 f454, f449, f421; +mul.f32 f455, f424, f449; +mul.f32 f456, f425, f451; +sub.f32 f457, f455, f456; +mul.f32 f458, f424, f451; +fma.rn.f32 f459, f425, f449, f458; +mul.f32 f460, f457, f406; +mul.f32 f461, f459, f407; +mul.f32 f462, f457, f407; +mul.f32 f463, f424, f457; +mul.f32 f464, f425, f459; +sub.f32 f465, f463, f464; +mul.f32 f466, f424, f459; +fma.rn.f32 f467, f425, f457, f466; +mul.f32 f468, f465, f410; +mul.f32 f469, f467, f411; +mul.f32 f470, f465, f411; +mul.f32 f471, f424, f465; +mul.f32 f472, f425, f467; +sub.f32 f473, f471, f472; +mul.f32 f474, f424, f467; +fma.rn.f32 f475, f425, f465, f474; 
+mul.f32 f476, f473, f414; +mul.f32 f477, f475, f415; +mul.f32 f478, f473, f415; +mul.f32 f479, f424, f473; +mul.f32 f480, f425, f475; +sub.f32 f481, f479, f480; +mul.f32 f482, f424, f475; +fma.rn.f32 f483, f425, f473, f482; +mul.f32 f484, f481, f418; +mul.f32 f485, f483, f419; +mul.f32 f486, f481, f419; +mul.f32 f487, f424, f481; +mul.f32 f488, f425, f483; +sub.f32 f489, f487, f488; +mul.f32 f490, f424, f483; +fma.rn.f32 f491, f425, f481, f490; +mul.f32 f492, f489, f422; +mul.f32 f493, f491, f423; +mul.f32 f494, f489, f423; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +add.f32 f495, f305, f349; +add.f32 f496, f301, f345; +st.shared.v2.f32 [r16], {f496, f495}; +fma.rn.f32 f497, f425, f408, f430; +sub.f32 f498, f428, f429; +st.shared.v2.f32 [r16+80], {f498, f497}; +fma.rn.f32 f499, f435, f412, f438; +sub.f32 f500, f436, f437; +st.shared.v2.f32 [r16+160], {f500, f499}; +fma.rn.f32 f501, f443, f416, f446; +sub.f32 f502, f444, f445; +st.shared.v2.f32 [r16+240], {f502, f501}; +fma.rn.f32 f503, f451, f420, f454; +sub.f32 f504, f452, f453; +st.shared.v2.f32 [r16+320], {f504, f503}; +sub.f32 f505, f460, f461; +fma.rn.f32 f506, f459, f406, f462; +st.shared.v2.f32 [r16+400], {f505, f506}; +sub.f32 f507, f468, f469; +fma.rn.f32 f508, f467, f410, f470; +st.shared.v2.f32 [r16+480], {f507, f508}; +fma.rn.f32 f509, f475, f414, f478; +sub.f32 f510, f476, f477; +st.shared.v2.f32 [r16+560], {f510, f509}; +fma.rn.f32 f511, f483, f418, f486; +sub.f32 f512, f484, f485; +st.shared.v2.f32 [r16+640], {f512, f511}; +fma.rn.f32 f513, f491, f422, f494; +sub.f32 f514, f492, f493; +st.shared.v2.f32 [r16+720], {f514, f513}; +barrier.sync 0; +ld.shared.v2.f32 {f515, f516}, [r10]; +ld.shared.v2.f32 {f519, f520}, [r10+8000]; +ld.shared.v2.f32 {f523, f524}, [r10+16000]; +ld.shared.v2.f32 {f527, f528}, [r10+24000]; +ld.shared.v2.f32 {f531, f532}, [r10+32000]; +ld.shared.v2.f32 {f535, f536}, [r10+40000]; +ld.shared.v2.f32 {f539, f540}, [r10+48000]; 
+ld.shared.v2.f32 {f543, f544}, [r10+56000]; +ld.shared.v2.f32 {f547, f548}, [r10+64000]; +ld.shared.v2.f32 {f551, f552}, [r10+72000]; +add.f32 f555, f523, f547; +add.f32 f556, f515, f555; +add.f32 f557, f531, f539; +add.f32 f558, f557, f556; +add.f32 f559, f524, f548; +add.f32 f560, f516, f559; +add.f32 f561, f532, f540; +add.f32 f562, f561, f560; +fma.rn.f32 f563, f555, 0f3E9E377A, f515; +mul.f32 f564, f557, 0f3F4F1BBD; +sub.f32 f565, f563, f564; +sub.f32 f566, f524, f548; +mul.f32 f567, f566, 0f3F737871; +sub.f32 f568, f532, f540; +mul.f32 f569, f568, 0fBF167918; +sub.f32 f570, f569, f567; +sub.f32 f571, f565, f570; +add.f32 f572, f570, f565; +mul.f32 f573, f555, 0f3F4F1BBD; +sub.f32 f574, f515, f573; +fma.rn.f32 f575, f557, 0f3E9E377A, f574; +mul.f32 f576, f566, 0f3F167918; +mul.f32 f577, f568, 0f3F737871; +sub.f32 f578, f577, f576; +sub.f32 f579, f575, f578; +add.f32 f580, f578, f575; +fma.rn.f32 f581, f559, 0f3E9E377A, f516; +mul.f32 f582, f561, 0f3F4F1BBD; +sub.f32 f583, f581, f582; +sub.f32 f584, f523, f547; +mul.f32 f585, f584, 0f3F737871; +sub.f32 f586, f531, f539; +mul.f32 f587, f586, 0fBF167918; +sub.f32 f588, f587, f585; +add.f32 f589, f588, f583; +sub.f32 f590, f583, f588; +mul.f32 f591, f559, 0f3F4F1BBD; +sub.f32 f592, f516, f591; +fma.rn.f32 f593, f561, 0f3E9E377A, f592; +mul.f32 f594, f584, 0f3F167918; +mul.f32 f595, f586, 0f3F737871; +sub.f32 f596, f595, f594; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +add.f32 f599, f527, f551; +add.f32 f600, f519, f599; +add.f32 f601, f535, f543; +add.f32 f602, f601, f600; +add.f32 f603, f528, f552; +add.f32 f604, f520, f603; +add.f32 f605, f536, f544; +add.f32 f606, f605, f604; +fma.rn.f32 f607, f599, 0f3E9E377A, f519; +mul.f32 f608, f601, 0f3F4F1BBD; +sub.f32 f609, f607, f608; +sub.f32 f610, f528, f552; +mul.f32 f611, f610, 0f3F737871; +sub.f32 f612, f536, f544; +mul.f32 f613, f612, 0fBF167918; +sub.f32 f614, f613, f611; +sub.f32 f615, f609, f614; +add.f32 f616, f614, f609; +mul.f32 f617, f599, 
0f3F4F1BBD; +sub.f32 f618, f519, f617; +fma.rn.f32 f619, f601, 0f3E9E377A, f618; +mul.f32 f620, f610, 0f3F167918; +mul.f32 f621, f612, 0f3F737871; +sub.f32 f622, f621, f620; +sub.f32 f623, f619, f622; +add.f32 f624, f622, f619; +fma.rn.f32 f625, f603, 0f3E9E377A, f520; +mul.f32 f626, f605, 0f3F4F1BBD; +sub.f32 f627, f625, f626; +sub.f32 f628, f527, f551; +mul.f32 f629, f628, 0f3F737871; +sub.f32 f630, f535, f543; +mul.f32 f631, f630, 0fBF167918; +sub.f32 f632, f631, f629; +add.f32 f633, f632, f627; +sub.f32 f634, f627, f632; +mul.f32 f635, f603, 0f3F4F1BBD; +sub.f32 f636, f520, f635; +fma.rn.f32 f637, f605, 0f3E9E377A, f636; +mul.f32 f638, f628, 0f3F167918; +mul.f32 f639, f630, 0f3F737871; +sub.f32 f640, f639, f638; +add.f32 f641, f640, f637; +sub.f32 f642, f637, f640; +mul.f32 f643, f615, 0f3F4F1BBD; +mul.f32 f644, f633, 0fBF167918; +sub.f32 f645, f643, f644; +mul.f32 f646, f633, 0f3F4F1BBD; +fma.rn.f32 f647, f615, 0fBF167918, f646; +mul.f32 f648, f623, 0f3E9E377A; +mul.f32 f649, f641, 0fBF737871; +sub.f32 f650, f648, f649; +mul.f32 f651, f641, 0f3E9E377A; +fma.rn.f32 f652, f623, 0fBF737871, f651; +mul.f32 f653, f624, 0fBE9E377A; +mul.f32 f654, f642, 0fBF737871; +sub.f32 f655, f653, f654; +mul.f32 f656, f642, 0fBE9E377A; +fma.rn.f32 f657, f624, 0fBF737871, f656; +mul.f32 f658, f616, 0fBF4F1BBD; +mul.f32 f659, f634, 0fBF167918; +sub.f32 f660, f658, f659; +mul.f32 f661, f634, 0fBF4F1BBD; +fma.rn.f32 f662, f616, 0fBF167918, f661; +sub.f32 f663, f558, f602; +sub.f32 f664, f562, f606; +add.f32 f665, f571, f645; +add.f32 f666, f589, f647; +sub.f32 f667, f571, f645; +sub.f32 f668, f589, f647; +add.f32 f669, f579, f650; +add.f32 f670, f597, f652; +sub.f32 f671, f579, f650; +sub.f32 f672, f597, f652; +add.f32 f673, f580, f655; +add.f32 f674, f598, f657; +sub.f32 f675, f580, f655; +sub.f32 f676, f598, f657; +add.f32 f677, f572, f660; +add.f32 f678, f590, f662; +sub.f32 f679, f572, f660; +sub.f32 f680, f590, f662; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; 
+cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 8; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f681, f682}, [rd16]; +mul.f32 f685, f681, f665; +mul.f32 f686, f682, f666; +mul.f32 f687, f681, f666; +mul.f32 f688, f681, f681; +mul.f32 f689, f682, f682; +sub.f32 f690, f688, f689; +mul.f32 f691, f682, f681; +fma.rn.f32 f692, f682, f681, f691; +mul.f32 f693, f690, f669; +mul.f32 f694, f692, f670; +mul.f32 f695, f690, f670; +mul.f32 f696, f681, f690; +mul.f32 f697, f682, f692; +sub.f32 f698, f696, f697; +mul.f32 f699, f681, f692; +fma.rn.f32 f700, f682, f690, f699; +mul.f32 f701, f698, f673; +mul.f32 f702, f700, f674; +mul.f32 f703, f698, f674; +mul.f32 f704, f681, f698; +mul.f32 f705, f682, f700; +sub.f32 f706, f704, f705; +mul.f32 f707, f681, f700; +fma.rn.f32 f708, f682, f698, f707; +mul.f32 f709, f706, f677; +mul.f32 f710, f708, f678; +mul.f32 f711, f706, f678; +mul.f32 f712, f681, f706; +mul.f32 f713, f682, f708; +sub.f32 f714, f712, f713; +mul.f32 f715, f681, f708; +fma.rn.f32 f716, f682, f706, f715; +mul.f32 f717, f714, f663; +mul.f32 f718, f716, f664; +mul.f32 f719, f714, f664; +mul.f32 f720, f681, f714; +mul.f32 f721, f682, f716; +sub.f32 f722, f720, f721; +mul.f32 f723, f681, f716; +fma.rn.f32 f724, f682, f714, f723; +mul.f32 f725, f722, f667; +mul.f32 f726, f724, f668; +mul.f32 f727, f722, f668; +mul.f32 f728, f681, f722; +mul.f32 f729, f682, f724; +sub.f32 f730, f728, f729; +mul.f32 f731, f681, f724; +fma.rn.f32 f732, f682, f722, f731; +mul.f32 f733, f730, f671; +mul.f32 f734, f732, f672; +mul.f32 f735, f730, f672; +mul.f32 f736, f681, f730; +mul.f32 f737, f682, f732; +sub.f32 f738, f736, f737; +mul.f32 f739, f681, f732; +fma.rn.f32 f740, f682, f730, f739; +mul.f32 f741, f738, f675; +mul.f32 f742, f740, f676; +mul.f32 f743, f738, f676; +mul.f32 f744, f681, f738; +mul.f32 f745, f682, f740; +sub.f32 f746, f744, f745; +mul.f32 f747, f681, f740; +fma.rn.f32 f748, f682, f738, f747; +mul.f32 
f749, f746, f679; +mul.f32 f750, f748, f680; +mul.f32 f751, f746, f680; +shl.b32 r20, r19, 3; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 8000, r21; +add.f32 f752, f562, f606; +add.f32 f753, f558, f602; +st.shared.v2.f32 [r22], {f753, f752}; +fma.rn.f32 f754, f682, f665, f687; +sub.f32 f755, f685, f686; +st.shared.v2.f32 [r22+800], {f755, f754}; +fma.rn.f32 f756, f692, f669, f695; +sub.f32 f757, f693, f694; +st.shared.v2.f32 [r22+1600], {f757, f756}; +fma.rn.f32 f758, f700, f673, f703; +sub.f32 f759, f701, f702; +st.shared.v2.f32 [r22+2400], {f759, f758}; +fma.rn.f32 f760, f708, f677, f711; +sub.f32 f761, f709, f710; +st.shared.v2.f32 [r22+3200], {f761, f760}; +sub.f32 f762, f717, f718; +fma.rn.f32 f763, f716, f663, f719; +st.shared.v2.f32 [r22+4000], {f762, f763}; +sub.f32 f764, f725, f726; +fma.rn.f32 f765, f724, f667, f727; +st.shared.v2.f32 [r22+4800], {f764, f765}; +fma.rn.f32 f766, f732, f671, f735; +sub.f32 f767, f733, f734; +st.shared.v2.f32 [r22+5600], {f767, f766}; +fma.rn.f32 f768, f740, f675, f743; +sub.f32 f769, f741, f742; +st.shared.v2.f32 [r22+6400], {f769, f768}; +fma.rn.f32 f770, f748, f679, f751; +sub.f32 f771, f749, f750; +st.shared.v2.f32 [r22+7200], {f771, f770}; +barrier.sync 0; +ld.shared.v2.f32 {f772, f773}, [r10]; +ld.shared.v2.f32 {f776, f777}, [r10+8000]; +ld.shared.v2.f32 {f780, f781}, [r10+16000]; +ld.shared.v2.f32 {f784, f785}, [r10+24000]; +ld.shared.v2.f32 {f788, f789}, [r10+32000]; +ld.shared.v2.f32 {f792, f793}, [r10+40000]; +ld.shared.v2.f32 {f796, f797}, [r10+48000]; +ld.shared.v2.f32 {f800, f801}, [r10+56000]; +ld.shared.v2.f32 {f804, f805}, [r10+64000]; +ld.shared.v2.f32 {f808, f809}, [r10+72000]; +add.f32 f812, f780, f804; +add.f32 f813, f772, f812; +add.f32 f814, f788, f796; +add.f32 f815, f814, f813; +add.f32 f816, f781, f805; +add.f32 f817, f773, f816; +add.f32 f818, f789, f797; +add.f32 f819, f818, f817; +fma.rn.f32 f820, f812, 0f3E9E377A, f772; +mul.f32 f821, f814, 0f3F4F1BBD; +sub.f32 f822, f820, f821; 
+sub.f32 f823, f781, f805; +mul.f32 f824, f823, 0f3F737871; +sub.f32 f825, f789, f797; +mul.f32 f826, f825, 0fBF167918; +sub.f32 f827, f826, f824; +sub.f32 f828, f822, f827; +add.f32 f829, f827, f822; +mul.f32 f830, f812, 0f3F4F1BBD; +sub.f32 f831, f772, f830; +fma.rn.f32 f832, f814, 0f3E9E377A, f831; +mul.f32 f833, f823, 0f3F167918; +mul.f32 f834, f825, 0f3F737871; +sub.f32 f835, f834, f833; +sub.f32 f836, f832, f835; +add.f32 f837, f835, f832; +fma.rn.f32 f838, f816, 0f3E9E377A, f773; +mul.f32 f839, f818, 0f3F4F1BBD; +sub.f32 f840, f838, f839; +sub.f32 f841, f780, f804; +mul.f32 f842, f841, 0f3F737871; +sub.f32 f843, f788, f796; +mul.f32 f844, f843, 0fBF167918; +sub.f32 f845, f844, f842; +add.f32 f846, f845, f840; +sub.f32 f847, f840, f845; +mul.f32 f848, f816, 0f3F4F1BBD; +sub.f32 f849, f773, f848; +fma.rn.f32 f850, f818, 0f3E9E377A, f849; +mul.f32 f851, f841, 0f3F167918; +mul.f32 f852, f843, 0f3F737871; +sub.f32 f853, f852, f851; +add.f32 f854, f853, f850; +sub.f32 f855, f850, f853; +add.f32 f856, f784, f808; +add.f32 f857, f776, f856; +add.f32 f858, f792, f800; +add.f32 f859, f858, f857; +add.f32 f860, f785, f809; +add.f32 f861, f777, f860; +add.f32 f862, f793, f801; +add.f32 f863, f862, f861; +fma.rn.f32 f864, f856, 0f3E9E377A, f776; +mul.f32 f865, f858, 0f3F4F1BBD; +sub.f32 f866, f864, f865; +sub.f32 f867, f785, f809; +mul.f32 f868, f867, 0f3F737871; +sub.f32 f869, f793, f801; +mul.f32 f870, f869, 0fBF167918; +sub.f32 f871, f870, f868; +sub.f32 f872, f866, f871; +add.f32 f873, f871, f866; +mul.f32 f874, f856, 0f3F4F1BBD; +sub.f32 f875, f776, f874; +fma.rn.f32 f876, f858, 0f3E9E377A, f875; +mul.f32 f877, f867, 0f3F167918; +mul.f32 f878, f869, 0f3F737871; +sub.f32 f879, f878, f877; +sub.f32 f880, f876, f879; +add.f32 f881, f879, f876; +fma.rn.f32 f882, f860, 0f3E9E377A, f777; +mul.f32 f883, f862, 0f3F4F1BBD; +sub.f32 f884, f882, f883; +sub.f32 f885, f784, f808; +mul.f32 f886, f885, 0f3F737871; +sub.f32 f887, f792, f800; +mul.f32 f888, f887, 0fBF167918; 
+sub.f32 f889, f888, f886; +add.f32 f890, f889, f884; +sub.f32 f891, f884, f889; +mul.f32 f892, f860, 0f3F4F1BBD; +sub.f32 f893, f777, f892; +fma.rn.f32 f894, f862, 0f3E9E377A, f893; +mul.f32 f895, f885, 0f3F167918; +mul.f32 f896, f887, 0f3F737871; +sub.f32 f897, f896, f895; +add.f32 f898, f897, f894; +sub.f32 f899, f894, f897; +mul.f32 f900, f872, 0f3F4F1BBD; +mul.f32 f901, f890, 0fBF167918; +sub.f32 f902, f900, f901; +mul.f32 f903, f890, 0f3F4F1BBD; +fma.rn.f32 f904, f872, 0fBF167918, f903; +mul.f32 f905, f880, 0f3E9E377A; +mul.f32 f906, f898, 0fBF737871; +sub.f32 f907, f905, f906; +mul.f32 f908, f898, 0f3E9E377A; +fma.rn.f32 f909, f880, 0fBF737871, f908; +mul.f32 f910, f881, 0fBE9E377A; +mul.f32 f911, f899, 0fBF737871; +sub.f32 f912, f910, f911; +mul.f32 f913, f899, 0fBE9E377A; +fma.rn.f32 f914, f881, 0fBF737871, f913; +mul.f32 f915, f873, 0fBF4F1BBD; +mul.f32 f916, f891, 0fBF167918; +sub.f32 f917, f915, f916; +mul.f32 f918, f891, 0fBF4F1BBD; +fma.rn.f32 f919, f873, 0fBF167918, f918; +add.f32 %1, f819, f863; +add.f32 %0, f815, f859; +add.f32 %3, f846, f904; +add.f32 %2, f828, f902; +add.f32 %5, f854, f909; +add.f32 %4, f836, f907; +add.f32 %7, f855, f914; +add.f32 %6, f837, f912; +add.f32 %9, f847, f919; +add.f32 %8, f829, f917; +sub.f32 %11, f819, f863; +sub.f32 %10, f815, f859; +sub.f32 %13, f846, f904; +sub.f32 %12, f828, f902; +sub.f32 %15, f854, f909; +sub.f32 %14, f836, f907; +sub.f32 %17, f855, f914; +sub.f32 %16, f837, f912; +sub.f32 %19, f847, f919; +sub.f32 %18, f829, f917; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), 
"f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<196, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<880>; +.reg .b32 r<23>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 40000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %29, %45; +add.f32 f42, %24, f41; +add.f32 f43, %34, %40; +add.f32 f44, f43, f42; +add.f32 f45, %31, %47; +add.f32 f46, %25, f45; +add.f32 f47, %36, %41; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %24; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %31, %47; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %36, %41; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %24, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %25; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %29, %45; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %34, %40; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %25, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %32, %48; +add.f32 f86, %26, f85; +add.f32 f87, %37, %42; +add.f32 f88, f87, f86; +add.f32 f89, %33, %49; +add.f32 f90, 
%28, f89; +add.f32 f91, %39, %44; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %26; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %33, %49; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %39, %44; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %26, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %28; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %32, %48; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %37, %42; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %28, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +add.f32 f149, f44, f88; +add.f32 f150, f48, f92; +sub.f32 f151, f44, f88; +sub.f32 f152, f48, f92; +add.f32 f153, f57, f131; +add.f32 f154, f75, f133; +sub.f32 f155, f57, f131; +sub.f32 f156, f75, 
f133; +add.f32 f157, f65, f136; +add.f32 f158, f83, f138; +sub.f32 f159, f65, f136; +sub.f32 f160, f83, f138; +add.f32 f161, f66, f141; +add.f32 f162, f84, f143; +sub.f32 f163, f66, f141; +sub.f32 f164, f84, f143; +add.f32 f165, f58, f146; +add.f32 f166, f76, f148; +sub.f32 f167, f58, f146; +sub.f32 f168, f76, f148; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f169, f170}, [rd6]; +mul.f32 f173, f169, f153; +mul.f32 f174, f170, f154; +sub.f32 f175, f173, f174; +mul.f32 f176, f169, f154; +fma.rn.f32 f177, f170, f153, f176; +mul.f32 f178, f169, f169; +mul.f32 f179, f170, f170; +sub.f32 f180, f178, f179; +mul.f32 f181, f170, f169; +fma.rn.f32 f182, f170, f169, f181; +mul.f32 f183, f180, f157; +mul.f32 f184, f182, f158; +sub.f32 f185, f183, f184; +mul.f32 f186, f180, f158; +fma.rn.f32 f187, f182, f157, f186; +mul.f32 f188, f169, f180; +mul.f32 f189, f170, f182; +sub.f32 f190, f188, f189; +mul.f32 f191, f169, f182; +fma.rn.f32 f192, f170, f180, f191; +mul.f32 f193, f190, f161; +mul.f32 f194, f192, f162; +sub.f32 f195, f193, f194; +mul.f32 f196, f190, f162; +fma.rn.f32 f197, f192, f161, f196; +mul.f32 f198, f169, f190; +mul.f32 f199, f170, f192; +sub.f32 f200, f198, f199; +mul.f32 f201, f169, f192; +fma.rn.f32 f202, f170, f190, f201; +mul.f32 f203, f200, f165; +mul.f32 f204, f202, f166; +sub.f32 f205, f203, f204; +mul.f32 f206, f200, f166; +fma.rn.f32 f207, f202, f165, f206; +mul.f32 f208, f169, f200; +mul.f32 f209, f170, f202; +sub.f32 f210, f208, f209; +mul.f32 f211, f169, f202; +fma.rn.f32 f212, f170, f200, f211; +mul.f32 f213, f210, f151; +mul.f32 f214, f212, f152; +sub.f32 f215, f213, f214; +mul.f32 f216, f210, f152; +fma.rn.f32 f217, f212, f151, f216; +mul.f32 f218, f169, f210; +mul.f32 f219, f170, f212; +sub.f32 f220, f218, f219; +mul.f32 f221, f169, f212; +fma.rn.f32 f222, f170, f210, f221; 
+mul.f32 f223, f220, f155; +mul.f32 f224, f222, f156; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, f156; +fma.rn.f32 f227, f222, f155, f226; +mul.f32 f228, f169, f220; +mul.f32 f229, f170, f222; +sub.f32 f230, f228, f229; +mul.f32 f231, f169, f222; +fma.rn.f32 f232, f170, f220, f231; +mul.f32 f233, f230, f159; +mul.f32 f234, f232, f160; +sub.f32 f235, f233, f234; +mul.f32 f236, f230, f160; +fma.rn.f32 f237, f232, f159, f236; +mul.f32 f238, f169, f230; +mul.f32 f239, f170, f232; +sub.f32 f240, f238, f239; +mul.f32 f241, f169, f232; +fma.rn.f32 f242, f170, f230, f241; +mul.f32 f243, f240, f163; +mul.f32 f244, f242, f164; +sub.f32 f245, f243, f244; +mul.f32 f246, f240, f164; +fma.rn.f32 f247, f242, f163, f246; +mul.f32 f248, f169, f240; +mul.f32 f249, f170, f242; +sub.f32 f250, f248, f249; +mul.f32 f251, f169, f242; +fma.rn.f32 f252, f170, f240, f251; +mul.f32 f253, f250, f167; +mul.f32 f254, f252, f168; +sub.f32 f255, f253, f254; +mul.f32 f256, f250, f168; +fma.rn.f32 f257, f252, f167, f256; +mad.lo.s32 r8, r5, 40000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f149, f175}; +st.shared.v2.f32 [r9+8], {f185, f195}; +st.shared.v2.f32 [r9+16], {f205, f215}; +st.shared.v2.f32 [r9+24], {f225, f235}; +st.shared.v2.f32 [r9+32], {f245, f255}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f258, [r10]; +ld.shared.f32 f259, [r10+4000]; +ld.shared.f32 f260, [r10+8000]; +ld.shared.f32 f261, [r10+12000]; +ld.shared.f32 f262, [r10+16000]; +ld.shared.f32 f263, [r10+20000]; +ld.shared.f32 f264, [r10+24000]; +ld.shared.f32 f265, [r10+28000]; +ld.shared.f32 f266, [r10+32000]; +ld.shared.f32 f267, [r10+36000]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f150, f177}; +st.shared.v2.f32 [r9+8], {f187, f197}; +st.shared.v2.f32 [r9+16], {f207, f217}; +st.shared.v2.f32 [r9+24], {f227, f237}; +st.shared.v2.f32 [r9+32], {f247, f257}; +barrier.sync 0; +ld.shared.f32 f268, [r10]; +ld.shared.f32 f269, [r10+4000]; +ld.shared.f32 f270, [r10+8000]; 
+ld.shared.f32 f271, [r10+12000]; +ld.shared.f32 f272, [r10+16000]; +ld.shared.f32 f273, [r10+20000]; +ld.shared.f32 f274, [r10+24000]; +ld.shared.f32 f275, [r10+28000]; +ld.shared.f32 f276, [r10+32000]; +ld.shared.f32 f277, [r10+36000]; +add.f32 f278, f260, f266; +add.f32 f279, f258, f278; +add.f32 f280, f262, f264; +add.f32 f281, f280, f279; +add.f32 f282, f270, f276; +add.f32 f283, f268, f282; +add.f32 f284, f272, f274; +add.f32 f285, f284, f283; +fma.rn.f32 f286, f278, 0f3E9E377A, f258; +mul.f32 f287, f280, 0f3F4F1BBD; +sub.f32 f288, f286, f287; +sub.f32 f289, f270, f276; +mul.f32 f290, f289, 0f3F737871; +sub.f32 f291, f272, f274; +mul.f32 f292, f291, 0fBF167918; +sub.f32 f293, f292, f290; +sub.f32 f294, f288, f293; +add.f32 f295, f293, f288; +mul.f32 f296, f278, 0f3F4F1BBD; +sub.f32 f297, f258, f296; +fma.rn.f32 f298, f280, 0f3E9E377A, f297; +mul.f32 f299, f289, 0f3F167918; +mul.f32 f300, f291, 0f3F737871; +sub.f32 f301, f300, f299; +sub.f32 f302, f298, f301; +add.f32 f303, f301, f298; +fma.rn.f32 f304, f282, 0f3E9E377A, f268; +mul.f32 f305, f284, 0f3F4F1BBD; +sub.f32 f306, f304, f305; +sub.f32 f307, f260, f266; +mul.f32 f308, f307, 0f3F737871; +sub.f32 f309, f262, f264; +mul.f32 f310, f309, 0fBF167918; +sub.f32 f311, f310, f308; +add.f32 f312, f311, f306; +sub.f32 f313, f306, f311; +mul.f32 f314, f282, 0f3F4F1BBD; +sub.f32 f315, f268, f314; +fma.rn.f32 f316, f284, 0f3E9E377A, f315; +mul.f32 f317, f307, 0f3F167918; +mul.f32 f318, f309, 0f3F737871; +sub.f32 f319, f318, f317; +add.f32 f320, f319, f316; +sub.f32 f321, f316, f319; +add.f32 f322, f261, f267; +add.f32 f323, f259, f322; +add.f32 f324, f263, f265; +add.f32 f325, f324, f323; +add.f32 f326, f271, f277; +add.f32 f327, f269, f326; +add.f32 f328, f273, f275; +add.f32 f329, f328, f327; +fma.rn.f32 f330, f322, 0f3E9E377A, f259; +mul.f32 f331, f324, 0f3F4F1BBD; +sub.f32 f332, f330, f331; +sub.f32 f333, f271, f277; +mul.f32 f334, f333, 0f3F737871; +sub.f32 f335, f273, f275; +mul.f32 f336, f335, 0fBF167918; 
+sub.f32 f337, f336, f334; +sub.f32 f338, f332, f337; +add.f32 f339, f337, f332; +mul.f32 f340, f322, 0f3F4F1BBD; +sub.f32 f341, f259, f340; +fma.rn.f32 f342, f324, 0f3E9E377A, f341; +mul.f32 f343, f333, 0f3F167918; +mul.f32 f344, f335, 0f3F737871; +sub.f32 f345, f344, f343; +sub.f32 f346, f342, f345; +add.f32 f347, f345, f342; +fma.rn.f32 f348, f326, 0f3E9E377A, f269; +mul.f32 f349, f328, 0f3F4F1BBD; +sub.f32 f350, f348, f349; +sub.f32 f351, f261, f267; +mul.f32 f352, f351, 0f3F737871; +sub.f32 f353, f263, f265; +mul.f32 f354, f353, 0fBF167918; +sub.f32 f355, f354, f352; +add.f32 f356, f355, f350; +sub.f32 f357, f350, f355; +mul.f32 f358, f326, 0f3F4F1BBD; +sub.f32 f359, f269, f358; +fma.rn.f32 f360, f328, 0f3E9E377A, f359; +mul.f32 f361, f351, 0f3F167918; +mul.f32 f362, f353, 0f3F737871; +sub.f32 f363, f362, f361; +add.f32 f364, f363, f360; +sub.f32 f365, f360, f363; +mul.f32 f366, f338, 0f3F4F1BBD; +mul.f32 f367, f356, 0fBF167918; +sub.f32 f368, f366, f367; +mul.f32 f369, f356, 0f3F4F1BBD; +fma.rn.f32 f370, f338, 0fBF167918, f369; +mul.f32 f371, f346, 0f3E9E377A; +mul.f32 f372, f364, 0fBF737871; +sub.f32 f373, f371, f372; +mul.f32 f374, f364, 0f3E9E377A; +fma.rn.f32 f375, f346, 0fBF737871, f374; +mul.f32 f376, f347, 0fBE9E377A; +mul.f32 f377, f365, 0fBF737871; +sub.f32 f378, f376, f377; +mul.f32 f379, f365, 0fBE9E377A; +fma.rn.f32 f380, f347, 0fBF737871, f379; +mul.f32 f381, f339, 0fBF4F1BBD; +mul.f32 f382, f357, 0fBF167918; +sub.f32 f383, f381, f382; +mul.f32 f384, f357, 0fBF4F1BBD; +fma.rn.f32 f385, f339, 0fBF167918, f384; +add.f32 f386, f281, f325; +add.f32 f387, f285, f329; +sub.f32 f388, f281, f325; +sub.f32 f389, f285, f329; +add.f32 f390, f294, f368; +add.f32 f391, f312, f370; +sub.f32 f392, f294, f368; +sub.f32 f393, f312, f370; +add.f32 f394, f302, f373; +add.f32 f395, f320, f375; +sub.f32 f396, f302, f373; +sub.f32 f397, f320, f375; +add.f32 f398, f303, f378; +add.f32 f399, f321, f380; +sub.f32 f400, f303, f378; +sub.f32 f401, f321, f380; +add.f32 
f402, f295, f383; +add.f32 f403, f313, f385; +sub.f32 f404, f295, f383; +sub.f32 f405, f313, f385; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f406, f407}, [rd11]; +mul.f32 f410, f406, f390; +mul.f32 f411, f407, f391; +sub.f32 f412, f410, f411; +mul.f32 f413, f406, f391; +fma.rn.f32 f414, f407, f390, f413; +mul.f32 f415, f406, f406; +mul.f32 f416, f407, f407; +sub.f32 f417, f415, f416; +mul.f32 f418, f407, f406; +fma.rn.f32 f419, f407, f406, f418; +mul.f32 f420, f417, f394; +mul.f32 f421, f419, f395; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f395; +fma.rn.f32 f424, f419, f394, f423; +mul.f32 f425, f406, f417; +mul.f32 f426, f407, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f406, f419; +fma.rn.f32 f429, f407, f417, f428; +mul.f32 f430, f427, f398; +mul.f32 f431, f429, f399; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f399; +fma.rn.f32 f434, f429, f398, f433; +mul.f32 f435, f406, f427; +mul.f32 f436, f407, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f406, f429; +fma.rn.f32 f439, f407, f427, f438; +mul.f32 f440, f437, f402; +mul.f32 f441, f439, f403; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f403; +fma.rn.f32 f444, f439, f402, f443; +mul.f32 f445, f406, f437; +mul.f32 f446, f407, f439; +sub.f32 f447, f445, f446; +mul.f32 f448, f406, f439; +fma.rn.f32 f449, f407, f437, f448; +mul.f32 f450, f447, f388; +mul.f32 f451, f449, f389; +sub.f32 f452, f450, f451; +mul.f32 f453, f447, f389; +fma.rn.f32 f454, f449, f388, f453; +mul.f32 f455, f406, f447; +mul.f32 f456, f407, f449; +sub.f32 f457, f455, f456; +mul.f32 f458, f406, f449; +fma.rn.f32 f459, f407, f447, f458; +mul.f32 f460, f457, f392; +mul.f32 f461, f459, f393; +sub.f32 f462, f460, f461; +mul.f32 f463, f457, f393; +fma.rn.f32 f464, f459, f392, f463; +mul.f32 f465, f406, f457; +mul.f32 f466, f407, f459; +sub.f32 f467, 
f465, f466; +mul.f32 f468, f406, f459; +fma.rn.f32 f469, f407, f457, f468; +mul.f32 f470, f467, f396; +mul.f32 f471, f469, f397; +sub.f32 f472, f470, f471; +mul.f32 f473, f467, f397; +fma.rn.f32 f474, f469, f396, f473; +mul.f32 f475, f406, f467; +mul.f32 f476, f407, f469; +sub.f32 f477, f475, f476; +mul.f32 f478, f406, f469; +fma.rn.f32 f479, f407, f467, f478; +mul.f32 f480, f477, f400; +mul.f32 f481, f479, f401; +sub.f32 f482, f480, f481; +mul.f32 f483, f477, f401; +fma.rn.f32 f484, f479, f400, f483; +mul.f32 f485, f406, f477; +mul.f32 f486, f407, f479; +sub.f32 f487, f485, f486; +mul.f32 f488, f406, f479; +fma.rn.f32 f489, f407, f477, f488; +mul.f32 f490, f487, f404; +mul.f32 f491, f489, f405; +sub.f32 f492, f490, f491; +mul.f32 f493, f487, f405; +fma.rn.f32 f494, f489, f404, f493; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 400, r15; +st.shared.f32 [r16], f386; +st.shared.f32 [r16+40], f412; +st.shared.f32 [r16+80], f422; +st.shared.f32 [r16+120], f432; +st.shared.f32 [r16+160], f442; +st.shared.f32 [r16+200], f452; +st.shared.f32 [r16+240], f462; +st.shared.f32 [r16+280], f472; +st.shared.f32 [r16+320], f482; +st.shared.f32 [r16+360], f492; +barrier.sync 0; +ld.shared.f32 f495, [r10]; +ld.shared.f32 f496, [r10+4000]; +ld.shared.f32 f497, [r10+8000]; +ld.shared.f32 f498, [r10+12000]; +ld.shared.f32 f499, [r10+16000]; +ld.shared.f32 f500, [r10+20000]; +ld.shared.f32 f501, [r10+24000]; +ld.shared.f32 f502, [r10+28000]; +ld.shared.f32 f503, [r10+32000]; +ld.shared.f32 f504, [r10+36000]; +barrier.sync 0; +st.shared.f32 [r16], f387; +st.shared.f32 [r16+40], f414; +st.shared.f32 [r16+80], f424; +st.shared.f32 [r16+120], f434; +st.shared.f32 [r16+160], f444; +st.shared.f32 [r16+200], f454; +st.shared.f32 [r16+240], f464; +st.shared.f32 [r16+280], f474; +st.shared.f32 [r16+320], f484; +st.shared.f32 [r16+360], f494; +barrier.sync 0; +ld.shared.f32 f505, [r10]; +ld.shared.f32 f506, [r10+4000]; +ld.shared.f32 f507, [r10+8000]; 
+ld.shared.f32 f508, [r10+12000]; +ld.shared.f32 f509, [r10+16000]; +ld.shared.f32 f510, [r10+20000]; +ld.shared.f32 f511, [r10+24000]; +ld.shared.f32 f512, [r10+28000]; +ld.shared.f32 f513, [r10+32000]; +ld.shared.f32 f514, [r10+36000]; +add.f32 f515, f497, f503; +add.f32 f516, f495, f515; +add.f32 f517, f499, f501; +add.f32 f518, f517, f516; +add.f32 f519, f507, f513; +add.f32 f520, f505, f519; +add.f32 f521, f509, f511; +add.f32 f522, f521, f520; +fma.rn.f32 f523, f515, 0f3E9E377A, f495; +mul.f32 f524, f517, 0f3F4F1BBD; +sub.f32 f525, f523, f524; +sub.f32 f526, f507, f513; +mul.f32 f527, f526, 0f3F737871; +sub.f32 f528, f509, f511; +mul.f32 f529, f528, 0fBF167918; +sub.f32 f530, f529, f527; +sub.f32 f531, f525, f530; +add.f32 f532, f530, f525; +mul.f32 f533, f515, 0f3F4F1BBD; +sub.f32 f534, f495, f533; +fma.rn.f32 f535, f517, 0f3E9E377A, f534; +mul.f32 f536, f526, 0f3F167918; +mul.f32 f537, f528, 0f3F737871; +sub.f32 f538, f537, f536; +sub.f32 f539, f535, f538; +add.f32 f540, f538, f535; +fma.rn.f32 f541, f519, 0f3E9E377A, f505; +mul.f32 f542, f521, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f497, f503; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f499, f501; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +add.f32 f549, f548, f543; +sub.f32 f550, f543, f548; +mul.f32 f551, f519, 0f3F4F1BBD; +sub.f32 f552, f505, f551; +fma.rn.f32 f553, f521, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f498, f504; +add.f32 f560, f496, f559; +add.f32 f561, f500, f502; +add.f32 f562, f561, f560; +add.f32 f563, f508, f514; +add.f32 f564, f506, f563; +add.f32 f565, f510, f512; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f496; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f508, f514; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f510, f512; +mul.f32 f573, f572, 0fBF167918; 
+sub.f32 f574, f573, f571; +sub.f32 f575, f569, f574; +add.f32 f576, f574, f569; +mul.f32 f577, f559, 0f3F4F1BBD; +sub.f32 f578, f496, f577; +fma.rn.f32 f579, f561, 0f3E9E377A, f578; +mul.f32 f580, f570, 0f3F167918; +mul.f32 f581, f572, 0f3F737871; +sub.f32 f582, f581, f580; +sub.f32 f583, f579, f582; +add.f32 f584, f582, f579; +fma.rn.f32 f585, f563, 0f3E9E377A, f506; +mul.f32 f586, f565, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f498, f504; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f500, f502; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +add.f32 f593, f592, f587; +sub.f32 f594, f587, f592; +mul.f32 f595, f563, 0f3F4F1BBD; +sub.f32 f596, f506, f595; +fma.rn.f32 f597, f565, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +add.f32 f601, f600, f597; +sub.f32 f602, f597, f600; +mul.f32 f603, f575, 0f3F4F1BBD; +mul.f32 f604, f593, 0fBF167918; +sub.f32 f605, f603, f604; +mul.f32 f606, f593, 0f3F4F1BBD; +fma.rn.f32 f607, f575, 0fBF167918, f606; +mul.f32 f608, f583, 0f3E9E377A; +mul.f32 f609, f601, 0fBF737871; +sub.f32 f610, f608, f609; +mul.f32 f611, f601, 0f3E9E377A; +fma.rn.f32 f612, f583, 0fBF737871, f611; +mul.f32 f613, f584, 0fBE9E377A; +mul.f32 f614, f602, 0fBF737871; +sub.f32 f615, f613, f614; +mul.f32 f616, f602, 0fBE9E377A; +fma.rn.f32 f617, f584, 0fBF737871, f616; +mul.f32 f618, f576, 0fBF4F1BBD; +mul.f32 f619, f594, 0fBF167918; +sub.f32 f620, f618, f619; +mul.f32 f621, f594, 0fBF4F1BBD; +fma.rn.f32 f622, f576, 0fBF167918, f621; +add.f32 f623, f518, f562; +add.f32 f624, f522, f566; +sub.f32 f625, f518, f562; +sub.f32 f626, f522, f566; +add.f32 f627, f531, f605; +add.f32 f628, f549, f607; +sub.f32 f629, f531, f605; +sub.f32 f630, f549, f607; +add.f32 f631, f539, f610; +add.f32 f632, f557, f612; +sub.f32 f633, f539, f610; +sub.f32 f634, f557, f612; +add.f32 f635, f540, f615; +add.f32 f636, f558, f617; +sub.f32 f637, f540, f615; +sub.f32 f638, f558, f617; +add.f32 
f639, f532, f620; +add.f32 f640, f550, f622; +sub.f32 f641, f532, f620; +sub.f32 f642, f550, f622; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 8; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f643, f644}, [rd16]; +mul.f32 f647, f643, f627; +mul.f32 f648, f644, f628; +sub.f32 f649, f647, f648; +mul.f32 f650, f643, f628; +fma.rn.f32 f651, f644, f627, f650; +mul.f32 f652, f643, f643; +mul.f32 f653, f644, f644; +sub.f32 f654, f652, f653; +mul.f32 f655, f644, f643; +fma.rn.f32 f656, f644, f643, f655; +mul.f32 f657, f654, f631; +mul.f32 f658, f656, f632; +sub.f32 f659, f657, f658; +mul.f32 f660, f654, f632; +fma.rn.f32 f661, f656, f631, f660; +mul.f32 f662, f643, f654; +mul.f32 f663, f644, f656; +sub.f32 f664, f662, f663; +mul.f32 f665, f643, f656; +fma.rn.f32 f666, f644, f654, f665; +mul.f32 f667, f664, f635; +mul.f32 f668, f666, f636; +sub.f32 f669, f667, f668; +mul.f32 f670, f664, f636; +fma.rn.f32 f671, f666, f635, f670; +mul.f32 f672, f643, f664; +mul.f32 f673, f644, f666; +sub.f32 f674, f672, f673; +mul.f32 f675, f643, f666; +fma.rn.f32 f676, f644, f664, f675; +mul.f32 f677, f674, f639; +mul.f32 f678, f676, f640; +sub.f32 f679, f677, f678; +mul.f32 f680, f674, f640; +fma.rn.f32 f681, f676, f639, f680; +mul.f32 f682, f643, f674; +mul.f32 f683, f644, f676; +sub.f32 f684, f682, f683; +mul.f32 f685, f643, f676; +fma.rn.f32 f686, f644, f674, f685; +mul.f32 f687, f684, f625; +mul.f32 f688, f686, f626; +sub.f32 f689, f687, f688; +mul.f32 f690, f684, f626; +fma.rn.f32 f691, f686, f625, f690; +mul.f32 f692, f643, f684; +mul.f32 f693, f644, f686; +sub.f32 f694, f692, f693; +mul.f32 f695, f643, f686; +fma.rn.f32 f696, f644, f684, f695; +mul.f32 f697, f694, f629; +mul.f32 f698, f696, f630; +sub.f32 f699, f697, f698; +mul.f32 f700, f694, f630; +fma.rn.f32 f701, f696, f629, f700; +mul.f32 f702, f643, f694; +mul.f32 f703, f644, f696; +sub.f32 
f704, f702, f703; +mul.f32 f705, f643, f696; +fma.rn.f32 f706, f644, f694, f705; +mul.f32 f707, f704, f633; +mul.f32 f708, f706, f634; +sub.f32 f709, f707, f708; +mul.f32 f710, f704, f634; +fma.rn.f32 f711, f706, f633, f710; +mul.f32 f712, f643, f704; +mul.f32 f713, f644, f706; +sub.f32 f714, f712, f713; +mul.f32 f715, f643, f706; +fma.rn.f32 f716, f644, f704, f715; +mul.f32 f717, f714, f637; +mul.f32 f718, f716, f638; +sub.f32 f719, f717, f718; +mul.f32 f720, f714, f638; +fma.rn.f32 f721, f716, f637, f720; +mul.f32 f722, f643, f714; +mul.f32 f723, f644, f716; +sub.f32 f724, f722, f723; +mul.f32 f725, f643, f716; +fma.rn.f32 f726, f644, f714, f725; +mul.f32 f727, f724, f641; +mul.f32 f728, f726, f642; +sub.f32 f729, f727, f728; +mul.f32 f730, f724, f642; +fma.rn.f32 f731, f726, f641, f730; +shl.b32 r20, r19, 2; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 4000, r21; +st.shared.f32 [r22], f623; +st.shared.f32 [r22+400], f649; +st.shared.f32 [r22+800], f659; +st.shared.f32 [r22+1200], f669; +st.shared.f32 [r22+1600], f679; +st.shared.f32 [r22+2000], f689; +st.shared.f32 [r22+2400], f699; +st.shared.f32 [r22+2800], f709; +st.shared.f32 [r22+3200], f719; +st.shared.f32 [r22+3600], f729; +barrier.sync 0; +ld.shared.f32 f732, [r10]; +ld.shared.f32 f733, [r10+4000]; +ld.shared.f32 f734, [r10+8000]; +ld.shared.f32 f735, [r10+12000]; +ld.shared.f32 f736, [r10+16000]; +ld.shared.f32 f737, [r10+20000]; +ld.shared.f32 f738, [r10+24000]; +ld.shared.f32 f739, [r10+28000]; +ld.shared.f32 f740, [r10+32000]; +ld.shared.f32 f741, [r10+36000]; +barrier.sync 0; +st.shared.f32 [r22], f624; +st.shared.f32 [r22+400], f651; +st.shared.f32 [r22+800], f661; +st.shared.f32 [r22+1200], f671; +st.shared.f32 [r22+1600], f681; +st.shared.f32 [r22+2000], f691; +st.shared.f32 [r22+2400], f701; +st.shared.f32 [r22+2800], f711; +st.shared.f32 [r22+3200], f721; +st.shared.f32 [r22+3600], f731; +barrier.sync 0; +ld.shared.f32 f742, [r10]; +ld.shared.f32 f743, [r10+4000]; 
+ld.shared.f32 f744, [r10+8000]; +ld.shared.f32 f745, [r10+12000]; +ld.shared.f32 f746, [r10+16000]; +ld.shared.f32 f747, [r10+20000]; +ld.shared.f32 f748, [r10+24000]; +ld.shared.f32 f749, [r10+28000]; +ld.shared.f32 f750, [r10+32000]; +ld.shared.f32 f751, [r10+36000]; +add.f32 f752, f734, f740; +add.f32 f753, f732, f752; +add.f32 f754, f736, f738; +add.f32 f755, f754, f753; +add.f32 f756, f744, f750; +add.f32 f757, f742, f756; +add.f32 f758, f746, f748; +add.f32 f759, f758, f757; +fma.rn.f32 f760, f752, 0f3E9E377A, f732; +mul.f32 f761, f754, 0f3F4F1BBD; +sub.f32 f762, f760, f761; +sub.f32 f763, f744, f750; +mul.f32 f764, f763, 0f3F737871; +sub.f32 f765, f746, f748; +mul.f32 f766, f765, 0fBF167918; +sub.f32 f767, f766, f764; +sub.f32 f768, f762, f767; +add.f32 f769, f767, f762; +mul.f32 f770, f752, 0f3F4F1BBD; +sub.f32 f771, f732, f770; +fma.rn.f32 f772, f754, 0f3E9E377A, f771; +mul.f32 f773, f763, 0f3F167918; +mul.f32 f774, f765, 0f3F737871; +sub.f32 f775, f774, f773; +sub.f32 f776, f772, f775; +add.f32 f777, f775, f772; +fma.rn.f32 f778, f756, 0f3E9E377A, f742; +mul.f32 f779, f758, 0f3F4F1BBD; +sub.f32 f780, f778, f779; +sub.f32 f781, f734, f740; +mul.f32 f782, f781, 0f3F737871; +sub.f32 f783, f736, f738; +mul.f32 f784, f783, 0fBF167918; +sub.f32 f785, f784, f782; +add.f32 f786, f785, f780; +sub.f32 f787, f780, f785; +mul.f32 f788, f756, 0f3F4F1BBD; +sub.f32 f789, f742, f788; +fma.rn.f32 f790, f758, 0f3E9E377A, f789; +mul.f32 f791, f781, 0f3F167918; +mul.f32 f792, f783, 0f3F737871; +sub.f32 f793, f792, f791; +add.f32 f794, f793, f790; +sub.f32 f795, f790, f793; +add.f32 f796, f735, f741; +add.f32 f797, f733, f796; +add.f32 f798, f737, f739; +add.f32 f799, f798, f797; +add.f32 f800, f745, f751; +add.f32 f801, f743, f800; +add.f32 f802, f747, f749; +add.f32 f803, f802, f801; +fma.rn.f32 f804, f796, 0f3E9E377A, f733; +mul.f32 f805, f798, 0f3F4F1BBD; +sub.f32 f806, f804, f805; +sub.f32 f807, f745, f751; +mul.f32 f808, f807, 0f3F737871; +sub.f32 f809, f747, f749; 
+mul.f32 f810, f809, 0fBF167918; +sub.f32 f811, f810, f808; +sub.f32 f812, f806, f811; +add.f32 f813, f811, f806; +mul.f32 f814, f796, 0f3F4F1BBD; +sub.f32 f815, f733, f814; +fma.rn.f32 f816, f798, 0f3E9E377A, f815; +mul.f32 f817, f807, 0f3F167918; +mul.f32 f818, f809, 0f3F737871; +sub.f32 f819, f818, f817; +sub.f32 f820, f816, f819; +add.f32 f821, f819, f816; +fma.rn.f32 f822, f800, 0f3E9E377A, f743; +mul.f32 f823, f802, 0f3F4F1BBD; +sub.f32 f824, f822, f823; +sub.f32 f825, f735, f741; +mul.f32 f826, f825, 0f3F737871; +sub.f32 f827, f737, f739; +mul.f32 f828, f827, 0fBF167918; +sub.f32 f829, f828, f826; +add.f32 f830, f829, f824; +sub.f32 f831, f824, f829; +mul.f32 f832, f800, 0f3F4F1BBD; +sub.f32 f833, f743, f832; +fma.rn.f32 f834, f802, 0f3E9E377A, f833; +mul.f32 f835, f825, 0f3F167918; +mul.f32 f836, f827, 0f3F737871; +sub.f32 f837, f836, f835; +add.f32 f838, f837, f834; +sub.f32 f839, f834, f837; +mul.f32 f840, f812, 0f3F4F1BBD; +mul.f32 f841, f830, 0fBF167918; +sub.f32 f842, f840, f841; +mul.f32 f843, f830, 0f3F4F1BBD; +fma.rn.f32 f844, f812, 0fBF167918, f843; +mul.f32 f845, f820, 0f3E9E377A; +mul.f32 f846, f838, 0fBF737871; +sub.f32 f847, f845, f846; +mul.f32 f848, f838, 0f3E9E377A; +fma.rn.f32 f849, f820, 0fBF737871, f848; +mul.f32 f850, f821, 0fBE9E377A; +mul.f32 f851, f839, 0fBF737871; +sub.f32 f852, f850, f851; +mul.f32 f853, f839, 0fBE9E377A; +fma.rn.f32 f854, f821, 0fBF737871, f853; +mul.f32 f855, f813, 0fBF4F1BBD; +mul.f32 f856, f831, 0fBF167918; +sub.f32 f857, f855, f856; +mul.f32 f858, f831, 0fBF4F1BBD; +fma.rn.f32 f859, f813, 0fBF167918, f858; +add.f32 %0, f755, f799; +add.f32 %1, f759, f803; +add.f32 %3, f786, f844; +add.f32 %2, f768, f842; +add.f32 %5, f794, f849; +add.f32 %4, f776, f847; +add.f32 %7, f795, f854; +add.f32 %6, f777, f852; +add.f32 %9, f787, f859; +add.f32 %8, f769, f857; +sub.f32 %10, f755, f799; +sub.f32 %11, f759, f803; +sub.f32 %13, f786, f844; +sub.f32 %12, f768, f842; +sub.f32 %15, f794, f849; +sub.f32 %14, f776, f847; 
+sub.f32 %17, f795, f854; +sub.f32 %16, f777, f852; +sub.f32 %19, f787, f859; +sub.f32 %18, f769, f857; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..b6b51374a41e6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp32_inv.hpp.inc @@ -0,0 +1,1800 @@ +#ifndef CUFFTDX_FFT_10000_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_10000_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<397, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<924>; +.reg .b32 r<23>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 80000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %29, %45; +add.f32 f42, %24, f41; +add.f32 f43, %34, %40; +add.f32 f44, f43, f42; +add.f32 f45, %31, %47; +add.f32 f46, %25, f45; 
+add.f32 f47, %36, %41; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %24; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %31, %47; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %36, %41; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %24, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %25; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %29, %45; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %34, %40; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %25, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %32, %48; +add.f32 f84, %26, f83; +add.f32 f85, %37, %42; +add.f32 f86, f85, f84; +add.f32 f87, %33, %49; +add.f32 f88, %28, f87; +add.f32 f89, %39, %44; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %26; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %33, %49; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %39, %44; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %26, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %28; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %32, %48; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %37, %42; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; 
+mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %28, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +sub.f32 f145, f44, f86; +sub.f32 f146, f48, f90; +add.f32 f147, f56, f127; +add.f32 f148, f73, f129; +sub.f32 f149, f56, f127; +sub.f32 f150, f73, f129; +add.f32 f151, f64, f132; +add.f32 f152, f81, f134; +sub.f32 f153, f64, f132; +sub.f32 f154, f81, f134; +add.f32 f155, f65, f137; +add.f32 f156, f82, f139; +sub.f32 f157, f65, f137; +sub.f32 f158, f82, f139; +add.f32 f159, f57, f142; +add.f32 f160, f74, f144; +sub.f32 f161, f57, f142; +sub.f32 f162, f74, f144; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 80000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f163, f164}, [rd6]; +mul.f32 f167, f148, f164; +mul.f32 f168, f147, f164; +mul.f32 f169, f163, f148; +mul.f32 f170, f163, f163; +mul.f32 f171, f164, f164; +sub.f32 f172, f170, f171; +mul.f32 f173, f164, f163; +fma.rn.f32 f174, f164, f163, f173; +mul.f32 f175, f152, f174; +mul.f32 f176, f151, f174; +mul.f32 f177, f172, f152; +mul.f32 f178, f163, f172; +mul.f32 f179, f164, 
f174; +sub.f32 f180, f178, f179; +mul.f32 f181, f163, f174; +fma.rn.f32 f182, f164, f172, f181; +mul.f32 f183, f156, f182; +mul.f32 f184, f155, f182; +mul.f32 f185, f180, f156; +mul.f32 f186, f163, f180; +mul.f32 f187, f164, f182; +sub.f32 f188, f186, f187; +mul.f32 f189, f163, f182; +fma.rn.f32 f190, f164, f180, f189; +mul.f32 f191, f160, f190; +mul.f32 f192, f159, f190; +mul.f32 f193, f188, f160; +mul.f32 f194, f163, f188; +mul.f32 f195, f164, f190; +sub.f32 f196, f194, f195; +mul.f32 f197, f163, f190; +fma.rn.f32 f198, f164, f188, f197; +mul.f32 f199, f146, f198; +mul.f32 f200, f145, f198; +mul.f32 f201, f196, f146; +mul.f32 f202, f163, f196; +mul.f32 f203, f164, f198; +sub.f32 f204, f202, f203; +mul.f32 f205, f163, f198; +fma.rn.f32 f206, f164, f196, f205; +mul.f32 f207, f150, f206; +mul.f32 f208, f149, f206; +mul.f32 f209, f204, f150; +mul.f32 f210, f163, f204; +mul.f32 f211, f164, f206; +sub.f32 f212, f210, f211; +mul.f32 f213, f163, f206; +fma.rn.f32 f214, f164, f204, f213; +mul.f32 f215, f154, f214; +mul.f32 f216, f153, f214; +mul.f32 f217, f212, f154; +mul.f32 f218, f163, f212; +mul.f32 f219, f164, f214; +sub.f32 f220, f218, f219; +mul.f32 f221, f163, f214; +fma.rn.f32 f222, f164, f212, f221; +mul.f32 f223, f158, f222; +mul.f32 f224, f157, f222; +mul.f32 f225, f220, f158; +mul.f32 f226, f163, f220; +mul.f32 f227, f164, f222; +sub.f32 f228, f226, f227; +mul.f32 f229, f163, f222; +fma.rn.f32 f230, f164, f220, f229; +mul.f32 f231, f162, f230; +mul.f32 f232, f161, f230; +mul.f32 f233, f228, f162; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f234, f48, f90; +add.f32 f235, f44, f86; +st.shared.v2.f32 [r9], {f235, f234}; +fma.rn.f32 f236, f163, f147, f167; +sub.f32 f237, f169, f168; +st.shared.v2.f32 [r9+8], {f236, f237}; +fma.rn.f32 f238, f172, f151, f175; +sub.f32 f239, f177, f176; +st.shared.v2.f32 [r9+16], {f238, f239}; +fma.rn.f32 f240, f180, f155, f183; +sub.f32 f241, f185, f184; +st.shared.v2.f32 [r9+24], {f240, f241}; +sub.f32 f242, f193, f192; 
+fma.rn.f32 f243, f188, f159, f191; +st.shared.v2.f32 [r9+32], {f243, f242}; +fma.rn.f32 f244, f196, f145, f199; +sub.f32 f245, f201, f200; +st.shared.v2.f32 [r9+40], {f244, f245}; +fma.rn.f32 f246, f204, f149, f207; +sub.f32 f247, f209, f208; +st.shared.v2.f32 [r9+48], {f246, f247}; +fma.rn.f32 f248, f212, f153, f215; +sub.f32 f249, f217, f216; +st.shared.v2.f32 [r9+56], {f248, f249}; +fma.rn.f32 f250, f220, f157, f223; +sub.f32 f251, f225, f224; +st.shared.v2.f32 [r9+64], {f250, f251}; +fma.rn.f32 f252, f228, f161, f231; +sub.f32 f253, f233, f232; +st.shared.v2.f32 [r9+72], {f252, f253}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f254, f255}, [r10]; +ld.shared.v2.f32 {f258, f259}, [r10+8000]; +ld.shared.v2.f32 {f262, f263}, [r10+16000]; +ld.shared.v2.f32 {f266, f267}, [r10+24000]; +ld.shared.v2.f32 {f270, f271}, [r10+32000]; +ld.shared.v2.f32 {f274, f275}, [r10+40000]; +ld.shared.v2.f32 {f278, f279}, [r10+48000]; +ld.shared.v2.f32 {f282, f283}, [r10+56000]; +ld.shared.v2.f32 {f286, f287}, [r10+64000]; +ld.shared.v2.f32 {f290, f291}, [r10+72000]; +add.f32 f294, f262, f286; +add.f32 f295, f254, f294; +add.f32 f296, f270, f278; +add.f32 f297, f296, f295; +add.f32 f298, f263, f287; +add.f32 f299, f255, f298; +add.f32 f300, f271, f279; +add.f32 f301, f300, f299; +fma.rn.f32 f302, f294, 0f3E9E377A, f254; +mul.f32 f303, f296, 0f3F4F1BBD; +sub.f32 f304, f302, f303; +sub.f32 f305, f263, f287; +mul.f32 f306, f305, 0f3F737871; +sub.f32 f307, f271, f279; +fma.rn.f32 f308, f307, 0f3F167918, f306; +sub.f32 f309, f304, f308; +add.f32 f310, f308, f304; +mul.f32 f311, f294, 0f3F4F1BBD; +sub.f32 f312, f254, f311; +fma.rn.f32 f313, f296, 0f3E9E377A, f312; +mul.f32 f314, f305, 0f3F167918; +mul.f32 f315, f307, 0f3F737871; +sub.f32 f316, f314, f315; +sub.f32 f317, f313, f316; +add.f32 f318, f316, f313; +fma.rn.f32 f319, f298, 0f3E9E377A, f255; +mul.f32 f320, f300, 0f3F4F1BBD; +sub.f32 f321, f319, f320; +sub.f32 f322, f262, f286; +mul.f32 f323, f322, 0f3F737871; 
+sub.f32 f324, f270, f278; +fma.rn.f32 f325, f324, 0f3F167918, f323; +add.f32 f326, f325, f321; +sub.f32 f327, f321, f325; +mul.f32 f328, f298, 0f3F4F1BBD; +sub.f32 f329, f255, f328; +fma.rn.f32 f330, f300, 0f3E9E377A, f329; +mul.f32 f331, f322, 0f3F167918; +mul.f32 f332, f324, 0f3F737871; +sub.f32 f333, f331, f332; +add.f32 f334, f333, f330; +sub.f32 f335, f330, f333; +add.f32 f336, f266, f290; +add.f32 f337, f258, f336; +add.f32 f338, f274, f282; +add.f32 f339, f338, f337; +add.f32 f340, f267, f291; +add.f32 f341, f259, f340; +add.f32 f342, f275, f283; +add.f32 f343, f342, f341; +fma.rn.f32 f344, f336, 0f3E9E377A, f258; +mul.f32 f345, f338, 0f3F4F1BBD; +sub.f32 f346, f344, f345; +sub.f32 f347, f267, f291; +mul.f32 f348, f347, 0f3F737871; +sub.f32 f349, f275, f283; +fma.rn.f32 f350, f349, 0f3F167918, f348; +sub.f32 f351, f346, f350; +add.f32 f352, f350, f346; +mul.f32 f353, f336, 0f3F4F1BBD; +sub.f32 f354, f258, f353; +fma.rn.f32 f355, f338, 0f3E9E377A, f354; +mul.f32 f356, f347, 0f3F167918; +mul.f32 f357, f349, 0f3F737871; +sub.f32 f358, f356, f357; +sub.f32 f359, f355, f358; +add.f32 f360, f358, f355; +fma.rn.f32 f361, f340, 0f3E9E377A, f259; +mul.f32 f362, f342, 0f3F4F1BBD; +sub.f32 f363, f361, f362; +sub.f32 f364, f266, f290; +mul.f32 f365, f364, 0f3F737871; +sub.f32 f366, f274, f282; +fma.rn.f32 f367, f366, 0f3F167918, f365; +add.f32 f368, f367, f363; +sub.f32 f369, f363, f367; +mul.f32 f370, f340, 0f3F4F1BBD; +sub.f32 f371, f259, f370; +fma.rn.f32 f372, f342, 0f3E9E377A, f371; +mul.f32 f373, f364, 0f3F167918; +mul.f32 f374, f366, 0f3F737871; +sub.f32 f375, f373, f374; +add.f32 f376, f375, f372; +sub.f32 f377, f372, f375; +mul.f32 f378, f351, 0f3F4F1BBD; +mul.f32 f379, f368, 0f3F167918; +sub.f32 f380, f378, f379; +mul.f32 f381, f368, 0f3F4F1BBD; +fma.rn.f32 f382, f351, 0f3F167918, f381; +mul.f32 f383, f359, 0f3E9E377A; +mul.f32 f384, f376, 0f3F737871; +sub.f32 f385, f383, f384; +mul.f32 f386, f376, 0f3E9E377A; +fma.rn.f32 f387, f359, 0f3F737871, f386; 
+mul.f32 f388, f360, 0fBE9E377A; +mul.f32 f389, f377, 0f3F737871; +sub.f32 f390, f388, f389; +mul.f32 f391, f377, 0fBE9E377A; +fma.rn.f32 f392, f360, 0f3F737871, f391; +mul.f32 f393, f352, 0fBF4F1BBD; +mul.f32 f394, f369, 0f3F167918; +sub.f32 f395, f393, f394; +mul.f32 f396, f369, 0fBF4F1BBD; +fma.rn.f32 f397, f352, 0f3F167918, f396; +sub.f32 f398, f297, f339; +sub.f32 f399, f301, f343; +add.f32 f400, f309, f380; +add.f32 f401, f326, f382; +sub.f32 f402, f309, f380; +sub.f32 f403, f326, f382; +add.f32 f404, f317, f385; +add.f32 f405, f334, f387; +sub.f32 f406, f317, f385; +sub.f32 f407, f334, f387; +add.f32 f408, f318, f390; +add.f32 f409, f335, f392; +sub.f32 f410, f318, f390; +sub.f32 f411, f335, f392; +add.f32 f412, f310, f395; +add.f32 f413, f327, f397; +sub.f32 f414, f310, f395; +sub.f32 f415, f327, f397; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f416, f417}, [rd11]; +mul.f32 f420, f401, f417; +mul.f32 f421, f400, f417; +mul.f32 f422, f416, f401; +mul.f32 f423, f416, f416; +mul.f32 f424, f417, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f417, f416; +fma.rn.f32 f427, f417, f416, f426; +mul.f32 f428, f405, f427; +mul.f32 f429, f404, f427; +mul.f32 f430, f425, f405; +mul.f32 f431, f416, f425; +mul.f32 f432, f417, f427; +sub.f32 f433, f431, f432; +mul.f32 f434, f416, f427; +fma.rn.f32 f435, f417, f425, f434; +mul.f32 f436, f409, f435; +mul.f32 f437, f408, f435; +mul.f32 f438, f433, f409; +mul.f32 f439, f416, f433; +mul.f32 f440, f417, f435; +sub.f32 f441, f439, f440; +mul.f32 f442, f416, f435; +fma.rn.f32 f443, f417, f433, f442; +mul.f32 f444, f413, f443; +mul.f32 f445, f412, f443; +mul.f32 f446, f441, f413; +mul.f32 f447, f416, f441; +mul.f32 f448, f417, f443; +sub.f32 f449, f447, f448; +mul.f32 f450, f416, f443; +fma.rn.f32 f451, f417, f441, f450; +mul.f32 f452, f399, f451; 
+mul.f32 f453, f398, f451; +mul.f32 f454, f449, f399; +mul.f32 f455, f416, f449; +mul.f32 f456, f417, f451; +sub.f32 f457, f455, f456; +mul.f32 f458, f416, f451; +fma.rn.f32 f459, f417, f449, f458; +mul.f32 f460, f403, f459; +mul.f32 f461, f402, f459; +mul.f32 f462, f457, f403; +mul.f32 f463, f416, f457; +mul.f32 f464, f417, f459; +sub.f32 f465, f463, f464; +mul.f32 f466, f416, f459; +fma.rn.f32 f467, f417, f457, f466; +mul.f32 f468, f407, f467; +mul.f32 f469, f406, f467; +mul.f32 f470, f465, f407; +mul.f32 f471, f416, f465; +mul.f32 f472, f417, f467; +sub.f32 f473, f471, f472; +mul.f32 f474, f416, f467; +fma.rn.f32 f475, f417, f465, f474; +mul.f32 f476, f411, f475; +mul.f32 f477, f410, f475; +mul.f32 f478, f473, f411; +mul.f32 f479, f416, f473; +mul.f32 f480, f417, f475; +sub.f32 f481, f479, f480; +mul.f32 f482, f416, f475; +fma.rn.f32 f483, f417, f473, f482; +mul.f32 f484, f415, f483; +mul.f32 f485, f414, f483; +mul.f32 f486, f481, f415; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +add.f32 f487, f301, f343; +add.f32 f488, f297, f339; +st.shared.v2.f32 [r16], {f488, f487}; +fma.rn.f32 f489, f416, f400, f420; +sub.f32 f490, f422, f421; +st.shared.v2.f32 [r16+80], {f489, f490}; +fma.rn.f32 f491, f425, f404, f428; +sub.f32 f492, f430, f429; +st.shared.v2.f32 [r16+160], {f491, f492}; +fma.rn.f32 f493, f433, f408, f436; +sub.f32 f494, f438, f437; +st.shared.v2.f32 [r16+240], {f493, f494}; +fma.rn.f32 f495, f441, f412, f444; +sub.f32 f496, f446, f445; +st.shared.v2.f32 [r16+320], {f495, f496}; +sub.f32 f497, f454, f453; +fma.rn.f32 f498, f449, f398, f452; +st.shared.v2.f32 [r16+400], {f498, f497}; +sub.f32 f499, f462, f461; +fma.rn.f32 f500, f457, f402, f460; +st.shared.v2.f32 [r16+480], {f500, f499}; +fma.rn.f32 f501, f465, f406, f468; +sub.f32 f502, f470, f469; +st.shared.v2.f32 [r16+560], {f501, f502}; +fma.rn.f32 f503, f473, f410, f476; +sub.f32 f504, f478, f477; +st.shared.v2.f32 [r16+640], {f503, f504}; +fma.rn.f32 
f505, f481, f414, f484; +sub.f32 f506, f486, f485; +st.shared.v2.f32 [r16+720], {f505, f506}; +barrier.sync 0; +ld.shared.v2.f32 {f507, f508}, [r10]; +ld.shared.v2.f32 {f511, f512}, [r10+8000]; +ld.shared.v2.f32 {f515, f516}, [r10+16000]; +ld.shared.v2.f32 {f519, f520}, [r10+24000]; +ld.shared.v2.f32 {f523, f524}, [r10+32000]; +ld.shared.v2.f32 {f527, f528}, [r10+40000]; +ld.shared.v2.f32 {f531, f532}, [r10+48000]; +ld.shared.v2.f32 {f535, f536}, [r10+56000]; +ld.shared.v2.f32 {f539, f540}, [r10+64000]; +ld.shared.v2.f32 {f543, f544}, [r10+72000]; +add.f32 f547, f515, f539; +add.f32 f548, f507, f547; +add.f32 f549, f523, f531; +add.f32 f550, f549, f548; +add.f32 f551, f516, f540; +add.f32 f552, f508, f551; +add.f32 f553, f524, f532; +add.f32 f554, f553, f552; +fma.rn.f32 f555, f547, 0f3E9E377A, f507; +mul.f32 f556, f549, 0f3F4F1BBD; +sub.f32 f557, f555, f556; +sub.f32 f558, f516, f540; +mul.f32 f559, f558, 0f3F737871; +sub.f32 f560, f524, f532; +fma.rn.f32 f561, f560, 0f3F167918, f559; +sub.f32 f562, f557, f561; +add.f32 f563, f561, f557; +mul.f32 f564, f547, 0f3F4F1BBD; +sub.f32 f565, f507, f564; +fma.rn.f32 f566, f549, 0f3E9E377A, f565; +mul.f32 f567, f558, 0f3F167918; +mul.f32 f568, f560, 0f3F737871; +sub.f32 f569, f567, f568; +sub.f32 f570, f566, f569; +add.f32 f571, f569, f566; +fma.rn.f32 f572, f551, 0f3E9E377A, f508; +mul.f32 f573, f553, 0f3F4F1BBD; +sub.f32 f574, f572, f573; +sub.f32 f575, f515, f539; +mul.f32 f576, f575, 0f3F737871; +sub.f32 f577, f523, f531; +fma.rn.f32 f578, f577, 0f3F167918, f576; +add.f32 f579, f578, f574; +sub.f32 f580, f574, f578; +mul.f32 f581, f551, 0f3F4F1BBD; +sub.f32 f582, f508, f581; +fma.rn.f32 f583, f553, 0f3E9E377A, f582; +mul.f32 f584, f575, 0f3F167918; +mul.f32 f585, f577, 0f3F737871; +sub.f32 f586, f584, f585; +add.f32 f587, f586, f583; +sub.f32 f588, f583, f586; +add.f32 f589, f519, f543; +add.f32 f590, f511, f589; +add.f32 f591, f527, f535; +add.f32 f592, f591, f590; +add.f32 f593, f520, f544; +add.f32 f594, f512, f593; 
+add.f32 f595, f528, f536; +add.f32 f596, f595, f594; +fma.rn.f32 f597, f589, 0f3E9E377A, f511; +mul.f32 f598, f591, 0f3F4F1BBD; +sub.f32 f599, f597, f598; +sub.f32 f600, f520, f544; +mul.f32 f601, f600, 0f3F737871; +sub.f32 f602, f528, f536; +fma.rn.f32 f603, f602, 0f3F167918, f601; +sub.f32 f604, f599, f603; +add.f32 f605, f603, f599; +mul.f32 f606, f589, 0f3F4F1BBD; +sub.f32 f607, f511, f606; +fma.rn.f32 f608, f591, 0f3E9E377A, f607; +mul.f32 f609, f600, 0f3F167918; +mul.f32 f610, f602, 0f3F737871; +sub.f32 f611, f609, f610; +sub.f32 f612, f608, f611; +add.f32 f613, f611, f608; +fma.rn.f32 f614, f593, 0f3E9E377A, f512; +mul.f32 f615, f595, 0f3F4F1BBD; +sub.f32 f616, f614, f615; +sub.f32 f617, f519, f543; +mul.f32 f618, f617, 0f3F737871; +sub.f32 f619, f527, f535; +fma.rn.f32 f620, f619, 0f3F167918, f618; +add.f32 f621, f620, f616; +sub.f32 f622, f616, f620; +mul.f32 f623, f593, 0f3F4F1BBD; +sub.f32 f624, f512, f623; +fma.rn.f32 f625, f595, 0f3E9E377A, f624; +mul.f32 f626, f617, 0f3F167918; +mul.f32 f627, f619, 0f3F737871; +sub.f32 f628, f626, f627; +add.f32 f629, f628, f625; +sub.f32 f630, f625, f628; +mul.f32 f631, f604, 0f3F4F1BBD; +mul.f32 f632, f621, 0f3F167918; +sub.f32 f633, f631, f632; +mul.f32 f634, f621, 0f3F4F1BBD; +fma.rn.f32 f635, f604, 0f3F167918, f634; +mul.f32 f636, f612, 0f3E9E377A; +mul.f32 f637, f629, 0f3F737871; +sub.f32 f638, f636, f637; +mul.f32 f639, f629, 0f3E9E377A; +fma.rn.f32 f640, f612, 0f3F737871, f639; +mul.f32 f641, f613, 0fBE9E377A; +mul.f32 f642, f630, 0f3F737871; +sub.f32 f643, f641, f642; +mul.f32 f644, f630, 0fBE9E377A; +fma.rn.f32 f645, f613, 0f3F737871, f644; +mul.f32 f646, f605, 0fBF4F1BBD; +mul.f32 f647, f622, 0f3F167918; +sub.f32 f648, f646, f647; +mul.f32 f649, f622, 0fBF4F1BBD; +fma.rn.f32 f650, f605, 0f3F167918, f649; +sub.f32 f651, f550, f592; +sub.f32 f652, f554, f596; +add.f32 f653, f562, f633; +add.f32 f654, f579, f635; +sub.f32 f655, f562, f633; +sub.f32 f656, f579, f635; +add.f32 f657, f570, f638; +add.f32 f658, 
f587, f640; +sub.f32 f659, f570, f638; +sub.f32 f660, f587, f640; +add.f32 f661, f571, f643; +add.f32 f662, f588, f645; +sub.f32 f663, f571, f643; +sub.f32 f664, f588, f645; +add.f32 f665, f563, f648; +add.f32 f666, f580, f650; +sub.f32 f667, f563, f648; +sub.f32 f668, f580, f650; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 8; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f669, f670}, [rd16]; +mul.f32 f673, f654, f670; +mul.f32 f674, f653, f670; +mul.f32 f675, f669, f654; +mul.f32 f676, f669, f669; +mul.f32 f677, f670, f670; +sub.f32 f678, f676, f677; +mul.f32 f679, f670, f669; +fma.rn.f32 f680, f670, f669, f679; +mul.f32 f681, f658, f680; +mul.f32 f682, f657, f680; +mul.f32 f683, f678, f658; +mul.f32 f684, f669, f678; +mul.f32 f685, f670, f680; +sub.f32 f686, f684, f685; +mul.f32 f687, f669, f680; +fma.rn.f32 f688, f670, f678, f687; +mul.f32 f689, f662, f688; +mul.f32 f690, f661, f688; +mul.f32 f691, f686, f662; +mul.f32 f692, f669, f686; +mul.f32 f693, f670, f688; +sub.f32 f694, f692, f693; +mul.f32 f695, f669, f688; +fma.rn.f32 f696, f670, f686, f695; +mul.f32 f697, f666, f696; +mul.f32 f698, f665, f696; +mul.f32 f699, f694, f666; +mul.f32 f700, f669, f694; +mul.f32 f701, f670, f696; +sub.f32 f702, f700, f701; +mul.f32 f703, f669, f696; +fma.rn.f32 f704, f670, f694, f703; +mul.f32 f705, f652, f704; +mul.f32 f706, f651, f704; +mul.f32 f707, f702, f652; +mul.f32 f708, f669, f702; +mul.f32 f709, f670, f704; +sub.f32 f710, f708, f709; +mul.f32 f711, f669, f704; +fma.rn.f32 f712, f670, f702, f711; +mul.f32 f713, f656, f712; +mul.f32 f714, f655, f712; +mul.f32 f715, f710, f656; +mul.f32 f716, f669, f710; +mul.f32 f717, f670, f712; +sub.f32 f718, f716, f717; +mul.f32 f719, f669, f712; +fma.rn.f32 f720, f670, f710, f719; +mul.f32 f721, f660, f720; +mul.f32 f722, f659, f720; +mul.f32 f723, f718, f660; +mul.f32 f724, f669, f718; +mul.f32 
f725, f670, f720; +sub.f32 f726, f724, f725; +mul.f32 f727, f669, f720; +fma.rn.f32 f728, f670, f718, f727; +mul.f32 f729, f664, f728; +mul.f32 f730, f663, f728; +mul.f32 f731, f726, f664; +mul.f32 f732, f669, f726; +mul.f32 f733, f670, f728; +sub.f32 f734, f732, f733; +mul.f32 f735, f669, f728; +fma.rn.f32 f736, f670, f726, f735; +mul.f32 f737, f668, f736; +mul.f32 f738, f667, f736; +mul.f32 f739, f734, f668; +shl.b32 r20, r19, 3; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 8000, r21; +add.f32 f740, f554, f596; +add.f32 f741, f550, f592; +st.shared.v2.f32 [r22], {f741, f740}; +fma.rn.f32 f742, f669, f653, f673; +sub.f32 f743, f675, f674; +st.shared.v2.f32 [r22+800], {f742, f743}; +fma.rn.f32 f744, f678, f657, f681; +sub.f32 f745, f683, f682; +st.shared.v2.f32 [r22+1600], {f744, f745}; +fma.rn.f32 f746, f686, f661, f689; +sub.f32 f747, f691, f690; +st.shared.v2.f32 [r22+2400], {f746, f747}; +fma.rn.f32 f748, f694, f665, f697; +sub.f32 f749, f699, f698; +st.shared.v2.f32 [r22+3200], {f748, f749}; +sub.f32 f750, f707, f706; +fma.rn.f32 f751, f702, f651, f705; +st.shared.v2.f32 [r22+4000], {f751, f750}; +sub.f32 f752, f715, f714; +fma.rn.f32 f753, f710, f655, f713; +st.shared.v2.f32 [r22+4800], {f753, f752}; +fma.rn.f32 f754, f718, f659, f721; +sub.f32 f755, f723, f722; +st.shared.v2.f32 [r22+5600], {f754, f755}; +fma.rn.f32 f756, f726, f663, f729; +sub.f32 f757, f731, f730; +st.shared.v2.f32 [r22+6400], {f756, f757}; +fma.rn.f32 f758, f734, f667, f737; +sub.f32 f759, f739, f738; +st.shared.v2.f32 [r22+7200], {f758, f759}; +barrier.sync 0; +ld.shared.v2.f32 {f760, f761}, [r10]; +ld.shared.v2.f32 {f764, f765}, [r10+8000]; +ld.shared.v2.f32 {f768, f769}, [r10+16000]; +ld.shared.v2.f32 {f772, f773}, [r10+24000]; +ld.shared.v2.f32 {f776, f777}, [r10+32000]; +ld.shared.v2.f32 {f780, f781}, [r10+40000]; +ld.shared.v2.f32 {f784, f785}, [r10+48000]; +ld.shared.v2.f32 {f788, f789}, [r10+56000]; +ld.shared.v2.f32 {f792, f793}, [r10+64000]; +ld.shared.v2.f32 
{f796, f797}, [r10+72000]; +add.f32 f800, f768, f792; +add.f32 f801, f760, f800; +add.f32 f802, f776, f784; +add.f32 f803, f802, f801; +add.f32 f804, f769, f793; +add.f32 f805, f761, f804; +add.f32 f806, f777, f785; +add.f32 f807, f806, f805; +fma.rn.f32 f808, f800, 0f3E9E377A, f760; +mul.f32 f809, f802, 0f3F4F1BBD; +sub.f32 f810, f808, f809; +sub.f32 f811, f769, f793; +mul.f32 f812, f811, 0f3F737871; +sub.f32 f813, f777, f785; +fma.rn.f32 f814, f813, 0f3F167918, f812; +sub.f32 f815, f810, f814; +add.f32 f816, f814, f810; +mul.f32 f817, f800, 0f3F4F1BBD; +sub.f32 f818, f760, f817; +fma.rn.f32 f819, f802, 0f3E9E377A, f818; +mul.f32 f820, f811, 0f3F167918; +mul.f32 f821, f813, 0f3F737871; +sub.f32 f822, f820, f821; +sub.f32 f823, f819, f822; +add.f32 f824, f822, f819; +fma.rn.f32 f825, f804, 0f3E9E377A, f761; +mul.f32 f826, f806, 0f3F4F1BBD; +sub.f32 f827, f825, f826; +sub.f32 f828, f768, f792; +mul.f32 f829, f828, 0f3F737871; +sub.f32 f830, f776, f784; +fma.rn.f32 f831, f830, 0f3F167918, f829; +add.f32 f832, f831, f827; +sub.f32 f833, f827, f831; +mul.f32 f834, f804, 0f3F4F1BBD; +sub.f32 f835, f761, f834; +fma.rn.f32 f836, f806, 0f3E9E377A, f835; +mul.f32 f837, f828, 0f3F167918; +mul.f32 f838, f830, 0f3F737871; +sub.f32 f839, f837, f838; +add.f32 f840, f839, f836; +sub.f32 f841, f836, f839; +add.f32 f842, f772, f796; +add.f32 f843, f764, f842; +add.f32 f844, f780, f788; +add.f32 f845, f844, f843; +add.f32 f846, f773, f797; +add.f32 f847, f765, f846; +add.f32 f848, f781, f789; +add.f32 f849, f848, f847; +fma.rn.f32 f850, f842, 0f3E9E377A, f764; +mul.f32 f851, f844, 0f3F4F1BBD; +sub.f32 f852, f850, f851; +sub.f32 f853, f773, f797; +mul.f32 f854, f853, 0f3F737871; +sub.f32 f855, f781, f789; +fma.rn.f32 f856, f855, 0f3F167918, f854; +sub.f32 f857, f852, f856; +add.f32 f858, f856, f852; +mul.f32 f859, f842, 0f3F4F1BBD; +sub.f32 f860, f764, f859; +fma.rn.f32 f861, f844, 0f3E9E377A, f860; +mul.f32 f862, f853, 0f3F167918; +mul.f32 f863, f855, 0f3F737871; +sub.f32 f864, 
f862, f863; +sub.f32 f865, f861, f864; +add.f32 f866, f864, f861; +fma.rn.f32 f867, f846, 0f3E9E377A, f765; +mul.f32 f868, f848, 0f3F4F1BBD; +sub.f32 f869, f867, f868; +sub.f32 f870, f772, f796; +mul.f32 f871, f870, 0f3F737871; +sub.f32 f872, f780, f788; +fma.rn.f32 f873, f872, 0f3F167918, f871; +add.f32 f874, f873, f869; +sub.f32 f875, f869, f873; +mul.f32 f876, f846, 0f3F4F1BBD; +sub.f32 f877, f765, f876; +fma.rn.f32 f878, f848, 0f3E9E377A, f877; +mul.f32 f879, f870, 0f3F167918; +mul.f32 f880, f872, 0f3F737871; +sub.f32 f881, f879, f880; +add.f32 f882, f881, f878; +sub.f32 f883, f878, f881; +mul.f32 f884, f857, 0f3F4F1BBD; +mul.f32 f885, f874, 0f3F167918; +sub.f32 f886, f884, f885; +mul.f32 f887, f874, 0f3F4F1BBD; +fma.rn.f32 f888, f857, 0f3F167918, f887; +mul.f32 f889, f865, 0f3E9E377A; +mul.f32 f890, f882, 0f3F737871; +sub.f32 f891, f889, f890; +mul.f32 f892, f882, 0f3E9E377A; +fma.rn.f32 f893, f865, 0f3F737871, f892; +mul.f32 f894, f866, 0fBE9E377A; +mul.f32 f895, f883, 0f3F737871; +sub.f32 f896, f894, f895; +mul.f32 f897, f883, 0fBE9E377A; +fma.rn.f32 f898, f866, 0f3F737871, f897; +mul.f32 f899, f858, 0fBF4F1BBD; +mul.f32 f900, f875, 0f3F167918; +sub.f32 f901, f899, f900; +mul.f32 f902, f875, 0fBF4F1BBD; +fma.rn.f32 f903, f858, 0f3F167918, f902; +add.f32 %1, f807, f849; +add.f32 %0, f803, f845; +add.f32 %3, f832, f888; +add.f32 %2, f815, f886; +add.f32 %5, f840, f893; +add.f32 %4, f823, f891; +add.f32 %7, f841, f898; +add.f32 %6, f824, f896; +add.f32 %9, f833, f903; +add.f32 %8, f816, f901; +sub.f32 %11, f807, f849; +sub.f32 %10, f803, f845; +sub.f32 %13, f832, f888; +sub.f32 %12, f815, f886; +sub.f32 %15, f840, f893; +sub.f32 %14, f823, f891; +sub.f32 %17, f841, f898; +sub.f32 %16, f824, f896; +sub.f32 %19, f833, f903; +sub.f32 %18, f816, f901; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), 
"=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<398, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<864>; +.reg .b32 r<23>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 40000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %29, %45; +add.f32 f42, %24, f41; +add.f32 f43, %34, %40; +add.f32 f44, f43, f42; +add.f32 f45, %31, %47; +add.f32 f46, %25, f45; +add.f32 f47, %36, %41; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %24; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %31, %47; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %36, %41; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %24, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %25; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %29, %45; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %34, %40; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %25, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 
0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %32, %48; +add.f32 f84, %26, f83; +add.f32 f85, %37, %42; +add.f32 f86, f85, f84; +add.f32 f87, %33, %49; +add.f32 f88, %28, f87; +add.f32 f89, %39, %44; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %26; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %33, %49; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %39, %44; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %26, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %28; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %32, %48; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %37, %42; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %28, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, 
f143; +add.f32 f145, f44, f86; +add.f32 f146, f48, f90; +sub.f32 f147, f44, f86; +sub.f32 f148, f48, f90; +add.f32 f149, f56, f127; +add.f32 f150, f73, f129; +sub.f32 f151, f56, f127; +sub.f32 f152, f73, f129; +add.f32 f153, f64, f132; +add.f32 f154, f81, f134; +sub.f32 f155, f64, f132; +sub.f32 f156, f81, f134; +add.f32 f157, f65, f137; +add.f32 f158, f82, f139; +sub.f32 f159, f65, f137; +sub.f32 f160, f82, f139; +add.f32 f161, f57, f142; +add.f32 f162, f74, f144; +sub.f32 f163, f57, f142; +sub.f32 f164, f74, f144; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f165, f166}, [rd6]; +mul.f32 f169, f150, f166; +fma.rn.f32 f170, f165, f149, f169; +mul.f32 f171, f149, f166; +mul.f32 f172, f165, f150; +sub.f32 f173, f172, f171; +mul.f32 f174, f165, f165; +mul.f32 f175, f166, f166; +sub.f32 f176, f174, f175; +mul.f32 f177, f166, f165; +fma.rn.f32 f178, f166, f165, f177; +mul.f32 f179, f154, f178; +fma.rn.f32 f180, f176, f153, f179; +mul.f32 f181, f153, f178; +mul.f32 f182, f176, f154; +sub.f32 f183, f182, f181; +mul.f32 f184, f165, f176; +mul.f32 f185, f166, f178; +sub.f32 f186, f184, f185; +mul.f32 f187, f165, f178; +fma.rn.f32 f188, f166, f176, f187; +mul.f32 f189, f158, f188; +fma.rn.f32 f190, f186, f157, f189; +mul.f32 f191, f157, f188; +mul.f32 f192, f186, f158; +sub.f32 f193, f192, f191; +mul.f32 f194, f165, f186; +mul.f32 f195, f166, f188; +sub.f32 f196, f194, f195; +mul.f32 f197, f165, f188; +fma.rn.f32 f198, f166, f186, f197; +mul.f32 f199, f162, f198; +fma.rn.f32 f200, f196, f161, f199; +mul.f32 f201, f161, f198; +mul.f32 f202, f196, f162; +sub.f32 f203, f202, f201; +mul.f32 f204, f165, f196; +mul.f32 f205, f166, f198; +sub.f32 f206, f204, f205; +mul.f32 f207, f165, f198; +fma.rn.f32 f208, f166, f196, f207; +mul.f32 f209, f148, f208; +fma.rn.f32 f210, f206, f147, f209; +mul.f32 f211, f147, f208; 
+mul.f32 f212, f206, f148; +sub.f32 f213, f212, f211; +mul.f32 f214, f165, f206; +mul.f32 f215, f166, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f165, f208; +fma.rn.f32 f218, f166, f206, f217; +mul.f32 f219, f152, f218; +fma.rn.f32 f220, f216, f151, f219; +mul.f32 f221, f151, f218; +mul.f32 f222, f216, f152; +sub.f32 f223, f222, f221; +mul.f32 f224, f165, f216; +mul.f32 f225, f166, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f165, f218; +fma.rn.f32 f228, f166, f216, f227; +mul.f32 f229, f156, f228; +fma.rn.f32 f230, f226, f155, f229; +mul.f32 f231, f155, f228; +mul.f32 f232, f226, f156; +sub.f32 f233, f232, f231; +mul.f32 f234, f165, f226; +mul.f32 f235, f166, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f165, f228; +fma.rn.f32 f238, f166, f226, f237; +mul.f32 f239, f160, f238; +fma.rn.f32 f240, f236, f159, f239; +mul.f32 f241, f159, f238; +mul.f32 f242, f236, f160; +sub.f32 f243, f242, f241; +mul.f32 f244, f165, f236; +mul.f32 f245, f166, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f165, f238; +fma.rn.f32 f248, f166, f236, f247; +mul.f32 f249, f164, f248; +fma.rn.f32 f250, f246, f163, f249; +mul.f32 f251, f163, f248; +mul.f32 f252, f246, f164; +sub.f32 f253, f252, f251; +mad.lo.s32 r8, r5, 40000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f145, f170}; +st.shared.v2.f32 [r9+8], {f180, f190}; +st.shared.v2.f32 [r9+16], {f200, f210}; +st.shared.v2.f32 [r9+24], {f220, f230}; +st.shared.v2.f32 [r9+32], {f240, f250}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f254, [r10]; +ld.shared.f32 f255, [r10+4000]; +ld.shared.f32 f256, [r10+8000]; +ld.shared.f32 f257, [r10+12000]; +ld.shared.f32 f258, [r10+16000]; +ld.shared.f32 f259, [r10+20000]; +ld.shared.f32 f260, [r10+24000]; +ld.shared.f32 f261, [r10+28000]; +ld.shared.f32 f262, [r10+32000]; +ld.shared.f32 f263, [r10+36000]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f146, f173}; +st.shared.v2.f32 [r9+8], {f183, f193}; +st.shared.v2.f32 [r9+16], {f203, f213}; 
+st.shared.v2.f32 [r9+24], {f223, f233}; +st.shared.v2.f32 [r9+32], {f243, f253}; +barrier.sync 0; +ld.shared.f32 f264, [r10]; +ld.shared.f32 f265, [r10+4000]; +ld.shared.f32 f266, [r10+8000]; +ld.shared.f32 f267, [r10+12000]; +ld.shared.f32 f268, [r10+16000]; +ld.shared.f32 f269, [r10+20000]; +ld.shared.f32 f270, [r10+24000]; +ld.shared.f32 f271, [r10+28000]; +ld.shared.f32 f272, [r10+32000]; +ld.shared.f32 f273, [r10+36000]; +add.f32 f274, f256, f262; +add.f32 f275, f254, f274; +add.f32 f276, f258, f260; +add.f32 f277, f276, f275; +add.f32 f278, f266, f272; +add.f32 f279, f264, f278; +add.f32 f280, f268, f270; +add.f32 f281, f280, f279; +fma.rn.f32 f282, f274, 0f3E9E377A, f254; +mul.f32 f283, f276, 0f3F4F1BBD; +sub.f32 f284, f282, f283; +sub.f32 f285, f266, f272; +mul.f32 f286, f285, 0f3F737871; +sub.f32 f287, f268, f270; +fma.rn.f32 f288, f287, 0f3F167918, f286; +sub.f32 f289, f284, f288; +add.f32 f290, f288, f284; +mul.f32 f291, f274, 0f3F4F1BBD; +sub.f32 f292, f254, f291; +fma.rn.f32 f293, f276, 0f3E9E377A, f292; +mul.f32 f294, f285, 0f3F167918; +mul.f32 f295, f287, 0f3F737871; +sub.f32 f296, f294, f295; +sub.f32 f297, f293, f296; +add.f32 f298, f296, f293; +fma.rn.f32 f299, f278, 0f3E9E377A, f264; +mul.f32 f300, f280, 0f3F4F1BBD; +sub.f32 f301, f299, f300; +sub.f32 f302, f256, f262; +mul.f32 f303, f302, 0f3F737871; +sub.f32 f304, f258, f260; +fma.rn.f32 f305, f304, 0f3F167918, f303; +add.f32 f306, f305, f301; +sub.f32 f307, f301, f305; +mul.f32 f308, f278, 0f3F4F1BBD; +sub.f32 f309, f264, f308; +fma.rn.f32 f310, f280, 0f3E9E377A, f309; +mul.f32 f311, f302, 0f3F167918; +mul.f32 f312, f304, 0f3F737871; +sub.f32 f313, f311, f312; +add.f32 f314, f313, f310; +sub.f32 f315, f310, f313; +add.f32 f316, f257, f263; +add.f32 f317, f255, f316; +add.f32 f318, f259, f261; +add.f32 f319, f318, f317; +add.f32 f320, f267, f273; +add.f32 f321, f265, f320; +add.f32 f322, f269, f271; +add.f32 f323, f322, f321; +fma.rn.f32 f324, f316, 0f3E9E377A, f255; +mul.f32 f325, f318, 
0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f267, f273; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f269, f271; +fma.rn.f32 f330, f329, 0f3F167918, f328; +sub.f32 f331, f326, f330; +add.f32 f332, f330, f326; +mul.f32 f333, f316, 0f3F4F1BBD; +sub.f32 f334, f255, f333; +fma.rn.f32 f335, f318, 0f3E9E377A, f334; +mul.f32 f336, f327, 0f3F167918; +mul.f32 f337, f329, 0f3F737871; +sub.f32 f338, f336, f337; +sub.f32 f339, f335, f338; +add.f32 f340, f338, f335; +fma.rn.f32 f341, f320, 0f3E9E377A, f265; +mul.f32 f342, f322, 0f3F4F1BBD; +sub.f32 f343, f341, f342; +sub.f32 f344, f257, f263; +mul.f32 f345, f344, 0f3F737871; +sub.f32 f346, f259, f261; +fma.rn.f32 f347, f346, 0f3F167918, f345; +add.f32 f348, f347, f343; +sub.f32 f349, f343, f347; +mul.f32 f350, f320, 0f3F4F1BBD; +sub.f32 f351, f265, f350; +fma.rn.f32 f352, f322, 0f3E9E377A, f351; +mul.f32 f353, f344, 0f3F167918; +mul.f32 f354, f346, 0f3F737871; +sub.f32 f355, f353, f354; +add.f32 f356, f355, f352; +sub.f32 f357, f352, f355; +mul.f32 f358, f331, 0f3F4F1BBD; +mul.f32 f359, f348, 0f3F167918; +sub.f32 f360, f358, f359; +mul.f32 f361, f348, 0f3F4F1BBD; +fma.rn.f32 f362, f331, 0f3F167918, f361; +mul.f32 f363, f339, 0f3E9E377A; +mul.f32 f364, f356, 0f3F737871; +sub.f32 f365, f363, f364; +mul.f32 f366, f356, 0f3E9E377A; +fma.rn.f32 f367, f339, 0f3F737871, f366; +mul.f32 f368, f340, 0fBE9E377A; +mul.f32 f369, f357, 0f3F737871; +sub.f32 f370, f368, f369; +mul.f32 f371, f357, 0fBE9E377A; +fma.rn.f32 f372, f340, 0f3F737871, f371; +mul.f32 f373, f332, 0fBF4F1BBD; +mul.f32 f374, f349, 0f3F167918; +sub.f32 f375, f373, f374; +mul.f32 f376, f349, 0fBF4F1BBD; +fma.rn.f32 f377, f332, 0f3F167918, f376; +add.f32 f378, f277, f319; +add.f32 f379, f281, f323; +sub.f32 f380, f277, f319; +sub.f32 f381, f281, f323; +add.f32 f382, f289, f360; +add.f32 f383, f306, f362; +sub.f32 f384, f289, f360; +sub.f32 f385, f306, f362; +add.f32 f386, f297, f365; +add.f32 f387, f314, f367; +sub.f32 f388, f297, f365; +sub.f32 f389, f314, 
f367; +add.f32 f390, f298, f370; +add.f32 f391, f315, f372; +sub.f32 f392, f298, f370; +sub.f32 f393, f315, f372; +add.f32 f394, f290, f375; +add.f32 f395, f307, f377; +sub.f32 f396, f290, f375; +sub.f32 f397, f307, f377; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f398, f399}, [rd11]; +mul.f32 f402, f383, f399; +fma.rn.f32 f403, f398, f382, f402; +mul.f32 f404, f382, f399; +mul.f32 f405, f398, f383; +sub.f32 f406, f405, f404; +mul.f32 f407, f398, f398; +mul.f32 f408, f399, f399; +sub.f32 f409, f407, f408; +mul.f32 f410, f399, f398; +fma.rn.f32 f411, f399, f398, f410; +mul.f32 f412, f387, f411; +fma.rn.f32 f413, f409, f386, f412; +mul.f32 f414, f386, f411; +mul.f32 f415, f409, f387; +sub.f32 f416, f415, f414; +mul.f32 f417, f398, f409; +mul.f32 f418, f399, f411; +sub.f32 f419, f417, f418; +mul.f32 f420, f398, f411; +fma.rn.f32 f421, f399, f409, f420; +mul.f32 f422, f391, f421; +fma.rn.f32 f423, f419, f390, f422; +mul.f32 f424, f390, f421; +mul.f32 f425, f419, f391; +sub.f32 f426, f425, f424; +mul.f32 f427, f398, f419; +mul.f32 f428, f399, f421; +sub.f32 f429, f427, f428; +mul.f32 f430, f398, f421; +fma.rn.f32 f431, f399, f419, f430; +mul.f32 f432, f395, f431; +fma.rn.f32 f433, f429, f394, f432; +mul.f32 f434, f394, f431; +mul.f32 f435, f429, f395; +sub.f32 f436, f435, f434; +mul.f32 f437, f398, f429; +mul.f32 f438, f399, f431; +sub.f32 f439, f437, f438; +mul.f32 f440, f398, f431; +fma.rn.f32 f441, f399, f429, f440; +mul.f32 f442, f381, f441; +fma.rn.f32 f443, f439, f380, f442; +mul.f32 f444, f380, f441; +mul.f32 f445, f439, f381; +sub.f32 f446, f445, f444; +mul.f32 f447, f398, f439; +mul.f32 f448, f399, f441; +sub.f32 f449, f447, f448; +mul.f32 f450, f398, f441; +fma.rn.f32 f451, f399, f439, f450; +mul.f32 f452, f385, f451; +fma.rn.f32 f453, f449, f384, f452; +mul.f32 f454, f384, f451; 
+mul.f32 f455, f449, f385; +sub.f32 f456, f455, f454; +mul.f32 f457, f398, f449; +mul.f32 f458, f399, f451; +sub.f32 f459, f457, f458; +mul.f32 f460, f398, f451; +fma.rn.f32 f461, f399, f449, f460; +mul.f32 f462, f389, f461; +fma.rn.f32 f463, f459, f388, f462; +mul.f32 f464, f388, f461; +mul.f32 f465, f459, f389; +sub.f32 f466, f465, f464; +mul.f32 f467, f398, f459; +mul.f32 f468, f399, f461; +sub.f32 f469, f467, f468; +mul.f32 f470, f398, f461; +fma.rn.f32 f471, f399, f459, f470; +mul.f32 f472, f393, f471; +fma.rn.f32 f473, f469, f392, f472; +mul.f32 f474, f392, f471; +mul.f32 f475, f469, f393; +sub.f32 f476, f475, f474; +mul.f32 f477, f398, f469; +mul.f32 f478, f399, f471; +sub.f32 f479, f477, f478; +mul.f32 f480, f398, f471; +fma.rn.f32 f481, f399, f469, f480; +mul.f32 f482, f397, f481; +fma.rn.f32 f483, f479, f396, f482; +mul.f32 f484, f396, f481; +mul.f32 f485, f479, f397; +sub.f32 f486, f485, f484; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 400, r15; +st.shared.f32 [r16], f378; +st.shared.f32 [r16+40], f403; +st.shared.f32 [r16+80], f413; +st.shared.f32 [r16+120], f423; +st.shared.f32 [r16+160], f433; +st.shared.f32 [r16+200], f443; +st.shared.f32 [r16+240], f453; +st.shared.f32 [r16+280], f463; +st.shared.f32 [r16+320], f473; +st.shared.f32 [r16+360], f483; +barrier.sync 0; +ld.shared.f32 f487, [r10]; +ld.shared.f32 f488, [r10+4000]; +ld.shared.f32 f489, [r10+8000]; +ld.shared.f32 f490, [r10+12000]; +ld.shared.f32 f491, [r10+16000]; +ld.shared.f32 f492, [r10+20000]; +ld.shared.f32 f493, [r10+24000]; +ld.shared.f32 f494, [r10+28000]; +ld.shared.f32 f495, [r10+32000]; +ld.shared.f32 f496, [r10+36000]; +barrier.sync 0; +st.shared.f32 [r16], f379; +st.shared.f32 [r16+40], f406; +st.shared.f32 [r16+80], f416; +st.shared.f32 [r16+120], f426; +st.shared.f32 [r16+160], f436; +st.shared.f32 [r16+200], f446; +st.shared.f32 [r16+240], f456; +st.shared.f32 [r16+280], f466; +st.shared.f32 [r16+320], f476; +st.shared.f32 [r16+360], 
f486; +barrier.sync 0; +ld.shared.f32 f497, [r10]; +ld.shared.f32 f498, [r10+4000]; +ld.shared.f32 f499, [r10+8000]; +ld.shared.f32 f500, [r10+12000]; +ld.shared.f32 f501, [r10+16000]; +ld.shared.f32 f502, [r10+20000]; +ld.shared.f32 f503, [r10+24000]; +ld.shared.f32 f504, [r10+28000]; +ld.shared.f32 f505, [r10+32000]; +ld.shared.f32 f506, [r10+36000]; +add.f32 f507, f489, f495; +add.f32 f508, f487, f507; +add.f32 f509, f491, f493; +add.f32 f510, f509, f508; +add.f32 f511, f499, f505; +add.f32 f512, f497, f511; +add.f32 f513, f501, f503; +add.f32 f514, f513, f512; +fma.rn.f32 f515, f507, 0f3E9E377A, f487; +mul.f32 f516, f509, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f499, f505; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f501, f503; +fma.rn.f32 f521, f520, 0f3F167918, f519; +sub.f32 f522, f517, f521; +add.f32 f523, f521, f517; +mul.f32 f524, f507, 0f3F4F1BBD; +sub.f32 f525, f487, f524; +fma.rn.f32 f526, f509, 0f3E9E377A, f525; +mul.f32 f527, f518, 0f3F167918; +mul.f32 f528, f520, 0f3F737871; +sub.f32 f529, f527, f528; +sub.f32 f530, f526, f529; +add.f32 f531, f529, f526; +fma.rn.f32 f532, f511, 0f3E9E377A, f497; +mul.f32 f533, f513, 0f3F4F1BBD; +sub.f32 f534, f532, f533; +sub.f32 f535, f489, f495; +mul.f32 f536, f535, 0f3F737871; +sub.f32 f537, f491, f493; +fma.rn.f32 f538, f537, 0f3F167918, f536; +add.f32 f539, f538, f534; +sub.f32 f540, f534, f538; +mul.f32 f541, f511, 0f3F4F1BBD; +sub.f32 f542, f497, f541; +fma.rn.f32 f543, f513, 0f3E9E377A, f542; +mul.f32 f544, f535, 0f3F167918; +mul.f32 f545, f537, 0f3F737871; +sub.f32 f546, f544, f545; +add.f32 f547, f546, f543; +sub.f32 f548, f543, f546; +add.f32 f549, f490, f496; +add.f32 f550, f488, f549; +add.f32 f551, f492, f494; +add.f32 f552, f551, f550; +add.f32 f553, f500, f506; +add.f32 f554, f498, f553; +add.f32 f555, f502, f504; +add.f32 f556, f555, f554; +fma.rn.f32 f557, f549, 0f3E9E377A, f488; +mul.f32 f558, f551, 0f3F4F1BBD; +sub.f32 f559, f557, f558; +sub.f32 f560, f500, f506; +mul.f32 f561, 
f560, 0f3F737871; +sub.f32 f562, f502, f504; +fma.rn.f32 f563, f562, 0f3F167918, f561; +sub.f32 f564, f559, f563; +add.f32 f565, f563, f559; +mul.f32 f566, f549, 0f3F4F1BBD; +sub.f32 f567, f488, f566; +fma.rn.f32 f568, f551, 0f3E9E377A, f567; +mul.f32 f569, f560, 0f3F167918; +mul.f32 f570, f562, 0f3F737871; +sub.f32 f571, f569, f570; +sub.f32 f572, f568, f571; +add.f32 f573, f571, f568; +fma.rn.f32 f574, f553, 0f3E9E377A, f498; +mul.f32 f575, f555, 0f3F4F1BBD; +sub.f32 f576, f574, f575; +sub.f32 f577, f490, f496; +mul.f32 f578, f577, 0f3F737871; +sub.f32 f579, f492, f494; +fma.rn.f32 f580, f579, 0f3F167918, f578; +add.f32 f581, f580, f576; +sub.f32 f582, f576, f580; +mul.f32 f583, f553, 0f3F4F1BBD; +sub.f32 f584, f498, f583; +fma.rn.f32 f585, f555, 0f3E9E377A, f584; +mul.f32 f586, f577, 0f3F167918; +mul.f32 f587, f579, 0f3F737871; +sub.f32 f588, f586, f587; +add.f32 f589, f588, f585; +sub.f32 f590, f585, f588; +mul.f32 f591, f564, 0f3F4F1BBD; +mul.f32 f592, f581, 0f3F167918; +sub.f32 f593, f591, f592; +mul.f32 f594, f581, 0f3F4F1BBD; +fma.rn.f32 f595, f564, 0f3F167918, f594; +mul.f32 f596, f572, 0f3E9E377A; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +mul.f32 f599, f589, 0f3E9E377A; +fma.rn.f32 f600, f572, 0f3F737871, f599; +mul.f32 f601, f573, 0fBE9E377A; +mul.f32 f602, f590, 0f3F737871; +sub.f32 f603, f601, f602; +mul.f32 f604, f590, 0fBE9E377A; +fma.rn.f32 f605, f573, 0f3F737871, f604; +mul.f32 f606, f565, 0fBF4F1BBD; +mul.f32 f607, f582, 0f3F167918; +sub.f32 f608, f606, f607; +mul.f32 f609, f582, 0fBF4F1BBD; +fma.rn.f32 f610, f565, 0f3F167918, f609; +add.f32 f611, f510, f552; +add.f32 f612, f514, f556; +sub.f32 f613, f510, f552; +sub.f32 f614, f514, f556; +add.f32 f615, f522, f593; +add.f32 f616, f539, f595; +sub.f32 f617, f522, f593; +sub.f32 f618, f539, f595; +add.f32 f619, f530, f598; +add.f32 f620, f547, f600; +sub.f32 f621, f530, f598; +sub.f32 f622, f547, f600; +add.f32 f623, f531, f603; +add.f32 f624, f548, f605; +sub.f32 f625, f531, 
f603; +sub.f32 f626, f548, f605; +add.f32 f627, f523, f608; +add.f32 f628, f540, f610; +sub.f32 f629, f523, f608; +sub.f32 f630, f540, f610; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 8; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f631, f632}, [rd16]; +mul.f32 f635, f616, f632; +fma.rn.f32 f636, f631, f615, f635; +mul.f32 f637, f615, f632; +mul.f32 f638, f631, f616; +sub.f32 f639, f638, f637; +mul.f32 f640, f631, f631; +mul.f32 f641, f632, f632; +sub.f32 f642, f640, f641; +mul.f32 f643, f632, f631; +fma.rn.f32 f644, f632, f631, f643; +mul.f32 f645, f620, f644; +fma.rn.f32 f646, f642, f619, f645; +mul.f32 f647, f619, f644; +mul.f32 f648, f642, f620; +sub.f32 f649, f648, f647; +mul.f32 f650, f631, f642; +mul.f32 f651, f632, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f631, f644; +fma.rn.f32 f654, f632, f642, f653; +mul.f32 f655, f624, f654; +fma.rn.f32 f656, f652, f623, f655; +mul.f32 f657, f623, f654; +mul.f32 f658, f652, f624; +sub.f32 f659, f658, f657; +mul.f32 f660, f631, f652; +mul.f32 f661, f632, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f631, f654; +fma.rn.f32 f664, f632, f652, f663; +mul.f32 f665, f628, f664; +fma.rn.f32 f666, f662, f627, f665; +mul.f32 f667, f627, f664; +mul.f32 f668, f662, f628; +sub.f32 f669, f668, f667; +mul.f32 f670, f631, f662; +mul.f32 f671, f632, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f631, f664; +fma.rn.f32 f674, f632, f662, f673; +mul.f32 f675, f614, f674; +fma.rn.f32 f676, f672, f613, f675; +mul.f32 f677, f613, f674; +mul.f32 f678, f672, f614; +sub.f32 f679, f678, f677; +mul.f32 f680, f631, f672; +mul.f32 f681, f632, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f631, f674; +fma.rn.f32 f684, f632, f672, f683; +mul.f32 f685, f618, f684; +fma.rn.f32 f686, f682, f617, f685; +mul.f32 f687, f617, f684; +mul.f32 f688, f682, f618; +sub.f32 f689, f688, f687; +mul.f32 f690, f631, 
f682; +mul.f32 f691, f632, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f631, f684; +fma.rn.f32 f694, f632, f682, f693; +mul.f32 f695, f622, f694; +fma.rn.f32 f696, f692, f621, f695; +mul.f32 f697, f621, f694; +mul.f32 f698, f692, f622; +sub.f32 f699, f698, f697; +mul.f32 f700, f631, f692; +mul.f32 f701, f632, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f631, f694; +fma.rn.f32 f704, f632, f692, f703; +mul.f32 f705, f626, f704; +fma.rn.f32 f706, f702, f625, f705; +mul.f32 f707, f625, f704; +mul.f32 f708, f702, f626; +sub.f32 f709, f708, f707; +mul.f32 f710, f631, f702; +mul.f32 f711, f632, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f631, f704; +fma.rn.f32 f714, f632, f702, f713; +mul.f32 f715, f630, f714; +fma.rn.f32 f716, f712, f629, f715; +mul.f32 f717, f629, f714; +mul.f32 f718, f712, f630; +sub.f32 f719, f718, f717; +shl.b32 r20, r19, 2; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 4000, r21; +st.shared.f32 [r22], f611; +st.shared.f32 [r22+400], f636; +st.shared.f32 [r22+800], f646; +st.shared.f32 [r22+1200], f656; +st.shared.f32 [r22+1600], f666; +st.shared.f32 [r22+2000], f676; +st.shared.f32 [r22+2400], f686; +st.shared.f32 [r22+2800], f696; +st.shared.f32 [r22+3200], f706; +st.shared.f32 [r22+3600], f716; +barrier.sync 0; +ld.shared.f32 f720, [r10]; +ld.shared.f32 f721, [r10+4000]; +ld.shared.f32 f722, [r10+8000]; +ld.shared.f32 f723, [r10+12000]; +ld.shared.f32 f724, [r10+16000]; +ld.shared.f32 f725, [r10+20000]; +ld.shared.f32 f726, [r10+24000]; +ld.shared.f32 f727, [r10+28000]; +ld.shared.f32 f728, [r10+32000]; +ld.shared.f32 f729, [r10+36000]; +barrier.sync 0; +st.shared.f32 [r22], f612; +st.shared.f32 [r22+400], f639; +st.shared.f32 [r22+800], f649; +st.shared.f32 [r22+1200], f659; +st.shared.f32 [r22+1600], f669; +st.shared.f32 [r22+2000], f679; +st.shared.f32 [r22+2400], f689; +st.shared.f32 [r22+2800], f699; +st.shared.f32 [r22+3200], f709; +st.shared.f32 [r22+3600], f719; +barrier.sync 0; +ld.shared.f32 f730, [r10]; 
+ld.shared.f32 f731, [r10+4000]; +ld.shared.f32 f732, [r10+8000]; +ld.shared.f32 f733, [r10+12000]; +ld.shared.f32 f734, [r10+16000]; +ld.shared.f32 f735, [r10+20000]; +ld.shared.f32 f736, [r10+24000]; +ld.shared.f32 f737, [r10+28000]; +ld.shared.f32 f738, [r10+32000]; +ld.shared.f32 f739, [r10+36000]; +add.f32 f740, f722, f728; +add.f32 f741, f720, f740; +add.f32 f742, f724, f726; +add.f32 f743, f742, f741; +add.f32 f744, f732, f738; +add.f32 f745, f730, f744; +add.f32 f746, f734, f736; +add.f32 f747, f746, f745; +fma.rn.f32 f748, f740, 0f3E9E377A, f720; +mul.f32 f749, f742, 0f3F4F1BBD; +sub.f32 f750, f748, f749; +sub.f32 f751, f732, f738; +mul.f32 f752, f751, 0f3F737871; +sub.f32 f753, f734, f736; +fma.rn.f32 f754, f753, 0f3F167918, f752; +sub.f32 f755, f750, f754; +add.f32 f756, f754, f750; +mul.f32 f757, f740, 0f3F4F1BBD; +sub.f32 f758, f720, f757; +fma.rn.f32 f759, f742, 0f3E9E377A, f758; +mul.f32 f760, f751, 0f3F167918; +mul.f32 f761, f753, 0f3F737871; +sub.f32 f762, f760, f761; +sub.f32 f763, f759, f762; +add.f32 f764, f762, f759; +fma.rn.f32 f765, f744, 0f3E9E377A, f730; +mul.f32 f766, f746, 0f3F4F1BBD; +sub.f32 f767, f765, f766; +sub.f32 f768, f722, f728; +mul.f32 f769, f768, 0f3F737871; +sub.f32 f770, f724, f726; +fma.rn.f32 f771, f770, 0f3F167918, f769; +add.f32 f772, f771, f767; +sub.f32 f773, f767, f771; +mul.f32 f774, f744, 0f3F4F1BBD; +sub.f32 f775, f730, f774; +fma.rn.f32 f776, f746, 0f3E9E377A, f775; +mul.f32 f777, f768, 0f3F167918; +mul.f32 f778, f770, 0f3F737871; +sub.f32 f779, f777, f778; +add.f32 f780, f779, f776; +sub.f32 f781, f776, f779; +add.f32 f782, f723, f729; +add.f32 f783, f721, f782; +add.f32 f784, f725, f727; +add.f32 f785, f784, f783; +add.f32 f786, f733, f739; +add.f32 f787, f731, f786; +add.f32 f788, f735, f737; +add.f32 f789, f788, f787; +fma.rn.f32 f790, f782, 0f3E9E377A, f721; +mul.f32 f791, f784, 0f3F4F1BBD; +sub.f32 f792, f790, f791; +sub.f32 f793, f733, f739; +mul.f32 f794, f793, 0f3F737871; +sub.f32 f795, f735, f737; 
+fma.rn.f32 f796, f795, 0f3F167918, f794; +sub.f32 f797, f792, f796; +add.f32 f798, f796, f792; +mul.f32 f799, f782, 0f3F4F1BBD; +sub.f32 f800, f721, f799; +fma.rn.f32 f801, f784, 0f3E9E377A, f800; +mul.f32 f802, f793, 0f3F167918; +mul.f32 f803, f795, 0f3F737871; +sub.f32 f804, f802, f803; +sub.f32 f805, f801, f804; +add.f32 f806, f804, f801; +fma.rn.f32 f807, f786, 0f3E9E377A, f731; +mul.f32 f808, f788, 0f3F4F1BBD; +sub.f32 f809, f807, f808; +sub.f32 f810, f723, f729; +mul.f32 f811, f810, 0f3F737871; +sub.f32 f812, f725, f727; +fma.rn.f32 f813, f812, 0f3F167918, f811; +add.f32 f814, f813, f809; +sub.f32 f815, f809, f813; +mul.f32 f816, f786, 0f3F4F1BBD; +sub.f32 f817, f731, f816; +fma.rn.f32 f818, f788, 0f3E9E377A, f817; +mul.f32 f819, f810, 0f3F167918; +mul.f32 f820, f812, 0f3F737871; +sub.f32 f821, f819, f820; +add.f32 f822, f821, f818; +sub.f32 f823, f818, f821; +mul.f32 f824, f797, 0f3F4F1BBD; +mul.f32 f825, f814, 0f3F167918; +sub.f32 f826, f824, f825; +mul.f32 f827, f814, 0f3F4F1BBD; +fma.rn.f32 f828, f797, 0f3F167918, f827; +mul.f32 f829, f805, 0f3E9E377A; +mul.f32 f830, f822, 0f3F737871; +sub.f32 f831, f829, f830; +mul.f32 f832, f822, 0f3E9E377A; +fma.rn.f32 f833, f805, 0f3F737871, f832; +mul.f32 f834, f806, 0fBE9E377A; +mul.f32 f835, f823, 0f3F737871; +sub.f32 f836, f834, f835; +mul.f32 f837, f823, 0fBE9E377A; +fma.rn.f32 f838, f806, 0f3F737871, f837; +mul.f32 f839, f798, 0fBF4F1BBD; +mul.f32 f840, f815, 0f3F167918; +sub.f32 f841, f839, f840; +mul.f32 f842, f815, 0fBF4F1BBD; +fma.rn.f32 f843, f798, 0f3F167918, f842; +add.f32 %0, f743, f785; +add.f32 %1, f747, f789; +add.f32 %3, f772, f828; +add.f32 %2, f755, f826; +add.f32 %5, f780, f833; +add.f32 %4, f763, f831; +add.f32 %7, f781, f838; +add.f32 %6, f764, f836; +add.f32 %9, f773, f843; +add.f32 %8, f756, f841; +sub.f32 %10, f743, f785; +sub.f32 %11, f747, f789; +sub.f32 %13, f772, f828; +sub.f32 %12, f755, f826; +sub.f32 %15, f780, f833; +sub.f32 %14, f763, f831; +sub.f32 %17, f781, f838; +sub.f32 %16, 
f764, f836; +sub.f32 %19, f773, f843; +sub.f32 %18, f756, f841; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_10000), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..6c28c93a7c65b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_fwd.hpp.inc @@ -0,0 +1,1808 @@ +#ifndef CUFFTDX_FFT_10000_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_10000_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1166, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<23>; +.reg .f64 fd<937>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 160000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %29, %45; +add.f64 fd42, %24, fd41; +add.f64 fd43, %34, %40; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %31, %47; +add.f64 fd46, %25, fd45; +add.f64 fd47, %36, %41; 
+add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %24; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %31, %47; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %36, %41; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %24, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %25; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %29, %45; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %34, %40; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %25, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %32, %48; +add.f64 fd86, %26, fd85; +add.f64 fd87, %37, %42; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %33, %49; +add.f64 fd90, %28, fd89; +add.f64 fd91, %39, %44; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %26; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %33, %49; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %39, %44; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %26, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; 
+sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %28; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %32, %48; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %37, %42; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %28, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +sub.f64 fd149, fd44, fd88; +sub.f64 fd150, fd48, fd92; +add.f64 fd151, fd57, fd131; +add.f64 fd152, fd75, fd133; +sub.f64 fd153, fd57, fd131; +sub.f64 fd154, fd75, fd133; +add.f64 fd155, fd65, fd136; +add.f64 fd156, fd83, fd138; +sub.f64 fd157, fd65, fd136; +sub.f64 fd158, fd83, fd138; +add.f64 fd159, fd66, fd141; +add.f64 fd160, fd84, fd143; +sub.f64 fd161, fd66, fd141; +sub.f64 fd162, fd84, fd143; +add.f64 
fd163, fd58, fd146; +add.f64 fd164, fd76, fd148; +sub.f64 fd165, fd58, fd146; +sub.f64 fd166, fd76, fd148; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 160000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd167, fd168}, [rd6]; +mul.f64 fd171, fd167, fd151; +mul.f64 fd172, fd168, fd152; +mul.f64 fd173, fd167, fd152; +mul.f64 fd174, fd167, fd167; +mul.f64 fd175, fd168, fd168; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd168, fd167; +fma.rn.f64 fd178, fd168, fd167, fd177; +mul.f64 fd179, fd176, fd155; +mul.f64 fd180, fd178, fd156; +mul.f64 fd181, fd176, fd156; +mul.f64 fd182, fd167, fd176; +mul.f64 fd183, fd168, fd178; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd167, fd178; +fma.rn.f64 fd186, fd168, fd176, fd185; +mul.f64 fd187, fd184, fd159; +mul.f64 fd188, fd186, fd160; +mul.f64 fd189, fd184, fd160; +mul.f64 fd190, fd167, fd184; +mul.f64 fd191, fd168, fd186; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd167, fd186; +fma.rn.f64 fd194, fd168, fd184, fd193; +mul.f64 fd195, fd192, fd163; +mul.f64 fd196, fd194, fd164; +mul.f64 fd197, fd192, fd164; +mul.f64 fd198, fd167, fd192; +mul.f64 fd199, fd168, fd194; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd167, fd194; +fma.rn.f64 fd202, fd168, fd192, fd201; +mul.f64 fd203, fd200, fd149; +mul.f64 fd204, fd202, fd150; +mul.f64 fd205, fd200, fd150; +ld.global.v2.f64 {fd206, fd207}, [rd6+16000]; +mul.f64 fd210, fd206, fd153; +mul.f64 fd211, fd207, fd154; +mul.f64 fd212, fd206, fd154; +mul.f64 fd213, fd167, fd206; +mul.f64 fd214, fd168, fd207; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd167, fd207; +fma.rn.f64 fd217, fd168, fd206, fd216; +mul.f64 fd218, fd215, fd157; +mul.f64 fd219, fd217, fd158; +mul.f64 fd220, fd215, fd158; +mul.f64 fd221, fd167, fd215; +mul.f64 fd222, fd168, fd217; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd167, fd217; +fma.rn.f64 fd225, fd168, 
fd215, fd224; +mul.f64 fd226, fd223, fd161; +mul.f64 fd227, fd225, fd162; +mul.f64 fd228, fd223, fd162; +mul.f64 fd229, fd167, fd223; +mul.f64 fd230, fd168, fd225; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd167, fd225; +fma.rn.f64 fd233, fd168, fd223, fd232; +mul.f64 fd234, fd231, fd165; +mul.f64 fd235, fd233, fd166; +mul.f64 fd236, fd231, fd166; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd237, fd48, fd92; +add.f64 fd238, fd44, fd88; +st.shared.v2.f64 [r9], {fd238, fd237}; +fma.rn.f64 fd239, fd168, fd151, fd173; +sub.f64 fd240, fd171, fd172; +st.shared.v2.f64 [r9+16], {fd240, fd239}; +fma.rn.f64 fd241, fd178, fd155, fd181; +sub.f64 fd242, fd179, fd180; +st.shared.v2.f64 [r9+32], {fd242, fd241}; +fma.rn.f64 fd243, fd186, fd159, fd189; +sub.f64 fd244, fd187, fd188; +st.shared.v2.f64 [r9+48], {fd244, fd243}; +sub.f64 fd245, fd195, fd196; +fma.rn.f64 fd246, fd194, fd163, fd197; +st.shared.v2.f64 [r9+64], {fd245, fd246}; +fma.rn.f64 fd247, fd202, fd149, fd205; +sub.f64 fd248, fd203, fd204; +st.shared.v2.f64 [r9+80], {fd248, fd247}; +fma.rn.f64 fd249, fd207, fd153, fd212; +sub.f64 fd250, fd210, fd211; +st.shared.v2.f64 [r9+96], {fd250, fd249}; +fma.rn.f64 fd251, fd217, fd157, fd220; +sub.f64 fd252, fd218, fd219; +st.shared.v2.f64 [r9+112], {fd252, fd251}; +fma.rn.f64 fd253, fd225, fd161, fd228; +sub.f64 fd254, fd226, fd227; +st.shared.v2.f64 [r9+128], {fd254, fd253}; +sub.f64 fd255, fd234, fd235; +fma.rn.f64 fd256, fd233, fd165, fd236; +st.shared.v2.f64 [r9+144], {fd255, fd256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd257, fd258}, [r10]; +ld.shared.v2.f64 {fd261, fd262}, [r10+16000]; +ld.shared.v2.f64 {fd265, fd266}, [r10+32000]; +ld.shared.v2.f64 {fd269, fd270}, [r10+48000]; +ld.shared.v2.f64 {fd273, fd274}, [r10+64000]; +ld.shared.v2.f64 {fd277, fd278}, [r10+80000]; +ld.shared.v2.f64 {fd281, fd282}, [r10+96000]; +ld.shared.v2.f64 {fd285, fd286}, [r10+112000]; +ld.shared.v2.f64 {fd289, fd290}, [r10+128000]; 
+ld.shared.v2.f64 {fd293, fd294}, [r10+144000]; +add.f64 fd297, fd265, fd289; +add.f64 fd298, fd257, fd297; +add.f64 fd299, fd273, fd281; +add.f64 fd300, fd299, fd298; +add.f64 fd301, fd266, fd290; +add.f64 fd302, fd258, fd301; +add.f64 fd303, fd274, fd282; +add.f64 fd304, fd303, fd302; +fma.rn.f64 fd305, fd297, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd306, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd307, fd305, fd306; +sub.f64 fd308, fd266, fd290; +mul.f64 fd309, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd310, fd274, fd282; +mul.f64 fd311, fd310, 0dBFE2CF2304755A5E; +sub.f64 fd312, fd311, fd309; +sub.f64 fd313, fd307, fd312; +add.f64 fd314, fd312, fd307; +mul.f64 fd315, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd316, fd257, fd315; +fma.rn.f64 fd317, fd299, 0d3FD3C6EF372FE950, fd316; +mul.f64 fd318, fd308, 0d3FE2CF2304755A5E; +mul.f64 fd319, fd310, 0d3FEE6F0E134454FF; +sub.f64 fd320, fd319, fd318; +sub.f64 fd321, fd317, fd320; +add.f64 fd322, fd320, fd317; +fma.rn.f64 fd323, fd301, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd324, fd303, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd265, fd289; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd273, fd281; +mul.f64 fd329, fd328, 0dBFE2CF2304755A5E; +sub.f64 fd330, fd329, fd327; +add.f64 fd331, fd330, fd325; +sub.f64 fd332, fd325, fd330; +mul.f64 fd333, fd301, 0d3FE9E3779B97F4A8; +sub.f64 fd334, fd258, fd333; +fma.rn.f64 fd335, fd303, 0d3FD3C6EF372FE950, fd334; +mul.f64 fd336, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd337, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd337, fd336; +add.f64 fd339, fd338, fd335; +sub.f64 fd340, fd335, fd338; +add.f64 fd341, fd269, fd293; +add.f64 fd342, fd261, fd341; +add.f64 fd343, fd277, fd285; +add.f64 fd344, fd343, fd342; +add.f64 fd345, fd270, fd294; +add.f64 fd346, fd262, fd345; +add.f64 fd347, fd278, fd286; +add.f64 fd348, fd347, fd346; +fma.rn.f64 fd349, fd341, 0d3FD3C6EF372FE950, fd261; +mul.f64 fd350, fd343, 0d3FE9E3779B97F4A8; +sub.f64 fd351, fd349, fd350; +sub.f64 fd352, 
fd270, fd294; +mul.f64 fd353, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd278, fd286; +mul.f64 fd355, fd354, 0dBFE2CF2304755A5E; +sub.f64 fd356, fd355, fd353; +sub.f64 fd357, fd351, fd356; +add.f64 fd358, fd356, fd351; +mul.f64 fd359, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd360, fd261, fd359; +fma.rn.f64 fd361, fd343, 0d3FD3C6EF372FE950, fd360; +mul.f64 fd362, fd352, 0d3FE2CF2304755A5E; +mul.f64 fd363, fd354, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd363, fd362; +sub.f64 fd365, fd361, fd364; +add.f64 fd366, fd364, fd361; +fma.rn.f64 fd367, fd345, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd368, fd347, 0d3FE9E3779B97F4A8; +sub.f64 fd369, fd367, fd368; +sub.f64 fd370, fd269, fd293; +mul.f64 fd371, fd370, 0d3FEE6F0E134454FF; +sub.f64 fd372, fd277, fd285; +mul.f64 fd373, fd372, 0dBFE2CF2304755A5E; +sub.f64 fd374, fd373, fd371; +add.f64 fd375, fd374, fd369; +sub.f64 fd376, fd369, fd374; +mul.f64 fd377, fd345, 0d3FE9E3779B97F4A8; +sub.f64 fd378, fd262, fd377; +fma.rn.f64 fd379, fd347, 0d3FD3C6EF372FE950, fd378; +mul.f64 fd380, fd370, 0d3FE2CF2304755A5E; +mul.f64 fd381, fd372, 0d3FEE6F0E134454FF; +sub.f64 fd382, fd381, fd380; +add.f64 fd383, fd382, fd379; +sub.f64 fd384, fd379, fd382; +mul.f64 fd385, fd357, 0d3FE9E3779B97F4A8; +mul.f64 fd386, fd375, 0dBFE2CF2304755A5E; +sub.f64 fd387, fd385, fd386; +mul.f64 fd388, fd375, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd389, fd357, 0dBFE2CF2304755A5E, fd388; +mul.f64 fd390, fd365, 0d3FD3C6EF372FE950; +mul.f64 fd391, fd383, 0dBFEE6F0E134454FF; +sub.f64 fd392, fd390, fd391; +mul.f64 fd393, fd383, 0d3FD3C6EF372FE950; +fma.rn.f64 fd394, fd365, 0dBFEE6F0E134454FF, fd393; +mul.f64 fd395, fd366, 0dBFD3C6EF372FE950; +mul.f64 fd396, fd384, 0dBFEE6F0E134454FF; +sub.f64 fd397, fd395, fd396; +mul.f64 fd398, fd384, 0dBFD3C6EF372FE950; +fma.rn.f64 fd399, fd366, 0dBFEE6F0E134454FF, fd398; +mul.f64 fd400, fd358, 0dBFE9E3779B97F4A8; +mul.f64 fd401, fd376, 0dBFE2CF2304755A5E; +sub.f64 fd402, fd400, fd401; +mul.f64 fd403, fd376, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd404, 
fd358, 0dBFE2CF2304755A5E, fd403; +sub.f64 fd405, fd300, fd344; +sub.f64 fd406, fd304, fd348; +add.f64 fd407, fd313, fd387; +add.f64 fd408, fd331, fd389; +sub.f64 fd409, fd313, fd387; +sub.f64 fd410, fd331, fd389; +add.f64 fd411, fd321, fd392; +add.f64 fd412, fd339, fd394; +sub.f64 fd413, fd321, fd392; +sub.f64 fd414, fd339, fd394; +add.f64 fd415, fd322, fd397; +add.f64 fd416, fd340, fd399; +sub.f64 fd417, fd322, fd397; +sub.f64 fd418, fd340, fd399; +add.f64 fd419, fd314, fd402; +add.f64 fd420, fd332, fd404; +sub.f64 fd421, fd314, fd402; +sub.f64 fd422, fd332, fd404; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd423, fd424}, [rd11]; +mul.f64 fd427, fd423, fd407; +mul.f64 fd428, fd424, fd408; +mul.f64 fd429, fd423, fd408; +mul.f64 fd430, fd423, fd423; +mul.f64 fd431, fd424, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd424, fd423; +fma.rn.f64 fd434, fd424, fd423, fd433; +mul.f64 fd435, fd432, fd411; +mul.f64 fd436, fd434, fd412; +mul.f64 fd437, fd432, fd412; +mul.f64 fd438, fd423, fd432; +mul.f64 fd439, fd424, fd434; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd423, fd434; +fma.rn.f64 fd442, fd424, fd432, fd441; +mul.f64 fd443, fd440, fd415; +mul.f64 fd444, fd442, fd416; +mul.f64 fd445, fd440, fd416; +mul.f64 fd446, fd423, fd440; +mul.f64 fd447, fd424, fd442; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd423, fd442; +fma.rn.f64 fd450, fd424, fd440, fd449; +mul.f64 fd451, fd448, fd419; +mul.f64 fd452, fd450, fd420; +mul.f64 fd453, fd448, fd420; +mul.f64 fd454, fd423, fd448; +mul.f64 fd455, fd424, fd450; +sub.f64 fd456, fd454, fd455; +mul.f64 fd457, fd423, fd450; +fma.rn.f64 fd458, fd424, fd448, fd457; +mul.f64 fd459, fd456, fd405; +mul.f64 fd460, fd458, fd406; +mul.f64 fd461, fd456, fd406; +ld.global.v2.f64 {fd462, fd463}, [rd11+1600]; +mul.f64 fd466, fd462, fd409; +mul.f64 
fd467, fd463, fd410; +mul.f64 fd468, fd462, fd410; +mul.f64 fd469, fd423, fd462; +mul.f64 fd470, fd424, fd463; +sub.f64 fd471, fd469, fd470; +mul.f64 fd472, fd423, fd463; +fma.rn.f64 fd473, fd424, fd462, fd472; +mul.f64 fd474, fd471, fd413; +mul.f64 fd475, fd473, fd414; +mul.f64 fd476, fd471, fd414; +mul.f64 fd477, fd423, fd471; +mul.f64 fd478, fd424, fd473; +sub.f64 fd479, fd477, fd478; +mul.f64 fd480, fd423, fd473; +fma.rn.f64 fd481, fd424, fd471, fd480; +mul.f64 fd482, fd479, fd417; +mul.f64 fd483, fd481, fd418; +mul.f64 fd484, fd479, fd418; +mul.f64 fd485, fd423, fd479; +mul.f64 fd486, fd424, fd481; +sub.f64 fd487, fd485, fd486; +mul.f64 fd488, fd423, fd481; +fma.rn.f64 fd489, fd424, fd479, fd488; +mul.f64 fd490, fd487, fd421; +mul.f64 fd491, fd489, fd422; +mul.f64 fd492, fd487, fd422; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1600, r15; +add.f64 fd493, fd304, fd348; +add.f64 fd494, fd300, fd344; +st.shared.v2.f64 [r16], {fd494, fd493}; +fma.rn.f64 fd495, fd424, fd407, fd429; +sub.f64 fd496, fd427, fd428; +st.shared.v2.f64 [r16+160], {fd496, fd495}; +fma.rn.f64 fd497, fd434, fd411, fd437; +sub.f64 fd498, fd435, fd436; +st.shared.v2.f64 [r16+320], {fd498, fd497}; +fma.rn.f64 fd499, fd442, fd415, fd445; +sub.f64 fd500, fd443, fd444; +st.shared.v2.f64 [r16+480], {fd500, fd499}; +fma.rn.f64 fd501, fd450, fd419, fd453; +sub.f64 fd502, fd451, fd452; +st.shared.v2.f64 [r16+640], {fd502, fd501}; +sub.f64 fd503, fd459, fd460; +fma.rn.f64 fd504, fd458, fd405, fd461; +st.shared.v2.f64 [r16+800], {fd503, fd504}; +fma.rn.f64 fd505, fd463, fd409, fd468; +sub.f64 fd506, fd466, fd467; +st.shared.v2.f64 [r16+960], {fd506, fd505}; +fma.rn.f64 fd507, fd473, fd413, fd476; +sub.f64 fd508, fd474, fd475; +st.shared.v2.f64 [r16+1120], {fd508, fd507}; +fma.rn.f64 fd509, fd481, fd417, fd484; +sub.f64 fd510, fd482, fd483; +st.shared.v2.f64 [r16+1280], {fd510, fd509}; +fma.rn.f64 fd511, fd489, fd421, fd492; +sub.f64 fd512, fd490, fd491; 
+st.shared.v2.f64 [r16+1440], {fd512, fd511}; +barrier.sync 0; +ld.shared.v2.f64 {fd513, fd514}, [r10]; +ld.shared.v2.f64 {fd517, fd518}, [r10+16000]; +ld.shared.v2.f64 {fd521, fd522}, [r10+32000]; +ld.shared.v2.f64 {fd525, fd526}, [r10+48000]; +ld.shared.v2.f64 {fd529, fd530}, [r10+64000]; +ld.shared.v2.f64 {fd533, fd534}, [r10+80000]; +ld.shared.v2.f64 {fd537, fd538}, [r10+96000]; +ld.shared.v2.f64 {fd541, fd542}, [r10+112000]; +ld.shared.v2.f64 {fd545, fd546}, [r10+128000]; +ld.shared.v2.f64 {fd549, fd550}, [r10+144000]; +add.f64 fd553, fd521, fd545; +add.f64 fd554, fd513, fd553; +add.f64 fd555, fd529, fd537; +add.f64 fd556, fd555, fd554; +add.f64 fd557, fd522, fd546; +add.f64 fd558, fd514, fd557; +add.f64 fd559, fd530, fd538; +add.f64 fd560, fd559, fd558; +fma.rn.f64 fd561, fd553, 0d3FD3C6EF372FE950, fd513; +mul.f64 fd562, fd555, 0d3FE9E3779B97F4A8; +sub.f64 fd563, fd561, fd562; +sub.f64 fd564, fd522, fd546; +mul.f64 fd565, fd564, 0d3FEE6F0E134454FF; +sub.f64 fd566, fd530, fd538; +mul.f64 fd567, fd566, 0dBFE2CF2304755A5E; +sub.f64 fd568, fd567, fd565; +sub.f64 fd569, fd563, fd568; +add.f64 fd570, fd568, fd563; +mul.f64 fd571, fd553, 0d3FE9E3779B97F4A8; +sub.f64 fd572, fd513, fd571; +fma.rn.f64 fd573, fd555, 0d3FD3C6EF372FE950, fd572; +mul.f64 fd574, fd564, 0d3FE2CF2304755A5E; +mul.f64 fd575, fd566, 0d3FEE6F0E134454FF; +sub.f64 fd576, fd575, fd574; +sub.f64 fd577, fd573, fd576; +add.f64 fd578, fd576, fd573; +fma.rn.f64 fd579, fd557, 0d3FD3C6EF372FE950, fd514; +mul.f64 fd580, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd581, fd579, fd580; +sub.f64 fd582, fd521, fd545; +mul.f64 fd583, fd582, 0d3FEE6F0E134454FF; +sub.f64 fd584, fd529, fd537; +mul.f64 fd585, fd584, 0dBFE2CF2304755A5E; +sub.f64 fd586, fd585, fd583; +add.f64 fd587, fd586, fd581; +sub.f64 fd588, fd581, fd586; +mul.f64 fd589, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd590, fd514, fd589; +fma.rn.f64 fd591, fd559, 0d3FD3C6EF372FE950, fd590; +mul.f64 fd592, fd582, 0d3FE2CF2304755A5E; +mul.f64 fd593, fd584, 
0d3FEE6F0E134454FF; +sub.f64 fd594, fd593, fd592; +add.f64 fd595, fd594, fd591; +sub.f64 fd596, fd591, fd594; +add.f64 fd597, fd525, fd549; +add.f64 fd598, fd517, fd597; +add.f64 fd599, fd533, fd541; +add.f64 fd600, fd599, fd598; +add.f64 fd601, fd526, fd550; +add.f64 fd602, fd518, fd601; +add.f64 fd603, fd534, fd542; +add.f64 fd604, fd603, fd602; +fma.rn.f64 fd605, fd597, 0d3FD3C6EF372FE950, fd517; +mul.f64 fd606, fd599, 0d3FE9E3779B97F4A8; +sub.f64 fd607, fd605, fd606; +sub.f64 fd608, fd526, fd550; +mul.f64 fd609, fd608, 0d3FEE6F0E134454FF; +sub.f64 fd610, fd534, fd542; +mul.f64 fd611, fd610, 0dBFE2CF2304755A5E; +sub.f64 fd612, fd611, fd609; +sub.f64 fd613, fd607, fd612; +add.f64 fd614, fd612, fd607; +mul.f64 fd615, fd597, 0d3FE9E3779B97F4A8; +sub.f64 fd616, fd517, fd615; +fma.rn.f64 fd617, fd599, 0d3FD3C6EF372FE950, fd616; +mul.f64 fd618, fd608, 0d3FE2CF2304755A5E; +mul.f64 fd619, fd610, 0d3FEE6F0E134454FF; +sub.f64 fd620, fd619, fd618; +sub.f64 fd621, fd617, fd620; +add.f64 fd622, fd620, fd617; +fma.rn.f64 fd623, fd601, 0d3FD3C6EF372FE950, fd518; +mul.f64 fd624, fd603, 0d3FE9E3779B97F4A8; +sub.f64 fd625, fd623, fd624; +sub.f64 fd626, fd525, fd549; +mul.f64 fd627, fd626, 0d3FEE6F0E134454FF; +sub.f64 fd628, fd533, fd541; +mul.f64 fd629, fd628, 0dBFE2CF2304755A5E; +sub.f64 fd630, fd629, fd627; +add.f64 fd631, fd630, fd625; +sub.f64 fd632, fd625, fd630; +mul.f64 fd633, fd601, 0d3FE9E3779B97F4A8; +sub.f64 fd634, fd518, fd633; +fma.rn.f64 fd635, fd603, 0d3FD3C6EF372FE950, fd634; +mul.f64 fd636, fd626, 0d3FE2CF2304755A5E; +mul.f64 fd637, fd628, 0d3FEE6F0E134454FF; +sub.f64 fd638, fd637, fd636; +add.f64 fd639, fd638, fd635; +sub.f64 fd640, fd635, fd638; +mul.f64 fd641, fd613, 0d3FE9E3779B97F4A8; +mul.f64 fd642, fd631, 0dBFE2CF2304755A5E; +sub.f64 fd643, fd641, fd642; +mul.f64 fd644, fd631, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd645, fd613, 0dBFE2CF2304755A5E, fd644; +mul.f64 fd646, fd621, 0d3FD3C6EF372FE950; +mul.f64 fd647, fd639, 0dBFEE6F0E134454FF; +sub.f64 fd648, fd646, 
fd647; +mul.f64 fd649, fd639, 0d3FD3C6EF372FE950; +fma.rn.f64 fd650, fd621, 0dBFEE6F0E134454FF, fd649; +mul.f64 fd651, fd622, 0dBFD3C6EF372FE950; +mul.f64 fd652, fd640, 0dBFEE6F0E134454FF; +sub.f64 fd653, fd651, fd652; +mul.f64 fd654, fd640, 0dBFD3C6EF372FE950; +fma.rn.f64 fd655, fd622, 0dBFEE6F0E134454FF, fd654; +mul.f64 fd656, fd614, 0dBFE9E3779B97F4A8; +mul.f64 fd657, fd632, 0dBFE2CF2304755A5E; +sub.f64 fd658, fd656, fd657; +mul.f64 fd659, fd632, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd660, fd614, 0dBFE2CF2304755A5E, fd659; +sub.f64 fd661, fd556, fd600; +sub.f64 fd662, fd560, fd604; +add.f64 fd663, fd569, fd643; +add.f64 fd664, fd587, fd645; +sub.f64 fd665, fd569, fd643; +sub.f64 fd666, fd587, fd645; +add.f64 fd667, fd577, fd648; +add.f64 fd668, fd595, fd650; +sub.f64 fd669, fd577, fd648; +sub.f64 fd670, fd595, fd650; +add.f64 fd671, fd578, fd653; +add.f64 fd672, fd596, fd655; +sub.f64 fd673, fd578, fd653; +sub.f64 fd674, fd596, fd655; +add.f64 fd675, fd570, fd658; +add.f64 fd676, fd588, fd660; +sub.f64 fd677, fd570, fd658; +sub.f64 fd678, fd588, fd660; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 16; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd679, fd680}, [rd16]; +mul.f64 fd683, fd679, fd663; +mul.f64 fd684, fd680, fd664; +mul.f64 fd685, fd679, fd664; +mul.f64 fd686, fd679, fd679; +mul.f64 fd687, fd680, fd680; +sub.f64 fd688, fd686, fd687; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd691, fd688, fd667; +mul.f64 fd692, fd690, fd668; +mul.f64 fd693, fd688, fd668; +mul.f64 fd694, fd679, fd688; +mul.f64 fd695, fd680, fd690; +sub.f64 fd696, fd694, fd695; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd699, fd696, fd671; +mul.f64 fd700, fd698, fd672; +mul.f64 fd701, fd696, fd672; +mul.f64 fd702, fd679, fd696; +mul.f64 fd703, fd680, fd698; +sub.f64 fd704, fd702, fd703; 
+mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd707, fd704, fd675; +mul.f64 fd708, fd706, fd676; +mul.f64 fd709, fd704, fd676; +mul.f64 fd710, fd679, fd704; +mul.f64 fd711, fd680, fd706; +sub.f64 fd712, fd710, fd711; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd715, fd712, fd661; +mul.f64 fd716, fd714, fd662; +mul.f64 fd717, fd712, fd662; +ld.global.v2.f64 {fd718, fd719}, [rd16+160]; +mul.f64 fd722, fd718, fd665; +mul.f64 fd723, fd719, fd666; +mul.f64 fd724, fd718, fd666; +mul.f64 fd725, fd679, fd718; +mul.f64 fd726, fd680, fd719; +sub.f64 fd727, fd725, fd726; +mul.f64 fd728, fd679, fd719; +fma.rn.f64 fd729, fd680, fd718, fd728; +mul.f64 fd730, fd727, fd669; +mul.f64 fd731, fd729, fd670; +mul.f64 fd732, fd727, fd670; +mul.f64 fd733, fd679, fd727; +mul.f64 fd734, fd680, fd729; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd679, fd729; +fma.rn.f64 fd737, fd680, fd727, fd736; +mul.f64 fd738, fd735, fd673; +mul.f64 fd739, fd737, fd674; +mul.f64 fd740, fd735, fd674; +mul.f64 fd741, fd679, fd735; +mul.f64 fd742, fd680, fd737; +sub.f64 fd743, fd741, fd742; +mul.f64 fd744, fd679, fd737; +fma.rn.f64 fd745, fd680, fd735, fd744; +mul.f64 fd746, fd743, fd677; +mul.f64 fd747, fd745, fd678; +mul.f64 fd748, fd743, fd678; +shl.b32 r20, r19, 4; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 16000, r21; +add.f64 fd749, fd560, fd604; +add.f64 fd750, fd556, fd600; +st.shared.v2.f64 [r22], {fd750, fd749}; +fma.rn.f64 fd751, fd680, fd663, fd685; +sub.f64 fd752, fd683, fd684; +st.shared.v2.f64 [r22+1600], {fd752, fd751}; +fma.rn.f64 fd753, fd690, fd667, fd693; +sub.f64 fd754, fd691, fd692; +st.shared.v2.f64 [r22+3200], {fd754, fd753}; +fma.rn.f64 fd755, fd698, fd671, fd701; +sub.f64 fd756, fd699, fd700; +st.shared.v2.f64 [r22+4800], {fd756, fd755}; +fma.rn.f64 fd757, fd706, fd675, fd709; +sub.f64 fd758, fd707, fd708; +st.shared.v2.f64 [r22+6400], {fd758, fd757}; +sub.f64 fd759, fd715, fd716; +fma.rn.f64 
fd760, fd714, fd661, fd717; +st.shared.v2.f64 [r22+8000], {fd759, fd760}; +fma.rn.f64 fd761, fd719, fd665, fd724; +sub.f64 fd762, fd722, fd723; +st.shared.v2.f64 [r22+9600], {fd762, fd761}; +fma.rn.f64 fd763, fd729, fd669, fd732; +sub.f64 fd764, fd730, fd731; +st.shared.v2.f64 [r22+11200], {fd764, fd763}; +fma.rn.f64 fd765, fd737, fd673, fd740; +sub.f64 fd766, fd738, fd739; +st.shared.v2.f64 [r22+12800], {fd766, fd765}; +fma.rn.f64 fd767, fd745, fd677, fd748; +sub.f64 fd768, fd746, fd747; +st.shared.v2.f64 [r22+14400], {fd768, fd767}; +barrier.sync 0; +ld.shared.v2.f64 {fd769, fd770}, [r10]; +ld.shared.v2.f64 {fd773, fd774}, [r10+16000]; +ld.shared.v2.f64 {fd777, fd778}, [r10+32000]; +ld.shared.v2.f64 {fd781, fd782}, [r10+48000]; +ld.shared.v2.f64 {fd785, fd786}, [r10+64000]; +ld.shared.v2.f64 {fd789, fd790}, [r10+80000]; +ld.shared.v2.f64 {fd793, fd794}, [r10+96000]; +ld.shared.v2.f64 {fd797, fd798}, [r10+112000]; +ld.shared.v2.f64 {fd801, fd802}, [r10+128000]; +ld.shared.v2.f64 {fd805, fd806}, [r10+144000]; +add.f64 fd809, fd777, fd801; +add.f64 fd810, fd769, fd809; +add.f64 fd811, fd785, fd793; +add.f64 fd812, fd811, fd810; +add.f64 fd813, fd778, fd802; +add.f64 fd814, fd770, fd813; +add.f64 fd815, fd786, fd794; +add.f64 fd816, fd815, fd814; +fma.rn.f64 fd817, fd809, 0d3FD3C6EF372FE950, fd769; +mul.f64 fd818, fd811, 0d3FE9E3779B97F4A8; +sub.f64 fd819, fd817, fd818; +sub.f64 fd820, fd778, fd802; +mul.f64 fd821, fd820, 0d3FEE6F0E134454FF; +sub.f64 fd822, fd786, fd794; +mul.f64 fd823, fd822, 0dBFE2CF2304755A5E; +sub.f64 fd824, fd823, fd821; +sub.f64 fd825, fd819, fd824; +add.f64 fd826, fd824, fd819; +mul.f64 fd827, fd809, 0d3FE9E3779B97F4A8; +sub.f64 fd828, fd769, fd827; +fma.rn.f64 fd829, fd811, 0d3FD3C6EF372FE950, fd828; +mul.f64 fd830, fd820, 0d3FE2CF2304755A5E; +mul.f64 fd831, fd822, 0d3FEE6F0E134454FF; +sub.f64 fd832, fd831, fd830; +sub.f64 fd833, fd829, fd832; +add.f64 fd834, fd832, fd829; +fma.rn.f64 fd835, fd813, 0d3FD3C6EF372FE950, fd770; +mul.f64 fd836, 
fd815, 0d3FE9E3779B97F4A8; +sub.f64 fd837, fd835, fd836; +sub.f64 fd838, fd777, fd801; +mul.f64 fd839, fd838, 0d3FEE6F0E134454FF; +sub.f64 fd840, fd785, fd793; +mul.f64 fd841, fd840, 0dBFE2CF2304755A5E; +sub.f64 fd842, fd841, fd839; +add.f64 fd843, fd842, fd837; +sub.f64 fd844, fd837, fd842; +mul.f64 fd845, fd813, 0d3FE9E3779B97F4A8; +sub.f64 fd846, fd770, fd845; +fma.rn.f64 fd847, fd815, 0d3FD3C6EF372FE950, fd846; +mul.f64 fd848, fd838, 0d3FE2CF2304755A5E; +mul.f64 fd849, fd840, 0d3FEE6F0E134454FF; +sub.f64 fd850, fd849, fd848; +add.f64 fd851, fd850, fd847; +sub.f64 fd852, fd847, fd850; +add.f64 fd853, fd781, fd805; +add.f64 fd854, fd773, fd853; +add.f64 fd855, fd789, fd797; +add.f64 fd856, fd855, fd854; +add.f64 fd857, fd782, fd806; +add.f64 fd858, fd774, fd857; +add.f64 fd859, fd790, fd798; +add.f64 fd860, fd859, fd858; +fma.rn.f64 fd861, fd853, 0d3FD3C6EF372FE950, fd773; +mul.f64 fd862, fd855, 0d3FE9E3779B97F4A8; +sub.f64 fd863, fd861, fd862; +sub.f64 fd864, fd782, fd806; +mul.f64 fd865, fd864, 0d3FEE6F0E134454FF; +sub.f64 fd866, fd790, fd798; +mul.f64 fd867, fd866, 0dBFE2CF2304755A5E; +sub.f64 fd868, fd867, fd865; +sub.f64 fd869, fd863, fd868; +add.f64 fd870, fd868, fd863; +mul.f64 fd871, fd853, 0d3FE9E3779B97F4A8; +sub.f64 fd872, fd773, fd871; +fma.rn.f64 fd873, fd855, 0d3FD3C6EF372FE950, fd872; +mul.f64 fd874, fd864, 0d3FE2CF2304755A5E; +mul.f64 fd875, fd866, 0d3FEE6F0E134454FF; +sub.f64 fd876, fd875, fd874; +sub.f64 fd877, fd873, fd876; +add.f64 fd878, fd876, fd873; +fma.rn.f64 fd879, fd857, 0d3FD3C6EF372FE950, fd774; +mul.f64 fd880, fd859, 0d3FE9E3779B97F4A8; +sub.f64 fd881, fd879, fd880; +sub.f64 fd882, fd781, fd805; +mul.f64 fd883, fd882, 0d3FEE6F0E134454FF; +sub.f64 fd884, fd789, fd797; +mul.f64 fd885, fd884, 0dBFE2CF2304755A5E; +sub.f64 fd886, fd885, fd883; +add.f64 fd887, fd886, fd881; +sub.f64 fd888, fd881, fd886; +mul.f64 fd889, fd857, 0d3FE9E3779B97F4A8; +sub.f64 fd890, fd774, fd889; +fma.rn.f64 fd891, fd859, 0d3FD3C6EF372FE950, fd890; +mul.f64 
fd892, fd882, 0d3FE2CF2304755A5E; +mul.f64 fd893, fd884, 0d3FEE6F0E134454FF; +sub.f64 fd894, fd893, fd892; +add.f64 fd895, fd894, fd891; +sub.f64 fd896, fd891, fd894; +mul.f64 fd897, fd869, 0d3FE9E3779B97F4A8; +mul.f64 fd898, fd887, 0dBFE2CF2304755A5E; +sub.f64 fd899, fd897, fd898; +mul.f64 fd900, fd887, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd901, fd869, 0dBFE2CF2304755A5E, fd900; +mul.f64 fd902, fd877, 0d3FD3C6EF372FE950; +mul.f64 fd903, fd895, 0dBFEE6F0E134454FF; +sub.f64 fd904, fd902, fd903; +mul.f64 fd905, fd895, 0d3FD3C6EF372FE950; +fma.rn.f64 fd906, fd877, 0dBFEE6F0E134454FF, fd905; +mul.f64 fd907, fd878, 0dBFD3C6EF372FE950; +mul.f64 fd908, fd896, 0dBFEE6F0E134454FF; +sub.f64 fd909, fd907, fd908; +mul.f64 fd910, fd896, 0dBFD3C6EF372FE950; +fma.rn.f64 fd911, fd878, 0dBFEE6F0E134454FF, fd910; +mul.f64 fd912, fd870, 0dBFE9E3779B97F4A8; +mul.f64 fd913, fd888, 0dBFE2CF2304755A5E; +sub.f64 fd914, fd912, fd913; +mul.f64 fd915, fd888, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd916, fd870, 0dBFE2CF2304755A5E, fd915; +add.f64 %1, fd816, fd860; +add.f64 %0, fd812, fd856; +add.f64 %3, fd843, fd901; +add.f64 %2, fd825, fd899; +add.f64 %5, fd851, fd906; +add.f64 %4, fd833, fd904; +add.f64 %7, fd852, fd911; +add.f64 %6, fd834, fd909; +add.f64 %9, fd844, fd916; +add.f64 %8, fd826, fd914; +sub.f64 %11, fd816, fd860; +sub.f64 %10, fd812, fd856; +sub.f64 %13, fd843, fd901; +sub.f64 %12, fd825, fd899; +sub.f64 %15, fd851, fd906; +sub.f64 %14, fd833, fd904; +sub.f64 %17, fd852, fd911; +sub.f64 %16, fd834, fd909; +sub.f64 %19, fd844, fd916; +sub.f64 %18, fd826, fd914; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_10000), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), 
"d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1167, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<23>; +.reg .f64 fd<877>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 80000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %29, %45; +add.f64 fd42, %24, fd41; +add.f64 fd43, %34, %40; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %31, %47; +add.f64 fd46, %25, fd45; +add.f64 fd47, %36, %41; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %24; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %31, %47; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %36, %41; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %24, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %25; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %29, %45; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %34, %40; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %25, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; 
+mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %32, %48; +add.f64 fd86, %26, fd85; +add.f64 fd87, %37, %42; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %33, %49; +add.f64 fd90, %28, fd89; +add.f64 fd91, %39, %44; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %26; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %33, %49; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %39, %44; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %26, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %28; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %32, %48; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %37, %42; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %28, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 
0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +add.f64 fd149, fd44, fd88; +add.f64 fd150, fd48, fd92; +sub.f64 fd151, fd44, fd88; +sub.f64 fd152, fd48, fd92; +add.f64 fd153, fd57, fd131; +add.f64 fd154, fd75, fd133; +sub.f64 fd155, fd57, fd131; +sub.f64 fd156, fd75, fd133; +add.f64 fd157, fd65, fd136; +add.f64 fd158, fd83, fd138; +sub.f64 fd159, fd65, fd136; +sub.f64 fd160, fd83, fd138; +add.f64 fd161, fd66, fd141; +add.f64 fd162, fd84, fd143; +sub.f64 fd163, fd66, fd141; +sub.f64 fd164, fd84, fd143; +add.f64 fd165, fd58, fd146; +add.f64 fd166, fd76, fd148; +sub.f64 fd167, fd58, fd146; +sub.f64 fd168, fd76, fd148; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd169, fd170}, [rd6]; +mul.f64 fd173, fd169, fd153; +mul.f64 fd174, fd170, fd154; +sub.f64 fd175, fd173, fd174; +mul.f64 fd176, fd169, fd154; +fma.rn.f64 fd177, fd170, fd153, fd176; +mul.f64 fd178, fd169, fd169; +mul.f64 fd179, fd170, fd170; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd170, fd169; +fma.rn.f64 fd182, fd170, fd169, fd181; +mul.f64 fd183, fd180, fd157; +mul.f64 fd184, fd182, fd158; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd180, fd158; +fma.rn.f64 fd187, fd182, fd157, fd186; +mul.f64 fd188, fd169, fd180; +mul.f64 fd189, fd170, fd182; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd169, fd182; +fma.rn.f64 fd192, fd170, fd180, fd191; +mul.f64 fd193, fd190, fd161; +mul.f64 fd194, 
fd192, fd162; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd190, fd162; +fma.rn.f64 fd197, fd192, fd161, fd196; +mul.f64 fd198, fd169, fd190; +mul.f64 fd199, fd170, fd192; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd169, fd192; +fma.rn.f64 fd202, fd170, fd190, fd201; +mul.f64 fd203, fd200, fd165; +mul.f64 fd204, fd202, fd166; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd166; +fma.rn.f64 fd207, fd202, fd165, fd206; +mul.f64 fd208, fd169, fd200; +mul.f64 fd209, fd170, fd202; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd169, fd202; +fma.rn.f64 fd212, fd170, fd200, fd211; +mul.f64 fd213, fd210, fd151; +mul.f64 fd214, fd212, fd152; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd210, fd152; +fma.rn.f64 fd217, fd212, fd151, fd216; +ld.global.v2.f64 {fd218, fd219}, [rd6+16000]; +mul.f64 fd222, fd218, fd155; +mul.f64 fd223, fd219, fd156; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd218, fd156; +fma.rn.f64 fd226, fd219, fd155, fd225; +mul.f64 fd227, fd169, fd218; +mul.f64 fd228, fd170, fd219; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd169, fd219; +fma.rn.f64 fd231, fd170, fd218, fd230; +mul.f64 fd232, fd229, fd159; +mul.f64 fd233, fd231, fd160; +sub.f64 fd234, fd232, fd233; +mul.f64 fd235, fd229, fd160; +fma.rn.f64 fd236, fd231, fd159, fd235; +mul.f64 fd237, fd169, fd229; +mul.f64 fd238, fd170, fd231; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd169, fd231; +fma.rn.f64 fd241, fd170, fd229, fd240; +mul.f64 fd242, fd239, fd163; +mul.f64 fd243, fd241, fd164; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd239, fd164; +fma.rn.f64 fd246, fd241, fd163, fd245; +mul.f64 fd247, fd169, fd239; +mul.f64 fd248, fd170, fd241; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd169, fd241; +fma.rn.f64 fd251, fd170, fd239, fd250; +mul.f64 fd252, fd249, fd167; +mul.f64 fd253, fd251, fd168; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd249, fd168; +fma.rn.f64 fd256, fd251, fd167, fd255; +mad.lo.s32 r8, r5, 80000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; 
+st.shared.v2.f64 [r9], {fd149, fd175}; +st.shared.v2.f64 [r9+16], {fd185, fd195}; +st.shared.v2.f64 [r9+32], {fd205, fd215}; +st.shared.v2.f64 [r9+48], {fd224, fd234}; +st.shared.v2.f64 [r9+64], {fd244, fd254}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd257, [r10]; +ld.shared.f64 fd258, [r10+8000]; +ld.shared.f64 fd259, [r10+16000]; +ld.shared.f64 fd260, [r10+24000]; +ld.shared.f64 fd261, [r10+32000]; +ld.shared.f64 fd262, [r10+40000]; +ld.shared.f64 fd263, [r10+48000]; +ld.shared.f64 fd264, [r10+56000]; +ld.shared.f64 fd265, [r10+64000]; +ld.shared.f64 fd266, [r10+72000]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd150, fd177}; +st.shared.v2.f64 [r9+16], {fd187, fd197}; +st.shared.v2.f64 [r9+32], {fd207, fd217}; +st.shared.v2.f64 [r9+48], {fd226, fd236}; +st.shared.v2.f64 [r9+64], {fd246, fd256}; +barrier.sync 0; +ld.shared.f64 fd267, [r10]; +ld.shared.f64 fd268, [r10+8000]; +ld.shared.f64 fd269, [r10+16000]; +ld.shared.f64 fd270, [r10+24000]; +ld.shared.f64 fd271, [r10+32000]; +ld.shared.f64 fd272, [r10+40000]; +ld.shared.f64 fd273, [r10+48000]; +ld.shared.f64 fd274, [r10+56000]; +ld.shared.f64 fd275, [r10+64000]; +ld.shared.f64 fd276, [r10+72000]; +add.f64 fd277, fd259, fd265; +add.f64 fd278, fd257, fd277; +add.f64 fd279, fd261, fd263; +add.f64 fd280, fd279, fd278; +add.f64 fd281, fd269, fd275; +add.f64 fd282, fd267, fd281; +add.f64 fd283, fd271, fd273; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, fd269, fd275; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, fd271, fd273; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd257, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 
0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, fd267; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, fd259, fd265; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd261, fd263; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, fd267, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +add.f64 fd321, fd260, fd266; +add.f64 fd322, fd258, fd321; +add.f64 fd323, fd262, fd264; +add.f64 fd324, fd323, fd322; +add.f64 fd325, fd270, fd276; +add.f64 fd326, fd268, fd325; +add.f64 fd327, fd272, fd274; +add.f64 fd328, fd327, fd326; +fma.rn.f64 fd329, fd321, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd330, fd323, 0d3FE9E3779B97F4A8; +sub.f64 fd331, fd329, fd330; +sub.f64 fd332, fd270, fd276; +mul.f64 fd333, fd332, 0d3FEE6F0E134454FF; +sub.f64 fd334, fd272, fd274; +mul.f64 fd335, fd334, 0dBFE2CF2304755A5E; +sub.f64 fd336, fd335, fd333; +sub.f64 fd337, fd331, fd336; +add.f64 fd338, fd336, fd331; +mul.f64 fd339, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd340, fd258, fd339; +fma.rn.f64 fd341, fd323, 0d3FD3C6EF372FE950, fd340; +mul.f64 fd342, fd332, 0d3FE2CF2304755A5E; +mul.f64 fd343, fd334, 0d3FEE6F0E134454FF; +sub.f64 fd344, fd343, fd342; +sub.f64 fd345, fd341, fd344; +add.f64 fd346, fd344, fd341; +fma.rn.f64 fd347, fd325, 0d3FD3C6EF372FE950, fd268; +mul.f64 fd348, fd327, 0d3FE9E3779B97F4A8; +sub.f64 fd349, fd347, fd348; +sub.f64 fd350, fd260, fd266; +mul.f64 fd351, fd350, 0d3FEE6F0E134454FF; +sub.f64 fd352, fd262, fd264; +mul.f64 fd353, fd352, 0dBFE2CF2304755A5E; +sub.f64 fd354, fd353, fd351; +add.f64 fd355, 
fd354, fd349; +sub.f64 fd356, fd349, fd354; +mul.f64 fd357, fd325, 0d3FE9E3779B97F4A8; +sub.f64 fd358, fd268, fd357; +fma.rn.f64 fd359, fd327, 0d3FD3C6EF372FE950, fd358; +mul.f64 fd360, fd350, 0d3FE2CF2304755A5E; +mul.f64 fd361, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd362, fd361, fd360; +add.f64 fd363, fd362, fd359; +sub.f64 fd364, fd359, fd362; +mul.f64 fd365, fd337, 0d3FE9E3779B97F4A8; +mul.f64 fd366, fd355, 0dBFE2CF2304755A5E; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd355, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd369, fd337, 0dBFE2CF2304755A5E, fd368; +mul.f64 fd370, fd345, 0d3FD3C6EF372FE950; +mul.f64 fd371, fd363, 0dBFEE6F0E134454FF; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd363, 0d3FD3C6EF372FE950; +fma.rn.f64 fd374, fd345, 0dBFEE6F0E134454FF, fd373; +mul.f64 fd375, fd346, 0dBFD3C6EF372FE950; +mul.f64 fd376, fd364, 0dBFEE6F0E134454FF; +sub.f64 fd377, fd375, fd376; +mul.f64 fd378, fd364, 0dBFD3C6EF372FE950; +fma.rn.f64 fd379, fd346, 0dBFEE6F0E134454FF, fd378; +mul.f64 fd380, fd338, 0dBFE9E3779B97F4A8; +mul.f64 fd381, fd356, 0dBFE2CF2304755A5E; +sub.f64 fd382, fd380, fd381; +mul.f64 fd383, fd356, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd384, fd338, 0dBFE2CF2304755A5E, fd383; +add.f64 fd385, fd280, fd324; +add.f64 fd386, fd284, fd328; +sub.f64 fd387, fd280, fd324; +sub.f64 fd388, fd284, fd328; +add.f64 fd389, fd293, fd367; +add.f64 fd390, fd311, fd369; +sub.f64 fd391, fd293, fd367; +sub.f64 fd392, fd311, fd369; +add.f64 fd393, fd301, fd372; +add.f64 fd394, fd319, fd374; +sub.f64 fd395, fd301, fd372; +sub.f64 fd396, fd319, fd374; +add.f64 fd397, fd302, fd377; +add.f64 fd398, fd320, fd379; +sub.f64 fd399, fd302, fd377; +sub.f64 fd400, fd320, fd379; +add.f64 fd401, fd294, fd382; +add.f64 fd402, fd312, fd384; +sub.f64 fd403, fd294, fd382; +sub.f64 fd404, fd312, fd384; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; 
+ld.global.v2.f64 {fd405, fd406}, [rd11]; +mul.f64 fd409, fd405, fd389; +mul.f64 fd410, fd406, fd390; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd405, fd390; +fma.rn.f64 fd413, fd406, fd389, fd412; +mul.f64 fd414, fd405, fd405; +mul.f64 fd415, fd406, fd406; +sub.f64 fd416, fd414, fd415; +mul.f64 fd417, fd406, fd405; +fma.rn.f64 fd418, fd406, fd405, fd417; +mul.f64 fd419, fd416, fd393; +mul.f64 fd420, fd418, fd394; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd416, fd394; +fma.rn.f64 fd423, fd418, fd393, fd422; +mul.f64 fd424, fd405, fd416; +mul.f64 fd425, fd406, fd418; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd405, fd418; +fma.rn.f64 fd428, fd406, fd416, fd427; +mul.f64 fd429, fd426, fd397; +mul.f64 fd430, fd428, fd398; +sub.f64 fd431, fd429, fd430; +mul.f64 fd432, fd426, fd398; +fma.rn.f64 fd433, fd428, fd397, fd432; +mul.f64 fd434, fd405, fd426; +mul.f64 fd435, fd406, fd428; +sub.f64 fd436, fd434, fd435; +mul.f64 fd437, fd405, fd428; +fma.rn.f64 fd438, fd406, fd426, fd437; +mul.f64 fd439, fd436, fd401; +mul.f64 fd440, fd438, fd402; +sub.f64 fd441, fd439, fd440; +mul.f64 fd442, fd436, fd402; +fma.rn.f64 fd443, fd438, fd401, fd442; +mul.f64 fd444, fd405, fd436; +mul.f64 fd445, fd406, fd438; +sub.f64 fd446, fd444, fd445; +mul.f64 fd447, fd405, fd438; +fma.rn.f64 fd448, fd406, fd436, fd447; +mul.f64 fd449, fd446, fd387; +mul.f64 fd450, fd448, fd388; +sub.f64 fd451, fd449, fd450; +mul.f64 fd452, fd446, fd388; +fma.rn.f64 fd453, fd448, fd387, fd452; +ld.global.v2.f64 {fd454, fd455}, [rd11+1600]; +mul.f64 fd458, fd454, fd391; +mul.f64 fd459, fd455, fd392; +sub.f64 fd460, fd458, fd459; +mul.f64 fd461, fd454, fd392; +fma.rn.f64 fd462, fd455, fd391, fd461; +mul.f64 fd463, fd405, fd454; +mul.f64 fd464, fd406, fd455; +sub.f64 fd465, fd463, fd464; +mul.f64 fd466, fd405, fd455; +fma.rn.f64 fd467, fd406, fd454, fd466; +mul.f64 fd468, fd465, fd395; +mul.f64 fd469, fd467, fd396; +sub.f64 fd470, fd468, fd469; +mul.f64 fd471, fd465, fd396; +fma.rn.f64 fd472, fd467, 
fd395, fd471; +mul.f64 fd473, fd405, fd465; +mul.f64 fd474, fd406, fd467; +sub.f64 fd475, fd473, fd474; +mul.f64 fd476, fd405, fd467; +fma.rn.f64 fd477, fd406, fd465, fd476; +mul.f64 fd478, fd475, fd399; +mul.f64 fd479, fd477, fd400; +sub.f64 fd480, fd478, fd479; +mul.f64 fd481, fd475, fd400; +fma.rn.f64 fd482, fd477, fd399, fd481; +mul.f64 fd483, fd405, fd475; +mul.f64 fd484, fd406, fd477; +sub.f64 fd485, fd483, fd484; +mul.f64 fd486, fd405, fd477; +fma.rn.f64 fd487, fd406, fd475, fd486; +mul.f64 fd488, fd485, fd403; +mul.f64 fd489, fd487, fd404; +sub.f64 fd490, fd488, fd489; +mul.f64 fd491, fd485, fd404; +fma.rn.f64 fd492, fd487, fd403, fd491; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +st.shared.f64 [r16], fd385; +st.shared.f64 [r16+80], fd411; +st.shared.f64 [r16+160], fd421; +st.shared.f64 [r16+240], fd431; +st.shared.f64 [r16+320], fd441; +st.shared.f64 [r16+400], fd451; +st.shared.f64 [r16+480], fd460; +st.shared.f64 [r16+560], fd470; +st.shared.f64 [r16+640], fd480; +st.shared.f64 [r16+720], fd490; +barrier.sync 0; +ld.shared.f64 fd493, [r10]; +ld.shared.f64 fd494, [r10+8000]; +ld.shared.f64 fd495, [r10+16000]; +ld.shared.f64 fd496, [r10+24000]; +ld.shared.f64 fd497, [r10+32000]; +ld.shared.f64 fd498, [r10+40000]; +ld.shared.f64 fd499, [r10+48000]; +ld.shared.f64 fd500, [r10+56000]; +ld.shared.f64 fd501, [r10+64000]; +ld.shared.f64 fd502, [r10+72000]; +barrier.sync 0; +st.shared.f64 [r16], fd386; +st.shared.f64 [r16+80], fd413; +st.shared.f64 [r16+160], fd423; +st.shared.f64 [r16+240], fd433; +st.shared.f64 [r16+320], fd443; +st.shared.f64 [r16+400], fd453; +st.shared.f64 [r16+480], fd462; +st.shared.f64 [r16+560], fd472; +st.shared.f64 [r16+640], fd482; +st.shared.f64 [r16+720], fd492; +barrier.sync 0; +ld.shared.f64 fd503, [r10]; +ld.shared.f64 fd504, [r10+8000]; +ld.shared.f64 fd505, [r10+16000]; +ld.shared.f64 fd506, [r10+24000]; +ld.shared.f64 fd507, [r10+32000]; +ld.shared.f64 fd508, [r10+40000]; 
+ld.shared.f64 fd509, [r10+48000]; +ld.shared.f64 fd510, [r10+56000]; +ld.shared.f64 fd511, [r10+64000]; +ld.shared.f64 fd512, [r10+72000]; +add.f64 fd513, fd495, fd501; +add.f64 fd514, fd493, fd513; +add.f64 fd515, fd497, fd499; +add.f64 fd516, fd515, fd514; +add.f64 fd517, fd505, fd511; +add.f64 fd518, fd503, fd517; +add.f64 fd519, fd507, fd509; +add.f64 fd520, fd519, fd518; +fma.rn.f64 fd521, fd513, 0d3FD3C6EF372FE950, fd493; +mul.f64 fd522, fd515, 0d3FE9E3779B97F4A8; +sub.f64 fd523, fd521, fd522; +sub.f64 fd524, fd505, fd511; +mul.f64 fd525, fd524, 0d3FEE6F0E134454FF; +sub.f64 fd526, fd507, fd509; +mul.f64 fd527, fd526, 0dBFE2CF2304755A5E; +sub.f64 fd528, fd527, fd525; +sub.f64 fd529, fd523, fd528; +add.f64 fd530, fd528, fd523; +mul.f64 fd531, fd513, 0d3FE9E3779B97F4A8; +sub.f64 fd532, fd493, fd531; +fma.rn.f64 fd533, fd515, 0d3FD3C6EF372FE950, fd532; +mul.f64 fd534, fd524, 0d3FE2CF2304755A5E; +mul.f64 fd535, fd526, 0d3FEE6F0E134454FF; +sub.f64 fd536, fd535, fd534; +sub.f64 fd537, fd533, fd536; +add.f64 fd538, fd536, fd533; +fma.rn.f64 fd539, fd517, 0d3FD3C6EF372FE950, fd503; +mul.f64 fd540, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd541, fd539, fd540; +sub.f64 fd542, fd495, fd501; +mul.f64 fd543, fd542, 0d3FEE6F0E134454FF; +sub.f64 fd544, fd497, fd499; +mul.f64 fd545, fd544, 0dBFE2CF2304755A5E; +sub.f64 fd546, fd545, fd543; +add.f64 fd547, fd546, fd541; +sub.f64 fd548, fd541, fd546; +mul.f64 fd549, fd517, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd503, fd549; +fma.rn.f64 fd551, fd519, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd542, 0d3FE2CF2304755A5E; +mul.f64 fd553, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd553, fd552; +add.f64 fd555, fd554, fd551; +sub.f64 fd556, fd551, fd554; +add.f64 fd557, fd496, fd502; +add.f64 fd558, fd494, fd557; +add.f64 fd559, fd498, fd500; +add.f64 fd560, fd559, fd558; +add.f64 fd561, fd506, fd512; +add.f64 fd562, fd504, fd561; +add.f64 fd563, fd508, fd510; +add.f64 fd564, fd563, fd562; +fma.rn.f64 fd565, fd557, 0d3FD3C6EF372FE950, 
fd494; +mul.f64 fd566, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd567, fd565, fd566; +sub.f64 fd568, fd506, fd512; +mul.f64 fd569, fd568, 0d3FEE6F0E134454FF; +sub.f64 fd570, fd508, fd510; +mul.f64 fd571, fd570, 0dBFE2CF2304755A5E; +sub.f64 fd572, fd571, fd569; +sub.f64 fd573, fd567, fd572; +add.f64 fd574, fd572, fd567; +mul.f64 fd575, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd576, fd494, fd575; +fma.rn.f64 fd577, fd559, 0d3FD3C6EF372FE950, fd576; +mul.f64 fd578, fd568, 0d3FE2CF2304755A5E; +mul.f64 fd579, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd580, fd579, fd578; +sub.f64 fd581, fd577, fd580; +add.f64 fd582, fd580, fd577; +fma.rn.f64 fd583, fd561, 0d3FD3C6EF372FE950, fd504; +mul.f64 fd584, fd563, 0d3FE9E3779B97F4A8; +sub.f64 fd585, fd583, fd584; +sub.f64 fd586, fd496, fd502; +mul.f64 fd587, fd586, 0d3FEE6F0E134454FF; +sub.f64 fd588, fd498, fd500; +mul.f64 fd589, fd588, 0dBFE2CF2304755A5E; +sub.f64 fd590, fd589, fd587; +add.f64 fd591, fd590, fd585; +sub.f64 fd592, fd585, fd590; +mul.f64 fd593, fd561, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd504, fd593; +fma.rn.f64 fd595, fd563, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd586, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd597, fd596; +add.f64 fd599, fd598, fd595; +sub.f64 fd600, fd595, fd598; +mul.f64 fd601, fd573, 0d3FE9E3779B97F4A8; +mul.f64 fd602, fd591, 0dBFE2CF2304755A5E; +sub.f64 fd603, fd601, fd602; +mul.f64 fd604, fd591, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd605, fd573, 0dBFE2CF2304755A5E, fd604; +mul.f64 fd606, fd581, 0d3FD3C6EF372FE950; +mul.f64 fd607, fd599, 0dBFEE6F0E134454FF; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd599, 0d3FD3C6EF372FE950; +fma.rn.f64 fd610, fd581, 0dBFEE6F0E134454FF, fd609; +mul.f64 fd611, fd582, 0dBFD3C6EF372FE950; +mul.f64 fd612, fd600, 0dBFEE6F0E134454FF; +sub.f64 fd613, fd611, fd612; +mul.f64 fd614, fd600, 0dBFD3C6EF372FE950; +fma.rn.f64 fd615, fd582, 0dBFEE6F0E134454FF, fd614; +mul.f64 fd616, fd574, 0dBFE9E3779B97F4A8; +mul.f64 fd617, fd592, 
0dBFE2CF2304755A5E; +sub.f64 fd618, fd616, fd617; +mul.f64 fd619, fd592, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd620, fd574, 0dBFE2CF2304755A5E, fd619; +add.f64 fd621, fd516, fd560; +add.f64 fd622, fd520, fd564; +sub.f64 fd623, fd516, fd560; +sub.f64 fd624, fd520, fd564; +add.f64 fd625, fd529, fd603; +add.f64 fd626, fd547, fd605; +sub.f64 fd627, fd529, fd603; +sub.f64 fd628, fd547, fd605; +add.f64 fd629, fd537, fd608; +add.f64 fd630, fd555, fd610; +sub.f64 fd631, fd537, fd608; +sub.f64 fd632, fd555, fd610; +add.f64 fd633, fd538, fd613; +add.f64 fd634, fd556, fd615; +sub.f64 fd635, fd538, fd613; +sub.f64 fd636, fd556, fd615; +add.f64 fd637, fd530, fd618; +add.f64 fd638, fd548, fd620; +sub.f64 fd639, fd530, fd618; +sub.f64 fd640, fd548, fd620; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 16; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd641, fd642}, [rd16]; +mul.f64 fd645, fd641, fd625; +mul.f64 fd646, fd642, fd626; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd641, fd626; +fma.rn.f64 fd649, fd642, fd625, fd648; +mul.f64 fd650, fd641, fd641; +mul.f64 fd651, fd642, fd642; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd642, fd641; +fma.rn.f64 fd654, fd642, fd641, fd653; +mul.f64 fd655, fd652, fd629; +mul.f64 fd656, fd654, fd630; +sub.f64 fd657, fd655, fd656; +mul.f64 fd658, fd652, fd630; +fma.rn.f64 fd659, fd654, fd629, fd658; +mul.f64 fd660, fd641, fd652; +mul.f64 fd661, fd642, fd654; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd641, fd654; +fma.rn.f64 fd664, fd642, fd652, fd663; +mul.f64 fd665, fd662, fd633; +mul.f64 fd666, fd664, fd634; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd662, fd634; +fma.rn.f64 fd669, fd664, fd633, fd668; +mul.f64 fd670, fd641, fd662; +mul.f64 fd671, fd642, fd664; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd641, fd664; +fma.rn.f64 fd674, fd642, fd662, fd673; +mul.f64 fd675, fd672, fd637; +mul.f64 
fd676, fd674, fd638; +sub.f64 fd677, fd675, fd676; +mul.f64 fd678, fd672, fd638; +fma.rn.f64 fd679, fd674, fd637, fd678; +mul.f64 fd680, fd641, fd672; +mul.f64 fd681, fd642, fd674; +sub.f64 fd682, fd680, fd681; +mul.f64 fd683, fd641, fd674; +fma.rn.f64 fd684, fd642, fd672, fd683; +mul.f64 fd685, fd682, fd623; +mul.f64 fd686, fd684, fd624; +sub.f64 fd687, fd685, fd686; +mul.f64 fd688, fd682, fd624; +fma.rn.f64 fd689, fd684, fd623, fd688; +ld.global.v2.f64 {fd690, fd691}, [rd16+160]; +mul.f64 fd694, fd690, fd627; +mul.f64 fd695, fd691, fd628; +sub.f64 fd696, fd694, fd695; +mul.f64 fd697, fd690, fd628; +fma.rn.f64 fd698, fd691, fd627, fd697; +mul.f64 fd699, fd641, fd690; +mul.f64 fd700, fd642, fd691; +sub.f64 fd701, fd699, fd700; +mul.f64 fd702, fd641, fd691; +fma.rn.f64 fd703, fd642, fd690, fd702; +mul.f64 fd704, fd701, fd631; +mul.f64 fd705, fd703, fd632; +sub.f64 fd706, fd704, fd705; +mul.f64 fd707, fd701, fd632; +fma.rn.f64 fd708, fd703, fd631, fd707; +mul.f64 fd709, fd641, fd701; +mul.f64 fd710, fd642, fd703; +sub.f64 fd711, fd709, fd710; +mul.f64 fd712, fd641, fd703; +fma.rn.f64 fd713, fd642, fd701, fd712; +mul.f64 fd714, fd711, fd635; +mul.f64 fd715, fd713, fd636; +sub.f64 fd716, fd714, fd715; +mul.f64 fd717, fd711, fd636; +fma.rn.f64 fd718, fd713, fd635, fd717; +mul.f64 fd719, fd641, fd711; +mul.f64 fd720, fd642, fd713; +sub.f64 fd721, fd719, fd720; +mul.f64 fd722, fd641, fd713; +fma.rn.f64 fd723, fd642, fd711, fd722; +mul.f64 fd724, fd721, fd639; +mul.f64 fd725, fd723, fd640; +sub.f64 fd726, fd724, fd725; +mul.f64 fd727, fd721, fd640; +fma.rn.f64 fd728, fd723, fd639, fd727; +shl.b32 r20, r19, 3; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 8000, r21; +st.shared.f64 [r22], fd621; +st.shared.f64 [r22+800], fd647; +st.shared.f64 [r22+1600], fd657; +st.shared.f64 [r22+2400], fd667; +st.shared.f64 [r22+3200], fd677; +st.shared.f64 [r22+4000], fd687; +st.shared.f64 [r22+4800], fd696; +st.shared.f64 [r22+5600], fd706; +st.shared.f64 [r22+6400], 
fd716; +st.shared.f64 [r22+7200], fd726; +barrier.sync 0; +ld.shared.f64 fd729, [r10]; +ld.shared.f64 fd730, [r10+8000]; +ld.shared.f64 fd731, [r10+16000]; +ld.shared.f64 fd732, [r10+24000]; +ld.shared.f64 fd733, [r10+32000]; +ld.shared.f64 fd734, [r10+40000]; +ld.shared.f64 fd735, [r10+48000]; +ld.shared.f64 fd736, [r10+56000]; +ld.shared.f64 fd737, [r10+64000]; +ld.shared.f64 fd738, [r10+72000]; +barrier.sync 0; +st.shared.f64 [r22], fd622; +st.shared.f64 [r22+800], fd649; +st.shared.f64 [r22+1600], fd659; +st.shared.f64 [r22+2400], fd669; +st.shared.f64 [r22+3200], fd679; +st.shared.f64 [r22+4000], fd689; +st.shared.f64 [r22+4800], fd698; +st.shared.f64 [r22+5600], fd708; +st.shared.f64 [r22+6400], fd718; +st.shared.f64 [r22+7200], fd728; +barrier.sync 0; +ld.shared.f64 fd739, [r10]; +ld.shared.f64 fd740, [r10+8000]; +ld.shared.f64 fd741, [r10+16000]; +ld.shared.f64 fd742, [r10+24000]; +ld.shared.f64 fd743, [r10+32000]; +ld.shared.f64 fd744, [r10+40000]; +ld.shared.f64 fd745, [r10+48000]; +ld.shared.f64 fd746, [r10+56000]; +ld.shared.f64 fd747, [r10+64000]; +ld.shared.f64 fd748, [r10+72000]; +add.f64 fd749, fd731, fd737; +add.f64 fd750, fd729, fd749; +add.f64 fd751, fd733, fd735; +add.f64 fd752, fd751, fd750; +add.f64 fd753, fd741, fd747; +add.f64 fd754, fd739, fd753; +add.f64 fd755, fd743, fd745; +add.f64 fd756, fd755, fd754; +fma.rn.f64 fd757, fd749, 0d3FD3C6EF372FE950, fd729; +mul.f64 fd758, fd751, 0d3FE9E3779B97F4A8; +sub.f64 fd759, fd757, fd758; +sub.f64 fd760, fd741, fd747; +mul.f64 fd761, fd760, 0d3FEE6F0E134454FF; +sub.f64 fd762, fd743, fd745; +mul.f64 fd763, fd762, 0dBFE2CF2304755A5E; +sub.f64 fd764, fd763, fd761; +sub.f64 fd765, fd759, fd764; +add.f64 fd766, fd764, fd759; +mul.f64 fd767, fd749, 0d3FE9E3779B97F4A8; +sub.f64 fd768, fd729, fd767; +fma.rn.f64 fd769, fd751, 0d3FD3C6EF372FE950, fd768; +mul.f64 fd770, fd760, 0d3FE2CF2304755A5E; +mul.f64 fd771, fd762, 0d3FEE6F0E134454FF; +sub.f64 fd772, fd771, fd770; +sub.f64 fd773, fd769, fd772; +add.f64 
fd774, fd772, fd769; +fma.rn.f64 fd775, fd753, 0d3FD3C6EF372FE950, fd739; +mul.f64 fd776, fd755, 0d3FE9E3779B97F4A8; +sub.f64 fd777, fd775, fd776; +sub.f64 fd778, fd731, fd737; +mul.f64 fd779, fd778, 0d3FEE6F0E134454FF; +sub.f64 fd780, fd733, fd735; +mul.f64 fd781, fd780, 0dBFE2CF2304755A5E; +sub.f64 fd782, fd781, fd779; +add.f64 fd783, fd782, fd777; +sub.f64 fd784, fd777, fd782; +mul.f64 fd785, fd753, 0d3FE9E3779B97F4A8; +sub.f64 fd786, fd739, fd785; +fma.rn.f64 fd787, fd755, 0d3FD3C6EF372FE950, fd786; +mul.f64 fd788, fd778, 0d3FE2CF2304755A5E; +mul.f64 fd789, fd780, 0d3FEE6F0E134454FF; +sub.f64 fd790, fd789, fd788; +add.f64 fd791, fd790, fd787; +sub.f64 fd792, fd787, fd790; +add.f64 fd793, fd732, fd738; +add.f64 fd794, fd730, fd793; +add.f64 fd795, fd734, fd736; +add.f64 fd796, fd795, fd794; +add.f64 fd797, fd742, fd748; +add.f64 fd798, fd740, fd797; +add.f64 fd799, fd744, fd746; +add.f64 fd800, fd799, fd798; +fma.rn.f64 fd801, fd793, 0d3FD3C6EF372FE950, fd730; +mul.f64 fd802, fd795, 0d3FE9E3779B97F4A8; +sub.f64 fd803, fd801, fd802; +sub.f64 fd804, fd742, fd748; +mul.f64 fd805, fd804, 0d3FEE6F0E134454FF; +sub.f64 fd806, fd744, fd746; +mul.f64 fd807, fd806, 0dBFE2CF2304755A5E; +sub.f64 fd808, fd807, fd805; +sub.f64 fd809, fd803, fd808; +add.f64 fd810, fd808, fd803; +mul.f64 fd811, fd793, 0d3FE9E3779B97F4A8; +sub.f64 fd812, fd730, fd811; +fma.rn.f64 fd813, fd795, 0d3FD3C6EF372FE950, fd812; +mul.f64 fd814, fd804, 0d3FE2CF2304755A5E; +mul.f64 fd815, fd806, 0d3FEE6F0E134454FF; +sub.f64 fd816, fd815, fd814; +sub.f64 fd817, fd813, fd816; +add.f64 fd818, fd816, fd813; +fma.rn.f64 fd819, fd797, 0d3FD3C6EF372FE950, fd740; +mul.f64 fd820, fd799, 0d3FE9E3779B97F4A8; +sub.f64 fd821, fd819, fd820; +sub.f64 fd822, fd732, fd738; +mul.f64 fd823, fd822, 0d3FEE6F0E134454FF; +sub.f64 fd824, fd734, fd736; +mul.f64 fd825, fd824, 0dBFE2CF2304755A5E; +sub.f64 fd826, fd825, fd823; +add.f64 fd827, fd826, fd821; +sub.f64 fd828, fd821, fd826; +mul.f64 fd829, fd797, 0d3FE9E3779B97F4A8; 
+sub.f64 fd830, fd740, fd829; +fma.rn.f64 fd831, fd799, 0d3FD3C6EF372FE950, fd830; +mul.f64 fd832, fd822, 0d3FE2CF2304755A5E; +mul.f64 fd833, fd824, 0d3FEE6F0E134454FF; +sub.f64 fd834, fd833, fd832; +add.f64 fd835, fd834, fd831; +sub.f64 fd836, fd831, fd834; +mul.f64 fd837, fd809, 0d3FE9E3779B97F4A8; +mul.f64 fd838, fd827, 0dBFE2CF2304755A5E; +sub.f64 fd839, fd837, fd838; +mul.f64 fd840, fd827, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd841, fd809, 0dBFE2CF2304755A5E, fd840; +mul.f64 fd842, fd817, 0d3FD3C6EF372FE950; +mul.f64 fd843, fd835, 0dBFEE6F0E134454FF; +sub.f64 fd844, fd842, fd843; +mul.f64 fd845, fd835, 0d3FD3C6EF372FE950; +fma.rn.f64 fd846, fd817, 0dBFEE6F0E134454FF, fd845; +mul.f64 fd847, fd818, 0dBFD3C6EF372FE950; +mul.f64 fd848, fd836, 0dBFEE6F0E134454FF; +sub.f64 fd849, fd847, fd848; +mul.f64 fd850, fd836, 0dBFD3C6EF372FE950; +fma.rn.f64 fd851, fd818, 0dBFEE6F0E134454FF, fd850; +mul.f64 fd852, fd810, 0dBFE9E3779B97F4A8; +mul.f64 fd853, fd828, 0dBFE2CF2304755A5E; +sub.f64 fd854, fd852, fd853; +mul.f64 fd855, fd828, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd856, fd810, 0dBFE2CF2304755A5E, fd855; +add.f64 %0, fd752, fd796; +add.f64 %1, fd756, fd800; +add.f64 %3, fd783, fd841; +add.f64 %2, fd765, fd839; +add.f64 %5, fd791, fd846; +add.f64 %4, fd773, fd844; +add.f64 %7, fd792, fd851; +add.f64 %6, fd774, fd849; +add.f64 %9, fd784, fd856; +add.f64 %8, fd766, fd854; +sub.f64 %10, fd752, fd796; +sub.f64 %11, fd756, fd800; +sub.f64 %13, fd783, fd841; +sub.f64 %12, fd765, fd839; +sub.f64 %15, fd791, fd846; +sub.f64 %14, fd773, fd844; +sub.f64 %17, fd792, fd851; +sub.f64 %16, fd774, fd849; +sub.f64 %19, fd784, fd856; +sub.f64 %18, fd766, fd854; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), 
"=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_10000), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..ee49496d56aa2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10000_fp64_inv.hpp.inc @@ -0,0 +1,1776 @@ +#ifndef CUFFTDX_FFT_10000_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_10000_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1172, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<23>; +.reg .f64 fd<921>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 160000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %29, %45; +add.f64 fd42, %24, fd41; +add.f64 fd43, %34, %40; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %31, %47; +add.f64 fd46, %25, fd45; +add.f64 fd47, %36, %41; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %24; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %31, %47; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %36, %41; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %24, 
fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %25; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %29, %45; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %34, %40; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %25, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %32, %48; +add.f64 fd84, %26, fd83; +add.f64 fd85, %37, %42; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %33, %49; +add.f64 fd88, %28, fd87; +add.f64 fd89, %39, %44; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %26; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %33, %49; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %39, %44; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %26, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %28; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %32, %48; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %37, %42; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; 
+sub.f64 fd118, %28, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +sub.f64 fd145, fd44, fd86; +sub.f64 fd146, fd48, fd90; +add.f64 fd147, fd56, fd127; +add.f64 fd148, fd73, fd129; +sub.f64 fd149, fd56, fd127; +sub.f64 fd150, fd73, fd129; +add.f64 fd151, fd64, fd132; +add.f64 fd152, fd81, fd134; +sub.f64 fd153, fd64, fd132; +sub.f64 fd154, fd81, fd134; +add.f64 fd155, fd65, fd137; +add.f64 fd156, fd82, fd139; +sub.f64 fd157, fd65, fd137; +sub.f64 fd158, fd82, fd139; +add.f64 fd159, fd57, fd142; +add.f64 fd160, fd74, fd144; +sub.f64 fd161, fd57, fd142; +sub.f64 fd162, fd74, fd144; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 160000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd163, fd164}, [rd6]; +mul.f64 fd167, fd148, fd164; +mul.f64 fd168, fd147, fd164; +mul.f64 fd169, fd163, fd148; +mul.f64 fd170, 
fd163, fd163; +mul.f64 fd171, fd164, fd164; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd164, fd163; +fma.rn.f64 fd174, fd164, fd163, fd173; +mul.f64 fd175, fd152, fd174; +mul.f64 fd176, fd151, fd174; +mul.f64 fd177, fd172, fd152; +mul.f64 fd178, fd163, fd172; +mul.f64 fd179, fd164, fd174; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd163, fd174; +fma.rn.f64 fd182, fd164, fd172, fd181; +mul.f64 fd183, fd156, fd182; +mul.f64 fd184, fd155, fd182; +mul.f64 fd185, fd180, fd156; +mul.f64 fd186, fd163, fd180; +mul.f64 fd187, fd164, fd182; +sub.f64 fd188, fd186, fd187; +mul.f64 fd189, fd163, fd182; +fma.rn.f64 fd190, fd164, fd180, fd189; +mul.f64 fd191, fd160, fd190; +mul.f64 fd192, fd159, fd190; +mul.f64 fd193, fd188, fd160; +mul.f64 fd194, fd163, fd188; +mul.f64 fd195, fd164, fd190; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd163, fd190; +fma.rn.f64 fd198, fd164, fd188, fd197; +mul.f64 fd199, fd146, fd198; +mul.f64 fd200, fd145, fd198; +mul.f64 fd201, fd196, fd146; +ld.global.v2.f64 {fd202, fd203}, [rd6+16000]; +mul.f64 fd206, fd150, fd203; +mul.f64 fd207, fd149, fd203; +mul.f64 fd208, fd202, fd150; +mul.f64 fd209, fd163, fd202; +mul.f64 fd210, fd164, fd203; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd163, fd203; +fma.rn.f64 fd213, fd164, fd202, fd212; +mul.f64 fd214, fd154, fd213; +mul.f64 fd215, fd153, fd213; +mul.f64 fd216, fd211, fd154; +mul.f64 fd217, fd163, fd211; +mul.f64 fd218, fd164, fd213; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd163, fd213; +fma.rn.f64 fd221, fd164, fd211, fd220; +mul.f64 fd222, fd158, fd221; +mul.f64 fd223, fd157, fd221; +mul.f64 fd224, fd219, fd158; +mul.f64 fd225, fd163, fd219; +mul.f64 fd226, fd164, fd221; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd163, fd221; +fma.rn.f64 fd229, fd164, fd219, fd228; +mul.f64 fd230, fd162, fd229; +mul.f64 fd231, fd161, fd229; +mul.f64 fd232, fd227, fd162; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd233, fd48, fd90; +add.f64 fd234, fd44, fd86; +st.shared.v2.f64 [r9], 
{fd234, fd233}; +fma.rn.f64 fd235, fd163, fd147, fd167; +sub.f64 fd236, fd169, fd168; +st.shared.v2.f64 [r9+16], {fd235, fd236}; +fma.rn.f64 fd237, fd172, fd151, fd175; +sub.f64 fd238, fd177, fd176; +st.shared.v2.f64 [r9+32], {fd237, fd238}; +fma.rn.f64 fd239, fd180, fd155, fd183; +sub.f64 fd240, fd185, fd184; +st.shared.v2.f64 [r9+48], {fd239, fd240}; +sub.f64 fd241, fd193, fd192; +fma.rn.f64 fd242, fd188, fd159, fd191; +st.shared.v2.f64 [r9+64], {fd242, fd241}; +fma.rn.f64 fd243, fd196, fd145, fd199; +sub.f64 fd244, fd201, fd200; +st.shared.v2.f64 [r9+80], {fd243, fd244}; +fma.rn.f64 fd245, fd202, fd149, fd206; +sub.f64 fd246, fd208, fd207; +st.shared.v2.f64 [r9+96], {fd245, fd246}; +fma.rn.f64 fd247, fd211, fd153, fd214; +sub.f64 fd248, fd216, fd215; +st.shared.v2.f64 [r9+112], {fd247, fd248}; +fma.rn.f64 fd249, fd219, fd157, fd222; +sub.f64 fd250, fd224, fd223; +st.shared.v2.f64 [r9+128], {fd249, fd250}; +sub.f64 fd251, fd232, fd231; +fma.rn.f64 fd252, fd227, fd161, fd230; +st.shared.v2.f64 [r9+144], {fd252, fd251}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd253, fd254}, [r10]; +ld.shared.v2.f64 {fd257, fd258}, [r10+16000]; +ld.shared.v2.f64 {fd261, fd262}, [r10+32000]; +ld.shared.v2.f64 {fd265, fd266}, [r10+48000]; +ld.shared.v2.f64 {fd269, fd270}, [r10+64000]; +ld.shared.v2.f64 {fd273, fd274}, [r10+80000]; +ld.shared.v2.f64 {fd277, fd278}, [r10+96000]; +ld.shared.v2.f64 {fd281, fd282}, [r10+112000]; +ld.shared.v2.f64 {fd285, fd286}, [r10+128000]; +ld.shared.v2.f64 {fd289, fd290}, [r10+144000]; +add.f64 fd293, fd261, fd285; +add.f64 fd294, fd253, fd293; +add.f64 fd295, fd269, fd277; +add.f64 fd296, fd295, fd294; +add.f64 fd297, fd262, fd286; +add.f64 fd298, fd254, fd297; +add.f64 fd299, fd270, fd278; +add.f64 fd300, fd299, fd298; +fma.rn.f64 fd301, fd293, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd302, fd295, 0d3FE9E3779B97F4A8; +sub.f64 fd303, fd301, fd302; +sub.f64 fd304, fd262, fd286; +mul.f64 fd305, fd304, 0d3FEE6F0E134454FF; +sub.f64 
fd306, fd270, fd278; +fma.rn.f64 fd307, fd306, 0d3FE2CF2304755A5E, fd305; +sub.f64 fd308, fd303, fd307; +add.f64 fd309, fd307, fd303; +mul.f64 fd310, fd293, 0d3FE9E3779B97F4A8; +sub.f64 fd311, fd253, fd310; +fma.rn.f64 fd312, fd295, 0d3FD3C6EF372FE950, fd311; +mul.f64 fd313, fd304, 0d3FE2CF2304755A5E; +mul.f64 fd314, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd315, fd313, fd314; +sub.f64 fd316, fd312, fd315; +add.f64 fd317, fd315, fd312; +fma.rn.f64 fd318, fd297, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd319, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd320, fd318, fd319; +sub.f64 fd321, fd261, fd285; +mul.f64 fd322, fd321, 0d3FEE6F0E134454FF; +sub.f64 fd323, fd269, fd277; +fma.rn.f64 fd324, fd323, 0d3FE2CF2304755A5E, fd322; +add.f64 fd325, fd324, fd320; +sub.f64 fd326, fd320, fd324; +mul.f64 fd327, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd328, fd254, fd327; +fma.rn.f64 fd329, fd299, 0d3FD3C6EF372FE950, fd328; +mul.f64 fd330, fd321, 0d3FE2CF2304755A5E; +mul.f64 fd331, fd323, 0d3FEE6F0E134454FF; +sub.f64 fd332, fd330, fd331; +add.f64 fd333, fd332, fd329; +sub.f64 fd334, fd329, fd332; +add.f64 fd335, fd265, fd289; +add.f64 fd336, fd257, fd335; +add.f64 fd337, fd273, fd281; +add.f64 fd338, fd337, fd336; +add.f64 fd339, fd266, fd290; +add.f64 fd340, fd258, fd339; +add.f64 fd341, fd274, fd282; +add.f64 fd342, fd341, fd340; +fma.rn.f64 fd343, fd335, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd344, fd337, 0d3FE9E3779B97F4A8; +sub.f64 fd345, fd343, fd344; +sub.f64 fd346, fd266, fd290; +mul.f64 fd347, fd346, 0d3FEE6F0E134454FF; +sub.f64 fd348, fd274, fd282; +fma.rn.f64 fd349, fd348, 0d3FE2CF2304755A5E, fd347; +sub.f64 fd350, fd345, fd349; +add.f64 fd351, fd349, fd345; +mul.f64 fd352, fd335, 0d3FE9E3779B97F4A8; +sub.f64 fd353, fd257, fd352; +fma.rn.f64 fd354, fd337, 0d3FD3C6EF372FE950, fd353; +mul.f64 fd355, fd346, 0d3FE2CF2304755A5E; +mul.f64 fd356, fd348, 0d3FEE6F0E134454FF; +sub.f64 fd357, fd355, fd356; +sub.f64 fd358, fd354, fd357; +add.f64 fd359, fd357, fd354; +fma.rn.f64 fd360, fd339, 
0d3FD3C6EF372FE950, fd258; +mul.f64 fd361, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd362, fd360, fd361; +sub.f64 fd363, fd265, fd289; +mul.f64 fd364, fd363, 0d3FEE6F0E134454FF; +sub.f64 fd365, fd273, fd281; +fma.rn.f64 fd366, fd365, 0d3FE2CF2304755A5E, fd364; +add.f64 fd367, fd366, fd362; +sub.f64 fd368, fd362, fd366; +mul.f64 fd369, fd339, 0d3FE9E3779B97F4A8; +sub.f64 fd370, fd258, fd369; +fma.rn.f64 fd371, fd341, 0d3FD3C6EF372FE950, fd370; +mul.f64 fd372, fd363, 0d3FE2CF2304755A5E; +mul.f64 fd373, fd365, 0d3FEE6F0E134454FF; +sub.f64 fd374, fd372, fd373; +add.f64 fd375, fd374, fd371; +sub.f64 fd376, fd371, fd374; +mul.f64 fd377, fd350, 0d3FE9E3779B97F4A8; +mul.f64 fd378, fd367, 0d3FE2CF2304755A5E; +sub.f64 fd379, fd377, fd378; +mul.f64 fd380, fd367, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd381, fd350, 0d3FE2CF2304755A5E, fd380; +mul.f64 fd382, fd358, 0d3FD3C6EF372FE950; +mul.f64 fd383, fd375, 0d3FEE6F0E134454FF; +sub.f64 fd384, fd382, fd383; +mul.f64 fd385, fd375, 0d3FD3C6EF372FE950; +fma.rn.f64 fd386, fd358, 0d3FEE6F0E134454FF, fd385; +mul.f64 fd387, fd359, 0dBFD3C6EF372FE950; +mul.f64 fd388, fd376, 0d3FEE6F0E134454FF; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd376, 0dBFD3C6EF372FE950; +fma.rn.f64 fd391, fd359, 0d3FEE6F0E134454FF, fd390; +mul.f64 fd392, fd351, 0dBFE9E3779B97F4A8; +mul.f64 fd393, fd368, 0d3FE2CF2304755A5E; +sub.f64 fd394, fd392, fd393; +mul.f64 fd395, fd368, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd396, fd351, 0d3FE2CF2304755A5E, fd395; +sub.f64 fd397, fd296, fd338; +sub.f64 fd398, fd300, fd342; +add.f64 fd399, fd308, fd379; +add.f64 fd400, fd325, fd381; +sub.f64 fd401, fd308, fd379; +sub.f64 fd402, fd325, fd381; +add.f64 fd403, fd316, fd384; +add.f64 fd404, fd333, fd386; +sub.f64 fd405, fd316, fd384; +sub.f64 fd406, fd333, fd386; +add.f64 fd407, fd317, fd389; +add.f64 fd408, fd334, fd391; +sub.f64 fd409, fd317, fd389; +sub.f64 fd410, fd334, fd391; +add.f64 fd411, fd309, fd394; +add.f64 fd412, fd326, fd396; +sub.f64 fd413, fd309, fd394; +sub.f64 fd414, fd326, 
fd396; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd415, fd416}, [rd11]; +mul.f64 fd419, fd400, fd416; +mul.f64 fd420, fd399, fd416; +mul.f64 fd421, fd415, fd400; +mul.f64 fd422, fd415, fd415; +mul.f64 fd423, fd416, fd416; +sub.f64 fd424, fd422, fd423; +mul.f64 fd425, fd416, fd415; +fma.rn.f64 fd426, fd416, fd415, fd425; +mul.f64 fd427, fd404, fd426; +mul.f64 fd428, fd403, fd426; +mul.f64 fd429, fd424, fd404; +mul.f64 fd430, fd415, fd424; +mul.f64 fd431, fd416, fd426; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd415, fd426; +fma.rn.f64 fd434, fd416, fd424, fd433; +mul.f64 fd435, fd408, fd434; +mul.f64 fd436, fd407, fd434; +mul.f64 fd437, fd432, fd408; +mul.f64 fd438, fd415, fd432; +mul.f64 fd439, fd416, fd434; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd415, fd434; +fma.rn.f64 fd442, fd416, fd432, fd441; +mul.f64 fd443, fd412, fd442; +mul.f64 fd444, fd411, fd442; +mul.f64 fd445, fd440, fd412; +mul.f64 fd446, fd415, fd440; +mul.f64 fd447, fd416, fd442; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd415, fd442; +fma.rn.f64 fd450, fd416, fd440, fd449; +mul.f64 fd451, fd398, fd450; +mul.f64 fd452, fd397, fd450; +mul.f64 fd453, fd448, fd398; +ld.global.v2.f64 {fd454, fd455}, [rd11+1600]; +mul.f64 fd458, fd402, fd455; +mul.f64 fd459, fd401, fd455; +mul.f64 fd460, fd454, fd402; +mul.f64 fd461, fd415, fd454; +mul.f64 fd462, fd416, fd455; +sub.f64 fd463, fd461, fd462; +mul.f64 fd464, fd415, fd455; +fma.rn.f64 fd465, fd416, fd454, fd464; +mul.f64 fd466, fd406, fd465; +mul.f64 fd467, fd405, fd465; +mul.f64 fd468, fd463, fd406; +mul.f64 fd469, fd415, fd463; +mul.f64 fd470, fd416, fd465; +sub.f64 fd471, fd469, fd470; +mul.f64 fd472, fd415, fd465; +fma.rn.f64 fd473, fd416, fd463, fd472; +mul.f64 fd474, fd410, fd473; +mul.f64 fd475, fd409, fd473; +mul.f64 fd476, fd471, fd410; +mul.f64 fd477, 
fd415, fd471; +mul.f64 fd478, fd416, fd473; +sub.f64 fd479, fd477, fd478; +mul.f64 fd480, fd415, fd473; +fma.rn.f64 fd481, fd416, fd471, fd480; +mul.f64 fd482, fd414, fd481; +mul.f64 fd483, fd413, fd481; +mul.f64 fd484, fd479, fd414; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1600, r15; +add.f64 fd485, fd300, fd342; +add.f64 fd486, fd296, fd338; +st.shared.v2.f64 [r16], {fd486, fd485}; +fma.rn.f64 fd487, fd415, fd399, fd419; +sub.f64 fd488, fd421, fd420; +st.shared.v2.f64 [r16+160], {fd487, fd488}; +fma.rn.f64 fd489, fd424, fd403, fd427; +sub.f64 fd490, fd429, fd428; +st.shared.v2.f64 [r16+320], {fd489, fd490}; +fma.rn.f64 fd491, fd432, fd407, fd435; +sub.f64 fd492, fd437, fd436; +st.shared.v2.f64 [r16+480], {fd491, fd492}; +fma.rn.f64 fd493, fd440, fd411, fd443; +sub.f64 fd494, fd445, fd444; +st.shared.v2.f64 [r16+640], {fd493, fd494}; +sub.f64 fd495, fd453, fd452; +fma.rn.f64 fd496, fd448, fd397, fd451; +st.shared.v2.f64 [r16+800], {fd496, fd495}; +fma.rn.f64 fd497, fd454, fd401, fd458; +sub.f64 fd498, fd460, fd459; +st.shared.v2.f64 [r16+960], {fd497, fd498}; +fma.rn.f64 fd499, fd463, fd405, fd466; +sub.f64 fd500, fd468, fd467; +st.shared.v2.f64 [r16+1120], {fd499, fd500}; +fma.rn.f64 fd501, fd471, fd409, fd474; +sub.f64 fd502, fd476, fd475; +st.shared.v2.f64 [r16+1280], {fd501, fd502}; +fma.rn.f64 fd503, fd479, fd413, fd482; +sub.f64 fd504, fd484, fd483; +st.shared.v2.f64 [r16+1440], {fd503, fd504}; +barrier.sync 0; +ld.shared.v2.f64 {fd505, fd506}, [r10]; +ld.shared.v2.f64 {fd509, fd510}, [r10+16000]; +ld.shared.v2.f64 {fd513, fd514}, [r10+32000]; +ld.shared.v2.f64 {fd517, fd518}, [r10+48000]; +ld.shared.v2.f64 {fd521, fd522}, [r10+64000]; +ld.shared.v2.f64 {fd525, fd526}, [r10+80000]; +ld.shared.v2.f64 {fd529, fd530}, [r10+96000]; +ld.shared.v2.f64 {fd533, fd534}, [r10+112000]; +ld.shared.v2.f64 {fd537, fd538}, [r10+128000]; +ld.shared.v2.f64 {fd541, fd542}, [r10+144000]; +add.f64 fd545, fd513, fd537; +add.f64 fd546, 
fd505, fd545; +add.f64 fd547, fd521, fd529; +add.f64 fd548, fd547, fd546; +add.f64 fd549, fd514, fd538; +add.f64 fd550, fd506, fd549; +add.f64 fd551, fd522, fd530; +add.f64 fd552, fd551, fd550; +fma.rn.f64 fd553, fd545, 0d3FD3C6EF372FE950, fd505; +mul.f64 fd554, fd547, 0d3FE9E3779B97F4A8; +sub.f64 fd555, fd553, fd554; +sub.f64 fd556, fd514, fd538; +mul.f64 fd557, fd556, 0d3FEE6F0E134454FF; +sub.f64 fd558, fd522, fd530; +fma.rn.f64 fd559, fd558, 0d3FE2CF2304755A5E, fd557; +sub.f64 fd560, fd555, fd559; +add.f64 fd561, fd559, fd555; +mul.f64 fd562, fd545, 0d3FE9E3779B97F4A8; +sub.f64 fd563, fd505, fd562; +fma.rn.f64 fd564, fd547, 0d3FD3C6EF372FE950, fd563; +mul.f64 fd565, fd556, 0d3FE2CF2304755A5E; +mul.f64 fd566, fd558, 0d3FEE6F0E134454FF; +sub.f64 fd567, fd565, fd566; +sub.f64 fd568, fd564, fd567; +add.f64 fd569, fd567, fd564; +fma.rn.f64 fd570, fd549, 0d3FD3C6EF372FE950, fd506; +mul.f64 fd571, fd551, 0d3FE9E3779B97F4A8; +sub.f64 fd572, fd570, fd571; +sub.f64 fd573, fd513, fd537; +mul.f64 fd574, fd573, 0d3FEE6F0E134454FF; +sub.f64 fd575, fd521, fd529; +fma.rn.f64 fd576, fd575, 0d3FE2CF2304755A5E, fd574; +add.f64 fd577, fd576, fd572; +sub.f64 fd578, fd572, fd576; +mul.f64 fd579, fd549, 0d3FE9E3779B97F4A8; +sub.f64 fd580, fd506, fd579; +fma.rn.f64 fd581, fd551, 0d3FD3C6EF372FE950, fd580; +mul.f64 fd582, fd573, 0d3FE2CF2304755A5E; +mul.f64 fd583, fd575, 0d3FEE6F0E134454FF; +sub.f64 fd584, fd582, fd583; +add.f64 fd585, fd584, fd581; +sub.f64 fd586, fd581, fd584; +add.f64 fd587, fd517, fd541; +add.f64 fd588, fd509, fd587; +add.f64 fd589, fd525, fd533; +add.f64 fd590, fd589, fd588; +add.f64 fd591, fd518, fd542; +add.f64 fd592, fd510, fd591; +add.f64 fd593, fd526, fd534; +add.f64 fd594, fd593, fd592; +fma.rn.f64 fd595, fd587, 0d3FD3C6EF372FE950, fd509; +mul.f64 fd596, fd589, 0d3FE9E3779B97F4A8; +sub.f64 fd597, fd595, fd596; +sub.f64 fd598, fd518, fd542; +mul.f64 fd599, fd598, 0d3FEE6F0E134454FF; +sub.f64 fd600, fd526, fd534; +fma.rn.f64 fd601, fd600, 0d3FE2CF2304755A5E, 
fd599; +sub.f64 fd602, fd597, fd601; +add.f64 fd603, fd601, fd597; +mul.f64 fd604, fd587, 0d3FE9E3779B97F4A8; +sub.f64 fd605, fd509, fd604; +fma.rn.f64 fd606, fd589, 0d3FD3C6EF372FE950, fd605; +mul.f64 fd607, fd598, 0d3FE2CF2304755A5E; +mul.f64 fd608, fd600, 0d3FEE6F0E134454FF; +sub.f64 fd609, fd607, fd608; +sub.f64 fd610, fd606, fd609; +add.f64 fd611, fd609, fd606; +fma.rn.f64 fd612, fd591, 0d3FD3C6EF372FE950, fd510; +mul.f64 fd613, fd593, 0d3FE9E3779B97F4A8; +sub.f64 fd614, fd612, fd613; +sub.f64 fd615, fd517, fd541; +mul.f64 fd616, fd615, 0d3FEE6F0E134454FF; +sub.f64 fd617, fd525, fd533; +fma.rn.f64 fd618, fd617, 0d3FE2CF2304755A5E, fd616; +add.f64 fd619, fd618, fd614; +sub.f64 fd620, fd614, fd618; +mul.f64 fd621, fd591, 0d3FE9E3779B97F4A8; +sub.f64 fd622, fd510, fd621; +fma.rn.f64 fd623, fd593, 0d3FD3C6EF372FE950, fd622; +mul.f64 fd624, fd615, 0d3FE2CF2304755A5E; +mul.f64 fd625, fd617, 0d3FEE6F0E134454FF; +sub.f64 fd626, fd624, fd625; +add.f64 fd627, fd626, fd623; +sub.f64 fd628, fd623, fd626; +mul.f64 fd629, fd602, 0d3FE9E3779B97F4A8; +mul.f64 fd630, fd619, 0d3FE2CF2304755A5E; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd619, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd633, fd602, 0d3FE2CF2304755A5E, fd632; +mul.f64 fd634, fd610, 0d3FD3C6EF372FE950; +mul.f64 fd635, fd627, 0d3FEE6F0E134454FF; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd627, 0d3FD3C6EF372FE950; +fma.rn.f64 fd638, fd610, 0d3FEE6F0E134454FF, fd637; +mul.f64 fd639, fd611, 0dBFD3C6EF372FE950; +mul.f64 fd640, fd628, 0d3FEE6F0E134454FF; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd628, 0dBFD3C6EF372FE950; +fma.rn.f64 fd643, fd611, 0d3FEE6F0E134454FF, fd642; +mul.f64 fd644, fd603, 0dBFE9E3779B97F4A8; +mul.f64 fd645, fd620, 0d3FE2CF2304755A5E; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd620, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd648, fd603, 0d3FE2CF2304755A5E, fd647; +sub.f64 fd649, fd548, fd590; +sub.f64 fd650, fd552, fd594; +add.f64 fd651, fd560, fd631; +add.f64 fd652, fd577, fd633; +sub.f64 fd653, 
fd560, fd631; +sub.f64 fd654, fd577, fd633; +add.f64 fd655, fd568, fd636; +add.f64 fd656, fd585, fd638; +sub.f64 fd657, fd568, fd636; +sub.f64 fd658, fd585, fd638; +add.f64 fd659, fd569, fd641; +add.f64 fd660, fd586, fd643; +sub.f64 fd661, fd569, fd641; +sub.f64 fd662, fd586, fd643; +add.f64 fd663, fd561, fd646; +add.f64 fd664, fd578, fd648; +sub.f64 fd665, fd561, fd646; +sub.f64 fd666, fd578, fd648; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 16; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd667, fd668}, [rd16]; +mul.f64 fd671, fd652, fd668; +mul.f64 fd672, fd651, fd668; +mul.f64 fd673, fd667, fd652; +mul.f64 fd674, fd667, fd667; +mul.f64 fd675, fd668, fd668; +sub.f64 fd676, fd674, fd675; +mul.f64 fd677, fd668, fd667; +fma.rn.f64 fd678, fd668, fd667, fd677; +mul.f64 fd679, fd656, fd678; +mul.f64 fd680, fd655, fd678; +mul.f64 fd681, fd676, fd656; +mul.f64 fd682, fd667, fd676; +mul.f64 fd683, fd668, fd678; +sub.f64 fd684, fd682, fd683; +mul.f64 fd685, fd667, fd678; +fma.rn.f64 fd686, fd668, fd676, fd685; +mul.f64 fd687, fd660, fd686; +mul.f64 fd688, fd659, fd686; +mul.f64 fd689, fd684, fd660; +mul.f64 fd690, fd667, fd684; +mul.f64 fd691, fd668, fd686; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd667, fd686; +fma.rn.f64 fd694, fd668, fd684, fd693; +mul.f64 fd695, fd664, fd694; +mul.f64 fd696, fd663, fd694; +mul.f64 fd697, fd692, fd664; +mul.f64 fd698, fd667, fd692; +mul.f64 fd699, fd668, fd694; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd667, fd694; +fma.rn.f64 fd702, fd668, fd692, fd701; +mul.f64 fd703, fd650, fd702; +mul.f64 fd704, fd649, fd702; +mul.f64 fd705, fd700, fd650; +ld.global.v2.f64 {fd706, fd707}, [rd16+160]; +mul.f64 fd710, fd654, fd707; +mul.f64 fd711, fd653, fd707; +mul.f64 fd712, fd706, fd654; +mul.f64 fd713, fd667, fd706; +mul.f64 fd714, fd668, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd667, 
fd707; +fma.rn.f64 fd717, fd668, fd706, fd716; +mul.f64 fd718, fd658, fd717; +mul.f64 fd719, fd657, fd717; +mul.f64 fd720, fd715, fd658; +mul.f64 fd721, fd667, fd715; +mul.f64 fd722, fd668, fd717; +sub.f64 fd723, fd721, fd722; +mul.f64 fd724, fd667, fd717; +fma.rn.f64 fd725, fd668, fd715, fd724; +mul.f64 fd726, fd662, fd725; +mul.f64 fd727, fd661, fd725; +mul.f64 fd728, fd723, fd662; +mul.f64 fd729, fd667, fd723; +mul.f64 fd730, fd668, fd725; +sub.f64 fd731, fd729, fd730; +mul.f64 fd732, fd667, fd725; +fma.rn.f64 fd733, fd668, fd723, fd732; +mul.f64 fd734, fd666, fd733; +mul.f64 fd735, fd665, fd733; +mul.f64 fd736, fd731, fd666; +shl.b32 r20, r19, 4; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 16000, r21; +add.f64 fd737, fd552, fd594; +add.f64 fd738, fd548, fd590; +st.shared.v2.f64 [r22], {fd738, fd737}; +fma.rn.f64 fd739, fd667, fd651, fd671; +sub.f64 fd740, fd673, fd672; +st.shared.v2.f64 [r22+1600], {fd739, fd740}; +fma.rn.f64 fd741, fd676, fd655, fd679; +sub.f64 fd742, fd681, fd680; +st.shared.v2.f64 [r22+3200], {fd741, fd742}; +fma.rn.f64 fd743, fd684, fd659, fd687; +sub.f64 fd744, fd689, fd688; +st.shared.v2.f64 [r22+4800], {fd743, fd744}; +fma.rn.f64 fd745, fd692, fd663, fd695; +sub.f64 fd746, fd697, fd696; +st.shared.v2.f64 [r22+6400], {fd745, fd746}; +sub.f64 fd747, fd705, fd704; +fma.rn.f64 fd748, fd700, fd649, fd703; +st.shared.v2.f64 [r22+8000], {fd748, fd747}; +fma.rn.f64 fd749, fd706, fd653, fd710; +sub.f64 fd750, fd712, fd711; +st.shared.v2.f64 [r22+9600], {fd749, fd750}; +fma.rn.f64 fd751, fd715, fd657, fd718; +sub.f64 fd752, fd720, fd719; +st.shared.v2.f64 [r22+11200], {fd751, fd752}; +fma.rn.f64 fd753, fd723, fd661, fd726; +sub.f64 fd754, fd728, fd727; +st.shared.v2.f64 [r22+12800], {fd753, fd754}; +fma.rn.f64 fd755, fd731, fd665, fd734; +sub.f64 fd756, fd736, fd735; +st.shared.v2.f64 [r22+14400], {fd755, fd756}; +barrier.sync 0; +ld.shared.v2.f64 {fd757, fd758}, [r10]; +ld.shared.v2.f64 {fd761, fd762}, [r10+16000]; 
+ld.shared.v2.f64 {fd765, fd766}, [r10+32000]; +ld.shared.v2.f64 {fd769, fd770}, [r10+48000]; +ld.shared.v2.f64 {fd773, fd774}, [r10+64000]; +ld.shared.v2.f64 {fd777, fd778}, [r10+80000]; +ld.shared.v2.f64 {fd781, fd782}, [r10+96000]; +ld.shared.v2.f64 {fd785, fd786}, [r10+112000]; +ld.shared.v2.f64 {fd789, fd790}, [r10+128000]; +ld.shared.v2.f64 {fd793, fd794}, [r10+144000]; +add.f64 fd797, fd765, fd789; +add.f64 fd798, fd757, fd797; +add.f64 fd799, fd773, fd781; +add.f64 fd800, fd799, fd798; +add.f64 fd801, fd766, fd790; +add.f64 fd802, fd758, fd801; +add.f64 fd803, fd774, fd782; +add.f64 fd804, fd803, fd802; +fma.rn.f64 fd805, fd797, 0d3FD3C6EF372FE950, fd757; +mul.f64 fd806, fd799, 0d3FE9E3779B97F4A8; +sub.f64 fd807, fd805, fd806; +sub.f64 fd808, fd766, fd790; +mul.f64 fd809, fd808, 0d3FEE6F0E134454FF; +sub.f64 fd810, fd774, fd782; +fma.rn.f64 fd811, fd810, 0d3FE2CF2304755A5E, fd809; +sub.f64 fd812, fd807, fd811; +add.f64 fd813, fd811, fd807; +mul.f64 fd814, fd797, 0d3FE9E3779B97F4A8; +sub.f64 fd815, fd757, fd814; +fma.rn.f64 fd816, fd799, 0d3FD3C6EF372FE950, fd815; +mul.f64 fd817, fd808, 0d3FE2CF2304755A5E; +mul.f64 fd818, fd810, 0d3FEE6F0E134454FF; +sub.f64 fd819, fd817, fd818; +sub.f64 fd820, fd816, fd819; +add.f64 fd821, fd819, fd816; +fma.rn.f64 fd822, fd801, 0d3FD3C6EF372FE950, fd758; +mul.f64 fd823, fd803, 0d3FE9E3779B97F4A8; +sub.f64 fd824, fd822, fd823; +sub.f64 fd825, fd765, fd789; +mul.f64 fd826, fd825, 0d3FEE6F0E134454FF; +sub.f64 fd827, fd773, fd781; +fma.rn.f64 fd828, fd827, 0d3FE2CF2304755A5E, fd826; +add.f64 fd829, fd828, fd824; +sub.f64 fd830, fd824, fd828; +mul.f64 fd831, fd801, 0d3FE9E3779B97F4A8; +sub.f64 fd832, fd758, fd831; +fma.rn.f64 fd833, fd803, 0d3FD3C6EF372FE950, fd832; +mul.f64 fd834, fd825, 0d3FE2CF2304755A5E; +mul.f64 fd835, fd827, 0d3FEE6F0E134454FF; +sub.f64 fd836, fd834, fd835; +add.f64 fd837, fd836, fd833; +sub.f64 fd838, fd833, fd836; +add.f64 fd839, fd769, fd793; +add.f64 fd840, fd761, fd839; +add.f64 fd841, fd777, fd785; 
+add.f64 fd842, fd841, fd840; +add.f64 fd843, fd770, fd794; +add.f64 fd844, fd762, fd843; +add.f64 fd845, fd778, fd786; +add.f64 fd846, fd845, fd844; +fma.rn.f64 fd847, fd839, 0d3FD3C6EF372FE950, fd761; +mul.f64 fd848, fd841, 0d3FE9E3779B97F4A8; +sub.f64 fd849, fd847, fd848; +sub.f64 fd850, fd770, fd794; +mul.f64 fd851, fd850, 0d3FEE6F0E134454FF; +sub.f64 fd852, fd778, fd786; +fma.rn.f64 fd853, fd852, 0d3FE2CF2304755A5E, fd851; +sub.f64 fd854, fd849, fd853; +add.f64 fd855, fd853, fd849; +mul.f64 fd856, fd839, 0d3FE9E3779B97F4A8; +sub.f64 fd857, fd761, fd856; +fma.rn.f64 fd858, fd841, 0d3FD3C6EF372FE950, fd857; +mul.f64 fd859, fd850, 0d3FE2CF2304755A5E; +mul.f64 fd860, fd852, 0d3FEE6F0E134454FF; +sub.f64 fd861, fd859, fd860; +sub.f64 fd862, fd858, fd861; +add.f64 fd863, fd861, fd858; +fma.rn.f64 fd864, fd843, 0d3FD3C6EF372FE950, fd762; +mul.f64 fd865, fd845, 0d3FE9E3779B97F4A8; +sub.f64 fd866, fd864, fd865; +sub.f64 fd867, fd769, fd793; +mul.f64 fd868, fd867, 0d3FEE6F0E134454FF; +sub.f64 fd869, fd777, fd785; +fma.rn.f64 fd870, fd869, 0d3FE2CF2304755A5E, fd868; +add.f64 fd871, fd870, fd866; +sub.f64 fd872, fd866, fd870; +mul.f64 fd873, fd843, 0d3FE9E3779B97F4A8; +sub.f64 fd874, fd762, fd873; +fma.rn.f64 fd875, fd845, 0d3FD3C6EF372FE950, fd874; +mul.f64 fd876, fd867, 0d3FE2CF2304755A5E; +mul.f64 fd877, fd869, 0d3FEE6F0E134454FF; +sub.f64 fd878, fd876, fd877; +add.f64 fd879, fd878, fd875; +sub.f64 fd880, fd875, fd878; +mul.f64 fd881, fd854, 0d3FE9E3779B97F4A8; +mul.f64 fd882, fd871, 0d3FE2CF2304755A5E; +sub.f64 fd883, fd881, fd882; +mul.f64 fd884, fd871, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd885, fd854, 0d3FE2CF2304755A5E, fd884; +mul.f64 fd886, fd862, 0d3FD3C6EF372FE950; +mul.f64 fd887, fd879, 0d3FEE6F0E134454FF; +sub.f64 fd888, fd886, fd887; +mul.f64 fd889, fd879, 0d3FD3C6EF372FE950; +fma.rn.f64 fd890, fd862, 0d3FEE6F0E134454FF, fd889; +mul.f64 fd891, fd863, 0dBFD3C6EF372FE950; +mul.f64 fd892, fd880, 0d3FEE6F0E134454FF; +sub.f64 fd893, fd891, fd892; +mul.f64 fd894, 
fd880, 0dBFD3C6EF372FE950; +fma.rn.f64 fd895, fd863, 0d3FEE6F0E134454FF, fd894; +mul.f64 fd896, fd855, 0dBFE9E3779B97F4A8; +mul.f64 fd897, fd872, 0d3FE2CF2304755A5E; +sub.f64 fd898, fd896, fd897; +mul.f64 fd899, fd872, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd900, fd855, 0d3FE2CF2304755A5E, fd899; +add.f64 %1, fd804, fd846; +add.f64 %0, fd800, fd842; +add.f64 %3, fd829, fd885; +add.f64 %2, fd812, fd883; +add.f64 %5, fd837, fd890; +add.f64 %4, fd820, fd888; +add.f64 %7, fd838, fd895; +add.f64 %6, fd821, fd893; +add.f64 %9, fd830, fd900; +add.f64 %8, fd813, fd898; +sub.f64 %11, fd804, fd846; +sub.f64 %10, fd800, fd842; +sub.f64 %13, fd829, fd885; +sub.f64 %12, fd812, fd883; +sub.f64 %15, fd837, fd890; +sub.f64 %14, fd820, fd888; +sub.f64 %17, fd838, fd895; +sub.f64 %16, fd821, fd893; +sub.f64 %19, fd830, fd900; +sub.f64 %18, fd813, fd898; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_10000), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1173, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<23>; +.reg .f64 fd<861>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 80000, r2; +mov.u32 r4, %tid.x; 
+add.f64 fd41, %29, %45; +add.f64 fd42, %24, fd41; +add.f64 fd43, %34, %40; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %31, %47; +add.f64 fd46, %25, fd45; +add.f64 fd47, %36, %41; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %24; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %31, %47; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %36, %41; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %24, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %25; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %29, %45; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %34, %40; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %25, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %32, %48; +add.f64 fd84, %26, fd83; +add.f64 fd85, %37, %42; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %33, %49; +add.f64 fd88, %28, fd87; +add.f64 fd89, %39, %44; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %26; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %33, %49; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %39, %44; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %26, fd100; +fma.rn.f64 fd102, 
fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %28; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %32, %48; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %37, %42; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %28, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +add.f64 fd145, fd44, fd86; +add.f64 fd146, fd48, fd90; +sub.f64 fd147, fd44, fd86; +sub.f64 fd148, fd48, fd90; +add.f64 fd149, fd56, fd127; +add.f64 fd150, fd73, fd129; +sub.f64 fd151, fd56, fd127; +sub.f64 fd152, fd73, fd129; +add.f64 fd153, fd64, fd132; +add.f64 fd154, fd81, fd134; +sub.f64 fd155, fd64, fd132; 
+sub.f64 fd156, fd81, fd134; +add.f64 fd157, fd65, fd137; +add.f64 fd158, fd82, fd139; +sub.f64 fd159, fd65, fd137; +sub.f64 fd160, fd82, fd139; +add.f64 fd161, fd57, fd142; +add.f64 fd162, fd74, fd144; +sub.f64 fd163, fd57, fd142; +sub.f64 fd164, fd74, fd144; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 1000; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd165, fd166}, [rd6]; +mul.f64 fd169, fd150, fd166; +fma.rn.f64 fd170, fd165, fd149, fd169; +mul.f64 fd171, fd149, fd166; +mul.f64 fd172, fd165, fd150; +sub.f64 fd173, fd172, fd171; +mul.f64 fd174, fd165, fd165; +mul.f64 fd175, fd166, fd166; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd166, fd165; +fma.rn.f64 fd178, fd166, fd165, fd177; +mul.f64 fd179, fd154, fd178; +fma.rn.f64 fd180, fd176, fd153, fd179; +mul.f64 fd181, fd153, fd178; +mul.f64 fd182, fd176, fd154; +sub.f64 fd183, fd182, fd181; +mul.f64 fd184, fd165, fd176; +mul.f64 fd185, fd166, fd178; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd165, fd178; +fma.rn.f64 fd188, fd166, fd176, fd187; +mul.f64 fd189, fd158, fd188; +fma.rn.f64 fd190, fd186, fd157, fd189; +mul.f64 fd191, fd157, fd188; +mul.f64 fd192, fd186, fd158; +sub.f64 fd193, fd192, fd191; +mul.f64 fd194, fd165, fd186; +mul.f64 fd195, fd166, fd188; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd165, fd188; +fma.rn.f64 fd198, fd166, fd186, fd197; +mul.f64 fd199, fd162, fd198; +fma.rn.f64 fd200, fd196, fd161, fd199; +mul.f64 fd201, fd161, fd198; +mul.f64 fd202, fd196, fd162; +sub.f64 fd203, fd202, fd201; +mul.f64 fd204, fd165, fd196; +mul.f64 fd205, fd166, fd198; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd165, fd198; +fma.rn.f64 fd208, fd166, fd196, fd207; +mul.f64 fd209, fd148, fd208; +fma.rn.f64 fd210, fd206, fd147, fd209; +mul.f64 fd211, fd147, fd208; +mul.f64 fd212, fd206, fd148; +sub.f64 fd213, fd212, fd211; +ld.global.v2.f64 {fd214, fd215}, [rd6+16000]; +mul.f64 
fd218, fd152, fd215; +fma.rn.f64 fd219, fd214, fd151, fd218; +mul.f64 fd220, fd151, fd215; +mul.f64 fd221, fd214, fd152; +sub.f64 fd222, fd221, fd220; +mul.f64 fd223, fd165, fd214; +mul.f64 fd224, fd166, fd215; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd165, fd215; +fma.rn.f64 fd227, fd166, fd214, fd226; +mul.f64 fd228, fd156, fd227; +fma.rn.f64 fd229, fd225, fd155, fd228; +mul.f64 fd230, fd155, fd227; +mul.f64 fd231, fd225, fd156; +sub.f64 fd232, fd231, fd230; +mul.f64 fd233, fd165, fd225; +mul.f64 fd234, fd166, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd165, fd227; +fma.rn.f64 fd237, fd166, fd225, fd236; +mul.f64 fd238, fd160, fd237; +fma.rn.f64 fd239, fd235, fd159, fd238; +mul.f64 fd240, fd159, fd237; +mul.f64 fd241, fd235, fd160; +sub.f64 fd242, fd241, fd240; +mul.f64 fd243, fd165, fd235; +mul.f64 fd244, fd166, fd237; +sub.f64 fd245, fd243, fd244; +mul.f64 fd246, fd165, fd237; +fma.rn.f64 fd247, fd166, fd235, fd246; +mul.f64 fd248, fd164, fd247; +fma.rn.f64 fd249, fd245, fd163, fd248; +mul.f64 fd250, fd163, fd247; +mul.f64 fd251, fd245, fd164; +sub.f64 fd252, fd251, fd250; +mad.lo.s32 r8, r5, 80000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +st.shared.v2.f64 [r9], {fd145, fd170}; +st.shared.v2.f64 [r9+16], {fd180, fd190}; +st.shared.v2.f64 [r9+32], {fd200, fd210}; +st.shared.v2.f64 [r9+48], {fd219, fd229}; +st.shared.v2.f64 [r9+64], {fd239, fd249}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd253, [r10]; +ld.shared.f64 fd254, [r10+8000]; +ld.shared.f64 fd255, [r10+16000]; +ld.shared.f64 fd256, [r10+24000]; +ld.shared.f64 fd257, [r10+32000]; +ld.shared.f64 fd258, [r10+40000]; +ld.shared.f64 fd259, [r10+48000]; +ld.shared.f64 fd260, [r10+56000]; +ld.shared.f64 fd261, [r10+64000]; +ld.shared.f64 fd262, [r10+72000]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd146, fd173}; +st.shared.v2.f64 [r9+16], {fd183, fd193}; +st.shared.v2.f64 [r9+32], {fd203, fd213}; +st.shared.v2.f64 [r9+48], {fd222, fd232}; +st.shared.v2.f64 
[r9+64], {fd242, fd252}; +barrier.sync 0; +ld.shared.f64 fd263, [r10]; +ld.shared.f64 fd264, [r10+8000]; +ld.shared.f64 fd265, [r10+16000]; +ld.shared.f64 fd266, [r10+24000]; +ld.shared.f64 fd267, [r10+32000]; +ld.shared.f64 fd268, [r10+40000]; +ld.shared.f64 fd269, [r10+48000]; +ld.shared.f64 fd270, [r10+56000]; +ld.shared.f64 fd271, [r10+64000]; +ld.shared.f64 fd272, [r10+72000]; +add.f64 fd273, fd255, fd261; +add.f64 fd274, fd253, fd273; +add.f64 fd275, fd257, fd259; +add.f64 fd276, fd275, fd274; +add.f64 fd277, fd265, fd271; +add.f64 fd278, fd263, fd277; +add.f64 fd279, fd267, fd269; +add.f64 fd280, fd279, fd278; +fma.rn.f64 fd281, fd273, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd282, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd283, fd281, fd282; +sub.f64 fd284, fd265, fd271; +mul.f64 fd285, fd284, 0d3FEE6F0E134454FF; +sub.f64 fd286, fd267, fd269; +fma.rn.f64 fd287, fd286, 0d3FE2CF2304755A5E, fd285; +sub.f64 fd288, fd283, fd287; +add.f64 fd289, fd287, fd283; +mul.f64 fd290, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd291, fd253, fd290; +fma.rn.f64 fd292, fd275, 0d3FD3C6EF372FE950, fd291; +mul.f64 fd293, fd284, 0d3FE2CF2304755A5E; +mul.f64 fd294, fd286, 0d3FEE6F0E134454FF; +sub.f64 fd295, fd293, fd294; +sub.f64 fd296, fd292, fd295; +add.f64 fd297, fd295, fd292; +fma.rn.f64 fd298, fd277, 0d3FD3C6EF372FE950, fd263; +mul.f64 fd299, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd300, fd298, fd299; +sub.f64 fd301, fd255, fd261; +mul.f64 fd302, fd301, 0d3FEE6F0E134454FF; +sub.f64 fd303, fd257, fd259; +fma.rn.f64 fd304, fd303, 0d3FE2CF2304755A5E, fd302; +add.f64 fd305, fd304, fd300; +sub.f64 fd306, fd300, fd304; +mul.f64 fd307, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd308, fd263, fd307; +fma.rn.f64 fd309, fd279, 0d3FD3C6EF372FE950, fd308; +mul.f64 fd310, fd301, 0d3FE2CF2304755A5E; +mul.f64 fd311, fd303, 0d3FEE6F0E134454FF; +sub.f64 fd312, fd310, fd311; +add.f64 fd313, fd312, fd309; +sub.f64 fd314, fd309, fd312; +add.f64 fd315, fd256, fd262; +add.f64 fd316, fd254, fd315; +add.f64 fd317, fd258, 
fd260; +add.f64 fd318, fd317, fd316; +add.f64 fd319, fd266, fd272; +add.f64 fd320, fd264, fd319; +add.f64 fd321, fd268, fd270; +add.f64 fd322, fd321, fd320; +fma.rn.f64 fd323, fd315, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd324, fd317, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd266, fd272; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd268, fd270; +fma.rn.f64 fd329, fd328, 0d3FE2CF2304755A5E, fd327; +sub.f64 fd330, fd325, fd329; +add.f64 fd331, fd329, fd325; +mul.f64 fd332, fd315, 0d3FE9E3779B97F4A8; +sub.f64 fd333, fd254, fd332; +fma.rn.f64 fd334, fd317, 0d3FD3C6EF372FE950, fd333; +mul.f64 fd335, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd336, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd337, fd335, fd336; +sub.f64 fd338, fd334, fd337; +add.f64 fd339, fd337, fd334; +fma.rn.f64 fd340, fd319, 0d3FD3C6EF372FE950, fd264; +mul.f64 fd341, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd342, fd340, fd341; +sub.f64 fd343, fd256, fd262; +mul.f64 fd344, fd343, 0d3FEE6F0E134454FF; +sub.f64 fd345, fd258, fd260; +fma.rn.f64 fd346, fd345, 0d3FE2CF2304755A5E, fd344; +add.f64 fd347, fd346, fd342; +sub.f64 fd348, fd342, fd346; +mul.f64 fd349, fd319, 0d3FE9E3779B97F4A8; +sub.f64 fd350, fd264, fd349; +fma.rn.f64 fd351, fd321, 0d3FD3C6EF372FE950, fd350; +mul.f64 fd352, fd343, 0d3FE2CF2304755A5E; +mul.f64 fd353, fd345, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd354, fd351; +sub.f64 fd356, fd351, fd354; +mul.f64 fd357, fd330, 0d3FE9E3779B97F4A8; +mul.f64 fd358, fd347, 0d3FE2CF2304755A5E; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd347, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd361, fd330, 0d3FE2CF2304755A5E, fd360; +mul.f64 fd362, fd338, 0d3FD3C6EF372FE950; +mul.f64 fd363, fd355, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd355, 0d3FD3C6EF372FE950; +fma.rn.f64 fd366, fd338, 0d3FEE6F0E134454FF, fd365; +mul.f64 fd367, fd339, 0dBFD3C6EF372FE950; +mul.f64 fd368, fd356, 0d3FEE6F0E134454FF; +sub.f64 fd369, fd367, fd368; +mul.f64 
fd370, fd356, 0dBFD3C6EF372FE950; +fma.rn.f64 fd371, fd339, 0d3FEE6F0E134454FF, fd370; +mul.f64 fd372, fd331, 0dBFE9E3779B97F4A8; +mul.f64 fd373, fd348, 0d3FE2CF2304755A5E; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd348, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd376, fd331, 0d3FE2CF2304755A5E, fd375; +add.f64 fd377, fd276, fd318; +add.f64 fd378, fd280, fd322; +sub.f64 fd379, fd276, fd318; +sub.f64 fd380, fd280, fd322; +add.f64 fd381, fd288, fd359; +add.f64 fd382, fd305, fd361; +sub.f64 fd383, fd288, fd359; +sub.f64 fd384, fd305, fd361; +add.f64 fd385, fd296, fd364; +add.f64 fd386, fd313, fd366; +sub.f64 fd387, fd296, fd364; +sub.f64 fd388, fd313, fd366; +add.f64 fd389, fd297, fd369; +add.f64 fd390, fd314, fd371; +sub.f64 fd391, fd297, fd369; +sub.f64 fd392, fd314, fd371; +add.f64 fd393, fd289, fd374; +add.f64 fd394, fd306, fd376; +sub.f64 fd395, fd289, fd374; +sub.f64 fd396, fd306, fd376; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd397, fd398}, [rd11]; +mul.f64 fd401, fd382, fd398; +fma.rn.f64 fd402, fd397, fd381, fd401; +mul.f64 fd403, fd381, fd398; +mul.f64 fd404, fd397, fd382; +sub.f64 fd405, fd404, fd403; +mul.f64 fd406, fd397, fd397; +mul.f64 fd407, fd398, fd398; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd398, fd397; +fma.rn.f64 fd410, fd398, fd397, fd409; +mul.f64 fd411, fd386, fd410; +fma.rn.f64 fd412, fd408, fd385, fd411; +mul.f64 fd413, fd385, fd410; +mul.f64 fd414, fd408, fd386; +sub.f64 fd415, fd414, fd413; +mul.f64 fd416, fd397, fd408; +mul.f64 fd417, fd398, fd410; +sub.f64 fd418, fd416, fd417; +mul.f64 fd419, fd397, fd410; +fma.rn.f64 fd420, fd398, fd408, fd419; +mul.f64 fd421, fd390, fd420; +fma.rn.f64 fd422, fd418, fd389, fd421; +mul.f64 fd423, fd389, fd420; +mul.f64 fd424, fd418, fd390; +sub.f64 fd425, fd424, fd423; +mul.f64 fd426, fd397, fd418; +mul.f64 fd427, fd398, 
fd420; +sub.f64 fd428, fd426, fd427; +mul.f64 fd429, fd397, fd420; +fma.rn.f64 fd430, fd398, fd418, fd429; +mul.f64 fd431, fd394, fd430; +fma.rn.f64 fd432, fd428, fd393, fd431; +mul.f64 fd433, fd393, fd430; +mul.f64 fd434, fd428, fd394; +sub.f64 fd435, fd434, fd433; +mul.f64 fd436, fd397, fd428; +mul.f64 fd437, fd398, fd430; +sub.f64 fd438, fd436, fd437; +mul.f64 fd439, fd397, fd430; +fma.rn.f64 fd440, fd398, fd428, fd439; +mul.f64 fd441, fd380, fd440; +fma.rn.f64 fd442, fd438, fd379, fd441; +mul.f64 fd443, fd379, fd440; +mul.f64 fd444, fd438, fd380; +sub.f64 fd445, fd444, fd443; +ld.global.v2.f64 {fd446, fd447}, [rd11+1600]; +mul.f64 fd450, fd384, fd447; +fma.rn.f64 fd451, fd446, fd383, fd450; +mul.f64 fd452, fd383, fd447; +mul.f64 fd453, fd446, fd384; +sub.f64 fd454, fd453, fd452; +mul.f64 fd455, fd397, fd446; +mul.f64 fd456, fd398, fd447; +sub.f64 fd457, fd455, fd456; +mul.f64 fd458, fd397, fd447; +fma.rn.f64 fd459, fd398, fd446, fd458; +mul.f64 fd460, fd388, fd459; +fma.rn.f64 fd461, fd457, fd387, fd460; +mul.f64 fd462, fd387, fd459; +mul.f64 fd463, fd457, fd388; +sub.f64 fd464, fd463, fd462; +mul.f64 fd465, fd397, fd457; +mul.f64 fd466, fd398, fd459; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd397, fd459; +fma.rn.f64 fd469, fd398, fd457, fd468; +mul.f64 fd470, fd392, fd469; +fma.rn.f64 fd471, fd467, fd391, fd470; +mul.f64 fd472, fd391, fd469; +mul.f64 fd473, fd467, fd392; +sub.f64 fd474, fd473, fd472; +mul.f64 fd475, fd397, fd467; +mul.f64 fd476, fd398, fd469; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd397, fd469; +fma.rn.f64 fd479, fd398, fd467, fd478; +mul.f64 fd480, fd396, fd479; +fma.rn.f64 fd481, fd477, fd395, fd480; +mul.f64 fd482, fd395, fd479; +mul.f64 fd483, fd477, fd396; +sub.f64 fd484, fd483, fd482; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +st.shared.f64 [r16], fd377; +st.shared.f64 [r16+80], fd402; +st.shared.f64 [r16+160], fd412; +st.shared.f64 [r16+240], fd422; +st.shared.f64 
[r16+320], fd432; +st.shared.f64 [r16+400], fd442; +st.shared.f64 [r16+480], fd451; +st.shared.f64 [r16+560], fd461; +st.shared.f64 [r16+640], fd471; +st.shared.f64 [r16+720], fd481; +barrier.sync 0; +ld.shared.f64 fd485, [r10]; +ld.shared.f64 fd486, [r10+8000]; +ld.shared.f64 fd487, [r10+16000]; +ld.shared.f64 fd488, [r10+24000]; +ld.shared.f64 fd489, [r10+32000]; +ld.shared.f64 fd490, [r10+40000]; +ld.shared.f64 fd491, [r10+48000]; +ld.shared.f64 fd492, [r10+56000]; +ld.shared.f64 fd493, [r10+64000]; +ld.shared.f64 fd494, [r10+72000]; +barrier.sync 0; +st.shared.f64 [r16], fd378; +st.shared.f64 [r16+80], fd405; +st.shared.f64 [r16+160], fd415; +st.shared.f64 [r16+240], fd425; +st.shared.f64 [r16+320], fd435; +st.shared.f64 [r16+400], fd445; +st.shared.f64 [r16+480], fd454; +st.shared.f64 [r16+560], fd464; +st.shared.f64 [r16+640], fd474; +st.shared.f64 [r16+720], fd484; +barrier.sync 0; +ld.shared.f64 fd495, [r10]; +ld.shared.f64 fd496, [r10+8000]; +ld.shared.f64 fd497, [r10+16000]; +ld.shared.f64 fd498, [r10+24000]; +ld.shared.f64 fd499, [r10+32000]; +ld.shared.f64 fd500, [r10+40000]; +ld.shared.f64 fd501, [r10+48000]; +ld.shared.f64 fd502, [r10+56000]; +ld.shared.f64 fd503, [r10+64000]; +ld.shared.f64 fd504, [r10+72000]; +add.f64 fd505, fd487, fd493; +add.f64 fd506, fd485, fd505; +add.f64 fd507, fd489, fd491; +add.f64 fd508, fd507, fd506; +add.f64 fd509, fd497, fd503; +add.f64 fd510, fd495, fd509; +add.f64 fd511, fd499, fd501; +add.f64 fd512, fd511, fd510; +fma.rn.f64 fd513, fd505, 0d3FD3C6EF372FE950, fd485; +mul.f64 fd514, fd507, 0d3FE9E3779B97F4A8; +sub.f64 fd515, fd513, fd514; +sub.f64 fd516, fd497, fd503; +mul.f64 fd517, fd516, 0d3FEE6F0E134454FF; +sub.f64 fd518, fd499, fd501; +fma.rn.f64 fd519, fd518, 0d3FE2CF2304755A5E, fd517; +sub.f64 fd520, fd515, fd519; +add.f64 fd521, fd519, fd515; +mul.f64 fd522, fd505, 0d3FE9E3779B97F4A8; +sub.f64 fd523, fd485, fd522; +fma.rn.f64 fd524, fd507, 0d3FD3C6EF372FE950, fd523; +mul.f64 fd525, fd516, 0d3FE2CF2304755A5E; 
+mul.f64 fd526, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd527, fd525, fd526; +sub.f64 fd528, fd524, fd527; +add.f64 fd529, fd527, fd524; +fma.rn.f64 fd530, fd509, 0d3FD3C6EF372FE950, fd495; +mul.f64 fd531, fd511, 0d3FE9E3779B97F4A8; +sub.f64 fd532, fd530, fd531; +sub.f64 fd533, fd487, fd493; +mul.f64 fd534, fd533, 0d3FEE6F0E134454FF; +sub.f64 fd535, fd489, fd491; +fma.rn.f64 fd536, fd535, 0d3FE2CF2304755A5E, fd534; +add.f64 fd537, fd536, fd532; +sub.f64 fd538, fd532, fd536; +mul.f64 fd539, fd509, 0d3FE9E3779B97F4A8; +sub.f64 fd540, fd495, fd539; +fma.rn.f64 fd541, fd511, 0d3FD3C6EF372FE950, fd540; +mul.f64 fd542, fd533, 0d3FE2CF2304755A5E; +mul.f64 fd543, fd535, 0d3FEE6F0E134454FF; +sub.f64 fd544, fd542, fd543; +add.f64 fd545, fd544, fd541; +sub.f64 fd546, fd541, fd544; +add.f64 fd547, fd488, fd494; +add.f64 fd548, fd486, fd547; +add.f64 fd549, fd490, fd492; +add.f64 fd550, fd549, fd548; +add.f64 fd551, fd498, fd504; +add.f64 fd552, fd496, fd551; +add.f64 fd553, fd500, fd502; +add.f64 fd554, fd553, fd552; +fma.rn.f64 fd555, fd547, 0d3FD3C6EF372FE950, fd486; +mul.f64 fd556, fd549, 0d3FE9E3779B97F4A8; +sub.f64 fd557, fd555, fd556; +sub.f64 fd558, fd498, fd504; +mul.f64 fd559, fd558, 0d3FEE6F0E134454FF; +sub.f64 fd560, fd500, fd502; +fma.rn.f64 fd561, fd560, 0d3FE2CF2304755A5E, fd559; +sub.f64 fd562, fd557, fd561; +add.f64 fd563, fd561, fd557; +mul.f64 fd564, fd547, 0d3FE9E3779B97F4A8; +sub.f64 fd565, fd486, fd564; +fma.rn.f64 fd566, fd549, 0d3FD3C6EF372FE950, fd565; +mul.f64 fd567, fd558, 0d3FE2CF2304755A5E; +mul.f64 fd568, fd560, 0d3FEE6F0E134454FF; +sub.f64 fd569, fd567, fd568; +sub.f64 fd570, fd566, fd569; +add.f64 fd571, fd569, fd566; +fma.rn.f64 fd572, fd551, 0d3FD3C6EF372FE950, fd496; +mul.f64 fd573, fd553, 0d3FE9E3779B97F4A8; +sub.f64 fd574, fd572, fd573; +sub.f64 fd575, fd488, fd494; +mul.f64 fd576, fd575, 0d3FEE6F0E134454FF; +sub.f64 fd577, fd490, fd492; +fma.rn.f64 fd578, fd577, 0d3FE2CF2304755A5E, fd576; +add.f64 fd579, fd578, fd574; +sub.f64 fd580, fd574, 
fd578; +mul.f64 fd581, fd551, 0d3FE9E3779B97F4A8; +sub.f64 fd582, fd496, fd581; +fma.rn.f64 fd583, fd553, 0d3FD3C6EF372FE950, fd582; +mul.f64 fd584, fd575, 0d3FE2CF2304755A5E; +mul.f64 fd585, fd577, 0d3FEE6F0E134454FF; +sub.f64 fd586, fd584, fd585; +add.f64 fd587, fd586, fd583; +sub.f64 fd588, fd583, fd586; +mul.f64 fd589, fd562, 0d3FE9E3779B97F4A8; +mul.f64 fd590, fd579, 0d3FE2CF2304755A5E; +sub.f64 fd591, fd589, fd590; +mul.f64 fd592, fd579, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd593, fd562, 0d3FE2CF2304755A5E, fd592; +mul.f64 fd594, fd570, 0d3FD3C6EF372FE950; +mul.f64 fd595, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd596, fd594, fd595; +mul.f64 fd597, fd587, 0d3FD3C6EF372FE950; +fma.rn.f64 fd598, fd570, 0d3FEE6F0E134454FF, fd597; +mul.f64 fd599, fd571, 0dBFD3C6EF372FE950; +mul.f64 fd600, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd588, 0dBFD3C6EF372FE950; +fma.rn.f64 fd603, fd571, 0d3FEE6F0E134454FF, fd602; +mul.f64 fd604, fd563, 0dBFE9E3779B97F4A8; +mul.f64 fd605, fd580, 0d3FE2CF2304755A5E; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd580, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd608, fd563, 0d3FE2CF2304755A5E, fd607; +add.f64 fd609, fd508, fd550; +add.f64 fd610, fd512, fd554; +sub.f64 fd611, fd508, fd550; +sub.f64 fd612, fd512, fd554; +add.f64 fd613, fd520, fd591; +add.f64 fd614, fd537, fd593; +sub.f64 fd615, fd520, fd591; +sub.f64 fd616, fd537, fd593; +add.f64 fd617, fd528, fd596; +add.f64 fd618, fd545, fd598; +sub.f64 fd619, fd528, fd596; +sub.f64 fd620, fd545, fd598; +add.f64 fd621, fd529, fd601; +add.f64 fd622, fd546, fd603; +sub.f64 fd623, fd529, fd601; +sub.f64 fd624, fd546, fd603; +add.f64 fd625, fd521, fd606; +add.f64 fd626, fd538, fd608; +sub.f64 fd627, fd521, fd606; +sub.f64 fd628, fd538, fd608; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 37; +cvt.u32.u64 r17, rd13; +mul.lo.s32 r18, r17, 100; +sub.s32 r19, r7, r18; +mul.wide.u32 rd14, r17, 16; +mov.u64 rd15, %23; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd629, 
fd630}, [rd16]; +mul.f64 fd633, fd614, fd630; +fma.rn.f64 fd634, fd629, fd613, fd633; +mul.f64 fd635, fd613, fd630; +mul.f64 fd636, fd629, fd614; +sub.f64 fd637, fd636, fd635; +mul.f64 fd638, fd629, fd629; +mul.f64 fd639, fd630, fd630; +sub.f64 fd640, fd638, fd639; +mul.f64 fd641, fd630, fd629; +fma.rn.f64 fd642, fd630, fd629, fd641; +mul.f64 fd643, fd618, fd642; +fma.rn.f64 fd644, fd640, fd617, fd643; +mul.f64 fd645, fd617, fd642; +mul.f64 fd646, fd640, fd618; +sub.f64 fd647, fd646, fd645; +mul.f64 fd648, fd629, fd640; +mul.f64 fd649, fd630, fd642; +sub.f64 fd650, fd648, fd649; +mul.f64 fd651, fd629, fd642; +fma.rn.f64 fd652, fd630, fd640, fd651; +mul.f64 fd653, fd622, fd652; +fma.rn.f64 fd654, fd650, fd621, fd653; +mul.f64 fd655, fd621, fd652; +mul.f64 fd656, fd650, fd622; +sub.f64 fd657, fd656, fd655; +mul.f64 fd658, fd629, fd650; +mul.f64 fd659, fd630, fd652; +sub.f64 fd660, fd658, fd659; +mul.f64 fd661, fd629, fd652; +fma.rn.f64 fd662, fd630, fd650, fd661; +mul.f64 fd663, fd626, fd662; +fma.rn.f64 fd664, fd660, fd625, fd663; +mul.f64 fd665, fd625, fd662; +mul.f64 fd666, fd660, fd626; +sub.f64 fd667, fd666, fd665; +mul.f64 fd668, fd629, fd660; +mul.f64 fd669, fd630, fd662; +sub.f64 fd670, fd668, fd669; +mul.f64 fd671, fd629, fd662; +fma.rn.f64 fd672, fd630, fd660, fd671; +mul.f64 fd673, fd612, fd672; +fma.rn.f64 fd674, fd670, fd611, fd673; +mul.f64 fd675, fd611, fd672; +mul.f64 fd676, fd670, fd612; +sub.f64 fd677, fd676, fd675; +ld.global.v2.f64 {fd678, fd679}, [rd16+160]; +mul.f64 fd682, fd616, fd679; +fma.rn.f64 fd683, fd678, fd615, fd682; +mul.f64 fd684, fd615, fd679; +mul.f64 fd685, fd678, fd616; +sub.f64 fd686, fd685, fd684; +mul.f64 fd687, fd629, fd678; +mul.f64 fd688, fd630, fd679; +sub.f64 fd689, fd687, fd688; +mul.f64 fd690, fd629, fd679; +fma.rn.f64 fd691, fd630, fd678, fd690; +mul.f64 fd692, fd620, fd691; +fma.rn.f64 fd693, fd689, fd619, fd692; +mul.f64 fd694, fd619, fd691; +mul.f64 fd695, fd689, fd620; +sub.f64 fd696, fd695, fd694; +mul.f64 fd697, 
fd629, fd689; +mul.f64 fd698, fd630, fd691; +sub.f64 fd699, fd697, fd698; +mul.f64 fd700, fd629, fd691; +fma.rn.f64 fd701, fd630, fd689, fd700; +mul.f64 fd702, fd624, fd701; +fma.rn.f64 fd703, fd699, fd623, fd702; +mul.f64 fd704, fd623, fd701; +mul.f64 fd705, fd699, fd624; +sub.f64 fd706, fd705, fd704; +mul.f64 fd707, fd629, fd699; +mul.f64 fd708, fd630, fd701; +sub.f64 fd709, fd707, fd708; +mul.f64 fd710, fd629, fd701; +fma.rn.f64 fd711, fd630, fd699, fd710; +mul.f64 fd712, fd628, fd711; +fma.rn.f64 fd713, fd709, fd627, fd712; +mul.f64 fd714, fd627, fd711; +mul.f64 fd715, fd709, fd628; +sub.f64 fd716, fd715, fd714; +shl.b32 r20, r19, 3; +add.s32 r21, r8, r20; +barrier.sync 0; +mad.lo.s32 r22, r17, 8000, r21; +st.shared.f64 [r22], fd609; +st.shared.f64 [r22+800], fd634; +st.shared.f64 [r22+1600], fd644; +st.shared.f64 [r22+2400], fd654; +st.shared.f64 [r22+3200], fd664; +st.shared.f64 [r22+4000], fd674; +st.shared.f64 [r22+4800], fd683; +st.shared.f64 [r22+5600], fd693; +st.shared.f64 [r22+6400], fd703; +st.shared.f64 [r22+7200], fd713; +barrier.sync 0; +ld.shared.f64 fd717, [r10]; +ld.shared.f64 fd718, [r10+8000]; +ld.shared.f64 fd719, [r10+16000]; +ld.shared.f64 fd720, [r10+24000]; +ld.shared.f64 fd721, [r10+32000]; +ld.shared.f64 fd722, [r10+40000]; +ld.shared.f64 fd723, [r10+48000]; +ld.shared.f64 fd724, [r10+56000]; +ld.shared.f64 fd725, [r10+64000]; +ld.shared.f64 fd726, [r10+72000]; +barrier.sync 0; +st.shared.f64 [r22], fd610; +st.shared.f64 [r22+800], fd637; +st.shared.f64 [r22+1600], fd647; +st.shared.f64 [r22+2400], fd657; +st.shared.f64 [r22+3200], fd667; +st.shared.f64 [r22+4000], fd677; +st.shared.f64 [r22+4800], fd686; +st.shared.f64 [r22+5600], fd696; +st.shared.f64 [r22+6400], fd706; +st.shared.f64 [r22+7200], fd716; +barrier.sync 0; +ld.shared.f64 fd727, [r10]; +ld.shared.f64 fd728, [r10+8000]; +ld.shared.f64 fd729, [r10+16000]; +ld.shared.f64 fd730, [r10+24000]; +ld.shared.f64 fd731, [r10+32000]; +ld.shared.f64 fd732, [r10+40000]; +ld.shared.f64 
fd733, [r10+48000]; +ld.shared.f64 fd734, [r10+56000]; +ld.shared.f64 fd735, [r10+64000]; +ld.shared.f64 fd736, [r10+72000]; +add.f64 fd737, fd719, fd725; +add.f64 fd738, fd717, fd737; +add.f64 fd739, fd721, fd723; +add.f64 fd740, fd739, fd738; +add.f64 fd741, fd729, fd735; +add.f64 fd742, fd727, fd741; +add.f64 fd743, fd731, fd733; +add.f64 fd744, fd743, fd742; +fma.rn.f64 fd745, fd737, 0d3FD3C6EF372FE950, fd717; +mul.f64 fd746, fd739, 0d3FE9E3779B97F4A8; +sub.f64 fd747, fd745, fd746; +sub.f64 fd748, fd729, fd735; +mul.f64 fd749, fd748, 0d3FEE6F0E134454FF; +sub.f64 fd750, fd731, fd733; +fma.rn.f64 fd751, fd750, 0d3FE2CF2304755A5E, fd749; +sub.f64 fd752, fd747, fd751; +add.f64 fd753, fd751, fd747; +mul.f64 fd754, fd737, 0d3FE9E3779B97F4A8; +sub.f64 fd755, fd717, fd754; +fma.rn.f64 fd756, fd739, 0d3FD3C6EF372FE950, fd755; +mul.f64 fd757, fd748, 0d3FE2CF2304755A5E; +mul.f64 fd758, fd750, 0d3FEE6F0E134454FF; +sub.f64 fd759, fd757, fd758; +sub.f64 fd760, fd756, fd759; +add.f64 fd761, fd759, fd756; +fma.rn.f64 fd762, fd741, 0d3FD3C6EF372FE950, fd727; +mul.f64 fd763, fd743, 0d3FE9E3779B97F4A8; +sub.f64 fd764, fd762, fd763; +sub.f64 fd765, fd719, fd725; +mul.f64 fd766, fd765, 0d3FEE6F0E134454FF; +sub.f64 fd767, fd721, fd723; +fma.rn.f64 fd768, fd767, 0d3FE2CF2304755A5E, fd766; +add.f64 fd769, fd768, fd764; +sub.f64 fd770, fd764, fd768; +mul.f64 fd771, fd741, 0d3FE9E3779B97F4A8; +sub.f64 fd772, fd727, fd771; +fma.rn.f64 fd773, fd743, 0d3FD3C6EF372FE950, fd772; +mul.f64 fd774, fd765, 0d3FE2CF2304755A5E; +mul.f64 fd775, fd767, 0d3FEE6F0E134454FF; +sub.f64 fd776, fd774, fd775; +add.f64 fd777, fd776, fd773; +sub.f64 fd778, fd773, fd776; +add.f64 fd779, fd720, fd726; +add.f64 fd780, fd718, fd779; +add.f64 fd781, fd722, fd724; +add.f64 fd782, fd781, fd780; +add.f64 fd783, fd730, fd736; +add.f64 fd784, fd728, fd783; +add.f64 fd785, fd732, fd734; +add.f64 fd786, fd785, fd784; +fma.rn.f64 fd787, fd779, 0d3FD3C6EF372FE950, fd718; +mul.f64 fd788, fd781, 0d3FE9E3779B97F4A8; +sub.f64 
fd789, fd787, fd788; +sub.f64 fd790, fd730, fd736; +mul.f64 fd791, fd790, 0d3FEE6F0E134454FF; +sub.f64 fd792, fd732, fd734; +fma.rn.f64 fd793, fd792, 0d3FE2CF2304755A5E, fd791; +sub.f64 fd794, fd789, fd793; +add.f64 fd795, fd793, fd789; +mul.f64 fd796, fd779, 0d3FE9E3779B97F4A8; +sub.f64 fd797, fd718, fd796; +fma.rn.f64 fd798, fd781, 0d3FD3C6EF372FE950, fd797; +mul.f64 fd799, fd790, 0d3FE2CF2304755A5E; +mul.f64 fd800, fd792, 0d3FEE6F0E134454FF; +sub.f64 fd801, fd799, fd800; +sub.f64 fd802, fd798, fd801; +add.f64 fd803, fd801, fd798; +fma.rn.f64 fd804, fd783, 0d3FD3C6EF372FE950, fd728; +mul.f64 fd805, fd785, 0d3FE9E3779B97F4A8; +sub.f64 fd806, fd804, fd805; +sub.f64 fd807, fd720, fd726; +mul.f64 fd808, fd807, 0d3FEE6F0E134454FF; +sub.f64 fd809, fd722, fd724; +fma.rn.f64 fd810, fd809, 0d3FE2CF2304755A5E, fd808; +add.f64 fd811, fd810, fd806; +sub.f64 fd812, fd806, fd810; +mul.f64 fd813, fd783, 0d3FE9E3779B97F4A8; +sub.f64 fd814, fd728, fd813; +fma.rn.f64 fd815, fd785, 0d3FD3C6EF372FE950, fd814; +mul.f64 fd816, fd807, 0d3FE2CF2304755A5E; +mul.f64 fd817, fd809, 0d3FEE6F0E134454FF; +sub.f64 fd818, fd816, fd817; +add.f64 fd819, fd818, fd815; +sub.f64 fd820, fd815, fd818; +mul.f64 fd821, fd794, 0d3FE9E3779B97F4A8; +mul.f64 fd822, fd811, 0d3FE2CF2304755A5E; +sub.f64 fd823, fd821, fd822; +mul.f64 fd824, fd811, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd825, fd794, 0d3FE2CF2304755A5E, fd824; +mul.f64 fd826, fd802, 0d3FD3C6EF372FE950; +mul.f64 fd827, fd819, 0d3FEE6F0E134454FF; +sub.f64 fd828, fd826, fd827; +mul.f64 fd829, fd819, 0d3FD3C6EF372FE950; +fma.rn.f64 fd830, fd802, 0d3FEE6F0E134454FF, fd829; +mul.f64 fd831, fd803, 0dBFD3C6EF372FE950; +mul.f64 fd832, fd820, 0d3FEE6F0E134454FF; +sub.f64 fd833, fd831, fd832; +mul.f64 fd834, fd820, 0dBFD3C6EF372FE950; +fma.rn.f64 fd835, fd803, 0d3FEE6F0E134454FF, fd834; +mul.f64 fd836, fd795, 0dBFE9E3779B97F4A8; +mul.f64 fd837, fd812, 0d3FE2CF2304755A5E; +sub.f64 fd838, fd836, fd837; +mul.f64 fd839, fd812, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd840, 
fd795, 0d3FE2CF2304755A5E, fd839; +add.f64 %0, fd740, fd782; +add.f64 %1, fd744, fd786; +add.f64 %3, fd769, fd825; +add.f64 %2, fd752, fd823; +add.f64 %5, fd777, fd830; +add.f64 %4, fd760, fd828; +add.f64 %7, fd778, fd835; +add.f64 %6, fd761, fd833; +add.f64 %9, fd770, fd840; +add.f64 %8, fd753, fd838; +sub.f64 %10, fd740, fd782; +sub.f64 %11, fd744, fd786; +sub.f64 %13, fd769, fd825; +sub.f64 %12, fd752, fd823; +sub.f64 %15, fd777, fd830; +sub.f64 %14, fd760, fd828; +sub.f64 %17, fd778, fd835; +sub.f64 %16, fd761, fd833; +sub.f64 %19, fd770, fd840; +sub.f64 %18, fd753, fd838; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_10000), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..29f70caaa1b95 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_fwd.hpp.inc @@ -0,0 +1,7366 @@ +#ifndef CUFFTDX_FFT_1000_FP16_FWD_PTX_HPP +#define 
CUFFTDX_FFT_1000_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<939, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<231>; +.reg .b32 r<3038>; +.reg .b64 rd<7>; +mov.u32 r3019, %tid.y; +shl.b32 r3020, r3019, 1; +mov.u32 r3021, %20; +mad.lo.s32 r3022, r3020, 4000, r3021; +mov.u32 r3023, %tid.x; +mov.f32 f194, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +mov.f32 f202, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r3, {low, high}; +} +mov.f32 f204, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, %37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, 
r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, 
r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ 
+add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 
r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, %35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f190, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r644, {low, high}; +} +mov.f32 f198, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r648, {low, high}; +} +mov.f32 f161, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ 
+fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; +} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ +sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r3023, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r3024, rd3; +mul.lo.s32 r3025, r3024, 100; +sub.s32 r3026, r3023, r3025; +shr.u64 rd4, rd2, 36; +cvt.u32.u64 r3027, rd4; +and.b32 r3028, r3027, 268435454; +mad.lo.s32 r3029, r3028, 4000, r3022; +cvt.rn.f32.u32 f225, r3026; +mul.f32 f226, f225, 0f3BCDE32E; +cos.approx.f32 f61, f226; +sin.approx.f32 f227, f226; +neg.f32 f62, f227; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, 
r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f162, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} +{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 
r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r995, {low, high}; +} +{ +mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ +mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 
r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r3030, r3026, 80, r3029; +st.shared.v2.f32 [r3030], {r723, r726}; +st.shared.v2.f32 [r3030+8], {r795, r802}; +st.shared.v2.f32 [r3030+16], {r832, r839}; +st.shared.v2.f32 [r3030+24], {r869, r876}; +st.shared.v2.f32 [r3030+32], {r906, r913}; +st.shared.v2.f32 [r3030+40], {r943, r950}; +st.shared.v2.f32 [r3030+48], {r980, r987}; +st.shared.v2.f32 [r3030+56], {r1017, r1024}; +st.shared.v2.f32 [r3030+64], {r1054, r1061}; +st.shared.v2.f32 [r3030+72], {r1091, r1098}; +barrier.sync 0; +mad.lo.s32 r3031, r3026, -72, r3030; +ld.shared.u32 r1131, [r3031]; +ld.shared.u32 r1143, [r3031+4]; +ld.shared.u32 r1451, [r3031+800]; +ld.shared.u32 r1463, [r3031+804]; +ld.shared.u32 r1128, [r3031+1600]; +ld.shared.u32 r1140, [r3031+1604]; +ld.shared.u32 r1448, [r3031+2400]; +ld.shared.u32 r1460, [r3031+2404]; +ld.shared.u32 r1134, [r3031+3200]; +ld.shared.u32 r1146, [r3031+3204]; +ld.shared.u32 r1454, [r3031+4000]; +ld.shared.u32 r1466, [r3031+4004]; +ld.shared.u32 r1135, [r3031+4800]; +ld.shared.u32 r1147, [r3031+4804]; +ld.shared.u32 r1455, 
[r3031+5600]; +ld.shared.u32 r1467, [r3031+5604]; +ld.shared.u32 r1129, [r3031+6400]; +ld.shared.u32 r1141, [r3031+6404]; +ld.shared.u32 r1449, [r3031+7200]; +ld.shared.u32 r1461, [r3031+7204]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} 
+{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ +add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} +{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, 
r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ +add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, 
r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ +sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} +{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ 
+sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ +add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ +mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1761, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 r1841, r1136, r1456; +} +{ +add.f16x2 r1844, r1148, r1468; +} +{ +sub.f16x2 r1847, r1136, r1456; +} +{ +sub.f16x2 r1850, r1148, r1468; +} +{ +add.f16x2 r1853, r1184, r1783; +} +{ +add.f16x2 r1856, r1328, r1789; +} +{ +sub.f16x2 r1859, r1184, r1783; +} +{ +sub.f16x2 r1862, r1328, r1789; +} +{ +add.f16x2 r1865, r1256, r1799; +} +{ +add.f16x2 r1868, r1400, r1805; +} +{ +sub.f16x2 r1871, r1256, r1799; +} +{ +sub.f16x2 r1874, r1400, r1805; +} +{ +add.f16x2 r1877, r1292, r1815; +} +{ +add.f16x2 r1880, r1436, r1821; +} +{ +sub.f16x2 r1883, r1292, r1815; +} +{ +sub.f16x2 r1886, r1436, r1821; +} +{ +add.f16x2 r1889, r1220, r1831; +} +{ +add.f16x2 r1892, r1364, r1837; +} +{ +sub.f16x2 r1895, r1220, r1831; +} +{ +sub.f16x2 r1898, r1364, r1837; +} +mul.wide.u32 
rd5, r3026, -858993459; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r3032, rd6; +cvt.rn.f32.u32 f228, r3032; +mul.f32 f229, f228, 0f3D80ADFD; +cos.approx.f32 f143, f229; +sin.approx.f32 f230, f229; +neg.f32 f144, f230; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1901, {low, high}; +} +mul.lo.s32 r3033, r3032, 10; +sub.s32 r3034, r3026, r3033; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1904, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1906, {high, high}; +} +{ +mul.f16x2 r1908, r1856, r1906; +} +{ +neg.f16x2 r1911, r1908; +} +{ +fma.rn.f16x2 r1913, r1853, r1904, r1911; +} +{ +mul.f16x2 r1917, r1853, r1906; +} +{ +fma.rn.f16x2 r1920, r1856, r1904, r1917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1924, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1926, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1928, {low, high}; +} +{ +mul.f16x2 r1929, r1926, r1928; +} +{ +mul.f16x2 r1932, r1901, r1924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1935, {high, low}; +} +{ +fma.rn.f16x2 r1937, r1929, r1935, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1941, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1943, {high, high}; +} +{ +mul.f16x2 r1945, r1868, r1943; +} +{ +neg.f16x2 r1948, r1945; +} +{ +fma.rn.f16x2 r1950, r1865, r1941, r1948; +} +{ +mul.f16x2 r1954, r1865, r1943; +} +{ +fma.rn.f16x2 r1957, r1868, r1941, r1954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1961, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1963, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1965, {low, high}; +} +{ +mul.f16x2 r1966, r1963, r1965; +} +{ +mul.f16x2 r1969, r1937, r1961; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1972, {high, low}; +} +{ +fma.rn.f16x2 r1974, r1966, r1972, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1978, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1980, {high, high}; +} +{ +mul.f16x2 r1982, r1880, r1980; +} +{ +neg.f16x2 r1985, r1982; +} +{ +fma.rn.f16x2 r1987, r1877, r1978, r1985; +} +{ +mul.f16x2 r1991, r1877, r1980; +} +{ +fma.rn.f16x2 r1994, r1880, r1978, r1991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1998, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2000, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2002, {low, high}; +} +{ +mul.f16x2 r2003, r2000, r2002; +} +{ +mul.f16x2 r2006, r1974, r1998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r2009, {high, low}; +} +{ +fma.rn.f16x2 r2011, r2003, r2009, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2015, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2017, {high, high}; +} +{ +mul.f16x2 r2019, r1892, r2017; +} +{ +neg.f16x2 r2022, r2019; +} +{ +fma.rn.f16x2 r2024, r1889, r2015, r2022; +} +{ +mul.f16x2 r2028, r1889, r2017; +} +{ +fma.rn.f16x2 r2031, r1892, r2015, r2028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2035, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2037, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2039, {low, high}; +} +{ +mul.f16x2 r2040, r2037, r2039; +} +{ +mul.f16x2 r2043, r2011, r2035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2046, {high, low}; +} +{ +fma.rn.f16x2 r2048, r2040, r2046, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2052, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2048; +mov.b32 r2054, {high, high}; +} +{ +mul.f16x2 r2056, r1850, r2054; +} +{ +neg.f16x2 r2059, r2056; +} +{ +fma.rn.f16x2 r2061, r1847, r2052, r2059; +} +{ +mul.f16x2 r2065, r1847, r2054; +} +{ +fma.rn.f16x2 r2068, r1850, r2052, r2065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2072, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2074, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2076, {low, high}; +} +{ +mul.f16x2 r2077, r2074, r2076; +} +{ +mul.f16x2 r2080, r2048, r2072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2083, {high, low}; +} +{ +fma.rn.f16x2 r2085, r2077, r2083, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2089, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2091, {high, high}; +} +{ +mul.f16x2 r2093, r1862, r2091; +} +{ +neg.f16x2 r2096, r2093; +} +{ +fma.rn.f16x2 r2098, r1859, r2089, r2096; +} +{ +mul.f16x2 r2102, r1859, r2091; +} +{ +fma.rn.f16x2 r2105, r1862, r2089, r2102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2109, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2111, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2113, {low, high}; +} +{ +mul.f16x2 r2114, r2111, r2113; +} +{ +mul.f16x2 r2117, r2085, r2109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2120, {high, low}; +} +{ +fma.rn.f16x2 r2122, r2114, r2120, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2128, {high, high}; +} +{ +mul.f16x2 r2130, r1874, r2128; +} +{ +neg.f16x2 r2133, r2130; +} +{ +fma.rn.f16x2 r2135, r1871, r2126, r2133; +} +{ +mul.f16x2 r2139, r1871, r2128; +} +{ +fma.rn.f16x2 r2142, r1874, r2126, r2139; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1901; +mov.b32 r2146, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2148, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2150, {low, high}; +} +{ +mul.f16x2 r2151, r2148, r2150; +} +{ +mul.f16x2 r2154, r2122, r2146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2157, {high, low}; +} +{ +fma.rn.f16x2 r2159, r2151, r2157, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2163, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2165, {high, high}; +} +{ +mul.f16x2 r2167, r1886, r2165; +} +{ +neg.f16x2 r2170, r2167; +} +{ +fma.rn.f16x2 r2172, r1883, r2163, r2170; +} +{ +mul.f16x2 r2176, r1883, r2165; +} +{ +fma.rn.f16x2 r2179, r1886, r2163, r2176; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2183, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2185, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2187, {low, high}; +} +{ +mul.f16x2 r2188, r2185, r2187; +} +{ +mul.f16x2 r2191, r2159, r2183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2194, {high, low}; +} +{ +fma.rn.f16x2 r2196, r2188, r2194, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2202, {high, high}; +} +{ +mul.f16x2 r2204, r1898, r2202; +} +{ +neg.f16x2 r2207, r2204; +} +{ +fma.rn.f16x2 r2209, r1895, r2200, r2207; +} +{ +mul.f16x2 r2213, r1895, r2202; +} +{ +fma.rn.f16x2 r2216, r1898, r2200, r2213; +} +shl.b32 r3035, r3034, 3; +add.s32 r3036, r3029, r3035; +barrier.sync 0; +mad.lo.s32 r3037, r3032, 800, r3036; +st.shared.u32 [r3037], r1841; +st.shared.u32 [r3037+4], r1844; +st.shared.u32 [r3037+80], r1913; +st.shared.u32 [r3037+84], r1920; +st.shared.u32 [r3037+160], r1950; 
+st.shared.u32 [r3037+164], r1957; +st.shared.u32 [r3037+240], r1987; +st.shared.u32 [r3037+244], r1994; +st.shared.u32 [r3037+320], r2024; +st.shared.u32 [r3037+324], r2031; +st.shared.u32 [r3037+400], r2061; +st.shared.u32 [r3037+404], r2068; +st.shared.u32 [r3037+480], r2098; +st.shared.u32 [r3037+484], r2105; +st.shared.u32 [r3037+560], r2135; +st.shared.u32 [r3037+564], r2142; +st.shared.u32 [r3037+640], r2172; +st.shared.u32 [r3037+644], r2179; +st.shared.u32 [r3037+720], r2209; +st.shared.u32 [r3037+724], r2216; +barrier.sync 0; +ld.shared.u32 r2249, [r3031]; +ld.shared.u32 r2261, [r3031+4]; +ld.shared.u32 r2569, [r3031+800]; +ld.shared.u32 r2581, [r3031+804]; +ld.shared.u32 r2246, [r3031+1600]; +ld.shared.u32 r2258, [r3031+1604]; +ld.shared.u32 r2566, [r3031+2400]; +ld.shared.u32 r2578, [r3031+2404]; +ld.shared.u32 r2252, [r3031+3200]; +ld.shared.u32 r2264, [r3031+3204]; +ld.shared.u32 r2572, [r3031+4000]; +ld.shared.u32 r2584, [r3031+4004]; +ld.shared.u32 r2253, [r3031+4800]; +ld.shared.u32 r2265, [r3031+4804]; +ld.shared.u32 r2573, [r3031+5600]; +ld.shared.u32 r2585, [r3031+5604]; +ld.shared.u32 r2247, [r3031+6400]; +ld.shared.u32 r2259, [r3031+6404]; +ld.shared.u32 r2567, [r3031+7200]; +ld.shared.u32 r2579, [r3031+7204]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +add.f16x2 r2245, r2246, r2247; +} +{ 
+add.f16x2 r2248, r2249, r2245; +} +{ +add.f16x2 r2251, r2252, r2253; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r2258, r2259; +} +{ +add.f16x2 r2260, r2261, r2257; +} +{ +add.f16x2 r2263, r2264, r2265; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, r2246, r2247; +} +{ +mul.f16x2 r2272, r2269, r2237; +} +{ +add.f16x2 r2275, r2249, r2272; +} +{ +add.f16x2 r2278, r2252, r2253; +} +{ +mul.f16x2 r2281, r2278, r2239; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +sub.f16x2 r2287, r2258, r2259; +} +{ +mul.f16x2 r2290, r2287, r2238; +} +{ +sub.f16x2 r2293, r2264, r2265; +} +{ +mul.f16x2 r2296, r2293, r2240; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r2284, r2299; +} +{ +add.f16x2 r2305, r2246, r2247; +} +{ +mul.f16x2 r2308, r2305, r2237; +} +{ +add.f16x2 r2311, r2249, r2308; +} +{ +add.f16x2 r2314, r2252, r2253; +} +{ +mul.f16x2 r2317, r2314, r2239; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r2258, r2259; +} +{ +mul.f16x2 r2326, r2323, r2238; +} +{ +sub.f16x2 r2329, r2264, r2265; +} +{ +mul.f16x2 r2332, r2329, r2240; +} +{ +add.f16x2 r2335, r2326, r2332; +} +{ +add.f16x2 r2338, r2320, r2335; +} +{ +add.f16x2 r2341, r2246, r2247; +} +{ +mul.f16x2 r2344, r2341, r2239; +} +{ +add.f16x2 r2347, r2249, r2344; +} +{ +add.f16x2 r2350, r2252, r2253; +} +{ +mul.f16x2 r2353, r2350, r2241; +} +{ +add.f16x2 r2356, r2347, r2353; +} +{ +sub.f16x2 r2359, r2258, r2259; +} +{ +mul.f16x2 r2362, r2359, r2240; +} +{ +sub.f16x2 r2365, r2264, r2265; +} +{ +mul.f16x2 r2368, r2365, r2243; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +sub.f16x2 r2374, r2356, r2371; +} +{ +add.f16x2 r2377, r2246, r2247; +} +{ +mul.f16x2 r2380, r2377, r2239; +} +{ +add.f16x2 r2383, r2249, r2380; +} +{ +add.f16x2 r2386, r2252, r2253; +} +{ +mul.f16x2 r2389, r2386, r2241; +} +{ +add.f16x2 r2392, r2383, r2389; +} +{ +sub.f16x2 r2395, r2258, r2259; +} +{ +mul.f16x2 r2398, r2395, r2240; +} +{ +sub.f16x2 r2401, r2264, r2265; +} +{ +mul.f16x2 r2404, 
r2401, r2243; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +add.f16x2 r2410, r2392, r2407; +} +{ +add.f16x2 r2413, r2258, r2259; +} +{ +mul.f16x2 r2416, r2413, r2237; +} +{ +add.f16x2 r2419, r2261, r2416; +} +{ +add.f16x2 r2422, r2264, r2265; +} +{ +mul.f16x2 r2425, r2422, r2239; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r2246, r2247; +} +{ +mul.f16x2 r2434, r2431, r2238; +} +{ +sub.f16x2 r2437, r2252, r2253; +} +{ +mul.f16x2 r2440, r2437, r2240; +} +{ +add.f16x2 r2443, r2434, r2440; +} +{ +add.f16x2 r2446, r2428, r2443; +} +{ +add.f16x2 r2449, r2258, r2259; +} +{ +mul.f16x2 r2452, r2449, r2237; +} +{ +add.f16x2 r2455, r2261, r2452; +} +{ +add.f16x2 r2458, r2264, r2265; +} +{ +mul.f16x2 r2461, r2458, r2239; +} +{ +add.f16x2 r2464, r2455, r2461; +} +{ +sub.f16x2 r2467, r2246, r2247; +} +{ +mul.f16x2 r2470, r2467, r2238; +} +{ +sub.f16x2 r2473, r2252, r2253; +} +{ +mul.f16x2 r2476, r2473, r2240; +} +{ +add.f16x2 r2479, r2470, r2476; +} +{ +sub.f16x2 r2482, r2464, r2479; +} +{ +add.f16x2 r2485, r2258, r2259; +} +{ +mul.f16x2 r2488, r2485, r2239; +} +{ +add.f16x2 r2491, r2261, r2488; +} +{ +add.f16x2 r2494, r2264, r2265; +} +{ +mul.f16x2 r2497, r2494, r2241; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +sub.f16x2 r2503, r2246, r2247; +} +{ +mul.f16x2 r2506, r2503, r2240; +} +{ +sub.f16x2 r2509, r2252, r2253; +} +{ +mul.f16x2 r2512, r2509, r2243; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +add.f16x2 r2518, r2500, r2515; +} +{ +add.f16x2 r2521, r2258, r2259; +} +{ +mul.f16x2 r2524, r2521, r2239; +} +{ +add.f16x2 r2527, r2261, r2524; +} +{ +add.f16x2 r2530, r2264, r2265; +} +{ +mul.f16x2 r2533, r2530, r2241; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r2246, r2247; +} +{ +mul.f16x2 r2542, r2539, r2240; +} +{ +sub.f16x2 r2545, r2252, r2253; +} +{ +mul.f16x2 r2548, r2545, r2243; +} +{ +add.f16x2 r2551, r2542, r2548; +} +{ +sub.f16x2 r2554, r2536, r2551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; 
+mov.b32 r2557, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2558, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2562, {low, high}; +} +{ +neg.f16x2 r2563, r2562; +} +{ +add.f16x2 r2565, r2566, r2567; +} +{ +add.f16x2 r2568, r2569, r2565; +} +{ +add.f16x2 r2571, r2572, r2573; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2566, r2567; +} +{ +mul.f16x2 r2592, r2589, r2557; +} +{ +add.f16x2 r2595, r2569, r2592; +} +{ +add.f16x2 r2598, r2572, r2573; +} +{ +mul.f16x2 r2601, r2598, r2559; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +sub.f16x2 r2607, r2578, r2579; +} +{ +mul.f16x2 r2610, r2607, r2558; +} +{ +sub.f16x2 r2613, r2584, r2585; +} +{ +mul.f16x2 r2616, r2613, r2560; +} +{ +add.f16x2 r2619, r2610, r2616; +} +{ +sub.f16x2 r2622, r2604, r2619; +} +{ +add.f16x2 r2625, r2566, r2567; +} +{ +mul.f16x2 r2628, r2625, r2557; +} +{ +add.f16x2 r2631, r2569, r2628; +} +{ +add.f16x2 r2634, r2572, r2573; +} +{ +mul.f16x2 r2637, r2634, r2559; +} +{ +add.f16x2 r2640, r2631, r2637; +} +{ +sub.f16x2 r2643, r2578, r2579; +} +{ +mul.f16x2 r2646, r2643, r2558; +} +{ +sub.f16x2 r2649, r2584, r2585; +} +{ +mul.f16x2 r2652, r2649, r2560; +} +{ +add.f16x2 r2655, r2646, r2652; +} +{ +add.f16x2 r2658, r2640, r2655; +} +{ +add.f16x2 r2661, r2566, r2567; +} +{ +mul.f16x2 r2664, r2661, r2559; +} +{ +add.f16x2 r2667, r2569, r2664; +} +{ +add.f16x2 r2670, r2572, r2573; +} +{ 
+mul.f16x2 r2673, r2670, r2561; +} +{ +add.f16x2 r2676, r2667, r2673; +} +{ +sub.f16x2 r2679, r2578, r2579; +} +{ +mul.f16x2 r2682, r2679, r2560; +} +{ +sub.f16x2 r2685, r2584, r2585; +} +{ +mul.f16x2 r2688, r2685, r2563; +} +{ +add.f16x2 r2691, r2682, r2688; +} +{ +sub.f16x2 r2694, r2676, r2691; +} +{ +add.f16x2 r2697, r2566, r2567; +} +{ +mul.f16x2 r2700, r2697, r2559; +} +{ +add.f16x2 r2703, r2569, r2700; +} +{ +add.f16x2 r2706, r2572, r2573; +} +{ +mul.f16x2 r2709, r2706, r2561; +} +{ +add.f16x2 r2712, r2703, r2709; +} +{ +sub.f16x2 r2715, r2578, r2579; +} +{ +mul.f16x2 r2718, r2715, r2560; +} +{ +sub.f16x2 r2721, r2584, r2585; +} +{ +mul.f16x2 r2724, r2721, r2563; +} +{ +add.f16x2 r2727, r2718, r2724; +} +{ +add.f16x2 r2730, r2712, r2727; +} +{ +add.f16x2 r2733, r2578, r2579; +} +{ +mul.f16x2 r2736, r2733, r2557; +} +{ +add.f16x2 r2739, r2581, r2736; +} +{ +add.f16x2 r2742, r2584, r2585; +} +{ +mul.f16x2 r2745, r2742, r2559; +} +{ +add.f16x2 r2748, r2739, r2745; +} +{ +sub.f16x2 r2751, r2566, r2567; +} +{ +mul.f16x2 r2754, r2751, r2558; +} +{ +sub.f16x2 r2757, r2572, r2573; +} +{ +mul.f16x2 r2760, r2757, r2560; +} +{ +add.f16x2 r2763, r2754, r2760; +} +{ +add.f16x2 r2766, r2748, r2763; +} +{ +add.f16x2 r2769, r2578, r2579; +} +{ +mul.f16x2 r2772, r2769, r2557; +} +{ +add.f16x2 r2775, r2581, r2772; +} +{ +add.f16x2 r2778, r2584, r2585; +} +{ +mul.f16x2 r2781, r2778, r2559; +} +{ +add.f16x2 r2784, r2775, r2781; +} +{ +sub.f16x2 r2787, r2566, r2567; +} +{ +mul.f16x2 r2790, r2787, r2558; +} +{ +sub.f16x2 r2793, r2572, r2573; +} +{ +mul.f16x2 r2796, r2793, r2560; +} +{ +add.f16x2 r2799, r2790, r2796; +} +{ +sub.f16x2 r2802, r2784, r2799; +} +{ +add.f16x2 r2805, r2578, r2579; +} +{ +mul.f16x2 r2808, r2805, r2559; +} +{ +add.f16x2 r2811, r2581, r2808; +} +{ +add.f16x2 r2814, r2584, r2585; +} +{ +mul.f16x2 r2817, r2814, r2561; +} +{ +add.f16x2 r2820, r2811, r2817; +} +{ +sub.f16x2 r2823, r2566, r2567; +} +{ +mul.f16x2 r2826, r2823, r2560; +} +{ +sub.f16x2 r2829, 
r2572, r2573; +} +{ +mul.f16x2 r2832, r2829, r2563; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2820, r2835; +} +{ +add.f16x2 r2841, r2578, r2579; +} +{ +mul.f16x2 r2844, r2841, r2559; +} +{ +add.f16x2 r2847, r2581, r2844; +} +{ +add.f16x2 r2850, r2584, r2585; +} +{ +mul.f16x2 r2853, r2850, r2561; +} +{ +add.f16x2 r2856, r2847, r2853; +} +{ +sub.f16x2 r2859, r2566, r2567; +} +{ +mul.f16x2 r2862, r2859, r2560; +} +{ +sub.f16x2 r2865, r2572, r2573; +} +{ +mul.f16x2 r2868, r2865, r2563; +} +{ +add.f16x2 r2871, r2862, r2868; +} +{ +sub.f16x2 r2874, r2856, r2871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r2877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2878, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2884, {low, high}; +} +{ +mul.f16x2 r2895, r2622, r2877; +} +{ +mul.f16x2 r2898, r2766, r2878; +} +{ +sub.f16x2 r2901, r2895, r2898; +} +{ +mul.f16x2 r2904, r2622, r2878; +} +{ +fma.rn.f16x2 r2907, r2766, r2877, r2904; +} +{ +mul.f16x2 r2911, r2694, r2879; +} +{ +mul.f16x2 r2914, r2838, r2880; +} +{ +sub.f16x2 r2917, r2911, r2914; +} +{ +mul.f16x2 r2920, r2694, r2880; +} +{ +fma.rn.f16x2 r2923, r2838, r2879, r2920; +} +{ +mul.f16x2 r2927, r2730, r2881; +} +{ +mul.f16x2 r2930, r2874, r2882; +} +{ +sub.f16x2 r2933, r2927, r2930; +} +{ 
+mul.f16x2 r2936, r2730, r2882; +} +{ +fma.rn.f16x2 r2939, r2874, r2881, r2936; +} +{ +mul.f16x2 r2943, r2658, r2883; +} +{ +mul.f16x2 r2946, r2802, r2884; +} +{ +sub.f16x2 r2949, r2943, r2946; +} +{ +mul.f16x2 r2952, r2658, r2884; +} +{ +fma.rn.f16x2 r2955, r2802, r2883, r2952; +} +{ +add.f16x2 %0, r2254, r2574; +} +{ +add.f16x2 %1, r2266, r2586; +} +{ +sub.f16x2 %10, r2254, r2574; +} +{ +sub.f16x2 %11, r2266, r2586; +} +{ +add.f16x2 %2, r2302, r2901; +} +{ +add.f16x2 %3, r2446, r2907; +} +{ +sub.f16x2 %12, r2302, r2901; +} +{ +sub.f16x2 %13, r2446, r2907; +} +{ +add.f16x2 %4, r2374, r2917; +} +{ +add.f16x2 %5, r2518, r2923; +} +{ +sub.f16x2 %14, r2374, r2917; +} +{ +sub.f16x2 %15, r2518, r2923; +} +{ +add.f16x2 %6, r2410, r2933; +} +{ +add.f16x2 %7, r2554, r2939; +} +{ +sub.f16x2 %16, r2410, r2933; +} +{ +sub.f16x2 %17, r2554, r2939; +} +{ +add.f16x2 %8, r2338, r2949; +} +{ +add.f16x2 %9, r2482, r2955; +} +{ +sub.f16x2 %18, r2338, r2949; +} +{ +sub.f16x2 %19, r2482, r2955; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), 
"r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<940, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<231>; +.reg .b32 r<3035>; +.reg .b64 rd<6>; +mov.u32 r3019, %tid.y; +mov.u32 r3020, %20; +mad.lo.s32 r3021, r3019, 4000, r3020; +mov.u32 r3022, %tid.x; +mov.f32 f194, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +mov.f32 f202, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r3, {low, high}; +} +mov.f32 f204, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, %37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, 
r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; 
+} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, 
r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} 
+{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, %35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f190, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r644, {low, high}; +} +mov.f32 f198, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 
high, f204; +mov.b32 r648, {low, high}; +} +mov.f32 f161, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ +fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; +} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ +sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r3022, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r3023, rd3; +mul.lo.s32 r3024, r3023, 100; +sub.s32 r3025, r3022, r3024; +mad.lo.s32 r3026, r3023, 4000, r3021; +cvt.rn.f32.u32 f225, r3025; +mul.f32 f226, f225, 0f3BCDE32E; +cos.approx.f32 f61, f226; +sin.approx.f32 f227, f226; +neg.f32 f62, f227; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f162, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} +{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r995, {low, high}; +} +{ +mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ 
+mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r3027, r3025, 40, r3026; +st.shared.v2.f32 [r3027], {r723, r795}; +st.shared.v2.f32 [r3027+8], {r832, r869}; +st.shared.v2.f32 [r3027+16], {r906, r943}; +st.shared.v2.f32 [r3027+24], {r980, r1017}; +st.shared.v2.f32 [r3027+32], {r1054, r1091}; +barrier.sync 0; +mad.lo.s32 r3028, r3025, -36, r3027; +ld.shared.u32 r1131, [r3028]; +ld.shared.u32 r1451, [r3028+400]; +ld.shared.u32 r1128, [r3028+800]; +ld.shared.u32 r1448, [r3028+1200]; +ld.shared.u32 r1134, [r3028+1600]; +ld.shared.u32 r1454, [r3028+2000]; +ld.shared.u32 r1135, [r3028+2400]; +ld.shared.u32 r1455, [r3028+2800]; +ld.shared.u32 r1129, [r3028+3200]; +ld.shared.u32 r1449, [r3028+3600]; +barrier.sync 0; +st.shared.v2.f32 [r3027], {r726, r802}; +st.shared.v2.f32 [r3027+8], {r839, r876}; +st.shared.v2.f32 [r3027+16], {r913, r950}; +st.shared.v2.f32 [r3027+24], {r987, r1024}; +st.shared.v2.f32 [r3027+32], {r1061, r1098}; +barrier.sync 0; 
+ld.shared.u32 r1143, [r3028]; +ld.shared.u32 r1463, [r3028+400]; +ld.shared.u32 r1140, [r3028+800]; +ld.shared.u32 r1460, [r3028+1200]; +ld.shared.u32 r1146, [r3028+1600]; +ld.shared.u32 r1466, [r3028+2000]; +ld.shared.u32 r1147, [r3028+2400]; +ld.shared.u32 r1467, [r3028+2800]; +ld.shared.u32 r1141, [r3028+3200]; +ld.shared.u32 r1461, [r3028+3600]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, 
r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ +add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} +{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 
r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ +add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 
r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ +sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} 
+{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ +sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ +add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ +mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1761, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 r1841, r1136, r1456; +} +{ +add.f16x2 r1844, r1148, r1468; +} +{ +sub.f16x2 r1847, r1136, r1456; +} +{ +sub.f16x2 r1850, r1148, r1468; +} +{ +add.f16x2 r1853, r1184, r1783; +} +{ +add.f16x2 r1856, r1328, r1789; +} +{ +sub.f16x2 r1859, r1184, r1783; +} +{ +sub.f16x2 r1862, r1328, r1789; +} +{ +add.f16x2 r1865, r1256, r1799; +} +{ +add.f16x2 r1868, r1400, r1805; +} +{ +sub.f16x2 r1871, r1256, r1799; +} +{ +sub.f16x2 r1874, r1400, r1805; +} +{ +add.f16x2 r1877, r1292, r1815; +} +{ +add.f16x2 r1880, r1436, r1821; +} +{ +sub.f16x2 r1883, r1292, r1815; +} +{ +sub.f16x2 r1886, r1436, r1821; +} +{ +add.f16x2 r1889, 
r1220, r1831; +} +{ +add.f16x2 r1892, r1364, r1837; +} +{ +sub.f16x2 r1895, r1220, r1831; +} +{ +sub.f16x2 r1898, r1364, r1837; +} +mul.wide.u32 rd4, r3025, -858993459; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r3029, rd5; +mul.lo.s32 r3030, r3029, 10; +sub.s32 r3031, r3025, r3030; +shl.b32 r3032, r3031, 2; +add.s32 r3033, r3026, r3032; +cvt.rn.f32.u32 f228, r3029; +mul.f32 f229, f228, 0f3D80ADFD; +cos.approx.f32 f143, f229; +sin.approx.f32 f230, f229; +neg.f32 f144, f230; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1901, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1904, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1906, {high, high}; +} +{ +mul.f16x2 r1908, r1856, r1906; +} +{ +neg.f16x2 r1911, r1908; +} +{ +fma.rn.f16x2 r1913, r1853, r1904, r1911; +} +{ +mul.f16x2 r1917, r1853, r1906; +} +{ +fma.rn.f16x2 r1920, r1856, r1904, r1917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1924, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1926, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1928, {low, high}; +} +{ +mul.f16x2 r1929, r1926, r1928; +} +{ +mul.f16x2 r1932, r1901, r1924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1935, {high, low}; +} +{ +fma.rn.f16x2 r1937, r1929, r1935, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1941, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1943, {high, high}; +} +{ +mul.f16x2 r1945, r1868, r1943; +} +{ +neg.f16x2 r1948, r1945; +} +{ +fma.rn.f16x2 r1950, r1865, r1941, r1948; +} +{ +mul.f16x2 r1954, r1865, r1943; +} +{ +fma.rn.f16x2 r1957, r1868, r1941, r1954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1961, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1963, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1965, {low, high}; +} +{ +mul.f16x2 r1966, r1963, r1965; +} +{ +mul.f16x2 r1969, r1937, r1961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1937; +mov.b32 r1972, {high, low}; +} +{ +fma.rn.f16x2 r1974, r1966, r1972, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1978, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r1980, {high, high}; +} +{ +mul.f16x2 r1982, r1880, r1980; +} +{ +neg.f16x2 r1985, r1982; +} +{ +fma.rn.f16x2 r1987, r1877, r1978, r1985; +} +{ +mul.f16x2 r1991, r1877, r1980; +} +{ +fma.rn.f16x2 r1994, r1880, r1978, r1991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1998, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2000, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2002, {low, high}; +} +{ +mul.f16x2 r2003, r2000, r2002; +} +{ +mul.f16x2 r2006, r1974, r1998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1974; +mov.b32 r2009, {high, low}; +} +{ +fma.rn.f16x2 r2011, r2003, r2009, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2015, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2017, {high, high}; +} +{ +mul.f16x2 r2019, r1892, r2017; +} +{ +neg.f16x2 r2022, r2019; +} +{ +fma.rn.f16x2 r2024, r1889, r2015, r2022; +} +{ +mul.f16x2 r2028, r1889, r2017; +} +{ +fma.rn.f16x2 r2031, r1892, r2015, r2028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2035, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2037, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2039, {low, high}; +} +{ +mul.f16x2 r2040, r2037, r2039; +} +{ +mul.f16x2 r2043, r2011, r2035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2011; +mov.b32 r2046, 
{high, low}; +} +{ +fma.rn.f16x2 r2048, r2040, r2046, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2052, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2054, {high, high}; +} +{ +mul.f16x2 r2056, r1850, r2054; +} +{ +neg.f16x2 r2059, r2056; +} +{ +fma.rn.f16x2 r2061, r1847, r2052, r2059; +} +{ +mul.f16x2 r2065, r1847, r2054; +} +{ +fma.rn.f16x2 r2068, r1850, r2052, r2065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2072, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2074, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2076, {low, high}; +} +{ +mul.f16x2 r2077, r2074, r2076; +} +{ +mul.f16x2 r2080, r2048, r2072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2048; +mov.b32 r2083, {high, low}; +} +{ +fma.rn.f16x2 r2085, r2077, r2083, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2089, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2091, {high, high}; +} +{ +mul.f16x2 r2093, r1862, r2091; +} +{ +neg.f16x2 r2096, r2093; +} +{ +fma.rn.f16x2 r2098, r1859, r2089, r2096; +} +{ +mul.f16x2 r2102, r1859, r2091; +} +{ +fma.rn.f16x2 r2105, r1862, r2089, r2102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2109, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2111, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2113, {low, high}; +} +{ +mul.f16x2 r2114, r2111, r2113; +} +{ +mul.f16x2 r2117, r2085, r2109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2085; +mov.b32 r2120, {high, low}; +} +{ +fma.rn.f16x2 r2122, r2114, r2120, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2128, {high, high}; +} +{ +mul.f16x2 r2130, 
r1874, r2128; +} +{ +neg.f16x2 r2133, r2130; +} +{ +fma.rn.f16x2 r2135, r1871, r2126, r2133; +} +{ +mul.f16x2 r2139, r1871, r2128; +} +{ +fma.rn.f16x2 r2142, r1874, r2126, r2139; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2146, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2148, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2150, {low, high}; +} +{ +mul.f16x2 r2151, r2148, r2150; +} +{ +mul.f16x2 r2154, r2122, r2146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2122; +mov.b32 r2157, {high, low}; +} +{ +fma.rn.f16x2 r2159, r2151, r2157, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2163, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2165, {high, high}; +} +{ +mul.f16x2 r2167, r1886, r2165; +} +{ +neg.f16x2 r2170, r2167; +} +{ +fma.rn.f16x2 r2172, r1883, r2163, r2170; +} +{ +mul.f16x2 r2176, r1883, r2165; +} +{ +fma.rn.f16x2 r2179, r1886, r2163, r2176; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2183, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r2185, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2187, {low, high}; +} +{ +mul.f16x2 r2188, r2185, r2187; +} +{ +mul.f16x2 r2191, r2159, r2183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2159; +mov.b32 r2194, {high, low}; +} +{ +fma.rn.f16x2 r2196, r2188, r2194, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2196; +mov.b32 r2202, {high, high}; +} +{ +mul.f16x2 r2204, r1898, r2202; +} +{ +neg.f16x2 r2207, r2204; +} +{ +fma.rn.f16x2 r2209, r1895, r2200, r2207; +} +{ +mul.f16x2 r2213, r1895, r2202; +} +{ +fma.rn.f16x2 r2216, r1898, r2200, r2213; +} +barrier.sync 0; +mad.lo.s32 r3034, r3029, 400, r3033; +st.shared.u32 [r3034], 
r1841; +st.shared.u32 [r3034+40], r1913; +st.shared.u32 [r3034+80], r1950; +st.shared.u32 [r3034+120], r1987; +st.shared.u32 [r3034+160], r2024; +st.shared.u32 [r3034+200], r2061; +st.shared.u32 [r3034+240], r2098; +st.shared.u32 [r3034+280], r2135; +st.shared.u32 [r3034+320], r2172; +st.shared.u32 [r3034+360], r2209; +barrier.sync 0; +ld.shared.u32 r2249, [r3028]; +ld.shared.u32 r2569, [r3028+400]; +ld.shared.u32 r2246, [r3028+800]; +ld.shared.u32 r2566, [r3028+1200]; +ld.shared.u32 r2252, [r3028+1600]; +ld.shared.u32 r2572, [r3028+2000]; +ld.shared.u32 r2253, [r3028+2400]; +ld.shared.u32 r2573, [r3028+2800]; +ld.shared.u32 r2247, [r3028+3200]; +ld.shared.u32 r2567, [r3028+3600]; +barrier.sync 0; +st.shared.u32 [r3034], r1844; +st.shared.u32 [r3034+40], r1920; +st.shared.u32 [r3034+80], r1957; +st.shared.u32 [r3034+120], r1994; +st.shared.u32 [r3034+160], r2031; +st.shared.u32 [r3034+200], r2068; +st.shared.u32 [r3034+240], r2105; +st.shared.u32 [r3034+280], r2142; +st.shared.u32 [r3034+320], r2179; +st.shared.u32 [r3034+360], r2216; +barrier.sync 0; +ld.shared.u32 r2261, [r3028]; +ld.shared.u32 r2581, [r3028+400]; +ld.shared.u32 r2258, [r3028+800]; +ld.shared.u32 r2578, [r3028+1200]; +ld.shared.u32 r2264, [r3028+1600]; +ld.shared.u32 r2584, [r3028+2000]; +ld.shared.u32 r2265, [r3028+2400]; +ld.shared.u32 r2585, [r3028+2800]; +ld.shared.u32 r2259, [r3028+3200]; +ld.shared.u32 r2579, [r3028+3600]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +add.f16x2 r2245, r2246, r2247; +} +{ +add.f16x2 r2248, r2249, r2245; +} +{ +add.f16x2 r2251, r2252, r2253; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r2258, r2259; +} +{ +add.f16x2 r2260, r2261, r2257; +} +{ +add.f16x2 r2263, r2264, r2265; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, r2246, r2247; +} +{ +mul.f16x2 r2272, r2269, r2237; +} +{ +add.f16x2 r2275, r2249, r2272; +} +{ +add.f16x2 r2278, r2252, r2253; +} +{ +mul.f16x2 r2281, r2278, r2239; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +sub.f16x2 r2287, r2258, r2259; +} +{ +mul.f16x2 r2290, r2287, r2238; +} +{ +sub.f16x2 r2293, r2264, r2265; +} +{ +mul.f16x2 r2296, r2293, r2240; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r2284, r2299; +} +{ +add.f16x2 r2305, r2246, r2247; +} +{ +mul.f16x2 r2308, r2305, r2237; +} +{ +add.f16x2 r2311, r2249, r2308; +} +{ +add.f16x2 r2314, r2252, r2253; +} +{ +mul.f16x2 r2317, r2314, r2239; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r2258, r2259; +} +{ +mul.f16x2 r2326, r2323, r2238; +} +{ +sub.f16x2 r2329, r2264, r2265; +} +{ +mul.f16x2 r2332, r2329, r2240; +} +{ +add.f16x2 r2335, r2326, r2332; +} +{ +add.f16x2 r2338, r2320, r2335; +} +{ +add.f16x2 r2341, r2246, r2247; +} +{ +mul.f16x2 r2344, r2341, r2239; +} +{ +add.f16x2 r2347, r2249, r2344; +} +{ +add.f16x2 r2350, r2252, r2253; +} +{ +mul.f16x2 r2353, r2350, r2241; +} +{ +add.f16x2 r2356, r2347, r2353; +} +{ +sub.f16x2 r2359, r2258, r2259; +} +{ +mul.f16x2 r2362, r2359, r2240; +} +{ +sub.f16x2 r2365, r2264, r2265; +} +{ +mul.f16x2 r2368, r2365, r2243; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +sub.f16x2 r2374, r2356, r2371; +} +{ +add.f16x2 r2377, r2246, r2247; +} +{ +mul.f16x2 r2380, r2377, r2239; +} +{ +add.f16x2 r2383, r2249, r2380; +} +{ +add.f16x2 r2386, r2252, r2253; +} +{ +mul.f16x2 r2389, r2386, r2241; +} +{ +add.f16x2 
r2392, r2383, r2389; +} +{ +sub.f16x2 r2395, r2258, r2259; +} +{ +mul.f16x2 r2398, r2395, r2240; +} +{ +sub.f16x2 r2401, r2264, r2265; +} +{ +mul.f16x2 r2404, r2401, r2243; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +add.f16x2 r2410, r2392, r2407; +} +{ +add.f16x2 r2413, r2258, r2259; +} +{ +mul.f16x2 r2416, r2413, r2237; +} +{ +add.f16x2 r2419, r2261, r2416; +} +{ +add.f16x2 r2422, r2264, r2265; +} +{ +mul.f16x2 r2425, r2422, r2239; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r2246, r2247; +} +{ +mul.f16x2 r2434, r2431, r2238; +} +{ +sub.f16x2 r2437, r2252, r2253; +} +{ +mul.f16x2 r2440, r2437, r2240; +} +{ +add.f16x2 r2443, r2434, r2440; +} +{ +add.f16x2 r2446, r2428, r2443; +} +{ +add.f16x2 r2449, r2258, r2259; +} +{ +mul.f16x2 r2452, r2449, r2237; +} +{ +add.f16x2 r2455, r2261, r2452; +} +{ +add.f16x2 r2458, r2264, r2265; +} +{ +mul.f16x2 r2461, r2458, r2239; +} +{ +add.f16x2 r2464, r2455, r2461; +} +{ +sub.f16x2 r2467, r2246, r2247; +} +{ +mul.f16x2 r2470, r2467, r2238; +} +{ +sub.f16x2 r2473, r2252, r2253; +} +{ +mul.f16x2 r2476, r2473, r2240; +} +{ +add.f16x2 r2479, r2470, r2476; +} +{ +sub.f16x2 r2482, r2464, r2479; +} +{ +add.f16x2 r2485, r2258, r2259; +} +{ +mul.f16x2 r2488, r2485, r2239; +} +{ +add.f16x2 r2491, r2261, r2488; +} +{ +add.f16x2 r2494, r2264, r2265; +} +{ +mul.f16x2 r2497, r2494, r2241; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +sub.f16x2 r2503, r2246, r2247; +} +{ +mul.f16x2 r2506, r2503, r2240; +} +{ +sub.f16x2 r2509, r2252, r2253; +} +{ +mul.f16x2 r2512, r2509, r2243; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +add.f16x2 r2518, r2500, r2515; +} +{ +add.f16x2 r2521, r2258, r2259; +} +{ +mul.f16x2 r2524, r2521, r2239; +} +{ +add.f16x2 r2527, r2261, r2524; +} +{ +add.f16x2 r2530, r2264, r2265; +} +{ +mul.f16x2 r2533, r2530, r2241; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r2246, r2247; +} +{ +mul.f16x2 r2542, r2539, r2240; +} +{ +sub.f16x2 r2545, r2252, r2253; +} +{ +mul.f16x2 r2548, r2545, r2243; +} 
+{ +add.f16x2 r2551, r2542, r2548; +} +{ +sub.f16x2 r2554, r2536, r2551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2557, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2558, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2562, {low, high}; +} +{ +neg.f16x2 r2563, r2562; +} +{ +add.f16x2 r2565, r2566, r2567; +} +{ +add.f16x2 r2568, r2569, r2565; +} +{ +add.f16x2 r2571, r2572, r2573; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2566, r2567; +} +{ +mul.f16x2 r2592, r2589, r2557; +} +{ +add.f16x2 r2595, r2569, r2592; +} +{ +add.f16x2 r2598, r2572, r2573; +} +{ +mul.f16x2 r2601, r2598, r2559; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +sub.f16x2 r2607, r2578, r2579; +} +{ +mul.f16x2 r2610, r2607, r2558; +} +{ +sub.f16x2 r2613, r2584, r2585; +} +{ +mul.f16x2 r2616, r2613, r2560; +} +{ +add.f16x2 r2619, r2610, r2616; +} +{ +sub.f16x2 r2622, r2604, r2619; +} +{ +add.f16x2 r2625, r2566, r2567; +} +{ +mul.f16x2 r2628, r2625, r2557; +} +{ +add.f16x2 r2631, r2569, r2628; +} +{ +add.f16x2 r2634, r2572, r2573; +} +{ +mul.f16x2 r2637, r2634, r2559; +} +{ +add.f16x2 r2640, r2631, r2637; +} +{ +sub.f16x2 r2643, r2578, r2579; +} +{ +mul.f16x2 r2646, r2643, r2558; +} +{ +sub.f16x2 r2649, r2584, r2585; +} +{ +mul.f16x2 r2652, r2649, r2560; +} +{ +add.f16x2 r2655, r2646, r2652; +} +{ +add.f16x2 r2658, r2640, r2655; +} +{ 
+add.f16x2 r2661, r2566, r2567; +} +{ +mul.f16x2 r2664, r2661, r2559; +} +{ +add.f16x2 r2667, r2569, r2664; +} +{ +add.f16x2 r2670, r2572, r2573; +} +{ +mul.f16x2 r2673, r2670, r2561; +} +{ +add.f16x2 r2676, r2667, r2673; +} +{ +sub.f16x2 r2679, r2578, r2579; +} +{ +mul.f16x2 r2682, r2679, r2560; +} +{ +sub.f16x2 r2685, r2584, r2585; +} +{ +mul.f16x2 r2688, r2685, r2563; +} +{ +add.f16x2 r2691, r2682, r2688; +} +{ +sub.f16x2 r2694, r2676, r2691; +} +{ +add.f16x2 r2697, r2566, r2567; +} +{ +mul.f16x2 r2700, r2697, r2559; +} +{ +add.f16x2 r2703, r2569, r2700; +} +{ +add.f16x2 r2706, r2572, r2573; +} +{ +mul.f16x2 r2709, r2706, r2561; +} +{ +add.f16x2 r2712, r2703, r2709; +} +{ +sub.f16x2 r2715, r2578, r2579; +} +{ +mul.f16x2 r2718, r2715, r2560; +} +{ +sub.f16x2 r2721, r2584, r2585; +} +{ +mul.f16x2 r2724, r2721, r2563; +} +{ +add.f16x2 r2727, r2718, r2724; +} +{ +add.f16x2 r2730, r2712, r2727; +} +{ +add.f16x2 r2733, r2578, r2579; +} +{ +mul.f16x2 r2736, r2733, r2557; +} +{ +add.f16x2 r2739, r2581, r2736; +} +{ +add.f16x2 r2742, r2584, r2585; +} +{ +mul.f16x2 r2745, r2742, r2559; +} +{ +add.f16x2 r2748, r2739, r2745; +} +{ +sub.f16x2 r2751, r2566, r2567; +} +{ +mul.f16x2 r2754, r2751, r2558; +} +{ +sub.f16x2 r2757, r2572, r2573; +} +{ +mul.f16x2 r2760, r2757, r2560; +} +{ +add.f16x2 r2763, r2754, r2760; +} +{ +add.f16x2 r2766, r2748, r2763; +} +{ +add.f16x2 r2769, r2578, r2579; +} +{ +mul.f16x2 r2772, r2769, r2557; +} +{ +add.f16x2 r2775, r2581, r2772; +} +{ +add.f16x2 r2778, r2584, r2585; +} +{ +mul.f16x2 r2781, r2778, r2559; +} +{ +add.f16x2 r2784, r2775, r2781; +} +{ +sub.f16x2 r2787, r2566, r2567; +} +{ +mul.f16x2 r2790, r2787, r2558; +} +{ +sub.f16x2 r2793, r2572, r2573; +} +{ +mul.f16x2 r2796, r2793, r2560; +} +{ +add.f16x2 r2799, r2790, r2796; +} +{ +sub.f16x2 r2802, r2784, r2799; +} +{ +add.f16x2 r2805, r2578, r2579; +} +{ +mul.f16x2 r2808, r2805, r2559; +} +{ +add.f16x2 r2811, r2581, r2808; +} +{ +add.f16x2 r2814, r2584, r2585; +} +{ +mul.f16x2 r2817, 
r2814, r2561; +} +{ +add.f16x2 r2820, r2811, r2817; +} +{ +sub.f16x2 r2823, r2566, r2567; +} +{ +mul.f16x2 r2826, r2823, r2560; +} +{ +sub.f16x2 r2829, r2572, r2573; +} +{ +mul.f16x2 r2832, r2829, r2563; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2820, r2835; +} +{ +add.f16x2 r2841, r2578, r2579; +} +{ +mul.f16x2 r2844, r2841, r2559; +} +{ +add.f16x2 r2847, r2581, r2844; +} +{ +add.f16x2 r2850, r2584, r2585; +} +{ +mul.f16x2 r2853, r2850, r2561; +} +{ +add.f16x2 r2856, r2847, r2853; +} +{ +sub.f16x2 r2859, r2566, r2567; +} +{ +mul.f16x2 r2862, r2859, r2560; +} +{ +sub.f16x2 r2865, r2572, r2573; +} +{ +mul.f16x2 r2868, r2865, r2563; +} +{ +add.f16x2 r2871, r2862, r2868; +} +{ +sub.f16x2 r2874, r2856, r2871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r2877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2878, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2884, {low, high}; +} +{ +mul.f16x2 r2895, r2622, r2877; +} +{ +mul.f16x2 r2898, r2766, r2878; +} +{ +sub.f16x2 r2901, r2895, r2898; +} +{ +mul.f16x2 r2904, r2622, r2878; +} +{ +fma.rn.f16x2 r2907, r2766, r2877, r2904; +} +{ +mul.f16x2 r2911, r2694, r2879; +} +{ +mul.f16x2 r2914, r2838, r2880; +} +{ +sub.f16x2 r2917, r2911, r2914; +} +{ +mul.f16x2 r2920, r2694, r2880; +} +{ 
+fma.rn.f16x2 r2923, r2838, r2879, r2920; +} +{ +mul.f16x2 r2927, r2730, r2881; +} +{ +mul.f16x2 r2930, r2874, r2882; +} +{ +sub.f16x2 r2933, r2927, r2930; +} +{ +mul.f16x2 r2936, r2730, r2882; +} +{ +fma.rn.f16x2 r2939, r2874, r2881, r2936; +} +{ +mul.f16x2 r2943, r2658, r2883; +} +{ +mul.f16x2 r2946, r2802, r2884; +} +{ +sub.f16x2 r2949, r2943, r2946; +} +{ +mul.f16x2 r2952, r2658, r2884; +} +{ +fma.rn.f16x2 r2955, r2802, r2883, r2952; +} +{ +add.f16x2 %0, r2254, r2574; +} +{ +add.f16x2 %1, r2266, r2586; +} +{ +sub.f16x2 %10, r2254, r2574; +} +{ +sub.f16x2 %11, r2266, r2586; +} +{ +add.f16x2 %2, r2302, r2901; +} +{ +add.f16x2 %3, r2446, r2907; +} +{ +sub.f16x2 %12, r2302, r2901; +} +{ +sub.f16x2 %13, r2446, r2907; +} +{ +add.f16x2 %4, r2374, r2917; +} +{ +add.f16x2 %5, r2518, r2923; +} +{ +sub.f16x2 %14, r2374, r2917; +} +{ +sub.f16x2 %15, r2518, r2923; +} +{ +add.f16x2 %6, r2410, r2933; +} +{ +add.f16x2 %7, r2554, r2939; +} +{ +sub.f16x2 %16, r2410, r2933; +} +{ +sub.f16x2 %17, r2554, r2939; +} +{ +add.f16x2 %8, r2338, r2949; +} +{ +add.f16x2 %9, r2482, r2955; +} +{ +sub.f16x2 %18, r2338, r2949; +} +{ +sub.f16x2 %19, r2482, r2955; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..6b8943fc22da8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp16_inv.hpp.inc @@ -0,0 +1,7406 @@ +#ifndef CUFFTDX_FFT_1000_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_1000_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1141, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<231>; +.reg .b32 r<3050>; +.reg .b64 rd<7>; +mov.u32 r3031, %tid.y; +shl.b32 r3032, r3031, 1; +mov.u32 r3033, %20; +mad.lo.s32 r3034, r3032, 4000, r3033; +mov.u32 r3035, %tid.x; +mov.f32 f194, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1, {low, high}; +} +mov.f32 f188, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f202, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r5, {low, high}; +} +mov.f32 f184, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r6, {low, high}; +} +{ 
+neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; +} 
+{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; 
+cvt.rn.f16.f32 high, f202; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, %39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ 
+add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, %40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f190, 
0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r645, {low, high}; +} +mov.f32 f204, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r647, {low, high}; +} +mov.f32 f200, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r648, {low, high}; +} +mov.f32 f198, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r652, {low, high}; +} +mov.f32 f161, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 
r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r3035, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r3036, rd3; +mul.lo.s32 r3037, r3036, 100; +sub.s32 r3038, r3035, r3037; +shr.u64 rd4, rd2, 36; +cvt.u32.u64 r3039, rd4; +and.b32 r3040, r3039, 268435454; +mad.lo.s32 r3041, r3040, 4000, r3034; +cvt.rn.f32.u32 f225, r3038; +mul.f32 f226, f225, 0f3BCDE32E; +cos.approx.f32 f61, f226; +sin.approx.f32 f227, f226; +neg.f32 f62, f227; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f162, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 
r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r851, {low, high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, 
r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r3042, r3038, 80, r3041; +st.shared.v2.f32 [r3042], {r727, r730}; +st.shared.v2.f32 [r3042+8], {r797, r806}; +st.shared.v2.f32 [r3042+16], {r834, r843}; +st.shared.v2.f32 [r3042+24], {r871, r880}; +st.shared.v2.f32 [r3042+32], {r908, r917}; +st.shared.v2.f32 [r3042+40], {r945, r954}; +st.shared.v2.f32 [r3042+48], {r982, r991}; +st.shared.v2.f32 [r3042+56], {r1019, r1028}; +st.shared.v2.f32 [r3042+64], {r1056, r1065}; +st.shared.v2.f32 [r3042+72], {r1093, r1102}; +barrier.sync 0; +mad.lo.s32 r3043, r3038, -72, r3042; +ld.shared.u32 r1137, [r3043]; +ld.shared.u32 r1149, [r3043+4]; +ld.shared.u32 r1459, [r3043+800]; +ld.shared.u32 r1471, [r3043+804]; +ld.shared.u32 r1134, [r3043+1600]; +ld.shared.u32 r1146, [r3043+1604]; +ld.shared.u32 r1456, [r3043+2400]; +ld.shared.u32 r1468, [r3043+2404]; +ld.shared.u32 r1140, [r3043+3200]; +ld.shared.u32 r1152, [r3043+3204]; +ld.shared.u32 r1462, [r3043+4000]; +ld.shared.u32 r1474, [r3043+4004]; +ld.shared.u32 r1141, [r3043+4800]; +ld.shared.u32 r1153, [r3043+4804]; +ld.shared.u32 r1463, [r3043+5600]; +ld.shared.u32 r1475, [r3043+5604]; +ld.shared.u32 r1135, [r3043+6400]; +ld.shared.u32 r1147, [r3043+6404]; +ld.shared.u32 r1457, [r3043+7200]; +ld.shared.u32 r1469, [r3043+7204]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1131, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ +add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1129; +} +{ +sub.f16x2 r1253, r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ 
+sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 r1409, r1146, r1147; +} +{ +mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, 
r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, r1515, r1445; +} +{ +add.f16x2 r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, 
r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ +sub.f16x2 r1677, r1456, r1457; +} +{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 
r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} +{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ 
+fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ +fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 r1849, r1142, r1464; +} +{ +add.f16x2 r1852, r1154, r1476; +} +{ +sub.f16x2 r1855, r1142, r1464; +} +{ +sub.f16x2 r1858, r1154, r1476; +} +{ +add.f16x2 r1861, r1190, r1791; +} +{ +add.f16x2 r1864, r1334, r1797; +} +{ +sub.f16x2 r1867, r1190, r1791; +} +{ +sub.f16x2 r1870, r1334, r1797; +} +{ +add.f16x2 r1873, r1262, r1807; +} +{ +add.f16x2 r1876, r1406, r1813; +} +{ +sub.f16x2 r1879, r1262, r1807; +} +{ +sub.f16x2 r1882, r1406, r1813; +} +{ +add.f16x2 r1885, r1298, r1823; +} +{ +add.f16x2 r1888, r1442, r1829; +} +{ +sub.f16x2 r1891, r1298, r1823; +} +{ +sub.f16x2 r1894, r1442, r1829; +} +{ +add.f16x2 r1897, r1226, r1839; +} +{ +add.f16x2 r1900, r1370, r1845; +} +{ +sub.f16x2 r1903, r1226, r1839; +} +{ +sub.f16x2 r1906, r1370, r1845; +} +mul.wide.u32 rd5, r3038, -858993459; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r3044, rd6; +cvt.rn.f32.u32 f228, r3044; +mul.f32 f229, f228, 0f3D80ADFD; +cos.approx.f32 f143, f229; +sin.approx.f32 f230, f229; +neg.f32 f144, f230; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; +cvt.rn.f16.f32 high, f144; +mov.b32 r1909, {low, high}; +} +mul.lo.s32 r3045, r3044, 10; +sub.s32 r3046, r3038, r3045; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1912, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1914, {high, high}; +} +{ +mul.f16x2 r1916, r1864, r1914; +} +{ +fma.rn.f16x2 r1919, r1861, r1912, r1916; +} +{ +mul.f16x2 r1923, r1861, r1914; +} +{ +neg.f16x2 r1926, r1923; +} +{ +fma.rn.f16x2 r1928, r1864, r1912, r1926; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1909; +mov.b32 r1932, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1934, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1936, {low, high}; +} +{ +mul.f16x2 r1937, r1934, r1936; +} +{ +mul.f16x2 r1940, r1909, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1943, {high, low}; +} +{ +fma.rn.f16x2 r1945, r1937, r1943, r1940; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1949, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1951, {high, high}; +} +{ +mul.f16x2 r1953, r1876, r1951; +} +{ +fma.rn.f16x2 r1956, r1873, r1949, r1953; +} +{ +mul.f16x2 r1960, r1873, r1951; +} +{ +neg.f16x2 r1963, r1960; +} +{ +fma.rn.f16x2 r1965, r1876, r1949, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1969, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1971, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1973, {low, high}; +} +{ +mul.f16x2 r1974, r1971, r1973; +} +{ +mul.f16x2 r1977, r1945, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1980, {high, low}; +} +{ +fma.rn.f16x2 r1982, r1974, r1980, r1977; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1986, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1988, {high, high}; +} +{ +mul.f16x2 r1990, r1888, r1988; +} +{ +fma.rn.f16x2 r1993, r1885, r1986, r1990; +} +{ +mul.f16x2 r1997, r1885, r1988; +} +{ +neg.f16x2 r2000, r1997; +} +{ +fma.rn.f16x2 r2002, r1888, r1986, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2006, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2008, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2010, {low, high}; 
+} +{ +mul.f16x2 r2011, r2008, r2010; +} +{ +mul.f16x2 r2014, r1982, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r2017, {high, low}; +} +{ +fma.rn.f16x2 r2019, r2011, r2017, r2014; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2023, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2025, {high, high}; +} +{ +mul.f16x2 r2027, r1900, r2025; +} +{ +fma.rn.f16x2 r2030, r1897, r2023, r2027; +} +{ +mul.f16x2 r2034, r1897, r2025; +} +{ +neg.f16x2 r2037, r2034; +} +{ +fma.rn.f16x2 r2039, r1900, r2023, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2043, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2045, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r2045, r2047; +} +{ +mul.f16x2 r2051, r2019, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2054, {high, low}; +} +{ +fma.rn.f16x2 r2056, r2048, r2054, r2051; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2060, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2062, {high, high}; +} +{ +mul.f16x2 r2064, r1858, r2062; +} +{ +fma.rn.f16x2 r2067, r1855, r2060, r2064; +} +{ +mul.f16x2 r2071, r1855, r2062; +} +{ +neg.f16x2 r2074, r2071; +} +{ +fma.rn.f16x2 r2076, r1858, r2060, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2080, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2082, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r2082, r2084; +} +{ +mul.f16x2 r2088, r2056, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2091, {high, low}; +} +{ +fma.rn.f16x2 r2093, r2085, r2091, r2088; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2093; +mov.b32 r2097, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2099, {high, high}; +} +{ +mul.f16x2 r2101, r1870, r2099; +} +{ +fma.rn.f16x2 r2104, r1867, r2097, r2101; +} +{ +mul.f16x2 r2108, r1867, r2099; +} +{ +neg.f16x2 r2111, r2108; +} +{ +fma.rn.f16x2 r2113, r1870, r2097, r2111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2117, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2119, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2121, {low, high}; +} +{ +mul.f16x2 r2122, r2119, r2121; +} +{ +mul.f16x2 r2125, r2093, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2128, {high, low}; +} +{ +fma.rn.f16x2 r2130, r2122, r2128, r2125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2134, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2136, {high, high}; +} +{ +mul.f16x2 r2138, r1882, r2136; +} +{ +fma.rn.f16x2 r2141, r1879, r2134, r2138; +} +{ +mul.f16x2 r2145, r1879, r2136; +} +{ +neg.f16x2 r2148, r2145; +} +{ +fma.rn.f16x2 r2150, r1882, r2134, r2148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2154, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2156, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2158, {low, high}; +} +{ +mul.f16x2 r2159, r2156, r2158; +} +{ +mul.f16x2 r2162, r2130, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2165, {high, low}; +} +{ +fma.rn.f16x2 r2167, r2159, r2165, r2162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2171, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2173, {high, high}; +} +{ +mul.f16x2 r2175, r1894, r2173; +} +{ +fma.rn.f16x2 r2178, r1891, r2171, r2175; +} +{ +mul.f16x2 r2182, r1891, r2173; +} +{ 
+neg.f16x2 r2185, r2182; +} +{ +fma.rn.f16x2 r2187, r1894, r2171, r2185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2191, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2193, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2195, {low, high}; +} +{ +mul.f16x2 r2196, r2193, r2195; +} +{ +mul.f16x2 r2199, r2167, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2202, {high, low}; +} +{ +fma.rn.f16x2 r2204, r2196, r2202, r2199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2208, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2210, {high, high}; +} +{ +mul.f16x2 r2212, r1906, r2210; +} +{ +fma.rn.f16x2 r2215, r1903, r2208, r2212; +} +{ +mul.f16x2 r2219, r1903, r2210; +} +{ +neg.f16x2 r2222, r2219; +} +{ +fma.rn.f16x2 r2224, r1906, r2208, r2222; +} +shl.b32 r3047, r3046, 3; +add.s32 r3048, r3041, r3047; +barrier.sync 0; +mad.lo.s32 r3049, r3044, 800, r3048; +st.shared.u32 [r3049], r1849; +st.shared.u32 [r3049+4], r1852; +st.shared.u32 [r3049+80], r1919; +st.shared.u32 [r3049+84], r1928; +st.shared.u32 [r3049+160], r1956; +st.shared.u32 [r3049+164], r1965; +st.shared.u32 [r3049+240], r1993; +st.shared.u32 [r3049+244], r2002; +st.shared.u32 [r3049+320], r2030; +st.shared.u32 [r3049+324], r2039; +st.shared.u32 [r3049+400], r2067; +st.shared.u32 [r3049+404], r2076; +st.shared.u32 [r3049+480], r2104; +st.shared.u32 [r3049+484], r2113; +st.shared.u32 [r3049+560], r2141; +st.shared.u32 [r3049+564], r2150; +st.shared.u32 [r3049+640], r2178; +st.shared.u32 [r3049+644], r2187; +st.shared.u32 [r3049+720], r2215; +st.shared.u32 [r3049+724], r2224; +barrier.sync 0; +ld.shared.u32 r2259, [r3043]; +ld.shared.u32 r2271, [r3043+4]; +ld.shared.u32 r2581, [r3043+800]; +ld.shared.u32 r2593, [r3043+804]; +ld.shared.u32 r2256, [r3043+1600]; +ld.shared.u32 r2268, [r3043+1604]; +ld.shared.u32 r2578, 
[r3043+2400]; +ld.shared.u32 r2590, [r3043+2404]; +ld.shared.u32 r2262, [r3043+3200]; +ld.shared.u32 r2274, [r3043+3204]; +ld.shared.u32 r2584, [r3043+4000]; +ld.shared.u32 r2596, [r3043+4004]; +ld.shared.u32 r2263, [r3043+4800]; +ld.shared.u32 r2275, [r3043+4804]; +ld.shared.u32 r2585, [r3043+5600]; +ld.shared.u32 r2597, [r3043+5604]; +ld.shared.u32 r2257, [r3043+6400]; +ld.shared.u32 r2269, [r3043+6404]; +ld.shared.u32 r2579, [r3043+7200]; +ld.shared.u32 r2591, [r3043+7204]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r2250, {low, high}; +} +{ +neg.f16x2 r2251, r2250; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2254, {low, high}; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2262, r2263; +} +{ +add.f16x2 r2264, r2258, r2261; +} +{ +add.f16x2 r2267, r2268, r2269; +} +{ +add.f16x2 r2270, r2271, r2267; +} +{ +add.f16x2 r2273, r2274, r2275; +} +{ +add.f16x2 r2276, r2270, r2273; +} +{ +add.f16x2 r2279, r2256, r2257; +} +{ +mul.f16x2 r2282, r2279, r2245; +} +{ +add.f16x2 r2285, r2259, r2282; +} +{ +add.f16x2 r2288, r2262, r2263; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r2268, r2269; +} +{ +mul.f16x2 r2300, r2297, r2247; +} +{ +sub.f16x2 r2303, r2274, r2275; +} +{ +mul.f16x2 r2306, r2303, r2251; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +{ +add.f16x2 r2315, 
r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +add.f16x2 r2324, r2262, r2263; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r2268, r2269; +} +{ +mul.f16x2 r2336, r2333, r2247; +} +{ +sub.f16x2 r2339, r2274, r2275; +} +{ +mul.f16x2 r2342, r2339, r2251; +} +{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +{ +add.f16x2 r2351, r2256, r2257; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r2259, r2354; +} +{ +add.f16x2 r2360, r2262, r2263; +} +{ +mul.f16x2 r2363, r2360, r2253; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r2268, r2269; +} +{ +mul.f16x2 r2372, r2369, r2251; +} +{ +sub.f16x2 r2375, r2274, r2275; +} +{ +mul.f16x2 r2378, r2375, r2254; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +{ +add.f16x2 r2387, r2256, r2257; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r2259, r2390; +} +{ +add.f16x2 r2396, r2262, r2263; +} +{ +mul.f16x2 r2399, r2396, r2253; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r2268, r2269; +} +{ +mul.f16x2 r2408, r2405, r2251; +} +{ +sub.f16x2 r2411, r2274, r2275; +} +{ +mul.f16x2 r2414, r2411, r2254; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +{ +add.f16x2 r2423, r2268, r2269; +} +{ +mul.f16x2 r2426, r2423, r2245; +} +{ +add.f16x2 r2429, r2271, r2426; +} +{ +add.f16x2 r2432, r2274, r2275; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 r2441, r2256, r2257; +} +{ +mul.f16x2 r2444, r2441, r2247; +} +{ +sub.f16x2 r2447, r2262, r2263; +} +{ +mul.f16x2 r2450, r2447, r2251; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +{ +add.f16x2 r2459, r2268, r2269; +} +{ +mul.f16x2 r2462, r2459, r2245; +} +{ +add.f16x2 r2465, r2271, r2462; +} +{ +add.f16x2 r2468, r2274, r2275; +} +{ +mul.f16x2 r2471, r2468, r2249; +} +{ 
+add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r2256, r2257; +} +{ +mul.f16x2 r2480, r2477, r2247; +} +{ +sub.f16x2 r2483, r2262, r2263; +} +{ +mul.f16x2 r2486, r2483, r2251; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +{ +add.f16x2 r2495, r2268, r2269; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, r2271, r2498; +} +{ +add.f16x2 r2504, r2274, r2275; +} +{ +mul.f16x2 r2507, r2504, r2253; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r2256, r2257; +} +{ +mul.f16x2 r2516, r2513, r2251; +} +{ +sub.f16x2 r2519, r2262, r2263; +} +{ +mul.f16x2 r2522, r2519, r2254; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +{ +add.f16x2 r2531, r2268, r2269; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r2271, r2534; +} +{ +add.f16x2 r2540, r2274, r2275; +} +{ +mul.f16x2 r2543, r2540, r2253; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r2256, r2257; +} +{ +mul.f16x2 r2552, r2549, r2251; +} +{ +sub.f16x2 r2555, r2262, r2263; +} +{ +mul.f16x2 r2558, r2555, r2254; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2568, {low, high}; +} +{ +neg.f16x2 r2569, r2568; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2576, {low, high}; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 
r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2590, r2591; +} +{ +add.f16x2 r2592, r2593, r2589; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2592, r2595; +} +{ +add.f16x2 r2601, r2578, r2579; +} +{ +mul.f16x2 r2604, r2601, r2567; +} +{ +add.f16x2 r2607, r2581, r2604; +} +{ +add.f16x2 r2610, r2584, r2585; +} +{ +mul.f16x2 r2613, r2610, r2571; +} +{ +add.f16x2 r2616, r2607, r2613; +} +{ +sub.f16x2 r2619, r2590, r2591; +} +{ +mul.f16x2 r2622, r2619, r2569; +} +{ +sub.f16x2 r2625, r2596, r2597; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r2622, r2628; +} +{ +sub.f16x2 r2634, r2616, r2631; +} +{ +add.f16x2 r2637, r2578, r2579; +} +{ +mul.f16x2 r2640, r2637, r2567; +} +{ +add.f16x2 r2643, r2581, r2640; +} +{ +add.f16x2 r2646, r2584, r2585; +} +{ +mul.f16x2 r2649, r2646, r2571; +} +{ +add.f16x2 r2652, r2643, r2649; +} +{ +sub.f16x2 r2655, r2590, r2591; +} +{ +mul.f16x2 r2658, r2655, r2569; +} +{ +sub.f16x2 r2661, r2596, r2597; +} +{ +mul.f16x2 r2664, r2661, r2573; +} +{ +add.f16x2 r2667, r2658, r2664; +} +{ +add.f16x2 r2670, r2652, r2667; +} +{ +add.f16x2 r2673, r2578, r2579; +} +{ +mul.f16x2 r2676, r2673, r2571; +} +{ +add.f16x2 r2679, r2581, r2676; +} +{ +add.f16x2 r2682, r2584, r2585; +} +{ +mul.f16x2 r2685, r2682, r2575; +} +{ +add.f16x2 r2688, r2679, r2685; +} +{ +sub.f16x2 r2691, r2590, r2591; +} +{ +mul.f16x2 r2694, r2691, r2573; +} +{ +sub.f16x2 r2697, r2596, r2597; +} +{ +mul.f16x2 r2700, r2697, r2576; +} +{ +add.f16x2 r2703, r2694, r2700; +} +{ +sub.f16x2 r2706, r2688, r2703; +} +{ +add.f16x2 r2709, r2578, r2579; +} +{ +mul.f16x2 r2712, r2709, r2571; +} +{ +add.f16x2 r2715, r2581, r2712; +} +{ +add.f16x2 r2718, r2584, r2585; +} +{ +mul.f16x2 r2721, r2718, r2575; +} +{ +add.f16x2 r2724, r2715, r2721; +} +{ +sub.f16x2 r2727, r2590, r2591; +} +{ +mul.f16x2 r2730, r2727, r2573; +} +{ +sub.f16x2 r2733, r2596, r2597; +} +{ +mul.f16x2 r2736, r2733, r2576; +} +{ +add.f16x2 r2739, r2730, r2736; +} 
+{ +add.f16x2 r2742, r2724, r2739; +} +{ +add.f16x2 r2745, r2590, r2591; +} +{ +mul.f16x2 r2748, r2745, r2567; +} +{ +add.f16x2 r2751, r2593, r2748; +} +{ +add.f16x2 r2754, r2596, r2597; +} +{ +mul.f16x2 r2757, r2754, r2571; +} +{ +add.f16x2 r2760, r2751, r2757; +} +{ +sub.f16x2 r2763, r2578, r2579; +} +{ +mul.f16x2 r2766, r2763, r2569; +} +{ +sub.f16x2 r2769, r2584, r2585; +} +{ +mul.f16x2 r2772, r2769, r2573; +} +{ +add.f16x2 r2775, r2766, r2772; +} +{ +add.f16x2 r2778, r2760, r2775; +} +{ +add.f16x2 r2781, r2590, r2591; +} +{ +mul.f16x2 r2784, r2781, r2567; +} +{ +add.f16x2 r2787, r2593, r2784; +} +{ +add.f16x2 r2790, r2596, r2597; +} +{ +mul.f16x2 r2793, r2790, r2571; +} +{ +add.f16x2 r2796, r2787, r2793; +} +{ +sub.f16x2 r2799, r2578, r2579; +} +{ +mul.f16x2 r2802, r2799, r2569; +} +{ +sub.f16x2 r2805, r2584, r2585; +} +{ +mul.f16x2 r2808, r2805, r2573; +} +{ +add.f16x2 r2811, r2802, r2808; +} +{ +sub.f16x2 r2814, r2796, r2811; +} +{ +add.f16x2 r2817, r2590, r2591; +} +{ +mul.f16x2 r2820, r2817, r2571; +} +{ +add.f16x2 r2823, r2593, r2820; +} +{ +add.f16x2 r2826, r2596, r2597; +} +{ +mul.f16x2 r2829, r2826, r2575; +} +{ +add.f16x2 r2832, r2823, r2829; +} +{ +sub.f16x2 r2835, r2578, r2579; +} +{ +mul.f16x2 r2838, r2835, r2573; +} +{ +sub.f16x2 r2841, r2584, r2585; +} +{ +mul.f16x2 r2844, r2841, r2576; +} +{ +add.f16x2 r2847, r2838, r2844; +} +{ +add.f16x2 r2850, r2832, r2847; +} +{ +add.f16x2 r2853, r2590, r2591; +} +{ +mul.f16x2 r2856, r2853, r2571; +} +{ +add.f16x2 r2859, r2593, r2856; +} +{ +add.f16x2 r2862, r2596, r2597; +} +{ +mul.f16x2 r2865, r2862, r2575; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 r2871, r2578, r2579; +} +{ +mul.f16x2 r2874, r2871, r2573; +} +{ +sub.f16x2 r2877, r2584, r2585; +} +{ +mul.f16x2 r2880, r2877, r2576; +} +{ +add.f16x2 r2883, r2874, r2880; +} +{ +sub.f16x2 r2886, r2868, r2883; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2892, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r2893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2894, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2896, {low, high}; +} +{ +mul.f16x2 r2907, r2634, r2889; +} +{ +mul.f16x2 r2910, r2778, r2890; +} +{ +sub.f16x2 r2913, r2907, r2910; +} +{ +mul.f16x2 r2916, r2634, r2890; +} +{ +fma.rn.f16x2 r2919, r2778, r2889, r2916; +} +{ +mul.f16x2 r2923, r2706, r2891; +} +{ +mul.f16x2 r2926, r2850, r2892; +} +{ +sub.f16x2 r2929, r2923, r2926; +} +{ +mul.f16x2 r2932, r2706, r2892; +} +{ +fma.rn.f16x2 r2935, r2850, r2891, r2932; +} +{ +mul.f16x2 r2939, r2742, r2893; +} +{ +mul.f16x2 r2942, r2886, r2894; +} +{ +sub.f16x2 r2945, r2939, r2942; +} +{ +mul.f16x2 r2948, r2742, r2894; +} +{ +fma.rn.f16x2 r2951, r2886, r2893, r2948; +} +{ +mul.f16x2 r2955, r2670, r2895; +} +{ +mul.f16x2 r2958, r2814, r2896; +} +{ +sub.f16x2 r2961, r2955, r2958; +} +{ +mul.f16x2 r2964, r2670, r2896; +} +{ +fma.rn.f16x2 r2967, r2814, r2895, r2964; +} +{ +add.f16x2 %0, r2264, r2586; +} +{ +add.f16x2 %1, r2276, r2598; +} +{ +sub.f16x2 %10, r2264, r2586; +} +{ +sub.f16x2 %11, r2276, r2598; +} +{ +add.f16x2 %2, r2312, r2913; +} +{ +add.f16x2 %3, r2456, r2919; +} +{ +sub.f16x2 %12, r2312, r2913; +} +{ +sub.f16x2 %13, r2456, r2919; +} +{ +add.f16x2 %4, r2384, r2929; +} +{ +add.f16x2 %5, r2528, r2935; +} +{ +sub.f16x2 %14, r2384, r2929; +} +{ +sub.f16x2 %15, r2528, r2935; +} +{ +add.f16x2 
%6, r2420, r2945; +} +{ +add.f16x2 %7, r2564, r2951; +} +{ +sub.f16x2 %16, r2420, r2945; +} +{ +sub.f16x2 %17, r2564, r2951; +} +{ +add.f16x2 %8, r2348, r2961; +} +{ +add.f16x2 %9, r2492, r2967; +} +{ +sub.f16x2 %18, r2348, r2961; +} +{ +sub.f16x2 %19, r2492, r2967; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1142, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<231>; +.reg .b32 r<3047>; +.reg .b64 rd<6>; +mov.u32 r3031, %tid.y; +mov.u32 r3032, %20; +mad.lo.s32 r3033, r3031, 4000, r3032; +mov.u32 r3034, %tid.x; +mov.f32 f194, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, 
f194; +mov.b32 r1, {low, high}; +} +mov.f32 f188, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f202, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r5, {low, high}; +} +mov.f32 f184, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} 
+{ +sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 
r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, 
%39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, %40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ 
+add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f190, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r645, {low, high}; +} +mov.f32 f204, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r647, {low, high}; +} +mov.f32 f200, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r648, {low, high}; +} +mov.f32 f198, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r652, {low, high}; +} +mov.f32 f161, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, 
r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r3034, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r3035, rd3; +mul.lo.s32 r3036, r3035, 100; +sub.s32 r3037, r3034, r3036; +mad.lo.s32 r3038, r3035, 4000, r3033; +cvt.rn.f32.u32 f225, r3037; +mul.f32 f226, f225, 0f3BCDE32E; +cos.approx.f32 f61, f226; +sin.approx.f32 f227, f226; +neg.f32 f62, f227; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f162, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r814, {low, high}; +} +{ 
+mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r851, {low, high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; 
+mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r3039, r3037, 40, r3038; +st.shared.v2.f32 [r3039], {r727, r797}; +st.shared.v2.f32 [r3039+8], {r834, r871}; +st.shared.v2.f32 [r3039+16], {r908, r945}; +st.shared.v2.f32 [r3039+24], {r982, r1019}; +st.shared.v2.f32 [r3039+32], {r1056, r1093}; +barrier.sync 0; +mad.lo.s32 r3040, r3037, -36, r3039; +ld.shared.u32 r1137, [r3040]; +ld.shared.u32 r1459, [r3040+400]; +ld.shared.u32 r1134, [r3040+800]; +ld.shared.u32 r1456, [r3040+1200]; +ld.shared.u32 r1140, [r3040+1600]; +ld.shared.u32 r1462, [r3040+2000]; +ld.shared.u32 r1141, [r3040+2400]; +ld.shared.u32 r1463, [r3040+2800]; +ld.shared.u32 r1135, [r3040+3200]; +ld.shared.u32 r1457, [r3040+3600]; +barrier.sync 0; +st.shared.v2.f32 [r3039], {r730, r806}; +st.shared.v2.f32 [r3039+8], {r843, r880}; +st.shared.v2.f32 [r3039+16], {r917, r954}; +st.shared.v2.f32 [r3039+24], {r991, r1028}; +st.shared.v2.f32 [r3039+32], {r1065, r1102}; +barrier.sync 0; +ld.shared.u32 r1149, [r3040]; +ld.shared.u32 r1471, [r3040+400]; +ld.shared.u32 r1146, [r3040+800]; +ld.shared.u32 r1468, [r3040+1200]; +ld.shared.u32 r1152, [r3040+1600]; +ld.shared.u32 r1474, [r3040+2000]; +ld.shared.u32 r1153, [r3040+2400]; +ld.shared.u32 r1475, [r3040+2800]; +ld.shared.u32 r1147, [r3040+3200]; +ld.shared.u32 r1469, [r3040+3600]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, 
r1124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ +add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, 
r1129; +} +{ +sub.f16x2 r1253, r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 
r1409, r1146, r1147; +} +{ +mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, 
r1515, r1445; +} +{ +add.f16x2 r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ 
+sub.f16x2 r1677, r1456, r1457; +} +{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} +{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, 
f204; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ +fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ +fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 r1849, r1142, r1464; +} +{ +add.f16x2 r1852, r1154, r1476; +} +{ +sub.f16x2 r1855, r1142, r1464; +} +{ +sub.f16x2 r1858, r1154, r1476; +} +{ +add.f16x2 r1861, r1190, r1791; +} +{ +add.f16x2 r1864, r1334, r1797; +} +{ +sub.f16x2 r1867, r1190, r1791; +} +{ +sub.f16x2 r1870, r1334, r1797; +} +{ +add.f16x2 r1873, r1262, r1807; +} +{ +add.f16x2 r1876, r1406, r1813; +} +{ +sub.f16x2 r1879, r1262, r1807; +} +{ +sub.f16x2 r1882, r1406, r1813; +} +{ +add.f16x2 r1885, r1298, r1823; +} +{ +add.f16x2 r1888, r1442, r1829; +} +{ +sub.f16x2 r1891, r1298, r1823; +} +{ +sub.f16x2 r1894, r1442, r1829; +} +{ +add.f16x2 r1897, r1226, r1839; +} +{ +add.f16x2 r1900, r1370, r1845; +} +{ +sub.f16x2 r1903, r1226, r1839; +} +{ +sub.f16x2 r1906, r1370, r1845; +} +mul.wide.u32 rd4, r3037, -858993459; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r3041, rd5; +mul.lo.s32 r3042, r3041, 10; +sub.s32 r3043, r3037, r3042; +shl.b32 r3044, r3043, 2; +add.s32 r3045, r3038, r3044; +cvt.rn.f32.u32 f228, r3041; +mul.f32 f229, f228, 0f3D80ADFD; +cos.approx.f32 f143, f229; +sin.approx.f32 f230, f229; +neg.f32 f144, f230; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f143; 
+cvt.rn.f16.f32 high, f144; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1912, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1914, {high, high}; +} +{ +mul.f16x2 r1916, r1864, r1914; +} +{ +fma.rn.f16x2 r1919, r1861, r1912, r1916; +} +{ +mul.f16x2 r1923, r1861, r1914; +} +{ +neg.f16x2 r1926, r1923; +} +{ +fma.rn.f16x2 r1928, r1864, r1912, r1926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1932, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1934, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1936, {low, high}; +} +{ +mul.f16x2 r1937, r1934, r1936; +} +{ +mul.f16x2 r1940, r1909, r1932; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1943, {high, low}; +} +{ +fma.rn.f16x2 r1945, r1937, r1943, r1940; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1949, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1951, {high, high}; +} +{ +mul.f16x2 r1953, r1876, r1951; +} +{ +fma.rn.f16x2 r1956, r1873, r1949, r1953; +} +{ +mul.f16x2 r1960, r1873, r1951; +} +{ +neg.f16x2 r1963, r1960; +} +{ +fma.rn.f16x2 r1965, r1876, r1949, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1969, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r1971, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r1973, {low, high}; +} +{ +mul.f16x2 r1974, r1971, r1973; +} +{ +mul.f16x2 r1977, r1945, r1969; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1945; +mov.b32 r1980, {high, low}; +} +{ +fma.rn.f16x2 r1982, r1974, r1980, r1977; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1986, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r1988, {high, high}; +} +{ +mul.f16x2 r1990, r1888, 
r1988; +} +{ +fma.rn.f16x2 r1993, r1885, r1986, r1990; +} +{ +mul.f16x2 r1997, r1885, r1988; +} +{ +neg.f16x2 r2000, r1997; +} +{ +fma.rn.f16x2 r2002, r1888, r1986, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2006, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2008, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2010, {low, high}; +} +{ +mul.f16x2 r2011, r2008, r2010; +} +{ +mul.f16x2 r2014, r1982, r2006; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1982; +mov.b32 r2017, {high, low}; +} +{ +fma.rn.f16x2 r2019, r2011, r2017, r2014; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2023, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2025, {high, high}; +} +{ +mul.f16x2 r2027, r1900, r2025; +} +{ +fma.rn.f16x2 r2030, r1897, r2023, r2027; +} +{ +mul.f16x2 r2034, r1897, r2025; +} +{ +neg.f16x2 r2037, r2034; +} +{ +fma.rn.f16x2 r2039, r1900, r2023, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2043, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2045, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r2045, r2047; +} +{ +mul.f16x2 r2051, r2019, r2043; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2019; +mov.b32 r2054, {high, low}; +} +{ +fma.rn.f16x2 r2056, r2048, r2054, r2051; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2060, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2062, {high, high}; +} +{ +mul.f16x2 r2064, r1858, r2062; +} +{ +fma.rn.f16x2 r2067, r1855, r2060, r2064; +} +{ +mul.f16x2 r2071, r1855, r2062; +} +{ +neg.f16x2 r2074, r2071; +} +{ +fma.rn.f16x2 r2076, r1858, r2060, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2080, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2082, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r2082, r2084; +} +{ +mul.f16x2 r2088, r2056, r2080; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2056; +mov.b32 r2091, {high, low}; +} +{ +fma.rn.f16x2 r2093, r2085, r2091, r2088; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2097, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2099, {high, high}; +} +{ +mul.f16x2 r2101, r1870, r2099; +} +{ +fma.rn.f16x2 r2104, r1867, r2097, r2101; +} +{ +mul.f16x2 r2108, r1867, r2099; +} +{ +neg.f16x2 r2111, r2108; +} +{ +fma.rn.f16x2 r2113, r1870, r2097, r2111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2117, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2119, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2121, {low, high}; +} +{ +mul.f16x2 r2122, r2119, r2121; +} +{ +mul.f16x2 r2125, r2093, r2117; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2093; +mov.b32 r2128, {high, low}; +} +{ +fma.rn.f16x2 r2130, r2122, r2128, r2125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2134, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2136, {high, high}; +} +{ +mul.f16x2 r2138, r1882, r2136; +} +{ +fma.rn.f16x2 r2141, r1879, r2134, r2138; +} +{ +mul.f16x2 r2145, r1879, r2136; +} +{ +neg.f16x2 r2148, r2145; +} +{ +fma.rn.f16x2 r2150, r1882, r2134, r2148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2154, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2156, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2158, {low, high}; +} +{ +mul.f16x2 r2159, r2156, r2158; +} +{ +mul.f16x2 r2162, 
r2130, r2154; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2130; +mov.b32 r2165, {high, low}; +} +{ +fma.rn.f16x2 r2167, r2159, r2165, r2162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2171, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2173, {high, high}; +} +{ +mul.f16x2 r2175, r1894, r2173; +} +{ +fma.rn.f16x2 r2178, r1891, r2171, r2175; +} +{ +mul.f16x2 r2182, r1891, r2173; +} +{ +neg.f16x2 r2185, r2182; +} +{ +fma.rn.f16x2 r2187, r1894, r2171, r2185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2191, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1909; +mov.b32 r2193, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f161; +cvt.rn.f16.f32 high, f162; +mov.b32 r2195, {low, high}; +} +{ +mul.f16x2 r2196, r2193, r2195; +} +{ +mul.f16x2 r2199, r2167, r2191; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2167; +mov.b32 r2202, {high, low}; +} +{ +fma.rn.f16x2 r2204, r2196, r2202, r2199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2208, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2204; +mov.b32 r2210, {high, high}; +} +{ +mul.f16x2 r2212, r1906, r2210; +} +{ +fma.rn.f16x2 r2215, r1903, r2208, r2212; +} +{ +mul.f16x2 r2219, r1903, r2210; +} +{ +neg.f16x2 r2222, r2219; +} +{ +fma.rn.f16x2 r2224, r1906, r2208, r2222; +} +barrier.sync 0; +mad.lo.s32 r3046, r3041, 400, r3045; +st.shared.u32 [r3046], r1849; +st.shared.u32 [r3046+40], r1919; +st.shared.u32 [r3046+80], r1956; +st.shared.u32 [r3046+120], r1993; +st.shared.u32 [r3046+160], r2030; +st.shared.u32 [r3046+200], r2067; +st.shared.u32 [r3046+240], r2104; +st.shared.u32 [r3046+280], r2141; +st.shared.u32 [r3046+320], r2178; +st.shared.u32 [r3046+360], r2215; +barrier.sync 0; +ld.shared.u32 r2259, [r3040]; +ld.shared.u32 r2581, [r3040+400]; +ld.shared.u32 r2256, [r3040+800]; +ld.shared.u32 r2578, [r3040+1200]; +ld.shared.u32 r2262, [r3040+1600]; +ld.shared.u32 
r2584, [r3040+2000]; +ld.shared.u32 r2263, [r3040+2400]; +ld.shared.u32 r2585, [r3040+2800]; +ld.shared.u32 r2257, [r3040+3200]; +ld.shared.u32 r2579, [r3040+3600]; +barrier.sync 0; +st.shared.u32 [r3046], r1852; +st.shared.u32 [r3046+40], r1928; +st.shared.u32 [r3046+80], r1965; +st.shared.u32 [r3046+120], r2002; +st.shared.u32 [r3046+160], r2039; +st.shared.u32 [r3046+200], r2076; +st.shared.u32 [r3046+240], r2113; +st.shared.u32 [r3046+280], r2150; +st.shared.u32 [r3046+320], r2187; +st.shared.u32 [r3046+360], r2224; +barrier.sync 0; +ld.shared.u32 r2271, [r3040]; +ld.shared.u32 r2593, [r3040+400]; +ld.shared.u32 r2268, [r3040+800]; +ld.shared.u32 r2590, [r3040+1200]; +ld.shared.u32 r2274, [r3040+1600]; +ld.shared.u32 r2596, [r3040+2000]; +ld.shared.u32 r2275, [r3040+2400]; +ld.shared.u32 r2597, [r3040+2800]; +ld.shared.u32 r2269, [r3040+3200]; +ld.shared.u32 r2591, [r3040+3600]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r2250, {low, high}; +} +{ +neg.f16x2 r2251, r2250; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2254, {low, high}; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2262, r2263; +} +{ +add.f16x2 r2264, r2258, r2261; +} +{ +add.f16x2 r2267, r2268, r2269; +} +{ +add.f16x2 r2270, r2271, r2267; +} +{ +add.f16x2 r2273, r2274, r2275; +} +{ +add.f16x2 r2276, r2270, r2273; +} +{ +add.f16x2 r2279, r2256, r2257; +} +{ +mul.f16x2 r2282, r2279, 
r2245; +} +{ +add.f16x2 r2285, r2259, r2282; +} +{ +add.f16x2 r2288, r2262, r2263; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r2268, r2269; +} +{ +mul.f16x2 r2300, r2297, r2247; +} +{ +sub.f16x2 r2303, r2274, r2275; +} +{ +mul.f16x2 r2306, r2303, r2251; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +{ +add.f16x2 r2315, r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +add.f16x2 r2324, r2262, r2263; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r2268, r2269; +} +{ +mul.f16x2 r2336, r2333, r2247; +} +{ +sub.f16x2 r2339, r2274, r2275; +} +{ +mul.f16x2 r2342, r2339, r2251; +} +{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +{ +add.f16x2 r2351, r2256, r2257; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r2259, r2354; +} +{ +add.f16x2 r2360, r2262, r2263; +} +{ +mul.f16x2 r2363, r2360, r2253; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r2268, r2269; +} +{ +mul.f16x2 r2372, r2369, r2251; +} +{ +sub.f16x2 r2375, r2274, r2275; +} +{ +mul.f16x2 r2378, r2375, r2254; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +{ +add.f16x2 r2387, r2256, r2257; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r2259, r2390; +} +{ +add.f16x2 r2396, r2262, r2263; +} +{ +mul.f16x2 r2399, r2396, r2253; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r2268, r2269; +} +{ +mul.f16x2 r2408, r2405, r2251; +} +{ +sub.f16x2 r2411, r2274, r2275; +} +{ +mul.f16x2 r2414, r2411, r2254; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +{ +add.f16x2 r2423, r2268, r2269; +} +{ +mul.f16x2 r2426, r2423, r2245; +} +{ +add.f16x2 r2429, r2271, r2426; +} +{ +add.f16x2 r2432, r2274, r2275; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 
r2441, r2256, r2257; +} +{ +mul.f16x2 r2444, r2441, r2247; +} +{ +sub.f16x2 r2447, r2262, r2263; +} +{ +mul.f16x2 r2450, r2447, r2251; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +{ +add.f16x2 r2459, r2268, r2269; +} +{ +mul.f16x2 r2462, r2459, r2245; +} +{ +add.f16x2 r2465, r2271, r2462; +} +{ +add.f16x2 r2468, r2274, r2275; +} +{ +mul.f16x2 r2471, r2468, r2249; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r2256, r2257; +} +{ +mul.f16x2 r2480, r2477, r2247; +} +{ +sub.f16x2 r2483, r2262, r2263; +} +{ +mul.f16x2 r2486, r2483, r2251; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +{ +add.f16x2 r2495, r2268, r2269; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, r2271, r2498; +} +{ +add.f16x2 r2504, r2274, r2275; +} +{ +mul.f16x2 r2507, r2504, r2253; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r2256, r2257; +} +{ +mul.f16x2 r2516, r2513, r2251; +} +{ +sub.f16x2 r2519, r2262, r2263; +} +{ +mul.f16x2 r2522, r2519, r2254; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +{ +add.f16x2 r2531, r2268, r2269; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r2271, r2534; +} +{ +add.f16x2 r2540, r2274, r2275; +} +{ +mul.f16x2 r2543, r2540, r2253; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r2256, r2257; +} +{ +mul.f16x2 r2552, r2549, r2251; +} +{ +sub.f16x2 r2555, r2262, r2263; +} +{ +mul.f16x2 r2558, r2555, r2254; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2568, {low, high}; +} +{ +neg.f16x2 r2569, r2568; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r2576, {low, high}; +} +{ +add.f16x2 r2577, r2578, r2579; +} +{ +add.f16x2 r2580, r2581, r2577; +} +{ +add.f16x2 r2583, r2584, r2585; +} +{ +add.f16x2 r2586, r2580, r2583; +} +{ +add.f16x2 r2589, r2590, r2591; +} +{ +add.f16x2 r2592, r2593, r2589; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2592, r2595; +} +{ +add.f16x2 r2601, r2578, r2579; +} +{ +mul.f16x2 r2604, r2601, r2567; +} +{ +add.f16x2 r2607, r2581, r2604; +} +{ +add.f16x2 r2610, r2584, r2585; +} +{ +mul.f16x2 r2613, r2610, r2571; +} +{ +add.f16x2 r2616, r2607, r2613; +} +{ +sub.f16x2 r2619, r2590, r2591; +} +{ +mul.f16x2 r2622, r2619, r2569; +} +{ +sub.f16x2 r2625, r2596, r2597; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r2622, r2628; +} +{ +sub.f16x2 r2634, r2616, r2631; +} +{ +add.f16x2 r2637, r2578, r2579; +} +{ +mul.f16x2 r2640, r2637, r2567; +} +{ +add.f16x2 r2643, r2581, r2640; +} +{ +add.f16x2 r2646, r2584, r2585; +} +{ +mul.f16x2 r2649, r2646, r2571; +} +{ +add.f16x2 r2652, r2643, r2649; +} +{ +sub.f16x2 r2655, r2590, r2591; +} +{ +mul.f16x2 r2658, r2655, r2569; +} +{ +sub.f16x2 r2661, r2596, r2597; +} +{ +mul.f16x2 r2664, r2661, r2573; +} +{ +add.f16x2 r2667, r2658, r2664; +} +{ +add.f16x2 r2670, r2652, r2667; +} +{ +add.f16x2 r2673, r2578, r2579; +} +{ +mul.f16x2 r2676, r2673, r2571; +} +{ +add.f16x2 r2679, r2581, r2676; +} +{ +add.f16x2 r2682, r2584, r2585; +} +{ +mul.f16x2 r2685, r2682, r2575; +} +{ +add.f16x2 r2688, r2679, r2685; +} +{ +sub.f16x2 r2691, r2590, r2591; +} +{ +mul.f16x2 r2694, r2691, r2573; +} +{ +sub.f16x2 r2697, r2596, r2597; +} +{ +mul.f16x2 r2700, r2697, r2576; +} +{ +add.f16x2 r2703, r2694, r2700; +} +{ +sub.f16x2 r2706, r2688, r2703; +} +{ 
+add.f16x2 r2709, r2578, r2579; +} +{ +mul.f16x2 r2712, r2709, r2571; +} +{ +add.f16x2 r2715, r2581, r2712; +} +{ +add.f16x2 r2718, r2584, r2585; +} +{ +mul.f16x2 r2721, r2718, r2575; +} +{ +add.f16x2 r2724, r2715, r2721; +} +{ +sub.f16x2 r2727, r2590, r2591; +} +{ +mul.f16x2 r2730, r2727, r2573; +} +{ +sub.f16x2 r2733, r2596, r2597; +} +{ +mul.f16x2 r2736, r2733, r2576; +} +{ +add.f16x2 r2739, r2730, r2736; +} +{ +add.f16x2 r2742, r2724, r2739; +} +{ +add.f16x2 r2745, r2590, r2591; +} +{ +mul.f16x2 r2748, r2745, r2567; +} +{ +add.f16x2 r2751, r2593, r2748; +} +{ +add.f16x2 r2754, r2596, r2597; +} +{ +mul.f16x2 r2757, r2754, r2571; +} +{ +add.f16x2 r2760, r2751, r2757; +} +{ +sub.f16x2 r2763, r2578, r2579; +} +{ +mul.f16x2 r2766, r2763, r2569; +} +{ +sub.f16x2 r2769, r2584, r2585; +} +{ +mul.f16x2 r2772, r2769, r2573; +} +{ +add.f16x2 r2775, r2766, r2772; +} +{ +add.f16x2 r2778, r2760, r2775; +} +{ +add.f16x2 r2781, r2590, r2591; +} +{ +mul.f16x2 r2784, r2781, r2567; +} +{ +add.f16x2 r2787, r2593, r2784; +} +{ +add.f16x2 r2790, r2596, r2597; +} +{ +mul.f16x2 r2793, r2790, r2571; +} +{ +add.f16x2 r2796, r2787, r2793; +} +{ +sub.f16x2 r2799, r2578, r2579; +} +{ +mul.f16x2 r2802, r2799, r2569; +} +{ +sub.f16x2 r2805, r2584, r2585; +} +{ +mul.f16x2 r2808, r2805, r2573; +} +{ +add.f16x2 r2811, r2802, r2808; +} +{ +sub.f16x2 r2814, r2796, r2811; +} +{ +add.f16x2 r2817, r2590, r2591; +} +{ +mul.f16x2 r2820, r2817, r2571; +} +{ +add.f16x2 r2823, r2593, r2820; +} +{ +add.f16x2 r2826, r2596, r2597; +} +{ +mul.f16x2 r2829, r2826, r2575; +} +{ +add.f16x2 r2832, r2823, r2829; +} +{ +sub.f16x2 r2835, r2578, r2579; +} +{ +mul.f16x2 r2838, r2835, r2573; +} +{ +sub.f16x2 r2841, r2584, r2585; +} +{ +mul.f16x2 r2844, r2841, r2576; +} +{ +add.f16x2 r2847, r2838, r2844; +} +{ +add.f16x2 r2850, r2832, r2847; +} +{ +add.f16x2 r2853, r2590, r2591; +} +{ +mul.f16x2 r2856, r2853, r2571; +} +{ +add.f16x2 r2859, r2593, r2856; +} +{ +add.f16x2 r2862, r2596, r2597; +} +{ +mul.f16x2 r2865, 
r2862, r2575; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 r2871, r2578, r2579; +} +{ +mul.f16x2 r2874, r2871, r2573; +} +{ +sub.f16x2 r2877, r2584, r2585; +} +{ +mul.f16x2 r2880, r2877, r2576; +} +{ +add.f16x2 r2883, r2874, r2880; +} +{ +sub.f16x2 r2886, r2868, r2883; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2892, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r2893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2894, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f204; +cvt.rn.f16.f32 high, f204; +mov.b32 r2896, {low, high}; +} +{ +mul.f16x2 r2907, r2634, r2889; +} +{ +mul.f16x2 r2910, r2778, r2890; +} +{ +sub.f16x2 r2913, r2907, r2910; +} +{ +mul.f16x2 r2916, r2634, r2890; +} +{ +fma.rn.f16x2 r2919, r2778, r2889, r2916; +} +{ +mul.f16x2 r2923, r2706, r2891; +} +{ +mul.f16x2 r2926, r2850, r2892; +} +{ +sub.f16x2 r2929, r2923, r2926; +} +{ +mul.f16x2 r2932, r2706, r2892; +} +{ +fma.rn.f16x2 r2935, r2850, r2891, r2932; +} +{ +mul.f16x2 r2939, r2742, r2893; +} +{ +mul.f16x2 r2942, r2886, r2894; +} +{ +sub.f16x2 r2945, r2939, r2942; +} +{ +mul.f16x2 r2948, r2742, r2894; +} +{ +fma.rn.f16x2 r2951, r2886, r2893, r2948; +} +{ +mul.f16x2 r2955, r2670, r2895; +} +{ +mul.f16x2 r2958, r2814, r2896; +} +{ +sub.f16x2 r2961, r2955, r2958; +} +{ +mul.f16x2 r2964, r2670, r2896; +} +{ +fma.rn.f16x2 r2967, r2814, r2895, r2964; +} +{ +add.f16x2 %0, 
r2264, r2586; +} +{ +add.f16x2 %1, r2276, r2598; +} +{ +sub.f16x2 %10, r2264, r2586; +} +{ +sub.f16x2 %11, r2276, r2598; +} +{ +add.f16x2 %2, r2312, r2913; +} +{ +add.f16x2 %3, r2456, r2919; +} +{ +sub.f16x2 %12, r2312, r2913; +} +{ +sub.f16x2 %13, r2456, r2919; +} +{ +add.f16x2 %4, r2384, r2929; +} +{ +add.f16x2 %5, r2528, r2935; +} +{ +sub.f16x2 %14, r2384, r2929; +} +{ +sub.f16x2 %15, r2528, r2935; +} +{ +add.f16x2 %6, r2420, r2945; +} +{ +add.f16x2 %7, r2564, r2951; +} +{ +sub.f16x2 %16, r2420, r2945; +} +{ +sub.f16x2 %17, r2564, r2951; +} +{ +add.f16x2 %8, r2348, r2961; +} +{ +add.f16x2 %9, r2492, r2967; +} +{ +sub.f16x2 %18, r2348, r2961; +} +{ +sub.f16x2 %19, r2492, r2967; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..a1e851ea29d7d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_fwd.hpp.inc @@ -0,0 +1,1316 @@ +#ifndef CUFFTDX_FFT_1000_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_1000_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<193, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<683>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 8000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %28, %44; +add.f32 f42, %23, f41; +add.f32 f43, %33, %39; +add.f32 f44, f43, f42; +add.f32 f45, %30, %46; +add.f32 f46, %24, f45; +add.f32 f47, %35, %40; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %23; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %30, %46; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %35, %40; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %23, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %24; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %28, %44; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %33, %39; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %24, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; 
+sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %31, %47; +add.f32 f86, %25, f85; +add.f32 f87, %36, %41; +add.f32 f88, f87, f86; +add.f32 f89, %32, %48; +add.f32 f90, %27, f89; +add.f32 f91, %38, %43; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %25; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %32, %48; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %38, %43; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %25, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %27; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %31, %47; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %36, %41; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %27, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, 
f147; +sub.f32 f149, f44, f88; +sub.f32 f150, f48, f92; +add.f32 f151, f57, f131; +add.f32 f152, f75, f133; +sub.f32 f153, f57, f131; +sub.f32 f154, f75, f133; +add.f32 f155, f65, f136; +add.f32 f156, f83, f138; +sub.f32 f157, f65, f136; +sub.f32 f158, f83, f138; +add.f32 f159, f66, f141; +add.f32 f160, f84, f143; +sub.f32 f161, f66, f141; +sub.f32 f162, f84, f143; +add.f32 f163, f58, f146; +add.f32 f164, f76, f148; +sub.f32 f165, f58, f146; +sub.f32 f166, f76, f148; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 8000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f167, f168}, [rd6]; +mul.f32 f171, f167, f151; +mul.f32 f172, f168, f152; +mul.f32 f173, f167, f152; +mul.f32 f174, f167, f167; +mul.f32 f175, f168, f168; +sub.f32 f176, f174, f175; +mul.f32 f177, f168, f167; +fma.rn.f32 f178, f168, f167, f177; +mul.f32 f179, f176, f155; +mul.f32 f180, f178, f156; +mul.f32 f181, f176, f156; +mul.f32 f182, f167, f176; +mul.f32 f183, f168, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f167, f178; +fma.rn.f32 f186, f168, f176, f185; +mul.f32 f187, f184, f159; +mul.f32 f188, f186, f160; +mul.f32 f189, f184, f160; +mul.f32 f190, f167, f184; +mul.f32 f191, f168, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f167, f186; +fma.rn.f32 f194, f168, f184, f193; +mul.f32 f195, f192, f163; +mul.f32 f196, f194, f164; +mul.f32 f197, f192, f164; +mul.f32 f198, f167, f192; +mul.f32 f199, f168, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f167, f194; +fma.rn.f32 f202, f168, f192, f201; +mul.f32 f203, f200, f149; +mul.f32 f204, f202, f150; +mul.f32 f205, f200, f150; +mul.f32 f206, f167, f200; +mul.f32 f207, f168, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f167, f202; +fma.rn.f32 f210, f168, f200, f209; +mul.f32 f211, f208, f153; +mul.f32 f212, f210, f154; +mul.f32 f213, f208, f154; +mul.f32 f214, f167, f208; +mul.f32 f215, f168, f210; 
+sub.f32 f216, f214, f215; +mul.f32 f217, f167, f210; +fma.rn.f32 f218, f168, f208, f217; +mul.f32 f219, f216, f157; +mul.f32 f220, f218, f158; +mul.f32 f221, f216, f158; +mul.f32 f222, f167, f216; +mul.f32 f223, f168, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f167, f218; +fma.rn.f32 f226, f168, f216, f225; +mul.f32 f227, f224, f161; +mul.f32 f228, f226, f162; +mul.f32 f229, f224, f162; +mul.f32 f230, f167, f224; +mul.f32 f231, f168, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f167, f226; +fma.rn.f32 f234, f168, f224, f233; +mul.f32 f235, f232, f165; +mul.f32 f236, f234, f166; +mul.f32 f237, f232, f166; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f238, f48, f92; +add.f32 f239, f44, f88; +st.shared.v2.f32 [r9], {f239, f238}; +fma.rn.f32 f240, f168, f151, f173; +sub.f32 f241, f171, f172; +st.shared.v2.f32 [r9+8], {f241, f240}; +fma.rn.f32 f242, f178, f155, f181; +sub.f32 f243, f179, f180; +st.shared.v2.f32 [r9+16], {f243, f242}; +fma.rn.f32 f244, f186, f159, f189; +sub.f32 f245, f187, f188; +st.shared.v2.f32 [r9+24], {f245, f244}; +sub.f32 f246, f195, f196; +fma.rn.f32 f247, f194, f163, f197; +st.shared.v2.f32 [r9+32], {f246, f247}; +fma.rn.f32 f248, f202, f149, f205; +sub.f32 f249, f203, f204; +st.shared.v2.f32 [r9+40], {f249, f248}; +fma.rn.f32 f250, f210, f153, f213; +sub.f32 f251, f211, f212; +st.shared.v2.f32 [r9+48], {f251, f250}; +fma.rn.f32 f252, f218, f157, f221; +sub.f32 f253, f219, f220; +st.shared.v2.f32 [r9+56], {f253, f252}; +fma.rn.f32 f254, f226, f161, f229; +sub.f32 f255, f227, f228; +st.shared.v2.f32 [r9+64], {f255, f254}; +fma.rn.f32 f256, f234, f165, f237; +sub.f32 f257, f235, f236; +st.shared.v2.f32 [r9+72], {f257, f256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f258, f259}, [r10]; +ld.shared.v2.f32 {f262, f263}, [r10+800]; +ld.shared.v2.f32 {f266, f267}, [r10+1600]; +ld.shared.v2.f32 {f270, f271}, [r10+2400]; +ld.shared.v2.f32 {f274, f275}, [r10+3200]; +ld.shared.v2.f32 {f278, f279}, [r10+4000]; 
+ld.shared.v2.f32 {f282, f283}, [r10+4800]; +ld.shared.v2.f32 {f286, f287}, [r10+5600]; +ld.shared.v2.f32 {f290, f291}, [r10+6400]; +ld.shared.v2.f32 {f294, f295}, [r10+7200]; +add.f32 f298, f266, f290; +add.f32 f299, f258, f298; +add.f32 f300, f274, f282; +add.f32 f301, f300, f299; +add.f32 f302, f267, f291; +add.f32 f303, f259, f302; +add.f32 f304, f275, f283; +add.f32 f305, f304, f303; +fma.rn.f32 f306, f298, 0f3E9E377A, f258; +mul.f32 f307, f300, 0f3F4F1BBD; +sub.f32 f308, f306, f307; +sub.f32 f309, f267, f291; +mul.f32 f310, f309, 0f3F737871; +sub.f32 f311, f275, f283; +mul.f32 f312, f311, 0fBF167918; +sub.f32 f313, f312, f310; +sub.f32 f314, f308, f313; +add.f32 f315, f313, f308; +mul.f32 f316, f298, 0f3F4F1BBD; +sub.f32 f317, f258, f316; +fma.rn.f32 f318, f300, 0f3E9E377A, f317; +mul.f32 f319, f309, 0f3F167918; +mul.f32 f320, f311, 0f3F737871; +sub.f32 f321, f320, f319; +sub.f32 f322, f318, f321; +add.f32 f323, f321, f318; +fma.rn.f32 f324, f302, 0f3E9E377A, f259; +mul.f32 f325, f304, 0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f266, f290; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f274, f282; +mul.f32 f330, f329, 0fBF167918; +sub.f32 f331, f330, f328; +add.f32 f332, f331, f326; +sub.f32 f333, f326, f331; +mul.f32 f334, f302, 0f3F4F1BBD; +sub.f32 f335, f259, f334; +fma.rn.f32 f336, f304, 0f3E9E377A, f335; +mul.f32 f337, f327, 0f3F167918; +mul.f32 f338, f329, 0f3F737871; +sub.f32 f339, f338, f337; +add.f32 f340, f339, f336; +sub.f32 f341, f336, f339; +add.f32 f342, f270, f294; +add.f32 f343, f262, f342; +add.f32 f344, f278, f286; +add.f32 f345, f344, f343; +add.f32 f346, f271, f295; +add.f32 f347, f263, f346; +add.f32 f348, f279, f287; +add.f32 f349, f348, f347; +fma.rn.f32 f350, f342, 0f3E9E377A, f262; +mul.f32 f351, f344, 0f3F4F1BBD; +sub.f32 f352, f350, f351; +sub.f32 f353, f271, f295; +mul.f32 f354, f353, 0f3F737871; +sub.f32 f355, f279, f287; +mul.f32 f356, f355, 0fBF167918; +sub.f32 f357, f356, f354; +sub.f32 f358, f352, f357; +add.f32 
f359, f357, f352; +mul.f32 f360, f342, 0f3F4F1BBD; +sub.f32 f361, f262, f360; +fma.rn.f32 f362, f344, 0f3E9E377A, f361; +mul.f32 f363, f353, 0f3F167918; +mul.f32 f364, f355, 0f3F737871; +sub.f32 f365, f364, f363; +sub.f32 f366, f362, f365; +add.f32 f367, f365, f362; +fma.rn.f32 f368, f346, 0f3E9E377A, f263; +mul.f32 f369, f348, 0f3F4F1BBD; +sub.f32 f370, f368, f369; +sub.f32 f371, f270, f294; +mul.f32 f372, f371, 0f3F737871; +sub.f32 f373, f278, f286; +mul.f32 f374, f373, 0fBF167918; +sub.f32 f375, f374, f372; +add.f32 f376, f375, f370; +sub.f32 f377, f370, f375; +mul.f32 f378, f346, 0f3F4F1BBD; +sub.f32 f379, f263, f378; +fma.rn.f32 f380, f348, 0f3E9E377A, f379; +mul.f32 f381, f371, 0f3F167918; +mul.f32 f382, f373, 0f3F737871; +sub.f32 f383, f382, f381; +add.f32 f384, f383, f380; +sub.f32 f385, f380, f383; +mul.f32 f386, f358, 0f3F4F1BBD; +mul.f32 f387, f376, 0fBF167918; +sub.f32 f388, f386, f387; +mul.f32 f389, f376, 0f3F4F1BBD; +fma.rn.f32 f390, f358, 0fBF167918, f389; +mul.f32 f391, f366, 0f3E9E377A; +mul.f32 f392, f384, 0fBF737871; +sub.f32 f393, f391, f392; +mul.f32 f394, f384, 0f3E9E377A; +fma.rn.f32 f395, f366, 0fBF737871, f394; +mul.f32 f396, f367, 0fBE9E377A; +mul.f32 f397, f385, 0fBF737871; +sub.f32 f398, f396, f397; +mul.f32 f399, f385, 0fBE9E377A; +fma.rn.f32 f400, f367, 0fBF737871, f399; +mul.f32 f401, f359, 0fBF4F1BBD; +mul.f32 f402, f377, 0fBF167918; +sub.f32 f403, f401, f402; +mul.f32 f404, f377, 0fBF4F1BBD; +fma.rn.f32 f405, f359, 0fBF167918, f404; +sub.f32 f406, f301, f345; +sub.f32 f407, f305, f349; +add.f32 f408, f314, f388; +add.f32 f409, f332, f390; +sub.f32 f410, f314, f388; +sub.f32 f411, f332, f390; +add.f32 f412, f322, f393; +add.f32 f413, f340, f395; +sub.f32 f414, f322, f393; +sub.f32 f415, f340, f395; +add.f32 f416, f323, f398; +add.f32 f417, f341, f400; +sub.f32 f418, f323, f398; +sub.f32 f419, f341, f400; +add.f32 f420, f315, f403; +add.f32 f421, f333, f405; +sub.f32 f422, f315, f403; +sub.f32 f423, f333, f405; +mul.wide.u32 rd7, r7, 
-858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f424, f425}, [rd11]; +mul.f32 f428, f424, f408; +mul.f32 f429, f425, f409; +mul.f32 f430, f424, f409; +mul.f32 f431, f424, f424; +mul.f32 f432, f425, f425; +sub.f32 f433, f431, f432; +mul.f32 f434, f425, f424; +fma.rn.f32 f435, f425, f424, f434; +mul.f32 f436, f433, f412; +mul.f32 f437, f435, f413; +mul.f32 f438, f433, f413; +mul.f32 f439, f424, f433; +mul.f32 f440, f425, f435; +sub.f32 f441, f439, f440; +mul.f32 f442, f424, f435; +fma.rn.f32 f443, f425, f433, f442; +mul.f32 f444, f441, f416; +mul.f32 f445, f443, f417; +mul.f32 f446, f441, f417; +mul.f32 f447, f424, f441; +mul.f32 f448, f425, f443; +sub.f32 f449, f447, f448; +mul.f32 f450, f424, f443; +fma.rn.f32 f451, f425, f441, f450; +mul.f32 f452, f449, f420; +mul.f32 f453, f451, f421; +mul.f32 f454, f449, f421; +mul.f32 f455, f424, f449; +mul.f32 f456, f425, f451; +sub.f32 f457, f455, f456; +mul.f32 f458, f424, f451; +fma.rn.f32 f459, f425, f449, f458; +mul.f32 f460, f457, f406; +mul.f32 f461, f459, f407; +mul.f32 f462, f457, f407; +mul.f32 f463, f424, f457; +mul.f32 f464, f425, f459; +sub.f32 f465, f463, f464; +mul.f32 f466, f424, f459; +fma.rn.f32 f467, f425, f457, f466; +mul.f32 f468, f465, f410; +mul.f32 f469, f467, f411; +mul.f32 f470, f465, f411; +mul.f32 f471, f424, f465; +mul.f32 f472, f425, f467; +sub.f32 f473, f471, f472; +mul.f32 f474, f424, f467; +fma.rn.f32 f475, f425, f465, f474; +mul.f32 f476, f473, f414; +mul.f32 f477, f475, f415; +mul.f32 f478, f473, f415; +mul.f32 f479, f424, f473; +mul.f32 f480, f425, f475; +sub.f32 f481, f479, f480; +mul.f32 f482, f424, f475; +fma.rn.f32 f483, f425, f473, f482; +mul.f32 f484, f481, f418; +mul.f32 f485, f483, f419; +mul.f32 f486, f481, f419; +mul.f32 f487, f424, f481; +mul.f32 f488, f425, f483; +sub.f32 f489, f487, f488; +mul.f32 f490, f424, f483; +fma.rn.f32 
f491, f425, f481, f490; +mul.f32 f492, f489, f422; +mul.f32 f493, f491, f423; +mul.f32 f494, f489, f423; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +add.f32 f495, f305, f349; +add.f32 f496, f301, f345; +st.shared.v2.f32 [r16], {f496, f495}; +fma.rn.f32 f497, f425, f408, f430; +sub.f32 f498, f428, f429; +st.shared.v2.f32 [r16+80], {f498, f497}; +fma.rn.f32 f499, f435, f412, f438; +sub.f32 f500, f436, f437; +st.shared.v2.f32 [r16+160], {f500, f499}; +fma.rn.f32 f501, f443, f416, f446; +sub.f32 f502, f444, f445; +st.shared.v2.f32 [r16+240], {f502, f501}; +fma.rn.f32 f503, f451, f420, f454; +sub.f32 f504, f452, f453; +st.shared.v2.f32 [r16+320], {f504, f503}; +sub.f32 f505, f460, f461; +fma.rn.f32 f506, f459, f406, f462; +st.shared.v2.f32 [r16+400], {f505, f506}; +sub.f32 f507, f468, f469; +fma.rn.f32 f508, f467, f410, f470; +st.shared.v2.f32 [r16+480], {f507, f508}; +fma.rn.f32 f509, f475, f414, f478; +sub.f32 f510, f476, f477; +st.shared.v2.f32 [r16+560], {f510, f509}; +fma.rn.f32 f511, f483, f418, f486; +sub.f32 f512, f484, f485; +st.shared.v2.f32 [r16+640], {f512, f511}; +fma.rn.f32 f513, f491, f422, f494; +sub.f32 f514, f492, f493; +st.shared.v2.f32 [r16+720], {f514, f513}; +barrier.sync 0; +ld.shared.v2.f32 {f515, f516}, [r10]; +ld.shared.v2.f32 {f519, f520}, [r10+800]; +ld.shared.v2.f32 {f523, f524}, [r10+1600]; +ld.shared.v2.f32 {f527, f528}, [r10+2400]; +ld.shared.v2.f32 {f531, f532}, [r10+3200]; +ld.shared.v2.f32 {f535, f536}, [r10+4000]; +ld.shared.v2.f32 {f539, f540}, [r10+4800]; +ld.shared.v2.f32 {f543, f544}, [r10+5600]; +ld.shared.v2.f32 {f547, f548}, [r10+6400]; +ld.shared.v2.f32 {f551, f552}, [r10+7200]; +add.f32 f555, f523, f547; +add.f32 f556, f515, f555; +add.f32 f557, f531, f539; +add.f32 f558, f557, f556; +add.f32 f559, f524, f548; +add.f32 f560, f516, f559; +add.f32 f561, f532, f540; +add.f32 f562, f561, f560; +fma.rn.f32 f563, f555, 0f3E9E377A, f515; +mul.f32 f564, f557, 0f3F4F1BBD; +sub.f32 
f565, f563, f564; +sub.f32 f566, f524, f548; +mul.f32 f567, f566, 0f3F737871; +sub.f32 f568, f532, f540; +mul.f32 f569, f568, 0fBF167918; +sub.f32 f570, f569, f567; +sub.f32 f571, f565, f570; +add.f32 f572, f570, f565; +mul.f32 f573, f555, 0f3F4F1BBD; +sub.f32 f574, f515, f573; +fma.rn.f32 f575, f557, 0f3E9E377A, f574; +mul.f32 f576, f566, 0f3F167918; +mul.f32 f577, f568, 0f3F737871; +sub.f32 f578, f577, f576; +sub.f32 f579, f575, f578; +add.f32 f580, f578, f575; +fma.rn.f32 f581, f559, 0f3E9E377A, f516; +mul.f32 f582, f561, 0f3F4F1BBD; +sub.f32 f583, f581, f582; +sub.f32 f584, f523, f547; +mul.f32 f585, f584, 0f3F737871; +sub.f32 f586, f531, f539; +mul.f32 f587, f586, 0fBF167918; +sub.f32 f588, f587, f585; +add.f32 f589, f588, f583; +sub.f32 f590, f583, f588; +mul.f32 f591, f559, 0f3F4F1BBD; +sub.f32 f592, f516, f591; +fma.rn.f32 f593, f561, 0f3E9E377A, f592; +mul.f32 f594, f584, 0f3F167918; +mul.f32 f595, f586, 0f3F737871; +sub.f32 f596, f595, f594; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +add.f32 f599, f527, f551; +add.f32 f600, f519, f599; +add.f32 f601, f535, f543; +add.f32 f602, f601, f600; +add.f32 f603, f528, f552; +add.f32 f604, f520, f603; +add.f32 f605, f536, f544; +add.f32 f606, f605, f604; +fma.rn.f32 f607, f599, 0f3E9E377A, f519; +mul.f32 f608, f601, 0f3F4F1BBD; +sub.f32 f609, f607, f608; +sub.f32 f610, f528, f552; +mul.f32 f611, f610, 0f3F737871; +sub.f32 f612, f536, f544; +mul.f32 f613, f612, 0fBF167918; +sub.f32 f614, f613, f611; +sub.f32 f615, f609, f614; +add.f32 f616, f614, f609; +mul.f32 f617, f599, 0f3F4F1BBD; +sub.f32 f618, f519, f617; +fma.rn.f32 f619, f601, 0f3E9E377A, f618; +mul.f32 f620, f610, 0f3F167918; +mul.f32 f621, f612, 0f3F737871; +sub.f32 f622, f621, f620; +sub.f32 f623, f619, f622; +add.f32 f624, f622, f619; +fma.rn.f32 f625, f603, 0f3E9E377A, f520; +mul.f32 f626, f605, 0f3F4F1BBD; +sub.f32 f627, f625, f626; +sub.f32 f628, f527, f551; +mul.f32 f629, f628, 0f3F737871; +sub.f32 f630, f535, f543; +mul.f32 f631, f630, 
0fBF167918; +sub.f32 f632, f631, f629; +add.f32 f633, f632, f627; +sub.f32 f634, f627, f632; +mul.f32 f635, f603, 0f3F4F1BBD; +sub.f32 f636, f520, f635; +fma.rn.f32 f637, f605, 0f3E9E377A, f636; +mul.f32 f638, f628, 0f3F167918; +mul.f32 f639, f630, 0f3F737871; +sub.f32 f640, f639, f638; +add.f32 f641, f640, f637; +sub.f32 f642, f637, f640; +mul.f32 f643, f615, 0f3F4F1BBD; +mul.f32 f644, f633, 0fBF167918; +sub.f32 f645, f643, f644; +mul.f32 f646, f633, 0f3F4F1BBD; +fma.rn.f32 f647, f615, 0fBF167918, f646; +mul.f32 f648, f623, 0f3E9E377A; +mul.f32 f649, f641, 0fBF737871; +sub.f32 f650, f648, f649; +mul.f32 f651, f641, 0f3E9E377A; +fma.rn.f32 f652, f623, 0fBF737871, f651; +mul.f32 f653, f624, 0fBE9E377A; +mul.f32 f654, f642, 0fBF737871; +sub.f32 f655, f653, f654; +mul.f32 f656, f642, 0fBE9E377A; +fma.rn.f32 f657, f624, 0fBF737871, f656; +mul.f32 f658, f616, 0fBF4F1BBD; +mul.f32 f659, f634, 0fBF167918; +sub.f32 f660, f658, f659; +mul.f32 f661, f634, 0fBF4F1BBD; +fma.rn.f32 f662, f616, 0fBF167918, f661; +add.f32 %1, f562, f606; +add.f32 %0, f558, f602; +add.f32 %3, f589, f647; +add.f32 %2, f571, f645; +add.f32 %5, f597, f652; +add.f32 %4, f579, f650; +add.f32 %7, f598, f657; +add.f32 %6, f580, f655; +add.f32 %9, f590, f662; +add.f32 %8, f572, f660; +sub.f32 %11, f562, f606; +sub.f32 %10, f558, f602; +sub.f32 %13, f589, f647; +sub.f32 %12, f571, f645; +sub.f32 %15, f597, f652; +sub.f32 %14, f579, f650; +sub.f32 %17, f598, f657; +sub.f32 %16, f580, f655; +sub.f32 %19, f590, f662; +sub.f32 %18, f572, f660; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), 
"f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<194, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<643>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 4000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %28, %44; +add.f32 f42, %23, f41; +add.f32 f43, %33, %39; +add.f32 f44, f43, f42; +add.f32 f45, %30, %46; +add.f32 f46, %24, f45; +add.f32 f47, %35, %40; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %23; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %30, %46; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %35, %40; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %23, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %24; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %28, %44; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %33, %39; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %24, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %31, %47; +add.f32 f86, %25, f85; +add.f32 f87, %36, %41; +add.f32 f88, f87, f86; +add.f32 f89, %32, %48; +add.f32 f90, 
%27, f89; +add.f32 f91, %38, %43; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %25; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %32, %48; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %38, %43; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %25, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %27; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %31, %47; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %36, %41; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %27, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +add.f32 f149, f44, f88; +add.f32 f150, f48, f92; +sub.f32 f151, f44, f88; +sub.f32 f152, f48, f92; +add.f32 f153, f57, f131; +add.f32 f154, f75, f133; +sub.f32 f155, f57, f131; +sub.f32 f156, f75, 
f133; +add.f32 f157, f65, f136; +add.f32 f158, f83, f138; +sub.f32 f159, f65, f136; +sub.f32 f160, f83, f138; +add.f32 f161, f66, f141; +add.f32 f162, f84, f143; +sub.f32 f163, f66, f141; +sub.f32 f164, f84, f143; +add.f32 f165, f58, f146; +add.f32 f166, f76, f148; +sub.f32 f167, f58, f146; +sub.f32 f168, f76, f148; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f169, f170}, [rd6]; +mul.f32 f173, f169, f153; +mul.f32 f174, f170, f154; +sub.f32 f175, f173, f174; +mul.f32 f176, f169, f154; +fma.rn.f32 f177, f170, f153, f176; +mul.f32 f178, f169, f169; +mul.f32 f179, f170, f170; +sub.f32 f180, f178, f179; +mul.f32 f181, f170, f169; +fma.rn.f32 f182, f170, f169, f181; +mul.f32 f183, f180, f157; +mul.f32 f184, f182, f158; +sub.f32 f185, f183, f184; +mul.f32 f186, f180, f158; +fma.rn.f32 f187, f182, f157, f186; +mul.f32 f188, f169, f180; +mul.f32 f189, f170, f182; +sub.f32 f190, f188, f189; +mul.f32 f191, f169, f182; +fma.rn.f32 f192, f170, f180, f191; +mul.f32 f193, f190, f161; +mul.f32 f194, f192, f162; +sub.f32 f195, f193, f194; +mul.f32 f196, f190, f162; +fma.rn.f32 f197, f192, f161, f196; +mul.f32 f198, f169, f190; +mul.f32 f199, f170, f192; +sub.f32 f200, f198, f199; +mul.f32 f201, f169, f192; +fma.rn.f32 f202, f170, f190, f201; +mul.f32 f203, f200, f165; +mul.f32 f204, f202, f166; +sub.f32 f205, f203, f204; +mul.f32 f206, f200, f166; +fma.rn.f32 f207, f202, f165, f206; +mul.f32 f208, f169, f200; +mul.f32 f209, f170, f202; +sub.f32 f210, f208, f209; +mul.f32 f211, f169, f202; +fma.rn.f32 f212, f170, f200, f211; +mul.f32 f213, f210, f151; +mul.f32 f214, f212, f152; +sub.f32 f215, f213, f214; +mul.f32 f216, f210, f152; +fma.rn.f32 f217, f212, f151, f216; +mul.f32 f218, f169, f210; +mul.f32 f219, f170, f212; +sub.f32 f220, f218, f219; +mul.f32 f221, f169, f212; +fma.rn.f32 f222, f170, f210, f221; 
+mul.f32 f223, f220, f155; +mul.f32 f224, f222, f156; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, f156; +fma.rn.f32 f227, f222, f155, f226; +mul.f32 f228, f169, f220; +mul.f32 f229, f170, f222; +sub.f32 f230, f228, f229; +mul.f32 f231, f169, f222; +fma.rn.f32 f232, f170, f220, f231; +mul.f32 f233, f230, f159; +mul.f32 f234, f232, f160; +sub.f32 f235, f233, f234; +mul.f32 f236, f230, f160; +fma.rn.f32 f237, f232, f159, f236; +mul.f32 f238, f169, f230; +mul.f32 f239, f170, f232; +sub.f32 f240, f238, f239; +mul.f32 f241, f169, f232; +fma.rn.f32 f242, f170, f230, f241; +mul.f32 f243, f240, f163; +mul.f32 f244, f242, f164; +sub.f32 f245, f243, f244; +mul.f32 f246, f240, f164; +fma.rn.f32 f247, f242, f163, f246; +mul.f32 f248, f169, f240; +mul.f32 f249, f170, f242; +sub.f32 f250, f248, f249; +mul.f32 f251, f169, f242; +fma.rn.f32 f252, f170, f240, f251; +mul.f32 f253, f250, f167; +mul.f32 f254, f252, f168; +sub.f32 f255, f253, f254; +mul.f32 f256, f250, f168; +fma.rn.f32 f257, f252, f167, f256; +mad.lo.s32 r8, r5, 4000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f149, f175}; +st.shared.v2.f32 [r9+8], {f185, f195}; +st.shared.v2.f32 [r9+16], {f205, f215}; +st.shared.v2.f32 [r9+24], {f225, f235}; +st.shared.v2.f32 [r9+32], {f245, f255}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f258, [r10]; +ld.shared.f32 f259, [r10+400]; +ld.shared.f32 f260, [r10+800]; +ld.shared.f32 f261, [r10+1200]; +ld.shared.f32 f262, [r10+1600]; +ld.shared.f32 f263, [r10+2000]; +ld.shared.f32 f264, [r10+2400]; +ld.shared.f32 f265, [r10+2800]; +ld.shared.f32 f266, [r10+3200]; +ld.shared.f32 f267, [r10+3600]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f150, f177}; +st.shared.v2.f32 [r9+8], {f187, f197}; +st.shared.v2.f32 [r9+16], {f207, f217}; +st.shared.v2.f32 [r9+24], {f227, f237}; +st.shared.v2.f32 [r9+32], {f247, f257}; +barrier.sync 0; +ld.shared.f32 f268, [r10]; +ld.shared.f32 f269, [r10+400]; +ld.shared.f32 f270, [r10+800]; +ld.shared.f32 
f271, [r10+1200]; +ld.shared.f32 f272, [r10+1600]; +ld.shared.f32 f273, [r10+2000]; +ld.shared.f32 f274, [r10+2400]; +ld.shared.f32 f275, [r10+2800]; +ld.shared.f32 f276, [r10+3200]; +ld.shared.f32 f277, [r10+3600]; +add.f32 f278, f260, f266; +add.f32 f279, f258, f278; +add.f32 f280, f262, f264; +add.f32 f281, f280, f279; +add.f32 f282, f270, f276; +add.f32 f283, f268, f282; +add.f32 f284, f272, f274; +add.f32 f285, f284, f283; +fma.rn.f32 f286, f278, 0f3E9E377A, f258; +mul.f32 f287, f280, 0f3F4F1BBD; +sub.f32 f288, f286, f287; +sub.f32 f289, f270, f276; +mul.f32 f290, f289, 0f3F737871; +sub.f32 f291, f272, f274; +mul.f32 f292, f291, 0fBF167918; +sub.f32 f293, f292, f290; +sub.f32 f294, f288, f293; +add.f32 f295, f293, f288; +mul.f32 f296, f278, 0f3F4F1BBD; +sub.f32 f297, f258, f296; +fma.rn.f32 f298, f280, 0f3E9E377A, f297; +mul.f32 f299, f289, 0f3F167918; +mul.f32 f300, f291, 0f3F737871; +sub.f32 f301, f300, f299; +sub.f32 f302, f298, f301; +add.f32 f303, f301, f298; +fma.rn.f32 f304, f282, 0f3E9E377A, f268; +mul.f32 f305, f284, 0f3F4F1BBD; +sub.f32 f306, f304, f305; +sub.f32 f307, f260, f266; +mul.f32 f308, f307, 0f3F737871; +sub.f32 f309, f262, f264; +mul.f32 f310, f309, 0fBF167918; +sub.f32 f311, f310, f308; +add.f32 f312, f311, f306; +sub.f32 f313, f306, f311; +mul.f32 f314, f282, 0f3F4F1BBD; +sub.f32 f315, f268, f314; +fma.rn.f32 f316, f284, 0f3E9E377A, f315; +mul.f32 f317, f307, 0f3F167918; +mul.f32 f318, f309, 0f3F737871; +sub.f32 f319, f318, f317; +add.f32 f320, f319, f316; +sub.f32 f321, f316, f319; +add.f32 f322, f261, f267; +add.f32 f323, f259, f322; +add.f32 f324, f263, f265; +add.f32 f325, f324, f323; +add.f32 f326, f271, f277; +add.f32 f327, f269, f326; +add.f32 f328, f273, f275; +add.f32 f329, f328, f327; +fma.rn.f32 f330, f322, 0f3E9E377A, f259; +mul.f32 f331, f324, 0f3F4F1BBD; +sub.f32 f332, f330, f331; +sub.f32 f333, f271, f277; +mul.f32 f334, f333, 0f3F737871; +sub.f32 f335, f273, f275; +mul.f32 f336, f335, 0fBF167918; +sub.f32 f337, f336, 
f334; +sub.f32 f338, f332, f337; +add.f32 f339, f337, f332; +mul.f32 f340, f322, 0f3F4F1BBD; +sub.f32 f341, f259, f340; +fma.rn.f32 f342, f324, 0f3E9E377A, f341; +mul.f32 f343, f333, 0f3F167918; +mul.f32 f344, f335, 0f3F737871; +sub.f32 f345, f344, f343; +sub.f32 f346, f342, f345; +add.f32 f347, f345, f342; +fma.rn.f32 f348, f326, 0f3E9E377A, f269; +mul.f32 f349, f328, 0f3F4F1BBD; +sub.f32 f350, f348, f349; +sub.f32 f351, f261, f267; +mul.f32 f352, f351, 0f3F737871; +sub.f32 f353, f263, f265; +mul.f32 f354, f353, 0fBF167918; +sub.f32 f355, f354, f352; +add.f32 f356, f355, f350; +sub.f32 f357, f350, f355; +mul.f32 f358, f326, 0f3F4F1BBD; +sub.f32 f359, f269, f358; +fma.rn.f32 f360, f328, 0f3E9E377A, f359; +mul.f32 f361, f351, 0f3F167918; +mul.f32 f362, f353, 0f3F737871; +sub.f32 f363, f362, f361; +add.f32 f364, f363, f360; +sub.f32 f365, f360, f363; +mul.f32 f366, f338, 0f3F4F1BBD; +mul.f32 f367, f356, 0fBF167918; +sub.f32 f368, f366, f367; +mul.f32 f369, f356, 0f3F4F1BBD; +fma.rn.f32 f370, f338, 0fBF167918, f369; +mul.f32 f371, f346, 0f3E9E377A; +mul.f32 f372, f364, 0fBF737871; +sub.f32 f373, f371, f372; +mul.f32 f374, f364, 0f3E9E377A; +fma.rn.f32 f375, f346, 0fBF737871, f374; +mul.f32 f376, f347, 0fBE9E377A; +mul.f32 f377, f365, 0fBF737871; +sub.f32 f378, f376, f377; +mul.f32 f379, f365, 0fBE9E377A; +fma.rn.f32 f380, f347, 0fBF737871, f379; +mul.f32 f381, f339, 0fBF4F1BBD; +mul.f32 f382, f357, 0fBF167918; +sub.f32 f383, f381, f382; +mul.f32 f384, f357, 0fBF4F1BBD; +fma.rn.f32 f385, f339, 0fBF167918, f384; +add.f32 f386, f281, f325; +add.f32 f387, f285, f329; +sub.f32 f388, f281, f325; +sub.f32 f389, f285, f329; +add.f32 f390, f294, f368; +add.f32 f391, f312, f370; +sub.f32 f392, f294, f368; +sub.f32 f393, f312, f370; +add.f32 f394, f302, f373; +add.f32 f395, f320, f375; +sub.f32 f396, f302, f373; +sub.f32 f397, f320, f375; +add.f32 f398, f303, f378; +add.f32 f399, f321, f380; +sub.f32 f400, f303, f378; +sub.f32 f401, f321, f380; +add.f32 f402, f295, f383; 
+add.f32 f403, f313, f385; +sub.f32 f404, f295, f383; +sub.f32 f405, f313, f385; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f406, f407}, [rd11]; +mul.f32 f410, f406, f390; +mul.f32 f411, f407, f391; +sub.f32 f412, f410, f411; +mul.f32 f413, f406, f391; +fma.rn.f32 f414, f407, f390, f413; +mul.f32 f415, f406, f406; +mul.f32 f416, f407, f407; +sub.f32 f417, f415, f416; +mul.f32 f418, f407, f406; +fma.rn.f32 f419, f407, f406, f418; +mul.f32 f420, f417, f394; +mul.f32 f421, f419, f395; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f395; +fma.rn.f32 f424, f419, f394, f423; +mul.f32 f425, f406, f417; +mul.f32 f426, f407, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f406, f419; +fma.rn.f32 f429, f407, f417, f428; +mul.f32 f430, f427, f398; +mul.f32 f431, f429, f399; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f399; +fma.rn.f32 f434, f429, f398, f433; +mul.f32 f435, f406, f427; +mul.f32 f436, f407, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f406, f429; +fma.rn.f32 f439, f407, f427, f438; +mul.f32 f440, f437, f402; +mul.f32 f441, f439, f403; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f403; +fma.rn.f32 f444, f439, f402, f443; +mul.f32 f445, f406, f437; +mul.f32 f446, f407, f439; +sub.f32 f447, f445, f446; +mul.f32 f448, f406, f439; +fma.rn.f32 f449, f407, f437, f448; +mul.f32 f450, f447, f388; +mul.f32 f451, f449, f389; +sub.f32 f452, f450, f451; +mul.f32 f453, f447, f389; +fma.rn.f32 f454, f449, f388, f453; +mul.f32 f455, f406, f447; +mul.f32 f456, f407, f449; +sub.f32 f457, f455, f456; +mul.f32 f458, f406, f449; +fma.rn.f32 f459, f407, f447, f458; +mul.f32 f460, f457, f392; +mul.f32 f461, f459, f393; +sub.f32 f462, f460, f461; +mul.f32 f463, f457, f393; +fma.rn.f32 f464, f459, f392, f463; +mul.f32 f465, f406, f457; +mul.f32 f466, f407, f459; +sub.f32 f467, f465, f466; +mul.f32 
f468, f406, f459; +fma.rn.f32 f469, f407, f457, f468; +mul.f32 f470, f467, f396; +mul.f32 f471, f469, f397; +sub.f32 f472, f470, f471; +mul.f32 f473, f467, f397; +fma.rn.f32 f474, f469, f396, f473; +mul.f32 f475, f406, f467; +mul.f32 f476, f407, f469; +sub.f32 f477, f475, f476; +mul.f32 f478, f406, f469; +fma.rn.f32 f479, f407, f467, f478; +mul.f32 f480, f477, f400; +mul.f32 f481, f479, f401; +sub.f32 f482, f480, f481; +mul.f32 f483, f477, f401; +fma.rn.f32 f484, f479, f400, f483; +mul.f32 f485, f406, f477; +mul.f32 f486, f407, f479; +sub.f32 f487, f485, f486; +mul.f32 f488, f406, f479; +fma.rn.f32 f489, f407, f477, f488; +mul.f32 f490, f487, f404; +mul.f32 f491, f489, f405; +sub.f32 f492, f490, f491; +mul.f32 f493, f487, f405; +fma.rn.f32 f494, f489, f404, f493; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 400, r15; +st.shared.f32 [r16], f386; +st.shared.f32 [r16+40], f412; +st.shared.f32 [r16+80], f422; +st.shared.f32 [r16+120], f432; +st.shared.f32 [r16+160], f442; +st.shared.f32 [r16+200], f452; +st.shared.f32 [r16+240], f462; +st.shared.f32 [r16+280], f472; +st.shared.f32 [r16+320], f482; +st.shared.f32 [r16+360], f492; +barrier.sync 0; +ld.shared.f32 f495, [r10]; +ld.shared.f32 f496, [r10+400]; +ld.shared.f32 f497, [r10+800]; +ld.shared.f32 f498, [r10+1200]; +ld.shared.f32 f499, [r10+1600]; +ld.shared.f32 f500, [r10+2000]; +ld.shared.f32 f501, [r10+2400]; +ld.shared.f32 f502, [r10+2800]; +ld.shared.f32 f503, [r10+3200]; +ld.shared.f32 f504, [r10+3600]; +barrier.sync 0; +st.shared.f32 [r16], f387; +st.shared.f32 [r16+40], f414; +st.shared.f32 [r16+80], f424; +st.shared.f32 [r16+120], f434; +st.shared.f32 [r16+160], f444; +st.shared.f32 [r16+200], f454; +st.shared.f32 [r16+240], f464; +st.shared.f32 [r16+280], f474; +st.shared.f32 [r16+320], f484; +st.shared.f32 [r16+360], f494; +barrier.sync 0; +ld.shared.f32 f505, [r10]; +ld.shared.f32 f506, [r10+400]; +ld.shared.f32 f507, [r10+800]; +ld.shared.f32 f508, [r10+1200]; 
+ld.shared.f32 f509, [r10+1600]; +ld.shared.f32 f510, [r10+2000]; +ld.shared.f32 f511, [r10+2400]; +ld.shared.f32 f512, [r10+2800]; +ld.shared.f32 f513, [r10+3200]; +ld.shared.f32 f514, [r10+3600]; +add.f32 f515, f497, f503; +add.f32 f516, f495, f515; +add.f32 f517, f499, f501; +add.f32 f518, f517, f516; +add.f32 f519, f507, f513; +add.f32 f520, f505, f519; +add.f32 f521, f509, f511; +add.f32 f522, f521, f520; +fma.rn.f32 f523, f515, 0f3E9E377A, f495; +mul.f32 f524, f517, 0f3F4F1BBD; +sub.f32 f525, f523, f524; +sub.f32 f526, f507, f513; +mul.f32 f527, f526, 0f3F737871; +sub.f32 f528, f509, f511; +mul.f32 f529, f528, 0fBF167918; +sub.f32 f530, f529, f527; +sub.f32 f531, f525, f530; +add.f32 f532, f530, f525; +mul.f32 f533, f515, 0f3F4F1BBD; +sub.f32 f534, f495, f533; +fma.rn.f32 f535, f517, 0f3E9E377A, f534; +mul.f32 f536, f526, 0f3F167918; +mul.f32 f537, f528, 0f3F737871; +sub.f32 f538, f537, f536; +sub.f32 f539, f535, f538; +add.f32 f540, f538, f535; +fma.rn.f32 f541, f519, 0f3E9E377A, f505; +mul.f32 f542, f521, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f497, f503; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f499, f501; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +add.f32 f549, f548, f543; +sub.f32 f550, f543, f548; +mul.f32 f551, f519, 0f3F4F1BBD; +sub.f32 f552, f505, f551; +fma.rn.f32 f553, f521, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f498, f504; +add.f32 f560, f496, f559; +add.f32 f561, f500, f502; +add.f32 f562, f561, f560; +add.f32 f563, f508, f514; +add.f32 f564, f506, f563; +add.f32 f565, f510, f512; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f496; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f508, f514; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f510, f512; +mul.f32 f573, f572, 0fBF167918; +sub.f32 f574, f573, f571; +sub.f32 f575, 
f569, f574; +add.f32 f576, f574, f569; +mul.f32 f577, f559, 0f3F4F1BBD; +sub.f32 f578, f496, f577; +fma.rn.f32 f579, f561, 0f3E9E377A, f578; +mul.f32 f580, f570, 0f3F167918; +mul.f32 f581, f572, 0f3F737871; +sub.f32 f582, f581, f580; +sub.f32 f583, f579, f582; +add.f32 f584, f582, f579; +fma.rn.f32 f585, f563, 0f3E9E377A, f506; +mul.f32 f586, f565, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f498, f504; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f500, f502; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +add.f32 f593, f592, f587; +sub.f32 f594, f587, f592; +mul.f32 f595, f563, 0f3F4F1BBD; +sub.f32 f596, f506, f595; +fma.rn.f32 f597, f565, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +add.f32 f601, f600, f597; +sub.f32 f602, f597, f600; +mul.f32 f603, f575, 0f3F4F1BBD; +mul.f32 f604, f593, 0fBF167918; +sub.f32 f605, f603, f604; +mul.f32 f606, f593, 0f3F4F1BBD; +fma.rn.f32 f607, f575, 0fBF167918, f606; +mul.f32 f608, f583, 0f3E9E377A; +mul.f32 f609, f601, 0fBF737871; +sub.f32 f610, f608, f609; +mul.f32 f611, f601, 0f3E9E377A; +fma.rn.f32 f612, f583, 0fBF737871, f611; +mul.f32 f613, f584, 0fBE9E377A; +mul.f32 f614, f602, 0fBF737871; +sub.f32 f615, f613, f614; +mul.f32 f616, f602, 0fBE9E377A; +fma.rn.f32 f617, f584, 0fBF737871, f616; +mul.f32 f618, f576, 0fBF4F1BBD; +mul.f32 f619, f594, 0fBF167918; +sub.f32 f620, f618, f619; +mul.f32 f621, f594, 0fBF4F1BBD; +fma.rn.f32 f622, f576, 0fBF167918, f621; +add.f32 %0, f518, f562; +add.f32 %1, f522, f566; +add.f32 %3, f549, f607; +add.f32 %2, f531, f605; +add.f32 %5, f557, f612; +add.f32 %4, f539, f610; +add.f32 %7, f558, f617; +add.f32 %6, f540, f615; +add.f32 %9, f550, f622; +add.f32 %8, f532, f620; +sub.f32 %10, f518, f562; +sub.f32 %11, f522, f566; +sub.f32 %13, f549, f607; +sub.f32 %12, f531, f605; +sub.f32 %15, f557, f612; +sub.f32 %14, f539, f610; +sub.f32 %17, f558, f617; +sub.f32 %16, f540, f615; +sub.f32 %19, f550, f622; 
+sub.f32 %18, f532, f620; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..38d0be4d3488f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp32_inv.hpp.inc @@ -0,0 +1,1292 @@ +#ifndef CUFFTDX_FFT_1000_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_1000_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<395, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<671>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 8000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %28, %44; +add.f32 f42, %23, f41; +add.f32 f43, %33, %39; +add.f32 f44, f43, f42; +add.f32 f45, %30, %46; +add.f32 f46, %24, f45; +add.f32 f47, %35, %40; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %23; +mul.f32 f50, f43, 
0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %30, %46; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %35, %40; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %23, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %24; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %28, %44; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %33, %39; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %24, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %31, %47; +add.f32 f84, %25, f83; +add.f32 f85, %36, %41; +add.f32 f86, f85, f84; +add.f32 f87, %32, %48; +add.f32 f88, %27, f87; +add.f32 f89, %38, %43; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %25; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %32, %48; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %38, %43; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %25, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %27; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %31, %47; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %36, %41; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %27, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 
f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +sub.f32 f145, f44, f86; +sub.f32 f146, f48, f90; +add.f32 f147, f56, f127; +add.f32 f148, f73, f129; +sub.f32 f149, f56, f127; +sub.f32 f150, f73, f129; +add.f32 f151, f64, f132; +add.f32 f152, f81, f134; +sub.f32 f153, f64, f132; +sub.f32 f154, f81, f134; +add.f32 f155, f65, f137; +add.f32 f156, f82, f139; +sub.f32 f157, f65, f137; +sub.f32 f158, f82, f139; +add.f32 f159, f57, f142; +add.f32 f160, f74, f144; +sub.f32 f161, f57, f142; +sub.f32 f162, f74, f144; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 8000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f163, f164}, [rd6]; +mul.f32 f167, f148, f164; +mul.f32 f168, f147, f164; +mul.f32 f169, f163, f148; +mul.f32 f170, f163, f163; +mul.f32 f171, f164, f164; +sub.f32 f172, f170, f171; +mul.f32 f173, f164, f163; +fma.rn.f32 f174, f164, f163, f173; +mul.f32 f175, f152, f174; +mul.f32 f176, f151, f174; +mul.f32 f177, f172, f152; +mul.f32 f178, f163, f172; +mul.f32 f179, f164, f174; +sub.f32 f180, f178, f179; +mul.f32 f181, f163, f174; +fma.rn.f32 f182, f164, f172, f181; +mul.f32 
f183, f156, f182; +mul.f32 f184, f155, f182; +mul.f32 f185, f180, f156; +mul.f32 f186, f163, f180; +mul.f32 f187, f164, f182; +sub.f32 f188, f186, f187; +mul.f32 f189, f163, f182; +fma.rn.f32 f190, f164, f180, f189; +mul.f32 f191, f160, f190; +mul.f32 f192, f159, f190; +mul.f32 f193, f188, f160; +mul.f32 f194, f163, f188; +mul.f32 f195, f164, f190; +sub.f32 f196, f194, f195; +mul.f32 f197, f163, f190; +fma.rn.f32 f198, f164, f188, f197; +mul.f32 f199, f146, f198; +mul.f32 f200, f145, f198; +mul.f32 f201, f196, f146; +mul.f32 f202, f163, f196; +mul.f32 f203, f164, f198; +sub.f32 f204, f202, f203; +mul.f32 f205, f163, f198; +fma.rn.f32 f206, f164, f196, f205; +mul.f32 f207, f150, f206; +mul.f32 f208, f149, f206; +mul.f32 f209, f204, f150; +mul.f32 f210, f163, f204; +mul.f32 f211, f164, f206; +sub.f32 f212, f210, f211; +mul.f32 f213, f163, f206; +fma.rn.f32 f214, f164, f204, f213; +mul.f32 f215, f154, f214; +mul.f32 f216, f153, f214; +mul.f32 f217, f212, f154; +mul.f32 f218, f163, f212; +mul.f32 f219, f164, f214; +sub.f32 f220, f218, f219; +mul.f32 f221, f163, f214; +fma.rn.f32 f222, f164, f212, f221; +mul.f32 f223, f158, f222; +mul.f32 f224, f157, f222; +mul.f32 f225, f220, f158; +mul.f32 f226, f163, f220; +mul.f32 f227, f164, f222; +sub.f32 f228, f226, f227; +mul.f32 f229, f163, f222; +fma.rn.f32 f230, f164, f220, f229; +mul.f32 f231, f162, f230; +mul.f32 f232, f161, f230; +mul.f32 f233, f228, f162; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f234, f48, f90; +add.f32 f235, f44, f86; +st.shared.v2.f32 [r9], {f235, f234}; +fma.rn.f32 f236, f163, f147, f167; +sub.f32 f237, f169, f168; +st.shared.v2.f32 [r9+8], {f236, f237}; +fma.rn.f32 f238, f172, f151, f175; +sub.f32 f239, f177, f176; +st.shared.v2.f32 [r9+16], {f238, f239}; +fma.rn.f32 f240, f180, f155, f183; +sub.f32 f241, f185, f184; +st.shared.v2.f32 [r9+24], {f240, f241}; +sub.f32 f242, f193, f192; +fma.rn.f32 f243, f188, f159, f191; +st.shared.v2.f32 [r9+32], {f243, f242}; +fma.rn.f32 f244, f196, f145, 
f199; +sub.f32 f245, f201, f200; +st.shared.v2.f32 [r9+40], {f244, f245}; +fma.rn.f32 f246, f204, f149, f207; +sub.f32 f247, f209, f208; +st.shared.v2.f32 [r9+48], {f246, f247}; +fma.rn.f32 f248, f212, f153, f215; +sub.f32 f249, f217, f216; +st.shared.v2.f32 [r9+56], {f248, f249}; +fma.rn.f32 f250, f220, f157, f223; +sub.f32 f251, f225, f224; +st.shared.v2.f32 [r9+64], {f250, f251}; +fma.rn.f32 f252, f228, f161, f231; +sub.f32 f253, f233, f232; +st.shared.v2.f32 [r9+72], {f252, f253}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f254, f255}, [r10]; +ld.shared.v2.f32 {f258, f259}, [r10+800]; +ld.shared.v2.f32 {f262, f263}, [r10+1600]; +ld.shared.v2.f32 {f266, f267}, [r10+2400]; +ld.shared.v2.f32 {f270, f271}, [r10+3200]; +ld.shared.v2.f32 {f274, f275}, [r10+4000]; +ld.shared.v2.f32 {f278, f279}, [r10+4800]; +ld.shared.v2.f32 {f282, f283}, [r10+5600]; +ld.shared.v2.f32 {f286, f287}, [r10+6400]; +ld.shared.v2.f32 {f290, f291}, [r10+7200]; +add.f32 f294, f262, f286; +add.f32 f295, f254, f294; +add.f32 f296, f270, f278; +add.f32 f297, f296, f295; +add.f32 f298, f263, f287; +add.f32 f299, f255, f298; +add.f32 f300, f271, f279; +add.f32 f301, f300, f299; +fma.rn.f32 f302, f294, 0f3E9E377A, f254; +mul.f32 f303, f296, 0f3F4F1BBD; +sub.f32 f304, f302, f303; +sub.f32 f305, f263, f287; +mul.f32 f306, f305, 0f3F737871; +sub.f32 f307, f271, f279; +fma.rn.f32 f308, f307, 0f3F167918, f306; +sub.f32 f309, f304, f308; +add.f32 f310, f308, f304; +mul.f32 f311, f294, 0f3F4F1BBD; +sub.f32 f312, f254, f311; +fma.rn.f32 f313, f296, 0f3E9E377A, f312; +mul.f32 f314, f305, 0f3F167918; +mul.f32 f315, f307, 0f3F737871; +sub.f32 f316, f314, f315; +sub.f32 f317, f313, f316; +add.f32 f318, f316, f313; +fma.rn.f32 f319, f298, 0f3E9E377A, f255; +mul.f32 f320, f300, 0f3F4F1BBD; +sub.f32 f321, f319, f320; +sub.f32 f322, f262, f286; +mul.f32 f323, f322, 0f3F737871; +sub.f32 f324, f270, f278; +fma.rn.f32 f325, f324, 0f3F167918, f323; +add.f32 f326, f325, f321; +sub.f32 f327, f321, 
f325; +mul.f32 f328, f298, 0f3F4F1BBD; +sub.f32 f329, f255, f328; +fma.rn.f32 f330, f300, 0f3E9E377A, f329; +mul.f32 f331, f322, 0f3F167918; +mul.f32 f332, f324, 0f3F737871; +sub.f32 f333, f331, f332; +add.f32 f334, f333, f330; +sub.f32 f335, f330, f333; +add.f32 f336, f266, f290; +add.f32 f337, f258, f336; +add.f32 f338, f274, f282; +add.f32 f339, f338, f337; +add.f32 f340, f267, f291; +add.f32 f341, f259, f340; +add.f32 f342, f275, f283; +add.f32 f343, f342, f341; +fma.rn.f32 f344, f336, 0f3E9E377A, f258; +mul.f32 f345, f338, 0f3F4F1BBD; +sub.f32 f346, f344, f345; +sub.f32 f347, f267, f291; +mul.f32 f348, f347, 0f3F737871; +sub.f32 f349, f275, f283; +fma.rn.f32 f350, f349, 0f3F167918, f348; +sub.f32 f351, f346, f350; +add.f32 f352, f350, f346; +mul.f32 f353, f336, 0f3F4F1BBD; +sub.f32 f354, f258, f353; +fma.rn.f32 f355, f338, 0f3E9E377A, f354; +mul.f32 f356, f347, 0f3F167918; +mul.f32 f357, f349, 0f3F737871; +sub.f32 f358, f356, f357; +sub.f32 f359, f355, f358; +add.f32 f360, f358, f355; +fma.rn.f32 f361, f340, 0f3E9E377A, f259; +mul.f32 f362, f342, 0f3F4F1BBD; +sub.f32 f363, f361, f362; +sub.f32 f364, f266, f290; +mul.f32 f365, f364, 0f3F737871; +sub.f32 f366, f274, f282; +fma.rn.f32 f367, f366, 0f3F167918, f365; +add.f32 f368, f367, f363; +sub.f32 f369, f363, f367; +mul.f32 f370, f340, 0f3F4F1BBD; +sub.f32 f371, f259, f370; +fma.rn.f32 f372, f342, 0f3E9E377A, f371; +mul.f32 f373, f364, 0f3F167918; +mul.f32 f374, f366, 0f3F737871; +sub.f32 f375, f373, f374; +add.f32 f376, f375, f372; +sub.f32 f377, f372, f375; +mul.f32 f378, f351, 0f3F4F1BBD; +mul.f32 f379, f368, 0f3F167918; +sub.f32 f380, f378, f379; +mul.f32 f381, f368, 0f3F4F1BBD; +fma.rn.f32 f382, f351, 0f3F167918, f381; +mul.f32 f383, f359, 0f3E9E377A; +mul.f32 f384, f376, 0f3F737871; +sub.f32 f385, f383, f384; +mul.f32 f386, f376, 0f3E9E377A; +fma.rn.f32 f387, f359, 0f3F737871, f386; +mul.f32 f388, f360, 0fBE9E377A; +mul.f32 f389, f377, 0f3F737871; +sub.f32 f390, f388, f389; +mul.f32 f391, f377, 
0fBE9E377A; +fma.rn.f32 f392, f360, 0f3F737871, f391; +mul.f32 f393, f352, 0fBF4F1BBD; +mul.f32 f394, f369, 0f3F167918; +sub.f32 f395, f393, f394; +mul.f32 f396, f369, 0fBF4F1BBD; +fma.rn.f32 f397, f352, 0f3F167918, f396; +sub.f32 f398, f297, f339; +sub.f32 f399, f301, f343; +add.f32 f400, f309, f380; +add.f32 f401, f326, f382; +sub.f32 f402, f309, f380; +sub.f32 f403, f326, f382; +add.f32 f404, f317, f385; +add.f32 f405, f334, f387; +sub.f32 f406, f317, f385; +sub.f32 f407, f334, f387; +add.f32 f408, f318, f390; +add.f32 f409, f335, f392; +sub.f32 f410, f318, f390; +sub.f32 f411, f335, f392; +add.f32 f412, f310, f395; +add.f32 f413, f327, f397; +sub.f32 f414, f310, f395; +sub.f32 f415, f327, f397; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f416, f417}, [rd11]; +mul.f32 f420, f401, f417; +mul.f32 f421, f400, f417; +mul.f32 f422, f416, f401; +mul.f32 f423, f416, f416; +mul.f32 f424, f417, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f417, f416; +fma.rn.f32 f427, f417, f416, f426; +mul.f32 f428, f405, f427; +mul.f32 f429, f404, f427; +mul.f32 f430, f425, f405; +mul.f32 f431, f416, f425; +mul.f32 f432, f417, f427; +sub.f32 f433, f431, f432; +mul.f32 f434, f416, f427; +fma.rn.f32 f435, f417, f425, f434; +mul.f32 f436, f409, f435; +mul.f32 f437, f408, f435; +mul.f32 f438, f433, f409; +mul.f32 f439, f416, f433; +mul.f32 f440, f417, f435; +sub.f32 f441, f439, f440; +mul.f32 f442, f416, f435; +fma.rn.f32 f443, f417, f433, f442; +mul.f32 f444, f413, f443; +mul.f32 f445, f412, f443; +mul.f32 f446, f441, f413; +mul.f32 f447, f416, f441; +mul.f32 f448, f417, f443; +sub.f32 f449, f447, f448; +mul.f32 f450, f416, f443; +fma.rn.f32 f451, f417, f441, f450; +mul.f32 f452, f399, f451; +mul.f32 f453, f398, f451; +mul.f32 f454, f449, f399; +mul.f32 f455, f416, f449; +mul.f32 f456, f417, f451; +sub.f32 
f457, f455, f456; +mul.f32 f458, f416, f451; +fma.rn.f32 f459, f417, f449, f458; +mul.f32 f460, f403, f459; +mul.f32 f461, f402, f459; +mul.f32 f462, f457, f403; +mul.f32 f463, f416, f457; +mul.f32 f464, f417, f459; +sub.f32 f465, f463, f464; +mul.f32 f466, f416, f459; +fma.rn.f32 f467, f417, f457, f466; +mul.f32 f468, f407, f467; +mul.f32 f469, f406, f467; +mul.f32 f470, f465, f407; +mul.f32 f471, f416, f465; +mul.f32 f472, f417, f467; +sub.f32 f473, f471, f472; +mul.f32 f474, f416, f467; +fma.rn.f32 f475, f417, f465, f474; +mul.f32 f476, f411, f475; +mul.f32 f477, f410, f475; +mul.f32 f478, f473, f411; +mul.f32 f479, f416, f473; +mul.f32 f480, f417, f475; +sub.f32 f481, f479, f480; +mul.f32 f482, f416, f475; +fma.rn.f32 f483, f417, f473, f482; +mul.f32 f484, f415, f483; +mul.f32 f485, f414, f483; +mul.f32 f486, f481, f415; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +add.f32 f487, f301, f343; +add.f32 f488, f297, f339; +st.shared.v2.f32 [r16], {f488, f487}; +fma.rn.f32 f489, f416, f400, f420; +sub.f32 f490, f422, f421; +st.shared.v2.f32 [r16+80], {f489, f490}; +fma.rn.f32 f491, f425, f404, f428; +sub.f32 f492, f430, f429; +st.shared.v2.f32 [r16+160], {f491, f492}; +fma.rn.f32 f493, f433, f408, f436; +sub.f32 f494, f438, f437; +st.shared.v2.f32 [r16+240], {f493, f494}; +fma.rn.f32 f495, f441, f412, f444; +sub.f32 f496, f446, f445; +st.shared.v2.f32 [r16+320], {f495, f496}; +sub.f32 f497, f454, f453; +fma.rn.f32 f498, f449, f398, f452; +st.shared.v2.f32 [r16+400], {f498, f497}; +sub.f32 f499, f462, f461; +fma.rn.f32 f500, f457, f402, f460; +st.shared.v2.f32 [r16+480], {f500, f499}; +fma.rn.f32 f501, f465, f406, f468; +sub.f32 f502, f470, f469; +st.shared.v2.f32 [r16+560], {f501, f502}; +fma.rn.f32 f503, f473, f410, f476; +sub.f32 f504, f478, f477; +st.shared.v2.f32 [r16+640], {f503, f504}; +fma.rn.f32 f505, f481, f414, f484; +sub.f32 f506, f486, f485; +st.shared.v2.f32 [r16+720], {f505, f506}; +barrier.sync 0; 
+ld.shared.v2.f32 {f507, f508}, [r10]; +ld.shared.v2.f32 {f511, f512}, [r10+800]; +ld.shared.v2.f32 {f515, f516}, [r10+1600]; +ld.shared.v2.f32 {f519, f520}, [r10+2400]; +ld.shared.v2.f32 {f523, f524}, [r10+3200]; +ld.shared.v2.f32 {f527, f528}, [r10+4000]; +ld.shared.v2.f32 {f531, f532}, [r10+4800]; +ld.shared.v2.f32 {f535, f536}, [r10+5600]; +ld.shared.v2.f32 {f539, f540}, [r10+6400]; +ld.shared.v2.f32 {f543, f544}, [r10+7200]; +add.f32 f547, f515, f539; +add.f32 f548, f507, f547; +add.f32 f549, f523, f531; +add.f32 f550, f549, f548; +add.f32 f551, f516, f540; +add.f32 f552, f508, f551; +add.f32 f553, f524, f532; +add.f32 f554, f553, f552; +fma.rn.f32 f555, f547, 0f3E9E377A, f507; +mul.f32 f556, f549, 0f3F4F1BBD; +sub.f32 f557, f555, f556; +sub.f32 f558, f516, f540; +mul.f32 f559, f558, 0f3F737871; +sub.f32 f560, f524, f532; +fma.rn.f32 f561, f560, 0f3F167918, f559; +sub.f32 f562, f557, f561; +add.f32 f563, f561, f557; +mul.f32 f564, f547, 0f3F4F1BBD; +sub.f32 f565, f507, f564; +fma.rn.f32 f566, f549, 0f3E9E377A, f565; +mul.f32 f567, f558, 0f3F167918; +mul.f32 f568, f560, 0f3F737871; +sub.f32 f569, f567, f568; +sub.f32 f570, f566, f569; +add.f32 f571, f569, f566; +fma.rn.f32 f572, f551, 0f3E9E377A, f508; +mul.f32 f573, f553, 0f3F4F1BBD; +sub.f32 f574, f572, f573; +sub.f32 f575, f515, f539; +mul.f32 f576, f575, 0f3F737871; +sub.f32 f577, f523, f531; +fma.rn.f32 f578, f577, 0f3F167918, f576; +add.f32 f579, f578, f574; +sub.f32 f580, f574, f578; +mul.f32 f581, f551, 0f3F4F1BBD; +sub.f32 f582, f508, f581; +fma.rn.f32 f583, f553, 0f3E9E377A, f582; +mul.f32 f584, f575, 0f3F167918; +mul.f32 f585, f577, 0f3F737871; +sub.f32 f586, f584, f585; +add.f32 f587, f586, f583; +sub.f32 f588, f583, f586; +add.f32 f589, f519, f543; +add.f32 f590, f511, f589; +add.f32 f591, f527, f535; +add.f32 f592, f591, f590; +add.f32 f593, f520, f544; +add.f32 f594, f512, f593; +add.f32 f595, f528, f536; +add.f32 f596, f595, f594; +fma.rn.f32 f597, f589, 0f3E9E377A, f511; +mul.f32 f598, f591, 
0f3F4F1BBD; +sub.f32 f599, f597, f598; +sub.f32 f600, f520, f544; +mul.f32 f601, f600, 0f3F737871; +sub.f32 f602, f528, f536; +fma.rn.f32 f603, f602, 0f3F167918, f601; +sub.f32 f604, f599, f603; +add.f32 f605, f603, f599; +mul.f32 f606, f589, 0f3F4F1BBD; +sub.f32 f607, f511, f606; +fma.rn.f32 f608, f591, 0f3E9E377A, f607; +mul.f32 f609, f600, 0f3F167918; +mul.f32 f610, f602, 0f3F737871; +sub.f32 f611, f609, f610; +sub.f32 f612, f608, f611; +add.f32 f613, f611, f608; +fma.rn.f32 f614, f593, 0f3E9E377A, f512; +mul.f32 f615, f595, 0f3F4F1BBD; +sub.f32 f616, f614, f615; +sub.f32 f617, f519, f543; +mul.f32 f618, f617, 0f3F737871; +sub.f32 f619, f527, f535; +fma.rn.f32 f620, f619, 0f3F167918, f618; +add.f32 f621, f620, f616; +sub.f32 f622, f616, f620; +mul.f32 f623, f593, 0f3F4F1BBD; +sub.f32 f624, f512, f623; +fma.rn.f32 f625, f595, 0f3E9E377A, f624; +mul.f32 f626, f617, 0f3F167918; +mul.f32 f627, f619, 0f3F737871; +sub.f32 f628, f626, f627; +add.f32 f629, f628, f625; +sub.f32 f630, f625, f628; +mul.f32 f631, f604, 0f3F4F1BBD; +mul.f32 f632, f621, 0f3F167918; +sub.f32 f633, f631, f632; +mul.f32 f634, f621, 0f3F4F1BBD; +fma.rn.f32 f635, f604, 0f3F167918, f634; +mul.f32 f636, f612, 0f3E9E377A; +mul.f32 f637, f629, 0f3F737871; +sub.f32 f638, f636, f637; +mul.f32 f639, f629, 0f3E9E377A; +fma.rn.f32 f640, f612, 0f3F737871, f639; +mul.f32 f641, f613, 0fBE9E377A; +mul.f32 f642, f630, 0f3F737871; +sub.f32 f643, f641, f642; +mul.f32 f644, f630, 0fBE9E377A; +fma.rn.f32 f645, f613, 0f3F737871, f644; +mul.f32 f646, f605, 0fBF4F1BBD; +mul.f32 f647, f622, 0f3F167918; +sub.f32 f648, f646, f647; +mul.f32 f649, f622, 0fBF4F1BBD; +fma.rn.f32 f650, f605, 0f3F167918, f649; +add.f32 %1, f554, f596; +add.f32 %0, f550, f592; +add.f32 %3, f579, f635; +add.f32 %2, f562, f633; +add.f32 %5, f587, f640; +add.f32 %4, f570, f638; +add.f32 %7, f588, f645; +add.f32 %6, f571, f643; +add.f32 %9, f580, f650; +add.f32 %8, f563, f648; +sub.f32 %11, f554, f596; +sub.f32 %10, f550, f592; +sub.f32 %13, f579, 
f635; +sub.f32 %12, f562, f633; +sub.f32 %15, f587, f640; +sub.f32 %14, f570, f638; +sub.f32 %17, f588, f645; +sub.f32 %16, f571, f643; +sub.f32 %19, f580, f650; +sub.f32 %18, f563, f648; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<396, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<631>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 4000, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %28, %44; +add.f32 f42, %23, f41; +add.f32 f43, %33, %39; +add.f32 f44, f43, f42; +add.f32 f45, %30, %46; +add.f32 f46, %24, f45; +add.f32 f47, %35, %40; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %23; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %30, %46; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %35, %40; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %23, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; 
+fma.rn.f32 f66, f45, 0f3E9E377A, %24; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %28, %44; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %33, %39; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %24, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %31, %47; +add.f32 f84, %25, f83; +add.f32 f85, %36, %41; +add.f32 f86, f85, f84; +add.f32 f87, %32, %48; +add.f32 f88, %27, f87; +add.f32 f89, %38, %43; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %25; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %32, %48; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %38, %43; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %25, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %27; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %31, %47; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %36, %41; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %27, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; 
+fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +add.f32 f145, f44, f86; +add.f32 f146, f48, f90; +sub.f32 f147, f44, f86; +sub.f32 f148, f48, f90; +add.f32 f149, f56, f127; +add.f32 f150, f73, f129; +sub.f32 f151, f56, f127; +sub.f32 f152, f73, f129; +add.f32 f153, f64, f132; +add.f32 f154, f81, f134; +sub.f32 f155, f64, f132; +sub.f32 f156, f81, f134; +add.f32 f157, f65, f137; +add.f32 f158, f82, f139; +sub.f32 f159, f65, f137; +sub.f32 f160, f82, f139; +add.f32 f161, f57, f142; +add.f32 f162, f74, f144; +sub.f32 f163, f57, f142; +sub.f32 f164, f74, f144; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f165, f166}, [rd6]; +mul.f32 f169, f150, f166; +fma.rn.f32 f170, f165, f149, f169; +mul.f32 f171, f149, f166; +mul.f32 f172, f165, f150; +sub.f32 f173, f172, f171; +mul.f32 f174, f165, f165; +mul.f32 f175, f166, f166; +sub.f32 f176, f174, f175; +mul.f32 f177, f166, f165; +fma.rn.f32 f178, f166, f165, f177; +mul.f32 f179, f154, f178; +fma.rn.f32 f180, f176, f153, f179; +mul.f32 f181, f153, f178; +mul.f32 f182, f176, f154; +sub.f32 f183, f182, f181; +mul.f32 f184, f165, f176; +mul.f32 f185, f166, f178; +sub.f32 f186, f184, f185; +mul.f32 f187, f165, f178; +fma.rn.f32 f188, f166, f176, f187; +mul.f32 f189, f158, f188; +fma.rn.f32 f190, f186, f157, f189; +mul.f32 f191, f157, f188; +mul.f32 f192, f186, f158; +sub.f32 f193, f192, f191; +mul.f32 f194, f165, f186; +mul.f32 f195, f166, f188; +sub.f32 f196, f194, f195; +mul.f32 f197, f165, f188; +fma.rn.f32 f198, f166, f186, f197; +mul.f32 
f199, f162, f198; +fma.rn.f32 f200, f196, f161, f199; +mul.f32 f201, f161, f198; +mul.f32 f202, f196, f162; +sub.f32 f203, f202, f201; +mul.f32 f204, f165, f196; +mul.f32 f205, f166, f198; +sub.f32 f206, f204, f205; +mul.f32 f207, f165, f198; +fma.rn.f32 f208, f166, f196, f207; +mul.f32 f209, f148, f208; +fma.rn.f32 f210, f206, f147, f209; +mul.f32 f211, f147, f208; +mul.f32 f212, f206, f148; +sub.f32 f213, f212, f211; +mul.f32 f214, f165, f206; +mul.f32 f215, f166, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f165, f208; +fma.rn.f32 f218, f166, f206, f217; +mul.f32 f219, f152, f218; +fma.rn.f32 f220, f216, f151, f219; +mul.f32 f221, f151, f218; +mul.f32 f222, f216, f152; +sub.f32 f223, f222, f221; +mul.f32 f224, f165, f216; +mul.f32 f225, f166, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f165, f218; +fma.rn.f32 f228, f166, f216, f227; +mul.f32 f229, f156, f228; +fma.rn.f32 f230, f226, f155, f229; +mul.f32 f231, f155, f228; +mul.f32 f232, f226, f156; +sub.f32 f233, f232, f231; +mul.f32 f234, f165, f226; +mul.f32 f235, f166, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f165, f228; +fma.rn.f32 f238, f166, f226, f237; +mul.f32 f239, f160, f238; +fma.rn.f32 f240, f236, f159, f239; +mul.f32 f241, f159, f238; +mul.f32 f242, f236, f160; +sub.f32 f243, f242, f241; +mul.f32 f244, f165, f236; +mul.f32 f245, f166, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f165, f238; +fma.rn.f32 f248, f166, f236, f247; +mul.f32 f249, f164, f248; +fma.rn.f32 f250, f246, f163, f249; +mul.f32 f251, f163, f248; +mul.f32 f252, f246, f164; +sub.f32 f253, f252, f251; +mad.lo.s32 r8, r5, 4000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f145, f170}; +st.shared.v2.f32 [r9+8], {f180, f190}; +st.shared.v2.f32 [r9+16], {f200, f210}; +st.shared.v2.f32 [r9+24], {f220, f230}; +st.shared.v2.f32 [r9+32], {f240, f250}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f254, [r10]; +ld.shared.f32 f255, [r10+400]; +ld.shared.f32 f256, [r10+800]; 
+ld.shared.f32 f257, [r10+1200]; +ld.shared.f32 f258, [r10+1600]; +ld.shared.f32 f259, [r10+2000]; +ld.shared.f32 f260, [r10+2400]; +ld.shared.f32 f261, [r10+2800]; +ld.shared.f32 f262, [r10+3200]; +ld.shared.f32 f263, [r10+3600]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f146, f173}; +st.shared.v2.f32 [r9+8], {f183, f193}; +st.shared.v2.f32 [r9+16], {f203, f213}; +st.shared.v2.f32 [r9+24], {f223, f233}; +st.shared.v2.f32 [r9+32], {f243, f253}; +barrier.sync 0; +ld.shared.f32 f264, [r10]; +ld.shared.f32 f265, [r10+400]; +ld.shared.f32 f266, [r10+800]; +ld.shared.f32 f267, [r10+1200]; +ld.shared.f32 f268, [r10+1600]; +ld.shared.f32 f269, [r10+2000]; +ld.shared.f32 f270, [r10+2400]; +ld.shared.f32 f271, [r10+2800]; +ld.shared.f32 f272, [r10+3200]; +ld.shared.f32 f273, [r10+3600]; +add.f32 f274, f256, f262; +add.f32 f275, f254, f274; +add.f32 f276, f258, f260; +add.f32 f277, f276, f275; +add.f32 f278, f266, f272; +add.f32 f279, f264, f278; +add.f32 f280, f268, f270; +add.f32 f281, f280, f279; +fma.rn.f32 f282, f274, 0f3E9E377A, f254; +mul.f32 f283, f276, 0f3F4F1BBD; +sub.f32 f284, f282, f283; +sub.f32 f285, f266, f272; +mul.f32 f286, f285, 0f3F737871; +sub.f32 f287, f268, f270; +fma.rn.f32 f288, f287, 0f3F167918, f286; +sub.f32 f289, f284, f288; +add.f32 f290, f288, f284; +mul.f32 f291, f274, 0f3F4F1BBD; +sub.f32 f292, f254, f291; +fma.rn.f32 f293, f276, 0f3E9E377A, f292; +mul.f32 f294, f285, 0f3F167918; +mul.f32 f295, f287, 0f3F737871; +sub.f32 f296, f294, f295; +sub.f32 f297, f293, f296; +add.f32 f298, f296, f293; +fma.rn.f32 f299, f278, 0f3E9E377A, f264; +mul.f32 f300, f280, 0f3F4F1BBD; +sub.f32 f301, f299, f300; +sub.f32 f302, f256, f262; +mul.f32 f303, f302, 0f3F737871; +sub.f32 f304, f258, f260; +fma.rn.f32 f305, f304, 0f3F167918, f303; +add.f32 f306, f305, f301; +sub.f32 f307, f301, f305; +mul.f32 f308, f278, 0f3F4F1BBD; +sub.f32 f309, f264, f308; +fma.rn.f32 f310, f280, 0f3E9E377A, f309; +mul.f32 f311, f302, 0f3F167918; +mul.f32 f312, f304, 0f3F737871; 
+sub.f32 f313, f311, f312; +add.f32 f314, f313, f310; +sub.f32 f315, f310, f313; +add.f32 f316, f257, f263; +add.f32 f317, f255, f316; +add.f32 f318, f259, f261; +add.f32 f319, f318, f317; +add.f32 f320, f267, f273; +add.f32 f321, f265, f320; +add.f32 f322, f269, f271; +add.f32 f323, f322, f321; +fma.rn.f32 f324, f316, 0f3E9E377A, f255; +mul.f32 f325, f318, 0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f267, f273; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f269, f271; +fma.rn.f32 f330, f329, 0f3F167918, f328; +sub.f32 f331, f326, f330; +add.f32 f332, f330, f326; +mul.f32 f333, f316, 0f3F4F1BBD; +sub.f32 f334, f255, f333; +fma.rn.f32 f335, f318, 0f3E9E377A, f334; +mul.f32 f336, f327, 0f3F167918; +mul.f32 f337, f329, 0f3F737871; +sub.f32 f338, f336, f337; +sub.f32 f339, f335, f338; +add.f32 f340, f338, f335; +fma.rn.f32 f341, f320, 0f3E9E377A, f265; +mul.f32 f342, f322, 0f3F4F1BBD; +sub.f32 f343, f341, f342; +sub.f32 f344, f257, f263; +mul.f32 f345, f344, 0f3F737871; +sub.f32 f346, f259, f261; +fma.rn.f32 f347, f346, 0f3F167918, f345; +add.f32 f348, f347, f343; +sub.f32 f349, f343, f347; +mul.f32 f350, f320, 0f3F4F1BBD; +sub.f32 f351, f265, f350; +fma.rn.f32 f352, f322, 0f3E9E377A, f351; +mul.f32 f353, f344, 0f3F167918; +mul.f32 f354, f346, 0f3F737871; +sub.f32 f355, f353, f354; +add.f32 f356, f355, f352; +sub.f32 f357, f352, f355; +mul.f32 f358, f331, 0f3F4F1BBD; +mul.f32 f359, f348, 0f3F167918; +sub.f32 f360, f358, f359; +mul.f32 f361, f348, 0f3F4F1BBD; +fma.rn.f32 f362, f331, 0f3F167918, f361; +mul.f32 f363, f339, 0f3E9E377A; +mul.f32 f364, f356, 0f3F737871; +sub.f32 f365, f363, f364; +mul.f32 f366, f356, 0f3E9E377A; +fma.rn.f32 f367, f339, 0f3F737871, f366; +mul.f32 f368, f340, 0fBE9E377A; +mul.f32 f369, f357, 0f3F737871; +sub.f32 f370, f368, f369; +mul.f32 f371, f357, 0fBE9E377A; +fma.rn.f32 f372, f340, 0f3F737871, f371; +mul.f32 f373, f332, 0fBF4F1BBD; +mul.f32 f374, f349, 0f3F167918; +sub.f32 f375, f373, f374; +mul.f32 f376, f349, 0fBF4F1BBD; 
+fma.rn.f32 f377, f332, 0f3F167918, f376; +add.f32 f378, f277, f319; +add.f32 f379, f281, f323; +sub.f32 f380, f277, f319; +sub.f32 f381, f281, f323; +add.f32 f382, f289, f360; +add.f32 f383, f306, f362; +sub.f32 f384, f289, f360; +sub.f32 f385, f306, f362; +add.f32 f386, f297, f365; +add.f32 f387, f314, f367; +sub.f32 f388, f297, f365; +sub.f32 f389, f314, f367; +add.f32 f390, f298, f370; +add.f32 f391, f315, f372; +sub.f32 f392, f298, f370; +sub.f32 f393, f315, f372; +add.f32 f394, f290, f375; +add.f32 f395, f307, f377; +sub.f32 f396, f290, f375; +sub.f32 f397, f307, f377; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f398, f399}, [rd11]; +mul.f32 f402, f383, f399; +fma.rn.f32 f403, f398, f382, f402; +mul.f32 f404, f382, f399; +mul.f32 f405, f398, f383; +sub.f32 f406, f405, f404; +mul.f32 f407, f398, f398; +mul.f32 f408, f399, f399; +sub.f32 f409, f407, f408; +mul.f32 f410, f399, f398; +fma.rn.f32 f411, f399, f398, f410; +mul.f32 f412, f387, f411; +fma.rn.f32 f413, f409, f386, f412; +mul.f32 f414, f386, f411; +mul.f32 f415, f409, f387; +sub.f32 f416, f415, f414; +mul.f32 f417, f398, f409; +mul.f32 f418, f399, f411; +sub.f32 f419, f417, f418; +mul.f32 f420, f398, f411; +fma.rn.f32 f421, f399, f409, f420; +mul.f32 f422, f391, f421; +fma.rn.f32 f423, f419, f390, f422; +mul.f32 f424, f390, f421; +mul.f32 f425, f419, f391; +sub.f32 f426, f425, f424; +mul.f32 f427, f398, f419; +mul.f32 f428, f399, f421; +sub.f32 f429, f427, f428; +mul.f32 f430, f398, f421; +fma.rn.f32 f431, f399, f419, f430; +mul.f32 f432, f395, f431; +fma.rn.f32 f433, f429, f394, f432; +mul.f32 f434, f394, f431; +mul.f32 f435, f429, f395; +sub.f32 f436, f435, f434; +mul.f32 f437, f398, f429; +mul.f32 f438, f399, f431; +sub.f32 f439, f437, f438; +mul.f32 f440, f398, f431; +fma.rn.f32 f441, f399, f429, f440; +mul.f32 f442, f381, 
f441; +fma.rn.f32 f443, f439, f380, f442; +mul.f32 f444, f380, f441; +mul.f32 f445, f439, f381; +sub.f32 f446, f445, f444; +mul.f32 f447, f398, f439; +mul.f32 f448, f399, f441; +sub.f32 f449, f447, f448; +mul.f32 f450, f398, f441; +fma.rn.f32 f451, f399, f439, f450; +mul.f32 f452, f385, f451; +fma.rn.f32 f453, f449, f384, f452; +mul.f32 f454, f384, f451; +mul.f32 f455, f449, f385; +sub.f32 f456, f455, f454; +mul.f32 f457, f398, f449; +mul.f32 f458, f399, f451; +sub.f32 f459, f457, f458; +mul.f32 f460, f398, f451; +fma.rn.f32 f461, f399, f449, f460; +mul.f32 f462, f389, f461; +fma.rn.f32 f463, f459, f388, f462; +mul.f32 f464, f388, f461; +mul.f32 f465, f459, f389; +sub.f32 f466, f465, f464; +mul.f32 f467, f398, f459; +mul.f32 f468, f399, f461; +sub.f32 f469, f467, f468; +mul.f32 f470, f398, f461; +fma.rn.f32 f471, f399, f459, f470; +mul.f32 f472, f393, f471; +fma.rn.f32 f473, f469, f392, f472; +mul.f32 f474, f392, f471; +mul.f32 f475, f469, f393; +sub.f32 f476, f475, f474; +mul.f32 f477, f398, f469; +mul.f32 f478, f399, f471; +sub.f32 f479, f477, f478; +mul.f32 f480, f398, f471; +fma.rn.f32 f481, f399, f469, f480; +mul.f32 f482, f397, f481; +fma.rn.f32 f483, f479, f396, f482; +mul.f32 f484, f396, f481; +mul.f32 f485, f479, f397; +sub.f32 f486, f485, f484; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 400, r15; +st.shared.f32 [r16], f378; +st.shared.f32 [r16+40], f403; +st.shared.f32 [r16+80], f413; +st.shared.f32 [r16+120], f423; +st.shared.f32 [r16+160], f433; +st.shared.f32 [r16+200], f443; +st.shared.f32 [r16+240], f453; +st.shared.f32 [r16+280], f463; +st.shared.f32 [r16+320], f473; +st.shared.f32 [r16+360], f483; +barrier.sync 0; +ld.shared.f32 f487, [r10]; +ld.shared.f32 f488, [r10+400]; +ld.shared.f32 f489, [r10+800]; +ld.shared.f32 f490, [r10+1200]; +ld.shared.f32 f491, [r10+1600]; +ld.shared.f32 f492, [r10+2000]; +ld.shared.f32 f493, [r10+2400]; +ld.shared.f32 f494, [r10+2800]; +ld.shared.f32 f495, [r10+3200]; 
+ld.shared.f32 f496, [r10+3600]; +barrier.sync 0; +st.shared.f32 [r16], f379; +st.shared.f32 [r16+40], f406; +st.shared.f32 [r16+80], f416; +st.shared.f32 [r16+120], f426; +st.shared.f32 [r16+160], f436; +st.shared.f32 [r16+200], f446; +st.shared.f32 [r16+240], f456; +st.shared.f32 [r16+280], f466; +st.shared.f32 [r16+320], f476; +st.shared.f32 [r16+360], f486; +barrier.sync 0; +ld.shared.f32 f497, [r10]; +ld.shared.f32 f498, [r10+400]; +ld.shared.f32 f499, [r10+800]; +ld.shared.f32 f500, [r10+1200]; +ld.shared.f32 f501, [r10+1600]; +ld.shared.f32 f502, [r10+2000]; +ld.shared.f32 f503, [r10+2400]; +ld.shared.f32 f504, [r10+2800]; +ld.shared.f32 f505, [r10+3200]; +ld.shared.f32 f506, [r10+3600]; +add.f32 f507, f489, f495; +add.f32 f508, f487, f507; +add.f32 f509, f491, f493; +add.f32 f510, f509, f508; +add.f32 f511, f499, f505; +add.f32 f512, f497, f511; +add.f32 f513, f501, f503; +add.f32 f514, f513, f512; +fma.rn.f32 f515, f507, 0f3E9E377A, f487; +mul.f32 f516, f509, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f499, f505; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f501, f503; +fma.rn.f32 f521, f520, 0f3F167918, f519; +sub.f32 f522, f517, f521; +add.f32 f523, f521, f517; +mul.f32 f524, f507, 0f3F4F1BBD; +sub.f32 f525, f487, f524; +fma.rn.f32 f526, f509, 0f3E9E377A, f525; +mul.f32 f527, f518, 0f3F167918; +mul.f32 f528, f520, 0f3F737871; +sub.f32 f529, f527, f528; +sub.f32 f530, f526, f529; +add.f32 f531, f529, f526; +fma.rn.f32 f532, f511, 0f3E9E377A, f497; +mul.f32 f533, f513, 0f3F4F1BBD; +sub.f32 f534, f532, f533; +sub.f32 f535, f489, f495; +mul.f32 f536, f535, 0f3F737871; +sub.f32 f537, f491, f493; +fma.rn.f32 f538, f537, 0f3F167918, f536; +add.f32 f539, f538, f534; +sub.f32 f540, f534, f538; +mul.f32 f541, f511, 0f3F4F1BBD; +sub.f32 f542, f497, f541; +fma.rn.f32 f543, f513, 0f3E9E377A, f542; +mul.f32 f544, f535, 0f3F167918; +mul.f32 f545, f537, 0f3F737871; +sub.f32 f546, f544, f545; +add.f32 f547, f546, f543; +sub.f32 f548, f543, f546; +add.f32 
f549, f490, f496; +add.f32 f550, f488, f549; +add.f32 f551, f492, f494; +add.f32 f552, f551, f550; +add.f32 f553, f500, f506; +add.f32 f554, f498, f553; +add.f32 f555, f502, f504; +add.f32 f556, f555, f554; +fma.rn.f32 f557, f549, 0f3E9E377A, f488; +mul.f32 f558, f551, 0f3F4F1BBD; +sub.f32 f559, f557, f558; +sub.f32 f560, f500, f506; +mul.f32 f561, f560, 0f3F737871; +sub.f32 f562, f502, f504; +fma.rn.f32 f563, f562, 0f3F167918, f561; +sub.f32 f564, f559, f563; +add.f32 f565, f563, f559; +mul.f32 f566, f549, 0f3F4F1BBD; +sub.f32 f567, f488, f566; +fma.rn.f32 f568, f551, 0f3E9E377A, f567; +mul.f32 f569, f560, 0f3F167918; +mul.f32 f570, f562, 0f3F737871; +sub.f32 f571, f569, f570; +sub.f32 f572, f568, f571; +add.f32 f573, f571, f568; +fma.rn.f32 f574, f553, 0f3E9E377A, f498; +mul.f32 f575, f555, 0f3F4F1BBD; +sub.f32 f576, f574, f575; +sub.f32 f577, f490, f496; +mul.f32 f578, f577, 0f3F737871; +sub.f32 f579, f492, f494; +fma.rn.f32 f580, f579, 0f3F167918, f578; +add.f32 f581, f580, f576; +sub.f32 f582, f576, f580; +mul.f32 f583, f553, 0f3F4F1BBD; +sub.f32 f584, f498, f583; +fma.rn.f32 f585, f555, 0f3E9E377A, f584; +mul.f32 f586, f577, 0f3F167918; +mul.f32 f587, f579, 0f3F737871; +sub.f32 f588, f586, f587; +add.f32 f589, f588, f585; +sub.f32 f590, f585, f588; +mul.f32 f591, f564, 0f3F4F1BBD; +mul.f32 f592, f581, 0f3F167918; +sub.f32 f593, f591, f592; +mul.f32 f594, f581, 0f3F4F1BBD; +fma.rn.f32 f595, f564, 0f3F167918, f594; +mul.f32 f596, f572, 0f3E9E377A; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +mul.f32 f599, f589, 0f3E9E377A; +fma.rn.f32 f600, f572, 0f3F737871, f599; +mul.f32 f601, f573, 0fBE9E377A; +mul.f32 f602, f590, 0f3F737871; +sub.f32 f603, f601, f602; +mul.f32 f604, f590, 0fBE9E377A; +fma.rn.f32 f605, f573, 0f3F737871, f604; +mul.f32 f606, f565, 0fBF4F1BBD; +mul.f32 f607, f582, 0f3F167918; +sub.f32 f608, f606, f607; +mul.f32 f609, f582, 0fBF4F1BBD; +fma.rn.f32 f610, f565, 0f3F167918, f609; +add.f32 %0, f510, f552; +add.f32 %1, f514, f556; 
+add.f32 %3, f539, f595; +add.f32 %2, f522, f593; +add.f32 %5, f547, f600; +add.f32 %4, f530, f598; +add.f32 %7, f548, f605; +add.f32 %6, f531, f603; +add.f32 %9, f540, f610; +add.f32 %8, f523, f608; +sub.f32 %10, f510, f552; +sub.f32 %11, f514, f556; +sub.f32 %13, f539, f595; +sub.f32 %12, f522, f593; +sub.f32 %15, f547, f600; +sub.f32 %14, f530, f598; +sub.f32 %17, f548, f605; +sub.f32 %16, f531, f603; +sub.f32 %19, f540, f610; +sub.f32 %18, f523, f608; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_1000), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..4e7f76732ea1e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_fwd.hpp.inc @@ -0,0 +1,1300 @@ +#ifndef CUFFTDX_FFT_1000_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_1000_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<568, double, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<681>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 16000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %28, %44; +add.f64 fd42, %23, fd41; +add.f64 fd43, %33, %39; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %30, %46; +add.f64 fd46, %24, fd45; +add.f64 fd47, %35, %40; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %23; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %30, %46; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %35, %40; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %23, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %24; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %28, %44; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %33, %39; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %24, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %31, %47; +add.f64 fd86, %25, fd85; +add.f64 fd87, %36, %41; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %32, %48; +add.f64 fd90, %27, fd89; +add.f64 fd91, %38, %43; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %25; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %32, %48; +mul.f64 fd97, fd96, 
0d3FEE6F0E134454FF; +sub.f64 fd98, %38, %43; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %25, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %27; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %31, %47; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %36, %41; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %27, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +sub.f64 fd149, fd44, fd88; 
+sub.f64 fd150, fd48, fd92; +add.f64 fd151, fd57, fd131; +add.f64 fd152, fd75, fd133; +sub.f64 fd153, fd57, fd131; +sub.f64 fd154, fd75, fd133; +add.f64 fd155, fd65, fd136; +add.f64 fd156, fd83, fd138; +sub.f64 fd157, fd65, fd136; +sub.f64 fd158, fd83, fd138; +add.f64 fd159, fd66, fd141; +add.f64 fd160, fd84, fd143; +sub.f64 fd161, fd66, fd141; +sub.f64 fd162, fd84, fd143; +add.f64 fd163, fd58, fd146; +add.f64 fd164, fd76, fd148; +sub.f64 fd165, fd58, fd146; +sub.f64 fd166, fd76, fd148; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 16000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd167, fd168}, [rd6]; +mul.f64 fd171, fd167, fd151; +mul.f64 fd172, fd168, fd152; +mul.f64 fd173, fd167, fd152; +mul.f64 fd174, fd167, fd167; +mul.f64 fd175, fd168, fd168; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd168, fd167; +fma.rn.f64 fd178, fd168, fd167, fd177; +mul.f64 fd179, fd176, fd155; +mul.f64 fd180, fd178, fd156; +mul.f64 fd181, fd176, fd156; +mul.f64 fd182, fd167, fd176; +mul.f64 fd183, fd168, fd178; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd167, fd178; +fma.rn.f64 fd186, fd168, fd176, fd185; +mul.f64 fd187, fd184, fd159; +mul.f64 fd188, fd186, fd160; +mul.f64 fd189, fd184, fd160; +mul.f64 fd190, fd167, fd184; +mul.f64 fd191, fd168, fd186; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd167, fd186; +fma.rn.f64 fd194, fd168, fd184, fd193; +mul.f64 fd195, fd192, fd163; +mul.f64 fd196, fd194, fd164; +mul.f64 fd197, fd192, fd164; +mul.f64 fd198, fd167, fd192; +mul.f64 fd199, fd168, fd194; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd167, fd194; +fma.rn.f64 fd202, fd168, fd192, fd201; +mul.f64 fd203, fd200, fd149; +mul.f64 fd204, fd202, fd150; +mul.f64 fd205, fd200, fd150; +ld.global.v2.f64 {fd206, fd207}, [rd6+1600]; +mul.f64 fd210, fd206, fd153; +mul.f64 fd211, fd207, fd154; +mul.f64 fd212, fd206, fd154; +mul.f64 
fd213, fd167, fd206; +mul.f64 fd214, fd168, fd207; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd167, fd207; +fma.rn.f64 fd217, fd168, fd206, fd216; +mul.f64 fd218, fd215, fd157; +mul.f64 fd219, fd217, fd158; +mul.f64 fd220, fd215, fd158; +mul.f64 fd221, fd167, fd215; +mul.f64 fd222, fd168, fd217; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd167, fd217; +fma.rn.f64 fd225, fd168, fd215, fd224; +mul.f64 fd226, fd223, fd161; +mul.f64 fd227, fd225, fd162; +mul.f64 fd228, fd223, fd162; +mul.f64 fd229, fd167, fd223; +mul.f64 fd230, fd168, fd225; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd167, fd225; +fma.rn.f64 fd233, fd168, fd223, fd232; +mul.f64 fd234, fd231, fd165; +mul.f64 fd235, fd233, fd166; +mul.f64 fd236, fd231, fd166; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd237, fd48, fd92; +add.f64 fd238, fd44, fd88; +st.shared.v2.f64 [r9], {fd238, fd237}; +fma.rn.f64 fd239, fd168, fd151, fd173; +sub.f64 fd240, fd171, fd172; +st.shared.v2.f64 [r9+16], {fd240, fd239}; +fma.rn.f64 fd241, fd178, fd155, fd181; +sub.f64 fd242, fd179, fd180; +st.shared.v2.f64 [r9+32], {fd242, fd241}; +fma.rn.f64 fd243, fd186, fd159, fd189; +sub.f64 fd244, fd187, fd188; +st.shared.v2.f64 [r9+48], {fd244, fd243}; +sub.f64 fd245, fd195, fd196; +fma.rn.f64 fd246, fd194, fd163, fd197; +st.shared.v2.f64 [r9+64], {fd245, fd246}; +fma.rn.f64 fd247, fd202, fd149, fd205; +sub.f64 fd248, fd203, fd204; +st.shared.v2.f64 [r9+80], {fd248, fd247}; +fma.rn.f64 fd249, fd207, fd153, fd212; +sub.f64 fd250, fd210, fd211; +st.shared.v2.f64 [r9+96], {fd250, fd249}; +fma.rn.f64 fd251, fd217, fd157, fd220; +sub.f64 fd252, fd218, fd219; +st.shared.v2.f64 [r9+112], {fd252, fd251}; +fma.rn.f64 fd253, fd225, fd161, fd228; +sub.f64 fd254, fd226, fd227; +st.shared.v2.f64 [r9+128], {fd254, fd253}; +sub.f64 fd255, fd234, fd235; +fma.rn.f64 fd256, fd233, fd165, fd236; +st.shared.v2.f64 [r9+144], {fd255, fd256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd257, fd258}, [r10]; 
+ld.shared.v2.f64 {fd261, fd262}, [r10+1600]; +ld.shared.v2.f64 {fd265, fd266}, [r10+3200]; +ld.shared.v2.f64 {fd269, fd270}, [r10+4800]; +ld.shared.v2.f64 {fd273, fd274}, [r10+6400]; +ld.shared.v2.f64 {fd277, fd278}, [r10+8000]; +ld.shared.v2.f64 {fd281, fd282}, [r10+9600]; +ld.shared.v2.f64 {fd285, fd286}, [r10+11200]; +ld.shared.v2.f64 {fd289, fd290}, [r10+12800]; +ld.shared.v2.f64 {fd293, fd294}, [r10+14400]; +add.f64 fd297, fd265, fd289; +add.f64 fd298, fd257, fd297; +add.f64 fd299, fd273, fd281; +add.f64 fd300, fd299, fd298; +add.f64 fd301, fd266, fd290; +add.f64 fd302, fd258, fd301; +add.f64 fd303, fd274, fd282; +add.f64 fd304, fd303, fd302; +fma.rn.f64 fd305, fd297, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd306, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd307, fd305, fd306; +sub.f64 fd308, fd266, fd290; +mul.f64 fd309, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd310, fd274, fd282; +mul.f64 fd311, fd310, 0dBFE2CF2304755A5E; +sub.f64 fd312, fd311, fd309; +sub.f64 fd313, fd307, fd312; +add.f64 fd314, fd312, fd307; +mul.f64 fd315, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd316, fd257, fd315; +fma.rn.f64 fd317, fd299, 0d3FD3C6EF372FE950, fd316; +mul.f64 fd318, fd308, 0d3FE2CF2304755A5E; +mul.f64 fd319, fd310, 0d3FEE6F0E134454FF; +sub.f64 fd320, fd319, fd318; +sub.f64 fd321, fd317, fd320; +add.f64 fd322, fd320, fd317; +fma.rn.f64 fd323, fd301, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd324, fd303, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd265, fd289; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd273, fd281; +mul.f64 fd329, fd328, 0dBFE2CF2304755A5E; +sub.f64 fd330, fd329, fd327; +add.f64 fd331, fd330, fd325; +sub.f64 fd332, fd325, fd330; +mul.f64 fd333, fd301, 0d3FE9E3779B97F4A8; +sub.f64 fd334, fd258, fd333; +fma.rn.f64 fd335, fd303, 0d3FD3C6EF372FE950, fd334; +mul.f64 fd336, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd337, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd337, fd336; +add.f64 fd339, fd338, fd335; +sub.f64 fd340, fd335, fd338; +add.f64 
fd341, fd269, fd293; +add.f64 fd342, fd261, fd341; +add.f64 fd343, fd277, fd285; +add.f64 fd344, fd343, fd342; +add.f64 fd345, fd270, fd294; +add.f64 fd346, fd262, fd345; +add.f64 fd347, fd278, fd286; +add.f64 fd348, fd347, fd346; +fma.rn.f64 fd349, fd341, 0d3FD3C6EF372FE950, fd261; +mul.f64 fd350, fd343, 0d3FE9E3779B97F4A8; +sub.f64 fd351, fd349, fd350; +sub.f64 fd352, fd270, fd294; +mul.f64 fd353, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd278, fd286; +mul.f64 fd355, fd354, 0dBFE2CF2304755A5E; +sub.f64 fd356, fd355, fd353; +sub.f64 fd357, fd351, fd356; +add.f64 fd358, fd356, fd351; +mul.f64 fd359, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd360, fd261, fd359; +fma.rn.f64 fd361, fd343, 0d3FD3C6EF372FE950, fd360; +mul.f64 fd362, fd352, 0d3FE2CF2304755A5E; +mul.f64 fd363, fd354, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd363, fd362; +sub.f64 fd365, fd361, fd364; +add.f64 fd366, fd364, fd361; +fma.rn.f64 fd367, fd345, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd368, fd347, 0d3FE9E3779B97F4A8; +sub.f64 fd369, fd367, fd368; +sub.f64 fd370, fd269, fd293; +mul.f64 fd371, fd370, 0d3FEE6F0E134454FF; +sub.f64 fd372, fd277, fd285; +mul.f64 fd373, fd372, 0dBFE2CF2304755A5E; +sub.f64 fd374, fd373, fd371; +add.f64 fd375, fd374, fd369; +sub.f64 fd376, fd369, fd374; +mul.f64 fd377, fd345, 0d3FE9E3779B97F4A8; +sub.f64 fd378, fd262, fd377; +fma.rn.f64 fd379, fd347, 0d3FD3C6EF372FE950, fd378; +mul.f64 fd380, fd370, 0d3FE2CF2304755A5E; +mul.f64 fd381, fd372, 0d3FEE6F0E134454FF; +sub.f64 fd382, fd381, fd380; +add.f64 fd383, fd382, fd379; +sub.f64 fd384, fd379, fd382; +mul.f64 fd385, fd357, 0d3FE9E3779B97F4A8; +mul.f64 fd386, fd375, 0dBFE2CF2304755A5E; +sub.f64 fd387, fd385, fd386; +mul.f64 fd388, fd375, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd389, fd357, 0dBFE2CF2304755A5E, fd388; +mul.f64 fd390, fd365, 0d3FD3C6EF372FE950; +mul.f64 fd391, fd383, 0dBFEE6F0E134454FF; +sub.f64 fd392, fd390, fd391; +mul.f64 fd393, fd383, 0d3FD3C6EF372FE950; +fma.rn.f64 fd394, fd365, 0dBFEE6F0E134454FF, fd393; +mul.f64 fd395, 
fd366, 0dBFD3C6EF372FE950; +mul.f64 fd396, fd384, 0dBFEE6F0E134454FF; +sub.f64 fd397, fd395, fd396; +mul.f64 fd398, fd384, 0dBFD3C6EF372FE950; +fma.rn.f64 fd399, fd366, 0dBFEE6F0E134454FF, fd398; +mul.f64 fd400, fd358, 0dBFE9E3779B97F4A8; +mul.f64 fd401, fd376, 0dBFE2CF2304755A5E; +sub.f64 fd402, fd400, fd401; +mul.f64 fd403, fd376, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd404, fd358, 0dBFE2CF2304755A5E, fd403; +sub.f64 fd405, fd300, fd344; +sub.f64 fd406, fd304, fd348; +add.f64 fd407, fd313, fd387; +add.f64 fd408, fd331, fd389; +sub.f64 fd409, fd313, fd387; +sub.f64 fd410, fd331, fd389; +add.f64 fd411, fd321, fd392; +add.f64 fd412, fd339, fd394; +sub.f64 fd413, fd321, fd392; +sub.f64 fd414, fd339, fd394; +add.f64 fd415, fd322, fd397; +add.f64 fd416, fd340, fd399; +sub.f64 fd417, fd322, fd397; +sub.f64 fd418, fd340, fd399; +add.f64 fd419, fd314, fd402; +add.f64 fd420, fd332, fd404; +sub.f64 fd421, fd314, fd402; +sub.f64 fd422, fd332, fd404; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd423, fd424}, [rd11]; +mul.f64 fd427, fd423, fd407; +mul.f64 fd428, fd424, fd408; +mul.f64 fd429, fd423, fd408; +mul.f64 fd430, fd423, fd423; +mul.f64 fd431, fd424, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd424, fd423; +fma.rn.f64 fd434, fd424, fd423, fd433; +mul.f64 fd435, fd432, fd411; +mul.f64 fd436, fd434, fd412; +mul.f64 fd437, fd432, fd412; +mul.f64 fd438, fd423, fd432; +mul.f64 fd439, fd424, fd434; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd423, fd434; +fma.rn.f64 fd442, fd424, fd432, fd441; +mul.f64 fd443, fd440, fd415; +mul.f64 fd444, fd442, fd416; +mul.f64 fd445, fd440, fd416; +mul.f64 fd446, fd423, fd440; +mul.f64 fd447, fd424, fd442; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd423, fd442; +fma.rn.f64 fd450, fd424, fd440, fd449; +mul.f64 fd451, fd448, fd419; +mul.f64 fd452, fd450, 
fd420; +mul.f64 fd453, fd448, fd420; +mul.f64 fd454, fd423, fd448; +mul.f64 fd455, fd424, fd450; +sub.f64 fd456, fd454, fd455; +mul.f64 fd457, fd423, fd450; +fma.rn.f64 fd458, fd424, fd448, fd457; +mul.f64 fd459, fd456, fd405; +mul.f64 fd460, fd458, fd406; +mul.f64 fd461, fd456, fd406; +ld.global.v2.f64 {fd462, fd463}, [rd11+160]; +mul.f64 fd466, fd462, fd409; +mul.f64 fd467, fd463, fd410; +mul.f64 fd468, fd462, fd410; +mul.f64 fd469, fd423, fd462; +mul.f64 fd470, fd424, fd463; +sub.f64 fd471, fd469, fd470; +mul.f64 fd472, fd423, fd463; +fma.rn.f64 fd473, fd424, fd462, fd472; +mul.f64 fd474, fd471, fd413; +mul.f64 fd475, fd473, fd414; +mul.f64 fd476, fd471, fd414; +mul.f64 fd477, fd423, fd471; +mul.f64 fd478, fd424, fd473; +sub.f64 fd479, fd477, fd478; +mul.f64 fd480, fd423, fd473; +fma.rn.f64 fd481, fd424, fd471, fd480; +mul.f64 fd482, fd479, fd417; +mul.f64 fd483, fd481, fd418; +mul.f64 fd484, fd479, fd418; +mul.f64 fd485, fd423, fd479; +mul.f64 fd486, fd424, fd481; +sub.f64 fd487, fd485, fd486; +mul.f64 fd488, fd423, fd481; +fma.rn.f64 fd489, fd424, fd479, fd488; +mul.f64 fd490, fd487, fd421; +mul.f64 fd491, fd489, fd422; +mul.f64 fd492, fd487, fd422; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1600, r15; +add.f64 fd493, fd304, fd348; +add.f64 fd494, fd300, fd344; +st.shared.v2.f64 [r16], {fd494, fd493}; +fma.rn.f64 fd495, fd424, fd407, fd429; +sub.f64 fd496, fd427, fd428; +st.shared.v2.f64 [r16+160], {fd496, fd495}; +fma.rn.f64 fd497, fd434, fd411, fd437; +sub.f64 fd498, fd435, fd436; +st.shared.v2.f64 [r16+320], {fd498, fd497}; +fma.rn.f64 fd499, fd442, fd415, fd445; +sub.f64 fd500, fd443, fd444; +st.shared.v2.f64 [r16+480], {fd500, fd499}; +fma.rn.f64 fd501, fd450, fd419, fd453; +sub.f64 fd502, fd451, fd452; +st.shared.v2.f64 [r16+640], {fd502, fd501}; +sub.f64 fd503, fd459, fd460; +fma.rn.f64 fd504, fd458, fd405, fd461; +st.shared.v2.f64 [r16+800], {fd503, fd504}; +fma.rn.f64 fd505, fd463, fd409, fd468; +sub.f64 fd506, 
fd466, fd467; +st.shared.v2.f64 [r16+960], {fd506, fd505}; +fma.rn.f64 fd507, fd473, fd413, fd476; +sub.f64 fd508, fd474, fd475; +st.shared.v2.f64 [r16+1120], {fd508, fd507}; +fma.rn.f64 fd509, fd481, fd417, fd484; +sub.f64 fd510, fd482, fd483; +st.shared.v2.f64 [r16+1280], {fd510, fd509}; +fma.rn.f64 fd511, fd489, fd421, fd492; +sub.f64 fd512, fd490, fd491; +st.shared.v2.f64 [r16+1440], {fd512, fd511}; +barrier.sync 0; +ld.shared.v2.f64 {fd513, fd514}, [r10]; +ld.shared.v2.f64 {fd517, fd518}, [r10+1600]; +ld.shared.v2.f64 {fd521, fd522}, [r10+3200]; +ld.shared.v2.f64 {fd525, fd526}, [r10+4800]; +ld.shared.v2.f64 {fd529, fd530}, [r10+6400]; +ld.shared.v2.f64 {fd533, fd534}, [r10+8000]; +ld.shared.v2.f64 {fd537, fd538}, [r10+9600]; +ld.shared.v2.f64 {fd541, fd542}, [r10+11200]; +ld.shared.v2.f64 {fd545, fd546}, [r10+12800]; +ld.shared.v2.f64 {fd549, fd550}, [r10+14400]; +add.f64 fd553, fd521, fd545; +add.f64 fd554, fd513, fd553; +add.f64 fd555, fd529, fd537; +add.f64 fd556, fd555, fd554; +add.f64 fd557, fd522, fd546; +add.f64 fd558, fd514, fd557; +add.f64 fd559, fd530, fd538; +add.f64 fd560, fd559, fd558; +fma.rn.f64 fd561, fd553, 0d3FD3C6EF372FE950, fd513; +mul.f64 fd562, fd555, 0d3FE9E3779B97F4A8; +sub.f64 fd563, fd561, fd562; +sub.f64 fd564, fd522, fd546; +mul.f64 fd565, fd564, 0d3FEE6F0E134454FF; +sub.f64 fd566, fd530, fd538; +mul.f64 fd567, fd566, 0dBFE2CF2304755A5E; +sub.f64 fd568, fd567, fd565; +sub.f64 fd569, fd563, fd568; +add.f64 fd570, fd568, fd563; +mul.f64 fd571, fd553, 0d3FE9E3779B97F4A8; +sub.f64 fd572, fd513, fd571; +fma.rn.f64 fd573, fd555, 0d3FD3C6EF372FE950, fd572; +mul.f64 fd574, fd564, 0d3FE2CF2304755A5E; +mul.f64 fd575, fd566, 0d3FEE6F0E134454FF; +sub.f64 fd576, fd575, fd574; +sub.f64 fd577, fd573, fd576; +add.f64 fd578, fd576, fd573; +fma.rn.f64 fd579, fd557, 0d3FD3C6EF372FE950, fd514; +mul.f64 fd580, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd581, fd579, fd580; +sub.f64 fd582, fd521, fd545; +mul.f64 fd583, fd582, 0d3FEE6F0E134454FF; +sub.f64 
fd584, fd529, fd537; +mul.f64 fd585, fd584, 0dBFE2CF2304755A5E; +sub.f64 fd586, fd585, fd583; +add.f64 fd587, fd586, fd581; +sub.f64 fd588, fd581, fd586; +mul.f64 fd589, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd590, fd514, fd589; +fma.rn.f64 fd591, fd559, 0d3FD3C6EF372FE950, fd590; +mul.f64 fd592, fd582, 0d3FE2CF2304755A5E; +mul.f64 fd593, fd584, 0d3FEE6F0E134454FF; +sub.f64 fd594, fd593, fd592; +add.f64 fd595, fd594, fd591; +sub.f64 fd596, fd591, fd594; +add.f64 fd597, fd525, fd549; +add.f64 fd598, fd517, fd597; +add.f64 fd599, fd533, fd541; +add.f64 fd600, fd599, fd598; +add.f64 fd601, fd526, fd550; +add.f64 fd602, fd518, fd601; +add.f64 fd603, fd534, fd542; +add.f64 fd604, fd603, fd602; +fma.rn.f64 fd605, fd597, 0d3FD3C6EF372FE950, fd517; +mul.f64 fd606, fd599, 0d3FE9E3779B97F4A8; +sub.f64 fd607, fd605, fd606; +sub.f64 fd608, fd526, fd550; +mul.f64 fd609, fd608, 0d3FEE6F0E134454FF; +sub.f64 fd610, fd534, fd542; +mul.f64 fd611, fd610, 0dBFE2CF2304755A5E; +sub.f64 fd612, fd611, fd609; +sub.f64 fd613, fd607, fd612; +add.f64 fd614, fd612, fd607; +mul.f64 fd615, fd597, 0d3FE9E3779B97F4A8; +sub.f64 fd616, fd517, fd615; +fma.rn.f64 fd617, fd599, 0d3FD3C6EF372FE950, fd616; +mul.f64 fd618, fd608, 0d3FE2CF2304755A5E; +mul.f64 fd619, fd610, 0d3FEE6F0E134454FF; +sub.f64 fd620, fd619, fd618; +sub.f64 fd621, fd617, fd620; +add.f64 fd622, fd620, fd617; +fma.rn.f64 fd623, fd601, 0d3FD3C6EF372FE950, fd518; +mul.f64 fd624, fd603, 0d3FE9E3779B97F4A8; +sub.f64 fd625, fd623, fd624; +sub.f64 fd626, fd525, fd549; +mul.f64 fd627, fd626, 0d3FEE6F0E134454FF; +sub.f64 fd628, fd533, fd541; +mul.f64 fd629, fd628, 0dBFE2CF2304755A5E; +sub.f64 fd630, fd629, fd627; +add.f64 fd631, fd630, fd625; +sub.f64 fd632, fd625, fd630; +mul.f64 fd633, fd601, 0d3FE9E3779B97F4A8; +sub.f64 fd634, fd518, fd633; +fma.rn.f64 fd635, fd603, 0d3FD3C6EF372FE950, fd634; +mul.f64 fd636, fd626, 0d3FE2CF2304755A5E; +mul.f64 fd637, fd628, 0d3FEE6F0E134454FF; +sub.f64 fd638, fd637, fd636; +add.f64 fd639, fd638, fd635; 
+sub.f64 fd640, fd635, fd638; +mul.f64 fd641, fd613, 0d3FE9E3779B97F4A8; +mul.f64 fd642, fd631, 0dBFE2CF2304755A5E; +sub.f64 fd643, fd641, fd642; +mul.f64 fd644, fd631, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd645, fd613, 0dBFE2CF2304755A5E, fd644; +mul.f64 fd646, fd621, 0d3FD3C6EF372FE950; +mul.f64 fd647, fd639, 0dBFEE6F0E134454FF; +sub.f64 fd648, fd646, fd647; +mul.f64 fd649, fd639, 0d3FD3C6EF372FE950; +fma.rn.f64 fd650, fd621, 0dBFEE6F0E134454FF, fd649; +mul.f64 fd651, fd622, 0dBFD3C6EF372FE950; +mul.f64 fd652, fd640, 0dBFEE6F0E134454FF; +sub.f64 fd653, fd651, fd652; +mul.f64 fd654, fd640, 0dBFD3C6EF372FE950; +fma.rn.f64 fd655, fd622, 0dBFEE6F0E134454FF, fd654; +mul.f64 fd656, fd614, 0dBFE9E3779B97F4A8; +mul.f64 fd657, fd632, 0dBFE2CF2304755A5E; +sub.f64 fd658, fd656, fd657; +mul.f64 fd659, fd632, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd660, fd614, 0dBFE2CF2304755A5E, fd659; +add.f64 %1, fd560, fd604; +add.f64 %0, fd556, fd600; +add.f64 %3, fd587, fd645; +add.f64 %2, fd569, fd643; +add.f64 %5, fd595, fd650; +add.f64 %4, fd577, fd648; +add.f64 %7, fd596, fd655; +add.f64 %6, fd578, fd653; +add.f64 %9, fd588, fd660; +add.f64 %8, fd570, fd658; +sub.f64 %11, fd560, fd604; +sub.f64 %10, fd556, fd600; +sub.f64 %13, fd587, fd645; +sub.f64 %12, fd569, fd643; +sub.f64 %15, fd595, fd650; +sub.f64 %14, fd577, fd648; +sub.f64 %17, fd596, fd655; +sub.f64 %16, fd578, fd653; +sub.f64 %19, fd588, fd660; +sub.f64 %18, fd570, fd658; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), 
"d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<569, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<641>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 8000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %28, %44; +add.f64 fd42, %23, fd41; +add.f64 fd43, %33, %39; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %30, %46; +add.f64 fd46, %24, fd45; +add.f64 fd47, %35, %40; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %23; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %30, %46; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %35, %40; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %23, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %24; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %28, %44; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %33, %39; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %24, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; 
+add.f64 fd85, %31, %47; +add.f64 fd86, %25, fd85; +add.f64 fd87, %36, %41; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %32, %48; +add.f64 fd90, %27, fd89; +add.f64 fd91, %38, %43; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %25; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %32, %48; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %38, %43; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %25, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %27; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %31, %47; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %36, %41; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %27, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; 
+sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +add.f64 fd149, fd44, fd88; +add.f64 fd150, fd48, fd92; +sub.f64 fd151, fd44, fd88; +sub.f64 fd152, fd48, fd92; +add.f64 fd153, fd57, fd131; +add.f64 fd154, fd75, fd133; +sub.f64 fd155, fd57, fd131; +sub.f64 fd156, fd75, fd133; +add.f64 fd157, fd65, fd136; +add.f64 fd158, fd83, fd138; +sub.f64 fd159, fd65, fd136; +sub.f64 fd160, fd83, fd138; +add.f64 fd161, fd66, fd141; +add.f64 fd162, fd84, fd143; +sub.f64 fd163, fd66, fd141; +sub.f64 fd164, fd84, fd143; +add.f64 fd165, fd58, fd146; +add.f64 fd166, fd76, fd148; +sub.f64 fd167, fd58, fd146; +sub.f64 fd168, fd76, fd148; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd169, fd170}, [rd6]; +mul.f64 fd173, fd169, fd153; +mul.f64 fd174, fd170, fd154; +sub.f64 fd175, fd173, fd174; +mul.f64 fd176, fd169, fd154; +fma.rn.f64 fd177, fd170, fd153, fd176; +mul.f64 fd178, fd169, fd169; +mul.f64 fd179, fd170, fd170; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd170, fd169; +fma.rn.f64 fd182, fd170, fd169, fd181; +mul.f64 fd183, fd180, fd157; +mul.f64 fd184, fd182, fd158; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd180, fd158; +fma.rn.f64 fd187, fd182, fd157, fd186; +mul.f64 fd188, fd169, fd180; +mul.f64 fd189, fd170, fd182; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd169, fd182; +fma.rn.f64 fd192, fd170, fd180, fd191; +mul.f64 fd193, fd190, fd161; +mul.f64 fd194, fd192, fd162; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd190, fd162; +fma.rn.f64 fd197, fd192, fd161, fd196; +mul.f64 fd198, fd169, fd190; +mul.f64 fd199, 
fd170, fd192; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd169, fd192; +fma.rn.f64 fd202, fd170, fd190, fd201; +mul.f64 fd203, fd200, fd165; +mul.f64 fd204, fd202, fd166; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd166; +fma.rn.f64 fd207, fd202, fd165, fd206; +mul.f64 fd208, fd169, fd200; +mul.f64 fd209, fd170, fd202; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd169, fd202; +fma.rn.f64 fd212, fd170, fd200, fd211; +mul.f64 fd213, fd210, fd151; +mul.f64 fd214, fd212, fd152; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd210, fd152; +fma.rn.f64 fd217, fd212, fd151, fd216; +ld.global.v2.f64 {fd218, fd219}, [rd6+1600]; +mul.f64 fd222, fd218, fd155; +mul.f64 fd223, fd219, fd156; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd218, fd156; +fma.rn.f64 fd226, fd219, fd155, fd225; +mul.f64 fd227, fd169, fd218; +mul.f64 fd228, fd170, fd219; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd169, fd219; +fma.rn.f64 fd231, fd170, fd218, fd230; +mul.f64 fd232, fd229, fd159; +mul.f64 fd233, fd231, fd160; +sub.f64 fd234, fd232, fd233; +mul.f64 fd235, fd229, fd160; +fma.rn.f64 fd236, fd231, fd159, fd235; +mul.f64 fd237, fd169, fd229; +mul.f64 fd238, fd170, fd231; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd169, fd231; +fma.rn.f64 fd241, fd170, fd229, fd240; +mul.f64 fd242, fd239, fd163; +mul.f64 fd243, fd241, fd164; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd239, fd164; +fma.rn.f64 fd246, fd241, fd163, fd245; +mul.f64 fd247, fd169, fd239; +mul.f64 fd248, fd170, fd241; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd169, fd241; +fma.rn.f64 fd251, fd170, fd239, fd250; +mul.f64 fd252, fd249, fd167; +mul.f64 fd253, fd251, fd168; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd249, fd168; +fma.rn.f64 fd256, fd251, fd167, fd255; +mad.lo.s32 r8, r5, 8000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +st.shared.v2.f64 [r9], {fd149, fd175}; +st.shared.v2.f64 [r9+16], {fd185, fd195}; +st.shared.v2.f64 [r9+32], {fd205, fd215}; +st.shared.v2.f64 [r9+48], {fd224, 
fd234}; +st.shared.v2.f64 [r9+64], {fd244, fd254}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd257, [r10]; +ld.shared.f64 fd258, [r10+800]; +ld.shared.f64 fd259, [r10+1600]; +ld.shared.f64 fd260, [r10+2400]; +ld.shared.f64 fd261, [r10+3200]; +ld.shared.f64 fd262, [r10+4000]; +ld.shared.f64 fd263, [r10+4800]; +ld.shared.f64 fd264, [r10+5600]; +ld.shared.f64 fd265, [r10+6400]; +ld.shared.f64 fd266, [r10+7200]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd150, fd177}; +st.shared.v2.f64 [r9+16], {fd187, fd197}; +st.shared.v2.f64 [r9+32], {fd207, fd217}; +st.shared.v2.f64 [r9+48], {fd226, fd236}; +st.shared.v2.f64 [r9+64], {fd246, fd256}; +barrier.sync 0; +ld.shared.f64 fd267, [r10]; +ld.shared.f64 fd268, [r10+800]; +ld.shared.f64 fd269, [r10+1600]; +ld.shared.f64 fd270, [r10+2400]; +ld.shared.f64 fd271, [r10+3200]; +ld.shared.f64 fd272, [r10+4000]; +ld.shared.f64 fd273, [r10+4800]; +ld.shared.f64 fd274, [r10+5600]; +ld.shared.f64 fd275, [r10+6400]; +ld.shared.f64 fd276, [r10+7200]; +add.f64 fd277, fd259, fd265; +add.f64 fd278, fd257, fd277; +add.f64 fd279, fd261, fd263; +add.f64 fd280, fd279, fd278; +add.f64 fd281, fd269, fd275; +add.f64 fd282, fd267, fd281; +add.f64 fd283, fd271, fd273; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, fd269, fd275; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, fd271, fd273; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd257, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, fd267; +mul.f64 fd304, fd283, 
0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, fd259, fd265; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd261, fd263; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, fd267, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +add.f64 fd321, fd260, fd266; +add.f64 fd322, fd258, fd321; +add.f64 fd323, fd262, fd264; +add.f64 fd324, fd323, fd322; +add.f64 fd325, fd270, fd276; +add.f64 fd326, fd268, fd325; +add.f64 fd327, fd272, fd274; +add.f64 fd328, fd327, fd326; +fma.rn.f64 fd329, fd321, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd330, fd323, 0d3FE9E3779B97F4A8; +sub.f64 fd331, fd329, fd330; +sub.f64 fd332, fd270, fd276; +mul.f64 fd333, fd332, 0d3FEE6F0E134454FF; +sub.f64 fd334, fd272, fd274; +mul.f64 fd335, fd334, 0dBFE2CF2304755A5E; +sub.f64 fd336, fd335, fd333; +sub.f64 fd337, fd331, fd336; +add.f64 fd338, fd336, fd331; +mul.f64 fd339, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd340, fd258, fd339; +fma.rn.f64 fd341, fd323, 0d3FD3C6EF372FE950, fd340; +mul.f64 fd342, fd332, 0d3FE2CF2304755A5E; +mul.f64 fd343, fd334, 0d3FEE6F0E134454FF; +sub.f64 fd344, fd343, fd342; +sub.f64 fd345, fd341, fd344; +add.f64 fd346, fd344, fd341; +fma.rn.f64 fd347, fd325, 0d3FD3C6EF372FE950, fd268; +mul.f64 fd348, fd327, 0d3FE9E3779B97F4A8; +sub.f64 fd349, fd347, fd348; +sub.f64 fd350, fd260, fd266; +mul.f64 fd351, fd350, 0d3FEE6F0E134454FF; +sub.f64 fd352, fd262, fd264; +mul.f64 fd353, fd352, 0dBFE2CF2304755A5E; +sub.f64 fd354, fd353, fd351; +add.f64 fd355, fd354, fd349; +sub.f64 fd356, fd349, fd354; +mul.f64 fd357, fd325, 0d3FE9E3779B97F4A8; +sub.f64 fd358, fd268, fd357; +fma.rn.f64 fd359, fd327, 0d3FD3C6EF372FE950, fd358; +mul.f64 fd360, 
fd350, 0d3FE2CF2304755A5E; +mul.f64 fd361, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd362, fd361, fd360; +add.f64 fd363, fd362, fd359; +sub.f64 fd364, fd359, fd362; +mul.f64 fd365, fd337, 0d3FE9E3779B97F4A8; +mul.f64 fd366, fd355, 0dBFE2CF2304755A5E; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd355, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd369, fd337, 0dBFE2CF2304755A5E, fd368; +mul.f64 fd370, fd345, 0d3FD3C6EF372FE950; +mul.f64 fd371, fd363, 0dBFEE6F0E134454FF; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd363, 0d3FD3C6EF372FE950; +fma.rn.f64 fd374, fd345, 0dBFEE6F0E134454FF, fd373; +mul.f64 fd375, fd346, 0dBFD3C6EF372FE950; +mul.f64 fd376, fd364, 0dBFEE6F0E134454FF; +sub.f64 fd377, fd375, fd376; +mul.f64 fd378, fd364, 0dBFD3C6EF372FE950; +fma.rn.f64 fd379, fd346, 0dBFEE6F0E134454FF, fd378; +mul.f64 fd380, fd338, 0dBFE9E3779B97F4A8; +mul.f64 fd381, fd356, 0dBFE2CF2304755A5E; +sub.f64 fd382, fd380, fd381; +mul.f64 fd383, fd356, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd384, fd338, 0dBFE2CF2304755A5E, fd383; +add.f64 fd385, fd280, fd324; +add.f64 fd386, fd284, fd328; +sub.f64 fd387, fd280, fd324; +sub.f64 fd388, fd284, fd328; +add.f64 fd389, fd293, fd367; +add.f64 fd390, fd311, fd369; +sub.f64 fd391, fd293, fd367; +sub.f64 fd392, fd311, fd369; +add.f64 fd393, fd301, fd372; +add.f64 fd394, fd319, fd374; +sub.f64 fd395, fd301, fd372; +sub.f64 fd396, fd319, fd374; +add.f64 fd397, fd302, fd377; +add.f64 fd398, fd320, fd379; +sub.f64 fd399, fd302, fd377; +sub.f64 fd400, fd320, fd379; +add.f64 fd401, fd294, fd382; +add.f64 fd402, fd312, fd384; +sub.f64 fd403, fd294, fd382; +sub.f64 fd404, fd312, fd384; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd405, fd406}, [rd11]; +mul.f64 fd409, fd405, fd389; +mul.f64 fd410, fd406, fd390; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd405, fd390; +fma.rn.f64 fd413, fd406, 
fd389, fd412; +mul.f64 fd414, fd405, fd405; +mul.f64 fd415, fd406, fd406; +sub.f64 fd416, fd414, fd415; +mul.f64 fd417, fd406, fd405; +fma.rn.f64 fd418, fd406, fd405, fd417; +mul.f64 fd419, fd416, fd393; +mul.f64 fd420, fd418, fd394; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd416, fd394; +fma.rn.f64 fd423, fd418, fd393, fd422; +mul.f64 fd424, fd405, fd416; +mul.f64 fd425, fd406, fd418; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd405, fd418; +fma.rn.f64 fd428, fd406, fd416, fd427; +mul.f64 fd429, fd426, fd397; +mul.f64 fd430, fd428, fd398; +sub.f64 fd431, fd429, fd430; +mul.f64 fd432, fd426, fd398; +fma.rn.f64 fd433, fd428, fd397, fd432; +mul.f64 fd434, fd405, fd426; +mul.f64 fd435, fd406, fd428; +sub.f64 fd436, fd434, fd435; +mul.f64 fd437, fd405, fd428; +fma.rn.f64 fd438, fd406, fd426, fd437; +mul.f64 fd439, fd436, fd401; +mul.f64 fd440, fd438, fd402; +sub.f64 fd441, fd439, fd440; +mul.f64 fd442, fd436, fd402; +fma.rn.f64 fd443, fd438, fd401, fd442; +mul.f64 fd444, fd405, fd436; +mul.f64 fd445, fd406, fd438; +sub.f64 fd446, fd444, fd445; +mul.f64 fd447, fd405, fd438; +fma.rn.f64 fd448, fd406, fd436, fd447; +mul.f64 fd449, fd446, fd387; +mul.f64 fd450, fd448, fd388; +sub.f64 fd451, fd449, fd450; +mul.f64 fd452, fd446, fd388; +fma.rn.f64 fd453, fd448, fd387, fd452; +ld.global.v2.f64 {fd454, fd455}, [rd11+160]; +mul.f64 fd458, fd454, fd391; +mul.f64 fd459, fd455, fd392; +sub.f64 fd460, fd458, fd459; +mul.f64 fd461, fd454, fd392; +fma.rn.f64 fd462, fd455, fd391, fd461; +mul.f64 fd463, fd405, fd454; +mul.f64 fd464, fd406, fd455; +sub.f64 fd465, fd463, fd464; +mul.f64 fd466, fd405, fd455; +fma.rn.f64 fd467, fd406, fd454, fd466; +mul.f64 fd468, fd465, fd395; +mul.f64 fd469, fd467, fd396; +sub.f64 fd470, fd468, fd469; +mul.f64 fd471, fd465, fd396; +fma.rn.f64 fd472, fd467, fd395, fd471; +mul.f64 fd473, fd405, fd465; +mul.f64 fd474, fd406, fd467; +sub.f64 fd475, fd473, fd474; +mul.f64 fd476, fd405, fd467; +fma.rn.f64 fd477, fd406, fd465, fd476; +mul.f64 fd478, 
fd475, fd399; +mul.f64 fd479, fd477, fd400; +sub.f64 fd480, fd478, fd479; +mul.f64 fd481, fd475, fd400; +fma.rn.f64 fd482, fd477, fd399, fd481; +mul.f64 fd483, fd405, fd475; +mul.f64 fd484, fd406, fd477; +sub.f64 fd485, fd483, fd484; +mul.f64 fd486, fd405, fd477; +fma.rn.f64 fd487, fd406, fd475, fd486; +mul.f64 fd488, fd485, fd403; +mul.f64 fd489, fd487, fd404; +sub.f64 fd490, fd488, fd489; +mul.f64 fd491, fd485, fd404; +fma.rn.f64 fd492, fd487, fd403, fd491; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +st.shared.f64 [r16], fd385; +st.shared.f64 [r16+80], fd411; +st.shared.f64 [r16+160], fd421; +st.shared.f64 [r16+240], fd431; +st.shared.f64 [r16+320], fd441; +st.shared.f64 [r16+400], fd451; +st.shared.f64 [r16+480], fd460; +st.shared.f64 [r16+560], fd470; +st.shared.f64 [r16+640], fd480; +st.shared.f64 [r16+720], fd490; +barrier.sync 0; +ld.shared.f64 fd493, [r10]; +ld.shared.f64 fd494, [r10+800]; +ld.shared.f64 fd495, [r10+1600]; +ld.shared.f64 fd496, [r10+2400]; +ld.shared.f64 fd497, [r10+3200]; +ld.shared.f64 fd498, [r10+4000]; +ld.shared.f64 fd499, [r10+4800]; +ld.shared.f64 fd500, [r10+5600]; +ld.shared.f64 fd501, [r10+6400]; +ld.shared.f64 fd502, [r10+7200]; +barrier.sync 0; +st.shared.f64 [r16], fd386; +st.shared.f64 [r16+80], fd413; +st.shared.f64 [r16+160], fd423; +st.shared.f64 [r16+240], fd433; +st.shared.f64 [r16+320], fd443; +st.shared.f64 [r16+400], fd453; +st.shared.f64 [r16+480], fd462; +st.shared.f64 [r16+560], fd472; +st.shared.f64 [r16+640], fd482; +st.shared.f64 [r16+720], fd492; +barrier.sync 0; +ld.shared.f64 fd503, [r10]; +ld.shared.f64 fd504, [r10+800]; +ld.shared.f64 fd505, [r10+1600]; +ld.shared.f64 fd506, [r10+2400]; +ld.shared.f64 fd507, [r10+3200]; +ld.shared.f64 fd508, [r10+4000]; +ld.shared.f64 fd509, [r10+4800]; +ld.shared.f64 fd510, [r10+5600]; +ld.shared.f64 fd511, [r10+6400]; +ld.shared.f64 fd512, [r10+7200]; +add.f64 fd513, fd495, fd501; +add.f64 fd514, fd493, fd513; +add.f64 
fd515, fd497, fd499; +add.f64 fd516, fd515, fd514; +add.f64 fd517, fd505, fd511; +add.f64 fd518, fd503, fd517; +add.f64 fd519, fd507, fd509; +add.f64 fd520, fd519, fd518; +fma.rn.f64 fd521, fd513, 0d3FD3C6EF372FE950, fd493; +mul.f64 fd522, fd515, 0d3FE9E3779B97F4A8; +sub.f64 fd523, fd521, fd522; +sub.f64 fd524, fd505, fd511; +mul.f64 fd525, fd524, 0d3FEE6F0E134454FF; +sub.f64 fd526, fd507, fd509; +mul.f64 fd527, fd526, 0dBFE2CF2304755A5E; +sub.f64 fd528, fd527, fd525; +sub.f64 fd529, fd523, fd528; +add.f64 fd530, fd528, fd523; +mul.f64 fd531, fd513, 0d3FE9E3779B97F4A8; +sub.f64 fd532, fd493, fd531; +fma.rn.f64 fd533, fd515, 0d3FD3C6EF372FE950, fd532; +mul.f64 fd534, fd524, 0d3FE2CF2304755A5E; +mul.f64 fd535, fd526, 0d3FEE6F0E134454FF; +sub.f64 fd536, fd535, fd534; +sub.f64 fd537, fd533, fd536; +add.f64 fd538, fd536, fd533; +fma.rn.f64 fd539, fd517, 0d3FD3C6EF372FE950, fd503; +mul.f64 fd540, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd541, fd539, fd540; +sub.f64 fd542, fd495, fd501; +mul.f64 fd543, fd542, 0d3FEE6F0E134454FF; +sub.f64 fd544, fd497, fd499; +mul.f64 fd545, fd544, 0dBFE2CF2304755A5E; +sub.f64 fd546, fd545, fd543; +add.f64 fd547, fd546, fd541; +sub.f64 fd548, fd541, fd546; +mul.f64 fd549, fd517, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd503, fd549; +fma.rn.f64 fd551, fd519, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd542, 0d3FE2CF2304755A5E; +mul.f64 fd553, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd553, fd552; +add.f64 fd555, fd554, fd551; +sub.f64 fd556, fd551, fd554; +add.f64 fd557, fd496, fd502; +add.f64 fd558, fd494, fd557; +add.f64 fd559, fd498, fd500; +add.f64 fd560, fd559, fd558; +add.f64 fd561, fd506, fd512; +add.f64 fd562, fd504, fd561; +add.f64 fd563, fd508, fd510; +add.f64 fd564, fd563, fd562; +fma.rn.f64 fd565, fd557, 0d3FD3C6EF372FE950, fd494; +mul.f64 fd566, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd567, fd565, fd566; +sub.f64 fd568, fd506, fd512; +mul.f64 fd569, fd568, 0d3FEE6F0E134454FF; +sub.f64 fd570, fd508, fd510; +mul.f64 fd571, fd570, 
0dBFE2CF2304755A5E; +sub.f64 fd572, fd571, fd569; +sub.f64 fd573, fd567, fd572; +add.f64 fd574, fd572, fd567; +mul.f64 fd575, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd576, fd494, fd575; +fma.rn.f64 fd577, fd559, 0d3FD3C6EF372FE950, fd576; +mul.f64 fd578, fd568, 0d3FE2CF2304755A5E; +mul.f64 fd579, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd580, fd579, fd578; +sub.f64 fd581, fd577, fd580; +add.f64 fd582, fd580, fd577; +fma.rn.f64 fd583, fd561, 0d3FD3C6EF372FE950, fd504; +mul.f64 fd584, fd563, 0d3FE9E3779B97F4A8; +sub.f64 fd585, fd583, fd584; +sub.f64 fd586, fd496, fd502; +mul.f64 fd587, fd586, 0d3FEE6F0E134454FF; +sub.f64 fd588, fd498, fd500; +mul.f64 fd589, fd588, 0dBFE2CF2304755A5E; +sub.f64 fd590, fd589, fd587; +add.f64 fd591, fd590, fd585; +sub.f64 fd592, fd585, fd590; +mul.f64 fd593, fd561, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd504, fd593; +fma.rn.f64 fd595, fd563, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd586, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd597, fd596; +add.f64 fd599, fd598, fd595; +sub.f64 fd600, fd595, fd598; +mul.f64 fd601, fd573, 0d3FE9E3779B97F4A8; +mul.f64 fd602, fd591, 0dBFE2CF2304755A5E; +sub.f64 fd603, fd601, fd602; +mul.f64 fd604, fd591, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd605, fd573, 0dBFE2CF2304755A5E, fd604; +mul.f64 fd606, fd581, 0d3FD3C6EF372FE950; +mul.f64 fd607, fd599, 0dBFEE6F0E134454FF; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd599, 0d3FD3C6EF372FE950; +fma.rn.f64 fd610, fd581, 0dBFEE6F0E134454FF, fd609; +mul.f64 fd611, fd582, 0dBFD3C6EF372FE950; +mul.f64 fd612, fd600, 0dBFEE6F0E134454FF; +sub.f64 fd613, fd611, fd612; +mul.f64 fd614, fd600, 0dBFD3C6EF372FE950; +fma.rn.f64 fd615, fd582, 0dBFEE6F0E134454FF, fd614; +mul.f64 fd616, fd574, 0dBFE9E3779B97F4A8; +mul.f64 fd617, fd592, 0dBFE2CF2304755A5E; +sub.f64 fd618, fd616, fd617; +mul.f64 fd619, fd592, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd620, fd574, 0dBFE2CF2304755A5E, fd619; +add.f64 %0, fd516, fd560; +add.f64 %1, fd520, fd564; +add.f64 %3, fd547, 
fd605; +add.f64 %2, fd529, fd603; +add.f64 %5, fd555, fd610; +add.f64 %4, fd537, fd608; +add.f64 %7, fd556, fd615; +add.f64 %6, fd538, fd613; +add.f64 %9, fd548, fd620; +add.f64 %8, fd530, fd618; +sub.f64 %10, fd516, fd560; +sub.f64 %11, fd520, fd564; +sub.f64 %13, fd547, fd605; +sub.f64 %12, fd529, fd603; +sub.f64 %15, fd555, fd610; +sub.f64 %14, fd537, fd608; +sub.f64 %17, fd556, fd615; +sub.f64 %16, fd538, fd613; +sub.f64 %19, fd548, fd620; +sub.f64 %18, fd530, fd618; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..13224824441c5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1000_fp64_inv.hpp.inc @@ -0,0 +1,1276 @@ +#ifndef CUFFTDX_FFT_1000_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_1000_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<739, double, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<669>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 16000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %28, %44; +add.f64 fd42, %23, fd41; +add.f64 fd43, %33, %39; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %30, %46; +add.f64 fd46, %24, fd45; +add.f64 fd47, %35, %40; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %23; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %30, %46; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %35, %40; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %23, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %24; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %28, %44; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %33, %39; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %24, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %31, %47; +add.f64 fd84, %25, fd83; +add.f64 fd85, %36, %41; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %32, %48; +add.f64 fd88, %27, fd87; +add.f64 fd89, %38, %43; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %25; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %32, %48; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; 
+sub.f64 fd96, %38, %43; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %25, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %27; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %31, %47; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %36, %41; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %27, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +sub.f64 fd145, fd44, fd86; +sub.f64 fd146, fd48, fd90; +add.f64 fd147, fd56, fd127; +add.f64 fd148, 
fd73, fd129; +sub.f64 fd149, fd56, fd127; +sub.f64 fd150, fd73, fd129; +add.f64 fd151, fd64, fd132; +add.f64 fd152, fd81, fd134; +sub.f64 fd153, fd64, fd132; +sub.f64 fd154, fd81, fd134; +add.f64 fd155, fd65, fd137; +add.f64 fd156, fd82, fd139; +sub.f64 fd157, fd65, fd137; +sub.f64 fd158, fd82, fd139; +add.f64 fd159, fd57, fd142; +add.f64 fd160, fd74, fd144; +sub.f64 fd161, fd57, fd142; +sub.f64 fd162, fd74, fd144; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 16000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd163, fd164}, [rd6]; +mul.f64 fd167, fd148, fd164; +mul.f64 fd168, fd147, fd164; +mul.f64 fd169, fd163, fd148; +mul.f64 fd170, fd163, fd163; +mul.f64 fd171, fd164, fd164; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd164, fd163; +fma.rn.f64 fd174, fd164, fd163, fd173; +mul.f64 fd175, fd152, fd174; +mul.f64 fd176, fd151, fd174; +mul.f64 fd177, fd172, fd152; +mul.f64 fd178, fd163, fd172; +mul.f64 fd179, fd164, fd174; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd163, fd174; +fma.rn.f64 fd182, fd164, fd172, fd181; +mul.f64 fd183, fd156, fd182; +mul.f64 fd184, fd155, fd182; +mul.f64 fd185, fd180, fd156; +mul.f64 fd186, fd163, fd180; +mul.f64 fd187, fd164, fd182; +sub.f64 fd188, fd186, fd187; +mul.f64 fd189, fd163, fd182; +fma.rn.f64 fd190, fd164, fd180, fd189; +mul.f64 fd191, fd160, fd190; +mul.f64 fd192, fd159, fd190; +mul.f64 fd193, fd188, fd160; +mul.f64 fd194, fd163, fd188; +mul.f64 fd195, fd164, fd190; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd163, fd190; +fma.rn.f64 fd198, fd164, fd188, fd197; +mul.f64 fd199, fd146, fd198; +mul.f64 fd200, fd145, fd198; +mul.f64 fd201, fd196, fd146; +ld.global.v2.f64 {fd202, fd203}, [rd6+1600]; +mul.f64 fd206, fd150, fd203; +mul.f64 fd207, fd149, fd203; +mul.f64 fd208, fd202, fd150; +mul.f64 fd209, fd163, fd202; +mul.f64 fd210, fd164, fd203; +sub.f64 fd211, fd209, 
fd210; +mul.f64 fd212, fd163, fd203; +fma.rn.f64 fd213, fd164, fd202, fd212; +mul.f64 fd214, fd154, fd213; +mul.f64 fd215, fd153, fd213; +mul.f64 fd216, fd211, fd154; +mul.f64 fd217, fd163, fd211; +mul.f64 fd218, fd164, fd213; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd163, fd213; +fma.rn.f64 fd221, fd164, fd211, fd220; +mul.f64 fd222, fd158, fd221; +mul.f64 fd223, fd157, fd221; +mul.f64 fd224, fd219, fd158; +mul.f64 fd225, fd163, fd219; +mul.f64 fd226, fd164, fd221; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd163, fd221; +fma.rn.f64 fd229, fd164, fd219, fd228; +mul.f64 fd230, fd162, fd229; +mul.f64 fd231, fd161, fd229; +mul.f64 fd232, fd227, fd162; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd233, fd48, fd90; +add.f64 fd234, fd44, fd86; +st.shared.v2.f64 [r9], {fd234, fd233}; +fma.rn.f64 fd235, fd163, fd147, fd167; +sub.f64 fd236, fd169, fd168; +st.shared.v2.f64 [r9+16], {fd235, fd236}; +fma.rn.f64 fd237, fd172, fd151, fd175; +sub.f64 fd238, fd177, fd176; +st.shared.v2.f64 [r9+32], {fd237, fd238}; +fma.rn.f64 fd239, fd180, fd155, fd183; +sub.f64 fd240, fd185, fd184; +st.shared.v2.f64 [r9+48], {fd239, fd240}; +sub.f64 fd241, fd193, fd192; +fma.rn.f64 fd242, fd188, fd159, fd191; +st.shared.v2.f64 [r9+64], {fd242, fd241}; +fma.rn.f64 fd243, fd196, fd145, fd199; +sub.f64 fd244, fd201, fd200; +st.shared.v2.f64 [r9+80], {fd243, fd244}; +fma.rn.f64 fd245, fd202, fd149, fd206; +sub.f64 fd246, fd208, fd207; +st.shared.v2.f64 [r9+96], {fd245, fd246}; +fma.rn.f64 fd247, fd211, fd153, fd214; +sub.f64 fd248, fd216, fd215; +st.shared.v2.f64 [r9+112], {fd247, fd248}; +fma.rn.f64 fd249, fd219, fd157, fd222; +sub.f64 fd250, fd224, fd223; +st.shared.v2.f64 [r9+128], {fd249, fd250}; +sub.f64 fd251, fd232, fd231; +fma.rn.f64 fd252, fd227, fd161, fd230; +st.shared.v2.f64 [r9+144], {fd252, fd251}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd253, fd254}, [r10]; +ld.shared.v2.f64 {fd257, fd258}, [r10+1600]; +ld.shared.v2.f64 {fd261, fd262}, 
[r10+3200]; +ld.shared.v2.f64 {fd265, fd266}, [r10+4800]; +ld.shared.v2.f64 {fd269, fd270}, [r10+6400]; +ld.shared.v2.f64 {fd273, fd274}, [r10+8000]; +ld.shared.v2.f64 {fd277, fd278}, [r10+9600]; +ld.shared.v2.f64 {fd281, fd282}, [r10+11200]; +ld.shared.v2.f64 {fd285, fd286}, [r10+12800]; +ld.shared.v2.f64 {fd289, fd290}, [r10+14400]; +add.f64 fd293, fd261, fd285; +add.f64 fd294, fd253, fd293; +add.f64 fd295, fd269, fd277; +add.f64 fd296, fd295, fd294; +add.f64 fd297, fd262, fd286; +add.f64 fd298, fd254, fd297; +add.f64 fd299, fd270, fd278; +add.f64 fd300, fd299, fd298; +fma.rn.f64 fd301, fd293, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd302, fd295, 0d3FE9E3779B97F4A8; +sub.f64 fd303, fd301, fd302; +sub.f64 fd304, fd262, fd286; +mul.f64 fd305, fd304, 0d3FEE6F0E134454FF; +sub.f64 fd306, fd270, fd278; +fma.rn.f64 fd307, fd306, 0d3FE2CF2304755A5E, fd305; +sub.f64 fd308, fd303, fd307; +add.f64 fd309, fd307, fd303; +mul.f64 fd310, fd293, 0d3FE9E3779B97F4A8; +sub.f64 fd311, fd253, fd310; +fma.rn.f64 fd312, fd295, 0d3FD3C6EF372FE950, fd311; +mul.f64 fd313, fd304, 0d3FE2CF2304755A5E; +mul.f64 fd314, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd315, fd313, fd314; +sub.f64 fd316, fd312, fd315; +add.f64 fd317, fd315, fd312; +fma.rn.f64 fd318, fd297, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd319, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd320, fd318, fd319; +sub.f64 fd321, fd261, fd285; +mul.f64 fd322, fd321, 0d3FEE6F0E134454FF; +sub.f64 fd323, fd269, fd277; +fma.rn.f64 fd324, fd323, 0d3FE2CF2304755A5E, fd322; +add.f64 fd325, fd324, fd320; +sub.f64 fd326, fd320, fd324; +mul.f64 fd327, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd328, fd254, fd327; +fma.rn.f64 fd329, fd299, 0d3FD3C6EF372FE950, fd328; +mul.f64 fd330, fd321, 0d3FE2CF2304755A5E; +mul.f64 fd331, fd323, 0d3FEE6F0E134454FF; +sub.f64 fd332, fd330, fd331; +add.f64 fd333, fd332, fd329; +sub.f64 fd334, fd329, fd332; +add.f64 fd335, fd265, fd289; +add.f64 fd336, fd257, fd335; +add.f64 fd337, fd273, fd281; +add.f64 fd338, fd337, fd336; +add.f64 
fd339, fd266, fd290; +add.f64 fd340, fd258, fd339; +add.f64 fd341, fd274, fd282; +add.f64 fd342, fd341, fd340; +fma.rn.f64 fd343, fd335, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd344, fd337, 0d3FE9E3779B97F4A8; +sub.f64 fd345, fd343, fd344; +sub.f64 fd346, fd266, fd290; +mul.f64 fd347, fd346, 0d3FEE6F0E134454FF; +sub.f64 fd348, fd274, fd282; +fma.rn.f64 fd349, fd348, 0d3FE2CF2304755A5E, fd347; +sub.f64 fd350, fd345, fd349; +add.f64 fd351, fd349, fd345; +mul.f64 fd352, fd335, 0d3FE9E3779B97F4A8; +sub.f64 fd353, fd257, fd352; +fma.rn.f64 fd354, fd337, 0d3FD3C6EF372FE950, fd353; +mul.f64 fd355, fd346, 0d3FE2CF2304755A5E; +mul.f64 fd356, fd348, 0d3FEE6F0E134454FF; +sub.f64 fd357, fd355, fd356; +sub.f64 fd358, fd354, fd357; +add.f64 fd359, fd357, fd354; +fma.rn.f64 fd360, fd339, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd361, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd362, fd360, fd361; +sub.f64 fd363, fd265, fd289; +mul.f64 fd364, fd363, 0d3FEE6F0E134454FF; +sub.f64 fd365, fd273, fd281; +fma.rn.f64 fd366, fd365, 0d3FE2CF2304755A5E, fd364; +add.f64 fd367, fd366, fd362; +sub.f64 fd368, fd362, fd366; +mul.f64 fd369, fd339, 0d3FE9E3779B97F4A8; +sub.f64 fd370, fd258, fd369; +fma.rn.f64 fd371, fd341, 0d3FD3C6EF372FE950, fd370; +mul.f64 fd372, fd363, 0d3FE2CF2304755A5E; +mul.f64 fd373, fd365, 0d3FEE6F0E134454FF; +sub.f64 fd374, fd372, fd373; +add.f64 fd375, fd374, fd371; +sub.f64 fd376, fd371, fd374; +mul.f64 fd377, fd350, 0d3FE9E3779B97F4A8; +mul.f64 fd378, fd367, 0d3FE2CF2304755A5E; +sub.f64 fd379, fd377, fd378; +mul.f64 fd380, fd367, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd381, fd350, 0d3FE2CF2304755A5E, fd380; +mul.f64 fd382, fd358, 0d3FD3C6EF372FE950; +mul.f64 fd383, fd375, 0d3FEE6F0E134454FF; +sub.f64 fd384, fd382, fd383; +mul.f64 fd385, fd375, 0d3FD3C6EF372FE950; +fma.rn.f64 fd386, fd358, 0d3FEE6F0E134454FF, fd385; +mul.f64 fd387, fd359, 0dBFD3C6EF372FE950; +mul.f64 fd388, fd376, 0d3FEE6F0E134454FF; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd376, 0dBFD3C6EF372FE950; +fma.rn.f64 
fd391, fd359, 0d3FEE6F0E134454FF, fd390; +mul.f64 fd392, fd351, 0dBFE9E3779B97F4A8; +mul.f64 fd393, fd368, 0d3FE2CF2304755A5E; +sub.f64 fd394, fd392, fd393; +mul.f64 fd395, fd368, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd396, fd351, 0d3FE2CF2304755A5E, fd395; +sub.f64 fd397, fd296, fd338; +sub.f64 fd398, fd300, fd342; +add.f64 fd399, fd308, fd379; +add.f64 fd400, fd325, fd381; +sub.f64 fd401, fd308, fd379; +sub.f64 fd402, fd325, fd381; +add.f64 fd403, fd316, fd384; +add.f64 fd404, fd333, fd386; +sub.f64 fd405, fd316, fd384; +sub.f64 fd406, fd333, fd386; +add.f64 fd407, fd317, fd389; +add.f64 fd408, fd334, fd391; +sub.f64 fd409, fd317, fd389; +sub.f64 fd410, fd334, fd391; +add.f64 fd411, fd309, fd394; +add.f64 fd412, fd326, fd396; +sub.f64 fd413, fd309, fd394; +sub.f64 fd414, fd326, fd396; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd415, fd416}, [rd11]; +mul.f64 fd419, fd400, fd416; +mul.f64 fd420, fd399, fd416; +mul.f64 fd421, fd415, fd400; +mul.f64 fd422, fd415, fd415; +mul.f64 fd423, fd416, fd416; +sub.f64 fd424, fd422, fd423; +mul.f64 fd425, fd416, fd415; +fma.rn.f64 fd426, fd416, fd415, fd425; +mul.f64 fd427, fd404, fd426; +mul.f64 fd428, fd403, fd426; +mul.f64 fd429, fd424, fd404; +mul.f64 fd430, fd415, fd424; +mul.f64 fd431, fd416, fd426; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd415, fd426; +fma.rn.f64 fd434, fd416, fd424, fd433; +mul.f64 fd435, fd408, fd434; +mul.f64 fd436, fd407, fd434; +mul.f64 fd437, fd432, fd408; +mul.f64 fd438, fd415, fd432; +mul.f64 fd439, fd416, fd434; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd415, fd434; +fma.rn.f64 fd442, fd416, fd432, fd441; +mul.f64 fd443, fd412, fd442; +mul.f64 fd444, fd411, fd442; +mul.f64 fd445, fd440, fd412; +mul.f64 fd446, fd415, fd440; +mul.f64 fd447, fd416, fd442; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd415, fd442; 
+fma.rn.f64 fd450, fd416, fd440, fd449; +mul.f64 fd451, fd398, fd450; +mul.f64 fd452, fd397, fd450; +mul.f64 fd453, fd448, fd398; +ld.global.v2.f64 {fd454, fd455}, [rd11+160]; +mul.f64 fd458, fd402, fd455; +mul.f64 fd459, fd401, fd455; +mul.f64 fd460, fd454, fd402; +mul.f64 fd461, fd415, fd454; +mul.f64 fd462, fd416, fd455; +sub.f64 fd463, fd461, fd462; +mul.f64 fd464, fd415, fd455; +fma.rn.f64 fd465, fd416, fd454, fd464; +mul.f64 fd466, fd406, fd465; +mul.f64 fd467, fd405, fd465; +mul.f64 fd468, fd463, fd406; +mul.f64 fd469, fd415, fd463; +mul.f64 fd470, fd416, fd465; +sub.f64 fd471, fd469, fd470; +mul.f64 fd472, fd415, fd465; +fma.rn.f64 fd473, fd416, fd463, fd472; +mul.f64 fd474, fd410, fd473; +mul.f64 fd475, fd409, fd473; +mul.f64 fd476, fd471, fd410; +mul.f64 fd477, fd415, fd471; +mul.f64 fd478, fd416, fd473; +sub.f64 fd479, fd477, fd478; +mul.f64 fd480, fd415, fd473; +fma.rn.f64 fd481, fd416, fd471, fd480; +mul.f64 fd482, fd414, fd481; +mul.f64 fd483, fd413, fd481; +mul.f64 fd484, fd479, fd414; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1600, r15; +add.f64 fd485, fd300, fd342; +add.f64 fd486, fd296, fd338; +st.shared.v2.f64 [r16], {fd486, fd485}; +fma.rn.f64 fd487, fd415, fd399, fd419; +sub.f64 fd488, fd421, fd420; +st.shared.v2.f64 [r16+160], {fd487, fd488}; +fma.rn.f64 fd489, fd424, fd403, fd427; +sub.f64 fd490, fd429, fd428; +st.shared.v2.f64 [r16+320], {fd489, fd490}; +fma.rn.f64 fd491, fd432, fd407, fd435; +sub.f64 fd492, fd437, fd436; +st.shared.v2.f64 [r16+480], {fd491, fd492}; +fma.rn.f64 fd493, fd440, fd411, fd443; +sub.f64 fd494, fd445, fd444; +st.shared.v2.f64 [r16+640], {fd493, fd494}; +sub.f64 fd495, fd453, fd452; +fma.rn.f64 fd496, fd448, fd397, fd451; +st.shared.v2.f64 [r16+800], {fd496, fd495}; +fma.rn.f64 fd497, fd454, fd401, fd458; +sub.f64 fd498, fd460, fd459; +st.shared.v2.f64 [r16+960], {fd497, fd498}; +fma.rn.f64 fd499, fd463, fd405, fd466; +sub.f64 fd500, fd468, fd467; +st.shared.v2.f64 
[r16+1120], {fd499, fd500}; +fma.rn.f64 fd501, fd471, fd409, fd474; +sub.f64 fd502, fd476, fd475; +st.shared.v2.f64 [r16+1280], {fd501, fd502}; +fma.rn.f64 fd503, fd479, fd413, fd482; +sub.f64 fd504, fd484, fd483; +st.shared.v2.f64 [r16+1440], {fd503, fd504}; +barrier.sync 0; +ld.shared.v2.f64 {fd505, fd506}, [r10]; +ld.shared.v2.f64 {fd509, fd510}, [r10+1600]; +ld.shared.v2.f64 {fd513, fd514}, [r10+3200]; +ld.shared.v2.f64 {fd517, fd518}, [r10+4800]; +ld.shared.v2.f64 {fd521, fd522}, [r10+6400]; +ld.shared.v2.f64 {fd525, fd526}, [r10+8000]; +ld.shared.v2.f64 {fd529, fd530}, [r10+9600]; +ld.shared.v2.f64 {fd533, fd534}, [r10+11200]; +ld.shared.v2.f64 {fd537, fd538}, [r10+12800]; +ld.shared.v2.f64 {fd541, fd542}, [r10+14400]; +add.f64 fd545, fd513, fd537; +add.f64 fd546, fd505, fd545; +add.f64 fd547, fd521, fd529; +add.f64 fd548, fd547, fd546; +add.f64 fd549, fd514, fd538; +add.f64 fd550, fd506, fd549; +add.f64 fd551, fd522, fd530; +add.f64 fd552, fd551, fd550; +fma.rn.f64 fd553, fd545, 0d3FD3C6EF372FE950, fd505; +mul.f64 fd554, fd547, 0d3FE9E3779B97F4A8; +sub.f64 fd555, fd553, fd554; +sub.f64 fd556, fd514, fd538; +mul.f64 fd557, fd556, 0d3FEE6F0E134454FF; +sub.f64 fd558, fd522, fd530; +fma.rn.f64 fd559, fd558, 0d3FE2CF2304755A5E, fd557; +sub.f64 fd560, fd555, fd559; +add.f64 fd561, fd559, fd555; +mul.f64 fd562, fd545, 0d3FE9E3779B97F4A8; +sub.f64 fd563, fd505, fd562; +fma.rn.f64 fd564, fd547, 0d3FD3C6EF372FE950, fd563; +mul.f64 fd565, fd556, 0d3FE2CF2304755A5E; +mul.f64 fd566, fd558, 0d3FEE6F0E134454FF; +sub.f64 fd567, fd565, fd566; +sub.f64 fd568, fd564, fd567; +add.f64 fd569, fd567, fd564; +fma.rn.f64 fd570, fd549, 0d3FD3C6EF372FE950, fd506; +mul.f64 fd571, fd551, 0d3FE9E3779B97F4A8; +sub.f64 fd572, fd570, fd571; +sub.f64 fd573, fd513, fd537; +mul.f64 fd574, fd573, 0d3FEE6F0E134454FF; +sub.f64 fd575, fd521, fd529; +fma.rn.f64 fd576, fd575, 0d3FE2CF2304755A5E, fd574; +add.f64 fd577, fd576, fd572; +sub.f64 fd578, fd572, fd576; +mul.f64 fd579, fd549, 
0d3FE9E3779B97F4A8; +sub.f64 fd580, fd506, fd579; +fma.rn.f64 fd581, fd551, 0d3FD3C6EF372FE950, fd580; +mul.f64 fd582, fd573, 0d3FE2CF2304755A5E; +mul.f64 fd583, fd575, 0d3FEE6F0E134454FF; +sub.f64 fd584, fd582, fd583; +add.f64 fd585, fd584, fd581; +sub.f64 fd586, fd581, fd584; +add.f64 fd587, fd517, fd541; +add.f64 fd588, fd509, fd587; +add.f64 fd589, fd525, fd533; +add.f64 fd590, fd589, fd588; +add.f64 fd591, fd518, fd542; +add.f64 fd592, fd510, fd591; +add.f64 fd593, fd526, fd534; +add.f64 fd594, fd593, fd592; +fma.rn.f64 fd595, fd587, 0d3FD3C6EF372FE950, fd509; +mul.f64 fd596, fd589, 0d3FE9E3779B97F4A8; +sub.f64 fd597, fd595, fd596; +sub.f64 fd598, fd518, fd542; +mul.f64 fd599, fd598, 0d3FEE6F0E134454FF; +sub.f64 fd600, fd526, fd534; +fma.rn.f64 fd601, fd600, 0d3FE2CF2304755A5E, fd599; +sub.f64 fd602, fd597, fd601; +add.f64 fd603, fd601, fd597; +mul.f64 fd604, fd587, 0d3FE9E3779B97F4A8; +sub.f64 fd605, fd509, fd604; +fma.rn.f64 fd606, fd589, 0d3FD3C6EF372FE950, fd605; +mul.f64 fd607, fd598, 0d3FE2CF2304755A5E; +mul.f64 fd608, fd600, 0d3FEE6F0E134454FF; +sub.f64 fd609, fd607, fd608; +sub.f64 fd610, fd606, fd609; +add.f64 fd611, fd609, fd606; +fma.rn.f64 fd612, fd591, 0d3FD3C6EF372FE950, fd510; +mul.f64 fd613, fd593, 0d3FE9E3779B97F4A8; +sub.f64 fd614, fd612, fd613; +sub.f64 fd615, fd517, fd541; +mul.f64 fd616, fd615, 0d3FEE6F0E134454FF; +sub.f64 fd617, fd525, fd533; +fma.rn.f64 fd618, fd617, 0d3FE2CF2304755A5E, fd616; +add.f64 fd619, fd618, fd614; +sub.f64 fd620, fd614, fd618; +mul.f64 fd621, fd591, 0d3FE9E3779B97F4A8; +sub.f64 fd622, fd510, fd621; +fma.rn.f64 fd623, fd593, 0d3FD3C6EF372FE950, fd622; +mul.f64 fd624, fd615, 0d3FE2CF2304755A5E; +mul.f64 fd625, fd617, 0d3FEE6F0E134454FF; +sub.f64 fd626, fd624, fd625; +add.f64 fd627, fd626, fd623; +sub.f64 fd628, fd623, fd626; +mul.f64 fd629, fd602, 0d3FE9E3779B97F4A8; +mul.f64 fd630, fd619, 0d3FE2CF2304755A5E; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd619, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd633, fd602, 
0d3FE2CF2304755A5E, fd632; +mul.f64 fd634, fd610, 0d3FD3C6EF372FE950; +mul.f64 fd635, fd627, 0d3FEE6F0E134454FF; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd627, 0d3FD3C6EF372FE950; +fma.rn.f64 fd638, fd610, 0d3FEE6F0E134454FF, fd637; +mul.f64 fd639, fd611, 0dBFD3C6EF372FE950; +mul.f64 fd640, fd628, 0d3FEE6F0E134454FF; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd628, 0dBFD3C6EF372FE950; +fma.rn.f64 fd643, fd611, 0d3FEE6F0E134454FF, fd642; +mul.f64 fd644, fd603, 0dBFE9E3779B97F4A8; +mul.f64 fd645, fd620, 0d3FE2CF2304755A5E; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd620, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd648, fd603, 0d3FE2CF2304755A5E, fd647; +add.f64 %1, fd552, fd594; +add.f64 %0, fd548, fd590; +add.f64 %3, fd577, fd633; +add.f64 %2, fd560, fd631; +add.f64 %5, fd585, fd638; +add.f64 %4, fd568, fd636; +add.f64 %7, fd586, fd643; +add.f64 %6, fd569, fd641; +add.f64 %9, fd578, fd648; +add.f64 %8, fd561, fd646; +sub.f64 %11, fd552, fd594; +sub.f64 %10, fd548, fd590; +sub.f64 %13, fd577, fd633; +sub.f64 %12, fd560, fd631; +sub.f64 %15, fd585, fd638; +sub.f64 %14, fd568, fd636; +sub.f64 %17, fd586, fd643; +sub.f64 %16, fd569, fd641; +sub.f64 %19, fd578, fd648; +sub.f64 %18, fd561, fd646; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), 
"d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<740, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<629>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 8000, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %28, %44; +add.f64 fd42, %23, fd41; +add.f64 fd43, %33, %39; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %30, %46; +add.f64 fd46, %24, fd45; +add.f64 fd47, %35, %40; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %23; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %30, %46; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %35, %40; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %23, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %24; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %28, %44; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %33, %39; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %24, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %31, %47; +add.f64 fd84, %25, fd83; +add.f64 fd85, %36, %41; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %32, %48; +add.f64 fd88, %27, fd87; +add.f64 fd89, %38, %43; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %25; 
+mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %32, %48; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %38, %43; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %25, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %27; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %31, %47; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %36, %41; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %27, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, 
fd99, 0d3FE2CF2304755A5E, fd143; +add.f64 fd145, fd44, fd86; +add.f64 fd146, fd48, fd90; +sub.f64 fd147, fd44, fd86; +sub.f64 fd148, fd48, fd90; +add.f64 fd149, fd56, fd127; +add.f64 fd150, fd73, fd129; +sub.f64 fd151, fd56, fd127; +sub.f64 fd152, fd73, fd129; +add.f64 fd153, fd64, fd132; +add.f64 fd154, fd81, fd134; +sub.f64 fd155, fd64, fd132; +sub.f64 fd156, fd81, fd134; +add.f64 fd157, fd65, fd137; +add.f64 fd158, fd82, fd139; +sub.f64 fd159, fd65, fd137; +sub.f64 fd160, fd82, fd139; +add.f64 fd161, fd57, fd142; +add.f64 fd162, fd74, fd144; +sub.f64 fd163, fd57, fd142; +sub.f64 fd164, fd74, fd144; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 100; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd165, fd166}, [rd6]; +mul.f64 fd169, fd150, fd166; +fma.rn.f64 fd170, fd165, fd149, fd169; +mul.f64 fd171, fd149, fd166; +mul.f64 fd172, fd165, fd150; +sub.f64 fd173, fd172, fd171; +mul.f64 fd174, fd165, fd165; +mul.f64 fd175, fd166, fd166; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd166, fd165; +fma.rn.f64 fd178, fd166, fd165, fd177; +mul.f64 fd179, fd154, fd178; +fma.rn.f64 fd180, fd176, fd153, fd179; +mul.f64 fd181, fd153, fd178; +mul.f64 fd182, fd176, fd154; +sub.f64 fd183, fd182, fd181; +mul.f64 fd184, fd165, fd176; +mul.f64 fd185, fd166, fd178; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd165, fd178; +fma.rn.f64 fd188, fd166, fd176, fd187; +mul.f64 fd189, fd158, fd188; +fma.rn.f64 fd190, fd186, fd157, fd189; +mul.f64 fd191, fd157, fd188; +mul.f64 fd192, fd186, fd158; +sub.f64 fd193, fd192, fd191; +mul.f64 fd194, fd165, fd186; +mul.f64 fd195, fd166, fd188; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd165, fd188; +fma.rn.f64 fd198, fd166, fd186, fd197; +mul.f64 fd199, fd162, fd198; +fma.rn.f64 fd200, fd196, fd161, fd199; +mul.f64 fd201, fd161, fd198; +mul.f64 fd202, fd196, fd162; +sub.f64 fd203, fd202, fd201; +mul.f64 fd204, fd165, fd196; 
+mul.f64 fd205, fd166, fd198; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd165, fd198; +fma.rn.f64 fd208, fd166, fd196, fd207; +mul.f64 fd209, fd148, fd208; +fma.rn.f64 fd210, fd206, fd147, fd209; +mul.f64 fd211, fd147, fd208; +mul.f64 fd212, fd206, fd148; +sub.f64 fd213, fd212, fd211; +ld.global.v2.f64 {fd214, fd215}, [rd6+1600]; +mul.f64 fd218, fd152, fd215; +fma.rn.f64 fd219, fd214, fd151, fd218; +mul.f64 fd220, fd151, fd215; +mul.f64 fd221, fd214, fd152; +sub.f64 fd222, fd221, fd220; +mul.f64 fd223, fd165, fd214; +mul.f64 fd224, fd166, fd215; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd165, fd215; +fma.rn.f64 fd227, fd166, fd214, fd226; +mul.f64 fd228, fd156, fd227; +fma.rn.f64 fd229, fd225, fd155, fd228; +mul.f64 fd230, fd155, fd227; +mul.f64 fd231, fd225, fd156; +sub.f64 fd232, fd231, fd230; +mul.f64 fd233, fd165, fd225; +mul.f64 fd234, fd166, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd165, fd227; +fma.rn.f64 fd237, fd166, fd225, fd236; +mul.f64 fd238, fd160, fd237; +fma.rn.f64 fd239, fd235, fd159, fd238; +mul.f64 fd240, fd159, fd237; +mul.f64 fd241, fd235, fd160; +sub.f64 fd242, fd241, fd240; +mul.f64 fd243, fd165, fd235; +mul.f64 fd244, fd166, fd237; +sub.f64 fd245, fd243, fd244; +mul.f64 fd246, fd165, fd237; +fma.rn.f64 fd247, fd166, fd235, fd246; +mul.f64 fd248, fd164, fd247; +fma.rn.f64 fd249, fd245, fd163, fd248; +mul.f64 fd250, fd163, fd247; +mul.f64 fd251, fd245, fd164; +sub.f64 fd252, fd251, fd250; +mad.lo.s32 r8, r5, 8000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +st.shared.v2.f64 [r9], {fd145, fd170}; +st.shared.v2.f64 [r9+16], {fd180, fd190}; +st.shared.v2.f64 [r9+32], {fd200, fd210}; +st.shared.v2.f64 [r9+48], {fd219, fd229}; +st.shared.v2.f64 [r9+64], {fd239, fd249}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd253, [r10]; +ld.shared.f64 fd254, [r10+800]; +ld.shared.f64 fd255, [r10+1600]; +ld.shared.f64 fd256, [r10+2400]; +ld.shared.f64 fd257, [r10+3200]; +ld.shared.f64 fd258, [r10+4000]; 
+ld.shared.f64 fd259, [r10+4800]; +ld.shared.f64 fd260, [r10+5600]; +ld.shared.f64 fd261, [r10+6400]; +ld.shared.f64 fd262, [r10+7200]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd146, fd173}; +st.shared.v2.f64 [r9+16], {fd183, fd193}; +st.shared.v2.f64 [r9+32], {fd203, fd213}; +st.shared.v2.f64 [r9+48], {fd222, fd232}; +st.shared.v2.f64 [r9+64], {fd242, fd252}; +barrier.sync 0; +ld.shared.f64 fd263, [r10]; +ld.shared.f64 fd264, [r10+800]; +ld.shared.f64 fd265, [r10+1600]; +ld.shared.f64 fd266, [r10+2400]; +ld.shared.f64 fd267, [r10+3200]; +ld.shared.f64 fd268, [r10+4000]; +ld.shared.f64 fd269, [r10+4800]; +ld.shared.f64 fd270, [r10+5600]; +ld.shared.f64 fd271, [r10+6400]; +ld.shared.f64 fd272, [r10+7200]; +add.f64 fd273, fd255, fd261; +add.f64 fd274, fd253, fd273; +add.f64 fd275, fd257, fd259; +add.f64 fd276, fd275, fd274; +add.f64 fd277, fd265, fd271; +add.f64 fd278, fd263, fd277; +add.f64 fd279, fd267, fd269; +add.f64 fd280, fd279, fd278; +fma.rn.f64 fd281, fd273, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd282, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd283, fd281, fd282; +sub.f64 fd284, fd265, fd271; +mul.f64 fd285, fd284, 0d3FEE6F0E134454FF; +sub.f64 fd286, fd267, fd269; +fma.rn.f64 fd287, fd286, 0d3FE2CF2304755A5E, fd285; +sub.f64 fd288, fd283, fd287; +add.f64 fd289, fd287, fd283; +mul.f64 fd290, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd291, fd253, fd290; +fma.rn.f64 fd292, fd275, 0d3FD3C6EF372FE950, fd291; +mul.f64 fd293, fd284, 0d3FE2CF2304755A5E; +mul.f64 fd294, fd286, 0d3FEE6F0E134454FF; +sub.f64 fd295, fd293, fd294; +sub.f64 fd296, fd292, fd295; +add.f64 fd297, fd295, fd292; +fma.rn.f64 fd298, fd277, 0d3FD3C6EF372FE950, fd263; +mul.f64 fd299, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd300, fd298, fd299; +sub.f64 fd301, fd255, fd261; +mul.f64 fd302, fd301, 0d3FEE6F0E134454FF; +sub.f64 fd303, fd257, fd259; +fma.rn.f64 fd304, fd303, 0d3FE2CF2304755A5E, fd302; +add.f64 fd305, fd304, fd300; +sub.f64 fd306, fd300, fd304; +mul.f64 fd307, fd277, 0d3FE9E3779B97F4A8; +sub.f64 
fd308, fd263, fd307; +fma.rn.f64 fd309, fd279, 0d3FD3C6EF372FE950, fd308; +mul.f64 fd310, fd301, 0d3FE2CF2304755A5E; +mul.f64 fd311, fd303, 0d3FEE6F0E134454FF; +sub.f64 fd312, fd310, fd311; +add.f64 fd313, fd312, fd309; +sub.f64 fd314, fd309, fd312; +add.f64 fd315, fd256, fd262; +add.f64 fd316, fd254, fd315; +add.f64 fd317, fd258, fd260; +add.f64 fd318, fd317, fd316; +add.f64 fd319, fd266, fd272; +add.f64 fd320, fd264, fd319; +add.f64 fd321, fd268, fd270; +add.f64 fd322, fd321, fd320; +fma.rn.f64 fd323, fd315, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd324, fd317, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd266, fd272; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd268, fd270; +fma.rn.f64 fd329, fd328, 0d3FE2CF2304755A5E, fd327; +sub.f64 fd330, fd325, fd329; +add.f64 fd331, fd329, fd325; +mul.f64 fd332, fd315, 0d3FE9E3779B97F4A8; +sub.f64 fd333, fd254, fd332; +fma.rn.f64 fd334, fd317, 0d3FD3C6EF372FE950, fd333; +mul.f64 fd335, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd336, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd337, fd335, fd336; +sub.f64 fd338, fd334, fd337; +add.f64 fd339, fd337, fd334; +fma.rn.f64 fd340, fd319, 0d3FD3C6EF372FE950, fd264; +mul.f64 fd341, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd342, fd340, fd341; +sub.f64 fd343, fd256, fd262; +mul.f64 fd344, fd343, 0d3FEE6F0E134454FF; +sub.f64 fd345, fd258, fd260; +fma.rn.f64 fd346, fd345, 0d3FE2CF2304755A5E, fd344; +add.f64 fd347, fd346, fd342; +sub.f64 fd348, fd342, fd346; +mul.f64 fd349, fd319, 0d3FE9E3779B97F4A8; +sub.f64 fd350, fd264, fd349; +fma.rn.f64 fd351, fd321, 0d3FD3C6EF372FE950, fd350; +mul.f64 fd352, fd343, 0d3FE2CF2304755A5E; +mul.f64 fd353, fd345, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd354, fd351; +sub.f64 fd356, fd351, fd354; +mul.f64 fd357, fd330, 0d3FE9E3779B97F4A8; +mul.f64 fd358, fd347, 0d3FE2CF2304755A5E; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd347, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd361, fd330, 0d3FE2CF2304755A5E, fd360; +mul.f64 
fd362, fd338, 0d3FD3C6EF372FE950; +mul.f64 fd363, fd355, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd355, 0d3FD3C6EF372FE950; +fma.rn.f64 fd366, fd338, 0d3FEE6F0E134454FF, fd365; +mul.f64 fd367, fd339, 0dBFD3C6EF372FE950; +mul.f64 fd368, fd356, 0d3FEE6F0E134454FF; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd356, 0dBFD3C6EF372FE950; +fma.rn.f64 fd371, fd339, 0d3FEE6F0E134454FF, fd370; +mul.f64 fd372, fd331, 0dBFE9E3779B97F4A8; +mul.f64 fd373, fd348, 0d3FE2CF2304755A5E; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd348, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd376, fd331, 0d3FE2CF2304755A5E, fd375; +add.f64 fd377, fd276, fd318; +add.f64 fd378, fd280, fd322; +sub.f64 fd379, fd276, fd318; +sub.f64 fd380, fd280, fd322; +add.f64 fd381, fd288, fd359; +add.f64 fd382, fd305, fd361; +sub.f64 fd383, fd288, fd359; +sub.f64 fd384, fd305, fd361; +add.f64 fd385, fd296, fd364; +add.f64 fd386, fd313, fd366; +sub.f64 fd387, fd296, fd364; +sub.f64 fd388, fd313, fd366; +add.f64 fd389, fd297, fd369; +add.f64 fd390, fd314, fd371; +sub.f64 fd391, fd297, fd369; +sub.f64 fd392, fd314, fd371; +add.f64 fd393, fd289, fd374; +add.f64 fd394, fd306, fd376; +sub.f64 fd395, fd289, fd374; +sub.f64 fd396, fd306, fd376; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 10; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %22; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd397, fd398}, [rd11]; +mul.f64 fd401, fd382, fd398; +fma.rn.f64 fd402, fd397, fd381, fd401; +mul.f64 fd403, fd381, fd398; +mul.f64 fd404, fd397, fd382; +sub.f64 fd405, fd404, fd403; +mul.f64 fd406, fd397, fd397; +mul.f64 fd407, fd398, fd398; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd398, fd397; +fma.rn.f64 fd410, fd398, fd397, fd409; +mul.f64 fd411, fd386, fd410; +fma.rn.f64 fd412, fd408, fd385, fd411; +mul.f64 fd413, fd385, fd410; +mul.f64 fd414, fd408, fd386; +sub.f64 fd415, fd414, fd413; +mul.f64 fd416, fd397, fd408; +mul.f64 fd417, 
fd398, fd410; +sub.f64 fd418, fd416, fd417; +mul.f64 fd419, fd397, fd410; +fma.rn.f64 fd420, fd398, fd408, fd419; +mul.f64 fd421, fd390, fd420; +fma.rn.f64 fd422, fd418, fd389, fd421; +mul.f64 fd423, fd389, fd420; +mul.f64 fd424, fd418, fd390; +sub.f64 fd425, fd424, fd423; +mul.f64 fd426, fd397, fd418; +mul.f64 fd427, fd398, fd420; +sub.f64 fd428, fd426, fd427; +mul.f64 fd429, fd397, fd420; +fma.rn.f64 fd430, fd398, fd418, fd429; +mul.f64 fd431, fd394, fd430; +fma.rn.f64 fd432, fd428, fd393, fd431; +mul.f64 fd433, fd393, fd430; +mul.f64 fd434, fd428, fd394; +sub.f64 fd435, fd434, fd433; +mul.f64 fd436, fd397, fd428; +mul.f64 fd437, fd398, fd430; +sub.f64 fd438, fd436, fd437; +mul.f64 fd439, fd397, fd430; +fma.rn.f64 fd440, fd398, fd428, fd439; +mul.f64 fd441, fd380, fd440; +fma.rn.f64 fd442, fd438, fd379, fd441; +mul.f64 fd443, fd379, fd440; +mul.f64 fd444, fd438, fd380; +sub.f64 fd445, fd444, fd443; +ld.global.v2.f64 {fd446, fd447}, [rd11+160]; +mul.f64 fd450, fd384, fd447; +fma.rn.f64 fd451, fd446, fd383, fd450; +mul.f64 fd452, fd383, fd447; +mul.f64 fd453, fd446, fd384; +sub.f64 fd454, fd453, fd452; +mul.f64 fd455, fd397, fd446; +mul.f64 fd456, fd398, fd447; +sub.f64 fd457, fd455, fd456; +mul.f64 fd458, fd397, fd447; +fma.rn.f64 fd459, fd398, fd446, fd458; +mul.f64 fd460, fd388, fd459; +fma.rn.f64 fd461, fd457, fd387, fd460; +mul.f64 fd462, fd387, fd459; +mul.f64 fd463, fd457, fd388; +sub.f64 fd464, fd463, fd462; +mul.f64 fd465, fd397, fd457; +mul.f64 fd466, fd398, fd459; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd397, fd459; +fma.rn.f64 fd469, fd398, fd457, fd468; +mul.f64 fd470, fd392, fd469; +fma.rn.f64 fd471, fd467, fd391, fd470; +mul.f64 fd472, fd391, fd469; +mul.f64 fd473, fd467, fd392; +sub.f64 fd474, fd473, fd472; +mul.f64 fd475, fd397, fd467; +mul.f64 fd476, fd398, fd469; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd397, fd469; +fma.rn.f64 fd479, fd398, fd467, fd478; +mul.f64 fd480, fd396, fd479; +fma.rn.f64 fd481, fd477, fd395, fd480; +mul.f64 
fd482, fd395, fd479; +mul.f64 fd483, fd477, fd396; +sub.f64 fd484, fd483, fd482; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 800, r15; +st.shared.f64 [r16], fd377; +st.shared.f64 [r16+80], fd402; +st.shared.f64 [r16+160], fd412; +st.shared.f64 [r16+240], fd422; +st.shared.f64 [r16+320], fd432; +st.shared.f64 [r16+400], fd442; +st.shared.f64 [r16+480], fd451; +st.shared.f64 [r16+560], fd461; +st.shared.f64 [r16+640], fd471; +st.shared.f64 [r16+720], fd481; +barrier.sync 0; +ld.shared.f64 fd485, [r10]; +ld.shared.f64 fd486, [r10+800]; +ld.shared.f64 fd487, [r10+1600]; +ld.shared.f64 fd488, [r10+2400]; +ld.shared.f64 fd489, [r10+3200]; +ld.shared.f64 fd490, [r10+4000]; +ld.shared.f64 fd491, [r10+4800]; +ld.shared.f64 fd492, [r10+5600]; +ld.shared.f64 fd493, [r10+6400]; +ld.shared.f64 fd494, [r10+7200]; +barrier.sync 0; +st.shared.f64 [r16], fd378; +st.shared.f64 [r16+80], fd405; +st.shared.f64 [r16+160], fd415; +st.shared.f64 [r16+240], fd425; +st.shared.f64 [r16+320], fd435; +st.shared.f64 [r16+400], fd445; +st.shared.f64 [r16+480], fd454; +st.shared.f64 [r16+560], fd464; +st.shared.f64 [r16+640], fd474; +st.shared.f64 [r16+720], fd484; +barrier.sync 0; +ld.shared.f64 fd495, [r10]; +ld.shared.f64 fd496, [r10+800]; +ld.shared.f64 fd497, [r10+1600]; +ld.shared.f64 fd498, [r10+2400]; +ld.shared.f64 fd499, [r10+3200]; +ld.shared.f64 fd500, [r10+4000]; +ld.shared.f64 fd501, [r10+4800]; +ld.shared.f64 fd502, [r10+5600]; +ld.shared.f64 fd503, [r10+6400]; +ld.shared.f64 fd504, [r10+7200]; +add.f64 fd505, fd487, fd493; +add.f64 fd506, fd485, fd505; +add.f64 fd507, fd489, fd491; +add.f64 fd508, fd507, fd506; +add.f64 fd509, fd497, fd503; +add.f64 fd510, fd495, fd509; +add.f64 fd511, fd499, fd501; +add.f64 fd512, fd511, fd510; +fma.rn.f64 fd513, fd505, 0d3FD3C6EF372FE950, fd485; +mul.f64 fd514, fd507, 0d3FE9E3779B97F4A8; +sub.f64 fd515, fd513, fd514; +sub.f64 fd516, fd497, fd503; +mul.f64 fd517, fd516, 0d3FEE6F0E134454FF; +sub.f64 fd518, 
fd499, fd501; +fma.rn.f64 fd519, fd518, 0d3FE2CF2304755A5E, fd517; +sub.f64 fd520, fd515, fd519; +add.f64 fd521, fd519, fd515; +mul.f64 fd522, fd505, 0d3FE9E3779B97F4A8; +sub.f64 fd523, fd485, fd522; +fma.rn.f64 fd524, fd507, 0d3FD3C6EF372FE950, fd523; +mul.f64 fd525, fd516, 0d3FE2CF2304755A5E; +mul.f64 fd526, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd527, fd525, fd526; +sub.f64 fd528, fd524, fd527; +add.f64 fd529, fd527, fd524; +fma.rn.f64 fd530, fd509, 0d3FD3C6EF372FE950, fd495; +mul.f64 fd531, fd511, 0d3FE9E3779B97F4A8; +sub.f64 fd532, fd530, fd531; +sub.f64 fd533, fd487, fd493; +mul.f64 fd534, fd533, 0d3FEE6F0E134454FF; +sub.f64 fd535, fd489, fd491; +fma.rn.f64 fd536, fd535, 0d3FE2CF2304755A5E, fd534; +add.f64 fd537, fd536, fd532; +sub.f64 fd538, fd532, fd536; +mul.f64 fd539, fd509, 0d3FE9E3779B97F4A8; +sub.f64 fd540, fd495, fd539; +fma.rn.f64 fd541, fd511, 0d3FD3C6EF372FE950, fd540; +mul.f64 fd542, fd533, 0d3FE2CF2304755A5E; +mul.f64 fd543, fd535, 0d3FEE6F0E134454FF; +sub.f64 fd544, fd542, fd543; +add.f64 fd545, fd544, fd541; +sub.f64 fd546, fd541, fd544; +add.f64 fd547, fd488, fd494; +add.f64 fd548, fd486, fd547; +add.f64 fd549, fd490, fd492; +add.f64 fd550, fd549, fd548; +add.f64 fd551, fd498, fd504; +add.f64 fd552, fd496, fd551; +add.f64 fd553, fd500, fd502; +add.f64 fd554, fd553, fd552; +fma.rn.f64 fd555, fd547, 0d3FD3C6EF372FE950, fd486; +mul.f64 fd556, fd549, 0d3FE9E3779B97F4A8; +sub.f64 fd557, fd555, fd556; +sub.f64 fd558, fd498, fd504; +mul.f64 fd559, fd558, 0d3FEE6F0E134454FF; +sub.f64 fd560, fd500, fd502; +fma.rn.f64 fd561, fd560, 0d3FE2CF2304755A5E, fd559; +sub.f64 fd562, fd557, fd561; +add.f64 fd563, fd561, fd557; +mul.f64 fd564, fd547, 0d3FE9E3779B97F4A8; +sub.f64 fd565, fd486, fd564; +fma.rn.f64 fd566, fd549, 0d3FD3C6EF372FE950, fd565; +mul.f64 fd567, fd558, 0d3FE2CF2304755A5E; +mul.f64 fd568, fd560, 0d3FEE6F0E134454FF; +sub.f64 fd569, fd567, fd568; +sub.f64 fd570, fd566, fd569; +add.f64 fd571, fd569, fd566; +fma.rn.f64 fd572, fd551, 
0d3FD3C6EF372FE950, fd496; +mul.f64 fd573, fd553, 0d3FE9E3779B97F4A8; +sub.f64 fd574, fd572, fd573; +sub.f64 fd575, fd488, fd494; +mul.f64 fd576, fd575, 0d3FEE6F0E134454FF; +sub.f64 fd577, fd490, fd492; +fma.rn.f64 fd578, fd577, 0d3FE2CF2304755A5E, fd576; +add.f64 fd579, fd578, fd574; +sub.f64 fd580, fd574, fd578; +mul.f64 fd581, fd551, 0d3FE9E3779B97F4A8; +sub.f64 fd582, fd496, fd581; +fma.rn.f64 fd583, fd553, 0d3FD3C6EF372FE950, fd582; +mul.f64 fd584, fd575, 0d3FE2CF2304755A5E; +mul.f64 fd585, fd577, 0d3FEE6F0E134454FF; +sub.f64 fd586, fd584, fd585; +add.f64 fd587, fd586, fd583; +sub.f64 fd588, fd583, fd586; +mul.f64 fd589, fd562, 0d3FE9E3779B97F4A8; +mul.f64 fd590, fd579, 0d3FE2CF2304755A5E; +sub.f64 fd591, fd589, fd590; +mul.f64 fd592, fd579, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd593, fd562, 0d3FE2CF2304755A5E, fd592; +mul.f64 fd594, fd570, 0d3FD3C6EF372FE950; +mul.f64 fd595, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd596, fd594, fd595; +mul.f64 fd597, fd587, 0d3FD3C6EF372FE950; +fma.rn.f64 fd598, fd570, 0d3FEE6F0E134454FF, fd597; +mul.f64 fd599, fd571, 0dBFD3C6EF372FE950; +mul.f64 fd600, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd588, 0dBFD3C6EF372FE950; +fma.rn.f64 fd603, fd571, 0d3FEE6F0E134454FF, fd602; +mul.f64 fd604, fd563, 0dBFE9E3779B97F4A8; +mul.f64 fd605, fd580, 0d3FE2CF2304755A5E; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd580, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd608, fd563, 0d3FE2CF2304755A5E, fd607; +add.f64 %0, fd508, fd550; +add.f64 %1, fd512, fd554; +add.f64 %3, fd537, fd593; +add.f64 %2, fd520, fd591; +add.f64 %5, fd545, fd598; +add.f64 %4, fd528, fd596; +add.f64 %7, fd546, fd603; +add.f64 %6, fd529, fd601; +add.f64 %9, fd538, fd608; +add.f64 %8, fd521, fd606; +sub.f64 %10, fd508, fd550; +sub.f64 %11, fd512, fd554; +sub.f64 %13, fd537, fd593; +sub.f64 %12, fd520, fd591; +sub.f64 %15, fd545, fd598; +sub.f64 %14, fd528, fd596; +sub.f64 %17, fd546, fd603; +sub.f64 %16, fd529, fd601; +sub.f64 %19, fd538, fd608; +sub.f64 
%18, fd521, fd606; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_1000), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..2b36447598ca3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_fwd.hpp.inc @@ -0,0 +1,4572 @@ +#ifndef CUFFTDX_FFT_100_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_100_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<938, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<146>; +.reg .b32 r<1914>; +.reg .b64 rd<5>; +mov.u32 r1901, %tid.y; +shl.b32 r1902, r1901, 1; +mov.u32 r1903, %20; +mad.lo.s32 r1904, r1902, 400, r1903; +mov.u32 r1905, %tid.x; +mov.f32 f112, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1, {low, high}; +} +mov.f32 f118, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r2, {low, high}; +} +mov.f32 f120, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r3, {low, high}; +} +mov.f32 f122, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, %37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ 
+add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, 
r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, 
r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, %35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ 
+mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f108, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r644, {low, high}; +} +mov.f32 f116, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r648, {low, high}; +} +mov.f32 f79, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ +fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; +} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, 
r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ +sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r1905, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1906, rd3; +mul.lo.s32 r1907, r1906, 10; +sub.s32 r1908, r1905, r1907; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1909, rd4; +and.b32 r1910, r1909, 1073741822; +mad.lo.s32 r1911, r1910, 400, r1904; +cvt.rn.f32.u32 f143, r1908; +mul.f32 f144, f143, 0f3D80ADFD; +cos.approx.f32 f61, f144; +sin.approx.f32 f145, f144; +neg.f32 f62, f145; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f80, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} 
+{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, 
r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r995, {low, high}; +} +{ 
+mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ +mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r1912, r1908, 80, r1911; +st.shared.v2.f32 [r1912], {r723, r726}; +st.shared.v2.f32 [r1912+8], {r795, r802}; +st.shared.v2.f32 [r1912+16], {r832, r839}; +st.shared.v2.f32 [r1912+24], {r869, r876}; +st.shared.v2.f32 [r1912+32], {r906, r913}; +st.shared.v2.f32 [r1912+40], {r943, r950}; +st.shared.v2.f32 [r1912+48], {r980, r987}; +st.shared.v2.f32 [r1912+56], {r1017, r1024}; +st.shared.v2.f32 [r1912+64], {r1054, r1061}; +st.shared.v2.f32 [r1912+72], {r1091, r1098}; +barrier.sync 0; +mad.lo.s32 r1913, r1908, -72, r1912; +ld.shared.u32 r1131, [r1913]; +ld.shared.u32 r1143, [r1913+4]; +ld.shared.u32 r1451, [r1913+80]; +ld.shared.u32 r1463, [r1913+84]; +ld.shared.u32 r1128, [r1913+160]; +ld.shared.u32 r1140, [r1913+164]; +ld.shared.u32 r1448, [r1913+240]; +ld.shared.u32 r1460, [r1913+244]; +ld.shared.u32 r1134, [r1913+320]; +ld.shared.u32 r1146, [r1913+324]; +ld.shared.u32 r1454, [r1913+400]; +ld.shared.u32 r1466, [r1913+404]; +ld.shared.u32 r1135, [r1913+480]; +ld.shared.u32 r1147, [r1913+484]; +ld.shared.u32 r1455, [r1913+560]; +ld.shared.u32 r1467, [r1913+564]; +ld.shared.u32 r1129, [r1913+640]; +ld.shared.u32 r1141, [r1913+644]; +ld.shared.u32 r1449, [r1913+720]; +ld.shared.u32 r1461, [r1913+724]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ 
+add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} +{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, 
r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ +add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, 
r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ +sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} +{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ +sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ +add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ 
+mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1761, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, 
r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 %0, r1136, r1456; +} +{ +add.f16x2 %1, r1148, r1468; +} +{ +sub.f16x2 %10, r1136, r1456; +} +{ +sub.f16x2 %11, r1148, r1468; +} +{ +add.f16x2 %2, r1184, r1783; +} +{ +add.f16x2 %3, r1328, r1789; +} +{ +sub.f16x2 %12, r1184, r1783; +} +{ +sub.f16x2 %13, r1328, r1789; +} +{ +add.f16x2 %4, r1256, r1799; +} +{ +add.f16x2 %5, r1400, r1805; +} +{ +sub.f16x2 %14, r1256, r1799; +} +{ +sub.f16x2 %15, r1400, r1805; +} +{ +add.f16x2 %6, r1292, r1815; +} +{ +add.f16x2 %7, r1436, r1821; +} +{ +sub.f16x2 %16, r1292, r1815; +} +{ +sub.f16x2 %17, r1436, r1821; +} +{ +add.f16x2 %8, r1220, r1831; +} +{ +add.f16x2 %9, r1364, r1837; +} +{ +sub.f16x2 %18, r1220, r1831; +} +{ +sub.f16x2 %19, r1364, r1837; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<937, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<146>; +.reg .b32 r<1911>; +.reg .b64 rd<4>; +mov.u32 r1901, %tid.y; +mov.u32 r1902, %20; +mad.lo.s32 r1903, r1901, 400, r1902; +mov.u32 r1904, %tid.x; +mov.f32 f112, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1, {low, high}; +} +mov.f32 f118, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r2, {low, high}; +} +mov.f32 f120, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r3, {low, high}; +} +mov.f32 f122, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %25, %37; +} +{ +add.f16x2 r12, %21, r9; +} +{ +add.f16x2 r15, %29, %33; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %26, %38; +} +{ +add.f16x2 r24, %22, r21; +} +{ +add.f16x2 r27, %30, %34; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %25, 
%37; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %21, r36; +} +{ +add.f16x2 r42, %29, %33; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %26, %38; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %30, %34; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %25, %37; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %21, r72; +} +{ +add.f16x2 r78, %29, %33; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %26, %38; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %30, %34; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %25, %37; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %21, r108; +} +{ +add.f16x2 r114, %29, %33; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %26, %38; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %30, %34; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %25, %37; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %21, r144; +} +{ +add.f16x2 r150, %29, %33; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %26, %38; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %30, %34; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %26, %38; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %22, r180; +} +{ +add.f16x2 r186, %30, %34; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %25, %37; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %29, %33; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %26, %38; +} +{ +mul.f16x2 r216, 
r213, r1; +} +{ +add.f16x2 r219, %22, r216; +} +{ +add.f16x2 r222, %30, %34; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %25, %37; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %29, %33; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %26, %38; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %22, r252; +} +{ +add.f16x2 r258, %30, %34; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %25, %37; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %29, %33; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %26, %38; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %22, r288; +} +{ +add.f16x2 r294, %30, %34; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %25, %37; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %29, %33; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %27, %39; +} +{ +add.f16x2 r332, %23, r329; +} +{ +add.f16x2 r335, %31, %35; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 
r341, %28, %40; +} +{ +add.f16x2 r344, %24, r341; +} +{ +add.f16x2 r347, %32, %36; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %27, %39; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %23, r356; +} +{ +add.f16x2 r362, %31, %35; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %28, %40; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %32, %36; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %27, %39; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %23, r392; +} +{ +add.f16x2 r398, %31, %35; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %28, %40; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %32, %36; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %27, %39; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %23, r428; +} +{ +add.f16x2 r434, %31, %35; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %28, %40; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %32, %36; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %27, %39; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %23, r464; +} +{ +add.f16x2 r470, %31, %35; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %28, %40; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %32, %36; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %28, %40; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %24, r500; +} +{ +add.f16x2 r506, %32, %36; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %27, %39; 
+} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %31, %35; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %28, %40; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %24, r536; +} +{ +add.f16x2 r542, %32, %36; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %27, %39; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %31, %35; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %28, %40; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %24, r572; +} +{ +add.f16x2 r578, %32, %36; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %27, %39; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %31, %35; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %28, %40; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %24, r608; +} +{ +add.f16x2 r614, %32, %36; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %27, %39; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %31, %35; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +mov.f32 f108, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r644, {low, high}; +} +mov.f32 f116, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 
r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r648, {low, high}; +} +mov.f32 f79, 0fBF800000; +{ +mul.f16x2 r659, r386, r641; +} +{ +mul.f16x2 r662, r530, r642; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r386, r642; +} +{ +fma.rn.f16x2 r671, r530, r641, r668; +} +{ +mul.f16x2 r675, r458, r643; +} +{ +mul.f16x2 r678, r602, r644; +} +{ +sub.f16x2 r681, r675, r678; +} +{ +mul.f16x2 r684, r458, r644; +} +{ +fma.rn.f16x2 r687, r602, r643, r684; +} +{ +mul.f16x2 r691, r494, r645; +} +{ +mul.f16x2 r694, r638, r646; +} +{ +sub.f16x2 r697, r691, r694; +} +{ +mul.f16x2 r700, r494, r646; +} +{ +fma.rn.f16x2 r703, r638, r645, r700; +} +{ +mul.f16x2 r707, r422, r647; +} +{ +mul.f16x2 r710, r566, r648; +} +{ +sub.f16x2 r713, r707, r710; +} +{ +mul.f16x2 r716, r422, r648; +} +{ +fma.rn.f16x2 r719, r566, r647, r716; +} +{ +add.f16x2 r723, r18, r338; +} +{ +add.f16x2 r726, r30, r350; +} +{ +sub.f16x2 r729, r18, r338; +} +{ +sub.f16x2 r732, r30, r350; +} +{ +add.f16x2 r735, r66, r665; +} +{ +add.f16x2 r738, r210, r671; +} +{ +sub.f16x2 r741, r66, r665; +} +{ +sub.f16x2 r744, r210, r671; +} +{ +add.f16x2 r747, r138, r681; +} +{ +add.f16x2 r750, r282, r687; +} +{ +sub.f16x2 r753, r138, r681; +} +{ +sub.f16x2 r756, r282, r687; +} +{ +add.f16x2 r759, r174, r697; +} +{ +add.f16x2 r762, r318, r703; +} +{ +sub.f16x2 r765, r174, r697; +} +{ +sub.f16x2 r768, r318, r703; +} +{ +add.f16x2 r771, r102, r713; +} +{ +add.f16x2 r774, r246, r719; +} +{ +sub.f16x2 r777, r102, r713; +} +{ +sub.f16x2 r780, r246, r719; +} +mul.wide.u32 rd2, r1904, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1905, rd3; +mul.lo.s32 r1906, r1905, 10; +sub.s32 r1907, r1904, r1906; +mad.lo.s32 r1908, r1905, 400, r1903; 
+cvt.rn.f32.u32 f143, r1907; +mul.f32 f144, f143, 0f3D80ADFD; +cos.approx.f32 f61, f144; +sin.approx.f32 f145, f144; +neg.f32 f62, f145; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r783, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r788, {high, high}; +} +{ +mul.f16x2 r790, r738, r788; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r735, r786, r793; +} +{ +mul.f16x2 r799, r735, r788; +} +{ +fma.rn.f16x2 r802, r738, r786, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r808, {high, high}; +} +mov.f32 f80, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r808, r810; +} +{ +mul.f16x2 r814, r783, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r817, {high, low}; +} +{ +fma.rn.f16x2 r819, r811, r817, r814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r825, {high, high}; +} +{ +mul.f16x2 r827, r750, r825; +} +{ +neg.f16x2 r830, r827; +} +{ +fma.rn.f16x2 r832, r747, r823, r830; +} +{ +mul.f16x2 r836, r747, r825; +} +{ +fma.rn.f16x2 r839, r750, r823, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r845, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r847, {low, high}; +} +{ +mul.f16x2 r848, r845, r847; +} +{ +mul.f16x2 r851, r819, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r819; +mov.b32 r854, {high, low}; +} +{ +fma.rn.f16x2 r856, r848, r854, r851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r860, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r862, {high, high}; +} +{ +mul.f16x2 r864, r762, r862; +} +{ +neg.f16x2 r867, r864; +} +{ +fma.rn.f16x2 r869, r759, r860, r867; +} +{ +mul.f16x2 r873, r759, r862; +} +{ +fma.rn.f16x2 r876, r762, r860, r873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r882, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r884, {low, high}; +} +{ +mul.f16x2 r885, r882, r884; +} +{ +mul.f16x2 r888, r856, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r856; +mov.b32 r891, {high, low}; +} +{ +fma.rn.f16x2 r893, r885, r891, r888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r897, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r899, {high, high}; +} +{ +mul.f16x2 r901, r774, r899; +} +{ +neg.f16x2 r904, r901; +} +{ +fma.rn.f16x2 r906, r771, r897, r904; +} +{ +mul.f16x2 r910, r771, r899; +} +{ +fma.rn.f16x2 r913, r774, r897, r910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r919, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r921, {low, high}; +} +{ +mul.f16x2 r922, r919, r921; +} +{ +mul.f16x2 r925, r893, r917; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r893; +mov.b32 r928, {high, low}; +} +{ +fma.rn.f16x2 r930, r922, r928, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r934, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r936, {high, high}; +} +{ +mul.f16x2 r938, r732, r936; +} +{ +neg.f16x2 r941, r938; +} +{ +fma.rn.f16x2 r943, r729, r934, r941; +} +{ +mul.f16x2 r947, r729, r936; +} +{ +fma.rn.f16x2 r950, r732, r934, r947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; 
+mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r956, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r958, {low, high}; +} +{ +mul.f16x2 r959, r956, r958; +} +{ +mul.f16x2 r962, r930, r954; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r930; +mov.b32 r965, {high, low}; +} +{ +fma.rn.f16x2 r967, r959, r965, r962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r971, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r973, {high, high}; +} +{ +mul.f16x2 r975, r744, r973; +} +{ +neg.f16x2 r978, r975; +} +{ +fma.rn.f16x2 r980, r741, r971, r978; +} +{ +mul.f16x2 r984, r741, r973; +} +{ +fma.rn.f16x2 r987, r744, r971, r984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r991, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r993, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r995, {low, high}; +} +{ +mul.f16x2 r996, r993, r995; +} +{ +mul.f16x2 r999, r967, r991; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r967; +mov.b32 r1002, {high, low}; +} +{ +fma.rn.f16x2 r1004, r996, r1002, r999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1008, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1010, {high, high}; +} +{ +mul.f16x2 r1012, r756, r1010; +} +{ +neg.f16x2 r1015, r1012; +} +{ +fma.rn.f16x2 r1017, r753, r1008, r1015; +} +{ +mul.f16x2 r1021, r753, r1010; +} +{ +fma.rn.f16x2 r1024, r756, r1008, r1021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1028, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1030, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1032, {low, high}; +} +{ +mul.f16x2 r1033, r1030, r1032; +} +{ +mul.f16x2 r1036, r1004, r1028; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1004; +mov.b32 r1039, {high, low}; +} +{ +fma.rn.f16x2 r1041, r1033, r1039, r1036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1045, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1047, {high, high}; +} +{ +mul.f16x2 r1049, r768, r1047; +} +{ +neg.f16x2 r1052, r1049; +} +{ +fma.rn.f16x2 r1054, r765, r1045, r1052; +} +{ +mul.f16x2 r1058, r765, r1047; +} +{ +fma.rn.f16x2 r1061, r768, r1045, r1058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1065, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r783; +mov.b32 r1067, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1069, {low, high}; +} +{ +mul.f16x2 r1070, r1067, r1069; +} +{ +mul.f16x2 r1073, r1041, r1065; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1041; +mov.b32 r1076, {high, low}; +} +{ +fma.rn.f16x2 r1078, r1070, r1076, r1073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1082, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1078; +mov.b32 r1084, {high, high}; +} +{ +mul.f16x2 r1086, r780, r1084; +} +{ +neg.f16x2 r1089, r1086; +} +{ +fma.rn.f16x2 r1091, r777, r1082, r1089; +} +{ +mul.f16x2 r1095, r777, r1084; +} +{ +fma.rn.f16x2 r1098, r780, r1082, r1095; +} +barrier.sync 0; +mad.lo.s32 r1909, r1907, 40, r1908; +st.shared.v2.f32 [r1909], {r723, r795}; +st.shared.v2.f32 [r1909+8], {r832, r869}; +st.shared.v2.f32 [r1909+16], {r906, r943}; +st.shared.v2.f32 [r1909+24], {r980, r1017}; +st.shared.v2.f32 [r1909+32], {r1054, r1091}; +barrier.sync 0; +mad.lo.s32 r1910, r1907, -36, r1909; +ld.shared.u32 r1131, [r1910]; +ld.shared.u32 r1451, [r1910+40]; +ld.shared.u32 r1128, [r1910+80]; +ld.shared.u32 r1448, [r1910+120]; +ld.shared.u32 r1134, [r1910+160]; +ld.shared.u32 r1454, [r1910+200]; +ld.shared.u32 r1135, [r1910+240]; +ld.shared.u32 r1455, [r1910+280]; +ld.shared.u32 r1129, [r1910+320]; 
+ld.shared.u32 r1449, [r1910+360]; +barrier.sync 0; +st.shared.v2.f32 [r1909], {r726, r802}; +st.shared.v2.f32 [r1909+8], {r839, r876}; +st.shared.v2.f32 [r1909+16], {r913, r950}; +st.shared.v2.f32 [r1909+24], {r987, r1024}; +st.shared.v2.f32 [r1909+32], {r1061, r1098}; +barrier.sync 0; +ld.shared.u32 r1143, [r1910]; +ld.shared.u32 r1463, [r1910+40]; +ld.shared.u32 r1140, [r1910+80]; +ld.shared.u32 r1460, [r1910+120]; +ld.shared.u32 r1146, [r1910+160]; +ld.shared.u32 r1466, [r1910+200]; +ld.shared.u32 r1147, [r1910+240]; +ld.shared.u32 r1467, [r1910+280]; +ld.shared.u32 r1141, [r1910+320]; +ld.shared.u32 r1461, [r1910+360]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +add.f16x2 r1127, r1128, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1130, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1142, r1145; +} +{ +add.f16x2 r1151, r1128, r1129; +} +{ +mul.f16x2 r1154, r1151, r1119; +} +{ +add.f16x2 r1157, r1131, r1154; +} +{ +add.f16x2 r1160, r1134, r1135; +} +{ +mul.f16x2 r1163, r1160, r1121; +} +{ +add.f16x2 r1166, r1157, r1163; +} +{ +sub.f16x2 r1169, r1140, r1141; +} +{ +mul.f16x2 r1172, r1169, r1120; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 
r1178, r1175, r1122; +} +{ +add.f16x2 r1181, r1172, r1178; +} +{ +sub.f16x2 r1184, r1166, r1181; +} +{ +add.f16x2 r1187, r1128, r1129; +} +{ +mul.f16x2 r1190, r1187, r1119; +} +{ +add.f16x2 r1193, r1131, r1190; +} +{ +add.f16x2 r1196, r1134, r1135; +} +{ +mul.f16x2 r1199, r1196, r1121; +} +{ +add.f16x2 r1202, r1193, r1199; +} +{ +sub.f16x2 r1205, r1140, r1141; +} +{ +mul.f16x2 r1208, r1205, r1120; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1122; +} +{ +add.f16x2 r1217, r1208, r1214; +} +{ +add.f16x2 r1220, r1202, r1217; +} +{ +add.f16x2 r1223, r1128, r1129; +} +{ +mul.f16x2 r1226, r1223, r1121; +} +{ +add.f16x2 r1229, r1131, r1226; +} +{ +add.f16x2 r1232, r1134, r1135; +} +{ +mul.f16x2 r1235, r1232, r1123; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1140, r1141; +} +{ +mul.f16x2 r1244, r1241, r1122; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1125; +} +{ +add.f16x2 r1253, r1244, r1250; +} +{ +sub.f16x2 r1256, r1238, r1253; +} +{ +add.f16x2 r1259, r1128, r1129; +} +{ +mul.f16x2 r1262, r1259, r1121; +} +{ +add.f16x2 r1265, r1131, r1262; +} +{ +add.f16x2 r1268, r1134, r1135; +} +{ +mul.f16x2 r1271, r1268, r1123; +} +{ +add.f16x2 r1274, r1265, r1271; +} +{ +sub.f16x2 r1277, r1140, r1141; +} +{ +mul.f16x2 r1280, r1277, r1122; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1125; +} +{ +add.f16x2 r1289, r1280, r1286; +} +{ +add.f16x2 r1292, r1274, r1289; +} +{ +add.f16x2 r1295, r1140, r1141; +} +{ +mul.f16x2 r1298, r1295, r1119; +} +{ +add.f16x2 r1301, r1143, r1298; +} +{ +add.f16x2 r1304, r1146, r1147; +} +{ +mul.f16x2 r1307, r1304, r1121; +} +{ +add.f16x2 r1310, r1301, r1307; +} +{ +sub.f16x2 r1313, r1128, r1129; +} +{ +mul.f16x2 r1316, r1313, r1120; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1122; +} +{ +add.f16x2 r1325, r1316, r1322; +} +{ +add.f16x2 r1328, r1310, r1325; +} +{ +add.f16x2 r1331, r1140, r1141; +} +{ +mul.f16x2 r1334, r1331, r1119; +} 
+{ +add.f16x2 r1337, r1143, r1334; +} +{ +add.f16x2 r1340, r1146, r1147; +} +{ +mul.f16x2 r1343, r1340, r1121; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +sub.f16x2 r1349, r1128, r1129; +} +{ +mul.f16x2 r1352, r1349, r1120; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1122; +} +{ +add.f16x2 r1361, r1352, r1358; +} +{ +sub.f16x2 r1364, r1346, r1361; +} +{ +add.f16x2 r1367, r1140, r1141; +} +{ +mul.f16x2 r1370, r1367, r1121; +} +{ +add.f16x2 r1373, r1143, r1370; +} +{ +add.f16x2 r1376, r1146, r1147; +} +{ +mul.f16x2 r1379, r1376, r1123; +} +{ +add.f16x2 r1382, r1373, r1379; +} +{ +sub.f16x2 r1385, r1128, r1129; +} +{ +mul.f16x2 r1388, r1385, r1122; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1125; +} +{ +add.f16x2 r1397, r1388, r1394; +} +{ +add.f16x2 r1400, r1382, r1397; +} +{ +add.f16x2 r1403, r1140, r1141; +} +{ +mul.f16x2 r1406, r1403, r1121; +} +{ +add.f16x2 r1409, r1143, r1406; +} +{ +add.f16x2 r1412, r1146, r1147; +} +{ +mul.f16x2 r1415, r1412, r1123; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +sub.f16x2 r1421, r1128, r1129; +} +{ +mul.f16x2 r1424, r1421, r1122; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1125; +} +{ +add.f16x2 r1433, r1424, r1430; +} +{ +sub.f16x2 r1436, r1418, r1433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1444, {low, high}; +} +{ +neg.f16x2 r1445, r1444; +} +{ 
+add.f16x2 r1447, r1448, r1449; +} +{ +add.f16x2 r1450, r1451, r1447; +} +{ +add.f16x2 r1453, r1454, r1455; +} +{ +add.f16x2 r1456, r1450, r1453; +} +{ +add.f16x2 r1459, r1460, r1461; +} +{ +add.f16x2 r1462, r1463, r1459; +} +{ +add.f16x2 r1465, r1466, r1467; +} +{ +add.f16x2 r1468, r1462, r1465; +} +{ +add.f16x2 r1471, r1448, r1449; +} +{ +mul.f16x2 r1474, r1471, r1439; +} +{ +add.f16x2 r1477, r1451, r1474; +} +{ +add.f16x2 r1480, r1454, r1455; +} +{ +mul.f16x2 r1483, r1480, r1441; +} +{ +add.f16x2 r1486, r1477, r1483; +} +{ +sub.f16x2 r1489, r1460, r1461; +} +{ +mul.f16x2 r1492, r1489, r1440; +} +{ +sub.f16x2 r1495, r1466, r1467; +} +{ +mul.f16x2 r1498, r1495, r1442; +} +{ +add.f16x2 r1501, r1492, r1498; +} +{ +sub.f16x2 r1504, r1486, r1501; +} +{ +add.f16x2 r1507, r1448, r1449; +} +{ +mul.f16x2 r1510, r1507, r1439; +} +{ +add.f16x2 r1513, r1451, r1510; +} +{ +add.f16x2 r1516, r1454, r1455; +} +{ +mul.f16x2 r1519, r1516, r1441; +} +{ +add.f16x2 r1522, r1513, r1519; +} +{ +sub.f16x2 r1525, r1460, r1461; +} +{ +mul.f16x2 r1528, r1525, r1440; +} +{ +sub.f16x2 r1531, r1466, r1467; +} +{ +mul.f16x2 r1534, r1531, r1442; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +add.f16x2 r1540, r1522, r1537; +} +{ +add.f16x2 r1543, r1448, r1449; +} +{ +mul.f16x2 r1546, r1543, r1441; +} +{ +add.f16x2 r1549, r1451, r1546; +} +{ +add.f16x2 r1552, r1454, r1455; +} +{ +mul.f16x2 r1555, r1552, r1443; +} +{ +add.f16x2 r1558, r1549, r1555; +} +{ +sub.f16x2 r1561, r1460, r1461; +} +{ +mul.f16x2 r1564, r1561, r1442; +} +{ +sub.f16x2 r1567, r1466, r1467; +} +{ +mul.f16x2 r1570, r1567, r1445; +} +{ +add.f16x2 r1573, r1564, r1570; +} +{ +sub.f16x2 r1576, r1558, r1573; +} +{ +add.f16x2 r1579, r1448, r1449; +} +{ +mul.f16x2 r1582, r1579, r1441; +} +{ +add.f16x2 r1585, r1451, r1582; +} +{ +add.f16x2 r1588, r1454, r1455; +} +{ +mul.f16x2 r1591, r1588, r1443; +} +{ +add.f16x2 r1594, r1585, r1591; +} +{ +sub.f16x2 r1597, r1460, r1461; +} +{ +mul.f16x2 r1600, r1597, r1442; +} +{ +sub.f16x2 r1603, 
r1466, r1467; +} +{ +mul.f16x2 r1606, r1603, r1445; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +add.f16x2 r1612, r1594, r1609; +} +{ +add.f16x2 r1615, r1460, r1461; +} +{ +mul.f16x2 r1618, r1615, r1439; +} +{ +add.f16x2 r1621, r1463, r1618; +} +{ +add.f16x2 r1624, r1466, r1467; +} +{ +mul.f16x2 r1627, r1624, r1441; +} +{ +add.f16x2 r1630, r1621, r1627; +} +{ +sub.f16x2 r1633, r1448, r1449; +} +{ +mul.f16x2 r1636, r1633, r1440; +} +{ +sub.f16x2 r1639, r1454, r1455; +} +{ +mul.f16x2 r1642, r1639, r1442; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +add.f16x2 r1648, r1630, r1645; +} +{ +add.f16x2 r1651, r1460, r1461; +} +{ +mul.f16x2 r1654, r1651, r1439; +} +{ +add.f16x2 r1657, r1463, r1654; +} +{ +add.f16x2 r1660, r1466, r1467; +} +{ +mul.f16x2 r1663, r1660, r1441; +} +{ +add.f16x2 r1666, r1657, r1663; +} +{ +sub.f16x2 r1669, r1448, r1449; +} +{ +mul.f16x2 r1672, r1669, r1440; +} +{ +sub.f16x2 r1675, r1454, r1455; +} +{ +mul.f16x2 r1678, r1675, r1442; +} +{ +add.f16x2 r1681, r1672, r1678; +} +{ +sub.f16x2 r1684, r1666, r1681; +} +{ +add.f16x2 r1687, r1460, r1461; +} +{ +mul.f16x2 r1690, r1687, r1441; +} +{ +add.f16x2 r1693, r1463, r1690; +} +{ +add.f16x2 r1696, r1466, r1467; +} +{ +mul.f16x2 r1699, r1696, r1443; +} +{ +add.f16x2 r1702, r1693, r1699; +} +{ +sub.f16x2 r1705, r1448, r1449; +} +{ +mul.f16x2 r1708, r1705, r1442; +} +{ +sub.f16x2 r1711, r1454, r1455; +} +{ +mul.f16x2 r1714, r1711, r1445; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +add.f16x2 r1720, r1702, r1717; +} +{ +add.f16x2 r1723, r1460, r1461; +} +{ +mul.f16x2 r1726, r1723, r1441; +} +{ +add.f16x2 r1729, r1463, r1726; +} +{ +add.f16x2 r1732, r1466, r1467; +} +{ +mul.f16x2 r1735, r1732, r1443; +} +{ +add.f16x2 r1738, r1729, r1735; +} +{ +sub.f16x2 r1741, r1448, r1449; +} +{ +mul.f16x2 r1744, r1741, r1442; +} +{ +sub.f16x2 r1747, r1454, r1455; +} +{ +mul.f16x2 r1750, r1747, r1445; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, r1738, r1753; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1759, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1760, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1761, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1766, {low, high}; +} +{ +mul.f16x2 r1777, r1504, r1759; +} +{ +mul.f16x2 r1780, r1648, r1760; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1504, r1760; +} +{ +fma.rn.f16x2 r1789, r1648, r1759, r1786; +} +{ +mul.f16x2 r1793, r1576, r1761; +} +{ +mul.f16x2 r1796, r1720, r1762; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1576, r1762; +} +{ +fma.rn.f16x2 r1805, r1720, r1761, r1802; +} +{ +mul.f16x2 r1809, r1612, r1763; +} +{ +mul.f16x2 r1812, r1756, r1764; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1612, r1764; +} +{ +fma.rn.f16x2 r1821, r1756, r1763, r1818; +} +{ +mul.f16x2 r1825, r1540, r1765; +} +{ +mul.f16x2 r1828, r1684, r1766; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1540, r1766; +} +{ +fma.rn.f16x2 r1837, r1684, r1765, r1834; +} +{ +add.f16x2 %0, r1136, r1456; +} +{ +add.f16x2 %1, r1148, r1468; +} +{ +sub.f16x2 %10, r1136, r1456; +} +{ +sub.f16x2 %11, r1148, r1468; +} +{ +add.f16x2 %2, r1184, r1783; +} +{ +add.f16x2 %3, r1328, r1789; +} +{ +sub.f16x2 %12, r1184, r1783; +} +{ +sub.f16x2 %13, r1328, r1789; +} +{ +add.f16x2 %4, r1256, r1799; +} +{ +add.f16x2 %5, r1400, 
r1805; +} +{ +sub.f16x2 %14, r1256, r1799; +} +{ +sub.f16x2 %15, r1400, r1805; +} +{ +add.f16x2 %6, r1292, r1815; +} +{ +add.f16x2 %7, r1436, r1821; +} +{ +sub.f16x2 %16, r1292, r1815; +} +{ +sub.f16x2 %17, r1436, r1821; +} +{ +add.f16x2 %8, r1220, r1831; +} +{ +add.f16x2 %9, r1364, r1837; +} +{ +sub.f16x2 %18, r1220, r1831; +} +{ +sub.f16x2 %19, r1364, r1837; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..b911667974629 --- /dev/null 
+++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp16_inv.hpp.inc @@ -0,0 +1,4600 @@ +#ifndef CUFFTDX_FFT_100_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_100_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1140, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<146>; +.reg .b32 r<1922>; +.reg .b64 rd<5>; +mov.u32 r1909, %tid.y; +shl.b32 r1910, r1909, 1; +mov.u32 r1911, %20; +mad.lo.s32 r1912, r1910, 400, r1911; +mov.u32 r1913, %tid.x; +mov.f32 f112, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1, {low, high}; +} +mov.f32 f106, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f120, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r5, {low, high}; +} +mov.f32 f102, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 
r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, 
r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ 
+mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, %39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, %40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 
r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f108, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r645, {low, high}; +} +mov.f32 f122, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r647, {low, high}; +} +mov.f32 f118, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r648, {low, high}; +} +mov.f32 f116, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, 
f120; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r652, {low, high}; +} +mov.f32 f79, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r1913, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1914, rd3; +mul.lo.s32 r1915, r1914, 10; +sub.s32 r1916, r1913, r1915; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1917, rd4; +and.b32 r1918, r1917, 1073741822; +mad.lo.s32 r1919, r1918, 400, r1912; +cvt.rn.f32.u32 f143, r1916; +mul.f32 f144, f143, 0f3D80ADFD; +cos.approx.f32 f61, f144; +sin.approx.f32 f145, f144; +neg.f32 
f62, f145; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f80, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r851, {low, high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, 
r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r1920, r1916, 80, r1919; +st.shared.v2.f32 [r1920], {r727, r730}; +st.shared.v2.f32 [r1920+8], {r797, r806}; +st.shared.v2.f32 [r1920+16], {r834, r843}; +st.shared.v2.f32 [r1920+24], {r871, r880}; +st.shared.v2.f32 [r1920+32], {r908, r917}; +st.shared.v2.f32 [r1920+40], {r945, r954}; +st.shared.v2.f32 [r1920+48], {r982, r991}; +st.shared.v2.f32 [r1920+56], {r1019, r1028}; +st.shared.v2.f32 [r1920+64], {r1056, r1065}; +st.shared.v2.f32 [r1920+72], {r1093, r1102}; +barrier.sync 0; +mad.lo.s32 r1921, r1916, -72, r1920; +ld.shared.u32 r1137, [r1921]; +ld.shared.u32 r1149, [r1921+4]; +ld.shared.u32 r1459, [r1921+80]; +ld.shared.u32 r1471, [r1921+84]; +ld.shared.u32 r1134, [r1921+160]; +ld.shared.u32 r1146, [r1921+164]; +ld.shared.u32 
r1456, [r1921+240]; +ld.shared.u32 r1468, [r1921+244]; +ld.shared.u32 r1140, [r1921+320]; +ld.shared.u32 r1152, [r1921+324]; +ld.shared.u32 r1462, [r1921+400]; +ld.shared.u32 r1474, [r1921+404]; +ld.shared.u32 r1141, [r1921+480]; +ld.shared.u32 r1153, [r1921+484]; +ld.shared.u32 r1463, [r1921+560]; +ld.shared.u32 r1475, [r1921+564]; +ld.shared.u32 r1135, [r1921+640]; +ld.shared.u32 r1147, [r1921+644]; +ld.shared.u32 r1457, [r1921+720]; +ld.shared.u32 r1469, [r1921+724]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ +add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, 
r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1129; +} +{ +sub.f16x2 r1253, r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 
r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 r1409, r1146, r1147; +} +{ +mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, 
r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, r1515, r1445; +} +{ +add.f16x2 r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ 
+add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ +sub.f16x2 r1677, r1456, r1457; +} +{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} +{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ +fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ +fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 %0, r1142, r1464; +} +{ +add.f16x2 %1, r1154, r1476; +} +{ +sub.f16x2 %10, r1142, r1464; +} +{ +sub.f16x2 %11, r1154, r1476; +} +{ +add.f16x2 %2, r1190, r1791; +} +{ +add.f16x2 %3, r1334, r1797; +} +{ +sub.f16x2 %12, r1190, r1791; +} +{ +sub.f16x2 %13, r1334, r1797; +} +{ +add.f16x2 %4, r1262, r1807; +} +{ +add.f16x2 %5, r1406, r1813; +} +{ +sub.f16x2 %14, r1262, r1807; +} +{ +sub.f16x2 %15, r1406, r1813; +} +{ +add.f16x2 
%6, r1298, r1823; +} +{ +add.f16x2 %7, r1442, r1829; +} +{ +sub.f16x2 %16, r1298, r1823; +} +{ +sub.f16x2 %17, r1442, r1829; +} +{ +add.f16x2 %8, r1226, r1839; +} +{ +add.f16x2 %9, r1370, r1845; +} +{ +sub.f16x2 %18, r1226, r1839; +} +{ +sub.f16x2 %19, r1370, r1845; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1139, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<146>; +.reg .b32 r<1919>; +.reg .b64 rd<4>; +mov.u32 r1909, %tid.y; +mov.u32 r1910, %20; +mad.lo.s32 r1911, r1909, 400, r1910; +mov.u32 r1912, %tid.x; +mov.f32 f112, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; 
+mov.b32 r1, {low, high}; +} +mov.f32 f106, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f120, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r5, {low, high}; +} +mov.f32 f102, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %25, %37; +} +{ +add.f16x2 r14, %21, r11; +} +{ +add.f16x2 r17, %29, %33; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %26, %38; +} +{ +add.f16x2 r26, %22, r23; +} +{ +add.f16x2 r29, %30, %34; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %25, %37; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %21, r38; +} +{ +add.f16x2 r44, %29, %33; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %26, %38; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %30, %34; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %25, %37; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %21, r74; +} +{ +add.f16x2 r80, %29, %33; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %26, %38; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %30, %34; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %25, %37; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %21, r110; +} +{ +add.f16x2 r116, %29, %33; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %26, %38; +} +{ +mul.f16x2 r128, r125, r7; +} +{ 
+sub.f16x2 r131, %30, %34; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %25, %37; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %21, r146; +} +{ +add.f16x2 r152, %29, %33; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %26, %38; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %30, %34; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %26, %38; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %22, r182; +} +{ +add.f16x2 r188, %30, %34; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %25, %37; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %29, %33; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %26, %38; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %22, r218; +} +{ +add.f16x2 r224, %30, %34; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %25, %37; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %29, %33; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %26, %38; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %22, r254; +} +{ +add.f16x2 r260, %30, %34; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %25, %37; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %29, %33; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %26, %38; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %22, r290; +} +{ +add.f16x2 r296, %30, %34; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %25, %37; +} +{ +mul.f16x2 
r308, r305, r7; +} +{ +sub.f16x2 r311, %29, %33; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %27, %39; +} +{ +add.f16x2 r336, %23, r333; +} +{ +add.f16x2 r339, %31, %35; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %28, %40; +} +{ +add.f16x2 r348, %24, r345; +} +{ +add.f16x2 r351, %32, %36; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %27, %39; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %23, r360; +} +{ +add.f16x2 r366, %31, %35; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %28, %40; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %32, %36; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %27, %39; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %23, r396; +} +{ +add.f16x2 r402, %31, %35; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %28, %40; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %32, %36; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %27, 
%39; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %23, r432; +} +{ +add.f16x2 r438, %31, %35; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %28, %40; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %32, %36; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %27, %39; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %23, r468; +} +{ +add.f16x2 r474, %31, %35; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %28, %40; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %32, %36; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %28, %40; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %24, r504; +} +{ +add.f16x2 r510, %32, %36; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %27, %39; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %31, %35; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %28, %40; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %24, r540; +} +{ +add.f16x2 r546, %32, %36; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %27, %39; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %31, %35; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %28, %40; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %24, r576; +} +{ +add.f16x2 r582, %32, %36; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %27, %39; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %31, %35; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ 
+add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %28, %40; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %24, r612; +} +{ +add.f16x2 r618, %32, %36; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %27, %39; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %31, %35; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +mov.f32 f108, 0f3F4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r645, {low, high}; +} +mov.f32 f122, 0f3F167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r647, {low, high}; +} +mov.f32 f118, 0f3F737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r648, {low, high}; +} +mov.f32 f116, 0fBE9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r652, {low, high}; +} +mov.f32 f79, 0fBF800000; +{ +mul.f16x2 r663, r390, r645; +} +{ +mul.f16x2 r666, r534, r646; +} +{ +sub.f16x2 r669, r663, r666; +} +{ +mul.f16x2 r672, r390, r646; +} +{ +fma.rn.f16x2 r675, r534, r645, r672; +} +{ +mul.f16x2 r679, r462, r647; +} +{ +mul.f16x2 r682, r606, r648; +} +{ +sub.f16x2 r685, r679, r682; +} +{ +mul.f16x2 r688, r462, r648; +} +{ +fma.rn.f16x2 r691, r606, r647, r688; +} +{ +mul.f16x2 r695, r498, r649; +} +{ +mul.f16x2 r698, r642, r650; +} +{ +sub.f16x2 r701, r695, r698; +} +{ +mul.f16x2 r704, r498, r650; +} +{ +fma.rn.f16x2 r707, 
r642, r649, r704; +} +{ +mul.f16x2 r711, r426, r651; +} +{ +mul.f16x2 r714, r570, r652; +} +{ +sub.f16x2 r717, r711, r714; +} +{ +mul.f16x2 r720, r426, r652; +} +{ +fma.rn.f16x2 r723, r570, r651, r720; +} +{ +add.f16x2 r727, r20, r342; +} +{ +add.f16x2 r730, r32, r354; +} +{ +sub.f16x2 r733, r20, r342; +} +{ +sub.f16x2 r736, r32, r354; +} +{ +add.f16x2 r739, r68, r669; +} +{ +add.f16x2 r742, r212, r675; +} +{ +sub.f16x2 r745, r68, r669; +} +{ +sub.f16x2 r748, r212, r675; +} +{ +add.f16x2 r751, r140, r685; +} +{ +add.f16x2 r754, r284, r691; +} +{ +sub.f16x2 r757, r140, r685; +} +{ +sub.f16x2 r760, r284, r691; +} +{ +add.f16x2 r763, r176, r701; +} +{ +add.f16x2 r766, r320, r707; +} +{ +sub.f16x2 r769, r176, r701; +} +{ +sub.f16x2 r772, r320, r707; +} +{ +add.f16x2 r775, r104, r717; +} +{ +add.f16x2 r778, r248, r723; +} +{ +sub.f16x2 r781, r104, r717; +} +{ +sub.f16x2 r784, r248, r723; +} +mul.wide.u32 rd2, r1912, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1913, rd3; +mul.lo.s32 r1914, r1913, 10; +sub.s32 r1915, r1912, r1914; +mad.lo.s32 r1916, r1913, 400, r1911; +cvt.rn.f32.u32 f143, r1915; +mul.f32 f144, f143, 0f3D80ADFD; +cos.approx.f32 f61, f144; +sin.approx.f32 f145, f144; +neg.f32 f62, f145; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r787, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r792, {high, high}; +} +{ +mul.f16x2 r794, r742, r792; +} +{ +fma.rn.f16x2 r797, r739, r790, r794; +} +{ +mul.f16x2 r801, r739, r792; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r742, r790, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r812, {high, high}; +} +mov.f32 f80, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r814, {low, high}; +} +{ 
+mul.f16x2 r815, r812, r814; +} +{ +mul.f16x2 r818, r787, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r821, {high, low}; +} +{ +fma.rn.f16x2 r823, r815, r821, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r829, {high, high}; +} +{ +mul.f16x2 r831, r754, r829; +} +{ +fma.rn.f16x2 r834, r751, r827, r831; +} +{ +mul.f16x2 r838, r751, r829; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r754, r827, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r851, {low, high}; +} +{ +mul.f16x2 r852, r849, r851; +} +{ +mul.f16x2 r855, r823, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r823; +mov.b32 r858, {high, low}; +} +{ +fma.rn.f16x2 r860, r852, r858, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r766, r866; +} +{ +fma.rn.f16x2 r871, r763, r864, r868; +} +{ +mul.f16x2 r875, r763, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r766, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r860, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r860; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 
r903, {high, high}; +} +{ +mul.f16x2 r905, r778, r903; +} +{ +fma.rn.f16x2 r908, r775, r901, r905; +} +{ +mul.f16x2 r912, r775, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r778, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r736, r940; +} +{ +fma.rn.f16x2 r945, r733, r938, r942; +} +{ +mul.f16x2 r949, r733, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r736, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r748, r977; +} +{ +fma.rn.f16x2 r982, r745, r975, r979; +} +{ +mul.f16x2 r986, r745, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r748, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r787; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r760, r1014; +} +{ +fma.rn.f16x2 r1019, r757, r1012, r1016; +} +{ +mul.f16x2 r1023, r757, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r760, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r772, r1051; +} +{ +fma.rn.f16x2 r1056, r769, r1049, r1053; +} +{ +mul.f16x2 r1060, r769, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r772, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r787; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f79; +cvt.rn.f16.f32 high, f80; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; 
+mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r784, r1088; +} +{ +fma.rn.f16x2 r1093, r781, r1086, r1090; +} +{ +mul.f16x2 r1097, r781, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r784, r1086, r1100; +} +barrier.sync 0; +mad.lo.s32 r1917, r1915, 40, r1916; +st.shared.v2.f32 [r1917], {r727, r797}; +st.shared.v2.f32 [r1917+8], {r834, r871}; +st.shared.v2.f32 [r1917+16], {r908, r945}; +st.shared.v2.f32 [r1917+24], {r982, r1019}; +st.shared.v2.f32 [r1917+32], {r1056, r1093}; +barrier.sync 0; +mad.lo.s32 r1918, r1915, -36, r1917; +ld.shared.u32 r1137, [r1918]; +ld.shared.u32 r1459, [r1918+40]; +ld.shared.u32 r1134, [r1918+80]; +ld.shared.u32 r1456, [r1918+120]; +ld.shared.u32 r1140, [r1918+160]; +ld.shared.u32 r1462, [r1918+200]; +ld.shared.u32 r1141, [r1918+240]; +ld.shared.u32 r1463, [r1918+280]; +ld.shared.u32 r1135, [r1918+320]; +ld.shared.u32 r1457, [r1918+360]; +barrier.sync 0; +st.shared.v2.f32 [r1917], {r730, r806}; +st.shared.v2.f32 [r1917+8], {r843, r880}; +st.shared.v2.f32 [r1917+16], {r917, r954}; +st.shared.v2.f32 [r1917+24], {r991, r1028}; +st.shared.v2.f32 [r1917+32], {r1065, r1102}; +barrier.sync 0; +ld.shared.u32 r1149, [r1918]; +ld.shared.u32 r1471, [r1918+40]; +ld.shared.u32 r1146, [r1918+80]; +ld.shared.u32 r1468, [r1918+120]; +ld.shared.u32 r1152, [r1918+160]; +ld.shared.u32 r1474, [r1918+200]; +ld.shared.u32 r1153, [r1918+240]; +ld.shared.u32 r1475, [r1918+280]; +ld.shared.u32 r1147, [r1918+320]; +ld.shared.u32 r1469, [r1918+360]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1124, {low, high}; +} +{ +neg.f16x2 r1125, r1124; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1128, {low, high}; +} +{ +neg.f16x2 r1129, r1128; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1132, {low, high}; +} +{ +add.f16x2 r1133, r1134, r1135; +} +{ +add.f16x2 r1136, r1137, r1133; +} +{ +add.f16x2 r1139, r1140, r1141; +} +{ +add.f16x2 r1142, r1136, r1139; +} +{ +add.f16x2 r1145, r1146, r1147; +} +{ +add.f16x2 r1148, r1149, r1145; +} +{ +add.f16x2 r1151, r1152, r1153; +} +{ +add.f16x2 r1154, r1148, r1151; +} +{ +add.f16x2 r1157, r1134, r1135; +} +{ +mul.f16x2 r1160, r1157, r1123; +} +{ +add.f16x2 r1163, r1137, r1160; +} +{ +add.f16x2 r1166, r1140, r1141; +} +{ +mul.f16x2 r1169, r1166, r1127; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, r1146, r1147; +} +{ +mul.f16x2 r1178, r1175, r1125; +} +{ +sub.f16x2 r1181, r1152, r1153; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, r1134, r1135; +} +{ +mul.f16x2 r1196, r1193, r1123; +} +{ +add.f16x2 r1199, r1137, r1196; +} +{ +add.f16x2 r1202, r1140, r1141; +} +{ +mul.f16x2 r1205, r1202, r1127; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, r1146, r1147; +} +{ +mul.f16x2 r1214, r1211, r1125; +} +{ +sub.f16x2 r1217, r1152, r1153; +} +{ +mul.f16x2 r1220, r1217, r1129; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, r1134, r1135; +} +{ +mul.f16x2 r1232, r1229, r1127; +} +{ +add.f16x2 r1235, r1137, r1232; +} +{ +add.f16x2 r1238, r1140, r1141; +} +{ +mul.f16x2 r1241, r1238, r1131; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, r1146, r1147; +} +{ +mul.f16x2 r1250, r1247, r1129; +} +{ +sub.f16x2 r1253, 
r1152, r1153; +} +{ +mul.f16x2 r1256, r1253, r1132; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +sub.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, r1134, r1135; +} +{ +mul.f16x2 r1268, r1265, r1127; +} +{ +add.f16x2 r1271, r1137, r1268; +} +{ +add.f16x2 r1274, r1140, r1141; +} +{ +mul.f16x2 r1277, r1274, r1131; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, r1146, r1147; +} +{ +mul.f16x2 r1286, r1283, r1129; +} +{ +sub.f16x2 r1289, r1152, r1153; +} +{ +mul.f16x2 r1292, r1289, r1132; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +add.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, r1146, r1147; +} +{ +mul.f16x2 r1304, r1301, r1123; +} +{ +add.f16x2 r1307, r1149, r1304; +} +{ +add.f16x2 r1310, r1152, r1153; +} +{ +mul.f16x2 r1313, r1310, r1127; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, r1134, r1135; +} +{ +mul.f16x2 r1322, r1319, r1125; +} +{ +sub.f16x2 r1325, r1140, r1141; +} +{ +mul.f16x2 r1328, r1325, r1129; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, r1146, r1147; +} +{ +mul.f16x2 r1340, r1337, r1123; +} +{ +add.f16x2 r1343, r1149, r1340; +} +{ +add.f16x2 r1346, r1152, r1153; +} +{ +mul.f16x2 r1349, r1346, r1127; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, r1134, r1135; +} +{ +mul.f16x2 r1358, r1355, r1125; +} +{ +sub.f16x2 r1361, r1140, r1141; +} +{ +mul.f16x2 r1364, r1361, r1129; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +add.f16x2 r1373, r1146, r1147; +} +{ +mul.f16x2 r1376, r1373, r1127; +} +{ +add.f16x2 r1379, r1149, r1376; +} +{ +add.f16x2 r1382, r1152, r1153; +} +{ +mul.f16x2 r1385, r1382, r1131; +} +{ +add.f16x2 r1388, r1379, r1385; +} +{ +sub.f16x2 r1391, r1134, r1135; +} +{ +mul.f16x2 r1394, r1391, r1129; +} +{ +sub.f16x2 r1397, r1140, r1141; +} +{ +mul.f16x2 r1400, r1397, r1132; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +add.f16x2 r1406, r1388, r1403; +} +{ +add.f16x2 r1409, r1146, r1147; +} +{ 
+mul.f16x2 r1412, r1409, r1127; +} +{ +add.f16x2 r1415, r1149, r1412; +} +{ +add.f16x2 r1418, r1152, r1153; +} +{ +mul.f16x2 r1421, r1418, r1131; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +sub.f16x2 r1427, r1134, r1135; +} +{ +mul.f16x2 r1430, r1427, r1129; +} +{ +sub.f16x2 r1433, r1140, r1141; +} +{ +mul.f16x2 r1436, r1433, r1132; +} +{ +add.f16x2 r1439, r1430, r1436; +} +{ +sub.f16x2 r1442, r1424, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1446, {low, high}; +} +{ +neg.f16x2 r1447, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1450, {low, high}; +} +{ +neg.f16x2 r1451, r1450; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1454, {low, high}; +} +{ +add.f16x2 r1455, r1456, r1457; +} +{ +add.f16x2 r1458, r1459, r1455; +} +{ +add.f16x2 r1461, r1462, r1463; +} +{ +add.f16x2 r1464, r1458, r1461; +} +{ +add.f16x2 r1467, r1468, r1469; +} +{ +add.f16x2 r1470, r1471, r1467; +} +{ +add.f16x2 r1473, r1474, r1475; +} +{ +add.f16x2 r1476, r1470, r1473; +} +{ +add.f16x2 r1479, r1456, r1457; +} +{ +mul.f16x2 r1482, r1479, r1445; +} +{ +add.f16x2 r1485, r1459, r1482; +} +{ +add.f16x2 r1488, r1462, r1463; +} +{ +mul.f16x2 r1491, r1488, r1449; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, r1468, r1469; +} +{ +mul.f16x2 r1500, r1497, r1447; +} +{ +sub.f16x2 r1503, r1474, r1475; +} +{ +mul.f16x2 r1506, r1503, r1451; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +sub.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, r1456, r1457; +} +{ +mul.f16x2 r1518, r1515, r1445; +} +{ +add.f16x2 
r1521, r1459, r1518; +} +{ +add.f16x2 r1524, r1462, r1463; +} +{ +mul.f16x2 r1527, r1524, r1449; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, r1468, r1469; +} +{ +mul.f16x2 r1536, r1533, r1447; +} +{ +sub.f16x2 r1539, r1474, r1475; +} +{ +mul.f16x2 r1542, r1539, r1451; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +add.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, r1456, r1457; +} +{ +mul.f16x2 r1554, r1551, r1449; +} +{ +add.f16x2 r1557, r1459, r1554; +} +{ +add.f16x2 r1560, r1462, r1463; +} +{ +mul.f16x2 r1563, r1560, r1453; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, r1468, r1469; +} +{ +mul.f16x2 r1572, r1569, r1451; +} +{ +sub.f16x2 r1575, r1474, r1475; +} +{ +mul.f16x2 r1578, r1575, r1454; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +sub.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, r1456, r1457; +} +{ +mul.f16x2 r1590, r1587, r1449; +} +{ +add.f16x2 r1593, r1459, r1590; +} +{ +add.f16x2 r1596, r1462, r1463; +} +{ +mul.f16x2 r1599, r1596, r1453; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, r1468, r1469; +} +{ +mul.f16x2 r1608, r1605, r1451; +} +{ +sub.f16x2 r1611, r1474, r1475; +} +{ +mul.f16x2 r1614, r1611, r1454; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +add.f16x2 r1620, r1602, r1617; +} +{ +add.f16x2 r1623, r1468, r1469; +} +{ +mul.f16x2 r1626, r1623, r1445; +} +{ +add.f16x2 r1629, r1471, r1626; +} +{ +add.f16x2 r1632, r1474, r1475; +} +{ +mul.f16x2 r1635, r1632, r1449; +} +{ +add.f16x2 r1638, r1629, r1635; +} +{ +sub.f16x2 r1641, r1456, r1457; +} +{ +mul.f16x2 r1644, r1641, r1447; +} +{ +sub.f16x2 r1647, r1462, r1463; +} +{ +mul.f16x2 r1650, r1647, r1451; +} +{ +add.f16x2 r1653, r1644, r1650; +} +{ +add.f16x2 r1656, r1638, r1653; +} +{ +add.f16x2 r1659, r1468, r1469; +} +{ +mul.f16x2 r1662, r1659, r1445; +} +{ +add.f16x2 r1665, r1471, r1662; +} +{ +add.f16x2 r1668, r1474, r1475; +} +{ +mul.f16x2 r1671, r1668, r1449; +} +{ +add.f16x2 r1674, r1665, r1671; +} +{ +sub.f16x2 r1677, r1456, r1457; +} 
+{ +mul.f16x2 r1680, r1677, r1447; +} +{ +sub.f16x2 r1683, r1462, r1463; +} +{ +mul.f16x2 r1686, r1683, r1451; +} +{ +add.f16x2 r1689, r1680, r1686; +} +{ +sub.f16x2 r1692, r1674, r1689; +} +{ +add.f16x2 r1695, r1468, r1469; +} +{ +mul.f16x2 r1698, r1695, r1449; +} +{ +add.f16x2 r1701, r1471, r1698; +} +{ +add.f16x2 r1704, r1474, r1475; +} +{ +mul.f16x2 r1707, r1704, r1453; +} +{ +add.f16x2 r1710, r1701, r1707; +} +{ +sub.f16x2 r1713, r1456, r1457; +} +{ +mul.f16x2 r1716, r1713, r1451; +} +{ +sub.f16x2 r1719, r1462, r1463; +} +{ +mul.f16x2 r1722, r1719, r1454; +} +{ +add.f16x2 r1725, r1716, r1722; +} +{ +add.f16x2 r1728, r1710, r1725; +} +{ +add.f16x2 r1731, r1468, r1469; +} +{ +mul.f16x2 r1734, r1731, r1449; +} +{ +add.f16x2 r1737, r1471, r1734; +} +{ +add.f16x2 r1740, r1474, r1475; +} +{ +mul.f16x2 r1743, r1740, r1453; +} +{ +add.f16x2 r1746, r1737, r1743; +} +{ +sub.f16x2 r1749, r1456, r1457; +} +{ +mul.f16x2 r1752, r1749, r1451; +} +{ +sub.f16x2 r1755, r1462, r1463; +} +{ +mul.f16x2 r1758, r1755, r1454; +} +{ +add.f16x2 r1761, r1752, r1758; +} +{ +sub.f16x2 r1764, r1746, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f112; +cvt.rn.f16.f32 high, f112; +mov.b32 r1769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f118; +cvt.rn.f16.f32 high, f118; +mov.b32 r1772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f120; +cvt.rn.f16.f32 high, f120; +mov.b32 r1773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1774, {low, high}; +} 
+{ +mul.f16x2 r1785, r1512, r1767; +} +{ +mul.f16x2 r1788, r1656, r1768; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1512, r1768; +} +{ +fma.rn.f16x2 r1797, r1656, r1767, r1794; +} +{ +mul.f16x2 r1801, r1584, r1769; +} +{ +mul.f16x2 r1804, r1728, r1770; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1584, r1770; +} +{ +fma.rn.f16x2 r1813, r1728, r1769, r1810; +} +{ +mul.f16x2 r1817, r1620, r1771; +} +{ +mul.f16x2 r1820, r1764, r1772; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1620, r1772; +} +{ +fma.rn.f16x2 r1829, r1764, r1771, r1826; +} +{ +mul.f16x2 r1833, r1548, r1773; +} +{ +mul.f16x2 r1836, r1692, r1774; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1548, r1774; +} +{ +fma.rn.f16x2 r1845, r1692, r1773, r1842; +} +{ +add.f16x2 %0, r1142, r1464; +} +{ +add.f16x2 %1, r1154, r1476; +} +{ +sub.f16x2 %10, r1142, r1464; +} +{ +sub.f16x2 %11, r1154, r1476; +} +{ +add.f16x2 %2, r1190, r1791; +} +{ +add.f16x2 %3, r1334, r1797; +} +{ +sub.f16x2 %12, r1190, r1791; +} +{ +sub.f16x2 %13, r1334, r1797; +} +{ +add.f16x2 %4, r1262, r1807; +} +{ +add.f16x2 %5, r1406, r1813; +} +{ +sub.f16x2 %14, r1262, r1807; +} +{ +sub.f16x2 %15, r1406, r1813; +} +{ +add.f16x2 %6, r1298, r1823; +} +{ +add.f16x2 %7, r1442, r1829; +} +{ +sub.f16x2 %16, r1298, r1823; +} +{ +sub.f16x2 %17, r1442, r1829; +} +{ +add.f16x2 %8, r1226, r1839; +} +{ +add.f16x2 %9, r1370, r1845; +} +{ +sub.f16x2 %18, r1226, r1839; +} +{ +sub.f16x2 %19, r1370, r1845; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), 
"=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..513800e27a77c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_fwd.hpp.inc @@ -0,0 +1,800 @@ +#ifndef CUFFTDX_FFT_100_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_100_FP32_FWD_PTX_HPP + + + +/* cufftdx_private_function<192, float, 1>: generated inline-PTX specialization that transforms ten complex values held in rmem[0..9] in place; smem is used as a 32-bit base address for st.shared/ld.shared. Steps visible in the asm: (1) butterflies on the twenty input registers using the float immediates 0f3E9E377A, 0f3F4F1BBD, 0f3F737871, 0f3F167918 (approx. 0.309, 0.809, 0.951, 0.588) and their negations; (2) one complex factor is loaded from lut_sp_10_100 at byte offset 8 * (tid.x % 10), and intermediate k (k = 1..9) is multiplied by the k-th power of that factor, the powers being built by repeated complex multiplication; (3) after barrier.sync 0 the ten results are stored as v2.f32 pairs at smem + 800 * tid.y + 800 * (tid.x / 10) + 80 * (tid.x % 10) + 8 * k, and after a second barrier.sync 0 they are re-read from smem + 800 * tid.y + 800 * (tid.x / 10) + 8 * (tid.x % 10) + 80 * k, i.e. the store and load index roles are swapped; (4) the same butterflies are applied to the re-read values and the results are written to the output operands, rmem[0..4] from the add.f32 group and rmem[5..9] from the sub.f32 group. The division by ten is done as a multiply by -858993459 (0xCCCCCCCD as u32) followed by a 35-bit right shift. NOTE(review): presumably the two radix-10 stages of a 100-point forward FFT, going by the header guard and LUT names -- confirm against the cuFFTDx generator. NOTE(review): because of the two barrier.sync 0 instructions this presumably has to be executed by every thread of the block -- confirm against the PTX barrier documentation. */template<> __forceinline__ __device__ void cufftdx_private_function<192, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<426>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 800, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %27, %43; +add.f32 f42, %22, f41; +add.f32 f43, %32, %38; +add.f32 f44, f43, f42; +add.f32 f45, %29, %45; +add.f32 f46, %23, f45; +add.f32 f47, %34, %39; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %22; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %29, %45; 
+mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %34, %39; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %22, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %23; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %27, %43; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %32, %38; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %23, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %30, %46; +add.f32 f86, %24, f85; +add.f32 f87, %35, %40; +add.f32 f88, f87, f86; +add.f32 f89, %31, %47; +add.f32 f90, %26, f89; +add.f32 f91, %37, %42; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %24; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %31, %47; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %37, %42; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %24, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %26; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %30, %46; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %35, %40; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %26, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, 
f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +sub.f32 f149, f44, f88; +sub.f32 f150, f48, f92; +add.f32 f151, f57, f131; +add.f32 f152, f75, f133; +sub.f32 f153, f57, f131; +sub.f32 f154, f75, f133; +add.f32 f155, f65, f136; +add.f32 f156, f83, f138; +sub.f32 f157, f65, f136; +sub.f32 f158, f83, f138; +add.f32 f159, f66, f141; +add.f32 f160, f84, f143; +sub.f32 f161, f66, f141; +sub.f32 f162, f84, f143; +add.f32 f163, f58, f146; +add.f32 f164, f76, f148; +sub.f32 f165, f58, f146; +sub.f32 f166, f76, f148; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 800, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f167, f168}, [rd6]; +mul.f32 f171, f167, f151; +mul.f32 f172, f168, f152; +mul.f32 f173, f167, f152; +mul.f32 f174, f167, f167; +mul.f32 f175, f168, f168; +sub.f32 f176, f174, f175; +mul.f32 f177, f168, f167; +fma.rn.f32 f178, f168, f167, f177; +mul.f32 f179, f176, f155; +mul.f32 f180, f178, f156; +mul.f32 f181, f176, f156; +mul.f32 f182, f167, f176; +mul.f32 f183, f168, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f167, f178; +fma.rn.f32 f186, f168, f176, 
f185; +mul.f32 f187, f184, f159; +mul.f32 f188, f186, f160; +mul.f32 f189, f184, f160; +mul.f32 f190, f167, f184; +mul.f32 f191, f168, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f167, f186; +fma.rn.f32 f194, f168, f184, f193; +mul.f32 f195, f192, f163; +mul.f32 f196, f194, f164; +mul.f32 f197, f192, f164; +mul.f32 f198, f167, f192; +mul.f32 f199, f168, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f167, f194; +fma.rn.f32 f202, f168, f192, f201; +mul.f32 f203, f200, f149; +mul.f32 f204, f202, f150; +mul.f32 f205, f200, f150; +mul.f32 f206, f167, f200; +mul.f32 f207, f168, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f167, f202; +fma.rn.f32 f210, f168, f200, f209; +mul.f32 f211, f208, f153; +mul.f32 f212, f210, f154; +mul.f32 f213, f208, f154; +mul.f32 f214, f167, f208; +mul.f32 f215, f168, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f167, f210; +fma.rn.f32 f218, f168, f208, f217; +mul.f32 f219, f216, f157; +mul.f32 f220, f218, f158; +mul.f32 f221, f216, f158; +mul.f32 f222, f167, f216; +mul.f32 f223, f168, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f167, f218; +fma.rn.f32 f226, f168, f216, f225; +mul.f32 f227, f224, f161; +mul.f32 f228, f226, f162; +mul.f32 f229, f224, f162; +mul.f32 f230, f167, f224; +mul.f32 f231, f168, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f167, f226; +fma.rn.f32 f234, f168, f224, f233; +mul.f32 f235, f232, f165; +mul.f32 f236, f234, f166; +mul.f32 f237, f232, f166; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f238, f48, f92; +add.f32 f239, f44, f88; +st.shared.v2.f32 [r9], {f239, f238}; +fma.rn.f32 f240, f168, f151, f173; +sub.f32 f241, f171, f172; +st.shared.v2.f32 [r9+8], {f241, f240}; +fma.rn.f32 f242, f178, f155, f181; +sub.f32 f243, f179, f180; +st.shared.v2.f32 [r9+16], {f243, f242}; +fma.rn.f32 f244, f186, f159, f189; +sub.f32 f245, f187, f188; +st.shared.v2.f32 [r9+24], {f245, f244}; +sub.f32 f246, f195, f196; +fma.rn.f32 f247, f194, f163, f197; +st.shared.v2.f32 [r9+32], {f246, f247}; +fma.rn.f32 
f248, f202, f149, f205; +sub.f32 f249, f203, f204; +st.shared.v2.f32 [r9+40], {f249, f248}; +fma.rn.f32 f250, f210, f153, f213; +sub.f32 f251, f211, f212; +st.shared.v2.f32 [r9+48], {f251, f250}; +fma.rn.f32 f252, f218, f157, f221; +sub.f32 f253, f219, f220; +st.shared.v2.f32 [r9+56], {f253, f252}; +fma.rn.f32 f254, f226, f161, f229; +sub.f32 f255, f227, f228; +st.shared.v2.f32 [r9+64], {f255, f254}; +fma.rn.f32 f256, f234, f165, f237; +sub.f32 f257, f235, f236; +st.shared.v2.f32 [r9+72], {f257, f256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f258, f259}, [r10]; +ld.shared.v2.f32 {f262, f263}, [r10+80]; +ld.shared.v2.f32 {f266, f267}, [r10+160]; +ld.shared.v2.f32 {f270, f271}, [r10+240]; +ld.shared.v2.f32 {f274, f275}, [r10+320]; +ld.shared.v2.f32 {f278, f279}, [r10+400]; +ld.shared.v2.f32 {f282, f283}, [r10+480]; +ld.shared.v2.f32 {f286, f287}, [r10+560]; +ld.shared.v2.f32 {f290, f291}, [r10+640]; +ld.shared.v2.f32 {f294, f295}, [r10+720]; +add.f32 f298, f266, f290; +add.f32 f299, f258, f298; +add.f32 f300, f274, f282; +add.f32 f301, f300, f299; +add.f32 f302, f267, f291; +add.f32 f303, f259, f302; +add.f32 f304, f275, f283; +add.f32 f305, f304, f303; +fma.rn.f32 f306, f298, 0f3E9E377A, f258; +mul.f32 f307, f300, 0f3F4F1BBD; +sub.f32 f308, f306, f307; +sub.f32 f309, f267, f291; +mul.f32 f310, f309, 0f3F737871; +sub.f32 f311, f275, f283; +mul.f32 f312, f311, 0fBF167918; +sub.f32 f313, f312, f310; +sub.f32 f314, f308, f313; +add.f32 f315, f313, f308; +mul.f32 f316, f298, 0f3F4F1BBD; +sub.f32 f317, f258, f316; +fma.rn.f32 f318, f300, 0f3E9E377A, f317; +mul.f32 f319, f309, 0f3F167918; +mul.f32 f320, f311, 0f3F737871; +sub.f32 f321, f320, f319; +sub.f32 f322, f318, f321; +add.f32 f323, f321, f318; +fma.rn.f32 f324, f302, 0f3E9E377A, f259; +mul.f32 f325, f304, 0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f266, f290; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f274, f282; +mul.f32 f330, f329, 0fBF167918; +sub.f32 f331, f330, f328; 
+add.f32 f332, f331, f326; +sub.f32 f333, f326, f331; +mul.f32 f334, f302, 0f3F4F1BBD; +sub.f32 f335, f259, f334; +fma.rn.f32 f336, f304, 0f3E9E377A, f335; +mul.f32 f337, f327, 0f3F167918; +mul.f32 f338, f329, 0f3F737871; +sub.f32 f339, f338, f337; +add.f32 f340, f339, f336; +sub.f32 f341, f336, f339; +add.f32 f342, f270, f294; +add.f32 f343, f262, f342; +add.f32 f344, f278, f286; +add.f32 f345, f344, f343; +add.f32 f346, f271, f295; +add.f32 f347, f263, f346; +add.f32 f348, f279, f287; +add.f32 f349, f348, f347; +fma.rn.f32 f350, f342, 0f3E9E377A, f262; +mul.f32 f351, f344, 0f3F4F1BBD; +sub.f32 f352, f350, f351; +sub.f32 f353, f271, f295; +mul.f32 f354, f353, 0f3F737871; +sub.f32 f355, f279, f287; +mul.f32 f356, f355, 0fBF167918; +sub.f32 f357, f356, f354; +sub.f32 f358, f352, f357; +add.f32 f359, f357, f352; +mul.f32 f360, f342, 0f3F4F1BBD; +sub.f32 f361, f262, f360; +fma.rn.f32 f362, f344, 0f3E9E377A, f361; +mul.f32 f363, f353, 0f3F167918; +mul.f32 f364, f355, 0f3F737871; +sub.f32 f365, f364, f363; +sub.f32 f366, f362, f365; +add.f32 f367, f365, f362; +fma.rn.f32 f368, f346, 0f3E9E377A, f263; +mul.f32 f369, f348, 0f3F4F1BBD; +sub.f32 f370, f368, f369; +sub.f32 f371, f270, f294; +mul.f32 f372, f371, 0f3F737871; +sub.f32 f373, f278, f286; +mul.f32 f374, f373, 0fBF167918; +sub.f32 f375, f374, f372; +add.f32 f376, f375, f370; +sub.f32 f377, f370, f375; +mul.f32 f378, f346, 0f3F4F1BBD; +sub.f32 f379, f263, f378; +fma.rn.f32 f380, f348, 0f3E9E377A, f379; +mul.f32 f381, f371, 0f3F167918; +mul.f32 f382, f373, 0f3F737871; +sub.f32 f383, f382, f381; +add.f32 f384, f383, f380; +sub.f32 f385, f380, f383; +mul.f32 f386, f358, 0f3F4F1BBD; +mul.f32 f387, f376, 0fBF167918; +sub.f32 f388, f386, f387; +mul.f32 f389, f376, 0f3F4F1BBD; +fma.rn.f32 f390, f358, 0fBF167918, f389; +mul.f32 f391, f366, 0f3E9E377A; +mul.f32 f392, f384, 0fBF737871; +sub.f32 f393, f391, f392; +mul.f32 f394, f384, 0f3E9E377A; +fma.rn.f32 f395, f366, 0fBF737871, f394; +mul.f32 f396, f367, 0fBE9E377A; 
+mul.f32 f397, f385, 0fBF737871; +sub.f32 f398, f396, f397; +mul.f32 f399, f385, 0fBE9E377A; +fma.rn.f32 f400, f367, 0fBF737871, f399; +mul.f32 f401, f359, 0fBF4F1BBD; +mul.f32 f402, f377, 0fBF167918; +sub.f32 f403, f401, f402; +mul.f32 f404, f377, 0fBF4F1BBD; +fma.rn.f32 f405, f359, 0fBF167918, f404; +add.f32 %1, f305, f349; +add.f32 %0, f301, f345; +add.f32 %3, f332, f390; +add.f32 %2, f314, f388; +add.f32 %5, f340, f395; +add.f32 %4, f322, f393; +add.f32 %7, f341, f400; +add.f32 %6, f323, f398; +add.f32 %9, f333, f405; +add.f32 %8, f315, f403; +sub.f32 %11, f305, f349; +sub.f32 %10, f301, f345; +sub.f32 %13, f332, f390; +sub.f32 %12, f314, f388; +sub.f32 %15, f340, f395; +sub.f32 %14, f322, f393; +sub.f32 %17, f341, f400; +sub.f32 %16, f323, f398; +sub.f32 %19, f333, f405; +sub.f32 %18, f315, f403; +})" + /* Operand numbering: %0-%19 are the twenty outputs listed first (rmem[0].x through rmem[9].y), %20 is smem, %21 is lut_sp_10_100, and the float inputs start at %22. Several .y inputs are deliberately listed twice, so input numbers do not map one-to-one onto rmem order: for example %25 and %26 are both rmem[1].y and the asm body only reads %26. Removing a duplicate would shift every later operand number used inside the asm string. */: "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<191, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<406>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 400, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %27, %43; +add.f32 f42, %22, f41; +add.f32 f43, %32, %38; 
+add.f32 f44, f43, f42; +add.f32 f45, %29, %45; +add.f32 f46, %23, f45; +add.f32 f47, %34, %39; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %22; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %29, %45; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %34, %39; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %22, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %23; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %27, %43; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %32, %38; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %23, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %30, %46; +add.f32 f86, %24, f85; +add.f32 f87, %35, %40; +add.f32 f88, f87, f86; +add.f32 f89, %31, %47; +add.f32 f90, %26, f89; +add.f32 f91, %37, %42; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %24; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %31, %47; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %37, %42; +mul.f32 f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %24, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %26; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %30, %46; +mul.f32 f115, f114, 
0f3F737871; +sub.f32 f116, %35, %40; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %26, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +add.f32 f149, f44, f88; +add.f32 f150, f48, f92; +sub.f32 f151, f44, f88; +sub.f32 f152, f48, f92; +add.f32 f153, f57, f131; +add.f32 f154, f75, f133; +sub.f32 f155, f57, f131; +sub.f32 f156, f75, f133; +add.f32 f157, f65, f136; +add.f32 f158, f83, f138; +sub.f32 f159, f65, f136; +sub.f32 f160, f83, f138; +add.f32 f161, f66, f141; +add.f32 f162, f84, f143; +sub.f32 f163, f66, f141; +sub.f32 f164, f84, f143; +add.f32 f165, f58, f146; +add.f32 f166, f76, f148; +sub.f32 f167, f58, f146; +sub.f32 f168, f76, f148; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f169, f170}, [rd6]; +mul.f32 f173, f169, f153; +mul.f32 f174, f170, f154; +sub.f32 f175, f173, f174; +mul.f32 f176, f169, f154; +fma.rn.f32 f177, f170, f153, f176; +mul.f32 f178, f169, f169; +mul.f32 
f179, f170, f170; +sub.f32 f180, f178, f179; +mul.f32 f181, f170, f169; +fma.rn.f32 f182, f170, f169, f181; +mul.f32 f183, f180, f157; +mul.f32 f184, f182, f158; +sub.f32 f185, f183, f184; +mul.f32 f186, f180, f158; +fma.rn.f32 f187, f182, f157, f186; +mul.f32 f188, f169, f180; +mul.f32 f189, f170, f182; +sub.f32 f190, f188, f189; +mul.f32 f191, f169, f182; +fma.rn.f32 f192, f170, f180, f191; +mul.f32 f193, f190, f161; +mul.f32 f194, f192, f162; +sub.f32 f195, f193, f194; +mul.f32 f196, f190, f162; +fma.rn.f32 f197, f192, f161, f196; +mul.f32 f198, f169, f190; +mul.f32 f199, f170, f192; +sub.f32 f200, f198, f199; +mul.f32 f201, f169, f192; +fma.rn.f32 f202, f170, f190, f201; +mul.f32 f203, f200, f165; +mul.f32 f204, f202, f166; +sub.f32 f205, f203, f204; +mul.f32 f206, f200, f166; +fma.rn.f32 f207, f202, f165, f206; +mul.f32 f208, f169, f200; +mul.f32 f209, f170, f202; +sub.f32 f210, f208, f209; +mul.f32 f211, f169, f202; +fma.rn.f32 f212, f170, f200, f211; +mul.f32 f213, f210, f151; +mul.f32 f214, f212, f152; +sub.f32 f215, f213, f214; +mul.f32 f216, f210, f152; +fma.rn.f32 f217, f212, f151, f216; +mul.f32 f218, f169, f210; +mul.f32 f219, f170, f212; +sub.f32 f220, f218, f219; +mul.f32 f221, f169, f212; +fma.rn.f32 f222, f170, f210, f221; +mul.f32 f223, f220, f155; +mul.f32 f224, f222, f156; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, f156; +fma.rn.f32 f227, f222, f155, f226; +mul.f32 f228, f169, f220; +mul.f32 f229, f170, f222; +sub.f32 f230, f228, f229; +mul.f32 f231, f169, f222; +fma.rn.f32 f232, f170, f220, f231; +mul.f32 f233, f230, f159; +mul.f32 f234, f232, f160; +sub.f32 f235, f233, f234; +mul.f32 f236, f230, f160; +fma.rn.f32 f237, f232, f159, f236; +mul.f32 f238, f169, f230; +mul.f32 f239, f170, f232; +sub.f32 f240, f238, f239; +mul.f32 f241, f169, f232; +fma.rn.f32 f242, f170, f230, f241; +mul.f32 f243, f240, f163; +mul.f32 f244, f242, f164; +sub.f32 f245, f243, f244; +mul.f32 f246, f240, f164; +fma.rn.f32 f247, f242, f163, f246; +mul.f32 f248, 
f169, f240; +mul.f32 f249, f170, f242; +sub.f32 f250, f248, f249; +mul.f32 f251, f169, f242; +fma.rn.f32 f252, f170, f240, f251; +mul.f32 f253, f250, f167; +mul.f32 f254, f252, f168; +sub.f32 f255, f253, f254; +mul.f32 f256, f250, f168; +fma.rn.f32 f257, f252, f167, f256; +mad.lo.s32 r8, r5, 400, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f149, f175}; +st.shared.v2.f32 [r9+8], {f185, f195}; +st.shared.v2.f32 [r9+16], {f205, f215}; +st.shared.v2.f32 [r9+24], {f225, f235}; +st.shared.v2.f32 [r9+32], {f245, f255}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f258, [r10]; +ld.shared.f32 f259, [r10+40]; +ld.shared.f32 f260, [r10+80]; +ld.shared.f32 f261, [r10+120]; +ld.shared.f32 f262, [r10+160]; +ld.shared.f32 f263, [r10+200]; +ld.shared.f32 f264, [r10+240]; +ld.shared.f32 f265, [r10+280]; +ld.shared.f32 f266, [r10+320]; +ld.shared.f32 f267, [r10+360]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f150, f177}; +st.shared.v2.f32 [r9+8], {f187, f197}; +st.shared.v2.f32 [r9+16], {f207, f217}; +st.shared.v2.f32 [r9+24], {f227, f237}; +st.shared.v2.f32 [r9+32], {f247, f257}; +barrier.sync 0; +ld.shared.f32 f268, [r10]; +ld.shared.f32 f269, [r10+40]; +ld.shared.f32 f270, [r10+80]; +ld.shared.f32 f271, [r10+120]; +ld.shared.f32 f272, [r10+160]; +ld.shared.f32 f273, [r10+200]; +ld.shared.f32 f274, [r10+240]; +ld.shared.f32 f275, [r10+280]; +ld.shared.f32 f276, [r10+320]; +ld.shared.f32 f277, [r10+360]; +add.f32 f278, f260, f266; +add.f32 f279, f258, f278; +add.f32 f280, f262, f264; +add.f32 f281, f280, f279; +add.f32 f282, f270, f276; +add.f32 f283, f268, f282; +add.f32 f284, f272, f274; +add.f32 f285, f284, f283; +fma.rn.f32 f286, f278, 0f3E9E377A, f258; +mul.f32 f287, f280, 0f3F4F1BBD; +sub.f32 f288, f286, f287; +sub.f32 f289, f270, f276; +mul.f32 f290, f289, 0f3F737871; +sub.f32 f291, f272, f274; +mul.f32 f292, f291, 0fBF167918; +sub.f32 f293, f292, f290; +sub.f32 f294, f288, f293; +add.f32 f295, f293, f288; +mul.f32 f296, f278, 
0f3F4F1BBD; +sub.f32 f297, f258, f296; +fma.rn.f32 f298, f280, 0f3E9E377A, f297; +mul.f32 f299, f289, 0f3F167918; +mul.f32 f300, f291, 0f3F737871; +sub.f32 f301, f300, f299; +sub.f32 f302, f298, f301; +add.f32 f303, f301, f298; +fma.rn.f32 f304, f282, 0f3E9E377A, f268; +mul.f32 f305, f284, 0f3F4F1BBD; +sub.f32 f306, f304, f305; +sub.f32 f307, f260, f266; +mul.f32 f308, f307, 0f3F737871; +sub.f32 f309, f262, f264; +mul.f32 f310, f309, 0fBF167918; +sub.f32 f311, f310, f308; +add.f32 f312, f311, f306; +sub.f32 f313, f306, f311; +mul.f32 f314, f282, 0f3F4F1BBD; +sub.f32 f315, f268, f314; +fma.rn.f32 f316, f284, 0f3E9E377A, f315; +mul.f32 f317, f307, 0f3F167918; +mul.f32 f318, f309, 0f3F737871; +sub.f32 f319, f318, f317; +add.f32 f320, f319, f316; +sub.f32 f321, f316, f319; +add.f32 f322, f261, f267; +add.f32 f323, f259, f322; +add.f32 f324, f263, f265; +add.f32 f325, f324, f323; +add.f32 f326, f271, f277; +add.f32 f327, f269, f326; +add.f32 f328, f273, f275; +add.f32 f329, f328, f327; +fma.rn.f32 f330, f322, 0f3E9E377A, f259; +mul.f32 f331, f324, 0f3F4F1BBD; +sub.f32 f332, f330, f331; +sub.f32 f333, f271, f277; +mul.f32 f334, f333, 0f3F737871; +sub.f32 f335, f273, f275; +mul.f32 f336, f335, 0fBF167918; +sub.f32 f337, f336, f334; +sub.f32 f338, f332, f337; +add.f32 f339, f337, f332; +mul.f32 f340, f322, 0f3F4F1BBD; +sub.f32 f341, f259, f340; +fma.rn.f32 f342, f324, 0f3E9E377A, f341; +mul.f32 f343, f333, 0f3F167918; +mul.f32 f344, f335, 0f3F737871; +sub.f32 f345, f344, f343; +sub.f32 f346, f342, f345; +add.f32 f347, f345, f342; +fma.rn.f32 f348, f326, 0f3E9E377A, f269; +mul.f32 f349, f328, 0f3F4F1BBD; +sub.f32 f350, f348, f349; +sub.f32 f351, f261, f267; +mul.f32 f352, f351, 0f3F737871; +sub.f32 f353, f263, f265; +mul.f32 f354, f353, 0fBF167918; +sub.f32 f355, f354, f352; +add.f32 f356, f355, f350; +sub.f32 f357, f350, f355; +mul.f32 f358, f326, 0f3F4F1BBD; +sub.f32 f359, f269, f358; +fma.rn.f32 f360, f328, 0f3E9E377A, f359; +mul.f32 f361, f351, 0f3F167918; +mul.f32 
f362, f353, 0f3F737871; +sub.f32 f363, f362, f361; +add.f32 f364, f363, f360; +sub.f32 f365, f360, f363; +mul.f32 f366, f338, 0f3F4F1BBD; +mul.f32 f367, f356, 0fBF167918; +sub.f32 f368, f366, f367; +mul.f32 f369, f356, 0f3F4F1BBD; +fma.rn.f32 f370, f338, 0fBF167918, f369; +mul.f32 f371, f346, 0f3E9E377A; +mul.f32 f372, f364, 0fBF737871; +sub.f32 f373, f371, f372; +mul.f32 f374, f364, 0f3E9E377A; +fma.rn.f32 f375, f346, 0fBF737871, f374; +mul.f32 f376, f347, 0fBE9E377A; +mul.f32 f377, f365, 0fBF737871; +sub.f32 f378, f376, f377; +mul.f32 f379, f365, 0fBE9E377A; +fma.rn.f32 f380, f347, 0fBF737871, f379; +mul.f32 f381, f339, 0fBF4F1BBD; +mul.f32 f382, f357, 0fBF167918; +sub.f32 f383, f381, f382; +mul.f32 f384, f357, 0fBF4F1BBD; +fma.rn.f32 f385, f339, 0fBF167918, f384; +add.f32 %0, f281, f325; +add.f32 %1, f285, f329; +add.f32 %3, f312, f370; +add.f32 %2, f294, f368; +add.f32 %5, f320, f375; +add.f32 %4, f302, f373; +add.f32 %7, f321, f380; +add.f32 %6, f303, f378; +add.f32 %9, f313, f385; +add.f32 %8, f295, f383; +sub.f32 %10, f281, f325; +sub.f32 %11, f285, f329; +sub.f32 %13, f312, f370; +sub.f32 %12, f294, f368; +sub.f32 %15, f320, f375; +sub.f32 %14, f302, f373; +sub.f32 %17, f321, f380; +sub.f32 %16, f303, f378; +sub.f32 %19, f313, f385; +sub.f32 %18, f295, f383; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), 
"f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..faf97435ed286 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp32_inv.hpp.inc @@ -0,0 +1,784 @@ +#ifndef CUFFTDX_FFT_100_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_100_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<394, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<418>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 800, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %27, %43; +add.f32 f42, %22, f41; +add.f32 f43, %32, %38; +add.f32 f44, f43, f42; +add.f32 f45, %29, %45; +add.f32 f46, %23, f45; +add.f32 f47, %34, %39; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %22; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %29, %45; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %34, %39; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %22, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %23; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %27, %43; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %32, %38; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %23, f75; +fma.rn.f32 f77, 
f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %30, %46; +add.f32 f84, %24, f83; +add.f32 f85, %35, %40; +add.f32 f86, f85, f84; +add.f32 f87, %31, %47; +add.f32 f88, %26, f87; +add.f32 f89, %37, %42; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %24; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %31, %47; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %37, %42; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %24, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %26; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %30, %46; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %35, %40; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %26, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 
0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +sub.f32 f145, f44, f86; +sub.f32 f146, f48, f90; +add.f32 f147, f56, f127; +add.f32 f148, f73, f129; +sub.f32 f149, f56, f127; +sub.f32 f150, f73, f129; +add.f32 f151, f64, f132; +add.f32 f152, f81, f134; +sub.f32 f153, f64, f132; +sub.f32 f154, f81, f134; +add.f32 f155, f65, f137; +add.f32 f156, f82, f139; +sub.f32 f157, f65, f137; +sub.f32 f158, f82, f139; +add.f32 f159, f57, f142; +add.f32 f160, f74, f144; +sub.f32 f161, f57, f142; +sub.f32 f162, f74, f144; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 800, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f163, f164}, [rd6]; +mul.f32 f167, f148, f164; +mul.f32 f168, f147, f164; +mul.f32 f169, f163, f148; +mul.f32 f170, f163, f163; +mul.f32 f171, f164, f164; +sub.f32 f172, f170, f171; +mul.f32 f173, f164, f163; +fma.rn.f32 f174, f164, f163, f173; +mul.f32 f175, f152, f174; +mul.f32 f176, f151, f174; +mul.f32 f177, f172, f152; +mul.f32 f178, f163, f172; +mul.f32 f179, f164, f174; +sub.f32 f180, f178, f179; +mul.f32 f181, f163, f174; +fma.rn.f32 f182, f164, f172, f181; +mul.f32 f183, f156, f182; +mul.f32 f184, f155, f182; +mul.f32 f185, f180, f156; +mul.f32 f186, f163, f180; +mul.f32 f187, f164, f182; +sub.f32 f188, f186, f187; +mul.f32 f189, f163, f182; +fma.rn.f32 f190, f164, f180, f189; +mul.f32 f191, f160, f190; +mul.f32 f192, f159, f190; +mul.f32 f193, f188, f160; +mul.f32 f194, f163, f188; +mul.f32 f195, f164, f190; +sub.f32 f196, f194, f195; +mul.f32 f197, f163, f190; +fma.rn.f32 f198, f164, f188, f197; +mul.f32 f199, f146, f198; +mul.f32 f200, f145, f198; +mul.f32 f201, f196, f146; +mul.f32 f202, f163, f196; +mul.f32 f203, f164, f198; +sub.f32 f204, f202, f203; +mul.f32 f205, f163, f198; +fma.rn.f32 f206, f164, f196, f205; +mul.f32 f207, f150, f206; +mul.f32 f208, f149, f206; +mul.f32 f209, f204, f150; +mul.f32 
f210, f163, f204; +mul.f32 f211, f164, f206; +sub.f32 f212, f210, f211; +mul.f32 f213, f163, f206; +fma.rn.f32 f214, f164, f204, f213; +mul.f32 f215, f154, f214; +mul.f32 f216, f153, f214; +mul.f32 f217, f212, f154; +mul.f32 f218, f163, f212; +mul.f32 f219, f164, f214; +sub.f32 f220, f218, f219; +mul.f32 f221, f163, f214; +fma.rn.f32 f222, f164, f212, f221; +mul.f32 f223, f158, f222; +mul.f32 f224, f157, f222; +mul.f32 f225, f220, f158; +mul.f32 f226, f163, f220; +mul.f32 f227, f164, f222; +sub.f32 f228, f226, f227; +mul.f32 f229, f163, f222; +fma.rn.f32 f230, f164, f220, f229; +mul.f32 f231, f162, f230; +mul.f32 f232, f161, f230; +mul.f32 f233, f228, f162; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f32 f234, f48, f90; +add.f32 f235, f44, f86; +st.shared.v2.f32 [r9], {f235, f234}; +fma.rn.f32 f236, f163, f147, f167; +sub.f32 f237, f169, f168; +st.shared.v2.f32 [r9+8], {f236, f237}; +fma.rn.f32 f238, f172, f151, f175; +sub.f32 f239, f177, f176; +st.shared.v2.f32 [r9+16], {f238, f239}; +fma.rn.f32 f240, f180, f155, f183; +sub.f32 f241, f185, f184; +st.shared.v2.f32 [r9+24], {f240, f241}; +sub.f32 f242, f193, f192; +fma.rn.f32 f243, f188, f159, f191; +st.shared.v2.f32 [r9+32], {f243, f242}; +fma.rn.f32 f244, f196, f145, f199; +sub.f32 f245, f201, f200; +st.shared.v2.f32 [r9+40], {f244, f245}; +fma.rn.f32 f246, f204, f149, f207; +sub.f32 f247, f209, f208; +st.shared.v2.f32 [r9+48], {f246, f247}; +fma.rn.f32 f248, f212, f153, f215; +sub.f32 f249, f217, f216; +st.shared.v2.f32 [r9+56], {f248, f249}; +fma.rn.f32 f250, f220, f157, f223; +sub.f32 f251, f225, f224; +st.shared.v2.f32 [r9+64], {f250, f251}; +fma.rn.f32 f252, f228, f161, f231; +sub.f32 f253, f233, f232; +st.shared.v2.f32 [r9+72], {f252, f253}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.v2.f32 {f254, f255}, [r10]; +ld.shared.v2.f32 {f258, f259}, [r10+80]; +ld.shared.v2.f32 {f262, f263}, [r10+160]; +ld.shared.v2.f32 {f266, f267}, [r10+240]; +ld.shared.v2.f32 {f270, f271}, [r10+320]; 
+ld.shared.v2.f32 {f274, f275}, [r10+400]; +ld.shared.v2.f32 {f278, f279}, [r10+480]; +ld.shared.v2.f32 {f282, f283}, [r10+560]; +ld.shared.v2.f32 {f286, f287}, [r10+640]; +ld.shared.v2.f32 {f290, f291}, [r10+720]; +add.f32 f294, f262, f286; +add.f32 f295, f254, f294; +add.f32 f296, f270, f278; +add.f32 f297, f296, f295; +add.f32 f298, f263, f287; +add.f32 f299, f255, f298; +add.f32 f300, f271, f279; +add.f32 f301, f300, f299; +fma.rn.f32 f302, f294, 0f3E9E377A, f254; +mul.f32 f303, f296, 0f3F4F1BBD; +sub.f32 f304, f302, f303; +sub.f32 f305, f263, f287; +mul.f32 f306, f305, 0f3F737871; +sub.f32 f307, f271, f279; +fma.rn.f32 f308, f307, 0f3F167918, f306; +sub.f32 f309, f304, f308; +add.f32 f310, f308, f304; +mul.f32 f311, f294, 0f3F4F1BBD; +sub.f32 f312, f254, f311; +fma.rn.f32 f313, f296, 0f3E9E377A, f312; +mul.f32 f314, f305, 0f3F167918; +mul.f32 f315, f307, 0f3F737871; +sub.f32 f316, f314, f315; +sub.f32 f317, f313, f316; +add.f32 f318, f316, f313; +fma.rn.f32 f319, f298, 0f3E9E377A, f255; +mul.f32 f320, f300, 0f3F4F1BBD; +sub.f32 f321, f319, f320; +sub.f32 f322, f262, f286; +mul.f32 f323, f322, 0f3F737871; +sub.f32 f324, f270, f278; +fma.rn.f32 f325, f324, 0f3F167918, f323; +add.f32 f326, f325, f321; +sub.f32 f327, f321, f325; +mul.f32 f328, f298, 0f3F4F1BBD; +sub.f32 f329, f255, f328; +fma.rn.f32 f330, f300, 0f3E9E377A, f329; +mul.f32 f331, f322, 0f3F167918; +mul.f32 f332, f324, 0f3F737871; +sub.f32 f333, f331, f332; +add.f32 f334, f333, f330; +sub.f32 f335, f330, f333; +add.f32 f336, f266, f290; +add.f32 f337, f258, f336; +add.f32 f338, f274, f282; +add.f32 f339, f338, f337; +add.f32 f340, f267, f291; +add.f32 f341, f259, f340; +add.f32 f342, f275, f283; +add.f32 f343, f342, f341; +fma.rn.f32 f344, f336, 0f3E9E377A, f258; +mul.f32 f345, f338, 0f3F4F1BBD; +sub.f32 f346, f344, f345; +sub.f32 f347, f267, f291; +mul.f32 f348, f347, 0f3F737871; +sub.f32 f349, f275, f283; +fma.rn.f32 f350, f349, 0f3F167918, f348; +sub.f32 f351, f346, f350; +add.f32 f352, f350, f346; 
+mul.f32 f353, f336, 0f3F4F1BBD; +sub.f32 f354, f258, f353; +fma.rn.f32 f355, f338, 0f3E9E377A, f354; +mul.f32 f356, f347, 0f3F167918; +mul.f32 f357, f349, 0f3F737871; +sub.f32 f358, f356, f357; +sub.f32 f359, f355, f358; +add.f32 f360, f358, f355; +fma.rn.f32 f361, f340, 0f3E9E377A, f259; +mul.f32 f362, f342, 0f3F4F1BBD; +sub.f32 f363, f361, f362; +sub.f32 f364, f266, f290; +mul.f32 f365, f364, 0f3F737871; +sub.f32 f366, f274, f282; +fma.rn.f32 f367, f366, 0f3F167918, f365; +add.f32 f368, f367, f363; +sub.f32 f369, f363, f367; +mul.f32 f370, f340, 0f3F4F1BBD; +sub.f32 f371, f259, f370; +fma.rn.f32 f372, f342, 0f3E9E377A, f371; +mul.f32 f373, f364, 0f3F167918; +mul.f32 f374, f366, 0f3F737871; +sub.f32 f375, f373, f374; +add.f32 f376, f375, f372; +sub.f32 f377, f372, f375; +mul.f32 f378, f351, 0f3F4F1BBD; +mul.f32 f379, f368, 0f3F167918; +sub.f32 f380, f378, f379; +mul.f32 f381, f368, 0f3F4F1BBD; +fma.rn.f32 f382, f351, 0f3F167918, f381; +mul.f32 f383, f359, 0f3E9E377A; +mul.f32 f384, f376, 0f3F737871; +sub.f32 f385, f383, f384; +mul.f32 f386, f376, 0f3E9E377A; +fma.rn.f32 f387, f359, 0f3F737871, f386; +mul.f32 f388, f360, 0fBE9E377A; +mul.f32 f389, f377, 0f3F737871; +sub.f32 f390, f388, f389; +mul.f32 f391, f377, 0fBE9E377A; +fma.rn.f32 f392, f360, 0f3F737871, f391; +mul.f32 f393, f352, 0fBF4F1BBD; +mul.f32 f394, f369, 0f3F167918; +sub.f32 f395, f393, f394; +mul.f32 f396, f369, 0fBF4F1BBD; +fma.rn.f32 f397, f352, 0f3F167918, f396; +add.f32 %1, f301, f343; +add.f32 %0, f297, f339; +add.f32 %3, f326, f382; +add.f32 %2, f309, f380; +add.f32 %5, f334, f387; +add.f32 %4, f317, f385; +add.f32 %7, f335, f392; +add.f32 %6, f318, f390; +add.f32 %9, f327, f397; +add.f32 %8, f310, f395; +sub.f32 %11, f301, f343; +sub.f32 %10, f297, f339; +sub.f32 %13, f326, f382; +sub.f32 %12, f309, f380; +sub.f32 %15, f334, f387; +sub.f32 %14, f317, f385; +sub.f32 %17, f335, f392; +sub.f32 %16, f318, f390; +sub.f32 %19, f327, f397; +sub.f32 %18, f310, f395; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<393, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<398>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 400, r2; +mov.u32 r4, %tid.x; +add.f32 f41, %27, %43; +add.f32 f42, %22, f41; +add.f32 f43, %32, %38; +add.f32 f44, f43, f42; +add.f32 f45, %29, %45; +add.f32 f46, %23, f45; +add.f32 f47, %34, %39; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %22; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %29, %45; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %34, %39; +fma.rn.f32 f55, f54, 0f3F167918, f53; +sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %22, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %23; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %27, %43; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %32, %38; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; 
+sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %23, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %30, %46; +add.f32 f84, %24, f83; +add.f32 f85, %35, %40; +add.f32 f86, f85, f84; +add.f32 f87, %31, %47; +add.f32 f88, %26, f87; +add.f32 f89, %37, %42; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %24; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %31, %47; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %37, %42; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %24, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %26; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %30, %46; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %35, %40; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %26, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125, f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 
0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +add.f32 f145, f44, f86; +add.f32 f146, f48, f90; +sub.f32 f147, f44, f86; +sub.f32 f148, f48, f90; +add.f32 f149, f56, f127; +add.f32 f150, f73, f129; +sub.f32 f151, f56, f127; +sub.f32 f152, f73, f129; +add.f32 f153, f64, f132; +add.f32 f154, f81, f134; +sub.f32 f155, f64, f132; +sub.f32 f156, f81, f134; +add.f32 f157, f65, f137; +add.f32 f158, f82, f139; +sub.f32 f159, f65, f137; +sub.f32 f160, f82, f139; +add.f32 f161, f57, f142; +add.f32 f162, f74, f144; +sub.f32 f163, f57, f142; +sub.f32 f164, f74, f144; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f165, f166}, [rd6]; +mul.f32 f169, f150, f166; +fma.rn.f32 f170, f165, f149, f169; +mul.f32 f171, f149, f166; +mul.f32 f172, f165, f150; +sub.f32 f173, f172, f171; +mul.f32 f174, f165, f165; +mul.f32 f175, f166, f166; +sub.f32 f176, f174, f175; +mul.f32 f177, f166, f165; +fma.rn.f32 f178, f166, f165, f177; +mul.f32 f179, f154, f178; +fma.rn.f32 f180, f176, f153, f179; +mul.f32 f181, f153, f178; +mul.f32 f182, f176, f154; +sub.f32 f183, f182, f181; +mul.f32 f184, f165, f176; +mul.f32 f185, f166, f178; +sub.f32 f186, f184, f185; +mul.f32 f187, f165, f178; +fma.rn.f32 f188, f166, f176, f187; +mul.f32 f189, f158, f188; +fma.rn.f32 f190, f186, f157, f189; +mul.f32 f191, f157, f188; +mul.f32 f192, f186, f158; +sub.f32 f193, f192, f191; +mul.f32 f194, f165, f186; +mul.f32 f195, f166, f188; +sub.f32 f196, f194, f195; +mul.f32 f197, f165, f188; +fma.rn.f32 f198, f166, f186, f197; +mul.f32 f199, f162, f198; +fma.rn.f32 f200, f196, f161, f199; +mul.f32 f201, f161, f198; +mul.f32 f202, f196, f162; +sub.f32 f203, f202, f201; +mul.f32 f204, f165, f196; +mul.f32 f205, f166, f198; +sub.f32 f206, f204, f205; +mul.f32 f207, 
f165, f198; +fma.rn.f32 f208, f166, f196, f207; +mul.f32 f209, f148, f208; +fma.rn.f32 f210, f206, f147, f209; +mul.f32 f211, f147, f208; +mul.f32 f212, f206, f148; +sub.f32 f213, f212, f211; +mul.f32 f214, f165, f206; +mul.f32 f215, f166, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f165, f208; +fma.rn.f32 f218, f166, f206, f217; +mul.f32 f219, f152, f218; +fma.rn.f32 f220, f216, f151, f219; +mul.f32 f221, f151, f218; +mul.f32 f222, f216, f152; +sub.f32 f223, f222, f221; +mul.f32 f224, f165, f216; +mul.f32 f225, f166, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f165, f218; +fma.rn.f32 f228, f166, f216, f227; +mul.f32 f229, f156, f228; +fma.rn.f32 f230, f226, f155, f229; +mul.f32 f231, f155, f228; +mul.f32 f232, f226, f156; +sub.f32 f233, f232, f231; +mul.f32 f234, f165, f226; +mul.f32 f235, f166, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f165, f228; +fma.rn.f32 f238, f166, f226, f237; +mul.f32 f239, f160, f238; +fma.rn.f32 f240, f236, f159, f239; +mul.f32 f241, f159, f238; +mul.f32 f242, f236, f160; +sub.f32 f243, f242, f241; +mul.f32 f244, f165, f236; +mul.f32 f245, f166, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f165, f238; +fma.rn.f32 f248, f166, f236, f247; +mul.f32 f249, f164, f248; +fma.rn.f32 f250, f246, f163, f249; +mul.f32 f251, f163, f248; +mul.f32 f252, f246, f164; +sub.f32 f253, f252, f251; +mad.lo.s32 r8, r5, 400, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.v2.f32 [r9], {f145, f170}; +st.shared.v2.f32 [r9+8], {f180, f190}; +st.shared.v2.f32 [r9+16], {f200, f210}; +st.shared.v2.f32 [r9+24], {f220, f230}; +st.shared.v2.f32 [r9+32], {f240, f250}; +barrier.sync 0; +mad.lo.s32 r10, r7, -36, r9; +ld.shared.f32 f254, [r10]; +ld.shared.f32 f255, [r10+40]; +ld.shared.f32 f256, [r10+80]; +ld.shared.f32 f257, [r10+120]; +ld.shared.f32 f258, [r10+160]; +ld.shared.f32 f259, [r10+200]; +ld.shared.f32 f260, [r10+240]; +ld.shared.f32 f261, [r10+280]; +ld.shared.f32 f262, [r10+320]; +ld.shared.f32 f263, [r10+360]; +barrier.sync 0; 
+st.shared.v2.f32 [r9], {f146, f173}; +st.shared.v2.f32 [r9+8], {f183, f193}; +st.shared.v2.f32 [r9+16], {f203, f213}; +st.shared.v2.f32 [r9+24], {f223, f233}; +st.shared.v2.f32 [r9+32], {f243, f253}; +barrier.sync 0; +ld.shared.f32 f264, [r10]; +ld.shared.f32 f265, [r10+40]; +ld.shared.f32 f266, [r10+80]; +ld.shared.f32 f267, [r10+120]; +ld.shared.f32 f268, [r10+160]; +ld.shared.f32 f269, [r10+200]; +ld.shared.f32 f270, [r10+240]; +ld.shared.f32 f271, [r10+280]; +ld.shared.f32 f272, [r10+320]; +ld.shared.f32 f273, [r10+360]; +add.f32 f274, f256, f262; +add.f32 f275, f254, f274; +add.f32 f276, f258, f260; +add.f32 f277, f276, f275; +add.f32 f278, f266, f272; +add.f32 f279, f264, f278; +add.f32 f280, f268, f270; +add.f32 f281, f280, f279; +fma.rn.f32 f282, f274, 0f3E9E377A, f254; +mul.f32 f283, f276, 0f3F4F1BBD; +sub.f32 f284, f282, f283; +sub.f32 f285, f266, f272; +mul.f32 f286, f285, 0f3F737871; +sub.f32 f287, f268, f270; +fma.rn.f32 f288, f287, 0f3F167918, f286; +sub.f32 f289, f284, f288; +add.f32 f290, f288, f284; +mul.f32 f291, f274, 0f3F4F1BBD; +sub.f32 f292, f254, f291; +fma.rn.f32 f293, f276, 0f3E9E377A, f292; +mul.f32 f294, f285, 0f3F167918; +mul.f32 f295, f287, 0f3F737871; +sub.f32 f296, f294, f295; +sub.f32 f297, f293, f296; +add.f32 f298, f296, f293; +fma.rn.f32 f299, f278, 0f3E9E377A, f264; +mul.f32 f300, f280, 0f3F4F1BBD; +sub.f32 f301, f299, f300; +sub.f32 f302, f256, f262; +mul.f32 f303, f302, 0f3F737871; +sub.f32 f304, f258, f260; +fma.rn.f32 f305, f304, 0f3F167918, f303; +add.f32 f306, f305, f301; +sub.f32 f307, f301, f305; +mul.f32 f308, f278, 0f3F4F1BBD; +sub.f32 f309, f264, f308; +fma.rn.f32 f310, f280, 0f3E9E377A, f309; +mul.f32 f311, f302, 0f3F167918; +mul.f32 f312, f304, 0f3F737871; +sub.f32 f313, f311, f312; +add.f32 f314, f313, f310; +sub.f32 f315, f310, f313; +add.f32 f316, f257, f263; +add.f32 f317, f255, f316; +add.f32 f318, f259, f261; +add.f32 f319, f318, f317; +add.f32 f320, f267, f273; +add.f32 f321, f265, f320; +add.f32 f322, f269, 
f271; +add.f32 f323, f322, f321; +fma.rn.f32 f324, f316, 0f3E9E377A, f255; +mul.f32 f325, f318, 0f3F4F1BBD; +sub.f32 f326, f324, f325; +sub.f32 f327, f267, f273; +mul.f32 f328, f327, 0f3F737871; +sub.f32 f329, f269, f271; +fma.rn.f32 f330, f329, 0f3F167918, f328; +sub.f32 f331, f326, f330; +add.f32 f332, f330, f326; +mul.f32 f333, f316, 0f3F4F1BBD; +sub.f32 f334, f255, f333; +fma.rn.f32 f335, f318, 0f3E9E377A, f334; +mul.f32 f336, f327, 0f3F167918; +mul.f32 f337, f329, 0f3F737871; +sub.f32 f338, f336, f337; +sub.f32 f339, f335, f338; +add.f32 f340, f338, f335; +fma.rn.f32 f341, f320, 0f3E9E377A, f265; +mul.f32 f342, f322, 0f3F4F1BBD; +sub.f32 f343, f341, f342; +sub.f32 f344, f257, f263; +mul.f32 f345, f344, 0f3F737871; +sub.f32 f346, f259, f261; +fma.rn.f32 f347, f346, 0f3F167918, f345; +add.f32 f348, f347, f343; +sub.f32 f349, f343, f347; +mul.f32 f350, f320, 0f3F4F1BBD; +sub.f32 f351, f265, f350; +fma.rn.f32 f352, f322, 0f3E9E377A, f351; +mul.f32 f353, f344, 0f3F167918; +mul.f32 f354, f346, 0f3F737871; +sub.f32 f355, f353, f354; +add.f32 f356, f355, f352; +sub.f32 f357, f352, f355; +mul.f32 f358, f331, 0f3F4F1BBD; +mul.f32 f359, f348, 0f3F167918; +sub.f32 f360, f358, f359; +mul.f32 f361, f348, 0f3F4F1BBD; +fma.rn.f32 f362, f331, 0f3F167918, f361; +mul.f32 f363, f339, 0f3E9E377A; +mul.f32 f364, f356, 0f3F737871; +sub.f32 f365, f363, f364; +mul.f32 f366, f356, 0f3E9E377A; +fma.rn.f32 f367, f339, 0f3F737871, f366; +mul.f32 f368, f340, 0fBE9E377A; +mul.f32 f369, f357, 0f3F737871; +sub.f32 f370, f368, f369; +mul.f32 f371, f357, 0fBE9E377A; +fma.rn.f32 f372, f340, 0f3F737871, f371; +mul.f32 f373, f332, 0fBF4F1BBD; +mul.f32 f374, f349, 0f3F167918; +sub.f32 f375, f373, f374; +mul.f32 f376, f349, 0fBF4F1BBD; +fma.rn.f32 f377, f332, 0f3F167918, f376; +add.f32 %0, f277, f319; +add.f32 %1, f281, f323; +add.f32 %3, f306, f362; +add.f32 %2, f289, f360; +add.f32 %5, f314, f367; +add.f32 %4, f297, f365; +add.f32 %7, f315, f372; +add.f32 %6, f298, f370; +add.f32 %9, f307, f377; 
+add.f32 %8, f290, f375; +sub.f32 %10, f277, f319; +sub.f32 %11, f281, f323; +sub.f32 %13, f306, f362; +sub.f32 %12, f289, f360; +sub.f32 %15, f314, f367; +sub.f32 %14, f297, f365; +sub.f32 %17, f315, f372; +sub.f32 %16, f298, f370; +sub.f32 %19, f307, f377; +sub.f32 %18, f290, f375; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "r"(smem), "l"(lut_sp_10_100), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..eea1f6ec16bf0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_fwd.hpp.inc @@ -0,0 +1,792 @@ +#ifndef CUFFTDX_FFT_100_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_100_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<567, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<405>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 800, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %27, %43; 
+add.f64 fd42, %22, fd41; +add.f64 fd43, %32, %38; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %29, %45; +add.f64 fd46, %23, fd45; +add.f64 fd47, %34, %39; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %22; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %29, %45; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %34, %39; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %22, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %23; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %27, %43; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %32, %38; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %23, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %30, %46; +add.f64 fd86, %24, fd85; +add.f64 fd87, %35, %40; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %31, %47; +add.f64 fd90, %26, fd89; +add.f64 fd91, %37, %42; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %24; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %31, %47; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %37, %42; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, 
%24, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %26; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %30, %46; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %35, %40; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %26, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +add.f64 fd149, fd44, fd88; +add.f64 fd150, fd48, fd92; +sub.f64 fd151, fd44, fd88; +sub.f64 fd152, fd48, fd92; +add.f64 fd153, fd57, fd131; +add.f64 fd154, fd75, fd133; +sub.f64 fd155, fd57, fd131; +sub.f64 fd156, fd75, fd133; +add.f64 fd157, fd65, fd136; +add.f64 
fd158, fd83, fd138; +sub.f64 fd159, fd65, fd136; +sub.f64 fd160, fd83, fd138; +add.f64 fd161, fd66, fd141; +add.f64 fd162, fd84, fd143; +sub.f64 fd163, fd66, fd141; +sub.f64 fd164, fd84, fd143; +add.f64 fd165, fd58, fd146; +add.f64 fd166, fd76, fd148; +sub.f64 fd167, fd58, fd146; +sub.f64 fd168, fd76, fd148; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd169, fd170}, [rd6]; +mul.f64 fd173, fd169, fd153; +mul.f64 fd174, fd170, fd154; +sub.f64 fd175, fd173, fd174; +mul.f64 fd176, fd169, fd154; +fma.rn.f64 fd177, fd170, fd153, fd176; +mul.f64 fd178, fd169, fd169; +mul.f64 fd179, fd170, fd170; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd170, fd169; +fma.rn.f64 fd182, fd170, fd169, fd181; +mul.f64 fd183, fd180, fd157; +mul.f64 fd184, fd182, fd158; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd180, fd158; +fma.rn.f64 fd187, fd182, fd157, fd186; +mul.f64 fd188, fd169, fd180; +mul.f64 fd189, fd170, fd182; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd169, fd182; +fma.rn.f64 fd192, fd170, fd180, fd191; +mul.f64 fd193, fd190, fd161; +mul.f64 fd194, fd192, fd162; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd190, fd162; +fma.rn.f64 fd197, fd192, fd161, fd196; +mul.f64 fd198, fd169, fd190; +mul.f64 fd199, fd170, fd192; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd169, fd192; +fma.rn.f64 fd202, fd170, fd190, fd201; +mul.f64 fd203, fd200, fd165; +mul.f64 fd204, fd202, fd166; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd166; +fma.rn.f64 fd207, fd202, fd165, fd206; +mul.f64 fd208, fd169, fd200; +mul.f64 fd209, fd170, fd202; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd169, fd202; +fma.rn.f64 fd212, fd170, fd200, fd211; +mul.f64 fd213, fd210, fd151; +mul.f64 fd214, fd212, fd152; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd210, fd152; +fma.rn.f64 fd217, fd212, fd151, fd216; 
+ld.global.v2.f64 {fd218, fd219}, [rd6+160]; +mul.f64 fd222, fd218, fd155; +mul.f64 fd223, fd219, fd156; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd218, fd156; +fma.rn.f64 fd226, fd219, fd155, fd225; +mul.f64 fd227, fd169, fd218; +mul.f64 fd228, fd170, fd219; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd169, fd219; +fma.rn.f64 fd231, fd170, fd218, fd230; +mul.f64 fd232, fd229, fd159; +mul.f64 fd233, fd231, fd160; +sub.f64 fd234, fd232, fd233; +mul.f64 fd235, fd229, fd160; +fma.rn.f64 fd236, fd231, fd159, fd235; +mul.f64 fd237, fd169, fd229; +mul.f64 fd238, fd170, fd231; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd169, fd231; +fma.rn.f64 fd241, fd170, fd229, fd240; +mul.f64 fd242, fd239, fd163; +mul.f64 fd243, fd241, fd164; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd239, fd164; +fma.rn.f64 fd246, fd241, fd163, fd245; +mul.f64 fd247, fd169, fd239; +mul.f64 fd248, fd170, fd241; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd169, fd241; +fma.rn.f64 fd251, fd170, fd239, fd250; +mul.f64 fd252, fd249, fd167; +mul.f64 fd253, fd251, fd168; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd249, fd168; +fma.rn.f64 fd256, fd251, fd167, fd255; +mad.lo.s32 r8, r5, 800, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +st.shared.v2.f64 [r9], {fd149, fd175}; +st.shared.v2.f64 [r9+16], {fd185, fd195}; +st.shared.v2.f64 [r9+32], {fd205, fd215}; +st.shared.v2.f64 [r9+48], {fd224, fd234}; +st.shared.v2.f64 [r9+64], {fd244, fd254}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd257, [r10]; +ld.shared.f64 fd258, [r10+80]; +ld.shared.f64 fd259, [r10+160]; +ld.shared.f64 fd260, [r10+240]; +ld.shared.f64 fd261, [r10+320]; +ld.shared.f64 fd262, [r10+400]; +ld.shared.f64 fd263, [r10+480]; +ld.shared.f64 fd264, [r10+560]; +ld.shared.f64 fd265, [r10+640]; +ld.shared.f64 fd266, [r10+720]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd150, fd177}; +st.shared.v2.f64 [r9+16], {fd187, fd197}; +st.shared.v2.f64 [r9+32], {fd207, fd217}; +st.shared.v2.f64 [r9+48], 
{fd226, fd236}; +st.shared.v2.f64 [r9+64], {fd246, fd256}; +barrier.sync 0; +ld.shared.f64 fd267, [r10]; +ld.shared.f64 fd268, [r10+80]; +ld.shared.f64 fd269, [r10+160]; +ld.shared.f64 fd270, [r10+240]; +ld.shared.f64 fd271, [r10+320]; +ld.shared.f64 fd272, [r10+400]; +ld.shared.f64 fd273, [r10+480]; +ld.shared.f64 fd274, [r10+560]; +ld.shared.f64 fd275, [r10+640]; +ld.shared.f64 fd276, [r10+720]; +add.f64 fd277, fd259, fd265; +add.f64 fd278, fd257, fd277; +add.f64 fd279, fd261, fd263; +add.f64 fd280, fd279, fd278; +add.f64 fd281, fd269, fd275; +add.f64 fd282, fd267, fd281; +add.f64 fd283, fd271, fd273; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, fd269, fd275; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, fd271, fd273; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd257, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, fd267; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, fd259, fd265; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd261, fd263; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, fd267, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +add.f64 fd321, fd260, fd266; 
+add.f64 fd322, fd258, fd321; +add.f64 fd323, fd262, fd264; +add.f64 fd324, fd323, fd322; +add.f64 fd325, fd270, fd276; +add.f64 fd326, fd268, fd325; +add.f64 fd327, fd272, fd274; +add.f64 fd328, fd327, fd326; +fma.rn.f64 fd329, fd321, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd330, fd323, 0d3FE9E3779B97F4A8; +sub.f64 fd331, fd329, fd330; +sub.f64 fd332, fd270, fd276; +mul.f64 fd333, fd332, 0d3FEE6F0E134454FF; +sub.f64 fd334, fd272, fd274; +mul.f64 fd335, fd334, 0dBFE2CF2304755A5E; +sub.f64 fd336, fd335, fd333; +sub.f64 fd337, fd331, fd336; +add.f64 fd338, fd336, fd331; +mul.f64 fd339, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd340, fd258, fd339; +fma.rn.f64 fd341, fd323, 0d3FD3C6EF372FE950, fd340; +mul.f64 fd342, fd332, 0d3FE2CF2304755A5E; +mul.f64 fd343, fd334, 0d3FEE6F0E134454FF; +sub.f64 fd344, fd343, fd342; +sub.f64 fd345, fd341, fd344; +add.f64 fd346, fd344, fd341; +fma.rn.f64 fd347, fd325, 0d3FD3C6EF372FE950, fd268; +mul.f64 fd348, fd327, 0d3FE9E3779B97F4A8; +sub.f64 fd349, fd347, fd348; +sub.f64 fd350, fd260, fd266; +mul.f64 fd351, fd350, 0d3FEE6F0E134454FF; +sub.f64 fd352, fd262, fd264; +mul.f64 fd353, fd352, 0dBFE2CF2304755A5E; +sub.f64 fd354, fd353, fd351; +add.f64 fd355, fd354, fd349; +sub.f64 fd356, fd349, fd354; +mul.f64 fd357, fd325, 0d3FE9E3779B97F4A8; +sub.f64 fd358, fd268, fd357; +fma.rn.f64 fd359, fd327, 0d3FD3C6EF372FE950, fd358; +mul.f64 fd360, fd350, 0d3FE2CF2304755A5E; +mul.f64 fd361, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd362, fd361, fd360; +add.f64 fd363, fd362, fd359; +sub.f64 fd364, fd359, fd362; +mul.f64 fd365, fd337, 0d3FE9E3779B97F4A8; +mul.f64 fd366, fd355, 0dBFE2CF2304755A5E; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd355, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd369, fd337, 0dBFE2CF2304755A5E, fd368; +mul.f64 fd370, fd345, 0d3FD3C6EF372FE950; +mul.f64 fd371, fd363, 0dBFEE6F0E134454FF; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd363, 0d3FD3C6EF372FE950; +fma.rn.f64 fd374, fd345, 0dBFEE6F0E134454FF, fd373; +mul.f64 fd375, fd346, 
0dBFD3C6EF372FE950; +mul.f64 fd376, fd364, 0dBFEE6F0E134454FF; +sub.f64 fd377, fd375, fd376; +mul.f64 fd378, fd364, 0dBFD3C6EF372FE950; +fma.rn.f64 fd379, fd346, 0dBFEE6F0E134454FF, fd378; +mul.f64 fd380, fd338, 0dBFE9E3779B97F4A8; +mul.f64 fd381, fd356, 0dBFE2CF2304755A5E; +sub.f64 fd382, fd380, fd381; +mul.f64 fd383, fd356, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd384, fd338, 0dBFE2CF2304755A5E, fd383; +add.f64 %0, fd280, fd324; +add.f64 %1, fd284, fd328; +add.f64 %3, fd311, fd369; +add.f64 %2, fd293, fd367; +add.f64 %5, fd319, fd374; +add.f64 %4, fd301, fd372; +add.f64 %7, fd320, fd379; +add.f64 %6, fd302, fd377; +add.f64 %9, fd312, fd384; +add.f64 %8, fd294, fd382; +sub.f64 %10, fd280, fd324; +sub.f64 %11, fd284, fd328; +sub.f64 %13, fd311, fd369; +sub.f64 %12, fd293, fd367; +sub.f64 %15, fd319, fd374; +sub.f64 %14, fd301, fd372; +sub.f64 %17, fd320, fd379; +sub.f64 %16, fd302, fd377; +sub.f64 %19, fd312, fd384; +sub.f64 %18, fd294, fd382; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<566, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<425>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; 
+mov.u32 r2, %20; +mad.lo.s32 r3, r1, 1600, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %27, %43; +add.f64 fd42, %22, fd41; +add.f64 fd43, %32, %38; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %29, %45; +add.f64 fd46, %23, fd45; +add.f64 fd47, %34, %39; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %22; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %29, %45; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %34, %39; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %22, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %23; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %27, %43; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %32, %38; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %23, fd77; +fma.rn.f64 fd79, fd47, 0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %30, %46; +add.f64 fd86, %24, fd85; +add.f64 fd87, %35, %40; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %31, %47; +add.f64 fd90, %26, fd89; +add.f64 fd91, %37, %42; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %24; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %31, %47; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %37, %42; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, 
fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %24, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %26; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %30, %46; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %35, %40; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %26, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +sub.f64 fd149, fd44, fd88; +sub.f64 fd150, fd48, fd92; +add.f64 fd151, fd57, fd131; +add.f64 fd152, fd75, fd133; +sub.f64 fd153, fd57, fd131; +sub.f64 fd154, fd75, fd133; 
+add.f64 fd155, fd65, fd136; +add.f64 fd156, fd83, fd138; +sub.f64 fd157, fd65, fd136; +sub.f64 fd158, fd83, fd138; +add.f64 fd159, fd66, fd141; +add.f64 fd160, fd84, fd143; +sub.f64 fd161, fd66, fd141; +sub.f64 fd162, fd84, fd143; +add.f64 fd163, fd58, fd146; +add.f64 fd164, fd76, fd148; +sub.f64 fd165, fd58, fd146; +sub.f64 fd166, fd76, fd148; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1600, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd167, fd168}, [rd6]; +mul.f64 fd171, fd167, fd151; +mul.f64 fd172, fd168, fd152; +mul.f64 fd173, fd167, fd152; +mul.f64 fd174, fd167, fd167; +mul.f64 fd175, fd168, fd168; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd168, fd167; +fma.rn.f64 fd178, fd168, fd167, fd177; +mul.f64 fd179, fd176, fd155; +mul.f64 fd180, fd178, fd156; +mul.f64 fd181, fd176, fd156; +mul.f64 fd182, fd167, fd176; +mul.f64 fd183, fd168, fd178; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd167, fd178; +fma.rn.f64 fd186, fd168, fd176, fd185; +mul.f64 fd187, fd184, fd159; +mul.f64 fd188, fd186, fd160; +mul.f64 fd189, fd184, fd160; +mul.f64 fd190, fd167, fd184; +mul.f64 fd191, fd168, fd186; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd167, fd186; +fma.rn.f64 fd194, fd168, fd184, fd193; +mul.f64 fd195, fd192, fd163; +mul.f64 fd196, fd194, fd164; +mul.f64 fd197, fd192, fd164; +mul.f64 fd198, fd167, fd192; +mul.f64 fd199, fd168, fd194; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd167, fd194; +fma.rn.f64 fd202, fd168, fd192, fd201; +mul.f64 fd203, fd200, fd149; +mul.f64 fd204, fd202, fd150; +mul.f64 fd205, fd200, fd150; +ld.global.v2.f64 {fd206, fd207}, [rd6+160]; +mul.f64 fd210, fd206, fd153; +mul.f64 fd211, fd207, fd154; +mul.f64 fd212, fd206, fd154; +mul.f64 fd213, fd167, fd206; +mul.f64 fd214, fd168, fd207; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd167, fd207; +fma.rn.f64 fd217, fd168, fd206, fd216; 
+mul.f64 fd218, fd215, fd157; +mul.f64 fd219, fd217, fd158; +mul.f64 fd220, fd215, fd158; +mul.f64 fd221, fd167, fd215; +mul.f64 fd222, fd168, fd217; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd167, fd217; +fma.rn.f64 fd225, fd168, fd215, fd224; +mul.f64 fd226, fd223, fd161; +mul.f64 fd227, fd225, fd162; +mul.f64 fd228, fd223, fd162; +mul.f64 fd229, fd167, fd223; +mul.f64 fd230, fd168, fd225; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd167, fd225; +fma.rn.f64 fd233, fd168, fd223, fd232; +mul.f64 fd234, fd231, fd165; +mul.f64 fd235, fd233, fd166; +mul.f64 fd236, fd231, fd166; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd237, fd48, fd92; +add.f64 fd238, fd44, fd88; +st.shared.v2.f64 [r9], {fd238, fd237}; +fma.rn.f64 fd239, fd168, fd151, fd173; +sub.f64 fd240, fd171, fd172; +st.shared.v2.f64 [r9+16], {fd240, fd239}; +fma.rn.f64 fd241, fd178, fd155, fd181; +sub.f64 fd242, fd179, fd180; +st.shared.v2.f64 [r9+32], {fd242, fd241}; +fma.rn.f64 fd243, fd186, fd159, fd189; +sub.f64 fd244, fd187, fd188; +st.shared.v2.f64 [r9+48], {fd244, fd243}; +sub.f64 fd245, fd195, fd196; +fma.rn.f64 fd246, fd194, fd163, fd197; +st.shared.v2.f64 [r9+64], {fd245, fd246}; +fma.rn.f64 fd247, fd202, fd149, fd205; +sub.f64 fd248, fd203, fd204; +st.shared.v2.f64 [r9+80], {fd248, fd247}; +fma.rn.f64 fd249, fd207, fd153, fd212; +sub.f64 fd250, fd210, fd211; +st.shared.v2.f64 [r9+96], {fd250, fd249}; +fma.rn.f64 fd251, fd217, fd157, fd220; +sub.f64 fd252, fd218, fd219; +st.shared.v2.f64 [r9+112], {fd252, fd251}; +fma.rn.f64 fd253, fd225, fd161, fd228; +sub.f64 fd254, fd226, fd227; +st.shared.v2.f64 [r9+128], {fd254, fd253}; +sub.f64 fd255, fd234, fd235; +fma.rn.f64 fd256, fd233, fd165, fd236; +st.shared.v2.f64 [r9+144], {fd255, fd256}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd257, fd258}, [r10]; +ld.shared.v2.f64 {fd261, fd262}, [r10+160]; +ld.shared.v2.f64 {fd265, fd266}, [r10+320]; +ld.shared.v2.f64 {fd269, fd270}, [r10+480]; +ld.shared.v2.f64 
{fd273, fd274}, [r10+640]; +ld.shared.v2.f64 {fd277, fd278}, [r10+800]; +ld.shared.v2.f64 {fd281, fd282}, [r10+960]; +ld.shared.v2.f64 {fd285, fd286}, [r10+1120]; +ld.shared.v2.f64 {fd289, fd290}, [r10+1280]; +ld.shared.v2.f64 {fd293, fd294}, [r10+1440]; +add.f64 fd297, fd265, fd289; +add.f64 fd298, fd257, fd297; +add.f64 fd299, fd273, fd281; +add.f64 fd300, fd299, fd298; +add.f64 fd301, fd266, fd290; +add.f64 fd302, fd258, fd301; +add.f64 fd303, fd274, fd282; +add.f64 fd304, fd303, fd302; +fma.rn.f64 fd305, fd297, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd306, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd307, fd305, fd306; +sub.f64 fd308, fd266, fd290; +mul.f64 fd309, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd310, fd274, fd282; +mul.f64 fd311, fd310, 0dBFE2CF2304755A5E; +sub.f64 fd312, fd311, fd309; +sub.f64 fd313, fd307, fd312; +add.f64 fd314, fd312, fd307; +mul.f64 fd315, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd316, fd257, fd315; +fma.rn.f64 fd317, fd299, 0d3FD3C6EF372FE950, fd316; +mul.f64 fd318, fd308, 0d3FE2CF2304755A5E; +mul.f64 fd319, fd310, 0d3FEE6F0E134454FF; +sub.f64 fd320, fd319, fd318; +sub.f64 fd321, fd317, fd320; +add.f64 fd322, fd320, fd317; +fma.rn.f64 fd323, fd301, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd324, fd303, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd265, fd289; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd273, fd281; +mul.f64 fd329, fd328, 0dBFE2CF2304755A5E; +sub.f64 fd330, fd329, fd327; +add.f64 fd331, fd330, fd325; +sub.f64 fd332, fd325, fd330; +mul.f64 fd333, fd301, 0d3FE9E3779B97F4A8; +sub.f64 fd334, fd258, fd333; +fma.rn.f64 fd335, fd303, 0d3FD3C6EF372FE950, fd334; +mul.f64 fd336, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd337, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd337, fd336; +add.f64 fd339, fd338, fd335; +sub.f64 fd340, fd335, fd338; +add.f64 fd341, fd269, fd293; +add.f64 fd342, fd261, fd341; +add.f64 fd343, fd277, fd285; +add.f64 fd344, fd343, fd342; +add.f64 fd345, fd270, fd294; +add.f64 fd346, fd262, 
fd345; +add.f64 fd347, fd278, fd286; +add.f64 fd348, fd347, fd346; +fma.rn.f64 fd349, fd341, 0d3FD3C6EF372FE950, fd261; +mul.f64 fd350, fd343, 0d3FE9E3779B97F4A8; +sub.f64 fd351, fd349, fd350; +sub.f64 fd352, fd270, fd294; +mul.f64 fd353, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd278, fd286; +mul.f64 fd355, fd354, 0dBFE2CF2304755A5E; +sub.f64 fd356, fd355, fd353; +sub.f64 fd357, fd351, fd356; +add.f64 fd358, fd356, fd351; +mul.f64 fd359, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd360, fd261, fd359; +fma.rn.f64 fd361, fd343, 0d3FD3C6EF372FE950, fd360; +mul.f64 fd362, fd352, 0d3FE2CF2304755A5E; +mul.f64 fd363, fd354, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd363, fd362; +sub.f64 fd365, fd361, fd364; +add.f64 fd366, fd364, fd361; +fma.rn.f64 fd367, fd345, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd368, fd347, 0d3FE9E3779B97F4A8; +sub.f64 fd369, fd367, fd368; +sub.f64 fd370, fd269, fd293; +mul.f64 fd371, fd370, 0d3FEE6F0E134454FF; +sub.f64 fd372, fd277, fd285; +mul.f64 fd373, fd372, 0dBFE2CF2304755A5E; +sub.f64 fd374, fd373, fd371; +add.f64 fd375, fd374, fd369; +sub.f64 fd376, fd369, fd374; +mul.f64 fd377, fd345, 0d3FE9E3779B97F4A8; +sub.f64 fd378, fd262, fd377; +fma.rn.f64 fd379, fd347, 0d3FD3C6EF372FE950, fd378; +mul.f64 fd380, fd370, 0d3FE2CF2304755A5E; +mul.f64 fd381, fd372, 0d3FEE6F0E134454FF; +sub.f64 fd382, fd381, fd380; +add.f64 fd383, fd382, fd379; +sub.f64 fd384, fd379, fd382; +mul.f64 fd385, fd357, 0d3FE9E3779B97F4A8; +mul.f64 fd386, fd375, 0dBFE2CF2304755A5E; +sub.f64 fd387, fd385, fd386; +mul.f64 fd388, fd375, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd389, fd357, 0dBFE2CF2304755A5E, fd388; +mul.f64 fd390, fd365, 0d3FD3C6EF372FE950; +mul.f64 fd391, fd383, 0dBFEE6F0E134454FF; +sub.f64 fd392, fd390, fd391; +mul.f64 fd393, fd383, 0d3FD3C6EF372FE950; +fma.rn.f64 fd394, fd365, 0dBFEE6F0E134454FF, fd393; +mul.f64 fd395, fd366, 0dBFD3C6EF372FE950; +mul.f64 fd396, fd384, 0dBFEE6F0E134454FF; +sub.f64 fd397, fd395, fd396; +mul.f64 fd398, fd384, 0dBFD3C6EF372FE950; +fma.rn.f64 fd399, 
fd366, 0dBFEE6F0E134454FF, fd398; +mul.f64 fd400, fd358, 0dBFE9E3779B97F4A8; +mul.f64 fd401, fd376, 0dBFE2CF2304755A5E; +sub.f64 fd402, fd400, fd401; +mul.f64 fd403, fd376, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd404, fd358, 0dBFE2CF2304755A5E, fd403; +add.f64 %1, fd304, fd348; +add.f64 %0, fd300, fd344; +add.f64 %3, fd331, fd389; +add.f64 %2, fd313, fd387; +add.f64 %5, fd339, fd394; +add.f64 %4, fd321, fd392; +add.f64 %7, fd340, fd399; +add.f64 %6, fd322, fd397; +add.f64 %9, fd332, fd404; +add.f64 %8, fd314, fd402; +sub.f64 %11, fd304, fd348; +sub.f64 %10, fd300, fd344; +sub.f64 %13, fd331, fd389; +sub.f64 %12, fd313, fd387; +sub.f64 %15, fd339, fd394; +sub.f64 %14, fd321, fd392; +sub.f64 %17, fd340, fd399; +sub.f64 %16, fd322, fd397; +sub.f64 %19, fd332, fd404; +sub.f64 %18, fd314, fd402; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e455712785687 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_100_fp64_inv.hpp.inc @@ -0,0 +1,776 @@ +#ifndef CUFFTDX_FFT_100_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_100_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<738, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<397>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 800, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %27, %43; +add.f64 fd42, %22, fd41; +add.f64 fd43, %32, %38; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %29, %45; +add.f64 fd46, %23, fd45; +add.f64 fd47, %34, %39; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %22; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %29, %45; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %34, %39; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %22, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %23; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %27, %43; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %32, %38; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %23, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %30, %46; +add.f64 fd84, %24, fd83; +add.f64 fd85, %35, 
%40; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %31, %47; +add.f64 fd88, %26, fd87; +add.f64 fd89, %37, %42; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %24; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %31, %47; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %37, %42; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %24, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %26; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %30, %46; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %35, %40; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %26, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, 
fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +add.f64 fd145, fd44, fd86; +add.f64 fd146, fd48, fd90; +sub.f64 fd147, fd44, fd86; +sub.f64 fd148, fd48, fd90; +add.f64 fd149, fd56, fd127; +add.f64 fd150, fd73, fd129; +sub.f64 fd151, fd56, fd127; +sub.f64 fd152, fd73, fd129; +add.f64 fd153, fd64, fd132; +add.f64 fd154, fd81, fd134; +sub.f64 fd155, fd64, fd132; +sub.f64 fd156, fd81, fd134; +add.f64 fd157, fd65, fd137; +add.f64 fd158, fd82, fd139; +sub.f64 fd159, fd65, fd137; +sub.f64 fd160, fd82, fd139; +add.f64 fd161, fd57, fd142; +add.f64 fd162, fd74, fd144; +sub.f64 fd163, fd57, fd142; +sub.f64 fd164, fd74, fd144; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd165, fd166}, [rd6]; +mul.f64 fd169, fd150, fd166; +fma.rn.f64 fd170, fd165, fd149, fd169; +mul.f64 fd171, fd149, fd166; +mul.f64 fd172, fd165, fd150; +sub.f64 fd173, fd172, fd171; +mul.f64 fd174, fd165, fd165; +mul.f64 fd175, fd166, fd166; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd166, fd165; +fma.rn.f64 fd178, fd166, fd165, fd177; +mul.f64 fd179, fd154, fd178; +fma.rn.f64 fd180, fd176, fd153, fd179; +mul.f64 fd181, fd153, fd178; +mul.f64 fd182, fd176, fd154; +sub.f64 fd183, fd182, fd181; +mul.f64 fd184, fd165, fd176; +mul.f64 fd185, fd166, fd178; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd165, fd178; +fma.rn.f64 fd188, fd166, fd176, fd187; +mul.f64 fd189, fd158, fd188; +fma.rn.f64 fd190, fd186, fd157, fd189; +mul.f64 fd191, fd157, fd188; +mul.f64 fd192, fd186, fd158; +sub.f64 fd193, fd192, fd191; +mul.f64 fd194, fd165, fd186; +mul.f64 fd195, fd166, fd188; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd165, fd188; +fma.rn.f64 fd198, fd166, fd186, fd197; +mul.f64 
fd199, fd162, fd198; +fma.rn.f64 fd200, fd196, fd161, fd199; +mul.f64 fd201, fd161, fd198; +mul.f64 fd202, fd196, fd162; +sub.f64 fd203, fd202, fd201; +mul.f64 fd204, fd165, fd196; +mul.f64 fd205, fd166, fd198; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd165, fd198; +fma.rn.f64 fd208, fd166, fd196, fd207; +mul.f64 fd209, fd148, fd208; +fma.rn.f64 fd210, fd206, fd147, fd209; +mul.f64 fd211, fd147, fd208; +mul.f64 fd212, fd206, fd148; +sub.f64 fd213, fd212, fd211; +ld.global.v2.f64 {fd214, fd215}, [rd6+160]; +mul.f64 fd218, fd152, fd215; +fma.rn.f64 fd219, fd214, fd151, fd218; +mul.f64 fd220, fd151, fd215; +mul.f64 fd221, fd214, fd152; +sub.f64 fd222, fd221, fd220; +mul.f64 fd223, fd165, fd214; +mul.f64 fd224, fd166, fd215; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd165, fd215; +fma.rn.f64 fd227, fd166, fd214, fd226; +mul.f64 fd228, fd156, fd227; +fma.rn.f64 fd229, fd225, fd155, fd228; +mul.f64 fd230, fd155, fd227; +mul.f64 fd231, fd225, fd156; +sub.f64 fd232, fd231, fd230; +mul.f64 fd233, fd165, fd225; +mul.f64 fd234, fd166, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd165, fd227; +fma.rn.f64 fd237, fd166, fd225, fd236; +mul.f64 fd238, fd160, fd237; +fma.rn.f64 fd239, fd235, fd159, fd238; +mul.f64 fd240, fd159, fd237; +mul.f64 fd241, fd235, fd160; +sub.f64 fd242, fd241, fd240; +mul.f64 fd243, fd165, fd235; +mul.f64 fd244, fd166, fd237; +sub.f64 fd245, fd243, fd244; +mul.f64 fd246, fd165, fd237; +fma.rn.f64 fd247, fd166, fd235, fd246; +mul.f64 fd248, fd164, fd247; +fma.rn.f64 fd249, fd245, fd163, fd248; +mul.f64 fd250, fd163, fd247; +mul.f64 fd251, fd245, fd164; +sub.f64 fd252, fd251, fd250; +mad.lo.s32 r8, r5, 800, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +st.shared.v2.f64 [r9], {fd145, fd170}; +st.shared.v2.f64 [r9+16], {fd180, fd190}; +st.shared.v2.f64 [r9+32], {fd200, fd210}; +st.shared.v2.f64 [r9+48], {fd219, fd229}; +st.shared.v2.f64 [r9+64], {fd239, fd249}; +barrier.sync 0; +mad.lo.s32 r10, r7, -72, r9; +ld.shared.f64 fd253, [r10]; 
+ld.shared.f64 fd254, [r10+80]; +ld.shared.f64 fd255, [r10+160]; +ld.shared.f64 fd256, [r10+240]; +ld.shared.f64 fd257, [r10+320]; +ld.shared.f64 fd258, [r10+400]; +ld.shared.f64 fd259, [r10+480]; +ld.shared.f64 fd260, [r10+560]; +ld.shared.f64 fd261, [r10+640]; +ld.shared.f64 fd262, [r10+720]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd146, fd173}; +st.shared.v2.f64 [r9+16], {fd183, fd193}; +st.shared.v2.f64 [r9+32], {fd203, fd213}; +st.shared.v2.f64 [r9+48], {fd222, fd232}; +st.shared.v2.f64 [r9+64], {fd242, fd252}; +barrier.sync 0; +ld.shared.f64 fd263, [r10]; +ld.shared.f64 fd264, [r10+80]; +ld.shared.f64 fd265, [r10+160]; +ld.shared.f64 fd266, [r10+240]; +ld.shared.f64 fd267, [r10+320]; +ld.shared.f64 fd268, [r10+400]; +ld.shared.f64 fd269, [r10+480]; +ld.shared.f64 fd270, [r10+560]; +ld.shared.f64 fd271, [r10+640]; +ld.shared.f64 fd272, [r10+720]; +add.f64 fd273, fd255, fd261; +add.f64 fd274, fd253, fd273; +add.f64 fd275, fd257, fd259; +add.f64 fd276, fd275, fd274; +add.f64 fd277, fd265, fd271; +add.f64 fd278, fd263, fd277; +add.f64 fd279, fd267, fd269; +add.f64 fd280, fd279, fd278; +fma.rn.f64 fd281, fd273, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd282, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd283, fd281, fd282; +sub.f64 fd284, fd265, fd271; +mul.f64 fd285, fd284, 0d3FEE6F0E134454FF; +sub.f64 fd286, fd267, fd269; +fma.rn.f64 fd287, fd286, 0d3FE2CF2304755A5E, fd285; +sub.f64 fd288, fd283, fd287; +add.f64 fd289, fd287, fd283; +mul.f64 fd290, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd291, fd253, fd290; +fma.rn.f64 fd292, fd275, 0d3FD3C6EF372FE950, fd291; +mul.f64 fd293, fd284, 0d3FE2CF2304755A5E; +mul.f64 fd294, fd286, 0d3FEE6F0E134454FF; +sub.f64 fd295, fd293, fd294; +sub.f64 fd296, fd292, fd295; +add.f64 fd297, fd295, fd292; +fma.rn.f64 fd298, fd277, 0d3FD3C6EF372FE950, fd263; +mul.f64 fd299, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd300, fd298, fd299; +sub.f64 fd301, fd255, fd261; +mul.f64 fd302, fd301, 0d3FEE6F0E134454FF; +sub.f64 fd303, fd257, fd259; +fma.rn.f64 fd304, 
fd303, 0d3FE2CF2304755A5E, fd302; +add.f64 fd305, fd304, fd300; +sub.f64 fd306, fd300, fd304; +mul.f64 fd307, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd308, fd263, fd307; +fma.rn.f64 fd309, fd279, 0d3FD3C6EF372FE950, fd308; +mul.f64 fd310, fd301, 0d3FE2CF2304755A5E; +mul.f64 fd311, fd303, 0d3FEE6F0E134454FF; +sub.f64 fd312, fd310, fd311; +add.f64 fd313, fd312, fd309; +sub.f64 fd314, fd309, fd312; +add.f64 fd315, fd256, fd262; +add.f64 fd316, fd254, fd315; +add.f64 fd317, fd258, fd260; +add.f64 fd318, fd317, fd316; +add.f64 fd319, fd266, fd272; +add.f64 fd320, fd264, fd319; +add.f64 fd321, fd268, fd270; +add.f64 fd322, fd321, fd320; +fma.rn.f64 fd323, fd315, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd324, fd317, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd266, fd272; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd268, fd270; +fma.rn.f64 fd329, fd328, 0d3FE2CF2304755A5E, fd327; +sub.f64 fd330, fd325, fd329; +add.f64 fd331, fd329, fd325; +mul.f64 fd332, fd315, 0d3FE9E3779B97F4A8; +sub.f64 fd333, fd254, fd332; +fma.rn.f64 fd334, fd317, 0d3FD3C6EF372FE950, fd333; +mul.f64 fd335, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd336, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd337, fd335, fd336; +sub.f64 fd338, fd334, fd337; +add.f64 fd339, fd337, fd334; +fma.rn.f64 fd340, fd319, 0d3FD3C6EF372FE950, fd264; +mul.f64 fd341, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd342, fd340, fd341; +sub.f64 fd343, fd256, fd262; +mul.f64 fd344, fd343, 0d3FEE6F0E134454FF; +sub.f64 fd345, fd258, fd260; +fma.rn.f64 fd346, fd345, 0d3FE2CF2304755A5E, fd344; +add.f64 fd347, fd346, fd342; +sub.f64 fd348, fd342, fd346; +mul.f64 fd349, fd319, 0d3FE9E3779B97F4A8; +sub.f64 fd350, fd264, fd349; +fma.rn.f64 fd351, fd321, 0d3FD3C6EF372FE950, fd350; +mul.f64 fd352, fd343, 0d3FE2CF2304755A5E; +mul.f64 fd353, fd345, 0d3FEE6F0E134454FF; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd354, fd351; +sub.f64 fd356, fd351, fd354; +mul.f64 fd357, fd330, 0d3FE9E3779B97F4A8; +mul.f64 fd358, fd347, 
0d3FE2CF2304755A5E; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd347, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd361, fd330, 0d3FE2CF2304755A5E, fd360; +mul.f64 fd362, fd338, 0d3FD3C6EF372FE950; +mul.f64 fd363, fd355, 0d3FEE6F0E134454FF; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd355, 0d3FD3C6EF372FE950; +fma.rn.f64 fd366, fd338, 0d3FEE6F0E134454FF, fd365; +mul.f64 fd367, fd339, 0dBFD3C6EF372FE950; +mul.f64 fd368, fd356, 0d3FEE6F0E134454FF; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd356, 0dBFD3C6EF372FE950; +fma.rn.f64 fd371, fd339, 0d3FEE6F0E134454FF, fd370; +mul.f64 fd372, fd331, 0dBFE9E3779B97F4A8; +mul.f64 fd373, fd348, 0d3FE2CF2304755A5E; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd348, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd376, fd331, 0d3FE2CF2304755A5E, fd375; +add.f64 %0, fd276, fd318; +add.f64 %1, fd280, fd322; +add.f64 %3, fd305, fd361; +add.f64 %2, fd288, fd359; +add.f64 %5, fd313, fd366; +add.f64 %4, fd296, fd364; +add.f64 %7, fd314, fd371; +add.f64 %6, fd297, fd369; +add.f64 %9, fd306, fd376; +add.f64 %8, fd289, fd374; +sub.f64 %10, fd276, fd318; +sub.f64 %11, fd280, fd322; +sub.f64 %13, fd305, fd361; +sub.f64 %12, fd288, fd359; +sub.f64 %15, fd313, fd366; +sub.f64 %14, fd296, fd364; +sub.f64 %17, fd314, fd371; +sub.f64 %16, fd297, fd369; +sub.f64 %19, fd306, fd376; +sub.f64 %18, fd289, fd374; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), 
"d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<737, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<417>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %20; +mad.lo.s32 r3, r1, 1600, r2; +mov.u32 r4, %tid.x; +add.f64 fd41, %27, %43; +add.f64 fd42, %22, fd41; +add.f64 fd43, %32, %38; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %29, %45; +add.f64 fd46, %23, fd45; +add.f64 fd47, %34, %39; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %22; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %29, %45; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %34, %39; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %22, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %23; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %27, %43; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %32, %38; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %23, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64 fd82, fd77, fd80; +add.f64 fd83, %30, %46; +add.f64 fd84, %24, fd83; +add.f64 fd85, %35, %40; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %31, %47; +add.f64 fd88, %26, fd87; +add.f64 
fd89, %37, %42; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %24; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %31, %47; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %37, %42; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %24, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %26; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %30, %46; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %35, %40; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %26, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; 
+sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +sub.f64 fd145, fd44, fd86; +sub.f64 fd146, fd48, fd90; +add.f64 fd147, fd56, fd127; +add.f64 fd148, fd73, fd129; +sub.f64 fd149, fd56, fd127; +sub.f64 fd150, fd73, fd129; +add.f64 fd151, fd64, fd132; +add.f64 fd152, fd81, fd134; +sub.f64 fd153, fd64, fd132; +sub.f64 fd154, fd81, fd134; +add.f64 fd155, fd65, fd137; +add.f64 fd156, fd82, fd139; +sub.f64 fd157, fd65, fd137; +sub.f64 fd158, fd82, fd139; +add.f64 fd159, fd57, fd142; +add.f64 fd160, fd74, fd144; +sub.f64 fd161, fd57, fd142; +sub.f64 fd162, fd74, fd144; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 10; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1600, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %21; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd163, fd164}, [rd6]; +mul.f64 fd167, fd148, fd164; +mul.f64 fd168, fd147, fd164; +mul.f64 fd169, fd163, fd148; +mul.f64 fd170, fd163, fd163; +mul.f64 fd171, fd164, fd164; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd164, fd163; +fma.rn.f64 fd174, fd164, fd163, fd173; +mul.f64 fd175, fd152, fd174; +mul.f64 fd176, fd151, fd174; +mul.f64 fd177, fd172, fd152; +mul.f64 fd178, fd163, fd172; +mul.f64 fd179, fd164, fd174; +sub.f64 fd180, fd178, fd179; +mul.f64 fd181, fd163, fd174; +fma.rn.f64 fd182, fd164, fd172, fd181; +mul.f64 fd183, fd156, fd182; +mul.f64 fd184, fd155, fd182; +mul.f64 fd185, fd180, fd156; +mul.f64 fd186, fd163, fd180; +mul.f64 fd187, fd164, fd182; +sub.f64 fd188, fd186, fd187; +mul.f64 fd189, fd163, fd182; +fma.rn.f64 fd190, fd164, fd180, fd189; +mul.f64 fd191, fd160, fd190; +mul.f64 fd192, fd159, fd190; +mul.f64 fd193, fd188, fd160; +mul.f64 fd194, fd163, fd188; +mul.f64 fd195, fd164, fd190; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd163, fd190; +fma.rn.f64 fd198, fd164, fd188, fd197; +mul.f64 fd199, fd146, fd198; +mul.f64 fd200, fd145, fd198; +mul.f64 fd201, fd196, 
fd146; +ld.global.v2.f64 {fd202, fd203}, [rd6+160]; +mul.f64 fd206, fd150, fd203; +mul.f64 fd207, fd149, fd203; +mul.f64 fd208, fd202, fd150; +mul.f64 fd209, fd163, fd202; +mul.f64 fd210, fd164, fd203; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd163, fd203; +fma.rn.f64 fd213, fd164, fd202, fd212; +mul.f64 fd214, fd154, fd213; +mul.f64 fd215, fd153, fd213; +mul.f64 fd216, fd211, fd154; +mul.f64 fd217, fd163, fd211; +mul.f64 fd218, fd164, fd213; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd163, fd213; +fma.rn.f64 fd221, fd164, fd211, fd220; +mul.f64 fd222, fd158, fd221; +mul.f64 fd223, fd157, fd221; +mul.f64 fd224, fd219, fd158; +mul.f64 fd225, fd163, fd219; +mul.f64 fd226, fd164, fd221; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd163, fd221; +fma.rn.f64 fd229, fd164, fd219, fd228; +mul.f64 fd230, fd162, fd229; +mul.f64 fd231, fd161, fd229; +mul.f64 fd232, fd227, fd162; +barrier.sync 0; +mad.lo.s32 r9, r7, 160, r8; +add.f64 fd233, fd48, fd90; +add.f64 fd234, fd44, fd86; +st.shared.v2.f64 [r9], {fd234, fd233}; +fma.rn.f64 fd235, fd163, fd147, fd167; +sub.f64 fd236, fd169, fd168; +st.shared.v2.f64 [r9+16], {fd235, fd236}; +fma.rn.f64 fd237, fd172, fd151, fd175; +sub.f64 fd238, fd177, fd176; +st.shared.v2.f64 [r9+32], {fd237, fd238}; +fma.rn.f64 fd239, fd180, fd155, fd183; +sub.f64 fd240, fd185, fd184; +st.shared.v2.f64 [r9+48], {fd239, fd240}; +sub.f64 fd241, fd193, fd192; +fma.rn.f64 fd242, fd188, fd159, fd191; +st.shared.v2.f64 [r9+64], {fd242, fd241}; +fma.rn.f64 fd243, fd196, fd145, fd199; +sub.f64 fd244, fd201, fd200; +st.shared.v2.f64 [r9+80], {fd243, fd244}; +fma.rn.f64 fd245, fd202, fd149, fd206; +sub.f64 fd246, fd208, fd207; +st.shared.v2.f64 [r9+96], {fd245, fd246}; +fma.rn.f64 fd247, fd211, fd153, fd214; +sub.f64 fd248, fd216, fd215; +st.shared.v2.f64 [r9+112], {fd247, fd248}; +fma.rn.f64 fd249, fd219, fd157, fd222; +sub.f64 fd250, fd224, fd223; +st.shared.v2.f64 [r9+128], {fd249, fd250}; +sub.f64 fd251, fd232, fd231; +fma.rn.f64 fd252, fd227, 
fd161, fd230; +st.shared.v2.f64 [r9+144], {fd252, fd251}; +barrier.sync 0; +mad.lo.s32 r10, r7, -144, r9; +ld.shared.v2.f64 {fd253, fd254}, [r10]; +ld.shared.v2.f64 {fd257, fd258}, [r10+160]; +ld.shared.v2.f64 {fd261, fd262}, [r10+320]; +ld.shared.v2.f64 {fd265, fd266}, [r10+480]; +ld.shared.v2.f64 {fd269, fd270}, [r10+640]; +ld.shared.v2.f64 {fd273, fd274}, [r10+800]; +ld.shared.v2.f64 {fd277, fd278}, [r10+960]; +ld.shared.v2.f64 {fd281, fd282}, [r10+1120]; +ld.shared.v2.f64 {fd285, fd286}, [r10+1280]; +ld.shared.v2.f64 {fd289, fd290}, [r10+1440]; +add.f64 fd293, fd261, fd285; +add.f64 fd294, fd253, fd293; +add.f64 fd295, fd269, fd277; +add.f64 fd296, fd295, fd294; +add.f64 fd297, fd262, fd286; +add.f64 fd298, fd254, fd297; +add.f64 fd299, fd270, fd278; +add.f64 fd300, fd299, fd298; +fma.rn.f64 fd301, fd293, 0d3FD3C6EF372FE950, fd253; +mul.f64 fd302, fd295, 0d3FE9E3779B97F4A8; +sub.f64 fd303, fd301, fd302; +sub.f64 fd304, fd262, fd286; +mul.f64 fd305, fd304, 0d3FEE6F0E134454FF; +sub.f64 fd306, fd270, fd278; +fma.rn.f64 fd307, fd306, 0d3FE2CF2304755A5E, fd305; +sub.f64 fd308, fd303, fd307; +add.f64 fd309, fd307, fd303; +mul.f64 fd310, fd293, 0d3FE9E3779B97F4A8; +sub.f64 fd311, fd253, fd310; +fma.rn.f64 fd312, fd295, 0d3FD3C6EF372FE950, fd311; +mul.f64 fd313, fd304, 0d3FE2CF2304755A5E; +mul.f64 fd314, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd315, fd313, fd314; +sub.f64 fd316, fd312, fd315; +add.f64 fd317, fd315, fd312; +fma.rn.f64 fd318, fd297, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd319, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd320, fd318, fd319; +sub.f64 fd321, fd261, fd285; +mul.f64 fd322, fd321, 0d3FEE6F0E134454FF; +sub.f64 fd323, fd269, fd277; +fma.rn.f64 fd324, fd323, 0d3FE2CF2304755A5E, fd322; +add.f64 fd325, fd324, fd320; +sub.f64 fd326, fd320, fd324; +mul.f64 fd327, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd328, fd254, fd327; +fma.rn.f64 fd329, fd299, 0d3FD3C6EF372FE950, fd328; +mul.f64 fd330, fd321, 0d3FE2CF2304755A5E; +mul.f64 fd331, fd323, 0d3FEE6F0E134454FF; 
+sub.f64 fd332, fd330, fd331; +add.f64 fd333, fd332, fd329; +sub.f64 fd334, fd329, fd332; +add.f64 fd335, fd265, fd289; +add.f64 fd336, fd257, fd335; +add.f64 fd337, fd273, fd281; +add.f64 fd338, fd337, fd336; +add.f64 fd339, fd266, fd290; +add.f64 fd340, fd258, fd339; +add.f64 fd341, fd274, fd282; +add.f64 fd342, fd341, fd340; +fma.rn.f64 fd343, fd335, 0d3FD3C6EF372FE950, fd257; +mul.f64 fd344, fd337, 0d3FE9E3779B97F4A8; +sub.f64 fd345, fd343, fd344; +sub.f64 fd346, fd266, fd290; +mul.f64 fd347, fd346, 0d3FEE6F0E134454FF; +sub.f64 fd348, fd274, fd282; +fma.rn.f64 fd349, fd348, 0d3FE2CF2304755A5E, fd347; +sub.f64 fd350, fd345, fd349; +add.f64 fd351, fd349, fd345; +mul.f64 fd352, fd335, 0d3FE9E3779B97F4A8; +sub.f64 fd353, fd257, fd352; +fma.rn.f64 fd354, fd337, 0d3FD3C6EF372FE950, fd353; +mul.f64 fd355, fd346, 0d3FE2CF2304755A5E; +mul.f64 fd356, fd348, 0d3FEE6F0E134454FF; +sub.f64 fd357, fd355, fd356; +sub.f64 fd358, fd354, fd357; +add.f64 fd359, fd357, fd354; +fma.rn.f64 fd360, fd339, 0d3FD3C6EF372FE950, fd258; +mul.f64 fd361, fd341, 0d3FE9E3779B97F4A8; +sub.f64 fd362, fd360, fd361; +sub.f64 fd363, fd265, fd289; +mul.f64 fd364, fd363, 0d3FEE6F0E134454FF; +sub.f64 fd365, fd273, fd281; +fma.rn.f64 fd366, fd365, 0d3FE2CF2304755A5E, fd364; +add.f64 fd367, fd366, fd362; +sub.f64 fd368, fd362, fd366; +mul.f64 fd369, fd339, 0d3FE9E3779B97F4A8; +sub.f64 fd370, fd258, fd369; +fma.rn.f64 fd371, fd341, 0d3FD3C6EF372FE950, fd370; +mul.f64 fd372, fd363, 0d3FE2CF2304755A5E; +mul.f64 fd373, fd365, 0d3FEE6F0E134454FF; +sub.f64 fd374, fd372, fd373; +add.f64 fd375, fd374, fd371; +sub.f64 fd376, fd371, fd374; +mul.f64 fd377, fd350, 0d3FE9E3779B97F4A8; +mul.f64 fd378, fd367, 0d3FE2CF2304755A5E; +sub.f64 fd379, fd377, fd378; +mul.f64 fd380, fd367, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd381, fd350, 0d3FE2CF2304755A5E, fd380; +mul.f64 fd382, fd358, 0d3FD3C6EF372FE950; +mul.f64 fd383, fd375, 0d3FEE6F0E134454FF; +sub.f64 fd384, fd382, fd383; +mul.f64 fd385, fd375, 0d3FD3C6EF372FE950; 
+fma.rn.f64 fd386, fd358, 0d3FEE6F0E134454FF, fd385; +mul.f64 fd387, fd359, 0dBFD3C6EF372FE950; +mul.f64 fd388, fd376, 0d3FEE6F0E134454FF; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd376, 0dBFD3C6EF372FE950; +fma.rn.f64 fd391, fd359, 0d3FEE6F0E134454FF, fd390; +mul.f64 fd392, fd351, 0dBFE9E3779B97F4A8; +mul.f64 fd393, fd368, 0d3FE2CF2304755A5E; +sub.f64 fd394, fd392, fd393; +mul.f64 fd395, fd368, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd396, fd351, 0d3FE2CF2304755A5E, fd395; +add.f64 %1, fd300, fd342; +add.f64 %0, fd296, fd338; +add.f64 %3, fd325, fd381; +add.f64 %2, fd308, fd379; +add.f64 %5, fd333, fd386; +add.f64 %4, fd316, fd384; +add.f64 %7, fd334, fd391; +add.f64 %6, fd317, fd389; +add.f64 %9, fd326, fd396; +add.f64 %8, fd309, fd394; +sub.f64 %11, fd300, fd342; +sub.f64 %10, fd296, fd338; +sub.f64 %13, fd325, fd381; +sub.f64 %12, fd308, fd379; +sub.f64 %15, fd333, fd386; +sub.f64 %14, fd316, fd384; +sub.f64 %17, fd334, fd391; +sub.f64 %16, fd317, fd389; +sub.f64 %19, fd326, fd396; +sub.f64 %18, fd309, fd394; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "r"(smem), "l"(lut_dp_10_100), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..0eef510f6eb9f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_fwd.hpp.inc @@ -0,0 +1,24942 @@ +#ifndef CUFFTDX_FFT_1024_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_1024_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<829, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<784>; +.reg .b32 r<4651>; +.reg .b64 rd<3>; +mov.u32 r4575, %tid.y; +shl.b32 r4576, r4575, 13; +mov.u32 r4577, %64; +add.s32 r4578, r4577, r4576; +mov.u32 r4579, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, 
{low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 
r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; 
+} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ 
+mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, 
r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ 
+add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ 
+mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; 
+cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, 
r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, 
r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ 
+fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, 
r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4580, r4579, 31; +shl.b32 r4581, r4579, 8; +and.b32 r4582, r4581, -8192; +add.s32 r4583, r4578, r4582; +cvt.rn.f32.u32 f779, r4580; +mul.f32 f780, f779, 0f3BC90FDB; +cos.approx.f32 f357, f780; +sin.approx.f32 f781, f780; +neg.f32 f358, f781; +mov.f32 f783, 0fBF800000; +mov.f32 f782, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ 
+fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ 
+neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2332, 
{low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ 
+mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, 
low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, 
r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r4584, r4581, 7936; +add.s32 r4585, r4583, r4584; 
+st.shared.v4.f32 [r4585], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r4585+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r4585+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r4585+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r4585+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r4585+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r4585+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r4585+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r4585+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r4585+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r4585+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r4585+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r4585+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r4585+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r4585+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r4585+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r4586, r4580, -248, r4585; +ld.shared.u32 r2864, [r4586]; +ld.shared.u32 r2867, [r4586+4]; +ld.shared.u32 r3480, [r4586+256]; +ld.shared.u32 r3483, [r4586+260]; +ld.shared.u32 r3060, [r4586+512]; +ld.shared.u32 r3063, [r4586+516]; +ld.shared.u32 r3676, [r4586+768]; +ld.shared.u32 r3679, [r4586+772]; +ld.shared.u32 r2914, [r4586+1024]; +ld.shared.u32 r2917, [r4586+1028]; +ld.shared.u32 r3530, [r4586+1280]; +ld.shared.u32 r3533, [r4586+1284]; +ld.shared.u32 r3110, [r4586+1536]; +ld.shared.u32 r3113, [r4586+1540]; +ld.shared.u32 r3726, [r4586+1792]; +ld.shared.u32 r3729, [r4586+1796]; +ld.shared.u32 r2876, [r4586+2048]; +ld.shared.u32 r2879, [r4586+2052]; +ld.shared.u32 r3492, [r4586+2304]; +ld.shared.u32 r3495, [r4586+2308]; +ld.shared.u32 r3072, [r4586+2560]; +ld.shared.u32 r3075, [r4586+2564]; +ld.shared.u32 r3688, [r4586+2816]; +ld.shared.u32 r3691, [r4586+2820]; +ld.shared.u32 r2926, [r4586+3072]; +ld.shared.u32 r2929, [r4586+3076]; +ld.shared.u32 r3542, [r4586+3328]; +ld.shared.u32 
r3545, [r4586+3332]; +ld.shared.u32 r3122, [r4586+3584]; +ld.shared.u32 r3125, [r4586+3588]; +ld.shared.u32 r3738, [r4586+3840]; +ld.shared.u32 r3741, [r4586+3844]; +ld.shared.u32 r2865, [r4586+4096]; +ld.shared.u32 r2868, [r4586+4100]; +ld.shared.u32 r3481, [r4586+4352]; +ld.shared.u32 r3484, [r4586+4356]; +ld.shared.u32 r3061, [r4586+4608]; +ld.shared.u32 r3064, [r4586+4612]; +ld.shared.u32 r3677, [r4586+4864]; +ld.shared.u32 r3680, [r4586+4868]; +ld.shared.u32 r2915, [r4586+5120]; +ld.shared.u32 r2918, [r4586+5124]; +ld.shared.u32 r3531, [r4586+5376]; +ld.shared.u32 r3534, [r4586+5380]; +ld.shared.u32 r3111, [r4586+5632]; +ld.shared.u32 r3114, [r4586+5636]; +ld.shared.u32 r3727, [r4586+5888]; +ld.shared.u32 r3730, [r4586+5892]; +ld.shared.u32 r2877, [r4586+6144]; +ld.shared.u32 r2880, [r4586+6148]; +ld.shared.u32 r3493, [r4586+6400]; +ld.shared.u32 r3496, [r4586+6404]; +ld.shared.u32 r3073, [r4586+6656]; +ld.shared.u32 r3076, [r4586+6660]; +ld.shared.u32 r3689, [r4586+6912]; +ld.shared.u32 r3692, [r4586+6916]; +ld.shared.u32 r2927, [r4586+7168]; +ld.shared.u32 r2930, [r4586+7172]; +ld.shared.u32 r3543, [r4586+7424]; +ld.shared.u32 r3546, [r4586+7428]; +ld.shared.u32 r3123, [r4586+7680]; +ld.shared.u32 r3126, [r4586+7684]; +ld.shared.u32 r3739, [r4586+7936]; +ld.shared.u32 r3742, [r4586+7940]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ 
+add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, 
r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, 
r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, 
r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; 
+cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} 
+{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, 
r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ 
+mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, 
r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; 
+mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ 
+mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ 
+add.f16x2 %0, r3383, r3999; +} +{ +add.f16x2 %1, r3386, r4002; +} +{ +sub.f16x2 %32, r3383, r3999; +} +{ +sub.f16x2 %33, r3386, r4002; +} +{ +add.f16x2 %2, r3395, r4163; +} +{ +add.f16x2 %3, r3398, r4169; +} +{ +sub.f16x2 %34, r3395, r4163; +} +{ +sub.f16x2 %35, r3398, r4169; +} +{ +add.f16x2 %4, r3407, r4179; +} +{ +add.f16x2 %5, r3410, r4185; +} +{ +sub.f16x2 %36, r3407, r4179; +} +{ +sub.f16x2 %37, r3410, r4185; +} +{ +add.f16x2 %6, r3419, r4195; +} +{ +add.f16x2 %7, r3422, r4201; +} +{ +sub.f16x2 %38, r3419, r4195; +} +{ +sub.f16x2 %39, r3422, r4201; +} +{ +add.f16x2 %8, r3431, r4211; +} +{ +add.f16x2 %9, r3434, r4217; +} +{ +sub.f16x2 %40, r3431, r4211; +} +{ +sub.f16x2 %41, r3434, r4217; +} +{ +add.f16x2 %10, r3443, r4227; +} +{ +add.f16x2 %11, r3446, r4233; +} +{ +sub.f16x2 %42, r3443, r4227; +} +{ +sub.f16x2 %43, r3446, r4233; +} +{ +add.f16x2 %12, r3455, r4243; +} +{ +add.f16x2 %13, r3458, r4249; +} +{ +sub.f16x2 %44, r3455, r4243; +} +{ +sub.f16x2 %45, r3458, r4249; +} +{ +add.f16x2 %14, r3467, r4259; +} +{ +add.f16x2 %15, r3470, r4265; +} +{ +sub.f16x2 %46, r3467, r4259; +} +{ +sub.f16x2 %47, r3470, r4265; +} +{ +add.f16x2 %16, r3389, r4008; +} +{ +add.f16x2 %17, r3392, r4269; +} +{ +sub.f16x2 %48, r3389, r4008; +} +{ +sub.f16x2 %49, r3392, r4269; +} +{ +add.f16x2 %18, r3401, r4277; +} +{ +add.f16x2 %19, r3404, r4283; +} +{ +sub.f16x2 %50, r3401, r4277; +} +{ +sub.f16x2 %51, r3404, r4283; +} +{ +add.f16x2 %20, r3413, r4293; +} +{ +add.f16x2 %21, r3416, r4299; +} +{ +sub.f16x2 %52, r3413, r4293; +} +{ +sub.f16x2 %53, r3416, r4299; +} +{ +add.f16x2 %22, r3425, r4309; +} +{ +add.f16x2 %23, r3428, r4315; +} +{ +sub.f16x2 %54, r3425, r4309; +} +{ +sub.f16x2 %55, r3428, r4315; +} +{ +add.f16x2 %24, r3437, r4325; +} +{ +add.f16x2 %25, r3440, r4331; +} +{ +sub.f16x2 %56, r3437, r4325; +} +{ +sub.f16x2 %57, r3440, r4331; +} +{ +add.f16x2 %26, r3449, r4341; +} +{ +add.f16x2 %27, r3452, r4347; +} +{ +sub.f16x2 %58, r3449, r4341; +} +{ +sub.f16x2 %59, r3452, r4347; 
+} +{ +add.f16x2 %28, r3461, r4357; +} +{ +add.f16x2 %29, r3464, r4363; +} +{ +sub.f16x2 %60, r3461, r4357; +} +{ +sub.f16x2 %61, r3464, r4363; +} +{ +add.f16x2 %30, r3473, r4373; +} +{ +add.f16x2 %31, r3476, r4379; +} +{ +sub.f16x2 %62, r3473, r4373; +} +{ +sub.f16x2 %63, r3476, r4379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), 
"=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), 
"r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<827, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2569>; +.reg .b64 rd<2>; +mov.u32 r2549, %tid.y; +shl.b32 r2550, r2549, 12; +mov.u32 r2551, %32; +add.s32 r2552, r2551, r2550; +mov.u32 r2553, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, 
{low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ 
+sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 
high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ 
+sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2554, r2553, 63; +shl.b32 r2555, r2553, 6; +and.b32 r2556, r2555, -4096; +add.s32 r2557, r2552, r2556; +cvt.rn.f32.u32 f301, r2554; +mul.f32 f302, f301, 0f3BC90FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ 
+neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, 
r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2558, r2555, 4032; +add.s32 r2559, r2557, r2558; +st.shared.v4.f32 [r2559], {r521, r629, r666, r703}; +st.shared.v4.f32 [r2559+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r2559+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r2559+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r2560, r2554, -60, r2559; +ld.shared.u32 r1176, [r2560]; +ld.shared.u32 r1372, [r2560+256]; +ld.shared.u32 r1226, [r2560+512]; +ld.shared.u32 r1422, [r2560+768]; +ld.shared.u32 r1188, [r2560+1024]; +ld.shared.u32 r1384, [r2560+1280]; +ld.shared.u32 r1238, [r2560+1536]; +ld.shared.u32 r1434, [r2560+1792]; +ld.shared.u32 r1177, [r2560+2048]; +ld.shared.u32 r1373, [r2560+2304]; +ld.shared.u32 r1227, [r2560+2560]; +ld.shared.u32 r1423, [r2560+2816]; +ld.shared.u32 r1189, [r2560+3072]; +ld.shared.u32 r1385, [r2560+3328]; +ld.shared.u32 r1239, [r2560+3584]; +ld.shared.u32 r1435, [r2560+3840]; +barrier.sync 0; +st.shared.v4.f32 [r2559], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2559+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2559+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2559+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2560]; +ld.shared.u32 r1375, [r2560+256]; +ld.shared.u32 r1229, [r2560+512]; +ld.shared.u32 r1425, [r2560+768]; +ld.shared.u32 r1191, [r2560+1024]; +ld.shared.u32 r1387, [r2560+1280]; +ld.shared.u32 r1241, [r2560+1536]; +ld.shared.u32 r1437, [r2560+1792]; +ld.shared.u32 r1180, [r2560+2048]; +ld.shared.u32 r1376, [r2560+2304]; +ld.shared.u32 r1230, [r2560+2560]; +ld.shared.u32 r1426, [r2560+2816]; +ld.shared.u32 r1192, [r2560+3072]; +ld.shared.u32 r1388, [r2560+3328]; +ld.shared.u32 r1242, [r2560+3584]; +ld.shared.u32 r1438, [r2560+3840]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, 
r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, 
r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, 
r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2561, r2553, 48; +bfe.u32 r2562, r2553, 4, 2; +shl.b32 r2563, r2553, 2; +and.b32 r2564, r2563, 60; +add.s32 r2565, r2557, r2564; +cvt.rn.f32.u32 f304, r2562; +mul.f32 f305, f304, 0f3DC90FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, 
r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ 
+fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, 
r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; 
+} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r2566, r2555, 3072; +add.s32 r2567, r2565, r2566; +st.shared.u32 [r2567], r1695; +st.shared.u32 [r2567+64], r1803; +st.shared.u32 [r2567+128], r1840; +st.shared.u32 [r2567+192], r1877; +st.shared.u32 [r2567+256], r1914; +st.shared.u32 [r2567+320], r1951; +st.shared.u32 [r2567+384], r1988; +st.shared.u32 [r2567+448], r2025; +st.shared.u32 [r2567+512], r2062; +st.shared.u32 [r2567+576], r2099; +st.shared.u32 [r2567+640], r2136; +st.shared.u32 [r2567+704], r2173; +st.shared.u32 [r2567+768], r2210; +st.shared.u32 [r2567+832], r2247; +st.shared.u32 [r2567+896], r2284; +st.shared.u32 [r2567+960], r2321; +barrier.sync 0; +mad.lo.s32 r2568, r2561, -60, r2567; +ld.shared.u32 r2350, [r2568]; +ld.shared.u32 r2400, [r2568+256]; +ld.shared.u32 r2450, [r2568+512]; +ld.shared.u32 r2500, [r2568+768]; +ld.shared.u32 r2362, [r2568+1024]; +ld.shared.u32 r2412, [r2568+1280]; +ld.shared.u32 r2462, [r2568+1536]; +ld.shared.u32 r2512, [r2568+1792]; +ld.shared.u32 r2351, [r2568+2048]; +ld.shared.u32 r2401, [r2568+2304]; +ld.shared.u32 r2451, [r2568+2560]; +ld.shared.u32 r2501, [r2568+2816]; +ld.shared.u32 r2363, [r2568+3072]; +ld.shared.u32 r2413, [r2568+3328]; +ld.shared.u32 r2463, [r2568+3584]; +ld.shared.u32 r2513, [r2568+3840]; +barrier.sync 0; +st.shared.u32 [r2567], r1698; +st.shared.u32 [r2567+64], r1810; +st.shared.u32 [r2567+128], r1847; +st.shared.u32 [r2567+192], r1884; +st.shared.u32 [r2567+256], r1921; +st.shared.u32 [r2567+320], r1958; +st.shared.u32 [r2567+384], r1995; +st.shared.u32 [r2567+448], r2032; +st.shared.u32 [r2567+512], r2069; +st.shared.u32 [r2567+576], r2106; +st.shared.u32 [r2567+640], r2143; +st.shared.u32 [r2567+704], r2180; +st.shared.u32 [r2567+768], r2217; +st.shared.u32 [r2567+832], r2254; +st.shared.u32 [r2567+896], r2291; +st.shared.u32 [r2567+960], r2328; +barrier.sync 0; +ld.shared.u32 
r2353, [r2568]; +ld.shared.u32 r2403, [r2568+256]; +ld.shared.u32 r2453, [r2568+512]; +ld.shared.u32 r2503, [r2568+768]; +ld.shared.u32 r2365, [r2568+1024]; +ld.shared.u32 r2415, [r2568+1280]; +ld.shared.u32 r2465, [r2568+1536]; +ld.shared.u32 r2515, [r2568+1792]; +ld.shared.u32 r2354, [r2568+2048]; +ld.shared.u32 r2404, [r2568+2304]; +ld.shared.u32 r2454, [r2568+2560]; +ld.shared.u32 r2504, [r2568+2816]; +ld.shared.u32 r2366, [r2568+3072]; +ld.shared.u32 r2416, [r2568+3328]; +ld.shared.u32 r2466, [r2568+3584]; +ld.shared.u32 r2516, [r2568+3840]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 %0, r2349, r2361; +} +{ +add.f16x2 %1, r2352, r2364; +} +{ +sub.f16x2 %16, r2349, r2361; +} +{ +sub.f16x2 %17, r2352, r2364; +} +{ +add.f16x2 %8, r2355, r2370; +} +{ +add.f16x2 %9, r2358, r2373; +} +{ +sub.f16x2 %24, r2355, r2370; +} +{ +sub.f16x2 %25, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 %2, r2399, r2411; +} +{ +add.f16x2 %3, r2402, r2414; +} +{ +sub.f16x2 %18, r2399, r2411; +} +{ +sub.f16x2 %19, r2402, r2414; +} +{ +add.f16x2 %10, r2405, r2420; +} +{ +add.f16x2 %11, r2408, r2423; +} +{ +sub.f16x2 %26, r2405, r2420; +} +{ +sub.f16x2 %27, r2408, r2423; +} +{ +add.f16x2 r2449, r2450, r2451; +} +{ +add.f16x2 r2452, r2453, r2454; +} +{ +sub.f16x2 r2455, r2450, r2451; +} +{ +sub.f16x2 r2458, r2453, r2454; +} +{ +add.f16x2 r2461, r2462, r2463; +} +{ +add.f16x2 
r2464, r2465, r2466; +} +{ +sub.f16x2 r2467, r2462, r2463; +} +{ +sub.f16x2 r2470, r2465, r2466; +} +{ +neg.f16x2 r2473, r2467; +} +{ +add.f16x2 %4, r2449, r2461; +} +{ +add.f16x2 %5, r2452, r2464; +} +{ +sub.f16x2 %20, r2449, r2461; +} +{ +sub.f16x2 %21, r2452, r2464; +} +{ +add.f16x2 %12, r2455, r2470; +} +{ +add.f16x2 %13, r2458, r2473; +} +{ +sub.f16x2 %28, r2455, r2470; +} +{ +sub.f16x2 %29, r2458, r2473; +} +{ +add.f16x2 r2499, r2500, r2501; +} +{ +add.f16x2 r2502, r2503, r2504; +} +{ +sub.f16x2 r2505, r2500, r2501; +} +{ +sub.f16x2 r2508, r2503, r2504; +} +{ +add.f16x2 r2511, r2512, r2513; +} +{ +add.f16x2 r2514, r2515, r2516; +} +{ +sub.f16x2 r2517, r2512, r2513; +} +{ +sub.f16x2 r2520, r2515, r2516; +} +{ +neg.f16x2 r2523, r2517; +} +{ +add.f16x2 %6, r2499, r2511; +} +{ +add.f16x2 %7, r2502, r2514; +} +{ +sub.f16x2 %22, r2499, r2511; +} +{ +sub.f16x2 %23, r2502, r2514; +} +{ +add.f16x2 %14, r2505, r2520; +} +{ +add.f16x2 %15, r2508, r2523; +} +{ +sub.f16x2 %30, r2505, r2520; +} +{ +sub.f16x2 %31, r2508, r2523; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), 
"=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + /*
 * cufftdx_private_function<830, __half2, 1>: inline-PTX routine (the text
 * appears to be machine-generated; avoid editing the PTX string by hand).
 *
 * rmem: per-thread array of complex<__half2>; the .x and .y members of
 *       rmem[0..7] are read as asm inputs (%17-%32) and overwritten with the
 *       results (%0-%15).
 * smem: 32-bit value used as the base address for st.shared / ld.shared
 *       (%16); the code adds (%tid.y << 12) and offsets derived from %tid.x.
 *
 * The body alternates f16x2 add/sub/mul/fma arithmetic with three rounds of
 * data exchange through shared memory. In every round the values derived from
 * the .x members and those derived from the .y members are exchanged
 * separately, and the store and load phases are separated by `barrier.sync 0`,
 * so every thread participating in barrier 0 has to reach this call.
 * Rotation factors are built with cos.approx / sin.approx from bit-fields of
 * %tid.x scaled by 0f3BC90FDB, 0f3D490FDB and 0f3EC90FDB, then rounded to f16.
 *
 * NOTE(review): those scales equal 2*pi/1024, 2*pi/128 and 2*pi/16, which looks
 * like a 1024-point FFT with 8 values per thread - confirm with the generator.
 * NOTE(review): the asm statement declares no "memory" clobber although it
 * reads and writes shared memory - confirm that this is intentional.
 */+template<> __forceinline__ __device__ void cufftdx_private_function<830, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1450>; +.reg .b64 rd<2>; +mov.u32 r1423, %tid.y; +shl.b32 r1424, r1423, 12; +mov.u32 r1425, %16; +add.s32 r1426, r1425, r1424; +mov.u32 r1427, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10,
r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{
+sub.f16x2 r194, r48, r145; +} +and.b32 r1428, r1427, 127; +shl.b32 r1429, r1427, 5; +and.b32 r1430, r1429, -4096; +add.s32 r1431, r1426, r1430; +cvt.rn.f32.u32 f139, r1428; +mul.f32 f140, f139, 0f3BC90FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268,
{high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +}
+{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1432, r1429, 4064; +add.s32 r1433, r1431, r1432; +st.shared.v4.f32 [r1433], {r149, r209, r246, r283}; +st.shared.v4.f32 [r1433+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1434, r1428, -28, r1433; +ld.shared.u32
r460, [r1434]; +ld.shared.u32 r510, [r1434+512]; +ld.shared.u32 r472, [r1434+1024]; +ld.shared.u32 r522, [r1434+1536]; +ld.shared.u32 r461, [r1434+2048]; +ld.shared.u32 r511, [r1434+2560]; +ld.shared.u32 r473, [r1434+3072]; +ld.shared.u32 r523, [r1434+3584]; +barrier.sync 0; +st.shared.v4.f32 [r1433], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1433+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1434]; +ld.shared.u32 r513, [r1434+512]; +ld.shared.u32 r475, [r1434+1024]; +ld.shared.u32 r525, [r1434+1536]; +ld.shared.u32 r464, [r1434+2048]; +ld.shared.u32 r514, [r1434+2560]; +ld.shared.u32 r476, [r1434+3072]; +ld.shared.u32 r526, [r1434+3584]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high;
+cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1435, r1427, 120; +bfe.u32 r1436, r1427, 3, 4; +shl.b32 r1437, r1427, 2; +and.b32 r1438, r1437, 28; +add.s32 r1439, r1431, r1438; +cvt.rn.f32.u32 f142, r1436; +mul.f32 f143, f142, 0f3D490FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658,
r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{
+mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{
+mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1440, r1429, 3840; +add.s32 r1441, r1439, r1440; +st.shared.u32 [r1441], r607; +st.shared.u32 [r1441+32], r667; +st.shared.u32 [r1441+64], r704; +st.shared.u32 [r1441+96], r741; +st.shared.u32 [r1441+128], r778; +st.shared.u32 [r1441+160], r815; +st.shared.u32 [r1441+192], r852; +st.shared.u32 [r1441+224], r889; +barrier.sync 0; +mad.lo.s32 r1442, r1435, -28, r1441; +ld.shared.u32 r918, [r1442]; +ld.shared.u32 r968, [r1442+512]; +ld.shared.u32 r930, [r1442+1024]; +ld.shared.u32 r980, [r1442+1536]; +ld.shared.u32 r919, [r1442+2048]; +ld.shared.u32 r969, [r1442+2560]; +ld.shared.u32 r931, [r1442+3072]; +ld.shared.u32 r981, [r1442+3584]; +barrier.sync 0; +st.shared.u32 [r1441], r610; +st.shared.u32 [r1441+32], r674; +st.shared.u32 [r1441+64], r711; +st.shared.u32 [r1441+96], r748; +st.shared.u32 [r1441+128], r785; +st.shared.u32 [r1441+160], r822; +st.shared.u32 [r1441+192], r859; +st.shared.u32 [r1441+224], r896; +barrier.sync 0; +ld.shared.u32
r921, [r1442]; +ld.shared.u32 r971, [r1442+512]; +ld.shared.u32 r933, [r1442+1024]; +ld.shared.u32 r983, [r1442+1536]; +ld.shared.u32 r922, [r1442+2048]; +ld.shared.u32 r972, [r1442+2560]; +ld.shared.u32 r934, [r1442+3072]; +ld.shared.u32 r984, [r1442+3584]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034;
+} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1443, r1427, 64; +bfe.u32 r1444, r1427, 6, 1; +and.b32 r1445, r1437, 252; +add.s32 r1446, r1431, r1445; +cvt.rn.f32.u32 f145, r1444; +mul.f32 f146, f145, 0f3EC90FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113,
r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high;
+mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314;
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +barrier.sync 0; +and.b32 r1447, r1429, 2048; +add.s32 r1448, r1446, r1447; +st.shared.u32 [r1448], r1065; +st.shared.u32 [r1448+256], r1125; +st.shared.u32 [r1448+512], r1162; +st.shared.u32 [r1448+768], r1199; +st.shared.u32 [r1448+1024], r1236; +st.shared.u32 [r1448+1280], r1273; +st.shared.u32 [r1448+1536], r1310; +st.shared.u32 [r1448+1792], r1347; +barrier.sync 0; +mad.lo.s32 r1449, r1443, -28, r1448; +ld.shared.u32 r1376, [r1449]; +ld.shared.u32 r1388, [r1449+512]; +ld.shared.u32 r1400, [r1449+1024]; +ld.shared.u32 r1412, [r1449+1536]; +ld.shared.u32 r1377, [r1449+2048]; +ld.shared.u32 r1389, [r1449+2560]; +ld.shared.u32 r1401, [r1449+3072]; +ld.shared.u32 r1413, [r1449+3584]; +barrier.sync 0; +st.shared.u32 [r1448], r1068; +st.shared.u32 [r1448+256], r1132; +st.shared.u32 [r1448+512], r1169; +st.shared.u32 [r1448+768], r1206; +st.shared.u32 [r1448+1024], r1243; +st.shared.u32 [r1448+1280], r1280; +st.shared.u32 [r1448+1536], r1317; +st.shared.u32 [r1448+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1449]; +ld.shared.u32 r1391, [r1449+512]; +ld.shared.u32 r1403, [r1449+1024]; +ld.shared.u32
r1415, [r1449+1536]; +ld.shared.u32 r1380, [r1449+2048]; +ld.shared.u32 r1392, [r1449+2560]; +ld.shared.u32 r1404, [r1449+3072]; +ld.shared.u32 r1416, [r1449+3584]; +{ +add.f16x2 %0, r1376, r1377; +} +{ +add.f16x2 %1, r1379, r1380; +} +{ +sub.f16x2 %8, r1376, r1377; +} +{ +sub.f16x2 %9, r1379, r1380; +} +{ +add.f16x2 %2, r1388, r1389; +} +{ +add.f16x2 %3, r1391, r1392; +} +{ +sub.f16x2 %10, r1388, r1389; +} +{ +sub.f16x2 %11, r1391, r1392; +} +{ +add.f16x2 %4, r1400, r1401; +} +{ +add.f16x2 %5, r1403, r1404; +} +{ +sub.f16x2 %12, r1400, r1401; +} +{ +sub.f16x2 %13, r1403, r1404; +} +{ +add.f16x2 %6, r1412, r1413; +} +{ +add.f16x2 %7, r1415, r1416; +} +{ +sub.f16x2 %14, r1412, r1413; +} +{ +sub.f16x2 %15, r1415, r1416; +} +})" + /* outputs %0-%15: .x and .y of rmem[0] through rmem[7] */: "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y))/* inputs: %16 is smem, %17-%32 are .x and .y of rmem[0] through rmem[7] */: "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<831, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<784>; +.reg .b32 r<4651>; +.reg .b64 rd<3>;
+mov.u32 r4575, %tid.y; +shl.b32 r4576, r4575, 12; +mov.u32 r4577, %64; +add.s32 r4578, r4577, r4576; +mov.u32 r4579, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, {low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 
r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 
r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ 
+sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, 
r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ 
+sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ 
+mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; 
+cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, 
r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, 
r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, 
r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4580, r4579, 31; +shl.b32 r4581, r4579, 7; +and.b32 r4582, r4581, -4096; +add.s32 r4583, r4578, r4582; +cvt.rn.f32.u32 f779, r4580; +mul.f32 f780, f779, 0f3BC90FDB; +cos.approx.f32 f357, f780; +sin.approx.f32 f781, f780; +neg.f32 f358, f781; +mov.f32 f783, 0fBF800000; +mov.f32 f782, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, 
r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; 
+mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, 
r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 
r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 
r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ 
+mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 
r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; 
+cvt.rn.f16.f32 high, f782; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, 
r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r4584, r4581, 3968; +add.s32 r4585, r4583, r4584; +st.shared.v4.f32 [r4585], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r4585+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r4585+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r4585+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r4585+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r4585+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r4585+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r4585+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r4586, r4580, -124, r4585; +ld.shared.u32 r2864, [r4586]; +ld.shared.u32 r3480, [r4586+128]; +ld.shared.u32 r3060, [r4586+256]; +ld.shared.u32 r3676, [r4586+384]; 
+ld.shared.u32 r2914, [r4586+512]; +ld.shared.u32 r3530, [r4586+640]; +ld.shared.u32 r3110, [r4586+768]; +ld.shared.u32 r3726, [r4586+896]; +ld.shared.u32 r2876, [r4586+1024]; +ld.shared.u32 r3492, [r4586+1152]; +ld.shared.u32 r3072, [r4586+1280]; +ld.shared.u32 r3688, [r4586+1408]; +ld.shared.u32 r2926, [r4586+1536]; +ld.shared.u32 r3542, [r4586+1664]; +ld.shared.u32 r3122, [r4586+1792]; +ld.shared.u32 r3738, [r4586+1920]; +ld.shared.u32 r2865, [r4586+2048]; +ld.shared.u32 r3481, [r4586+2176]; +ld.shared.u32 r3061, [r4586+2304]; +ld.shared.u32 r3677, [r4586+2432]; +ld.shared.u32 r2915, [r4586+2560]; +ld.shared.u32 r3531, [r4586+2688]; +ld.shared.u32 r3111, [r4586+2816]; +ld.shared.u32 r3727, [r4586+2944]; +ld.shared.u32 r2877, [r4586+3072]; +ld.shared.u32 r3493, [r4586+3200]; +ld.shared.u32 r3073, [r4586+3328]; +ld.shared.u32 r3689, [r4586+3456]; +ld.shared.u32 r2927, [r4586+3584]; +ld.shared.u32 r3543, [r4586+3712]; +ld.shared.u32 r3123, [r4586+3840]; +ld.shared.u32 r3739, [r4586+3968]; +barrier.sync 0; +st.shared.v4.f32 [r4585], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r4585+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r4585+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r4585+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r4585+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r4585+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r4585+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r4585+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r4586]; +ld.shared.u32 r3483, [r4586+128]; +ld.shared.u32 r3063, [r4586+256]; +ld.shared.u32 r3679, [r4586+384]; +ld.shared.u32 r2917, [r4586+512]; +ld.shared.u32 r3533, [r4586+640]; +ld.shared.u32 r3113, [r4586+768]; +ld.shared.u32 r3729, [r4586+896]; +ld.shared.u32 r2879, [r4586+1024]; +ld.shared.u32 r3495, [r4586+1152]; +ld.shared.u32 r3075, [r4586+1280]; +ld.shared.u32 r3691, [r4586+1408]; +ld.shared.u32 r2929, [r4586+1536]; +ld.shared.u32 r3545, 
[r4586+1664]; +ld.shared.u32 r3125, [r4586+1792]; +ld.shared.u32 r3741, [r4586+1920]; +ld.shared.u32 r2868, [r4586+2048]; +ld.shared.u32 r3484, [r4586+2176]; +ld.shared.u32 r3064, [r4586+2304]; +ld.shared.u32 r3680, [r4586+2432]; +ld.shared.u32 r2918, [r4586+2560]; +ld.shared.u32 r3534, [r4586+2688]; +ld.shared.u32 r3114, [r4586+2816]; +ld.shared.u32 r3730, [r4586+2944]; +ld.shared.u32 r2880, [r4586+3072]; +ld.shared.u32 r3496, [r4586+3200]; +ld.shared.u32 r3076, [r4586+3328]; +ld.shared.u32 r3692, [r4586+3456]; +ld.shared.u32 r2930, [r4586+3584]; +ld.shared.u32 r3546, [r4586+3712]; +ld.shared.u32 r3126, [r4586+3840]; +ld.shared.u32 r3742, [r4586+3968]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; 
+cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} 
+{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, 
r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ 
+mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, 
r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, 
r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 
r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} 
+{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; 
+mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ 
+sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 %0, r3383, r3999; +} +{ +add.f16x2 %1, r3386, r4002; +} +{ +sub.f16x2 %32, r3383, r3999; +} +{ +sub.f16x2 %33, r3386, r4002; +} +{ +add.f16x2 %2, r3395, r4163; +} +{ +add.f16x2 %3, r3398, r4169; +} +{ +sub.f16x2 %34, r3395, r4163; +} +{ +sub.f16x2 %35, r3398, r4169; +} +{ +add.f16x2 %4, r3407, r4179; +} +{ +add.f16x2 %5, r3410, r4185; +} +{ +sub.f16x2 %36, r3407, r4179; +} +{ +sub.f16x2 %37, r3410, r4185; +} +{ +add.f16x2 %6, r3419, r4195; +} +{ +add.f16x2 %7, r3422, r4201; +} +{ +sub.f16x2 %38, r3419, r4195; +} +{ +sub.f16x2 %39, r3422, r4201; +} +{ +add.f16x2 %8, r3431, r4211; +} +{ +add.f16x2 %9, r3434, r4217; +} +{ +sub.f16x2 %40, r3431, r4211; +} +{ 
+sub.f16x2 %41, r3434, r4217; +} +{ +add.f16x2 %10, r3443, r4227; +} +{ +add.f16x2 %11, r3446, r4233; +} +{ +sub.f16x2 %42, r3443, r4227; +} +{ +sub.f16x2 %43, r3446, r4233; +} +{ +add.f16x2 %12, r3455, r4243; +} +{ +add.f16x2 %13, r3458, r4249; +} +{ +sub.f16x2 %44, r3455, r4243; +} +{ +sub.f16x2 %45, r3458, r4249; +} +{ +add.f16x2 %14, r3467, r4259; +} +{ +add.f16x2 %15, r3470, r4265; +} +{ +sub.f16x2 %46, r3467, r4259; +} +{ +sub.f16x2 %47, r3470, r4265; +} +{ +add.f16x2 %16, r3389, r4008; +} +{ +add.f16x2 %17, r3392, r4269; +} +{ +sub.f16x2 %48, r3389, r4008; +} +{ +sub.f16x2 %49, r3392, r4269; +} +{ +add.f16x2 %18, r3401, r4277; +} +{ +add.f16x2 %19, r3404, r4283; +} +{ +sub.f16x2 %50, r3401, r4277; +} +{ +sub.f16x2 %51, r3404, r4283; +} +{ +add.f16x2 %20, r3413, r4293; +} +{ +add.f16x2 %21, r3416, r4299; +} +{ +sub.f16x2 %52, r3413, r4293; +} +{ +sub.f16x2 %53, r3416, r4299; +} +{ +add.f16x2 %22, r3425, r4309; +} +{ +add.f16x2 %23, r3428, r4315; +} +{ +sub.f16x2 %54, r3425, r4309; +} +{ +sub.f16x2 %55, r3428, r4315; +} +{ +add.f16x2 %24, r3437, r4325; +} +{ +add.f16x2 %25, r3440, r4331; +} +{ +sub.f16x2 %56, r3437, r4325; +} +{ +sub.f16x2 %57, r3440, r4331; +} +{ +add.f16x2 %26, r3449, r4341; +} +{ +add.f16x2 %27, r3452, r4347; +} +{ +sub.f16x2 %58, r3449, r4341; +} +{ +sub.f16x2 %59, r3452, r4347; +} +{ +add.f16x2 %28, r3461, r4357; +} +{ +add.f16x2 %29, r3464, r4363; +} +{ +sub.f16x2 %60, r3461, r4357; +} +{ +sub.f16x2 %61, r3464, r4363; +} +{ +add.f16x2 %30, r3473, r4373; +} +{ +add.f16x2 %31, r3476, r4379; +} +{ +sub.f16x2 %62, r3473, r4373; +} +{ +sub.f16x2 %63, r3476, r4379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), 
"r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<828, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 
f<148>; +.reg .b32 r<1450>; +.reg .b64 rd<2>; +mov.u32 r1423, %tid.y; +shl.b32 r1424, r1423, 13; +mov.u32 r1425, %16; +add.s32 r1426, r1425, r1424; +mov.u32 r1427, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} 
+{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1428, r1427, 127; +shl.b32 r1429, r1427, 6; +and.b32 r1430, r1429, -8192; +add.s32 r1431, r1426, r1430; +cvt.rn.f32.u32 f139, r1428; +mul.f32 f140, f139, 0f3BC90FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 
r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1432, r1429, 8128; +add.s32 r1433, r1431, r1432; +st.shared.v4.f32 [r1433], {r149, r152, r209, r216}; +st.shared.v4.f32 [r1433+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1433+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1433+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1434, r1428, -56, r1433; +ld.shared.u32 r460, [r1434]; +ld.shared.u32 r463, [r1434+4]; +ld.shared.u32 r510, [r1434+1024]; +ld.shared.u32 r513, [r1434+1028]; +ld.shared.u32 r472, [r1434+2048]; +ld.shared.u32 r475, [r1434+2052]; +ld.shared.u32 r522, [r1434+3072]; +ld.shared.u32 r525, [r1434+3076]; +ld.shared.u32 r461, [r1434+4096]; +ld.shared.u32 r464, [r1434+4100]; +ld.shared.u32 r511, [r1434+5120]; +ld.shared.u32 r514, [r1434+5124]; +ld.shared.u32 r473, [r1434+6144]; +ld.shared.u32 r476, [r1434+6148]; +ld.shared.u32 r523, [r1434+7168]; +ld.shared.u32 r526, [r1434+7172]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} 
+{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1435, r1427, 120; +bfe.u32 r1436, r1427, 3, 4; +cvt.rn.f32.u32 f142, 
r1436; +mul.f32 f143, f142, 0f3D490FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1437, r1427, 3; +and.b32 r1438, r1437, 56; +add.s32 r1439, r1431, r1438; +barrier.sync 0; +and.b32 r1440, r1429, 7680; +add.s32 r1441, r1439, r1440; +st.shared.u32 [r1441], r607; +st.shared.u32 [r1441+4], r610; +st.shared.u32 [r1441+64], r667; +st.shared.u32 [r1441+68], r674; +st.shared.u32 [r1441+128], r704; +st.shared.u32 [r1441+132], r711; +st.shared.u32 [r1441+192], r741; +st.shared.u32 [r1441+196], 
r748; +st.shared.u32 [r1441+256], r778; +st.shared.u32 [r1441+260], r785; +st.shared.u32 [r1441+320], r815; +st.shared.u32 [r1441+324], r822; +st.shared.u32 [r1441+384], r852; +st.shared.u32 [r1441+388], r859; +st.shared.u32 [r1441+448], r889; +st.shared.u32 [r1441+452], r896; +barrier.sync 0; +mad.lo.s32 r1442, r1435, -56, r1441; +ld.shared.u32 r918, [r1442]; +ld.shared.u32 r921, [r1442+4]; +ld.shared.u32 r968, [r1442+1024]; +ld.shared.u32 r971, [r1442+1028]; +ld.shared.u32 r930, [r1442+2048]; +ld.shared.u32 r933, [r1442+2052]; +ld.shared.u32 r980, [r1442+3072]; +ld.shared.u32 r983, [r1442+3076]; +ld.shared.u32 r919, [r1442+4096]; +ld.shared.u32 r922, [r1442+4100]; +ld.shared.u32 r969, [r1442+5120]; +ld.shared.u32 r972, [r1442+5124]; +ld.shared.u32 r931, [r1442+6144]; +ld.shared.u32 r934, [r1442+6148]; +ld.shared.u32 r981, [r1442+7168]; +ld.shared.u32 r984, [r1442+7172]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ 
+sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1443, r1427, 64; +bfe.u32 r1444, r1427, 6, 1; +cvt.rn.f32.u32 f145, r1444; +mul.f32 f146, f145, 0f3EC90FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, 
{high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 
r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +and.b32 r1445, r1437, 504; +add.s32 r1446, r1431, r1445; +barrier.sync 0; +and.b32 r1447, r1429, 4096; +add.s32 r1448, r1446, r1447; +st.shared.u32 [r1448], r1065; +st.shared.u32 [r1448+4], r1068; +st.shared.u32 [r1448+512], r1125; +st.shared.u32 [r1448+516], r1132; +st.shared.u32 [r1448+1024], r1162; +st.shared.u32 [r1448+1028], r1169; +st.shared.u32 [r1448+1536], r1199; +st.shared.u32 [r1448+1540], r1206; +st.shared.u32 [r1448+2048], r1236; +st.shared.u32 [r1448+2052], r1243; +st.shared.u32 [r1448+2560], r1273; +st.shared.u32 [r1448+2564], r1280; 
+st.shared.u32 [r1448+3072], r1310; +st.shared.u32 [r1448+3076], r1317; +st.shared.u32 [r1448+3584], r1347; +st.shared.u32 [r1448+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1449, r1443, -56, r1448; +ld.shared.u32 r1376, [r1449]; +ld.shared.u32 r1379, [r1449+4]; +ld.shared.u32 r1388, [r1449+1024]; +ld.shared.u32 r1391, [r1449+1028]; +ld.shared.u32 r1400, [r1449+2048]; +ld.shared.u32 r1403, [r1449+2052]; +ld.shared.u32 r1412, [r1449+3072]; +ld.shared.u32 r1415, [r1449+3076]; +ld.shared.u32 r1377, [r1449+4096]; +ld.shared.u32 r1380, [r1449+4100]; +ld.shared.u32 r1389, [r1449+5120]; +ld.shared.u32 r1392, [r1449+5124]; +ld.shared.u32 r1401, [r1449+6144]; +ld.shared.u32 r1404, [r1449+6148]; +ld.shared.u32 r1413, [r1449+7168]; +ld.shared.u32 r1416, [r1449+7172]; +{ +add.f16x2 %0, r1376, r1377; +} +{ +add.f16x2 %1, r1379, r1380; +} +{ +sub.f16x2 %8, r1376, r1377; +} +{ +sub.f16x2 %9, r1379, r1380; +} +{ +add.f16x2 %2, r1388, r1389; +} +{ +add.f16x2 %3, r1391, r1392; +} +{ +sub.f16x2 %10, r1388, r1389; +} +{ +sub.f16x2 %11, r1391, r1392; +} +{ +add.f16x2 %4, r1400, r1401; +} +{ +add.f16x2 %5, r1403, r1404; +} +{ +sub.f16x2 %12, r1400, r1401; +} +{ +sub.f16x2 %13, r1403, r1404; +} +{ +add.f16x2 %6, r1412, r1413; +} +{ +add.f16x2 %7, r1415, r1416; +} +{ +sub.f16x2 %14, r1412, r1413; +} +{ +sub.f16x2 %15, r1415, r1416; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<832, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2569>; +.reg .b64 rd<2>; +mov.u32 r2549, %tid.y; +shl.b32 r2550, r2549, 13; +mov.u32 r2551, %32; +add.s32 r2552, r2551, r2550; +mov.u32 r2553, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, {low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ 
+add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 
0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, 
r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2554, r2553, 63; +shl.b32 r2555, r2553, 7; +and.b32 r2556, r2555, -8192; +add.s32 r2557, r2552, r2556; +cvt.rn.f32.u32 f301, r2554; +mul.f32 f302, f301, 0f3BC90FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; 
+mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, 
r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, 
f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; 
+mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ 
+fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2558, r2555, 8064; +add.s32 r2559, r2557, r2558; +st.shared.v4.f32 [r2559], {r521, r524, r629, r636}; +st.shared.v4.f32 [r2559+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r2559+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r2559+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r2559+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r2559+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r2559+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r2559+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r2560, r2554, -120, r2559; +ld.shared.u32 r1176, [r2560]; +ld.shared.u32 r1179, [r2560+4]; +ld.shared.u32 r1372, [r2560+512]; +ld.shared.u32 r1375, [r2560+516]; +ld.shared.u32 r1226, [r2560+1024]; +ld.shared.u32 r1229, [r2560+1028]; +ld.shared.u32 r1422, [r2560+1536]; +ld.shared.u32 r1425, [r2560+1540]; +ld.shared.u32 r1188, [r2560+2048]; +ld.shared.u32 r1191, [r2560+2052]; +ld.shared.u32 r1384, [r2560+2560]; +ld.shared.u32 r1387, [r2560+2564]; +ld.shared.u32 r1238, [r2560+3072]; +ld.shared.u32 r1241, [r2560+3076]; +ld.shared.u32 r1434, [r2560+3584]; +ld.shared.u32 r1437, [r2560+3588]; +ld.shared.u32 r1177, [r2560+4096]; +ld.shared.u32 r1180, [r2560+4100]; +ld.shared.u32 r1373, [r2560+4608]; +ld.shared.u32 r1376, [r2560+4612]; +ld.shared.u32 r1227, [r2560+5120]; +ld.shared.u32 r1230, [r2560+5124]; +ld.shared.u32 r1423, [r2560+5632]; +ld.shared.u32 r1426, [r2560+5636]; +ld.shared.u32 r1189, [r2560+6144]; +ld.shared.u32 r1192, [r2560+6148]; +ld.shared.u32 r1385, [r2560+6656]; +ld.shared.u32 r1388, [r2560+6660]; +ld.shared.u32 r1239, [r2560+7168]; +ld.shared.u32 r1242, [r2560+7172]; +ld.shared.u32 r1435, [r2560+7680]; +ld.shared.u32 r1438, [r2560+7684]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; 
+} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, 
r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 
high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} 
+{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2561, r2553, 48; +bfe.u32 r2562, r2553, 4, 2; +cvt.rn.f32.u32 f304, r2562; +mul.f32 f305, f304, 0f3DC90FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, 
r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; 
+} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 
r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +shl.b32 r2563, r2553, 3; +and.b32 
r2564, r2563, 120; +add.s32 r2565, r2557, r2564; +barrier.sync 0; +and.b32 r2566, r2555, 6144; +add.s32 r2567, r2565, r2566; +st.shared.u32 [r2567], r1695; +st.shared.u32 [r2567+4], r1698; +st.shared.u32 [r2567+128], r1803; +st.shared.u32 [r2567+132], r1810; +st.shared.u32 [r2567+256], r1840; +st.shared.u32 [r2567+260], r1847; +st.shared.u32 [r2567+384], r1877; +st.shared.u32 [r2567+388], r1884; +st.shared.u32 [r2567+512], r1914; +st.shared.u32 [r2567+516], r1921; +st.shared.u32 [r2567+640], r1951; +st.shared.u32 [r2567+644], r1958; +st.shared.u32 [r2567+768], r1988; +st.shared.u32 [r2567+772], r1995; +st.shared.u32 [r2567+896], r2025; +st.shared.u32 [r2567+900], r2032; +st.shared.u32 [r2567+1024], r2062; +st.shared.u32 [r2567+1028], r2069; +st.shared.u32 [r2567+1152], r2099; +st.shared.u32 [r2567+1156], r2106; +st.shared.u32 [r2567+1280], r2136; +st.shared.u32 [r2567+1284], r2143; +st.shared.u32 [r2567+1408], r2173; +st.shared.u32 [r2567+1412], r2180; +st.shared.u32 [r2567+1536], r2210; +st.shared.u32 [r2567+1540], r2217; +st.shared.u32 [r2567+1664], r2247; +st.shared.u32 [r2567+1668], r2254; +st.shared.u32 [r2567+1792], r2284; +st.shared.u32 [r2567+1796], r2291; +st.shared.u32 [r2567+1920], r2321; +st.shared.u32 [r2567+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2568, r2561, -120, r2567; +ld.shared.u32 r2350, [r2568]; +ld.shared.u32 r2353, [r2568+4]; +ld.shared.u32 r2400, [r2568+512]; +ld.shared.u32 r2403, [r2568+516]; +ld.shared.u32 r2450, [r2568+1024]; +ld.shared.u32 r2453, [r2568+1028]; +ld.shared.u32 r2500, [r2568+1536]; +ld.shared.u32 r2503, [r2568+1540]; +ld.shared.u32 r2362, [r2568+2048]; +ld.shared.u32 r2365, [r2568+2052]; +ld.shared.u32 r2412, [r2568+2560]; +ld.shared.u32 r2415, [r2568+2564]; +ld.shared.u32 r2462, [r2568+3072]; +ld.shared.u32 r2465, [r2568+3076]; +ld.shared.u32 r2512, [r2568+3584]; +ld.shared.u32 r2515, [r2568+3588]; +ld.shared.u32 r2351, [r2568+4096]; +ld.shared.u32 r2354, [r2568+4100]; +ld.shared.u32 r2401, [r2568+4608]; +ld.shared.u32 
r2404, [r2568+4612]; +ld.shared.u32 r2451, [r2568+5120]; +ld.shared.u32 r2454, [r2568+5124]; +ld.shared.u32 r2501, [r2568+5632]; +ld.shared.u32 r2504, [r2568+5636]; +ld.shared.u32 r2363, [r2568+6144]; +ld.shared.u32 r2366, [r2568+6148]; +ld.shared.u32 r2413, [r2568+6656]; +ld.shared.u32 r2416, [r2568+6660]; +ld.shared.u32 r2463, [r2568+7168]; +ld.shared.u32 r2466, [r2568+7172]; +ld.shared.u32 r2513, [r2568+7680]; +ld.shared.u32 r2516, [r2568+7684]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 %0, r2349, r2361; +} +{ +add.f16x2 %1, r2352, r2364; +} +{ +sub.f16x2 %16, r2349, r2361; +} +{ +sub.f16x2 %17, r2352, r2364; +} +{ +add.f16x2 %8, r2355, r2370; +} +{ +add.f16x2 %9, r2358, r2373; +} +{ +sub.f16x2 %24, r2355, r2370; +} +{ +sub.f16x2 %25, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 %2, r2399, r2411; +} +{ +add.f16x2 %3, r2402, r2414; +} +{ +sub.f16x2 %18, r2399, r2411; +} +{ +sub.f16x2 %19, r2402, r2414; +} +{ +add.f16x2 %10, r2405, r2420; +} +{ +add.f16x2 %11, r2408, r2423; +} +{ +sub.f16x2 %26, r2405, r2420; +} +{ +sub.f16x2 %27, r2408, r2423; +} +{ +add.f16x2 r2449, r2450, r2451; +} +{ +add.f16x2 r2452, r2453, r2454; +} +{ +sub.f16x2 r2455, r2450, r2451; +} +{ +sub.f16x2 r2458, r2453, r2454; +} +{ +add.f16x2 r2461, r2462, r2463; +} +{ +add.f16x2 r2464, r2465, r2466; +} +{ +sub.f16x2 r2467, r2462, r2463; +} +{ +sub.f16x2 r2470, r2465, r2466; +} +{ 
+neg.f16x2 r2473, r2467; +} +{ +add.f16x2 %4, r2449, r2461; +} +{ +add.f16x2 %5, r2452, r2464; +} +{ +sub.f16x2 %20, r2449, r2461; +} +{ +sub.f16x2 %21, r2452, r2464; +} +{ +add.f16x2 %12, r2455, r2470; +} +{ +add.f16x2 %13, r2458, r2473; +} +{ +sub.f16x2 %28, r2455, r2470; +} +{ +sub.f16x2 %29, r2458, r2473; +} +{ +add.f16x2 r2499, r2500, r2501; +} +{ +add.f16x2 r2502, r2503, r2504; +} +{ +sub.f16x2 r2505, r2500, r2501; +} +{ +sub.f16x2 r2508, r2503, r2504; +} +{ +add.f16x2 r2511, r2512, r2513; +} +{ +add.f16x2 r2514, r2515, r2516; +} +{ +sub.f16x2 r2517, r2512, r2513; +} +{ +sub.f16x2 r2520, r2515, r2516; +} +{ +neg.f16x2 r2523, r2517; +} +{ +add.f16x2 %6, r2499, r2511; +} +{ +add.f16x2 %7, r2502, r2514; +} +{ +sub.f16x2 %22, r2499, r2511; +} +{ +sub.f16x2 %23, r2502, r2514; +} +{ +add.f16x2 %14, r2505, r2520; +} +{ +add.f16x2 %15, r2508, r2523; +} +{ +sub.f16x2 %30, r2505, r2520; +} +{ +sub.f16x2 %31, r2508, r2523; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<833, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<57>; +.reg .b32 r<749>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r29; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r32; +} +{ +add.f16x2 r52, r20, r35; +} +{ +sub.f16x2 r55, r17, r32; +} +{ +sub.f16x2 r58, r20, r35; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 255; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r10, 255; +mov.f32 f56, 0f3BC90F88; +@p1 bra LBB6_2; +cos.approx.f32 f56, f1; +LBB6_2: 
+sin.approx.f32 f46, f1; +neg.f32 f7, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f7; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +neg.f16x2 r71, r68; +} +{ +fma.rn.f16x2 r73, r49, r64, r71; +} +{ +mul.f16x2 r77, r49, r66; +} +{ +fma.rn.f16x2 r80, r52, r64, r77; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f42, 0fBF800000; +mov.f32 f43, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +neg.f16x2 r108, r105; +} +{ +fma.rn.f16x2 r110, r43, r101, r108; +} +{ +mul.f16x2 r114, r43, r103; +} +{ +fma.rn.f16x2 r117, r46, r101, r114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 
r142, r58, r140; +} +{ +neg.f16x2 r145, r142; +} +{ +fma.rn.f16x2 r147, r55, r138, r145; +} +{ +mul.f16x2 r151, r55, r140; +} +{ +fma.rn.f16x2 r154, r58, r138, r151; +} +barrier.sync 0; +mov.u32 r717, %tid.y; +shl.b32 r718, r717, 13; +mov.u32 r719, %8; +add.s32 r720, r719, r718; +shl.b32 r721, r9, 5; +and.b32 r722, r721, -8192; +add.s32 r723, r720, r722; +shl.b32 r724, r10, 5; +add.s32 r725, r723, r724; +st.shared.v4.f32 [r725], {r37, r40, r73, r80}; +st.shared.v4.f32 [r725+16], {r110, r117, r147, r154}; +barrier.sync 0; +mad.lo.s32 r726, r10, -24, r725; +ld.shared.u32 r176, [r726]; +ld.shared.u32 r179, [r726+4]; +ld.shared.u32 r188, [r726+2048]; +ld.shared.u32 r191, [r726+2052]; +ld.shared.u32 r177, [r726+4096]; +ld.shared.u32 r180, [r726+4100]; +ld.shared.u32 r189, [r726+6144]; +ld.shared.u32 r192, [r726+6148]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r193; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r196; +} +{ +add.f16x2 r216, r184, r199; +} +{ +sub.f16x2 r219, r181, r196; +} +{ +sub.f16x2 r222, r184, r199; +} +and.b32 r727, r9, 252; +bfe.u32 r728, r9, 2, 6; +cvt.rn.f32.u32 f47, r728; +mul.f32 f48, f47, 0f3CC90FDB; +cos.approx.f32 f16, f48; +sin.approx.f32 f49, f48; +neg.f32 f17, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f17; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +neg.f16x2 r235, r232; +} +{ +fma.rn.f16x2 r237, r213, r228, r235; +} +{ +mul.f16x2 r241, 
r213, r230; +} +{ +fma.rn.f16x2 r244, r216, r228, r241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +neg.f16x2 r272, r269; +} +{ +fma.rn.f16x2 r274, r207, r265, r272; +} +{ +mul.f16x2 r278, r207, r267; +} +{ +fma.rn.f16x2 r281, r210, r265, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +neg.f16x2 r309, r306; +} +{ +fma.rn.f16x2 r311, r219, r302, r309; +} +{ +mul.f16x2 r315, r219, r304; +} +{ +fma.rn.f16x2 r318, r222, r302, r315; +} +shl.b32 r729, r9, 3; +and.b32 r730, r729, 24; +add.s32 r731, r723, r730; +barrier.sync 0; +and.b32 r732, r721, 8064; +add.s32 r733, r731, r732; +st.shared.u32 [r733], r201; +st.shared.u32 [r733+4], r204; +st.shared.u32 [r733+32], r237; +st.shared.u32 [r733+36], r244; 
+st.shared.u32 [r733+64], r274; +st.shared.u32 [r733+68], r281; +st.shared.u32 [r733+96], r311; +st.shared.u32 [r733+100], r318; +barrier.sync 0; +mad.lo.s32 r734, r727, -24, r733; +ld.shared.u32 r340, [r734]; +ld.shared.u32 r343, [r734+4]; +ld.shared.u32 r352, [r734+2048]; +ld.shared.u32 r355, [r734+2052]; +ld.shared.u32 r341, [r734+4096]; +ld.shared.u32 r344, [r734+4100]; +ld.shared.u32 r353, [r734+6144]; +ld.shared.u32 r356, [r734+6148]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r357; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r360; +} +{ +add.f16x2 r380, r348, r363; +} +{ +sub.f16x2 r383, r345, r360; +} +{ +sub.f16x2 r386, r348, r363; +} +and.b32 r735, r9, 240; +bfe.u32 r736, r9, 4, 4; +cvt.rn.f32.u32 f50, r736; +mul.f32 f51, f50, 0f3DC90FDB; +cos.approx.f32 f26, f51; +sin.approx.f32 f52, f51; +neg.f32 f27, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f27; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r377, r392, r399; +} +{ +mul.f16x2 r405, r377, r394; +} +{ +fma.rn.f16x2 r408, r380, r392, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ 
+mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r371, r429, r436; +} +{ +mul.f16x2 r442, r371, r431; +} +{ +fma.rn.f16x2 r445, r374, r429, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r383, r466, r473; +} +{ +mul.f16x2 r479, r383, r468; +} +{ +fma.rn.f16x2 r482, r386, r466, r479; +} +and.b32 r737, r729, 120; +add.s32 r738, r723, r737; +barrier.sync 0; +and.b32 r739, r721, 7680; +add.s32 r740, r738, r739; +st.shared.u32 [r740], r365; +st.shared.u32 [r740+4], r368; +st.shared.u32 [r740+128], r401; +st.shared.u32 [r740+132], r408; +st.shared.u32 [r740+256], r438; +st.shared.u32 [r740+260], r445; +st.shared.u32 [r740+384], r475; +st.shared.u32 [r740+388], r482; +barrier.sync 0; +mad.lo.s32 r741, r735, -24, r740; +ld.shared.u32 r504, [r741]; +ld.shared.u32 r507, [r741+4]; +ld.shared.u32 r516, [r741+2048]; +ld.shared.u32 r519, [r741+2052]; +ld.shared.u32 r505, [r741+4096]; +ld.shared.u32 r508, [r741+4100]; +ld.shared.u32 r517, 
[r741+6144]; +ld.shared.u32 r520, [r741+6148]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r521; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r524; +} +{ +add.f16x2 r544, r512, r527; +} +{ +sub.f16x2 r547, r509, r524; +} +{ +sub.f16x2 r550, r512, r527; +} +and.b32 r742, r9, 192; +bfe.u32 r743, r9, 6, 2; +cvt.rn.f32.u32 f53, r743; +mul.f32 f54, f53, 0f3EC90FDB; +cos.approx.f32 f36, f54; +sin.approx.f32 f55, f54; +neg.f32 f37, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +neg.f16x2 r563, r560; +} +{ +fma.rn.f16x2 r565, r541, r556, r563; +} +{ +mul.f16x2 r569, r541, r558; +} +{ +fma.rn.f16x2 r572, r544, r556, r569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +neg.f16x2 r600, r597; +} +{ 
+fma.rn.f16x2 r602, r535, r593, r600; +} +{ +mul.f16x2 r606, r535, r595; +} +{ +fma.rn.f16x2 r609, r538, r593, r606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +neg.f16x2 r637, r634; +} +{ +fma.rn.f16x2 r639, r547, r630, r637; +} +{ +mul.f16x2 r643, r547, r632; +} +{ +fma.rn.f16x2 r646, r550, r630, r643; +} +and.b32 r744, r729, 504; +add.s32 r745, r723, r744; +barrier.sync 0; +and.b32 r746, r721, 6144; +add.s32 r747, r745, r746; +st.shared.u32 [r747], r529; +st.shared.u32 [r747+4], r532; +st.shared.u32 [r747+512], r565; +st.shared.u32 [r747+516], r572; +st.shared.u32 [r747+1024], r602; +st.shared.u32 [r747+1028], r609; +st.shared.u32 [r747+1536], r639; +st.shared.u32 [r747+1540], r646; +barrier.sync 0; +mad.lo.s32 r748, r742, -24, r747; +ld.shared.u32 r668, [r748]; +ld.shared.u32 r671, [r748+4]; +ld.shared.u32 r680, [r748+2048]; +ld.shared.u32 r683, [r748+2052]; +ld.shared.u32 r669, [r748+4096]; +ld.shared.u32 r672, [r748+4100]; +ld.shared.u32 r681, [r748+6144]; +ld.shared.u32 r684, [r748+6148]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 %0, r667, r679; +} +{ 
+add.f16x2 %1, r670, r682; +} +{ +sub.f16x2 %4, r667, r679; +} +{ +sub.f16x2 %5, r670, r682; +} +{ +add.f16x2 %2, r673, r688; +} +{ +add.f16x2 %3, r676, r691; +} +{ +sub.f16x2 %6, r673, r688; +} +{ +sub.f16x2 %7, r676, r691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<834, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<57>; +.reg .b32 r<749>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r29; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r32; +} +{ +add.f16x2 r52, r20, r35; +} +{ +sub.f16x2 r55, r17, r32; +} +{ +sub.f16x2 r58, r20, r35; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 255; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r10, 255; +mov.f32 f56, 0f3BC90F88; +@p1 bra LBB7_2; +cos.approx.f32 f56, f1; +LBB7_2: +mov.u32 r717, %tid.y; +shl.b32 r718, r717, 12; +mov.u32 r719, %8; +add.s32 r720, r719, r718; +shl.b32 r721, r9, 4; +and.b32 r722, r721, -4096; +add.s32 r723, r720, r722; +sin.approx.f32 f46, f1; +neg.f32 f7, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f56; +cvt.rn.f16.f32 high, f7; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +neg.f16x2 r71, r68; +} +{ +fma.rn.f16x2 r73, r49, r64, r71; +} +{ +mul.f16x2 r77, r49, r66; +} +{ +fma.rn.f16x2 r80, r52, r64, r77; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f42, 0fBF800000; +mov.f32 f43, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +neg.f16x2 r108, r105; +} +{ +fma.rn.f16x2 r110, r43, r101, r108; +} +{ +mul.f16x2 r114, r43, r103; +} +{ +fma.rn.f16x2 r117, r46, r101, r114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +neg.f16x2 r145, r142; +} +{ +fma.rn.f16x2 r147, r55, r138, 
r145; +} +{ +mul.f16x2 r151, r55, r140; +} +{ +fma.rn.f16x2 r154, r58, r138, r151; +} +barrier.sync 0; +shl.b32 r724, r10, 4; +add.s32 r725, r723, r724; +st.shared.v4.f32 [r725], {r37, r73, r110, r147}; +barrier.sync 0; +mad.lo.s32 r726, r10, -12, r725; +ld.shared.u32 r176, [r726]; +ld.shared.u32 r188, [r726+1024]; +ld.shared.u32 r177, [r726+2048]; +ld.shared.u32 r189, [r726+3072]; +barrier.sync 0; +st.shared.v4.f32 [r725], {r40, r80, r117, r154}; +barrier.sync 0; +ld.shared.u32 r179, [r726]; +ld.shared.u32 r191, [r726+1024]; +ld.shared.u32 r180, [r726+2048]; +ld.shared.u32 r192, [r726+3072]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r193; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r196; +} +{ +add.f16x2 r216, r184, r199; +} +{ +sub.f16x2 r219, r181, r196; +} +{ +sub.f16x2 r222, r184, r199; +} +and.b32 r727, r9, 252; +bfe.u32 r728, r9, 2, 6; +shl.b32 r729, r9, 2; +and.b32 r730, r729, 12; +add.s32 r731, r723, r730; +cvt.rn.f32.u32 f47, r728; +mul.f32 f48, f47, 0f3CC90FDB; +cos.approx.f32 f16, f48; +sin.approx.f32 f49, f48; +neg.f32 f17, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f17; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +neg.f16x2 r235, r232; +} +{ +fma.rn.f16x2 r237, r213, r228, r235; +} +{ +mul.f16x2 r241, r213, r230; +} +{ +fma.rn.f16x2 r244, r216, r228, r241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +neg.f16x2 r272, r269; +} +{ +fma.rn.f16x2 r274, r207, r265, r272; +} +{ +mul.f16x2 r278, r207, r267; +} +{ +fma.rn.f16x2 r281, r210, r265, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +neg.f16x2 r309, r306; +} +{ +fma.rn.f16x2 r311, r219, r302, r309; +} +{ +mul.f16x2 r315, r219, r304; +} +{ +fma.rn.f16x2 r318, r222, r302, r315; +} +barrier.sync 0; +and.b32 r732, r721, 4032; +add.s32 r733, r731, r732; +st.shared.u32 [r733], r201; +st.shared.u32 [r733+16], r237; +st.shared.u32 [r733+32], r274; +st.shared.u32 [r733+48], r311; +barrier.sync 0; +mad.lo.s32 r734, r727, -12, r733; +ld.shared.u32 r340, [r734]; +ld.shared.u32 r352, [r734+1024]; +ld.shared.u32 r341, [r734+2048]; +ld.shared.u32 r353, [r734+3072]; +barrier.sync 0; +st.shared.u32 [r733], r204; 
+st.shared.u32 [r733+16], r244; +st.shared.u32 [r733+32], r281; +st.shared.u32 [r733+48], r318; +barrier.sync 0; +ld.shared.u32 r343, [r734]; +ld.shared.u32 r355, [r734+1024]; +ld.shared.u32 r344, [r734+2048]; +ld.shared.u32 r356, [r734+3072]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r357; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r360; +} +{ +add.f16x2 r380, r348, r363; +} +{ +sub.f16x2 r383, r345, r360; +} +{ +sub.f16x2 r386, r348, r363; +} +and.b32 r735, r9, 240; +bfe.u32 r736, r9, 4, 4; +and.b32 r737, r729, 60; +add.s32 r738, r723, r737; +cvt.rn.f32.u32 f50, r736; +mul.f32 f51, f50, 0f3DC90FDB; +cos.approx.f32 f26, f51; +sin.approx.f32 f52, f51; +neg.f32 f27, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f27; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r377, r392, r399; +} +{ +mul.f16x2 r405, r377, r394; +} +{ +fma.rn.f16x2 r408, r380, r392, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r371, r429, r436; +} +{ +mul.f16x2 r442, r371, r431; +} +{ +fma.rn.f16x2 r445, r374, r429, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r383, r466, r473; +} +{ +mul.f16x2 r479, r383, r468; +} +{ +fma.rn.f16x2 r482, r386, r466, r479; +} +barrier.sync 0; +and.b32 r739, r721, 3840; +add.s32 r740, r738, r739; +st.shared.u32 [r740], r365; +st.shared.u32 [r740+64], r401; +st.shared.u32 [r740+128], r438; +st.shared.u32 [r740+192], r475; +barrier.sync 0; +mad.lo.s32 r741, r735, -12, r740; +ld.shared.u32 r504, [r741]; +ld.shared.u32 r516, [r741+1024]; +ld.shared.u32 r505, [r741+2048]; +ld.shared.u32 r517, [r741+3072]; +barrier.sync 0; +st.shared.u32 [r740], r368; +st.shared.u32 [r740+64], r408; +st.shared.u32 [r740+128], r445; +st.shared.u32 [r740+192], r482; +barrier.sync 0; +ld.shared.u32 r507, [r741]; +ld.shared.u32 r519, [r741+1024]; +ld.shared.u32 r508, [r741+2048]; +ld.shared.u32 r520, [r741+3072]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; 
+} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r521; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r524; +} +{ +add.f16x2 r544, r512, r527; +} +{ +sub.f16x2 r547, r509, r524; +} +{ +sub.f16x2 r550, r512, r527; +} +and.b32 r742, r9, 192; +bfe.u32 r743, r9, 6, 2; +and.b32 r744, r729, 252; +add.s32 r745, r723, r744; +cvt.rn.f32.u32 f53, r743; +mul.f32 f54, f53, 0f3EC90FDB; +cos.approx.f32 f36, f54; +sin.approx.f32 f55, f54; +neg.f32 f37, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +neg.f16x2 r563, r560; +} +{ +fma.rn.f16x2 r565, r541, r556, r563; +} +{ +mul.f16x2 r569, r541, r558; +} +{ +fma.rn.f16x2 r572, r544, r556, r569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +neg.f16x2 r600, r597; +} +{ +fma.rn.f16x2 r602, r535, r593, r600; +} +{ +mul.f16x2 r606, r535, r595; +} +{ +fma.rn.f16x2 r609, r538, r593, r606; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +neg.f16x2 r637, r634; +} +{ +fma.rn.f16x2 r639, r547, r630, r637; +} +{ +mul.f16x2 r643, r547, r632; +} +{ +fma.rn.f16x2 r646, r550, r630, r643; +} +barrier.sync 0; +and.b32 r746, r721, 3072; +add.s32 r747, r745, r746; +st.shared.u32 [r747], r529; +st.shared.u32 [r747+256], r565; +st.shared.u32 [r747+512], r602; +st.shared.u32 [r747+768], r639; +barrier.sync 0; +mad.lo.s32 r748, r742, -12, r747; +ld.shared.u32 r668, [r748]; +ld.shared.u32 r680, [r748+1024]; +ld.shared.u32 r669, [r748+2048]; +ld.shared.u32 r681, [r748+3072]; +barrier.sync 0; +st.shared.u32 [r747], r532; +st.shared.u32 [r747+256], r572; +st.shared.u32 [r747+512], r609; +st.shared.u32 [r747+768], r646; +barrier.sync 0; +ld.shared.u32 r671, [r748]; +ld.shared.u32 r683, [r748+1024]; +ld.shared.u32 r672, [r748+2048]; +ld.shared.u32 r684, [r748+3072]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 %0, r667, r679; +} +{ +add.f16x2 %1, r670, r682; +} +{ +sub.f16x2 %4, r667, r679; +} +{ +sub.f16x2 %5, r670, r682; +} +{ +add.f16x2 %2, r673, r688; +} +{ +add.f16x2 %3, r676, r691; +} +{ 
+sub.f16x2 %6, r673, r688; +} +{ +sub.f16x2 %7, r676, r691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<835, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<86>; +.reg .b32 r<555>; +.reg .b64 rd<2>; +{ +add.f16x2 r7, %5, %7; +} +{ +add.f16x2 r10, %6, %8; +} +{ +sub.f16x2 r13, %5, %7; +} +{ +sub.f16x2 r16, %6, %8; +} +mov.u32 r5, %tid.x; +and.b32 r6, r5, 511; +cvt.rn.f32.u32 f5, r6; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r6, 255; +mov.f32 f85, 0f3BC90F88; +@p1 bra LBB8_2; +cos.approx.f32 f85, f1; +LBB8_2: +sin.approx.f32 f60, f1; +neg.f32 f7, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f7; +mov.b32 r19, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r22, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r24, {high, high}; +} +{ +mul.f16x2 r26, r16, r24; +} +{ +neg.f16x2 r29, r26; +} +{ +fma.rn.f16x2 r31, r13, r22, r29; +} +{ +mul.f16x2 r35, r13, r24; +} +{ +fma.rn.f16x2 r38, r16, r22, r35; +} +barrier.sync 0; +mov.u32 r487, %tid.y; +shl.b32 r488, r487, 13; +mov.u32 r489, %4; +add.s32 r490, r489, r488; +shl.b32 r491, r5, 4; +and.b32 r492, r491, -8192; +add.s32 r493, r490, r492; +shl.b32 r494, r6, 4; +add.s32 r495, r493, r494; +st.shared.v2.f32 [r495], {r7, r10}; +st.shared.v2.f32 [r495+8], {r31, r38}; +barrier.sync 0; +shl.b32 r496, r6, 3; 
+sub.s32 r497, r495, r496; +ld.shared.u32 r60, [r497]; +ld.shared.u32 r63, [r497+4]; +ld.shared.u32 r61, [r497+4096]; +ld.shared.u32 r64, [r497+4100]; +{ +add.f16x2 r59, r60, r61; +} +{ +add.f16x2 r62, r63, r64; +} +{ +sub.f16x2 r65, r60, r61; +} +{ +sub.f16x2 r68, r63, r64; +} +bfe.u32 r498, r5, 1, 8; +cvt.rn.f32.u32 f61, r498; +mul.f32 f62, f61, 0f3C490FDB; +cos.approx.f32 f12, f62; +sin.approx.f32 f63, f62; +neg.f32 f13, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f13; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r68, r76; +} +{ +neg.f16x2 r81, r78; +} +{ +fma.rn.f16x2 r83, r65, r74, r81; +} +{ +mul.f16x2 r87, r65, r76; +} +{ +fma.rn.f16x2 r90, r68, r74, r87; +} +shl.b32 r499, r5, 3; +and.b32 r500, r499, 8; +add.s32 r501, r493, r500; +barrier.sync 0; +and.b32 r502, r491, 8160; +add.s32 r503, r501, r502; +st.shared.u32 [r503], r59; +st.shared.u32 [r503+4], r62; +st.shared.u32 [r503+16], r83; +st.shared.u32 [r503+20], r90; +barrier.sync 0; +and.b32 r504, r499, 4080; +sub.s32 r505, r503, r504; +ld.shared.u32 r112, [r505]; +ld.shared.u32 r115, [r505+4]; +ld.shared.u32 r113, [r505+4096]; +ld.shared.u32 r116, [r505+4100]; +{ +add.f16x2 r111, r112, r113; +} +{ +add.f16x2 r114, r115, r116; +} +{ +sub.f16x2 r117, r112, r113; +} +{ +sub.f16x2 r120, r115, r116; +} +bfe.u32 r506, r5, 2, 7; +cvt.rn.f32.u32 f64, r506; +mul.f32 f65, f64, 0f3CC90FDB; +cos.approx.f32 f18, f65; +sin.approx.f32 f66, f65; +neg.f32 f19, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f19; +mov.b32 r123, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r128, {high, high}; +} +{ +mul.f16x2 r130, r120, r128; +} +{ +neg.f16x2 r133, r130; +} +{ +fma.rn.f16x2 r135, 
r117, r126, r133; +} +{ +mul.f16x2 r139, r117, r128; +} +{ +fma.rn.f16x2 r142, r120, r126, r139; +} +and.b32 r507, r499, 24; +add.s32 r508, r493, r507; +barrier.sync 0; +and.b32 r509, r491, 8128; +add.s32 r510, r508, r509; +st.shared.u32 [r510], r111; +st.shared.u32 [r510+4], r114; +st.shared.u32 [r510+32], r135; +st.shared.u32 [r510+36], r142; +barrier.sync 0; +and.b32 r511, r499, 4064; +sub.s32 r512, r510, r511; +ld.shared.u32 r164, [r512]; +ld.shared.u32 r167, [r512+4]; +ld.shared.u32 r165, [r512+4096]; +ld.shared.u32 r168, [r512+4100]; +{ +add.f16x2 r163, r164, r165; +} +{ +add.f16x2 r166, r167, r168; +} +{ +sub.f16x2 r169, r164, r165; +} +{ +sub.f16x2 r172, r167, r168; +} +bfe.u32 r513, r5, 3, 6; +cvt.rn.f32.u32 f67, r513; +mul.f32 f68, f67, 0f3D490FDB; +cos.approx.f32 f24, f68; +sin.approx.f32 f69, f68; +neg.f32 f25, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f24; +cvt.rn.f16.f32 high, f25; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r178, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r180, {high, high}; +} +{ +mul.f16x2 r182, r172, r180; +} +{ +neg.f16x2 r185, r182; +} +{ +fma.rn.f16x2 r187, r169, r178, r185; +} +{ +mul.f16x2 r191, r169, r180; +} +{ +fma.rn.f16x2 r194, r172, r178, r191; +} +and.b32 r514, r499, 56; +add.s32 r515, r493, r514; +barrier.sync 0; +and.b32 r516, r491, 8064; +add.s32 r517, r515, r516; +st.shared.u32 [r517], r163; +st.shared.u32 [r517+4], r166; +st.shared.u32 [r517+64], r187; +st.shared.u32 [r517+68], r194; +barrier.sync 0; +and.b32 r518, r499, 4032; +sub.s32 r519, r517, r518; +ld.shared.u32 r216, [r519]; +ld.shared.u32 r219, [r519+4]; +ld.shared.u32 r217, [r519+4096]; +ld.shared.u32 r220, [r519+4100]; +{ +add.f16x2 r215, r216, r217; +} +{ +add.f16x2 r218, r219, r220; +} +{ +sub.f16x2 r221, r216, r217; +} +{ +sub.f16x2 r224, r219, r220; +} +bfe.u32 r520, r5, 4, 5; +cvt.rn.f32.u32 f70, r520; +mul.f32 f71, f70, 0f3DC90FDB; +cos.approx.f32 f30, f71; 
+sin.approx.f32 f72, f71; +neg.f32 f31, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f31; +mov.b32 r227, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r230, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r232, {high, high}; +} +{ +mul.f16x2 r234, r224, r232; +} +{ +neg.f16x2 r237, r234; +} +{ +fma.rn.f16x2 r239, r221, r230, r237; +} +{ +mul.f16x2 r243, r221, r232; +} +{ +fma.rn.f16x2 r246, r224, r230, r243; +} +and.b32 r521, r499, 120; +add.s32 r522, r493, r521; +barrier.sync 0; +and.b32 r523, r491, 7936; +add.s32 r524, r522, r523; +st.shared.u32 [r524], r215; +st.shared.u32 [r524+4], r218; +st.shared.u32 [r524+128], r239; +st.shared.u32 [r524+132], r246; +barrier.sync 0; +and.b32 r525, r499, 3968; +sub.s32 r526, r524, r525; +ld.shared.u32 r268, [r526]; +ld.shared.u32 r271, [r526+4]; +ld.shared.u32 r269, [r526+4096]; +ld.shared.u32 r272, [r526+4100]; +{ +add.f16x2 r267, r268, r269; +} +{ +add.f16x2 r270, r271, r272; +} +{ +sub.f16x2 r273, r268, r269; +} +{ +sub.f16x2 r276, r271, r272; +} +bfe.u32 r527, r5, 5, 4; +cvt.rn.f32.u32 f73, r527; +mul.f32 f74, f73, 0f3E490FDB; +cos.approx.f32 f36, f74; +sin.approx.f32 f75, f74; +neg.f32 f37, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r279, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r282, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r284, {high, high}; +} +{ +mul.f16x2 r286, r276, r284; +} +{ +neg.f16x2 r289, r286; +} +{ +fma.rn.f16x2 r291, r273, r282, r289; +} +{ +mul.f16x2 r295, r273, r284; +} +{ +fma.rn.f16x2 r298, r276, r282, r295; +} +and.b32 r528, r499, 248; +add.s32 r529, r493, r528; +barrier.sync 0; +and.b32 r530, r491, 7680; +add.s32 r531, r529, r530; +st.shared.u32 [r531], r267; +st.shared.u32 [r531+4], r270; +st.shared.u32 [r531+256], r291; +st.shared.u32 [r531+260], r298; +barrier.sync 0; +and.b32 r532, 
r499, 3840; +sub.s32 r533, r531, r532; +ld.shared.u32 r320, [r533]; +ld.shared.u32 r323, [r533+4]; +ld.shared.u32 r321, [r533+4096]; +ld.shared.u32 r324, [r533+4100]; +{ +add.f16x2 r319, r320, r321; +} +{ +add.f16x2 r322, r323, r324; +} +{ +sub.f16x2 r325, r320, r321; +} +{ +sub.f16x2 r328, r323, r324; +} +bfe.u32 r534, r5, 6, 3; +cvt.rn.f32.u32 f76, r534; +mul.f32 f77, f76, 0f3EC90FDB; +cos.approx.f32 f42, f77; +sin.approx.f32 f78, f77; +neg.f32 f43, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r334, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r336, {high, high}; +} +{ +mul.f16x2 r338, r328, r336; +} +{ +neg.f16x2 r341, r338; +} +{ +fma.rn.f16x2 r343, r325, r334, r341; +} +{ +mul.f16x2 r347, r325, r336; +} +{ +fma.rn.f16x2 r350, r328, r334, r347; +} +and.b32 r535, r499, 504; +add.s32 r536, r493, r535; +barrier.sync 0; +and.b32 r537, r491, 7168; +add.s32 r538, r536, r537; +st.shared.u32 [r538], r319; +st.shared.u32 [r538+4], r322; +st.shared.u32 [r538+512], r343; +st.shared.u32 [r538+516], r350; +barrier.sync 0; +and.b32 r539, r499, 3584; +sub.s32 r540, r538, r539; +ld.shared.u32 r372, [r540]; +ld.shared.u32 r375, [r540+4]; +ld.shared.u32 r373, [r540+4096]; +ld.shared.u32 r376, [r540+4100]; +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +bfe.u32 r541, r5, 7, 2; +cvt.rn.f32.u32 f79, r541; +mul.f32 f80, f79, 0f3F490FDB; +cos.approx.f32 f48, f80; +sin.approx.f32 f81, f80; +neg.f32 f49, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f49; +mov.b32 r383, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r388, {high, high}; +} +{ +mul.f16x2 r390, r380, r388; +} +{ +neg.f16x2 r393, 
r390; +} +{ +fma.rn.f16x2 r395, r377, r386, r393; +} +{ +mul.f16x2 r399, r377, r388; +} +{ +fma.rn.f16x2 r402, r380, r386, r399; +} +and.b32 r542, r499, 1016; +add.s32 r543, r493, r542; +barrier.sync 0; +and.b32 r544, r491, 6144; +add.s32 r545, r543, r544; +st.shared.u32 [r545], r371; +st.shared.u32 [r545+4], r374; +st.shared.u32 [r545+1024], r395; +st.shared.u32 [r545+1028], r402; +barrier.sync 0; +and.b32 r546, r499, 3072; +sub.s32 r547, r545, r546; +ld.shared.u32 r424, [r547]; +ld.shared.u32 r427, [r547+4]; +ld.shared.u32 r425, [r547+4096]; +ld.shared.u32 r428, [r547+4100]; +{ +add.f16x2 r423, r424, r425; +} +{ +add.f16x2 r426, r427, r428; +} +{ +sub.f16x2 r429, r424, r425; +} +{ +sub.f16x2 r432, r427, r428; +} +bfe.u32 r548, r5, 8, 1; +cvt.rn.f32.u32 f82, r548; +mul.f32 f83, f82, 0f3FC90FDB; +cos.approx.f32 f54, f83; +sin.approx.f32 f84, f83; +neg.f32 f55, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f55; +mov.b32 r435, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r438, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r440, {high, high}; +} +{ +mul.f16x2 r442, r432, r440; +} +{ +neg.f16x2 r445, r442; +} +{ +fma.rn.f16x2 r447, r429, r438, r445; +} +{ +mul.f16x2 r451, r429, r440; +} +{ +fma.rn.f16x2 r454, r432, r438, r451; +} +and.b32 r549, r499, 2040; +add.s32 r550, r493, r549; +barrier.sync 0; +and.b32 r551, r491, 4096; +add.s32 r552, r550, r551; +st.shared.u32 [r552], r423; +st.shared.u32 [r552+4], r426; +st.shared.u32 [r552+2048], r447; +st.shared.u32 [r552+2052], r454; +barrier.sync 0; +and.b32 r553, r499, 2048; +sub.s32 r554, r552, r553; +ld.shared.u32 r476, [r554]; +ld.shared.u32 r479, [r554+4]; +ld.shared.u32 r477, [r554+4096]; +ld.shared.u32 r480, [r554+4100]; +{ +add.f16x2 %0, r476, r477; +} +{ +add.f16x2 %1, r479, r480; +} +{ +sub.f16x2 %2, r476, r477; +} +{ +sub.f16x2 %3, r479, r480; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<836, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<86>; +.reg .b32 r<556>; +.reg .b64 rd<2>; +{ +add.f16x2 r5, %5, %7; +} +{ +add.f16x2 r8, %6, %8; +} +{ +sub.f16x2 r11, %5, %7; +} +{ +sub.f16x2 r14, %6, %8; +} +mov.u32 r17, %tid.x; +and.b32 r18, r17, 511; +cvt.rn.f32.u32 f5, r18; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r18, 255; +mov.f32 f85, 0f3BC90F88; +@p1 bra LBB9_2; +cos.approx.f32 f85, f1; +LBB9_2: +mov.u32 r487, %tid.y; +shl.b32 r488, r487, 12; +mov.u32 r489, %4; +add.s32 r490, r489, r488; +shl.b32 r492, r17, 3; +and.b32 r493, r492, -4096; +add.s32 r494, r490, r493; +sin.approx.f32 f60, f1; +neg.f32 f7, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f7; +mov.b32 r19, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r22, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r24, {high, high}; +} +{ +mul.f16x2 r26, r14, r24; +} +{ +neg.f16x2 r29, r26; +} +{ +fma.rn.f16x2 r31, r11, r22, r29; +} +{ +mul.f16x2 r35, r11, r24; +} +{ +fma.rn.f16x2 r38, r14, r22, r35; +} +barrier.sync 0; +and.b32 r495, r492, 4088; +add.s32 r496, r494, r495; +st.shared.v2.f32 [r496], {r5, r31}; +barrier.sync 0; +shl.b32 r497, r17, 2; +and.b32 r498, r497, 2044; +sub.s32 r499, r496, r498; +ld.shared.u32 r60, [r499]; +ld.shared.u32 r61, [r499+2048]; +barrier.sync 0; +st.shared.v2.f32 [r496], {r8, r38}; +barrier.sync 0; +ld.shared.u32 r63, [r499]; +ld.shared.u32 r64, [r499+2048]; +{ +add.f16x2 r59, r60, r61; +} +{ +add.f16x2 r62, r63, r64; +} +{ +sub.f16x2 r65, r60, r61; +} +{ +sub.f16x2 r68, r63, r64; +} +bfe.u32 r500, r17, 1, 8; +and.b32 r501, r497, 
4; +add.s32 r502, r494, r501; +cvt.rn.f32.u32 f61, r500; +mul.f32 f62, f61, 0f3C490FDB; +cos.approx.f32 f12, f62; +sin.approx.f32 f63, f62; +neg.f32 f13, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f13; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r68, r76; +} +{ +neg.f16x2 r81, r78; +} +{ +fma.rn.f16x2 r83, r65, r74, r81; +} +{ +mul.f16x2 r87, r65, r76; +} +{ +fma.rn.f16x2 r90, r68, r74, r87; +} +barrier.sync 0; +and.b32 r503, r492, 4080; +add.s32 r504, r502, r503; +st.shared.u32 [r504], r59; +st.shared.u32 [r504+8], r83; +barrier.sync 0; +and.b32 r505, r497, 2040; +sub.s32 r506, r504, r505; +ld.shared.u32 r112, [r506]; +ld.shared.u32 r113, [r506+2048]; +barrier.sync 0; +st.shared.u32 [r504], r62; +st.shared.u32 [r504+8], r90; +barrier.sync 0; +ld.shared.u32 r115, [r506]; +ld.shared.u32 r116, [r506+2048]; +{ +add.f16x2 r111, r112, r113; +} +{ +add.f16x2 r114, r115, r116; +} +{ +sub.f16x2 r117, r112, r113; +} +{ +sub.f16x2 r120, r115, r116; +} +bfe.u32 r507, r17, 2, 7; +and.b32 r508, r497, 12; +add.s32 r509, r494, r508; +cvt.rn.f32.u32 f64, r507; +mul.f32 f65, f64, 0f3CC90FDB; +cos.approx.f32 f18, f65; +sin.approx.f32 f66, f65; +neg.f32 f19, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f19; +mov.b32 r123, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r128, {high, high}; +} +{ +mul.f16x2 r130, r120, r128; +} +{ +neg.f16x2 r133, r130; +} +{ +fma.rn.f16x2 r135, r117, r126, r133; +} +{ +mul.f16x2 r139, r117, r128; +} +{ +fma.rn.f16x2 r142, r120, r126, r139; +} +barrier.sync 0; +and.b32 r510, r492, 4064; +add.s32 r511, r509, r510; +st.shared.u32 [r511], r111; +st.shared.u32 [r511+16], r135; +barrier.sync 0; +and.b32 r512, 
r497, 2032; +sub.s32 r513, r511, r512; +ld.shared.u32 r164, [r513]; +ld.shared.u32 r165, [r513+2048]; +barrier.sync 0; +st.shared.u32 [r511], r114; +st.shared.u32 [r511+16], r142; +barrier.sync 0; +ld.shared.u32 r167, [r513]; +ld.shared.u32 r168, [r513+2048]; +{ +add.f16x2 r163, r164, r165; +} +{ +add.f16x2 r166, r167, r168; +} +{ +sub.f16x2 r169, r164, r165; +} +{ +sub.f16x2 r172, r167, r168; +} +bfe.u32 r514, r17, 3, 6; +and.b32 r515, r497, 28; +add.s32 r516, r494, r515; +cvt.rn.f32.u32 f67, r514; +mul.f32 f68, f67, 0f3D490FDB; +cos.approx.f32 f24, f68; +sin.approx.f32 f69, f68; +neg.f32 f25, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f24; +cvt.rn.f16.f32 high, f25; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r178, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r180, {high, high}; +} +{ +mul.f16x2 r182, r172, r180; +} +{ +neg.f16x2 r185, r182; +} +{ +fma.rn.f16x2 r187, r169, r178, r185; +} +{ +mul.f16x2 r191, r169, r180; +} +{ +fma.rn.f16x2 r194, r172, r178, r191; +} +barrier.sync 0; +and.b32 r517, r492, 4032; +add.s32 r518, r516, r517; +st.shared.u32 [r518], r163; +st.shared.u32 [r518+32], r187; +barrier.sync 0; +and.b32 r519, r497, 2016; +sub.s32 r520, r518, r519; +ld.shared.u32 r216, [r520]; +ld.shared.u32 r217, [r520+2048]; +barrier.sync 0; +st.shared.u32 [r518], r166; +st.shared.u32 [r518+32], r194; +barrier.sync 0; +ld.shared.u32 r219, [r520]; +ld.shared.u32 r220, [r520+2048]; +{ +add.f16x2 r215, r216, r217; +} +{ +add.f16x2 r218, r219, r220; +} +{ +sub.f16x2 r221, r216, r217; +} +{ +sub.f16x2 r224, r219, r220; +} +bfe.u32 r521, r17, 4, 5; +and.b32 r522, r497, 60; +add.s32 r523, r494, r522; +cvt.rn.f32.u32 f70, r521; +mul.f32 f71, f70, 0f3DC90FDB; +cos.approx.f32 f30, f71; +sin.approx.f32 f72, f71; +neg.f32 f31, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f31; +mov.b32 r227, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r227; +mov.b32 r230, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r232, {high, high}; +} +{ +mul.f16x2 r234, r224, r232; +} +{ +neg.f16x2 r237, r234; +} +{ +fma.rn.f16x2 r239, r221, r230, r237; +} +{ +mul.f16x2 r243, r221, r232; +} +{ +fma.rn.f16x2 r246, r224, r230, r243; +} +barrier.sync 0; +and.b32 r524, r492, 3968; +add.s32 r525, r523, r524; +st.shared.u32 [r525], r215; +st.shared.u32 [r525+64], r239; +barrier.sync 0; +and.b32 r526, r497, 1984; +sub.s32 r527, r525, r526; +ld.shared.u32 r268, [r527]; +ld.shared.u32 r269, [r527+2048]; +barrier.sync 0; +st.shared.u32 [r525], r218; +st.shared.u32 [r525+64], r246; +barrier.sync 0; +ld.shared.u32 r271, [r527]; +ld.shared.u32 r272, [r527+2048]; +{ +add.f16x2 r267, r268, r269; +} +{ +add.f16x2 r270, r271, r272; +} +{ +sub.f16x2 r273, r268, r269; +} +{ +sub.f16x2 r276, r271, r272; +} +bfe.u32 r528, r17, 5, 4; +and.b32 r529, r497, 124; +add.s32 r530, r494, r529; +cvt.rn.f32.u32 f73, r528; +mul.f32 f74, f73, 0f3E490FDB; +cos.approx.f32 f36, f74; +sin.approx.f32 f75, f74; +neg.f32 f37, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r279, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r282, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r284, {high, high}; +} +{ +mul.f16x2 r286, r276, r284; +} +{ +neg.f16x2 r289, r286; +} +{ +fma.rn.f16x2 r291, r273, r282, r289; +} +{ +mul.f16x2 r295, r273, r284; +} +{ +fma.rn.f16x2 r298, r276, r282, r295; +} +barrier.sync 0; +and.b32 r531, r492, 3840; +add.s32 r532, r530, r531; +st.shared.u32 [r532], r267; +st.shared.u32 [r532+128], r291; +barrier.sync 0; +and.b32 r533, r497, 1920; +sub.s32 r534, r532, r533; +ld.shared.u32 r320, [r534]; +ld.shared.u32 r321, [r534+2048]; +barrier.sync 0; +st.shared.u32 [r532], r270; +st.shared.u32 [r532+128], r298; +barrier.sync 0; +ld.shared.u32 r323, [r534]; +ld.shared.u32 r324, [r534+2048]; +{ +add.f16x2 r319, r320, 
r321; +} +{ +add.f16x2 r322, r323, r324; +} +{ +sub.f16x2 r325, r320, r321; +} +{ +sub.f16x2 r328, r323, r324; +} +bfe.u32 r535, r17, 6, 3; +and.b32 r536, r497, 252; +add.s32 r537, r494, r536; +cvt.rn.f32.u32 f76, r535; +mul.f32 f77, f76, 0f3EC90FDB; +cos.approx.f32 f42, f77; +sin.approx.f32 f78, f77; +neg.f32 f43, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r334, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r336, {high, high}; +} +{ +mul.f16x2 r338, r328, r336; +} +{ +neg.f16x2 r341, r338; +} +{ +fma.rn.f16x2 r343, r325, r334, r341; +} +{ +mul.f16x2 r347, r325, r336; +} +{ +fma.rn.f16x2 r350, r328, r334, r347; +} +barrier.sync 0; +and.b32 r538, r492, 3584; +add.s32 r539, r537, r538; +st.shared.u32 [r539], r319; +st.shared.u32 [r539+256], r343; +barrier.sync 0; +and.b32 r540, r497, 1792; +sub.s32 r541, r539, r540; +ld.shared.u32 r372, [r541]; +ld.shared.u32 r373, [r541+2048]; +barrier.sync 0; +st.shared.u32 [r539], r322; +st.shared.u32 [r539+256], r350; +barrier.sync 0; +ld.shared.u32 r375, [r541]; +ld.shared.u32 r376, [r541+2048]; +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +bfe.u32 r542, r17, 7, 2; +and.b32 r543, r497, 508; +add.s32 r544, r494, r543; +cvt.rn.f32.u32 f79, r542; +mul.f32 f80, f79, 0f3F490FDB; +cos.approx.f32 f48, f80; +sin.approx.f32 f81, f80; +neg.f32 f49, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f49; +mov.b32 r383, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r388, {high, high}; +} +{ +mul.f16x2 r390, r380, r388; +} +{ +neg.f16x2 r393, r390; +} +{ +fma.rn.f16x2 r395, r377, r386, r393; +} +{ +mul.f16x2 r399, r377, r388; +} +{ +fma.rn.f16x2 
r402, r380, r386, r399; +} +barrier.sync 0; +and.b32 r545, r492, 3072; +add.s32 r546, r544, r545; +st.shared.u32 [r546], r371; +st.shared.u32 [r546+512], r395; +barrier.sync 0; +and.b32 r547, r497, 1536; +sub.s32 r548, r546, r547; +ld.shared.u32 r424, [r548]; +ld.shared.u32 r425, [r548+2048]; +barrier.sync 0; +st.shared.u32 [r546], r374; +st.shared.u32 [r546+512], r402; +barrier.sync 0; +ld.shared.u32 r427, [r548]; +ld.shared.u32 r428, [r548+2048]; +{ +add.f16x2 r423, r424, r425; +} +{ +add.f16x2 r426, r427, r428; +} +{ +sub.f16x2 r429, r424, r425; +} +{ +sub.f16x2 r432, r427, r428; +} +bfe.u32 r549, r17, 8, 1; +and.b32 r550, r497, 1020; +add.s32 r551, r494, r550; +cvt.rn.f32.u32 f82, r549; +mul.f32 f83, f82, 0f3FC90FDB; +cos.approx.f32 f54, f83; +sin.approx.f32 f84, f83; +neg.f32 f55, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f55; +mov.b32 r435, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r438, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r440, {high, high}; +} +{ +mul.f16x2 r442, r432, r440; +} +{ +neg.f16x2 r445, r442; +} +{ +fma.rn.f16x2 r447, r429, r438, r445; +} +{ +mul.f16x2 r451, r429, r440; +} +{ +fma.rn.f16x2 r454, r432, r438, r451; +} +barrier.sync 0; +and.b32 r552, r492, 2048; +add.s32 r553, r551, r552; +st.shared.u32 [r553], r423; +st.shared.u32 [r553+1024], r447; +barrier.sync 0; +and.b32 r554, r497, 1024; +sub.s32 r555, r553, r554; +ld.shared.u32 r476, [r555]; +ld.shared.u32 r477, [r555+2048]; +barrier.sync 0; +st.shared.u32 [r553], r426; +st.shared.u32 [r553+1024], r454; +barrier.sync 0; +ld.shared.u32 r479, [r555]; +ld.shared.u32 r480, [r555+2048]; +{ +add.f16x2 %0, r476, r477; +} +{ +add.f16x2 %1, r479, r480; +} +{ +sub.f16x2 %2, r476, r477; +} +{ +sub.f16x2 %3, r479, r480; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..08e0f4a8f9e92 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp16_inv.hpp.inc @@ -0,0 +1,24942 @@ +#ifndef CUFFTDX_FFT_1024_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_1024_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1031, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<784>; +.reg .b32 r<4651>; +.reg .b64 rd<3>; +mov.u32 r4575, %tid.y; +shl.b32 r4576, r4575, 13; +mov.u32 r4577, %64; +add.s32 r4578, r4577, r4576; +mov.u32 r4579, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} 
+{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %114, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %114, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ 
+sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, 
r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} 
+{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 
r629, %86, %76; +} +{ +add.f16x2 r632, %121, %113; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %113; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ 
+add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 
r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, 
r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; 
+cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ 
+sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ 
+add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4580, r4579, 31; +shl.b32 r4581, r4579, 8; +and.b32 r4582, r4581, -8192; +add.s32 r4583, r4578, r4582; +cvt.rn.f32.u32 f779, r4580; +mul.f32 f780, f779, 0f3BC90FDB; +cos.approx.f32 f357, f780; +sin.approx.f32 f781, f780; +neg.f32 f358, f781; +mov.f32 f783, 0fBF800000; +mov.f32 f782, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, 
r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, 
{high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, 
r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 
r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, 
r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; 
+mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, 
r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 
r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 
r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r4584, r4581, 7936; +add.s32 r4585, r4583, r4584; +st.shared.v4.f32 [r4585], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r4585+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r4585+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r4585+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r4585+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r4585+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r4585+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r4585+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r4585+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r4585+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r4585+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r4585+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r4585+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r4585+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r4585+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r4585+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r4586, r4580, -248, r4585; +ld.shared.u32 r2864, [r4586]; +ld.shared.u32 r2867, [r4586+4]; +ld.shared.u32 r3480, [r4586+256]; +ld.shared.u32 r3483, [r4586+260]; +ld.shared.u32 r3060, [r4586+512]; +ld.shared.u32 r3063, [r4586+516]; +ld.shared.u32 r3676, [r4586+768]; +ld.shared.u32 r3679, [r4586+772]; +ld.shared.u32 r2914, [r4586+1024]; +ld.shared.u32 r2917, [r4586+1028]; +ld.shared.u32 r3530, [r4586+1280]; +ld.shared.u32 r3533, [r4586+1284]; +ld.shared.u32 r3110, [r4586+1536]; +ld.shared.u32 r3113, [r4586+1540]; +ld.shared.u32 r3726, [r4586+1792]; +ld.shared.u32 r3729, [r4586+1796]; +ld.shared.u32 r2876, [r4586+2048]; +ld.shared.u32 r2879, [r4586+2052]; +ld.shared.u32 r3492, [r4586+2304]; +ld.shared.u32 r3495, [r4586+2308]; 
+ld.shared.u32 r3072, [r4586+2560]; +ld.shared.u32 r3075, [r4586+2564]; +ld.shared.u32 r3688, [r4586+2816]; +ld.shared.u32 r3691, [r4586+2820]; +ld.shared.u32 r2926, [r4586+3072]; +ld.shared.u32 r2929, [r4586+3076]; +ld.shared.u32 r3542, [r4586+3328]; +ld.shared.u32 r3545, [r4586+3332]; +ld.shared.u32 r3122, [r4586+3584]; +ld.shared.u32 r3125, [r4586+3588]; +ld.shared.u32 r3738, [r4586+3840]; +ld.shared.u32 r3741, [r4586+3844]; +ld.shared.u32 r2865, [r4586+4096]; +ld.shared.u32 r2868, [r4586+4100]; +ld.shared.u32 r3481, [r4586+4352]; +ld.shared.u32 r3484, [r4586+4356]; +ld.shared.u32 r3061, [r4586+4608]; +ld.shared.u32 r3064, [r4586+4612]; +ld.shared.u32 r3677, [r4586+4864]; +ld.shared.u32 r3680, [r4586+4868]; +ld.shared.u32 r2915, [r4586+5120]; +ld.shared.u32 r2918, [r4586+5124]; +ld.shared.u32 r3531, [r4586+5376]; +ld.shared.u32 r3534, [r4586+5380]; +ld.shared.u32 r3111, [r4586+5632]; +ld.shared.u32 r3114, [r4586+5636]; +ld.shared.u32 r3727, [r4586+5888]; +ld.shared.u32 r3730, [r4586+5892]; +ld.shared.u32 r2877, [r4586+6144]; +ld.shared.u32 r2880, [r4586+6148]; +ld.shared.u32 r3493, [r4586+6400]; +ld.shared.u32 r3496, [r4586+6404]; +ld.shared.u32 r3073, [r4586+6656]; +ld.shared.u32 r3076, [r4586+6660]; +ld.shared.u32 r3689, [r4586+6912]; +ld.shared.u32 r3692, [r4586+6916]; +ld.shared.u32 r2927, [r4586+7168]; +ld.shared.u32 r2930, [r4586+7172]; +ld.shared.u32 r3543, [r4586+7424]; +ld.shared.u32 r3546, [r4586+7428]; +ld.shared.u32 r3123, [r4586+7680]; +ld.shared.u32 r3126, [r4586+7684]; +ld.shared.u32 r3739, [r4586+7936]; +ld.shared.u32 r3742, [r4586+7940]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ 
+sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, 
r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 
r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, 
r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, 
r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ 
+neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; 
+} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; 
+} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 
r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 
high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ 
+mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 
r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 %0, r3383, r3999; +} +{ +add.f16x2 %1, r3386, r4002; +} +{ +sub.f16x2 %32, r3383, r3999; +} +{ +sub.f16x2 %33, r3386, r4002; +} +{ +add.f16x2 %2, r3395, r4163; +} +{ +add.f16x2 %3, r3398, r4169; +} +{ +sub.f16x2 %34, r3395, r4163; +} +{ +sub.f16x2 %35, r3398, r4169; +} +{ +add.f16x2 %4, r3407, r4179; +} +{ +add.f16x2 %5, r3410, r4185; +} +{ +sub.f16x2 %36, r3407, r4179; +} +{ +sub.f16x2 %37, r3410, r4185; +} +{ +add.f16x2 %6, r3419, r4195; +} +{ +add.f16x2 %7, r3422, r4201; +} +{ +sub.f16x2 %38, r3419, r4195; +} +{ +sub.f16x2 %39, r3422, r4201; +} +{ +add.f16x2 %8, r3431, r4211; +} +{ +add.f16x2 %9, r3434, r4217; +} +{ +sub.f16x2 %40, r3431, r4211; +} +{ +sub.f16x2 %41, r3434, r4217; +} +{ +add.f16x2 %10, r3443, r4227; +} +{ +add.f16x2 %11, r3446, r4233; +} +{ +sub.f16x2 %42, r3443, r4227; +} +{ +sub.f16x2 %43, r3446, r4233; +} +{ +add.f16x2 %12, r3455, r4243; +} +{ +add.f16x2 %13, r3458, r4249; +} +{ +sub.f16x2 %44, r3455, r4243; +} +{ +sub.f16x2 %45, r3458, r4249; +} +{ +add.f16x2 %14, r3467, r4259; +} +{ +add.f16x2 %15, r3470, r4265; +} +{ +sub.f16x2 %46, r3467, r4259; +} +{ +sub.f16x2 %47, r3470, r4265; +} +{ +add.f16x2 %16, r3389, r4269; +} +{ +add.f16x2 %17, r3392, r4005; +} +{ +sub.f16x2 %48, r3389, r4269; +} +{ +sub.f16x2 %49, r3392, r4005; +} +{ +add.f16x2 %18, r3401, r4277; +} +{ +add.f16x2 %19, r3404, r4283; +} +{ +sub.f16x2 %50, r3401, r4277; +} +{ +sub.f16x2 %51, r3404, r4283; +} +{ +add.f16x2 %20, r3413, r4293; +} +{ +add.f16x2 %21, r3416, r4299; +} +{ +sub.f16x2 %52, r3413, r4293; +} +{ +sub.f16x2 %53, r3416, r4299; +} +{ +add.f16x2 %22, r3425, r4309; +} +{ +add.f16x2 %23, r3428, r4315; +} +{ +sub.f16x2 %54, r3425, r4309; +} +{ +sub.f16x2 %55, r3428, r4315; +} +{ +add.f16x2 %24, r3437, r4325; +} +{ +add.f16x2 
%25, r3440, r4331; +} +{ +sub.f16x2 %56, r3437, r4325; +} +{ +sub.f16x2 %57, r3440, r4331; +} +{ +add.f16x2 %26, r3449, r4341; +} +{ +add.f16x2 %27, r3452, r4347; +} +{ +sub.f16x2 %58, r3449, r4341; +} +{ +sub.f16x2 %59, r3452, r4347; +} +{ +add.f16x2 %28, r3461, r4357; +} +{ +add.f16x2 %29, r3464, r4363; +} +{ +sub.f16x2 %60, r3461, r4357; +} +{ +sub.f16x2 %61, r3464, r4363; +} +{ +add.f16x2 %30, r3473, r4373; +} +{ +add.f16x2 %31, r3476, r4379; +} +{ +sub.f16x2 %62, r3473, r4373; +} +{ +sub.f16x2 %63, r3476, r4379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), 
"=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), 
"r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1029, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2569>; +.reg .b64 rd<2>; +mov.u32 r2549, %tid.y; +shl.b32 r2550, r2549, 12; +mov.u32 r2551, %32; +add.s32 r2552, r2551, r2550; +mov.u32 r2553, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; 
+} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, 
r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ 
+add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, 
r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2554, r2553, 63; +shl.b32 r2555, r2553, 6; +and.b32 r2556, r2555, -4096; +add.s32 r2557, r2552, r2556; +cvt.rn.f32.u32 f301, r2554; +mul.f32 f302, f301, 
0f3BC90FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; 
+mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 
r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; 
+} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; 
+mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2558, r2555, 4032; +add.s32 r2559, r2557, r2558; +st.shared.v4.f32 [r2559], {r521, r627, r664, r701}; +st.shared.v4.f32 [r2559+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r2559+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r2559+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r2560, r2554, -60, r2559; +ld.shared.u32 r1176, [r2560]; +ld.shared.u32 r1372, [r2560+256]; +ld.shared.u32 r1226, [r2560+512]; +ld.shared.u32 r1422, [r2560+768]; +ld.shared.u32 r1188, [r2560+1024]; +ld.shared.u32 r1384, [r2560+1280]; +ld.shared.u32 r1238, [r2560+1536]; +ld.shared.u32 r1434, [r2560+1792]; +ld.shared.u32 r1177, [r2560+2048]; +ld.shared.u32 r1373, [r2560+2304]; +ld.shared.u32 r1227, [r2560+2560]; +ld.shared.u32 r1423, [r2560+2816]; +ld.shared.u32 r1189, [r2560+3072]; +ld.shared.u32 r1385, [r2560+3328]; +ld.shared.u32 r1239, [r2560+3584]; +ld.shared.u32 r1435, [r2560+3840]; +barrier.sync 0; +st.shared.v4.f32 [r2559], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2559+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2559+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2559+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2560]; +ld.shared.u32 r1375, [r2560+256]; +ld.shared.u32 r1229, [r2560+512]; +ld.shared.u32 r1425, [r2560+768]; +ld.shared.u32 r1191, [r2560+1024]; +ld.shared.u32 r1387, [r2560+1280]; +ld.shared.u32 r1241, [r2560+1536]; +ld.shared.u32 r1437, [r2560+1792]; +ld.shared.u32 r1180, [r2560+2048]; +ld.shared.u32 r1376, [r2560+2304]; +ld.shared.u32 r1230, [r2560+2560]; +ld.shared.u32 r1426, [r2560+2816]; +ld.shared.u32 r1192, [r2560+3072]; 
+ld.shared.u32 r1388, [r2560+3328]; +ld.shared.u32 r1242, [r2560+3584]; +ld.shared.u32 r1438, [r2560+3840]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ 
+fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, 
r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 
r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2561, r2553, 48; +bfe.u32 r2562, r2553, 4, 2; +shl.b32 r2563, r2553, 2; +and.b32 r2564, r2563, 60; +add.s32 r2565, r2557, r2564; +cvt.rn.f32.u32 f304, r2562; +mul.f32 f305, f304, 0f3DC90FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, 
high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, 
r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ 
+fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, 
r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +barrier.sync 0; +and.b32 r2566, r2555, 3072; +add.s32 r2567, r2565, r2566; +st.shared.u32 [r2567], r1695; +st.shared.u32 [r2567+64], r1801; +st.shared.u32 [r2567+128], r1838; +st.shared.u32 [r2567+192], r1875; +st.shared.u32 [r2567+256], r1912; +st.shared.u32 [r2567+320], r1949; +st.shared.u32 [r2567+384], r1986; +st.shared.u32 [r2567+448], r2023; +st.shared.u32 [r2567+512], r2060; +st.shared.u32 [r2567+576], r2097; +st.shared.u32 [r2567+640], r2134; +st.shared.u32 [r2567+704], r2171; +st.shared.u32 [r2567+768], r2208; +st.shared.u32 [r2567+832], r2245; +st.shared.u32 [r2567+896], r2282; +st.shared.u32 [r2567+960], r2319; +barrier.sync 0; +mad.lo.s32 r2568, r2561, -60, r2567; +ld.shared.u32 r2350, [r2568]; +ld.shared.u32 r2400, [r2568+256]; +ld.shared.u32 r2450, [r2568+512]; +ld.shared.u32 r2500, [r2568+768]; +ld.shared.u32 r2362, [r2568+1024]; +ld.shared.u32 r2412, [r2568+1280]; +ld.shared.u32 r2462, [r2568+1536]; +ld.shared.u32 r2512, [r2568+1792]; +ld.shared.u32 r2351, [r2568+2048]; +ld.shared.u32 r2401, [r2568+2304]; +ld.shared.u32 r2451, [r2568+2560]; +ld.shared.u32 r2501, [r2568+2816]; +ld.shared.u32 r2363, [r2568+3072]; +ld.shared.u32 r2413, [r2568+3328]; +ld.shared.u32 r2463, [r2568+3584]; +ld.shared.u32 r2513, [r2568+3840]; +barrier.sync 0; +st.shared.u32 [r2567], r1698; +st.shared.u32 [r2567+64], r1810; +st.shared.u32 [r2567+128], r1847; +st.shared.u32 [r2567+192], r1884; +st.shared.u32 [r2567+256], r1921; +st.shared.u32 [r2567+320], r1958; +st.shared.u32 [r2567+384], r1995; +st.shared.u32 [r2567+448], r2032; +st.shared.u32 [r2567+512], r2069; +st.shared.u32 
[r2567+576], r2106; +st.shared.u32 [r2567+640], r2143; +st.shared.u32 [r2567+704], r2180; +st.shared.u32 [r2567+768], r2217; +st.shared.u32 [r2567+832], r2254; +st.shared.u32 [r2567+896], r2291; +st.shared.u32 [r2567+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2568]; +ld.shared.u32 r2403, [r2568+256]; +ld.shared.u32 r2453, [r2568+512]; +ld.shared.u32 r2503, [r2568+768]; +ld.shared.u32 r2365, [r2568+1024]; +ld.shared.u32 r2415, [r2568+1280]; +ld.shared.u32 r2465, [r2568+1536]; +ld.shared.u32 r2515, [r2568+1792]; +ld.shared.u32 r2354, [r2568+2048]; +ld.shared.u32 r2404, [r2568+2304]; +ld.shared.u32 r2454, [r2568+2560]; +ld.shared.u32 r2504, [r2568+2816]; +ld.shared.u32 r2366, [r2568+3072]; +ld.shared.u32 r2416, [r2568+3328]; +ld.shared.u32 r2466, [r2568+3584]; +ld.shared.u32 r2516, [r2568+3840]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 %0, r2349, r2361; +} +{ +add.f16x2 %1, r2352, r2364; +} +{ +sub.f16x2 %16, r2349, r2361; +} +{ +sub.f16x2 %17, r2352, r2364; +} +{ +add.f16x2 %8, r2355, r2373; +} +{ +add.f16x2 %9, r2358, r2367; +} +{ +sub.f16x2 %24, r2355, r2373; +} +{ +sub.f16x2 %25, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 %2, r2399, r2411; +} +{ +add.f16x2 %3, r2402, r2414; +} +{ +sub.f16x2 %18, r2399, r2411; +} +{ +sub.f16x2 %19, r2402, r2414; +} +{ +add.f16x2 %10, r2405, r2423; +} +{ +add.f16x2 %11, r2408, r2417; +} +{ +sub.f16x2 
%26, r2405, r2423; +} +{ +sub.f16x2 %27, r2408, r2417; +} +{ +add.f16x2 r2449, r2450, r2451; +} +{ +add.f16x2 r2452, r2453, r2454; +} +{ +sub.f16x2 r2455, r2450, r2451; +} +{ +sub.f16x2 r2458, r2453, r2454; +} +{ +add.f16x2 r2461, r2462, r2463; +} +{ +add.f16x2 r2464, r2465, r2466; +} +{ +sub.f16x2 r2467, r2462, r2463; +} +{ +sub.f16x2 r2470, r2465, r2466; +} +{ +neg.f16x2 r2473, r2470; +} +{ +add.f16x2 %4, r2449, r2461; +} +{ +add.f16x2 %5, r2452, r2464; +} +{ +sub.f16x2 %20, r2449, r2461; +} +{ +sub.f16x2 %21, r2452, r2464; +} +{ +add.f16x2 %12, r2455, r2473; +} +{ +add.f16x2 %13, r2458, r2467; +} +{ +sub.f16x2 %28, r2455, r2473; +} +{ +sub.f16x2 %29, r2458, r2467; +} +{ +add.f16x2 r2499, r2500, r2501; +} +{ +add.f16x2 r2502, r2503, r2504; +} +{ +sub.f16x2 r2505, r2500, r2501; +} +{ +sub.f16x2 r2508, r2503, r2504; +} +{ +add.f16x2 r2511, r2512, r2513; +} +{ +add.f16x2 r2514, r2515, r2516; +} +{ +sub.f16x2 r2517, r2512, r2513; +} +{ +sub.f16x2 r2520, r2515, r2516; +} +{ +neg.f16x2 r2523, r2520; +} +{ +add.f16x2 %6, r2499, r2511; +} +{ +add.f16x2 %7, r2502, r2514; +} +{ +sub.f16x2 %22, r2499, r2511; +} +{ +sub.f16x2 %23, r2502, r2514; +} +{ +add.f16x2 %14, r2505, r2523; +} +{ +add.f16x2 %15, r2508, r2517; +} +{ +sub.f16x2 %30, r2505, r2523; +} +{ +sub.f16x2 %31, r2508, r2517; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), 
"=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1032, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1450>; +.reg .b64 rd<2>; +mov.u32 r1423, %tid.y; +shl.b32 r1424, r1423, 12; +mov.u32 r1425, %16; +add.s32 r1426, r1425, r1424; +mov.u32 r1427, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ 
+neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ 
+sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1428, r1427, 127; +shl.b32 r1429, r1427, 5; +and.b32 r1430, r1429, -4096; +add.s32 r1431, r1426, r1430; +cvt.rn.f32.u32 f139, r1428; +mul.f32 f140, f139, 0f3BC90FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, 
r422, r436; +} +barrier.sync 0; +and.b32 r1432, r1429, 4064; +add.s32 r1433, r1431, r1432; +st.shared.v4.f32 [r1433], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1433+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r1434, r1428, -28, r1433; +ld.shared.u32 r460, [r1434]; +ld.shared.u32 r510, [r1434+512]; +ld.shared.u32 r472, [r1434+1024]; +ld.shared.u32 r522, [r1434+1536]; +ld.shared.u32 r461, [r1434+2048]; +ld.shared.u32 r511, [r1434+2560]; +ld.shared.u32 r473, [r1434+3072]; +ld.shared.u32 r523, [r1434+3584]; +barrier.sync 0; +st.shared.v4.f32 [r1433], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1433+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1434]; +ld.shared.u32 r513, [r1434+512]; +ld.shared.u32 r475, [r1434+1024]; +ld.shared.u32 r525, [r1434+1536]; +ld.shared.u32 r464, [r1434+2048]; +ld.shared.u32 r514, [r1434+2560]; +ld.shared.u32 r476, [r1434+3072]; +ld.shared.u32 r526, [r1434+3584]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 
r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1435, r1427, 120; +bfe.u32 r1436, r1427, 3, 4; +shl.b32 r1437, r1427, 2; +and.b32 r1438, r1437, 28; +add.s32 r1439, r1431, r1438; +cvt.rn.f32.u32 f142, r1436; +mul.f32 f143, f142, 0f3D490FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 
r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1440, r1429, 3840; +add.s32 r1441, r1439, r1440; +st.shared.u32 [r1441], r607; +st.shared.u32 [r1441+32], r665; +st.shared.u32 [r1441+64], r702; +st.shared.u32 [r1441+96], r739; +st.shared.u32 [r1441+128], r776; +st.shared.u32 [r1441+160], r813; +st.shared.u32 [r1441+192], r850; +st.shared.u32 [r1441+224], r887; +barrier.sync 0; +mad.lo.s32 r1442, r1435, -28, r1441; +ld.shared.u32 r918, [r1442]; +ld.shared.u32 r968, [r1442+512]; +ld.shared.u32 r930, [r1442+1024]; +ld.shared.u32 r980, [r1442+1536]; +ld.shared.u32 r919, [r1442+2048]; +ld.shared.u32 r969, [r1442+2560]; +ld.shared.u32 r931, [r1442+3072]; +ld.shared.u32 r981, [r1442+3584]; +barrier.sync 0; +st.shared.u32 [r1441], r610; +st.shared.u32 
[r1441+32], r674; +st.shared.u32 [r1441+64], r711; +st.shared.u32 [r1441+96], r748; +st.shared.u32 [r1441+128], r785; +st.shared.u32 [r1441+160], r822; +st.shared.u32 [r1441+192], r859; +st.shared.u32 [r1441+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1442]; +ld.shared.u32 r971, [r1442+512]; +ld.shared.u32 r933, [r1442+1024]; +ld.shared.u32 r983, [r1442+1536]; +ld.shared.u32 r922, [r1442+2048]; +ld.shared.u32 r972, [r1442+2560]; +ld.shared.u32 r934, [r1442+3072]; +ld.shared.u32 r984, [r1442+3584]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; 
+mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1443, r1427, 64; +bfe.u32 r1444, r1427, 6, 1; +and.b32 r1445, r1437, 252; +add.s32 r1446, r1431, r1445; +cvt.rn.f32.u32 f145, r1444; +mul.f32 f146, f145, 0f3EC90FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +barrier.sync 0; +and.b32 r1447, r1429, 2048; +add.s32 r1448, r1446, r1447; +st.shared.u32 [r1448], r1065; +st.shared.u32 [r1448+256], r1123; +st.shared.u32 [r1448+512], r1160; +st.shared.u32 [r1448+768], r1197; +st.shared.u32 [r1448+1024], r1234; +st.shared.u32 [r1448+1280], r1271; +st.shared.u32 [r1448+1536], r1308; +st.shared.u32 [r1448+1792], r1345; +barrier.sync 0; +mad.lo.s32 r1449, r1443, -28, r1448; +ld.shared.u32 r1376, [r1449]; +ld.shared.u32 r1388, [r1449+512]; +ld.shared.u32 r1400, [r1449+1024]; +ld.shared.u32 r1412, [r1449+1536]; +ld.shared.u32 r1377, [r1449+2048]; +ld.shared.u32 r1389, [r1449+2560]; +ld.shared.u32 r1401, [r1449+3072]; +ld.shared.u32 r1413, [r1449+3584]; +barrier.sync 0; +st.shared.u32 [r1448], r1068; +st.shared.u32 [r1448+256], r1132; +st.shared.u32 [r1448+512], r1169; +st.shared.u32 [r1448+768], r1206; +st.shared.u32 [r1448+1024], 
r1243; +st.shared.u32 [r1448+1280], r1280; +st.shared.u32 [r1448+1536], r1317; +st.shared.u32 [r1448+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1449]; +ld.shared.u32 r1391, [r1449+512]; +ld.shared.u32 r1403, [r1449+1024]; +ld.shared.u32 r1415, [r1449+1536]; +ld.shared.u32 r1380, [r1449+2048]; +ld.shared.u32 r1392, [r1449+2560]; +ld.shared.u32 r1404, [r1449+3072]; +ld.shared.u32 r1416, [r1449+3584]; +{ +add.f16x2 %0, r1376, r1377; +} +{ +add.f16x2 %1, r1379, r1380; +} +{ +sub.f16x2 %8, r1376, r1377; +} +{ +sub.f16x2 %9, r1379, r1380; +} +{ +add.f16x2 %2, r1388, r1389; +} +{ +add.f16x2 %3, r1391, r1392; +} +{ +sub.f16x2 %10, r1388, r1389; +} +{ +sub.f16x2 %11, r1391, r1392; +} +{ +add.f16x2 %4, r1400, r1401; +} +{ +add.f16x2 %5, r1403, r1404; +} +{ +sub.f16x2 %12, r1400, r1401; +} +{ +sub.f16x2 %13, r1403, r1404; +} +{ +add.f16x2 %6, r1412, r1413; +} +{ +add.f16x2 %7, r1415, r1416; +} +{ +sub.f16x2 %14, r1412, r1413; +} +{ +sub.f16x2 %15, r1415, r1416; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), 
"r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1033, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<784>; +.reg .b32 r<4651>; +.reg .b64 rd<3>; +mov.u32 r4575, %tid.y; +shl.b32 r4576, r4575, 12; +mov.u32 r4577, %64; +add.s32 r4578, r4577, r4576; +mov.u32 r4579, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, 
r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %114, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %114, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ 
+sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, 
r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %113; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %113; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ 
+sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, 
%98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ 
+sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, 
r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 
r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, 
r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} 
+{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ 
+sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4580, r4579, 31; +shl.b32 r4581, r4579, 7; +and.b32 r4582, r4581, -4096; +add.s32 r4583, r4578, r4582; +cvt.rn.f32.u32 f779, r4580; +mul.f32 f780, f779, 0f3BC90FDB; +cos.approx.f32 f357, f780; +sin.approx.f32 f781, f780; +neg.f32 f358, f781; +mov.f32 f783, 0fBF800000; +mov.f32 f782, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 
r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; 
+cvt.rn.f16.f32 high, f782; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, 
r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, 
r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, 
high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, 
r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ 
+fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f783; +cvt.rn.f16.f32 high, f782; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r4584, r4581, 3968; +add.s32 r4585, r4583, r4584; +st.shared.v4.f32 [r4585], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r4585+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r4585+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r4585+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r4585+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r4585+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r4585+96], {r2574, 
r2611, r2648, r2685}; +st.shared.v4.f32 [r4585+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r4586, r4580, -124, r4585; +ld.shared.u32 r2864, [r4586]; +ld.shared.u32 r3480, [r4586+128]; +ld.shared.u32 r3060, [r4586+256]; +ld.shared.u32 r3676, [r4586+384]; +ld.shared.u32 r2914, [r4586+512]; +ld.shared.u32 r3530, [r4586+640]; +ld.shared.u32 r3110, [r4586+768]; +ld.shared.u32 r3726, [r4586+896]; +ld.shared.u32 r2876, [r4586+1024]; +ld.shared.u32 r3492, [r4586+1152]; +ld.shared.u32 r3072, [r4586+1280]; +ld.shared.u32 r3688, [r4586+1408]; +ld.shared.u32 r2926, [r4586+1536]; +ld.shared.u32 r3542, [r4586+1664]; +ld.shared.u32 r3122, [r4586+1792]; +ld.shared.u32 r3738, [r4586+1920]; +ld.shared.u32 r2865, [r4586+2048]; +ld.shared.u32 r3481, [r4586+2176]; +ld.shared.u32 r3061, [r4586+2304]; +ld.shared.u32 r3677, [r4586+2432]; +ld.shared.u32 r2915, [r4586+2560]; +ld.shared.u32 r3531, [r4586+2688]; +ld.shared.u32 r3111, [r4586+2816]; +ld.shared.u32 r3727, [r4586+2944]; +ld.shared.u32 r2877, [r4586+3072]; +ld.shared.u32 r3493, [r4586+3200]; +ld.shared.u32 r3073, [r4586+3328]; +ld.shared.u32 r3689, [r4586+3456]; +ld.shared.u32 r2927, [r4586+3584]; +ld.shared.u32 r3543, [r4586+3712]; +ld.shared.u32 r3123, [r4586+3840]; +ld.shared.u32 r3739, [r4586+3968]; +barrier.sync 0; +st.shared.v4.f32 [r4585], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r4585+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r4585+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r4585+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r4585+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r4585+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r4585+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r4585+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r4586]; +ld.shared.u32 r3483, [r4586+128]; +ld.shared.u32 r3063, [r4586+256]; +ld.shared.u32 r3679, [r4586+384]; +ld.shared.u32 r2917, [r4586+512]; +ld.shared.u32 r3533, [r4586+640]; 
+ld.shared.u32 r3113, [r4586+768]; +ld.shared.u32 r3729, [r4586+896]; +ld.shared.u32 r2879, [r4586+1024]; +ld.shared.u32 r3495, [r4586+1152]; +ld.shared.u32 r3075, [r4586+1280]; +ld.shared.u32 r3691, [r4586+1408]; +ld.shared.u32 r2929, [r4586+1536]; +ld.shared.u32 r3545, [r4586+1664]; +ld.shared.u32 r3125, [r4586+1792]; +ld.shared.u32 r3741, [r4586+1920]; +ld.shared.u32 r2868, [r4586+2048]; +ld.shared.u32 r3484, [r4586+2176]; +ld.shared.u32 r3064, [r4586+2304]; +ld.shared.u32 r3680, [r4586+2432]; +ld.shared.u32 r2918, [r4586+2560]; +ld.shared.u32 r3534, [r4586+2688]; +ld.shared.u32 r3114, [r4586+2816]; +ld.shared.u32 r3730, [r4586+2944]; +ld.shared.u32 r2880, [r4586+3072]; +ld.shared.u32 r3496, [r4586+3200]; +ld.shared.u32 r3076, [r4586+3328]; +ld.shared.u32 r3692, [r4586+3456]; +ld.shared.u32 r2930, [r4586+3584]; +ld.shared.u32 r3546, [r4586+3712]; +ld.shared.u32 r3126, [r4586+3840]; +ld.shared.u32 r3742, [r4586+3968]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 
r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, 
r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, 
r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, 
r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} 
+{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ 
+mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, 
r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; 
+mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 
r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; 
+cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ 
+mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 %0, r3383, r3999; +} +{ +add.f16x2 %1, r3386, r4002; +} +{ +sub.f16x2 %32, r3383, r3999; +} +{ +sub.f16x2 %33, r3386, r4002; +} +{ +add.f16x2 %2, r3395, r4163; +} +{ +add.f16x2 %3, r3398, r4169; +} +{ +sub.f16x2 %34, r3395, r4163; +} +{ +sub.f16x2 %35, r3398, r4169; +} +{ +add.f16x2 %4, r3407, r4179; +} +{ +add.f16x2 %5, r3410, r4185; +} +{ +sub.f16x2 %36, r3407, r4179; +} +{ +sub.f16x2 %37, 
r3410, r4185; +} +{ +add.f16x2 %6, r3419, r4195; +} +{ +add.f16x2 %7, r3422, r4201; +} +{ +sub.f16x2 %38, r3419, r4195; +} +{ +sub.f16x2 %39, r3422, r4201; +} +{ +add.f16x2 %8, r3431, r4211; +} +{ +add.f16x2 %9, r3434, r4217; +} +{ +sub.f16x2 %40, r3431, r4211; +} +{ +sub.f16x2 %41, r3434, r4217; +} +{ +add.f16x2 %10, r3443, r4227; +} +{ +add.f16x2 %11, r3446, r4233; +} +{ +sub.f16x2 %42, r3443, r4227; +} +{ +sub.f16x2 %43, r3446, r4233; +} +{ +add.f16x2 %12, r3455, r4243; +} +{ +add.f16x2 %13, r3458, r4249; +} +{ +sub.f16x2 %44, r3455, r4243; +} +{ +sub.f16x2 %45, r3458, r4249; +} +{ +add.f16x2 %14, r3467, r4259; +} +{ +add.f16x2 %15, r3470, r4265; +} +{ +sub.f16x2 %46, r3467, r4259; +} +{ +sub.f16x2 %47, r3470, r4265; +} +{ +add.f16x2 %16, r3389, r4269; +} +{ +add.f16x2 %17, r3392, r4005; +} +{ +sub.f16x2 %48, r3389, r4269; +} +{ +sub.f16x2 %49, r3392, r4005; +} +{ +add.f16x2 %18, r3401, r4277; +} +{ +add.f16x2 %19, r3404, r4283; +} +{ +sub.f16x2 %50, r3401, r4277; +} +{ +sub.f16x2 %51, r3404, r4283; +} +{ +add.f16x2 %20, r3413, r4293; +} +{ +add.f16x2 %21, r3416, r4299; +} +{ +sub.f16x2 %52, r3413, r4293; +} +{ +sub.f16x2 %53, r3416, r4299; +} +{ +add.f16x2 %22, r3425, r4309; +} +{ +add.f16x2 %23, r3428, r4315; +} +{ +sub.f16x2 %54, r3425, r4309; +} +{ +sub.f16x2 %55, r3428, r4315; +} +{ +add.f16x2 %24, r3437, r4325; +} +{ +add.f16x2 %25, r3440, r4331; +} +{ +sub.f16x2 %56, r3437, r4325; +} +{ +sub.f16x2 %57, r3440, r4331; +} +{ +add.f16x2 %26, r3449, r4341; +} +{ +add.f16x2 %27, r3452, r4347; +} +{ +sub.f16x2 %58, r3449, r4341; +} +{ +sub.f16x2 %59, r3452, r4347; +} +{ +add.f16x2 %28, r3461, r4357; +} +{ +add.f16x2 %29, r3464, r4363; +} +{ +sub.f16x2 %60, r3461, r4357; +} +{ +sub.f16x2 %61, r3464, r4363; +} +{ +add.f16x2 %30, r3473, r4373; +} +{ +add.f16x2 %31, r3476, r4379; +} +{ +sub.f16x2 %62, r3473, r4373; +} +{ +sub.f16x2 %63, r3476, r4379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): 
"r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), 
"r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1030, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1450>; +.reg .b64 rd<2>; +mov.u32 r1423, %tid.y; +shl.b32 r1424, r1423, 13; +mov.u32 r1425, %16; +add.s32 r1426, r1425, r1424; +mov.u32 r1427, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 
r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1428, r1427, 127; +shl.b32 r1429, r1427, 6; +and.b32 r1430, r1429, -8192; +add.s32 r1431, r1426, r1430; +cvt.rn.f32.u32 f139, r1428; +mul.f32 f140, f139, 0f3BC90FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, 
r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, 
high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; 
+mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1432, r1429, 8128; +add.s32 r1433, r1431, r1432; +st.shared.v4.f32 [r1433], {r149, r152, r207, r216}; +st.shared.v4.f32 [r1433+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1433+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1433+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1434, r1428, -56, r1433; +ld.shared.u32 r460, [r1434]; +ld.shared.u32 r463, [r1434+4]; +ld.shared.u32 r510, [r1434+1024]; +ld.shared.u32 r513, [r1434+1028]; +ld.shared.u32 r472, [r1434+2048]; +ld.shared.u32 r475, [r1434+2052]; +ld.shared.u32 r522, [r1434+3072]; +ld.shared.u32 r525, [r1434+3076]; +ld.shared.u32 r461, [r1434+4096]; +ld.shared.u32 r464, [r1434+4100]; +ld.shared.u32 r511, [r1434+5120]; +ld.shared.u32 r514, [r1434+5124]; +ld.shared.u32 r473, [r1434+6144]; +ld.shared.u32 r476, [r1434+6148]; +ld.shared.u32 r523, [r1434+7168]; +ld.shared.u32 r526, [r1434+7172]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, 
r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, 
r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1435, r1427, 120; +bfe.u32 r1436, r1427, 3, 4; +cvt.rn.f32.u32 f142, r1436; +mul.f32 f143, f142, 0f3D490FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ 
+mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ 
+mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1437, r1427, 3; +and.b32 r1438, r1437, 56; +add.s32 r1439, r1431, r1438; +barrier.sync 0; +and.b32 r1440, r1429, 7680; +add.s32 r1441, r1439, 
r1440; +st.shared.u32 [r1441], r607; +st.shared.u32 [r1441+4], r610; +st.shared.u32 [r1441+64], r665; +st.shared.u32 [r1441+68], r674; +st.shared.u32 [r1441+128], r702; +st.shared.u32 [r1441+132], r711; +st.shared.u32 [r1441+192], r739; +st.shared.u32 [r1441+196], r748; +st.shared.u32 [r1441+256], r776; +st.shared.u32 [r1441+260], r785; +st.shared.u32 [r1441+320], r813; +st.shared.u32 [r1441+324], r822; +st.shared.u32 [r1441+384], r850; +st.shared.u32 [r1441+388], r859; +st.shared.u32 [r1441+448], r887; +st.shared.u32 [r1441+452], r896; +barrier.sync 0; +mad.lo.s32 r1442, r1435, -56, r1441; +ld.shared.u32 r918, [r1442]; +ld.shared.u32 r921, [r1442+4]; +ld.shared.u32 r968, [r1442+1024]; +ld.shared.u32 r971, [r1442+1028]; +ld.shared.u32 r930, [r1442+2048]; +ld.shared.u32 r933, [r1442+2052]; +ld.shared.u32 r980, [r1442+3072]; +ld.shared.u32 r983, [r1442+3076]; +ld.shared.u32 r919, [r1442+4096]; +ld.shared.u32 r922, [r1442+4100]; +ld.shared.u32 r969, [r1442+5120]; +ld.shared.u32 r972, [r1442+5124]; +ld.shared.u32 r931, [r1442+6144]; +ld.shared.u32 r934, [r1442+6148]; +ld.shared.u32 r981, [r1442+7168]; +ld.shared.u32 r984, [r1442+7172]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 
r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1443, r1427, 64; +bfe.u32 r1444, r1427, 6, 1; +cvt.rn.f32.u32 f145, r1444; +mul.f32 f146, f145, 0f3EC90FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 
r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +and.b32 r1445, r1437, 504; +add.s32 r1446, r1431, r1445; +barrier.sync 0; +and.b32 r1447, r1429, 4096; +add.s32 r1448, r1446, r1447; +st.shared.u32 [r1448], r1065; +st.shared.u32 [r1448+4], r1068; +st.shared.u32 [r1448+512], r1123; +st.shared.u32 [r1448+516], r1132; +st.shared.u32 
[r1448+1024], r1160; +st.shared.u32 [r1448+1028], r1169; +st.shared.u32 [r1448+1536], r1197; +st.shared.u32 [r1448+1540], r1206; +st.shared.u32 [r1448+2048], r1234; +st.shared.u32 [r1448+2052], r1243; +st.shared.u32 [r1448+2560], r1271; +st.shared.u32 [r1448+2564], r1280; +st.shared.u32 [r1448+3072], r1308; +st.shared.u32 [r1448+3076], r1317; +st.shared.u32 [r1448+3584], r1345; +st.shared.u32 [r1448+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1449, r1443, -56, r1448; +ld.shared.u32 r1376, [r1449]; +ld.shared.u32 r1379, [r1449+4]; +ld.shared.u32 r1388, [r1449+1024]; +ld.shared.u32 r1391, [r1449+1028]; +ld.shared.u32 r1400, [r1449+2048]; +ld.shared.u32 r1403, [r1449+2052]; +ld.shared.u32 r1412, [r1449+3072]; +ld.shared.u32 r1415, [r1449+3076]; +ld.shared.u32 r1377, [r1449+4096]; +ld.shared.u32 r1380, [r1449+4100]; +ld.shared.u32 r1389, [r1449+5120]; +ld.shared.u32 r1392, [r1449+5124]; +ld.shared.u32 r1401, [r1449+6144]; +ld.shared.u32 r1404, [r1449+6148]; +ld.shared.u32 r1413, [r1449+7168]; +ld.shared.u32 r1416, [r1449+7172]; +{ +add.f16x2 %0, r1376, r1377; +} +{ +add.f16x2 %1, r1379, r1380; +} +{ +sub.f16x2 %8, r1376, r1377; +} +{ +sub.f16x2 %9, r1379, r1380; +} +{ +add.f16x2 %2, r1388, r1389; +} +{ +add.f16x2 %3, r1391, r1392; +} +{ +sub.f16x2 %10, r1388, r1389; +} +{ +sub.f16x2 %11, r1391, r1392; +} +{ +add.f16x2 %4, r1400, r1401; +} +{ +add.f16x2 %5, r1403, r1404; +} +{ +sub.f16x2 %12, r1400, r1401; +} +{ +sub.f16x2 %13, r1403, r1404; +} +{ +add.f16x2 %6, r1412, r1413; +} +{ +add.f16x2 %7, r1415, r1416; +} +{ +sub.f16x2 %14, r1412, r1413; +} +{ +sub.f16x2 %15, r1415, r1416; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1034, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2569>; +.reg .b64 rd<2>; +mov.u32 r2549, %tid.y; +shl.b32 r2550, r2549, 13; +mov.u32 r2551, %32; +add.s32 r2552, r2551, r2550; +mov.u32 r2553, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; 
+} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, 
r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ 
+add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, 
r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2554, r2553, 63; +shl.b32 r2555, r2553, 7; +and.b32 r2556, r2555, -8192; +add.s32 r2557, r2552, r2556; +cvt.rn.f32.u32 f301, r2554; +mul.f32 f302, f301, 0f3BC90FDB; +cos.approx.f32 f117, f302; 
+sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ 
+mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, 
r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ 
+mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 
r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2558, r2555, 8064; +add.s32 r2559, r2557, r2558; +st.shared.v4.f32 [r2559], {r521, r524, r627, r636}; +st.shared.v4.f32 [r2559+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r2559+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r2559+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r2559+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r2559+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r2559+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r2559+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r2560, r2554, -120, r2559; +ld.shared.u32 r1176, [r2560]; +ld.shared.u32 r1179, [r2560+4]; +ld.shared.u32 r1372, [r2560+512]; +ld.shared.u32 r1375, [r2560+516]; +ld.shared.u32 r1226, [r2560+1024]; +ld.shared.u32 r1229, [r2560+1028]; +ld.shared.u32 r1422, [r2560+1536]; +ld.shared.u32 r1425, [r2560+1540]; +ld.shared.u32 r1188, [r2560+2048]; +ld.shared.u32 r1191, [r2560+2052]; +ld.shared.u32 r1384, [r2560+2560]; +ld.shared.u32 r1387, [r2560+2564]; +ld.shared.u32 r1238, [r2560+3072]; +ld.shared.u32 r1241, [r2560+3076]; +ld.shared.u32 r1434, [r2560+3584]; +ld.shared.u32 r1437, [r2560+3588]; +ld.shared.u32 r1177, [r2560+4096]; +ld.shared.u32 r1180, [r2560+4100]; +ld.shared.u32 r1373, [r2560+4608]; +ld.shared.u32 r1376, [r2560+4612]; +ld.shared.u32 r1227, [r2560+5120]; +ld.shared.u32 r1230, [r2560+5124]; +ld.shared.u32 r1423, [r2560+5632]; +ld.shared.u32 r1426, [r2560+5636]; +ld.shared.u32 r1189, [r2560+6144]; +ld.shared.u32 r1192, [r2560+6148]; +ld.shared.u32 r1385, [r2560+6656]; +ld.shared.u32 r1388, [r2560+6660]; +ld.shared.u32 r1239, [r2560+7168]; +ld.shared.u32 r1242, [r2560+7172]; +ld.shared.u32 r1435, 
[r2560+7680]; +ld.shared.u32 r1438, [r2560+7684]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, 
r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 
r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, 
f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; 
+} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2561, r2553, 48; +bfe.u32 r2562, r2553, 4, 2; +cvt.rn.f32.u32 f304, r2562; +mul.f32 f305, f304, 0f3DC90FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 
r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ 
+fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ 
+fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r2563, r2553, 3; +and.b32 r2564, r2563, 120; +add.s32 r2565, r2557, r2564; +barrier.sync 0; +and.b32 r2566, r2555, 6144; +add.s32 r2567, r2565, r2566; +st.shared.u32 [r2567], r1695; +st.shared.u32 [r2567+4], r1698; +st.shared.u32 [r2567+128], r1801; +st.shared.u32 [r2567+132], r1810; +st.shared.u32 [r2567+256], r1838; +st.shared.u32 [r2567+260], r1847; +st.shared.u32 [r2567+384], r1875; +st.shared.u32 [r2567+388], r1884; +st.shared.u32 [r2567+512], r1912; +st.shared.u32 [r2567+516], r1921; +st.shared.u32 [r2567+640], r1949; +st.shared.u32 [r2567+644], r1958; +st.shared.u32 [r2567+768], r1986; +st.shared.u32 [r2567+772], r1995; +st.shared.u32 [r2567+896], r2023; +st.shared.u32 [r2567+900], r2032; +st.shared.u32 [r2567+1024], r2060; +st.shared.u32 [r2567+1028], r2069; +st.shared.u32 [r2567+1152], r2097; +st.shared.u32 [r2567+1156], r2106; +st.shared.u32 [r2567+1280], r2134; +st.shared.u32 [r2567+1284], r2143; +st.shared.u32 [r2567+1408], r2171; +st.shared.u32 [r2567+1412], r2180; +st.shared.u32 [r2567+1536], r2208; +st.shared.u32 [r2567+1540], r2217; +st.shared.u32 [r2567+1664], r2245; +st.shared.u32 [r2567+1668], r2254; +st.shared.u32 [r2567+1792], r2282; +st.shared.u32 [r2567+1796], r2291; +st.shared.u32 [r2567+1920], r2319; +st.shared.u32 [r2567+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2568, r2561, -120, r2567; +ld.shared.u32 r2350, [r2568]; +ld.shared.u32 r2353, [r2568+4]; +ld.shared.u32 r2400, [r2568+512]; +ld.shared.u32 r2403, [r2568+516]; +ld.shared.u32 r2450, [r2568+1024]; +ld.shared.u32 r2453, [r2568+1028]; +ld.shared.u32 r2500, [r2568+1536]; +ld.shared.u32 r2503, [r2568+1540]; +ld.shared.u32 r2362, [r2568+2048]; +ld.shared.u32 r2365, [r2568+2052]; +ld.shared.u32 r2412, [r2568+2560]; +ld.shared.u32 
r2415, [r2568+2564]; +ld.shared.u32 r2462, [r2568+3072]; +ld.shared.u32 r2465, [r2568+3076]; +ld.shared.u32 r2512, [r2568+3584]; +ld.shared.u32 r2515, [r2568+3588]; +ld.shared.u32 r2351, [r2568+4096]; +ld.shared.u32 r2354, [r2568+4100]; +ld.shared.u32 r2401, [r2568+4608]; +ld.shared.u32 r2404, [r2568+4612]; +ld.shared.u32 r2451, [r2568+5120]; +ld.shared.u32 r2454, [r2568+5124]; +ld.shared.u32 r2501, [r2568+5632]; +ld.shared.u32 r2504, [r2568+5636]; +ld.shared.u32 r2363, [r2568+6144]; +ld.shared.u32 r2366, [r2568+6148]; +ld.shared.u32 r2413, [r2568+6656]; +ld.shared.u32 r2416, [r2568+6660]; +ld.shared.u32 r2463, [r2568+7168]; +ld.shared.u32 r2466, [r2568+7172]; +ld.shared.u32 r2513, [r2568+7680]; +ld.shared.u32 r2516, [r2568+7684]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 %0, r2349, r2361; +} +{ +add.f16x2 %1, r2352, r2364; +} +{ +sub.f16x2 %16, r2349, r2361; +} +{ +sub.f16x2 %17, r2352, r2364; +} +{ +add.f16x2 %8, r2355, r2373; +} +{ +add.f16x2 %9, r2358, r2367; +} +{ +sub.f16x2 %24, r2355, r2373; +} +{ +sub.f16x2 %25, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 %2, r2399, r2411; +} +{ +add.f16x2 %3, r2402, r2414; +} +{ +sub.f16x2 %18, r2399, r2411; +} +{ +sub.f16x2 %19, r2402, r2414; +} +{ +add.f16x2 %10, r2405, r2423; +} +{ +add.f16x2 %11, r2408, r2417; +} +{ +sub.f16x2 %26, r2405, r2423; +} +{ +sub.f16x2 %27, r2408, r2417; +} +{ +add.f16x2 
r2449, r2450, r2451; +} +{ +add.f16x2 r2452, r2453, r2454; +} +{ +sub.f16x2 r2455, r2450, r2451; +} +{ +sub.f16x2 r2458, r2453, r2454; +} +{ +add.f16x2 r2461, r2462, r2463; +} +{ +add.f16x2 r2464, r2465, r2466; +} +{ +sub.f16x2 r2467, r2462, r2463; +} +{ +sub.f16x2 r2470, r2465, r2466; +} +{ +neg.f16x2 r2473, r2470; +} +{ +add.f16x2 %4, r2449, r2461; +} +{ +add.f16x2 %5, r2452, r2464; +} +{ +sub.f16x2 %20, r2449, r2461; +} +{ +sub.f16x2 %21, r2452, r2464; +} +{ +add.f16x2 %12, r2455, r2473; +} +{ +add.f16x2 %13, r2458, r2467; +} +{ +sub.f16x2 %28, r2455, r2473; +} +{ +sub.f16x2 %29, r2458, r2467; +} +{ +add.f16x2 r2499, r2500, r2501; +} +{ +add.f16x2 r2502, r2503, r2504; +} +{ +sub.f16x2 r2505, r2500, r2501; +} +{ +sub.f16x2 r2508, r2503, r2504; +} +{ +add.f16x2 r2511, r2512, r2513; +} +{ +add.f16x2 r2514, r2515, r2516; +} +{ +sub.f16x2 r2517, r2512, r2513; +} +{ +sub.f16x2 r2520, r2515, r2516; +} +{ +neg.f16x2 r2523, r2520; +} +{ +add.f16x2 %6, r2499, r2511; +} +{ +add.f16x2 %7, r2502, r2514; +} +{ +sub.f16x2 %22, r2499, r2511; +} +{ +sub.f16x2 %23, r2502, r2514; +} +{ +add.f16x2 %14, r2505, r2523; +} +{ +add.f16x2 %15, r2508, r2517; +} +{ +sub.f16x2 %30, r2505, r2523; +} +{ +sub.f16x2 %31, r2508, r2517; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), 
"=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1035, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<57>; +.reg .b32 r<749>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r32; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, 
r35; +} +{ +add.f16x2 r52, r20, r29; +} +{ +sub.f16x2 r55, r17, r35; +} +{ +sub.f16x2 r58, r20, r29; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 255; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r10, 255; +mov.f32 f56, 0f3BC90F88; +@p1 bra LBB6_2; +cos.approx.f32 f56, f1; +LBB6_2: +sin.approx.f32 f46, f1; +neg.f32 f7, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f7; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +fma.rn.f16x2 r71, r49, r64, r68; +} +{ +mul.f16x2 r75, r49, r66; +} +{ +neg.f16x2 r78, r75; +} +{ +fma.rn.f16x2 r80, r52, r64, r78; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f42, 0fBF800000; +mov.f32 f43, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +fma.rn.f16x2 r108, r43, r101, r105; +} +{ +mul.f16x2 r112, r43, r103; +} +{ +neg.f16x2 r115, r112; +} +{ +fma.rn.f16x2 r117, r46, r101, r115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +fma.rn.f16x2 r145, r55, r138, r142; +} +{ +mul.f16x2 r149, r55, r140; +} +{ +neg.f16x2 r152, r149; +} +{ +fma.rn.f16x2 r154, r58, r138, r152; +} +barrier.sync 0; +mov.u32 r717, %tid.y; +shl.b32 r718, r717, 13; +mov.u32 r719, %8; +add.s32 r720, r719, r718; +shl.b32 r721, r9, 5; +and.b32 r722, r721, -8192; +add.s32 r723, r720, r722; +shl.b32 r724, r10, 5; +add.s32 r725, r723, r724; +st.shared.v4.f32 [r725], {r37, r40, r71, r80}; +st.shared.v4.f32 [r725+16], {r108, r117, r145, r154}; +barrier.sync 0; +mad.lo.s32 r726, r10, -24, r725; +ld.shared.u32 r176, [r726]; +ld.shared.u32 r179, [r726+4]; +ld.shared.u32 r188, [r726+2048]; +ld.shared.u32 r191, [r726+2052]; +ld.shared.u32 r177, [r726+4096]; +ld.shared.u32 r180, [r726+4100]; +ld.shared.u32 r189, [r726+6144]; +ld.shared.u32 r192, [r726+6148]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r196; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r199; +} +{ +add.f16x2 r216, r184, r193; +} +{ +sub.f16x2 r219, r181, r199; +} +{ +sub.f16x2 r222, r184, r193; +} +and.b32 r727, r9, 252; +bfe.u32 r728, r9, 2, 6; +cvt.rn.f32.u32 f47, r728; +mul.f32 f48, f47, 0f3CC90FDB; +cos.approx.f32 f16, f48; +sin.approx.f32 f49, f48; +neg.f32 f17, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f17; +mov.b32 r225, {low, 
high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +fma.rn.f16x2 r235, r213, r228, r232; +} +{ +mul.f16x2 r239, r213, r230; +} +{ +neg.f16x2 r242, r239; +} +{ +fma.rn.f16x2 r244, r216, r228, r242; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +fma.rn.f16x2 r272, r207, r265, r269; +} +{ +mul.f16x2 r276, r207, r267; +} +{ +neg.f16x2 r279, r276; +} +{ +fma.rn.f16x2 r281, r210, r265, r279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +fma.rn.f16x2 r309, r219, r302, r306; +} +{ +mul.f16x2 r313, r219, r304; +} +{ +neg.f16x2 r316, r313; +} +{ +fma.rn.f16x2 
r318, r222, r302, r316; +} +shl.b32 r729, r9, 3; +and.b32 r730, r729, 24; +add.s32 r731, r723, r730; +barrier.sync 0; +and.b32 r732, r721, 8064; +add.s32 r733, r731, r732; +st.shared.u32 [r733], r201; +st.shared.u32 [r733+4], r204; +st.shared.u32 [r733+32], r235; +st.shared.u32 [r733+36], r244; +st.shared.u32 [r733+64], r272; +st.shared.u32 [r733+68], r281; +st.shared.u32 [r733+96], r309; +st.shared.u32 [r733+100], r318; +barrier.sync 0; +mad.lo.s32 r734, r727, -24, r733; +ld.shared.u32 r340, [r734]; +ld.shared.u32 r343, [r734+4]; +ld.shared.u32 r352, [r734+2048]; +ld.shared.u32 r355, [r734+2052]; +ld.shared.u32 r341, [r734+4096]; +ld.shared.u32 r344, [r734+4100]; +ld.shared.u32 r353, [r734+6144]; +ld.shared.u32 r356, [r734+6148]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r360; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r363; +} +{ +add.f16x2 r380, r348, r357; +} +{ +sub.f16x2 r383, r345, r363; +} +{ +sub.f16x2 r386, r348, r357; +} +and.b32 r735, r9, 240; +bfe.u32 r736, r9, 4, 4; +cvt.rn.f32.u32 f50, r736; +mul.f32 f51, f50, 0f3DC90FDB; +cos.approx.f32 f26, f51; +sin.approx.f32 f52, f51; +neg.f32 f27, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f27; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +fma.rn.f16x2 r399, r377, r392, r396; +} +{ +mul.f16x2 r403, r377, r394; +} +{ +neg.f16x2 r406, r403; +} +{ +fma.rn.f16x2 r408, r380, r392, r406; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +fma.rn.f16x2 r436, r371, r429, r433; +} +{ +mul.f16x2 r440, r371, r431; +} +{ +neg.f16x2 r443, r440; +} +{ +fma.rn.f16x2 r445, r374, r429, r443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +fma.rn.f16x2 r473, r383, r466, r470; +} +{ +mul.f16x2 r477, r383, r468; +} +{ +neg.f16x2 r480, r477; +} +{ +fma.rn.f16x2 r482, r386, r466, r480; +} +and.b32 r737, r729, 120; +add.s32 r738, r723, r737; +barrier.sync 0; +and.b32 r739, r721, 7680; +add.s32 r740, r738, r739; +st.shared.u32 [r740], r365; +st.shared.u32 [r740+4], r368; +st.shared.u32 [r740+128], r399; +st.shared.u32 [r740+132], r408; +st.shared.u32 [r740+256], r436; +st.shared.u32 [r740+260], r445; +st.shared.u32 [r740+384], r473; +st.shared.u32 
[r740+388], r482; +barrier.sync 0; +mad.lo.s32 r741, r735, -24, r740; +ld.shared.u32 r504, [r741]; +ld.shared.u32 r507, [r741+4]; +ld.shared.u32 r516, [r741+2048]; +ld.shared.u32 r519, [r741+2052]; +ld.shared.u32 r505, [r741+4096]; +ld.shared.u32 r508, [r741+4100]; +ld.shared.u32 r517, [r741+6144]; +ld.shared.u32 r520, [r741+6148]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r524; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r527; +} +{ +add.f16x2 r544, r512, r521; +} +{ +sub.f16x2 r547, r509, r527; +} +{ +sub.f16x2 r550, r512, r521; +} +and.b32 r742, r9, 192; +bfe.u32 r743, r9, 6, 2; +cvt.rn.f32.u32 f53, r743; +mul.f32 f54, f53, 0f3EC90FDB; +cos.approx.f32 f36, f54; +sin.approx.f32 f55, f54; +neg.f32 f37, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +fma.rn.f16x2 r563, r541, r556, r560; +} +{ +mul.f16x2 r567, r541, r558; +} +{ +neg.f16x2 r570, r567; +} +{ +fma.rn.f16x2 r572, r544, r556, r570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ 
+fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +fma.rn.f16x2 r600, r535, r593, r597; +} +{ +mul.f16x2 r604, r535, r595; +} +{ +neg.f16x2 r607, r604; +} +{ +fma.rn.f16x2 r609, r538, r593, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +fma.rn.f16x2 r637, r547, r630, r634; +} +{ +mul.f16x2 r641, r547, r632; +} +{ +neg.f16x2 r644, r641; +} +{ +fma.rn.f16x2 r646, r550, r630, r644; +} +and.b32 r744, r729, 504; +add.s32 r745, r723, r744; +barrier.sync 0; +and.b32 r746, r721, 6144; +add.s32 r747, r745, r746; +st.shared.u32 [r747], r529; +st.shared.u32 [r747+4], r532; +st.shared.u32 [r747+512], r563; +st.shared.u32 [r747+516], r572; +st.shared.u32 [r747+1024], r600; +st.shared.u32 [r747+1028], r609; +st.shared.u32 [r747+1536], r637; +st.shared.u32 [r747+1540], r646; +barrier.sync 0; +mad.lo.s32 r748, r742, -24, r747; +ld.shared.u32 r668, [r748]; +ld.shared.u32 r671, [r748+4]; +ld.shared.u32 r680, [r748+2048]; +ld.shared.u32 r683, [r748+2052]; +ld.shared.u32 r669, [r748+4096]; +ld.shared.u32 r672, [r748+4100]; +ld.shared.u32 r681, [r748+6144]; +ld.shared.u32 r684, [r748+6148]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} 
+{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 %0, r667, r679; +} +{ +add.f16x2 %1, r670, r682; +} +{ +sub.f16x2 %4, r667, r679; +} +{ +sub.f16x2 %5, r670, r682; +} +{ +add.f16x2 %2, r673, r691; +} +{ +add.f16x2 %3, r676, r685; +} +{ +sub.f16x2 %6, r673, r691; +} +{ +sub.f16x2 %7, r676, r685; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1036, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<57>; +.reg .b32 r<749>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r32; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r35; +} +{ +add.f16x2 r52, r20, r29; +} +{ +sub.f16x2 r55, r17, r35; +} +{ +sub.f16x2 r58, r20, r29; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 255; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r10, 255; +mov.f32 f56, 0f3BC90F88; +@p1 bra LBB7_2; +cos.approx.f32 
f56, f1; +LBB7_2: +mov.u32 r717, %tid.y; +shl.b32 r718, r717, 12; +mov.u32 r719, %8; +add.s32 r720, r719, r718; +shl.b32 r721, r9, 4; +and.b32 r722, r721, -4096; +add.s32 r723, r720, r722; +sin.approx.f32 f46, f1; +neg.f32 f7, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f7; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +fma.rn.f16x2 r71, r49, r64, r68; +} +{ +mul.f16x2 r75, r49, r66; +} +{ +neg.f16x2 r78, r75; +} +{ +fma.rn.f16x2 r80, r52, r64, r78; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f42, 0fBF800000; +mov.f32 f43, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +fma.rn.f16x2 r108, r43, r101, r105; +} +{ +mul.f16x2 r112, r43, r103; +} +{ +neg.f16x2 r115, r112; +} +{ +fma.rn.f16x2 r117, r46, r101, r115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, 
r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +fma.rn.f16x2 r145, r55, r138, r142; +} +{ +mul.f16x2 r149, r55, r140; +} +{ +neg.f16x2 r152, r149; +} +{ +fma.rn.f16x2 r154, r58, r138, r152; +} +barrier.sync 0; +shl.b32 r724, r10, 4; +add.s32 r725, r723, r724; +st.shared.v4.f32 [r725], {r37, r71, r108, r145}; +barrier.sync 0; +mad.lo.s32 r726, r10, -12, r725; +ld.shared.u32 r176, [r726]; +ld.shared.u32 r188, [r726+1024]; +ld.shared.u32 r177, [r726+2048]; +ld.shared.u32 r189, [r726+3072]; +barrier.sync 0; +st.shared.v4.f32 [r725], {r40, r80, r117, r154}; +barrier.sync 0; +ld.shared.u32 r179, [r726]; +ld.shared.u32 r191, [r726+1024]; +ld.shared.u32 r180, [r726+2048]; +ld.shared.u32 r192, [r726+3072]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r196; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r199; +} +{ +add.f16x2 r216, r184, r193; +} +{ +sub.f16x2 r219, r181, r199; +} +{ +sub.f16x2 r222, r184, r193; +} +and.b32 r727, r9, 252; +bfe.u32 r728, r9, 2, 6; +shl.b32 r729, r9, 2; +and.b32 r730, r729, 12; +add.s32 r731, r723, r730; +cvt.rn.f32.u32 f47, r728; +mul.f32 f48, f47, 0f3CC90FDB; +cos.approx.f32 f16, f48; +sin.approx.f32 f49, f48; +neg.f32 f17, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f17; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ 
+mul.f16x2 r232, r216, r230; +} +{ +fma.rn.f16x2 r235, r213, r228, r232; +} +{ +mul.f16x2 r239, r213, r230; +} +{ +neg.f16x2 r242, r239; +} +{ +fma.rn.f16x2 r244, r216, r228, r242; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +fma.rn.f16x2 r272, r207, r265, r269; +} +{ +mul.f16x2 r276, r207, r267; +} +{ +neg.f16x2 r279, r276; +} +{ +fma.rn.f16x2 r281, r210, r265, r279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +fma.rn.f16x2 r309, r219, r302, r306; +} +{ +mul.f16x2 r313, r219, r304; +} +{ +neg.f16x2 r316, r313; +} +{ +fma.rn.f16x2 r318, r222, r302, r316; +} +barrier.sync 0; +and.b32 r732, r721, 4032; +add.s32 r733, r731, r732; +st.shared.u32 [r733], r201; +st.shared.u32 [r733+16], r235; +st.shared.u32 
[r733+32], r272; +st.shared.u32 [r733+48], r309; +barrier.sync 0; +mad.lo.s32 r734, r727, -12, r733; +ld.shared.u32 r340, [r734]; +ld.shared.u32 r352, [r734+1024]; +ld.shared.u32 r341, [r734+2048]; +ld.shared.u32 r353, [r734+3072]; +barrier.sync 0; +st.shared.u32 [r733], r204; +st.shared.u32 [r733+16], r244; +st.shared.u32 [r733+32], r281; +st.shared.u32 [r733+48], r318; +barrier.sync 0; +ld.shared.u32 r343, [r734]; +ld.shared.u32 r355, [r734+1024]; +ld.shared.u32 r344, [r734+2048]; +ld.shared.u32 r356, [r734+3072]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r360; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r363; +} +{ +add.f16x2 r380, r348, r357; +} +{ +sub.f16x2 r383, r345, r363; +} +{ +sub.f16x2 r386, r348, r357; +} +and.b32 r735, r9, 240; +bfe.u32 r736, r9, 4, 4; +and.b32 r737, r729, 60; +add.s32 r738, r723, r737; +cvt.rn.f32.u32 f50, r736; +mul.f32 f51, f50, 0f3DC90FDB; +cos.approx.f32 f26, f51; +sin.approx.f32 f52, f51; +neg.f32 f27, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f27; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +fma.rn.f16x2 r399, r377, r392, r396; +} +{ +mul.f16x2 r403, r377, r394; +} +{ +neg.f16x2 r406, r403; +} +{ +fma.rn.f16x2 r408, r380, r392, r406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +fma.rn.f16x2 r436, r371, r429, r433; +} +{ +mul.f16x2 r440, r371, r431; +} +{ +neg.f16x2 r443, r440; +} +{ +fma.rn.f16x2 r445, r374, r429, r443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +fma.rn.f16x2 r473, r383, r466, r470; +} +{ +mul.f16x2 r477, r383, r468; +} +{ +neg.f16x2 r480, r477; +} +{ +fma.rn.f16x2 r482, r386, r466, r480; +} +barrier.sync 0; +and.b32 r739, r721, 3840; +add.s32 r740, r738, r739; +st.shared.u32 [r740], r365; +st.shared.u32 [r740+64], r399; +st.shared.u32 [r740+128], r436; +st.shared.u32 [r740+192], r473; +barrier.sync 0; +mad.lo.s32 r741, r735, -12, r740; +ld.shared.u32 r504, [r741]; +ld.shared.u32 r516, [r741+1024]; +ld.shared.u32 r505, [r741+2048]; +ld.shared.u32 r517, [r741+3072]; +barrier.sync 0; +st.shared.u32 [r740], r368; +st.shared.u32 [r740+64], r408; +st.shared.u32 [r740+128], r445; +st.shared.u32 [r740+192], r482; 
+barrier.sync 0; +ld.shared.u32 r507, [r741]; +ld.shared.u32 r519, [r741+1024]; +ld.shared.u32 r508, [r741+2048]; +ld.shared.u32 r520, [r741+3072]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r524; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r527; +} +{ +add.f16x2 r544, r512, r521; +} +{ +sub.f16x2 r547, r509, r527; +} +{ +sub.f16x2 r550, r512, r521; +} +and.b32 r742, r9, 192; +bfe.u32 r743, r9, 6, 2; +and.b32 r744, r729, 252; +add.s32 r745, r723, r744; +cvt.rn.f32.u32 f53, r743; +mul.f32 f54, f53, 0f3EC90FDB; +cos.approx.f32 f36, f54; +sin.approx.f32 f55, f54; +neg.f32 f37, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +fma.rn.f16x2 r563, r541, r556, r560; +} +{ +mul.f16x2 r567, r541, r558; +} +{ +neg.f16x2 r570, r567; +} +{ +fma.rn.f16x2 r572, r544, r556, r570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +fma.rn.f16x2 r600, r535, r593, r597; +} +{ +mul.f16x2 r604, r535, r595; +} +{ +neg.f16x2 r607, r604; +} +{ +fma.rn.f16x2 r609, r538, r593, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +fma.rn.f16x2 r637, r547, r630, r634; +} +{ +mul.f16x2 r641, r547, r632; +} +{ +neg.f16x2 r644, r641; +} +{ +fma.rn.f16x2 r646, r550, r630, r644; +} +barrier.sync 0; +and.b32 r746, r721, 3072; +add.s32 r747, r745, r746; +st.shared.u32 [r747], r529; +st.shared.u32 [r747+256], r563; +st.shared.u32 [r747+512], r600; +st.shared.u32 [r747+768], r637; +barrier.sync 0; +mad.lo.s32 r748, r742, -12, r747; +ld.shared.u32 r668, [r748]; +ld.shared.u32 r680, [r748+1024]; +ld.shared.u32 r669, [r748+2048]; +ld.shared.u32 r681, [r748+3072]; +barrier.sync 0; +st.shared.u32 [r747], r532; +st.shared.u32 [r747+256], r572; +st.shared.u32 [r747+512], r609; +st.shared.u32 [r747+768], r646; +barrier.sync 0; +ld.shared.u32 r671, [r748]; +ld.shared.u32 r683, [r748+1024]; +ld.shared.u32 r672, [r748+2048]; +ld.shared.u32 r684, [r748+3072]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, 
r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 %0, r667, r679; +} +{ +add.f16x2 %1, r670, r682; +} +{ +sub.f16x2 %4, r667, r679; +} +{ +sub.f16x2 %5, r670, r682; +} +{ +add.f16x2 %2, r673, r691; +} +{ +add.f16x2 %3, r676, r685; +} +{ +sub.f16x2 %6, r673, r691; +} +{ +sub.f16x2 %7, r676, r685; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1037, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<86>; +.reg .b32 r<555>; +.reg .b64 rd<2>; +{ +add.f16x2 r7, %5, %7; +} +{ +add.f16x2 r10, %6, %8; +} +{ +sub.f16x2 r13, %5, %7; +} +{ +sub.f16x2 r16, %6, %8; +} +mov.u32 r5, %tid.x; +and.b32 r6, r5, 511; +cvt.rn.f32.u32 f5, r6; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r6, 255; +mov.f32 f85, 0f3BC90F88; +@p1 bra LBB8_2; +cos.approx.f32 f85, f1; +LBB8_2: +sin.approx.f32 f60, f1; +neg.f32 f7, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f7; +mov.b32 r19, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r22, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r24, {high, high}; +} +{ +mul.f16x2 r26, r16, r24; +} +{ +fma.rn.f16x2 r29, r13, r22, r26; +} +{ +mul.f16x2 r33, r13, r24; +} +{ +neg.f16x2 r36, r33; +} +{ +fma.rn.f16x2 r38, r16, r22, r36; +} +barrier.sync 0; +mov.u32 r487, %tid.y; +shl.b32 r488, r487, 13; +mov.u32 
r489, %4; +add.s32 r490, r489, r488; +shl.b32 r491, r5, 4; +and.b32 r492, r491, -8192; +add.s32 r493, r490, r492; +shl.b32 r494, r6, 4; +add.s32 r495, r493, r494; +st.shared.v2.f32 [r495], {r7, r10}; +st.shared.v2.f32 [r495+8], {r29, r38}; +barrier.sync 0; +shl.b32 r496, r6, 3; +sub.s32 r497, r495, r496; +ld.shared.u32 r60, [r497]; +ld.shared.u32 r63, [r497+4]; +ld.shared.u32 r61, [r497+4096]; +ld.shared.u32 r64, [r497+4100]; +{ +add.f16x2 r59, r60, r61; +} +{ +add.f16x2 r62, r63, r64; +} +{ +sub.f16x2 r65, r60, r61; +} +{ +sub.f16x2 r68, r63, r64; +} +bfe.u32 r498, r5, 1, 8; +cvt.rn.f32.u32 f61, r498; +mul.f32 f62, f61, 0f3C490FDB; +cos.approx.f32 f12, f62; +sin.approx.f32 f63, f62; +neg.f32 f13, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f13; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r68, r76; +} +{ +fma.rn.f16x2 r81, r65, r74, r78; +} +{ +mul.f16x2 r85, r65, r76; +} +{ +neg.f16x2 r88, r85; +} +{ +fma.rn.f16x2 r90, r68, r74, r88; +} +shl.b32 r499, r5, 3; +and.b32 r500, r499, 8; +add.s32 r501, r493, r500; +barrier.sync 0; +and.b32 r502, r491, 8160; +add.s32 r503, r501, r502; +st.shared.u32 [r503], r59; +st.shared.u32 [r503+4], r62; +st.shared.u32 [r503+16], r81; +st.shared.u32 [r503+20], r90; +barrier.sync 0; +and.b32 r504, r499, 4080; +sub.s32 r505, r503, r504; +ld.shared.u32 r112, [r505]; +ld.shared.u32 r115, [r505+4]; +ld.shared.u32 r113, [r505+4096]; +ld.shared.u32 r116, [r505+4100]; +{ +add.f16x2 r111, r112, r113; +} +{ +add.f16x2 r114, r115, r116; +} +{ +sub.f16x2 r117, r112, r113; +} +{ +sub.f16x2 r120, r115, r116; +} +bfe.u32 r506, r5, 2, 7; +cvt.rn.f32.u32 f64, r506; +mul.f32 f65, f64, 0f3CC90FDB; +cos.approx.f32 f18, f65; +sin.approx.f32 f66, f65; +neg.f32 f19, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f19; +mov.b32 
r123, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r128, {high, high}; +} +{ +mul.f16x2 r130, r120, r128; +} +{ +fma.rn.f16x2 r133, r117, r126, r130; +} +{ +mul.f16x2 r137, r117, r128; +} +{ +neg.f16x2 r140, r137; +} +{ +fma.rn.f16x2 r142, r120, r126, r140; +} +and.b32 r507, r499, 24; +add.s32 r508, r493, r507; +barrier.sync 0; +and.b32 r509, r491, 8128; +add.s32 r510, r508, r509; +st.shared.u32 [r510], r111; +st.shared.u32 [r510+4], r114; +st.shared.u32 [r510+32], r133; +st.shared.u32 [r510+36], r142; +barrier.sync 0; +and.b32 r511, r499, 4064; +sub.s32 r512, r510, r511; +ld.shared.u32 r164, [r512]; +ld.shared.u32 r167, [r512+4]; +ld.shared.u32 r165, [r512+4096]; +ld.shared.u32 r168, [r512+4100]; +{ +add.f16x2 r163, r164, r165; +} +{ +add.f16x2 r166, r167, r168; +} +{ +sub.f16x2 r169, r164, r165; +} +{ +sub.f16x2 r172, r167, r168; +} +bfe.u32 r513, r5, 3, 6; +cvt.rn.f32.u32 f67, r513; +mul.f32 f68, f67, 0f3D490FDB; +cos.approx.f32 f24, f68; +sin.approx.f32 f69, f68; +neg.f32 f25, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f24; +cvt.rn.f16.f32 high, f25; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r178, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r180, {high, high}; +} +{ +mul.f16x2 r182, r172, r180; +} +{ +fma.rn.f16x2 r185, r169, r178, r182; +} +{ +mul.f16x2 r189, r169, r180; +} +{ +neg.f16x2 r192, r189; +} +{ +fma.rn.f16x2 r194, r172, r178, r192; +} +and.b32 r514, r499, 56; +add.s32 r515, r493, r514; +barrier.sync 0; +and.b32 r516, r491, 8064; +add.s32 r517, r515, r516; +st.shared.u32 [r517], r163; +st.shared.u32 [r517+4], r166; +st.shared.u32 [r517+64], r185; +st.shared.u32 [r517+68], r194; +barrier.sync 0; +and.b32 r518, r499, 4032; +sub.s32 r519, r517, r518; +ld.shared.u32 r216, [r519]; +ld.shared.u32 r219, [r519+4]; +ld.shared.u32 r217, [r519+4096]; 
+ld.shared.u32 r220, [r519+4100]; +{ +add.f16x2 r215, r216, r217; +} +{ +add.f16x2 r218, r219, r220; +} +{ +sub.f16x2 r221, r216, r217; +} +{ +sub.f16x2 r224, r219, r220; +} +bfe.u32 r520, r5, 4, 5; +cvt.rn.f32.u32 f70, r520; +mul.f32 f71, f70, 0f3DC90FDB; +cos.approx.f32 f30, f71; +sin.approx.f32 f72, f71; +neg.f32 f31, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f31; +mov.b32 r227, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r230, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r232, {high, high}; +} +{ +mul.f16x2 r234, r224, r232; +} +{ +fma.rn.f16x2 r237, r221, r230, r234; +} +{ +mul.f16x2 r241, r221, r232; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r224, r230, r244; +} +and.b32 r521, r499, 120; +add.s32 r522, r493, r521; +barrier.sync 0; +and.b32 r523, r491, 7936; +add.s32 r524, r522, r523; +st.shared.u32 [r524], r215; +st.shared.u32 [r524+4], r218; +st.shared.u32 [r524+128], r237; +st.shared.u32 [r524+132], r246; +barrier.sync 0; +and.b32 r525, r499, 3968; +sub.s32 r526, r524, r525; +ld.shared.u32 r268, [r526]; +ld.shared.u32 r271, [r526+4]; +ld.shared.u32 r269, [r526+4096]; +ld.shared.u32 r272, [r526+4100]; +{ +add.f16x2 r267, r268, r269; +} +{ +add.f16x2 r270, r271, r272; +} +{ +sub.f16x2 r273, r268, r269; +} +{ +sub.f16x2 r276, r271, r272; +} +bfe.u32 r527, r5, 5, 4; +cvt.rn.f32.u32 f73, r527; +mul.f32 f74, f73, 0f3E490FDB; +cos.approx.f32 f36, f74; +sin.approx.f32 f75, f74; +neg.f32 f37, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r279, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r282, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r284, {high, high}; +} +{ +mul.f16x2 r286, r276, r284; +} +{ +fma.rn.f16x2 r289, r273, r282, r286; +} +{ +mul.f16x2 r293, r273, r284; +} +{ +neg.f16x2 r296, r293; +} +{ +fma.rn.f16x2 r298, r276, r282, r296; +} 
+and.b32 r528, r499, 248; +add.s32 r529, r493, r528; +barrier.sync 0; +and.b32 r530, r491, 7680; +add.s32 r531, r529, r530; +st.shared.u32 [r531], r267; +st.shared.u32 [r531+4], r270; +st.shared.u32 [r531+256], r289; +st.shared.u32 [r531+260], r298; +barrier.sync 0; +and.b32 r532, r499, 3840; +sub.s32 r533, r531, r532; +ld.shared.u32 r320, [r533]; +ld.shared.u32 r323, [r533+4]; +ld.shared.u32 r321, [r533+4096]; +ld.shared.u32 r324, [r533+4100]; +{ +add.f16x2 r319, r320, r321; +} +{ +add.f16x2 r322, r323, r324; +} +{ +sub.f16x2 r325, r320, r321; +} +{ +sub.f16x2 r328, r323, r324; +} +bfe.u32 r534, r5, 6, 3; +cvt.rn.f32.u32 f76, r534; +mul.f32 f77, f76, 0f3EC90FDB; +cos.approx.f32 f42, f77; +sin.approx.f32 f78, f77; +neg.f32 f43, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r334, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r336, {high, high}; +} +{ +mul.f16x2 r338, r328, r336; +} +{ +fma.rn.f16x2 r341, r325, r334, r338; +} +{ +mul.f16x2 r345, r325, r336; +} +{ +neg.f16x2 r348, r345; +} +{ +fma.rn.f16x2 r350, r328, r334, r348; +} +and.b32 r535, r499, 504; +add.s32 r536, r493, r535; +barrier.sync 0; +and.b32 r537, r491, 7168; +add.s32 r538, r536, r537; +st.shared.u32 [r538], r319; +st.shared.u32 [r538+4], r322; +st.shared.u32 [r538+512], r341; +st.shared.u32 [r538+516], r350; +barrier.sync 0; +and.b32 r539, r499, 3584; +sub.s32 r540, r538, r539; +ld.shared.u32 r372, [r540]; +ld.shared.u32 r375, [r540+4]; +ld.shared.u32 r373, [r540+4096]; +ld.shared.u32 r376, [r540+4100]; +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +bfe.u32 r541, r5, 7, 2; +cvt.rn.f32.u32 f79, r541; +mul.f32 f80, f79, 0f3F490FDB; +cos.approx.f32 f48, f80; +sin.approx.f32 f81, f80; +neg.f32 f49, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f48; +cvt.rn.f16.f32 high, f49; +mov.b32 r383, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r388, {high, high}; +} +{ +mul.f16x2 r390, r380, r388; +} +{ +fma.rn.f16x2 r393, r377, r386, r390; +} +{ +mul.f16x2 r397, r377, r388; +} +{ +neg.f16x2 r400, r397; +} +{ +fma.rn.f16x2 r402, r380, r386, r400; +} +and.b32 r542, r499, 1016; +add.s32 r543, r493, r542; +barrier.sync 0; +and.b32 r544, r491, 6144; +add.s32 r545, r543, r544; +st.shared.u32 [r545], r371; +st.shared.u32 [r545+4], r374; +st.shared.u32 [r545+1024], r393; +st.shared.u32 [r545+1028], r402; +barrier.sync 0; +and.b32 r546, r499, 3072; +sub.s32 r547, r545, r546; +ld.shared.u32 r424, [r547]; +ld.shared.u32 r427, [r547+4]; +ld.shared.u32 r425, [r547+4096]; +ld.shared.u32 r428, [r547+4100]; +{ +add.f16x2 r423, r424, r425; +} +{ +add.f16x2 r426, r427, r428; +} +{ +sub.f16x2 r429, r424, r425; +} +{ +sub.f16x2 r432, r427, r428; +} +bfe.u32 r548, r5, 8, 1; +cvt.rn.f32.u32 f82, r548; +mul.f32 f83, f82, 0f3FC90FDB; +cos.approx.f32 f54, f83; +sin.approx.f32 f84, f83; +neg.f32 f55, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f55; +mov.b32 r435, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r438, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r440, {high, high}; +} +{ +mul.f16x2 r442, r432, r440; +} +{ +fma.rn.f16x2 r445, r429, r438, r442; +} +{ +mul.f16x2 r449, r429, r440; +} +{ +neg.f16x2 r452, r449; +} +{ +fma.rn.f16x2 r454, r432, r438, r452; +} +and.b32 r549, r499, 2040; +add.s32 r550, r493, r549; +barrier.sync 0; +and.b32 r551, r491, 4096; +add.s32 r552, r550, r551; +st.shared.u32 [r552], r423; +st.shared.u32 [r552+4], r426; +st.shared.u32 [r552+2048], r445; +st.shared.u32 [r552+2052], r454; +barrier.sync 0; +and.b32 r553, r499, 2048; +sub.s32 r554, r552, r553; +ld.shared.u32 r476, [r554]; +ld.shared.u32 
r479, [r554+4]; +ld.shared.u32 r477, [r554+4096]; +ld.shared.u32 r480, [r554+4100]; +{ +add.f16x2 %0, r476, r477; +} +{ +add.f16x2 %1, r479, r480; +} +{ +sub.f16x2 %2, r476, r477; +} +{ +sub.f16x2 %3, r479, r480; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1038, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<86>; +.reg .b32 r<556>; +.reg .b64 rd<2>; +{ +add.f16x2 r5, %5, %7; +} +{ +add.f16x2 r8, %6, %8; +} +{ +sub.f16x2 r11, %5, %7; +} +{ +sub.f16x2 r14, %6, %8; +} +mov.u32 r17, %tid.x; +and.b32 r18, r17, 511; +cvt.rn.f32.u32 f5, r18; +mul.f32 f1, f5, 0f3BC90FDB; +setp.eq.s32 p1, r18, 255; +mov.f32 f85, 0f3BC90F88; +@p1 bra LBB9_2; +cos.approx.f32 f85, f1; +LBB9_2: +mov.u32 r487, %tid.y; +shl.b32 r488, r487, 12; +mov.u32 r489, %4; +add.s32 r490, r489, r488; +shl.b32 r492, r17, 3; +and.b32 r493, r492, -4096; +add.s32 r494, r490, r493; +sin.approx.f32 f60, f1; +neg.f32 f7, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f7; +mov.b32 r19, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r22, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r19; +mov.b32 r24, {high, high}; +} +{ +mul.f16x2 r26, r14, r24; +} +{ +fma.rn.f16x2 r29, r11, r22, r26; +} +{ +mul.f16x2 r33, r11, r24; +} +{ +neg.f16x2 r36, r33; +} +{ +fma.rn.f16x2 r38, r14, r22, r36; +} +barrier.sync 0; +and.b32 r495, r492, 4088; +add.s32 r496, r494, r495; +st.shared.v2.f32 [r496], {r5, r29}; +barrier.sync 0; +shl.b32 r497, r17, 2; +and.b32 r498, r497, 2044; +sub.s32 r499, r496, r498; +ld.shared.u32 r60, [r499]; +ld.shared.u32 r61, [r499+2048]; +barrier.sync 0; 
+st.shared.v2.f32 [r496], {r8, r38}; +barrier.sync 0; +ld.shared.u32 r63, [r499]; +ld.shared.u32 r64, [r499+2048]; +{ +add.f16x2 r59, r60, r61; +} +{ +add.f16x2 r62, r63, r64; +} +{ +sub.f16x2 r65, r60, r61; +} +{ +sub.f16x2 r68, r63, r64; +} +bfe.u32 r500, r17, 1, 8; +and.b32 r501, r497, 4; +add.s32 r502, r494, r501; +cvt.rn.f32.u32 f61, r500; +mul.f32 f62, f61, 0f3C490FDB; +cos.approx.f32 f12, f62; +sin.approx.f32 f63, f62; +neg.f32 f13, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f13; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r68, r76; +} +{ +fma.rn.f16x2 r81, r65, r74, r78; +} +{ +mul.f16x2 r85, r65, r76; +} +{ +neg.f16x2 r88, r85; +} +{ +fma.rn.f16x2 r90, r68, r74, r88; +} +barrier.sync 0; +and.b32 r503, r492, 4080; +add.s32 r504, r502, r503; +st.shared.u32 [r504], r59; +st.shared.u32 [r504+8], r81; +barrier.sync 0; +and.b32 r505, r497, 2040; +sub.s32 r506, r504, r505; +ld.shared.u32 r112, [r506]; +ld.shared.u32 r113, [r506+2048]; +barrier.sync 0; +st.shared.u32 [r504], r62; +st.shared.u32 [r504+8], r90; +barrier.sync 0; +ld.shared.u32 r115, [r506]; +ld.shared.u32 r116, [r506+2048]; +{ +add.f16x2 r111, r112, r113; +} +{ +add.f16x2 r114, r115, r116; +} +{ +sub.f16x2 r117, r112, r113; +} +{ +sub.f16x2 r120, r115, r116; +} +bfe.u32 r507, r17, 2, 7; +and.b32 r508, r497, 12; +add.s32 r509, r494, r508; +cvt.rn.f32.u32 f64, r507; +mul.f32 f65, f64, 0f3CC90FDB; +cos.approx.f32 f18, f65; +sin.approx.f32 f66, f65; +neg.f32 f19, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f19; +mov.b32 r123, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r126, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r128, {high, high}; +} +{ +mul.f16x2 r130, r120, r128; +} +{ +fma.rn.f16x2 r133, r117, 
r126, r130; +} +{ +mul.f16x2 r137, r117, r128; +} +{ +neg.f16x2 r140, r137; +} +{ +fma.rn.f16x2 r142, r120, r126, r140; +} +barrier.sync 0; +and.b32 r510, r492, 4064; +add.s32 r511, r509, r510; +st.shared.u32 [r511], r111; +st.shared.u32 [r511+16], r133; +barrier.sync 0; +and.b32 r512, r497, 2032; +sub.s32 r513, r511, r512; +ld.shared.u32 r164, [r513]; +ld.shared.u32 r165, [r513+2048]; +barrier.sync 0; +st.shared.u32 [r511], r114; +st.shared.u32 [r511+16], r142; +barrier.sync 0; +ld.shared.u32 r167, [r513]; +ld.shared.u32 r168, [r513+2048]; +{ +add.f16x2 r163, r164, r165; +} +{ +add.f16x2 r166, r167, r168; +} +{ +sub.f16x2 r169, r164, r165; +} +{ +sub.f16x2 r172, r167, r168; +} +bfe.u32 r514, r17, 3, 6; +and.b32 r515, r497, 28; +add.s32 r516, r494, r515; +cvt.rn.f32.u32 f67, r514; +mul.f32 f68, f67, 0f3D490FDB; +cos.approx.f32 f24, f68; +sin.approx.f32 f69, f68; +neg.f32 f25, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f24; +cvt.rn.f16.f32 high, f25; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r178, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r175; +mov.b32 r180, {high, high}; +} +{ +mul.f16x2 r182, r172, r180; +} +{ +fma.rn.f16x2 r185, r169, r178, r182; +} +{ +mul.f16x2 r189, r169, r180; +} +{ +neg.f16x2 r192, r189; +} +{ +fma.rn.f16x2 r194, r172, r178, r192; +} +barrier.sync 0; +and.b32 r517, r492, 4032; +add.s32 r518, r516, r517; +st.shared.u32 [r518], r163; +st.shared.u32 [r518+32], r185; +barrier.sync 0; +and.b32 r519, r497, 2016; +sub.s32 r520, r518, r519; +ld.shared.u32 r216, [r520]; +ld.shared.u32 r217, [r520+2048]; +barrier.sync 0; +st.shared.u32 [r518], r166; +st.shared.u32 [r518+32], r194; +barrier.sync 0; +ld.shared.u32 r219, [r520]; +ld.shared.u32 r220, [r520+2048]; +{ +add.f16x2 r215, r216, r217; +} +{ +add.f16x2 r218, r219, r220; +} +{ +sub.f16x2 r221, r216, r217; +} +{ +sub.f16x2 r224, r219, r220; +} +bfe.u32 r521, r17, 4, 5; +and.b32 r522, r497, 60; +add.s32 r523, r494, r522; 
+cvt.rn.f32.u32 f70, r521; +mul.f32 f71, f70, 0f3DC90FDB; +cos.approx.f32 f30, f71; +sin.approx.f32 f72, f71; +neg.f32 f31, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f31; +mov.b32 r227, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r230, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r227; +mov.b32 r232, {high, high}; +} +{ +mul.f16x2 r234, r224, r232; +} +{ +fma.rn.f16x2 r237, r221, r230, r234; +} +{ +mul.f16x2 r241, r221, r232; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r224, r230, r244; +} +barrier.sync 0; +and.b32 r524, r492, 3968; +add.s32 r525, r523, r524; +st.shared.u32 [r525], r215; +st.shared.u32 [r525+64], r237; +barrier.sync 0; +and.b32 r526, r497, 1984; +sub.s32 r527, r525, r526; +ld.shared.u32 r268, [r527]; +ld.shared.u32 r269, [r527+2048]; +barrier.sync 0; +st.shared.u32 [r525], r218; +st.shared.u32 [r525+64], r246; +barrier.sync 0; +ld.shared.u32 r271, [r527]; +ld.shared.u32 r272, [r527+2048]; +{ +add.f16x2 r267, r268, r269; +} +{ +add.f16x2 r270, r271, r272; +} +{ +sub.f16x2 r273, r268, r269; +} +{ +sub.f16x2 r276, r271, r272; +} +bfe.u32 r528, r17, 5, 4; +and.b32 r529, r497, 124; +add.s32 r530, r494, r529; +cvt.rn.f32.u32 f73, r528; +mul.f32 f74, f73, 0f3E490FDB; +cos.approx.f32 f36, f74; +sin.approx.f32 f75, f74; +neg.f32 f37, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f37; +mov.b32 r279, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r282, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r279; +mov.b32 r284, {high, high}; +} +{ +mul.f16x2 r286, r276, r284; +} +{ +fma.rn.f16x2 r289, r273, r282, r286; +} +{ +mul.f16x2 r293, r273, r284; +} +{ +neg.f16x2 r296, r293; +} +{ +fma.rn.f16x2 r298, r276, r282, r296; +} +barrier.sync 0; +and.b32 r531, r492, 3840; +add.s32 r532, r530, r531; +st.shared.u32 [r532], r267; +st.shared.u32 [r532+128], r289; +barrier.sync 0; +and.b32 r533, r497, 
1920; +sub.s32 r534, r532, r533; +ld.shared.u32 r320, [r534]; +ld.shared.u32 r321, [r534+2048]; +barrier.sync 0; +st.shared.u32 [r532], r270; +st.shared.u32 [r532+128], r298; +barrier.sync 0; +ld.shared.u32 r323, [r534]; +ld.shared.u32 r324, [r534+2048]; +{ +add.f16x2 r319, r320, r321; +} +{ +add.f16x2 r322, r323, r324; +} +{ +sub.f16x2 r325, r320, r321; +} +{ +sub.f16x2 r328, r323, r324; +} +bfe.u32 r535, r17, 6, 3; +and.b32 r536, r497, 252; +add.s32 r537, r494, r536; +cvt.rn.f32.u32 f76, r535; +mul.f32 f77, f76, 0f3EC90FDB; +cos.approx.f32 f42, f77; +sin.approx.f32 f78, f77; +neg.f32 f43, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r334, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r331; +mov.b32 r336, {high, high}; +} +{ +mul.f16x2 r338, r328, r336; +} +{ +fma.rn.f16x2 r341, r325, r334, r338; +} +{ +mul.f16x2 r345, r325, r336; +} +{ +neg.f16x2 r348, r345; +} +{ +fma.rn.f16x2 r350, r328, r334, r348; +} +barrier.sync 0; +and.b32 r538, r492, 3584; +add.s32 r539, r537, r538; +st.shared.u32 [r539], r319; +st.shared.u32 [r539+256], r341; +barrier.sync 0; +and.b32 r540, r497, 1792; +sub.s32 r541, r539, r540; +ld.shared.u32 r372, [r541]; +ld.shared.u32 r373, [r541+2048]; +barrier.sync 0; +st.shared.u32 [r539], r322; +st.shared.u32 [r539+256], r350; +barrier.sync 0; +ld.shared.u32 r375, [r541]; +ld.shared.u32 r376, [r541+2048]; +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +bfe.u32 r542, r17, 7, 2; +and.b32 r543, r497, 508; +add.s32 r544, r494, r543; +cvt.rn.f32.u32 f79, r542; +mul.f32 f80, f79, 0f3F490FDB; +cos.approx.f32 f48, f80; +sin.approx.f32 f81, f80; +neg.f32 f49, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f49; +mov.b32 r383, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r383; +mov.b32 r386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r383; +mov.b32 r388, {high, high}; +} +{ +mul.f16x2 r390, r380, r388; +} +{ +fma.rn.f16x2 r393, r377, r386, r390; +} +{ +mul.f16x2 r397, r377, r388; +} +{ +neg.f16x2 r400, r397; +} +{ +fma.rn.f16x2 r402, r380, r386, r400; +} +barrier.sync 0; +and.b32 r545, r492, 3072; +add.s32 r546, r544, r545; +st.shared.u32 [r546], r371; +st.shared.u32 [r546+512], r393; +barrier.sync 0; +and.b32 r547, r497, 1536; +sub.s32 r548, r546, r547; +ld.shared.u32 r424, [r548]; +ld.shared.u32 r425, [r548+2048]; +barrier.sync 0; +st.shared.u32 [r546], r374; +st.shared.u32 [r546+512], r402; +barrier.sync 0; +ld.shared.u32 r427, [r548]; +ld.shared.u32 r428, [r548+2048]; +{ +add.f16x2 r423, r424, r425; +} +{ +add.f16x2 r426, r427, r428; +} +{ +sub.f16x2 r429, r424, r425; +} +{ +sub.f16x2 r432, r427, r428; +} +bfe.u32 r549, r17, 8, 1; +and.b32 r550, r497, 1020; +add.s32 r551, r494, r550; +cvt.rn.f32.u32 f82, r549; +mul.f32 f83, f82, 0f3FC90FDB; +cos.approx.f32 f54, f83; +sin.approx.f32 f84, f83; +neg.f32 f55, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f55; +mov.b32 r435, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r438, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r435; +mov.b32 r440, {high, high}; +} +{ +mul.f16x2 r442, r432, r440; +} +{ +fma.rn.f16x2 r445, r429, r438, r442; +} +{ +mul.f16x2 r449, r429, r440; +} +{ +neg.f16x2 r452, r449; +} +{ +fma.rn.f16x2 r454, r432, r438, r452; +} +barrier.sync 0; +and.b32 r552, r492, 2048; +add.s32 r553, r551, r552; +st.shared.u32 [r553], r423; +st.shared.u32 [r553+1024], r445; +barrier.sync 0; +and.b32 r554, r497, 1024; +sub.s32 r555, r553, r554; +ld.shared.u32 r476, [r555]; +ld.shared.u32 r477, [r555+2048]; +barrier.sync 0; +st.shared.u32 [r553], r426; +st.shared.u32 [r553+1024], r454; +barrier.sync 0; +ld.shared.u32 r479, [r555]; +ld.shared.u32 r480, [r555+2048]; +{ +add.f16x2 %0, r476, 
r477; +} +{ +add.f16x2 %1, r479, r480; +} +{ +sub.f16x2 %2, r476, r477; +} +{ +sub.f16x2 %3, r479, r480; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..7231893cd999c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_fwd.hpp.inc @@ -0,0 +1,6616 @@ +#ifndef CUFFTDX_FFT_1024_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_1024_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<83, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1872>; +.reg .b32 r<24>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1867, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1865, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1864, f1867, f1865; +sub.f32 f140, f1867, f1865; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1863, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1860, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1858, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1857, f1860, f1858; 
+sub.f32 f156, f1860, f1858; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1856, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1856, 0fBF3504F3; +mul.f32 f1855, f157, 0f3F3504F3; +sub.f32 f163, f1855, f162; +mul.f32 f164, f1856, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1854, f1864, f1857; +sub.f32 f173, f1864, f1857; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1853, f1863, f165; +sub.f32 f177, f1863, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1852, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1851, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1849, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1846, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1845, f1849, f1846; +sub.f32 f197, f1849, f1846; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1844, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1842, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1840, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1839, f1842, f1840; +sub.f32 f213, f1842, f1840; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1838, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1838, 0fBF3504F3; +mul.f32 f1837, f214, 0f3F3504F3; +sub.f32 f220, f1837, f219; +mul.f32 f221, f1838, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, 
f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1836, f1845, f1839; +sub.f32 f230, f1845, f1839; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1835, f1844, f222; +sub.f32 f234, f1844, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1834, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1833, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1831, f231, 0f3F6C835E; +mul.f32 f1832, f1835, 0fBEC3EF15; +sub.f32 f245, f1831, f1832; +mul.f32 f246, f1835, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1829, f235, 0f3F3504F3; +mul.f32 f1830, f1834, 0fBF3504F3; +sub.f32 f250, f1829, f1830; +mul.f32 f251, f1834, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1827, f239, 0f3EC3EF15; +mul.f32 f1828, f1833, 0fBF6C835E; +sub.f32 f255, f1827, f1828; +mul.f32 f256, f1833, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1825, f233, 0fBEC3EF15; +mul.f32 f1826, f234, 0fBF6C835E; +sub.f32 f260, f1825, f1826; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1823, f241, 0fBF6C835E; +mul.f32 f1824, f242, 0fBEC3EF15; +sub.f32 f269, f1823, f1824; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1822, f1854, f1836; +sub.f32 f275, f1854, f1836; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1821, f1853, f247; +sub.f32 f279, f1853, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1820, f1852, f252; +sub.f32 f283, f1852, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1819, f1851, f257; +sub.f32 f287, f1851, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1818, f173, f229; 
+add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1817, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1816, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1815, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1812, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1810, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1809, f1812, f1810; +sub.f32 f315, f1812, f1810; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1808, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1806, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1803, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1802, f1806, f1803; +sub.f32 f331, f1806, f1803; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1801, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1799, f332, 0f3F3504F3; +mul.f32 f1800, f1801, 0fBF3504F3; +sub.f32 f338, f1799, f1800; +mul.f32 f339, f1801, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1798, f1809, f1802; +sub.f32 f348, f1809, f1802; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1797, f1808, f340; +sub.f32 f352, f1808, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1796, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1795, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 
f363, %72, %104; +add.f32 f1793, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1791, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1790, f1793, f1791; +sub.f32 f372, f1793, f1791; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1789, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1786, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1785, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1784, f1786, f1785; +sub.f32 f388, f1786, f1785; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1783, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1781, f389, 0f3F3504F3; +mul.f32 f1782, f1783, 0fBF3504F3; +sub.f32 f395, f1781, f1782; +mul.f32 f396, f1783, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1780, f1790, f1784; +sub.f32 f405, f1790, f1784; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1779, f1789, f397; +sub.f32 f409, f1789, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1778, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1777, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1779, 0fBEC3EF15; +mul.f32 f1776, f406, 0f3F6C835E; +sub.f32 f420, f1776, f419; +mul.f32 f421, f1779, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1778, 0fBF3504F3; +mul.f32 f1775, f410, 0f3F3504F3; +sub.f32 f425, f1775, f424; +mul.f32 f426, f1778, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1773, f414, 0f3EC3EF15; +mul.f32 f1774, f1777, 0fBF6C835E; +sub.f32 f430, 
f1773, f1774; +mul.f32 f431, f1777, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1771, f408, 0fBEC3EF15; +mul.f32 f1772, f409, 0fBF6C835E; +sub.f32 f435, f1771, f1772; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1770, f416, 0fBF6C835E; +sub.f32 f444, f1770, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1769, f1798, f1780; +sub.f32 f450, f1798, f1780; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1768, f1797, f422; +sub.f32 f454, f1797, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1767, f1796, f427; +sub.f32 f458, f1796, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1766, f1795, f432; +sub.f32 f462, f1795, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1765, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1764, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1763, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1762, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1768, 0fBE47C5C2; +mul.f32 f1761, f451, 0f3F7B14BE; +sub.f32 f481, f1761, f480; +mul.f32 f482, f1768, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1767, 0fBEC3EF15; +mul.f32 f1760, f455, 0f3F6C835E; +sub.f32 f486, f1760, f485; +mul.f32 f487, f1767, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1766, 0fBF0E39DA; +mul.f32 f1759, f459, 0f3F54DB31; +sub.f32 f491, f1759, f490; +mul.f32 f492, f1766, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1765, 0fBF3504F3; +mul.f32 f1758, 
f463, 0f3F3504F3; +sub.f32 f496, f1758, f495; +mul.f32 f497, f1765, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1756, f467, 0f3F0E39DA; +mul.f32 f1757, f1764, 0fBF54DB31; +sub.f32 f501, f1756, f1757; +mul.f32 f502, f1764, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1754, f471, 0f3EC3EF15; +mul.f32 f1755, f1763, 0fBF6C835E; +sub.f32 f506, f1754, f1755; +mul.f32 f507, f1763, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1752, f475, 0f3E47C5C2; +mul.f32 f1753, f1762, 0fBF7B14BE; +sub.f32 f511, f1752, f1753; +mul.f32 f512, f1762, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1750, f453, 0fBE47C5C2; +mul.f32 f1751, f454, 0fBF7B14BE; +sub.f32 f516, f1750, f1751; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1749, f457, 0fBEC3EF15; +sub.f32 f521, f1749, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1748, f461, 0fBF0E39DA; +sub.f32 f526, f1748, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1746, f469, 0fBF54DB31; +mul.f32 f1747, f470, 0fBF0E39DA; +sub.f32 f535, f1746, f1747; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1745, f473, 0fBF6C835E; +sub.f32 f540, f1745, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1744, f477, 0fBF7B14BE; +sub.f32 f545, f1744, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1743, f1821, f483; +sub.f32 f553, f1821, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1742, f1820, f488; +sub.f32 f557, 
f1820, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1741, f1819, f493; +sub.f32 f561, f1819, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1740, f1818, f498; +sub.f32 f565, f1818, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f1739, f1817, f503; +sub.f32 f569, f1817, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f1738, f1816, f508; +sub.f32 f573, f1816, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f1737, f1815, f513; +sub.f32 f577, f1815, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f1736, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f1735, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f1734, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f1733, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f1732, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1731, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1730, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1729, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +and.b32 r14, r15, 31; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f1743; +mul.f32 f616, f610, f1743; +mul.f32 f618, f611, f611; +mul.f32 f1728, f610, f610; +sub.f32 f619, f1728, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f1742; +mul.f32 f624, f619, f1742; +mul.f32 f626, f611, f621; +mul.f32 
f1727, f610, f619; +sub.f32 f627, f1727, f626; +mul.f32 f1726, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f1741; +mul.f32 f632, f627, f1741; +mul.f32 f1724, f610, f627; +mul.f32 f1725, f611, f629; +sub.f32 f635, f1724, f1725; +mul.f32 f1723, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f1740; +mul.f32 f640, f635, f1740; +mul.f32 f642, f611, f637; +mul.f32 f1722, f610, f635; +sub.f32 f643, f1722, f642; +mul.f32 f1721, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f1739; +mul.f32 f648, f643, f1739; +mul.f32 f1719, f610, f643; +mul.f32 f1720, f611, f645; +sub.f32 f651, f1719, f1720; +mul.f32 f1718, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f1738; +mul.f32 f656, f651, f1738; +mul.f32 f658, f611, f653; +mul.f32 f1717, f610, f651; +sub.f32 f659, f1717, f658; +mul.f32 f1716, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f1737; +mul.f32 f664, f659, f1737; +mul.f32 f666, f611, f661; +mul.f32 f1715, f610, f659; +sub.f32 f667, f1715, f666; +mul.f32 f1714, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f1736; +mul.f32 f672, f667, f1736; +mul.f32 f1712, f610, f667; +mul.f32 f1713, f611, f669; +sub.f32 f675, f1712, f1713; +mul.f32 f1711, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f1735; +mul.f32 f680, f675, f1735; +mul.f32 f682, f611, f677; +mul.f32 f1710, f610, f675; +sub.f32 f683, f1710, f682; +mul.f32 f1709, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f1734; +mul.f32 f688, f683, f1734; +mul.f32 f690, f611, f685; +mul.f32 f1708, f610, f683; +sub.f32 f691, f1708, f690; +mul.f32 f1707, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, 
f693, f1733; +mul.f32 f696, f691, f1733; +mul.f32 f1705, f610, f691; +mul.f32 f1706, f611, f693; +sub.f32 f699, f1705, f1706; +mul.f32 f1704, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f1732; +mul.f32 f704, f699, f1732; +mul.f32 f706, f611, f701; +mul.f32 f1703, f610, f699; +sub.f32 f707, f1703, f706; +mul.f32 f1702, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f1731; +mul.f32 f712, f707, f1731; +mul.f32 f1700, f610, f707; +mul.f32 f1701, f611, f709; +sub.f32 f715, f1700, f1701; +mul.f32 f1699, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f1730; +mul.f32 f720, f715, f1730; +mul.f32 f722, f611, f717; +mul.f32 f1698, f610, f715; +sub.f32 f723, f1698, f722; +mul.f32 f1697, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f1729; +mul.f32 f728, f723, f1729; +mul.f32 f730, f611, f725; +mul.f32 f1696, f610, f723; +sub.f32 f731, f1696, f730; +mul.f32 f1695, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1694, f1822, f1769; +mul.f32 f735, f733, f1694; +mul.f32 f736, f731, f1694; +mul.f32 f1692, f610, f731; +mul.f32 f1693, f611, f733; +sub.f32 f739, f1692, f1693; +sub.f32 f1691, f272, f447; +mul.f32 f1690, f731, f1691; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1689, f610, f739; +sub.f32 f747, f1689, f746; +mul.f32 f1688, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1687, f610, f747; +sub.f32 f755, f1687, f754; +mul.f32 f1686, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f1684, f610, f755; +mul.f32 f1685, f611, f757; 
+sub.f32 f763, f1684, f1685; +mul.f32 f1683, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1682, f610, f763; +sub.f32 f771, f1682, f770; +mul.f32 f1681, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f1679, f610, f771; +mul.f32 f1680, f611, f773; +sub.f32 f779, f1679, f1680; +mul.f32 f1678, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1677, f610, f779; +sub.f32 f787, f1677, f786; +mul.f32 f1676, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1675, f610, f787; +sub.f32 f795, f1675, f794; +mul.f32 f1674, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f1672, f610, f795; +mul.f32 f1673, f611, f797; +sub.f32 f803, f1672, f1673; +mul.f32 f1671, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1670, f610, f803; +sub.f32 f811, f1670, f810; +mul.f32 f1669, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1668, f610, f811; +sub.f32 f819, f1668, f818; +mul.f32 f1667, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f1665, f610, f819; +mul.f32 f1666, f611, f821; +sub.f32 f827, f1665, f1666; +mul.f32 f1664, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, 
f597; +mul.f32 f834, f611, f829; +mul.f32 f1663, f610, f827; +sub.f32 f835, f1663, f834; +mul.f32 f1662, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f1660, f610, f835; +mul.f32 f1661, f611, f837; +sub.f32 f843, f1660, f1661; +mul.f32 f1659, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1658, f610, f843; +sub.f32 f851, f1658, f850; +mul.f32 f1657, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f1656, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r17, %tid.x; +shl.b32 r16, r17, 8; +barrier.sync 0; +and.b32 r11, r16, 7936; +add.s32 r12, r9, r11; +sub.f32 f1870, f1822, f1769; +mul.f32 f1869, f733, f1870; +add.f32 f857, f1822, f1769; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 31; +sub.f32 f1871, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 31; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 31; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f1657, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f1726, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f1723, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f1721, f639; +sub.f32 f867, f1718, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f1716, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f1714, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f1711, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f1709, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, 
f685, f586, f688; +sub.f32 f878, f1707, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f1704, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f1702, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f1699, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f1697, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f1695, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f1871, f736; +sub.f32 f890, f1690, f1869; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f1688, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f1686, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f1683, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f1681, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f1678, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f1676, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f1674, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f1671, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f1669, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f1667, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f1664, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f1662, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f1659, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f1656, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; 
+barrier.sync 0; +mad.lo.s32 r13, r22, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+256]; +ld.shared.v2.f32 {f929, f930}, [r13+512]; +ld.shared.v2.f32 {f933, f934}, [r13+768]; +ld.shared.v2.f32 {f937, f938}, [r13+1024]; +ld.shared.v2.f32 {f941, f942}, [r13+1280]; +ld.shared.v2.f32 {f945, f946}, [r13+1536]; +ld.shared.v2.f32 {f949, f950}, [r13+1792]; +ld.shared.v2.f32 {f953, f954}, [r13+2048]; +ld.shared.v2.f32 {f957, f958}, [r13+2304]; +ld.shared.v2.f32 {f961, f962}, [r13+2560]; +ld.shared.v2.f32 {f965, f966}, [r13+2816]; +ld.shared.v2.f32 {f969, f970}, [r13+3072]; +ld.shared.v2.f32 {f973, f974}, [r13+3328]; +ld.shared.v2.f32 {f977, f978}, [r13+3584]; +ld.shared.v2.f32 {f981, f982}, [r13+3840]; +ld.shared.v2.f32 {f985, f986}, [r13+4096]; +ld.shared.v2.f32 {f989, f990}, [r13+4352]; +ld.shared.v2.f32 {f993, f994}, [r13+4608]; +ld.shared.v2.f32 {f997, f998}, [r13+4864]; +ld.shared.v2.f32 {f1001, f1002}, [r13+5120]; +ld.shared.v2.f32 {f1005, f1006}, [r13+5376]; +ld.shared.v2.f32 {f1009, f1010}, [r13+5632]; +ld.shared.v2.f32 {f1013, f1014}, [r13+5888]; +ld.shared.v2.f32 {f1017, f1018}, [r13+6144]; +ld.shared.v2.f32 {f1021, f1022}, [r13+6400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+6656]; +ld.shared.v2.f32 {f1029, f1030}, [r13+6912]; +ld.shared.v2.f32 {f1033, f1034}, [r13+7168]; +ld.shared.v2.f32 {f1037, f1038}, [r13+7424]; +ld.shared.v2.f32 {f1041, f1042}, [r13+7680]; +ld.shared.v2.f32 {f1045, f1046}, [r13+7936]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1655, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1654, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1653, f1655, f1654; +sub.f32 f1060, f1655, f1654; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f1652, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; 
+add.f32 f1651, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1650, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1649, f1651, f1650; +sub.f32 f1076, f1651, f1650; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f1648, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f1648, 0fBF3504F3; +mul.f32 f1647, f1077, 0f3F3504F3; +sub.f32 f1083, f1647, f1082; +mul.f32 f1084, f1648, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f1646, f1653, f1649; +sub.f32 f1093, f1653, f1649; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f1645, f1652, f1085; +sub.f32 f1097, f1652, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f1644, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f1643, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f1642, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f1641, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f1640, f1642, f1641; +sub.f32 f1117, f1642, f1641; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f1639, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f1638, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f1637, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f1636, f1638, f1637; 
+sub.f32 f1133, f1638, f1637; +add.f32 f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f1635, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f1635, 0fBF3504F3; +mul.f32 f1634, f1134, 0f3F3504F3; +sub.f32 f1140, f1634, f1139; +mul.f32 f1141, f1635, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f1633, f1640, f1636; +sub.f32 f1150, f1640, f1636; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f1632, f1639, f1142; +sub.f32 f1154, f1639, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f1631, f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f1630, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f1628, f1151, 0f3F6C835E; +mul.f32 f1629, f1632, 0fBEC3EF15; +sub.f32 f1165, f1628, f1629; +mul.f32 f1166, f1632, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f1626, f1155, 0f3F3504F3; +mul.f32 f1627, f1631, 0fBF3504F3; +sub.f32 f1170, f1626, f1627; +mul.f32 f1171, f1631, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f1630, 0fBF6C835E; +mul.f32 f1625, f1159, 0f3EC3EF15; +sub.f32 f1175, f1625, f1174; +mul.f32 f1176, f1630, 0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f1624, f1153, 0fBEC3EF15; +sub.f32 f1180, f1624, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f1622, f1161, 0fBF6C835E; +mul.f32 f1623, f1162, 0fBEC3EF15; +sub.f32 f1189, f1622, f1623; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190; +add.f32 f1192, f1090, f1147; 
+sub.f32 f1194, f1090, f1147; +add.f32 f1621, f1646, f1633; +sub.f32 f1195, f1646, f1633; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f1620, f1645, f1167; +sub.f32 f1199, f1645, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f1619, f1644, f1172; +sub.f32 f1203, f1644, f1172; +add.f32 f1204, f1102, f1175; +sub.f32 f1206, f1102, f1175; +add.f32 f1618, f1643, f1177; +sub.f32 f1207, f1643, f1177; +add.f32 f1208, f1092, f1150; +sub.f32 f1210, f1092, f1150; +sub.f32 f1617, f1093, f1149; +add.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1180; +sub.f32 f1214, f1096, f1180; +add.f32 f1616, f1097, f1182; +sub.f32 f1215, f1097, f1182; +add.f32 f1216, f1100, f1185; +sub.f32 f1218, f1100, f1185; +add.f32 f1615, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f1614, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f1613, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f1612, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f1611, f1613, f1612; +sub.f32 f1235, f1613, f1612; +add.f32 f1236, f1226, f1231; +sub.f32 f1238, f1226, f1231; +sub.f32 f1610, f1227, f1230; +add.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f1609, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f1608, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f1607, f1609, f1608; +sub.f32 f1251, f1609, f1608; +add.f32 f1252, f1242, f1247; +sub.f32 f1254, f1242, f1247; +sub.f32 f1606, f1243, f1246; +add.f32 f1255, f1243, f1246; +mul.f32 f1257, f1606, 0fBF3504F3; +mul.f32 f1605, f1252, 0f3F3504F3; +sub.f32 f1258, f1605, f1257; +mul.f32 f1259, f1606, 
0f3F3504F3; +fma.rn.f32 f1260, f1252, 0fBF3504F3, f1259; +mul.f32 f1261, f1254, 0fBF3504F3; +mul.f32 f1262, f1255, 0fBF3504F3; +sub.f32 f1263, f1261, f1262; +add.f32 f1264, f1261, f1262; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f1604, f1611, f1607; +sub.f32 f1268, f1611, f1607; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f1603, f1610, f1260; +sub.f32 f1272, f1610, f1260; +add.f32 f1273, f1234, f1251; +sub.f32 f1275, f1234, f1251; +sub.f32 f1602, f1235, f1250; +add.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1263; +sub.f32 f1279, f1238, f1263; +add.f32 f1601, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f1600, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f1599, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f1598, f1600, f1599; +sub.f32 f1292, f1600, f1599; +add.f32 f1293, f1283, f1288; +sub.f32 f1295, f1283, f1288; +sub.f32 f1597, f1284, f1287; +add.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f1596, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f1595, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f1594, f1596, f1595; +sub.f32 f1308, f1596, f1595; +add.f32 f1309, f1299, f1304; +sub.f32 f1311, f1299, f1304; +sub.f32 f1593, f1300, f1303; +add.f32 f1312, f1300, f1303; +mul.f32 f1314, f1593, 0fBF3504F3; +mul.f32 f1592, f1309, 0f3F3504F3; +sub.f32 f1315, f1592, f1314; +mul.f32 f1316, f1593, 0f3F3504F3; +fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316; +mul.f32 f1318, f1311, 0fBF3504F3; +mul.f32 f1319, f1312, 0fBF3504F3; +sub.f32 f1320, f1318, f1319; +add.f32 f1321, f1318, f1319; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f1591, f1598, 
f1594; +sub.f32 f1325, f1598, f1594; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f1590, f1597, f1317; +sub.f32 f1329, f1597, f1317; +add.f32 f1330, f1291, f1308; +sub.f32 f1332, f1291, f1308; +sub.f32 f1589, f1292, f1307; +add.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1320; +sub.f32 f1336, f1295, f1320; +add.f32 f1588, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f1586, f1326, 0f3F6C835E; +mul.f32 f1587, f1590, 0fBEC3EF15; +sub.f32 f1340, f1586, f1587; +mul.f32 f1341, f1590, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341; +mul.f32 f1584, f1330, 0f3F3504F3; +mul.f32 f1585, f1589, 0fBF3504F3; +sub.f32 f1345, f1584, f1585; +mul.f32 f1346, f1589, 0f3F3504F3; +fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346; +mul.f32 f1582, f1334, 0f3EC3EF15; +mul.f32 f1583, f1588, 0fBF6C835E; +sub.f32 f1350, f1582, f1583; +mul.f32 f1351, f1588, 0f3EC3EF15; +fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351; +mul.f32 f1580, f1328, 0fBEC3EF15; +mul.f32 f1581, f1329, 0fBF6C835E; +sub.f32 f1355, f1580, f1581; +mul.f32 f1356, f1329, 0fBEC3EF15; +fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356; +mul.f32 f1358, f1332, 0fBF3504F3; +mul.f32 f1359, f1333, 0fBF3504F3; +sub.f32 f1360, f1358, f1359; +add.f32 f1361, f1358, f1359; +mul.f32 f1578, f1336, 0fBF6C835E; +mul.f32 f1579, f1337, 0fBEC3EF15; +sub.f32 f1364, f1578, f1579; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f1577, f1604, f1591; +sub.f32 f1370, f1604, f1591; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f1576, f1603, f1342; +sub.f32 f1374, f1603, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f1575, f1602, f1347; +sub.f32 f1378, f1602, f1347; +add.f32 f1379, f1277, f1350; +sub.f32 f1381, f1277, f1350; +add.f32 f1574, f1601, f1352; +sub.f32 f1382, f1601, f1352; +add.f32 f1383, f1267, f1325; +sub.f32 f1385, f1267, f1325; +sub.f32 f1573, f1268, f1324; 
+add.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1355; +sub.f32 f1389, f1271, f1355; +add.f32 f1572, f1272, f1357; +sub.f32 f1390, f1272, f1357; +add.f32 f1391, f1275, f1360; +sub.f32 f1393, f1275, f1360; +add.f32 f1571, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f1570, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f1576, 0fBE47C5C2; +mul.f32 f1569, f1371, 0f3F7B14BE; +sub.f32 f1401, f1569, f1400; +mul.f32 f1402, f1576, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402; +mul.f32 f1405, f1575, 0fBEC3EF15; +mul.f32 f1568, f1375, 0f3F6C835E; +sub.f32 f1406, f1568, f1405; +mul.f32 f1407, f1575, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407; +mul.f32 f1566, f1379, 0f3F54DB31; +mul.f32 f1567, f1574, 0fBF0E39DA; +sub.f32 f1411, f1566, f1567; +mul.f32 f1412, f1574, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412; +mul.f32 f1564, f1383, 0f3F3504F3; +mul.f32 f1565, f1573, 0fBF3504F3; +sub.f32 f1416, f1564, f1565; +mul.f32 f1417, f1573, 0f3F3504F3; +fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417; +mul.f32 f1562, f1387, 0f3F0E39DA; +mul.f32 f1563, f1572, 0fBF54DB31; +sub.f32 f1421, f1562, f1563; +mul.f32 f1422, f1572, 0f3F0E39DA; +fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422; +mul.f32 f1560, f1391, 0f3EC3EF15; +mul.f32 f1561, f1571, 0fBF6C835E; +sub.f32 f1426, f1560, f1561; +mul.f32 f1427, f1571, 0f3EC3EF15; +fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427; +mul.f32 f1430, f1570, 0fBF7B14BE; +mul.f32 f1559, f1395, 0f3E47C5C2; +sub.f32 f1431, f1559, f1430; +mul.f32 f1432, f1570, 0f3E47C5C2; +fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432; +mul.f32 f1435, f1374, 0fBF7B14BE; +mul.f32 f1558, f1373, 0fBE47C5C2; +sub.f32 f1436, f1558, f1435; +mul.f32 f1437, f1374, 0fBE47C5C2; +fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437; +mul.f32 f1440, f1378, 0fBF6C835E; +mul.f32 f1557, f1377, 0fBEC3EF15; +sub.f32 f1441, f1557, f1440; +mul.f32 f1442, f1378, 0fBEC3EF15; +fma.rn.f32 f1443, f1377, 
0fBF6C835E, f1442; +mul.f32 f1445, f1382, 0fBF54DB31; +mul.f32 f1556, f1381, 0fBF0E39DA; +sub.f32 f1446, f1556, f1445; +mul.f32 f1447, f1382, 0fBF0E39DA; +fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447; +mul.f32 f1449, f1385, 0fBF3504F3; +mul.f32 f1450, f1386, 0fBF3504F3; +sub.f32 f1451, f1449, f1450; +add.f32 f1452, f1449, f1450; +mul.f32 f1454, f1390, 0fBF0E39DA; +mul.f32 f1555, f1389, 0fBF54DB31; +sub.f32 f1455, f1555, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456; +mul.f32 f1459, f1394, 0fBEC3EF15; +mul.f32 f1554, f1393, 0fBF6C835E; +sub.f32 f1460, f1554, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461; +mul.f32 f1464, f1398, 0fBE47C5C2; +mul.f32 f1553, f1397, 0fBF7B14BE; +sub.f32 f1465, f1553, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466; +add.f32 %0, f1192, f1367; +add.f32 %1, f1621, f1577; +add.f32 %2, f1196, f1401; +add.f32 %3, f1620, f1403; +add.f32 %4, f1200, f1406; +add.f32 %5, f1619, f1408; +add.f32 %6, f1204, f1411; +add.f32 %7, f1618, f1413; +add.f32 %9, f1617, f1418; +add.f32 %8, f1208, f1416; +add.f32 %11, f1616, f1423; +add.f32 %10, f1212, f1421; +add.f32 %12, f1216, f1426; +add.f32 %13, f1615, f1428; +add.f32 %14, f1220, f1431; +add.f32 %15, f1614, f1433; +add.f32 %16, f1194, f1370; +sub.f32 %17, f1195, f1369; +add.f32 %18, f1198, f1436; +add.f32 %19, f1199, f1438; +add.f32 %21, f1203, f1443; +add.f32 %20, f1202, f1441; +add.f32 %23, f1207, f1448; +add.f32 %22, f1206, f1446; +add.f32 %25, f1211, f1452; +add.f32 %24, f1210, f1451; +add.f32 %26, f1214, f1455; +add.f32 %27, f1215, f1457; +add.f32 %28, f1218, f1460; +add.f32 %29, f1219, f1462; +add.f32 %30, f1222, f1465; +add.f32 %31, f1223, f1467; +sub.f32 %33, f1621, f1577; +sub.f32 %32, f1192, f1367; +sub.f32 %35, f1620, f1403; +sub.f32 %34, f1196, f1401; +sub.f32 %37, f1619, f1408; +sub.f32 %36, f1200, f1406; +sub.f32 %39, f1618, f1413; +sub.f32 %38, f1204, f1411; +sub.f32 %41, f1617, 
f1418; +sub.f32 %40, f1208, f1416; +sub.f32 %43, f1616, f1423; +sub.f32 %42, f1212, f1421; +sub.f32 %45, f1615, f1428; +sub.f32 %44, f1216, f1426; +sub.f32 %47, f1614, f1433; +sub.f32 %46, f1220, f1431; +add.f32 %49, f1195, f1369; +sub.f32 %48, f1194, f1370; +sub.f32 %51, f1199, f1438; +sub.f32 %50, f1198, f1436; +sub.f32 %53, f1203, f1443; +sub.f32 %52, f1202, f1441; +sub.f32 %55, f1207, f1448; +sub.f32 %54, f1206, f1446; +sub.f32 %57, f1211, f1452; +sub.f32 %56, f1210, f1451; +sub.f32 %59, f1215, f1457; +sub.f32 %58, f1214, f1455; +sub.f32 %61, f1219, f1462; +sub.f32 %60, f1218, f1460; +sub.f32 %63, f1223, f1467; +sub.f32 %62, f1222, f1465; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), 
"f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<81, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<841>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, 
%61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; 
+mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; 
+sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 
f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, 
f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4032; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+256]; +ld.shared.f32 f391, [r13+512]; +ld.shared.f32 f392, [r13+768]; +ld.shared.f32 f393, [r13+1024]; +ld.shared.f32 f394, [r13+1280]; +ld.shared.f32 f395, [r13+1536]; +ld.shared.f32 f396, [r13+1792]; +ld.shared.f32 f397, [r13+2048]; +ld.shared.f32 f398, [r13+2304]; +ld.shared.f32 f399, [r13+2560]; +ld.shared.f32 f400, [r13+2816]; +ld.shared.f32 f401, [r13+3072]; +ld.shared.f32 f402, [r13+3328]; +ld.shared.f32 f403, [r13+3584]; +ld.shared.f32 f404, [r13+3840]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+256]; +ld.shared.f32 f407, [r13+512]; +ld.shared.f32 
f408, [r13+768]; +ld.shared.f32 f409, [r13+1024]; +ld.shared.f32 f410, [r13+1280]; +ld.shared.f32 f411, [r13+1536]; +ld.shared.f32 f412, [r13+1792]; +ld.shared.f32 f413, [r13+2048]; +ld.shared.f32 f414, [r13+2304]; +ld.shared.f32 f415, [r13+2560]; +ld.shared.f32 f416, [r13+2816]; +ld.shared.f32 f417, [r13+3072]; +ld.shared.f32 f418, [r13+3328]; +ld.shared.f32 f419, [r13+3584]; +ld.shared.f32 f420, [r13+3840]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; 
+add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 
f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; +add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; +add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 48; +bfe.u32 r15, r5, 4, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; 
+mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; +mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, 
f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; +mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3072; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], 
f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+256]; +ld.shared.f32 f747, [r21+512]; +ld.shared.f32 f748, [r21+768]; +ld.shared.f32 f749, [r21+1024]; +ld.shared.f32 f750, [r21+1280]; +ld.shared.f32 f751, [r21+1536]; +ld.shared.f32 f752, [r21+1792]; +ld.shared.f32 f753, [r21+2048]; +ld.shared.f32 f754, [r21+2304]; +ld.shared.f32 f755, [r21+2560]; +ld.shared.f32 f756, [r21+2816]; +ld.shared.f32 f757, [r21+3072]; +ld.shared.f32 f758, [r21+3328]; +ld.shared.f32 f759, [r21+3584]; +ld.shared.f32 f760, [r21+3840]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+256]; +ld.shared.f32 f763, [r21+512]; +ld.shared.f32 f764, [r21+768]; +ld.shared.f32 f765, [r21+1024]; +ld.shared.f32 f766, [r21+1280]; +ld.shared.f32 f767, [r21+1536]; +ld.shared.f32 f768, [r21+1792]; +ld.shared.f32 f769, [r21+2048]; +ld.shared.f32 f770, [r21+2304]; +ld.shared.f32 f771, [r21+2560]; +ld.shared.f32 f772, [r21+2816]; +ld.shared.f32 f773, [r21+3072]; +ld.shared.f32 f774, [r21+3328]; +ld.shared.f32 f775, [r21+3584]; 
+ld.shared.f32 f776, [r21+3840]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f746, f754; +add.f32 f786, f762, f770; +sub.f32 f787, f746, f754; +sub.f32 f788, f762, f770; +add.f32 f789, f750, f758; +add.f32 f790, f766, f774; +sub.f32 f791, f750, f758; +sub.f32 f792, f766, f774; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f748, f756; +add.f32 f802, f764, f772; +sub.f32 f803, f748, f756; +sub.f32 f804, f764, f772; +add.f32 f805, f752, f760; +add.f32 f806, f768, f776; +sub.f32 f807, f752, f760; +sub.f32 f808, f768, f776; +add.f32 %0, f777, f781; +add.f32 %1, f778, f782; +add.f32 %2, f785, f789; +add.f32 %3, f786, f790; +add.f32 %4, f793, f797; +add.f32 %5, f794, f798; +add.f32 %6, f801, f805; +add.f32 %7, f802, f806; +sub.f32 %9, f780, f783; +add.f32 %8, f779, f784; +sub.f32 %11, f788, f791; +add.f32 %10, f787, f792; +sub.f32 %13, f796, f799; +add.f32 %12, f795, f800; +sub.f32 %15, f804, f807; +add.f32 %14, f803, f808; +sub.f32 %16, f777, f781; +sub.f32 %17, f778, f782; +sub.f32 %18, f785, f789; +sub.f32 %19, f786, f790; +sub.f32 %20, f793, f797; +sub.f32 %21, f794, f798; +sub.f32 %22, f801, f805; +sub.f32 %23, f802, f806; +add.f32 %25, f780, f783; +sub.f32 %24, f779, f784; +add.f32 %27, f788, f791; +sub.f32 %26, f787, f792; +add.f32 %29, f796, f799; +sub.f32 %28, f795, f800; +add.f32 %31, f804, f807; +sub.f32 %30, f803, f808; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), 
"=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +/* Specialization <84, float, 1>: one inline-PTX block that rewrites the eight complex values rmem[0..7] in place (16 float outputs; the same elements are also passed as inputs). Visible structure: three passes, each an 8-input add/sub butterfly using the constant 0f3F3504F3 (about 0.7071), after which results 1..7 are multiplied by successive powers of a single complex factor loaded from lut_sp_8_1024, lut_sp_8_128 and lut_sp_8_16 respectively (table offset derived from %tid.x), followed by an exchange through shared memory that is done separately for the real and the imaginary parts and is bracketed by barrier.sync 0; a final pairwise add/sub step produces the outputs. Shared-memory addresses are built from smem + (%tid.y << 12) + ((%tid.x << 5) & -4096) plus per-pass offsets. NOTE(review): presumably a 1024-point single-precision FFT (8 x 8 x 8 x 2) spread over 128 threads, judging by the table names and the %tid.x masks - confirm against the generator. NOTE(review): barrier.sync 0 presumably requires every thread of the block to execute this function together - confirm at call sites. */ template<> __forceinline__ __device__ void cufftdx_private_function<84, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<475>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53;
+sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90,
f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4064; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+512]; +ld.shared.f32 f161, [r13+1024]; +ld.shared.f32 f162, [r13+1536]; +ld.shared.f32 f163, [r13+2048]; +ld.shared.f32 f164, [r13+2560]; +ld.shared.f32 f165, [r13+3072]; +ld.shared.f32 f166, [r13+3584]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+512]; +ld.shared.f32 f169, [r13+1024]; +ld.shared.f32 f170, [r13+1536]; +ld.shared.f32 f171, [r13+2048]; +ld.shared.f32 f172, [r13+2560]; +ld.shared.f32 f173, [r13+3072]; +ld.shared.f32 f174, [r13+3584]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180;
+add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 120; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32
f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 3840; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+512]; +ld.shared.f32 f303, [r20+1024]; +ld.shared.f32 f304, [r20+1536]; +ld.shared.f32 f305, [r20+2048]; +ld.shared.f32 f306, [r20+2560]; +ld.shared.f32 f307, [r20+3072]; +ld.shared.f32 f308, [r20+3584]; +barrier.sync 0; +st.shared.f32 [r19], f217;
+st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+512]; +ld.shared.f32 f311, [r20+1024]; +ld.shared.f32 f312, [r20+1536]; +ld.shared.f32 f313, [r20+2048]; +ld.shared.f32 f314, [r20+2560]; +ld.shared.f32 f315, [r20+3072]; +ld.shared.f32 f316, [r20+3584]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +add.f32 f329, f319, f324; +sub.f32 f330, f320, f323; +sub.f32 f331, f319, f324; +add.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +add.f32 f345, f335, f340; +sub.f32 f346, f336, f339; +sub.f32 f347, f335, f340; +add.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0fBF3504F3; +sub.f32 f351, f349, f350; +mul.f32 f352, f346, 0f3F3504F3; +fma.rn.f32 f353, f345, 0fBF3504F3, f352; +mul.f32 f354, f347, 0fBF3504F3; +mul.f32 f355, f348, 0fBF3504F3; +sub.f32 f356, f354, f355; +add.f32 f357, f354, f355; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f353; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f353; +add.f32 f366, f327, f344; +sub.f32 f367, f328, f343; +sub.f32 f368, f327, f344; +add.f32 f369, f328, f343; +add.f32 f370, f331, f356;
+add.f32 f371, f332, f357; +sub.f32 f372, f331, f356; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 64; +bfe.u32 r22, r5, 6, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f374, f362; +mul.f32 f379, f375, f363; +sub.f32 f380, f378, f379; +mul.f32 f381, f374, f363; +fma.rn.f32 f382, f375, f362, f381; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f385, f366; +mul.f32 f389, f387, f367; +sub.f32 f390, f388, f389; +mul.f32 f391, f385, f367; +fma.rn.f32 f392, f387, f366, f391; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f395, f370; +mul.f32 f399, f397, f371; +sub.f32 f400, f398, f399; +mul.f32 f401, f395, f371; +fma.rn.f32 f402, f397, f370, f401; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f405, f360; +mul.f32 f409, f407, f361; +sub.f32 f410, f408, f409; +mul.f32 f411, f405, f361; +fma.rn.f32 f412, f407, f360, f411; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f415, f364; +mul.f32 f419, f417, f365; +sub.f32 f420, f418, f419; +mul.f32 f421, f415, f365; +fma.rn.f32 f422, f417, f364, f421; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f425, f368; +mul.f32 f429, f427, f369; +sub.f32 f430, f428, f429; +mul.f32 f431, f425, f369; +fma.rn.f32 f432, f427, f368, f431; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f435, f372;
+mul.f32 f439, f437, f373; +sub.f32 f440, f438, f439; +mul.f32 f441, f435, f373; +fma.rn.f32 f442, f437, f372, f441; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 2048; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f380; +st.shared.f32 [r26+512], f390; +st.shared.f32 [r26+768], f400; +st.shared.f32 [r26+1024], f410; +st.shared.f32 [r26+1280], f420; +st.shared.f32 [r26+1536], f430; +st.shared.f32 [r26+1792], f440; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+512]; +ld.shared.f32 f445, [r27+1024]; +ld.shared.f32 f446, [r27+1536]; +ld.shared.f32 f447, [r27+2048]; +ld.shared.f32 f448, [r27+2560]; +ld.shared.f32 f449, [r27+3072]; +ld.shared.f32 f450, [r27+3584]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+512]; +ld.shared.f32 f453, [r27+1024]; +ld.shared.f32 f454, [r27+1536]; +ld.shared.f32 f455, [r27+2048]; +ld.shared.f32 f456, [r27+2560]; +ld.shared.f32 f457, [r27+3072]; +ld.shared.f32 f458, [r27+3584]; +add.f32 %0, f443, f447; +add.f32 %1, f451, f455; +add.f32 %2, f444, f448; +add.f32 %3, f452, f456; +add.f32 %4, f445, f449; +add.f32 %5, f453, f457; +add.f32 %6, f446, f450; +add.f32 %7, f454, f458; +sub.f32 %8, f443, f447; +sub.f32 %9, f451, f455; +sub.f32 %10, f444, f448; +sub.f32 %11, f452, f456; +sub.f32 %12, f445, f449; +sub.f32 %13, f453, f457; +sub.f32 %14, f446, f450; +sub.f32 %15, f454, f458; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y),
"=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<85, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1821>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1819, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1817, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1816, f1819, f1817; +sub.f32 f140, f1819, f1817; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1815, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1812, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1810, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1809, f1812, f1810; +sub.f32 f156, f1812, f1810; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1808, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1808, 0fBF3504F3; +mul.f32 f1807, f157, 0f3F3504F3; +sub.f32 f163, f1807, f162; +mul.f32 f164, f1808, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1806, f1816, f1809; +sub.f32 f173, f1816, f1809;
+add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1805, f1815, f165; +sub.f32 f177, f1815, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1804, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1803, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1801, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1798, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1797, f1801, f1798; +sub.f32 f197, f1801, f1798; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1796, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1794, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1792, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1791, f1794, f1792; +sub.f32 f213, f1794, f1792; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1790, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1790, 0fBF3504F3; +mul.f32 f1789, f214, 0f3F3504F3; +sub.f32 f220, f1789, f219; +mul.f32 f221, f1790, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1788, f1797, f1791; +sub.f32 f230, f1797, f1791; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1787, f1796, f222; +sub.f32 f234, f1796, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1786, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1785, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1783, f231, 0f3F6C835E; +mul.f32 f1784, f1787, 
0fBEC3EF15; +sub.f32 f245, f1783, f1784; +mul.f32 f246, f1787, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1781, f235, 0f3F3504F3; +mul.f32 f1782, f1786, 0fBF3504F3; +sub.f32 f250, f1781, f1782; +mul.f32 f251, f1786, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1779, f239, 0f3EC3EF15; +mul.f32 f1780, f1785, 0fBF6C835E; +sub.f32 f255, f1779, f1780; +mul.f32 f256, f1785, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1777, f233, 0fBEC3EF15; +mul.f32 f1778, f234, 0fBF6C835E; +sub.f32 f260, f1777, f1778; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1775, f241, 0fBF6C835E; +mul.f32 f1776, f242, 0fBEC3EF15; +sub.f32 f269, f1775, f1776; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1774, f1806, f1788; +sub.f32 f275, f1806, f1788; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1773, f1805, f247; +sub.f32 f279, f1805, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1772, f1804, f252; +sub.f32 f283, f1804, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1771, f1803, f257; +sub.f32 f287, f1803, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1770, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1769, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1768, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1767, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1764, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1762, %85, %142; 
+sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1761, f1764, f1762; +sub.f32 f315, f1764, f1762; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1760, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1758, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1755, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1754, f1758, f1755; +sub.f32 f331, f1758, f1755; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1753, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1751, f332, 0f3F3504F3; +mul.f32 f1752, f1753, 0fBF3504F3; +sub.f32 f338, f1751, f1752; +mul.f32 f339, f1753, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1750, f1761, f1754; +sub.f32 f348, f1761, f1754; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1749, f1760, f340; +sub.f32 f352, f1760, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1748, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1747, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1745, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1743, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1742, f1745, f1743; +sub.f32 f372, f1745, f1743; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1741, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1738, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 
f383, %96, %128; +add.f32 f1737, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1736, f1738, f1737; +sub.f32 f388, f1738, f1737; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1735, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1733, f389, 0f3F3504F3; +mul.f32 f1734, f1735, 0fBF3504F3; +sub.f32 f395, f1733, f1734; +mul.f32 f396, f1735, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1732, f1742, f1736; +sub.f32 f405, f1742, f1736; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1731, f1741, f397; +sub.f32 f409, f1741, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1730, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1729, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1731, 0fBEC3EF15; +mul.f32 f1728, f406, 0f3F6C835E; +sub.f32 f420, f1728, f419; +mul.f32 f421, f1731, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1730, 0fBF3504F3; +mul.f32 f1727, f410, 0f3F3504F3; +sub.f32 f425, f1727, f424; +mul.f32 f426, f1730, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1725, f414, 0f3EC3EF15; +mul.f32 f1726, f1729, 0fBF6C835E; +sub.f32 f430, f1725, f1726; +mul.f32 f431, f1729, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1723, f408, 0fBEC3EF15; +mul.f32 f1724, f409, 0fBF6C835E; +sub.f32 f435, f1723, f1724; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1722, f416, 0fBF6C835E; +sub.f32 f444, f1722, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 
0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1721, f1750, f1732; +sub.f32 f450, f1750, f1732; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1720, f1749, f422; +sub.f32 f454, f1749, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1719, f1748, f427; +sub.f32 f458, f1748, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1718, f1747, f432; +sub.f32 f462, f1747, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1717, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1716, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1715, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1714, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1720, 0fBE47C5C2; +mul.f32 f1713, f451, 0f3F7B14BE; +sub.f32 f481, f1713, f480; +mul.f32 f482, f1720, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1719, 0fBEC3EF15; +mul.f32 f1712, f455, 0f3F6C835E; +sub.f32 f486, f1712, f485; +mul.f32 f487, f1719, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1718, 0fBF0E39DA; +mul.f32 f1711, f459, 0f3F54DB31; +sub.f32 f491, f1711, f490; +mul.f32 f492, f1718, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1717, 0fBF3504F3; +mul.f32 f1710, f463, 0f3F3504F3; +sub.f32 f496, f1710, f495; +mul.f32 f497, f1717, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1708, f467, 0f3F0E39DA; +mul.f32 f1709, f1716, 0fBF54DB31; +sub.f32 f501, f1708, f1709; +mul.f32 f502, f1716, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1706, f471, 0f3EC3EF15; +mul.f32 f1707, f1715, 0fBF6C835E; +sub.f32 f506, f1706, f1707; +mul.f32 f507, f1715, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1704, f475, 0f3E47C5C2; +mul.f32 f1705, f1714, 
0fBF7B14BE; +sub.f32 f511, f1704, f1705; +mul.f32 f512, f1714, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1702, f453, 0fBE47C5C2; +mul.f32 f1703, f454, 0fBF7B14BE; +sub.f32 f516, f1702, f1703; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1701, f457, 0fBEC3EF15; +sub.f32 f521, f1701, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1700, f461, 0fBF0E39DA; +sub.f32 f526, f1700, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1698, f469, 0fBF54DB31; +mul.f32 f1699, f470, 0fBF0E39DA; +sub.f32 f535, f1698, f1699; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1697, f473, 0fBF6C835E; +sub.f32 f540, f1697, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1696, f477, 0fBF7B14BE; +sub.f32 f545, f1696, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1695, f1774, f1721; +sub.f32 f551, f1774, f1721; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1694, f1773, f483; +sub.f32 f555, f1773, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1693, f1772, f488; +sub.f32 f559, f1772, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1692, f1771, f493; +sub.f32 f563, f1771, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1691, f1770, f498; +sub.f32 f567, f1770, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f1690, f1769, f503; +sub.f32 f571, f1769, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f1689, 
f1768, f508; +sub.f32 f575, f1768, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f1688, f1767, f513; +sub.f32 f579, f1767, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f1687, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f1686, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f1685, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f1684, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f1683, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1682, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1681, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1680, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f1694; +mul.f32 f1679, f612, f552; +sub.f32 f618, f1679, f617; +mul.f32 f619, f612, f1694; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f1677, f612, f612; +mul.f32 f1678, f613, f613; +sub.f32 f623, f1677, f1678; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f1675, f623, f556; +mul.f32 f1676, f625, f1693; +sub.f32 f628, f1675, f1676; +mul.f32 f629, f623, f1693; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f1673, f612, f623; +mul.f32 f1674, f613, f625; +sub.f32 f633, f1673, f1674; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f1671, f633, f560; +mul.f32 f1672, f635, f1692; +sub.f32 f638, f1671, f1672; +mul.f32 f639, f633, f1692; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f1670, f612, f633; 
+sub.f32 f643, f1670, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f1691; +mul.f32 f1669, f643, f564; +sub.f32 f648, f1669, f647; +mul.f32 f649, f643, f1691; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f1668, f612, f643; +sub.f32 f653, f1668, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f1690; +mul.f32 f1667, f653, f568; +sub.f32 f658, f1667, f657; +mul.f32 f659, f653, f1690; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f1666, f612, f653; +sub.f32 f663, f1666, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f1664, f663, f572; +mul.f32 f1665, f665, f1689; +sub.f32 f668, f1664, f1665; +mul.f32 f669, f663, f1689; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f1662, f612, f663; +mul.f32 f1663, f613, f665; +sub.f32 f673, f1662, f1663; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f1660, f673, f576; +mul.f32 f1661, f675, f1688; +sub.f32 f678, f1660, f1661; +mul.f32 f679, f673, f1688; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f1658, f612, f673; +mul.f32 f1659, f613, f675; +sub.f32 f683, f1658, f1659; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f1687; +mul.f32 f1657, f683, f580; +sub.f32 f688, f1657, f687; +mul.f32 f689, f683, f1687; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f1656, f612, f683; +sub.f32 f693, f1656, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f1686; +mul.f32 f1655, f693, f584; +sub.f32 f698, f1655, f697; +mul.f32 f699, f693, f1686; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f1654, f612, f693; +sub.f32 f703, f1654, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f1685; +mul.f32 f1653, f703, f588; +sub.f32 f708, f1653, f707; +mul.f32 f709, f703, f1685; +fma.rn.f32 f710, f705, 
f588, f709; +mul.f32 f1651, f612, f703; +mul.f32 f1652, f613, f705; +sub.f32 f713, f1651, f1652; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f1649, f713, f592; +mul.f32 f1650, f715, f1684; +sub.f32 f718, f1649, f1650; +mul.f32 f719, f713, f1684; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f1647, f612, f713; +mul.f32 f1648, f613, f715; +sub.f32 f723, f1647, f1648; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f1645, f723, f596; +mul.f32 f1646, f725, f1683; +sub.f32 f728, f1645, f1646; +mul.f32 f729, f723, f1683; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f1644, f612, f723; +sub.f32 f733, f1644, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f1682; +mul.f32 f1643, f733, f600; +sub.f32 f738, f1643, f737; +mul.f32 f739, f733, f1682; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f1642, f612, f733; +sub.f32 f743, f1642, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f1681; +mul.f32 f1641, f743, f604; +sub.f32 f748, f1641, f747; +mul.f32 f749, f743, f1681; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f1640, f612, f743; +sub.f32 f753, f1640, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f1680; +mul.f32 f1639, f753, f608; +sub.f32 f758, f1639, f757; +mul.f32 f759, f753, f1680; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f1637, f612, f753; +mul.f32 f1638, f613, f755; +sub.f32 f763, f1637, f1638; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f1635, f763, f550; +mul.f32 f1636, f765, f551; +sub.f32 f768, f1635, f1636; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f1633, f612, f763; +mul.f32 f1634, f613, f765; +sub.f32 f773, f1633, f1634; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f1632, f773, f554; +sub.f32 
f778, f1632, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f1631, f612, f773; +sub.f32 f783, f1631, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f1630, f783, f558; +sub.f32 f788, f1630, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f1629, f612, f783; +sub.f32 f793, f1629, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f1628, f793, f562; +sub.f32 f798, f1628, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f1627, f612, f793; +sub.f32 f803, f1627, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f1625, f803, f566; +mul.f32 f1626, f805, f567; +sub.f32 f808, f1625, f1626; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f1623, f612, f803; +mul.f32 f1624, f613, f805; +sub.f32 f813, f1623, f1624; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f1621, f813, f570; +mul.f32 f1622, f815, f571; +sub.f32 f818, f1621, f1622; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f1619, f612, f813; +mul.f32 f1620, f613, f815; +sub.f32 f823, f1619, f1620; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f1618, f823, f574; +sub.f32 f828, f1618, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f1617, f612, f823; +sub.f32 f833, f1617, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f1616, f833, f578; +sub.f32 f838, f1616, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f1615, f612, f833; +sub.f32 f843, f1615, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 
f847, f845, f583; +mul.f32 f1614, f843, f582; +sub.f32 f848, f1614, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f1612, f612, f843; +mul.f32 f1613, f613, f845; +sub.f32 f853, f1612, f1613; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f1610, f853, f586; +mul.f32 f1611, f855, f587; +sub.f32 f858, f1610, f1611; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f1608, f612, f853; +mul.f32 f1609, f613, f855; +sub.f32 f863, f1608, f1609; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f1606, f863, f590; +mul.f32 f1607, f865, f591; +sub.f32 f868, f1606, f1607; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f1605, f612, f863; +sub.f32 f873, f1605, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f1604, f873, f594; +sub.f32 f878, f1604, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f1603, f612, f873; +sub.f32 f883, f1603, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f1602, f883, f598; +sub.f32 f888, f1602, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f1601, f612, f883; +sub.f32 f893, f1601, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f1599, f893, f602; +mul.f32 f1600, f895, f603; +sub.f32 f898, f1599, f1600; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f1597, f612, f893; +mul.f32 f1598, f613, f895; +sub.f32 f903, f1597, f1598; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f1595, f903, f606; +mul.f32 f1596, f905, f607; +sub.f32 f908, f1595, f1596; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f1593, f612, f903; +mul.f32 f1594, f613, f905; +sub.f32 f913, f1593, f1594; +mul.f32 
f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f1592, f913, f610; +sub.f32 f918, f1592, f917; +mov.u32 r17, %tid.x; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +and.b32 r14, r17, 31; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 3968; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+128]; +ld.shared.f32 f923, [r13+256]; +ld.shared.f32 f924, [r13+384]; +ld.shared.f32 f925, [r13+512]; +ld.shared.f32 f926, [r13+640]; +ld.shared.f32 f927, [r13+768]; +ld.shared.f32 f928, [r13+896]; +ld.shared.f32 f929, [r13+1024]; +ld.shared.f32 f930, [r13+1152]; +ld.shared.f32 f931, [r13+1280]; +ld.shared.f32 f932, [r13+1408]; +ld.shared.f32 f933, [r13+1536]; +ld.shared.f32 f934, [r13+1664]; +ld.shared.f32 f935, [r13+1792]; +ld.shared.f32 f936, [r13+1920]; +ld.shared.f32 f937, [r13+2048]; +ld.shared.f32 f938, [r13+2176]; +ld.shared.f32 f939, [r13+2304]; +ld.shared.f32 f940, [r13+2432]; +ld.shared.f32 f941, [r13+2560]; +ld.shared.f32 f942, [r13+2688]; +ld.shared.f32 f943, [r13+2816]; +ld.shared.f32 f944, [r13+2944]; +ld.shared.f32 f945, [r13+3072]; +ld.shared.f32 f946, [r13+3200]; +ld.shared.f32 f947, [r13+3328]; +ld.shared.f32 f948, [r13+3456]; +ld.shared.f32 f949, [r13+3584]; +ld.shared.f32 f950, [r13+3712]; +ld.shared.f32 f951, [r13+3840]; +ld.shared.f32 f952, [r13+3968]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1695, f620, f630, f640}; +st.shared.v4.f32 [r12+16], 
{f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+128]; +ld.shared.f32 f955, [r13+256]; +ld.shared.f32 f956, [r13+384]; +ld.shared.f32 f957, [r13+512]; +ld.shared.f32 f958, [r13+640]; +ld.shared.f32 f959, [r13+768]; +ld.shared.f32 f960, [r13+896]; +ld.shared.f32 f961, [r13+1024]; +ld.shared.f32 f962, [r13+1152]; +ld.shared.f32 f963, [r13+1280]; +ld.shared.f32 f964, [r13+1408]; +ld.shared.f32 f965, [r13+1536]; +ld.shared.f32 f966, [r13+1664]; +ld.shared.f32 f967, [r13+1792]; +ld.shared.f32 f968, [r13+1920]; +ld.shared.f32 f969, [r13+2048]; +ld.shared.f32 f970, [r13+2176]; +ld.shared.f32 f971, [r13+2304]; +ld.shared.f32 f972, [r13+2432]; +ld.shared.f32 f973, [r13+2560]; +ld.shared.f32 f974, [r13+2688]; +ld.shared.f32 f975, [r13+2816]; +ld.shared.f32 f976, [r13+2944]; +ld.shared.f32 f977, [r13+3072]; +ld.shared.f32 f978, [r13+3200]; +ld.shared.f32 f979, [r13+3328]; +ld.shared.f32 f980, [r13+3456]; +ld.shared.f32 f981, [r13+3584]; +ld.shared.f32 f982, [r13+3712]; +ld.shared.f32 f983, [r13+3840]; +ld.shared.f32 f984, [r13+3968]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1591, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1590, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f1589, f1591, f1590; +sub.f32 f996, f1591, f1590; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f1588, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1587, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; 
+sub.f32 f1007, f933, f949; +add.f32 f1586, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1585, f1587, f1586; +sub.f32 f1012, f1587, f1586; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f1584, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f1584, 0fBF3504F3; +mul.f32 f1583, f1013, 0f3F3504F3; +sub.f32 f1019, f1583, f1018; +mul.f32 f1020, f1584, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f1582, f1589, f1585; +sub.f32 f1029, f1589, f1585; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f1581, f1588, f1021; +sub.f32 f1033, f1588, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f1580, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f1579, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f1578, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f1577, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1576, f1578, f1577; +sub.f32 f1053, f1578, f1577; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f1575, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f1574, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f1573, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f1572, f1574, f1573; +sub.f32 f1069, f1574, f1573; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f1571, f1061, 
f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f1571, 0fBF3504F3; +mul.f32 f1570, f1070, 0f3F3504F3; +sub.f32 f1076, f1570, f1075; +mul.f32 f1077, f1571, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f1569, f1576, f1572; +sub.f32 f1086, f1576, f1572; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f1568, f1575, f1078; +sub.f32 f1090, f1575, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f1567, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f1566, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f1564, f1087, 0f3F6C835E; +mul.f32 f1565, f1568, 0fBEC3EF15; +sub.f32 f1101, f1564, f1565; +mul.f32 f1102, f1568, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f1562, f1091, 0f3F3504F3; +mul.f32 f1563, f1567, 0fBF3504F3; +sub.f32 f1106, f1562, f1563; +mul.f32 f1107, f1567, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f1566, 0fBF6C835E; +mul.f32 f1561, f1095, 0f3EC3EF15; +sub.f32 f1111, f1561, f1110; +mul.f32 f1112, f1566, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f1560, f1089, 0fBEC3EF15; +sub.f32 f1116, f1560, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f1558, f1097, 0fBF6C835E; +mul.f32 f1559, f1098, 0fBEC3EF15; +sub.f32 f1125, f1558, f1559; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f1557, f1582, f1569; +sub.f32 f1131, f1582, f1569; +add.f32 f1132, f1030, 
f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f1556, f1581, f1103; +sub.f32 f1135, f1581, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f1555, f1580, f1108; +sub.f32 f1139, f1580, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f1554, f1579, f1113; +sub.f32 f1143, f1579, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f1553, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f1552, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f1551, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f1550, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f1549, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f1548, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f1547, f1549, f1548; +sub.f32 f1171, f1549, f1548; +add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f1546, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f1545, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f1544, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f1543, f1545, f1544; +sub.f32 f1187, f1545, f1544; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f1542, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f1542, 0fBF3504F3; +mul.f32 f1541, f1188, 0f3F3504F3; +sub.f32 f1194, f1541, f1193; +mul.f32 f1195, f1542, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 f1198, f1191, 0fBF3504F3; 
+sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f1540, f1547, f1543; +sub.f32 f1204, f1547, f1543; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f1539, f1546, f1196; +sub.f32 f1208, f1546, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f1538, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f1537, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f1536, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f1535, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f1534, f1536, f1535; +sub.f32 f1228, f1536, f1535; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f1533, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f1532, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f1531, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f1530, f1532, f1531; +sub.f32 f1244, f1532, f1531; +add.f32 f1245, f1235, f1240; +sub.f32 f1247, f1235, f1240; +sub.f32 f1529, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f1529, 0fBF3504F3; +mul.f32 f1528, f1245, 0f3F3504F3; +sub.f32 f1251, f1528, f1250; +mul.f32 f1252, f1529, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f1527, f1534, f1530; +sub.f32 f1261, f1534, f1530; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f1526, f1533, f1253; +sub.f32 
f1265, f1533, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f1525, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f1524, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f1522, f1262, 0f3F6C835E; +mul.f32 f1523, f1526, 0fBEC3EF15; +sub.f32 f1276, f1522, f1523; +mul.f32 f1277, f1526, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f1520, f1266, 0f3F3504F3; +mul.f32 f1521, f1525, 0fBF3504F3; +sub.f32 f1281, f1520, f1521; +mul.f32 f1282, f1525, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f1518, f1270, 0f3EC3EF15; +mul.f32 f1519, f1524, 0fBF6C835E; +sub.f32 f1286, f1518, f1519; +mul.f32 f1287, f1524, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f1516, f1264, 0fBEC3EF15; +mul.f32 f1517, f1265, 0fBF6C835E; +sub.f32 f1291, f1516, f1517; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f1514, f1272, 0fBF6C835E; +mul.f32 f1515, f1273, 0fBEC3EF15; +sub.f32 f1300, f1514, f1515; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f1513, f1540, f1527; +sub.f32 f1306, f1540, f1527; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f1512, f1539, f1278; +sub.f32 f1310, f1539, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f1511, f1538, f1283; +sub.f32 f1314, f1538, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f1510, f1537, f1288; +sub.f32 f1318, f1537, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f1509, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f1508, f1208, f1293; +sub.f32 f1326, 
f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f1507, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f1506, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f1512, 0fBE47C5C2; +mul.f32 f1505, f1307, 0f3F7B14BE; +sub.f32 f1337, f1505, f1336; +mul.f32 f1338, f1512, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f1511, 0fBEC3EF15; +mul.f32 f1504, f1311, 0f3F6C835E; +sub.f32 f1342, f1504, f1341; +mul.f32 f1343, f1511, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f1502, f1315, 0f3F54DB31; +mul.f32 f1503, f1510, 0fBF0E39DA; +sub.f32 f1347, f1502, f1503; +mul.f32 f1348, f1510, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f1500, f1319, 0f3F3504F3; +mul.f32 f1501, f1509, 0fBF3504F3; +sub.f32 f1352, f1500, f1501; +mul.f32 f1353, f1509, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f1498, f1323, 0f3F0E39DA; +mul.f32 f1499, f1508, 0fBF54DB31; +sub.f32 f1357, f1498, f1499; +mul.f32 f1358, f1508, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f1496, f1327, 0f3EC3EF15; +mul.f32 f1497, f1507, 0fBF6C835E; +sub.f32 f1362, f1496, f1497; +mul.f32 f1363, f1507, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f1506, 0fBF7B14BE; +mul.f32 f1495, f1331, 0f3E47C5C2; +sub.f32 f1367, f1495, f1366; +mul.f32 f1368, f1506, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f1494, f1309, 0fBE47C5C2; +sub.f32 f1372, f1494, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f1493, f1313, 0fBEC3EF15; +sub.f32 f1377, f1493, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f1492, f1317, 0fBF0E39DA; +sub.f32 f1382, f1492, f1381; +mul.f32 f1383, f1318, 
0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f1491, f1325, 0fBF54DB31; +sub.f32 f1391, f1491, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f1490, f1329, 0fBF6C835E; +sub.f32 f1396, f1490, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f1489, f1333, 0fBF7B14BE; +sub.f32 f1401, f1489, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402; +add.f32 %1, f1557, f1513; +add.f32 %0, f1128, f1303; +add.f32 %2, f1132, f1337; +add.f32 %3, f1556, f1339; +add.f32 %4, f1136, f1342; +add.f32 %5, f1555, f1344; +add.f32 %6, f1140, f1347; +add.f32 %7, f1554, f1349; +add.f32 %9, f1553, f1354; +add.f32 %8, f1144, f1352; +add.f32 %11, f1552, f1359; +add.f32 %10, f1148, f1357; +add.f32 %12, f1152, f1362; +add.f32 %13, f1551, f1364; +add.f32 %14, f1156, f1367; +add.f32 %15, f1550, f1369; +add.f32 %16, f1130, f1306; +sub.f32 %17, f1131, f1305; +add.f32 %18, f1134, f1372; +add.f32 %19, f1135, f1374; +add.f32 %21, f1139, f1379; +add.f32 %20, f1138, f1377; +add.f32 %23, f1143, f1384; +add.f32 %22, f1142, f1382; +add.f32 %25, f1147, f1388; +add.f32 %24, f1146, f1387; +add.f32 %26, f1150, f1391; +add.f32 %27, f1151, f1393; +add.f32 %28, f1154, f1396; +add.f32 %29, f1155, f1398; +add.f32 %30, f1158, f1401; +add.f32 %31, f1159, f1403; +sub.f32 %32, f1128, f1303; +sub.f32 %33, f1557, f1513; +sub.f32 %35, f1556, f1339; +sub.f32 %34, f1132, f1337; +sub.f32 %37, f1555, f1344; +sub.f32 %36, f1136, f1342; +sub.f32 %39, f1554, f1349; +sub.f32 %38, f1140, f1347; +sub.f32 %41, f1553, f1354; +sub.f32 %40, f1144, f1352; +sub.f32 %43, f1552, f1359; +sub.f32 %42, f1148, f1357; +sub.f32 %45, f1551, f1364; +sub.f32 %44, f1152, 
f1362; +sub.f32 %47, f1550, f1369; +sub.f32 %46, f1156, f1367; +add.f32 %49, f1131, f1305; +sub.f32 %48, f1130, f1306; +sub.f32 %51, f1135, f1374; +sub.f32 %50, f1134, f1372; +sub.f32 %53, f1139, f1379; +sub.f32 %52, f1138, f1377; +sub.f32 %55, f1143, f1384; +sub.f32 %54, f1142, f1382; +sub.f32 %57, f1147, f1388; +sub.f32 %56, f1146, f1387; +sub.f32 %59, f1151, f1393; +sub.f32 %58, f1150, f1391; +sub.f32 %61, f1155, f1398; +sub.f32 %60, f1154, f1396; +sub.f32 %63, f1159, f1403; +sub.f32 %62, f1158, f1401; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), 
"f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<82, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<523>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; 
+add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, 
f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 8128; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+1024]; +ld.shared.v2.f32 {f167, f168}, [r13+2048]; +ld.shared.v2.f32 {f171, f172}, [r13+3072]; +ld.shared.v2.f32 {f175, f176}, [r13+4096]; +ld.shared.v2.f32 {f179, f180}, [r13+5120]; +ld.shared.v2.f32 {f183, f184}, [r13+6144]; +ld.shared.v2.f32 {f187, f188}, [r13+7168]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, 
f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 120; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 
f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 7680; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+1024]; +ld.shared.v2.f32 {f325, f326}, [r19+2048]; +ld.shared.v2.f32 {f329, f330}, [r19+3072]; +ld.shared.v2.f32 {f333, f334}, [r19+4096]; +ld.shared.v2.f32 {f337, f338}, [r19+5120]; +ld.shared.v2.f32 {f341, f342}, [r19+6144]; +ld.shared.v2.f32 {f345, f346}, [r19+7168]; +add.f32 f349, 
f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +add.f32 f361, f351, f356; +sub.f32 f362, f352, f355; +sub.f32 f363, f351, f356; +add.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +add.f32 f377, f367, f372; +sub.f32 f378, f368, f371; +sub.f32 f379, f367, f372; +add.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0fBF3504F3; +sub.f32 f383, f381, f382; +mul.f32 f384, f378, 0f3F3504F3; +fma.rn.f32 f385, f377, 0fBF3504F3, f384; +mul.f32 f386, f379, 0fBF3504F3; +mul.f32 f387, f380, 0fBF3504F3; +sub.f32 f388, f386, f387; +add.f32 f389, f386, f387; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f385; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f385; +add.f32 f396, f359, f376; +sub.f32 f397, f360, f375; +sub.f32 f398, f359, f376; +add.f32 f399, f360, f375; +add.f32 f400, f363, f388; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f388; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 64; +bfe.u32 r21, r5, 6, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f404, f392; +mul.f32 f409, f405, f393; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f413, f396; +mul.f32 f417, f415, f397; +mul.f32 f418, f413, f397; +mul.f32 f419, 
f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f421, f400; +mul.f32 f425, f423, f401; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f429, f390; +mul.f32 f433, f431, f391; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f437, f394; +mul.f32 f441, f439, f395; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f445, f398; +mul.f32 f449, f447, f399; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f453, f402; +mul.f32 f457, f455, f403; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 4096; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f405, f392, f410; +sub.f32 f462, f408, f409; +st.shared.v2.f32 [r25+512], {f462, f461}; +fma.rn.f32 f463, f415, f396, f418; +sub.f32 f464, f416, f417; +st.shared.v2.f32 [r25+1024], {f464, f463}; +fma.rn.f32 f465, f423, f400, f426; +sub.f32 f466, f424, f425; +st.shared.v2.f32 [r25+1536], {f466, f465}; +sub.f32 f467, f432, f433; +fma.rn.f32 f468, f431, f390, f434; +st.shared.v2.f32 [r25+2048], {f467, f468}; +fma.rn.f32 f469, f439, f394, f442; +sub.f32 f470, f440, f441; +st.shared.v2.f32 [r25+2560], {f470, f469}; +fma.rn.f32 f471, f447, f398, f450; +sub.f32 f472, f448, f449; +st.shared.v2.f32 [r25+3072], {f472, f471}; +fma.rn.f32 f473, f455, f402, f458; 
+sub.f32 f474, f456, f457; +st.shared.v2.f32 [r25+3584], {f474, f473}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+1024]; +ld.shared.v2.f32 {f483, f484}, [r26+2048]; +ld.shared.v2.f32 {f487, f488}, [r26+3072]; +ld.shared.v2.f32 {f491, f492}, [r26+4096]; +ld.shared.v2.f32 {f495, f496}, [r26+5120]; +ld.shared.v2.f32 {f499, f500}, [r26+6144]; +ld.shared.v2.f32 {f503, f504}, [r26+7168]; +add.f32 %1, f476, f492; +add.f32 %0, f475, f491; +add.f32 %3, f480, f496; +add.f32 %2, f479, f495; +add.f32 %5, f484, f500; +add.f32 %4, f483, f499; +add.f32 %7, f488, f504; +add.f32 %6, f487, f503; +sub.f32 %9, f476, f492; +sub.f32 %8, f475, f491; +sub.f32 %11, f480, f496; +sub.f32 %10, f479, f495; +sub.f32 %13, f484, f500; +sub.f32 %12, f483, f499; +sub.f32 %15, f488, f504; +sub.f32 %14, f487, f503; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<86, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1097>; +.reg .b32 r<35>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1088, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; 
+add.f32 f1086, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1085, f1088, f1086; +sub.f32 f76, f1088, f1086; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1084, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1081, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1079, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1078, f1081, f1079; +sub.f32 f92, f1081, f1079; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1077, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1077, 0fBF3504F3; +mul.f32 f1076, f93, 0f3F3504F3; +sub.f32 f99, f1076, f98; +mul.f32 f100, f1077, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1075, f1085, f1078; +sub.f32 f109, f1085, f1078; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1074, f1084, f101; +sub.f32 f113, f1084, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f1073, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f1072, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1070, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1067, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1066, f1070, f1067; +sub.f32 f133, f1070, f1067; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1065, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1063, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1061, %76, %66; +sub.f32 f145, %76, %66; +add.f32 
f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1060, f1063, f1061; +sub.f32 f149, f1063, f1061; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f1059, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1059, 0fBF3504F3; +mul.f32 f1058, f150, 0f3F3504F3; +sub.f32 f156, f1058, f155; +mul.f32 f157, f1059, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1057, f1066, f1060; +sub.f32 f166, f1066, f1060; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1056, f1065, f158; +sub.f32 f170, f1065, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1055, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1054, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1052, f167, 0f3F6C835E; +mul.f32 f1053, f1056, 0fBEC3EF15; +sub.f32 f181, f1052, f1053; +mul.f32 f182, f1056, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1050, f171, 0f3F3504F3; +mul.f32 f1051, f1055, 0fBF3504F3; +sub.f32 f186, f1050, f1051; +mul.f32 f187, f1055, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1048, f175, 0f3EC3EF15; +mul.f32 f1049, f1054, 0fBF6C835E; +sub.f32 f191, f1048, f1049; +mul.f32 f192, f1054, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1046, f169, 0fBEC3EF15; +mul.f32 f1047, f170, 0fBF6C835E; +sub.f32 f196, f1046, f1047; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1044, f177, 0fBF6C835E; +mul.f32 f1045, f178, 0fBEC3EF15; +sub.f32 f205, f1044, f1045; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 
f1043, f1074, f183; +sub.f32 f213, f1074, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1042, f1073, f188; +sub.f32 f217, f1073, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f1041, f1072, f193; +sub.f32 f221, f1072, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f1040, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f1039, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f1038, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1037, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1043; +mul.f32 f244, f238, f1043; +mul.f32 f246, f239, f239; +mul.f32 f1036, f238, f238; +sub.f32 f247, f1036, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1042; +mul.f32 f252, f247, f1042; +mul.f32 f1034, f238, f247; +mul.f32 f1035, f239, f249; +sub.f32 f255, f1034, f1035; +mul.f32 f1033, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f1041; +mul.f32 f260, f255, f1041; +mul.f32 f262, f239, f257; +mul.f32 f1032, f238, f255; +sub.f32 f263, f1032, f262; +mul.f32 f1031, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f1040; +mul.f32 f268, f263, f1040; +mul.f32 f270, f239, f265; +mul.f32 f1030, f238, f263; +sub.f32 f271, f1030, f270; +mul.f32 f1029, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f1039; +mul.f32 f276, f271, f1039; +mul.f32 f1027, f238, f271; +mul.f32 f1028, f239, f273; +sub.f32 f279, f1027, f1028; +mul.f32 f1026, 
f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f1038; +mul.f32 f284, f279, f1038; +mul.f32 f286, f239, f281; +mul.f32 f1025, f238, f279; +sub.f32 f287, f1025, f286; +mul.f32 f1024, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f1037; +mul.f32 f292, f287, f1037; +mul.f32 f294, f239, f289; +mul.f32 f1023, f238, f287; +sub.f32 f295, f1023, f294; +mul.f32 f1022, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1021, f1075, f1057; +mul.f32 f299, f297, f1021; +mul.f32 f300, f295, f1021; +mul.f32 f1019, f238, f295; +mul.f32 f1020, f239, f297; +sub.f32 f303, f1019, f1020; +sub.f32 f1018, f106, f163; +mul.f32 f1017, f295, f1018; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1016, f238, f303; +sub.f32 f311, f1016, f310; +mul.f32 f1015, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f1013, f238, f311; +mul.f32 f1014, f239, f313; +sub.f32 f319, f1013, f1014; +mul.f32 f1012, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1011, f238, f319; +sub.f32 f327, f1011, f326; +mul.f32 f1010, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1009, f238, f327; +sub.f32 f335, f1009, f334; +mul.f32 f1008, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f1006, f238, f335; +mul.f32 f1007, f239, f337; +sub.f32 f343, f1006, f1007; +mul.f32 f1005, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; 
+mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1004, f238, f343; +sub.f32 f351, f1004, f350; +mul.f32 f1003, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f1002, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1094, f1075, f1057; +mul.f32 f1093, f297, f1094; +barrier.sync 0; +and.b32 r11, r7, 8064; +add.s32 r12, r9, r11; +add.f32 f357, f1075, f1057; +sub.f32 f1091, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f1003, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f1033, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f1031, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f1029, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f1026, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f1024, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f1022, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1091, f300; +sub.f32 f374, f1017, f1093; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f1015, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f1012, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f1010, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f1008, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f1005, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f1002, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r21, r34, 63; +mad.lo.s32 r13, r21, -120, 
r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+512]; +ld.shared.v2.f32 {f397, f398}, [r13+1024]; +ld.shared.v2.f32 {f401, f402}, [r13+1536]; +ld.shared.v2.f32 {f405, f406}, [r13+2048]; +ld.shared.v2.f32 {f409, f410}, [r13+2560]; +ld.shared.v2.f32 {f413, f414}, [r13+3072]; +ld.shared.v2.f32 {f417, f418}, [r13+3584]; +ld.shared.v2.f32 {f421, f422}, [r13+4096]; +ld.shared.v2.f32 {f425, f426}, [r13+4608]; +ld.shared.v2.f32 {f429, f430}, [r13+5120]; +ld.shared.v2.f32 {f433, f434}, [r13+5632]; +ld.shared.v2.f32 {f437, f438}, [r13+6144]; +ld.shared.v2.f32 {f441, f442}, [r13+6656]; +ld.shared.v2.f32 {f445, f446}, [r13+7168]; +ld.shared.v2.f32 {f449, f450}, [r13+7680]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1001, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1000, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f999, f1001, f1000; +sub.f32 f464, f1001, f1000; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f998, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f997, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f996, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f995, f997, f996; +sub.f32 f480, f997, f996; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f994, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f992, f481, 0f3F3504F3; +mul.f32 f993, f994, 0fBF3504F3; +sub.f32 f487, f992, f993; +mul.f32 f488, f994, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f991, f999, f995; +sub.f32 f497, f999, f995; +add.f32 f498, f465, f487; 
+sub.f32 f500, f465, f487; +add.f32 f990, f998, f489; +sub.f32 f501, f998, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f989, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f988, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f987, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f986, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f985, f987, f986; +sub.f32 f521, f987, f986; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f984, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f983, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f982, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f981, f983, f982; +sub.f32 f537, f983, f982; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f980, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f978, f538, 0f3F3504F3; +mul.f32 f979, f980, 0fBF3504F3; +sub.f32 f544, f978, f979; +mul.f32 f545, f980, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f977, f985, f981; +sub.f32 f554, f985, f981; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f976, f984, f546; +sub.f32 f558, f984, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f975, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f974, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f976, 0fBEC3EF15; +mul.f32 f973, f555, 0f3F6C835E; +sub.f32 f569, f973, f568; +mul.f32 f570, f976, 
0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f975, 0fBF3504F3; +mul.f32 f972, f559, 0f3F3504F3; +sub.f32 f574, f972, f573; +mul.f32 f575, f975, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f970, f563, 0f3EC3EF15; +mul.f32 f971, f974, 0fBF6C835E; +sub.f32 f579, f970, f971; +mul.f32 f580, f974, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f968, f557, 0fBEC3EF15; +mul.f32 f969, f558, 0fBF6C835E; +sub.f32 f584, f968, f969; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f967, f565, 0fBF6C835E; +sub.f32 f593, f967, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f966, f990, f571; +sub.f32 f601, f990, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f965, f989, f576; +sub.f32 f605, f989, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f964, f988, f581; +sub.f32 f609, f988, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, f554; +sub.f32 f963, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 f616, f500, f584; +add.f32 f962, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f961, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f960, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 48; +bfe.u32 r15, r34, 4, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f966; +mul.f32 f632, f626, f966; +mul.f32 f634, f627, f627; +mul.f32 f959, f626, f626; +sub.f32 f635, f959, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f639, f637, f965; +mul.f32 f640, 
f635, f965; +mul.f32 f957, f626, f635; +mul.f32 f958, f627, f637; +sub.f32 f643, f957, f958; +mul.f32 f956, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f964; +mul.f32 f648, f643, f964; +mul.f32 f650, f627, f645; +mul.f32 f955, f626, f643; +sub.f32 f651, f955, f650; +mul.f32 f954, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f963; +mul.f32 f656, f651, f963; +mul.f32 f658, f627, f653; +mul.f32 f953, f626, f651; +sub.f32 f659, f953, f658; +mul.f32 f952, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f962; +mul.f32 f664, f659, f962; +mul.f32 f950, f626, f659; +mul.f32 f951, f627, f661; +sub.f32 f667, f950, f951; +mul.f32 f949, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f961; +mul.f32 f672, f667, f961; +mul.f32 f674, f627, f669; +mul.f32 f948, f626, f667; +sub.f32 f675, f948, f674; +mul.f32 f947, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f960; +mul.f32 f680, f675, f960; +mul.f32 f682, f627, f677; +mul.f32 f946, f626, f675; +sub.f32 f683, f946, f682; +mul.f32 f945, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f944, f991, f977; +mul.f32 f687, f685, f944; +mul.f32 f688, f683, f944; +mul.f32 f942, f626, f683; +mul.f32 f943, f627, f685; +sub.f32 f691, f942, f943; +sub.f32 f941, f494, f551; +mul.f32 f940, f683, f941; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f939, f626, f691; +sub.f32 f699, f939, f698; +mul.f32 f938, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f936, f626, f699; +mul.f32 f937, f627, f701; +sub.f32 f707, f936, f937; +mul.f32 f935, f699, f604; +mul.f32 f708, f626, 
f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f934, f626, f707; +sub.f32 f715, f934, f714; +mul.f32 f933, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f932, f626, f715; +sub.f32 f723, f932, f722; +mul.f32 f931, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f929, f626, f723; +mul.f32 f930, f627, f725; +sub.f32 f731, f929, f930; +mul.f32 f928, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f927, f626, f731; +sub.f32 f739, f927, f738; +mul.f32 f926, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f925, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 3; +and.b32 r16, r23, 120; +add.s32 r17, r9, r16; +mov.u32 r26, %tid.x; +shl.b32 r25, r26, 7; +barrier.sync 0; +and.b32 r18, r25, 6144; +add.s32 r19, r17, r18; +mov.u32 r28, %tid.x; +and.b32 r27, r28, 48; +add.f32 f745, f991, f977; +sub.f32 f1092, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 48; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f926, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f956, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f954, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f952, f655; +st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f949, f663; +fma.rn.f32 f756, f661, f614, f664; +st.shared.v2.f32 [r19+640], {f755, f756}; 
+fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f947, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f945, f679; +st.shared.v2.f32 [r19+896], {f760, f759}; +fma.rn.f32 f761, f685, f1092, f688; +sub.f32 f762, f940, f687; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f938, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f935, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f933, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f931, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f928, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f925, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +mad.lo.s32 r20, r30, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+512]; +ld.shared.v2.f32 {f785, f786}, [r20+1024]; +ld.shared.v2.f32 {f789, f790}, [r20+1536]; +ld.shared.v2.f32 {f793, f794}, [r20+2048]; +ld.shared.v2.f32 {f797, f798}, [r20+2560]; +ld.shared.v2.f32 {f801, f802}, [r20+3072]; +ld.shared.v2.f32 {f805, f806}, [r20+3584]; +ld.shared.v2.f32 {f809, f810}, [r20+4096]; +ld.shared.v2.f32 {f813, f814}, [r20+4608]; +ld.shared.v2.f32 {f817, f818}, [r20+5120]; +ld.shared.v2.f32 {f821, f822}, [r20+5632]; +ld.shared.v2.f32 {f825, f826}, [r20+6144]; +ld.shared.v2.f32 {f829, f830}, [r20+6656]; +ld.shared.v2.f32 {f833, f834}, [r20+7168]; +ld.shared.v2.f32 {f837, f838}, [r20+7680]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f924, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f923, f794, 
f826; +sub.f32 f848, f794, f826; +add.f32 f849, f781, f813; +sub.f32 f851, f781, f813; +add.f32 f922, f782, f814; +sub.f32 f852, f782, f814; +add.f32 f853, f797, f829; +sub.f32 f855, f797, f829; +add.f32 f921, f798, f830; +sub.f32 f856, f798, f830; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f920, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f919, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f789, f821; +sub.f32 f867, f789, f821; +add.f32 f918, f790, f822; +sub.f32 f868, f790, f822; +add.f32 f869, f805, f837; +sub.f32 f871, f805, f837; +add.f32 f917, f806, f838; +sub.f32 f872, f806, f838; +add.f32 %1, f924, f923; +add.f32 %0, f841, f845; +add.f32 %2, f849, f853; +add.f32 %3, f922, f921; +add.f32 %4, f857, f861; +add.f32 %5, f920, f919; +add.f32 %6, f865, f869; +add.f32 %7, f918, f917; +add.f32 %8, f843, f848; +sub.f32 %9, f844, f847; +sub.f32 %11, f852, f855; +add.f32 %10, f851, f856; +sub.f32 %13, f860, f863; +add.f32 %12, f859, f864; +sub.f32 %15, f868, f871; +add.f32 %14, f867, f872; +sub.f32 %17, f924, f923; +sub.f32 %16, f841, f845; +sub.f32 %19, f922, f921; +sub.f32 %18, f849, f853; +sub.f32 %21, f920, f919; +sub.f32 %20, f857, f861; +sub.f32 %23, f918, f917; +sub.f32 %22, f865, f869; +add.f32 %25, f844, f847; +sub.f32 %24, f843, f848; +add.f32 %27, f852, f855; +sub.f32 %26, f851, f856; +add.f32 %29, f860, f863; +sub.f32 %28, f859, f864; +add.f32 %31, f868, f871; +sub.f32 %30, f867, f872; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), 
"=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<87, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<277>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, 
f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 8160; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+2048]; +ld.shared.v2.f32 {f70, f71}, [r13+4096]; +ld.shared.v2.f32 {f74, f75}, [r13+6144]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 8064; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; 
+sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+2048]; +ld.shared.v2.f32 {f131, f132}, [r20+4096]; +ld.shared.v2.f32 {f135, f136}, [r20+6144]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 240; +bfe.u32 r22, r5, 4, 4; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 7680; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; +st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, 
f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+2048]; +ld.shared.v2.f32 {f192, f193}, [r27+4096]; +ld.shared.v2.f32 {f196, f197}, [r27+6144]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +add.f32 f210, f202, f207; +sub.f32 f211, f203, f206; +sub.f32 f212, f202, f207; +add.f32 f213, f203, f206; +and.b32 r28, r5, 192; +bfe.u32 r29, r5, 6, 2; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f214, f210; +mul.f32 f219, f215, f211; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f223, f208; +mul.f32 f227, f225, f209; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f231, f212; +mul.f32 f235, f233, f213; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 6144; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f215, f210, f220; +sub.f32 f240, f218, f219; +st.shared.v2.f32 [r33+512], {f240, f239}; +fma.rn.f32 f241, f225, f208, f228; +sub.f32 f242, f226, f227; +st.shared.v2.f32 [r33+1024], {f242, f241}; +sub.f32 f243, f234, f235; +fma.rn.f32 f244, f233, f212, f236; +st.shared.v2.f32 [r33+1536], {f243, f244}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; 
+ld.shared.v2.f32 {f249, f250}, [r34+2048]; +ld.shared.v2.f32 {f253, f254}, [r34+4096]; +ld.shared.v2.f32 {f257, f258}, [r34+6144]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; +sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +add.f32 %1, f262, f266; +add.f32 %0, f261, f265; +sub.f32 %3, f264, f267; +add.f32 %2, f263, f268; +sub.f32 %5, f262, f266; +sub.f32 %4, f261, f265; +add.f32 %7, f264, f267; +sub.f32 %6, f263, f268; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<88, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<245>; +.reg .b32 r<36>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 
f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4080; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+1024]; +ld.shared.f32 f64, [r13+2048]; +ld.shared.f32 f65, [r13+3072]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+1024]; +ld.shared.f32 f68, [r13+2048]; +ld.shared.f32 f69, [r13+3072]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 
f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 4032; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+1024]; +ld.shared.f32 f117, [r21+2048]; +ld.shared.f32 f118, [r21+3072]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+1024]; +ld.shared.f32 f121, [r21+2048]; +ld.shared.f32 f122, [r21+3072]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 240; +bfe.u32 r23, r5, 4, 4; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, 
f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 3840; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+1024]; +ld.shared.f32 f170, [r28+2048]; +ld.shared.f32 f171, [r28+3072]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+1024]; +ld.shared.f32 f174, [r28+2048]; +ld.shared.f32 f175, [r28+3072]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +add.f32 f188, f178, f183; +sub.f32 f189, f179, f182; +sub.f32 f190, f178, f183; +add.f32 f191, f179, f182; +and.b32 r29, r5, 192; +bfe.u32 r30, r5, 6, 2; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f192, f188; +mul.f32 f197, f193, f189; +sub.f32 f198, f196, f197; +mul.f32 f199, f192, f189; +fma.rn.f32 f200, f193, f188, f199; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f203, f186; +mul.f32 f207, f205, f187; +sub.f32 f208, f206, f207; +mul.f32 f209, f203, f187; +fma.rn.f32 f210, f205, f186, f209; +mul.f32 f211, f192, f203; +mul.f32 
f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f213, f190; +mul.f32 f217, f215, f191; +sub.f32 f218, f216, f217; +mul.f32 f219, f213, f191; +fma.rn.f32 f220, f215, f190, f219; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 3072; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f198; +st.shared.f32 [r34+512], f208; +st.shared.f32 [r34+768], f218; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+1024]; +ld.shared.f32 f223, [r35+2048]; +ld.shared.f32 f224, [r35+3072]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+1024]; +ld.shared.f32 f227, [r35+2048]; +ld.shared.f32 f228, [r35+3072]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 %0, f229, f233; +add.f32 %1, f230, f234; +sub.f32 %3, f232, f235; +add.f32 %2, f231, f236; +sub.f32 %4, f229, f233; +sub.f32 %5, f230, f234; +add.f32 %7, f232, f235; +sub.f32 %6, f231, f236; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<89, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<202>; +.reg .b32 r<70>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; 
+shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %14, %16; +sub.f32 f10, %15, %17; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 8176; +add.s32 r11, r8, r10; +add.f32 f18, %15, %17; +add.f32 f19, %14, %16; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 4088; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+4096]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8160; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 4080; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+4096]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8128; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; 
+st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 4064; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+4096]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 504; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8064; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 4032; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+4096]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 5; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7936; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 3968; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+4096]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f116, f114; +mul.f32 f121, f117, f115; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7680; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 
f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f117, f114, f122; +sub.f32 f126, f120, f121; +st.shared.v2.f32 [r46+256], {f126, f125}; +barrier.sync 0; +and.b32 r47, r9, 3840; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+4096]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f137, f135; +mul.f32 f142, f138, f136; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7168; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f138, f135, f143; +sub.f32 f147, f141, f142; +st.shared.v2.f32 [r53+512], {f147, f146}; +barrier.sync 0; +and.b32 r54, r9, 3584; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+4096]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f158, f156; +mul.f32 f163, f159, f157; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 6144; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f159, f156, f164; +sub.f32 f168, f162, f163; +st.shared.v2.f32 [r60+1024], {f168, f167}; +barrier.sync 0; +and.b32 r61, r9, 3072; +sub.s32 r62, r60, r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, [r62+4096]; +sub.f32 f177, f169, f173; +sub.f32 f178, f170, f174; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f179, f180}, [rd29]; +mul.f32 f183, f179, f177; 
+mul.f32 f184, f180, f178; +mul.f32 f185, f179, f178; +and.b32 r64, r9, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 4096; +add.s32 r67, r65, r66; +add.f32 f186, f170, f174; +add.f32 f187, f169, f173; +st.shared.v2.f32 [r67], {f187, f186}; +fma.rn.f32 f188, f180, f177, f185; +sub.f32 f189, f183, f184; +st.shared.v2.f32 [r67+2048], {f189, f188}; +barrier.sync 0; +and.b32 r68, r9, 2048; +sub.s32 r69, r67, r68; +ld.shared.v2.f32 {f190, f191}, [r69]; +ld.shared.v2.f32 {f194, f195}, [r69+4096]; +add.f32 %1, f191, f195; +add.f32 %0, f190, f194; +sub.f32 %3, f191, f195; +sub.f32 %2, f190, f194; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<90, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<166>; +.reg .b32 r<70>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %14, %16; +add.f32 f10, %15, %17; +sub.f32 f11, %14, %16; +sub.f32 f12, %15, %17; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 4088; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 2044; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+2048]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 
f24, [r13]; +ld.shared.f32 f25, [r13+2048]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4080; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 2040; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+2048]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+2048]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4064; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 2032; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+2048]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+2048]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 504; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; 
+and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 4032; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 2016; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+2048]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+2048]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 5; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3968; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 1984; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+2048]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+2048]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f98, f96; +mul.f32 f103, f99, f97; +sub.f32 f104, f102, f103; +mul.f32 f105, f98, f97; +fma.rn.f32 f106, f99, f96, f105; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3840; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f104; +barrier.sync 0; +and.b32 r47, r11, 1920; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+2048]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; 
+barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+2048]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f115, f113; +mul.f32 f120, f116, f114; +sub.f32 f121, f119, f120; +mul.f32 f122, f115, f114; +fma.rn.f32 f123, f116, f113, f122; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3584; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f121; +barrier.sync 0; +and.b32 r54, r11, 1792; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+2048]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+2048]; +add.f32 f128, f124, f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f132, f133}, [rd26]; +mul.f32 f136, f132, f130; +mul.f32 f137, f133, f131; +sub.f32 f138, f136, f137; +mul.f32 f139, f132, f131; +fma.rn.f32 f140, f133, f130, f139; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 3072; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 [r60+512], f138; +barrier.sync 0; +and.b32 r61, r11, 1536; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+2048]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, [r62+2048]; +add.f32 f145, f141, f142; +add.f32 f146, f143, f144; +sub.f32 f147, f141, f142; +sub.f32 f148, f143, f144; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f149, 
f150}, [rd29]; +mul.f32 f153, f149, f147; +mul.f32 f154, f150, f148; +sub.f32 f155, f153, f154; +mul.f32 f156, f149, f148; +fma.rn.f32 f157, f150, f147, f156; +and.b32 r64, r11, 1020; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 2048; +add.s32 r67, r65, r66; +st.shared.f32 [r67], f145; +st.shared.f32 [r67+1024], f155; +barrier.sync 0; +and.b32 r68, r11, 1024; +sub.s32 r69, r67, r68; +ld.shared.f32 f158, [r69]; +ld.shared.f32 f159, [r69+2048]; +barrier.sync 0; +st.shared.f32 [r67], f146; +st.shared.f32 [r67+1024], f157; +barrier.sync 0; +ld.shared.f32 f160, [r69]; +ld.shared.f32 f161, [r69+2048]; +add.f32 %0, f158, f159; +add.f32 %1, f160, f161; +sub.f32 %2, f158, f159; +sub.f32 %3, f160, f161; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..70b4319f5c370 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp32_inv.hpp.inc @@ -0,0 +1,6614 @@ +#ifndef CUFFTDX_FFT_1024_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_1024_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<285, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1873>; +.reg .b32 r<24>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1868, %67, 
%130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1866, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1865, f1868, f1866; +sub.f32 f140, f1868, f1866; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1864, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1861, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1859, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1858, f1861, f1859; +sub.f32 f156, f1861, f1859; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1857, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1857, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1855, f159, 0fBF3504F3; +mul.f32 f1856, f160, 0f3F3504F3; +sub.f32 f167, f1855, f1856; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1854, f1865, f1858; +sub.f32 f173, f1865, f1858; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1853, f1864, f164; +sub.f32 f177, f1864, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1852, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1851, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1849, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1846, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1845, f1849, f1846; +sub.f32 f197, f1849, f1846; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1844, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; 
+sub.f32 f204, %78, %110; +add.f32 f1842, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1840, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1839, f1842, f1840; +sub.f32 f213, f1842, f1840; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1838, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1838, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1836, f216, 0fBF3504F3; +mul.f32 f1837, f217, 0f3F3504F3; +sub.f32 f224, f1836, f1837; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1835, f1845, f1839; +sub.f32 f230, f1845, f1839; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1834, f1844, f221; +sub.f32 f234, f1844, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1833, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1832, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1830, f231, 0f3F6C835E; +mul.f32 f1831, f1834, 0f3EC3EF15; +sub.f32 f245, f1830, f1831; +mul.f32 f246, f1834, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1833, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1832, 0f3F6C835E; +mul.f32 f1829, f239, 0f3EC3EF15; +sub.f32 f254, f1829, f253; +mul.f32 f255, f1832, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1828, f233, 0fBEC3EF15; +sub.f32 f259, f1828, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1826, f237, 0fBF3504F3; +mul.f32 f1827, f238, 0f3F3504F3; +sub.f32 f264, f1826, f1827; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1824, f241, 0fBF6C835E; +mul.f32 
f1825, f242, 0f3EC3EF15; +sub.f32 f269, f1824, f1825; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1823, f1854, f1835; +sub.f32 f275, f1854, f1835; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1822, f1853, f247; +sub.f32 f279, f1853, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1821, f1852, f251; +sub.f32 f283, f1852, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1820, f1851, f256; +sub.f32 f287, f1851, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1819, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1818, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1817, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1816, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1813, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1811, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1810, f1813, f1811; +sub.f32 f315, f1813, f1811; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1809, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1807, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1804, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1803, f1807, f1804; +sub.f32 f331, f1807, f1804; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1802, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1802, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, 
f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1801, f334, 0fBF3504F3; +sub.f32 f342, f1801, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1800, f1810, f1803; +sub.f32 f348, f1810, f1803; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1799, f1809, f339; +sub.f32 f352, f1809, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1798, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1797, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1795, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1793, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1792, f1795, f1793; +sub.f32 f372, f1795, f1793; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1791, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1788, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1787, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1786, f1788, f1787; +sub.f32 f388, f1788, f1787; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1785, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1785, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1783, f391, 0fBF3504F3; +mul.f32 f1784, f392, 0f3F3504F3; +sub.f32 f399, f1783, f1784; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1782, f1792, f1786; +sub.f32 f405, f1792, f1786; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1781, f1791, f396; +sub.f32 f409, f1791, 
f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1780, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1779, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1781, 0f3EC3EF15; +mul.f32 f1778, f406, 0f3F6C835E; +sub.f32 f420, f1778, f419; +mul.f32 f421, f1781, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1780, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1779, 0f3F6C835E; +mul.f32 f1777, f414, 0f3EC3EF15; +sub.f32 f429, f1777, f428; +mul.f32 f430, f1779, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1776, f408, 0fBEC3EF15; +sub.f32 f434, f1776, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1775, f412, 0fBF3504F3; +sub.f32 f439, f1775, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1774, f416, 0fBF6C835E; +sub.f32 f444, f1774, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1773, f1800, f1782; +sub.f32 f450, f1800, f1782; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1772, f1799, f422; +sub.f32 f454, f1799, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1771, f1798, f426; +sub.f32 f458, f1798, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1770, f1797, f431; +sub.f32 f462, f1797, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1769, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1768, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1767, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; 
+sub.f32 f477, f359, f444; +add.f32 f1766, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1772, 0f3E47C5C2; +mul.f32 f1765, f451, 0f3F7B14BE; +sub.f32 f481, f1765, f480; +mul.f32 f482, f1772, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1771, 0f3EC3EF15; +mul.f32 f1764, f455, 0f3F6C835E; +sub.f32 f486, f1764, f485; +mul.f32 f487, f1771, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1770, 0f3F0E39DA; +mul.f32 f1763, f459, 0f3F54DB31; +sub.f32 f491, f1763, f490; +mul.f32 f492, f1770, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1769, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1768, 0f3F54DB31; +mul.f32 f1762, f467, 0f3F0E39DA; +sub.f32 f500, f1762, f499; +mul.f32 f501, f1768, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1767, 0f3F6C835E; +mul.f32 f1761, f471, 0f3EC3EF15; +sub.f32 f505, f1761, f504; +mul.f32 f506, f1767, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1766, 0f3F7B14BE; +mul.f32 f1760, f475, 0f3E47C5C2; +sub.f32 f510, f1760, f509; +mul.f32 f511, f1766, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1759, f453, 0fBE47C5C2; +sub.f32 f515, f1759, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1757, f457, 0fBEC3EF15; +mul.f32 f1758, f458, 0f3F6C835E; +sub.f32 f520, f1757, f1758; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1755, f461, 0fBF0E39DA; +mul.f32 f1756, f462, 0f3F54DB31; +sub.f32 f525, f1755, f1756; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1753, f465, 0fBF3504F3; +mul.f32 f1754, f466, 0f3F3504F3; +sub.f32 f530, f1753, f1754; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1751, f469, 0fBF54DB31; +mul.f32 f1752, f470, 0f3F0E39DA; +sub.f32 
f535, f1751, f1752; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1750, f473, 0fBF6C835E; +sub.f32 f540, f1750, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1749, f477, 0fBF7B14BE; +sub.f32 f545, f1749, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1748, f1822, f483; +sub.f32 f553, f1822, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1747, f1821, f488; +sub.f32 f557, f1821, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1746, f1820, f493; +sub.f32 f561, f1820, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1745, f1819, f497; +sub.f32 f565, f1819, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f1744, f1818, f502; +sub.f32 f569, f1818, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f1743, f1817, f507; +sub.f32 f573, f1817, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f1742, f1816, f512; +sub.f32 f577, f1816, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f1741, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f1740, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f1739, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f1738, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f1737, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1736, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1735, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, 
f545; +sub.f32 f608, f302, f545; +add.f32 f1734, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +and.b32 r14, r15, 31; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f1748, f611; +mul.f32 f616, f610, f1748; +mul.f32 f618, f611, f611; +mul.f32 f1733, f610, f610; +sub.f32 f619, f1733, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f1747, f621; +mul.f32 f624, f619, f1747; +mul.f32 f626, f611, f621; +mul.f32 f1732, f610, f619; +sub.f32 f627, f1732, f626; +mul.f32 f1731, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f1746, f629; +mul.f32 f632, f627, f1746; +mul.f32 f1729, f610, f627; +mul.f32 f1730, f611, f629; +sub.f32 f635, f1729, f1730; +mul.f32 f1728, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f1745, f637; +mul.f32 f640, f635, f1745; +mul.f32 f642, f611, f637; +mul.f32 f1727, f610, f635; +sub.f32 f643, f1727, f642; +mul.f32 f1726, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f1744, f645; +mul.f32 f648, f643, f1744; +mul.f32 f1724, f610, f643; +mul.f32 f1725, f611, f645; +sub.f32 f651, f1724, f1725; +mul.f32 f1723, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f1743, f653; +mul.f32 f656, f651, f1743; +mul.f32 f658, f611, f653; +mul.f32 f1722, f610, f651; +sub.f32 f659, f1722, f658; +mul.f32 f1721, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f1742, f661; +mul.f32 f664, f659, f1742; +mul.f32 f666, f611, f661; +mul.f32 f1720, f610, f659; +sub.f32 f667, f1720, f666; +mul.f32 f1719, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f1741, f669; +mul.f32 f672, f667, f1741; 
+mul.f32 f1717, f610, f667; +mul.f32 f1718, f611, f669; +sub.f32 f675, f1717, f1718; +mul.f32 f1716, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f1740, f677; +mul.f32 f680, f675, f1740; +mul.f32 f682, f611, f677; +mul.f32 f1715, f610, f675; +sub.f32 f683, f1715, f682; +mul.f32 f1714, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f1739, f685; +mul.f32 f688, f683, f1739; +mul.f32 f690, f611, f685; +mul.f32 f1713, f610, f683; +sub.f32 f691, f1713, f690; +mul.f32 f1712, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f1738, f693; +mul.f32 f696, f691, f1738; +mul.f32 f1710, f610, f691; +mul.f32 f1711, f611, f693; +sub.f32 f699, f1710, f1711; +mul.f32 f1709, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f1737, f701; +mul.f32 f704, f699, f1737; +mul.f32 f706, f611, f701; +mul.f32 f1708, f610, f699; +sub.f32 f707, f1708, f706; +mul.f32 f1707, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f1736, f709; +mul.f32 f712, f707, f1736; +mul.f32 f1705, f610, f707; +mul.f32 f1706, f611, f709; +sub.f32 f715, f1705, f1706; +mul.f32 f1704, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f1735, f717; +mul.f32 f720, f715, f1735; +mul.f32 f722, f611, f717; +mul.f32 f1703, f610, f715; +sub.f32 f723, f1703, f722; +mul.f32 f1702, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f1734, f725; +mul.f32 f728, f723, f1734; +mul.f32 f730, f611, f725; +mul.f32 f1701, f610, f723; +sub.f32 f731, f1701, f730; +mul.f32 f1700, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1699, f1823, f1773; +mul.f32 f734, f1699, f733; +mul.f32 f736, f731, f1699; +mul.f32 f1697, f610, f731; +mul.f32 f1698, f611, f733; +sub.f32 f739, f1697, f1698; +sub.f32 f1696, f272, f447; +mul.f32 
f1695, f1696, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1694, f610, f739; +sub.f32 f747, f1694, f746; +mul.f32 f1693, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1692, f610, f747; +sub.f32 f755, f1692, f754; +mul.f32 f1691, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f1689, f610, f755; +mul.f32 f1690, f611, f757; +sub.f32 f763, f1689, f1690; +mul.f32 f1688, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1687, f610, f763; +sub.f32 f771, f1687, f770; +mul.f32 f1686, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f1684, f610, f771; +mul.f32 f1685, f611, f773; +sub.f32 f779, f1684, f1685; +mul.f32 f1683, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1682, f610, f779; +sub.f32 f787, f1682, f786; +mul.f32 f1681, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1680, f610, f787; +sub.f32 f795, f1680, f794; +mul.f32 f1679, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f1677, f610, f795; +mul.f32 f1678, f611, f797; +sub.f32 f803, f1677, f1678; +mul.f32 f1676, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; 
+mul.f32 f1675, f610, f803; +sub.f32 f811, f1675, f810; +mul.f32 f1674, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1673, f610, f811; +sub.f32 f819, f1673, f818; +mul.f32 f1672, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f1670, f610, f819; +mul.f32 f1671, f611, f821; +sub.f32 f827, f1670, f1671; +mul.f32 f1669, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1668, f610, f827; +sub.f32 f835, f1668, f834; +mul.f32 f1667, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f1665, f610, f835; +mul.f32 f1666, f611, f837; +sub.f32 f843, f1665, f1666; +mul.f32 f1664, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1663, f610, f843; +sub.f32 f851, f1663, f850; +mul.f32 f1662, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f1661, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 7936; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 31; +sub.f32 f1871, f1823, f1773; +mul.f32 f1870, f731, f1871; +add.f32 f857, f1823, f1773; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 31; +sub.f32 f1872, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 31; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 31; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f1661; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f1731; +fma.rn.f32 f863, f627, f558, 
f630; +sub.f32 f864, f632, f1728; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f1726; +sub.f32 f867, f648, f1723; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f1721; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f1719; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f1716; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f1714; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f1712; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f1709; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f1707; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f1704; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f1702; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f1700; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f1872, f734; +sub.f32 f890, f1870, f1695; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f1693; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f1691; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f1688; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f1686; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f1683; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f1681; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f1679; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f1676; +fma.rn.f32 f907, 
f803, f584, f806; +sub.f32 f908, f808, f1674; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f1672; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f1669; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f1667; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f1664; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f1662; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +mad.lo.s32 r13, r22, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+256]; +ld.shared.v2.f32 {f929, f930}, [r13+512]; +ld.shared.v2.f32 {f933, f934}, [r13+768]; +ld.shared.v2.f32 {f937, f938}, [r13+1024]; +ld.shared.v2.f32 {f941, f942}, [r13+1280]; +ld.shared.v2.f32 {f945, f946}, [r13+1536]; +ld.shared.v2.f32 {f949, f950}, [r13+1792]; +ld.shared.v2.f32 {f953, f954}, [r13+2048]; +ld.shared.v2.f32 {f957, f958}, [r13+2304]; +ld.shared.v2.f32 {f961, f962}, [r13+2560]; +ld.shared.v2.f32 {f965, f966}, [r13+2816]; +ld.shared.v2.f32 {f969, f970}, [r13+3072]; +ld.shared.v2.f32 {f973, f974}, [r13+3328]; +ld.shared.v2.f32 {f977, f978}, [r13+3584]; +ld.shared.v2.f32 {f981, f982}, [r13+3840]; +ld.shared.v2.f32 {f985, f986}, [r13+4096]; +ld.shared.v2.f32 {f989, f990}, [r13+4352]; +ld.shared.v2.f32 {f993, f994}, [r13+4608]; +ld.shared.v2.f32 {f997, f998}, [r13+4864]; +ld.shared.v2.f32 {f1001, f1002}, [r13+5120]; +ld.shared.v2.f32 {f1005, f1006}, [r13+5376]; +ld.shared.v2.f32 {f1009, f1010}, [r13+5632]; +ld.shared.v2.f32 {f1013, f1014}, [r13+5888]; +ld.shared.v2.f32 {f1017, f1018}, [r13+6144]; +ld.shared.v2.f32 {f1021, f1022}, [r13+6400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+6656]; +ld.shared.v2.f32 {f1029, f1030}, [r13+6912]; +ld.shared.v2.f32 {f1033, f1034}, [r13+7168]; 
+ld.shared.v2.f32 {f1037, f1038}, [r13+7424]; +ld.shared.v2.f32 {f1041, f1042}, [r13+7680]; +ld.shared.v2.f32 {f1045, f1046}, [r13+7936]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1660, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1659, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1658, f1660, f1659; +sub.f32 f1060, f1660, f1659; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f1657, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f1656, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1655, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1654, f1656, f1655; +sub.f32 f1076, f1656, f1655; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f1653, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f1653, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f1651, f1079, 0fBF3504F3; +mul.f32 f1652, f1080, 0f3F3504F3; +sub.f32 f1087, f1651, f1652; +mul.f32 f1088, f1080, 0fBF3504F3; +fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f1650, f1658, f1654; +sub.f32 f1093, f1658, f1654; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f1649, f1657, f1084; +sub.f32 f1097, f1657, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f1648, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, f1087; +add.f32 f1647, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f1646, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, 
f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f1645, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f1644, f1646, f1645; +sub.f32 f1117, f1646, f1645; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f1643, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f1642, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f1641, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f1640, f1642, f1641; +sub.f32 f1133, f1642, f1641; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f1639, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f1639, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f1637, f1136, 0fBF3504F3; +mul.f32 f1638, f1137, 0f3F3504F3; +sub.f32 f1144, f1637, f1638; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f1636, f1644, f1640; +sub.f32 f1150, f1644, f1640; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f1635, f1643, f1141; +sub.f32 f1154, f1643, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f1634, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f1633, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f1631, f1151, 0f3F6C835E; +mul.f32 f1632, f1635, 0f3EC3EF15; +sub.f32 f1165, f1631, f1632; +mul.f32 f1166, f1635, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 0f3F3504F3; +mul.f32 f1169, f1634, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f1629, f1159, 0f3EC3EF15; +mul.f32 f1630, f1633, 0f3F6C835E; +sub.f32 f1174, 
f1629, f1630; +mul.f32 f1175, f1633, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f1627, f1153, 0fBEC3EF15; +mul.f32 f1628, f1154, 0f3F6C835E; +sub.f32 f1179, f1627, f1628; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f1625, f1157, 0fBF3504F3; +mul.f32 f1626, f1158, 0f3F3504F3; +sub.f32 f1184, f1625, f1626; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f1623, f1161, 0fBF6C835E; +mul.f32 f1624, f1162, 0f3EC3EF15; +sub.f32 f1189, f1623, f1624; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f1622, f1650, f1636; +sub.f32 f1195, f1650, f1636; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f1621, f1649, f1167; +sub.f32 f1199, f1649, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f1620, f1648, f1171; +sub.f32 f1203, f1648, f1171; +add.f32 f1204, f1102, f1174; +sub.f32 f1206, f1102, f1174; +add.f32 f1619, f1647, f1176; +sub.f32 f1207, f1647, f1176; +sub.f32 f1208, f1092, f1150; +add.f32 f1210, f1092, f1150; +add.f32 f1618, f1093, f1149; +sub.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1179; +sub.f32 f1214, f1096, f1179; +add.f32 f1617, f1097, f1181; +sub.f32 f1215, f1097, f1181; +add.f32 f1216, f1100, f1184; +sub.f32 f1218, f1100, f1184; +add.f32 f1616, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f1615, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f1614, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f1613, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f1612, f1614, f1613; +sub.f32 f1235, f1614, f1613; +sub.f32 f1236, f1226, f1231; +add.f32 f1238, f1226, 
f1231; +add.f32 f1611, f1227, f1230; +sub.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f1610, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f1609, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f1608, f1610, f1609; +sub.f32 f1251, f1610, f1609; +sub.f32 f1252, f1242, f1247; +add.f32 f1254, f1242, f1247; +add.f32 f1607, f1243, f1246; +sub.f32 f1255, f1243, f1246; +mul.f32 f1256, f1252, 0f3F3504F3; +mul.f32 f1257, f1607, 0f3F3504F3; +sub.f32 f1258, f1256, f1257; +add.f32 f1259, f1256, f1257; +mul.f32 f1605, f1254, 0fBF3504F3; +mul.f32 f1606, f1255, 0f3F3504F3; +sub.f32 f1262, f1605, f1606; +mul.f32 f1263, f1255, 0fBF3504F3; +fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f1604, f1612, f1608; +sub.f32 f1268, f1612, f1608; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f1603, f1611, f1259; +sub.f32 f1272, f1611, f1259; +sub.f32 f1273, f1234, f1251; +add.f32 f1275, f1234, f1251; +add.f32 f1602, f1235, f1250; +sub.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1262; +sub.f32 f1279, f1238, f1262; +add.f32 f1601, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f1600, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f1599, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f1598, f1600, f1599; +sub.f32 f1292, f1600, f1599; +sub.f32 f1293, f1283, f1288; +add.f32 f1295, f1283, f1288; +add.f32 f1597, f1284, f1287; +sub.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f1596, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f1595, f982, f1046; 
+sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f1594, f1596, f1595; +sub.f32 f1308, f1596, f1595; +sub.f32 f1309, f1299, f1304; +add.f32 f1311, f1299, f1304; +add.f32 f1593, f1300, f1303; +sub.f32 f1312, f1300, f1303; +mul.f32 f1313, f1309, 0f3F3504F3; +mul.f32 f1314, f1593, 0f3F3504F3; +sub.f32 f1315, f1313, f1314; +add.f32 f1316, f1313, f1314; +mul.f32 f1591, f1311, 0fBF3504F3; +mul.f32 f1592, f1312, 0f3F3504F3; +sub.f32 f1319, f1591, f1592; +mul.f32 f1320, f1312, 0fBF3504F3; +fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f1590, f1598, f1594; +sub.f32 f1325, f1598, f1594; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f1589, f1597, f1316; +sub.f32 f1329, f1597, f1316; +sub.f32 f1330, f1291, f1308; +add.f32 f1332, f1291, f1308; +add.f32 f1588, f1292, f1307; +sub.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1319; +sub.f32 f1336, f1295, f1319; +add.f32 f1587, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f1585, f1326, 0f3F6C835E; +mul.f32 f1586, f1589, 0f3EC3EF15; +sub.f32 f1340, f1585, f1586; +mul.f32 f1341, f1589, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341; +mul.f32 f1343, f1330, 0f3F3504F3; +mul.f32 f1344, f1588, 0f3F3504F3; +sub.f32 f1345, f1343, f1344; +add.f32 f1346, f1343, f1344; +mul.f32 f1348, f1587, 0f3F6C835E; +mul.f32 f1584, f1334, 0f3EC3EF15; +sub.f32 f1349, f1584, f1348; +mul.f32 f1350, f1587, 0f3EC3EF15; +fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350; +mul.f32 f1353, f1329, 0f3F6C835E; +mul.f32 f1583, f1328, 0fBEC3EF15; +sub.f32 f1354, f1583, f1353; +mul.f32 f1355, f1329, 0fBEC3EF15; +fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355; +mul.f32 f1581, f1332, 0fBF3504F3; +mul.f32 f1582, f1333, 0f3F3504F3; +sub.f32 f1359, f1581, f1582; +mul.f32 f1360, f1333, 0fBF3504F3; +fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360; +mul.f32 f1579, f1336, 0fBF6C835E; +mul.f32 f1580, f1337, 0f3EC3EF15; +sub.f32 f1364, 
f1579, f1580; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f1578, f1604, f1590; +sub.f32 f1370, f1604, f1590; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f1577, f1603, f1342; +sub.f32 f1374, f1603, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f1576, f1602, f1346; +sub.f32 f1378, f1602, f1346; +add.f32 f1379, f1277, f1349; +sub.f32 f1381, f1277, f1349; +add.f32 f1575, f1601, f1351; +sub.f32 f1382, f1601, f1351; +sub.f32 f1383, f1267, f1325; +add.f32 f1385, f1267, f1325; +add.f32 f1574, f1268, f1324; +sub.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1354; +sub.f32 f1389, f1271, f1354; +add.f32 f1573, f1272, f1356; +sub.f32 f1390, f1272, f1356; +add.f32 f1391, f1275, f1359; +sub.f32 f1393, f1275, f1359; +add.f32 f1572, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f1571, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f1577, 0f3E47C5C2; +mul.f32 f1570, f1371, 0f3F7B14BE; +sub.f32 f1401, f1570, f1400; +mul.f32 f1402, f1577, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402; +mul.f32 f1405, f1576, 0f3EC3EF15; +mul.f32 f1569, f1375, 0f3F6C835E; +sub.f32 f1406, f1569, f1405; +mul.f32 f1407, f1576, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407; +mul.f32 f1567, f1379, 0f3F54DB31; +mul.f32 f1568, f1575, 0f3F0E39DA; +sub.f32 f1411, f1567, f1568; +mul.f32 f1412, f1575, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412; +mul.f32 f1414, f1383, 0f3F3504F3; +mul.f32 f1415, f1574, 0f3F3504F3; +sub.f32 f1416, f1414, f1415; +add.f32 f1417, f1414, f1415; +mul.f32 f1419, f1573, 0f3F54DB31; +mul.f32 f1566, f1387, 0f3F0E39DA; +sub.f32 f1420, f1566, f1419; +mul.f32 f1421, f1573, 0f3F0E39DA; +fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421; +mul.f32 f1424, f1572, 0f3F6C835E; +mul.f32 f1565, f1391, 0f3EC3EF15; +sub.f32 f1425, f1565, 
f1424; +mul.f32 f1426, f1572, 0f3EC3EF15; +fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426; +mul.f32 f1563, f1395, 0f3E47C5C2; +mul.f32 f1564, f1571, 0f3F7B14BE; +sub.f32 f1430, f1563, f1564; +mul.f32 f1431, f1571, 0f3E47C5C2; +fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431; +mul.f32 f1561, f1373, 0fBE47C5C2; +mul.f32 f1562, f1374, 0f3F7B14BE; +sub.f32 f1435, f1561, f1562; +mul.f32 f1436, f1374, 0fBE47C5C2; +fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436; +mul.f32 f1559, f1377, 0fBEC3EF15; +mul.f32 f1560, f1378, 0f3F6C835E; +sub.f32 f1440, f1559, f1560; +mul.f32 f1441, f1378, 0fBEC3EF15; +fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441; +mul.f32 f1557, f1381, 0fBF0E39DA; +mul.f32 f1558, f1382, 0f3F54DB31; +sub.f32 f1445, f1557, f1558; +mul.f32 f1446, f1382, 0fBF0E39DA; +fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446; +mul.f32 f1449, f1386, 0f3F3504F3; +mul.f32 f1556, f1385, 0fBF3504F3; +sub.f32 f1450, f1556, f1449; +mul.f32 f1451, f1386, 0fBF3504F3; +fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451; +mul.f32 f1454, f1390, 0f3F0E39DA; +mul.f32 f1555, f1389, 0fBF54DB31; +sub.f32 f1455, f1555, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456; +mul.f32 f1459, f1394, 0f3EC3EF15; +mul.f32 f1554, f1393, 0fBF6C835E; +sub.f32 f1460, f1554, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0f3EC3EF15, f1461; +mul.f32 f1464, f1398, 0f3E47C5C2; +mul.f32 f1553, f1397, 0fBF7B14BE; +sub.f32 f1465, f1553, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466; +add.f32 %0, f1192, f1367; +add.f32 %1, f1622, f1578; +add.f32 %2, f1196, f1401; +add.f32 %3, f1621, f1403; +add.f32 %4, f1200, f1406; +add.f32 %5, f1620, f1408; +add.f32 %6, f1204, f1411; +add.f32 %7, f1619, f1413; +add.f32 %9, f1618, f1417; +add.f32 %8, f1208, f1416; +add.f32 %11, f1617, f1422; +add.f32 %10, f1212, f1420; +add.f32 %12, f1216, f1425; +add.f32 %13, f1616, f1427; +add.f32 %14, f1220, f1430; +add.f32 %15, f1615, f1432; +sub.f32 %16, f1194, f1370; 
+add.f32 %17, f1195, f1369; +add.f32 %18, f1198, f1435; +add.f32 %19, f1199, f1437; +add.f32 %21, f1203, f1442; +add.f32 %20, f1202, f1440; +add.f32 %23, f1207, f1447; +add.f32 %22, f1206, f1445; +add.f32 %25, f1211, f1452; +add.f32 %24, f1210, f1450; +add.f32 %26, f1214, f1455; +add.f32 %27, f1215, f1457; +add.f32 %28, f1218, f1460; +add.f32 %29, f1219, f1462; +add.f32 %30, f1222, f1465; +add.f32 %31, f1223, f1467; +sub.f32 %33, f1622, f1578; +sub.f32 %32, f1192, f1367; +sub.f32 %35, f1621, f1403; +sub.f32 %34, f1196, f1401; +sub.f32 %37, f1620, f1408; +sub.f32 %36, f1200, f1406; +sub.f32 %39, f1619, f1413; +sub.f32 %38, f1204, f1411; +sub.f32 %41, f1618, f1417; +sub.f32 %40, f1208, f1416; +sub.f32 %43, f1617, f1422; +sub.f32 %42, f1212, f1420; +sub.f32 %45, f1616, f1427; +sub.f32 %44, f1216, f1425; +sub.f32 %47, f1615, f1432; +sub.f32 %46, f1220, f1430; +sub.f32 %49, f1195, f1369; +add.f32 %48, f1194, f1370; +sub.f32 %51, f1199, f1437; +sub.f32 %50, f1198, f1435; +sub.f32 %53, f1203, f1442; +sub.f32 %52, f1202, f1440; +sub.f32 %55, f1207, f1447; +sub.f32 %54, f1206, f1445; +sub.f32 %57, f1211, f1452; +sub.f32 %56, f1210, f1450; +sub.f32 %59, f1215, f1457; +sub.f32 %58, f1214, f1455; +sub.f32 %61, f1219, f1462; +sub.f32 %60, f1218, f1460; +sub.f32 %63, f1223, f1467; +sub.f32 %62, f1222, f1465; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), 
"=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<283, float, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<841>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 
f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 
f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, 
f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; 
+mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4032; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+256]; +ld.shared.f32 f391, [r13+512]; +ld.shared.f32 f392, [r13+768]; +ld.shared.f32 f393, [r13+1024]; +ld.shared.f32 f394, [r13+1280]; 
+ld.shared.f32 f395, [r13+1536]; +ld.shared.f32 f396, [r13+1792]; +ld.shared.f32 f397, [r13+2048]; +ld.shared.f32 f398, [r13+2304]; +ld.shared.f32 f399, [r13+2560]; +ld.shared.f32 f400, [r13+2816]; +ld.shared.f32 f401, [r13+3072]; +ld.shared.f32 f402, [r13+3328]; +ld.shared.f32 f403, [r13+3584]; +ld.shared.f32 f404, [r13+3840]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+256]; +ld.shared.f32 f407, [r13+512]; +ld.shared.f32 f408, [r13+768]; +ld.shared.f32 f409, [r13+1024]; +ld.shared.f32 f410, [r13+1280]; +ld.shared.f32 f411, [r13+1536]; +ld.shared.f32 f412, [r13+1792]; +ld.shared.f32 f413, [r13+2048]; +ld.shared.f32 f414, [r13+2304]; +ld.shared.f32 f415, [r13+2560]; +ld.shared.f32 f416, [r13+2816]; +ld.shared.f32 f417, [r13+3072]; +ld.shared.f32 f418, [r13+3328]; +ld.shared.f32 f419, [r13+3584]; +ld.shared.f32 f420, [r13+3840]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; 
+mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, 
f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; +sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; +sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; 
+add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 48; +bfe.u32 r15, r5, 4, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; +mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 
f660, f593, f659; +fma.rn.f32 f661, f657, f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; +fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, 
f717, f728; +mul.f32 f730, f591, f729; +fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3072; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+256]; +ld.shared.f32 f747, [r21+512]; +ld.shared.f32 f748, [r21+768]; +ld.shared.f32 f749, [r21+1024]; +ld.shared.f32 f750, [r21+1280]; +ld.shared.f32 f751, [r21+1536]; +ld.shared.f32 f752, [r21+1792]; +ld.shared.f32 f753, [r21+2048]; +ld.shared.f32 f754, [r21+2304]; +ld.shared.f32 f755, [r21+2560]; +ld.shared.f32 f756, [r21+2816]; +ld.shared.f32 f757, [r21+3072]; +ld.shared.f32 f758, [r21+3328]; +ld.shared.f32 f759, [r21+3584]; +ld.shared.f32 f760, [r21+3840]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 
[r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+256]; +ld.shared.f32 f763, [r21+512]; +ld.shared.f32 f764, [r21+768]; +ld.shared.f32 f765, [r21+1024]; +ld.shared.f32 f766, [r21+1280]; +ld.shared.f32 f767, [r21+1536]; +ld.shared.f32 f768, [r21+1792]; +ld.shared.f32 f769, [r21+2048]; +ld.shared.f32 f770, [r21+2304]; +ld.shared.f32 f771, [r21+2560]; +ld.shared.f32 f772, [r21+2816]; +ld.shared.f32 f773, [r21+3072]; +ld.shared.f32 f774, [r21+3328]; +ld.shared.f32 f775, [r21+3584]; +ld.shared.f32 f776, [r21+3840]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f746, f754; +add.f32 f786, f762, f770; +sub.f32 f787, f746, f754; +sub.f32 f788, f762, f770; +add.f32 f789, f750, f758; +add.f32 f790, f766, f774; +sub.f32 f791, f750, f758; +sub.f32 f792, f766, f774; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f748, f756; +add.f32 f802, f764, f772; +sub.f32 f803, f748, f756; +sub.f32 f804, f764, f772; +add.f32 f805, f752, f760; +add.f32 f806, f768, f776; +sub.f32 f807, f752, f760; +sub.f32 f808, f768, f776; +add.f32 %0, f777, f781; +add.f32 %1, f778, f782; +add.f32 %2, f785, f789; +add.f32 %3, f786, f790; +add.f32 %4, f793, f797; +add.f32 %5, f794, f798; +add.f32 %6, f801, f805; +add.f32 %7, f802, f806; +add.f32 %9, f780, f783; +sub.f32 %8, f779, f784; +add.f32 %11, f788, f791; +sub.f32 %10, f787, f792; +add.f32 %13, f796, f799; +sub.f32 %12, f795, f800; +add.f32 %15, f804, f807; +sub.f32 %14, f803, f808; +sub.f32 %16, 
f777, f781; +sub.f32 %17, f778, f782; +sub.f32 %18, f785, f789; +sub.f32 %19, f786, f790; +sub.f32 %20, f793, f797; +sub.f32 %21, f794, f798; +sub.f32 %22, f801, f805; +sub.f32 %23, f802, f806; +sub.f32 %25, f780, f783; +add.f32 %24, f779, f784; +sub.f32 %27, f788, f791; +add.f32 %26, f787, f792; +sub.f32 %29, f796, f799; +add.f32 %28, f795, f800; +sub.f32 %31, f804, f807; +add.f32 %30, f803, f808; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + +/* cufftdx_private_function<286, float, 1>: explicit specialization whose whole body is one asm volatile PTX block (it declares f<475>, r<28> and rd<12> scratch registers). Operand map: %0-%15 are the outputs rmem[0..7].x/.y and are written only by the final add/sub group; %16 is smem, used as a 32-bit shared-memory base address to which (%tid.y << 12), i.e. 4096 bytes per tid.y, is added; %17, %18 and %19 are the 64-bit addresses of lut_sp_8_1024, lut_sp_8_128 and lut_sp_8_16, each read once with ld.global.v2.f32 at an offset derived from %tid.x; %20-%39 are the incoming rmem[0..7].x/.y values. The input list names rmem[1].y, rmem[2].y, rmem[4].y and rmem[5].y twice, so %23, %26, %31 and %34 are never referenced by the PTX text; the list must not be deduplicated without renumbering every %N that follows. Flow: three rounds, each made of add/sub butterflies using the constants 0f3F3504F3 and 0fBF3504F3 (0x3F3504F3 is about 0.7071 as an IEEE-754 float), multiplications by the loaded LUT pair and by running products of it, and a shared-memory exchange done in two passes over the same addresses; a last add/sub group then writes %0-%15. Each exchange pass is st.shared, barrier.sync 0, ld.shared, with a further barrier.sync 0 before the addresses are reused, 12 barriers in total. NOTE(review): the barriers presumably require every thread taking part in barrier 0 to call this function together; confirm with the caller. NOTE(review): judging by the LUT names this looks like generated code for radix-8 stages of a 1024-point FFT with .x/.y as real/imaginary parts; that is not verifiable from this chunk, confirm against the generator. */ +template<> __forceinline__ __device__ void cufftdx_private_function<286, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<475>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32
r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83;
+sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4064; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+512]; +ld.shared.f32 f161, [r13+1024]; +ld.shared.f32 f162, [r13+1536]; +ld.shared.f32 f163, [r13+2048]; +ld.shared.f32 f164, [r13+2560]; +ld.shared.f32 f165, [r13+3072]; +ld.shared.f32 f166, [r13+3584]; +barrier.sync 0;
+st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+512]; +ld.shared.f32 f169, [r13+1024]; +ld.shared.f32 f170, [r13+1536]; +ld.shared.f32 f171, [r13+2048]; +ld.shared.f32 f172, [r13+2560]; +ld.shared.f32 f173, [r13+3072]; +ld.shared.f32 f174, [r13+3584]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 120; +cvt.u64.u32
rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16,
r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 3840; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+512]; +ld.shared.f32 f303, [r20+1024]; +ld.shared.f32 f304, [r20+1536]; +ld.shared.f32 f305, [r20+2048]; +ld.shared.f32 f306, [r20+2560]; +ld.shared.f32 f307, [r20+3072]; +ld.shared.f32 f308, [r20+3584]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+512]; +ld.shared.f32 f311, [r20+1024]; +ld.shared.f32 f312, [r20+1536]; +ld.shared.f32 f313, [r20+2048]; +ld.shared.f32 f314, [r20+2560]; +ld.shared.f32 f315, [r20+3072]; +ld.shared.f32 f316, [r20+3584]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +sub.f32 f329, f319, f324; +add.f32 f330, f320, f323; +add.f32 f331, f319, f324; +sub.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +sub.f32 f345, f335, f340; +add.f32 f346, f336,
f339; +add.f32 f347, f335, f340; +sub.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0f3F3504F3; +sub.f32 f351, f349, f350; +add.f32 f352, f349, f350; +mul.f32 f353, f347, 0fBF3504F3; +mul.f32 f354, f348, 0f3F3504F3; +sub.f32 f355, f353, f354; +mul.f32 f356, f348, 0fBF3504F3; +fma.rn.f32 f357, f347, 0f3F3504F3, f356; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f352; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f352; +sub.f32 f366, f327, f344; +add.f32 f367, f328, f343; +add.f32 f368, f327, f344; +sub.f32 f369, f328, f343; +add.f32 f370, f331, f355; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f355; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 64; +bfe.u32 r22, r5, 6, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f363, f375; +fma.rn.f32 f379, f374, f362, f378; +mul.f32 f380, f362, f375; +mul.f32 f381, f374, f363; +sub.f32 f382, f381, f380; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f367, f387; +fma.rn.f32 f389, f385, f366, f388; +mul.f32 f390, f366, f387; +mul.f32 f391, f385, f367; +sub.f32 f392, f391, f390; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f371, f397; +fma.rn.f32 f399, f395, f370, f398; +mul.f32 f400, f370, f397; +mul.f32 f401, f395, f371; +sub.f32 f402, f401, f400; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f361, f407; +fma.rn.f32 f409, f405, f360, f408; +mul.f32 f410, f360, f407; +mul.f32 f411, f405, f361; +sub.f32 f412, f411, f410; +mul.f32 f413, f374, f405; +mul.f32 f414, f375,
f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f365, f417; +fma.rn.f32 f419, f415, f364, f418; +mul.f32 f420, f364, f417; +mul.f32 f421, f415, f365; +sub.f32 f422, f421, f420; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f369, f427; +fma.rn.f32 f429, f425, f368, f428; +mul.f32 f430, f368, f427; +mul.f32 f431, f425, f369; +sub.f32 f432, f431, f430; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f373, f437; +fma.rn.f32 f439, f435, f372, f438; +mul.f32 f440, f372, f437; +mul.f32 f441, f435, f373; +sub.f32 f442, f441, f440; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 2048; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f379; +st.shared.f32 [r26+512], f389; +st.shared.f32 [r26+768], f399; +st.shared.f32 [r26+1024], f409; +st.shared.f32 [r26+1280], f419; +st.shared.f32 [r26+1536], f429; +st.shared.f32 [r26+1792], f439; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+512]; +ld.shared.f32 f445, [r27+1024]; +ld.shared.f32 f446, [r27+1536]; +ld.shared.f32 f447, [r27+2048]; +ld.shared.f32 f448, [r27+2560]; +ld.shared.f32 f449, [r27+3072]; +ld.shared.f32 f450, [r27+3584]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+512]; +ld.shared.f32 f453, [r27+1024]; +ld.shared.f32 f454, [r27+1536]; +ld.shared.f32 f455, [r27+2048]; +ld.shared.f32 f456, [r27+2560]; +ld.shared.f32 f457,
[r27+3072]; +ld.shared.f32 f458, [r27+3584]; +add.f32 %0, f443, f447; +add.f32 %1, f451, f455; +add.f32 %2, f444, f448; +add.f32 %3, f452, f456; +add.f32 %4, f445, f449; +add.f32 %5, f453, f457; +add.f32 %6, f446, f450; +add.f32 %7, f454, f458; +sub.f32 %8, f443, f447; +sub.f32 %9, f451, f455; +sub.f32 %10, f444, f448; +sub.f32 %11, f452, f456; +sub.f32 %12, f445, f449; +sub.f32 %13, f453, f457; +sub.f32 %14, f446, f450; +sub.f32 %15, f454, f458; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<287, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1778>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1776, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1774, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1773, f1776, f1774; +sub.f32 f140, f1776, f1774; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1772, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1769, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32
f151, %90, %122; +add.f32 f1767, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1766, f1769, f1767; +sub.f32 f156, f1769, f1767; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1765, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1765, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1763, f159, 0fBF3504F3; +mul.f32 f1764, f160, 0f3F3504F3; +sub.f32 f167, f1763, f1764; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1762, f1773, f1766; +sub.f32 f173, f1773, f1766; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1761, f1772, f164; +sub.f32 f177, f1772, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1760, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1759, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1757, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1754, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1753, f1757, f1754; +sub.f32 f197, f1757, f1754; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1752, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1750, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1748, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1747, f1750, f1748; +sub.f32 f213, f1750, f1748; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1746, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1746, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 
f221, f218, f219; +mul.f32 f1744, f216, 0fBF3504F3; +mul.f32 f1745, f217, 0f3F3504F3; +sub.f32 f224, f1744, f1745; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1743, f1753, f1747; +sub.f32 f230, f1753, f1747; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1742, f1752, f221; +sub.f32 f234, f1752, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1741, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1740, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1738, f231, 0f3F6C835E; +mul.f32 f1739, f1742, 0f3EC3EF15; +sub.f32 f245, f1738, f1739; +mul.f32 f246, f1742, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1741, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1740, 0f3F6C835E; +mul.f32 f1737, f239, 0f3EC3EF15; +sub.f32 f254, f1737, f253; +mul.f32 f255, f1740, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1736, f233, 0fBEC3EF15; +sub.f32 f259, f1736, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1734, f237, 0fBF3504F3; +mul.f32 f1735, f238, 0f3F3504F3; +sub.f32 f264, f1734, f1735; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1732, f241, 0fBF6C835E; +mul.f32 f1733, f242, 0f3EC3EF15; +sub.f32 f269, f1732, f1733; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1731, f1762, f1743; +sub.f32 f275, f1762, f1743; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1730, f1761, f247; +sub.f32 f279, f1761, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1729, f1760, f251; +sub.f32 f283, f1760, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, 
f254; +add.f32 f1728, f1759, f256; +sub.f32 f287, f1759, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1727, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1726, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1725, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1724, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1721, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1719, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1718, f1721, f1719; +sub.f32 f315, f1721, f1719; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1717, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1715, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1712, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1711, f1715, f1712; +sub.f32 f331, f1715, f1712; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1710, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1710, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1709, f334, 0fBF3504F3; +sub.f32 f342, f1709, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1708, f1718, f1711; +sub.f32 f348, f1718, f1711; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1707, f1717, f339; +sub.f32 f352, f1717, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1706, f315, f330; +sub.f32 f356, f315, f330; 
+add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1705, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1703, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1701, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1700, f1703, f1701; +sub.f32 f372, f1703, f1701; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1699, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1696, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1695, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1694, f1696, f1695; +sub.f32 f388, f1696, f1695; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1693, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1693, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1691, f391, 0fBF3504F3; +mul.f32 f1692, f392, 0f3F3504F3; +sub.f32 f399, f1691, f1692; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1690, f1700, f1694; +sub.f32 f405, f1700, f1694; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1689, f1699, f396; +sub.f32 f409, f1699, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1688, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1687, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1689, 0f3EC3EF15; +mul.f32 f1686, f406, 0f3F6C835E; +sub.f32 f420, f1686, f419; +mul.f32 f421, f1689, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1688, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, 
f424; +mul.f32 f428, f1687, 0f3F6C835E; +mul.f32 f1685, f414, 0f3EC3EF15; +sub.f32 f429, f1685, f428; +mul.f32 f430, f1687, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1684, f408, 0fBEC3EF15; +sub.f32 f434, f1684, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1683, f412, 0fBF3504F3; +sub.f32 f439, f1683, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1682, f416, 0fBF6C835E; +sub.f32 f444, f1682, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1681, f1708, f1690; +sub.f32 f450, f1708, f1690; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1680, f1707, f422; +sub.f32 f454, f1707, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1679, f1706, f426; +sub.f32 f458, f1706, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1678, f1705, f431; +sub.f32 f462, f1705, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1677, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1676, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1675, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1674, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1680, 0f3E47C5C2; +mul.f32 f1673, f451, 0f3F7B14BE; +sub.f32 f481, f1673, f480; +mul.f32 f482, f1680, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1679, 0f3EC3EF15; +mul.f32 f1672, f455, 0f3F6C835E; +sub.f32 f486, f1672, f485; +mul.f32 f487, f1679, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1678, 0f3F0E39DA; +mul.f32 f1671, f459, 0f3F54DB31; +sub.f32 f491, 
f1671, f490; +mul.f32 f492, f1678, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1677, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1676, 0f3F54DB31; +mul.f32 f1670, f467, 0f3F0E39DA; +sub.f32 f500, f1670, f499; +mul.f32 f501, f1676, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1675, 0f3F6C835E; +mul.f32 f1669, f471, 0f3EC3EF15; +sub.f32 f505, f1669, f504; +mul.f32 f506, f1675, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1674, 0f3F7B14BE; +mul.f32 f1668, f475, 0f3E47C5C2; +sub.f32 f510, f1668, f509; +mul.f32 f511, f1674, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1667, f453, 0fBE47C5C2; +sub.f32 f515, f1667, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1665, f457, 0fBEC3EF15; +mul.f32 f1666, f458, 0f3F6C835E; +sub.f32 f520, f1665, f1666; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1663, f461, 0fBF0E39DA; +mul.f32 f1664, f462, 0f3F54DB31; +sub.f32 f525, f1663, f1664; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1661, f465, 0fBF3504F3; +mul.f32 f1662, f466, 0f3F3504F3; +sub.f32 f530, f1661, f1662; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1659, f469, 0fBF54DB31; +mul.f32 f1660, f470, 0f3F0E39DA; +sub.f32 f535, f1659, f1660; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1658, f473, 0fBF6C835E; +sub.f32 f540, f1658, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1657, f477, 0fBF7B14BE; +sub.f32 f545, f1657, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1656, 
f1731, f1681; +sub.f32 f551, f1731, f1681; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1655, f1730, f483; +sub.f32 f555, f1730, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1654, f1729, f488; +sub.f32 f559, f1729, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1653, f1728, f493; +sub.f32 f563, f1728, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1652, f1727, f497; +sub.f32 f567, f1727, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f1651, f1726, f502; +sub.f32 f571, f1726, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f1650, f1725, f507; +sub.f32 f575, f1725, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f1649, f1724, f512; +sub.f32 f579, f1724, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f1648, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f1647, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f1646, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f1645, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f1644, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1643, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1642, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1641, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f1655, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f1655; +sub.f32 f620, 
f619, f618; +mul.f32 f1639, f612, f612; +mul.f32 f1640, f613, f613; +sub.f32 f623, f1639, f1640; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f1654, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f1654; +sub.f32 f630, f629, f628; +mul.f32 f1637, f612, f623; +mul.f32 f1638, f613, f625; +sub.f32 f633, f1637, f1638; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f1653, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f1653; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f1636, f612, f633; +sub.f32 f643, f1636, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f1652, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f1652; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f1635, f612, f643; +sub.f32 f653, f1635, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f1651, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f1651; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f1634, f612, f653; +sub.f32 f663, f1634, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f1650, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f1650; +sub.f32 f670, f669, f668; +mul.f32 f1632, f612, f663; +mul.f32 f1633, f613, f665; +sub.f32 f673, f1632, f1633; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f1649, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f1649; +sub.f32 f680, f679, f678; +mul.f32 f1630, f612, f673; +mul.f32 f1631, f613, f675; +sub.f32 f683, f1630, f1631; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f1648, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, 
f580, f685; +mul.f32 f689, f683, f1648; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f1629, f612, f683; +sub.f32 f693, f1629, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f1647, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f1647; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f1628, f612, f693; +sub.f32 f703, f1628, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f1646, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f1646; +sub.f32 f710, f709, f708; +mul.f32 f1626, f612, f703; +mul.f32 f1627, f613, f705; +sub.f32 f713, f1626, f1627; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f1645, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f1645; +sub.f32 f720, f719, f718; +mul.f32 f1624, f612, f713; +mul.f32 f1625, f613, f715; +sub.f32 f723, f1624, f1625; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f1644, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f1644; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f1623, f612, f723; +sub.f32 f733, f1623, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f1643, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f1643; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f1622, f612, f733; +sub.f32 f743, f1622, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f1642, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f1642; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f1621, f612, f743; +sub.f32 f753, f1621, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f1641, f755; 
+fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f1641; +sub.f32 f760, f759, f758; +mul.f32 f1619, f612, f753; +mul.f32 f1620, f613, f755; +sub.f32 f763, f1619, f1620; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f1617, f612, f763; +mul.f32 f1618, f613, f765; +sub.f32 f773, f1617, f1618; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f1616, f612, f773; +sub.f32 f783, f1616, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f1615, f612, f783; +sub.f32 f793, f1615, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f1614, f612, f793; +sub.f32 f803, f1614, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f1612, f612, f803; +mul.f32 f1613, f613, f805; +sub.f32 f813, f1612, f1613; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f1610, f612, f813; +mul.f32 f1611, f613, f815; +sub.f32 f823, f1610, f1611; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, 
f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f1609, f612, f823; +sub.f32 f833, f1609, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f1608, f612, f833; +sub.f32 f843, f1608, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f1606, f612, f843; +mul.f32 f1607, f613, f845; +sub.f32 f853, f1606, f1607; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f1604, f612, f853; +mul.f32 f1605, f613, f855; +sub.f32 f863, f1604, f1605; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f1603, f612, f863; +sub.f32 f873, f1603, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f1602, f612, f873; +sub.f32 f883, f1602, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f1601, f612, f883; +sub.f32 f893, f1601, f892; +mul.f32 f894, 
f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f1599, f612, f893; +mul.f32 f1600, f613, f895; +sub.f32 f903, f1599, f1600; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f1597, f612, f903; +mul.f32 f1598, f613, f905; +sub.f32 f913, f1597, f1598; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mov.u32 r17, %tid.x; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +and.b32 r14, r17, 31; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 3968; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+128]; +ld.shared.f32 f923, [r13+256]; +ld.shared.f32 f924, [r13+384]; +ld.shared.f32 f925, [r13+512]; +ld.shared.f32 f926, [r13+640]; +ld.shared.f32 f927, [r13+768]; +ld.shared.f32 f928, [r13+896]; +ld.shared.f32 f929, [r13+1024]; +ld.shared.f32 f930, [r13+1152]; +ld.shared.f32 f931, [r13+1280]; +ld.shared.f32 f932, [r13+1408]; +ld.shared.f32 f933, [r13+1536]; +ld.shared.f32 f934, [r13+1664]; +ld.shared.f32 f935, [r13+1792]; +ld.shared.f32 f936, [r13+1920]; +ld.shared.f32 f937, [r13+2048]; +ld.shared.f32 
f938, [r13+2176]; +ld.shared.f32 f939, [r13+2304]; +ld.shared.f32 f940, [r13+2432]; +ld.shared.f32 f941, [r13+2560]; +ld.shared.f32 f942, [r13+2688]; +ld.shared.f32 f943, [r13+2816]; +ld.shared.f32 f944, [r13+2944]; +ld.shared.f32 f945, [r13+3072]; +ld.shared.f32 f946, [r13+3200]; +ld.shared.f32 f947, [r13+3328]; +ld.shared.f32 f948, [r13+3456]; +ld.shared.f32 f949, [r13+3584]; +ld.shared.f32 f950, [r13+3712]; +ld.shared.f32 f951, [r13+3840]; +ld.shared.f32 f952, [r13+3968]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1656, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+128]; +ld.shared.f32 f955, [r13+256]; +ld.shared.f32 f956, [r13+384]; +ld.shared.f32 f957, [r13+512]; +ld.shared.f32 f958, [r13+640]; +ld.shared.f32 f959, [r13+768]; +ld.shared.f32 f960, [r13+896]; +ld.shared.f32 f961, [r13+1024]; +ld.shared.f32 f962, [r13+1152]; +ld.shared.f32 f963, [r13+1280]; +ld.shared.f32 f964, [r13+1408]; +ld.shared.f32 f965, [r13+1536]; +ld.shared.f32 f966, [r13+1664]; +ld.shared.f32 f967, [r13+1792]; +ld.shared.f32 f968, [r13+1920]; +ld.shared.f32 f969, [r13+2048]; +ld.shared.f32 f970, [r13+2176]; +ld.shared.f32 f971, [r13+2304]; +ld.shared.f32 f972, [r13+2432]; +ld.shared.f32 f973, [r13+2560]; +ld.shared.f32 f974, [r13+2688]; +ld.shared.f32 f975, [r13+2816]; +ld.shared.f32 f976, [r13+2944]; +ld.shared.f32 f977, [r13+3072]; +ld.shared.f32 f978, [r13+3200]; +ld.shared.f32 f979, [r13+3328]; +ld.shared.f32 f980, [r13+3456]; +ld.shared.f32 f981, [r13+3584]; +ld.shared.f32 f982, [r13+3712]; +ld.shared.f32 f983, [r13+3840]; +ld.shared.f32 f984, [r13+3968]; +add.f32 
f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1596, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1595, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f1594, f1596, f1595; +sub.f32 f996, f1596, f1595; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f1593, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1592, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f1591, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1590, f1592, f1591; +sub.f32 f1012, f1592, f1591; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f1589, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f1589, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f1587, f1015, 0fBF3504F3; +mul.f32 f1588, f1016, 0f3F3504F3; +sub.f32 f1023, f1587, f1588; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f1586, f1594, f1590; +sub.f32 f1029, f1594, f1590; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f1585, f1593, f1020; +sub.f32 f1033, f1593, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f1584, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f1583, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f1582, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f1581, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1580, f1582, f1581; +sub.f32 
f1053, f1582, f1581; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f1579, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f1578, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f1577, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f1576, f1578, f1577; +sub.f32 f1069, f1578, f1577; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f1575, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f1575, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f1573, f1072, 0fBF3504F3; +mul.f32 f1574, f1073, 0f3F3504F3; +sub.f32 f1080, f1573, f1574; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f1572, f1580, f1576; +sub.f32 f1086, f1580, f1576; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f1571, f1579, f1077; +sub.f32 f1090, f1579, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f1570, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f1569, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f1567, f1087, 0f3F6C835E; +mul.f32 f1568, f1571, 0f3EC3EF15; +sub.f32 f1101, f1567, f1568; +mul.f32 f1102, f1571, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f1570, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f1565, f1095, 0f3EC3EF15; +mul.f32 f1566, f1569, 0f3F6C835E; +sub.f32 f1110, f1565, f1566; +mul.f32 f1111, f1569, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f1563, f1089, 0fBEC3EF15; +mul.f32 f1564, f1090, 0f3F6C835E; +sub.f32 f1115, f1563, f1564; +mul.f32 
f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f1561, f1093, 0fBF3504F3; +mul.f32 f1562, f1094, 0f3F3504F3; +sub.f32 f1120, f1561, f1562; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f1559, f1097, 0fBF6C835E; +mul.f32 f1560, f1098, 0f3EC3EF15; +sub.f32 f1125, f1559, f1560; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f1558, f1586, f1572; +sub.f32 f1131, f1586, f1572; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f1557, f1585, f1103; +sub.f32 f1135, f1585, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f1556, f1584, f1107; +sub.f32 f1139, f1584, f1107; +add.f32 f1140, f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f1555, f1583, f1112; +sub.f32 f1143, f1583, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f1554, f1029, f1085; +sub.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f1553, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 f1154, f1036, f1120; +add.f32 f1552, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f1551, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f1550, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f1549, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f1548, f1550, f1549; +sub.f32 f1171, f1550, f1549; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f1547, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f1546, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; 
+sub.f32 f1182, f934, f950; +add.f32 f1545, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f1544, f1546, f1545; +sub.f32 f1187, f1546, f1545; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f1543, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f1543, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f1541, f1190, 0fBF3504F3; +mul.f32 f1542, f1191, 0f3F3504F3; +sub.f32 f1198, f1541, f1542; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f1540, f1548, f1544; +sub.f32 f1204, f1548, f1544; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f1539, f1547, f1195; +sub.f32 f1208, f1547, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f1538, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; +sub.f32 f1215, f1174, f1198; +add.f32 f1537, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f1536, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f1535, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f1534, f1536, f1535; +sub.f32 f1228, f1536, f1535; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f1533, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f1532, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f1531, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f1530, f1532, f1531; +sub.f32 f1244, f1532, f1531; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f1529, 
f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f1529, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f1527, f1247, 0fBF3504F3; +mul.f32 f1528, f1248, 0f3F3504F3; +sub.f32 f1255, f1527, f1528; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f1526, f1534, f1530; +sub.f32 f1261, f1534, f1530; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f1525, f1533, f1252; +sub.f32 f1265, f1533, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f1524, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f1523, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f1521, f1262, 0f3F6C835E; +mul.f32 f1522, f1525, 0f3EC3EF15; +sub.f32 f1276, f1521, f1522; +mul.f32 f1277, f1525, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f1524, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f1523, 0f3F6C835E; +mul.f32 f1520, f1270, 0f3EC3EF15; +sub.f32 f1285, f1520, f1284; +mul.f32 f1286, f1523, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f1519, f1264, 0fBEC3EF15; +sub.f32 f1290, f1519, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f1517, f1268, 0fBF3504F3; +mul.f32 f1518, f1269, 0f3F3504F3; +sub.f32 f1295, f1517, f1518; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296; +mul.f32 f1515, f1272, 0fBF6C835E; +mul.f32 f1516, f1273, 0f3EC3EF15; +sub.f32 f1300, f1515, f1516; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f1514, f1540, f1526; +sub.f32 f1306, f1540, f1526; +add.f32 f1307, 
f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f1513, f1539, f1278; +sub.f32 f1310, f1539, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f1512, f1538, f1282; +sub.f32 f1314, f1538, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f1511, f1537, f1287; +sub.f32 f1318, f1537, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f1510, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f1509, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f1508, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f1507, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f1513, 0f3E47C5C2; +mul.f32 f1506, f1307, 0f3F7B14BE; +sub.f32 f1337, f1506, f1336; +mul.f32 f1338, f1513, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, f1512, 0f3EC3EF15; +mul.f32 f1505, f1311, 0f3F6C835E; +sub.f32 f1342, f1505, f1341; +mul.f32 f1343, f1512, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; +mul.f32 f1503, f1315, 0f3F54DB31; +mul.f32 f1504, f1511, 0f3F0E39DA; +sub.f32 f1347, f1503, f1504; +mul.f32 f1348, f1511, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, f1319, 0f3F3504F3; +mul.f32 f1351, f1510, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f1509, 0f3F54DB31; +mul.f32 f1502, f1323, 0f3F0E39DA; +sub.f32 f1356, f1502, f1355; +mul.f32 f1357, f1509, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357; +mul.f32 f1360, f1508, 0f3F6C835E; +mul.f32 f1501, f1327, 0f3EC3EF15; +sub.f32 f1361, f1501, f1360; +mul.f32 f1362, f1508, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f1499, f1331, 0f3E47C5C2; +mul.f32 f1500, f1507, 0f3F7B14BE; +sub.f32 f1366, f1499, f1500; +mul.f32 f1367, f1507, 0f3E47C5C2; 
+fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f1497, f1309, 0fBE47C5C2; +mul.f32 f1498, f1310, 0f3F7B14BE; +sub.f32 f1371, f1497, f1498; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f1495, f1313, 0fBEC3EF15; +mul.f32 f1496, f1314, 0f3F6C835E; +sub.f32 f1376, f1495, f1496; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f1493, f1317, 0fBF0E39DA; +mul.f32 f1494, f1318, 0f3F54DB31; +sub.f32 f1381, f1493, f1494; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f1492, f1321, 0fBF3504F3; +sub.f32 f1386, f1492, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 f1491, f1325, 0fBF54DB31; +sub.f32 f1391, f1491, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f1490, f1329, 0fBF6C835E; +sub.f32 f1396, f1490, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f1489, f1333, 0fBF7B14BE; +sub.f32 f1401, f1489, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 %1, f1558, f1514; +add.f32 %0, f1128, f1303; +add.f32 %2, f1132, f1337; +add.f32 %3, f1557, f1339; +add.f32 %4, f1136, f1342; +add.f32 %5, f1556, f1344; +add.f32 %6, f1140, f1347; +add.f32 %7, f1555, f1349; +add.f32 %9, f1554, f1353; +add.f32 %8, f1144, f1352; +add.f32 %11, f1553, f1358; +add.f32 %10, f1148, f1356; +add.f32 %12, f1152, f1361; +add.f32 %13, f1552, f1363; +add.f32 %14, f1156, f1366; +add.f32 %15, f1551, f1368; +sub.f32 %16, f1130, f1306; +add.f32 %17, f1131, f1305; +add.f32 %18, f1134, f1371; +add.f32 %19, f1135, f1373; +add.f32 %21, f1139, f1378; +add.f32 %20, f1138, f1376; +add.f32 %23, f1143, f1383; +add.f32 %22, f1142, f1381; +add.f32 %25, f1147, f1388; 
+add.f32 %24, f1146, f1386; +add.f32 %26, f1150, f1391; +add.f32 %27, f1151, f1393; +add.f32 %28, f1154, f1396; +add.f32 %29, f1155, f1398; +add.f32 %30, f1158, f1401; +add.f32 %31, f1159, f1403; +sub.f32 %32, f1128, f1303; +sub.f32 %33, f1558, f1514; +sub.f32 %35, f1557, f1339; +sub.f32 %34, f1132, f1337; +sub.f32 %37, f1556, f1344; +sub.f32 %36, f1136, f1342; +sub.f32 %39, f1555, f1349; +sub.f32 %38, f1140, f1347; +sub.f32 %41, f1554, f1353; +sub.f32 %40, f1144, f1352; +sub.f32 %43, f1553, f1358; +sub.f32 %42, f1148, f1356; +sub.f32 %45, f1552, f1363; +sub.f32 %44, f1152, f1361; +sub.f32 %47, f1551, f1368; +sub.f32 %46, f1156, f1366; +sub.f32 %49, f1131, f1305; +add.f32 %48, f1130, f1306; +sub.f32 %51, f1135, f1373; +sub.f32 %50, f1134, f1371; +sub.f32 %53, f1139, f1378; +sub.f32 %52, f1138, f1376; +sub.f32 %55, f1143, f1383; +sub.f32 %54, f1142, f1381; +sub.f32 %57, f1147, f1388; +sub.f32 %56, f1146, f1386; +sub.f32 %59, f1151, f1393; +sub.f32 %58, f1150, f1391; +sub.f32 %61, f1155, f1398; +sub.f32 %60, f1154, f1396; +sub.f32 %63, f1159, f1403; +sub.f32 %62, f1158, f1401; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), 
"=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + +/* Explicit specialization of cufftdx_private_function for template arguments <284, float, 1>. The entire body is a single `asm volatile` PTX block. It reads the .x/.y components of rmem[0..7], combines them with f32 add/sub/mul/fma (0f3F3504F3 is the IEEE-754 bit pattern of ~0.70710677), and performs three exchange rounds through shared memory: each round multiplies the values by factors derived from one float pair fetched with ld.global from lut_sp_8_1024, lut_sp_8_128 and lut_sp_8_16 respectively, issues `barrier.sync 0`, stores with st.shared at addresses computed from `smem`, %tid.x and %tid.y, issues `barrier.sync 0` again and reloads with ld.shared. The final add/sub results are written back to rmem[0..7]. NOTE(review): this looks like machine-generated code for a sequence of radix-8 FFT stages -- confirm against the generator and regenerate rather than hand-editing the asm string, since the %N operand numbers must stay in sync with the constraint lists below. */ +template<> __forceinline__ __device__ void cufftdx_private_function<284, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<523>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; 
+add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 
f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 8128; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+1024]; +ld.shared.v2.f32 {f167, f168}, [r13+2048]; +ld.shared.v2.f32 {f171, f172}, [r13+3072]; +ld.shared.v2.f32 {f175, f176}, [r13+4096]; 
+ld.shared.v2.f32 {f179, f180}, [r13+5120]; +ld.shared.v2.f32 {f183, f184}, [r13+6144]; +ld.shared.v2.f32 {f187, f188}, [r13+7168]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 120; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, 
f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 7680; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; 
+sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+1024]; +ld.shared.v2.f32 {f325, f326}, [r19+2048]; +ld.shared.v2.f32 {f329, f330}, [r19+3072]; +ld.shared.v2.f32 {f333, f334}, [r19+4096]; +ld.shared.v2.f32 {f337, f338}, [r19+5120]; +ld.shared.v2.f32 {f341, f342}, [r19+6144]; +ld.shared.v2.f32 {f345, f346}, [r19+7168]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +sub.f32 f361, f351, f356; +add.f32 f362, f352, f355; +add.f32 f363, f351, f356; +sub.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +sub.f32 f377, f367, f372; +add.f32 f378, f368, f371; +add.f32 f379, f367, f372; +sub.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0f3F3504F3; +sub.f32 f383, f381, f382; +add.f32 f384, f381, f382; +mul.f32 f385, f379, 0fBF3504F3; +mul.f32 f386, f380, 0f3F3504F3; +sub.f32 f387, f385, f386; +mul.f32 f388, f380, 0fBF3504F3; +fma.rn.f32 f389, f379, 0f3F3504F3, f388; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f384; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f384; +sub.f32 f396, f359, f376; +add.f32 f397, f360, f375; +add.f32 f398, f359, f376; +sub.f32 f399, f360, f375; 
+add.f32 f400, f363, f387; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f387; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 64; +bfe.u32 r21, r5, 6, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f393, f405; +mul.f32 f409, f392, f405; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f397, f415; +mul.f32 f417, f396, f415; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f401, f423; +mul.f32 f425, f400, f423; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f391, f431; +mul.f32 f433, f390, f431; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f395, f439; +mul.f32 f441, f394, f439; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f399, f447; +mul.f32 f449, f398, f447; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f403, f455; +mul.f32 f457, f402, f455; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 4096; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f404, f392, f408; +sub.f32 f462, f410, f409; +st.shared.v2.f32 [r25+512], 
{f461, f462}; +fma.rn.f32 f463, f413, f396, f416; +sub.f32 f464, f418, f417; +st.shared.v2.f32 [r25+1024], {f463, f464}; +fma.rn.f32 f465, f421, f400, f424; +sub.f32 f466, f426, f425; +st.shared.v2.f32 [r25+1536], {f465, f466}; +sub.f32 f467, f434, f433; +fma.rn.f32 f468, f429, f390, f432; +st.shared.v2.f32 [r25+2048], {f468, f467}; +fma.rn.f32 f469, f437, f394, f440; +sub.f32 f470, f442, f441; +st.shared.v2.f32 [r25+2560], {f469, f470}; +fma.rn.f32 f471, f445, f398, f448; +sub.f32 f472, f450, f449; +st.shared.v2.f32 [r25+3072], {f471, f472}; +fma.rn.f32 f473, f453, f402, f456; +sub.f32 f474, f458, f457; +st.shared.v2.f32 [r25+3584], {f473, f474}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+1024]; +ld.shared.v2.f32 {f483, f484}, [r26+2048]; +ld.shared.v2.f32 {f487, f488}, [r26+3072]; +ld.shared.v2.f32 {f491, f492}, [r26+4096]; +ld.shared.v2.f32 {f495, f496}, [r26+5120]; +ld.shared.v2.f32 {f499, f500}, [r26+6144]; +ld.shared.v2.f32 {f503, f504}, [r26+7168]; +add.f32 %1, f476, f492; +add.f32 %0, f475, f491; +add.f32 %3, f480, f496; +add.f32 %2, f479, f495; +add.f32 %5, f484, f500; +add.f32 %4, f483, f499; +add.f32 %7, f488, f504; +add.f32 %6, f487, f503; +sub.f32 %9, f476, f492; +sub.f32 %8, f475, f491; +sub.f32 %11, f480, f496; +sub.f32 %10, f479, f495; +sub.f32 %13, f484, f500; +sub.f32 %12, f483, f499; +sub.f32 %15, f488, f504; +sub.f32 %14, f487, f503; +})" + /* outputs %0-%15: the .x/.y components of rmem[0..7] */ : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): /* inputs: %16 = smem, %17-%19 = the three lookup tables, %20-%39 = rmem[0..7] components (the .y of elements 1, 2, 4 and 5 is listed twice, so the %N numbering is not a plain 2*i+1 mapping) */ "r"(smem), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), 
"f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<288, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1092>; +.reg .b32 r<35>; +.reg .b64 rd<11>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1084, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; +add.f32 f1082, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1081, f1084, f1082; +sub.f32 f76, f1084, f1082; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1080, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1077, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1075, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1074, f1077, f1075; +sub.f32 f92, f1077, f1075; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1073, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1073, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1071, f95, 0fBF3504F3; +mul.f32 f1072, f96, 0f3F3504F3; +sub.f32 f103, f1071, f1072; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1070, f1081, f1074; +sub.f32 f109, f1081, f1074; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1069, f1080, f100; +sub.f32 f113, f1080, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1068, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1067, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 
f1065, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1062, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1061, f1065, f1062; +sub.f32 f133, f1065, f1062; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1060, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1058, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1056, %76, %66; +sub.f32 f145, %76, %66; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1055, f1058, f1056; +sub.f32 f149, f1058, f1056; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1054, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f1054, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1052, f152, 0fBF3504F3; +mul.f32 f1053, f153, 0f3F3504F3; +sub.f32 f160, f1052, f1053; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1051, f1061, f1055; +sub.f32 f166, f1061, f1055; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1050, f1060, f157; +sub.f32 f170, f1060, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1049, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1048, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1046, f167, 0f3F6C835E; +mul.f32 f1047, f1050, 0f3EC3EF15; +sub.f32 f181, f1046, f1047; +mul.f32 f182, f1050, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1049, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1048, 0f3F6C835E; +mul.f32 f1045, f175, 0f3EC3EF15; +sub.f32 f190, f1045, f189; +mul.f32 f191, f1048, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 
f194, f170, 0f3F6C835E; +mul.f32 f1044, f169, 0fBEC3EF15; +sub.f32 f195, f1044, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1042, f173, 0fBF3504F3; +mul.f32 f1043, f174, 0f3F3504F3; +sub.f32 f200, f1042, f1043; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f1040, f177, 0fBF6C835E; +mul.f32 f1041, f178, 0f3EC3EF15; +sub.f32 f205, f1040, f1041; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1039, f1069, f183; +sub.f32 f213, f1069, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1038, f1068, f187; +sub.f32 f217, f1068, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f1037, f1067, f192; +sub.f32 f221, f1067, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f1036, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f1035, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f1034, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1033, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f1039, f239; +mul.f32 f244, f238, f1039; +mul.f32 f246, f239, f239; +mul.f32 f1032, f238, f238; +sub.f32 f247, f1032, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f1038, f249; +mul.f32 f252, f247, f1038; +mul.f32 f1030, f238, f247; +mul.f32 f1031, f239, f249; +sub.f32 f255, f1030, f1031; +mul.f32 f1029, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f1037, f257; +mul.f32 f260, f255, 
f1037; +mul.f32 f262, f239, f257; +mul.f32 f1028, f238, f255; +sub.f32 f263, f1028, f262; +mul.f32 f1027, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f1036, f265; +mul.f32 f268, f263, f1036; +mul.f32 f270, f239, f265; +mul.f32 f1026, f238, f263; +sub.f32 f271, f1026, f270; +mul.f32 f1025, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f1035, f273; +mul.f32 f276, f271, f1035; +mul.f32 f1023, f238, f271; +mul.f32 f1024, f239, f273; +sub.f32 f279, f1023, f1024; +mul.f32 f1022, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f1034, f281; +mul.f32 f284, f279, f1034; +mul.f32 f286, f239, f281; +mul.f32 f1021, f238, f279; +sub.f32 f287, f1021, f286; +mul.f32 f1020, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f1033, f289; +mul.f32 f292, f287, f1033; +mul.f32 f294, f239, f289; +mul.f32 f1019, f238, f287; +sub.f32 f295, f1019, f294; +mul.f32 f1018, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1017, f1070, f1051; +mul.f32 f298, f1017, f297; +mul.f32 f300, f295, f1017; +mul.f32 f1015, f238, f295; +mul.f32 f1016, f239, f297; +sub.f32 f303, f1015, f1016; +sub.f32 f1014, f106, f163; +mul.f32 f1013, f1014, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1012, f238, f303; +sub.f32 f311, f1012, f310; +mul.f32 f1011, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f1009, f238, f311; +mul.f32 f1010, f239, f313; +sub.f32 f319, f1009, f1010; +mul.f32 f1008, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1007, f238, f319; +sub.f32 f327, f1007, f326; +mul.f32 
f1006, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1005, f238, f327; +sub.f32 f335, f1005, f334; +mul.f32 f1004, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f1002, f238, f335; +mul.f32 f1003, f239, f337; +sub.f32 f343, f1002, f1003; +mul.f32 f1001, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1000, f238, f343; +sub.f32 f351, f1000, f350; +mul.f32 f999, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f998, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 8064; +add.s32 r12, r9, r11; +add.f32 f357, f1070, f1051; +sub.f32 f1086, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +shl.b32 r28, r34, 7; +shl.b32 r24, r34, 3; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f998; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f1029; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f1027; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f1025; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f1022; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f1020; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f1018; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1086, f298; +sub.f32 f374, f300, f1013; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f1011; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f1008; 
+fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f1006; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f1004; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f1001; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f999; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r21, r34, 63; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+512]; +ld.shared.v2.f32 {f397, f398}, [r13+1024]; +ld.shared.v2.f32 {f401, f402}, [r13+1536]; +ld.shared.v2.f32 {f405, f406}, [r13+2048]; +ld.shared.v2.f32 {f409, f410}, [r13+2560]; +ld.shared.v2.f32 {f413, f414}, [r13+3072]; +ld.shared.v2.f32 {f417, f418}, [r13+3584]; +ld.shared.v2.f32 {f421, f422}, [r13+4096]; +ld.shared.v2.f32 {f425, f426}, [r13+4608]; +ld.shared.v2.f32 {f429, f430}, [r13+5120]; +ld.shared.v2.f32 {f433, f434}, [r13+5632]; +ld.shared.v2.f32 {f437, f438}, [r13+6144]; +ld.shared.v2.f32 {f441, f442}, [r13+6656]; +ld.shared.v2.f32 {f445, f446}, [r13+7168]; +ld.shared.v2.f32 {f449, f450}, [r13+7680]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f997, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f996, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f995, f997, f996; +sub.f32 f464, f997, f996; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f994, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f993, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f992, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f991, f993, f992; +sub.f32 f480, 
f993, f992; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f990, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f990, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f989, f483, 0fBF3504F3; +sub.f32 f491, f989, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f988, f995, f991; +sub.f32 f497, f995, f991; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f987, f994, f488; +sub.f32 f501, f994, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f986, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f985, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f984, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f983, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f982, f984, f983; +sub.f32 f521, f984, f983; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f981, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f980, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f979, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f978, f980, f979; +sub.f32 f537, f980, f979; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f977, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f977, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f976, f540, 0fBF3504F3; +sub.f32 f548, f976, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; 
+add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f975, f982, f978; +sub.f32 f554, f982, f978; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f974, f981, f545; +sub.f32 f558, f981, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f973, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f972, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f974, 0f3EC3EF15; +mul.f32 f971, f555, 0f3F6C835E; +sub.f32 f569, f971, f568; +mul.f32 f570, f974, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f973, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f972, 0f3F6C835E; +mul.f32 f970, f563, 0f3EC3EF15; +sub.f32 f578, f970, f577; +mul.f32 f579, f972, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f969, f557, 0fBEC3EF15; +sub.f32 f583, f969, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f968, f561, 0fBF3504F3; +sub.f32 f588, f968, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f967, f565, 0fBF6C835E; +sub.f32 f593, f967, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f966, f987, f571; +sub.f32 f601, f987, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f965, f986, f575; +sub.f32 f605, f986, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f964, f985, f580; +sub.f32 f609, f985, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f963, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f962, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 
f961, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f960, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 48; +bfe.u32 r15, r34, 4, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f966, f627; +mul.f32 f632, f626, f966; +mul.f32 f634, f627, f627; +mul.f32 f959, f626, f626; +sub.f32 f635, f959, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f638, f965, f637; +mul.f32 f640, f635, f965; +mul.f32 f957, f626, f635; +mul.f32 f958, f627, f637; +sub.f32 f643, f957, f958; +mul.f32 f956, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f964, f645; +mul.f32 f648, f643, f964; +mul.f32 f650, f627, f645; +mul.f32 f955, f626, f643; +sub.f32 f651, f955, f650; +mul.f32 f954, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f963, f653; +mul.f32 f656, f651, f963; +mul.f32 f658, f627, f653; +mul.f32 f953, f626, f651; +sub.f32 f659, f953, f658; +mul.f32 f952, f610, f653; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f962, f661; +mul.f32 f664, f659, f962; +mul.f32 f950, f626, f659; +mul.f32 f951, f627, f661; +sub.f32 f667, f950, f951; +mul.f32 f949, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f961, f669; +mul.f32 f672, f667, f961; +mul.f32 f674, f627, f669; +mul.f32 f948, f626, f667; +sub.f32 f675, f948, f674; +mul.f32 f947, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f960, f677; +mul.f32 f680, f675, f960; +mul.f32 f682, f627, f677; +mul.f32 f946, f626, f675; +sub.f32 f683, f946, f682; +mul.f32 f945, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f944, f988, f975; +mul.f32 f686, f944, f685; +mul.f32 f688, f683, f944; +mul.f32 f942, f626, f683; +mul.f32 f943, f627, f685; 
+sub.f32 f691, f942, f943; +sub.f32 f941, f494, f551; +mul.f32 f940, f941, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f939, f626, f691; +sub.f32 f699, f939, f698; +mul.f32 f938, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, f605; +mul.f32 f936, f626, f699; +mul.f32 f937, f627, f701; +sub.f32 f707, f936, f937; +mul.f32 f935, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f934, f626, f707; +sub.f32 f715, f934, f714; +mul.f32 f933, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f932, f626, f715; +sub.f32 f723, f932, f722; +mul.f32 f931, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f929, f626, f723; +mul.f32 f930, f627, f725; +sub.f32 f731, f929, f930; +mul.f32 f928, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f927, f626, f731; +sub.f32 f739, f927, f738; +mul.f32 f926, f620, f733; +mul.f32 f740, f626, f733; +mul.f32 f925, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r24, 120; +add.s32 r17, r9, r16; +sub.f32 f1088, f988, f975; +mul.f32 f1087, f683, f1088; +barrier.sync 0; +and.b32 r18, r28, 6144; +add.s32 r19, r17, r18; +sub.f32 f1090, f988, f975; +mul.f32 f1089, f683, f1090; +add.f32 f745, f988, f975; +sub.f32 f1091, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 48; 
+fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f925; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f956; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f954; +st.shared.v2.f32 [r19+384], {f751, f752}; +fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f952; +st.shared.v2.f32 [r19+512], {f753, f754}; +sub.f32 f755, f664, f949; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f947; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f945; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1091, f686; +sub.f32 f762, f1089, f940; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f938; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f935; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f933; +st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f931; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f928; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, f926; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +mad.lo.s32 r20, r26, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+512]; +ld.shared.v2.f32 {f785, f786}, [r20+1024]; +ld.shared.v2.f32 {f789, f790}, [r20+1536]; +ld.shared.v2.f32 {f793, f794}, [r20+2048]; +ld.shared.v2.f32 {f797, f798}, [r20+2560]; +ld.shared.v2.f32 {f801, f802}, [r20+3072]; +ld.shared.v2.f32 {f805, f806}, [r20+3584]; 
+ld.shared.v2.f32 {f809, f810}, [r20+4096]; +ld.shared.v2.f32 {f813, f814}, [r20+4608]; +ld.shared.v2.f32 {f817, f818}, [r20+5120]; +ld.shared.v2.f32 {f821, f822}, [r20+5632]; +ld.shared.v2.f32 {f825, f826}, [r20+6144]; +ld.shared.v2.f32 {f829, f830}, [r20+6656]; +ld.shared.v2.f32 {f833, f834}, [r20+7168]; +ld.shared.v2.f32 {f837, f838}, [r20+7680]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f924, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f923, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f781, f813; +sub.f32 f851, f781, f813; +add.f32 f922, f782, f814; +sub.f32 f852, f782, f814; +add.f32 f853, f797, f829; +sub.f32 f855, f797, f829; +add.f32 f921, f798, f830; +sub.f32 f856, f798, f830; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f920, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f919, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f789, f821; +sub.f32 f867, f789, f821; +add.f32 f918, f790, f822; +sub.f32 f868, f790, f822; +add.f32 f869, f805, f837; +sub.f32 f871, f805, f837; +add.f32 f917, f806, f838; +sub.f32 f872, f806, f838; +add.f32 %1, f924, f923; +add.f32 %0, f841, f845; +add.f32 %2, f849, f853; +add.f32 %3, f922, f921; +add.f32 %4, f857, f861; +add.f32 %5, f920, f919; +add.f32 %6, f865, f869; +add.f32 %7, f918, f917; +sub.f32 %8, f843, f848; +add.f32 %9, f844, f847; +add.f32 %11, f852, f855; +sub.f32 %10, f851, f856; +add.f32 %13, f860, f863; +sub.f32 %12, f859, f864; +add.f32 %15, f868, f871; +sub.f32 %14, f867, f872; +sub.f32 %17, f924, f923; +sub.f32 %16, f841, f845; +sub.f32 %19, f922, f921; +sub.f32 %18, f849, f853; +sub.f32 %21, f920, f919; +sub.f32 %20, f857, f861; +sub.f32 %23, f918, f917; +sub.f32 %22, f865, f869; +sub.f32 %25, f844, f847; +add.f32 %24, f843, f848; +sub.f32 %27, f852, f855; +add.f32 %26, f851, f856; +sub.f32 %29, f860, f863; +add.f32 %28, f859, f864; +sub.f32 %31, 
f868, f871; +add.f32 %30, f867, f872; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<289, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<277>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, 
f23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 8160; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+2048]; +ld.shared.v2.f32 {f70, f71}, [r13+4096]; +ld.shared.v2.f32 {f74, f75}, [r13+6144]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; 
+mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 8064; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+2048]; +ld.shared.v2.f32 {f131, f132}, [r20+4096]; +ld.shared.v2.f32 {f135, f136}, [r20+6144]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 240; +bfe.u32 r22, r5, 4, 4; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, 
f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 7680; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180, f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+2048]; +ld.shared.v2.f32 {f192, f193}, [r27+4096]; +ld.shared.v2.f32 {f196, f197}, [r27+6144]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +sub.f32 f210, f202, f207; +add.f32 f211, f203, f206; +add.f32 f212, f202, f207; +sub.f32 f213, f203, f206; +and.b32 r28, r5, 192; +bfe.u32 r29, r5, 6, 2; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f211, f215; +mul.f32 f219, f210, f215; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f209, f225; +mul.f32 f227, f208, f225; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f213, f233; +mul.f32 f235, f212, f233; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 6144; 
+add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f214, f210, f218; +sub.f32 f240, f220, f219; +st.shared.v2.f32 [r33+512], {f239, f240}; +fma.rn.f32 f241, f223, f208, f226; +sub.f32 f242, f228, f227; +st.shared.v2.f32 [r33+1024], {f241, f242}; +sub.f32 f243, f236, f235; +fma.rn.f32 f244, f231, f212, f234; +st.shared.v2.f32 [r33+1536], {f244, f243}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+2048]; +ld.shared.v2.f32 {f253, f254}, [r34+4096]; +ld.shared.v2.f32 {f257, f258}, [r34+6144]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; +sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +add.f32 %1, f262, f266; +add.f32 %0, f261, f265; +add.f32 %3, f264, f267; +sub.f32 %2, f263, f268; +sub.f32 %5, f262, f266; +sub.f32 %4, f261, f265; +sub.f32 %7, f264, f267; +add.f32 %6, f263, f268; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<290, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<245>; +.reg .b32 r<36>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +add.f32 f25, f17, 
f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4080; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+1024]; +ld.shared.f32 f64, [r13+2048]; +ld.shared.f32 f65, [r13+3072]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+1024]; +ld.shared.f32 f68, [r13+2048]; +ld.shared.f32 f69, [r13+3072]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 8; 
+mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 4032; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+1024]; +ld.shared.f32 f117, [r21+2048]; +ld.shared.f32 f118, [r21+3072]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+1024]; +ld.shared.f32 f121, [r21+2048]; +ld.shared.f32 f122, [r21+3072]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126, f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 240; +bfe.u32 r23, r5, 4, 4; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, 
rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; +fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 3840; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], f154; +st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+1024]; +ld.shared.f32 f170, [r28+2048]; +ld.shared.f32 f171, [r28+3072]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+1024]; +ld.shared.f32 f174, [r28+2048]; +ld.shared.f32 f175, [r28+3072]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +sub.f32 f188, f178, f183; +add.f32 f189, f179, f182; +add.f32 f190, f178, f183; +sub.f32 f191, f179, f182; +and.b32 r29, r5, 192; +bfe.u32 r30, r5, 6, 2; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; 
+add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f189, f193; +fma.rn.f32 f197, f192, f188, f196; +mul.f32 f198, f188, f193; +mul.f32 f199, f192, f189; +sub.f32 f200, f199, f198; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f187, f205; +fma.rn.f32 f207, f203, f186, f206; +mul.f32 f208, f186, f205; +mul.f32 f209, f203, f187; +sub.f32 f210, f209, f208; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f191, f215; +fma.rn.f32 f217, f213, f190, f216; +mul.f32 f218, f190, f215; +mul.f32 f219, f213, f191; +sub.f32 f220, f219, f218; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 3072; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f197; +st.shared.f32 [r34+512], f207; +st.shared.f32 [r34+768], f217; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+1024]; +ld.shared.f32 f223, [r35+2048]; +ld.shared.f32 f224, [r35+3072]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+1024]; +ld.shared.f32 f227, [r35+2048]; +ld.shared.f32 f228, [r35+3072]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 %0, f229, f233; +add.f32 %1, f230, f234; +add.f32 %3, f232, f235; +sub.f32 %2, f231, f236; +sub.f32 %4, f229, f233; +sub.f32 %5, f230, f234; +sub.f32 %7, f232, f235; +add.f32 %6, f231, f236; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), 
"=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<291, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<202>; +.reg .b32 r<70>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %14, %16; +sub.f32 f10, %15, %17; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 8176; +add.s32 r11, r8, r10; +add.f32 f18, %15, %17; +add.f32 f19, %14, %16; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 4088; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+4096]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8160; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 4080; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 
{f47, f48}, [r20+4096]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8128; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 4064; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+4096]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 504; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8064; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 4032; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+4096]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 5; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7936; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 
3968; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+4096]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f115, f117; +mul.f32 f121, f114, f117; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7680; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f116, f114, f120; +sub.f32 f126, f122, f121; +st.shared.v2.f32 [r46+256], {f125, f126}; +barrier.sync 0; +and.b32 r47, r9, 3840; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+4096]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f136, f138; +mul.f32 f142, f135, f138; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7168; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f137, f135, f141; +sub.f32 f147, f143, f142; +st.shared.v2.f32 [r53+512], {f146, f147}; +barrier.sync 0; +and.b32 r54, r9, 3584; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+4096]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f157, f159; +mul.f32 f163, f156, f159; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 6144; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 
f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f158, f156, f162; +sub.f32 f168, f164, f163; +st.shared.v2.f32 [r60+1024], {f167, f168}; +barrier.sync 0; +and.b32 r61, r9, 3072; +sub.s32 r62, r60, r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, [r62+4096]; +sub.f32 f177, f169, f173; +sub.f32 f178, f170, f174; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f179, f180}, [rd29]; +mul.f32 f183, f178, f180; +mul.f32 f184, f177, f180; +mul.f32 f185, f179, f178; +and.b32 r64, r9, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 4096; +add.s32 r67, r65, r66; +add.f32 f186, f170, f174; +add.f32 f187, f169, f173; +st.shared.v2.f32 [r67], {f187, f186}; +fma.rn.f32 f188, f179, f177, f183; +sub.f32 f189, f185, f184; +st.shared.v2.f32 [r67+2048], {f188, f189}; +barrier.sync 0; +and.b32 r68, r9, 2048; +sub.s32 r69, r67, r68; +ld.shared.v2.f32 {f190, f191}, [r69]; +ld.shared.v2.f32 {f194, f195}, [r69+4096]; +add.f32 %1, f191, f195; +add.f32 %0, f190, f194; +sub.f32 %3, f191, f195; +sub.f32 %2, f190, f194; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<292, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<166>; +.reg .b32 r<70>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %14, %16; +add.f32 f10, %15, %17; +sub.f32 f11, %14, %16; +sub.f32 f12, %15, %17; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; 
+ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 4088; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 2044; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+2048]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+2048]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4080; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 2040; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+2048]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+2048]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4064; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 2032; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; 
+ld.shared.f32 f57, [r27+2048]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+2048]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 504; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 4032; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 2016; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+2048]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+2048]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 5; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, f79, f85; +mul.f32 f87, f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3968; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 1984; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+2048]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+2048]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 
{f98, f99}, [rd20]; +mul.f32 f102, f97, f99; +fma.rn.f32 f103, f98, f96, f102; +mul.f32 f104, f96, f99; +mul.f32 f105, f98, f97; +sub.f32 f106, f105, f104; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3840; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f103; +barrier.sync 0; +and.b32 r47, r11, 1920; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+2048]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+2048]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f114, f116; +fma.rn.f32 f120, f115, f113, f119; +mul.f32 f121, f113, f116; +mul.f32 f122, f115, f114; +sub.f32 f123, f122, f121; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3584; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f120; +barrier.sync 0; +and.b32 r54, r11, 1792; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+2048]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+2048]; +add.f32 f128, f124, f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f132, f133}, [rd26]; +mul.f32 f136, f131, f133; +fma.rn.f32 f137, f132, f130, f136; +mul.f32 f138, f130, f133; +mul.f32 f139, f132, f131; +sub.f32 f140, f139, f138; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 3072; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 
[r60+512], f137; +barrier.sync 0; +and.b32 r61, r11, 1536; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+2048]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, [r62+2048]; +add.f32 f145, f141, f142; +add.f32 f146, f143, f144; +sub.f32 f147, f141, f142; +sub.f32 f148, f143, f144; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f149, f150}, [rd29]; +mul.f32 f153, f148, f150; +fma.rn.f32 f154, f149, f147, f153; +mul.f32 f155, f147, f150; +mul.f32 f156, f149, f148; +sub.f32 f157, f156, f155; +and.b32 r64, r11, 1020; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 2048; +add.s32 r67, r65, r66; +st.shared.f32 [r67], f145; +st.shared.f32 [r67+1024], f154; +barrier.sync 0; +and.b32 r68, r11, 1024; +sub.s32 r69, r67, r68; +ld.shared.f32 f158, [r69]; +ld.shared.f32 f159, [r69+2048]; +barrier.sync 0; +st.shared.f32 [r67], f146; +st.shared.f32 [r67+1024], f157; +barrier.sync 0; +ld.shared.f32 f160, [r69]; +ld.shared.f32 f161, [r69+2048]; +add.f32 %0, f158, f159; +add.f32 %1, f160, f161; +sub.f32 %2, f158, f159; +sub.f32 %3, f160, f161; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..cf506a5d95ebc --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_fwd.hpp.inc @@ -0,0 +1,3829 @@ +#ifndef CUFFTDX_FFT_1024_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_1024_FP64_FWD_PTX_HPP + + +/* cufftdx_private_function<476, double, 1>: inline-PTX routine (appears machine-generated) that takes 8 complex doubles from rmem[0..7] as d-constraint inputs, writes 16 doubles back to rmem[0..7].x/.y as outputs %0..%15, and uses the 32-bit shared-memory address passed in smem (operand %16). Flow as visible in the asm: (1) shared base = smem + (%tid.y << 13) + ((%tid.x << 6) & -8192); (2) an 8-point add/sub butterfly on the inputs, scaling by +/-0d3FE6A09E667F3BCD (presumably sqrt(2)/2); (3) complex multiplies by a factor pair loaded from lut_dp_8_1024 at byte offset (%tid.x << 4) & 2032 and at +2048 from there, the remaining factors being formed as products of the loaded values; (4) an exchange through shared memory, .x parts first and .y parts second, with barrier.sync 0 between every store and load phase; (5) the same butterfly / multiply / exchange pattern repeated with lut_dp_8_128 (index = bits 3..6 of %tid.x) and then lut_dp_8_16 (index = bit 6 of %tid.x); (6) a final add/sub of values read 4096 bytes apart: sums go to %0..%7, differences to %8..%15. Presumably one 1024-point forward FFT (8 x 8 x 8 x 2), going by the file name -- TODO confirm against cuFFTDx. Operand note: rmem[1].y, rmem[2].y, rmem[4].y and rmem[5].y are each listed twice in the input list, and the asm body never references %23, %26, %31 or %34, so the operand numbering used inside the asm depends on that exact list. Contains barrier.sync 0, so it is presumably meant to be reached by all threads of the block together -- confirm with caller. */ +template<> __forceinline__ __device__ void cufftdx_private_function<476, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<472>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59;
+sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+2048]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150,
fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8128; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+1024]; +ld.shared.f64 fd160, [r13+2048]; +ld.shared.f64 fd161, [r13+3072]; +ld.shared.f64 fd162, [r13+4096]; +ld.shared.f64 fd163, [r13+5120]; +ld.shared.f64 fd164, [r13+6144]; +ld.shared.f64 fd165, [r13+7168]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+1024]; +ld.shared.f64 fd168, [r13+2048]; +ld.shared.f64 fd169, [r13+3072]; +ld.shared.f64 fd170, [r13+4096]; +ld.shared.f64 fd171, [r13+5120]; +ld.shared.f64 fd172, [r13+6144]; +ld.shared.f64 fd173, [r13+7168]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64
fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 120; +bfe.u32 r15, r5, 3, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64
fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+256]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 7680; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+1024]; +ld.shared.f64 fd301, [r21+2048]; +ld.shared.f64 fd302, [r21+3072]; +ld.shared.f64 fd303, [r21+4096]; +ld.shared.f64 fd304, [r21+5120]; +ld.shared.f64 fd305, [r21+6144]; +ld.shared.f64 fd306, [r21+7168]; +barrier.sync 0; +st.shared.f64
[r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+1024]; +ld.shared.f64 fd309, [r21+2048]; +ld.shared.f64 fd310, [r21+3072]; +ld.shared.f64 fd311, [r21+4096]; +ld.shared.f64 fd312, [r21+5120]; +ld.shared.f64 fd313, [r21+6144]; +ld.shared.f64 fd314, [r21+7168]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +add.f64 fd327, fd317, fd322; +sub.f64 fd328, fd318, fd321; +sub.f64 fd329, fd317, fd322; +add.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +add.f64 fd343, fd333, fd338; +sub.f64 fd344, fd334, fd337; +sub.f64 fd345, fd333, fd338; +add.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0dBFE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd344, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd351, fd343, 0dBFE6A09E667F3BCD, fd350; +mul.f64 fd352, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd353, fd346, 0dBFE6A09E667F3BCD; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd352, fd353; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349;
+add.f64 fd361, fd328, fd351; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd351; +add.f64 fd364, fd325, fd342; +sub.f64 fd365, fd326, fd341; +sub.f64 fd366, fd325, fd342; +add.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd354; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd354; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 64; +bfe.u32 r23, r5, 6, 1; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd372, fd360; +mul.f64 fd377, fd373, fd361; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd372, fd361; +fma.rn.f64 fd380, fd373, fd360, fd379; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd383, fd364; +mul.f64 fd387, fd385, fd365; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd383, fd365; +fma.rn.f64 fd390, fd385, fd364, fd389; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd393, fd368; +mul.f64 fd397, fd395, fd369; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd393, fd369; +fma.rn.f64 fd400, fd395, fd368, fd399; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd403, fd358; +mul.f64 fd407, fd405, fd359; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd403, fd359; +fma.rn.f64 fd410, fd405, fd358, fd409; +ld.global.v2.f64 {fd411, fd412}, [rd11+32]; +mul.f64 fd415, fd411, fd362; +mul.f64 fd416, fd412, fd363; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd411, fd363; +fma.rn.f64 fd419, fd412, fd362, fd418; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd422,
fd366; +mul.f64 fd426, fd424, fd367; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd422, fd367; +fma.rn.f64 fd429, fd424, fd366, fd428; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd432, fd370; +mul.f64 fd436, fd434, fd371; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd432, fd371; +fma.rn.f64 fd439, fd434, fd370, fd438; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 4096; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd378; +st.shared.f64 [r27+1024], fd388; +st.shared.f64 [r27+1536], fd398; +st.shared.f64 [r27+2048], fd408; +st.shared.f64 [r27+2560], fd417; +st.shared.f64 [r27+3072], fd427; +st.shared.f64 [r27+3584], fd437; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+1024]; +ld.shared.f64 fd442, [r28+2048]; +ld.shared.f64 fd443, [r28+3072]; +ld.shared.f64 fd444, [r28+4096]; +ld.shared.f64 fd445, [r28+5120]; +ld.shared.f64 fd446, [r28+6144]; +ld.shared.f64 fd447, [r28+7168]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+1024]; +ld.shared.f64 fd450, [r28+2048]; +ld.shared.f64 fd451, [r28+3072]; +ld.shared.f64 fd452, [r28+4096]; +ld.shared.f64 fd453, [r28+5120]; +ld.shared.f64 fd454, [r28+6144]; +ld.shared.f64 fd455, [r28+7168]; +add.f64 %0, fd440, fd444; +add.f64 %1, fd448, fd452; +add.f64 %2, fd441, fd445; +add.f64 %3, fd449, fd453; +add.f64 %4, fd442, fd446; +add.f64 %5, fd450, fd454; +add.f64 %6, fd443, fd447; +add.f64 %7, fd451, fd455; +sub.f64 %8, fd440, fd444; +sub.f64 %9, fd448, fd452; +sub.f64
%10, fd441, fd445; +sub.f64 %11, fd449, fd453; +sub.f64 %12, fd442, fd446; +sub.f64 %13, fd450, fd454; +sub.f64 %14, fd443, fd447; +sub.f64 %15, fd451, fd455; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<477, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<839>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64
fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 
fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, 
fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 
fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+1024]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 
fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8064; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+512]; 
+ld.shared.f64 fd390, [r13+1024]; +ld.shared.f64 fd391, [r13+1536]; +ld.shared.f64 fd392, [r13+2048]; +ld.shared.f64 fd393, [r13+2560]; +ld.shared.f64 fd394, [r13+3072]; +ld.shared.f64 fd395, [r13+3584]; +ld.shared.f64 fd396, [r13+4096]; +ld.shared.f64 fd397, [r13+4608]; +ld.shared.f64 fd398, [r13+5120]; +ld.shared.f64 fd399, [r13+5632]; +ld.shared.f64 fd400, [r13+6144]; +ld.shared.f64 fd401, [r13+6656]; +ld.shared.f64 fd402, [r13+7168]; +ld.shared.f64 fd403, [r13+7680]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+512]; +ld.shared.f64 fd406, [r13+1024]; +ld.shared.f64 fd407, [r13+1536]; +ld.shared.f64 fd408, [r13+2048]; +ld.shared.f64 fd409, [r13+2560]; +ld.shared.f64 fd410, [r13+3072]; +ld.shared.f64 fd411, [r13+3584]; +ld.shared.f64 fd412, [r13+4096]; +ld.shared.f64 fd413, [r13+4608]; +ld.shared.f64 fd414, [r13+5120]; +ld.shared.f64 fd415, [r13+5632]; +ld.shared.f64 fd416, [r13+6144]; +ld.shared.f64 fd417, [r13+6656]; +ld.shared.f64 fd418, [r13+7168]; +ld.shared.f64 fd419, [r13+7680]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; 
+add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, 
fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; 
+mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 48; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, 
fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+64]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, 
fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 6144; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; 
+st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+512]; +ld.shared.f64 fd745, [r20+1024]; +ld.shared.f64 fd746, [r20+1536]; +ld.shared.f64 fd747, [r20+2048]; +ld.shared.f64 fd748, [r20+2560]; +ld.shared.f64 fd749, [r20+3072]; +ld.shared.f64 fd750, [r20+3584]; +ld.shared.f64 fd751, [r20+4096]; +ld.shared.f64 fd752, [r20+4608]; +ld.shared.f64 fd753, [r20+5120]; +ld.shared.f64 fd754, [r20+5632]; +ld.shared.f64 fd755, [r20+6144]; +ld.shared.f64 fd756, [r20+6656]; +ld.shared.f64 fd757, [r20+7168]; +ld.shared.f64 fd758, [r20+7680]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+512]; +ld.shared.f64 fd761, [r20+1024]; +ld.shared.f64 fd762, [r20+1536]; +ld.shared.f64 fd763, [r20+2048]; +ld.shared.f64 fd764, [r20+2560]; +ld.shared.f64 fd765, [r20+3072]; +ld.shared.f64 fd766, [r20+3584]; +ld.shared.f64 fd767, [r20+4096]; +ld.shared.f64 fd768, [r20+4608]; 
+ld.shared.f64 fd769, [r20+5120]; +ld.shared.f64 fd770, [r20+5632]; +ld.shared.f64 fd771, [r20+6144]; +ld.shared.f64 fd772, [r20+6656]; +ld.shared.f64 fd773, [r20+7168]; +ld.shared.f64 fd774, [r20+7680]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd744, fd752; +add.f64 fd784, fd760, fd768; +sub.f64 fd785, fd744, fd752; +sub.f64 fd786, fd760, fd768; +add.f64 fd787, fd748, fd756; +add.f64 fd788, fd764, fd772; +sub.f64 fd789, fd748, fd756; +sub.f64 fd790, fd764, fd772; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd746, fd754; +add.f64 fd800, fd762, fd770; +sub.f64 fd801, fd746, fd754; +sub.f64 fd802, fd762, fd770; +add.f64 fd803, fd750, fd758; +add.f64 fd804, fd766, fd774; +sub.f64 fd805, fd750, fd758; +sub.f64 fd806, fd766, fd774; +add.f64 %0, fd775, fd779; +add.f64 %1, fd776, fd780; +add.f64 %2, fd783, fd787; +add.f64 %3, fd784, fd788; +add.f64 %4, fd791, fd795; +add.f64 %5, fd792, fd796; +add.f64 %6, fd799, fd803; +add.f64 %7, fd800, fd804; +sub.f64 %9, fd778, fd781; +add.f64 %8, fd777, fd782; +sub.f64 %11, fd786, fd789; +add.f64 %10, fd785, fd790; +sub.f64 %13, fd794, fd797; +add.f64 %12, fd793, fd798; +sub.f64 %15, fd802, fd805; +add.f64 %14, fd801, fd806; +sub.f64 %16, fd775, fd779; +sub.f64 %17, fd776, fd780; +sub.f64 %18, fd783, fd787; +sub.f64 %19, fd784, fd788; +sub.f64 %20, fd791, fd795; +sub.f64 %21, fd792, fd796; +sub.f64 %22, fd799, fd803; +sub.f64 %23, fd800, fd804; +add.f64 %25, fd778, fd781; +sub.f64 %24, fd777, fd782; +add.f64 %27, fd786, fd789; +sub.f64 %26, fd785, fd790; +add.f64 %29, fd794, fd797; +sub.f64 %28, fd793, fd798; 
+add.f64 %31, fd802, fd805; +sub.f64 %30, fd801, fd806; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<480, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<45>; +.reg .f64 fd<1092>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1080, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1078, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1077, fd1080, fd1078; +sub.f64 fd76, fd1080, fd1078; +add.f64 
fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd1076, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1073, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1071, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1070, fd1073, fd1071; +sub.f64 fd92, fd1073, fd1071; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd1069, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd1069, 0dBFE6A09E667F3BCD; +mul.f64 fd1068, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd1068, fd98; +mul.f64 fd100, fd1069, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1067, fd1077, fd1070; +sub.f64 fd109, fd1077, fd1070; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1066, fd1076, fd101; +sub.f64 fd113, fd1076, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd1065, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd1064, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1062, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1059, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1058, fd1062, fd1059; +sub.f64 fd133, fd1062, fd1059; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd1057, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1055, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1053, %76, %66; 
+sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1052, fd1055, fd1053; +sub.f64 fd149, fd1055, fd1053; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd1051, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd1051, 0dBFE6A09E667F3BCD; +mul.f64 fd1050, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd1050, fd155; +mul.f64 fd157, fd1051, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1049, fd1058, fd1052; +sub.f64 fd166, fd1058, fd1052; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1048, fd1057, fd158; +sub.f64 fd170, fd1057, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd1047, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd1046, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1044, fd167, 0d3FED906BCF328D46; +mul.f64 fd1045, fd1048, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd1044, fd1045; +mul.f64 fd182, fd1048, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd1042, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd1043, fd1047, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd1042, fd1043; +mul.f64 fd187, fd1047, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd1040, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd1041, fd1046, 0dBFED906BCF328D46; +sub.f64 fd191, fd1040, fd1041; +mul.f64 fd192, fd1046, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd1038, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd1039, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd1038, fd1039; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; 
+mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd1036, fd177, 0dBFED906BCF328D46; +mul.f64 fd1037, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd1036, fd1037; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1035, fd1066, fd183; +sub.f64 fd213, fd1066, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1034, fd1065, fd188; +sub.f64 fd217, fd1065, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd1033, fd1064, fd193; +sub.f64 fd221, fd1064, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd1032, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd1031, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd1030, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1029, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd1035; +mul.f64 fd244, fd238, fd1035; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1028, fd238, fd238; +sub.f64 fd247, fd1028, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd1034; +mul.f64 fd252, fd247, fd1034; +mul.f64 fd1026, fd238, fd247; +mul.f64 fd1027, fd239, fd249; +sub.f64 fd255, fd1026, fd1027; +mul.f64 fd1025, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd1033; +mul.f64 fd260, fd255, 
fd1033; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1024, fd238, fd255; +sub.f64 fd263, fd1024, fd262; +mul.f64 fd1023, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd1032; +mul.f64 fd268, fd263, fd1032; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1022, fd238, fd263; +sub.f64 fd271, fd1022, fd270; +mul.f64 fd1021, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd1031; +mul.f64 fd276, fd271, fd1031; +mul.f64 fd1019, fd238, fd271; +mul.f64 fd1020, fd239, fd273; +sub.f64 fd279, fd1019, fd1020; +mul.f64 fd1018, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd1030; +mul.f64 fd284, fd279, fd1030; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1017, fd238, fd279; +sub.f64 fd287, fd1017, fd286; +mul.f64 fd1016, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd1029; +mul.f64 fd292, fd287, fd1029; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1015, fd238, fd287; +sub.f64 fd295, fd1015, fd294; +mul.f64 fd1014, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1013, fd1067, fd1049; +sub.f64 fd1012, fd106, fd163; +mul.f64 fd298, fd295, fd1012; +mul.f64 fd299, fd297, fd1013; +mul.f64 fd300, fd295, fd1013; +ld.global.v2.f64 {fd301, fd302}, [rd5+1024]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1010, fd238, fd301; +mul.f64 fd1011, fd239, fd302; +sub.f64 fd310, fd1010, fd1011; +mul.f64 fd1009, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1008, fd238, fd310; +sub.f64 fd318, fd1008, fd317; +mul.f64 fd1007, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; 
+mul.f64 fd1005, fd238, fd318; +mul.f64 fd1006, fd239, fd320; +sub.f64 fd326, fd1005, fd1006; +mul.f64 fd1004, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1002, fd238, fd326; +mul.f64 fd1003, fd239, fd328; +sub.f64 fd334, fd1002, fd1003; +mul.f64 fd1001, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1000, fd238, fd334; +sub.f64 fd342, fd1000, fd341; +mul.f64 fd999, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd997, fd238, fd342; +mul.f64 fd998, fd239, fd344; +sub.f64 fd350, fd997, fd998; +mul.f64 fd996, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd995, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r39, %tid.x; +shl.b32 r31, r39, 8; +barrier.sync 0; +and.b32 r11, r31, 16128; +add.s32 r12, r9, r11; +sub.f64 fd1088, fd1067, fd1049; +mul.f64 fd1087, fd297, fd1088; +add.f64 fd356, fd1067, fd1049; +mov.u32 r38, %tid.x; +shl.b32 r30, r38, 8; +and.b32 r23, r30, 16128; +add.s32 r22, r9, r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r44, %tid.x; +shl.b32 r36, r44, 8; +shl.b32 r28, r44, 4; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd996, fd243; +st.shared.v2.f64 [r22+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd1025, fd251; +st.shared.v2.f64 [r22+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd1023, fd259; +st.shared.v2.f64 [r22+48], {fd363, fd362}; +sub.f64 fd364, fd1021, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r22+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, 
fd226, fd276; +sub.f64 fd367, fd1018, fd275; +st.shared.v2.f64 [r22+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd1016, fd283; +st.shared.v2.f64 [r22+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd1014, fd291; +st.shared.v2.f64 [r22+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd1012, fd300; +sub.f64 fd373, fd298, fd1087; +st.shared.v2.f64 [r22+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd1009, fd306; +st.shared.v2.f64 [r22+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd1007, fd314; +st.shared.v2.f64 [r22+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd1004, fd322; +st.shared.v2.f64 [r22+176], {fd379, fd378}; +sub.f64 fd380, fd1001, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r22+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd999, fd338; +st.shared.v2.f64 [r22+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd995, fd346; +st.shared.v2.f64 [r22+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r22+240], {fd387, fd386}; +barrier.sync 0; +and.b32 r20, r44, 63; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+1024]; +ld.shared.v2.f64 {fd396, fd397}, [r13+2048]; +ld.shared.v2.f64 {fd400, fd401}, [r13+3072]; +ld.shared.v2.f64 {fd404, fd405}, [r13+4096]; +ld.shared.v2.f64 {fd408, fd409}, [r13+5120]; +ld.shared.v2.f64 {fd412, fd413}, [r13+6144]; +ld.shared.v2.f64 {fd416, fd417}, [r13+7168]; +ld.shared.v2.f64 {fd420, fd421}, [r13+8192]; +ld.shared.v2.f64 {fd424, fd425}, [r13+9216]; +ld.shared.v2.f64 {fd428, fd429}, [r13+10240]; +ld.shared.v2.f64 {fd432, fd433}, [r13+11264]; +ld.shared.v2.f64 {fd436, fd437}, [r13+12288]; +ld.shared.v2.f64 {fd440, fd441}, [r13+13312]; +ld.shared.v2.f64 {fd444, fd445}, 
[r13+14336]; +ld.shared.v2.f64 {fd448, fd449}, [r13+15360]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd994, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd993, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd992, fd994, fd993; +sub.f64 fd463, fd994, fd993; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd991, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd990, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd989, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd988, fd990, fd989; +sub.f64 fd479, fd990, fd989; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd987, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd987, 0dBFE6A09E667F3BCD; +mul.f64 fd986, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd986, fd485; +mul.f64 fd487, fd987, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd985, fd992, fd988; +sub.f64 fd496, fd992, fd988; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd984, fd991, fd488; +sub.f64 fd500, fd991, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd983, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd982, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd981, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, 
fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd980, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd979, fd981, fd980; +sub.f64 fd520, fd981, fd980; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd978, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd977, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd976, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd975, fd977, fd976; +sub.f64 fd536, fd977, fd976; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd974, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd974, 0dBFE6A09E667F3BCD; +mul.f64 fd973, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd973, fd542; +mul.f64 fd544, fd974, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; +sub.f64 fd548, fd546, fd547; +add.f64 fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd972, fd979, fd975; +sub.f64 fd553, fd979, fd975; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd971, fd978, fd545; +sub.f64 fd557, fd978, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, fd519, fd536; +sub.f64 fd970, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd969, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd967, fd554, 0d3FED906BCF328D46; +mul.f64 fd968, fd971, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd967, fd968; +mul.f64 fd569, fd971, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd970, 0dBFE6A09E667F3BCD; +mul.f64 fd966, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd966, fd572; +mul.f64 
fd574, fd970, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd969, 0dBFED906BCF328D46; +mul.f64 fd965, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd965, fd577; +mul.f64 fd579, fd969, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd964, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd964, fd582; +mul.f64 fd584, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; +mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd963, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd963, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd962, fd984, fd570; +sub.f64 fd600, fd984, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd961, fd983, fd575; +sub.f64 fd604, fd983, fd575; +add.f64 fd605, fd505, fd578; +sub.f64 fd607, fd505, fd578; +add.f64 fd960, fd982, fd580; +sub.f64 fd608, fd982, fd580; +add.f64 fd609, fd495, fd553; +sub.f64 fd611, fd495, fd553; +sub.f64 fd959, fd496, fd552; +add.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd583; +sub.f64 fd615, fd499, fd583; +add.f64 fd958, fd500, fd585; +sub.f64 fd616, fd500, fd585; +add.f64 fd617, fd503, fd588; +sub.f64 fd619, fd503, fd588; +add.f64 fd957, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd956, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r44, 48; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd630, fd626, fd962; +mul.f64 fd631, fd625, fd962; +mul.f64 fd633, fd626, fd626; +mul.f64 fd955, fd625, fd625; +sub.f64 fd634, fd955, fd633; +mul.f64 
fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd638, fd636, fd961; +mul.f64 fd639, fd634, fd961; +mul.f64 fd953, fd625, fd634; +mul.f64 fd954, fd626, fd636; +sub.f64 fd642, fd953, fd954; +mul.f64 fd952, fd634, fd601; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd646, fd644, fd960; +mul.f64 fd647, fd642, fd960; +mul.f64 fd649, fd626, fd644; +mul.f64 fd951, fd625, fd642; +sub.f64 fd650, fd951, fd649; +mul.f64 fd950, fd642, fd605; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd654, fd652, fd959; +mul.f64 fd655, fd650, fd959; +mul.f64 fd948, fd625, fd650; +mul.f64 fd949, fd626, fd652; +sub.f64 fd658, fd948, fd949; +mul.f64 fd947, fd650, fd609; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd662, fd660, fd958; +mul.f64 fd663, fd658, fd958; +mul.f64 fd945, fd625, fd658; +mul.f64 fd946, fd626, fd660; +sub.f64 fd666, fd945, fd946; +mul.f64 fd944, fd658, fd613; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd670, fd668, fd957; +mul.f64 fd671, fd666, fd957; +mul.f64 fd673, fd626, fd668; +mul.f64 fd943, fd625, fd666; +sub.f64 fd674, fd943, fd673; +mul.f64 fd942, fd666, fd617; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd678, fd676, fd956; +mul.f64 fd679, fd674, fd956; +mul.f64 fd940, fd625, fd674; +mul.f64 fd941, fd626, fd676; +sub.f64 fd682, fd940, fd941; +mul.f64 fd939, fd674, fd621; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd938, fd985, fd972; +sub.f64 fd937, fd493, fd550; +mul.f64 fd685, fd682, fd937; +mul.f64 fd686, fd684, fd938; +mul.f64 fd687, fd682, fd938; +ld.global.v2.f64 {fd688, fd689}, [rd8+64]; +mul.f64 fd693, fd689, fd600; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd936, fd625, fd688; +sub.f64 fd697, fd936, fd696; +mul.f64 fd935, fd688, fd599; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, 
fd698; +mul.f64 fd701, fd699, fd604; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd934, fd625, fd697; +sub.f64 fd705, fd934, fd704; +mul.f64 fd933, fd697, fd603; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd709, fd707, fd608; +mul.f64 fd710, fd705, fd608; +mul.f64 fd931, fd625, fd705; +mul.f64 fd932, fd626, fd707; +sub.f64 fd713, fd931, fd932; +mul.f64 fd930, fd705, fd607; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd717, fd715, fd612; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd929, fd625, fd713; +sub.f64 fd721, fd929, fd720; +mul.f64 fd928, fd713, fd611; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd725, fd723, fd616; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd927, fd625, fd721; +sub.f64 fd729, fd927, fd728; +mul.f64 fd926, fd721, fd615; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd733, fd731, fd620; +mul.f64 fd734, fd729, fd620; +mul.f64 fd924, fd625, fd729; +mul.f64 fd925, fd626, fd731; +sub.f64 fd737, fd924, fd925; +mul.f64 fd923, fd625, fd597; +mul.f64 fd738, fd625, fd731; +mul.f64 fd922, fd729, fd619; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd737, fd623; +mul.f64 fd741, fd739, fd624; +mul.f64 fd742, fd737, fd624; +sub.f64 fd1083, fd985, fd972; +mul.f64 fd1082, fd684, fd1083; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 4; +and.b32 r15, r34, 240; +add.s32 r16, r9, r15; +sub.f64 fd1085, fd985, fd972; +mul.f64 fd1084, fd684, fd1085; +barrier.sync 0; +and.b32 r17, r36, 12288; +add.s32 r18, r16, r17; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 48; +add.f64 fd743, fd985, fd972; +sub.f64 fd1089, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r33, %tid.x; +and.b32 r32, r33, 48; +fma.rn.f64 fd745, fd626, fd597, fd631; +sub.f64 fd746, fd923, fd630; +st.shared.v2.f64 [r18+256], {fd746, fd745}; 
+fma.rn.f64 fd747, fd636, fd601, fd639; +sub.f64 fd748, fd952, fd638; +st.shared.v2.f64 [r18+512], {fd748, fd747}; +fma.rn.f64 fd749, fd644, fd605, fd647; +sub.f64 fd750, fd950, fd646; +st.shared.v2.f64 [r18+768], {fd750, fd749}; +fma.rn.f64 fd751, fd652, fd609, fd655; +sub.f64 fd752, fd947, fd654; +st.shared.v2.f64 [r18+1024], {fd752, fd751}; +sub.f64 fd753, fd944, fd662; +fma.rn.f64 fd754, fd660, fd613, fd663; +st.shared.v2.f64 [r18+1280], {fd753, fd754}; +fma.rn.f64 fd755, fd668, fd617, fd671; +sub.f64 fd756, fd942, fd670; +st.shared.v2.f64 [r18+1536], {fd756, fd755}; +fma.rn.f64 fd757, fd676, fd621, fd679; +sub.f64 fd758, fd939, fd678; +st.shared.v2.f64 [r18+1792], {fd758, fd757}; +fma.rn.f64 fd759, fd684, fd1089, fd687; +sub.f64 fd760, fd685, fd1084; +st.shared.v2.f64 [r18+2048], {fd760, fd759}; +fma.rn.f64 fd761, fd689, fd599, fd694; +sub.f64 fd762, fd935, fd693; +st.shared.v2.f64 [r18+2304], {fd762, fd761}; +fma.rn.f64 fd763, fd699, fd603, fd702; +sub.f64 fd764, fd933, fd701; +st.shared.v2.f64 [r18+2560], {fd764, fd763}; +fma.rn.f64 fd765, fd707, fd607, fd710; +sub.f64 fd766, fd930, fd709; +st.shared.v2.f64 [r18+2816], {fd766, fd765}; +fma.rn.f64 fd767, fd715, fd611, fd718; +sub.f64 fd768, fd928, fd717; +st.shared.v2.f64 [r18+3072], {fd768, fd767}; +sub.f64 fd769, fd926, fd725; +fma.rn.f64 fd770, fd723, fd615, fd726; +st.shared.v2.f64 [r18+3328], {fd769, fd770}; +fma.rn.f64 fd771, fd731, fd619, fd734; +sub.f64 fd772, fd922, fd733; +st.shared.v2.f64 [r18+3584], {fd772, fd771}; +fma.rn.f64 fd773, fd739, fd623, fd742; +sub.f64 fd774, fd740, fd741; +st.shared.v2.f64 [r18+3840], {fd774, fd773}; +barrier.sync 0; +mad.lo.s32 r19, r32, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+1024]; +ld.shared.v2.f64 {fd783, fd784}, [r19+2048]; +ld.shared.v2.f64 {fd787, fd788}, [r19+3072]; +ld.shared.v2.f64 {fd791, fd792}, [r19+4096]; +ld.shared.v2.f64 {fd795, fd796}, [r19+5120]; +ld.shared.v2.f64 {fd799, fd800}, [r19+6144]; 
+ld.shared.v2.f64 {fd803, fd804}, [r19+7168]; +ld.shared.v2.f64 {fd807, fd808}, [r19+8192]; +ld.shared.v2.f64 {fd811, fd812}, [r19+9216]; +ld.shared.v2.f64 {fd815, fd816}, [r19+10240]; +ld.shared.v2.f64 {fd819, fd820}, [r19+11264]; +ld.shared.v2.f64 {fd823, fd824}, [r19+12288]; +ld.shared.v2.f64 {fd827, fd828}, [r19+13312]; +ld.shared.v2.f64 {fd831, fd832}, [r19+14336]; +ld.shared.v2.f64 {fd835, fd836}, [r19+15360]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd921, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd920, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd779, fd811; +sub.f64 fd849, fd779, fd811; +add.f64 fd919, fd780, fd812; +sub.f64 fd850, fd780, fd812; +add.f64 fd851, fd795, fd827; +sub.f64 fd853, fd795, fd827; +add.f64 fd918, fd796, fd828; +sub.f64 fd854, fd796, fd828; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd917, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd916, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd787, fd819; +sub.f64 fd865, fd787, fd819; +add.f64 fd915, fd788, fd820; +sub.f64 fd866, fd788, fd820; +add.f64 fd867, fd803, fd835; +sub.f64 fd869, fd803, fd835; +add.f64 fd914, fd804, fd836; +sub.f64 fd870, fd804, fd836; +add.f64 %0, fd839, fd843; +add.f64 %1, fd921, fd920; +add.f64 %2, fd847, fd851; +add.f64 %3, fd919, fd918; +add.f64 %4, fd855, fd859; +add.f64 %5, fd917, fd916; +add.f64 %7, fd915, fd914; +add.f64 %6, fd863, fd867; +sub.f64 %9, fd842, fd845; +add.f64 %8, fd841, fd846; +sub.f64 %11, fd850, fd853; +add.f64 %10, fd849, fd854; +add.f64 %12, fd857, fd862; +sub.f64 %13, fd858, fd861; +add.f64 %14, fd865, fd870; +sub.f64 %15, fd866, fd869; +sub.f64 %17, fd921, fd920; +sub.f64 %16, fd839, fd843; +sub.f64 %19, fd919, fd918; +sub.f64 %18, fd847, fd851; +sub.f64 %21, fd917, fd916; +sub.f64 %20, fd855, fd859; +sub.f64 %23, 
fd915, fd914; +sub.f64 %22, fd863, fd867; +add.f64 %25, fd842, fd845; +sub.f64 %24, fd841, fd846; +add.f64 %27, fd850, fd853; +sub.f64 %26, fd849, fd854; +add.f64 %29, fd858, fd861; +sub.f64 %28, fd857, fd862; +add.f64 %31, fd866, fd869; +sub.f64 %30, fd865, fd870; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<478, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<241>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; 
+add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+4096]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8160; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+2048]; +ld.shared.f64 fd63, [r13+4096]; +ld.shared.f64 fd64, [r13+6144]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+2048]; +ld.shared.f64 fd67, [r13+4096]; +ld.shared.f64 fd68, [r13+6144]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, 
fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+1024]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 8064; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+2048]; +ld.shared.f64 fd115, [r21+4096]; +ld.shared.f64 fd116, [r21+6144]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+2048]; +ld.shared.f64 fd119, [r21+4096]; +ld.shared.f64 fd120, [r21+6144]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 
fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 240; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+256]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 7680; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 [r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+2048]; +ld.shared.f64 fd167, [r27+4096]; +ld.shared.f64 fd168, [r27+6144]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+2048]; +ld.shared.f64 fd171, [r27+4096]; +ld.shared.f64 fd172, [r27+6144]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, 
fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +add.f64 fd185, fd175, fd180; +sub.f64 fd186, fd176, fd179; +sub.f64 fd187, fd175, fd180; +add.f64 fd188, fd176, fd179; +and.b32 r28, r5, 192; +bfe.u32 r29, r5, 6, 2; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd189, fd185; +mul.f64 fd194, fd190, fd186; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd186; +fma.rn.f64 fd197, fd190, fd185, fd196; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd200, fd183; +mul.f64 fd204, fd202, fd184; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd184; +fma.rn.f64 fd207, fd202, fd183, fd206; +ld.global.v2.f64 {fd208, fd209}, [rd14+64]; +mul.f64 fd212, fd208, fd187; +mul.f64 fd213, fd209, fd188; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd208, fd188; +fma.rn.f64 fd216, fd209, fd187, fd215; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 6144; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd195; +st.shared.f64 [r33+1024], fd205; +st.shared.f64 [r33+1536], fd214; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+2048]; +ld.shared.f64 fd219, [r34+4096]; +ld.shared.f64 fd220, [r34+6144]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+2048]; +ld.shared.f64 fd223, [r34+4096]; +ld.shared.f64 fd224, [r34+6144]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; 
+add.f64 %0, fd225, fd229; +add.f64 %1, fd226, fd230; +sub.f64 %3, fd228, fd231; +add.f64 %2, fd227, fd232; +sub.f64 %4, fd225, fd229; +sub.f64 %5, fd226, fd230; +add.f64 %7, fd228, fd231; +sub.f64 %6, fd227, fd232; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<479, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<273>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+4096]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 
16320; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+4096]; +ld.shared.v2.f64 {fd69, fd70}, [r13+8192]; +ld.shared.v2.f64 {fd73, fd74}, [r13+12288]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+1024]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 16128; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; 
+sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+4096]; +ld.shared.v2.f64 {fd129, fd130}, [r20+8192]; +ld.shared.v2.f64 {fd133, fd134}, [r20+12288]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 240; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+256]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 15360; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, 
fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+4096]; +ld.shared.v2.f64 {fd189, fd190}, [r26+8192]; +ld.shared.v2.f64 {fd193, fd194}, [r26+12288]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +add.f64 fd207, fd199, fd204; +sub.f64 fd208, fd200, fd203; +sub.f64 fd209, fd199, fd204; +add.f64 fd210, fd200, fd203; +and.b32 r27, r5, 192; +bfe.u32 r28, r5, 6, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd211, fd207; +mul.f64 fd216, fd212, fd208; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd220, fd205; +mul.f64 fd224, fd222, fd206; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+64]; +mul.f64 fd230, fd226, fd209; +mul.f64 fd231, fd227, fd210; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 12288; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd212, fd207, fd217; +sub.f64 fd236, fd215, fd216; +st.shared.v2.f64 [r32+1024], {fd236, fd235}; +fma.rn.f64 fd237, fd222, fd205, fd225; +sub.f64 fd238, fd223, fd224; +st.shared.v2.f64 [r32+2048], {fd238, fd237}; +fma.rn.f64 fd239, fd227, fd209, fd232; +sub.f64 fd240, fd230, fd231; +st.shared.v2.f64 [r32+3072], {fd240, fd239}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+4096]; 
+ld.shared.v2.f64 {fd249, fd250}, [r33+8192]; +ld.shared.v2.f64 {fd253, fd254}, [r33+12288]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +add.f64 %1, fd258, fd262; +add.f64 %0, fd257, fd261; +sub.f64 %3, fd260, fd263; +add.f64 %2, fd259, fd264; +sub.f64 %5, fd258, fd262; +sub.f64 %4, fd257, fd261; +add.f64 %7, fd260, fd263; +sub.f64 %6, fd259, fd264; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<481, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<520>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 
fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+2048]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, 
fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 16256; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+2048]; +ld.shared.v2.f64 {fd166, fd167}, [r13+4096]; +ld.shared.v2.f64 {fd170, fd171}, [r13+6144]; +ld.shared.v2.f64 {fd174, fd175}, [r13+8192]; +ld.shared.v2.f64 {fd178, fd179}, [r13+10240]; +ld.shared.v2.f64 {fd182, fd183}, [r13+12288]; +ld.shared.v2.f64 {fd186, fd187}, [r13+14336]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, 
fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 120; +bfe.u32 r15, r5, 3, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, 
fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+256]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 15360; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; 
+fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+2048]; +ld.shared.v2.f64 {fd323, fd324}, [r20+4096]; +ld.shared.v2.f64 {fd327, fd328}, [r20+6144]; +ld.shared.v2.f64 {fd331, fd332}, [r20+8192]; +ld.shared.v2.f64 {fd335, fd336}, [r20+10240]; +ld.shared.v2.f64 {fd339, fd340}, [r20+12288]; +ld.shared.v2.f64 {fd343, fd344}, [r20+14336]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +add.f64 fd359, fd349, fd354; +sub.f64 fd360, fd350, fd353; +sub.f64 fd361, fd349, fd354; +add.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +add.f64 fd375, fd365, fd370; +sub.f64 fd376, fd366, fd369; +sub.f64 fd377, fd365, fd370; +add.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0dBFE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd383, fd375, 0dBFE6A09E667F3BCD, fd382; +mul.f64 fd384, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd385, fd378, 0dBFE6A09E667F3BCD; +sub.f64 fd386, fd384, fd385; +add.f64 fd387, fd384, fd385; +sub.f64 fd388, fd355, fd371; 
+sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd383; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd383; +add.f64 fd394, fd357, fd374; +sub.f64 fd395, fd358, fd373; +sub.f64 fd396, fd357, fd374; +add.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd386; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd386; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 64; +bfe.u32 r22, r5, 6, 1; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd402, fd390; +mul.f64 fd407, fd403, fd391; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd411, fd394; +mul.f64 fd415, fd413, fd395; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd419, fd398; +mul.f64 fd423, fd421, fd399; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd427, fd388; +mul.f64 fd431, fd429, fd389; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+32]; +mul.f64 fd437, fd433, fd392; +mul.f64 fd438, fd434, fd393; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd442, fd396; +mul.f64 fd446, fd444, fd397; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd450, fd400; +mul.f64 fd454, fd452, fd401; 
+mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 8192; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd403, fd390, fd408; +sub.f64 fd459, fd406, fd407; +st.shared.v2.f64 [r26+1024], {fd459, fd458}; +fma.rn.f64 fd460, fd413, fd394, fd416; +sub.f64 fd461, fd414, fd415; +st.shared.v2.f64 [r26+2048], {fd461, fd460}; +fma.rn.f64 fd462, fd421, fd398, fd424; +sub.f64 fd463, fd422, fd423; +st.shared.v2.f64 [r26+3072], {fd463, fd462}; +sub.f64 fd464, fd430, fd431; +fma.rn.f64 fd465, fd429, fd388, fd432; +st.shared.v2.f64 [r26+4096], {fd464, fd465}; +fma.rn.f64 fd466, fd434, fd392, fd439; +sub.f64 fd467, fd437, fd438; +st.shared.v2.f64 [r26+5120], {fd467, fd466}; +fma.rn.f64 fd468, fd444, fd396, fd447; +sub.f64 fd469, fd445, fd446; +st.shared.v2.f64 [r26+6144], {fd469, fd468}; +fma.rn.f64 fd470, fd452, fd400, fd455; +sub.f64 fd471, fd453, fd454; +st.shared.v2.f64 [r26+7168], {fd471, fd470}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+2048]; +ld.shared.v2.f64 {fd480, fd481}, [r27+4096]; +ld.shared.v2.f64 {fd484, fd485}, [r27+6144]; +ld.shared.v2.f64 {fd488, fd489}, [r27+8192]; +ld.shared.v2.f64 {fd492, fd493}, [r27+10240]; +ld.shared.v2.f64 {fd496, fd497}, [r27+12288]; +ld.shared.v2.f64 {fd500, fd501}, [r27+14336]; +add.f64 %1, fd473, fd489; +add.f64 %0, fd472, fd488; +add.f64 %3, fd477, fd493; +add.f64 %2, fd476, fd492; +add.f64 %5, fd481, fd497; +add.f64 %4, fd480, fd496; +add.f64 %7, fd485, fd501; +add.f64 %6, fd484, fd500; +sub.f64 %9, fd473, fd489; +sub.f64 %8, fd472, fd488; +sub.f64 %11, fd477, fd493; +sub.f64 %10, fd476, fd492; +sub.f64 %13, fd481, fd497; +sub.f64 %12, fd480, fd496; +sub.f64 %15, fd485, fd501; +sub.f64 %14, fd484, fd500; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), 
"=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<483, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<70>; +.reg .f64 fd<166>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %14, %16; +add.f64 fd10, %15, %17; +sub.f64 fd11, %14, %16; +sub.f64 fd12, %15, %17; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8176; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 4088; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+4096]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+4096]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, 
fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8160; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 4080; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+4096]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+4096]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8128; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 4064; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+4096]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+4096]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 6; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8064; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 
r33, r11, 4032; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+4096]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+4096]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 496; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7936; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 3968; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+4096]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+4096]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd98, fd96; +mul.f64 fd103, fd99, fd97; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd98, fd97; +fma.rn.f64 fd106, fd99, fd96, fd105; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7680; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd104; +barrier.sync 0; +and.b32 r47, r11, 3840; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+4096]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+4096]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, 
fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd115, fd113; +mul.f64 fd120, fd116, fd114; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd115, fd114; +fma.rn.f64 fd123, fd116, fd113, fd122; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7168; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd121; +barrier.sync 0; +and.b32 r54, r11, 3584; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+4096]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+4096]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd132, fd130; +mul.f64 fd137, fd133, fd131; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd132, fd131; +fma.rn.f64 fd140, fd133, fd130, fd139; +and.b32 r57, r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 6144; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd138; +barrier.sync 0; +and.b32 r61, r11, 3072; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+4096]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+4096]; +add.f64 fd145, fd141, fd142; +add.f64 fd146, fd143, fd144; +sub.f64 fd147, fd141, fd142; +sub.f64 fd148, fd143, fd144; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd149, fd150}, [rd29]; +mul.f64 
fd153, fd149, fd147; +mul.f64 fd154, fd150, fd148; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd149, fd148; +fma.rn.f64 fd157, fd150, fd147, fd156; +and.b32 r64, r11, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 4096; +add.s32 r67, r65, r66; +st.shared.f64 [r67], fd145; +st.shared.f64 [r67+2048], fd155; +barrier.sync 0; +and.b32 r68, r11, 2048; +sub.s32 r69, r67, r68; +ld.shared.f64 fd158, [r69]; +ld.shared.f64 fd159, [r69+4096]; +barrier.sync 0; +st.shared.f64 [r67], fd146; +st.shared.f64 [r67+2048], fd157; +barrier.sync 0; +ld.shared.f64 fd160, [r69]; +ld.shared.f64 fd161, [r69+4096]; +add.f64 %0, fd158, fd159; +add.f64 %1, fd160, fd161; +sub.f64 %2, fd158, fd159; +sub.f64 %3, fd160, fd161; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<482, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<70>; +.reg .f64 fd<202>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %14, %16; +sub.f64 fd10, %15, %17; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 16352; +add.s32 r11, r8, r10; +add.f64 fd18, %15, %17; +add.f64 fd19, %14, %16; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, 
r9, 8176; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+8192]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16320; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 8160; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+8192]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16256; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 8128; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+8192]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 6; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16128; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, 
fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 8064; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+8192]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 496; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 15872; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 7936; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+8192]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd116, fd114; +mul.f64 fd121, fd117, fd115; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15360; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd117, fd114, fd122; +sub.f64 fd126, fd120, fd121; +st.shared.v2.f64 [r46+512], {fd126, fd125}; +barrier.sync 0; +and.b32 r47, r9, 7680; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+8192]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 
{fd137, fd138}, [rd23]; +mul.f64 fd141, fd137, fd135; +mul.f64 fd142, fd138, fd136; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 14336; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd138, fd135, fd143; +sub.f64 fd147, fd141, fd142; +st.shared.v2.f64 [r53+1024], {fd147, fd146}; +barrier.sync 0; +and.b32 r54, r9, 7168; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+8192]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd158, fd156; +mul.f64 fd163, fd159, fd157; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 12288; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd159, fd156, fd164; +sub.f64 fd168, fd162, fd163; +st.shared.v2.f64 [r60+2048], {fd168, fd167}; +barrier.sync 0; +and.b32 r61, r9, 6144; +sub.s32 r62, r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, fd174}, [r62+8192]; +sub.f64 fd177, fd169, fd173; +sub.f64 fd178, fd170, fd174; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd179, fd180}, [rd29]; +mul.f64 fd183, fd179, fd177; +mul.f64 fd184, fd180, fd178; +mul.f64 fd185, fd179, fd178; +and.b32 r64, r9, 4080; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 8192; +add.s32 r67, r65, r66; +add.f64 fd186, fd170, fd174; +add.f64 fd187, fd169, fd173; +st.shared.v2.f64 [r67], {fd187, fd186}; +fma.rn.f64 fd188, fd180, fd177, fd185; +sub.f64 fd189, fd183, fd184; +st.shared.v2.f64 [r67+4096], {fd189, fd188}; 
+barrier.sync 0; +and.b32 r68, r9, 4096; +sub.s32 r69, r67, r68; +ld.shared.v2.f64 {fd190, fd191}, [r69]; +ld.shared.v2.f64 {fd194, fd195}, [r69+8192]; +add.f64 %1, fd191, fd195; +add.f64 %0, fd190, fd194; +sub.f64 %3, fd191, fd195; +sub.f64 %2, fd190, fd194; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..eb4e6abbe9ccd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1024_fp64_inv.hpp.inc @@ -0,0 +1,3821 @@ +#ifndef CUFFTDX_FFT_1024_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_1024_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<647, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<472>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, 
%24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; 
+sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+2048]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8128; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+1024]; +ld.shared.f64 fd160, [r13+2048]; +ld.shared.f64 fd161, [r13+3072]; +ld.shared.f64 fd162, [r13+4096]; +ld.shared.f64 fd163, [r13+5120]; +ld.shared.f64 fd164, [r13+6144]; +ld.shared.f64 fd165, [r13+7168]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; 
+ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+1024]; +ld.shared.f64 fd168, [r13+2048]; +ld.shared.f64 fd169, [r13+3072]; +ld.shared.f64 fd170, [r13+4096]; +ld.shared.f64 fd171, [r13+5120]; +ld.shared.f64 fd172, [r13+6144]; +ld.shared.f64 fd173, [r13+7168]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, 
fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 120; +bfe.u32 r15, r5, 3, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+256]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; 
+fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 7680; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+1024]; +ld.shared.f64 fd301, [r21+2048]; +ld.shared.f64 fd302, [r21+3072]; +ld.shared.f64 fd303, [r21+4096]; +ld.shared.f64 fd304, [r21+5120]; +ld.shared.f64 fd305, [r21+6144]; +ld.shared.f64 fd306, [r21+7168]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+1024]; +ld.shared.f64 fd309, [r21+2048]; +ld.shared.f64 fd310, [r21+3072]; +ld.shared.f64 fd311, [r21+4096]; +ld.shared.f64 fd312, [r21+5120]; +ld.shared.f64 fd313, [r21+6144]; +ld.shared.f64 fd314, [r21+7168]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +sub.f64 fd327, fd317, fd322; +add.f64 fd328, fd318, fd321; +add.f64 fd329, fd317, fd322; +sub.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, 
fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +sub.f64 fd343, fd333, fd338; +add.f64 fd344, fd334, fd337; +add.f64 fd345, fd333, fd338; +sub.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0d3FE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +add.f64 fd350, fd347, fd348; +mul.f64 fd351, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd352, fd346, 0d3FE6A09E667F3BCD; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd346, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd355, fd345, 0d3FE6A09E667F3BCD, fd354; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd350; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd350; +sub.f64 fd364, fd325, fd342; +add.f64 fd365, fd326, fd341; +add.f64 fd366, fd325, fd342; +sub.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd353; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd353; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 64; +bfe.u32 r23, r5, 6, 1; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd361, fd373; +fma.rn.f64 fd377, fd372, fd360, fd376; +mul.f64 fd378, fd360, fd373; +mul.f64 fd379, fd372, fd361; +sub.f64 fd380, fd379, fd378; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd365, fd385; +fma.rn.f64 fd387, fd383, fd364, fd386; +mul.f64 fd388, fd364, fd385; +mul.f64 fd389, fd383, fd365; +sub.f64 fd390, fd389, fd388; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, 
fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd369, fd395; +fma.rn.f64 fd397, fd393, fd368, fd396; +mul.f64 fd398, fd368, fd395; +mul.f64 fd399, fd393, fd369; +sub.f64 fd400, fd399, fd398; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd359, fd405; +fma.rn.f64 fd407, fd403, fd358, fd406; +mul.f64 fd408, fd358, fd405; +mul.f64 fd409, fd403, fd359; +sub.f64 fd410, fd409, fd408; +ld.global.v2.f64 {fd411, fd412}, [rd11+32]; +mul.f64 fd415, fd363, fd412; +fma.rn.f64 fd416, fd411, fd362, fd415; +mul.f64 fd417, fd362, fd412; +mul.f64 fd418, fd411, fd363; +sub.f64 fd419, fd418, fd417; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd367, fd424; +fma.rn.f64 fd426, fd422, fd366, fd425; +mul.f64 fd427, fd366, fd424; +mul.f64 fd428, fd422, fd367; +sub.f64 fd429, fd428, fd427; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd371, fd434; +fma.rn.f64 fd436, fd432, fd370, fd435; +mul.f64 fd437, fd370, fd434; +mul.f64 fd438, fd432, fd371; +sub.f64 fd439, fd438, fd437; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 4096; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd377; +st.shared.f64 [r27+1024], fd387; +st.shared.f64 [r27+1536], fd397; +st.shared.f64 [r27+2048], fd407; +st.shared.f64 [r27+2560], fd416; +st.shared.f64 [r27+3072], fd426; +st.shared.f64 [r27+3584], fd436; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+1024]; +ld.shared.f64 fd442, [r28+2048]; +ld.shared.f64 fd443, [r28+3072]; +ld.shared.f64 fd444, [r28+4096]; 
+ld.shared.f64 fd445, [r28+5120]; +ld.shared.f64 fd446, [r28+6144]; +ld.shared.f64 fd447, [r28+7168]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+1024]; +ld.shared.f64 fd450, [r28+2048]; +ld.shared.f64 fd451, [r28+3072]; +ld.shared.f64 fd452, [r28+4096]; +ld.shared.f64 fd453, [r28+5120]; +ld.shared.f64 fd454, [r28+6144]; +ld.shared.f64 fd455, [r28+7168]; +add.f64 %0, fd440, fd444; +add.f64 %1, fd448, fd452; +add.f64 %2, fd441, fd445; +add.f64 %3, fd449, fd453; +add.f64 %4, fd442, fd446; +add.f64 %5, fd450, fd454; +add.f64 %6, fd443, fd447; +add.f64 %7, fd451, fd455; +sub.f64 %8, fd440, fd444; +sub.f64 %9, fd448, fd452; +sub.f64 %10, fd441, fd445; +sub.f64 %11, fd449, fd453; +sub.f64 %12, fd442, fd446; +sub.f64 %13, fd450, fd454; +sub.f64 %14, fd443, fd447; +sub.f64 %15, fd451, fd455; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<648, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<839>; +.reg .b64 rd<9>; 
+mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, 
fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; 
+fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, 
fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; 
+mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+1024]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, 
fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8064; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+512]; +ld.shared.f64 fd390, [r13+1024]; +ld.shared.f64 fd391, [r13+1536]; +ld.shared.f64 fd392, [r13+2048]; +ld.shared.f64 fd393, [r13+2560]; +ld.shared.f64 fd394, [r13+3072]; +ld.shared.f64 fd395, [r13+3584]; +ld.shared.f64 fd396, [r13+4096]; +ld.shared.f64 fd397, [r13+4608]; +ld.shared.f64 fd398, [r13+5120]; +ld.shared.f64 fd399, [r13+5632]; +ld.shared.f64 fd400, [r13+6144]; +ld.shared.f64 fd401, [r13+6656]; +ld.shared.f64 fd402, [r13+7168]; +ld.shared.f64 fd403, [r13+7680]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+512]; +ld.shared.f64 fd406, [r13+1024]; +ld.shared.f64 fd407, [r13+1536]; +ld.shared.f64 fd408, [r13+2048]; +ld.shared.f64 fd409, 
[r13+2560]; +ld.shared.f64 fd410, [r13+3072]; +ld.shared.f64 fd411, [r13+3584]; +ld.shared.f64 fd412, [r13+4096]; +ld.shared.f64 fd413, [r13+4608]; +ld.shared.f64 fd414, [r13+5120]; +ld.shared.f64 fd415, [r13+5632]; +ld.shared.f64 fd416, [r13+6144]; +ld.shared.f64 fd417, [r13+6656]; +ld.shared.f64 fd418, [r13+7168]; +ld.shared.f64 fd419, [r13+7680]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; 
+sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 
fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; 
+add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 48; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, 
fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd566, fd668; +fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+64]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, 
fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 6144; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; +st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+512]; +ld.shared.f64 fd745, [r20+1024]; +ld.shared.f64 fd746, [r20+1536]; +ld.shared.f64 fd747, [r20+2048]; +ld.shared.f64 fd748, [r20+2560]; +ld.shared.f64 fd749, [r20+3072]; +ld.shared.f64 fd750, [r20+3584]; +ld.shared.f64 fd751, [r20+4096]; +ld.shared.f64 fd752, [r20+4608]; +ld.shared.f64 fd753, [r20+5120]; +ld.shared.f64 fd754, [r20+5632]; +ld.shared.f64 fd755, [r20+6144]; +ld.shared.f64 fd756, [r20+6656]; +ld.shared.f64 fd757, 
[r20+7168]; +ld.shared.f64 fd758, [r20+7680]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+512]; +ld.shared.f64 fd761, [r20+1024]; +ld.shared.f64 fd762, [r20+1536]; +ld.shared.f64 fd763, [r20+2048]; +ld.shared.f64 fd764, [r20+2560]; +ld.shared.f64 fd765, [r20+3072]; +ld.shared.f64 fd766, [r20+3584]; +ld.shared.f64 fd767, [r20+4096]; +ld.shared.f64 fd768, [r20+4608]; +ld.shared.f64 fd769, [r20+5120]; +ld.shared.f64 fd770, [r20+5632]; +ld.shared.f64 fd771, [r20+6144]; +ld.shared.f64 fd772, [r20+6656]; +ld.shared.f64 fd773, [r20+7168]; +ld.shared.f64 fd774, [r20+7680]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd744, fd752; +add.f64 fd784, fd760, fd768; +sub.f64 fd785, fd744, fd752; +sub.f64 fd786, fd760, fd768; +add.f64 fd787, fd748, fd756; +add.f64 fd788, fd764, fd772; +sub.f64 fd789, fd748, fd756; +sub.f64 fd790, fd764, fd772; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd746, fd754; +add.f64 fd800, fd762, fd770; +sub.f64 fd801, fd746, fd754; +sub.f64 fd802, fd762, fd770; +add.f64 
fd803, fd750, fd758; +add.f64 fd804, fd766, fd774; +sub.f64 fd805, fd750, fd758; +sub.f64 fd806, fd766, fd774; +add.f64 %0, fd775, fd779; +add.f64 %1, fd776, fd780; +add.f64 %2, fd783, fd787; +add.f64 %3, fd784, fd788; +add.f64 %4, fd791, fd795; +add.f64 %5, fd792, fd796; +add.f64 %6, fd799, fd803; +add.f64 %7, fd800, fd804; +add.f64 %9, fd778, fd781; +sub.f64 %8, fd777, fd782; +add.f64 %11, fd786, fd789; +sub.f64 %10, fd785, fd790; +add.f64 %13, fd794, fd797; +sub.f64 %12, fd793, fd798; +add.f64 %15, fd802, fd805; +sub.f64 %14, fd801, fd806; +sub.f64 %16, fd775, fd779; +sub.f64 %17, fd776, fd780; +sub.f64 %18, fd783, fd787; +sub.f64 %19, fd784, fd788; +sub.f64 %20, fd791, fd795; +sub.f64 %21, fd792, fd796; +sub.f64 %22, fd799, fd803; +sub.f64 %23, fd800, fd804; +sub.f64 %25, fd778, fd781; +add.f64 %24, fd777, fd782; +sub.f64 %27, fd786, fd789; +add.f64 %26, fd785, fd790; +sub.f64 %29, fd794, fd797; +add.f64 %28, fd793, fd798; +sub.f64 %31, fd802, fd805; +add.f64 %30, fd801, fd806; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), 
"d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<651, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<1092>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1083, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1081, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1080, fd1083, fd1081; +sub.f64 fd76, fd1083, fd1081; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd1079, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1076, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1074, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1073, fd1076, fd1074; +sub.f64 fd92, fd1076, fd1074; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd1072, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd1072, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd1070, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd1071, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd1070, fd1071; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1069, fd1080, fd1073; +sub.f64 fd109, fd1080, fd1073; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, 
fd99; +add.f64 fd1068, fd1079, fd100; +sub.f64 fd113, fd1079, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd1067, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd1066, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1064, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1061, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1060, fd1064, fd1061; +sub.f64 fd133, fd1064, fd1061; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd1059, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1057, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1055, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1054, fd1057, fd1055; +sub.f64 fd149, fd1057, fd1055; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd1053, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd1053, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd1051, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd1052, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd1051, fd1052; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1050, fd1060, fd1054; +sub.f64 fd166, fd1060, fd1054; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1049, fd1059, fd157; +sub.f64 fd170, fd1059, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd1048, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, 
fd160; +add.f64 fd1047, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1045, fd167, 0d3FED906BCF328D46; +mul.f64 fd1046, fd1049, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd1045, fd1046; +mul.f64 fd182, fd1049, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd1048, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd1047, 0d3FED906BCF328D46; +mul.f64 fd1044, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd1044, fd189; +mul.f64 fd191, fd1047, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd1043, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd1043, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd1041, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd1042, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd1041, fd1042; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd1039, fd177, 0dBFED906BCF328D46; +mul.f64 fd1040, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd1039, fd1040; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1038, fd1068, fd183; +sub.f64 fd213, fd1068, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1037, fd1067, fd187; +sub.f64 fd217, fd1067, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd1036, fd1066, fd192; +sub.f64 fd221, fd1066, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd1035, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd1034, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd1033, fd117, fd202; 
+sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1032, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd1038, fd239; +mul.f64 fd244, fd238, fd1038; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1031, fd238, fd238; +sub.f64 fd247, fd1031, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd1037, fd249; +mul.f64 fd252, fd247, fd1037; +mul.f64 fd1029, fd238, fd247; +mul.f64 fd1030, fd239, fd249; +sub.f64 fd255, fd1029, fd1030; +mul.f64 fd1028, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd1036, fd257; +mul.f64 fd260, fd255, fd1036; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1027, fd238, fd255; +sub.f64 fd263, fd1027, fd262; +mul.f64 fd1026, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd1035, fd265; +mul.f64 fd268, fd263, fd1035; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1025, fd238, fd263; +sub.f64 fd271, fd1025, fd270; +mul.f64 fd1024, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd1034, fd273; +mul.f64 fd276, fd271, fd1034; +mul.f64 fd1022, fd238, fd271; +mul.f64 fd1023, fd239, fd273; +sub.f64 fd279, fd1022, fd1023; +mul.f64 fd1021, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd1033, fd281; +mul.f64 fd284, fd279, fd1033; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1020, fd238, fd279; +sub.f64 fd287, fd1020, fd286; +mul.f64 fd1019, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd1032, fd289; +mul.f64 fd292, fd287, fd1032; +mul.f64 fd294, fd239, fd289; +mul.f64 
fd1018, fd238, fd287; +sub.f64 fd295, fd1018, fd294; +mul.f64 fd1017, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1016, fd1069, fd1050; +mul.f64 fd298, fd1016, fd297; +sub.f64 fd1015, fd106, fd163; +mul.f64 fd299, fd1015, fd297; +mul.f64 fd300, fd295, fd1016; +ld.global.v2.f64 {fd301, fd302}, [rd5+1024]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1013, fd238, fd301; +mul.f64 fd1014, fd239, fd302; +sub.f64 fd310, fd1013, fd1014; +mul.f64 fd1012, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1011, fd238, fd310; +sub.f64 fd318, fd1011, fd317; +mul.f64 fd1010, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1008, fd238, fd318; +mul.f64 fd1009, fd239, fd320; +sub.f64 fd326, fd1008, fd1009; +mul.f64 fd1007, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1005, fd238, fd326; +mul.f64 fd1006, fd239, fd328; +sub.f64 fd334, fd1005, fd1006; +mul.f64 fd1004, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1003, fd238, fd334; +sub.f64 fd342, fd1003, fd341; +mul.f64 fd1002, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1000, fd238, fd342; +mul.f64 fd1001, fd239, fd344; +sub.f64 fd350, fd1000, fd1001; +mul.f64 fd999, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd998, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, 
fd237; +barrier.sync 0; +and.b32 r11, r7, 16128; +add.s32 r12, r9, r11; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 8; +add.f64 fd356, fd1069, fd1050; +sub.f64 fd1089, fd106, fd163; +and.b32 r23, r34, 16128; +add.s32 r22, r9, r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 4; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd998; +st.shared.v2.f64 [r22+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd1028; +st.shared.v2.f64 [r22+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd1026; +st.shared.v2.f64 [r22+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd1024; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r22+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd1021; +st.shared.v2.f64 [r22+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd1019; +st.shared.v2.f64 [r22+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd1017; +st.shared.v2.f64 [r22+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd1089, fd298; +sub.f64 fd373, fd300, fd299; +st.shared.v2.f64 [r22+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd1012; +st.shared.v2.f64 [r22+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd1010; +st.shared.v2.f64 [r22+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd1007; +st.shared.v2.f64 [r22+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd1004; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r22+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd1002; +st.shared.v2.f64 [r22+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd999; +st.shared.v2.f64 [r22+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; 
+sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r22+240], {fd386, fd387}; +barrier.sync 0; +and.b32 r20, r33, 63; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+1024]; +ld.shared.v2.f64 {fd396, fd397}, [r13+2048]; +ld.shared.v2.f64 {fd400, fd401}, [r13+3072]; +ld.shared.v2.f64 {fd404, fd405}, [r13+4096]; +ld.shared.v2.f64 {fd408, fd409}, [r13+5120]; +ld.shared.v2.f64 {fd412, fd413}, [r13+6144]; +ld.shared.v2.f64 {fd416, fd417}, [r13+7168]; +ld.shared.v2.f64 {fd420, fd421}, [r13+8192]; +ld.shared.v2.f64 {fd424, fd425}, [r13+9216]; +ld.shared.v2.f64 {fd428, fd429}, [r13+10240]; +ld.shared.v2.f64 {fd432, fd433}, [r13+11264]; +ld.shared.v2.f64 {fd436, fd437}, [r13+12288]; +ld.shared.v2.f64 {fd440, fd441}, [r13+13312]; +ld.shared.v2.f64 {fd444, fd445}, [r13+14336]; +ld.shared.v2.f64 {fd448, fd449}, [r13+15360]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd997, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd996, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd995, fd997, fd996; +sub.f64 fd463, fd997, fd996; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd994, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd993, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd992, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd991, fd993, fd992; +sub.f64 fd479, fd993, fd992; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd990, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd990, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 
fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd989, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd989, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd988, fd995, fd991; +sub.f64 fd496, fd995, fd991; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd987, fd994, fd487; +sub.f64 fd500, fd994, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd986, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd985, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd984, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd983, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd982, fd984, fd983; +sub.f64 fd520, fd984, fd983; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd981, fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd980, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd979, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd978, fd980, fd979; +sub.f64 fd536, fd980, fd979; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd977, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd977, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd976, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd976, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, 
fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd975, fd982, fd978; +sub.f64 fd553, fd982, fd978; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd974, fd981, fd544; +sub.f64 fd557, fd981, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd973, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd972, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd970, fd554, 0d3FED906BCF328D46; +mul.f64 fd971, fd974, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd970, fd971; +mul.f64 fd569, fd974, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; +mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd973, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd968, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd969, fd972, 0d3FED906BCF328D46; +sub.f64 fd577, fd968, fd969; +mul.f64 fd578, fd972, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd966, fd556, 0dBFD87DE2A6AEA963; +mul.f64 fd967, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd966, fd967; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd964, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd965, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd964, fd965; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd963, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd963, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd962, fd987, fd570; +sub.f64 fd600, fd987, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd961, fd986, fd574; +sub.f64 fd604, fd986, fd574; +add.f64 fd605, fd505, fd577; +sub.f64 fd607, fd505, fd577; 
+add.f64 fd960, fd985, fd579; +sub.f64 fd608, fd985, fd579; +sub.f64 fd609, fd495, fd553; +add.f64 fd611, fd495, fd553; +add.f64 fd959, fd496, fd552; +sub.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd582; +sub.f64 fd615, fd499, fd582; +add.f64 fd958, fd500, fd584; +sub.f64 fd616, fd500, fd584; +add.f64 fd617, fd503, fd587; +sub.f64 fd619, fd503, fd587; +add.f64 fd957, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd956, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r33, 48; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd629, fd962, fd626; +mul.f64 fd631, fd625, fd962; +mul.f64 fd633, fd626, fd626; +mul.f64 fd955, fd625, fd625; +sub.f64 fd634, fd955, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd637, fd961, fd636; +mul.f64 fd639, fd634, fd961; +mul.f64 fd953, fd625, fd634; +mul.f64 fd954, fd626, fd636; +sub.f64 fd642, fd953, fd954; +mul.f64 fd952, fd601, fd636; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd645, fd960, fd644; +mul.f64 fd647, fd642, fd960; +mul.f64 fd649, fd626, fd644; +mul.f64 fd951, fd625, fd642; +sub.f64 fd650, fd951, fd649; +mul.f64 fd950, fd605, fd644; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd653, fd959, fd652; +mul.f64 fd655, fd650, fd959; +mul.f64 fd948, fd625, fd650; +mul.f64 fd949, fd626, fd652; +sub.f64 fd658, fd948, fd949; +mul.f64 fd947, fd609, fd652; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd661, fd958, fd660; +mul.f64 fd663, fd658, fd958; +mul.f64 fd945, fd625, fd658; +mul.f64 fd946, fd626, fd660; +sub.f64 fd666, fd945, fd946; +mul.f64 fd944, fd613, fd660; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd669, fd957, fd668; +mul.f64 fd671, fd666, fd957; +mul.f64 fd673, fd626, fd668; +mul.f64 fd943, 
fd625, fd666; +sub.f64 fd674, fd943, fd673; +mul.f64 fd942, fd617, fd668; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd677, fd956, fd676; +mul.f64 fd679, fd674, fd956; +mul.f64 fd940, fd625, fd674; +mul.f64 fd941, fd626, fd676; +sub.f64 fd682, fd940, fd941; +mul.f64 fd939, fd621, fd676; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd938, fd988, fd975; +mul.f64 fd685, fd938, fd684; +sub.f64 fd937, fd493, fd550; +mul.f64 fd686, fd937, fd684; +mul.f64 fd687, fd682, fd938; +ld.global.v2.f64 {fd688, fd689}, [rd8+64]; +mul.f64 fd692, fd600, fd689; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd936, fd625, fd688; +sub.f64 fd697, fd936, fd696; +mul.f64 fd935, fd599, fd689; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd700, fd604, fd699; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd934, fd625, fd697; +sub.f64 fd705, fd934, fd704; +mul.f64 fd933, fd603, fd699; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd708, fd608, fd707; +mul.f64 fd710, fd705, fd608; +mul.f64 fd931, fd625, fd705; +mul.f64 fd932, fd626, fd707; +sub.f64 fd713, fd931, fd932; +mul.f64 fd930, fd607, fd707; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd716, fd612, fd715; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd929, fd625, fd713; +sub.f64 fd721, fd929, fd720; +mul.f64 fd928, fd611, fd715; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd724, fd616, fd723; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd927, fd625, fd721; +sub.f64 fd729, fd927, fd728; +mul.f64 fd926, fd615, fd723; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd732, fd620, fd731; +mul.f64 fd734, fd729, fd620; +mul.f64 fd924, fd625, fd729; +mul.f64 fd925, fd626, fd731; +sub.f64 fd737, fd924, fd925; +mul.f64 
fd923, fd619, fd731; +mul.f64 fd738, fd625, fd731; +mul.f64 fd922, fd597, fd626; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd624, fd739; +mul.f64 fd741, fd623, fd739; +mul.f64 fd742, fd737, fd624; +and.b32 r15, r32, 240; +add.s32 r16, r9, r15; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 8; +barrier.sync 0; +and.b32 r17, r27, 12288; +add.s32 r18, r16, r17; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 48; +add.f64 fd743, fd988, fd975; +sub.f64 fd1088, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r37, %tid.x; +and.b32 r36, r37, 48; +fma.rn.f64 fd745, fd625, fd597, fd629; +sub.f64 fd746, fd631, fd922; +st.shared.v2.f64 [r18+256], {fd745, fd746}; +fma.rn.f64 fd747, fd634, fd601, fd637; +sub.f64 fd748, fd639, fd952; +st.shared.v2.f64 [r18+512], {fd747, fd748}; +fma.rn.f64 fd749, fd642, fd605, fd645; +sub.f64 fd750, fd647, fd950; +st.shared.v2.f64 [r18+768], {fd749, fd750}; +fma.rn.f64 fd751, fd650, fd609, fd653; +sub.f64 fd752, fd655, fd947; +st.shared.v2.f64 [r18+1024], {fd751, fd752}; +sub.f64 fd753, fd663, fd944; +fma.rn.f64 fd754, fd658, fd613, fd661; +st.shared.v2.f64 [r18+1280], {fd754, fd753}; +fma.rn.f64 fd755, fd666, fd617, fd669; +sub.f64 fd756, fd671, fd942; +st.shared.v2.f64 [r18+1536], {fd755, fd756}; +fma.rn.f64 fd757, fd674, fd621, fd677; +sub.f64 fd758, fd679, fd939; +st.shared.v2.f64 [r18+1792], {fd757, fd758}; +fma.rn.f64 fd759, fd682, fd1088, fd685; +sub.f64 fd760, fd687, fd686; +st.shared.v2.f64 [r18+2048], {fd759, fd760}; +fma.rn.f64 fd761, fd688, fd599, fd692; +sub.f64 fd762, fd694, fd935; +st.shared.v2.f64 [r18+2304], {fd761, fd762}; +fma.rn.f64 fd763, fd697, fd603, fd700; +sub.f64 fd764, fd702, fd933; +st.shared.v2.f64 [r18+2560], {fd763, fd764}; +fma.rn.f64 fd765, fd705, fd607, fd708; +sub.f64 fd766, fd710, fd930; +st.shared.v2.f64 [r18+2816], {fd765, fd766}; +fma.rn.f64 fd767, fd713, fd611, fd716; +sub.f64 fd768, fd718, fd928; +st.shared.v2.f64 [r18+3072], {fd767, fd768}; +sub.f64 fd769, 
fd726, fd926; +fma.rn.f64 fd770, fd721, fd615, fd724; +st.shared.v2.f64 [r18+3328], {fd770, fd769}; +fma.rn.f64 fd771, fd729, fd619, fd732; +sub.f64 fd772, fd734, fd923; +st.shared.v2.f64 [r18+3584], {fd771, fd772}; +fma.rn.f64 fd773, fd737, fd623, fd740; +sub.f64 fd774, fd742, fd741; +st.shared.v2.f64 [r18+3840], {fd773, fd774}; +barrier.sync 0; +mad.lo.s32 r19, r36, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+1024]; +ld.shared.v2.f64 {fd783, fd784}, [r19+2048]; +ld.shared.v2.f64 {fd787, fd788}, [r19+3072]; +ld.shared.v2.f64 {fd791, fd792}, [r19+4096]; +ld.shared.v2.f64 {fd795, fd796}, [r19+5120]; +ld.shared.v2.f64 {fd799, fd800}, [r19+6144]; +ld.shared.v2.f64 {fd803, fd804}, [r19+7168]; +ld.shared.v2.f64 {fd807, fd808}, [r19+8192]; +ld.shared.v2.f64 {fd811, fd812}, [r19+9216]; +ld.shared.v2.f64 {fd815, fd816}, [r19+10240]; +ld.shared.v2.f64 {fd819, fd820}, [r19+11264]; +ld.shared.v2.f64 {fd823, fd824}, [r19+12288]; +ld.shared.v2.f64 {fd827, fd828}, [r19+13312]; +ld.shared.v2.f64 {fd831, fd832}, [r19+14336]; +ld.shared.v2.f64 {fd835, fd836}, [r19+15360]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd921, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd920, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd779, fd811; +sub.f64 fd849, fd779, fd811; +add.f64 fd919, fd780, fd812; +sub.f64 fd850, fd780, fd812; +add.f64 fd851, fd795, fd827; +sub.f64 fd853, fd795, fd827; +add.f64 fd918, fd796, fd828; +sub.f64 fd854, fd796, fd828; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd917, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd916, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd787, fd819; +sub.f64 fd865, fd787, fd819; +add.f64 fd915, fd788, fd820; +sub.f64 fd866, fd788, fd820; +add.f64 fd867, fd803, fd835; +sub.f64 
fd869, fd803, fd835; +add.f64 fd914, fd804, fd836; +sub.f64 fd870, fd804, fd836; +add.f64 %0, fd839, fd843; +add.f64 %1, fd921, fd920; +add.f64 %2, fd847, fd851; +add.f64 %3, fd919, fd918; +add.f64 %4, fd855, fd859; +add.f64 %5, fd917, fd916; +add.f64 %7, fd915, fd914; +add.f64 %6, fd863, fd867; +add.f64 %9, fd842, fd845; +sub.f64 %8, fd841, fd846; +add.f64 %11, fd850, fd853; +sub.f64 %10, fd849, fd854; +sub.f64 %12, fd857, fd862; +add.f64 %13, fd858, fd861; +sub.f64 %14, fd865, fd870; +add.f64 %15, fd866, fd869; +sub.f64 %17, fd921, fd920; +sub.f64 %16, fd839, fd843; +sub.f64 %19, fd919, fd918; +sub.f64 %18, fd847, fd851; +sub.f64 %21, fd917, fd916; +sub.f64 %20, fd855, fd859; +sub.f64 %23, fd915, fd914; +sub.f64 %22, fd863, fd867; +sub.f64 %25, fd842, fd845; +add.f64 %24, fd841, fd846; +sub.f64 %27, fd850, fd853; +add.f64 %26, fd849, fd854; +sub.f64 %29, fd858, fd861; +add.f64 %28, fd857, fd862; +sub.f64 %31, fd866, fd869; +add.f64 %30, fd865, fd870; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), 
"d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<649, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<241>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+4096]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8160; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; 
+st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+2048]; +ld.shared.f64 fd63, [r13+4096]; +ld.shared.f64 fd64, [r13+6144]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+2048]; +ld.shared.f64 fd67, [r13+4096]; +ld.shared.f64 fd68, [r13+6144]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+1024]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 8064; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; 
+ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+2048]; +ld.shared.f64 fd115, [r21+4096]; +ld.shared.f64 fd116, [r21+6144]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+2048]; +ld.shared.f64 fd119, [r21+4096]; +ld.shared.f64 fd120, [r21+6144]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 240; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+256]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 7680; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, 
-24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+2048]; +ld.shared.f64 fd167, [r27+4096]; +ld.shared.f64 fd168, [r27+6144]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+2048]; +ld.shared.f64 fd171, [r27+4096]; +ld.shared.f64 fd172, [r27+6144]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd180; +add.f64 fd186, fd176, fd179; +add.f64 fd187, fd175, fd180; +sub.f64 fd188, fd176, fd179; +and.b32 r28, r5, 192; +bfe.u32 r29, r5, 6, 2; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd186, fd190; +fma.rn.f64 fd194, fd189, fd185, fd193; +mul.f64 fd195, fd185, fd190; +mul.f64 fd196, fd189, fd186; +sub.f64 fd197, fd196, fd195; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd184, fd202; +fma.rn.f64 fd204, fd200, fd183, fd203; +mul.f64 fd205, fd183, fd202; +mul.f64 fd206, fd200, fd184; +sub.f64 fd207, fd206, fd205; +ld.global.v2.f64 {fd208, fd209}, [rd14+64]; +mul.f64 fd212, fd188, fd209; +fma.rn.f64 fd213, fd208, fd187, fd212; +mul.f64 fd214, fd187, fd209; +mul.f64 fd215, fd208, fd188; +sub.f64 fd216, fd215, fd214; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 6144; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd194; +st.shared.f64 [r33+1024], fd204; +st.shared.f64 [r33+1536], 
fd213; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+2048]; +ld.shared.f64 fd219, [r34+4096]; +ld.shared.f64 fd220, [r34+6144]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+2048]; +ld.shared.f64 fd223, [r34+4096]; +ld.shared.f64 fd224, [r34+6144]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; +add.f64 %0, fd225, fd229; +add.f64 %1, fd226, fd230; +add.f64 %3, fd228, fd231; +sub.f64 %2, fd227, fd232; +sub.f64 %4, fd225, fd229; +sub.f64 %5, fd226, fd230; +sub.f64 %7, fd228, fd231; +add.f64 %6, fd227, fd232; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + +/* cufftdx_private_function<650, double, 1>: inline-PTX routine that reads the .x/.y fields of rmem[0..3] as f64 inputs and overwrites the same eight fields with its results. Shared-memory addressing starts at smem + (tid.y << 14). Data is passed through shared memory in four rounds of st.shared.v2.f64 / barrier.sync 0 / ld.shared.v2.f64; in this variant each round moves two-element f64 vectors in a single store/load pass. Each round is preceded by ld.global.v2.f64 reads from one lookup table, in the order lut_dp_4_1024, lut_dp_4_256, lut_dp_4_64, lut_dp_4_16. NOTE(review): presumably generated FFT code whose instruction order and operand numbering must stay untouched - confirm. NOTE(review): the input operand list repeats rmem[1].y and rmem[2].y (inputs %16 and %19 are never referenced by the PTX text) and the statement declares no memory clobber despite the shared-memory stores - confirm both are intentional. */ +template<> __forceinline__ __device__ void cufftdx_private_function<650, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<273>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 
fd30, fd20, fd23; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+4096]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 16320; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+4096]; +ld.shared.v2.f64 {fd69, fd70}, [r13+8192]; +ld.shared.v2.f64 {fd73, fd74}, [r13+12288]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 252; +bfe.u32 r15, r5, 2, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; 
+mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+1024]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 16128; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+4096]; +ld.shared.v2.f64 {fd129, fd130}, [r20+8192]; +ld.shared.v2.f64 {fd133, fd134}, [r20+12288]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 240; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, 
fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+256]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 15360; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; +fma.rn.f64 fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+4096]; +ld.shared.v2.f64 {fd189, fd190}, [r26+8192]; +ld.shared.v2.f64 {fd193, fd194}, [r26+12288]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +sub.f64 fd207, fd199, fd204; +add.f64 fd208, fd200, fd203; +add.f64 fd209, fd199, fd204; +sub.f64 fd210, fd200, fd203; +and.b32 r27, r5, 192; +bfe.u32 r28, r5, 6, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd208, fd212; +mul.f64 fd216, fd207, fd212; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd206, fd222; +mul.f64 fd224, fd205, fd222; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+64]; +mul.f64 fd230, fd210, fd227; +mul.f64 fd231, fd209, fd227; +mul.f64 
fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 12288; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd211, fd207, fd215; +sub.f64 fd236, fd217, fd216; +st.shared.v2.f64 [r32+1024], {fd235, fd236}; +fma.rn.f64 fd237, fd220, fd205, fd223; +sub.f64 fd238, fd225, fd224; +st.shared.v2.f64 [r32+2048], {fd237, fd238}; +fma.rn.f64 fd239, fd226, fd209, fd230; +sub.f64 fd240, fd232, fd231; +st.shared.v2.f64 [r32+3072], {fd239, fd240}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+4096]; +ld.shared.v2.f64 {fd249, fd250}, [r33+8192]; +ld.shared.v2.f64 {fd253, fd254}, [r33+12288]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +add.f64 %1, fd258, fd262; +add.f64 %0, fd257, fd261; +add.f64 %3, fd260, fd263; +sub.f64 %2, fd259, fd264; +sub.f64 %5, fd258, fd262; +sub.f64 %4, fd257, fd261; +sub.f64 %7, fd260, fd263; +add.f64 %6, fd259, fd264; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + +/* cufftdx_private_function<652, double, 1>: inline-PTX routine that reads the .x/.y fields of rmem[0..7] as f64 inputs and overwrites the same sixteen fields with its results. Shared-memory addressing starts at smem + (tid.y << 14). Data is passed through shared memory in three rounds of st.shared.v2.f64 / barrier.sync 0 / ld.shared.v2.f64, eight two-element f64 vectors per round. Each round is preceded by ld.global.v2.f64 reads from one lookup table, in the order lut_dp_8_1024, lut_dp_8_128, lut_dp_8_16. NOTE(review): presumably generated FFT code whose instruction order and operand numbering must stay untouched - confirm. NOTE(review): the input operand list repeats rmem[1].y, rmem[2].y, rmem[4].y and rmem[5].y (inputs %23, %26, %31 and %34 are never referenced by the PTX text) and the statement declares no memory clobber despite the shared-memory stores - confirm both are intentional. */ +template<> __forceinline__ __device__ void cufftdx_private_function<652, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<520>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 
fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, 
fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+2048]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 16256; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 
[r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+2048]; +ld.shared.v2.f64 {fd166, fd167}, [r13+4096]; +ld.shared.v2.f64 {fd170, fd171}, [r13+6144]; +ld.shared.v2.f64 {fd174, fd175}, [r13+8192]; +ld.shared.v2.f64 {fd178, fd179}, [r13+10240]; +ld.shared.v2.f64 {fd182, fd183}, [r13+12288]; +ld.shared.v2.f64 {fd186, fd187}, [r13+14336]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; 
+sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 120; +bfe.u32 r15, r5, 3, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+256]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; 
+and.b32 r18, r7, 15360; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+2048]; +ld.shared.v2.f64 {fd323, fd324}, [r20+4096]; +ld.shared.v2.f64 {fd327, fd328}, [r20+6144]; +ld.shared.v2.f64 {fd331, fd332}, [r20+8192]; +ld.shared.v2.f64 {fd335, fd336}, [r20+10240]; +ld.shared.v2.f64 {fd339, fd340}, [r20+12288]; +ld.shared.v2.f64 {fd343, fd344}, [r20+14336]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +sub.f64 fd359, fd349, fd354; +add.f64 fd360, fd350, fd353; +add.f64 fd361, fd349, fd354; +sub.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, 
fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +sub.f64 fd375, fd365, fd370; +add.f64 fd376, fd366, fd369; +add.f64 fd377, fd365, fd370; +sub.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0d3FE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +add.f64 fd382, fd379, fd380; +mul.f64 fd383, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd384, fd378, 0d3FE6A09E667F3BCD; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd378, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd387, fd377, 0d3FE6A09E667F3BCD, fd386; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd382; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd382; +sub.f64 fd394, fd357, fd374; +add.f64 fd395, fd358, fd373; +add.f64 fd396, fd357, fd374; +sub.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd385; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd385; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 64; +bfe.u32 r22, r5, 6, 1; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd391, fd403; +mul.f64 fd407, fd390, fd403; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd395, fd413; +mul.f64 fd415, fd394, fd413; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd399, fd421; +mul.f64 fd423, fd398, fd421; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 
fd429, fd403, fd419, fd428; +mul.f64 fd430, fd389, fd429; +mul.f64 fd431, fd388, fd429; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+32]; +mul.f64 fd437, fd393, fd434; +mul.f64 fd438, fd392, fd434; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd397, fd444; +mul.f64 fd446, fd396, fd444; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd401, fd452; +mul.f64 fd454, fd400, fd452; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 8192; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd402, fd390, fd406; +sub.f64 fd459, fd408, fd407; +st.shared.v2.f64 [r26+1024], {fd458, fd459}; +fma.rn.f64 fd460, fd411, fd394, fd414; +sub.f64 fd461, fd416, fd415; +st.shared.v2.f64 [r26+2048], {fd460, fd461}; +fma.rn.f64 fd462, fd419, fd398, fd422; +sub.f64 fd463, fd424, fd423; +st.shared.v2.f64 [r26+3072], {fd462, fd463}; +sub.f64 fd464, fd432, fd431; +fma.rn.f64 fd465, fd427, fd388, fd430; +st.shared.v2.f64 [r26+4096], {fd465, fd464}; +fma.rn.f64 fd466, fd433, fd392, fd437; +sub.f64 fd467, fd439, fd438; +st.shared.v2.f64 [r26+5120], {fd466, fd467}; +fma.rn.f64 fd468, fd442, fd396, fd445; +sub.f64 fd469, fd447, fd446; +st.shared.v2.f64 [r26+6144], {fd468, fd469}; +fma.rn.f64 fd470, fd450, fd400, fd453; +sub.f64 fd471, fd455, fd454; +st.shared.v2.f64 [r26+7168], {fd470, fd471}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+2048]; +ld.shared.v2.f64 {fd480, fd481}, [r27+4096]; +ld.shared.v2.f64 {fd484, fd485}, 
[r27+6144]; +ld.shared.v2.f64 {fd488, fd489}, [r27+8192]; +ld.shared.v2.f64 {fd492, fd493}, [r27+10240]; +ld.shared.v2.f64 {fd496, fd497}, [r27+12288]; +ld.shared.v2.f64 {fd500, fd501}, [r27+14336]; +add.f64 %1, fd473, fd489; +add.f64 %0, fd472, fd488; +add.f64 %3, fd477, fd493; +add.f64 %2, fd476, fd492; +add.f64 %5, fd481, fd497; +add.f64 %4, fd480, fd496; +add.f64 %7, fd485, fd501; +add.f64 %6, fd484, fd500; +sub.f64 %9, fd473, fd489; +sub.f64 %8, fd472, fd488; +sub.f64 %11, fd477, fd493; +sub.f64 %10, fd476, fd492; +sub.f64 %13, fd481, fd497; +sub.f64 %12, fd480, fd496; +sub.f64 %15, fd485, fd501; +sub.f64 %14, fd484, fd500; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<654, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<70>; +.reg .f64 fd<166>; +.reg .b64 rd<30>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %14, %16; +add.f64 fd10, %15, %17; +sub.f64 fd11, %14, %16; +sub.f64 fd12, %15, %17; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; 
+sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8176; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 4088; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+4096]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+4096]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8160; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 4080; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+4096]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+4096]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8128; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 4064; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+4096]; +barrier.sync 0; 
+st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+4096]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 6; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8064; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 4032; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+4096]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+4096]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 496; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7936; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 3968; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+4096]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+4096]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; 
+add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd97, fd99; +fma.rn.f64 fd103, fd98, fd96, fd102; +mul.f64 fd104, fd96, fd99; +mul.f64 fd105, fd98, fd97; +sub.f64 fd106, fd105, fd104; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7680; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd103; +barrier.sync 0; +and.b32 r47, r11, 3840; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+4096]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+4096]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd114, fd116; +fma.rn.f64 fd120, fd115, fd113, fd119; +mul.f64 fd121, fd113, fd116; +mul.f64 fd122, fd115, fd114; +sub.f64 fd123, fd122, fd121; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7168; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd120; +barrier.sync 0; +and.b32 r54, r11, 3584; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+4096]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+4096]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd131, fd133; +fma.rn.f64 fd137, fd132, fd130, fd136; +mul.f64 fd138, fd130, fd133; +mul.f64 fd139, fd132, fd131; +sub.f64 fd140, fd139, fd138; +and.b32 r57, 
r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 6144; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd137; +barrier.sync 0; +and.b32 r61, r11, 3072; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+4096]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+4096]; +add.f64 fd145, fd141, fd142; +add.f64 fd146, fd143, fd144; +sub.f64 fd147, fd141, fd142; +sub.f64 fd148, fd143, fd144; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd149, fd150}, [rd29]; +mul.f64 fd153, fd148, fd150; +fma.rn.f64 fd154, fd149, fd147, fd153; +mul.f64 fd155, fd147, fd150; +mul.f64 fd156, fd149, fd148; +sub.f64 fd157, fd156, fd155; +and.b32 r64, r11, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 4096; +add.s32 r67, r65, r66; +st.shared.f64 [r67], fd145; +st.shared.f64 [r67+2048], fd154; +barrier.sync 0; +and.b32 r68, r11, 2048; +sub.s32 r69, r67, r68; +ld.shared.f64 fd158, [r69]; +ld.shared.f64 fd159, [r69+4096]; +barrier.sync 0; +st.shared.f64 [r67], fd146; +st.shared.f64 [r67+2048], fd157; +barrier.sync 0; +ld.shared.f64 fd160, [r69]; +ld.shared.f64 fd161, [r69+4096]; +add.f64 %0, fd158, fd159; +add.f64 %1, fd160, fd161; +sub.f64 %2, fd158, fd159; +sub.f64 %3, fd160, fd161; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<653, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<70>; +.reg .f64 fd<202>; +.reg .b64 rd<30>; +mov.u32 
r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %14, %16; +sub.f64 fd10, %15, %17; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 16352; +add.s32 r11, r8, r10; +add.f64 fd18, %15, %17; +add.f64 fd19, %14, %16; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 8176; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+8192]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 8; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16320; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 8160; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+8192]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 7; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16256; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, 
fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 8128; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+8192]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 6; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16128; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 8064; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+8192]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 496; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 15872; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; +sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 7936; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+8192]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 4; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd115, 
fd117; +mul.f64 fd121, fd114, fd117; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15360; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd116, fd114, fd120; +sub.f64 fd126, fd122, fd121; +st.shared.v2.f64 [r46+512], {fd125, fd126}; +barrier.sync 0; +and.b32 r47, r9, 7680; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+8192]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 3; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd136, fd138; +mul.f64 fd142, fd135, fd138; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 14336; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd137, fd135, fd141; +sub.f64 fd147, fd143, fd142; +st.shared.v2.f64 [r53+1024], {fd146, fd147}; +barrier.sync 0; +and.b32 r54, r9, 7168; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+8192]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 2; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd157, fd159; +mul.f64 fd163, fd156, fd159; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 12288; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd158, fd156, fd162; +sub.f64 fd168, fd164, fd163; +st.shared.v2.f64 [r60+2048], {fd167, fd168}; +barrier.sync 0; +and.b32 r61, r9, 6144; +sub.s32 r62, 
r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, fd174}, [r62+8192]; +sub.f64 fd177, fd169, fd173; +sub.f64 fd178, fd170, fd174; +bfe.u32 r63, r5, 8, 1; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd179, fd180}, [rd29]; +mul.f64 fd183, fd178, fd180; +mul.f64 fd184, fd177, fd180; +mul.f64 fd185, fd179, fd178; +and.b32 r64, r9, 4080; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 8192; +add.s32 r67, r65, r66; +add.f64 fd186, fd170, fd174; +add.f64 fd187, fd169, fd173; +st.shared.v2.f64 [r67], {fd187, fd186}; +fma.rn.f64 fd188, fd179, fd177, fd183; +sub.f64 fd189, fd185, fd184; +st.shared.v2.f64 [r67+4096], {fd188, fd189}; +barrier.sync 0; +and.b32 r68, r9, 4096; +sub.s32 r69, r67, r68; +ld.shared.v2.f64 {fd190, fd191}, [r69]; +ld.shared.v2.f64 {fd194, fd195}, [r69+8192]; +add.f64 %1, fd191, fd195; +add.f64 %0, fd190, fd194; +sub.f64 %3, fd191, fd195; +sub.f64 %2, fd190, fd194; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..fc20326019fe1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_fwd.hpp.inc @@ -0,0 +1,854 @@ +#ifndef CUFFTDX_FFT_10_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_10_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<936, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<35>; +.reg .b32 r<749>; +.reg .f64 fd<31>; +.reg .b64 rd<2>; +mov.f64 fd15, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd15; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd18, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd18; +} +mov.b32 r228, {rs2, rs2}; +mov.f64 fd19, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd19; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd20, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd20; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd15; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd18; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %24, %36; +} +{ +add.f16x2 r4, %20, r1; +} +{ +add.f16x2 r7, %28, %32; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %25, %37; +} +{ +add.f16x2 r16, %21, r13; +} +{ +add.f16x2 r19, %29, %33; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %24, %36; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %20, r28; +} +{ +add.f16x2 r34, %28, %32; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %25, %37; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %29, %33; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %24, %36; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %20, r64; +} +{ +add.f16x2 r70, %28, %32; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %25, %37; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %29, %33; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %24, %36; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %20, r100; +} +{ +add.f16x2 r106, %28, %32; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %25, %37; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, 
%29, %33; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %24, %36; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %20, r136; +} +{ +add.f16x2 r142, %28, %32; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %25, %37; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %29, %33; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %25, %37; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %21, r172; +} +{ +add.f16x2 r178, %29, %33; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %24, %36; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %28, %32; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %25, %37; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %21, r208; +} +{ +add.f16x2 r214, %29, %33; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %24, %36; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %28, %32; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %25, %37; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %21, r244; +} +{ +add.f16x2 r250, %29, %33; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %24, %36; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %28, %32; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %25, %37; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %21, r280; +} +{ +add.f16x2 r286, %29, %33; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %24, %36; +} 
+{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %28, %32; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs9, fd15; +} +mov.b32 r522, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd18; +} +mov.b32 r540, {rs10, rs10}; +{ +cvt.rn.f16.f64 rs11, fd19; +} +mov.b32 r594, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd20; +} +mov.b32 r612, {rs12, rs12}; +{ +cvt.rn.f16.f64 rs13, fd15; +} +mov.b32 r603, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd18; +} +{ +neg.f16 rs15, rs14; +} +mov.b32 r618, {rs15, rs15}; +{ +add.f16x2 r313, %26, %38; +} +{ +add.f16x2 r316, %22, r313; +} +{ +add.f16x2 r319, %30, %34; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %27, %39; +} +{ +add.f16x2 r328, %23, r325; +} +{ +add.f16x2 r331, %31, %35; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %26, %38; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %22, r340; +} +{ +add.f16x2 r346, %30, %34; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %27, %39; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %31, %35; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %26, %38; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %22, r376; +} +{ +add.f16x2 r382, %30, %34; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %27, %39; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %31, %35; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %26, %38; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %22, r412; +} +{ +add.f16x2 r418, %30, %34; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %27, %39; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %31, %35; +} +{ +mul.f16x2 r436, r433, 
r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %26, %38; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %22, r448; +} +{ +add.f16x2 r454, %30, %34; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %27, %39; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %31, %35; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %27, %39; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %23, r484; +} +{ +add.f16x2 r490, %31, %35; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %26, %38; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %30, %34; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %27, %39; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %23, r520; +} +{ +add.f16x2 r526, %31, %35; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %26, %38; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %30, %34; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %27, %39; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %23, r556; +} +{ +add.f16x2 r562, %31, %35; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %26, %38; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %30, %34; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %27, %39; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %23, r592; +} +{ +add.f16x2 r598, %31, %35; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %26, %38; +} +{ +mul.f16x2 r610, r607, r612; +} +{ 
+sub.f16x2 r613, %30, %34; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +mov.f64 fd13, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs17, fd13; +} +{ +cvt.rn.f16.f64 rs18, fd20; +} +{ +cvt.rn.f16.f64 rs19, fd15; +} +{ +cvt.rn.f16.f64 rs20, fd18; +} +mov.f64 fd17, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs21, fd17; +} +{ +cvt.rn.f16.f64 rs22, fd18; +} +{ +cvt.rn.f16.f64 rs23, fd19; +} +{ +cvt.rn.f16.f64 rs24, fd20; +} +mov.b32 r639, {rs17, rs17}; +{ +mul.f16x2 r625, r370, r639; +} +mov.b32 r636, {rs18, rs18}; +{ +mul.f16x2 r628, r514, r636; +} +{ +sub.f16x2 r631, r625, r628; +} +{ +mul.f16x2 r634, r370, r636; +} +{ +fma.rn.f16x2 r637, r514, r639, r634; +} +mov.b32 r655, {rs19, rs19}; +{ +mul.f16x2 r641, r442, r655; +} +mov.b32 r652, {rs20, rs20}; +{ +mul.f16x2 r644, r586, r652; +} +{ +sub.f16x2 r647, r641, r644; +} +{ +mul.f16x2 r650, r442, r652; +} +{ +fma.rn.f16x2 r653, r586, r655, r650; +} +mov.b32 r671, {rs21, rs21}; +{ +mul.f16x2 r657, r478, r671; +} +mov.b32 r668, {rs22, rs22}; +{ +mul.f16x2 r660, r622, r668; +} +{ +sub.f16x2 r663, r657, r660; +} +{ +mul.f16x2 r666, r478, r668; +} +{ +fma.rn.f16x2 r669, r622, r671, r666; +} +mov.b32 r687, {rs23, rs23}; +{ +mul.f16x2 r673, r406, r687; +} +mov.b32 r684, {rs24, rs24}; +{ +mul.f16x2 r676, r550, r684; +} +{ +sub.f16x2 r679, r673, r676; +} +{ +mul.f16x2 r682, r406, r684; +} +{ +fma.rn.f16x2 r685, r550, r687, r682; +} +{ +add.f16x2 %0, r10, r322; +} +{ +add.f16x2 %1, r22, r334; +} +{ +sub.f16x2 %10, r10, r322; +} +{ +sub.f16x2 %11, r22, r334; +} +{ +add.f16x2 %2, r58, r631; +} +{ +add.f16x2 %3, r202, r637; +} +{ +sub.f16x2 %12, r58, r631; +} +{ +sub.f16x2 %13, r202, r637; +} +{ +add.f16x2 %4, r130, r647; +} +{ +add.f16x2 %5, r274, r653; +} +{ +sub.f16x2 %14, r130, r647; +} +{ +sub.f16x2 %15, r274, r653; +} +{ +add.f16x2 %6, r166, r663; +} +{ +add.f16x2 %7, r310, r669; +} +{ +sub.f16x2 %16, r166, r663; +} +{ +sub.f16x2 %17, r310, r669; +} +{ +add.f16x2 %8, r94, 
r679; +} +{ +add.f16x2 %9, r238, r685; +} +{ +sub.f16x2 %18, r94, r679; +} +{ +sub.f16x2 %19, r238, r685; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..aa454d22f55ac --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp16_inv.hpp.inc @@ -0,0 +1,862 @@ +#ifndef CUFFTDX_FFT_10_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_10_FP16_INV_PTX_HPP + + + +template<> __forceinline__ 
__device__ void cufftdx_private_function<1138, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<39>; +.reg .b32 r<749>; +.reg .f64 fd<31>; +.reg .b64 rd<2>; +mov.f64 fd15, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd15; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd12, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd12; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd19, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd19; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd10, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd10; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd15; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd12; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %24, %36; +} +{ +add.f16x2 r4, %20, r1; +} +{ +add.f16x2 r7, %28, %32; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %25, %37; +} +{ +add.f16x2 r16, %21, r13; +} +{ +add.f16x2 r19, %29, %33; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %24, %36; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %20, r28; +} +{ +add.f16x2 r34, %28, %32; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %25, %37; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %29, %33; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %24, %36; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %20, r64; +} +{ +add.f16x2 r70, %28, %32; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %25, %37; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %29, %33; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %24, %36; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %20, r100; +} +{ +add.f16x2 r106, %28, %32; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ 
+sub.f16x2 r115, %25, %37; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %29, %33; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %24, %36; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %20, r136; +} +{ +add.f16x2 r142, %28, %32; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %25, %37; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %29, %33; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %25, %37; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %21, r172; +} +{ +add.f16x2 r178, %29, %33; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %24, %36; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %28, %32; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %25, %37; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %21, r208; +} +{ +add.f16x2 r214, %29, %33; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %24, %36; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %28, %32; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %25, %37; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %21, r244; +} +{ +add.f16x2 r250, %29, %33; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %24, %36; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %28, %32; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %25, %37; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %21, r280; +} +{ +add.f16x2 r286, %29, %33; +} +{ +mul.f16x2 
r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %24, %36; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %28, %32; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs11, fd15; +} +mov.b32 r522, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd12; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r540, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs15, fd19; +} +mov.b32 r594, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd10; +} +{ +neg.f16 rs17, rs16; +} +mov.b32 r612, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs19, fd15; +} +mov.b32 r603, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd12; +} +mov.b32 r618, {rs20, rs20}; +{ +add.f16x2 r313, %26, %38; +} +{ +add.f16x2 r316, %22, r313; +} +{ +add.f16x2 r319, %30, %34; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %27, %39; +} +{ +add.f16x2 r328, %23, r325; +} +{ +add.f16x2 r331, %31, %35; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %26, %38; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %22, r340; +} +{ +add.f16x2 r346, %30, %34; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %27, %39; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %31, %35; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %26, %38; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %22, r376; +} +{ +add.f16x2 r382, %30, %34; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %27, %39; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %31, %35; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %26, %38; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %22, r412; +} +{ +add.f16x2 r418, %30, %34; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ 
+sub.f16x2 r427, %27, %39; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %31, %35; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %26, %38; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %22, r448; +} +{ +add.f16x2 r454, %30, %34; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %27, %39; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %31, %35; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %27, %39; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %23, r484; +} +{ +add.f16x2 r490, %31, %35; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %26, %38; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %30, %34; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %27, %39; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %23, r520; +} +{ +add.f16x2 r526, %31, %35; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %26, %38; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %30, %34; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %27, %39; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %23, r556; +} +{ +add.f16x2 r562, %31, %35; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %26, %38; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %30, %34; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %27, %39; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %23, r592; +} +{ +add.f16x2 r598, %31, %35; +} +{ +mul.f16x2 
r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %26, %38; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %30, %34; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +mov.f64 fd13, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs21, fd13; +} +mov.f64 fd20, 0d3FE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs22, fd20; +} +{ +cvt.rn.f16.f64 rs23, fd15; +} +mov.f64 fd18, 0d3FEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs24, fd18; +} +mov.f64 fd17, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs25, fd17; +} +{ +cvt.rn.f16.f64 rs26, fd18; +} +{ +cvt.rn.f16.f64 rs27, fd19; +} +{ +cvt.rn.f16.f64 rs28, fd20; +} +mov.b32 r639, {rs21, rs21}; +{ +mul.f16x2 r625, r370, r639; +} +mov.b32 r636, {rs22, rs22}; +{ +mul.f16x2 r628, r514, r636; +} +{ +sub.f16x2 r631, r625, r628; +} +{ +mul.f16x2 r634, r370, r636; +} +{ +fma.rn.f16x2 r637, r514, r639, r634; +} +mov.b32 r655, {rs23, rs23}; +{ +mul.f16x2 r641, r442, r655; +} +mov.b32 r652, {rs24, rs24}; +{ +mul.f16x2 r644, r586, r652; +} +{ +sub.f16x2 r647, r641, r644; +} +{ +mul.f16x2 r650, r442, r652; +} +{ +fma.rn.f16x2 r653, r586, r655, r650; +} +mov.b32 r671, {rs25, rs25}; +{ +mul.f16x2 r657, r478, r671; +} +mov.b32 r668, {rs26, rs26}; +{ +mul.f16x2 r660, r622, r668; +} +{ +sub.f16x2 r663, r657, r660; +} +{ +mul.f16x2 r666, r478, r668; +} +{ +fma.rn.f16x2 r669, r622, r671, r666; +} +mov.b32 r687, {rs27, rs27}; +{ +mul.f16x2 r673, r406, r687; +} +mov.b32 r684, {rs28, rs28}; +{ +mul.f16x2 r676, r550, r684; +} +{ +sub.f16x2 r679, r673, r676; +} +{ +mul.f16x2 r682, r406, r684; +} +{ +fma.rn.f16x2 r685, r550, r687, r682; +} +{ +add.f16x2 %0, r10, r322; +} +{ +add.f16x2 %1, r22, r334; +} +{ +sub.f16x2 %10, r10, r322; +} +{ +sub.f16x2 %11, r22, r334; +} +{ +add.f16x2 %2, r58, r631; +} +{ +add.f16x2 %3, r202, r637; +} +{ +sub.f16x2 %12, r58, r631; +} +{ +sub.f16x2 %13, r202, r637; +} +{ +add.f16x2 %4, r130, r647; +} +{ +add.f16x2 %5, r274, r653; +} +{ +sub.f16x2 %14, r130, 
r647; +} +{ +sub.f16x2 %15, r274, r653; +} +{ +add.f16x2 %6, r166, r663; +} +{ +add.f16x2 %7, r310, r669; +} +{ +sub.f16x2 %16, r166, r663; +} +{ +sub.f16x2 %17, r310, r669; +} +{ +add.f16x2 %8, r94, r679; +} +{ +add.f16x2 %9, r238, r685; +} +{ +sub.f16x2 %18, r94, r679; +} +{ +sub.f16x2 %19, r238, r685; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..04499063a4c1a --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_fwd.hpp.inc @@ -0,0 +1,144 @@ +#ifndef CUFFTDX_FFT_10_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_10_FP32_FWD_PTX_HPP + + + +/* Specialization <190, float, 1>: a single inline-PTX block reads the ten complex values rmem[0..9] through inputs %20-%45 (several .y components are bound to two operands) and writes all twenty .x/.y components back through outputs %0-%19; every output is written only after the last input has been read. smem is not referenced here. NOTE(review): per the include guard this is presumably the size-10 fp32 forward kernel - confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<190, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<169>; +.reg .b64 rd<2>; +add.f32 f41, %25, %41; +add.f32 f42, %20, f41; +add.f32 f43, %30, %36; +add.f32 f44, f43, f42; +add.f32 f45, %27, %43; +add.f32 f46, %21, f45; +add.f32 f47, %32, %37; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %20; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %27, %43; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %32, %37; +mul.f32 f55, f54, 0fBF167918; +sub.f32 f56, f55, f53; +sub.f32 f57, f51, f56; +add.f32 f58, f56, f51; +mul.f32 f59, f41, 0f3F4F1BBD; +sub.f32 f60, %20, f59; +fma.rn.f32 f61, f43, 0f3E9E377A, f60; +mul.f32 f62, f52, 0f3F167918; +mul.f32 f63, f54, 0f3F737871; +sub.f32 f64, f63, f62; +sub.f32 f65, f61, f64; +add.f32 f66, f64, f61; +fma.rn.f32 f67, f45, 0f3E9E377A, %21; +mul.f32 f68, f47, 0f3F4F1BBD; +sub.f32 f69, f67, f68; +sub.f32 f70, %25, %41; +mul.f32 f71, f70, 0f3F737871; +sub.f32 f72, %30, %36; +mul.f32 f73, f72, 0fBF167918; +sub.f32 f74, f73, f71; +add.f32 f75, f74, f69; +sub.f32 f76, f69, f74; +mul.f32 f77, f45, 0f3F4F1BBD; +sub.f32 f78, %21, f77; +fma.rn.f32 f79, f47, 0f3E9E377A, f78; +mul.f32 f80, f70, 0f3F167918; +mul.f32 f81, f72, 0f3F737871; +sub.f32 f82, f81, f80; +add.f32 f83, f82, f79; +sub.f32 f84, f79, f82; +add.f32 f85, %28, %44; +add.f32 f86, %22, f85; +add.f32 f87, %33, %38; +add.f32 f88, f87, f86; +add.f32 f89, %29, %45; +add.f32 f90, %24, f89; +add.f32 f91, %35, %40; +add.f32 f92, f91, f90; +fma.rn.f32 f93, f85, 0f3E9E377A, %22; +mul.f32 f94, f87, 0f3F4F1BBD; +sub.f32 f95, f93, f94; +sub.f32 f96, %29, %45; +mul.f32 f97, f96, 0f3F737871; +sub.f32 f98, %35, %40; +mul.f32
f99, f98, 0fBF167918; +sub.f32 f100, f99, f97; +sub.f32 f101, f95, f100; +add.f32 f102, f100, f95; +mul.f32 f103, f85, 0f3F4F1BBD; +sub.f32 f104, %22, f103; +fma.rn.f32 f105, f87, 0f3E9E377A, f104; +mul.f32 f106, f96, 0f3F167918; +mul.f32 f107, f98, 0f3F737871; +sub.f32 f108, f107, f106; +sub.f32 f109, f105, f108; +add.f32 f110, f108, f105; +fma.rn.f32 f111, f89, 0f3E9E377A, %24; +mul.f32 f112, f91, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %28, %44; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %33, %38; +mul.f32 f117, f116, 0fBF167918; +sub.f32 f118, f117, f115; +add.f32 f119, f118, f113; +sub.f32 f120, f113, f118; +mul.f32 f121, f89, 0f3F4F1BBD; +sub.f32 f122, %24, f121; +fma.rn.f32 f123, f91, 0f3E9E377A, f122; +mul.f32 f124, f114, 0f3F167918; +mul.f32 f125, f116, 0f3F737871; +sub.f32 f126, f125, f124; +add.f32 f127, f126, f123; +sub.f32 f128, f123, f126; +mul.f32 f129, f101, 0f3F4F1BBD; +mul.f32 f130, f119, 0fBF167918; +sub.f32 f131, f129, f130; +mul.f32 f132, f119, 0f3F4F1BBD; +fma.rn.f32 f133, f101, 0fBF167918, f132; +mul.f32 f134, f109, 0f3E9E377A; +mul.f32 f135, f127, 0fBF737871; +sub.f32 f136, f134, f135; +mul.f32 f137, f127, 0f3E9E377A; +fma.rn.f32 f138, f109, 0fBF737871, f137; +mul.f32 f139, f110, 0fBE9E377A; +mul.f32 f140, f128, 0fBF737871; +sub.f32 f141, f139, f140; +mul.f32 f142, f128, 0fBE9E377A; +fma.rn.f32 f143, f110, 0fBF737871, f142; +mul.f32 f144, f102, 0fBF4F1BBD; +mul.f32 f145, f120, 0fBF167918; +sub.f32 f146, f144, f145; +mul.f32 f147, f120, 0fBF4F1BBD; +fma.rn.f32 f148, f102, 0fBF167918, f147; +add.f32 %1, f48, f92; +add.f32 %0, f44, f88; +add.f32 %3, f75, f133; +add.f32 %2, f57, f131; +add.f32 %5, f83, f138; +add.f32 %4, f65, f136; +add.f32 %7, f84, f143; +add.f32 %6, f66, f141; +add.f32 %9, f76, f148; +add.f32 %8, f58, f146; +sub.f32 %11, f48, f92; +sub.f32 %10, f44, f88; +sub.f32 %13, f75, f133; +sub.f32 %12, f57, f131; +sub.f32 %15, f83, f138; +sub.f32 %14, f65, f136; +sub.f32 %17, f84, f143; +sub.f32 %16, f66, f141;
+sub.f32 %19, f76, f148; +sub.f32 %18, f58, f146; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..632afe80144cc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp32_inv.hpp.inc @@ -0,0 +1,140 @@ +#ifndef CUFFTDX_FFT_10_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_10_FP32_INV_PTX_HPP + + + +/* Specialization <392, float, 1>: a single inline-PTX block reads the ten complex values rmem[0..9] through inputs %20-%45 (several .y components are bound to two operands) and writes all twenty .x/.y components back through outputs %0-%19; every output is written only after the last input has been read. smem is not referenced here. NOTE(review): per the include guard this is presumably the size-10 fp32 inverse kernel - confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<392, float, 1>(cufftdx::detail::complex<float> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<165>; +.reg .b64 rd<2>; +add.f32 f41, %25, %41; +add.f32 f42, %20, f41; +add.f32 f43, %30, %36; +add.f32 f44, f43, f42; +add.f32 f45, %27, %43; +add.f32 f46, %21, f45; +add.f32 f47, %32, %37; +add.f32 f48, f47, f46; +fma.rn.f32 f49, f41, 0f3E9E377A, %20; +mul.f32 f50, f43, 0f3F4F1BBD; +sub.f32 f51, f49, f50; +sub.f32 f52, %27, %43; +mul.f32 f53, f52, 0f3F737871; +sub.f32 f54, %32, %37; +fma.rn.f32 f55, f54, 0f3F167918, f53;
+sub.f32 f56, f51, f55; +add.f32 f57, f55, f51; +mul.f32 f58, f41, 0f3F4F1BBD; +sub.f32 f59, %20, f58; +fma.rn.f32 f60, f43, 0f3E9E377A, f59; +mul.f32 f61, f52, 0f3F167918; +mul.f32 f62, f54, 0f3F737871; +sub.f32 f63, f61, f62; +sub.f32 f64, f60, f63; +add.f32 f65, f63, f60; +fma.rn.f32 f66, f45, 0f3E9E377A, %21; +mul.f32 f67, f47, 0f3F4F1BBD; +sub.f32 f68, f66, f67; +sub.f32 f69, %25, %41; +mul.f32 f70, f69, 0f3F737871; +sub.f32 f71, %30, %36; +fma.rn.f32 f72, f71, 0f3F167918, f70; +add.f32 f73, f72, f68; +sub.f32 f74, f68, f72; +mul.f32 f75, f45, 0f3F4F1BBD; +sub.f32 f76, %21, f75; +fma.rn.f32 f77, f47, 0f3E9E377A, f76; +mul.f32 f78, f69, 0f3F167918; +mul.f32 f79, f71, 0f3F737871; +sub.f32 f80, f78, f79; +add.f32 f81, f80, f77; +sub.f32 f82, f77, f80; +add.f32 f83, %28, %44; +add.f32 f84, %22, f83; +add.f32 f85, %33, %38; +add.f32 f86, f85, f84; +add.f32 f87, %29, %45; +add.f32 f88, %24, f87; +add.f32 f89, %35, %40; +add.f32 f90, f89, f88; +fma.rn.f32 f91, f83, 0f3E9E377A, %22; +mul.f32 f92, f85, 0f3F4F1BBD; +sub.f32 f93, f91, f92; +sub.f32 f94, %29, %45; +mul.f32 f95, f94, 0f3F737871; +sub.f32 f96, %35, %40; +fma.rn.f32 f97, f96, 0f3F167918, f95; +sub.f32 f98, f93, f97; +add.f32 f99, f97, f93; +mul.f32 f100, f83, 0f3F4F1BBD; +sub.f32 f101, %22, f100; +fma.rn.f32 f102, f85, 0f3E9E377A, f101; +mul.f32 f103, f94, 0f3F167918; +mul.f32 f104, f96, 0f3F737871; +sub.f32 f105, f103, f104; +sub.f32 f106, f102, f105; +add.f32 f107, f105, f102; +fma.rn.f32 f108, f87, 0f3E9E377A, %24; +mul.f32 f109, f89, 0f3F4F1BBD; +sub.f32 f110, f108, f109; +sub.f32 f111, %28, %44; +mul.f32 f112, f111, 0f3F737871; +sub.f32 f113, %33, %38; +fma.rn.f32 f114, f113, 0f3F167918, f112; +add.f32 f115, f114, f110; +sub.f32 f116, f110, f114; +mul.f32 f117, f87, 0f3F4F1BBD; +sub.f32 f118, %24, f117; +fma.rn.f32 f119, f89, 0f3E9E377A, f118; +mul.f32 f120, f111, 0f3F167918; +mul.f32 f121, f113, 0f3F737871; +sub.f32 f122, f120, f121; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +mul.f32 f125,
f98, 0f3F4F1BBD; +mul.f32 f126, f115, 0f3F167918; +sub.f32 f127, f125, f126; +mul.f32 f128, f115, 0f3F4F1BBD; +fma.rn.f32 f129, f98, 0f3F167918, f128; +mul.f32 f130, f106, 0f3E9E377A; +mul.f32 f131, f123, 0f3F737871; +sub.f32 f132, f130, f131; +mul.f32 f133, f123, 0f3E9E377A; +fma.rn.f32 f134, f106, 0f3F737871, f133; +mul.f32 f135, f107, 0fBE9E377A; +mul.f32 f136, f124, 0f3F737871; +sub.f32 f137, f135, f136; +mul.f32 f138, f124, 0fBE9E377A; +fma.rn.f32 f139, f107, 0f3F737871, f138; +mul.f32 f140, f99, 0fBF4F1BBD; +mul.f32 f141, f116, 0f3F167918; +sub.f32 f142, f140, f141; +mul.f32 f143, f116, 0fBF4F1BBD; +fma.rn.f32 f144, f99, 0f3F167918, f143; +add.f32 %1, f48, f90; +add.f32 %0, f44, f86; +add.f32 %3, f73, f129; +add.f32 %2, f56, f127; +add.f32 %5, f81, f134; +add.f32 %4, f64, f132; +add.f32 %7, f82, f139; +add.f32 %6, f65, f137; +add.f32 %9, f74, f144; +add.f32 %8, f57, f142; +sub.f32 %11, f48, f90; +sub.f32 %10, f44, f86; +sub.f32 %13, f73, f129; +sub.f32 %12, f56, f127; +sub.f32 %15, f81, f134; +sub.f32 %14, f64, f132; +sub.f32 %17, f82, f139; +sub.f32 %16, f65, f137; +sub.f32 %19, f74, f144; +sub.f32 %18, f57, f142; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y)); +}; + + +#endif diff --git
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..3f8aeb6459c4f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_fwd.hpp.inc @@ -0,0 +1,144 @@ +#ifndef CUFFTDX_FFT_10_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_10_FP64_FWD_PTX_HPP + + + +/* Specialization <565, double, 1>: a single inline-PTX block reads the ten complex values rmem[0..9] through inputs %20-%45 (several .y components are bound to two operands) and writes all twenty .x/.y components back through outputs %0-%19; every output is written only after the last input has been read. smem is not referenced here. NOTE(review): per the include guard this is presumably the size-10 fp64 forward kernel - confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<565, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<169>; +.reg .b64 rd<2>; +add.f64 fd41, %25, %41; +add.f64 fd42, %20, fd41; +add.f64 fd43, %30, %36; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %27, %43; +add.f64 fd46, %21, fd45; +add.f64 fd47, %32, %37; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %20; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %27, %43; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %32, %37; +mul.f64 fd55, fd54, 0dBFE2CF2304755A5E; +sub.f64 fd56, fd55, fd53; +sub.f64 fd57, fd51, fd56; +add.f64 fd58, fd56, fd51; +mul.f64 fd59, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd60, %20, fd59; +fma.rn.f64 fd61, fd43, 0d3FD3C6EF372FE950, fd60; +mul.f64 fd62, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd63, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd64, fd63, fd62; +sub.f64 fd65, fd61, fd64; +add.f64 fd66, fd64, fd61; +fma.rn.f64 fd67, fd45, 0d3FD3C6EF372FE950, %21; +mul.f64 fd68, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd69, fd67, fd68; +sub.f64 fd70, %25, %41; +mul.f64 fd71, fd70, 0d3FEE6F0E134454FF; +sub.f64 fd72, %30, %36; +mul.f64 fd73, fd72, 0dBFE2CF2304755A5E; +sub.f64 fd74, fd73, fd71; +add.f64 fd75, fd74, fd69; +sub.f64 fd76, fd69, fd74; +mul.f64 fd77, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd78, %21, fd77; +fma.rn.f64 fd79, fd47,
0d3FD3C6EF372FE950, fd78; +mul.f64 fd80, fd70, 0d3FE2CF2304755A5E; +mul.f64 fd81, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd82, fd81, fd80; +add.f64 fd83, fd82, fd79; +sub.f64 fd84, fd79, fd82; +add.f64 fd85, %28, %44; +add.f64 fd86, %22, fd85; +add.f64 fd87, %33, %38; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %29, %45; +add.f64 fd90, %24, fd89; +add.f64 fd91, %35, %40; +add.f64 fd92, fd91, fd90; +fma.rn.f64 fd93, fd85, 0d3FD3C6EF372FE950, %22; +mul.f64 fd94, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd95, fd93, fd94; +sub.f64 fd96, %29, %45; +mul.f64 fd97, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd98, %35, %40; +mul.f64 fd99, fd98, 0dBFE2CF2304755A5E; +sub.f64 fd100, fd99, fd97; +sub.f64 fd101, fd95, fd100; +add.f64 fd102, fd100, fd95; +mul.f64 fd103, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd104, %22, fd103; +fma.rn.f64 fd105, fd87, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd106, fd96, 0d3FE2CF2304755A5E; +mul.f64 fd107, fd98, 0d3FEE6F0E134454FF; +sub.f64 fd108, fd107, fd106; +sub.f64 fd109, fd105, fd108; +add.f64 fd110, fd108, fd105; +fma.rn.f64 fd111, fd89, 0d3FD3C6EF372FE950, %24; +mul.f64 fd112, fd91, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %28, %44; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %33, %38; +mul.f64 fd117, fd116, 0dBFE2CF2304755A5E; +sub.f64 fd118, fd117, fd115; +add.f64 fd119, fd118, fd113; +sub.f64 fd120, fd113, fd118; +mul.f64 fd121, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd122, %24, fd121; +fma.rn.f64 fd123, fd91, 0d3FD3C6EF372FE950, fd122; +mul.f64 fd124, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd125, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd125, fd124; +add.f64 fd127, fd126, fd123; +sub.f64 fd128, fd123, fd126; +mul.f64 fd129, fd101, 0d3FE9E3779B97F4A8; +mul.f64 fd130, fd119, 0dBFE2CF2304755A5E; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd119, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd133, fd101, 0dBFE2CF2304755A5E, fd132; +mul.f64 fd134, fd109, 0d3FD3C6EF372FE950; +mul.f64 fd135, fd127, 0dBFEE6F0E134454FF; +sub.f64 fd136, fd134,
fd135; +mul.f64 fd137, fd127, 0d3FD3C6EF372FE950; +fma.rn.f64 fd138, fd109, 0dBFEE6F0E134454FF, fd137; +mul.f64 fd139, fd110, 0dBFD3C6EF372FE950; +mul.f64 fd140, fd128, 0dBFEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +mul.f64 fd142, fd128, 0dBFD3C6EF372FE950; +fma.rn.f64 fd143, fd110, 0dBFEE6F0E134454FF, fd142; +mul.f64 fd144, fd102, 0dBFE9E3779B97F4A8; +mul.f64 fd145, fd120, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd144, fd145; +mul.f64 fd147, fd120, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd148, fd102, 0dBFE2CF2304755A5E, fd147; +add.f64 %1, fd48, fd92; +add.f64 %0, fd44, fd88; +add.f64 %3, fd75, fd133; +add.f64 %2, fd57, fd131; +add.f64 %5, fd83, fd138; +add.f64 %4, fd65, fd136; +add.f64 %7, fd84, fd143; +add.f64 %6, fd66, fd141; +add.f64 %9, fd76, fd148; +add.f64 %8, fd58, fd146; +sub.f64 %11, fd48, fd92; +sub.f64 %10, fd44, fd88; +sub.f64 %13, fd75, fd133; +sub.f64 %12, fd57, fd131; +sub.f64 %15, fd83, fd138; +sub.f64 %14, fd65, fd136; +sub.f64 %17, fd84, fd143; +sub.f64 %16, fd66, fd141; +sub.f64 %19, fd76, fd148; +sub.f64 %18, fd58, fd146; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_inv.hpp.inc
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e4188c1b685e1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_10_fp64_inv.hpp.inc @@ -0,0 +1,140 @@ +#ifndef CUFFTDX_FFT_10_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_10_FP64_INV_PTX_HPP + + + +/* Specialization <736, double, 1>: a single inline-PTX block reads the ten complex values rmem[0..9] through inputs %20-%45 (several .y components are bound to two operands) and writes all twenty .x/.y components back through outputs %0-%19; every output is written only after the last input has been read. smem is not referenced here. NOTE(review): per the include guard this is presumably the size-10 fp64 inverse kernel - confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<736, double, 1>(cufftdx::detail::complex<double> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<165>; +.reg .b64 rd<2>; +add.f64 fd41, %25, %41; +add.f64 fd42, %20, fd41; +add.f64 fd43, %30, %36; +add.f64 fd44, fd43, fd42; +add.f64 fd45, %27, %43; +add.f64 fd46, %21, fd45; +add.f64 fd47, %32, %37; +add.f64 fd48, fd47, fd46; +fma.rn.f64 fd49, fd41, 0d3FD3C6EF372FE950, %20; +mul.f64 fd50, fd43, 0d3FE9E3779B97F4A8; +sub.f64 fd51, fd49, fd50; +sub.f64 fd52, %27, %43; +mul.f64 fd53, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd54, %32, %37; +fma.rn.f64 fd55, fd54, 0d3FE2CF2304755A5E, fd53; +sub.f64 fd56, fd51, fd55; +add.f64 fd57, fd55, fd51; +mul.f64 fd58, fd41, 0d3FE9E3779B97F4A8; +sub.f64 fd59, %20, fd58; +fma.rn.f64 fd60, fd43, 0d3FD3C6EF372FE950, fd59; +mul.f64 fd61, fd52, 0d3FE2CF2304755A5E; +mul.f64 fd62, fd54, 0d3FEE6F0E134454FF; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd60, fd63; +add.f64 fd65, fd63, fd60; +fma.rn.f64 fd66, fd45, 0d3FD3C6EF372FE950, %21; +mul.f64 fd67, fd47, 0d3FE9E3779B97F4A8; +sub.f64 fd68, fd66, fd67; +sub.f64 fd69, %25, %41; +mul.f64 fd70, fd69, 0d3FEE6F0E134454FF; +sub.f64 fd71, %30, %36; +fma.rn.f64 fd72, fd71, 0d3FE2CF2304755A5E, fd70; +add.f64 fd73, fd72, fd68; +sub.f64 fd74, fd68, fd72; +mul.f64 fd75, fd45, 0d3FE9E3779B97F4A8; +sub.f64 fd76, %21, fd75; +fma.rn.f64 fd77, fd47, 0d3FD3C6EF372FE950, fd76; +mul.f64 fd78, fd69, 0d3FE2CF2304755A5E; +mul.f64 fd79, fd71, 0d3FEE6F0E134454FF; +sub.f64 fd80, fd78, fd79; +add.f64 fd81, fd80, fd77; +sub.f64
fd82, fd77, fd80; +add.f64 fd83, %28, %44; +add.f64 fd84, %22, fd83; +add.f64 fd85, %33, %38; +add.f64 fd86, fd85, fd84; +add.f64 fd87, %29, %45; +add.f64 fd88, %24, fd87; +add.f64 fd89, %35, %40; +add.f64 fd90, fd89, fd88; +fma.rn.f64 fd91, fd83, 0d3FD3C6EF372FE950, %22; +mul.f64 fd92, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd93, fd91, fd92; +sub.f64 fd94, %29, %45; +mul.f64 fd95, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd96, %35, %40; +fma.rn.f64 fd97, fd96, 0d3FE2CF2304755A5E, fd95; +sub.f64 fd98, fd93, fd97; +add.f64 fd99, fd97, fd93; +mul.f64 fd100, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd101, %22, fd100; +fma.rn.f64 fd102, fd85, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd103, fd94, 0d3FE2CF2304755A5E; +mul.f64 fd104, fd96, 0d3FEE6F0E134454FF; +sub.f64 fd105, fd103, fd104; +sub.f64 fd106, fd102, fd105; +add.f64 fd107, fd105, fd102; +fma.rn.f64 fd108, fd87, 0d3FD3C6EF372FE950, %24; +mul.f64 fd109, fd89, 0d3FE9E3779B97F4A8; +sub.f64 fd110, fd108, fd109; +sub.f64 fd111, %28, %44; +mul.f64 fd112, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd113, %33, %38; +fma.rn.f64 fd114, fd113, 0d3FE2CF2304755A5E, fd112; +add.f64 fd115, fd114, fd110; +sub.f64 fd116, fd110, fd114; +mul.f64 fd117, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %24, fd117; +fma.rn.f64 fd119, fd89, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd111, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd113, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd120, fd121; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +mul.f64 fd125, fd98, 0d3FE9E3779B97F4A8; +mul.f64 fd126, fd115, 0d3FE2CF2304755A5E; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd115, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd129, fd98, 0d3FE2CF2304755A5E, fd128; +mul.f64 fd130, fd106, 0d3FD3C6EF372FE950; +mul.f64 fd131, fd123, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd123, 0d3FD3C6EF372FE950; +fma.rn.f64 fd134, fd106, 0d3FEE6F0E134454FF, fd133; +mul.f64 fd135, fd107, 0dBFD3C6EF372FE950; +mul.f64 fd136, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd137, fd135, fd136;
+mul.f64 fd138, fd124, 0dBFD3C6EF372FE950; +fma.rn.f64 fd139, fd107, 0d3FEE6F0E134454FF, fd138; +mul.f64 fd140, fd99, 0dBFE9E3779B97F4A8; +mul.f64 fd141, fd116, 0d3FE2CF2304755A5E; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd116, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd144, fd99, 0d3FE2CF2304755A5E, fd143; +add.f64 %1, fd48, fd90; +add.f64 %0, fd44, fd86; +add.f64 %3, fd73, fd129; +add.f64 %2, fd56, fd127; +add.f64 %5, fd81, fd134; +add.f64 %4, fd64, fd132; +add.f64 %7, fd82, fd139; +add.f64 %6, fd65, fd137; +add.f64 %9, fd74, fd144; +add.f64 %8, fd57, fd142; +sub.f64 %11, fd48, fd90; +sub.f64 %10, fd44, fd86; +sub.f64 %13, fd73, fd129; +sub.f64 %12, fd56, fd127; +sub.f64 %15, fd81, fd134; +sub.f64 %14, fd64, fd132; +sub.f64 %17, fd82, fd139; +sub.f64 %16, fd65, fd137; +sub.f64 %19, fd74, fd144; +sub.f64 %18, fd57, fd142; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..40d3ef6249413 --- /dev/null +++
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_fwd.hpp.inc @@ -0,0 +1,1203 @@ +#ifndef CUFFTDX_FFT_11_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_11_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<924, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<111>; +.reg .b32 r<761>; +.reg .f64 fd<101>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %24, %42; +} +{ +add.f16x2 r4, %25, %43; +} +{ +sub.f16x2 r7, %24, %42; +} +{ +sub.f16x2 r10, %25, %43; +} +{ +add.f16x2 r13, %26, %40; +} +{ +add.f16x2 r16, %27, %41; +} +{ +sub.f16x2 r19, %26, %40; +} +{ +sub.f16x2 r22, %27, %41; +} +{ +add.f16x2 r25, %28, %38; +} +{ +add.f16x2 r28, %29, %39; +} +{ +sub.f16x2 r31, %28, %38; +} +{ +sub.f16x2 r34, %29, %39; +} +{ +add.f16x2 r37, %30, %36; +} +{ +add.f16x2 r40, %31, %37; +} +{ +sub.f16x2 r43, %30, %36; +} +{ +sub.f16x2 r46, %31, %37; +} +{ +add.f16x2 r49, %32, %34; +} +{ +add.f16x2 r52, %33, %35; +} +{ +sub.f16x2 r55, %32, %34; +} +{ +sub.f16x2 r58, %33, %35; +} +{ +add.f16x2 r61, %22, r1; +} +{ +add.f16x2 r64, %23, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 %0, r79, r49; +} +{ +add.f16x2 %1, r82, r52; +} +mov.u32 r628, 0; +cvt.rn.f16.s32 rs1, r628; +mov.b32 r103, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r628; +mov.b32 r115, {rs2, rs2}; +mov.f64 fd87, 0d3FEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs3, fd87; +} +mov.b32 r95, {rs3, rs3}; +{ +mul.f16x2 r93, r1, r95; +} +{ +add.f16x2 r96, %22, r93; +} +mov.f64 fd72, 0dBFE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs4, fd72; +} +mov.b32 r101, {rs4, rs4}; +{ +mul.f16x2 r99, r10, r101; +} +{ +add.f16x2 r102, r103, r99; +} +{ +cvt.rn.f16.f64 rs5, fd87; +} +mov.b32 r107, {rs5, rs5}; +{ +mul.f16x2 r105, r4, r107; +} +{ +add.f16x2 r108, %23, r105; 
+} +{ +cvt.rn.f16.f64 rs6, fd72; +} +mov.b32 r113, {rs6, rs6}; +{ +mul.f16x2 r111, r7, r113; +} +{ +add.f16x2 r114, r115, r111; +} +mov.f64 fd95, 0d3FDA9628D9C712B6; +{ +cvt.rn.f16.f64 rs7, fd95; +} +mov.b32 r119, {rs7, rs7}; +{ +mul.f16x2 r117, r13, r119; +} +{ +add.f16x2 r120, r96, r117; +} +mov.f64 fd24, 0dBFED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs8, fd24; +} +mov.b32 r125, {rs8, rs8}; +{ +mul.f16x2 r123, r22, r125; +} +{ +add.f16x2 r126, r102, r123; +} +{ +cvt.rn.f16.f64 rs9, fd95; +} +mov.b32 r131, {rs9, rs9}; +{ +mul.f16x2 r129, r16, r131; +} +{ +add.f16x2 r132, r108, r129; +} +{ +cvt.rn.f16.f64 rs10, fd24; +} +mov.b32 r137, {rs10, rs10}; +{ +mul.f16x2 r135, r19, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd99, 0dBFC2375F640F44DB; +{ +cvt.rn.f16.f64 rs11, fd99; +} +mov.b32 r143, {rs11, rs11}; +{ +mul.f16x2 r141, r25, r143; +} +{ +add.f16x2 r144, r120, r141; +} +mov.f64 fd100, 0dBFEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs12, fd100; +} +mov.b32 r149, {rs12, rs12}; +{ +mul.f16x2 r147, r34, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs13, fd99; +} +mov.b32 r155, {rs13, rs13}; +{ +mul.f16x2 r153, r28, r155; +} +{ +add.f16x2 r156, r132, r153; +} +{ +cvt.rn.f16.f64 rs14, fd100; +} +mov.b32 r161, {rs14, rs14}; +{ +mul.f16x2 r159, r31, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd91, 0dBFE4F49E7F775887; +{ +cvt.rn.f16.f64 rs15, fd91; +} +mov.b32 r167, {rs15, rs15}; +{ +mul.f16x2 r165, r37, r167; +} +{ +add.f16x2 r168, r144, r165; +} +mov.f64 fd92, 0dBFE82F19BB3A28A1; +{ +cvt.rn.f16.f64 rs16, fd92; +} +mov.b32 r173, {rs16, rs16}; +{ +mul.f16x2 r171, r46, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs17, fd91; +} +mov.b32 r179, {rs17, rs17}; +{ +mul.f16x2 r177, r40, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ +cvt.rn.f16.f64 rs18, fd92; +} +mov.b32 r185, {rs18, rs18}; +{ +mul.f16x2 r183, r43, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd83, 0dBFEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs19, fd83; +} +mov.b32 
r191, {rs19, rs19}; +{ +mul.f16x2 r189, r49, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd84, 0dBFD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs20, fd84; +} +mov.b32 r197, {rs20, rs20}; +{ +mul.f16x2 r195, r58, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs21, fd83; +} +mov.b32 r203, {rs21, rs21}; +{ +mul.f16x2 r201, r52, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs22, fd84; +} +mov.b32 r209, {rs22, rs22}; +{ +mul.f16x2 r207, r55, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +sub.f16x2 %2, r192, r198; +} +{ +add.f16x2 %3, r204, r210; +} +{ +add.f16x2 %20, r192, r198; +} +{ +sub.f16x2 %21, r204, r210; +} +cvt.rn.f16.s32 rs23, r628; +mov.b32 r237, {rs23, rs23}; +cvt.rn.f16.s32 rs24, r628; +mov.b32 r249, {rs24, rs24}; +{ +cvt.rn.f16.f64 rs25, fd95; +} +mov.b32 r229, {rs25, rs25}; +{ +mul.f16x2 r227, r1, r229; +} +{ +add.f16x2 r230, %22, r227; +} +{ +cvt.rn.f16.f64 rs26, fd24; +} +mov.b32 r235, {rs26, rs26}; +{ +mul.f16x2 r233, r10, r235; +} +{ +add.f16x2 r236, r237, r233; +} +{ +cvt.rn.f16.f64 rs27, fd95; +} +mov.b32 r241, {rs27, rs27}; +{ +mul.f16x2 r239, r4, r241; +} +{ +add.f16x2 r242, %23, r239; +} +{ +cvt.rn.f16.f64 rs28, fd24; +} +mov.b32 r247, {rs28, rs28}; +{ +mul.f16x2 r245, r7, r247; +} +{ +add.f16x2 r248, r249, r245; +} +{ +cvt.rn.f16.f64 rs29, fd91; +} +mov.b32 r253, {rs29, rs29}; +{ +mul.f16x2 r251, r13, r253; +} +{ +add.f16x2 r254, r230, r251; +} +{ +cvt.rn.f16.f64 rs30, fd92; +} +mov.b32 r259, {rs30, rs30}; +{ +mul.f16x2 r257, r22, r259; +} +{ +add.f16x2 r260, r236, r257; +} +{ +cvt.rn.f16.f64 rs31, fd91; +} +mov.b32 r265, {rs31, rs31}; +{ +mul.f16x2 r263, r16, r265; +} +{ +add.f16x2 r266, r242, r263; +} +{ +cvt.rn.f16.f64 rs32, fd92; +} +mov.b32 r271, {rs32, rs32}; +{ +mul.f16x2 r269, r19, r271; +} +{ +add.f16x2 r272, r248, r269; +} +{ +cvt.rn.f16.f64 rs33, fd83; +} +mov.b32 r277, {rs33, rs33}; +{ +mul.f16x2 r275, r25, r277; +} +{ +add.f16x2 r278, r254, r275; +} +mov.f64 fd48, 0d3FD207E7FD768DBF; +{ 
+cvt.rn.f16.f64 rs34, fd48; +} +mov.b32 r283, {rs34, rs34}; +{ +mul.f16x2 r281, r34, r283; +} +{ +add.f16x2 r284, r260, r281; +} +{ +cvt.rn.f16.f64 rs35, fd83; +} +mov.b32 r289, {rs35, rs35}; +{ +mul.f16x2 r287, r28, r289; +} +{ +add.f16x2 r290, r266, r287; +} +{ +cvt.rn.f16.f64 rs36, fd48; +} +mov.b32 r295, {rs36, rs36}; +{ +mul.f16x2 r293, r31, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs37, fd99; +} +mov.b32 r301, {rs37, rs37}; +{ +mul.f16x2 r299, r37, r301; +} +{ +add.f16x2 r302, r278, r299; +} +mov.f64 fd68, 0d3FEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs38, fd68; +} +mov.b32 r307, {rs38, rs38}; +{ +mul.f16x2 r305, r46, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs39, fd99; +} +mov.b32 r313, {rs39, rs39}; +{ +mul.f16x2 r311, r40, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs40, fd68; +} +mov.b32 r319, {rs40, rs40}; +{ +mul.f16x2 r317, r43, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs41, fd87; +} +mov.b32 r325, {rs41, rs41}; +{ +mul.f16x2 r323, r49, r325; +} +{ +add.f16x2 r326, r302, r323; +} +mov.f64 fd88, 0d3FE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs42, fd88; +} +mov.b32 r331, {rs42, rs42}; +{ +mul.f16x2 r329, r58, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs43, fd87; +} +mov.b32 r337, {rs43, rs43}; +{ +mul.f16x2 r335, r52, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs44, fd88; +} +mov.b32 r343, {rs44, rs44}; +{ +mul.f16x2 r341, r55, r343; +} +{ +add.f16x2 r344, r320, r341; +} +{ +sub.f16x2 %4, r326, r332; +} +{ +add.f16x2 %5, r338, r344; +} +{ +add.f16x2 %18, r326, r332; +} +{ +sub.f16x2 %19, r338, r344; +} +cvt.rn.f16.s32 rs45, r628; +mov.b32 r371, {rs45, rs45}; +cvt.rn.f16.s32 rs46, r628; +mov.b32 r383, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd99; +} +mov.b32 r363, {rs47, rs47}; +{ +mul.f16x2 r361, r1, r363; +} +{ +add.f16x2 r364, %22, r361; +} +{ +cvt.rn.f16.f64 rs48, fd100; +} +mov.b32 r369, {rs48, rs48}; +{ +mul.f16x2 r367, r10, r369; +} +{ 
+add.f16x2 r370, r371, r367; +} +{ +cvt.rn.f16.f64 rs49, fd99; +} +mov.b32 r375, {rs49, rs49}; +{ +mul.f16x2 r373, r4, r375; +} +{ +add.f16x2 r376, %23, r373; +} +{ +cvt.rn.f16.f64 rs50, fd100; +} +mov.b32 r381, {rs50, rs50}; +{ +mul.f16x2 r379, r7, r381; +} +{ +add.f16x2 r382, r383, r379; +} +{ +cvt.rn.f16.f64 rs51, fd83; +} +mov.b32 r387, {rs51, rs51}; +{ +mul.f16x2 r385, r13, r387; +} +{ +add.f16x2 r388, r364, r385; +} +{ +cvt.rn.f16.f64 rs52, fd48; +} +mov.b32 r393, {rs52, rs52}; +{ +mul.f16x2 r391, r22, r393; +} +{ +add.f16x2 r394, r370, r391; +} +{ +cvt.rn.f16.f64 rs53, fd83; +} +mov.b32 r399, {rs53, rs53}; +{ +mul.f16x2 r397, r16, r399; +} +{ +add.f16x2 r400, r376, r397; +} +{ +cvt.rn.f16.f64 rs54, fd48; +} +mov.b32 r405, {rs54, rs54}; +{ +mul.f16x2 r403, r19, r405; +} +{ +add.f16x2 r406, r382, r403; +} +{ +cvt.rn.f16.f64 rs55, fd95; +} +mov.b32 r411, {rs55, rs55}; +{ +mul.f16x2 r409, r25, r411; +} +{ +add.f16x2 r412, r388, r409; +} +mov.f64 fd96, 0d3FED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs56, fd96; +} +mov.b32 r417, {rs56, rs56}; +{ +mul.f16x2 r415, r34, r417; +} +{ +add.f16x2 r418, r394, r415; +} +{ +cvt.rn.f16.f64 rs57, fd95; +} +mov.b32 r423, {rs57, rs57}; +{ +mul.f16x2 r421, r28, r423; +} +{ +add.f16x2 r424, r400, r421; +} +{ +cvt.rn.f16.f64 rs58, fd96; +} +mov.b32 r429, {rs58, rs58}; +{ +mul.f16x2 r427, r31, r429; +} +{ +add.f16x2 r430, r406, r427; +} +{ +cvt.rn.f16.f64 rs59, fd87; +} +mov.b32 r435, {rs59, rs59}; +{ +mul.f16x2 r433, r37, r435; +} +{ +add.f16x2 r436, r412, r433; +} +{ +cvt.rn.f16.f64 rs60, fd72; +} +mov.b32 r441, {rs60, rs60}; +{ +mul.f16x2 r439, r46, r441; +} +{ +add.f16x2 r442, r418, r439; +} +{ +cvt.rn.f16.f64 rs61, fd87; +} +mov.b32 r447, {rs61, rs61}; +{ +mul.f16x2 r445, r40, r447; +} +{ +add.f16x2 r448, r424, r445; +} +{ +cvt.rn.f16.f64 rs62, fd72; +} +mov.b32 r453, {rs62, rs62}; +{ +mul.f16x2 r451, r43, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs63, fd91; +} +mov.b32 r459, {rs63, rs63}; +{ +mul.f16x2 r457, 
r49, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs64, fd92; +} +mov.b32 r465, {rs64, rs64}; +{ +mul.f16x2 r463, r58, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs65, fd91; +} +mov.b32 r471, {rs65, rs65}; +{ +mul.f16x2 r469, r52, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs66, fd92; +} +mov.b32 r477, {rs66, rs66}; +{ +mul.f16x2 r475, r55, r477; +} +{ +add.f16x2 r478, r454, r475; +} +{ +sub.f16x2 %6, r460, r466; +} +{ +add.f16x2 %7, r472, r478; +} +{ +add.f16x2 %16, r460, r466; +} +{ +sub.f16x2 %17, r472, r478; +} +cvt.rn.f16.s32 rs67, r628; +mov.b32 r505, {rs67, rs67}; +cvt.rn.f16.s32 rs68, r628; +mov.b32 r517, {rs68, rs68}; +{ +cvt.rn.f16.f64 rs69, fd91; +} +mov.b32 r497, {rs69, rs69}; +{ +mul.f16x2 r495, r1, r497; +} +{ +add.f16x2 r498, %22, r495; +} +{ +cvt.rn.f16.f64 rs70, fd92; +} +mov.b32 r503, {rs70, rs70}; +{ +mul.f16x2 r501, r10, r503; +} +{ +add.f16x2 r504, r505, r501; +} +{ +cvt.rn.f16.f64 rs71, fd91; +} +mov.b32 r509, {rs71, rs71}; +{ +mul.f16x2 r507, r4, r509; +} +{ +add.f16x2 r510, %23, r507; +} +{ +cvt.rn.f16.f64 rs72, fd92; +} +mov.b32 r515, {rs72, rs72}; +{ +mul.f16x2 r513, r7, r515; +} +{ +add.f16x2 r516, r517, r513; +} +{ +cvt.rn.f16.f64 rs73, fd99; +} +mov.b32 r521, {rs73, rs73}; +{ +mul.f16x2 r519, r13, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs74, fd68; +} +mov.b32 r527, {rs74, rs74}; +{ +mul.f16x2 r525, r22, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs75, fd99; +} +mov.b32 r533, {rs75, rs75}; +{ +mul.f16x2 r531, r16, r533; +} +{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs76, fd68; +} +mov.b32 r539, {rs76, rs76}; +{ +mul.f16x2 r537, r19, r539; +} +{ +add.f16x2 r540, r516, r537; +} +{ +cvt.rn.f16.f64 rs77, fd87; +} +mov.b32 r545, {rs77, rs77}; +{ +mul.f16x2 r543, r25, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs78, fd72; +} +mov.b32 r551, {rs78, rs78}; +{ +mul.f16x2 r549, r34, r551; +} +{ +add.f16x2 r552, r528, 
r549; +} +{ +cvt.rn.f16.f64 rs79, fd87; +} +mov.b32 r557, {rs79, rs79}; +{ +mul.f16x2 r555, r28, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs80, fd72; +} +mov.b32 r563, {rs80, rs80}; +{ +mul.f16x2 r561, r31, r563; +} +{ +add.f16x2 r564, r540, r561; +} +{ +cvt.rn.f16.f64 rs81, fd83; +} +mov.b32 r569, {rs81, rs81}; +{ +mul.f16x2 r567, r37, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs82, fd84; +} +mov.b32 r575, {rs82, rs82}; +{ +mul.f16x2 r573, r46, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs83, fd83; +} +mov.b32 r581, {rs83, rs83}; +{ +mul.f16x2 r579, r40, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs84, fd84; +} +mov.b32 r587, {rs84, rs84}; +{ +mul.f16x2 r585, r43, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +cvt.rn.f16.f64 rs85, fd95; +} +mov.b32 r593, {rs85, rs85}; +{ +mul.f16x2 r591, r49, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ +cvt.rn.f16.f64 rs86, fd96; +} +mov.b32 r599, {rs86, rs86}; +{ +mul.f16x2 r597, r58, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs87, fd95; +} +mov.b32 r605, {rs87, rs87}; +{ +mul.f16x2 r603, r52, r605; +} +{ +add.f16x2 r606, r582, r603; +} +{ +cvt.rn.f16.f64 rs88, fd96; +} +mov.b32 r611, {rs88, rs88}; +{ +mul.f16x2 r609, r55, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +sub.f16x2 %8, r594, r600; +} +{ +add.f16x2 %9, r606, r612; +} +{ +add.f16x2 %14, r594, r600; +} +{ +sub.f16x2 %15, r606, r612; +} +cvt.rn.f16.s32 rs89, r628; +mov.b32 r639, {rs89, rs89}; +cvt.rn.f16.s32 rs90, r628; +mov.b32 r651, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd83; +} +mov.b32 r631, {rs91, rs91}; +{ +mul.f16x2 r629, r1, r631; +} +{ +add.f16x2 r632, %22, r629; +} +{ +cvt.rn.f16.f64 rs92, fd84; +} +mov.b32 r637, {rs92, rs92}; +{ +mul.f16x2 r635, r10, r637; +} +{ +add.f16x2 r638, r639, r635; +} +{ +cvt.rn.f16.f64 rs93, fd83; +} +mov.b32 r643, {rs93, rs93}; +{ +mul.f16x2 r641, r4, r643; +} +{ +add.f16x2 r644, %23, r641; +} +{ +cvt.rn.f16.f64 rs94, fd84; 
+} +mov.b32 r649, {rs94, rs94}; +{ +mul.f16x2 r647, r7, r649; +} +{ +add.f16x2 r650, r651, r647; +} +{ +cvt.rn.f16.f64 rs95, fd87; +} +mov.b32 r655, {rs95, rs95}; +{ +mul.f16x2 r653, r13, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs96, fd88; +} +mov.b32 r661, {rs96, rs96}; +{ +mul.f16x2 r659, r22, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs97, fd87; +} +mov.b32 r667, {rs97, rs97}; +{ +mul.f16x2 r665, r16, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs98, fd88; +} +mov.b32 r673, {rs98, rs98}; +{ +mul.f16x2 r671, r19, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs99, fd91; +} +mov.b32 r679, {rs99, rs99}; +{ +mul.f16x2 r677, r25, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs100, fd92; +} +mov.b32 r685, {rs100, rs100}; +{ +mul.f16x2 r683, r34, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs101, fd91; +} +mov.b32 r691, {rs101, rs101}; +{ +mul.f16x2 r689, r28, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs102, fd92; +} +mov.b32 r697, {rs102, rs102}; +{ +mul.f16x2 r695, r31, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs103, fd95; +} +mov.b32 r703, {rs103, rs103}; +{ +mul.f16x2 r701, r37, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs104, fd96; +} +mov.b32 r709, {rs104, rs104}; +{ +mul.f16x2 r707, r46, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs105, fd95; +} +mov.b32 r715, {rs105, rs105}; +{ +mul.f16x2 r713, r40, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs106, fd96; +} +mov.b32 r721, {rs106, rs106}; +{ +mul.f16x2 r719, r43, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs107, fd99; +} +mov.b32 r727, {rs107, rs107}; +{ +mul.f16x2 r725, r49, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs108, fd100; +} +mov.b32 r733, {rs108, rs108}; +{ +mul.f16x2 r731, r58, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 
rs109, fd99; +} +mov.b32 r739, {rs109, rs109}; +{ +mul.f16x2 r737, r52, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs110, fd100; +} +mov.b32 r745, {rs110, rs110}; +{ +mul.f16x2 r743, r55, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +sub.f16x2 %10, r728, r734; +} +{ +add.f16x2 %11, r740, r746; +} +{ +add.f16x2 %12, r728, r734; +} +{ +sub.f16x2 %13, r740, r746; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..466423d805a5d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp16_inv.hpp.inc @@ -0,0 +1,1203 @@ +#ifndef CUFFTDX_FFT_11_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_11_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1126, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<111>; +.reg .b32 r<761>; +.reg .f64 fd<101>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %24, %42; +} +{ +add.f16x2 r4, %25, %43; +} +{ +sub.f16x2 r7, %24, %42; +} +{ +sub.f16x2 r10, %25, %43; +} +{ +add.f16x2 r13, %26, %40; +} +{ +add.f16x2 r16, %27, %41; +} +{ +sub.f16x2 r19, %26, %40; +} +{ +sub.f16x2 r22, %27, %41; +} +{ +add.f16x2 r25, %28, %38; +} +{ +add.f16x2 r28, %29, %39; +} +{ +sub.f16x2 r31, %28, %38; +} +{ +sub.f16x2 r34, %29, %39; +} +{ +add.f16x2 r37, %30, %36; +} +{ +add.f16x2 r40, %31, %37; +} +{ +sub.f16x2 r43, %30, %36; +} +{ +sub.f16x2 r46, %31, %37; +} +{ +add.f16x2 r49, %32, %34; +} +{ +add.f16x2 r52, %33, %35; +} +{ +sub.f16x2 r55, %32, %34; +} +{ +sub.f16x2 r58, %33, %35; +} +{ +add.f16x2 r61, %22, r1; +} +{ +add.f16x2 r64, %23, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 %0, r79, r49; +} +{ +add.f16x2 %1, r82, r52; +} +mov.u32 r628, 0; +cvt.rn.f16.s32 rs1, r628; +mov.b32 r103, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r628; +mov.b32 r115, {rs2, rs2}; +mov.f64 fd87, 0d3FEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs3, fd87; +} +mov.b32 r95, {rs3, rs3}; +{ +mul.f16x2 r93, r1, r95; +} +{ +add.f16x2 r96, %22, r93; +} +mov.f64 fd72, 0d3FE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs4, fd72; +} +mov.b32 r101, 
{rs4, rs4}; +{ +mul.f16x2 r99, r10, r101; +} +{ +add.f16x2 r102, r103, r99; +} +{ +cvt.rn.f16.f64 rs5, fd87; +} +mov.b32 r107, {rs5, rs5}; +{ +mul.f16x2 r105, r4, r107; +} +{ +add.f16x2 r108, %23, r105; +} +{ +cvt.rn.f16.f64 rs6, fd72; +} +mov.b32 r113, {rs6, rs6}; +{ +mul.f16x2 r111, r7, r113; +} +{ +add.f16x2 r114, r115, r111; +} +mov.f64 fd95, 0d3FDA9628D9C712B6; +{ +cvt.rn.f16.f64 rs7, fd95; +} +mov.b32 r119, {rs7, rs7}; +{ +mul.f16x2 r117, r13, r119; +} +{ +add.f16x2 r120, r96, r117; +} +mov.f64 fd24, 0d3FED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs8, fd24; +} +mov.b32 r125, {rs8, rs8}; +{ +mul.f16x2 r123, r22, r125; +} +{ +add.f16x2 r126, r102, r123; +} +{ +cvt.rn.f16.f64 rs9, fd95; +} +mov.b32 r131, {rs9, rs9}; +{ +mul.f16x2 r129, r16, r131; +} +{ +add.f16x2 r132, r108, r129; +} +{ +cvt.rn.f16.f64 rs10, fd24; +} +mov.b32 r137, {rs10, rs10}; +{ +mul.f16x2 r135, r19, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd99, 0dBFC2375F640F44DB; +{ +cvt.rn.f16.f64 rs11, fd99; +} +mov.b32 r143, {rs11, rs11}; +{ +mul.f16x2 r141, r25, r143; +} +{ +add.f16x2 r144, r120, r141; +} +mov.f64 fd100, 0d3FEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs12, fd100; +} +mov.b32 r149, {rs12, rs12}; +{ +mul.f16x2 r147, r34, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs13, fd99; +} +mov.b32 r155, {rs13, rs13}; +{ +mul.f16x2 r153, r28, r155; +} +{ +add.f16x2 r156, r132, r153; +} +{ +cvt.rn.f16.f64 rs14, fd100; +} +mov.b32 r161, {rs14, rs14}; +{ +mul.f16x2 r159, r31, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd91, 0dBFE4F49E7F775887; +{ +cvt.rn.f16.f64 rs15, fd91; +} +mov.b32 r167, {rs15, rs15}; +{ +mul.f16x2 r165, r37, r167; +} +{ +add.f16x2 r168, r144, r165; +} +mov.f64 fd92, 0d3FE82F19BB3A28A1; +{ +cvt.rn.f16.f64 rs16, fd92; +} +mov.b32 r173, {rs16, rs16}; +{ +mul.f16x2 r171, r46, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs17, fd91; +} +mov.b32 r179, {rs17, rs17}; +{ +mul.f16x2 r177, r40, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ 
+cvt.rn.f16.f64 rs18, fd92; +} +mov.b32 r185, {rs18, rs18}; +{ +mul.f16x2 r183, r43, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd83, 0dBFEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs19, fd83; +} +mov.b32 r191, {rs19, rs19}; +{ +mul.f16x2 r189, r49, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd84, 0d3FD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs20, fd84; +} +mov.b32 r197, {rs20, rs20}; +{ +mul.f16x2 r195, r58, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs21, fd83; +} +mov.b32 r203, {rs21, rs21}; +{ +mul.f16x2 r201, r52, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs22, fd84; +} +mov.b32 r209, {rs22, rs22}; +{ +mul.f16x2 r207, r55, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +sub.f16x2 %2, r192, r198; +} +{ +add.f16x2 %3, r204, r210; +} +{ +add.f16x2 %20, r192, r198; +} +{ +sub.f16x2 %21, r204, r210; +} +cvt.rn.f16.s32 rs23, r628; +mov.b32 r237, {rs23, rs23}; +cvt.rn.f16.s32 rs24, r628; +mov.b32 r249, {rs24, rs24}; +{ +cvt.rn.f16.f64 rs25, fd95; +} +mov.b32 r229, {rs25, rs25}; +{ +mul.f16x2 r227, r1, r229; +} +{ +add.f16x2 r230, %22, r227; +} +{ +cvt.rn.f16.f64 rs26, fd24; +} +mov.b32 r235, {rs26, rs26}; +{ +mul.f16x2 r233, r10, r235; +} +{ +add.f16x2 r236, r237, r233; +} +{ +cvt.rn.f16.f64 rs27, fd95; +} +mov.b32 r241, {rs27, rs27}; +{ +mul.f16x2 r239, r4, r241; +} +{ +add.f16x2 r242, %23, r239; +} +{ +cvt.rn.f16.f64 rs28, fd24; +} +mov.b32 r247, {rs28, rs28}; +{ +mul.f16x2 r245, r7, r247; +} +{ +add.f16x2 r248, r249, r245; +} +{ +cvt.rn.f16.f64 rs29, fd91; +} +mov.b32 r253, {rs29, rs29}; +{ +mul.f16x2 r251, r13, r253; +} +{ +add.f16x2 r254, r230, r251; +} +{ +cvt.rn.f16.f64 rs30, fd92; +} +mov.b32 r259, {rs30, rs30}; +{ +mul.f16x2 r257, r22, r259; +} +{ +add.f16x2 r260, r236, r257; +} +{ +cvt.rn.f16.f64 rs31, fd91; +} +mov.b32 r265, {rs31, rs31}; +{ +mul.f16x2 r263, r16, r265; +} +{ +add.f16x2 r266, r242, r263; +} +{ +cvt.rn.f16.f64 rs32, fd92; +} +mov.b32 r271, {rs32, rs32}; +{ +mul.f16x2 r269, r19, r271; +} +{ 
+add.f16x2 r272, r248, r269; +} +{ +cvt.rn.f16.f64 rs33, fd83; +} +mov.b32 r277, {rs33, rs33}; +{ +mul.f16x2 r275, r25, r277; +} +{ +add.f16x2 r278, r254, r275; +} +mov.f64 fd48, 0dBFD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs34, fd48; +} +mov.b32 r283, {rs34, rs34}; +{ +mul.f16x2 r281, r34, r283; +} +{ +add.f16x2 r284, r260, r281; +} +{ +cvt.rn.f16.f64 rs35, fd83; +} +mov.b32 r289, {rs35, rs35}; +{ +mul.f16x2 r287, r28, r289; +} +{ +add.f16x2 r290, r266, r287; +} +{ +cvt.rn.f16.f64 rs36, fd48; +} +mov.b32 r295, {rs36, rs36}; +{ +mul.f16x2 r293, r31, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs37, fd99; +} +mov.b32 r301, {rs37, rs37}; +{ +mul.f16x2 r299, r37, r301; +} +{ +add.f16x2 r302, r278, r299; +} +mov.f64 fd68, 0dBFEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs38, fd68; +} +mov.b32 r307, {rs38, rs38}; +{ +mul.f16x2 r305, r46, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs39, fd99; +} +mov.b32 r313, {rs39, rs39}; +{ +mul.f16x2 r311, r40, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs40, fd68; +} +mov.b32 r319, {rs40, rs40}; +{ +mul.f16x2 r317, r43, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs41, fd87; +} +mov.b32 r325, {rs41, rs41}; +{ +mul.f16x2 r323, r49, r325; +} +{ +add.f16x2 r326, r302, r323; +} +mov.f64 fd88, 0dBFE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs42, fd88; +} +mov.b32 r331, {rs42, rs42}; +{ +mul.f16x2 r329, r58, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs43, fd87; +} +mov.b32 r337, {rs43, rs43}; +{ +mul.f16x2 r335, r52, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs44, fd88; +} +mov.b32 r343, {rs44, rs44}; +{ +mul.f16x2 r341, r55, r343; +} +{ +add.f16x2 r344, r320, r341; +} +{ +sub.f16x2 %4, r326, r332; +} +{ +add.f16x2 %5, r338, r344; +} +{ +add.f16x2 %18, r326, r332; +} +{ +sub.f16x2 %19, r338, r344; +} +cvt.rn.f16.s32 rs45, r628; +mov.b32 r371, {rs45, rs45}; +cvt.rn.f16.s32 rs46, r628; +mov.b32 r383, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd99; 
+} +mov.b32 r363, {rs47, rs47}; +{ +mul.f16x2 r361, r1, r363; +} +{ +add.f16x2 r364, %22, r361; +} +{ +cvt.rn.f16.f64 rs48, fd100; +} +mov.b32 r369, {rs48, rs48}; +{ +mul.f16x2 r367, r10, r369; +} +{ +add.f16x2 r370, r371, r367; +} +{ +cvt.rn.f16.f64 rs49, fd99; +} +mov.b32 r375, {rs49, rs49}; +{ +mul.f16x2 r373, r4, r375; +} +{ +add.f16x2 r376, %23, r373; +} +{ +cvt.rn.f16.f64 rs50, fd100; +} +mov.b32 r381, {rs50, rs50}; +{ +mul.f16x2 r379, r7, r381; +} +{ +add.f16x2 r382, r383, r379; +} +{ +cvt.rn.f16.f64 rs51, fd83; +} +mov.b32 r387, {rs51, rs51}; +{ +mul.f16x2 r385, r13, r387; +} +{ +add.f16x2 r388, r364, r385; +} +{ +cvt.rn.f16.f64 rs52, fd48; +} +mov.b32 r393, {rs52, rs52}; +{ +mul.f16x2 r391, r22, r393; +} +{ +add.f16x2 r394, r370, r391; +} +{ +cvt.rn.f16.f64 rs53, fd83; +} +mov.b32 r399, {rs53, rs53}; +{ +mul.f16x2 r397, r16, r399; +} +{ +add.f16x2 r400, r376, r397; +} +{ +cvt.rn.f16.f64 rs54, fd48; +} +mov.b32 r405, {rs54, rs54}; +{ +mul.f16x2 r403, r19, r405; +} +{ +add.f16x2 r406, r382, r403; +} +{ +cvt.rn.f16.f64 rs55, fd95; +} +mov.b32 r411, {rs55, rs55}; +{ +mul.f16x2 r409, r25, r411; +} +{ +add.f16x2 r412, r388, r409; +} +mov.f64 fd96, 0dBFED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs56, fd96; +} +mov.b32 r417, {rs56, rs56}; +{ +mul.f16x2 r415, r34, r417; +} +{ +add.f16x2 r418, r394, r415; +} +{ +cvt.rn.f16.f64 rs57, fd95; +} +mov.b32 r423, {rs57, rs57}; +{ +mul.f16x2 r421, r28, r423; +} +{ +add.f16x2 r424, r400, r421; +} +{ +cvt.rn.f16.f64 rs58, fd96; +} +mov.b32 r429, {rs58, rs58}; +{ +mul.f16x2 r427, r31, r429; +} +{ +add.f16x2 r430, r406, r427; +} +{ +cvt.rn.f16.f64 rs59, fd87; +} +mov.b32 r435, {rs59, rs59}; +{ +mul.f16x2 r433, r37, r435; +} +{ +add.f16x2 r436, r412, r433; +} +{ +cvt.rn.f16.f64 rs60, fd72; +} +mov.b32 r441, {rs60, rs60}; +{ +mul.f16x2 r439, r46, r441; +} +{ +add.f16x2 r442, r418, r439; +} +{ +cvt.rn.f16.f64 rs61, fd87; +} +mov.b32 r447, {rs61, rs61}; +{ +mul.f16x2 r445, r40, r447; +} +{ +add.f16x2 r448, r424, r445; +} +{ +cvt.rn.f16.f64 
rs62, fd72; +} +mov.b32 r453, {rs62, rs62}; +{ +mul.f16x2 r451, r43, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs63, fd91; +} +mov.b32 r459, {rs63, rs63}; +{ +mul.f16x2 r457, r49, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs64, fd92; +} +mov.b32 r465, {rs64, rs64}; +{ +mul.f16x2 r463, r58, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs65, fd91; +} +mov.b32 r471, {rs65, rs65}; +{ +mul.f16x2 r469, r52, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs66, fd92; +} +mov.b32 r477, {rs66, rs66}; +{ +mul.f16x2 r475, r55, r477; +} +{ +add.f16x2 r478, r454, r475; +} +{ +sub.f16x2 %6, r460, r466; +} +{ +add.f16x2 %7, r472, r478; +} +{ +add.f16x2 %16, r460, r466; +} +{ +sub.f16x2 %17, r472, r478; +} +cvt.rn.f16.s32 rs67, r628; +mov.b32 r505, {rs67, rs67}; +cvt.rn.f16.s32 rs68, r628; +mov.b32 r517, {rs68, rs68}; +{ +cvt.rn.f16.f64 rs69, fd91; +} +mov.b32 r497, {rs69, rs69}; +{ +mul.f16x2 r495, r1, r497; +} +{ +add.f16x2 r498, %22, r495; +} +{ +cvt.rn.f16.f64 rs70, fd92; +} +mov.b32 r503, {rs70, rs70}; +{ +mul.f16x2 r501, r10, r503; +} +{ +add.f16x2 r504, r505, r501; +} +{ +cvt.rn.f16.f64 rs71, fd91; +} +mov.b32 r509, {rs71, rs71}; +{ +mul.f16x2 r507, r4, r509; +} +{ +add.f16x2 r510, %23, r507; +} +{ +cvt.rn.f16.f64 rs72, fd92; +} +mov.b32 r515, {rs72, rs72}; +{ +mul.f16x2 r513, r7, r515; +} +{ +add.f16x2 r516, r517, r513; +} +{ +cvt.rn.f16.f64 rs73, fd99; +} +mov.b32 r521, {rs73, rs73}; +{ +mul.f16x2 r519, r13, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs74, fd68; +} +mov.b32 r527, {rs74, rs74}; +{ +mul.f16x2 r525, r22, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs75, fd99; +} +mov.b32 r533, {rs75, rs75}; +{ +mul.f16x2 r531, r16, r533; +} +{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs76, fd68; +} +mov.b32 r539, {rs76, rs76}; +{ +mul.f16x2 r537, r19, r539; +} +{ +add.f16x2 r540, r516, r537; +} +{ +cvt.rn.f16.f64 rs77, fd87; +} +mov.b32 r545, {rs77, 
rs77}; +{ +mul.f16x2 r543, r25, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs78, fd72; +} +mov.b32 r551, {rs78, rs78}; +{ +mul.f16x2 r549, r34, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs79, fd87; +} +mov.b32 r557, {rs79, rs79}; +{ +mul.f16x2 r555, r28, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs80, fd72; +} +mov.b32 r563, {rs80, rs80}; +{ +mul.f16x2 r561, r31, r563; +} +{ +add.f16x2 r564, r540, r561; +} +{ +cvt.rn.f16.f64 rs81, fd83; +} +mov.b32 r569, {rs81, rs81}; +{ +mul.f16x2 r567, r37, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs82, fd84; +} +mov.b32 r575, {rs82, rs82}; +{ +mul.f16x2 r573, r46, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs83, fd83; +} +mov.b32 r581, {rs83, rs83}; +{ +mul.f16x2 r579, r40, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs84, fd84; +} +mov.b32 r587, {rs84, rs84}; +{ +mul.f16x2 r585, r43, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +cvt.rn.f16.f64 rs85, fd95; +} +mov.b32 r593, {rs85, rs85}; +{ +mul.f16x2 r591, r49, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ +cvt.rn.f16.f64 rs86, fd96; +} +mov.b32 r599, {rs86, rs86}; +{ +mul.f16x2 r597, r58, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs87, fd95; +} +mov.b32 r605, {rs87, rs87}; +{ +mul.f16x2 r603, r52, r605; +} +{ +add.f16x2 r606, r582, r603; +} +{ +cvt.rn.f16.f64 rs88, fd96; +} +mov.b32 r611, {rs88, rs88}; +{ +mul.f16x2 r609, r55, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +sub.f16x2 %8, r594, r600; +} +{ +add.f16x2 %9, r606, r612; +} +{ +add.f16x2 %14, r594, r600; +} +{ +sub.f16x2 %15, r606, r612; +} +cvt.rn.f16.s32 rs89, r628; +mov.b32 r639, {rs89, rs89}; +cvt.rn.f16.s32 rs90, r628; +mov.b32 r651, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd83; +} +mov.b32 r631, {rs91, rs91}; +{ +mul.f16x2 r629, r1, r631; +} +{ +add.f16x2 r632, %22, r629; +} +{ +cvt.rn.f16.f64 rs92, fd84; +} +mov.b32 r637, {rs92, rs92}; +{ +mul.f16x2 r635, r10, r637; 
+} +{ +add.f16x2 r638, r639, r635; +} +{ +cvt.rn.f16.f64 rs93, fd83; +} +mov.b32 r643, {rs93, rs93}; +{ +mul.f16x2 r641, r4, r643; +} +{ +add.f16x2 r644, %23, r641; +} +{ +cvt.rn.f16.f64 rs94, fd84; +} +mov.b32 r649, {rs94, rs94}; +{ +mul.f16x2 r647, r7, r649; +} +{ +add.f16x2 r650, r651, r647; +} +{ +cvt.rn.f16.f64 rs95, fd87; +} +mov.b32 r655, {rs95, rs95}; +{ +mul.f16x2 r653, r13, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs96, fd88; +} +mov.b32 r661, {rs96, rs96}; +{ +mul.f16x2 r659, r22, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs97, fd87; +} +mov.b32 r667, {rs97, rs97}; +{ +mul.f16x2 r665, r16, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs98, fd88; +} +mov.b32 r673, {rs98, rs98}; +{ +mul.f16x2 r671, r19, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs99, fd91; +} +mov.b32 r679, {rs99, rs99}; +{ +mul.f16x2 r677, r25, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs100, fd92; +} +mov.b32 r685, {rs100, rs100}; +{ +mul.f16x2 r683, r34, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs101, fd91; +} +mov.b32 r691, {rs101, rs101}; +{ +mul.f16x2 r689, r28, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs102, fd92; +} +mov.b32 r697, {rs102, rs102}; +{ +mul.f16x2 r695, r31, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs103, fd95; +} +mov.b32 r703, {rs103, rs103}; +{ +mul.f16x2 r701, r37, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs104, fd96; +} +mov.b32 r709, {rs104, rs104}; +{ +mul.f16x2 r707, r46, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs105, fd95; +} +mov.b32 r715, {rs105, rs105}; +{ +mul.f16x2 r713, r40, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs106, fd96; +} +mov.b32 r721, {rs106, rs106}; +{ +mul.f16x2 r719, r43, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs107, fd99; +} +mov.b32 r727, {rs107, rs107}; +{ +mul.f16x2 r725, r49, 
r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs108, fd100; +} +mov.b32 r733, {rs108, rs108}; +{ +mul.f16x2 r731, r58, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs109, fd99; +} +mov.b32 r739, {rs109, rs109}; +{ +mul.f16x2 r737, r52, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs110, fd100; +} +mov.b32 r745, {rs110, rs110}; +{ +mul.f16x2 r743, r55, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +sub.f16x2 %10, r728, r734; +} +{ +add.f16x2 %11, r740, r746; +} +{ +add.f16x2 %12, r728, r734; +} +{ +sub.f16x2 %13, r740, r746; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + 
+#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..9ad0d95d02717 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_fwd.hpp.inc @@ -0,0 +1,166 @@ +#ifndef CUFFTDX_FFT_11_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_11_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<178, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<195>; +.reg .b64 rd<2>; +add.f32 f45, %24, %48; +add.f32 f46, %26, %49; +sub.f32 f47, %24, %48; +sub.f32 f48, %26, %49; +add.f32 f49, %27, %46; +add.f32 f50, %29, %47; +sub.f32 f51, %27, %46; +sub.f32 f52, %29, %47; +add.f32 f53, %30, %43; +add.f32 f54, %31, %45; +sub.f32 f55, %30, %43; +sub.f32 f56, %31, %45; +add.f32 f57, %32, %40; +add.f32 f58, %34, %42; +sub.f32 f59, %32, %40; +sub.f32 f60, %34, %42; +add.f32 f61, %35, %38; +add.f32 f62, %37, %39; +sub.f32 f63, %35, %38; +sub.f32 f64, %37, %39; +add.f32 f65, %22, f45; +add.f32 f66, %23, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %22; +fma.rn.f32 f74, f48, 0fBF0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %23; +fma.rn.f32 f76, f47, 0fBF0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0fBF68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0fBF68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0fBF7D64F0, f78; +fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, f55, 0fBF7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0fBF4178CE, 
f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0fBF4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0fBE903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0fBE903F40, f88; +fma.rn.f32 f93, f45, 0f3ED4B147, %22; +fma.rn.f32 f94, f48, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f95, f46, 0f3ED4B147, %23; +fma.rn.f32 f96, f47, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f97, f49, 0fBF27A4F4, f93; +fma.rn.f32 f98, f52, 0fBF4178CE, f94; +fma.rn.f32 f99, f50, 0fBF27A4F4, f95; +fma.rn.f32 f100, f51, 0fBF4178CE, f96; +fma.rn.f32 f101, f53, 0fBF75A155, f97; +fma.rn.f32 f102, f56, 0f3E903F40, f98; +fma.rn.f32 f103, f54, 0fBF75A155, f99; +fma.rn.f32 f104, f55, 0f3E903F40, f100; +fma.rn.f32 f105, f57, 0fBE11BAFB, f101; +fma.rn.f32 f106, f60, 0f3F7D64F0, f102; +fma.rn.f32 f107, f58, 0fBE11BAFB, f103; +fma.rn.f32 f108, f59, 0f3F7D64F0, f104; +fma.rn.f32 f109, f61, 0f3F575C64, f105; +fma.rn.f32 f110, f64, 0f3F0A6770, f106; +fma.rn.f32 f111, f62, 0f3F575C64, f107; +fma.rn.f32 f112, f63, 0f3F0A6770, f108; +fma.rn.f32 f113, f45, 0fBE11BAFB, %22; +fma.rn.f32 f114, f48, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f115, f46, 0fBE11BAFB, %23; +fma.rn.f32 f116, f47, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f117, f49, 0fBF75A155, f113; +fma.rn.f32 f118, f52, 0f3E903F40, f114; +fma.rn.f32 f119, f50, 0fBF75A155, f115; +fma.rn.f32 f120, f51, 0f3E903F40, f116; +fma.rn.f32 f121, f53, 0f3ED4B147, f117; +fma.rn.f32 f122, f56, 0f3F68DDA4, f118; +fma.rn.f32 f123, f54, 0f3ED4B147, f119; +fma.rn.f32 f124, f55, 0f3F68DDA4, f120; +fma.rn.f32 f125, f57, 0f3F575C64, f121; +fma.rn.f32 f126, f60, 0fBF0A6770, f122; +fma.rn.f32 f127, f58, 0f3F575C64, f123; +fma.rn.f32 f128, f59, 0fBF0A6770, f124; +fma.rn.f32 f129, f61, 0fBF27A4F4, f125; +fma.rn.f32 f130, f64, 0fBF4178CE, f126; +fma.rn.f32 f131, f62, 0fBF27A4F4, f127; +fma.rn.f32 f132, f63, 0fBF4178CE, f128; +fma.rn.f32 f133, f45, 0fBF27A4F4, %22; +fma.rn.f32 f134, f48, 0fBF4178CE, 0f00000000; +fma.rn.f32 f135, f46, 
0fBF27A4F4, %23; +fma.rn.f32 f136, f47, 0fBF4178CE, 0f00000000; +fma.rn.f32 f137, f49, 0fBE11BAFB, f133; +fma.rn.f32 f138, f52, 0f3F7D64F0, f134; +fma.rn.f32 f139, f50, 0fBE11BAFB, f135; +fma.rn.f32 f140, f51, 0f3F7D64F0, f136; +fma.rn.f32 f141, f53, 0f3F575C64, f137; +fma.rn.f32 f142, f56, 0fBF0A6770, f138; +fma.rn.f32 f143, f54, 0f3F575C64, f139; +fma.rn.f32 f144, f55, 0fBF0A6770, f140; +fma.rn.f32 f145, f57, 0fBF75A155, f141; +fma.rn.f32 f146, f60, 0fBE903F40, f142; +fma.rn.f32 f147, f58, 0fBF75A155, f143; +fma.rn.f32 f148, f59, 0fBE903F40, f144; +fma.rn.f32 f149, f61, 0f3ED4B147, f145; +fma.rn.f32 f150, f64, 0f3F68DDA4, f146; +fma.rn.f32 f151, f62, 0f3ED4B147, f147; +fma.rn.f32 f152, f63, 0f3F68DDA4, f148; +fma.rn.f32 f153, f45, 0fBF75A155, %22; +fma.rn.f32 f154, f48, 0fBE903F40, 0f00000000; +fma.rn.f32 f155, f46, 0fBF75A155, %23; +fma.rn.f32 f156, f47, 0fBE903F40, 0f00000000; +fma.rn.f32 f157, f49, 0f3F575C64, f153; +fma.rn.f32 f158, f52, 0f3F0A6770, f154; +fma.rn.f32 f159, f50, 0f3F575C64, f155; +fma.rn.f32 f160, f51, 0f3F0A6770, f156; +fma.rn.f32 f161, f53, 0fBF27A4F4, f157; +fma.rn.f32 f162, f56, 0fBF4178CE, f158; +fma.rn.f32 f163, f54, 0fBF27A4F4, f159; +fma.rn.f32 f164, f55, 0fBF4178CE, f160; +fma.rn.f32 f165, f57, 0f3ED4B147, f161; +fma.rn.f32 f166, f60, 0f3F68DDA4, f162; +fma.rn.f32 f167, f58, 0f3ED4B147, f163; +fma.rn.f32 f168, f59, 0f3F68DDA4, f164; +fma.rn.f32 f169, f61, 0fBE11BAFB, f165; +fma.rn.f32 f170, f64, 0fBF7D64F0, f166; +fma.rn.f32 f171, f62, 0fBE11BAFB, f167; +fma.rn.f32 f172, f63, 0fBF7D64F0, f168; +add.f32 %1, f72, f62; +add.f32 %0, f71, f61; +add.f32 %3, f91, f92; +sub.f32 %2, f89, f90; +add.f32 %5, f111, f112; +sub.f32 %4, f109, f110; +add.f32 %7, f131, f132; +sub.f32 %6, f129, f130; +add.f32 %9, f151, f152; +sub.f32 %8, f149, f150; +add.f32 %11, f171, f172; +sub.f32 %10, f169, f170; +sub.f32 %13, f171, f172; +add.f32 %12, f169, f170; +sub.f32 %15, f151, f152; +add.f32 %14, f149, f150; +sub.f32 %17, f131, f132; +add.f32 %16, f129, f130; 
+sub.f32 %19, f111, f112; +add.f32 %18, f109, f110; +sub.f32 %21, f91, f92; +add.f32 %20, f89, f90; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..e571df5d5de24 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp32_inv.hpp.inc @@ -0,0 +1,166 @@ +#ifndef CUFFTDX_FFT_11_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_11_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<380, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<195>; +.reg .b64 rd<2>; +add.f32 f45, %24, %48; +add.f32 f46, %26, %49; +sub.f32 f47, %24, %48; +sub.f32 f48, %26, %49; +add.f32 f49, %27, %46; +add.f32 f50, %29, %47; +sub.f32 f51, %27, %46; +sub.f32 f52, %29, %47; +add.f32 f53, %30, %43; +add.f32 f54, %31, %45; +sub.f32 f55, %30, %43; +sub.f32 f56, %31, 
%45; +add.f32 f57, %32, %40; +add.f32 f58, %34, %42; +sub.f32 f59, %32, %40; +sub.f32 f60, %34, %42; +add.f32 f61, %35, %38; +add.f32 f62, %37, %39; +sub.f32 f63, %35, %38; +sub.f32 f64, %37, %39; +add.f32 f65, %22, f45; +add.f32 f66, %23, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %22; +fma.rn.f32 f74, f48, 0f3F0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %23; +fma.rn.f32 f76, f47, 0f3F0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0f3F68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0f3F68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0f3F7D64F0, f78; +fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, f55, 0f3F7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0f3F4178CE, f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0f3F4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0f3E903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0f3E903F40, f88; +fma.rn.f32 f93, f45, 0f3ED4B147, %22; +fma.rn.f32 f94, f48, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f95, f46, 0f3ED4B147, %23; +fma.rn.f32 f96, f47, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f97, f49, 0fBF27A4F4, f93; +fma.rn.f32 f98, f52, 0f3F4178CE, f94; +fma.rn.f32 f99, f50, 0fBF27A4F4, f95; +fma.rn.f32 f100, f51, 0f3F4178CE, f96; +fma.rn.f32 f101, f53, 0fBF75A155, f97; +fma.rn.f32 f102, f56, 0fBE903F40, f98; +fma.rn.f32 f103, f54, 0fBF75A155, f99; +fma.rn.f32 f104, f55, 0fBE903F40, f100; +fma.rn.f32 f105, f57, 0fBE11BAFB, f101; +fma.rn.f32 f106, f60, 0fBF7D64F0, f102; +fma.rn.f32 f107, f58, 0fBE11BAFB, f103; +fma.rn.f32 f108, f59, 0fBF7D64F0, f104; +fma.rn.f32 f109, f61, 0f3F575C64, f105; +fma.rn.f32 f110, f64, 0fBF0A6770, f106; +fma.rn.f32 f111, f62, 0f3F575C64, f107; +fma.rn.f32 f112, f63, 0fBF0A6770, f108; 
+fma.rn.f32 f113, f45, 0fBE11BAFB, %22; +fma.rn.f32 f114, f48, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f115, f46, 0fBE11BAFB, %23; +fma.rn.f32 f116, f47, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f117, f49, 0fBF75A155, f113; +fma.rn.f32 f118, f52, 0fBE903F40, f114; +fma.rn.f32 f119, f50, 0fBF75A155, f115; +fma.rn.f32 f120, f51, 0fBE903F40, f116; +fma.rn.f32 f121, f53, 0f3ED4B147, f117; +fma.rn.f32 f122, f56, 0fBF68DDA4, f118; +fma.rn.f32 f123, f54, 0f3ED4B147, f119; +fma.rn.f32 f124, f55, 0fBF68DDA4, f120; +fma.rn.f32 f125, f57, 0f3F575C64, f121; +fma.rn.f32 f126, f60, 0f3F0A6770, f122; +fma.rn.f32 f127, f58, 0f3F575C64, f123; +fma.rn.f32 f128, f59, 0f3F0A6770, f124; +fma.rn.f32 f129, f61, 0fBF27A4F4, f125; +fma.rn.f32 f130, f64, 0f3F4178CE, f126; +fma.rn.f32 f131, f62, 0fBF27A4F4, f127; +fma.rn.f32 f132, f63, 0f3F4178CE, f128; +fma.rn.f32 f133, f45, 0fBF27A4F4, %22; +fma.rn.f32 f134, f48, 0f3F4178CE, 0f00000000; +fma.rn.f32 f135, f46, 0fBF27A4F4, %23; +fma.rn.f32 f136, f47, 0f3F4178CE, 0f00000000; +fma.rn.f32 f137, f49, 0fBE11BAFB, f133; +fma.rn.f32 f138, f52, 0fBF7D64F0, f134; +fma.rn.f32 f139, f50, 0fBE11BAFB, f135; +fma.rn.f32 f140, f51, 0fBF7D64F0, f136; +fma.rn.f32 f141, f53, 0f3F575C64, f137; +fma.rn.f32 f142, f56, 0f3F0A6770, f138; +fma.rn.f32 f143, f54, 0f3F575C64, f139; +fma.rn.f32 f144, f55, 0f3F0A6770, f140; +fma.rn.f32 f145, f57, 0fBF75A155, f141; +fma.rn.f32 f146, f60, 0f3E903F40, f142; +fma.rn.f32 f147, f58, 0fBF75A155, f143; +fma.rn.f32 f148, f59, 0f3E903F40, f144; +fma.rn.f32 f149, f61, 0f3ED4B147, f145; +fma.rn.f32 f150, f64, 0fBF68DDA4, f146; +fma.rn.f32 f151, f62, 0f3ED4B147, f147; +fma.rn.f32 f152, f63, 0fBF68DDA4, f148; +fma.rn.f32 f153, f45, 0fBF75A155, %22; +fma.rn.f32 f154, f48, 0f3E903F40, 0f00000000; +fma.rn.f32 f155, f46, 0fBF75A155, %23; +fma.rn.f32 f156, f47, 0f3E903F40, 0f00000000; +fma.rn.f32 f157, f49, 0f3F575C64, f153; +fma.rn.f32 f158, f52, 0fBF0A6770, f154; +fma.rn.f32 f159, f50, 0f3F575C64, f155; +fma.rn.f32 f160, f51, 0fBF0A6770, f156; 
+fma.rn.f32 f161, f53, 0fBF27A4F4, f157; +fma.rn.f32 f162, f56, 0f3F4178CE, f158; +fma.rn.f32 f163, f54, 0fBF27A4F4, f159; +fma.rn.f32 f164, f55, 0f3F4178CE, f160; +fma.rn.f32 f165, f57, 0f3ED4B147, f161; +fma.rn.f32 f166, f60, 0fBF68DDA4, f162; +fma.rn.f32 f167, f58, 0f3ED4B147, f163; +fma.rn.f32 f168, f59, 0fBF68DDA4, f164; +fma.rn.f32 f169, f61, 0fBE11BAFB, f165; +fma.rn.f32 f170, f64, 0f3F7D64F0, f166; +fma.rn.f32 f171, f62, 0fBE11BAFB, f167; +fma.rn.f32 f172, f63, 0f3F7D64F0, f168; +add.f32 %1, f72, f62; +add.f32 %0, f71, f61; +add.f32 %3, f91, f92; +sub.f32 %2, f89, f90; +add.f32 %5, f111, f112; +sub.f32 %4, f109, f110; +add.f32 %7, f131, f132; +sub.f32 %6, f129, f130; +add.f32 %9, f151, f152; +sub.f32 %8, f149, f150; +add.f32 %11, f171, f172; +sub.f32 %10, f169, f170; +sub.f32 %13, f171, f172; +add.f32 %12, f169, f170; +sub.f32 %15, f151, f152; +add.f32 %14, f149, f150; +sub.f32 %17, f131, f132; +add.f32 %16, f129, f130; +sub.f32 %19, f111, f112; +add.f32 %18, f109, f110; +sub.f32 %21, f91, f92; +add.f32 %20, f89, f90; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..7b96460a75726 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_fwd.hpp.inc @@ -0,0 +1,166 @@ +#ifndef CUFFTDX_FFT_11_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_11_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<553, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<195>; +.reg .b64 rd<2>; +add.f64 fd45, %24, %48; +add.f64 fd46, %26, %49; +sub.f64 fd47, %24, %48; +sub.f64 fd48, %26, %49; +add.f64 fd49, %27, %46; +add.f64 fd50, %29, %47; +sub.f64 fd51, %27, %46; +sub.f64 fd52, %29, %47; +add.f64 fd53, %30, %43; +add.f64 fd54, %31, %45; +sub.f64 fd55, %30, %43; +sub.f64 fd56, %31, %45; +add.f64 fd57, %32, %40; +add.f64 fd58, %34, %42; +sub.f64 fd59, %32, %40; +sub.f64 fd60, %34, %42; +add.f64 fd61, %35, %38; +add.f64 fd62, %37, %39; +sub.f64 fd63, %35, %38; +sub.f64 fd64, %37, %39; +add.f64 fd65, %22, fd45; +add.f64 fd66, %23, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %22; +fma.rn.f64 fd74, fd48, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %23; +fma.rn.f64 fd76, fd47, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 0dBFED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0dBFED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0dBFEFAC9E043842EF, fd78; 
+fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0dBFEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, fd60, 0dBFE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0dBFE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0dBFD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0dBFD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd45, 0d3FDA9628D9C712B6, %22; +fma.rn.f64 fd94, fd48, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd95, fd46, 0d3FDA9628D9C712B6, %23; +fma.rn.f64 fd96, fd47, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd97, fd49, 0dBFE4F49E7F775887, fd93; +fma.rn.f64 fd98, fd52, 0dBFE82F19BB3A28A1, fd94; +fma.rn.f64 fd99, fd50, 0dBFE4F49E7F775887, fd95; +fma.rn.f64 fd100, fd51, 0dBFE82F19BB3A28A1, fd96; +fma.rn.f64 fd101, fd53, 0dBFEEB42A9BCD5057, fd97; +fma.rn.f64 fd102, fd56, 0d3FD207E7FD768DBF, fd98; +fma.rn.f64 fd103, fd54, 0dBFEEB42A9BCD5057, fd99; +fma.rn.f64 fd104, fd55, 0d3FD207E7FD768DBF, fd100; +fma.rn.f64 fd105, fd57, 0dBFC2375F640F44DB, fd101; +fma.rn.f64 fd106, fd60, 0d3FEFAC9E043842EF, fd102; +fma.rn.f64 fd107, fd58, 0dBFC2375F640F44DB, fd103; +fma.rn.f64 fd108, fd59, 0d3FEFAC9E043842EF, fd104; +fma.rn.f64 fd109, fd61, 0d3FEAEB8C8764F0BA, fd105; +fma.rn.f64 fd110, fd64, 0d3FE14CEDF8BB580B, fd106; +fma.rn.f64 fd111, fd62, 0d3FEAEB8C8764F0BA, fd107; +fma.rn.f64 fd112, fd63, 0d3FE14CEDF8BB580B, fd108; +fma.rn.f64 fd113, fd45, 0dBFC2375F640F44DB, %22; +fma.rn.f64 fd114, fd48, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd115, fd46, 0dBFC2375F640F44DB, %23; +fma.rn.f64 fd116, fd47, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd117, fd49, 0dBFEEB42A9BCD5057, fd113; +fma.rn.f64 fd118, fd52, 0d3FD207E7FD768DBF, fd114; +fma.rn.f64 fd119, fd50, 0dBFEEB42A9BCD5057, fd115; +fma.rn.f64 fd120, fd51, 0d3FD207E7FD768DBF, fd116; +fma.rn.f64 
fd121, fd53, 0d3FDA9628D9C712B6, fd117; +fma.rn.f64 fd122, fd56, 0d3FED1BB48EEE2C13, fd118; +fma.rn.f64 fd123, fd54, 0d3FDA9628D9C712B6, fd119; +fma.rn.f64 fd124, fd55, 0d3FED1BB48EEE2C13, fd120; +fma.rn.f64 fd125, fd57, 0d3FEAEB8C8764F0BA, fd121; +fma.rn.f64 fd126, fd60, 0dBFE14CEDF8BB580B, fd122; +fma.rn.f64 fd127, fd58, 0d3FEAEB8C8764F0BA, fd123; +fma.rn.f64 fd128, fd59, 0dBFE14CEDF8BB580B, fd124; +fma.rn.f64 fd129, fd61, 0dBFE4F49E7F775887, fd125; +fma.rn.f64 fd130, fd64, 0dBFE82F19BB3A28A1, fd126; +fma.rn.f64 fd131, fd62, 0dBFE4F49E7F775887, fd127; +fma.rn.f64 fd132, fd63, 0dBFE82F19BB3A28A1, fd128; +fma.rn.f64 fd133, fd45, 0dBFE4F49E7F775887, %22; +fma.rn.f64 fd134, fd48, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd135, fd46, 0dBFE4F49E7F775887, %23; +fma.rn.f64 fd136, fd47, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd137, fd49, 0dBFC2375F640F44DB, fd133; +fma.rn.f64 fd138, fd52, 0d3FEFAC9E043842EF, fd134; +fma.rn.f64 fd139, fd50, 0dBFC2375F640F44DB, fd135; +fma.rn.f64 fd140, fd51, 0d3FEFAC9E043842EF, fd136; +fma.rn.f64 fd141, fd53, 0d3FEAEB8C8764F0BA, fd137; +fma.rn.f64 fd142, fd56, 0dBFE14CEDF8BB580B, fd138; +fma.rn.f64 fd143, fd54, 0d3FEAEB8C8764F0BA, fd139; +fma.rn.f64 fd144, fd55, 0dBFE14CEDF8BB580B, fd140; +fma.rn.f64 fd145, fd57, 0dBFEEB42A9BCD5057, fd141; +fma.rn.f64 fd146, fd60, 0dBFD207E7FD768DBF, fd142; +fma.rn.f64 fd147, fd58, 0dBFEEB42A9BCD5057, fd143; +fma.rn.f64 fd148, fd59, 0dBFD207E7FD768DBF, fd144; +fma.rn.f64 fd149, fd61, 0d3FDA9628D9C712B6, fd145; +fma.rn.f64 fd150, fd64, 0d3FED1BB48EEE2C13, fd146; +fma.rn.f64 fd151, fd62, 0d3FDA9628D9C712B6, fd147; +fma.rn.f64 fd152, fd63, 0d3FED1BB48EEE2C13, fd148; +fma.rn.f64 fd153, fd45, 0dBFEEB42A9BCD5057, %22; +fma.rn.f64 fd154, fd48, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd155, fd46, 0dBFEEB42A9BCD5057, %23; +fma.rn.f64 fd156, fd47, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd157, fd49, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd52, 
0d3FE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd50, 0d3FEAEB8C8764F0BA, fd155; +fma.rn.f64 fd160, fd51, 0d3FE14CEDF8BB580B, fd156; +fma.rn.f64 fd161, fd53, 0dBFE4F49E7F775887, fd157; +fma.rn.f64 fd162, fd56, 0dBFE82F19BB3A28A1, fd158; +fma.rn.f64 fd163, fd54, 0dBFE4F49E7F775887, fd159; +fma.rn.f64 fd164, fd55, 0dBFE82F19BB3A28A1, fd160; +fma.rn.f64 fd165, fd57, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd60, 0d3FED1BB48EEE2C13, fd162; +fma.rn.f64 fd167, fd58, 0d3FDA9628D9C712B6, fd163; +fma.rn.f64 fd168, fd59, 0d3FED1BB48EEE2C13, fd164; +fma.rn.f64 fd169, fd61, 0dBFC2375F640F44DB, fd165; +fma.rn.f64 fd170, fd64, 0dBFEFAC9E043842EF, fd166; +fma.rn.f64 fd171, fd62, 0dBFC2375F640F44DB, fd167; +fma.rn.f64 fd172, fd63, 0dBFEFAC9E043842EF, fd168; +add.f64 %1, fd72, fd62; +add.f64 %0, fd71, fd61; +add.f64 %3, fd91, fd92; +sub.f64 %2, fd89, fd90; +add.f64 %5, fd111, fd112; +sub.f64 %4, fd109, fd110; +add.f64 %7, fd131, fd132; +sub.f64 %6, fd129, fd130; +add.f64 %9, fd151, fd152; +sub.f64 %8, fd149, fd150; +add.f64 %11, fd171, fd172; +sub.f64 %10, fd169, fd170; +sub.f64 %13, fd171, fd172; +add.f64 %12, fd169, fd170; +sub.f64 %15, fd151, fd152; +add.f64 %14, fd149, fd150; +sub.f64 %17, fd131, fd132; +add.f64 %16, fd129, fd130; +sub.f64 %19, fd111, fd112; +add.f64 %18, fd109, fd110; +sub.f64 %21, fd91, fd92; +add.f64 %20, fd89, fd90; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), 
"d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..6a630059948a2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_11_fp64_inv.hpp.inc @@ -0,0 +1,166 @@ +#ifndef CUFFTDX_FFT_11_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_11_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<724, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<195>; +.reg .b64 rd<2>; +add.f64 fd45, %24, %48; +add.f64 fd46, %26, %49; +sub.f64 fd47, %24, %48; +sub.f64 fd48, %26, %49; +add.f64 fd49, %27, %46; +add.f64 fd50, %29, %47; +sub.f64 fd51, %27, %46; +sub.f64 fd52, %29, %47; +add.f64 fd53, %30, %43; +add.f64 fd54, %31, %45; +sub.f64 fd55, %30, %43; +sub.f64 fd56, %31, %45; +add.f64 fd57, %32, %40; +add.f64 fd58, %34, %42; +sub.f64 fd59, %32, %40; +sub.f64 fd60, %34, %42; +add.f64 fd61, %35, %38; +add.f64 fd62, %37, %39; +sub.f64 fd63, %35, %38; +sub.f64 fd64, %37, %39; +add.f64 fd65, %22, fd45; +add.f64 fd66, %23, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %22; +fma.rn.f64 fd74, fd48, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %23; +fma.rn.f64 fd76, fd47, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 
0d3FED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0d3FED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0d3FEFAC9E043842EF, fd78; +fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0d3FEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, fd60, 0d3FE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0d3FE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0d3FD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0d3FD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd45, 0d3FDA9628D9C712B6, %22; +fma.rn.f64 fd94, fd48, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd95, fd46, 0d3FDA9628D9C712B6, %23; +fma.rn.f64 fd96, fd47, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd97, fd49, 0dBFE4F49E7F775887, fd93; +fma.rn.f64 fd98, fd52, 0d3FE82F19BB3A28A1, fd94; +fma.rn.f64 fd99, fd50, 0dBFE4F49E7F775887, fd95; +fma.rn.f64 fd100, fd51, 0d3FE82F19BB3A28A1, fd96; +fma.rn.f64 fd101, fd53, 0dBFEEB42A9BCD5057, fd97; +fma.rn.f64 fd102, fd56, 0dBFD207E7FD768DBF, fd98; +fma.rn.f64 fd103, fd54, 0dBFEEB42A9BCD5057, fd99; +fma.rn.f64 fd104, fd55, 0dBFD207E7FD768DBF, fd100; +fma.rn.f64 fd105, fd57, 0dBFC2375F640F44DB, fd101; +fma.rn.f64 fd106, fd60, 0dBFEFAC9E043842EF, fd102; +fma.rn.f64 fd107, fd58, 0dBFC2375F640F44DB, fd103; +fma.rn.f64 fd108, fd59, 0dBFEFAC9E043842EF, fd104; +fma.rn.f64 fd109, fd61, 0d3FEAEB8C8764F0BA, fd105; +fma.rn.f64 fd110, fd64, 0dBFE14CEDF8BB580B, fd106; +fma.rn.f64 fd111, fd62, 0d3FEAEB8C8764F0BA, fd107; +fma.rn.f64 fd112, fd63, 0dBFE14CEDF8BB580B, fd108; +fma.rn.f64 fd113, fd45, 0dBFC2375F640F44DB, %22; +fma.rn.f64 fd114, fd48, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd115, fd46, 0dBFC2375F640F44DB, %23; +fma.rn.f64 fd116, fd47, 0d3FEFAC9E043842EF, 
0d0000000000000000; +fma.rn.f64 fd117, fd49, 0dBFEEB42A9BCD5057, fd113; +fma.rn.f64 fd118, fd52, 0dBFD207E7FD768DBF, fd114; +fma.rn.f64 fd119, fd50, 0dBFEEB42A9BCD5057, fd115; +fma.rn.f64 fd120, fd51, 0dBFD207E7FD768DBF, fd116; +fma.rn.f64 fd121, fd53, 0d3FDA9628D9C712B6, fd117; +fma.rn.f64 fd122, fd56, 0dBFED1BB48EEE2C13, fd118; +fma.rn.f64 fd123, fd54, 0d3FDA9628D9C712B6, fd119; +fma.rn.f64 fd124, fd55, 0dBFED1BB48EEE2C13, fd120; +fma.rn.f64 fd125, fd57, 0d3FEAEB8C8764F0BA, fd121; +fma.rn.f64 fd126, fd60, 0d3FE14CEDF8BB580B, fd122; +fma.rn.f64 fd127, fd58, 0d3FEAEB8C8764F0BA, fd123; +fma.rn.f64 fd128, fd59, 0d3FE14CEDF8BB580B, fd124; +fma.rn.f64 fd129, fd61, 0dBFE4F49E7F775887, fd125; +fma.rn.f64 fd130, fd64, 0d3FE82F19BB3A28A1, fd126; +fma.rn.f64 fd131, fd62, 0dBFE4F49E7F775887, fd127; +fma.rn.f64 fd132, fd63, 0d3FE82F19BB3A28A1, fd128; +fma.rn.f64 fd133, fd45, 0dBFE4F49E7F775887, %22; +fma.rn.f64 fd134, fd48, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd135, fd46, 0dBFE4F49E7F775887, %23; +fma.rn.f64 fd136, fd47, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd137, fd49, 0dBFC2375F640F44DB, fd133; +fma.rn.f64 fd138, fd52, 0dBFEFAC9E043842EF, fd134; +fma.rn.f64 fd139, fd50, 0dBFC2375F640F44DB, fd135; +fma.rn.f64 fd140, fd51, 0dBFEFAC9E043842EF, fd136; +fma.rn.f64 fd141, fd53, 0d3FEAEB8C8764F0BA, fd137; +fma.rn.f64 fd142, fd56, 0d3FE14CEDF8BB580B, fd138; +fma.rn.f64 fd143, fd54, 0d3FEAEB8C8764F0BA, fd139; +fma.rn.f64 fd144, fd55, 0d3FE14CEDF8BB580B, fd140; +fma.rn.f64 fd145, fd57, 0dBFEEB42A9BCD5057, fd141; +fma.rn.f64 fd146, fd60, 0d3FD207E7FD768DBF, fd142; +fma.rn.f64 fd147, fd58, 0dBFEEB42A9BCD5057, fd143; +fma.rn.f64 fd148, fd59, 0d3FD207E7FD768DBF, fd144; +fma.rn.f64 fd149, fd61, 0d3FDA9628D9C712B6, fd145; +fma.rn.f64 fd150, fd64, 0dBFED1BB48EEE2C13, fd146; +fma.rn.f64 fd151, fd62, 0d3FDA9628D9C712B6, fd147; +fma.rn.f64 fd152, fd63, 0dBFED1BB48EEE2C13, fd148; +fma.rn.f64 fd153, fd45, 0dBFEEB42A9BCD5057, %22; +fma.rn.f64 fd154, fd48, 
0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd155, fd46, 0dBFEEB42A9BCD5057, %23; +fma.rn.f64 fd156, fd47, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd157, fd49, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd52, 0dBFE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd50, 0d3FEAEB8C8764F0BA, fd155; +fma.rn.f64 fd160, fd51, 0dBFE14CEDF8BB580B, fd156; +fma.rn.f64 fd161, fd53, 0dBFE4F49E7F775887, fd157; +fma.rn.f64 fd162, fd56, 0d3FE82F19BB3A28A1, fd158; +fma.rn.f64 fd163, fd54, 0dBFE4F49E7F775887, fd159; +fma.rn.f64 fd164, fd55, 0d3FE82F19BB3A28A1, fd160; +fma.rn.f64 fd165, fd57, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd60, 0dBFED1BB48EEE2C13, fd162; +fma.rn.f64 fd167, fd58, 0d3FDA9628D9C712B6, fd163; +fma.rn.f64 fd168, fd59, 0dBFED1BB48EEE2C13, fd164; +fma.rn.f64 fd169, fd61, 0dBFC2375F640F44DB, fd165; +fma.rn.f64 fd170, fd64, 0d3FEFAC9E043842EF, fd166; +fma.rn.f64 fd171, fd62, 0dBFC2375F640F44DB, fd167; +fma.rn.f64 fd172, fd63, 0d3FEFAC9E043842EF, fd168; +add.f64 %1, fd72, fd62; +add.f64 %0, fd71, fd61; +add.f64 %3, fd91, fd92; +sub.f64 %2, fd89, fd90; +add.f64 %5, fd111, fd112; +sub.f64 %4, fd109, fd110; +add.f64 %7, fd131, fd132; +sub.f64 %6, fd129, fd130; +add.f64 %9, fd151, fd152; +sub.f64 %8, fd149, fd150; +add.f64 %11, fd171, fd172; +sub.f64 %10, fd169, fd170; +sub.f64 %13, fd171, fd172; +add.f64 %12, fd169, fd170; +sub.f64 %15, fd151, fd152; +add.f64 %14, fd149, fd150; +sub.f64 %17, fd131, fd132; +add.f64 %16, fd129, fd130; +sub.f64 %19, fd111, fd112; +add.f64 %18, fd109, fd110; +sub.f64 %21, fd91, fd92; +add.f64 %20, fd89, fd90; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "d"(rmem[0].x), "d"(rmem[0].y), 
"d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..c4f91e24b29d3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_fwd.hpp.inc @@ -0,0 +1,6873 @@ +#ifndef CUFFTDX_FFT_121_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_121_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<925, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<468>; +.reg .b32 r<2104>; +.reg .b64 rd<4>; +mov.u32 r2094, %tid.y; +mov.u32 r2095, %22; +mad.lo.s32 r2096, r2094, 968, r2095; +mov.u32 r2097, %tid.x; +{ +add.f16x2 r1, %25, %43; +} +{ +add.f16x2 r4, %26, %44; +} +{ +sub.f16x2 r7, %25, %43; +} +{ +sub.f16x2 r10, %26, %44; +} +{ +add.f16x2 r13, %27, %41; +} +{ +add.f16x2 r16, %28, %42; +} +{ +sub.f16x2 r19, %27, %41; +} +{ +sub.f16x2 r22, %28, %42; +} +{ +add.f16x2 r25, %29, %39; +} +{ +add.f16x2 r28, %30, %40; +} +{ +sub.f16x2 r31, %29, %39; +} +{ +sub.f16x2 r34, %30, %40; +} +{ +add.f16x2 r37, %31, %37; +} +{ +add.f16x2 r40, %32, %38; +} +{ +sub.f16x2 r43, %31, %37; +} +{ +sub.f16x2 r46, %32, %38; +} +{ +add.f16x2 r49, %33, %35; +} +{ +add.f16x2 r52, %34, %36; +} +{ +sub.f16x2 r55, %33, %35; +} +{ +sub.f16x2 r58, %34, %36; +} +{ +add.f16x2 r61, 
%23, r1; +} +{ +add.f16x2 r64, %24, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.f32 f424, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r91, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r92, {low, high}; +} +mov.f32 f438, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r93, {low, high}; +} +{ +mul.f16x2 r94, r1, r93; +} +{ +add.f16x2 r97, %23, r94; +} +mov.f32 f404, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r100, {low, high}; +} +{ +mul.f16x2 r101, r10, r100; +} +{ +add.f16x2 r104, r91, r101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r107, {low, high}; +} +{ +mul.f16x2 r108, r4, r107; +} +{ +add.f16x2 r111, %24, r108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r7, r114; +} +{ +add.f16x2 r118, r92, r115; +} +mov.f32 f454, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r121, {low, high}; +} +{ +mul.f16x2 r122, r13, r121; +} +{ +add.f16x2 r125, r97, r122; +} +mov.f32 f300, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r128, {low, high}; +} +{ +mul.f16x2 r129, r22, r128; +} +{ +add.f16x2 r132, r104, r129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r16, r135; +} +{ +add.f16x2 r139, r111, r136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r142, 
{low, high}; +} +{ +mul.f16x2 r143, r19, r142; +} +{ +add.f16x2 r146, r118, r143; +} +mov.f32 f462, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r149, {low, high}; +} +{ +mul.f16x2 r150, r25, r149; +} +{ +add.f16x2 r153, r125, r150; +} +mov.f32 f464, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r156, {low, high}; +} +{ +mul.f16x2 r157, r34, r156; +} +{ +add.f16x2 r160, r132, r157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r163, {low, high}; +} +{ +mul.f16x2 r164, r28, r163; +} +{ +add.f16x2 r167, r139, r164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r170, {low, high}; +} +{ +mul.f16x2 r171, r31, r170; +} +{ +add.f16x2 r174, r146, r171; +} +mov.f32 f446, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r177, {low, high}; +} +{ +mul.f16x2 r178, r37, r177; +} +{ +add.f16x2 r181, r153, r178; +} +mov.f32 f448, 0fBF4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r184, {low, high}; +} +{ +mul.f16x2 r185, r46, r184; +} +{ +add.f16x2 r188, r160, r185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r191, {low, high}; +} +{ +mul.f16x2 r192, r40, r191; +} +{ +add.f16x2 r195, r167, r192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r198, {low, high}; +} +{ +mul.f16x2 r199, r43, r198; +} +{ +add.f16x2 r202, r174, r199; +} +mov.f32 f430, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r205, {low, high}; +} +{ +mul.f16x2 r206, r49, r205; +} +{ +add.f16x2 r209, r181, r206; +} +mov.f32 f432, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r212, {low, high}; +} +{ +mul.f16x2 r213, 
r58, r212; +} +{ +add.f16x2 r216, r188, r213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r219, {low, high}; +} +{ +mul.f16x2 r220, r52, r219; +} +{ +add.f16x2 r223, r195, r220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r226, {low, high}; +} +{ +mul.f16x2 r227, r55, r226; +} +{ +add.f16x2 r230, r202, r227; +} +{ +sub.f16x2 r233, r209, r216; +} +{ +add.f16x2 r236, r223, r230; +} +{ +add.f16x2 r239, r209, r216; +} +{ +sub.f16x2 r242, r223, r230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r247, {low, high}; +} +{ +mul.f16x2 r248, r1, r247; +} +{ +add.f16x2 r251, %23, r248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r254, {low, high}; +} +{ +mul.f16x2 r255, r10, r254; +} +{ +add.f16x2 r258, r245, r255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r4, r261; +} +{ +add.f16x2 r265, %24, r262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r268, {low, high}; +} +{ +mul.f16x2 r269, r7, r268; +} +{ +add.f16x2 r272, r246, r269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r275, {low, high}; +} +{ +mul.f16x2 r276, r13, r275; +} +{ +add.f16x2 r279, r251, r276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r22, r282; +} +{ +add.f16x2 r286, r258, r283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r16, r289; +} +{ 
+add.f16x2 r293, r265, r290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r296, {low, high}; +} +{ +mul.f16x2 r297, r19, r296; +} +{ +add.f16x2 r300, r272, r297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r303, {low, high}; +} +{ +mul.f16x2 r304, r25, r303; +} +{ +add.f16x2 r307, r279, r304; +} +mov.f32 f352, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r34, r310; +} +{ +add.f16x2 r314, r286, r311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r317, {low, high}; +} +{ +mul.f16x2 r318, r28, r317; +} +{ +add.f16x2 r321, r293, r318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r324, {low, high}; +} +{ +mul.f16x2 r325, r31, r324; +} +{ +add.f16x2 r328, r300, r325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r331, {low, high}; +} +{ +mul.f16x2 r332, r37, r331; +} +{ +add.f16x2 r335, r307, r332; +} +mov.f32 f396, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r338, {low, high}; +} +{ +mul.f16x2 r339, r46, r338; +} +{ +add.f16x2 r342, r314, r339; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r345, {low, high}; +} +{ +mul.f16x2 r346, r40, r345; +} +{ +add.f16x2 r349, r321, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r43, r352; +} +{ +add.f16x2 r356, r328, r353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r359, {low, high}; +} +{ +mul.f16x2 r360, r49, r359; +} +{ +add.f16x2 r363, r335, r360; +} +mov.f32 f440, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; 
+mov.b32 r366, {low, high}; +} +{ +mul.f16x2 r367, r58, r366; +} +{ +add.f16x2 r370, r342, r367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r373, {low, high}; +} +{ +mul.f16x2 r374, r52, r373; +} +{ +add.f16x2 r377, r349, r374; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r380, {low, high}; +} +{ +mul.f16x2 r381, r55, r380; +} +{ +add.f16x2 r384, r356, r381; +} +{ +sub.f16x2 r387, r363, r370; +} +{ +add.f16x2 r390, r377, r384; +} +{ +add.f16x2 r393, r363, r370; +} +{ +sub.f16x2 r396, r377, r384; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r401, {low, high}; +} +{ +mul.f16x2 r402, r1, r401; +} +{ +add.f16x2 r405, %23, r402; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r408, {low, high}; +} +{ +mul.f16x2 r409, r10, r408; +} +{ +add.f16x2 r412, r399, r409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r415, {low, high}; +} +{ +mul.f16x2 r416, r4, r415; +} +{ +add.f16x2 r419, %24, r416; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r7, r422; +} +{ +add.f16x2 r426, r400, r423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r429, {low, high}; +} +{ +mul.f16x2 r430, r13, r429; +} +{ +add.f16x2 r433, r405, r430; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r22, r436; +} +{ +add.f16x2 r440, r412, r437; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r443, {low, 
high}; +} +{ +mul.f16x2 r444, r16, r443; +} +{ +add.f16x2 r447, r419, r444; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r450, {low, high}; +} +{ +mul.f16x2 r451, r19, r450; +} +{ +add.f16x2 r454, r426, r451; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r457, {low, high}; +} +{ +mul.f16x2 r458, r25, r457; +} +{ +add.f16x2 r461, r433, r458; +} +mov.f32 f456, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r464, {low, high}; +} +{ +mul.f16x2 r465, r34, r464; +} +{ +add.f16x2 r468, r440, r465; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r471, {low, high}; +} +{ +mul.f16x2 r472, r28, r471; +} +{ +add.f16x2 r475, r447, r472; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r478, {low, high}; +} +{ +mul.f16x2 r479, r31, r478; +} +{ +add.f16x2 r482, r454, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r485, {low, high}; +} +{ +mul.f16x2 r486, r37, r485; +} +{ +add.f16x2 r489, r461, r486; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r492, {low, high}; +} +{ +mul.f16x2 r493, r46, r492; +} +{ +add.f16x2 r496, r468, r493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r499, {low, high}; +} +{ +mul.f16x2 r500, r40, r499; +} +{ +add.f16x2 r503, r475, r500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r506, {low, high}; +} +{ +mul.f16x2 r507, r43, r506; +} +{ +add.f16x2 r510, r482, r507; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r513, {low, high}; +} +{ +mul.f16x2 r514, r49, r513; +} +{ +add.f16x2 r517, r489, r514; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 
r520, {low, high}; +} +{ +mul.f16x2 r521, r58, r520; +} +{ +add.f16x2 r524, r496, r521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r527, {low, high}; +} +{ +mul.f16x2 r528, r52, r527; +} +{ +add.f16x2 r531, r503, r528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r534, {low, high}; +} +{ +mul.f16x2 r535, r55, r534; +} +{ +add.f16x2 r538, r510, r535; +} +{ +sub.f16x2 r541, r517, r524; +} +{ +add.f16x2 r544, r531, r538; +} +{ +add.f16x2 r547, r517, r524; +} +{ +sub.f16x2 r550, r531, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r554, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r555, {low, high}; +} +{ +mul.f16x2 r556, r1, r555; +} +{ +add.f16x2 r559, %23, r556; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r562, {low, high}; +} +{ +mul.f16x2 r563, r10, r562; +} +{ +add.f16x2 r566, r553, r563; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r569, {low, high}; +} +{ +mul.f16x2 r570, r4, r569; +} +{ +add.f16x2 r573, %24, r570; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r576, {low, high}; +} +{ +mul.f16x2 r577, r7, r576; +} +{ +add.f16x2 r580, r554, r577; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r583, {low, high}; +} +{ +mul.f16x2 r584, r13, r583; +} +{ +add.f16x2 r587, r559, r584; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r590, {low, high}; +} +{ +mul.f16x2 r591, r22, r590; +} +{ +add.f16x2 r594, r566, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r597, {low, high}; +} 
+{ +mul.f16x2 r598, r16, r597; +} +{ +add.f16x2 r601, r573, r598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r604, {low, high}; +} +{ +mul.f16x2 r605, r19, r604; +} +{ +add.f16x2 r608, r580, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r25, r611; +} +{ +add.f16x2 r615, r587, r612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r618, {low, high}; +} +{ +mul.f16x2 r619, r34, r618; +} +{ +add.f16x2 r622, r594, r619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r625, {low, high}; +} +{ +mul.f16x2 r626, r28, r625; +} +{ +add.f16x2 r629, r601, r626; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r632, {low, high}; +} +{ +mul.f16x2 r633, r31, r632; +} +{ +add.f16x2 r636, r608, r633; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r639, {low, high}; +} +{ +mul.f16x2 r640, r37, r639; +} +{ +add.f16x2 r643, r615, r640; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r646, {low, high}; +} +{ +mul.f16x2 r647, r46, r646; +} +{ +add.f16x2 r650, r622, r647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r653, {low, high}; +} +{ +mul.f16x2 r654, r40, r653; +} +{ +add.f16x2 r657, r629, r654; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r660, {low, high}; +} +{ +mul.f16x2 r661, r43, r660; +} +{ +add.f16x2 r664, r636, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r667, {low, high}; +} +{ +mul.f16x2 r668, r49, r667; +} +{ +add.f16x2 r671, r643, r668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r674, {low, high}; +} +{ +mul.f16x2 r675, 
r58, r674; +} +{ +add.f16x2 r678, r650, r675; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r52, r681; +} +{ +add.f16x2 r685, r657, r682; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r55, r688; +} +{ +add.f16x2 r692, r664, r689; +} +{ +sub.f16x2 r695, r671, r678; +} +{ +add.f16x2 r698, r685, r692; +} +{ +add.f16x2 r701, r671, r678; +} +{ +sub.f16x2 r704, r685, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r707, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r709, {low, high}; +} +{ +mul.f16x2 r710, r1, r709; +} +{ +add.f16x2 r713, %23, r710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r10, r716; +} +{ +add.f16x2 r720, r707, r717; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r723, {low, high}; +} +{ +mul.f16x2 r724, r4, r723; +} +{ +add.f16x2 r727, %24, r724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r730, {low, high}; +} +{ +mul.f16x2 r731, r7, r730; +} +{ +add.f16x2 r734, r708, r731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r737, {low, high}; +} +{ +mul.f16x2 r738, r13, r737; +} +{ +add.f16x2 r741, r713, r738; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r22, r744; +} +{ +add.f16x2 r748, r720, r745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r751, {low, high}; +} +{ +mul.f16x2 r752, r16, r751; +} +{ 
+add.f16x2 r755, r727, r752; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r758, {low, high}; +} +{ +mul.f16x2 r759, r19, r758; +} +{ +add.f16x2 r762, r734, r759; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r25, r765; +} +{ +add.f16x2 r769, r741, r766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r34, r772; +} +{ +add.f16x2 r776, r748, r773; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r779, {low, high}; +} +{ +mul.f16x2 r780, r28, r779; +} +{ +add.f16x2 r783, r755, r780; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r786, {low, high}; +} +{ +mul.f16x2 r787, r31, r786; +} +{ +add.f16x2 r790, r762, r787; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r37, r793; +} +{ +add.f16x2 r797, r769, r794; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r46, r800; +} +{ +add.f16x2 r804, r776, r801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r40, r807; +} +{ +add.f16x2 r811, r783, r808; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r43, r814; +} +{ +add.f16x2 r818, r790, r815; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r49, r821; +} +{ +add.f16x2 r825, r797, r822; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r828, {low, high}; +} +{ +mul.f16x2 r829, r58, r828; +} +{ +add.f16x2 r832, 
r804, r829; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r52, r835; +} +{ +add.f16x2 r839, r811, r836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r842, {low, high}; +} +{ +mul.f16x2 r843, r55, r842; +} +{ +add.f16x2 r846, r818, r843; +} +{ +sub.f16x2 r849, r825, r832; +} +{ +add.f16x2 r852, r839, r846; +} +{ +add.f16x2 r855, r825, r832; +} +{ +sub.f16x2 r858, r839, r846; +} +mul.wide.u32 rd2, r2097, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r2098, rd3; +mul.lo.s32 r2099, r2098, 11; +sub.s32 r2100, r2097, r2099; +cvt.rn.f32.u32 f465, r2100; +mul.f32 f466, f465, 0f3D54B191; +cos.approx.f32 f221, f466; +sin.approx.f32 f467, f466; +neg.f32 f222, f467; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f222; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r236, r866; +} +{ +neg.f16x2 r871, r868; +} +{ +fma.rn.f16x2 r873, r233, r864, r871; +} +{ +mul.f16x2 r877, r233, r866; +} +{ +fma.rn.f16x2 r880, r236, r864, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r886, {high, high}; +} +mov.f32 f241, 0fBF800000; +mov.f32 f242, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r861, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r390, r903; 
+} +{ +neg.f16x2 r908, r905; +} +{ +fma.rn.f16x2 r910, r387, r901, r908; +} +{ +mul.f16x2 r914, r387, r903; +} +{ +fma.rn.f16x2 r917, r390, r901, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r544, r940; +} +{ +neg.f16x2 r945, r942; +} +{ +fma.rn.f16x2 r947, r541, r938, r945; +} +{ +mul.f16x2 r951, r541, r940; +} +{ +fma.rn.f16x2 r954, r544, r938, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r698, r977; +} +{ +neg.f16x2 r982, r979; +} +{ +fma.rn.f16x2 r984, r695, r975, r982; +} +{ +mul.f16x2 r988, r695, r977; +} +{ +fma.rn.f16x2 r991, r698, r975, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r997, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r852, r1014; +} +{ +neg.f16x2 r1019, r1016; +} +{ +fma.rn.f16x2 r1021, r849, r1012, r1019; +} +{ +mul.f16x2 r1025, r849, r1014; +} +{ +fma.rn.f16x2 r1028, r852, r1012, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r858, r1051; +} +{ +neg.f16x2 r1056, r1053; +} +{ +fma.rn.f16x2 r1058, r855, r1049, r1056; +} +{ +mul.f16x2 r1062, r855, r1051; +} +{ +fma.rn.f16x2 r1065, r858, r1049, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ 
+fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r704, r1088; +} +{ +neg.f16x2 r1093, r1090; +} +{ +fma.rn.f16x2 r1095, r701, r1086, r1093; +} +{ +mul.f16x2 r1099, r701, r1088; +} +{ +fma.rn.f16x2 r1102, r704, r1086, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1110, {low, high}; +} +{ +mul.f16x2 r1111, r1108, r1110; +} +{ +mul.f16x2 r1114, r1082, r1106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1117, {high, low}; +} +{ +fma.rn.f16x2 r1119, r1111, r1117, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1125, {high, high}; +} +{ +mul.f16x2 r1127, r550, r1125; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r547, r1123, r1130; +} +{ +mul.f16x2 r1136, r547, r1125; +} +{ +fma.rn.f16x2 r1139, r550, r1123, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1147, {low, high}; +} +{ +mul.f16x2 r1148, r1145, r1147; +} +{ +mul.f16x2 r1151, r1119, r1143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1154, {high, low}; +} +{ +fma.rn.f16x2 r1156, r1148, r1154, r1151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1162, {high, high}; +} +{ +mul.f16x2 r1164, r396, r1162; +} +{ +neg.f16x2 
r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r393, r1160, r1167; +} +{ +mul.f16x2 r1173, r393, r1162; +} +{ +fma.rn.f16x2 r1176, r396, r1160, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1184, {low, high}; +} +{ +mul.f16x2 r1185, r1182, r1184; +} +{ +mul.f16x2 r1188, r1156, r1180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1191, {high, low}; +} +{ +fma.rn.f16x2 r1193, r1185, r1191, r1188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1199, {high, high}; +} +{ +mul.f16x2 r1201, r242, r1199; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r239, r1197, r1204; +} +{ +mul.f16x2 r1210, r239, r1199; +} +{ +fma.rn.f16x2 r1213, r242, r1197, r1210; +} +mad.lo.s32 r2101, r2098, 968, r2096; +barrier.sync 0; +mad.lo.s32 r2102, r2100, 88, r2101; +st.shared.v2.f32 [r2102], {r85, r88}; +st.shared.v2.f32 [r2102+8], {r873, r880}; +st.shared.v2.f32 [r2102+16], {r910, r917}; +st.shared.v2.f32 [r2102+24], {r947, r954}; +st.shared.v2.f32 [r2102+32], {r984, r991}; +st.shared.v2.f32 [r2102+40], {r1021, r1028}; +st.shared.v2.f32 [r2102+48], {r1058, r1065}; +st.shared.v2.f32 [r2102+56], {r1095, r1102}; +st.shared.v2.f32 [r2102+64], {r1132, r1139}; +st.shared.v2.f32 [r2102+72], {r1169, r1176}; +st.shared.v2.f32 [r2102+80], {r1206, r1213}; +barrier.sync 0; +mad.lo.s32 r2103, r2100, -80, r2102; +ld.shared.u32 r1295, [r2103]; +ld.shared.u32 r1298, [r2103+4]; +ld.shared.u32 r1235, [r2103+88]; +ld.shared.u32 r1238, [r2103+92]; +ld.shared.u32 r1247, [r2103+176]; +ld.shared.u32 r1250, [r2103+180]; +ld.shared.u32 r1259, [r2103+264]; +ld.shared.u32 r1262, [r2103+268]; +ld.shared.u32 r1271, [r2103+352]; +ld.shared.u32 r1274, [r2103+356]; 
+ld.shared.u32 r1283, [r2103+440]; +ld.shared.u32 r1286, [r2103+444]; +ld.shared.u32 r1284, [r2103+528]; +ld.shared.u32 r1287, [r2103+532]; +ld.shared.u32 r1272, [r2103+616]; +ld.shared.u32 r1275, [r2103+620]; +ld.shared.u32 r1260, [r2103+704]; +ld.shared.u32 r1263, [r2103+708]; +ld.shared.u32 r1248, [r2103+792]; +ld.shared.u32 r1251, [r2103+796]; +ld.shared.u32 r1236, [r2103+880]; +ld.shared.u32 r1239, [r2103+884]; +{ +add.f16x2 r1234, r1235, r1236; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +sub.f16x2 r1240, r1235, r1236; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +add.f16x2 r1246, r1247, r1248; +} +{ +add.f16x2 r1249, r1250, r1251; +} +{ +sub.f16x2 r1252, r1247, r1248; +} +{ +sub.f16x2 r1255, r1250, r1251; +} +{ +add.f16x2 r1258, r1259, r1260; +} +{ +add.f16x2 r1261, r1262, r1263; +} +{ +sub.f16x2 r1264, r1259, r1260; +} +{ +sub.f16x2 r1267, r1262, r1263; +} +{ +add.f16x2 r1270, r1271, r1272; +} +{ +add.f16x2 r1273, r1274, r1275; +} +{ +sub.f16x2 r1276, r1271, r1272; +} +{ +sub.f16x2 r1279, r1274, r1275; +} +{ +add.f16x2 r1282, r1283, r1284; +} +{ +add.f16x2 r1285, r1286, r1287; +} +{ +sub.f16x2 r1288, r1283, r1284; +} +{ +sub.f16x2 r1291, r1286, r1287; +} +{ +add.f16x2 r1294, r1295, r1234; +} +{ +add.f16x2 r1297, r1298, r1237; +} +{ +add.f16x2 r1300, r1294, r1246; +} +{ +add.f16x2 r1303, r1297, r1249; +} +{ +add.f16x2 r1306, r1300, r1258; +} +{ +add.f16x2 r1309, r1303, r1261; +} +{ +add.f16x2 r1312, r1306, r1270; +} +{ +add.f16x2 r1315, r1309, r1273; +} +{ +add.f16x2 %0, r1312, r1282; +} +{ +add.f16x2 %1, r1315, r1285; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1326, {low, high}; +} +{ +mul.f16x2 r1327, r1234, r1326; +} +{ +add.f16x2 r1330, r1295, r1327; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1243, r1333; +} +{ +add.f16x2 r1337, r1324, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1340, {low, high}; +} +{ +mul.f16x2 r1341, r1237, r1340; +} +{ +add.f16x2 r1344, r1298, r1341; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1347, {low, high}; +} +{ +mul.f16x2 r1348, r1240, r1347; +} +{ +add.f16x2 r1351, r1325, r1348; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1354, {low, high}; +} +{ +mul.f16x2 r1355, r1246, r1354; +} +{ +add.f16x2 r1358, r1330, r1355; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1361, {low, high}; +} +{ +mul.f16x2 r1362, r1255, r1361; +} +{ +add.f16x2 r1365, r1337, r1362; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1368, {low, high}; +} +{ +mul.f16x2 r1369, r1249, r1368; +} +{ +add.f16x2 r1372, r1344, r1369; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1375, {low, high}; +} +{ +mul.f16x2 r1376, r1252, r1375; +} +{ +add.f16x2 r1379, r1351, r1376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1382, {low, high}; +} +{ +mul.f16x2 r1383, r1258, r1382; +} +{ +add.f16x2 r1386, r1358, r1383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1389, {low, high}; +} +{ +mul.f16x2 r1390, r1267, r1389; +} +{ +add.f16x2 r1393, r1365, r1390; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1396, {low, high}; +} +{ +mul.f16x2 r1397, r1261, r1396; +} +{ +add.f16x2 r1400, r1372, r1397; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1403, {low, high}; +} +{ +mul.f16x2 r1404, r1264, 
r1403; +} +{ +add.f16x2 r1407, r1379, r1404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1410, {low, high}; +} +{ +mul.f16x2 r1411, r1270, r1410; +} +{ +add.f16x2 r1414, r1386, r1411; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1417, {low, high}; +} +{ +mul.f16x2 r1418, r1279, r1417; +} +{ +add.f16x2 r1421, r1393, r1418; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1424, {low, high}; +} +{ +mul.f16x2 r1425, r1273, r1424; +} +{ +add.f16x2 r1428, r1400, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1431, {low, high}; +} +{ +mul.f16x2 r1432, r1276, r1431; +} +{ +add.f16x2 r1435, r1407, r1432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1438, {low, high}; +} +{ +mul.f16x2 r1439, r1282, r1438; +} +{ +add.f16x2 r1442, r1414, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1445, {low, high}; +} +{ +mul.f16x2 r1446, r1291, r1445; +} +{ +add.f16x2 r1449, r1421, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1452, {low, high}; +} +{ +mul.f16x2 r1453, r1285, r1452; +} +{ +add.f16x2 r1456, r1428, r1453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1459, {low, high}; +} +{ +mul.f16x2 r1460, r1288, r1459; +} +{ +add.f16x2 r1463, r1435, r1460; +} +{ +sub.f16x2 %2, r1442, r1449; +} +{ +add.f16x2 %3, r1456, r1463; +} +{ +add.f16x2 %20, r1442, r1449; +} +{ +sub.f16x2 %21, r1456, r1463; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; 
+mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1481, r1234, r1480; +} +{ +add.f16x2 r1484, r1295, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1487, {low, high}; +} +{ +mul.f16x2 r1488, r1243, r1487; +} +{ +add.f16x2 r1491, r1478, r1488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1494, {low, high}; +} +{ +mul.f16x2 r1495, r1237, r1494; +} +{ +add.f16x2 r1498, r1298, r1495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1501, {low, high}; +} +{ +mul.f16x2 r1502, r1240, r1501; +} +{ +add.f16x2 r1505, r1479, r1502; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1508, {low, high}; +} +{ +mul.f16x2 r1509, r1246, r1508; +} +{ +add.f16x2 r1512, r1484, r1509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1515, {low, high}; +} +{ +mul.f16x2 r1516, r1255, r1515; +} +{ +add.f16x2 r1519, r1491, r1516; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1522, {low, high}; +} +{ +mul.f16x2 r1523, r1249, r1522; +} +{ +add.f16x2 r1526, r1498, r1523; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1529, {low, high}; +} +{ +mul.f16x2 r1530, r1252, r1529; +} +{ +add.f16x2 r1533, r1505, r1530; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1536, {low, high}; +} +{ +mul.f16x2 r1537, r1258, r1536; +} +{ +add.f16x2 r1540, r1512, r1537; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1267, r1543; +} +{ +add.f16x2 r1547, r1519, r1544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1550, {low, high}; +} +{ +mul.f16x2 r1551, r1261, r1550; +} +{ +add.f16x2 r1554, r1526, r1551; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1557, {low, high}; +} +{ +mul.f16x2 r1558, r1264, r1557; +} +{ +add.f16x2 r1561, r1533, r1558; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1564, {low, high}; +} +{ +mul.f16x2 r1565, r1270, r1564; +} +{ +add.f16x2 r1568, r1540, r1565; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1571, {low, high}; +} +{ +mul.f16x2 r1572, r1279, r1571; +} +{ +add.f16x2 r1575, r1547, r1572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1578, {low, high}; +} +{ +mul.f16x2 r1579, r1273, r1578; +} +{ +add.f16x2 r1582, r1554, r1579; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1585, {low, high}; +} +{ +mul.f16x2 r1586, r1276, r1585; +} +{ +add.f16x2 r1589, r1561, r1586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1592, {low, high}; +} +{ +mul.f16x2 r1593, r1282, r1592; +} +{ +add.f16x2 r1596, r1568, r1593; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1599, {low, high}; +} +{ +mul.f16x2 r1600, r1291, r1599; +} +{ +add.f16x2 r1603, r1575, r1600; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1606, {low, high}; +} +{ +mul.f16x2 r1607, r1285, r1606; +} +{ +add.f16x2 r1610, r1582, r1607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1613, {low, high}; +} +{ +mul.f16x2 r1614, r1288, r1613; +} +{ +add.f16x2 r1617, r1589, r1614; +} +{ +sub.f16x2 %4, r1596, r1603; +} +{ +add.f16x2 %5, r1610, r1617; +} +{ +add.f16x2 %18, r1596, r1603; +} +{ +sub.f16x2 %19, r1610, r1617; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; 
+cvt.rn.f16.f32 high, f424; +mov.b32 r1633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1634, {low, high}; +} +{ +mul.f16x2 r1635, r1234, r1634; +} +{ +add.f16x2 r1638, r1295, r1635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1641, {low, high}; +} +{ +mul.f16x2 r1642, r1243, r1641; +} +{ +add.f16x2 r1645, r1632, r1642; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1648, {low, high}; +} +{ +mul.f16x2 r1649, r1237, r1648; +} +{ +add.f16x2 r1652, r1298, r1649; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1655, {low, high}; +} +{ +mul.f16x2 r1656, r1240, r1655; +} +{ +add.f16x2 r1659, r1633, r1656; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1662, {low, high}; +} +{ +mul.f16x2 r1663, r1246, r1662; +} +{ +add.f16x2 r1666, r1638, r1663; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1669, {low, high}; +} +{ +mul.f16x2 r1670, r1255, r1669; +} +{ +add.f16x2 r1673, r1645, r1670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1676, {low, high}; +} +{ +mul.f16x2 r1677, r1249, r1676; +} +{ +add.f16x2 r1680, r1652, r1677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1683, {low, high}; +} +{ +mul.f16x2 r1684, r1252, r1683; +} +{ +add.f16x2 r1687, r1659, r1684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1258, r1690; +} +{ +add.f16x2 r1694, r1666, r1691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1697, {low, high}; +} +{ +mul.f16x2 r1698, r1267, r1697; +} +{ +add.f16x2 r1701, r1673, r1698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; 
+cvt.rn.f16.f32 high, f454; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1261, r1704; +} +{ +add.f16x2 r1708, r1680, r1705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1711, {low, high}; +} +{ +mul.f16x2 r1712, r1264, r1711; +} +{ +add.f16x2 r1715, r1687, r1712; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1718, {low, high}; +} +{ +mul.f16x2 r1719, r1270, r1718; +} +{ +add.f16x2 r1722, r1694, r1719; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1725, {low, high}; +} +{ +mul.f16x2 r1726, r1279, r1725; +} +{ +add.f16x2 r1729, r1701, r1726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1732, {low, high}; +} +{ +mul.f16x2 r1733, r1273, r1732; +} +{ +add.f16x2 r1736, r1708, r1733; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1739, {low, high}; +} +{ +mul.f16x2 r1740, r1276, r1739; +} +{ +add.f16x2 r1743, r1715, r1740; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1282, r1746; +} +{ +add.f16x2 r1750, r1722, r1747; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1753, {low, high}; +} +{ +mul.f16x2 r1754, r1291, r1753; +} +{ +add.f16x2 r1757, r1729, r1754; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1285, r1760; +} +{ +add.f16x2 r1764, r1736, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1767, {low, high}; +} +{ +mul.f16x2 r1768, r1288, r1767; +} +{ +add.f16x2 r1771, r1743, r1768; +} +{ +sub.f16x2 %6, r1750, r1757; +} +{ +add.f16x2 %7, r1764, r1771; +} +{ +add.f16x2 %16, r1750, r1757; +} +{ +sub.f16x2 %17, r1764, r1771; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1786, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1787, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1788, {low, high}; +} +{ +mul.f16x2 r1789, r1234, r1788; +} +{ +add.f16x2 r1792, r1295, r1789; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1243, r1795; +} +{ +add.f16x2 r1799, r1786, r1796; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1802, {low, high}; +} +{ +mul.f16x2 r1803, r1237, r1802; +} +{ +add.f16x2 r1806, r1298, r1803; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1809, {low, high}; +} +{ +mul.f16x2 r1810, r1240, r1809; +} +{ +add.f16x2 r1813, r1787, r1810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1816, {low, high}; +} +{ +mul.f16x2 r1817, r1246, r1816; +} +{ +add.f16x2 r1820, r1792, r1817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1823, {low, high}; +} +{ +mul.f16x2 r1824, r1255, r1823; +} +{ +add.f16x2 r1827, r1799, r1824; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1830, {low, high}; +} +{ +mul.f16x2 r1831, r1249, r1830; +} +{ +add.f16x2 r1834, r1806, r1831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1837, {low, high}; +} +{ +mul.f16x2 r1838, r1252, r1837; +} +{ +add.f16x2 r1841, r1813, r1838; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1844, {low, high}; +} +{ +mul.f16x2 r1845, r1258, r1844; +} +{ +add.f16x2 r1848, r1820, r1845; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1851, {low, high}; 
+} +{ +mul.f16x2 r1852, r1267, r1851; +} +{ +add.f16x2 r1855, r1827, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1858, {low, high}; +} +{ +mul.f16x2 r1859, r1261, r1858; +} +{ +add.f16x2 r1862, r1834, r1859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1865, {low, high}; +} +{ +mul.f16x2 r1866, r1264, r1865; +} +{ +add.f16x2 r1869, r1841, r1866; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1872, {low, high}; +} +{ +mul.f16x2 r1873, r1270, r1872; +} +{ +add.f16x2 r1876, r1848, r1873; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1879, {low, high}; +} +{ +mul.f16x2 r1880, r1279, r1879; +} +{ +add.f16x2 r1883, r1855, r1880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1886, {low, high}; +} +{ +mul.f16x2 r1887, r1273, r1886; +} +{ +add.f16x2 r1890, r1862, r1887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1893, {low, high}; +} +{ +mul.f16x2 r1894, r1276, r1893; +} +{ +add.f16x2 r1897, r1869, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1900, {low, high}; +} +{ +mul.f16x2 r1901, r1282, r1900; +} +{ +add.f16x2 r1904, r1876, r1901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1907, {low, high}; +} +{ +mul.f16x2 r1908, r1291, r1907; +} +{ +add.f16x2 r1911, r1883, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1914, {low, high}; +} +{ +mul.f16x2 r1915, r1285, r1914; +} +{ +add.f16x2 r1918, r1890, r1915; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1921, {low, high}; +} +{ +mul.f16x2 r1922, r1288, r1921; +} +{ +add.f16x2 r1925, r1897, r1922; +} +{ +sub.f16x2 %8, r1904, r1911; +} +{ 
+add.f16x2 %9, r1918, r1925; +} +{ +add.f16x2 %14, r1904, r1911; +} +{ +sub.f16x2 %15, r1918, r1925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1942, {low, high}; +} +{ +mul.f16x2 r1943, r1234, r1942; +} +{ +add.f16x2 r1946, r1295, r1943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1949, {low, high}; +} +{ +mul.f16x2 r1950, r1243, r1949; +} +{ +add.f16x2 r1953, r1940, r1950; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1956, {low, high}; +} +{ +mul.f16x2 r1957, r1237, r1956; +} +{ +add.f16x2 r1960, r1298, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1963, {low, high}; +} +{ +mul.f16x2 r1964, r1240, r1963; +} +{ +add.f16x2 r1967, r1941, r1964; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1970, {low, high}; +} +{ +mul.f16x2 r1971, r1246, r1970; +} +{ +add.f16x2 r1974, r1946, r1971; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1977, {low, high}; +} +{ +mul.f16x2 r1978, r1255, r1977; +} +{ +add.f16x2 r1981, r1953, r1978; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1984, {low, high}; +} +{ +mul.f16x2 r1985, r1249, r1984; +} +{ +add.f16x2 r1988, r1960, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1991, {low, high}; +} +{ +mul.f16x2 r1992, r1252, r1991; +} +{ +add.f16x2 r1995, r1967, r1992; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1998, {low, high}; +} +{ +mul.f16x2 r1999, r1258, r1998; +} +{ +add.f16x2 
r2002, r1974, r1999; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2005, {low, high}; +} +{ +mul.f16x2 r2006, r1267, r2005; +} +{ +add.f16x2 r2009, r1981, r2006; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r2012, {low, high}; +} +{ +mul.f16x2 r2013, r1261, r2012; +} +{ +add.f16x2 r2016, r1988, r2013; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2019, {low, high}; +} +{ +mul.f16x2 r2020, r1264, r2019; +} +{ +add.f16x2 r2023, r1995, r2020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2026, {low, high}; +} +{ +mul.f16x2 r2027, r1270, r2026; +} +{ +add.f16x2 r2030, r2002, r2027; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r1279, r2033; +} +{ +add.f16x2 r2037, r2009, r2034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r1273, r2040; +} +{ +add.f16x2 r2044, r2016, r2041; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r1276, r2047; +} +{ +add.f16x2 r2051, r2023, r2048; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r1282, r2054; +} +{ +add.f16x2 r2058, r2030, r2055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2061, {low, high}; +} +{ +mul.f16x2 r2062, r1291, r2061; +} +{ +add.f16x2 r2065, r2037, r2062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2068, {low, high}; +} +{ +mul.f16x2 r2069, r1285, r2068; +} +{ +add.f16x2 r2072, r2044, r2069; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2075, 
{low, high}; +} +{ +mul.f16x2 r2076, r1288, r2075; +} +{ +add.f16x2 r2079, r2051, r2076; +} +{ +sub.f16x2 %10, r2058, r2065; +} +{ +add.f16x2 %11, r2072, r2079; +} +{ +add.f16x2 %12, r2058, r2065; +} +{ +sub.f16x2 %13, r2072, r2079; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<926, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<468>; +.reg .b32 r<2104>; +.reg .b64 rd<4>; +mov.u32 r2094, %tid.y; +mov.u32 r2095, %22; +mad.lo.s32 r2096, r2094, 484, r2095; +mov.u32 r2097, %tid.x; +{ 
+add.f16x2 r1, %25, %43; +} +{ +add.f16x2 r4, %26, %44; +} +{ +sub.f16x2 r7, %25, %43; +} +{ +sub.f16x2 r10, %26, %44; +} +{ +add.f16x2 r13, %27, %41; +} +{ +add.f16x2 r16, %28, %42; +} +{ +sub.f16x2 r19, %27, %41; +} +{ +sub.f16x2 r22, %28, %42; +} +{ +add.f16x2 r25, %29, %39; +} +{ +add.f16x2 r28, %30, %40; +} +{ +sub.f16x2 r31, %29, %39; +} +{ +sub.f16x2 r34, %30, %40; +} +{ +add.f16x2 r37, %31, %37; +} +{ +add.f16x2 r40, %32, %38; +} +{ +sub.f16x2 r43, %31, %37; +} +{ +sub.f16x2 r46, %32, %38; +} +{ +add.f16x2 r49, %33, %35; +} +{ +add.f16x2 r52, %34, %36; +} +{ +sub.f16x2 r55, %33, %35; +} +{ +sub.f16x2 r58, %34, %36; +} +{ +add.f16x2 r61, %23, r1; +} +{ +add.f16x2 r64, %24, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.f32 f424, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r91, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r92, {low, high}; +} +mov.f32 f438, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r93, {low, high}; +} +{ +mul.f16x2 r94, r1, r93; +} +{ +add.f16x2 r97, %23, r94; +} +mov.f32 f404, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r100, {low, high}; +} +{ +mul.f16x2 r101, r10, r100; +} +{ +add.f16x2 r104, r91, r101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r107, {low, high}; +} +{ +mul.f16x2 r108, r4, r107; +} +{ +add.f16x2 r111, %24, r108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r7, r114; +} +{ +add.f16x2 r118, r92, r115; +} +mov.f32 f454, 0f3ED4B147; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r121, {low, high}; +} +{ +mul.f16x2 r122, r13, r121; +} +{ +add.f16x2 r125, r97, r122; +} +mov.f32 f300, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r128, {low, high}; +} +{ +mul.f16x2 r129, r22, r128; +} +{ +add.f16x2 r132, r104, r129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r16, r135; +} +{ +add.f16x2 r139, r111, r136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r142, {low, high}; +} +{ +mul.f16x2 r143, r19, r142; +} +{ +add.f16x2 r146, r118, r143; +} +mov.f32 f462, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r149, {low, high}; +} +{ +mul.f16x2 r150, r25, r149; +} +{ +add.f16x2 r153, r125, r150; +} +mov.f32 f464, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r156, {low, high}; +} +{ +mul.f16x2 r157, r34, r156; +} +{ +add.f16x2 r160, r132, r157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r163, {low, high}; +} +{ +mul.f16x2 r164, r28, r163; +} +{ +add.f16x2 r167, r139, r164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r170, {low, high}; +} +{ +mul.f16x2 r171, r31, r170; +} +{ +add.f16x2 r174, r146, r171; +} +mov.f32 f446, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r177, {low, high}; +} +{ +mul.f16x2 r178, r37, r177; +} +{ +add.f16x2 r181, r153, r178; +} +mov.f32 f448, 0fBF4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r184, {low, high}; +} +{ +mul.f16x2 r185, r46, r184; +} +{ +add.f16x2 r188, r160, r185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, 
f446; +mov.b32 r191, {low, high}; +} +{ +mul.f16x2 r192, r40, r191; +} +{ +add.f16x2 r195, r167, r192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r198, {low, high}; +} +{ +mul.f16x2 r199, r43, r198; +} +{ +add.f16x2 r202, r174, r199; +} +mov.f32 f430, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r205, {low, high}; +} +{ +mul.f16x2 r206, r49, r205; +} +{ +add.f16x2 r209, r181, r206; +} +mov.f32 f432, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r212, {low, high}; +} +{ +mul.f16x2 r213, r58, r212; +} +{ +add.f16x2 r216, r188, r213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r219, {low, high}; +} +{ +mul.f16x2 r220, r52, r219; +} +{ +add.f16x2 r223, r195, r220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r226, {low, high}; +} +{ +mul.f16x2 r227, r55, r226; +} +{ +add.f16x2 r230, r202, r227; +} +{ +sub.f16x2 r233, r209, r216; +} +{ +add.f16x2 r236, r223, r230; +} +{ +add.f16x2 r239, r209, r216; +} +{ +sub.f16x2 r242, r223, r230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r247, {low, high}; +} +{ +mul.f16x2 r248, r1, r247; +} +{ +add.f16x2 r251, %23, r248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r254, {low, high}; +} +{ +mul.f16x2 r255, r10, r254; +} +{ +add.f16x2 r258, r245, r255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r4, r261; +} +{ +add.f16x2 r265, %24, r262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r268, {low, high}; +} +{ +mul.f16x2 r269, r7, r268; +} +{ +add.f16x2 r272, r246, r269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r275, {low, high}; +} +{ +mul.f16x2 r276, r13, r275; +} +{ +add.f16x2 r279, r251, r276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r22, r282; +} +{ +add.f16x2 r286, r258, r283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r16, r289; +} +{ +add.f16x2 r293, r265, r290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r296, {low, high}; +} +{ +mul.f16x2 r297, r19, r296; +} +{ +add.f16x2 r300, r272, r297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r303, {low, high}; +} +{ +mul.f16x2 r304, r25, r303; +} +{ +add.f16x2 r307, r279, r304; +} +mov.f32 f352, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r34, r310; +} +{ +add.f16x2 r314, r286, r311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r317, {low, high}; +} +{ +mul.f16x2 r318, r28, r317; +} +{ +add.f16x2 r321, r293, r318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r324, {low, high}; +} +{ +mul.f16x2 r325, r31, r324; +} +{ +add.f16x2 r328, r300, r325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r331, {low, high}; +} +{ +mul.f16x2 r332, r37, r331; +} +{ +add.f16x2 r335, r307, r332; +} +mov.f32 f396, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r338, {low, high}; +} +{ +mul.f16x2 r339, r46, r338; +} +{ +add.f16x2 r342, r314, r339; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r345, {low, high}; +} +{ +mul.f16x2 r346, r40, r345; +} +{ +add.f16x2 r349, r321, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r43, r352; +} +{ +add.f16x2 r356, r328, r353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r359, {low, high}; +} +{ +mul.f16x2 r360, r49, r359; +} +{ +add.f16x2 r363, r335, r360; +} +mov.f32 f440, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r366, {low, high}; +} +{ +mul.f16x2 r367, r58, r366; +} +{ +add.f16x2 r370, r342, r367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r373, {low, high}; +} +{ +mul.f16x2 r374, r52, r373; +} +{ +add.f16x2 r377, r349, r374; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r380, {low, high}; +} +{ +mul.f16x2 r381, r55, r380; +} +{ +add.f16x2 r384, r356, r381; +} +{ +sub.f16x2 r387, r363, r370; +} +{ +add.f16x2 r390, r377, r384; +} +{ +add.f16x2 r393, r363, r370; +} +{ +sub.f16x2 r396, r377, r384; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r401, {low, high}; +} +{ +mul.f16x2 r402, r1, r401; +} +{ +add.f16x2 r405, %23, r402; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r408, {low, high}; +} +{ +mul.f16x2 r409, r10, r408; +} +{ +add.f16x2 r412, r399, r409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r415, {low, high}; +} +{ +mul.f16x2 r416, r4, r415; +} +{ +add.f16x2 r419, %24, r416; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r7, r422; +} +{ +add.f16x2 r426, r400, r423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r429, {low, high}; +} +{ +mul.f16x2 r430, r13, r429; +} +{ +add.f16x2 r433, r405, r430; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r22, r436; +} +{ +add.f16x2 r440, r412, r437; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r16, r443; +} +{ +add.f16x2 r447, r419, r444; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r450, {low, high}; +} +{ +mul.f16x2 r451, r19, r450; +} +{ +add.f16x2 r454, r426, r451; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r457, {low, high}; +} +{ +mul.f16x2 r458, r25, r457; +} +{ +add.f16x2 r461, r433, r458; +} +mov.f32 f456, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r464, {low, high}; +} +{ +mul.f16x2 r465, r34, r464; +} +{ +add.f16x2 r468, r440, r465; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r471, {low, high}; +} +{ +mul.f16x2 r472, r28, r471; +} +{ +add.f16x2 r475, r447, r472; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r478, {low, high}; +} +{ +mul.f16x2 r479, r31, r478; +} +{ +add.f16x2 r482, r454, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r485, {low, high}; +} +{ +mul.f16x2 r486, r37, r485; +} +{ +add.f16x2 r489, r461, r486; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r492, {low, high}; +} +{ +mul.f16x2 r493, r46, r492; +} +{ +add.f16x2 r496, r468, 
r493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r499, {low, high}; +} +{ +mul.f16x2 r500, r40, r499; +} +{ +add.f16x2 r503, r475, r500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r506, {low, high}; +} +{ +mul.f16x2 r507, r43, r506; +} +{ +add.f16x2 r510, r482, r507; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r513, {low, high}; +} +{ +mul.f16x2 r514, r49, r513; +} +{ +add.f16x2 r517, r489, r514; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r520, {low, high}; +} +{ +mul.f16x2 r521, r58, r520; +} +{ +add.f16x2 r524, r496, r521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r527, {low, high}; +} +{ +mul.f16x2 r528, r52, r527; +} +{ +add.f16x2 r531, r503, r528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r534, {low, high}; +} +{ +mul.f16x2 r535, r55, r534; +} +{ +add.f16x2 r538, r510, r535; +} +{ +sub.f16x2 r541, r517, r524; +} +{ +add.f16x2 r544, r531, r538; +} +{ +add.f16x2 r547, r517, r524; +} +{ +sub.f16x2 r550, r531, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r554, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r555, {low, high}; +} +{ +mul.f16x2 r556, r1, r555; +} +{ +add.f16x2 r559, %23, r556; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r562, {low, high}; +} +{ +mul.f16x2 r563, r10, r562; +} +{ +add.f16x2 r566, r553, r563; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r569, {low, high}; +} +{ +mul.f16x2 r570, r4, r569; +} +{ +add.f16x2 r573, %24, r570; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r576, {low, high}; +} +{ +mul.f16x2 r577, r7, r576; +} +{ +add.f16x2 r580, r554, r577; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r583, {low, high}; +} +{ +mul.f16x2 r584, r13, r583; +} +{ +add.f16x2 r587, r559, r584; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r590, {low, high}; +} +{ +mul.f16x2 r591, r22, r590; +} +{ +add.f16x2 r594, r566, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r597, {low, high}; +} +{ +mul.f16x2 r598, r16, r597; +} +{ +add.f16x2 r601, r573, r598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r604, {low, high}; +} +{ +mul.f16x2 r605, r19, r604; +} +{ +add.f16x2 r608, r580, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r25, r611; +} +{ +add.f16x2 r615, r587, r612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r618, {low, high}; +} +{ +mul.f16x2 r619, r34, r618; +} +{ +add.f16x2 r622, r594, r619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r625, {low, high}; +} +{ +mul.f16x2 r626, r28, r625; +} +{ +add.f16x2 r629, r601, r626; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r632, {low, high}; +} +{ +mul.f16x2 r633, r31, r632; +} +{ +add.f16x2 r636, r608, r633; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r639, {low, high}; +} +{ +mul.f16x2 r640, r37, r639; +} +{ +add.f16x2 r643, r615, r640; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r646, {low, high}; +} +{ +mul.f16x2 r647, r46, r646; +} +{ +add.f16x2 r650, r622, r647; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r653, {low, high}; +} +{ +mul.f16x2 r654, r40, r653; +} +{ +add.f16x2 r657, r629, r654; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r660, {low, high}; +} +{ +mul.f16x2 r661, r43, r660; +} +{ +add.f16x2 r664, r636, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r667, {low, high}; +} +{ +mul.f16x2 r668, r49, r667; +} +{ +add.f16x2 r671, r643, r668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r674, {low, high}; +} +{ +mul.f16x2 r675, r58, r674; +} +{ +add.f16x2 r678, r650, r675; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r52, r681; +} +{ +add.f16x2 r685, r657, r682; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r55, r688; +} +{ +add.f16x2 r692, r664, r689; +} +{ +sub.f16x2 r695, r671, r678; +} +{ +add.f16x2 r698, r685, r692; +} +{ +add.f16x2 r701, r671, r678; +} +{ +sub.f16x2 r704, r685, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r707, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r709, {low, high}; +} +{ +mul.f16x2 r710, r1, r709; +} +{ +add.f16x2 r713, %23, r710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r10, r716; +} +{ +add.f16x2 r720, r707, r717; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r723, {low, high}; +} +{ +mul.f16x2 r724, r4, r723; +} +{ +add.f16x2 r727, %24, r724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r730, {low, high}; +} +{ +mul.f16x2 r731, r7, r730; +} +{ +add.f16x2 r734, r708, r731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r737, {low, high}; +} +{ +mul.f16x2 r738, r13, r737; +} +{ +add.f16x2 r741, r713, r738; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r22, r744; +} +{ +add.f16x2 r748, r720, r745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r751, {low, high}; +} +{ +mul.f16x2 r752, r16, r751; +} +{ +add.f16x2 r755, r727, r752; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r758, {low, high}; +} +{ +mul.f16x2 r759, r19, r758; +} +{ +add.f16x2 r762, r734, r759; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r25, r765; +} +{ +add.f16x2 r769, r741, r766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r34, r772; +} +{ +add.f16x2 r776, r748, r773; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r779, {low, high}; +} +{ +mul.f16x2 r780, r28, r779; +} +{ +add.f16x2 r783, r755, r780; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r786, {low, high}; +} +{ +mul.f16x2 r787, r31, r786; +} +{ +add.f16x2 r790, r762, r787; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r37, r793; +} +{ +add.f16x2 r797, r769, r794; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r46, r800; +} +{ +add.f16x2 r804, r776, r801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; 
+cvt.rn.f16.f32 high, f454; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r40, r807; +} +{ +add.f16x2 r811, r783, r808; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r43, r814; +} +{ +add.f16x2 r818, r790, r815; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r49, r821; +} +{ +add.f16x2 r825, r797, r822; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r828, {low, high}; +} +{ +mul.f16x2 r829, r58, r828; +} +{ +add.f16x2 r832, r804, r829; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r52, r835; +} +{ +add.f16x2 r839, r811, r836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r842, {low, high}; +} +{ +mul.f16x2 r843, r55, r842; +} +{ +add.f16x2 r846, r818, r843; +} +{ +sub.f16x2 r849, r825, r832; +} +{ +add.f16x2 r852, r839, r846; +} +{ +add.f16x2 r855, r825, r832; +} +{ +sub.f16x2 r858, r839, r846; +} +mul.wide.u32 rd2, r2097, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r2098, rd3; +mul.lo.s32 r2099, r2098, 11; +sub.s32 r2100, r2097, r2099; +mad.lo.s32 r2101, r2098, 484, r2096; +cvt.rn.f32.u32 f465, r2100; +mul.f32 f466, f465, 0f3D54B191; +cos.approx.f32 f221, f466; +sin.approx.f32 f467, f466; +neg.f32 f222, f467; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f222; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r236, r866; +} +{ +neg.f16x2 r871, r868; +} +{ +fma.rn.f16x2 r873, r233, r864, r871; +} +{ +mul.f16x2 r877, r233, r866; +} +{ +fma.rn.f16x2 r880, r236, r864, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r861; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r886, {high, high}; +} +mov.f32 f241, 0fBF800000; +mov.f32 f242, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r861, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r390, r903; +} +{ +neg.f16x2 r908, r905; +} +{ +fma.rn.f16x2 r910, r387, r901, r908; +} +{ +mul.f16x2 r914, r387, r903; +} +{ +fma.rn.f16x2 r917, r390, r901, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r544, r940; +} +{ +neg.f16x2 r945, r942; +} +{ +fma.rn.f16x2 r947, r541, r938, r945; +} +{ +mul.f16x2 r951, r541, r940; +} +{ +fma.rn.f16x2 r954, r544, r938, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ 
+mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r698, r977; +} +{ +neg.f16x2 r982, r979; +} +{ +fma.rn.f16x2 r984, r695, r975, r982; +} +{ +mul.f16x2 r988, r695, r977; +} +{ +fma.rn.f16x2 r991, r698, r975, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r852, r1014; +} +{ +neg.f16x2 r1019, r1016; +} +{ +fma.rn.f16x2 r1021, r849, r1012, r1019; +} +{ +mul.f16x2 r1025, r849, r1014; +} +{ +fma.rn.f16x2 r1028, r852, r1012, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r858, r1051; +} +{ +neg.f16x2 r1056, r1053; +} +{ +fma.rn.f16x2 r1058, r855, r1049, r1056; +} +{ +mul.f16x2 r1062, r855, r1051; +} +{ +fma.rn.f16x2 r1065, r858, r1049, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r704, r1088; +} +{ +neg.f16x2 r1093, r1090; +} +{ +fma.rn.f16x2 r1095, r701, r1086, r1093; +} +{ +mul.f16x2 r1099, r701, r1088; +} +{ +fma.rn.f16x2 r1102, r704, r1086, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1110, {low, high}; +} +{ +mul.f16x2 r1111, r1108, r1110; +} +{ +mul.f16x2 r1114, r1082, r1106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1117, {high, low}; +} +{ +fma.rn.f16x2 r1119, r1111, r1117, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1125, {high, high}; +} +{ +mul.f16x2 r1127, r550, r1125; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r547, r1123, r1130; +} +{ +mul.f16x2 r1136, r547, r1125; +} +{ +fma.rn.f16x2 r1139, r550, r1123, r1136; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r861; +mov.b32 r1143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1147, {low, high}; +} +{ +mul.f16x2 r1148, r1145, r1147; +} +{ +mul.f16x2 r1151, r1119, r1143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1154, {high, low}; +} +{ +fma.rn.f16x2 r1156, r1148, r1154, r1151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1162, {high, high}; +} +{ +mul.f16x2 r1164, r396, r1162; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r393, r1160, r1167; +} +{ +mul.f16x2 r1173, r393, r1162; +} +{ +fma.rn.f16x2 r1176, r396, r1160, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1184, {low, high}; +} +{ +mul.f16x2 r1185, r1182, r1184; +} +{ +mul.f16x2 r1188, r1156, r1180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1191, {high, low}; +} +{ +fma.rn.f16x2 r1193, r1185, r1191, r1188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1199, {high, high}; +} +{ +mul.f16x2 r1201, r242, r1199; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r239, r1197, r1204; +} +{ +mul.f16x2 r1210, r239, r1199; +} +{ +fma.rn.f16x2 r1213, r242, r1197, r1210; +} +barrier.sync 0; +mad.lo.s32 r2102, r2100, 44, r2101; +st.shared.u32 [r2102], r85; +st.shared.u32 [r2102+4], r873; +st.shared.u32 [r2102+8], r910; +st.shared.u32 [r2102+12], r947; +st.shared.u32 [r2102+16], r984; +st.shared.u32 [r2102+20], r1021; +st.shared.u32 [r2102+24], r1058; +st.shared.u32 [r2102+28], 
r1095; +st.shared.u32 [r2102+32], r1132; +st.shared.u32 [r2102+36], r1169; +st.shared.u32 [r2102+40], r1206; +barrier.sync 0; +mad.lo.s32 r2103, r2100, -40, r2102; +ld.shared.u32 r1295, [r2103]; +ld.shared.u32 r1235, [r2103+44]; +ld.shared.u32 r1247, [r2103+88]; +ld.shared.u32 r1259, [r2103+132]; +ld.shared.u32 r1271, [r2103+176]; +ld.shared.u32 r1283, [r2103+220]; +ld.shared.u32 r1284, [r2103+264]; +ld.shared.u32 r1272, [r2103+308]; +ld.shared.u32 r1260, [r2103+352]; +ld.shared.u32 r1248, [r2103+396]; +ld.shared.u32 r1236, [r2103+440]; +barrier.sync 0; +st.shared.u32 [r2102], r88; +st.shared.u32 [r2102+4], r880; +st.shared.u32 [r2102+8], r917; +st.shared.u32 [r2102+12], r954; +st.shared.u32 [r2102+16], r991; +st.shared.u32 [r2102+20], r1028; +st.shared.u32 [r2102+24], r1065; +st.shared.u32 [r2102+28], r1102; +st.shared.u32 [r2102+32], r1139; +st.shared.u32 [r2102+36], r1176; +st.shared.u32 [r2102+40], r1213; +barrier.sync 0; +ld.shared.u32 r1298, [r2103]; +ld.shared.u32 r1238, [r2103+44]; +ld.shared.u32 r1250, [r2103+88]; +ld.shared.u32 r1262, [r2103+132]; +ld.shared.u32 r1274, [r2103+176]; +ld.shared.u32 r1286, [r2103+220]; +ld.shared.u32 r1287, [r2103+264]; +ld.shared.u32 r1275, [r2103+308]; +ld.shared.u32 r1263, [r2103+352]; +ld.shared.u32 r1251, [r2103+396]; +ld.shared.u32 r1239, [r2103+440]; +{ +add.f16x2 r1234, r1235, r1236; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +sub.f16x2 r1240, r1235, r1236; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +add.f16x2 r1246, r1247, r1248; +} +{ +add.f16x2 r1249, r1250, r1251; +} +{ +sub.f16x2 r1252, r1247, r1248; +} +{ +sub.f16x2 r1255, r1250, r1251; +} +{ +add.f16x2 r1258, r1259, r1260; +} +{ +add.f16x2 r1261, r1262, r1263; +} +{ +sub.f16x2 r1264, r1259, r1260; +} +{ +sub.f16x2 r1267, r1262, r1263; +} +{ +add.f16x2 r1270, r1271, r1272; +} +{ +add.f16x2 r1273, r1274, r1275; +} +{ +sub.f16x2 r1276, r1271, r1272; +} +{ +sub.f16x2 r1279, r1274, r1275; +} +{ +add.f16x2 r1282, r1283, r1284; +} +{ +add.f16x2 r1285, r1286, r1287; 
+} +{ +sub.f16x2 r1288, r1283, r1284; +} +{ +sub.f16x2 r1291, r1286, r1287; +} +{ +add.f16x2 r1294, r1295, r1234; +} +{ +add.f16x2 r1297, r1298, r1237; +} +{ +add.f16x2 r1300, r1294, r1246; +} +{ +add.f16x2 r1303, r1297, r1249; +} +{ +add.f16x2 r1306, r1300, r1258; +} +{ +add.f16x2 r1309, r1303, r1261; +} +{ +add.f16x2 r1312, r1306, r1270; +} +{ +add.f16x2 r1315, r1309, r1273; +} +{ +add.f16x2 %0, r1312, r1282; +} +{ +add.f16x2 %1, r1315, r1285; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1326, {low, high}; +} +{ +mul.f16x2 r1327, r1234, r1326; +} +{ +add.f16x2 r1330, r1295, r1327; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1243, r1333; +} +{ +add.f16x2 r1337, r1324, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1340, {low, high}; +} +{ +mul.f16x2 r1341, r1237, r1340; +} +{ +add.f16x2 r1344, r1298, r1341; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1347, {low, high}; +} +{ +mul.f16x2 r1348, r1240, r1347; +} +{ +add.f16x2 r1351, r1325, r1348; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1354, {low, high}; +} +{ +mul.f16x2 r1355, r1246, r1354; +} +{ +add.f16x2 r1358, r1330, r1355; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1361, {low, high}; +} +{ +mul.f16x2 r1362, r1255, r1361; +} +{ +add.f16x2 r1365, r1337, r1362; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1368, {low, high}; +} +{ +mul.f16x2 r1369, r1249, r1368; +} +{ +add.f16x2 r1372, r1344, r1369; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1375, {low, high}; +} +{ +mul.f16x2 r1376, r1252, r1375; +} +{ +add.f16x2 r1379, r1351, r1376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1382, {low, high}; +} +{ +mul.f16x2 r1383, r1258, r1382; +} +{ +add.f16x2 r1386, r1358, r1383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1389, {low, high}; +} +{ +mul.f16x2 r1390, r1267, r1389; +} +{ +add.f16x2 r1393, r1365, r1390; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1396, {low, high}; +} +{ +mul.f16x2 r1397, r1261, r1396; +} +{ +add.f16x2 r1400, r1372, r1397; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1403, {low, high}; +} +{ +mul.f16x2 r1404, r1264, r1403; +} +{ +add.f16x2 r1407, r1379, r1404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1410, {low, high}; +} +{ +mul.f16x2 r1411, r1270, r1410; +} +{ +add.f16x2 r1414, r1386, r1411; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1417, {low, high}; +} +{ +mul.f16x2 r1418, r1279, r1417; +} +{ +add.f16x2 r1421, r1393, r1418; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1424, {low, high}; +} +{ +mul.f16x2 r1425, r1273, r1424; +} +{ +add.f16x2 r1428, r1400, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1431, {low, high}; +} +{ +mul.f16x2 r1432, r1276, r1431; +} +{ +add.f16x2 r1435, r1407, r1432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1438, {low, high}; +} +{ +mul.f16x2 r1439, r1282, r1438; +} +{ +add.f16x2 r1442, r1414, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1445, {low, high}; +} +{ 
+mul.f16x2 r1446, r1291, r1445; +} +{ +add.f16x2 r1449, r1421, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1452, {low, high}; +} +{ +mul.f16x2 r1453, r1285, r1452; +} +{ +add.f16x2 r1456, r1428, r1453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1459, {low, high}; +} +{ +mul.f16x2 r1460, r1288, r1459; +} +{ +add.f16x2 r1463, r1435, r1460; +} +{ +sub.f16x2 %2, r1442, r1449; +} +{ +add.f16x2 %3, r1456, r1463; +} +{ +add.f16x2 %20, r1442, r1449; +} +{ +sub.f16x2 %21, r1456, r1463; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1481, r1234, r1480; +} +{ +add.f16x2 r1484, r1295, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1487, {low, high}; +} +{ +mul.f16x2 r1488, r1243, r1487; +} +{ +add.f16x2 r1491, r1478, r1488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1494, {low, high}; +} +{ +mul.f16x2 r1495, r1237, r1494; +} +{ +add.f16x2 r1498, r1298, r1495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1501, {low, high}; +} +{ +mul.f16x2 r1502, r1240, r1501; +} +{ +add.f16x2 r1505, r1479, r1502; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1508, {low, high}; +} +{ +mul.f16x2 r1509, r1246, r1508; +} +{ +add.f16x2 r1512, r1484, r1509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1515, {low, high}; +} +{ +mul.f16x2 r1516, r1255, r1515; +} +{ +add.f16x2 r1519, r1491, r1516; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; 
+cvt.rn.f16.f32 high, f446; +mov.b32 r1522, {low, high}; +} +{ +mul.f16x2 r1523, r1249, r1522; +} +{ +add.f16x2 r1526, r1498, r1523; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1529, {low, high}; +} +{ +mul.f16x2 r1530, r1252, r1529; +} +{ +add.f16x2 r1533, r1505, r1530; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1536, {low, high}; +} +{ +mul.f16x2 r1537, r1258, r1536; +} +{ +add.f16x2 r1540, r1512, r1537; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1267, r1543; +} +{ +add.f16x2 r1547, r1519, r1544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1550, {low, high}; +} +{ +mul.f16x2 r1551, r1261, r1550; +} +{ +add.f16x2 r1554, r1526, r1551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1557, {low, high}; +} +{ +mul.f16x2 r1558, r1264, r1557; +} +{ +add.f16x2 r1561, r1533, r1558; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1564, {low, high}; +} +{ +mul.f16x2 r1565, r1270, r1564; +} +{ +add.f16x2 r1568, r1540, r1565; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1571, {low, high}; +} +{ +mul.f16x2 r1572, r1279, r1571; +} +{ +add.f16x2 r1575, r1547, r1572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1578, {low, high}; +} +{ +mul.f16x2 r1579, r1273, r1578; +} +{ +add.f16x2 r1582, r1554, r1579; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1585, {low, high}; +} +{ +mul.f16x2 r1586, r1276, r1585; +} +{ +add.f16x2 r1589, r1561, r1586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1592, {low, high}; +} +{ +mul.f16x2 r1593, r1282, r1592; +} +{ +add.f16x2 r1596, 
r1568, r1593; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1599, {low, high}; +} +{ +mul.f16x2 r1600, r1291, r1599; +} +{ +add.f16x2 r1603, r1575, r1600; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1606, {low, high}; +} +{ +mul.f16x2 r1607, r1285, r1606; +} +{ +add.f16x2 r1610, r1582, r1607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1613, {low, high}; +} +{ +mul.f16x2 r1614, r1288, r1613; +} +{ +add.f16x2 r1617, r1589, r1614; +} +{ +sub.f16x2 %4, r1596, r1603; +} +{ +add.f16x2 %5, r1610, r1617; +} +{ +add.f16x2 %18, r1596, r1603; +} +{ +sub.f16x2 %19, r1610, r1617; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1634, {low, high}; +} +{ +mul.f16x2 r1635, r1234, r1634; +} +{ +add.f16x2 r1638, r1295, r1635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1641, {low, high}; +} +{ +mul.f16x2 r1642, r1243, r1641; +} +{ +add.f16x2 r1645, r1632, r1642; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1648, {low, high}; +} +{ +mul.f16x2 r1649, r1237, r1648; +} +{ +add.f16x2 r1652, r1298, r1649; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1655, {low, high}; +} +{ +mul.f16x2 r1656, r1240, r1655; +} +{ +add.f16x2 r1659, r1633, r1656; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1662, {low, high}; +} +{ +mul.f16x2 r1663, r1246, r1662; +} +{ +add.f16x2 r1666, r1638, r1663; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1669, {low, high}; +} 
+{ +mul.f16x2 r1670, r1255, r1669; +} +{ +add.f16x2 r1673, r1645, r1670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1676, {low, high}; +} +{ +mul.f16x2 r1677, r1249, r1676; +} +{ +add.f16x2 r1680, r1652, r1677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1683, {low, high}; +} +{ +mul.f16x2 r1684, r1252, r1683; +} +{ +add.f16x2 r1687, r1659, r1684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1258, r1690; +} +{ +add.f16x2 r1694, r1666, r1691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1697, {low, high}; +} +{ +mul.f16x2 r1698, r1267, r1697; +} +{ +add.f16x2 r1701, r1673, r1698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1261, r1704; +} +{ +add.f16x2 r1708, r1680, r1705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1711, {low, high}; +} +{ +mul.f16x2 r1712, r1264, r1711; +} +{ +add.f16x2 r1715, r1687, r1712; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1718, {low, high}; +} +{ +mul.f16x2 r1719, r1270, r1718; +} +{ +add.f16x2 r1722, r1694, r1719; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1725, {low, high}; +} +{ +mul.f16x2 r1726, r1279, r1725; +} +{ +add.f16x2 r1729, r1701, r1726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1732, {low, high}; +} +{ +mul.f16x2 r1733, r1273, r1732; +} +{ +add.f16x2 r1736, r1708, r1733; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1739, {low, high}; +} +{ +mul.f16x2 r1740, r1276, r1739; +} +{ +add.f16x2 r1743, r1715, r1740; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1282, r1746; +} +{ +add.f16x2 r1750, r1722, r1747; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1753, {low, high}; +} +{ +mul.f16x2 r1754, r1291, r1753; +} +{ +add.f16x2 r1757, r1729, r1754; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1285, r1760; +} +{ +add.f16x2 r1764, r1736, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1767, {low, high}; +} +{ +mul.f16x2 r1768, r1288, r1767; +} +{ +add.f16x2 r1771, r1743, r1768; +} +{ +sub.f16x2 %6, r1750, r1757; +} +{ +add.f16x2 %7, r1764, r1771; +} +{ +add.f16x2 %16, r1750, r1757; +} +{ +sub.f16x2 %17, r1764, r1771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1786, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1787, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1788, {low, high}; +} +{ +mul.f16x2 r1789, r1234, r1788; +} +{ +add.f16x2 r1792, r1295, r1789; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1243, r1795; +} +{ +add.f16x2 r1799, r1786, r1796; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1802, {low, high}; +} +{ +mul.f16x2 r1803, r1237, r1802; +} +{ +add.f16x2 r1806, r1298, r1803; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1809, {low, high}; +} +{ +mul.f16x2 r1810, r1240, r1809; +} +{ +add.f16x2 r1813, r1787, r1810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1816, {low, high}; +} +{ +mul.f16x2 r1817, r1246, r1816; +} +{ +add.f16x2 r1820, 
r1792, r1817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1823, {low, high}; +} +{ +mul.f16x2 r1824, r1255, r1823; +} +{ +add.f16x2 r1827, r1799, r1824; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1830, {low, high}; +} +{ +mul.f16x2 r1831, r1249, r1830; +} +{ +add.f16x2 r1834, r1806, r1831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1837, {low, high}; +} +{ +mul.f16x2 r1838, r1252, r1837; +} +{ +add.f16x2 r1841, r1813, r1838; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1844, {low, high}; +} +{ +mul.f16x2 r1845, r1258, r1844; +} +{ +add.f16x2 r1848, r1820, r1845; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1267, r1851; +} +{ +add.f16x2 r1855, r1827, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1858, {low, high}; +} +{ +mul.f16x2 r1859, r1261, r1858; +} +{ +add.f16x2 r1862, r1834, r1859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1865, {low, high}; +} +{ +mul.f16x2 r1866, r1264, r1865; +} +{ +add.f16x2 r1869, r1841, r1866; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1872, {low, high}; +} +{ +mul.f16x2 r1873, r1270, r1872; +} +{ +add.f16x2 r1876, r1848, r1873; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1879, {low, high}; +} +{ +mul.f16x2 r1880, r1279, r1879; +} +{ +add.f16x2 r1883, r1855, r1880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1886, {low, high}; +} +{ +mul.f16x2 r1887, r1273, r1886; +} +{ +add.f16x2 r1890, r1862, r1887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1893, {low, 
high}; +} +{ +mul.f16x2 r1894, r1276, r1893; +} +{ +add.f16x2 r1897, r1869, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1900, {low, high}; +} +{ +mul.f16x2 r1901, r1282, r1900; +} +{ +add.f16x2 r1904, r1876, r1901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1907, {low, high}; +} +{ +mul.f16x2 r1908, r1291, r1907; +} +{ +add.f16x2 r1911, r1883, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1914, {low, high}; +} +{ +mul.f16x2 r1915, r1285, r1914; +} +{ +add.f16x2 r1918, r1890, r1915; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1921, {low, high}; +} +{ +mul.f16x2 r1922, r1288, r1921; +} +{ +add.f16x2 r1925, r1897, r1922; +} +{ +sub.f16x2 %8, r1904, r1911; +} +{ +add.f16x2 %9, r1918, r1925; +} +{ +add.f16x2 %14, r1904, r1911; +} +{ +sub.f16x2 %15, r1918, r1925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1942, {low, high}; +} +{ +mul.f16x2 r1943, r1234, r1942; +} +{ +add.f16x2 r1946, r1295, r1943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1949, {low, high}; +} +{ +mul.f16x2 r1950, r1243, r1949; +} +{ +add.f16x2 r1953, r1940, r1950; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1956, {low, high}; +} +{ +mul.f16x2 r1957, r1237, r1956; +} +{ +add.f16x2 r1960, r1298, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1963, {low, high}; +} +{ +mul.f16x2 r1964, r1240, r1963; +} +{ +add.f16x2 r1967, r1941, r1964; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1970, {low, high}; +} +{ +mul.f16x2 r1971, r1246, r1970; +} +{ +add.f16x2 r1974, r1946, r1971; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1977, {low, high}; +} +{ +mul.f16x2 r1978, r1255, r1977; +} +{ +add.f16x2 r1981, r1953, r1978; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1984, {low, high}; +} +{ +mul.f16x2 r1985, r1249, r1984; +} +{ +add.f16x2 r1988, r1960, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1991, {low, high}; +} +{ +mul.f16x2 r1992, r1252, r1991; +} +{ +add.f16x2 r1995, r1967, r1992; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1998, {low, high}; +} +{ +mul.f16x2 r1999, r1258, r1998; +} +{ +add.f16x2 r2002, r1974, r1999; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2005, {low, high}; +} +{ +mul.f16x2 r2006, r1267, r2005; +} +{ +add.f16x2 r2009, r1981, r2006; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r2012, {low, high}; +} +{ +mul.f16x2 r2013, r1261, r2012; +} +{ +add.f16x2 r2016, r1988, r2013; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2019, {low, high}; +} +{ +mul.f16x2 r2020, r1264, r2019; +} +{ +add.f16x2 r2023, r1995, r2020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2026, {low, high}; +} +{ +mul.f16x2 r2027, r1270, r2026; +} +{ +add.f16x2 r2030, r2002, r2027; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r1279, r2033; +} +{ +add.f16x2 r2037, r2009, r2034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r1273, r2040; +} +{ 
+add.f16x2 r2044, r2016, r2041; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r1276, r2047; +} +{ +add.f16x2 r2051, r2023, r2048; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r1282, r2054; +} +{ +add.f16x2 r2058, r2030, r2055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2061, {low, high}; +} +{ +mul.f16x2 r2062, r1291, r2061; +} +{ +add.f16x2 r2065, r2037, r2062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2068, {low, high}; +} +{ +mul.f16x2 r2069, r1285, r2068; +} +{ +add.f16x2 r2072, r2044, r2069; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2075, {low, high}; +} +{ +mul.f16x2 r2076, r1288, r2075; +} +{ +add.f16x2 r2079, r2051, r2076; +} +{ +sub.f16x2 %10, r2058, r2065; +} +{ +add.f16x2 %11, r2072, r2079; +} +{ +add.f16x2 %12, r2058, r2065; +} +{ +sub.f16x2 %13, r2072, r2079; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..aba31ee337f9b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp16_inv.hpp.inc @@ -0,0 +1,6873 @@ +#ifndef CUFFTDX_FFT_121_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_121_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1127, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<468>; +.reg .b32 r<2104>; +.reg .b64 rd<4>; +mov.u32 r2094, %tid.y; +mov.u32 r2095, %22; +mad.lo.s32 r2096, r2094, 968, r2095; +mov.u32 r2097, %tid.x; +{ +add.f16x2 r1, %25, %43; +} +{ +add.f16x2 r4, %26, %44; +} +{ +sub.f16x2 r7, %25, %43; +} +{ +sub.f16x2 r10, %26, %44; +} +{ +add.f16x2 r13, %27, %41; +} +{ +add.f16x2 r16, %28, %42; +} +{ +sub.f16x2 r19, %27, %41; +} +{ +sub.f16x2 r22, %28, %42; +} +{ +add.f16x2 r25, %29, %39; +} +{ +add.f16x2 r28, %30, %40; +} +{ +sub.f16x2 r31, %29, %39; +} +{ +sub.f16x2 r34, %30, %40; +} +{ +add.f16x2 r37, %31, %37; +} +{ +add.f16x2 r40, %32, %38; +} +{ +sub.f16x2 r43, %31, %37; +} +{ +sub.f16x2 r46, %32, %38; +} +{ +add.f16x2 r49, %33, %35; +} +{ 
+add.f16x2 r52, %34, %36; +} +{ +sub.f16x2 r55, %33, %35; +} +{ +sub.f16x2 r58, %34, %36; +} +{ +add.f16x2 r61, %23, r1; +} +{ +add.f16x2 r64, %24, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.f32 f424, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r91, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r92, {low, high}; +} +mov.f32 f438, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r93, {low, high}; +} +{ +mul.f16x2 r94, r1, r93; +} +{ +add.f16x2 r97, %23, r94; +} +mov.f32 f404, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r100, {low, high}; +} +{ +mul.f16x2 r101, r10, r100; +} +{ +add.f16x2 r104, r91, r101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r107, {low, high}; +} +{ +mul.f16x2 r108, r4, r107; +} +{ +add.f16x2 r111, %24, r108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r7, r114; +} +{ +add.f16x2 r118, r92, r115; +} +mov.f32 f454, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r121, {low, high}; +} +{ +mul.f16x2 r122, r13, r121; +} +{ +add.f16x2 r125, r97, r122; +} +mov.f32 f300, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r128, {low, high}; +} +{ +mul.f16x2 r129, r22, r128; +} +{ +add.f16x2 r132, r104, r129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r16, r135; +} +{ +add.f16x2 r139, 
r111, r136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r142, {low, high}; +} +{ +mul.f16x2 r143, r19, r142; +} +{ +add.f16x2 r146, r118, r143; +} +mov.f32 f462, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r149, {low, high}; +} +{ +mul.f16x2 r150, r25, r149; +} +{ +add.f16x2 r153, r125, r150; +} +mov.f32 f464, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r156, {low, high}; +} +{ +mul.f16x2 r157, r34, r156; +} +{ +add.f16x2 r160, r132, r157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r163, {low, high}; +} +{ +mul.f16x2 r164, r28, r163; +} +{ +add.f16x2 r167, r139, r164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r170, {low, high}; +} +{ +mul.f16x2 r171, r31, r170; +} +{ +add.f16x2 r174, r146, r171; +} +mov.f32 f446, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r177, {low, high}; +} +{ +mul.f16x2 r178, r37, r177; +} +{ +add.f16x2 r181, r153, r178; +} +mov.f32 f448, 0f3F4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r184, {low, high}; +} +{ +mul.f16x2 r185, r46, r184; +} +{ +add.f16x2 r188, r160, r185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r191, {low, high}; +} +{ +mul.f16x2 r192, r40, r191; +} +{ +add.f16x2 r195, r167, r192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r198, {low, high}; +} +{ +mul.f16x2 r199, r43, r198; +} +{ +add.f16x2 r202, r174, r199; +} +mov.f32 f430, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r205, {low, high}; +} +{ +mul.f16x2 r206, r49, r205; +} +{ +add.f16x2 r209, r181, r206; +} +mov.f32 f432, 0f3E903F40; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r212, {low, high}; +} +{ +mul.f16x2 r213, r58, r212; +} +{ +add.f16x2 r216, r188, r213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r219, {low, high}; +} +{ +mul.f16x2 r220, r52, r219; +} +{ +add.f16x2 r223, r195, r220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r226, {low, high}; +} +{ +mul.f16x2 r227, r55, r226; +} +{ +add.f16x2 r230, r202, r227; +} +{ +sub.f16x2 r233, r209, r216; +} +{ +add.f16x2 r236, r223, r230; +} +{ +add.f16x2 r239, r209, r216; +} +{ +sub.f16x2 r242, r223, r230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r247, {low, high}; +} +{ +mul.f16x2 r248, r1, r247; +} +{ +add.f16x2 r251, %23, r248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r254, {low, high}; +} +{ +mul.f16x2 r255, r10, r254; +} +{ +add.f16x2 r258, r245, r255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r4, r261; +} +{ +add.f16x2 r265, %24, r262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r268, {low, high}; +} +{ +mul.f16x2 r269, r7, r268; +} +{ +add.f16x2 r272, r246, r269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r275, {low, high}; +} +{ +mul.f16x2 r276, r13, r275; +} +{ +add.f16x2 r279, r251, r276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r22, r282; +} +{ +add.f16x2 r286, r258, r283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r16, r289; +} +{ +add.f16x2 r293, r265, r290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r296, {low, high}; +} +{ +mul.f16x2 r297, r19, r296; +} +{ +add.f16x2 r300, r272, r297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r303, {low, high}; +} +{ +mul.f16x2 r304, r25, r303; +} +{ +add.f16x2 r307, r279, r304; +} +mov.f32 f352, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r34, r310; +} +{ +add.f16x2 r314, r286, r311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r317, {low, high}; +} +{ +mul.f16x2 r318, r28, r317; +} +{ +add.f16x2 r321, r293, r318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r324, {low, high}; +} +{ +mul.f16x2 r325, r31, r324; +} +{ +add.f16x2 r328, r300, r325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r331, {low, high}; +} +{ +mul.f16x2 r332, r37, r331; +} +{ +add.f16x2 r335, r307, r332; +} +mov.f32 f396, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r338, {low, high}; +} +{ +mul.f16x2 r339, r46, r338; +} +{ +add.f16x2 r342, r314, r339; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r345, {low, high}; +} +{ +mul.f16x2 r346, r40, r345; +} +{ +add.f16x2 r349, r321, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r43, r352; +} +{ +add.f16x2 r356, r328, r353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r359, {low, high}; +} +{ +mul.f16x2 r360, r49, r359; +} +{ +add.f16x2 r363, r335, r360; +} 
+mov.f32 f440, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r366, {low, high}; +} +{ +mul.f16x2 r367, r58, r366; +} +{ +add.f16x2 r370, r342, r367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r373, {low, high}; +} +{ +mul.f16x2 r374, r52, r373; +} +{ +add.f16x2 r377, r349, r374; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r380, {low, high}; +} +{ +mul.f16x2 r381, r55, r380; +} +{ +add.f16x2 r384, r356, r381; +} +{ +sub.f16x2 r387, r363, r370; +} +{ +add.f16x2 r390, r377, r384; +} +{ +add.f16x2 r393, r363, r370; +} +{ +sub.f16x2 r396, r377, r384; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r401, {low, high}; +} +{ +mul.f16x2 r402, r1, r401; +} +{ +add.f16x2 r405, %23, r402; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r408, {low, high}; +} +{ +mul.f16x2 r409, r10, r408; +} +{ +add.f16x2 r412, r399, r409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r415, {low, high}; +} +{ +mul.f16x2 r416, r4, r415; +} +{ +add.f16x2 r419, %24, r416; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r7, r422; +} +{ +add.f16x2 r426, r400, r423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r429, {low, high}; +} +{ +mul.f16x2 r430, r13, r429; +} +{ +add.f16x2 r433, r405, r430; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r22, r436; +} +{ +add.f16x2 r440, r412, 
r437; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r16, r443; +} +{ +add.f16x2 r447, r419, r444; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r450, {low, high}; +} +{ +mul.f16x2 r451, r19, r450; +} +{ +add.f16x2 r454, r426, r451; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r457, {low, high}; +} +{ +mul.f16x2 r458, r25, r457; +} +{ +add.f16x2 r461, r433, r458; +} +mov.f32 f456, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r464, {low, high}; +} +{ +mul.f16x2 r465, r34, r464; +} +{ +add.f16x2 r468, r440, r465; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r471, {low, high}; +} +{ +mul.f16x2 r472, r28, r471; +} +{ +add.f16x2 r475, r447, r472; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r478, {low, high}; +} +{ +mul.f16x2 r479, r31, r478; +} +{ +add.f16x2 r482, r454, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r485, {low, high}; +} +{ +mul.f16x2 r486, r37, r485; +} +{ +add.f16x2 r489, r461, r486; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r492, {low, high}; +} +{ +mul.f16x2 r493, r46, r492; +} +{ +add.f16x2 r496, r468, r493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r499, {low, high}; +} +{ +mul.f16x2 r500, r40, r499; +} +{ +add.f16x2 r503, r475, r500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r506, {low, high}; +} +{ +mul.f16x2 r507, r43, r506; +} +{ +add.f16x2 r510, r482, r507; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r513, {low, high}; +} +{ +mul.f16x2 r514, r49, r513; +} +{ +add.f16x2 
r517, r489, r514; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r520, {low, high}; +} +{ +mul.f16x2 r521, r58, r520; +} +{ +add.f16x2 r524, r496, r521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r527, {low, high}; +} +{ +mul.f16x2 r528, r52, r527; +} +{ +add.f16x2 r531, r503, r528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r534, {low, high}; +} +{ +mul.f16x2 r535, r55, r534; +} +{ +add.f16x2 r538, r510, r535; +} +{ +sub.f16x2 r541, r517, r524; +} +{ +add.f16x2 r544, r531, r538; +} +{ +add.f16x2 r547, r517, r524; +} +{ +sub.f16x2 r550, r531, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r554, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r555, {low, high}; +} +{ +mul.f16x2 r556, r1, r555; +} +{ +add.f16x2 r559, %23, r556; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r562, {low, high}; +} +{ +mul.f16x2 r563, r10, r562; +} +{ +add.f16x2 r566, r553, r563; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r569, {low, high}; +} +{ +mul.f16x2 r570, r4, r569; +} +{ +add.f16x2 r573, %24, r570; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r576, {low, high}; +} +{ +mul.f16x2 r577, r7, r576; +} +{ +add.f16x2 r580, r554, r577; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r583, {low, high}; +} +{ +mul.f16x2 r584, r13, r583; +} +{ +add.f16x2 r587, r559, r584; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r590, {low, high}; +} +{ +mul.f16x2 r591, r22, r590; +} +{ +add.f16x2 r594, r566, r591; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r597, {low, high}; +} +{ +mul.f16x2 r598, r16, r597; +} +{ +add.f16x2 r601, r573, r598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r604, {low, high}; +} +{ +mul.f16x2 r605, r19, r604; +} +{ +add.f16x2 r608, r580, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r25, r611; +} +{ +add.f16x2 r615, r587, r612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r618, {low, high}; +} +{ +mul.f16x2 r619, r34, r618; +} +{ +add.f16x2 r622, r594, r619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r625, {low, high}; +} +{ +mul.f16x2 r626, r28, r625; +} +{ +add.f16x2 r629, r601, r626; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r632, {low, high}; +} +{ +mul.f16x2 r633, r31, r632; +} +{ +add.f16x2 r636, r608, r633; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r639, {low, high}; +} +{ +mul.f16x2 r640, r37, r639; +} +{ +add.f16x2 r643, r615, r640; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r646, {low, high}; +} +{ +mul.f16x2 r647, r46, r646; +} +{ +add.f16x2 r650, r622, r647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r653, {low, high}; +} +{ +mul.f16x2 r654, r40, r653; +} +{ +add.f16x2 r657, r629, r654; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r660, {low, high}; +} +{ +mul.f16x2 r661, r43, r660; +} +{ +add.f16x2 r664, r636, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r667, {low, high}; +} +{ +mul.f16x2 r668, r49, r667; +} +{ +add.f16x2 r671, r643, r668; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r674, {low, high}; +} +{ +mul.f16x2 r675, r58, r674; +} +{ +add.f16x2 r678, r650, r675; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r52, r681; +} +{ +add.f16x2 r685, r657, r682; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r55, r688; +} +{ +add.f16x2 r692, r664, r689; +} +{ +sub.f16x2 r695, r671, r678; +} +{ +add.f16x2 r698, r685, r692; +} +{ +add.f16x2 r701, r671, r678; +} +{ +sub.f16x2 r704, r685, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r707, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r709, {low, high}; +} +{ +mul.f16x2 r710, r1, r709; +} +{ +add.f16x2 r713, %23, r710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r10, r716; +} +{ +add.f16x2 r720, r707, r717; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r723, {low, high}; +} +{ +mul.f16x2 r724, r4, r723; +} +{ +add.f16x2 r727, %24, r724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r730, {low, high}; +} +{ +mul.f16x2 r731, r7, r730; +} +{ +add.f16x2 r734, r708, r731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r737, {low, high}; +} +{ +mul.f16x2 r738, r13, r737; +} +{ +add.f16x2 r741, r713, r738; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r22, r744; +} +{ +add.f16x2 r748, r720, r745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r751, {low, high}; +} +{ +mul.f16x2 r752, r16, r751; +} +{ +add.f16x2 r755, r727, r752; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r758, {low, high}; +} +{ +mul.f16x2 r759, r19, r758; +} +{ +add.f16x2 r762, r734, r759; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r25, r765; +} +{ +add.f16x2 r769, r741, r766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r34, r772; +} +{ +add.f16x2 r776, r748, r773; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r779, {low, high}; +} +{ +mul.f16x2 r780, r28, r779; +} +{ +add.f16x2 r783, r755, r780; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r786, {low, high}; +} +{ +mul.f16x2 r787, r31, r786; +} +{ +add.f16x2 r790, r762, r787; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r37, r793; +} +{ +add.f16x2 r797, r769, r794; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r46, r800; +} +{ +add.f16x2 r804, r776, r801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r40, r807; +} +{ +add.f16x2 r811, r783, r808; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r43, r814; +} +{ +add.f16x2 r818, r790, r815; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r49, r821; +} +{ +add.f16x2 r825, r797, r822; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; 
+cvt.rn.f16.f32 high, f464; +mov.b32 r828, {low, high}; +} +{ +mul.f16x2 r829, r58, r828; +} +{ +add.f16x2 r832, r804, r829; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r52, r835; +} +{ +add.f16x2 r839, r811, r836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r842, {low, high}; +} +{ +mul.f16x2 r843, r55, r842; +} +{ +add.f16x2 r846, r818, r843; +} +{ +sub.f16x2 r849, r825, r832; +} +{ +add.f16x2 r852, r839, r846; +} +{ +add.f16x2 r855, r825, r832; +} +{ +sub.f16x2 r858, r839, r846; +} +mul.wide.u32 rd2, r2097, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r2098, rd3; +mul.lo.s32 r2099, r2098, 11; +sub.s32 r2100, r2097, r2099; +cvt.rn.f32.u32 f465, r2100; +mul.f32 f466, f465, 0f3D54B191; +cos.approx.f32 f221, f466; +sin.approx.f32 f467, f466; +neg.f32 f222, f467; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f222; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r236, r866; +} +{ +fma.rn.f16x2 r871, r233, r864, r868; +} +{ +mul.f16x2 r875, r233, r866; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r236, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r886, {high, high}; +} +mov.f32 f241, 0fBF800000; +mov.f32 f242, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r861, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r390, r903; +} +{ +fma.rn.f16x2 r908, r387, r901, r905; +} +{ +mul.f16x2 r912, r387, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r390, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r544, r940; +} +{ +fma.rn.f16x2 r945, r541, r938, r942; +} +{ +mul.f16x2 r949, r541, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r544, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r698, r977; +} +{ +fma.rn.f16x2 r982, r695, r975, r979; +} +{ +mul.f16x2 r986, r695, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r698, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 
r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r852, r1014; +} +{ +fma.rn.f16x2 r1019, r849, r1012, r1016; +} +{ +mul.f16x2 r1023, r849, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r852, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r858, r1051; +} +{ +fma.rn.f16x2 r1056, r855, r1049, r1053; +} +{ +mul.f16x2 r1060, r855, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r858, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, 
r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r704, r1088; +} +{ +fma.rn.f16x2 r1093, r701, r1086, r1090; +} +{ +mul.f16x2 r1097, r701, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r704, r1086, r1100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1110, {low, high}; +} +{ +mul.f16x2 r1111, r1108, r1110; +} +{ +mul.f16x2 r1114, r1082, r1106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1117, {high, low}; +} +{ +fma.rn.f16x2 r1119, r1111, r1117, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1125, {high, high}; +} +{ +mul.f16x2 r1127, r550, r1125; +} +{ +fma.rn.f16x2 r1130, r547, r1123, r1127; +} +{ +mul.f16x2 r1134, r547, r1125; +} +{ +neg.f16x2 r1137, r1134; +} +{ +fma.rn.f16x2 r1139, r550, r1123, r1137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1147, {low, high}; +} +{ +mul.f16x2 r1148, r1145, r1147; +} +{ +mul.f16x2 r1151, r1119, r1143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1154, {high, low}; +} +{ +fma.rn.f16x2 r1156, r1148, r1154, r1151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1156; +mov.b32 r1162, {high, high}; +} +{ +mul.f16x2 r1164, r396, r1162; +} +{ +fma.rn.f16x2 r1167, r393, r1160, r1164; +} +{ +mul.f16x2 r1171, r393, r1162; +} +{ +neg.f16x2 r1174, r1171; +} +{ +fma.rn.f16x2 r1176, r396, r1160, r1174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1184, {low, high}; +} +{ +mul.f16x2 r1185, r1182, r1184; +} +{ +mul.f16x2 r1188, r1156, r1180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1191, {high, low}; +} +{ +fma.rn.f16x2 r1193, r1185, r1191, r1188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1199, {high, high}; +} +{ +mul.f16x2 r1201, r242, r1199; +} +{ +fma.rn.f16x2 r1204, r239, r1197, r1201; +} +{ +mul.f16x2 r1208, r239, r1199; +} +{ +neg.f16x2 r1211, r1208; +} +{ +fma.rn.f16x2 r1213, r242, r1197, r1211; +} +mad.lo.s32 r2101, r2098, 968, r2096; +barrier.sync 0; +mad.lo.s32 r2102, r2100, 88, r2101; +st.shared.v2.f32 [r2102], {r85, r88}; +st.shared.v2.f32 [r2102+8], {r871, r880}; +st.shared.v2.f32 [r2102+16], {r908, r917}; +st.shared.v2.f32 [r2102+24], {r945, r954}; +st.shared.v2.f32 [r2102+32], {r982, r991}; +st.shared.v2.f32 [r2102+40], {r1019, r1028}; +st.shared.v2.f32 [r2102+48], {r1056, r1065}; +st.shared.v2.f32 [r2102+56], {r1093, r1102}; +st.shared.v2.f32 [r2102+64], {r1130, r1139}; +st.shared.v2.f32 [r2102+72], {r1167, r1176}; +st.shared.v2.f32 [r2102+80], {r1204, r1213}; +barrier.sync 0; +mad.lo.s32 r2103, r2100, -80, r2102; +ld.shared.u32 r1295, [r2103]; +ld.shared.u32 r1298, [r2103+4]; +ld.shared.u32 r1235, [r2103+88]; +ld.shared.u32 r1238, [r2103+92]; +ld.shared.u32 r1247, [r2103+176]; +ld.shared.u32 r1250, [r2103+180]; +ld.shared.u32 r1259, [r2103+264]; 
+ld.shared.u32 r1262, [r2103+268]; +ld.shared.u32 r1271, [r2103+352]; +ld.shared.u32 r1274, [r2103+356]; +ld.shared.u32 r1283, [r2103+440]; +ld.shared.u32 r1286, [r2103+444]; +ld.shared.u32 r1284, [r2103+528]; +ld.shared.u32 r1287, [r2103+532]; +ld.shared.u32 r1272, [r2103+616]; +ld.shared.u32 r1275, [r2103+620]; +ld.shared.u32 r1260, [r2103+704]; +ld.shared.u32 r1263, [r2103+708]; +ld.shared.u32 r1248, [r2103+792]; +ld.shared.u32 r1251, [r2103+796]; +ld.shared.u32 r1236, [r2103+880]; +ld.shared.u32 r1239, [r2103+884]; +{ +add.f16x2 r1234, r1235, r1236; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +sub.f16x2 r1240, r1235, r1236; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +add.f16x2 r1246, r1247, r1248; +} +{ +add.f16x2 r1249, r1250, r1251; +} +{ +sub.f16x2 r1252, r1247, r1248; +} +{ +sub.f16x2 r1255, r1250, r1251; +} +{ +add.f16x2 r1258, r1259, r1260; +} +{ +add.f16x2 r1261, r1262, r1263; +} +{ +sub.f16x2 r1264, r1259, r1260; +} +{ +sub.f16x2 r1267, r1262, r1263; +} +{ +add.f16x2 r1270, r1271, r1272; +} +{ +add.f16x2 r1273, r1274, r1275; +} +{ +sub.f16x2 r1276, r1271, r1272; +} +{ +sub.f16x2 r1279, r1274, r1275; +} +{ +add.f16x2 r1282, r1283, r1284; +} +{ +add.f16x2 r1285, r1286, r1287; +} +{ +sub.f16x2 r1288, r1283, r1284; +} +{ +sub.f16x2 r1291, r1286, r1287; +} +{ +add.f16x2 r1294, r1295, r1234; +} +{ +add.f16x2 r1297, r1298, r1237; +} +{ +add.f16x2 r1300, r1294, r1246; +} +{ +add.f16x2 r1303, r1297, r1249; +} +{ +add.f16x2 r1306, r1300, r1258; +} +{ +add.f16x2 r1309, r1303, r1261; +} +{ +add.f16x2 r1312, r1306, r1270; +} +{ +add.f16x2 r1315, r1309, r1273; +} +{ +add.f16x2 %0, r1312, r1282; +} +{ +add.f16x2 %1, r1315, r1285; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1326, {low, high}; +} +{ 
+mul.f16x2 r1327, r1234, r1326; +} +{ +add.f16x2 r1330, r1295, r1327; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1243, r1333; +} +{ +add.f16x2 r1337, r1324, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1340, {low, high}; +} +{ +mul.f16x2 r1341, r1237, r1340; +} +{ +add.f16x2 r1344, r1298, r1341; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1347, {low, high}; +} +{ +mul.f16x2 r1348, r1240, r1347; +} +{ +add.f16x2 r1351, r1325, r1348; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1354, {low, high}; +} +{ +mul.f16x2 r1355, r1246, r1354; +} +{ +add.f16x2 r1358, r1330, r1355; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1361, {low, high}; +} +{ +mul.f16x2 r1362, r1255, r1361; +} +{ +add.f16x2 r1365, r1337, r1362; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1368, {low, high}; +} +{ +mul.f16x2 r1369, r1249, r1368; +} +{ +add.f16x2 r1372, r1344, r1369; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1375, {low, high}; +} +{ +mul.f16x2 r1376, r1252, r1375; +} +{ +add.f16x2 r1379, r1351, r1376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1382, {low, high}; +} +{ +mul.f16x2 r1383, r1258, r1382; +} +{ +add.f16x2 r1386, r1358, r1383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1389, {low, high}; +} +{ +mul.f16x2 r1390, r1267, r1389; +} +{ +add.f16x2 r1393, r1365, r1390; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1396, {low, high}; +} +{ +mul.f16x2 r1397, r1261, r1396; +} +{ +add.f16x2 r1400, r1372, r1397; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1403, {low, high}; +} +{ +mul.f16x2 r1404, r1264, r1403; +} +{ +add.f16x2 r1407, r1379, r1404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1410, {low, high}; +} +{ +mul.f16x2 r1411, r1270, r1410; +} +{ +add.f16x2 r1414, r1386, r1411; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1417, {low, high}; +} +{ +mul.f16x2 r1418, r1279, r1417; +} +{ +add.f16x2 r1421, r1393, r1418; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1424, {low, high}; +} +{ +mul.f16x2 r1425, r1273, r1424; +} +{ +add.f16x2 r1428, r1400, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1431, {low, high}; +} +{ +mul.f16x2 r1432, r1276, r1431; +} +{ +add.f16x2 r1435, r1407, r1432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1438, {low, high}; +} +{ +mul.f16x2 r1439, r1282, r1438; +} +{ +add.f16x2 r1442, r1414, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1445, {low, high}; +} +{ +mul.f16x2 r1446, r1291, r1445; +} +{ +add.f16x2 r1449, r1421, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1452, {low, high}; +} +{ +mul.f16x2 r1453, r1285, r1452; +} +{ +add.f16x2 r1456, r1428, r1453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1459, {low, high}; +} +{ +mul.f16x2 r1460, r1288, r1459; +} +{ +add.f16x2 r1463, r1435, r1460; +} +{ +sub.f16x2 %2, r1442, r1449; +} +{ +add.f16x2 %3, r1456, r1463; +} +{ +add.f16x2 %20, r1442, r1449; +} +{ +sub.f16x2 %21, r1456, r1463; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1479, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1481, r1234, r1480; +} +{ +add.f16x2 r1484, r1295, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1487, {low, high}; +} +{ +mul.f16x2 r1488, r1243, r1487; +} +{ +add.f16x2 r1491, r1478, r1488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1494, {low, high}; +} +{ +mul.f16x2 r1495, r1237, r1494; +} +{ +add.f16x2 r1498, r1298, r1495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1501, {low, high}; +} +{ +mul.f16x2 r1502, r1240, r1501; +} +{ +add.f16x2 r1505, r1479, r1502; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1508, {low, high}; +} +{ +mul.f16x2 r1509, r1246, r1508; +} +{ +add.f16x2 r1512, r1484, r1509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1515, {low, high}; +} +{ +mul.f16x2 r1516, r1255, r1515; +} +{ +add.f16x2 r1519, r1491, r1516; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1522, {low, high}; +} +{ +mul.f16x2 r1523, r1249, r1522; +} +{ +add.f16x2 r1526, r1498, r1523; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1529, {low, high}; +} +{ +mul.f16x2 r1530, r1252, r1529; +} +{ +add.f16x2 r1533, r1505, r1530; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1536, {low, high}; +} +{ +mul.f16x2 r1537, r1258, r1536; +} +{ +add.f16x2 r1540, r1512, r1537; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1267, r1543; +} +{ +add.f16x2 r1547, r1519, r1544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1550, {low, 
high}; +} +{ +mul.f16x2 r1551, r1261, r1550; +} +{ +add.f16x2 r1554, r1526, r1551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1557, {low, high}; +} +{ +mul.f16x2 r1558, r1264, r1557; +} +{ +add.f16x2 r1561, r1533, r1558; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1564, {low, high}; +} +{ +mul.f16x2 r1565, r1270, r1564; +} +{ +add.f16x2 r1568, r1540, r1565; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1571, {low, high}; +} +{ +mul.f16x2 r1572, r1279, r1571; +} +{ +add.f16x2 r1575, r1547, r1572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1578, {low, high}; +} +{ +mul.f16x2 r1579, r1273, r1578; +} +{ +add.f16x2 r1582, r1554, r1579; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1585, {low, high}; +} +{ +mul.f16x2 r1586, r1276, r1585; +} +{ +add.f16x2 r1589, r1561, r1586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1592, {low, high}; +} +{ +mul.f16x2 r1593, r1282, r1592; +} +{ +add.f16x2 r1596, r1568, r1593; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1599, {low, high}; +} +{ +mul.f16x2 r1600, r1291, r1599; +} +{ +add.f16x2 r1603, r1575, r1600; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1606, {low, high}; +} +{ +mul.f16x2 r1607, r1285, r1606; +} +{ +add.f16x2 r1610, r1582, r1607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1613, {low, high}; +} +{ +mul.f16x2 r1614, r1288, r1613; +} +{ +add.f16x2 r1617, r1589, r1614; +} +{ +sub.f16x2 %4, r1596, r1603; +} +{ +add.f16x2 %5, r1610, r1617; +} +{ +add.f16x2 %18, r1596, r1603; +} +{ +sub.f16x2 %19, r1610, r1617; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; 
+mov.b32 r1632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1634, {low, high}; +} +{ +mul.f16x2 r1635, r1234, r1634; +} +{ +add.f16x2 r1638, r1295, r1635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1641, {low, high}; +} +{ +mul.f16x2 r1642, r1243, r1641; +} +{ +add.f16x2 r1645, r1632, r1642; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1648, {low, high}; +} +{ +mul.f16x2 r1649, r1237, r1648; +} +{ +add.f16x2 r1652, r1298, r1649; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1655, {low, high}; +} +{ +mul.f16x2 r1656, r1240, r1655; +} +{ +add.f16x2 r1659, r1633, r1656; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1662, {low, high}; +} +{ +mul.f16x2 r1663, r1246, r1662; +} +{ +add.f16x2 r1666, r1638, r1663; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1669, {low, high}; +} +{ +mul.f16x2 r1670, r1255, r1669; +} +{ +add.f16x2 r1673, r1645, r1670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1676, {low, high}; +} +{ +mul.f16x2 r1677, r1249, r1676; +} +{ +add.f16x2 r1680, r1652, r1677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1683, {low, high}; +} +{ +mul.f16x2 r1684, r1252, r1683; +} +{ +add.f16x2 r1687, r1659, r1684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1258, r1690; +} +{ +add.f16x2 r1694, r1666, r1691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1697, {low, high}; +} +{ +mul.f16x2 r1698, r1267, r1697; +} +{ +add.f16x2 
r1701, r1673, r1698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1261, r1704; +} +{ +add.f16x2 r1708, r1680, r1705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1711, {low, high}; +} +{ +mul.f16x2 r1712, r1264, r1711; +} +{ +add.f16x2 r1715, r1687, r1712; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1718, {low, high}; +} +{ +mul.f16x2 r1719, r1270, r1718; +} +{ +add.f16x2 r1722, r1694, r1719; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1725, {low, high}; +} +{ +mul.f16x2 r1726, r1279, r1725; +} +{ +add.f16x2 r1729, r1701, r1726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1732, {low, high}; +} +{ +mul.f16x2 r1733, r1273, r1732; +} +{ +add.f16x2 r1736, r1708, r1733; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1739, {low, high}; +} +{ +mul.f16x2 r1740, r1276, r1739; +} +{ +add.f16x2 r1743, r1715, r1740; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1282, r1746; +} +{ +add.f16x2 r1750, r1722, r1747; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1753, {low, high}; +} +{ +mul.f16x2 r1754, r1291, r1753; +} +{ +add.f16x2 r1757, r1729, r1754; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1285, r1760; +} +{ +add.f16x2 r1764, r1736, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1767, {low, high}; +} +{ +mul.f16x2 r1768, r1288, r1767; +} +{ +add.f16x2 r1771, r1743, r1768; +} +{ +sub.f16x2 %6, r1750, r1757; +} +{ +add.f16x2 %7, r1764, r1771; +} +{ +add.f16x2 %16, r1750, 
r1757; +} +{ +sub.f16x2 %17, r1764, r1771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1786, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1787, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1788, {low, high}; +} +{ +mul.f16x2 r1789, r1234, r1788; +} +{ +add.f16x2 r1792, r1295, r1789; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1243, r1795; +} +{ +add.f16x2 r1799, r1786, r1796; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1802, {low, high}; +} +{ +mul.f16x2 r1803, r1237, r1802; +} +{ +add.f16x2 r1806, r1298, r1803; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1809, {low, high}; +} +{ +mul.f16x2 r1810, r1240, r1809; +} +{ +add.f16x2 r1813, r1787, r1810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1816, {low, high}; +} +{ +mul.f16x2 r1817, r1246, r1816; +} +{ +add.f16x2 r1820, r1792, r1817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1823, {low, high}; +} +{ +mul.f16x2 r1824, r1255, r1823; +} +{ +add.f16x2 r1827, r1799, r1824; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1830, {low, high}; +} +{ +mul.f16x2 r1831, r1249, r1830; +} +{ +add.f16x2 r1834, r1806, r1831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1837, {low, high}; +} +{ +mul.f16x2 r1838, r1252, r1837; +} +{ +add.f16x2 r1841, r1813, r1838; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1844, {low, high}; +} +{ +mul.f16x2 r1845, r1258, r1844; +} +{ +add.f16x2 r1848, r1820, r1845; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1267, r1851; +} +{ +add.f16x2 r1855, r1827, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1858, {low, high}; +} +{ +mul.f16x2 r1859, r1261, r1858; +} +{ +add.f16x2 r1862, r1834, r1859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1865, {low, high}; +} +{ +mul.f16x2 r1866, r1264, r1865; +} +{ +add.f16x2 r1869, r1841, r1866; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1872, {low, high}; +} +{ +mul.f16x2 r1873, r1270, r1872; +} +{ +add.f16x2 r1876, r1848, r1873; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1879, {low, high}; +} +{ +mul.f16x2 r1880, r1279, r1879; +} +{ +add.f16x2 r1883, r1855, r1880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1886, {low, high}; +} +{ +mul.f16x2 r1887, r1273, r1886; +} +{ +add.f16x2 r1890, r1862, r1887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1893, {low, high}; +} +{ +mul.f16x2 r1894, r1276, r1893; +} +{ +add.f16x2 r1897, r1869, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1900, {low, high}; +} +{ +mul.f16x2 r1901, r1282, r1900; +} +{ +add.f16x2 r1904, r1876, r1901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1907, {low, high}; +} +{ +mul.f16x2 r1908, r1291, r1907; +} +{ +add.f16x2 r1911, r1883, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1914, {low, high}; +} +{ +mul.f16x2 r1915, r1285, r1914; +} +{ +add.f16x2 r1918, r1890, r1915; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1921, {low, high}; +} +{ +mul.f16x2 r1922, r1288, 
r1921; +} +{ +add.f16x2 r1925, r1897, r1922; +} +{ +sub.f16x2 %8, r1904, r1911; +} +{ +add.f16x2 %9, r1918, r1925; +} +{ +add.f16x2 %14, r1904, r1911; +} +{ +sub.f16x2 %15, r1918, r1925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1942, {low, high}; +} +{ +mul.f16x2 r1943, r1234, r1942; +} +{ +add.f16x2 r1946, r1295, r1943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1949, {low, high}; +} +{ +mul.f16x2 r1950, r1243, r1949; +} +{ +add.f16x2 r1953, r1940, r1950; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1956, {low, high}; +} +{ +mul.f16x2 r1957, r1237, r1956; +} +{ +add.f16x2 r1960, r1298, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1963, {low, high}; +} +{ +mul.f16x2 r1964, r1240, r1963; +} +{ +add.f16x2 r1967, r1941, r1964; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1970, {low, high}; +} +{ +mul.f16x2 r1971, r1246, r1970; +} +{ +add.f16x2 r1974, r1946, r1971; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1977, {low, high}; +} +{ +mul.f16x2 r1978, r1255, r1977; +} +{ +add.f16x2 r1981, r1953, r1978; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1984, {low, high}; +} +{ +mul.f16x2 r1985, r1249, r1984; +} +{ +add.f16x2 r1988, r1960, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1991, {low, high}; +} +{ +mul.f16x2 r1992, r1252, r1991; +} +{ +add.f16x2 r1995, r1967, r1992; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; 
+mov.b32 r1998, {low, high}; +} +{ +mul.f16x2 r1999, r1258, r1998; +} +{ +add.f16x2 r2002, r1974, r1999; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2005, {low, high}; +} +{ +mul.f16x2 r2006, r1267, r2005; +} +{ +add.f16x2 r2009, r1981, r2006; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r2012, {low, high}; +} +{ +mul.f16x2 r2013, r1261, r2012; +} +{ +add.f16x2 r2016, r1988, r2013; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2019, {low, high}; +} +{ +mul.f16x2 r2020, r1264, r2019; +} +{ +add.f16x2 r2023, r1995, r2020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2026, {low, high}; +} +{ +mul.f16x2 r2027, r1270, r2026; +} +{ +add.f16x2 r2030, r2002, r2027; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r1279, r2033; +} +{ +add.f16x2 r2037, r2009, r2034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r1273, r2040; +} +{ +add.f16x2 r2044, r2016, r2041; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r1276, r2047; +} +{ +add.f16x2 r2051, r2023, r2048; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r1282, r2054; +} +{ +add.f16x2 r2058, r2030, r2055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2061, {low, high}; +} +{ +mul.f16x2 r2062, r1291, r2061; +} +{ +add.f16x2 r2065, r2037, r2062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2068, {low, high}; +} +{ +mul.f16x2 r2069, r1285, r2068; +} +{ +add.f16x2 r2072, r2044, r2069; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2075, {low, high}; +} +{ +mul.f16x2 r2076, r1288, r2075; +} +{ +add.f16x2 r2079, r2051, r2076; +} +{ +sub.f16x2 %10, r2058, r2065; +} +{ +add.f16x2 %11, r2072, r2079; +} +{ +add.f16x2 %12, r2058, r2065; +} +{ +sub.f16x2 %13, r2072, r2079; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1128, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<468>; +.reg .b32 r<2104>; +.reg .b64 rd<4>; +mov.u32 r2094, %tid.y; 
+mov.u32 r2095, %22; +mad.lo.s32 r2096, r2094, 484, r2095; +mov.u32 r2097, %tid.x; +{ +add.f16x2 r1, %25, %43; +} +{ +add.f16x2 r4, %26, %44; +} +{ +sub.f16x2 r7, %25, %43; +} +{ +sub.f16x2 r10, %26, %44; +} +{ +add.f16x2 r13, %27, %41; +} +{ +add.f16x2 r16, %28, %42; +} +{ +sub.f16x2 r19, %27, %41; +} +{ +sub.f16x2 r22, %28, %42; +} +{ +add.f16x2 r25, %29, %39; +} +{ +add.f16x2 r28, %30, %40; +} +{ +sub.f16x2 r31, %29, %39; +} +{ +sub.f16x2 r34, %30, %40; +} +{ +add.f16x2 r37, %31, %37; +} +{ +add.f16x2 r40, %32, %38; +} +{ +sub.f16x2 r43, %31, %37; +} +{ +sub.f16x2 r46, %32, %38; +} +{ +add.f16x2 r49, %33, %35; +} +{ +add.f16x2 r52, %34, %36; +} +{ +sub.f16x2 r55, %33, %35; +} +{ +sub.f16x2 r58, %34, %36; +} +{ +add.f16x2 r61, %23, r1; +} +{ +add.f16x2 r64, %24, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.f32 f424, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r91, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r92, {low, high}; +} +mov.f32 f438, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r93, {low, high}; +} +{ +mul.f16x2 r94, r1, r93; +} +{ +add.f16x2 r97, %23, r94; +} +mov.f32 f404, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r100, {low, high}; +} +{ +mul.f16x2 r101, r10, r100; +} +{ +add.f16x2 r104, r91, r101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r107, {low, high}; +} +{ +mul.f16x2 r108, r4, r107; +} +{ +add.f16x2 r111, %24, r108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, 
r7, r114; +} +{ +add.f16x2 r118, r92, r115; +} +mov.f32 f454, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r121, {low, high}; +} +{ +mul.f16x2 r122, r13, r121; +} +{ +add.f16x2 r125, r97, r122; +} +mov.f32 f300, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r128, {low, high}; +} +{ +mul.f16x2 r129, r22, r128; +} +{ +add.f16x2 r132, r104, r129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r16, r135; +} +{ +add.f16x2 r139, r111, r136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r142, {low, high}; +} +{ +mul.f16x2 r143, r19, r142; +} +{ +add.f16x2 r146, r118, r143; +} +mov.f32 f462, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r149, {low, high}; +} +{ +mul.f16x2 r150, r25, r149; +} +{ +add.f16x2 r153, r125, r150; +} +mov.f32 f464, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r156, {low, high}; +} +{ +mul.f16x2 r157, r34, r156; +} +{ +add.f16x2 r160, r132, r157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r163, {low, high}; +} +{ +mul.f16x2 r164, r28, r163; +} +{ +add.f16x2 r167, r139, r164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r170, {low, high}; +} +{ +mul.f16x2 r171, r31, r170; +} +{ +add.f16x2 r174, r146, r171; +} +mov.f32 f446, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r177, {low, high}; +} +{ +mul.f16x2 r178, r37, r177; +} +{ +add.f16x2 r181, r153, r178; +} +mov.f32 f448, 0f3F4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r184, {low, high}; +} +{ +mul.f16x2 r185, r46, r184; +} +{ +add.f16x2 r188, r160, 
r185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r191, {low, high}; +} +{ +mul.f16x2 r192, r40, r191; +} +{ +add.f16x2 r195, r167, r192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r198, {low, high}; +} +{ +mul.f16x2 r199, r43, r198; +} +{ +add.f16x2 r202, r174, r199; +} +mov.f32 f430, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r205, {low, high}; +} +{ +mul.f16x2 r206, r49, r205; +} +{ +add.f16x2 r209, r181, r206; +} +mov.f32 f432, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r212, {low, high}; +} +{ +mul.f16x2 r213, r58, r212; +} +{ +add.f16x2 r216, r188, r213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r219, {low, high}; +} +{ +mul.f16x2 r220, r52, r219; +} +{ +add.f16x2 r223, r195, r220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r226, {low, high}; +} +{ +mul.f16x2 r227, r55, r226; +} +{ +add.f16x2 r230, r202, r227; +} +{ +sub.f16x2 r233, r209, r216; +} +{ +add.f16x2 r236, r223, r230; +} +{ +add.f16x2 r239, r209, r216; +} +{ +sub.f16x2 r242, r223, r230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r247, {low, high}; +} +{ +mul.f16x2 r248, r1, r247; +} +{ +add.f16x2 r251, %23, r248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r254, {low, high}; +} +{ +mul.f16x2 r255, r10, r254; +} +{ +add.f16x2 r258, r245, r255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r4, 
r261; +} +{ +add.f16x2 r265, %24, r262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r268, {low, high}; +} +{ +mul.f16x2 r269, r7, r268; +} +{ +add.f16x2 r272, r246, r269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r275, {low, high}; +} +{ +mul.f16x2 r276, r13, r275; +} +{ +add.f16x2 r279, r251, r276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r22, r282; +} +{ +add.f16x2 r286, r258, r283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r16, r289; +} +{ +add.f16x2 r293, r265, r290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r296, {low, high}; +} +{ +mul.f16x2 r297, r19, r296; +} +{ +add.f16x2 r300, r272, r297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r303, {low, high}; +} +{ +mul.f16x2 r304, r25, r303; +} +{ +add.f16x2 r307, r279, r304; +} +mov.f32 f352, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r34, r310; +} +{ +add.f16x2 r314, r286, r311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r317, {low, high}; +} +{ +mul.f16x2 r318, r28, r317; +} +{ +add.f16x2 r321, r293, r318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r324, {low, high}; +} +{ +mul.f16x2 r325, r31, r324; +} +{ +add.f16x2 r328, r300, r325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r331, {low, high}; +} +{ +mul.f16x2 r332, r37, r331; +} +{ +add.f16x2 r335, r307, r332; +} +mov.f32 f396, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r338, {low, 
high}; +} +{ +mul.f16x2 r339, r46, r338; +} +{ +add.f16x2 r342, r314, r339; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r345, {low, high}; +} +{ +mul.f16x2 r346, r40, r345; +} +{ +add.f16x2 r349, r321, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r43, r352; +} +{ +add.f16x2 r356, r328, r353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r359, {low, high}; +} +{ +mul.f16x2 r360, r49, r359; +} +{ +add.f16x2 r363, r335, r360; +} +mov.f32 f440, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r366, {low, high}; +} +{ +mul.f16x2 r367, r58, r366; +} +{ +add.f16x2 r370, r342, r367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r373, {low, high}; +} +{ +mul.f16x2 r374, r52, r373; +} +{ +add.f16x2 r377, r349, r374; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r380, {low, high}; +} +{ +mul.f16x2 r381, r55, r380; +} +{ +add.f16x2 r384, r356, r381; +} +{ +sub.f16x2 r387, r363, r370; +} +{ +add.f16x2 r390, r377, r384; +} +{ +add.f16x2 r393, r363, r370; +} +{ +sub.f16x2 r396, r377, r384; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r401, {low, high}; +} +{ +mul.f16x2 r402, r1, r401; +} +{ +add.f16x2 r405, %23, r402; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r408, {low, high}; +} +{ +mul.f16x2 r409, r10, r408; +} +{ +add.f16x2 r412, r399, r409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 
r415, {low, high}; +} +{ +mul.f16x2 r416, r4, r415; +} +{ +add.f16x2 r419, %24, r416; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r7, r422; +} +{ +add.f16x2 r426, r400, r423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r429, {low, high}; +} +{ +mul.f16x2 r430, r13, r429; +} +{ +add.f16x2 r433, r405, r430; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r22, r436; +} +{ +add.f16x2 r440, r412, r437; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r16, r443; +} +{ +add.f16x2 r447, r419, r444; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r450, {low, high}; +} +{ +mul.f16x2 r451, r19, r450; +} +{ +add.f16x2 r454, r426, r451; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r457, {low, high}; +} +{ +mul.f16x2 r458, r25, r457; +} +{ +add.f16x2 r461, r433, r458; +} +mov.f32 f456, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r464, {low, high}; +} +{ +mul.f16x2 r465, r34, r464; +} +{ +add.f16x2 r468, r440, r465; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r471, {low, high}; +} +{ +mul.f16x2 r472, r28, r471; +} +{ +add.f16x2 r475, r447, r472; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r478, {low, high}; +} +{ +mul.f16x2 r479, r31, r478; +} +{ +add.f16x2 r482, r454, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r485, {low, high}; +} +{ +mul.f16x2 r486, r37, r485; +} +{ +add.f16x2 r489, r461, r486; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; 
+mov.b32 r492, {low, high}; +} +{ +mul.f16x2 r493, r46, r492; +} +{ +add.f16x2 r496, r468, r493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r499, {low, high}; +} +{ +mul.f16x2 r500, r40, r499; +} +{ +add.f16x2 r503, r475, r500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r506, {low, high}; +} +{ +mul.f16x2 r507, r43, r506; +} +{ +add.f16x2 r510, r482, r507; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r513, {low, high}; +} +{ +mul.f16x2 r514, r49, r513; +} +{ +add.f16x2 r517, r489, r514; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r520, {low, high}; +} +{ +mul.f16x2 r521, r58, r520; +} +{ +add.f16x2 r524, r496, r521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r527, {low, high}; +} +{ +mul.f16x2 r528, r52, r527; +} +{ +add.f16x2 r531, r503, r528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r534, {low, high}; +} +{ +mul.f16x2 r535, r55, r534; +} +{ +add.f16x2 r538, r510, r535; +} +{ +sub.f16x2 r541, r517, r524; +} +{ +add.f16x2 r544, r531, r538; +} +{ +add.f16x2 r547, r517, r524; +} +{ +sub.f16x2 r550, r531, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r554, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r555, {low, high}; +} +{ +mul.f16x2 r556, r1, r555; +} +{ +add.f16x2 r559, %23, r556; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r562, {low, high}; +} +{ +mul.f16x2 r563, r10, r562; +} +{ +add.f16x2 r566, r553, r563; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r569, 
{low, high}; +} +{ +mul.f16x2 r570, r4, r569; +} +{ +add.f16x2 r573, %24, r570; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r576, {low, high}; +} +{ +mul.f16x2 r577, r7, r576; +} +{ +add.f16x2 r580, r554, r577; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r583, {low, high}; +} +{ +mul.f16x2 r584, r13, r583; +} +{ +add.f16x2 r587, r559, r584; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r590, {low, high}; +} +{ +mul.f16x2 r591, r22, r590; +} +{ +add.f16x2 r594, r566, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r597, {low, high}; +} +{ +mul.f16x2 r598, r16, r597; +} +{ +add.f16x2 r601, r573, r598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r604, {low, high}; +} +{ +mul.f16x2 r605, r19, r604; +} +{ +add.f16x2 r608, r580, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r25, r611; +} +{ +add.f16x2 r615, r587, r612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r618, {low, high}; +} +{ +mul.f16x2 r619, r34, r618; +} +{ +add.f16x2 r622, r594, r619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r625, {low, high}; +} +{ +mul.f16x2 r626, r28, r625; +} +{ +add.f16x2 r629, r601, r626; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r632, {low, high}; +} +{ +mul.f16x2 r633, r31, r632; +} +{ +add.f16x2 r636, r608, r633; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r639, {low, high}; +} +{ +mul.f16x2 r640, r37, r639; +} +{ +add.f16x2 r643, r615, r640; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r646, {low, high}; +} +{ 
+mul.f16x2 r647, r46, r646; +} +{ +add.f16x2 r650, r622, r647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r653, {low, high}; +} +{ +mul.f16x2 r654, r40, r653; +} +{ +add.f16x2 r657, r629, r654; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r660, {low, high}; +} +{ +mul.f16x2 r661, r43, r660; +} +{ +add.f16x2 r664, r636, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r667, {low, high}; +} +{ +mul.f16x2 r668, r49, r667; +} +{ +add.f16x2 r671, r643, r668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r674, {low, high}; +} +{ +mul.f16x2 r675, r58, r674; +} +{ +add.f16x2 r678, r650, r675; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r52, r681; +} +{ +add.f16x2 r685, r657, r682; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r55, r688; +} +{ +add.f16x2 r692, r664, r689; +} +{ +sub.f16x2 r695, r671, r678; +} +{ +add.f16x2 r698, r685, r692; +} +{ +add.f16x2 r701, r671, r678; +} +{ +sub.f16x2 r704, r685, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r707, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r709, {low, high}; +} +{ +mul.f16x2 r710, r1, r709; +} +{ +add.f16x2 r713, %23, r710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r10, r716; +} +{ +add.f16x2 r720, r707, r717; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r723, {low, high}; +} +{ +mul.f16x2 r724, 
r4, r723; +} +{ +add.f16x2 r727, %24, r724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r730, {low, high}; +} +{ +mul.f16x2 r731, r7, r730; +} +{ +add.f16x2 r734, r708, r731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r737, {low, high}; +} +{ +mul.f16x2 r738, r13, r737; +} +{ +add.f16x2 r741, r713, r738; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r22, r744; +} +{ +add.f16x2 r748, r720, r745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r751, {low, high}; +} +{ +mul.f16x2 r752, r16, r751; +} +{ +add.f16x2 r755, r727, r752; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r758, {low, high}; +} +{ +mul.f16x2 r759, r19, r758; +} +{ +add.f16x2 r762, r734, r759; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r25, r765; +} +{ +add.f16x2 r769, r741, r766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r34, r772; +} +{ +add.f16x2 r776, r748, r773; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r779, {low, high}; +} +{ +mul.f16x2 r780, r28, r779; +} +{ +add.f16x2 r783, r755, r780; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r786, {low, high}; +} +{ +mul.f16x2 r787, r31, r786; +} +{ +add.f16x2 r790, r762, r787; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r37, r793; +} +{ +add.f16x2 r797, r769, r794; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r46, r800; +} +{ 
+add.f16x2 r804, r776, r801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r40, r807; +} +{ +add.f16x2 r811, r783, r808; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r814, {low, high}; +} +{ +mul.f16x2 r815, r43, r814; +} +{ +add.f16x2 r818, r790, r815; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r49, r821; +} +{ +add.f16x2 r825, r797, r822; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r828, {low, high}; +} +{ +mul.f16x2 r829, r58, r828; +} +{ +add.f16x2 r832, r804, r829; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r52, r835; +} +{ +add.f16x2 r839, r811, r836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r842, {low, high}; +} +{ +mul.f16x2 r843, r55, r842; +} +{ +add.f16x2 r846, r818, r843; +} +{ +sub.f16x2 r849, r825, r832; +} +{ +add.f16x2 r852, r839, r846; +} +{ +add.f16x2 r855, r825, r832; +} +{ +sub.f16x2 r858, r839, r846; +} +mul.wide.u32 rd2, r2097, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r2098, rd3; +mul.lo.s32 r2099, r2098, 11; +sub.s32 r2100, r2097, r2099; +mad.lo.s32 r2101, r2098, 484, r2096; +cvt.rn.f32.u32 f465, r2100; +mul.f32 f466, f465, 0f3D54B191; +cos.approx.f32 f221, f466; +sin.approx.f32 f467, f466; +neg.f32 f222, f467; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f222; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r866, {high, high}; +} +{ +mul.f16x2 r868, r236, r866; +} +{ +fma.rn.f16x2 r871, r233, r864, r868; +} +{ +mul.f16x2 r875, r233, r866; +} +{ +neg.f16x2 r878, r875; +} 
+{ +fma.rn.f16x2 r880, r236, r864, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r886, {high, high}; +} +mov.f32 f241, 0fBF800000; +mov.f32 f242, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r888, {low, high}; +} +{ +mul.f16x2 r889, r886, r888; +} +{ +mul.f16x2 r892, r861, r884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r895, {high, low}; +} +{ +fma.rn.f16x2 r897, r889, r895, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r903, {high, high}; +} +{ +mul.f16x2 r905, r390, r903; +} +{ +fma.rn.f16x2 r908, r387, r901, r905; +} +{ +mul.f16x2 r912, r387, r903; +} +{ +neg.f16x2 r915, r912; +} +{ +fma.rn.f16x2 r917, r390, r901, r915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r925, {low, high}; +} +{ +mul.f16x2 r926, r923, r925; +} +{ +mul.f16x2 r929, r897, r921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r897; +mov.b32 r932, {high, low}; +} +{ +fma.rn.f16x2 r934, r926, r932, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r940, {high, high}; +} +{ +mul.f16x2 r942, r544, r940; +} +{ +fma.rn.f16x2 r945, r541, r938, r942; +} +{ +mul.f16x2 r949, r541, r940; +} +{ +neg.f16x2 r952, r949; +} +{ +fma.rn.f16x2 r954, r544, r938, r952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 
high, f242; +mov.b32 r962, {low, high}; +} +{ +mul.f16x2 r963, r960, r962; +} +{ +mul.f16x2 r966, r934, r958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r934; +mov.b32 r969, {high, low}; +} +{ +fma.rn.f16x2 r971, r963, r969, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r977, {high, high}; +} +{ +mul.f16x2 r979, r698, r977; +} +{ +fma.rn.f16x2 r982, r695, r975, r979; +} +{ +mul.f16x2 r986, r695, r977; +} +{ +neg.f16x2 r989, r986; +} +{ +fma.rn.f16x2 r991, r698, r975, r989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r999, {low, high}; +} +{ +mul.f16x2 r1000, r997, r999; +} +{ +mul.f16x2 r1003, r971, r995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r971; +mov.b32 r1006, {high, low}; +} +{ +fma.rn.f16x2 r1008, r1000, r1006, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1014, {high, high}; +} +{ +mul.f16x2 r1016, r852, r1014; +} +{ +fma.rn.f16x2 r1019, r849, r1012, r1016; +} +{ +mul.f16x2 r1023, r849, r1014; +} +{ +neg.f16x2 r1026, r1023; +} +{ +fma.rn.f16x2 r1028, r852, r1012, r1026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1036, {low, high}; +} +{ +mul.f16x2 r1037, r1034, r1036; +} +{ +mul.f16x2 r1040, r1008, r1032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1008; +mov.b32 r1043, {high, low}; +} +{ +fma.rn.f16x2 r1045, r1037, r1043, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; 
+mov.b32 r1049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1051, {high, high}; +} +{ +mul.f16x2 r1053, r858, r1051; +} +{ +fma.rn.f16x2 r1056, r855, r1049, r1053; +} +{ +mul.f16x2 r1060, r855, r1051; +} +{ +neg.f16x2 r1063, r1060; +} +{ +fma.rn.f16x2 r1065, r858, r1049, r1063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1073, {low, high}; +} +{ +mul.f16x2 r1074, r1071, r1073; +} +{ +mul.f16x2 r1077, r1045, r1069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1045; +mov.b32 r1080, {high, low}; +} +{ +fma.rn.f16x2 r1082, r1074, r1080, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1088, {high, high}; +} +{ +mul.f16x2 r1090, r704, r1088; +} +{ +fma.rn.f16x2 r1093, r701, r1086, r1090; +} +{ +mul.f16x2 r1097, r701, r1088; +} +{ +neg.f16x2 r1100, r1097; +} +{ +fma.rn.f16x2 r1102, r704, r1086, r1100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1110, {low, high}; +} +{ +mul.f16x2 r1111, r1108, r1110; +} +{ +mul.f16x2 r1114, r1082, r1106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1082; +mov.b32 r1117, {high, low}; +} +{ +fma.rn.f16x2 r1119, r1111, r1117, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1125, {high, high}; +} +{ +mul.f16x2 r1127, r550, r1125; +} +{ +fma.rn.f16x2 r1130, r547, r1123, r1127; +} +{ +mul.f16x2 r1134, r547, r1125; +} +{ +neg.f16x2 r1137, r1134; +} +{ 
+fma.rn.f16x2 r1139, r550, r1123, r1137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1147, {low, high}; +} +{ +mul.f16x2 r1148, r1145, r1147; +} +{ +mul.f16x2 r1151, r1119, r1143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1119; +mov.b32 r1154, {high, low}; +} +{ +fma.rn.f16x2 r1156, r1148, r1154, r1151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1162, {high, high}; +} +{ +mul.f16x2 r1164, r396, r1162; +} +{ +fma.rn.f16x2 r1167, r393, r1160, r1164; +} +{ +mul.f16x2 r1171, r393, r1162; +} +{ +neg.f16x2 r1174, r1171; +} +{ +fma.rn.f16x2 r1176, r396, r1160, r1174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r861; +mov.b32 r1182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f241; +cvt.rn.f16.f32 high, f242; +mov.b32 r1184, {low, high}; +} +{ +mul.f16x2 r1185, r1182, r1184; +} +{ +mul.f16x2 r1188, r1156, r1180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1156; +mov.b32 r1191, {high, low}; +} +{ +fma.rn.f16x2 r1193, r1185, r1191, r1188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1193; +mov.b32 r1199, {high, high}; +} +{ +mul.f16x2 r1201, r242, r1199; +} +{ +fma.rn.f16x2 r1204, r239, r1197, r1201; +} +{ +mul.f16x2 r1208, r239, r1199; +} +{ +neg.f16x2 r1211, r1208; +} +{ +fma.rn.f16x2 r1213, r242, r1197, r1211; +} +barrier.sync 0; +mad.lo.s32 r2102, r2100, 44, r2101; +st.shared.u32 [r2102], r85; +st.shared.u32 [r2102+4], r871; +st.shared.u32 [r2102+8], r908; +st.shared.u32 [r2102+12], r945; +st.shared.u32 [r2102+16], r982; +st.shared.u32 
[r2102+20], r1019; +st.shared.u32 [r2102+24], r1056; +st.shared.u32 [r2102+28], r1093; +st.shared.u32 [r2102+32], r1130; +st.shared.u32 [r2102+36], r1167; +st.shared.u32 [r2102+40], r1204; +barrier.sync 0; +mad.lo.s32 r2103, r2100, -40, r2102; +ld.shared.u32 r1295, [r2103]; +ld.shared.u32 r1235, [r2103+44]; +ld.shared.u32 r1247, [r2103+88]; +ld.shared.u32 r1259, [r2103+132]; +ld.shared.u32 r1271, [r2103+176]; +ld.shared.u32 r1283, [r2103+220]; +ld.shared.u32 r1284, [r2103+264]; +ld.shared.u32 r1272, [r2103+308]; +ld.shared.u32 r1260, [r2103+352]; +ld.shared.u32 r1248, [r2103+396]; +ld.shared.u32 r1236, [r2103+440]; +barrier.sync 0; +st.shared.u32 [r2102], r88; +st.shared.u32 [r2102+4], r880; +st.shared.u32 [r2102+8], r917; +st.shared.u32 [r2102+12], r954; +st.shared.u32 [r2102+16], r991; +st.shared.u32 [r2102+20], r1028; +st.shared.u32 [r2102+24], r1065; +st.shared.u32 [r2102+28], r1102; +st.shared.u32 [r2102+32], r1139; +st.shared.u32 [r2102+36], r1176; +st.shared.u32 [r2102+40], r1213; +barrier.sync 0; +ld.shared.u32 r1298, [r2103]; +ld.shared.u32 r1238, [r2103+44]; +ld.shared.u32 r1250, [r2103+88]; +ld.shared.u32 r1262, [r2103+132]; +ld.shared.u32 r1274, [r2103+176]; +ld.shared.u32 r1286, [r2103+220]; +ld.shared.u32 r1287, [r2103+264]; +ld.shared.u32 r1275, [r2103+308]; +ld.shared.u32 r1263, [r2103+352]; +ld.shared.u32 r1251, [r2103+396]; +ld.shared.u32 r1239, [r2103+440]; +{ +add.f16x2 r1234, r1235, r1236; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +sub.f16x2 r1240, r1235, r1236; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +add.f16x2 r1246, r1247, r1248; +} +{ +add.f16x2 r1249, r1250, r1251; +} +{ +sub.f16x2 r1252, r1247, r1248; +} +{ +sub.f16x2 r1255, r1250, r1251; +} +{ +add.f16x2 r1258, r1259, r1260; +} +{ +add.f16x2 r1261, r1262, r1263; +} +{ +sub.f16x2 r1264, r1259, r1260; +} +{ +sub.f16x2 r1267, r1262, r1263; +} +{ +add.f16x2 r1270, r1271, r1272; +} +{ +add.f16x2 r1273, r1274, r1275; +} +{ +sub.f16x2 r1276, r1271, r1272; +} +{ +sub.f16x2 r1279, r1274, 
r1275; +} +{ +add.f16x2 r1282, r1283, r1284; +} +{ +add.f16x2 r1285, r1286, r1287; +} +{ +sub.f16x2 r1288, r1283, r1284; +} +{ +sub.f16x2 r1291, r1286, r1287; +} +{ +add.f16x2 r1294, r1295, r1234; +} +{ +add.f16x2 r1297, r1298, r1237; +} +{ +add.f16x2 r1300, r1294, r1246; +} +{ +add.f16x2 r1303, r1297, r1249; +} +{ +add.f16x2 r1306, r1300, r1258; +} +{ +add.f16x2 r1309, r1303, r1261; +} +{ +add.f16x2 r1312, r1306, r1270; +} +{ +add.f16x2 r1315, r1309, r1273; +} +{ +add.f16x2 %0, r1312, r1282; +} +{ +add.f16x2 %1, r1315, r1285; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1326, {low, high}; +} +{ +mul.f16x2 r1327, r1234, r1326; +} +{ +add.f16x2 r1330, r1295, r1327; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1243, r1333; +} +{ +add.f16x2 r1337, r1324, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1340, {low, high}; +} +{ +mul.f16x2 r1341, r1237, r1340; +} +{ +add.f16x2 r1344, r1298, r1341; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1347, {low, high}; +} +{ +mul.f16x2 r1348, r1240, r1347; +} +{ +add.f16x2 r1351, r1325, r1348; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1354, {low, high}; +} +{ +mul.f16x2 r1355, r1246, r1354; +} +{ +add.f16x2 r1358, r1330, r1355; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1361, {low, high}; +} +{ +mul.f16x2 r1362, r1255, r1361; +} +{ +add.f16x2 r1365, r1337, r1362; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1368, {low, high}; +} 
+{ +mul.f16x2 r1369, r1249, r1368; +} +{ +add.f16x2 r1372, r1344, r1369; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1375, {low, high}; +} +{ +mul.f16x2 r1376, r1252, r1375; +} +{ +add.f16x2 r1379, r1351, r1376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1382, {low, high}; +} +{ +mul.f16x2 r1383, r1258, r1382; +} +{ +add.f16x2 r1386, r1358, r1383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1389, {low, high}; +} +{ +mul.f16x2 r1390, r1267, r1389; +} +{ +add.f16x2 r1393, r1365, r1390; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1396, {low, high}; +} +{ +mul.f16x2 r1397, r1261, r1396; +} +{ +add.f16x2 r1400, r1372, r1397; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1403, {low, high}; +} +{ +mul.f16x2 r1404, r1264, r1403; +} +{ +add.f16x2 r1407, r1379, r1404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1410, {low, high}; +} +{ +mul.f16x2 r1411, r1270, r1410; +} +{ +add.f16x2 r1414, r1386, r1411; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1417, {low, high}; +} +{ +mul.f16x2 r1418, r1279, r1417; +} +{ +add.f16x2 r1421, r1393, r1418; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1424, {low, high}; +} +{ +mul.f16x2 r1425, r1273, r1424; +} +{ +add.f16x2 r1428, r1400, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1431, {low, high}; +} +{ +mul.f16x2 r1432, r1276, r1431; +} +{ +add.f16x2 r1435, r1407, r1432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1438, {low, high}; +} +{ +mul.f16x2 r1439, r1282, r1438; +} +{ +add.f16x2 r1442, r1414, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1445, {low, high}; +} +{ +mul.f16x2 r1446, r1291, r1445; +} +{ +add.f16x2 r1449, r1421, r1446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1452, {low, high}; +} +{ +mul.f16x2 r1453, r1285, r1452; +} +{ +add.f16x2 r1456, r1428, r1453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1459, {low, high}; +} +{ +mul.f16x2 r1460, r1288, r1459; +} +{ +add.f16x2 r1463, r1435, r1460; +} +{ +sub.f16x2 %2, r1442, r1449; +} +{ +add.f16x2 %3, r1456, r1463; +} +{ +add.f16x2 %20, r1442, r1449; +} +{ +sub.f16x2 %21, r1456, r1463; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1481, r1234, r1480; +} +{ +add.f16x2 r1484, r1295, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1487, {low, high}; +} +{ +mul.f16x2 r1488, r1243, r1487; +} +{ +add.f16x2 r1491, r1478, r1488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1494, {low, high}; +} +{ +mul.f16x2 r1495, r1237, r1494; +} +{ +add.f16x2 r1498, r1298, r1495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f300; +cvt.rn.f16.f32 high, f300; +mov.b32 r1501, {low, high}; +} +{ +mul.f16x2 r1502, r1240, r1501; +} +{ +add.f16x2 r1505, r1479, r1502; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1508, {low, high}; +} +{ +mul.f16x2 r1509, r1246, r1508; +} +{ +add.f16x2 r1512, r1484, r1509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1515, {low, high}; +} +{ +mul.f16x2 r1516, r1255, r1515; +} +{ +add.f16x2 r1519, 
r1491, r1516; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1522, {low, high}; +} +{ +mul.f16x2 r1523, r1249, r1522; +} +{ +add.f16x2 r1526, r1498, r1523; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1529, {low, high}; +} +{ +mul.f16x2 r1530, r1252, r1529; +} +{ +add.f16x2 r1533, r1505, r1530; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1536, {low, high}; +} +{ +mul.f16x2 r1537, r1258, r1536; +} +{ +add.f16x2 r1540, r1512, r1537; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1267, r1543; +} +{ +add.f16x2 r1547, r1519, r1544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1550, {low, high}; +} +{ +mul.f16x2 r1551, r1261, r1550; +} +{ +add.f16x2 r1554, r1526, r1551; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1557, {low, high}; +} +{ +mul.f16x2 r1558, r1264, r1557; +} +{ +add.f16x2 r1561, r1533, r1558; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1564, {low, high}; +} +{ +mul.f16x2 r1565, r1270, r1564; +} +{ +add.f16x2 r1568, r1540, r1565; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1571, {low, high}; +} +{ +mul.f16x2 r1572, r1279, r1571; +} +{ +add.f16x2 r1575, r1547, r1572; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1578, {low, high}; +} +{ +mul.f16x2 r1579, r1273, r1578; +} +{ +add.f16x2 r1582, r1554, r1579; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1585, {low, high}; +} +{ +mul.f16x2 r1586, r1276, r1585; +} +{ +add.f16x2 r1589, r1561, r1586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1592, {low, 
high}; +} +{ +mul.f16x2 r1593, r1282, r1592; +} +{ +add.f16x2 r1596, r1568, r1593; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1599, {low, high}; +} +{ +mul.f16x2 r1600, r1291, r1599; +} +{ +add.f16x2 r1603, r1575, r1600; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1606, {low, high}; +} +{ +mul.f16x2 r1607, r1285, r1606; +} +{ +add.f16x2 r1610, r1582, r1607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1613, {low, high}; +} +{ +mul.f16x2 r1614, r1288, r1613; +} +{ +add.f16x2 r1617, r1589, r1614; +} +{ +sub.f16x2 %4, r1596, r1603; +} +{ +add.f16x2 %5, r1610, r1617; +} +{ +add.f16x2 %18, r1596, r1603; +} +{ +sub.f16x2 %19, r1610, r1617; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1634, {low, high}; +} +{ +mul.f16x2 r1635, r1234, r1634; +} +{ +add.f16x2 r1638, r1295, r1635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1641, {low, high}; +} +{ +mul.f16x2 r1642, r1243, r1641; +} +{ +add.f16x2 r1645, r1632, r1642; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1648, {low, high}; +} +{ +mul.f16x2 r1649, r1237, r1648; +} +{ +add.f16x2 r1652, r1298, r1649; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r1655, {low, high}; +} +{ +mul.f16x2 r1656, r1240, r1655; +} +{ +add.f16x2 r1659, r1633, r1656; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1662, {low, high}; +} +{ +mul.f16x2 r1663, r1246, r1662; +} +{ +add.f16x2 r1666, r1638, r1663; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1669, {low, high}; +} +{ +mul.f16x2 r1670, r1255, r1669; +} +{ +add.f16x2 r1673, r1645, r1670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1676, {low, high}; +} +{ +mul.f16x2 r1677, r1249, r1676; +} +{ +add.f16x2 r1680, r1652, r1677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1683, {low, high}; +} +{ +mul.f16x2 r1684, r1252, r1683; +} +{ +add.f16x2 r1687, r1659, r1684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1258, r1690; +} +{ +add.f16x2 r1694, r1666, r1691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1697, {low, high}; +} +{ +mul.f16x2 r1698, r1267, r1697; +} +{ +add.f16x2 r1701, r1673, r1698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1261, r1704; +} +{ +add.f16x2 r1708, r1680, r1705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1711, {low, high}; +} +{ +mul.f16x2 r1712, r1264, r1711; +} +{ +add.f16x2 r1715, r1687, r1712; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1718, {low, high}; +} +{ +mul.f16x2 r1719, r1270, r1718; +} +{ +add.f16x2 r1722, r1694, r1719; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1725, {low, high}; +} +{ +mul.f16x2 r1726, r1279, r1725; +} +{ +add.f16x2 r1729, r1701, r1726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1732, {low, high}; +} +{ +mul.f16x2 r1733, r1273, r1732; +} +{ +add.f16x2 r1736, r1708, r1733; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1739, {low, high}; +} +{ +mul.f16x2 r1740, r1276, r1739; +} +{ 
+add.f16x2 r1743, r1715, r1740; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1282, r1746; +} +{ +add.f16x2 r1750, r1722, r1747; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1753, {low, high}; +} +{ +mul.f16x2 r1754, r1291, r1753; +} +{ +add.f16x2 r1757, r1729, r1754; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1285, r1760; +} +{ +add.f16x2 r1764, r1736, r1761; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1767, {low, high}; +} +{ +mul.f16x2 r1768, r1288, r1767; +} +{ +add.f16x2 r1771, r1743, r1768; +} +{ +sub.f16x2 %6, r1750, r1757; +} +{ +add.f16x2 %7, r1764, r1771; +} +{ +add.f16x2 %16, r1750, r1757; +} +{ +sub.f16x2 %17, r1764, r1771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1786, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1787, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1788, {low, high}; +} +{ +mul.f16x2 r1789, r1234, r1788; +} +{ +add.f16x2 r1792, r1295, r1789; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1243, r1795; +} +{ +add.f16x2 r1799, r1786, r1796; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1802, {low, high}; +} +{ +mul.f16x2 r1803, r1237, r1802; +} +{ +add.f16x2 r1806, r1298, r1803; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r1809, {low, high}; +} +{ +mul.f16x2 r1810, r1240, r1809; +} +{ +add.f16x2 r1813, r1787, r1810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 
r1816, {low, high}; +} +{ +mul.f16x2 r1817, r1246, r1816; +} +{ +add.f16x2 r1820, r1792, r1817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1823, {low, high}; +} +{ +mul.f16x2 r1824, r1255, r1823; +} +{ +add.f16x2 r1827, r1799, r1824; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r1830, {low, high}; +} +{ +mul.f16x2 r1831, r1249, r1830; +} +{ +add.f16x2 r1834, r1806, r1831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f396; +cvt.rn.f16.f32 high, f396; +mov.b32 r1837, {low, high}; +} +{ +mul.f16x2 r1838, r1252, r1837; +} +{ +add.f16x2 r1841, r1813, r1838; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1844, {low, high}; +} +{ +mul.f16x2 r1845, r1258, r1844; +} +{ +add.f16x2 r1848, r1820, r1845; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1267, r1851; +} +{ +add.f16x2 r1855, r1827, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1858, {low, high}; +} +{ +mul.f16x2 r1859, r1261, r1858; +} +{ +add.f16x2 r1862, r1834, r1859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f404; +cvt.rn.f16.f32 high, f404; +mov.b32 r1865, {low, high}; +} +{ +mul.f16x2 r1866, r1264, r1865; +} +{ +add.f16x2 r1869, r1841, r1866; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1872, {low, high}; +} +{ +mul.f16x2 r1873, r1270, r1872; +} +{ +add.f16x2 r1876, r1848, r1873; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1879, {low, high}; +} +{ +mul.f16x2 r1880, r1279, r1879; +} +{ +add.f16x2 r1883, r1855, r1880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1886, {low, high}; +} +{ +mul.f16x2 r1887, r1273, r1886; +} +{ +add.f16x2 r1890, r1862, r1887; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1893, {low, high}; +} +{ +mul.f16x2 r1894, r1276, r1893; +} +{ +add.f16x2 r1897, r1869, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1900, {low, high}; +} +{ +mul.f16x2 r1901, r1282, r1900; +} +{ +add.f16x2 r1904, r1876, r1901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1907, {low, high}; +} +{ +mul.f16x2 r1908, r1291, r1907; +} +{ +add.f16x2 r1911, r1883, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r1914, {low, high}; +} +{ +mul.f16x2 r1915, r1285, r1914; +} +{ +add.f16x2 r1918, r1890, r1915; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r1921, {low, high}; +} +{ +mul.f16x2 r1922, r1288, r1921; +} +{ +add.f16x2 r1925, r1897, r1922; +} +{ +sub.f16x2 %8, r1904, r1911; +} +{ +add.f16x2 %9, r1918, r1925; +} +{ +add.f16x2 %14, r1904, r1911; +} +{ +sub.f16x2 %15, r1918, r1925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f424; +cvt.rn.f16.f32 high, f424; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1942, {low, high}; +} +{ +mul.f16x2 r1943, r1234, r1942; +} +{ +add.f16x2 r1946, r1295, r1943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1949, {low, high}; +} +{ +mul.f16x2 r1950, r1243, r1949; +} +{ +add.f16x2 r1953, r1940, r1950; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f430; +cvt.rn.f16.f32 high, f430; +mov.b32 r1956, {low, high}; +} +{ +mul.f16x2 r1957, r1237, r1956; +} +{ +add.f16x2 r1960, r1298, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f432; +cvt.rn.f16.f32 high, f432; +mov.b32 r1963, {low, high}; +} +{ +mul.f16x2 r1964, r1240, r1963; 
+} +{ +add.f16x2 r1967, r1941, r1964; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1970, {low, high}; +} +{ +mul.f16x2 r1971, r1246, r1970; +} +{ +add.f16x2 r1974, r1946, r1971; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1977, {low, high}; +} +{ +mul.f16x2 r1978, r1255, r1977; +} +{ +add.f16x2 r1981, r1953, r1978; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f438; +cvt.rn.f16.f32 high, f438; +mov.b32 r1984, {low, high}; +} +{ +mul.f16x2 r1985, r1249, r1984; +} +{ +add.f16x2 r1988, r1960, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f440; +cvt.rn.f16.f32 high, f440; +mov.b32 r1991, {low, high}; +} +{ +mul.f16x2 r1992, r1252, r1991; +} +{ +add.f16x2 r1995, r1967, r1992; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r1998, {low, high}; +} +{ +mul.f16x2 r1999, r1258, r1998; +} +{ +add.f16x2 r2002, r1974, r1999; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2005, {low, high}; +} +{ +mul.f16x2 r2006, r1267, r2005; +} +{ +add.f16x2 r2009, r1981, r2006; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f446; +cvt.rn.f16.f32 high, f446; +mov.b32 r2012, {low, high}; +} +{ +mul.f16x2 r2013, r1261, r2012; +} +{ +add.f16x2 r2016, r1988, r2013; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f448; +cvt.rn.f16.f32 high, f448; +mov.b32 r2019, {low, high}; +} +{ +mul.f16x2 r2020, r1264, r2019; +} +{ +add.f16x2 r2023, r1995, r2020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, f454; +mov.b32 r2026, {low, high}; +} +{ +mul.f16x2 r2027, r1270, r2026; +} +{ +add.f16x2 r2030, r2002, r2027; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r1279, r2033; +} +{ +add.f16x2 r2037, r2009, r2034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f454; +cvt.rn.f16.f32 high, 
f454; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r1273, r2040; +} +{ +add.f16x2 r2044, r2016, r2041; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f456; +mov.b32 r2047, {low, high}; +} +{ +mul.f16x2 r2048, r1276, r2047; +} +{ +add.f16x2 r2051, r2023, r2048; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r1282, r2054; +} +{ +add.f16x2 r2058, r2030, r2055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2061, {low, high}; +} +{ +mul.f16x2 r2062, r1291, r2061; +} +{ +add.f16x2 r2065, r2037, r2062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f462; +cvt.rn.f16.f32 high, f462; +mov.b32 r2068, {low, high}; +} +{ +mul.f16x2 r2069, r1285, r2068; +} +{ +add.f16x2 r2072, r2044, r2069; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f464; +cvt.rn.f16.f32 high, f464; +mov.b32 r2075, {low, high}; +} +{ +mul.f16x2 r2076, r1288, r2075; +} +{ +add.f16x2 r2079, r2051, r2076; +} +{ +sub.f16x2 %10, r2058, r2065; +} +{ +add.f16x2 %11, r2072, r2079; +} +{ +add.f16x2 %12, r2058, r2065; +} +{ +sub.f16x2 %13, r2072, r2079; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..ec0b974dfc4b6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_fwd.hpp.inc @@ -0,0 +1,924 @@ +#ifndef CUFFTDX_FFT_121_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_121_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<179, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 968, r2; +add.f32 f45, %26, %50; +add.f32 f46, %28, %51; +sub.f32 f47, %26, %50; +sub.f32 f48, %28, %51; +add.f32 f49, %29, %48; +add.f32 f50, %31, %49; +sub.f32 f51, %29, %48; +sub.f32 f52, %31, %49; +add.f32 f53, %32, %45; +add.f32 f54, %33, %47; +sub.f32 f55, %32, %45; +sub.f32 f56, %33, %47; +add.f32 f57, %34, %42; +add.f32 f58, %36, %44; +sub.f32 f59, %34, %42; +sub.f32 f60, %36, %44; +add.f32 f61, %37, %40; +add.f32 f62, %39, %41; +sub.f32 f63, %37, %40; +sub.f32 f64, %39, %41; +mov.u32 r4, %tid.x; 
+add.f32 f65, %24, f45; +add.f32 f66, %25, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %24; +fma.rn.f32 f74, f48, 0fBF0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %25; +fma.rn.f32 f76, f47, 0fBF0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0fBF68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0fBF68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0fBF7D64F0, f78; +fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, f55, 0fBF7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0fBF4178CE, f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0fBF4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0fBE903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0fBE903F40, f88; +sub.f32 f93, f89, f90; +add.f32 f94, f92, f91; +add.f32 f95, f90, f89; +sub.f32 f96, f91, f92; +fma.rn.f32 f97, f45, 0f3ED4B147, %24; +fma.rn.f32 f98, f48, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f99, f46, 0f3ED4B147, %25; +fma.rn.f32 f100, f47, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f101, f49, 0fBF27A4F4, f97; +fma.rn.f32 f102, f52, 0fBF4178CE, f98; +fma.rn.f32 f103, f50, 0fBF27A4F4, f99; +fma.rn.f32 f104, f51, 0fBF4178CE, f100; +fma.rn.f32 f105, f53, 0fBF75A155, f101; +fma.rn.f32 f106, f56, 0f3E903F40, f102; +fma.rn.f32 f107, f54, 0fBF75A155, f103; +fma.rn.f32 f108, f55, 0f3E903F40, f104; +fma.rn.f32 f109, f57, 0fBE11BAFB, f105; +fma.rn.f32 f110, f60, 0f3F7D64F0, f106; +fma.rn.f32 f111, f58, 0fBE11BAFB, f107; +fma.rn.f32 f112, f59, 0f3F7D64F0, f108; +fma.rn.f32 f113, f61, 0f3F575C64, f109; +fma.rn.f32 f114, f64, 0f3F0A6770, f110; +fma.rn.f32 f115, f62, 0f3F575C64, f111; +fma.rn.f32 f116, f63, 0f3F0A6770, f112; +sub.f32 f117, f113, f114; +add.f32 f118, f116, f115; +add.f32 f119, f114, f113; +sub.f32 
f120, f115, f116; +fma.rn.f32 f121, f45, 0fBE11BAFB, %24; +fma.rn.f32 f122, f48, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f123, f46, 0fBE11BAFB, %25; +fma.rn.f32 f124, f47, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f125, f49, 0fBF75A155, f121; +fma.rn.f32 f126, f52, 0f3E903F40, f122; +fma.rn.f32 f127, f50, 0fBF75A155, f123; +fma.rn.f32 f128, f51, 0f3E903F40, f124; +fma.rn.f32 f129, f53, 0f3ED4B147, f125; +fma.rn.f32 f130, f56, 0f3F68DDA4, f126; +fma.rn.f32 f131, f54, 0f3ED4B147, f127; +fma.rn.f32 f132, f55, 0f3F68DDA4, f128; +fma.rn.f32 f133, f57, 0f3F575C64, f129; +fma.rn.f32 f134, f60, 0fBF0A6770, f130; +fma.rn.f32 f135, f58, 0f3F575C64, f131; +fma.rn.f32 f136, f59, 0fBF0A6770, f132; +fma.rn.f32 f137, f61, 0fBF27A4F4, f133; +fma.rn.f32 f138, f64, 0fBF4178CE, f134; +fma.rn.f32 f139, f62, 0fBF27A4F4, f135; +fma.rn.f32 f140, f63, 0fBF4178CE, f136; +sub.f32 f141, f137, f138; +add.f32 f142, f140, f139; +add.f32 f143, f138, f137; +sub.f32 f144, f139, f140; +fma.rn.f32 f145, f45, 0fBF27A4F4, %24; +fma.rn.f32 f146, f48, 0fBF4178CE, 0f00000000; +fma.rn.f32 f147, f46, 0fBF27A4F4, %25; +fma.rn.f32 f148, f47, 0fBF4178CE, 0f00000000; +fma.rn.f32 f149, f49, 0fBE11BAFB, f145; +fma.rn.f32 f150, f52, 0f3F7D64F0, f146; +fma.rn.f32 f151, f50, 0fBE11BAFB, f147; +fma.rn.f32 f152, f51, 0f3F7D64F0, f148; +fma.rn.f32 f153, f53, 0f3F575C64, f149; +fma.rn.f32 f154, f56, 0fBF0A6770, f150; +fma.rn.f32 f155, f54, 0f3F575C64, f151; +fma.rn.f32 f156, f55, 0fBF0A6770, f152; +fma.rn.f32 f157, f57, 0fBF75A155, f153; +fma.rn.f32 f158, f60, 0fBE903F40, f154; +fma.rn.f32 f159, f58, 0fBF75A155, f155; +fma.rn.f32 f160, f59, 0fBE903F40, f156; +fma.rn.f32 f161, f61, 0f3ED4B147, f157; +fma.rn.f32 f162, f64, 0f3F68DDA4, f158; +fma.rn.f32 f163, f62, 0f3ED4B147, f159; +fma.rn.f32 f164, f63, 0f3F68DDA4, f160; +sub.f32 f165, f161, f162; +add.f32 f166, f164, f163; +add.f32 f167, f162, f161; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f45, 0fBF75A155, %24; +fma.rn.f32 f170, f48, 0fBE903F40, 0f00000000; +fma.rn.f32 f171, 
f46, 0fBF75A155, %25; +fma.rn.f32 f172, f47, 0fBE903F40, 0f00000000; +fma.rn.f32 f173, f49, 0f3F575C64, f169; +fma.rn.f32 f174, f52, 0f3F0A6770, f170; +fma.rn.f32 f175, f50, 0f3F575C64, f171; +fma.rn.f32 f176, f51, 0f3F0A6770, f172; +fma.rn.f32 f177, f53, 0fBF27A4F4, f173; +fma.rn.f32 f178, f56, 0fBF4178CE, f174; +fma.rn.f32 f179, f54, 0fBF27A4F4, f175; +fma.rn.f32 f180, f55, 0fBF4178CE, f176; +fma.rn.f32 f181, f57, 0f3ED4B147, f177; +fma.rn.f32 f182, f60, 0f3F68DDA4, f178; +fma.rn.f32 f183, f58, 0f3ED4B147, f179; +fma.rn.f32 f184, f59, 0f3F68DDA4, f180; +fma.rn.f32 f185, f61, 0fBE11BAFB, f181; +fma.rn.f32 f186, f64, 0fBF7D64F0, f182; +fma.rn.f32 f187, f62, 0fBE11BAFB, f183; +fma.rn.f32 f188, f63, 0fBF7D64F0, f184; +sub.f32 f189, f185, f186; +add.f32 f190, f188, f187; +add.f32 f191, f186, f185; +sub.f32 f192, f187, f188; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 968, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f193, f194}, [rd6]; +mul.f32 f197, f193, f93; +mul.f32 f198, f194, f94; +mul.f32 f199, f193, f94; +mul.f32 f200, f193, f193; +mul.f32 f201, f194, f194; +sub.f32 f202, f200, f201; +mul.f32 f203, f194, f193; +fma.rn.f32 f204, f194, f193, f203; +mul.f32 f205, f202, f117; +mul.f32 f206, f204, f118; +mul.f32 f207, f202, f118; +mul.f32 f208, f193, f202; +mul.f32 f209, f194, f204; +sub.f32 f210, f208, f209; +mul.f32 f211, f193, f204; +fma.rn.f32 f212, f194, f202, f211; +mul.f32 f213, f210, f141; +mul.f32 f214, f212, f142; +mul.f32 f215, f210, f142; +mul.f32 f216, f193, f210; +mul.f32 f217, f194, f212; +sub.f32 f218, f216, f217; +mul.f32 f219, f193, f212; +fma.rn.f32 f220, f194, f210, f219; +mul.f32 f221, f218, f165; +mul.f32 f222, f220, f166; +mul.f32 f223, f218, f166; +mul.f32 f224, f193, f218; +mul.f32 f225, f194, f220; +sub.f32 f226, f224, f225; +mul.f32 f227, f193, f220; +fma.rn.f32 f228, f194, f218, f227; 
+mul.f32 f229, f226, f189; +mul.f32 f230, f228, f190; +mul.f32 f231, f226, f190; +mul.f32 f232, f193, f226; +mul.f32 f233, f194, f228; +sub.f32 f234, f232, f233; +mul.f32 f235, f193, f228; +fma.rn.f32 f236, f194, f226, f235; +mul.f32 f237, f234, f191; +mul.f32 f238, f236, f192; +mul.f32 f239, f234, f192; +mul.f32 f240, f193, f234; +mul.f32 f241, f194, f236; +sub.f32 f242, f240, f241; +mul.f32 f243, f193, f236; +fma.rn.f32 f244, f194, f234, f243; +mul.f32 f245, f242, f167; +mul.f32 f246, f244, f168; +mul.f32 f247, f242, f168; +mul.f32 f248, f193, f242; +mul.f32 f249, f194, f244; +sub.f32 f250, f248, f249; +mul.f32 f251, f193, f244; +fma.rn.f32 f252, f194, f242, f251; +mul.f32 f253, f250, f143; +mul.f32 f254, f252, f144; +mul.f32 f255, f250, f144; +mul.f32 f256, f193, f250; +mul.f32 f257, f194, f252; +sub.f32 f258, f256, f257; +mul.f32 f259, f193, f252; +fma.rn.f32 f260, f194, f250, f259; +mul.f32 f261, f258, f119; +mul.f32 f262, f260, f120; +mul.f32 f263, f258, f120; +mul.f32 f264, f193, f258; +mul.f32 f265, f194, f260; +sub.f32 f266, f264, f265; +mul.f32 f267, f193, f260; +fma.rn.f32 f268, f194, f258, f267; +mul.f32 f269, f266, f95; +mul.f32 f270, f268, f96; +mul.f32 f271, f266, f96; +barrier.sync 0; +mad.lo.s32 r9, r7, 88, r8; +add.f32 f272, f72, f62; +add.f32 f273, f71, f61; +st.shared.v2.f32 [r9], {f273, f272}; +fma.rn.f32 f274, f194, f93, f199; +sub.f32 f275, f197, f198; +st.shared.v2.f32 [r9+8], {f275, f274}; +fma.rn.f32 f276, f204, f117, f207; +sub.f32 f277, f205, f206; +st.shared.v2.f32 [r9+16], {f277, f276}; +sub.f32 f278, f213, f214; +fma.rn.f32 f279, f212, f141, f215; +st.shared.v2.f32 [r9+24], {f278, f279}; +fma.rn.f32 f280, f220, f165, f223; +sub.f32 f281, f221, f222; +st.shared.v2.f32 [r9+32], {f281, f280}; +fma.rn.f32 f282, f228, f189, f231; +sub.f32 f283, f229, f230; +st.shared.v2.f32 [r9+40], {f283, f282}; +fma.rn.f32 f284, f236, f191, f239; +sub.f32 f285, f237, f238; +st.shared.v2.f32 [r9+48], {f285, f284}; +fma.rn.f32 f286, f244, f167, f247; 
+sub.f32 f287, f245, f246; +st.shared.v2.f32 [r9+56], {f287, f286}; +fma.rn.f32 f288, f252, f143, f255; +sub.f32 f289, f253, f254; +st.shared.v2.f32 [r9+64], {f289, f288}; +fma.rn.f32 f290, f260, f119, f263; +sub.f32 f291, f261, f262; +st.shared.v2.f32 [r9+72], {f291, f290}; +fma.rn.f32 f292, f268, f95, f271; +sub.f32 f293, f269, f270; +st.shared.v2.f32 [r9+80], {f293, f292}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f32 {f294, f295}, [r10]; +ld.shared.v2.f32 {f298, f299}, [r10+88]; +ld.shared.v2.f32 {f302, f303}, [r10+176]; +ld.shared.v2.f32 {f306, f307}, [r10+264]; +ld.shared.v2.f32 {f310, f311}, [r10+352]; +ld.shared.v2.f32 {f314, f315}, [r10+440]; +ld.shared.v2.f32 {f318, f319}, [r10+528]; +ld.shared.v2.f32 {f322, f323}, [r10+616]; +ld.shared.v2.f32 {f326, f327}, [r10+704]; +ld.shared.v2.f32 {f330, f331}, [r10+792]; +ld.shared.v2.f32 {f334, f335}, [r10+880]; +add.f32 f338, f298, f334; +add.f32 f339, f299, f335; +sub.f32 f340, f298, f334; +sub.f32 f341, f299, f335; +add.f32 f342, f302, f330; +add.f32 f343, f303, f331; +sub.f32 f344, f302, f330; +sub.f32 f345, f303, f331; +add.f32 f346, f306, f326; +add.f32 f347, f307, f327; +sub.f32 f348, f306, f326; +sub.f32 f349, f307, f327; +add.f32 f350, f310, f322; +add.f32 f351, f311, f323; +sub.f32 f352, f310, f322; +sub.f32 f353, f311, f323; +add.f32 f354, f314, f318; +add.f32 f355, f315, f319; +sub.f32 f356, f314, f318; +sub.f32 f357, f315, f319; +add.f32 f358, f294, f338; +add.f32 f359, f295, f339; +add.f32 f360, f358, f342; +add.f32 f361, f359, f343; +add.f32 f362, f360, f346; +add.f32 f363, f361, f347; +add.f32 f364, f362, f350; +add.f32 f365, f363, f351; +fma.rn.f32 f366, f338, 0f3F575C64, f294; +fma.rn.f32 f367, f341, 0fBF0A6770, 0f00000000; +fma.rn.f32 f368, f339, 0f3F575C64, f295; +fma.rn.f32 f369, f340, 0fBF0A6770, 0f00000000; +fma.rn.f32 f370, f342, 0f3ED4B147, f366; +fma.rn.f32 f371, f345, 0fBF68DDA4, f367; +fma.rn.f32 f372, f343, 0f3ED4B147, f368; +fma.rn.f32 f373, f344, 0fBF68DDA4, f369; 
+fma.rn.f32 f374, f346, 0fBE11BAFB, f370; +fma.rn.f32 f375, f349, 0fBF7D64F0, f371; +fma.rn.f32 f376, f347, 0fBE11BAFB, f372; +fma.rn.f32 f377, f348, 0fBF7D64F0, f373; +fma.rn.f32 f378, f350, 0fBF27A4F4, f374; +fma.rn.f32 f379, f353, 0fBF4178CE, f375; +fma.rn.f32 f380, f351, 0fBF27A4F4, f376; +fma.rn.f32 f381, f352, 0fBF4178CE, f377; +fma.rn.f32 f382, f354, 0fBF75A155, f378; +fma.rn.f32 f383, f357, 0fBE903F40, f379; +fma.rn.f32 f384, f355, 0fBF75A155, f380; +fma.rn.f32 f385, f356, 0fBE903F40, f381; +fma.rn.f32 f386, f338, 0f3ED4B147, f294; +fma.rn.f32 f387, f341, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f388, f339, 0f3ED4B147, f295; +fma.rn.f32 f389, f340, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f390, f342, 0fBF27A4F4, f386; +fma.rn.f32 f391, f345, 0fBF4178CE, f387; +fma.rn.f32 f392, f343, 0fBF27A4F4, f388; +fma.rn.f32 f393, f344, 0fBF4178CE, f389; +fma.rn.f32 f394, f346, 0fBF75A155, f390; +fma.rn.f32 f395, f349, 0f3E903F40, f391; +fma.rn.f32 f396, f347, 0fBF75A155, f392; +fma.rn.f32 f397, f348, 0f3E903F40, f393; +fma.rn.f32 f398, f350, 0fBE11BAFB, f394; +fma.rn.f32 f399, f353, 0f3F7D64F0, f395; +fma.rn.f32 f400, f351, 0fBE11BAFB, f396; +fma.rn.f32 f401, f352, 0f3F7D64F0, f397; +fma.rn.f32 f402, f354, 0f3F575C64, f398; +fma.rn.f32 f403, f357, 0f3F0A6770, f399; +fma.rn.f32 f404, f355, 0f3F575C64, f400; +fma.rn.f32 f405, f356, 0f3F0A6770, f401; +fma.rn.f32 f406, f338, 0fBE11BAFB, f294; +fma.rn.f32 f407, f341, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f408, f339, 0fBE11BAFB, f295; +fma.rn.f32 f409, f340, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f410, f342, 0fBF75A155, f406; +fma.rn.f32 f411, f345, 0f3E903F40, f407; +fma.rn.f32 f412, f343, 0fBF75A155, f408; +fma.rn.f32 f413, f344, 0f3E903F40, f409; +fma.rn.f32 f414, f346, 0f3ED4B147, f410; +fma.rn.f32 f415, f349, 0f3F68DDA4, f411; +fma.rn.f32 f416, f347, 0f3ED4B147, f412; +fma.rn.f32 f417, f348, 0f3F68DDA4, f413; +fma.rn.f32 f418, f350, 0f3F575C64, f414; +fma.rn.f32 f419, f353, 0fBF0A6770, f415; +fma.rn.f32 f420, f351, 0f3F575C64, f416; 
+fma.rn.f32 f421, f352, 0fBF0A6770, f417; +fma.rn.f32 f422, f354, 0fBF27A4F4, f418; +fma.rn.f32 f423, f357, 0fBF4178CE, f419; +fma.rn.f32 f424, f355, 0fBF27A4F4, f420; +fma.rn.f32 f425, f356, 0fBF4178CE, f421; +fma.rn.f32 f426, f338, 0fBF27A4F4, f294; +fma.rn.f32 f427, f341, 0fBF4178CE, 0f00000000; +fma.rn.f32 f428, f339, 0fBF27A4F4, f295; +fma.rn.f32 f429, f340, 0fBF4178CE, 0f00000000; +fma.rn.f32 f430, f342, 0fBE11BAFB, f426; +fma.rn.f32 f431, f345, 0f3F7D64F0, f427; +fma.rn.f32 f432, f343, 0fBE11BAFB, f428; +fma.rn.f32 f433, f344, 0f3F7D64F0, f429; +fma.rn.f32 f434, f346, 0f3F575C64, f430; +fma.rn.f32 f435, f349, 0fBF0A6770, f431; +fma.rn.f32 f436, f347, 0f3F575C64, f432; +fma.rn.f32 f437, f348, 0fBF0A6770, f433; +fma.rn.f32 f438, f350, 0fBF75A155, f434; +fma.rn.f32 f439, f353, 0fBE903F40, f435; +fma.rn.f32 f440, f351, 0fBF75A155, f436; +fma.rn.f32 f441, f352, 0fBE903F40, f437; +fma.rn.f32 f442, f354, 0f3ED4B147, f438; +fma.rn.f32 f443, f357, 0f3F68DDA4, f439; +fma.rn.f32 f444, f355, 0f3ED4B147, f440; +fma.rn.f32 f445, f356, 0f3F68DDA4, f441; +fma.rn.f32 f446, f338, 0fBF75A155, f294; +fma.rn.f32 f447, f341, 0fBE903F40, 0f00000000; +fma.rn.f32 f448, f339, 0fBF75A155, f295; +fma.rn.f32 f449, f340, 0fBE903F40, 0f00000000; +fma.rn.f32 f450, f342, 0f3F575C64, f446; +fma.rn.f32 f451, f345, 0f3F0A6770, f447; +fma.rn.f32 f452, f343, 0f3F575C64, f448; +fma.rn.f32 f453, f344, 0f3F0A6770, f449; +fma.rn.f32 f454, f346, 0fBF27A4F4, f450; +fma.rn.f32 f455, f349, 0fBF4178CE, f451; +fma.rn.f32 f456, f347, 0fBF27A4F4, f452; +fma.rn.f32 f457, f348, 0fBF4178CE, f453; +fma.rn.f32 f458, f350, 0f3ED4B147, f454; +fma.rn.f32 f459, f353, 0f3F68DDA4, f455; +fma.rn.f32 f460, f351, 0f3ED4B147, f456; +fma.rn.f32 f461, f352, 0f3F68DDA4, f457; +fma.rn.f32 f462, f354, 0fBE11BAFB, f458; +fma.rn.f32 f463, f357, 0fBF7D64F0, f459; +fma.rn.f32 f464, f355, 0fBE11BAFB, f460; +fma.rn.f32 f465, f356, 0fBF7D64F0, f461; +add.f32 %1, f365, f355; +add.f32 %0, f364, f354; +add.f32 %3, f385, f384; +sub.f32 
%2, f382, f383; +add.f32 %5, f405, f404; +sub.f32 %4, f402, f403; +add.f32 %7, f425, f424; +sub.f32 %6, f422, f423; +add.f32 %9, f445, f444; +sub.f32 %8, f442, f443; +add.f32 %11, f465, f464; +sub.f32 %10, f462, f463; +sub.f32 %13, f464, f465; +add.f32 %12, f463, f462; +sub.f32 %15, f444, f445; +add.f32 %14, f443, f442; +sub.f32 %17, f424, f425; +add.f32 %16, f423, f422; +sub.f32 %19, f404, f405; +add.f32 %18, f403, f402; +sub.f32 %21, f384, f385; +add.f32 %20, f383, f382; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<180, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<466>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 484, r2; +add.f32 f45, %26, %50; +add.f32 f46, %28, %51; +sub.f32 f47, %26, %50; +sub.f32 f48, %28, %51; +add.f32 f49, %29, %48; +add.f32 f50, %31, %49; +sub.f32 f51, %29, %48; +sub.f32 f52, %31, %49; +add.f32 f53, %32, %45; +add.f32 f54, %33, %47; +sub.f32 f55, %32, %45; +sub.f32 f56, %33, %47; +add.f32 f57, %34, %42; +add.f32 f58, %36, %44; +sub.f32 f59, %34, %42; 
+sub.f32 f60, %36, %44; +add.f32 f61, %37, %40; +add.f32 f62, %39, %41; +sub.f32 f63, %37, %40; +sub.f32 f64, %39, %41; +mov.u32 r4, %tid.x; +add.f32 f65, %24, f45; +add.f32 f66, %25, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +add.f32 f73, f71, f61; +add.f32 f74, f72, f62; +fma.rn.f32 f75, f45, 0f3F575C64, %24; +fma.rn.f32 f76, f48, 0fBF0A6770, 0f00000000; +fma.rn.f32 f77, f46, 0f3F575C64, %25; +fma.rn.f32 f78, f47, 0fBF0A6770, 0f00000000; +fma.rn.f32 f79, f49, 0f3ED4B147, f75; +fma.rn.f32 f80, f52, 0fBF68DDA4, f76; +fma.rn.f32 f81, f50, 0f3ED4B147, f77; +fma.rn.f32 f82, f51, 0fBF68DDA4, f78; +fma.rn.f32 f83, f53, 0fBE11BAFB, f79; +fma.rn.f32 f84, f56, 0fBF7D64F0, f80; +fma.rn.f32 f85, f54, 0fBE11BAFB, f81; +fma.rn.f32 f86, f55, 0fBF7D64F0, f82; +fma.rn.f32 f87, f57, 0fBF27A4F4, f83; +fma.rn.f32 f88, f60, 0fBF4178CE, f84; +fma.rn.f32 f89, f58, 0fBF27A4F4, f85; +fma.rn.f32 f90, f59, 0fBF4178CE, f86; +fma.rn.f32 f91, f61, 0fBF75A155, f87; +fma.rn.f32 f92, f64, 0fBE903F40, f88; +fma.rn.f32 f93, f62, 0fBF75A155, f89; +fma.rn.f32 f94, f63, 0fBE903F40, f90; +sub.f32 f95, f91, f92; +add.f32 f96, f94, f93; +add.f32 f97, f92, f91; +sub.f32 f98, f93, f94; +fma.rn.f32 f99, f45, 0f3ED4B147, %24; +fma.rn.f32 f100, f48, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f101, f46, 0f3ED4B147, %25; +fma.rn.f32 f102, f47, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f103, f49, 0fBF27A4F4, f99; +fma.rn.f32 f104, f52, 0fBF4178CE, f100; +fma.rn.f32 f105, f50, 0fBF27A4F4, f101; +fma.rn.f32 f106, f51, 0fBF4178CE, f102; +fma.rn.f32 f107, f53, 0fBF75A155, f103; +fma.rn.f32 f108, f56, 0f3E903F40, f104; +fma.rn.f32 f109, f54, 0fBF75A155, f105; +fma.rn.f32 f110, f55, 0f3E903F40, f106; +fma.rn.f32 f111, f57, 0fBE11BAFB, f107; +fma.rn.f32 f112, f60, 0f3F7D64F0, f108; +fma.rn.f32 f113, f58, 0fBE11BAFB, f109; +fma.rn.f32 f114, f59, 0f3F7D64F0, f110; +fma.rn.f32 f115, f61, 0f3F575C64, f111; +fma.rn.f32 f116, f64, 
0f3F0A6770, f112; +fma.rn.f32 f117, f62, 0f3F575C64, f113; +fma.rn.f32 f118, f63, 0f3F0A6770, f114; +sub.f32 f119, f115, f116; +add.f32 f120, f118, f117; +add.f32 f121, f116, f115; +sub.f32 f122, f117, f118; +fma.rn.f32 f123, f45, 0fBE11BAFB, %24; +fma.rn.f32 f124, f48, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f125, f46, 0fBE11BAFB, %25; +fma.rn.f32 f126, f47, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f127, f49, 0fBF75A155, f123; +fma.rn.f32 f128, f52, 0f3E903F40, f124; +fma.rn.f32 f129, f50, 0fBF75A155, f125; +fma.rn.f32 f130, f51, 0f3E903F40, f126; +fma.rn.f32 f131, f53, 0f3ED4B147, f127; +fma.rn.f32 f132, f56, 0f3F68DDA4, f128; +fma.rn.f32 f133, f54, 0f3ED4B147, f129; +fma.rn.f32 f134, f55, 0f3F68DDA4, f130; +fma.rn.f32 f135, f57, 0f3F575C64, f131; +fma.rn.f32 f136, f60, 0fBF0A6770, f132; +fma.rn.f32 f137, f58, 0f3F575C64, f133; +fma.rn.f32 f138, f59, 0fBF0A6770, f134; +fma.rn.f32 f139, f61, 0fBF27A4F4, f135; +fma.rn.f32 f140, f64, 0fBF4178CE, f136; +fma.rn.f32 f141, f62, 0fBF27A4F4, f137; +fma.rn.f32 f142, f63, 0fBF4178CE, f138; +sub.f32 f143, f139, f140; +add.f32 f144, f142, f141; +add.f32 f145, f140, f139; +sub.f32 f146, f141, f142; +fma.rn.f32 f147, f45, 0fBF27A4F4, %24; +fma.rn.f32 f148, f48, 0fBF4178CE, 0f00000000; +fma.rn.f32 f149, f46, 0fBF27A4F4, %25; +fma.rn.f32 f150, f47, 0fBF4178CE, 0f00000000; +fma.rn.f32 f151, f49, 0fBE11BAFB, f147; +fma.rn.f32 f152, f52, 0f3F7D64F0, f148; +fma.rn.f32 f153, f50, 0fBE11BAFB, f149; +fma.rn.f32 f154, f51, 0f3F7D64F0, f150; +fma.rn.f32 f155, f53, 0f3F575C64, f151; +fma.rn.f32 f156, f56, 0fBF0A6770, f152; +fma.rn.f32 f157, f54, 0f3F575C64, f153; +fma.rn.f32 f158, f55, 0fBF0A6770, f154; +fma.rn.f32 f159, f57, 0fBF75A155, f155; +fma.rn.f32 f160, f60, 0fBE903F40, f156; +fma.rn.f32 f161, f58, 0fBF75A155, f157; +fma.rn.f32 f162, f59, 0fBE903F40, f158; +fma.rn.f32 f163, f61, 0f3ED4B147, f159; +fma.rn.f32 f164, f64, 0f3F68DDA4, f160; +fma.rn.f32 f165, f62, 0f3ED4B147, f161; +fma.rn.f32 f166, f63, 0f3F68DDA4, f162; +sub.f32 f167, f163, 
f164; +add.f32 f168, f166, f165; +add.f32 f169, f164, f163; +sub.f32 f170, f165, f166; +fma.rn.f32 f171, f45, 0fBF75A155, %24; +fma.rn.f32 f172, f48, 0fBE903F40, 0f00000000; +fma.rn.f32 f173, f46, 0fBF75A155, %25; +fma.rn.f32 f174, f47, 0fBE903F40, 0f00000000; +fma.rn.f32 f175, f49, 0f3F575C64, f171; +fma.rn.f32 f176, f52, 0f3F0A6770, f172; +fma.rn.f32 f177, f50, 0f3F575C64, f173; +fma.rn.f32 f178, f51, 0f3F0A6770, f174; +fma.rn.f32 f179, f53, 0fBF27A4F4, f175; +fma.rn.f32 f180, f56, 0fBF4178CE, f176; +fma.rn.f32 f181, f54, 0fBF27A4F4, f177; +fma.rn.f32 f182, f55, 0fBF4178CE, f178; +fma.rn.f32 f183, f57, 0f3ED4B147, f179; +fma.rn.f32 f184, f60, 0f3F68DDA4, f180; +fma.rn.f32 f185, f58, 0f3ED4B147, f181; +fma.rn.f32 f186, f59, 0f3F68DDA4, f182; +fma.rn.f32 f187, f61, 0fBE11BAFB, f183; +fma.rn.f32 f188, f64, 0fBF7D64F0, f184; +fma.rn.f32 f189, f62, 0fBE11BAFB, f185; +fma.rn.f32 f190, f63, 0fBF7D64F0, f186; +sub.f32 f191, f187, f188; +add.f32 f192, f190, f189; +add.f32 f193, f188, f187; +sub.f32 f194, f189, f190; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f195, f196}, [rd6]; +mul.f32 f199, f195, f95; +mul.f32 f200, f196, f96; +sub.f32 f201, f199, f200; +mul.f32 f202, f195, f96; +fma.rn.f32 f203, f196, f95, f202; +mul.f32 f204, f195, f195; +mul.f32 f205, f196, f196; +sub.f32 f206, f204, f205; +mul.f32 f207, f196, f195; +fma.rn.f32 f208, f196, f195, f207; +mul.f32 f209, f206, f119; +mul.f32 f210, f208, f120; +sub.f32 f211, f209, f210; +mul.f32 f212, f206, f120; +fma.rn.f32 f213, f208, f119, f212; +mul.f32 f214, f195, f206; +mul.f32 f215, f196, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f195, f208; +fma.rn.f32 f218, f196, f206, f217; +mul.f32 f219, f216, f143; +mul.f32 f220, f218, f144; +sub.f32 f221, f219, f220; +mul.f32 f222, f216, f144; +fma.rn.f32 f223, f218, f143, f222; +mul.f32 f224, f195, 
f216; +mul.f32 f225, f196, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f195, f218; +fma.rn.f32 f228, f196, f216, f227; +mul.f32 f229, f226, f167; +mul.f32 f230, f228, f168; +sub.f32 f231, f229, f230; +mul.f32 f232, f226, f168; +fma.rn.f32 f233, f228, f167, f232; +mul.f32 f234, f195, f226; +mul.f32 f235, f196, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f195, f228; +fma.rn.f32 f238, f196, f226, f237; +mul.f32 f239, f236, f191; +mul.f32 f240, f238, f192; +sub.f32 f241, f239, f240; +mul.f32 f242, f236, f192; +fma.rn.f32 f243, f238, f191, f242; +mul.f32 f244, f195, f236; +mul.f32 f245, f196, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f195, f238; +fma.rn.f32 f248, f196, f236, f247; +mul.f32 f249, f246, f193; +mul.f32 f250, f248, f194; +sub.f32 f251, f249, f250; +mul.f32 f252, f246, f194; +fma.rn.f32 f253, f248, f193, f252; +mul.f32 f254, f195, f246; +mul.f32 f255, f196, f248; +sub.f32 f256, f254, f255; +mul.f32 f257, f195, f248; +fma.rn.f32 f258, f196, f246, f257; +mul.f32 f259, f256, f169; +mul.f32 f260, f258, f170; +sub.f32 f261, f259, f260; +mul.f32 f262, f256, f170; +fma.rn.f32 f263, f258, f169, f262; +mul.f32 f264, f195, f256; +mul.f32 f265, f196, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f195, f258; +fma.rn.f32 f268, f196, f256, f267; +mul.f32 f269, f266, f145; +mul.f32 f270, f268, f146; +sub.f32 f271, f269, f270; +mul.f32 f272, f266, f146; +fma.rn.f32 f273, f268, f145, f272; +mul.f32 f274, f195, f266; +mul.f32 f275, f196, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f195, f268; +fma.rn.f32 f278, f196, f266, f277; +mul.f32 f279, f276, f121; +mul.f32 f280, f278, f122; +sub.f32 f281, f279, f280; +mul.f32 f282, f276, f122; +fma.rn.f32 f283, f278, f121, f282; +mul.f32 f284, f195, f276; +mul.f32 f285, f196, f278; +sub.f32 f286, f284, f285; +mul.f32 f287, f195, f278; +fma.rn.f32 f288, f196, f276, f287; +mul.f32 f289, f286, f97; +mul.f32 f290, f288, f98; +sub.f32 f291, f289, f290; +mul.f32 f292, f286, f98; +fma.rn.f32 f293, f288, f97, f292; 
+mad.lo.s32 r8, r5, 484, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 44, r8; +st.shared.f32 [r9], f73; +st.shared.f32 [r9+4], f201; +st.shared.f32 [r9+8], f211; +st.shared.f32 [r9+12], f221; +st.shared.f32 [r9+16], f231; +st.shared.f32 [r9+20], f241; +st.shared.f32 [r9+24], f251; +st.shared.f32 [r9+28], f261; +st.shared.f32 [r9+32], f271; +st.shared.f32 [r9+36], f281; +st.shared.f32 [r9+40], f291; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f32 f294, [r10]; +ld.shared.f32 f295, [r10+44]; +ld.shared.f32 f296, [r10+88]; +ld.shared.f32 f297, [r10+132]; +ld.shared.f32 f298, [r10+176]; +ld.shared.f32 f299, [r10+220]; +ld.shared.f32 f300, [r10+264]; +ld.shared.f32 f301, [r10+308]; +ld.shared.f32 f302, [r10+352]; +ld.shared.f32 f303, [r10+396]; +ld.shared.f32 f304, [r10+440]; +barrier.sync 0; +st.shared.f32 [r9], f74; +st.shared.f32 [r9+4], f203; +st.shared.f32 [r9+8], f213; +st.shared.f32 [r9+12], f223; +st.shared.f32 [r9+16], f233; +st.shared.f32 [r9+20], f243; +st.shared.f32 [r9+24], f253; +st.shared.f32 [r9+28], f263; +st.shared.f32 [r9+32], f273; +st.shared.f32 [r9+36], f283; +st.shared.f32 [r9+40], f293; +barrier.sync 0; +ld.shared.f32 f305, [r10]; +ld.shared.f32 f306, [r10+44]; +ld.shared.f32 f307, [r10+88]; +ld.shared.f32 f308, [r10+132]; +ld.shared.f32 f309, [r10+176]; +ld.shared.f32 f310, [r10+220]; +ld.shared.f32 f311, [r10+264]; +ld.shared.f32 f312, [r10+308]; +ld.shared.f32 f313, [r10+352]; +ld.shared.f32 f314, [r10+396]; +ld.shared.f32 f315, [r10+440]; +add.f32 f316, f295, f304; +add.f32 f317, f306, f315; +sub.f32 f318, f295, f304; +sub.f32 f319, f306, f315; +add.f32 f320, f296, f303; +add.f32 f321, f307, f314; +sub.f32 f322, f296, f303; +sub.f32 f323, f307, f314; +add.f32 f324, f297, f302; +add.f32 f325, f308, f313; +sub.f32 f326, f297, f302; +sub.f32 f327, f308, f313; +add.f32 f328, f298, f301; +add.f32 f329, f309, f312; +sub.f32 f330, f298, f301; +sub.f32 f331, f309, f312; +add.f32 f332, f299, f300; +add.f32 f333, f310, f311; +sub.f32 f334, 
f299, f300; +sub.f32 f335, f310, f311; +add.f32 f336, f294, f316; +add.f32 f337, f305, f317; +add.f32 f338, f336, f320; +add.f32 f339, f337, f321; +add.f32 f340, f338, f324; +add.f32 f341, f339, f325; +add.f32 f342, f340, f328; +add.f32 f343, f341, f329; +fma.rn.f32 f344, f316, 0f3F575C64, f294; +fma.rn.f32 f345, f319, 0fBF0A6770, 0f00000000; +fma.rn.f32 f346, f317, 0f3F575C64, f305; +fma.rn.f32 f347, f318, 0fBF0A6770, 0f00000000; +fma.rn.f32 f348, f320, 0f3ED4B147, f344; +fma.rn.f32 f349, f323, 0fBF68DDA4, f345; +fma.rn.f32 f350, f321, 0f3ED4B147, f346; +fma.rn.f32 f351, f322, 0fBF68DDA4, f347; +fma.rn.f32 f352, f324, 0fBE11BAFB, f348; +fma.rn.f32 f353, f327, 0fBF7D64F0, f349; +fma.rn.f32 f354, f325, 0fBE11BAFB, f350; +fma.rn.f32 f355, f326, 0fBF7D64F0, f351; +fma.rn.f32 f356, f328, 0fBF27A4F4, f352; +fma.rn.f32 f357, f331, 0fBF4178CE, f353; +fma.rn.f32 f358, f329, 0fBF27A4F4, f354; +fma.rn.f32 f359, f330, 0fBF4178CE, f355; +fma.rn.f32 f360, f332, 0fBF75A155, f356; +fma.rn.f32 f361, f335, 0fBE903F40, f357; +fma.rn.f32 f362, f333, 0fBF75A155, f358; +fma.rn.f32 f363, f334, 0fBE903F40, f359; +fma.rn.f32 f364, f316, 0f3ED4B147, f294; +fma.rn.f32 f365, f319, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f366, f317, 0f3ED4B147, f305; +fma.rn.f32 f367, f318, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f368, f320, 0fBF27A4F4, f364; +fma.rn.f32 f369, f323, 0fBF4178CE, f365; +fma.rn.f32 f370, f321, 0fBF27A4F4, f366; +fma.rn.f32 f371, f322, 0fBF4178CE, f367; +fma.rn.f32 f372, f324, 0fBF75A155, f368; +fma.rn.f32 f373, f327, 0f3E903F40, f369; +fma.rn.f32 f374, f325, 0fBF75A155, f370; +fma.rn.f32 f375, f326, 0f3E903F40, f371; +fma.rn.f32 f376, f328, 0fBE11BAFB, f372; +fma.rn.f32 f377, f331, 0f3F7D64F0, f373; +fma.rn.f32 f378, f329, 0fBE11BAFB, f374; +fma.rn.f32 f379, f330, 0f3F7D64F0, f375; +fma.rn.f32 f380, f332, 0f3F575C64, f376; +fma.rn.f32 f381, f335, 0f3F0A6770, f377; +fma.rn.f32 f382, f333, 0f3F575C64, f378; +fma.rn.f32 f383, f334, 0f3F0A6770, f379; +fma.rn.f32 f384, f316, 0fBE11BAFB, 
f294; +fma.rn.f32 f385, f319, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f386, f317, 0fBE11BAFB, f305; +fma.rn.f32 f387, f318, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f388, f320, 0fBF75A155, f384; +fma.rn.f32 f389, f323, 0f3E903F40, f385; +fma.rn.f32 f390, f321, 0fBF75A155, f386; +fma.rn.f32 f391, f322, 0f3E903F40, f387; +fma.rn.f32 f392, f324, 0f3ED4B147, f388; +fma.rn.f32 f393, f327, 0f3F68DDA4, f389; +fma.rn.f32 f394, f325, 0f3ED4B147, f390; +fma.rn.f32 f395, f326, 0f3F68DDA4, f391; +fma.rn.f32 f396, f328, 0f3F575C64, f392; +fma.rn.f32 f397, f331, 0fBF0A6770, f393; +fma.rn.f32 f398, f329, 0f3F575C64, f394; +fma.rn.f32 f399, f330, 0fBF0A6770, f395; +fma.rn.f32 f400, f332, 0fBF27A4F4, f396; +fma.rn.f32 f401, f335, 0fBF4178CE, f397; +fma.rn.f32 f402, f333, 0fBF27A4F4, f398; +fma.rn.f32 f403, f334, 0fBF4178CE, f399; +fma.rn.f32 f404, f316, 0fBF27A4F4, f294; +fma.rn.f32 f405, f319, 0fBF4178CE, 0f00000000; +fma.rn.f32 f406, f317, 0fBF27A4F4, f305; +fma.rn.f32 f407, f318, 0fBF4178CE, 0f00000000; +fma.rn.f32 f408, f320, 0fBE11BAFB, f404; +fma.rn.f32 f409, f323, 0f3F7D64F0, f405; +fma.rn.f32 f410, f321, 0fBE11BAFB, f406; +fma.rn.f32 f411, f322, 0f3F7D64F0, f407; +fma.rn.f32 f412, f324, 0f3F575C64, f408; +fma.rn.f32 f413, f327, 0fBF0A6770, f409; +fma.rn.f32 f414, f325, 0f3F575C64, f410; +fma.rn.f32 f415, f326, 0fBF0A6770, f411; +fma.rn.f32 f416, f328, 0fBF75A155, f412; +fma.rn.f32 f417, f331, 0fBE903F40, f413; +fma.rn.f32 f418, f329, 0fBF75A155, f414; +fma.rn.f32 f419, f330, 0fBE903F40, f415; +fma.rn.f32 f420, f332, 0f3ED4B147, f416; +fma.rn.f32 f421, f335, 0f3F68DDA4, f417; +fma.rn.f32 f422, f333, 0f3ED4B147, f418; +fma.rn.f32 f423, f334, 0f3F68DDA4, f419; +fma.rn.f32 f424, f316, 0fBF75A155, f294; +fma.rn.f32 f425, f319, 0fBE903F40, 0f00000000; +fma.rn.f32 f426, f317, 0fBF75A155, f305; +fma.rn.f32 f427, f318, 0fBE903F40, 0f00000000; +fma.rn.f32 f428, f320, 0f3F575C64, f424; +fma.rn.f32 f429, f323, 0f3F0A6770, f425; +fma.rn.f32 f430, f321, 0f3F575C64, f426; +fma.rn.f32 f431, f322, 
0f3F0A6770, f427; +fma.rn.f32 f432, f324, 0fBF27A4F4, f428; +fma.rn.f32 f433, f327, 0fBF4178CE, f429; +fma.rn.f32 f434, f325, 0fBF27A4F4, f430; +fma.rn.f32 f435, f326, 0fBF4178CE, f431; +fma.rn.f32 f436, f328, 0f3ED4B147, f432; +fma.rn.f32 f437, f331, 0f3F68DDA4, f433; +fma.rn.f32 f438, f329, 0f3ED4B147, f434; +fma.rn.f32 f439, f330, 0f3F68DDA4, f435; +fma.rn.f32 f440, f332, 0fBE11BAFB, f436; +fma.rn.f32 f441, f335, 0fBF7D64F0, f437; +fma.rn.f32 f442, f333, 0fBE11BAFB, f438; +fma.rn.f32 f443, f334, 0fBF7D64F0, f439; +add.f32 %0, f342, f332; +add.f32 %1, f343, f333; +add.f32 %3, f363, f362; +sub.f32 %2, f360, f361; +add.f32 %5, f383, f382; +sub.f32 %4, f380, f381; +add.f32 %7, f403, f402; +sub.f32 %6, f400, f401; +add.f32 %9, f423, f422; +sub.f32 %8, f420, f421; +add.f32 %11, f443, f442; +sub.f32 %10, f440, f441; +sub.f32 %13, f442, f443; +add.f32 %12, f441, f440; +sub.f32 %15, f422, f423; +add.f32 %14, f421, f420; +sub.f32 %17, f402, f403; +add.f32 %16, f401, f400; +sub.f32 %19, f382, f383; +add.f32 %18, f381, f380; +sub.f32 %21, f362, f363; +add.f32 %20, f361, f360; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..ba4ee0b3c1a0d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp32_inv.hpp.inc @@ -0,0 +1,924 @@ +#ifndef CUFFTDX_FFT_121_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_121_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<381, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 968, r2; +add.f32 f45, %26, %50; +add.f32 f46, %28, %51; +sub.f32 f47, %26, %50; +sub.f32 f48, %28, %51; +add.f32 f49, %29, %48; +add.f32 f50, %31, %49; +sub.f32 f51, %29, %48; +sub.f32 f52, %31, %49; +add.f32 f53, %32, %45; +add.f32 f54, %33, %47; +sub.f32 f55, %32, %45; +sub.f32 f56, %33, %47; +add.f32 f57, %34, %42; +add.f32 f58, %36, %44; +sub.f32 f59, %34, %42; +sub.f32 f60, %36, %44; +add.f32 f61, %37, %40; +add.f32 f62, %39, %41; +sub.f32 f63, %37, %40; +sub.f32 f64, %39, %41; +mov.u32 r4, %tid.x; +add.f32 f65, %24, f45; +add.f32 f66, %25, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %24; +fma.rn.f32 f74, f48, 0f3F0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %25; +fma.rn.f32 f76, f47, 0f3F0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0f3F68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0f3F68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0f3F7D64F0, f78; +fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, 
f55, 0f3F7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0f3F4178CE, f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0f3F4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0f3E903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0f3E903F40, f88; +sub.f32 f93, f89, f90; +add.f32 f94, f92, f91; +add.f32 f95, f90, f89; +sub.f32 f96, f91, f92; +fma.rn.f32 f97, f45, 0f3ED4B147, %24; +fma.rn.f32 f98, f48, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f99, f46, 0f3ED4B147, %25; +fma.rn.f32 f100, f47, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f101, f49, 0fBF27A4F4, f97; +fma.rn.f32 f102, f52, 0f3F4178CE, f98; +fma.rn.f32 f103, f50, 0fBF27A4F4, f99; +fma.rn.f32 f104, f51, 0f3F4178CE, f100; +fma.rn.f32 f105, f53, 0fBF75A155, f101; +fma.rn.f32 f106, f56, 0fBE903F40, f102; +fma.rn.f32 f107, f54, 0fBF75A155, f103; +fma.rn.f32 f108, f55, 0fBE903F40, f104; +fma.rn.f32 f109, f57, 0fBE11BAFB, f105; +fma.rn.f32 f110, f60, 0fBF7D64F0, f106; +fma.rn.f32 f111, f58, 0fBE11BAFB, f107; +fma.rn.f32 f112, f59, 0fBF7D64F0, f108; +fma.rn.f32 f113, f61, 0f3F575C64, f109; +fma.rn.f32 f114, f64, 0fBF0A6770, f110; +fma.rn.f32 f115, f62, 0f3F575C64, f111; +fma.rn.f32 f116, f63, 0fBF0A6770, f112; +sub.f32 f117, f113, f114; +add.f32 f118, f116, f115; +add.f32 f119, f114, f113; +sub.f32 f120, f115, f116; +fma.rn.f32 f121, f45, 0fBE11BAFB, %24; +fma.rn.f32 f122, f48, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f123, f46, 0fBE11BAFB, %25; +fma.rn.f32 f124, f47, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f125, f49, 0fBF75A155, f121; +fma.rn.f32 f126, f52, 0fBE903F40, f122; +fma.rn.f32 f127, f50, 0fBF75A155, f123; +fma.rn.f32 f128, f51, 0fBE903F40, f124; +fma.rn.f32 f129, f53, 0f3ED4B147, f125; +fma.rn.f32 f130, f56, 0fBF68DDA4, f126; +fma.rn.f32 f131, f54, 0f3ED4B147, f127; +fma.rn.f32 f132, f55, 0fBF68DDA4, f128; +fma.rn.f32 f133, f57, 0f3F575C64, f129; +fma.rn.f32 f134, f60, 0f3F0A6770, f130; +fma.rn.f32 f135, f58, 0f3F575C64, f131; +fma.rn.f32 
f136, f59, 0f3F0A6770, f132; +fma.rn.f32 f137, f61, 0fBF27A4F4, f133; +fma.rn.f32 f138, f64, 0f3F4178CE, f134; +fma.rn.f32 f139, f62, 0fBF27A4F4, f135; +fma.rn.f32 f140, f63, 0f3F4178CE, f136; +sub.f32 f141, f137, f138; +add.f32 f142, f140, f139; +add.f32 f143, f138, f137; +sub.f32 f144, f139, f140; +fma.rn.f32 f145, f45, 0fBF27A4F4, %24; +fma.rn.f32 f146, f48, 0f3F4178CE, 0f00000000; +fma.rn.f32 f147, f46, 0fBF27A4F4, %25; +fma.rn.f32 f148, f47, 0f3F4178CE, 0f00000000; +fma.rn.f32 f149, f49, 0fBE11BAFB, f145; +fma.rn.f32 f150, f52, 0fBF7D64F0, f146; +fma.rn.f32 f151, f50, 0fBE11BAFB, f147; +fma.rn.f32 f152, f51, 0fBF7D64F0, f148; +fma.rn.f32 f153, f53, 0f3F575C64, f149; +fma.rn.f32 f154, f56, 0f3F0A6770, f150; +fma.rn.f32 f155, f54, 0f3F575C64, f151; +fma.rn.f32 f156, f55, 0f3F0A6770, f152; +fma.rn.f32 f157, f57, 0fBF75A155, f153; +fma.rn.f32 f158, f60, 0f3E903F40, f154; +fma.rn.f32 f159, f58, 0fBF75A155, f155; +fma.rn.f32 f160, f59, 0f3E903F40, f156; +fma.rn.f32 f161, f61, 0f3ED4B147, f157; +fma.rn.f32 f162, f64, 0fBF68DDA4, f158; +fma.rn.f32 f163, f62, 0f3ED4B147, f159; +fma.rn.f32 f164, f63, 0fBF68DDA4, f160; +sub.f32 f165, f161, f162; +add.f32 f166, f164, f163; +add.f32 f167, f162, f161; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f45, 0fBF75A155, %24; +fma.rn.f32 f170, f48, 0f3E903F40, 0f00000000; +fma.rn.f32 f171, f46, 0fBF75A155, %25; +fma.rn.f32 f172, f47, 0f3E903F40, 0f00000000; +fma.rn.f32 f173, f49, 0f3F575C64, f169; +fma.rn.f32 f174, f52, 0fBF0A6770, f170; +fma.rn.f32 f175, f50, 0f3F575C64, f171; +fma.rn.f32 f176, f51, 0fBF0A6770, f172; +fma.rn.f32 f177, f53, 0fBF27A4F4, f173; +fma.rn.f32 f178, f56, 0f3F4178CE, f174; +fma.rn.f32 f179, f54, 0fBF27A4F4, f175; +fma.rn.f32 f180, f55, 0f3F4178CE, f176; +fma.rn.f32 f181, f57, 0f3ED4B147, f177; +fma.rn.f32 f182, f60, 0fBF68DDA4, f178; +fma.rn.f32 f183, f58, 0f3ED4B147, f179; +fma.rn.f32 f184, f59, 0fBF68DDA4, f180; +fma.rn.f32 f185, f61, 0fBE11BAFB, f181; +fma.rn.f32 f186, f64, 0f3F7D64F0, f182; +fma.rn.f32 
f187, f62, 0fBE11BAFB, f183; +fma.rn.f32 f188, f63, 0f3F7D64F0, f184; +sub.f32 f189, f185, f186; +add.f32 f190, f188, f187; +add.f32 f191, f186, f185; +sub.f32 f192, f187, f188; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 968, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f193, f194}, [rd6]; +mul.f32 f197, f94, f194; +mul.f32 f198, f93, f194; +mul.f32 f199, f193, f94; +mul.f32 f200, f193, f193; +mul.f32 f201, f194, f194; +sub.f32 f202, f200, f201; +mul.f32 f203, f194, f193; +fma.rn.f32 f204, f194, f193, f203; +mul.f32 f205, f118, f204; +mul.f32 f206, f117, f204; +mul.f32 f207, f202, f118; +mul.f32 f208, f193, f202; +mul.f32 f209, f194, f204; +sub.f32 f210, f208, f209; +mul.f32 f211, f193, f204; +fma.rn.f32 f212, f194, f202, f211; +mul.f32 f213, f142, f212; +mul.f32 f214, f141, f212; +mul.f32 f215, f210, f142; +mul.f32 f216, f193, f210; +mul.f32 f217, f194, f212; +sub.f32 f218, f216, f217; +mul.f32 f219, f193, f212; +fma.rn.f32 f220, f194, f210, f219; +mul.f32 f221, f166, f220; +mul.f32 f222, f165, f220; +mul.f32 f223, f218, f166; +mul.f32 f224, f193, f218; +mul.f32 f225, f194, f220; +sub.f32 f226, f224, f225; +mul.f32 f227, f193, f220; +fma.rn.f32 f228, f194, f218, f227; +mul.f32 f229, f190, f228; +mul.f32 f230, f189, f228; +mul.f32 f231, f226, f190; +mul.f32 f232, f193, f226; +mul.f32 f233, f194, f228; +sub.f32 f234, f232, f233; +mul.f32 f235, f193, f228; +fma.rn.f32 f236, f194, f226, f235; +mul.f32 f237, f192, f236; +mul.f32 f238, f191, f236; +mul.f32 f239, f234, f192; +mul.f32 f240, f193, f234; +mul.f32 f241, f194, f236; +sub.f32 f242, f240, f241; +mul.f32 f243, f193, f236; +fma.rn.f32 f244, f194, f234, f243; +mul.f32 f245, f168, f244; +mul.f32 f246, f167, f244; +mul.f32 f247, f242, f168; +mul.f32 f248, f193, f242; +mul.f32 f249, f194, f244; +sub.f32 f250, f248, f249; +mul.f32 f251, f193, f244; +fma.rn.f32 f252, 
f194, f242, f251; +mul.f32 f253, f144, f252; +mul.f32 f254, f143, f252; +mul.f32 f255, f250, f144; +mul.f32 f256, f193, f250; +mul.f32 f257, f194, f252; +sub.f32 f258, f256, f257; +mul.f32 f259, f193, f252; +fma.rn.f32 f260, f194, f250, f259; +mul.f32 f261, f120, f260; +mul.f32 f262, f119, f260; +mul.f32 f263, f258, f120; +mul.f32 f264, f193, f258; +mul.f32 f265, f194, f260; +sub.f32 f266, f264, f265; +mul.f32 f267, f193, f260; +fma.rn.f32 f268, f194, f258, f267; +mul.f32 f269, f96, f268; +mul.f32 f270, f95, f268; +mul.f32 f271, f266, f96; +barrier.sync 0; +mad.lo.s32 r9, r7, 88, r8; +add.f32 f272, f72, f62; +add.f32 f273, f71, f61; +st.shared.v2.f32 [r9], {f273, f272}; +fma.rn.f32 f274, f193, f93, f197; +sub.f32 f275, f199, f198; +st.shared.v2.f32 [r9+8], {f274, f275}; +fma.rn.f32 f276, f202, f117, f205; +sub.f32 f277, f207, f206; +st.shared.v2.f32 [r9+16], {f276, f277}; +sub.f32 f278, f215, f214; +fma.rn.f32 f279, f210, f141, f213; +st.shared.v2.f32 [r9+24], {f279, f278}; +fma.rn.f32 f280, f218, f165, f221; +sub.f32 f281, f223, f222; +st.shared.v2.f32 [r9+32], {f280, f281}; +fma.rn.f32 f282, f226, f189, f229; +sub.f32 f283, f231, f230; +st.shared.v2.f32 [r9+40], {f282, f283}; +fma.rn.f32 f284, f234, f191, f237; +sub.f32 f285, f239, f238; +st.shared.v2.f32 [r9+48], {f284, f285}; +fma.rn.f32 f286, f242, f167, f245; +sub.f32 f287, f247, f246; +st.shared.v2.f32 [r9+56], {f286, f287}; +fma.rn.f32 f288, f250, f143, f253; +sub.f32 f289, f255, f254; +st.shared.v2.f32 [r9+64], {f288, f289}; +fma.rn.f32 f290, f258, f119, f261; +sub.f32 f291, f263, f262; +st.shared.v2.f32 [r9+72], {f290, f291}; +fma.rn.f32 f292, f266, f95, f269; +sub.f32 f293, f271, f270; +st.shared.v2.f32 [r9+80], {f292, f293}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f32 {f294, f295}, [r10]; +ld.shared.v2.f32 {f298, f299}, [r10+88]; +ld.shared.v2.f32 {f302, f303}, [r10+176]; +ld.shared.v2.f32 {f306, f307}, [r10+264]; +ld.shared.v2.f32 {f310, f311}, [r10+352]; +ld.shared.v2.f32 {f314, 
f315}, [r10+440]; +ld.shared.v2.f32 {f318, f319}, [r10+528]; +ld.shared.v2.f32 {f322, f323}, [r10+616]; +ld.shared.v2.f32 {f326, f327}, [r10+704]; +ld.shared.v2.f32 {f330, f331}, [r10+792]; +ld.shared.v2.f32 {f334, f335}, [r10+880]; +add.f32 f338, f298, f334; +add.f32 f339, f299, f335; +sub.f32 f340, f298, f334; +sub.f32 f341, f299, f335; +add.f32 f342, f302, f330; +add.f32 f343, f303, f331; +sub.f32 f344, f302, f330; +sub.f32 f345, f303, f331; +add.f32 f346, f306, f326; +add.f32 f347, f307, f327; +sub.f32 f348, f306, f326; +sub.f32 f349, f307, f327; +add.f32 f350, f310, f322; +add.f32 f351, f311, f323; +sub.f32 f352, f310, f322; +sub.f32 f353, f311, f323; +add.f32 f354, f314, f318; +add.f32 f355, f315, f319; +sub.f32 f356, f314, f318; +sub.f32 f357, f315, f319; +add.f32 f358, f294, f338; +add.f32 f359, f295, f339; +add.f32 f360, f358, f342; +add.f32 f361, f359, f343; +add.f32 f362, f360, f346; +add.f32 f363, f361, f347; +add.f32 f364, f362, f350; +add.f32 f365, f363, f351; +fma.rn.f32 f366, f338, 0f3F575C64, f294; +fma.rn.f32 f367, f341, 0f3F0A6770, 0f00000000; +fma.rn.f32 f368, f339, 0f3F575C64, f295; +fma.rn.f32 f369, f340, 0f3F0A6770, 0f00000000; +fma.rn.f32 f370, f342, 0f3ED4B147, f366; +fma.rn.f32 f371, f345, 0f3F68DDA4, f367; +fma.rn.f32 f372, f343, 0f3ED4B147, f368; +fma.rn.f32 f373, f344, 0f3F68DDA4, f369; +fma.rn.f32 f374, f346, 0fBE11BAFB, f370; +fma.rn.f32 f375, f349, 0f3F7D64F0, f371; +fma.rn.f32 f376, f347, 0fBE11BAFB, f372; +fma.rn.f32 f377, f348, 0f3F7D64F0, f373; +fma.rn.f32 f378, f350, 0fBF27A4F4, f374; +fma.rn.f32 f379, f353, 0f3F4178CE, f375; +fma.rn.f32 f380, f351, 0fBF27A4F4, f376; +fma.rn.f32 f381, f352, 0f3F4178CE, f377; +fma.rn.f32 f382, f354, 0fBF75A155, f378; +fma.rn.f32 f383, f357, 0f3E903F40, f379; +fma.rn.f32 f384, f355, 0fBF75A155, f380; +fma.rn.f32 f385, f356, 0f3E903F40, f381; +fma.rn.f32 f386, f338, 0f3ED4B147, f294; +fma.rn.f32 f387, f341, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f388, f339, 0f3ED4B147, f295; +fma.rn.f32 f389, f340, 
0f3F68DDA4, 0f00000000; +fma.rn.f32 f390, f342, 0fBF27A4F4, f386; +fma.rn.f32 f391, f345, 0f3F4178CE, f387; +fma.rn.f32 f392, f343, 0fBF27A4F4, f388; +fma.rn.f32 f393, f344, 0f3F4178CE, f389; +fma.rn.f32 f394, f346, 0fBF75A155, f390; +fma.rn.f32 f395, f349, 0fBE903F40, f391; +fma.rn.f32 f396, f347, 0fBF75A155, f392; +fma.rn.f32 f397, f348, 0fBE903F40, f393; +fma.rn.f32 f398, f350, 0fBE11BAFB, f394; +fma.rn.f32 f399, f353, 0fBF7D64F0, f395; +fma.rn.f32 f400, f351, 0fBE11BAFB, f396; +fma.rn.f32 f401, f352, 0fBF7D64F0, f397; +fma.rn.f32 f402, f354, 0f3F575C64, f398; +fma.rn.f32 f403, f357, 0fBF0A6770, f399; +fma.rn.f32 f404, f355, 0f3F575C64, f400; +fma.rn.f32 f405, f356, 0fBF0A6770, f401; +fma.rn.f32 f406, f338, 0fBE11BAFB, f294; +fma.rn.f32 f407, f341, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f408, f339, 0fBE11BAFB, f295; +fma.rn.f32 f409, f340, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f410, f342, 0fBF75A155, f406; +fma.rn.f32 f411, f345, 0fBE903F40, f407; +fma.rn.f32 f412, f343, 0fBF75A155, f408; +fma.rn.f32 f413, f344, 0fBE903F40, f409; +fma.rn.f32 f414, f346, 0f3ED4B147, f410; +fma.rn.f32 f415, f349, 0fBF68DDA4, f411; +fma.rn.f32 f416, f347, 0f3ED4B147, f412; +fma.rn.f32 f417, f348, 0fBF68DDA4, f413; +fma.rn.f32 f418, f350, 0f3F575C64, f414; +fma.rn.f32 f419, f353, 0f3F0A6770, f415; +fma.rn.f32 f420, f351, 0f3F575C64, f416; +fma.rn.f32 f421, f352, 0f3F0A6770, f417; +fma.rn.f32 f422, f354, 0fBF27A4F4, f418; +fma.rn.f32 f423, f357, 0f3F4178CE, f419; +fma.rn.f32 f424, f355, 0fBF27A4F4, f420; +fma.rn.f32 f425, f356, 0f3F4178CE, f421; +fma.rn.f32 f426, f338, 0fBF27A4F4, f294; +fma.rn.f32 f427, f341, 0f3F4178CE, 0f00000000; +fma.rn.f32 f428, f339, 0fBF27A4F4, f295; +fma.rn.f32 f429, f340, 0f3F4178CE, 0f00000000; +fma.rn.f32 f430, f342, 0fBE11BAFB, f426; +fma.rn.f32 f431, f345, 0fBF7D64F0, f427; +fma.rn.f32 f432, f343, 0fBE11BAFB, f428; +fma.rn.f32 f433, f344, 0fBF7D64F0, f429; +fma.rn.f32 f434, f346, 0f3F575C64, f430; +fma.rn.f32 f435, f349, 0f3F0A6770, f431; +fma.rn.f32 f436, 
f347, 0f3F575C64, f432; +fma.rn.f32 f437, f348, 0f3F0A6770, f433; +fma.rn.f32 f438, f350, 0fBF75A155, f434; +fma.rn.f32 f439, f353, 0f3E903F40, f435; +fma.rn.f32 f440, f351, 0fBF75A155, f436; +fma.rn.f32 f441, f352, 0f3E903F40, f437; +fma.rn.f32 f442, f354, 0f3ED4B147, f438; +fma.rn.f32 f443, f357, 0fBF68DDA4, f439; +fma.rn.f32 f444, f355, 0f3ED4B147, f440; +fma.rn.f32 f445, f356, 0fBF68DDA4, f441; +fma.rn.f32 f446, f338, 0fBF75A155, f294; +fma.rn.f32 f447, f341, 0f3E903F40, 0f00000000; +fma.rn.f32 f448, f339, 0fBF75A155, f295; +fma.rn.f32 f449, f340, 0f3E903F40, 0f00000000; +fma.rn.f32 f450, f342, 0f3F575C64, f446; +fma.rn.f32 f451, f345, 0fBF0A6770, f447; +fma.rn.f32 f452, f343, 0f3F575C64, f448; +fma.rn.f32 f453, f344, 0fBF0A6770, f449; +fma.rn.f32 f454, f346, 0fBF27A4F4, f450; +fma.rn.f32 f455, f349, 0f3F4178CE, f451; +fma.rn.f32 f456, f347, 0fBF27A4F4, f452; +fma.rn.f32 f457, f348, 0f3F4178CE, f453; +fma.rn.f32 f458, f350, 0f3ED4B147, f454; +fma.rn.f32 f459, f353, 0fBF68DDA4, f455; +fma.rn.f32 f460, f351, 0f3ED4B147, f456; +fma.rn.f32 f461, f352, 0fBF68DDA4, f457; +fma.rn.f32 f462, f354, 0fBE11BAFB, f458; +fma.rn.f32 f463, f357, 0f3F7D64F0, f459; +fma.rn.f32 f464, f355, 0fBE11BAFB, f460; +fma.rn.f32 f465, f356, 0f3F7D64F0, f461; +add.f32 %1, f365, f355; +add.f32 %0, f364, f354; +add.f32 %3, f385, f384; +sub.f32 %2, f382, f383; +add.f32 %5, f405, f404; +sub.f32 %4, f402, f403; +add.f32 %7, f425, f424; +sub.f32 %6, f422, f423; +add.f32 %9, f445, f444; +sub.f32 %8, f442, f443; +add.f32 %11, f465, f464; +sub.f32 %10, f462, f463; +sub.f32 %13, f464, f465; +add.f32 %12, f463, f462; +sub.f32 %15, f444, f445; +add.f32 %14, f443, f442; +sub.f32 %17, f424, f425; +add.f32 %16, f423, f422; +sub.f32 %19, f404, f405; +add.f32 %18, f403, f402; +sub.f32 %21, f384, f385; +add.f32 %20, f383, f382; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), 
"=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<382, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<466>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 484, r2; +add.f32 f45, %26, %50; +add.f32 f46, %28, %51; +sub.f32 f47, %26, %50; +sub.f32 f48, %28, %51; +add.f32 f49, %29, %48; +add.f32 f50, %31, %49; +sub.f32 f51, %29, %48; +sub.f32 f52, %31, %49; +add.f32 f53, %32, %45; +add.f32 f54, %33, %47; +sub.f32 f55, %32, %45; +sub.f32 f56, %33, %47; +add.f32 f57, %34, %42; +add.f32 f58, %36, %44; +sub.f32 f59, %34, %42; +sub.f32 f60, %36, %44; +add.f32 f61, %37, %40; +add.f32 f62, %39, %41; +sub.f32 f63, %37, %40; +sub.f32 f64, %39, %41; +mov.u32 r4, %tid.x; +add.f32 f65, %24, f45; +add.f32 f66, %25, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +add.f32 f73, f71, f61; +add.f32 f74, f72, f62; +fma.rn.f32 f75, f45, 0f3F575C64, %24; +fma.rn.f32 f76, f48, 0f3F0A6770, 0f00000000; +fma.rn.f32 f77, f46, 0f3F575C64, %25; +fma.rn.f32 f78, f47, 0f3F0A6770, 0f00000000; +fma.rn.f32 f79, f49, 0f3ED4B147, f75; +fma.rn.f32 f80, f52, 0f3F68DDA4, f76; +fma.rn.f32 f81, f50, 0f3ED4B147, 
f77; +fma.rn.f32 f82, f51, 0f3F68DDA4, f78; +fma.rn.f32 f83, f53, 0fBE11BAFB, f79; +fma.rn.f32 f84, f56, 0f3F7D64F0, f80; +fma.rn.f32 f85, f54, 0fBE11BAFB, f81; +fma.rn.f32 f86, f55, 0f3F7D64F0, f82; +fma.rn.f32 f87, f57, 0fBF27A4F4, f83; +fma.rn.f32 f88, f60, 0f3F4178CE, f84; +fma.rn.f32 f89, f58, 0fBF27A4F4, f85; +fma.rn.f32 f90, f59, 0f3F4178CE, f86; +fma.rn.f32 f91, f61, 0fBF75A155, f87; +fma.rn.f32 f92, f64, 0f3E903F40, f88; +fma.rn.f32 f93, f62, 0fBF75A155, f89; +fma.rn.f32 f94, f63, 0f3E903F40, f90; +sub.f32 f95, f91, f92; +add.f32 f96, f94, f93; +add.f32 f97, f92, f91; +sub.f32 f98, f93, f94; +fma.rn.f32 f99, f45, 0f3ED4B147, %24; +fma.rn.f32 f100, f48, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f101, f46, 0f3ED4B147, %25; +fma.rn.f32 f102, f47, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f103, f49, 0fBF27A4F4, f99; +fma.rn.f32 f104, f52, 0f3F4178CE, f100; +fma.rn.f32 f105, f50, 0fBF27A4F4, f101; +fma.rn.f32 f106, f51, 0f3F4178CE, f102; +fma.rn.f32 f107, f53, 0fBF75A155, f103; +fma.rn.f32 f108, f56, 0fBE903F40, f104; +fma.rn.f32 f109, f54, 0fBF75A155, f105; +fma.rn.f32 f110, f55, 0fBE903F40, f106; +fma.rn.f32 f111, f57, 0fBE11BAFB, f107; +fma.rn.f32 f112, f60, 0fBF7D64F0, f108; +fma.rn.f32 f113, f58, 0fBE11BAFB, f109; +fma.rn.f32 f114, f59, 0fBF7D64F0, f110; +fma.rn.f32 f115, f61, 0f3F575C64, f111; +fma.rn.f32 f116, f64, 0fBF0A6770, f112; +fma.rn.f32 f117, f62, 0f3F575C64, f113; +fma.rn.f32 f118, f63, 0fBF0A6770, f114; +sub.f32 f119, f115, f116; +add.f32 f120, f118, f117; +add.f32 f121, f116, f115; +sub.f32 f122, f117, f118; +fma.rn.f32 f123, f45, 0fBE11BAFB, %24; +fma.rn.f32 f124, f48, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f125, f46, 0fBE11BAFB, %25; +fma.rn.f32 f126, f47, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f127, f49, 0fBF75A155, f123; +fma.rn.f32 f128, f52, 0fBE903F40, f124; +fma.rn.f32 f129, f50, 0fBF75A155, f125; +fma.rn.f32 f130, f51, 0fBE903F40, f126; +fma.rn.f32 f131, f53, 0f3ED4B147, f127; +fma.rn.f32 f132, f56, 0fBF68DDA4, f128; +fma.rn.f32 f133, f54, 0f3ED4B147, 
f129; +fma.rn.f32 f134, f55, 0fBF68DDA4, f130; +fma.rn.f32 f135, f57, 0f3F575C64, f131; +fma.rn.f32 f136, f60, 0f3F0A6770, f132; +fma.rn.f32 f137, f58, 0f3F575C64, f133; +fma.rn.f32 f138, f59, 0f3F0A6770, f134; +fma.rn.f32 f139, f61, 0fBF27A4F4, f135; +fma.rn.f32 f140, f64, 0f3F4178CE, f136; +fma.rn.f32 f141, f62, 0fBF27A4F4, f137; +fma.rn.f32 f142, f63, 0f3F4178CE, f138; +sub.f32 f143, f139, f140; +add.f32 f144, f142, f141; +add.f32 f145, f140, f139; +sub.f32 f146, f141, f142; +fma.rn.f32 f147, f45, 0fBF27A4F4, %24; +fma.rn.f32 f148, f48, 0f3F4178CE, 0f00000000; +fma.rn.f32 f149, f46, 0fBF27A4F4, %25; +fma.rn.f32 f150, f47, 0f3F4178CE, 0f00000000; +fma.rn.f32 f151, f49, 0fBE11BAFB, f147; +fma.rn.f32 f152, f52, 0fBF7D64F0, f148; +fma.rn.f32 f153, f50, 0fBE11BAFB, f149; +fma.rn.f32 f154, f51, 0fBF7D64F0, f150; +fma.rn.f32 f155, f53, 0f3F575C64, f151; +fma.rn.f32 f156, f56, 0f3F0A6770, f152; +fma.rn.f32 f157, f54, 0f3F575C64, f153; +fma.rn.f32 f158, f55, 0f3F0A6770, f154; +fma.rn.f32 f159, f57, 0fBF75A155, f155; +fma.rn.f32 f160, f60, 0f3E903F40, f156; +fma.rn.f32 f161, f58, 0fBF75A155, f157; +fma.rn.f32 f162, f59, 0f3E903F40, f158; +fma.rn.f32 f163, f61, 0f3ED4B147, f159; +fma.rn.f32 f164, f64, 0fBF68DDA4, f160; +fma.rn.f32 f165, f62, 0f3ED4B147, f161; +fma.rn.f32 f166, f63, 0fBF68DDA4, f162; +sub.f32 f167, f163, f164; +add.f32 f168, f166, f165; +add.f32 f169, f164, f163; +sub.f32 f170, f165, f166; +fma.rn.f32 f171, f45, 0fBF75A155, %24; +fma.rn.f32 f172, f48, 0f3E903F40, 0f00000000; +fma.rn.f32 f173, f46, 0fBF75A155, %25; +fma.rn.f32 f174, f47, 0f3E903F40, 0f00000000; +fma.rn.f32 f175, f49, 0f3F575C64, f171; +fma.rn.f32 f176, f52, 0fBF0A6770, f172; +fma.rn.f32 f177, f50, 0f3F575C64, f173; +fma.rn.f32 f178, f51, 0fBF0A6770, f174; +fma.rn.f32 f179, f53, 0fBF27A4F4, f175; +fma.rn.f32 f180, f56, 0f3F4178CE, f176; +fma.rn.f32 f181, f54, 0fBF27A4F4, f177; +fma.rn.f32 f182, f55, 0f3F4178CE, f178; +fma.rn.f32 f183, f57, 0f3ED4B147, f179; +fma.rn.f32 f184, f60, 0fBF68DDA4, 
f180; +fma.rn.f32 f185, f58, 0f3ED4B147, f181; +fma.rn.f32 f186, f59, 0fBF68DDA4, f182; +fma.rn.f32 f187, f61, 0fBE11BAFB, f183; +fma.rn.f32 f188, f64, 0f3F7D64F0, f184; +fma.rn.f32 f189, f62, 0fBE11BAFB, f185; +fma.rn.f32 f190, f63, 0f3F7D64F0, f186; +sub.f32 f191, f187, f188; +add.f32 f192, f190, f189; +add.f32 f193, f188, f187; +sub.f32 f194, f189, f190; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f195, f196}, [rd6]; +mul.f32 f199, f96, f196; +fma.rn.f32 f200, f195, f95, f199; +mul.f32 f201, f95, f196; +mul.f32 f202, f195, f96; +sub.f32 f203, f202, f201; +mul.f32 f204, f195, f195; +mul.f32 f205, f196, f196; +sub.f32 f206, f204, f205; +mul.f32 f207, f196, f195; +fma.rn.f32 f208, f196, f195, f207; +mul.f32 f209, f120, f208; +fma.rn.f32 f210, f206, f119, f209; +mul.f32 f211, f119, f208; +mul.f32 f212, f206, f120; +sub.f32 f213, f212, f211; +mul.f32 f214, f195, f206; +mul.f32 f215, f196, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f195, f208; +fma.rn.f32 f218, f196, f206, f217; +mul.f32 f219, f144, f218; +fma.rn.f32 f220, f216, f143, f219; +mul.f32 f221, f143, f218; +mul.f32 f222, f216, f144; +sub.f32 f223, f222, f221; +mul.f32 f224, f195, f216; +mul.f32 f225, f196, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f195, f218; +fma.rn.f32 f228, f196, f216, f227; +mul.f32 f229, f168, f228; +fma.rn.f32 f230, f226, f167, f229; +mul.f32 f231, f167, f228; +mul.f32 f232, f226, f168; +sub.f32 f233, f232, f231; +mul.f32 f234, f195, f226; +mul.f32 f235, f196, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f195, f228; +fma.rn.f32 f238, f196, f226, f237; +mul.f32 f239, f192, f238; +fma.rn.f32 f240, f236, f191, f239; +mul.f32 f241, f191, f238; +mul.f32 f242, f236, f192; +sub.f32 f243, f242, f241; +mul.f32 f244, f195, f236; +mul.f32 f245, f196, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f195, f238; 
+fma.rn.f32 f248, f196, f236, f247; +mul.f32 f249, f194, f248; +fma.rn.f32 f250, f246, f193, f249; +mul.f32 f251, f193, f248; +mul.f32 f252, f246, f194; +sub.f32 f253, f252, f251; +mul.f32 f254, f195, f246; +mul.f32 f255, f196, f248; +sub.f32 f256, f254, f255; +mul.f32 f257, f195, f248; +fma.rn.f32 f258, f196, f246, f257; +mul.f32 f259, f170, f258; +fma.rn.f32 f260, f256, f169, f259; +mul.f32 f261, f169, f258; +mul.f32 f262, f256, f170; +sub.f32 f263, f262, f261; +mul.f32 f264, f195, f256; +mul.f32 f265, f196, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f195, f258; +fma.rn.f32 f268, f196, f256, f267; +mul.f32 f269, f146, f268; +fma.rn.f32 f270, f266, f145, f269; +mul.f32 f271, f145, f268; +mul.f32 f272, f266, f146; +sub.f32 f273, f272, f271; +mul.f32 f274, f195, f266; +mul.f32 f275, f196, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f195, f268; +fma.rn.f32 f278, f196, f266, f277; +mul.f32 f279, f122, f278; +fma.rn.f32 f280, f276, f121, f279; +mul.f32 f281, f121, f278; +mul.f32 f282, f276, f122; +sub.f32 f283, f282, f281; +mul.f32 f284, f195, f276; +mul.f32 f285, f196, f278; +sub.f32 f286, f284, f285; +mul.f32 f287, f195, f278; +fma.rn.f32 f288, f196, f276, f287; +mul.f32 f289, f98, f288; +fma.rn.f32 f290, f286, f97, f289; +mul.f32 f291, f97, f288; +mul.f32 f292, f286, f98; +sub.f32 f293, f292, f291; +mad.lo.s32 r8, r5, 484, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 44, r8; +st.shared.f32 [r9], f73; +st.shared.f32 [r9+4], f200; +st.shared.f32 [r9+8], f210; +st.shared.f32 [r9+12], f220; +st.shared.f32 [r9+16], f230; +st.shared.f32 [r9+20], f240; +st.shared.f32 [r9+24], f250; +st.shared.f32 [r9+28], f260; +st.shared.f32 [r9+32], f270; +st.shared.f32 [r9+36], f280; +st.shared.f32 [r9+40], f290; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f32 f294, [r10]; +ld.shared.f32 f295, [r10+44]; +ld.shared.f32 f296, [r10+88]; +ld.shared.f32 f297, [r10+132]; +ld.shared.f32 f298, [r10+176]; +ld.shared.f32 f299, [r10+220]; +ld.shared.f32 f300, [r10+264]; 
+ld.shared.f32 f301, [r10+308]; +ld.shared.f32 f302, [r10+352]; +ld.shared.f32 f303, [r10+396]; +ld.shared.f32 f304, [r10+440]; +barrier.sync 0; +st.shared.f32 [r9], f74; +st.shared.f32 [r9+4], f203; +st.shared.f32 [r9+8], f213; +st.shared.f32 [r9+12], f223; +st.shared.f32 [r9+16], f233; +st.shared.f32 [r9+20], f243; +st.shared.f32 [r9+24], f253; +st.shared.f32 [r9+28], f263; +st.shared.f32 [r9+32], f273; +st.shared.f32 [r9+36], f283; +st.shared.f32 [r9+40], f293; +barrier.sync 0; +ld.shared.f32 f305, [r10]; +ld.shared.f32 f306, [r10+44]; +ld.shared.f32 f307, [r10+88]; +ld.shared.f32 f308, [r10+132]; +ld.shared.f32 f309, [r10+176]; +ld.shared.f32 f310, [r10+220]; +ld.shared.f32 f311, [r10+264]; +ld.shared.f32 f312, [r10+308]; +ld.shared.f32 f313, [r10+352]; +ld.shared.f32 f314, [r10+396]; +ld.shared.f32 f315, [r10+440]; +add.f32 f316, f295, f304; +add.f32 f317, f306, f315; +sub.f32 f318, f295, f304; +sub.f32 f319, f306, f315; +add.f32 f320, f296, f303; +add.f32 f321, f307, f314; +sub.f32 f322, f296, f303; +sub.f32 f323, f307, f314; +add.f32 f324, f297, f302; +add.f32 f325, f308, f313; +sub.f32 f326, f297, f302; +sub.f32 f327, f308, f313; +add.f32 f328, f298, f301; +add.f32 f329, f309, f312; +sub.f32 f330, f298, f301; +sub.f32 f331, f309, f312; +add.f32 f332, f299, f300; +add.f32 f333, f310, f311; +sub.f32 f334, f299, f300; +sub.f32 f335, f310, f311; +add.f32 f336, f294, f316; +add.f32 f337, f305, f317; +add.f32 f338, f336, f320; +add.f32 f339, f337, f321; +add.f32 f340, f338, f324; +add.f32 f341, f339, f325; +add.f32 f342, f340, f328; +add.f32 f343, f341, f329; +fma.rn.f32 f344, f316, 0f3F575C64, f294; +fma.rn.f32 f345, f319, 0f3F0A6770, 0f00000000; +fma.rn.f32 f346, f317, 0f3F575C64, f305; +fma.rn.f32 f347, f318, 0f3F0A6770, 0f00000000; +fma.rn.f32 f348, f320, 0f3ED4B147, f344; +fma.rn.f32 f349, f323, 0f3F68DDA4, f345; +fma.rn.f32 f350, f321, 0f3ED4B147, f346; +fma.rn.f32 f351, f322, 0f3F68DDA4, f347; +fma.rn.f32 f352, f324, 0fBE11BAFB, f348; +fma.rn.f32 f353, 
f327, 0f3F7D64F0, f349; +fma.rn.f32 f354, f325, 0fBE11BAFB, f350; +fma.rn.f32 f355, f326, 0f3F7D64F0, f351; +fma.rn.f32 f356, f328, 0fBF27A4F4, f352; +fma.rn.f32 f357, f331, 0f3F4178CE, f353; +fma.rn.f32 f358, f329, 0fBF27A4F4, f354; +fma.rn.f32 f359, f330, 0f3F4178CE, f355; +fma.rn.f32 f360, f332, 0fBF75A155, f356; +fma.rn.f32 f361, f335, 0f3E903F40, f357; +fma.rn.f32 f362, f333, 0fBF75A155, f358; +fma.rn.f32 f363, f334, 0f3E903F40, f359; +fma.rn.f32 f364, f316, 0f3ED4B147, f294; +fma.rn.f32 f365, f319, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f366, f317, 0f3ED4B147, f305; +fma.rn.f32 f367, f318, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f368, f320, 0fBF27A4F4, f364; +fma.rn.f32 f369, f323, 0f3F4178CE, f365; +fma.rn.f32 f370, f321, 0fBF27A4F4, f366; +fma.rn.f32 f371, f322, 0f3F4178CE, f367; +fma.rn.f32 f372, f324, 0fBF75A155, f368; +fma.rn.f32 f373, f327, 0fBE903F40, f369; +fma.rn.f32 f374, f325, 0fBF75A155, f370; +fma.rn.f32 f375, f326, 0fBE903F40, f371; +fma.rn.f32 f376, f328, 0fBE11BAFB, f372; +fma.rn.f32 f377, f331, 0fBF7D64F0, f373; +fma.rn.f32 f378, f329, 0fBE11BAFB, f374; +fma.rn.f32 f379, f330, 0fBF7D64F0, f375; +fma.rn.f32 f380, f332, 0f3F575C64, f376; +fma.rn.f32 f381, f335, 0fBF0A6770, f377; +fma.rn.f32 f382, f333, 0f3F575C64, f378; +fma.rn.f32 f383, f334, 0fBF0A6770, f379; +fma.rn.f32 f384, f316, 0fBE11BAFB, f294; +fma.rn.f32 f385, f319, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f386, f317, 0fBE11BAFB, f305; +fma.rn.f32 f387, f318, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f388, f320, 0fBF75A155, f384; +fma.rn.f32 f389, f323, 0fBE903F40, f385; +fma.rn.f32 f390, f321, 0fBF75A155, f386; +fma.rn.f32 f391, f322, 0fBE903F40, f387; +fma.rn.f32 f392, f324, 0f3ED4B147, f388; +fma.rn.f32 f393, f327, 0fBF68DDA4, f389; +fma.rn.f32 f394, f325, 0f3ED4B147, f390; +fma.rn.f32 f395, f326, 0fBF68DDA4, f391; +fma.rn.f32 f396, f328, 0f3F575C64, f392; +fma.rn.f32 f397, f331, 0f3F0A6770, f393; +fma.rn.f32 f398, f329, 0f3F575C64, f394; +fma.rn.f32 f399, f330, 0f3F0A6770, f395; +fma.rn.f32 f400, 
f332, 0fBF27A4F4, f396; +fma.rn.f32 f401, f335, 0f3F4178CE, f397; +fma.rn.f32 f402, f333, 0fBF27A4F4, f398; +fma.rn.f32 f403, f334, 0f3F4178CE, f399; +fma.rn.f32 f404, f316, 0fBF27A4F4, f294; +fma.rn.f32 f405, f319, 0f3F4178CE, 0f00000000; +fma.rn.f32 f406, f317, 0fBF27A4F4, f305; +fma.rn.f32 f407, f318, 0f3F4178CE, 0f00000000; +fma.rn.f32 f408, f320, 0fBE11BAFB, f404; +fma.rn.f32 f409, f323, 0fBF7D64F0, f405; +fma.rn.f32 f410, f321, 0fBE11BAFB, f406; +fma.rn.f32 f411, f322, 0fBF7D64F0, f407; +fma.rn.f32 f412, f324, 0f3F575C64, f408; +fma.rn.f32 f413, f327, 0f3F0A6770, f409; +fma.rn.f32 f414, f325, 0f3F575C64, f410; +fma.rn.f32 f415, f326, 0f3F0A6770, f411; +fma.rn.f32 f416, f328, 0fBF75A155, f412; +fma.rn.f32 f417, f331, 0f3E903F40, f413; +fma.rn.f32 f418, f329, 0fBF75A155, f414; +fma.rn.f32 f419, f330, 0f3E903F40, f415; +fma.rn.f32 f420, f332, 0f3ED4B147, f416; +fma.rn.f32 f421, f335, 0fBF68DDA4, f417; +fma.rn.f32 f422, f333, 0f3ED4B147, f418; +fma.rn.f32 f423, f334, 0fBF68DDA4, f419; +fma.rn.f32 f424, f316, 0fBF75A155, f294; +fma.rn.f32 f425, f319, 0f3E903F40, 0f00000000; +fma.rn.f32 f426, f317, 0fBF75A155, f305; +fma.rn.f32 f427, f318, 0f3E903F40, 0f00000000; +fma.rn.f32 f428, f320, 0f3F575C64, f424; +fma.rn.f32 f429, f323, 0fBF0A6770, f425; +fma.rn.f32 f430, f321, 0f3F575C64, f426; +fma.rn.f32 f431, f322, 0fBF0A6770, f427; +fma.rn.f32 f432, f324, 0fBF27A4F4, f428; +fma.rn.f32 f433, f327, 0f3F4178CE, f429; +fma.rn.f32 f434, f325, 0fBF27A4F4, f430; +fma.rn.f32 f435, f326, 0f3F4178CE, f431; +fma.rn.f32 f436, f328, 0f3ED4B147, f432; +fma.rn.f32 f437, f331, 0fBF68DDA4, f433; +fma.rn.f32 f438, f329, 0f3ED4B147, f434; +fma.rn.f32 f439, f330, 0fBF68DDA4, f435; +fma.rn.f32 f440, f332, 0fBE11BAFB, f436; +fma.rn.f32 f441, f335, 0f3F7D64F0, f437; +fma.rn.f32 f442, f333, 0fBE11BAFB, f438; +fma.rn.f32 f443, f334, 0f3F7D64F0, f439; +add.f32 %0, f342, f332; +add.f32 %1, f343, f333; +add.f32 %3, f363, f362; +sub.f32 %2, f360, f361; +add.f32 %5, f383, f382; +sub.f32 %4, f380, 
f381; +add.f32 %7, f403, f402; +sub.f32 %6, f400, f401; +add.f32 %9, f423, f422; +sub.f32 %8, f420, f421; +add.f32 %11, f443, f442; +sub.f32 %10, f440, f441; +sub.f32 %13, f442, f443; +add.f32 %12, f441, f440; +sub.f32 %15, f422, f423; +add.f32 %14, f421, f420; +sub.f32 %17, f402, f403; +add.f32 %16, f401, f400; +sub.f32 %19, f382, f383; +add.f32 %18, f381, f380; +sub.f32 %21, f362, f363; +add.f32 %20, f361, f360; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..1d3af31dd5101 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_fwd.hpp.inc @@ -0,0 +1,916 @@ +#ifndef CUFFTDX_FFT_121_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_121_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<555, double, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<465>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 968, r2; +add.f64 fd45, %26, %50; +add.f64 fd46, %28, %51; +sub.f64 fd47, %26, %50; +sub.f64 fd48, %28, %51; +add.f64 fd49, %29, %48; +add.f64 fd50, %31, %49; +sub.f64 fd51, %29, %48; +sub.f64 fd52, %31, %49; +add.f64 fd53, %32, %45; +add.f64 fd54, %33, %47; +sub.f64 fd55, %32, %45; +sub.f64 fd56, %33, %47; +add.f64 fd57, %34, %42; +add.f64 fd58, %36, %44; +sub.f64 fd59, %34, %42; +sub.f64 fd60, %36, %44; +add.f64 fd61, %37, %40; +add.f64 fd62, %39, %41; +sub.f64 fd63, %37, %40; +sub.f64 fd64, %39, %41; +mov.u32 r4, %tid.x; +add.f64 fd65, %24, fd45; +add.f64 fd66, %25, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +add.f64 fd73, fd71, fd61; +add.f64 fd74, fd72, fd62; +fma.rn.f64 fd75, fd45, 0d3FEAEB8C8764F0BA, %24; +fma.rn.f64 fd76, fd48, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd46, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd78, fd47, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd79, fd49, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd52, 0dBFED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd50, 0d3FDA9628D9C712B6, fd77; +fma.rn.f64 fd82, fd51, 0dBFED1BB48EEE2C13, fd78; +fma.rn.f64 fd83, fd53, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd56, 0dBFEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd54, 0dBFC2375F640F44DB, fd81; +fma.rn.f64 fd86, fd55, 0dBFEFAC9E043842EF, fd82; +fma.rn.f64 fd87, fd57, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd60, 0dBFE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd58, 0dBFE4F49E7F775887, fd85; +fma.rn.f64 fd90, fd59, 0dBFE82F19BB3A28A1, fd86; +fma.rn.f64 fd91, fd61, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd64, 0dBFD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd62, 0dBFEEB42A9BCD5057, fd89; +fma.rn.f64 fd94, fd63, 0dBFD207E7FD768DBF, fd90; +sub.f64 fd95, fd91, 
fd92; +add.f64 fd96, fd94, fd93; +add.f64 fd97, fd92, fd91; +sub.f64 fd98, fd93, fd94; +fma.rn.f64 fd99, fd45, 0d3FDA9628D9C712B6, %24; +fma.rn.f64 fd100, fd48, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd46, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd102, fd47, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd103, fd49, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd52, 0dBFE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd50, 0dBFE4F49E7F775887, fd101; +fma.rn.f64 fd106, fd51, 0dBFE82F19BB3A28A1, fd102; +fma.rn.f64 fd107, fd53, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd56, 0d3FD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd54, 0dBFEEB42A9BCD5057, fd105; +fma.rn.f64 fd110, fd55, 0d3FD207E7FD768DBF, fd106; +fma.rn.f64 fd111, fd57, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd60, 0d3FEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd58, 0dBFC2375F640F44DB, fd109; +fma.rn.f64 fd114, fd59, 0d3FEFAC9E043842EF, fd110; +fma.rn.f64 fd115, fd61, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd64, 0d3FE14CEDF8BB580B, fd112; +fma.rn.f64 fd117, fd62, 0d3FEAEB8C8764F0BA, fd113; +fma.rn.f64 fd118, fd63, 0d3FE14CEDF8BB580B, fd114; +sub.f64 fd119, fd115, fd116; +add.f64 fd120, fd118, fd117; +add.f64 fd121, fd116, fd115; +sub.f64 fd122, fd117, fd118; +fma.rn.f64 fd123, fd45, 0dBFC2375F640F44DB, %24; +fma.rn.f64 fd124, fd48, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd46, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd126, fd47, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd127, fd49, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd52, 0d3FD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd50, 0dBFEEB42A9BCD5057, fd125; +fma.rn.f64 fd130, fd51, 0d3FD207E7FD768DBF, fd126; +fma.rn.f64 fd131, fd53, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd56, 0d3FED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd54, 0d3FDA9628D9C712B6, fd129; +fma.rn.f64 fd134, fd55, 0d3FED1BB48EEE2C13, fd130; +fma.rn.f64 fd135, fd57, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd60, 
0dBFE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd58, 0d3FEAEB8C8764F0BA, fd133; +fma.rn.f64 fd138, fd59, 0dBFE14CEDF8BB580B, fd134; +fma.rn.f64 fd139, fd61, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd64, 0dBFE82F19BB3A28A1, fd136; +fma.rn.f64 fd141, fd62, 0dBFE4F49E7F775887, fd137; +fma.rn.f64 fd142, fd63, 0dBFE82F19BB3A28A1, fd138; +sub.f64 fd143, fd139, fd140; +add.f64 fd144, fd142, fd141; +add.f64 fd145, fd140, fd139; +sub.f64 fd146, fd141, fd142; +fma.rn.f64 fd147, fd45, 0dBFE4F49E7F775887, %24; +fma.rn.f64 fd148, fd48, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd46, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd150, fd47, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd151, fd49, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd52, 0d3FEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd50, 0dBFC2375F640F44DB, fd149; +fma.rn.f64 fd154, fd51, 0d3FEFAC9E043842EF, fd150; +fma.rn.f64 fd155, fd53, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd56, 0dBFE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd54, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd55, 0dBFE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd57, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd60, 0dBFD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd58, 0dBFEEB42A9BCD5057, fd157; +fma.rn.f64 fd162, fd59, 0dBFD207E7FD768DBF, fd158; +fma.rn.f64 fd163, fd61, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd64, 0d3FED1BB48EEE2C13, fd160; +fma.rn.f64 fd165, fd62, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd63, 0d3FED1BB48EEE2C13, fd162; +sub.f64 fd167, fd163, fd164; +add.f64 fd168, fd166, fd165; +add.f64 fd169, fd164, fd163; +sub.f64 fd170, fd165, fd166; +fma.rn.f64 fd171, fd45, 0dBFEEB42A9BCD5057, %24; +fma.rn.f64 fd172, fd48, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd46, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd174, fd47, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd175, fd49, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd52, 0d3FE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd50, 
0d3FEAEB8C8764F0BA, fd173; +fma.rn.f64 fd178, fd51, 0d3FE14CEDF8BB580B, fd174; +fma.rn.f64 fd179, fd53, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd56, 0dBFE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd54, 0dBFE4F49E7F775887, fd177; +fma.rn.f64 fd182, fd55, 0dBFE82F19BB3A28A1, fd178; +fma.rn.f64 fd183, fd57, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd60, 0d3FED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd58, 0d3FDA9628D9C712B6, fd181; +fma.rn.f64 fd186, fd59, 0d3FED1BB48EEE2C13, fd182; +fma.rn.f64 fd187, fd61, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd64, 0dBFEFAC9E043842EF, fd184; +fma.rn.f64 fd189, fd62, 0dBFC2375F640F44DB, fd185; +fma.rn.f64 fd190, fd63, 0dBFEFAC9E043842EF, fd186; +sub.f64 fd191, fd187, fd188; +add.f64 fd192, fd190, fd189; +add.f64 fd193, fd188, fd187; +sub.f64 fd194, fd189, fd190; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd195, fd196}, [rd6]; +mul.f64 fd199, fd195, fd95; +mul.f64 fd200, fd196, fd96; +sub.f64 fd201, fd199, fd200; +mul.f64 fd202, fd195, fd96; +fma.rn.f64 fd203, fd196, fd95, fd202; +mul.f64 fd204, fd195, fd195; +mul.f64 fd205, fd196, fd196; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd196, fd195; +fma.rn.f64 fd208, fd196, fd195, fd207; +mul.f64 fd209, fd206, fd119; +mul.f64 fd210, fd208, fd120; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd206, fd120; +fma.rn.f64 fd213, fd208, fd119, fd212; +mul.f64 fd214, fd195, fd206; +mul.f64 fd215, fd196, fd208; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd195, fd208; +fma.rn.f64 fd218, fd196, fd206, fd217; +mul.f64 fd219, fd216, fd143; +mul.f64 fd220, fd218, fd144; +sub.f64 fd221, fd219, fd220; +mul.f64 fd222, fd216, fd144; +fma.rn.f64 fd223, fd218, fd143, fd222; +mul.f64 fd224, fd195, fd216; +mul.f64 fd225, fd196, fd218; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd195, fd218; +fma.rn.f64 fd228, 
fd196, fd216, fd227; +mul.f64 fd229, fd226, fd167; +mul.f64 fd230, fd228, fd168; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd226, fd168; +fma.rn.f64 fd233, fd228, fd167, fd232; +mul.f64 fd234, fd195, fd226; +mul.f64 fd235, fd196, fd228; +sub.f64 fd236, fd234, fd235; +mul.f64 fd237, fd195, fd228; +fma.rn.f64 fd238, fd196, fd226, fd237; +mul.f64 fd239, fd236, fd191; +mul.f64 fd240, fd238, fd192; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd236, fd192; +fma.rn.f64 fd243, fd238, fd191, fd242; +ld.global.v2.f64 {fd244, fd245}, [rd6+176]; +mul.f64 fd248, fd244, fd193; +mul.f64 fd249, fd245, fd194; +sub.f64 fd250, fd248, fd249; +mul.f64 fd251, fd244, fd194; +fma.rn.f64 fd252, fd245, fd193, fd251; +mul.f64 fd253, fd195, fd244; +mul.f64 fd254, fd196, fd245; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd195, fd245; +fma.rn.f64 fd257, fd196, fd244, fd256; +mul.f64 fd258, fd255, fd169; +mul.f64 fd259, fd257, fd170; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd255, fd170; +fma.rn.f64 fd262, fd257, fd169, fd261; +mul.f64 fd263, fd195, fd255; +mul.f64 fd264, fd196, fd257; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd195, fd257; +fma.rn.f64 fd267, fd196, fd255, fd266; +mul.f64 fd268, fd265, fd145; +mul.f64 fd269, fd267, fd146; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd265, fd146; +fma.rn.f64 fd272, fd267, fd145, fd271; +mul.f64 fd273, fd195, fd265; +mul.f64 fd274, fd196, fd267; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd195, fd267; +fma.rn.f64 fd277, fd196, fd265, fd276; +mul.f64 fd278, fd275, fd121; +mul.f64 fd279, fd277, fd122; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd275, fd122; +fma.rn.f64 fd282, fd277, fd121, fd281; +mul.f64 fd283, fd195, fd275; +mul.f64 fd284, fd196, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd195, fd277; +fma.rn.f64 fd287, fd196, fd275, fd286; +mul.f64 fd288, fd285, fd97; +mul.f64 fd289, fd287, fd98; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd285, fd98; +fma.rn.f64 fd292, fd287, fd97, fd291; +mad.lo.s32 r8, 
r5, 968, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 88, r8; +st.shared.f64 [r9], fd73; +st.shared.f64 [r9+8], fd201; +st.shared.f64 [r9+16], fd211; +st.shared.f64 [r9+24], fd221; +st.shared.f64 [r9+32], fd231; +st.shared.f64 [r9+40], fd241; +st.shared.f64 [r9+48], fd250; +st.shared.f64 [r9+56], fd260; +st.shared.f64 [r9+64], fd270; +st.shared.f64 [r9+72], fd280; +st.shared.f64 [r9+80], fd290; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.f64 fd293, [r10]; +ld.shared.f64 fd294, [r10+88]; +ld.shared.f64 fd295, [r10+176]; +ld.shared.f64 fd296, [r10+264]; +ld.shared.f64 fd297, [r10+352]; +ld.shared.f64 fd298, [r10+440]; +ld.shared.f64 fd299, [r10+528]; +ld.shared.f64 fd300, [r10+616]; +ld.shared.f64 fd301, [r10+704]; +ld.shared.f64 fd302, [r10+792]; +ld.shared.f64 fd303, [r10+880]; +barrier.sync 0; +st.shared.f64 [r9], fd74; +st.shared.f64 [r9+8], fd203; +st.shared.f64 [r9+16], fd213; +st.shared.f64 [r9+24], fd223; +st.shared.f64 [r9+32], fd233; +st.shared.f64 [r9+40], fd243; +st.shared.f64 [r9+48], fd252; +st.shared.f64 [r9+56], fd262; +st.shared.f64 [r9+64], fd272; +st.shared.f64 [r9+72], fd282; +st.shared.f64 [r9+80], fd292; +barrier.sync 0; +ld.shared.f64 fd304, [r10]; +ld.shared.f64 fd305, [r10+88]; +ld.shared.f64 fd306, [r10+176]; +ld.shared.f64 fd307, [r10+264]; +ld.shared.f64 fd308, [r10+352]; +ld.shared.f64 fd309, [r10+440]; +ld.shared.f64 fd310, [r10+528]; +ld.shared.f64 fd311, [r10+616]; +ld.shared.f64 fd312, [r10+704]; +ld.shared.f64 fd313, [r10+792]; +ld.shared.f64 fd314, [r10+880]; +add.f64 fd315, fd294, fd303; +add.f64 fd316, fd305, fd314; +sub.f64 fd317, fd294, fd303; +sub.f64 fd318, fd305, fd314; +add.f64 fd319, fd295, fd302; +add.f64 fd320, fd306, fd313; +sub.f64 fd321, fd295, fd302; +sub.f64 fd322, fd306, fd313; +add.f64 fd323, fd296, fd301; +add.f64 fd324, fd307, fd312; +sub.f64 fd325, fd296, fd301; +sub.f64 fd326, fd307, fd312; +add.f64 fd327, fd297, fd300; +add.f64 fd328, fd308, fd311; +sub.f64 fd329, fd297, fd300; +sub.f64 fd330, fd308, 
fd311; +add.f64 fd331, fd298, fd299; +add.f64 fd332, fd309, fd310; +sub.f64 fd333, fd298, fd299; +sub.f64 fd334, fd309, fd310; +add.f64 fd335, fd293, fd315; +add.f64 fd336, fd304, fd316; +add.f64 fd337, fd335, fd319; +add.f64 fd338, fd336, fd320; +add.f64 fd339, fd337, fd323; +add.f64 fd340, fd338, fd324; +add.f64 fd341, fd339, fd327; +add.f64 fd342, fd340, fd328; +fma.rn.f64 fd343, fd315, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd344, fd318, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd345, fd316, 0d3FEAEB8C8764F0BA, fd304; +fma.rn.f64 fd346, fd317, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd347, fd319, 0d3FDA9628D9C712B6, fd343; +fma.rn.f64 fd348, fd322, 0dBFED1BB48EEE2C13, fd344; +fma.rn.f64 fd349, fd320, 0d3FDA9628D9C712B6, fd345; +fma.rn.f64 fd350, fd321, 0dBFED1BB48EEE2C13, fd346; +fma.rn.f64 fd351, fd323, 0dBFC2375F640F44DB, fd347; +fma.rn.f64 fd352, fd326, 0dBFEFAC9E043842EF, fd348; +fma.rn.f64 fd353, fd324, 0dBFC2375F640F44DB, fd349; +fma.rn.f64 fd354, fd325, 0dBFEFAC9E043842EF, fd350; +fma.rn.f64 fd355, fd327, 0dBFE4F49E7F775887, fd351; +fma.rn.f64 fd356, fd330, 0dBFE82F19BB3A28A1, fd352; +fma.rn.f64 fd357, fd328, 0dBFE4F49E7F775887, fd353; +fma.rn.f64 fd358, fd329, 0dBFE82F19BB3A28A1, fd354; +fma.rn.f64 fd359, fd331, 0dBFEEB42A9BCD5057, fd355; +fma.rn.f64 fd360, fd334, 0dBFD207E7FD768DBF, fd356; +fma.rn.f64 fd361, fd332, 0dBFEEB42A9BCD5057, fd357; +fma.rn.f64 fd362, fd333, 0dBFD207E7FD768DBF, fd358; +fma.rn.f64 fd363, fd315, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd364, fd318, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd365, fd316, 0d3FDA9628D9C712B6, fd304; +fma.rn.f64 fd366, fd317, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd367, fd319, 0dBFE4F49E7F775887, fd363; +fma.rn.f64 fd368, fd322, 0dBFE82F19BB3A28A1, fd364; +fma.rn.f64 fd369, fd320, 0dBFE4F49E7F775887, fd365; +fma.rn.f64 fd370, fd321, 0dBFE82F19BB3A28A1, fd366; +fma.rn.f64 fd371, fd323, 0dBFEEB42A9BCD5057, fd367; +fma.rn.f64 fd372, fd326, 
0d3FD207E7FD768DBF, fd368; +fma.rn.f64 fd373, fd324, 0dBFEEB42A9BCD5057, fd369; +fma.rn.f64 fd374, fd325, 0d3FD207E7FD768DBF, fd370; +fma.rn.f64 fd375, fd327, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd330, 0d3FEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd328, 0dBFC2375F640F44DB, fd373; +fma.rn.f64 fd378, fd329, 0d3FEFAC9E043842EF, fd374; +fma.rn.f64 fd379, fd331, 0d3FEAEB8C8764F0BA, fd375; +fma.rn.f64 fd380, fd334, 0d3FE14CEDF8BB580B, fd376; +fma.rn.f64 fd381, fd332, 0d3FEAEB8C8764F0BA, fd377; +fma.rn.f64 fd382, fd333, 0d3FE14CEDF8BB580B, fd378; +fma.rn.f64 fd383, fd315, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd384, fd318, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd385, fd316, 0dBFC2375F640F44DB, fd304; +fma.rn.f64 fd386, fd317, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd387, fd319, 0dBFEEB42A9BCD5057, fd383; +fma.rn.f64 fd388, fd322, 0d3FD207E7FD768DBF, fd384; +fma.rn.f64 fd389, fd320, 0dBFEEB42A9BCD5057, fd385; +fma.rn.f64 fd390, fd321, 0d3FD207E7FD768DBF, fd386; +fma.rn.f64 fd391, fd323, 0d3FDA9628D9C712B6, fd387; +fma.rn.f64 fd392, fd326, 0d3FED1BB48EEE2C13, fd388; +fma.rn.f64 fd393, fd324, 0d3FDA9628D9C712B6, fd389; +fma.rn.f64 fd394, fd325, 0d3FED1BB48EEE2C13, fd390; +fma.rn.f64 fd395, fd327, 0d3FEAEB8C8764F0BA, fd391; +fma.rn.f64 fd396, fd330, 0dBFE14CEDF8BB580B, fd392; +fma.rn.f64 fd397, fd328, 0d3FEAEB8C8764F0BA, fd393; +fma.rn.f64 fd398, fd329, 0dBFE14CEDF8BB580B, fd394; +fma.rn.f64 fd399, fd331, 0dBFE4F49E7F775887, fd395; +fma.rn.f64 fd400, fd334, 0dBFE82F19BB3A28A1, fd396; +fma.rn.f64 fd401, fd332, 0dBFE4F49E7F775887, fd397; +fma.rn.f64 fd402, fd333, 0dBFE82F19BB3A28A1, fd398; +fma.rn.f64 fd403, fd315, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd404, fd318, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd405, fd316, 0dBFE4F49E7F775887, fd304; +fma.rn.f64 fd406, fd317, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd407, fd319, 0dBFC2375F640F44DB, fd403; +fma.rn.f64 fd408, fd322, 0d3FEFAC9E043842EF, fd404; +fma.rn.f64 
fd409, fd320, 0dBFC2375F640F44DB, fd405; +fma.rn.f64 fd410, fd321, 0d3FEFAC9E043842EF, fd406; +fma.rn.f64 fd411, fd323, 0d3FEAEB8C8764F0BA, fd407; +fma.rn.f64 fd412, fd326, 0dBFE14CEDF8BB580B, fd408; +fma.rn.f64 fd413, fd324, 0d3FEAEB8C8764F0BA, fd409; +fma.rn.f64 fd414, fd325, 0dBFE14CEDF8BB580B, fd410; +fma.rn.f64 fd415, fd327, 0dBFEEB42A9BCD5057, fd411; +fma.rn.f64 fd416, fd330, 0dBFD207E7FD768DBF, fd412; +fma.rn.f64 fd417, fd328, 0dBFEEB42A9BCD5057, fd413; +fma.rn.f64 fd418, fd329, 0dBFD207E7FD768DBF, fd414; +fma.rn.f64 fd419, fd331, 0d3FDA9628D9C712B6, fd415; +fma.rn.f64 fd420, fd334, 0d3FED1BB48EEE2C13, fd416; +fma.rn.f64 fd421, fd332, 0d3FDA9628D9C712B6, fd417; +fma.rn.f64 fd422, fd333, 0d3FED1BB48EEE2C13, fd418; +fma.rn.f64 fd423, fd315, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd424, fd318, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd425, fd316, 0dBFEEB42A9BCD5057, fd304; +fma.rn.f64 fd426, fd317, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd427, fd319, 0d3FEAEB8C8764F0BA, fd423; +fma.rn.f64 fd428, fd322, 0d3FE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd320, 0d3FEAEB8C8764F0BA, fd425; +fma.rn.f64 fd430, fd321, 0d3FE14CEDF8BB580B, fd426; +fma.rn.f64 fd431, fd323, 0dBFE4F49E7F775887, fd427; +fma.rn.f64 fd432, fd326, 0dBFE82F19BB3A28A1, fd428; +fma.rn.f64 fd433, fd324, 0dBFE4F49E7F775887, fd429; +fma.rn.f64 fd434, fd325, 0dBFE82F19BB3A28A1, fd430; +fma.rn.f64 fd435, fd327, 0d3FDA9628D9C712B6, fd431; +fma.rn.f64 fd436, fd330, 0d3FED1BB48EEE2C13, fd432; +fma.rn.f64 fd437, fd328, 0d3FDA9628D9C712B6, fd433; +fma.rn.f64 fd438, fd329, 0d3FED1BB48EEE2C13, fd434; +fma.rn.f64 fd439, fd331, 0dBFC2375F640F44DB, fd435; +fma.rn.f64 fd440, fd334, 0dBFEFAC9E043842EF, fd436; +fma.rn.f64 fd441, fd332, 0dBFC2375F640F44DB, fd437; +fma.rn.f64 fd442, fd333, 0dBFEFAC9E043842EF, fd438; +add.f64 %0, fd341, fd331; +add.f64 %1, fd342, fd332; +add.f64 %3, fd362, fd361; +sub.f64 %2, fd359, fd360; +add.f64 %5, fd382, fd381; +sub.f64 %4, fd379, fd380; +add.f64 %7, fd402, 
fd401; +sub.f64 %6, fd399, fd400; +add.f64 %9, fd422, fd421; +sub.f64 %8, fd419, fd420; +add.f64 %11, fd442, fd441; +sub.f64 %10, fd439, fd440; +sub.f64 %13, fd441, fd442; +add.f64 %12, fd440, fd439; +sub.f64 %15, fd421, fd422; +add.f64 %14, fd420, fd419; +sub.f64 %17, fd401, fd402; +add.f64 %16, fd400, fd399; +sub.f64 %19, fd381, fd382; +add.f64 %18, fd380, fd379; +sub.f64 %21, fd361, fd362; +add.f64 %20, fd360, fd359; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<554, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<487>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 1936, r2; +add.f64 fd45, %26, %50; +add.f64 fd46, %28, %51; +sub.f64 fd47, %26, %50; +sub.f64 fd48, %28, %51; +add.f64 fd49, %29, %48; +add.f64 fd50, %31, %49; +sub.f64 fd51, %29, %48; +sub.f64 fd52, %31, %49; +add.f64 fd53, %32, %45; +add.f64 fd54, %33, %47; +sub.f64 fd55, %32, %45; +sub.f64 fd56, %33, %47; +add.f64 fd57, %34, %42; +add.f64 fd58, %36, %44; +sub.f64 fd59, %34, %42; +sub.f64 fd60, %36, %44; +add.f64 fd61, 
%37, %40; +add.f64 fd62, %39, %41; +sub.f64 fd63, %37, %40; +sub.f64 fd64, %39, %41; +mov.u32 r4, %tid.x; +add.f64 fd65, %24, fd45; +add.f64 fd66, %25, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %24; +fma.rn.f64 fd74, fd48, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd76, fd47, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 0dBFED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0dBFED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0dBFEFAC9E043842EF, fd78; +fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0dBFEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, fd60, 0dBFE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0dBFE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0dBFD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0dBFD207E7FD768DBF, fd88; +sub.f64 fd93, fd89, fd90; +add.f64 fd94, fd92, fd91; +add.f64 fd95, fd90, fd89; +sub.f64 fd96, fd91, fd92; +fma.rn.f64 fd97, fd45, 0d3FDA9628D9C712B6, %24; +fma.rn.f64 fd98, fd48, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd99, fd46, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd100, fd47, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd49, 0dBFE4F49E7F775887, fd97; +fma.rn.f64 fd102, fd52, 0dBFE82F19BB3A28A1, fd98; +fma.rn.f64 fd103, fd50, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd51, 0dBFE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd53, 0dBFEEB42A9BCD5057, fd101; +fma.rn.f64 fd106, fd56, 0d3FD207E7FD768DBF, fd102; 
+fma.rn.f64 fd107, fd54, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd55, 0d3FD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd57, 0dBFC2375F640F44DB, fd105; +fma.rn.f64 fd110, fd60, 0d3FEFAC9E043842EF, fd106; +fma.rn.f64 fd111, fd58, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd59, 0d3FEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd61, 0d3FEAEB8C8764F0BA, fd109; +fma.rn.f64 fd114, fd64, 0d3FE14CEDF8BB580B, fd110; +fma.rn.f64 fd115, fd62, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd63, 0d3FE14CEDF8BB580B, fd112; +sub.f64 fd117, fd113, fd114; +add.f64 fd118, fd116, fd115; +add.f64 fd119, fd114, fd113; +sub.f64 fd120, fd115, fd116; +fma.rn.f64 fd121, fd45, 0dBFC2375F640F44DB, %24; +fma.rn.f64 fd122, fd48, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd123, fd46, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd124, fd47, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd49, 0dBFEEB42A9BCD5057, fd121; +fma.rn.f64 fd126, fd52, 0d3FD207E7FD768DBF, fd122; +fma.rn.f64 fd127, fd50, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd51, 0d3FD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd53, 0d3FDA9628D9C712B6, fd125; +fma.rn.f64 fd130, fd56, 0d3FED1BB48EEE2C13, fd126; +fma.rn.f64 fd131, fd54, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd55, 0d3FED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd57, 0d3FEAEB8C8764F0BA, fd129; +fma.rn.f64 fd134, fd60, 0dBFE14CEDF8BB580B, fd130; +fma.rn.f64 fd135, fd58, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd59, 0dBFE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd61, 0dBFE4F49E7F775887, fd133; +fma.rn.f64 fd138, fd64, 0dBFE82F19BB3A28A1, fd134; +fma.rn.f64 fd139, fd62, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd63, 0dBFE82F19BB3A28A1, fd136; +sub.f64 fd141, fd137, fd138; +add.f64 fd142, fd140, fd139; +add.f64 fd143, fd138, fd137; +sub.f64 fd144, fd139, fd140; +fma.rn.f64 fd145, fd45, 0dBFE4F49E7F775887, %24; +fma.rn.f64 fd146, fd48, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd147, fd46, 0dBFE4F49E7F775887, %25; +fma.rn.f64 
fd148, fd47, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd49, 0dBFC2375F640F44DB, fd145; +fma.rn.f64 fd150, fd52, 0d3FEFAC9E043842EF, fd146; +fma.rn.f64 fd151, fd50, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd51, 0d3FEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd53, 0d3FEAEB8C8764F0BA, fd149; +fma.rn.f64 fd154, fd56, 0dBFE14CEDF8BB580B, fd150; +fma.rn.f64 fd155, fd54, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd55, 0dBFE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd57, 0dBFEEB42A9BCD5057, fd153; +fma.rn.f64 fd158, fd60, 0dBFD207E7FD768DBF, fd154; +fma.rn.f64 fd159, fd58, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd59, 0dBFD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd61, 0d3FDA9628D9C712B6, fd157; +fma.rn.f64 fd162, fd64, 0d3FED1BB48EEE2C13, fd158; +fma.rn.f64 fd163, fd62, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd63, 0d3FED1BB48EEE2C13, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd166, fd164, fd163; +add.f64 fd167, fd162, fd161; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 fd169, fd45, 0dBFEEB42A9BCD5057, %24; +fma.rn.f64 fd170, fd48, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd171, fd46, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd172, fd47, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd49, 0d3FEAEB8C8764F0BA, fd169; +fma.rn.f64 fd174, fd52, 0d3FE14CEDF8BB580B, fd170; +fma.rn.f64 fd175, fd50, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd51, 0d3FE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd53, 0dBFE4F49E7F775887, fd173; +fma.rn.f64 fd178, fd56, 0dBFE82F19BB3A28A1, fd174; +fma.rn.f64 fd179, fd54, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd55, 0dBFE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd57, 0d3FDA9628D9C712B6, fd177; +fma.rn.f64 fd182, fd60, 0d3FED1BB48EEE2C13, fd178; +fma.rn.f64 fd183, fd58, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd59, 0d3FED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd61, 0dBFC2375F640F44DB, fd181; +fma.rn.f64 fd186, fd64, 0dBFEFAC9E043842EF, fd182; +fma.rn.f64 fd187, fd62, 
0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd63, 0dBFEFAC9E043842EF, fd184; +sub.f64 fd189, fd185, fd186; +add.f64 fd190, fd188, fd187; +add.f64 fd191, fd186, fd185; +sub.f64 fd192, fd187, fd188; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1936, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd193, fd194}, [rd6]; +mul.f64 fd197, fd193, fd93; +mul.f64 fd198, fd194, fd94; +mul.f64 fd199, fd193, fd94; +mul.f64 fd200, fd193, fd193; +mul.f64 fd201, fd194, fd194; +sub.f64 fd202, fd200, fd201; +mul.f64 fd203, fd194, fd193; +fma.rn.f64 fd204, fd194, fd193, fd203; +mul.f64 fd205, fd202, fd117; +mul.f64 fd206, fd204, fd118; +mul.f64 fd207, fd202, fd118; +mul.f64 fd208, fd193, fd202; +mul.f64 fd209, fd194, fd204; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd193, fd204; +fma.rn.f64 fd212, fd194, fd202, fd211; +mul.f64 fd213, fd210, fd141; +mul.f64 fd214, fd212, fd142; +mul.f64 fd215, fd210, fd142; +mul.f64 fd216, fd193, fd210; +mul.f64 fd217, fd194, fd212; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd193, fd212; +fma.rn.f64 fd220, fd194, fd210, fd219; +mul.f64 fd221, fd218, fd165; +mul.f64 fd222, fd220, fd166; +mul.f64 fd223, fd218, fd166; +mul.f64 fd224, fd193, fd218; +mul.f64 fd225, fd194, fd220; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd193, fd220; +fma.rn.f64 fd228, fd194, fd218, fd227; +mul.f64 fd229, fd226, fd189; +mul.f64 fd230, fd228, fd190; +mul.f64 fd231, fd226, fd190; +ld.global.v2.f64 {fd232, fd233}, [rd6+176]; +mul.f64 fd236, fd232, fd191; +mul.f64 fd237, fd233, fd192; +mul.f64 fd238, fd232, fd192; +mul.f64 fd239, fd193, fd232; +mul.f64 fd240, fd194, fd233; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd193, fd233; +fma.rn.f64 fd243, fd194, fd232, fd242; +mul.f64 fd244, fd241, fd167; +mul.f64 fd245, fd243, fd168; +mul.f64 fd246, fd241, fd168; +mul.f64 fd247, fd193, fd241; +mul.f64 fd248, fd194, fd243; 
+sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd193, fd243; +fma.rn.f64 fd251, fd194, fd241, fd250; +mul.f64 fd252, fd249, fd143; +mul.f64 fd253, fd251, fd144; +mul.f64 fd254, fd249, fd144; +mul.f64 fd255, fd193, fd249; +mul.f64 fd256, fd194, fd251; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd193, fd251; +fma.rn.f64 fd259, fd194, fd249, fd258; +mul.f64 fd260, fd257, fd119; +mul.f64 fd261, fd259, fd120; +mul.f64 fd262, fd257, fd120; +mul.f64 fd263, fd193, fd257; +mul.f64 fd264, fd194, fd259; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd193, fd259; +fma.rn.f64 fd267, fd194, fd257, fd266; +mul.f64 fd268, fd265, fd95; +mul.f64 fd269, fd267, fd96; +mul.f64 fd270, fd265, fd96; +barrier.sync 0; +mad.lo.s32 r9, r7, 176, r8; +add.f64 fd271, fd72, fd62; +add.f64 fd272, fd71, fd61; +st.shared.v2.f64 [r9], {fd272, fd271}; +fma.rn.f64 fd273, fd194, fd93, fd199; +sub.f64 fd274, fd197, fd198; +st.shared.v2.f64 [r9+16], {fd274, fd273}; +fma.rn.f64 fd275, fd204, fd117, fd207; +sub.f64 fd276, fd205, fd206; +st.shared.v2.f64 [r9+32], {fd276, fd275}; +sub.f64 fd277, fd213, fd214; +fma.rn.f64 fd278, fd212, fd141, fd215; +st.shared.v2.f64 [r9+48], {fd277, fd278}; +fma.rn.f64 fd279, fd220, fd165, fd223; +sub.f64 fd280, fd221, fd222; +st.shared.v2.f64 [r9+64], {fd280, fd279}; +fma.rn.f64 fd281, fd228, fd189, fd231; +sub.f64 fd282, fd229, fd230; +st.shared.v2.f64 [r9+80], {fd282, fd281}; +fma.rn.f64 fd283, fd233, fd191, fd238; +sub.f64 fd284, fd236, fd237; +st.shared.v2.f64 [r9+96], {fd284, fd283}; +fma.rn.f64 fd285, fd243, fd167, fd246; +sub.f64 fd286, fd244, fd245; +st.shared.v2.f64 [r9+112], {fd286, fd285}; +sub.f64 fd287, fd252, fd253; +fma.rn.f64 fd288, fd251, fd143, fd254; +st.shared.v2.f64 [r9+128], {fd287, fd288}; +fma.rn.f64 fd289, fd259, fd119, fd262; +sub.f64 fd290, fd260, fd261; +st.shared.v2.f64 [r9+144], {fd290, fd289}; +fma.rn.f64 fd291, fd267, fd95, fd270; +sub.f64 fd292, fd268, fd269; +st.shared.v2.f64 [r9+160], {fd292, fd291}; +barrier.sync 0; +mad.lo.s32 r10, r7, 
-160, r9; +ld.shared.v2.f64 {fd293, fd294}, [r10]; +ld.shared.v2.f64 {fd297, fd298}, [r10+176]; +ld.shared.v2.f64 {fd301, fd302}, [r10+352]; +ld.shared.v2.f64 {fd305, fd306}, [r10+528]; +ld.shared.v2.f64 {fd309, fd310}, [r10+704]; +ld.shared.v2.f64 {fd313, fd314}, [r10+880]; +ld.shared.v2.f64 {fd317, fd318}, [r10+1056]; +ld.shared.v2.f64 {fd321, fd322}, [r10+1232]; +ld.shared.v2.f64 {fd325, fd326}, [r10+1408]; +ld.shared.v2.f64 {fd329, fd330}, [r10+1584]; +ld.shared.v2.f64 {fd333, fd334}, [r10+1760]; +add.f64 fd337, fd297, fd333; +add.f64 fd338, fd298, fd334; +sub.f64 fd339, fd297, fd333; +sub.f64 fd340, fd298, fd334; +add.f64 fd341, fd301, fd329; +add.f64 fd342, fd302, fd330; +sub.f64 fd343, fd301, fd329; +sub.f64 fd344, fd302, fd330; +add.f64 fd345, fd305, fd325; +add.f64 fd346, fd306, fd326; +sub.f64 fd347, fd305, fd325; +sub.f64 fd348, fd306, fd326; +add.f64 fd349, fd309, fd321; +add.f64 fd350, fd310, fd322; +sub.f64 fd351, fd309, fd321; +sub.f64 fd352, fd310, fd322; +add.f64 fd353, fd313, fd317; +add.f64 fd354, fd314, fd318; +sub.f64 fd355, fd313, fd317; +sub.f64 fd356, fd314, fd318; +add.f64 fd357, fd293, fd337; +add.f64 fd358, fd294, fd338; +add.f64 fd359, fd357, fd341; +add.f64 fd360, fd358, fd342; +add.f64 fd361, fd359, fd345; +add.f64 fd362, fd360, fd346; +add.f64 fd363, fd361, fd349; +add.f64 fd364, fd362, fd350; +fma.rn.f64 fd365, fd337, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd366, fd340, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd367, fd338, 0d3FEAEB8C8764F0BA, fd294; +fma.rn.f64 fd368, fd339, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd369, fd341, 0d3FDA9628D9C712B6, fd365; +fma.rn.f64 fd370, fd344, 0dBFED1BB48EEE2C13, fd366; +fma.rn.f64 fd371, fd342, 0d3FDA9628D9C712B6, fd367; +fma.rn.f64 fd372, fd343, 0dBFED1BB48EEE2C13, fd368; +fma.rn.f64 fd373, fd345, 0dBFC2375F640F44DB, fd369; +fma.rn.f64 fd374, fd348, 0dBFEFAC9E043842EF, fd370; +fma.rn.f64 fd375, fd346, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd347, 
0dBFEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd349, 0dBFE4F49E7F775887, fd373; +fma.rn.f64 fd378, fd352, 0dBFE82F19BB3A28A1, fd374; +fma.rn.f64 fd379, fd350, 0dBFE4F49E7F775887, fd375; +fma.rn.f64 fd380, fd351, 0dBFE82F19BB3A28A1, fd376; +fma.rn.f64 fd381, fd353, 0dBFEEB42A9BCD5057, fd377; +fma.rn.f64 fd382, fd356, 0dBFD207E7FD768DBF, fd378; +fma.rn.f64 fd383, fd354, 0dBFEEB42A9BCD5057, fd379; +fma.rn.f64 fd384, fd355, 0dBFD207E7FD768DBF, fd380; +fma.rn.f64 fd385, fd337, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd386, fd340, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd387, fd338, 0d3FDA9628D9C712B6, fd294; +fma.rn.f64 fd388, fd339, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd389, fd341, 0dBFE4F49E7F775887, fd385; +fma.rn.f64 fd390, fd344, 0dBFE82F19BB3A28A1, fd386; +fma.rn.f64 fd391, fd342, 0dBFE4F49E7F775887, fd387; +fma.rn.f64 fd392, fd343, 0dBFE82F19BB3A28A1, fd388; +fma.rn.f64 fd393, fd345, 0dBFEEB42A9BCD5057, fd389; +fma.rn.f64 fd394, fd348, 0d3FD207E7FD768DBF, fd390; +fma.rn.f64 fd395, fd346, 0dBFEEB42A9BCD5057, fd391; +fma.rn.f64 fd396, fd347, 0d3FD207E7FD768DBF, fd392; +fma.rn.f64 fd397, fd349, 0dBFC2375F640F44DB, fd393; +fma.rn.f64 fd398, fd352, 0d3FEFAC9E043842EF, fd394; +fma.rn.f64 fd399, fd350, 0dBFC2375F640F44DB, fd395; +fma.rn.f64 fd400, fd351, 0d3FEFAC9E043842EF, fd396; +fma.rn.f64 fd401, fd353, 0d3FEAEB8C8764F0BA, fd397; +fma.rn.f64 fd402, fd356, 0d3FE14CEDF8BB580B, fd398; +fma.rn.f64 fd403, fd354, 0d3FEAEB8C8764F0BA, fd399; +fma.rn.f64 fd404, fd355, 0d3FE14CEDF8BB580B, fd400; +fma.rn.f64 fd405, fd337, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd406, fd340, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd407, fd338, 0dBFC2375F640F44DB, fd294; +fma.rn.f64 fd408, fd339, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd409, fd341, 0dBFEEB42A9BCD5057, fd405; +fma.rn.f64 fd410, fd344, 0d3FD207E7FD768DBF, fd406; +fma.rn.f64 fd411, fd342, 0dBFEEB42A9BCD5057, fd407; +fma.rn.f64 fd412, fd343, 0d3FD207E7FD768DBF, fd408; +fma.rn.f64 
fd413, fd345, 0d3FDA9628D9C712B6, fd409; +fma.rn.f64 fd414, fd348, 0d3FED1BB48EEE2C13, fd410; +fma.rn.f64 fd415, fd346, 0d3FDA9628D9C712B6, fd411; +fma.rn.f64 fd416, fd347, 0d3FED1BB48EEE2C13, fd412; +fma.rn.f64 fd417, fd349, 0d3FEAEB8C8764F0BA, fd413; +fma.rn.f64 fd418, fd352, 0dBFE14CEDF8BB580B, fd414; +fma.rn.f64 fd419, fd350, 0d3FEAEB8C8764F0BA, fd415; +fma.rn.f64 fd420, fd351, 0dBFE14CEDF8BB580B, fd416; +fma.rn.f64 fd421, fd353, 0dBFE4F49E7F775887, fd417; +fma.rn.f64 fd422, fd356, 0dBFE82F19BB3A28A1, fd418; +fma.rn.f64 fd423, fd354, 0dBFE4F49E7F775887, fd419; +fma.rn.f64 fd424, fd355, 0dBFE82F19BB3A28A1, fd420; +fma.rn.f64 fd425, fd337, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd426, fd340, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd427, fd338, 0dBFE4F49E7F775887, fd294; +fma.rn.f64 fd428, fd339, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd429, fd341, 0dBFC2375F640F44DB, fd425; +fma.rn.f64 fd430, fd344, 0d3FEFAC9E043842EF, fd426; +fma.rn.f64 fd431, fd342, 0dBFC2375F640F44DB, fd427; +fma.rn.f64 fd432, fd343, 0d3FEFAC9E043842EF, fd428; +fma.rn.f64 fd433, fd345, 0d3FEAEB8C8764F0BA, fd429; +fma.rn.f64 fd434, fd348, 0dBFE14CEDF8BB580B, fd430; +fma.rn.f64 fd435, fd346, 0d3FEAEB8C8764F0BA, fd431; +fma.rn.f64 fd436, fd347, 0dBFE14CEDF8BB580B, fd432; +fma.rn.f64 fd437, fd349, 0dBFEEB42A9BCD5057, fd433; +fma.rn.f64 fd438, fd352, 0dBFD207E7FD768DBF, fd434; +fma.rn.f64 fd439, fd350, 0dBFEEB42A9BCD5057, fd435; +fma.rn.f64 fd440, fd351, 0dBFD207E7FD768DBF, fd436; +fma.rn.f64 fd441, fd353, 0d3FDA9628D9C712B6, fd437; +fma.rn.f64 fd442, fd356, 0d3FED1BB48EEE2C13, fd438; +fma.rn.f64 fd443, fd354, 0d3FDA9628D9C712B6, fd439; +fma.rn.f64 fd444, fd355, 0d3FED1BB48EEE2C13, fd440; +fma.rn.f64 fd445, fd337, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd446, fd340, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd447, fd338, 0dBFEEB42A9BCD5057, fd294; +fma.rn.f64 fd448, fd339, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd449, fd341, 0d3FEAEB8C8764F0BA, 
fd445; +fma.rn.f64 fd450, fd344, 0d3FE14CEDF8BB580B, fd446; +fma.rn.f64 fd451, fd342, 0d3FEAEB8C8764F0BA, fd447; +fma.rn.f64 fd452, fd343, 0d3FE14CEDF8BB580B, fd448; +fma.rn.f64 fd453, fd345, 0dBFE4F49E7F775887, fd449; +fma.rn.f64 fd454, fd348, 0dBFE82F19BB3A28A1, fd450; +fma.rn.f64 fd455, fd346, 0dBFE4F49E7F775887, fd451; +fma.rn.f64 fd456, fd347, 0dBFE82F19BB3A28A1, fd452; +fma.rn.f64 fd457, fd349, 0d3FDA9628D9C712B6, fd453; +fma.rn.f64 fd458, fd352, 0d3FED1BB48EEE2C13, fd454; +fma.rn.f64 fd459, fd350, 0d3FDA9628D9C712B6, fd455; +fma.rn.f64 fd460, fd351, 0d3FED1BB48EEE2C13, fd456; +fma.rn.f64 fd461, fd353, 0dBFC2375F640F44DB, fd457; +fma.rn.f64 fd462, fd356, 0dBFEFAC9E043842EF, fd458; +fma.rn.f64 fd463, fd354, 0dBFC2375F640F44DB, fd459; +fma.rn.f64 fd464, fd355, 0dBFEFAC9E043842EF, fd460; +add.f64 %1, fd364, fd354; +add.f64 %0, fd363, fd353; +add.f64 %3, fd384, fd383; +sub.f64 %2, fd381, fd382; +add.f64 %5, fd404, fd403; +sub.f64 %4, fd401, fd402; +add.f64 %7, fd424, fd423; +sub.f64 %6, fd421, fd422; +add.f64 %9, fd444, fd443; +sub.f64 %8, fd441, fd442; +add.f64 %11, fd464, fd463; +sub.f64 %10, fd461, fd462; +sub.f64 %13, fd463, fd464; +add.f64 %12, fd462, fd461; +sub.f64 %15, fd443, fd444; +add.f64 %14, fd442, fd441; +sub.f64 %17, fd423, fd424; +add.f64 %16, fd422, fd421; +sub.f64 %19, fd403, fd404; +add.f64 %18, fd402, fd401; +sub.f64 %21, fd383, fd384; +add.f64 %20, fd382, fd381; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), 
"d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..46700288ad00e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_121_fp64_inv.hpp.inc @@ -0,0 +1,916 @@ +#ifndef CUFFTDX_FFT_121_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_121_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<726, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<465>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 968, r2; +add.f64 fd45, %26, %50; +add.f64 fd46, %28, %51; +sub.f64 fd47, %26, %50; +sub.f64 fd48, %28, %51; +add.f64 fd49, %29, %48; +add.f64 fd50, %31, %49; +sub.f64 fd51, %29, %48; +sub.f64 fd52, %31, %49; +add.f64 fd53, %32, %45; +add.f64 fd54, %33, %47; +sub.f64 fd55, %32, %45; +sub.f64 fd56, %33, %47; +add.f64 fd57, %34, %42; +add.f64 fd58, %36, %44; +sub.f64 fd59, %34, %42; +sub.f64 fd60, %36, %44; +add.f64 fd61, %37, %40; +add.f64 fd62, %39, %41; +sub.f64 fd63, %37, %40; +sub.f64 fd64, %39, %41; +mov.u32 r4, %tid.x; +add.f64 fd65, %24, fd45; +add.f64 fd66, %25, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +add.f64 fd73, fd71, fd61; +add.f64 fd74, fd72, fd62; +fma.rn.f64 fd75, fd45, 0d3FEAEB8C8764F0BA, %24; +fma.rn.f64 
fd76, fd48, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd46, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd78, fd47, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd79, fd49, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd52, 0d3FED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd50, 0d3FDA9628D9C712B6, fd77; +fma.rn.f64 fd82, fd51, 0d3FED1BB48EEE2C13, fd78; +fma.rn.f64 fd83, fd53, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd56, 0d3FEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd54, 0dBFC2375F640F44DB, fd81; +fma.rn.f64 fd86, fd55, 0d3FEFAC9E043842EF, fd82; +fma.rn.f64 fd87, fd57, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd60, 0d3FE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd58, 0dBFE4F49E7F775887, fd85; +fma.rn.f64 fd90, fd59, 0d3FE82F19BB3A28A1, fd86; +fma.rn.f64 fd91, fd61, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd64, 0d3FD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd62, 0dBFEEB42A9BCD5057, fd89; +fma.rn.f64 fd94, fd63, 0d3FD207E7FD768DBF, fd90; +sub.f64 fd95, fd91, fd92; +add.f64 fd96, fd94, fd93; +add.f64 fd97, fd92, fd91; +sub.f64 fd98, fd93, fd94; +fma.rn.f64 fd99, fd45, 0d3FDA9628D9C712B6, %24; +fma.rn.f64 fd100, fd48, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd46, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd102, fd47, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd103, fd49, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd52, 0d3FE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd50, 0dBFE4F49E7F775887, fd101; +fma.rn.f64 fd106, fd51, 0d3FE82F19BB3A28A1, fd102; +fma.rn.f64 fd107, fd53, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd56, 0dBFD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd54, 0dBFEEB42A9BCD5057, fd105; +fma.rn.f64 fd110, fd55, 0dBFD207E7FD768DBF, fd106; +fma.rn.f64 fd111, fd57, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd60, 0dBFEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd58, 0dBFC2375F640F44DB, fd109; +fma.rn.f64 fd114, fd59, 0dBFEFAC9E043842EF, fd110; +fma.rn.f64 fd115, fd61, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, 
fd64, 0dBFE14CEDF8BB580B, fd112; +fma.rn.f64 fd117, fd62, 0d3FEAEB8C8764F0BA, fd113; +fma.rn.f64 fd118, fd63, 0dBFE14CEDF8BB580B, fd114; +sub.f64 fd119, fd115, fd116; +add.f64 fd120, fd118, fd117; +add.f64 fd121, fd116, fd115; +sub.f64 fd122, fd117, fd118; +fma.rn.f64 fd123, fd45, 0dBFC2375F640F44DB, %24; +fma.rn.f64 fd124, fd48, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd46, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd126, fd47, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd127, fd49, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd52, 0dBFD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd50, 0dBFEEB42A9BCD5057, fd125; +fma.rn.f64 fd130, fd51, 0dBFD207E7FD768DBF, fd126; +fma.rn.f64 fd131, fd53, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd56, 0dBFED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd54, 0d3FDA9628D9C712B6, fd129; +fma.rn.f64 fd134, fd55, 0dBFED1BB48EEE2C13, fd130; +fma.rn.f64 fd135, fd57, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd60, 0d3FE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd58, 0d3FEAEB8C8764F0BA, fd133; +fma.rn.f64 fd138, fd59, 0d3FE14CEDF8BB580B, fd134; +fma.rn.f64 fd139, fd61, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd64, 0d3FE82F19BB3A28A1, fd136; +fma.rn.f64 fd141, fd62, 0dBFE4F49E7F775887, fd137; +fma.rn.f64 fd142, fd63, 0d3FE82F19BB3A28A1, fd138; +sub.f64 fd143, fd139, fd140; +add.f64 fd144, fd142, fd141; +add.f64 fd145, fd140, fd139; +sub.f64 fd146, fd141, fd142; +fma.rn.f64 fd147, fd45, 0dBFE4F49E7F775887, %24; +fma.rn.f64 fd148, fd48, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd46, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd150, fd47, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd151, fd49, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd52, 0dBFEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd50, 0dBFC2375F640F44DB, fd149; +fma.rn.f64 fd154, fd51, 0dBFEFAC9E043842EF, fd150; +fma.rn.f64 fd155, fd53, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd56, 0d3FE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, 
fd54, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd55, 0d3FE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd57, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd60, 0d3FD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd58, 0dBFEEB42A9BCD5057, fd157; +fma.rn.f64 fd162, fd59, 0d3FD207E7FD768DBF, fd158; +fma.rn.f64 fd163, fd61, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd64, 0dBFED1BB48EEE2C13, fd160; +fma.rn.f64 fd165, fd62, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd63, 0dBFED1BB48EEE2C13, fd162; +sub.f64 fd167, fd163, fd164; +add.f64 fd168, fd166, fd165; +add.f64 fd169, fd164, fd163; +sub.f64 fd170, fd165, fd166; +fma.rn.f64 fd171, fd45, 0dBFEEB42A9BCD5057, %24; +fma.rn.f64 fd172, fd48, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd46, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd174, fd47, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd175, fd49, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd52, 0dBFE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd50, 0d3FEAEB8C8764F0BA, fd173; +fma.rn.f64 fd178, fd51, 0dBFE14CEDF8BB580B, fd174; +fma.rn.f64 fd179, fd53, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd56, 0d3FE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd54, 0dBFE4F49E7F775887, fd177; +fma.rn.f64 fd182, fd55, 0d3FE82F19BB3A28A1, fd178; +fma.rn.f64 fd183, fd57, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd60, 0dBFED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd58, 0d3FDA9628D9C712B6, fd181; +fma.rn.f64 fd186, fd59, 0dBFED1BB48EEE2C13, fd182; +fma.rn.f64 fd187, fd61, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd64, 0d3FEFAC9E043842EF, fd184; +fma.rn.f64 fd189, fd62, 0dBFC2375F640F44DB, fd185; +fma.rn.f64 fd190, fd63, 0d3FEFAC9E043842EF, fd186; +sub.f64 fd191, fd187, fd188; +add.f64 fd192, fd190, fd189; +add.f64 fd193, fd188, fd187; +sub.f64 fd194, fd189, fd190; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; 
+ld.global.v2.f64 {fd195, fd196}, [rd6]; +mul.f64 fd199, fd96, fd196; +fma.rn.f64 fd200, fd195, fd95, fd199; +mul.f64 fd201, fd95, fd196; +mul.f64 fd202, fd195, fd96; +sub.f64 fd203, fd202, fd201; +mul.f64 fd204, fd195, fd195; +mul.f64 fd205, fd196, fd196; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd196, fd195; +fma.rn.f64 fd208, fd196, fd195, fd207; +mul.f64 fd209, fd120, fd208; +fma.rn.f64 fd210, fd206, fd119, fd209; +mul.f64 fd211, fd119, fd208; +mul.f64 fd212, fd206, fd120; +sub.f64 fd213, fd212, fd211; +mul.f64 fd214, fd195, fd206; +mul.f64 fd215, fd196, fd208; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd195, fd208; +fma.rn.f64 fd218, fd196, fd206, fd217; +mul.f64 fd219, fd144, fd218; +fma.rn.f64 fd220, fd216, fd143, fd219; +mul.f64 fd221, fd143, fd218; +mul.f64 fd222, fd216, fd144; +sub.f64 fd223, fd222, fd221; +mul.f64 fd224, fd195, fd216; +mul.f64 fd225, fd196, fd218; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd195, fd218; +fma.rn.f64 fd228, fd196, fd216, fd227; +mul.f64 fd229, fd168, fd228; +fma.rn.f64 fd230, fd226, fd167, fd229; +mul.f64 fd231, fd167, fd228; +mul.f64 fd232, fd226, fd168; +sub.f64 fd233, fd232, fd231; +mul.f64 fd234, fd195, fd226; +mul.f64 fd235, fd196, fd228; +sub.f64 fd236, fd234, fd235; +mul.f64 fd237, fd195, fd228; +fma.rn.f64 fd238, fd196, fd226, fd237; +mul.f64 fd239, fd192, fd238; +fma.rn.f64 fd240, fd236, fd191, fd239; +mul.f64 fd241, fd191, fd238; +mul.f64 fd242, fd236, fd192; +sub.f64 fd243, fd242, fd241; +ld.global.v2.f64 {fd244, fd245}, [rd6+176]; +mul.f64 fd248, fd194, fd245; +fma.rn.f64 fd249, fd244, fd193, fd248; +mul.f64 fd250, fd193, fd245; +mul.f64 fd251, fd244, fd194; +sub.f64 fd252, fd251, fd250; +mul.f64 fd253, fd195, fd244; +mul.f64 fd254, fd196, fd245; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd195, fd245; +fma.rn.f64 fd257, fd196, fd244, fd256; +mul.f64 fd258, fd170, fd257; +fma.rn.f64 fd259, fd255, fd169, fd258; +mul.f64 fd260, fd169, fd257; +mul.f64 fd261, fd255, fd170; +sub.f64 fd262, fd261, 
fd260; +mul.f64 fd263, fd195, fd255; +mul.f64 fd264, fd196, fd257; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd195, fd257; +fma.rn.f64 fd267, fd196, fd255, fd266; +mul.f64 fd268, fd146, fd267; +fma.rn.f64 fd269, fd265, fd145, fd268; +mul.f64 fd270, fd145, fd267; +mul.f64 fd271, fd265, fd146; +sub.f64 fd272, fd271, fd270; +mul.f64 fd273, fd195, fd265; +mul.f64 fd274, fd196, fd267; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd195, fd267; +fma.rn.f64 fd277, fd196, fd265, fd276; +mul.f64 fd278, fd122, fd277; +fma.rn.f64 fd279, fd275, fd121, fd278; +mul.f64 fd280, fd121, fd277; +mul.f64 fd281, fd275, fd122; +sub.f64 fd282, fd281, fd280; +mul.f64 fd283, fd195, fd275; +mul.f64 fd284, fd196, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd195, fd277; +fma.rn.f64 fd287, fd196, fd275, fd286; +mul.f64 fd288, fd98, fd287; +fma.rn.f64 fd289, fd285, fd97, fd288; +mul.f64 fd290, fd97, fd287; +mul.f64 fd291, fd285, fd98; +sub.f64 fd292, fd291, fd290; +mad.lo.s32 r8, r5, 968, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 88, r8; +st.shared.f64 [r9], fd73; +st.shared.f64 [r9+8], fd200; +st.shared.f64 [r9+16], fd210; +st.shared.f64 [r9+24], fd220; +st.shared.f64 [r9+32], fd230; +st.shared.f64 [r9+40], fd240; +st.shared.f64 [r9+48], fd249; +st.shared.f64 [r9+56], fd259; +st.shared.f64 [r9+64], fd269; +st.shared.f64 [r9+72], fd279; +st.shared.f64 [r9+80], fd289; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.f64 fd293, [r10]; +ld.shared.f64 fd294, [r10+88]; +ld.shared.f64 fd295, [r10+176]; +ld.shared.f64 fd296, [r10+264]; +ld.shared.f64 fd297, [r10+352]; +ld.shared.f64 fd298, [r10+440]; +ld.shared.f64 fd299, [r10+528]; +ld.shared.f64 fd300, [r10+616]; +ld.shared.f64 fd301, [r10+704]; +ld.shared.f64 fd302, [r10+792]; +ld.shared.f64 fd303, [r10+880]; +barrier.sync 0; +st.shared.f64 [r9], fd74; +st.shared.f64 [r9+8], fd203; +st.shared.f64 [r9+16], fd213; +st.shared.f64 [r9+24], fd223; +st.shared.f64 [r9+32], fd233; +st.shared.f64 [r9+40], fd243; +st.shared.f64 [r9+48], 
fd252; +st.shared.f64 [r9+56], fd262; +st.shared.f64 [r9+64], fd272; +st.shared.f64 [r9+72], fd282; +st.shared.f64 [r9+80], fd292; +barrier.sync 0; +ld.shared.f64 fd304, [r10]; +ld.shared.f64 fd305, [r10+88]; +ld.shared.f64 fd306, [r10+176]; +ld.shared.f64 fd307, [r10+264]; +ld.shared.f64 fd308, [r10+352]; +ld.shared.f64 fd309, [r10+440]; +ld.shared.f64 fd310, [r10+528]; +ld.shared.f64 fd311, [r10+616]; +ld.shared.f64 fd312, [r10+704]; +ld.shared.f64 fd313, [r10+792]; +ld.shared.f64 fd314, [r10+880]; +add.f64 fd315, fd294, fd303; +add.f64 fd316, fd305, fd314; +sub.f64 fd317, fd294, fd303; +sub.f64 fd318, fd305, fd314; +add.f64 fd319, fd295, fd302; +add.f64 fd320, fd306, fd313; +sub.f64 fd321, fd295, fd302; +sub.f64 fd322, fd306, fd313; +add.f64 fd323, fd296, fd301; +add.f64 fd324, fd307, fd312; +sub.f64 fd325, fd296, fd301; +sub.f64 fd326, fd307, fd312; +add.f64 fd327, fd297, fd300; +add.f64 fd328, fd308, fd311; +sub.f64 fd329, fd297, fd300; +sub.f64 fd330, fd308, fd311; +add.f64 fd331, fd298, fd299; +add.f64 fd332, fd309, fd310; +sub.f64 fd333, fd298, fd299; +sub.f64 fd334, fd309, fd310; +add.f64 fd335, fd293, fd315; +add.f64 fd336, fd304, fd316; +add.f64 fd337, fd335, fd319; +add.f64 fd338, fd336, fd320; +add.f64 fd339, fd337, fd323; +add.f64 fd340, fd338, fd324; +add.f64 fd341, fd339, fd327; +add.f64 fd342, fd340, fd328; +fma.rn.f64 fd343, fd315, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd344, fd318, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd345, fd316, 0d3FEAEB8C8764F0BA, fd304; +fma.rn.f64 fd346, fd317, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd347, fd319, 0d3FDA9628D9C712B6, fd343; +fma.rn.f64 fd348, fd322, 0d3FED1BB48EEE2C13, fd344; +fma.rn.f64 fd349, fd320, 0d3FDA9628D9C712B6, fd345; +fma.rn.f64 fd350, fd321, 0d3FED1BB48EEE2C13, fd346; +fma.rn.f64 fd351, fd323, 0dBFC2375F640F44DB, fd347; +fma.rn.f64 fd352, fd326, 0d3FEFAC9E043842EF, fd348; +fma.rn.f64 fd353, fd324, 0dBFC2375F640F44DB, fd349; +fma.rn.f64 fd354, fd325, 
0d3FEFAC9E043842EF, fd350; +fma.rn.f64 fd355, fd327, 0dBFE4F49E7F775887, fd351; +fma.rn.f64 fd356, fd330, 0d3FE82F19BB3A28A1, fd352; +fma.rn.f64 fd357, fd328, 0dBFE4F49E7F775887, fd353; +fma.rn.f64 fd358, fd329, 0d3FE82F19BB3A28A1, fd354; +fma.rn.f64 fd359, fd331, 0dBFEEB42A9BCD5057, fd355; +fma.rn.f64 fd360, fd334, 0d3FD207E7FD768DBF, fd356; +fma.rn.f64 fd361, fd332, 0dBFEEB42A9BCD5057, fd357; +fma.rn.f64 fd362, fd333, 0d3FD207E7FD768DBF, fd358; +fma.rn.f64 fd363, fd315, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd364, fd318, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd365, fd316, 0d3FDA9628D9C712B6, fd304; +fma.rn.f64 fd366, fd317, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd367, fd319, 0dBFE4F49E7F775887, fd363; +fma.rn.f64 fd368, fd322, 0d3FE82F19BB3A28A1, fd364; +fma.rn.f64 fd369, fd320, 0dBFE4F49E7F775887, fd365; +fma.rn.f64 fd370, fd321, 0d3FE82F19BB3A28A1, fd366; +fma.rn.f64 fd371, fd323, 0dBFEEB42A9BCD5057, fd367; +fma.rn.f64 fd372, fd326, 0dBFD207E7FD768DBF, fd368; +fma.rn.f64 fd373, fd324, 0dBFEEB42A9BCD5057, fd369; +fma.rn.f64 fd374, fd325, 0dBFD207E7FD768DBF, fd370; +fma.rn.f64 fd375, fd327, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd330, 0dBFEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd328, 0dBFC2375F640F44DB, fd373; +fma.rn.f64 fd378, fd329, 0dBFEFAC9E043842EF, fd374; +fma.rn.f64 fd379, fd331, 0d3FEAEB8C8764F0BA, fd375; +fma.rn.f64 fd380, fd334, 0dBFE14CEDF8BB580B, fd376; +fma.rn.f64 fd381, fd332, 0d3FEAEB8C8764F0BA, fd377; +fma.rn.f64 fd382, fd333, 0dBFE14CEDF8BB580B, fd378; +fma.rn.f64 fd383, fd315, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd384, fd318, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd385, fd316, 0dBFC2375F640F44DB, fd304; +fma.rn.f64 fd386, fd317, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd387, fd319, 0dBFEEB42A9BCD5057, fd383; +fma.rn.f64 fd388, fd322, 0dBFD207E7FD768DBF, fd384; +fma.rn.f64 fd389, fd320, 0dBFEEB42A9BCD5057, fd385; +fma.rn.f64 fd390, fd321, 0dBFD207E7FD768DBF, fd386; +fma.rn.f64 
fd391, fd323, 0d3FDA9628D9C712B6, fd387; +fma.rn.f64 fd392, fd326, 0dBFED1BB48EEE2C13, fd388; +fma.rn.f64 fd393, fd324, 0d3FDA9628D9C712B6, fd389; +fma.rn.f64 fd394, fd325, 0dBFED1BB48EEE2C13, fd390; +fma.rn.f64 fd395, fd327, 0d3FEAEB8C8764F0BA, fd391; +fma.rn.f64 fd396, fd330, 0d3FE14CEDF8BB580B, fd392; +fma.rn.f64 fd397, fd328, 0d3FEAEB8C8764F0BA, fd393; +fma.rn.f64 fd398, fd329, 0d3FE14CEDF8BB580B, fd394; +fma.rn.f64 fd399, fd331, 0dBFE4F49E7F775887, fd395; +fma.rn.f64 fd400, fd334, 0d3FE82F19BB3A28A1, fd396; +fma.rn.f64 fd401, fd332, 0dBFE4F49E7F775887, fd397; +fma.rn.f64 fd402, fd333, 0d3FE82F19BB3A28A1, fd398; +fma.rn.f64 fd403, fd315, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd404, fd318, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd405, fd316, 0dBFE4F49E7F775887, fd304; +fma.rn.f64 fd406, fd317, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd407, fd319, 0dBFC2375F640F44DB, fd403; +fma.rn.f64 fd408, fd322, 0dBFEFAC9E043842EF, fd404; +fma.rn.f64 fd409, fd320, 0dBFC2375F640F44DB, fd405; +fma.rn.f64 fd410, fd321, 0dBFEFAC9E043842EF, fd406; +fma.rn.f64 fd411, fd323, 0d3FEAEB8C8764F0BA, fd407; +fma.rn.f64 fd412, fd326, 0d3FE14CEDF8BB580B, fd408; +fma.rn.f64 fd413, fd324, 0d3FEAEB8C8764F0BA, fd409; +fma.rn.f64 fd414, fd325, 0d3FE14CEDF8BB580B, fd410; +fma.rn.f64 fd415, fd327, 0dBFEEB42A9BCD5057, fd411; +fma.rn.f64 fd416, fd330, 0d3FD207E7FD768DBF, fd412; +fma.rn.f64 fd417, fd328, 0dBFEEB42A9BCD5057, fd413; +fma.rn.f64 fd418, fd329, 0d3FD207E7FD768DBF, fd414; +fma.rn.f64 fd419, fd331, 0d3FDA9628D9C712B6, fd415; +fma.rn.f64 fd420, fd334, 0dBFED1BB48EEE2C13, fd416; +fma.rn.f64 fd421, fd332, 0d3FDA9628D9C712B6, fd417; +fma.rn.f64 fd422, fd333, 0dBFED1BB48EEE2C13, fd418; +fma.rn.f64 fd423, fd315, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd424, fd318, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd425, fd316, 0dBFEEB42A9BCD5057, fd304; +fma.rn.f64 fd426, fd317, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd427, fd319, 0d3FEAEB8C8764F0BA, 
fd423; +fma.rn.f64 fd428, fd322, 0dBFE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd320, 0d3FEAEB8C8764F0BA, fd425; +fma.rn.f64 fd430, fd321, 0dBFE14CEDF8BB580B, fd426; +fma.rn.f64 fd431, fd323, 0dBFE4F49E7F775887, fd427; +fma.rn.f64 fd432, fd326, 0d3FE82F19BB3A28A1, fd428; +fma.rn.f64 fd433, fd324, 0dBFE4F49E7F775887, fd429; +fma.rn.f64 fd434, fd325, 0d3FE82F19BB3A28A1, fd430; +fma.rn.f64 fd435, fd327, 0d3FDA9628D9C712B6, fd431; +fma.rn.f64 fd436, fd330, 0dBFED1BB48EEE2C13, fd432; +fma.rn.f64 fd437, fd328, 0d3FDA9628D9C712B6, fd433; +fma.rn.f64 fd438, fd329, 0dBFED1BB48EEE2C13, fd434; +fma.rn.f64 fd439, fd331, 0dBFC2375F640F44DB, fd435; +fma.rn.f64 fd440, fd334, 0d3FEFAC9E043842EF, fd436; +fma.rn.f64 fd441, fd332, 0dBFC2375F640F44DB, fd437; +fma.rn.f64 fd442, fd333, 0d3FEFAC9E043842EF, fd438; +add.f64 %0, fd341, fd331; +add.f64 %1, fd342, fd332; +add.f64 %3, fd362, fd361; +sub.f64 %2, fd359, fd360; +add.f64 %5, fd382, fd381; +sub.f64 %4, fd379, fd380; +add.f64 %7, fd402, fd401; +sub.f64 %6, fd399, fd400; +add.f64 %9, fd422, fd421; +sub.f64 %8, fd419, fd420; +add.f64 %11, fd442, fd441; +sub.f64 %10, fd439, fd440; +sub.f64 %13, fd441, fd442; +add.f64 %12, fd440, fd439; +sub.f64 %15, fd421, fd422; +add.f64 %14, fd420, fd419; +sub.f64 %17, fd401, fd402; +add.f64 %16, fd400, fd399; +sub.f64 %19, fd381, fd382; +add.f64 %18, fd380, fd379; +sub.f64 %21, fd361, fd362; +add.f64 %20, fd360, fd359; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), 
"d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<725, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<487>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 1936, r2; +add.f64 fd45, %26, %50; +add.f64 fd46, %28, %51; +sub.f64 fd47, %26, %50; +sub.f64 fd48, %28, %51; +add.f64 fd49, %29, %48; +add.f64 fd50, %31, %49; +sub.f64 fd51, %29, %48; +sub.f64 fd52, %31, %49; +add.f64 fd53, %32, %45; +add.f64 fd54, %33, %47; +sub.f64 fd55, %32, %45; +sub.f64 fd56, %33, %47; +add.f64 fd57, %34, %42; +add.f64 fd58, %36, %44; +sub.f64 fd59, %34, %42; +sub.f64 fd60, %36, %44; +add.f64 fd61, %37, %40; +add.f64 fd62, %39, %41; +sub.f64 fd63, %37, %40; +sub.f64 fd64, %39, %41; +mov.u32 r4, %tid.x; +add.f64 fd65, %24, fd45; +add.f64 fd66, %25, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %24; +fma.rn.f64 fd74, fd48, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd76, fd47, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 0d3FED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0d3FED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0d3FEFAC9E043842EF, fd78; +fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0d3FEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, 
fd60, 0d3FE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0d3FE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0d3FD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0d3FD207E7FD768DBF, fd88; +sub.f64 fd93, fd89, fd90; +add.f64 fd94, fd92, fd91; +add.f64 fd95, fd90, fd89; +sub.f64 fd96, fd91, fd92; +fma.rn.f64 fd97, fd45, 0d3FDA9628D9C712B6, %24; +fma.rn.f64 fd98, fd48, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd99, fd46, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd100, fd47, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd49, 0dBFE4F49E7F775887, fd97; +fma.rn.f64 fd102, fd52, 0d3FE82F19BB3A28A1, fd98; +fma.rn.f64 fd103, fd50, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd51, 0d3FE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd53, 0dBFEEB42A9BCD5057, fd101; +fma.rn.f64 fd106, fd56, 0dBFD207E7FD768DBF, fd102; +fma.rn.f64 fd107, fd54, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd55, 0dBFD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd57, 0dBFC2375F640F44DB, fd105; +fma.rn.f64 fd110, fd60, 0dBFEFAC9E043842EF, fd106; +fma.rn.f64 fd111, fd58, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd59, 0dBFEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd61, 0d3FEAEB8C8764F0BA, fd109; +fma.rn.f64 fd114, fd64, 0dBFE14CEDF8BB580B, fd110; +fma.rn.f64 fd115, fd62, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd63, 0dBFE14CEDF8BB580B, fd112; +sub.f64 fd117, fd113, fd114; +add.f64 fd118, fd116, fd115; +add.f64 fd119, fd114, fd113; +sub.f64 fd120, fd115, fd116; +fma.rn.f64 fd121, fd45, 0dBFC2375F640F44DB, %24; +fma.rn.f64 fd122, fd48, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd123, fd46, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd124, fd47, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd49, 0dBFEEB42A9BCD5057, fd121; +fma.rn.f64 fd126, fd52, 0dBFD207E7FD768DBF, fd122; +fma.rn.f64 fd127, fd50, 0dBFEEB42A9BCD5057, 
fd123; +fma.rn.f64 fd128, fd51, 0dBFD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd53, 0d3FDA9628D9C712B6, fd125; +fma.rn.f64 fd130, fd56, 0dBFED1BB48EEE2C13, fd126; +fma.rn.f64 fd131, fd54, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd55, 0dBFED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd57, 0d3FEAEB8C8764F0BA, fd129; +fma.rn.f64 fd134, fd60, 0d3FE14CEDF8BB580B, fd130; +fma.rn.f64 fd135, fd58, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd59, 0d3FE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd61, 0dBFE4F49E7F775887, fd133; +fma.rn.f64 fd138, fd64, 0d3FE82F19BB3A28A1, fd134; +fma.rn.f64 fd139, fd62, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd63, 0d3FE82F19BB3A28A1, fd136; +sub.f64 fd141, fd137, fd138; +add.f64 fd142, fd140, fd139; +add.f64 fd143, fd138, fd137; +sub.f64 fd144, fd139, fd140; +fma.rn.f64 fd145, fd45, 0dBFE4F49E7F775887, %24; +fma.rn.f64 fd146, fd48, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd147, fd46, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd148, fd47, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd49, 0dBFC2375F640F44DB, fd145; +fma.rn.f64 fd150, fd52, 0dBFEFAC9E043842EF, fd146; +fma.rn.f64 fd151, fd50, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd51, 0dBFEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd53, 0d3FEAEB8C8764F0BA, fd149; +fma.rn.f64 fd154, fd56, 0d3FE14CEDF8BB580B, fd150; +fma.rn.f64 fd155, fd54, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd55, 0d3FE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd57, 0dBFEEB42A9BCD5057, fd153; +fma.rn.f64 fd158, fd60, 0d3FD207E7FD768DBF, fd154; +fma.rn.f64 fd159, fd58, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd59, 0d3FD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd61, 0d3FDA9628D9C712B6, fd157; +fma.rn.f64 fd162, fd64, 0dBFED1BB48EEE2C13, fd158; +fma.rn.f64 fd163, fd62, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd63, 0dBFED1BB48EEE2C13, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd166, fd164, fd163; +add.f64 fd167, fd162, fd161; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 
fd169, fd45, 0dBFEEB42A9BCD5057, %24; +fma.rn.f64 fd170, fd48, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd171, fd46, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd172, fd47, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd49, 0d3FEAEB8C8764F0BA, fd169; +fma.rn.f64 fd174, fd52, 0dBFE14CEDF8BB580B, fd170; +fma.rn.f64 fd175, fd50, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd51, 0dBFE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd53, 0dBFE4F49E7F775887, fd173; +fma.rn.f64 fd178, fd56, 0d3FE82F19BB3A28A1, fd174; +fma.rn.f64 fd179, fd54, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd55, 0d3FE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd57, 0d3FDA9628D9C712B6, fd177; +fma.rn.f64 fd182, fd60, 0dBFED1BB48EEE2C13, fd178; +fma.rn.f64 fd183, fd58, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd59, 0dBFED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd61, 0dBFC2375F640F44DB, fd181; +fma.rn.f64 fd186, fd64, 0d3FEFAC9E043842EF, fd182; +fma.rn.f64 fd187, fd62, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd63, 0d3FEFAC9E043842EF, fd184; +sub.f64 fd189, fd185, fd186; +add.f64 fd190, fd188, fd187; +add.f64 fd191, fd186, fd185; +sub.f64 fd192, fd187, fd188; +mul.wide.u32 rd2, r4, -1171354717; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 11; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1936, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd193, fd194}, [rd6]; +mul.f64 fd197, fd94, fd194; +mul.f64 fd198, fd93, fd194; +mul.f64 fd199, fd193, fd94; +mul.f64 fd200, fd193, fd193; +mul.f64 fd201, fd194, fd194; +sub.f64 fd202, fd200, fd201; +mul.f64 fd203, fd194, fd193; +fma.rn.f64 fd204, fd194, fd193, fd203; +mul.f64 fd205, fd118, fd204; +mul.f64 fd206, fd117, fd204; +mul.f64 fd207, fd202, fd118; +mul.f64 fd208, fd193, fd202; +mul.f64 fd209, fd194, fd204; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd193, fd204; +fma.rn.f64 fd212, fd194, fd202, fd211; +mul.f64 fd213, fd142, fd212; +mul.f64 fd214, fd141, fd212; 
+mul.f64 fd215, fd210, fd142; +mul.f64 fd216, fd193, fd210; +mul.f64 fd217, fd194, fd212; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd193, fd212; +fma.rn.f64 fd220, fd194, fd210, fd219; +mul.f64 fd221, fd166, fd220; +mul.f64 fd222, fd165, fd220; +mul.f64 fd223, fd218, fd166; +mul.f64 fd224, fd193, fd218; +mul.f64 fd225, fd194, fd220; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd193, fd220; +fma.rn.f64 fd228, fd194, fd218, fd227; +mul.f64 fd229, fd190, fd228; +mul.f64 fd230, fd189, fd228; +mul.f64 fd231, fd226, fd190; +ld.global.v2.f64 {fd232, fd233}, [rd6+176]; +mul.f64 fd236, fd192, fd233; +mul.f64 fd237, fd191, fd233; +mul.f64 fd238, fd232, fd192; +mul.f64 fd239, fd193, fd232; +mul.f64 fd240, fd194, fd233; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd193, fd233; +fma.rn.f64 fd243, fd194, fd232, fd242; +mul.f64 fd244, fd168, fd243; +mul.f64 fd245, fd167, fd243; +mul.f64 fd246, fd241, fd168; +mul.f64 fd247, fd193, fd241; +mul.f64 fd248, fd194, fd243; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd193, fd243; +fma.rn.f64 fd251, fd194, fd241, fd250; +mul.f64 fd252, fd144, fd251; +mul.f64 fd253, fd143, fd251; +mul.f64 fd254, fd249, fd144; +mul.f64 fd255, fd193, fd249; +mul.f64 fd256, fd194, fd251; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd193, fd251; +fma.rn.f64 fd259, fd194, fd249, fd258; +mul.f64 fd260, fd120, fd259; +mul.f64 fd261, fd119, fd259; +mul.f64 fd262, fd257, fd120; +mul.f64 fd263, fd193, fd257; +mul.f64 fd264, fd194, fd259; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd193, fd259; +fma.rn.f64 fd267, fd194, fd257, fd266; +mul.f64 fd268, fd96, fd267; +mul.f64 fd269, fd95, fd267; +mul.f64 fd270, fd265, fd96; +barrier.sync 0; +mad.lo.s32 r9, r7, 176, r8; +add.f64 fd271, fd72, fd62; +add.f64 fd272, fd71, fd61; +st.shared.v2.f64 [r9], {fd272, fd271}; +fma.rn.f64 fd273, fd193, fd93, fd197; +sub.f64 fd274, fd199, fd198; +st.shared.v2.f64 [r9+16], {fd273, fd274}; +fma.rn.f64 fd275, fd202, fd117, fd205; +sub.f64 fd276, fd207, fd206; 
+st.shared.v2.f64 [r9+32], {fd275, fd276}; +sub.f64 fd277, fd215, fd214; +fma.rn.f64 fd278, fd210, fd141, fd213; +st.shared.v2.f64 [r9+48], {fd278, fd277}; +fma.rn.f64 fd279, fd218, fd165, fd221; +sub.f64 fd280, fd223, fd222; +st.shared.v2.f64 [r9+64], {fd279, fd280}; +fma.rn.f64 fd281, fd226, fd189, fd229; +sub.f64 fd282, fd231, fd230; +st.shared.v2.f64 [r9+80], {fd281, fd282}; +fma.rn.f64 fd283, fd232, fd191, fd236; +sub.f64 fd284, fd238, fd237; +st.shared.v2.f64 [r9+96], {fd283, fd284}; +fma.rn.f64 fd285, fd241, fd167, fd244; +sub.f64 fd286, fd246, fd245; +st.shared.v2.f64 [r9+112], {fd285, fd286}; +sub.f64 fd287, fd254, fd253; +fma.rn.f64 fd288, fd249, fd143, fd252; +st.shared.v2.f64 [r9+128], {fd288, fd287}; +fma.rn.f64 fd289, fd257, fd119, fd260; +sub.f64 fd290, fd262, fd261; +st.shared.v2.f64 [r9+144], {fd289, fd290}; +fma.rn.f64 fd291, fd265, fd95, fd268; +sub.f64 fd292, fd270, fd269; +st.shared.v2.f64 [r9+160], {fd291, fd292}; +barrier.sync 0; +mad.lo.s32 r10, r7, -160, r9; +ld.shared.v2.f64 {fd293, fd294}, [r10]; +ld.shared.v2.f64 {fd297, fd298}, [r10+176]; +ld.shared.v2.f64 {fd301, fd302}, [r10+352]; +ld.shared.v2.f64 {fd305, fd306}, [r10+528]; +ld.shared.v2.f64 {fd309, fd310}, [r10+704]; +ld.shared.v2.f64 {fd313, fd314}, [r10+880]; +ld.shared.v2.f64 {fd317, fd318}, [r10+1056]; +ld.shared.v2.f64 {fd321, fd322}, [r10+1232]; +ld.shared.v2.f64 {fd325, fd326}, [r10+1408]; +ld.shared.v2.f64 {fd329, fd330}, [r10+1584]; +ld.shared.v2.f64 {fd333, fd334}, [r10+1760]; +add.f64 fd337, fd297, fd333; +add.f64 fd338, fd298, fd334; +sub.f64 fd339, fd297, fd333; +sub.f64 fd340, fd298, fd334; +add.f64 fd341, fd301, fd329; +add.f64 fd342, fd302, fd330; +sub.f64 fd343, fd301, fd329; +sub.f64 fd344, fd302, fd330; +add.f64 fd345, fd305, fd325; +add.f64 fd346, fd306, fd326; +sub.f64 fd347, fd305, fd325; +sub.f64 fd348, fd306, fd326; +add.f64 fd349, fd309, fd321; +add.f64 fd350, fd310, fd322; +sub.f64 fd351, fd309, fd321; +sub.f64 fd352, fd310, fd322; +add.f64 fd353, fd313, 
fd317; +add.f64 fd354, fd314, fd318; +sub.f64 fd355, fd313, fd317; +sub.f64 fd356, fd314, fd318; +add.f64 fd357, fd293, fd337; +add.f64 fd358, fd294, fd338; +add.f64 fd359, fd357, fd341; +add.f64 fd360, fd358, fd342; +add.f64 fd361, fd359, fd345; +add.f64 fd362, fd360, fd346; +add.f64 fd363, fd361, fd349; +add.f64 fd364, fd362, fd350; +fma.rn.f64 fd365, fd337, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd366, fd340, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd367, fd338, 0d3FEAEB8C8764F0BA, fd294; +fma.rn.f64 fd368, fd339, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd369, fd341, 0d3FDA9628D9C712B6, fd365; +fma.rn.f64 fd370, fd344, 0d3FED1BB48EEE2C13, fd366; +fma.rn.f64 fd371, fd342, 0d3FDA9628D9C712B6, fd367; +fma.rn.f64 fd372, fd343, 0d3FED1BB48EEE2C13, fd368; +fma.rn.f64 fd373, fd345, 0dBFC2375F640F44DB, fd369; +fma.rn.f64 fd374, fd348, 0d3FEFAC9E043842EF, fd370; +fma.rn.f64 fd375, fd346, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd347, 0d3FEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd349, 0dBFE4F49E7F775887, fd373; +fma.rn.f64 fd378, fd352, 0d3FE82F19BB3A28A1, fd374; +fma.rn.f64 fd379, fd350, 0dBFE4F49E7F775887, fd375; +fma.rn.f64 fd380, fd351, 0d3FE82F19BB3A28A1, fd376; +fma.rn.f64 fd381, fd353, 0dBFEEB42A9BCD5057, fd377; +fma.rn.f64 fd382, fd356, 0d3FD207E7FD768DBF, fd378; +fma.rn.f64 fd383, fd354, 0dBFEEB42A9BCD5057, fd379; +fma.rn.f64 fd384, fd355, 0d3FD207E7FD768DBF, fd380; +fma.rn.f64 fd385, fd337, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd386, fd340, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd387, fd338, 0d3FDA9628D9C712B6, fd294; +fma.rn.f64 fd388, fd339, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd389, fd341, 0dBFE4F49E7F775887, fd385; +fma.rn.f64 fd390, fd344, 0d3FE82F19BB3A28A1, fd386; +fma.rn.f64 fd391, fd342, 0dBFE4F49E7F775887, fd387; +fma.rn.f64 fd392, fd343, 0d3FE82F19BB3A28A1, fd388; +fma.rn.f64 fd393, fd345, 0dBFEEB42A9BCD5057, fd389; +fma.rn.f64 fd394, fd348, 0dBFD207E7FD768DBF, fd390; +fma.rn.f64 fd395, 
fd346, 0dBFEEB42A9BCD5057, fd391; +fma.rn.f64 fd396, fd347, 0dBFD207E7FD768DBF, fd392; +fma.rn.f64 fd397, fd349, 0dBFC2375F640F44DB, fd393; +fma.rn.f64 fd398, fd352, 0dBFEFAC9E043842EF, fd394; +fma.rn.f64 fd399, fd350, 0dBFC2375F640F44DB, fd395; +fma.rn.f64 fd400, fd351, 0dBFEFAC9E043842EF, fd396; +fma.rn.f64 fd401, fd353, 0d3FEAEB8C8764F0BA, fd397; +fma.rn.f64 fd402, fd356, 0dBFE14CEDF8BB580B, fd398; +fma.rn.f64 fd403, fd354, 0d3FEAEB8C8764F0BA, fd399; +fma.rn.f64 fd404, fd355, 0dBFE14CEDF8BB580B, fd400; +fma.rn.f64 fd405, fd337, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd406, fd340, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd407, fd338, 0dBFC2375F640F44DB, fd294; +fma.rn.f64 fd408, fd339, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd409, fd341, 0dBFEEB42A9BCD5057, fd405; +fma.rn.f64 fd410, fd344, 0dBFD207E7FD768DBF, fd406; +fma.rn.f64 fd411, fd342, 0dBFEEB42A9BCD5057, fd407; +fma.rn.f64 fd412, fd343, 0dBFD207E7FD768DBF, fd408; +fma.rn.f64 fd413, fd345, 0d3FDA9628D9C712B6, fd409; +fma.rn.f64 fd414, fd348, 0dBFED1BB48EEE2C13, fd410; +fma.rn.f64 fd415, fd346, 0d3FDA9628D9C712B6, fd411; +fma.rn.f64 fd416, fd347, 0dBFED1BB48EEE2C13, fd412; +fma.rn.f64 fd417, fd349, 0d3FEAEB8C8764F0BA, fd413; +fma.rn.f64 fd418, fd352, 0d3FE14CEDF8BB580B, fd414; +fma.rn.f64 fd419, fd350, 0d3FEAEB8C8764F0BA, fd415; +fma.rn.f64 fd420, fd351, 0d3FE14CEDF8BB580B, fd416; +fma.rn.f64 fd421, fd353, 0dBFE4F49E7F775887, fd417; +fma.rn.f64 fd422, fd356, 0d3FE82F19BB3A28A1, fd418; +fma.rn.f64 fd423, fd354, 0dBFE4F49E7F775887, fd419; +fma.rn.f64 fd424, fd355, 0d3FE82F19BB3A28A1, fd420; +fma.rn.f64 fd425, fd337, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd426, fd340, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd427, fd338, 0dBFE4F49E7F775887, fd294; +fma.rn.f64 fd428, fd339, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd429, fd341, 0dBFC2375F640F44DB, fd425; +fma.rn.f64 fd430, fd344, 0dBFEFAC9E043842EF, fd426; +fma.rn.f64 fd431, fd342, 0dBFC2375F640F44DB, fd427; 
+fma.rn.f64 fd432, fd343, 0dBFEFAC9E043842EF, fd428; +fma.rn.f64 fd433, fd345, 0d3FEAEB8C8764F0BA, fd429; +fma.rn.f64 fd434, fd348, 0d3FE14CEDF8BB580B, fd430; +fma.rn.f64 fd435, fd346, 0d3FEAEB8C8764F0BA, fd431; +fma.rn.f64 fd436, fd347, 0d3FE14CEDF8BB580B, fd432; +fma.rn.f64 fd437, fd349, 0dBFEEB42A9BCD5057, fd433; +fma.rn.f64 fd438, fd352, 0d3FD207E7FD768DBF, fd434; +fma.rn.f64 fd439, fd350, 0dBFEEB42A9BCD5057, fd435; +fma.rn.f64 fd440, fd351, 0d3FD207E7FD768DBF, fd436; +fma.rn.f64 fd441, fd353, 0d3FDA9628D9C712B6, fd437; +fma.rn.f64 fd442, fd356, 0dBFED1BB48EEE2C13, fd438; +fma.rn.f64 fd443, fd354, 0d3FDA9628D9C712B6, fd439; +fma.rn.f64 fd444, fd355, 0dBFED1BB48EEE2C13, fd440; +fma.rn.f64 fd445, fd337, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd446, fd340, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd447, fd338, 0dBFEEB42A9BCD5057, fd294; +fma.rn.f64 fd448, fd339, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd449, fd341, 0d3FEAEB8C8764F0BA, fd445; +fma.rn.f64 fd450, fd344, 0dBFE14CEDF8BB580B, fd446; +fma.rn.f64 fd451, fd342, 0d3FEAEB8C8764F0BA, fd447; +fma.rn.f64 fd452, fd343, 0dBFE14CEDF8BB580B, fd448; +fma.rn.f64 fd453, fd345, 0dBFE4F49E7F775887, fd449; +fma.rn.f64 fd454, fd348, 0d3FE82F19BB3A28A1, fd450; +fma.rn.f64 fd455, fd346, 0dBFE4F49E7F775887, fd451; +fma.rn.f64 fd456, fd347, 0d3FE82F19BB3A28A1, fd452; +fma.rn.f64 fd457, fd349, 0d3FDA9628D9C712B6, fd453; +fma.rn.f64 fd458, fd352, 0dBFED1BB48EEE2C13, fd454; +fma.rn.f64 fd459, fd350, 0d3FDA9628D9C712B6, fd455; +fma.rn.f64 fd460, fd351, 0dBFED1BB48EEE2C13, fd456; +fma.rn.f64 fd461, fd353, 0dBFC2375F640F44DB, fd457; +fma.rn.f64 fd462, fd356, 0d3FEFAC9E043842EF, fd458; +fma.rn.f64 fd463, fd354, 0dBFC2375F640F44DB, fd459; +fma.rn.f64 fd464, fd355, 0d3FEFAC9E043842EF, fd460; +add.f64 %1, fd364, fd354; +add.f64 %0, fd363, fd353; +add.f64 %3, fd384, fd383; +sub.f64 %2, fd381, fd382; +add.f64 %5, fd404, fd403; +sub.f64 %4, fd401, fd402; +add.f64 %7, fd424, fd423; +sub.f64 %6, fd421, fd422; +add.f64 
%9, fd444, fd443; +sub.f64 %8, fd441, fd442; +add.f64 %11, fd464, fd463; +sub.f64 %10, fd461, fd462; +sub.f64 %13, fd463, fd464; +add.f64 %12, fd462, fd461; +sub.f64 %15, fd443, fd444; +add.f64 %14, fd442, fd441; +sub.f64 %17, fd423, fd424; +add.f64 %16, fd422, fd421; +sub.f64 %19, fd403, fd404; +add.f64 %18, fd402, fd401; +sub.f64 %21, fd383, fd384; +add.f64 %20, fd382, fd381; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..0600774d3c5ea --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_fwd.hpp.inc @@ -0,0 +1,17186 @@ +#ifndef CUFFTDX_FFT_125_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_125_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<905, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm 
volatile (R"({ +.reg .f32 f<332>; +.reg .b32 r<6058>; +.reg .b64 rd<4>; +mov.u32 r6056, %tid.y; +mov.u32 r6057, %50; +mad.lo.s32 r5998, r6056, 1000, r6057; +mov.u32 r5999, %tid.x; +mov.f32 f326, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1, {low, high}; +} +mov.f32 f328, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2, {low, high}; +} +mov.f32 f322, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3, {low, high}; +} +mov.f32 f324, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %60, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %60, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %60, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 
r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %60, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, %58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %60, %94; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %60, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %60, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %60, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; 
+} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %60, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ 
+sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 
r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ +sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, 
r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; +} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ +add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} 
+{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ +add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} +{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ +sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ 
+add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} +{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ +add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} +{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ 
+add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ +mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; +} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ +add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ +sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; 
+cvt.rn.f16.f32 high, f328; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %59, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %59, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} +{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %59, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %59, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %59, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, 
%93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, %86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, %77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ +sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1601, 
{low, high}; +} +mov.f32 f64, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1602, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1603, {low, high}; +} +mov.f32 f68, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1604, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1605, {low, high}; +} +mov.f32 f72, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1606, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1607, {low, high}; +} +mov.f32 f76, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1608, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1611, {low, high}; +} +mov.f32 f84, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1612, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1615, {low, high}; +} +mov.f32 f92, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1616, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1617, {low, high}; +} +mov.f32 f96, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1618, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1623, {low, high}; +} +mov.f32 f108, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1631, {low, high}; +} +mov.f32 f124, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; +} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, 
r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ +mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ +fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; 
+cvt.rn.f16.f32 high, f326; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1910, {low, high}; +} +{ +neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ +sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ +sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ 
+mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} +{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; +} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; +} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, 
r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ +add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; +} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, 
r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} +{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, 
r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ +mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, 
r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ +sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ +sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ +add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 
r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ +sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; +} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} +{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; +} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ +add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ +mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ 
+sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, 
r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3185, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; +} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; +} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; 
+} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, 
r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ +add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ +sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r5999, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r6000, rd3; +mul.lo.s32 r6001, r6000, 5; +sub.s32 r6002, r5999, r6001; +cvt.rn.f32.u32 f329, r6002; +mul.f32 f330, f329, 0f3D4DE32E; +cos.approx.f32 f217, f330; +sin.approx.f32 f331, f330; +neg.f32 f218, f331; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, 
r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; 
+} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ +fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3717, {low, high}; +} +{ +mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ +fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 
r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3902, {low, high}; +} +{ +mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4050, {low, high}; +} +{ 
+mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ +fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; 
+mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ +neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, 
r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ +fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ +neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +mad.lo.s32 r6003, r6000, 1000, r5998; +barrier.sync 0; +mad.lo.s32 r6004, r6002, 200, r6003; +st.shared.v2.f32 [r6004], {r1922, r1934}; +st.shared.v2.f32 [r6004+8], {r3517, r3524}; +st.shared.v2.f32 [r6004+16], {r3554, r3561}; +st.shared.v2.f32 [r6004+24], {r3591, r3598}; +st.shared.v2.f32 [r6004+32], {r3628, r3635}; +st.shared.v2.f32 [r6004+40], {r3665, r3672}; +st.shared.v2.f32 [r6004+48], {r3702, r3709}; +st.shared.v2.f32 [r6004+56], {r3739, r3746}; 
+st.shared.v2.f32 [r6004+64], {r3776, r3783}; +st.shared.v2.f32 [r6004+72], {r3813, r3820}; +st.shared.v2.f32 [r6004+80], {r3850, r3857}; +st.shared.v2.f32 [r6004+88], {r3887, r3894}; +st.shared.v2.f32 [r6004+96], {r3924, r3931}; +st.shared.v2.f32 [r6004+104], {r3961, r3968}; +st.shared.v2.f32 [r6004+112], {r3998, r4005}; +st.shared.v2.f32 [r6004+120], {r4035, r4042}; +st.shared.v2.f32 [r6004+128], {r4072, r4079}; +st.shared.v2.f32 [r6004+136], {r4109, r4116}; +st.shared.v2.f32 [r6004+144], {r4146, r4153}; +st.shared.v2.f32 [r6004+152], {r4183, r4190}; +st.shared.v2.f32 [r6004+160], {r4220, r4227}; +st.shared.v2.f32 [r6004+168], {r4257, r4264}; +st.shared.v2.f32 [r6004+176], {r4294, r4301}; +st.shared.v2.f32 [r6004+184], {r4331, r4338}; +st.shared.v2.f32 [r6004+192], {r4368, r4375}; +barrier.sync 0; +mad.lo.s32 r6005, r6002, -192, r6004; +ld.shared.u32 r4408, [r6005]; +ld.shared.u32 r4420, [r6005+4]; +ld.shared.u32 r4728, [r6005+40]; +ld.shared.u32 r4740, [r6005+44]; +ld.shared.u32 r5048, [r6005+80]; +ld.shared.u32 r5060, [r6005+84]; +ld.shared.u32 r5368, [r6005+120]; +ld.shared.u32 r5380, [r6005+124]; +ld.shared.u32 r5688, [r6005+160]; +ld.shared.u32 r5700, [r6005+164]; +ld.shared.u32 r4405, [r6005+200]; +ld.shared.u32 r4417, [r6005+204]; +ld.shared.u32 r4725, [r6005+240]; +ld.shared.u32 r4737, [r6005+244]; +ld.shared.u32 r5045, [r6005+280]; +ld.shared.u32 r5057, [r6005+284]; +ld.shared.u32 r5365, [r6005+320]; +ld.shared.u32 r5377, [r6005+324]; +ld.shared.u32 r5685, [r6005+360]; +ld.shared.u32 r5697, [r6005+364]; +ld.shared.u32 r4411, [r6005+400]; +ld.shared.u32 r4423, [r6005+404]; +ld.shared.u32 r4731, [r6005+440]; +ld.shared.u32 r4743, [r6005+444]; +ld.shared.u32 r5051, [r6005+480]; +ld.shared.u32 r5063, [r6005+484]; +ld.shared.u32 r5371, [r6005+520]; +ld.shared.u32 r5383, [r6005+524]; +ld.shared.u32 r5691, [r6005+560]; +ld.shared.u32 r5703, [r6005+564]; +ld.shared.u32 r4412, [r6005+600]; +ld.shared.u32 r4424, [r6005+604]; +ld.shared.u32 r4732, [r6005+640]; 
+ld.shared.u32 r4744, [r6005+644]; +ld.shared.u32 r5052, [r6005+680]; +ld.shared.u32 r5064, [r6005+684]; +ld.shared.u32 r5372, [r6005+720]; +ld.shared.u32 r5384, [r6005+724]; +ld.shared.u32 r5692, [r6005+760]; +ld.shared.u32 r5704, [r6005+764]; +ld.shared.u32 r4406, [r6005+800]; +ld.shared.u32 r4418, [r6005+804]; +ld.shared.u32 r4726, [r6005+840]; +ld.shared.u32 r4738, [r6005+844]; +ld.shared.u32 r5046, [r6005+880]; +ld.shared.u32 r5058, [r6005+884]; +ld.shared.u32 r5366, [r6005+920]; +ld.shared.u32 r5378, [r6005+924]; +ld.shared.u32 r5686, [r6005+960]; +ld.shared.u32 r5698, [r6005+964]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 %0, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 %1, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ +add.f16x2 r4458, 
r4449, r4455; +} +{ +sub.f16x2 %10, r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ +mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ +add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 %40, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 %20, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 %30, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 %11, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, r4420, r4611; +} +{ +add.f16x2 
r4617, r4423, r4424; +} +{ +mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; +} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 %41, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 %21, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ +sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 %31, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, r4725, r4726; +} +{ +add.f16x2 r4727, 
r4728, r4724; +} +{ +add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 %2, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ +add.f16x2 %3, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 %12, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ +sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 %42, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 %22, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ +mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 
r4886, r4877, r4883; +} +{ +add.f16x2 %32, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} +{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ +mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 %13, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 %43, r4943, r4958; +} +{ +add.f16x2 r4964, r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 %23, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 %33, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5036, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5038, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 %4, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 %5, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 %14, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 %44, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ +add.f16x2 r5155, 
r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ +sub.f16x2 %24, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 %34, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ +mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 %15, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 %45, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} +{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, r5308, r5042; +} +{ +add.f16x2 
r5314, r5305, r5311; +} +{ +add.f16x2 %25, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; +} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ +sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 %35, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 %6, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 %7, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 %16, r5403, r5418; +} +{ +add.f16x2 r5424, r5365, 
r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 %46, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 %26, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 %36, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 %17, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ +mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, 
r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; +} +{ +sub.f16x2 %47, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 %27, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 %37, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ +add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 %8, r5687, r5690; 
+} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 %9, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 %18, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 %48, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 %28, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ +mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, r5837, r5843; +} +{ +add.f16x2 %38, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, 
r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ +sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ +mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 %19, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 %49, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} +{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 %29, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 %39, r5975, r5990; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), 
"=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), 
"r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<904, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<332>; +.reg .b32 r<6058>; +.reg .b64 rd<4>; +mov.u32 r6056, %tid.y; +mov.u32 r6057, %50; +mad.lo.s32 r5998, r6056, 500, r6057; +mov.u32 r5999, %tid.x; +mov.f32 f326, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1, {low, high}; +} +mov.f32 f328, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2, {low, high}; +} +mov.f32 f322, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3, {low, high}; +} +mov.f32 f324, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 
high, f326; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %60, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %60, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %60, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %60, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, %58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %60, %94; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ 
+add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %60, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %60, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %60, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %60, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; 
+cvt.rn.f16.f32 high, f324; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 
r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ +sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; +} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ +add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; 
+} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} +{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ +add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} +{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ 
+sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ +add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} +{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ +add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} 
+{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ +add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ +mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; +} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ +add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ 
+sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %59, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %59, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} +{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, 
r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %59, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %59, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %59, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, %93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, %86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, %77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 
r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ +sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1601, {low, high}; +} +mov.f32 f64, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1602, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1603, {low, high}; +} +mov.f32 f68, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1604, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1605, {low, high}; +} +mov.f32 f72, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1606, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, 
f74; +mov.b32 r1607, {low, high}; +} +mov.f32 f76, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1608, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1611, {low, high}; +} +mov.f32 f84, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1612, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1615, {low, high}; +} +mov.f32 f92, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1616, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1617, {low, high}; +} +mov.f32 f96, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1618, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1623, {low, high}; +} +mov.f32 f108, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1631, {low, high}; +} +mov.f32 f124, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; +} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ 
+sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ +mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ +fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, 
r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1910, {low, high}; +} +{ +neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ 
+sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ +sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ +mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} +{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; +} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; +} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, 
r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ 
+add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; +} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} 
+{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ +mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; 
+cvt.rn.f16.f32 high, f326; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ +sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ +sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ +add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, 
r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ 
+sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; +} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} +{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; +} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ 
+add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ +mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ +sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, 
r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3185, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; 
+} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; +} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; +} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, 
r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ +add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ +sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r5999, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r6000, rd3; 
+mul.lo.s32 r6001, r6000, 5; +sub.s32 r6002, r5999, r6001; +mad.lo.s32 r6003, r6000, 500, r5998; +cvt.rn.f32.u32 f329, r6002; +mul.f32 f330, f329, 0f3D4DE32E; +cos.approx.f32 f217, f330; +sin.approx.f32 f331, f330; +neg.f32 f218, f331; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, 
r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; +} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ +fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3717, {low, high}; +} +{ +mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ +fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; 
+mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3902, {low, high}; +} +{ +mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, 
r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4050, {low, high}; +} +{ +mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ +fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ +neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 
r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ +fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 
r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ +neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +barrier.sync 0; +mad.lo.s32 r6004, r6002, 100, r6003; +st.shared.u32 [r6004], r1922; +st.shared.u32 [r6004+4], r3517; +st.shared.u32 [r6004+8], r3554; +st.shared.u32 [r6004+12], r3591; +st.shared.u32 [r6004+16], r3628; +st.shared.u32 [r6004+20], r3665; +st.shared.u32 [r6004+24], r3702; +st.shared.u32 [r6004+28], r3739; +st.shared.u32 [r6004+32], r3776; +st.shared.u32 [r6004+36], r3813; +st.shared.u32 [r6004+40], r3850; +st.shared.u32 [r6004+44], r3887; +st.shared.u32 [r6004+48], r3924; +st.shared.u32 [r6004+52], r3961; +st.shared.u32 [r6004+56], r3998; +st.shared.u32 [r6004+60], r4035; +st.shared.u32 [r6004+64], r4072; +st.shared.u32 [r6004+68], r4109; +st.shared.u32 [r6004+72], r4146; +st.shared.u32 [r6004+76], r4183; +st.shared.u32 [r6004+80], r4220; +st.shared.u32 [r6004+84], r4257; +st.shared.u32 [r6004+88], r4294; +st.shared.u32 [r6004+92], r4331; +st.shared.u32 [r6004+96], r4368; +barrier.sync 0; +mad.lo.s32 r6005, r6002, -96, r6004; +ld.shared.u32 r4408, [r6005]; +ld.shared.u32 r4728, [r6005+20]; +ld.shared.u32 r5048, [r6005+40]; +ld.shared.u32 r5368, [r6005+60]; +ld.shared.u32 r5688, [r6005+80]; +ld.shared.u32 r4405, [r6005+100]; +ld.shared.u32 r4725, [r6005+120]; 
+ld.shared.u32 r5045, [r6005+140]; +ld.shared.u32 r5365, [r6005+160]; +ld.shared.u32 r5685, [r6005+180]; +ld.shared.u32 r4411, [r6005+200]; +ld.shared.u32 r4731, [r6005+220]; +ld.shared.u32 r5051, [r6005+240]; +ld.shared.u32 r5371, [r6005+260]; +ld.shared.u32 r5691, [r6005+280]; +ld.shared.u32 r4412, [r6005+300]; +ld.shared.u32 r4732, [r6005+320]; +ld.shared.u32 r5052, [r6005+340]; +ld.shared.u32 r5372, [r6005+360]; +ld.shared.u32 r5692, [r6005+380]; +ld.shared.u32 r4406, [r6005+400]; +ld.shared.u32 r4726, [r6005+420]; +ld.shared.u32 r5046, [r6005+440]; +ld.shared.u32 r5366, [r6005+460]; +ld.shared.u32 r5686, [r6005+480]; +barrier.sync 0; +st.shared.u32 [r6004], r1934; +st.shared.u32 [r6004+4], r3524; +st.shared.u32 [r6004+8], r3561; +st.shared.u32 [r6004+12], r3598; +st.shared.u32 [r6004+16], r3635; +st.shared.u32 [r6004+20], r3672; +st.shared.u32 [r6004+24], r3709; +st.shared.u32 [r6004+28], r3746; +st.shared.u32 [r6004+32], r3783; +st.shared.u32 [r6004+36], r3820; +st.shared.u32 [r6004+40], r3857; +st.shared.u32 [r6004+44], r3894; +st.shared.u32 [r6004+48], r3931; +st.shared.u32 [r6004+52], r3968; +st.shared.u32 [r6004+56], r4005; +st.shared.u32 [r6004+60], r4042; +st.shared.u32 [r6004+64], r4079; +st.shared.u32 [r6004+68], r4116; +st.shared.u32 [r6004+72], r4153; +st.shared.u32 [r6004+76], r4190; +st.shared.u32 [r6004+80], r4227; +st.shared.u32 [r6004+84], r4264; +st.shared.u32 [r6004+88], r4301; +st.shared.u32 [r6004+92], r4338; +st.shared.u32 [r6004+96], r4375; +barrier.sync 0; +ld.shared.u32 r4420, [r6005]; +ld.shared.u32 r4740, [r6005+20]; +ld.shared.u32 r5060, [r6005+40]; +ld.shared.u32 r5380, [r6005+60]; +ld.shared.u32 r5700, [r6005+80]; +ld.shared.u32 r4417, [r6005+100]; +ld.shared.u32 r4737, [r6005+120]; +ld.shared.u32 r5057, [r6005+140]; +ld.shared.u32 r5377, [r6005+160]; +ld.shared.u32 r5697, [r6005+180]; +ld.shared.u32 r4423, [r6005+200]; +ld.shared.u32 r4743, [r6005+220]; +ld.shared.u32 r5063, [r6005+240]; +ld.shared.u32 r5383, [r6005+260]; 
+ld.shared.u32 r5703, [r6005+280]; +ld.shared.u32 r4424, [r6005+300]; +ld.shared.u32 r4744, [r6005+320]; +ld.shared.u32 r5064, [r6005+340]; +ld.shared.u32 r5384, [r6005+360]; +ld.shared.u32 r5704, [r6005+380]; +ld.shared.u32 r4418, [r6005+400]; +ld.shared.u32 r4738, [r6005+420]; +ld.shared.u32 r5058, [r6005+440]; +ld.shared.u32 r5378, [r6005+460]; +ld.shared.u32 r5698, [r6005+480]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 %0, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 %1, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ +add.f16x2 r4458, r4449, r4455; +} +{ +sub.f16x2 %10, r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ 
+mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ +add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 %40, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 %20, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 %30, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 %11, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, r4420, r4611; +} +{ +add.f16x2 r4617, r4423, r4424; +} +{ +mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; 
+} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 %41, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 %21, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ +sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 %31, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, r4725, r4726; +} +{ +add.f16x2 r4727, r4728, r4724; +} +{ +add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 %2, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ 
+add.f16x2 %3, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 %12, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ +sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 %42, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 %22, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ +mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 r4886, r4877, r4883; +} +{ +add.f16x2 %32, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} 
+{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ +mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 %13, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 %43, r4943, r4958; +} +{ +add.f16x2 r4964, r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 %23, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 %33, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5038, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 %4, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 %5, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 %14, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 %44, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ +add.f16x2 r5155, r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ 
+sub.f16x2 %24, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 %34, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ +mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 %15, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 %45, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} +{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, r5308, r5042; +} +{ +add.f16x2 r5314, r5305, r5311; +} +{ +add.f16x2 %25, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; 
+} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ +sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 %35, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 %6, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 %7, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 %16, r5403, r5418; +} +{ +add.f16x2 r5424, r5365, r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ 
+sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 %46, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 %26, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 %36, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 %17, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ +mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; 
+} +{ +sub.f16x2 %47, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 %27, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 %37, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ +add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 %8, r5687, r5690; +} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 %9, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 
r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 %18, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 %48, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 %28, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ +mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, r5837, r5843; +} +{ +add.f16x2 %38, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ 
+sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ +mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 %19, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 %49, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} +{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 %29, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 %39, r5975, r5990; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].y)), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<906, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<67>; +.reg .b32 r<1280>; +.reg .b64 rd<6>; +mov.u32 r1263, %tid.x; +mov.f32 f58, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r1, {low, high}; +} +mov.f32 f60, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r2, {low, high}; +} +mov.f32 f54, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r3, {low, high}; +} +mov.f32 f56, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, 
r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; 
+} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r1263, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1264, rd3; +mul.lo.s32 r1265, r1264, 25; +sub.s32 r1266, r1263, r1265; +cvt.rn.f32.u32 f61, r1266; +mul.f32 f62, f61, 0f3D4DE32E; +cos.approx.f32 f13, f62; +sin.approx.f32 f63, f62; +neg.f32 f14, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +mov.u32 r1267, %tid.y; +mov.u32 r1268, %10; +mad.lo.s32 r1269, r1267, 1000, r1268; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ 
+neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; 
+mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +mad.lo.s32 r1270, r1264, 1000, r1269; +barrier.sync 0; +mad.lo.s32 r1271, r1266, 40, r1270; +st.shared.v2.f32 [r1271], {r18, r30}; +st.shared.v2.f32 [r1271+8], {r333, r340}; +st.shared.v2.f32 [r1271+16], {r370, r377}; +st.shared.v2.f32 [r1271+24], {r407, r414}; +st.shared.v2.f32 [r1271+32], {r444, r451}; +barrier.sync 0; +shl.b32 r1272, r1266, 5; +sub.s32 r1273, r1271, r1272; +ld.shared.u32 r484, [r1273]; +ld.shared.u32 r496, [r1273+4]; +ld.shared.u32 r481, [r1273+200]; +ld.shared.u32 r493, [r1273+204]; +ld.shared.u32 r487, [r1273+400]; +ld.shared.u32 r499, [r1273+404]; +ld.shared.u32 r488, [r1273+600]; +ld.shared.u32 r500, [r1273+604]; +ld.shared.u32 r482, [r1273+800]; +ld.shared.u32 r494, [r1273+804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 
r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r1266, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1274, rd5; +cvt.rn.f32.u32 f64, r1274; +mul.f32 f65, f64, 0f3E80ADFD; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +mul.lo.s32 r1275, r1274, 5; +sub.s32 r1276, r1266, r1275; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 
r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +shl.b32 r1277, r1276, 3; +add.s32 r1278, r1270, r1277; +barrier.sync 0; +mad.lo.s32 r1279, r1274, 200, r1278; +st.shared.u32 [r1279], r489; +st.shared.u32 [r1279+4], r501; +st.shared.u32 [r1279+40], r804; +st.shared.u32 [r1279+44], r811; +st.shared.u32 [r1279+80], r841; +st.shared.u32 [r1279+84], r848; +st.shared.u32 [r1279+120], r878; +st.shared.u32 [r1279+124], r885; +st.shared.u32 [r1279+160], r915; +st.shared.u32 [r1279+164], r922; +barrier.sync 0; +ld.shared.u32 r955, [r1273]; +ld.shared.u32 r967, [r1273+4]; +ld.shared.u32 r952, [r1273+200]; +ld.shared.u32 r964, [r1273+204]; +ld.shared.u32 r958, [r1273+400]; +ld.shared.u32 r970, [r1273+404]; +ld.shared.u32 r959, [r1273+600]; +ld.shared.u32 r971, [r1273+604]; +ld.shared.u32 r953, [r1273+800]; +ld.shared.u32 r965, [r1273+804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 %0, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 %1, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 %2, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ +add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 %8, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ +mul.f16x2 r1068, r1065, r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ 
+sub.f16x2 %4, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 %6, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 %3, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 %9, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 %5, r1206, r1221; +} +{ +add.f16x2 r1227, r964, r965; +} +{ +mul.f16x2 r1230, r1227, r945; +} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, 
r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, r1248, r1254; +} +{ +sub.f16x2 %7, r1242, r1257; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<907, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<67>; +.reg .b32 r<1280>; +.reg .b64 rd<6>; +mov.u32 r1263, %tid.y; +mov.u32 r1264, %10; +mad.lo.s32 r1265, r1263, 500, r1264; +mov.u32 r1266, %tid.x; +mov.f32 f58, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r1, {low, high}; +} +mov.f32 f60, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r2, {low, high}; +} +mov.f32 f54, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r3, {low, high}; +} +mov.f32 f56, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} 
+{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ 
+add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r1266, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1267, rd3; +mul.lo.s32 r1268, r1267, 25; +sub.s32 r1269, r1266, r1268; +mad.lo.s32 r1270, r1267, 500, r1265; +cvt.rn.f32.u32 f61, r1269; +mul.f32 f62, f61, 0f3D4DE32E; +cos.approx.f32 f13, f62; +sin.approx.f32 f63, f62; +neg.f32 f14, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 
r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +barrier.sync 0; +mad.lo.s32 r1271, r1269, 20, r1270; +st.shared.u32 [r1271], r18; +st.shared.u32 [r1271+4], r333; +st.shared.u32 [r1271+8], r370; +st.shared.u32 [r1271+12], r407; +st.shared.u32 [r1271+16], r444; +barrier.sync 0; +shl.b32 r1272, r1269, 4; +sub.s32 r1273, r1271, r1272; +ld.shared.u32 r484, [r1273]; +ld.shared.u32 r481, [r1273+100]; +ld.shared.u32 r487, [r1273+200]; +ld.shared.u32 r488, [r1273+300]; +ld.shared.u32 r482, [r1273+400]; +barrier.sync 0; +st.shared.u32 [r1271], r30; +st.shared.u32 [r1271+4], r340; +st.shared.u32 [r1271+8], r377; +st.shared.u32 [r1271+12], r414; +st.shared.u32 [r1271+16], r451; +barrier.sync 0; +ld.shared.u32 r496, [r1273]; +ld.shared.u32 r493, [r1273+100]; +ld.shared.u32 r499, [r1273+200]; +ld.shared.u32 r500, [r1273+300]; +ld.shared.u32 r494, [r1273+400]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r475, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, 
r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r1269, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1274, rd5; 
+mul.lo.s32 r1275, r1274, 5; +sub.s32 r1276, r1269, r1275; +shl.b32 r1277, r1276, 2; +add.s32 r1278, r1270, r1277; +cvt.rn.f32.u32 f64, r1274; +mul.f32 f65, f64, 0f3E80ADFD; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, 
r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +barrier.sync 0; +mad.lo.s32 r1279, r1274, 100, r1278; +st.shared.u32 [r1279], r489; +st.shared.u32 [r1279+20], r804; +st.shared.u32 [r1279+40], r841; +st.shared.u32 [r1279+60], r878; +st.shared.u32 [r1279+80], r915; +barrier.sync 0; +ld.shared.u32 r955, [r1273]; +ld.shared.u32 r952, [r1273+100]; +ld.shared.u32 r958, [r1273+200]; +ld.shared.u32 r959, [r1273+300]; +ld.shared.u32 r953, [r1273+400]; +barrier.sync 0; +st.shared.u32 [r1279], r501; +st.shared.u32 [r1279+20], r811; +st.shared.u32 [r1279+40], r848; +st.shared.u32 [r1279+60], r885; +st.shared.u32 [r1279+80], r922; +barrier.sync 0; +ld.shared.u32 r967, [r1273]; +ld.shared.u32 r964, [r1273+100]; +ld.shared.u32 r970, [r1273+200]; +ld.shared.u32 r971, [r1273+300]; +ld.shared.u32 r965, [r1273+400]; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 %0, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 %1, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 %2, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ +add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 %8, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, 
r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ +mul.f16x2 r1068, r1065, r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ +sub.f16x2 %4, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 %6, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 %3, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 %9, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 %5, r1206, r1221; +} +{ 
+add.f16x2 r1227, r964, r965; +} +{ +mul.f16x2 r1230, r1227, r945; +} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, r1248, r1254; +} +{ +sub.f16x2 %7, r1242, r1257; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..5333883e06564 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp16_inv.hpp.inc @@ -0,0 +1,17294 @@ +#ifndef CUFFTDX_FFT_125_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_125_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1107, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<332>; +.reg .b32 r<6088>; +.reg .b64 rd<4>; +mov.u32 r6086, %tid.y; +mov.u32 r6087, %50; +mad.lo.s32 r6028, r6086, 1000, r6087; +mov.u32 
r6029, %tid.x; +mov.f32 f326, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1, {low, high}; +} +mov.f32 f328, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f322, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5, {low, high}; +} +mov.f32 f324, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %52; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %94, %79; +} +{ +add.f16x2 r26, %51, r23; +} +{ +add.f16x2 r29, %57, %92; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %67, %52; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %94, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %92; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %67, %52; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %94, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %92; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %52; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, 
r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %94, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %92; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %67, %52; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %94, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %92; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %94, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %92; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %52; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %94, %79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %92; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %52; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %94, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %92; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %52; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %94, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %92; +} 
+{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %52; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, %55, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %53; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %55, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %53; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %55, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %53; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 
r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %55, %87; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %53; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %55, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %53; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %53; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %55, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %53; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %55, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %53; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, 
r329; +} +{ +sub.f16x2 r597, %55, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %53; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %55, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, %88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %54, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %54, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ 
+mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, r647; +} +{ +sub.f16x2 r739, %54, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ +add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %54, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ +sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %54, %86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %54, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ +mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %54, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 
r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %54, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ +sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %54, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %91, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %91, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 
r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; +} +{ +add.f16x2 r1043, %91, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %91, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; +} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %91, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ +sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 
r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ +mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ +add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %93; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, %84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %58, %93; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %93; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} +{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ +sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %93; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ +sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %93; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ 
+mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, %93; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %93; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, %70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %93; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; +} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %93; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1611, {low, high}; +} 
+mov.f32 f64, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1612, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1613, {low, high}; +} +mov.f32 f68, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1614, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1615, {low, high}; +} +mov.f32 f72, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1616, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1617, {low, high}; +} +mov.f32 f76, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1618, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1621, {low, high}; +} +mov.f32 f84, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1622, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1625, {low, high}; +} +mov.f32 f92, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1626, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1627, {low, high}; +} +mov.f32 f96, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1628, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1633, {low, high}; +} +mov.f32 f108, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; 
+cvt.rn.f16.f32 high, f108; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1641, {low, high}; +} +mov.f32 f124, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ +mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ +sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} +{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ +mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, 
r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ +add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, 
r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ +mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ +sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; +} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ 
+mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; +} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ +mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, 
r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} +{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 
r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ +sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2560, {low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, 
r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ +add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; +} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 
r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ +add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ +mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ +add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; +} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; +} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, 
r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ +sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ +mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ 
+sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3204, {low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; +} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; +} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 
r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ +add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; +} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ +mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ 
+add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r6029, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r6030, rd3; +mul.lo.s32 r6031, r6030, 5; +sub.s32 r6032, r6029, r6031; +cvt.rn.f32.u32 f329, r6032; +mul.f32 f330, f329, 0f3D4DE32E; +cos.approx.f32 f217, f330; +sin.approx.f32 f331, f330; +neg.f32 f218, f331; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ +mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, 
r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; +} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; 
+mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ +mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ +mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ +fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, 
r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, {high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4107, {low, high}; +} +{ +mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 
r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 
r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ +fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +mad.lo.s32 r6033, r6030, 1000, r6028; +barrier.sync 0; +mad.lo.s32 r6034, r6032, 200, r6033; +st.shared.v2.f32 [r6034], {r1934, r1946}; +st.shared.v2.f32 [r6034+8], {r3535, r3544}; +st.shared.v2.f32 [r6034+16], {r3572, r3581}; +st.shared.v2.f32 [r6034+24], {r3609, r3618}; +st.shared.v2.f32 [r6034+32], {r3646, r3655}; 
+st.shared.v2.f32 [r6034+40], {r3683, r3692}; +st.shared.v2.f32 [r6034+48], {r3720, r3729}; +st.shared.v2.f32 [r6034+56], {r3757, r3766}; +st.shared.v2.f32 [r6034+64], {r3794, r3803}; +st.shared.v2.f32 [r6034+72], {r3831, r3840}; +st.shared.v2.f32 [r6034+80], {r3868, r3877}; +st.shared.v2.f32 [r6034+88], {r3905, r3914}; +st.shared.v2.f32 [r6034+96], {r3942, r3951}; +st.shared.v2.f32 [r6034+104], {r3979, r3988}; +st.shared.v2.f32 [r6034+112], {r4016, r4025}; +st.shared.v2.f32 [r6034+120], {r4053, r4062}; +st.shared.v2.f32 [r6034+128], {r4090, r4099}; +st.shared.v2.f32 [r6034+136], {r4127, r4136}; +st.shared.v2.f32 [r6034+144], {r4164, r4173}; +st.shared.v2.f32 [r6034+152], {r4201, r4210}; +st.shared.v2.f32 [r6034+160], {r4238, r4247}; +st.shared.v2.f32 [r6034+168], {r4275, r4284}; +st.shared.v2.f32 [r6034+176], {r4312, r4321}; +st.shared.v2.f32 [r6034+184], {r4349, r4358}; +st.shared.v2.f32 [r6034+192], {r4386, r4395}; +barrier.sync 0; +mad.lo.s32 r6035, r6032, -192, r6034; +ld.shared.u32 r4430, [r6035]; +ld.shared.u32 r4442, [r6035+4]; +ld.shared.u32 r4752, [r6035+40]; +ld.shared.u32 r4764, [r6035+44]; +ld.shared.u32 r5074, [r6035+80]; +ld.shared.u32 r5086, [r6035+84]; +ld.shared.u32 r5396, [r6035+120]; +ld.shared.u32 r5408, [r6035+124]; +ld.shared.u32 r5718, [r6035+160]; +ld.shared.u32 r5730, [r6035+164]; +ld.shared.u32 r4427, [r6035+200]; +ld.shared.u32 r4439, [r6035+204]; +ld.shared.u32 r4749, [r6035+240]; +ld.shared.u32 r4761, [r6035+244]; +ld.shared.u32 r5071, [r6035+280]; +ld.shared.u32 r5083, [r6035+284]; +ld.shared.u32 r5393, [r6035+320]; +ld.shared.u32 r5405, [r6035+324]; +ld.shared.u32 r5715, [r6035+360]; +ld.shared.u32 r5727, [r6035+364]; +ld.shared.u32 r4433, [r6035+400]; +ld.shared.u32 r4445, [r6035+404]; +ld.shared.u32 r4755, [r6035+440]; +ld.shared.u32 r4767, [r6035+444]; +ld.shared.u32 r5077, [r6035+480]; +ld.shared.u32 r5089, [r6035+484]; +ld.shared.u32 r5399, [r6035+520]; +ld.shared.u32 r5411, [r6035+524]; +ld.shared.u32 r5721, [r6035+560]; 
+ld.shared.u32 r5733, [r6035+564]; +ld.shared.u32 r4434, [r6035+600]; +ld.shared.u32 r4446, [r6035+604]; +ld.shared.u32 r4756, [r6035+640]; +ld.shared.u32 r4768, [r6035+644]; +ld.shared.u32 r5078, [r6035+680]; +ld.shared.u32 r5090, [r6035+684]; +ld.shared.u32 r5400, [r6035+720]; +ld.shared.u32 r5412, [r6035+724]; +ld.shared.u32 r5722, [r6035+760]; +ld.shared.u32 r5734, [r6035+764]; +ld.shared.u32 r4428, [r6035+800]; +ld.shared.u32 r4440, [r6035+804]; +ld.shared.u32 r4750, [r6035+840]; +ld.shared.u32 r4762, [r6035+844]; +ld.shared.u32 r5072, [r6035+880]; +ld.shared.u32 r5084, [r6035+884]; +ld.shared.u32 r5394, [r6035+920]; +ld.shared.u32 r5406, [r6035+924]; +ld.shared.u32 r5716, [r6035+960]; +ld.shared.u32 r5728, [r6035+964]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 %0, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 %1, r4441, r4444; +} +{ +add.f16x2 r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, r4456, r4462; +} +{ 
+sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 %10, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 %40, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 %20, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; +} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 %30, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ +add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ +add.f16x2 r4624, r4615, r4621; 
+} +{ +add.f16x2 %11, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} +{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 %41, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 %21, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, r4723, r4729; +} +{ +sub.f16x2 %31, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 
r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 %2, r4751, r4754; +} +{ +add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 %3, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 %12, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 %42, r4823, r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 %22, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; +} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 
r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 %32, r4895, r4910; +} +{ +add.f16x2 r4916, r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ +add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 %13, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 %43, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 %23, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ +mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ 
+add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 %33, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 %4, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 %5, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 %14, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 %44, 
r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} +{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 %24, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 %34, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ +add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 %15, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 r5304, r5295, r5301; +} +{ +sub.f16x2 %45, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 
r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} +{ +add.f16x2 %25, r5325, r5340; +} +{ +add.f16x2 r5346, r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 %35, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 %6, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 %7, r5407, r5410; +} +{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; 
+} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 %16, r5431, r5446; +} +{ +add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 %46, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 %26, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; +} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 %36, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ +sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, 
r5587; +} +{ +add.f16x2 %17, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 %47, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 %27, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 %37, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5709, {low, high}; +} +{ +neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; 
+mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 %8, r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 %9, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 %18, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 %48, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ +sub.f16x2 %28, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} +{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ 
+add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 r5876, r5867, r5873; +} +{ +add.f16x2 %38, r5861, r5876; +} +{ +add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 %19, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 %49, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 %29, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; +} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; 
+} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 %39, r6005, r6020; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1106, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<332>; +.reg .b32 r<6088>; +.reg .b64 rd<4>; +mov.u32 r6086, %tid.y; +mov.u32 r6087, %50; +mad.lo.s32 r6028, r6086, 500, r6087; +mov.u32 r6029, %tid.x; +mov.f32 f326, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1, {low, high}; +} +mov.f32 f328, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f322, 0fBF4F1BBD; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5, {low, high}; +} +mov.f32 f324, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %52; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %94, %79; +} +{ +add.f16x2 r26, %51, r23; +} +{ +add.f16x2 r29, %57, %92; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %67, %52; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %94, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %92; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %67, %52; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %94, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %92; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %52; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %94, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %92; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %67, %52; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, 
r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %94, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %92; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %94, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %92; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %52; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %94, %79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %92; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %52; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %94, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %92; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %52; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %94, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %92; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %52; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 
high, f326; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, %55, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %53; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %55, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %53; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %55, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %53; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %55, %87; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %53; +} +{ +mul.f16x2 
r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %55, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %53; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %53; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %55, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %53; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %55, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %53; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %55, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %53; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, 
r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %55, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, %88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %54, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %54, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ +mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, r647; +} +{ +sub.f16x2 r739, %54, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ 
+add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %54, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ +sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %54, %86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %54, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ +mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %54, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %54, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ +sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 
r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %54, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %91, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %91, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; 
+} +{ +add.f16x2 r1043, %91, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %91, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; +} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %91, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ +sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ 
+mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ +add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %93; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, %84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} 
+{ +add.f16x2 r1323, %58, %93; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %93; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} +{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ +sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %93; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ +sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %93; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ +mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, 
%93; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %93; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, %70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %93; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; +} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %93; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1611, {low, high}; +} +mov.f32 f64, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1612, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1613, {low, high}; +} +mov.f32 f68, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; 
+cvt.rn.f16.f32 high, f68; +mov.b32 r1614, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1615, {low, high}; +} +mov.f32 f72, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1616, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1617, {low, high}; +} +mov.f32 f76, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1618, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1621, {low, high}; +} +mov.f32 f84, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1622, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1625, {low, high}; +} +mov.f32 f92, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1626, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1627, {low, high}; +} +mov.f32 f96, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1628, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1633, {low, high}; +} +mov.f32 f108, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1641, {low, high}; +} +mov.f32 f124, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ 
+mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ +sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} +{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ +mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, 
r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, 
r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ +add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ 
+mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ +sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; +} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ +mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, 
r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; +} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ +mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} 
+{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ +sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, 
r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2560, {low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ 
+add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; +} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ +add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, 
r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ +mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ 
+add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; +} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; +} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ +sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, 
r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ +mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ +sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3204, 
{low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; +} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; +} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ +add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; 
+} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ +mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ +add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, 
r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r6029, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r6030, rd3; +mul.lo.s32 r6031, r6030, 5; +sub.s32 r6032, r6029, r6031; +mad.lo.s32 r6033, r6030, 500, r6028; +cvt.rn.f32.u32 f329, r6032; +mul.f32 f330, f329, 0f3D4DE32E; +cos.approx.f32 f217, f330; +sin.approx.f32 f331, f330; +neg.f32 f218, f331; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ +mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; 
+} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; 
+mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, 
r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ +mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ +mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ 
+fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, {high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, 
r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4107, {low, high}; +} +{ +mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, 
r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ +fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +barrier.sync 0; +mad.lo.s32 r6034, r6032, 100, r6033; +st.shared.u32 [r6034], r1934; +st.shared.u32 [r6034+4], r3535; +st.shared.u32 [r6034+8], r3572; +st.shared.u32 [r6034+12], r3609; +st.shared.u32 [r6034+16], r3646; +st.shared.u32 [r6034+20], r3683; +st.shared.u32 [r6034+24], r3720; +st.shared.u32 [r6034+28], r3757; +st.shared.u32 [r6034+32], r3794; +st.shared.u32 [r6034+36], r3831; +st.shared.u32 [r6034+40], r3868; +st.shared.u32 [r6034+44], r3905; +st.shared.u32 [r6034+48], r3942; +st.shared.u32 [r6034+52], r3979; +st.shared.u32 [r6034+56], r4016; +st.shared.u32 [r6034+60], r4053; +st.shared.u32 [r6034+64], r4090; 
+st.shared.u32 [r6034+68], r4127; +st.shared.u32 [r6034+72], r4164; +st.shared.u32 [r6034+76], r4201; +st.shared.u32 [r6034+80], r4238; +st.shared.u32 [r6034+84], r4275; +st.shared.u32 [r6034+88], r4312; +st.shared.u32 [r6034+92], r4349; +st.shared.u32 [r6034+96], r4386; +barrier.sync 0; +mad.lo.s32 r6035, r6032, -96, r6034; +ld.shared.u32 r4430, [r6035]; +ld.shared.u32 r4752, [r6035+20]; +ld.shared.u32 r5074, [r6035+40]; +ld.shared.u32 r5396, [r6035+60]; +ld.shared.u32 r5718, [r6035+80]; +ld.shared.u32 r4427, [r6035+100]; +ld.shared.u32 r4749, [r6035+120]; +ld.shared.u32 r5071, [r6035+140]; +ld.shared.u32 r5393, [r6035+160]; +ld.shared.u32 r5715, [r6035+180]; +ld.shared.u32 r4433, [r6035+200]; +ld.shared.u32 r4755, [r6035+220]; +ld.shared.u32 r5077, [r6035+240]; +ld.shared.u32 r5399, [r6035+260]; +ld.shared.u32 r5721, [r6035+280]; +ld.shared.u32 r4434, [r6035+300]; +ld.shared.u32 r4756, [r6035+320]; +ld.shared.u32 r5078, [r6035+340]; +ld.shared.u32 r5400, [r6035+360]; +ld.shared.u32 r5722, [r6035+380]; +ld.shared.u32 r4428, [r6035+400]; +ld.shared.u32 r4750, [r6035+420]; +ld.shared.u32 r5072, [r6035+440]; +ld.shared.u32 r5394, [r6035+460]; +ld.shared.u32 r5716, [r6035+480]; +barrier.sync 0; +st.shared.u32 [r6034], r1946; +st.shared.u32 [r6034+4], r3544; +st.shared.u32 [r6034+8], r3581; +st.shared.u32 [r6034+12], r3618; +st.shared.u32 [r6034+16], r3655; +st.shared.u32 [r6034+20], r3692; +st.shared.u32 [r6034+24], r3729; +st.shared.u32 [r6034+28], r3766; +st.shared.u32 [r6034+32], r3803; +st.shared.u32 [r6034+36], r3840; +st.shared.u32 [r6034+40], r3877; +st.shared.u32 [r6034+44], r3914; +st.shared.u32 [r6034+48], r3951; +st.shared.u32 [r6034+52], r3988; +st.shared.u32 [r6034+56], r4025; +st.shared.u32 [r6034+60], r4062; +st.shared.u32 [r6034+64], r4099; +st.shared.u32 [r6034+68], r4136; +st.shared.u32 [r6034+72], r4173; +st.shared.u32 [r6034+76], r4210; +st.shared.u32 [r6034+80], r4247; +st.shared.u32 [r6034+84], r4284; +st.shared.u32 [r6034+88], r4321; 
+st.shared.u32 [r6034+92], r4358; +st.shared.u32 [r6034+96], r4395; +barrier.sync 0; +ld.shared.u32 r4442, [r6035]; +ld.shared.u32 r4764, [r6035+20]; +ld.shared.u32 r5086, [r6035+40]; +ld.shared.u32 r5408, [r6035+60]; +ld.shared.u32 r5730, [r6035+80]; +ld.shared.u32 r4439, [r6035+100]; +ld.shared.u32 r4761, [r6035+120]; +ld.shared.u32 r5083, [r6035+140]; +ld.shared.u32 r5405, [r6035+160]; +ld.shared.u32 r5727, [r6035+180]; +ld.shared.u32 r4445, [r6035+200]; +ld.shared.u32 r4767, [r6035+220]; +ld.shared.u32 r5089, [r6035+240]; +ld.shared.u32 r5411, [r6035+260]; +ld.shared.u32 r5733, [r6035+280]; +ld.shared.u32 r4446, [r6035+300]; +ld.shared.u32 r4768, [r6035+320]; +ld.shared.u32 r5090, [r6035+340]; +ld.shared.u32 r5412, [r6035+360]; +ld.shared.u32 r5734, [r6035+380]; +ld.shared.u32 r4440, [r6035+400]; +ld.shared.u32 r4762, [r6035+420]; +ld.shared.u32 r5084, [r6035+440]; +ld.shared.u32 r5406, [r6035+460]; +ld.shared.u32 r5728, [r6035+480]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 %0, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 %1, r4441, r4444; +} +{ +add.f16x2 
r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, r4456, r4462; +} +{ +sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 %10, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 %40, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 %20, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; +} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 %30, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ 
+add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ +add.f16x2 r4624, r4615, r4621; +} +{ +add.f16x2 %11, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} +{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 %41, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 %21, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, r4723, r4729; +} +{ +sub.f16x2 %31, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r4742, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 %2, r4751, r4754; +} +{ +add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 %3, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 %12, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 %42, r4823, r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 
%22, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; +} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 %32, r4895, r4910; +} +{ +add.f16x2 r4916, r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ +add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 %13, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 %43, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 %23, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ 
+mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ +add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 %33, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 %4, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 %5, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 %14, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, 
r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 %44, r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} +{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 %24, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 %34, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ +add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 %15, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 
r5304, r5295, r5301; +} +{ +sub.f16x2 %45, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} +{ +add.f16x2 %25, r5325, r5340; +} +{ +add.f16x2 r5346, r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 %35, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 %6, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 %7, r5407, r5410; +} 
+{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; +} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 %16, r5431, r5446; +} +{ +add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 %46, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 %26, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; +} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 %36, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, 
r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ +sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, r5587; +} +{ +add.f16x2 %17, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 %47, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 %27, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 %37, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r5708, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r5709, {low, high}; +} +{ +neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 %8, r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 %9, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 %18, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 %48, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ 
+sub.f16x2 %28, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} +{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ +add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 r5876, r5867, r5873; +} +{ +add.f16x2 %38, r5861, r5876; +} +{ +add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 %19, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 %49, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 %29, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; 
+} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; +} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 %39, r6005, r6020; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1108, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<67>; +.reg .b32 r<1286>; +.reg .b64 rd<6>; +mov.u32 r1269, %tid.x; +mov.f32 f58, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r1, {low, high}; +} +mov.f32 f60, 0fBF737871; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f54, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r5, {low, high}; +} +mov.f32 f56, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; 
+} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 
r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r1269, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1270, rd3; +mul.lo.s32 r1271, r1270, 25; +sub.s32 r1272, r1269, r1271; +cvt.rn.f32.u32 f61, r1272; +mul.f32 f62, f61, 0f3D4DE32E; +cos.approx.f32 f13, f62; +sin.approx.f32 f63, f62; +neg.f32 f14, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +mov.u32 r1273, %tid.y; +mov.u32 r1274, %10; +mad.lo.s32 r1275, r1273, 1000, r1274; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 
high, f46; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +mad.lo.s32 r1276, r1270, 1000, r1275; +barrier.sync 0; +mad.lo.s32 r1277, r1272, 40, r1276; +st.shared.v2.f32 [r1277], {r20, r32}; +st.shared.v2.f32 [r1277+8], {r333, r342}; +st.shared.v2.f32 [r1277+16], {r370, r379}; +st.shared.v2.f32 [r1277+24], {r407, r416}; +st.shared.v2.f32 [r1277+32], {r444, r453}; +barrier.sync 0; +shl.b32 r1278, r1272, 5; +sub.s32 r1279, r1277, r1278; +ld.shared.u32 r488, [r1279]; +ld.shared.u32 r500, [r1279+4]; +ld.shared.u32 r485, [r1279+200]; +ld.shared.u32 r497, [r1279+204]; +ld.shared.u32 r491, [r1279+400]; +ld.shared.u32 r503, 
[r1279+404]; +ld.shared.u32 r492, [r1279+600]; +ld.shared.u32 r504, [r1279+604]; +ld.shared.u32 r486, [r1279+800]; +ld.shared.u32 r498, [r1279+804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 
r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 
r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r1272, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1280, rd5; +cvt.rn.f32.u32 f64, r1280; +mul.f32 f65, f64, 0f3E80ADFD; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +mul.lo.s32 r1281, r1280, 5; +sub.s32 r1282, r1272, r1281; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ +fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} +{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, 
r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +shl.b32 r1283, r1282, 3; +add.s32 r1284, r1276, r1283; +barrier.sync 0; +mad.lo.s32 r1285, r1280, 200, r1284; +st.shared.u32 [r1285], r493; +st.shared.u32 [r1285+4], r505; +st.shared.u32 [r1285+40], r806; +st.shared.u32 [r1285+44], r815; +st.shared.u32 
[r1285+80], r843; +st.shared.u32 [r1285+84], r852; +st.shared.u32 [r1285+120], r880; +st.shared.u32 [r1285+124], r889; +st.shared.u32 [r1285+160], r917; +st.shared.u32 [r1285+164], r926; +barrier.sync 0; +ld.shared.u32 r961, [r1279]; +ld.shared.u32 r973, [r1279+4]; +ld.shared.u32 r958, [r1279+200]; +ld.shared.u32 r970, [r1279+204]; +ld.shared.u32 r964, [r1279+400]; +ld.shared.u32 r976, [r1279+404]; +ld.shared.u32 r965, [r1279+600]; +ld.shared.u32 r977, [r1279+604]; +ld.shared.u32 r959, [r1279+800]; +ld.shared.u32 r971, [r1279+804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 %0, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 %1, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 %2, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ 
+mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 %8, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 %4, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ +add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 %6, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 %3, r1140, r1155; +} +{ +add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; +} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, 
r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 %9, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 %5, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 %7, r1248, r1263; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1109, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<67>; +.reg .b32 r<1286>; +.reg .b64 rd<6>; +mov.u32 r1269, %tid.y; +mov.u32 r1270, %10; +mad.lo.s32 r1271, r1269, 
500, r1270; +mov.u32 r1272, %tid.x; +mov.f32 f58, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r1, {low, high}; +} +mov.f32 f60, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f54, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r5, {low, high}; +} +mov.f32 f56, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 
r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, 
%18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r1272, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1273, rd3; +mul.lo.s32 r1274, r1273, 25; +sub.s32 r1275, r1272, r1274; +mad.lo.s32 r1276, r1273, 500, r1271; +cvt.rn.f32.u32 f61, r1275; +mul.f32 f62, f61, 0f3D4DE32E; +cos.approx.f32 f13, f62; +sin.approx.f32 f63, f62; +neg.f32 f14, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 
r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +barrier.sync 0; +mad.lo.s32 r1277, r1275, 20, r1276; +st.shared.u32 [r1277], r20; +st.shared.u32 [r1277+4], r333; +st.shared.u32 [r1277+8], r370; +st.shared.u32 [r1277+12], r407; +st.shared.u32 [r1277+16], r444; +barrier.sync 0; +shl.b32 r1278, r1275, 4; +sub.s32 r1279, r1277, r1278; +ld.shared.u32 r488, [r1279]; +ld.shared.u32 r485, [r1279+100]; +ld.shared.u32 r491, [r1279+200]; +ld.shared.u32 
r492, [r1279+300]; +ld.shared.u32 r486, [r1279+400]; +barrier.sync 0; +st.shared.u32 [r1277], r32; +st.shared.u32 [r1277+4], r342; +st.shared.u32 [r1277+8], r379; +st.shared.u32 [r1277+12], r416; +st.shared.u32 [r1277+16], r453; +barrier.sync 0; +ld.shared.u32 r500, [r1279]; +ld.shared.u32 r497, [r1279+100]; +ld.shared.u32 r503, [r1279+200]; +ld.shared.u32 r504, [r1279+300]; +ld.shared.u32 r498, [r1279+400]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} 
+{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ 
+add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r1275, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1280, rd5; +mul.lo.s32 r1281, r1280, 5; +sub.s32 r1282, r1275, r1281; +shl.b32 r1283, r1282, 2; +add.s32 r1284, r1276, r1283; +cvt.rn.f32.u32 f64, r1280; +mul.f32 f65, f64, 0f3E80ADFD; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ 
+fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} +{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 
r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +barrier.sync 0; +mad.lo.s32 r1285, r1280, 100, r1284; +st.shared.u32 [r1285], r493; +st.shared.u32 [r1285+20], r806; +st.shared.u32 [r1285+40], r843; +st.shared.u32 [r1285+60], r880; +st.shared.u32 [r1285+80], r917; +barrier.sync 0; +ld.shared.u32 r961, [r1279]; +ld.shared.u32 r958, [r1279+100]; +ld.shared.u32 r964, [r1279+200]; +ld.shared.u32 r965, [r1279+300]; +ld.shared.u32 r959, [r1279+400]; +barrier.sync 0; +st.shared.u32 [r1285], r505; +st.shared.u32 [r1285+20], r815; +st.shared.u32 [r1285+40], r852; +st.shared.u32 [r1285+60], r889; +st.shared.u32 [r1285+80], r926; +barrier.sync 0; +ld.shared.u32 r973, [r1279]; +ld.shared.u32 r970, [r1279+100]; +ld.shared.u32 r976, [r1279+200]; +ld.shared.u32 r977, [r1279+300]; +ld.shared.u32 r971, [r1279+400]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f60; +cvt.rn.f16.f32 high, f60; +mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 %0, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 %1, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 
r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 %2, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ +mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 %8, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 %4, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ +add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 %6, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 %3, r1140, r1155; +} +{ 
+add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; +} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 %9, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 %5, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 %7, r1248, r1263; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..9bef6e4afeb76 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_fwd.hpp.inc @@ -0,0 +1,2728 @@ +#ifndef CUFFTDX_FFT_125_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_125_FP32_FWD_PTX_HPP + + + +/* cufftdx_private_function<159, float, 1>: generated inline-PTX kernel that transforms rmem[0..24] in place (asm outputs %0-%49 and inputs %52-%101 are rmem[i].x and rmem[i].y; inputs %102-%117 pass sixteen of the rmem[i].y values a second time). %50 is smem, used as the 32-bit base address for st.shared and ld.shared; %51 is lut_sp_25_125, from which one float2 is loaded at index tid.x % 5 and then raised to successive complex powers that multiply the first-pass results. Sequence: butterfly arithmetic with 0f3E9E377A, 0f3F4F1BBD, 0f3F737871, 0f3F167918 (about 0.309017, 0.809017, 0.951057, 0.587785 = cos(2pi/5), cos(pi/5), sin(2pi/5), sin(pi/5)); barrier.sync 0; 25 float2 stores at smem + 1000 * tid.y + 1000 * (tid.x / 5) + 200 * (tid.x % 5) + 8 * j; barrier.sync 0; 25 float2 loads from the same 1000-byte region at 8 * (tid.x % 5) + 40 * k; a last butterfly pass that writes rmem. Each run of 5 consecutive tid.x values therefore shares one 1000-byte (125 float2) region. NOTE(review): presumably the 125-point fp32 forward FFT named by this file - confirm against the cuFFTDx dispatch table. Do not hand-edit the asm string. */ template<> __forceinline__ __device__ void cufftdx_private_function<159, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1427>; +.reg .b32 r<14>; +.reg .b64 rd<10>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 1000, r13; +add.f32 f101, %62, %92; +add.f32 f103, %72, %82; +add.f32 f1426, %52, f101; +add.f32 f104, f103, f1426; +add.f32 f105, %102, %104; +add.f32 f107, %103, %83; +add.f32 f1422, %53, f105; +add.f32 f108, f107, f1422; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f1421, f101, 0f3E9E377A, %52; +sub.f32 f111, f1421, f110; +sub.f32 f112, %102, %104; +sub.f32 f114, %103, %83; +mul.f32 f1419, f112, 0f3F737871; +mul.f32 f1420, f114, 0fBF167918; +sub.f32 f116, f1420, f1419; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %52, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f1417, f105, 0f3E9E377A, %53; +mul.f32 f1418, f107, 0f3F4F1BBD; +sub.f32 f129, f1417, f1418; +sub.f32 f130, %62, %92; +sub.f32 f132, %72, %82; +mul.f32 f1415, f130, 0f3F737871; +mul.f32 f1416, f132, 0fBF167918; +sub.f32 f134, f1416, f1415; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %53, f137; +fma.rn.f32 
f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %64, %94; +add.f32 f147, %74, %84; +add.f32 f1414, %54, f145; +add.f32 f148, f147, f1414; +add.f32 f149, %65, %95; +add.f32 f151, %107, %105; +add.f32 f1410, %106, f149; +add.f32 f152, f151, f1410; +fma.rn.f32 f1408, f145, 0f3E9E377A, %54; +mul.f32 f1409, f147, 0f3F4F1BBD; +sub.f32 f155, f1408, f1409; +sub.f32 f156, %65, %95; +sub.f32 f158, %107, %105; +mul.f32 f1406, f156, 0f3F737871; +mul.f32 f1407, f158, 0fBF167918; +sub.f32 f160, f1407, f1406; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %54, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +mul.f32 f172, f151, 0f3F4F1BBD; +fma.rn.f32 f1405, f149, 0f3E9E377A, %106; +sub.f32 f173, f1405, f172; +sub.f32 f174, %64, %94; +sub.f32 f176, %74, %84; +mul.f32 f177, f176, 0fBF167918; +mul.f32 f1404, f174, 0f3F737871; +sub.f32 f178, f177, f1404; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %106, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %66, %96; +add.f32 f191, %76, %86; +add.f32 f1403, %56, f189; +add.f32 f192, f191, f1403; +add.f32 f193, %110, %109; +add.f32 f195, %77, %111; +add.f32 f1398, %108, f193; +add.f32 f196, f195, f1398; +mul.f32 f198, f191, 0f3F4F1BBD; +fma.rn.f32 f1397, f189, 0f3E9E377A, %56; +sub.f32 f199, f1397, f198; +sub.f32 f200, %110, %109; +sub.f32 f202, %77, %111; +mul.f32 f203, f202, 0fBF167918; +mul.f32 f1396, f200, 0f3F737871; +sub.f32 f204, f203, f1396; +sub.f32 f205, f199, f204; +add.f32 
f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %56, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f1394, f193, 0f3E9E377A, %108; +mul.f32 f1395, f195, 0f3F4F1BBD; +sub.f32 f217, f1394, f1395; +sub.f32 f218, %66, %96; +sub.f32 f220, %76, %86; +mul.f32 f1392, f218, 0f3F737871; +mul.f32 f1393, f220, 0fBF167918; +sub.f32 f222, f1393, f1392; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %108, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %68, %98; +add.f32 f235, %78, %88; +add.f32 f1391, %58, f233; +add.f32 f236, f235, f1391; +add.f32 f237, %113, %112; +add.f32 f239, %114, %89; +add.f32 f1387, %59, f237; +add.f32 f240, f239, f1387; +fma.rn.f32 f1385, f233, 0f3E9E377A, %58; +mul.f32 f1386, f235, 0f3F4F1BBD; +sub.f32 f243, f1385, f1386; +sub.f32 f244, %113, %112; +sub.f32 f246, %114, %89; +mul.f32 f1383, f244, 0f3F737871; +mul.f32 f1384, f246, 0fBF167918; +sub.f32 f248, f1384, f1383; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %58, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +mul.f32 f260, f239, 0f3F4F1BBD; +fma.rn.f32 f1382, f237, 0f3E9E377A, %59; +sub.f32 f261, f1382, f260; +sub.f32 f262, %68, %98; +sub.f32 f264, %78, %88; +mul.f32 f1380, f262, 0f3F737871; +mul.f32 f1381, f264, 0fBF167918; +sub.f32 f266, f1381, f1380; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %59, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; 
+mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %70, %100; +add.f32 f279, %80, %90; +add.f32 f1379, %60, f277; +add.f32 f280, f279, f1379; +add.f32 f281, %71, %101; +add.f32 f283, %117, %115; +add.f32 f1375, %116, f281; +add.f32 f284, f283, f1375; +mul.f32 f286, f279, 0f3F4F1BBD; +fma.rn.f32 f1374, f277, 0f3E9E377A, %60; +sub.f32 f287, f1374, f286; +sub.f32 f288, %71, %101; +sub.f32 f290, %117, %115; +mul.f32 f1372, f288, 0f3F737871; +mul.f32 f1373, f290, 0fBF167918; +sub.f32 f292, f1373, f1372; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %60, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +mul.f32 f304, f283, 0f3F4F1BBD; +fma.rn.f32 f1371, f281, 0f3E9E377A, %116; +sub.f32 f305, f1371, f304; +sub.f32 f306, %70, %100; +sub.f32 f308, %80, %90; +mul.f32 f1369, f306, 0f3F737871; +mul.f32 f1370, f308, 0fBF167918; +sub.f32 f310, f1370, f1369; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %116, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mul.f32 f322, f179, 0fBE7EA890; +mul.f32 f1368, f161, 0f3F77F511; +sub.f32 f323, f1368, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f327, f223, 0fBEF6A86B; +mul.f32 f1367, f205, 0f3F6055A2; +sub.f32 f328, f1367, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f332, f267, 0fBF2F3E7B; +mul.f32 f1366, f249, 0f3F3A9DB0; +sub.f32 f333, f1366, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f337, 
f311, 0fBF5825E0; +mul.f32 f1365, f293, 0f3F092BF2; +sub.f32 f338, f1365, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f342, f187, 0fBEF6A86B; +mul.f32 f1364, f169, 0f3F6055A2; +sub.f32 f343, f1364, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f1362, f213, 0f3F092BF2; +mul.f32 f1363, f231, 0fBF5825E0; +sub.f32 f348, f1362, f1363; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f1360, f257, 0f3D809851; +mul.f32 f1361, f275, 0fBF7F7EAE; +sub.f32 f353, f1360, f1361; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f1358, f301, 0fBED9FFBE; +mul.f32 f1359, f319, 0fBF67A2BF; +sub.f32 f358, f1358, f1359; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f1356, f170, 0f3F3A9DB0; +mul.f32 f1357, f188, 0fBF2F3E7B; +sub.f32 f363, f1356, f1357; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f367, f232, 0fBF7F7EAE; +mul.f32 f1355, f214, 0f3D809851; +sub.f32 f368, f1355, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f372, f276, 0fBF45405B; +mul.f32 f1354, f258, 0fBF232E38; +sub.f32 f373, f1354, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f377, f320, 0fBE00575B; +mul.f32 f1353, f302, 0fBF7DFB3B; +sub.f32 f378, f1353, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f382, f180, 0fBF5825E0; +mul.f32 f1352, f162, 0f3F092BF2; +sub.f32 f383, f1352, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f387, f224, 0fBF67A2BF; +mul.f32 f1351, f206, 0fBED9FFBE; +sub.f32 f388, f1351, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f1349, f250, 0fBF7DFB3B; +mul.f32 f1350, f268, 0fBE00575B; +sub.f32 f393, f1349, f1350; +mul.f32 f394, f268, 0fBF7DFB3B; 
+fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f1347, f294, 0fBF232E38; +mul.f32 f1348, f312, 0f3F45405B; +sub.f32 f398, f1347, f1348; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f403, f192, f236; +mul.f32 f408, f403, 0f3F4F1BBD; +fma.rn.f32 f1346, f401, 0f3E9E377A, f104; +sub.f32 f409, f1346, f408; +add.f32 f1345, f152, f284; +sub.f32 f410, f152, f284; +add.f32 f1344, f196, f240; +sub.f32 f412, f196, f240; +mul.f32 f413, f412, 0fBF167918; +mul.f32 f1343, f410, 0f3F737871; +sub.f32 f414, f413, f1343; +sub.f32 f415, f409, f414; +add.f32 f416, f414, f409; +add.f32 f1342, f104, f401; +mul.f32 f417, f401, 0f3F4F1BBD; +sub.f32 f418, f104, f417; +fma.rn.f32 f419, f403, 0f3E9E377A, f418; +mul.f32 f420, f410, 0f3F167918; +mul.f32 f421, f412, 0f3F737871; +sub.f32 f422, f421, f420; +sub.f32 f423, f419, f422; +add.f32 f424, f422, f419; +fma.rn.f32 f1340, f1345, 0f3E9E377A, f108; +mul.f32 f1341, f1344, 0f3F4F1BBD; +sub.f32 f427, f1340, f1341; +sub.f32 f428, f148, f280; +sub.f32 f430, f192, f236; +mul.f32 f1338, f428, 0f3F737871; +mul.f32 f1339, f430, 0fBF167918; +sub.f32 f432, f1339, f1338; +add.f32 f433, f432, f427; +sub.f32 f434, f427, f432; +add.f32 f1337, f108, f1345; +mul.f32 f435, f1345, 0f3F4F1BBD; +sub.f32 f436, f108, f435; +fma.rn.f32 f437, f1344, 0f3E9E377A, f436; +mul.f32 f438, f428, 0f3F167918; +mul.f32 f439, f430, 0f3F737871; +sub.f32 f440, f439, f438; +add.f32 f441, f440, f437; +sub.f32 f442, f437, f440; +add.f32 f443, f323, f338; +add.f32 f445, f328, f333; +add.f32 f1336, f117, f443; +add.f32 f446, f445, f1336; +add.f32 f447, f325, f340; +add.f32 f449, f330, f335; +add.f32 f1335, f135, f447; +add.f32 f450, f449, f1335; +fma.rn.f32 f1333, f443, 0f3E9E377A, f117; +mul.f32 f1334, f445, 0f3F4F1BBD; +sub.f32 f453, f1333, f1334; +sub.f32 f454, f325, f340; +sub.f32 f456, f330, f335; +mul.f32 f1331, f454, 0f3F737871; +mul.f32 f1332, f456, 0fBF167918; +sub.f32 f458, f1332, f1331; +sub.f32 f459, f453, 
f458; +add.f32 f460, f458, f453; +mul.f32 f461, f443, 0f3F4F1BBD; +sub.f32 f462, f117, f461; +fma.rn.f32 f463, f445, 0f3E9E377A, f462; +mul.f32 f464, f454, 0f3F167918; +mul.f32 f465, f456, 0f3F737871; +sub.f32 f466, f465, f464; +sub.f32 f467, f463, f466; +add.f32 f468, f466, f463; +mul.f32 f470, f449, 0f3F4F1BBD; +fma.rn.f32 f1330, f447, 0f3E9E377A, f135; +sub.f32 f471, f1330, f470; +sub.f32 f472, f323, f338; +sub.f32 f474, f328, f333; +mul.f32 f1328, f472, 0f3F737871; +mul.f32 f1329, f474, 0fBF167918; +sub.f32 f476, f1329, f1328; +add.f32 f477, f476, f471; +sub.f32 f478, f471, f476; +mul.f32 f479, f447, 0f3F4F1BBD; +sub.f32 f480, f135, f479; +fma.rn.f32 f481, f449, 0f3E9E377A, f480; +mul.f32 f482, f472, 0f3F167918; +mul.f32 f483, f474, 0f3F737871; +sub.f32 f484, f483, f482; +add.f32 f485, f484, f481; +sub.f32 f486, f481, f484; +add.f32 f487, f343, f358; +add.f32 f489, f348, f353; +add.f32 f1327, f125, f487; +add.f32 f490, f489, f1327; +add.f32 f491, f345, f360; +add.f32 f493, f350, f355; +add.f32 f1326, f143, f491; +add.f32 f494, f493, f1326; +mul.f32 f496, f489, 0f3F4F1BBD; +fma.rn.f32 f1325, f487, 0f3E9E377A, f125; +sub.f32 f497, f1325, f496; +sub.f32 f498, f345, f360; +sub.f32 f500, f350, f355; +mul.f32 f1323, f498, 0f3F737871; +mul.f32 f1324, f500, 0fBF167918; +sub.f32 f502, f1324, f1323; +sub.f32 f503, f497, f502; +add.f32 f504, f502, f497; +mul.f32 f505, f487, 0f3F4F1BBD; +sub.f32 f506, f125, f505; +fma.rn.f32 f507, f489, 0f3E9E377A, f506; +mul.f32 f508, f498, 0f3F167918; +mul.f32 f509, f500, 0f3F737871; +sub.f32 f510, f509, f508; +sub.f32 f511, f507, f510; +add.f32 f512, f510, f507; +mul.f32 f514, f493, 0f3F4F1BBD; +fma.rn.f32 f1322, f491, 0f3E9E377A, f143; +sub.f32 f515, f1322, f514; +sub.f32 f516, f343, f358; +sub.f32 f518, f348, f353; +mul.f32 f1320, f516, 0f3F737871; +mul.f32 f1321, f518, 0fBF167918; +sub.f32 f520, f1321, f1320; +add.f32 f521, f520, f515; +sub.f32 f522, f515, f520; +mul.f32 f523, f491, 0f3F4F1BBD; +sub.f32 f524, f143, f523; +fma.rn.f32 
f525, f493, 0f3E9E377A, f524; +mul.f32 f526, f516, 0f3F167918; +mul.f32 f527, f518, 0f3F737871; +sub.f32 f528, f527, f526; +add.f32 f529, f528, f525; +sub.f32 f530, f525, f528; +add.f32 f531, f363, f378; +add.f32 f533, f368, f373; +add.f32 f1319, f126, f531; +add.f32 f534, f533, f1319; +add.f32 f535, f365, f380; +add.f32 f537, f370, f375; +add.f32 f1318, f144, f535; +add.f32 f538, f537, f1318; +mul.f32 f540, f533, 0f3F4F1BBD; +fma.rn.f32 f1317, f531, 0f3E9E377A, f126; +sub.f32 f541, f1317, f540; +sub.f32 f542, f365, f380; +sub.f32 f544, f370, f375; +mul.f32 f1315, f542, 0f3F737871; +mul.f32 f1316, f544, 0fBF167918; +sub.f32 f546, f1316, f1315; +sub.f32 f547, f541, f546; +add.f32 f548, f546, f541; +mul.f32 f549, f531, 0f3F4F1BBD; +sub.f32 f550, f126, f549; +fma.rn.f32 f551, f533, 0f3E9E377A, f550; +mul.f32 f552, f542, 0f3F167918; +mul.f32 f553, f544, 0f3F737871; +sub.f32 f554, f553, f552; +sub.f32 f555, f551, f554; +add.f32 f556, f554, f551; +fma.rn.f32 f1313, f535, 0f3E9E377A, f144; +mul.f32 f1314, f537, 0f3F4F1BBD; +sub.f32 f559, f1313, f1314; +sub.f32 f560, f363, f378; +sub.f32 f562, f368, f373; +mul.f32 f1311, f560, 0f3F737871; +mul.f32 f1312, f562, 0fBF167918; +sub.f32 f564, f1312, f1311; +add.f32 f565, f564, f559; +sub.f32 f566, f559, f564; +mul.f32 f567, f535, 0f3F4F1BBD; +sub.f32 f568, f144, f567; +fma.rn.f32 f569, f537, 0f3E9E377A, f568; +mul.f32 f570, f560, 0f3F167918; +mul.f32 f571, f562, 0f3F737871; +sub.f32 f572, f571, f570; +add.f32 f573, f572, f569; +sub.f32 f574, f569, f572; +add.f32 f575, f383, f398; +add.f32 f577, f388, f393; +add.f32 f1310, f118, f575; +add.f32 f578, f577, f1310; +add.f32 f579, f385, f400; +add.f32 f581, f390, f395; +add.f32 f1309, f136, f579; +add.f32 f582, f581, f1309; +fma.rn.f32 f1307, f575, 0f3E9E377A, f118; +mul.f32 f1308, f577, 0f3F4F1BBD; +sub.f32 f585, f1307, f1308; +sub.f32 f586, f385, f400; +sub.f32 f588, f390, f395; +mul.f32 f1305, f586, 0f3F737871; +mul.f32 f1306, f588, 0fBF167918; +sub.f32 f590, f1306, f1305; 
+sub.f32 f591, f585, f590; +add.f32 f592, f590, f585; +mul.f32 f593, f575, 0f3F4F1BBD; +sub.f32 f594, f118, f593; +fma.rn.f32 f595, f577, 0f3E9E377A, f594; +mul.f32 f596, f586, 0f3F167918; +mul.f32 f597, f588, 0f3F737871; +sub.f32 f598, f597, f596; +sub.f32 f599, f595, f598; +add.f32 f600, f598, f595; +mul.f32 f602, f581, 0f3F4F1BBD; +fma.rn.f32 f1304, f579, 0f3E9E377A, f136; +sub.f32 f603, f1304, f602; +sub.f32 f604, f383, f398; +sub.f32 f606, f388, f393; +mul.f32 f1302, f604, 0f3F737871; +mul.f32 f1303, f606, 0fBF167918; +sub.f32 f608, f1303, f1302; +add.f32 f609, f608, f603; +sub.f32 f610, f603, f608; +mul.f32 f611, f579, 0f3F4F1BBD; +sub.f32 f612, f136, f611; +fma.rn.f32 f613, f581, 0f3E9E377A, f612; +mul.f32 f614, f604, 0f3F167918; +mul.f32 f615, f606, 0f3F737871; +sub.f32 f616, f615, f614; +add.f32 f617, f616, f613; +sub.f32 f618, f613, f616; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f619, f620}, [rd6]; +mul.f32 f624, f620, f450; +mul.f32 f625, f619, f450; +mul.f32 f627, f620, f620; +mul.f32 f1301, f619, f619; +sub.f32 f628, f1301, f627; +mul.f32 f629, f620, f619; +fma.rn.f32 f630, f620, f619, f629; +mul.f32 f632, f630, f494; +mul.f32 f633, f628, f494; +mul.f32 f1299, f619, f628; +mul.f32 f1300, f620, f630; +sub.f32 f636, f1299, f1300; +mul.f32 f1298, f628, f490; +mul.f32 f637, f619, f630; +fma.rn.f32 f638, f620, f628, f637; +mul.f32 f640, f638, f538; +mul.f32 f641, f636, f538; +mul.f32 f643, f620, f638; +mul.f32 f1297, f619, f636; +sub.f32 f644, f1297, f643; +mul.f32 f1296, f636, f534; +mul.f32 f645, f619, f638; +fma.rn.f32 f646, f620, f636, f645; +mul.f32 f648, f646, f582; +mul.f32 f649, f644, f582; +mul.f32 f1294, f619, f644; +mul.f32 f1295, f620, f646; +sub.f32 f652, f1294, f1295; +mul.f32 f1293, f644, f578; +mul.f32 f653, f619, f646; 
+fma.rn.f32 f654, f620, f644, f653; +mul.f32 f656, f654, f433; +mul.f32 f657, f652, f433; +mul.f32 f659, f620, f654; +mul.f32 f1292, f619, f652; +sub.f32 f660, f1292, f659; +mul.f32 f1291, f652, f415; +mul.f32 f661, f619, f654; +fma.rn.f32 f662, f620, f652, f661; +mul.f32 f664, f662, f477; +mul.f32 f665, f660, f477; +mul.f32 f667, f620, f662; +mul.f32 f1290, f619, f660; +sub.f32 f668, f1290, f667; +mul.f32 f1289, f660, f459; +mul.f32 f669, f619, f662; +fma.rn.f32 f670, f620, f660, f669; +mul.f32 f672, f670, f521; +mul.f32 f673, f668, f521; +mul.f32 f1287, f619, f668; +mul.f32 f1288, f620, f670; +sub.f32 f676, f1287, f1288; +mul.f32 f1286, f668, f503; +mul.f32 f677, f619, f670; +fma.rn.f32 f678, f620, f668, f677; +mul.f32 f680, f678, f565; +mul.f32 f681, f676, f565; +mul.f32 f683, f620, f678; +mul.f32 f1285, f619, f676; +sub.f32 f684, f1285, f683; +mul.f32 f1284, f676, f547; +mul.f32 f685, f619, f678; +fma.rn.f32 f686, f620, f676, f685; +mul.f32 f688, f686, f609; +mul.f32 f689, f684, f609; +mul.f32 f691, f620, f686; +mul.f32 f1283, f619, f684; +sub.f32 f692, f1283, f691; +mul.f32 f1282, f684, f591; +mul.f32 f693, f619, f686; +fma.rn.f32 f694, f620, f684, f693; +mul.f32 f696, f694, f441; +mul.f32 f697, f692, f441; +mul.f32 f1280, f619, f692; +mul.f32 f1281, f620, f694; +sub.f32 f700, f1280, f1281; +mul.f32 f1279, f692, f423; +mul.f32 f701, f619, f694; +fma.rn.f32 f702, f620, f692, f701; +mul.f32 f704, f702, f485; +mul.f32 f705, f700, f485; +mul.f32 f707, f620, f702; +mul.f32 f1278, f619, f700; +sub.f32 f708, f1278, f707; +mul.f32 f1277, f700, f467; +mul.f32 f709, f619, f702; +fma.rn.f32 f710, f620, f700, f709; +mul.f32 f712, f710, f529; +mul.f32 f713, f708, f529; +mul.f32 f1275, f619, f708; +mul.f32 f1276, f620, f710; +sub.f32 f716, f1275, f1276; +mul.f32 f1274, f708, f511; +mul.f32 f717, f619, f710; +fma.rn.f32 f718, f620, f708, f717; +mul.f32 f720, f718, f573; +mul.f32 f721, f716, f573; +mul.f32 f723, f620, f718; +mul.f32 f1273, f619, f716; +sub.f32 f724, f1273, 
f723; +mul.f32 f1272, f716, f555; +mul.f32 f725, f619, f718; +fma.rn.f32 f726, f620, f716, f725; +mul.f32 f728, f726, f617; +mul.f32 f729, f724, f617; +mul.f32 f731, f620, f726; +mul.f32 f1271, f619, f724; +sub.f32 f732, f1271, f731; +mul.f32 f1270, f724, f599; +mul.f32 f733, f619, f726; +fma.rn.f32 f734, f620, f724, f733; +mul.f32 f736, f734, f442; +mul.f32 f737, f732, f442; +mul.f32 f1268, f619, f732; +mul.f32 f1269, f620, f734; +sub.f32 f740, f1268, f1269; +mul.f32 f1267, f732, f424; +mul.f32 f741, f619, f734; +fma.rn.f32 f742, f620, f732, f741; +mul.f32 f744, f742, f486; +mul.f32 f745, f740, f486; +mul.f32 f747, f620, f742; +mul.f32 f1266, f619, f740; +sub.f32 f748, f1266, f747; +mul.f32 f1265, f740, f468; +mul.f32 f749, f619, f742; +fma.rn.f32 f750, f620, f740, f749; +mul.f32 f752, f750, f530; +mul.f32 f753, f748, f530; +mul.f32 f755, f620, f750; +mul.f32 f1264, f619, f748; +sub.f32 f756, f1264, f755; +mul.f32 f1263, f748, f512; +mul.f32 f757, f619, f750; +fma.rn.f32 f758, f620, f748, f757; +mul.f32 f760, f758, f574; +mul.f32 f761, f756, f574; +mul.f32 f1261, f619, f756; +mul.f32 f1262, f620, f758; +sub.f32 f764, f1261, f1262; +mul.f32 f1260, f756, f556; +mul.f32 f765, f619, f758; +fma.rn.f32 f766, f620, f756, f765; +mul.f32 f768, f766, f618; +mul.f32 f769, f764, f618; +mul.f32 f771, f620, f766; +mul.f32 f1259, f619, f764; +sub.f32 f772, f1259, f771; +mul.f32 f1258, f764, f600; +mul.f32 f773, f619, f766; +fma.rn.f32 f774, f620, f764, f773; +mul.f32 f776, f774, f434; +mul.f32 f777, f772, f434; +mul.f32 f1256, f619, f772; +mul.f32 f1257, f620, f774; +sub.f32 f780, f1256, f1257; +mul.f32 f1255, f772, f416; +mul.f32 f781, f619, f774; +fma.rn.f32 f782, f620, f772, f781; +mul.f32 f784, f782, f478; +mul.f32 f785, f780, f478; +mul.f32 f787, f620, f782; +mul.f32 f1254, f619, f780; +sub.f32 f788, f1254, f787; +mul.f32 f1253, f780, f460; +mul.f32 f789, f619, f782; +fma.rn.f32 f790, f620, f780, f789; +mul.f32 f792, f790, f522; +mul.f32 f793, f788, f522; +mul.f32 f795, 
f620, f790; +mul.f32 f1252, f619, f788; +sub.f32 f796, f1252, f795; +mul.f32 f1251, f788, f504; +mul.f32 f797, f619, f790; +fma.rn.f32 f798, f620, f788, f797; +mul.f32 f800, f798, f566; +mul.f32 f801, f796, f566; +mul.f32 f1249, f619, f796; +mul.f32 f1250, f620, f798; +sub.f32 f804, f1249, f1250; +mul.f32 f1248, f619, f446; +mul.f32 f805, f619, f798; +mul.f32 f1247, f796, f548; +fma.rn.f32 f806, f620, f796, f805; +mul.f32 f807, f804, f592; +mul.f32 f808, f806, f610; +mul.f32 f809, f804, f610; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +add.f32 f810, f1344, f1337; +add.f32 f811, f403, f1342; +st.shared.v2.f32 [r9], {f811, f810}; +fma.rn.f32 f812, f620, f446, f625; +sub.f32 f813, f1248, f624; +st.shared.v2.f32 [r9+8], {f813, f812}; +fma.rn.f32 f814, f630, f490, f633; +sub.f32 f815, f1298, f632; +st.shared.v2.f32 [r9+16], {f815, f814}; +fma.rn.f32 f816, f638, f534, f641; +sub.f32 f817, f1296, f640; +st.shared.v2.f32 [r9+24], {f817, f816}; +fma.rn.f32 f818, f646, f578, f649; +sub.f32 f819, f1293, f648; +st.shared.v2.f32 [r9+32], {f819, f818}; +sub.f32 f820, f1291, f656; +fma.rn.f32 f821, f654, f415, f657; +st.shared.v2.f32 [r9+40], {f820, f821}; +fma.rn.f32 f822, f662, f459, f665; +sub.f32 f823, f1289, f664; +st.shared.v2.f32 [r9+48], {f823, f822}; +sub.f32 f824, f1286, f672; +fma.rn.f32 f825, f670, f503, f673; +st.shared.v2.f32 [r9+56], {f824, f825}; +fma.rn.f32 f826, f678, f547, f681; +sub.f32 f827, f1284, f680; +st.shared.v2.f32 [r9+64], {f827, f826}; +fma.rn.f32 f828, f686, f591, f689; +sub.f32 f829, f1282, f688; +st.shared.v2.f32 [r9+72], {f829, f828}; +fma.rn.f32 f830, f694, f423, f697; +sub.f32 f831, f1279, f696; +st.shared.v2.f32 [r9+80], {f831, f830}; +fma.rn.f32 f832, f702, f467, f705; +sub.f32 f833, f1277, f704; +st.shared.v2.f32 [r9+88], {f833, f832}; +fma.rn.f32 f834, f710, f511, f713; +sub.f32 f835, f1274, f712; +st.shared.v2.f32 [r9+96], {f835, f834}; +fma.rn.f32 f836, f718, f555, f721; +sub.f32 f837, f1272, f720; +st.shared.v2.f32 [r9+104], {f837, 
f836}; +fma.rn.f32 f838, f726, f599, f729; +sub.f32 f839, f1270, f728; +st.shared.v2.f32 [r9+112], {f839, f838}; +fma.rn.f32 f840, f734, f424, f737; +sub.f32 f841, f1267, f736; +st.shared.v2.f32 [r9+120], {f841, f840}; +fma.rn.f32 f842, f742, f468, f745; +sub.f32 f843, f1265, f744; +st.shared.v2.f32 [r9+128], {f843, f842}; +fma.rn.f32 f844, f750, f512, f753; +sub.f32 f845, f1263, f752; +st.shared.v2.f32 [r9+136], {f845, f844}; +fma.rn.f32 f846, f758, f556, f761; +sub.f32 f847, f1260, f760; +st.shared.v2.f32 [r9+144], {f847, f846}; +fma.rn.f32 f848, f766, f600, f769; +sub.f32 f849, f1258, f768; +st.shared.v2.f32 [r9+152], {f849, f848}; +fma.rn.f32 f850, f774, f416, f777; +sub.f32 f851, f1255, f776; +st.shared.v2.f32 [r9+160], {f851, f850}; +fma.rn.f32 f852, f782, f460, f785; +sub.f32 f853, f1253, f784; +st.shared.v2.f32 [r9+168], {f853, f852}; +fma.rn.f32 f854, f790, f504, f793; +sub.f32 f855, f1251, f792; +st.shared.v2.f32 [r9+176], {f855, f854}; +fma.rn.f32 f856, f798, f548, f801; +sub.f32 f857, f1247, f800; +st.shared.v2.f32 [r9+184], {f857, f856}; +fma.rn.f32 f858, f806, f592, f809; +sub.f32 f859, f807, f808; +st.shared.v2.f32 [r9+192], {f859, f858}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.v2.f32 {f860, f861}, [r10]; +ld.shared.v2.f32 {f864, f865}, [r10+40]; +ld.shared.v2.f32 {f868, f869}, [r10+80]; +ld.shared.v2.f32 {f872, f873}, [r10+120]; +ld.shared.v2.f32 {f876, f877}, [r10+160]; +ld.shared.v2.f32 {f880, f881}, [r10+200]; +ld.shared.v2.f32 {f884, f885}, [r10+240]; +ld.shared.v2.f32 {f888, f889}, [r10+280]; +ld.shared.v2.f32 {f892, f893}, [r10+320]; +ld.shared.v2.f32 {f896, f897}, [r10+360]; +ld.shared.v2.f32 {f900, f901}, [r10+400]; +ld.shared.v2.f32 {f904, f905}, [r10+440]; +ld.shared.v2.f32 {f908, f909}, [r10+480]; +ld.shared.v2.f32 {f912, f913}, [r10+520]; +ld.shared.v2.f32 {f916, f917}, [r10+560]; +ld.shared.v2.f32 {f920, f921}, [r10+600]; +ld.shared.v2.f32 {f924, f925}, [r10+640]; +ld.shared.v2.f32 {f928, f929}, [r10+680]; 
+ld.shared.v2.f32 {f932, f933}, [r10+720]; +ld.shared.v2.f32 {f936, f937}, [r10+760]; +ld.shared.v2.f32 {f940, f941}, [r10+800]; +ld.shared.v2.f32 {f944, f945}, [r10+840]; +ld.shared.v2.f32 {f948, f949}, [r10+880]; +ld.shared.v2.f32 {f952, f953}, [r10+920]; +ld.shared.v2.f32 {f956, f957}, [r10+960]; +add.f32 f960, f880, f940; +add.f32 f962, f900, f920; +fma.rn.f32 f1245, f960, 0f3E9E377A, f860; +mul.f32 f1246, f962, 0f3F4F1BBD; +sub.f32 f968, f1245, f1246; +add.f32 f1244, f881, f941; +sub.f32 f969, f881, f941; +add.f32 f1243, f901, f921; +sub.f32 f971, f901, f921; +mul.f32 f1241, f969, 0f3F737871; +mul.f32 f1242, f971, 0fBF167918; +sub.f32 f973, f1242, f1241; +add.f32 f1240, f860, f960; +mul.f32 f974, f960, 0f3F4F1BBD; +sub.f32 f975, f860, f974; +fma.rn.f32 f976, f962, 0f3E9E377A, f975; +mul.f32 f977, f969, 0f3F167918; +mul.f32 f978, f971, 0f3F737871; +sub.f32 f979, f978, f977; +fma.rn.f32 f1238, f1244, 0f3E9E377A, f861; +mul.f32 f1239, f1243, 0f3F4F1BBD; +sub.f32 f982, f1238, f1239; +sub.f32 f983, f880, f940; +sub.f32 f985, f900, f920; +mul.f32 f1236, f983, 0f3F737871; +mul.f32 f1237, f985, 0fBF167918; +sub.f32 f987, f1237, f1236; +add.f32 f1235, f861, f1244; +mul.f32 f988, f1244, 0f3F4F1BBD; +sub.f32 f989, f861, f988; +fma.rn.f32 f990, f1243, 0f3E9E377A, f989; +mul.f32 f991, f983, 0f3F167918; +mul.f32 f992, f985, 0f3F737871; +sub.f32 f993, f992, f991; +add.f32 f994, f884, f944; +add.f32 f996, f904, f924; +mul.f32 f1001, f996, 0f3F4F1BBD; +fma.rn.f32 f1234, f994, 0f3E9E377A, f864; +sub.f32 f1002, f1234, f1001; +add.f32 f1233, f885, f945; +sub.f32 f1003, f885, f945; +add.f32 f1232, f905, f925; +sub.f32 f1005, f905, f925; +mul.f32 f1230, f1003, 0f3F737871; +mul.f32 f1231, f1005, 0fBF167918; +sub.f32 f1007, f1231, f1230; +add.f32 f1229, f864, f994; +mul.f32 f1008, f994, 0f3F4F1BBD; +sub.f32 f1009, f864, f1008; +fma.rn.f32 f1010, f996, 0f3E9E377A, f1009; +mul.f32 f1011, f1003, 0f3F167918; +mul.f32 f1012, f1005, 0f3F737871; +sub.f32 f1013, f1012, f1011; +mul.f32 f1015, 
f1232, 0f3F4F1BBD; +fma.rn.f32 f1228, f1233, 0f3E9E377A, f865; +sub.f32 f1016, f1228, f1015; +sub.f32 f1017, f884, f944; +sub.f32 f1019, f904, f924; +mul.f32 f1226, f1017, 0f3F737871; +mul.f32 f1227, f1019, 0fBF167918; +sub.f32 f1021, f1227, f1226; +add.f32 f1225, f865, f1233; +mul.f32 f1022, f1233, 0f3F4F1BBD; +sub.f32 f1023, f865, f1022; +fma.rn.f32 f1024, f1232, 0f3E9E377A, f1023; +mul.f32 f1025, f1017, 0f3F167918; +mul.f32 f1026, f1019, 0f3F737871; +sub.f32 f1027, f1026, f1025; +add.f32 f1028, f888, f948; +add.f32 f1030, f908, f928; +mul.f32 f1035, f1030, 0f3F4F1BBD; +fma.rn.f32 f1224, f1028, 0f3E9E377A, f868; +sub.f32 f1036, f1224, f1035; +add.f32 f1223, f889, f949; +sub.f32 f1037, f889, f949; +add.f32 f1222, f909, f929; +sub.f32 f1039, f909, f929; +mul.f32 f1220, f1037, 0f3F737871; +mul.f32 f1221, f1039, 0fBF167918; +sub.f32 f1041, f1221, f1220; +add.f32 f1219, f868, f1028; +mul.f32 f1042, f1028, 0f3F4F1BBD; +sub.f32 f1043, f868, f1042; +fma.rn.f32 f1044, f1030, 0f3E9E377A, f1043; +mul.f32 f1045, f1037, 0f3F167918; +mul.f32 f1046, f1039, 0f3F737871; +sub.f32 f1047, f1046, f1045; +mul.f32 f1049, f1222, 0f3F4F1BBD; +fma.rn.f32 f1218, f1223, 0f3E9E377A, f869; +sub.f32 f1050, f1218, f1049; +sub.f32 f1051, f888, f948; +sub.f32 f1053, f908, f928; +mul.f32 f1054, f1053, 0fBF167918; +mul.f32 f1217, f1051, 0f3F737871; +sub.f32 f1055, f1054, f1217; +add.f32 f1216, f869, f1223; +mul.f32 f1056, f1223, 0f3F4F1BBD; +sub.f32 f1057, f869, f1056; +fma.rn.f32 f1058, f1222, 0f3E9E377A, f1057; +mul.f32 f1059, f1051, 0f3F167918; +mul.f32 f1060, f1053, 0f3F737871; +sub.f32 f1061, f1060, f1059; +add.f32 f1062, f892, f952; +add.f32 f1064, f912, f932; +fma.rn.f32 f1214, f1062, 0f3E9E377A, f872; +mul.f32 f1215, f1064, 0f3F4F1BBD; +sub.f32 f1070, f1214, f1215; +add.f32 f1213, f893, f953; +sub.f32 f1071, f893, f953; +add.f32 f1212, f913, f933; +sub.f32 f1073, f913, f933; +mul.f32 f1210, f1071, 0f3F737871; +mul.f32 f1211, f1073, 0fBF167918; +sub.f32 f1075, f1211, f1210; +add.f32 f1209, 
f872, f1062; +mul.f32 f1076, f1062, 0f3F4F1BBD; +sub.f32 f1077, f872, f1076; +fma.rn.f32 f1078, f1064, 0f3E9E377A, f1077; +mul.f32 f1079, f1071, 0f3F167918; +mul.f32 f1080, f1073, 0f3F737871; +sub.f32 f1081, f1080, f1079; +fma.rn.f32 f1207, f1213, 0f3E9E377A, f873; +mul.f32 f1208, f1212, 0f3F4F1BBD; +sub.f32 f1084, f1207, f1208; +sub.f32 f1085, f892, f952; +sub.f32 f1087, f912, f932; +mul.f32 f1205, f1085, 0f3F737871; +mul.f32 f1206, f1087, 0fBF167918; +sub.f32 f1089, f1206, f1205; +add.f32 f1204, f873, f1213; +mul.f32 f1090, f1213, 0f3F4F1BBD; +sub.f32 f1091, f873, f1090; +fma.rn.f32 f1092, f1212, 0f3E9E377A, f1091; +mul.f32 f1093, f1085, 0f3F167918; +mul.f32 f1094, f1087, 0f3F737871; +sub.f32 f1095, f1094, f1093; +add.f32 f1096, f896, f956; +add.f32 f1098, f916, f936; +mul.f32 f1103, f1098, 0f3F4F1BBD; +fma.rn.f32 f1203, f1096, 0f3E9E377A, f876; +sub.f32 f1104, f1203, f1103; +add.f32 f1202, f897, f957; +sub.f32 f1105, f897, f957; +add.f32 f1201, f917, f937; +sub.f32 f1107, f917, f937; +mul.f32 f1199, f1105, 0f3F737871; +mul.f32 f1200, f1107, 0fBF167918; +sub.f32 f1109, f1200, f1199; +add.f32 f1198, f876, f1096; +mul.f32 f1110, f1096, 0f3F4F1BBD; +sub.f32 f1111, f876, f1110; +fma.rn.f32 f1112, f1098, 0f3E9E377A, f1111; +mul.f32 f1113, f1105, 0f3F167918; +mul.f32 f1114, f1107, 0f3F737871; +sub.f32 f1115, f1114, f1113; +mul.f32 f1117, f1201, 0f3F4F1BBD; +fma.rn.f32 f1197, f1202, 0f3E9E377A, f877; +sub.f32 f1118, f1197, f1117; +sub.f32 f1119, f896, f956; +sub.f32 f1121, f916, f936; +mul.f32 f1195, f1119, 0f3F737871; +mul.f32 f1196, f1121, 0fBF167918; +sub.f32 f1123, f1196, f1195; +add.f32 f1194, f877, f1202; +mul.f32 f1124, f1202, 0f3F4F1BBD; +sub.f32 f1125, f877, f1124; +fma.rn.f32 f1126, f1201, 0f3E9E377A, f1125; +mul.f32 f1127, f1119, 0f3F167918; +mul.f32 f1128, f1121, 0f3F737871; +sub.f32 f1129, f1128, f1127; +add.f32 %1, f1243, f1235; +add.f32 %0, f962, f1240; +add.f32 %3, f1232, f1225; +add.f32 %2, f996, f1229; +add.f32 %5, f1222, f1216; +add.f32 %4, f1030, 
f1219; +add.f32 %7, f1212, f1204; +add.f32 %6, f1064, f1209; +add.f32 %9, f1201, f1194; +add.f32 %8, f1098, f1198; +sub.f32 %10, f968, f973; +add.f32 %11, f987, f982; +sub.f32 %12, f1002, f1007; +add.f32 %13, f1021, f1016; +add.f32 %15, f1055, f1050; +sub.f32 %14, f1036, f1041; +add.f32 %17, f1089, f1084; +sub.f32 %16, f1070, f1075; +add.f32 %19, f1123, f1118; +sub.f32 %18, f1104, f1109; +sub.f32 %20, f976, f979; +add.f32 %21, f993, f990; +sub.f32 %22, f1010, f1013; +add.f32 %23, f1027, f1024; +sub.f32 %24, f1044, f1047; +add.f32 %25, f1061, f1058; +sub.f32 %26, f1078, f1081; +add.f32 %27, f1095, f1092; +add.f32 %29, f1129, f1126; +sub.f32 %28, f1112, f1115; +sub.f32 %31, f990, f993; +add.f32 %30, f979, f976; +sub.f32 %33, f1024, f1027; +add.f32 %32, f1013, f1010; +sub.f32 %35, f1058, f1061; +add.f32 %34, f1047, f1044; +sub.f32 %37, f1092, f1095; +add.f32 %36, f1081, f1078; +sub.f32 %39, f1126, f1129; +add.f32 %38, f1115, f1112; +sub.f32 %41, f982, f987; +add.f32 %40, f973, f968; +sub.f32 %43, f1016, f1021; +add.f32 %42, f1007, f1002; +sub.f32 %45, f1050, f1055; +add.f32 %44, f1041, f1036; +sub.f32 %47, f1084, f1089; +add.f32 %46, f1075, f1070; +sub.f32 %49, f1118, f1123; +add.f32 %48, f1109, f1104; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), 
"=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<158, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1130>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 500, r2; +add.f32 f101, %65, %105; +add.f32 f102, %52, f101; +add.f32 f103, %78, %92; +add.f32 f104, f103, f102; +add.f32 f105, %67, %107; +add.f32 f106, %53, f105; +add.f32 f107, %80, %93; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %52; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %67, %107; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %80, %93; +mul.f32 f115, f114, 0fBF167918; +sub.f32 f116, f115, f113; +sub.f32 f117, f111, 
f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %52, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f127, f105, 0f3E9E377A, %53; +mul.f32 f128, f107, 0f3F4F1BBD; +sub.f32 f129, f127, f128; +sub.f32 f130, %65, %105; +mul.f32 f131, f130, 0f3F737871; +sub.f32 f132, %78, %92; +mul.f32 f133, f132, 0fBF167918; +sub.f32 f134, f133, f131; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %53, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %68, %108; +add.f32 f146, %54, f145; +add.f32 f147, %81, %94; +add.f32 f148, f147, f146; +add.f32 f149, %69, %109; +add.f32 f150, %56, f149; +add.f32 f151, %83, %96; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %54; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %69, %109; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %83, %96; +mul.f32 f159, f158, 0fBF167918; +sub.f32 f160, f159, f157; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %54, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +fma.rn.f32 f171, f149, 0f3E9E377A, %56; +mul.f32 f172, f151, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %68, %108; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %81, %94; +mul.f32 f177, f176, 0fBF167918; +sub.f32 f178, f177, f175; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %56, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, 
f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %70, %110; +add.f32 f190, %57, f189; +add.f32 f191, %84, %97; +add.f32 f192, f191, f190; +add.f32 f193, %72, %112; +add.f32 f194, %59, f193; +add.f32 f195, %85, %99; +add.f32 f196, f195, f194; +fma.rn.f32 f197, f189, 0f3E9E377A, %57; +mul.f32 f198, f191, 0f3F4F1BBD; +sub.f32 f199, f197, f198; +sub.f32 f200, %72, %112; +mul.f32 f201, f200, 0f3F737871; +sub.f32 f202, %85, %99; +mul.f32 f203, f202, 0fBF167918; +sub.f32 f204, f203, f201; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %57, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f215, f193, 0f3E9E377A, %59; +mul.f32 f216, f195, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, %70, %110; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, %84, %97; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %59, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %73, %113; +add.f32 f234, %60, f233; +add.f32 f235, %86, %100; +add.f32 f236, f235, f234; +add.f32 f237, %75, %115; +add.f32 f238, %61, f237; +add.f32 f239, %88, %101; +add.f32 f240, f239, f238; +fma.rn.f32 f241, f233, 0f3E9E377A, %60; +mul.f32 f242, f235, 0f3F4F1BBD; +sub.f32 f243, f241, f242; +sub.f32 f244, %75, %115; +mul.f32 f245, f244, 0f3F737871; +sub.f32 f246, %88, %101; +mul.f32 f247, f246, 0fBF167918; +sub.f32 f248, f247, f245; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %60, 
f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +fma.rn.f32 f259, f237, 0f3E9E377A, %61; +mul.f32 f260, f239, 0f3F4F1BBD; +sub.f32 f261, f259, f260; +sub.f32 f262, %73, %113; +mul.f32 f263, f262, 0f3F737871; +sub.f32 f264, %86, %100; +mul.f32 f265, f264, 0fBF167918; +sub.f32 f266, f265, f263; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %61, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %76, %116; +add.f32 f278, %62, f277; +add.f32 f279, %89, %102; +add.f32 f280, f279, f278; +add.f32 f281, %77, %117; +add.f32 f282, %64, f281; +add.f32 f283, %91, %104; +add.f32 f284, f283, f282; +fma.rn.f32 f285, f277, 0f3E9E377A, %62; +mul.f32 f286, f279, 0f3F4F1BBD; +sub.f32 f287, f285, f286; +sub.f32 f288, %77, %117; +mul.f32 f289, f288, 0f3F737871; +sub.f32 f290, %91, %104; +mul.f32 f291, f290, 0fBF167918; +sub.f32 f292, f291, f289; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %62, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +fma.rn.f32 f303, f281, 0f3E9E377A, %64; +mul.f32 f304, f283, 0f3F4F1BBD; +sub.f32 f305, f303, f304; +sub.f32 f306, %76, %116; +mul.f32 f307, f306, 0f3F737871; +sub.f32 f308, %89, %102; +mul.f32 f309, f308, 0fBF167918; +sub.f32 f310, f309, f307; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %64, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; 
+add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mov.u32 r4, %tid.x; +mul.f32 f321, f161, 0f3F77F511; +mul.f32 f322, f179, 0fBE7EA890; +sub.f32 f323, f321, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f326, f205, 0f3F6055A2; +mul.f32 f327, f223, 0fBEF6A86B; +sub.f32 f328, f326, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f331, f249, 0f3F3A9DB0; +mul.f32 f332, f267, 0fBF2F3E7B; +sub.f32 f333, f331, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f336, f293, 0f3F092BF2; +mul.f32 f337, f311, 0fBF5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f341, f169, 0f3F6055A2; +mul.f32 f342, f187, 0fBEF6A86B; +sub.f32 f343, f341, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f346, f213, 0f3F092BF2; +mul.f32 f347, f231, 0fBF5825E0; +sub.f32 f348, f346, f347; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f351, f257, 0f3D809851; +mul.f32 f352, f275, 0fBF7F7EAE; +sub.f32 f353, f351, f352; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f356, f301, 0fBED9FFBE; +mul.f32 f357, f319, 0fBF67A2BF; +sub.f32 f358, f356, f357; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f361, f170, 0f3F3A9DB0; +mul.f32 f362, f188, 0fBF2F3E7B; +sub.f32 f363, f361, f362; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f366, f214, 0f3D809851; +mul.f32 f367, f232, 0fBF7F7EAE; +sub.f32 f368, f366, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f371, f258, 0fBF232E38; +mul.f32 f372, f276, 0fBF45405B; +sub.f32 f373, f371, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f376, f302, 0fBF7DFB3B; +mul.f32 f377, f320, 0fBE00575B; +sub.f32 
f378, f376, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f381, f162, 0f3F092BF2; +mul.f32 f382, f180, 0fBF5825E0; +sub.f32 f383, f381, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f386, f206, 0fBED9FFBE; +mul.f32 f387, f224, 0fBF67A2BF; +sub.f32 f388, f386, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f391, f250, 0fBF7DFB3B; +mul.f32 f392, f268, 0fBE00575B; +sub.f32 f393, f391, f392; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f396, f294, 0fBF232E38; +mul.f32 f397, f312, 0f3F45405B; +sub.f32 f398, f396, f397; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f402, f104, f401; +add.f32 f403, f192, f236; +add.f32 f404, f403, f402; +add.f32 f405, f152, f284; +add.f32 f406, f108, f405; +add.f32 f407, f196, f240; +add.f32 f408, f407, f406; +fma.rn.f32 f409, f401, 0f3E9E377A, f104; +mul.f32 f410, f403, 0f3F4F1BBD; +sub.f32 f411, f409, f410; +sub.f32 f412, f152, f284; +mul.f32 f413, f412, 0f3F737871; +sub.f32 f414, f196, f240; +mul.f32 f415, f414, 0fBF167918; +sub.f32 f416, f415, f413; +sub.f32 f417, f411, f416; +add.f32 f418, f416, f411; +mul.f32 f419, f401, 0f3F4F1BBD; +sub.f32 f420, f104, f419; +fma.rn.f32 f421, f403, 0f3E9E377A, f420; +mul.f32 f422, f412, 0f3F167918; +mul.f32 f423, f414, 0f3F737871; +sub.f32 f424, f423, f422; +sub.f32 f425, f421, f424; +add.f32 f426, f424, f421; +fma.rn.f32 f427, f405, 0f3E9E377A, f108; +mul.f32 f428, f407, 0f3F4F1BBD; +sub.f32 f429, f427, f428; +sub.f32 f430, f148, f280; +mul.f32 f431, f430, 0f3F737871; +sub.f32 f432, f192, f236; +mul.f32 f433, f432, 0fBF167918; +sub.f32 f434, f433, f431; +add.f32 f435, f434, f429; +sub.f32 f436, f429, f434; +mul.f32 f437, f405, 0f3F4F1BBD; +sub.f32 f438, f108, f437; +fma.rn.f32 f439, f407, 0f3E9E377A, f438; +mul.f32 f440, f430, 0f3F167918; +mul.f32 f441, f432, 
0f3F737871; +sub.f32 f442, f441, f440; +add.f32 f443, f442, f439; +sub.f32 f444, f439, f442; +add.f32 f445, f323, f338; +add.f32 f446, f117, f445; +add.f32 f447, f328, f333; +add.f32 f448, f447, f446; +add.f32 f449, f325, f340; +add.f32 f450, f135, f449; +add.f32 f451, f330, f335; +add.f32 f452, f451, f450; +fma.rn.f32 f453, f445, 0f3E9E377A, f117; +mul.f32 f454, f447, 0f3F4F1BBD; +sub.f32 f455, f453, f454; +sub.f32 f456, f325, f340; +mul.f32 f457, f456, 0f3F737871; +sub.f32 f458, f330, f335; +mul.f32 f459, f458, 0fBF167918; +sub.f32 f460, f459, f457; +sub.f32 f461, f455, f460; +add.f32 f462, f460, f455; +mul.f32 f463, f445, 0f3F4F1BBD; +sub.f32 f464, f117, f463; +fma.rn.f32 f465, f447, 0f3E9E377A, f464; +mul.f32 f466, f456, 0f3F167918; +mul.f32 f467, f458, 0f3F737871; +sub.f32 f468, f467, f466; +sub.f32 f469, f465, f468; +add.f32 f470, f468, f465; +fma.rn.f32 f471, f449, 0f3E9E377A, f135; +mul.f32 f472, f451, 0f3F4F1BBD; +sub.f32 f473, f471, f472; +sub.f32 f474, f323, f338; +mul.f32 f475, f474, 0f3F737871; +sub.f32 f476, f328, f333; +mul.f32 f477, f476, 0fBF167918; +sub.f32 f478, f477, f475; +add.f32 f479, f478, f473; +sub.f32 f480, f473, f478; +mul.f32 f481, f449, 0f3F4F1BBD; +sub.f32 f482, f135, f481; +fma.rn.f32 f483, f451, 0f3E9E377A, f482; +mul.f32 f484, f474, 0f3F167918; +mul.f32 f485, f476, 0f3F737871; +sub.f32 f486, f485, f484; +add.f32 f487, f486, f483; +sub.f32 f488, f483, f486; +add.f32 f489, f343, f358; +add.f32 f490, f125, f489; +add.f32 f491, f348, f353; +add.f32 f492, f491, f490; +add.f32 f493, f345, f360; +add.f32 f494, f143, f493; +add.f32 f495, f350, f355; +add.f32 f496, f495, f494; +fma.rn.f32 f497, f489, 0f3E9E377A, f125; +mul.f32 f498, f491, 0f3F4F1BBD; +sub.f32 f499, f497, f498; +sub.f32 f500, f345, f360; +mul.f32 f501, f500, 0f3F737871; +sub.f32 f502, f350, f355; +mul.f32 f503, f502, 0fBF167918; +sub.f32 f504, f503, f501; +sub.f32 f505, f499, f504; +add.f32 f506, f504, f499; +mul.f32 f507, f489, 0f3F4F1BBD; +sub.f32 f508, f125, f507; 
+fma.rn.f32 f509, f491, 0f3E9E377A, f508; +mul.f32 f510, f500, 0f3F167918; +mul.f32 f511, f502, 0f3F737871; +sub.f32 f512, f511, f510; +sub.f32 f513, f509, f512; +add.f32 f514, f512, f509; +fma.rn.f32 f515, f493, 0f3E9E377A, f143; +mul.f32 f516, f495, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f343, f358; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f348, f353; +mul.f32 f521, f520, 0fBF167918; +sub.f32 f522, f521, f519; +add.f32 f523, f522, f517; +sub.f32 f524, f517, f522; +mul.f32 f525, f493, 0f3F4F1BBD; +sub.f32 f526, f143, f525; +fma.rn.f32 f527, f495, 0f3E9E377A, f526; +mul.f32 f528, f518, 0f3F167918; +mul.f32 f529, f520, 0f3F737871; +sub.f32 f530, f529, f528; +add.f32 f531, f530, f527; +sub.f32 f532, f527, f530; +add.f32 f533, f363, f378; +add.f32 f534, f126, f533; +add.f32 f535, f368, f373; +add.f32 f536, f535, f534; +add.f32 f537, f365, f380; +add.f32 f538, f144, f537; +add.f32 f539, f370, f375; +add.f32 f540, f539, f538; +fma.rn.f32 f541, f533, 0f3E9E377A, f126; +mul.f32 f542, f535, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f365, f380; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f370, f375; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +sub.f32 f549, f543, f548; +add.f32 f550, f548, f543; +mul.f32 f551, f533, 0f3F4F1BBD; +sub.f32 f552, f126, f551; +fma.rn.f32 f553, f535, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +sub.f32 f557, f553, f556; +add.f32 f558, f556, f553; +fma.rn.f32 f559, f537, 0f3E9E377A, f144; +mul.f32 f560, f539, 0f3F4F1BBD; +sub.f32 f561, f559, f560; +sub.f32 f562, f363, f378; +mul.f32 f563, f562, 0f3F737871; +sub.f32 f564, f368, f373; +mul.f32 f565, f564, 0fBF167918; +sub.f32 f566, f565, f563; +add.f32 f567, f566, f561; +sub.f32 f568, f561, f566; +mul.f32 f569, f537, 0f3F4F1BBD; +sub.f32 f570, f144, f569; +fma.rn.f32 f571, f539, 0f3E9E377A, f570; +mul.f32 f572, f562, 0f3F167918; +mul.f32 f573, f564, 0f3F737871; +sub.f32 f574, 
f573, f572; +add.f32 f575, f574, f571; +sub.f32 f576, f571, f574; +add.f32 f577, f383, f398; +add.f32 f578, f118, f577; +add.f32 f579, f388, f393; +add.f32 f580, f579, f578; +add.f32 f581, f385, f400; +add.f32 f582, f136, f581; +add.f32 f583, f390, f395; +add.f32 f584, f583, f582; +fma.rn.f32 f585, f577, 0f3E9E377A, f118; +mul.f32 f586, f579, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f385, f400; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f390, f395; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +sub.f32 f593, f587, f592; +add.f32 f594, f592, f587; +mul.f32 f595, f577, 0f3F4F1BBD; +sub.f32 f596, f118, f595; +fma.rn.f32 f597, f579, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +sub.f32 f601, f597, f600; +add.f32 f602, f600, f597; +fma.rn.f32 f603, f581, 0f3E9E377A, f136; +mul.f32 f604, f583, 0f3F4F1BBD; +sub.f32 f605, f603, f604; +sub.f32 f606, f383, f398; +mul.f32 f607, f606, 0f3F737871; +sub.f32 f608, f388, f393; +mul.f32 f609, f608, 0fBF167918; +sub.f32 f610, f609, f607; +add.f32 f611, f610, f605; +sub.f32 f612, f605, f610; +mul.f32 f613, f581, 0f3F4F1BBD; +sub.f32 f614, f136, f613; +fma.rn.f32 f615, f583, 0f3E9E377A, f614; +mul.f32 f616, f606, 0f3F167918; +mul.f32 f617, f608, 0f3F737871; +sub.f32 f618, f617, f616; +add.f32 f619, f618, f615; +sub.f32 f620, f615, f618; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f621, f622}, [rd6]; +mul.f32 f625, f621, f448; +mul.f32 f626, f622, f452; +sub.f32 f627, f625, f626; +mul.f32 f628, f621, f452; +fma.rn.f32 f629, f622, f448, f628; +mul.f32 f630, f621, f621; +mul.f32 f631, f622, f622; +sub.f32 f632, f630, f631; +mul.f32 f633, f622, f621; +fma.rn.f32 f634, f622, f621, f633; +mul.f32 f635, f632, f492; +mul.f32 f636, f634, f496; +sub.f32 f637, f635, f636; +mul.f32 
f638, f632, f496; +fma.rn.f32 f639, f634, f492, f638; +mul.f32 f640, f621, f632; +mul.f32 f641, f622, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f621, f634; +fma.rn.f32 f644, f622, f632, f643; +mul.f32 f645, f642, f536; +mul.f32 f646, f644, f540; +sub.f32 f647, f645, f646; +mul.f32 f648, f642, f540; +fma.rn.f32 f649, f644, f536, f648; +mul.f32 f650, f621, f642; +mul.f32 f651, f622, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f621, f644; +fma.rn.f32 f654, f622, f642, f653; +mul.f32 f655, f652, f580; +mul.f32 f656, f654, f584; +sub.f32 f657, f655, f656; +mul.f32 f658, f652, f584; +fma.rn.f32 f659, f654, f580, f658; +mul.f32 f660, f621, f652; +mul.f32 f661, f622, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f621, f654; +fma.rn.f32 f664, f622, f652, f663; +mul.f32 f665, f662, f417; +mul.f32 f666, f664, f435; +sub.f32 f667, f665, f666; +mul.f32 f668, f662, f435; +fma.rn.f32 f669, f664, f417, f668; +mul.f32 f670, f621, f662; +mul.f32 f671, f622, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f621, f664; +fma.rn.f32 f674, f622, f662, f673; +mul.f32 f675, f672, f461; +mul.f32 f676, f674, f479; +sub.f32 f677, f675, f676; +mul.f32 f678, f672, f479; +fma.rn.f32 f679, f674, f461, f678; +mul.f32 f680, f621, f672; +mul.f32 f681, f622, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f621, f674; +fma.rn.f32 f684, f622, f672, f683; +mul.f32 f685, f682, f505; +mul.f32 f686, f684, f523; +sub.f32 f687, f685, f686; +mul.f32 f688, f682, f523; +fma.rn.f32 f689, f684, f505, f688; +mul.f32 f690, f621, f682; +mul.f32 f691, f622, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f621, f684; +fma.rn.f32 f694, f622, f682, f693; +mul.f32 f695, f692, f549; +mul.f32 f696, f694, f567; +sub.f32 f697, f695, f696; +mul.f32 f698, f692, f567; +fma.rn.f32 f699, f694, f549, f698; +mul.f32 f700, f621, f692; +mul.f32 f701, f622, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f621, f694; +fma.rn.f32 f704, f622, f692, f703; +mul.f32 f705, f702, f593; +mul.f32 f706, f704, f611; +sub.f32 f707, 
f705, f706; +mul.f32 f708, f702, f611; +fma.rn.f32 f709, f704, f593, f708; +mul.f32 f710, f621, f702; +mul.f32 f711, f622, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f621, f704; +fma.rn.f32 f714, f622, f702, f713; +mul.f32 f715, f712, f425; +mul.f32 f716, f714, f443; +sub.f32 f717, f715, f716; +mul.f32 f718, f712, f443; +fma.rn.f32 f719, f714, f425, f718; +mul.f32 f720, f621, f712; +mul.f32 f721, f622, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f621, f714; +fma.rn.f32 f724, f622, f712, f723; +mul.f32 f725, f722, f469; +mul.f32 f726, f724, f487; +sub.f32 f727, f725, f726; +mul.f32 f728, f722, f487; +fma.rn.f32 f729, f724, f469, f728; +mul.f32 f730, f621, f722; +mul.f32 f731, f622, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f621, f724; +fma.rn.f32 f734, f622, f722, f733; +mul.f32 f735, f732, f513; +mul.f32 f736, f734, f531; +sub.f32 f737, f735, f736; +mul.f32 f738, f732, f531; +fma.rn.f32 f739, f734, f513, f738; +mul.f32 f740, f621, f732; +mul.f32 f741, f622, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f621, f734; +fma.rn.f32 f744, f622, f732, f743; +mul.f32 f745, f742, f557; +mul.f32 f746, f744, f575; +sub.f32 f747, f745, f746; +mul.f32 f748, f742, f575; +fma.rn.f32 f749, f744, f557, f748; +mul.f32 f750, f621, f742; +mul.f32 f751, f622, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f621, f744; +fma.rn.f32 f754, f622, f742, f753; +mul.f32 f755, f752, f601; +mul.f32 f756, f754, f619; +sub.f32 f757, f755, f756; +mul.f32 f758, f752, f619; +fma.rn.f32 f759, f754, f601, f758; +mul.f32 f760, f621, f752; +mul.f32 f761, f622, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f621, f754; +fma.rn.f32 f764, f622, f752, f763; +mul.f32 f765, f762, f426; +mul.f32 f766, f764, f444; +sub.f32 f767, f765, f766; +mul.f32 f768, f762, f444; +fma.rn.f32 f769, f764, f426, f768; +mul.f32 f770, f621, f762; +mul.f32 f771, f622, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f621, f764; +fma.rn.f32 f774, f622, f762, f773; +mul.f32 f775, f772, f470; +mul.f32 f776, f774, 
f488; +sub.f32 f777, f775, f776; +mul.f32 f778, f772, f488; +fma.rn.f32 f779, f774, f470, f778; +mul.f32 f780, f621, f772; +mul.f32 f781, f622, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f621, f774; +fma.rn.f32 f784, f622, f772, f783; +mul.f32 f785, f782, f514; +mul.f32 f786, f784, f532; +sub.f32 f787, f785, f786; +mul.f32 f788, f782, f532; +fma.rn.f32 f789, f784, f514, f788; +mul.f32 f790, f621, f782; +mul.f32 f791, f622, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f621, f784; +fma.rn.f32 f794, f622, f782, f793; +mul.f32 f795, f792, f558; +mul.f32 f796, f794, f576; +sub.f32 f797, f795, f796; +mul.f32 f798, f792, f576; +fma.rn.f32 f799, f794, f558, f798; +mul.f32 f800, f621, f792; +mul.f32 f801, f622, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f621, f794; +fma.rn.f32 f804, f622, f792, f803; +mul.f32 f805, f802, f602; +mul.f32 f806, f804, f620; +sub.f32 f807, f805, f806; +mul.f32 f808, f802, f620; +fma.rn.f32 f809, f804, f602, f808; +mul.f32 f810, f621, f802; +mul.f32 f811, f622, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f621, f804; +fma.rn.f32 f814, f622, f802, f813; +mul.f32 f815, f812, f418; +mul.f32 f816, f814, f436; +sub.f32 f817, f815, f816; +mul.f32 f818, f812, f436; +fma.rn.f32 f819, f814, f418, f818; +mul.f32 f820, f621, f812; +mul.f32 f821, f622, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f621, f814; +fma.rn.f32 f824, f622, f812, f823; +mul.f32 f825, f822, f462; +mul.f32 f826, f824, f480; +sub.f32 f827, f825, f826; +mul.f32 f828, f822, f480; +fma.rn.f32 f829, f824, f462, f828; +mul.f32 f830, f621, f822; +mul.f32 f831, f622, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f621, f824; +fma.rn.f32 f834, f622, f822, f833; +mul.f32 f835, f832, f506; +mul.f32 f836, f834, f524; +sub.f32 f837, f835, f836; +mul.f32 f838, f832, f524; +fma.rn.f32 f839, f834, f506, f838; +mul.f32 f840, f621, f832; +mul.f32 f841, f622, f834; +sub.f32 f842, f840, f841; +mul.f32 f843, f621, f834; +fma.rn.f32 f844, f622, f832, f843; +mul.f32 f845, f842, f550; 
+mul.f32 f846, f844, f568; +sub.f32 f847, f845, f846; +mul.f32 f848, f842, f568; +fma.rn.f32 f849, f844, f550, f848; +mul.f32 f850, f621, f842; +mul.f32 f851, f622, f844; +sub.f32 f852, f850, f851; +mul.f32 f853, f621, f844; +fma.rn.f32 f854, f622, f842, f853; +mul.f32 f855, f852, f594; +mul.f32 f856, f854, f612; +sub.f32 f857, f855, f856; +mul.f32 f858, f852, f612; +fma.rn.f32 f859, f854, f594, f858; +mad.lo.s32 r8, r5, 500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f404; +st.shared.f32 [r9+4], f627; +st.shared.f32 [r9+8], f637; +st.shared.f32 [r9+12], f647; +st.shared.f32 [r9+16], f657; +st.shared.f32 [r9+20], f667; +st.shared.f32 [r9+24], f677; +st.shared.f32 [r9+28], f687; +st.shared.f32 [r9+32], f697; +st.shared.f32 [r9+36], f707; +st.shared.f32 [r9+40], f717; +st.shared.f32 [r9+44], f727; +st.shared.f32 [r9+48], f737; +st.shared.f32 [r9+52], f747; +st.shared.f32 [r9+56], f757; +st.shared.f32 [r9+60], f767; +st.shared.f32 [r9+64], f777; +st.shared.f32 [r9+68], f787; +st.shared.f32 [r9+72], f797; +st.shared.f32 [r9+76], f807; +st.shared.f32 [r9+80], f817; +st.shared.f32 [r9+84], f827; +st.shared.f32 [r9+88], f837; +st.shared.f32 [r9+92], f847; +st.shared.f32 [r9+96], f857; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f860, [r10]; +ld.shared.f32 f861, [r10+20]; +ld.shared.f32 f862, [r10+40]; +ld.shared.f32 f863, [r10+60]; +ld.shared.f32 f864, [r10+80]; +ld.shared.f32 f865, [r10+100]; +ld.shared.f32 f866, [r10+120]; +ld.shared.f32 f867, [r10+140]; +ld.shared.f32 f868, [r10+160]; +ld.shared.f32 f869, [r10+180]; +ld.shared.f32 f870, [r10+200]; +ld.shared.f32 f871, [r10+220]; +ld.shared.f32 f872, [r10+240]; +ld.shared.f32 f873, [r10+260]; +ld.shared.f32 f874, [r10+280]; +ld.shared.f32 f875, [r10+300]; +ld.shared.f32 f876, [r10+320]; +ld.shared.f32 f877, [r10+340]; +ld.shared.f32 f878, [r10+360]; +ld.shared.f32 f879, [r10+380]; +ld.shared.f32 f880, [r10+400]; +ld.shared.f32 f881, [r10+420]; +ld.shared.f32 f882, [r10+440]; 
+ld.shared.f32 f883, [r10+460]; +ld.shared.f32 f884, [r10+480]; +barrier.sync 0; +st.shared.f32 [r9], f408; +st.shared.f32 [r9+4], f629; +st.shared.f32 [r9+8], f639; +st.shared.f32 [r9+12], f649; +st.shared.f32 [r9+16], f659; +st.shared.f32 [r9+20], f669; +st.shared.f32 [r9+24], f679; +st.shared.f32 [r9+28], f689; +st.shared.f32 [r9+32], f699; +st.shared.f32 [r9+36], f709; +st.shared.f32 [r9+40], f719; +st.shared.f32 [r9+44], f729; +st.shared.f32 [r9+48], f739; +st.shared.f32 [r9+52], f749; +st.shared.f32 [r9+56], f759; +st.shared.f32 [r9+60], f769; +st.shared.f32 [r9+64], f779; +st.shared.f32 [r9+68], f789; +st.shared.f32 [r9+72], f799; +st.shared.f32 [r9+76], f809; +st.shared.f32 [r9+80], f819; +st.shared.f32 [r9+84], f829; +st.shared.f32 [r9+88], f839; +st.shared.f32 [r9+92], f849; +st.shared.f32 [r9+96], f859; +barrier.sync 0; +ld.shared.f32 f885, [r10]; +ld.shared.f32 f886, [r10+20]; +ld.shared.f32 f887, [r10+40]; +ld.shared.f32 f888, [r10+60]; +ld.shared.f32 f889, [r10+80]; +ld.shared.f32 f890, [r10+100]; +ld.shared.f32 f891, [r10+120]; +ld.shared.f32 f892, [r10+140]; +ld.shared.f32 f893, [r10+160]; +ld.shared.f32 f894, [r10+180]; +ld.shared.f32 f895, [r10+200]; +ld.shared.f32 f896, [r10+220]; +ld.shared.f32 f897, [r10+240]; +ld.shared.f32 f898, [r10+260]; +ld.shared.f32 f899, [r10+280]; +ld.shared.f32 f900, [r10+300]; +ld.shared.f32 f901, [r10+320]; +ld.shared.f32 f902, [r10+340]; +ld.shared.f32 f903, [r10+360]; +ld.shared.f32 f904, [r10+380]; +ld.shared.f32 f905, [r10+400]; +ld.shared.f32 f906, [r10+420]; +ld.shared.f32 f907, [r10+440]; +ld.shared.f32 f908, [r10+460]; +ld.shared.f32 f909, [r10+480]; +add.f32 f910, f865, f880; +add.f32 f911, f860, f910; +add.f32 f912, f870, f875; +add.f32 f913, f890, f905; +add.f32 f914, f885, f913; +add.f32 f915, f895, f900; +fma.rn.f32 f916, f910, 0f3E9E377A, f860; +mul.f32 f917, f912, 0f3F4F1BBD; +sub.f32 f918, f916, f917; +sub.f32 f919, f890, f905; +mul.f32 f920, f919, 0f3F737871; +sub.f32 f921, f895, f900; +mul.f32 
f922, f921, 0fBF167918; +sub.f32 f923, f922, f920; +mul.f32 f924, f910, 0f3F4F1BBD; +sub.f32 f925, f860, f924; +fma.rn.f32 f926, f912, 0f3E9E377A, f925; +mul.f32 f927, f919, 0f3F167918; +mul.f32 f928, f921, 0f3F737871; +sub.f32 f929, f928, f927; +fma.rn.f32 f930, f913, 0f3E9E377A, f885; +mul.f32 f931, f915, 0f3F4F1BBD; +sub.f32 f932, f930, f931; +sub.f32 f933, f865, f880; +mul.f32 f934, f933, 0f3F737871; +sub.f32 f935, f870, f875; +mul.f32 f936, f935, 0fBF167918; +sub.f32 f937, f936, f934; +mul.f32 f938, f913, 0f3F4F1BBD; +sub.f32 f939, f885, f938; +fma.rn.f32 f940, f915, 0f3E9E377A, f939; +mul.f32 f941, f933, 0f3F167918; +mul.f32 f942, f935, 0f3F737871; +sub.f32 f943, f942, f941; +add.f32 f944, f866, f881; +add.f32 f945, f861, f944; +add.f32 f946, f871, f876; +add.f32 f947, f891, f906; +add.f32 f948, f886, f947; +add.f32 f949, f896, f901; +fma.rn.f32 f950, f944, 0f3E9E377A, f861; +mul.f32 f951, f946, 0f3F4F1BBD; +sub.f32 f952, f950, f951; +sub.f32 f953, f891, f906; +mul.f32 f954, f953, 0f3F737871; +sub.f32 f955, f896, f901; +mul.f32 f956, f955, 0fBF167918; +sub.f32 f957, f956, f954; +mul.f32 f958, f944, 0f3F4F1BBD; +sub.f32 f959, f861, f958; +fma.rn.f32 f960, f946, 0f3E9E377A, f959; +mul.f32 f961, f953, 0f3F167918; +mul.f32 f962, f955, 0f3F737871; +sub.f32 f963, f962, f961; +fma.rn.f32 f964, f947, 0f3E9E377A, f886; +mul.f32 f965, f949, 0f3F4F1BBD; +sub.f32 f966, f964, f965; +sub.f32 f967, f866, f881; +mul.f32 f968, f967, 0f3F737871; +sub.f32 f969, f871, f876; +mul.f32 f970, f969, 0fBF167918; +sub.f32 f971, f970, f968; +mul.f32 f972, f947, 0f3F4F1BBD; +sub.f32 f973, f886, f972; +fma.rn.f32 f974, f949, 0f3E9E377A, f973; +mul.f32 f975, f967, 0f3F167918; +mul.f32 f976, f969, 0f3F737871; +sub.f32 f977, f976, f975; +add.f32 f978, f867, f882; +add.f32 f979, f862, f978; +add.f32 f980, f872, f877; +add.f32 f981, f892, f907; +add.f32 f982, f887, f981; +add.f32 f983, f897, f902; +fma.rn.f32 f984, f978, 0f3E9E377A, f862; +mul.f32 f985, f980, 0f3F4F1BBD; +sub.f32 f986, f984, 
f985; +sub.f32 f987, f892, f907; +mul.f32 f988, f987, 0f3F737871; +sub.f32 f989, f897, f902; +mul.f32 f990, f989, 0fBF167918; +sub.f32 f991, f990, f988; +mul.f32 f992, f978, 0f3F4F1BBD; +sub.f32 f993, f862, f992; +fma.rn.f32 f994, f980, 0f3E9E377A, f993; +mul.f32 f995, f987, 0f3F167918; +mul.f32 f996, f989, 0f3F737871; +sub.f32 f997, f996, f995; +fma.rn.f32 f998, f981, 0f3E9E377A, f887; +mul.f32 f999, f983, 0f3F4F1BBD; +sub.f32 f1000, f998, f999; +sub.f32 f1001, f867, f882; +mul.f32 f1002, f1001, 0f3F737871; +sub.f32 f1003, f872, f877; +mul.f32 f1004, f1003, 0fBF167918; +sub.f32 f1005, f1004, f1002; +mul.f32 f1006, f981, 0f3F4F1BBD; +sub.f32 f1007, f887, f1006; +fma.rn.f32 f1008, f983, 0f3E9E377A, f1007; +mul.f32 f1009, f1001, 0f3F167918; +mul.f32 f1010, f1003, 0f3F737871; +sub.f32 f1011, f1010, f1009; +add.f32 f1012, f868, f883; +add.f32 f1013, f863, f1012; +add.f32 f1014, f873, f878; +add.f32 f1015, f893, f908; +add.f32 f1016, f888, f1015; +add.f32 f1017, f898, f903; +fma.rn.f32 f1018, f1012, 0f3E9E377A, f863; +mul.f32 f1019, f1014, 0f3F4F1BBD; +sub.f32 f1020, f1018, f1019; +sub.f32 f1021, f893, f908; +mul.f32 f1022, f1021, 0f3F737871; +sub.f32 f1023, f898, f903; +mul.f32 f1024, f1023, 0fBF167918; +sub.f32 f1025, f1024, f1022; +mul.f32 f1026, f1012, 0f3F4F1BBD; +sub.f32 f1027, f863, f1026; +fma.rn.f32 f1028, f1014, 0f3E9E377A, f1027; +mul.f32 f1029, f1021, 0f3F167918; +mul.f32 f1030, f1023, 0f3F737871; +sub.f32 f1031, f1030, f1029; +fma.rn.f32 f1032, f1015, 0f3E9E377A, f888; +mul.f32 f1033, f1017, 0f3F4F1BBD; +sub.f32 f1034, f1032, f1033; +sub.f32 f1035, f868, f883; +mul.f32 f1036, f1035, 0f3F737871; +sub.f32 f1037, f873, f878; +mul.f32 f1038, f1037, 0fBF167918; +sub.f32 f1039, f1038, f1036; +mul.f32 f1040, f1015, 0f3F4F1BBD; +sub.f32 f1041, f888, f1040; +fma.rn.f32 f1042, f1017, 0f3E9E377A, f1041; +mul.f32 f1043, f1035, 0f3F167918; +mul.f32 f1044, f1037, 0f3F737871; +sub.f32 f1045, f1044, f1043; +add.f32 f1046, f869, f884; +add.f32 f1047, f864, f1046; +add.f32 
f1048, f874, f879; +add.f32 f1049, f894, f909; +add.f32 f1050, f889, f1049; +add.f32 f1051, f899, f904; +fma.rn.f32 f1052, f1046, 0f3E9E377A, f864; +mul.f32 f1053, f1048, 0f3F4F1BBD; +sub.f32 f1054, f1052, f1053; +sub.f32 f1055, f894, f909; +mul.f32 f1056, f1055, 0f3F737871; +sub.f32 f1057, f899, f904; +mul.f32 f1058, f1057, 0fBF167918; +sub.f32 f1059, f1058, f1056; +mul.f32 f1060, f1046, 0f3F4F1BBD; +sub.f32 f1061, f864, f1060; +fma.rn.f32 f1062, f1048, 0f3E9E377A, f1061; +mul.f32 f1063, f1055, 0f3F167918; +mul.f32 f1064, f1057, 0f3F737871; +sub.f32 f1065, f1064, f1063; +fma.rn.f32 f1066, f1049, 0f3E9E377A, f889; +mul.f32 f1067, f1051, 0f3F4F1BBD; +sub.f32 f1068, f1066, f1067; +sub.f32 f1069, f869, f884; +mul.f32 f1070, f1069, 0f3F737871; +sub.f32 f1071, f874, f879; +mul.f32 f1072, f1071, 0fBF167918; +sub.f32 f1073, f1072, f1070; +mul.f32 f1074, f1049, 0f3F4F1BBD; +sub.f32 f1075, f889, f1074; +fma.rn.f32 f1076, f1051, 0f3E9E377A, f1075; +mul.f32 f1077, f1069, 0f3F167918; +mul.f32 f1078, f1071, 0f3F737871; +sub.f32 f1079, f1078, f1077; +add.f32 %0, f912, f911; +add.f32 %1, f915, f914; +add.f32 %2, f946, f945; +add.f32 %3, f949, f948; +add.f32 %4, f980, f979; +add.f32 %5, f983, f982; +add.f32 %6, f1014, f1013; +add.f32 %7, f1017, f1016; +add.f32 %8, f1048, f1047; +add.f32 %9, f1051, f1050; +add.f32 %11, f937, f932; +sub.f32 %10, f918, f923; +add.f32 %13, f971, f966; +sub.f32 %12, f952, f957; +add.f32 %15, f1005, f1000; +sub.f32 %14, f986, f991; +add.f32 %17, f1039, f1034; +sub.f32 %16, f1020, f1025; +add.f32 %19, f1073, f1068; +sub.f32 %18, f1054, f1059; +sub.f32 %20, f926, f929; +add.f32 %21, f943, f940; +sub.f32 %22, f960, f963; +add.f32 %23, f977, f974; +sub.f32 %24, f994, f997; +add.f32 %25, f1011, f1008; +sub.f32 %26, f1028, f1031; +add.f32 %27, f1045, f1042; +sub.f32 %28, f1062, f1065; +add.f32 %29, f1079, f1076; +add.f32 %30, f929, f926; +sub.f32 %31, f940, f943; +add.f32 %32, f963, f960; +sub.f32 %33, f974, f977; +add.f32 %34, f997, f994; +sub.f32 %35, 
f1008, f1011; +add.f32 %36, f1031, f1028; +sub.f32 %37, f1042, f1045; +add.f32 %38, f1065, f1062; +sub.f32 %39, f1076, f1079; +sub.f32 %41, f932, f937; +add.f32 %40, f923, f918; +sub.f32 %43, f966, f971; +add.f32 %42, f957, f952; +sub.f32 %45, f1000, f1005; +add.f32 %44, f991, f986; +sub.f32 %47, f1034, f1039; +add.f32 %46, f1025, f1020; +sub.f32 %49, f1068, f1073; +add.f32 %48, f1059, f1054; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), 
"f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<160, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<271>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 1000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %15, %23; +add.f32 f22, %13, f21; +add.f32 f23, %18, %21; +add.f32 f24, %17, %24; +add.f32 f25, %14, f24; +add.f32 f26, %20, %22; +fma.rn.f32 f27, f21, 0f3E9E377A, %13; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %17, %24; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %20, %22; +mul.f32 f33, f32, 0fBF167918; +sub.f32 f34, f33, f31; +sub.f32 f35, f29, f34; +add.f32 f36, f34, f29; +mul.f32 f37, f21, 0f3F4F1BBD; +sub.f32 f38, %13, f37; +fma.rn.f32 f39, f23, 0f3E9E377A, f38; +mul.f32 f40, f30, 0f3F167918; +mul.f32 f41, f32, 0f3F737871; +sub.f32 f42, f41, f40; +sub.f32 f43, f39, f42; +add.f32 f44, f42, f39; +fma.rn.f32 f45, f24, 0f3E9E377A, %14; +mul.f32 f46, f26, 0f3F4F1BBD; +sub.f32 f47, f45, f46; +sub.f32 f48, %15, %23; +mul.f32 f49, f48, 0f3F737871; +sub.f32 f50, %18, %21; +mul.f32 f51, f50, 0fBF167918; +sub.f32 f52, f51, f49; +add.f32 f53, f52, f47; +sub.f32 f54, f47, f52; +mul.f32 f55, f24, 0f3F4F1BBD; +sub.f32 f56, %14, f55; +fma.rn.f32 f57, f26, 0f3E9E377A, f56; +mul.f32 f58, f48, 0f3F167918; +mul.f32 f59, f50, 0f3F737871; +sub.f32 f60, f59, f58; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; 
+sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f63, f35; +mul.f32 f68, f64, f53; +mul.f32 f69, f63, f53; +mul.f32 f70, f63, f63; +mul.f32 f71, f64, f64; +sub.f32 f72, f70, f71; +mul.f32 f73, f64, f63; +fma.rn.f32 f74, f64, f63, f73; +mul.f32 f75, f72, f43; +mul.f32 f76, f74, f61; +mul.f32 f77, f72, f61; +mul.f32 f78, f63, f72; +mul.f32 f79, f64, f74; +sub.f32 f80, f78, f79; +mul.f32 f81, f63, f74; +fma.rn.f32 f82, f64, f72, f81; +mul.f32 f83, f80, f44; +mul.f32 f84, f82, f62; +mul.f32 f85, f80, f62; +mul.f32 f86, f63, f80; +mul.f32 f87, f64, f82; +sub.f32 f88, f86, f87; +mul.f32 f89, f63, f82; +fma.rn.f32 f90, f64, f80, f89; +mul.f32 f91, f88, f36; +mul.f32 f92, f90, f54; +mul.f32 f93, f88, f54; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f94, f26, f25; +add.f32 f95, f23, f22; +st.shared.v2.f32 [r9], {f95, f94}; +fma.rn.f32 f96, f64, f35, f69; +sub.f32 f97, f67, f68; +st.shared.v2.f32 [r9+8], {f97, f96}; +fma.rn.f32 f98, f74, f43, f77; +sub.f32 f99, f75, f76; +st.shared.v2.f32 [r9+16], {f99, f98}; +sub.f32 f100, f83, f84; +fma.rn.f32 f101, f82, f44, f85; +st.shared.v2.f32 [r9+24], {f100, f101}; +fma.rn.f32 f102, f90, f36, f93; +sub.f32 f103, f91, f92; +st.shared.v2.f32 [r9+32], {f103, f102}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f104, f105}, [r11]; +ld.shared.v2.f32 {f108, f109}, [r11+200]; +ld.shared.v2.f32 {f112, f113}, [r11+400]; +ld.shared.v2.f32 {f116, f117}, [r11+600]; +ld.shared.v2.f32 {f120, f121}, [r11+800]; +add.f32 f124, f108, f120; +add.f32 f125, f104, f124; +add.f32 f126, f112, f116; +add.f32 f127, f109, f121; +add.f32 f128, f105, f127; +add.f32 f129, f113, f117; +fma.rn.f32 f130, f124, 0f3E9E377A, f104; +mul.f32 f131, f126, 0f3F4F1BBD; +sub.f32 f132, f130, f131; +sub.f32 f133, f109, f121; +mul.f32 f134, f133, 0f3F737871; +sub.f32 f135, f113, f117; +mul.f32 f136, f135, 
0fBF167918; +sub.f32 f137, f136, f134; +sub.f32 f138, f132, f137; +add.f32 f139, f137, f132; +mul.f32 f140, f124, 0f3F4F1BBD; +sub.f32 f141, f104, f140; +fma.rn.f32 f142, f126, 0f3E9E377A, f141; +mul.f32 f143, f133, 0f3F167918; +mul.f32 f144, f135, 0f3F737871; +sub.f32 f145, f144, f143; +sub.f32 f146, f142, f145; +add.f32 f147, f145, f142; +fma.rn.f32 f148, f127, 0f3E9E377A, f105; +mul.f32 f149, f129, 0f3F4F1BBD; +sub.f32 f150, f148, f149; +sub.f32 f151, f108, f120; +mul.f32 f152, f151, 0f3F737871; +sub.f32 f153, f112, f116; +mul.f32 f154, f153, 0fBF167918; +sub.f32 f155, f154, f152; +add.f32 f156, f155, f150; +sub.f32 f157, f150, f155; +mul.f32 f158, f127, 0f3F4F1BBD; +sub.f32 f159, f105, f158; +fma.rn.f32 f160, f129, 0f3E9E377A, f159; +mul.f32 f161, f151, 0f3F167918; +mul.f32 f162, f153, 0f3F737871; +sub.f32 f163, f162, f161; +add.f32 f164, f163, f160; +sub.f32 f165, f160, f163; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f166, f167}, [rd11]; +mul.f32 f170, f166, f138; +mul.f32 f171, f167, f156; +mul.f32 f172, f166, f156; +mul.f32 f173, f166, f166; +mul.f32 f174, f167, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f167, f166; +fma.rn.f32 f177, f167, f166, f176; +mul.f32 f178, f175, f146; +mul.f32 f179, f177, f164; +mul.f32 f180, f175, f164; +mul.f32 f181, f166, f175; +mul.f32 f182, f167, f177; +sub.f32 f183, f181, f182; +mul.f32 f184, f166, f177; +fma.rn.f32 f185, f167, f175, f184; +mul.f32 f186, f183, f147; +mul.f32 f187, f185, f165; +mul.f32 f188, f183, f165; +mul.f32 f189, f166, f183; +mul.f32 f190, f167, f185; +sub.f32 f191, f189, f190; +mul.f32 f192, f166, f185; +fma.rn.f32 f193, f167, f183, f192; +mul.f32 f194, f191, f139; +mul.f32 f195, f193, f157; +mul.f32 f196, f191, f157; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 
f197, f129, f128; +add.f32 f198, f126, f125; +st.shared.v2.f32 [r17], {f198, f197}; +fma.rn.f32 f199, f167, f138, f172; +sub.f32 f200, f170, f171; +st.shared.v2.f32 [r17+40], {f200, f199}; +fma.rn.f32 f201, f177, f146, f180; +sub.f32 f202, f178, f179; +st.shared.v2.f32 [r17+80], {f202, f201}; +fma.rn.f32 f203, f185, f147, f188; +sub.f32 f204, f186, f187; +st.shared.v2.f32 [r17+120], {f204, f203}; +fma.rn.f32 f205, f193, f139, f196; +sub.f32 f206, f194, f195; +st.shared.v2.f32 [r17+160], {f206, f205}; +barrier.sync 0; +ld.shared.v2.f32 {f207, f208}, [r11]; +ld.shared.v2.f32 {f211, f212}, [r11+200]; +ld.shared.v2.f32 {f215, f216}, [r11+400]; +ld.shared.v2.f32 {f219, f220}, [r11+600]; +ld.shared.v2.f32 {f223, f224}, [r11+800]; +add.f32 f227, f211, f223; +add.f32 f228, f207, f227; +add.f32 f229, f215, f219; +add.f32 f230, f212, f224; +add.f32 f231, f208, f230; +add.f32 f232, f216, f220; +fma.rn.f32 f233, f227, 0f3E9E377A, f207; +mul.f32 f234, f229, 0f3F4F1BBD; +sub.f32 f235, f233, f234; +sub.f32 f236, f212, f224; +mul.f32 f237, f236, 0f3F737871; +sub.f32 f238, f216, f220; +mul.f32 f239, f238, 0fBF167918; +sub.f32 f240, f239, f237; +mul.f32 f241, f227, 0f3F4F1BBD; +sub.f32 f242, f207, f241; +fma.rn.f32 f243, f229, 0f3E9E377A, f242; +mul.f32 f244, f236, 0f3F167918; +mul.f32 f245, f238, 0f3F737871; +sub.f32 f246, f245, f244; +fma.rn.f32 f247, f230, 0f3E9E377A, f208; +mul.f32 f248, f232, 0f3F4F1BBD; +sub.f32 f249, f247, f248; +sub.f32 f250, f211, f223; +mul.f32 f251, f250, 0f3F737871; +sub.f32 f252, f215, f219; +mul.f32 f253, f252, 0fBF167918; +sub.f32 f254, f253, f251; +mul.f32 f255, f230, 0f3F4F1BBD; +sub.f32 f256, f208, f255; +fma.rn.f32 f257, f232, 0f3E9E377A, f256; +mul.f32 f258, f250, 0f3F167918; +mul.f32 f259, f252, 0f3F737871; +sub.f32 f260, f259, f258; +add.f32 %1, f232, f231; +add.f32 %0, f229, f228; +add.f32 %3, f254, f249; +sub.f32 %2, f235, f240; +add.f32 %5, f260, f257; +sub.f32 %4, f243, f246; +sub.f32 %7, f257, f260; +add.f32 %6, f246, f243; +sub.f32 %9, 
f249, f254; +add.f32 %8, f240, f235; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<161, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<251>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %15, %23; +add.f32 f22, %13, f21; +add.f32 f23, %18, %21; +add.f32 f24, f23, f22; +add.f32 f25, %17, %24; +add.f32 f26, %14, f25; +add.f32 f27, %20, %22; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %13; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %17, %24; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %20, %22; +mul.f32 f35, f34, 0fBF167918; +sub.f32 f36, f35, f33; +sub.f32 f37, f31, f36; +add.f32 f38, f36, f31; +mul.f32 f39, f21, 0f3F4F1BBD; +sub.f32 f40, %13, f39; +fma.rn.f32 f41, f23, 0f3E9E377A, f40; +mul.f32 f42, f32, 0f3F167918; +mul.f32 f43, f34, 0f3F737871; +sub.f32 f44, f43, f42; +sub.f32 f45, f41, f44; +add.f32 f46, f44, f41; +fma.rn.f32 f47, f25, 0f3E9E377A, %14; +mul.f32 f48, f27, 0f3F4F1BBD; +sub.f32 f49, f47, f48; +sub.f32 f50, %15, %23; +mul.f32 f51, f50, 0f3F737871; +sub.f32 f52, %18, %21; +mul.f32 f53, f52, 0fBF167918; +sub.f32 f54, f53, f51; +add.f32 f55, f54, f49; +sub.f32 f56, f49, f54; +mul.f32 f57, f25, 0f3F4F1BBD; +sub.f32 f58, %14, f57; +fma.rn.f32 f59, f27, 0f3E9E377A, f58; +mul.f32 f60, f50, 0f3F167918; +mul.f32 f61, f52, 0f3F737871; +sub.f32 f62, f61, f60; +add.f32 f63, f62, f59; +sub.f32 f64, f59, f62; +mul.wide.u32 rd2, r4, 
1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f65, f66}, [rd6]; +mul.f32 f69, f65, f37; +mul.f32 f70, f66, f55; +sub.f32 f71, f69, f70; +mul.f32 f72, f65, f55; +fma.rn.f32 f73, f66, f37, f72; +mul.f32 f74, f65, f65; +mul.f32 f75, f66, f66; +sub.f32 f76, f74, f75; +mul.f32 f77, f66, f65; +fma.rn.f32 f78, f66, f65, f77; +mul.f32 f79, f76, f45; +mul.f32 f80, f78, f63; +sub.f32 f81, f79, f80; +mul.f32 f82, f76, f63; +fma.rn.f32 f83, f78, f45, f82; +mul.f32 f84, f65, f76; +mul.f32 f85, f66, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f65, f78; +fma.rn.f32 f88, f66, f76, f87; +mul.f32 f89, f86, f46; +mul.f32 f90, f88, f64; +sub.f32 f91, f89, f90; +mul.f32 f92, f86, f64; +fma.rn.f32 f93, f88, f46, f92; +mul.f32 f94, f65, f86; +mul.f32 f95, f66, f88; +sub.f32 f96, f94, f95; +mul.f32 f97, f65, f88; +fma.rn.f32 f98, f66, f86, f97; +mul.f32 f99, f96, f38; +mul.f32 f100, f98, f56; +sub.f32 f101, f99, f100; +mul.f32 f102, f96, f56; +fma.rn.f32 f103, f98, f38, f102; +mad.lo.s32 r8, r5, 500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f104, [r11]; +ld.shared.f32 f105, [r11+100]; +ld.shared.f32 f106, [r11+200]; +ld.shared.f32 f107, [r11+300]; +ld.shared.f32 f108, [r11+400]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f73; +st.shared.f32 [r9+8], f83; +st.shared.f32 [r9+12], f93; +st.shared.f32 [r9+16], f103; +barrier.sync 0; +ld.shared.f32 f109, [r11]; +ld.shared.f32 f110, [r11+100]; +ld.shared.f32 f111, [r11+200]; +ld.shared.f32 f112, [r11+300]; +ld.shared.f32 f113, [r11+400]; +add.f32 f114, f105, f108; +add.f32 f115, f104, f114; +add.f32 f116, f106, f107; +add.f32 f117, f116, f115; +add.f32 f118, 
f110, f113; +add.f32 f119, f109, f118; +add.f32 f120, f111, f112; +add.f32 f121, f120, f119; +fma.rn.f32 f122, f114, 0f3E9E377A, f104; +mul.f32 f123, f116, 0f3F4F1BBD; +sub.f32 f124, f122, f123; +sub.f32 f125, f110, f113; +mul.f32 f126, f125, 0f3F737871; +sub.f32 f127, f111, f112; +mul.f32 f128, f127, 0fBF167918; +sub.f32 f129, f128, f126; +sub.f32 f130, f124, f129; +add.f32 f131, f129, f124; +mul.f32 f132, f114, 0f3F4F1BBD; +sub.f32 f133, f104, f132; +fma.rn.f32 f134, f116, 0f3E9E377A, f133; +mul.f32 f135, f125, 0f3F167918; +mul.f32 f136, f127, 0f3F737871; +sub.f32 f137, f136, f135; +sub.f32 f138, f134, f137; +add.f32 f139, f137, f134; +fma.rn.f32 f140, f118, 0f3E9E377A, f109; +mul.f32 f141, f120, 0f3F4F1BBD; +sub.f32 f142, f140, f141; +sub.f32 f143, f105, f108; +mul.f32 f144, f143, 0f3F737871; +sub.f32 f145, f106, f107; +mul.f32 f146, f145, 0fBF167918; +sub.f32 f147, f146, f144; +add.f32 f148, f147, f142; +sub.f32 f149, f142, f147; +mul.f32 f150, f118, 0f3F4F1BBD; +sub.f32 f151, f109, f150; +fma.rn.f32 f152, f120, 0f3E9E377A, f151; +mul.f32 f153, f143, 0f3F167918; +mul.f32 f154, f145, 0f3F737871; +sub.f32 f155, f154, f153; +add.f32 f156, f155, f152; +sub.f32 f157, f152, f155; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f158, f159}, [rd11]; +mul.f32 f162, f158, f130; +mul.f32 f163, f159, f148; +sub.f32 f164, f162, f163; +mul.f32 f165, f158, f148; +fma.rn.f32 f166, f159, f130, f165; +mul.f32 f167, f158, f158; +mul.f32 f168, f159, f159; +sub.f32 f169, f167, f168; +mul.f32 f170, f159, f158; +fma.rn.f32 f171, f159, f158, f170; +mul.f32 f172, f169, f138; +mul.f32 f173, f171, f156; +sub.f32 f174, f172, f173; +mul.f32 f175, f169, f156; +fma.rn.f32 f176, f171, f138, f175; +mul.f32 f177, f158, f169; +mul.f32 f178, f159, f171; +sub.f32 f179, f177, f178; +mul.f32 f180, f158, f171; +fma.rn.f32 f181, 
f159, f169, f180; +mul.f32 f182, f179, f139; +mul.f32 f183, f181, f157; +sub.f32 f184, f182, f183; +mul.f32 f185, f179, f157; +fma.rn.f32 f186, f181, f139, f185; +mul.f32 f187, f158, f179; +mul.f32 f188, f159, f181; +sub.f32 f189, f187, f188; +mul.f32 f190, f158, f181; +fma.rn.f32 f191, f159, f179, f190; +mul.f32 f192, f189, f131; +mul.f32 f193, f191, f149; +sub.f32 f194, f192, f193; +mul.f32 f195, f189, f149; +fma.rn.f32 f196, f191, f131, f195; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f117; +st.shared.f32 [r17+20], f164; +st.shared.f32 [r17+40], f174; +st.shared.f32 [r17+60], f184; +st.shared.f32 [r17+80], f194; +barrier.sync 0; +ld.shared.f32 f197, [r11]; +ld.shared.f32 f198, [r11+100]; +ld.shared.f32 f199, [r11+200]; +ld.shared.f32 f200, [r11+300]; +ld.shared.f32 f201, [r11+400]; +barrier.sync 0; +st.shared.f32 [r17], f121; +st.shared.f32 [r17+20], f166; +st.shared.f32 [r17+40], f176; +st.shared.f32 [r17+60], f186; +st.shared.f32 [r17+80], f196; +barrier.sync 0; +ld.shared.f32 f202, [r11]; +ld.shared.f32 f203, [r11+100]; +ld.shared.f32 f204, [r11+200]; +ld.shared.f32 f205, [r11+300]; +ld.shared.f32 f206, [r11+400]; +add.f32 f207, f198, f201; +add.f32 f208, f197, f207; +add.f32 f209, f199, f200; +add.f32 f210, f203, f206; +add.f32 f211, f202, f210; +add.f32 f212, f204, f205; +fma.rn.f32 f213, f207, 0f3E9E377A, f197; +mul.f32 f214, f209, 0f3F4F1BBD; +sub.f32 f215, f213, f214; +sub.f32 f216, f203, f206; +mul.f32 f217, f216, 0f3F737871; +sub.f32 f218, f204, f205; +mul.f32 f219, f218, 0fBF167918; +sub.f32 f220, f219, f217; +mul.f32 f221, f207, 0f3F4F1BBD; +sub.f32 f222, f197, f221; +fma.rn.f32 f223, f209, 0f3E9E377A, f222; +mul.f32 f224, f216, 0f3F167918; +mul.f32 f225, f218, 0f3F737871; +sub.f32 f226, f225, f224; +fma.rn.f32 f227, f210, 0f3E9E377A, f202; +mul.f32 f228, f212, 0f3F4F1BBD; +sub.f32 f229, f227, f228; +sub.f32 f230, f198, f201; +mul.f32 f231, f230, 0f3F737871; +sub.f32 f232, f199, 
f200; +mul.f32 f233, f232, 0fBF167918; +sub.f32 f234, f233, f231; +mul.f32 f235, f210, 0f3F4F1BBD; +sub.f32 f236, f202, f235; +fma.rn.f32 f237, f212, 0f3E9E377A, f236; +mul.f32 f238, f230, 0f3F167918; +mul.f32 f239, f232, 0f3F737871; +sub.f32 f240, f239, f238; +add.f32 %0, f209, f208; +add.f32 %1, f212, f211; +add.f32 %3, f234, f229; +sub.f32 %2, f215, f220; +sub.f32 %4, f223, f226; +add.f32 %5, f240, f237; +add.f32 %6, f226, f223; +sub.f32 %7, f237, f240; +sub.f32 %9, f229, f234; +add.f32 %8, f220, f215; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..7ba04335554b1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp32_inv.hpp.inc @@ -0,0 +1,2656 @@ +#ifndef CUFFTDX_FFT_125_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_125_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<361, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1369>; +.reg .b32 r<14>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 1000, r13; +add.f32 f101, %62, %92; +add.f32 f103, %72, %82; +add.f32 f1368, %52, f101; +add.f32 f104, f103, f1368; +add.f32 f105, %102, %104; +add.f32 f107, %103, %83; +add.f32 f1364, %53, f105; 
+add.f32 f108, f107, f1364; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f1363, f101, 0f3E9E377A, %52; +sub.f32 f111, f1363, f110; +sub.f32 f112, %102, %104; +sub.f32 f114, %103, %83; +mul.f32 f1362, f112, 0f3F737871; +fma.rn.f32 f115, f114, 0f3F167918, f1362; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %52, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +mul.f32 f127, f107, 0f3F4F1BBD; +fma.rn.f32 f1361, f105, 0f3E9E377A, %53; +sub.f32 f128, f1361, f127; +sub.f32 f129, %62, %92; +sub.f32 f131, %72, %82; +mul.f32 f1360, f129, 0f3F737871; +fma.rn.f32 f132, f131, 0f3F167918, f1360; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %53, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %64, %94; +add.f32 f145, %74, %84; +add.f32 f1359, %54, f143; +add.f32 f146, f145, f1359; +add.f32 f147, %65, %95; +add.f32 f149, %107, %105; +add.f32 f1355, %106, f147; +add.f32 f150, f149, f1355; +fma.rn.f32 f1353, f143, 0f3E9E377A, %54; +mul.f32 f1354, f145, 0f3F4F1BBD; +sub.f32 f153, f1353, f1354; +sub.f32 f154, %65, %95; +sub.f32 f156, %107, %105; +mul.f32 f1352, f154, 0f3F737871; +fma.rn.f32 f157, f156, 0f3F167918, f1352; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %54, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +mul.f32 f169, f149, 0f3F4F1BBD; +fma.rn.f32 f1351, f147, 0f3E9E377A, %106; +sub.f32 f170, f1351, f169; +sub.f32 f171, %64, %94; +sub.f32 f173, %74, %84; 
+mul.f32 f1350, f171, 0f3F737871; +fma.rn.f32 f174, f173, 0f3F167918, f1350; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %106, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %66, %96; +add.f32 f187, %76, %86; +add.f32 f1349, %56, f185; +add.f32 f188, f187, f1349; +add.f32 f189, %110, %109; +add.f32 f191, %77, %111; +add.f32 f1344, %108, f189; +add.f32 f192, f191, f1344; +fma.rn.f32 f1342, f185, 0f3E9E377A, %56; +mul.f32 f1343, f187, 0f3F4F1BBD; +sub.f32 f195, f1342, f1343; +sub.f32 f196, %110, %109; +sub.f32 f198, %77, %111; +mul.f32 f1341, f196, 0f3F737871; +fma.rn.f32 f199, f198, 0f3F167918, f1341; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %56, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f1339, f189, 0f3E9E377A, %108; +mul.f32 f1340, f191, 0f3F4F1BBD; +sub.f32 f212, f1339, f1340; +sub.f32 f213, %66, %96; +sub.f32 f215, %76, %86; +mul.f32 f1338, f213, 0f3F737871; +fma.rn.f32 f216, f215, 0f3F167918, f1338; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %108, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %68, %98; +add.f32 f229, %78, %88; +add.f32 f1337, %58, f227; +add.f32 f230, f229, f1337; +add.f32 f231, %112, %114; +add.f32 f233, %113, %89; +add.f32 f1333, %59, f231; +add.f32 f234, f233, f1333; +mul.f32 f236, f229, 0f3F4F1BBD; +fma.rn.f32 f1332, f227, 0f3E9E377A, %58; +sub.f32 f237, f1332, f236; +sub.f32 f238, 
%112, %114; +sub.f32 f240, %113, %89; +mul.f32 f1331, f238, 0f3F737871; +fma.rn.f32 f241, f240, 0f3F167918, f1331; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %58, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +mul.f32 f253, f233, 0f3F4F1BBD; +fma.rn.f32 f1330, f231, 0f3E9E377A, %59; +sub.f32 f254, f1330, f253; +sub.f32 f255, %68, %98; +sub.f32 f257, %78, %88; +mul.f32 f1329, f255, 0f3F737871; +fma.rn.f32 f258, f257, 0f3F167918, f1329; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %59, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %70, %100; +add.f32 f271, %80, %90; +add.f32 f1328, %60, f269; +add.f32 f272, f271, f1328; +add.f32 f273, %71, %101; +add.f32 f275, %116, %117; +add.f32 f1324, %115, f273; +add.f32 f276, f275, f1324; +mul.f32 f278, f271, 0f3F4F1BBD; +fma.rn.f32 f1323, f269, 0f3E9E377A, %60; +sub.f32 f279, f1323, f278; +sub.f32 f280, %71, %101; +sub.f32 f282, %116, %117; +mul.f32 f1322, f280, 0f3F737871; +fma.rn.f32 f283, f282, 0f3F167918, f1322; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %60, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +mul.f32 f295, f275, 0f3F4F1BBD; +fma.rn.f32 f1321, f273, 0f3E9E377A, %115; +sub.f32 f296, f1321, f295; +sub.f32 f297, %70, %100; +sub.f32 f299, %80, %90; +mul.f32 f1320, f297, 0f3F737871; +fma.rn.f32 f300, f299, 0f3F167918, f1320; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, 
f273, 0f3F4F1BBD; +sub.f32 f304, %115, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mul.f32 f1318, f158, 0f3F77F511; +mul.f32 f1319, f175, 0f3E7EA890; +sub.f32 f313, f1318, f1319; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f1316, f200, 0f3F6055A2; +mul.f32 f1317, f217, 0f3EF6A86B; +sub.f32 f318, f1316, f1317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f322, f259, 0f3F2F3E7B; +mul.f32 f1315, f242, 0f3F3A9DB0; +sub.f32 f323, f1315, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f327, f301, 0f3F5825E0; +mul.f32 f1314, f284, 0f3F092BF2; +sub.f32 f328, f1314, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f332, f183, 0f3EF6A86B; +mul.f32 f1313, f166, 0f3F6055A2; +sub.f32 f333, f1313, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f337, f225, 0f3F5825E0; +mul.f32 f1312, f208, 0f3F092BF2; +sub.f32 f338, f1312, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f342, f267, 0f3F7F7EAE; +mul.f32 f1311, f250, 0f3D809851; +sub.f32 f343, f1311, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f1309, f292, 0fBED9FFBE; +mul.f32 f1310, f309, 0f3F67A2BF; +sub.f32 f348, f1309, f1310; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f1307, f167, 0f3F3A9DB0; +mul.f32 f1308, f184, 0f3F2F3E7B; +sub.f32 f353, f1307, f1308; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f1305, f209, 0f3D809851; +mul.f32 f1306, f226, 0f3F7F7EAE; +sub.f32 f358, f1305, f1306; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f1303, f251, 0fBF232E38; +mul.f32 f1304, f268, 
0f3F45405B; +sub.f32 f363, f1303, f1304; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f367, f310, 0f3E00575B; +mul.f32 f1302, f293, 0fBF7DFB3B; +sub.f32 f368, f1302, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f372, f176, 0f3F5825E0; +mul.f32 f1301, f159, 0f3F092BF2; +sub.f32 f373, f1301, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f377, f218, 0f3F67A2BF; +mul.f32 f1300, f201, 0fBED9FFBE; +sub.f32 f378, f1300, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f382, f260, 0f3E00575B; +mul.f32 f1299, f243, 0fBF7DFB3B; +sub.f32 f383, f1299, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f387, f302, 0fBF45405B; +mul.f32 f1298, f285, 0fBF232E38; +sub.f32 f388, f1298, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f393, f188, f230; +mul.f32 f398, f393, 0f3F4F1BBD; +fma.rn.f32 f1297, f391, 0f3E9E377A, f104; +sub.f32 f399, f1297, f398; +add.f32 f1296, f150, f276; +sub.f32 f400, f150, f276; +add.f32 f1295, f192, f234; +sub.f32 f402, f192, f234; +mul.f32 f1294, f400, 0f3F737871; +fma.rn.f32 f403, f402, 0f3F167918, f1294; +sub.f32 f404, f399, f403; +add.f32 f405, f403, f399; +add.f32 f1293, f104, f391; +mul.f32 f406, f391, 0f3F4F1BBD; +sub.f32 f407, f104, f406; +fma.rn.f32 f408, f393, 0f3E9E377A, f407; +mul.f32 f409, f400, 0f3F167918; +mul.f32 f410, f402, 0f3F737871; +sub.f32 f411, f409, f410; +sub.f32 f412, f408, f411; +add.f32 f413, f411, f408; +mul.f32 f415, f1295, 0f3F4F1BBD; +fma.rn.f32 f1292, f1296, 0f3E9E377A, f108; +sub.f32 f416, f1292, f415; +sub.f32 f417, f146, f272; +sub.f32 f419, f188, f230; +mul.f32 f1291, f417, 0f3F737871; +fma.rn.f32 f420, f419, 0f3F167918, f1291; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +add.f32 f1290, f108, f1296; +mul.f32 f423, f1296, 0f3F4F1BBD; 
+sub.f32 f424, f108, f423; +fma.rn.f32 f425, f1295, 0f3E9E377A, f424; +mul.f32 f426, f417, 0f3F167918; +mul.f32 f427, f419, 0f3F737871; +sub.f32 f428, f426, f427; +add.f32 f429, f428, f425; +sub.f32 f430, f425, f428; +add.f32 f431, f313, f328; +add.f32 f433, f318, f323; +add.f32 f1289, f116, f431; +add.f32 f434, f433, f1289; +add.f32 f435, f315, f330; +add.f32 f437, f320, f325; +add.f32 f1288, f133, f435; +add.f32 f438, f437, f1288; +fma.rn.f32 f1286, f431, 0f3E9E377A, f116; +mul.f32 f1287, f433, 0f3F4F1BBD; +sub.f32 f441, f1286, f1287; +sub.f32 f442, f315, f330; +sub.f32 f444, f320, f325; +mul.f32 f1285, f442, 0f3F737871; +fma.rn.f32 f445, f444, 0f3F167918, f1285; +sub.f32 f446, f441, f445; +add.f32 f447, f445, f441; +mul.f32 f448, f431, 0f3F4F1BBD; +sub.f32 f449, f116, f448; +fma.rn.f32 f450, f433, 0f3E9E377A, f449; +mul.f32 f451, f442, 0f3F167918; +mul.f32 f452, f444, 0f3F737871; +sub.f32 f453, f451, f452; +sub.f32 f454, f450, f453; +add.f32 f455, f453, f450; +mul.f32 f457, f437, 0f3F4F1BBD; +fma.rn.f32 f1284, f435, 0f3E9E377A, f133; +sub.f32 f458, f1284, f457; +sub.f32 f459, f313, f328; +sub.f32 f461, f318, f323; +mul.f32 f1283, f459, 0f3F737871; +fma.rn.f32 f462, f461, 0f3F167918, f1283; +add.f32 f463, f462, f458; +sub.f32 f464, f458, f462; +mul.f32 f465, f435, 0f3F4F1BBD; +sub.f32 f466, f133, f465; +fma.rn.f32 f467, f437, 0f3E9E377A, f466; +mul.f32 f468, f459, 0f3F167918; +mul.f32 f469, f461, 0f3F737871; +sub.f32 f470, f468, f469; +add.f32 f471, f470, f467; +sub.f32 f472, f467, f470; +add.f32 f473, f333, f348; +add.f32 f475, f338, f343; +add.f32 f1282, f124, f473; +add.f32 f476, f475, f1282; +add.f32 f477, f335, f350; +add.f32 f479, f340, f345; +add.f32 f1281, f141, f477; +add.f32 f480, f479, f1281; +fma.rn.f32 f1279, f473, 0f3E9E377A, f124; +mul.f32 f1280, f475, 0f3F4F1BBD; +sub.f32 f483, f1279, f1280; +sub.f32 f484, f335, f350; +sub.f32 f486, f340, f345; +mul.f32 f1278, f484, 0f3F737871; +fma.rn.f32 f487, f486, 0f3F167918, f1278; +sub.f32 f488, f483, f487; 
+add.f32 f489, f487, f483; +mul.f32 f490, f473, 0f3F4F1BBD; +sub.f32 f491, f124, f490; +fma.rn.f32 f492, f475, 0f3E9E377A, f491; +mul.f32 f493, f484, 0f3F167918; +mul.f32 f494, f486, 0f3F737871; +sub.f32 f495, f493, f494; +sub.f32 f496, f492, f495; +add.f32 f497, f495, f492; +fma.rn.f32 f1276, f477, 0f3E9E377A, f141; +mul.f32 f1277, f479, 0f3F4F1BBD; +sub.f32 f500, f1276, f1277; +sub.f32 f501, f333, f348; +sub.f32 f503, f338, f343; +mul.f32 f1275, f501, 0f3F737871; +fma.rn.f32 f504, f503, 0f3F167918, f1275; +add.f32 f505, f504, f500; +sub.f32 f506, f500, f504; +mul.f32 f507, f477, 0f3F4F1BBD; +sub.f32 f508, f141, f507; +fma.rn.f32 f509, f479, 0f3E9E377A, f508; +mul.f32 f510, f501, 0f3F167918; +mul.f32 f511, f503, 0f3F737871; +sub.f32 f512, f510, f511; +add.f32 f513, f512, f509; +sub.f32 f514, f509, f512; +add.f32 f515, f353, f368; +add.f32 f517, f358, f363; +add.f32 f1274, f125, f515; +add.f32 f518, f517, f1274; +add.f32 f519, f355, f370; +add.f32 f521, f360, f365; +add.f32 f1273, f142, f519; +add.f32 f522, f521, f1273; +mul.f32 f524, f517, 0f3F4F1BBD; +fma.rn.f32 f1272, f515, 0f3E9E377A, f125; +sub.f32 f525, f1272, f524; +sub.f32 f526, f355, f370; +sub.f32 f528, f360, f365; +mul.f32 f1271, f526, 0f3F737871; +fma.rn.f32 f529, f528, 0f3F167918, f1271; +sub.f32 f530, f525, f529; +add.f32 f531, f529, f525; +mul.f32 f532, f515, 0f3F4F1BBD; +sub.f32 f533, f125, f532; +fma.rn.f32 f534, f517, 0f3E9E377A, f533; +mul.f32 f535, f526, 0f3F167918; +mul.f32 f536, f528, 0f3F737871; +sub.f32 f537, f535, f536; +sub.f32 f538, f534, f537; +add.f32 f539, f537, f534; +mul.f32 f541, f521, 0f3F4F1BBD; +fma.rn.f32 f1270, f519, 0f3E9E377A, f142; +sub.f32 f542, f1270, f541; +sub.f32 f543, f353, f368; +sub.f32 f545, f358, f363; +mul.f32 f1269, f543, 0f3F737871; +fma.rn.f32 f546, f545, 0f3F167918, f1269; +add.f32 f547, f546, f542; +sub.f32 f548, f542, f546; +mul.f32 f549, f519, 0f3F4F1BBD; +sub.f32 f550, f142, f549; +fma.rn.f32 f551, f521, 0f3E9E377A, f550; +mul.f32 f552, f543, 0f3F167918; 
+mul.f32 f553, f545, 0f3F737871; +sub.f32 f554, f552, f553; +add.f32 f555, f554, f551; +sub.f32 f556, f551, f554; +add.f32 f557, f373, f388; +add.f32 f559, f378, f383; +add.f32 f1268, f117, f557; +add.f32 f560, f559, f1268; +add.f32 f561, f375, f390; +add.f32 f563, f380, f385; +add.f32 f1267, f134, f561; +add.f32 f564, f563, f1267; +mul.f32 f566, f559, 0f3F4F1BBD; +fma.rn.f32 f1266, f557, 0f3E9E377A, f117; +sub.f32 f567, f1266, f566; +sub.f32 f568, f375, f390; +sub.f32 f570, f380, f385; +mul.f32 f1265, f568, 0f3F737871; +fma.rn.f32 f571, f570, 0f3F167918, f1265; +sub.f32 f572, f567, f571; +add.f32 f573, f571, f567; +mul.f32 f574, f557, 0f3F4F1BBD; +sub.f32 f575, f117, f574; +fma.rn.f32 f576, f559, 0f3E9E377A, f575; +mul.f32 f577, f568, 0f3F167918; +mul.f32 f578, f570, 0f3F737871; +sub.f32 f579, f577, f578; +sub.f32 f580, f576, f579; +add.f32 f581, f579, f576; +mul.f32 f583, f563, 0f3F4F1BBD; +fma.rn.f32 f1264, f561, 0f3E9E377A, f134; +sub.f32 f584, f1264, f583; +sub.f32 f585, f373, f388; +sub.f32 f587, f378, f383; +mul.f32 f1263, f585, 0f3F737871; +fma.rn.f32 f588, f587, 0f3F167918, f1263; +add.f32 f589, f588, f584; +sub.f32 f590, f584, f588; +mul.f32 f591, f561, 0f3F4F1BBD; +sub.f32 f592, f134, f591; +fma.rn.f32 f593, f563, 0f3E9E377A, f592; +mul.f32 f594, f585, 0f3F167918; +mul.f32 f595, f587, 0f3F737871; +sub.f32 f596, f594, f595; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1000, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %51; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f599, f600}, [rd6]; +mul.f32 f603, f438, f600; +mul.f32 f605, f599, f438; +mul.f32 f607, f600, f600; +mul.f32 f1262, f599, f599; +sub.f32 f608, f1262, f607; +mul.f32 f609, f600, f599; +fma.rn.f32 f610, f600, f599, f609; +mul.f32 f611, f480, f610; +mul.f32 f613, f608, f480; +mul.f32 f615, f600, f610; +mul.f32 f1261, f599, 
f608; +sub.f32 f616, f1261, f615; +mul.f32 f1260, f476, f610; +mul.f32 f617, f599, f610; +fma.rn.f32 f618, f600, f608, f617; +mul.f32 f619, f522, f618; +mul.f32 f621, f616, f522; +mul.f32 f1258, f599, f616; +mul.f32 f1259, f600, f618; +sub.f32 f624, f1258, f1259; +mul.f32 f1257, f518, f618; +mul.f32 f625, f599, f618; +fma.rn.f32 f626, f600, f616, f625; +mul.f32 f627, f564, f626; +mul.f32 f629, f624, f564; +mul.f32 f631, f600, f626; +mul.f32 f1256, f599, f624; +sub.f32 f632, f1256, f631; +mul.f32 f1255, f560, f626; +mul.f32 f633, f599, f626; +fma.rn.f32 f634, f600, f624, f633; +mul.f32 f635, f421, f634; +mul.f32 f637, f632, f421; +mul.f32 f1253, f599, f632; +mul.f32 f1254, f600, f634; +sub.f32 f640, f1253, f1254; +mul.f32 f1252, f404, f634; +mul.f32 f641, f599, f634; +fma.rn.f32 f642, f600, f632, f641; +mul.f32 f643, f463, f642; +mul.f32 f645, f640, f463; +mul.f32 f647, f600, f642; +mul.f32 f1251, f599, f640; +sub.f32 f648, f1251, f647; +mul.f32 f1250, f446, f642; +mul.f32 f649, f599, f642; +fma.rn.f32 f650, f600, f640, f649; +mul.f32 f651, f505, f650; +mul.f32 f653, f648, f505; +mul.f32 f655, f600, f650; +mul.f32 f1249, f599, f648; +sub.f32 f656, f1249, f655; +mul.f32 f1248, f488, f650; +mul.f32 f657, f599, f650; +fma.rn.f32 f658, f600, f648, f657; +mul.f32 f659, f547, f658; +mul.f32 f661, f656, f547; +mul.f32 f1246, f599, f656; +mul.f32 f1247, f600, f658; +sub.f32 f664, f1246, f1247; +mul.f32 f1245, f530, f658; +mul.f32 f665, f599, f658; +fma.rn.f32 f666, f600, f656, f665; +mul.f32 f667, f589, f666; +mul.f32 f669, f664, f589; +mul.f32 f671, f600, f666; +mul.f32 f1244, f599, f664; +sub.f32 f672, f1244, f671; +mul.f32 f1243, f572, f666; +mul.f32 f673, f599, f666; +fma.rn.f32 f674, f600, f664, f673; +mul.f32 f675, f429, f674; +mul.f32 f677, f672, f429; +mul.f32 f679, f600, f674; +mul.f32 f1242, f599, f672; +sub.f32 f680, f1242, f679; +mul.f32 f1241, f412, f674; +mul.f32 f681, f599, f674; +fma.rn.f32 f682, f600, f672, f681; +mul.f32 f683, f471, f682; +mul.f32 f685, 
f680, f471; +mul.f32 f1239, f599, f680; +mul.f32 f1240, f600, f682; +sub.f32 f688, f1239, f1240; +mul.f32 f1238, f454, f682; +mul.f32 f689, f599, f682; +fma.rn.f32 f690, f600, f680, f689; +mul.f32 f691, f513, f690; +mul.f32 f693, f688, f513; +mul.f32 f695, f600, f690; +mul.f32 f1237, f599, f688; +sub.f32 f696, f1237, f695; +mul.f32 f1236, f496, f690; +mul.f32 f697, f599, f690; +fma.rn.f32 f698, f600, f688, f697; +mul.f32 f699, f555, f698; +mul.f32 f701, f696, f555; +mul.f32 f1234, f599, f696; +mul.f32 f1235, f600, f698; +sub.f32 f704, f1234, f1235; +mul.f32 f1233, f538, f698; +mul.f32 f705, f599, f698; +fma.rn.f32 f706, f600, f696, f705; +mul.f32 f707, f597, f706; +mul.f32 f709, f704, f597; +mul.f32 f711, f600, f706; +mul.f32 f1232, f599, f704; +sub.f32 f712, f1232, f711; +mul.f32 f1231, f580, f706; +mul.f32 f713, f599, f706; +fma.rn.f32 f714, f600, f704, f713; +mul.f32 f715, f430, f714; +mul.f32 f717, f712, f430; +mul.f32 f719, f600, f714; +mul.f32 f1230, f599, f712; +sub.f32 f720, f1230, f719; +mul.f32 f1229, f413, f714; +mul.f32 f721, f599, f714; +fma.rn.f32 f722, f600, f712, f721; +mul.f32 f723, f472, f722; +mul.f32 f725, f720, f472; +mul.f32 f1227, f599, f720; +mul.f32 f1228, f600, f722; +sub.f32 f728, f1227, f1228; +mul.f32 f1226, f455, f722; +mul.f32 f729, f599, f722; +fma.rn.f32 f730, f600, f720, f729; +mul.f32 f731, f514, f730; +mul.f32 f733, f728, f514; +mul.f32 f735, f600, f730; +mul.f32 f1225, f599, f728; +sub.f32 f736, f1225, f735; +mul.f32 f1224, f497, f730; +mul.f32 f737, f599, f730; +fma.rn.f32 f738, f600, f728, f737; +mul.f32 f739, f556, f738; +mul.f32 f741, f736, f556; +mul.f32 f743, f600, f738; +mul.f32 f1223, f599, f736; +sub.f32 f744, f1223, f743; +mul.f32 f1222, f539, f738; +mul.f32 f745, f599, f738; +fma.rn.f32 f746, f600, f736, f745; +mul.f32 f747, f598, f746; +mul.f32 f749, f744, f598; +mul.f32 f1220, f599, f744; +mul.f32 f1221, f600, f746; +sub.f32 f752, f1220, f1221; +mul.f32 f1219, f581, f746; +mul.f32 f753, f599, f746; +fma.rn.f32 f754, 
f600, f744, f753; +mul.f32 f755, f422, f754; +mul.f32 f757, f752, f422; +mul.f32 f759, f600, f754; +mul.f32 f1218, f599, f752; +sub.f32 f760, f1218, f759; +mul.f32 f1217, f405, f754; +mul.f32 f761, f599, f754; +fma.rn.f32 f762, f600, f752, f761; +mul.f32 f763, f464, f762; +mul.f32 f765, f760, f464; +mul.f32 f1215, f599, f760; +mul.f32 f1216, f600, f762; +sub.f32 f768, f1215, f1216; +mul.f32 f1214, f447, f762; +mul.f32 f769, f599, f762; +fma.rn.f32 f770, f600, f760, f769; +mul.f32 f771, f506, f770; +mul.f32 f773, f768, f506; +mul.f32 f775, f600, f770; +mul.f32 f1213, f599, f768; +sub.f32 f776, f1213, f775; +mul.f32 f1212, f489, f770; +mul.f32 f777, f599, f770; +fma.rn.f32 f778, f600, f768, f777; +mul.f32 f779, f548, f778; +mul.f32 f781, f776, f548; +mul.f32 f783, f600, f778; +mul.f32 f1211, f599, f776; +sub.f32 f784, f1211, f783; +mul.f32 f1210, f531, f778; +mul.f32 f785, f599, f778; +mul.f32 f1209, f434, f600; +fma.rn.f32 f786, f600, f776, f785; +mul.f32 f787, f590, f786; +mul.f32 f788, f573, f786; +mul.f32 f789, f784, f590; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +add.f32 f790, f1295, f1290; +add.f32 f791, f393, f1293; +st.shared.v2.f32 [r9], {f791, f790}; +fma.rn.f32 f792, f599, f434, f603; +sub.f32 f793, f605, f1209; +st.shared.v2.f32 [r9+8], {f792, f793}; +fma.rn.f32 f794, f608, f476, f611; +sub.f32 f795, f613, f1260; +st.shared.v2.f32 [r9+16], {f794, f795}; +fma.rn.f32 f796, f616, f518, f619; +sub.f32 f797, f621, f1257; +st.shared.v2.f32 [r9+24], {f796, f797}; +fma.rn.f32 f798, f624, f560, f627; +sub.f32 f799, f629, f1255; +st.shared.v2.f32 [r9+32], {f798, f799}; +sub.f32 f800, f637, f1252; +fma.rn.f32 f801, f632, f404, f635; +st.shared.v2.f32 [r9+40], {f801, f800}; +fma.rn.f32 f802, f640, f446, f643; +sub.f32 f803, f645, f1250; +st.shared.v2.f32 [r9+48], {f802, f803}; +sub.f32 f804, f653, f1248; +fma.rn.f32 f805, f648, f488, f651; +st.shared.v2.f32 [r9+56], {f805, f804}; +fma.rn.f32 f806, f656, f530, f659; +sub.f32 f807, f661, f1245; +st.shared.v2.f32 
[r9+64], {f806, f807}; +fma.rn.f32 f808, f664, f572, f667; +sub.f32 f809, f669, f1243; +st.shared.v2.f32 [r9+72], {f808, f809}; +fma.rn.f32 f810, f672, f412, f675; +sub.f32 f811, f677, f1241; +st.shared.v2.f32 [r9+80], {f810, f811}; +fma.rn.f32 f812, f680, f454, f683; +sub.f32 f813, f685, f1238; +st.shared.v2.f32 [r9+88], {f812, f813}; +fma.rn.f32 f814, f688, f496, f691; +sub.f32 f815, f693, f1236; +st.shared.v2.f32 [r9+96], {f814, f815}; +fma.rn.f32 f816, f696, f538, f699; +sub.f32 f817, f701, f1233; +st.shared.v2.f32 [r9+104], {f816, f817}; +fma.rn.f32 f818, f704, f580, f707; +sub.f32 f819, f709, f1231; +st.shared.v2.f32 [r9+112], {f818, f819}; +fma.rn.f32 f820, f712, f413, f715; +sub.f32 f821, f717, f1229; +st.shared.v2.f32 [r9+120], {f820, f821}; +fma.rn.f32 f822, f720, f455, f723; +sub.f32 f823, f725, f1226; +st.shared.v2.f32 [r9+128], {f822, f823}; +fma.rn.f32 f824, f728, f497, f731; +sub.f32 f825, f733, f1224; +st.shared.v2.f32 [r9+136], {f824, f825}; +fma.rn.f32 f826, f736, f539, f739; +sub.f32 f827, f741, f1222; +st.shared.v2.f32 [r9+144], {f826, f827}; +fma.rn.f32 f828, f744, f581, f747; +sub.f32 f829, f749, f1219; +st.shared.v2.f32 [r9+152], {f828, f829}; +fma.rn.f32 f830, f752, f405, f755; +sub.f32 f831, f757, f1217; +st.shared.v2.f32 [r9+160], {f830, f831}; +fma.rn.f32 f832, f760, f447, f763; +sub.f32 f833, f765, f1214; +st.shared.v2.f32 [r9+168], {f832, f833}; +fma.rn.f32 f834, f768, f489, f771; +sub.f32 f835, f773, f1212; +st.shared.v2.f32 [r9+176], {f834, f835}; +fma.rn.f32 f836, f776, f531, f779; +sub.f32 f837, f781, f1210; +st.shared.v2.f32 [r9+184], {f836, f837}; +fma.rn.f32 f838, f784, f573, f787; +sub.f32 f839, f789, f788; +st.shared.v2.f32 [r9+192], {f838, f839}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.v2.f32 {f840, f841}, [r10]; +ld.shared.v2.f32 {f844, f845}, [r10+40]; +ld.shared.v2.f32 {f848, f849}, [r10+80]; +ld.shared.v2.f32 {f852, f853}, [r10+120]; +ld.shared.v2.f32 {f856, f857}, [r10+160]; +ld.shared.v2.f32 {f860, 
f861}, [r10+200]; +ld.shared.v2.f32 {f864, f865}, [r10+240]; +ld.shared.v2.f32 {f868, f869}, [r10+280]; +ld.shared.v2.f32 {f872, f873}, [r10+320]; +ld.shared.v2.f32 {f876, f877}, [r10+360]; +ld.shared.v2.f32 {f880, f881}, [r10+400]; +ld.shared.v2.f32 {f884, f885}, [r10+440]; +ld.shared.v2.f32 {f888, f889}, [r10+480]; +ld.shared.v2.f32 {f892, f893}, [r10+520]; +ld.shared.v2.f32 {f896, f897}, [r10+560]; +ld.shared.v2.f32 {f900, f901}, [r10+600]; +ld.shared.v2.f32 {f904, f905}, [r10+640]; +ld.shared.v2.f32 {f908, f909}, [r10+680]; +ld.shared.v2.f32 {f912, f913}, [r10+720]; +ld.shared.v2.f32 {f916, f917}, [r10+760]; +ld.shared.v2.f32 {f920, f921}, [r10+800]; +ld.shared.v2.f32 {f924, f925}, [r10+840]; +ld.shared.v2.f32 {f928, f929}, [r10+880]; +ld.shared.v2.f32 {f932, f933}, [r10+920]; +ld.shared.v2.f32 {f936, f937}, [r10+960]; +add.f32 f940, f860, f920; +add.f32 f942, f880, f900; +mul.f32 f947, f942, 0f3F4F1BBD; +fma.rn.f32 f1208, f940, 0f3E9E377A, f840; +sub.f32 f948, f1208, f947; +add.f32 f1207, f861, f921; +sub.f32 f949, f861, f921; +add.f32 f1206, f881, f901; +sub.f32 f951, f881, f901; +mul.f32 f1205, f949, 0f3F737871; +fma.rn.f32 f952, f951, 0f3F167918, f1205; +add.f32 f1204, f840, f940; +mul.f32 f953, f940, 0f3F4F1BBD; +sub.f32 f954, f840, f953; +fma.rn.f32 f955, f942, 0f3E9E377A, f954; +mul.f32 f956, f949, 0f3F167918; +mul.f32 f957, f951, 0f3F737871; +sub.f32 f958, f956, f957; +fma.rn.f32 f1202, f1207, 0f3E9E377A, f841; +mul.f32 f1203, f1206, 0f3F4F1BBD; +sub.f32 f961, f1202, f1203; +sub.f32 f962, f860, f920; +sub.f32 f964, f880, f900; +mul.f32 f1201, f962, 0f3F737871; +fma.rn.f32 f965, f964, 0f3F167918, f1201; +add.f32 f1200, f841, f1207; +mul.f32 f966, f1207, 0f3F4F1BBD; +sub.f32 f967, f841, f966; +fma.rn.f32 f968, f1206, 0f3E9E377A, f967; +mul.f32 f969, f962, 0f3F167918; +mul.f32 f970, f964, 0f3F737871; +sub.f32 f971, f969, f970; +add.f32 f972, f864, f924; +add.f32 f974, f884, f904; +fma.rn.f32 f1198, f972, 0f3E9E377A, f844; +mul.f32 f1199, f974, 0f3F4F1BBD; 
+sub.f32 f980, f1198, f1199; +add.f32 f1197, f865, f925; +sub.f32 f981, f865, f925; +add.f32 f1196, f885, f905; +sub.f32 f983, f885, f905; +mul.f32 f1195, f981, 0f3F737871; +fma.rn.f32 f984, f983, 0f3F167918, f1195; +add.f32 f1194, f844, f972; +mul.f32 f985, f972, 0f3F4F1BBD; +sub.f32 f986, f844, f985; +fma.rn.f32 f987, f974, 0f3E9E377A, f986; +mul.f32 f988, f981, 0f3F167918; +mul.f32 f989, f983, 0f3F737871; +sub.f32 f990, f988, f989; +mul.f32 f992, f1196, 0f3F4F1BBD; +fma.rn.f32 f1193, f1197, 0f3E9E377A, f845; +sub.f32 f993, f1193, f992; +sub.f32 f994, f864, f924; +sub.f32 f996, f884, f904; +mul.f32 f1192, f994, 0f3F737871; +fma.rn.f32 f997, f996, 0f3F167918, f1192; +add.f32 f1191, f845, f1197; +mul.f32 f998, f1197, 0f3F4F1BBD; +sub.f32 f999, f845, f998; +fma.rn.f32 f1000, f1196, 0f3E9E377A, f999; +mul.f32 f1001, f994, 0f3F167918; +mul.f32 f1002, f996, 0f3F737871; +sub.f32 f1003, f1001, f1002; +add.f32 f1004, f868, f928; +add.f32 f1006, f888, f908; +mul.f32 f1011, f1006, 0f3F4F1BBD; +fma.rn.f32 f1190, f1004, 0f3E9E377A, f848; +sub.f32 f1012, f1190, f1011; +add.f32 f1189, f869, f929; +sub.f32 f1013, f869, f929; +add.f32 f1188, f889, f909; +sub.f32 f1015, f889, f909; +mul.f32 f1187, f1013, 0f3F737871; +fma.rn.f32 f1016, f1015, 0f3F167918, f1187; +add.f32 f1186, f848, f1004; +mul.f32 f1017, f1004, 0f3F4F1BBD; +sub.f32 f1018, f848, f1017; +fma.rn.f32 f1019, f1006, 0f3E9E377A, f1018; +mul.f32 f1020, f1013, 0f3F167918; +mul.f32 f1021, f1015, 0f3F737871; +sub.f32 f1022, f1020, f1021; +fma.rn.f32 f1184, f1189, 0f3E9E377A, f849; +mul.f32 f1185, f1188, 0f3F4F1BBD; +sub.f32 f1025, f1184, f1185; +sub.f32 f1026, f868, f928; +sub.f32 f1028, f888, f908; +mul.f32 f1183, f1026, 0f3F737871; +fma.rn.f32 f1029, f1028, 0f3F167918, f1183; +add.f32 f1182, f849, f1189; +mul.f32 f1030, f1189, 0f3F4F1BBD; +sub.f32 f1031, f849, f1030; +fma.rn.f32 f1032, f1188, 0f3E9E377A, f1031; +mul.f32 f1033, f1026, 0f3F167918; +mul.f32 f1034, f1028, 0f3F737871; +sub.f32 f1035, f1033, f1034; +add.f32 
f1036, f872, f932; +add.f32 f1038, f892, f912; +fma.rn.f32 f1180, f1036, 0f3E9E377A, f852; +mul.f32 f1181, f1038, 0f3F4F1BBD; +sub.f32 f1044, f1180, f1181; +add.f32 f1179, f873, f933; +sub.f32 f1045, f873, f933; +add.f32 f1178, f893, f913; +sub.f32 f1047, f893, f913; +mul.f32 f1177, f1045, 0f3F737871; +fma.rn.f32 f1048, f1047, 0f3F167918, f1177; +add.f32 f1176, f852, f1036; +mul.f32 f1049, f1036, 0f3F4F1BBD; +sub.f32 f1050, f852, f1049; +fma.rn.f32 f1051, f1038, 0f3E9E377A, f1050; +mul.f32 f1052, f1045, 0f3F167918; +mul.f32 f1053, f1047, 0f3F737871; +sub.f32 f1054, f1052, f1053; +mul.f32 f1056, f1178, 0f3F4F1BBD; +fma.rn.f32 f1175, f1179, 0f3E9E377A, f853; +sub.f32 f1057, f1175, f1056; +sub.f32 f1058, f872, f932; +sub.f32 f1060, f892, f912; +mul.f32 f1174, f1058, 0f3F737871; +fma.rn.f32 f1061, f1060, 0f3F167918, f1174; +add.f32 f1173, f853, f1179; +mul.f32 f1062, f1179, 0f3F4F1BBD; +sub.f32 f1063, f853, f1062; +fma.rn.f32 f1064, f1178, 0f3E9E377A, f1063; +mul.f32 f1065, f1058, 0f3F167918; +mul.f32 f1066, f1060, 0f3F737871; +sub.f32 f1067, f1065, f1066; +add.f32 f1068, f876, f936; +add.f32 f1070, f896, f916; +mul.f32 f1075, f1070, 0f3F4F1BBD; +fma.rn.f32 f1172, f1068, 0f3E9E377A, f856; +sub.f32 f1076, f1172, f1075; +add.f32 f1171, f877, f937; +sub.f32 f1077, f877, f937; +add.f32 f1170, f897, f917; +sub.f32 f1079, f897, f917; +mul.f32 f1169, f1077, 0f3F737871; +fma.rn.f32 f1080, f1079, 0f3F167918, f1169; +add.f32 f1168, f856, f1068; +mul.f32 f1081, f1068, 0f3F4F1BBD; +sub.f32 f1082, f856, f1081; +fma.rn.f32 f1083, f1070, 0f3E9E377A, f1082; +mul.f32 f1084, f1077, 0f3F167918; +mul.f32 f1085, f1079, 0f3F737871; +sub.f32 f1086, f1084, f1085; +fma.rn.f32 f1166, f1171, 0f3E9E377A, f857; +mul.f32 f1167, f1170, 0f3F4F1BBD; +sub.f32 f1089, f1166, f1167; +sub.f32 f1090, f876, f936; +sub.f32 f1092, f896, f916; +mul.f32 f1165, f1090, 0f3F737871; +fma.rn.f32 f1093, f1092, 0f3F167918, f1165; +add.f32 f1164, f857, f1171; +mul.f32 f1094, f1171, 0f3F4F1BBD; +sub.f32 f1095, f857, 
f1094; +fma.rn.f32 f1096, f1170, 0f3E9E377A, f1095; +mul.f32 f1097, f1090, 0f3F167918; +mul.f32 f1098, f1092, 0f3F737871; +sub.f32 f1099, f1097, f1098; +add.f32 %1, f1206, f1200; +add.f32 %0, f942, f1204; +add.f32 %3, f1196, f1191; +add.f32 %2, f974, f1194; +add.f32 %5, f1188, f1182; +add.f32 %4, f1006, f1186; +add.f32 %7, f1178, f1173; +add.f32 %6, f1038, f1176; +add.f32 %9, f1170, f1164; +add.f32 %8, f1070, f1168; +add.f32 %11, f965, f961; +sub.f32 %10, f948, f952; +sub.f32 %12, f980, f984; +add.f32 %13, f997, f993; +sub.f32 %14, f1012, f1016; +add.f32 %15, f1029, f1025; +sub.f32 %16, f1044, f1048; +add.f32 %17, f1061, f1057; +add.f32 %19, f1093, f1089; +sub.f32 %18, f1076, f1080; +add.f32 %21, f971, f968; +sub.f32 %20, f955, f958; +add.f32 %23, f1003, f1000; +sub.f32 %22, f987, f990; +sub.f32 %24, f1019, f1022; +add.f32 %25, f1035, f1032; +sub.f32 %26, f1051, f1054; +add.f32 %27, f1067, f1064; +sub.f32 %28, f1083, f1086; +add.f32 %29, f1099, f1096; +sub.f32 %31, f968, f971; +add.f32 %30, f958, f955; +sub.f32 %33, f1000, f1003; +add.f32 %32, f990, f987; +sub.f32 %35, f1032, f1035; +add.f32 %34, f1022, f1019; +sub.f32 %37, f1064, f1067; +add.f32 %36, f1054, f1051; +sub.f32 %39, f1096, f1099; +add.f32 %38, f1086, f1083; +sub.f32 %41, f961, f965; +add.f32 %40, f952, f948; +sub.f32 %43, f993, f997; +add.f32 %42, f984, f980; +sub.f32 %45, f1025, f1029; +add.f32 %44, f1016, f1012; +sub.f32 %47, f1057, f1061; +add.f32 %46, f1048, f1044; +sub.f32 %49, f1089, f1093; +add.f32 %48, f1080, f1076; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), 
"=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[23].y), "f"(rmem[4].y), "f"(rmem[14].y), "f"(rmem[19].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<360, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1100>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 500, r2; +add.f32 f101, %65, %105; +add.f32 f102, %52, f101; +add.f32 f103, %78, %92; +add.f32 f104, f103, f102; +add.f32 f105, %67, %107; +add.f32 f106, %53, f105; +add.f32 
f107, %80, %93; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %52; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %67, %107; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %80, %93; +fma.rn.f32 f115, f114, 0f3F167918, f113; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %52, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +fma.rn.f32 f126, f105, 0f3E9E377A, %53; +mul.f32 f127, f107, 0f3F4F1BBD; +sub.f32 f128, f126, f127; +sub.f32 f129, %65, %105; +mul.f32 f130, f129, 0f3F737871; +sub.f32 f131, %78, %92; +fma.rn.f32 f132, f131, 0f3F167918, f130; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %53, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %68, %108; +add.f32 f144, %54, f143; +add.f32 f145, %81, %94; +add.f32 f146, f145, f144; +add.f32 f147, %69, %109; +add.f32 f148, %56, f147; +add.f32 f149, %83, %96; +add.f32 f150, f149, f148; +fma.rn.f32 f151, f143, 0f3E9E377A, %54; +mul.f32 f152, f145, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %69, %109; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %83, %96; +fma.rn.f32 f157, f156, 0f3F167918, f155; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %54, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +fma.rn.f32 f168, f147, 0f3E9E377A, %56; +mul.f32 f169, f149, 0f3F4F1BBD; +sub.f32 f170, f168, f169; +sub.f32 f171, %68, %108; +mul.f32 f172, f171, 0f3F737871; 
+sub.f32 f173, %81, %94; +fma.rn.f32 f174, f173, 0f3F167918, f172; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %56, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %70, %110; +add.f32 f186, %57, f185; +add.f32 f187, %84, %97; +add.f32 f188, f187, f186; +add.f32 f189, %72, %112; +add.f32 f190, %59, f189; +add.f32 f191, %85, %99; +add.f32 f192, f191, f190; +fma.rn.f32 f193, f185, 0f3E9E377A, %57; +mul.f32 f194, f187, 0f3F4F1BBD; +sub.f32 f195, f193, f194; +sub.f32 f196, %72, %112; +mul.f32 f197, f196, 0f3F737871; +sub.f32 f198, %85, %99; +fma.rn.f32 f199, f198, 0f3F167918, f197; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %57, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f210, f189, 0f3E9E377A, %59; +mul.f32 f211, f191, 0f3F4F1BBD; +sub.f32 f212, f210, f211; +sub.f32 f213, %70, %110; +mul.f32 f214, f213, 0f3F737871; +sub.f32 f215, %84, %97; +fma.rn.f32 f216, f215, 0f3F167918, f214; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %59, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %73, %113; +add.f32 f228, %60, f227; +add.f32 f229, %86, %100; +add.f32 f230, f229, f228; +add.f32 f231, %75, %115; +add.f32 f232, %61, f231; +add.f32 f233, %88, %101; +add.f32 f234, f233, f232; +fma.rn.f32 f235, f227, 0f3E9E377A, %60; +mul.f32 f236, f229, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %75, %115; +mul.f32 f239, f238, 
0f3F737871; +sub.f32 f240, %88, %101; +fma.rn.f32 f241, f240, 0f3F167918, f239; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %60, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +fma.rn.f32 f252, f231, 0f3E9E377A, %61; +mul.f32 f253, f233, 0f3F4F1BBD; +sub.f32 f254, f252, f253; +sub.f32 f255, %73, %113; +mul.f32 f256, f255, 0f3F737871; +sub.f32 f257, %86, %100; +fma.rn.f32 f258, f257, 0f3F167918, f256; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %61, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %76, %116; +add.f32 f270, %62, f269; +add.f32 f271, %89, %102; +add.f32 f272, f271, f270; +add.f32 f273, %77, %117; +add.f32 f274, %64, f273; +add.f32 f275, %91, %104; +add.f32 f276, f275, f274; +fma.rn.f32 f277, f269, 0f3E9E377A, %62; +mul.f32 f278, f271, 0f3F4F1BBD; +sub.f32 f279, f277, f278; +sub.f32 f280, %77, %117; +mul.f32 f281, f280, 0f3F737871; +sub.f32 f282, %91, %104; +fma.rn.f32 f283, f282, 0f3F167918, f281; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %62, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +fma.rn.f32 f294, f273, 0f3E9E377A, %64; +mul.f32 f295, f275, 0f3F4F1BBD; +sub.f32 f296, f294, f295; +sub.f32 f297, %76, %116; +mul.f32 f298, f297, 0f3F737871; +sub.f32 f299, %89, %102; +fma.rn.f32 f300, f299, 0f3F167918, f298; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %64, f303; 
+fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mov.u32 r4, %tid.x; +mul.f32 f311, f158, 0f3F77F511; +mul.f32 f312, f175, 0f3E7EA890; +sub.f32 f313, f311, f312; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f316, f200, 0f3F6055A2; +mul.f32 f317, f217, 0f3EF6A86B; +sub.f32 f318, f316, f317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f321, f242, 0f3F3A9DB0; +mul.f32 f322, f259, 0f3F2F3E7B; +sub.f32 f323, f321, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f326, f284, 0f3F092BF2; +mul.f32 f327, f301, 0f3F5825E0; +sub.f32 f328, f326, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f331, f166, 0f3F6055A2; +mul.f32 f332, f183, 0f3EF6A86B; +sub.f32 f333, f331, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f336, f208, 0f3F092BF2; +mul.f32 f337, f225, 0f3F5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f341, f250, 0f3D809851; +mul.f32 f342, f267, 0f3F7F7EAE; +sub.f32 f343, f341, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f346, f292, 0fBED9FFBE; +mul.f32 f347, f309, 0f3F67A2BF; +sub.f32 f348, f346, f347; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f351, f167, 0f3F3A9DB0; +mul.f32 f352, f184, 0f3F2F3E7B; +sub.f32 f353, f351, f352; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f356, f209, 0f3D809851; +mul.f32 f357, f226, 0f3F7F7EAE; +sub.f32 f358, f356, f357; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f361, f251, 0fBF232E38; +mul.f32 f362, f268, 0f3F45405B; +sub.f32 f363, f361, f362; +mul.f32 f364, 
f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f366, f293, 0fBF7DFB3B; +mul.f32 f367, f310, 0f3E00575B; +sub.f32 f368, f366, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f371, f159, 0f3F092BF2; +mul.f32 f372, f176, 0f3F5825E0; +sub.f32 f373, f371, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f376, f201, 0fBED9FFBE; +mul.f32 f377, f218, 0f3F67A2BF; +sub.f32 f378, f376, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f381, f243, 0fBF7DFB3B; +mul.f32 f382, f260, 0f3E00575B; +sub.f32 f383, f381, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f386, f285, 0fBF232E38; +mul.f32 f387, f302, 0fBF45405B; +sub.f32 f388, f386, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f392, f104, f391; +add.f32 f393, f188, f230; +add.f32 f394, f393, f392; +add.f32 f395, f150, f276; +add.f32 f396, f108, f395; +add.f32 f397, f192, f234; +add.f32 f398, f397, f396; +fma.rn.f32 f399, f391, 0f3E9E377A, f104; +mul.f32 f400, f393, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, f150, f276; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, f192, f234; +fma.rn.f32 f405, f404, 0f3F167918, f403; +sub.f32 f406, f401, f405; +add.f32 f407, f405, f401; +mul.f32 f408, f391, 0f3F4F1BBD; +sub.f32 f409, f104, f408; +fma.rn.f32 f410, f393, 0f3E9E377A, f409; +mul.f32 f411, f402, 0f3F167918; +mul.f32 f412, f404, 0f3F737871; +sub.f32 f413, f411, f412; +sub.f32 f414, f410, f413; +add.f32 f415, f413, f410; +fma.rn.f32 f416, f395, 0f3E9E377A, f108; +mul.f32 f417, f397, 0f3F4F1BBD; +sub.f32 f418, f416, f417; +sub.f32 f419, f146, f272; +mul.f32 f420, f419, 0f3F737871; +sub.f32 f421, f188, f230; +fma.rn.f32 f422, f421, 0f3F167918, f420; +add.f32 f423, f422, f418; +sub.f32 f424, f418, f422; +mul.f32 f425, f395, 0f3F4F1BBD; +sub.f32 f426, f108, f425; 
+fma.rn.f32 f427, f397, 0f3E9E377A, f426; +mul.f32 f428, f419, 0f3F167918; +mul.f32 f429, f421, 0f3F737871; +sub.f32 f430, f428, f429; +add.f32 f431, f430, f427; +sub.f32 f432, f427, f430; +add.f32 f433, f313, f328; +add.f32 f434, f116, f433; +add.f32 f435, f318, f323; +add.f32 f436, f435, f434; +add.f32 f437, f315, f330; +add.f32 f438, f133, f437; +add.f32 f439, f320, f325; +add.f32 f440, f439, f438; +fma.rn.f32 f441, f433, 0f3E9E377A, f116; +mul.f32 f442, f435, 0f3F4F1BBD; +sub.f32 f443, f441, f442; +sub.f32 f444, f315, f330; +mul.f32 f445, f444, 0f3F737871; +sub.f32 f446, f320, f325; +fma.rn.f32 f447, f446, 0f3F167918, f445; +sub.f32 f448, f443, f447; +add.f32 f449, f447, f443; +mul.f32 f450, f433, 0f3F4F1BBD; +sub.f32 f451, f116, f450; +fma.rn.f32 f452, f435, 0f3E9E377A, f451; +mul.f32 f453, f444, 0f3F167918; +mul.f32 f454, f446, 0f3F737871; +sub.f32 f455, f453, f454; +sub.f32 f456, f452, f455; +add.f32 f457, f455, f452; +fma.rn.f32 f458, f437, 0f3E9E377A, f133; +mul.f32 f459, f439, 0f3F4F1BBD; +sub.f32 f460, f458, f459; +sub.f32 f461, f313, f328; +mul.f32 f462, f461, 0f3F737871; +sub.f32 f463, f318, f323; +fma.rn.f32 f464, f463, 0f3F167918, f462; +add.f32 f465, f464, f460; +sub.f32 f466, f460, f464; +mul.f32 f467, f437, 0f3F4F1BBD; +sub.f32 f468, f133, f467; +fma.rn.f32 f469, f439, 0f3E9E377A, f468; +mul.f32 f470, f461, 0f3F167918; +mul.f32 f471, f463, 0f3F737871; +sub.f32 f472, f470, f471; +add.f32 f473, f472, f469; +sub.f32 f474, f469, f472; +add.f32 f475, f333, f348; +add.f32 f476, f124, f475; +add.f32 f477, f338, f343; +add.f32 f478, f477, f476; +add.f32 f479, f335, f350; +add.f32 f480, f141, f479; +add.f32 f481, f340, f345; +add.f32 f482, f481, f480; +fma.rn.f32 f483, f475, 0f3E9E377A, f124; +mul.f32 f484, f477, 0f3F4F1BBD; +sub.f32 f485, f483, f484; +sub.f32 f486, f335, f350; +mul.f32 f487, f486, 0f3F737871; +sub.f32 f488, f340, f345; +fma.rn.f32 f489, f488, 0f3F167918, f487; +sub.f32 f490, f485, f489; +add.f32 f491, f489, f485; +mul.f32 f492, f475, 
0f3F4F1BBD; +sub.f32 f493, f124, f492; +fma.rn.f32 f494, f477, 0f3E9E377A, f493; +mul.f32 f495, f486, 0f3F167918; +mul.f32 f496, f488, 0f3F737871; +sub.f32 f497, f495, f496; +sub.f32 f498, f494, f497; +add.f32 f499, f497, f494; +fma.rn.f32 f500, f479, 0f3E9E377A, f141; +mul.f32 f501, f481, 0f3F4F1BBD; +sub.f32 f502, f500, f501; +sub.f32 f503, f333, f348; +mul.f32 f504, f503, 0f3F737871; +sub.f32 f505, f338, f343; +fma.rn.f32 f506, f505, 0f3F167918, f504; +add.f32 f507, f506, f502; +sub.f32 f508, f502, f506; +mul.f32 f509, f479, 0f3F4F1BBD; +sub.f32 f510, f141, f509; +fma.rn.f32 f511, f481, 0f3E9E377A, f510; +mul.f32 f512, f503, 0f3F167918; +mul.f32 f513, f505, 0f3F737871; +sub.f32 f514, f512, f513; +add.f32 f515, f514, f511; +sub.f32 f516, f511, f514; +add.f32 f517, f353, f368; +add.f32 f518, f125, f517; +add.f32 f519, f358, f363; +add.f32 f520, f519, f518; +add.f32 f521, f355, f370; +add.f32 f522, f142, f521; +add.f32 f523, f360, f365; +add.f32 f524, f523, f522; +fma.rn.f32 f525, f517, 0f3E9E377A, f125; +mul.f32 f526, f519, 0f3F4F1BBD; +sub.f32 f527, f525, f526; +sub.f32 f528, f355, f370; +mul.f32 f529, f528, 0f3F737871; +sub.f32 f530, f360, f365; +fma.rn.f32 f531, f530, 0f3F167918, f529; +sub.f32 f532, f527, f531; +add.f32 f533, f531, f527; +mul.f32 f534, f517, 0f3F4F1BBD; +sub.f32 f535, f125, f534; +fma.rn.f32 f536, f519, 0f3E9E377A, f535; +mul.f32 f537, f528, 0f3F167918; +mul.f32 f538, f530, 0f3F737871; +sub.f32 f539, f537, f538; +sub.f32 f540, f536, f539; +add.f32 f541, f539, f536; +fma.rn.f32 f542, f521, 0f3E9E377A, f142; +mul.f32 f543, f523, 0f3F4F1BBD; +sub.f32 f544, f542, f543; +sub.f32 f545, f353, f368; +mul.f32 f546, f545, 0f3F737871; +sub.f32 f547, f358, f363; +fma.rn.f32 f548, f547, 0f3F167918, f546; +add.f32 f549, f548, f544; +sub.f32 f550, f544, f548; +mul.f32 f551, f521, 0f3F4F1BBD; +sub.f32 f552, f142, f551; +fma.rn.f32 f553, f523, 0f3E9E377A, f552; +mul.f32 f554, f545, 0f3F167918; +mul.f32 f555, f547, 0f3F737871; +sub.f32 f556, f554, f555; 
+add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f373, f388; +add.f32 f560, f117, f559; +add.f32 f561, f378, f383; +add.f32 f562, f561, f560; +add.f32 f563, f375, f390; +add.f32 f564, f134, f563; +add.f32 f565, f380, f385; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f117; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f375, f390; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f380, f385; +fma.rn.f32 f573, f572, 0f3F167918, f571; +sub.f32 f574, f569, f573; +add.f32 f575, f573, f569; +mul.f32 f576, f559, 0f3F4F1BBD; +sub.f32 f577, f117, f576; +fma.rn.f32 f578, f561, 0f3E9E377A, f577; +mul.f32 f579, f570, 0f3F167918; +mul.f32 f580, f572, 0f3F737871; +sub.f32 f581, f579, f580; +sub.f32 f582, f578, f581; +add.f32 f583, f581, f578; +fma.rn.f32 f584, f563, 0f3E9E377A, f134; +mul.f32 f585, f565, 0f3F4F1BBD; +sub.f32 f586, f584, f585; +sub.f32 f587, f373, f388; +mul.f32 f588, f587, 0f3F737871; +sub.f32 f589, f378, f383; +fma.rn.f32 f590, f589, 0f3F167918, f588; +add.f32 f591, f590, f586; +sub.f32 f592, f586, f590; +mul.f32 f593, f563, 0f3F4F1BBD; +sub.f32 f594, f134, f593; +fma.rn.f32 f595, f565, 0f3E9E377A, f594; +mul.f32 f596, f587, 0f3F167918; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +add.f32 f599, f598, f595; +sub.f32 f600, f595, f598; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f601, f602}, [rd6]; +mul.f32 f605, f440, f602; +fma.rn.f32 f606, f601, f436, f605; +mul.f32 f607, f436, f602; +mul.f32 f608, f601, f440; +sub.f32 f609, f608, f607; +mul.f32 f610, f601, f601; +mul.f32 f611, f602, f602; +sub.f32 f612, f610, f611; +mul.f32 f613, f602, f601; +fma.rn.f32 f614, f602, f601, f613; +mul.f32 f615, f482, f614; +fma.rn.f32 f616, f612, f478, f615; +mul.f32 f617, f478, f614; +mul.f32 f618, f612, f482; +sub.f32 f619, f618, 
f617; +mul.f32 f620, f601, f612; +mul.f32 f621, f602, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f601, f614; +fma.rn.f32 f624, f602, f612, f623; +mul.f32 f625, f524, f624; +fma.rn.f32 f626, f622, f520, f625; +mul.f32 f627, f520, f624; +mul.f32 f628, f622, f524; +sub.f32 f629, f628, f627; +mul.f32 f630, f601, f622; +mul.f32 f631, f602, f624; +sub.f32 f632, f630, f631; +mul.f32 f633, f601, f624; +fma.rn.f32 f634, f602, f622, f633; +mul.f32 f635, f566, f634; +fma.rn.f32 f636, f632, f562, f635; +mul.f32 f637, f562, f634; +mul.f32 f638, f632, f566; +sub.f32 f639, f638, f637; +mul.f32 f640, f601, f632; +mul.f32 f641, f602, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f601, f634; +fma.rn.f32 f644, f602, f632, f643; +mul.f32 f645, f423, f644; +fma.rn.f32 f646, f642, f406, f645; +mul.f32 f647, f406, f644; +mul.f32 f648, f642, f423; +sub.f32 f649, f648, f647; +mul.f32 f650, f601, f642; +mul.f32 f651, f602, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f601, f644; +fma.rn.f32 f654, f602, f642, f653; +mul.f32 f655, f465, f654; +fma.rn.f32 f656, f652, f448, f655; +mul.f32 f657, f448, f654; +mul.f32 f658, f652, f465; +sub.f32 f659, f658, f657; +mul.f32 f660, f601, f652; +mul.f32 f661, f602, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f601, f654; +fma.rn.f32 f664, f602, f652, f663; +mul.f32 f665, f507, f664; +fma.rn.f32 f666, f662, f490, f665; +mul.f32 f667, f490, f664; +mul.f32 f668, f662, f507; +sub.f32 f669, f668, f667; +mul.f32 f670, f601, f662; +mul.f32 f671, f602, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f601, f664; +fma.rn.f32 f674, f602, f662, f673; +mul.f32 f675, f549, f674; +fma.rn.f32 f676, f672, f532, f675; +mul.f32 f677, f532, f674; +mul.f32 f678, f672, f549; +sub.f32 f679, f678, f677; +mul.f32 f680, f601, f672; +mul.f32 f681, f602, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f601, f674; +fma.rn.f32 f684, f602, f672, f683; +mul.f32 f685, f591, f684; +fma.rn.f32 f686, f682, f574, f685; +mul.f32 f687, f574, f684; +mul.f32 f688, f682, f591; 
+sub.f32 f689, f688, f687; +mul.f32 f690, f601, f682; +mul.f32 f691, f602, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f601, f684; +fma.rn.f32 f694, f602, f682, f693; +mul.f32 f695, f431, f694; +fma.rn.f32 f696, f692, f414, f695; +mul.f32 f697, f414, f694; +mul.f32 f698, f692, f431; +sub.f32 f699, f698, f697; +mul.f32 f700, f601, f692; +mul.f32 f701, f602, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f601, f694; +fma.rn.f32 f704, f602, f692, f703; +mul.f32 f705, f473, f704; +fma.rn.f32 f706, f702, f456, f705; +mul.f32 f707, f456, f704; +mul.f32 f708, f702, f473; +sub.f32 f709, f708, f707; +mul.f32 f710, f601, f702; +mul.f32 f711, f602, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f601, f704; +fma.rn.f32 f714, f602, f702, f713; +mul.f32 f715, f515, f714; +fma.rn.f32 f716, f712, f498, f715; +mul.f32 f717, f498, f714; +mul.f32 f718, f712, f515; +sub.f32 f719, f718, f717; +mul.f32 f720, f601, f712; +mul.f32 f721, f602, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f601, f714; +fma.rn.f32 f724, f602, f712, f723; +mul.f32 f725, f557, f724; +fma.rn.f32 f726, f722, f540, f725; +mul.f32 f727, f540, f724; +mul.f32 f728, f722, f557; +sub.f32 f729, f728, f727; +mul.f32 f730, f601, f722; +mul.f32 f731, f602, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f601, f724; +fma.rn.f32 f734, f602, f722, f733; +mul.f32 f735, f599, f734; +fma.rn.f32 f736, f732, f582, f735; +mul.f32 f737, f582, f734; +mul.f32 f738, f732, f599; +sub.f32 f739, f738, f737; +mul.f32 f740, f601, f732; +mul.f32 f741, f602, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f601, f734; +fma.rn.f32 f744, f602, f732, f743; +mul.f32 f745, f432, f744; +fma.rn.f32 f746, f742, f415, f745; +mul.f32 f747, f415, f744; +mul.f32 f748, f742, f432; +sub.f32 f749, f748, f747; +mul.f32 f750, f601, f742; +mul.f32 f751, f602, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f601, f744; +fma.rn.f32 f754, f602, f742, f753; +mul.f32 f755, f474, f754; +fma.rn.f32 f756, f752, f457, f755; +mul.f32 f757, f457, f754; +mul.f32 
f758, f752, f474; +sub.f32 f759, f758, f757; +mul.f32 f760, f601, f752; +mul.f32 f761, f602, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f601, f754; +fma.rn.f32 f764, f602, f752, f763; +mul.f32 f765, f516, f764; +fma.rn.f32 f766, f762, f499, f765; +mul.f32 f767, f499, f764; +mul.f32 f768, f762, f516; +sub.f32 f769, f768, f767; +mul.f32 f770, f601, f762; +mul.f32 f771, f602, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f601, f764; +fma.rn.f32 f774, f602, f762, f773; +mul.f32 f775, f558, f774; +fma.rn.f32 f776, f772, f541, f775; +mul.f32 f777, f541, f774; +mul.f32 f778, f772, f558; +sub.f32 f779, f778, f777; +mul.f32 f780, f601, f772; +mul.f32 f781, f602, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f601, f774; +fma.rn.f32 f784, f602, f772, f783; +mul.f32 f785, f600, f784; +fma.rn.f32 f786, f782, f583, f785; +mul.f32 f787, f583, f784; +mul.f32 f788, f782, f600; +sub.f32 f789, f788, f787; +mul.f32 f790, f601, f782; +mul.f32 f791, f602, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f601, f784; +fma.rn.f32 f794, f602, f782, f793; +mul.f32 f795, f424, f794; +fma.rn.f32 f796, f792, f407, f795; +mul.f32 f797, f407, f794; +mul.f32 f798, f792, f424; +sub.f32 f799, f798, f797; +mul.f32 f800, f601, f792; +mul.f32 f801, f602, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f601, f794; +fma.rn.f32 f804, f602, f792, f803; +mul.f32 f805, f466, f804; +fma.rn.f32 f806, f802, f449, f805; +mul.f32 f807, f449, f804; +mul.f32 f808, f802, f466; +sub.f32 f809, f808, f807; +mul.f32 f810, f601, f802; +mul.f32 f811, f602, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f601, f804; +fma.rn.f32 f814, f602, f802, f813; +mul.f32 f815, f508, f814; +fma.rn.f32 f816, f812, f491, f815; +mul.f32 f817, f491, f814; +mul.f32 f818, f812, f508; +sub.f32 f819, f818, f817; +mul.f32 f820, f601, f812; +mul.f32 f821, f602, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f601, f814; +fma.rn.f32 f824, f602, f812, f823; +mul.f32 f825, f550, f824; +fma.rn.f32 f826, f822, f533, f825; +mul.f32 f827, 
f533, f824; +mul.f32 f828, f822, f550; +sub.f32 f829, f828, f827; +mul.f32 f830, f601, f822; +mul.f32 f831, f602, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f601, f824; +fma.rn.f32 f834, f602, f822, f833; +mul.f32 f835, f592, f834; +fma.rn.f32 f836, f832, f575, f835; +mul.f32 f837, f575, f834; +mul.f32 f838, f832, f592; +sub.f32 f839, f838, f837; +mad.lo.s32 r8, r5, 500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f394; +st.shared.f32 [r9+4], f606; +st.shared.f32 [r9+8], f616; +st.shared.f32 [r9+12], f626; +st.shared.f32 [r9+16], f636; +st.shared.f32 [r9+20], f646; +st.shared.f32 [r9+24], f656; +st.shared.f32 [r9+28], f666; +st.shared.f32 [r9+32], f676; +st.shared.f32 [r9+36], f686; +st.shared.f32 [r9+40], f696; +st.shared.f32 [r9+44], f706; +st.shared.f32 [r9+48], f716; +st.shared.f32 [r9+52], f726; +st.shared.f32 [r9+56], f736; +st.shared.f32 [r9+60], f746; +st.shared.f32 [r9+64], f756; +st.shared.f32 [r9+68], f766; +st.shared.f32 [r9+72], f776; +st.shared.f32 [r9+76], f786; +st.shared.f32 [r9+80], f796; +st.shared.f32 [r9+84], f806; +st.shared.f32 [r9+88], f816; +st.shared.f32 [r9+92], f826; +st.shared.f32 [r9+96], f836; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f840, [r10]; +ld.shared.f32 f841, [r10+20]; +ld.shared.f32 f842, [r10+40]; +ld.shared.f32 f843, [r10+60]; +ld.shared.f32 f844, [r10+80]; +ld.shared.f32 f845, [r10+100]; +ld.shared.f32 f846, [r10+120]; +ld.shared.f32 f847, [r10+140]; +ld.shared.f32 f848, [r10+160]; +ld.shared.f32 f849, [r10+180]; +ld.shared.f32 f850, [r10+200]; +ld.shared.f32 f851, [r10+220]; +ld.shared.f32 f852, [r10+240]; +ld.shared.f32 f853, [r10+260]; +ld.shared.f32 f854, [r10+280]; +ld.shared.f32 f855, [r10+300]; +ld.shared.f32 f856, [r10+320]; +ld.shared.f32 f857, [r10+340]; +ld.shared.f32 f858, [r10+360]; +ld.shared.f32 f859, [r10+380]; +ld.shared.f32 f860, [r10+400]; +ld.shared.f32 f861, [r10+420]; +ld.shared.f32 f862, [r10+440]; +ld.shared.f32 f863, [r10+460]; +ld.shared.f32 
f864, [r10+480]; +barrier.sync 0; +st.shared.f32 [r9], f398; +st.shared.f32 [r9+4], f609; +st.shared.f32 [r9+8], f619; +st.shared.f32 [r9+12], f629; +st.shared.f32 [r9+16], f639; +st.shared.f32 [r9+20], f649; +st.shared.f32 [r9+24], f659; +st.shared.f32 [r9+28], f669; +st.shared.f32 [r9+32], f679; +st.shared.f32 [r9+36], f689; +st.shared.f32 [r9+40], f699; +st.shared.f32 [r9+44], f709; +st.shared.f32 [r9+48], f719; +st.shared.f32 [r9+52], f729; +st.shared.f32 [r9+56], f739; +st.shared.f32 [r9+60], f749; +st.shared.f32 [r9+64], f759; +st.shared.f32 [r9+68], f769; +st.shared.f32 [r9+72], f779; +st.shared.f32 [r9+76], f789; +st.shared.f32 [r9+80], f799; +st.shared.f32 [r9+84], f809; +st.shared.f32 [r9+88], f819; +st.shared.f32 [r9+92], f829; +st.shared.f32 [r9+96], f839; +barrier.sync 0; +ld.shared.f32 f865, [r10]; +ld.shared.f32 f866, [r10+20]; +ld.shared.f32 f867, [r10+40]; +ld.shared.f32 f868, [r10+60]; +ld.shared.f32 f869, [r10+80]; +ld.shared.f32 f870, [r10+100]; +ld.shared.f32 f871, [r10+120]; +ld.shared.f32 f872, [r10+140]; +ld.shared.f32 f873, [r10+160]; +ld.shared.f32 f874, [r10+180]; +ld.shared.f32 f875, [r10+200]; +ld.shared.f32 f876, [r10+220]; +ld.shared.f32 f877, [r10+240]; +ld.shared.f32 f878, [r10+260]; +ld.shared.f32 f879, [r10+280]; +ld.shared.f32 f880, [r10+300]; +ld.shared.f32 f881, [r10+320]; +ld.shared.f32 f882, [r10+340]; +ld.shared.f32 f883, [r10+360]; +ld.shared.f32 f884, [r10+380]; +ld.shared.f32 f885, [r10+400]; +ld.shared.f32 f886, [r10+420]; +ld.shared.f32 f887, [r10+440]; +ld.shared.f32 f888, [r10+460]; +ld.shared.f32 f889, [r10+480]; +add.f32 f890, f845, f860; +add.f32 f891, f840, f890; +add.f32 f892, f850, f855; +add.f32 f893, f870, f885; +add.f32 f894, f865, f893; +add.f32 f895, f875, f880; +fma.rn.f32 f896, f890, 0f3E9E377A, f840; +mul.f32 f897, f892, 0f3F4F1BBD; +sub.f32 f898, f896, f897; +sub.f32 f899, f870, f885; +mul.f32 f900, f899, 0f3F737871; +sub.f32 f901, f875, f880; +fma.rn.f32 f902, f901, 0f3F167918, f900; +mul.f32 f903, 
f890, 0f3F4F1BBD; +sub.f32 f904, f840, f903; +fma.rn.f32 f905, f892, 0f3E9E377A, f904; +mul.f32 f906, f899, 0f3F167918; +mul.f32 f907, f901, 0f3F737871; +sub.f32 f908, f906, f907; +fma.rn.f32 f909, f893, 0f3E9E377A, f865; +mul.f32 f910, f895, 0f3F4F1BBD; +sub.f32 f911, f909, f910; +sub.f32 f912, f845, f860; +mul.f32 f913, f912, 0f3F737871; +sub.f32 f914, f850, f855; +fma.rn.f32 f915, f914, 0f3F167918, f913; +mul.f32 f916, f893, 0f3F4F1BBD; +sub.f32 f917, f865, f916; +fma.rn.f32 f918, f895, 0f3E9E377A, f917; +mul.f32 f919, f912, 0f3F167918; +mul.f32 f920, f914, 0f3F737871; +sub.f32 f921, f919, f920; +add.f32 f922, f846, f861; +add.f32 f923, f841, f922; +add.f32 f924, f851, f856; +add.f32 f925, f871, f886; +add.f32 f926, f866, f925; +add.f32 f927, f876, f881; +fma.rn.f32 f928, f922, 0f3E9E377A, f841; +mul.f32 f929, f924, 0f3F4F1BBD; +sub.f32 f930, f928, f929; +sub.f32 f931, f871, f886; +mul.f32 f932, f931, 0f3F737871; +sub.f32 f933, f876, f881; +fma.rn.f32 f934, f933, 0f3F167918, f932; +mul.f32 f935, f922, 0f3F4F1BBD; +sub.f32 f936, f841, f935; +fma.rn.f32 f937, f924, 0f3E9E377A, f936; +mul.f32 f938, f931, 0f3F167918; +mul.f32 f939, f933, 0f3F737871; +sub.f32 f940, f938, f939; +fma.rn.f32 f941, f925, 0f3E9E377A, f866; +mul.f32 f942, f927, 0f3F4F1BBD; +sub.f32 f943, f941, f942; +sub.f32 f944, f846, f861; +mul.f32 f945, f944, 0f3F737871; +sub.f32 f946, f851, f856; +fma.rn.f32 f947, f946, 0f3F167918, f945; +mul.f32 f948, f925, 0f3F4F1BBD; +sub.f32 f949, f866, f948; +fma.rn.f32 f950, f927, 0f3E9E377A, f949; +mul.f32 f951, f944, 0f3F167918; +mul.f32 f952, f946, 0f3F737871; +sub.f32 f953, f951, f952; +add.f32 f954, f847, f862; +add.f32 f955, f842, f954; +add.f32 f956, f852, f857; +add.f32 f957, f872, f887; +add.f32 f958, f867, f957; +add.f32 f959, f877, f882; +fma.rn.f32 f960, f954, 0f3E9E377A, f842; +mul.f32 f961, f956, 0f3F4F1BBD; +sub.f32 f962, f960, f961; +sub.f32 f963, f872, f887; +mul.f32 f964, f963, 0f3F737871; +sub.f32 f965, f877, f882; +fma.rn.f32 f966, f965, 
0f3F167918, f964; +mul.f32 f967, f954, 0f3F4F1BBD; +sub.f32 f968, f842, f967; +fma.rn.f32 f969, f956, 0f3E9E377A, f968; +mul.f32 f970, f963, 0f3F167918; +mul.f32 f971, f965, 0f3F737871; +sub.f32 f972, f970, f971; +fma.rn.f32 f973, f957, 0f3E9E377A, f867; +mul.f32 f974, f959, 0f3F4F1BBD; +sub.f32 f975, f973, f974; +sub.f32 f976, f847, f862; +mul.f32 f977, f976, 0f3F737871; +sub.f32 f978, f852, f857; +fma.rn.f32 f979, f978, 0f3F167918, f977; +mul.f32 f980, f957, 0f3F4F1BBD; +sub.f32 f981, f867, f980; +fma.rn.f32 f982, f959, 0f3E9E377A, f981; +mul.f32 f983, f976, 0f3F167918; +mul.f32 f984, f978, 0f3F737871; +sub.f32 f985, f983, f984; +add.f32 f986, f848, f863; +add.f32 f987, f843, f986; +add.f32 f988, f853, f858; +add.f32 f989, f873, f888; +add.f32 f990, f868, f989; +add.f32 f991, f878, f883; +fma.rn.f32 f992, f986, 0f3E9E377A, f843; +mul.f32 f993, f988, 0f3F4F1BBD; +sub.f32 f994, f992, f993; +sub.f32 f995, f873, f888; +mul.f32 f996, f995, 0f3F737871; +sub.f32 f997, f878, f883; +fma.rn.f32 f998, f997, 0f3F167918, f996; +mul.f32 f999, f986, 0f3F4F1BBD; +sub.f32 f1000, f843, f999; +fma.rn.f32 f1001, f988, 0f3E9E377A, f1000; +mul.f32 f1002, f995, 0f3F167918; +mul.f32 f1003, f997, 0f3F737871; +sub.f32 f1004, f1002, f1003; +fma.rn.f32 f1005, f989, 0f3E9E377A, f868; +mul.f32 f1006, f991, 0f3F4F1BBD; +sub.f32 f1007, f1005, f1006; +sub.f32 f1008, f848, f863; +mul.f32 f1009, f1008, 0f3F737871; +sub.f32 f1010, f853, f858; +fma.rn.f32 f1011, f1010, 0f3F167918, f1009; +mul.f32 f1012, f989, 0f3F4F1BBD; +sub.f32 f1013, f868, f1012; +fma.rn.f32 f1014, f991, 0f3E9E377A, f1013; +mul.f32 f1015, f1008, 0f3F167918; +mul.f32 f1016, f1010, 0f3F737871; +sub.f32 f1017, f1015, f1016; +add.f32 f1018, f849, f864; +add.f32 f1019, f844, f1018; +add.f32 f1020, f854, f859; +add.f32 f1021, f874, f889; +add.f32 f1022, f869, f1021; +add.f32 f1023, f879, f884; +fma.rn.f32 f1024, f1018, 0f3E9E377A, f844; +mul.f32 f1025, f1020, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f874, f889; 
+mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f879, f884; +fma.rn.f32 f1030, f1029, 0f3F167918, f1028; +mul.f32 f1031, f1018, 0f3F4F1BBD; +sub.f32 f1032, f844, f1031; +fma.rn.f32 f1033, f1020, 0f3E9E377A, f1032; +mul.f32 f1034, f1027, 0f3F167918; +mul.f32 f1035, f1029, 0f3F737871; +sub.f32 f1036, f1034, f1035; +fma.rn.f32 f1037, f1021, 0f3E9E377A, f869; +mul.f32 f1038, f1023, 0f3F4F1BBD; +sub.f32 f1039, f1037, f1038; +sub.f32 f1040, f849, f864; +mul.f32 f1041, f1040, 0f3F737871; +sub.f32 f1042, f854, f859; +fma.rn.f32 f1043, f1042, 0f3F167918, f1041; +mul.f32 f1044, f1021, 0f3F4F1BBD; +sub.f32 f1045, f869, f1044; +fma.rn.f32 f1046, f1023, 0f3E9E377A, f1045; +mul.f32 f1047, f1040, 0f3F167918; +mul.f32 f1048, f1042, 0f3F737871; +sub.f32 f1049, f1047, f1048; +add.f32 %0, f892, f891; +add.f32 %1, f895, f894; +add.f32 %2, f924, f923; +add.f32 %3, f927, f926; +add.f32 %4, f956, f955; +add.f32 %5, f959, f958; +add.f32 %6, f988, f987; +add.f32 %7, f991, f990; +add.f32 %8, f1020, f1019; +add.f32 %9, f1023, f1022; +add.f32 %11, f915, f911; +sub.f32 %10, f898, f902; +add.f32 %13, f947, f943; +sub.f32 %12, f930, f934; +add.f32 %15, f979, f975; +sub.f32 %14, f962, f966; +add.f32 %17, f1011, f1007; +sub.f32 %16, f994, f998; +add.f32 %19, f1043, f1039; +sub.f32 %18, f1026, f1030; +sub.f32 %20, f905, f908; +add.f32 %21, f921, f918; +sub.f32 %22, f937, f940; +add.f32 %23, f953, f950; +sub.f32 %24, f969, f972; +add.f32 %25, f985, f982; +sub.f32 %26, f1001, f1004; +add.f32 %27, f1017, f1014; +sub.f32 %28, f1033, f1036; +add.f32 %29, f1049, f1046; +add.f32 %30, f908, f905; +sub.f32 %31, f918, f921; +add.f32 %32, f940, f937; +sub.f32 %33, f950, f953; +add.f32 %34, f972, f969; +sub.f32 %35, f982, f985; +add.f32 %36, f1004, f1001; +sub.f32 %37, f1014, f1017; +add.f32 %38, f1036, f1033; +sub.f32 %39, f1046, f1049; +sub.f32 %41, f911, f915; +add.f32 %40, f902, f898; +sub.f32 %43, f943, f947; +add.f32 %42, f934, f930; +sub.f32 %45, f975, f979; +add.f32 %44, f966, f962; +sub.f32 %47, 
f1007, f1011; +add.f32 %46, f998, f994; +sub.f32 %49, f1039, f1043; +add.f32 %48, f1030, f1026; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), 
"f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<362, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<265>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 1000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %15, %23; +add.f32 f22, %13, f21; +add.f32 f23, %18, %21; +add.f32 f24, %17, %24; +add.f32 f25, %14, f24; +add.f32 f26, %20, %22; +fma.rn.f32 f27, f21, 0f3E9E377A, %13; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %17, %24; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %20, %22; +fma.rn.f32 f33, f32, 0f3F167918, f31; +sub.f32 f34, f29, f33; +add.f32 f35, f33, f29; +mul.f32 f36, f21, 0f3F4F1BBD; +sub.f32 f37, %13, f36; +fma.rn.f32 f38, f23, 0f3E9E377A, f37; +mul.f32 f39, f30, 0f3F167918; +mul.f32 f40, f32, 0f3F737871; +sub.f32 f41, f39, f40; +sub.f32 f42, f38, f41; +add.f32 f43, f41, f38; +fma.rn.f32 f44, f24, 0f3E9E377A, %14; +mul.f32 f45, f26, 0f3F4F1BBD; +sub.f32 f46, f44, f45; +sub.f32 f47, %15, %23; +mul.f32 f48, f47, 0f3F737871; +sub.f32 f49, %18, %21; +fma.rn.f32 f50, f49, 0f3F167918, f48; +add.f32 f51, f50, f46; +sub.f32 f52, f46, f50; +mul.f32 f53, f24, 0f3F4F1BBD; +sub.f32 f54, %14, f53; +fma.rn.f32 f55, f26, 0f3E9E377A, f54; +mul.f32 f56, f47, 0f3F167918; +mul.f32 f57, f49, 0f3F737871; +sub.f32 f58, f56, f57; +add.f32 f59, f58, f55; +sub.f32 f60, f55, f58; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f61, f62}, [rd6]; +mul.f32 f65, f51, f62; +mul.f32 f66, f34, f62; +mul.f32 f67, f61, f51; +mul.f32 f68, f61, f61; +mul.f32 f69, f62, f62; +sub.f32 f70, f68, f69; +mul.f32 f71, f62, f61; +fma.rn.f32 f72, 
f62, f61, f71; +mul.f32 f73, f59, f72; +mul.f32 f74, f42, f72; +mul.f32 f75, f70, f59; +mul.f32 f76, f61, f70; +mul.f32 f77, f62, f72; +sub.f32 f78, f76, f77; +mul.f32 f79, f61, f72; +fma.rn.f32 f80, f62, f70, f79; +mul.f32 f81, f60, f80; +mul.f32 f82, f43, f80; +mul.f32 f83, f78, f60; +mul.f32 f84, f61, f78; +mul.f32 f85, f62, f80; +sub.f32 f86, f84, f85; +mul.f32 f87, f61, f80; +fma.rn.f32 f88, f62, f78, f87; +mul.f32 f89, f52, f88; +mul.f32 f90, f35, f88; +mul.f32 f91, f86, f52; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f92, f26, f25; +add.f32 f93, f23, f22; +st.shared.v2.f32 [r9], {f93, f92}; +fma.rn.f32 f94, f61, f34, f65; +sub.f32 f95, f67, f66; +st.shared.v2.f32 [r9+8], {f94, f95}; +fma.rn.f32 f96, f70, f42, f73; +sub.f32 f97, f75, f74; +st.shared.v2.f32 [r9+16], {f96, f97}; +sub.f32 f98, f83, f82; +fma.rn.f32 f99, f78, f43, f81; +st.shared.v2.f32 [r9+24], {f99, f98}; +fma.rn.f32 f100, f86, f35, f89; +sub.f32 f101, f91, f90; +st.shared.v2.f32 [r9+32], {f100, f101}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f102, f103}, [r11]; +ld.shared.v2.f32 {f106, f107}, [r11+200]; +ld.shared.v2.f32 {f110, f111}, [r11+400]; +ld.shared.v2.f32 {f114, f115}, [r11+600]; +ld.shared.v2.f32 {f118, f119}, [r11+800]; +add.f32 f122, f106, f118; +add.f32 f123, f102, f122; +add.f32 f124, f110, f114; +add.f32 f125, f107, f119; +add.f32 f126, f103, f125; +add.f32 f127, f111, f115; +fma.rn.f32 f128, f122, 0f3E9E377A, f102; +mul.f32 f129, f124, 0f3F4F1BBD; +sub.f32 f130, f128, f129; +sub.f32 f131, f107, f119; +mul.f32 f132, f131, 0f3F737871; +sub.f32 f133, f111, f115; +fma.rn.f32 f134, f133, 0f3F167918, f132; +sub.f32 f135, f130, f134; +add.f32 f136, f134, f130; +mul.f32 f137, f122, 0f3F4F1BBD; +sub.f32 f138, f102, f137; +fma.rn.f32 f139, f124, 0f3E9E377A, f138; +mul.f32 f140, f131, 0f3F167918; +mul.f32 f141, f133, 0f3F737871; +sub.f32 f142, f140, f141; +sub.f32 f143, f139, f142; +add.f32 f144, f142, f139; +fma.rn.f32 f145, f125, 
0f3E9E377A, f103; +mul.f32 f146, f127, 0f3F4F1BBD; +sub.f32 f147, f145, f146; +sub.f32 f148, f106, f118; +mul.f32 f149, f148, 0f3F737871; +sub.f32 f150, f110, f114; +fma.rn.f32 f151, f150, 0f3F167918, f149; +add.f32 f152, f151, f147; +sub.f32 f153, f147, f151; +mul.f32 f154, f125, 0f3F4F1BBD; +sub.f32 f155, f103, f154; +fma.rn.f32 f156, f127, 0f3E9E377A, f155; +mul.f32 f157, f148, 0f3F167918; +mul.f32 f158, f150, 0f3F737871; +sub.f32 f159, f157, f158; +add.f32 f160, f159, f156; +sub.f32 f161, f156, f159; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f162, f163}, [rd11]; +mul.f32 f166, f152, f163; +mul.f32 f167, f135, f163; +mul.f32 f168, f162, f152; +mul.f32 f169, f162, f162; +mul.f32 f170, f163, f163; +sub.f32 f171, f169, f170; +mul.f32 f172, f163, f162; +fma.rn.f32 f173, f163, f162, f172; +mul.f32 f174, f160, f173; +mul.f32 f175, f143, f173; +mul.f32 f176, f171, f160; +mul.f32 f177, f162, f171; +mul.f32 f178, f163, f173; +sub.f32 f179, f177, f178; +mul.f32 f180, f162, f173; +fma.rn.f32 f181, f163, f171, f180; +mul.f32 f182, f161, f181; +mul.f32 f183, f144, f181; +mul.f32 f184, f179, f161; +mul.f32 f185, f162, f179; +mul.f32 f186, f163, f181; +sub.f32 f187, f185, f186; +mul.f32 f188, f162, f181; +fma.rn.f32 f189, f163, f179, f188; +mul.f32 f190, f153, f189; +mul.f32 f191, f136, f189; +mul.f32 f192, f187, f153; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 f193, f127, f126; +add.f32 f194, f124, f123; +st.shared.v2.f32 [r17], {f194, f193}; +fma.rn.f32 f195, f162, f135, f166; +sub.f32 f196, f168, f167; +st.shared.v2.f32 [r17+40], {f195, f196}; +fma.rn.f32 f197, f171, f143, f174; +sub.f32 f198, f176, f175; +st.shared.v2.f32 [r17+80], {f197, f198}; +fma.rn.f32 f199, f179, f144, f182; +sub.f32 f200, f184, f183; +st.shared.v2.f32 [r17+120], 
{f199, f200}; +fma.rn.f32 f201, f187, f136, f190; +sub.f32 f202, f192, f191; +st.shared.v2.f32 [r17+160], {f201, f202}; +barrier.sync 0; +ld.shared.v2.f32 {f203, f204}, [r11]; +ld.shared.v2.f32 {f207, f208}, [r11+200]; +ld.shared.v2.f32 {f211, f212}, [r11+400]; +ld.shared.v2.f32 {f215, f216}, [r11+600]; +ld.shared.v2.f32 {f219, f220}, [r11+800]; +add.f32 f223, f207, f219; +add.f32 f224, f203, f223; +add.f32 f225, f211, f215; +add.f32 f226, f208, f220; +add.f32 f227, f204, f226; +add.f32 f228, f212, f216; +fma.rn.f32 f229, f223, 0f3E9E377A, f203; +mul.f32 f230, f225, 0f3F4F1BBD; +sub.f32 f231, f229, f230; +sub.f32 f232, f208, f220; +mul.f32 f233, f232, 0f3F737871; +sub.f32 f234, f212, f216; +fma.rn.f32 f235, f234, 0f3F167918, f233; +mul.f32 f236, f223, 0f3F4F1BBD; +sub.f32 f237, f203, f236; +fma.rn.f32 f238, f225, 0f3E9E377A, f237; +mul.f32 f239, f232, 0f3F167918; +mul.f32 f240, f234, 0f3F737871; +sub.f32 f241, f239, f240; +fma.rn.f32 f242, f226, 0f3E9E377A, f204; +mul.f32 f243, f228, 0f3F4F1BBD; +sub.f32 f244, f242, f243; +sub.f32 f245, f207, f219; +mul.f32 f246, f245, 0f3F737871; +sub.f32 f247, f211, f215; +fma.rn.f32 f248, f247, 0f3F167918, f246; +mul.f32 f249, f226, 0f3F4F1BBD; +sub.f32 f250, f204, f249; +fma.rn.f32 f251, f228, 0f3E9E377A, f250; +mul.f32 f252, f245, 0f3F167918; +mul.f32 f253, f247, 0f3F737871; +sub.f32 f254, f252, f253; +add.f32 %1, f228, f227; +add.f32 %0, f225, f224; +add.f32 %3, f248, f244; +sub.f32 %2, f231, f235; +add.f32 %5, f254, f251; +sub.f32 %4, f238, f241; +sub.f32 %7, f251, f254; +add.f32 %6, f241, f238; +sub.f32 %9, f244, f248; +add.f32 %8, f235, f231; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), 
"f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<363, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<245>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %15, %23; +add.f32 f22, %13, f21; +add.f32 f23, %18, %21; +add.f32 f24, f23, f22; +add.f32 f25, %17, %24; +add.f32 f26, %14, f25; +add.f32 f27, %20, %22; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %13; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %17, %24; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %20, %22; +fma.rn.f32 f35, f34, 0f3F167918, f33; +sub.f32 f36, f31, f35; +add.f32 f37, f35, f31; +mul.f32 f38, f21, 0f3F4F1BBD; +sub.f32 f39, %13, f38; +fma.rn.f32 f40, f23, 0f3E9E377A, f39; +mul.f32 f41, f32, 0f3F167918; +mul.f32 f42, f34, 0f3F737871; +sub.f32 f43, f41, f42; +sub.f32 f44, f40, f43; +add.f32 f45, f43, f40; +fma.rn.f32 f46, f25, 0f3E9E377A, %14; +mul.f32 f47, f27, 0f3F4F1BBD; +sub.f32 f48, f46, f47; +sub.f32 f49, %15, %23; +mul.f32 f50, f49, 0f3F737871; +sub.f32 f51, %18, %21; +fma.rn.f32 f52, f51, 0f3F167918, f50; +add.f32 f53, f52, f48; +sub.f32 f54, f48, f52; +mul.f32 f55, f25, 0f3F4F1BBD; +sub.f32 f56, %14, f55; +fma.rn.f32 f57, f27, 0f3E9E377A, f56; +mul.f32 f58, f49, 0f3F167918; +mul.f32 f59, f51, 0f3F737871; +sub.f32 f60, f58, f59; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f53, f64; +fma.rn.f32 f68, f63, f36, f67; +mul.f32 f69, f36, f64; +mul.f32 f70, f63, f53; +sub.f32 f71, f70, f69; +mul.f32 f72, f63, f63; +mul.f32 f73, f64, f64; +sub.f32 f74, f72, f73; +mul.f32 f75, f64, f63; +fma.rn.f32 f76, f64, f63, f75; 
+mul.f32 f77, f61, f76; +fma.rn.f32 f78, f74, f44, f77; +mul.f32 f79, f44, f76; +mul.f32 f80, f74, f61; +sub.f32 f81, f80, f79; +mul.f32 f82, f63, f74; +mul.f32 f83, f64, f76; +sub.f32 f84, f82, f83; +mul.f32 f85, f63, f76; +fma.rn.f32 f86, f64, f74, f85; +mul.f32 f87, f62, f86; +fma.rn.f32 f88, f84, f45, f87; +mul.f32 f89, f45, f86; +mul.f32 f90, f84, f62; +sub.f32 f91, f90, f89; +mul.f32 f92, f63, f84; +mul.f32 f93, f64, f86; +sub.f32 f94, f92, f93; +mul.f32 f95, f63, f86; +fma.rn.f32 f96, f64, f84, f95; +mul.f32 f97, f54, f96; +fma.rn.f32 f98, f94, f37, f97; +mul.f32 f99, f37, f96; +mul.f32 f100, f94, f54; +sub.f32 f101, f100, f99; +mad.lo.s32 r8, r5, 500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f68; +st.shared.f32 [r9+8], f78; +st.shared.f32 [r9+12], f88; +st.shared.f32 [r9+16], f98; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f102, [r11]; +ld.shared.f32 f103, [r11+100]; +ld.shared.f32 f104, [r11+200]; +ld.shared.f32 f105, [r11+300]; +ld.shared.f32 f106, [r11+400]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +ld.shared.f32 f107, [r11]; +ld.shared.f32 f108, [r11+100]; +ld.shared.f32 f109, [r11+200]; +ld.shared.f32 f110, [r11+300]; +ld.shared.f32 f111, [r11+400]; +add.f32 f112, f103, f106; +add.f32 f113, f102, f112; +add.f32 f114, f104, f105; +add.f32 f115, f114, f113; +add.f32 f116, f108, f111; +add.f32 f117, f107, f116; +add.f32 f118, f109, f110; +add.f32 f119, f118, f117; +fma.rn.f32 f120, f112, 0f3E9E377A, f102; +mul.f32 f121, f114, 0f3F4F1BBD; +sub.f32 f122, f120, f121; +sub.f32 f123, f108, f111; +mul.f32 f124, f123, 0f3F737871; +sub.f32 f125, f109, f110; +fma.rn.f32 f126, f125, 0f3F167918, f124; +sub.f32 f127, f122, f126; +add.f32 f128, f126, f122; +mul.f32 f129, f112, 0f3F4F1BBD; +sub.f32 f130, f102, f129; +fma.rn.f32 f131, f114, 
0f3E9E377A, f130; +mul.f32 f132, f123, 0f3F167918; +mul.f32 f133, f125, 0f3F737871; +sub.f32 f134, f132, f133; +sub.f32 f135, f131, f134; +add.f32 f136, f134, f131; +fma.rn.f32 f137, f116, 0f3E9E377A, f107; +mul.f32 f138, f118, 0f3F4F1BBD; +sub.f32 f139, f137, f138; +sub.f32 f140, f103, f106; +mul.f32 f141, f140, 0f3F737871; +sub.f32 f142, f104, f105; +fma.rn.f32 f143, f142, 0f3F167918, f141; +add.f32 f144, f143, f139; +sub.f32 f145, f139, f143; +mul.f32 f146, f116, 0f3F4F1BBD; +sub.f32 f147, f107, f146; +fma.rn.f32 f148, f118, 0f3E9E377A, f147; +mul.f32 f149, f140, 0f3F167918; +mul.f32 f150, f142, 0f3F737871; +sub.f32 f151, f149, f150; +add.f32 f152, f151, f148; +sub.f32 f153, f148, f151; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f154, f155}, [rd11]; +mul.f32 f158, f144, f155; +fma.rn.f32 f159, f154, f127, f158; +mul.f32 f160, f127, f155; +mul.f32 f161, f154, f144; +sub.f32 f162, f161, f160; +mul.f32 f163, f154, f154; +mul.f32 f164, f155, f155; +sub.f32 f165, f163, f164; +mul.f32 f166, f155, f154; +fma.rn.f32 f167, f155, f154, f166; +mul.f32 f168, f152, f167; +fma.rn.f32 f169, f165, f135, f168; +mul.f32 f170, f135, f167; +mul.f32 f171, f165, f152; +sub.f32 f172, f171, f170; +mul.f32 f173, f154, f165; +mul.f32 f174, f155, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f154, f167; +fma.rn.f32 f177, f155, f165, f176; +mul.f32 f178, f153, f177; +fma.rn.f32 f179, f175, f136, f178; +mul.f32 f180, f136, f177; +mul.f32 f181, f175, f153; +sub.f32 f182, f181, f180; +mul.f32 f183, f154, f175; +mul.f32 f184, f155, f177; +sub.f32 f185, f183, f184; +mul.f32 f186, f154, f177; +fma.rn.f32 f187, f155, f175, f186; +mul.f32 f188, f145, f187; +fma.rn.f32 f189, f185, f128, f188; +mul.f32 f190, f128, f187; +mul.f32 f191, f185, f145; +sub.f32 f192, f191, f190; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; 
+barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f115; +st.shared.f32 [r17+20], f159; +st.shared.f32 [r17+40], f169; +st.shared.f32 [r17+60], f179; +st.shared.f32 [r17+80], f189; +barrier.sync 0; +ld.shared.f32 f193, [r11]; +ld.shared.f32 f194, [r11+100]; +ld.shared.f32 f195, [r11+200]; +ld.shared.f32 f196, [r11+300]; +ld.shared.f32 f197, [r11+400]; +barrier.sync 0; +st.shared.f32 [r17], f119; +st.shared.f32 [r17+20], f162; +st.shared.f32 [r17+40], f172; +st.shared.f32 [r17+60], f182; +st.shared.f32 [r17+80], f192; +barrier.sync 0; +ld.shared.f32 f198, [r11]; +ld.shared.f32 f199, [r11+100]; +ld.shared.f32 f200, [r11+200]; +ld.shared.f32 f201, [r11+300]; +ld.shared.f32 f202, [r11+400]; +add.f32 f203, f194, f197; +add.f32 f204, f193, f203; +add.f32 f205, f195, f196; +add.f32 f206, f199, f202; +add.f32 f207, f198, f206; +add.f32 f208, f200, f201; +fma.rn.f32 f209, f203, 0f3E9E377A, f193; +mul.f32 f210, f205, 0f3F4F1BBD; +sub.f32 f211, f209, f210; +sub.f32 f212, f199, f202; +mul.f32 f213, f212, 0f3F737871; +sub.f32 f214, f200, f201; +fma.rn.f32 f215, f214, 0f3F167918, f213; +mul.f32 f216, f203, 0f3F4F1BBD; +sub.f32 f217, f193, f216; +fma.rn.f32 f218, f205, 0f3E9E377A, f217; +mul.f32 f219, f212, 0f3F167918; +mul.f32 f220, f214, 0f3F737871; +sub.f32 f221, f219, f220; +fma.rn.f32 f222, f206, 0f3E9E377A, f198; +mul.f32 f223, f208, 0f3F4F1BBD; +sub.f32 f224, f222, f223; +sub.f32 f225, f194, f197; +mul.f32 f226, f225, 0f3F737871; +sub.f32 f227, f195, f196; +fma.rn.f32 f228, f227, 0f3F167918, f226; +mul.f32 f229, f206, 0f3F4F1BBD; +sub.f32 f230, f198, f229; +fma.rn.f32 f231, f208, 0f3E9E377A, f230; +mul.f32 f232, f225, 0f3F167918; +mul.f32 f233, f227, 0f3F737871; +sub.f32 f234, f232, f233; +add.f32 %0, f205, f204; +add.f32 %1, f208, f207; +add.f32 %3, f228, f224; +sub.f32 %2, f211, f215; +sub.f32 %4, f218, f221; +add.f32 %5, f234, f231; +add.f32 %6, f221, f218; +sub.f32 %7, f231, f234; +sub.f32 %9, f224, f228; +add.f32 %8, f215, f211; +})" + : 
"=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..fe718660a3093 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_fwd.hpp.inc @@ -0,0 +1,2704 @@ +#ifndef CUFFTDX_FFT_125_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_125_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<539, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1421>; +.reg .b64 rd<10>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 2000, r13; +add.f64 fd101, %62, %92; +add.f64 fd103, %72, %82; +add.f64 fd1420, %52, fd101; +add.f64 fd104, fd103, fd1420; +add.f64 fd105, %102, %104; +add.f64 fd107, %103, %83; +add.f64 fd1416, %53, fd105; +add.f64 fd108, fd107, fd1416; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1415, fd101, 0d3FD3C6EF372FE950, %52; +sub.f64 fd111, fd1415, fd110; +sub.f64 fd112, %102, %104; +sub.f64 fd114, %103, %83; +mul.f64 fd1413, fd112, 0d3FEE6F0E134454FF; +mul.f64 fd1414, fd114, 0dBFE2CF2304755A5E; +sub.f64 fd116, fd1414, fd1413; +sub.f64 fd117, fd111, fd116; +add.f64 fd118, fd116, fd111; +mul.f64 fd119, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd120, %52, fd119; +fma.rn.f64 fd121, fd103, 0d3FD3C6EF372FE950, 
fd120; +mul.f64 fd122, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd123, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd123, fd122; +sub.f64 fd125, fd121, fd124; +add.f64 fd126, fd124, fd121; +fma.rn.f64 fd1411, fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd1412, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd1411, fd1412; +sub.f64 fd130, %62, %92; +sub.f64 fd132, %72, %82; +mul.f64 fd1409, fd130, 0d3FEE6F0E134454FF; +mul.f64 fd1410, fd132, 0dBFE2CF2304755A5E; +sub.f64 fd134, fd1410, fd1409; +add.f64 fd135, fd134, fd129; +sub.f64 fd136, fd129, fd134; +mul.f64 fd137, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %53, fd137; +fma.rn.f64 fd139, fd107, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %64, %94; +add.f64 fd147, %74, %84; +add.f64 fd1408, %54, fd145; +add.f64 fd148, fd147, fd1408; +add.f64 fd149, %65, %95; +add.f64 fd151, %107, %105; +add.f64 fd1404, %106, fd149; +add.f64 fd152, fd151, fd1404; +fma.rn.f64 fd1402, fd145, 0d3FD3C6EF372FE950, %54; +mul.f64 fd1403, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd1402, fd1403; +sub.f64 fd156, %65, %95; +sub.f64 fd158, %107, %105; +mul.f64 fd1400, fd156, 0d3FEE6F0E134454FF; +mul.f64 fd1401, fd158, 0dBFE2CF2304755A5E; +sub.f64 fd160, fd1401, fd1400; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +mul.f64 fd163, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd164, %54, fd163; +fma.rn.f64 fd165, fd147, 0d3FD3C6EF372FE950, fd164; +mul.f64 fd166, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd167, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd168, fd167, fd166; +sub.f64 fd169, fd165, fd168; +add.f64 fd170, fd168, fd165; +mul.f64 fd172, fd151, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1399, fd149, 0d3FD3C6EF372FE950, %106; +sub.f64 fd173, fd1399, fd172; +sub.f64 fd174, %64, %94; +sub.f64 fd176, %74, %84; +mul.f64 fd177, fd176, 0dBFE2CF2304755A5E; +mul.f64 fd1398, fd174, 0d3FEE6F0E134454FF; 
+sub.f64 fd178, fd177, fd1398; +add.f64 fd179, fd178, fd173; +sub.f64 fd180, fd173, fd178; +mul.f64 fd181, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd182, %106, fd181; +fma.rn.f64 fd183, fd151, 0d3FD3C6EF372FE950, fd182; +mul.f64 fd184, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd185, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd186, fd185, fd184; +add.f64 fd187, fd186, fd183; +sub.f64 fd188, fd183, fd186; +add.f64 fd189, %66, %96; +add.f64 fd191, %76, %86; +add.f64 fd1397, %56, fd189; +add.f64 fd192, fd191, fd1397; +add.f64 fd193, %110, %109; +add.f64 fd195, %77, %111; +add.f64 fd1392, %108, fd193; +add.f64 fd196, fd195, fd1392; +mul.f64 fd198, fd191, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1391, fd189, 0d3FD3C6EF372FE950, %56; +sub.f64 fd199, fd1391, fd198; +sub.f64 fd200, %110, %109; +sub.f64 fd202, %77, %111; +mul.f64 fd203, fd202, 0dBFE2CF2304755A5E; +mul.f64 fd1390, fd200, 0d3FEE6F0E134454FF; +sub.f64 fd204, fd203, fd1390; +sub.f64 fd205, fd199, fd204; +add.f64 fd206, fd204, fd199; +mul.f64 fd207, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd208, %56, fd207; +fma.rn.f64 fd209, fd191, 0d3FD3C6EF372FE950, fd208; +mul.f64 fd210, fd200, 0d3FE2CF2304755A5E; +mul.f64 fd211, fd202, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd211, fd210; +sub.f64 fd213, fd209, fd212; +add.f64 fd214, fd212, fd209; +fma.rn.f64 fd1388, fd193, 0d3FD3C6EF372FE950, %108; +mul.f64 fd1389, fd195, 0d3FE9E3779B97F4A8; +sub.f64 fd217, fd1388, fd1389; +sub.f64 fd218, %66, %96; +sub.f64 fd220, %76, %86; +mul.f64 fd1386, fd218, 0d3FEE6F0E134454FF; +mul.f64 fd1387, fd220, 0dBFE2CF2304755A5E; +sub.f64 fd222, fd1387, fd1386; +add.f64 fd223, fd222, fd217; +sub.f64 fd224, fd217, fd222; +mul.f64 fd225, fd193, 0d3FE9E3779B97F4A8; +sub.f64 fd226, %108, fd225; +fma.rn.f64 fd227, fd195, 0d3FD3C6EF372FE950, fd226; +mul.f64 fd228, fd218, 0d3FE2CF2304755A5E; +mul.f64 fd229, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd229, fd228; +add.f64 fd231, fd230, fd227; +sub.f64 fd232, fd227, fd230; +add.f64 fd233, %68, %98; +add.f64 fd235, %78, %88; 
+add.f64 fd1385, %58, fd233; +add.f64 fd236, fd235, fd1385; +add.f64 fd237, %113, %112; +add.f64 fd239, %114, %89; +add.f64 fd1381, %59, fd237; +add.f64 fd240, fd239, fd1381; +fma.rn.f64 fd1379, fd233, 0d3FD3C6EF372FE950, %58; +mul.f64 fd1380, fd235, 0d3FE9E3779B97F4A8; +sub.f64 fd243, fd1379, fd1380; +sub.f64 fd244, %113, %112; +sub.f64 fd246, %114, %89; +mul.f64 fd1377, fd244, 0d3FEE6F0E134454FF; +mul.f64 fd1378, fd246, 0dBFE2CF2304755A5E; +sub.f64 fd248, fd1378, fd1377; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +mul.f64 fd251, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd252, %58, fd251; +fma.rn.f64 fd253, fd235, 0d3FD3C6EF372FE950, fd252; +mul.f64 fd254, fd244, 0d3FE2CF2304755A5E; +mul.f64 fd255, fd246, 0d3FEE6F0E134454FF; +sub.f64 fd256, fd255, fd254; +sub.f64 fd257, fd253, fd256; +add.f64 fd258, fd256, fd253; +mul.f64 fd260, fd239, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1376, fd237, 0d3FD3C6EF372FE950, %59; +sub.f64 fd261, fd1376, fd260; +sub.f64 fd262, %68, %98; +sub.f64 fd264, %78, %88; +mul.f64 fd1374, fd262, 0d3FEE6F0E134454FF; +mul.f64 fd1375, fd264, 0dBFE2CF2304755A5E; +sub.f64 fd266, fd1375, fd1374; +add.f64 fd267, fd266, fd261; +sub.f64 fd268, fd261, fd266; +mul.f64 fd269, fd237, 0d3FE9E3779B97F4A8; +sub.f64 fd270, %59, fd269; +fma.rn.f64 fd271, fd239, 0d3FD3C6EF372FE950, fd270; +mul.f64 fd272, fd262, 0d3FE2CF2304755A5E; +mul.f64 fd273, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd274, fd273, fd272; +add.f64 fd275, fd274, fd271; +sub.f64 fd276, fd271, fd274; +add.f64 fd277, %70, %100; +add.f64 fd279, %80, %90; +add.f64 fd1373, %60, fd277; +add.f64 fd280, fd279, fd1373; +add.f64 fd281, %71, %101; +add.f64 fd283, %117, %115; +add.f64 fd1369, %116, fd281; +add.f64 fd284, fd283, fd1369; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1368, fd277, 0d3FD3C6EF372FE950, %60; +sub.f64 fd287, fd1368, fd286; +sub.f64 fd288, %71, %101; +sub.f64 fd290, %117, %115; +mul.f64 fd1366, fd288, 0d3FEE6F0E134454FF; +mul.f64 fd1367, fd290, 0dBFE2CF2304755A5E; 
+sub.f64 fd292, fd1367, fd1366; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, %60, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1365, fd281, 0d3FD3C6EF372FE950, %116; +sub.f64 fd305, fd1365, fd304; +sub.f64 fd306, %70, %100; +sub.f64 fd308, %80, %90; +mul.f64 fd1363, fd306, 0d3FEE6F0E134454FF; +mul.f64 fd1364, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd1364, fd1363; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, %116, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +mul.f64 fd322, fd179, 0dBFCFD511FA1C0796; +mul.f64 fd1362, fd161, 0d3FEEFEA21D101EE0; +sub.f64 fd323, fd1362, fd322; +mul.f64 fd324, fd179, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd325, fd161, 0dBFCFD511FA1C0796, fd324; +mul.f64 fd327, fd223, 0dBFDED50D5CBFA951; +mul.f64 fd1361, fd205, 0d3FEC0AB44E81C059; +sub.f64 fd328, fd1361, fd327; +mul.f64 fd329, fd223, 0d3FEC0AB44E81C059; +fma.rn.f64 fd330, fd205, 0dBFDED50D5CBFA951, fd329; +mul.f64 fd332, fd267, 0dBFE5E7CF55112014; +mul.f64 fd1360, fd249, 0d3FE753B603D2B816; +sub.f64 fd333, fd1360, fd332; +mul.f64 fd334, fd267, 0d3FE753B603D2B816; +fma.rn.f64 fd335, fd249, 0dBFE5E7CF55112014, fd334; +mul.f64 fd337, fd311, 0dBFEB04BBFF642E86; +mul.f64 fd1359, fd293, 0d3FE1257E3C182B51; +sub.f64 fd338, fd1359, fd337; +mul.f64 fd339, fd311, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd293, 0dBFEB04BBFF642E86, fd339; +mul.f64 fd342, fd187, 0dBFDED50D5CBFA951; +mul.f64 fd1358, fd169, 0d3FEC0AB44E81C059; 
+sub.f64 fd343, fd1358, fd342; +mul.f64 fd344, fd187, 0d3FEC0AB44E81C059; +fma.rn.f64 fd345, fd169, 0dBFDED50D5CBFA951, fd344; +mul.f64 fd1356, fd213, 0d3FE1257E3C182B51; +mul.f64 fd1357, fd231, 0dBFEB04BBFF642E86; +sub.f64 fd348, fd1356, fd1357; +mul.f64 fd349, fd231, 0d3FE1257E3C182B51; +fma.rn.f64 fd350, fd213, 0dBFEB04BBFF642E86, fd349; +mul.f64 fd1354, fd257, 0d3FB0130A1BE09379; +mul.f64 fd1355, fd275, 0dBFEFEFD5BFE443FE; +sub.f64 fd353, fd1354, fd1355; +mul.f64 fd354, fd275, 0d3FB0130A1BE09379; +fma.rn.f64 fd355, fd257, 0dBFEFEFD5BFE443FE, fd354; +mul.f64 fd1352, fd301, 0dBFDB3FF7C925819C; +mul.f64 fd1353, fd319, 0dBFECF457DCDC158C; +sub.f64 fd358, fd1352, fd1353; +mul.f64 fd359, fd319, 0dBFDB3FF7C925819C; +fma.rn.f64 fd360, fd301, 0dBFECF457DCDC158C, fd359; +mul.f64 fd1350, fd170, 0d3FE753B603D2B816; +mul.f64 fd1351, fd188, 0dBFE5E7CF55112014; +sub.f64 fd363, fd1350, fd1351; +mul.f64 fd364, fd188, 0d3FE753B603D2B816; +fma.rn.f64 fd365, fd170, 0dBFE5E7CF55112014, fd364; +mul.f64 fd367, fd232, 0dBFEFEFD5BFE443FE; +mul.f64 fd1349, fd214, 0d3FB0130A1BE09379; +sub.f64 fd368, fd1349, fd367; +mul.f64 fd369, fd232, 0d3FB0130A1BE09379; +fma.rn.f64 fd370, fd214, 0dBFEFEFD5BFE443FE, fd369; +mul.f64 fd372, fd276, 0dBFE8A80B635B6BEA; +mul.f64 fd1348, fd258, 0dBFE465C6FEB501BC; +sub.f64 fd373, fd1348, fd372; +mul.f64 fd374, fd276, 0dBFE465C6FEB501BC; +fma.rn.f64 fd375, fd258, 0dBFE8A80B635B6BEA, fd374; +mul.f64 fd377, fd320, 0dBFC00AEB5DA15BE0; +mul.f64 fd1347, fd302, 0dBFEFBF675480D903; +sub.f64 fd378, fd1347, fd377; +mul.f64 fd379, fd320, 0dBFEFBF675480D903; +fma.rn.f64 fd380, fd302, 0dBFC00AEB5DA15BE0, fd379; +mul.f64 fd382, fd180, 0dBFEB04BBFF642E86; +mul.f64 fd1346, fd162, 0d3FE1257E3C182B51; +sub.f64 fd383, fd1346, fd382; +mul.f64 fd384, fd180, 0d3FE1257E3C182B51; +fma.rn.f64 fd385, fd162, 0dBFEB04BBFF642E86, fd384; +mul.f64 fd387, fd224, 0dBFECF457DCDC158C; +mul.f64 fd1345, fd206, 0dBFDB3FF7C925819C; +sub.f64 fd388, fd1345, fd387; +mul.f64 fd389, fd224, 
0dBFDB3FF7C925819C; +fma.rn.f64 fd390, fd206, 0dBFECF457DCDC158C, fd389; +mul.f64 fd1343, fd250, 0dBFEFBF675480D903; +mul.f64 fd1344, fd268, 0dBFC00AEB5DA15BE0; +sub.f64 fd393, fd1343, fd1344; +mul.f64 fd394, fd268, 0dBFEFBF675480D903; +fma.rn.f64 fd395, fd250, 0dBFC00AEB5DA15BE0, fd394; +mul.f64 fd1341, fd294, 0dBFE465C6FEB501BC; +mul.f64 fd1342, fd312, 0d3FE8A80B635B6BEA; +sub.f64 fd398, fd1341, fd1342; +mul.f64 fd399, fd312, 0dBFE465C6FEB501BC; +fma.rn.f64 fd400, fd294, 0d3FE8A80B635B6BEA, fd399; +add.f64 fd401, fd148, fd280; +add.f64 fd403, fd192, fd236; +mul.f64 fd408, fd403, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1340, fd401, 0d3FD3C6EF372FE950, fd104; +sub.f64 fd409, fd1340, fd408; +add.f64 fd1339, fd152, fd284; +sub.f64 fd410, fd152, fd284; +add.f64 fd1338, fd196, fd240; +sub.f64 fd412, fd196, fd240; +mul.f64 fd413, fd412, 0dBFE2CF2304755A5E; +mul.f64 fd1337, fd410, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd413, fd1337; +sub.f64 fd415, fd409, fd414; +add.f64 fd416, fd414, fd409; +add.f64 fd1336, fd104, fd401; +mul.f64 fd417, fd401, 0d3FE9E3779B97F4A8; +sub.f64 fd418, fd104, fd417; +fma.rn.f64 fd419, fd403, 0d3FD3C6EF372FE950, fd418; +mul.f64 fd420, fd410, 0d3FE2CF2304755A5E; +mul.f64 fd421, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd422, fd421, fd420; +sub.f64 fd423, fd419, fd422; +add.f64 fd424, fd422, fd419; +fma.rn.f64 fd1334, fd1339, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd1335, fd1338, 0d3FE9E3779B97F4A8; +sub.f64 fd427, fd1334, fd1335; +sub.f64 fd428, fd148, fd280; +sub.f64 fd430, fd192, fd236; +mul.f64 fd1332, fd428, 0d3FEE6F0E134454FF; +mul.f64 fd1333, fd430, 0dBFE2CF2304755A5E; +sub.f64 fd432, fd1333, fd1332; +add.f64 fd433, fd432, fd427; +sub.f64 fd434, fd427, fd432; +add.f64 fd1331, fd108, fd1339; +mul.f64 fd435, fd1339, 0d3FE9E3779B97F4A8; +sub.f64 fd436, fd108, fd435; +fma.rn.f64 fd437, fd1338, 0d3FD3C6EF372FE950, fd436; +mul.f64 fd438, fd428, 0d3FE2CF2304755A5E; +mul.f64 fd439, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd440, fd439, fd438; +add.f64 fd441, fd440, 
fd437; +sub.f64 fd442, fd437, fd440; +add.f64 fd443, fd323, fd338; +add.f64 fd445, fd328, fd333; +add.f64 fd1330, fd117, fd443; +add.f64 fd446, fd445, fd1330; +add.f64 fd447, fd325, fd340; +add.f64 fd449, fd330, fd335; +add.f64 fd1329, fd135, fd447; +add.f64 fd450, fd449, fd1329; +fma.rn.f64 fd1327, fd443, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd1328, fd445, 0d3FE9E3779B97F4A8; +sub.f64 fd453, fd1327, fd1328; +sub.f64 fd454, fd325, fd340; +sub.f64 fd456, fd330, fd335; +mul.f64 fd1325, fd454, 0d3FEE6F0E134454FF; +mul.f64 fd1326, fd456, 0dBFE2CF2304755A5E; +sub.f64 fd458, fd1326, fd1325; +sub.f64 fd459, fd453, fd458; +add.f64 fd460, fd458, fd453; +mul.f64 fd461, fd443, 0d3FE9E3779B97F4A8; +sub.f64 fd462, fd117, fd461; +fma.rn.f64 fd463, fd445, 0d3FD3C6EF372FE950, fd462; +mul.f64 fd464, fd454, 0d3FE2CF2304755A5E; +mul.f64 fd465, fd456, 0d3FEE6F0E134454FF; +sub.f64 fd466, fd465, fd464; +sub.f64 fd467, fd463, fd466; +add.f64 fd468, fd466, fd463; +mul.f64 fd470, fd449, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1324, fd447, 0d3FD3C6EF372FE950, fd135; +sub.f64 fd471, fd1324, fd470; +sub.f64 fd472, fd323, fd338; +sub.f64 fd474, fd328, fd333; +mul.f64 fd1322, fd472, 0d3FEE6F0E134454FF; +mul.f64 fd1323, fd474, 0dBFE2CF2304755A5E; +sub.f64 fd476, fd1323, fd1322; +add.f64 fd477, fd476, fd471; +sub.f64 fd478, fd471, fd476; +mul.f64 fd479, fd447, 0d3FE9E3779B97F4A8; +sub.f64 fd480, fd135, fd479; +fma.rn.f64 fd481, fd449, 0d3FD3C6EF372FE950, fd480; +mul.f64 fd482, fd472, 0d3FE2CF2304755A5E; +mul.f64 fd483, fd474, 0d3FEE6F0E134454FF; +sub.f64 fd484, fd483, fd482; +add.f64 fd485, fd484, fd481; +sub.f64 fd486, fd481, fd484; +add.f64 fd487, fd343, fd358; +add.f64 fd489, fd348, fd353; +add.f64 fd1321, fd125, fd487; +add.f64 fd490, fd489, fd1321; +add.f64 fd491, fd345, fd360; +add.f64 fd493, fd350, fd355; +add.f64 fd1320, fd143, fd491; +add.f64 fd494, fd493, fd1320; +mul.f64 fd496, fd489, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1319, fd487, 0d3FD3C6EF372FE950, fd125; +sub.f64 fd497, fd1319, fd496; 
+sub.f64 fd498, fd345, fd360; +sub.f64 fd500, fd350, fd355; +mul.f64 fd1317, fd498, 0d3FEE6F0E134454FF; +mul.f64 fd1318, fd500, 0dBFE2CF2304755A5E; +sub.f64 fd502, fd1318, fd1317; +sub.f64 fd503, fd497, fd502; +add.f64 fd504, fd502, fd497; +mul.f64 fd505, fd487, 0d3FE9E3779B97F4A8; +sub.f64 fd506, fd125, fd505; +fma.rn.f64 fd507, fd489, 0d3FD3C6EF372FE950, fd506; +mul.f64 fd508, fd498, 0d3FE2CF2304755A5E; +mul.f64 fd509, fd500, 0d3FEE6F0E134454FF; +sub.f64 fd510, fd509, fd508; +sub.f64 fd511, fd507, fd510; +add.f64 fd512, fd510, fd507; +mul.f64 fd514, fd493, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1316, fd491, 0d3FD3C6EF372FE950, fd143; +sub.f64 fd515, fd1316, fd514; +sub.f64 fd516, fd343, fd358; +sub.f64 fd518, fd348, fd353; +mul.f64 fd1314, fd516, 0d3FEE6F0E134454FF; +mul.f64 fd1315, fd518, 0dBFE2CF2304755A5E; +sub.f64 fd520, fd1315, fd1314; +add.f64 fd521, fd520, fd515; +sub.f64 fd522, fd515, fd520; +mul.f64 fd523, fd491, 0d3FE9E3779B97F4A8; +sub.f64 fd524, fd143, fd523; +fma.rn.f64 fd525, fd493, 0d3FD3C6EF372FE950, fd524; +mul.f64 fd526, fd516, 0d3FE2CF2304755A5E; +mul.f64 fd527, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd528, fd527, fd526; +add.f64 fd529, fd528, fd525; +sub.f64 fd530, fd525, fd528; +add.f64 fd531, fd363, fd378; +add.f64 fd533, fd368, fd373; +add.f64 fd1313, fd126, fd531; +add.f64 fd534, fd533, fd1313; +add.f64 fd535, fd365, fd380; +add.f64 fd537, fd370, fd375; +add.f64 fd1312, fd144, fd535; +add.f64 fd538, fd537, fd1312; +mul.f64 fd540, fd533, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1311, fd531, 0d3FD3C6EF372FE950, fd126; +sub.f64 fd541, fd1311, fd540; +sub.f64 fd542, fd365, fd380; +sub.f64 fd544, fd370, fd375; +mul.f64 fd1309, fd542, 0d3FEE6F0E134454FF; +mul.f64 fd1310, fd544, 0dBFE2CF2304755A5E; +sub.f64 fd546, fd1310, fd1309; +sub.f64 fd547, fd541, fd546; +add.f64 fd548, fd546, fd541; +mul.f64 fd549, fd531, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd126, fd549; +fma.rn.f64 fd551, fd533, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd542, 0d3FE2CF2304755A5E; 
+mul.f64 fd553, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd553, fd552; +sub.f64 fd555, fd551, fd554; +add.f64 fd556, fd554, fd551; +fma.rn.f64 fd1307, fd535, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd1308, fd537, 0d3FE9E3779B97F4A8; +sub.f64 fd559, fd1307, fd1308; +sub.f64 fd560, fd363, fd378; +sub.f64 fd562, fd368, fd373; +mul.f64 fd1305, fd560, 0d3FEE6F0E134454FF; +mul.f64 fd1306, fd562, 0dBFE2CF2304755A5E; +sub.f64 fd564, fd1306, fd1305; +add.f64 fd565, fd564, fd559; +sub.f64 fd566, fd559, fd564; +mul.f64 fd567, fd535, 0d3FE9E3779B97F4A8; +sub.f64 fd568, fd144, fd567; +fma.rn.f64 fd569, fd537, 0d3FD3C6EF372FE950, fd568; +mul.f64 fd570, fd560, 0d3FE2CF2304755A5E; +mul.f64 fd571, fd562, 0d3FEE6F0E134454FF; +sub.f64 fd572, fd571, fd570; +add.f64 fd573, fd572, fd569; +sub.f64 fd574, fd569, fd572; +add.f64 fd575, fd383, fd398; +add.f64 fd577, fd388, fd393; +add.f64 fd1304, fd118, fd575; +add.f64 fd578, fd577, fd1304; +add.f64 fd579, fd385, fd400; +add.f64 fd581, fd390, fd395; +add.f64 fd1303, fd136, fd579; +add.f64 fd582, fd581, fd1303; +fma.rn.f64 fd1301, fd575, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd1302, fd577, 0d3FE9E3779B97F4A8; +sub.f64 fd585, fd1301, fd1302; +sub.f64 fd586, fd385, fd400; +sub.f64 fd588, fd390, fd395; +mul.f64 fd1299, fd586, 0d3FEE6F0E134454FF; +mul.f64 fd1300, fd588, 0dBFE2CF2304755A5E; +sub.f64 fd590, fd1300, fd1299; +sub.f64 fd591, fd585, fd590; +add.f64 fd592, fd590, fd585; +mul.f64 fd593, fd575, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd118, fd593; +fma.rn.f64 fd595, fd577, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd586, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd597, fd596; +sub.f64 fd599, fd595, fd598; +add.f64 fd600, fd598, fd595; +mul.f64 fd602, fd581, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1298, fd579, 0d3FD3C6EF372FE950, fd136; +sub.f64 fd603, fd1298, fd602; +sub.f64 fd604, fd383, fd398; +sub.f64 fd606, fd388, fd393; +mul.f64 fd1296, fd604, 0d3FEE6F0E134454FF; +mul.f64 fd1297, fd606, 0dBFE2CF2304755A5E; 
+sub.f64 fd608, fd1297, fd1296; +add.f64 fd609, fd608, fd603; +sub.f64 fd610, fd603, fd608; +mul.f64 fd611, fd579, 0d3FE9E3779B97F4A8; +sub.f64 fd612, fd136, fd611; +fma.rn.f64 fd613, fd581, 0d3FD3C6EF372FE950, fd612; +mul.f64 fd614, fd604, 0d3FE2CF2304755A5E; +mul.f64 fd615, fd606, 0d3FEE6F0E134454FF; +sub.f64 fd616, fd615, fd614; +add.f64 fd617, fd616, fd613; +sub.f64 fd618, fd613, fd616; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 2000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd619, fd620}, [rd6]; +mul.f64 fd624, fd620, fd450; +mul.f64 fd625, fd619, fd450; +mul.f64 fd627, fd620, fd620; +mul.f64 fd1295, fd619, fd619; +sub.f64 fd628, fd1295, fd627; +mul.f64 fd629, fd620, fd619; +fma.rn.f64 fd630, fd620, fd619, fd629; +mul.f64 fd632, fd630, fd494; +mul.f64 fd633, fd628, fd494; +mul.f64 fd1293, fd619, fd628; +mul.f64 fd1294, fd620, fd630; +sub.f64 fd636, fd1293, fd1294; +mul.f64 fd1292, fd628, fd490; +mul.f64 fd637, fd619, fd630; +fma.rn.f64 fd638, fd620, fd628, fd637; +mul.f64 fd640, fd638, fd538; +mul.f64 fd641, fd636, fd538; +mul.f64 fd643, fd620, fd638; +mul.f64 fd1291, fd619, fd636; +sub.f64 fd644, fd1291, fd643; +mul.f64 fd1290, fd636, fd534; +mul.f64 fd645, fd619, fd638; +fma.rn.f64 fd646, fd620, fd636, fd645; +mul.f64 fd648, fd646, fd582; +mul.f64 fd649, fd644, fd582; +mul.f64 fd1288, fd619, fd644; +mul.f64 fd1289, fd620, fd646; +sub.f64 fd652, fd1288, fd1289; +mul.f64 fd1287, fd644, fd578; +mul.f64 fd653, fd619, fd646; +fma.rn.f64 fd654, fd620, fd644, fd653; +mul.f64 fd656, fd654, fd433; +mul.f64 fd657, fd652, fd433; +mul.f64 fd659, fd620, fd654; +mul.f64 fd1286, fd619, fd652; +sub.f64 fd660, fd1286, fd659; +mul.f64 fd1285, fd652, fd415; +mul.f64 fd661, fd619, fd654; +fma.rn.f64 fd662, fd620, fd652, fd661; +mul.f64 fd664, fd662, fd477; +mul.f64 fd665, fd660, fd477; +mul.f64 fd667, 
fd620, fd662; +mul.f64 fd1284, fd619, fd660; +sub.f64 fd668, fd1284, fd667; +mul.f64 fd1283, fd660, fd459; +mul.f64 fd669, fd619, fd662; +fma.rn.f64 fd670, fd620, fd660, fd669; +mul.f64 fd672, fd670, fd521; +mul.f64 fd673, fd668, fd521; +mul.f64 fd1281, fd619, fd668; +mul.f64 fd1282, fd620, fd670; +sub.f64 fd676, fd1281, fd1282; +mul.f64 fd1280, fd668, fd503; +mul.f64 fd677, fd619, fd670; +fma.rn.f64 fd678, fd620, fd668, fd677; +mul.f64 fd680, fd678, fd565; +mul.f64 fd681, fd676, fd565; +mul.f64 fd683, fd620, fd678; +mul.f64 fd1279, fd619, fd676; +sub.f64 fd684, fd1279, fd683; +mul.f64 fd1278, fd676, fd547; +mul.f64 fd685, fd619, fd678; +fma.rn.f64 fd686, fd620, fd676, fd685; +mul.f64 fd688, fd686, fd609; +mul.f64 fd689, fd684, fd609; +mul.f64 fd691, fd620, fd686; +mul.f64 fd1277, fd619, fd684; +sub.f64 fd692, fd1277, fd691; +mul.f64 fd1276, fd684, fd591; +mul.f64 fd693, fd619, fd686; +fma.rn.f64 fd694, fd620, fd684, fd693; +mul.f64 fd696, fd694, fd441; +mul.f64 fd697, fd692, fd441; +mul.f64 fd1274, fd619, fd692; +mul.f64 fd1275, fd620, fd694; +sub.f64 fd700, fd1274, fd1275; +mul.f64 fd1273, fd692, fd423; +mul.f64 fd701, fd619, fd694; +fma.rn.f64 fd702, fd620, fd692, fd701; +mul.f64 fd704, fd702, fd485; +mul.f64 fd705, fd700, fd485; +mul.f64 fd707, fd620, fd702; +mul.f64 fd1272, fd619, fd700; +sub.f64 fd708, fd1272, fd707; +mul.f64 fd1271, fd700, fd467; +mul.f64 fd709, fd619, fd702; +fma.rn.f64 fd710, fd620, fd700, fd709; +mul.f64 fd711, fd708, fd511; +mul.f64 fd712, fd710, fd529; +mul.f64 fd713, fd708, fd529; +ld.global.v2.f64 {fd714, fd715}, [rd6+80]; +mul.f64 fd719, fd715, fd573; +mul.f64 fd720, fd714, fd573; +mul.f64 fd1269, fd619, fd714; +mul.f64 fd1270, fd620, fd715; +sub.f64 fd723, fd1269, fd1270; +mul.f64 fd1268, fd714, fd555; +mul.f64 fd724, fd619, fd715; +fma.rn.f64 fd725, fd620, fd714, fd724; +mul.f64 fd727, fd725, fd617; +mul.f64 fd728, fd723, fd617; +mul.f64 fd730, fd620, fd725; +mul.f64 fd1267, fd619, fd723; +sub.f64 fd731, fd1267, fd730; +mul.f64 
fd1266, fd723, fd599; +mul.f64 fd732, fd619, fd725; +fma.rn.f64 fd733, fd620, fd723, fd732; +mul.f64 fd735, fd733, fd442; +mul.f64 fd736, fd731, fd442; +mul.f64 fd738, fd620, fd733; +mul.f64 fd1265, fd619, fd731; +sub.f64 fd739, fd1265, fd738; +mul.f64 fd1264, fd731, fd424; +mul.f64 fd740, fd619, fd733; +fma.rn.f64 fd741, fd620, fd731, fd740; +mul.f64 fd743, fd741, fd486; +mul.f64 fd744, fd739, fd486; +mul.f64 fd1262, fd619, fd739; +mul.f64 fd1263, fd620, fd741; +sub.f64 fd747, fd1262, fd1263; +mul.f64 fd1261, fd739, fd468; +mul.f64 fd748, fd619, fd741; +fma.rn.f64 fd749, fd620, fd739, fd748; +mul.f64 fd751, fd749, fd530; +mul.f64 fd752, fd747, fd530; +mul.f64 fd754, fd620, fd749; +mul.f64 fd1260, fd619, fd747; +sub.f64 fd755, fd1260, fd754; +mul.f64 fd1259, fd747, fd512; +mul.f64 fd756, fd619, fd749; +fma.rn.f64 fd757, fd620, fd747, fd756; +mul.f64 fd759, fd757, fd574; +mul.f64 fd760, fd755, fd574; +mul.f64 fd762, fd620, fd757; +mul.f64 fd1258, fd619, fd755; +sub.f64 fd763, fd1258, fd762; +mul.f64 fd1257, fd755, fd556; +mul.f64 fd764, fd619, fd757; +fma.rn.f64 fd765, fd620, fd755, fd764; +mul.f64 fd767, fd765, fd618; +mul.f64 fd768, fd763, fd618; +mul.f64 fd1255, fd619, fd763; +mul.f64 fd1256, fd620, fd765; +sub.f64 fd771, fd1255, fd1256; +mul.f64 fd1254, fd763, fd600; +mul.f64 fd772, fd619, fd765; +fma.rn.f64 fd773, fd620, fd763, fd772; +mul.f64 fd775, fd773, fd434; +mul.f64 fd776, fd771, fd434; +mul.f64 fd778, fd620, fd773; +mul.f64 fd1253, fd619, fd771; +sub.f64 fd779, fd1253, fd778; +mul.f64 fd1252, fd771, fd416; +mul.f64 fd780, fd619, fd773; +fma.rn.f64 fd781, fd620, fd771, fd780; +mul.f64 fd783, fd781, fd478; +mul.f64 fd784, fd779, fd478; +mul.f64 fd1250, fd619, fd779; +mul.f64 fd1251, fd620, fd781; +sub.f64 fd787, fd1250, fd1251; +mul.f64 fd1249, fd779, fd460; +mul.f64 fd788, fd619, fd781; +fma.rn.f64 fd789, fd620, fd779, fd788; +mul.f64 fd791, fd789, fd522; +mul.f64 fd792, fd787, fd522; +mul.f64 fd794, fd620, fd789; +mul.f64 fd1248, fd619, fd787; +sub.f64 
fd795, fd1248, fd794; +mul.f64 fd1247, fd787, fd504; +mul.f64 fd796, fd619, fd789; +fma.rn.f64 fd797, fd620, fd787, fd796; +mul.f64 fd799, fd797, fd566; +mul.f64 fd800, fd795, fd566; +mul.f64 fd802, fd620, fd797; +mul.f64 fd1246, fd619, fd795; +sub.f64 fd803, fd1246, fd802; +mul.f64 fd1245, fd619, fd446; +mul.f64 fd804, fd619, fd797; +mul.f64 fd1244, fd795, fd548; +fma.rn.f64 fd805, fd620, fd795, fd804; +mul.f64 fd806, fd803, fd592; +mul.f64 fd807, fd805, fd610; +mul.f64 fd808, fd803, fd610; +barrier.sync 0; +mad.lo.s32 r9, r7, 400, r8; +add.f64 fd809, fd1338, fd1331; +add.f64 fd810, fd403, fd1336; +st.shared.v2.f64 [r9], {fd810, fd809}; +fma.rn.f64 fd811, fd620, fd446, fd625; +sub.f64 fd812, fd1245, fd624; +st.shared.v2.f64 [r9+16], {fd812, fd811}; +fma.rn.f64 fd813, fd630, fd490, fd633; +sub.f64 fd814, fd1292, fd632; +st.shared.v2.f64 [r9+32], {fd814, fd813}; +fma.rn.f64 fd815, fd638, fd534, fd641; +sub.f64 fd816, fd1290, fd640; +st.shared.v2.f64 [r9+48], {fd816, fd815}; +fma.rn.f64 fd817, fd646, fd578, fd649; +sub.f64 fd818, fd1287, fd648; +st.shared.v2.f64 [r9+64], {fd818, fd817}; +sub.f64 fd819, fd1285, fd656; +fma.rn.f64 fd820, fd654, fd415, fd657; +st.shared.v2.f64 [r9+80], {fd819, fd820}; +fma.rn.f64 fd821, fd662, fd459, fd665; +sub.f64 fd822, fd1283, fd664; +st.shared.v2.f64 [r9+96], {fd822, fd821}; +sub.f64 fd823, fd1280, fd672; +fma.rn.f64 fd824, fd670, fd503, fd673; +st.shared.v2.f64 [r9+112], {fd823, fd824}; +fma.rn.f64 fd825, fd678, fd547, fd681; +sub.f64 fd826, fd1278, fd680; +st.shared.v2.f64 [r9+128], {fd826, fd825}; +fma.rn.f64 fd827, fd686, fd591, fd689; +sub.f64 fd828, fd1276, fd688; +st.shared.v2.f64 [r9+144], {fd828, fd827}; +fma.rn.f64 fd829, fd694, fd423, fd697; +sub.f64 fd830, fd1273, fd696; +st.shared.v2.f64 [r9+160], {fd830, fd829}; +fma.rn.f64 fd831, fd702, fd467, fd705; +sub.f64 fd832, fd1271, fd704; +st.shared.v2.f64 [r9+176], {fd832, fd831}; +fma.rn.f64 fd833, fd710, fd511, fd713; +sub.f64 fd834, fd711, fd712; +st.shared.v2.f64 
[r9+192], {fd834, fd833}; +fma.rn.f64 fd835, fd715, fd555, fd720; +sub.f64 fd836, fd1268, fd719; +st.shared.v2.f64 [r9+208], {fd836, fd835}; +fma.rn.f64 fd837, fd725, fd599, fd728; +sub.f64 fd838, fd1266, fd727; +st.shared.v2.f64 [r9+224], {fd838, fd837}; +fma.rn.f64 fd839, fd733, fd424, fd736; +sub.f64 fd840, fd1264, fd735; +st.shared.v2.f64 [r9+240], {fd840, fd839}; +fma.rn.f64 fd841, fd741, fd468, fd744; +sub.f64 fd842, fd1261, fd743; +st.shared.v2.f64 [r9+256], {fd842, fd841}; +fma.rn.f64 fd843, fd749, fd512, fd752; +sub.f64 fd844, fd1259, fd751; +st.shared.v2.f64 [r9+272], {fd844, fd843}; +fma.rn.f64 fd845, fd757, fd556, fd760; +sub.f64 fd846, fd1257, fd759; +st.shared.v2.f64 [r9+288], {fd846, fd845}; +sub.f64 fd847, fd1254, fd767; +fma.rn.f64 fd848, fd765, fd600, fd768; +st.shared.v2.f64 [r9+304], {fd847, fd848}; +fma.rn.f64 fd849, fd773, fd416, fd776; +sub.f64 fd850, fd1252, fd775; +st.shared.v2.f64 [r9+320], {fd850, fd849}; +fma.rn.f64 fd851, fd781, fd460, fd784; +sub.f64 fd852, fd1249, fd783; +st.shared.v2.f64 [r9+336], {fd852, fd851}; +fma.rn.f64 fd853, fd789, fd504, fd792; +sub.f64 fd854, fd1247, fd791; +st.shared.v2.f64 [r9+352], {fd854, fd853}; +fma.rn.f64 fd855, fd797, fd548, fd800; +sub.f64 fd856, fd1244, fd799; +st.shared.v2.f64 [r9+368], {fd856, fd855}; +fma.rn.f64 fd857, fd805, fd592, fd808; +sub.f64 fd858, fd806, fd807; +st.shared.v2.f64 [r9+384], {fd858, fd857}; +barrier.sync 0; +mad.lo.s32 r10, r7, -384, r9; +ld.shared.v2.f64 {fd859, fd860}, [r10]; +ld.shared.v2.f64 {fd863, fd864}, [r10+80]; +ld.shared.v2.f64 {fd867, fd868}, [r10+160]; +ld.shared.v2.f64 {fd871, fd872}, [r10+240]; +ld.shared.v2.f64 {fd875, fd876}, [r10+320]; +ld.shared.v2.f64 {fd879, fd880}, [r10+400]; +ld.shared.v2.f64 {fd883, fd884}, [r10+480]; +ld.shared.v2.f64 {fd887, fd888}, [r10+560]; +ld.shared.v2.f64 {fd891, fd892}, [r10+640]; +ld.shared.v2.f64 {fd895, fd896}, [r10+720]; +ld.shared.v2.f64 {fd899, fd900}, [r10+800]; +ld.shared.v2.f64 {fd903, fd904}, [r10+880]; 
+ld.shared.v2.f64 {fd907, fd908}, [r10+960]; +ld.shared.v2.f64 {fd911, fd912}, [r10+1040]; +ld.shared.v2.f64 {fd915, fd916}, [r10+1120]; +ld.shared.v2.f64 {fd919, fd920}, [r10+1200]; +ld.shared.v2.f64 {fd923, fd924}, [r10+1280]; +ld.shared.v2.f64 {fd927, fd928}, [r10+1360]; +ld.shared.v2.f64 {fd931, fd932}, [r10+1440]; +ld.shared.v2.f64 {fd935, fd936}, [r10+1520]; +ld.shared.v2.f64 {fd939, fd940}, [r10+1600]; +ld.shared.v2.f64 {fd943, fd944}, [r10+1680]; +ld.shared.v2.f64 {fd947, fd948}, [r10+1760]; +ld.shared.v2.f64 {fd951, fd952}, [r10+1840]; +ld.shared.v2.f64 {fd955, fd956}, [r10+1920]; +add.f64 fd959, fd879, fd939; +add.f64 fd961, fd899, fd919; +mul.f64 fd966, fd961, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1243, fd959, 0d3FD3C6EF372FE950, fd859; +sub.f64 fd967, fd1243, fd966; +add.f64 fd1242, fd880, fd940; +sub.f64 fd968, fd880, fd940; +add.f64 fd1241, fd900, fd920; +sub.f64 fd970, fd900, fd920; +mul.f64 fd971, fd970, 0dBFE2CF2304755A5E; +mul.f64 fd1240, fd968, 0d3FEE6F0E134454FF; +sub.f64 fd972, fd971, fd1240; +add.f64 fd1239, fd859, fd959; +mul.f64 fd973, fd959, 0d3FE9E3779B97F4A8; +sub.f64 fd974, fd859, fd973; +fma.rn.f64 fd975, fd961, 0d3FD3C6EF372FE950, fd974; +mul.f64 fd976, fd968, 0d3FE2CF2304755A5E; +mul.f64 fd977, fd970, 0d3FEE6F0E134454FF; +sub.f64 fd978, fd977, fd976; +mul.f64 fd980, fd1241, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1238, fd1242, 0d3FD3C6EF372FE950, fd860; +sub.f64 fd981, fd1238, fd980; +sub.f64 fd982, fd879, fd939; +sub.f64 fd984, fd899, fd919; +mul.f64 fd985, fd984, 0dBFE2CF2304755A5E; +mul.f64 fd1237, fd982, 0d3FEE6F0E134454FF; +sub.f64 fd986, fd985, fd1237; +add.f64 fd1236, fd860, fd1242; +mul.f64 fd987, fd1242, 0d3FE9E3779B97F4A8; +sub.f64 fd988, fd860, fd987; +fma.rn.f64 fd989, fd1241, 0d3FD3C6EF372FE950, fd988; +mul.f64 fd990, fd982, 0d3FE2CF2304755A5E; +mul.f64 fd991, fd984, 0d3FEE6F0E134454FF; +sub.f64 fd992, fd991, fd990; +add.f64 fd993, fd883, fd943; +add.f64 fd995, fd903, fd923; +fma.rn.f64 fd1234, fd993, 0d3FD3C6EF372FE950, fd863; 
+mul.f64 fd1235, fd995, 0d3FE9E3779B97F4A8; +sub.f64 fd1001, fd1234, fd1235; +add.f64 fd1233, fd884, fd944; +sub.f64 fd1002, fd884, fd944; +add.f64 fd1232, fd904, fd924; +sub.f64 fd1004, fd904, fd924; +mul.f64 fd1230, fd1002, 0d3FEE6F0E134454FF; +mul.f64 fd1231, fd1004, 0dBFE2CF2304755A5E; +sub.f64 fd1006, fd1231, fd1230; +add.f64 fd1229, fd863, fd993; +mul.f64 fd1007, fd993, 0d3FE9E3779B97F4A8; +sub.f64 fd1008, fd863, fd1007; +fma.rn.f64 fd1009, fd995, 0d3FD3C6EF372FE950, fd1008; +mul.f64 fd1010, fd1002, 0d3FE2CF2304755A5E; +mul.f64 fd1011, fd1004, 0d3FEE6F0E134454FF; +sub.f64 fd1012, fd1011, fd1010; +fma.rn.f64 fd1227, fd1233, 0d3FD3C6EF372FE950, fd864; +mul.f64 fd1228, fd1232, 0d3FE9E3779B97F4A8; +sub.f64 fd1015, fd1227, fd1228; +sub.f64 fd1016, fd883, fd943; +sub.f64 fd1018, fd903, fd923; +mul.f64 fd1225, fd1016, 0d3FEE6F0E134454FF; +mul.f64 fd1226, fd1018, 0dBFE2CF2304755A5E; +sub.f64 fd1020, fd1226, fd1225; +add.f64 fd1224, fd864, fd1233; +mul.f64 fd1021, fd1233, 0d3FE9E3779B97F4A8; +sub.f64 fd1022, fd864, fd1021; +fma.rn.f64 fd1023, fd1232, 0d3FD3C6EF372FE950, fd1022; +mul.f64 fd1024, fd1016, 0d3FE2CF2304755A5E; +mul.f64 fd1025, fd1018, 0d3FEE6F0E134454FF; +sub.f64 fd1026, fd1025, fd1024; +add.f64 fd1027, fd887, fd947; +add.f64 fd1029, fd907, fd927; +mul.f64 fd1034, fd1029, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1223, fd1027, 0d3FD3C6EF372FE950, fd867; +sub.f64 fd1035, fd1223, fd1034; +add.f64 fd1222, fd888, fd948; +sub.f64 fd1036, fd888, fd948; +add.f64 fd1221, fd908, fd928; +sub.f64 fd1038, fd908, fd928; +mul.f64 fd1219, fd1036, 0d3FEE6F0E134454FF; +mul.f64 fd1220, fd1038, 0dBFE2CF2304755A5E; +sub.f64 fd1040, fd1220, fd1219; +add.f64 fd1218, fd867, fd1027; +mul.f64 fd1041, fd1027, 0d3FE9E3779B97F4A8; +sub.f64 fd1042, fd867, fd1041; +fma.rn.f64 fd1043, fd1029, 0d3FD3C6EF372FE950, fd1042; +mul.f64 fd1044, fd1036, 0d3FE2CF2304755A5E; +mul.f64 fd1045, fd1038, 0d3FEE6F0E134454FF; +sub.f64 fd1046, fd1045, fd1044; +mul.f64 fd1048, fd1221, 0d3FE9E3779B97F4A8; 
+fma.rn.f64 fd1217, fd1222, 0d3FD3C6EF372FE950, fd868; +sub.f64 fd1049, fd1217, fd1048; +sub.f64 fd1050, fd887, fd947; +sub.f64 fd1052, fd907, fd927; +mul.f64 fd1215, fd1050, 0d3FEE6F0E134454FF; +mul.f64 fd1216, fd1052, 0dBFE2CF2304755A5E; +sub.f64 fd1054, fd1216, fd1215; +add.f64 fd1214, fd868, fd1222; +mul.f64 fd1055, fd1222, 0d3FE9E3779B97F4A8; +sub.f64 fd1056, fd868, fd1055; +fma.rn.f64 fd1057, fd1221, 0d3FD3C6EF372FE950, fd1056; +mul.f64 fd1058, fd1050, 0d3FE2CF2304755A5E; +mul.f64 fd1059, fd1052, 0d3FEE6F0E134454FF; +sub.f64 fd1060, fd1059, fd1058; +add.f64 fd1061, fd891, fd951; +add.f64 fd1063, fd911, fd931; +mul.f64 fd1068, fd1063, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1213, fd1061, 0d3FD3C6EF372FE950, fd871; +sub.f64 fd1069, fd1213, fd1068; +add.f64 fd1212, fd892, fd952; +sub.f64 fd1070, fd892, fd952; +add.f64 fd1211, fd912, fd932; +sub.f64 fd1072, fd912, fd932; +mul.f64 fd1073, fd1072, 0dBFE2CF2304755A5E; +mul.f64 fd1210, fd1070, 0d3FEE6F0E134454FF; +sub.f64 fd1074, fd1073, fd1210; +add.f64 fd1209, fd871, fd1061; +mul.f64 fd1075, fd1061, 0d3FE9E3779B97F4A8; +sub.f64 fd1076, fd871, fd1075; +fma.rn.f64 fd1077, fd1063, 0d3FD3C6EF372FE950, fd1076; +mul.f64 fd1078, fd1070, 0d3FE2CF2304755A5E; +mul.f64 fd1079, fd1072, 0d3FEE6F0E134454FF; +sub.f64 fd1080, fd1079, fd1078; +mul.f64 fd1082, fd1211, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1208, fd1212, 0d3FD3C6EF372FE950, fd872; +sub.f64 fd1083, fd1208, fd1082; +sub.f64 fd1084, fd891, fd951; +sub.f64 fd1086, fd911, fd931; +mul.f64 fd1087, fd1086, 0dBFE2CF2304755A5E; +mul.f64 fd1207, fd1084, 0d3FEE6F0E134454FF; +sub.f64 fd1088, fd1087, fd1207; +add.f64 fd1206, fd872, fd1212; +mul.f64 fd1089, fd1212, 0d3FE9E3779B97F4A8; +sub.f64 fd1090, fd872, fd1089; +fma.rn.f64 fd1091, fd1211, 0d3FD3C6EF372FE950, fd1090; +mul.f64 fd1092, fd1084, 0d3FE2CF2304755A5E; +mul.f64 fd1093, fd1086, 0d3FEE6F0E134454FF; +sub.f64 fd1094, fd1093, fd1092; +add.f64 fd1095, fd895, fd955; +add.f64 fd1097, fd915, fd935; +fma.rn.f64 fd1204, fd1095, 
0d3FD3C6EF372FE950, fd875; +mul.f64 fd1205, fd1097, 0d3FE9E3779B97F4A8; +sub.f64 fd1103, fd1204, fd1205; +add.f64 fd1203, fd896, fd956; +sub.f64 fd1104, fd896, fd956; +add.f64 fd1202, fd916, fd936; +sub.f64 fd1106, fd916, fd936; +mul.f64 fd1200, fd1104, 0d3FEE6F0E134454FF; +mul.f64 fd1201, fd1106, 0dBFE2CF2304755A5E; +sub.f64 fd1108, fd1201, fd1200; +add.f64 fd1199, fd875, fd1095; +mul.f64 fd1109, fd1095, 0d3FE9E3779B97F4A8; +sub.f64 fd1110, fd875, fd1109; +fma.rn.f64 fd1111, fd1097, 0d3FD3C6EF372FE950, fd1110; +mul.f64 fd1112, fd1104, 0d3FE2CF2304755A5E; +mul.f64 fd1113, fd1106, 0d3FEE6F0E134454FF; +sub.f64 fd1114, fd1113, fd1112; +fma.rn.f64 fd1197, fd1203, 0d3FD3C6EF372FE950, fd876; +mul.f64 fd1198, fd1202, 0d3FE9E3779B97F4A8; +sub.f64 fd1117, fd1197, fd1198; +sub.f64 fd1118, fd895, fd955; +sub.f64 fd1120, fd915, fd935; +mul.f64 fd1195, fd1118, 0d3FEE6F0E134454FF; +mul.f64 fd1196, fd1120, 0dBFE2CF2304755A5E; +sub.f64 fd1122, fd1196, fd1195; +add.f64 fd1194, fd876, fd1203; +mul.f64 fd1123, fd1203, 0d3FE9E3779B97F4A8; +sub.f64 fd1124, fd876, fd1123; +fma.rn.f64 fd1125, fd1202, 0d3FD3C6EF372FE950, fd1124; +mul.f64 fd1126, fd1118, 0d3FE2CF2304755A5E; +mul.f64 fd1127, fd1120, 0d3FEE6F0E134454FF; +sub.f64 fd1128, fd1127, fd1126; +add.f64 %1, fd1241, fd1236; +add.f64 %0, fd961, fd1239; +add.f64 %3, fd1232, fd1224; +add.f64 %2, fd995, fd1229; +add.f64 %5, fd1221, fd1214; +add.f64 %4, fd1029, fd1218; +add.f64 %7, fd1211, fd1206; +add.f64 %6, fd1063, fd1209; +add.f64 %9, fd1202, fd1194; +add.f64 %8, fd1097, fd1199; +add.f64 %11, fd986, fd981; +sub.f64 %10, fd967, fd972; +add.f64 %13, fd1020, fd1015; +sub.f64 %12, fd1001, fd1006; +sub.f64 %14, fd1035, fd1040; +add.f64 %15, fd1054, fd1049; +sub.f64 %16, fd1069, fd1074; +add.f64 %17, fd1088, fd1083; +sub.f64 %18, fd1103, fd1108; +add.f64 %19, fd1122, fd1117; +sub.f64 %20, fd975, fd978; +add.f64 %21, fd992, fd989; +add.f64 %23, fd1026, fd1023; +sub.f64 %22, fd1009, fd1012; +add.f64 %25, fd1060, fd1057; +sub.f64 %24, fd1043, 
fd1046; +add.f64 %27, fd1094, fd1091; +sub.f64 %26, fd1077, fd1080; +sub.f64 %28, fd1111, fd1114; +add.f64 %29, fd1128, fd1125; +sub.f64 %31, fd989, fd992; +add.f64 %30, fd978, fd975; +sub.f64 %33, fd1023, fd1026; +add.f64 %32, fd1012, fd1009; +sub.f64 %35, fd1057, fd1060; +add.f64 %34, fd1046, fd1043; +sub.f64 %37, fd1091, fd1094; +add.f64 %36, fd1080, fd1077; +sub.f64 %39, fd1125, fd1128; +add.f64 %38, fd1114, fd1111; +sub.f64 %41, fd981, fd986; +add.f64 %40, fd972, fd967; +sub.f64 %43, fd1015, fd1020; +add.f64 %42, fd1006, fd1001; +sub.f64 %45, fd1049, fd1054; +add.f64 %44, fd1040, fd1035; +sub.f64 %47, fd1083, fd1088; +add.f64 %46, fd1074, fd1069; +sub.f64 %49, fd1117, fd1122; +add.f64 %48, fd1108, fd1103; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_125), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), 
"d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[5].y), "d"(rmem[10].y), "d"(rmem[20].y), "d"(rmem[16].y), "d"(rmem[1].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[22].y), "d"(rmem[7].y), "d"(rmem[17].y), "d"(rmem[23].y), "d"(rmem[8].y), "d"(rmem[13].y), "d"(rmem[19].y), "d"(rmem[4].y), "d"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<537, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<249>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 1000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %15, %23; +add.f64 fd22, %13, fd21; +add.f64 fd23, %18, %21; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %17, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %20, %22; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %13; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %17, %24; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %20, %22; +mul.f64 fd35, fd34, 0dBFE2CF2304755A5E; +sub.f64 fd36, fd35, fd33; +sub.f64 fd37, fd31, fd36; +add.f64 fd38, fd36, fd31; +mul.f64 fd39, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd40, %13, fd39; +fma.rn.f64 fd41, fd23, 0d3FD3C6EF372FE950, fd40; +mul.f64 fd42, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd43, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd44, fd43, fd42; +sub.f64 fd45, fd41, fd44; +add.f64 fd46, fd44, fd41; +fma.rn.f64 fd47, fd25, 0d3FD3C6EF372FE950, %14; +mul.f64 fd48, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd49, fd47, fd48; +sub.f64 
fd50, %15, %23; +mul.f64 fd51, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd52, %18, %21; +mul.f64 fd53, fd52, 0dBFE2CF2304755A5E; +sub.f64 fd54, fd53, fd51; +add.f64 fd55, fd54, fd49; +sub.f64 fd56, fd49, fd54; +mul.f64 fd57, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd58, %14, fd57; +fma.rn.f64 fd59, fd27, 0d3FD3C6EF372FE950, fd58; +mul.f64 fd60, fd50, 0d3FE2CF2304755A5E; +mul.f64 fd61, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd62, fd61, fd60; +add.f64 fd63, fd62, fd59; +sub.f64 fd64, fd59, fd62; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd65, fd66}, [rd6]; +mul.f64 fd69, fd65, fd37; +mul.f64 fd70, fd66, fd55; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd65, fd55; +fma.rn.f64 fd73, fd66, fd37, fd72; +mul.f64 fd74, fd65, fd65; +mul.f64 fd75, fd66, fd66; +sub.f64 fd76, fd74, fd75; +mul.f64 fd77, fd66, fd65; +fma.rn.f64 fd78, fd66, fd65, fd77; +mul.f64 fd79, fd76, fd45; +mul.f64 fd80, fd78, fd63; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd76, fd63; +fma.rn.f64 fd83, fd78, fd45, fd82; +ld.global.v2.f64 {fd84, fd85}, [rd6+400]; +mul.f64 fd88, fd84, fd46; +mul.f64 fd89, fd85, fd64; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd84, fd64; +fma.rn.f64 fd92, fd85, fd46, fd91; +mul.f64 fd93, fd65, fd84; +mul.f64 fd94, fd66, fd85; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd65, fd85; +fma.rn.f64 fd97, fd66, fd84, fd96; +mul.f64 fd98, fd95, fd38; +mul.f64 fd99, fd97, fd56; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd95, fd56; +fma.rn.f64 fd102, fd97, fd38, fd101; +mad.lo.s32 r8, r5, 1000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd103, [r11]; +ld.shared.f64 fd104, [r11+200]; +ld.shared.f64 fd105, 
[r11+400]; +ld.shared.f64 fd106, [r11+600]; +ld.shared.f64 fd107, [r11+800]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd73; +st.shared.f64 [r9+16], fd83; +st.shared.f64 [r9+24], fd92; +st.shared.f64 [r9+32], fd102; +barrier.sync 0; +ld.shared.f64 fd108, [r11]; +ld.shared.f64 fd109, [r11+200]; +ld.shared.f64 fd110, [r11+400]; +ld.shared.f64 fd111, [r11+600]; +ld.shared.f64 fd112, [r11+800]; +add.f64 fd113, fd104, fd107; +add.f64 fd114, fd103, fd113; +add.f64 fd115, fd105, fd106; +add.f64 fd116, fd115, fd114; +add.f64 fd117, fd109, fd112; +add.f64 fd118, fd108, fd117; +add.f64 fd119, fd110, fd111; +add.f64 fd120, fd119, fd118; +fma.rn.f64 fd121, fd113, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd122, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd109, fd112; +mul.f64 fd125, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd110, fd111; +mul.f64 fd127, fd126, 0dBFE2CF2304755A5E; +sub.f64 fd128, fd127, fd125; +sub.f64 fd129, fd123, fd128; +add.f64 fd130, fd128, fd123; +mul.f64 fd131, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd132, fd103, fd131; +fma.rn.f64 fd133, fd115, 0d3FD3C6EF372FE950, fd132; +mul.f64 fd134, fd124, 0d3FE2CF2304755A5E; +mul.f64 fd135, fd126, 0d3FEE6F0E134454FF; +sub.f64 fd136, fd135, fd134; +sub.f64 fd137, fd133, fd136; +add.f64 fd138, fd136, fd133; +fma.rn.f64 fd139, fd117, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd140, fd119, 0d3FE9E3779B97F4A8; +sub.f64 fd141, fd139, fd140; +sub.f64 fd142, fd104, fd107; +mul.f64 fd143, fd142, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd105, fd106; +mul.f64 fd145, fd144, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd145, fd143; +add.f64 fd147, fd146, fd141; +sub.f64 fd148, fd141, fd146; +mul.f64 fd149, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd150, fd108, fd149; +fma.rn.f64 fd151, fd119, 0d3FD3C6EF372FE950, fd150; +mul.f64 fd152, fd142, 0d3FE2CF2304755A5E; +mul.f64 fd153, fd144, 0d3FEE6F0E134454FF; +sub.f64 fd154, fd153, fd152; +add.f64 fd155, fd154, fd151; +sub.f64 fd156, fd151, fd154; 
+mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd157, fd158}, [rd11]; +mul.f64 fd161, fd157, fd129; +mul.f64 fd162, fd158, fd147; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd157, fd147; +fma.rn.f64 fd165, fd158, fd129, fd164; +mul.f64 fd166, fd157, fd157; +mul.f64 fd167, fd158, fd158; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd158, fd157; +fma.rn.f64 fd170, fd158, fd157, fd169; +mul.f64 fd171, fd168, fd137; +mul.f64 fd172, fd170, fd155; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd168, fd155; +fma.rn.f64 fd175, fd170, fd137, fd174; +ld.global.v2.f64 {fd176, fd177}, [rd11+80]; +mul.f64 fd180, fd176, fd138; +mul.f64 fd181, fd177, fd156; +sub.f64 fd182, fd180, fd181; +mul.f64 fd183, fd176, fd156; +fma.rn.f64 fd184, fd177, fd138, fd183; +mul.f64 fd185, fd157, fd176; +mul.f64 fd186, fd158, fd177; +sub.f64 fd187, fd185, fd186; +mul.f64 fd188, fd157, fd177; +fma.rn.f64 fd189, fd158, fd176, fd188; +mul.f64 fd190, fd187, fd130; +mul.f64 fd191, fd189, fd148; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd187, fd148; +fma.rn.f64 fd194, fd189, fd130, fd193; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd116; +st.shared.f64 [r17+40], fd163; +st.shared.f64 [r17+80], fd173; +st.shared.f64 [r17+120], fd182; +st.shared.f64 [r17+160], fd192; +barrier.sync 0; +ld.shared.f64 fd195, [r11]; +ld.shared.f64 fd196, [r11+200]; +ld.shared.f64 fd197, [r11+400]; +ld.shared.f64 fd198, [r11+600]; +ld.shared.f64 fd199, [r11+800]; +barrier.sync 0; +st.shared.f64 [r17], fd120; +st.shared.f64 [r17+40], fd165; +st.shared.f64 [r17+80], fd175; +st.shared.f64 [r17+120], fd184; +st.shared.f64 [r17+160], fd194; +barrier.sync 0; +ld.shared.f64 fd200, [r11]; +ld.shared.f64 fd201, [r11+200]; +ld.shared.f64 fd202, [r11+400]; +ld.shared.f64 fd203, 
[r11+600]; +ld.shared.f64 fd204, [r11+800]; +add.f64 fd205, fd196, fd199; +add.f64 fd206, fd195, fd205; +add.f64 fd207, fd197, fd198; +add.f64 fd208, fd201, fd204; +add.f64 fd209, fd200, fd208; +add.f64 fd210, fd202, fd203; +fma.rn.f64 fd211, fd205, 0d3FD3C6EF372FE950, fd195; +mul.f64 fd212, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd213, fd211, fd212; +sub.f64 fd214, fd201, fd204; +mul.f64 fd215, fd214, 0d3FEE6F0E134454FF; +sub.f64 fd216, fd202, fd203; +mul.f64 fd217, fd216, 0dBFE2CF2304755A5E; +sub.f64 fd218, fd217, fd215; +mul.f64 fd219, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd220, fd195, fd219; +fma.rn.f64 fd221, fd207, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd214, 0d3FE2CF2304755A5E; +mul.f64 fd223, fd216, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd223, fd222; +fma.rn.f64 fd225, fd208, 0d3FD3C6EF372FE950, fd200; +mul.f64 fd226, fd210, 0d3FE9E3779B97F4A8; +sub.f64 fd227, fd225, fd226; +sub.f64 fd228, fd196, fd199; +mul.f64 fd229, fd228, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd197, fd198; +mul.f64 fd231, fd230, 0dBFE2CF2304755A5E; +sub.f64 fd232, fd231, fd229; +mul.f64 fd233, fd208, 0d3FE9E3779B97F4A8; +sub.f64 fd234, fd200, fd233; +fma.rn.f64 fd235, fd210, 0d3FD3C6EF372FE950, fd234; +mul.f64 fd236, fd228, 0d3FE2CF2304755A5E; +mul.f64 fd237, fd230, 0d3FEE6F0E134454FF; +sub.f64 fd238, fd237, fd236; +add.f64 %0, fd207, fd206; +add.f64 %1, fd210, fd209; +add.f64 %3, fd232, fd227; +sub.f64 %2, fd213, fd218; +sub.f64 %4, fd221, fd224; +add.f64 %5, fd238, fd235; +add.f64 %6, fd224, fd221; +sub.f64 %7, fd235, fd238; +sub.f64 %9, fd227, fd232; +add.f64 %8, fd218, fd213; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), 
"d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<536, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<269>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 2000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %15, %23; +add.f64 fd22, %13, fd21; +add.f64 fd23, %18, %21; +add.f64 fd24, %17, %24; +add.f64 fd25, %14, fd24; +add.f64 fd26, %20, %22; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %13; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %17, %24; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %20, %22; +mul.f64 fd33, fd32, 0dBFE2CF2304755A5E; +sub.f64 fd34, fd33, fd31; +sub.f64 fd35, fd29, fd34; +add.f64 fd36, fd34, fd29; +mul.f64 fd37, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd38, %13, fd37; +fma.rn.f64 fd39, fd23, 0d3FD3C6EF372FE950, fd38; +mul.f64 fd40, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd41, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd42, fd41, fd40; +sub.f64 fd43, fd39, fd42; +add.f64 fd44, fd42, fd39; +fma.rn.f64 fd45, fd24, 0d3FD3C6EF372FE950, %14; +mul.f64 fd46, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd47, fd45, fd46; +sub.f64 fd48, %15, %23; +mul.f64 fd49, fd48, 0d3FEE6F0E134454FF; +sub.f64 fd50, %18, %21; +mul.f64 fd51, fd50, 0dBFE2CF2304755A5E; +sub.f64 fd52, fd51, fd49; +add.f64 fd53, fd52, fd47; +sub.f64 fd54, fd47, fd52; +mul.f64 fd55, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %14, fd55; +fma.rn.f64 fd57, fd26, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd48, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd59, fd58; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd63, fd35; 
+mul.f64 fd68, fd64, fd53; +mul.f64 fd69, fd63, fd53; +mul.f64 fd70, fd63, fd63; +mul.f64 fd71, fd64, fd64; +sub.f64 fd72, fd70, fd71; +mul.f64 fd73, fd64, fd63; +fma.rn.f64 fd74, fd64, fd63, fd73; +mul.f64 fd75, fd72, fd43; +mul.f64 fd76, fd74, fd61; +mul.f64 fd77, fd72, fd61; +ld.global.v2.f64 {fd78, fd79}, [rd6+400]; +mul.f64 fd82, fd78, fd44; +mul.f64 fd83, fd79, fd62; +mul.f64 fd84, fd78, fd62; +mul.f64 fd85, fd63, fd78; +mul.f64 fd86, fd64, fd79; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd63, fd79; +fma.rn.f64 fd89, fd64, fd78, fd88; +mul.f64 fd90, fd87, fd36; +mul.f64 fd91, fd89, fd54; +mul.f64 fd92, fd87, fd54; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd93, fd26, fd25; +add.f64 fd94, fd23, fd22; +st.shared.v2.f64 [r9], {fd94, fd93}; +fma.rn.f64 fd95, fd64, fd35, fd69; +sub.f64 fd96, fd67, fd68; +st.shared.v2.f64 [r9+16], {fd96, fd95}; +fma.rn.f64 fd97, fd74, fd43, fd77; +sub.f64 fd98, fd75, fd76; +st.shared.v2.f64 [r9+32], {fd98, fd97}; +fma.rn.f64 fd99, fd79, fd44, fd84; +sub.f64 fd100, fd82, fd83; +st.shared.v2.f64 [r9+48], {fd100, fd99}; +fma.rn.f64 fd101, fd89, fd36, fd92; +sub.f64 fd102, fd90, fd91; +st.shared.v2.f64 [r9+64], {fd102, fd101}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd103, fd104}, [r11]; +ld.shared.v2.f64 {fd107, fd108}, [r11+400]; +ld.shared.v2.f64 {fd111, fd112}, [r11+800]; +ld.shared.v2.f64 {fd115, fd116}, [r11+1200]; +ld.shared.v2.f64 {fd119, fd120}, [r11+1600]; +add.f64 fd123, fd107, fd119; +add.f64 fd124, fd103, fd123; +add.f64 fd125, fd111, fd115; +add.f64 fd126, fd108, fd120; +add.f64 fd127, fd104, fd126; +add.f64 fd128, fd112, fd116; +fma.rn.f64 fd129, fd123, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd130, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, fd108, fd120; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, fd112, fd116; +mul.f64 fd135, fd134, 0dBFE2CF2304755A5E; +sub.f64 fd136, fd135, fd133; +sub.f64 fd137, fd131, fd136; +add.f64 fd138, 
fd136, fd131; +mul.f64 fd139, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd140, fd103, fd139; +fma.rn.f64 fd141, fd125, 0d3FD3C6EF372FE950, fd140; +mul.f64 fd142, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd143, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd143, fd142; +sub.f64 fd145, fd141, fd144; +add.f64 fd146, fd144, fd141; +fma.rn.f64 fd147, fd126, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd148, fd128, 0d3FE9E3779B97F4A8; +sub.f64 fd149, fd147, fd148; +sub.f64 fd150, fd107, fd119; +mul.f64 fd151, fd150, 0d3FEE6F0E134454FF; +sub.f64 fd152, fd111, fd115; +mul.f64 fd153, fd152, 0dBFE2CF2304755A5E; +sub.f64 fd154, fd153, fd151; +add.f64 fd155, fd154, fd149; +sub.f64 fd156, fd149, fd154; +mul.f64 fd157, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd158, fd104, fd157; +fma.rn.f64 fd159, fd128, 0d3FD3C6EF372FE950, fd158; +mul.f64 fd160, fd150, 0d3FE2CF2304755A5E; +mul.f64 fd161, fd152, 0d3FEE6F0E134454FF; +sub.f64 fd162, fd161, fd160; +add.f64 fd163, fd162, fd159; +sub.f64 fd164, fd159, fd162; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd165, fd166}, [rd11]; +mul.f64 fd169, fd165, fd137; +mul.f64 fd170, fd166, fd155; +mul.f64 fd171, fd165, fd155; +mul.f64 fd172, fd165, fd165; +mul.f64 fd173, fd166, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd166, fd165; +fma.rn.f64 fd176, fd166, fd165, fd175; +mul.f64 fd177, fd174, fd145; +mul.f64 fd178, fd176, fd163; +mul.f64 fd179, fd174, fd163; +ld.global.v2.f64 {fd180, fd181}, [rd11+80]; +mul.f64 fd184, fd180, fd146; +mul.f64 fd185, fd181, fd164; +mul.f64 fd186, fd180, fd164; +mul.f64 fd187, fd165, fd180; +mul.f64 fd188, fd166, fd181; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd165, fd181; +fma.rn.f64 fd191, fd166, fd180, fd190; +mul.f64 fd192, fd189, fd138; +mul.f64 fd193, fd191, fd156; +mul.f64 fd194, fd189, fd156; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; 
+barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd195, fd128, fd127; +add.f64 fd196, fd125, fd124; +st.shared.v2.f64 [r17], {fd196, fd195}; +fma.rn.f64 fd197, fd166, fd137, fd171; +sub.f64 fd198, fd169, fd170; +st.shared.v2.f64 [r17+80], {fd198, fd197}; +fma.rn.f64 fd199, fd176, fd145, fd179; +sub.f64 fd200, fd177, fd178; +st.shared.v2.f64 [r17+160], {fd200, fd199}; +fma.rn.f64 fd201, fd181, fd146, fd186; +sub.f64 fd202, fd184, fd185; +st.shared.v2.f64 [r17+240], {fd202, fd201}; +fma.rn.f64 fd203, fd191, fd138, fd194; +sub.f64 fd204, fd192, fd193; +st.shared.v2.f64 [r17+320], {fd204, fd203}; +barrier.sync 0; +ld.shared.v2.f64 {fd205, fd206}, [r11]; +ld.shared.v2.f64 {fd209, fd210}, [r11+400]; +ld.shared.v2.f64 {fd213, fd214}, [r11+800]; +ld.shared.v2.f64 {fd217, fd218}, [r11+1200]; +ld.shared.v2.f64 {fd221, fd222}, [r11+1600]; +add.f64 fd225, fd209, fd221; +add.f64 fd226, fd205, fd225; +add.f64 fd227, fd213, fd217; +add.f64 fd228, fd210, fd222; +add.f64 fd229, fd206, fd228; +add.f64 fd230, fd214, fd218; +fma.rn.f64 fd231, fd225, 0d3FD3C6EF372FE950, fd205; +mul.f64 fd232, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd233, fd231, fd232; +sub.f64 fd234, fd210, fd222; +mul.f64 fd235, fd234, 0d3FEE6F0E134454FF; +sub.f64 fd236, fd214, fd218; +mul.f64 fd237, fd236, 0dBFE2CF2304755A5E; +sub.f64 fd238, fd237, fd235; +mul.f64 fd239, fd225, 0d3FE9E3779B97F4A8; +sub.f64 fd240, fd205, fd239; +fma.rn.f64 fd241, fd227, 0d3FD3C6EF372FE950, fd240; +mul.f64 fd242, fd234, 0d3FE2CF2304755A5E; +mul.f64 fd243, fd236, 0d3FEE6F0E134454FF; +sub.f64 fd244, fd243, fd242; +fma.rn.f64 fd245, fd228, 0d3FD3C6EF372FE950, fd206; +mul.f64 fd246, fd230, 0d3FE9E3779B97F4A8; +sub.f64 fd247, fd245, fd246; +sub.f64 fd248, fd209, fd221; +mul.f64 fd249, fd248, 0d3FEE6F0E134454FF; +sub.f64 fd250, fd213, fd217; +mul.f64 fd251, fd250, 0dBFE2CF2304755A5E; +sub.f64 fd252, fd251, fd249; +mul.f64 fd253, fd228, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd206, fd253; +fma.rn.f64 fd255, fd230, 0d3FD3C6EF372FE950, 
fd254; +mul.f64 fd256, fd248, 0d3FE2CF2304755A5E; +mul.f64 fd257, fd250, 0d3FEE6F0E134454FF; +sub.f64 fd258, fd257, fd256; +add.f64 %1, fd230, fd229; +add.f64 %0, fd227, fd226; +add.f64 %3, fd252, fd247; +sub.f64 %2, fd233, fd238; +add.f64 %5, fd258, fd255; +sub.f64 %4, fd241, fd244; +sub.f64 %7, fd255, fd258; +add.f64 %6, fd244, fd241; +sub.f64 %9, fd247, fd252; +add.f64 %8, fd238, fd233; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<538, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<1129>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 1000, r2; +add.f64 fd101, %65, %105; +add.f64 fd102, %52, fd101; +add.f64 fd103, %78, %92; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %67, %107; +add.f64 fd106, %53, fd105; +add.f64 fd107, %80, %93; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %52; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %67, %107; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %80, %93; +mul.f64 fd115, fd114, 0dBFE2CF2304755A5E; +sub.f64 fd116, fd115, fd113; +sub.f64 fd117, fd111, fd116; +add.f64 fd118, fd116, fd111; +mul.f64 fd119, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd120, %52, fd119; +fma.rn.f64 fd121, fd103, 0d3FD3C6EF372FE950, fd120; +mul.f64 fd122, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd123, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd123, fd122; +sub.f64 fd125, fd121, fd124; +add.f64 fd126, fd124, fd121; +fma.rn.f64 fd127, 
fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd128, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, %65, %105; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, %78, %92; +mul.f64 fd133, fd132, 0dBFE2CF2304755A5E; +sub.f64 fd134, fd133, fd131; +add.f64 fd135, fd134, fd129; +sub.f64 fd136, fd129, fd134; +mul.f64 fd137, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %53, fd137; +fma.rn.f64 fd139, fd107, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %68, %108; +add.f64 fd146, %54, fd145; +add.f64 fd147, %81, %94; +add.f64 fd148, fd147, fd146; +add.f64 fd149, %69, %109; +add.f64 fd150, %56, fd149; +add.f64 fd151, %83, %96; +add.f64 fd152, fd151, fd150; +fma.rn.f64 fd153, fd145, 0d3FD3C6EF372FE950, %54; +mul.f64 fd154, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd153, fd154; +sub.f64 fd156, %69, %109; +mul.f64 fd157, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd158, %83, %96; +mul.f64 fd159, fd158, 0dBFE2CF2304755A5E; +sub.f64 fd160, fd159, fd157; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +mul.f64 fd163, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd164, %54, fd163; +fma.rn.f64 fd165, fd147, 0d3FD3C6EF372FE950, fd164; +mul.f64 fd166, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd167, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd168, fd167, fd166; +sub.f64 fd169, fd165, fd168; +add.f64 fd170, fd168, fd165; +fma.rn.f64 fd171, fd149, 0d3FD3C6EF372FE950, %56; +mul.f64 fd172, fd151, 0d3FE9E3779B97F4A8; +sub.f64 fd173, fd171, fd172; +sub.f64 fd174, %68, %108; +mul.f64 fd175, fd174, 0d3FEE6F0E134454FF; +sub.f64 fd176, %81, %94; +mul.f64 fd177, fd176, 0dBFE2CF2304755A5E; +sub.f64 fd178, fd177, fd175; +add.f64 fd179, fd178, fd173; +sub.f64 fd180, fd173, fd178; +mul.f64 fd181, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd182, %56, fd181; +fma.rn.f64 fd183, fd151, 0d3FD3C6EF372FE950, fd182; +mul.f64 
fd184, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd185, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd186, fd185, fd184; +add.f64 fd187, fd186, fd183; +sub.f64 fd188, fd183, fd186; +add.f64 fd189, %70, %110; +add.f64 fd190, %57, fd189; +add.f64 fd191, %84, %97; +add.f64 fd192, fd191, fd190; +add.f64 fd193, %72, %112; +add.f64 fd194, %59, fd193; +add.f64 fd195, %85, %99; +add.f64 fd196, fd195, fd194; +fma.rn.f64 fd197, fd189, 0d3FD3C6EF372FE950, %57; +mul.f64 fd198, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd199, fd197, fd198; +sub.f64 fd200, %72, %112; +mul.f64 fd201, fd200, 0d3FEE6F0E134454FF; +sub.f64 fd202, %85, %99; +mul.f64 fd203, fd202, 0dBFE2CF2304755A5E; +sub.f64 fd204, fd203, fd201; +sub.f64 fd205, fd199, fd204; +add.f64 fd206, fd204, fd199; +mul.f64 fd207, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd208, %57, fd207; +fma.rn.f64 fd209, fd191, 0d3FD3C6EF372FE950, fd208; +mul.f64 fd210, fd200, 0d3FE2CF2304755A5E; +mul.f64 fd211, fd202, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd211, fd210; +sub.f64 fd213, fd209, fd212; +add.f64 fd214, fd212, fd209; +fma.rn.f64 fd215, fd193, 0d3FD3C6EF372FE950, %59; +mul.f64 fd216, fd195, 0d3FE9E3779B97F4A8; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, %70, %110; +mul.f64 fd219, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd220, %84, %97; +mul.f64 fd221, fd220, 0dBFE2CF2304755A5E; +sub.f64 fd222, fd221, fd219; +add.f64 fd223, fd222, fd217; +sub.f64 fd224, fd217, fd222; +mul.f64 fd225, fd193, 0d3FE9E3779B97F4A8; +sub.f64 fd226, %59, fd225; +fma.rn.f64 fd227, fd195, 0d3FD3C6EF372FE950, fd226; +mul.f64 fd228, fd218, 0d3FE2CF2304755A5E; +mul.f64 fd229, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd229, fd228; +add.f64 fd231, fd230, fd227; +sub.f64 fd232, fd227, fd230; +add.f64 fd233, %73, %113; +add.f64 fd234, %60, fd233; +add.f64 fd235, %86, %100; +add.f64 fd236, fd235, fd234; +add.f64 fd237, %75, %115; +add.f64 fd238, %61, fd237; +add.f64 fd239, %88, %101; +add.f64 fd240, fd239, fd238; +fma.rn.f64 fd241, fd233, 0d3FD3C6EF372FE950, %60; +mul.f64 fd242, fd235, 
0d3FE9E3779B97F4A8; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, %75, %115; +mul.f64 fd245, fd244, 0d3FEE6F0E134454FF; +sub.f64 fd246, %88, %101; +mul.f64 fd247, fd246, 0dBFE2CF2304755A5E; +sub.f64 fd248, fd247, fd245; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +mul.f64 fd251, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd252, %60, fd251; +fma.rn.f64 fd253, fd235, 0d3FD3C6EF372FE950, fd252; +mul.f64 fd254, fd244, 0d3FE2CF2304755A5E; +mul.f64 fd255, fd246, 0d3FEE6F0E134454FF; +sub.f64 fd256, fd255, fd254; +sub.f64 fd257, fd253, fd256; +add.f64 fd258, fd256, fd253; +fma.rn.f64 fd259, fd237, 0d3FD3C6EF372FE950, %61; +mul.f64 fd260, fd239, 0d3FE9E3779B97F4A8; +sub.f64 fd261, fd259, fd260; +sub.f64 fd262, %73, %113; +mul.f64 fd263, fd262, 0d3FEE6F0E134454FF; +sub.f64 fd264, %86, %100; +mul.f64 fd265, fd264, 0dBFE2CF2304755A5E; +sub.f64 fd266, fd265, fd263; +add.f64 fd267, fd266, fd261; +sub.f64 fd268, fd261, fd266; +mul.f64 fd269, fd237, 0d3FE9E3779B97F4A8; +sub.f64 fd270, %61, fd269; +fma.rn.f64 fd271, fd239, 0d3FD3C6EF372FE950, fd270; +mul.f64 fd272, fd262, 0d3FE2CF2304755A5E; +mul.f64 fd273, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd274, fd273, fd272; +add.f64 fd275, fd274, fd271; +sub.f64 fd276, fd271, fd274; +add.f64 fd277, %76, %116; +add.f64 fd278, %62, fd277; +add.f64 fd279, %89, %102; +add.f64 fd280, fd279, fd278; +add.f64 fd281, %77, %117; +add.f64 fd282, %64, fd281; +add.f64 fd283, %91, %104; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, %62; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, %77, %117; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, %91, %104; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, %62, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, 
fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, %64; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, %76, %116; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, %89, %102; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, %64, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +mov.u32 r4, %tid.x; +mul.f64 fd321, fd161, 0d3FEEFEA21D101EE0; +mul.f64 fd322, fd179, 0dBFCFD511FA1C0796; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd179, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd325, fd161, 0dBFCFD511FA1C0796, fd324; +mul.f64 fd326, fd205, 0d3FEC0AB44E81C059; +mul.f64 fd327, fd223, 0dBFDED50D5CBFA951; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd223, 0d3FEC0AB44E81C059; +fma.rn.f64 fd330, fd205, 0dBFDED50D5CBFA951, fd329; +mul.f64 fd331, fd249, 0d3FE753B603D2B816; +mul.f64 fd332, fd267, 0dBFE5E7CF55112014; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd267, 0d3FE753B603D2B816; +fma.rn.f64 fd335, fd249, 0dBFE5E7CF55112014, fd334; +mul.f64 fd336, fd293, 0d3FE1257E3C182B51; +mul.f64 fd337, fd311, 0dBFEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd311, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd293, 0dBFEB04BBFF642E86, fd339; +mul.f64 fd341, fd169, 0d3FEC0AB44E81C059; +mul.f64 fd342, fd187, 0dBFDED50D5CBFA951; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd187, 0d3FEC0AB44E81C059; +fma.rn.f64 fd345, fd169, 0dBFDED50D5CBFA951, fd344; +mul.f64 fd346, fd213, 0d3FE1257E3C182B51; +mul.f64 fd347, fd231, 0dBFEB04BBFF642E86; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd231, 
0d3FE1257E3C182B51; +fma.rn.f64 fd350, fd213, 0dBFEB04BBFF642E86, fd349; +mul.f64 fd351, fd257, 0d3FB0130A1BE09379; +mul.f64 fd352, fd275, 0dBFEFEFD5BFE443FE; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd275, 0d3FB0130A1BE09379; +fma.rn.f64 fd355, fd257, 0dBFEFEFD5BFE443FE, fd354; +mul.f64 fd356, fd301, 0dBFDB3FF7C925819C; +mul.f64 fd357, fd319, 0dBFECF457DCDC158C; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd319, 0dBFDB3FF7C925819C; +fma.rn.f64 fd360, fd301, 0dBFECF457DCDC158C, fd359; +mul.f64 fd361, fd170, 0d3FE753B603D2B816; +mul.f64 fd362, fd188, 0dBFE5E7CF55112014; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd188, 0d3FE753B603D2B816; +fma.rn.f64 fd365, fd170, 0dBFE5E7CF55112014, fd364; +mul.f64 fd366, fd214, 0d3FB0130A1BE09379; +mul.f64 fd367, fd232, 0dBFEFEFD5BFE443FE; +sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd232, 0d3FB0130A1BE09379; +fma.rn.f64 fd370, fd214, 0dBFEFEFD5BFE443FE, fd369; +mul.f64 fd371, fd258, 0dBFE465C6FEB501BC; +mul.f64 fd372, fd276, 0dBFE8A80B635B6BEA; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd276, 0dBFE465C6FEB501BC; +fma.rn.f64 fd375, fd258, 0dBFE8A80B635B6BEA, fd374; +mul.f64 fd376, fd302, 0dBFEFBF675480D903; +mul.f64 fd377, fd320, 0dBFC00AEB5DA15BE0; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd320, 0dBFEFBF675480D903; +fma.rn.f64 fd380, fd302, 0dBFC00AEB5DA15BE0, fd379; +mul.f64 fd381, fd162, 0d3FE1257E3C182B51; +mul.f64 fd382, fd180, 0dBFEB04BBFF642E86; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd180, 0d3FE1257E3C182B51; +fma.rn.f64 fd385, fd162, 0dBFEB04BBFF642E86, fd384; +mul.f64 fd386, fd206, 0dBFDB3FF7C925819C; +mul.f64 fd387, fd224, 0dBFECF457DCDC158C; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd224, 0dBFDB3FF7C925819C; +fma.rn.f64 fd390, fd206, 0dBFECF457DCDC158C, fd389; +mul.f64 fd391, fd250, 0dBFEFBF675480D903; +mul.f64 fd392, fd268, 0dBFC00AEB5DA15BE0; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd268, 0dBFEFBF675480D903; +fma.rn.f64 fd395, fd250, 0dBFC00AEB5DA15BE0, fd394; +mul.f64 fd396, 
fd294, 0dBFE465C6FEB501BC; +mul.f64 fd397, fd312, 0d3FE8A80B635B6BEA; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd312, 0dBFE465C6FEB501BC; +fma.rn.f64 fd400, fd294, 0d3FE8A80B635B6BEA, fd399; +add.f64 fd401, fd148, fd280; +add.f64 fd402, fd104, fd401; +add.f64 fd403, fd192, fd236; +add.f64 fd404, fd403, fd402; +add.f64 fd405, fd152, fd284; +add.f64 fd406, fd108, fd405; +add.f64 fd407, fd196, fd240; +add.f64 fd408, fd407, fd406; +fma.rn.f64 fd409, fd401, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd410, fd403, 0d3FE9E3779B97F4A8; +sub.f64 fd411, fd409, fd410; +sub.f64 fd412, fd152, fd284; +mul.f64 fd413, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd196, fd240; +mul.f64 fd415, fd414, 0dBFE2CF2304755A5E; +sub.f64 fd416, fd415, fd413; +sub.f64 fd417, fd411, fd416; +add.f64 fd418, fd416, fd411; +mul.f64 fd419, fd401, 0d3FE9E3779B97F4A8; +sub.f64 fd420, fd104, fd419; +fma.rn.f64 fd421, fd403, 0d3FD3C6EF372FE950, fd420; +mul.f64 fd422, fd412, 0d3FE2CF2304755A5E; +mul.f64 fd423, fd414, 0d3FEE6F0E134454FF; +sub.f64 fd424, fd423, fd422; +sub.f64 fd425, fd421, fd424; +add.f64 fd426, fd424, fd421; +fma.rn.f64 fd427, fd405, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd428, fd407, 0d3FE9E3779B97F4A8; +sub.f64 fd429, fd427, fd428; +sub.f64 fd430, fd148, fd280; +mul.f64 fd431, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd432, fd192, fd236; +mul.f64 fd433, fd432, 0dBFE2CF2304755A5E; +sub.f64 fd434, fd433, fd431; +add.f64 fd435, fd434, fd429; +sub.f64 fd436, fd429, fd434; +mul.f64 fd437, fd405, 0d3FE9E3779B97F4A8; +sub.f64 fd438, fd108, fd437; +fma.rn.f64 fd439, fd407, 0d3FD3C6EF372FE950, fd438; +mul.f64 fd440, fd430, 0d3FE2CF2304755A5E; +mul.f64 fd441, fd432, 0d3FEE6F0E134454FF; +sub.f64 fd442, fd441, fd440; +add.f64 fd443, fd442, fd439; +sub.f64 fd444, fd439, fd442; +add.f64 fd445, fd323, fd338; +add.f64 fd446, fd117, fd445; +add.f64 fd447, fd328, fd333; +add.f64 fd448, fd447, fd446; +add.f64 fd449, fd325, fd340; +add.f64 fd450, fd135, fd449; +add.f64 fd451, fd330, fd335; +add.f64 fd452, fd451, 
fd450; +fma.rn.f64 fd453, fd445, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd454, fd447, 0d3FE9E3779B97F4A8; +sub.f64 fd455, fd453, fd454; +sub.f64 fd456, fd325, fd340; +mul.f64 fd457, fd456, 0d3FEE6F0E134454FF; +sub.f64 fd458, fd330, fd335; +mul.f64 fd459, fd458, 0dBFE2CF2304755A5E; +sub.f64 fd460, fd459, fd457; +sub.f64 fd461, fd455, fd460; +add.f64 fd462, fd460, fd455; +mul.f64 fd463, fd445, 0d3FE9E3779B97F4A8; +sub.f64 fd464, fd117, fd463; +fma.rn.f64 fd465, fd447, 0d3FD3C6EF372FE950, fd464; +mul.f64 fd466, fd456, 0d3FE2CF2304755A5E; +mul.f64 fd467, fd458, 0d3FEE6F0E134454FF; +sub.f64 fd468, fd467, fd466; +sub.f64 fd469, fd465, fd468; +add.f64 fd470, fd468, fd465; +fma.rn.f64 fd471, fd449, 0d3FD3C6EF372FE950, fd135; +mul.f64 fd472, fd451, 0d3FE9E3779B97F4A8; +sub.f64 fd473, fd471, fd472; +sub.f64 fd474, fd323, fd338; +mul.f64 fd475, fd474, 0d3FEE6F0E134454FF; +sub.f64 fd476, fd328, fd333; +mul.f64 fd477, fd476, 0dBFE2CF2304755A5E; +sub.f64 fd478, fd477, fd475; +add.f64 fd479, fd478, fd473; +sub.f64 fd480, fd473, fd478; +mul.f64 fd481, fd449, 0d3FE9E3779B97F4A8; +sub.f64 fd482, fd135, fd481; +fma.rn.f64 fd483, fd451, 0d3FD3C6EF372FE950, fd482; +mul.f64 fd484, fd474, 0d3FE2CF2304755A5E; +mul.f64 fd485, fd476, 0d3FEE6F0E134454FF; +sub.f64 fd486, fd485, fd484; +add.f64 fd487, fd486, fd483; +sub.f64 fd488, fd483, fd486; +add.f64 fd489, fd343, fd358; +add.f64 fd490, fd125, fd489; +add.f64 fd491, fd348, fd353; +add.f64 fd492, fd491, fd490; +add.f64 fd493, fd345, fd360; +add.f64 fd494, fd143, fd493; +add.f64 fd495, fd350, fd355; +add.f64 fd496, fd495, fd494; +fma.rn.f64 fd497, fd489, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd498, fd491, 0d3FE9E3779B97F4A8; +sub.f64 fd499, fd497, fd498; +sub.f64 fd500, fd345, fd360; +mul.f64 fd501, fd500, 0d3FEE6F0E134454FF; +sub.f64 fd502, fd350, fd355; +mul.f64 fd503, fd502, 0dBFE2CF2304755A5E; +sub.f64 fd504, fd503, fd501; +sub.f64 fd505, fd499, fd504; +add.f64 fd506, fd504, fd499; +mul.f64 fd507, fd489, 0d3FE9E3779B97F4A8; +sub.f64 fd508, 
fd125, fd507; +fma.rn.f64 fd509, fd491, 0d3FD3C6EF372FE950, fd508; +mul.f64 fd510, fd500, 0d3FE2CF2304755A5E; +mul.f64 fd511, fd502, 0d3FEE6F0E134454FF; +sub.f64 fd512, fd511, fd510; +sub.f64 fd513, fd509, fd512; +add.f64 fd514, fd512, fd509; +fma.rn.f64 fd515, fd493, 0d3FD3C6EF372FE950, fd143; +mul.f64 fd516, fd495, 0d3FE9E3779B97F4A8; +sub.f64 fd517, fd515, fd516; +sub.f64 fd518, fd343, fd358; +mul.f64 fd519, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd520, fd348, fd353; +mul.f64 fd521, fd520, 0dBFE2CF2304755A5E; +sub.f64 fd522, fd521, fd519; +add.f64 fd523, fd522, fd517; +sub.f64 fd524, fd517, fd522; +mul.f64 fd525, fd493, 0d3FE9E3779B97F4A8; +sub.f64 fd526, fd143, fd525; +fma.rn.f64 fd527, fd495, 0d3FD3C6EF372FE950, fd526; +mul.f64 fd528, fd518, 0d3FE2CF2304755A5E; +mul.f64 fd529, fd520, 0d3FEE6F0E134454FF; +sub.f64 fd530, fd529, fd528; +add.f64 fd531, fd530, fd527; +sub.f64 fd532, fd527, fd530; +add.f64 fd533, fd363, fd378; +add.f64 fd534, fd126, fd533; +add.f64 fd535, fd368, fd373; +add.f64 fd536, fd535, fd534; +add.f64 fd537, fd365, fd380; +add.f64 fd538, fd144, fd537; +add.f64 fd539, fd370, fd375; +add.f64 fd540, fd539, fd538; +fma.rn.f64 fd541, fd533, 0d3FD3C6EF372FE950, fd126; +mul.f64 fd542, fd535, 0d3FE9E3779B97F4A8; +sub.f64 fd543, fd541, fd542; +sub.f64 fd544, fd365, fd380; +mul.f64 fd545, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd546, fd370, fd375; +mul.f64 fd547, fd546, 0dBFE2CF2304755A5E; +sub.f64 fd548, fd547, fd545; +sub.f64 fd549, fd543, fd548; +add.f64 fd550, fd548, fd543; +mul.f64 fd551, fd533, 0d3FE9E3779B97F4A8; +sub.f64 fd552, fd126, fd551; +fma.rn.f64 fd553, fd535, 0d3FD3C6EF372FE950, fd552; +mul.f64 fd554, fd544, 0d3FE2CF2304755A5E; +mul.f64 fd555, fd546, 0d3FEE6F0E134454FF; +sub.f64 fd556, fd555, fd554; +sub.f64 fd557, fd553, fd556; +add.f64 fd558, fd556, fd553; +fma.rn.f64 fd559, fd537, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd560, fd539, 0d3FE9E3779B97F4A8; +sub.f64 fd561, fd559, fd560; +sub.f64 fd562, fd363, fd378; +mul.f64 fd563, fd562, 
0d3FEE6F0E134454FF; +sub.f64 fd564, fd368, fd373; +mul.f64 fd565, fd564, 0dBFE2CF2304755A5E; +sub.f64 fd566, fd565, fd563; +add.f64 fd567, fd566, fd561; +sub.f64 fd568, fd561, fd566; +mul.f64 fd569, fd537, 0d3FE9E3779B97F4A8; +sub.f64 fd570, fd144, fd569; +fma.rn.f64 fd571, fd539, 0d3FD3C6EF372FE950, fd570; +mul.f64 fd572, fd562, 0d3FE2CF2304755A5E; +mul.f64 fd573, fd564, 0d3FEE6F0E134454FF; +sub.f64 fd574, fd573, fd572; +add.f64 fd575, fd574, fd571; +sub.f64 fd576, fd571, fd574; +add.f64 fd577, fd383, fd398; +add.f64 fd578, fd118, fd577; +add.f64 fd579, fd388, fd393; +add.f64 fd580, fd579, fd578; +add.f64 fd581, fd385, fd400; +add.f64 fd582, fd136, fd581; +add.f64 fd583, fd390, fd395; +add.f64 fd584, fd583, fd582; +fma.rn.f64 fd585, fd577, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd586, fd579, 0d3FE9E3779B97F4A8; +sub.f64 fd587, fd585, fd586; +sub.f64 fd588, fd385, fd400; +mul.f64 fd589, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd590, fd390, fd395; +mul.f64 fd591, fd590, 0dBFE2CF2304755A5E; +sub.f64 fd592, fd591, fd589; +sub.f64 fd593, fd587, fd592; +add.f64 fd594, fd592, fd587; +mul.f64 fd595, fd577, 0d3FE9E3779B97F4A8; +sub.f64 fd596, fd118, fd595; +fma.rn.f64 fd597, fd579, 0d3FD3C6EF372FE950, fd596; +mul.f64 fd598, fd588, 0d3FE2CF2304755A5E; +mul.f64 fd599, fd590, 0d3FEE6F0E134454FF; +sub.f64 fd600, fd599, fd598; +sub.f64 fd601, fd597, fd600; +add.f64 fd602, fd600, fd597; +fma.rn.f64 fd603, fd581, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd604, fd583, 0d3FE9E3779B97F4A8; +sub.f64 fd605, fd603, fd604; +sub.f64 fd606, fd383, fd398; +mul.f64 fd607, fd606, 0d3FEE6F0E134454FF; +sub.f64 fd608, fd388, fd393; +mul.f64 fd609, fd608, 0dBFE2CF2304755A5E; +sub.f64 fd610, fd609, fd607; +add.f64 fd611, fd610, fd605; +sub.f64 fd612, fd605, fd610; +mul.f64 fd613, fd581, 0d3FE9E3779B97F4A8; +sub.f64 fd614, fd136, fd613; +fma.rn.f64 fd615, fd583, 0d3FD3C6EF372FE950, fd614; +mul.f64 fd616, fd606, 0d3FE2CF2304755A5E; +mul.f64 fd617, fd608, 0d3FEE6F0E134454FF; +sub.f64 fd618, fd617, fd616; 
+add.f64 fd619, fd618, fd615; +sub.f64 fd620, fd615, fd618; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd621, fd622}, [rd6]; +mul.f64 fd625, fd621, fd448; +mul.f64 fd626, fd622, fd452; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd621, fd452; +fma.rn.f64 fd629, fd622, fd448, fd628; +mul.f64 fd630, fd621, fd621; +mul.f64 fd631, fd622, fd622; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd622, fd621; +fma.rn.f64 fd634, fd622, fd621, fd633; +mul.f64 fd635, fd632, fd492; +mul.f64 fd636, fd634, fd496; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd632, fd496; +fma.rn.f64 fd639, fd634, fd492, fd638; +mul.f64 fd640, fd621, fd632; +mul.f64 fd641, fd622, fd634; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd621, fd634; +fma.rn.f64 fd644, fd622, fd632, fd643; +mul.f64 fd645, fd642, fd536; +mul.f64 fd646, fd644, fd540; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd642, fd540; +fma.rn.f64 fd649, fd644, fd536, fd648; +mul.f64 fd650, fd621, fd642; +mul.f64 fd651, fd622, fd644; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd621, fd644; +fma.rn.f64 fd654, fd622, fd642, fd653; +mul.f64 fd655, fd652, fd580; +mul.f64 fd656, fd654, fd584; +sub.f64 fd657, fd655, fd656; +mul.f64 fd658, fd652, fd584; +fma.rn.f64 fd659, fd654, fd580, fd658; +mul.f64 fd660, fd621, fd652; +mul.f64 fd661, fd622, fd654; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd621, fd654; +fma.rn.f64 fd664, fd622, fd652, fd663; +mul.f64 fd665, fd662, fd417; +mul.f64 fd666, fd664, fd435; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd662, fd435; +fma.rn.f64 fd669, fd664, fd417, fd668; +mul.f64 fd670, fd621, fd662; +mul.f64 fd671, fd622, fd664; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd621, fd664; +fma.rn.f64 fd674, fd622, fd662, fd673; +mul.f64 fd675, fd672, fd461; +mul.f64 fd676, fd674, fd479; +sub.f64 fd677, fd675, fd676; +mul.f64 
fd678, fd672, fd479; +fma.rn.f64 fd679, fd674, fd461, fd678; +mul.f64 fd680, fd621, fd672; +mul.f64 fd681, fd622, fd674; +sub.f64 fd682, fd680, fd681; +mul.f64 fd683, fd621, fd674; +fma.rn.f64 fd684, fd622, fd672, fd683; +mul.f64 fd685, fd682, fd505; +mul.f64 fd686, fd684, fd523; +sub.f64 fd687, fd685, fd686; +mul.f64 fd688, fd682, fd523; +fma.rn.f64 fd689, fd684, fd505, fd688; +mul.f64 fd690, fd621, fd682; +mul.f64 fd691, fd622, fd684; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd621, fd684; +fma.rn.f64 fd694, fd622, fd682, fd693; +mul.f64 fd695, fd692, fd549; +mul.f64 fd696, fd694, fd567; +sub.f64 fd697, fd695, fd696; +mul.f64 fd698, fd692, fd567; +fma.rn.f64 fd699, fd694, fd549, fd698; +mul.f64 fd700, fd621, fd692; +mul.f64 fd701, fd622, fd694; +sub.f64 fd702, fd700, fd701; +mul.f64 fd703, fd621, fd694; +fma.rn.f64 fd704, fd622, fd692, fd703; +mul.f64 fd705, fd702, fd593; +mul.f64 fd706, fd704, fd611; +sub.f64 fd707, fd705, fd706; +mul.f64 fd708, fd702, fd611; +fma.rn.f64 fd709, fd704, fd593, fd708; +mul.f64 fd710, fd621, fd702; +mul.f64 fd711, fd622, fd704; +sub.f64 fd712, fd710, fd711; +mul.f64 fd713, fd621, fd704; +fma.rn.f64 fd714, fd622, fd702, fd713; +mul.f64 fd715, fd712, fd425; +mul.f64 fd716, fd714, fd443; +sub.f64 fd717, fd715, fd716; +mul.f64 fd718, fd712, fd443; +fma.rn.f64 fd719, fd714, fd425, fd718; +mul.f64 fd720, fd621, fd712; +mul.f64 fd721, fd622, fd714; +sub.f64 fd722, fd720, fd721; +mul.f64 fd723, fd621, fd714; +fma.rn.f64 fd724, fd622, fd712, fd723; +mul.f64 fd725, fd722, fd469; +mul.f64 fd726, fd724, fd487; +sub.f64 fd727, fd725, fd726; +mul.f64 fd728, fd722, fd487; +fma.rn.f64 fd729, fd724, fd469, fd728; +mul.f64 fd730, fd621, fd722; +mul.f64 fd731, fd622, fd724; +sub.f64 fd732, fd730, fd731; +mul.f64 fd733, fd621, fd724; +fma.rn.f64 fd734, fd622, fd722, fd733; +mul.f64 fd735, fd732, fd513; +mul.f64 fd736, fd734, fd531; +sub.f64 fd737, fd735, fd736; +mul.f64 fd738, fd732, fd531; +fma.rn.f64 fd739, fd734, fd513, fd738; +ld.global.v2.f64 
{fd740, fd741}, [rd6+80]; +mul.f64 fd744, fd740, fd557; +mul.f64 fd745, fd741, fd575; +sub.f64 fd746, fd744, fd745; +mul.f64 fd747, fd740, fd575; +fma.rn.f64 fd748, fd741, fd557, fd747; +mul.f64 fd749, fd621, fd740; +mul.f64 fd750, fd622, fd741; +sub.f64 fd751, fd749, fd750; +mul.f64 fd752, fd621, fd741; +fma.rn.f64 fd753, fd622, fd740, fd752; +mul.f64 fd754, fd751, fd601; +mul.f64 fd755, fd753, fd619; +sub.f64 fd756, fd754, fd755; +mul.f64 fd757, fd751, fd619; +fma.rn.f64 fd758, fd753, fd601, fd757; +mul.f64 fd759, fd621, fd751; +mul.f64 fd760, fd622, fd753; +sub.f64 fd761, fd759, fd760; +mul.f64 fd762, fd621, fd753; +fma.rn.f64 fd763, fd622, fd751, fd762; +mul.f64 fd764, fd761, fd426; +mul.f64 fd765, fd763, fd444; +sub.f64 fd766, fd764, fd765; +mul.f64 fd767, fd761, fd444; +fma.rn.f64 fd768, fd763, fd426, fd767; +mul.f64 fd769, fd621, fd761; +mul.f64 fd770, fd622, fd763; +sub.f64 fd771, fd769, fd770; +mul.f64 fd772, fd621, fd763; +fma.rn.f64 fd773, fd622, fd761, fd772; +mul.f64 fd774, fd771, fd470; +mul.f64 fd775, fd773, fd488; +sub.f64 fd776, fd774, fd775; +mul.f64 fd777, fd771, fd488; +fma.rn.f64 fd778, fd773, fd470, fd777; +mul.f64 fd779, fd621, fd771; +mul.f64 fd780, fd622, fd773; +sub.f64 fd781, fd779, fd780; +mul.f64 fd782, fd621, fd773; +fma.rn.f64 fd783, fd622, fd771, fd782; +mul.f64 fd784, fd781, fd514; +mul.f64 fd785, fd783, fd532; +sub.f64 fd786, fd784, fd785; +mul.f64 fd787, fd781, fd532; +fma.rn.f64 fd788, fd783, fd514, fd787; +mul.f64 fd789, fd621, fd781; +mul.f64 fd790, fd622, fd783; +sub.f64 fd791, fd789, fd790; +mul.f64 fd792, fd621, fd783; +fma.rn.f64 fd793, fd622, fd781, fd792; +mul.f64 fd794, fd791, fd558; +mul.f64 fd795, fd793, fd576; +sub.f64 fd796, fd794, fd795; +mul.f64 fd797, fd791, fd576; +fma.rn.f64 fd798, fd793, fd558, fd797; +mul.f64 fd799, fd621, fd791; +mul.f64 fd800, fd622, fd793; +sub.f64 fd801, fd799, fd800; +mul.f64 fd802, fd621, fd793; +fma.rn.f64 fd803, fd622, fd791, fd802; +mul.f64 fd804, fd801, fd602; +mul.f64 fd805, fd803, 
fd620; +sub.f64 fd806, fd804, fd805; +mul.f64 fd807, fd801, fd620; +fma.rn.f64 fd808, fd803, fd602, fd807; +mul.f64 fd809, fd621, fd801; +mul.f64 fd810, fd622, fd803; +sub.f64 fd811, fd809, fd810; +mul.f64 fd812, fd621, fd803; +fma.rn.f64 fd813, fd622, fd801, fd812; +mul.f64 fd814, fd811, fd418; +mul.f64 fd815, fd813, fd436; +sub.f64 fd816, fd814, fd815; +mul.f64 fd817, fd811, fd436; +fma.rn.f64 fd818, fd813, fd418, fd817; +mul.f64 fd819, fd621, fd811; +mul.f64 fd820, fd622, fd813; +sub.f64 fd821, fd819, fd820; +mul.f64 fd822, fd621, fd813; +fma.rn.f64 fd823, fd622, fd811, fd822; +mul.f64 fd824, fd821, fd462; +mul.f64 fd825, fd823, fd480; +sub.f64 fd826, fd824, fd825; +mul.f64 fd827, fd821, fd480; +fma.rn.f64 fd828, fd823, fd462, fd827; +mul.f64 fd829, fd621, fd821; +mul.f64 fd830, fd622, fd823; +sub.f64 fd831, fd829, fd830; +mul.f64 fd832, fd621, fd823; +fma.rn.f64 fd833, fd622, fd821, fd832; +mul.f64 fd834, fd831, fd506; +mul.f64 fd835, fd833, fd524; +sub.f64 fd836, fd834, fd835; +mul.f64 fd837, fd831, fd524; +fma.rn.f64 fd838, fd833, fd506, fd837; +mul.f64 fd839, fd621, fd831; +mul.f64 fd840, fd622, fd833; +sub.f64 fd841, fd839, fd840; +mul.f64 fd842, fd621, fd833; +fma.rn.f64 fd843, fd622, fd831, fd842; +mul.f64 fd844, fd841, fd550; +mul.f64 fd845, fd843, fd568; +sub.f64 fd846, fd844, fd845; +mul.f64 fd847, fd841, fd568; +fma.rn.f64 fd848, fd843, fd550, fd847; +mul.f64 fd849, fd621, fd841; +mul.f64 fd850, fd622, fd843; +sub.f64 fd851, fd849, fd850; +mul.f64 fd852, fd621, fd843; +fma.rn.f64 fd853, fd622, fd841, fd852; +mul.f64 fd854, fd851, fd594; +mul.f64 fd855, fd853, fd612; +sub.f64 fd856, fd854, fd855; +mul.f64 fd857, fd851, fd612; +fma.rn.f64 fd858, fd853, fd594, fd857; +mad.lo.s32 r8, r5, 1000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +st.shared.f64 [r9], fd404; +st.shared.f64 [r9+8], fd627; +st.shared.f64 [r9+16], fd637; +st.shared.f64 [r9+24], fd647; +st.shared.f64 [r9+32], fd657; +st.shared.f64 [r9+40], fd667; +st.shared.f64 [r9+48], fd677; 
+st.shared.f64 [r9+56], fd687; +st.shared.f64 [r9+64], fd697; +st.shared.f64 [r9+72], fd707; +st.shared.f64 [r9+80], fd717; +st.shared.f64 [r9+88], fd727; +st.shared.f64 [r9+96], fd737; +st.shared.f64 [r9+104], fd746; +st.shared.f64 [r9+112], fd756; +st.shared.f64 [r9+120], fd766; +st.shared.f64 [r9+128], fd776; +st.shared.f64 [r9+136], fd786; +st.shared.f64 [r9+144], fd796; +st.shared.f64 [r9+152], fd806; +st.shared.f64 [r9+160], fd816; +st.shared.f64 [r9+168], fd826; +st.shared.f64 [r9+176], fd836; +st.shared.f64 [r9+184], fd846; +st.shared.f64 [r9+192], fd856; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.f64 fd859, [r10]; +ld.shared.f64 fd860, [r10+40]; +ld.shared.f64 fd861, [r10+80]; +ld.shared.f64 fd862, [r10+120]; +ld.shared.f64 fd863, [r10+160]; +ld.shared.f64 fd864, [r10+200]; +ld.shared.f64 fd865, [r10+240]; +ld.shared.f64 fd866, [r10+280]; +ld.shared.f64 fd867, [r10+320]; +ld.shared.f64 fd868, [r10+360]; +ld.shared.f64 fd869, [r10+400]; +ld.shared.f64 fd870, [r10+440]; +ld.shared.f64 fd871, [r10+480]; +ld.shared.f64 fd872, [r10+520]; +ld.shared.f64 fd873, [r10+560]; +ld.shared.f64 fd874, [r10+600]; +ld.shared.f64 fd875, [r10+640]; +ld.shared.f64 fd876, [r10+680]; +ld.shared.f64 fd877, [r10+720]; +ld.shared.f64 fd878, [r10+760]; +ld.shared.f64 fd879, [r10+800]; +ld.shared.f64 fd880, [r10+840]; +ld.shared.f64 fd881, [r10+880]; +ld.shared.f64 fd882, [r10+920]; +ld.shared.f64 fd883, [r10+960]; +barrier.sync 0; +st.shared.f64 [r9], fd408; +st.shared.f64 [r9+8], fd629; +st.shared.f64 [r9+16], fd639; +st.shared.f64 [r9+24], fd649; +st.shared.f64 [r9+32], fd659; +st.shared.f64 [r9+40], fd669; +st.shared.f64 [r9+48], fd679; +st.shared.f64 [r9+56], fd689; +st.shared.f64 [r9+64], fd699; +st.shared.f64 [r9+72], fd709; +st.shared.f64 [r9+80], fd719; +st.shared.f64 [r9+88], fd729; +st.shared.f64 [r9+96], fd739; +st.shared.f64 [r9+104], fd748; +st.shared.f64 [r9+112], fd758; +st.shared.f64 [r9+120], fd768; +st.shared.f64 [r9+128], fd778; +st.shared.f64 
[r9+136], fd788; +st.shared.f64 [r9+144], fd798; +st.shared.f64 [r9+152], fd808; +st.shared.f64 [r9+160], fd818; +st.shared.f64 [r9+168], fd828; +st.shared.f64 [r9+176], fd838; +st.shared.f64 [r9+184], fd848; +st.shared.f64 [r9+192], fd858; +barrier.sync 0; +ld.shared.f64 fd884, [r10]; +ld.shared.f64 fd885, [r10+40]; +ld.shared.f64 fd886, [r10+80]; +ld.shared.f64 fd887, [r10+120]; +ld.shared.f64 fd888, [r10+160]; +ld.shared.f64 fd889, [r10+200]; +ld.shared.f64 fd890, [r10+240]; +ld.shared.f64 fd891, [r10+280]; +ld.shared.f64 fd892, [r10+320]; +ld.shared.f64 fd893, [r10+360]; +ld.shared.f64 fd894, [r10+400]; +ld.shared.f64 fd895, [r10+440]; +ld.shared.f64 fd896, [r10+480]; +ld.shared.f64 fd897, [r10+520]; +ld.shared.f64 fd898, [r10+560]; +ld.shared.f64 fd899, [r10+600]; +ld.shared.f64 fd900, [r10+640]; +ld.shared.f64 fd901, [r10+680]; +ld.shared.f64 fd902, [r10+720]; +ld.shared.f64 fd903, [r10+760]; +ld.shared.f64 fd904, [r10+800]; +ld.shared.f64 fd905, [r10+840]; +ld.shared.f64 fd906, [r10+880]; +ld.shared.f64 fd907, [r10+920]; +ld.shared.f64 fd908, [r10+960]; +add.f64 fd909, fd864, fd879; +add.f64 fd910, fd859, fd909; +add.f64 fd911, fd869, fd874; +add.f64 fd912, fd889, fd904; +add.f64 fd913, fd884, fd912; +add.f64 fd914, fd894, fd899; +fma.rn.f64 fd915, fd909, 0d3FD3C6EF372FE950, fd859; +mul.f64 fd916, fd911, 0d3FE9E3779B97F4A8; +sub.f64 fd917, fd915, fd916; +sub.f64 fd918, fd889, fd904; +mul.f64 fd919, fd918, 0d3FEE6F0E134454FF; +sub.f64 fd920, fd894, fd899; +mul.f64 fd921, fd920, 0dBFE2CF2304755A5E; +sub.f64 fd922, fd921, fd919; +mul.f64 fd923, fd909, 0d3FE9E3779B97F4A8; +sub.f64 fd924, fd859, fd923; +fma.rn.f64 fd925, fd911, 0d3FD3C6EF372FE950, fd924; +mul.f64 fd926, fd918, 0d3FE2CF2304755A5E; +mul.f64 fd927, fd920, 0d3FEE6F0E134454FF; +sub.f64 fd928, fd927, fd926; +fma.rn.f64 fd929, fd912, 0d3FD3C6EF372FE950, fd884; +mul.f64 fd930, fd914, 0d3FE9E3779B97F4A8; +sub.f64 fd931, fd929, fd930; +sub.f64 fd932, fd864, fd879; +mul.f64 fd933, fd932, 0d3FEE6F0E134454FF; 
+sub.f64 fd934, fd869, fd874; +mul.f64 fd935, fd934, 0dBFE2CF2304755A5E; +sub.f64 fd936, fd935, fd933; +mul.f64 fd937, fd912, 0d3FE9E3779B97F4A8; +sub.f64 fd938, fd884, fd937; +fma.rn.f64 fd939, fd914, 0d3FD3C6EF372FE950, fd938; +mul.f64 fd940, fd932, 0d3FE2CF2304755A5E; +mul.f64 fd941, fd934, 0d3FEE6F0E134454FF; +sub.f64 fd942, fd941, fd940; +add.f64 fd943, fd865, fd880; +add.f64 fd944, fd860, fd943; +add.f64 fd945, fd870, fd875; +add.f64 fd946, fd890, fd905; +add.f64 fd947, fd885, fd946; +add.f64 fd948, fd895, fd900; +fma.rn.f64 fd949, fd943, 0d3FD3C6EF372FE950, fd860; +mul.f64 fd950, fd945, 0d3FE9E3779B97F4A8; +sub.f64 fd951, fd949, fd950; +sub.f64 fd952, fd890, fd905; +mul.f64 fd953, fd952, 0d3FEE6F0E134454FF; +sub.f64 fd954, fd895, fd900; +mul.f64 fd955, fd954, 0dBFE2CF2304755A5E; +sub.f64 fd956, fd955, fd953; +mul.f64 fd957, fd943, 0d3FE9E3779B97F4A8; +sub.f64 fd958, fd860, fd957; +fma.rn.f64 fd959, fd945, 0d3FD3C6EF372FE950, fd958; +mul.f64 fd960, fd952, 0d3FE2CF2304755A5E; +mul.f64 fd961, fd954, 0d3FEE6F0E134454FF; +sub.f64 fd962, fd961, fd960; +fma.rn.f64 fd963, fd946, 0d3FD3C6EF372FE950, fd885; +mul.f64 fd964, fd948, 0d3FE9E3779B97F4A8; +sub.f64 fd965, fd963, fd964; +sub.f64 fd966, fd865, fd880; +mul.f64 fd967, fd966, 0d3FEE6F0E134454FF; +sub.f64 fd968, fd870, fd875; +mul.f64 fd969, fd968, 0dBFE2CF2304755A5E; +sub.f64 fd970, fd969, fd967; +mul.f64 fd971, fd946, 0d3FE9E3779B97F4A8; +sub.f64 fd972, fd885, fd971; +fma.rn.f64 fd973, fd948, 0d3FD3C6EF372FE950, fd972; +mul.f64 fd974, fd966, 0d3FE2CF2304755A5E; +mul.f64 fd975, fd968, 0d3FEE6F0E134454FF; +sub.f64 fd976, fd975, fd974; +add.f64 fd977, fd866, fd881; +add.f64 fd978, fd861, fd977; +add.f64 fd979, fd871, fd876; +add.f64 fd980, fd891, fd906; +add.f64 fd981, fd886, fd980; +add.f64 fd982, fd896, fd901; +fma.rn.f64 fd983, fd977, 0d3FD3C6EF372FE950, fd861; +mul.f64 fd984, fd979, 0d3FE9E3779B97F4A8; +sub.f64 fd985, fd983, fd984; +sub.f64 fd986, fd891, fd906; +mul.f64 fd987, fd986, 0d3FEE6F0E134454FF; 
+sub.f64 fd988, fd896, fd901; +mul.f64 fd989, fd988, 0dBFE2CF2304755A5E; +sub.f64 fd990, fd989, fd987; +mul.f64 fd991, fd977, 0d3FE9E3779B97F4A8; +sub.f64 fd992, fd861, fd991; +fma.rn.f64 fd993, fd979, 0d3FD3C6EF372FE950, fd992; +mul.f64 fd994, fd986, 0d3FE2CF2304755A5E; +mul.f64 fd995, fd988, 0d3FEE6F0E134454FF; +sub.f64 fd996, fd995, fd994; +fma.rn.f64 fd997, fd980, 0d3FD3C6EF372FE950, fd886; +mul.f64 fd998, fd982, 0d3FE9E3779B97F4A8; +sub.f64 fd999, fd997, fd998; +sub.f64 fd1000, fd866, fd881; +mul.f64 fd1001, fd1000, 0d3FEE6F0E134454FF; +sub.f64 fd1002, fd871, fd876; +mul.f64 fd1003, fd1002, 0dBFE2CF2304755A5E; +sub.f64 fd1004, fd1003, fd1001; +mul.f64 fd1005, fd980, 0d3FE9E3779B97F4A8; +sub.f64 fd1006, fd886, fd1005; +fma.rn.f64 fd1007, fd982, 0d3FD3C6EF372FE950, fd1006; +mul.f64 fd1008, fd1000, 0d3FE2CF2304755A5E; +mul.f64 fd1009, fd1002, 0d3FEE6F0E134454FF; +sub.f64 fd1010, fd1009, fd1008; +add.f64 fd1011, fd867, fd882; +add.f64 fd1012, fd862, fd1011; +add.f64 fd1013, fd872, fd877; +add.f64 fd1014, fd892, fd907; +add.f64 fd1015, fd887, fd1014; +add.f64 fd1016, fd897, fd902; +fma.rn.f64 fd1017, fd1011, 0d3FD3C6EF372FE950, fd862; +mul.f64 fd1018, fd1013, 0d3FE9E3779B97F4A8; +sub.f64 fd1019, fd1017, fd1018; +sub.f64 fd1020, fd892, fd907; +mul.f64 fd1021, fd1020, 0d3FEE6F0E134454FF; +sub.f64 fd1022, fd897, fd902; +mul.f64 fd1023, fd1022, 0dBFE2CF2304755A5E; +sub.f64 fd1024, fd1023, fd1021; +mul.f64 fd1025, fd1011, 0d3FE9E3779B97F4A8; +sub.f64 fd1026, fd862, fd1025; +fma.rn.f64 fd1027, fd1013, 0d3FD3C6EF372FE950, fd1026; +mul.f64 fd1028, fd1020, 0d3FE2CF2304755A5E; +mul.f64 fd1029, fd1022, 0d3FEE6F0E134454FF; +sub.f64 fd1030, fd1029, fd1028; +fma.rn.f64 fd1031, fd1014, 0d3FD3C6EF372FE950, fd887; +mul.f64 fd1032, fd1016, 0d3FE9E3779B97F4A8; +sub.f64 fd1033, fd1031, fd1032; +sub.f64 fd1034, fd867, fd882; +mul.f64 fd1035, fd1034, 0d3FEE6F0E134454FF; +sub.f64 fd1036, fd872, fd877; +mul.f64 fd1037, fd1036, 0dBFE2CF2304755A5E; +sub.f64 fd1038, fd1037, fd1035; +mul.f64 
fd1039, fd1014, 0d3FE9E3779B97F4A8; +sub.f64 fd1040, fd887, fd1039; +fma.rn.f64 fd1041, fd1016, 0d3FD3C6EF372FE950, fd1040; +mul.f64 fd1042, fd1034, 0d3FE2CF2304755A5E; +mul.f64 fd1043, fd1036, 0d3FEE6F0E134454FF; +sub.f64 fd1044, fd1043, fd1042; +add.f64 fd1045, fd868, fd883; +add.f64 fd1046, fd863, fd1045; +add.f64 fd1047, fd873, fd878; +add.f64 fd1048, fd893, fd908; +add.f64 fd1049, fd888, fd1048; +add.f64 fd1050, fd898, fd903; +fma.rn.f64 fd1051, fd1045, 0d3FD3C6EF372FE950, fd863; +mul.f64 fd1052, fd1047, 0d3FE9E3779B97F4A8; +sub.f64 fd1053, fd1051, fd1052; +sub.f64 fd1054, fd893, fd908; +mul.f64 fd1055, fd1054, 0d3FEE6F0E134454FF; +sub.f64 fd1056, fd898, fd903; +mul.f64 fd1057, fd1056, 0dBFE2CF2304755A5E; +sub.f64 fd1058, fd1057, fd1055; +mul.f64 fd1059, fd1045, 0d3FE9E3779B97F4A8; +sub.f64 fd1060, fd863, fd1059; +fma.rn.f64 fd1061, fd1047, 0d3FD3C6EF372FE950, fd1060; +mul.f64 fd1062, fd1054, 0d3FE2CF2304755A5E; +mul.f64 fd1063, fd1056, 0d3FEE6F0E134454FF; +sub.f64 fd1064, fd1063, fd1062; +fma.rn.f64 fd1065, fd1048, 0d3FD3C6EF372FE950, fd888; +mul.f64 fd1066, fd1050, 0d3FE9E3779B97F4A8; +sub.f64 fd1067, fd1065, fd1066; +sub.f64 fd1068, fd868, fd883; +mul.f64 fd1069, fd1068, 0d3FEE6F0E134454FF; +sub.f64 fd1070, fd873, fd878; +mul.f64 fd1071, fd1070, 0dBFE2CF2304755A5E; +sub.f64 fd1072, fd1071, fd1069; +mul.f64 fd1073, fd1048, 0d3FE9E3779B97F4A8; +sub.f64 fd1074, fd888, fd1073; +fma.rn.f64 fd1075, fd1050, 0d3FD3C6EF372FE950, fd1074; +mul.f64 fd1076, fd1068, 0d3FE2CF2304755A5E; +mul.f64 fd1077, fd1070, 0d3FEE6F0E134454FF; +sub.f64 fd1078, fd1077, fd1076; +add.f64 %0, fd911, fd910; +add.f64 %1, fd914, fd913; +add.f64 %2, fd945, fd944; +add.f64 %3, fd948, fd947; +add.f64 %4, fd979, fd978; +add.f64 %5, fd982, fd981; +add.f64 %6, fd1013, fd1012; +add.f64 %7, fd1016, fd1015; +add.f64 %8, fd1047, fd1046; +add.f64 %9, fd1050, fd1049; +add.f64 %11, fd936, fd931; +sub.f64 %10, fd917, fd922; +add.f64 %13, fd970, fd965; +sub.f64 %12, fd951, fd956; +add.f64 %15, fd1004, 
fd999; +sub.f64 %14, fd985, fd990; +add.f64 %17, fd1038, fd1033; +sub.f64 %16, fd1019, fd1024; +add.f64 %19, fd1072, fd1067; +sub.f64 %18, fd1053, fd1058; +sub.f64 %20, fd925, fd928; +add.f64 %21, fd942, fd939; +sub.f64 %22, fd959, fd962; +add.f64 %23, fd976, fd973; +sub.f64 %24, fd993, fd996; +add.f64 %25, fd1010, fd1007; +sub.f64 %26, fd1027, fd1030; +add.f64 %27, fd1044, fd1041; +sub.f64 %28, fd1061, fd1064; +add.f64 %29, fd1078, fd1075; +add.f64 %30, fd928, fd925; +sub.f64 %31, fd939, fd942; +add.f64 %32, fd962, fd959; +sub.f64 %33, fd973, fd976; +add.f64 %34, fd996, fd993; +sub.f64 %35, fd1007, fd1010; +add.f64 %36, fd1030, fd1027; +sub.f64 %37, fd1041, fd1044; +add.f64 %38, fd1064, fd1061; +sub.f64 %39, fd1075, fd1078; +sub.f64 %41, fd931, fd936; +add.f64 %40, fd922, fd917; +sub.f64 %43, fd965, fd970; +add.f64 %42, fd956, fd951; +sub.f64 %45, fd999, fd1004; +add.f64 %44, fd990, fd985; +sub.f64 %47, fd1033, fd1038; +add.f64 %46, fd1024, fd1019; +sub.f64 %49, fd1067, fd1072; +add.f64 %48, fd1058, fd1053; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_125), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), 
"d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..89a467999caf2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_125_fp64_inv.hpp.inc @@ -0,0 +1,2632 @@ +#ifndef CUFFTDX_FFT_125_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_125_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<710, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1368>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 2000, r13; +add.f64 fd101, %62, %92; +add.f64 fd103, %72, %82; +add.f64 fd1367, 
%52, fd101; +add.f64 fd104, fd103, fd1367; +add.f64 fd105, %102, %104; +add.f64 fd107, %103, %83; +add.f64 fd1363, %53, fd105; +add.f64 fd108, fd107, fd1363; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1362, fd101, 0d3FD3C6EF372FE950, %52; +sub.f64 fd111, fd1362, fd110; +sub.f64 fd112, %102, %104; +sub.f64 fd114, %103, %83; +mul.f64 fd1361, fd112, 0d3FEE6F0E134454FF; +fma.rn.f64 fd115, fd114, 0d3FE2CF2304755A5E, fd1361; +sub.f64 fd116, fd111, fd115; +add.f64 fd117, fd115, fd111; +mul.f64 fd118, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd119, %52, fd118; +fma.rn.f64 fd120, fd103, 0d3FD3C6EF372FE950, fd119; +mul.f64 fd121, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd122, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd120, fd123; +add.f64 fd125, fd123, fd120; +mul.f64 fd127, fd107, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1360, fd105, 0d3FD3C6EF372FE950, %53; +sub.f64 fd128, fd1360, fd127; +sub.f64 fd129, %62, %92; +sub.f64 fd131, %72, %82; +mul.f64 fd1359, fd129, 0d3FEE6F0E134454FF; +fma.rn.f64 fd132, fd131, 0d3FE2CF2304755A5E, fd1359; +add.f64 fd133, fd132, fd128; +sub.f64 fd134, fd128, fd132; +mul.f64 fd135, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd136, %53, fd135; +fma.rn.f64 fd137, fd107, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd138, fd129, 0d3FE2CF2304755A5E; +mul.f64 fd139, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd140, fd138, fd139; +add.f64 fd141, fd140, fd137; +sub.f64 fd142, fd137, fd140; +add.f64 fd143, %64, %94; +add.f64 fd145, %74, %84; +add.f64 fd1358, %54, fd143; +add.f64 fd146, fd145, fd1358; +add.f64 fd147, %65, %95; +add.f64 fd149, %107, %105; +add.f64 fd1354, %106, fd147; +add.f64 fd150, fd149, fd1354; +fma.rn.f64 fd1352, fd143, 0d3FD3C6EF372FE950, %54; +mul.f64 fd1353, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd1352, fd1353; +sub.f64 fd154, %65, %95; +sub.f64 fd156, %107, %105; +mul.f64 fd1351, fd154, 0d3FEE6F0E134454FF; +fma.rn.f64 fd157, fd156, 0d3FE2CF2304755A5E, fd1351; +sub.f64 fd158, fd153, fd157; +add.f64 fd159, fd157, fd153; 
+mul.f64 fd160, fd143, 0d3FE9E3779B97F4A8; +sub.f64 fd161, %54, fd160; +fma.rn.f64 fd162, fd145, 0d3FD3C6EF372FE950, fd161; +mul.f64 fd163, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd164, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, fd162, fd165; +add.f64 fd167, fd165, fd162; +mul.f64 fd169, fd149, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1350, fd147, 0d3FD3C6EF372FE950, %106; +sub.f64 fd170, fd1350, fd169; +sub.f64 fd171, %64, %94; +sub.f64 fd173, %74, %84; +mul.f64 fd1349, fd171, 0d3FEE6F0E134454FF; +fma.rn.f64 fd174, fd173, 0d3FE2CF2304755A5E, fd1349; +add.f64 fd175, fd174, fd170; +sub.f64 fd176, fd170, fd174; +mul.f64 fd177, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd178, %106, fd177; +fma.rn.f64 fd179, fd149, 0d3FD3C6EF372FE950, fd178; +mul.f64 fd180, fd171, 0d3FE2CF2304755A5E; +mul.f64 fd181, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd182, fd180, fd181; +add.f64 fd183, fd182, fd179; +sub.f64 fd184, fd179, fd182; +add.f64 fd185, %66, %96; +add.f64 fd187, %76, %86; +add.f64 fd1348, %56, fd185; +add.f64 fd188, fd187, fd1348; +add.f64 fd189, %110, %109; +add.f64 fd191, %77, %111; +add.f64 fd1343, %108, fd189; +add.f64 fd192, fd191, fd1343; +fma.rn.f64 fd1341, fd185, 0d3FD3C6EF372FE950, %56; +mul.f64 fd1342, fd187, 0d3FE9E3779B97F4A8; +sub.f64 fd195, fd1341, fd1342; +sub.f64 fd196, %110, %109; +sub.f64 fd198, %77, %111; +mul.f64 fd1340, fd196, 0d3FEE6F0E134454FF; +fma.rn.f64 fd199, fd198, 0d3FE2CF2304755A5E, fd1340; +sub.f64 fd200, fd195, fd199; +add.f64 fd201, fd199, fd195; +mul.f64 fd202, fd185, 0d3FE9E3779B97F4A8; +sub.f64 fd203, %56, fd202; +fma.rn.f64 fd204, fd187, 0d3FD3C6EF372FE950, fd203; +mul.f64 fd205, fd196, 0d3FE2CF2304755A5E; +mul.f64 fd206, fd198, 0d3FEE6F0E134454FF; +sub.f64 fd207, fd205, fd206; +sub.f64 fd208, fd204, fd207; +add.f64 fd209, fd207, fd204; +fma.rn.f64 fd1338, fd189, 0d3FD3C6EF372FE950, %108; +mul.f64 fd1339, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd212, fd1338, fd1339; +sub.f64 fd213, %66, %96; +sub.f64 fd215, %76, %86; +mul.f64 
fd1337, fd213, 0d3FEE6F0E134454FF; +fma.rn.f64 fd216, fd215, 0d3FE2CF2304755A5E, fd1337; +add.f64 fd217, fd216, fd212; +sub.f64 fd218, fd212, fd216; +mul.f64 fd219, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd220, %108, fd219; +fma.rn.f64 fd221, fd191, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd213, 0d3FE2CF2304755A5E; +mul.f64 fd223, fd215, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd224, fd221; +sub.f64 fd226, fd221, fd224; +add.f64 fd227, %68, %98; +add.f64 fd229, %78, %88; +add.f64 fd1336, %58, fd227; +add.f64 fd230, fd229, fd1336; +add.f64 fd231, %113, %112; +add.f64 fd233, %114, %89; +add.f64 fd1332, %59, fd231; +add.f64 fd234, fd233, fd1332; +mul.f64 fd236, fd229, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1331, fd227, 0d3FD3C6EF372FE950, %58; +sub.f64 fd237, fd1331, fd236; +sub.f64 fd238, %113, %112; +sub.f64 fd240, %114, %89; +mul.f64 fd1330, fd238, 0d3FEE6F0E134454FF; +fma.rn.f64 fd241, fd240, 0d3FE2CF2304755A5E, fd1330; +sub.f64 fd242, fd237, fd241; +add.f64 fd243, fd241, fd237; +mul.f64 fd244, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd245, %58, fd244; +fma.rn.f64 fd246, fd229, 0d3FD3C6EF372FE950, fd245; +mul.f64 fd247, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd248, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd246, fd249; +add.f64 fd251, fd249, fd246; +mul.f64 fd253, fd233, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1329, fd231, 0d3FD3C6EF372FE950, %59; +sub.f64 fd254, fd1329, fd253; +sub.f64 fd255, %68, %98; +sub.f64 fd257, %78, %88; +mul.f64 fd1328, fd255, 0d3FEE6F0E134454FF; +fma.rn.f64 fd258, fd257, 0d3FE2CF2304755A5E, fd1328; +add.f64 fd259, fd258, fd254; +sub.f64 fd260, fd254, fd258; +mul.f64 fd261, fd231, 0d3FE9E3779B97F4A8; +sub.f64 fd262, %59, fd261; +fma.rn.f64 fd263, fd233, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd264, fd255, 0d3FE2CF2304755A5E; +mul.f64 fd265, fd257, 0d3FEE6F0E134454FF; +sub.f64 fd266, fd264, fd265; +add.f64 fd267, fd266, fd263; +sub.f64 fd268, fd263, fd266; +add.f64 fd269, %70, %100; +add.f64 fd271, %80, 
%90; +add.f64 fd1327, %60, fd269; +add.f64 fd272, fd271, fd1327; +add.f64 fd273, %71, %101; +add.f64 fd275, %116, %117; +add.f64 fd1323, %115, fd273; +add.f64 fd276, fd275, fd1323; +mul.f64 fd278, fd271, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1322, fd269, 0d3FD3C6EF372FE950, %60; +sub.f64 fd279, fd1322, fd278; +sub.f64 fd280, %71, %101; +sub.f64 fd282, %116, %117; +mul.f64 fd1321, fd280, 0d3FEE6F0E134454FF; +fma.rn.f64 fd283, fd282, 0d3FE2CF2304755A5E, fd1321; +sub.f64 fd284, fd279, fd283; +add.f64 fd285, fd283, fd279; +mul.f64 fd286, fd269, 0d3FE9E3779B97F4A8; +sub.f64 fd287, %60, fd286; +fma.rn.f64 fd288, fd271, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd289, fd280, 0d3FE2CF2304755A5E; +mul.f64 fd290, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd291, fd289, fd290; +sub.f64 fd292, fd288, fd291; +add.f64 fd293, fd291, fd288; +mul.f64 fd295, fd275, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1320, fd273, 0d3FD3C6EF372FE950, %115; +sub.f64 fd296, fd1320, fd295; +sub.f64 fd297, %70, %100; +sub.f64 fd299, %80, %90; +mul.f64 fd1319, fd297, 0d3FEE6F0E134454FF; +fma.rn.f64 fd300, fd299, 0d3FE2CF2304755A5E, fd1319; +add.f64 fd301, fd300, fd296; +sub.f64 fd302, fd296, fd300; +mul.f64 fd303, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd304, %115, fd303; +fma.rn.f64 fd305, fd275, 0d3FD3C6EF372FE950, fd304; +mul.f64 fd306, fd297, 0d3FE2CF2304755A5E; +mul.f64 fd307, fd299, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd306, fd307; +add.f64 fd309, fd308, fd305; +sub.f64 fd310, fd305, fd308; +mul.f64 fd1317, fd158, 0d3FEEFEA21D101EE0; +mul.f64 fd1318, fd175, 0d3FCFD511FA1C0796; +sub.f64 fd313, fd1317, fd1318; +mul.f64 fd314, fd175, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd315, fd158, 0d3FCFD511FA1C0796, fd314; +mul.f64 fd1315, fd200, 0d3FEC0AB44E81C059; +mul.f64 fd1316, fd217, 0d3FDED50D5CBFA951; +sub.f64 fd318, fd1315, fd1316; +mul.f64 fd319, fd217, 0d3FEC0AB44E81C059; +fma.rn.f64 fd320, fd200, 0d3FDED50D5CBFA951, fd319; +mul.f64 fd322, fd259, 0d3FE5E7CF55112014; +mul.f64 fd1314, fd242, 0d3FE753B603D2B816; +sub.f64 fd323, 
fd1314, fd322; +mul.f64 fd324, fd259, 0d3FE753B603D2B816; +fma.rn.f64 fd325, fd242, 0d3FE5E7CF55112014, fd324; +mul.f64 fd327, fd301, 0d3FEB04BBFF642E86; +mul.f64 fd1313, fd284, 0d3FE1257E3C182B51; +sub.f64 fd328, fd1313, fd327; +mul.f64 fd329, fd301, 0d3FE1257E3C182B51; +fma.rn.f64 fd330, fd284, 0d3FEB04BBFF642E86, fd329; +mul.f64 fd332, fd183, 0d3FDED50D5CBFA951; +mul.f64 fd1312, fd166, 0d3FEC0AB44E81C059; +sub.f64 fd333, fd1312, fd332; +mul.f64 fd334, fd183, 0d3FEC0AB44E81C059; +fma.rn.f64 fd335, fd166, 0d3FDED50D5CBFA951, fd334; +mul.f64 fd337, fd225, 0d3FEB04BBFF642E86; +mul.f64 fd1311, fd208, 0d3FE1257E3C182B51; +sub.f64 fd338, fd1311, fd337; +mul.f64 fd339, fd225, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd208, 0d3FEB04BBFF642E86, fd339; +mul.f64 fd342, fd267, 0d3FEFEFD5BFE443FE; +mul.f64 fd1310, fd250, 0d3FB0130A1BE09379; +sub.f64 fd343, fd1310, fd342; +mul.f64 fd344, fd267, 0d3FB0130A1BE09379; +fma.rn.f64 fd345, fd250, 0d3FEFEFD5BFE443FE, fd344; +mul.f64 fd1308, fd292, 0dBFDB3FF7C925819C; +mul.f64 fd1309, fd309, 0d3FECF457DCDC158C; +sub.f64 fd348, fd1308, fd1309; +mul.f64 fd349, fd309, 0dBFDB3FF7C925819C; +fma.rn.f64 fd350, fd292, 0d3FECF457DCDC158C, fd349; +mul.f64 fd1306, fd167, 0d3FE753B603D2B816; +mul.f64 fd1307, fd184, 0d3FE5E7CF55112014; +sub.f64 fd353, fd1306, fd1307; +mul.f64 fd354, fd184, 0d3FE753B603D2B816; +fma.rn.f64 fd355, fd167, 0d3FE5E7CF55112014, fd354; +mul.f64 fd1304, fd209, 0d3FB0130A1BE09379; +mul.f64 fd1305, fd226, 0d3FEFEFD5BFE443FE; +sub.f64 fd358, fd1304, fd1305; +mul.f64 fd359, fd226, 0d3FB0130A1BE09379; +fma.rn.f64 fd360, fd209, 0d3FEFEFD5BFE443FE, fd359; +mul.f64 fd1302, fd251, 0dBFE465C6FEB501BC; +mul.f64 fd1303, fd268, 0d3FE8A80B635B6BEA; +sub.f64 fd363, fd1302, fd1303; +mul.f64 fd364, fd268, 0dBFE465C6FEB501BC; +fma.rn.f64 fd365, fd251, 0d3FE8A80B635B6BEA, fd364; +mul.f64 fd367, fd310, 0d3FC00AEB5DA15BE0; +mul.f64 fd1301, fd293, 0dBFEFBF675480D903; +sub.f64 fd368, fd1301, fd367; +mul.f64 fd369, fd310, 0dBFEFBF675480D903; 
+fma.rn.f64 fd370, fd293, 0d3FC00AEB5DA15BE0, fd369; +mul.f64 fd372, fd176, 0d3FEB04BBFF642E86; +mul.f64 fd1300, fd159, 0d3FE1257E3C182B51; +sub.f64 fd373, fd1300, fd372; +mul.f64 fd374, fd176, 0d3FE1257E3C182B51; +fma.rn.f64 fd375, fd159, 0d3FEB04BBFF642E86, fd374; +mul.f64 fd377, fd218, 0d3FECF457DCDC158C; +mul.f64 fd1299, fd201, 0dBFDB3FF7C925819C; +sub.f64 fd378, fd1299, fd377; +mul.f64 fd379, fd218, 0dBFDB3FF7C925819C; +fma.rn.f64 fd380, fd201, 0d3FECF457DCDC158C, fd379; +mul.f64 fd382, fd260, 0d3FC00AEB5DA15BE0; +mul.f64 fd1298, fd243, 0dBFEFBF675480D903; +sub.f64 fd383, fd1298, fd382; +mul.f64 fd384, fd260, 0dBFEFBF675480D903; +fma.rn.f64 fd385, fd243, 0d3FC00AEB5DA15BE0, fd384; +mul.f64 fd387, fd302, 0dBFE8A80B635B6BEA; +mul.f64 fd1297, fd285, 0dBFE465C6FEB501BC; +sub.f64 fd388, fd1297, fd387; +mul.f64 fd389, fd302, 0dBFE465C6FEB501BC; +fma.rn.f64 fd390, fd285, 0dBFE8A80B635B6BEA, fd389; +add.f64 fd391, fd146, fd272; +add.f64 fd393, fd188, fd230; +mul.f64 fd398, fd393, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1296, fd391, 0d3FD3C6EF372FE950, fd104; +sub.f64 fd399, fd1296, fd398; +add.f64 fd1295, fd150, fd276; +sub.f64 fd400, fd150, fd276; +add.f64 fd1294, fd192, fd234; +sub.f64 fd402, fd192, fd234; +mul.f64 fd1293, fd400, 0d3FEE6F0E134454FF; +fma.rn.f64 fd403, fd402, 0d3FE2CF2304755A5E, fd1293; +sub.f64 fd404, fd399, fd403; +add.f64 fd405, fd403, fd399; +add.f64 fd1292, fd104, fd391; +mul.f64 fd406, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd407, fd104, fd406; +fma.rn.f64 fd408, fd393, 0d3FD3C6EF372FE950, fd407; +mul.f64 fd409, fd400, 0d3FE2CF2304755A5E; +mul.f64 fd410, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd411, fd409, fd410; +sub.f64 fd412, fd408, fd411; +add.f64 fd413, fd411, fd408; +mul.f64 fd415, fd1294, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1291, fd1295, 0d3FD3C6EF372FE950, fd108; +sub.f64 fd416, fd1291, fd415; +sub.f64 fd417, fd146, fd272; +sub.f64 fd419, fd188, fd230; +mul.f64 fd1290, fd417, 0d3FEE6F0E134454FF; +fma.rn.f64 fd420, fd419, 0d3FE2CF2304755A5E, fd1290; 
+add.f64 fd421, fd420, fd416; +sub.f64 fd422, fd416, fd420; +add.f64 fd1289, fd108, fd1295; +mul.f64 fd423, fd1295, 0d3FE9E3779B97F4A8; +sub.f64 fd424, fd108, fd423; +fma.rn.f64 fd425, fd1294, 0d3FD3C6EF372FE950, fd424; +mul.f64 fd426, fd417, 0d3FE2CF2304755A5E; +mul.f64 fd427, fd419, 0d3FEE6F0E134454FF; +sub.f64 fd428, fd426, fd427; +add.f64 fd429, fd428, fd425; +sub.f64 fd430, fd425, fd428; +add.f64 fd431, fd313, fd328; +add.f64 fd433, fd318, fd323; +add.f64 fd1288, fd116, fd431; +add.f64 fd434, fd433, fd1288; +add.f64 fd435, fd315, fd330; +add.f64 fd437, fd320, fd325; +add.f64 fd1287, fd133, fd435; +add.f64 fd438, fd437, fd1287; +fma.rn.f64 fd1285, fd431, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd1286, fd433, 0d3FE9E3779B97F4A8; +sub.f64 fd441, fd1285, fd1286; +sub.f64 fd442, fd315, fd330; +sub.f64 fd444, fd320, fd325; +mul.f64 fd1284, fd442, 0d3FEE6F0E134454FF; +fma.rn.f64 fd445, fd444, 0d3FE2CF2304755A5E, fd1284; +sub.f64 fd446, fd441, fd445; +add.f64 fd447, fd445, fd441; +mul.f64 fd448, fd431, 0d3FE9E3779B97F4A8; +sub.f64 fd449, fd116, fd448; +fma.rn.f64 fd450, fd433, 0d3FD3C6EF372FE950, fd449; +mul.f64 fd451, fd442, 0d3FE2CF2304755A5E; +mul.f64 fd452, fd444, 0d3FEE6F0E134454FF; +sub.f64 fd453, fd451, fd452; +sub.f64 fd454, fd450, fd453; +add.f64 fd455, fd453, fd450; +mul.f64 fd457, fd437, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1283, fd435, 0d3FD3C6EF372FE950, fd133; +sub.f64 fd458, fd1283, fd457; +sub.f64 fd459, fd313, fd328; +sub.f64 fd461, fd318, fd323; +mul.f64 fd1282, fd459, 0d3FEE6F0E134454FF; +fma.rn.f64 fd462, fd461, 0d3FE2CF2304755A5E, fd1282; +add.f64 fd463, fd462, fd458; +sub.f64 fd464, fd458, fd462; +mul.f64 fd465, fd435, 0d3FE9E3779B97F4A8; +sub.f64 fd466, fd133, fd465; +fma.rn.f64 fd467, fd437, 0d3FD3C6EF372FE950, fd466; +mul.f64 fd468, fd459, 0d3FE2CF2304755A5E; +mul.f64 fd469, fd461, 0d3FEE6F0E134454FF; +sub.f64 fd470, fd468, fd469; +add.f64 fd471, fd470, fd467; +sub.f64 fd472, fd467, fd470; +add.f64 fd473, fd333, fd348; +add.f64 fd475, fd338, fd343; 
+add.f64 fd1281, fd124, fd473; +add.f64 fd476, fd475, fd1281; +add.f64 fd477, fd335, fd350; +add.f64 fd479, fd340, fd345; +add.f64 fd1280, fd141, fd477; +add.f64 fd480, fd479, fd1280; +fma.rn.f64 fd1278, fd473, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd1279, fd475, 0d3FE9E3779B97F4A8; +sub.f64 fd483, fd1278, fd1279; +sub.f64 fd484, fd335, fd350; +sub.f64 fd486, fd340, fd345; +mul.f64 fd1277, fd484, 0d3FEE6F0E134454FF; +fma.rn.f64 fd487, fd486, 0d3FE2CF2304755A5E, fd1277; +sub.f64 fd488, fd483, fd487; +add.f64 fd489, fd487, fd483; +mul.f64 fd490, fd473, 0d3FE9E3779B97F4A8; +sub.f64 fd491, fd124, fd490; +fma.rn.f64 fd492, fd475, 0d3FD3C6EF372FE950, fd491; +mul.f64 fd493, fd484, 0d3FE2CF2304755A5E; +mul.f64 fd494, fd486, 0d3FEE6F0E134454FF; +sub.f64 fd495, fd493, fd494; +sub.f64 fd496, fd492, fd495; +add.f64 fd497, fd495, fd492; +fma.rn.f64 fd1275, fd477, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd1276, fd479, 0d3FE9E3779B97F4A8; +sub.f64 fd500, fd1275, fd1276; +sub.f64 fd501, fd333, fd348; +sub.f64 fd503, fd338, fd343; +mul.f64 fd1274, fd501, 0d3FEE6F0E134454FF; +fma.rn.f64 fd504, fd503, 0d3FE2CF2304755A5E, fd1274; +add.f64 fd505, fd504, fd500; +sub.f64 fd506, fd500, fd504; +mul.f64 fd507, fd477, 0d3FE9E3779B97F4A8; +sub.f64 fd508, fd141, fd507; +fma.rn.f64 fd509, fd479, 0d3FD3C6EF372FE950, fd508; +mul.f64 fd510, fd501, 0d3FE2CF2304755A5E; +mul.f64 fd511, fd503, 0d3FEE6F0E134454FF; +sub.f64 fd512, fd510, fd511; +add.f64 fd513, fd512, fd509; +sub.f64 fd514, fd509, fd512; +add.f64 fd515, fd353, fd368; +add.f64 fd517, fd358, fd363; +add.f64 fd1273, fd125, fd515; +add.f64 fd518, fd517, fd1273; +add.f64 fd519, fd355, fd370; +add.f64 fd521, fd360, fd365; +add.f64 fd1272, fd142, fd519; +add.f64 fd522, fd521, fd1272; +mul.f64 fd524, fd517, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1271, fd515, 0d3FD3C6EF372FE950, fd125; +sub.f64 fd525, fd1271, fd524; +sub.f64 fd526, fd355, fd370; +sub.f64 fd528, fd360, fd365; +mul.f64 fd1270, fd526, 0d3FEE6F0E134454FF; +fma.rn.f64 fd529, fd528, 
0d3FE2CF2304755A5E, fd1270; +sub.f64 fd530, fd525, fd529; +add.f64 fd531, fd529, fd525; +mul.f64 fd532, fd515, 0d3FE9E3779B97F4A8; +sub.f64 fd533, fd125, fd532; +fma.rn.f64 fd534, fd517, 0d3FD3C6EF372FE950, fd533; +mul.f64 fd535, fd526, 0d3FE2CF2304755A5E; +mul.f64 fd536, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd537, fd535, fd536; +sub.f64 fd538, fd534, fd537; +add.f64 fd539, fd537, fd534; +mul.f64 fd541, fd521, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1269, fd519, 0d3FD3C6EF372FE950, fd142; +sub.f64 fd542, fd1269, fd541; +sub.f64 fd543, fd353, fd368; +sub.f64 fd545, fd358, fd363; +mul.f64 fd1268, fd543, 0d3FEE6F0E134454FF; +fma.rn.f64 fd546, fd545, 0d3FE2CF2304755A5E, fd1268; +add.f64 fd547, fd546, fd542; +sub.f64 fd548, fd542, fd546; +mul.f64 fd549, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd142, fd549; +fma.rn.f64 fd551, fd521, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd543, 0d3FE2CF2304755A5E; +mul.f64 fd553, fd545, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd552, fd553; +add.f64 fd555, fd554, fd551; +sub.f64 fd556, fd551, fd554; +add.f64 fd557, fd373, fd388; +add.f64 fd559, fd378, fd383; +add.f64 fd1267, fd117, fd557; +add.f64 fd560, fd559, fd1267; +add.f64 fd561, fd375, fd390; +add.f64 fd563, fd380, fd385; +add.f64 fd1266, fd134, fd561; +add.f64 fd564, fd563, fd1266; +mul.f64 fd566, fd559, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1265, fd557, 0d3FD3C6EF372FE950, fd117; +sub.f64 fd567, fd1265, fd566; +sub.f64 fd568, fd375, fd390; +sub.f64 fd570, fd380, fd385; +mul.f64 fd1264, fd568, 0d3FEE6F0E134454FF; +fma.rn.f64 fd571, fd570, 0d3FE2CF2304755A5E, fd1264; +sub.f64 fd572, fd567, fd571; +add.f64 fd573, fd571, fd567; +mul.f64 fd574, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd575, fd117, fd574; +fma.rn.f64 fd576, fd559, 0d3FD3C6EF372FE950, fd575; +mul.f64 fd577, fd568, 0d3FE2CF2304755A5E; +mul.f64 fd578, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd579, fd577, fd578; +sub.f64 fd580, fd576, fd579; +add.f64 fd581, fd579, fd576; +mul.f64 fd583, fd563, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1263, fd561, 
0d3FD3C6EF372FE950, fd134; +sub.f64 fd584, fd1263, fd583; +sub.f64 fd585, fd373, fd388; +sub.f64 fd587, fd378, fd383; +mul.f64 fd1262, fd585, 0d3FEE6F0E134454FF; +fma.rn.f64 fd588, fd587, 0d3FE2CF2304755A5E, fd1262; +add.f64 fd589, fd588, fd584; +sub.f64 fd590, fd584, fd588; +mul.f64 fd591, fd561, 0d3FE9E3779B97F4A8; +sub.f64 fd592, fd134, fd591; +fma.rn.f64 fd593, fd563, 0d3FD3C6EF372FE950, fd592; +mul.f64 fd594, fd585, 0d3FE2CF2304755A5E; +mul.f64 fd595, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd596, fd594, fd595; +add.f64 fd597, fd596, fd593; +sub.f64 fd598, fd593, fd596; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 2000, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %51; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd599, fd600}, [rd6]; +mul.f64 fd603, fd438, fd600; +mul.f64 fd605, fd599, fd438; +mul.f64 fd607, fd600, fd600; +mul.f64 fd1261, fd599, fd599; +sub.f64 fd608, fd1261, fd607; +mul.f64 fd609, fd600, fd599; +fma.rn.f64 fd610, fd600, fd599, fd609; +mul.f64 fd611, fd480, fd610; +mul.f64 fd613, fd608, fd480; +mul.f64 fd615, fd600, fd610; +mul.f64 fd1260, fd599, fd608; +sub.f64 fd616, fd1260, fd615; +mul.f64 fd1259, fd476, fd610; +mul.f64 fd617, fd599, fd610; +fma.rn.f64 fd618, fd600, fd608, fd617; +mul.f64 fd619, fd522, fd618; +mul.f64 fd621, fd616, fd522; +mul.f64 fd1257, fd599, fd616; +mul.f64 fd1258, fd600, fd618; +sub.f64 fd624, fd1257, fd1258; +mul.f64 fd1256, fd518, fd618; +mul.f64 fd625, fd599, fd618; +fma.rn.f64 fd626, fd600, fd616, fd625; +mul.f64 fd627, fd564, fd626; +mul.f64 fd629, fd624, fd564; +mul.f64 fd631, fd600, fd626; +mul.f64 fd1255, fd599, fd624; +sub.f64 fd632, fd1255, fd631; +mul.f64 fd1254, fd560, fd626; +mul.f64 fd633, fd599, fd626; +fma.rn.f64 fd634, fd600, fd624, fd633; +mul.f64 fd635, fd421, fd634; +mul.f64 fd637, fd632, fd421; +mul.f64 fd1252, fd599, fd632; +mul.f64 fd1253, fd600, fd634; +sub.f64 fd640, fd1252, 
fd1253; +mul.f64 fd1251, fd404, fd634; +mul.f64 fd641, fd599, fd634; +fma.rn.f64 fd642, fd600, fd632, fd641; +mul.f64 fd643, fd463, fd642; +mul.f64 fd645, fd640, fd463; +mul.f64 fd647, fd600, fd642; +mul.f64 fd1250, fd599, fd640; +sub.f64 fd648, fd1250, fd647; +mul.f64 fd1249, fd446, fd642; +mul.f64 fd649, fd599, fd642; +fma.rn.f64 fd650, fd600, fd640, fd649; +mul.f64 fd651, fd505, fd650; +mul.f64 fd653, fd648, fd505; +mul.f64 fd655, fd600, fd650; +mul.f64 fd1248, fd599, fd648; +sub.f64 fd656, fd1248, fd655; +mul.f64 fd1247, fd488, fd650; +mul.f64 fd657, fd599, fd650; +fma.rn.f64 fd658, fd600, fd648, fd657; +mul.f64 fd659, fd547, fd658; +mul.f64 fd661, fd656, fd547; +mul.f64 fd1245, fd599, fd656; +mul.f64 fd1246, fd600, fd658; +sub.f64 fd664, fd1245, fd1246; +mul.f64 fd1244, fd530, fd658; +mul.f64 fd665, fd599, fd658; +fma.rn.f64 fd666, fd600, fd656, fd665; +mul.f64 fd667, fd589, fd666; +mul.f64 fd669, fd664, fd589; +mul.f64 fd671, fd600, fd666; +mul.f64 fd1243, fd599, fd664; +sub.f64 fd672, fd1243, fd671; +mul.f64 fd1242, fd572, fd666; +mul.f64 fd673, fd599, fd666; +fma.rn.f64 fd674, fd600, fd664, fd673; +mul.f64 fd675, fd429, fd674; +mul.f64 fd677, fd672, fd429; +mul.f64 fd679, fd600, fd674; +mul.f64 fd1241, fd599, fd672; +sub.f64 fd680, fd1241, fd679; +mul.f64 fd1240, fd412, fd674; +mul.f64 fd681, fd599, fd674; +fma.rn.f64 fd682, fd600, fd672, fd681; +mul.f64 fd683, fd471, fd682; +mul.f64 fd685, fd680, fd471; +mul.f64 fd1238, fd599, fd680; +mul.f64 fd1239, fd600, fd682; +sub.f64 fd688, fd1238, fd1239; +mul.f64 fd1237, fd454, fd682; +mul.f64 fd689, fd599, fd682; +fma.rn.f64 fd690, fd600, fd680, fd689; +mul.f64 fd691, fd513, fd690; +mul.f64 fd692, fd496, fd690; +mul.f64 fd693, fd688, fd513; +ld.global.v2.f64 {fd694, fd695}, [rd6+80]; +mul.f64 fd698, fd555, fd695; +mul.f64 fd700, fd694, fd555; +mul.f64 fd702, fd600, fd695; +mul.f64 fd1236, fd599, fd694; +sub.f64 fd703, fd1236, fd702; +mul.f64 fd1235, fd538, fd695; +mul.f64 fd704, fd599, fd695; +fma.rn.f64 fd705, 
fd600, fd694, fd704; +mul.f64 fd706, fd597, fd705; +mul.f64 fd708, fd703, fd597; +mul.f64 fd1233, fd599, fd703; +mul.f64 fd1234, fd600, fd705; +sub.f64 fd711, fd1233, fd1234; +mul.f64 fd1232, fd580, fd705; +mul.f64 fd712, fd599, fd705; +fma.rn.f64 fd713, fd600, fd703, fd712; +mul.f64 fd714, fd430, fd713; +mul.f64 fd716, fd711, fd430; +mul.f64 fd1230, fd599, fd711; +mul.f64 fd1231, fd600, fd713; +sub.f64 fd719, fd1230, fd1231; +mul.f64 fd1229, fd413, fd713; +mul.f64 fd720, fd599, fd713; +fma.rn.f64 fd721, fd600, fd711, fd720; +mul.f64 fd722, fd472, fd721; +mul.f64 fd724, fd719, fd472; +mul.f64 fd726, fd600, fd721; +mul.f64 fd1228, fd599, fd719; +sub.f64 fd727, fd1228, fd726; +mul.f64 fd1227, fd455, fd721; +mul.f64 fd728, fd599, fd721; +fma.rn.f64 fd729, fd600, fd719, fd728; +mul.f64 fd730, fd514, fd729; +mul.f64 fd732, fd727, fd514; +mul.f64 fd1225, fd599, fd727; +mul.f64 fd1226, fd600, fd729; +sub.f64 fd735, fd1225, fd1226; +mul.f64 fd1224, fd497, fd729; +mul.f64 fd736, fd599, fd729; +fma.rn.f64 fd737, fd600, fd727, fd736; +mul.f64 fd738, fd556, fd737; +mul.f64 fd740, fd735, fd556; +mul.f64 fd742, fd600, fd737; +mul.f64 fd1223, fd599, fd735; +sub.f64 fd743, fd1223, fd742; +mul.f64 fd1222, fd539, fd737; +mul.f64 fd744, fd599, fd737; +fma.rn.f64 fd745, fd600, fd735, fd744; +mul.f64 fd746, fd598, fd745; +mul.f64 fd748, fd743, fd598; +mul.f64 fd750, fd600, fd745; +mul.f64 fd1221, fd599, fd743; +sub.f64 fd751, fd1221, fd750; +mul.f64 fd1220, fd581, fd745; +mul.f64 fd752, fd599, fd745; +fma.rn.f64 fd753, fd600, fd743, fd752; +mul.f64 fd754, fd422, fd753; +mul.f64 fd756, fd751, fd422; +mul.f64 fd1218, fd599, fd751; +mul.f64 fd1219, fd600, fd753; +sub.f64 fd759, fd1218, fd1219; +mul.f64 fd1217, fd405, fd753; +mul.f64 fd760, fd599, fd753; +fma.rn.f64 fd761, fd600, fd751, fd760; +mul.f64 fd762, fd464, fd761; +mul.f64 fd764, fd759, fd464; +mul.f64 fd766, fd600, fd761; +mul.f64 fd1216, fd599, fd759; +sub.f64 fd767, fd1216, fd766; +mul.f64 fd1215, fd447, fd761; +mul.f64 fd768, 
fd599, fd761; +fma.rn.f64 fd769, fd600, fd759, fd768; +mul.f64 fd770, fd506, fd769; +mul.f64 fd772, fd767, fd506; +mul.f64 fd1213, fd599, fd767; +mul.f64 fd1214, fd600, fd769; +sub.f64 fd775, fd1213, fd1214; +mul.f64 fd1212, fd489, fd769; +mul.f64 fd776, fd599, fd769; +fma.rn.f64 fd777, fd600, fd767, fd776; +mul.f64 fd778, fd548, fd777; +mul.f64 fd780, fd775, fd548; +mul.f64 fd1210, fd599, fd775; +mul.f64 fd1211, fd600, fd777; +sub.f64 fd783, fd1210, fd1211; +mul.f64 fd1209, fd531, fd777; +mul.f64 fd784, fd599, fd777; +mul.f64 fd1208, fd434, fd600; +fma.rn.f64 fd785, fd600, fd775, fd784; +mul.f64 fd786, fd590, fd785; +mul.f64 fd787, fd573, fd785; +mul.f64 fd788, fd783, fd590; +barrier.sync 0; +mad.lo.s32 r9, r7, 400, r8; +add.f64 fd789, fd1294, fd1289; +add.f64 fd790, fd393, fd1292; +st.shared.v2.f64 [r9], {fd790, fd789}; +fma.rn.f64 fd791, fd599, fd434, fd603; +sub.f64 fd792, fd605, fd1208; +st.shared.v2.f64 [r9+16], {fd791, fd792}; +fma.rn.f64 fd793, fd608, fd476, fd611; +sub.f64 fd794, fd613, fd1259; +st.shared.v2.f64 [r9+32], {fd793, fd794}; +fma.rn.f64 fd795, fd616, fd518, fd619; +sub.f64 fd796, fd621, fd1256; +st.shared.v2.f64 [r9+48], {fd795, fd796}; +fma.rn.f64 fd797, fd624, fd560, fd627; +sub.f64 fd798, fd629, fd1254; +st.shared.v2.f64 [r9+64], {fd797, fd798}; +sub.f64 fd799, fd637, fd1251; +fma.rn.f64 fd800, fd632, fd404, fd635; +st.shared.v2.f64 [r9+80], {fd800, fd799}; +fma.rn.f64 fd801, fd640, fd446, fd643; +sub.f64 fd802, fd645, fd1249; +st.shared.v2.f64 [r9+96], {fd801, fd802}; +sub.f64 fd803, fd653, fd1247; +fma.rn.f64 fd804, fd648, fd488, fd651; +st.shared.v2.f64 [r9+112], {fd804, fd803}; +fma.rn.f64 fd805, fd656, fd530, fd659; +sub.f64 fd806, fd661, fd1244; +st.shared.v2.f64 [r9+128], {fd805, fd806}; +fma.rn.f64 fd807, fd664, fd572, fd667; +sub.f64 fd808, fd669, fd1242; +st.shared.v2.f64 [r9+144], {fd807, fd808}; +fma.rn.f64 fd809, fd672, fd412, fd675; +sub.f64 fd810, fd677, fd1240; +st.shared.v2.f64 [r9+160], {fd809, fd810}; +fma.rn.f64 fd811, 
fd680, fd454, fd683; +sub.f64 fd812, fd685, fd1237; +st.shared.v2.f64 [r9+176], {fd811, fd812}; +fma.rn.f64 fd813, fd688, fd496, fd691; +sub.f64 fd814, fd693, fd692; +st.shared.v2.f64 [r9+192], {fd813, fd814}; +fma.rn.f64 fd815, fd694, fd538, fd698; +sub.f64 fd816, fd700, fd1235; +st.shared.v2.f64 [r9+208], {fd815, fd816}; +fma.rn.f64 fd817, fd703, fd580, fd706; +sub.f64 fd818, fd708, fd1232; +st.shared.v2.f64 [r9+224], {fd817, fd818}; +fma.rn.f64 fd819, fd711, fd413, fd714; +sub.f64 fd820, fd716, fd1229; +st.shared.v2.f64 [r9+240], {fd819, fd820}; +fma.rn.f64 fd821, fd719, fd455, fd722; +sub.f64 fd822, fd724, fd1227; +st.shared.v2.f64 [r9+256], {fd821, fd822}; +fma.rn.f64 fd823, fd727, fd497, fd730; +sub.f64 fd824, fd732, fd1224; +st.shared.v2.f64 [r9+272], {fd823, fd824}; +fma.rn.f64 fd825, fd735, fd539, fd738; +sub.f64 fd826, fd740, fd1222; +st.shared.v2.f64 [r9+288], {fd825, fd826}; +sub.f64 fd827, fd748, fd1220; +fma.rn.f64 fd828, fd743, fd581, fd746; +st.shared.v2.f64 [r9+304], {fd828, fd827}; +fma.rn.f64 fd829, fd751, fd405, fd754; +sub.f64 fd830, fd756, fd1217; +st.shared.v2.f64 [r9+320], {fd829, fd830}; +fma.rn.f64 fd831, fd759, fd447, fd762; +sub.f64 fd832, fd764, fd1215; +st.shared.v2.f64 [r9+336], {fd831, fd832}; +fma.rn.f64 fd833, fd767, fd489, fd770; +sub.f64 fd834, fd772, fd1212; +st.shared.v2.f64 [r9+352], {fd833, fd834}; +fma.rn.f64 fd835, fd775, fd531, fd778; +sub.f64 fd836, fd780, fd1209; +st.shared.v2.f64 [r9+368], {fd835, fd836}; +fma.rn.f64 fd837, fd783, fd573, fd786; +sub.f64 fd838, fd788, fd787; +st.shared.v2.f64 [r9+384], {fd837, fd838}; +barrier.sync 0; +mad.lo.s32 r10, r7, -384, r9; +ld.shared.v2.f64 {fd839, fd840}, [r10]; +ld.shared.v2.f64 {fd843, fd844}, [r10+80]; +ld.shared.v2.f64 {fd847, fd848}, [r10+160]; +ld.shared.v2.f64 {fd851, fd852}, [r10+240]; +ld.shared.v2.f64 {fd855, fd856}, [r10+320]; +ld.shared.v2.f64 {fd859, fd860}, [r10+400]; +ld.shared.v2.f64 {fd863, fd864}, [r10+480]; +ld.shared.v2.f64 {fd867, fd868}, [r10+560]; 
+ld.shared.v2.f64 {fd871, fd872}, [r10+640]; +ld.shared.v2.f64 {fd875, fd876}, [r10+720]; +ld.shared.v2.f64 {fd879, fd880}, [r10+800]; +ld.shared.v2.f64 {fd883, fd884}, [r10+880]; +ld.shared.v2.f64 {fd887, fd888}, [r10+960]; +ld.shared.v2.f64 {fd891, fd892}, [r10+1040]; +ld.shared.v2.f64 {fd895, fd896}, [r10+1120]; +ld.shared.v2.f64 {fd899, fd900}, [r10+1200]; +ld.shared.v2.f64 {fd903, fd904}, [r10+1280]; +ld.shared.v2.f64 {fd907, fd908}, [r10+1360]; +ld.shared.v2.f64 {fd911, fd912}, [r10+1440]; +ld.shared.v2.f64 {fd915, fd916}, [r10+1520]; +ld.shared.v2.f64 {fd919, fd920}, [r10+1600]; +ld.shared.v2.f64 {fd923, fd924}, [r10+1680]; +ld.shared.v2.f64 {fd927, fd928}, [r10+1760]; +ld.shared.v2.f64 {fd931, fd932}, [r10+1840]; +ld.shared.v2.f64 {fd935, fd936}, [r10+1920]; +add.f64 fd939, fd859, fd919; +add.f64 fd941, fd879, fd899; +fma.rn.f64 fd1206, fd939, 0d3FD3C6EF372FE950, fd839; +mul.f64 fd1207, fd941, 0d3FE9E3779B97F4A8; +sub.f64 fd947, fd1206, fd1207; +add.f64 fd1205, fd860, fd920; +sub.f64 fd948, fd860, fd920; +add.f64 fd1204, fd880, fd900; +sub.f64 fd950, fd880, fd900; +mul.f64 fd1203, fd948, 0d3FEE6F0E134454FF; +fma.rn.f64 fd951, fd950, 0d3FE2CF2304755A5E, fd1203; +add.f64 fd1202, fd839, fd939; +mul.f64 fd952, fd939, 0d3FE9E3779B97F4A8; +sub.f64 fd953, fd839, fd952; +fma.rn.f64 fd954, fd941, 0d3FD3C6EF372FE950, fd953; +mul.f64 fd955, fd948, 0d3FE2CF2304755A5E; +mul.f64 fd956, fd950, 0d3FEE6F0E134454FF; +sub.f64 fd957, fd955, fd956; +mul.f64 fd959, fd1204, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1201, fd1205, 0d3FD3C6EF372FE950, fd840; +sub.f64 fd960, fd1201, fd959; +sub.f64 fd961, fd859, fd919; +sub.f64 fd963, fd879, fd899; +mul.f64 fd1200, fd961, 0d3FEE6F0E134454FF; +fma.rn.f64 fd964, fd963, 0d3FE2CF2304755A5E, fd1200; +add.f64 fd1199, fd840, fd1205; +mul.f64 fd965, fd1205, 0d3FE9E3779B97F4A8; +sub.f64 fd966, fd840, fd965; +fma.rn.f64 fd967, fd1204, 0d3FD3C6EF372FE950, fd966; +mul.f64 fd968, fd961, 0d3FE2CF2304755A5E; +mul.f64 fd969, fd963, 0d3FEE6F0E134454FF; 
+sub.f64 fd970, fd968, fd969; +add.f64 fd971, fd863, fd923; +add.f64 fd973, fd883, fd903; +mul.f64 fd978, fd973, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1198, fd971, 0d3FD3C6EF372FE950, fd843; +sub.f64 fd979, fd1198, fd978; +add.f64 fd1197, fd864, fd924; +sub.f64 fd980, fd864, fd924; +add.f64 fd1196, fd884, fd904; +sub.f64 fd982, fd884, fd904; +mul.f64 fd1195, fd980, 0d3FEE6F0E134454FF; +fma.rn.f64 fd983, fd982, 0d3FE2CF2304755A5E, fd1195; +add.f64 fd1194, fd843, fd971; +mul.f64 fd984, fd971, 0d3FE9E3779B97F4A8; +sub.f64 fd985, fd843, fd984; +fma.rn.f64 fd986, fd973, 0d3FD3C6EF372FE950, fd985; +mul.f64 fd987, fd980, 0d3FE2CF2304755A5E; +mul.f64 fd988, fd982, 0d3FEE6F0E134454FF; +sub.f64 fd989, fd987, fd988; +fma.rn.f64 fd1192, fd1197, 0d3FD3C6EF372FE950, fd844; +mul.f64 fd1193, fd1196, 0d3FE9E3779B97F4A8; +sub.f64 fd992, fd1192, fd1193; +sub.f64 fd993, fd863, fd923; +sub.f64 fd995, fd883, fd903; +mul.f64 fd1191, fd993, 0d3FEE6F0E134454FF; +fma.rn.f64 fd996, fd995, 0d3FE2CF2304755A5E, fd1191; +add.f64 fd1190, fd844, fd1197; +mul.f64 fd997, fd1197, 0d3FE9E3779B97F4A8; +sub.f64 fd998, fd844, fd997; +fma.rn.f64 fd999, fd1196, 0d3FD3C6EF372FE950, fd998; +mul.f64 fd1000, fd993, 0d3FE2CF2304755A5E; +mul.f64 fd1001, fd995, 0d3FEE6F0E134454FF; +sub.f64 fd1002, fd1000, fd1001; +add.f64 fd1003, fd867, fd927; +add.f64 fd1005, fd887, fd907; +fma.rn.f64 fd1188, fd1003, 0d3FD3C6EF372FE950, fd847; +mul.f64 fd1189, fd1005, 0d3FE9E3779B97F4A8; +sub.f64 fd1011, fd1188, fd1189; +add.f64 fd1187, fd868, fd928; +sub.f64 fd1012, fd868, fd928; +add.f64 fd1186, fd888, fd908; +sub.f64 fd1014, fd888, fd908; +mul.f64 fd1185, fd1012, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1015, fd1014, 0d3FE2CF2304755A5E, fd1185; +add.f64 fd1184, fd847, fd1003; +mul.f64 fd1016, fd1003, 0d3FE9E3779B97F4A8; +sub.f64 fd1017, fd847, fd1016; +fma.rn.f64 fd1018, fd1005, 0d3FD3C6EF372FE950, fd1017; +mul.f64 fd1019, fd1012, 0d3FE2CF2304755A5E; +mul.f64 fd1020, fd1014, 0d3FEE6F0E134454FF; +sub.f64 fd1021, fd1019, fd1020; +mul.f64 
fd1023, fd1186, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1183, fd1187, 0d3FD3C6EF372FE950, fd848; +sub.f64 fd1024, fd1183, fd1023; +sub.f64 fd1025, fd867, fd927; +sub.f64 fd1027, fd887, fd907; +mul.f64 fd1182, fd1025, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1028, fd1027, 0d3FE2CF2304755A5E, fd1182; +add.f64 fd1181, fd848, fd1187; +mul.f64 fd1029, fd1187, 0d3FE9E3779B97F4A8; +sub.f64 fd1030, fd848, fd1029; +fma.rn.f64 fd1031, fd1186, 0d3FD3C6EF372FE950, fd1030; +mul.f64 fd1032, fd1025, 0d3FE2CF2304755A5E; +mul.f64 fd1033, fd1027, 0d3FEE6F0E134454FF; +sub.f64 fd1034, fd1032, fd1033; +add.f64 fd1035, fd871, fd931; +add.f64 fd1037, fd891, fd911; +mul.f64 fd1042, fd1037, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1180, fd1035, 0d3FD3C6EF372FE950, fd851; +sub.f64 fd1043, fd1180, fd1042; +add.f64 fd1179, fd872, fd932; +sub.f64 fd1044, fd872, fd932; +add.f64 fd1178, fd892, fd912; +sub.f64 fd1046, fd892, fd912; +mul.f64 fd1177, fd1044, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1047, fd1046, 0d3FE2CF2304755A5E, fd1177; +add.f64 fd1176, fd851, fd1035; +mul.f64 fd1048, fd1035, 0d3FE9E3779B97F4A8; +sub.f64 fd1049, fd851, fd1048; +fma.rn.f64 fd1050, fd1037, 0d3FD3C6EF372FE950, fd1049; +mul.f64 fd1051, fd1044, 0d3FE2CF2304755A5E; +mul.f64 fd1052, fd1046, 0d3FEE6F0E134454FF; +sub.f64 fd1053, fd1051, fd1052; +fma.rn.f64 fd1174, fd1179, 0d3FD3C6EF372FE950, fd852; +mul.f64 fd1175, fd1178, 0d3FE9E3779B97F4A8; +sub.f64 fd1056, fd1174, fd1175; +sub.f64 fd1057, fd871, fd931; +sub.f64 fd1059, fd891, fd911; +mul.f64 fd1173, fd1057, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1060, fd1059, 0d3FE2CF2304755A5E, fd1173; +add.f64 fd1172, fd852, fd1179; +mul.f64 fd1061, fd1179, 0d3FE9E3779B97F4A8; +sub.f64 fd1062, fd852, fd1061; +fma.rn.f64 fd1063, fd1178, 0d3FD3C6EF372FE950, fd1062; +mul.f64 fd1064, fd1057, 0d3FE2CF2304755A5E; +mul.f64 fd1065, fd1059, 0d3FEE6F0E134454FF; +sub.f64 fd1066, fd1064, fd1065; +add.f64 fd1067, fd875, fd935; +add.f64 fd1069, fd895, fd915; +fma.rn.f64 fd1170, fd1067, 0d3FD3C6EF372FE950, fd855; +mul.f64 
fd1171, fd1069, 0d3FE9E3779B97F4A8; +sub.f64 fd1075, fd1170, fd1171; +add.f64 fd1169, fd876, fd936; +sub.f64 fd1076, fd876, fd936; +add.f64 fd1168, fd896, fd916; +sub.f64 fd1078, fd896, fd916; +mul.f64 fd1167, fd1076, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1079, fd1078, 0d3FE2CF2304755A5E, fd1167; +add.f64 fd1166, fd855, fd1067; +mul.f64 fd1080, fd1067, 0d3FE9E3779B97F4A8; +sub.f64 fd1081, fd855, fd1080; +fma.rn.f64 fd1082, fd1069, 0d3FD3C6EF372FE950, fd1081; +mul.f64 fd1083, fd1076, 0d3FE2CF2304755A5E; +mul.f64 fd1084, fd1078, 0d3FEE6F0E134454FF; +sub.f64 fd1085, fd1083, fd1084; +mul.f64 fd1087, fd1168, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1165, fd1169, 0d3FD3C6EF372FE950, fd856; +sub.f64 fd1088, fd1165, fd1087; +sub.f64 fd1089, fd875, fd935; +sub.f64 fd1091, fd895, fd915; +mul.f64 fd1164, fd1089, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1092, fd1091, 0d3FE2CF2304755A5E, fd1164; +add.f64 fd1163, fd856, fd1169; +mul.f64 fd1093, fd1169, 0d3FE9E3779B97F4A8; +sub.f64 fd1094, fd856, fd1093; +fma.rn.f64 fd1095, fd1168, 0d3FD3C6EF372FE950, fd1094; +mul.f64 fd1096, fd1089, 0d3FE2CF2304755A5E; +mul.f64 fd1097, fd1091, 0d3FEE6F0E134454FF; +sub.f64 fd1098, fd1096, fd1097; +add.f64 %1, fd1204, fd1199; +add.f64 %0, fd941, fd1202; +add.f64 %3, fd1196, fd1190; +add.f64 %2, fd973, fd1194; +add.f64 %5, fd1186, fd1181; +add.f64 %4, fd1005, fd1184; +add.f64 %7, fd1178, fd1172; +add.f64 %6, fd1037, fd1176; +add.f64 %9, fd1168, fd1163; +add.f64 %8, fd1069, fd1166; +sub.f64 %10, fd947, fd951; +add.f64 %11, fd964, fd960; +sub.f64 %12, fd979, fd983; +add.f64 %13, fd996, fd992; +add.f64 %15, fd1028, fd1024; +sub.f64 %14, fd1011, fd1015; +add.f64 %17, fd1060, fd1056; +sub.f64 %16, fd1043, fd1047; +sub.f64 %18, fd1075, fd1079; +add.f64 %19, fd1092, fd1088; +sub.f64 %20, fd954, fd957; +add.f64 %21, fd970, fd967; +sub.f64 %22, fd986, fd989; +add.f64 %23, fd1002, fd999; +sub.f64 %24, fd1018, fd1021; +add.f64 %25, fd1034, fd1031; +add.f64 %27, fd1066, fd1063; +sub.f64 %26, fd1050, fd1053; +add.f64 %29, 
fd1098, fd1095; +sub.f64 %28, fd1082, fd1085; +sub.f64 %31, fd967, fd970; +add.f64 %30, fd957, fd954; +sub.f64 %33, fd999, fd1002; +add.f64 %32, fd989, fd986; +sub.f64 %35, fd1031, fd1034; +add.f64 %34, fd1021, fd1018; +sub.f64 %37, fd1063, fd1066; +add.f64 %36, fd1053, fd1050; +sub.f64 %39, fd1095, fd1098; +add.f64 %38, fd1085, fd1082; +sub.f64 %41, fd960, fd964; +add.f64 %40, fd951, fd947; +sub.f64 %43, fd992, fd996; +add.f64 %42, fd983, fd979; +sub.f64 %45, fd1024, fd1028; +add.f64 %44, fd1015, fd1011; +sub.f64 %47, fd1056, fd1060; +add.f64 %46, fd1047, fd1043; +sub.f64 %49, fd1088, fd1092; +add.f64 %48, fd1079, fd1075; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_125), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), 
"d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[5].y), "d"(rmem[10].y), "d"(rmem[20].y), "d"(rmem[16].y), "d"(rmem[1].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[22].y), "d"(rmem[7].y), "d"(rmem[17].y), "d"(rmem[23].y), "d"(rmem[8].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[14].y), "d"(rmem[19].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<708, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<243>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 1000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %15, %23; +add.f64 fd22, %13, fd21; +add.f64 fd23, %18, %21; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %17, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %20, %22; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %13; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %17, %24; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %20, %22; +fma.rn.f64 fd35, fd34, 0d3FE2CF2304755A5E, fd33; +sub.f64 fd36, fd31, fd35; +add.f64 fd37, fd35, fd31; +mul.f64 fd38, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd39, %13, fd38; +fma.rn.f64 fd40, fd23, 0d3FD3C6EF372FE950, fd39; +mul.f64 fd41, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd42, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, fd40, fd43; +add.f64 fd45, fd43, fd40; +fma.rn.f64 fd46, fd25, 0d3FD3C6EF372FE950, %14; +mul.f64 fd47, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd48, fd46, fd47; +sub.f64 fd49, %15, %23; +mul.f64 fd50, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd51, %18, %21; +fma.rn.f64 fd52, 
fd51, 0d3FE2CF2304755A5E, fd50; +add.f64 fd53, fd52, fd48; +sub.f64 fd54, fd48, fd52; +mul.f64 fd55, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %14, fd55; +fma.rn.f64 fd57, fd27, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd49, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd51, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd58, fd59; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd53, fd64; +fma.rn.f64 fd68, fd63, fd36, fd67; +mul.f64 fd69, fd36, fd64; +mul.f64 fd70, fd63, fd53; +sub.f64 fd71, fd70, fd69; +mul.f64 fd72, fd63, fd63; +mul.f64 fd73, fd64, fd64; +sub.f64 fd74, fd72, fd73; +mul.f64 fd75, fd64, fd63; +fma.rn.f64 fd76, fd64, fd63, fd75; +mul.f64 fd77, fd61, fd76; +fma.rn.f64 fd78, fd74, fd44, fd77; +mul.f64 fd79, fd44, fd76; +mul.f64 fd80, fd74, fd61; +sub.f64 fd81, fd80, fd79; +ld.global.v2.f64 {fd82, fd83}, [rd6+400]; +mul.f64 fd86, fd62, fd83; +fma.rn.f64 fd87, fd82, fd45, fd86; +mul.f64 fd88, fd45, fd83; +mul.f64 fd89, fd82, fd62; +sub.f64 fd90, fd89, fd88; +mul.f64 fd91, fd63, fd82; +mul.f64 fd92, fd64, fd83; +sub.f64 fd93, fd91, fd92; +mul.f64 fd94, fd63, fd83; +fma.rn.f64 fd95, fd64, fd82, fd94; +mul.f64 fd96, fd54, fd95; +fma.rn.f64 fd97, fd93, fd37, fd96; +mul.f64 fd98, fd37, fd95; +mul.f64 fd99, fd93, fd54; +sub.f64 fd100, fd99, fd98; +mad.lo.s32 r8, r5, 1000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd68; +st.shared.f64 [r9+16], fd78; +st.shared.f64 [r9+24], fd87; +st.shared.f64 [r9+32], fd97; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd101, [r11]; +ld.shared.f64 fd102, [r11+200]; +ld.shared.f64 fd103, [r11+400]; +ld.shared.f64 fd104, [r11+600]; +ld.shared.f64 fd105, [r11+800]; +barrier.sync 0; +st.shared.f64 [r9], fd28; 
+st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +ld.shared.f64 fd106, [r11]; +ld.shared.f64 fd107, [r11+200]; +ld.shared.f64 fd108, [r11+400]; +ld.shared.f64 fd109, [r11+600]; +ld.shared.f64 fd110, [r11+800]; +add.f64 fd111, fd102, fd105; +add.f64 fd112, fd101, fd111; +add.f64 fd113, fd103, fd104; +add.f64 fd114, fd113, fd112; +add.f64 fd115, fd107, fd110; +add.f64 fd116, fd106, fd115; +add.f64 fd117, fd108, fd109; +add.f64 fd118, fd117, fd116; +fma.rn.f64 fd119, fd111, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd120, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd121, fd119, fd120; +sub.f64 fd122, fd107, fd110; +mul.f64 fd123, fd122, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd108, fd109; +fma.rn.f64 fd125, fd124, 0d3FE2CF2304755A5E, fd123; +sub.f64 fd126, fd121, fd125; +add.f64 fd127, fd125, fd121; +mul.f64 fd128, fd111, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd101, fd128; +fma.rn.f64 fd130, fd113, 0d3FD3C6EF372FE950, fd129; +mul.f64 fd131, fd122, 0d3FE2CF2304755A5E; +mul.f64 fd132, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd133, fd131, fd132; +sub.f64 fd134, fd130, fd133; +add.f64 fd135, fd133, fd130; +fma.rn.f64 fd136, fd115, 0d3FD3C6EF372FE950, fd106; +mul.f64 fd137, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd138, fd136, fd137; +sub.f64 fd139, fd102, fd105; +mul.f64 fd140, fd139, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd103, fd104; +fma.rn.f64 fd142, fd141, 0d3FE2CF2304755A5E, fd140; +add.f64 fd143, fd142, fd138; +sub.f64 fd144, fd138, fd142; +mul.f64 fd145, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd106, fd145; +fma.rn.f64 fd147, fd117, 0d3FD3C6EF372FE950, fd146; +mul.f64 fd148, fd139, 0d3FE2CF2304755A5E; +mul.f64 fd149, fd141, 0d3FEE6F0E134454FF; +sub.f64 fd150, fd148, fd149; +add.f64 fd151, fd150, fd147; +sub.f64 fd152, fd147, fd150; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, 
%12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd153, fd154}, [rd11]; +mul.f64 fd157, fd143, fd154; +fma.rn.f64 fd158, fd153, fd126, fd157; +mul.f64 fd159, fd126, fd154; +mul.f64 fd160, fd153, fd143; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd151, fd166; +fma.rn.f64 fd168, fd164, fd134, fd167; +mul.f64 fd169, fd134, fd166; +mul.f64 fd170, fd164, fd151; +sub.f64 fd171, fd170, fd169; +ld.global.v2.f64 {fd172, fd173}, [rd11+80]; +mul.f64 fd176, fd152, fd173; +fma.rn.f64 fd177, fd172, fd135, fd176; +mul.f64 fd178, fd135, fd173; +mul.f64 fd179, fd172, fd152; +sub.f64 fd180, fd179, fd178; +mul.f64 fd181, fd153, fd172; +mul.f64 fd182, fd154, fd173; +sub.f64 fd183, fd181, fd182; +mul.f64 fd184, fd153, fd173; +fma.rn.f64 fd185, fd154, fd172, fd184; +mul.f64 fd186, fd144, fd185; +fma.rn.f64 fd187, fd183, fd127, fd186; +mul.f64 fd188, fd127, fd185; +mul.f64 fd189, fd183, fd144; +sub.f64 fd190, fd189, fd188; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd114; +st.shared.f64 [r17+40], fd158; +st.shared.f64 [r17+80], fd168; +st.shared.f64 [r17+120], fd177; +st.shared.f64 [r17+160], fd187; +barrier.sync 0; +ld.shared.f64 fd191, [r11]; +ld.shared.f64 fd192, [r11+200]; +ld.shared.f64 fd193, [r11+400]; +ld.shared.f64 fd194, [r11+600]; +ld.shared.f64 fd195, [r11+800]; +barrier.sync 0; +st.shared.f64 [r17], fd118; +st.shared.f64 [r17+40], fd161; +st.shared.f64 [r17+80], fd171; +st.shared.f64 [r17+120], fd180; +st.shared.f64 [r17+160], fd190; +barrier.sync 0; +ld.shared.f64 fd196, [r11]; +ld.shared.f64 fd197, [r11+200]; +ld.shared.f64 fd198, [r11+400]; +ld.shared.f64 fd199, [r11+600]; +ld.shared.f64 fd200, [r11+800]; +add.f64 fd201, fd192, fd195; +add.f64 fd202, fd191, fd201; +add.f64 fd203, fd193, fd194; +add.f64 fd204, fd197, fd200; +add.f64 fd205, 
fd196, fd204; +add.f64 fd206, fd198, fd199; +fma.rn.f64 fd207, fd201, 0d3FD3C6EF372FE950, fd191; +mul.f64 fd208, fd203, 0d3FE9E3779B97F4A8; +sub.f64 fd209, fd207, fd208; +sub.f64 fd210, fd197, fd200; +mul.f64 fd211, fd210, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd198, fd199; +fma.rn.f64 fd213, fd212, 0d3FE2CF2304755A5E, fd211; +mul.f64 fd214, fd201, 0d3FE9E3779B97F4A8; +sub.f64 fd215, fd191, fd214; +fma.rn.f64 fd216, fd203, 0d3FD3C6EF372FE950, fd215; +mul.f64 fd217, fd210, 0d3FE2CF2304755A5E; +mul.f64 fd218, fd212, 0d3FEE6F0E134454FF; +sub.f64 fd219, fd217, fd218; +fma.rn.f64 fd220, fd204, 0d3FD3C6EF372FE950, fd196; +mul.f64 fd221, fd206, 0d3FE9E3779B97F4A8; +sub.f64 fd222, fd220, fd221; +sub.f64 fd223, fd192, fd195; +mul.f64 fd224, fd223, 0d3FEE6F0E134454FF; +sub.f64 fd225, fd193, fd194; +fma.rn.f64 fd226, fd225, 0d3FE2CF2304755A5E, fd224; +mul.f64 fd227, fd204, 0d3FE9E3779B97F4A8; +sub.f64 fd228, fd196, fd227; +fma.rn.f64 fd229, fd206, 0d3FD3C6EF372FE950, fd228; +mul.f64 fd230, fd223, 0d3FE2CF2304755A5E; +mul.f64 fd231, fd225, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd230, fd231; +add.f64 %0, fd203, fd202; +add.f64 %1, fd206, fd205; +add.f64 %3, fd226, fd222; +sub.f64 %2, fd209, fd213; +sub.f64 %4, fd216, fd219; +add.f64 %5, fd232, fd229; +add.f64 %6, fd219, fd216; +sub.f64 %7, fd229, fd232; +sub.f64 %9, fd222, fd226; +add.f64 %8, fd213, fd209; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<707, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<263>; +.reg .b64 
rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 2000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %15, %23; +add.f64 fd22, %13, fd21; +add.f64 fd23, %18, %21; +add.f64 fd24, %17, %24; +add.f64 fd25, %14, fd24; +add.f64 fd26, %20, %22; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %13; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %17, %24; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %20, %22; +fma.rn.f64 fd33, fd32, 0d3FE2CF2304755A5E, fd31; +sub.f64 fd34, fd29, fd33; +add.f64 fd35, fd33, fd29; +mul.f64 fd36, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd37, %13, fd36; +fma.rn.f64 fd38, fd23, 0d3FD3C6EF372FE950, fd37; +mul.f64 fd39, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd40, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd41, fd39, fd40; +sub.f64 fd42, fd38, fd41; +add.f64 fd43, fd41, fd38; +fma.rn.f64 fd44, fd24, 0d3FD3C6EF372FE950, %14; +mul.f64 fd45, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd46, fd44, fd45; +sub.f64 fd47, %15, %23; +mul.f64 fd48, fd47, 0d3FEE6F0E134454FF; +sub.f64 fd49, %18, %21; +fma.rn.f64 fd50, fd49, 0d3FE2CF2304755A5E, fd48; +add.f64 fd51, fd50, fd46; +sub.f64 fd52, fd46, fd50; +mul.f64 fd53, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd54, %14, fd53; +fma.rn.f64 fd55, fd26, 0d3FD3C6EF372FE950, fd54; +mul.f64 fd56, fd47, 0d3FE2CF2304755A5E; +mul.f64 fd57, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd58, fd56, fd57; +add.f64 fd59, fd58, fd55; +sub.f64 fd60, fd55, fd58; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd61, fd62}, [rd6]; +mul.f64 fd65, fd51, fd62; +mul.f64 fd66, fd34, fd62; +mul.f64 fd67, fd61, fd51; +mul.f64 fd68, fd61, fd61; +mul.f64 fd69, fd62, fd62; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd62, fd61; +fma.rn.f64 fd72, fd62, fd61, fd71; +mul.f64 fd73, fd59, fd72; +mul.f64 fd74, fd42, fd72; +mul.f64 fd75, fd70, 
fd59; +ld.global.v2.f64 {fd76, fd77}, [rd6+400]; +mul.f64 fd80, fd60, fd77; +mul.f64 fd81, fd43, fd77; +mul.f64 fd82, fd76, fd60; +mul.f64 fd83, fd61, fd76; +mul.f64 fd84, fd62, fd77; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd61, fd77; +fma.rn.f64 fd87, fd62, fd76, fd86; +mul.f64 fd88, fd52, fd87; +mul.f64 fd89, fd35, fd87; +mul.f64 fd90, fd85, fd52; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd91, fd26, fd25; +add.f64 fd92, fd23, fd22; +st.shared.v2.f64 [r9], {fd92, fd91}; +fma.rn.f64 fd93, fd61, fd34, fd65; +sub.f64 fd94, fd67, fd66; +st.shared.v2.f64 [r9+16], {fd93, fd94}; +fma.rn.f64 fd95, fd70, fd42, fd73; +sub.f64 fd96, fd75, fd74; +st.shared.v2.f64 [r9+32], {fd95, fd96}; +fma.rn.f64 fd97, fd76, fd43, fd80; +sub.f64 fd98, fd82, fd81; +st.shared.v2.f64 [r9+48], {fd97, fd98}; +fma.rn.f64 fd99, fd85, fd35, fd88; +sub.f64 fd100, fd90, fd89; +st.shared.v2.f64 [r9+64], {fd99, fd100}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd101, fd102}, [r11]; +ld.shared.v2.f64 {fd105, fd106}, [r11+400]; +ld.shared.v2.f64 {fd109, fd110}, [r11+800]; +ld.shared.v2.f64 {fd113, fd114}, [r11+1200]; +ld.shared.v2.f64 {fd117, fd118}, [r11+1600]; +add.f64 fd121, fd105, fd117; +add.f64 fd122, fd101, fd121; +add.f64 fd123, fd109, fd113; +add.f64 fd124, fd106, fd118; +add.f64 fd125, fd102, fd124; +add.f64 fd126, fd110, fd114; +fma.rn.f64 fd127, fd121, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd128, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, fd106, fd118; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd110, fd114; +fma.rn.f64 fd133, fd132, 0d3FE2CF2304755A5E, fd131; +sub.f64 fd134, fd129, fd133; +add.f64 fd135, fd133, fd129; +mul.f64 fd136, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd137, fd101, fd136; +fma.rn.f64 fd138, fd123, 0d3FD3C6EF372FE950, fd137; +mul.f64 fd139, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd140, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +sub.f64 fd142, fd138, fd141; +add.f64 
fd143, fd141, fd138; +fma.rn.f64 fd144, fd124, 0d3FD3C6EF372FE950, fd102; +mul.f64 fd145, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd144, fd145; +sub.f64 fd147, fd105, fd117; +mul.f64 fd148, fd147, 0d3FEE6F0E134454FF; +sub.f64 fd149, fd109, fd113; +fma.rn.f64 fd150, fd149, 0d3FE2CF2304755A5E, fd148; +add.f64 fd151, fd150, fd146; +sub.f64 fd152, fd146, fd150; +mul.f64 fd153, fd124, 0d3FE9E3779B97F4A8; +sub.f64 fd154, fd102, fd153; +fma.rn.f64 fd155, fd126, 0d3FD3C6EF372FE950, fd154; +mul.f64 fd156, fd147, 0d3FE2CF2304755A5E; +mul.f64 fd157, fd149, 0d3FEE6F0E134454FF; +sub.f64 fd158, fd156, fd157; +add.f64 fd159, fd158, fd155; +sub.f64 fd160, fd155, fd158; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd161, fd162}, [rd11]; +mul.f64 fd165, fd151, fd162; +mul.f64 fd166, fd134, fd162; +mul.f64 fd167, fd161, fd151; +mul.f64 fd168, fd161, fd161; +mul.f64 fd169, fd162, fd162; +sub.f64 fd170, fd168, fd169; +mul.f64 fd171, fd162, fd161; +fma.rn.f64 fd172, fd162, fd161, fd171; +mul.f64 fd173, fd159, fd172; +mul.f64 fd174, fd142, fd172; +mul.f64 fd175, fd170, fd159; +ld.global.v2.f64 {fd176, fd177}, [rd11+80]; +mul.f64 fd180, fd160, fd177; +mul.f64 fd181, fd143, fd177; +mul.f64 fd182, fd176, fd160; +mul.f64 fd183, fd161, fd176; +mul.f64 fd184, fd162, fd177; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd161, fd177; +fma.rn.f64 fd187, fd162, fd176, fd186; +mul.f64 fd188, fd152, fd187; +mul.f64 fd189, fd135, fd187; +mul.f64 fd190, fd185, fd152; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd191, fd126, fd125; +add.f64 fd192, fd123, fd122; +st.shared.v2.f64 [r17], {fd192, fd191}; +fma.rn.f64 fd193, fd161, fd134, fd165; +sub.f64 fd194, fd167, fd166; +st.shared.v2.f64 [r17+80], {fd193, fd194}; +fma.rn.f64 fd195, fd170, fd142, fd173; +sub.f64 fd196, 
fd175, fd174; +st.shared.v2.f64 [r17+160], {fd195, fd196}; +fma.rn.f64 fd197, fd176, fd143, fd180; +sub.f64 fd198, fd182, fd181; +st.shared.v2.f64 [r17+240], {fd197, fd198}; +fma.rn.f64 fd199, fd185, fd135, fd188; +sub.f64 fd200, fd190, fd189; +st.shared.v2.f64 [r17+320], {fd199, fd200}; +barrier.sync 0; +ld.shared.v2.f64 {fd201, fd202}, [r11]; +ld.shared.v2.f64 {fd205, fd206}, [r11+400]; +ld.shared.v2.f64 {fd209, fd210}, [r11+800]; +ld.shared.v2.f64 {fd213, fd214}, [r11+1200]; +ld.shared.v2.f64 {fd217, fd218}, [r11+1600]; +add.f64 fd221, fd205, fd217; +add.f64 fd222, fd201, fd221; +add.f64 fd223, fd209, fd213; +add.f64 fd224, fd206, fd218; +add.f64 fd225, fd202, fd224; +add.f64 fd226, fd210, fd214; +fma.rn.f64 fd227, fd221, 0d3FD3C6EF372FE950, fd201; +mul.f64 fd228, fd223, 0d3FE9E3779B97F4A8; +sub.f64 fd229, fd227, fd228; +sub.f64 fd230, fd206, fd218; +mul.f64 fd231, fd230, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd210, fd214; +fma.rn.f64 fd233, fd232, 0d3FE2CF2304755A5E, fd231; +mul.f64 fd234, fd221, 0d3FE9E3779B97F4A8; +sub.f64 fd235, fd201, fd234; +fma.rn.f64 fd236, fd223, 0d3FD3C6EF372FE950, fd235; +mul.f64 fd237, fd230, 0d3FE2CF2304755A5E; +mul.f64 fd238, fd232, 0d3FEE6F0E134454FF; +sub.f64 fd239, fd237, fd238; +fma.rn.f64 fd240, fd224, 0d3FD3C6EF372FE950, fd202; +mul.f64 fd241, fd226, 0d3FE9E3779B97F4A8; +sub.f64 fd242, fd240, fd241; +sub.f64 fd243, fd205, fd217; +mul.f64 fd244, fd243, 0d3FEE6F0E134454FF; +sub.f64 fd245, fd209, fd213; +fma.rn.f64 fd246, fd245, 0d3FE2CF2304755A5E, fd244; +mul.f64 fd247, fd224, 0d3FE9E3779B97F4A8; +sub.f64 fd248, fd202, fd247; +fma.rn.f64 fd249, fd226, 0d3FD3C6EF372FE950, fd248; +mul.f64 fd250, fd243, 0d3FE2CF2304755A5E; +mul.f64 fd251, fd245, 0d3FEE6F0E134454FF; +sub.f64 fd252, fd250, fd251; +add.f64 %1, fd226, fd225; +add.f64 %0, fd223, fd222; +add.f64 %3, fd246, fd242; +sub.f64 %2, fd229, fd233; +add.f64 %5, fd252, fd249; +sub.f64 %4, fd236, fd239; +sub.f64 %7, fd249, fd252; +add.f64 %6, fd239, fd236; +sub.f64 %9, fd242, fd246; 
+add.f64 %8, fd233, fd229; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<709, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<1099>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 1000, r2; +add.f64 fd101, %65, %105; +add.f64 fd102, %52, fd101; +add.f64 fd103, %78, %92; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %67, %107; +add.f64 fd106, %53, fd105; +add.f64 fd107, %80, %93; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %52; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %67, %107; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %80, %93; +fma.rn.f64 fd115, fd114, 0d3FE2CF2304755A5E, fd113; +sub.f64 fd116, fd111, fd115; +add.f64 fd117, fd115, fd111; +mul.f64 fd118, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd119, %52, fd118; +fma.rn.f64 fd120, fd103, 0d3FD3C6EF372FE950, fd119; +mul.f64 fd121, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd122, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd120, fd123; +add.f64 fd125, fd123, fd120; +fma.rn.f64 fd126, fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd127, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd128, fd126, fd127; +sub.f64 fd129, %65, %105; +mul.f64 fd130, fd129, 0d3FEE6F0E134454FF; +sub.f64 fd131, %78, %92; +fma.rn.f64 fd132, fd131, 0d3FE2CF2304755A5E, fd130; +add.f64 fd133, fd132, fd128; +sub.f64 fd134, fd128, fd132; +mul.f64 fd135, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd136, %53, fd135; 
+fma.rn.f64 fd137, fd107, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd138, fd129, 0d3FE2CF2304755A5E; +mul.f64 fd139, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd140, fd138, fd139; +add.f64 fd141, fd140, fd137; +sub.f64 fd142, fd137, fd140; +add.f64 fd143, %68, %108; +add.f64 fd144, %54, fd143; +add.f64 fd145, %81, %94; +add.f64 fd146, fd145, fd144; +add.f64 fd147, %69, %109; +add.f64 fd148, %56, fd147; +add.f64 fd149, %83, %96; +add.f64 fd150, fd149, fd148; +fma.rn.f64 fd151, fd143, 0d3FD3C6EF372FE950, %54; +mul.f64 fd152, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd151, fd152; +sub.f64 fd154, %69, %109; +mul.f64 fd155, fd154, 0d3FEE6F0E134454FF; +sub.f64 fd156, %83, %96; +fma.rn.f64 fd157, fd156, 0d3FE2CF2304755A5E, fd155; +sub.f64 fd158, fd153, fd157; +add.f64 fd159, fd157, fd153; +mul.f64 fd160, fd143, 0d3FE9E3779B97F4A8; +sub.f64 fd161, %54, fd160; +fma.rn.f64 fd162, fd145, 0d3FD3C6EF372FE950, fd161; +mul.f64 fd163, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd164, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, fd162, fd165; +add.f64 fd167, fd165, fd162; +fma.rn.f64 fd168, fd147, 0d3FD3C6EF372FE950, %56; +mul.f64 fd169, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd170, fd168, fd169; +sub.f64 fd171, %68, %108; +mul.f64 fd172, fd171, 0d3FEE6F0E134454FF; +sub.f64 fd173, %81, %94; +fma.rn.f64 fd174, fd173, 0d3FE2CF2304755A5E, fd172; +add.f64 fd175, fd174, fd170; +sub.f64 fd176, fd170, fd174; +mul.f64 fd177, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd178, %56, fd177; +fma.rn.f64 fd179, fd149, 0d3FD3C6EF372FE950, fd178; +mul.f64 fd180, fd171, 0d3FE2CF2304755A5E; +mul.f64 fd181, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd182, fd180, fd181; +add.f64 fd183, fd182, fd179; +sub.f64 fd184, fd179, fd182; +add.f64 fd185, %70, %110; +add.f64 fd186, %57, fd185; +add.f64 fd187, %84, %97; +add.f64 fd188, fd187, fd186; +add.f64 fd189, %72, %112; +add.f64 fd190, %59, fd189; +add.f64 fd191, %85, %99; +add.f64 fd192, fd191, fd190; +fma.rn.f64 fd193, fd185, 0d3FD3C6EF372FE950, %57; +mul.f64 
fd194, fd187, 0d3FE9E3779B97F4A8; +sub.f64 fd195, fd193, fd194; +sub.f64 fd196, %72, %112; +mul.f64 fd197, fd196, 0d3FEE6F0E134454FF; +sub.f64 fd198, %85, %99; +fma.rn.f64 fd199, fd198, 0d3FE2CF2304755A5E, fd197; +sub.f64 fd200, fd195, fd199; +add.f64 fd201, fd199, fd195; +mul.f64 fd202, fd185, 0d3FE9E3779B97F4A8; +sub.f64 fd203, %57, fd202; +fma.rn.f64 fd204, fd187, 0d3FD3C6EF372FE950, fd203; +mul.f64 fd205, fd196, 0d3FE2CF2304755A5E; +mul.f64 fd206, fd198, 0d3FEE6F0E134454FF; +sub.f64 fd207, fd205, fd206; +sub.f64 fd208, fd204, fd207; +add.f64 fd209, fd207, fd204; +fma.rn.f64 fd210, fd189, 0d3FD3C6EF372FE950, %59; +mul.f64 fd211, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd212, fd210, fd211; +sub.f64 fd213, %70, %110; +mul.f64 fd214, fd213, 0d3FEE6F0E134454FF; +sub.f64 fd215, %84, %97; +fma.rn.f64 fd216, fd215, 0d3FE2CF2304755A5E, fd214; +add.f64 fd217, fd216, fd212; +sub.f64 fd218, fd212, fd216; +mul.f64 fd219, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd220, %59, fd219; +fma.rn.f64 fd221, fd191, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd213, 0d3FE2CF2304755A5E; +mul.f64 fd223, fd215, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd224, fd221; +sub.f64 fd226, fd221, fd224; +add.f64 fd227, %73, %113; +add.f64 fd228, %60, fd227; +add.f64 fd229, %86, %100; +add.f64 fd230, fd229, fd228; +add.f64 fd231, %75, %115; +add.f64 fd232, %61, fd231; +add.f64 fd233, %88, %101; +add.f64 fd234, fd233, fd232; +fma.rn.f64 fd235, fd227, 0d3FD3C6EF372FE950, %60; +mul.f64 fd236, fd229, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, %75, %115; +mul.f64 fd239, fd238, 0d3FEE6F0E134454FF; +sub.f64 fd240, %88, %101; +fma.rn.f64 fd241, fd240, 0d3FE2CF2304755A5E, fd239; +sub.f64 fd242, fd237, fd241; +add.f64 fd243, fd241, fd237; +mul.f64 fd244, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd245, %60, fd244; +fma.rn.f64 fd246, fd229, 0d3FD3C6EF372FE950, fd245; +mul.f64 fd247, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd248, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd247, 
fd248; +sub.f64 fd250, fd246, fd249; +add.f64 fd251, fd249, fd246; +fma.rn.f64 fd252, fd231, 0d3FD3C6EF372FE950, %61; +mul.f64 fd253, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd252, fd253; +sub.f64 fd255, %73, %113; +mul.f64 fd256, fd255, 0d3FEE6F0E134454FF; +sub.f64 fd257, %86, %100; +fma.rn.f64 fd258, fd257, 0d3FE2CF2304755A5E, fd256; +add.f64 fd259, fd258, fd254; +sub.f64 fd260, fd254, fd258; +mul.f64 fd261, fd231, 0d3FE9E3779B97F4A8; +sub.f64 fd262, %61, fd261; +fma.rn.f64 fd263, fd233, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd264, fd255, 0d3FE2CF2304755A5E; +mul.f64 fd265, fd257, 0d3FEE6F0E134454FF; +sub.f64 fd266, fd264, fd265; +add.f64 fd267, fd266, fd263; +sub.f64 fd268, fd263, fd266; +add.f64 fd269, %76, %116; +add.f64 fd270, %62, fd269; +add.f64 fd271, %89, %102; +add.f64 fd272, fd271, fd270; +add.f64 fd273, %77, %117; +add.f64 fd274, %64, fd273; +add.f64 fd275, %91, %104; +add.f64 fd276, fd275, fd274; +fma.rn.f64 fd277, fd269, 0d3FD3C6EF372FE950, %62; +mul.f64 fd278, fd271, 0d3FE9E3779B97F4A8; +sub.f64 fd279, fd277, fd278; +sub.f64 fd280, %77, %117; +mul.f64 fd281, fd280, 0d3FEE6F0E134454FF; +sub.f64 fd282, %91, %104; +fma.rn.f64 fd283, fd282, 0d3FE2CF2304755A5E, fd281; +sub.f64 fd284, fd279, fd283; +add.f64 fd285, fd283, fd279; +mul.f64 fd286, fd269, 0d3FE9E3779B97F4A8; +sub.f64 fd287, %62, fd286; +fma.rn.f64 fd288, fd271, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd289, fd280, 0d3FE2CF2304755A5E; +mul.f64 fd290, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd291, fd289, fd290; +sub.f64 fd292, fd288, fd291; +add.f64 fd293, fd291, fd288; +fma.rn.f64 fd294, fd273, 0d3FD3C6EF372FE950, %64; +mul.f64 fd295, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd294, fd295; +sub.f64 fd297, %76, %116; +mul.f64 fd298, fd297, 0d3FEE6F0E134454FF; +sub.f64 fd299, %89, %102; +fma.rn.f64 fd300, fd299, 0d3FE2CF2304755A5E, fd298; +add.f64 fd301, fd300, fd296; +sub.f64 fd302, fd296, fd300; +mul.f64 fd303, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd304, %64, fd303; +fma.rn.f64 fd305, fd275, 
0d3FD3C6EF372FE950, fd304; +mul.f64 fd306, fd297, 0d3FE2CF2304755A5E; +mul.f64 fd307, fd299, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd306, fd307; +add.f64 fd309, fd308, fd305; +sub.f64 fd310, fd305, fd308; +mov.u32 r4, %tid.x; +mul.f64 fd311, fd158, 0d3FEEFEA21D101EE0; +mul.f64 fd312, fd175, 0d3FCFD511FA1C0796; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd175, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd315, fd158, 0d3FCFD511FA1C0796, fd314; +mul.f64 fd316, fd200, 0d3FEC0AB44E81C059; +mul.f64 fd317, fd217, 0d3FDED50D5CBFA951; +sub.f64 fd318, fd316, fd317; +mul.f64 fd319, fd217, 0d3FEC0AB44E81C059; +fma.rn.f64 fd320, fd200, 0d3FDED50D5CBFA951, fd319; +mul.f64 fd321, fd242, 0d3FE753B603D2B816; +mul.f64 fd322, fd259, 0d3FE5E7CF55112014; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd259, 0d3FE753B603D2B816; +fma.rn.f64 fd325, fd242, 0d3FE5E7CF55112014, fd324; +mul.f64 fd326, fd284, 0d3FE1257E3C182B51; +mul.f64 fd327, fd301, 0d3FEB04BBFF642E86; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd301, 0d3FE1257E3C182B51; +fma.rn.f64 fd330, fd284, 0d3FEB04BBFF642E86, fd329; +mul.f64 fd331, fd166, 0d3FEC0AB44E81C059; +mul.f64 fd332, fd183, 0d3FDED50D5CBFA951; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd183, 0d3FEC0AB44E81C059; +fma.rn.f64 fd335, fd166, 0d3FDED50D5CBFA951, fd334; +mul.f64 fd336, fd208, 0d3FE1257E3C182B51; +mul.f64 fd337, fd225, 0d3FEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd225, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd208, 0d3FEB04BBFF642E86, fd339; +mul.f64 fd341, fd250, 0d3FB0130A1BE09379; +mul.f64 fd342, fd267, 0d3FEFEFD5BFE443FE; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd267, 0d3FB0130A1BE09379; +fma.rn.f64 fd345, fd250, 0d3FEFEFD5BFE443FE, fd344; +mul.f64 fd346, fd292, 0dBFDB3FF7C925819C; +mul.f64 fd347, fd309, 0d3FECF457DCDC158C; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd309, 0dBFDB3FF7C925819C; +fma.rn.f64 fd350, fd292, 0d3FECF457DCDC158C, fd349; +mul.f64 fd351, fd167, 0d3FE753B603D2B816; +mul.f64 fd352, fd184, 
0d3FE5E7CF55112014; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd184, 0d3FE753B603D2B816; +fma.rn.f64 fd355, fd167, 0d3FE5E7CF55112014, fd354; +mul.f64 fd356, fd209, 0d3FB0130A1BE09379; +mul.f64 fd357, fd226, 0d3FEFEFD5BFE443FE; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd226, 0d3FB0130A1BE09379; +fma.rn.f64 fd360, fd209, 0d3FEFEFD5BFE443FE, fd359; +mul.f64 fd361, fd251, 0dBFE465C6FEB501BC; +mul.f64 fd362, fd268, 0d3FE8A80B635B6BEA; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd268, 0dBFE465C6FEB501BC; +fma.rn.f64 fd365, fd251, 0d3FE8A80B635B6BEA, fd364; +mul.f64 fd366, fd293, 0dBFEFBF675480D903; +mul.f64 fd367, fd310, 0d3FC00AEB5DA15BE0; +sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd310, 0dBFEFBF675480D903; +fma.rn.f64 fd370, fd293, 0d3FC00AEB5DA15BE0, fd369; +mul.f64 fd371, fd159, 0d3FE1257E3C182B51; +mul.f64 fd372, fd176, 0d3FEB04BBFF642E86; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd176, 0d3FE1257E3C182B51; +fma.rn.f64 fd375, fd159, 0d3FEB04BBFF642E86, fd374; +mul.f64 fd376, fd201, 0dBFDB3FF7C925819C; +mul.f64 fd377, fd218, 0d3FECF457DCDC158C; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd218, 0dBFDB3FF7C925819C; +fma.rn.f64 fd380, fd201, 0d3FECF457DCDC158C, fd379; +mul.f64 fd381, fd243, 0dBFEFBF675480D903; +mul.f64 fd382, fd260, 0d3FC00AEB5DA15BE0; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd260, 0dBFEFBF675480D903; +fma.rn.f64 fd385, fd243, 0d3FC00AEB5DA15BE0, fd384; +mul.f64 fd386, fd285, 0dBFE465C6FEB501BC; +mul.f64 fd387, fd302, 0dBFE8A80B635B6BEA; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd302, 0dBFE465C6FEB501BC; +fma.rn.f64 fd390, fd285, 0dBFE8A80B635B6BEA, fd389; +add.f64 fd391, fd146, fd272; +add.f64 fd392, fd104, fd391; +add.f64 fd393, fd188, fd230; +add.f64 fd394, fd393, fd392; +add.f64 fd395, fd150, fd276; +add.f64 fd396, fd108, fd395; +add.f64 fd397, fd192, fd234; +add.f64 fd398, fd397, fd396; +fma.rn.f64 fd399, fd391, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd400, fd393, 0d3FE9E3779B97F4A8; +sub.f64 fd401, fd399, fd400; 
+sub.f64 fd402, fd150, fd276; +mul.f64 fd403, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd404, fd192, fd234; +fma.rn.f64 fd405, fd404, 0d3FE2CF2304755A5E, fd403; +sub.f64 fd406, fd401, fd405; +add.f64 fd407, fd405, fd401; +mul.f64 fd408, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd409, fd104, fd408; +fma.rn.f64 fd410, fd393, 0d3FD3C6EF372FE950, fd409; +mul.f64 fd411, fd402, 0d3FE2CF2304755A5E; +mul.f64 fd412, fd404, 0d3FEE6F0E134454FF; +sub.f64 fd413, fd411, fd412; +sub.f64 fd414, fd410, fd413; +add.f64 fd415, fd413, fd410; +fma.rn.f64 fd416, fd395, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd417, fd397, 0d3FE9E3779B97F4A8; +sub.f64 fd418, fd416, fd417; +sub.f64 fd419, fd146, fd272; +mul.f64 fd420, fd419, 0d3FEE6F0E134454FF; +sub.f64 fd421, fd188, fd230; +fma.rn.f64 fd422, fd421, 0d3FE2CF2304755A5E, fd420; +add.f64 fd423, fd422, fd418; +sub.f64 fd424, fd418, fd422; +mul.f64 fd425, fd395, 0d3FE9E3779B97F4A8; +sub.f64 fd426, fd108, fd425; +fma.rn.f64 fd427, fd397, 0d3FD3C6EF372FE950, fd426; +mul.f64 fd428, fd419, 0d3FE2CF2304755A5E; +mul.f64 fd429, fd421, 0d3FEE6F0E134454FF; +sub.f64 fd430, fd428, fd429; +add.f64 fd431, fd430, fd427; +sub.f64 fd432, fd427, fd430; +add.f64 fd433, fd313, fd328; +add.f64 fd434, fd116, fd433; +add.f64 fd435, fd318, fd323; +add.f64 fd436, fd435, fd434; +add.f64 fd437, fd315, fd330; +add.f64 fd438, fd133, fd437; +add.f64 fd439, fd320, fd325; +add.f64 fd440, fd439, fd438; +fma.rn.f64 fd441, fd433, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd442, fd435, 0d3FE9E3779B97F4A8; +sub.f64 fd443, fd441, fd442; +sub.f64 fd444, fd315, fd330; +mul.f64 fd445, fd444, 0d3FEE6F0E134454FF; +sub.f64 fd446, fd320, fd325; +fma.rn.f64 fd447, fd446, 0d3FE2CF2304755A5E, fd445; +sub.f64 fd448, fd443, fd447; +add.f64 fd449, fd447, fd443; +mul.f64 fd450, fd433, 0d3FE9E3779B97F4A8; +sub.f64 fd451, fd116, fd450; +fma.rn.f64 fd452, fd435, 0d3FD3C6EF372FE950, fd451; +mul.f64 fd453, fd444, 0d3FE2CF2304755A5E; +mul.f64 fd454, fd446, 0d3FEE6F0E134454FF; +sub.f64 fd455, fd453, fd454; +sub.f64 
fd456, fd452, fd455; +add.f64 fd457, fd455, fd452; +fma.rn.f64 fd458, fd437, 0d3FD3C6EF372FE950, fd133; +mul.f64 fd459, fd439, 0d3FE9E3779B97F4A8; +sub.f64 fd460, fd458, fd459; +sub.f64 fd461, fd313, fd328; +mul.f64 fd462, fd461, 0d3FEE6F0E134454FF; +sub.f64 fd463, fd318, fd323; +fma.rn.f64 fd464, fd463, 0d3FE2CF2304755A5E, fd462; +add.f64 fd465, fd464, fd460; +sub.f64 fd466, fd460, fd464; +mul.f64 fd467, fd437, 0d3FE9E3779B97F4A8; +sub.f64 fd468, fd133, fd467; +fma.rn.f64 fd469, fd439, 0d3FD3C6EF372FE950, fd468; +mul.f64 fd470, fd461, 0d3FE2CF2304755A5E; +mul.f64 fd471, fd463, 0d3FEE6F0E134454FF; +sub.f64 fd472, fd470, fd471; +add.f64 fd473, fd472, fd469; +sub.f64 fd474, fd469, fd472; +add.f64 fd475, fd333, fd348; +add.f64 fd476, fd124, fd475; +add.f64 fd477, fd338, fd343; +add.f64 fd478, fd477, fd476; +add.f64 fd479, fd335, fd350; +add.f64 fd480, fd141, fd479; +add.f64 fd481, fd340, fd345; +add.f64 fd482, fd481, fd480; +fma.rn.f64 fd483, fd475, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd484, fd477, 0d3FE9E3779B97F4A8; +sub.f64 fd485, fd483, fd484; +sub.f64 fd486, fd335, fd350; +mul.f64 fd487, fd486, 0d3FEE6F0E134454FF; +sub.f64 fd488, fd340, fd345; +fma.rn.f64 fd489, fd488, 0d3FE2CF2304755A5E, fd487; +sub.f64 fd490, fd485, fd489; +add.f64 fd491, fd489, fd485; +mul.f64 fd492, fd475, 0d3FE9E3779B97F4A8; +sub.f64 fd493, fd124, fd492; +fma.rn.f64 fd494, fd477, 0d3FD3C6EF372FE950, fd493; +mul.f64 fd495, fd486, 0d3FE2CF2304755A5E; +mul.f64 fd496, fd488, 0d3FEE6F0E134454FF; +sub.f64 fd497, fd495, fd496; +sub.f64 fd498, fd494, fd497; +add.f64 fd499, fd497, fd494; +fma.rn.f64 fd500, fd479, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd501, fd481, 0d3FE9E3779B97F4A8; +sub.f64 fd502, fd500, fd501; +sub.f64 fd503, fd333, fd348; +mul.f64 fd504, fd503, 0d3FEE6F0E134454FF; +sub.f64 fd505, fd338, fd343; +fma.rn.f64 fd506, fd505, 0d3FE2CF2304755A5E, fd504; +add.f64 fd507, fd506, fd502; +sub.f64 fd508, fd502, fd506; +mul.f64 fd509, fd479, 0d3FE9E3779B97F4A8; +sub.f64 fd510, fd141, fd509; 
+fma.rn.f64 fd511, fd481, 0d3FD3C6EF372FE950, fd510; +mul.f64 fd512, fd503, 0d3FE2CF2304755A5E; +mul.f64 fd513, fd505, 0d3FEE6F0E134454FF; +sub.f64 fd514, fd512, fd513; +add.f64 fd515, fd514, fd511; +sub.f64 fd516, fd511, fd514; +add.f64 fd517, fd353, fd368; +add.f64 fd518, fd125, fd517; +add.f64 fd519, fd358, fd363; +add.f64 fd520, fd519, fd518; +add.f64 fd521, fd355, fd370; +add.f64 fd522, fd142, fd521; +add.f64 fd523, fd360, fd365; +add.f64 fd524, fd523, fd522; +fma.rn.f64 fd525, fd517, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd526, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd527, fd525, fd526; +sub.f64 fd528, fd355, fd370; +mul.f64 fd529, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd530, fd360, fd365; +fma.rn.f64 fd531, fd530, 0d3FE2CF2304755A5E, fd529; +sub.f64 fd532, fd527, fd531; +add.f64 fd533, fd531, fd527; +mul.f64 fd534, fd517, 0d3FE9E3779B97F4A8; +sub.f64 fd535, fd125, fd534; +fma.rn.f64 fd536, fd519, 0d3FD3C6EF372FE950, fd535; +mul.f64 fd537, fd528, 0d3FE2CF2304755A5E; +mul.f64 fd538, fd530, 0d3FEE6F0E134454FF; +sub.f64 fd539, fd537, fd538; +sub.f64 fd540, fd536, fd539; +add.f64 fd541, fd539, fd536; +fma.rn.f64 fd542, fd521, 0d3FD3C6EF372FE950, fd142; +mul.f64 fd543, fd523, 0d3FE9E3779B97F4A8; +sub.f64 fd544, fd542, fd543; +sub.f64 fd545, fd353, fd368; +mul.f64 fd546, fd545, 0d3FEE6F0E134454FF; +sub.f64 fd547, fd358, fd363; +fma.rn.f64 fd548, fd547, 0d3FE2CF2304755A5E, fd546; +add.f64 fd549, fd548, fd544; +sub.f64 fd550, fd544, fd548; +mul.f64 fd551, fd521, 0d3FE9E3779B97F4A8; +sub.f64 fd552, fd142, fd551; +fma.rn.f64 fd553, fd523, 0d3FD3C6EF372FE950, fd552; +mul.f64 fd554, fd545, 0d3FE2CF2304755A5E; +mul.f64 fd555, fd547, 0d3FEE6F0E134454FF; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd556, fd553; +sub.f64 fd558, fd553, fd556; +add.f64 fd559, fd373, fd388; +add.f64 fd560, fd117, fd559; +add.f64 fd561, fd378, fd383; +add.f64 fd562, fd561, fd560; +add.f64 fd563, fd375, fd390; +add.f64 fd564, fd134, fd563; +add.f64 fd565, fd380, fd385; +add.f64 fd566, fd565, fd564; 
+fma.rn.f64 fd567, fd559, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd568, fd561, 0d3FE9E3779B97F4A8; +sub.f64 fd569, fd567, fd568; +sub.f64 fd570, fd375, fd390; +mul.f64 fd571, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd572, fd380, fd385; +fma.rn.f64 fd573, fd572, 0d3FE2CF2304755A5E, fd571; +sub.f64 fd574, fd569, fd573; +add.f64 fd575, fd573, fd569; +mul.f64 fd576, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd577, fd117, fd576; +fma.rn.f64 fd578, fd561, 0d3FD3C6EF372FE950, fd577; +mul.f64 fd579, fd570, 0d3FE2CF2304755A5E; +mul.f64 fd580, fd572, 0d3FEE6F0E134454FF; +sub.f64 fd581, fd579, fd580; +sub.f64 fd582, fd578, fd581; +add.f64 fd583, fd581, fd578; +fma.rn.f64 fd584, fd563, 0d3FD3C6EF372FE950, fd134; +mul.f64 fd585, fd565, 0d3FE9E3779B97F4A8; +sub.f64 fd586, fd584, fd585; +sub.f64 fd587, fd373, fd388; +mul.f64 fd588, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd589, fd378, fd383; +fma.rn.f64 fd590, fd589, 0d3FE2CF2304755A5E, fd588; +add.f64 fd591, fd590, fd586; +sub.f64 fd592, fd586, fd590; +mul.f64 fd593, fd563, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd134, fd593; +fma.rn.f64 fd595, fd565, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd587, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd589, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd596, fd597; +add.f64 fd599, fd598, fd595; +sub.f64 fd600, fd595, fd598; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd601, fd602}, [rd6]; +mul.f64 fd605, fd440, fd602; +fma.rn.f64 fd606, fd601, fd436, fd605; +mul.f64 fd607, fd436, fd602; +mul.f64 fd608, fd601, fd440; +sub.f64 fd609, fd608, fd607; +mul.f64 fd610, fd601, fd601; +mul.f64 fd611, fd602, fd602; +sub.f64 fd612, fd610, fd611; +mul.f64 fd613, fd602, fd601; +fma.rn.f64 fd614, fd602, fd601, fd613; +mul.f64 fd615, fd482, fd614; +fma.rn.f64 fd616, fd612, fd478, fd615; +mul.f64 fd617, fd478, fd614; +mul.f64 fd618, fd612, fd482; +sub.f64 fd619, fd618, 
fd617; +mul.f64 fd620, fd601, fd612; +mul.f64 fd621, fd602, fd614; +sub.f64 fd622, fd620, fd621; +mul.f64 fd623, fd601, fd614; +fma.rn.f64 fd624, fd602, fd612, fd623; +mul.f64 fd625, fd524, fd624; +fma.rn.f64 fd626, fd622, fd520, fd625; +mul.f64 fd627, fd520, fd624; +mul.f64 fd628, fd622, fd524; +sub.f64 fd629, fd628, fd627; +mul.f64 fd630, fd601, fd622; +mul.f64 fd631, fd602, fd624; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd601, fd624; +fma.rn.f64 fd634, fd602, fd622, fd633; +mul.f64 fd635, fd566, fd634; +fma.rn.f64 fd636, fd632, fd562, fd635; +mul.f64 fd637, fd562, fd634; +mul.f64 fd638, fd632, fd566; +sub.f64 fd639, fd638, fd637; +mul.f64 fd640, fd601, fd632; +mul.f64 fd641, fd602, fd634; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd601, fd634; +fma.rn.f64 fd644, fd602, fd632, fd643; +mul.f64 fd645, fd423, fd644; +fma.rn.f64 fd646, fd642, fd406, fd645; +mul.f64 fd647, fd406, fd644; +mul.f64 fd648, fd642, fd423; +sub.f64 fd649, fd648, fd647; +mul.f64 fd650, fd601, fd642; +mul.f64 fd651, fd602, fd644; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd601, fd644; +fma.rn.f64 fd654, fd602, fd642, fd653; +mul.f64 fd655, fd465, fd654; +fma.rn.f64 fd656, fd652, fd448, fd655; +mul.f64 fd657, fd448, fd654; +mul.f64 fd658, fd652, fd465; +sub.f64 fd659, fd658, fd657; +mul.f64 fd660, fd601, fd652; +mul.f64 fd661, fd602, fd654; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd601, fd654; +fma.rn.f64 fd664, fd602, fd652, fd663; +mul.f64 fd665, fd507, fd664; +fma.rn.f64 fd666, fd662, fd490, fd665; +mul.f64 fd667, fd490, fd664; +mul.f64 fd668, fd662, fd507; +sub.f64 fd669, fd668, fd667; +mul.f64 fd670, fd601, fd662; +mul.f64 fd671, fd602, fd664; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd601, fd664; +fma.rn.f64 fd674, fd602, fd662, fd673; +mul.f64 fd675, fd549, fd674; +fma.rn.f64 fd676, fd672, fd532, fd675; +mul.f64 fd677, fd532, fd674; +mul.f64 fd678, fd672, fd549; +sub.f64 fd679, fd678, fd677; +mul.f64 fd680, fd601, fd672; +mul.f64 fd681, fd602, fd674; +sub.f64 
fd682, fd680, fd681; +mul.f64 fd683, fd601, fd674; +fma.rn.f64 fd684, fd602, fd672, fd683; +mul.f64 fd685, fd591, fd684; +fma.rn.f64 fd686, fd682, fd574, fd685; +mul.f64 fd687, fd574, fd684; +mul.f64 fd688, fd682, fd591; +sub.f64 fd689, fd688, fd687; +mul.f64 fd690, fd601, fd682; +mul.f64 fd691, fd602, fd684; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd601, fd684; +fma.rn.f64 fd694, fd602, fd682, fd693; +mul.f64 fd695, fd431, fd694; +fma.rn.f64 fd696, fd692, fd414, fd695; +mul.f64 fd697, fd414, fd694; +mul.f64 fd698, fd692, fd431; +sub.f64 fd699, fd698, fd697; +mul.f64 fd700, fd601, fd692; +mul.f64 fd701, fd602, fd694; +sub.f64 fd702, fd700, fd701; +mul.f64 fd703, fd601, fd694; +fma.rn.f64 fd704, fd602, fd692, fd703; +mul.f64 fd705, fd473, fd704; +fma.rn.f64 fd706, fd702, fd456, fd705; +mul.f64 fd707, fd456, fd704; +mul.f64 fd708, fd702, fd473; +sub.f64 fd709, fd708, fd707; +mul.f64 fd710, fd601, fd702; +mul.f64 fd711, fd602, fd704; +sub.f64 fd712, fd710, fd711; +mul.f64 fd713, fd601, fd704; +fma.rn.f64 fd714, fd602, fd702, fd713; +mul.f64 fd715, fd515, fd714; +fma.rn.f64 fd716, fd712, fd498, fd715; +mul.f64 fd717, fd498, fd714; +mul.f64 fd718, fd712, fd515; +sub.f64 fd719, fd718, fd717; +ld.global.v2.f64 {fd720, fd721}, [rd6+80]; +mul.f64 fd724, fd557, fd721; +fma.rn.f64 fd725, fd720, fd540, fd724; +mul.f64 fd726, fd540, fd721; +mul.f64 fd727, fd720, fd557; +sub.f64 fd728, fd727, fd726; +mul.f64 fd729, fd601, fd720; +mul.f64 fd730, fd602, fd721; +sub.f64 fd731, fd729, fd730; +mul.f64 fd732, fd601, fd721; +fma.rn.f64 fd733, fd602, fd720, fd732; +mul.f64 fd734, fd599, fd733; +fma.rn.f64 fd735, fd731, fd582, fd734; +mul.f64 fd736, fd582, fd733; +mul.f64 fd737, fd731, fd599; +sub.f64 fd738, fd737, fd736; +mul.f64 fd739, fd601, fd731; +mul.f64 fd740, fd602, fd733; +sub.f64 fd741, fd739, fd740; +mul.f64 fd742, fd601, fd733; +fma.rn.f64 fd743, fd602, fd731, fd742; +mul.f64 fd744, fd432, fd743; +fma.rn.f64 fd745, fd741, fd415, fd744; +mul.f64 fd746, fd415, fd743; 
+mul.f64 fd747, fd741, fd432; +sub.f64 fd748, fd747, fd746; +mul.f64 fd749, fd601, fd741; +mul.f64 fd750, fd602, fd743; +sub.f64 fd751, fd749, fd750; +mul.f64 fd752, fd601, fd743; +fma.rn.f64 fd753, fd602, fd741, fd752; +mul.f64 fd754, fd474, fd753; +fma.rn.f64 fd755, fd751, fd457, fd754; +mul.f64 fd756, fd457, fd753; +mul.f64 fd757, fd751, fd474; +sub.f64 fd758, fd757, fd756; +mul.f64 fd759, fd601, fd751; +mul.f64 fd760, fd602, fd753; +sub.f64 fd761, fd759, fd760; +mul.f64 fd762, fd601, fd753; +fma.rn.f64 fd763, fd602, fd751, fd762; +mul.f64 fd764, fd516, fd763; +fma.rn.f64 fd765, fd761, fd499, fd764; +mul.f64 fd766, fd499, fd763; +mul.f64 fd767, fd761, fd516; +sub.f64 fd768, fd767, fd766; +mul.f64 fd769, fd601, fd761; +mul.f64 fd770, fd602, fd763; +sub.f64 fd771, fd769, fd770; +mul.f64 fd772, fd601, fd763; +fma.rn.f64 fd773, fd602, fd761, fd772; +mul.f64 fd774, fd558, fd773; +fma.rn.f64 fd775, fd771, fd541, fd774; +mul.f64 fd776, fd541, fd773; +mul.f64 fd777, fd771, fd558; +sub.f64 fd778, fd777, fd776; +mul.f64 fd779, fd601, fd771; +mul.f64 fd780, fd602, fd773; +sub.f64 fd781, fd779, fd780; +mul.f64 fd782, fd601, fd773; +fma.rn.f64 fd783, fd602, fd771, fd782; +mul.f64 fd784, fd600, fd783; +fma.rn.f64 fd785, fd781, fd583, fd784; +mul.f64 fd786, fd583, fd783; +mul.f64 fd787, fd781, fd600; +sub.f64 fd788, fd787, fd786; +mul.f64 fd789, fd601, fd781; +mul.f64 fd790, fd602, fd783; +sub.f64 fd791, fd789, fd790; +mul.f64 fd792, fd601, fd783; +fma.rn.f64 fd793, fd602, fd781, fd792; +mul.f64 fd794, fd424, fd793; +fma.rn.f64 fd795, fd791, fd407, fd794; +mul.f64 fd796, fd407, fd793; +mul.f64 fd797, fd791, fd424; +sub.f64 fd798, fd797, fd796; +mul.f64 fd799, fd601, fd791; +mul.f64 fd800, fd602, fd793; +sub.f64 fd801, fd799, fd800; +mul.f64 fd802, fd601, fd793; +fma.rn.f64 fd803, fd602, fd791, fd802; +mul.f64 fd804, fd466, fd803; +fma.rn.f64 fd805, fd801, fd449, fd804; +mul.f64 fd806, fd449, fd803; +mul.f64 fd807, fd801, fd466; +sub.f64 fd808, fd807, fd806; +mul.f64 fd809, 
fd601, fd801; +mul.f64 fd810, fd602, fd803; +sub.f64 fd811, fd809, fd810; +mul.f64 fd812, fd601, fd803; +fma.rn.f64 fd813, fd602, fd801, fd812; +mul.f64 fd814, fd508, fd813; +fma.rn.f64 fd815, fd811, fd491, fd814; +mul.f64 fd816, fd491, fd813; +mul.f64 fd817, fd811, fd508; +sub.f64 fd818, fd817, fd816; +mul.f64 fd819, fd601, fd811; +mul.f64 fd820, fd602, fd813; +sub.f64 fd821, fd819, fd820; +mul.f64 fd822, fd601, fd813; +fma.rn.f64 fd823, fd602, fd811, fd822; +mul.f64 fd824, fd550, fd823; +fma.rn.f64 fd825, fd821, fd533, fd824; +mul.f64 fd826, fd533, fd823; +mul.f64 fd827, fd821, fd550; +sub.f64 fd828, fd827, fd826; +mul.f64 fd829, fd601, fd821; +mul.f64 fd830, fd602, fd823; +sub.f64 fd831, fd829, fd830; +mul.f64 fd832, fd601, fd823; +fma.rn.f64 fd833, fd602, fd821, fd832; +mul.f64 fd834, fd592, fd833; +fma.rn.f64 fd835, fd831, fd575, fd834; +mul.f64 fd836, fd575, fd833; +mul.f64 fd837, fd831, fd592; +sub.f64 fd838, fd837, fd836; +mad.lo.s32 r8, r5, 1000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +st.shared.f64 [r9], fd394; +st.shared.f64 [r9+8], fd606; +st.shared.f64 [r9+16], fd616; +st.shared.f64 [r9+24], fd626; +st.shared.f64 [r9+32], fd636; +st.shared.f64 [r9+40], fd646; +st.shared.f64 [r9+48], fd656; +st.shared.f64 [r9+56], fd666; +st.shared.f64 [r9+64], fd676; +st.shared.f64 [r9+72], fd686; +st.shared.f64 [r9+80], fd696; +st.shared.f64 [r9+88], fd706; +st.shared.f64 [r9+96], fd716; +st.shared.f64 [r9+104], fd725; +st.shared.f64 [r9+112], fd735; +st.shared.f64 [r9+120], fd745; +st.shared.f64 [r9+128], fd755; +st.shared.f64 [r9+136], fd765; +st.shared.f64 [r9+144], fd775; +st.shared.f64 [r9+152], fd785; +st.shared.f64 [r9+160], fd795; +st.shared.f64 [r9+168], fd805; +st.shared.f64 [r9+176], fd815; +st.shared.f64 [r9+184], fd825; +st.shared.f64 [r9+192], fd835; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.f64 fd839, [r10]; +ld.shared.f64 fd840, [r10+40]; +ld.shared.f64 fd841, [r10+80]; +ld.shared.f64 fd842, [r10+120]; +ld.shared.f64 fd843, 
[r10+160]; +ld.shared.f64 fd844, [r10+200]; +ld.shared.f64 fd845, [r10+240]; +ld.shared.f64 fd846, [r10+280]; +ld.shared.f64 fd847, [r10+320]; +ld.shared.f64 fd848, [r10+360]; +ld.shared.f64 fd849, [r10+400]; +ld.shared.f64 fd850, [r10+440]; +ld.shared.f64 fd851, [r10+480]; +ld.shared.f64 fd852, [r10+520]; +ld.shared.f64 fd853, [r10+560]; +ld.shared.f64 fd854, [r10+600]; +ld.shared.f64 fd855, [r10+640]; +ld.shared.f64 fd856, [r10+680]; +ld.shared.f64 fd857, [r10+720]; +ld.shared.f64 fd858, [r10+760]; +ld.shared.f64 fd859, [r10+800]; +ld.shared.f64 fd860, [r10+840]; +ld.shared.f64 fd861, [r10+880]; +ld.shared.f64 fd862, [r10+920]; +ld.shared.f64 fd863, [r10+960]; +barrier.sync 0; +st.shared.f64 [r9], fd398; +st.shared.f64 [r9+8], fd609; +st.shared.f64 [r9+16], fd619; +st.shared.f64 [r9+24], fd629; +st.shared.f64 [r9+32], fd639; +st.shared.f64 [r9+40], fd649; +st.shared.f64 [r9+48], fd659; +st.shared.f64 [r9+56], fd669; +st.shared.f64 [r9+64], fd679; +st.shared.f64 [r9+72], fd689; +st.shared.f64 [r9+80], fd699; +st.shared.f64 [r9+88], fd709; +st.shared.f64 [r9+96], fd719; +st.shared.f64 [r9+104], fd728; +st.shared.f64 [r9+112], fd738; +st.shared.f64 [r9+120], fd748; +st.shared.f64 [r9+128], fd758; +st.shared.f64 [r9+136], fd768; +st.shared.f64 [r9+144], fd778; +st.shared.f64 [r9+152], fd788; +st.shared.f64 [r9+160], fd798; +st.shared.f64 [r9+168], fd808; +st.shared.f64 [r9+176], fd818; +st.shared.f64 [r9+184], fd828; +st.shared.f64 [r9+192], fd838; +barrier.sync 0; +ld.shared.f64 fd864, [r10]; +ld.shared.f64 fd865, [r10+40]; +ld.shared.f64 fd866, [r10+80]; +ld.shared.f64 fd867, [r10+120]; +ld.shared.f64 fd868, [r10+160]; +ld.shared.f64 fd869, [r10+200]; +ld.shared.f64 fd870, [r10+240]; +ld.shared.f64 fd871, [r10+280]; +ld.shared.f64 fd872, [r10+320]; +ld.shared.f64 fd873, [r10+360]; +ld.shared.f64 fd874, [r10+400]; +ld.shared.f64 fd875, [r10+440]; +ld.shared.f64 fd876, [r10+480]; +ld.shared.f64 fd877, [r10+520]; +ld.shared.f64 fd878, [r10+560]; +ld.shared.f64 fd879, 
[r10+600]; +ld.shared.f64 fd880, [r10+640]; +ld.shared.f64 fd881, [r10+680]; +ld.shared.f64 fd882, [r10+720]; +ld.shared.f64 fd883, [r10+760]; +ld.shared.f64 fd884, [r10+800]; +ld.shared.f64 fd885, [r10+840]; +ld.shared.f64 fd886, [r10+880]; +ld.shared.f64 fd887, [r10+920]; +ld.shared.f64 fd888, [r10+960]; +add.f64 fd889, fd844, fd859; +add.f64 fd890, fd839, fd889; +add.f64 fd891, fd849, fd854; +add.f64 fd892, fd869, fd884; +add.f64 fd893, fd864, fd892; +add.f64 fd894, fd874, fd879; +fma.rn.f64 fd895, fd889, 0d3FD3C6EF372FE950, fd839; +mul.f64 fd896, fd891, 0d3FE9E3779B97F4A8; +sub.f64 fd897, fd895, fd896; +sub.f64 fd898, fd869, fd884; +mul.f64 fd899, fd898, 0d3FEE6F0E134454FF; +sub.f64 fd900, fd874, fd879; +fma.rn.f64 fd901, fd900, 0d3FE2CF2304755A5E, fd899; +mul.f64 fd902, fd889, 0d3FE9E3779B97F4A8; +sub.f64 fd903, fd839, fd902; +fma.rn.f64 fd904, fd891, 0d3FD3C6EF372FE950, fd903; +mul.f64 fd905, fd898, 0d3FE2CF2304755A5E; +mul.f64 fd906, fd900, 0d3FEE6F0E134454FF; +sub.f64 fd907, fd905, fd906; +fma.rn.f64 fd908, fd892, 0d3FD3C6EF372FE950, fd864; +mul.f64 fd909, fd894, 0d3FE9E3779B97F4A8; +sub.f64 fd910, fd908, fd909; +sub.f64 fd911, fd844, fd859; +mul.f64 fd912, fd911, 0d3FEE6F0E134454FF; +sub.f64 fd913, fd849, fd854; +fma.rn.f64 fd914, fd913, 0d3FE2CF2304755A5E, fd912; +mul.f64 fd915, fd892, 0d3FE9E3779B97F4A8; +sub.f64 fd916, fd864, fd915; +fma.rn.f64 fd917, fd894, 0d3FD3C6EF372FE950, fd916; +mul.f64 fd918, fd911, 0d3FE2CF2304755A5E; +mul.f64 fd919, fd913, 0d3FEE6F0E134454FF; +sub.f64 fd920, fd918, fd919; +add.f64 fd921, fd845, fd860; +add.f64 fd922, fd840, fd921; +add.f64 fd923, fd850, fd855; +add.f64 fd924, fd870, fd885; +add.f64 fd925, fd865, fd924; +add.f64 fd926, fd875, fd880; +fma.rn.f64 fd927, fd921, 0d3FD3C6EF372FE950, fd840; +mul.f64 fd928, fd923, 0d3FE9E3779B97F4A8; +sub.f64 fd929, fd927, fd928; +sub.f64 fd930, fd870, fd885; +mul.f64 fd931, fd930, 0d3FEE6F0E134454FF; +sub.f64 fd932, fd875, fd880; +fma.rn.f64 fd933, fd932, 0d3FE2CF2304755A5E, fd931; 
+mul.f64 fd934, fd921, 0d3FE9E3779B97F4A8; +sub.f64 fd935, fd840, fd934; +fma.rn.f64 fd936, fd923, 0d3FD3C6EF372FE950, fd935; +mul.f64 fd937, fd930, 0d3FE2CF2304755A5E; +mul.f64 fd938, fd932, 0d3FEE6F0E134454FF; +sub.f64 fd939, fd937, fd938; +fma.rn.f64 fd940, fd924, 0d3FD3C6EF372FE950, fd865; +mul.f64 fd941, fd926, 0d3FE9E3779B97F4A8; +sub.f64 fd942, fd940, fd941; +sub.f64 fd943, fd845, fd860; +mul.f64 fd944, fd943, 0d3FEE6F0E134454FF; +sub.f64 fd945, fd850, fd855; +fma.rn.f64 fd946, fd945, 0d3FE2CF2304755A5E, fd944; +mul.f64 fd947, fd924, 0d3FE9E3779B97F4A8; +sub.f64 fd948, fd865, fd947; +fma.rn.f64 fd949, fd926, 0d3FD3C6EF372FE950, fd948; +mul.f64 fd950, fd943, 0d3FE2CF2304755A5E; +mul.f64 fd951, fd945, 0d3FEE6F0E134454FF; +sub.f64 fd952, fd950, fd951; +add.f64 fd953, fd846, fd861; +add.f64 fd954, fd841, fd953; +add.f64 fd955, fd851, fd856; +add.f64 fd956, fd871, fd886; +add.f64 fd957, fd866, fd956; +add.f64 fd958, fd876, fd881; +fma.rn.f64 fd959, fd953, 0d3FD3C6EF372FE950, fd841; +mul.f64 fd960, fd955, 0d3FE9E3779B97F4A8; +sub.f64 fd961, fd959, fd960; +sub.f64 fd962, fd871, fd886; +mul.f64 fd963, fd962, 0d3FEE6F0E134454FF; +sub.f64 fd964, fd876, fd881; +fma.rn.f64 fd965, fd964, 0d3FE2CF2304755A5E, fd963; +mul.f64 fd966, fd953, 0d3FE9E3779B97F4A8; +sub.f64 fd967, fd841, fd966; +fma.rn.f64 fd968, fd955, 0d3FD3C6EF372FE950, fd967; +mul.f64 fd969, fd962, 0d3FE2CF2304755A5E; +mul.f64 fd970, fd964, 0d3FEE6F0E134454FF; +sub.f64 fd971, fd969, fd970; +fma.rn.f64 fd972, fd956, 0d3FD3C6EF372FE950, fd866; +mul.f64 fd973, fd958, 0d3FE9E3779B97F4A8; +sub.f64 fd974, fd972, fd973; +sub.f64 fd975, fd846, fd861; +mul.f64 fd976, fd975, 0d3FEE6F0E134454FF; +sub.f64 fd977, fd851, fd856; +fma.rn.f64 fd978, fd977, 0d3FE2CF2304755A5E, fd976; +mul.f64 fd979, fd956, 0d3FE9E3779B97F4A8; +sub.f64 fd980, fd866, fd979; +fma.rn.f64 fd981, fd958, 0d3FD3C6EF372FE950, fd980; +mul.f64 fd982, fd975, 0d3FE2CF2304755A5E; +mul.f64 fd983, fd977, 0d3FEE6F0E134454FF; +sub.f64 fd984, fd982, fd983; 
+add.f64 fd985, fd847, fd862; +add.f64 fd986, fd842, fd985; +add.f64 fd987, fd852, fd857; +add.f64 fd988, fd872, fd887; +add.f64 fd989, fd867, fd988; +add.f64 fd990, fd877, fd882; +fma.rn.f64 fd991, fd985, 0d3FD3C6EF372FE950, fd842; +mul.f64 fd992, fd987, 0d3FE9E3779B97F4A8; +sub.f64 fd993, fd991, fd992; +sub.f64 fd994, fd872, fd887; +mul.f64 fd995, fd994, 0d3FEE6F0E134454FF; +sub.f64 fd996, fd877, fd882; +fma.rn.f64 fd997, fd996, 0d3FE2CF2304755A5E, fd995; +mul.f64 fd998, fd985, 0d3FE9E3779B97F4A8; +sub.f64 fd999, fd842, fd998; +fma.rn.f64 fd1000, fd987, 0d3FD3C6EF372FE950, fd999; +mul.f64 fd1001, fd994, 0d3FE2CF2304755A5E; +mul.f64 fd1002, fd996, 0d3FEE6F0E134454FF; +sub.f64 fd1003, fd1001, fd1002; +fma.rn.f64 fd1004, fd988, 0d3FD3C6EF372FE950, fd867; +mul.f64 fd1005, fd990, 0d3FE9E3779B97F4A8; +sub.f64 fd1006, fd1004, fd1005; +sub.f64 fd1007, fd847, fd862; +mul.f64 fd1008, fd1007, 0d3FEE6F0E134454FF; +sub.f64 fd1009, fd852, fd857; +fma.rn.f64 fd1010, fd1009, 0d3FE2CF2304755A5E, fd1008; +mul.f64 fd1011, fd988, 0d3FE9E3779B97F4A8; +sub.f64 fd1012, fd867, fd1011; +fma.rn.f64 fd1013, fd990, 0d3FD3C6EF372FE950, fd1012; +mul.f64 fd1014, fd1007, 0d3FE2CF2304755A5E; +mul.f64 fd1015, fd1009, 0d3FEE6F0E134454FF; +sub.f64 fd1016, fd1014, fd1015; +add.f64 fd1017, fd848, fd863; +add.f64 fd1018, fd843, fd1017; +add.f64 fd1019, fd853, fd858; +add.f64 fd1020, fd873, fd888; +add.f64 fd1021, fd868, fd1020; +add.f64 fd1022, fd878, fd883; +fma.rn.f64 fd1023, fd1017, 0d3FD3C6EF372FE950, fd843; +mul.f64 fd1024, fd1019, 0d3FE9E3779B97F4A8; +sub.f64 fd1025, fd1023, fd1024; +sub.f64 fd1026, fd873, fd888; +mul.f64 fd1027, fd1026, 0d3FEE6F0E134454FF; +sub.f64 fd1028, fd878, fd883; +fma.rn.f64 fd1029, fd1028, 0d3FE2CF2304755A5E, fd1027; +mul.f64 fd1030, fd1017, 0d3FE9E3779B97F4A8; +sub.f64 fd1031, fd843, fd1030; +fma.rn.f64 fd1032, fd1019, 0d3FD3C6EF372FE950, fd1031; +mul.f64 fd1033, fd1026, 0d3FE2CF2304755A5E; +mul.f64 fd1034, fd1028, 0d3FEE6F0E134454FF; +sub.f64 fd1035, fd1033, fd1034; 
+fma.rn.f64 fd1036, fd1020, 0d3FD3C6EF372FE950, fd868; +mul.f64 fd1037, fd1022, 0d3FE9E3779B97F4A8; +sub.f64 fd1038, fd1036, fd1037; +sub.f64 fd1039, fd848, fd863; +mul.f64 fd1040, fd1039, 0d3FEE6F0E134454FF; +sub.f64 fd1041, fd853, fd858; +fma.rn.f64 fd1042, fd1041, 0d3FE2CF2304755A5E, fd1040; +mul.f64 fd1043, fd1020, 0d3FE9E3779B97F4A8; +sub.f64 fd1044, fd868, fd1043; +fma.rn.f64 fd1045, fd1022, 0d3FD3C6EF372FE950, fd1044; +mul.f64 fd1046, fd1039, 0d3FE2CF2304755A5E; +mul.f64 fd1047, fd1041, 0d3FEE6F0E134454FF; +sub.f64 fd1048, fd1046, fd1047; +add.f64 %0, fd891, fd890; +add.f64 %1, fd894, fd893; +add.f64 %2, fd923, fd922; +add.f64 %3, fd926, fd925; +add.f64 %4, fd955, fd954; +add.f64 %5, fd958, fd957; +add.f64 %6, fd987, fd986; +add.f64 %7, fd990, fd989; +add.f64 %8, fd1019, fd1018; +add.f64 %9, fd1022, fd1021; +add.f64 %11, fd914, fd910; +sub.f64 %10, fd897, fd901; +add.f64 %13, fd946, fd942; +sub.f64 %12, fd929, fd933; +add.f64 %15, fd978, fd974; +sub.f64 %14, fd961, fd965; +add.f64 %17, fd1010, fd1006; +sub.f64 %16, fd993, fd997; +add.f64 %19, fd1042, fd1038; +sub.f64 %18, fd1025, fd1029; +sub.f64 %20, fd904, fd907; +add.f64 %21, fd920, fd917; +sub.f64 %22, fd936, fd939; +add.f64 %23, fd952, fd949; +sub.f64 %24, fd968, fd971; +add.f64 %25, fd984, fd981; +sub.f64 %26, fd1000, fd1003; +add.f64 %27, fd1016, fd1013; +sub.f64 %28, fd1032, fd1035; +add.f64 %29, fd1048, fd1045; +add.f64 %30, fd907, fd904; +sub.f64 %31, fd917, fd920; +add.f64 %32, fd939, fd936; +sub.f64 %33, fd949, fd952; +add.f64 %34, fd971, fd968; +sub.f64 %35, fd981, fd984; +add.f64 %36, fd1003, fd1000; +sub.f64 %37, fd1013, fd1016; +add.f64 %38, fd1035, fd1032; +sub.f64 %39, fd1045, fd1048; +sub.f64 %41, fd910, fd914; +add.f64 %40, fd901, fd897; +sub.f64 %43, fd942, fd946; +add.f64 %42, fd933, fd929; +sub.f64 %45, fd974, fd978; +add.f64 %44, fd965, fd961; +sub.f64 %47, fd1006, fd1010; +add.f64 %46, fd997, fd993; +sub.f64 %49, fd1038, fd1042; +add.f64 %48, fd1029, fd1025; +})" + : "=d"(rmem[0].x), 
"=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_125), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + 
+#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..275cade261e79 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_fwd.hpp.inc @@ -0,0 +1,17074 @@ +#ifndef CUFFTDX_FFT_128_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_128_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<797, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<210>; +.reg .b32 r<1579>; +.reg .b64 rd<2>; +mov.u32 r1567, %tid.y; +shl.b32 r1568, r1567, 9; +mov.u32 r1569, %32; +add.s32 r1570, r1569, r1568; +mov.u32 r1571, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; 
+} +mov.f32 f180, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r101, {low, high}; +} +mov.f32 f190, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 
r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ 
+sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, 
r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1572, r1571, 7; +shl.b32 r1573, r1571, 6; +and.b32 r1574, r1573, -512; +add.s32 r1575, r1570, r1574; +cvt.rn.f32.u32 f207, r1572; +mul.f32 f208, f207, 0f3D490FDB; +cos.approx.f32 f117, f208; +sin.approx.f32 f209, f208; +neg.f32 f118, f209; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 
r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; 
+cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; 
+mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1576, r1573, 448; +add.s32 r1577, r1575, r1576; +st.shared.v4.f32 [r1577], {r521, r629, r666, r703}; +st.shared.v4.f32 [r1577+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r1577+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r1577+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r1578, r1572, -60, r1577; +ld.shared.u32 r1176, [r1578]; +ld.shared.u32 r1372, [r1578+32]; +ld.shared.u32 r1226, [r1578+64]; +ld.shared.u32 r1422, [r1578+96]; +ld.shared.u32 r1188, [r1578+128]; +ld.shared.u32 r1384, [r1578+160]; +ld.shared.u32 r1238, [r1578+192]; +ld.shared.u32 r1434, [r1578+224]; +ld.shared.u32 r1177, [r1578+256]; +ld.shared.u32 r1373, [r1578+288]; +ld.shared.u32 r1227, [r1578+320]; +ld.shared.u32 r1423, [r1578+352]; +ld.shared.u32 r1189, [r1578+384]; +ld.shared.u32 r1385, [r1578+416]; +ld.shared.u32 r1239, [r1578+448]; +ld.shared.u32 r1435, [r1578+480]; +barrier.sync 0; +st.shared.v4.f32 [r1577], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1577+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1577+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1577+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1578]; +ld.shared.u32 r1375, [r1578+32]; +ld.shared.u32 r1229, [r1578+64]; +ld.shared.u32 r1425, [r1578+96]; +ld.shared.u32 r1191, [r1578+128]; +ld.shared.u32 r1387, [r1578+160]; +ld.shared.u32 r1241, [r1578+192]; +ld.shared.u32 r1437, [r1578+224]; +ld.shared.u32 r1180, [r1578+256]; +ld.shared.u32 r1376, [r1578+288]; +ld.shared.u32 r1230, [r1578+320]; +ld.shared.u32 r1426, [r1578+352]; +ld.shared.u32 r1192, [r1578+384]; +ld.shared.u32 r1388, [r1578+416]; +ld.shared.u32 r1242, [r1578+448]; +ld.shared.u32 r1438, [r1578+480]; +{ +add.f16x2 r1175, r1176, r1177; +} 
+{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ 
+sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 %0, r1201, r1251; +} +{ +add.f16x2 %1, r1204, r1254; +} +{ +sub.f16x2 %16, r1201, r1251; +} +{ +sub.f16x2 %17, r1204, r1254; +} +{ +add.f16x2 %4, r1213, r1295; +} +{ +add.f16x2 %5, r1216, r1301; +} +{ +sub.f16x2 %20, r1213, r1295; +} +{ +sub.f16x2 %21, r1216, r1301; +} +{ +add.f16x2 %8, r1207, r1260; +} +{ +add.f16x2 %9, r1210, r1305; +} +{ +sub.f16x2 %24, r1207, r1260; +} +{ +sub.f16x2 %25, r1210, r1305; +} +{ +add.f16x2 %12, r1219, r1313; +} +{ +add.f16x2 %13, r1222, r1319; +} +{ +sub.f16x2 %28, r1219, r1313; +} +{ +sub.f16x2 %29, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 %2, r1397, r1447; +} +{ +add.f16x2 %3, r1400, r1450; +} +{ +sub.f16x2 %18, r1397, r1447; +} +{ +sub.f16x2 %19, r1400, r1450; +} +{ +add.f16x2 %6, r1409, r1491; +} +{ +add.f16x2 %7, r1412, r1497; +} +{ +sub.f16x2 %22, r1409, r1491; +} +{ +sub.f16x2 %23, r1412, r1497; +} +{ +add.f16x2 %10, r1403, r1456; +} +{ +add.f16x2 %11, r1406, r1501; +} +{ +sub.f16x2 %26, r1403, r1456; +} +{ +sub.f16x2 %27, r1406, r1501; +} +{ +add.f16x2 %14, r1415, r1509; +} +{ +add.f16x2 %15, r1418, r1515; +} +{ +sub.f16x2 %30, r1415, r1509; +} +{ +sub.f16x2 %31, r1418, r1515; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), 
"=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<798, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<985>; +.reg .b64 rd<2>; +mov.u32 r965, %tid.y; +shl.b32 r966, r965, 10; +mov.u32 r967, %16; +add.s32 r968, r967, r966; +mov.u32 r969, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 
r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f48, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ 
+add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r970, r969, 15; +shl.b32 r971, r969, 6; +and.b32 r972, r971, -1024; +add.s32 r973, r968, r972; +cvt.rn.f32.u32 f93, r970; +mul.f32 f94, f93, 0f3D490FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; 
+mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, 
r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r974, r971, 960; +add.s32 r975, r973, r974; +st.shared.v4.f32 [r975], {r149, r152, r209, r216}; +st.shared.v4.f32 [r975+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r975+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r975+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r976, r970, -56, r975; +ld.shared.u32 r460, [r976]; +ld.shared.u32 r463, [r976+4]; +ld.shared.u32 r510, [r976+128]; +ld.shared.u32 r513, [r976+132]; +ld.shared.u32 r472, [r976+256]; +ld.shared.u32 r475, [r976+260]; +ld.shared.u32 r522, [r976+384]; +ld.shared.u32 r525, [r976+388]; +ld.shared.u32 r461, [r976+512]; +ld.shared.u32 r464, [r976+516]; +ld.shared.u32 r511, [r976+640]; +ld.shared.u32 r514, [r976+644]; +ld.shared.u32 r473, [r976+768]; +ld.shared.u32 r476, [r976+772]; +ld.shared.u32 r523, [r976+896]; +ld.shared.u32 r526, [r976+900]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ 
+add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r977, r969, 8; +bfe.u32 r978, r969, 3, 1; +cvt.rn.f32.u32 f96, r978; +mul.f32 f97, f96, 0f3EC90FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ 
+mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r979, r969, 3; +and.b32 r980, r979, 56; +add.s32 r981, r973, r980; +barrier.sync 0; +and.b32 r982, r971, 512; +add.s32 r983, r981, r982; +st.shared.u32 [r983], r607; +st.shared.u32 [r983+4], r610; +st.shared.u32 [r983+64], r667; +st.shared.u32 [r983+68], r674; +st.shared.u32 [r983+128], r704; +st.shared.u32 [r983+132], r711; +st.shared.u32 [r983+192], r741; +st.shared.u32 [r983+196], r748; +st.shared.u32 [r983+256], r778; +st.shared.u32 [r983+260], r785; +st.shared.u32 [r983+320], r815; +st.shared.u32 [r983+324], r822; +st.shared.u32 [r983+384], r852; +st.shared.u32 [r983+388], r859; +st.shared.u32 [r983+448], r889; +st.shared.u32 [r983+452], r896; +barrier.sync 0; +mad.lo.s32 r984, r977, -56, r983; +ld.shared.u32 r918, [r984]; +ld.shared.u32 r921, [r984+4]; +ld.shared.u32 r930, [r984+128]; +ld.shared.u32 
r933, [r984+132]; +ld.shared.u32 r942, [r984+256]; +ld.shared.u32 r945, [r984+260]; +ld.shared.u32 r954, [r984+384]; +ld.shared.u32 r957, [r984+388]; +ld.shared.u32 r919, [r984+512]; +ld.shared.u32 r922, [r984+516]; +ld.shared.u32 r931, [r984+640]; +ld.shared.u32 r934, [r984+644]; +ld.shared.u32 r943, [r984+768]; +ld.shared.u32 r946, [r984+772]; +ld.shared.u32 r955, [r984+896]; +ld.shared.u32 r958, [r984+900]; +{ +add.f16x2 %0, r918, r919; +} +{ +add.f16x2 %1, r921, r922; +} +{ +sub.f16x2 %8, r918, r919; +} +{ +sub.f16x2 %9, r921, r922; +} +{ +add.f16x2 %2, r930, r931; +} +{ +add.f16x2 %3, r933, r934; +} +{ +sub.f16x2 %10, r930, r931; +} +{ +sub.f16x2 %11, r933, r934; +} +{ +add.f16x2 %4, r942, r943; +} +{ +add.f16x2 %5, r945, r946; +} +{ +sub.f16x2 %12, r942, r943; +} +{ +sub.f16x2 %13, r945, r946; +} +{ +add.f16x2 %6, r954, r955; +} +{ +add.f16x2 %7, r957, r958; +} +{ +sub.f16x2 %14, r954, r955; +} +{ +sub.f16x2 %15, r957, r958; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<799, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<210>; +.reg .b32 r<1579>; +.reg .b64 rd<2>; +mov.u32 r1567, %tid.y; +shl.b32 r1568, r1567, 10; +mov.u32 r1569, %32; +add.s32 r1570, r1569, r1568; +mov.u32 r1571, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f180, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r101, {low, high}; +} +mov.f32 f190, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; 
+} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, 
r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, 
r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1572, r1571, 7; +shl.b32 r1573, r1571, 7; +and.b32 r1574, r1573, -1024; +add.s32 r1575, r1570, r1574; +cvt.rn.f32.u32 f207, r1572; +mul.f32 f208, f207, 0f3D490FDB; +cos.approx.f32 f117, f208; +sin.approx.f32 f209, f208; +neg.f32 f118, f209; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 
r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, 
r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} 
+{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ 
+mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1576, r1573, 896; +add.s32 r1577, r1575, r1576; +st.shared.v4.f32 [r1577], {r521, r524, r629, r636}; +st.shared.v4.f32 [r1577+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r1577+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r1577+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r1577+64], {r888, r895, r925, r932}; 
+st.shared.v4.f32 [r1577+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r1577+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r1577+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r1578, r1572, -120, r1577; +ld.shared.u32 r1176, [r1578]; +ld.shared.u32 r1179, [r1578+4]; +ld.shared.u32 r1372, [r1578+64]; +ld.shared.u32 r1375, [r1578+68]; +ld.shared.u32 r1226, [r1578+128]; +ld.shared.u32 r1229, [r1578+132]; +ld.shared.u32 r1422, [r1578+192]; +ld.shared.u32 r1425, [r1578+196]; +ld.shared.u32 r1188, [r1578+256]; +ld.shared.u32 r1191, [r1578+260]; +ld.shared.u32 r1384, [r1578+320]; +ld.shared.u32 r1387, [r1578+324]; +ld.shared.u32 r1238, [r1578+384]; +ld.shared.u32 r1241, [r1578+388]; +ld.shared.u32 r1434, [r1578+448]; +ld.shared.u32 r1437, [r1578+452]; +ld.shared.u32 r1177, [r1578+512]; +ld.shared.u32 r1180, [r1578+516]; +ld.shared.u32 r1373, [r1578+576]; +ld.shared.u32 r1376, [r1578+580]; +ld.shared.u32 r1227, [r1578+640]; +ld.shared.u32 r1230, [r1578+644]; +ld.shared.u32 r1423, [r1578+704]; +ld.shared.u32 r1426, [r1578+708]; +ld.shared.u32 r1189, [r1578+768]; +ld.shared.u32 r1192, [r1578+772]; +ld.shared.u32 r1385, [r1578+832]; +ld.shared.u32 r1388, [r1578+836]; +ld.shared.u32 r1239, [r1578+896]; +ld.shared.u32 r1242, [r1578+900]; +ld.shared.u32 r1435, [r1578+960]; +ld.shared.u32 r1438, [r1578+964]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 
r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 %0, r1201, r1251; +} +{ +add.f16x2 %1, r1204, r1254; +} +{ +sub.f16x2 %16, r1201, r1251; +} +{ +sub.f16x2 %17, r1204, r1254; +} +{ +add.f16x2 %4, r1213, r1295; +} +{ +add.f16x2 %5, r1216, r1301; +} +{ +sub.f16x2 %20, r1213, r1295; +} +{ +sub.f16x2 %21, r1216, r1301; +} +{ +add.f16x2 %8, r1207, r1260; +} +{ +add.f16x2 %9, r1210, r1305; +} +{ +sub.f16x2 %24, r1207, r1260; +} +{ +sub.f16x2 %25, r1210, r1305; +} +{ +add.f16x2 %12, r1219, r1313; +} +{ +add.f16x2 %13, r1222, r1319; 
+} +{ +sub.f16x2 %28, r1219, r1313; +} +{ +sub.f16x2 %29, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; 
+} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 %2, r1397, r1447; +} +{ +add.f16x2 %3, r1400, r1450; +} +{ +sub.f16x2 %18, r1397, r1447; +} +{ +sub.f16x2 %19, r1400, r1450; +} +{ +add.f16x2 %6, r1409, r1491; +} +{ +add.f16x2 %7, r1412, r1497; +} +{ +sub.f16x2 %22, r1409, r1491; +} +{ +sub.f16x2 %23, r1412, r1497; +} +{ +add.f16x2 %10, r1403, r1456; +} +{ +add.f16x2 %11, r1406, r1501; +} +{ +sub.f16x2 %26, r1403, r1456; +} +{ +sub.f16x2 %27, r1406, r1501; +} +{ +add.f16x2 %14, r1415, r1509; +} +{ +add.f16x2 %15, r1418, r1515; +} +{ +sub.f16x2 %30, r1415, r1509; +} +{ +sub.f16x2 %31, r1418, r1515; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<800, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<985>; +.reg .b64 rd<2>; +mov.u32 r965, %tid.y; +shl.b32 r966, r965, 9; +mov.u32 r967, %16; +add.s32 r968, r967, r966; +mov.u32 r969, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 
r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f48, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r970, r969, 15; +shl.b32 r971, r969, 5; +and.b32 r972, r971, -512; +add.s32 r973, r968, r972; +cvt.rn.f32.u32 f93, r970; +mul.f32 f94, f93, 0f3D490FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; 
+} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r974, r971, 480; +add.s32 r975, r973, r974; +st.shared.v4.f32 [r975], {r149, r209, r246, r283}; +st.shared.v4.f32 [r975+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r976, r970, -28, r975; +ld.shared.u32 r460, [r976]; +ld.shared.u32 r510, [r976+64]; +ld.shared.u32 r472, [r976+128]; +ld.shared.u32 r522, [r976+192]; +ld.shared.u32 r461, [r976+256]; +ld.shared.u32 r511, [r976+320]; +ld.shared.u32 r473, [r976+384]; +ld.shared.u32 r523, [r976+448]; +barrier.sync 0; +st.shared.v4.f32 [r975], {r152, r216, r253, 
r290}; +st.shared.v4.f32 [r975+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r976]; +ld.shared.u32 r513, [r976+64]; +ld.shared.u32 r475, [r976+128]; +ld.shared.u32 r525, [r976+192]; +ld.shared.u32 r464, [r976+256]; +ld.shared.u32 r514, [r976+320]; +ld.shared.u32 r476, [r976+384]; +ld.shared.u32 r526, [r976+448]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ 
+mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r977, r969, 8; +bfe.u32 r978, r969, 3, 1; +shl.b32 r979, r969, 2; +and.b32 r980, r979, 28; +add.s32 r981, r973, r980; +cvt.rn.f32.u32 f96, r978; +mul.f32 f97, f96, 0f3EC90FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, 
r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r982, r971, 256; +add.s32 r983, r981, r982; +st.shared.u32 [r983], r607; +st.shared.u32 [r983+32], r667; +st.shared.u32 [r983+64], r704; +st.shared.u32 [r983+96], r741; +st.shared.u32 [r983+128], r778; +st.shared.u32 [r983+160], r815; +st.shared.u32 [r983+192], r852; +st.shared.u32 [r983+224], r889; +barrier.sync 0; +mad.lo.s32 r984, r977, -28, r983; +ld.shared.u32 r918, [r984]; +ld.shared.u32 r930, [r984+64]; +ld.shared.u32 r942, [r984+128]; +ld.shared.u32 r954, [r984+192]; +ld.shared.u32 r919, [r984+256]; +ld.shared.u32 r931, [r984+320]; +ld.shared.u32 r943, [r984+384]; +ld.shared.u32 r955, [r984+448]; +barrier.sync 0; +st.shared.u32 [r983], r610; +st.shared.u32 [r983+32], r674; +st.shared.u32 [r983+64], r711; +st.shared.u32 [r983+96], r748; +st.shared.u32 [r983+128], r785; +st.shared.u32 [r983+160], r822; +st.shared.u32 [r983+192], r859; +st.shared.u32 [r983+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r984]; +ld.shared.u32 r933, [r984+64]; +ld.shared.u32 r945, [r984+128]; +ld.shared.u32 r957, [r984+192]; +ld.shared.u32 r922, [r984+256]; +ld.shared.u32 r934, [r984+320]; +ld.shared.u32 r946, [r984+384]; +ld.shared.u32 r958, [r984+448]; +{ +add.f16x2 %0, r918, r919; +} +{ +add.f16x2 %1, r921, r922; +} +{ +sub.f16x2 %8, r918, r919; +} +{ +sub.f16x2 %9, r921, r922; +} +{ +add.f16x2 %2, r930, r931; +} +{ 
+add.f16x2 %3, r933, r934; +} +{ +sub.f16x2 %10, r930, r931; +} +{ +sub.f16x2 %11, r933, r934; +} +{ +add.f16x2 %4, r942, r943; +} +{ +add.f16x2 %5, r945, r946; +} +{ +sub.f16x2 %12, r942, r943; +} +{ +sub.f16x2 %13, r945, r946; +} +{ +add.f16x2 %6, r954, r955; +} +{ +add.f16x2 %7, r957, r958; +} +{ +sub.f16x2 %14, r954, r955; +} +{ +sub.f16x2 %15, r957, r958; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<801, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<544>; +.reg .b64 rd<2>; +mov.u32 r517, %tid.y; +shl.b32 r518, r517, 10; +mov.u32 r519, %8; +add.s32 r520, r519, r518; +mov.u32 r521, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ 
+neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r522, r521, 31; +shl.b32 r523, r521, 5; +and.b32 r524, r523, -1024; +add.s32 r525, r520, r524; +cvt.rn.f32.u32 f31, r522; +mul.f32 f32, f31, 0f3D490FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r526, r523, 992; +add.s32 r527, r525, r526; +st.shared.v4.f32 [r527], {r27, r30, r63, r70}; +st.shared.v4.f32 [r527+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r528, r522, -24, r527; +ld.shared.u32 r166, [r528]; +ld.shared.u32 r169, [r528+4]; +ld.shared.u32 r178, [r528+256]; +ld.shared.u32 r181, [r528+260]; +ld.shared.u32 r167, [r528+512]; +ld.shared.u32 r170, [r528+516]; +ld.shared.u32 r179, [r528+768]; +ld.shared.u32 r182, [r528+772]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r529, r521, 28; +bfe.u32 r530, r521, 2, 3; +cvt.rn.f32.u32 f34, r530; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, 
r292, r305; +} +shl.b32 r531, r521, 3; +and.b32 r532, r531, 24; +add.s32 r533, r525, r532; +barrier.sync 0; +and.b32 r534, r523, 896; +add.s32 r535, r533, r534; +st.shared.u32 [r535], r191; +st.shared.u32 [r535+4], r194; +st.shared.u32 [r535+32], r227; +st.shared.u32 [r535+36], r234; +st.shared.u32 [r535+64], r264; +st.shared.u32 [r535+68], r271; +st.shared.u32 [r535+96], r301; +st.shared.u32 [r535+100], r308; +barrier.sync 0; +mad.lo.s32 r536, r529, -24, r535; +ld.shared.u32 r330, [r536]; +ld.shared.u32 r333, [r536+4]; +ld.shared.u32 r342, [r536+256]; +ld.shared.u32 r345, [r536+260]; +ld.shared.u32 r331, [r536+512]; +ld.shared.u32 r334, [r536+516]; +ld.shared.u32 r343, [r536+768]; +ld.shared.u32 r346, [r536+772]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r537, r521, 16; +bfe.u32 r538, r521, 4, 1; +cvt.rn.f32.u32 f37, r538; +mul.f32 f38, f37, 0f3F490FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} +and.b32 r539, r531, 120; +add.s32 r540, r525, r539; +barrier.sync 0; +and.b32 r541, r523, 512; +add.s32 r542, r540, r541; +st.shared.u32 [r542], r355; +st.shared.u32 [r542+4], r358; +st.shared.u32 [r542+128], r391; +st.shared.u32 [r542+132], r398; +st.shared.u32 [r542+256], r428; +st.shared.u32 [r542+260], r435; +st.shared.u32 [r542+384], r465; +st.shared.u32 [r542+388], 
r472; +barrier.sync 0; +mad.lo.s32 r543, r537, -24, r542; +ld.shared.u32 r494, [r543]; +ld.shared.u32 r497, [r543+4]; +ld.shared.u32 r506, [r543+256]; +ld.shared.u32 r509, [r543+260]; +ld.shared.u32 r495, [r543+512]; +ld.shared.u32 r498, [r543+516]; +ld.shared.u32 r507, [r543+768]; +ld.shared.u32 r510, [r543+772]; +{ +add.f16x2 %0, r494, r495; +} +{ +add.f16x2 %1, r497, r498; +} +{ +sub.f16x2 %4, r494, r495; +} +{ +sub.f16x2 %5, r497, r498; +} +{ +add.f16x2 %2, r506, r507; +} +{ +add.f16x2 %3, r509, r510; +} +{ +sub.f16x2 %6, r506, r507; +} +{ +sub.f16x2 %7, r509, r510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +
/*
 * cufftdx_private_function<802, __half2, 1>
 *
 * Machine-generated inline-PTX routine; the PTX text below is intentionally left untouched.
 *
 * Operands
 *   rmem : four complex<__half2> values. Their .x/.y words are read as 32-bit registers
 *          (inputs %9..%16 = rmem[0].x, rmem[0].y, ..., rmem[3].x, rmem[3].y) and the same
 *          eight words are overwritten through outputs %0..%7 (same order). All inputs are
 *          consumed by the first butterfly, and the outputs are only written by the last
 *          eight add/sub instructions.
 *   smem : 32-bit shared-memory address (input %8). The working base is
 *          smem + (%tid.y << 9), i.e. a 512-byte slice per %tid.y value.
 *
 * Structure visible in the PTX
 *   1. A 4-point add/sub butterfly over the four inputs. Three of its four results are then
 *      multiplied by w, w^2 and w^3, where w = (cos a, -sin a) and
 *      a = (%tid.x & 31) * 0f3D490FDB (about pi/64). w is evaluated with cos.approx/sin.approx
 *      in f32 and rounded to f16; w^2 and w^3 are formed by f16x2 complex multiplies using the
 *      packed constant {-1.0, +1.0} (f27/f28).
 *   2. Exchange through shared memory: the four .x words are stored, barrier, four words are
 *      reloaded from offsets +0/+128/+256/+384 of a re-based address; after another barrier
 *      the four .y words are stored to the SAME addresses and reloaded the same way.
 *   3. Steps 1-2 repeat twice more with a = ((%tid.x >> 2) & 7) * 0f3E490FDB (about pi/16)
 *      and then a = ((%tid.x >> 4) & 1) * 0f3F490FDB (about pi/4).
 *   4. A final 2-point add/sub produces the results. Note the output order: the sums go to
 *      %0..%3 (rmem[0], rmem[1]) and the differences go to %4..%7 (rmem[2], rmem[3]).
 *
 * NOTE(review): radix 4 x 4 x 4 x 2 with a %tid.x & 31 mask suggests a 128-point transform
 *   spread over 32 threads x 4 elements -- confirm against the generator entry for id 802.
 * NOTE(review): the body executes twelve barrier.sync 0 instructions, so presumably every
 *   thread taking part in barrier 0 must call this function under uniform control flow --
 *   confirm at the call sites.
 * NOTE(review): .reg .b64 rd<2> is declared but no rd register is referenced in this body.
 */
template<> __forceinline__ __device__ void cufftdx_private_function<802, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<544>; +.reg .b64 rd<2>; +mov.u32 r517, %tid.y; +shl.b32 r518, r517, 9; +mov.u32 r519, %8; +add.s32 r520, r519, r518; +mov.u32 r521, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r522,
r521, 31; +shl.b32 r523, r521, 4; +and.b32 r524, r523, -512; +add.s32 r525, r520, r524; +cvt.rn.f32.u32 f31, r522; +mul.f32 f32, f31, 0f3D490FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low,
high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r526, r523, 496; +add.s32 r527, r525, r526; +st.shared.v4.f32 [r527], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r528, r522, -12, r527; +ld.shared.u32 r166, [r528]; +ld.shared.u32 r178, [r528+128]; +ld.shared.u32 r167, [r528+256]; +ld.shared.u32 r179, [r528+384]; +barrier.sync 0; +st.shared.v4.f32 [r527], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r528]; +ld.shared.u32 r181, [r528+128]; +ld.shared.u32 r170, [r528+256]; +ld.shared.u32 r182, [r528+384]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r529, r521, 28; +bfe.u32 r530, r521, 2, 3; +shl.b32 r531, r521, 2; +and.b32 r532, r531, 12; +add.s32 r533, r525, r532; +cvt.rn.f32.u32 f34, r530; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220;
+} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +barrier.sync 0; +and.b32 r534, r523, 448; +add.s32 r535, r533, r534; +st.shared.u32 [r535], r191; +st.shared.u32 [r535+16], r227; +st.shared.u32 [r535+32], r264; +st.shared.u32
[r535+48], r301; +barrier.sync 0; +mad.lo.s32 r536, r529, -12, r535; +ld.shared.u32 r330, [r536]; +ld.shared.u32 r342, [r536+128]; +ld.shared.u32 r331, [r536+256]; +ld.shared.u32 r343, [r536+384]; +barrier.sync 0; +st.shared.u32 [r535], r194; +st.shared.u32 [r535+16], r234; +st.shared.u32 [r535+32], r271; +st.shared.u32 [r535+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r536]; +ld.shared.u32 r345, [r536+128]; +ld.shared.u32 r334, [r536+256]; +ld.shared.u32 r346, [r536+384]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r537, r521, 16; +bfe.u32 r538, r521, 4, 1; +and.b32 r539, r531, 60; +add.s32 r540, r525, r539; +cvt.rn.f32.u32 f37, r538; +mul.f32 f38, f37, 0f3F490FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27;
+cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} +barrier.sync 0; +and.b32 r541, r523, 256; +add.s32 r542, r540, r541; +st.shared.u32 [r542], r355; +st.shared.u32 [r542+64], r391; +st.shared.u32 [r542+128], r428; +st.shared.u32 [r542+192], r465; +barrier.sync 0; +mad.lo.s32 r543, r537, -12, r542; +ld.shared.u32 r494, [r543]; +ld.shared.u32 r506, [r543+128]; +ld.shared.u32 r495, [r543+256]; +ld.shared.u32 r507, [r543+384]; +barrier.sync 0; +st.shared.u32 [r542], r358; +st.shared.u32 [r542+64], r398; +st.shared.u32 [r542+128], r435; +st.shared.u32 [r542+192], r472; +barrier.sync 0; +ld.shared.u32 r497,
[r543]; +ld.shared.u32 r509, [r543+128]; +ld.shared.u32 r498, [r543+256]; +ld.shared.u32 r510, [r543+384]; +{ +add.f16x2 %0, r494, r495; +} +{ +add.f16x2 %1, r497, r498; +} +{ +sub.f16x2 %4, r494, r495; +} +{ +sub.f16x2 %5, r497, r498; +} +{ +add.f16x2 %2, r506, r507; +} +{ +add.f16x2 %3, r509, r510; +} +{ +sub.f16x2 %6, r506, r507; +} +{ +sub.f16x2 %7, r509, r510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<803, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3339>; +.reg .b64 rd<3>; +mov.u32 r3263, %tid.y; +shl.b32 r3264, r3263, 10; +mov.u32 r3265, %64; +add.s32 r3266, r3265, r3264; +mov.u32 r3267, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{
+sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f246, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r101, {low, high}; +} +mov.f32 f280, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, 
%103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 
r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ 
+mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ 
+sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ 
+fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 
r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; 
+cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; 
+cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ 
+sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, 
r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3268, r3267, 3; +shl.b32 r3269, r3267, 8; +and.b32 r3270, r3269, -1024; +add.s32 r3271, r3266, r3270; +cvt.rn.f32.u32 f423, r3268; +mul.f32 f424, f423, 0f3D490FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 
r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; 
+cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, 
r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ 
+fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 
r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ 
+mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; 
+mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, 
r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ 
+fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3272, r3269, 768; +add.s32 r3273, r3271, r3272; +st.shared.v4.f32 [r3273], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r3273+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r3273+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r3273+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r3273+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r3273+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r3273+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r3273+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r3273+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r3273+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r3273+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r3273+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r3273+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r3273+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r3273+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r3273+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r3274, r3268, -248, r3273; +ld.shared.u32 r2864, [r3274]; +ld.shared.u32 r2867, [r3274+4]; +ld.shared.u32 r2914, [r3274+32]; +ld.shared.u32 r2917, [r3274+36]; +ld.shared.u32 r2964, [r3274+64]; +ld.shared.u32 r2967, [r3274+68]; +ld.shared.u32 r3014, [r3274+96]; +ld.shared.u32 r3017, [r3274+100]; +ld.shared.u32 r3064, [r3274+128]; +ld.shared.u32 r3067, [r3274+132]; +ld.shared.u32 r3114, [r3274+160]; +ld.shared.u32 r3117, [r3274+164]; +ld.shared.u32 r3164, [r3274+192]; +ld.shared.u32 r3167, [r3274+196]; 
+ld.shared.u32 r3214, [r3274+224]; +ld.shared.u32 r3217, [r3274+228]; +ld.shared.u32 r2876, [r3274+256]; +ld.shared.u32 r2879, [r3274+260]; +ld.shared.u32 r2926, [r3274+288]; +ld.shared.u32 r2929, [r3274+292]; +ld.shared.u32 r2976, [r3274+320]; +ld.shared.u32 r2979, [r3274+324]; +ld.shared.u32 r3026, [r3274+352]; +ld.shared.u32 r3029, [r3274+356]; +ld.shared.u32 r3076, [r3274+384]; +ld.shared.u32 r3079, [r3274+388]; +ld.shared.u32 r3126, [r3274+416]; +ld.shared.u32 r3129, [r3274+420]; +ld.shared.u32 r3176, [r3274+448]; +ld.shared.u32 r3179, [r3274+452]; +ld.shared.u32 r3226, [r3274+480]; +ld.shared.u32 r3229, [r3274+484]; +ld.shared.u32 r2865, [r3274+512]; +ld.shared.u32 r2868, [r3274+516]; +ld.shared.u32 r2915, [r3274+544]; +ld.shared.u32 r2918, [r3274+548]; +ld.shared.u32 r2965, [r3274+576]; +ld.shared.u32 r2968, [r3274+580]; +ld.shared.u32 r3015, [r3274+608]; +ld.shared.u32 r3018, [r3274+612]; +ld.shared.u32 r3065, [r3274+640]; +ld.shared.u32 r3068, [r3274+644]; +ld.shared.u32 r3115, [r3274+672]; +ld.shared.u32 r3118, [r3274+676]; +ld.shared.u32 r3165, [r3274+704]; +ld.shared.u32 r3168, [r3274+708]; +ld.shared.u32 r3215, [r3274+736]; +ld.shared.u32 r3218, [r3274+740]; +ld.shared.u32 r2877, [r3274+768]; +ld.shared.u32 r2880, [r3274+772]; +ld.shared.u32 r2927, [r3274+800]; +ld.shared.u32 r2930, [r3274+804]; +ld.shared.u32 r2977, [r3274+832]; +ld.shared.u32 r2980, [r3274+836]; +ld.shared.u32 r3027, [r3274+864]; +ld.shared.u32 r3030, [r3274+868]; +ld.shared.u32 r3077, [r3274+896]; +ld.shared.u32 r3080, [r3274+900]; +ld.shared.u32 r3127, [r3274+928]; +ld.shared.u32 r3130, [r3274+932]; +ld.shared.u32 r3177, [r3274+960]; +ld.shared.u32 r3180, [r3274+964]; +ld.shared.u32 r3227, [r3274+992]; +ld.shared.u32 r3230, [r3274+996]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, 
r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 %0, r2863, r2875; +} +{ +add.f16x2 %1, r2866, r2878; +} +{ +sub.f16x2 %32, r2863, r2875; +} +{ +sub.f16x2 %33, r2866, r2878; +} +{ +add.f16x2 %16, r2869, r2884; +} +{ +add.f16x2 %17, r2872, r2887; +} +{ +sub.f16x2 %48, r2869, r2884; +} +{ +sub.f16x2 %49, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 %2, r2913, r2925; +} +{ +add.f16x2 %3, r2916, r2928; +} +{ +sub.f16x2 %34, r2913, r2925; +} +{ +sub.f16x2 %35, r2916, r2928; +} +{ +add.f16x2 %18, r2919, r2934; +} +{ +add.f16x2 %19, r2922, r2937; +} +{ +sub.f16x2 %50, r2919, r2934; +} +{ +sub.f16x2 %51, r2922, r2937; +} +{ +add.f16x2 r2963, r2964, r2965; +} +{ +add.f16x2 r2966, r2967, r2968; +} +{ +sub.f16x2 r2969, r2964, r2965; +} +{ +sub.f16x2 r2972, r2967, r2968; +} +{ +add.f16x2 r2975, r2976, r2977; +} +{ +add.f16x2 r2978, r2979, r2980; +} +{ +sub.f16x2 r2981, r2976, r2977; +} +{ +sub.f16x2 r2984, r2979, r2980; +} +{ +neg.f16x2 r2987, r2981; +} +{ +add.f16x2 %4, r2963, r2975; +} +{ +add.f16x2 %5, r2966, r2978; +} +{ +sub.f16x2 %36, r2963, r2975; +} +{ +sub.f16x2 %37, r2966, r2978; +} +{ +add.f16x2 %20, r2969, r2984; +} +{ +add.f16x2 %21, r2972, r2987; +} +{ +sub.f16x2 %52, r2969, r2984; +} +{ +sub.f16x2 %53, r2972, r2987; +} +{ +add.f16x2 r3013, r3014, r3015; +} +{ +add.f16x2 r3016, r3017, r3018; +} +{ +sub.f16x2 r3019, r3014, r3015; +} +{ +sub.f16x2 r3022, r3017, r3018; +} +{ +add.f16x2 r3025, r3026, r3027; +} +{ +add.f16x2 r3028, r3029, r3030; +} +{ +sub.f16x2 r3031, r3026, r3027; +} +{ +sub.f16x2 r3034, r3029, r3030; +} +{ +neg.f16x2 r3037, r3031; +} +{ +add.f16x2 %6, r3013, r3025; +} +{ +add.f16x2 
%7, r3016, r3028; +} +{ +sub.f16x2 %38, r3013, r3025; +} +{ +sub.f16x2 %39, r3016, r3028; +} +{ +add.f16x2 %22, r3019, r3034; +} +{ +add.f16x2 %23, r3022, r3037; +} +{ +sub.f16x2 %54, r3019, r3034; +} +{ +sub.f16x2 %55, r3022, r3037; +} +{ +add.f16x2 r3063, r3064, r3065; +} +{ +add.f16x2 r3066, r3067, r3068; +} +{ +sub.f16x2 r3069, r3064, r3065; +} +{ +sub.f16x2 r3072, r3067, r3068; +} +{ +add.f16x2 r3075, r3076, r3077; +} +{ +add.f16x2 r3078, r3079, r3080; +} +{ +sub.f16x2 r3081, r3076, r3077; +} +{ +sub.f16x2 r3084, r3079, r3080; +} +{ +neg.f16x2 r3087, r3081; +} +{ +add.f16x2 %8, r3063, r3075; +} +{ +add.f16x2 %9, r3066, r3078; +} +{ +sub.f16x2 %40, r3063, r3075; +} +{ +sub.f16x2 %41, r3066, r3078; +} +{ +add.f16x2 %24, r3069, r3084; +} +{ +add.f16x2 %25, r3072, r3087; +} +{ +sub.f16x2 %56, r3069, r3084; +} +{ +sub.f16x2 %57, r3072, r3087; +} +{ +add.f16x2 r3113, r3114, r3115; +} +{ +add.f16x2 r3116, r3117, r3118; +} +{ +sub.f16x2 r3119, r3114, r3115; +} +{ +sub.f16x2 r3122, r3117, r3118; +} +{ +add.f16x2 r3125, r3126, r3127; +} +{ +add.f16x2 r3128, r3129, r3130; +} +{ +sub.f16x2 r3131, r3126, r3127; +} +{ +sub.f16x2 r3134, r3129, r3130; +} +{ +neg.f16x2 r3137, r3131; +} +{ +add.f16x2 %10, r3113, r3125; +} +{ +add.f16x2 %11, r3116, r3128; +} +{ +sub.f16x2 %42, r3113, r3125; +} +{ +sub.f16x2 %43, r3116, r3128; +} +{ +add.f16x2 %26, r3119, r3134; +} +{ +add.f16x2 %27, r3122, r3137; +} +{ +sub.f16x2 %58, r3119, r3134; +} +{ +sub.f16x2 %59, r3122, r3137; +} +{ +add.f16x2 r3163, r3164, r3165; +} +{ +add.f16x2 r3166, r3167, r3168; +} +{ +sub.f16x2 r3169, r3164, r3165; +} +{ +sub.f16x2 r3172, r3167, r3168; +} +{ +add.f16x2 r3175, r3176, r3177; +} +{ +add.f16x2 r3178, r3179, r3180; +} +{ +sub.f16x2 r3181, r3176, r3177; +} +{ +sub.f16x2 r3184, r3179, r3180; +} +{ +neg.f16x2 r3187, r3181; +} +{ +add.f16x2 %12, r3163, r3175; +} +{ +add.f16x2 %13, r3166, r3178; +} +{ +sub.f16x2 %44, r3163, r3175; +} +{ +sub.f16x2 %45, r3166, r3178; +} +{ +add.f16x2 %28, r3169, r3184; +} +{ 
+add.f16x2 %29, r3172, r3187; +} +{ +sub.f16x2 %60, r3169, r3184; +} +{ +sub.f16x2 %61, r3172, r3187; +} +{ +add.f16x2 r3213, r3214, r3215; +} +{ +add.f16x2 r3216, r3217, r3218; +} +{ +sub.f16x2 r3219, r3214, r3215; +} +{ +sub.f16x2 r3222, r3217, r3218; +} +{ +add.f16x2 r3225, r3226, r3227; +} +{ +add.f16x2 r3228, r3229, r3230; +} +{ +sub.f16x2 r3231, r3226, r3227; +} +{ +sub.f16x2 r3234, r3229, r3230; +} +{ +neg.f16x2 r3237, r3231; +} +{ +add.f16x2 %14, r3213, r3225; +} +{ +add.f16x2 %15, r3216, r3228; +} +{ +sub.f16x2 %46, r3213, r3225; +} +{ +sub.f16x2 %47, r3216, r3228; +} +{ +add.f16x2 %30, r3219, r3234; +} +{ +add.f16x2 %31, r3222, r3237; +} +{ +sub.f16x2 %62, r3219, r3234; +} +{ +sub.f16x2 %63, r3222, r3237; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), 
"=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), 
"r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<804, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3339>; +.reg .b64 rd<3>; +mov.u32 r3263, %tid.y; +shl.b32 r3264, r3263, 9; +mov.u32 r3265, %64; +add.s32 r3266, r3265, r3264; +mov.u32 r3267, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 
r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f246, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r101, {low, high}; +} +mov.f32 f280, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ 
+sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, 
r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 
r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 
r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, 
r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; 
+} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; 
+cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ 
+sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, 
r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3268, r3267, 3; +shl.b32 r3269, r3267, 7; +and.b32 r3270, r3269, -512; +add.s32 r3271, r3266, r3270; +cvt.rn.f32.u32 f423, r3268; +mul.f32 f424, f423, 0f3D490FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, 
r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; 
+cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, 
r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ 
+fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 
r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ 
+mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; 
+mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, 
r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ 
+fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3272, r3269, 384; +add.s32 r3273, r3271, r3272; +st.shared.v4.f32 [r3273], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r3273+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r3273+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r3273+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r3273+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r3273+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r3273+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r3273+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r3274, r3268, -124, r3273; +ld.shared.u32 r2864, [r3274]; +ld.shared.u32 r2914, [r3274+16]; +ld.shared.u32 r2964, [r3274+32]; +ld.shared.u32 r3014, [r3274+48]; +ld.shared.u32 r3064, [r3274+64]; +ld.shared.u32 r3114, [r3274+80]; +ld.shared.u32 r3164, [r3274+96]; +ld.shared.u32 r3214, [r3274+112]; +ld.shared.u32 r2876, [r3274+128]; +ld.shared.u32 r2926, [r3274+144]; +ld.shared.u32 r2976, [r3274+160]; +ld.shared.u32 r3026, [r3274+176]; +ld.shared.u32 r3076, [r3274+192]; +ld.shared.u32 r3126, [r3274+208]; +ld.shared.u32 r3176, [r3274+224]; +ld.shared.u32 r3226, [r3274+240]; +ld.shared.u32 r2865, [r3274+256]; +ld.shared.u32 r2915, [r3274+272]; +ld.shared.u32 r2965, [r3274+288]; +ld.shared.u32 r3015, [r3274+304]; +ld.shared.u32 r3065, [r3274+320]; +ld.shared.u32 r3115, [r3274+336]; +ld.shared.u32 r3165, [r3274+352]; +ld.shared.u32 r3215, [r3274+368]; +ld.shared.u32 r2877, [r3274+384]; +ld.shared.u32 r2927, [r3274+400]; +ld.shared.u32 r2977, [r3274+416]; +ld.shared.u32 r3027, [r3274+432]; 
+ld.shared.u32 r3077, [r3274+448]; +ld.shared.u32 r3127, [r3274+464]; +ld.shared.u32 r3177, [r3274+480]; +ld.shared.u32 r3227, [r3274+496]; +barrier.sync 0; +st.shared.v4.f32 [r3273], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3273+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3273+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3273+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3273+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3273+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3273+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3273+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3274]; +ld.shared.u32 r2917, [r3274+16]; +ld.shared.u32 r2967, [r3274+32]; +ld.shared.u32 r3017, [r3274+48]; +ld.shared.u32 r3067, [r3274+64]; +ld.shared.u32 r3117, [r3274+80]; +ld.shared.u32 r3167, [r3274+96]; +ld.shared.u32 r3217, [r3274+112]; +ld.shared.u32 r2879, [r3274+128]; +ld.shared.u32 r2929, [r3274+144]; +ld.shared.u32 r2979, [r3274+160]; +ld.shared.u32 r3029, [r3274+176]; +ld.shared.u32 r3079, [r3274+192]; +ld.shared.u32 r3129, [r3274+208]; +ld.shared.u32 r3179, [r3274+224]; +ld.shared.u32 r3229, [r3274+240]; +ld.shared.u32 r2868, [r3274+256]; +ld.shared.u32 r2918, [r3274+272]; +ld.shared.u32 r2968, [r3274+288]; +ld.shared.u32 r3018, [r3274+304]; +ld.shared.u32 r3068, [r3274+320]; +ld.shared.u32 r3118, [r3274+336]; +ld.shared.u32 r3168, [r3274+352]; +ld.shared.u32 r3218, [r3274+368]; +ld.shared.u32 r2880, [r3274+384]; +ld.shared.u32 r2930, [r3274+400]; +ld.shared.u32 r2980, [r3274+416]; +ld.shared.u32 r3030, [r3274+432]; +ld.shared.u32 r3080, [r3274+448]; +ld.shared.u32 r3130, [r3274+464]; +ld.shared.u32 r3180, [r3274+480]; +ld.shared.u32 r3230, [r3274+496]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ 
+sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 %0, r2863, r2875; +} +{ +add.f16x2 %1, r2866, r2878; +} +{ +sub.f16x2 %32, r2863, r2875; +} +{ +sub.f16x2 %33, r2866, r2878; +} +{ +add.f16x2 %16, r2869, r2884; +} +{ +add.f16x2 %17, r2872, r2887; +} +{ +sub.f16x2 %48, r2869, r2884; +} +{ +sub.f16x2 %49, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 %2, r2913, r2925; +} +{ +add.f16x2 %3, r2916, r2928; +} +{ +sub.f16x2 %34, r2913, r2925; +} +{ +sub.f16x2 %35, r2916, r2928; +} +{ +add.f16x2 %18, r2919, r2934; +} +{ +add.f16x2 %19, r2922, r2937; +} +{ +sub.f16x2 %50, r2919, r2934; +} +{ +sub.f16x2 %51, r2922, r2937; +} +{ +add.f16x2 r2963, r2964, r2965; +} +{ +add.f16x2 r2966, r2967, r2968; +} +{ +sub.f16x2 r2969, r2964, r2965; +} +{ +sub.f16x2 r2972, r2967, r2968; +} +{ +add.f16x2 r2975, r2976, r2977; +} +{ +add.f16x2 r2978, r2979, r2980; +} +{ +sub.f16x2 r2981, r2976, r2977; +} +{ +sub.f16x2 r2984, r2979, r2980; +} +{ +neg.f16x2 r2987, r2981; +} +{ +add.f16x2 %4, r2963, r2975; +} +{ +add.f16x2 %5, r2966, r2978; +} +{ +sub.f16x2 %36, r2963, r2975; +} +{ +sub.f16x2 %37, r2966, r2978; +} +{ +add.f16x2 %20, r2969, r2984; +} +{ +add.f16x2 %21, r2972, r2987; +} +{ +sub.f16x2 %52, r2969, r2984; +} +{ +sub.f16x2 %53, r2972, r2987; +} +{ +add.f16x2 r3013, r3014, r3015; +} +{ +add.f16x2 r3016, r3017, r3018; +} +{ +sub.f16x2 r3019, r3014, r3015; +} +{ +sub.f16x2 r3022, r3017, r3018; +} +{ +add.f16x2 r3025, r3026, r3027; +} +{ +add.f16x2 r3028, r3029, r3030; +} +{ +sub.f16x2 r3031, r3026, r3027; +} +{ +sub.f16x2 r3034, r3029, r3030; +} +{ +neg.f16x2 r3037, r3031; +} +{ +add.f16x2 %6, r3013, r3025; 
+} +{ +add.f16x2 %7, r3016, r3028; +} +{ +sub.f16x2 %38, r3013, r3025; +} +{ +sub.f16x2 %39, r3016, r3028; +} +{ +add.f16x2 %22, r3019, r3034; +} +{ +add.f16x2 %23, r3022, r3037; +} +{ +sub.f16x2 %54, r3019, r3034; +} +{ +sub.f16x2 %55, r3022, r3037; +} +{ +add.f16x2 r3063, r3064, r3065; +} +{ +add.f16x2 r3066, r3067, r3068; +} +{ +sub.f16x2 r3069, r3064, r3065; +} +{ +sub.f16x2 r3072, r3067, r3068; +} +{ +add.f16x2 r3075, r3076, r3077; +} +{ +add.f16x2 r3078, r3079, r3080; +} +{ +sub.f16x2 r3081, r3076, r3077; +} +{ +sub.f16x2 r3084, r3079, r3080; +} +{ +neg.f16x2 r3087, r3081; +} +{ +add.f16x2 %8, r3063, r3075; +} +{ +add.f16x2 %9, r3066, r3078; +} +{ +sub.f16x2 %40, r3063, r3075; +} +{ +sub.f16x2 %41, r3066, r3078; +} +{ +add.f16x2 %24, r3069, r3084; +} +{ +add.f16x2 %25, r3072, r3087; +} +{ +sub.f16x2 %56, r3069, r3084; +} +{ +sub.f16x2 %57, r3072, r3087; +} +{ +add.f16x2 r3113, r3114, r3115; +} +{ +add.f16x2 r3116, r3117, r3118; +} +{ +sub.f16x2 r3119, r3114, r3115; +} +{ +sub.f16x2 r3122, r3117, r3118; +} +{ +add.f16x2 r3125, r3126, r3127; +} +{ +add.f16x2 r3128, r3129, r3130; +} +{ +sub.f16x2 r3131, r3126, r3127; +} +{ +sub.f16x2 r3134, r3129, r3130; +} +{ +neg.f16x2 r3137, r3131; +} +{ +add.f16x2 %10, r3113, r3125; +} +{ +add.f16x2 %11, r3116, r3128; +} +{ +sub.f16x2 %42, r3113, r3125; +} +{ +sub.f16x2 %43, r3116, r3128; +} +{ +add.f16x2 %26, r3119, r3134; +} +{ +add.f16x2 %27, r3122, r3137; +} +{ +sub.f16x2 %58, r3119, r3134; +} +{ +sub.f16x2 %59, r3122, r3137; +} +{ +add.f16x2 r3163, r3164, r3165; +} +{ +add.f16x2 r3166, r3167, r3168; +} +{ +sub.f16x2 r3169, r3164, r3165; +} +{ +sub.f16x2 r3172, r3167, r3168; +} +{ +add.f16x2 r3175, r3176, r3177; +} +{ +add.f16x2 r3178, r3179, r3180; +} +{ +sub.f16x2 r3181, r3176, r3177; +} +{ +sub.f16x2 r3184, r3179, r3180; +} +{ +neg.f16x2 r3187, r3181; +} +{ +add.f16x2 %12, r3163, r3175; +} +{ +add.f16x2 %13, r3166, r3178; +} +{ +sub.f16x2 %44, r3163, r3175; +} +{ +sub.f16x2 %45, r3166, r3178; +} +{ +add.f16x2 %28, 
r3169, r3184; +} +{ +add.f16x2 %29, r3172, r3187; +} +{ +sub.f16x2 %60, r3169, r3184; +} +{ +sub.f16x2 %61, r3172, r3187; +} +{ +add.f16x2 r3213, r3214, r3215; +} +{ +add.f16x2 r3216, r3217, r3218; +} +{ +sub.f16x2 r3219, r3214, r3215; +} +{ +sub.f16x2 r3222, r3217, r3218; +} +{ +add.f16x2 r3225, r3226, r3227; +} +{ +add.f16x2 r3228, r3229, r3230; +} +{ +sub.f16x2 r3231, r3226, r3227; +} +{ +sub.f16x2 r3234, r3229, r3230; +} +{ +neg.f16x2 r3237, r3231; +} +{ +add.f16x2 %14, r3213, r3225; +} +{ +add.f16x2 %15, r3216, r3228; +} +{ +sub.f16x2 %46, r3213, r3225; +} +{ +sub.f16x2 %47, r3216, r3228; +} +{ +add.f16x2 %30, r3219, r3234; +} +{ +add.f16x2 %31, r3222, r3237; +} +{ +sub.f16x2 %62, r3219, r3234; +} +{ +sub.f16x2 %63, r3222, r3237; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), 
"=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), 
"r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<805, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<374>; +.reg .b64 rd<2>; +mov.u32 r325, %tid.y; +shl.b32 r326, r325, 10; +mov.u32 r327, %4; +add.s32 r328, r327, r326; +mov.u32 r329, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r330, r329, 63; +shl.b32 r331, r329, 4; +and.b32 r332, r331, -1024; +add.s32 r333, r328, r332; +cvt.rn.f32.u32 f37, r330; +mul.f32 f38, f37, 0f3D490FDB; +cos.approx.f32 f1, f38; +sin.approx.f32 f39, f38; +neg.f32 f2, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 
r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r334, r331, 1008; +add.s32 r335, r333, r334; +st.shared.v2.f32 [r335], {r1, r4}; +st.shared.v2.f32 [r335+8], {r25, r32}; +barrier.sync 0; +shl.b32 r336, r329, 3; +and.b32 r337, r336, 504; +sub.s32 r338, r335, r337; +ld.shared.u32 r54, [r338]; +ld.shared.u32 r57, [r338+4]; +ld.shared.u32 r55, [r338+512]; +ld.shared.u32 r58, [r338+516]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r339, r329, 1, 5; +cvt.rn.f32.u32 f40, r339; +mul.f32 f41, f40, 0f3DC90FDB; +cos.approx.f32 f7, f41; +sin.approx.f32 f42, f41; +neg.f32 f8, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r340, r336, 8; +add.s32 r341, r333, r340; +barrier.sync 0; +and.b32 r342, r331, 992; +add.s32 r343, r341, r342; +st.shared.u32 [r343], r53; +st.shared.u32 [r343+4], r56; +st.shared.u32 [r343+16], r77; +st.shared.u32 [r343+20], r84; +barrier.sync 0; +and.b32 r344, r336, 496; +sub.s32 r345, r343, r344; +ld.shared.u32 r106, [r345]; +ld.shared.u32 r109, [r345+4]; +ld.shared.u32 r107, [r345+512]; +ld.shared.u32 r110, [r345+516]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r346, r329, 2, 4; +cvt.rn.f32.u32 f43, r346; +mul.f32 f44, f43, 0f3E490FDB; +cos.approx.f32 f13, f44; +sin.approx.f32 f45, f44; +neg.f32 f14, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r347, r336, 24; +add.s32 r348, r333, r347; +barrier.sync 0; +and.b32 r349, r331, 960; +add.s32 r350, r348, r349; +st.shared.u32 [r350], r105; +st.shared.u32 [r350+4], r108; +st.shared.u32 [r350+32], r129; +st.shared.u32 [r350+36], r136; +barrier.sync 0; +and.b32 r351, r336, 480; +sub.s32 r352, r350, r351; +ld.shared.u32 r158, [r352]; +ld.shared.u32 r161, [r352+4]; +ld.shared.u32 r159, [r352+512]; +ld.shared.u32 r162, [r352+516]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r353, r329, 3, 3; +cvt.rn.f32.u32 f46, r353; +mul.f32 f47, f46, 0f3EC90FDB; +cos.approx.f32 f19, f47; +sin.approx.f32 f48, f47; +neg.f32 f20, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +and.b32 r354, r336, 56; +add.s32 r355, r333, r354; +barrier.sync 0; +and.b32 r356, r331, 896; +add.s32 r357, r355, r356; +st.shared.u32 [r357], r157; +st.shared.u32 [r357+4], r160; +st.shared.u32 [r357+64], r181; +st.shared.u32 [r357+68], r188; +barrier.sync 0; +and.b32 r358, r336, 448; +sub.s32 r359, r357, r358; +ld.shared.u32 r210, [r359]; +ld.shared.u32 r213, [r359+4]; +ld.shared.u32 r211, [r359+512]; +ld.shared.u32 r214, [r359+516]; +{ +add.f16x2 
r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r360, r329, 4, 2; +cvt.rn.f32.u32 f49, r360; +mul.f32 f50, f49, 0f3F490FDB; +cos.approx.f32 f25, f50; +sin.approx.f32 f51, f50; +neg.f32 f26, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +and.b32 r361, r336, 120; +add.s32 r362, r333, r361; +barrier.sync 0; +and.b32 r363, r331, 768; +add.s32 r364, r362, r363; +st.shared.u32 [r364], r209; +st.shared.u32 [r364+4], r212; +st.shared.u32 [r364+128], r233; +st.shared.u32 [r364+132], r240; +barrier.sync 0; +and.b32 r365, r336, 384; +sub.s32 r366, r364, r365; +ld.shared.u32 r262, [r366]; +ld.shared.u32 r265, [r366+4]; +ld.shared.u32 r263, [r366+512]; +ld.shared.u32 r266, [r366+516]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r367, r329, 5, 1; +cvt.rn.f32.u32 f52, r367; +mul.f32 f53, f52, 0f3FC90FDB; +cos.approx.f32 f31, f53; +sin.approx.f32 f54, f53; +neg.f32 f32, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} +{ +mul.f16x2 r289, r267, r278; +} +{ +fma.rn.f16x2 r292, r270, r276, r289; +} +and.b32 r368, r336, 248; +add.s32 r369, r333, 
r368; +barrier.sync 0; +and.b32 r370, r331, 512; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r261; +st.shared.u32 [r371+4], r264; +st.shared.u32 [r371+256], r285; +st.shared.u32 [r371+260], r292; +barrier.sync 0; +and.b32 r372, r336, 256; +sub.s32 r373, r371, r372; +ld.shared.u32 r314, [r373]; +ld.shared.u32 r317, [r373+4]; +ld.shared.u32 r315, [r373+512]; +ld.shared.u32 r318, [r373+516]; +{ +add.f16x2 %0, r314, r315; +} +{ +add.f16x2 %1, r317, r318; +} +{ +sub.f16x2 %2, r314, r315; +} +{ +sub.f16x2 %3, r317, r318; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<806, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<374>; +.reg .b64 rd<2>; +mov.u32 r325, %tid.y; +shl.b32 r326, r325, 9; +mov.u32 r327, %4; +add.s32 r328, r327, r326; +mov.u32 r329, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r330, r329, 63; +shl.b32 r331, r329, 3; +and.b32 r332, r331, -512; +add.s32 r333, r328, r332; +cvt.rn.f32.u32 f37, r330; +mul.f32 f38, f37, 0f3D490FDB; +cos.approx.f32 f1, f38; +sin.approx.f32 f39, f38; +neg.f32 f2, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r334, r331, 504; +add.s32 r335, 
r333, r334; +st.shared.v2.f32 [r335], {r1, r25}; +barrier.sync 0; +shl.b32 r336, r329, 2; +and.b32 r337, r336, 252; +sub.s32 r338, r335, r337; +ld.shared.u32 r54, [r338]; +ld.shared.u32 r55, [r338+256]; +barrier.sync 0; +st.shared.v2.f32 [r335], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r338]; +ld.shared.u32 r58, [r338+256]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r339, r329, 1, 5; +and.b32 r340, r336, 4; +add.s32 r341, r333, r340; +cvt.rn.f32.u32 f40, r339; +mul.f32 f41, f40, 0f3DC90FDB; +cos.approx.f32 f7, f41; +sin.approx.f32 f42, f41; +neg.f32 f8, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r342, r331, 496; +add.s32 r343, r341, r342; +st.shared.u32 [r343], r53; +st.shared.u32 [r343+8], r77; +barrier.sync 0; +and.b32 r344, r336, 248; +sub.s32 r345, r343, r344; +ld.shared.u32 r106, [r345]; +ld.shared.u32 r107, [r345+256]; +barrier.sync 0; +st.shared.u32 [r343], r56; +st.shared.u32 [r343+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r345]; +ld.shared.u32 r110, [r345+256]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r346, r329, 2, 4; +and.b32 r347, r336, 12; +add.s32 r348, r333, r347; +cvt.rn.f32.u32 f43, r346; +mul.f32 f44, f43, 0f3E490FDB; +cos.approx.f32 f13, f44; +sin.approx.f32 f45, f44; +neg.f32 f14, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r349, r331, 480; +add.s32 r350, r348, r349; +st.shared.u32 [r350], r105; +st.shared.u32 [r350+16], r129; +barrier.sync 0; +and.b32 r351, r336, 240; +sub.s32 r352, r350, r351; +ld.shared.u32 r158, [r352]; +ld.shared.u32 r159, [r352+256]; +barrier.sync 0; +st.shared.u32 [r350], r108; +st.shared.u32 [r350+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r352]; +ld.shared.u32 r162, [r352+256]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r353, r329, 3, 3; +and.b32 r354, r336, 28; +add.s32 r355, r333, r354; +cvt.rn.f32.u32 f46, r353; +mul.f32 f47, f46, 0f3EC90FDB; +cos.approx.f32 f19, f47; +sin.approx.f32 f48, f47; +neg.f32 f20, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +barrier.sync 0; +and.b32 r356, r331, 448; +add.s32 r357, r355, r356; +st.shared.u32 [r357], r157; +st.shared.u32 [r357+32], r181; +barrier.sync 0; +and.b32 r358, r336, 224; +sub.s32 r359, r357, r358; +ld.shared.u32 r210, [r359]; +ld.shared.u32 r211, [r359+256]; +barrier.sync 0; +st.shared.u32 [r357], r160; +st.shared.u32 [r357+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r359]; +ld.shared.u32 r214, [r359+256]; +{ 
+add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r360, r329, 4, 2; +and.b32 r361, r336, 60; +add.s32 r362, r333, r361; +cvt.rn.f32.u32 f49, r360; +mul.f32 f50, f49, 0f3F490FDB; +cos.approx.f32 f25, f50; +sin.approx.f32 f51, f50; +neg.f32 f26, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +barrier.sync 0; +and.b32 r363, r331, 384; +add.s32 r364, r362, r363; +st.shared.u32 [r364], r209; +st.shared.u32 [r364+64], r233; +barrier.sync 0; +and.b32 r365, r336, 192; +sub.s32 r366, r364, r365; +ld.shared.u32 r262, [r366]; +ld.shared.u32 r263, [r366+256]; +barrier.sync 0; +st.shared.u32 [r364], r212; +st.shared.u32 [r364+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r366]; +ld.shared.u32 r266, [r366+256]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r367, r329, 5, 1; +and.b32 r368, r336, 124; +add.s32 r369, r333, r368; +cvt.rn.f32.u32 f52, r367; +mul.f32 f53, f52, 0f3FC90FDB; +cos.approx.f32 f31, f53; +sin.approx.f32 f54, f53; +neg.f32 f32, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} +{ +mul.f16x2 r289, r267, r278; +} 
+{ +fma.rn.f16x2 r292, r270, r276, r289; +} +barrier.sync 0; +and.b32 r370, r331, 256; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r261; +st.shared.u32 [r371+128], r285; +barrier.sync 0; +and.b32 r372, r336, 128; +sub.s32 r373, r371, r372; +ld.shared.u32 r314, [r373]; +ld.shared.u32 r315, [r373+256]; +barrier.sync 0; +st.shared.u32 [r371], r264; +st.shared.u32 [r371+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r373]; +ld.shared.u32 r318, [r373+256]; +{ +add.f16x2 %0, r314, r315; +} +{ +add.f16x2 %1, r317, r318; +} +{ +sub.f16x2 %2, r314, r315; +} +{ +sub.f16x2 %3, r317, r318; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..85c1053109798 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_inv.hpp.inc @@ -0,0 +1,17074 @@ +#ifndef CUFFTDX_FFT_128_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_128_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<999, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<210>; +.reg .b32 r<1579>; +.reg .b64 rd<2>; +mov.u32 r1567, %tid.y; +shl.b32 r1568, r1567, 9; +mov.u32 r1569, %32; +add.s32 r1570, r1569, r1568; +mov.u32 r1571, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; 
+} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f190, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f188, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ 
+add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ 
+mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} 
+{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1572, r1571, 7; +shl.b32 r1573, r1571, 6; +and.b32 r1574, r1573, -512; +add.s32 r1575, r1570, r1574; +cvt.rn.f32.u32 f207, r1572; +mul.f32 f208, f207, 0f3D490FDB; +cos.approx.f32 f117, f208; +sin.approx.f32 f209, f208; +neg.f32 f118, f209; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ 
+fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, 
r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1576, r1573, 448; +add.s32 r1577, r1575, r1576; +st.shared.v4.f32 [r1577], {r521, r627, r664, r701}; +st.shared.v4.f32 [r1577+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r1577+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r1577+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r1578, r1572, -60, r1577; +ld.shared.u32 r1176, [r1578]; +ld.shared.u32 r1372, [r1578+32]; +ld.shared.u32 r1226, [r1578+64]; +ld.shared.u32 r1422, [r1578+96]; +ld.shared.u32 r1188, [r1578+128]; +ld.shared.u32 r1384, [r1578+160]; +ld.shared.u32 r1238, [r1578+192]; +ld.shared.u32 r1434, [r1578+224]; +ld.shared.u32 r1177, [r1578+256]; +ld.shared.u32 r1373, [r1578+288]; +ld.shared.u32 r1227, [r1578+320]; +ld.shared.u32 r1423, [r1578+352]; +ld.shared.u32 r1189, [r1578+384]; +ld.shared.u32 r1385, [r1578+416]; +ld.shared.u32 r1239, [r1578+448]; +ld.shared.u32 
r1435, [r1578+480]; +barrier.sync 0; +st.shared.v4.f32 [r1577], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1577+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1577+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1577+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1578]; +ld.shared.u32 r1375, [r1578+32]; +ld.shared.u32 r1229, [r1578+64]; +ld.shared.u32 r1425, [r1578+96]; +ld.shared.u32 r1191, [r1578+128]; +ld.shared.u32 r1387, [r1578+160]; +ld.shared.u32 r1241, [r1578+192]; +ld.shared.u32 r1437, [r1578+224]; +ld.shared.u32 r1180, [r1578+256]; +ld.shared.u32 r1376, [r1578+288]; +ld.shared.u32 r1230, [r1578+320]; +ld.shared.u32 r1426, [r1578+352]; +ld.shared.u32 r1192, [r1578+384]; +ld.shared.u32 r1388, [r1578+416]; +ld.shared.u32 r1242, [r1578+448]; +ld.shared.u32 r1438, [r1578+480]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ 
+add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 %0, r1201, r1251; +} +{ +add.f16x2 %1, r1204, r1254; +} +{ +sub.f16x2 %16, r1201, r1251; +} +{ +sub.f16x2 %17, r1204, r1254; +} +{ +add.f16x2 %4, r1213, r1295; +} +{ +add.f16x2 %5, r1216, r1301; +} +{ +sub.f16x2 %20, r1213, r1295; +} +{ +sub.f16x2 %21, r1216, r1301; +} +{ +add.f16x2 %8, r1207, r1305; +} +{ +add.f16x2 %9, r1210, r1257; +} +{ +sub.f16x2 %24, r1207, r1305; +} +{ +sub.f16x2 %25, r1210, r1257; +} +{ +add.f16x2 %12, r1219, r1313; +} +{ +add.f16x2 %13, r1222, r1319; +} +{ +sub.f16x2 %28, r1219, r1313; +} +{ +sub.f16x2 %29, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, 
r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 %2, r1397, r1447; +} +{ +add.f16x2 %3, r1400, r1450; +} +{ +sub.f16x2 %18, r1397, r1447; +} +{ +sub.f16x2 %19, r1400, r1450; +} +{ +add.f16x2 %6, r1409, r1491; +} +{ +add.f16x2 %7, r1412, r1497; +} +{ +sub.f16x2 %22, r1409, r1491; +} +{ +sub.f16x2 %23, r1412, r1497; 
+} +{ +add.f16x2 %10, r1403, r1501; +} +{ +add.f16x2 %11, r1406, r1453; +} +{ +sub.f16x2 %26, r1403, r1501; +} +{ +sub.f16x2 %27, r1406, r1453; +} +{ +add.f16x2 %14, r1415, r1509; +} +{ +add.f16x2 %15, r1418, r1515; +} +{ +sub.f16x2 %30, r1415, r1509; +} +{ +sub.f16x2 %31, r1418, r1515; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), 
"r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1000, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<985>; +.reg .b64 rd<2>; +mov.u32 r965, %tid.y; +shl.b32 r966, r965, 10; +mov.u32 r967, %16; +add.s32 r968, r967, r966; +mov.u32 r969, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 
r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r970, r969, 15; +shl.b32 r971, r969, 6; +and.b32 r972, r971, -1024; +add.s32 r973, r968, r972; +cvt.rn.f32.u32 f93, r970; +mul.f32 f94, f93, 0f3D490FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; 
+mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ 
+mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r974, r971, 960; +add.s32 r975, r973, r974; +st.shared.v4.f32 [r975], {r149, r152, r207, r216}; +st.shared.v4.f32 [r975+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r975+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r975+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r976, r970, -56, r975; +ld.shared.u32 r460, [r976]; +ld.shared.u32 r463, [r976+4]; +ld.shared.u32 r510, [r976+128]; +ld.shared.u32 r513, [r976+132]; +ld.shared.u32 r472, [r976+256]; +ld.shared.u32 r475, [r976+260]; +ld.shared.u32 r522, [r976+384]; +ld.shared.u32 r525, [r976+388]; +ld.shared.u32 r461, [r976+512]; +ld.shared.u32 r464, [r976+516]; +ld.shared.u32 r511, [r976+640]; +ld.shared.u32 r514, [r976+644]; +ld.shared.u32 r473, [r976+768]; +ld.shared.u32 r476, [r976+772]; +ld.shared.u32 r523, [r976+896]; +ld.shared.u32 r526, [r976+900]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, 
r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 
r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r977, r969, 8; +bfe.u32 r978, r969, 3, 1; +cvt.rn.f32.u32 f96, r978; +mul.f32 f97, f96, 0f3EC90FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; 
+} +shl.b32 r979, r969, 3; +and.b32 r980, r979, 56; +add.s32 r981, r973, r980; +barrier.sync 0; +and.b32 r982, r971, 512; +add.s32 r983, r981, r982; +st.shared.u32 [r983], r607; +st.shared.u32 [r983+4], r610; +st.shared.u32 [r983+64], r665; +st.shared.u32 [r983+68], r674; +st.shared.u32 [r983+128], r702; +st.shared.u32 [r983+132], r711; +st.shared.u32 [r983+192], r739; +st.shared.u32 [r983+196], r748; +st.shared.u32 [r983+256], r776; +st.shared.u32 [r983+260], r785; +st.shared.u32 [r983+320], r813; +st.shared.u32 [r983+324], r822; +st.shared.u32 [r983+384], r850; +st.shared.u32 [r983+388], r859; +st.shared.u32 [r983+448], r887; +st.shared.u32 [r983+452], r896; +barrier.sync 0; +mad.lo.s32 r984, r977, -56, r983; +ld.shared.u32 r918, [r984]; +ld.shared.u32 r921, [r984+4]; +ld.shared.u32 r930, [r984+128]; +ld.shared.u32 r933, [r984+132]; +ld.shared.u32 r942, [r984+256]; +ld.shared.u32 r945, [r984+260]; +ld.shared.u32 r954, [r984+384]; +ld.shared.u32 r957, [r984+388]; +ld.shared.u32 r919, [r984+512]; +ld.shared.u32 r922, [r984+516]; +ld.shared.u32 r931, [r984+640]; +ld.shared.u32 r934, [r984+644]; +ld.shared.u32 r943, [r984+768]; +ld.shared.u32 r946, [r984+772]; +ld.shared.u32 r955, [r984+896]; +ld.shared.u32 r958, [r984+900]; +{ +add.f16x2 %0, r918, r919; +} +{ +add.f16x2 %1, r921, r922; +} +{ +sub.f16x2 %8, r918, r919; +} +{ +sub.f16x2 %9, r921, r922; +} +{ +add.f16x2 %2, r930, r931; +} +{ +add.f16x2 %3, r933, r934; +} +{ +sub.f16x2 %10, r930, r931; +} +{ +sub.f16x2 %11, r933, r934; +} +{ +add.f16x2 %4, r942, r943; +} +{ +add.f16x2 %5, r945, r946; +} +{ +sub.f16x2 %12, r942, r943; +} +{ +sub.f16x2 %13, r945, r946; +} +{ +add.f16x2 %6, r954, r955; +} +{ +add.f16x2 %7, r957, r958; +} +{ +sub.f16x2 %14, r954, r955; +} +{ +sub.f16x2 %15, r957, r958; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), 
"=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1001, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<210>; +.reg .b32 r<1579>; +.reg .b64 rd<2>; +mov.u32 r1567, %tid.y; +shl.b32 r1568, r1567, 10; +mov.u32 r1569, %32; +add.s32 r1570, r1569, r1568; +mov.u32 r1571, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; 
+} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f190, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f188, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; 
+} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, 
r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} 
+{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1572, r1571, 7; +shl.b32 
r1573, r1571, 7; +and.b32 r1574, r1573, -1024; +add.s32 r1575, r1570, r1574; +cvt.rn.f32.u32 f207, r1572; +mul.f32 f208, f207, 0f3D490FDB; +cos.approx.f32 f117, f208; +sin.approx.f32 f209, f208; +neg.f32 f118, f209; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, 
r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ 
+mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ 
+mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1576, r1573, 896; +add.s32 r1577, r1575, r1576; +st.shared.v4.f32 [r1577], {r521, r524, r627, r636}; +st.shared.v4.f32 [r1577+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r1577+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r1577+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r1577+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r1577+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r1577+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r1577+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r1578, r1572, -120, r1577; +ld.shared.u32 r1176, [r1578]; +ld.shared.u32 r1179, [r1578+4]; +ld.shared.u32 r1372, [r1578+64]; +ld.shared.u32 r1375, [r1578+68]; +ld.shared.u32 r1226, [r1578+128]; +ld.shared.u32 r1229, [r1578+132]; +ld.shared.u32 r1422, [r1578+192]; +ld.shared.u32 r1425, [r1578+196]; +ld.shared.u32 r1188, [r1578+256]; +ld.shared.u32 r1191, [r1578+260]; +ld.shared.u32 r1384, [r1578+320]; +ld.shared.u32 r1387, [r1578+324]; +ld.shared.u32 r1238, [r1578+384]; +ld.shared.u32 r1241, [r1578+388]; +ld.shared.u32 r1434, [r1578+448]; +ld.shared.u32 r1437, [r1578+452]; +ld.shared.u32 r1177, [r1578+512]; +ld.shared.u32 r1180, [r1578+516]; +ld.shared.u32 r1373, [r1578+576]; +ld.shared.u32 r1376, [r1578+580]; +ld.shared.u32 r1227, [r1578+640]; +ld.shared.u32 r1230, [r1578+644]; +ld.shared.u32 r1423, [r1578+704]; +ld.shared.u32 r1426, [r1578+708]; +ld.shared.u32 r1189, [r1578+768]; +ld.shared.u32 r1192, [r1578+772]; +ld.shared.u32 r1385, [r1578+832]; 
+ld.shared.u32 r1388, [r1578+836]; +ld.shared.u32 r1239, [r1578+896]; +ld.shared.u32 r1242, [r1578+900]; +ld.shared.u32 r1435, [r1578+960]; +ld.shared.u32 r1438, [r1578+964]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, 
r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 %0, r1201, r1251; +} +{ +add.f16x2 %1, r1204, r1254; +} +{ +sub.f16x2 %16, r1201, r1251; +} +{ +sub.f16x2 %17, r1204, r1254; +} +{ +add.f16x2 %4, r1213, r1295; +} +{ +add.f16x2 %5, r1216, r1301; +} +{ +sub.f16x2 %20, r1213, r1295; +} +{ +sub.f16x2 %21, r1216, r1301; +} +{ +add.f16x2 %8, r1207, r1305; +} +{ +add.f16x2 %9, r1210, r1257; +} +{ +sub.f16x2 %24, r1207, r1305; +} +{ +sub.f16x2 %25, r1210, r1257; +} +{ +add.f16x2 %12, r1219, r1313; +} +{ +add.f16x2 %13, r1222, r1319; +} +{ +sub.f16x2 %28, r1219, r1313; +} +{ +sub.f16x2 %29, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ 
+sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 %2, r1397, r1447; +} +{ +add.f16x2 %3, r1400, r1450; +} +{ +sub.f16x2 %18, r1397, r1447; +} +{ +sub.f16x2 %19, r1400, r1450; +} +{ +add.f16x2 %6, r1409, r1491; +} +{ +add.f16x2 %7, r1412, r1497; +} +{ +sub.f16x2 %22, r1409, r1491; +} +{ +sub.f16x2 %23, r1412, r1497; +} +{ +add.f16x2 %10, r1403, r1501; +} +{ +add.f16x2 %11, r1406, r1453; +} +{ +sub.f16x2 %26, r1403, r1501; +} +{ +sub.f16x2 %27, r1406, r1453; +} +{ +add.f16x2 %14, r1415, r1509; +} +{ +add.f16x2 %15, r1418, r1515; +} +{ +sub.f16x2 %30, r1415, r1509; +} +{ +sub.f16x2 %31, r1418, r1515; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1002, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<985>; +.reg .b64 rd<2>; +mov.u32 r965, %tid.y; +shl.b32 r966, r965, 9; +mov.u32 r967, %16; 
+add.s32 r968, r967, r966; +mov.u32 r969, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, 
r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r970, r969, 15; +shl.b32 r971, r969, 5; +and.b32 r972, r971, -512; +add.s32 r973, r968, r972; +cvt.rn.f32.u32 f93, r970; +mul.f32 f94, f93, 0f3D490FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, 
r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; 
+cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r974, r971, 480; +add.s32 r975, r973, r974; +st.shared.v4.f32 [r975], {r149, r207, r244, r281}; +st.shared.v4.f32 [r975+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r976, r970, -28, r975; +ld.shared.u32 r460, [r976]; +ld.shared.u32 r510, [r976+64]; +ld.shared.u32 r472, [r976+128]; +ld.shared.u32 r522, [r976+192]; +ld.shared.u32 r461, [r976+256]; +ld.shared.u32 r511, [r976+320]; +ld.shared.u32 r473, [r976+384]; +ld.shared.u32 r523, [r976+448]; +barrier.sync 0; +st.shared.v4.f32 [r975], {r152, r216, r253, r290}; +st.shared.v4.f32 [r975+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r976]; +ld.shared.u32 r513, [r976+64]; +ld.shared.u32 r475, [r976+128]; +ld.shared.u32 r525, [r976+192]; +ld.shared.u32 r464, [r976+256]; +ld.shared.u32 r514, [r976+320]; +ld.shared.u32 r476, [r976+384]; +ld.shared.u32 r526, [r976+448]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ 
+sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r977, r969, 8; +bfe.u32 r978, r969, 3, 1; +shl.b32 r979, r969, 2; +and.b32 r980, r979, 28; +add.s32 r981, r973, r980; +cvt.rn.f32.u32 f96, r978; +mul.f32 f97, f96, 0f3EC90FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, 
r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; 
+cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r982, r971, 256; +add.s32 r983, r981, r982; +st.shared.u32 [r983], r607; +st.shared.u32 [r983+32], r665; +st.shared.u32 [r983+64], r702; +st.shared.u32 [r983+96], r739; +st.shared.u32 [r983+128], r776; +st.shared.u32 [r983+160], r813; +st.shared.u32 [r983+192], r850; +st.shared.u32 [r983+224], r887; +barrier.sync 0; +mad.lo.s32 r984, r977, -28, r983; +ld.shared.u32 r918, [r984]; +ld.shared.u32 r930, [r984+64]; +ld.shared.u32 r942, [r984+128]; +ld.shared.u32 r954, [r984+192]; +ld.shared.u32 r919, [r984+256]; +ld.shared.u32 
r931, [r984+320]; +ld.shared.u32 r943, [r984+384]; +ld.shared.u32 r955, [r984+448]; +barrier.sync 0; +st.shared.u32 [r983], r610; +st.shared.u32 [r983+32], r674; +st.shared.u32 [r983+64], r711; +st.shared.u32 [r983+96], r748; +st.shared.u32 [r983+128], r785; +st.shared.u32 [r983+160], r822; +st.shared.u32 [r983+192], r859; +st.shared.u32 [r983+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r984]; +ld.shared.u32 r933, [r984+64]; +ld.shared.u32 r945, [r984+128]; +ld.shared.u32 r957, [r984+192]; +ld.shared.u32 r922, [r984+256]; +ld.shared.u32 r934, [r984+320]; +ld.shared.u32 r946, [r984+384]; +ld.shared.u32 r958, [r984+448]; +{ +add.f16x2 %0, r918, r919; +} +{ +add.f16x2 %1, r921, r922; +} +{ +sub.f16x2 %8, r918, r919; +} +{ +sub.f16x2 %9, r921, r922; +} +{ +add.f16x2 %2, r930, r931; +} +{ +add.f16x2 %3, r933, r934; +} +{ +sub.f16x2 %10, r930, r931; +} +{ +sub.f16x2 %11, r933, r934; +} +{ +add.f16x2 %4, r942, r943; +} +{ +add.f16x2 %5, r945, r946; +} +{ +sub.f16x2 %12, r942, r943; +} +{ +sub.f16x2 %13, r945, r946; +} +{ +add.f16x2 %6, r954, r955; +} +{ +add.f16x2 %7, r957, r958; +} +{ +sub.f16x2 %14, r954, r955; +} +{ +sub.f16x2 %15, r957, r958; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), 
"r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1003, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<544>; +.reg .b64 rd<2>; +mov.u32 r517, %tid.y; +shl.b32 r518, r517, 10; +mov.u32 r519, %8; +add.s32 r520, r519, r518; +mov.u32 r521, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r522, r521, 31; +shl.b32 r523, r521, 5; +and.b32 r524, r523, -1024; +add.s32 r525, r520, r524; +cvt.rn.f32.u32 f31, r522; +mul.f32 f32, f31, 0f3D490FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r526, r523, 992; +add.s32 r527, r525, r526; +st.shared.v4.f32 [r527], {r27, r30, r61, r70}; +st.shared.v4.f32 [r527+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r528, r522, -24, r527; +ld.shared.u32 r166, [r528]; +ld.shared.u32 r169, [r528+4]; +ld.shared.u32 r178, [r528+256]; +ld.shared.u32 r181, [r528+260]; +ld.shared.u32 r167, [r528+512]; +ld.shared.u32 r170, [r528+516]; +ld.shared.u32 r179, [r528+768]; +ld.shared.u32 r182, [r528+772]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ 
+sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r529, r521, 28; +bfe.u32 r530, r521, 2, 3; +cvt.rn.f32.u32 f34, r530; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +shl.b32 r531, r521, 3; +and.b32 r532, r531, 24; +add.s32 r533, r525, r532; +barrier.sync 0; +and.b32 r534, r523, 896; +add.s32 r535, r533, r534; +st.shared.u32 [r535], r191; +st.shared.u32 [r535+4], r194; +st.shared.u32 [r535+32], r225; +st.shared.u32 [r535+36], r234; +st.shared.u32 [r535+64], r262; +st.shared.u32 [r535+68], r271; +st.shared.u32 [r535+96], r299; +st.shared.u32 [r535+100], r308; +barrier.sync 0; +mad.lo.s32 r536, r529, -24, r535; +ld.shared.u32 r330, [r536]; +ld.shared.u32 r333, [r536+4]; +ld.shared.u32 r342, [r536+256]; +ld.shared.u32 r345, [r536+260]; +ld.shared.u32 r331, [r536+512]; +ld.shared.u32 r334, [r536+516]; +ld.shared.u32 r343, [r536+768]; +ld.shared.u32 r346, [r536+772]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, 
r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r537, r521, 16; +bfe.u32 r538, r521, 4, 1; +cvt.rn.f32.u32 f37, r538; +mul.f32 f38, f37, 0f3F490FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; 
+mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +and.b32 r539, r531, 120; +add.s32 r540, r525, r539; +barrier.sync 0; +and.b32 r541, r523, 512; +add.s32 r542, r540, r541; +st.shared.u32 [r542], r355; +st.shared.u32 [r542+4], r358; +st.shared.u32 [r542+128], r389; +st.shared.u32 [r542+132], r398; +st.shared.u32 [r542+256], r426; +st.shared.u32 [r542+260], r435; +st.shared.u32 [r542+384], r463; +st.shared.u32 [r542+388], r472; +barrier.sync 0; +mad.lo.s32 r543, r537, -24, r542; +ld.shared.u32 r494, [r543]; +ld.shared.u32 r497, [r543+4]; +ld.shared.u32 r506, [r543+256]; +ld.shared.u32 r509, [r543+260]; +ld.shared.u32 r495, [r543+512]; +ld.shared.u32 r498, [r543+516]; +ld.shared.u32 r507, [r543+768]; +ld.shared.u32 r510, [r543+772]; +{ +add.f16x2 %0, r494, r495; +} +{ +add.f16x2 %1, r497, r498; +} +{ +sub.f16x2 %4, r494, r495; +} +{ +sub.f16x2 %5, r497, r498; +} +{ +add.f16x2 %2, r506, r507; +} +{ +add.f16x2 %3, r509, r510; +} +{ +sub.f16x2 %6, r506, r507; +} +{ +sub.f16x2 %7, r509, r510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + +/* cufftdx_private_function<1004, __half2, 1>: inline-PTX kernel fragment working on four complex<__half2> values per thread. Asm operands: %0-%7 are write-only outputs bound to rmem[0..3].x/.y, %8 is smem, %9-%16 are the incoming rmem[0..3].x/.y, so rmem is overwritten with the result. The arithmetic (f16x2 add/sub butterflies, twiddles from cos.approx/sin.approx of tid.x-derived indices scaled by 0f3D490FDB, 0f3E490FDB and 0f3F490FDB, roughly pi/64, pi/16 and pi/4) is interleaved with three shared-memory exchanges addressed from smem + tid.y*512 plus tid.x-derived offsets. Each exchange runs as two passes of four 32-bit words per thread (one group of components, then the other), which gives twelve barrier.sync 0 instructions in total, so it presumably has to be reached by every thread of the block. NOTE(review): looks like a generated 128-point FFT (4 values x 32 lanes) with .x/.y as real/imaginary parts - confirm against the cuFFTDx generator. */ +template<> __forceinline__ __device__ void cufftdx_private_function<1004, 
__half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<544>; +.reg .b64 rd<2>; +mov.u32 r517, %tid.y; +shl.b32 r518, r517, 9; +mov.u32 r519, %8; +add.s32 r520, r519, r518; +mov.u32 r521, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r522, r521, 31; +shl.b32 r523, r521, 4; +and.b32 r524, r523, -512; +add.s32 r525, r520, r524; +cvt.rn.f32.u32 f31, r522; +mul.f32 f32, f31, 0f3D490FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r526, r523, 496; +add.s32 r527, r525, r526; +st.shared.v4.f32 [r527], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r528, r522, -12, r527; +ld.shared.u32 r166, [r528]; +ld.shared.u32 r178, [r528+128]; +ld.shared.u32 r167, [r528+256]; +ld.shared.u32 r179, [r528+384]; +barrier.sync 0; +st.shared.v4.f32 [r527], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r528]; +ld.shared.u32 r181, [r528+128]; +ld.shared.u32 r170, [r528+256]; +ld.shared.u32 r182, [r528+384]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ 
+add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r529, r521, 28; +bfe.u32 r530, r521, 2, 3; +shl.b32 r531, r521, 2; +and.b32 r532, r531, 12; +add.s32 r533, r525, r532; +cvt.rn.f32.u32 f34, r530; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +barrier.sync 0; +and.b32 r534, r523, 448; +add.s32 r535, r533, r534; +st.shared.u32 [r535], r191; +st.shared.u32 [r535+16], r225; +st.shared.u32 [r535+32], r262; +st.shared.u32 [r535+48], r299; +barrier.sync 0; +mad.lo.s32 r536, r529, -12, r535; +ld.shared.u32 r330, [r536]; +ld.shared.u32 r342, [r536+128]; +ld.shared.u32 r331, [r536+256]; +ld.shared.u32 r343, [r536+384]; +barrier.sync 0; +st.shared.u32 [r535], r194; +st.shared.u32 [r535+16], r234; +st.shared.u32 [r535+32], r271; +st.shared.u32 [r535+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r536]; +ld.shared.u32 r345, [r536+128]; +ld.shared.u32 r334, [r536+256]; +ld.shared.u32 r346, [r536+384]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r537, r521, 16; +bfe.u32 r538, r521, 4, 1; +and.b32 r539, r531, 60; +add.s32 r540, r525, r539; 
+cvt.rn.f32.u32 f37, r538; +mul.f32 f38, f37, 0f3F490FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +barrier.sync 0; +and.b32 r541, r523, 256; +add.s32 r542, r540, r541; +st.shared.u32 [r542], r355; +st.shared.u32 [r542+64], r389; +st.shared.u32 [r542+128], r426; +st.shared.u32 [r542+192], r463; +barrier.sync 0; +mad.lo.s32 r543, r537, -12, r542; +ld.shared.u32 r494, [r543]; +ld.shared.u32 r506, [r543+128]; +ld.shared.u32 r495, [r543+256]; +ld.shared.u32 r507, [r543+384]; +barrier.sync 0; +st.shared.u32 [r542], r358; +st.shared.u32 [r542+64], r398; +st.shared.u32 [r542+128], r435; +st.shared.u32 [r542+192], r472; +barrier.sync 0; +ld.shared.u32 r497, [r543]; +ld.shared.u32 r509, [r543+128]; +ld.shared.u32 r498, [r543+256]; +ld.shared.u32 r510, [r543+384]; +{ +add.f16x2 %0, r494, r495; +} +{ +add.f16x2 %1, r497, r498; +} +{ +sub.f16x2 %4, r494, r495; +} +{ +sub.f16x2 %5, r497, r498; +} +{ +add.f16x2 %2, r506, r507; +} +{ +add.f16x2 %3, r509, r510; +} +{ +sub.f16x2 %6, r506, r507; +} +{ +sub.f16x2 %7, r509, r510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1005, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3339>; +.reg .b64 rd<3>; +mov.u32 r3263, %tid.y; +shl.b32 r3264, r3263, 10; 
+mov.u32 r3265, %64; +add.s32 r3266, r3265, r3264; +mov.u32 r3267, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f280, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +mov.f32 f278, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; 
+} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %84, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %84, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, 
r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 
r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; 
+} +{ +add.f16x2 r863, %92, %83; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %83; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ 
+mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 
r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} 
+{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ 
+sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ 
+add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3268, r3267, 3; +shl.b32 r3269, r3267, 8; +and.b32 r3270, r3269, -1024; +add.s32 r3271, r3266, r3270; +cvt.rn.f32.u32 f423, r3268; +mul.f32 f424, f423, 0f3D490FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 
r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 
r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; 
+} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ 
+fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ 
+fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, 
{low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r3272, r3269, 768; +add.s32 r3273, r3271, r3272; +st.shared.v4.f32 [r3273], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r3273+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r3273+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r3273+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r3273+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r3273+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r3273+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r3273+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r3273+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r3273+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r3273+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r3273+176], {r2500, r2509, r2537, 
r2546}; +st.shared.v4.f32 [r3273+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r3273+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r3273+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r3273+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r3274, r3268, -248, r3273; +ld.shared.u32 r2864, [r3274]; +ld.shared.u32 r2867, [r3274+4]; +ld.shared.u32 r2914, [r3274+32]; +ld.shared.u32 r2917, [r3274+36]; +ld.shared.u32 r2964, [r3274+64]; +ld.shared.u32 r2967, [r3274+68]; +ld.shared.u32 r3014, [r3274+96]; +ld.shared.u32 r3017, [r3274+100]; +ld.shared.u32 r3064, [r3274+128]; +ld.shared.u32 r3067, [r3274+132]; +ld.shared.u32 r3114, [r3274+160]; +ld.shared.u32 r3117, [r3274+164]; +ld.shared.u32 r3164, [r3274+192]; +ld.shared.u32 r3167, [r3274+196]; +ld.shared.u32 r3214, [r3274+224]; +ld.shared.u32 r3217, [r3274+228]; +ld.shared.u32 r2876, [r3274+256]; +ld.shared.u32 r2879, [r3274+260]; +ld.shared.u32 r2926, [r3274+288]; +ld.shared.u32 r2929, [r3274+292]; +ld.shared.u32 r2976, [r3274+320]; +ld.shared.u32 r2979, [r3274+324]; +ld.shared.u32 r3026, [r3274+352]; +ld.shared.u32 r3029, [r3274+356]; +ld.shared.u32 r3076, [r3274+384]; +ld.shared.u32 r3079, [r3274+388]; +ld.shared.u32 r3126, [r3274+416]; +ld.shared.u32 r3129, [r3274+420]; +ld.shared.u32 r3176, [r3274+448]; +ld.shared.u32 r3179, [r3274+452]; +ld.shared.u32 r3226, [r3274+480]; +ld.shared.u32 r3229, [r3274+484]; +ld.shared.u32 r2865, [r3274+512]; +ld.shared.u32 r2868, [r3274+516]; +ld.shared.u32 r2915, [r3274+544]; +ld.shared.u32 r2918, [r3274+548]; +ld.shared.u32 r2965, [r3274+576]; +ld.shared.u32 r2968, [r3274+580]; +ld.shared.u32 r3015, [r3274+608]; +ld.shared.u32 r3018, [r3274+612]; +ld.shared.u32 r3065, [r3274+640]; +ld.shared.u32 r3068, [r3274+644]; +ld.shared.u32 r3115, [r3274+672]; +ld.shared.u32 r3118, [r3274+676]; +ld.shared.u32 r3165, [r3274+704]; +ld.shared.u32 r3168, [r3274+708]; +ld.shared.u32 r3215, [r3274+736]; +ld.shared.u32 r3218, [r3274+740]; +ld.shared.u32 r2877, 
[r3274+768]; +ld.shared.u32 r2880, [r3274+772]; +ld.shared.u32 r2927, [r3274+800]; +ld.shared.u32 r2930, [r3274+804]; +ld.shared.u32 r2977, [r3274+832]; +ld.shared.u32 r2980, [r3274+836]; +ld.shared.u32 r3027, [r3274+864]; +ld.shared.u32 r3030, [r3274+868]; +ld.shared.u32 r3077, [r3274+896]; +ld.shared.u32 r3080, [r3274+900]; +ld.shared.u32 r3127, [r3274+928]; +ld.shared.u32 r3130, [r3274+932]; +ld.shared.u32 r3177, [r3274+960]; +ld.shared.u32 r3180, [r3274+964]; +ld.shared.u32 r3227, [r3274+992]; +ld.shared.u32 r3230, [r3274+996]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 %0, r2863, r2875; +} +{ +add.f16x2 %1, r2866, r2878; +} +{ +sub.f16x2 %32, r2863, r2875; +} +{ +sub.f16x2 %33, r2866, r2878; +} +{ +add.f16x2 %16, r2869, r2887; +} +{ +add.f16x2 %17, r2872, r2881; +} +{ +sub.f16x2 %48, r2869, r2887; +} +{ +sub.f16x2 %49, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 %2, r2913, r2925; +} +{ +add.f16x2 %3, r2916, r2928; +} +{ +sub.f16x2 %34, r2913, r2925; +} +{ +sub.f16x2 %35, r2916, r2928; +} +{ +add.f16x2 %18, r2919, r2937; +} +{ +add.f16x2 %19, r2922, r2931; +} +{ +sub.f16x2 %50, r2919, r2937; +} +{ +sub.f16x2 %51, r2922, r2931; +} +{ +add.f16x2 r2963, r2964, r2965; +} +{ +add.f16x2 r2966, r2967, r2968; +} +{ +sub.f16x2 r2969, r2964, r2965; +} +{ +sub.f16x2 r2972, r2967, r2968; +} +{ +add.f16x2 r2975, r2976, r2977; +} +{ +add.f16x2 r2978, r2979, 
r2980; +} +{ +sub.f16x2 r2981, r2976, r2977; +} +{ +sub.f16x2 r2984, r2979, r2980; +} +{ +neg.f16x2 r2987, r2984; +} +{ +add.f16x2 %4, r2963, r2975; +} +{ +add.f16x2 %5, r2966, r2978; +} +{ +sub.f16x2 %36, r2963, r2975; +} +{ +sub.f16x2 %37, r2966, r2978; +} +{ +add.f16x2 %20, r2969, r2987; +} +{ +add.f16x2 %21, r2972, r2981; +} +{ +sub.f16x2 %52, r2969, r2987; +} +{ +sub.f16x2 %53, r2972, r2981; +} +{ +add.f16x2 r3013, r3014, r3015; +} +{ +add.f16x2 r3016, r3017, r3018; +} +{ +sub.f16x2 r3019, r3014, r3015; +} +{ +sub.f16x2 r3022, r3017, r3018; +} +{ +add.f16x2 r3025, r3026, r3027; +} +{ +add.f16x2 r3028, r3029, r3030; +} +{ +sub.f16x2 r3031, r3026, r3027; +} +{ +sub.f16x2 r3034, r3029, r3030; +} +{ +neg.f16x2 r3037, r3034; +} +{ +add.f16x2 %6, r3013, r3025; +} +{ +add.f16x2 %7, r3016, r3028; +} +{ +sub.f16x2 %38, r3013, r3025; +} +{ +sub.f16x2 %39, r3016, r3028; +} +{ +add.f16x2 %22, r3019, r3037; +} +{ +add.f16x2 %23, r3022, r3031; +} +{ +sub.f16x2 %54, r3019, r3037; +} +{ +sub.f16x2 %55, r3022, r3031; +} +{ +add.f16x2 r3063, r3064, r3065; +} +{ +add.f16x2 r3066, r3067, r3068; +} +{ +sub.f16x2 r3069, r3064, r3065; +} +{ +sub.f16x2 r3072, r3067, r3068; +} +{ +add.f16x2 r3075, r3076, r3077; +} +{ +add.f16x2 r3078, r3079, r3080; +} +{ +sub.f16x2 r3081, r3076, r3077; +} +{ +sub.f16x2 r3084, r3079, r3080; +} +{ +neg.f16x2 r3087, r3084; +} +{ +add.f16x2 %8, r3063, r3075; +} +{ +add.f16x2 %9, r3066, r3078; +} +{ +sub.f16x2 %40, r3063, r3075; +} +{ +sub.f16x2 %41, r3066, r3078; +} +{ +add.f16x2 %24, r3069, r3087; +} +{ +add.f16x2 %25, r3072, r3081; +} +{ +sub.f16x2 %56, r3069, r3087; +} +{ +sub.f16x2 %57, r3072, r3081; +} +{ +add.f16x2 r3113, r3114, r3115; +} +{ +add.f16x2 r3116, r3117, r3118; +} +{ +sub.f16x2 r3119, r3114, r3115; +} +{ +sub.f16x2 r3122, r3117, r3118; +} +{ +add.f16x2 r3125, r3126, r3127; +} +{ +add.f16x2 r3128, r3129, r3130; +} +{ +sub.f16x2 r3131, r3126, r3127; +} +{ +sub.f16x2 r3134, r3129, r3130; +} +{ +neg.f16x2 r3137, r3134; +} +{ +add.f16x2 %10, 
r3113, r3125; +} +{ +add.f16x2 %11, r3116, r3128; +} +{ +sub.f16x2 %42, r3113, r3125; +} +{ +sub.f16x2 %43, r3116, r3128; +} +{ +add.f16x2 %26, r3119, r3137; +} +{ +add.f16x2 %27, r3122, r3131; +} +{ +sub.f16x2 %58, r3119, r3137; +} +{ +sub.f16x2 %59, r3122, r3131; +} +{ +add.f16x2 r3163, r3164, r3165; +} +{ +add.f16x2 r3166, r3167, r3168; +} +{ +sub.f16x2 r3169, r3164, r3165; +} +{ +sub.f16x2 r3172, r3167, r3168; +} +{ +add.f16x2 r3175, r3176, r3177; +} +{ +add.f16x2 r3178, r3179, r3180; +} +{ +sub.f16x2 r3181, r3176, r3177; +} +{ +sub.f16x2 r3184, r3179, r3180; +} +{ +neg.f16x2 r3187, r3184; +} +{ +add.f16x2 %12, r3163, r3175; +} +{ +add.f16x2 %13, r3166, r3178; +} +{ +sub.f16x2 %44, r3163, r3175; +} +{ +sub.f16x2 %45, r3166, r3178; +} +{ +add.f16x2 %28, r3169, r3187; +} +{ +add.f16x2 %29, r3172, r3181; +} +{ +sub.f16x2 %60, r3169, r3187; +} +{ +sub.f16x2 %61, r3172, r3181; +} +{ +add.f16x2 r3213, r3214, r3215; +} +{ +add.f16x2 r3216, r3217, r3218; +} +{ +sub.f16x2 r3219, r3214, r3215; +} +{ +sub.f16x2 r3222, r3217, r3218; +} +{ +add.f16x2 r3225, r3226, r3227; +} +{ +add.f16x2 r3228, r3229, r3230; +} +{ +sub.f16x2 r3231, r3226, r3227; +} +{ +sub.f16x2 r3234, r3229, r3230; +} +{ +neg.f16x2 r3237, r3234; +} +{ +add.f16x2 %14, r3213, r3225; +} +{ +add.f16x2 %15, r3216, r3228; +} +{ +sub.f16x2 %46, r3213, r3225; +} +{ +sub.f16x2 %47, r3216, r3228; +} +{ +add.f16x2 %30, r3219, r3237; +} +{ +add.f16x2 %31, r3222, r3231; +} +{ +sub.f16x2 %62, r3219, r3237; +} +{ +sub.f16x2 %63, r3222, r3231; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), 
"=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), 
"r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1006, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3339>; +.reg .b64 rd<3>; +mov.u32 r3263, %tid.y; +shl.b32 r3264, r3263, 9; 
+mov.u32 r3265, %64; +add.s32 r3266, r3265, r3264; +mov.u32 r3267, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f280, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +mov.f32 f278, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; 
+} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %84, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %84, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, 
r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 
r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; 
+} +{ +add.f16x2 r863, %92, %83; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %83; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ 
+mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 
r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} 
+{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ 
+sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ 
+add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3268, r3267, 3; +shl.b32 r3269, r3267, 7; +and.b32 r3270, r3269, -512; +add.s32 r3271, r3266, r3270; +cvt.rn.f32.u32 f423, r3268; +mul.f32 f424, f423, 0f3D490FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, 
r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ 
+fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, 
r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, 
r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; 
+mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ 
+mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; 
+mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r3272, r3269, 384; +add.s32 r3273, r3271, r3272; +st.shared.v4.f32 [r3273], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r3273+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r3273+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r3273+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r3273+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r3273+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r3273+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r3273+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r3274, r3268, -124, r3273; +ld.shared.u32 r2864, [r3274]; +ld.shared.u32 r2914, [r3274+16]; +ld.shared.u32 r2964, [r3274+32]; +ld.shared.u32 r3014, [r3274+48]; +ld.shared.u32 r3064, [r3274+64]; +ld.shared.u32 r3114, [r3274+80]; 
+ld.shared.u32 r3164, [r3274+96]; +ld.shared.u32 r3214, [r3274+112]; +ld.shared.u32 r2876, [r3274+128]; +ld.shared.u32 r2926, [r3274+144]; +ld.shared.u32 r2976, [r3274+160]; +ld.shared.u32 r3026, [r3274+176]; +ld.shared.u32 r3076, [r3274+192]; +ld.shared.u32 r3126, [r3274+208]; +ld.shared.u32 r3176, [r3274+224]; +ld.shared.u32 r3226, [r3274+240]; +ld.shared.u32 r2865, [r3274+256]; +ld.shared.u32 r2915, [r3274+272]; +ld.shared.u32 r2965, [r3274+288]; +ld.shared.u32 r3015, [r3274+304]; +ld.shared.u32 r3065, [r3274+320]; +ld.shared.u32 r3115, [r3274+336]; +ld.shared.u32 r3165, [r3274+352]; +ld.shared.u32 r3215, [r3274+368]; +ld.shared.u32 r2877, [r3274+384]; +ld.shared.u32 r2927, [r3274+400]; +ld.shared.u32 r2977, [r3274+416]; +ld.shared.u32 r3027, [r3274+432]; +ld.shared.u32 r3077, [r3274+448]; +ld.shared.u32 r3127, [r3274+464]; +ld.shared.u32 r3177, [r3274+480]; +ld.shared.u32 r3227, [r3274+496]; +barrier.sync 0; +st.shared.v4.f32 [r3273], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3273+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3273+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3273+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3273+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3273+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3273+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3273+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3274]; +ld.shared.u32 r2917, [r3274+16]; +ld.shared.u32 r2967, [r3274+32]; +ld.shared.u32 r3017, [r3274+48]; +ld.shared.u32 r3067, [r3274+64]; +ld.shared.u32 r3117, [r3274+80]; +ld.shared.u32 r3167, [r3274+96]; +ld.shared.u32 r3217, [r3274+112]; +ld.shared.u32 r2879, [r3274+128]; +ld.shared.u32 r2929, [r3274+144]; +ld.shared.u32 r2979, [r3274+160]; +ld.shared.u32 r3029, [r3274+176]; +ld.shared.u32 r3079, [r3274+192]; +ld.shared.u32 r3129, [r3274+208]; +ld.shared.u32 r3179, [r3274+224]; +ld.shared.u32 r3229, [r3274+240]; +ld.shared.u32 r2868, 
[r3274+256]; +ld.shared.u32 r2918, [r3274+272]; +ld.shared.u32 r2968, [r3274+288]; +ld.shared.u32 r3018, [r3274+304]; +ld.shared.u32 r3068, [r3274+320]; +ld.shared.u32 r3118, [r3274+336]; +ld.shared.u32 r3168, [r3274+352]; +ld.shared.u32 r3218, [r3274+368]; +ld.shared.u32 r2880, [r3274+384]; +ld.shared.u32 r2930, [r3274+400]; +ld.shared.u32 r2980, [r3274+416]; +ld.shared.u32 r3030, [r3274+432]; +ld.shared.u32 r3080, [r3274+448]; +ld.shared.u32 r3130, [r3274+464]; +ld.shared.u32 r3180, [r3274+480]; +ld.shared.u32 r3230, [r3274+496]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 %0, r2863, r2875; +} +{ +add.f16x2 %1, r2866, r2878; +} +{ +sub.f16x2 %32, r2863, r2875; +} +{ +sub.f16x2 %33, r2866, r2878; +} +{ +add.f16x2 %16, r2869, r2887; +} +{ +add.f16x2 %17, r2872, r2881; +} +{ +sub.f16x2 %48, r2869, r2887; +} +{ +sub.f16x2 %49, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 %2, r2913, r2925; +} +{ +add.f16x2 %3, r2916, r2928; +} +{ +sub.f16x2 %34, r2913, r2925; +} +{ +sub.f16x2 %35, r2916, r2928; +} +{ +add.f16x2 %18, r2919, r2937; +} +{ +add.f16x2 %19, r2922, r2931; +} +{ +sub.f16x2 %50, r2919, r2937; +} +{ +sub.f16x2 %51, r2922, r2931; +} +{ +add.f16x2 r2963, r2964, r2965; +} +{ +add.f16x2 r2966, r2967, r2968; +} +{ +sub.f16x2 r2969, r2964, r2965; +} +{ +sub.f16x2 r2972, r2967, r2968; +} +{ +add.f16x2 r2975, r2976, r2977; +} +{ +add.f16x2 r2978, r2979, 
r2980; +} +{ +sub.f16x2 r2981, r2976, r2977; +} +{ +sub.f16x2 r2984, r2979, r2980; +} +{ +neg.f16x2 r2987, r2984; +} +{ +add.f16x2 %4, r2963, r2975; +} +{ +add.f16x2 %5, r2966, r2978; +} +{ +sub.f16x2 %36, r2963, r2975; +} +{ +sub.f16x2 %37, r2966, r2978; +} +{ +add.f16x2 %20, r2969, r2987; +} +{ +add.f16x2 %21, r2972, r2981; +} +{ +sub.f16x2 %52, r2969, r2987; +} +{ +sub.f16x2 %53, r2972, r2981; +} +{ +add.f16x2 r3013, r3014, r3015; +} +{ +add.f16x2 r3016, r3017, r3018; +} +{ +sub.f16x2 r3019, r3014, r3015; +} +{ +sub.f16x2 r3022, r3017, r3018; +} +{ +add.f16x2 r3025, r3026, r3027; +} +{ +add.f16x2 r3028, r3029, r3030; +} +{ +sub.f16x2 r3031, r3026, r3027; +} +{ +sub.f16x2 r3034, r3029, r3030; +} +{ +neg.f16x2 r3037, r3034; +} +{ +add.f16x2 %6, r3013, r3025; +} +{ +add.f16x2 %7, r3016, r3028; +} +{ +sub.f16x2 %38, r3013, r3025; +} +{ +sub.f16x2 %39, r3016, r3028; +} +{ +add.f16x2 %22, r3019, r3037; +} +{ +add.f16x2 %23, r3022, r3031; +} +{ +sub.f16x2 %54, r3019, r3037; +} +{ +sub.f16x2 %55, r3022, r3031; +} +{ +add.f16x2 r3063, r3064, r3065; +} +{ +add.f16x2 r3066, r3067, r3068; +} +{ +sub.f16x2 r3069, r3064, r3065; +} +{ +sub.f16x2 r3072, r3067, r3068; +} +{ +add.f16x2 r3075, r3076, r3077; +} +{ +add.f16x2 r3078, r3079, r3080; +} +{ +sub.f16x2 r3081, r3076, r3077; +} +{ +sub.f16x2 r3084, r3079, r3080; +} +{ +neg.f16x2 r3087, r3084; +} +{ +add.f16x2 %8, r3063, r3075; +} +{ +add.f16x2 %9, r3066, r3078; +} +{ +sub.f16x2 %40, r3063, r3075; +} +{ +sub.f16x2 %41, r3066, r3078; +} +{ +add.f16x2 %24, r3069, r3087; +} +{ +add.f16x2 %25, r3072, r3081; +} +{ +sub.f16x2 %56, r3069, r3087; +} +{ +sub.f16x2 %57, r3072, r3081; +} +{ +add.f16x2 r3113, r3114, r3115; +} +{ +add.f16x2 r3116, r3117, r3118; +} +{ +sub.f16x2 r3119, r3114, r3115; +} +{ +sub.f16x2 r3122, r3117, r3118; +} +{ +add.f16x2 r3125, r3126, r3127; +} +{ +add.f16x2 r3128, r3129, r3130; +} +{ +sub.f16x2 r3131, r3126, r3127; +} +{ +sub.f16x2 r3134, r3129, r3130; +} +{ +neg.f16x2 r3137, r3134; +} +{ +add.f16x2 %10, 
r3113, r3125; +} +{ +add.f16x2 %11, r3116, r3128; +} +{ +sub.f16x2 %42, r3113, r3125; +} +{ +sub.f16x2 %43, r3116, r3128; +} +{ +add.f16x2 %26, r3119, r3137; +} +{ +add.f16x2 %27, r3122, r3131; +} +{ +sub.f16x2 %58, r3119, r3137; +} +{ +sub.f16x2 %59, r3122, r3131; +} +{ +add.f16x2 r3163, r3164, r3165; +} +{ +add.f16x2 r3166, r3167, r3168; +} +{ +sub.f16x2 r3169, r3164, r3165; +} +{ +sub.f16x2 r3172, r3167, r3168; +} +{ +add.f16x2 r3175, r3176, r3177; +} +{ +add.f16x2 r3178, r3179, r3180; +} +{ +sub.f16x2 r3181, r3176, r3177; +} +{ +sub.f16x2 r3184, r3179, r3180; +} +{ +neg.f16x2 r3187, r3184; +} +{ +add.f16x2 %12, r3163, r3175; +} +{ +add.f16x2 %13, r3166, r3178; +} +{ +sub.f16x2 %44, r3163, r3175; +} +{ +sub.f16x2 %45, r3166, r3178; +} +{ +add.f16x2 %28, r3169, r3187; +} +{ +add.f16x2 %29, r3172, r3181; +} +{ +sub.f16x2 %60, r3169, r3187; +} +{ +sub.f16x2 %61, r3172, r3181; +} +{ +add.f16x2 r3213, r3214, r3215; +} +{ +add.f16x2 r3216, r3217, r3218; +} +{ +sub.f16x2 r3219, r3214, r3215; +} +{ +sub.f16x2 r3222, r3217, r3218; +} +{ +add.f16x2 r3225, r3226, r3227; +} +{ +add.f16x2 r3228, r3229, r3230; +} +{ +sub.f16x2 r3231, r3226, r3227; +} +{ +sub.f16x2 r3234, r3229, r3230; +} +{ +neg.f16x2 r3237, r3234; +} +{ +add.f16x2 %14, r3213, r3225; +} +{ +add.f16x2 %15, r3216, r3228; +} +{ +sub.f16x2 %46, r3213, r3225; +} +{ +sub.f16x2 %47, r3216, r3228; +} +{ +add.f16x2 %30, r3219, r3237; +} +{ +add.f16x2 %31, r3222, r3231; +} +{ +sub.f16x2 %62, r3219, r3237; +} +{ +sub.f16x2 %63, r3222, r3231; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), 
"=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), 
"r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + +/* cufftdx_private_function<1007, __half2, 1>: inline-PTX routine that reads rmem[0] and rmem[1] (.x/.y, operands %5-%8) and overwrites the same four __half2 words (outputs %0-%3). smem (%4) is used as a shared-memory base address; addresses are formed from smem + (tid.y << 10) + ((tid.x << 4) & -1024) plus further tid.x-derived offsets. The body performs seven f16x2 add/sub butterflies; after each of the first six, the difference term is multiplied by a factor packed from cos.approx(t) and the negated sin.approx(t), where t is a bit-field of tid.x scaled by 0f3D490FDB, 0f3DC90FDB, ... 0f3FC90FDB (pi/64 doubling up to pi/2), and all four result words are stored to shared memory and re-read at a different offset, with barrier.sync 0 before and after each store group. NOTE(review): this pattern looks like a 128-point radix-2 FFT using 64 threads per transform - confirm against the cuFFTDx database tables. NOTE(review): the asm statement declares no clobber list. */ +template<> __forceinline__ __device__ void cufftdx_private_function<1007, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<374>; +.reg .b64 rd<2>; +mov.u32 r325, %tid.y; +shl.b32 r326, r325, 10;
+mov.u32 r327, %4; +add.s32 r328, r327, r326; +mov.u32 r329, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r330, r329, 63; +shl.b32 r331, r329, 4; +and.b32 r332, r331, -1024; +add.s32 r333, r328, r332; +cvt.rn.f32.u32 f37, r330; +mul.f32 f38, f37, 0f3D490FDB; +cos.approx.f32 f1, f38; +sin.approx.f32 f39, f38; +neg.f32 f2, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r334, r331, 1008; +add.s32 r335, r333, r334; +st.shared.v2.f32 [r335], {r1, r4}; +st.shared.v2.f32 [r335+8], {r23, r32}; +barrier.sync 0; +shl.b32 r336, r329, 3; +and.b32 r337, r336, 504; +sub.s32 r338, r335, r337; +ld.shared.u32 r54, [r338]; +ld.shared.u32 r57, [r338+4]; +ld.shared.u32 r55, [r338+512]; +ld.shared.u32 r58, [r338+516]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r339, r329, 1, 5; +cvt.rn.f32.u32 f40, r339; +mul.f32 f41, f40, 0f3DC90FDB; +cos.approx.f32 f7, f41; +sin.approx.f32 f42, f41; +neg.f32 f8, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r340, r336, 8; +add.s32 r341, r333, r340;
+barrier.sync 0; +and.b32 r342, r331, 992; +add.s32 r343, r341, r342; +st.shared.u32 [r343], r53; +st.shared.u32 [r343+4], r56; +st.shared.u32 [r343+16], r75; +st.shared.u32 [r343+20], r84; +barrier.sync 0; +and.b32 r344, r336, 496; +sub.s32 r345, r343, r344; +ld.shared.u32 r106, [r345]; +ld.shared.u32 r109, [r345+4]; +ld.shared.u32 r107, [r345+512]; +ld.shared.u32 r110, [r345+516]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r346, r329, 2, 4; +cvt.rn.f32.u32 f43, r346; +mul.f32 f44, f43, 0f3E490FDB; +cos.approx.f32 f13, f44; +sin.approx.f32 f45, f44; +neg.f32 f14, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r347, r336, 24; +add.s32 r348, r333, r347; +barrier.sync 0; +and.b32 r349, r331, 960; +add.s32 r350, r348, r349; +st.shared.u32 [r350], r105; +st.shared.u32 [r350+4], r108; +st.shared.u32 [r350+32], r127; +st.shared.u32 [r350+36], r136; +barrier.sync 0; +and.b32 r351, r336, 480; +sub.s32 r352, r350, r351; +ld.shared.u32 r158, [r352]; +ld.shared.u32 r161, [r352+4]; +ld.shared.u32 r159, [r352+512]; +ld.shared.u32 r162, [r352+516]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r353, r329, 3, 3; +cvt.rn.f32.u32 f46, r353; +mul.f32 f47, f46, 0f3EC90FDB; +cos.approx.f32 f19, f47; +sin.approx.f32 f48, f47; +neg.f32 f20, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{
+.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +and.b32 r354, r336, 56; +add.s32 r355, r333, r354; +barrier.sync 0; +and.b32 r356, r331, 896; +add.s32 r357, r355, r356; +st.shared.u32 [r357], r157; +st.shared.u32 [r357+4], r160; +st.shared.u32 [r357+64], r179; +st.shared.u32 [r357+68], r188; +barrier.sync 0; +and.b32 r358, r336, 448; +sub.s32 r359, r357, r358; +ld.shared.u32 r210, [r359]; +ld.shared.u32 r213, [r359+4]; +ld.shared.u32 r211, [r359+512]; +ld.shared.u32 r214, [r359+516]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r360, r329, 4, 2; +cvt.rn.f32.u32 f49, r360; +mul.f32 f50, f49, 0f3F490FDB; +cos.approx.f32 f25, f50; +sin.approx.f32 f51, f50; +neg.f32 f26, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +and.b32 r361, r336, 120; +add.s32 r362, r333, r361; +barrier.sync 0; +and.b32 r363, r331, 768; +add.s32 r364, r362, r363; +st.shared.u32 [r364], r209; +st.shared.u32 [r364+4], r212; +st.shared.u32 [r364+128], r231; +st.shared.u32 [r364+132], r240; +barrier.sync 0; +and.b32 r365, r336, 384; +sub.s32 r366, r364, r365; +ld.shared.u32 r262, [r366]; +ld.shared.u32 r265, [r366+4]; +ld.shared.u32 r263, [r366+512]; +ld.shared.u32 r266, [r366+516];
+{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r367, r329, 5, 1; +cvt.rn.f32.u32 f52, r367; +mul.f32 f53, f52, 0f3FC90FDB; +cos.approx.f32 f31, f53; +sin.approx.f32 f54, f53; +neg.f32 f32, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +and.b32 r368, r336, 248; +add.s32 r369, r333, r368; +barrier.sync 0; +and.b32 r370, r331, 512; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r261; +st.shared.u32 [r371+4], r264; +st.shared.u32 [r371+256], r283; +st.shared.u32 [r371+260], r292; +barrier.sync 0; +and.b32 r372, r336, 256; +sub.s32 r373, r371, r372; +ld.shared.u32 r314, [r373]; +ld.shared.u32 r317, [r373+4]; +ld.shared.u32 r315, [r373+512]; +ld.shared.u32 r318, [r373+516]; +{ +add.f16x2 %0, r314, r315; +} +{ +add.f16x2 %1, r317, r318; +} +{ +sub.f16x2 %2, r314, r315; +} +{ +sub.f16x2 %3, r317, r318; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + +/* cufftdx_private_function<1008, __half2, 1>: inline-PTX routine with the same operand layout as the 1007 specialization (rmem[0] and rmem[1] read as %5-%8 and overwritten as %0-%3; smem (%4) as shared-memory base) and the same sequence of seven f16x2 add/sub butterflies with six cos.approx/sin.approx factors (scales 0f3D490FDB ... 0f3FC90FDB, i.e. pi/64 doubling up to pi/2). Differences visible in the PTX: the base offset is (tid.y << 9) + ((tid.x << 3) & -512), and each shared-memory exchange is done in two passes - first one word of the sum and one word of the rotated difference (r1, r23 in the first stage) are stored and re-read, then the other pair (r4, r32) goes through the same addresses, with barrier.sync 0 separating the passes. NOTE(review): presumably a variant of the same transform that trades extra barriers for half the shared-memory footprint - confirm. NOTE(review): the asm statement declares no clobber list. */ +template<> __forceinline__ __device__ void cufftdx_private_function<1008, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<374>; +.reg .b64 rd<2>; +mov.u32 r325, %tid.y; +shl.b32 r326, r325, 9; +mov.u32 r327, %4; +add.s32 r328, r327, r326; +mov.u32 r329, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{
+add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r330, r329, 63; +shl.b32 r331, r329, 3; +and.b32 r332, r331, -512; +add.s32 r333, r328, r332; +cvt.rn.f32.u32 f37, r330; +mul.f32 f38, f37, 0f3D490FDB; +cos.approx.f32 f1, f38; +sin.approx.f32 f39, f38; +neg.f32 f2, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r334, r331, 504; +add.s32 r335, r333, r334; +st.shared.v2.f32 [r335], {r1, r23}; +barrier.sync 0; +shl.b32 r336, r329, 2; +and.b32 r337, r336, 252; +sub.s32 r338, r335, r337; +ld.shared.u32 r54, [r338]; +ld.shared.u32 r55, [r338+256]; +barrier.sync 0; +st.shared.v2.f32 [r335], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r338]; +ld.shared.u32 r58, [r338+256]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r339, r329, 1, 5; +and.b32 r340, r336, 4; +add.s32 r341, r333, r340; +cvt.rn.f32.u32 f40, r339; +mul.f32 f41, f40, 0f3DC90FDB; +cos.approx.f32 f7, f41; +sin.approx.f32 f42, f41; +neg.f32 f8, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r342, r331, 496; +add.s32 r343, r341, r342;
+st.shared.u32 [r343], r53; +st.shared.u32 [r343+8], r75; +barrier.sync 0; +and.b32 r344, r336, 248; +sub.s32 r345, r343, r344; +ld.shared.u32 r106, [r345]; +ld.shared.u32 r107, [r345+256]; +barrier.sync 0; +st.shared.u32 [r343], r56; +st.shared.u32 [r343+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r345]; +ld.shared.u32 r110, [r345+256]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r346, r329, 2, 4; +and.b32 r347, r336, 12; +add.s32 r348, r333, r347; +cvt.rn.f32.u32 f43, r346; +mul.f32 f44, f43, 0f3E490FDB; +cos.approx.f32 f13, f44; +sin.approx.f32 f45, f44; +neg.f32 f14, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r349, r331, 480; +add.s32 r350, r348, r349; +st.shared.u32 [r350], r105; +st.shared.u32 [r350+16], r127; +barrier.sync 0; +and.b32 r351, r336, 240; +sub.s32 r352, r350, r351; +ld.shared.u32 r158, [r352]; +ld.shared.u32 r159, [r352+256]; +barrier.sync 0; +st.shared.u32 [r350], r108; +st.shared.u32 [r350+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r352]; +ld.shared.u32 r162, [r352+256]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r353, r329, 3, 3; +and.b32 r354, r336, 28; +add.s32 r355, r333, r354; +cvt.rn.f32.u32 f46, r353; +mul.f32 f47, f46, 0f3EC90FDB; +cos.approx.f32 f19, f47; +sin.approx.f32 f48, f47; +neg.f32 f20, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high,
f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +barrier.sync 0; +and.b32 r356, r331, 448; +add.s32 r357, r355, r356; +st.shared.u32 [r357], r157; +st.shared.u32 [r357+32], r179; +barrier.sync 0; +and.b32 r358, r336, 224; +sub.s32 r359, r357, r358; +ld.shared.u32 r210, [r359]; +ld.shared.u32 r211, [r359+256]; +barrier.sync 0; +st.shared.u32 [r357], r160; +st.shared.u32 [r357+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r359]; +ld.shared.u32 r214, [r359+256]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r360, r329, 4, 2; +and.b32 r361, r336, 60; +add.s32 r362, r333, r361; +cvt.rn.f32.u32 f49, r360; +mul.f32 f50, f49, 0f3F490FDB; +cos.approx.f32 f25, f50; +sin.approx.f32 f51, f50; +neg.f32 f26, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +barrier.sync 0; +and.b32 r363, r331, 384; +add.s32 r364, r362, r363; +st.shared.u32 [r364], r209; +st.shared.u32 [r364+64], r231; +barrier.sync 0; +and.b32 r365, r336, 192; +sub.s32 r366, r364, r365; +ld.shared.u32 r262, [r366]; +ld.shared.u32 r263, [r366+256]; +barrier.sync 0; +st.shared.u32 [r364], r212; +st.shared.u32 [r364+64], r240; +barrier.sync 0; +ld.shared.u32
r265, [r366]; +ld.shared.u32 r266, [r366+256]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r367, r329, 5, 1; +and.b32 r368, r336, 124; +add.s32 r369, r333, r368; +cvt.rn.f32.u32 f52, r367; +mul.f32 f53, f52, 0f3FC90FDB; +cos.approx.f32 f31, f53; +sin.approx.f32 f54, f53; +neg.f32 f32, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +barrier.sync 0; +and.b32 r370, r331, 256; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r261; +st.shared.u32 [r371+128], r283; +barrier.sync 0; +and.b32 r372, r336, 128; +sub.s32 r373, r371, r372; +ld.shared.u32 r314, [r373]; +ld.shared.u32 r315, [r373+256]; +barrier.sync 0; +st.shared.u32 [r371], r264; +st.shared.u32 [r371+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r373]; +ld.shared.u32 r318, [r373+256]; +{ +add.f16x2 %0, r314, r315; +} +{ +add.f16x2 %1, r317, r318; +} +{ +sub.f16x2 %2, r314, r315; +} +{ +sub.f16x2 %3, r317, r318; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp32_fwd.hpp.inc new file mode 100644 index
0000000000000..f8be0a72cb06d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp32_fwd.hpp.inc @@ -0,0 +1,4593 @@ +#ifndef CUFFTDX_FFT_128_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_128_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<51, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<535>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; 
+sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 
0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; 
+fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 
f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 448; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, 
f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+32]; +ld.shared.f32 f391, [r13+64]; +ld.shared.f32 f392, [r13+96]; +ld.shared.f32 f393, [r13+128]; +ld.shared.f32 f394, [r13+160]; +ld.shared.f32 f395, [r13+192]; +ld.shared.f32 f396, [r13+224]; +ld.shared.f32 f397, [r13+256]; +ld.shared.f32 f398, [r13+288]; +ld.shared.f32 f399, [r13+320]; +ld.shared.f32 f400, [r13+352]; +ld.shared.f32 f401, [r13+384]; +ld.shared.f32 f402, [r13+416]; +ld.shared.f32 f403, [r13+448]; +ld.shared.f32 f404, [r13+480]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+32]; +ld.shared.f32 f407, [r13+64]; +ld.shared.f32 f408, [r13+96]; +ld.shared.f32 f409, [r13+128]; +ld.shared.f32 f410, [r13+160]; +ld.shared.f32 f411, [r13+192]; +ld.shared.f32 f412, [r13+224]; +ld.shared.f32 f413, [r13+256]; +ld.shared.f32 f414, [r13+288]; +ld.shared.f32 f415, [r13+320]; +ld.shared.f32 f416, [r13+352]; +ld.shared.f32 f417, [r13+384]; +ld.shared.f32 f418, [r13+416]; +ld.shared.f32 f419, [r13+448]; +ld.shared.f32 f420, [r13+480]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; 
+add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f390, f398; +add.f32 f463, f406, f414; +sub.f32 f464, f390, f398; +sub.f32 f465, f406, f414; +add.f32 f466, f394, f402; +add.f32 f467, f410, f418; +sub.f32 f468, f394, f402; +sub.f32 f469, f410, f418; +add.f32 f470, f462, f466; +add.f32 f471, f463, f467; +sub.f32 f472, f462, f466; +sub.f32 f473, f463, f467; +add.f32 f474, f464, f469; +sub.f32 f475, f465, f468; +sub.f32 f476, f464, f469; +add.f32 f477, f465, f468; +add.f32 f478, f392, f400; +add.f32 f479, f408, f416; +sub.f32 f480, f392, f400; +sub.f32 f481, f408, f416; +add.f32 f482, f396, f404; +add.f32 f483, f412, f420; +sub.f32 f484, f396, f404; +sub.f32 f485, f412, f420; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +mul.f32 f494, f490, 0f3F3504F3; +mul.f32 f495, f491, 0fBF3504F3; +sub.f32 f496, f494, f495; +mul.f32 f497, f491, 0f3F3504F3; +fma.rn.f32 f498, f490, 0fBF3504F3, f497; +mul.f32 f499, f492, 0fBF3504F3; +mul.f32 f500, f493, 0fBF3504F3; +sub.f32 f501, f499, f500; +add.f32 f502, f499, f500; +add.f32 %0, f429, f445; +add.f32 %1, f430, f446; +add.f32 %2, f470, f486; +add.f32 %3, f471, f487; +add.f32 %5, f434, f457; +add.f32 %4, f433, f455; +add.f32 %7, f475, f498; +add.f32 %6, f474, f496; +sub.f32 %9, f432, f447; +add.f32 %8, f431, f448; +sub.f32 
%11, f473, f488; +add.f32 %10, f472, f489; +add.f32 %13, f436, f461; +add.f32 %12, f435, f460; +add.f32 %15, f477, f502; +add.f32 %14, f476, f501; +sub.f32 %16, f429, f445; +sub.f32 %17, f430, f446; +sub.f32 %18, f470, f486; +sub.f32 %19, f471, f487; +sub.f32 %21, f434, f457; +sub.f32 %20, f433, f455; +sub.f32 %23, f475, f498; +sub.f32 %22, f474, f496; +add.f32 %25, f432, f447; +sub.f32 %24, f431, f448; +add.f32 %27, f473, f488; +sub.f32 %26, f472, f489; +sub.f32 %29, f436, f461; +sub.f32 %28, f435, f460; +sub.f32 %31, f477, f502; +sub.f32 %30, f476, f501; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<52, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm 
volatile (R"({ +.reg .f32 f<365>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, 
f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 960; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; 
+ld.shared.v2.f32 {f163, f164}, [r13+128]; +ld.shared.v2.f32 {f167, f168}, [r13+256]; +ld.shared.v2.f32 {f171, f172}, [r13+384]; +ld.shared.v2.f32 {f175, f176}, [r13+512]; +ld.shared.v2.f32 {f179, f180}, [r13+640]; +ld.shared.v2.f32 {f183, f184}, [r13+768]; +ld.shared.v2.f32 {f187, f188}, [r13+896]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 8; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, 
f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 512; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 
[r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+128]; +ld.shared.v2.f32 {f325, f326}, [r19+256]; +ld.shared.v2.f32 {f329, f330}, [r19+384]; +ld.shared.v2.f32 {f333, f334}, [r19+512]; +ld.shared.v2.f32 {f337, f338}, [r19+640]; +ld.shared.v2.f32 {f341, f342}, [r19+768]; +ld.shared.v2.f32 {f345, f346}, [r19+896]; +add.f32 %1, f318, f334; +add.f32 %0, f317, f333; +add.f32 %3, f322, f338; +add.f32 %2, f321, f337; +add.f32 %5, f326, f342; +add.f32 %4, f325, f341; +add.f32 %7, f330, f346; +add.f32 %6, f329, f345; +sub.f32 %9, f318, f334; +sub.f32 %8, f317, f333; +sub.f32 %11, f322, f338; +sub.f32 %10, f321, f337; +sub.f32 %13, f326, f342; +sub.f32 %12, f325, f341; +sub.f32 %15, f330, f346; +sub.f32 %14, f329, f345; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<53, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<692>; +.reg .b32 r<22>; +.reg 
.b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f684, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f682, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f681, f684, f682; +sub.f32 f76, f684, f682; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f680, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f677, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f675, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f674, f677, f675; +sub.f32 f92, f677, f675; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f673, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f673, 0fBF3504F3; +mul.f32 f672, f93, 0f3F3504F3; +sub.f32 f99, f672, f98; +mul.f32 f100, f673, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f671, f681, f674; +sub.f32 f109, f681, f674; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f670, f680, f101; +sub.f32 f113, f680, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f669, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f668, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f666, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f663, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f662, f666, f663; +sub.f32 f133, f666, f663; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f661, f125, f128; +add.f32 f137, f125, f128; +add.f32 
f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f659, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f657, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f656, f659, f657; +sub.f32 f149, f659, f657; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f655, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f655, 0fBF3504F3; +mul.f32 f654, f150, 0f3F3504F3; +sub.f32 f156, f654, f155; +mul.f32 f157, f655, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f653, f662, f656; +sub.f32 f166, f662, f656; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f652, f661, f158; +sub.f32 f170, f661, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f651, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f650, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f648, f167, 0f3F6C835E; +mul.f32 f649, f652, 0fBEC3EF15; +sub.f32 f181, f648, f649; +mul.f32 f182, f652, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f646, f171, 0f3F3504F3; +mul.f32 f647, f651, 0fBF3504F3; +sub.f32 f186, f646, f647; +mul.f32 f187, f651, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f644, f175, 0f3EC3EF15; +mul.f32 f645, f650, 0fBF6C835E; +sub.f32 f191, f644, f645; +mul.f32 f192, f650, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f642, f169, 0fBEC3EF15; +mul.f32 f643, f170, 0fBF6C835E; +sub.f32 f196, f642, f643; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f640, f177, 0fBF6C835E; +mul.f32 f641, f178, 0fBEC3EF15; +sub.f32 f205, 
f640, f641; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f639, f670, f183; +sub.f32 f213, f670, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f638, f669, f188; +sub.f32 f217, f669, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f637, f668, f193; +sub.f32 f221, f668, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f636, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f635, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f634, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f633, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f639; +mul.f32 f244, f238, f639; +mul.f32 f246, f239, f239; +mul.f32 f632, f238, f238; +sub.f32 f247, f632, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f638; +mul.f32 f252, f247, f638; +mul.f32 f630, f238, f247; +mul.f32 f631, f239, f249; +sub.f32 f255, f630, f631; +mul.f32 f629, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f637; +mul.f32 f260, f255, f637; +mul.f32 f262, f239, f257; +mul.f32 f628, f238, f255; +sub.f32 f263, f628, f262; +mul.f32 f627, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f636; +mul.f32 f268, f263, f636; +mul.f32 f270, f239, f265; +mul.f32 f626, f238, f263; +sub.f32 f271, f626, f270; +mul.f32 f625, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, 
f635; +mul.f32 f276, f271, f635; +mul.f32 f623, f238, f271; +mul.f32 f624, f239, f273; +sub.f32 f279, f623, f624; +mul.f32 f622, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f634; +mul.f32 f284, f279, f634; +mul.f32 f286, f239, f281; +mul.f32 f621, f238, f279; +sub.f32 f287, f621, f286; +mul.f32 f620, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f633; +mul.f32 f292, f287, f633; +mul.f32 f294, f239, f289; +mul.f32 f619, f238, f287; +sub.f32 f295, f619, f294; +mul.f32 f618, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f617, f671, f653; +mul.f32 f299, f297, f617; +mul.f32 f300, f295, f617; +mul.f32 f615, f238, f295; +mul.f32 f616, f239, f297; +sub.f32 f303, f615, f616; +sub.f32 f614, f106, f163; +mul.f32 f613, f295, f614; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f612, f238, f303; +sub.f32 f311, f612, f310; +mul.f32 f611, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f609, f238, f311; +mul.f32 f610, f239, f313; +sub.f32 f319, f609, f610; +mul.f32 f608, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f607, f238, f319; +sub.f32 f327, f607, f326; +mul.f32 f606, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f605, f238, f327; +sub.f32 f335, f605, f334; +mul.f32 f604, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f602, f238, f335; +mul.f32 f603, f239, f337; +sub.f32 f343, f602, f603; +mul.f32 f601, f335, f228; 
+mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f600, f238, f343; +sub.f32 f351, f600, f350; +mul.f32 f599, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f598, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f689, f671, f653; +mul.f32 f688, f297, f689; +mov.u32 r21, %tid.x; +shl.b32 r20, r21, 7; +barrier.sync 0; +and.b32 r11, r20, 896; +add.s32 r12, r9, r11; +sub.f32 f691, f671, f653; +mul.f32 f690, f297, f691; +add.f32 f357, f671, f653; +sub.f32 f687, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 7; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f599, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f629, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f627, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f625, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f622, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f620, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f618, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f687, f300; +sub.f32 f374, f613, f690; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f611, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f608, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f606, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f604, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f601, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, 
f232, f348; +sub.f32 f386, f598, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +mad.lo.s32 r13, r18, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+64]; +ld.shared.v2.f32 {f397, f398}, [r13+128]; +ld.shared.v2.f32 {f401, f402}, [r13+192]; +ld.shared.v2.f32 {f405, f406}, [r13+256]; +ld.shared.v2.f32 {f409, f410}, [r13+320]; +ld.shared.v2.f32 {f413, f414}, [r13+384]; +ld.shared.v2.f32 {f417, f418}, [r13+448]; +ld.shared.v2.f32 {f421, f422}, [r13+512]; +ld.shared.v2.f32 {f425, f426}, [r13+576]; +ld.shared.v2.f32 {f429, f430}, [r13+640]; +ld.shared.v2.f32 {f433, f434}, [r13+704]; +ld.shared.v2.f32 {f437, f438}, [r13+768]; +ld.shared.v2.f32 {f441, f442}, [r13+832]; +ld.shared.v2.f32 {f445, f446}, [r13+896]; +ld.shared.v2.f32 {f449, f450}, [r13+960]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f597, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f596, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f595, f597, f596; +sub.f32 f464, f597, f596; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f594, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f593, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f592, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f591, f593, f592; +sub.f32 f480, f593, f592; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f590, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f588, f481, 0f3F3504F3; +mul.f32 f589, f590, 0fBF3504F3; +sub.f32 f487, f588, f589; +mul.f32 f488, f590, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 
f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f393, f425; +sub.f32 f496, f393, f425; +add.f32 f587, f394, f426; +sub.f32 f497, f394, f426; +add.f32 f498, f409, f441; +sub.f32 f500, f409, f441; +add.f32 f586, f410, f442; +sub.f32 f501, f410, f442; +add.f32 f502, f494, f498; +sub.f32 f504, f494, f498; +add.f32 f585, f587, f586; +sub.f32 f505, f587, f586; +add.f32 f506, f496, f501; +sub.f32 f508, f496, f501; +sub.f32 f584, f497, f500; +add.f32 f509, f497, f500; +add.f32 f510, f401, f433; +sub.f32 f512, f401, f433; +add.f32 f583, f402, f434; +sub.f32 f513, f402, f434; +add.f32 f514, f417, f449; +sub.f32 f516, f417, f449; +add.f32 f582, f418, f450; +sub.f32 f517, f418, f450; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f581, f583, f582; +sub.f32 f521, f583, f582; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f580, f513, f516; +add.f32 f525, f513, f516; +mul.f32 f527, f580, 0fBF3504F3; +mul.f32 f579, f522, 0f3F3504F3; +sub.f32 f528, f579, f527; +mul.f32 f529, f580, 0f3F3504F3; +fma.rn.f32 f530, f522, 0fBF3504F3, f529; +mul.f32 f531, f524, 0fBF3504F3; +mul.f32 f532, f525, 0fBF3504F3; +sub.f32 f533, f531, f532; +add.f32 f534, f531, f532; +add.f32 %1, f595, f591; +add.f32 %0, f461, f477; +add.f32 %3, f585, f581; +add.f32 %2, f502, f518; +add.f32 %4, f465, f487; +add.f32 %5, f594, f489; +add.f32 %6, f506, f528; +add.f32 %7, f584, f530; +add.f32 %8, f463, f480; +sub.f32 %9, f464, f479; +add.f32 %10, f504, f521; +sub.f32 %11, f505, f520; +add.f32 %13, f468, f493; +add.f32 %12, f467, f492; +add.f32 %15, f509, f534; +add.f32 %14, f508, f533; +sub.f32 %17, f595, f591; +sub.f32 %16, f461, f477; +sub.f32 %19, f585, f581; +sub.f32 %18, f502, f518; +sub.f32 %21, f594, f489; +sub.f32 %20, f465, f487; +sub.f32 %23, f584, f530; +sub.f32 %22, f506, f528; +add.f32 %25, f464, f479; +sub.f32 %24, f463, f480; +add.f32 %27, f505, f520; +sub.f32 %26, f504, f521; +sub.f32 %29, f468, f493; +sub.f32 %28, f467, f492; +sub.f32 %31, f509, f534; 
+sub.f32 %30, f508, f533; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<54, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<333>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; 
+add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 
f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 480; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+64]; +ld.shared.f32 f161, [r13+128]; +ld.shared.f32 f162, [r13+192]; +ld.shared.f32 f163, [r13+256]; +ld.shared.f32 f164, [r13+320]; +ld.shared.f32 f165, [r13+384]; +ld.shared.f32 f166, [r13+448]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+64]; +ld.shared.f32 f169, [r13+128]; +ld.shared.f32 f170, [r13+192]; +ld.shared.f32 f171, [r13+256]; +ld.shared.f32 f172, [r13+320]; +ld.shared.f32 f173, [r13+384]; +ld.shared.f32 f174, [r13+448]; +add.f32 f175, f159, f163; +add.f32 f176, f167, 
f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 8; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, 
f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 256; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; 
+ld.shared.f32 f302, [r20+64]; +ld.shared.f32 f303, [r20+128]; +ld.shared.f32 f304, [r20+192]; +ld.shared.f32 f305, [r20+256]; +ld.shared.f32 f306, [r20+320]; +ld.shared.f32 f307, [r20+384]; +ld.shared.f32 f308, [r20+448]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+64]; +ld.shared.f32 f311, [r20+128]; +ld.shared.f32 f312, [r20+192]; +ld.shared.f32 f313, [r20+256]; +ld.shared.f32 f314, [r20+320]; +ld.shared.f32 f315, [r20+384]; +ld.shared.f32 f316, [r20+448]; +add.f32 %0, f301, f305; +add.f32 %1, f309, f313; +add.f32 %2, f302, f306; +add.f32 %3, f310, f314; +add.f32 %4, f303, f307; +add.f32 %5, f311, f315; +add.f32 %6, f304, f308; +add.f32 %7, f312, f316; +sub.f32 %8, f301, f305; +sub.f32 %9, f309, f313; +sub.f32 %10, f302, f306; +sub.f32 %11, f310, f314; +sub.f32 %12, f303, f307; +sub.f32 %13, f311, f315; +sub.f32 %14, f304, f308; +sub.f32 %15, f312, f316; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<55, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<208>; 
+.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 992; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+256]; +ld.shared.v2.f32 {f70, f71}, [r13+512]; +ld.shared.v2.f32 {f74, f75}, [r13+768]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, 
f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 896; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+256]; +ld.shared.v2.f32 {f131, f132}, [r20+512]; +ld.shared.v2.f32 {f135, f136}, [r20+768]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 16; +bfe.u32 r22, r5, 4, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 
{f153, f154}, [rd11]; +mul.f32 f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 512; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; +st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+256]; +ld.shared.v2.f32 {f192, f193}, [r27+512]; +ld.shared.v2.f32 {f196, f197}, [r27+768]; +add.f32 %1, f185, f193; +add.f32 %0, f184, f192; +add.f32 %3, f189, f197; +add.f32 %2, f188, f196; +sub.f32 %5, f185, f193; +sub.f32 %4, f184, f192; +sub.f32 %7, f189, f197; +sub.f32 %6, f188, f196; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<56, float, 1>(cufftdx::detail::complex *rmem, unsigned 
smem){ + +asm volatile (R"({ +.reg .f32 f<184>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 496; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+128]; +ld.shared.f32 f64, [r13+256]; +ld.shared.f32 f65, [r13+384]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+128]; +ld.shared.f32 f68, [r13+256]; +ld.shared.f32 f69, [r13+384]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; 
+sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 448; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+128]; +ld.shared.f32 f117, [r21+256]; +ld.shared.f32 f118, [r21+384]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+128]; +ld.shared.f32 f121, [r21+256]; +ld.shared.f32 f122, [r21+384]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; 
+sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 16; +bfe.u32 r23, r5, 4, 1; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 256; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+128]; +ld.shared.f32 f170, [r28+256]; +ld.shared.f32 f171, [r28+384]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+128]; +ld.shared.f32 f174, [r28+256]; +ld.shared.f32 f175, [r28+384]; +add.f32 %0, f168, f170; +add.f32 %1, f172, f174; +add.f32 %2, f169, f171; +add.f32 %3, f173, f175; +sub.f32 %4, f168, f170; +sub.f32 %5, f172, f174; +sub.f32 %6, 
f169, f171; +sub.f32 %7, f173, f175; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<57, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1432>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1427, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1425, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1424, f1427, f1425; +sub.f32 f140, f1427, f1425; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1423, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1420, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1418, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1417, f1420, f1418; +sub.f32 f156, f1420, f1418; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1416, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1416, 0fBF3504F3; +mul.f32 f1415, f157, 0f3F3504F3; +sub.f32 f163, f1415, f162; +mul.f32 f164, f1416, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1414, f1424, f1417; +sub.f32 f173, f1424, f1417; +add.f32 f174, 
f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1413, f1423, f165; +sub.f32 f177, f1423, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1412, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1411, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1409, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1406, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1405, f1409, f1406; +sub.f32 f197, f1409, f1406; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1404, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1402, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1400, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1399, f1402, f1400; +sub.f32 f213, f1402, f1400; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1398, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1398, 0fBF3504F3; +mul.f32 f1397, f214, 0f3F3504F3; +sub.f32 f220, f1397, f219; +mul.f32 f221, f1398, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1396, f1405, f1399; +sub.f32 f230, f1405, f1399; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1395, f1404, f222; +sub.f32 f234, f1404, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1394, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1393, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1391, f231, 0f3F6C835E; +mul.f32 f1392, f1395, 0fBEC3EF15; +sub.f32 
f245, f1391, f1392; +mul.f32 f246, f1395, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1389, f235, 0f3F3504F3; +mul.f32 f1390, f1394, 0fBF3504F3; +sub.f32 f250, f1389, f1390; +mul.f32 f251, f1394, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1387, f239, 0f3EC3EF15; +mul.f32 f1388, f1393, 0fBF6C835E; +sub.f32 f255, f1387, f1388; +mul.f32 f256, f1393, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1385, f233, 0fBEC3EF15; +mul.f32 f1386, f234, 0fBF6C835E; +sub.f32 f260, f1385, f1386; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1383, f241, 0fBF6C835E; +mul.f32 f1384, f242, 0fBEC3EF15; +sub.f32 f269, f1383, f1384; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1382, f1414, f1396; +sub.f32 f275, f1414, f1396; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1381, f1413, f247; +sub.f32 f279, f1413, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1380, f1412, f252; +sub.f32 f283, f1412, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1379, f1411, f257; +sub.f32 f287, f1411, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1378, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1377, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1376, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1375, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1372, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1370, %85, %142; +sub.f32 f311, %85, 
%142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1369, f1372, f1370; +sub.f32 f315, f1372, f1370; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1368, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1366, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1363, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1362, f1366, f1363; +sub.f32 f331, f1366, f1363; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1361, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1359, f332, 0f3F3504F3; +mul.f32 f1360, f1361, 0fBF3504F3; +sub.f32 f338, f1359, f1360; +mul.f32 f339, f1361, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1358, f1369, f1362; +sub.f32 f348, f1369, f1362; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1357, f1368, f340; +sub.f32 f352, f1368, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1356, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1355, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1353, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1351, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1350, f1353, f1351; +sub.f32 f372, f1353, f1351; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1349, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1346, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; 
+add.f32 f1345, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1344, f1346, f1345; +sub.f32 f388, f1346, f1345; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1343, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1341, f389, 0f3F3504F3; +mul.f32 f1342, f1343, 0fBF3504F3; +sub.f32 f395, f1341, f1342; +mul.f32 f396, f1343, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1340, f1350, f1344; +sub.f32 f405, f1350, f1344; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1339, f1349, f397; +sub.f32 f409, f1349, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1338, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1337, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1339, 0fBEC3EF15; +mul.f32 f1336, f406, 0f3F6C835E; +sub.f32 f420, f1336, f419; +mul.f32 f421, f1339, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1338, 0fBF3504F3; +mul.f32 f1335, f410, 0f3F3504F3; +sub.f32 f425, f1335, f424; +mul.f32 f426, f1338, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1333, f414, 0f3EC3EF15; +mul.f32 f1334, f1337, 0fBF6C835E; +sub.f32 f430, f1333, f1334; +mul.f32 f431, f1337, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1331, f408, 0fBEC3EF15; +mul.f32 f1332, f409, 0fBF6C835E; +sub.f32 f435, f1331, f1332; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1330, f416, 0fBF6C835E; +sub.f32 f444, f1330, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 
f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1329, f1358, f1340; +sub.f32 f450, f1358, f1340; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1328, f1357, f422; +sub.f32 f454, f1357, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1327, f1356, f427; +sub.f32 f458, f1356, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1326, f1355, f432; +sub.f32 f462, f1355, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1325, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1324, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1323, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1322, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1328, 0fBE47C5C2; +mul.f32 f1321, f451, 0f3F7B14BE; +sub.f32 f481, f1321, f480; +mul.f32 f482, f1328, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1327, 0fBEC3EF15; +mul.f32 f1320, f455, 0f3F6C835E; +sub.f32 f486, f1320, f485; +mul.f32 f487, f1327, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1326, 0fBF0E39DA; +mul.f32 f1319, f459, 0f3F54DB31; +sub.f32 f491, f1319, f490; +mul.f32 f492, f1326, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1325, 0fBF3504F3; +mul.f32 f1318, f463, 0f3F3504F3; +sub.f32 f496, f1318, f495; +mul.f32 f497, f1325, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1316, f467, 0f3F0E39DA; +mul.f32 f1317, f1324, 0fBF54DB31; +sub.f32 f501, f1316, f1317; +mul.f32 f502, f1324, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1314, f471, 0f3EC3EF15; +mul.f32 f1315, f1323, 0fBF6C835E; +sub.f32 f506, f1314, f1315; +mul.f32 f507, f1323, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1312, f475, 0f3E47C5C2; +mul.f32 f1313, f1322, 0fBF7B14BE; +sub.f32 f511, f1312, 
f1313; +mul.f32 f512, f1322, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1310, f453, 0fBE47C5C2; +mul.f32 f1311, f454, 0fBF7B14BE; +sub.f32 f516, f1310, f1311; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1309, f457, 0fBEC3EF15; +sub.f32 f521, f1309, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1308, f461, 0fBF0E39DA; +sub.f32 f526, f1308, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1306, f469, 0fBF54DB31; +mul.f32 f1307, f470, 0fBF0E39DA; +sub.f32 f535, f1306, f1307; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1305, f473, 0fBF6C835E; +sub.f32 f540, f1305, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1304, f477, 0fBF7B14BE; +sub.f32 f545, f1304, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1303, f1381, f483; +sub.f32 f553, f1381, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1302, f1380, f488; +sub.f32 f557, f1380, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1301, f1379, f493; +sub.f32 f561, f1379, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1300, f1378, f498; +sub.f32 f565, f1378, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f1299, f1377, f503; +sub.f32 f569, f1377, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f1298, f1376, f508; +sub.f32 f573, f1376, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f1297, f1375, f513; +sub.f32 f577, f1375, 
f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f1296, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f1295, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f1294, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f1293, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f1292, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1291, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1290, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1289, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f1303; +mul.f32 f616, f610, f1303; +mul.f32 f618, f611, f611; +mul.f32 f1288, f610, f610; +sub.f32 f619, f1288, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f1302; +mul.f32 f624, f619, f1302; +mul.f32 f626, f611, f621; +mul.f32 f1287, f610, f619; +sub.f32 f627, f1287, f626; +mul.f32 f1286, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f1301; +mul.f32 f632, f627, f1301; +mul.f32 f1284, f610, f627; +mul.f32 f1285, f611, f629; +sub.f32 f635, f1284, f1285; +mul.f32 f1283, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f1300; +mul.f32 f640, f635, f1300; +mul.f32 f642, f611, f637; +mul.f32 f1282, f610, f635; +sub.f32 f643, f1282, f642; +mul.f32 f1281, f635, f562; +mul.f32 f644, f610, f637; 
+fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f1299; +mul.f32 f648, f643, f1299; +mul.f32 f1279, f610, f643; +mul.f32 f1280, f611, f645; +sub.f32 f651, f1279, f1280; +mul.f32 f1278, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f1298; +mul.f32 f656, f651, f1298; +mul.f32 f658, f611, f653; +mul.f32 f1277, f610, f651; +sub.f32 f659, f1277, f658; +mul.f32 f1276, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f1297; +mul.f32 f664, f659, f1297; +mul.f32 f666, f611, f661; +mul.f32 f1275, f610, f659; +sub.f32 f667, f1275, f666; +mul.f32 f1274, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f1296; +mul.f32 f672, f667, f1296; +mul.f32 f1272, f610, f667; +mul.f32 f1273, f611, f669; +sub.f32 f675, f1272, f1273; +mul.f32 f1271, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f1295; +mul.f32 f680, f675, f1295; +mul.f32 f682, f611, f677; +mul.f32 f1270, f610, f675; +sub.f32 f683, f1270, f682; +mul.f32 f1269, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f1294; +mul.f32 f688, f683, f1294; +mul.f32 f690, f611, f685; +mul.f32 f1268, f610, f683; +sub.f32 f691, f1268, f690; +mul.f32 f1267, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f1293; +mul.f32 f696, f691, f1293; +mul.f32 f1265, f610, f691; +mul.f32 f1266, f611, f693; +sub.f32 f699, f1265, f1266; +mul.f32 f1264, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f1292; +mul.f32 f704, f699, f1292; +mul.f32 f706, f611, f701; +mul.f32 f1263, f610, f699; +sub.f32 f707, f1263, f706; +mul.f32 f1262, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f1291; +mul.f32 f712, f707, f1291; +mul.f32 f1260, f610, f707; +mul.f32 f1261, f611, f709; 
+sub.f32 f715, f1260, f1261; +mul.f32 f1259, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f1290; +mul.f32 f720, f715, f1290; +mul.f32 f722, f611, f717; +mul.f32 f1258, f610, f715; +sub.f32 f723, f1258, f722; +mul.f32 f1257, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f1289; +mul.f32 f728, f723, f1289; +mul.f32 f730, f611, f725; +mul.f32 f1256, f610, f723; +sub.f32 f731, f1256, f730; +mul.f32 f1255, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1254, f1382, f1329; +mul.f32 f735, f733, f1254; +mul.f32 f736, f731, f1254; +mul.f32 f1252, f610, f731; +mul.f32 f1253, f611, f733; +sub.f32 f739, f1252, f1253; +sub.f32 f1251, f272, f447; +mul.f32 f1250, f731, f1251; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1249, f610, f739; +sub.f32 f747, f1249, f746; +mul.f32 f1248, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1247, f610, f747; +sub.f32 f755, f1247, f754; +mul.f32 f1246, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f1244, f610, f755; +mul.f32 f1245, f611, f757; +sub.f32 f763, f1244, f1245; +mul.f32 f1243, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1242, f610, f763; +sub.f32 f771, f1242, f770; +mul.f32 f1241, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f1239, f610, f771; +mul.f32 f1240, f611, f773; +sub.f32 f779, f1239, f1240; +mul.f32 f1238, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, 
f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1237, f610, f779; +sub.f32 f787, f1237, f786; +mul.f32 f1236, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1235, f610, f787; +sub.f32 f795, f1235, f794; +mul.f32 f1234, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f1232, f610, f795; +mul.f32 f1233, f611, f797; +sub.f32 f803, f1232, f1233; +mul.f32 f1231, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1230, f610, f803; +sub.f32 f811, f1230, f810; +mul.f32 f1229, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1228, f610, f811; +sub.f32 f819, f1228, f818; +mul.f32 f1227, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f1225, f610, f819; +mul.f32 f1226, f611, f821; +sub.f32 f827, f1225, f1226; +mul.f32 f1224, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1223, f610, f827; +sub.f32 f835, f1223, f834; +mul.f32 f1222, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f1220, f610, f835; +mul.f32 f1221, f611, f837; +sub.f32 f843, f1220, f1221; +mul.f32 f1219, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1218, f610, f843; +sub.f32 f851, f1218, f850; +mul.f32 
f1217, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f1216, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r17, %tid.x; +shl.b32 r16, r17, 8; +barrier.sync 0; +and.b32 r11, r16, 768; +add.s32 r12, r9, r11; +sub.f32 f1431, f1382, f1329; +mul.f32 f1430, f733, f1431; +add.f32 f857, f1382, f1329; +sub.f32 f1429, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 3; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 3; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f1217, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f1286, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f1283, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f1281, f639; +sub.f32 f867, f1278, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f1276, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f1274, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f1271, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f1269, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f1267, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f1264, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f1262, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f1259, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f1257, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f1255, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f1429, f736; +sub.f32 f890, f1250, f1430; +fma.rn.f32 
f891, f741, f552, f744; +sub.f32 f892, f1248, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f1246, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f1243, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f1241, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f1238, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f1236, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f1234, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f1231, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f1229, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f1227, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f1224, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f1222, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f1219, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f1216, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +mad.lo.s32 r13, r20, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+32]; +ld.shared.v2.f32 {f929, f930}, [r13+64]; +ld.shared.v2.f32 {f933, f934}, [r13+96]; +ld.shared.v2.f32 {f937, f938}, [r13+128]; +ld.shared.v2.f32 {f941, f942}, [r13+160]; +ld.shared.v2.f32 {f945, f946}, [r13+192]; +ld.shared.v2.f32 {f949, f950}, [r13+224]; +ld.shared.v2.f32 {f953, f954}, [r13+256]; +ld.shared.v2.f32 {f957, f958}, [r13+288]; +ld.shared.v2.f32 {f961, f962}, [r13+320]; +ld.shared.v2.f32 {f965, f966}, [r13+352]; +ld.shared.v2.f32 {f969, f970}, [r13+384]; 
+ld.shared.v2.f32 {f973, f974}, [r13+416]; +ld.shared.v2.f32 {f977, f978}, [r13+448]; +ld.shared.v2.f32 {f981, f982}, [r13+480]; +ld.shared.v2.f32 {f985, f986}, [r13+512]; +ld.shared.v2.f32 {f989, f990}, [r13+544]; +ld.shared.v2.f32 {f993, f994}, [r13+576]; +ld.shared.v2.f32 {f997, f998}, [r13+608]; +ld.shared.v2.f32 {f1001, f1002}, [r13+640]; +ld.shared.v2.f32 {f1005, f1006}, [r13+672]; +ld.shared.v2.f32 {f1009, f1010}, [r13+704]; +ld.shared.v2.f32 {f1013, f1014}, [r13+736]; +ld.shared.v2.f32 {f1017, f1018}, [r13+768]; +ld.shared.v2.f32 {f1021, f1022}, [r13+800]; +ld.shared.v2.f32 {f1025, f1026}, [r13+832]; +ld.shared.v2.f32 {f1029, f1030}, [r13+864]; +ld.shared.v2.f32 {f1033, f1034}, [r13+896]; +ld.shared.v2.f32 {f1037, f1038}, [r13+928]; +ld.shared.v2.f32 {f1041, f1042}, [r13+960]; +ld.shared.v2.f32 {f1045, f1046}, [r13+992]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1215, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1214, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f925, f989; +sub.f32 f1059, f925, f989; +add.f32 f1213, f926, f990; +sub.f32 f1060, f926, f990; +add.f32 f1061, f957, f1021; +sub.f32 f1063, f957, f1021; +add.f32 f1212, f958, f1022; +sub.f32 f1064, f958, f1022; +add.f32 f1065, f929, f993; +sub.f32 f1067, f929, f993; +add.f32 f1211, f930, f994; +sub.f32 f1068, f930, f994; +add.f32 f1069, f961, f1025; +sub.f32 f1071, f961, f1025; +add.f32 f1210, f962, f1026; +sub.f32 f1072, f962, f1026; +add.f32 f1073, f933, f997; +sub.f32 f1075, f933, f997; +add.f32 f1209, f934, f998; +sub.f32 f1076, f934, f998; +add.f32 f1077, f965, f1029; +sub.f32 f1079, f965, f1029; +add.f32 f1208, f966, f1030; +sub.f32 f1080, f966, f1030; +add.f32 f1081, f937, f1001; +sub.f32 f1083, f937, f1001; +add.f32 f1207, f938, f1002; +sub.f32 f1084, f938, f1002; +add.f32 f1085, f969, f1033; +sub.f32 f1087, f969, f1033; +add.f32 f1206, f970, f1034; +sub.f32 f1088, f970, f1034; +add.f32 
f1089, f941, f1005; +sub.f32 f1091, f941, f1005; +add.f32 f1205, f942, f1006; +sub.f32 f1092, f942, f1006; +add.f32 f1093, f973, f1037; +sub.f32 f1095, f973, f1037; +add.f32 f1204, f974, f1038; +sub.f32 f1096, f974, f1038; +add.f32 f1097, f945, f1009; +sub.f32 f1099, f945, f1009; +add.f32 f1203, f946, f1010; +sub.f32 f1100, f946, f1010; +add.f32 f1101, f977, f1041; +sub.f32 f1103, f977, f1041; +add.f32 f1202, f978, f1042; +sub.f32 f1104, f978, f1042; +add.f32 f1105, f949, f1013; +sub.f32 f1107, f949, f1013; +add.f32 f1201, f950, f1014; +sub.f32 f1108, f950, f1014; +add.f32 f1109, f981, f1045; +sub.f32 f1111, f981, f1045; +add.f32 f1200, f982, f1046; +sub.f32 f1112, f982, f1046; +add.f32 %1, f1215, f1214; +add.f32 %0, f1049, f1053; +add.f32 %2, f1057, f1061; +add.f32 %3, f1213, f1212; +add.f32 %4, f1065, f1069; +add.f32 %5, f1211, f1210; +add.f32 %6, f1073, f1077; +add.f32 %7, f1209, f1208; +add.f32 %8, f1081, f1085; +add.f32 %9, f1207, f1206; +add.f32 %11, f1205, f1204; +add.f32 %10, f1089, f1093; +add.f32 %13, f1203, f1202; +add.f32 %12, f1097, f1101; +add.f32 %15, f1201, f1200; +add.f32 %14, f1105, f1109; +add.f32 %16, f1051, f1056; +sub.f32 %17, f1052, f1055; +add.f32 %18, f1059, f1064; +sub.f32 %19, f1060, f1063; +add.f32 %20, f1067, f1072; +sub.f32 %21, f1068, f1071; +sub.f32 %23, f1076, f1079; +add.f32 %22, f1075, f1080; +sub.f32 %25, f1084, f1087; +add.f32 %24, f1083, f1088; +sub.f32 %27, f1092, f1095; +add.f32 %26, f1091, f1096; +add.f32 %28, f1099, f1104; +sub.f32 %29, f1100, f1103; +add.f32 %30, f1107, f1112; +sub.f32 %31, f1108, f1111; +sub.f32 %33, f1215, f1214; +sub.f32 %32, f1049, f1053; +sub.f32 %35, f1213, f1212; +sub.f32 %34, f1057, f1061; +sub.f32 %37, f1211, f1210; +sub.f32 %36, f1065, f1069; +sub.f32 %39, f1209, f1208; +sub.f32 %38, f1073, f1077; +sub.f32 %41, f1207, f1206; +sub.f32 %40, f1081, f1085; +sub.f32 %43, f1205, f1204; +sub.f32 %42, f1089, f1093; +sub.f32 %45, f1203, f1202; +sub.f32 %44, f1097, f1101; +sub.f32 %47, f1201, f1200; 
+sub.f32 %46, f1105, f1109; +add.f32 %49, f1052, f1055; +sub.f32 %48, f1051, f1056; +add.f32 %51, f1060, f1063; +sub.f32 %50, f1059, f1064; +add.f32 %53, f1068, f1071; +sub.f32 %52, f1067, f1072; +add.f32 %55, f1076, f1079; +sub.f32 %54, f1075, f1080; +add.f32 %57, f1084, f1087; +sub.f32 %56, f1083, f1088; +add.f32 %59, f1092, f1095; +sub.f32 %58, f1091, f1096; +add.f32 %61, f1100, f1103; +sub.f32 %60, f1099, f1104; +add.f32 %63, f1108, f1111; +sub.f32 %62, f1107, f1112; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), 
"f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<58, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1381>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1379, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1377, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1376, f1379, f1377; +sub.f32 f140, f1379, f1377; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1375, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1372, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1370, %91, %134; +sub.f32 f152, 
%91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1369, f1372, f1370; +sub.f32 f156, f1372, f1370; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1368, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1368, 0fBF3504F3; +mul.f32 f1367, f157, 0f3F3504F3; +sub.f32 f163, f1367, f162; +mul.f32 f164, f1368, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1366, f1376, f1369; +sub.f32 f173, f1376, f1369; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1365, f1375, f165; +sub.f32 f177, f1375, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1364, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1363, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1361, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1358, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1357, f1361, f1358; +sub.f32 f197, f1361, f1358; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1356, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1354, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1352, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1351, f1354, f1352; +sub.f32 f213, f1354, f1352; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1350, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1350, 0fBF3504F3; +mul.f32 f1349, f214, 0f3F3504F3; +sub.f32 f220, f1349, f219; +mul.f32 f221, f1350, 0f3F3504F3; +fma.rn.f32 f222, f214, 
0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1348, f1357, f1351; +sub.f32 f230, f1357, f1351; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1347, f1356, f222; +sub.f32 f234, f1356, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1346, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1345, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1343, f231, 0f3F6C835E; +mul.f32 f1344, f1347, 0fBEC3EF15; +sub.f32 f245, f1343, f1344; +mul.f32 f246, f1347, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1341, f235, 0f3F3504F3; +mul.f32 f1342, f1346, 0fBF3504F3; +sub.f32 f250, f1341, f1342; +mul.f32 f251, f1346, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1339, f239, 0f3EC3EF15; +mul.f32 f1340, f1345, 0fBF6C835E; +sub.f32 f255, f1339, f1340; +mul.f32 f256, f1345, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1337, f233, 0fBEC3EF15; +mul.f32 f1338, f234, 0fBF6C835E; +sub.f32 f260, f1337, f1338; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1335, f241, 0fBF6C835E; +mul.f32 f1336, f242, 0fBEC3EF15; +sub.f32 f269, f1335, f1336; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1334, f1366, f1348; +sub.f32 f275, f1366, f1348; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1333, f1365, f247; +sub.f32 f279, f1365, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1332, f1364, f252; +sub.f32 f283, f1364, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1331, f1363, f257; +sub.f32 f287, 
f1363, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1330, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1329, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1328, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1327, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1324, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1322, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1321, f1324, f1322; +sub.f32 f315, f1324, f1322; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1320, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1318, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1315, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1314, f1318, f1315; +sub.f32 f331, f1318, f1315; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1313, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1311, f332, 0f3F3504F3; +mul.f32 f1312, f1313, 0fBF3504F3; +sub.f32 f338, f1311, f1312; +mul.f32 f339, f1313, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1310, f1321, f1314; +sub.f32 f348, f1321, f1314; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1309, f1320, f340; +sub.f32 f352, f1320, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1308, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; 
+add.f32 f1307, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1305, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1303, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1302, f1305, f1303; +sub.f32 f372, f1305, f1303; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1301, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1298, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1297, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1296, f1298, f1297; +sub.f32 f388, f1298, f1297; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1295, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1293, f389, 0f3F3504F3; +mul.f32 f1294, f1295, 0fBF3504F3; +sub.f32 f395, f1293, f1294; +mul.f32 f396, f1295, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1292, f1302, f1296; +sub.f32 f405, f1302, f1296; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1291, f1301, f397; +sub.f32 f409, f1301, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1290, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1289, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1291, 0fBEC3EF15; +mul.f32 f1288, f406, 0f3F6C835E; +sub.f32 f420, f1288, f419; +mul.f32 f421, f1291, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1290, 0fBF3504F3; +mul.f32 f1287, f410, 0f3F3504F3; +sub.f32 f425, f1287, f424; +mul.f32 f426, f1290, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, 
f426; +mul.f32 f1285, f414, 0f3EC3EF15; +mul.f32 f1286, f1289, 0fBF6C835E; +sub.f32 f430, f1285, f1286; +mul.f32 f431, f1289, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1283, f408, 0fBEC3EF15; +mul.f32 f1284, f409, 0fBF6C835E; +sub.f32 f435, f1283, f1284; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1282, f416, 0fBF6C835E; +sub.f32 f444, f1282, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1281, f1310, f1292; +sub.f32 f450, f1310, f1292; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1280, f1309, f422; +sub.f32 f454, f1309, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1279, f1308, f427; +sub.f32 f458, f1308, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1278, f1307, f432; +sub.f32 f462, f1307, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1277, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1276, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1275, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1274, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1280, 0fBE47C5C2; +mul.f32 f1273, f451, 0f3F7B14BE; +sub.f32 f481, f1273, f480; +mul.f32 f482, f1280, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1279, 0fBEC3EF15; +mul.f32 f1272, f455, 0f3F6C835E; +sub.f32 f486, f1272, f485; +mul.f32 f487, f1279, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1278, 0fBF0E39DA; +mul.f32 f1271, f459, 0f3F54DB31; +sub.f32 f491, f1271, f490; +mul.f32 f492, f1278, 0f3F54DB31; 
+fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1277, 0fBF3504F3; +mul.f32 f1270, f463, 0f3F3504F3; +sub.f32 f496, f1270, f495; +mul.f32 f497, f1277, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1268, f467, 0f3F0E39DA; +mul.f32 f1269, f1276, 0fBF54DB31; +sub.f32 f501, f1268, f1269; +mul.f32 f502, f1276, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1266, f471, 0f3EC3EF15; +mul.f32 f1267, f1275, 0fBF6C835E; +sub.f32 f506, f1266, f1267; +mul.f32 f507, f1275, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1264, f475, 0f3E47C5C2; +mul.f32 f1265, f1274, 0fBF7B14BE; +sub.f32 f511, f1264, f1265; +mul.f32 f512, f1274, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1262, f453, 0fBE47C5C2; +mul.f32 f1263, f454, 0fBF7B14BE; +sub.f32 f516, f1262, f1263; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1261, f457, 0fBEC3EF15; +sub.f32 f521, f1261, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1260, f461, 0fBF0E39DA; +sub.f32 f526, f1260, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1258, f469, 0fBF54DB31; +mul.f32 f1259, f470, 0fBF0E39DA; +sub.f32 f535, f1258, f1259; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1257, f473, 0fBF6C835E; +sub.f32 f540, f1257, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1256, f477, 0fBF7B14BE; +sub.f32 f545, f1256, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1255, f1334, f1281; +sub.f32 f551, f1334, f1281; 
+add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1254, f1333, f483; +sub.f32 f555, f1333, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1253, f1332, f488; +sub.f32 f559, f1332, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1252, f1331, f493; +sub.f32 f563, f1331, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1251, f1330, f498; +sub.f32 f567, f1330, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f1250, f1329, f503; +sub.f32 f571, f1329, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f1249, f1328, f508; +sub.f32 f575, f1328, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f1248, f1327, f513; +sub.f32 f579, f1327, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f1247, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f1246, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f1245, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f1244, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f1243, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1242, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1241, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1240, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f1254; +mul.f32 f1239, f612, f552; +sub.f32 f618, f1239, f617; +mul.f32 f619, f612, f1254; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f1237, f612, f612; 
+mul.f32 f1238, f613, f613; +sub.f32 f623, f1237, f1238; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f1235, f623, f556; +mul.f32 f1236, f625, f1253; +sub.f32 f628, f1235, f1236; +mul.f32 f629, f623, f1253; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f1233, f612, f623; +mul.f32 f1234, f613, f625; +sub.f32 f633, f1233, f1234; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f1231, f633, f560; +mul.f32 f1232, f635, f1252; +sub.f32 f638, f1231, f1232; +mul.f32 f639, f633, f1252; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f1230, f612, f633; +sub.f32 f643, f1230, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f1251; +mul.f32 f1229, f643, f564; +sub.f32 f648, f1229, f647; +mul.f32 f649, f643, f1251; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f1228, f612, f643; +sub.f32 f653, f1228, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f1250; +mul.f32 f1227, f653, f568; +sub.f32 f658, f1227, f657; +mul.f32 f659, f653, f1250; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f1226, f612, f653; +sub.f32 f663, f1226, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f1224, f663, f572; +mul.f32 f1225, f665, f1249; +sub.f32 f668, f1224, f1225; +mul.f32 f669, f663, f1249; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f1222, f612, f663; +mul.f32 f1223, f613, f665; +sub.f32 f673, f1222, f1223; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f1220, f673, f576; +mul.f32 f1221, f675, f1248; +sub.f32 f678, f1220, f1221; +mul.f32 f679, f673, f1248; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f1218, f612, f673; +mul.f32 f1219, f613, f675; +sub.f32 f683, f1218, f1219; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f1247; +mul.f32 f1217, f683, f580; +sub.f32 f688, f1217, f687; +mul.f32 f689, 
f683, f1247; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f1216, f612, f683; +sub.f32 f693, f1216, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f1246; +mul.f32 f1215, f693, f584; +sub.f32 f698, f1215, f697; +mul.f32 f699, f693, f1246; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f1214, f612, f693; +sub.f32 f703, f1214, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f1245; +mul.f32 f1213, f703, f588; +sub.f32 f708, f1213, f707; +mul.f32 f709, f703, f1245; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f1211, f612, f703; +mul.f32 f1212, f613, f705; +sub.f32 f713, f1211, f1212; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f1209, f713, f592; +mul.f32 f1210, f715, f1244; +sub.f32 f718, f1209, f1210; +mul.f32 f719, f713, f1244; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f1207, f612, f713; +mul.f32 f1208, f613, f715; +sub.f32 f723, f1207, f1208; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f1205, f723, f596; +mul.f32 f1206, f725, f1243; +sub.f32 f728, f1205, f1206; +mul.f32 f729, f723, f1243; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f1204, f612, f723; +sub.f32 f733, f1204, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f1242; +mul.f32 f1203, f733, f600; +sub.f32 f738, f1203, f737; +mul.f32 f739, f733, f1242; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f1202, f612, f733; +sub.f32 f743, f1202, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f1241; +mul.f32 f1201, f743, f604; +sub.f32 f748, f1201, f747; +mul.f32 f749, f743, f1241; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f1200, f612, f743; +sub.f32 f753, f1200, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f1240; 
+mul.f32 f1199, f753, f608; +sub.f32 f758, f1199, f757; +mul.f32 f759, f753, f1240; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f1197, f612, f753; +mul.f32 f1198, f613, f755; +sub.f32 f763, f1197, f1198; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f1195, f763, f550; +mul.f32 f1196, f765, f551; +sub.f32 f768, f1195, f1196; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f1193, f612, f763; +mul.f32 f1194, f613, f765; +sub.f32 f773, f1193, f1194; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f1192, f773, f554; +sub.f32 f778, f1192, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f1191, f612, f773; +sub.f32 f783, f1191, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f1190, f783, f558; +sub.f32 f788, f1190, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f1189, f612, f783; +sub.f32 f793, f1189, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f1188, f793, f562; +sub.f32 f798, f1188, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f1187, f612, f793; +sub.f32 f803, f1187, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f1185, f803, f566; +mul.f32 f1186, f805, f567; +sub.f32 f808, f1185, f1186; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f1183, f612, f803; +mul.f32 f1184, f613, f805; +sub.f32 f813, f1183, f1184; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f1181, f813, f570; +mul.f32 f1182, f815, f571; +sub.f32 f818, f1181, f1182; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f1179, f612, f813; +mul.f32 f1180, f613, f815; +sub.f32 f823, f1179, f1180; +mul.f32 f824, f612, f815; 
+fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f1178, f823, f574; +sub.f32 f828, f1178, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f1177, f612, f823; +sub.f32 f833, f1177, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f1176, f833, f578; +sub.f32 f838, f1176, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f1175, f612, f833; +sub.f32 f843, f1175, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f1174, f843, f582; +sub.f32 f848, f1174, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f1172, f612, f843; +mul.f32 f1173, f613, f845; +sub.f32 f853, f1172, f1173; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f1170, f853, f586; +mul.f32 f1171, f855, f587; +sub.f32 f858, f1170, f1171; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f1168, f612, f853; +mul.f32 f1169, f613, f855; +sub.f32 f863, f1168, f1169; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f1166, f863, f590; +mul.f32 f1167, f865, f591; +sub.f32 f868, f1166, f1167; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f1165, f612, f863; +sub.f32 f873, f1165, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f1164, f873, f594; +sub.f32 f878, f1164, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f1163, f612, f873; +sub.f32 f883, f1163, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f1162, f883, f598; +sub.f32 f888, f1162, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f1161, f612, f883; 
+sub.f32 f893, f1161, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f1159, f893, f602; +mul.f32 f1160, f895, f603; +sub.f32 f898, f1159, f1160; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f1157, f612, f893; +mul.f32 f1158, f613, f895; +sub.f32 f903, f1157, f1158; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f1155, f903, f606; +mul.f32 f1156, f905, f607; +sub.f32 f908, f1155, f1156; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f1153, f612, f903; +mul.f32 f1154, f613, f905; +sub.f32 f913, f1153, f1154; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f1152, f913, f610; +sub.f32 f918, f1152, f917; +mov.u32 r17, %tid.x; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +and.b32 r14, r17, 3; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 384; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+16]; +ld.shared.f32 f923, [r13+32]; +ld.shared.f32 f924, [r13+48]; +ld.shared.f32 f925, [r13+64]; +ld.shared.f32 f926, [r13+80]; +ld.shared.f32 f927, [r13+96]; +ld.shared.f32 f928, [r13+112]; +ld.shared.f32 f929, [r13+128]; +ld.shared.f32 f930, [r13+144]; +ld.shared.f32 f931, [r13+160]; +ld.shared.f32 f932, [r13+176]; +ld.shared.f32 f933, [r13+192]; +ld.shared.f32 f934, [r13+208]; +ld.shared.f32 f935, [r13+224]; +ld.shared.f32 f936, [r13+240]; +ld.shared.f32 
f937, [r13+256]; +ld.shared.f32 f938, [r13+272]; +ld.shared.f32 f939, [r13+288]; +ld.shared.f32 f940, [r13+304]; +ld.shared.f32 f941, [r13+320]; +ld.shared.f32 f942, [r13+336]; +ld.shared.f32 f943, [r13+352]; +ld.shared.f32 f944, [r13+368]; +ld.shared.f32 f945, [r13+384]; +ld.shared.f32 f946, [r13+400]; +ld.shared.f32 f947, [r13+416]; +ld.shared.f32 f948, [r13+432]; +ld.shared.f32 f949, [r13+448]; +ld.shared.f32 f950, [r13+464]; +ld.shared.f32 f951, [r13+480]; +ld.shared.f32 f952, [r13+496]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1255, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+16]; +ld.shared.f32 f955, [r13+32]; +ld.shared.f32 f956, [r13+48]; +ld.shared.f32 f957, [r13+64]; +ld.shared.f32 f958, [r13+80]; +ld.shared.f32 f959, [r13+96]; +ld.shared.f32 f960, [r13+112]; +ld.shared.f32 f961, [r13+128]; +ld.shared.f32 f962, [r13+144]; +ld.shared.f32 f963, [r13+160]; +ld.shared.f32 f964, [r13+176]; +ld.shared.f32 f965, [r13+192]; +ld.shared.f32 f966, [r13+208]; +ld.shared.f32 f967, [r13+224]; +ld.shared.f32 f968, [r13+240]; +ld.shared.f32 f969, [r13+256]; +ld.shared.f32 f970, [r13+272]; +ld.shared.f32 f971, [r13+288]; +ld.shared.f32 f972, [r13+304]; +ld.shared.f32 f973, [r13+320]; +ld.shared.f32 f974, [r13+336]; +ld.shared.f32 f975, [r13+352]; +ld.shared.f32 f976, [r13+368]; +ld.shared.f32 f977, [r13+384]; +ld.shared.f32 f978, [r13+400]; +ld.shared.f32 f979, [r13+416]; +ld.shared.f32 f980, [r13+432]; +ld.shared.f32 f981, [r13+448]; +ld.shared.f32 f982, [r13+464]; +ld.shared.f32 f983, [r13+480]; +ld.shared.f32 f984, [r13+496]; +add.f32 f985, f921, 
f937; +sub.f32 f987, f921, f937; +add.f32 f1151, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1150, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f922, f938; +sub.f32 f995, f922, f938; +add.f32 f1149, f954, f970; +sub.f32 f996, f954, f970; +add.f32 f997, f930, f946; +sub.f32 f999, f930, f946; +add.f32 f1148, f962, f978; +sub.f32 f1000, f962, f978; +add.f32 f1001, f923, f939; +sub.f32 f1003, f923, f939; +add.f32 f1147, f955, f971; +sub.f32 f1004, f955, f971; +add.f32 f1005, f931, f947; +sub.f32 f1007, f931, f947; +add.f32 f1146, f963, f979; +sub.f32 f1008, f963, f979; +add.f32 f1009, f924, f940; +sub.f32 f1011, f924, f940; +add.f32 f1145, f956, f972; +sub.f32 f1012, f956, f972; +add.f32 f1013, f932, f948; +sub.f32 f1015, f932, f948; +add.f32 f1144, f964, f980; +sub.f32 f1016, f964, f980; +add.f32 f1017, f925, f941; +sub.f32 f1019, f925, f941; +add.f32 f1143, f957, f973; +sub.f32 f1020, f957, f973; +add.f32 f1021, f933, f949; +sub.f32 f1023, f933, f949; +add.f32 f1142, f965, f981; +sub.f32 f1024, f965, f981; +add.f32 f1025, f926, f942; +sub.f32 f1027, f926, f942; +add.f32 f1141, f958, f974; +sub.f32 f1028, f958, f974; +add.f32 f1029, f934, f950; +sub.f32 f1031, f934, f950; +add.f32 f1140, f966, f982; +sub.f32 f1032, f966, f982; +add.f32 f1033, f927, f943; +sub.f32 f1035, f927, f943; +add.f32 f1139, f959, f975; +sub.f32 f1036, f959, f975; +add.f32 f1037, f935, f951; +sub.f32 f1039, f935, f951; +add.f32 f1138, f967, f983; +sub.f32 f1040, f967, f983; +add.f32 f1041, f928, f944; +sub.f32 f1043, f928, f944; +add.f32 f1137, f960, f976; +sub.f32 f1044, f960, f976; +add.f32 f1045, f936, f952; +sub.f32 f1047, f936, f952; +add.f32 f1136, f968, f984; +sub.f32 f1048, f968, f984; +add.f32 %0, f985, f989; +add.f32 %1, f1151, f1150; +add.f32 %3, f1149, f1148; +add.f32 %2, f993, f997; +add.f32 %5, f1147, f1146; +add.f32 %4, f1001, f1005; +add.f32 %7, f1145, f1144; +add.f32 %6, f1009, f1013; +add.f32 %9, f1143, f1142; 
+add.f32 %8, f1017, f1021; +add.f32 %10, f1025, f1029; +add.f32 %11, f1141, f1140; +add.f32 %12, f1033, f1037; +add.f32 %13, f1139, f1138; +add.f32 %14, f1041, f1045; +add.f32 %15, f1137, f1136; +add.f32 %16, f987, f992; +sub.f32 %17, f988, f991; +add.f32 %18, f995, f1000; +sub.f32 %19, f996, f999; +add.f32 %20, f1003, f1008; +sub.f32 %21, f1004, f1007; +sub.f32 %23, f1012, f1015; +add.f32 %22, f1011, f1016; +sub.f32 %25, f1020, f1023; +add.f32 %24, f1019, f1024; +sub.f32 %27, f1028, f1031; +add.f32 %26, f1027, f1032; +add.f32 %28, f1035, f1040; +sub.f32 %29, f1036, f1039; +add.f32 %30, f1043, f1048; +sub.f32 %31, f1044, f1047; +sub.f32 %32, f985, f989; +sub.f32 %33, f1151, f1150; +sub.f32 %34, f993, f997; +sub.f32 %35, f1149, f1148; +sub.f32 %36, f1001, f1005; +sub.f32 %37, f1147, f1146; +sub.f32 %38, f1009, f1013; +sub.f32 %39, f1145, f1144; +sub.f32 %40, f1017, f1021; +sub.f32 %41, f1143, f1142; +sub.f32 %42, f1025, f1029; +sub.f32 %43, f1141, f1140; +sub.f32 %44, f1033, f1037; +sub.f32 %45, f1139, f1138; +sub.f32 %46, f1041, f1045; +sub.f32 %47, f1137, f1136; +add.f32 %49, f988, f991; +sub.f32 %48, f987, f992; +add.f32 %51, f996, f999; +sub.f32 %50, f995, f1000; +add.f32 %53, f1004, f1007; +sub.f32 %52, f1003, f1008; +add.f32 %55, f1012, f1015; +sub.f32 %54, f1011, f1016; +add.f32 %57, f1020, f1023; +sub.f32 %56, f1019, f1024; +add.f32 %59, f1028, f1031; +sub.f32 %58, f1027, f1032; +add.f32 %61, f1036, f1039; +sub.f32 %60, f1035, f1040; +add.f32 %63, f1044, f1047; +sub.f32 %62, f1043, f1048; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), 
"=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), 
"f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +/* cufftdx_private_function<59, float, 1>: inline-PTX routine that reads the two complex values rmem[0] and rmem[1] (asm inputs %11-%14) and overwrites them with its results (asm outputs %0-%3). It runs six rounds that each (1) form the sum and the difference of the current pair, (2) complex-multiply the difference by a factor fetched with ld.global from lut_sp_2_128, lut_sp_2_64, lut_sp_2_32, lut_sp_2_16, lut_sp_2_8 and lut_sp_2_4 in turn (asm inputs %5-%10), (3) store both values to shared memory at an address derived from smem (%4), %tid.y and %tid.x, with each round's pair of stores bracketed by barrier.sync 0 (12 barriers in total), and (4) reload a new pair located 512 bytes apart. The last step is a plain sum and difference with no multiply. NOTE(review): this looks like a radix-2 decomposition of a 128-point transform (six table-driven rounds plus a final one) - confirm against the generator. NOTE(review): because of the barrier.sync 0 instructions this presumably has to be called by every thread that participates in barrier 0 - confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<59, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<139>; +.reg .b32 r<49>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %11, %13; +sub.f32 f10, %12, %14; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 1008; +add.s32 r11, r8, r10; +add.f32 f18, %12, %14; +add.f32 f19, %11, %13; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 504; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+512]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 992; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 496; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+512]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11];
+mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 960; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 480; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+512]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 56; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 896; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 448; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+512]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 2; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 768; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 384; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+512]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 8;
+mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f116, f114; +mul.f32 f121, f117, f115; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 512; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f117, f114, f122; +sub.f32 f126, f120, f121; +st.shared.v2.f32 [r46+256], {f126, f125}; +barrier.sync 0; +and.b32 r47, r9, 256; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+512]; +add.f32 %1, f128, f132; +add.f32 %0, f127, f131; +sub.f32 %3, f128, f132; +sub.f32 %2, f127, f131; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<60, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<115>; +.reg .b32 r<49>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %11, %13; +add.f32 f10, %12, %14; +sub.f32 f11, %11, %13; +sub.f32 f12, %12, %14; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 504; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 252; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+256]; +barrier.sync 0; +st.shared.v2.f32 [r10],
{f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+256]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 496; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 248; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+256]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+256]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 480; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 240; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+256]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+256]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 56; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; 
+fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 448; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 224; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+256]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+256]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 2; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 384; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 192; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+256]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+256]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f98, f96; +mul.f32 f103, f99, f97; +sub.f32 f104, f102, f103; +mul.f32 f105, f98, f97; +fma.rn.f32 f106, f99, f96, f105; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 256; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f104; +barrier.sync 0; +and.b32 r47, r11, 128; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+256]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 
[r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+256]; +add.f32 %0, f107, f108; +add.f32 %1, f109, f110; +sub.f32 %2, f107, f108; +sub.f32 %3, f109, f110; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..e297520861b31 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp32_inv.hpp.inc @@ -0,0 +1,4587 @@ +#ifndef CUFFTDX_FFT_128_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_128_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<253, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<535>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, 
f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, 
f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; 
+add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; 
+sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 
f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 448; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+32]; +ld.shared.f32 f391, [r13+64]; +ld.shared.f32 f392, [r13+96]; +ld.shared.f32 f393, [r13+128]; +ld.shared.f32 f394, [r13+160]; +ld.shared.f32 f395, [r13+192]; +ld.shared.f32 f396, [r13+224]; +ld.shared.f32 f397, [r13+256]; +ld.shared.f32 f398, [r13+288]; +ld.shared.f32 f399, [r13+320]; +ld.shared.f32 f400, [r13+352]; +ld.shared.f32 f401, [r13+384]; +ld.shared.f32 f402, [r13+416]; +ld.shared.f32 f403, [r13+448]; +ld.shared.f32 f404, [r13+480]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+32]; +ld.shared.f32 f407, [r13+64]; +ld.shared.f32 f408, [r13+96]; +ld.shared.f32 f409, [r13+128]; +ld.shared.f32 f410, [r13+160]; +ld.shared.f32 f411, [r13+192]; +ld.shared.f32 f412, [r13+224]; +ld.shared.f32 f413, [r13+256]; +ld.shared.f32 f414, [r13+288]; 
+ld.shared.f32 f415, [r13+320]; +ld.shared.f32 f416, [r13+352]; +ld.shared.f32 f417, [r13+384]; +ld.shared.f32 f418, [r13+416]; +ld.shared.f32 f419, [r13+448]; +ld.shared.f32 f420, [r13+480]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f390, f398; +add.f32 f463, f406, f414; +sub.f32 f464, f390, f398; +sub.f32 f465, f406, f414; +add.f32 f466, f394, f402; +add.f32 f467, f410, f418; +sub.f32 f468, f394, f402; +sub.f32 f469, f410, f418; +add.f32 f470, f462, f466; +add.f32 f471, f463, f467; +sub.f32 f472, f462, f466; +sub.f32 f473, f463, f467; +sub.f32 f474, f464, f469; +add.f32 f475, f465, f468; +add.f32 f476, f464, f469; +sub.f32 f477, f465, f468; +add.f32 f478, f392, f400; +add.f32 f479, f408, f416; +sub.f32 f480, f392, f400; +sub.f32 f481, f408, f416; +add.f32 f482, f396, f404; +add.f32 f483, f412, f420; +sub.f32 f484, f396, f404; +sub.f32 f485, f412, f420; 
+add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +mul.f32 f494, f490, 0f3F3504F3; +mul.f32 f495, f491, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f498, f492, 0fBF3504F3; +mul.f32 f499, f493, 0f3F3504F3; +sub.f32 f500, f498, f499; +mul.f32 f501, f493, 0fBF3504F3; +fma.rn.f32 f502, f492, 0f3F3504F3, f501; +add.f32 %0, f429, f445; +add.f32 %1, f430, f446; +add.f32 %2, f470, f486; +add.f32 %3, f471, f487; +add.f32 %5, f434, f456; +add.f32 %4, f433, f455; +add.f32 %7, f475, f497; +add.f32 %6, f474, f496; +add.f32 %9, f432, f447; +sub.f32 %8, f431, f448; +add.f32 %11, f473, f488; +sub.f32 %10, f472, f489; +add.f32 %13, f436, f461; +add.f32 %12, f435, f459; +add.f32 %15, f477, f502; +add.f32 %14, f476, f500; +sub.f32 %16, f429, f445; +sub.f32 %17, f430, f446; +sub.f32 %18, f470, f486; +sub.f32 %19, f471, f487; +sub.f32 %21, f434, f456; +sub.f32 %20, f433, f455; +sub.f32 %23, f475, f497; +sub.f32 %22, f474, f496; +sub.f32 %25, f432, f447; +add.f32 %24, f431, f448; +sub.f32 %27, f473, f488; +add.f32 %26, f472, f489; +sub.f32 %29, f436, f461; +sub.f32 %28, f435, f459; +sub.f32 %31, f477, f502; +sub.f32 %30, f476, f500; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), 
"f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<254, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<365>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, 
f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 960; +add.s32 r12, r9, r11; +add.f32 f143, f42, 
f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+128]; +ld.shared.v2.f32 {f167, f168}, [r13+256]; +ld.shared.v2.f32 {f171, f172}, [r13+384]; +ld.shared.v2.f32 {f175, f176}, [r13+512]; +ld.shared.v2.f32 {f179, f180}, [r13+640]; +ld.shared.v2.f32 {f183, f184}, [r13+768]; +ld.shared.v2.f32 {f187, f188}, [r13+896]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 
f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 8; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, 
f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 512; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+128]; +ld.shared.v2.f32 {f325, f326}, [r19+256]; +ld.shared.v2.f32 {f329, f330}, [r19+384]; +ld.shared.v2.f32 {f333, f334}, [r19+512]; +ld.shared.v2.f32 {f337, f338}, [r19+640]; +ld.shared.v2.f32 {f341, f342}, [r19+768]; +ld.shared.v2.f32 {f345, f346}, [r19+896]; +add.f32 %1, f318, f334; +add.f32 %0, f317, f333; +add.f32 %3, f322, f338; +add.f32 %2, f321, f337; +add.f32 %5, f326, f342; +add.f32 %4, f325, f341; +add.f32 %7, f330, f346; +add.f32 %6, f329, f345; +sub.f32 %9, f318, f334; +sub.f32 %8, f317, f333; +sub.f32 %11, f322, f338; +sub.f32 %10, f321, f337; +sub.f32 %13, f326, f342; +sub.f32 %12, f325, f341; +sub.f32 %15, f330, f346; +sub.f32 %14, f329, f345; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), 
"=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<255, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<690>; +.reg .b32 r<20>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f684, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f682, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f681, f684, f682; +sub.f32 f76, f684, f682; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f680, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f677, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f675, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f674, f677, f675; +sub.f32 f92, f677, f675; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f673, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f673, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f671, f95, 0fBF3504F3; +mul.f32 f672, f96, 0f3F3504F3; +sub.f32 f103, f671, f672; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f670, f681, f674; +sub.f32 f109, f681, 
f674; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f669, f680, f100; +sub.f32 f113, f680, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f668, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f667, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f665, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f662, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f661, f665, f662; +sub.f32 f133, f665, f662; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f660, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f658, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f656, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f655, f658, f656; +sub.f32 f149, f658, f656; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f654, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f654, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f652, f152, 0fBF3504F3; +mul.f32 f653, f153, 0f3F3504F3; +sub.f32 f160, f652, f653; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f651, f661, f655; +sub.f32 f166, f661, f655; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f650, f660, f157; +sub.f32 f170, f660, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f649, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f648, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f646, f167, 0f3F6C835E; +mul.f32 f647, f650, 0f3EC3EF15; +sub.f32 f181, f646, f647; +mul.f32 f182, f650, 0f3F6C835E; 
+fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f649, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f648, 0f3F6C835E; +mul.f32 f645, f175, 0f3EC3EF15; +sub.f32 f190, f645, f189; +mul.f32 f191, f648, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f644, f169, 0fBEC3EF15; +sub.f32 f195, f644, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f642, f173, 0fBF3504F3; +mul.f32 f643, f174, 0f3F3504F3; +sub.f32 f200, f642, f643; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f640, f177, 0fBF6C835E; +mul.f32 f641, f178, 0f3EC3EF15; +sub.f32 f205, f640, f641; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f639, f669, f183; +sub.f32 f213, f669, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f638, f668, f187; +sub.f32 f217, f668, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f637, f667, f192; +sub.f32 f221, f667, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f636, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f635, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f634, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f633, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f639, f239; +mul.f32 f244, f238, f639; +mul.f32 f246, f239, f239; +mul.f32 f632, f238, f238; +sub.f32 f247, f632, f246; +mul.f32 f248, 
f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f638, f249; +mul.f32 f252, f247, f638; +mul.f32 f630, f238, f247; +mul.f32 f631, f239, f249; +sub.f32 f255, f630, f631; +mul.f32 f629, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f637, f257; +mul.f32 f260, f255, f637; +mul.f32 f262, f239, f257; +mul.f32 f628, f238, f255; +sub.f32 f263, f628, f262; +mul.f32 f627, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f636, f265; +mul.f32 f268, f263, f636; +mul.f32 f270, f239, f265; +mul.f32 f626, f238, f263; +sub.f32 f271, f626, f270; +mul.f32 f625, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f635, f273; +mul.f32 f276, f271, f635; +mul.f32 f623, f238, f271; +mul.f32 f624, f239, f273; +sub.f32 f279, f623, f624; +mul.f32 f622, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f634, f281; +mul.f32 f284, f279, f634; +mul.f32 f286, f239, f281; +mul.f32 f621, f238, f279; +sub.f32 f287, f621, f286; +mul.f32 f620, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f633, f289; +mul.f32 f292, f287, f633; +mul.f32 f294, f239, f289; +mul.f32 f619, f238, f287; +sub.f32 f295, f619, f294; +mul.f32 f618, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f617, f670, f651; +mul.f32 f298, f617, f297; +mul.f32 f300, f295, f617; +mul.f32 f615, f238, f295; +mul.f32 f616, f239, f297; +sub.f32 f303, f615, f616; +sub.f32 f614, f106, f163; +mul.f32 f613, f614, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f612, f238, f303; +sub.f32 f311, f612, f310; +mul.f32 f611, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f609, f238, f311; +mul.f32 f610, 
f239, f313; +sub.f32 f319, f609, f610; +mul.f32 f608, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f607, f238, f319; +sub.f32 f327, f607, f326; +mul.f32 f606, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f605, f238, f327; +sub.f32 f335, f605, f334; +mul.f32 f604, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f602, f238, f335; +mul.f32 f603, f239, f337; +sub.f32 f343, f602, f603; +mul.f32 f601, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f600, f238, f343; +sub.f32 f351, f600, f350; +mul.f32 f599, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f598, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 896; +add.s32 r12, r9, r11; +sub.f32 f689, f670, f651; +mul.f32 f688, f295, f689; +add.f32 f357, f670, f651; +sub.f32 f687, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 7; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f598; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f629; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f627; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f625; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f622; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f620; +fma.rn.f32 f371, f287, f234, f290; 
+sub.f32 f372, f292, f618; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f687, f298; +sub.f32 f374, f688, f613; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f611; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f608; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f606; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f604; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f601; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f599; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +mad.lo.s32 r13, r18, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+64]; +ld.shared.v2.f32 {f397, f398}, [r13+128]; +ld.shared.v2.f32 {f401, f402}, [r13+192]; +ld.shared.v2.f32 {f405, f406}, [r13+256]; +ld.shared.v2.f32 {f409, f410}, [r13+320]; +ld.shared.v2.f32 {f413, f414}, [r13+384]; +ld.shared.v2.f32 {f417, f418}, [r13+448]; +ld.shared.v2.f32 {f421, f422}, [r13+512]; +ld.shared.v2.f32 {f425, f426}, [r13+576]; +ld.shared.v2.f32 {f429, f430}, [r13+640]; +ld.shared.v2.f32 {f433, f434}, [r13+704]; +ld.shared.v2.f32 {f437, f438}, [r13+768]; +ld.shared.v2.f32 {f441, f442}, [r13+832]; +ld.shared.v2.f32 {f445, f446}, [r13+896]; +ld.shared.v2.f32 {f449, f450}, [r13+960]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f597, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f596, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f595, f597, f596; +sub.f32 f464, f597, f596; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f594, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; 
+sub.f32 f471, f397, f429; +add.f32 f593, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f592, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f591, f593, f592; +sub.f32 f480, f593, f592; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f590, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f590, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f589, f483, 0fBF3504F3; +sub.f32 f491, f589, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f393, f425; +sub.f32 f496, f393, f425; +add.f32 f588, f394, f426; +sub.f32 f497, f394, f426; +add.f32 f498, f409, f441; +sub.f32 f500, f409, f441; +add.f32 f587, f410, f442; +sub.f32 f501, f410, f442; +add.f32 f502, f494, f498; +sub.f32 f504, f494, f498; +add.f32 f586, f588, f587; +sub.f32 f505, f588, f587; +sub.f32 f506, f496, f501; +add.f32 f508, f496, f501; +add.f32 f585, f497, f500; +sub.f32 f509, f497, f500; +add.f32 f510, f401, f433; +sub.f32 f512, f401, f433; +add.f32 f584, f402, f434; +sub.f32 f513, f402, f434; +add.f32 f514, f417, f449; +sub.f32 f516, f417, f449; +add.f32 f583, f418, f450; +sub.f32 f517, f418, f450; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f582, f584, f583; +sub.f32 f521, f584, f583; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f581, f513, f516; +sub.f32 f525, f513, f516; +mul.f32 f526, f522, 0f3F3504F3; +mul.f32 f527, f581, 0f3F3504F3; +sub.f32 f528, f526, f527; +add.f32 f529, f526, f527; +mul.f32 f579, f524, 0fBF3504F3; +mul.f32 f580, f525, 0f3F3504F3; +sub.f32 f532, f579, f580; +mul.f32 f533, f525, 0fBF3504F3; +fma.rn.f32 f534, f524, 0f3F3504F3, f533; +add.f32 %1, f595, f591; +add.f32 %0, f461, f477; +add.f32 %3, f586, f582; +add.f32 %2, f502, f518; +add.f32 %4, f465, f487; +add.f32 %5, f594, f488; 
+add.f32 %6, f506, f528; +add.f32 %7, f585, f529; +sub.f32 %8, f463, f480; +add.f32 %9, f464, f479; +sub.f32 %10, f504, f521; +add.f32 %11, f505, f520; +add.f32 %13, f468, f493; +add.f32 %12, f467, f491; +add.f32 %15, f509, f534; +add.f32 %14, f508, f532; +sub.f32 %17, f595, f591; +sub.f32 %16, f461, f477; +sub.f32 %19, f586, f582; +sub.f32 %18, f502, f518; +sub.f32 %21, f594, f488; +sub.f32 %20, f465, f487; +sub.f32 %23, f585, f529; +sub.f32 %22, f506, f528; +sub.f32 %25, f464, f479; +add.f32 %24, f463, f480; +sub.f32 %27, f505, f520; +add.f32 %26, f504, f521; +sub.f32 %29, f468, f493; +sub.f32 %28, f467, f491; +sub.f32 %31, f509, f534; +sub.f32 %30, f508, f532; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ 
__device__ void cufftdx_private_function<256, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<333>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; 
+mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 480; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 
f160, [r13+64]; +ld.shared.f32 f161, [r13+128]; +ld.shared.f32 f162, [r13+192]; +ld.shared.f32 f163, [r13+256]; +ld.shared.f32 f164, [r13+320]; +ld.shared.f32 f165, [r13+384]; +ld.shared.f32 f166, [r13+448]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+64]; +ld.shared.f32 f169, [r13+128]; +ld.shared.f32 f170, [r13+192]; +ld.shared.f32 f171, [r13+256]; +ld.shared.f32 f172, [r13+320]; +ld.shared.f32 f173, [r13+384]; +ld.shared.f32 f174, [r13+448]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 
f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 8; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; 
+fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 256; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+64]; +ld.shared.f32 f303, [r20+128]; +ld.shared.f32 f304, [r20+192]; +ld.shared.f32 f305, [r20+256]; +ld.shared.f32 f306, [r20+320]; +ld.shared.f32 f307, [r20+384]; +ld.shared.f32 f308, [r20+448]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+64]; +ld.shared.f32 f311, [r20+128]; +ld.shared.f32 f312, [r20+192]; +ld.shared.f32 f313, [r20+256]; +ld.shared.f32 f314, [r20+320]; +ld.shared.f32 f315, [r20+384]; +ld.shared.f32 f316, [r20+448]; +add.f32 %0, f301, f305; +add.f32 %1, f309, f313; +add.f32 %2, f302, f306; +add.f32 %3, f310, f314; +add.f32 %4, f303, f307; +add.f32 %5, f311, f315; +add.f32 %6, f304, f308; +add.f32 %7, f312, f316; +sub.f32 %8, f301, f305; +sub.f32 %9, f309, f313; +sub.f32 %10, f302, f306; +sub.f32 %11, f310, f314; +sub.f32 %12, f303, f307; +sub.f32 %13, f311, f315; +sub.f32 %14, f304, f308; +sub.f32 %15, f312, f316; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<257, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<208>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 992; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; 
+st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+256]; +ld.shared.v2.f32 {f70, f71}, [r13+512]; +ld.shared.v2.f32 {f74, f75}, [r13+768]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 896; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, 
[r20]; +ld.shared.v2.f32 {f127, f128}, [r20+256]; +ld.shared.v2.f32 {f131, f132}, [r20+512]; +ld.shared.v2.f32 {f135, f136}, [r20+768]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 16; +bfe.u32 r22, r5, 4, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 512; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180, f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+256]; +ld.shared.v2.f32 {f192, f193}, [r27+512]; +ld.shared.v2.f32 {f196, f197}, [r27+768]; +add.f32 %1, f185, f193; +add.f32 %0, f184, f192; 
+add.f32 %3, f189, f197; +add.f32 %2, f188, f196; +sub.f32 %5, f185, f193; +sub.f32 %4, f184, f192; +sub.f32 %7, f189, f197; +sub.f32 %6, f188, f196; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<258, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<184>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, 
f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 496; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+128]; +ld.shared.f32 f64, [r13+256]; +ld.shared.f32 f65, [r13+384]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+128]; +ld.shared.f32 f68, [r13+256]; +ld.shared.f32 f69, [r13+384]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 448; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], 
f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+128]; +ld.shared.f32 f117, [r21+256]; +ld.shared.f32 f118, [r21+384]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+128]; +ld.shared.f32 f121, [r21+256]; +ld.shared.f32 f122, [r21+384]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126, f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 16; +bfe.u32 r23, r5, 4, 1; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; +fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 256; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], f154; 
+st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+128]; +ld.shared.f32 f170, [r28+256]; +ld.shared.f32 f171, [r28+384]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+128]; +ld.shared.f32 f174, [r28+256]; +ld.shared.f32 f175, [r28+384]; +add.f32 %0, f168, f170; +add.f32 %1, f172, f174; +add.f32 %2, f169, f171; +add.f32 %3, f173, f175; +sub.f32 %4, f168, f170; +sub.f32 %5, f172, f174; +sub.f32 %6, f169, f171; +sub.f32 %7, f173, f175; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<259, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1429>; +.reg .b32 r<20>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1423, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1421, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1420, f1423, f1421; +sub.f32 f140, f1423, f1421; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1419, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1416, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1414, %91, %134; +sub.f32 f152, %91, 
%134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1413, f1416, f1414; +sub.f32 f156, f1416, f1414; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1412, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1412, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1410, f159, 0fBF3504F3; +mul.f32 f1411, f160, 0f3F3504F3; +sub.f32 f167, f1410, f1411; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1409, f1420, f1413; +sub.f32 f173, f1420, f1413; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1408, f1419, f164; +sub.f32 f177, f1419, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1407, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1406, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1404, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1401, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1400, f1404, f1401; +sub.f32 f197, f1404, f1401; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1399, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1397, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1395, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1394, f1397, f1395; +sub.f32 f213, f1397, f1395; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1393, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1393, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1391, f216, 0fBF3504F3; +mul.f32 
f1392, f217, 0f3F3504F3; +sub.f32 f224, f1391, f1392; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1390, f1400, f1394; +sub.f32 f230, f1400, f1394; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1389, f1399, f221; +sub.f32 f234, f1399, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1388, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1387, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1385, f231, 0f3F6C835E; +mul.f32 f1386, f1389, 0f3EC3EF15; +sub.f32 f245, f1385, f1386; +mul.f32 f246, f1389, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1388, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1387, 0f3F6C835E; +mul.f32 f1384, f239, 0f3EC3EF15; +sub.f32 f254, f1384, f253; +mul.f32 f255, f1387, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1383, f233, 0fBEC3EF15; +sub.f32 f259, f1383, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1381, f237, 0fBF3504F3; +mul.f32 f1382, f238, 0f3F3504F3; +sub.f32 f264, f1381, f1382; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1379, f241, 0fBF6C835E; +mul.f32 f1380, f242, 0f3EC3EF15; +sub.f32 f269, f1379, f1380; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1378, f1409, f1390; +sub.f32 f275, f1409, f1390; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1377, f1408, f247; +sub.f32 f279, f1408, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1376, f1407, f251; +sub.f32 f283, f1407, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1375, f1406, f256; +sub.f32 f287, f1406, f256; 
+sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1374, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1373, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1372, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1371, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1368, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1366, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1365, f1368, f1366; +sub.f32 f315, f1368, f1366; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1364, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1362, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1359, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1358, f1362, f1359; +sub.f32 f331, f1362, f1359; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1357, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1357, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1356, f334, 0fBF3504F3; +sub.f32 f342, f1356, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1355, f1365, f1358; +sub.f32 f348, f1365, f1358; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1354, f1364, f339; +sub.f32 f352, f1364, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1353, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1352, 
f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1350, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1348, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1347, f1350, f1348; +sub.f32 f372, f1350, f1348; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1346, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1343, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1342, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1341, f1343, f1342; +sub.f32 f388, f1343, f1342; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1340, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1340, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1338, f391, 0fBF3504F3; +mul.f32 f1339, f392, 0f3F3504F3; +sub.f32 f399, f1338, f1339; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1337, f1347, f1341; +sub.f32 f405, f1347, f1341; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1336, f1346, f396; +sub.f32 f409, f1346, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1335, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1334, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1336, 0f3EC3EF15; +mul.f32 f1333, f406, 0f3F6C835E; +sub.f32 f420, f1333, f419; +mul.f32 f421, f1336, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1335, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1334, 0f3F6C835E; +mul.f32 f1332, f414, 
0f3EC3EF15; +sub.f32 f429, f1332, f428; +mul.f32 f430, f1334, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1331, f408, 0fBEC3EF15; +sub.f32 f434, f1331, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1330, f412, 0fBF3504F3; +sub.f32 f439, f1330, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1329, f416, 0fBF6C835E; +sub.f32 f444, f1329, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1328, f1355, f1337; +sub.f32 f450, f1355, f1337; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1327, f1354, f422; +sub.f32 f454, f1354, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1326, f1353, f426; +sub.f32 f458, f1353, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1325, f1352, f431; +sub.f32 f462, f1352, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1324, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1323, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1322, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1321, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1327, 0f3E47C5C2; +mul.f32 f1320, f451, 0f3F7B14BE; +sub.f32 f481, f1320, f480; +mul.f32 f482, f1327, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1326, 0f3EC3EF15; +mul.f32 f1319, f455, 0f3F6C835E; +sub.f32 f486, f1319, f485; +mul.f32 f487, f1326, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1325, 0f3F0E39DA; +mul.f32 f1318, f459, 0f3F54DB31; +sub.f32 f491, f1318, f490; +mul.f32 f492, f1325, 0f3F54DB31; +fma.rn.f32 f493, 
f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1324, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1323, 0f3F54DB31; +mul.f32 f1317, f467, 0f3F0E39DA; +sub.f32 f500, f1317, f499; +mul.f32 f501, f1323, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1322, 0f3F6C835E; +mul.f32 f1316, f471, 0f3EC3EF15; +sub.f32 f505, f1316, f504; +mul.f32 f506, f1322, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1321, 0f3F7B14BE; +mul.f32 f1315, f475, 0f3E47C5C2; +sub.f32 f510, f1315, f509; +mul.f32 f511, f1321, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1314, f453, 0fBE47C5C2; +sub.f32 f515, f1314, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1312, f457, 0fBEC3EF15; +mul.f32 f1313, f458, 0f3F6C835E; +sub.f32 f520, f1312, f1313; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1310, f461, 0fBF0E39DA; +mul.f32 f1311, f462, 0f3F54DB31; +sub.f32 f525, f1310, f1311; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1308, f465, 0fBF3504F3; +mul.f32 f1309, f466, 0f3F3504F3; +sub.f32 f530, f1308, f1309; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1306, f469, 0fBF54DB31; +mul.f32 f1307, f470, 0f3F0E39DA; +sub.f32 f535, f1306, f1307; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1305, f473, 0fBF6C835E; +sub.f32 f540, f1305, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1304, f477, 0fBF7B14BE; +sub.f32 f545, f1304, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1303, f1377, f483; +sub.f32 f553, f1377, f483; +add.f32 f554, f280, f486; 
+sub.f32 f556, f280, f486; +add.f32 f1302, f1376, f488; +sub.f32 f557, f1376, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1301, f1375, f493; +sub.f32 f561, f1375, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1300, f1374, f497; +sub.f32 f565, f1374, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f1299, f1373, f502; +sub.f32 f569, f1373, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f1298, f1372, f507; +sub.f32 f573, f1372, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f1297, f1371, f512; +sub.f32 f577, f1371, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f1296, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f1295, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f1294, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f1293, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f1292, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1291, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1290, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1289, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f1303, f611; +mul.f32 f616, f610, f1303; +mul.f32 f618, f611, f611; +mul.f32 f1288, f610, f610; +sub.f32 f619, f1288, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f1302, 
f621; +mul.f32 f624, f619, f1302; +mul.f32 f626, f611, f621; +mul.f32 f1287, f610, f619; +sub.f32 f627, f1287, f626; +mul.f32 f1286, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f1301, f629; +mul.f32 f632, f627, f1301; +mul.f32 f1284, f610, f627; +mul.f32 f1285, f611, f629; +sub.f32 f635, f1284, f1285; +mul.f32 f1283, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f1300, f637; +mul.f32 f640, f635, f1300; +mul.f32 f642, f611, f637; +mul.f32 f1282, f610, f635; +sub.f32 f643, f1282, f642; +mul.f32 f1281, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f1299, f645; +mul.f32 f648, f643, f1299; +mul.f32 f1279, f610, f643; +mul.f32 f1280, f611, f645; +sub.f32 f651, f1279, f1280; +mul.f32 f1278, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f1298, f653; +mul.f32 f656, f651, f1298; +mul.f32 f658, f611, f653; +mul.f32 f1277, f610, f651; +sub.f32 f659, f1277, f658; +mul.f32 f1276, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f1297, f661; +mul.f32 f664, f659, f1297; +mul.f32 f666, f611, f661; +mul.f32 f1275, f610, f659; +sub.f32 f667, f1275, f666; +mul.f32 f1274, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f1296, f669; +mul.f32 f672, f667, f1296; +mul.f32 f1272, f610, f667; +mul.f32 f1273, f611, f669; +sub.f32 f675, f1272, f1273; +mul.f32 f1271, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f1295, f677; +mul.f32 f680, f675, f1295; +mul.f32 f682, f611, f677; +mul.f32 f1270, f610, f675; +sub.f32 f683, f1270, f682; +mul.f32 f1269, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f1294, f685; +mul.f32 f688, f683, f1294; +mul.f32 f690, f611, f685; +mul.f32 f1268, f610, f683; +sub.f32 f691, f1268, f690; +mul.f32 f1267, f586, f685; +mul.f32 
f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f1293, f693; +mul.f32 f696, f691, f1293; +mul.f32 f1265, f610, f691; +mul.f32 f1266, f611, f693; +sub.f32 f699, f1265, f1266; +mul.f32 f1264, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f1292, f701; +mul.f32 f704, f699, f1292; +mul.f32 f706, f611, f701; +mul.f32 f1263, f610, f699; +sub.f32 f707, f1263, f706; +mul.f32 f1262, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f1291, f709; +mul.f32 f712, f707, f1291; +mul.f32 f1260, f610, f707; +mul.f32 f1261, f611, f709; +sub.f32 f715, f1260, f1261; +mul.f32 f1259, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f1290, f717; +mul.f32 f720, f715, f1290; +mul.f32 f722, f611, f717; +mul.f32 f1258, f610, f715; +sub.f32 f723, f1258, f722; +mul.f32 f1257, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f1289, f725; +mul.f32 f728, f723, f1289; +mul.f32 f730, f611, f725; +mul.f32 f1256, f610, f723; +sub.f32 f731, f1256, f730; +mul.f32 f1255, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1254, f1378, f1328; +mul.f32 f734, f1254, f733; +mul.f32 f736, f731, f1254; +mul.f32 f1252, f610, f731; +mul.f32 f1253, f611, f733; +sub.f32 f739, f1252, f1253; +sub.f32 f1251, f272, f447; +mul.f32 f1250, f1251, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1249, f610, f739; +sub.f32 f747, f1249, f746; +mul.f32 f1248, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1247, f610, f747; +sub.f32 f755, f1247, f754; +mul.f32 f1246, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, 
f755, f561; +mul.f32 f1244, f610, f755; +mul.f32 f1245, f611, f757; +sub.f32 f763, f1244, f1245; +mul.f32 f1243, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1242, f610, f763; +sub.f32 f771, f1242, f770; +mul.f32 f1241, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f1239, f610, f771; +mul.f32 f1240, f611, f773; +sub.f32 f779, f1239, f1240; +mul.f32 f1238, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1237, f610, f779; +sub.f32 f787, f1237, f786; +mul.f32 f1236, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1235, f610, f787; +sub.f32 f795, f1235, f794; +mul.f32 f1234, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f1232, f610, f795; +mul.f32 f1233, f611, f797; +sub.f32 f803, f1232, f1233; +mul.f32 f1231, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1230, f610, f803; +sub.f32 f811, f1230, f810; +mul.f32 f1229, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1228, f610, f811; +sub.f32 f819, f1228, f818; +mul.f32 f1227, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f1225, f610, f819; +mul.f32 f1226, f611, f821; +sub.f32 f827, f1225, f1226; +mul.f32 f1224, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, 
f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1223, f610, f827; +sub.f32 f835, f1223, f834; +mul.f32 f1222, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f1220, f610, f835; +mul.f32 f1221, f611, f837; +sub.f32 f843, f1220, f1221; +mul.f32 f1219, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1218, f610, f843; +sub.f32 f851, f1218, f850; +mul.f32 f1217, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f1216, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 768; +add.s32 r12, r9, r11; +sub.f32 f1428, f1378, f1328; +mul.f32 f1427, f731, f1428; +add.f32 f857, f1378, f1328; +sub.f32 f1426, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 3; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 3; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f1216; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f1286; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f1283; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f1281; +sub.f32 f867, f648, f1278; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f1276; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f1274; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f1271; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f1269; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 
f878, f688, f1267; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f1264; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f1262; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f1259; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f1257; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f1255; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f1426, f734; +sub.f32 f890, f1427, f1250; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f1248; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f1246; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f1243; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f1241; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f1238; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f1236; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f1234; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f1231; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f1229; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f1227; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f1224; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f1222; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f1219; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f1217; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +mad.lo.s32 
r13, r18, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+32]; +ld.shared.v2.f32 {f929, f930}, [r13+64]; +ld.shared.v2.f32 {f933, f934}, [r13+96]; +ld.shared.v2.f32 {f937, f938}, [r13+128]; +ld.shared.v2.f32 {f941, f942}, [r13+160]; +ld.shared.v2.f32 {f945, f946}, [r13+192]; +ld.shared.v2.f32 {f949, f950}, [r13+224]; +ld.shared.v2.f32 {f953, f954}, [r13+256]; +ld.shared.v2.f32 {f957, f958}, [r13+288]; +ld.shared.v2.f32 {f961, f962}, [r13+320]; +ld.shared.v2.f32 {f965, f966}, [r13+352]; +ld.shared.v2.f32 {f969, f970}, [r13+384]; +ld.shared.v2.f32 {f973, f974}, [r13+416]; +ld.shared.v2.f32 {f977, f978}, [r13+448]; +ld.shared.v2.f32 {f981, f982}, [r13+480]; +ld.shared.v2.f32 {f985, f986}, [r13+512]; +ld.shared.v2.f32 {f989, f990}, [r13+544]; +ld.shared.v2.f32 {f993, f994}, [r13+576]; +ld.shared.v2.f32 {f997, f998}, [r13+608]; +ld.shared.v2.f32 {f1001, f1002}, [r13+640]; +ld.shared.v2.f32 {f1005, f1006}, [r13+672]; +ld.shared.v2.f32 {f1009, f1010}, [r13+704]; +ld.shared.v2.f32 {f1013, f1014}, [r13+736]; +ld.shared.v2.f32 {f1017, f1018}, [r13+768]; +ld.shared.v2.f32 {f1021, f1022}, [r13+800]; +ld.shared.v2.f32 {f1025, f1026}, [r13+832]; +ld.shared.v2.f32 {f1029, f1030}, [r13+864]; +ld.shared.v2.f32 {f1033, f1034}, [r13+896]; +ld.shared.v2.f32 {f1037, f1038}, [r13+928]; +ld.shared.v2.f32 {f1041, f1042}, [r13+960]; +ld.shared.v2.f32 {f1045, f1046}, [r13+992]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1215, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1214, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f925, f989; +sub.f32 f1059, f925, f989; +add.f32 f1213, f926, f990; +sub.f32 f1060, f926, f990; +add.f32 f1061, f957, f1021; +sub.f32 f1063, f957, f1021; +add.f32 f1212, f958, f1022; +sub.f32 f1064, f958, f1022; +add.f32 f1065, f929, f993; +sub.f32 f1067, f929, f993; +add.f32 f1211, f930, f994; +sub.f32 f1068, f930, f994; +add.f32 f1069, 
f961, f1025; +sub.f32 f1071, f961, f1025; +add.f32 f1210, f962, f1026; +sub.f32 f1072, f962, f1026; +add.f32 f1073, f933, f997; +sub.f32 f1075, f933, f997; +add.f32 f1209, f934, f998; +sub.f32 f1076, f934, f998; +add.f32 f1077, f965, f1029; +sub.f32 f1079, f965, f1029; +add.f32 f1208, f966, f1030; +sub.f32 f1080, f966, f1030; +add.f32 f1081, f937, f1001; +sub.f32 f1083, f937, f1001; +add.f32 f1207, f938, f1002; +sub.f32 f1084, f938, f1002; +add.f32 f1085, f969, f1033; +sub.f32 f1087, f969, f1033; +add.f32 f1206, f970, f1034; +sub.f32 f1088, f970, f1034; +add.f32 f1089, f941, f1005; +sub.f32 f1091, f941, f1005; +add.f32 f1205, f942, f1006; +sub.f32 f1092, f942, f1006; +add.f32 f1093, f973, f1037; +sub.f32 f1095, f973, f1037; +add.f32 f1204, f974, f1038; +sub.f32 f1096, f974, f1038; +add.f32 f1097, f945, f1009; +sub.f32 f1099, f945, f1009; +add.f32 f1203, f946, f1010; +sub.f32 f1100, f946, f1010; +add.f32 f1101, f977, f1041; +sub.f32 f1103, f977, f1041; +add.f32 f1202, f978, f1042; +sub.f32 f1104, f978, f1042; +add.f32 f1105, f949, f1013; +sub.f32 f1107, f949, f1013; +add.f32 f1201, f950, f1014; +sub.f32 f1108, f950, f1014; +add.f32 f1109, f981, f1045; +sub.f32 f1111, f981, f1045; +add.f32 f1200, f982, f1046; +sub.f32 f1112, f982, f1046; +add.f32 %1, f1215, f1214; +add.f32 %0, f1049, f1053; +add.f32 %2, f1057, f1061; +add.f32 %3, f1213, f1212; +add.f32 %4, f1065, f1069; +add.f32 %5, f1211, f1210; +add.f32 %6, f1073, f1077; +add.f32 %7, f1209, f1208; +add.f32 %8, f1081, f1085; +add.f32 %9, f1207, f1206; +add.f32 %11, f1205, f1204; +add.f32 %10, f1089, f1093; +add.f32 %13, f1203, f1202; +add.f32 %12, f1097, f1101; +add.f32 %15, f1201, f1200; +add.f32 %14, f1105, f1109; +sub.f32 %16, f1051, f1056; +add.f32 %17, f1052, f1055; +sub.f32 %18, f1059, f1064; +add.f32 %19, f1060, f1063; +sub.f32 %20, f1067, f1072; +add.f32 %21, f1068, f1071; +add.f32 %23, f1076, f1079; +sub.f32 %22, f1075, f1080; +add.f32 %25, f1084, f1087; +sub.f32 %24, f1083, f1088; +add.f32 %27, f1092, 
f1095; +sub.f32 %26, f1091, f1096; +sub.f32 %28, f1099, f1104; +add.f32 %29, f1100, f1103; +sub.f32 %30, f1107, f1112; +add.f32 %31, f1108, f1111; +sub.f32 %33, f1215, f1214; +sub.f32 %32, f1049, f1053; +sub.f32 %35, f1213, f1212; +sub.f32 %34, f1057, f1061; +sub.f32 %37, f1211, f1210; +sub.f32 %36, f1065, f1069; +sub.f32 %39, f1209, f1208; +sub.f32 %38, f1073, f1077; +sub.f32 %41, f1207, f1206; +sub.f32 %40, f1081, f1085; +sub.f32 %43, f1205, f1204; +sub.f32 %42, f1089, f1093; +sub.f32 %45, f1203, f1202; +sub.f32 %44, f1097, f1101; +sub.f32 %47, f1201, f1200; +sub.f32 %46, f1105, f1109; +sub.f32 %49, f1052, f1055; +add.f32 %48, f1051, f1056; +sub.f32 %51, f1060, f1063; +add.f32 %50, f1059, f1064; +sub.f32 %53, f1068, f1071; +add.f32 %52, f1067, f1072; +sub.f32 %55, f1076, f1079; +add.f32 %54, f1075, f1080; +sub.f32 %57, f1084, f1087; +add.f32 %56, f1083, f1088; +sub.f32 %59, f1092, f1095; +add.f32 %58, f1091, f1096; +sub.f32 %61, f1100, f1103; +add.f32 %60, f1099, f1104; +sub.f32 %63, f1108, f1111; +add.f32 %62, f1107, f1112; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), 
"=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<260, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1333>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1331, 
%67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1329, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1328, f1331, f1329; +sub.f32 f140, f1331, f1329; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1327, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1324, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1322, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1321, f1324, f1322; +sub.f32 f156, f1324, f1322; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1320, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1320, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1318, f159, 0fBF3504F3; +mul.f32 f1319, f160, 0f3F3504F3; +sub.f32 f167, f1318, f1319; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1317, f1328, f1321; +sub.f32 f173, f1328, f1321; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1316, f1327, f164; +sub.f32 f177, f1327, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1315, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1314, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1312, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1309, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1308, f1312, f1309; +sub.f32 f197, f1312, f1309; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1307, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; 
+sub.f32 f204, %78, %110; +add.f32 f1305, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1303, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1302, f1305, f1303; +sub.f32 f213, f1305, f1303; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1301, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1301, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1299, f216, 0fBF3504F3; +mul.f32 f1300, f217, 0f3F3504F3; +sub.f32 f224, f1299, f1300; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1298, f1308, f1302; +sub.f32 f230, f1308, f1302; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1297, f1307, f221; +sub.f32 f234, f1307, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1296, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1295, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1293, f231, 0f3F6C835E; +mul.f32 f1294, f1297, 0f3EC3EF15; +sub.f32 f245, f1293, f1294; +mul.f32 f246, f1297, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1296, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1295, 0f3F6C835E; +mul.f32 f1292, f239, 0f3EC3EF15; +sub.f32 f254, f1292, f253; +mul.f32 f255, f1295, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1291, f233, 0fBEC3EF15; +sub.f32 f259, f1291, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1289, f237, 0fBF3504F3; +mul.f32 f1290, f238, 0f3F3504F3; +sub.f32 f264, f1289, f1290; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1287, f241, 0fBF6C835E; +mul.f32 
f1288, f242, 0f3EC3EF15; +sub.f32 f269, f1287, f1288; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1286, f1317, f1298; +sub.f32 f275, f1317, f1298; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1285, f1316, f247; +sub.f32 f279, f1316, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1284, f1315, f251; +sub.f32 f283, f1315, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1283, f1314, f256; +sub.f32 f287, f1314, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1282, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1281, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1280, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1279, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1276, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1274, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1273, f1276, f1274; +sub.f32 f315, f1276, f1274; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1272, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1270, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1267, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1266, f1270, f1267; +sub.f32 f331, f1270, f1267; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1265, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1265, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, 
f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1264, f334, 0fBF3504F3; +sub.f32 f342, f1264, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1263, f1273, f1266; +sub.f32 f348, f1273, f1266; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1262, f1272, f339; +sub.f32 f352, f1272, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1261, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1260, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1258, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1256, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1255, f1258, f1256; +sub.f32 f372, f1258, f1256; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1254, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1251, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1250, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1249, f1251, f1250; +sub.f32 f388, f1251, f1250; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1248, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1248, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1246, f391, 0fBF3504F3; +mul.f32 f1247, f392, 0f3F3504F3; +sub.f32 f399, f1246, f1247; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1245, f1255, f1249; +sub.f32 f405, f1255, f1249; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1244, f1254, f396; +sub.f32 f409, f1254, 
f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1243, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1242, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1244, 0f3EC3EF15; +mul.f32 f1241, f406, 0f3F6C835E; +sub.f32 f420, f1241, f419; +mul.f32 f421, f1244, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1243, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1242, 0f3F6C835E; +mul.f32 f1240, f414, 0f3EC3EF15; +sub.f32 f429, f1240, f428; +mul.f32 f430, f1242, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1239, f408, 0fBEC3EF15; +sub.f32 f434, f1239, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1238, f412, 0fBF3504F3; +sub.f32 f439, f1238, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1237, f416, 0fBF6C835E; +sub.f32 f444, f1237, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1236, f1263, f1245; +sub.f32 f450, f1263, f1245; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1235, f1262, f422; +sub.f32 f454, f1262, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1234, f1261, f426; +sub.f32 f458, f1261, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1233, f1260, f431; +sub.f32 f462, f1260, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1232, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1231, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1230, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; 
+sub.f32 f477, f359, f444; +add.f32 f1229, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1235, 0f3E47C5C2; +mul.f32 f1228, f451, 0f3F7B14BE; +sub.f32 f481, f1228, f480; +mul.f32 f482, f1235, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1234, 0f3EC3EF15; +mul.f32 f1227, f455, 0f3F6C835E; +sub.f32 f486, f1227, f485; +mul.f32 f487, f1234, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1233, 0f3F0E39DA; +mul.f32 f1226, f459, 0f3F54DB31; +sub.f32 f491, f1226, f490; +mul.f32 f492, f1233, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1232, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1231, 0f3F54DB31; +mul.f32 f1225, f467, 0f3F0E39DA; +sub.f32 f500, f1225, f499; +mul.f32 f501, f1231, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1230, 0f3F6C835E; +mul.f32 f1224, f471, 0f3EC3EF15; +sub.f32 f505, f1224, f504; +mul.f32 f506, f1230, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1229, 0f3F7B14BE; +mul.f32 f1223, f475, 0f3E47C5C2; +sub.f32 f510, f1223, f509; +mul.f32 f511, f1229, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1222, f453, 0fBE47C5C2; +sub.f32 f515, f1222, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1220, f457, 0fBEC3EF15; +mul.f32 f1221, f458, 0f3F6C835E; +sub.f32 f520, f1220, f1221; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1218, f461, 0fBF0E39DA; +mul.f32 f1219, f462, 0f3F54DB31; +sub.f32 f525, f1218, f1219; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1216, f465, 0fBF3504F3; +mul.f32 f1217, f466, 0f3F3504F3; +sub.f32 f530, f1216, f1217; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1214, f469, 0fBF54DB31; +mul.f32 f1215, f470, 0f3F0E39DA; +sub.f32 
f535, f1214, f1215; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1213, f473, 0fBF6C835E; +sub.f32 f540, f1213, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1212, f477, 0fBF7B14BE; +sub.f32 f545, f1212, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1211, f1286, f1236; +sub.f32 f551, f1286, f1236; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1210, f1285, f483; +sub.f32 f555, f1285, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1209, f1284, f488; +sub.f32 f559, f1284, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1208, f1283, f493; +sub.f32 f563, f1283, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1207, f1282, f497; +sub.f32 f567, f1282, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f1206, f1281, f502; +sub.f32 f571, f1281, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f1205, f1280, f507; +sub.f32 f575, f1280, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f1204, f1279, f512; +sub.f32 f579, f1279, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f1203, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f1202, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f1201, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f1200, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f1199, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1198, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, 
f540; +sub.f32 f606, f298, f540; +add.f32 f1197, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1196, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f1210, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f1210; +sub.f32 f620, f619, f618; +mul.f32 f1194, f612, f612; +mul.f32 f1195, f613, f613; +sub.f32 f623, f1194, f1195; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f1209, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f1209; +sub.f32 f630, f629, f628; +mul.f32 f1192, f612, f623; +mul.f32 f1193, f613, f625; +sub.f32 f633, f1192, f1193; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f1208, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f1208; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f1191, f612, f633; +sub.f32 f643, f1191, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f1207, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f1207; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f1190, f612, f643; +sub.f32 f653, f1190, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f1206, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f1206; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f1189, f612, f653; +sub.f32 f663, f1189, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f1205, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f1205; +sub.f32 f670, f669, f668; +mul.f32 f1187, f612, 
f663; +mul.f32 f1188, f613, f665; +sub.f32 f673, f1187, f1188; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f1204, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f1204; +sub.f32 f680, f679, f678; +mul.f32 f1185, f612, f673; +mul.f32 f1186, f613, f675; +sub.f32 f683, f1185, f1186; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f1203, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f1203; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f1184, f612, f683; +sub.f32 f693, f1184, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f1202, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f1202; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f1183, f612, f693; +sub.f32 f703, f1183, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f1201, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f1201; +sub.f32 f710, f709, f708; +mul.f32 f1181, f612, f703; +mul.f32 f1182, f613, f705; +sub.f32 f713, f1181, f1182; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f1200, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f1200; +sub.f32 f720, f719, f718; +mul.f32 f1179, f612, f713; +mul.f32 f1180, f613, f715; +sub.f32 f723, f1179, f1180; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f1199, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f1199; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f1178, f612, f723; +sub.f32 f733, f1178, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f1198, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, 
f1198; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f1177, f612, f733; +sub.f32 f743, f1177, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f1197, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f1197; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f1176, f612, f743; +sub.f32 f753, f1176, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f1196, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f1196; +sub.f32 f760, f759, f758; +mul.f32 f1174, f612, f753; +mul.f32 f1175, f613, f755; +sub.f32 f763, f1174, f1175; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f1172, f612, f763; +mul.f32 f1173, f613, f765; +sub.f32 f773, f1172, f1173; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f1171, f612, f773; +sub.f32 f783, f1171, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f1170, f612, f783; +sub.f32 f793, f1170, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f1169, f612, f793; +sub.f32 f803, f1169, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 
f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f1167, f612, f803; +mul.f32 f1168, f613, f805; +sub.f32 f813, f1167, f1168; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f1165, f612, f813; +mul.f32 f1166, f613, f815; +sub.f32 f823, f1165, f1166; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f1164, f612, f823; +sub.f32 f833, f1164, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f1163, f612, f833; +sub.f32 f843, f1163, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f1161, f612, f843; +mul.f32 f1162, f613, f845; +sub.f32 f853, f1161, f1162; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f1159, f612, f853; +mul.f32 f1160, f613, f855; +sub.f32 f863, f1159, f1160; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f1158, f612, f863; +sub.f32 f873, f1158, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; 
+fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f1157, f612, f873; +sub.f32 f883, f1157, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f1156, f612, f883; +sub.f32 f893, f1156, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f1154, f612, f893; +mul.f32 f1155, f613, f895; +sub.f32 f903, f1154, f1155; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f1152, f612, f903; +mul.f32 f1153, f613, f905; +sub.f32 f913, f1152, f1153; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mov.u32 r17, %tid.x; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +and.b32 r14, r17, 3; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 384; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+16]; 
+ld.shared.f32 f923, [r13+32]; +ld.shared.f32 f924, [r13+48]; +ld.shared.f32 f925, [r13+64]; +ld.shared.f32 f926, [r13+80]; +ld.shared.f32 f927, [r13+96]; +ld.shared.f32 f928, [r13+112]; +ld.shared.f32 f929, [r13+128]; +ld.shared.f32 f930, [r13+144]; +ld.shared.f32 f931, [r13+160]; +ld.shared.f32 f932, [r13+176]; +ld.shared.f32 f933, [r13+192]; +ld.shared.f32 f934, [r13+208]; +ld.shared.f32 f935, [r13+224]; +ld.shared.f32 f936, [r13+240]; +ld.shared.f32 f937, [r13+256]; +ld.shared.f32 f938, [r13+272]; +ld.shared.f32 f939, [r13+288]; +ld.shared.f32 f940, [r13+304]; +ld.shared.f32 f941, [r13+320]; +ld.shared.f32 f942, [r13+336]; +ld.shared.f32 f943, [r13+352]; +ld.shared.f32 f944, [r13+368]; +ld.shared.f32 f945, [r13+384]; +ld.shared.f32 f946, [r13+400]; +ld.shared.f32 f947, [r13+416]; +ld.shared.f32 f948, [r13+432]; +ld.shared.f32 f949, [r13+448]; +ld.shared.f32 f950, [r13+464]; +ld.shared.f32 f951, [r13+480]; +ld.shared.f32 f952, [r13+496]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1211, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+16]; +ld.shared.f32 f955, [r13+32]; +ld.shared.f32 f956, [r13+48]; +ld.shared.f32 f957, [r13+64]; +ld.shared.f32 f958, [r13+80]; +ld.shared.f32 f959, [r13+96]; +ld.shared.f32 f960, [r13+112]; +ld.shared.f32 f961, [r13+128]; +ld.shared.f32 f962, [r13+144]; +ld.shared.f32 f963, [r13+160]; +ld.shared.f32 f964, [r13+176]; +ld.shared.f32 f965, [r13+192]; +ld.shared.f32 f966, [r13+208]; +ld.shared.f32 f967, [r13+224]; +ld.shared.f32 f968, [r13+240]; +ld.shared.f32 f969, [r13+256]; +ld.shared.f32 f970, [r13+272]; 
+ld.shared.f32 f971, [r13+288]; +ld.shared.f32 f972, [r13+304]; +ld.shared.f32 f973, [r13+320]; +ld.shared.f32 f974, [r13+336]; +ld.shared.f32 f975, [r13+352]; +ld.shared.f32 f976, [r13+368]; +ld.shared.f32 f977, [r13+384]; +ld.shared.f32 f978, [r13+400]; +ld.shared.f32 f979, [r13+416]; +ld.shared.f32 f980, [r13+432]; +ld.shared.f32 f981, [r13+448]; +ld.shared.f32 f982, [r13+464]; +ld.shared.f32 f983, [r13+480]; +ld.shared.f32 f984, [r13+496]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1151, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1150, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f922, f938; +sub.f32 f995, f922, f938; +add.f32 f1149, f954, f970; +sub.f32 f996, f954, f970; +add.f32 f997, f930, f946; +sub.f32 f999, f930, f946; +add.f32 f1148, f962, f978; +sub.f32 f1000, f962, f978; +add.f32 f1001, f923, f939; +sub.f32 f1003, f923, f939; +add.f32 f1147, f955, f971; +sub.f32 f1004, f955, f971; +add.f32 f1005, f931, f947; +sub.f32 f1007, f931, f947; +add.f32 f1146, f963, f979; +sub.f32 f1008, f963, f979; +add.f32 f1009, f924, f940; +sub.f32 f1011, f924, f940; +add.f32 f1145, f956, f972; +sub.f32 f1012, f956, f972; +add.f32 f1013, f932, f948; +sub.f32 f1015, f932, f948; +add.f32 f1144, f964, f980; +sub.f32 f1016, f964, f980; +add.f32 f1017, f925, f941; +sub.f32 f1019, f925, f941; +add.f32 f1143, f957, f973; +sub.f32 f1020, f957, f973; +add.f32 f1021, f933, f949; +sub.f32 f1023, f933, f949; +add.f32 f1142, f965, f981; +sub.f32 f1024, f965, f981; +add.f32 f1025, f926, f942; +sub.f32 f1027, f926, f942; +add.f32 f1141, f958, f974; +sub.f32 f1028, f958, f974; +add.f32 f1029, f934, f950; +sub.f32 f1031, f934, f950; +add.f32 f1140, f966, f982; +sub.f32 f1032, f966, f982; +add.f32 f1033, f927, f943; +sub.f32 f1035, f927, f943; +add.f32 f1139, f959, f975; +sub.f32 f1036, f959, f975; +add.f32 f1037, f935, f951; +sub.f32 f1039, f935, f951; +add.f32 f1138, f967, f983; +sub.f32 f1040, f967, 
f983; +add.f32 f1041, f928, f944; +sub.f32 f1043, f928, f944; +add.f32 f1137, f960, f976; +sub.f32 f1044, f960, f976; +add.f32 f1045, f936, f952; +sub.f32 f1047, f936, f952; +add.f32 f1136, f968, f984; +sub.f32 f1048, f968, f984; +add.f32 %0, f985, f989; +add.f32 %1, f1151, f1150; +add.f32 %3, f1149, f1148; +add.f32 %2, f993, f997; +add.f32 %5, f1147, f1146; +add.f32 %4, f1001, f1005; +add.f32 %7, f1145, f1144; +add.f32 %6, f1009, f1013; +add.f32 %9, f1143, f1142; +add.f32 %8, f1017, f1021; +add.f32 %10, f1025, f1029; +add.f32 %11, f1141, f1140; +add.f32 %12, f1033, f1037; +add.f32 %13, f1139, f1138; +add.f32 %14, f1041, f1045; +add.f32 %15, f1137, f1136; +sub.f32 %16, f987, f992; +add.f32 %17, f988, f991; +sub.f32 %18, f995, f1000; +add.f32 %19, f996, f999; +sub.f32 %20, f1003, f1008; +add.f32 %21, f1004, f1007; +add.f32 %23, f1012, f1015; +sub.f32 %22, f1011, f1016; +add.f32 %25, f1020, f1023; +sub.f32 %24, f1019, f1024; +add.f32 %27, f1028, f1031; +sub.f32 %26, f1027, f1032; +sub.f32 %28, f1035, f1040; +add.f32 %29, f1036, f1039; +sub.f32 %30, f1043, f1048; +add.f32 %31, f1044, f1047; +sub.f32 %32, f985, f989; +sub.f32 %33, f1151, f1150; +sub.f32 %34, f993, f997; +sub.f32 %35, f1149, f1148; +sub.f32 %36, f1001, f1005; +sub.f32 %37, f1147, f1146; +sub.f32 %38, f1009, f1013; +sub.f32 %39, f1145, f1144; +sub.f32 %40, f1017, f1021; +sub.f32 %41, f1143, f1142; +sub.f32 %42, f1025, f1029; +sub.f32 %43, f1141, f1140; +sub.f32 %44, f1033, f1037; +sub.f32 %45, f1139, f1138; +sub.f32 %46, f1041, f1045; +sub.f32 %47, f1137, f1136; +sub.f32 %49, f988, f991; +add.f32 %48, f987, f992; +sub.f32 %51, f996, f999; +add.f32 %50, f995, f1000; +sub.f32 %53, f1004, f1007; +add.f32 %52, f1003, f1008; +sub.f32 %55, f1012, f1015; +add.f32 %54, f1011, f1016; +sub.f32 %57, f1020, f1023; +add.f32 %56, f1019, f1024; +sub.f32 %59, f1028, f1031; +add.f32 %58, f1027, f1032; +sub.f32 %61, f1036, f1039; +add.f32 %60, f1035, f1040; +sub.f32 %63, f1044, f1047; +add.f32 %62, f1043, f1048; +})" + : 
"=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), 
"f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<261, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<139>; +.reg .b32 r<49>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %11, %13; +sub.f32 f10, %12, %14; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 1008; +add.s32 r11, r8, r10; +add.f32 f18, %12, %14; +add.f32 f19, %11, %13; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 504; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+512]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 992; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 
f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 496; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+512]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 960; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 480; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+512]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 56; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 896; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 448; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+512]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 2; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 
768; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 384; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+512]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f115, f117; +mul.f32 f121, f114, f117; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 512; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f116, f114, f120; +sub.f32 f126, f122, f121; +st.shared.v2.f32 [r46+256], {f125, f126}; +barrier.sync 0; +and.b32 r47, r9, 256; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+512]; +add.f32 %1, f128, f132; +add.f32 %0, f127, f131; +sub.f32 %3, f128, f132; +sub.f32 %2, f127, f131; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<262, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<115>; +.reg .b32 r<49>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %11, %13; +add.f32 f10, %12, %14; +sub.f32 f11, %11, %13; +sub.f32 f12, %12, %14; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 
{f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 504; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 252; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+256]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+256]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 496; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 248; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+256]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+256]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 480; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 240; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+256]; 
+barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+256]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 56; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 448; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 224; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+256]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+256]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 2; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, f79, f85; +mul.f32 f87, f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 384; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 192; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+256]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+256]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f97, f99; 
+fma.rn.f32 f103, f98, f96, f102; +mul.f32 f104, f96, f99; +mul.f32 f105, f98, f97; +sub.f32 f106, f105, f104; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 256; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f103; +barrier.sync 0; +and.b32 r47, r11, 128; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+256]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+256]; +add.f32 %0, f107, f108; +add.f32 %1, f109, f110; +sub.f32 %2, f107, f108; +sub.f32 %3, f109, f110; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..d3bd7f7a06b26 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_fwd.hpp.inc @@ -0,0 +1,2537 @@ +#ifndef CUFFTDX_FFT_128_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_128_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<452, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<331>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, 
%35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, 
fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+256]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 960; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+128]; +ld.shared.f64 fd160, [r13+256]; +ld.shared.f64 fd161, [r13+384]; +ld.shared.f64 fd162, 
[r13+512]; +ld.shared.f64 fd163, [r13+640]; +ld.shared.f64 fd164, [r13+768]; +ld.shared.f64 fd165, [r13+896]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+128]; +ld.shared.f64 fd168, [r13+256]; +ld.shared.f64 fd169, [r13+384]; +ld.shared.f64 fd170, [r13+512]; +ld.shared.f64 fd171, [r13+640]; +ld.shared.f64 fd172, [r13+768]; +ld.shared.f64 fd173, [r13+896]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, 
fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 8; +bfe.u32 r15, r5, 3, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+32]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, 
fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 512; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+128]; +ld.shared.f64 fd301, [r21+256]; +ld.shared.f64 fd302, [r21+384]; +ld.shared.f64 fd303, [r21+512]; +ld.shared.f64 fd304, [r21+640]; +ld.shared.f64 fd305, [r21+768]; +ld.shared.f64 fd306, [r21+896]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+128]; +ld.shared.f64 fd309, [r21+256]; +ld.shared.f64 fd310, [r21+384]; +ld.shared.f64 fd311, [r21+512]; +ld.shared.f64 fd312, [r21+640]; +ld.shared.f64 fd313, [r21+768]; +ld.shared.f64 fd314, [r21+896]; +add.f64 %0, fd299, fd303; +add.f64 %1, fd307, fd311; +add.f64 %2, fd300, fd304; +add.f64 %3, fd308, fd312; +add.f64 %4, fd301, fd305; +add.f64 %5, fd309, fd313; +add.f64 %6, fd302, fd306; +add.f64 %7, fd310, fd314; +sub.f64 %8, fd299, fd303; 
+sub.f64 %9, fd307, fd311; +sub.f64 %10, fd300, fd304; +sub.f64 %11, fd308, fd312; +sub.f64 %12, fd301, fd305; +sub.f64 %13, fd309, fd313; +sub.f64 %14, fd302, fd306; +sub.f64 %15, fd310, fd314; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<453, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<688>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd681, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd679, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd678, fd681, fd679; +sub.f64 fd76, fd681, fd679; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd677, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd674, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd672, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd671, fd674, fd672; +sub.f64 fd92, fd674, fd672; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd670, fd84, fd87; +add.f64 fd96, fd84, fd87; 
+mul.f64 fd98, fd670, 0dBFE6A09E667F3BCD; +mul.f64 fd669, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd669, fd98; +mul.f64 fd100, fd670, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd668, fd678, fd671; +sub.f64 fd109, fd678, fd671; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd667, fd677, fd101; +sub.f64 fd113, fd677, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd666, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd665, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd663, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd660, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd659, fd663, fd660; +sub.f64 fd133, fd663, fd660; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd658, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd656, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd654, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd653, fd656, fd654; +sub.f64 fd149, fd656, fd654; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd652, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd652, 0dBFE6A09E667F3BCD; +mul.f64 fd651, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd651, fd155; +mul.f64 fd157, fd652, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 
0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd650, fd659, fd653; +sub.f64 fd166, fd659, fd653; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd649, fd658, fd158; +sub.f64 fd170, fd658, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd648, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd647, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd645, fd167, 0d3FED906BCF328D46; +mul.f64 fd646, fd649, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd645, fd646; +mul.f64 fd182, fd649, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd643, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd644, fd648, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd643, fd644; +mul.f64 fd187, fd648, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd641, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd642, fd647, 0dBFED906BCF328D46; +sub.f64 fd191, fd641, fd642; +mul.f64 fd192, fd647, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd639, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd640, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd639, fd640; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd637, fd177, 0dBFED906BCF328D46; +mul.f64 fd638, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd637, fd638; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd636, fd667, fd183; +sub.f64 fd213, fd667, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd635, fd666, fd188; +sub.f64 fd217, fd666, 
fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd634, fd665, fd193; +sub.f64 fd221, fd665, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd633, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd632, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd631, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd630, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd636; +mul.f64 fd244, fd238, fd636; +mul.f64 fd246, fd239, fd239; +mul.f64 fd629, fd238, fd238; +sub.f64 fd247, fd629, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd635; +mul.f64 fd252, fd247, fd635; +mul.f64 fd627, fd238, fd247; +mul.f64 fd628, fd239, fd249; +sub.f64 fd255, fd627, fd628; +mul.f64 fd626, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd634; +mul.f64 fd260, fd255, fd634; +mul.f64 fd262, fd239, fd257; +mul.f64 fd625, fd238, fd255; +sub.f64 fd263, fd625, fd262; +mul.f64 fd624, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd633; +mul.f64 fd268, fd263, fd633; +mul.f64 fd270, fd239, fd265; +mul.f64 fd623, fd238, fd263; +sub.f64 fd271, fd623, fd270; +mul.f64 fd622, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd632; +mul.f64 fd276, fd271, fd632; +mul.f64 fd620, fd238, fd271; +mul.f64 fd621, fd239, fd273; +sub.f64 fd279, fd620, fd621; +mul.f64 
fd619, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd631; +mul.f64 fd284, fd279, fd631; +mul.f64 fd286, fd239, fd281; +mul.f64 fd618, fd238, fd279; +sub.f64 fd287, fd618, fd286; +mul.f64 fd617, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd630; +mul.f64 fd292, fd287, fd630; +mul.f64 fd294, fd239, fd289; +mul.f64 fd616, fd238, fd287; +sub.f64 fd295, fd616, fd294; +mul.f64 fd615, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd614, fd668, fd650; +sub.f64 fd613, fd106, fd163; +mul.f64 fd298, fd295, fd613; +mul.f64 fd299, fd297, fd614; +mul.f64 fd300, fd295, fd614; +ld.global.v2.f64 {fd301, fd302}, [rd5+128]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd611, fd238, fd301; +mul.f64 fd612, fd239, fd302; +sub.f64 fd310, fd611, fd612; +mul.f64 fd610, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd609, fd238, fd310; +sub.f64 fd318, fd609, fd317; +mul.f64 fd608, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd606, fd238, fd318; +mul.f64 fd607, fd239, fd320; +sub.f64 fd326, fd606, fd607; +mul.f64 fd605, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd603, fd238, fd326; +mul.f64 fd604, fd239, fd328; +sub.f64 fd334, fd603, fd604; +mul.f64 fd602, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd601, fd238, fd334; +sub.f64 fd342, fd601, fd341; +mul.f64 fd600, fd334, fd228; +mul.f64 fd343, fd238, fd336; 
+fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd598, fd238, fd342; +mul.f64 fd599, fd239, fd344; +sub.f64 fd350, fd598, fd599; +mul.f64 fd597, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd596, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 7; +sub.f64 fd685, fd668, fd650; +mul.f64 fd684, fd297, fd685; +mov.u32 r23, %tid.x; +shl.b32 r22, r23, 8; +barrier.sync 0; +and.b32 r11, r22, 1792; +add.s32 r12, r9, r11; +sub.f64 fd687, fd668, fd650; +mul.f64 fd686, fd297, fd687; +add.f64 fd356, fd668, fd650; +sub.f64 fd683, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 7; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd597, fd243; +st.shared.v2.f64 [r12+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd626, fd251; +st.shared.v2.f64 [r12+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd624, fd259; +st.shared.v2.f64 [r12+48], {fd363, fd362}; +sub.f64 fd364, fd622, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r12+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd619, fd275; +st.shared.v2.f64 [r12+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd617, fd283; +st.shared.v2.f64 [r12+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd615, fd291; +st.shared.v2.f64 [r12+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd683, fd300; +sub.f64 fd373, fd298, fd686; +st.shared.v2.f64 [r12+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd610, fd306; +st.shared.v2.f64 [r12+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd608, 
fd314; +st.shared.v2.f64 [r12+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd605, fd322; +st.shared.v2.f64 [r12+176], {fd379, fd378}; +sub.f64 fd380, fd602, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r12+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd600, fd338; +st.shared.v2.f64 [r12+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd596, fd346; +st.shared.v2.f64 [r12+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r12+240], {fd387, fd386}; +barrier.sync 0; +mad.lo.s32 r13, r20, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+128]; +ld.shared.v2.f64 {fd396, fd397}, [r13+256]; +ld.shared.v2.f64 {fd400, fd401}, [r13+384]; +ld.shared.v2.f64 {fd404, fd405}, [r13+512]; +ld.shared.v2.f64 {fd408, fd409}, [r13+640]; +ld.shared.v2.f64 {fd412, fd413}, [r13+768]; +ld.shared.v2.f64 {fd416, fd417}, [r13+896]; +ld.shared.v2.f64 {fd420, fd421}, [r13+1024]; +ld.shared.v2.f64 {fd424, fd425}, [r13+1152]; +ld.shared.v2.f64 {fd428, fd429}, [r13+1280]; +ld.shared.v2.f64 {fd432, fd433}, [r13+1408]; +ld.shared.v2.f64 {fd436, fd437}, [r13+1536]; +ld.shared.v2.f64 {fd440, fd441}, [r13+1664]; +ld.shared.v2.f64 {fd444, fd445}, [r13+1792]; +ld.shared.v2.f64 {fd448, fd449}, [r13+1920]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd595, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd594, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd593, fd595, fd594; +sub.f64 fd463, fd595, fd594; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd592, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd591, fd397, fd429; +sub.f64 fd471, fd397, 
fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd590, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd589, fd591, fd590; +sub.f64 fd479, fd591, fd590; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd588, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd588, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd587, fd485; +mul.f64 fd487, fd588, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd392, fd424; +sub.f64 fd495, fd392, fd424; +add.f64 fd586, fd393, fd425; +sub.f64 fd496, fd393, fd425; +add.f64 fd497, fd408, fd440; +sub.f64 fd499, fd408, fd440; +add.f64 fd585, fd409, fd441; +sub.f64 fd500, fd409, fd441; +add.f64 fd501, fd493, fd497; +sub.f64 fd503, fd493, fd497; +add.f64 fd584, fd586, fd585; +sub.f64 fd504, fd586, fd585; +add.f64 fd505, fd495, fd500; +sub.f64 fd507, fd495, fd500; +sub.f64 fd583, fd496, fd499; +add.f64 fd508, fd496, fd499; +add.f64 fd509, fd400, fd432; +sub.f64 fd511, fd400, fd432; +add.f64 fd582, fd401, fd433; +sub.f64 fd512, fd401, fd433; +add.f64 fd513, fd416, fd448; +sub.f64 fd515, fd416, fd448; +add.f64 fd581, fd417, fd449; +sub.f64 fd516, fd417, fd449; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd580, fd582, fd581; +sub.f64 fd520, fd582, fd581; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd579, fd512, fd515; +add.f64 fd524, fd512, fd515; +mul.f64 fd577, fd521, 0d3FE6A09E667F3BCD; +mul.f64 fd578, fd579, 0dBFE6A09E667F3BCD; +sub.f64 fd527, fd577, fd578; +mul.f64 fd528, fd579, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd529, fd521, 0dBFE6A09E667F3BCD, fd528; +mul.f64 fd530, fd523, 0dBFE6A09E667F3BCD; +mul.f64 fd531, fd524, 0dBFE6A09E667F3BCD; +sub.f64 fd532, 
fd530, fd531; +add.f64 fd533, fd530, fd531; +add.f64 %0, fd460, fd476; +add.f64 %1, fd593, fd589; +add.f64 %2, fd501, fd517; +add.f64 %3, fd584, fd580; +add.f64 %4, fd464, fd486; +add.f64 %5, fd592, fd488; +add.f64 %7, fd583, fd529; +add.f64 %6, fd505, fd527; +sub.f64 %9, fd463, fd478; +add.f64 %8, fd462, fd479; +sub.f64 %11, fd504, fd519; +add.f64 %10, fd503, fd520; +add.f64 %12, fd466, fd491; +add.f64 %13, fd467, fd492; +add.f64 %14, fd507, fd532; +add.f64 %15, fd508, fd533; +sub.f64 %17, fd593, fd589; +sub.f64 %16, fd460, fd476; +sub.f64 %19, fd584, fd580; +sub.f64 %18, fd501, fd517; +sub.f64 %21, fd592, fd488; +sub.f64 %20, fd464, fd486; +sub.f64 %23, fd583, fd529; +sub.f64 %22, fd505, fd527; +add.f64 %25, fd463, fd478; +sub.f64 %24, fd462, fd479; +add.f64 %27, fd504, fd519; +sub.f64 %26, fd503, fd520; +sub.f64 %29, fd467, fd492; +sub.f64 %28, fd466, fd491; +sub.f64 %31, fd508, fd533; +sub.f64 %30, fd507, fd532; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), 
"d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<454, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<534>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, 
fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; 
+sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 
fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, 
fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+128]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, 
fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 896; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+64]; +ld.shared.f64 fd390, [r13+128]; +ld.shared.f64 fd391, [r13+192]; +ld.shared.f64 fd392, [r13+256]; +ld.shared.f64 fd393, [r13+320]; +ld.shared.f64 fd394, [r13+384]; +ld.shared.f64 fd395, [r13+448]; +ld.shared.f64 fd396, [r13+512]; +ld.shared.f64 fd397, [r13+576]; +ld.shared.f64 fd398, [r13+640]; +ld.shared.f64 fd399, [r13+704]; +ld.shared.f64 fd400, [r13+768]; +ld.shared.f64 fd401, [r13+832]; +ld.shared.f64 fd402, [r13+896]; +ld.shared.f64 fd403, [r13+960]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 
[r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+64]; +ld.shared.f64 fd406, [r13+128]; +ld.shared.f64 fd407, [r13+192]; +ld.shared.f64 fd408, [r13+256]; +ld.shared.f64 fd409, [r13+320]; +ld.shared.f64 fd410, [r13+384]; +ld.shared.f64 fd411, [r13+448]; +ld.shared.f64 fd412, [r13+512]; +ld.shared.f64 fd413, [r13+576]; +ld.shared.f64 fd414, [r13+640]; +ld.shared.f64 fd415, [r13+704]; +ld.shared.f64 fd416, [r13+768]; +ld.shared.f64 fd417, [r13+832]; +ld.shared.f64 fd418, [r13+896]; +ld.shared.f64 fd419, [r13+960]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 
0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd389, fd397; +add.f64 fd462, fd405, fd413; +sub.f64 fd463, fd389, fd397; +sub.f64 fd464, fd405, fd413; +add.f64 fd465, fd393, fd401; +add.f64 fd466, fd409, fd417; +sub.f64 fd467, fd393, fd401; +sub.f64 fd468, fd409, fd417; +add.f64 fd469, fd461, fd465; +add.f64 fd470, fd462, fd466; +sub.f64 fd471, fd461, fd465; +sub.f64 fd472, fd462, fd466; +add.f64 fd473, fd463, fd468; +sub.f64 fd474, fd464, fd467; +sub.f64 fd475, fd463, fd468; +add.f64 fd476, fd464, fd467; +add.f64 fd477, fd391, fd399; +add.f64 fd478, fd407, fd415; +sub.f64 fd479, fd391, fd399; +sub.f64 fd480, fd407, fd415; +add.f64 fd481, fd395, fd403; +add.f64 fd482, fd411, fd419; +sub.f64 fd483, fd395, fd403; +sub.f64 fd484, fd411, fd419; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +mul.f64 fd493, fd489, 0d3FE6A09E667F3BCD; +mul.f64 fd494, fd490, 0dBFE6A09E667F3BCD; +sub.f64 fd495, fd493, fd494; +mul.f64 fd496, fd490, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd497, fd489, 0dBFE6A09E667F3BCD, fd496; +mul.f64 fd498, fd491, 0dBFE6A09E667F3BCD; +mul.f64 fd499, fd492, 0dBFE6A09E667F3BCD; +sub.f64 fd500, fd498, fd499; +add.f64 fd501, fd498, fd499; +add.f64 %0, fd428, fd444; +add.f64 %1, fd429, fd445; +add.f64 %2, fd469, fd485; +add.f64 %3, fd470, fd486; +add.f64 %5, fd433, fd456; +add.f64 %4, fd432, fd454; +add.f64 %7, fd474, fd497; +add.f64 %6, fd473, fd495; +sub.f64 %9, fd431, fd446; +add.f64 %8, fd430, fd447; +sub.f64 %11, fd472, fd487; +add.f64 %10, fd471, fd488; +add.f64 %13, fd435, fd460; +add.f64 %12, fd434, fd459; +add.f64 %15, fd476, fd501; +add.f64 %14, fd475, fd500; +sub.f64 %16, fd428, fd444; +sub.f64 %17, fd429, fd445; +sub.f64 %18, fd469, fd485; +sub.f64 %19, fd470, fd486; +sub.f64 %21, fd433, fd456; +sub.f64 %20, fd432, 
fd454; +sub.f64 %23, fd474, fd497; +sub.f64 %22, fd473, fd495; +add.f64 %25, fd431, fd446; +sub.f64 %24, fd430, fd447; +add.f64 %27, fd472, fd487; +sub.f64 %26, fd471, fd488; +sub.f64 %29, fd435, fd460; +sub.f64 %28, fd434, fd459; +sub.f64 %31, fd476, fd501; +sub.f64 %30, fd475, fd500; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<455, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<205>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; 
+add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+512]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 1984; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+512]; +ld.shared.v2.f64 {fd69, fd70}, [r13+1024]; +ld.shared.v2.f64 {fd73, fd74}, [r13+1536]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; 
+and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+128]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1792; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+512]; +ld.shared.v2.f64 {fd129, fd130}, [r20+1024]; +ld.shared.v2.f64 {fd133, fd134}, [r20+1536]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 16; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 
fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+32]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 1024; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+512]; +ld.shared.v2.f64 {fd189, fd190}, [r26+1024]; +ld.shared.v2.f64 {fd193, fd194}, [r26+1536]; +add.f64 %1, fd182, fd190; +add.f64 %0, fd181, fd189; +add.f64 %3, fd186, fd194; +add.f64 %2, fd185, fd193; +sub.f64 %5, fd182, fd190; +sub.f64 %4, fd181, fd189; +sub.f64 %7, fd186, fd194; +sub.f64 %6, fd185, fd193; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<457, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<363>; +.reg .b64 
rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; 
+mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+256]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 1920; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 
[r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+256]; +ld.shared.v2.f64 {fd166, fd167}, [r13+512]; +ld.shared.v2.f64 {fd170, fd171}, [r13+768]; +ld.shared.v2.f64 {fd174, fd175}, [r13+1024]; +ld.shared.v2.f64 {fd178, fd179}, [r13+1280]; +ld.shared.v2.f64 {fd182, fd183}, [r13+1536]; +ld.shared.v2.f64 {fd186, fd187}, [r13+1792]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, 
fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 8; +bfe.u32 r15, r5, 3, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+32]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, 
fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1024; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+256]; +ld.shared.v2.f64 {fd323, fd324}, [r20+512]; +ld.shared.v2.f64 {fd327, fd328}, [r20+768]; +ld.shared.v2.f64 {fd331, fd332}, [r20+1024]; +ld.shared.v2.f64 {fd335, fd336}, [r20+1280]; +ld.shared.v2.f64 {fd339, fd340}, [r20+1536]; +ld.shared.v2.f64 {fd343, fd344}, [r20+1792]; +add.f64 %1, fd316, fd332; +add.f64 %0, fd315, fd331; +add.f64 %3, fd320, fd336; +add.f64 %2, fd319, fd335; +add.f64 %5, fd324, fd340; +add.f64 %4, fd323, fd339; +add.f64 %7, fd328, fd344; +add.f64 %6, fd327, fd343; +sub.f64 %9, fd316, fd332; +sub.f64 %8, fd315, fd331; +sub.f64 %11, fd320, fd336; +sub.f64 %10, fd319, fd335; +sub.f64 %13, fd324, fd340; +sub.f64 %12, fd323, fd339; +sub.f64 %15, fd328, fd344; +sub.f64 %14, fd327, fd343; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), 
"=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<456, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<181>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+512]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; 
+sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 992; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+256]; +ld.shared.f64 fd63, [r13+512]; +ld.shared.f64 fd64, [r13+768]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+256]; +ld.shared.f64 fd67, [r13+512]; +ld.shared.f64 fd68, [r13+768]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+128]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, 
r10, r17; +barrier.sync 0; +and.b32 r19, r8, 896; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+256]; +ld.shared.f64 fd115, [r21+512]; +ld.shared.f64 fd116, [r21+768]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+256]; +ld.shared.f64 fd119, [r21+512]; +ld.shared.f64 fd120, [r21+768]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 16; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+32]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; +and.b32 r23, r16, 120; +add.s32 r24, r10, 
r23; +barrier.sync 0; +and.b32 r25, r8, 512; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 [r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+256]; +ld.shared.f64 fd167, [r27+512]; +ld.shared.f64 fd168, [r27+768]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+256]; +ld.shared.f64 fd171, [r27+512]; +ld.shared.f64 fd172, [r27+768]; +add.f64 %0, fd165, fd167; +add.f64 %1, fd169, fd171; +add.f64 %2, fd166, fd168; +add.f64 %3, fd170, fd172; +sub.f64 %4, fd165, fd167; +sub.f64 %5, fd169, fd171; +sub.f64 %6, fd166, fd168; +sub.f64 %7, fd170, fd172; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<458, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<49>; +.reg .f64 fd<139>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %11, %13; +sub.f64 fd10, %12, %14; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 2016; +add.s32 r11, r8, r10; +add.f64 fd18, 
%12, %14; +add.f64 fd19, %11, %13; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 1008; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+1024]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 1984; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 992; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+1024]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1920; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 960; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+1024]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; 
+mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1792; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 896; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+1024]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 48; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1536; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 768; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+1024]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd116, fd114; +mul.f64 fd121, fd117, fd115; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1024; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd117, fd114, fd122; +sub.f64 fd126, fd120, fd121; +st.shared.v2.f64 [r46+512], {fd126, fd125}; +barrier.sync 0; +and.b32 r47, r9, 512; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 
{fd131, fd132}, [r48+1024]; +add.f64 %1, fd128, fd132; +add.f64 %0, fd127, fd131; +sub.f64 %3, fd128, fd132; +sub.f64 %2, fd127, fd131; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<459, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<49>; +.reg .f64 fd<115>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %11, %13; +add.f64 fd10, %12, %14; +sub.f64 fd11, %11, %13; +sub.f64 fd12, %12, %14; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 1008; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 504; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+512]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+512]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, 
r6, 992; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 496; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+512]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+512]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 960; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 480; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+512]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+512]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 896; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 448; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+512]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 
fd75, [r34]; +ld.shared.f64 fd76, [r34+512]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 48; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 768; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 384; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+512]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+512]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd98, fd96; +mul.f64 fd103, fd99, fd97; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd98, fd97; +fma.rn.f64 fd106, fd99, fd96, fd105; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 512; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd104; +barrier.sync 0; +and.b32 r47, r11, 256; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+512]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+512]; +add.f64 %0, fd107, fd108; +add.f64 %1, fd109, fd110; +sub.f64 %2, fd107, fd108; +sub.f64 %3, fd109, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), 
"l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..a6dac318fd955 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_128_fp64_inv.hpp.inc @@ -0,0 +1,2535 @@ +#ifndef CUFFTDX_FFT_128_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_128_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<623, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<331>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; 
+mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+256]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 
fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 960; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+128]; +ld.shared.f64 fd160, [r13+256]; +ld.shared.f64 fd161, [r13+384]; +ld.shared.f64 fd162, [r13+512]; +ld.shared.f64 fd163, [r13+640]; +ld.shared.f64 fd164, [r13+768]; +ld.shared.f64 fd165, [r13+896]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+128]; +ld.shared.f64 fd168, [r13+256]; +ld.shared.f64 fd169, [r13+384]; +ld.shared.f64 fd170, [r13+512]; +ld.shared.f64 fd171, [r13+640]; +ld.shared.f64 fd172, [r13+768]; +ld.shared.f64 fd173, [r13+896]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 
fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 8; +bfe.u32 r15, r5, 3, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 
fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+32]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 512; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; 
+st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+128]; +ld.shared.f64 fd301, [r21+256]; +ld.shared.f64 fd302, [r21+384]; +ld.shared.f64 fd303, [r21+512]; +ld.shared.f64 fd304, [r21+640]; +ld.shared.f64 fd305, [r21+768]; +ld.shared.f64 fd306, [r21+896]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+128]; +ld.shared.f64 fd309, [r21+256]; +ld.shared.f64 fd310, [r21+384]; +ld.shared.f64 fd311, [r21+512]; +ld.shared.f64 fd312, [r21+640]; +ld.shared.f64 fd313, [r21+768]; +ld.shared.f64 fd314, [r21+896]; +add.f64 %0, fd299, fd303; +add.f64 %1, fd307, fd311; +add.f64 %2, fd300, fd304; +add.f64 %3, fd308, fd312; +add.f64 %4, fd301, fd305; +add.f64 %5, fd309, fd313; +add.f64 %6, fd302, fd306; +add.f64 %7, fd310, fd314; +sub.f64 %8, fd299, fd303; +sub.f64 %9, fd307, fd311; +sub.f64 %10, fd300, fd304; +sub.f64 %11, fd308, fd312; +sub.f64 %12, fd301, fd305; +sub.f64 %13, fd309, fd313; +sub.f64 %14, fd302, fd306; +sub.f64 %15, fd310, fd314; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), 
"d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<624, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<685>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd680, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd678, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd677, fd680, fd678; +sub.f64 fd76, fd680, fd678; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd676, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd673, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd671, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd670, fd673, fd671; +sub.f64 fd92, fd673, fd671; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd669, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd669, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd667, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd668, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd667, fd668; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd666, fd677, fd670; +sub.f64 fd109, fd677, fd670; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd665, fd676, fd100; +sub.f64 fd113, fd676, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd664, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd663, fd80, fd105; +sub.f64 fd121, fd80, 
fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd661, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd658, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd657, fd661, fd658; +sub.f64 fd133, fd661, fd658; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd656, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd654, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd652, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd651, fd654, fd652; +sub.f64 fd149, fd654, fd652; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd650, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd650, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd648, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd649, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd648, fd649; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd647, fd657, fd651; +sub.f64 fd166, fd657, fd651; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd646, fd656, fd157; +sub.f64 fd170, fd656, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd645, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd644, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd642, fd167, 0d3FED906BCF328D46; +mul.f64 fd643, fd646, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd642, fd643; +mul.f64 fd182, fd646, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; 
+mul.f64 fd185, fd645, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd644, 0d3FED906BCF328D46; +mul.f64 fd641, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd641, fd189; +mul.f64 fd191, fd644, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd640, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd640, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd638, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd639, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd638, fd639; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd636, fd177, 0dBFED906BCF328D46; +mul.f64 fd637, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd636, fd637; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd635, fd665, fd183; +sub.f64 fd213, fd665, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd634, fd664, fd187; +sub.f64 fd217, fd664, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd633, fd663, fd192; +sub.f64 fd221, fd663, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd632, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd631, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd630, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd629, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %33; +add.s64 
rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd635, fd239; +mul.f64 fd244, fd238, fd635; +mul.f64 fd246, fd239, fd239; +mul.f64 fd628, fd238, fd238; +sub.f64 fd247, fd628, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd634, fd249; +mul.f64 fd252, fd247, fd634; +mul.f64 fd626, fd238, fd247; +mul.f64 fd627, fd239, fd249; +sub.f64 fd255, fd626, fd627; +mul.f64 fd625, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd633, fd257; +mul.f64 fd260, fd255, fd633; +mul.f64 fd262, fd239, fd257; +mul.f64 fd624, fd238, fd255; +sub.f64 fd263, fd624, fd262; +mul.f64 fd623, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd632, fd265; +mul.f64 fd268, fd263, fd632; +mul.f64 fd270, fd239, fd265; +mul.f64 fd622, fd238, fd263; +sub.f64 fd271, fd622, fd270; +mul.f64 fd621, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd631, fd273; +mul.f64 fd276, fd271, fd631; +mul.f64 fd619, fd238, fd271; +mul.f64 fd620, fd239, fd273; +sub.f64 fd279, fd619, fd620; +mul.f64 fd618, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd630, fd281; +mul.f64 fd284, fd279, fd630; +mul.f64 fd286, fd239, fd281; +mul.f64 fd617, fd238, fd279; +sub.f64 fd287, fd617, fd286; +mul.f64 fd616, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd629, fd289; +mul.f64 fd292, fd287, fd629; +mul.f64 fd294, fd239, fd289; +mul.f64 fd615, fd238, fd287; +sub.f64 fd295, fd615, fd294; +mul.f64 fd614, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd613, fd666, fd647; +mul.f64 fd298, fd613, fd297; +sub.f64 fd612, fd106, fd163; +mul.f64 fd299, fd612, fd297; +mul.f64 fd300, fd295, fd613; +ld.global.v2.f64 {fd301, fd302}, [rd5+128]; +mul.f64 fd305, fd213, 
fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd610, fd238, fd301; +mul.f64 fd611, fd239, fd302; +sub.f64 fd310, fd610, fd611; +mul.f64 fd609, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd608, fd238, fd310; +sub.f64 fd318, fd608, fd317; +mul.f64 fd607, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd605, fd238, fd318; +mul.f64 fd606, fd239, fd320; +sub.f64 fd326, fd605, fd606; +mul.f64 fd604, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd602, fd238, fd326; +mul.f64 fd603, fd239, fd328; +sub.f64 fd334, fd602, fd603; +mul.f64 fd601, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd600, fd238, fd334; +sub.f64 fd342, fd600, fd341; +mul.f64 fd599, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd597, fd238, fd342; +mul.f64 fd598, fd239, fd344; +sub.f64 fd350, fd597, fd598; +mul.f64 fd596, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd595, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 1792; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 7; +sub.f64 fd683, fd666, fd647; +mul.f64 fd682, fd295, fd683; +add.f64 fd356, fd666, fd647; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +sub.f64 fd684, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 7; +mov.u32 r23, 
%tid.x; +and.b32 r22, r23, 7; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd595; +st.shared.v2.f64 [r12+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd625; +st.shared.v2.f64 [r12+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd623; +st.shared.v2.f64 [r12+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd621; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r12+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd618; +st.shared.v2.f64 [r12+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd616; +st.shared.v2.f64 [r12+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd614; +st.shared.v2.f64 [r12+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd684, fd298; +sub.f64 fd373, fd682, fd299; +st.shared.v2.f64 [r12+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd609; +st.shared.v2.f64 [r12+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd607; +st.shared.v2.f64 [r12+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd604; +st.shared.v2.f64 [r12+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd601; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r12+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd599; +st.shared.v2.f64 [r12+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd596; +st.shared.v2.f64 [r12+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r12+240], {fd386, fd387}; +barrier.sync 0; +mad.lo.s32 r13, r22, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+128]; +ld.shared.v2.f64 {fd396, fd397}, [r13+256]; +ld.shared.v2.f64 {fd400, fd401}, [r13+384]; +ld.shared.v2.f64 
{fd404, fd405}, [r13+512]; +ld.shared.v2.f64 {fd408, fd409}, [r13+640]; +ld.shared.v2.f64 {fd412, fd413}, [r13+768]; +ld.shared.v2.f64 {fd416, fd417}, [r13+896]; +ld.shared.v2.f64 {fd420, fd421}, [r13+1024]; +ld.shared.v2.f64 {fd424, fd425}, [r13+1152]; +ld.shared.v2.f64 {fd428, fd429}, [r13+1280]; +ld.shared.v2.f64 {fd432, fd433}, [r13+1408]; +ld.shared.v2.f64 {fd436, fd437}, [r13+1536]; +ld.shared.v2.f64 {fd440, fd441}, [r13+1664]; +ld.shared.v2.f64 {fd444, fd445}, [r13+1792]; +ld.shared.v2.f64 {fd448, fd449}, [r13+1920]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd594, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd593, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd592, fd594, fd593; +sub.f64 fd463, fd594, fd593; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd591, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd590, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd589, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd588, fd590, fd589; +sub.f64 fd479, fd590, fd589; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd587, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd587, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd586, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd586, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd392, fd424; +sub.f64 fd495, fd392, fd424; +add.f64 fd585, fd393, fd425; +sub.f64 fd496, fd393, fd425; +add.f64 fd497, fd408, fd440; 
+sub.f64 fd499, fd408, fd440; +add.f64 fd584, fd409, fd441; +sub.f64 fd500, fd409, fd441; +add.f64 fd501, fd493, fd497; +sub.f64 fd503, fd493, fd497; +add.f64 fd583, fd585, fd584; +sub.f64 fd504, fd585, fd584; +sub.f64 fd505, fd495, fd500; +add.f64 fd507, fd495, fd500; +add.f64 fd582, fd496, fd499; +sub.f64 fd508, fd496, fd499; +add.f64 fd509, fd400, fd432; +sub.f64 fd511, fd400, fd432; +add.f64 fd581, fd401, fd433; +sub.f64 fd512, fd401, fd433; +add.f64 fd513, fd416, fd448; +sub.f64 fd515, fd416, fd448; +add.f64 fd580, fd417, fd449; +sub.f64 fd516, fd417, fd449; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd579, fd581, fd580; +sub.f64 fd520, fd581, fd580; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd578, fd512, fd515; +sub.f64 fd524, fd512, fd515; +mul.f64 fd525, fd521, 0d3FE6A09E667F3BCD; +mul.f64 fd526, fd578, 0d3FE6A09E667F3BCD; +sub.f64 fd527, fd525, fd526; +add.f64 fd528, fd525, fd526; +mul.f64 fd530, fd524, 0d3FE6A09E667F3BCD; +mul.f64 fd577, fd523, 0dBFE6A09E667F3BCD; +sub.f64 fd531, fd577, fd530; +mul.f64 fd532, fd524, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd533, fd523, 0d3FE6A09E667F3BCD, fd532; +add.f64 %0, fd460, fd476; +add.f64 %1, fd592, fd588; +add.f64 %2, fd501, fd517; +add.f64 %3, fd583, fd579; +add.f64 %4, fd464, fd486; +add.f64 %5, fd591, fd487; +add.f64 %7, fd582, fd528; +add.f64 %6, fd505, fd527; +add.f64 %9, fd463, fd478; +sub.f64 %8, fd462, fd479; +add.f64 %11, fd504, fd519; +sub.f64 %10, fd503, fd520; +add.f64 %12, fd466, fd490; +add.f64 %13, fd467, fd492; +add.f64 %14, fd507, fd531; +add.f64 %15, fd508, fd533; +sub.f64 %17, fd592, fd588; +sub.f64 %16, fd460, fd476; +sub.f64 %19, fd583, fd579; +sub.f64 %18, fd501, fd517; +sub.f64 %21, fd591, fd487; +sub.f64 %20, fd464, fd486; +sub.f64 %23, fd582, fd528; +sub.f64 %22, fd505, fd527; +sub.f64 %25, fd463, fd478; +add.f64 %24, fd462, fd479; +sub.f64 %27, fd504, fd519; +add.f64 %26, fd503, fd520; +sub.f64 %29, fd467, fd492; +sub.f64 %28, fd466, fd490; 
+sub.f64 %31, fd508, fd533; +sub.f64 %30, fd507, fd531; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<625, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<534>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, 
fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 
fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, 
fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, 
fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+128]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 
fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; 
+and.b32 r11, r8, 896; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+64]; +ld.shared.f64 fd390, [r13+128]; +ld.shared.f64 fd391, [r13+192]; +ld.shared.f64 fd392, [r13+256]; +ld.shared.f64 fd393, [r13+320]; +ld.shared.f64 fd394, [r13+384]; +ld.shared.f64 fd395, [r13+448]; +ld.shared.f64 fd396, [r13+512]; +ld.shared.f64 fd397, [r13+576]; +ld.shared.f64 fd398, [r13+640]; +ld.shared.f64 fd399, [r13+704]; +ld.shared.f64 fd400, [r13+768]; +ld.shared.f64 fd401, [r13+832]; +ld.shared.f64 fd402, [r13+896]; +ld.shared.f64 fd403, [r13+960]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+64]; +ld.shared.f64 fd406, [r13+128]; +ld.shared.f64 fd407, [r13+192]; +ld.shared.f64 fd408, [r13+256]; +ld.shared.f64 fd409, [r13+320]; +ld.shared.f64 fd410, [r13+384]; +ld.shared.f64 fd411, [r13+448]; +ld.shared.f64 fd412, [r13+512]; +ld.shared.f64 fd413, [r13+576]; +ld.shared.f64 fd414, [r13+640]; +ld.shared.f64 fd415, [r13+704]; +ld.shared.f64 fd416, [r13+768]; +ld.shared.f64 fd417, [r13+832]; +ld.shared.f64 fd418, [r13+896]; +ld.shared.f64 fd419, [r13+960]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; 
+add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd389, fd397; +add.f64 fd462, fd405, fd413; +sub.f64 fd463, fd389, fd397; +sub.f64 fd464, fd405, fd413; +add.f64 fd465, fd393, fd401; +add.f64 fd466, fd409, fd417; +sub.f64 fd467, fd393, fd401; +sub.f64 fd468, fd409, fd417; +add.f64 fd469, fd461, fd465; +add.f64 fd470, fd462, fd466; +sub.f64 fd471, fd461, fd465; +sub.f64 fd472, fd462, fd466; +sub.f64 fd473, fd463, fd468; +add.f64 fd474, fd464, fd467; +add.f64 fd475, fd463, fd468; +sub.f64 fd476, fd464, fd467; +add.f64 fd477, fd391, fd399; +add.f64 fd478, fd407, fd415; +sub.f64 fd479, fd391, fd399; +sub.f64 fd480, fd407, fd415; +add.f64 fd481, fd395, fd403; +add.f64 fd482, fd411, fd419; +sub.f64 fd483, fd395, fd403; +sub.f64 fd484, fd411, fd419; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, 
fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +mul.f64 fd493, fd489, 0d3FE6A09E667F3BCD; +mul.f64 fd494, fd490, 0d3FE6A09E667F3BCD; +sub.f64 fd495, fd493, fd494; +add.f64 fd496, fd493, fd494; +mul.f64 fd497, fd491, 0dBFE6A09E667F3BCD; +mul.f64 fd498, fd492, 0d3FE6A09E667F3BCD; +sub.f64 fd499, fd497, fd498; +mul.f64 fd500, fd492, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd501, fd491, 0d3FE6A09E667F3BCD, fd500; +add.f64 %0, fd428, fd444; +add.f64 %1, fd429, fd445; +add.f64 %2, fd469, fd485; +add.f64 %3, fd470, fd486; +add.f64 %5, fd433, fd455; +add.f64 %4, fd432, fd454; +add.f64 %7, fd474, fd496; +add.f64 %6, fd473, fd495; +add.f64 %9, fd431, fd446; +sub.f64 %8, fd430, fd447; +add.f64 %11, fd472, fd487; +sub.f64 %10, fd471, fd488; +add.f64 %13, fd435, fd460; +add.f64 %12, fd434, fd458; +add.f64 %15, fd476, fd501; +add.f64 %14, fd475, fd499; +sub.f64 %16, fd428, fd444; +sub.f64 %17, fd429, fd445; +sub.f64 %18, fd469, fd485; +sub.f64 %19, fd470, fd486; +sub.f64 %21, fd433, fd455; +sub.f64 %20, fd432, fd454; +sub.f64 %23, fd474, fd496; +sub.f64 %22, fd473, fd495; +sub.f64 %25, fd431, fd446; +add.f64 %24, fd430, fd447; +sub.f64 %27, fd472, fd487; +add.f64 %26, fd471, fd488; +sub.f64 %29, fd435, fd460; +sub.f64 %28, fd434, fd458; +sub.f64 %31, fd476, fd501; +sub.f64 %30, fd475, fd499; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), 
"l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<626, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<205>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+512]; +mul.f64 fd50, 
fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 1984; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+512]; +ld.shared.v2.f64 {fd69, fd70}, [r13+1024]; +ld.shared.v2.f64 {fd73, fd74}, [r13+1536]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+128]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1792; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, 
fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+512]; +ld.shared.v2.f64 {fd129, fd130}, [r20+1024]; +ld.shared.v2.f64 {fd133, fd134}, [r20+1536]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 16; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+32]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 1024; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; +fma.rn.f64 
fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+512]; +ld.shared.v2.f64 {fd189, fd190}, [r26+1024]; +ld.shared.v2.f64 {fd193, fd194}, [r26+1536]; +add.f64 %1, fd182, fd190; +add.f64 %0, fd181, fd189; +add.f64 %3, fd186, fd194; +add.f64 %2, fd185, fd193; +sub.f64 %5, fd182, fd190; +sub.f64 %4, fd181, fd189; +sub.f64 %7, fd186, fd194; +sub.f64 %6, fd185, fd193; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<628, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<363>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; 
+add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+256]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, 
fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 1920; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+256]; +ld.shared.v2.f64 {fd166, fd167}, [r13+512]; +ld.shared.v2.f64 {fd170, fd171}, [r13+768]; +ld.shared.v2.f64 {fd174, fd175}, [r13+1024]; +ld.shared.v2.f64 {fd178, fd179}, [r13+1280]; +ld.shared.v2.f64 {fd182, fd183}, [r13+1536]; +ld.shared.v2.f64 {fd186, fd187}, [r13+1792]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, 
fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 8; +bfe.u32 r15, r5, 3, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, 
fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+32]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1024; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, 
fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+256]; +ld.shared.v2.f64 {fd323, fd324}, [r20+512]; +ld.shared.v2.f64 {fd327, fd328}, [r20+768]; +ld.shared.v2.f64 {fd331, fd332}, [r20+1024]; +ld.shared.v2.f64 {fd335, fd336}, [r20+1280]; +ld.shared.v2.f64 {fd339, fd340}, [r20+1536]; +ld.shared.v2.f64 {fd343, fd344}, [r20+1792]; +add.f64 %1, fd316, fd332; +add.f64 %0, fd315, fd331; +add.f64 %3, fd320, fd336; +add.f64 %2, fd319, fd335; +add.f64 %5, fd324, fd340; +add.f64 %4, fd323, fd339; +add.f64 %7, fd328, fd344; +add.f64 %6, fd327, fd343; +sub.f64 %9, fd316, fd332; +sub.f64 %8, fd315, fd331; +sub.f64 %11, fd320, fd336; +sub.f64 %10, fd319, fd335; +sub.f64 %13, fd324, fd340; +sub.f64 %12, fd323, fd339; +sub.f64 %15, fd328, fd344; +sub.f64 %14, fd327, fd343; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<627, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<181>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; 
+mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+512]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 992; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+256]; +ld.shared.f64 fd63, [r13+512]; +ld.shared.f64 fd64, [r13+768]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+256]; +ld.shared.f64 fd67, [r13+512]; +ld.shared.f64 fd68, [r13+768]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, 
fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 28; +bfe.u32 r15, r5, 2, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+128]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 896; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+256]; +ld.shared.f64 fd115, [r21+512]; +ld.shared.f64 fd116, [r21+768]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+256]; +ld.shared.f64 fd119, [r21+512]; +ld.shared.f64 fd120, [r21+768]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, 
fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 16; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+32]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 512; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+256]; +ld.shared.f64 fd167, [r27+512]; +ld.shared.f64 fd168, [r27+768]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+256]; +ld.shared.f64 fd171, [r27+512]; +ld.shared.f64 fd172, [r27+768]; +add.f64 %0, fd165, fd167; +add.f64 %1, fd169, fd171; +add.f64 %2, fd166, fd168; +add.f64 %3, fd170, fd172; +sub.f64 %4, fd165, fd167; +sub.f64 %5, fd169, fd171; +sub.f64 %6, fd166, fd168; +sub.f64 
%7, fd170, fd172; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<629, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<49>; +.reg .f64 fd<139>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %11, %13; +sub.f64 fd10, %12, %14; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 2016; +add.s32 r11, r8, r10; +add.f64 fd18, %12, %14; +add.f64 fd19, %11, %13; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 1008; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+1024]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 1984; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; 
+st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 992; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+1024]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1920; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 960; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+1024]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1792; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 896; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+1024]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 48; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 
1536; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; +sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 768; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+1024]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd115, fd117; +mul.f64 fd121, fd114, fd117; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1024; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd116, fd114, fd120; +sub.f64 fd126, fd122, fd121; +st.shared.v2.f64 [r46+512], {fd125, fd126}; +barrier.sync 0; +and.b32 r47, r9, 512; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+1024]; +add.f64 %1, fd128, fd132; +add.f64 %0, fd127, fd131; +sub.f64 %3, fd128, fd132; +sub.f64 %2, fd127, fd131; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<630, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<49>; +.reg .f64 fd<115>; +.reg .b64 rd<21>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %11, %13; +add.f64 fd10, %12, %14; +sub.f64 fd11, %11, %13; +sub.f64 fd12, %12, %14; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; 
+and.b64 rd3, rd2, 1008; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 1008; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 504; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+512]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+512]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 5; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 992; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 496; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+512]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+512]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 4; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 960; +add.s32 r25, r23, 
r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 480; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+512]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+512]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 896; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 448; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+512]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+512]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 48; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 768; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 384; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+512]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 
fd93, [r41+512]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 1; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd97, fd99; +fma.rn.f64 fd103, fd98, fd96, fd102; +mul.f64 fd104, fd96, fd99; +mul.f64 fd105, fd98, fd97; +sub.f64 fd106, fd105, fd104; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 512; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd103; +barrier.sync 0; +and.b32 r47, r11, 256; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+512]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+512]; +add.f64 %0, fd107, fd108; +add.f64 %1, fd109, fd110; +sub.f64 %2, fd107, fd108; +sub.f64 %3, fd109, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..ac6cfb98af967 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_fwd.hpp.inc @@ -0,0 +1,4090 @@ +#ifndef CUFFTDX_FFT_1296_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_1296_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<934, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile 
(R"({ +.reg .f32 f<164>; +.reg .b32 r<1606>; +.reg .b64 rd<8>; +mov.u32 r1581, %tid.y; +shl.b32 r1582, r1581, 1; +mov.u32 r1583, %12; +mad.lo.s32 r1584, r1582, 5184, r1583; +mov.u32 r1585, %tid.x; +mov.f32 f140, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1, {low, high}; +} +mov.f32 f142, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; +} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ 
+mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f136, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r180, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +shr.u32 r1586, r1585, 3; +mul.wide.u32 rd2, r1586, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1587, rd3; +mul.lo.s32 r1588, r1587, 216; +sub.s32 
r1589, r1585, r1588; +shl.b32 r1590, r1587, 1; +mad.lo.s32 r1591, r1590, 5184, r1584; +cvt.rn.f32.u32 f155, r1589; +mul.f32 f156, f155, 0f3B9EDD1F; +cos.approx.f32 f29, f156; +sin.approx.f32 f157, f156; +neg.f32 f30, f157; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ +mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 
r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; 
+} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r1592, r1589, 48, r1591; +st.shared.v2.f32 [r1592], {r219, r222}; +st.shared.v2.f32 [r1592+8], {r267, r274}; +st.shared.v2.f32 [r1592+16], {r304, r311}; +st.shared.v2.f32 [r1592+24], {r341, r348}; +st.shared.v2.f32 [r1592+32], {r378, r385}; +st.shared.v2.f32 [r1592+40], {r415, r422}; +barrier.sync 0; +mad.lo.s32 r1593, r1589, -40, r1592; +ld.shared.u32 r451, [r1593]; +ld.shared.u32 r457, [r1593+4]; +ld.shared.u32 r539, [r1593+1728]; +ld.shared.u32 r545, [r1593+1732]; +ld.shared.u32 r448, [r1593+3456]; +ld.shared.u32 r454, [r1593+3460]; +ld.shared.u32 r536, [r1593+5184]; +ld.shared.u32 r542, [r1593+5188]; +ld.shared.u32 r449, [r1593+6912]; +ld.shared.u32 r455, [r1593+6916]; +ld.shared.u32 r537, [r1593+8640]; +ld.shared.u32 r543, [r1593+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; +} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ 
+mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ +sub.f16x2 r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ 
+sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 r661, r450, r538; +} +{ +add.f16x2 r664, r456, r544; +} +{ +sub.f16x2 r667, r450, r538; +} +{ +sub.f16x2 r670, r456, r544; +} +{ +add.f16x2 r673, r474, r635; +} +{ +add.f16x2 r676, r510, r641; +} +{ +sub.f16x2 r679, r474, r635; +} +{ +sub.f16x2 r682, r510, r641; +} +{ +add.f16x2 r685, r492, r651; +} +{ +add.f16x2 r688, r528, r657; +} +{ +sub.f16x2 r691, r492, r651; +} +{ +sub.f16x2 r694, r528, r657; +} +mul.wide.u32 rd4, r1589, -1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1594, rd5; +cvt.rn.f32.u32 f158, r1594; +mul.f32 f159, f158, 0f3CEE4BAE; +cos.approx.f32 f71, f159; +sin.approx.f32 f160, f159; +neg.f32 f72, f160; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r697, {low, high}; +} +mul.lo.s32 r1595, r1594, 6; +sub.s32 r1596, r1589, r1595; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r702, {high, high}; +} +{ +mul.f16x2 r704, r676, r702; +} +{ +neg.f16x2 r707, r704; +} +{ +fma.rn.f16x2 r709, r673, r700, r707; +} +{ +mul.f16x2 r713, r673, r702; +} +{ +fma.rn.f16x2 r716, r676, r700, r713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r722, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r724, {low, high}; +} +{ +mul.f16x2 r725, r722, r724; +} +{ +mul.f16x2 r728, r697, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r731, {high, low}; +} +{ +fma.rn.f16x2 r733, r725, r731, r728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r739, {high, high}; +} +{ +mul.f16x2 r741, r688, r739; +} +{ +neg.f16x2 r744, r741; +} +{ 
+fma.rn.f16x2 r746, r685, r737, r744; +} +{ +mul.f16x2 r750, r685, r739; +} +{ +fma.rn.f16x2 r753, r688, r737, r750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r759, r761; +} +{ +mul.f16x2 r765, r733, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r768, {high, low}; +} +{ +fma.rn.f16x2 r770, r762, r768, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r776, {high, high}; +} +{ +mul.f16x2 r778, r670, r776; +} +{ +neg.f16x2 r781, r778; +} +{ +fma.rn.f16x2 r783, r667, r774, r781; +} +{ +mul.f16x2 r787, r667, r776; +} +{ +fma.rn.f16x2 r790, r670, r774, r787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r798, {low, high}; +} +{ +mul.f16x2 r799, r796, r798; +} +{ +mul.f16x2 r802, r770, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r805, {high, low}; +} +{ +fma.rn.f16x2 r807, r799, r805, r802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r813, {high, high}; +} +{ +mul.f16x2 r815, r682, r813; +} +{ +neg.f16x2 r818, r815; +} +{ +fma.rn.f16x2 r820, r679, r811, r818; +} +{ +mul.f16x2 r824, r679, r813; +} +{ +fma.rn.f16x2 r827, r682, r811, r824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r833, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r833, r835; +} +{ +mul.f16x2 r839, r807, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r842, {high, low}; +} +{ +fma.rn.f16x2 r844, r836, r842, r839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r850, {high, high}; +} +{ +mul.f16x2 r852, r694, r850; +} +{ +neg.f16x2 r855, r852; +} +{ +fma.rn.f16x2 r857, r691, r848, r855; +} +{ +mul.f16x2 r861, r691, r850; +} +{ +fma.rn.f16x2 r864, r694, r848, r861; +} +shl.b32 r1597, r1596, 3; +add.s32 r1598, r1591, r1597; +barrier.sync 0; +mad.lo.s32 r1599, r1594, 288, r1598; +st.shared.u32 [r1599], r661; +st.shared.u32 [r1599+4], r664; +st.shared.u32 [r1599+48], r709; +st.shared.u32 [r1599+52], r716; +st.shared.u32 [r1599+96], r746; +st.shared.u32 [r1599+100], r753; +st.shared.u32 [r1599+144], r783; +st.shared.u32 [r1599+148], r790; +st.shared.u32 [r1599+192], r820; +st.shared.u32 [r1599+196], r827; +st.shared.u32 [r1599+240], r857; +st.shared.u32 [r1599+244], r864; +barrier.sync 0; +ld.shared.u32 r893, [r1593]; +ld.shared.u32 r899, [r1593+4]; +ld.shared.u32 r981, [r1593+1728]; +ld.shared.u32 r987, [r1593+1732]; +ld.shared.u32 r890, [r1593+3456]; +ld.shared.u32 r896, [r1593+3460]; +ld.shared.u32 r978, [r1593+5184]; +ld.shared.u32 r984, [r1593+5188]; +ld.shared.u32 r891, [r1593+6912]; +ld.shared.u32 r897, [r1593+6916]; +ld.shared.u32 r979, [r1593+8640]; +ld.shared.u32 r985, [r1593+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r886, {low, high}; +} +{ +neg.f16x2 r887, r886; +} +{ +add.f16x2 r889, r890, r891; +} +{ +add.f16x2 r892, r893, r889; +} +{ +add.f16x2 r895, r896, r897; +} +{ +add.f16x2 r898, r899, r895; +} +{ +add.f16x2 r901, r890, r891; +} +{ +mul.f16x2 
r904, r901, r885; +} +{ +add.f16x2 r907, r893, r904; +} +{ +sub.f16x2 r910, r896, r897; +} +{ +mul.f16x2 r913, r910, r887; +} +{ +add.f16x2 r916, r907, r913; +} +{ +add.f16x2 r919, r890, r891; +} +{ +mul.f16x2 r922, r919, r885; +} +{ +add.f16x2 r925, r893, r922; +} +{ +sub.f16x2 r928, r896, r897; +} +{ +mul.f16x2 r931, r928, r887; +} +{ +sub.f16x2 r934, r925, r931; +} +{ +add.f16x2 r937, r896, r897; +} +{ +mul.f16x2 r940, r937, r885; +} +{ +add.f16x2 r943, r899, r940; +} +{ +sub.f16x2 r946, r890, r891; +} +{ +mul.f16x2 r949, r946, r887; +} +{ +sub.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, r896, r897; +} +{ +mul.f16x2 r958, r955, r885; +} +{ +add.f16x2 r961, r899, r958; +} +{ +sub.f16x2 r964, r890, r891; +} +{ +mul.f16x2 r967, r964, r887; +} +{ +add.f16x2 r970, r961, r967; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r974, {low, high}; +} +{ +neg.f16x2 r975, r974; +} +{ +add.f16x2 r977, r978, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +add.f16x2 r983, r984, r985; +} +{ +add.f16x2 r986, r987, r983; +} +{ +add.f16x2 r989, r978, r979; +} +{ +mul.f16x2 r992, r989, r973; +} +{ +add.f16x2 r995, r981, r992; +} +{ +sub.f16x2 r998, r984, r985; +} +{ +mul.f16x2 r1001, r998, r975; +} +{ +add.f16x2 r1004, r995, r1001; +} +{ +add.f16x2 r1007, r978, r979; +} +{ +mul.f16x2 r1010, r1007, r973; +} +{ +add.f16x2 r1013, r981, r1010; +} +{ +sub.f16x2 r1016, r984, r985; +} +{ +mul.f16x2 r1019, r1016, r975; +} +{ +sub.f16x2 r1022, r1013, r1019; +} +{ +add.f16x2 r1025, r984, r985; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r987, r1028; +} +{ +sub.f16x2 r1034, r978, r979; +} +{ +mul.f16x2 r1037, r1034, r975; +} +{ +sub.f16x2 r1040, r1031, r1037; +} +{ +add.f16x2 r1043, r984, r985; +} +{ +mul.f16x2 r1046, r1043, r973; +} +{ +add.f16x2 r1049, r987, r1046; +} +{ +sub.f16x2 r1052, r978, r979; +} +{ +mul.f16x2 
r1055, r1052, r975; +} +{ +add.f16x2 r1058, r1049, r1055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1061, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1062, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1063, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1064, {low, high}; +} +{ +mul.f16x2 r1071, r1004, r1061; +} +{ +mul.f16x2 r1074, r1040, r1062; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r1004, r1062; +} +{ +fma.rn.f16x2 r1083, r1040, r1061, r1080; +} +{ +mul.f16x2 r1087, r1022, r1063; +} +{ +mul.f16x2 r1090, r1058, r1064; +} +{ +sub.f16x2 r1093, r1087, r1090; +} +{ +mul.f16x2 r1096, r1022, r1064; +} +{ +fma.rn.f16x2 r1099, r1058, r1063, r1096; +} +{ +add.f16x2 r1103, r892, r980; +} +{ +add.f16x2 r1106, r898, r986; +} +{ +sub.f16x2 r1109, r892, r980; +} +{ +sub.f16x2 r1112, r898, r986; +} +{ +add.f16x2 r1115, r916, r1077; +} +{ +add.f16x2 r1118, r952, r1083; +} +{ +sub.f16x2 r1121, r916, r1077; +} +{ +sub.f16x2 r1124, r952, r1083; +} +{ +add.f16x2 r1127, r934, r1093; +} +{ +add.f16x2 r1130, r970, r1099; +} +{ +sub.f16x2 r1133, r934, r1093; +} +{ +sub.f16x2 r1136, r970, r1099; +} +mul.wide.u32 rd6, r1589, 954437177; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1600, rd7; +cvt.rn.f32.u32 f161, r1600; +mul.f32 f162, f161, 0f3E32B8C2; +cos.approx.f32 f113, f162; +sin.approx.f32 f163, f162; +neg.f32 f114, f163; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f113; +cvt.rn.f16.f32 high, f114; +mov.b32 r1139, {low, high}; +} +mul.lo.s32 r1601, r1600, 36; +sub.s32 r1602, r1589, r1601; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1142, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1144, {high, high}; +} +{ +mul.f16x2 r1146, r1118, r1144; +} +{ +neg.f16x2 r1149, r1146; +} +{ 
+fma.rn.f16x2 r1151, r1115, r1142, r1149; +} +{ +mul.f16x2 r1155, r1115, r1144; +} +{ +fma.rn.f16x2 r1158, r1118, r1142, r1155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1162, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1164, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1166, {low, high}; +} +{ +mul.f16x2 r1167, r1164, r1166; +} +{ +mul.f16x2 r1170, r1139, r1162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1173, {high, low}; +} +{ +fma.rn.f16x2 r1175, r1167, r1173, r1170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1179, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1181, {high, high}; +} +{ +mul.f16x2 r1183, r1130, r1181; +} +{ +neg.f16x2 r1186, r1183; +} +{ +fma.rn.f16x2 r1188, r1127, r1179, r1186; +} +{ +mul.f16x2 r1192, r1127, r1181; +} +{ +fma.rn.f16x2 r1195, r1130, r1179, r1192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1201, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1203, {low, high}; +} +{ +mul.f16x2 r1204, r1201, r1203; +} +{ +mul.f16x2 r1207, r1175, r1199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1210, {high, low}; +} +{ +fma.rn.f16x2 r1212, r1204, r1210, r1207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1216, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1218, {high, high}; +} +{ +mul.f16x2 r1220, r1112, r1218; +} +{ +neg.f16x2 r1223, r1220; +} +{ +fma.rn.f16x2 r1225, r1109, r1216, r1223; +} +{ +mul.f16x2 r1229, r1109, r1218; +} +{ +fma.rn.f16x2 r1232, r1112, r1216, r1229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1236, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1139; +mov.b32 r1238, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1240, {low, high}; +} +{ +mul.f16x2 r1241, r1238, r1240; +} +{ +mul.f16x2 r1244, r1212, r1236; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1247, {high, low}; +} +{ +fma.rn.f16x2 r1249, r1241, r1247, r1244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1249; +mov.b32 r1253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1249; +mov.b32 r1255, {high, high}; +} +{ +mul.f16x2 r1257, r1124, r1255; +} +{ +neg.f16x2 r1260, r1257; +} +{ +fma.rn.f16x2 r1262, r1121, r1253, r1260; +} +{ +mul.f16x2 r1266, r1121, r1255; +} +{ +fma.rn.f16x2 r1269, r1124, r1253, r1266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1277, {low, high}; +} +{ +mul.f16x2 r1278, r1275, r1277; +} +{ +mul.f16x2 r1281, r1249, r1273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1249; +mov.b32 r1284, {high, low}; +} +{ +fma.rn.f16x2 r1286, r1278, r1284, r1281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1286; +mov.b32 r1290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1286; +mov.b32 r1292, {high, high}; +} +{ +mul.f16x2 r1294, r1136, r1292; +} +{ +neg.f16x2 r1297, r1294; +} +{ +fma.rn.f16x2 r1299, r1133, r1290, r1297; +} +{ +mul.f16x2 r1303, r1133, r1292; +} +{ +fma.rn.f16x2 r1306, r1136, r1290, r1303; +} +shl.b32 r1603, r1602, 3; +add.s32 r1604, r1591, r1603; +barrier.sync 0; +mad.lo.s32 r1605, r1600, 1728, r1604; +st.shared.u32 [r1605], r1103; +st.shared.u32 [r1605+4], r1106; +st.shared.u32 [r1605+288], r1151; +st.shared.u32 [r1605+292], r1158; +st.shared.u32 [r1605+576], r1188; +st.shared.u32 [r1605+580], r1195; +st.shared.u32 [r1605+864], r1225; +st.shared.u32 [r1605+868], r1232; 
+st.shared.u32 [r1605+1152], r1262; +st.shared.u32 [r1605+1156], r1269; +st.shared.u32 [r1605+1440], r1299; +st.shared.u32 [r1605+1444], r1306; +barrier.sync 0; +ld.shared.u32 r1335, [r1593]; +ld.shared.u32 r1341, [r1593+4]; +ld.shared.u32 r1423, [r1593+1728]; +ld.shared.u32 r1429, [r1593+1732]; +ld.shared.u32 r1332, [r1593+3456]; +ld.shared.u32 r1338, [r1593+3460]; +ld.shared.u32 r1420, [r1593+5184]; +ld.shared.u32 r1426, [r1593+5188]; +ld.shared.u32 r1333, [r1593+6912]; +ld.shared.u32 r1339, [r1593+6916]; +ld.shared.u32 r1421, [r1593+8640]; +ld.shared.u32 r1427, [r1593+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1328, {low, high}; +} +{ +neg.f16x2 r1329, r1328; +} +{ +add.f16x2 r1331, r1332, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1332, r1333; +} +{ +mul.f16x2 r1346, r1343, r1327; +} +{ +add.f16x2 r1349, r1335, r1346; +} +{ +sub.f16x2 r1352, r1338, r1339; +} +{ +mul.f16x2 r1355, r1352, r1329; +} +{ +add.f16x2 r1358, r1349, r1355; +} +{ +add.f16x2 r1361, r1332, r1333; +} +{ +mul.f16x2 r1364, r1361, r1327; +} +{ +add.f16x2 r1367, r1335, r1364; +} +{ +sub.f16x2 r1370, r1338, r1339; +} +{ +mul.f16x2 r1373, r1370, r1329; +} +{ +sub.f16x2 r1376, r1367, r1373; +} +{ +add.f16x2 r1379, r1338, r1339; +} +{ +mul.f16x2 r1382, r1379, r1327; +} +{ +add.f16x2 r1385, r1341, r1382; +} +{ +sub.f16x2 r1388, r1332, r1333; +} +{ +mul.f16x2 r1391, r1388, r1329; +} +{ +sub.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, r1338, r1339; +} +{ +mul.f16x2 r1400, r1397, r1327; +} +{ +add.f16x2 r1403, r1341, r1400; +} +{ +sub.f16x2 r1406, r1332, r1333; +} +{ +mul.f16x2 r1409, r1406, r1329; +} +{ +add.f16x2 r1412, r1403, r1409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1415, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1416, {low, high}; +} +{ +neg.f16x2 r1417, r1416; +} +{ +add.f16x2 r1419, r1420, r1421; +} +{ +add.f16x2 r1422, r1423, r1419; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1420, r1421; +} +{ +mul.f16x2 r1434, r1431, r1415; +} +{ +add.f16x2 r1437, r1423, r1434; +} +{ +sub.f16x2 r1440, r1426, r1427; +} +{ +mul.f16x2 r1443, r1440, r1417; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +add.f16x2 r1449, r1420, r1421; +} +{ +mul.f16x2 r1452, r1449, r1415; +} +{ +add.f16x2 r1455, r1423, r1452; +} +{ +sub.f16x2 r1458, r1426, r1427; +} +{ +mul.f16x2 r1461, r1458, r1417; +} +{ +sub.f16x2 r1464, r1455, r1461; +} +{ +add.f16x2 r1467, r1426, r1427; +} +{ +mul.f16x2 r1470, r1467, r1415; +} +{ +add.f16x2 r1473, r1429, r1470; +} +{ +sub.f16x2 r1476, r1420, r1421; +} +{ +mul.f16x2 r1479, r1476, r1417; +} +{ +sub.f16x2 r1482, r1473, r1479; +} +{ +add.f16x2 r1485, r1426, r1427; +} +{ +mul.f16x2 r1488, r1485, r1415; +} +{ +add.f16x2 r1491, r1429, r1488; +} +{ +sub.f16x2 r1494, r1420, r1421; +} +{ +mul.f16x2 r1497, r1494, r1417; +} +{ +add.f16x2 r1500, r1491, r1497; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1504, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1505, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1506, {low, high}; +} +{ +mul.f16x2 r1513, r1446, r1503; +} +{ +mul.f16x2 r1516, r1482, r1504; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1446, r1504; +} +{ +fma.rn.f16x2 r1525, r1482, r1503, r1522; +} +{ +mul.f16x2 r1529, r1464, r1505; +} +{ +mul.f16x2 r1532, r1500, r1506; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 
r1538, r1464, r1506; +} +{ +fma.rn.f16x2 r1541, r1500, r1505, r1538; +} +{ +add.f16x2 %0, r1334, r1422; +} +{ +add.f16x2 %1, r1340, r1428; +} +{ +sub.f16x2 %6, r1334, r1422; +} +{ +sub.f16x2 %7, r1340, r1428; +} +{ +add.f16x2 %2, r1358, r1519; +} +{ +add.f16x2 %3, r1394, r1525; +} +{ +sub.f16x2 %8, r1358, r1519; +} +{ +sub.f16x2 %9, r1394, r1525; +} +{ +add.f16x2 %4, r1376, r1535; +} +{ +add.f16x2 %5, r1412, r1541; +} +{ +sub.f16x2 %10, r1376, r1535; +} +{ +sub.f16x2 %11, r1412, r1541; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<935, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<164>; +.reg .b32 r<1604>; +.reg .b64 rd<8>; +mov.u32 r1581, %tid.y; +mov.u32 r1582, %12; +mad.lo.s32 r1583, r1581, 5184, r1582; +mov.u32 r1584, %tid.x; +mov.f32 f140, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1, {low, high}; +} +mov.f32 f142, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; 
+} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f136, 
0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r180, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +shr.u32 r1585, r1584, 3; +mul.wide.u32 rd2, r1585, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1586, rd3; +mul.lo.s32 r1587, r1586, 216; +sub.s32 r1588, r1584, r1587; +mad.lo.s32 r1589, r1586, 5184, r1583; +cvt.rn.f32.u32 f155, r1588; +mul.f32 f156, f155, 0f3B9EDD1F; +cos.approx.f32 f29, f156; +sin.approx.f32 f157, f156; +neg.f32 f30, f157; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ 
+mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; 
+cvt.rn.f16.f32 high, f124; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; +} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r1590, r1588, 24, r1589; +st.shared.v2.f32 [r1590], {r219, r267}; +st.shared.v2.f32 [r1590+8], {r304, r341}; +st.shared.v2.f32 [r1590+16], {r378, r415}; +barrier.sync 0; +mad.lo.s32 r1591, r1588, -20, r1590; +ld.shared.u32 r451, [r1591]; +ld.shared.u32 r539, [r1591+864]; +ld.shared.u32 r448, [r1591+1728]; +ld.shared.u32 r536, [r1591+2592]; +ld.shared.u32 r449, [r1591+3456]; +ld.shared.u32 r537, [r1591+4320]; +barrier.sync 0; +st.shared.v2.f32 [r1590], {r222, r274}; +st.shared.v2.f32 [r1590+8], {r311, r348}; 
+st.shared.v2.f32 [r1590+16], {r385, r422}; +barrier.sync 0; +ld.shared.u32 r457, [r1591]; +ld.shared.u32 r545, [r1591+864]; +ld.shared.u32 r454, [r1591+1728]; +ld.shared.u32 r542, [r1591+2592]; +ld.shared.u32 r455, [r1591+3456]; +ld.shared.u32 r543, [r1591+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; +} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ +mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ 
+sub.f16x2 r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ +sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 r661, r450, r538; +} +{ +add.f16x2 r664, r456, r544; +} +{ +sub.f16x2 r667, r450, r538; +} +{ +sub.f16x2 r670, r456, r544; +} +{ +add.f16x2 r673, r474, r635; +} +{ +add.f16x2 r676, r510, r641; +} +{ +sub.f16x2 r679, r474, r635; +} +{ +sub.f16x2 r682, r510, r641; +} +{ +add.f16x2 r685, r492, r651; +} +{ +add.f16x2 r688, r528, r657; +} +{ +sub.f16x2 r691, r492, r651; +} +{ +sub.f16x2 r694, r528, r657; +} +mul.wide.u32 rd4, r1588, 
-1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1592, rd5; +mul.lo.s32 r1593, r1592, 6; +sub.s32 r1594, r1588, r1593; +shl.b32 r1595, r1594, 2; +add.s32 r1596, r1589, r1595; +cvt.rn.f32.u32 f158, r1592; +mul.f32 f159, f158, 0f3CEE4BAE; +cos.approx.f32 f71, f159; +sin.approx.f32 f160, f159; +neg.f32 f72, f160; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r702, {high, high}; +} +{ +mul.f16x2 r704, r676, r702; +} +{ +neg.f16x2 r707, r704; +} +{ +fma.rn.f16x2 r709, r673, r700, r707; +} +{ +mul.f16x2 r713, r673, r702; +} +{ +fma.rn.f16x2 r716, r676, r700, r713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r722, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r724, {low, high}; +} +{ +mul.f16x2 r725, r722, r724; +} +{ +mul.f16x2 r728, r697, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r731, {high, low}; +} +{ +fma.rn.f16x2 r733, r725, r731, r728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r739, {high, high}; +} +{ +mul.f16x2 r741, r688, r739; +} +{ +neg.f16x2 r744, r741; +} +{ +fma.rn.f16x2 r746, r685, r737, r744; +} +{ +mul.f16x2 r750, r685, r739; +} +{ +fma.rn.f16x2 r753, r688, r737, r750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r759, r761; +} +{ +mul.f16x2 r765, r733, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r733; +mov.b32 r768, {high, low}; +} +{ +fma.rn.f16x2 r770, r762, r768, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r776, {high, high}; +} +{ +mul.f16x2 r778, r670, r776; +} +{ +neg.f16x2 r781, r778; +} +{ +fma.rn.f16x2 r783, r667, r774, r781; +} +{ +mul.f16x2 r787, r667, r776; +} +{ +fma.rn.f16x2 r790, r670, r774, r787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r798, {low, high}; +} +{ +mul.f16x2 r799, r796, r798; +} +{ +mul.f16x2 r802, r770, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r805, {high, low}; +} +{ +fma.rn.f16x2 r807, r799, r805, r802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r813, {high, high}; +} +{ +mul.f16x2 r815, r682, r813; +} +{ +neg.f16x2 r818, r815; +} +{ +fma.rn.f16x2 r820, r679, r811, r818; +} +{ +mul.f16x2 r824, r679, r813; +} +{ +fma.rn.f16x2 r827, r682, r811, r824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r833, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r833, r835; +} +{ +mul.f16x2 r839, r807, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r842, {high, low}; +} +{ +fma.rn.f16x2 r844, r836, r842, r839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r850, {high, high}; +} +{ +mul.f16x2 r852, r694, r850; +} +{ +neg.f16x2 r855, r852; +} +{ 
+fma.rn.f16x2 r857, r691, r848, r855; +} +{ +mul.f16x2 r861, r691, r850; +} +{ +fma.rn.f16x2 r864, r694, r848, r861; +} +barrier.sync 0; +mad.lo.s32 r1597, r1592, 144, r1596; +st.shared.u32 [r1597], r661; +st.shared.u32 [r1597+24], r709; +st.shared.u32 [r1597+48], r746; +st.shared.u32 [r1597+72], r783; +st.shared.u32 [r1597+96], r820; +st.shared.u32 [r1597+120], r857; +barrier.sync 0; +ld.shared.u32 r893, [r1591]; +ld.shared.u32 r981, [r1591+864]; +ld.shared.u32 r890, [r1591+1728]; +ld.shared.u32 r978, [r1591+2592]; +ld.shared.u32 r891, [r1591+3456]; +ld.shared.u32 r979, [r1591+4320]; +barrier.sync 0; +st.shared.u32 [r1597], r664; +st.shared.u32 [r1597+24], r716; +st.shared.u32 [r1597+48], r753; +st.shared.u32 [r1597+72], r790; +st.shared.u32 [r1597+96], r827; +st.shared.u32 [r1597+120], r864; +barrier.sync 0; +ld.shared.u32 r899, [r1591]; +ld.shared.u32 r987, [r1591+864]; +ld.shared.u32 r896, [r1591+1728]; +ld.shared.u32 r984, [r1591+2592]; +ld.shared.u32 r897, [r1591+3456]; +ld.shared.u32 r985, [r1591+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r886, {low, high}; +} +{ +neg.f16x2 r887, r886; +} +{ +add.f16x2 r889, r890, r891; +} +{ +add.f16x2 r892, r893, r889; +} +{ +add.f16x2 r895, r896, r897; +} +{ +add.f16x2 r898, r899, r895; +} +{ +add.f16x2 r901, r890, r891; +} +{ +mul.f16x2 r904, r901, r885; +} +{ +add.f16x2 r907, r893, r904; +} +{ +sub.f16x2 r910, r896, r897; +} +{ +mul.f16x2 r913, r910, r887; +} +{ +add.f16x2 r916, r907, r913; +} +{ +add.f16x2 r919, r890, r891; +} +{ +mul.f16x2 r922, r919, r885; +} +{ +add.f16x2 r925, r893, r922; +} +{ +sub.f16x2 r928, r896, r897; +} +{ +mul.f16x2 r931, r928, r887; +} +{ +sub.f16x2 r934, r925, r931; +} +{ +add.f16x2 r937, r896, r897; +} +{ +mul.f16x2 r940, r937, r885; +} +{ +add.f16x2 r943, r899, r940; +} +{ +sub.f16x2 r946, r890, r891; +} +{ +mul.f16x2 r949, 
r946, r887; +} +{ +sub.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, r896, r897; +} +{ +mul.f16x2 r958, r955, r885; +} +{ +add.f16x2 r961, r899, r958; +} +{ +sub.f16x2 r964, r890, r891; +} +{ +mul.f16x2 r967, r964, r887; +} +{ +add.f16x2 r970, r961, r967; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r974, {low, high}; +} +{ +neg.f16x2 r975, r974; +} +{ +add.f16x2 r977, r978, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +add.f16x2 r983, r984, r985; +} +{ +add.f16x2 r986, r987, r983; +} +{ +add.f16x2 r989, r978, r979; +} +{ +mul.f16x2 r992, r989, r973; +} +{ +add.f16x2 r995, r981, r992; +} +{ +sub.f16x2 r998, r984, r985; +} +{ +mul.f16x2 r1001, r998, r975; +} +{ +add.f16x2 r1004, r995, r1001; +} +{ +add.f16x2 r1007, r978, r979; +} +{ +mul.f16x2 r1010, r1007, r973; +} +{ +add.f16x2 r1013, r981, r1010; +} +{ +sub.f16x2 r1016, r984, r985; +} +{ +mul.f16x2 r1019, r1016, r975; +} +{ +sub.f16x2 r1022, r1013, r1019; +} +{ +add.f16x2 r1025, r984, r985; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r987, r1028; +} +{ +sub.f16x2 r1034, r978, r979; +} +{ +mul.f16x2 r1037, r1034, r975; +} +{ +sub.f16x2 r1040, r1031, r1037; +} +{ +add.f16x2 r1043, r984, r985; +} +{ +mul.f16x2 r1046, r1043, r973; +} +{ +add.f16x2 r1049, r987, r1046; +} +{ +sub.f16x2 r1052, r978, r979; +} +{ +mul.f16x2 r1055, r1052, r975; +} +{ +add.f16x2 r1058, r1049, r1055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1061, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1062, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1063, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1064, {low, high}; +} +{ +mul.f16x2 r1071, 
r1004, r1061; +} +{ +mul.f16x2 r1074, r1040, r1062; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r1004, r1062; +} +{ +fma.rn.f16x2 r1083, r1040, r1061, r1080; +} +{ +mul.f16x2 r1087, r1022, r1063; +} +{ +mul.f16x2 r1090, r1058, r1064; +} +{ +sub.f16x2 r1093, r1087, r1090; +} +{ +mul.f16x2 r1096, r1022, r1064; +} +{ +fma.rn.f16x2 r1099, r1058, r1063, r1096; +} +{ +add.f16x2 r1103, r892, r980; +} +{ +add.f16x2 r1106, r898, r986; +} +{ +sub.f16x2 r1109, r892, r980; +} +{ +sub.f16x2 r1112, r898, r986; +} +{ +add.f16x2 r1115, r916, r1077; +} +{ +add.f16x2 r1118, r952, r1083; +} +{ +sub.f16x2 r1121, r916, r1077; +} +{ +sub.f16x2 r1124, r952, r1083; +} +{ +add.f16x2 r1127, r934, r1093; +} +{ +add.f16x2 r1130, r970, r1099; +} +{ +sub.f16x2 r1133, r934, r1093; +} +{ +sub.f16x2 r1136, r970, r1099; +} +mul.wide.u32 rd6, r1588, 954437177; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1598, rd7; +mul.lo.s32 r1599, r1598, 36; +sub.s32 r1600, r1588, r1599; +shl.b32 r1601, r1600, 2; +add.s32 r1602, r1589, r1601; +cvt.rn.f32.u32 f161, r1598; +mul.f32 f162, f161, 0f3E32B8C2; +cos.approx.f32 f113, f162; +sin.approx.f32 f163, f162; +neg.f32 f114, f163; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f113; +cvt.rn.f16.f32 high, f114; +mov.b32 r1139, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1142, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1144, {high, high}; +} +{ +mul.f16x2 r1146, r1118, r1144; +} +{ +neg.f16x2 r1149, r1146; +} +{ +fma.rn.f16x2 r1151, r1115, r1142, r1149; +} +{ +mul.f16x2 r1155, r1115, r1144; +} +{ +fma.rn.f16x2 r1158, r1118, r1142, r1155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1162, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1164, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1166, {low, high}; +} +{ +mul.f16x2 r1167, r1164, r1166; +} +{ +mul.f16x2 r1170, r1139, 
r1162; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1173, {high, low}; +} +{ +fma.rn.f16x2 r1175, r1167, r1173, r1170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1179, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1181, {high, high}; +} +{ +mul.f16x2 r1183, r1130, r1181; +} +{ +neg.f16x2 r1186, r1183; +} +{ +fma.rn.f16x2 r1188, r1127, r1179, r1186; +} +{ +mul.f16x2 r1192, r1127, r1181; +} +{ +fma.rn.f16x2 r1195, r1130, r1179, r1192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1201, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1203, {low, high}; +} +{ +mul.f16x2 r1204, r1201, r1203; +} +{ +mul.f16x2 r1207, r1175, r1199; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1175; +mov.b32 r1210, {high, low}; +} +{ +fma.rn.f16x2 r1212, r1204, r1210, r1207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1216, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1218, {high, high}; +} +{ +mul.f16x2 r1220, r1112, r1218; +} +{ +neg.f16x2 r1223, r1220; +} +{ +fma.rn.f16x2 r1225, r1109, r1216, r1223; +} +{ +mul.f16x2 r1229, r1109, r1218; +} +{ +fma.rn.f16x2 r1232, r1112, r1216, r1229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1236, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1238, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1240, {low, high}; +} +{ +mul.f16x2 r1241, r1238, r1240; +} +{ +mul.f16x2 r1244, r1212, r1236; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1212; +mov.b32 r1247, {high, low}; +} +{ +fma.rn.f16x2 r1249, r1241, r1247, r1244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1249; +mov.b32 r1253, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1249; +mov.b32 r1255, {high, high}; +} +{ +mul.f16x2 r1257, r1124, r1255; +} +{ +neg.f16x2 r1260, r1257; +} +{ +fma.rn.f16x2 r1262, r1121, r1253, r1260; +} +{ +mul.f16x2 r1266, r1121, r1255; +} +{ +fma.rn.f16x2 r1269, r1124, r1253, r1266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1139; +mov.b32 r1275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1277, {low, high}; +} +{ +mul.f16x2 r1278, r1275, r1277; +} +{ +mul.f16x2 r1281, r1249, r1273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1249; +mov.b32 r1284, {high, low}; +} +{ +fma.rn.f16x2 r1286, r1278, r1284, r1281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1286; +mov.b32 r1290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1286; +mov.b32 r1292, {high, high}; +} +{ +mul.f16x2 r1294, r1136, r1292; +} +{ +neg.f16x2 r1297, r1294; +} +{ +fma.rn.f16x2 r1299, r1133, r1290, r1297; +} +{ +mul.f16x2 r1303, r1133, r1292; +} +{ +fma.rn.f16x2 r1306, r1136, r1290, r1303; +} +barrier.sync 0; +mad.lo.s32 r1603, r1598, 864, r1602; +st.shared.u32 [r1603], r1103; +st.shared.u32 [r1603+144], r1151; +st.shared.u32 [r1603+288], r1188; +st.shared.u32 [r1603+432], r1225; +st.shared.u32 [r1603+576], r1262; +st.shared.u32 [r1603+720], r1299; +barrier.sync 0; +ld.shared.u32 r1335, [r1591]; +ld.shared.u32 r1423, [r1591+864]; +ld.shared.u32 r1332, [r1591+1728]; +ld.shared.u32 r1420, [r1591+2592]; +ld.shared.u32 r1333, [r1591+3456]; +ld.shared.u32 r1421, [r1591+4320]; +barrier.sync 0; +st.shared.u32 [r1603], r1106; +st.shared.u32 [r1603+144], r1158; +st.shared.u32 [r1603+288], r1195; +st.shared.u32 [r1603+432], r1232; +st.shared.u32 [r1603+576], r1269; +st.shared.u32 [r1603+720], r1306; +barrier.sync 0; +ld.shared.u32 r1341, [r1591]; +ld.shared.u32 r1429, [r1591+864]; +ld.shared.u32 r1338, [r1591+1728]; +ld.shared.u32 r1426, [r1591+2592]; 
+ld.shared.u32 r1339, [r1591+3456]; +ld.shared.u32 r1427, [r1591+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1328, {low, high}; +} +{ +neg.f16x2 r1329, r1328; +} +{ +add.f16x2 r1331, r1332, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1332, r1333; +} +{ +mul.f16x2 r1346, r1343, r1327; +} +{ +add.f16x2 r1349, r1335, r1346; +} +{ +sub.f16x2 r1352, r1338, r1339; +} +{ +mul.f16x2 r1355, r1352, r1329; +} +{ +add.f16x2 r1358, r1349, r1355; +} +{ +add.f16x2 r1361, r1332, r1333; +} +{ +mul.f16x2 r1364, r1361, r1327; +} +{ +add.f16x2 r1367, r1335, r1364; +} +{ +sub.f16x2 r1370, r1338, r1339; +} +{ +mul.f16x2 r1373, r1370, r1329; +} +{ +sub.f16x2 r1376, r1367, r1373; +} +{ +add.f16x2 r1379, r1338, r1339; +} +{ +mul.f16x2 r1382, r1379, r1327; +} +{ +add.f16x2 r1385, r1341, r1382; +} +{ +sub.f16x2 r1388, r1332, r1333; +} +{ +mul.f16x2 r1391, r1388, r1329; +} +{ +sub.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, r1338, r1339; +} +{ +mul.f16x2 r1400, r1397, r1327; +} +{ +add.f16x2 r1403, r1341, r1400; +} +{ +sub.f16x2 r1406, r1332, r1333; +} +{ +mul.f16x2 r1409, r1406, r1329; +} +{ +add.f16x2 r1412, r1403, r1409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1415, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1416, {low, high}; +} +{ +neg.f16x2 r1417, r1416; +} +{ +add.f16x2 r1419, r1420, r1421; +} +{ +add.f16x2 r1422, r1423, r1419; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1420, r1421; +} +{ +mul.f16x2 r1434, r1431, r1415; +} +{ +add.f16x2 r1437, r1423, r1434; +} +{ +sub.f16x2 r1440, r1426, r1427; +} +{ +mul.f16x2 r1443, r1440, r1417; +} +{ 
+add.f16x2 r1446, r1437, r1443; +} +{ +add.f16x2 r1449, r1420, r1421; +} +{ +mul.f16x2 r1452, r1449, r1415; +} +{ +add.f16x2 r1455, r1423, r1452; +} +{ +sub.f16x2 r1458, r1426, r1427; +} +{ +mul.f16x2 r1461, r1458, r1417; +} +{ +sub.f16x2 r1464, r1455, r1461; +} +{ +add.f16x2 r1467, r1426, r1427; +} +{ +mul.f16x2 r1470, r1467, r1415; +} +{ +add.f16x2 r1473, r1429, r1470; +} +{ +sub.f16x2 r1476, r1420, r1421; +} +{ +mul.f16x2 r1479, r1476, r1417; +} +{ +sub.f16x2 r1482, r1473, r1479; +} +{ +add.f16x2 r1485, r1426, r1427; +} +{ +mul.f16x2 r1488, r1485, r1415; +} +{ +add.f16x2 r1491, r1429, r1488; +} +{ +sub.f16x2 r1494, r1420, r1421; +} +{ +mul.f16x2 r1497, r1494, r1417; +} +{ +add.f16x2 r1500, r1491, r1497; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1504, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1505, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1506, {low, high}; +} +{ +mul.f16x2 r1513, r1446, r1503; +} +{ +mul.f16x2 r1516, r1482, r1504; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1446, r1504; +} +{ +fma.rn.f16x2 r1525, r1482, r1503, r1522; +} +{ +mul.f16x2 r1529, r1464, r1505; +} +{ +mul.f16x2 r1532, r1500, r1506; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1464, r1506; +} +{ +fma.rn.f16x2 r1541, r1500, r1505, r1538; +} +{ +add.f16x2 %0, r1334, r1422; +} +{ +add.f16x2 %1, r1340, r1428; +} +{ +sub.f16x2 %6, r1334, r1422; +} +{ +sub.f16x2 %7, r1340, r1428; +} +{ +add.f16x2 %2, r1358, r1519; +} +{ +add.f16x2 %3, r1394, r1525; +} +{ +sub.f16x2 %8, r1358, r1519; +} +{ +sub.f16x2 %9, r1394, r1525; +} +{ +add.f16x2 %4, r1376, r1535; +} +{ +add.f16x2 %5, r1412, r1541; +} +{ +sub.f16x2 %10, r1376, r1535; +} +{ +sub.f16x2 %11, r1412, r1541; +} +})" + : 
"=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..8aa93c9b37cf6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp16_inv.hpp.inc @@ -0,0 +1,4044 @@ +#ifndef CUFFTDX_FFT_1296_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_1296_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1136, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<164>; +.reg .b32 r<1590>; +.reg .b64 rd<8>; +mov.u32 r1565, %tid.y; +shl.b32 r1566, r1565, 1; +mov.u32 r1567, %12; +mad.lo.s32 r1568, r1566, 5184, r1567; +mov.u32 r1569, %tid.x; +mov.f32 f140, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1, {low, high}; +} +mov.f32 f134, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r2, {low, high}; +} +{ 
+add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %14, r72; +} +{ +sub.f16x2 r78, %17, %21; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 r128, %20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, 
r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f136, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r173, {low, high}; +} +mov.f32 f142, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r176, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +shr.u32 r1570, r1569, 3; +mul.wide.u32 rd2, r1570, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1571, rd3; +mul.lo.s32 r1572, r1571, 216; +sub.s32 r1573, r1569, r1572; +shl.b32 r1574, r1571, 1; +mad.lo.s32 r1575, r1574, 5184, r1568; +cvt.rn.f32.u32 f155, r1573; +mul.f32 f156, f155, 0f3B9EDD1F; +cos.approx.f32 f29, f156; +sin.approx.f32 f157, f156; +neg.f32 f30, f157; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ 
+mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; +} +barrier.sync 0; +mad.lo.s32 r1576, r1573, 48, r1575; +st.shared.v2.f32 [r1576], {r215, r218}; +st.shared.v2.f32 [r1576+8], {r261, r270}; +st.shared.v2.f32 [r1576+16], {r298, r307}; +st.shared.v2.f32 [r1576+24], {r335, r344}; +st.shared.v2.f32 [r1576+32], {r372, r381}; +st.shared.v2.f32 [r1576+40], {r409, r418}; +barrier.sync 0; +mad.lo.s32 r1577, r1573, -40, r1576; +ld.shared.u32 r445, [r1577]; +ld.shared.u32 r451, [r1577+4]; +ld.shared.u32 r531, 
[r1577+1728]; +ld.shared.u32 r537, [r1577+1732]; +ld.shared.u32 r442, [r1577+3456]; +ld.shared.u32 r448, [r1577+3460]; +ld.shared.u32 r528, [r1577+5184]; +ld.shared.u32 r534, [r1577+5188]; +ld.shared.u32 r443, [r1577+6912]; +ld.shared.u32 r449, [r1577+6916]; +ld.shared.u32 r529, [r1577+8640]; +ld.shared.u32 r535, [r1577+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{ +add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} 
+{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; +} +{ +add.f16x2 r653, r444, r530; +} +{ +add.f16x2 r656, r450, r536; +} +{ +sub.f16x2 r659, r444, r530; +} +{ +sub.f16x2 r662, r450, r536; +} +{ +add.f16x2 r665, r468, r627; +} +{ +add.f16x2 r668, r504, r633; +} +{ +sub.f16x2 r671, r468, r627; +} +{ +sub.f16x2 r674, r504, r633; +} +{ +add.f16x2 r677, r486, r643; +} +{ +add.f16x2 r680, r522, r649; +} +{ +sub.f16x2 r683, r486, r643; +} +{ +sub.f16x2 r686, r522, r649; +} +mul.wide.u32 rd4, r1573, 
-1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1578, rd5; +cvt.rn.f32.u32 f158, r1578; +mul.f32 f159, f158, 0f3CEE4BAE; +cos.approx.f32 f71, f159; +sin.approx.f32 f160, f159; +neg.f32 f72, f160; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r689, {low, high}; +} +mul.lo.s32 r1579, r1578, 6; +sub.s32 r1580, r1573, r1579; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r694, {high, high}; +} +{ +mul.f16x2 r696, r668, r694; +} +{ +fma.rn.f16x2 r699, r665, r692, r696; +} +{ +mul.f16x2 r703, r665, r694; +} +{ +neg.f16x2 r706, r703; +} +{ +fma.rn.f16x2 r708, r668, r692, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r712, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r714, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r714, r716; +} +{ +mul.f16x2 r720, r689, r712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r723, {high, low}; +} +{ +fma.rn.f16x2 r725, r717, r723, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r731, {high, high}; +} +{ +mul.f16x2 r733, r680, r731; +} +{ +fma.rn.f16x2 r736, r677, r729, r733; +} +{ +mul.f16x2 r740, r677, r731; +} +{ +neg.f16x2 r743, r740; +} +{ +fma.rn.f16x2 r745, r680, r729, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r749, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r751, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r753, {low, high}; +} +{ +mul.f16x2 r754, r751, r753; +} +{ +mul.f16x2 r757, r725, r749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r760, {high, low}; +} +{ 
+fma.rn.f16x2 r762, r754, r760, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r768, {high, high}; +} +{ +mul.f16x2 r770, r662, r768; +} +{ +fma.rn.f16x2 r773, r659, r766, r770; +} +{ +mul.f16x2 r777, r659, r768; +} +{ +neg.f16x2 r780, r777; +} +{ +fma.rn.f16x2 r782, r662, r766, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r788, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r790, {low, high}; +} +{ +mul.f16x2 r791, r788, r790; +} +{ +mul.f16x2 r794, r762, r786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r797, {high, low}; +} +{ +fma.rn.f16x2 r799, r791, r797, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r805, {high, high}; +} +{ +mul.f16x2 r807, r674, r805; +} +{ +fma.rn.f16x2 r810, r671, r803, r807; +} +{ +mul.f16x2 r814, r671, r805; +} +{ +neg.f16x2 r817, r814; +} +{ +fma.rn.f16x2 r819, r674, r803, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r825, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r827, {low, high}; +} +{ +mul.f16x2 r828, r825, r827; +} +{ +mul.f16x2 r831, r799, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r834, {high, low}; +} +{ +fma.rn.f16x2 r836, r828, r834, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r842, {high, high}; +} +{ +mul.f16x2 r844, r686, r842; +} +{ +fma.rn.f16x2 r847, r683, r840, r844; +} +{ +mul.f16x2 r851, r683, r842; +} +{ 
+neg.f16x2 r854, r851; +} +{ +fma.rn.f16x2 r856, r686, r840, r854; +} +shl.b32 r1581, r1580, 3; +add.s32 r1582, r1575, r1581; +barrier.sync 0; +mad.lo.s32 r1583, r1578, 288, r1582; +st.shared.u32 [r1583], r653; +st.shared.u32 [r1583+4], r656; +st.shared.u32 [r1583+48], r699; +st.shared.u32 [r1583+52], r708; +st.shared.u32 [r1583+96], r736; +st.shared.u32 [r1583+100], r745; +st.shared.u32 [r1583+144], r773; +st.shared.u32 [r1583+148], r782; +st.shared.u32 [r1583+192], r810; +st.shared.u32 [r1583+196], r819; +st.shared.u32 [r1583+240], r847; +st.shared.u32 [r1583+244], r856; +barrier.sync 0; +ld.shared.u32 r883, [r1577]; +ld.shared.u32 r889, [r1577+4]; +ld.shared.u32 r969, [r1577+1728]; +ld.shared.u32 r975, [r1577+1732]; +ld.shared.u32 r880, [r1577+3456]; +ld.shared.u32 r886, [r1577+3460]; +ld.shared.u32 r966, [r1577+5184]; +ld.shared.u32 r972, [r1577+5188]; +ld.shared.u32 r881, [r1577+6912]; +ld.shared.u32 r887, [r1577+6916]; +ld.shared.u32 r967, [r1577+8640]; +ld.shared.u32 r973, [r1577+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r878, {low, high}; +} +{ +add.f16x2 r879, r880, r881; +} +{ +add.f16x2 r882, r883, r879; +} +{ +add.f16x2 r885, r886, r887; +} +{ +add.f16x2 r888, r889, r885; +} +{ +add.f16x2 r891, r880, r881; +} +{ +mul.f16x2 r894, r891, r877; +} +{ +add.f16x2 r897, r883, r894; +} +{ +sub.f16x2 r900, r886, r887; +} +{ +mul.f16x2 r903, r900, r878; +} +{ +add.f16x2 r906, r897, r903; +} +{ +add.f16x2 r909, r880, r881; +} +{ +mul.f16x2 r912, r909, r877; +} +{ +add.f16x2 r915, r883, r912; +} +{ +sub.f16x2 r918, r886, r887; +} +{ +mul.f16x2 r921, r918, r878; +} +{ +sub.f16x2 r924, r915, r921; +} +{ +add.f16x2 r927, r886, r887; +} +{ +mul.f16x2 r930, r927, r877; +} +{ +add.f16x2 r933, r889, r930; +} +{ +sub.f16x2 r936, r880, r881; +} +{ +mul.f16x2 r939, r936, r878; +} +{ +sub.f16x2 r942, r933, r939; 
+} +{ +add.f16x2 r945, r886, r887; +} +{ +mul.f16x2 r948, r945, r877; +} +{ +add.f16x2 r951, r889, r948; +} +{ +sub.f16x2 r954, r880, r881; +} +{ +mul.f16x2 r957, r954, r878; +} +{ +add.f16x2 r960, r951, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r964, {low, high}; +} +{ +add.f16x2 r965, r966, r967; +} +{ +add.f16x2 r968, r969, r965; +} +{ +add.f16x2 r971, r972, r973; +} +{ +add.f16x2 r974, r975, r971; +} +{ +add.f16x2 r977, r966, r967; +} +{ +mul.f16x2 r980, r977, r963; +} +{ +add.f16x2 r983, r969, r980; +} +{ +sub.f16x2 r986, r972, r973; +} +{ +mul.f16x2 r989, r986, r964; +} +{ +add.f16x2 r992, r983, r989; +} +{ +add.f16x2 r995, r966, r967; +} +{ +mul.f16x2 r998, r995, r963; +} +{ +add.f16x2 r1001, r969, r998; +} +{ +sub.f16x2 r1004, r972, r973; +} +{ +mul.f16x2 r1007, r1004, r964; +} +{ +sub.f16x2 r1010, r1001, r1007; +} +{ +add.f16x2 r1013, r972, r973; +} +{ +mul.f16x2 r1016, r1013, r963; +} +{ +add.f16x2 r1019, r975, r1016; +} +{ +sub.f16x2 r1022, r966, r967; +} +{ +mul.f16x2 r1025, r1022, r964; +} +{ +sub.f16x2 r1028, r1019, r1025; +} +{ +add.f16x2 r1031, r972, r973; +} +{ +mul.f16x2 r1034, r1031, r963; +} +{ +add.f16x2 r1037, r975, r1034; +} +{ +sub.f16x2 r1040, r966, r967; +} +{ +mul.f16x2 r1043, r1040, r964; +} +{ +add.f16x2 r1046, r1037, r1043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1052, {low, high}; +} +{ +mul.f16x2 r1059, r992, r1049; +} +{ +mul.f16x2 r1062, r1028, r1050; +} +{ +sub.f16x2 r1065, r1059, 
r1062; +} +{ +mul.f16x2 r1068, r992, r1050; +} +{ +fma.rn.f16x2 r1071, r1028, r1049, r1068; +} +{ +mul.f16x2 r1075, r1010, r1051; +} +{ +mul.f16x2 r1078, r1046, r1052; +} +{ +sub.f16x2 r1081, r1075, r1078; +} +{ +mul.f16x2 r1084, r1010, r1052; +} +{ +fma.rn.f16x2 r1087, r1046, r1051, r1084; +} +{ +add.f16x2 r1091, r882, r968; +} +{ +add.f16x2 r1094, r888, r974; +} +{ +sub.f16x2 r1097, r882, r968; +} +{ +sub.f16x2 r1100, r888, r974; +} +{ +add.f16x2 r1103, r906, r1065; +} +{ +add.f16x2 r1106, r942, r1071; +} +{ +sub.f16x2 r1109, r906, r1065; +} +{ +sub.f16x2 r1112, r942, r1071; +} +{ +add.f16x2 r1115, r924, r1081; +} +{ +add.f16x2 r1118, r960, r1087; +} +{ +sub.f16x2 r1121, r924, r1081; +} +{ +sub.f16x2 r1124, r960, r1087; +} +mul.wide.u32 rd6, r1573, 954437177; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1584, rd7; +cvt.rn.f32.u32 f161, r1584; +mul.f32 f162, f161, 0f3E32B8C2; +cos.approx.f32 f113, f162; +sin.approx.f32 f163, f162; +neg.f32 f114, f163; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f113; +cvt.rn.f16.f32 high, f114; +mov.b32 r1127, {low, high}; +} +mul.lo.s32 r1585, r1584, 36; +sub.s32 r1586, r1573, r1585; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1130, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1132, {high, high}; +} +{ +mul.f16x2 r1134, r1106, r1132; +} +{ +fma.rn.f16x2 r1137, r1103, r1130, r1134; +} +{ +mul.f16x2 r1141, r1103, r1132; +} +{ +neg.f16x2 r1144, r1141; +} +{ +fma.rn.f16x2 r1146, r1106, r1130, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1152, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1154, {low, high}; +} +{ +mul.f16x2 r1155, r1152, r1154; +} +{ +mul.f16x2 r1158, r1127, r1150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1161, {high, low}; +} +{ +fma.rn.f16x2 r1163, r1155, r1161, r1158; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1167, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1169, {high, high}; +} +{ +mul.f16x2 r1171, r1118, r1169; +} +{ +fma.rn.f16x2 r1174, r1115, r1167, r1171; +} +{ +mul.f16x2 r1178, r1115, r1169; +} +{ +neg.f16x2 r1181, r1178; +} +{ +fma.rn.f16x2 r1183, r1118, r1167, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1189, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1191, {low, high}; +} +{ +mul.f16x2 r1192, r1189, r1191; +} +{ +mul.f16x2 r1195, r1163, r1187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1198, {high, low}; +} +{ +fma.rn.f16x2 r1200, r1192, r1198, r1195; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1204, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1206, {high, high}; +} +{ +mul.f16x2 r1208, r1100, r1206; +} +{ +fma.rn.f16x2 r1211, r1097, r1204, r1208; +} +{ +mul.f16x2 r1215, r1097, r1206; +} +{ +neg.f16x2 r1218, r1215; +} +{ +fma.rn.f16x2 r1220, r1100, r1204, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1226, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1228, {low, high}; +} +{ +mul.f16x2 r1229, r1226, r1228; +} +{ +mul.f16x2 r1232, r1200, r1224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1235, {high, low}; +} +{ +fma.rn.f16x2 r1237, r1229, r1235, r1232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1241, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1243, {high, high}; +} +{ +mul.f16x2 r1245, r1112, r1243; +} +{ +fma.rn.f16x2 r1248, r1109, r1241, r1245; +} +{ 
+mul.f16x2 r1252, r1109, r1243; +} +{ +neg.f16x2 r1255, r1252; +} +{ +fma.rn.f16x2 r1257, r1112, r1241, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1263, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1265, {low, high}; +} +{ +mul.f16x2 r1266, r1263, r1265; +} +{ +mul.f16x2 r1269, r1237, r1261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1272, {high, low}; +} +{ +fma.rn.f16x2 r1274, r1266, r1272, r1269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1274; +mov.b32 r1278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1274; +mov.b32 r1280, {high, high}; +} +{ +mul.f16x2 r1282, r1124, r1280; +} +{ +fma.rn.f16x2 r1285, r1121, r1278, r1282; +} +{ +mul.f16x2 r1289, r1121, r1280; +} +{ +neg.f16x2 r1292, r1289; +} +{ +fma.rn.f16x2 r1294, r1124, r1278, r1292; +} +shl.b32 r1587, r1586, 3; +add.s32 r1588, r1575, r1587; +barrier.sync 0; +mad.lo.s32 r1589, r1584, 1728, r1588; +st.shared.u32 [r1589], r1091; +st.shared.u32 [r1589+4], r1094; +st.shared.u32 [r1589+288], r1137; +st.shared.u32 [r1589+292], r1146; +st.shared.u32 [r1589+576], r1174; +st.shared.u32 [r1589+580], r1183; +st.shared.u32 [r1589+864], r1211; +st.shared.u32 [r1589+868], r1220; +st.shared.u32 [r1589+1152], r1248; +st.shared.u32 [r1589+1156], r1257; +st.shared.u32 [r1589+1440], r1285; +st.shared.u32 [r1589+1444], r1294; +barrier.sync 0; +ld.shared.u32 r1321, [r1577]; +ld.shared.u32 r1327, [r1577+4]; +ld.shared.u32 r1407, [r1577+1728]; +ld.shared.u32 r1413, [r1577+1732]; +ld.shared.u32 r1318, [r1577+3456]; +ld.shared.u32 r1324, [r1577+3460]; +ld.shared.u32 r1404, [r1577+5184]; +ld.shared.u32 r1410, [r1577+5188]; +ld.shared.u32 r1319, [r1577+6912]; +ld.shared.u32 r1325, [r1577+6916]; +ld.shared.u32 r1405, [r1577+8640]; +ld.shared.u32 r1411, [r1577+8644]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r1316, {low, high}; +} +{ +add.f16x2 r1317, r1318, r1319; +} +{ +add.f16x2 r1320, r1321, r1317; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1318, r1319; +} +{ +mul.f16x2 r1332, r1329, r1315; +} +{ +add.f16x2 r1335, r1321, r1332; +} +{ +sub.f16x2 r1338, r1324, r1325; +} +{ +mul.f16x2 r1341, r1338, r1316; +} +{ +add.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, r1318, r1319; +} +{ +mul.f16x2 r1350, r1347, r1315; +} +{ +add.f16x2 r1353, r1321, r1350; +} +{ +sub.f16x2 r1356, r1324, r1325; +} +{ +mul.f16x2 r1359, r1356, r1316; +} +{ +sub.f16x2 r1362, r1353, r1359; +} +{ +add.f16x2 r1365, r1324, r1325; +} +{ +mul.f16x2 r1368, r1365, r1315; +} +{ +add.f16x2 r1371, r1327, r1368; +} +{ +sub.f16x2 r1374, r1318, r1319; +} +{ +mul.f16x2 r1377, r1374, r1316; +} +{ +sub.f16x2 r1380, r1371, r1377; +} +{ +add.f16x2 r1383, r1324, r1325; +} +{ +mul.f16x2 r1386, r1383, r1315; +} +{ +add.f16x2 r1389, r1327, r1386; +} +{ +sub.f16x2 r1392, r1318, r1319; +} +{ +mul.f16x2 r1395, r1392, r1316; +} +{ +add.f16x2 r1398, r1389, r1395; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r1402, {low, high}; +} +{ +add.f16x2 r1403, r1404, r1405; +} +{ +add.f16x2 r1406, r1407, r1403; +} +{ +add.f16x2 r1409, r1410, r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1404, r1405; +} +{ +mul.f16x2 r1418, r1415, r1401; +} +{ +add.f16x2 r1421, r1407, r1418; +} +{ +sub.f16x2 r1424, r1410, r1411; +} +{ +mul.f16x2 r1427, r1424, r1402; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, r1404, r1405; +} +{ +mul.f16x2 r1436, r1433, r1401; +} +{ +add.f16x2 r1439, r1407, r1436; +} +{ +sub.f16x2 r1442, r1410, r1411; +} 
+{ +mul.f16x2 r1445, r1442, r1402; +} +{ +sub.f16x2 r1448, r1439, r1445; +} +{ +add.f16x2 r1451, r1410, r1411; +} +{ +mul.f16x2 r1454, r1451, r1401; +} +{ +add.f16x2 r1457, r1413, r1454; +} +{ +sub.f16x2 r1460, r1404, r1405; +} +{ +mul.f16x2 r1463, r1460, r1402; +} +{ +sub.f16x2 r1466, r1457, r1463; +} +{ +add.f16x2 r1469, r1410, r1411; +} +{ +mul.f16x2 r1472, r1469, r1401; +} +{ +add.f16x2 r1475, r1413, r1472; +} +{ +sub.f16x2 r1478, r1404, r1405; +} +{ +mul.f16x2 r1481, r1478, r1402; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1488, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1489, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1490, {low, high}; +} +{ +mul.f16x2 r1497, r1430, r1487; +} +{ +mul.f16x2 r1500, r1466, r1488; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1430, r1488; +} +{ +fma.rn.f16x2 r1509, r1466, r1487, r1506; +} +{ +mul.f16x2 r1513, r1448, r1489; +} +{ +mul.f16x2 r1516, r1484, r1490; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1448, r1490; +} +{ +fma.rn.f16x2 r1525, r1484, r1489, r1522; +} +{ +add.f16x2 %0, r1320, r1406; +} +{ +add.f16x2 %1, r1326, r1412; +} +{ +sub.f16x2 %6, r1320, r1406; +} +{ +sub.f16x2 %7, r1326, r1412; +} +{ +add.f16x2 %2, r1344, r1503; +} +{ +add.f16x2 %3, r1380, r1509; +} +{ +sub.f16x2 %8, r1344, r1503; +} +{ +sub.f16x2 %9, r1380, r1509; +} +{ +add.f16x2 %4, r1362, r1519; +} +{ +add.f16x2 %5, r1398, r1525; +} +{ +sub.f16x2 %10, r1362, r1519; +} +{ +sub.f16x2 %11, r1398, r1525; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1137, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<164>; +.reg .b32 r<1588>; +.reg .b64 rd<8>; +mov.u32 r1565, %tid.y; +mov.u32 r1566, %12; +mad.lo.s32 r1567, r1565, 5184, r1566; +mov.u32 r1568, %tid.x; +mov.f32 f140, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1, {low, high}; +} +mov.f32 f134, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, 
%14, r72; +} +{ +sub.f16x2 r78, %17, %21; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 r128, %20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f136, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r173, {low, high}; +} +mov.f32 f142, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r176, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, 
r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +shr.u32 r1569, r1568, 3; +mul.wide.u32 rd2, r1569, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1570, rd3; +mul.lo.s32 r1571, r1570, 216; +sub.s32 r1572, r1568, r1571; +mad.lo.s32 r1573, r1570, 5184, r1567; +cvt.rn.f32.u32 f155, r1572; +mul.f32 f156, f155, 0f3B9EDD1F; +cos.approx.f32 f29, f156; +sin.approx.f32 f157, f156; +neg.f32 f30, f157; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ +mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r287; +mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; +} +barrier.sync 0; +mad.lo.s32 r1574, r1572, 24, r1573; +st.shared.v2.f32 [r1574], {r215, r261}; +st.shared.v2.f32 [r1574+8], {r298, r335}; +st.shared.v2.f32 [r1574+16], {r372, r409}; +barrier.sync 0; +mad.lo.s32 r1575, r1572, -20, r1574; +ld.shared.u32 r445, [r1575]; +ld.shared.u32 r531, [r1575+864]; +ld.shared.u32 r442, [r1575+1728]; +ld.shared.u32 r528, [r1575+2592]; +ld.shared.u32 r443, [r1575+3456]; +ld.shared.u32 r529, [r1575+4320]; +barrier.sync 0; +st.shared.v2.f32 [r1574], {r218, r270}; +st.shared.v2.f32 [r1574+8], {r307, r344}; +st.shared.v2.f32 [r1574+16], {r381, r418}; +barrier.sync 0; +ld.shared.u32 r451, [r1575]; +ld.shared.u32 r537, [r1575+864]; +ld.shared.u32 r448, [r1575+1728]; +ld.shared.u32 r534, [r1575+2592]; +ld.shared.u32 r449, [r1575+3456]; +ld.shared.u32 r535, [r1575+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, 
r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{ +add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} +{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; 
+cvt.rn.f16.f32 high, f136; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; +} +{ +add.f16x2 r653, r444, r530; +} +{ +add.f16x2 r656, r450, r536; +} +{ +sub.f16x2 r659, r444, r530; +} +{ +sub.f16x2 r662, r450, r536; +} +{ +add.f16x2 r665, r468, r627; +} +{ +add.f16x2 r668, r504, r633; +} +{ +sub.f16x2 r671, r468, r627; +} +{ +sub.f16x2 r674, r504, r633; +} +{ +add.f16x2 r677, r486, r643; +} +{ +add.f16x2 r680, r522, r649; +} +{ +sub.f16x2 r683, r486, r643; +} +{ +sub.f16x2 r686, r522, r649; +} +mul.wide.u32 rd4, r1572, -1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1576, rd5; +mul.lo.s32 r1577, r1576, 6; +sub.s32 r1578, r1572, r1577; +shl.b32 r1579, r1578, 2; +add.s32 r1580, r1573, r1579; +cvt.rn.f32.u32 f158, r1576; +mul.f32 f159, f158, 0f3CEE4BAE; +cos.approx.f32 f71, f159; +sin.approx.f32 f160, f159; +neg.f32 f72, f160; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r689, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r694, {high, high}; +} +{ +mul.f16x2 r696, r668, r694; +} +{ +fma.rn.f16x2 r699, r665, r692, r696; +} +{ +mul.f16x2 r703, r665, r694; +} +{ +neg.f16x2 r706, r703; +} +{ +fma.rn.f16x2 r708, r668, r692, r706; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r689; +mov.b32 r712, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r714, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r714, r716; +} +{ +mul.f16x2 r720, r689, r712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r723, {high, low}; +} +{ +fma.rn.f16x2 r725, r717, r723, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r731, {high, high}; +} +{ +mul.f16x2 r733, r680, r731; +} +{ +fma.rn.f16x2 r736, r677, r729, r733; +} +{ +mul.f16x2 r740, r677, r731; +} +{ +neg.f16x2 r743, r740; +} +{ +fma.rn.f16x2 r745, r680, r729, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r749, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r751, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r753, {low, high}; +} +{ +mul.f16x2 r754, r751, r753; +} +{ +mul.f16x2 r757, r725, r749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r760, {high, low}; +} +{ +fma.rn.f16x2 r762, r754, r760, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r768, {high, high}; +} +{ +mul.f16x2 r770, r662, r768; +} +{ +fma.rn.f16x2 r773, r659, r766, r770; +} +{ +mul.f16x2 r777, r659, r768; +} +{ +neg.f16x2 r780, r777; +} +{ +fma.rn.f16x2 r782, r662, r766, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r788, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r790, {low, high}; +} +{ +mul.f16x2 r791, r788, r790; +} +{ +mul.f16x2 r794, r762, r786; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r797, {high, low}; +} +{ +fma.rn.f16x2 r799, r791, r797, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r805, {high, high}; +} +{ +mul.f16x2 r807, r674, r805; +} +{ +fma.rn.f16x2 r810, r671, r803, r807; +} +{ +mul.f16x2 r814, r671, r805; +} +{ +neg.f16x2 r817, r814; +} +{ +fma.rn.f16x2 r819, r674, r803, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r825, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r827, {low, high}; +} +{ +mul.f16x2 r828, r825, r827; +} +{ +mul.f16x2 r831, r799, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r834, {high, low}; +} +{ +fma.rn.f16x2 r836, r828, r834, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r842, {high, high}; +} +{ +mul.f16x2 r844, r686, r842; +} +{ +fma.rn.f16x2 r847, r683, r840, r844; +} +{ +mul.f16x2 r851, r683, r842; +} +{ +neg.f16x2 r854, r851; +} +{ +fma.rn.f16x2 r856, r686, r840, r854; +} +barrier.sync 0; +mad.lo.s32 r1581, r1576, 144, r1580; +st.shared.u32 [r1581], r653; +st.shared.u32 [r1581+24], r699; +st.shared.u32 [r1581+48], r736; +st.shared.u32 [r1581+72], r773; +st.shared.u32 [r1581+96], r810; +st.shared.u32 [r1581+120], r847; +barrier.sync 0; +ld.shared.u32 r883, [r1575]; +ld.shared.u32 r969, [r1575+864]; +ld.shared.u32 r880, [r1575+1728]; +ld.shared.u32 r966, [r1575+2592]; +ld.shared.u32 r881, [r1575+3456]; +ld.shared.u32 r967, [r1575+4320]; +barrier.sync 0; +st.shared.u32 [r1581], r656; +st.shared.u32 [r1581+24], r708; +st.shared.u32 [r1581+48], r745; +st.shared.u32 [r1581+72], r782; +st.shared.u32 [r1581+96], r819; +st.shared.u32 [r1581+120], r856; 
+barrier.sync 0; +ld.shared.u32 r889, [r1575]; +ld.shared.u32 r975, [r1575+864]; +ld.shared.u32 r886, [r1575+1728]; +ld.shared.u32 r972, [r1575+2592]; +ld.shared.u32 r887, [r1575+3456]; +ld.shared.u32 r973, [r1575+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r878, {low, high}; +} +{ +add.f16x2 r879, r880, r881; +} +{ +add.f16x2 r882, r883, r879; +} +{ +add.f16x2 r885, r886, r887; +} +{ +add.f16x2 r888, r889, r885; +} +{ +add.f16x2 r891, r880, r881; +} +{ +mul.f16x2 r894, r891, r877; +} +{ +add.f16x2 r897, r883, r894; +} +{ +sub.f16x2 r900, r886, r887; +} +{ +mul.f16x2 r903, r900, r878; +} +{ +add.f16x2 r906, r897, r903; +} +{ +add.f16x2 r909, r880, r881; +} +{ +mul.f16x2 r912, r909, r877; +} +{ +add.f16x2 r915, r883, r912; +} +{ +sub.f16x2 r918, r886, r887; +} +{ +mul.f16x2 r921, r918, r878; +} +{ +sub.f16x2 r924, r915, r921; +} +{ +add.f16x2 r927, r886, r887; +} +{ +mul.f16x2 r930, r927, r877; +} +{ +add.f16x2 r933, r889, r930; +} +{ +sub.f16x2 r936, r880, r881; +} +{ +mul.f16x2 r939, r936, r878; +} +{ +sub.f16x2 r942, r933, r939; +} +{ +add.f16x2 r945, r886, r887; +} +{ +mul.f16x2 r948, r945, r877; +} +{ +add.f16x2 r951, r889, r948; +} +{ +sub.f16x2 r954, r880, r881; +} +{ +mul.f16x2 r957, r954, r878; +} +{ +add.f16x2 r960, r951, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r964, {low, high}; +} +{ +add.f16x2 r965, r966, r967; +} +{ +add.f16x2 r968, r969, r965; +} +{ +add.f16x2 r971, r972, r973; +} +{ +add.f16x2 r974, r975, r971; +} +{ +add.f16x2 r977, r966, r967; +} +{ +mul.f16x2 r980, r977, r963; +} +{ +add.f16x2 r983, r969, r980; +} +{ +sub.f16x2 r986, r972, r973; +} +{ +mul.f16x2 r989, r986, r964; +} +{ +add.f16x2 r992, r983, r989; +} +{ 
+add.f16x2 r995, r966, r967; +} +{ +mul.f16x2 r998, r995, r963; +} +{ +add.f16x2 r1001, r969, r998; +} +{ +sub.f16x2 r1004, r972, r973; +} +{ +mul.f16x2 r1007, r1004, r964; +} +{ +sub.f16x2 r1010, r1001, r1007; +} +{ +add.f16x2 r1013, r972, r973; +} +{ +mul.f16x2 r1016, r1013, r963; +} +{ +add.f16x2 r1019, r975, r1016; +} +{ +sub.f16x2 r1022, r966, r967; +} +{ +mul.f16x2 r1025, r1022, r964; +} +{ +sub.f16x2 r1028, r1019, r1025; +} +{ +add.f16x2 r1031, r972, r973; +} +{ +mul.f16x2 r1034, r1031, r963; +} +{ +add.f16x2 r1037, r975, r1034; +} +{ +sub.f16x2 r1040, r966, r967; +} +{ +mul.f16x2 r1043, r1040, r964; +} +{ +add.f16x2 r1046, r1037, r1043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1052, {low, high}; +} +{ +mul.f16x2 r1059, r992, r1049; +} +{ +mul.f16x2 r1062, r1028, r1050; +} +{ +sub.f16x2 r1065, r1059, r1062; +} +{ +mul.f16x2 r1068, r992, r1050; +} +{ +fma.rn.f16x2 r1071, r1028, r1049, r1068; +} +{ +mul.f16x2 r1075, r1010, r1051; +} +{ +mul.f16x2 r1078, r1046, r1052; +} +{ +sub.f16x2 r1081, r1075, r1078; +} +{ +mul.f16x2 r1084, r1010, r1052; +} +{ +fma.rn.f16x2 r1087, r1046, r1051, r1084; +} +{ +add.f16x2 r1091, r882, r968; +} +{ +add.f16x2 r1094, r888, r974; +} +{ +sub.f16x2 r1097, r882, r968; +} +{ +sub.f16x2 r1100, r888, r974; +} +{ +add.f16x2 r1103, r906, r1065; +} +{ +add.f16x2 r1106, r942, r1071; +} +{ +sub.f16x2 r1109, r906, r1065; +} +{ +sub.f16x2 r1112, r942, r1071; +} +{ +add.f16x2 r1115, r924, r1081; +} +{ +add.f16x2 r1118, r960, r1087; +} +{ +sub.f16x2 r1121, r924, r1081; +} +{ +sub.f16x2 r1124, r960, r1087; +} +mul.wide.u32 rd6, r1572, 954437177; +shr.u64 rd7, rd6, 
35; +cvt.u32.u64 r1582, rd7; +mul.lo.s32 r1583, r1582, 36; +sub.s32 r1584, r1572, r1583; +shl.b32 r1585, r1584, 2; +add.s32 r1586, r1573, r1585; +cvt.rn.f32.u32 f161, r1582; +mul.f32 f162, f161, 0f3E32B8C2; +cos.approx.f32 f113, f162; +sin.approx.f32 f163, f162; +neg.f32 f114, f163; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f113; +cvt.rn.f16.f32 high, f114; +mov.b32 r1127, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1130, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1132, {high, high}; +} +{ +mul.f16x2 r1134, r1106, r1132; +} +{ +fma.rn.f16x2 r1137, r1103, r1130, r1134; +} +{ +mul.f16x2 r1141, r1103, r1132; +} +{ +neg.f16x2 r1144, r1141; +} +{ +fma.rn.f16x2 r1146, r1106, r1130, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1152, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1154, {low, high}; +} +{ +mul.f16x2 r1155, r1152, r1154; +} +{ +mul.f16x2 r1158, r1127, r1150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1161, {high, low}; +} +{ +fma.rn.f16x2 r1163, r1155, r1161, r1158; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1167, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1169, {high, high}; +} +{ +mul.f16x2 r1171, r1118, r1169; +} +{ +fma.rn.f16x2 r1174, r1115, r1167, r1171; +} +{ +mul.f16x2 r1178, r1115, r1169; +} +{ +neg.f16x2 r1181, r1178; +} +{ +fma.rn.f16x2 r1183, r1118, r1167, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1189, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1191, {low, high}; +} +{ +mul.f16x2 r1192, r1189, r1191; +} +{ +mul.f16x2 r1195, r1163, r1187; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1163; +mov.b32 r1198, {high, low}; +} +{ +fma.rn.f16x2 r1200, r1192, r1198, r1195; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1204, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1206, {high, high}; +} +{ +mul.f16x2 r1208, r1100, r1206; +} +{ +fma.rn.f16x2 r1211, r1097, r1204, r1208; +} +{ +mul.f16x2 r1215, r1097, r1206; +} +{ +neg.f16x2 r1218, r1215; +} +{ +fma.rn.f16x2 r1220, r1100, r1204, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1226, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1228, {low, high}; +} +{ +mul.f16x2 r1229, r1226, r1228; +} +{ +mul.f16x2 r1232, r1200, r1224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1200; +mov.b32 r1235, {high, low}; +} +{ +fma.rn.f16x2 r1237, r1229, r1235, r1232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1241, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1243, {high, high}; +} +{ +mul.f16x2 r1245, r1112, r1243; +} +{ +fma.rn.f16x2 r1248, r1109, r1241, r1245; +} +{ +mul.f16x2 r1252, r1109, r1243; +} +{ +neg.f16x2 r1255, r1252; +} +{ +fma.rn.f16x2 r1257, r1112, r1241, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1127; +mov.b32 r1263, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1265, {low, high}; +} +{ +mul.f16x2 r1266, r1263, r1265; +} +{ +mul.f16x2 r1269, r1237, r1261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1237; +mov.b32 r1272, {high, low}; +} +{ +fma.rn.f16x2 r1274, r1266, r1272, r1269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1274; +mov.b32 r1278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1274; +mov.b32 r1280, {high, high}; +} +{ +mul.f16x2 r1282, r1124, r1280; +} +{ +fma.rn.f16x2 r1285, r1121, r1278, r1282; +} +{ +mul.f16x2 r1289, r1121, r1280; +} +{ +neg.f16x2 r1292, r1289; +} +{ +fma.rn.f16x2 r1294, r1124, r1278, r1292; +} +barrier.sync 0; +mad.lo.s32 r1587, r1582, 864, r1586; +st.shared.u32 [r1587], r1091; +st.shared.u32 [r1587+144], r1137; +st.shared.u32 [r1587+288], r1174; +st.shared.u32 [r1587+432], r1211; +st.shared.u32 [r1587+576], r1248; +st.shared.u32 [r1587+720], r1285; +barrier.sync 0; +ld.shared.u32 r1321, [r1575]; +ld.shared.u32 r1407, [r1575+864]; +ld.shared.u32 r1318, [r1575+1728]; +ld.shared.u32 r1404, [r1575+2592]; +ld.shared.u32 r1319, [r1575+3456]; +ld.shared.u32 r1405, [r1575+4320]; +barrier.sync 0; +st.shared.u32 [r1587], r1094; +st.shared.u32 [r1587+144], r1146; +st.shared.u32 [r1587+288], r1183; +st.shared.u32 [r1587+432], r1220; +st.shared.u32 [r1587+576], r1257; +st.shared.u32 [r1587+720], r1294; +barrier.sync 0; +ld.shared.u32 r1327, [r1575]; +ld.shared.u32 r1413, [r1575+864]; +ld.shared.u32 r1324, [r1575+1728]; +ld.shared.u32 r1410, [r1575+2592]; +ld.shared.u32 r1325, [r1575+3456]; +ld.shared.u32 r1411, [r1575+4320]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r1316, {low, high}; +} +{ +add.f16x2 r1317, r1318, r1319; +} +{ +add.f16x2 r1320, r1321, r1317; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1318, r1319; +} +{ +mul.f16x2 r1332, r1329, r1315; +} +{ +add.f16x2 r1335, r1321, r1332; +} +{ +sub.f16x2 r1338, r1324, r1325; +} +{ +mul.f16x2 r1341, r1338, r1316; +} +{ +add.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, r1318, r1319; +} +{ +mul.f16x2 r1350, r1347, r1315; +} +{ +add.f16x2 r1353, r1321, r1350; +} +{ +sub.f16x2 r1356, r1324, r1325; +} +{ +mul.f16x2 r1359, r1356, r1316; +} +{ +sub.f16x2 
r1362, r1353, r1359; +} +{ +add.f16x2 r1365, r1324, r1325; +} +{ +mul.f16x2 r1368, r1365, r1315; +} +{ +add.f16x2 r1371, r1327, r1368; +} +{ +sub.f16x2 r1374, r1318, r1319; +} +{ +mul.f16x2 r1377, r1374, r1316; +} +{ +sub.f16x2 r1380, r1371, r1377; +} +{ +add.f16x2 r1383, r1324, r1325; +} +{ +mul.f16x2 r1386, r1383, r1315; +} +{ +add.f16x2 r1389, r1327, r1386; +} +{ +sub.f16x2 r1392, r1318, r1319; +} +{ +mul.f16x2 r1395, r1392, r1316; +} +{ +add.f16x2 r1398, r1389, r1395; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f134; +cvt.rn.f16.f32 high, f134; +mov.b32 r1402, {low, high}; +} +{ +add.f16x2 r1403, r1404, r1405; +} +{ +add.f16x2 r1406, r1407, r1403; +} +{ +add.f16x2 r1409, r1410, r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1404, r1405; +} +{ +mul.f16x2 r1418, r1415, r1401; +} +{ +add.f16x2 r1421, r1407, r1418; +} +{ +sub.f16x2 r1424, r1410, r1411; +} +{ +mul.f16x2 r1427, r1424, r1402; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, r1404, r1405; +} +{ +mul.f16x2 r1436, r1433, r1401; +} +{ +add.f16x2 r1439, r1407, r1436; +} +{ +sub.f16x2 r1442, r1410, r1411; +} +{ +mul.f16x2 r1445, r1442, r1402; +} +{ +sub.f16x2 r1448, r1439, r1445; +} +{ +add.f16x2 r1451, r1410, r1411; +} +{ +mul.f16x2 r1454, r1451, r1401; +} +{ +add.f16x2 r1457, r1413, r1454; +} +{ +sub.f16x2 r1460, r1404, r1405; +} +{ +mul.f16x2 r1463, r1460, r1402; +} +{ +sub.f16x2 r1466, r1457, r1463; +} +{ +add.f16x2 r1469, r1410, r1411; +} +{ +mul.f16x2 r1472, r1469, r1401; +} +{ +add.f16x2 r1475, r1413, r1472; +} +{ +sub.f16x2 r1478, r1404, r1405; +} +{ +mul.f16x2 r1481, r1478, r1402; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f136; +cvt.rn.f16.f32 high, f136; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1488, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1489, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f142; +cvt.rn.f16.f32 high, f142; +mov.b32 r1490, {low, high}; +} +{ +mul.f16x2 r1497, r1430, r1487; +} +{ +mul.f16x2 r1500, r1466, r1488; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1430, r1488; +} +{ +fma.rn.f16x2 r1509, r1466, r1487, r1506; +} +{ +mul.f16x2 r1513, r1448, r1489; +} +{ +mul.f16x2 r1516, r1484, r1490; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1448, r1490; +} +{ +fma.rn.f16x2 r1525, r1484, r1489, r1522; +} +{ +add.f16x2 %0, r1320, r1406; +} +{ +add.f16x2 %1, r1326, r1412; +} +{ +sub.f16x2 %6, r1320, r1406; +} +{ +sub.f16x2 %7, r1326, r1412; +} +{ +add.f16x2 %2, r1344, r1503; +} +{ +add.f16x2 %3, r1380, r1509; +} +{ +sub.f16x2 %8, r1344, r1503; +} +{ +sub.f16x2 %9, r1380, r1509; +} +{ +add.f16x2 %4, r1362, r1519; +} +{ +add.f16x2 %5, r1398, r1525; +} +{ +sub.f16x2 %10, r1362, r1519; +} +{ +sub.f16x2 %11, r1398, r1525; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..402b3167c702d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_fwd.hpp.inc @@ -0,0 +1,934 @@ +#ifndef CUFFTDX_FFT_1296_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_1296_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<188, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 10368, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %21, %26; +add.f32 f26, %16, f25; +add.f32 f27, %23, %28; +add.f32 f28, %17, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %16, f29; +sub.f32 f31, %23, %28; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %17, f35; +sub.f32 f37, %21, %26; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %24, %29; +add.f32 f42, %18, f41; +add.f32 f43, %25, %30; +add.f32 f44, %20, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %18, f45; +sub.f32 f47, %25, %30; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %20, f51; +sub.f32 f53, %24, %29; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; 
+sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mad.lo.s32 r9, r6, 10368, r3; +mul.wide.u32 rd4, r8, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f77, f69; +mul.f32 f82, f78, f70; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f86, f73; +mul.f32 f90, f88, f74; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 f97, f94, f67; +mul.f32 f98, f96, f68; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f102, f71; +mul.f32 f106, f104, f72; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f110, f75; +mul.f32 f114, f112, f76; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r10, r8, 48, r9; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r10], {f117, f116}; +fma.rn.f32 f118, f78, f69, f83; +sub.f32 f119, f81, f82; +st.shared.v2.f32 [r10+8], {f119, f118}; +fma.rn.f32 f120, f88, f73, f91; +sub.f32 f121, f89, f90; +st.shared.v2.f32 [r10+16], {f121, f120}; +fma.rn.f32 f122, f96, f67, f99; +sub.f32 f123, f97, f98; +st.shared.v2.f32 [r10+24], {f123, f122}; +fma.rn.f32 f124, f104, f71, f107; +sub.f32 f125, f105, f106; +st.shared.v2.f32 [r10+32], {f125, f124}; +fma.rn.f32 f126, f112, f75, f115; +sub.f32 f127, f113, f114; +st.shared.v2.f32 [r10+40], {f127, f126}; +barrier.sync 0; +mad.lo.s32 r11, r8, -40, r10; 
+ld.shared.v2.f32 {f128, f129}, [r11]; +ld.shared.v2.f32 {f132, f133}, [r11+1728]; +ld.shared.v2.f32 {f136, f137}, [r11+3456]; +ld.shared.v2.f32 {f140, f141}, [r11+5184]; +ld.shared.v2.f32 {f144, f145}, [r11+6912]; +ld.shared.v2.f32 {f148, f149}, [r11+8640]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, f137, f145; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; +mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0f3F5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0f3F5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0fBF5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0fBF5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0fBF5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0fBF5DB3D7, f192; +sub.f32 f194, f153, f169; +sub.f32 f195, f155, f171; +add.f32 f196, f160, f186; +add.f32 f197, f166, f188; +sub.f32 f198, f160, f186; +sub.f32 f199, f166, f188; +add.f32 f200, f161, f191; +add.f32 f201, f167, f193; +sub.f32 f202, f161, f191; +sub.f32 f203, f167, f193; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 
{f204, f205}, [rd11]; +mul.f32 f208, f204, f196; +mul.f32 f209, f205, f197; +mul.f32 f210, f204, f197; +mul.f32 f211, f204, f204; +mul.f32 f212, f205, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f205, f204; +fma.rn.f32 f215, f205, f204, f214; +mul.f32 f216, f213, f200; +mul.f32 f217, f215, f201; +mul.f32 f218, f213, f201; +mul.f32 f219, f204, f213; +mul.f32 f220, f205, f215; +sub.f32 f221, f219, f220; +mul.f32 f222, f204, f215; +fma.rn.f32 f223, f205, f213, f222; +mul.f32 f224, f221, f194; +mul.f32 f225, f223, f195; +mul.f32 f226, f221, f195; +mul.f32 f227, f204, f221; +mul.f32 f228, f205, f223; +sub.f32 f229, f227, f228; +mul.f32 f230, f204, f223; +fma.rn.f32 f231, f205, f221, f230; +mul.f32 f232, f229, f198; +mul.f32 f233, f231, f199; +mul.f32 f234, f229, f199; +mul.f32 f235, f204, f229; +mul.f32 f236, f205, f231; +sub.f32 f237, f235, f236; +mul.f32 f238, f204, f231; +fma.rn.f32 f239, f205, f229, f238; +mul.f32 f240, f237, f202; +mul.f32 f241, f239, f203; +mul.f32 f242, f237, f203; +shl.b32 r15, r14, 3; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 288, r16; +add.f32 f243, f155, f171; +add.f32 f244, f153, f169; +st.shared.v2.f32 [r17], {f244, f243}; +fma.rn.f32 f245, f205, f196, f210; +sub.f32 f246, f208, f209; +st.shared.v2.f32 [r17+48], {f246, f245}; +fma.rn.f32 f247, f215, f200, f218; +sub.f32 f248, f216, f217; +st.shared.v2.f32 [r17+96], {f248, f247}; +fma.rn.f32 f249, f223, f194, f226; +sub.f32 f250, f224, f225; +st.shared.v2.f32 [r17+144], {f250, f249}; +fma.rn.f32 f251, f231, f198, f234; +sub.f32 f252, f232, f233; +st.shared.v2.f32 [r17+192], {f252, f251}; +sub.f32 f253, f240, f241; +fma.rn.f32 f254, f239, f202, f242; +st.shared.v2.f32 [r17+240], {f253, f254}; +barrier.sync 0; +ld.shared.v2.f32 {f255, f256}, [r11]; +ld.shared.v2.f32 {f259, f260}, [r11+1728]; +ld.shared.v2.f32 {f263, f264}, [r11+3456]; +ld.shared.v2.f32 {f267, f268}, [r11+5184]; +ld.shared.v2.f32 {f271, f272}, [r11+6912]; +ld.shared.v2.f32 {f275, f276}, [r11+8640]; 
+add.f32 f279, f263, f271; +add.f32 f280, f255, f279; +add.f32 f281, f264, f272; +add.f32 f282, f256, f281; +mul.f32 f283, f279, 0f3F000000; +sub.f32 f284, f255, f283; +sub.f32 f285, f264, f272; +mul.f32 f286, f285, 0f3F5DB3D7; +add.f32 f287, f286, f284; +sub.f32 f288, f284, f286; +mul.f32 f289, f281, 0f3F000000; +sub.f32 f290, f256, f289; +sub.f32 f291, f263, f271; +mul.f32 f292, f291, 0f3F5DB3D7; +sub.f32 f293, f290, f292; +add.f32 f294, f292, f290; +add.f32 f295, f267, f275; +add.f32 f296, f259, f295; +add.f32 f297, f268, f276; +add.f32 f298, f260, f297; +mul.f32 f299, f295, 0f3F000000; +sub.f32 f300, f259, f299; +sub.f32 f301, f268, f276; +mul.f32 f302, f301, 0f3F5DB3D7; +add.f32 f303, f302, f300; +sub.f32 f304, f300, f302; +mul.f32 f305, f297, 0f3F000000; +sub.f32 f306, f260, f305; +sub.f32 f307, f267, f275; +mul.f32 f308, f307, 0f3F5DB3D7; +sub.f32 f309, f306, f308; +add.f32 f310, f308, f306; +mul.f32 f311, f303, 0f3F000000; +mul.f32 f312, f309, 0fBF5DB3D7; +sub.f32 f313, f311, f312; +mul.f32 f314, f309, 0f3F000000; +fma.rn.f32 f315, f303, 0fBF5DB3D7, f314; +mul.f32 f316, f304, 0fBF000000; +mul.f32 f317, f310, 0fBF5DB3D7; +sub.f32 f318, f316, f317; +mul.f32 f319, f310, 0fBF000000; +fma.rn.f32 f320, f304, 0fBF5DB3D7, f319; +sub.f32 f321, f280, f296; +sub.f32 f322, f282, f298; +add.f32 f323, f287, f313; +add.f32 f324, f293, f315; +sub.f32 f325, f287, f313; +sub.f32 f326, f293, f315; +add.f32 f327, f288, f318; +add.f32 f328, f294, f320; +sub.f32 f329, f288, f318; +sub.f32 f330, f294, f320; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f331, f332}, [rd16]; +mul.f32 f335, f331, f323; +mul.f32 f336, f332, f324; +mul.f32 f337, f331, f324; +mul.f32 f338, f331, f331; +mul.f32 f339, f332, f332; +sub.f32 f340, f338, f339; +mul.f32 f341, f332, f331; +fma.rn.f32 f342, f332, f331, f341; +mul.f32 
f343, f340, f327; +mul.f32 f344, f342, f328; +mul.f32 f345, f340, f328; +mul.f32 f346, f331, f340; +mul.f32 f347, f332, f342; +sub.f32 f348, f346, f347; +mul.f32 f349, f331, f342; +fma.rn.f32 f350, f332, f340, f349; +mul.f32 f351, f348, f321; +mul.f32 f352, f350, f322; +mul.f32 f353, f348, f322; +mul.f32 f354, f331, f348; +mul.f32 f355, f332, f350; +sub.f32 f356, f354, f355; +mul.f32 f357, f331, f350; +fma.rn.f32 f358, f332, f348, f357; +mul.f32 f359, f356, f325; +mul.f32 f360, f358, f326; +mul.f32 f361, f356, f326; +mul.f32 f362, f331, f356; +mul.f32 f363, f332, f358; +sub.f32 f364, f362, f363; +mul.f32 f365, f331, f358; +fma.rn.f32 f366, f332, f356, f365; +mul.f32 f367, f364, f329; +mul.f32 f368, f366, f330; +mul.f32 f369, f364, f330; +shl.b32 r21, r20, 3; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1728, r22; +add.f32 f370, f282, f298; +add.f32 f371, f280, f296; +st.shared.v2.f32 [r23], {f371, f370}; +fma.rn.f32 f372, f332, f323, f337; +sub.f32 f373, f335, f336; +st.shared.v2.f32 [r23+288], {f373, f372}; +fma.rn.f32 f374, f342, f327, f345; +sub.f32 f375, f343, f344; +st.shared.v2.f32 [r23+576], {f375, f374}; +fma.rn.f32 f376, f350, f321, f353; +sub.f32 f377, f351, f352; +st.shared.v2.f32 [r23+864], {f377, f376}; +fma.rn.f32 f378, f358, f325, f361; +sub.f32 f379, f359, f360; +st.shared.v2.f32 [r23+1152], {f379, f378}; +sub.f32 f380, f367, f368; +fma.rn.f32 f381, f366, f329, f369; +st.shared.v2.f32 [r23+1440], {f380, f381}; +barrier.sync 0; +ld.shared.v2.f32 {f382, f383}, [r11]; +ld.shared.v2.f32 {f386, f387}, [r11+1728]; +ld.shared.v2.f32 {f390, f391}, [r11+3456]; +ld.shared.v2.f32 {f394, f395}, [r11+5184]; +ld.shared.v2.f32 {f398, f399}, [r11+6912]; +ld.shared.v2.f32 {f402, f403}, [r11+8640]; +add.f32 f406, f390, f398; +add.f32 f407, f382, f406; +add.f32 f408, f391, f399; +add.f32 f409, f383, f408; +mul.f32 f410, f406, 0f3F000000; +sub.f32 f411, f382, f410; +sub.f32 f412, f391, f399; +mul.f32 f413, f412, 0f3F5DB3D7; +add.f32 f414, f413, f411; 
+sub.f32 f415, f411, f413; +mul.f32 f416, f408, 0f3F000000; +sub.f32 f417, f383, f416; +sub.f32 f418, f390, f398; +mul.f32 f419, f418, 0f3F5DB3D7; +sub.f32 f420, f417, f419; +add.f32 f421, f419, f417; +add.f32 f422, f394, f402; +add.f32 f423, f386, f422; +add.f32 f424, f395, f403; +add.f32 f425, f387, f424; +mul.f32 f426, f422, 0f3F000000; +sub.f32 f427, f386, f426; +sub.f32 f428, f395, f403; +mul.f32 f429, f428, 0f3F5DB3D7; +add.f32 f430, f429, f427; +sub.f32 f431, f427, f429; +mul.f32 f432, f424, 0f3F000000; +sub.f32 f433, f387, f432; +sub.f32 f434, f394, f402; +mul.f32 f435, f434, 0f3F5DB3D7; +sub.f32 f436, f433, f435; +add.f32 f437, f435, f433; +mul.f32 f438, f430, 0f3F000000; +mul.f32 f439, f436, 0fBF5DB3D7; +sub.f32 f440, f438, f439; +mul.f32 f441, f436, 0f3F000000; +fma.rn.f32 f442, f430, 0fBF5DB3D7, f441; +mul.f32 f443, f431, 0fBF000000; +mul.f32 f444, f437, 0fBF5DB3D7; +sub.f32 f445, f443, f444; +mul.f32 f446, f437, 0fBF000000; +fma.rn.f32 f447, f431, 0fBF5DB3D7, f446; +add.f32 %1, f409, f425; +add.f32 %0, f407, f423; +add.f32 %3, f420, f442; +add.f32 %2, f414, f440; +add.f32 %5, f421, f447; +add.f32 %4, f415, f445; +sub.f32 %7, f409, f425; +sub.f32 %6, f407, f423; +sub.f32 %9, f420, f442; +sub.f32 %8, f414, f440; +sub.f32 %11, f421, f447; +sub.f32 %10, f415, f445; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_1296), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<189, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 
f<424>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 5184, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %21, %26; +add.f32 f26, %16, f25; +add.f32 f27, %23, %28; +add.f32 f28, %17, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %16, f29; +sub.f32 f31, %23, %28; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %17, f35; +sub.f32 f37, %21, %26; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %24, %29; +add.f32 f42, %18, f41; +add.f32 f43, %25, %30; +add.f32 f44, %20, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %18, f45; +sub.f32 f47, %25, %30; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %20, f51; +sub.f32 f53, %24, %29; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mul.wide.u32 rd4, r8, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f79, f71; +mul.f32 f84, f80, f72; +sub.f32 f85, f83, f84; +mul.f32 f86, f79, f72; +fma.rn.f32 f87, f80, f71, f86; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; +sub.f32 f90, 
f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f90, f75; +mul.f32 f94, f92, f76; +sub.f32 f95, f93, f94; +mul.f32 f96, f90, f76; +fma.rn.f32 f97, f92, f75, f96; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f100, f69; +mul.f32 f104, f102, f70; +sub.f32 f105, f103, f104; +mul.f32 f106, f100, f70; +fma.rn.f32 f107, f102, f69, f106; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f110, f73; +mul.f32 f114, f112, f74; +sub.f32 f115, f113, f114; +mul.f32 f116, f110, f74; +fma.rn.f32 f117, f112, f73, f116; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f120, f77; +mul.f32 f124, f122, f78; +sub.f32 f125, f123, f124; +mul.f32 f126, f120, f78; +fma.rn.f32 f127, f122, f77, f126; +mad.lo.s32 r9, r6, 5184, r3; +barrier.sync 0; +mad.lo.s32 r10, r8, 24, r9; +st.shared.v2.f32 [r10], {f67, f85}; +st.shared.v2.f32 [r10+8], {f95, f105}; +st.shared.v2.f32 [r10+16], {f115, f125}; +barrier.sync 0; +mad.lo.s32 r11, r8, -20, r10; +ld.shared.f32 f128, [r11]; +ld.shared.f32 f129, [r11+864]; +ld.shared.f32 f130, [r11+1728]; +ld.shared.f32 f131, [r11+2592]; +ld.shared.f32 f132, [r11+3456]; +ld.shared.f32 f133, [r11+4320]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f68, f87}; +st.shared.v2.f32 [r10+8], {f97, f107}; +st.shared.v2.f32 [r10+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r11]; +ld.shared.f32 f135, [r11+864]; +ld.shared.f32 f136, [r11+1728]; +ld.shared.f32 f137, [r11+2592]; +ld.shared.f32 f138, [r11+3456]; +ld.shared.f32 f139, [r11+4320]; +add.f32 f140, f130, f132; +add.f32 f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, 
f138; +mul.f32 f147, f146, 0f3F5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0f3F5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0f3F5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0f3F5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0fBF5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0fBF5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0fBF5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0fBF5DB3D7, f180; +add.f32 f182, f141, f157; +add.f32 f183, f143, f159; +sub.f32 f184, f141, f157; +sub.f32 f185, f143, f159; +add.f32 f186, f148, f174; +add.f32 f187, f154, f176; +sub.f32 f188, f148, f174; +sub.f32 f189, f154, f176; +add.f32 f190, f149, f179; +add.f32 f191, f155, f181; +sub.f32 f192, f149, f179; +sub.f32 f193, f155, f181; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f194, f195}, [rd11]; +mul.f32 f198, f194, f186; +mul.f32 f199, f195, f187; +sub.f32 f200, f198, f199; +mul.f32 f201, f194, f187; +fma.rn.f32 f202, f195, f186, f201; +mul.f32 f203, f194, f194; +mul.f32 f204, f195, f195; +sub.f32 f205, f203, f204; +mul.f32 f206, f195, f194; +fma.rn.f32 f207, f195, f194, f206; +mul.f32 f208, f205, f190; +mul.f32 f209, f207, f191; +sub.f32 f210, f208, f209; 
+mul.f32 f211, f205, f191; +fma.rn.f32 f212, f207, f190, f211; +mul.f32 f213, f194, f205; +mul.f32 f214, f195, f207; +sub.f32 f215, f213, f214; +mul.f32 f216, f194, f207; +fma.rn.f32 f217, f195, f205, f216; +mul.f32 f218, f215, f184; +mul.f32 f219, f217, f185; +sub.f32 f220, f218, f219; +mul.f32 f221, f215, f185; +fma.rn.f32 f222, f217, f184, f221; +mul.f32 f223, f194, f215; +mul.f32 f224, f195, f217; +sub.f32 f225, f223, f224; +mul.f32 f226, f194, f217; +fma.rn.f32 f227, f195, f215, f226; +mul.f32 f228, f225, f188; +mul.f32 f229, f227, f189; +sub.f32 f230, f228, f229; +mul.f32 f231, f225, f189; +fma.rn.f32 f232, f227, f188, f231; +mul.f32 f233, f194, f225; +mul.f32 f234, f195, f227; +sub.f32 f235, f233, f234; +mul.f32 f236, f194, f227; +fma.rn.f32 f237, f195, f225, f236; +mul.f32 f238, f235, f192; +mul.f32 f239, f237, f193; +sub.f32 f240, f238, f239; +mul.f32 f241, f235, f193; +fma.rn.f32 f242, f237, f192, f241; +shl.b32 r15, r14, 2; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +st.shared.f32 [r17], f182; +st.shared.f32 [r17+24], f200; +st.shared.f32 [r17+48], f210; +st.shared.f32 [r17+72], f220; +st.shared.f32 [r17+96], f230; +st.shared.f32 [r17+120], f240; +barrier.sync 0; +ld.shared.f32 f243, [r11]; +ld.shared.f32 f244, [r11+864]; +ld.shared.f32 f245, [r11+1728]; +ld.shared.f32 f246, [r11+2592]; +ld.shared.f32 f247, [r11+3456]; +ld.shared.f32 f248, [r11+4320]; +barrier.sync 0; +st.shared.f32 [r17], f183; +st.shared.f32 [r17+24], f202; +st.shared.f32 [r17+48], f212; +st.shared.f32 [r17+72], f222; +st.shared.f32 [r17+96], f232; +st.shared.f32 [r17+120], f242; +barrier.sync 0; +ld.shared.f32 f249, [r11]; +ld.shared.f32 f250, [r11+864]; +ld.shared.f32 f251, [r11+1728]; +ld.shared.f32 f252, [r11+2592]; +ld.shared.f32 f253, [r11+3456]; +ld.shared.f32 f254, [r11+4320]; +add.f32 f255, f245, f247; +add.f32 f256, f243, f255; +add.f32 f257, f251, f253; +add.f32 f258, f249, f257; +mul.f32 f259, f255, 0f3F000000; +sub.f32 f260, f243, f259; 
+sub.f32 f261, f251, f253; +mul.f32 f262, f261, 0f3F5DB3D7; +add.f32 f263, f262, f260; +sub.f32 f264, f260, f262; +mul.f32 f265, f257, 0f3F000000; +sub.f32 f266, f249, f265; +sub.f32 f267, f245, f247; +mul.f32 f268, f267, 0f3F5DB3D7; +sub.f32 f269, f266, f268; +add.f32 f270, f268, f266; +add.f32 f271, f246, f248; +add.f32 f272, f244, f271; +add.f32 f273, f252, f254; +add.f32 f274, f250, f273; +mul.f32 f275, f271, 0f3F000000; +sub.f32 f276, f244, f275; +sub.f32 f277, f252, f254; +mul.f32 f278, f277, 0f3F5DB3D7; +add.f32 f279, f278, f276; +sub.f32 f280, f276, f278; +mul.f32 f281, f273, 0f3F000000; +sub.f32 f282, f250, f281; +sub.f32 f283, f246, f248; +mul.f32 f284, f283, 0f3F5DB3D7; +sub.f32 f285, f282, f284; +add.f32 f286, f284, f282; +mul.f32 f287, f279, 0f3F000000; +mul.f32 f288, f285, 0fBF5DB3D7; +sub.f32 f289, f287, f288; +mul.f32 f290, f285, 0f3F000000; +fma.rn.f32 f291, f279, 0fBF5DB3D7, f290; +mul.f32 f292, f280, 0fBF000000; +mul.f32 f293, f286, 0fBF5DB3D7; +sub.f32 f294, f292, f293; +mul.f32 f295, f286, 0fBF000000; +fma.rn.f32 f296, f280, 0fBF5DB3D7, f295; +add.f32 f297, f256, f272; +add.f32 f298, f258, f274; +sub.f32 f299, f256, f272; +sub.f32 f300, f258, f274; +add.f32 f301, f263, f289; +add.f32 f302, f269, f291; +sub.f32 f303, f263, f289; +sub.f32 f304, f269, f291; +add.f32 f305, f264, f294; +add.f32 f306, f270, f296; +sub.f32 f307, f264, f294; +sub.f32 f308, f270, f296; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f309, f310}, [rd16]; +mul.f32 f313, f309, f301; +mul.f32 f314, f310, f302; +sub.f32 f315, f313, f314; +mul.f32 f316, f309, f302; +fma.rn.f32 f317, f310, f301, f316; +mul.f32 f318, f309, f309; +mul.f32 f319, f310, f310; +sub.f32 f320, f318, f319; +mul.f32 f321, f310, f309; +fma.rn.f32 f322, f310, f309, f321; +mul.f32 f323, f320, f305; +mul.f32 f324, f322, f306; 
+sub.f32 f325, f323, f324; +mul.f32 f326, f320, f306; +fma.rn.f32 f327, f322, f305, f326; +mul.f32 f328, f309, f320; +mul.f32 f329, f310, f322; +sub.f32 f330, f328, f329; +mul.f32 f331, f309, f322; +fma.rn.f32 f332, f310, f320, f331; +mul.f32 f333, f330, f299; +mul.f32 f334, f332, f300; +sub.f32 f335, f333, f334; +mul.f32 f336, f330, f300; +fma.rn.f32 f337, f332, f299, f336; +mul.f32 f338, f309, f330; +mul.f32 f339, f310, f332; +sub.f32 f340, f338, f339; +mul.f32 f341, f309, f332; +fma.rn.f32 f342, f310, f330, f341; +mul.f32 f343, f340, f303; +mul.f32 f344, f342, f304; +sub.f32 f345, f343, f344; +mul.f32 f346, f340, f304; +fma.rn.f32 f347, f342, f303, f346; +mul.f32 f348, f309, f340; +mul.f32 f349, f310, f342; +sub.f32 f350, f348, f349; +mul.f32 f351, f309, f342; +fma.rn.f32 f352, f310, f340, f351; +mul.f32 f353, f350, f307; +mul.f32 f354, f352, f308; +sub.f32 f355, f353, f354; +mul.f32 f356, f350, f308; +fma.rn.f32 f357, f352, f307, f356; +shl.b32 r21, r20, 2; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 864, r22; +st.shared.f32 [r23], f297; +st.shared.f32 [r23+144], f315; +st.shared.f32 [r23+288], f325; +st.shared.f32 [r23+432], f335; +st.shared.f32 [r23+576], f345; +st.shared.f32 [r23+720], f355; +barrier.sync 0; +ld.shared.f32 f358, [r11]; +ld.shared.f32 f359, [r11+864]; +ld.shared.f32 f360, [r11+1728]; +ld.shared.f32 f361, [r11+2592]; +ld.shared.f32 f362, [r11+3456]; +ld.shared.f32 f363, [r11+4320]; +barrier.sync 0; +st.shared.f32 [r23], f298; +st.shared.f32 [r23+144], f317; +st.shared.f32 [r23+288], f327; +st.shared.f32 [r23+432], f337; +st.shared.f32 [r23+576], f347; +st.shared.f32 [r23+720], f357; +barrier.sync 0; +ld.shared.f32 f364, [r11]; +ld.shared.f32 f365, [r11+864]; +ld.shared.f32 f366, [r11+1728]; +ld.shared.f32 f367, [r11+2592]; +ld.shared.f32 f368, [r11+3456]; +ld.shared.f32 f369, [r11+4320]; +add.f32 f370, f360, f362; +add.f32 f371, f358, f370; +add.f32 f372, f366, f368; +add.f32 f373, f364, f372; +mul.f32 f374, f370, 0f3F000000; 
+sub.f32 f375, f358, f374; +sub.f32 f376, f366, f368; +mul.f32 f377, f376, 0f3F5DB3D7; +add.f32 f378, f377, f375; +sub.f32 f379, f375, f377; +mul.f32 f380, f372, 0f3F000000; +sub.f32 f381, f364, f380; +sub.f32 f382, f360, f362; +mul.f32 f383, f382, 0f3F5DB3D7; +sub.f32 f384, f381, f383; +add.f32 f385, f383, f381; +add.f32 f386, f361, f363; +add.f32 f387, f359, f386; +add.f32 f388, f367, f369; +add.f32 f389, f365, f388; +mul.f32 f390, f386, 0f3F000000; +sub.f32 f391, f359, f390; +sub.f32 f392, f367, f369; +mul.f32 f393, f392, 0f3F5DB3D7; +add.f32 f394, f393, f391; +sub.f32 f395, f391, f393; +mul.f32 f396, f388, 0f3F000000; +sub.f32 f397, f365, f396; +sub.f32 f398, f361, f363; +mul.f32 f399, f398, 0f3F5DB3D7; +sub.f32 f400, f397, f399; +add.f32 f401, f399, f397; +mul.f32 f402, f394, 0f3F000000; +mul.f32 f403, f400, 0fBF5DB3D7; +sub.f32 f404, f402, f403; +mul.f32 f405, f400, 0f3F000000; +fma.rn.f32 f406, f394, 0fBF5DB3D7, f405; +mul.f32 f407, f395, 0fBF000000; +mul.f32 f408, f401, 0fBF5DB3D7; +sub.f32 f409, f407, f408; +mul.f32 f410, f401, 0fBF000000; +fma.rn.f32 f411, f395, 0fBF5DB3D7, f410; +add.f32 %0, f371, f387; +add.f32 %1, f373, f389; +add.f32 %3, f384, f406; +add.f32 %2, f378, f404; +add.f32 %5, f385, f411; +add.f32 %4, f379, f409; +sub.f32 %6, f371, f387; +sub.f32 %7, f373, f389; +sub.f32 %9, f384, f406; +sub.f32 %8, f378, f404; +sub.f32 %11, f385, f411; +sub.f32 %10, f379, f409; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_1296), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..e75b4857a7688 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp32_inv.hpp.inc @@ -0,0 +1,934 @@ +#ifndef CUFFTDX_FFT_1296_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_1296_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<390, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 10368, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %21, %26; +add.f32 f26, %16, f25; +add.f32 f27, %23, %28; +add.f32 f28, %17, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %16, f29; +sub.f32 f31, %23, %28; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %17, f35; +sub.f32 f37, %21, %26; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %24, %29; +add.f32 f42, %18, f41; +add.f32 f43, %25, %30; +add.f32 f44, %20, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %18, f45; +sub.f32 f47, %25, %30; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %20, f51; +sub.f32 f53, %24, %29; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 
0f3F5DB3D7, f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; +sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mad.lo.s32 r9, r6, 10368, r3; +mul.wide.u32 rd4, r8, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f70, f78; +mul.f32 f82, f69, f78; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f74, f88; +mul.f32 f90, f73, f88; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 f97, f68, f96; +mul.f32 f98, f67, f96; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f72, f104; +mul.f32 f106, f71, f104; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f76, f112; +mul.f32 f114, f75, f112; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r10, r8, 48, r9; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r10], {f117, f116}; +fma.rn.f32 f118, f77, f69, f81; +sub.f32 f119, f83, f82; +st.shared.v2.f32 [r10+8], {f118, f119}; +fma.rn.f32 f120, f86, f73, f89; +sub.f32 f121, f91, f90; +st.shared.v2.f32 [r10+16], {f120, f121}; +fma.rn.f32 f122, f94, f67, f97; +sub.f32 f123, f99, f98; +st.shared.v2.f32 [r10+24], {f122, f123}; +fma.rn.f32 f124, f102, f71, f105; +sub.f32 f125, f107, f106; +st.shared.v2.f32 [r10+32], {f124, f125}; +fma.rn.f32 f126, 
f110, f75, f113; +sub.f32 f127, f115, f114; +st.shared.v2.f32 [r10+40], {f126, f127}; +barrier.sync 0; +mad.lo.s32 r11, r8, -40, r10; +ld.shared.v2.f32 {f128, f129}, [r11]; +ld.shared.v2.f32 {f132, f133}, [r11+1728]; +ld.shared.v2.f32 {f136, f137}, [r11+3456]; +ld.shared.v2.f32 {f140, f141}, [r11+5184]; +ld.shared.v2.f32 {f144, f145}, [r11+6912]; +ld.shared.v2.f32 {f148, f149}, [r11+8640]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, f137, f145; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; +mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0fBF5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0fBF5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0f3F5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0f3F5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0f3F5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0f3F5DB3D7, f192; +sub.f32 f194, f153, f169; +sub.f32 f195, f155, f171; +add.f32 f196, f160, f186; +add.f32 f197, f166, f188; +sub.f32 f198, f160, f186; +sub.f32 f199, f166, f188; +add.f32 f200, f161, f191; +add.f32 f201, f167, f193; +sub.f32 f202, f161, f191; +sub.f32 f203, f167, f193; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; 
+mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f204, f205}, [rd11]; +mul.f32 f208, f197, f205; +mul.f32 f209, f196, f205; +mul.f32 f210, f204, f197; +mul.f32 f211, f204, f204; +mul.f32 f212, f205, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f205, f204; +fma.rn.f32 f215, f205, f204, f214; +mul.f32 f216, f201, f215; +mul.f32 f217, f200, f215; +mul.f32 f218, f213, f201; +mul.f32 f219, f204, f213; +mul.f32 f220, f205, f215; +sub.f32 f221, f219, f220; +mul.f32 f222, f204, f215; +fma.rn.f32 f223, f205, f213, f222; +mul.f32 f224, f195, f223; +mul.f32 f225, f194, f223; +mul.f32 f226, f221, f195; +mul.f32 f227, f204, f221; +mul.f32 f228, f205, f223; +sub.f32 f229, f227, f228; +mul.f32 f230, f204, f223; +fma.rn.f32 f231, f205, f221, f230; +mul.f32 f232, f199, f231; +mul.f32 f233, f198, f231; +mul.f32 f234, f229, f199; +mul.f32 f235, f204, f229; +mul.f32 f236, f205, f231; +sub.f32 f237, f235, f236; +mul.f32 f238, f204, f231; +fma.rn.f32 f239, f205, f229, f238; +mul.f32 f240, f203, f239; +mul.f32 f241, f202, f239; +mul.f32 f242, f237, f203; +shl.b32 r15, r14, 3; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 288, r16; +add.f32 f243, f155, f171; +add.f32 f244, f153, f169; +st.shared.v2.f32 [r17], {f244, f243}; +fma.rn.f32 f245, f204, f196, f208; +sub.f32 f246, f210, f209; +st.shared.v2.f32 [r17+48], {f245, f246}; +fma.rn.f32 f247, f213, f200, f216; +sub.f32 f248, f218, f217; +st.shared.v2.f32 [r17+96], {f247, f248}; +fma.rn.f32 f249, f221, f194, f224; +sub.f32 f250, f226, f225; +st.shared.v2.f32 [r17+144], {f249, f250}; +fma.rn.f32 f251, f229, f198, f232; +sub.f32 f252, f234, f233; +st.shared.v2.f32 [r17+192], {f251, f252}; +sub.f32 f253, f242, f241; +fma.rn.f32 f254, f237, f202, f240; +st.shared.v2.f32 [r17+240], {f254, f253}; +barrier.sync 0; +ld.shared.v2.f32 {f255, f256}, [r11]; +ld.shared.v2.f32 {f259, f260}, [r11+1728]; +ld.shared.v2.f32 {f263, f264}, 
[r11+3456]; +ld.shared.v2.f32 {f267, f268}, [r11+5184]; +ld.shared.v2.f32 {f271, f272}, [r11+6912]; +ld.shared.v2.f32 {f275, f276}, [r11+8640]; +add.f32 f279, f263, f271; +add.f32 f280, f255, f279; +add.f32 f281, f264, f272; +add.f32 f282, f256, f281; +mul.f32 f283, f279, 0f3F000000; +sub.f32 f284, f255, f283; +sub.f32 f285, f264, f272; +mul.f32 f286, f285, 0fBF5DB3D7; +add.f32 f287, f286, f284; +sub.f32 f288, f284, f286; +mul.f32 f289, f281, 0f3F000000; +sub.f32 f290, f256, f289; +sub.f32 f291, f263, f271; +mul.f32 f292, f291, 0fBF5DB3D7; +sub.f32 f293, f290, f292; +add.f32 f294, f292, f290; +add.f32 f295, f267, f275; +add.f32 f296, f259, f295; +add.f32 f297, f268, f276; +add.f32 f298, f260, f297; +mul.f32 f299, f295, 0f3F000000; +sub.f32 f300, f259, f299; +sub.f32 f301, f268, f276; +mul.f32 f302, f301, 0fBF5DB3D7; +add.f32 f303, f302, f300; +sub.f32 f304, f300, f302; +mul.f32 f305, f297, 0f3F000000; +sub.f32 f306, f260, f305; +sub.f32 f307, f267, f275; +mul.f32 f308, f307, 0fBF5DB3D7; +sub.f32 f309, f306, f308; +add.f32 f310, f308, f306; +mul.f32 f311, f303, 0f3F000000; +mul.f32 f312, f309, 0f3F5DB3D7; +sub.f32 f313, f311, f312; +mul.f32 f314, f309, 0f3F000000; +fma.rn.f32 f315, f303, 0f3F5DB3D7, f314; +mul.f32 f316, f304, 0fBF000000; +mul.f32 f317, f310, 0f3F5DB3D7; +sub.f32 f318, f316, f317; +mul.f32 f319, f310, 0fBF000000; +fma.rn.f32 f320, f304, 0f3F5DB3D7, f319; +sub.f32 f321, f280, f296; +sub.f32 f322, f282, f298; +add.f32 f323, f287, f313; +add.f32 f324, f293, f315; +sub.f32 f325, f287, f313; +sub.f32 f326, f293, f315; +add.f32 f327, f288, f318; +add.f32 f328, f294, f320; +sub.f32 f329, f288, f318; +sub.f32 f330, f294, f320; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f331, f332}, [rd16]; +mul.f32 f335, f324, f332; +mul.f32 f336, f323, f332; +mul.f32 f337, f331, f324; +mul.f32 
f338, f331, f331; +mul.f32 f339, f332, f332; +sub.f32 f340, f338, f339; +mul.f32 f341, f332, f331; +fma.rn.f32 f342, f332, f331, f341; +mul.f32 f343, f328, f342; +mul.f32 f344, f327, f342; +mul.f32 f345, f340, f328; +mul.f32 f346, f331, f340; +mul.f32 f347, f332, f342; +sub.f32 f348, f346, f347; +mul.f32 f349, f331, f342; +fma.rn.f32 f350, f332, f340, f349; +mul.f32 f351, f322, f350; +mul.f32 f352, f321, f350; +mul.f32 f353, f348, f322; +mul.f32 f354, f331, f348; +mul.f32 f355, f332, f350; +sub.f32 f356, f354, f355; +mul.f32 f357, f331, f350; +fma.rn.f32 f358, f332, f348, f357; +mul.f32 f359, f326, f358; +mul.f32 f360, f325, f358; +mul.f32 f361, f356, f326; +mul.f32 f362, f331, f356; +mul.f32 f363, f332, f358; +sub.f32 f364, f362, f363; +mul.f32 f365, f331, f358; +fma.rn.f32 f366, f332, f356, f365; +mul.f32 f367, f330, f366; +mul.f32 f368, f329, f366; +mul.f32 f369, f364, f330; +shl.b32 r21, r20, 3; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1728, r22; +add.f32 f370, f282, f298; +add.f32 f371, f280, f296; +st.shared.v2.f32 [r23], {f371, f370}; +fma.rn.f32 f372, f331, f323, f335; +sub.f32 f373, f337, f336; +st.shared.v2.f32 [r23+288], {f372, f373}; +fma.rn.f32 f374, f340, f327, f343; +sub.f32 f375, f345, f344; +st.shared.v2.f32 [r23+576], {f374, f375}; +fma.rn.f32 f376, f348, f321, f351; +sub.f32 f377, f353, f352; +st.shared.v2.f32 [r23+864], {f376, f377}; +fma.rn.f32 f378, f356, f325, f359; +sub.f32 f379, f361, f360; +st.shared.v2.f32 [r23+1152], {f378, f379}; +sub.f32 f380, f369, f368; +fma.rn.f32 f381, f364, f329, f367; +st.shared.v2.f32 [r23+1440], {f381, f380}; +barrier.sync 0; +ld.shared.v2.f32 {f382, f383}, [r11]; +ld.shared.v2.f32 {f386, f387}, [r11+1728]; +ld.shared.v2.f32 {f390, f391}, [r11+3456]; +ld.shared.v2.f32 {f394, f395}, [r11+5184]; +ld.shared.v2.f32 {f398, f399}, [r11+6912]; +ld.shared.v2.f32 {f402, f403}, [r11+8640]; +add.f32 f406, f390, f398; +add.f32 f407, f382, f406; +add.f32 f408, f391, f399; +add.f32 f409, f383, f408; 
+mul.f32 f410, f406, 0f3F000000; +sub.f32 f411, f382, f410; +sub.f32 f412, f391, f399; +mul.f32 f413, f412, 0fBF5DB3D7; +add.f32 f414, f413, f411; +sub.f32 f415, f411, f413; +mul.f32 f416, f408, 0f3F000000; +sub.f32 f417, f383, f416; +sub.f32 f418, f390, f398; +mul.f32 f419, f418, 0fBF5DB3D7; +sub.f32 f420, f417, f419; +add.f32 f421, f419, f417; +add.f32 f422, f394, f402; +add.f32 f423, f386, f422; +add.f32 f424, f395, f403; +add.f32 f425, f387, f424; +mul.f32 f426, f422, 0f3F000000; +sub.f32 f427, f386, f426; +sub.f32 f428, f395, f403; +mul.f32 f429, f428, 0fBF5DB3D7; +add.f32 f430, f429, f427; +sub.f32 f431, f427, f429; +mul.f32 f432, f424, 0f3F000000; +sub.f32 f433, f387, f432; +sub.f32 f434, f394, f402; +mul.f32 f435, f434, 0fBF5DB3D7; +sub.f32 f436, f433, f435; +add.f32 f437, f435, f433; +mul.f32 f438, f430, 0f3F000000; +mul.f32 f439, f436, 0f3F5DB3D7; +sub.f32 f440, f438, f439; +mul.f32 f441, f436, 0f3F000000; +fma.rn.f32 f442, f430, 0f3F5DB3D7, f441; +mul.f32 f443, f431, 0fBF000000; +mul.f32 f444, f437, 0f3F5DB3D7; +sub.f32 f445, f443, f444; +mul.f32 f446, f437, 0fBF000000; +fma.rn.f32 f447, f431, 0f3F5DB3D7, f446; +add.f32 %1, f409, f425; +add.f32 %0, f407, f423; +add.f32 %3, f420, f442; +add.f32 %2, f414, f440; +add.f32 %5, f421, f447; +add.f32 %4, f415, f445; +sub.f32 %7, f409, f425; +sub.f32 %6, f407, f423; +sub.f32 %9, f420, f442; +sub.f32 %8, f414, f440; +sub.f32 %11, f421, f447; +sub.f32 %10, f415, f445; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_1296), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<391, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<424>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 5184, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %21, %26; +add.f32 f26, %16, f25; +add.f32 f27, %23, %28; +add.f32 f28, %17, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %16, f29; +sub.f32 f31, %23, %28; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %17, f35; +sub.f32 f37, %21, %26; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %24, %29; +add.f32 f42, %18, f41; +add.f32 f43, %25, %30; +add.f32 f44, %20, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %18, f45; +sub.f32 f47, %25, %30; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %20, f51; +sub.f32 f53, %24, %29; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mul.wide.u32 rd4, r8, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f72, f80; +fma.rn.f32 
f84, f79, f71, f83; +mul.f32 f85, f71, f80; +mul.f32 f86, f79, f72; +sub.f32 f87, f86, f85; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; +sub.f32 f90, f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f76, f92; +fma.rn.f32 f94, f90, f75, f93; +mul.f32 f95, f75, f92; +mul.f32 f96, f90, f76; +sub.f32 f97, f96, f95; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f70, f102; +fma.rn.f32 f104, f100, f69, f103; +mul.f32 f105, f69, f102; +mul.f32 f106, f100, f70; +sub.f32 f107, f106, f105; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f74, f112; +fma.rn.f32 f114, f110, f73, f113; +mul.f32 f115, f73, f112; +mul.f32 f116, f110, f74; +sub.f32 f117, f116, f115; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f78, f122; +fma.rn.f32 f124, f120, f77, f123; +mul.f32 f125, f77, f122; +mul.f32 f126, f120, f78; +sub.f32 f127, f126, f125; +mad.lo.s32 r9, r6, 5184, r3; +barrier.sync 0; +mad.lo.s32 r10, r8, 24, r9; +st.shared.v2.f32 [r10], {f67, f84}; +st.shared.v2.f32 [r10+8], {f94, f104}; +st.shared.v2.f32 [r10+16], {f114, f124}; +barrier.sync 0; +mad.lo.s32 r11, r8, -20, r10; +ld.shared.f32 f128, [r11]; +ld.shared.f32 f129, [r11+864]; +ld.shared.f32 f130, [r11+1728]; +ld.shared.f32 f131, [r11+2592]; +ld.shared.f32 f132, [r11+3456]; +ld.shared.f32 f133, [r11+4320]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f68, f87}; +st.shared.v2.f32 [r10+8], {f97, f107}; +st.shared.v2.f32 [r10+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r11]; +ld.shared.f32 f135, [r11+864]; +ld.shared.f32 f136, [r11+1728]; +ld.shared.f32 f137, [r11+2592]; +ld.shared.f32 f138, [r11+3456]; +ld.shared.f32 f139, [r11+4320]; +add.f32 f140, f130, f132; +add.f32 
f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, f138; +mul.f32 f147, f146, 0fBF5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0fBF5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0fBF5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0fBF5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0f3F5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0f3F5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0f3F5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0f3F5DB3D7, f180; +add.f32 f182, f141, f157; +add.f32 f183, f143, f159; +sub.f32 f184, f141, f157; +sub.f32 f185, f143, f159; +add.f32 f186, f148, f174; +add.f32 f187, f154, f176; +sub.f32 f188, f148, f174; +sub.f32 f189, f154, f176; +add.f32 f190, f149, f179; +add.f32 f191, f155, f181; +sub.f32 f192, f149, f179; +sub.f32 f193, f155, f181; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f194, f195}, [rd11]; +mul.f32 f198, f187, f195; +fma.rn.f32 f199, f194, f186, f198; +mul.f32 f200, f186, f195; +mul.f32 f201, f194, f187; +sub.f32 f202, f201, f200; +mul.f32 f203, f194, f194; +mul.f32 f204, f195, f195; +sub.f32 f205, f203, 
f204; +mul.f32 f206, f195, f194; +fma.rn.f32 f207, f195, f194, f206; +mul.f32 f208, f191, f207; +fma.rn.f32 f209, f205, f190, f208; +mul.f32 f210, f190, f207; +mul.f32 f211, f205, f191; +sub.f32 f212, f211, f210; +mul.f32 f213, f194, f205; +mul.f32 f214, f195, f207; +sub.f32 f215, f213, f214; +mul.f32 f216, f194, f207; +fma.rn.f32 f217, f195, f205, f216; +mul.f32 f218, f185, f217; +fma.rn.f32 f219, f215, f184, f218; +mul.f32 f220, f184, f217; +mul.f32 f221, f215, f185; +sub.f32 f222, f221, f220; +mul.f32 f223, f194, f215; +mul.f32 f224, f195, f217; +sub.f32 f225, f223, f224; +mul.f32 f226, f194, f217; +fma.rn.f32 f227, f195, f215, f226; +mul.f32 f228, f189, f227; +fma.rn.f32 f229, f225, f188, f228; +mul.f32 f230, f188, f227; +mul.f32 f231, f225, f189; +sub.f32 f232, f231, f230; +mul.f32 f233, f194, f225; +mul.f32 f234, f195, f227; +sub.f32 f235, f233, f234; +mul.f32 f236, f194, f227; +fma.rn.f32 f237, f195, f225, f236; +mul.f32 f238, f193, f237; +fma.rn.f32 f239, f235, f192, f238; +mul.f32 f240, f192, f237; +mul.f32 f241, f235, f193; +sub.f32 f242, f241, f240; +shl.b32 r15, r14, 2; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +st.shared.f32 [r17], f182; +st.shared.f32 [r17+24], f199; +st.shared.f32 [r17+48], f209; +st.shared.f32 [r17+72], f219; +st.shared.f32 [r17+96], f229; +st.shared.f32 [r17+120], f239; +barrier.sync 0; +ld.shared.f32 f243, [r11]; +ld.shared.f32 f244, [r11+864]; +ld.shared.f32 f245, [r11+1728]; +ld.shared.f32 f246, [r11+2592]; +ld.shared.f32 f247, [r11+3456]; +ld.shared.f32 f248, [r11+4320]; +barrier.sync 0; +st.shared.f32 [r17], f183; +st.shared.f32 [r17+24], f202; +st.shared.f32 [r17+48], f212; +st.shared.f32 [r17+72], f222; +st.shared.f32 [r17+96], f232; +st.shared.f32 [r17+120], f242; +barrier.sync 0; +ld.shared.f32 f249, [r11]; +ld.shared.f32 f250, [r11+864]; +ld.shared.f32 f251, [r11+1728]; +ld.shared.f32 f252, [r11+2592]; +ld.shared.f32 f253, [r11+3456]; +ld.shared.f32 f254, [r11+4320]; +add.f32 f255, f245, 
f247; +add.f32 f256, f243, f255; +add.f32 f257, f251, f253; +add.f32 f258, f249, f257; +mul.f32 f259, f255, 0f3F000000; +sub.f32 f260, f243, f259; +sub.f32 f261, f251, f253; +mul.f32 f262, f261, 0fBF5DB3D7; +add.f32 f263, f262, f260; +sub.f32 f264, f260, f262; +mul.f32 f265, f257, 0f3F000000; +sub.f32 f266, f249, f265; +sub.f32 f267, f245, f247; +mul.f32 f268, f267, 0fBF5DB3D7; +sub.f32 f269, f266, f268; +add.f32 f270, f268, f266; +add.f32 f271, f246, f248; +add.f32 f272, f244, f271; +add.f32 f273, f252, f254; +add.f32 f274, f250, f273; +mul.f32 f275, f271, 0f3F000000; +sub.f32 f276, f244, f275; +sub.f32 f277, f252, f254; +mul.f32 f278, f277, 0fBF5DB3D7; +add.f32 f279, f278, f276; +sub.f32 f280, f276, f278; +mul.f32 f281, f273, 0f3F000000; +sub.f32 f282, f250, f281; +sub.f32 f283, f246, f248; +mul.f32 f284, f283, 0fBF5DB3D7; +sub.f32 f285, f282, f284; +add.f32 f286, f284, f282; +mul.f32 f287, f279, 0f3F000000; +mul.f32 f288, f285, 0f3F5DB3D7; +sub.f32 f289, f287, f288; +mul.f32 f290, f285, 0f3F000000; +fma.rn.f32 f291, f279, 0f3F5DB3D7, f290; +mul.f32 f292, f280, 0fBF000000; +mul.f32 f293, f286, 0f3F5DB3D7; +sub.f32 f294, f292, f293; +mul.f32 f295, f286, 0fBF000000; +fma.rn.f32 f296, f280, 0f3F5DB3D7, f295; +add.f32 f297, f256, f272; +add.f32 f298, f258, f274; +sub.f32 f299, f256, f272; +sub.f32 f300, f258, f274; +add.f32 f301, f263, f289; +add.f32 f302, f269, f291; +sub.f32 f303, f263, f289; +sub.f32 f304, f269, f291; +add.f32 f305, f264, f294; +add.f32 f306, f270, f296; +sub.f32 f307, f264, f294; +sub.f32 f308, f270, f296; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f309, f310}, [rd16]; +mul.f32 f313, f302, f310; +fma.rn.f32 f314, f309, f301, f313; +mul.f32 f315, f301, f310; +mul.f32 f316, f309, f302; +sub.f32 f317, f316, f315; +mul.f32 f318, f309, f309; +mul.f32 f319, f310, f310; 
+sub.f32 f320, f318, f319; +mul.f32 f321, f310, f309; +fma.rn.f32 f322, f310, f309, f321; +mul.f32 f323, f306, f322; +fma.rn.f32 f324, f320, f305, f323; +mul.f32 f325, f305, f322; +mul.f32 f326, f320, f306; +sub.f32 f327, f326, f325; +mul.f32 f328, f309, f320; +mul.f32 f329, f310, f322; +sub.f32 f330, f328, f329; +mul.f32 f331, f309, f322; +fma.rn.f32 f332, f310, f320, f331; +mul.f32 f333, f300, f332; +fma.rn.f32 f334, f330, f299, f333; +mul.f32 f335, f299, f332; +mul.f32 f336, f330, f300; +sub.f32 f337, f336, f335; +mul.f32 f338, f309, f330; +mul.f32 f339, f310, f332; +sub.f32 f340, f338, f339; +mul.f32 f341, f309, f332; +fma.rn.f32 f342, f310, f330, f341; +mul.f32 f343, f304, f342; +fma.rn.f32 f344, f340, f303, f343; +mul.f32 f345, f303, f342; +mul.f32 f346, f340, f304; +sub.f32 f347, f346, f345; +mul.f32 f348, f309, f340; +mul.f32 f349, f310, f342; +sub.f32 f350, f348, f349; +mul.f32 f351, f309, f342; +fma.rn.f32 f352, f310, f340, f351; +mul.f32 f353, f308, f352; +fma.rn.f32 f354, f350, f307, f353; +mul.f32 f355, f307, f352; +mul.f32 f356, f350, f308; +sub.f32 f357, f356, f355; +shl.b32 r21, r20, 2; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 864, r22; +st.shared.f32 [r23], f297; +st.shared.f32 [r23+144], f314; +st.shared.f32 [r23+288], f324; +st.shared.f32 [r23+432], f334; +st.shared.f32 [r23+576], f344; +st.shared.f32 [r23+720], f354; +barrier.sync 0; +ld.shared.f32 f358, [r11]; +ld.shared.f32 f359, [r11+864]; +ld.shared.f32 f360, [r11+1728]; +ld.shared.f32 f361, [r11+2592]; +ld.shared.f32 f362, [r11+3456]; +ld.shared.f32 f363, [r11+4320]; +barrier.sync 0; +st.shared.f32 [r23], f298; +st.shared.f32 [r23+144], f317; +st.shared.f32 [r23+288], f327; +st.shared.f32 [r23+432], f337; +st.shared.f32 [r23+576], f347; +st.shared.f32 [r23+720], f357; +barrier.sync 0; +ld.shared.f32 f364, [r11]; +ld.shared.f32 f365, [r11+864]; +ld.shared.f32 f366, [r11+1728]; +ld.shared.f32 f367, [r11+2592]; +ld.shared.f32 f368, [r11+3456]; +ld.shared.f32 f369, 
[r11+4320]; +add.f32 f370, f360, f362; +add.f32 f371, f358, f370; +add.f32 f372, f366, f368; +add.f32 f373, f364, f372; +mul.f32 f374, f370, 0f3F000000; +sub.f32 f375, f358, f374; +sub.f32 f376, f366, f368; +mul.f32 f377, f376, 0fBF5DB3D7; +add.f32 f378, f377, f375; +sub.f32 f379, f375, f377; +mul.f32 f380, f372, 0f3F000000; +sub.f32 f381, f364, f380; +sub.f32 f382, f360, f362; +mul.f32 f383, f382, 0fBF5DB3D7; +sub.f32 f384, f381, f383; +add.f32 f385, f383, f381; +add.f32 f386, f361, f363; +add.f32 f387, f359, f386; +add.f32 f388, f367, f369; +add.f32 f389, f365, f388; +mul.f32 f390, f386, 0f3F000000; +sub.f32 f391, f359, f390; +sub.f32 f392, f367, f369; +mul.f32 f393, f392, 0fBF5DB3D7; +add.f32 f394, f393, f391; +sub.f32 f395, f391, f393; +mul.f32 f396, f388, 0f3F000000; +sub.f32 f397, f365, f396; +sub.f32 f398, f361, f363; +mul.f32 f399, f398, 0fBF5DB3D7; +sub.f32 f400, f397, f399; +add.f32 f401, f399, f397; +mul.f32 f402, f394, 0f3F000000; +mul.f32 f403, f400, 0f3F5DB3D7; +sub.f32 f404, f402, f403; +mul.f32 f405, f400, 0f3F000000; +fma.rn.f32 f406, f394, 0f3F5DB3D7, f405; +mul.f32 f407, f395, 0fBF000000; +mul.f32 f408, f401, 0f3F5DB3D7; +sub.f32 f409, f407, f408; +mul.f32 f410, f401, 0fBF000000; +fma.rn.f32 f411, f395, 0f3F5DB3D7, f410; +add.f32 %0, f371, f387; +add.f32 %1, f373, f389; +add.f32 %3, f384, f406; +add.f32 %2, f378, f404; +add.f32 %5, f385, f411; +add.f32 %4, f379, f409; +sub.f32 %6, f371, f387; +sub.f32 %7, f373, f389; +sub.f32 %9, f384, f406; +sub.f32 %8, f378, f404; +sub.f32 %11, f385, f411; +sub.f32 %10, f379, f409; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_1296), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), 
"f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..5617f4b3afed4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_fwd.hpp.inc @@ -0,0 +1,910 @@ +#ifndef CUFFTDX_FFT_1296_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_1296_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<564, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<457>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 20736, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %21, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %23, %28; +add.f64 fd28, %17, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %16, fd29; +sub.f64 fd31, %23, %28; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %17, fd35; +sub.f64 fd37, %21, %26; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %24, %29; +add.f64 fd42, %18, fd41; +add.f64 fd43, %25, %30; +add.f64 fd44, %20, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %18, fd45; +sub.f64 fd47, %25, %30; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %20, fd51; +sub.f64 fd53, %24, %29; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 
fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mad.lo.s32 r9, r6, 20736, r3; +mul.wide.u32 rd4, r8, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd77, fd69; +mul.f64 fd82, fd78, fd70; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd86, fd73; +mul.f64 fd90, fd88, fd74; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, fd94, fd67; +mul.f64 fd98, fd96, fd68; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+3456]; +mul.f64 fd104, fd100, fd71; +mul.f64 fd105, fd101, fd72; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd109, fd75; +mul.f64 fd113, fd111, fd76; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r10, r8, 96, r9; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r10], {fd116, fd115}; +fma.rn.f64 fd117, fd78, fd69, fd83; 
+sub.f64 fd118, fd81, fd82; +st.shared.v2.f64 [r10+16], {fd118, fd117}; +fma.rn.f64 fd119, fd88, fd73, fd91; +sub.f64 fd120, fd89, fd90; +st.shared.v2.f64 [r10+32], {fd120, fd119}; +fma.rn.f64 fd121, fd96, fd67, fd99; +sub.f64 fd122, fd97, fd98; +st.shared.v2.f64 [r10+48], {fd122, fd121}; +fma.rn.f64 fd123, fd101, fd71, fd106; +sub.f64 fd124, fd104, fd105; +st.shared.v2.f64 [r10+64], {fd124, fd123}; +fma.rn.f64 fd125, fd111, fd75, fd114; +sub.f64 fd126, fd112, fd113; +st.shared.v2.f64 [r10+80], {fd126, fd125}; +barrier.sync 0; +mad.lo.s32 r11, r8, -80, r10; +ld.shared.v2.f64 {fd127, fd128}, [r11]; +ld.shared.v2.f64 {fd131, fd132}, [r11+3456]; +ld.shared.v2.f64 {fd135, fd136}, [r11+6912]; +ld.shared.v2.f64 {fd139, fd140}, [r11+10368]; +ld.shared.v2.f64 {fd143, fd144}, [r11+13824]; +ld.shared.v2.f64 {fd147, fd148}, [r11+17280]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0d3FEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0d3FEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0dBFEBB67AE8584CAA; +sub.f64 
fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0dBFEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0dBFEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0dBFEBB67AE8584CAA, fd191; +sub.f64 fd193, fd152, fd168; +sub.f64 fd194, fd154, fd170; +add.f64 fd195, fd159, fd185; +add.f64 fd196, fd165, fd187; +sub.f64 fd197, fd159, fd185; +sub.f64 fd198, fd165, fd187; +add.f64 fd199, fd160, fd190; +add.f64 fd200, fd166, fd192; +sub.f64 fd201, fd160, fd190; +sub.f64 fd202, fd166, fd192; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd203, fd204}, [rd11]; +mul.f64 fd207, fd203, fd195; +mul.f64 fd208, fd204, fd196; +mul.f64 fd209, fd203, fd196; +mul.f64 fd210, fd203, fd203; +mul.f64 fd211, fd204, fd204; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd204, fd203; +fma.rn.f64 fd214, fd204, fd203, fd213; +mul.f64 fd215, fd212, fd199; +mul.f64 fd216, fd214, fd200; +mul.f64 fd217, fd212, fd200; +mul.f64 fd218, fd203, fd212; +mul.f64 fd219, fd204, fd214; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd203, fd214; +fma.rn.f64 fd222, fd204, fd212, fd221; +mul.f64 fd223, fd220, fd193; +mul.f64 fd224, fd222, fd194; +mul.f64 fd225, fd220, fd194; +ld.global.v2.f64 {fd226, fd227}, [rd11+576]; +mul.f64 fd230, fd226, fd197; +mul.f64 fd231, fd227, fd198; +mul.f64 fd232, fd226, fd198; +mul.f64 fd233, fd203, fd226; +mul.f64 fd234, fd204, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd203, fd227; +fma.rn.f64 fd237, fd204, fd226, fd236; +mul.f64 fd238, fd235, fd201; +mul.f64 fd239, fd237, fd202; +mul.f64 fd240, fd235, fd202; +shl.b32 r15, r14, 4; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 576, r16; +add.f64 fd241, fd154, fd170; +add.f64 fd242, fd152, 
fd168; +st.shared.v2.f64 [r17], {fd242, fd241}; +fma.rn.f64 fd243, fd204, fd195, fd209; +sub.f64 fd244, fd207, fd208; +st.shared.v2.f64 [r17+96], {fd244, fd243}; +fma.rn.f64 fd245, fd214, fd199, fd217; +sub.f64 fd246, fd215, fd216; +st.shared.v2.f64 [r17+192], {fd246, fd245}; +fma.rn.f64 fd247, fd222, fd193, fd225; +sub.f64 fd248, fd223, fd224; +st.shared.v2.f64 [r17+288], {fd248, fd247}; +fma.rn.f64 fd249, fd227, fd197, fd232; +sub.f64 fd250, fd230, fd231; +st.shared.v2.f64 [r17+384], {fd250, fd249}; +fma.rn.f64 fd251, fd237, fd201, fd240; +sub.f64 fd252, fd238, fd239; +st.shared.v2.f64 [r17+480], {fd252, fd251}; +barrier.sync 0; +ld.shared.v2.f64 {fd253, fd254}, [r11]; +ld.shared.v2.f64 {fd257, fd258}, [r11+3456]; +ld.shared.v2.f64 {fd261, fd262}, [r11+6912]; +ld.shared.v2.f64 {fd265, fd266}, [r11+10368]; +ld.shared.v2.f64 {fd269, fd270}, [r11+13824]; +ld.shared.v2.f64 {fd273, fd274}, [r11+17280]; +add.f64 fd277, fd261, fd269; +add.f64 fd278, fd253, fd277; +add.f64 fd279, fd262, fd270; +add.f64 fd280, fd254, fd279; +mul.f64 fd281, fd277, 0d3FE0000000000000; +sub.f64 fd282, fd253, fd281; +sub.f64 fd283, fd262, fd270; +mul.f64 fd284, fd283, 0d3FEBB67AE8584CAA; +add.f64 fd285, fd284, fd282; +sub.f64 fd286, fd282, fd284; +mul.f64 fd287, fd279, 0d3FE0000000000000; +sub.f64 fd288, fd254, fd287; +sub.f64 fd289, fd261, fd269; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +sub.f64 fd291, fd288, fd290; +add.f64 fd292, fd290, fd288; +add.f64 fd293, fd265, fd273; +add.f64 fd294, fd257, fd293; +add.f64 fd295, fd266, fd274; +add.f64 fd296, fd258, fd295; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd257, fd297; +sub.f64 fd299, fd266, fd274; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +mul.f64 fd303, fd295, 0d3FE0000000000000; +sub.f64 fd304, fd258, fd303; +sub.f64 fd305, fd265, fd273; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +mul.f64 fd309, fd301, 
0d3FE0000000000000; +mul.f64 fd310, fd307, 0dBFEBB67AE8584CAA; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd307, 0d3FE0000000000000; +fma.rn.f64 fd313, fd301, 0dBFEBB67AE8584CAA, fd312; +mul.f64 fd314, fd302, 0dBFE0000000000000; +mul.f64 fd315, fd308, 0dBFEBB67AE8584CAA; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd308, 0dBFE0000000000000; +fma.rn.f64 fd318, fd302, 0dBFEBB67AE8584CAA, fd317; +sub.f64 fd319, fd278, fd294; +sub.f64 fd320, fd280, fd296; +add.f64 fd321, fd285, fd311; +add.f64 fd322, fd291, fd313; +sub.f64 fd323, fd285, fd311; +sub.f64 fd324, fd291, fd313; +add.f64 fd325, fd286, fd316; +add.f64 fd326, fd292, fd318; +sub.f64 fd327, fd286, fd316; +sub.f64 fd328, fd292, fd318; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd329, fd330}, [rd16]; +mul.f64 fd333, fd329, fd321; +mul.f64 fd334, fd330, fd322; +mul.f64 fd335, fd329, fd322; +mul.f64 fd336, fd329, fd329; +mul.f64 fd337, fd330, fd330; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd330, fd329; +fma.rn.f64 fd340, fd330, fd329, fd339; +mul.f64 fd341, fd338, fd325; +mul.f64 fd342, fd340, fd326; +mul.f64 fd343, fd338, fd326; +mul.f64 fd344, fd329, fd338; +mul.f64 fd345, fd330, fd340; +sub.f64 fd346, fd344, fd345; +mul.f64 fd347, fd329, fd340; +fma.rn.f64 fd348, fd330, fd338, fd347; +mul.f64 fd349, fd346, fd319; +mul.f64 fd350, fd348, fd320; +mul.f64 fd351, fd346, fd320; +ld.global.v2.f64 {fd352, fd353}, [rd16+96]; +mul.f64 fd356, fd352, fd323; +mul.f64 fd357, fd353, fd324; +mul.f64 fd358, fd352, fd324; +mul.f64 fd359, fd329, fd352; +mul.f64 fd360, fd330, fd353; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd329, fd353; +fma.rn.f64 fd363, fd330, fd352, fd362; +mul.f64 fd364, fd361, fd327; +mul.f64 fd365, fd363, fd328; +mul.f64 fd366, fd361, fd328; +shl.b32 r21, r20, 4; +add.s32 r22, r9, r21; +barrier.sync 0; 
+mad.lo.s32 r23, r18, 3456, r22; +add.f64 fd367, fd280, fd296; +add.f64 fd368, fd278, fd294; +st.shared.v2.f64 [r23], {fd368, fd367}; +fma.rn.f64 fd369, fd330, fd321, fd335; +sub.f64 fd370, fd333, fd334; +st.shared.v2.f64 [r23+576], {fd370, fd369}; +fma.rn.f64 fd371, fd340, fd325, fd343; +sub.f64 fd372, fd341, fd342; +st.shared.v2.f64 [r23+1152], {fd372, fd371}; +fma.rn.f64 fd373, fd348, fd319, fd351; +sub.f64 fd374, fd349, fd350; +st.shared.v2.f64 [r23+1728], {fd374, fd373}; +fma.rn.f64 fd375, fd353, fd323, fd358; +sub.f64 fd376, fd356, fd357; +st.shared.v2.f64 [r23+2304], {fd376, fd375}; +fma.rn.f64 fd377, fd363, fd327, fd366; +sub.f64 fd378, fd364, fd365; +st.shared.v2.f64 [r23+2880], {fd378, fd377}; +barrier.sync 0; +ld.shared.v2.f64 {fd379, fd380}, [r11]; +ld.shared.v2.f64 {fd383, fd384}, [r11+3456]; +ld.shared.v2.f64 {fd387, fd388}, [r11+6912]; +ld.shared.v2.f64 {fd391, fd392}, [r11+10368]; +ld.shared.v2.f64 {fd395, fd396}, [r11+13824]; +ld.shared.v2.f64 {fd399, fd400}, [r11+17280]; +add.f64 fd403, fd387, fd395; +add.f64 fd404, fd379, fd403; +add.f64 fd405, fd388, fd396; +add.f64 fd406, fd380, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd379, fd407; +sub.f64 fd409, fd388, fd396; +mul.f64 fd410, fd409, 0d3FEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 0d3FE0000000000000; +sub.f64 fd414, fd380, fd413; +sub.f64 fd415, fd387, fd395; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +add.f64 fd419, fd391, fd399; +add.f64 fd420, fd383, fd419; +add.f64 fd421, fd392, fd400; +add.f64 fd422, fd384, fd421; +mul.f64 fd423, fd419, 0d3FE0000000000000; +sub.f64 fd424, fd383, fd423; +sub.f64 fd425, fd392, fd400; +mul.f64 fd426, fd425, 0d3FEBB67AE8584CAA; +add.f64 fd427, fd426, fd424; +sub.f64 fd428, fd424, fd426; +mul.f64 fd429, fd421, 0d3FE0000000000000; +sub.f64 fd430, fd384, fd429; +sub.f64 fd431, fd391, fd399; +mul.f64 fd432, fd431, 
0d3FEBB67AE8584CAA; +sub.f64 fd433, fd430, fd432; +add.f64 fd434, fd432, fd430; +mul.f64 fd435, fd427, 0d3FE0000000000000; +mul.f64 fd436, fd433, 0dBFEBB67AE8584CAA; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd433, 0d3FE0000000000000; +fma.rn.f64 fd439, fd427, 0dBFEBB67AE8584CAA, fd438; +mul.f64 fd440, fd428, 0dBFE0000000000000; +mul.f64 fd441, fd434, 0dBFEBB67AE8584CAA; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd434, 0dBFE0000000000000; +fma.rn.f64 fd444, fd428, 0dBFEBB67AE8584CAA, fd443; +add.f64 %1, fd406, fd422; +add.f64 %0, fd404, fd420; +add.f64 %3, fd417, fd439; +add.f64 %2, fd411, fd437; +add.f64 %5, fd418, fd444; +add.f64 %4, fd412, fd442; +sub.f64 %7, fd406, fd422; +sub.f64 %6, fd404, fd420; +sub.f64 %9, fd417, fd439; +sub.f64 %8, fd411, fd437; +sub.f64 %11, fd418, fd444; +sub.f64 %10, fd412, fd442; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_1296), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<563, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<421>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 10368, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %21, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %23, %28; +add.f64 fd28, %17, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %16, fd29; +sub.f64 fd31, %23, %28; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; 
+sub.f64 fd36, %17, fd35; +sub.f64 fd37, %21, %26; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %24, %29; +add.f64 fd42, %18, fd41; +add.f64 fd43, %25, %30; +add.f64 fd44, %20, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %18, fd45; +sub.f64 fd47, %25, %30; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %20, fd51; +sub.f64 fd53, %24, %29; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mul.wide.u32 rd4, r8, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd79, fd71; +mul.f64 fd84, fd80, fd72; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd79, fd72; +fma.rn.f64 fd87, fd80, fd71, fd86; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd90, fd75; +mul.f64 fd94, fd92, fd76; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd90, fd76; 
+fma.rn.f64 fd97, fd92, fd75, fd96; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd100, fd69; +mul.f64 fd104, fd102, fd70; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd100, fd70; +fma.rn.f64 fd107, fd102, fd69, fd106; +ld.global.v2.f64 {fd108, fd109}, [rd6+3456]; +mul.f64 fd112, fd108, fd73; +mul.f64 fd113, fd109, fd74; +sub.f64 fd114, fd112, fd113; +mul.f64 fd115, fd108, fd74; +fma.rn.f64 fd116, fd109, fd73, fd115; +mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd119, fd77; +mul.f64 fd123, fd121, fd78; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd119, fd78; +fma.rn.f64 fd126, fd121, fd77, fd125; +mad.lo.s32 r9, r6, 10368, r3; +barrier.sync 0; +mad.lo.s32 r10, r8, 48, r9; +st.shared.v2.f64 [r10], {fd67, fd85}; +st.shared.v2.f64 [r10+16], {fd95, fd105}; +st.shared.v2.f64 [r10+32], {fd114, fd124}; +barrier.sync 0; +mad.lo.s32 r11, r8, -40, r10; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+1728]; +ld.shared.f64 fd129, [r11+3456]; +ld.shared.f64 fd130, [r11+5184]; +ld.shared.f64 fd131, [r11+6912]; +ld.shared.f64 fd132, [r11+8640]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd68, fd87}; +st.shared.v2.f64 [r10+16], {fd97, fd107}; +st.shared.v2.f64 [r10+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r11]; +ld.shared.f64 fd134, [r11+1728]; +ld.shared.f64 fd135, [r11+3456]; +ld.shared.f64 fd136, [r11+5184]; +ld.shared.f64 fd137, [r11+6912]; +ld.shared.f64 fd138, [r11+8640]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 
fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0d3FEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; +sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; +sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0dBFEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0dBFEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; +mul.f64 fd177, fd170, 0dBFEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0dBFEBB67AE8584CAA, fd179; +add.f64 fd181, fd140, fd156; +add.f64 fd182, fd142, fd158; +sub.f64 fd183, fd140, fd156; +sub.f64 fd184, fd142, fd158; +add.f64 fd185, fd147, fd173; +add.f64 fd186, fd153, fd175; +sub.f64 fd187, fd147, fd173; +sub.f64 fd188, fd153, fd175; +add.f64 fd189, fd148, fd178; +add.f64 fd190, fd154, fd180; +sub.f64 fd191, fd148, fd178; +sub.f64 fd192, fd154, fd180; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd193, fd194}, [rd11]; +mul.f64 fd197, fd193, fd185; +mul.f64 fd198, fd194, fd186; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd193, fd186; +fma.rn.f64 fd201, fd194, fd185, fd200; +mul.f64 fd202, fd193, fd193; +mul.f64 fd203, fd194, fd194; +sub.f64 fd204, 
fd202, fd203; +mul.f64 fd205, fd194, fd193; +fma.rn.f64 fd206, fd194, fd193, fd205; +mul.f64 fd207, fd204, fd189; +mul.f64 fd208, fd206, fd190; +sub.f64 fd209, fd207, fd208; +mul.f64 fd210, fd204, fd190; +fma.rn.f64 fd211, fd206, fd189, fd210; +mul.f64 fd212, fd193, fd204; +mul.f64 fd213, fd194, fd206; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd193, fd206; +fma.rn.f64 fd216, fd194, fd204, fd215; +mul.f64 fd217, fd214, fd183; +mul.f64 fd218, fd216, fd184; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd214, fd184; +fma.rn.f64 fd221, fd216, fd183, fd220; +ld.global.v2.f64 {fd222, fd223}, [rd11+576]; +mul.f64 fd226, fd222, fd187; +mul.f64 fd227, fd223, fd188; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd222, fd188; +fma.rn.f64 fd230, fd223, fd187, fd229; +mul.f64 fd231, fd193, fd222; +mul.f64 fd232, fd194, fd223; +sub.f64 fd233, fd231, fd232; +mul.f64 fd234, fd193, fd223; +fma.rn.f64 fd235, fd194, fd222, fd234; +mul.f64 fd236, fd233, fd191; +mul.f64 fd237, fd235, fd192; +sub.f64 fd238, fd236, fd237; +mul.f64 fd239, fd233, fd192; +fma.rn.f64 fd240, fd235, fd191, fd239; +shl.b32 r15, r14, 3; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 288, r16; +st.shared.f64 [r17], fd181; +st.shared.f64 [r17+48], fd199; +st.shared.f64 [r17+96], fd209; +st.shared.f64 [r17+144], fd219; +st.shared.f64 [r17+192], fd228; +st.shared.f64 [r17+240], fd238; +barrier.sync 0; +ld.shared.f64 fd241, [r11]; +ld.shared.f64 fd242, [r11+1728]; +ld.shared.f64 fd243, [r11+3456]; +ld.shared.f64 fd244, [r11+5184]; +ld.shared.f64 fd245, [r11+6912]; +ld.shared.f64 fd246, [r11+8640]; +barrier.sync 0; +st.shared.f64 [r17], fd182; +st.shared.f64 [r17+48], fd201; +st.shared.f64 [r17+96], fd211; +st.shared.f64 [r17+144], fd221; +st.shared.f64 [r17+192], fd230; +st.shared.f64 [r17+240], fd240; +barrier.sync 0; +ld.shared.f64 fd247, [r11]; +ld.shared.f64 fd248, [r11+1728]; +ld.shared.f64 fd249, [r11+3456]; +ld.shared.f64 fd250, [r11+5184]; +ld.shared.f64 fd251, [r11+6912]; +ld.shared.f64 
fd252, [r11+8640]; +add.f64 fd253, fd243, fd245; +add.f64 fd254, fd241, fd253; +add.f64 fd255, fd249, fd251; +add.f64 fd256, fd247, fd255; +mul.f64 fd257, fd253, 0d3FE0000000000000; +sub.f64 fd258, fd241, fd257; +sub.f64 fd259, fd249, fd251; +mul.f64 fd260, fd259, 0d3FEBB67AE8584CAA; +add.f64 fd261, fd260, fd258; +sub.f64 fd262, fd258, fd260; +mul.f64 fd263, fd255, 0d3FE0000000000000; +sub.f64 fd264, fd247, fd263; +sub.f64 fd265, fd243, fd245; +mul.f64 fd266, fd265, 0d3FEBB67AE8584CAA; +sub.f64 fd267, fd264, fd266; +add.f64 fd268, fd266, fd264; +add.f64 fd269, fd244, fd246; +add.f64 fd270, fd242, fd269; +add.f64 fd271, fd250, fd252; +add.f64 fd272, fd248, fd271; +mul.f64 fd273, fd269, 0d3FE0000000000000; +sub.f64 fd274, fd242, fd273; +sub.f64 fd275, fd250, fd252; +mul.f64 fd276, fd275, 0d3FEBB67AE8584CAA; +add.f64 fd277, fd276, fd274; +sub.f64 fd278, fd274, fd276; +mul.f64 fd279, fd271, 0d3FE0000000000000; +sub.f64 fd280, fd248, fd279; +sub.f64 fd281, fd244, fd246; +mul.f64 fd282, fd281, 0d3FEBB67AE8584CAA; +sub.f64 fd283, fd280, fd282; +add.f64 fd284, fd282, fd280; +mul.f64 fd285, fd277, 0d3FE0000000000000; +mul.f64 fd286, fd283, 0dBFEBB67AE8584CAA; +sub.f64 fd287, fd285, fd286; +mul.f64 fd288, fd283, 0d3FE0000000000000; +fma.rn.f64 fd289, fd277, 0dBFEBB67AE8584CAA, fd288; +mul.f64 fd290, fd278, 0dBFE0000000000000; +mul.f64 fd291, fd284, 0dBFEBB67AE8584CAA; +sub.f64 fd292, fd290, fd291; +mul.f64 fd293, fd284, 0dBFE0000000000000; +fma.rn.f64 fd294, fd278, 0dBFEBB67AE8584CAA, fd293; +add.f64 fd295, fd254, fd270; +add.f64 fd296, fd256, fd272; +sub.f64 fd297, fd254, fd270; +sub.f64 fd298, fd256, fd272; +add.f64 fd299, fd261, fd287; +add.f64 fd300, fd267, fd289; +sub.f64 fd301, fd261, fd287; +sub.f64 fd302, fd267, fd289; +add.f64 fd303, fd262, fd292; +add.f64 fd304, fd268, fd294; +sub.f64 fd305, fd262, fd292; +sub.f64 fd306, fd268, fd294; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; 
+mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd307, fd308}, [rd16]; +mul.f64 fd311, fd307, fd299; +mul.f64 fd312, fd308, fd300; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd307, fd300; +fma.rn.f64 fd315, fd308, fd299, fd314; +mul.f64 fd316, fd307, fd307; +mul.f64 fd317, fd308, fd308; +sub.f64 fd318, fd316, fd317; +mul.f64 fd319, fd308, fd307; +fma.rn.f64 fd320, fd308, fd307, fd319; +mul.f64 fd321, fd318, fd303; +mul.f64 fd322, fd320, fd304; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd318, fd304; +fma.rn.f64 fd325, fd320, fd303, fd324; +mul.f64 fd326, fd307, fd318; +mul.f64 fd327, fd308, fd320; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd307, fd320; +fma.rn.f64 fd330, fd308, fd318, fd329; +mul.f64 fd331, fd328, fd297; +mul.f64 fd332, fd330, fd298; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd328, fd298; +fma.rn.f64 fd335, fd330, fd297, fd334; +ld.global.v2.f64 {fd336, fd337}, [rd16+96]; +mul.f64 fd340, fd336, fd301; +mul.f64 fd341, fd337, fd302; +sub.f64 fd342, fd340, fd341; +mul.f64 fd343, fd336, fd302; +fma.rn.f64 fd344, fd337, fd301, fd343; +mul.f64 fd345, fd307, fd336; +mul.f64 fd346, fd308, fd337; +sub.f64 fd347, fd345, fd346; +mul.f64 fd348, fd307, fd337; +fma.rn.f64 fd349, fd308, fd336, fd348; +mul.f64 fd350, fd347, fd305; +mul.f64 fd351, fd349, fd306; +sub.f64 fd352, fd350, fd351; +mul.f64 fd353, fd347, fd306; +fma.rn.f64 fd354, fd349, fd305, fd353; +shl.b32 r21, r20, 3; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1728, r22; +st.shared.f64 [r23], fd295; +st.shared.f64 [r23+288], fd313; +st.shared.f64 [r23+576], fd323; +st.shared.f64 [r23+864], fd333; +st.shared.f64 [r23+1152], fd342; +st.shared.f64 [r23+1440], fd352; +barrier.sync 0; +ld.shared.f64 fd355, [r11]; +ld.shared.f64 fd356, [r11+1728]; +ld.shared.f64 fd357, [r11+3456]; +ld.shared.f64 fd358, [r11+5184]; +ld.shared.f64 fd359, [r11+6912]; +ld.shared.f64 fd360, [r11+8640]; +barrier.sync 0; +st.shared.f64 [r23], fd296; 
+st.shared.f64 [r23+288], fd315; +st.shared.f64 [r23+576], fd325; +st.shared.f64 [r23+864], fd335; +st.shared.f64 [r23+1152], fd344; +st.shared.f64 [r23+1440], fd354; +barrier.sync 0; +ld.shared.f64 fd361, [r11]; +ld.shared.f64 fd362, [r11+1728]; +ld.shared.f64 fd363, [r11+3456]; +ld.shared.f64 fd364, [r11+5184]; +ld.shared.f64 fd365, [r11+6912]; +ld.shared.f64 fd366, [r11+8640]; +add.f64 fd367, fd357, fd359; +add.f64 fd368, fd355, fd367; +add.f64 fd369, fd363, fd365; +add.f64 fd370, fd361, fd369; +mul.f64 fd371, fd367, 0d3FE0000000000000; +sub.f64 fd372, fd355, fd371; +sub.f64 fd373, fd363, fd365; +mul.f64 fd374, fd373, 0d3FEBB67AE8584CAA; +add.f64 fd375, fd374, fd372; +sub.f64 fd376, fd372, fd374; +mul.f64 fd377, fd369, 0d3FE0000000000000; +sub.f64 fd378, fd361, fd377; +sub.f64 fd379, fd357, fd359; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +sub.f64 fd381, fd378, fd380; +add.f64 fd382, fd380, fd378; +add.f64 fd383, fd358, fd360; +add.f64 fd384, fd356, fd383; +add.f64 fd385, fd364, fd366; +add.f64 fd386, fd362, fd385; +mul.f64 fd387, fd383, 0d3FE0000000000000; +sub.f64 fd388, fd356, fd387; +sub.f64 fd389, fd364, fd366; +mul.f64 fd390, fd389, 0d3FEBB67AE8584CAA; +add.f64 fd391, fd390, fd388; +sub.f64 fd392, fd388, fd390; +mul.f64 fd393, fd385, 0d3FE0000000000000; +sub.f64 fd394, fd362, fd393; +sub.f64 fd395, fd358, fd360; +mul.f64 fd396, fd395, 0d3FEBB67AE8584CAA; +sub.f64 fd397, fd394, fd396; +add.f64 fd398, fd396, fd394; +mul.f64 fd399, fd391, 0d3FE0000000000000; +mul.f64 fd400, fd397, 0dBFEBB67AE8584CAA; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd397, 0d3FE0000000000000; +fma.rn.f64 fd403, fd391, 0dBFEBB67AE8584CAA, fd402; +mul.f64 fd404, fd392, 0dBFE0000000000000; +mul.f64 fd405, fd398, 0dBFEBB67AE8584CAA; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd398, 0dBFE0000000000000; +fma.rn.f64 fd408, fd392, 0dBFEBB67AE8584CAA, fd407; +add.f64 %0, fd368, fd384; +add.f64 %1, fd370, fd386; +add.f64 %3, fd381, fd403; +add.f64 %2, fd375, fd401; +add.f64 %5, fd382, 
fd408; +add.f64 %4, fd376, fd406; +sub.f64 %6, fd368, fd384; +sub.f64 %7, fd370, fd386; +sub.f64 %9, fd381, fd403; +sub.f64 %8, fd375, fd401; +sub.f64 %11, fd382, fd408; +sub.f64 %10, fd376, fd406; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_1296), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..753db4c1a77e3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1296_fp64_inv.hpp.inc @@ -0,0 +1,910 @@ +#ifndef CUFFTDX_FFT_1296_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_1296_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<735, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<457>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 20736, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %21, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %23, %28; +add.f64 fd28, %17, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %16, fd29; +sub.f64 fd31, %23, %28; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %17, fd35; +sub.f64 fd37, %21, 
%26; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %24, %29; +add.f64 fd42, %18, fd41; +add.f64 fd43, %25, %30; +add.f64 fd44, %20, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %18, fd45; +sub.f64 fd47, %25, %30; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %20, fd51; +sub.f64 fd53, %24, %29; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +shr.u32 r5, r4, 3; +mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mad.lo.s32 r9, r6, 20736, r3; +mul.wide.u32 rd4, r8, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd70, fd78; +mul.f64 fd82, fd69, fd78; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd74, fd88; +mul.f64 fd90, fd73, fd88; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, 
fd68, fd96; +mul.f64 fd98, fd67, fd96; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+3456]; +mul.f64 fd104, fd72, fd101; +mul.f64 fd105, fd71, fd101; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd76, fd111; +mul.f64 fd113, fd75, fd111; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r10, r8, 96, r9; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r10], {fd116, fd115}; +fma.rn.f64 fd117, fd77, fd69, fd81; +sub.f64 fd118, fd83, fd82; +st.shared.v2.f64 [r10+16], {fd117, fd118}; +fma.rn.f64 fd119, fd86, fd73, fd89; +sub.f64 fd120, fd91, fd90; +st.shared.v2.f64 [r10+32], {fd119, fd120}; +fma.rn.f64 fd121, fd94, fd67, fd97; +sub.f64 fd122, fd99, fd98; +st.shared.v2.f64 [r10+48], {fd121, fd122}; +fma.rn.f64 fd123, fd100, fd71, fd104; +sub.f64 fd124, fd106, fd105; +st.shared.v2.f64 [r10+64], {fd123, fd124}; +fma.rn.f64 fd125, fd109, fd75, fd112; +sub.f64 fd126, fd114, fd113; +st.shared.v2.f64 [r10+80], {fd125, fd126}; +barrier.sync 0; +mad.lo.s32 r11, r8, -80, r10; +ld.shared.v2.f64 {fd127, fd128}, [r11]; +ld.shared.v2.f64 {fd131, fd132}, [r11+3456]; +ld.shared.v2.f64 {fd135, fd136}, [r11+6912]; +ld.shared.v2.f64 {fd139, fd140}, [r11+10368]; +ld.shared.v2.f64 {fd143, fd144}, [r11+13824]; +ld.shared.v2.f64 {fd147, fd148}, [r11+17280]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0dBFEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0dBFEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; 
+add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0dBFEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0d3FEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0d3FEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0d3FEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0d3FEBB67AE8584CAA, fd191; +sub.f64 fd193, fd152, fd168; +sub.f64 fd194, fd154, fd170; +add.f64 fd195, fd159, fd185; +add.f64 fd196, fd165, fd187; +sub.f64 fd197, fd159, fd185; +sub.f64 fd198, fd165, fd187; +add.f64 fd199, fd160, fd190; +add.f64 fd200, fd166, fd192; +sub.f64 fd201, fd160, fd190; +sub.f64 fd202, fd166, fd192; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd203, fd204}, [rd11]; +mul.f64 fd207, fd196, fd204; +mul.f64 fd208, fd195, fd204; +mul.f64 fd209, fd203, fd196; +mul.f64 fd210, fd203, fd203; +mul.f64 fd211, fd204, fd204; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd204, fd203; +fma.rn.f64 fd214, fd204, fd203, fd213; +mul.f64 fd215, fd200, fd214; +mul.f64 fd216, fd199, fd214; +mul.f64 fd217, fd212, fd200; +mul.f64 fd218, fd203, fd212; +mul.f64 fd219, fd204, fd214; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd203, fd214; +fma.rn.f64 fd222, fd204, fd212, 
fd221; +mul.f64 fd223, fd194, fd222; +mul.f64 fd224, fd193, fd222; +mul.f64 fd225, fd220, fd194; +ld.global.v2.f64 {fd226, fd227}, [rd11+576]; +mul.f64 fd230, fd198, fd227; +mul.f64 fd231, fd197, fd227; +mul.f64 fd232, fd226, fd198; +mul.f64 fd233, fd203, fd226; +mul.f64 fd234, fd204, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd203, fd227; +fma.rn.f64 fd237, fd204, fd226, fd236; +mul.f64 fd238, fd202, fd237; +mul.f64 fd239, fd201, fd237; +mul.f64 fd240, fd235, fd202; +shl.b32 r15, r14, 4; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 576, r16; +add.f64 fd241, fd154, fd170; +add.f64 fd242, fd152, fd168; +st.shared.v2.f64 [r17], {fd242, fd241}; +fma.rn.f64 fd243, fd203, fd195, fd207; +sub.f64 fd244, fd209, fd208; +st.shared.v2.f64 [r17+96], {fd243, fd244}; +fma.rn.f64 fd245, fd212, fd199, fd215; +sub.f64 fd246, fd217, fd216; +st.shared.v2.f64 [r17+192], {fd245, fd246}; +fma.rn.f64 fd247, fd220, fd193, fd223; +sub.f64 fd248, fd225, fd224; +st.shared.v2.f64 [r17+288], {fd247, fd248}; +fma.rn.f64 fd249, fd226, fd197, fd230; +sub.f64 fd250, fd232, fd231; +st.shared.v2.f64 [r17+384], {fd249, fd250}; +fma.rn.f64 fd251, fd235, fd201, fd238; +sub.f64 fd252, fd240, fd239; +st.shared.v2.f64 [r17+480], {fd251, fd252}; +barrier.sync 0; +ld.shared.v2.f64 {fd253, fd254}, [r11]; +ld.shared.v2.f64 {fd257, fd258}, [r11+3456]; +ld.shared.v2.f64 {fd261, fd262}, [r11+6912]; +ld.shared.v2.f64 {fd265, fd266}, [r11+10368]; +ld.shared.v2.f64 {fd269, fd270}, [r11+13824]; +ld.shared.v2.f64 {fd273, fd274}, [r11+17280]; +add.f64 fd277, fd261, fd269; +add.f64 fd278, fd253, fd277; +add.f64 fd279, fd262, fd270; +add.f64 fd280, fd254, fd279; +mul.f64 fd281, fd277, 0d3FE0000000000000; +sub.f64 fd282, fd253, fd281; +sub.f64 fd283, fd262, fd270; +mul.f64 fd284, fd283, 0dBFEBB67AE8584CAA; +add.f64 fd285, fd284, fd282; +sub.f64 fd286, fd282, fd284; +mul.f64 fd287, fd279, 0d3FE0000000000000; +sub.f64 fd288, fd254, fd287; +sub.f64 fd289, fd261, fd269; +mul.f64 fd290, fd289, 
0dBFEBB67AE8584CAA; +sub.f64 fd291, fd288, fd290; +add.f64 fd292, fd290, fd288; +add.f64 fd293, fd265, fd273; +add.f64 fd294, fd257, fd293; +add.f64 fd295, fd266, fd274; +add.f64 fd296, fd258, fd295; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd257, fd297; +sub.f64 fd299, fd266, fd274; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +mul.f64 fd303, fd295, 0d3FE0000000000000; +sub.f64 fd304, fd258, fd303; +sub.f64 fd305, fd265, fd273; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +mul.f64 fd309, fd301, 0d3FE0000000000000; +mul.f64 fd310, fd307, 0d3FEBB67AE8584CAA; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd307, 0d3FE0000000000000; +fma.rn.f64 fd313, fd301, 0d3FEBB67AE8584CAA, fd312; +mul.f64 fd314, fd302, 0dBFE0000000000000; +mul.f64 fd315, fd308, 0d3FEBB67AE8584CAA; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd308, 0dBFE0000000000000; +fma.rn.f64 fd318, fd302, 0d3FEBB67AE8584CAA, fd317; +sub.f64 fd319, fd278, fd294; +sub.f64 fd320, fd280, fd296; +add.f64 fd321, fd285, fd311; +add.f64 fd322, fd291, fd313; +sub.f64 fd323, fd285, fd311; +sub.f64 fd324, fd291, fd313; +add.f64 fd325, fd286, fd316; +add.f64 fd326, fd292, fd318; +sub.f64 fd327, fd286, fd316; +sub.f64 fd328, fd292, fd318; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd329, fd330}, [rd16]; +mul.f64 fd333, fd322, fd330; +mul.f64 fd334, fd321, fd330; +mul.f64 fd335, fd329, fd322; +mul.f64 fd336, fd329, fd329; +mul.f64 fd337, fd330, fd330; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd330, fd329; +fma.rn.f64 fd340, fd330, fd329, fd339; +mul.f64 fd341, fd326, fd340; +mul.f64 fd342, fd325, fd340; +mul.f64 fd343, fd338, fd326; +mul.f64 fd344, fd329, fd338; +mul.f64 fd345, fd330, fd340; +sub.f64 
fd346, fd344, fd345; +mul.f64 fd347, fd329, fd340; +fma.rn.f64 fd348, fd330, fd338, fd347; +mul.f64 fd349, fd320, fd348; +mul.f64 fd350, fd319, fd348; +mul.f64 fd351, fd346, fd320; +ld.global.v2.f64 {fd352, fd353}, [rd16+96]; +mul.f64 fd356, fd324, fd353; +mul.f64 fd357, fd323, fd353; +mul.f64 fd358, fd352, fd324; +mul.f64 fd359, fd329, fd352; +mul.f64 fd360, fd330, fd353; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd329, fd353; +fma.rn.f64 fd363, fd330, fd352, fd362; +mul.f64 fd364, fd328, fd363; +mul.f64 fd365, fd327, fd363; +mul.f64 fd366, fd361, fd328; +shl.b32 r21, r20, 4; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 3456, r22; +add.f64 fd367, fd280, fd296; +add.f64 fd368, fd278, fd294; +st.shared.v2.f64 [r23], {fd368, fd367}; +fma.rn.f64 fd369, fd329, fd321, fd333; +sub.f64 fd370, fd335, fd334; +st.shared.v2.f64 [r23+576], {fd369, fd370}; +fma.rn.f64 fd371, fd338, fd325, fd341; +sub.f64 fd372, fd343, fd342; +st.shared.v2.f64 [r23+1152], {fd371, fd372}; +fma.rn.f64 fd373, fd346, fd319, fd349; +sub.f64 fd374, fd351, fd350; +st.shared.v2.f64 [r23+1728], {fd373, fd374}; +fma.rn.f64 fd375, fd352, fd323, fd356; +sub.f64 fd376, fd358, fd357; +st.shared.v2.f64 [r23+2304], {fd375, fd376}; +fma.rn.f64 fd377, fd361, fd327, fd364; +sub.f64 fd378, fd366, fd365; +st.shared.v2.f64 [r23+2880], {fd377, fd378}; +barrier.sync 0; +ld.shared.v2.f64 {fd379, fd380}, [r11]; +ld.shared.v2.f64 {fd383, fd384}, [r11+3456]; +ld.shared.v2.f64 {fd387, fd388}, [r11+6912]; +ld.shared.v2.f64 {fd391, fd392}, [r11+10368]; +ld.shared.v2.f64 {fd395, fd396}, [r11+13824]; +ld.shared.v2.f64 {fd399, fd400}, [r11+17280]; +add.f64 fd403, fd387, fd395; +add.f64 fd404, fd379, fd403; +add.f64 fd405, fd388, fd396; +add.f64 fd406, fd380, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd379, fd407; +sub.f64 fd409, fd388, fd396; +mul.f64 fd410, fd409, 0dBFEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 
0d3FE0000000000000; +sub.f64 fd414, fd380, fd413; +sub.f64 fd415, fd387, fd395; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +add.f64 fd419, fd391, fd399; +add.f64 fd420, fd383, fd419; +add.f64 fd421, fd392, fd400; +add.f64 fd422, fd384, fd421; +mul.f64 fd423, fd419, 0d3FE0000000000000; +sub.f64 fd424, fd383, fd423; +sub.f64 fd425, fd392, fd400; +mul.f64 fd426, fd425, 0dBFEBB67AE8584CAA; +add.f64 fd427, fd426, fd424; +sub.f64 fd428, fd424, fd426; +mul.f64 fd429, fd421, 0d3FE0000000000000; +sub.f64 fd430, fd384, fd429; +sub.f64 fd431, fd391, fd399; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +sub.f64 fd433, fd430, fd432; +add.f64 fd434, fd432, fd430; +mul.f64 fd435, fd427, 0d3FE0000000000000; +mul.f64 fd436, fd433, 0d3FEBB67AE8584CAA; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd433, 0d3FE0000000000000; +fma.rn.f64 fd439, fd427, 0d3FEBB67AE8584CAA, fd438; +mul.f64 fd440, fd428, 0dBFE0000000000000; +mul.f64 fd441, fd434, 0d3FEBB67AE8584CAA; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd434, 0dBFE0000000000000; +fma.rn.f64 fd444, fd428, 0d3FEBB67AE8584CAA, fd443; +add.f64 %1, fd406, fd422; +add.f64 %0, fd404, fd420; +add.f64 %3, fd417, fd439; +add.f64 %2, fd411, fd437; +add.f64 %5, fd418, fd444; +add.f64 %4, fd412, fd442; +sub.f64 %7, fd406, fd422; +sub.f64 %6, fd404, fd420; +sub.f64 %9, fd417, fd439; +sub.f64 %8, fd411, fd437; +sub.f64 %11, fd418, fd444; +sub.f64 %10, fd412, fd442; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_1296), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<734, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<421>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 10368, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %21, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %23, %28; +add.f64 fd28, %17, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %16, fd29; +sub.f64 fd31, %23, %28; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %17, fd35; +sub.f64 fd37, %21, %26; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %24, %29; +add.f64 fd42, %18, fd41; +add.f64 fd43, %25, %30; +add.f64 fd44, %20, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %18, fd45; +sub.f64 fd47, %25, %30; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %20, fd51; +sub.f64 fd53, %24, %29; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +shr.u32 r5, r4, 3; 
+mul.wide.u32 rd2, r5, 159072863; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6, rd3; +mul.lo.s32 r7, r6, 216; +sub.s32 r8, r4, r7; +mul.wide.u32 rd4, r8, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd72, fd80; +fma.rn.f64 fd84, fd79, fd71, fd83; +mul.f64 fd85, fd71, fd80; +mul.f64 fd86, fd79, fd72; +sub.f64 fd87, fd86, fd85; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd76, fd92; +fma.rn.f64 fd94, fd90, fd75, fd93; +mul.f64 fd95, fd75, fd92; +mul.f64 fd96, fd90, fd76; +sub.f64 fd97, fd96, fd95; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd70, fd102; +fma.rn.f64 fd104, fd100, fd69, fd103; +mul.f64 fd105, fd69, fd102; +mul.f64 fd106, fd100, fd70; +sub.f64 fd107, fd106, fd105; +ld.global.v2.f64 {fd108, fd109}, [rd6+3456]; +mul.f64 fd112, fd74, fd109; +fma.rn.f64 fd113, fd108, fd73, fd112; +mul.f64 fd114, fd73, fd109; +mul.f64 fd115, fd108, fd74; +sub.f64 fd116, fd115, fd114; +mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd78, fd121; +fma.rn.f64 fd123, fd119, fd77, fd122; +mul.f64 fd124, fd77, fd121; +mul.f64 fd125, fd119, fd78; +sub.f64 fd126, fd125, fd124; +mad.lo.s32 r9, r6, 10368, r3; +barrier.sync 0; +mad.lo.s32 r10, r8, 48, r9; +st.shared.v2.f64 [r10], {fd67, fd84}; +st.shared.v2.f64 [r10+16], {fd94, fd104}; +st.shared.v2.f64 [r10+32], {fd113, fd123}; +barrier.sync 0; +mad.lo.s32 r11, r8, -40, r10; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+1728]; +ld.shared.f64 fd129, [r11+3456]; +ld.shared.f64 fd130, [r11+5184]; +ld.shared.f64 fd131, [r11+6912]; +ld.shared.f64 fd132, [r11+8640]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd68, fd87}; 
+st.shared.v2.f64 [r10+16], {fd97, fd107}; +st.shared.v2.f64 [r10+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r11]; +ld.shared.f64 fd134, [r11+1728]; +ld.shared.f64 fd135, [r11+3456]; +ld.shared.f64 fd136, [r11+5184]; +ld.shared.f64 fd137, [r11+6912]; +ld.shared.f64 fd138, [r11+8640]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0dBFEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; +sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; +sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0d3FEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0d3FEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; +mul.f64 fd177, fd170, 0d3FEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0d3FEBB67AE8584CAA, fd179; +add.f64 fd181, fd140, fd156; +add.f64 fd182, fd142, fd158; +sub.f64 fd183, fd140, fd156; +sub.f64 fd184, fd142, fd158; +add.f64 fd185, fd147, fd173; +add.f64 fd186, fd153, fd175; +sub.f64 fd187, fd147, 
fd173; +sub.f64 fd188, fd153, fd175; +add.f64 fd189, fd148, fd178; +add.f64 fd190, fd154, fd180; +sub.f64 fd191, fd148, fd178; +sub.f64 fd192, fd154, fd180; +mul.wide.u32 rd7, r8, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 6; +sub.s32 r14, r8, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd193, fd194}, [rd11]; +mul.f64 fd197, fd186, fd194; +fma.rn.f64 fd198, fd193, fd185, fd197; +mul.f64 fd199, fd185, fd194; +mul.f64 fd200, fd193, fd186; +sub.f64 fd201, fd200, fd199; +mul.f64 fd202, fd193, fd193; +mul.f64 fd203, fd194, fd194; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd194, fd193; +fma.rn.f64 fd206, fd194, fd193, fd205; +mul.f64 fd207, fd190, fd206; +fma.rn.f64 fd208, fd204, fd189, fd207; +mul.f64 fd209, fd189, fd206; +mul.f64 fd210, fd204, fd190; +sub.f64 fd211, fd210, fd209; +mul.f64 fd212, fd193, fd204; +mul.f64 fd213, fd194, fd206; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd193, fd206; +fma.rn.f64 fd216, fd194, fd204, fd215; +mul.f64 fd217, fd184, fd216; +fma.rn.f64 fd218, fd214, fd183, fd217; +mul.f64 fd219, fd183, fd216; +mul.f64 fd220, fd214, fd184; +sub.f64 fd221, fd220, fd219; +ld.global.v2.f64 {fd222, fd223}, [rd11+576]; +mul.f64 fd226, fd188, fd223; +fma.rn.f64 fd227, fd222, fd187, fd226; +mul.f64 fd228, fd187, fd223; +mul.f64 fd229, fd222, fd188; +sub.f64 fd230, fd229, fd228; +mul.f64 fd231, fd193, fd222; +mul.f64 fd232, fd194, fd223; +sub.f64 fd233, fd231, fd232; +mul.f64 fd234, fd193, fd223; +fma.rn.f64 fd235, fd194, fd222, fd234; +mul.f64 fd236, fd192, fd235; +fma.rn.f64 fd237, fd233, fd191, fd236; +mul.f64 fd238, fd191, fd235; +mul.f64 fd239, fd233, fd192; +sub.f64 fd240, fd239, fd238; +shl.b32 r15, r14, 3; +add.s32 r16, r9, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 288, r16; +st.shared.f64 [r17], fd181; +st.shared.f64 [r17+48], fd198; +st.shared.f64 [r17+96], fd208; +st.shared.f64 [r17+144], fd218; +st.shared.f64 [r17+192], fd227; +st.shared.f64 
[r17+240], fd237; +barrier.sync 0; +ld.shared.f64 fd241, [r11]; +ld.shared.f64 fd242, [r11+1728]; +ld.shared.f64 fd243, [r11+3456]; +ld.shared.f64 fd244, [r11+5184]; +ld.shared.f64 fd245, [r11+6912]; +ld.shared.f64 fd246, [r11+8640]; +barrier.sync 0; +st.shared.f64 [r17], fd182; +st.shared.f64 [r17+48], fd201; +st.shared.f64 [r17+96], fd211; +st.shared.f64 [r17+144], fd221; +st.shared.f64 [r17+192], fd230; +st.shared.f64 [r17+240], fd240; +barrier.sync 0; +ld.shared.f64 fd247, [r11]; +ld.shared.f64 fd248, [r11+1728]; +ld.shared.f64 fd249, [r11+3456]; +ld.shared.f64 fd250, [r11+5184]; +ld.shared.f64 fd251, [r11+6912]; +ld.shared.f64 fd252, [r11+8640]; +add.f64 fd253, fd243, fd245; +add.f64 fd254, fd241, fd253; +add.f64 fd255, fd249, fd251; +add.f64 fd256, fd247, fd255; +mul.f64 fd257, fd253, 0d3FE0000000000000; +sub.f64 fd258, fd241, fd257; +sub.f64 fd259, fd249, fd251; +mul.f64 fd260, fd259, 0dBFEBB67AE8584CAA; +add.f64 fd261, fd260, fd258; +sub.f64 fd262, fd258, fd260; +mul.f64 fd263, fd255, 0d3FE0000000000000; +sub.f64 fd264, fd247, fd263; +sub.f64 fd265, fd243, fd245; +mul.f64 fd266, fd265, 0dBFEBB67AE8584CAA; +sub.f64 fd267, fd264, fd266; +add.f64 fd268, fd266, fd264; +add.f64 fd269, fd244, fd246; +add.f64 fd270, fd242, fd269; +add.f64 fd271, fd250, fd252; +add.f64 fd272, fd248, fd271; +mul.f64 fd273, fd269, 0d3FE0000000000000; +sub.f64 fd274, fd242, fd273; +sub.f64 fd275, fd250, fd252; +mul.f64 fd276, fd275, 0dBFEBB67AE8584CAA; +add.f64 fd277, fd276, fd274; +sub.f64 fd278, fd274, fd276; +mul.f64 fd279, fd271, 0d3FE0000000000000; +sub.f64 fd280, fd248, fd279; +sub.f64 fd281, fd244, fd246; +mul.f64 fd282, fd281, 0dBFEBB67AE8584CAA; +sub.f64 fd283, fd280, fd282; +add.f64 fd284, fd282, fd280; +mul.f64 fd285, fd277, 0d3FE0000000000000; +mul.f64 fd286, fd283, 0d3FEBB67AE8584CAA; +sub.f64 fd287, fd285, fd286; +mul.f64 fd288, fd283, 0d3FE0000000000000; +fma.rn.f64 fd289, fd277, 0d3FEBB67AE8584CAA, fd288; +mul.f64 fd290, fd278, 0dBFE0000000000000; +mul.f64 fd291, 
fd284, 0d3FEBB67AE8584CAA; +sub.f64 fd292, fd290, fd291; +mul.f64 fd293, fd284, 0dBFE0000000000000; +fma.rn.f64 fd294, fd278, 0d3FEBB67AE8584CAA, fd293; +add.f64 fd295, fd254, fd270; +add.f64 fd296, fd256, fd272; +sub.f64 fd297, fd254, fd270; +sub.f64 fd298, fd256, fd272; +add.f64 fd299, fd261, fd287; +add.f64 fd300, fd267, fd289; +sub.f64 fd301, fd261, fd287; +sub.f64 fd302, fd267, fd289; +add.f64 fd303, fd262, fd292; +add.f64 fd304, fd268, fd294; +sub.f64 fd305, fd262, fd292; +sub.f64 fd306, fd268, fd294; +mul.wide.u32 rd12, r8, 954437177; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 36; +sub.s32 r20, r8, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %15; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd307, fd308}, [rd16]; +mul.f64 fd311, fd300, fd308; +fma.rn.f64 fd312, fd307, fd299, fd311; +mul.f64 fd313, fd299, fd308; +mul.f64 fd314, fd307, fd300; +sub.f64 fd315, fd314, fd313; +mul.f64 fd316, fd307, fd307; +mul.f64 fd317, fd308, fd308; +sub.f64 fd318, fd316, fd317; +mul.f64 fd319, fd308, fd307; +fma.rn.f64 fd320, fd308, fd307, fd319; +mul.f64 fd321, fd304, fd320; +fma.rn.f64 fd322, fd318, fd303, fd321; +mul.f64 fd323, fd303, fd320; +mul.f64 fd324, fd318, fd304; +sub.f64 fd325, fd324, fd323; +mul.f64 fd326, fd307, fd318; +mul.f64 fd327, fd308, fd320; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd307, fd320; +fma.rn.f64 fd330, fd308, fd318, fd329; +mul.f64 fd331, fd298, fd330; +fma.rn.f64 fd332, fd328, fd297, fd331; +mul.f64 fd333, fd297, fd330; +mul.f64 fd334, fd328, fd298; +sub.f64 fd335, fd334, fd333; +ld.global.v2.f64 {fd336, fd337}, [rd16+96]; +mul.f64 fd340, fd302, fd337; +fma.rn.f64 fd341, fd336, fd301, fd340; +mul.f64 fd342, fd301, fd337; +mul.f64 fd343, fd336, fd302; +sub.f64 fd344, fd343, fd342; +mul.f64 fd345, fd307, fd336; +mul.f64 fd346, fd308, fd337; +sub.f64 fd347, fd345, fd346; +mul.f64 fd348, fd307, fd337; +fma.rn.f64 fd349, fd308, fd336, fd348; +mul.f64 fd350, fd306, fd349; +fma.rn.f64 fd351, fd347, fd305, fd350; 
+mul.f64 fd352, fd305, fd349; +mul.f64 fd353, fd347, fd306; +sub.f64 fd354, fd353, fd352; +shl.b32 r21, r20, 3; +add.s32 r22, r9, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1728, r22; +st.shared.f64 [r23], fd295; +st.shared.f64 [r23+288], fd312; +st.shared.f64 [r23+576], fd322; +st.shared.f64 [r23+864], fd332; +st.shared.f64 [r23+1152], fd341; +st.shared.f64 [r23+1440], fd351; +barrier.sync 0; +ld.shared.f64 fd355, [r11]; +ld.shared.f64 fd356, [r11+1728]; +ld.shared.f64 fd357, [r11+3456]; +ld.shared.f64 fd358, [r11+5184]; +ld.shared.f64 fd359, [r11+6912]; +ld.shared.f64 fd360, [r11+8640]; +barrier.sync 0; +st.shared.f64 [r23], fd296; +st.shared.f64 [r23+288], fd315; +st.shared.f64 [r23+576], fd325; +st.shared.f64 [r23+864], fd335; +st.shared.f64 [r23+1152], fd344; +st.shared.f64 [r23+1440], fd354; +barrier.sync 0; +ld.shared.f64 fd361, [r11]; +ld.shared.f64 fd362, [r11+1728]; +ld.shared.f64 fd363, [r11+3456]; +ld.shared.f64 fd364, [r11+5184]; +ld.shared.f64 fd365, [r11+6912]; +ld.shared.f64 fd366, [r11+8640]; +add.f64 fd367, fd357, fd359; +add.f64 fd368, fd355, fd367; +add.f64 fd369, fd363, fd365; +add.f64 fd370, fd361, fd369; +mul.f64 fd371, fd367, 0d3FE0000000000000; +sub.f64 fd372, fd355, fd371; +sub.f64 fd373, fd363, fd365; +mul.f64 fd374, fd373, 0dBFEBB67AE8584CAA; +add.f64 fd375, fd374, fd372; +sub.f64 fd376, fd372, fd374; +mul.f64 fd377, fd369, 0d3FE0000000000000; +sub.f64 fd378, fd361, fd377; +sub.f64 fd379, fd357, fd359; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +sub.f64 fd381, fd378, fd380; +add.f64 fd382, fd380, fd378; +add.f64 fd383, fd358, fd360; +add.f64 fd384, fd356, fd383; +add.f64 fd385, fd364, fd366; +add.f64 fd386, fd362, fd385; +mul.f64 fd387, fd383, 0d3FE0000000000000; +sub.f64 fd388, fd356, fd387; +sub.f64 fd389, fd364, fd366; +mul.f64 fd390, fd389, 0dBFEBB67AE8584CAA; +add.f64 fd391, fd390, fd388; +sub.f64 fd392, fd388, fd390; +mul.f64 fd393, fd385, 0d3FE0000000000000; +sub.f64 fd394, fd362, fd393; +sub.f64 fd395, fd358, fd360; +mul.f64 
fd396, fd395, 0dBFEBB67AE8584CAA; +sub.f64 fd397, fd394, fd396; +add.f64 fd398, fd396, fd394; +mul.f64 fd399, fd391, 0d3FE0000000000000; +mul.f64 fd400, fd397, 0d3FEBB67AE8584CAA; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd397, 0d3FE0000000000000; +fma.rn.f64 fd403, fd391, 0d3FEBB67AE8584CAA, fd402; +mul.f64 fd404, fd392, 0dBFE0000000000000; +mul.f64 fd405, fd398, 0d3FEBB67AE8584CAA; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd398, 0dBFE0000000000000; +fma.rn.f64 fd408, fd392, 0d3FEBB67AE8584CAA, fd407; +add.f64 %0, fd368, fd384; +add.f64 %1, fd370, fd386; +add.f64 %3, fd381, fd403; +add.f64 %2, fd375, fd401; +add.f64 %5, fd382, fd408; +add.f64 %4, fd376, fd406; +sub.f64 %6, fd368, fd384; +sub.f64 %7, fd370, fd386; +sub.f64 %9, fd381, fd403; +sub.f64 %8, fd375, fd401; +sub.f64 %11, fd382, fd408; +sub.f64 %10, fd376, fd406; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_1296), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..eafd2041dc101 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_fwd.hpp.inc @@ -0,0 +1,733 @@ +#ifndef CUFFTDX_FFT_12_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_12_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<943, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<59>; +.reg .b32 r<611>; +.reg .f64 fd<51>; +.reg .b64 rd<2>; +mov.f64 fd38, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd38; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd37, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd37; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %32, %40; +} +{ +add.f16x2 r4, %24, r1; +} +{ +add.f16x2 r7, %33, %41; +} +{ +add.f16x2 r10, %25, r7; +} +{ +add.f16x2 r13, %32, %40; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %24, r16; +} +{ +sub.f16x2 r22, %33, %41; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %32, %40; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %24, r34; +} +{ +sub.f16x2 r40, %33, %41; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %33, %41; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %25, r52; +} +{ +sub.f16x2 r58, %32, %40; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %33, %41; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %25, r70; +} +{ +sub.f16x2 r76, %32, %40; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd38; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd37; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %36, %44; +} +{ +add.f16x2 r88, %28, r85; +} +{ +add.f16x2 r91, %37, %45; +} +{ +add.f16x2 r94, %29, r91; +} +{ +add.f16x2 r97, %36, %44; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %28, r100; +} +{ +sub.f16x2 r106, %37, %45; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %36, %44; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %28, r118; +} +{ +sub.f16x2 r124, %37, %45; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 
r133, %37, %45; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %29, r136; +} +{ +sub.f16x2 r142, %36, %44; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %37, %45; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %29, r154; +} +{ +sub.f16x2 r160, %36, %44; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd31, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs9, fd31; +} +{ +cvt.rn.f16.f64 rs10, fd37; +} +{ +cvt.rn.f16.f64 rs11, fd38; +} +{ +cvt.rn.f16.f64 rs12, fd37; +} +mov.f64 fd29, 0d3FEBB67AE8584CAA; +mov.b32 r183, {rs9, rs9}; +{ +mul.f16x2 r169, r112, r183; +} +mov.b32 r180, {rs10, rs10}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs11, rs11}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs12, rs12}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ +fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 r201, r4, r88; +} +{ +add.f16x2 r204, r10, r94; +} +{ +sub.f16x2 r207, r4, r88; +} +{ +sub.f16x2 r210, r10, r94; +} +{ +add.f16x2 r213, r28, r175; +} +{ +add.f16x2 r216, r64, r181; +} +{ +sub.f16x2 r219, r28, r175; +} +{ +sub.f16x2 r222, r64, r181; +} +{ +add.f16x2 r225, r46, r191; +} +{ +add.f16x2 r228, r82, r197; +} +{ +sub.f16x2 r231, r46, r191; +} +{ +sub.f16x2 r234, r82, r197; +} +{ +cvt.rn.f16.f64 rs19, fd38; +} +mov.b32 r308, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd37; +} +{ +neg.f16 rs21, rs20; +} +mov.b32 r317, {rs21, rs21}; +{ +add.f16x2 r237, %34, %42; +} +{ +add.f16x2 r240, %26, r237; +} +{ +add.f16x2 r243, %35, %43; +} +{ +add.f16x2 r246, %27, r243; +} +{ +add.f16x2 r249, %34, %42; +} +{ +mul.f16x2 r252, r249, r308; +} +{ +add.f16x2 r255, %26, r252; +} +{ +sub.f16x2 r258, %35, %43; +} +{ +mul.f16x2 r261, r258, r317; +} +{ +add.f16x2 r264, r255, r261; +} +{ +add.f16x2 r267, 
%34, %42; +} +{ +mul.f16x2 r270, r267, r308; +} +{ +add.f16x2 r273, %26, r270; +} +{ +sub.f16x2 r276, %35, %43; +} +{ +mul.f16x2 r279, r276, r317; +} +{ +sub.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %35, %43; +} +{ +mul.f16x2 r288, r285, r308; +} +{ +add.f16x2 r291, %27, r288; +} +{ +sub.f16x2 r294, %34, %42; +} +{ +mul.f16x2 r297, r294, r317; +} +{ +sub.f16x2 r300, r291, r297; +} +{ +add.f16x2 r303, %35, %43; +} +{ +mul.f16x2 r306, r303, r308; +} +{ +add.f16x2 r309, %27, r306; +} +{ +sub.f16x2 r312, %34, %42; +} +{ +mul.f16x2 r315, r312, r317; +} +{ +add.f16x2 r318, r309, r315; +} +{ +cvt.rn.f16.f64 rs23, fd38; +} +mov.b32 r392, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs24, fd37; +} +{ +neg.f16 rs25, rs24; +} +mov.b32 r401, {rs25, rs25}; +{ +add.f16x2 r321, %38, %46; +} +{ +add.f16x2 r324, %30, r321; +} +{ +add.f16x2 r327, %39, %47; +} +{ +add.f16x2 r330, %31, r327; +} +{ +add.f16x2 r333, %38, %46; +} +{ +mul.f16x2 r336, r333, r392; +} +{ +add.f16x2 r339, %30, r336; +} +{ +sub.f16x2 r342, %39, %47; +} +{ +mul.f16x2 r345, r342, r401; +} +{ +add.f16x2 r348, r339, r345; +} +{ +add.f16x2 r351, %38, %46; +} +{ +mul.f16x2 r354, r351, r392; +} +{ +add.f16x2 r357, %30, r354; +} +{ +sub.f16x2 r360, %39, %47; +} +{ +mul.f16x2 r363, r360, r401; +} +{ +sub.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r392; +} +{ +add.f16x2 r375, %31, r372; +} +{ +sub.f16x2 r378, %38, %46; +} +{ +mul.f16x2 r381, r378, r401; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %39, %47; +} +{ +mul.f16x2 r390, r387, r392; +} +{ +add.f16x2 r393, %31, r390; +} +{ +sub.f16x2 r396, %38, %46; +} +{ +mul.f16x2 r399, r396, r401; +} +{ +add.f16x2 r402, r393, r399; +} +{ +cvt.rn.f16.f64 rs27, fd31; +} +{ +cvt.rn.f16.f64 rs28, fd37; +} +{ +cvt.rn.f16.f64 rs29, fd38; +} +{ +cvt.rn.f16.f64 rs30, fd37; +} +mov.b32 r419, {rs27, rs27}; +{ +mul.f16x2 r405, r348, r419; +} +mov.b32 r416, {rs28, rs28}; +{ +mul.f16x2 r408, r384, r416; +} +{ +sub.f16x2 r411, r405, r408; 
+} +{ +mul.f16x2 r414, r348, r416; +} +{ +fma.rn.f16x2 r417, r384, r419, r414; +} +mov.b32 r435, {rs29, rs29}; +{ +mul.f16x2 r421, r366, r435; +} +mov.b32 r432, {rs30, rs30}; +{ +mul.f16x2 r424, r402, r432; +} +{ +sub.f16x2 r427, r421, r424; +} +{ +mul.f16x2 r430, r366, r432; +} +{ +fma.rn.f16x2 r433, r402, r435, r430; +} +{ +add.f16x2 r437, r240, r324; +} +{ +add.f16x2 r440, r246, r330; +} +{ +sub.f16x2 r443, r240, r324; +} +{ +sub.f16x2 r446, r246, r330; +} +{ +add.f16x2 r449, r264, r411; +} +{ +add.f16x2 r452, r300, r417; +} +{ +sub.f16x2 r455, r264, r411; +} +{ +sub.f16x2 r458, r300, r417; +} +{ +add.f16x2 r461, r282, r427; +} +{ +add.f16x2 r464, r318, r433; +} +{ +sub.f16x2 r467, r282, r427; +} +{ +sub.f16x2 r470, r318, r433; +} +{ +cvt.rn.f16.f64 rs37, fd29; +} +{ +cvt.rn.f16.f64 rs38, fd38; +} +{ +cvt.rn.f16.f64 rs39, fd31; +} +{ +cvt.rn.f16.f64 rs40, fd37; +} +{ +cvt.rn.f16.f64 rs43, fd38; +} +{ +cvt.rn.f16.f64 rs44, fd37; +} +{ +cvt.rn.f16.f64 rs45, fd37; +} +{ +cvt.rn.f16.f64 rs46, fd38; +} +mov.b32 r487, {rs37, rs37}; +{ +mul.f16x2 r473, r449, r487; +} +mov.b32 r484, {rs38, rs38}; +{ +mul.f16x2 r476, r452, r484; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r449, r484; +} +{ +fma.rn.f16x2 r485, r452, r487, r482; +} +mov.b32 r503, {rs39, rs39}; +{ +mul.f16x2 r489, r461, r503; +} +mov.b32 r500, {rs40, rs40}; +{ +mul.f16x2 r492, r464, r500; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r461, r500; +} +{ +fma.rn.f16x2 r501, r464, r503, r498; +} +{ +neg.f16x2 r505, r443; +} +mov.b32 r521, {rs43, rs43}; +{ +mul.f16x2 r507, r455, r521; +} +mov.b32 r518, {rs44, rs44}; +{ +mul.f16x2 r510, r458, r518; +} +{ +sub.f16x2 r513, r507, r510; +} +{ +mul.f16x2 r516, r455, r518; +} +{ +fma.rn.f16x2 r519, r458, r521, r516; +} +mov.b32 r537, {rs45, rs45}; +{ +mul.f16x2 r523, r467, r537; +} +mov.b32 r534, {rs46, rs46}; +{ +mul.f16x2 r526, r470, r534; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r467, r534; +} +{ +fma.rn.f16x2 r535, r470, 
r537, r532; +} +{ +add.f16x2 %0, r201, r437; +} +{ +add.f16x2 %1, r204, r440; +} +{ +sub.f16x2 %12, r201, r437; +} +{ +sub.f16x2 %13, r204, r440; +} +{ +add.f16x2 %2, r213, r479; +} +{ +add.f16x2 %3, r216, r485; +} +{ +sub.f16x2 %14, r213, r479; +} +{ +sub.f16x2 %15, r216, r485; +} +{ +add.f16x2 %4, r225, r495; +} +{ +add.f16x2 %5, r228, r501; +} +{ +sub.f16x2 %16, r225, r495; +} +{ +sub.f16x2 %17, r228, r501; +} +{ +add.f16x2 %6, r207, r446; +} +{ +add.f16x2 %7, r210, r505; +} +{ +sub.f16x2 %18, r207, r446; +} +{ +sub.f16x2 %19, r210, r505; +} +{ +add.f16x2 %8, r219, r513; +} +{ +add.f16x2 %9, r222, r519; +} +{ +sub.f16x2 %20, r219, r513; +} +{ +sub.f16x2 %21, r222, r519; +} +{ +add.f16x2 %10, r231, r529; +} +{ +add.f16x2 %11, r234, r535; +} +{ +sub.f16x2 %22, r231, r529; +} +{ +sub.f16x2 %23, r234, r535; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..5d1b1461c60a5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp16_inv.hpp.inc @@ -0,0 +1,721 @@ +#ifndef CUFFTDX_FFT_12_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_12_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1145, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<51>; +.reg .b32 r<611>; +.reg .f64 fd<51>; +.reg .b64 rd<2>; +mov.f64 fd35, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd35; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd37, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd37; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %32, %40; +} +{ +add.f16x2 r4, %24, r1; +} +{ +add.f16x2 r7, %33, %41; +} +{ +add.f16x2 r10, %25, r7; +} +{ +add.f16x2 r13, %32, %40; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %24, r16; +} +{ +sub.f16x2 r22, %33, %41; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %32, %40; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %24, r34; +} +{ +sub.f16x2 r40, %33, %41; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %33, %41; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %25, r52; +} +{ 
+sub.f16x2 r58, %32, %40; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %33, %41; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %25, r70; +} +{ +sub.f16x2 r76, %32, %40; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd35; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd37; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %36, %44; +} +{ +add.f16x2 r88, %28, r85; +} +{ +add.f16x2 r91, %37, %45; +} +{ +add.f16x2 r94, %29, r91; +} +{ +add.f16x2 r97, %36, %44; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %28, r100; +} +{ +sub.f16x2 r106, %37, %45; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %36, %44; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %28, r118; +} +{ +sub.f16x2 r124, %37, %45; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %37, %45; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %29, r136; +} +{ +sub.f16x2 r142, %36, %44; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %37, %45; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %29, r154; +} +{ +sub.f16x2 r160, %36, %44; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd38, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs5, fd38; +} +mov.f64 fd36, 0d3FEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs6, fd36; +} +{ +cvt.rn.f16.f64 rs7, fd35; +} +{ +cvt.rn.f16.f64 rs8, fd36; +} +mov.b32 r183, {rs5, rs5}; +{ +mul.f16x2 r169, r112, r183; +} +mov.b32 r180, {rs6, rs6}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs7, rs7}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs8, rs8}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ 
+fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 r201, r4, r88; +} +{ +add.f16x2 r204, r10, r94; +} +{ +sub.f16x2 r207, r4, r88; +} +{ +sub.f16x2 r210, r10, r94; +} +{ +add.f16x2 r213, r28, r175; +} +{ +add.f16x2 r216, r64, r181; +} +{ +sub.f16x2 r219, r28, r175; +} +{ +sub.f16x2 r222, r64, r181; +} +{ +add.f16x2 r225, r46, r191; +} +{ +add.f16x2 r228, r82, r197; +} +{ +sub.f16x2 r231, r46, r191; +} +{ +sub.f16x2 r234, r82, r197; +} +{ +cvt.rn.f16.f64 rs15, fd35; +} +mov.b32 r308, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd37; +} +mov.b32 r317, {rs16, rs16}; +{ +add.f16x2 r237, %34, %42; +} +{ +add.f16x2 r240, %26, r237; +} +{ +add.f16x2 r243, %35, %43; +} +{ +add.f16x2 r246, %27, r243; +} +{ +add.f16x2 r249, %34, %42; +} +{ +mul.f16x2 r252, r249, r308; +} +{ +add.f16x2 r255, %26, r252; +} +{ +sub.f16x2 r258, %35, %43; +} +{ +mul.f16x2 r261, r258, r317; +} +{ +add.f16x2 r264, r255, r261; +} +{ +add.f16x2 r267, %34, %42; +} +{ +mul.f16x2 r270, r267, r308; +} +{ +add.f16x2 r273, %26, r270; +} +{ +sub.f16x2 r276, %35, %43; +} +{ +mul.f16x2 r279, r276, r317; +} +{ +sub.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %35, %43; +} +{ +mul.f16x2 r288, r285, r308; +} +{ +add.f16x2 r291, %27, r288; +} +{ +sub.f16x2 r294, %34, %42; +} +{ +mul.f16x2 r297, r294, r317; +} +{ +sub.f16x2 r300, r291, r297; +} +{ +add.f16x2 r303, %35, %43; +} +{ +mul.f16x2 r306, r303, r308; +} +{ +add.f16x2 r309, %27, r306; +} +{ +sub.f16x2 r312, %34, %42; +} +{ +mul.f16x2 r315, r312, r317; +} +{ +add.f16x2 r318, r309, r315; +} +{ +cvt.rn.f16.f64 rs17, fd35; +} +mov.b32 r392, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd37; +} +mov.b32 r401, {rs18, rs18}; +{ +add.f16x2 r321, %38, %46; +} +{ +add.f16x2 r324, %30, r321; +} +{ +add.f16x2 r327, %39, %47; +} +{ +add.f16x2 r330, %31, r327; +} +{ +add.f16x2 r333, %38, %46; +} +{ +mul.f16x2 r336, r333, r392; +} +{ +add.f16x2 r339, %30, r336; +} +{ +sub.f16x2 r342, %39, %47; +} +{ +mul.f16x2 r345, r342, r401; +} +{ +add.f16x2 r348, r339, r345; +} +{ 
+add.f16x2 r351, %38, %46; +} +{ +mul.f16x2 r354, r351, r392; +} +{ +add.f16x2 r357, %30, r354; +} +{ +sub.f16x2 r360, %39, %47; +} +{ +mul.f16x2 r363, r360, r401; +} +{ +sub.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r392; +} +{ +add.f16x2 r375, %31, r372; +} +{ +sub.f16x2 r378, %38, %46; +} +{ +mul.f16x2 r381, r378, r401; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %39, %47; +} +{ +mul.f16x2 r390, r387, r392; +} +{ +add.f16x2 r393, %31, r390; +} +{ +sub.f16x2 r396, %38, %46; +} +{ +mul.f16x2 r399, r396, r401; +} +{ +add.f16x2 r402, r393, r399; +} +{ +cvt.rn.f16.f64 rs19, fd38; +} +{ +cvt.rn.f16.f64 rs20, fd36; +} +{ +cvt.rn.f16.f64 rs21, fd35; +} +{ +cvt.rn.f16.f64 rs22, fd36; +} +mov.b32 r419, {rs19, rs19}; +{ +mul.f16x2 r405, r348, r419; +} +mov.b32 r416, {rs20, rs20}; +{ +mul.f16x2 r408, r384, r416; +} +{ +sub.f16x2 r411, r405, r408; +} +{ +mul.f16x2 r414, r348, r416; +} +{ +fma.rn.f16x2 r417, r384, r419, r414; +} +mov.b32 r435, {rs21, rs21}; +{ +mul.f16x2 r421, r366, r435; +} +mov.b32 r432, {rs22, rs22}; +{ +mul.f16x2 r424, r402, r432; +} +{ +sub.f16x2 r427, r421, r424; +} +{ +mul.f16x2 r430, r366, r432; +} +{ +fma.rn.f16x2 r433, r402, r435, r430; +} +{ +add.f16x2 r437, r240, r324; +} +{ +add.f16x2 r440, r246, r330; +} +{ +sub.f16x2 r443, r240, r324; +} +{ +sub.f16x2 r446, r246, r330; +} +{ +add.f16x2 r449, r264, r411; +} +{ +add.f16x2 r452, r300, r417; +} +{ +sub.f16x2 r455, r264, r411; +} +{ +sub.f16x2 r458, r300, r417; +} +{ +add.f16x2 r461, r282, r427; +} +{ +add.f16x2 r464, r318, r433; +} +{ +sub.f16x2 r467, r282, r427; +} +{ +sub.f16x2 r470, r318, r433; +} +{ +cvt.rn.f16.f64 rs29, fd36; +} +{ +cvt.rn.f16.f64 rs30, fd38; +} +{ +cvt.rn.f16.f64 rs31, fd38; +} +{ +cvt.rn.f16.f64 rs32, fd36; +} +{ +cvt.rn.f16.f64 rs35, fd35; +} +{ +cvt.rn.f16.f64 rs36, fd36; +} +{ +cvt.rn.f16.f64 rs37, fd37; +} +{ +cvt.rn.f16.f64 rs38, fd38; +} +mov.b32 r487, {rs29, rs29}; +{ +mul.f16x2 r473, r449, r487; +} +mov.b32 
r484, {rs30, rs30}; +{ +mul.f16x2 r476, r452, r484; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r449, r484; +} +{ +fma.rn.f16x2 r485, r452, r487, r482; +} +mov.b32 r503, {rs31, rs31}; +{ +mul.f16x2 r489, r461, r503; +} +mov.b32 r500, {rs32, rs32}; +{ +mul.f16x2 r492, r464, r500; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r461, r500; +} +{ +fma.rn.f16x2 r501, r464, r503, r498; +} +{ +neg.f16x2 r505, r446; +} +mov.b32 r521, {rs35, rs35}; +{ +mul.f16x2 r507, r455, r521; +} +mov.b32 r518, {rs36, rs36}; +{ +mul.f16x2 r510, r458, r518; +} +{ +sub.f16x2 r513, r507, r510; +} +{ +mul.f16x2 r516, r455, r518; +} +{ +fma.rn.f16x2 r519, r458, r521, r516; +} +mov.b32 r537, {rs37, rs37}; +{ +mul.f16x2 r523, r467, r537; +} +mov.b32 r534, {rs38, rs38}; +{ +mul.f16x2 r526, r470, r534; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r467, r534; +} +{ +fma.rn.f16x2 r535, r470, r537, r532; +} +{ +add.f16x2 %0, r201, r437; +} +{ +add.f16x2 %1, r204, r440; +} +{ +sub.f16x2 %12, r201, r437; +} +{ +sub.f16x2 %13, r204, r440; +} +{ +add.f16x2 %2, r213, r479; +} +{ +add.f16x2 %3, r216, r485; +} +{ +sub.f16x2 %14, r213, r479; +} +{ +sub.f16x2 %15, r216, r485; +} +{ +add.f16x2 %4, r225, r495; +} +{ +add.f16x2 %5, r228, r501; +} +{ +sub.f16x2 %16, r225, r495; +} +{ +sub.f16x2 %17, r228, r501; +} +{ +add.f16x2 %6, r207, r505; +} +{ +add.f16x2 %7, r210, r443; +} +{ +sub.f16x2 %18, r207, r505; +} +{ +sub.f16x2 %19, r210, r443; +} +{ +add.f16x2 %8, r219, r513; +} +{ +add.f16x2 %9, r222, r519; +} +{ +sub.f16x2 %20, r219, r513; +} +{ +sub.f16x2 %21, r222, r519; +} +{ +add.f16x2 %10, r231, r529; +} +{ +add.f16x2 %11, r234, r535; +} +{ +sub.f16x2 %22, r231, r529; +} +{ +sub.f16x2 %23, r234, r535; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), 
"=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..5330e98ab4c35 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_fwd.hpp.inc @@ -0,0 +1,168 @@ +#ifndef CUFFTDX_FFT_12_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_12_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<197, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm 
volatile (R"({ +.reg .f32 f<201>; +.reg .b64 rd<2>; +add.f32 f49, %34, %45; +add.f32 f50, %24, f49; +add.f32 f51, %36, %47; +add.f32 f52, %25, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %24, f53; +sub.f32 f55, %36, %47; +mul.f32 f56, f55, 0f3F5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %25, f59; +sub.f32 f61, %34, %45; +mul.f32 f62, f61, 0f3F5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %40, %50; +add.f32 f66, %29, f65; +add.f32 f67, %41, %52; +add.f32 f68, %31, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %29, f69; +sub.f32 f71, %41, %52; +mul.f32 f72, f71, 0f3F5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %31, f75; +sub.f32 f77, %40, %50; +mul.f32 f78, f77, 0f3F5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0fBF5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0fBF5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0fBF5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0fBF5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %37, %48; +add.f32 f104, %26, f103; +add.f32 f105, %39, %49; +add.f32 f106, %28, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %26, f107; +sub.f32 f109, %39, %49; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %28, f113; +sub.f32 f115, %37, %48; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %42, %53; +add.f32 f120, %32, f119; +add.f32 f121, %44, %54; 
+add.f32 f122, %33, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %32, f123; +sub.f32 f125, %44, %54; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %33, f129; +sub.f32 f131, %42, %53; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0fBF5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0fBF5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0fBF5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0fBF5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0fBF000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0fBF000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0fBF5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0fBF5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0fBF5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0fBF5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0fBF000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0fBF000000, f175; +add.f32 %1, f92, f146; +add.f32 %0, f91, f145; +add.f32 %3, f96, f161; +add.f32 %2, f95, f159; +add.f32 %5, f100, f166; +add.f32 %4, f99, f164; +sub.f32 %7, f94, f147; +add.f32 %6, f93, f148; +add.f32 %9, f98, f171; +add.f32 %8, f97, f169; +add.f32 %11, f102, f176; +add.f32 %10, f101, f174; 
+sub.f32 %13, f92, f146; +sub.f32 %12, f91, f145; +sub.f32 %15, f96, f161; +sub.f32 %14, f95, f159; +sub.f32 %17, f100, f166; +sub.f32 %16, f99, f164; +add.f32 %19, f94, f147; +sub.f32 %18, f93, f148; +sub.f32 %21, f98, f171; +sub.f32 %20, f97, f169; +sub.f32 %23, f102, f176; +sub.f32 %22, f101, f174; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..091ff8d6c0df1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp32_inv.hpp.inc @@ -0,0 +1,168 @@ +#ifndef CUFFTDX_FFT_12_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_12_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<399, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<201>; +.reg .b64 
rd<2>; +add.f32 f49, %34, %45; +add.f32 f50, %24, f49; +add.f32 f51, %36, %47; +add.f32 f52, %25, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %24, f53; +sub.f32 f55, %36, %47; +mul.f32 f56, f55, 0fBF5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %25, f59; +sub.f32 f61, %34, %45; +mul.f32 f62, f61, 0fBF5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %40, %50; +add.f32 f66, %29, f65; +add.f32 f67, %41, %52; +add.f32 f68, %31, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %29, f69; +sub.f32 f71, %41, %52; +mul.f32 f72, f71, 0fBF5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %31, f75; +sub.f32 f77, %40, %50; +mul.f32 f78, f77, 0fBF5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0f3F5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0f3F5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0f3F5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0f3F5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %37, %48; +add.f32 f104, %26, f103; +add.f32 f105, %39, %49; +add.f32 f106, %28, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %26, f107; +sub.f32 f109, %39, %49; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %28, f113; +sub.f32 f115, %37, %48; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %42, %53; +add.f32 f120, %32, f119; +add.f32 f121, %44, %54; +add.f32 f122, %33, f121; +mul.f32 f123, 
f119, 0f3F000000; +sub.f32 f124, %32, f123; +sub.f32 f125, %44, %54; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %33, f129; +sub.f32 f131, %42, %53; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0f3F5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0f3F5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0f3F5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0f3F5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0f3F000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0f3F000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0f3F5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0f3F5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0f3F5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0f3F5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0f3F000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0f3F000000, f175; +add.f32 %1, f92, f146; +add.f32 %0, f91, f145; +add.f32 %3, f96, f161; +add.f32 %2, f95, f159; +add.f32 %5, f100, f166; +add.f32 %4, f99, f164; +add.f32 %7, f94, f147; +sub.f32 %6, f93, f148; +add.f32 %9, f98, f171; +add.f32 %8, f97, f169; +add.f32 %11, f102, f176; +add.f32 %10, f101, f174; +sub.f32 %13, f92, f146; +sub.f32 %12, f91, 
f145; +sub.f32 %15, f96, f161; +sub.f32 %14, f95, f159; +sub.f32 %17, f100, f166; +sub.f32 %16, f99, f164; +sub.f32 %19, f94, f147; +add.f32 %18, f93, f148; +sub.f32 %21, f98, f171; +sub.f32 %20, f97, f169; +sub.f32 %23, f102, f176; +sub.f32 %22, f101, f174; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..be4286300b910 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_fwd.hpp.inc @@ -0,0 +1,168 @@ +#ifndef CUFFTDX_FFT_12_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_12_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<570, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<201>; +.reg .b64 rd<2>; +add.f64 fd49, %34, %45; +add.f64 
fd50, %24, fd49; +add.f64 fd51, %36, %47; +add.f64 fd52, %25, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %24, fd53; +sub.f64 fd55, %36, %47; +mul.f64 fd56, fd55, 0d3FEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %25, fd59; +sub.f64 fd61, %34, %45; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %40, %50; +add.f64 fd66, %29, fd65; +add.f64 fd67, %41, %52; +add.f64 fd68, %31, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %29, fd69; +sub.f64 fd71, %41, %52; +mul.f64 fd72, fd71, 0d3FEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %31, fd75; +sub.f64 fd77, %40, %50; +mul.f64 fd78, fd77, 0d3FEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0dBFEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0dBFEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0dBFEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %37, %48; +add.f64 fd104, %26, fd103; +add.f64 fd105, %39, %49; +add.f64 fd106, %28, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %26, fd107; +sub.f64 fd109, %39, %49; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; 
+sub.f64 fd114, %28, fd113; +sub.f64 fd115, %37, %48; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %42, %53; +add.f64 fd120, %32, fd119; +add.f64 fd121, %44, %54; +add.f64 fd122, %33, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %32, fd123; +sub.f64 fd125, %44, %54; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %33, fd129; +sub.f64 fd131, %42, %53; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0dBFEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0dBFEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0dBFEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0dBFE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0dBFE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0dBFEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0dBFEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 
fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0dBFEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0dBFE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0dBFE0000000000000, fd175; +add.f64 %1, fd92, fd146; +add.f64 %0, fd91, fd145; +add.f64 %3, fd96, fd161; +add.f64 %2, fd95, fd159; +add.f64 %5, fd100, fd166; +add.f64 %4, fd99, fd164; +sub.f64 %7, fd94, fd147; +add.f64 %6, fd93, fd148; +add.f64 %9, fd98, fd171; +add.f64 %8, fd97, fd169; +add.f64 %11, fd102, fd176; +add.f64 %10, fd101, fd174; +sub.f64 %13, fd92, fd146; +sub.f64 %12, fd91, fd145; +sub.f64 %15, fd96, fd161; +sub.f64 %14, fd95, fd159; +sub.f64 %17, fd100, fd166; +sub.f64 %16, fd99, fd164; +add.f64 %19, fd94, fd147; +sub.f64 %18, fd93, fd148; +sub.f64 %21, fd98, fd171; +sub.f64 %20, fd97, fd169; +sub.f64 %23, fd102, fd176; +sub.f64 %22, fd101, fd174; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..97ad7e7cf4f0b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_12_fp64_inv.hpp.inc @@ -0,0 +1,168 @@ +#ifndef CUFFTDX_FFT_12_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_12_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<741, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<201>; +.reg .b64 rd<2>; +add.f64 fd49, %34, %45; +add.f64 fd50, %24, fd49; +add.f64 fd51, %36, %47; +add.f64 fd52, %25, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %24, fd53; +sub.f64 fd55, %36, %47; +mul.f64 fd56, fd55, 0dBFEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %25, fd59; +sub.f64 fd61, %34, %45; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %40, %50; +add.f64 fd66, %29, fd65; +add.f64 fd67, %41, %52; +add.f64 fd68, %31, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %29, fd69; +sub.f64 fd71, %41, %52; +mul.f64 fd72, fd71, 0dBFEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %31, fd75; +sub.f64 fd77, %40, %50; +mul.f64 fd78, fd77, 0dBFEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0d3FEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0d3FEBB67AE8584CAA; +sub.f64 
fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0d3FEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %37, %48; +add.f64 fd104, %26, fd103; +add.f64 fd105, %39, %49; +add.f64 fd106, %28, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %26, fd107; +sub.f64 fd109, %39, %49; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %28, fd113; +sub.f64 fd115, %37, %48; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %42, %53; +add.f64 fd120, %32, fd119; +add.f64 fd121, %44, %54; +add.f64 fd122, %33, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %32, fd123; +sub.f64 fd125, %44, %54; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %33, fd129; +sub.f64 fd131, %42, %53; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0d3FEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0d3FEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0d3FEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; 
+add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0d3FE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0d3FE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0d3FEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0d3FEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0d3FEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0d3FE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0d3FE0000000000000, fd175; +add.f64 %1, fd92, fd146; +add.f64 %0, fd91, fd145; +add.f64 %3, fd96, fd161; +add.f64 %2, fd95, fd159; +add.f64 %5, fd100, fd166; +add.f64 %4, fd99, fd164; +add.f64 %7, fd94, fd147; +sub.f64 %6, fd93, fd148; +add.f64 %9, fd98, fd171; +add.f64 %8, fd97, fd169; +add.f64 %11, fd102, fd176; +add.f64 %10, fd101, fd174; +sub.f64 %13, fd92, fd146; +sub.f64 %12, fd91, fd145; +sub.f64 %15, fd96, fd161; +sub.f64 %14, fd95, fd159; +sub.f64 %17, fd100, fd166; +sub.f64 %16, fd99, fd164; +sub.f64 %19, fd94, fd147; +add.f64 %18, fd93, fd148; +sub.f64 %21, fd98, fd171; +sub.f64 %20, fd97, fd169; +sub.f64 %23, fd102, fd176; +sub.f64 %22, fd101, fd174; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), 
"=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..ec960d0aca115 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_fwd.hpp.inc @@ -0,0 +1,10883 @@ +#ifndef CUFFTDX_FFT_1331_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_1331_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<928, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<718>; +.reg .b32 r<3374>; +.reg .b64 rd<8>; +{ +add.f16x2 r25, %25, %43; +} +{ +add.f16x2 r28, %26, %44; +} +{ +sub.f16x2 r31, %25, %43; +} +{ +sub.f16x2 r34, %26, %44; +} +{ +add.f16x2 r37, %27, %41; +} +{ +add.f16x2 r40, %28, %42; +} +{ +sub.f16x2 r43, %27, %41; +} +{ +sub.f16x2 r46, %28, %42; +} +{ +add.f16x2 r49, %29, %39; +} +{ +add.f16x2 r52, %30, %40; +} +{ +sub.f16x2 r55, %29, %39; +} +{ +sub.f16x2 r58, %30, %40; +} +{ +add.f16x2 r61, %31, %37; +} +{ +add.f16x2 r64, %32, %38; +} +{ +sub.f16x2 r67, %31, %37; +} +{ +sub.f16x2 r70, 
%32, %38; +} +{ +add.f16x2 r73, %33, %35; +} +{ +add.f16x2 r76, %34, %36; +} +{ +sub.f16x2 r79, %33, %35; +} +{ +sub.f16x2 r82, %34, %36; +} +{ +add.f16x2 r85, %23, r25; +} +{ +add.f16x2 r88, %24, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} +{ +add.f16x2 r106, r100, r64; +} +{ +add.f16x2 r109, r103, r73; +} +{ +add.f16x2 r112, r106, r76; +} +mov.f32 f183, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r116, {low, high}; +} +mov.f32 f197, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r117, {low, high}; +} +{ +mul.f16x2 r118, r25, r117; +} +{ +add.f16x2 r121, %23, r118; +} +mov.f32 f163, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r124, {low, high}; +} +{ +mul.f16x2 r125, r34, r124; +} +{ +add.f16x2 r128, r115, r125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r131, {low, high}; +} +{ +mul.f16x2 r132, r28, r131; +} +{ +add.f16x2 r135, %24, r132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r138, {low, high}; +} +{ +mul.f16x2 r139, r31, r138; +} +{ +add.f16x2 r142, r116, r139; +} +mov.f32 f213, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r145, {low, high}; +} +{ +mul.f16x2 r146, r37, r145; +} +{ +add.f16x2 r149, r121, r146; +} +mov.f32 f59, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r152, {low, high}; +} +{ +mul.f16x2 r153, r46, r152; +} +{ +add.f16x2 r156, r128, r153; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r159, 
{low, high}; +} +{ +mul.f16x2 r160, r40, r159; +} +{ +add.f16x2 r163, r135, r160; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r166, {low, high}; +} +{ +mul.f16x2 r167, r43, r166; +} +{ +add.f16x2 r170, r142, r167; +} +mov.f32 f221, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r173, {low, high}; +} +{ +mul.f16x2 r174, r49, r173; +} +{ +add.f16x2 r177, r149, r174; +} +mov.f32 f223, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r180, {low, high}; +} +{ +mul.f16x2 r181, r58, r180; +} +{ +add.f16x2 r184, r156, r181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r187, {low, high}; +} +{ +mul.f16x2 r188, r52, r187; +} +{ +add.f16x2 r191, r163, r188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r194, {low, high}; +} +{ +mul.f16x2 r195, r55, r194; +} +{ +add.f16x2 r198, r170, r195; +} +mov.f32 f205, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r201, {low, high}; +} +{ +mul.f16x2 r202, r61, r201; +} +{ +add.f16x2 r205, r177, r202; +} +mov.f32 f207, 0fBF4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r208, {low, high}; +} +{ +mul.f16x2 r209, r70, r208; +} +{ +add.f16x2 r212, r184, r209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r215, {low, high}; +} +{ +mul.f16x2 r216, r64, r215; +} +{ +add.f16x2 r219, r191, r216; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r222, {low, high}; +} +{ +mul.f16x2 r223, r67, r222; +} +{ +add.f16x2 r226, r198, r223; +} +mov.f32 f189, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r229, {low, high}; +} +{ +mul.f16x2 r230, r73, r229; +} +{ +add.f16x2 
r233, r205, r230; +} +mov.f32 f191, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r236, {low, high}; +} +{ +mul.f16x2 r237, r82, r236; +} +{ +add.f16x2 r240, r212, r237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r243, {low, high}; +} +{ +mul.f16x2 r244, r76, r243; +} +{ +add.f16x2 r247, r219, r244; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r250, {low, high}; +} +{ +mul.f16x2 r251, r79, r250; +} +{ +add.f16x2 r254, r226, r251; +} +{ +sub.f16x2 r257, r233, r240; +} +{ +add.f16x2 r260, r247, r254; +} +{ +add.f16x2 r263, r233, r240; +} +{ +sub.f16x2 r266, r247, r254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r270, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r271, {low, high}; +} +{ +mul.f16x2 r272, r25, r271; +} +{ +add.f16x2 r275, %23, r272; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r34, r278; +} +{ +add.f16x2 r282, r269, r279; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r285, {low, high}; +} +{ +mul.f16x2 r286, r28, r285; +} +{ +add.f16x2 r289, %24, r286; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r292, {low, high}; +} +{ +mul.f16x2 r293, r31, r292; +} +{ +add.f16x2 r296, r270, r293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r299, {low, high}; +} +{ +mul.f16x2 r300, r37, r299; +} +{ +add.f16x2 r303, r275, r300; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r306, {low, high}; +} +{ +mul.f16x2 r307, r46, r306; +} +{ 
+add.f16x2 r310, r282, r307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r313, {low, high}; +} +{ +mul.f16x2 r314, r40, r313; +} +{ +add.f16x2 r317, r289, r314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r320, {low, high}; +} +{ +mul.f16x2 r321, r43, r320; +} +{ +add.f16x2 r324, r296, r321; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r327, {low, high}; +} +{ +mul.f16x2 r328, r49, r327; +} +{ +add.f16x2 r331, r303, r328; +} +mov.f32 f111, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r334, {low, high}; +} +{ +mul.f16x2 r335, r58, r334; +} +{ +add.f16x2 r338, r310, r335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r341, {low, high}; +} +{ +mul.f16x2 r342, r52, r341; +} +{ +add.f16x2 r345, r317, r342; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r55, r348; +} +{ +add.f16x2 r352, r324, r349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r355, {low, high}; +} +{ +mul.f16x2 r356, r61, r355; +} +{ +add.f16x2 r359, r331, r356; +} +mov.f32 f155, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r362, {low, high}; +} +{ +mul.f16x2 r363, r70, r362; +} +{ +add.f16x2 r366, r338, r363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r369, {low, high}; +} +{ +mul.f16x2 r370, r64, r369; +} +{ +add.f16x2 r373, r345, r370; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r376, {low, high}; +} +{ +mul.f16x2 r377, r67, r376; +} +{ +add.f16x2 r380, r352, r377; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r383, {low, high}; +} +{ 
+mul.f16x2 r384, r73, r383; +} +{ +add.f16x2 r387, r359, r384; +} +mov.f32 f199, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r390, {low, high}; +} +{ +mul.f16x2 r391, r82, r390; +} +{ +add.f16x2 r394, r366, r391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r397, {low, high}; +} +{ +mul.f16x2 r398, r76, r397; +} +{ +add.f16x2 r401, r373, r398; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r404, {low, high}; +} +{ +mul.f16x2 r405, r79, r404; +} +{ +add.f16x2 r408, r380, r405; +} +{ +sub.f16x2 r411, r387, r394; +} +{ +add.f16x2 r414, r401, r408; +} +{ +add.f16x2 r417, r387, r394; +} +{ +sub.f16x2 r420, r401, r408; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r425, {low, high}; +} +{ +mul.f16x2 r426, r25, r425; +} +{ +add.f16x2 r429, %23, r426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r432, {low, high}; +} +{ +mul.f16x2 r433, r34, r432; +} +{ +add.f16x2 r436, r423, r433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r439, {low, high}; +} +{ +mul.f16x2 r440, r28, r439; +} +{ +add.f16x2 r443, %24, r440; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r31, r446; +} +{ +add.f16x2 r450, r424, r447; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r37, r453; +} +{ +add.f16x2 r457, r429, r454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r460, {low, 
high}; +} +{ +mul.f16x2 r461, r46, r460; +} +{ +add.f16x2 r464, r436, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r467, {low, high}; +} +{ +mul.f16x2 r468, r40, r467; +} +{ +add.f16x2 r471, r443, r468; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r474, {low, high}; +} +{ +mul.f16x2 r475, r43, r474; +} +{ +add.f16x2 r478, r450, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r481, {low, high}; +} +{ +mul.f16x2 r482, r49, r481; +} +{ +add.f16x2 r485, r457, r482; +} +mov.f32 f215, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r488, {low, high}; +} +{ +mul.f16x2 r489, r58, r488; +} +{ +add.f16x2 r492, r464, r489; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r495, {low, high}; +} +{ +mul.f16x2 r496, r52, r495; +} +{ +add.f16x2 r499, r471, r496; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r502, {low, high}; +} +{ +mul.f16x2 r503, r55, r502; +} +{ +add.f16x2 r506, r478, r503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r509, {low, high}; +} +{ +mul.f16x2 r510, r61, r509; +} +{ +add.f16x2 r513, r485, r510; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r516, {low, high}; +} +{ +mul.f16x2 r517, r70, r516; +} +{ +add.f16x2 r520, r492, r517; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r523, {low, high}; +} +{ +mul.f16x2 r524, r64, r523; +} +{ +add.f16x2 r527, r499, r524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r530, {low, high}; +} +{ +mul.f16x2 r531, r67, r530; +} +{ +add.f16x2 r534, r506, r531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 
r537, {low, high}; +} +{ +mul.f16x2 r538, r73, r537; +} +{ +add.f16x2 r541, r513, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r544, {low, high}; +} +{ +mul.f16x2 r545, r82, r544; +} +{ +add.f16x2 r548, r520, r545; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r551, {low, high}; +} +{ +mul.f16x2 r552, r76, r551; +} +{ +add.f16x2 r555, r527, r552; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r558, {low, high}; +} +{ +mul.f16x2 r559, r79, r558; +} +{ +add.f16x2 r562, r534, r559; +} +{ +sub.f16x2 r565, r541, r548; +} +{ +add.f16x2 r568, r555, r562; +} +{ +add.f16x2 r571, r541, r548; +} +{ +sub.f16x2 r574, r555, r562; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r579, {low, high}; +} +{ +mul.f16x2 r580, r25, r579; +} +{ +add.f16x2 r583, %23, r580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r586, {low, high}; +} +{ +mul.f16x2 r587, r34, r586; +} +{ +add.f16x2 r590, r577, r587; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r593, {low, high}; +} +{ +mul.f16x2 r594, r28, r593; +} +{ +add.f16x2 r597, %24, r594; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r31, r600; +} +{ +add.f16x2 r604, r578, r601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r37, r607; +} +{ +add.f16x2 r611, r583, r608; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r614, {low, high}; 
+} +{ +mul.f16x2 r615, r46, r614; +} +{ +add.f16x2 r618, r590, r615; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r621, {low, high}; +} +{ +mul.f16x2 r622, r40, r621; +} +{ +add.f16x2 r625, r597, r622; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r628, {low, high}; +} +{ +mul.f16x2 r629, r43, r628; +} +{ +add.f16x2 r632, r604, r629; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r635, {low, high}; +} +{ +mul.f16x2 r636, r49, r635; +} +{ +add.f16x2 r639, r611, r636; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r642, {low, high}; +} +{ +mul.f16x2 r643, r58, r642; +} +{ +add.f16x2 r646, r618, r643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r649, {low, high}; +} +{ +mul.f16x2 r650, r52, r649; +} +{ +add.f16x2 r653, r625, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r656, {low, high}; +} +{ +mul.f16x2 r657, r55, r656; +} +{ +add.f16x2 r660, r632, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r663, {low, high}; +} +{ +mul.f16x2 r664, r61, r663; +} +{ +add.f16x2 r667, r639, r664; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r670, {low, high}; +} +{ +mul.f16x2 r671, r70, r670; +} +{ +add.f16x2 r674, r646, r671; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r677, {low, high}; +} +{ +mul.f16x2 r678, r64, r677; +} +{ +add.f16x2 r681, r653, r678; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r684, {low, high}; +} +{ +mul.f16x2 r685, r67, r684; +} +{ +add.f16x2 r688, r660, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r691, {low, high}; +} +{ +mul.f16x2 
r692, r73, r691; +} +{ +add.f16x2 r695, r667, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r82, r698; +} +{ +add.f16x2 r702, r674, r699; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r705, {low, high}; +} +{ +mul.f16x2 r706, r76, r705; +} +{ +add.f16x2 r709, r681, r706; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r712, {low, high}; +} +{ +mul.f16x2 r713, r79, r712; +} +{ +add.f16x2 r716, r688, r713; +} +{ +sub.f16x2 r719, r695, r702; +} +{ +add.f16x2 r722, r709, r716; +} +{ +add.f16x2 r725, r695, r702; +} +{ +sub.f16x2 r728, r709, r716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r25, r733; +} +{ +add.f16x2 r737, %23, r734; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r740, {low, high}; +} +{ +mul.f16x2 r741, r34, r740; +} +{ +add.f16x2 r744, r731, r741; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r28, r747; +} +{ +add.f16x2 r751, %24, r748; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r754, {low, high}; +} +{ +mul.f16x2 r755, r31, r754; +} +{ +add.f16x2 r758, r732, r755; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r37, r761; +} +{ +add.f16x2 r765, r737, r762; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r768, {low, high}; +} +{ +mul.f16x2 r769, r46, r768; +} 
+{ +add.f16x2 r772, r744, r769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r775, {low, high}; +} +{ +mul.f16x2 r776, r40, r775; +} +{ +add.f16x2 r779, r751, r776; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r782, {low, high}; +} +{ +mul.f16x2 r783, r43, r782; +} +{ +add.f16x2 r786, r758, r783; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r789, {low, high}; +} +{ +mul.f16x2 r790, r49, r789; +} +{ +add.f16x2 r793, r765, r790; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r796, {low, high}; +} +{ +mul.f16x2 r797, r58, r796; +} +{ +add.f16x2 r800, r772, r797; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r803, {low, high}; +} +{ +mul.f16x2 r804, r52, r803; +} +{ +add.f16x2 r807, r779, r804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r55, r810; +} +{ +add.f16x2 r814, r786, r811; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r817, {low, high}; +} +{ +mul.f16x2 r818, r61, r817; +} +{ +add.f16x2 r821, r793, r818; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r824, {low, high}; +} +{ +mul.f16x2 r825, r70, r824; +} +{ +add.f16x2 r828, r800, r825; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r831, {low, high}; +} +{ +mul.f16x2 r832, r64, r831; +} +{ +add.f16x2 r835, r807, r832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r838, {low, high}; +} +{ +mul.f16x2 r839, r67, r838; +} +{ +add.f16x2 r842, r814, r839; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r845, {low, high}; +} +{ +mul.f16x2 r846, r73, r845; +} +{ +add.f16x2 r849, 
r821, r846; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r852, {low, high}; +} +{ +mul.f16x2 r853, r82, r852; +} +{ +add.f16x2 r856, r828, r853; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r859, {low, high}; +} +{ +mul.f16x2 r860, r76, r859; +} +{ +add.f16x2 r863, r835, r860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r79, r866; +} +{ +add.f16x2 r870, r842, r867; +} +{ +sub.f16x2 r873, r849, r856; +} +{ +add.f16x2 r876, r863, r870; +} +{ +add.f16x2 r879, r849, r856; +} +{ +sub.f16x2 r882, r863, r870; +} +mov.u32 r23, %tid.x; +mul.wide.u32 rd2, r23, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r885, rd3; +sub.s32 r886, r23, r885; +shr.u32 r887, r886, 1; +add.s32 r888, r887, r885; +shr.u32 r889, r888, 6; +mul.lo.s32 r890, r889, 121; +sub.s32 r24, r23, r890; +cvt.rn.f32.u32 f225, r24; +mul.f32 f1, f225, 0f3B9AAFAF; +setp.eq.s32 p1, r24, 18; +mov.f32 f717, 0f3DADD00E; +@p1 bra LBB0_2; +sin.approx.f32 f717, f1; +LBB0_2: +mov.u32 r3362, %tid.y; +mov.u32 r3363, %22; +mad.lo.s32 r3364, r3362, 10648, r3363; +mad.lo.s32 r3365, r889, 10648, r3364; +neg.f32 f227, f717; +cos.approx.f32 f226, f1; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f227; +mov.b32 r891, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r260, r896; +} +{ +neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r257, r894, r901; +} +{ +mul.f16x2 r907, r257, r896; +} +{ +fma.rn.f16x2 r910, r260, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r916, {high, high}; +} +mov.f32 f490, 0fBF800000; +mov.f32 f491, 0f3F800000; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r891, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r414, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r411, r931, r938; +} +{ +mul.f16x2 r944, r411, r933; +} +{ +fma.rn.f16x2 r947, r414, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r568, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r565, r968, r975; +} +{ +mul.f16x2 r981, r565, r970; +} +{ +fma.rn.f16x2 r984, r568, r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r722, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r719, r1005, r1012; +} +{ +mul.f16x2 r1018, r719, r1007; +} +{ +fma.rn.f16x2 r1021, r722, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r876, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r873, r1042, r1049; +} +{ +mul.f16x2 r1055, r873, r1044; +} +{ +fma.rn.f16x2 r1058, r876, r1042, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1066, {low, high}; +} +{ +mul.f16x2 r1067, r1064, r1066; +} +{ +mul.f16x2 r1070, r1038, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1073, {high, low}; +} +{ +fma.rn.f16x2 r1075, r1067, r1073, r1070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1081, {high, high}; +} +{ +mul.f16x2 r1083, r882, r1081; +} +{ +neg.f16x2 r1086, r1083; +} +{ +fma.rn.f16x2 r1088, r879, r1079, r1086; +} +{ +mul.f16x2 r1092, r879, 
r1081; +} +{ +fma.rn.f16x2 r1095, r882, r1079, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1103, {low, high}; +} +{ +mul.f16x2 r1104, r1101, r1103; +} +{ +mul.f16x2 r1107, r1075, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1110, {high, low}; +} +{ +fma.rn.f16x2 r1112, r1104, r1110, r1107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r728, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r725, r1116, r1123; +} +{ +mul.f16x2 r1129, r725, r1118; +} +{ +fma.rn.f16x2 r1132, r728, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1112, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r574, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r571, r1153, r1160; +} +{ +mul.f16x2 r1166, r571, r1155; +} +{ +fma.rn.f16x2 r1169, r574, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r420, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r417, r1190, r1197; +} +{ +mul.f16x2 r1203, r417, r1192; +} +{ +fma.rn.f16x2 r1206, r420, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r266, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r263, r1227, r1234; +} +{ +mul.f16x2 r1240, r263, r1229; +} +{ +fma.rn.f16x2 r1243, r266, r1227, r1240; +} +barrier.sync 0; +mad.lo.s32 r3366, r24, 88, r3365; +st.shared.v2.f32 [r3366], {r109, r112}; +st.shared.v2.f32 [r3366+8], {r903, r910}; +st.shared.v2.f32 [r3366+16], {r940, r947}; +st.shared.v2.f32 [r3366+24], {r977, r984}; +st.shared.v2.f32 [r3366+32], {r1014, r1021}; +st.shared.v2.f32 [r3366+40], {r1051, r1058}; +st.shared.v2.f32 [r3366+48], {r1088, r1095}; +st.shared.v2.f32 [r3366+56], {r1125, r1132}; +st.shared.v2.f32 [r3366+64], {r1162, r1169}; +st.shared.v2.f32 [r3366+72], 
{r1199, r1206}; +st.shared.v2.f32 [r3366+80], {r1236, r1243}; +barrier.sync 0; +mad.lo.s32 r3367, r24, -80, r3366; +ld.shared.u32 r1325, [r3367]; +ld.shared.u32 r1328, [r3367+4]; +ld.shared.u32 r1265, [r3367+968]; +ld.shared.u32 r1268, [r3367+972]; +ld.shared.u32 r1277, [r3367+1936]; +ld.shared.u32 r1280, [r3367+1940]; +ld.shared.u32 r1289, [r3367+2904]; +ld.shared.u32 r1292, [r3367+2908]; +ld.shared.u32 r1301, [r3367+3872]; +ld.shared.u32 r1304, [r3367+3876]; +ld.shared.u32 r1313, [r3367+4840]; +ld.shared.u32 r1316, [r3367+4844]; +ld.shared.u32 r1314, [r3367+5808]; +ld.shared.u32 r1317, [r3367+5812]; +ld.shared.u32 r1302, [r3367+6776]; +ld.shared.u32 r1305, [r3367+6780]; +ld.shared.u32 r1290, [r3367+7744]; +ld.shared.u32 r1293, [r3367+7748]; +ld.shared.u32 r1278, [r3367+8712]; +ld.shared.u32 r1281, [r3367+8716]; +ld.shared.u32 r1266, [r3367+9680]; +ld.shared.u32 r1269, [r3367+9684]; +{ +add.f16x2 r1264, r1265, r1266; +} +{ +add.f16x2 r1267, r1268, r1269; +} +{ +sub.f16x2 r1270, r1265, r1266; +} +{ +sub.f16x2 r1273, r1268, r1269; +} +{ +add.f16x2 r1276, r1277, r1278; +} +{ +add.f16x2 r1279, r1280, r1281; +} +{ +sub.f16x2 r1282, r1277, r1278; +} +{ +sub.f16x2 r1285, r1280, r1281; +} +{ +add.f16x2 r1288, r1289, r1290; +} +{ +add.f16x2 r1291, r1292, r1293; +} +{ +sub.f16x2 r1294, r1289, r1290; +} +{ +sub.f16x2 r1297, r1292, r1293; +} +{ +add.f16x2 r1300, r1301, r1302; +} +{ +add.f16x2 r1303, r1304, r1305; +} +{ +sub.f16x2 r1306, r1301, r1302; +} +{ +sub.f16x2 r1309, r1304, r1305; +} +{ +add.f16x2 r1312, r1313, r1314; +} +{ +add.f16x2 r1315, r1316, r1317; +} +{ +sub.f16x2 r1318, r1313, r1314; +} +{ +sub.f16x2 r1321, r1316, r1317; +} +{ +add.f16x2 r1324, r1325, r1264; +} +{ +add.f16x2 r1327, r1328, r1267; +} +{ +add.f16x2 r1330, r1324, r1276; +} +{ +add.f16x2 r1333, r1327, r1279; +} +{ +add.f16x2 r1336, r1330, r1288; +} +{ +add.f16x2 r1339, r1333, r1291; +} +{ +add.f16x2 r1342, r1336, r1300; +} +{ +add.f16x2 r1345, r1339, r1303; +} +{ +add.f16x2 r1348, r1342, r1312; +} 
+{ +add.f16x2 r1351, r1345, r1315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1356, {low, high}; +} +{ +mul.f16x2 r1357, r1264, r1356; +} +{ +add.f16x2 r1360, r1325, r1357; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1363, {low, high}; +} +{ +mul.f16x2 r1364, r1273, r1363; +} +{ +add.f16x2 r1367, r1354, r1364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1267, r1370; +} +{ +add.f16x2 r1374, r1328, r1371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1377, {low, high}; +} +{ +mul.f16x2 r1378, r1270, r1377; +} +{ +add.f16x2 r1381, r1355, r1378; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1384, {low, high}; +} +{ +mul.f16x2 r1385, r1276, r1384; +} +{ +add.f16x2 r1388, r1360, r1385; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1391, {low, high}; +} +{ +mul.f16x2 r1392, r1285, r1391; +} +{ +add.f16x2 r1395, r1367, r1392; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1398, {low, high}; +} +{ +mul.f16x2 r1399, r1279, r1398; +} +{ +add.f16x2 r1402, r1374, r1399; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1405, {low, high}; +} +{ +mul.f16x2 r1406, r1282, r1405; +} +{ +add.f16x2 r1409, r1381, r1406; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1412, {low, high}; +} +{ +mul.f16x2 r1413, r1288, r1412; +} +{ +add.f16x2 r1416, r1388, r1413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1419, {low, high}; +} +{ +mul.f16x2 r1420, r1297, r1419; +} +{ +add.f16x2 r1423, r1395, r1420; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1426, {low, high}; +} +{ +mul.f16x2 r1427, r1291, r1426; +} +{ +add.f16x2 r1430, r1402, r1427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1433, {low, high}; +} +{ +mul.f16x2 r1434, r1294, r1433; +} +{ +add.f16x2 r1437, r1409, r1434; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1440, {low, high}; +} +{ +mul.f16x2 r1441, r1300, r1440; +} +{ +add.f16x2 r1444, r1416, r1441; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1447, {low, high}; +} +{ +mul.f16x2 r1448, r1309, r1447; +} +{ +add.f16x2 r1451, r1423, r1448; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1454, {low, high}; +} +{ +mul.f16x2 r1455, r1303, r1454; +} +{ +add.f16x2 r1458, r1430, r1455; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1461, {low, high}; +} +{ +mul.f16x2 r1462, r1306, r1461; +} +{ +add.f16x2 r1465, r1437, r1462; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1468, {low, high}; +} +{ +mul.f16x2 r1469, r1312, r1468; +} +{ +add.f16x2 r1472, r1444, r1469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1475, {low, high}; +} +{ +mul.f16x2 r1476, r1321, r1475; +} +{ +add.f16x2 r1479, r1451, r1476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1482, {low, high}; +} +{ +mul.f16x2 r1483, r1315, r1482; +} +{ +add.f16x2 r1486, r1458, r1483; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1489, {low, high}; +} +{ +mul.f16x2 r1490, r1318, r1489; +} +{ +add.f16x2 
r1493, r1465, r1490; +} +{ +sub.f16x2 r1496, r1472, r1479; +} +{ +add.f16x2 r1499, r1486, r1493; +} +{ +add.f16x2 r1502, r1472, r1479; +} +{ +sub.f16x2 r1505, r1486, r1493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1510, {low, high}; +} +{ +mul.f16x2 r1511, r1264, r1510; +} +{ +add.f16x2 r1514, r1325, r1511; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1517, {low, high}; +} +{ +mul.f16x2 r1518, r1273, r1517; +} +{ +add.f16x2 r1521, r1508, r1518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1524, {low, high}; +} +{ +mul.f16x2 r1525, r1267, r1524; +} +{ +add.f16x2 r1528, r1328, r1525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1531, {low, high}; +} +{ +mul.f16x2 r1532, r1270, r1531; +} +{ +add.f16x2 r1535, r1509, r1532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1538, {low, high}; +} +{ +mul.f16x2 r1539, r1276, r1538; +} +{ +add.f16x2 r1542, r1514, r1539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1545, {low, high}; +} +{ +mul.f16x2 r1546, r1285, r1545; +} +{ +add.f16x2 r1549, r1521, r1546; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1552, {low, high}; +} +{ +mul.f16x2 r1553, r1279, r1552; +} +{ +add.f16x2 r1556, r1528, r1553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1559, {low, high}; +} +{ +mul.f16x2 r1560, r1282, r1559; +} +{ +add.f16x2 r1563, r1535, r1560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1566, 
{low, high}; +} +{ +mul.f16x2 r1567, r1288, r1566; +} +{ +add.f16x2 r1570, r1542, r1567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1573, {low, high}; +} +{ +mul.f16x2 r1574, r1297, r1573; +} +{ +add.f16x2 r1577, r1549, r1574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1291, r1580; +} +{ +add.f16x2 r1584, r1556, r1581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1587, {low, high}; +} +{ +mul.f16x2 r1588, r1294, r1587; +} +{ +add.f16x2 r1591, r1563, r1588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1594, {low, high}; +} +{ +mul.f16x2 r1595, r1300, r1594; +} +{ +add.f16x2 r1598, r1570, r1595; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1601, {low, high}; +} +{ +mul.f16x2 r1602, r1309, r1601; +} +{ +add.f16x2 r1605, r1577, r1602; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1608, {low, high}; +} +{ +mul.f16x2 r1609, r1303, r1608; +} +{ +add.f16x2 r1612, r1584, r1609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1615, {low, high}; +} +{ +mul.f16x2 r1616, r1306, r1615; +} +{ +add.f16x2 r1619, r1591, r1616; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1622, {low, high}; +} +{ +mul.f16x2 r1623, r1312, r1622; +} +{ +add.f16x2 r1626, r1598, r1623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1629, {low, high}; +} +{ +mul.f16x2 r1630, r1321, r1629; +} +{ +add.f16x2 r1633, r1605, r1630; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1636, {low, high}; +} +{ +mul.f16x2 r1637, r1315, r1636; +} +{ +add.f16x2 r1640, r1612, r1637; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1643, {low, high}; +} +{ +mul.f16x2 r1644, r1318, r1643; +} +{ +add.f16x2 r1647, r1619, r1644; +} +{ +sub.f16x2 r1650, r1626, r1633; +} +{ +add.f16x2 r1653, r1640, r1647; +} +{ +add.f16x2 r1656, r1626, r1633; +} +{ +sub.f16x2 r1659, r1640, r1647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1662, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1665, r1264, r1664; +} +{ +add.f16x2 r1668, r1325, r1665; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1671, {low, high}; +} +{ +mul.f16x2 r1672, r1273, r1671; +} +{ +add.f16x2 r1675, r1662, r1672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1678, {low, high}; +} +{ +mul.f16x2 r1679, r1267, r1678; +} +{ +add.f16x2 r1682, r1328, r1679; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1685, {low, high}; +} +{ +mul.f16x2 r1686, r1270, r1685; +} +{ +add.f16x2 r1689, r1663, r1686; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1692, {low, high}; +} +{ +mul.f16x2 r1693, r1276, r1692; +} +{ +add.f16x2 r1696, r1668, r1693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1699, {low, high}; +} +{ +mul.f16x2 r1700, r1285, r1699; +} +{ +add.f16x2 r1703, r1675, r1700; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1706, {low, high}; +} +{ +mul.f16x2 r1707, r1279, r1706; +} +{ +add.f16x2 r1710, r1682, r1707; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1713, {low, high}; +} +{ +mul.f16x2 r1714, r1282, 
r1713; +} +{ +add.f16x2 r1717, r1689, r1714; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1720, {low, high}; +} +{ +mul.f16x2 r1721, r1288, r1720; +} +{ +add.f16x2 r1724, r1696, r1721; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1297, r1727; +} +{ +add.f16x2 r1731, r1703, r1728; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1734, {low, high}; +} +{ +mul.f16x2 r1735, r1291, r1734; +} +{ +add.f16x2 r1738, r1710, r1735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1294, r1741; +} +{ +add.f16x2 r1745, r1717, r1742; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1748, {low, high}; +} +{ +mul.f16x2 r1749, r1300, r1748; +} +{ +add.f16x2 r1752, r1724, r1749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1755, {low, high}; +} +{ +mul.f16x2 r1756, r1309, r1755; +} +{ +add.f16x2 r1759, r1731, r1756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1762, {low, high}; +} +{ +mul.f16x2 r1763, r1303, r1762; +} +{ +add.f16x2 r1766, r1738, r1763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 r1770, r1306, r1769; +} +{ +add.f16x2 r1773, r1745, r1770; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1776, {low, high}; +} +{ +mul.f16x2 r1777, r1312, r1776; +} +{ +add.f16x2 r1780, r1752, r1777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1321, r1783; +} +{ +add.f16x2 r1787, r1759, r1784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 
high, f205; +mov.b32 r1790, {low, high}; +} +{ +mul.f16x2 r1791, r1315, r1790; +} +{ +add.f16x2 r1794, r1766, r1791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1318, r1797; +} +{ +add.f16x2 r1801, r1773, r1798; +} +{ +sub.f16x2 r1804, r1780, r1787; +} +{ +add.f16x2 r1807, r1794, r1801; +} +{ +add.f16x2 r1810, r1780, r1787; +} +{ +sub.f16x2 r1813, r1794, r1801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1817, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1264, r1818; +} +{ +add.f16x2 r1822, r1325, r1819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1825, {low, high}; +} +{ +mul.f16x2 r1826, r1273, r1825; +} +{ +add.f16x2 r1829, r1816, r1826; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1267, r1832; +} +{ +add.f16x2 r1836, r1328, r1833; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1839, {low, high}; +} +{ +mul.f16x2 r1840, r1270, r1839; +} +{ +add.f16x2 r1843, r1817, r1840; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1846, {low, high}; +} +{ +mul.f16x2 r1847, r1276, r1846; +} +{ +add.f16x2 r1850, r1822, r1847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1853, {low, high}; +} +{ +mul.f16x2 r1854, r1285, r1853; +} +{ +add.f16x2 r1857, r1829, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1860, {low, high}; +} +{ +mul.f16x2 r1861, r1279, r1860; +} +{ +add.f16x2 r1864, r1836, r1861; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1867, {low, high}; +} +{ +mul.f16x2 r1868, r1282, r1867; +} +{ +add.f16x2 r1871, r1843, r1868; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1874, {low, high}; +} +{ +mul.f16x2 r1875, r1288, r1874; +} +{ +add.f16x2 r1878, r1850, r1875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1881, {low, high}; +} +{ +mul.f16x2 r1882, r1297, r1881; +} +{ +add.f16x2 r1885, r1857, r1882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1291, r1888; +} +{ +add.f16x2 r1892, r1864, r1889; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1895, {low, high}; +} +{ +mul.f16x2 r1896, r1294, r1895; +} +{ +add.f16x2 r1899, r1871, r1896; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1902, {low, high}; +} +{ +mul.f16x2 r1903, r1300, r1902; +} +{ +add.f16x2 r1906, r1878, r1903; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1909, {low, high}; +} +{ +mul.f16x2 r1910, r1309, r1909; +} +{ +add.f16x2 r1913, r1885, r1910; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1916, {low, high}; +} +{ +mul.f16x2 r1917, r1303, r1916; +} +{ +add.f16x2 r1920, r1892, r1917; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1923, {low, high}; +} +{ +mul.f16x2 r1924, r1306, r1923; +} +{ +add.f16x2 r1927, r1899, r1924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1930, {low, high}; +} +{ +mul.f16x2 r1931, r1312, r1930; +} +{ +add.f16x2 r1934, r1906, r1931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1937, {low, high}; +} +{ 
+mul.f16x2 r1938, r1321, r1937; +} +{ +add.f16x2 r1941, r1913, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1944, {low, high}; +} +{ +mul.f16x2 r1945, r1315, r1944; +} +{ +add.f16x2 r1948, r1920, r1945; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1951, {low, high}; +} +{ +mul.f16x2 r1952, r1318, r1951; +} +{ +add.f16x2 r1955, r1927, r1952; +} +{ +sub.f16x2 r1958, r1934, r1941; +} +{ +add.f16x2 r1961, r1948, r1955; +} +{ +add.f16x2 r1964, r1934, r1941; +} +{ +sub.f16x2 r1967, r1948, r1955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1972, {low, high}; +} +{ +mul.f16x2 r1973, r1264, r1972; +} +{ +add.f16x2 r1976, r1325, r1973; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1979, {low, high}; +} +{ +mul.f16x2 r1980, r1273, r1979; +} +{ +add.f16x2 r1983, r1970, r1980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1986, {low, high}; +} +{ +mul.f16x2 r1987, r1267, r1986; +} +{ +add.f16x2 r1990, r1328, r1987; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1993, {low, high}; +} +{ +mul.f16x2 r1994, r1270, r1993; +} +{ +add.f16x2 r1997, r1971, r1994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2000, {low, high}; +} +{ +mul.f16x2 r2001, r1276, r2000; +} +{ +add.f16x2 r2004, r1976, r2001; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2007, {low, high}; +} +{ +mul.f16x2 r2008, r1285, r2007; +} +{ +add.f16x2 r2011, r1983, r2008; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2014, {low, high}; +} +{ +mul.f16x2 r2015, r1279, r2014; +} +{ +add.f16x2 r2018, r1990, r2015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2021, {low, high}; +} +{ +mul.f16x2 r2022, r1282, r2021; +} +{ +add.f16x2 r2025, r1997, r2022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2028, {low, high}; +} +{ +mul.f16x2 r2029, r1288, r2028; +} +{ +add.f16x2 r2032, r2004, r2029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2035, {low, high}; +} +{ +mul.f16x2 r2036, r1297, r2035; +} +{ +add.f16x2 r2039, r2011, r2036; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2042, {low, high}; +} +{ +mul.f16x2 r2043, r1291, r2042; +} +{ +add.f16x2 r2046, r2018, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2049, {low, high}; +} +{ +mul.f16x2 r2050, r1294, r2049; +} +{ +add.f16x2 r2053, r2025, r2050; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2057, r1300, r2056; +} +{ +add.f16x2 r2060, r2032, r2057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2063, {low, high}; +} +{ +mul.f16x2 r2064, r1309, r2063; +} +{ +add.f16x2 r2067, r2039, r2064; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r1303, r2070; +} +{ +add.f16x2 r2074, r2046, r2071; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r1306, r2077; +} +{ +add.f16x2 r2081, r2053, r2078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r1312, r2084; +} +{ +add.f16x2 
r2088, r2060, r2085; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r1321, r2091; +} +{ +add.f16x2 r2095, r2067, r2092; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2098, {low, high}; +} +{ +mul.f16x2 r2099, r1315, r2098; +} +{ +add.f16x2 r2102, r2074, r2099; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2105, {low, high}; +} +{ +mul.f16x2 r2106, r1318, r2105; +} +{ +add.f16x2 r2109, r2081, r2106; +} +{ +sub.f16x2 r2112, r2088, r2095; +} +{ +add.f16x2 r2115, r2102, r2109; +} +{ +add.f16x2 r2118, r2088, r2095; +} +{ +sub.f16x2 r2121, r2102, r2109; +} +mul.wide.u32 rd6, r24, -1171354717; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r3368, rd7; +cvt.rn.f32.u32 f714, r3368; +mul.f32 f715, f714, 0f3D54B191; +cos.approx.f32 f470, f715; +sin.approx.f32 f716, f715; +neg.f32 f471, f716; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f470; +cvt.rn.f16.f32 high, f471; +mov.b32 r2124, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1499, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1496, r2127, r2134; +} +{ +mul.f16x2 r2140, r1496, r2129; +} +{ +fma.rn.f16x2 r2143, r1499, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2124, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; 
+mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1653, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1650, r2164, r2171; +} +{ +mul.f16x2 r2177, r1650, r2166; +} +{ +fma.rn.f16x2 r2180, r1653, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1807, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1804, r2201, r2208; +} +{ +mul.f16x2 r2214, r1804, r2203; +} +{ +fma.rn.f16x2 r2217, r1807, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1961, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1958, r2238, r2245; +} +{ +mul.f16x2 r2251, 
r1958, r2240; +} +{ +fma.rn.f16x2 r2254, r1961, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r2115, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r2112, r2275, r2282; +} +{ +mul.f16x2 r2288, r2112, r2277; +} +{ +fma.rn.f16x2 r2291, r2115, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r2121, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r2118, r2312, r2319; +} +{ +mul.f16x2 r2325, r2118, r2314; +} +{ +fma.rn.f16x2 r2328, r2121, r2312, r2325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2334, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2337, r2334, r2336; +} +{ +mul.f16x2 r2340, r2308, r2332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2343, {high, low}; +} +{ +fma.rn.f16x2 r2345, r2337, r2343, r2340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2349, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2351, {high, high}; +} +{ +mul.f16x2 r2353, r1967, r2351; +} +{ +neg.f16x2 r2356, r2353; +} +{ +fma.rn.f16x2 r2358, r1964, r2349, r2356; +} +{ +mul.f16x2 r2362, r1964, r2351; +} +{ +fma.rn.f16x2 r2365, r1967, r2349, r2362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2371, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2373, {low, high}; +} +{ +mul.f16x2 r2374, r2371, r2373; +} +{ +mul.f16x2 r2377, r2345, r2369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2380, {high, low}; +} +{ +fma.rn.f16x2 r2382, r2374, r2380, r2377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2388, {high, high}; +} +{ +mul.f16x2 r2390, r1813, r2388; +} +{ +neg.f16x2 r2393, r2390; +} +{ +fma.rn.f16x2 r2395, r1810, r2386, r2393; +} +{ +mul.f16x2 r2399, r1810, r2388; +} +{ +fma.rn.f16x2 r2402, r1813, r2386, r2399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2408, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2410, {low, high}; +} +{ +mul.f16x2 r2411, r2408, r2410; +} +{ +mul.f16x2 r2414, r2382, r2406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2417, {high, low}; +} +{ 
+fma.rn.f16x2 r2419, r2411, r2417, r2414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2423, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2425, {high, high}; +} +{ +mul.f16x2 r2427, r1659, r2425; +} +{ +neg.f16x2 r2430, r2427; +} +{ +fma.rn.f16x2 r2432, r1656, r2423, r2430; +} +{ +mul.f16x2 r2436, r1656, r2425; +} +{ +fma.rn.f16x2 r2439, r1659, r2423, r2436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2443, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2445, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2447, {low, high}; +} +{ +mul.f16x2 r2448, r2445, r2447; +} +{ +mul.f16x2 r2451, r2419, r2443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2454, {high, low}; +} +{ +fma.rn.f16x2 r2456, r2448, r2454, r2451; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2460, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2462, {high, high}; +} +{ +mul.f16x2 r2464, r1505, r2462; +} +{ +neg.f16x2 r2467, r2464; +} +{ +fma.rn.f16x2 r2469, r1502, r2460, r2467; +} +{ +mul.f16x2 r2473, r1502, r2462; +} +{ +fma.rn.f16x2 r2476, r1505, r2460, r2473; +} +mul.lo.s32 r3369, r3368, 11; +sub.s32 r3370, r24, r3369; +shl.b32 r3371, r3370, 3; +add.s32 r3372, r3365, r3371; +barrier.sync 0; +mad.lo.s32 r3373, r3368, 968, r3372; +st.shared.u32 [r3373], r1348; +st.shared.u32 [r3373+4], r1351; +st.shared.u32 [r3373+88], r2136; +st.shared.u32 [r3373+92], r2143; +st.shared.u32 [r3373+176], r2173; +st.shared.u32 [r3373+180], r2180; +st.shared.u32 [r3373+264], r2210; +st.shared.u32 [r3373+268], r2217; +st.shared.u32 [r3373+352], r2247; +st.shared.u32 [r3373+356], r2254; +st.shared.u32 [r3373+440], r2284; +st.shared.u32 [r3373+444], r2291; +st.shared.u32 [r3373+528], r2321; +st.shared.u32 [r3373+532], r2328; +st.shared.u32 [r3373+616], r2358; +st.shared.u32 
[r3373+620], r2365; +st.shared.u32 [r3373+704], r2395; +st.shared.u32 [r3373+708], r2402; +st.shared.u32 [r3373+792], r2432; +st.shared.u32 [r3373+796], r2439; +st.shared.u32 [r3373+880], r2469; +st.shared.u32 [r3373+884], r2476; +barrier.sync 0; +ld.shared.u32 r2558, [r3367]; +ld.shared.u32 r2561, [r3367+4]; +ld.shared.u32 r2498, [r3367+968]; +ld.shared.u32 r2501, [r3367+972]; +ld.shared.u32 r2510, [r3367+1936]; +ld.shared.u32 r2513, [r3367+1940]; +ld.shared.u32 r2522, [r3367+2904]; +ld.shared.u32 r2525, [r3367+2908]; +ld.shared.u32 r2534, [r3367+3872]; +ld.shared.u32 r2537, [r3367+3876]; +ld.shared.u32 r2546, [r3367+4840]; +ld.shared.u32 r2549, [r3367+4844]; +ld.shared.u32 r2547, [r3367+5808]; +ld.shared.u32 r2550, [r3367+5812]; +ld.shared.u32 r2535, [r3367+6776]; +ld.shared.u32 r2538, [r3367+6780]; +ld.shared.u32 r2523, [r3367+7744]; +ld.shared.u32 r2526, [r3367+7748]; +ld.shared.u32 r2511, [r3367+8712]; +ld.shared.u32 r2514, [r3367+8716]; +ld.shared.u32 r2499, [r3367+9680]; +ld.shared.u32 r2502, [r3367+9684]; +{ +add.f16x2 r2497, r2498, r2499; +} +{ +add.f16x2 r2500, r2501, r2502; +} +{ +sub.f16x2 r2503, r2498, r2499; +} +{ +sub.f16x2 r2506, r2501, r2502; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2514; +} +{ +sub.f16x2 r2515, r2510, r2511; +} +{ +sub.f16x2 r2518, r2513, r2514; +} +{ +add.f16x2 r2521, r2522, r2523; +} +{ +add.f16x2 r2524, r2525, r2526; +} +{ +sub.f16x2 r2527, r2522, r2523; +} +{ +sub.f16x2 r2530, r2525, r2526; +} +{ +add.f16x2 r2533, r2534, r2535; +} +{ +add.f16x2 r2536, r2537, r2538; +} +{ +sub.f16x2 r2539, r2534, r2535; +} +{ +sub.f16x2 r2542, r2537, r2538; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2497; +} +{ +add.f16x2 r2560, r2561, r2500; +} +{ +add.f16x2 r2563, r2557, r2509; +} +{ +add.f16x2 r2566, r2560, r2512; +} +{ +add.f16x2 r2569, r2563, r2521; +} +{ +add.f16x2 r2572, 
r2566, r2524; +} +{ +add.f16x2 r2575, r2569, r2533; +} +{ +add.f16x2 r2578, r2572, r2536; +} +{ +add.f16x2 %0, r2575, r2545; +} +{ +add.f16x2 %1, r2578, r2548; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2589, {low, high}; +} +{ +mul.f16x2 r2590, r2497, r2589; +} +{ +add.f16x2 r2593, r2558, r2590; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2596, {low, high}; +} +{ +mul.f16x2 r2597, r2506, r2596; +} +{ +add.f16x2 r2600, r2587, r2597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2603, {low, high}; +} +{ +mul.f16x2 r2604, r2500, r2603; +} +{ +add.f16x2 r2607, r2561, r2604; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2610, {low, high}; +} +{ +mul.f16x2 r2611, r2503, r2610; +} +{ +add.f16x2 r2614, r2588, r2611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2617, {low, high}; +} +{ +mul.f16x2 r2618, r2509, r2617; +} +{ +add.f16x2 r2621, r2593, r2618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2624, {low, high}; +} +{ +mul.f16x2 r2625, r2518, r2624; +} +{ +add.f16x2 r2628, r2600, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2631, {low, high}; +} +{ +mul.f16x2 r2632, r2512, r2631; +} +{ +add.f16x2 r2635, r2607, r2632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2638, {low, high}; +} +{ +mul.f16x2 r2639, r2515, r2638; +} +{ +add.f16x2 r2642, r2614, r2639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2645, {low, high}; +} 
+{ +mul.f16x2 r2646, r2521, r2645; +} +{ +add.f16x2 r2649, r2621, r2646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2530, r2652; +} +{ +add.f16x2 r2656, r2628, r2653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2659, {low, high}; +} +{ +mul.f16x2 r2660, r2524, r2659; +} +{ +add.f16x2 r2663, r2635, r2660; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2666, {low, high}; +} +{ +mul.f16x2 r2667, r2527, r2666; +} +{ +add.f16x2 r2670, r2642, r2667; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2673, {low, high}; +} +{ +mul.f16x2 r2674, r2533, r2673; +} +{ +add.f16x2 r2677, r2649, r2674; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2680, {low, high}; +} +{ +mul.f16x2 r2681, r2542, r2680; +} +{ +add.f16x2 r2684, r2656, r2681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2687, {low, high}; +} +{ +mul.f16x2 r2688, r2536, r2687; +} +{ +add.f16x2 r2691, r2663, r2688; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2694, {low, high}; +} +{ +mul.f16x2 r2695, r2539, r2694; +} +{ +add.f16x2 r2698, r2670, r2695; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2701, {low, high}; +} +{ +mul.f16x2 r2702, r2545, r2701; +} +{ +add.f16x2 r2705, r2677, r2702; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2708, {low, high}; +} +{ +mul.f16x2 r2709, r2554, r2708; +} +{ +add.f16x2 r2712, r2684, r2709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2715, {low, high}; +} +{ +mul.f16x2 r2716, r2548, r2715; +} +{ +add.f16x2 r2719, r2691, r2716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2722, {low, high}; +} +{ +mul.f16x2 r2723, r2551, r2722; +} +{ +add.f16x2 r2726, r2698, r2723; +} +{ +sub.f16x2 %2, r2705, r2712; +} +{ +add.f16x2 %3, r2719, r2726; +} +{ +add.f16x2 %20, r2705, r2712; +} +{ +sub.f16x2 %21, r2719, r2726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2743, {low, high}; +} +{ +mul.f16x2 r2744, r2497, r2743; +} +{ +add.f16x2 r2747, r2558, r2744; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2750, {low, high}; +} +{ +mul.f16x2 r2751, r2506, r2750; +} +{ +add.f16x2 r2754, r2741, r2751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2757, {low, high}; +} +{ +mul.f16x2 r2758, r2500, r2757; +} +{ +add.f16x2 r2761, r2561, r2758; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2764, {low, high}; +} +{ +mul.f16x2 r2765, r2503, r2764; +} +{ +add.f16x2 r2768, r2742, r2765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2771, {low, high}; +} +{ +mul.f16x2 r2772, r2509, r2771; +} +{ +add.f16x2 r2775, r2747, r2772; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2778, {low, high}; +} +{ +mul.f16x2 r2779, r2518, r2778; +} +{ +add.f16x2 r2782, r2754, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2785, {low, high}; +} +{ +mul.f16x2 r2786, r2512, r2785; +} +{ +add.f16x2 r2789, r2761, r2786; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2792, {low, high}; +} +{ +mul.f16x2 r2793, r2515, r2792; +} +{ +add.f16x2 r2796, 
r2768, r2793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2799, {low, high}; +} +{ +mul.f16x2 r2800, r2521, r2799; +} +{ +add.f16x2 r2803, r2775, r2800; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2806, {low, high}; +} +{ +mul.f16x2 r2807, r2530, r2806; +} +{ +add.f16x2 r2810, r2782, r2807; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2524, r2813; +} +{ +add.f16x2 r2817, r2789, r2814; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2820, {low, high}; +} +{ +mul.f16x2 r2821, r2527, r2820; +} +{ +add.f16x2 r2824, r2796, r2821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2827, {low, high}; +} +{ +mul.f16x2 r2828, r2533, r2827; +} +{ +add.f16x2 r2831, r2803, r2828; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2834, {low, high}; +} +{ +mul.f16x2 r2835, r2542, r2834; +} +{ +add.f16x2 r2838, r2810, r2835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2841, {low, high}; +} +{ +mul.f16x2 r2842, r2536, r2841; +} +{ +add.f16x2 r2845, r2817, r2842; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2848, {low, high}; +} +{ +mul.f16x2 r2849, r2539, r2848; +} +{ +add.f16x2 r2852, r2824, r2849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2855, {low, high}; +} +{ +mul.f16x2 r2856, r2545, r2855; +} +{ +add.f16x2 r2859, r2831, r2856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2862, {low, high}; +} +{ +mul.f16x2 r2863, r2554, r2862; +} +{ +add.f16x2 r2866, r2838, r2863; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2869, {low, 
high}; +} +{ +mul.f16x2 r2870, r2548, r2869; +} +{ +add.f16x2 r2873, r2845, r2870; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2876, {low, high}; +} +{ +mul.f16x2 r2877, r2551, r2876; +} +{ +add.f16x2 r2880, r2852, r2877; +} +{ +sub.f16x2 %4, r2859, r2866; +} +{ +add.f16x2 %5, r2873, r2880; +} +{ +add.f16x2 %18, r2859, r2866; +} +{ +sub.f16x2 %19, r2873, r2880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2897, {low, high}; +} +{ +mul.f16x2 r2898, r2497, r2897; +} +{ +add.f16x2 r2901, r2558, r2898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2904, {low, high}; +} +{ +mul.f16x2 r2905, r2506, r2904; +} +{ +add.f16x2 r2908, r2895, r2905; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2911, {low, high}; +} +{ +mul.f16x2 r2912, r2500, r2911; +} +{ +add.f16x2 r2915, r2561, r2912; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2918, {low, high}; +} +{ +mul.f16x2 r2919, r2503, r2918; +} +{ +add.f16x2 r2922, r2896, r2919; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2925, {low, high}; +} +{ +mul.f16x2 r2926, r2509, r2925; +} +{ +add.f16x2 r2929, r2901, r2926; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2932, {low, high}; +} +{ +mul.f16x2 r2933, r2518, r2932; +} +{ +add.f16x2 r2936, r2908, r2933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2939, {low, high}; +} +{ +mul.f16x2 r2940, r2512, r2939; +} +{ +add.f16x2 r2943, r2915, r2940; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2946, {low, high}; +} +{ +mul.f16x2 r2947, r2515, r2946; +} +{ +add.f16x2 r2950, r2922, r2947; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2953, {low, high}; +} +{ +mul.f16x2 r2954, r2521, r2953; +} +{ +add.f16x2 r2957, r2929, r2954; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2960, {low, high}; +} +{ +mul.f16x2 r2961, r2530, r2960; +} +{ +add.f16x2 r2964, r2936, r2961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2967, {low, high}; +} +{ +mul.f16x2 r2968, r2524, r2967; +} +{ +add.f16x2 r2971, r2943, r2968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2974, {low, high}; +} +{ +mul.f16x2 r2975, r2527, r2974; +} +{ +add.f16x2 r2978, r2950, r2975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2981, {low, high}; +} +{ +mul.f16x2 r2982, r2533, r2981; +} +{ +add.f16x2 r2985, r2957, r2982; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2988, {low, high}; +} +{ +mul.f16x2 r2989, r2542, r2988; +} +{ +add.f16x2 r2992, r2964, r2989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2995, {low, high}; +} +{ +mul.f16x2 r2996, r2536, r2995; +} +{ +add.f16x2 r2999, r2971, r2996; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3002, {low, high}; +} +{ +mul.f16x2 r3003, r2539, r3002; +} +{ +add.f16x2 r3006, r2978, r3003; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r2545, r3009; +} +{ +add.f16x2 r3013, r2985, r3010; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3016, {low, high}; +} +{ +mul.f16x2 r3017, r2554, r3016; +} +{ 
+add.f16x2 r3020, r2992, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3023, {low, high}; +} +{ +mul.f16x2 r3024, r2548, r3023; +} +{ +add.f16x2 r3027, r2999, r3024; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3030, {low, high}; +} +{ +mul.f16x2 r3031, r2551, r3030; +} +{ +add.f16x2 r3034, r3006, r3031; +} +{ +sub.f16x2 %6, r3013, r3020; +} +{ +add.f16x2 %7, r3027, r3034; +} +{ +add.f16x2 %16, r3013, r3020; +} +{ +sub.f16x2 %17, r3027, r3034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3051, {low, high}; +} +{ +mul.f16x2 r3052, r2497, r3051; +} +{ +add.f16x2 r3055, r2558, r3052; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r2506, r3058; +} +{ +add.f16x2 r3062, r3049, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3065, {low, high}; +} +{ +mul.f16x2 r3066, r2500, r3065; +} +{ +add.f16x2 r3069, r2561, r3066; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3072, {low, high}; +} +{ +mul.f16x2 r3073, r2503, r3072; +} +{ +add.f16x2 r3076, r3050, r3073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3079, {low, high}; +} +{ +mul.f16x2 r3080, r2509, r3079; +} +{ +add.f16x2 r3083, r3055, r3080; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3086, {low, high}; +} +{ +mul.f16x2 r3087, r2518, r3086; +} +{ +add.f16x2 r3090, r3062, r3087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 
r3093, {low, high}; +} +{ +mul.f16x2 r3094, r2512, r3093; +} +{ +add.f16x2 r3097, r3069, r3094; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r2515, r3100; +} +{ +add.f16x2 r3104, r3076, r3101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3107, {low, high}; +} +{ +mul.f16x2 r3108, r2521, r3107; +} +{ +add.f16x2 r3111, r3083, r3108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3114, {low, high}; +} +{ +mul.f16x2 r3115, r2530, r3114; +} +{ +add.f16x2 r3118, r3090, r3115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3121, {low, high}; +} +{ +mul.f16x2 r3122, r2524, r3121; +} +{ +add.f16x2 r3125, r3097, r3122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3128, {low, high}; +} +{ +mul.f16x2 r3129, r2527, r3128; +} +{ +add.f16x2 r3132, r3104, r3129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3135, {low, high}; +} +{ +mul.f16x2 r3136, r2533, r3135; +} +{ +add.f16x2 r3139, r3111, r3136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3142, {low, high}; +} +{ +mul.f16x2 r3143, r2542, r3142; +} +{ +add.f16x2 r3146, r3118, r3143; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3149, {low, high}; +} +{ +mul.f16x2 r3150, r2536, r3149; +} +{ +add.f16x2 r3153, r3125, r3150; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3156, {low, high}; +} +{ +mul.f16x2 r3157, r2539, r3156; +} +{ +add.f16x2 r3160, r3132, r3157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3163, {low, high}; +} +{ +mul.f16x2 r3164, r2545, r3163; +} +{ +add.f16x2 r3167, r3139, r3164; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3170, {low, high}; +} +{ +mul.f16x2 r3171, r2554, r3170; +} +{ +add.f16x2 r3174, r3146, r3171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r2548, r3177; +} +{ +add.f16x2 r3181, r3153, r3178; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3184, {low, high}; +} +{ +mul.f16x2 r3185, r2551, r3184; +} +{ +add.f16x2 r3188, r3160, r3185; +} +{ +sub.f16x2 %8, r3167, r3174; +} +{ +add.f16x2 %9, r3181, r3188; +} +{ +add.f16x2 %14, r3167, r3174; +} +{ +sub.f16x2 %15, r3181, r3188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3204, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3205, {low, high}; +} +{ +mul.f16x2 r3206, r2497, r3205; +} +{ +add.f16x2 r3209, r2558, r3206; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3212, {low, high}; +} +{ +mul.f16x2 r3213, r2506, r3212; +} +{ +add.f16x2 r3216, r3203, r3213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3219, {low, high}; +} +{ +mul.f16x2 r3220, r2500, r3219; +} +{ +add.f16x2 r3223, r2561, r3220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3226, {low, high}; +} +{ +mul.f16x2 r3227, r2503, r3226; +} +{ +add.f16x2 r3230, r3204, r3227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3233, {low, high}; +} +{ +mul.f16x2 r3234, r2509, r3233; +} +{ +add.f16x2 r3237, r3209, r3234; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3240, {low, high}; +} +{ +mul.f16x2 r3241, r2518, r3240; 
+} +{ +add.f16x2 r3244, r3216, r3241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3247, {low, high}; +} +{ +mul.f16x2 r3248, r2512, r3247; +} +{ +add.f16x2 r3251, r3223, r3248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3254, {low, high}; +} +{ +mul.f16x2 r3255, r2515, r3254; +} +{ +add.f16x2 r3258, r3230, r3255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3261, {low, high}; +} +{ +mul.f16x2 r3262, r2521, r3261; +} +{ +add.f16x2 r3265, r3237, r3262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r2530, r3268; +} +{ +add.f16x2 r3272, r3244, r3269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3275, {low, high}; +} +{ +mul.f16x2 r3276, r2524, r3275; +} +{ +add.f16x2 r3279, r3251, r3276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3282, {low, high}; +} +{ +mul.f16x2 r3283, r2527, r3282; +} +{ +add.f16x2 r3286, r3258, r3283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3289, {low, high}; +} +{ +mul.f16x2 r3290, r2533, r3289; +} +{ +add.f16x2 r3293, r3265, r3290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3296, {low, high}; +} +{ +mul.f16x2 r3297, r2542, r3296; +} +{ +add.f16x2 r3300, r3272, r3297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3303, {low, high}; +} +{ +mul.f16x2 r3304, r2536, r3303; +} +{ +add.f16x2 r3307, r3279, r3304; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3310, {low, high}; +} +{ +mul.f16x2 r3311, r2539, r3310; +} +{ +add.f16x2 r3314, r3286, r3311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, 
f221; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r2545, r3317; +} +{ +add.f16x2 r3321, r3293, r3318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3324, {low, high}; +} +{ +mul.f16x2 r3325, r2554, r3324; +} +{ +add.f16x2 r3328, r3300, r3325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3331, {low, high}; +} +{ +mul.f16x2 r3332, r2548, r3331; +} +{ +add.f16x2 r3335, r3307, r3332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3338, {low, high}; +} +{ +mul.f16x2 r3339, r2551, r3338; +} +{ +add.f16x2 r3342, r3314, r3339; +} +{ +sub.f16x2 %10, r3321, r3328; +} +{ +add.f16x2 %11, r3335, r3342; +} +{ +add.f16x2 %12, r3321, r3328; +} +{ +sub.f16x2 %13, r3335, r3342; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), 
"r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<927, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<718>; +.reg .b32 r<3374>; +.reg .b64 rd<8>; +{ +add.f16x2 r25, %25, %43; +} +{ +add.f16x2 r28, %26, %44; +} +{ +sub.f16x2 r31, %25, %43; +} +{ +sub.f16x2 r34, %26, %44; +} +{ +add.f16x2 r37, %27, %41; +} +{ +add.f16x2 r40, %28, %42; +} +{ +sub.f16x2 r43, %27, %41; +} +{ +sub.f16x2 r46, %28, %42; +} +{ +add.f16x2 r49, %29, %39; +} +{ +add.f16x2 r52, %30, %40; +} +{ +sub.f16x2 r55, %29, %39; +} +{ +sub.f16x2 r58, %30, %40; +} +{ +add.f16x2 r61, %31, %37; +} +{ +add.f16x2 r64, %32, %38; +} +{ +sub.f16x2 r67, %31, %37; +} +{ +sub.f16x2 r70, %32, %38; +} +{ +add.f16x2 r73, %33, %35; +} +{ +add.f16x2 r76, %34, %36; +} +{ +sub.f16x2 r79, %33, %35; +} +{ +sub.f16x2 r82, %34, %36; +} +{ +add.f16x2 r85, %23, r25; +} +{ +add.f16x2 r88, %24, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} +{ +add.f16x2 r106, r100, r64; +} +{ +add.f16x2 r109, r103, r73; +} +{ +add.f16x2 r112, r106, r76; +} +mov.f32 f183, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r116, {low, high}; +} +mov.f32 f197, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r117, {low, high}; +} +{ +mul.f16x2 r118, r25, r117; +} +{ +add.f16x2 r121, %23, r118; +} +mov.f32 f163, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r124, {low, high}; +} +{ +mul.f16x2 r125, r34, r124; +} +{ +add.f16x2 r128, r115, r125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r131, {low, high}; +} +{ +mul.f16x2 r132, r28, r131; +} +{ +add.f16x2 r135, %24, r132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r138, {low, high}; +} +{ +mul.f16x2 r139, r31, r138; +} +{ +add.f16x2 r142, r116, r139; +} +mov.f32 f213, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r145, {low, high}; +} +{ +mul.f16x2 r146, r37, r145; +} +{ +add.f16x2 r149, r121, r146; +} +mov.f32 f59, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r152, {low, high}; +} +{ +mul.f16x2 r153, r46, r152; +} +{ +add.f16x2 r156, r128, r153; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r159, {low, high}; +} +{ +mul.f16x2 r160, r40, r159; +} +{ +add.f16x2 r163, r135, r160; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r166, {low, high}; +} +{ +mul.f16x2 r167, r43, r166; +} +{ +add.f16x2 r170, r142, r167; +} +mov.f32 f221, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r173, {low, high}; +} +{ +mul.f16x2 r174, r49, r173; +} +{ +add.f16x2 r177, r149, r174; +} +mov.f32 f223, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r180, {low, high}; +} +{ +mul.f16x2 r181, r58, r180; +} +{ +add.f16x2 r184, r156, r181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r187, {low, high}; +} +{ +mul.f16x2 r188, r52, r187; +} +{ +add.f16x2 r191, r163, r188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r194, {low, high}; +} +{ +mul.f16x2 r195, r55, r194; 
+} +{ +add.f16x2 r198, r170, r195; +} +mov.f32 f205, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r201, {low, high}; +} +{ +mul.f16x2 r202, r61, r201; +} +{ +add.f16x2 r205, r177, r202; +} +mov.f32 f207, 0fBF4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r208, {low, high}; +} +{ +mul.f16x2 r209, r70, r208; +} +{ +add.f16x2 r212, r184, r209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r215, {low, high}; +} +{ +mul.f16x2 r216, r64, r215; +} +{ +add.f16x2 r219, r191, r216; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r222, {low, high}; +} +{ +mul.f16x2 r223, r67, r222; +} +{ +add.f16x2 r226, r198, r223; +} +mov.f32 f189, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r229, {low, high}; +} +{ +mul.f16x2 r230, r73, r229; +} +{ +add.f16x2 r233, r205, r230; +} +mov.f32 f191, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r236, {low, high}; +} +{ +mul.f16x2 r237, r82, r236; +} +{ +add.f16x2 r240, r212, r237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r243, {low, high}; +} +{ +mul.f16x2 r244, r76, r243; +} +{ +add.f16x2 r247, r219, r244; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r250, {low, high}; +} +{ +mul.f16x2 r251, r79, r250; +} +{ +add.f16x2 r254, r226, r251; +} +{ +sub.f16x2 r257, r233, r240; +} +{ +add.f16x2 r260, r247, r254; +} +{ +add.f16x2 r263, r233, r240; +} +{ +sub.f16x2 r266, r247, r254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r270, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; 
+cvt.rn.f16.f32 high, f213; +mov.b32 r271, {low, high}; +} +{ +mul.f16x2 r272, r25, r271; +} +{ +add.f16x2 r275, %23, r272; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r34, r278; +} +{ +add.f16x2 r282, r269, r279; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r285, {low, high}; +} +{ +mul.f16x2 r286, r28, r285; +} +{ +add.f16x2 r289, %24, r286; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r292, {low, high}; +} +{ +mul.f16x2 r293, r31, r292; +} +{ +add.f16x2 r296, r270, r293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r299, {low, high}; +} +{ +mul.f16x2 r300, r37, r299; +} +{ +add.f16x2 r303, r275, r300; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r306, {low, high}; +} +{ +mul.f16x2 r307, r46, r306; +} +{ +add.f16x2 r310, r282, r307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r313, {low, high}; +} +{ +mul.f16x2 r314, r40, r313; +} +{ +add.f16x2 r317, r289, r314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r320, {low, high}; +} +{ +mul.f16x2 r321, r43, r320; +} +{ +add.f16x2 r324, r296, r321; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r327, {low, high}; +} +{ +mul.f16x2 r328, r49, r327; +} +{ +add.f16x2 r331, r303, r328; +} +mov.f32 f111, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r334, {low, high}; +} +{ +mul.f16x2 r335, r58, r334; +} +{ +add.f16x2 r338, r310, r335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r341, {low, high}; +} +{ +mul.f16x2 r342, r52, r341; +} +{ +add.f16x2 r345, r317, r342; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r55, r348; +} +{ +add.f16x2 r352, r324, r349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r355, {low, high}; +} +{ +mul.f16x2 r356, r61, r355; +} +{ +add.f16x2 r359, r331, r356; +} +mov.f32 f155, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r362, {low, high}; +} +{ +mul.f16x2 r363, r70, r362; +} +{ +add.f16x2 r366, r338, r363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r369, {low, high}; +} +{ +mul.f16x2 r370, r64, r369; +} +{ +add.f16x2 r373, r345, r370; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r376, {low, high}; +} +{ +mul.f16x2 r377, r67, r376; +} +{ +add.f16x2 r380, r352, r377; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r383, {low, high}; +} +{ +mul.f16x2 r384, r73, r383; +} +{ +add.f16x2 r387, r359, r384; +} +mov.f32 f199, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r390, {low, high}; +} +{ +mul.f16x2 r391, r82, r390; +} +{ +add.f16x2 r394, r366, r391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r397, {low, high}; +} +{ +mul.f16x2 r398, r76, r397; +} +{ +add.f16x2 r401, r373, r398; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r404, {low, high}; +} +{ +mul.f16x2 r405, r79, r404; +} +{ +add.f16x2 r408, r380, r405; +} +{ +sub.f16x2 r411, r387, r394; +} +{ +add.f16x2 r414, r401, r408; +} +{ +add.f16x2 r417, r387, r394; +} +{ +sub.f16x2 r420, r401, r408; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r424, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r425, {low, high}; +} +{ +mul.f16x2 r426, r25, r425; +} +{ +add.f16x2 r429, %23, r426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r432, {low, high}; +} +{ +mul.f16x2 r433, r34, r432; +} +{ +add.f16x2 r436, r423, r433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r439, {low, high}; +} +{ +mul.f16x2 r440, r28, r439; +} +{ +add.f16x2 r443, %24, r440; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r31, r446; +} +{ +add.f16x2 r450, r424, r447; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r37, r453; +} +{ +add.f16x2 r457, r429, r454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r460, {low, high}; +} +{ +mul.f16x2 r461, r46, r460; +} +{ +add.f16x2 r464, r436, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r467, {low, high}; +} +{ +mul.f16x2 r468, r40, r467; +} +{ +add.f16x2 r471, r443, r468; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r474, {low, high}; +} +{ +mul.f16x2 r475, r43, r474; +} +{ +add.f16x2 r478, r450, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r481, {low, high}; +} +{ +mul.f16x2 r482, r49, r481; +} +{ +add.f16x2 r485, r457, r482; +} +mov.f32 f215, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r488, {low, high}; +} +{ +mul.f16x2 r489, r58, r488; +} +{ +add.f16x2 r492, r464, r489; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r495, {low, high}; +} +{ +mul.f16x2 r496, r52, r495; +} +{ +add.f16x2 r499, r471, r496; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r502, {low, high}; +} +{ +mul.f16x2 r503, r55, r502; +} +{ +add.f16x2 r506, r478, r503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r509, {low, high}; +} +{ +mul.f16x2 r510, r61, r509; +} +{ +add.f16x2 r513, r485, r510; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r516, {low, high}; +} +{ +mul.f16x2 r517, r70, r516; +} +{ +add.f16x2 r520, r492, r517; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r523, {low, high}; +} +{ +mul.f16x2 r524, r64, r523; +} +{ +add.f16x2 r527, r499, r524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r530, {low, high}; +} +{ +mul.f16x2 r531, r67, r530; +} +{ +add.f16x2 r534, r506, r531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r537, {low, high}; +} +{ +mul.f16x2 r538, r73, r537; +} +{ +add.f16x2 r541, r513, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r544, {low, high}; +} +{ +mul.f16x2 r545, r82, r544; +} +{ +add.f16x2 r548, r520, r545; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r551, {low, high}; +} +{ +mul.f16x2 r552, r76, r551; +} +{ +add.f16x2 r555, r527, r552; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r558, {low, high}; +} +{ +mul.f16x2 r559, r79, r558; +} +{ +add.f16x2 r562, r534, r559; +} +{ +sub.f16x2 r565, r541, r548; +} +{ +add.f16x2 r568, r555, r562; +} +{ +add.f16x2 r571, r541, r548; +} +{ +sub.f16x2 r574, r555, r562; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r578, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r579, {low, high}; +} +{ +mul.f16x2 r580, r25, r579; +} +{ +add.f16x2 r583, %23, r580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r586, {low, high}; +} +{ +mul.f16x2 r587, r34, r586; +} +{ +add.f16x2 r590, r577, r587; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r593, {low, high}; +} +{ +mul.f16x2 r594, r28, r593; +} +{ +add.f16x2 r597, %24, r594; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r31, r600; +} +{ +add.f16x2 r604, r578, r601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r37, r607; +} +{ +add.f16x2 r611, r583, r608; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r615, r46, r614; +} +{ +add.f16x2 r618, r590, r615; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r621, {low, high}; +} +{ +mul.f16x2 r622, r40, r621; +} +{ +add.f16x2 r625, r597, r622; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r628, {low, high}; +} +{ +mul.f16x2 r629, r43, r628; +} +{ +add.f16x2 r632, r604, r629; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r635, {low, high}; +} +{ +mul.f16x2 r636, r49, r635; +} +{ +add.f16x2 r639, r611, r636; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r642, {low, high}; +} +{ +mul.f16x2 r643, r58, r642; +} +{ +add.f16x2 r646, r618, r643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r649, {low, high}; +} +{ +mul.f16x2 r650, r52, r649; +} +{ +add.f16x2 r653, r625, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r656, {low, high}; +} +{ +mul.f16x2 r657, r55, r656; +} +{ +add.f16x2 r660, r632, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r663, {low, high}; +} +{ +mul.f16x2 r664, r61, r663; +} +{ +add.f16x2 r667, r639, r664; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r670, {low, high}; +} +{ +mul.f16x2 r671, r70, r670; +} +{ +add.f16x2 r674, r646, r671; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r677, {low, high}; +} +{ +mul.f16x2 r678, r64, r677; +} +{ +add.f16x2 r681, r653, r678; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r684, {low, high}; +} +{ +mul.f16x2 r685, r67, r684; +} +{ +add.f16x2 r688, r660, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r691, {low, high}; +} +{ +mul.f16x2 r692, r73, r691; +} +{ +add.f16x2 r695, r667, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r82, r698; +} +{ +add.f16x2 r702, r674, r699; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r705, {low, high}; +} +{ +mul.f16x2 r706, r76, r705; +} +{ +add.f16x2 r709, r681, r706; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r712, {low, high}; +} +{ +mul.f16x2 r713, r79, r712; +} +{ +add.f16x2 r716, r688, r713; +} +{ +sub.f16x2 r719, r695, r702; +} +{ +add.f16x2 r722, r709, r716; +} +{ +add.f16x2 r725, r695, r702; +} +{ +sub.f16x2 r728, r709, r716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; 
+cvt.rn.f16.f32 high, f189; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r25, r733; +} +{ +add.f16x2 r737, %23, r734; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r740, {low, high}; +} +{ +mul.f16x2 r741, r34, r740; +} +{ +add.f16x2 r744, r731, r741; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r28, r747; +} +{ +add.f16x2 r751, %24, r748; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r754, {low, high}; +} +{ +mul.f16x2 r755, r31, r754; +} +{ +add.f16x2 r758, r732, r755; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r37, r761; +} +{ +add.f16x2 r765, r737, r762; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r768, {low, high}; +} +{ +mul.f16x2 r769, r46, r768; +} +{ +add.f16x2 r772, r744, r769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r775, {low, high}; +} +{ +mul.f16x2 r776, r40, r775; +} +{ +add.f16x2 r779, r751, r776; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r782, {low, high}; +} +{ +mul.f16x2 r783, r43, r782; +} +{ +add.f16x2 r786, r758, r783; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r789, {low, high}; +} +{ +mul.f16x2 r790, r49, r789; +} +{ +add.f16x2 r793, r765, r790; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r796, {low, high}; +} +{ +mul.f16x2 r797, r58, r796; +} +{ +add.f16x2 r800, r772, r797; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r803, {low, high}; +} +{ +mul.f16x2 r804, r52, r803; +} +{ +add.f16x2 r807, r779, r804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, 
f207; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r55, r810; +} +{ +add.f16x2 r814, r786, r811; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r817, {low, high}; +} +{ +mul.f16x2 r818, r61, r817; +} +{ +add.f16x2 r821, r793, r818; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r824, {low, high}; +} +{ +mul.f16x2 r825, r70, r824; +} +{ +add.f16x2 r828, r800, r825; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r831, {low, high}; +} +{ +mul.f16x2 r832, r64, r831; +} +{ +add.f16x2 r835, r807, r832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r838, {low, high}; +} +{ +mul.f16x2 r839, r67, r838; +} +{ +add.f16x2 r842, r814, r839; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r845, {low, high}; +} +{ +mul.f16x2 r846, r73, r845; +} +{ +add.f16x2 r849, r821, r846; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r852, {low, high}; +} +{ +mul.f16x2 r853, r82, r852; +} +{ +add.f16x2 r856, r828, r853; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r859, {low, high}; +} +{ +mul.f16x2 r860, r76, r859; +} +{ +add.f16x2 r863, r835, r860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r79, r866; +} +{ +add.f16x2 r870, r842, r867; +} +{ +sub.f16x2 r873, r849, r856; +} +{ +add.f16x2 r876, r863, r870; +} +{ +add.f16x2 r879, r849, r856; +} +{ +sub.f16x2 r882, r863, r870; +} +mov.u32 r23, %tid.x; +mul.wide.u32 rd2, r23, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r885, rd3; +sub.s32 r886, r23, r885; +shr.u32 r887, r886, 1; +add.s32 r888, r887, r885; +shr.u32 r889, r888, 6; +mul.lo.s32 r890, r889, 121; +sub.s32 r24, r23, r890; +cvt.rn.f32.u32 f225, r24; +mul.f32 f1, f225, 
0f3B9AAFAF; +setp.eq.s32 p1, r24, 18; +mov.f32 f717, 0f3DADD00E; +@p1 bra LBB1_2; +sin.approx.f32 f717, f1; +LBB1_2: +mov.u32 r3362, %tid.y; +mov.u32 r3363, %22; +mad.lo.s32 r3364, r3362, 5324, r3363; +mad.lo.s32 r3365, r889, 5324, r3364; +neg.f32 f227, f717; +cos.approx.f32 f226, f1; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f227; +mov.b32 r891, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r260, r896; +} +{ +neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r257, r894, r901; +} +{ +mul.f16x2 r907, r257, r896; +} +{ +fma.rn.f16x2 r910, r260, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r916, {high, high}; +} +mov.f32 f490, 0fBF800000; +mov.f32 f491, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r891, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r414, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r411, r931, r938; +} +{ +mul.f16x2 r944, r411, r933; +} +{ +fma.rn.f16x2 r947, r414, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r568, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r565, r968, r975; +} +{ +mul.f16x2 r981, r565, r970; +} +{ +fma.rn.f16x2 r984, r568, r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r722, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r719, r1005, r1012; +} +{ +mul.f16x2 r1018, r719, r1007; +} +{ +fma.rn.f16x2 r1021, r722, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ 
+mul.f16x2 r1046, r876, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r873, r1042, r1049; +} +{ +mul.f16x2 r1055, r873, r1044; +} +{ +fma.rn.f16x2 r1058, r876, r1042, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1066, {low, high}; +} +{ +mul.f16x2 r1067, r1064, r1066; +} +{ +mul.f16x2 r1070, r1038, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1073, {high, low}; +} +{ +fma.rn.f16x2 r1075, r1067, r1073, r1070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1081, {high, high}; +} +{ +mul.f16x2 r1083, r882, r1081; +} +{ +neg.f16x2 r1086, r1083; +} +{ +fma.rn.f16x2 r1088, r879, r1079, r1086; +} +{ +mul.f16x2 r1092, r879, r1081; +} +{ +fma.rn.f16x2 r1095, r882, r1079, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1103, {low, high}; +} +{ +mul.f16x2 r1104, r1101, r1103; +} +{ +mul.f16x2 r1107, r1075, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1110, {high, low}; +} +{ +fma.rn.f16x2 r1112, r1104, r1110, r1107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r728, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r725, r1116, r1123; +} +{ +mul.f16x2 r1129, r725, r1118; +} +{ +fma.rn.f16x2 r1132, r728, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1136, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1112, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r574, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r571, r1153, r1160; +} +{ +mul.f16x2 r1166, r571, r1155; +} +{ +fma.rn.f16x2 r1169, r574, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r420, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r417, r1190, r1197; +} +{ +mul.f16x2 r1203, r417, r1192; +} +{ +fma.rn.f16x2 r1206, r420, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, 
r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r266, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r263, r1227, r1234; +} +{ +mul.f16x2 r1240, r263, r1229; +} +{ +fma.rn.f16x2 r1243, r266, r1227, r1240; +} +barrier.sync 0; +mad.lo.s32 r3366, r24, 44, r3365; +st.shared.u32 [r3366], r109; +st.shared.u32 [r3366+4], r903; +st.shared.u32 [r3366+8], r940; +st.shared.u32 [r3366+12], r977; +st.shared.u32 [r3366+16], r1014; +st.shared.u32 [r3366+20], r1051; +st.shared.u32 [r3366+24], r1088; +st.shared.u32 [r3366+28], r1125; +st.shared.u32 [r3366+32], r1162; +st.shared.u32 [r3366+36], r1199; +st.shared.u32 [r3366+40], r1236; +barrier.sync 0; +mad.lo.s32 r3367, r24, -40, r3366; +ld.shared.u32 r1325, [r3367]; +ld.shared.u32 r1265, [r3367+484]; +ld.shared.u32 r1277, [r3367+968]; +ld.shared.u32 r1289, [r3367+1452]; +ld.shared.u32 r1301, [r3367+1936]; +ld.shared.u32 r1313, [r3367+2420]; +ld.shared.u32 r1314, [r3367+2904]; +ld.shared.u32 r1302, [r3367+3388]; +ld.shared.u32 r1290, [r3367+3872]; +ld.shared.u32 r1278, [r3367+4356]; +ld.shared.u32 r1266, [r3367+4840]; +barrier.sync 0; +st.shared.u32 [r3366], r112; +st.shared.u32 [r3366+4], r910; +st.shared.u32 [r3366+8], r947; +st.shared.u32 [r3366+12], r984; +st.shared.u32 [r3366+16], r1021; +st.shared.u32 [r3366+20], r1058; +st.shared.u32 [r3366+24], r1095; +st.shared.u32 [r3366+28], r1132; +st.shared.u32 [r3366+32], r1169; +st.shared.u32 [r3366+36], r1206; +st.shared.u32 [r3366+40], r1243; +barrier.sync 0; +ld.shared.u32 r1328, [r3367]; +ld.shared.u32 r1268, [r3367+484]; +ld.shared.u32 r1280, [r3367+968]; +ld.shared.u32 r1292, [r3367+1452]; +ld.shared.u32 r1304, [r3367+1936]; +ld.shared.u32 r1316, [r3367+2420]; +ld.shared.u32 
r1317, [r3367+2904]; +ld.shared.u32 r1305, [r3367+3388]; +ld.shared.u32 r1293, [r3367+3872]; +ld.shared.u32 r1281, [r3367+4356]; +ld.shared.u32 r1269, [r3367+4840]; +{ +add.f16x2 r1264, r1265, r1266; +} +{ +add.f16x2 r1267, r1268, r1269; +} +{ +sub.f16x2 r1270, r1265, r1266; +} +{ +sub.f16x2 r1273, r1268, r1269; +} +{ +add.f16x2 r1276, r1277, r1278; +} +{ +add.f16x2 r1279, r1280, r1281; +} +{ +sub.f16x2 r1282, r1277, r1278; +} +{ +sub.f16x2 r1285, r1280, r1281; +} +{ +add.f16x2 r1288, r1289, r1290; +} +{ +add.f16x2 r1291, r1292, r1293; +} +{ +sub.f16x2 r1294, r1289, r1290; +} +{ +sub.f16x2 r1297, r1292, r1293; +} +{ +add.f16x2 r1300, r1301, r1302; +} +{ +add.f16x2 r1303, r1304, r1305; +} +{ +sub.f16x2 r1306, r1301, r1302; +} +{ +sub.f16x2 r1309, r1304, r1305; +} +{ +add.f16x2 r1312, r1313, r1314; +} +{ +add.f16x2 r1315, r1316, r1317; +} +{ +sub.f16x2 r1318, r1313, r1314; +} +{ +sub.f16x2 r1321, r1316, r1317; +} +{ +add.f16x2 r1324, r1325, r1264; +} +{ +add.f16x2 r1327, r1328, r1267; +} +{ +add.f16x2 r1330, r1324, r1276; +} +{ +add.f16x2 r1333, r1327, r1279; +} +{ +add.f16x2 r1336, r1330, r1288; +} +{ +add.f16x2 r1339, r1333, r1291; +} +{ +add.f16x2 r1342, r1336, r1300; +} +{ +add.f16x2 r1345, r1339, r1303; +} +{ +add.f16x2 r1348, r1342, r1312; +} +{ +add.f16x2 r1351, r1345, r1315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1356, {low, high}; +} +{ +mul.f16x2 r1357, r1264, r1356; +} +{ +add.f16x2 r1360, r1325, r1357; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1363, {low, high}; +} +{ +mul.f16x2 r1364, r1273, r1363; +} +{ +add.f16x2 r1367, r1354, r1364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 
r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1267, r1370; +} +{ +add.f16x2 r1374, r1328, r1371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1377, {low, high}; +} +{ +mul.f16x2 r1378, r1270, r1377; +} +{ +add.f16x2 r1381, r1355, r1378; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1384, {low, high}; +} +{ +mul.f16x2 r1385, r1276, r1384; +} +{ +add.f16x2 r1388, r1360, r1385; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1391, {low, high}; +} +{ +mul.f16x2 r1392, r1285, r1391; +} +{ +add.f16x2 r1395, r1367, r1392; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1398, {low, high}; +} +{ +mul.f16x2 r1399, r1279, r1398; +} +{ +add.f16x2 r1402, r1374, r1399; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1405, {low, high}; +} +{ +mul.f16x2 r1406, r1282, r1405; +} +{ +add.f16x2 r1409, r1381, r1406; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1412, {low, high}; +} +{ +mul.f16x2 r1413, r1288, r1412; +} +{ +add.f16x2 r1416, r1388, r1413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1419, {low, high}; +} +{ +mul.f16x2 r1420, r1297, r1419; +} +{ +add.f16x2 r1423, r1395, r1420; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1426, {low, high}; +} +{ +mul.f16x2 r1427, r1291, r1426; +} +{ +add.f16x2 r1430, r1402, r1427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1433, {low, high}; +} +{ +mul.f16x2 r1434, r1294, r1433; +} +{ +add.f16x2 r1437, r1409, r1434; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1440, {low, high}; +} +{ +mul.f16x2 r1441, r1300, r1440; +} +{ +add.f16x2 r1444, r1416, r1441; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1447, {low, high}; +} +{ +mul.f16x2 r1448, r1309, r1447; +} +{ +add.f16x2 r1451, r1423, r1448; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1454, {low, high}; +} +{ +mul.f16x2 r1455, r1303, r1454; +} +{ +add.f16x2 r1458, r1430, r1455; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1461, {low, high}; +} +{ +mul.f16x2 r1462, r1306, r1461; +} +{ +add.f16x2 r1465, r1437, r1462; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1468, {low, high}; +} +{ +mul.f16x2 r1469, r1312, r1468; +} +{ +add.f16x2 r1472, r1444, r1469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1475, {low, high}; +} +{ +mul.f16x2 r1476, r1321, r1475; +} +{ +add.f16x2 r1479, r1451, r1476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1482, {low, high}; +} +{ +mul.f16x2 r1483, r1315, r1482; +} +{ +add.f16x2 r1486, r1458, r1483; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1489, {low, high}; +} +{ +mul.f16x2 r1490, r1318, r1489; +} +{ +add.f16x2 r1493, r1465, r1490; +} +{ +sub.f16x2 r1496, r1472, r1479; +} +{ +add.f16x2 r1499, r1486, r1493; +} +{ +add.f16x2 r1502, r1472, r1479; +} +{ +sub.f16x2 r1505, r1486, r1493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1510, {low, high}; +} +{ +mul.f16x2 r1511, r1264, r1510; +} +{ +add.f16x2 r1514, r1325, r1511; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1517, {low, high}; +} +{ +mul.f16x2 r1518, r1273, r1517; 
+} +{ +add.f16x2 r1521, r1508, r1518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1524, {low, high}; +} +{ +mul.f16x2 r1525, r1267, r1524; +} +{ +add.f16x2 r1528, r1328, r1525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1531, {low, high}; +} +{ +mul.f16x2 r1532, r1270, r1531; +} +{ +add.f16x2 r1535, r1509, r1532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1538, {low, high}; +} +{ +mul.f16x2 r1539, r1276, r1538; +} +{ +add.f16x2 r1542, r1514, r1539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1545, {low, high}; +} +{ +mul.f16x2 r1546, r1285, r1545; +} +{ +add.f16x2 r1549, r1521, r1546; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1552, {low, high}; +} +{ +mul.f16x2 r1553, r1279, r1552; +} +{ +add.f16x2 r1556, r1528, r1553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1559, {low, high}; +} +{ +mul.f16x2 r1560, r1282, r1559; +} +{ +add.f16x2 r1563, r1535, r1560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1566, {low, high}; +} +{ +mul.f16x2 r1567, r1288, r1566; +} +{ +add.f16x2 r1570, r1542, r1567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1573, {low, high}; +} +{ +mul.f16x2 r1574, r1297, r1573; +} +{ +add.f16x2 r1577, r1549, r1574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1291, r1580; +} +{ +add.f16x2 r1584, r1556, r1581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1587, {low, high}; +} +{ +mul.f16x2 r1588, r1294, r1587; +} +{ +add.f16x2 r1591, r1563, r1588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; 
+mov.b32 r1594, {low, high}; +} +{ +mul.f16x2 r1595, r1300, r1594; +} +{ +add.f16x2 r1598, r1570, r1595; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1601, {low, high}; +} +{ +mul.f16x2 r1602, r1309, r1601; +} +{ +add.f16x2 r1605, r1577, r1602; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1608, {low, high}; +} +{ +mul.f16x2 r1609, r1303, r1608; +} +{ +add.f16x2 r1612, r1584, r1609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1615, {low, high}; +} +{ +mul.f16x2 r1616, r1306, r1615; +} +{ +add.f16x2 r1619, r1591, r1616; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1622, {low, high}; +} +{ +mul.f16x2 r1623, r1312, r1622; +} +{ +add.f16x2 r1626, r1598, r1623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1629, {low, high}; +} +{ +mul.f16x2 r1630, r1321, r1629; +} +{ +add.f16x2 r1633, r1605, r1630; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1636, {low, high}; +} +{ +mul.f16x2 r1637, r1315, r1636; +} +{ +add.f16x2 r1640, r1612, r1637; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1643, {low, high}; +} +{ +mul.f16x2 r1644, r1318, r1643; +} +{ +add.f16x2 r1647, r1619, r1644; +} +{ +sub.f16x2 r1650, r1626, r1633; +} +{ +add.f16x2 r1653, r1640, r1647; +} +{ +add.f16x2 r1656, r1626, r1633; +} +{ +sub.f16x2 r1659, r1640, r1647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1662, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1665, r1264, r1664; +} +{ +add.f16x2 r1668, r1325, r1665; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1671, {low, high}; +} +{ +mul.f16x2 r1672, r1273, r1671; +} +{ +add.f16x2 r1675, r1662, r1672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1678, {low, high}; +} +{ +mul.f16x2 r1679, r1267, r1678; +} +{ +add.f16x2 r1682, r1328, r1679; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1685, {low, high}; +} +{ +mul.f16x2 r1686, r1270, r1685; +} +{ +add.f16x2 r1689, r1663, r1686; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1692, {low, high}; +} +{ +mul.f16x2 r1693, r1276, r1692; +} +{ +add.f16x2 r1696, r1668, r1693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1699, {low, high}; +} +{ +mul.f16x2 r1700, r1285, r1699; +} +{ +add.f16x2 r1703, r1675, r1700; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1706, {low, high}; +} +{ +mul.f16x2 r1707, r1279, r1706; +} +{ +add.f16x2 r1710, r1682, r1707; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1713, {low, high}; +} +{ +mul.f16x2 r1714, r1282, r1713; +} +{ +add.f16x2 r1717, r1689, r1714; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1720, {low, high}; +} +{ +mul.f16x2 r1721, r1288, r1720; +} +{ +add.f16x2 r1724, r1696, r1721; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1297, r1727; +} +{ +add.f16x2 r1731, r1703, r1728; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1734, {low, high}; +} +{ +mul.f16x2 r1735, r1291, r1734; +} +{ +add.f16x2 r1738, r1710, r1735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 
r1742, r1294, r1741; +} +{ +add.f16x2 r1745, r1717, r1742; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1748, {low, high}; +} +{ +mul.f16x2 r1749, r1300, r1748; +} +{ +add.f16x2 r1752, r1724, r1749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1755, {low, high}; +} +{ +mul.f16x2 r1756, r1309, r1755; +} +{ +add.f16x2 r1759, r1731, r1756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1762, {low, high}; +} +{ +mul.f16x2 r1763, r1303, r1762; +} +{ +add.f16x2 r1766, r1738, r1763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 r1770, r1306, r1769; +} +{ +add.f16x2 r1773, r1745, r1770; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1776, {low, high}; +} +{ +mul.f16x2 r1777, r1312, r1776; +} +{ +add.f16x2 r1780, r1752, r1777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1321, r1783; +} +{ +add.f16x2 r1787, r1759, r1784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1790, {low, high}; +} +{ +mul.f16x2 r1791, r1315, r1790; +} +{ +add.f16x2 r1794, r1766, r1791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1318, r1797; +} +{ +add.f16x2 r1801, r1773, r1798; +} +{ +sub.f16x2 r1804, r1780, r1787; +} +{ +add.f16x2 r1807, r1794, r1801; +} +{ +add.f16x2 r1810, r1780, r1787; +} +{ +sub.f16x2 r1813, r1794, r1801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1817, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; 
+cvt.rn.f16.f32 high, f205; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1264, r1818; +} +{ +add.f16x2 r1822, r1325, r1819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1825, {low, high}; +} +{ +mul.f16x2 r1826, r1273, r1825; +} +{ +add.f16x2 r1829, r1816, r1826; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1267, r1832; +} +{ +add.f16x2 r1836, r1328, r1833; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1839, {low, high}; +} +{ +mul.f16x2 r1840, r1270, r1839; +} +{ +add.f16x2 r1843, r1817, r1840; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1846, {low, high}; +} +{ +mul.f16x2 r1847, r1276, r1846; +} +{ +add.f16x2 r1850, r1822, r1847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1853, {low, high}; +} +{ +mul.f16x2 r1854, r1285, r1853; +} +{ +add.f16x2 r1857, r1829, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1860, {low, high}; +} +{ +mul.f16x2 r1861, r1279, r1860; +} +{ +add.f16x2 r1864, r1836, r1861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1867, {low, high}; +} +{ +mul.f16x2 r1868, r1282, r1867; +} +{ +add.f16x2 r1871, r1843, r1868; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1874, {low, high}; +} +{ +mul.f16x2 r1875, r1288, r1874; +} +{ +add.f16x2 r1878, r1850, r1875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1881, {low, high}; +} +{ +mul.f16x2 r1882, r1297, r1881; +} +{ +add.f16x2 r1885, r1857, r1882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1291, r1888; +} +{ +add.f16x2 r1892, 
r1864, r1889; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1895, {low, high}; +} +{ +mul.f16x2 r1896, r1294, r1895; +} +{ +add.f16x2 r1899, r1871, r1896; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1902, {low, high}; +} +{ +mul.f16x2 r1903, r1300, r1902; +} +{ +add.f16x2 r1906, r1878, r1903; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1909, {low, high}; +} +{ +mul.f16x2 r1910, r1309, r1909; +} +{ +add.f16x2 r1913, r1885, r1910; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1916, {low, high}; +} +{ +mul.f16x2 r1917, r1303, r1916; +} +{ +add.f16x2 r1920, r1892, r1917; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1923, {low, high}; +} +{ +mul.f16x2 r1924, r1306, r1923; +} +{ +add.f16x2 r1927, r1899, r1924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1930, {low, high}; +} +{ +mul.f16x2 r1931, r1312, r1930; +} +{ +add.f16x2 r1934, r1906, r1931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1937, {low, high}; +} +{ +mul.f16x2 r1938, r1321, r1937; +} +{ +add.f16x2 r1941, r1913, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1944, {low, high}; +} +{ +mul.f16x2 r1945, r1315, r1944; +} +{ +add.f16x2 r1948, r1920, r1945; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1951, {low, high}; +} +{ +mul.f16x2 r1952, r1318, r1951; +} +{ +add.f16x2 r1955, r1927, r1952; +} +{ +sub.f16x2 r1958, r1934, r1941; +} +{ +add.f16x2 r1961, r1948, r1955; +} +{ +add.f16x2 r1964, r1934, r1941; +} +{ +sub.f16x2 r1967, r1948, r1955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1970, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1972, {low, high}; +} +{ +mul.f16x2 r1973, r1264, r1972; +} +{ +add.f16x2 r1976, r1325, r1973; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1979, {low, high}; +} +{ +mul.f16x2 r1980, r1273, r1979; +} +{ +add.f16x2 r1983, r1970, r1980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1986, {low, high}; +} +{ +mul.f16x2 r1987, r1267, r1986; +} +{ +add.f16x2 r1990, r1328, r1987; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1993, {low, high}; +} +{ +mul.f16x2 r1994, r1270, r1993; +} +{ +add.f16x2 r1997, r1971, r1994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2000, {low, high}; +} +{ +mul.f16x2 r2001, r1276, r2000; +} +{ +add.f16x2 r2004, r1976, r2001; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2007, {low, high}; +} +{ +mul.f16x2 r2008, r1285, r2007; +} +{ +add.f16x2 r2011, r1983, r2008; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2014, {low, high}; +} +{ +mul.f16x2 r2015, r1279, r2014; +} +{ +add.f16x2 r2018, r1990, r2015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2021, {low, high}; +} +{ +mul.f16x2 r2022, r1282, r2021; +} +{ +add.f16x2 r2025, r1997, r2022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2028, {low, high}; +} +{ +mul.f16x2 r2029, r1288, r2028; +} +{ +add.f16x2 r2032, r2004, r2029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2035, {low, high}; +} +{ +mul.f16x2 r2036, r1297, r2035; +} +{ +add.f16x2 r2039, r2011, r2036; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2042, {low, high}; +} +{ +mul.f16x2 r2043, r1291, r2042; +} +{ +add.f16x2 r2046, r2018, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2049, {low, high}; +} +{ +mul.f16x2 r2050, r1294, r2049; +} +{ +add.f16x2 r2053, r2025, r2050; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2057, r1300, r2056; +} +{ +add.f16x2 r2060, r2032, r2057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2063, {low, high}; +} +{ +mul.f16x2 r2064, r1309, r2063; +} +{ +add.f16x2 r2067, r2039, r2064; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r1303, r2070; +} +{ +add.f16x2 r2074, r2046, r2071; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r1306, r2077; +} +{ +add.f16x2 r2081, r2053, r2078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r1312, r2084; +} +{ +add.f16x2 r2088, r2060, r2085; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r1321, r2091; +} +{ +add.f16x2 r2095, r2067, r2092; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2098, {low, high}; +} +{ +mul.f16x2 r2099, r1315, r2098; +} +{ +add.f16x2 r2102, r2074, r2099; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2105, {low, high}; +} +{ +mul.f16x2 r2106, r1318, r2105; +} +{ +add.f16x2 r2109, r2081, r2106; +} +{ +sub.f16x2 r2112, r2088, r2095; +} +{ +add.f16x2 r2115, r2102, r2109; +} +{ +add.f16x2 r2118, r2088, r2095; +} +{ +sub.f16x2 r2121, r2102, 
r2109; +} +mul.wide.u32 rd6, r24, -1171354717; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r3368, rd7; +mul.lo.s32 r3369, r3368, 11; +sub.s32 r3370, r24, r3369; +shl.b32 r3371, r3370, 2; +add.s32 r3372, r3365, r3371; +cvt.rn.f32.u32 f714, r3368; +mul.f32 f715, f714, 0f3D54B191; +cos.approx.f32 f470, f715; +sin.approx.f32 f716, f715; +neg.f32 f471, f716; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f470; +cvt.rn.f16.f32 high, f471; +mov.b32 r2124, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1499, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1496, r2127, r2134; +} +{ +mul.f16x2 r2140, r1496, r2129; +} +{ +fma.rn.f16x2 r2143, r1499, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2124, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1653, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1650, r2164, r2171; +} +{ +mul.f16x2 r2177, r1650, r2166; +} +{ +fma.rn.f16x2 r2180, r1653, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2188, {low, high}; +} +{ 
+mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1807, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1804, r2201, r2208; +} +{ +mul.f16x2 r2214, r1804, r2203; +} +{ +fma.rn.f16x2 r2217, r1807, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1961, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1958, r2238, r2245; +} +{ +mul.f16x2 r2251, r1958, r2240; +} +{ +fma.rn.f16x2 r2254, r1961, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; 
+mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r2115, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r2112, r2275, r2282; +} +{ +mul.f16x2 r2288, r2112, r2277; +} +{ +fma.rn.f16x2 r2291, r2115, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r2121, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r2118, r2312, r2319; +} +{ +mul.f16x2 r2325, r2118, r2314; +} +{ +fma.rn.f16x2 r2328, r2121, r2312, r2325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2334, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2337, r2334, r2336; +} +{ +mul.f16x2 r2340, r2308, r2332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2343, {high, low}; +} +{ +fma.rn.f16x2 r2345, r2337, r2343, r2340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2349, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2351, {high, high}; +} +{ +mul.f16x2 r2353, r1967, r2351; +} +{ +neg.f16x2 r2356, r2353; +} +{ +fma.rn.f16x2 r2358, r1964, r2349, r2356; +} +{ +mul.f16x2 r2362, 
r1964, r2351; +} +{ +fma.rn.f16x2 r2365, r1967, r2349, r2362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2371, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2373, {low, high}; +} +{ +mul.f16x2 r2374, r2371, r2373; +} +{ +mul.f16x2 r2377, r2345, r2369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2380, {high, low}; +} +{ +fma.rn.f16x2 r2382, r2374, r2380, r2377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2388, {high, high}; +} +{ +mul.f16x2 r2390, r1813, r2388; +} +{ +neg.f16x2 r2393, r2390; +} +{ +fma.rn.f16x2 r2395, r1810, r2386, r2393; +} +{ +mul.f16x2 r2399, r1810, r2388; +} +{ +fma.rn.f16x2 r2402, r1813, r2386, r2399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2408, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2410, {low, high}; +} +{ +mul.f16x2 r2411, r2408, r2410; +} +{ +mul.f16x2 r2414, r2382, r2406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2417, {high, low}; +} +{ +fma.rn.f16x2 r2419, r2411, r2417, r2414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2423, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2425, {high, high}; +} +{ +mul.f16x2 r2427, r1659, r2425; +} +{ +neg.f16x2 r2430, r2427; +} +{ +fma.rn.f16x2 r2432, r1656, r2423, r2430; +} +{ +mul.f16x2 r2436, r1656, r2425; +} +{ +fma.rn.f16x2 r2439, r1659, r2423, r2436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2443, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2445, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2447, {low, high}; +} +{ +mul.f16x2 r2448, r2445, r2447; +} +{ +mul.f16x2 r2451, r2419, r2443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2454, {high, low}; +} +{ +fma.rn.f16x2 r2456, r2448, r2454, r2451; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2460, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2462, {high, high}; +} +{ +mul.f16x2 r2464, r1505, r2462; +} +{ +neg.f16x2 r2467, r2464; +} +{ +fma.rn.f16x2 r2469, r1502, r2460, r2467; +} +{ +mul.f16x2 r2473, r1502, r2462; +} +{ +fma.rn.f16x2 r2476, r1505, r2460, r2473; +} +barrier.sync 0; +mad.lo.s32 r3373, r3368, 484, r3372; +st.shared.u32 [r3373], r1348; +st.shared.u32 [r3373+44], r2136; +st.shared.u32 [r3373+88], r2173; +st.shared.u32 [r3373+132], r2210; +st.shared.u32 [r3373+176], r2247; +st.shared.u32 [r3373+220], r2284; +st.shared.u32 [r3373+264], r2321; +st.shared.u32 [r3373+308], r2358; +st.shared.u32 [r3373+352], r2395; +st.shared.u32 [r3373+396], r2432; +st.shared.u32 [r3373+440], r2469; +barrier.sync 0; +ld.shared.u32 r2558, [r3367]; +ld.shared.u32 r2498, [r3367+484]; +ld.shared.u32 r2510, [r3367+968]; +ld.shared.u32 r2522, [r3367+1452]; +ld.shared.u32 r2534, [r3367+1936]; +ld.shared.u32 r2546, [r3367+2420]; +ld.shared.u32 r2547, [r3367+2904]; +ld.shared.u32 r2535, [r3367+3388]; +ld.shared.u32 r2523, [r3367+3872]; +ld.shared.u32 r2511, [r3367+4356]; +ld.shared.u32 r2499, [r3367+4840]; +barrier.sync 0; +st.shared.u32 [r3373], r1351; +st.shared.u32 [r3373+44], r2143; +st.shared.u32 [r3373+88], r2180; +st.shared.u32 [r3373+132], r2217; +st.shared.u32 [r3373+176], r2254; +st.shared.u32 [r3373+220], r2291; +st.shared.u32 [r3373+264], r2328; +st.shared.u32 [r3373+308], r2365; +st.shared.u32 [r3373+352], r2402; +st.shared.u32 [r3373+396], r2439; +st.shared.u32 [r3373+440], r2476; +barrier.sync 0; +ld.shared.u32 r2561, [r3367]; +ld.shared.u32 r2501, [r3367+484]; 
+ld.shared.u32 r2513, [r3367+968]; +ld.shared.u32 r2525, [r3367+1452]; +ld.shared.u32 r2537, [r3367+1936]; +ld.shared.u32 r2549, [r3367+2420]; +ld.shared.u32 r2550, [r3367+2904]; +ld.shared.u32 r2538, [r3367+3388]; +ld.shared.u32 r2526, [r3367+3872]; +ld.shared.u32 r2514, [r3367+4356]; +ld.shared.u32 r2502, [r3367+4840]; +{ +add.f16x2 r2497, r2498, r2499; +} +{ +add.f16x2 r2500, r2501, r2502; +} +{ +sub.f16x2 r2503, r2498, r2499; +} +{ +sub.f16x2 r2506, r2501, r2502; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2514; +} +{ +sub.f16x2 r2515, r2510, r2511; +} +{ +sub.f16x2 r2518, r2513, r2514; +} +{ +add.f16x2 r2521, r2522, r2523; +} +{ +add.f16x2 r2524, r2525, r2526; +} +{ +sub.f16x2 r2527, r2522, r2523; +} +{ +sub.f16x2 r2530, r2525, r2526; +} +{ +add.f16x2 r2533, r2534, r2535; +} +{ +add.f16x2 r2536, r2537, r2538; +} +{ +sub.f16x2 r2539, r2534, r2535; +} +{ +sub.f16x2 r2542, r2537, r2538; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2497; +} +{ +add.f16x2 r2560, r2561, r2500; +} +{ +add.f16x2 r2563, r2557, r2509; +} +{ +add.f16x2 r2566, r2560, r2512; +} +{ +add.f16x2 r2569, r2563, r2521; +} +{ +add.f16x2 r2572, r2566, r2524; +} +{ +add.f16x2 r2575, r2569, r2533; +} +{ +add.f16x2 r2578, r2572, r2536; +} +{ +add.f16x2 %0, r2575, r2545; +} +{ +add.f16x2 %1, r2578, r2548; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2589, {low, high}; +} +{ +mul.f16x2 r2590, r2497, r2589; +} +{ +add.f16x2 r2593, r2558, r2590; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2596, {low, high}; +} +{ +mul.f16x2 
r2597, r2506, r2596; +} +{ +add.f16x2 r2600, r2587, r2597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2603, {low, high}; +} +{ +mul.f16x2 r2604, r2500, r2603; +} +{ +add.f16x2 r2607, r2561, r2604; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2610, {low, high}; +} +{ +mul.f16x2 r2611, r2503, r2610; +} +{ +add.f16x2 r2614, r2588, r2611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2617, {low, high}; +} +{ +mul.f16x2 r2618, r2509, r2617; +} +{ +add.f16x2 r2621, r2593, r2618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2624, {low, high}; +} +{ +mul.f16x2 r2625, r2518, r2624; +} +{ +add.f16x2 r2628, r2600, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2631, {low, high}; +} +{ +mul.f16x2 r2632, r2512, r2631; +} +{ +add.f16x2 r2635, r2607, r2632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2638, {low, high}; +} +{ +mul.f16x2 r2639, r2515, r2638; +} +{ +add.f16x2 r2642, r2614, r2639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2645, {low, high}; +} +{ +mul.f16x2 r2646, r2521, r2645; +} +{ +add.f16x2 r2649, r2621, r2646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2530, r2652; +} +{ +add.f16x2 r2656, r2628, r2653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2659, {low, high}; +} +{ +mul.f16x2 r2660, r2524, r2659; +} +{ +add.f16x2 r2663, r2635, r2660; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2666, {low, high}; +} +{ +mul.f16x2 r2667, r2527, r2666; +} +{ +add.f16x2 r2670, r2642, r2667; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; 
+cvt.rn.f16.f32 high, f205; +mov.b32 r2673, {low, high}; +} +{ +mul.f16x2 r2674, r2533, r2673; +} +{ +add.f16x2 r2677, r2649, r2674; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2680, {low, high}; +} +{ +mul.f16x2 r2681, r2542, r2680; +} +{ +add.f16x2 r2684, r2656, r2681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2687, {low, high}; +} +{ +mul.f16x2 r2688, r2536, r2687; +} +{ +add.f16x2 r2691, r2663, r2688; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2694, {low, high}; +} +{ +mul.f16x2 r2695, r2539, r2694; +} +{ +add.f16x2 r2698, r2670, r2695; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2701, {low, high}; +} +{ +mul.f16x2 r2702, r2545, r2701; +} +{ +add.f16x2 r2705, r2677, r2702; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2708, {low, high}; +} +{ +mul.f16x2 r2709, r2554, r2708; +} +{ +add.f16x2 r2712, r2684, r2709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2715, {low, high}; +} +{ +mul.f16x2 r2716, r2548, r2715; +} +{ +add.f16x2 r2719, r2691, r2716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2722, {low, high}; +} +{ +mul.f16x2 r2723, r2551, r2722; +} +{ +add.f16x2 r2726, r2698, r2723; +} +{ +sub.f16x2 %2, r2705, r2712; +} +{ +add.f16x2 %3, r2719, r2726; +} +{ +add.f16x2 %20, r2705, r2712; +} +{ +sub.f16x2 %21, r2719, r2726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2743, {low, high}; +} +{ +mul.f16x2 r2744, r2497, r2743; +} +{ +add.f16x2 r2747, r2558, 
r2744; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2750, {low, high}; +} +{ +mul.f16x2 r2751, r2506, r2750; +} +{ +add.f16x2 r2754, r2741, r2751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2757, {low, high}; +} +{ +mul.f16x2 r2758, r2500, r2757; +} +{ +add.f16x2 r2761, r2561, r2758; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2764, {low, high}; +} +{ +mul.f16x2 r2765, r2503, r2764; +} +{ +add.f16x2 r2768, r2742, r2765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2771, {low, high}; +} +{ +mul.f16x2 r2772, r2509, r2771; +} +{ +add.f16x2 r2775, r2747, r2772; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2778, {low, high}; +} +{ +mul.f16x2 r2779, r2518, r2778; +} +{ +add.f16x2 r2782, r2754, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2785, {low, high}; +} +{ +mul.f16x2 r2786, r2512, r2785; +} +{ +add.f16x2 r2789, r2761, r2786; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2792, {low, high}; +} +{ +mul.f16x2 r2793, r2515, r2792; +} +{ +add.f16x2 r2796, r2768, r2793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2799, {low, high}; +} +{ +mul.f16x2 r2800, r2521, r2799; +} +{ +add.f16x2 r2803, r2775, r2800; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2806, {low, high}; +} +{ +mul.f16x2 r2807, r2530, r2806; +} +{ +add.f16x2 r2810, r2782, r2807; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2524, r2813; +} +{ +add.f16x2 r2817, r2789, r2814; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2820, {low, high}; +} 
+{ +mul.f16x2 r2821, r2527, r2820; +} +{ +add.f16x2 r2824, r2796, r2821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2827, {low, high}; +} +{ +mul.f16x2 r2828, r2533, r2827; +} +{ +add.f16x2 r2831, r2803, r2828; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2834, {low, high}; +} +{ +mul.f16x2 r2835, r2542, r2834; +} +{ +add.f16x2 r2838, r2810, r2835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2841, {low, high}; +} +{ +mul.f16x2 r2842, r2536, r2841; +} +{ +add.f16x2 r2845, r2817, r2842; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2848, {low, high}; +} +{ +mul.f16x2 r2849, r2539, r2848; +} +{ +add.f16x2 r2852, r2824, r2849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2855, {low, high}; +} +{ +mul.f16x2 r2856, r2545, r2855; +} +{ +add.f16x2 r2859, r2831, r2856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2862, {low, high}; +} +{ +mul.f16x2 r2863, r2554, r2862; +} +{ +add.f16x2 r2866, r2838, r2863; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2869, {low, high}; +} +{ +mul.f16x2 r2870, r2548, r2869; +} +{ +add.f16x2 r2873, r2845, r2870; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2876, {low, high}; +} +{ +mul.f16x2 r2877, r2551, r2876; +} +{ +add.f16x2 r2880, r2852, r2877; +} +{ +sub.f16x2 %4, r2859, r2866; +} +{ +add.f16x2 %5, r2873, r2880; +} +{ +add.f16x2 %18, r2859, r2866; +} +{ +sub.f16x2 %19, r2873, r2880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; 
+cvt.rn.f16.f32 high, f221; +mov.b32 r2897, {low, high}; +} +{ +mul.f16x2 r2898, r2497, r2897; +} +{ +add.f16x2 r2901, r2558, r2898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2904, {low, high}; +} +{ +mul.f16x2 r2905, r2506, r2904; +} +{ +add.f16x2 r2908, r2895, r2905; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2911, {low, high}; +} +{ +mul.f16x2 r2912, r2500, r2911; +} +{ +add.f16x2 r2915, r2561, r2912; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2918, {low, high}; +} +{ +mul.f16x2 r2919, r2503, r2918; +} +{ +add.f16x2 r2922, r2896, r2919; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2925, {low, high}; +} +{ +mul.f16x2 r2926, r2509, r2925; +} +{ +add.f16x2 r2929, r2901, r2926; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2932, {low, high}; +} +{ +mul.f16x2 r2933, r2518, r2932; +} +{ +add.f16x2 r2936, r2908, r2933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2939, {low, high}; +} +{ +mul.f16x2 r2940, r2512, r2939; +} +{ +add.f16x2 r2943, r2915, r2940; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2946, {low, high}; +} +{ +mul.f16x2 r2947, r2515, r2946; +} +{ +add.f16x2 r2950, r2922, r2947; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2953, {low, high}; +} +{ +mul.f16x2 r2954, r2521, r2953; +} +{ +add.f16x2 r2957, r2929, r2954; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2960, {low, high}; +} +{ +mul.f16x2 r2961, r2530, r2960; +} +{ +add.f16x2 r2964, r2936, r2961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2967, {low, high}; +} +{ +mul.f16x2 r2968, r2524, r2967; +} +{ +add.f16x2 r2971, 
r2943, r2968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2974, {low, high}; +} +{ +mul.f16x2 r2975, r2527, r2974; +} +{ +add.f16x2 r2978, r2950, r2975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2981, {low, high}; +} +{ +mul.f16x2 r2982, r2533, r2981; +} +{ +add.f16x2 r2985, r2957, r2982; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2988, {low, high}; +} +{ +mul.f16x2 r2989, r2542, r2988; +} +{ +add.f16x2 r2992, r2964, r2989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2995, {low, high}; +} +{ +mul.f16x2 r2996, r2536, r2995; +} +{ +add.f16x2 r2999, r2971, r2996; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3002, {low, high}; +} +{ +mul.f16x2 r3003, r2539, r3002; +} +{ +add.f16x2 r3006, r2978, r3003; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r2545, r3009; +} +{ +add.f16x2 r3013, r2985, r3010; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3016, {low, high}; +} +{ +mul.f16x2 r3017, r2554, r3016; +} +{ +add.f16x2 r3020, r2992, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3023, {low, high}; +} +{ +mul.f16x2 r3024, r2548, r3023; +} +{ +add.f16x2 r3027, r2999, r3024; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3030, {low, high}; +} +{ +mul.f16x2 r3031, r2551, r3030; +} +{ +add.f16x2 r3034, r3006, r3031; +} +{ +sub.f16x2 %6, r3013, r3020; +} +{ +add.f16x2 %7, r3027, r3034; +} +{ +add.f16x2 %16, r3013, r3020; +} +{ +sub.f16x2 %17, r3027, r3034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3049, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3051, {low, high}; +} +{ +mul.f16x2 r3052, r2497, r3051; +} +{ +add.f16x2 r3055, r2558, r3052; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r2506, r3058; +} +{ +add.f16x2 r3062, r3049, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3065, {low, high}; +} +{ +mul.f16x2 r3066, r2500, r3065; +} +{ +add.f16x2 r3069, r2561, r3066; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3072, {low, high}; +} +{ +mul.f16x2 r3073, r2503, r3072; +} +{ +add.f16x2 r3076, r3050, r3073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3079, {low, high}; +} +{ +mul.f16x2 r3080, r2509, r3079; +} +{ +add.f16x2 r3083, r3055, r3080; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3086, {low, high}; +} +{ +mul.f16x2 r3087, r2518, r3086; +} +{ +add.f16x2 r3090, r3062, r3087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3093, {low, high}; +} +{ +mul.f16x2 r3094, r2512, r3093; +} +{ +add.f16x2 r3097, r3069, r3094; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r2515, r3100; +} +{ +add.f16x2 r3104, r3076, r3101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3107, {low, high}; +} +{ +mul.f16x2 r3108, r2521, r3107; +} +{ +add.f16x2 r3111, r3083, r3108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3114, {low, high}; +} +{ +mul.f16x2 r3115, r2530, r3114; +} +{ +add.f16x2 r3118, r3090, r3115; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3121, {low, high}; +} +{ +mul.f16x2 r3122, r2524, r3121; +} +{ +add.f16x2 r3125, r3097, r3122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3128, {low, high}; +} +{ +mul.f16x2 r3129, r2527, r3128; +} +{ +add.f16x2 r3132, r3104, r3129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3135, {low, high}; +} +{ +mul.f16x2 r3136, r2533, r3135; +} +{ +add.f16x2 r3139, r3111, r3136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3142, {low, high}; +} +{ +mul.f16x2 r3143, r2542, r3142; +} +{ +add.f16x2 r3146, r3118, r3143; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3149, {low, high}; +} +{ +mul.f16x2 r3150, r2536, r3149; +} +{ +add.f16x2 r3153, r3125, r3150; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3156, {low, high}; +} +{ +mul.f16x2 r3157, r2539, r3156; +} +{ +add.f16x2 r3160, r3132, r3157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3163, {low, high}; +} +{ +mul.f16x2 r3164, r2545, r3163; +} +{ +add.f16x2 r3167, r3139, r3164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3170, {low, high}; +} +{ +mul.f16x2 r3171, r2554, r3170; +} +{ +add.f16x2 r3174, r3146, r3171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r2548, r3177; +} +{ +add.f16x2 r3181, r3153, r3178; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3184, {low, high}; +} +{ +mul.f16x2 r3185, r2551, r3184; +} +{ +add.f16x2 r3188, r3160, r3185; +} +{ +sub.f16x2 %8, r3167, r3174; +} +{ +add.f16x2 %9, r3181, r3188; +} +{ +add.f16x2 %14, r3167, r3174; +} +{ +sub.f16x2 %15, r3181, r3188; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3204, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3205, {low, high}; +} +{ +mul.f16x2 r3206, r2497, r3205; +} +{ +add.f16x2 r3209, r2558, r3206; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3212, {low, high}; +} +{ +mul.f16x2 r3213, r2506, r3212; +} +{ +add.f16x2 r3216, r3203, r3213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3219, {low, high}; +} +{ +mul.f16x2 r3220, r2500, r3219; +} +{ +add.f16x2 r3223, r2561, r3220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3226, {low, high}; +} +{ +mul.f16x2 r3227, r2503, r3226; +} +{ +add.f16x2 r3230, r3204, r3227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3233, {low, high}; +} +{ +mul.f16x2 r3234, r2509, r3233; +} +{ +add.f16x2 r3237, r3209, r3234; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3240, {low, high}; +} +{ +mul.f16x2 r3241, r2518, r3240; +} +{ +add.f16x2 r3244, r3216, r3241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3247, {low, high}; +} +{ +mul.f16x2 r3248, r2512, r3247; +} +{ +add.f16x2 r3251, r3223, r3248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3254, {low, high}; +} +{ +mul.f16x2 r3255, r2515, r3254; +} +{ +add.f16x2 r3258, r3230, r3255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3261, {low, high}; +} +{ +mul.f16x2 r3262, r2521, r3261; +} +{ +add.f16x2 r3265, r3237, r3262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; 
+mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r2530, r3268; +} +{ +add.f16x2 r3272, r3244, r3269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3275, {low, high}; +} +{ +mul.f16x2 r3276, r2524, r3275; +} +{ +add.f16x2 r3279, r3251, r3276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3282, {low, high}; +} +{ +mul.f16x2 r3283, r2527, r3282; +} +{ +add.f16x2 r3286, r3258, r3283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3289, {low, high}; +} +{ +mul.f16x2 r3290, r2533, r3289; +} +{ +add.f16x2 r3293, r3265, r3290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3296, {low, high}; +} +{ +mul.f16x2 r3297, r2542, r3296; +} +{ +add.f16x2 r3300, r3272, r3297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3303, {low, high}; +} +{ +mul.f16x2 r3304, r2536, r3303; +} +{ +add.f16x2 r3307, r3279, r3304; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3310, {low, high}; +} +{ +mul.f16x2 r3311, r2539, r3310; +} +{ +add.f16x2 r3314, r3286, r3311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r2545, r3317; +} +{ +add.f16x2 r3321, r3293, r3318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3324, {low, high}; +} +{ +mul.f16x2 r3325, r2554, r3324; +} +{ +add.f16x2 r3328, r3300, r3325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3331, {low, high}; +} +{ +mul.f16x2 r3332, r2548, r3331; +} +{ +add.f16x2 r3335, r3307, r3332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3338, {low, high}; +} +{ +mul.f16x2 r3339, r2551, r3338; +} +{ +add.f16x2 r3342, r3314, r3339; +} +{ 
+sub.f16x2 %10, r3321, r3328; +} +{ +add.f16x2 %11, r3335, r3342; +} +{ +add.f16x2 %12, r3321, r3328; +} +{ +sub.f16x2 %13, r3335, r3342; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ec6dc9121c361 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp16_inv.hpp.inc @@ -0,0 +1,10883 @@ +#ifndef CUFFTDX_FFT_1331_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_1331_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1130, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<718>; +.reg .b32 r<3374>; +.reg .b64 rd<8>; +{ +add.f16x2 r25, %25, %43; +} +{ +add.f16x2 r28, %26, %44; +} +{ +sub.f16x2 r31, %25, %43; +} +{ +sub.f16x2 r34, %26, %44; +} +{ +add.f16x2 r37, %27, %41; +} +{ +add.f16x2 r40, %28, %42; +} +{ +sub.f16x2 r43, %27, %41; +} +{ +sub.f16x2 r46, %28, %42; +} +{ +add.f16x2 r49, %29, %39; +} +{ +add.f16x2 r52, %30, %40; +} +{ +sub.f16x2 r55, %29, %39; +} +{ +sub.f16x2 r58, %30, %40; +} +{ +add.f16x2 r61, %31, %37; +} +{ +add.f16x2 r64, %32, %38; +} +{ +sub.f16x2 r67, %31, %37; +} +{ +sub.f16x2 r70, %32, %38; +} +{ +add.f16x2 r73, %33, %35; +} +{ +add.f16x2 r76, %34, %36; +} +{ +sub.f16x2 r79, %33, %35; +} +{ +sub.f16x2 r82, %34, %36; +} +{ +add.f16x2 r85, %23, r25; +} +{ +add.f16x2 r88, %24, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} +{ +add.f16x2 r106, r100, r64; +} +{ +add.f16x2 r109, r103, r73; +} +{ +add.f16x2 r112, r106, r76; +} +mov.f32 f183, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r116, {low, high}; +} +mov.f32 f197, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r117, {low, high}; +} +{ +mul.f16x2 r118, r25, r117; +} +{ +add.f16x2 r121, %23, r118; +} +mov.f32 f163, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 
high, f163; +mov.b32 r124, {low, high}; +} +{ +mul.f16x2 r125, r34, r124; +} +{ +add.f16x2 r128, r115, r125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r131, {low, high}; +} +{ +mul.f16x2 r132, r28, r131; +} +{ +add.f16x2 r135, %24, r132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r138, {low, high}; +} +{ +mul.f16x2 r139, r31, r138; +} +{ +add.f16x2 r142, r116, r139; +} +mov.f32 f213, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r145, {low, high}; +} +{ +mul.f16x2 r146, r37, r145; +} +{ +add.f16x2 r149, r121, r146; +} +mov.f32 f59, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r152, {low, high}; +} +{ +mul.f16x2 r153, r46, r152; +} +{ +add.f16x2 r156, r128, r153; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r159, {low, high}; +} +{ +mul.f16x2 r160, r40, r159; +} +{ +add.f16x2 r163, r135, r160; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r166, {low, high}; +} +{ +mul.f16x2 r167, r43, r166; +} +{ +add.f16x2 r170, r142, r167; +} +mov.f32 f221, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r173, {low, high}; +} +{ +mul.f16x2 r174, r49, r173; +} +{ +add.f16x2 r177, r149, r174; +} +mov.f32 f223, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r180, {low, high}; +} +{ +mul.f16x2 r181, r58, r180; +} +{ +add.f16x2 r184, r156, r181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r187, {low, high}; +} +{ +mul.f16x2 r188, r52, r187; +} +{ +add.f16x2 r191, r163, r188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r194, {low, high}; +} +{ +mul.f16x2 r195, r55, r194; +} +{ +add.f16x2 r198, 
r170, r195; +} +mov.f32 f205, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r201, {low, high}; +} +{ +mul.f16x2 r202, r61, r201; +} +{ +add.f16x2 r205, r177, r202; +} +mov.f32 f207, 0f3F4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r208, {low, high}; +} +{ +mul.f16x2 r209, r70, r208; +} +{ +add.f16x2 r212, r184, r209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r215, {low, high}; +} +{ +mul.f16x2 r216, r64, r215; +} +{ +add.f16x2 r219, r191, r216; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r222, {low, high}; +} +{ +mul.f16x2 r223, r67, r222; +} +{ +add.f16x2 r226, r198, r223; +} +mov.f32 f189, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r229, {low, high}; +} +{ +mul.f16x2 r230, r73, r229; +} +{ +add.f16x2 r233, r205, r230; +} +mov.f32 f191, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r236, {low, high}; +} +{ +mul.f16x2 r237, r82, r236; +} +{ +add.f16x2 r240, r212, r237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r243, {low, high}; +} +{ +mul.f16x2 r244, r76, r243; +} +{ +add.f16x2 r247, r219, r244; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r250, {low, high}; +} +{ +mul.f16x2 r251, r79, r250; +} +{ +add.f16x2 r254, r226, r251; +} +{ +sub.f16x2 r257, r233, r240; +} +{ +add.f16x2 r260, r247, r254; +} +{ +add.f16x2 r263, r233, r240; +} +{ +sub.f16x2 r266, r247, r254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r270, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, 
f213; +mov.b32 r271, {low, high}; +} +{ +mul.f16x2 r272, r25, r271; +} +{ +add.f16x2 r275, %23, r272; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r34, r278; +} +{ +add.f16x2 r282, r269, r279; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r285, {low, high}; +} +{ +mul.f16x2 r286, r28, r285; +} +{ +add.f16x2 r289, %24, r286; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r292, {low, high}; +} +{ +mul.f16x2 r293, r31, r292; +} +{ +add.f16x2 r296, r270, r293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r299, {low, high}; +} +{ +mul.f16x2 r300, r37, r299; +} +{ +add.f16x2 r303, r275, r300; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r306, {low, high}; +} +{ +mul.f16x2 r307, r46, r306; +} +{ +add.f16x2 r310, r282, r307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r313, {low, high}; +} +{ +mul.f16x2 r314, r40, r313; +} +{ +add.f16x2 r317, r289, r314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r320, {low, high}; +} +{ +mul.f16x2 r321, r43, r320; +} +{ +add.f16x2 r324, r296, r321; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r327, {low, high}; +} +{ +mul.f16x2 r328, r49, r327; +} +{ +add.f16x2 r331, r303, r328; +} +mov.f32 f111, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r334, {low, high}; +} +{ +mul.f16x2 r335, r58, r334; +} +{ +add.f16x2 r338, r310, r335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r341, {low, high}; +} +{ +mul.f16x2 r342, r52, r341; +} +{ +add.f16x2 r345, r317, r342; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 
high, f111; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r55, r348; +} +{ +add.f16x2 r352, r324, r349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r355, {low, high}; +} +{ +mul.f16x2 r356, r61, r355; +} +{ +add.f16x2 r359, r331, r356; +} +mov.f32 f155, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r362, {low, high}; +} +{ +mul.f16x2 r363, r70, r362; +} +{ +add.f16x2 r366, r338, r363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r369, {low, high}; +} +{ +mul.f16x2 r370, r64, r369; +} +{ +add.f16x2 r373, r345, r370; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r376, {low, high}; +} +{ +mul.f16x2 r377, r67, r376; +} +{ +add.f16x2 r380, r352, r377; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r383, {low, high}; +} +{ +mul.f16x2 r384, r73, r383; +} +{ +add.f16x2 r387, r359, r384; +} +mov.f32 f199, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r390, {low, high}; +} +{ +mul.f16x2 r391, r82, r390; +} +{ +add.f16x2 r394, r366, r391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r397, {low, high}; +} +{ +mul.f16x2 r398, r76, r397; +} +{ +add.f16x2 r401, r373, r398; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r404, {low, high}; +} +{ +mul.f16x2 r405, r79, r404; +} +{ +add.f16x2 r408, r380, r405; +} +{ +sub.f16x2 r411, r387, r394; +} +{ +add.f16x2 r414, r401, r408; +} +{ +add.f16x2 r417, r387, r394; +} +{ +sub.f16x2 r420, r401, r408; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r425, {low, high}; +} +{ +mul.f16x2 r426, r25, r425; +} +{ +add.f16x2 r429, %23, r426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r432, {low, high}; +} +{ +mul.f16x2 r433, r34, r432; +} +{ +add.f16x2 r436, r423, r433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r439, {low, high}; +} +{ +mul.f16x2 r440, r28, r439; +} +{ +add.f16x2 r443, %24, r440; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r31, r446; +} +{ +add.f16x2 r450, r424, r447; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r37, r453; +} +{ +add.f16x2 r457, r429, r454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r460, {low, high}; +} +{ +mul.f16x2 r461, r46, r460; +} +{ +add.f16x2 r464, r436, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r467, {low, high}; +} +{ +mul.f16x2 r468, r40, r467; +} +{ +add.f16x2 r471, r443, r468; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r474, {low, high}; +} +{ +mul.f16x2 r475, r43, r474; +} +{ +add.f16x2 r478, r450, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r481, {low, high}; +} +{ +mul.f16x2 r482, r49, r481; +} +{ +add.f16x2 r485, r457, r482; +} +mov.f32 f215, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r488, {low, high}; +} +{ +mul.f16x2 r489, r58, r488; +} +{ +add.f16x2 r492, r464, r489; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r495, {low, high}; +} +{ +mul.f16x2 r496, r52, r495; +} +{ +add.f16x2 r499, r471, r496; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r502, {low, high}; +} +{ +mul.f16x2 r503, r55, r502; +} +{ +add.f16x2 r506, r478, r503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r509, {low, high}; +} +{ +mul.f16x2 r510, r61, r509; +} +{ +add.f16x2 r513, r485, r510; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r516, {low, high}; +} +{ +mul.f16x2 r517, r70, r516; +} +{ +add.f16x2 r520, r492, r517; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r523, {low, high}; +} +{ +mul.f16x2 r524, r64, r523; +} +{ +add.f16x2 r527, r499, r524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r530, {low, high}; +} +{ +mul.f16x2 r531, r67, r530; +} +{ +add.f16x2 r534, r506, r531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r537, {low, high}; +} +{ +mul.f16x2 r538, r73, r537; +} +{ +add.f16x2 r541, r513, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r544, {low, high}; +} +{ +mul.f16x2 r545, r82, r544; +} +{ +add.f16x2 r548, r520, r545; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r551, {low, high}; +} +{ +mul.f16x2 r552, r76, r551; +} +{ +add.f16x2 r555, r527, r552; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r558, {low, high}; +} +{ +mul.f16x2 r559, r79, r558; +} +{ +add.f16x2 r562, r534, r559; +} +{ +sub.f16x2 r565, r541, r548; +} +{ +add.f16x2 r568, r555, r562; +} +{ +add.f16x2 r571, r541, r548; +} +{ +sub.f16x2 r574, r555, r562; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r578, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r579, {low, high}; +} +{ +mul.f16x2 r580, r25, r579; +} +{ +add.f16x2 r583, %23, r580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r586, {low, high}; +} +{ +mul.f16x2 r587, r34, r586; +} +{ +add.f16x2 r590, r577, r587; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r593, {low, high}; +} +{ +mul.f16x2 r594, r28, r593; +} +{ +add.f16x2 r597, %24, r594; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r31, r600; +} +{ +add.f16x2 r604, r578, r601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r37, r607; +} +{ +add.f16x2 r611, r583, r608; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r615, r46, r614; +} +{ +add.f16x2 r618, r590, r615; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r621, {low, high}; +} +{ +mul.f16x2 r622, r40, r621; +} +{ +add.f16x2 r625, r597, r622; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r628, {low, high}; +} +{ +mul.f16x2 r629, r43, r628; +} +{ +add.f16x2 r632, r604, r629; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r635, {low, high}; +} +{ +mul.f16x2 r636, r49, r635; +} +{ +add.f16x2 r639, r611, r636; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r642, {low, high}; +} +{ +mul.f16x2 r643, r58, r642; +} +{ +add.f16x2 r646, r618, r643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r649, {low, high}; +} +{ +mul.f16x2 r650, r52, r649; +} +{ +add.f16x2 r653, r625, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r656, {low, high}; +} +{ +mul.f16x2 r657, r55, r656; +} +{ +add.f16x2 r660, r632, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r663, {low, high}; +} +{ +mul.f16x2 r664, r61, r663; +} +{ +add.f16x2 r667, r639, r664; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r670, {low, high}; +} +{ +mul.f16x2 r671, r70, r670; +} +{ +add.f16x2 r674, r646, r671; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r677, {low, high}; +} +{ +mul.f16x2 r678, r64, r677; +} +{ +add.f16x2 r681, r653, r678; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r684, {low, high}; +} +{ +mul.f16x2 r685, r67, r684; +} +{ +add.f16x2 r688, r660, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r691, {low, high}; +} +{ +mul.f16x2 r692, r73, r691; +} +{ +add.f16x2 r695, r667, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r82, r698; +} +{ +add.f16x2 r702, r674, r699; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r705, {low, high}; +} +{ +mul.f16x2 r706, r76, r705; +} +{ +add.f16x2 r709, r681, r706; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r712, {low, high}; +} +{ +mul.f16x2 r713, r79, r712; +} +{ +add.f16x2 r716, r688, r713; +} +{ +sub.f16x2 r719, r695, r702; +} +{ +add.f16x2 r722, r709, r716; +} +{ +add.f16x2 r725, r695, r702; +} +{ +sub.f16x2 r728, r709, r716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; 
+cvt.rn.f16.f32 high, f189; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r25, r733; +} +{ +add.f16x2 r737, %23, r734; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r740, {low, high}; +} +{ +mul.f16x2 r741, r34, r740; +} +{ +add.f16x2 r744, r731, r741; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r28, r747; +} +{ +add.f16x2 r751, %24, r748; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r754, {low, high}; +} +{ +mul.f16x2 r755, r31, r754; +} +{ +add.f16x2 r758, r732, r755; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r37, r761; +} +{ +add.f16x2 r765, r737, r762; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r768, {low, high}; +} +{ +mul.f16x2 r769, r46, r768; +} +{ +add.f16x2 r772, r744, r769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r775, {low, high}; +} +{ +mul.f16x2 r776, r40, r775; +} +{ +add.f16x2 r779, r751, r776; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r782, {low, high}; +} +{ +mul.f16x2 r783, r43, r782; +} +{ +add.f16x2 r786, r758, r783; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r789, {low, high}; +} +{ +mul.f16x2 r790, r49, r789; +} +{ +add.f16x2 r793, r765, r790; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r796, {low, high}; +} +{ +mul.f16x2 r797, r58, r796; +} +{ +add.f16x2 r800, r772, r797; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r803, {low, high}; +} +{ +mul.f16x2 r804, r52, r803; +} +{ +add.f16x2 r807, r779, r804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, 
f207; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r55, r810; +} +{ +add.f16x2 r814, r786, r811; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r817, {low, high}; +} +{ +mul.f16x2 r818, r61, r817; +} +{ +add.f16x2 r821, r793, r818; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r824, {low, high}; +} +{ +mul.f16x2 r825, r70, r824; +} +{ +add.f16x2 r828, r800, r825; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r831, {low, high}; +} +{ +mul.f16x2 r832, r64, r831; +} +{ +add.f16x2 r835, r807, r832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r838, {low, high}; +} +{ +mul.f16x2 r839, r67, r838; +} +{ +add.f16x2 r842, r814, r839; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r845, {low, high}; +} +{ +mul.f16x2 r846, r73, r845; +} +{ +add.f16x2 r849, r821, r846; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r852, {low, high}; +} +{ +mul.f16x2 r853, r82, r852; +} +{ +add.f16x2 r856, r828, r853; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r859, {low, high}; +} +{ +mul.f16x2 r860, r76, r859; +} +{ +add.f16x2 r863, r835, r860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r79, r866; +} +{ +add.f16x2 r870, r842, r867; +} +{ +sub.f16x2 r873, r849, r856; +} +{ +add.f16x2 r876, r863, r870; +} +{ +add.f16x2 r879, r849, r856; +} +{ +sub.f16x2 r882, r863, r870; +} +mov.u32 r23, %tid.x; +mul.wide.u32 rd2, r23, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r885, rd3; +sub.s32 r886, r23, r885; +shr.u32 r887, r886, 1; +add.s32 r888, r887, r885; +shr.u32 r889, r888, 6; +mul.lo.s32 r890, r889, 121; +sub.s32 r24, r23, r890; +cvt.rn.f32.u32 f225, r24; +mul.f32 f1, f225, 
0f3B9AAFAF; +setp.eq.s32 p1, r24, 18; +mov.f32 f717, 0f3DADD00E; +@p1 bra LBB0_2; +sin.approx.f32 f717, f1; +LBB0_2: +mov.u32 r3362, %tid.y; +mov.u32 r3363, %22; +mad.lo.s32 r3364, r3362, 10648, r3363; +mad.lo.s32 r3365, r889, 10648, r3364; +neg.f32 f227, f717; +cos.approx.f32 f226, f1; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f227; +mov.b32 r891, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r260, r896; +} +{ +fma.rn.f16x2 r901, r257, r894, r898; +} +{ +mul.f16x2 r905, r257, r896; +} +{ +neg.f16x2 r908, r905; +} +{ +fma.rn.f16x2 r910, r260, r894, r908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r916, {high, high}; +} +mov.f32 f490, 0fBF800000; +mov.f32 f491, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r891, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r414, r933; +} +{ +fma.rn.f16x2 r938, r411, r931, r935; +} +{ +mul.f16x2 r942, r411, r933; +} +{ +neg.f16x2 r945, r942; +} +{ +fma.rn.f16x2 r947, r414, r931, r945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r568, r970; +} +{ +fma.rn.f16x2 r975, r565, r968, r972; +} +{ +mul.f16x2 r979, r565, r970; +} +{ +neg.f16x2 r982, r979; +} +{ +fma.rn.f16x2 r984, r568, r968, r982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r722, r1007; +} +{ +fma.rn.f16x2 r1012, r719, r1005, r1009; +} +{ +mul.f16x2 r1016, r719, r1007; +} +{ +neg.f16x2 r1019, r1016; +} +{ +fma.rn.f16x2 r1021, r722, r1005, r1019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ 
+mul.f16x2 r1046, r876, r1044; +} +{ +fma.rn.f16x2 r1049, r873, r1042, r1046; +} +{ +mul.f16x2 r1053, r873, r1044; +} +{ +neg.f16x2 r1056, r1053; +} +{ +fma.rn.f16x2 r1058, r876, r1042, r1056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1066, {low, high}; +} +{ +mul.f16x2 r1067, r1064, r1066; +} +{ +mul.f16x2 r1070, r1038, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1073, {high, low}; +} +{ +fma.rn.f16x2 r1075, r1067, r1073, r1070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1081, {high, high}; +} +{ +mul.f16x2 r1083, r882, r1081; +} +{ +fma.rn.f16x2 r1086, r879, r1079, r1083; +} +{ +mul.f16x2 r1090, r879, r1081; +} +{ +neg.f16x2 r1093, r1090; +} +{ +fma.rn.f16x2 r1095, r882, r1079, r1093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1103, {low, high}; +} +{ +mul.f16x2 r1104, r1101, r1103; +} +{ +mul.f16x2 r1107, r1075, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1110, {high, low}; +} +{ +fma.rn.f16x2 r1112, r1104, r1110, r1107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r728, r1118; +} +{ +fma.rn.f16x2 r1123, r725, r1116, r1120; +} +{ +mul.f16x2 r1127, r725, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r728, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1136, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1112, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r574, r1155; +} +{ +fma.rn.f16x2 r1160, r571, r1153, r1157; +} +{ +mul.f16x2 r1164, r571, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r574, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r420, r1192; +} +{ +fma.rn.f16x2 r1197, r417, r1190, r1194; +} +{ +mul.f16x2 r1201, r417, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r420, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, 
r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r266, r1229; +} +{ +fma.rn.f16x2 r1234, r263, r1227, r1231; +} +{ +mul.f16x2 r1238, r263, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r266, r1227, r1241; +} +barrier.sync 0; +mad.lo.s32 r3366, r24, 88, r3365; +st.shared.v2.f32 [r3366], {r109, r112}; +st.shared.v2.f32 [r3366+8], {r901, r910}; +st.shared.v2.f32 [r3366+16], {r938, r947}; +st.shared.v2.f32 [r3366+24], {r975, r984}; +st.shared.v2.f32 [r3366+32], {r1012, r1021}; +st.shared.v2.f32 [r3366+40], {r1049, r1058}; +st.shared.v2.f32 [r3366+48], {r1086, r1095}; +st.shared.v2.f32 [r3366+56], {r1123, r1132}; +st.shared.v2.f32 [r3366+64], {r1160, r1169}; +st.shared.v2.f32 [r3366+72], {r1197, r1206}; +st.shared.v2.f32 [r3366+80], {r1234, r1243}; +barrier.sync 0; +mad.lo.s32 r3367, r24, -80, r3366; +ld.shared.u32 r1325, [r3367]; +ld.shared.u32 r1328, [r3367+4]; +ld.shared.u32 r1265, [r3367+968]; +ld.shared.u32 r1268, [r3367+972]; +ld.shared.u32 r1277, [r3367+1936]; +ld.shared.u32 r1280, [r3367+1940]; +ld.shared.u32 r1289, [r3367+2904]; +ld.shared.u32 r1292, [r3367+2908]; +ld.shared.u32 r1301, [r3367+3872]; +ld.shared.u32 r1304, [r3367+3876]; +ld.shared.u32 r1313, [r3367+4840]; +ld.shared.u32 r1316, [r3367+4844]; +ld.shared.u32 r1314, [r3367+5808]; +ld.shared.u32 r1317, [r3367+5812]; +ld.shared.u32 r1302, [r3367+6776]; +ld.shared.u32 r1305, [r3367+6780]; +ld.shared.u32 r1290, [r3367+7744]; +ld.shared.u32 r1293, [r3367+7748]; +ld.shared.u32 r1278, [r3367+8712]; +ld.shared.u32 r1281, [r3367+8716]; +ld.shared.u32 r1266, [r3367+9680]; +ld.shared.u32 r1269, [r3367+9684]; +{ +add.f16x2 r1264, r1265, r1266; +} +{ +add.f16x2 r1267, r1268, r1269; +} +{ +sub.f16x2 r1270, r1265, 
r1266; +} +{ +sub.f16x2 r1273, r1268, r1269; +} +{ +add.f16x2 r1276, r1277, r1278; +} +{ +add.f16x2 r1279, r1280, r1281; +} +{ +sub.f16x2 r1282, r1277, r1278; +} +{ +sub.f16x2 r1285, r1280, r1281; +} +{ +add.f16x2 r1288, r1289, r1290; +} +{ +add.f16x2 r1291, r1292, r1293; +} +{ +sub.f16x2 r1294, r1289, r1290; +} +{ +sub.f16x2 r1297, r1292, r1293; +} +{ +add.f16x2 r1300, r1301, r1302; +} +{ +add.f16x2 r1303, r1304, r1305; +} +{ +sub.f16x2 r1306, r1301, r1302; +} +{ +sub.f16x2 r1309, r1304, r1305; +} +{ +add.f16x2 r1312, r1313, r1314; +} +{ +add.f16x2 r1315, r1316, r1317; +} +{ +sub.f16x2 r1318, r1313, r1314; +} +{ +sub.f16x2 r1321, r1316, r1317; +} +{ +add.f16x2 r1324, r1325, r1264; +} +{ +add.f16x2 r1327, r1328, r1267; +} +{ +add.f16x2 r1330, r1324, r1276; +} +{ +add.f16x2 r1333, r1327, r1279; +} +{ +add.f16x2 r1336, r1330, r1288; +} +{ +add.f16x2 r1339, r1333, r1291; +} +{ +add.f16x2 r1342, r1336, r1300; +} +{ +add.f16x2 r1345, r1339, r1303; +} +{ +add.f16x2 r1348, r1342, r1312; +} +{ +add.f16x2 r1351, r1345, r1315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1356, {low, high}; +} +{ +mul.f16x2 r1357, r1264, r1356; +} +{ +add.f16x2 r1360, r1325, r1357; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1363, {low, high}; +} +{ +mul.f16x2 r1364, r1273, r1363; +} +{ +add.f16x2 r1367, r1354, r1364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1267, r1370; +} +{ +add.f16x2 r1374, r1328, r1371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1377, {low, high}; +} +{ +mul.f16x2 r1378, r1270, r1377; +} +{ +add.f16x2 r1381, 
r1355, r1378; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1384, {low, high}; +} +{ +mul.f16x2 r1385, r1276, r1384; +} +{ +add.f16x2 r1388, r1360, r1385; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1391, {low, high}; +} +{ +mul.f16x2 r1392, r1285, r1391; +} +{ +add.f16x2 r1395, r1367, r1392; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1398, {low, high}; +} +{ +mul.f16x2 r1399, r1279, r1398; +} +{ +add.f16x2 r1402, r1374, r1399; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1405, {low, high}; +} +{ +mul.f16x2 r1406, r1282, r1405; +} +{ +add.f16x2 r1409, r1381, r1406; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1412, {low, high}; +} +{ +mul.f16x2 r1413, r1288, r1412; +} +{ +add.f16x2 r1416, r1388, r1413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1419, {low, high}; +} +{ +mul.f16x2 r1420, r1297, r1419; +} +{ +add.f16x2 r1423, r1395, r1420; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1426, {low, high}; +} +{ +mul.f16x2 r1427, r1291, r1426; +} +{ +add.f16x2 r1430, r1402, r1427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1433, {low, high}; +} +{ +mul.f16x2 r1434, r1294, r1433; +} +{ +add.f16x2 r1437, r1409, r1434; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1440, {low, high}; +} +{ +mul.f16x2 r1441, r1300, r1440; +} +{ +add.f16x2 r1444, r1416, r1441; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1447, {low, high}; +} +{ +mul.f16x2 r1448, r1309, r1447; +} +{ +add.f16x2 r1451, r1423, r1448; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1454, {low, 
high}; +} +{ +mul.f16x2 r1455, r1303, r1454; +} +{ +add.f16x2 r1458, r1430, r1455; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1461, {low, high}; +} +{ +mul.f16x2 r1462, r1306, r1461; +} +{ +add.f16x2 r1465, r1437, r1462; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1468, {low, high}; +} +{ +mul.f16x2 r1469, r1312, r1468; +} +{ +add.f16x2 r1472, r1444, r1469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1475, {low, high}; +} +{ +mul.f16x2 r1476, r1321, r1475; +} +{ +add.f16x2 r1479, r1451, r1476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1482, {low, high}; +} +{ +mul.f16x2 r1483, r1315, r1482; +} +{ +add.f16x2 r1486, r1458, r1483; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1489, {low, high}; +} +{ +mul.f16x2 r1490, r1318, r1489; +} +{ +add.f16x2 r1493, r1465, r1490; +} +{ +sub.f16x2 r1496, r1472, r1479; +} +{ +add.f16x2 r1499, r1486, r1493; +} +{ +add.f16x2 r1502, r1472, r1479; +} +{ +sub.f16x2 r1505, r1486, r1493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1510, {low, high}; +} +{ +mul.f16x2 r1511, r1264, r1510; +} +{ +add.f16x2 r1514, r1325, r1511; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1517, {low, high}; +} +{ +mul.f16x2 r1518, r1273, r1517; +} +{ +add.f16x2 r1521, r1508, r1518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1524, {low, high}; +} +{ +mul.f16x2 r1525, r1267, r1524; +} +{ +add.f16x2 r1528, r1328, r1525; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1531, {low, high}; +} +{ +mul.f16x2 r1532, r1270, r1531; +} +{ +add.f16x2 r1535, r1509, r1532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1538, {low, high}; +} +{ +mul.f16x2 r1539, r1276, r1538; +} +{ +add.f16x2 r1542, r1514, r1539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1545, {low, high}; +} +{ +mul.f16x2 r1546, r1285, r1545; +} +{ +add.f16x2 r1549, r1521, r1546; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1552, {low, high}; +} +{ +mul.f16x2 r1553, r1279, r1552; +} +{ +add.f16x2 r1556, r1528, r1553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1559, {low, high}; +} +{ +mul.f16x2 r1560, r1282, r1559; +} +{ +add.f16x2 r1563, r1535, r1560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1566, {low, high}; +} +{ +mul.f16x2 r1567, r1288, r1566; +} +{ +add.f16x2 r1570, r1542, r1567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1573, {low, high}; +} +{ +mul.f16x2 r1574, r1297, r1573; +} +{ +add.f16x2 r1577, r1549, r1574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1291, r1580; +} +{ +add.f16x2 r1584, r1556, r1581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1587, {low, high}; +} +{ +mul.f16x2 r1588, r1294, r1587; +} +{ +add.f16x2 r1591, r1563, r1588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1594, {low, high}; +} +{ +mul.f16x2 r1595, r1300, r1594; +} +{ +add.f16x2 r1598, r1570, r1595; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1601, {low, high}; +} +{ +mul.f16x2 r1602, r1309, r1601; 
+} +{ +add.f16x2 r1605, r1577, r1602; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1608, {low, high}; +} +{ +mul.f16x2 r1609, r1303, r1608; +} +{ +add.f16x2 r1612, r1584, r1609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1615, {low, high}; +} +{ +mul.f16x2 r1616, r1306, r1615; +} +{ +add.f16x2 r1619, r1591, r1616; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1622, {low, high}; +} +{ +mul.f16x2 r1623, r1312, r1622; +} +{ +add.f16x2 r1626, r1598, r1623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1629, {low, high}; +} +{ +mul.f16x2 r1630, r1321, r1629; +} +{ +add.f16x2 r1633, r1605, r1630; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1636, {low, high}; +} +{ +mul.f16x2 r1637, r1315, r1636; +} +{ +add.f16x2 r1640, r1612, r1637; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1643, {low, high}; +} +{ +mul.f16x2 r1644, r1318, r1643; +} +{ +add.f16x2 r1647, r1619, r1644; +} +{ +sub.f16x2 r1650, r1626, r1633; +} +{ +add.f16x2 r1653, r1640, r1647; +} +{ +add.f16x2 r1656, r1626, r1633; +} +{ +sub.f16x2 r1659, r1640, r1647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1662, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1665, r1264, r1664; +} +{ +add.f16x2 r1668, r1325, r1665; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1671, {low, high}; +} +{ +mul.f16x2 r1672, r1273, r1671; +} +{ +add.f16x2 r1675, r1662, r1672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, 
f221; +mov.b32 r1678, {low, high}; +} +{ +mul.f16x2 r1679, r1267, r1678; +} +{ +add.f16x2 r1682, r1328, r1679; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1685, {low, high}; +} +{ +mul.f16x2 r1686, r1270, r1685; +} +{ +add.f16x2 r1689, r1663, r1686; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1692, {low, high}; +} +{ +mul.f16x2 r1693, r1276, r1692; +} +{ +add.f16x2 r1696, r1668, r1693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1699, {low, high}; +} +{ +mul.f16x2 r1700, r1285, r1699; +} +{ +add.f16x2 r1703, r1675, r1700; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1706, {low, high}; +} +{ +mul.f16x2 r1707, r1279, r1706; +} +{ +add.f16x2 r1710, r1682, r1707; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1713, {low, high}; +} +{ +mul.f16x2 r1714, r1282, r1713; +} +{ +add.f16x2 r1717, r1689, r1714; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1720, {low, high}; +} +{ +mul.f16x2 r1721, r1288, r1720; +} +{ +add.f16x2 r1724, r1696, r1721; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1297, r1727; +} +{ +add.f16x2 r1731, r1703, r1728; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1734, {low, high}; +} +{ +mul.f16x2 r1735, r1291, r1734; +} +{ +add.f16x2 r1738, r1710, r1735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1294, r1741; +} +{ +add.f16x2 r1745, r1717, r1742; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1748, {low, high}; +} +{ +mul.f16x2 r1749, r1300, r1748; +} +{ +add.f16x2 r1752, r1724, r1749; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1755, {low, high}; +} +{ +mul.f16x2 r1756, r1309, r1755; +} +{ +add.f16x2 r1759, r1731, r1756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1762, {low, high}; +} +{ +mul.f16x2 r1763, r1303, r1762; +} +{ +add.f16x2 r1766, r1738, r1763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 r1770, r1306, r1769; +} +{ +add.f16x2 r1773, r1745, r1770; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1776, {low, high}; +} +{ +mul.f16x2 r1777, r1312, r1776; +} +{ +add.f16x2 r1780, r1752, r1777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1321, r1783; +} +{ +add.f16x2 r1787, r1759, r1784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1790, {low, high}; +} +{ +mul.f16x2 r1791, r1315, r1790; +} +{ +add.f16x2 r1794, r1766, r1791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1318, r1797; +} +{ +add.f16x2 r1801, r1773, r1798; +} +{ +sub.f16x2 r1804, r1780, r1787; +} +{ +add.f16x2 r1807, r1794, r1801; +} +{ +add.f16x2 r1810, r1780, r1787; +} +{ +sub.f16x2 r1813, r1794, r1801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1817, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1264, r1818; +} +{ +add.f16x2 r1822, r1325, r1819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1825, {low, high}; +} +{ 
+mul.f16x2 r1826, r1273, r1825; +} +{ +add.f16x2 r1829, r1816, r1826; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1267, r1832; +} +{ +add.f16x2 r1836, r1328, r1833; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1839, {low, high}; +} +{ +mul.f16x2 r1840, r1270, r1839; +} +{ +add.f16x2 r1843, r1817, r1840; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1846, {low, high}; +} +{ +mul.f16x2 r1847, r1276, r1846; +} +{ +add.f16x2 r1850, r1822, r1847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1853, {low, high}; +} +{ +mul.f16x2 r1854, r1285, r1853; +} +{ +add.f16x2 r1857, r1829, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1860, {low, high}; +} +{ +mul.f16x2 r1861, r1279, r1860; +} +{ +add.f16x2 r1864, r1836, r1861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1867, {low, high}; +} +{ +mul.f16x2 r1868, r1282, r1867; +} +{ +add.f16x2 r1871, r1843, r1868; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1874, {low, high}; +} +{ +mul.f16x2 r1875, r1288, r1874; +} +{ +add.f16x2 r1878, r1850, r1875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1881, {low, high}; +} +{ +mul.f16x2 r1882, r1297, r1881; +} +{ +add.f16x2 r1885, r1857, r1882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1291, r1888; +} +{ +add.f16x2 r1892, r1864, r1889; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1895, {low, high}; +} +{ +mul.f16x2 r1896, r1294, r1895; +} +{ +add.f16x2 r1899, r1871, r1896; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1902, {low, high}; +} +{ +mul.f16x2 r1903, r1300, r1902; +} +{ +add.f16x2 r1906, r1878, r1903; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1909, {low, high}; +} +{ +mul.f16x2 r1910, r1309, r1909; +} +{ +add.f16x2 r1913, r1885, r1910; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1916, {low, high}; +} +{ +mul.f16x2 r1917, r1303, r1916; +} +{ +add.f16x2 r1920, r1892, r1917; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1923, {low, high}; +} +{ +mul.f16x2 r1924, r1306, r1923; +} +{ +add.f16x2 r1927, r1899, r1924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1930, {low, high}; +} +{ +mul.f16x2 r1931, r1312, r1930; +} +{ +add.f16x2 r1934, r1906, r1931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1937, {low, high}; +} +{ +mul.f16x2 r1938, r1321, r1937; +} +{ +add.f16x2 r1941, r1913, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1944, {low, high}; +} +{ +mul.f16x2 r1945, r1315, r1944; +} +{ +add.f16x2 r1948, r1920, r1945; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1951, {low, high}; +} +{ +mul.f16x2 r1952, r1318, r1951; +} +{ +add.f16x2 r1955, r1927, r1952; +} +{ +sub.f16x2 r1958, r1934, r1941; +} +{ +add.f16x2 r1961, r1948, r1955; +} +{ +add.f16x2 r1964, r1934, r1941; +} +{ +sub.f16x2 r1967, r1948, r1955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1972, {low, high}; +} +{ +mul.f16x2 r1973, r1264, r1972; +} +{ +add.f16x2 
r1976, r1325, r1973; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1979, {low, high}; +} +{ +mul.f16x2 r1980, r1273, r1979; +} +{ +add.f16x2 r1983, r1970, r1980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1986, {low, high}; +} +{ +mul.f16x2 r1987, r1267, r1986; +} +{ +add.f16x2 r1990, r1328, r1987; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1993, {low, high}; +} +{ +mul.f16x2 r1994, r1270, r1993; +} +{ +add.f16x2 r1997, r1971, r1994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2000, {low, high}; +} +{ +mul.f16x2 r2001, r1276, r2000; +} +{ +add.f16x2 r2004, r1976, r2001; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2007, {low, high}; +} +{ +mul.f16x2 r2008, r1285, r2007; +} +{ +add.f16x2 r2011, r1983, r2008; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2014, {low, high}; +} +{ +mul.f16x2 r2015, r1279, r2014; +} +{ +add.f16x2 r2018, r1990, r2015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2021, {low, high}; +} +{ +mul.f16x2 r2022, r1282, r2021; +} +{ +add.f16x2 r2025, r1997, r2022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2028, {low, high}; +} +{ +mul.f16x2 r2029, r1288, r2028; +} +{ +add.f16x2 r2032, r2004, r2029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2035, {low, high}; +} +{ +mul.f16x2 r2036, r1297, r2035; +} +{ +add.f16x2 r2039, r2011, r2036; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2042, {low, high}; +} +{ +mul.f16x2 r2043, r1291, r2042; +} +{ +add.f16x2 r2046, r2018, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2049, 
{low, high}; +} +{ +mul.f16x2 r2050, r1294, r2049; +} +{ +add.f16x2 r2053, r2025, r2050; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2057, r1300, r2056; +} +{ +add.f16x2 r2060, r2032, r2057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2063, {low, high}; +} +{ +mul.f16x2 r2064, r1309, r2063; +} +{ +add.f16x2 r2067, r2039, r2064; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r1303, r2070; +} +{ +add.f16x2 r2074, r2046, r2071; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r1306, r2077; +} +{ +add.f16x2 r2081, r2053, r2078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r1312, r2084; +} +{ +add.f16x2 r2088, r2060, r2085; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r1321, r2091; +} +{ +add.f16x2 r2095, r2067, r2092; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2098, {low, high}; +} +{ +mul.f16x2 r2099, r1315, r2098; +} +{ +add.f16x2 r2102, r2074, r2099; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2105, {low, high}; +} +{ +mul.f16x2 r2106, r1318, r2105; +} +{ +add.f16x2 r2109, r2081, r2106; +} +{ +sub.f16x2 r2112, r2088, r2095; +} +{ +add.f16x2 r2115, r2102, r2109; +} +{ +add.f16x2 r2118, r2088, r2095; +} +{ +sub.f16x2 r2121, r2102, r2109; +} +mul.wide.u32 rd6, r24, -1171354717; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r3368, rd7; +cvt.rn.f32.u32 f714, r3368; +mul.f32 f715, f714, 0f3D54B191; +cos.approx.f32 f470, f715; +sin.approx.f32 f716, f715; +neg.f32 f471, f716; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f470; +cvt.rn.f16.f32 high, f471; +mov.b32 r2124, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1499, r2129; +} +{ +fma.rn.f16x2 r2134, r1496, r2127, r2131; +} +{ +mul.f16x2 r2138, r1496, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1499, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2124, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1653, r2166; +} +{ +fma.rn.f16x2 r2171, r1650, r2164, r2168; +} +{ +mul.f16x2 r2175, r1650, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1653, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ 
+mul.f16x2 r2205, r1807, r2203; +} +{ +fma.rn.f16x2 r2208, r1804, r2201, r2205; +} +{ +mul.f16x2 r2212, r1804, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1807, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1961, r2240; +} +{ +fma.rn.f16x2 r2245, r1958, r2238, r2242; +} +{ +mul.f16x2 r2249, r1958, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1961, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r2115, r2277; +} +{ +fma.rn.f16x2 r2282, r2112, r2275, r2279; +} +{ +mul.f16x2 r2286, r2112, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r2115, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; 
+mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r2121, r2314; +} +{ +fma.rn.f16x2 r2319, r2118, r2312, r2316; +} +{ +mul.f16x2 r2323, r2118, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r2121, r2312, r2326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2334, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2337, r2334, r2336; +} +{ +mul.f16x2 r2340, r2308, r2332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2343, {high, low}; +} +{ +fma.rn.f16x2 r2345, r2337, r2343, r2340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2349, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2351, {high, high}; +} +{ +mul.f16x2 r2353, r1967, r2351; +} +{ +fma.rn.f16x2 r2356, r1964, r2349, r2353; +} +{ +mul.f16x2 r2360, r1964, r2351; +} +{ +neg.f16x2 r2363, r2360; +} +{ +fma.rn.f16x2 r2365, r1967, r2349, r2363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2371, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2373, {low, high}; +} +{ +mul.f16x2 r2374, r2371, 
r2373; +} +{ +mul.f16x2 r2377, r2345, r2369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2380, {high, low}; +} +{ +fma.rn.f16x2 r2382, r2374, r2380, r2377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2388, {high, high}; +} +{ +mul.f16x2 r2390, r1813, r2388; +} +{ +fma.rn.f16x2 r2393, r1810, r2386, r2390; +} +{ +mul.f16x2 r2397, r1810, r2388; +} +{ +neg.f16x2 r2400, r2397; +} +{ +fma.rn.f16x2 r2402, r1813, r2386, r2400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2408, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2410, {low, high}; +} +{ +mul.f16x2 r2411, r2408, r2410; +} +{ +mul.f16x2 r2414, r2382, r2406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2417, {high, low}; +} +{ +fma.rn.f16x2 r2419, r2411, r2417, r2414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2423, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2425, {high, high}; +} +{ +mul.f16x2 r2427, r1659, r2425; +} +{ +fma.rn.f16x2 r2430, r1656, r2423, r2427; +} +{ +mul.f16x2 r2434, r1656, r2425; +} +{ +neg.f16x2 r2437, r2434; +} +{ +fma.rn.f16x2 r2439, r1659, r2423, r2437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2443, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2445, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2447, {low, high}; +} +{ +mul.f16x2 r2448, r2445, r2447; +} +{ +mul.f16x2 r2451, r2419, r2443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2454, {high, low}; +} +{ +fma.rn.f16x2 r2456, r2448, r2454, r2451; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2460, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2462, {high, high}; +} +{ +mul.f16x2 r2464, r1505, r2462; +} +{ +fma.rn.f16x2 r2467, r1502, r2460, r2464; +} +{ +mul.f16x2 r2471, r1502, r2462; +} +{ +neg.f16x2 r2474, r2471; +} +{ +fma.rn.f16x2 r2476, r1505, r2460, r2474; +} +mul.lo.s32 r3369, r3368, 11; +sub.s32 r3370, r24, r3369; +shl.b32 r3371, r3370, 3; +add.s32 r3372, r3365, r3371; +barrier.sync 0; +mad.lo.s32 r3373, r3368, 968, r3372; +st.shared.u32 [r3373], r1348; +st.shared.u32 [r3373+4], r1351; +st.shared.u32 [r3373+88], r2134; +st.shared.u32 [r3373+92], r2143; +st.shared.u32 [r3373+176], r2171; +st.shared.u32 [r3373+180], r2180; +st.shared.u32 [r3373+264], r2208; +st.shared.u32 [r3373+268], r2217; +st.shared.u32 [r3373+352], r2245; +st.shared.u32 [r3373+356], r2254; +st.shared.u32 [r3373+440], r2282; +st.shared.u32 [r3373+444], r2291; +st.shared.u32 [r3373+528], r2319; +st.shared.u32 [r3373+532], r2328; +st.shared.u32 [r3373+616], r2356; +st.shared.u32 [r3373+620], r2365; +st.shared.u32 [r3373+704], r2393; +st.shared.u32 [r3373+708], r2402; +st.shared.u32 [r3373+792], r2430; +st.shared.u32 [r3373+796], r2439; +st.shared.u32 [r3373+880], r2467; +st.shared.u32 [r3373+884], r2476; +barrier.sync 0; +ld.shared.u32 r2558, [r3367]; +ld.shared.u32 r2561, [r3367+4]; +ld.shared.u32 r2498, [r3367+968]; +ld.shared.u32 r2501, [r3367+972]; +ld.shared.u32 r2510, [r3367+1936]; +ld.shared.u32 r2513, [r3367+1940]; +ld.shared.u32 r2522, [r3367+2904]; +ld.shared.u32 r2525, [r3367+2908]; +ld.shared.u32 r2534, [r3367+3872]; +ld.shared.u32 r2537, [r3367+3876]; +ld.shared.u32 r2546, [r3367+4840]; +ld.shared.u32 r2549, [r3367+4844]; +ld.shared.u32 r2547, [r3367+5808]; +ld.shared.u32 r2550, [r3367+5812]; +ld.shared.u32 r2535, [r3367+6776]; +ld.shared.u32 r2538, [r3367+6780]; +ld.shared.u32 r2523, [r3367+7744]; +ld.shared.u32 r2526, [r3367+7748]; +ld.shared.u32 r2511, [r3367+8712]; +ld.shared.u32 r2514, [r3367+8716]; +ld.shared.u32 r2499, [r3367+9680]; 
+ld.shared.u32 r2502, [r3367+9684]; +{ +add.f16x2 r2497, r2498, r2499; +} +{ +add.f16x2 r2500, r2501, r2502; +} +{ +sub.f16x2 r2503, r2498, r2499; +} +{ +sub.f16x2 r2506, r2501, r2502; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2514; +} +{ +sub.f16x2 r2515, r2510, r2511; +} +{ +sub.f16x2 r2518, r2513, r2514; +} +{ +add.f16x2 r2521, r2522, r2523; +} +{ +add.f16x2 r2524, r2525, r2526; +} +{ +sub.f16x2 r2527, r2522, r2523; +} +{ +sub.f16x2 r2530, r2525, r2526; +} +{ +add.f16x2 r2533, r2534, r2535; +} +{ +add.f16x2 r2536, r2537, r2538; +} +{ +sub.f16x2 r2539, r2534, r2535; +} +{ +sub.f16x2 r2542, r2537, r2538; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2497; +} +{ +add.f16x2 r2560, r2561, r2500; +} +{ +add.f16x2 r2563, r2557, r2509; +} +{ +add.f16x2 r2566, r2560, r2512; +} +{ +add.f16x2 r2569, r2563, r2521; +} +{ +add.f16x2 r2572, r2566, r2524; +} +{ +add.f16x2 r2575, r2569, r2533; +} +{ +add.f16x2 r2578, r2572, r2536; +} +{ +add.f16x2 %0, r2575, r2545; +} +{ +add.f16x2 %1, r2578, r2548; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2589, {low, high}; +} +{ +mul.f16x2 r2590, r2497, r2589; +} +{ +add.f16x2 r2593, r2558, r2590; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2596, {low, high}; +} +{ +mul.f16x2 r2597, r2506, r2596; +} +{ +add.f16x2 r2600, r2587, r2597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2603, {low, high}; +} +{ +mul.f16x2 r2604, r2500, r2603; +} +{ +add.f16x2 r2607, r2561, r2604; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2610, {low, high}; +} +{ +mul.f16x2 r2611, r2503, r2610; +} +{ +add.f16x2 r2614, r2588, r2611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2617, {low, high}; +} +{ +mul.f16x2 r2618, r2509, r2617; +} +{ +add.f16x2 r2621, r2593, r2618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2624, {low, high}; +} +{ +mul.f16x2 r2625, r2518, r2624; +} +{ +add.f16x2 r2628, r2600, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2631, {low, high}; +} +{ +mul.f16x2 r2632, r2512, r2631; +} +{ +add.f16x2 r2635, r2607, r2632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2638, {low, high}; +} +{ +mul.f16x2 r2639, r2515, r2638; +} +{ +add.f16x2 r2642, r2614, r2639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2645, {low, high}; +} +{ +mul.f16x2 r2646, r2521, r2645; +} +{ +add.f16x2 r2649, r2621, r2646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2530, r2652; +} +{ +add.f16x2 r2656, r2628, r2653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2659, {low, high}; +} +{ +mul.f16x2 r2660, r2524, r2659; +} +{ +add.f16x2 r2663, r2635, r2660; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2666, {low, high}; +} +{ +mul.f16x2 r2667, r2527, r2666; +} +{ +add.f16x2 r2670, r2642, r2667; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2673, {low, high}; +} +{ +mul.f16x2 r2674, r2533, r2673; +} +{ +add.f16x2 r2677, r2649, r2674; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2680, {low, high}; +} +{ +mul.f16x2 r2681, r2542, r2680; +} +{ +add.f16x2 
r2684, r2656, r2681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2687, {low, high}; +} +{ +mul.f16x2 r2688, r2536, r2687; +} +{ +add.f16x2 r2691, r2663, r2688; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2694, {low, high}; +} +{ +mul.f16x2 r2695, r2539, r2694; +} +{ +add.f16x2 r2698, r2670, r2695; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2701, {low, high}; +} +{ +mul.f16x2 r2702, r2545, r2701; +} +{ +add.f16x2 r2705, r2677, r2702; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2708, {low, high}; +} +{ +mul.f16x2 r2709, r2554, r2708; +} +{ +add.f16x2 r2712, r2684, r2709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2715, {low, high}; +} +{ +mul.f16x2 r2716, r2548, r2715; +} +{ +add.f16x2 r2719, r2691, r2716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2722, {low, high}; +} +{ +mul.f16x2 r2723, r2551, r2722; +} +{ +add.f16x2 r2726, r2698, r2723; +} +{ +sub.f16x2 %2, r2705, r2712; +} +{ +add.f16x2 %3, r2719, r2726; +} +{ +add.f16x2 %20, r2705, r2712; +} +{ +sub.f16x2 %21, r2719, r2726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2743, {low, high}; +} +{ +mul.f16x2 r2744, r2497, r2743; +} +{ +add.f16x2 r2747, r2558, r2744; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2750, {low, high}; +} +{ +mul.f16x2 r2751, r2506, r2750; +} +{ +add.f16x2 r2754, r2741, r2751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2757, {low, 
high}; +} +{ +mul.f16x2 r2758, r2500, r2757; +} +{ +add.f16x2 r2761, r2561, r2758; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2764, {low, high}; +} +{ +mul.f16x2 r2765, r2503, r2764; +} +{ +add.f16x2 r2768, r2742, r2765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2771, {low, high}; +} +{ +mul.f16x2 r2772, r2509, r2771; +} +{ +add.f16x2 r2775, r2747, r2772; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2778, {low, high}; +} +{ +mul.f16x2 r2779, r2518, r2778; +} +{ +add.f16x2 r2782, r2754, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2785, {low, high}; +} +{ +mul.f16x2 r2786, r2512, r2785; +} +{ +add.f16x2 r2789, r2761, r2786; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2792, {low, high}; +} +{ +mul.f16x2 r2793, r2515, r2792; +} +{ +add.f16x2 r2796, r2768, r2793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2799, {low, high}; +} +{ +mul.f16x2 r2800, r2521, r2799; +} +{ +add.f16x2 r2803, r2775, r2800; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2806, {low, high}; +} +{ +mul.f16x2 r2807, r2530, r2806; +} +{ +add.f16x2 r2810, r2782, r2807; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2524, r2813; +} +{ +add.f16x2 r2817, r2789, r2814; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2820, {low, high}; +} +{ +mul.f16x2 r2821, r2527, r2820; +} +{ +add.f16x2 r2824, r2796, r2821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2827, {low, high}; +} +{ +mul.f16x2 r2828, r2533, r2827; +} +{ +add.f16x2 r2831, r2803, r2828; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2834, {low, high}; +} +{ +mul.f16x2 r2835, r2542, r2834; +} +{ +add.f16x2 r2838, r2810, r2835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2841, {low, high}; +} +{ +mul.f16x2 r2842, r2536, r2841; +} +{ +add.f16x2 r2845, r2817, r2842; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2848, {low, high}; +} +{ +mul.f16x2 r2849, r2539, r2848; +} +{ +add.f16x2 r2852, r2824, r2849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2855, {low, high}; +} +{ +mul.f16x2 r2856, r2545, r2855; +} +{ +add.f16x2 r2859, r2831, r2856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2862, {low, high}; +} +{ +mul.f16x2 r2863, r2554, r2862; +} +{ +add.f16x2 r2866, r2838, r2863; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2869, {low, high}; +} +{ +mul.f16x2 r2870, r2548, r2869; +} +{ +add.f16x2 r2873, r2845, r2870; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2876, {low, high}; +} +{ +mul.f16x2 r2877, r2551, r2876; +} +{ +add.f16x2 r2880, r2852, r2877; +} +{ +sub.f16x2 %4, r2859, r2866; +} +{ +add.f16x2 %5, r2873, r2880; +} +{ +add.f16x2 %18, r2859, r2866; +} +{ +sub.f16x2 %19, r2873, r2880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2897, {low, high}; +} +{ +mul.f16x2 r2898, r2497, r2897; +} +{ +add.f16x2 r2901, r2558, r2898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2904, {low, high}; +} +{ +mul.f16x2 r2905, r2506, r2904; +} +{ 
+add.f16x2 r2908, r2895, r2905; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2911, {low, high}; +} +{ +mul.f16x2 r2912, r2500, r2911; +} +{ +add.f16x2 r2915, r2561, r2912; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2918, {low, high}; +} +{ +mul.f16x2 r2919, r2503, r2918; +} +{ +add.f16x2 r2922, r2896, r2919; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2925, {low, high}; +} +{ +mul.f16x2 r2926, r2509, r2925; +} +{ +add.f16x2 r2929, r2901, r2926; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2932, {low, high}; +} +{ +mul.f16x2 r2933, r2518, r2932; +} +{ +add.f16x2 r2936, r2908, r2933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2939, {low, high}; +} +{ +mul.f16x2 r2940, r2512, r2939; +} +{ +add.f16x2 r2943, r2915, r2940; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2946, {low, high}; +} +{ +mul.f16x2 r2947, r2515, r2946; +} +{ +add.f16x2 r2950, r2922, r2947; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2953, {low, high}; +} +{ +mul.f16x2 r2954, r2521, r2953; +} +{ +add.f16x2 r2957, r2929, r2954; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2960, {low, high}; +} +{ +mul.f16x2 r2961, r2530, r2960; +} +{ +add.f16x2 r2964, r2936, r2961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2967, {low, high}; +} +{ +mul.f16x2 r2968, r2524, r2967; +} +{ +add.f16x2 r2971, r2943, r2968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2974, {low, high}; +} +{ +mul.f16x2 r2975, r2527, r2974; +} +{ +add.f16x2 r2978, r2950, r2975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; 
+mov.b32 r2981, {low, high}; +} +{ +mul.f16x2 r2982, r2533, r2981; +} +{ +add.f16x2 r2985, r2957, r2982; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2988, {low, high}; +} +{ +mul.f16x2 r2989, r2542, r2988; +} +{ +add.f16x2 r2992, r2964, r2989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2995, {low, high}; +} +{ +mul.f16x2 r2996, r2536, r2995; +} +{ +add.f16x2 r2999, r2971, r2996; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3002, {low, high}; +} +{ +mul.f16x2 r3003, r2539, r3002; +} +{ +add.f16x2 r3006, r2978, r3003; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r2545, r3009; +} +{ +add.f16x2 r3013, r2985, r3010; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3016, {low, high}; +} +{ +mul.f16x2 r3017, r2554, r3016; +} +{ +add.f16x2 r3020, r2992, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3023, {low, high}; +} +{ +mul.f16x2 r3024, r2548, r3023; +} +{ +add.f16x2 r3027, r2999, r3024; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3030, {low, high}; +} +{ +mul.f16x2 r3031, r2551, r3030; +} +{ +add.f16x2 r3034, r3006, r3031; +} +{ +sub.f16x2 %6, r3013, r3020; +} +{ +add.f16x2 %7, r3027, r3034; +} +{ +add.f16x2 %16, r3013, r3020; +} +{ +sub.f16x2 %17, r3027, r3034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3051, {low, high}; +} +{ +mul.f16x2 r3052, r2497, r3051; +} +{ +add.f16x2 r3055, r2558, r3052; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r2506, r3058; +} +{ +add.f16x2 r3062, r3049, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3065, {low, high}; +} +{ +mul.f16x2 r3066, r2500, r3065; +} +{ +add.f16x2 r3069, r2561, r3066; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3072, {low, high}; +} +{ +mul.f16x2 r3073, r2503, r3072; +} +{ +add.f16x2 r3076, r3050, r3073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3079, {low, high}; +} +{ +mul.f16x2 r3080, r2509, r3079; +} +{ +add.f16x2 r3083, r3055, r3080; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3086, {low, high}; +} +{ +mul.f16x2 r3087, r2518, r3086; +} +{ +add.f16x2 r3090, r3062, r3087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3093, {low, high}; +} +{ +mul.f16x2 r3094, r2512, r3093; +} +{ +add.f16x2 r3097, r3069, r3094; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r2515, r3100; +} +{ +add.f16x2 r3104, r3076, r3101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3107, {low, high}; +} +{ +mul.f16x2 r3108, r2521, r3107; +} +{ +add.f16x2 r3111, r3083, r3108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3114, {low, high}; +} +{ +mul.f16x2 r3115, r2530, r3114; +} +{ +add.f16x2 r3118, r3090, r3115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3121, {low, high}; +} +{ +mul.f16x2 r3122, r2524, r3121; +} +{ +add.f16x2 r3125, r3097, r3122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3128, {low, high}; +} +{ +mul.f16x2 r3129, 
r2527, r3128; +} +{ +add.f16x2 r3132, r3104, r3129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3135, {low, high}; +} +{ +mul.f16x2 r3136, r2533, r3135; +} +{ +add.f16x2 r3139, r3111, r3136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3142, {low, high}; +} +{ +mul.f16x2 r3143, r2542, r3142; +} +{ +add.f16x2 r3146, r3118, r3143; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3149, {low, high}; +} +{ +mul.f16x2 r3150, r2536, r3149; +} +{ +add.f16x2 r3153, r3125, r3150; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3156, {low, high}; +} +{ +mul.f16x2 r3157, r2539, r3156; +} +{ +add.f16x2 r3160, r3132, r3157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3163, {low, high}; +} +{ +mul.f16x2 r3164, r2545, r3163; +} +{ +add.f16x2 r3167, r3139, r3164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3170, {low, high}; +} +{ +mul.f16x2 r3171, r2554, r3170; +} +{ +add.f16x2 r3174, r3146, r3171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r2548, r3177; +} +{ +add.f16x2 r3181, r3153, r3178; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3184, {low, high}; +} +{ +mul.f16x2 r3185, r2551, r3184; +} +{ +add.f16x2 r3188, r3160, r3185; +} +{ +sub.f16x2 %8, r3167, r3174; +} +{ +add.f16x2 %9, r3181, r3188; +} +{ +add.f16x2 %14, r3167, r3174; +} +{ +sub.f16x2 %15, r3181, r3188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3204, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, 
f189; +mov.b32 r3205, {low, high}; +} +{ +mul.f16x2 r3206, r2497, r3205; +} +{ +add.f16x2 r3209, r2558, r3206; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3212, {low, high}; +} +{ +mul.f16x2 r3213, r2506, r3212; +} +{ +add.f16x2 r3216, r3203, r3213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3219, {low, high}; +} +{ +mul.f16x2 r3220, r2500, r3219; +} +{ +add.f16x2 r3223, r2561, r3220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3226, {low, high}; +} +{ +mul.f16x2 r3227, r2503, r3226; +} +{ +add.f16x2 r3230, r3204, r3227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3233, {low, high}; +} +{ +mul.f16x2 r3234, r2509, r3233; +} +{ +add.f16x2 r3237, r3209, r3234; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3240, {low, high}; +} +{ +mul.f16x2 r3241, r2518, r3240; +} +{ +add.f16x2 r3244, r3216, r3241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3247, {low, high}; +} +{ +mul.f16x2 r3248, r2512, r3247; +} +{ +add.f16x2 r3251, r3223, r3248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3254, {low, high}; +} +{ +mul.f16x2 r3255, r2515, r3254; +} +{ +add.f16x2 r3258, r3230, r3255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3261, {low, high}; +} +{ +mul.f16x2 r3262, r2521, r3261; +} +{ +add.f16x2 r3265, r3237, r3262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r2530, r3268; +} +{ +add.f16x2 r3272, r3244, r3269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3275, {low, high}; +} +{ +mul.f16x2 r3276, r2524, r3275; +} +{ +add.f16x2 r3279, r3251, r3276; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3282, {low, high}; +} +{ +mul.f16x2 r3283, r2527, r3282; +} +{ +add.f16x2 r3286, r3258, r3283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3289, {low, high}; +} +{ +mul.f16x2 r3290, r2533, r3289; +} +{ +add.f16x2 r3293, r3265, r3290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3296, {low, high}; +} +{ +mul.f16x2 r3297, r2542, r3296; +} +{ +add.f16x2 r3300, r3272, r3297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3303, {low, high}; +} +{ +mul.f16x2 r3304, r2536, r3303; +} +{ +add.f16x2 r3307, r3279, r3304; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3310, {low, high}; +} +{ +mul.f16x2 r3311, r2539, r3310; +} +{ +add.f16x2 r3314, r3286, r3311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r2545, r3317; +} +{ +add.f16x2 r3321, r3293, r3318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3324, {low, high}; +} +{ +mul.f16x2 r3325, r2554, r3324; +} +{ +add.f16x2 r3328, r3300, r3325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3331, {low, high}; +} +{ +mul.f16x2 r3332, r2548, r3331; +} +{ +add.f16x2 r3335, r3307, r3332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3338, {low, high}; +} +{ +mul.f16x2 r3339, r2551, r3338; +} +{ +add.f16x2 r3342, r3314, r3339; +} +{ +sub.f16x2 %10, r3321, r3328; +} +{ +add.f16x2 %11, r3335, r3342; +} +{ +add.f16x2 %12, r3321, r3328; +} +{ +sub.f16x2 %13, r3335, r3342; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), 
"=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1129, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<2>; +.reg .f32 f<718>; +.reg .b32 r<3374>; +.reg .b64 rd<8>; +{ +add.f16x2 r25, %25, %43; +} +{ +add.f16x2 r28, %26, %44; +} +{ +sub.f16x2 r31, %25, %43; +} +{ +sub.f16x2 r34, %26, %44; +} +{ +add.f16x2 r37, %27, %41; +} +{ +add.f16x2 r40, %28, %42; +} +{ +sub.f16x2 r43, %27, %41; +} +{ +sub.f16x2 r46, %28, %42; +} +{ +add.f16x2 r49, %29, %39; +} +{ +add.f16x2 r52, %30, %40; +} +{ +sub.f16x2 r55, %29, %39; +} +{ +sub.f16x2 r58, %30, %40; +} +{ +add.f16x2 r61, %31, %37; +} +{ +add.f16x2 r64, %32, %38; +} +{ +sub.f16x2 r67, %31, 
%37; +} +{ +sub.f16x2 r70, %32, %38; +} +{ +add.f16x2 r73, %33, %35; +} +{ +add.f16x2 r76, %34, %36; +} +{ +sub.f16x2 r79, %33, %35; +} +{ +sub.f16x2 r82, %34, %36; +} +{ +add.f16x2 r85, %23, r25; +} +{ +add.f16x2 r88, %24, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} +{ +add.f16x2 r106, r100, r64; +} +{ +add.f16x2 r109, r103, r73; +} +{ +add.f16x2 r112, r106, r76; +} +mov.f32 f183, 0f00000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r116, {low, high}; +} +mov.f32 f197, 0f3F575C64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r117, {low, high}; +} +{ +mul.f16x2 r118, r25, r117; +} +{ +add.f16x2 r121, %23, r118; +} +mov.f32 f163, 0f3F0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r124, {low, high}; +} +{ +mul.f16x2 r125, r34, r124; +} +{ +add.f16x2 r128, r115, r125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r131, {low, high}; +} +{ +mul.f16x2 r132, r28, r131; +} +{ +add.f16x2 r135, %24, r132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r138, {low, high}; +} +{ +mul.f16x2 r139, r31, r138; +} +{ +add.f16x2 r142, r116, r139; +} +mov.f32 f213, 0f3ED4B147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r145, {low, high}; +} +{ +mul.f16x2 r146, r37, r145; +} +{ +add.f16x2 r149, r121, r146; +} +mov.f32 f59, 0f3F68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r152, {low, high}; +} +{ +mul.f16x2 r153, r46, r152; +} +{ +add.f16x2 r156, r128, r153; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 
high, f213; +mov.b32 r159, {low, high}; +} +{ +mul.f16x2 r160, r40, r159; +} +{ +add.f16x2 r163, r135, r160; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r166, {low, high}; +} +{ +mul.f16x2 r167, r43, r166; +} +{ +add.f16x2 r170, r142, r167; +} +mov.f32 f221, 0fBE11BAFB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r173, {low, high}; +} +{ +mul.f16x2 r174, r49, r173; +} +{ +add.f16x2 r177, r149, r174; +} +mov.f32 f223, 0f3F7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r180, {low, high}; +} +{ +mul.f16x2 r181, r58, r180; +} +{ +add.f16x2 r184, r156, r181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r187, {low, high}; +} +{ +mul.f16x2 r188, r52, r187; +} +{ +add.f16x2 r191, r163, r188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r194, {low, high}; +} +{ +mul.f16x2 r195, r55, r194; +} +{ +add.f16x2 r198, r170, r195; +} +mov.f32 f205, 0fBF27A4F4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r201, {low, high}; +} +{ +mul.f16x2 r202, r61, r201; +} +{ +add.f16x2 r205, r177, r202; +} +mov.f32 f207, 0f3F4178CE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r208, {low, high}; +} +{ +mul.f16x2 r209, r70, r208; +} +{ +add.f16x2 r212, r184, r209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r215, {low, high}; +} +{ +mul.f16x2 r216, r64, r215; +} +{ +add.f16x2 r219, r191, r216; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r222, {low, high}; +} +{ +mul.f16x2 r223, r67, r222; +} +{ +add.f16x2 r226, r198, r223; +} +mov.f32 f189, 0fBF75A155; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r229, {low, high}; +} +{ +mul.f16x2 r230, 
r73, r229; +} +{ +add.f16x2 r233, r205, r230; +} +mov.f32 f191, 0f3E903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r236, {low, high}; +} +{ +mul.f16x2 r237, r82, r236; +} +{ +add.f16x2 r240, r212, r237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r243, {low, high}; +} +{ +mul.f16x2 r244, r76, r243; +} +{ +add.f16x2 r247, r219, r244; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r250, {low, high}; +} +{ +mul.f16x2 r251, r79, r250; +} +{ +add.f16x2 r254, r226, r251; +} +{ +sub.f16x2 r257, r233, r240; +} +{ +add.f16x2 r260, r247, r254; +} +{ +add.f16x2 r263, r233, r240; +} +{ +sub.f16x2 r266, r247, r254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r270, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r271, {low, high}; +} +{ +mul.f16x2 r272, r25, r271; +} +{ +add.f16x2 r275, %23, r272; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r34, r278; +} +{ +add.f16x2 r282, r269, r279; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r285, {low, high}; +} +{ +mul.f16x2 r286, r28, r285; +} +{ +add.f16x2 r289, %24, r286; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r292, {low, high}; +} +{ +mul.f16x2 r293, r31, r292; +} +{ +add.f16x2 r296, r270, r293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r299, {low, high}; +} +{ +mul.f16x2 r300, r37, r299; +} +{ +add.f16x2 r303, r275, r300; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r306, {low, high}; +} +{ +mul.f16x2 
r307, r46, r306; +} +{ +add.f16x2 r310, r282, r307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r313, {low, high}; +} +{ +mul.f16x2 r314, r40, r313; +} +{ +add.f16x2 r317, r289, r314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r320, {low, high}; +} +{ +mul.f16x2 r321, r43, r320; +} +{ +add.f16x2 r324, r296, r321; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r327, {low, high}; +} +{ +mul.f16x2 r328, r49, r327; +} +{ +add.f16x2 r331, r303, r328; +} +mov.f32 f111, 0fBE903F40; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r334, {low, high}; +} +{ +mul.f16x2 r335, r58, r334; +} +{ +add.f16x2 r338, r310, r335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r341, {low, high}; +} +{ +mul.f16x2 r342, r52, r341; +} +{ +add.f16x2 r345, r317, r342; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r55, r348; +} +{ +add.f16x2 r352, r324, r349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r355, {low, high}; +} +{ +mul.f16x2 r356, r61, r355; +} +{ +add.f16x2 r359, r331, r356; +} +mov.f32 f155, 0fBF7D64F0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r362, {low, high}; +} +{ +mul.f16x2 r363, r70, r362; +} +{ +add.f16x2 r366, r338, r363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r369, {low, high}; +} +{ +mul.f16x2 r370, r64, r369; +} +{ +add.f16x2 r373, r345, r370; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r376, {low, high}; +} +{ +mul.f16x2 r377, r67, r376; +} +{ +add.f16x2 r380, r352, r377; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 
r383, {low, high}; +} +{ +mul.f16x2 r384, r73, r383; +} +{ +add.f16x2 r387, r359, r384; +} +mov.f32 f199, 0fBF0A6770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r390, {low, high}; +} +{ +mul.f16x2 r391, r82, r390; +} +{ +add.f16x2 r394, r366, r391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r397, {low, high}; +} +{ +mul.f16x2 r398, r76, r397; +} +{ +add.f16x2 r401, r373, r398; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r404, {low, high}; +} +{ +mul.f16x2 r405, r79, r404; +} +{ +add.f16x2 r408, r380, r405; +} +{ +sub.f16x2 r411, r387, r394; +} +{ +add.f16x2 r414, r401, r408; +} +{ +add.f16x2 r417, r387, r394; +} +{ +sub.f16x2 r420, r401, r408; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r425, {low, high}; +} +{ +mul.f16x2 r426, r25, r425; +} +{ +add.f16x2 r429, %23, r426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r432, {low, high}; +} +{ +mul.f16x2 r433, r34, r432; +} +{ +add.f16x2 r436, r423, r433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r439, {low, high}; +} +{ +mul.f16x2 r440, r28, r439; +} +{ +add.f16x2 r443, %24, r440; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r31, r446; +} +{ +add.f16x2 r450, r424, r447; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r37, r453; +} +{ +add.f16x2 r457, r429, r454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; 
+mov.b32 r460, {low, high}; +} +{ +mul.f16x2 r461, r46, r460; +} +{ +add.f16x2 r464, r436, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r467, {low, high}; +} +{ +mul.f16x2 r468, r40, r467; +} +{ +add.f16x2 r471, r443, r468; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r474, {low, high}; +} +{ +mul.f16x2 r475, r43, r474; +} +{ +add.f16x2 r478, r450, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r481, {low, high}; +} +{ +mul.f16x2 r482, r49, r481; +} +{ +add.f16x2 r485, r457, r482; +} +mov.f32 f215, 0fBF68DDA4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r488, {low, high}; +} +{ +mul.f16x2 r489, r58, r488; +} +{ +add.f16x2 r492, r464, r489; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r495, {low, high}; +} +{ +mul.f16x2 r496, r52, r495; +} +{ +add.f16x2 r499, r471, r496; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r502, {low, high}; +} +{ +mul.f16x2 r503, r55, r502; +} +{ +add.f16x2 r506, r478, r503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r509, {low, high}; +} +{ +mul.f16x2 r510, r61, r509; +} +{ +add.f16x2 r513, r485, r510; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r516, {low, high}; +} +{ +mul.f16x2 r517, r70, r516; +} +{ +add.f16x2 r520, r492, r517; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r523, {low, high}; +} +{ +mul.f16x2 r524, r64, r523; +} +{ +add.f16x2 r527, r499, r524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r530, {low, high}; +} +{ +mul.f16x2 r531, r67, r530; +} +{ +add.f16x2 r534, r506, r531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 
high, f205; +mov.b32 r537, {low, high}; +} +{ +mul.f16x2 r538, r73, r537; +} +{ +add.f16x2 r541, r513, r538; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r544, {low, high}; +} +{ +mul.f16x2 r545, r82, r544; +} +{ +add.f16x2 r548, r520, r545; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r551, {low, high}; +} +{ +mul.f16x2 r552, r76, r551; +} +{ +add.f16x2 r555, r527, r552; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r558, {low, high}; +} +{ +mul.f16x2 r559, r79, r558; +} +{ +add.f16x2 r562, r534, r559; +} +{ +sub.f16x2 r565, r541, r548; +} +{ +add.f16x2 r568, r555, r562; +} +{ +add.f16x2 r571, r541, r548; +} +{ +sub.f16x2 r574, r555, r562; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r579, {low, high}; +} +{ +mul.f16x2 r580, r25, r579; +} +{ +add.f16x2 r583, %23, r580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r586, {low, high}; +} +{ +mul.f16x2 r587, r34, r586; +} +{ +add.f16x2 r590, r577, r587; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r593, {low, high}; +} +{ +mul.f16x2 r594, r28, r593; +} +{ +add.f16x2 r597, %24, r594; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r31, r600; +} +{ +add.f16x2 r604, r578, r601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r37, r607; +} +{ +add.f16x2 r611, r583, r608; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; 
+mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r615, r46, r614; +} +{ +add.f16x2 r618, r590, r615; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r621, {low, high}; +} +{ +mul.f16x2 r622, r40, r621; +} +{ +add.f16x2 r625, r597, r622; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r628, {low, high}; +} +{ +mul.f16x2 r629, r43, r628; +} +{ +add.f16x2 r632, r604, r629; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r635, {low, high}; +} +{ +mul.f16x2 r636, r49, r635; +} +{ +add.f16x2 r639, r611, r636; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r642, {low, high}; +} +{ +mul.f16x2 r643, r58, r642; +} +{ +add.f16x2 r646, r618, r643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r649, {low, high}; +} +{ +mul.f16x2 r650, r52, r649; +} +{ +add.f16x2 r653, r625, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r656, {low, high}; +} +{ +mul.f16x2 r657, r55, r656; +} +{ +add.f16x2 r660, r632, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r663, {low, high}; +} +{ +mul.f16x2 r664, r61, r663; +} +{ +add.f16x2 r667, r639, r664; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r670, {low, high}; +} +{ +mul.f16x2 r671, r70, r670; +} +{ +add.f16x2 r674, r646, r671; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r677, {low, high}; +} +{ +mul.f16x2 r678, r64, r677; +} +{ +add.f16x2 r681, r653, r678; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r684, {low, high}; +} +{ +mul.f16x2 r685, r67, r684; +} +{ +add.f16x2 r688, r660, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r691, 
{low, high}; +} +{ +mul.f16x2 r692, r73, r691; +} +{ +add.f16x2 r695, r667, r692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r82, r698; +} +{ +add.f16x2 r702, r674, r699; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r705, {low, high}; +} +{ +mul.f16x2 r706, r76, r705; +} +{ +add.f16x2 r709, r681, r706; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r712, {low, high}; +} +{ +mul.f16x2 r713, r79, r712; +} +{ +add.f16x2 r716, r688, r713; +} +{ +sub.f16x2 r719, r695, r702; +} +{ +add.f16x2 r722, r709, r716; +} +{ +add.f16x2 r725, r695, r702; +} +{ +sub.f16x2 r728, r709, r716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r25, r733; +} +{ +add.f16x2 r737, %23, r734; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r740, {low, high}; +} +{ +mul.f16x2 r741, r34, r740; +} +{ +add.f16x2 r744, r731, r741; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r28, r747; +} +{ +add.f16x2 r751, %24, r748; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r754, {low, high}; +} +{ +mul.f16x2 r755, r31, r754; +} +{ +add.f16x2 r758, r732, r755; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r37, r761; +} +{ +add.f16x2 r765, r737, r762; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r768, {low, high}; +} +{ 
+mul.f16x2 r769, r46, r768; +} +{ +add.f16x2 r772, r744, r769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r775, {low, high}; +} +{ +mul.f16x2 r776, r40, r775; +} +{ +add.f16x2 r779, r751, r776; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r782, {low, high}; +} +{ +mul.f16x2 r783, r43, r782; +} +{ +add.f16x2 r786, r758, r783; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r789, {low, high}; +} +{ +mul.f16x2 r790, r49, r789; +} +{ +add.f16x2 r793, r765, r790; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r796, {low, high}; +} +{ +mul.f16x2 r797, r58, r796; +} +{ +add.f16x2 r800, r772, r797; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r803, {low, high}; +} +{ +mul.f16x2 r804, r52, r803; +} +{ +add.f16x2 r807, r779, r804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r810, {low, high}; +} +{ +mul.f16x2 r811, r55, r810; +} +{ +add.f16x2 r814, r786, r811; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r817, {low, high}; +} +{ +mul.f16x2 r818, r61, r817; +} +{ +add.f16x2 r821, r793, r818; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r824, {low, high}; +} +{ +mul.f16x2 r825, r70, r824; +} +{ +add.f16x2 r828, r800, r825; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r831, {low, high}; +} +{ +mul.f16x2 r832, r64, r831; +} +{ +add.f16x2 r835, r807, r832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r838, {low, high}; +} +{ +mul.f16x2 r839, r67, r838; +} +{ +add.f16x2 r842, r814, r839; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r845, {low, high}; +} +{ +mul.f16x2 r846, 
r73, r845; +} +{ +add.f16x2 r849, r821, r846; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r852, {low, high}; +} +{ +mul.f16x2 r853, r82, r852; +} +{ +add.f16x2 r856, r828, r853; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r859, {low, high}; +} +{ +mul.f16x2 r860, r76, r859; +} +{ +add.f16x2 r863, r835, r860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r79, r866; +} +{ +add.f16x2 r870, r842, r867; +} +{ +sub.f16x2 r873, r849, r856; +} +{ +add.f16x2 r876, r863, r870; +} +{ +add.f16x2 r879, r849, r856; +} +{ +sub.f16x2 r882, r863, r870; +} +mov.u32 r23, %tid.x; +mul.wide.u32 rd2, r23, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r885, rd3; +sub.s32 r886, r23, r885; +shr.u32 r887, r886, 1; +add.s32 r888, r887, r885; +shr.u32 r889, r888, 6; +mul.lo.s32 r890, r889, 121; +sub.s32 r24, r23, r890; +cvt.rn.f32.u32 f225, r24; +mul.f32 f1, f225, 0f3B9AAFAF; +setp.eq.s32 p1, r24, 18; +mov.f32 f717, 0f3DADD00E; +@p1 bra LBB1_2; +sin.approx.f32 f717, f1; +LBB1_2: +mov.u32 r3362, %tid.y; +mov.u32 r3363, %22; +mad.lo.s32 r3364, r3362, 5324, r3363; +mad.lo.s32 r3365, r889, 5324, r3364; +neg.f32 f227, f717; +cos.approx.f32 f226, f1; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f227; +mov.b32 r891, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r260, r896; +} +{ +fma.rn.f16x2 r901, r257, r894, r898; +} +{ +mul.f16x2 r905, r257, r896; +} +{ +neg.f16x2 r908, r905; +} +{ +fma.rn.f16x2 r910, r260, r894, r908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r916, {high, high}; +} +mov.f32 f490, 0fBF800000; +mov.f32 f491, 
0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r891, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r414, r933; +} +{ +fma.rn.f16x2 r938, r411, r931, r935; +} +{ +mul.f16x2 r942, r411, r933; +} +{ +neg.f16x2 r945, r942; +} +{ +fma.rn.f16x2 r947, r414, r931, r945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r568, r970; +} +{ +fma.rn.f16x2 r975, r565, r968, r972; +} +{ +mul.f16x2 r979, r565, r970; +} +{ +neg.f16x2 r982, r979; +} +{ +fma.rn.f16x2 r984, r568, r968, r982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r722, r1007; +} +{ +fma.rn.f16x2 r1012, r719, r1005, r1009; +} +{ +mul.f16x2 r1016, r719, r1007; +} +{ +neg.f16x2 r1019, r1016; +} +{ +fma.rn.f16x2 r1021, r722, r1005, r1019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r876, r1044; +} +{ +fma.rn.f16x2 r1049, r873, r1042, r1046; +} +{ +mul.f16x2 r1053, r873, r1044; +} +{ +neg.f16x2 r1056, r1053; +} +{ +fma.rn.f16x2 r1058, r876, r1042, r1056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1066, {low, high}; +} +{ +mul.f16x2 r1067, r1064, r1066; +} +{ +mul.f16x2 r1070, r1038, r1062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1073, {high, low}; +} +{ +fma.rn.f16x2 r1075, r1067, r1073, r1070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1081, {high, high}; +} +{ +mul.f16x2 r1083, r882, r1081; +} +{ +fma.rn.f16x2 r1086, r879, r1079, r1083; +} +{ +mul.f16x2 r1090, r879, 
r1081; +} +{ +neg.f16x2 r1093, r1090; +} +{ +fma.rn.f16x2 r1095, r882, r1079, r1093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1103, {low, high}; +} +{ +mul.f16x2 r1104, r1101, r1103; +} +{ +mul.f16x2 r1107, r1075, r1099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1075; +mov.b32 r1110, {high, low}; +} +{ +fma.rn.f16x2 r1112, r1104, r1110, r1107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r728, r1118; +} +{ +fma.rn.f16x2 r1123, r725, r1116, r1120; +} +{ +mul.f16x2 r1127, r725, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r728, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1112, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1112; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r574, r1155; +} +{ +fma.rn.f16x2 r1160, r571, r1153, r1157; +} +{ +mul.f16x2 r1164, r571, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r574, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r420, r1192; +} +{ +fma.rn.f16x2 r1197, r417, r1190, r1194; +} +{ +mul.f16x2 r1201, r417, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r420, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r891; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r266, r1229; +} +{ +fma.rn.f16x2 r1234, r263, r1227, r1231; +} +{ +mul.f16x2 r1238, r263, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r266, r1227, r1241; +} +barrier.sync 0; +mad.lo.s32 r3366, r24, 44, r3365; +st.shared.u32 [r3366], r109; +st.shared.u32 [r3366+4], r901; +st.shared.u32 [r3366+8], r938; +st.shared.u32 [r3366+12], r975; +st.shared.u32 [r3366+16], r1012; +st.shared.u32 [r3366+20], r1049; +st.shared.u32 [r3366+24], r1086; +st.shared.u32 [r3366+28], r1123; +st.shared.u32 [r3366+32], r1160; +st.shared.u32 [r3366+36], r1197; +st.shared.u32 [r3366+40], r1234; +barrier.sync 0; +mad.lo.s32 r3367, r24, 
-40, r3366; +ld.shared.u32 r1325, [r3367]; +ld.shared.u32 r1265, [r3367+484]; +ld.shared.u32 r1277, [r3367+968]; +ld.shared.u32 r1289, [r3367+1452]; +ld.shared.u32 r1301, [r3367+1936]; +ld.shared.u32 r1313, [r3367+2420]; +ld.shared.u32 r1314, [r3367+2904]; +ld.shared.u32 r1302, [r3367+3388]; +ld.shared.u32 r1290, [r3367+3872]; +ld.shared.u32 r1278, [r3367+4356]; +ld.shared.u32 r1266, [r3367+4840]; +barrier.sync 0; +st.shared.u32 [r3366], r112; +st.shared.u32 [r3366+4], r910; +st.shared.u32 [r3366+8], r947; +st.shared.u32 [r3366+12], r984; +st.shared.u32 [r3366+16], r1021; +st.shared.u32 [r3366+20], r1058; +st.shared.u32 [r3366+24], r1095; +st.shared.u32 [r3366+28], r1132; +st.shared.u32 [r3366+32], r1169; +st.shared.u32 [r3366+36], r1206; +st.shared.u32 [r3366+40], r1243; +barrier.sync 0; +ld.shared.u32 r1328, [r3367]; +ld.shared.u32 r1268, [r3367+484]; +ld.shared.u32 r1280, [r3367+968]; +ld.shared.u32 r1292, [r3367+1452]; +ld.shared.u32 r1304, [r3367+1936]; +ld.shared.u32 r1316, [r3367+2420]; +ld.shared.u32 r1317, [r3367+2904]; +ld.shared.u32 r1305, [r3367+3388]; +ld.shared.u32 r1293, [r3367+3872]; +ld.shared.u32 r1281, [r3367+4356]; +ld.shared.u32 r1269, [r3367+4840]; +{ +add.f16x2 r1264, r1265, r1266; +} +{ +add.f16x2 r1267, r1268, r1269; +} +{ +sub.f16x2 r1270, r1265, r1266; +} +{ +sub.f16x2 r1273, r1268, r1269; +} +{ +add.f16x2 r1276, r1277, r1278; +} +{ +add.f16x2 r1279, r1280, r1281; +} +{ +sub.f16x2 r1282, r1277, r1278; +} +{ +sub.f16x2 r1285, r1280, r1281; +} +{ +add.f16x2 r1288, r1289, r1290; +} +{ +add.f16x2 r1291, r1292, r1293; +} +{ +sub.f16x2 r1294, r1289, r1290; +} +{ +sub.f16x2 r1297, r1292, r1293; +} +{ +add.f16x2 r1300, r1301, r1302; +} +{ +add.f16x2 r1303, r1304, r1305; +} +{ +sub.f16x2 r1306, r1301, r1302; +} +{ +sub.f16x2 r1309, r1304, r1305; +} +{ +add.f16x2 r1312, r1313, r1314; +} +{ +add.f16x2 r1315, r1316, r1317; +} +{ +sub.f16x2 r1318, r1313, r1314; +} +{ +sub.f16x2 r1321, r1316, r1317; +} +{ +add.f16x2 r1324, r1325, r1264; +} +{ 
+add.f16x2 r1327, r1328, r1267; +} +{ +add.f16x2 r1330, r1324, r1276; +} +{ +add.f16x2 r1333, r1327, r1279; +} +{ +add.f16x2 r1336, r1330, r1288; +} +{ +add.f16x2 r1339, r1333, r1291; +} +{ +add.f16x2 r1342, r1336, r1300; +} +{ +add.f16x2 r1345, r1339, r1303; +} +{ +add.f16x2 r1348, r1342, r1312; +} +{ +add.f16x2 r1351, r1345, r1315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1356, {low, high}; +} +{ +mul.f16x2 r1357, r1264, r1356; +} +{ +add.f16x2 r1360, r1325, r1357; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1363, {low, high}; +} +{ +mul.f16x2 r1364, r1273, r1363; +} +{ +add.f16x2 r1367, r1354, r1364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1267, r1370; +} +{ +add.f16x2 r1374, r1328, r1371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1377, {low, high}; +} +{ +mul.f16x2 r1378, r1270, r1377; +} +{ +add.f16x2 r1381, r1355, r1378; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1384, {low, high}; +} +{ +mul.f16x2 r1385, r1276, r1384; +} +{ +add.f16x2 r1388, r1360, r1385; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1391, {low, high}; +} +{ +mul.f16x2 r1392, r1285, r1391; +} +{ +add.f16x2 r1395, r1367, r1392; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1398, {low, high}; +} +{ +mul.f16x2 r1399, r1279, r1398; +} +{ +add.f16x2 r1402, r1374, r1399; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1405, {low, high}; +} +{ 
+mul.f16x2 r1406, r1282, r1405; +} +{ +add.f16x2 r1409, r1381, r1406; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1412, {low, high}; +} +{ +mul.f16x2 r1413, r1288, r1412; +} +{ +add.f16x2 r1416, r1388, r1413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1419, {low, high}; +} +{ +mul.f16x2 r1420, r1297, r1419; +} +{ +add.f16x2 r1423, r1395, r1420; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1426, {low, high}; +} +{ +mul.f16x2 r1427, r1291, r1426; +} +{ +add.f16x2 r1430, r1402, r1427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1433, {low, high}; +} +{ +mul.f16x2 r1434, r1294, r1433; +} +{ +add.f16x2 r1437, r1409, r1434; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1440, {low, high}; +} +{ +mul.f16x2 r1441, r1300, r1440; +} +{ +add.f16x2 r1444, r1416, r1441; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1447, {low, high}; +} +{ +mul.f16x2 r1448, r1309, r1447; +} +{ +add.f16x2 r1451, r1423, r1448; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1454, {low, high}; +} +{ +mul.f16x2 r1455, r1303, r1454; +} +{ +add.f16x2 r1458, r1430, r1455; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1461, {low, high}; +} +{ +mul.f16x2 r1462, r1306, r1461; +} +{ +add.f16x2 r1465, r1437, r1462; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1468, {low, high}; +} +{ +mul.f16x2 r1469, r1312, r1468; +} +{ +add.f16x2 r1472, r1444, r1469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1475, {low, high}; +} +{ +mul.f16x2 r1476, r1321, r1475; +} +{ +add.f16x2 r1479, r1451, r1476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1482, {low, high}; +} +{ +mul.f16x2 r1483, r1315, r1482; +} +{ +add.f16x2 r1486, r1458, r1483; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1489, {low, high}; +} +{ +mul.f16x2 r1490, r1318, r1489; +} +{ +add.f16x2 r1493, r1465, r1490; +} +{ +sub.f16x2 r1496, r1472, r1479; +} +{ +add.f16x2 r1499, r1486, r1493; +} +{ +add.f16x2 r1502, r1472, r1479; +} +{ +sub.f16x2 r1505, r1486, r1493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1510, {low, high}; +} +{ +mul.f16x2 r1511, r1264, r1510; +} +{ +add.f16x2 r1514, r1325, r1511; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1517, {low, high}; +} +{ +mul.f16x2 r1518, r1273, r1517; +} +{ +add.f16x2 r1521, r1508, r1518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1524, {low, high}; +} +{ +mul.f16x2 r1525, r1267, r1524; +} +{ +add.f16x2 r1528, r1328, r1525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r1531, {low, high}; +} +{ +mul.f16x2 r1532, r1270, r1531; +} +{ +add.f16x2 r1535, r1509, r1532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1538, {low, high}; +} +{ +mul.f16x2 r1539, r1276, r1538; +} +{ +add.f16x2 r1542, r1514, r1539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1545, {low, high}; +} +{ +mul.f16x2 r1546, r1285, r1545; +} +{ +add.f16x2 r1549, r1521, r1546; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1552, {low, high}; +} +{ +mul.f16x2 r1553, r1279, r1552; +} +{ +add.f16x2 
r1556, r1528, r1553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1559, {low, high}; +} +{ +mul.f16x2 r1560, r1282, r1559; +} +{ +add.f16x2 r1563, r1535, r1560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1566, {low, high}; +} +{ +mul.f16x2 r1567, r1288, r1566; +} +{ +add.f16x2 r1570, r1542, r1567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1573, {low, high}; +} +{ +mul.f16x2 r1574, r1297, r1573; +} +{ +add.f16x2 r1577, r1549, r1574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1291, r1580; +} +{ +add.f16x2 r1584, r1556, r1581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1587, {low, high}; +} +{ +mul.f16x2 r1588, r1294, r1587; +} +{ +add.f16x2 r1591, r1563, r1588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1594, {low, high}; +} +{ +mul.f16x2 r1595, r1300, r1594; +} +{ +add.f16x2 r1598, r1570, r1595; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1601, {low, high}; +} +{ +mul.f16x2 r1602, r1309, r1601; +} +{ +add.f16x2 r1605, r1577, r1602; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1608, {low, high}; +} +{ +mul.f16x2 r1609, r1303, r1608; +} +{ +add.f16x2 r1612, r1584, r1609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1615, {low, high}; +} +{ +mul.f16x2 r1616, r1306, r1615; +} +{ +add.f16x2 r1619, r1591, r1616; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1622, {low, high}; +} +{ +mul.f16x2 r1623, r1312, r1622; +} +{ +add.f16x2 r1626, r1598, r1623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1629, 
{low, high}; +} +{ +mul.f16x2 r1630, r1321, r1629; +} +{ +add.f16x2 r1633, r1605, r1630; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1636, {low, high}; +} +{ +mul.f16x2 r1637, r1315, r1636; +} +{ +add.f16x2 r1640, r1612, r1637; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r1643, {low, high}; +} +{ +mul.f16x2 r1644, r1318, r1643; +} +{ +add.f16x2 r1647, r1619, r1644; +} +{ +sub.f16x2 r1650, r1626, r1633; +} +{ +add.f16x2 r1653, r1640, r1647; +} +{ +add.f16x2 r1656, r1626, r1633; +} +{ +sub.f16x2 r1659, r1640, r1647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1662, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1665, r1264, r1664; +} +{ +add.f16x2 r1668, r1325, r1665; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1671, {low, high}; +} +{ +mul.f16x2 r1672, r1273, r1671; +} +{ +add.f16x2 r1675, r1662, r1672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1678, {low, high}; +} +{ +mul.f16x2 r1679, r1267, r1678; +} +{ +add.f16x2 r1682, r1328, r1679; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r1685, {low, high}; +} +{ +mul.f16x2 r1686, r1270, r1685; +} +{ +add.f16x2 r1689, r1663, r1686; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1692, {low, high}; +} +{ +mul.f16x2 r1693, r1276, r1692; +} +{ +add.f16x2 r1696, r1668, r1693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1699, {low, high}; +} +{ +mul.f16x2 r1700, r1285, r1699; +} +{ +add.f16x2 r1703, r1675, r1700; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1706, {low, high}; +} +{ +mul.f16x2 r1707, r1279, r1706; +} +{ +add.f16x2 r1710, r1682, r1707; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r1713, {low, high}; +} +{ +mul.f16x2 r1714, r1282, r1713; +} +{ +add.f16x2 r1717, r1689, r1714; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1720, {low, high}; +} +{ +mul.f16x2 r1721, r1288, r1720; +} +{ +add.f16x2 r1724, r1696, r1721; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1297, r1727; +} +{ +add.f16x2 r1731, r1703, r1728; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1734, {low, high}; +} +{ +mul.f16x2 r1735, r1291, r1734; +} +{ +add.f16x2 r1738, r1710, r1735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1294, r1741; +} +{ +add.f16x2 r1745, r1717, r1742; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1748, {low, high}; +} +{ +mul.f16x2 r1749, r1300, r1748; +} +{ +add.f16x2 r1752, r1724, r1749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1755, {low, high}; +} +{ +mul.f16x2 r1756, r1309, r1755; +} +{ +add.f16x2 r1759, r1731, r1756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1762, {low, high}; +} +{ +mul.f16x2 r1763, r1303, r1762; +} +{ +add.f16x2 r1766, r1738, r1763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 r1770, r1306, r1769; +} +{ +add.f16x2 r1773, r1745, r1770; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1776, {low, high}; +} +{ +mul.f16x2 r1777, r1312, 
r1776; +} +{ +add.f16x2 r1780, r1752, r1777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1321, r1783; +} +{ +add.f16x2 r1787, r1759, r1784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1790, {low, high}; +} +{ +mul.f16x2 r1791, r1315, r1790; +} +{ +add.f16x2 r1794, r1766, r1791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1318, r1797; +} +{ +add.f16x2 r1801, r1773, r1798; +} +{ +sub.f16x2 r1804, r1780, r1787; +} +{ +add.f16x2 r1807, r1794, r1801; +} +{ +add.f16x2 r1810, r1780, r1787; +} +{ +sub.f16x2 r1813, r1794, r1801; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1817, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1264, r1818; +} +{ +add.f16x2 r1822, r1325, r1819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1825, {low, high}; +} +{ +mul.f16x2 r1826, r1273, r1825; +} +{ +add.f16x2 r1829, r1816, r1826; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1267, r1832; +} +{ +add.f16x2 r1836, r1328, r1833; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r1839, {low, high}; +} +{ +mul.f16x2 r1840, r1270, r1839; +} +{ +add.f16x2 r1843, r1817, r1840; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1846, {low, high}; +} +{ +mul.f16x2 r1847, r1276, r1846; +} +{ +add.f16x2 r1850, r1822, r1847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 
high, f155; +mov.b32 r1853, {low, high}; +} +{ +mul.f16x2 r1854, r1285, r1853; +} +{ +add.f16x2 r1857, r1829, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r1860, {low, high}; +} +{ +mul.f16x2 r1861, r1279, r1860; +} +{ +add.f16x2 r1864, r1836, r1861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r1867, {low, high}; +} +{ +mul.f16x2 r1868, r1282, r1867; +} +{ +add.f16x2 r1871, r1843, r1868; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1874, {low, high}; +} +{ +mul.f16x2 r1875, r1288, r1874; +} +{ +add.f16x2 r1878, r1850, r1875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1881, {low, high}; +} +{ +mul.f16x2 r1882, r1297, r1881; +} +{ +add.f16x2 r1885, r1857, r1882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1291, r1888; +} +{ +add.f16x2 r1892, r1864, r1889; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r1895, {low, high}; +} +{ +mul.f16x2 r1896, r1294, r1895; +} +{ +add.f16x2 r1899, r1871, r1896; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1902, {low, high}; +} +{ +mul.f16x2 r1903, r1300, r1902; +} +{ +add.f16x2 r1906, r1878, r1903; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1909, {low, high}; +} +{ +mul.f16x2 r1910, r1309, r1909; +} +{ +add.f16x2 r1913, r1885, r1910; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1916, {low, high}; +} +{ +mul.f16x2 r1917, r1303, r1916; +} +{ +add.f16x2 r1920, r1892, r1917; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1923, {low, high}; +} +{ +mul.f16x2 r1924, r1306, r1923; +} +{ +add.f16x2 r1927, r1899, r1924; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1930, {low, high}; +} +{ +mul.f16x2 r1931, r1312, r1930; +} +{ +add.f16x2 r1934, r1906, r1931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1937, {low, high}; +} +{ +mul.f16x2 r1938, r1321, r1937; +} +{ +add.f16x2 r1941, r1913, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r1944, {low, high}; +} +{ +mul.f16x2 r1945, r1315, r1944; +} +{ +add.f16x2 r1948, r1920, r1945; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r1951, {low, high}; +} +{ +mul.f16x2 r1952, r1318, r1951; +} +{ +add.f16x2 r1955, r1927, r1952; +} +{ +sub.f16x2 r1958, r1934, r1941; +} +{ +add.f16x2 r1961, r1948, r1955; +} +{ +add.f16x2 r1964, r1934, r1941; +} +{ +sub.f16x2 r1967, r1948, r1955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r1971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1972, {low, high}; +} +{ +mul.f16x2 r1973, r1264, r1972; +} +{ +add.f16x2 r1976, r1325, r1973; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1979, {low, high}; +} +{ +mul.f16x2 r1980, r1273, r1979; +} +{ +add.f16x2 r1983, r1970, r1980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r1986, {low, high}; +} +{ +mul.f16x2 r1987, r1267, r1986; +} +{ +add.f16x2 r1990, r1328, r1987; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r1993, {low, high}; +} +{ +mul.f16x2 r1994, r1270, r1993; +} +{ +add.f16x2 r1997, r1971, r1994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2000, {low, high}; +} +{ 
+mul.f16x2 r2001, r1276, r2000; +} +{ +add.f16x2 r2004, r1976, r2001; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2007, {low, high}; +} +{ +mul.f16x2 r2008, r1285, r2007; +} +{ +add.f16x2 r2011, r1983, r2008; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2014, {low, high}; +} +{ +mul.f16x2 r2015, r1279, r2014; +} +{ +add.f16x2 r2018, r1990, r2015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2021, {low, high}; +} +{ +mul.f16x2 r2022, r1282, r2021; +} +{ +add.f16x2 r2025, r1997, r2022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2028, {low, high}; +} +{ +mul.f16x2 r2029, r1288, r2028; +} +{ +add.f16x2 r2032, r2004, r2029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2035, {low, high}; +} +{ +mul.f16x2 r2036, r1297, r2035; +} +{ +add.f16x2 r2039, r2011, r2036; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2042, {low, high}; +} +{ +mul.f16x2 r2043, r1291, r2042; +} +{ +add.f16x2 r2046, r2018, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2049, {low, high}; +} +{ +mul.f16x2 r2050, r1294, r2049; +} +{ +add.f16x2 r2053, r2025, r2050; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2057, r1300, r2056; +} +{ +add.f16x2 r2060, r2032, r2057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2063, {low, high}; +} +{ +mul.f16x2 r2064, r1309, r2063; +} +{ +add.f16x2 r2067, r2039, r2064; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r1303, r2070; +} +{ +add.f16x2 r2074, r2046, r2071; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r1306, r2077; +} +{ +add.f16x2 r2081, r2053, r2078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2084, {low, high}; +} +{ +mul.f16x2 r2085, r1312, r2084; +} +{ +add.f16x2 r2088, r2060, r2085; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r1321, r2091; +} +{ +add.f16x2 r2095, r2067, r2092; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2098, {low, high}; +} +{ +mul.f16x2 r2099, r1315, r2098; +} +{ +add.f16x2 r2102, r2074, r2099; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2105, {low, high}; +} +{ +mul.f16x2 r2106, r1318, r2105; +} +{ +add.f16x2 r2109, r2081, r2106; +} +{ +sub.f16x2 r2112, r2088, r2095; +} +{ +add.f16x2 r2115, r2102, r2109; +} +{ +add.f16x2 r2118, r2088, r2095; +} +{ +sub.f16x2 r2121, r2102, r2109; +} +mul.wide.u32 rd6, r24, -1171354717; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r3368, rd7; +mul.lo.s32 r3369, r3368, 11; +sub.s32 r3370, r24, r3369; +shl.b32 r3371, r3370, 2; +add.s32 r3372, r3365, r3371; +cvt.rn.f32.u32 f714, r3368; +mul.f32 f715, f714, 0f3D54B191; +cos.approx.f32 f470, f715; +sin.approx.f32 f716, f715; +neg.f32 f471, f716; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f470; +cvt.rn.f16.f32 high, f471; +mov.b32 r2124, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1499, r2129; +} +{ +fma.rn.f16x2 r2134, r1496, r2127, r2131; +} +{ +mul.f16x2 r2138, r1496, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1499, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2124; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2124, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1653, r2166; +} +{ +fma.rn.f16x2 r2171, r1650, r2164, r2168; +} +{ +mul.f16x2 r2175, r1650, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1653, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1807, r2203; +} +{ +fma.rn.f16x2 r2208, r1804, r2201, r2205; +} +{ +mul.f16x2 r2212, r1804, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1807, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1961, r2240; +} +{ +fma.rn.f16x2 r2245, r1958, r2238, r2242; +} +{ +mul.f16x2 r2249, r1958, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1961, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r2115, r2277; +} +{ +fma.rn.f16x2 r2282, r2112, r2275, r2279; +} +{ +mul.f16x2 r2286, r2112, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r2115, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 
r2314, {high, high}; +} +{ +mul.f16x2 r2316, r2121, r2314; +} +{ +fma.rn.f16x2 r2319, r2118, r2312, r2316; +} +{ +mul.f16x2 r2323, r2118, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r2121, r2312, r2326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2334, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2337, r2334, r2336; +} +{ +mul.f16x2 r2340, r2308, r2332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2343, {high, low}; +} +{ +fma.rn.f16x2 r2345, r2337, r2343, r2340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2349, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2351, {high, high}; +} +{ +mul.f16x2 r2353, r1967, r2351; +} +{ +fma.rn.f16x2 r2356, r1964, r2349, r2353; +} +{ +mul.f16x2 r2360, r1964, r2351; +} +{ +neg.f16x2 r2363, r2360; +} +{ +fma.rn.f16x2 r2365, r1967, r2349, r2363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2371, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2373, {low, high}; +} +{ +mul.f16x2 r2374, r2371, r2373; +} +{ +mul.f16x2 r2377, r2345, r2369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2345; +mov.b32 r2380, {high, low}; +} +{ +fma.rn.f16x2 r2382, r2374, r2380, r2377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2386, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2388, {high, high}; +} +{ +mul.f16x2 r2390, r1813, r2388; +} +{ +fma.rn.f16x2 r2393, r1810, r2386, r2390; +} +{ +mul.f16x2 r2397, r1810, r2388; +} +{ +neg.f16x2 r2400, r2397; +} +{ +fma.rn.f16x2 r2402, r1813, r2386, r2400; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2124; +mov.b32 r2406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2408, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2410, {low, high}; +} +{ +mul.f16x2 r2411, r2408, r2410; +} +{ +mul.f16x2 r2414, r2382, r2406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2382; +mov.b32 r2417, {high, low}; +} +{ +fma.rn.f16x2 r2419, r2411, r2417, r2414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2423, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2425, {high, high}; +} +{ +mul.f16x2 r2427, r1659, r2425; +} +{ +fma.rn.f16x2 r2430, r1656, r2423, r2427; +} +{ +mul.f16x2 r2434, r1656, r2425; +} +{ +neg.f16x2 r2437, r2434; +} +{ +fma.rn.f16x2 r2439, r1659, r2423, r2437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2443, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2124; +mov.b32 r2445, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f491; +mov.b32 r2447, {low, high}; +} +{ +mul.f16x2 r2448, r2445, r2447; +} +{ +mul.f16x2 r2451, r2419, r2443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2419; +mov.b32 r2454, {high, low}; +} +{ +fma.rn.f16x2 r2456, r2448, r2454, r2451; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2460, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2456; +mov.b32 r2462, {high, high}; +} +{ +mul.f16x2 r2464, r1505, r2462; +} +{ +fma.rn.f16x2 r2467, r1502, r2460, r2464; +} +{ +mul.f16x2 r2471, r1502, r2462; +} +{ +neg.f16x2 r2474, r2471; +} +{ +fma.rn.f16x2 r2476, r1505, r2460, r2474; +} +barrier.sync 0; +mad.lo.s32 r3373, r3368, 484, r3372; +st.shared.u32 [r3373], r1348; +st.shared.u32 [r3373+44], r2134; +st.shared.u32 [r3373+88], r2171; +st.shared.u32 [r3373+132], r2208; +st.shared.u32 [r3373+176], r2245; +st.shared.u32 [r3373+220], r2282; +st.shared.u32 [r3373+264], r2319; 
+st.shared.u32 [r3373+308], r2356; +st.shared.u32 [r3373+352], r2393; +st.shared.u32 [r3373+396], r2430; +st.shared.u32 [r3373+440], r2467; +barrier.sync 0; +ld.shared.u32 r2558, [r3367]; +ld.shared.u32 r2498, [r3367+484]; +ld.shared.u32 r2510, [r3367+968]; +ld.shared.u32 r2522, [r3367+1452]; +ld.shared.u32 r2534, [r3367+1936]; +ld.shared.u32 r2546, [r3367+2420]; +ld.shared.u32 r2547, [r3367+2904]; +ld.shared.u32 r2535, [r3367+3388]; +ld.shared.u32 r2523, [r3367+3872]; +ld.shared.u32 r2511, [r3367+4356]; +ld.shared.u32 r2499, [r3367+4840]; +barrier.sync 0; +st.shared.u32 [r3373], r1351; +st.shared.u32 [r3373+44], r2143; +st.shared.u32 [r3373+88], r2180; +st.shared.u32 [r3373+132], r2217; +st.shared.u32 [r3373+176], r2254; +st.shared.u32 [r3373+220], r2291; +st.shared.u32 [r3373+264], r2328; +st.shared.u32 [r3373+308], r2365; +st.shared.u32 [r3373+352], r2402; +st.shared.u32 [r3373+396], r2439; +st.shared.u32 [r3373+440], r2476; +barrier.sync 0; +ld.shared.u32 r2561, [r3367]; +ld.shared.u32 r2501, [r3367+484]; +ld.shared.u32 r2513, [r3367+968]; +ld.shared.u32 r2525, [r3367+1452]; +ld.shared.u32 r2537, [r3367+1936]; +ld.shared.u32 r2549, [r3367+2420]; +ld.shared.u32 r2550, [r3367+2904]; +ld.shared.u32 r2538, [r3367+3388]; +ld.shared.u32 r2526, [r3367+3872]; +ld.shared.u32 r2514, [r3367+4356]; +ld.shared.u32 r2502, [r3367+4840]; +{ +add.f16x2 r2497, r2498, r2499; +} +{ +add.f16x2 r2500, r2501, r2502; +} +{ +sub.f16x2 r2503, r2498, r2499; +} +{ +sub.f16x2 r2506, r2501, r2502; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2514; +} +{ +sub.f16x2 r2515, r2510, r2511; +} +{ +sub.f16x2 r2518, r2513, r2514; +} +{ +add.f16x2 r2521, r2522, r2523; +} +{ +add.f16x2 r2524, r2525, r2526; +} +{ +sub.f16x2 r2527, r2522, r2523; +} +{ +sub.f16x2 r2530, r2525, r2526; +} +{ +add.f16x2 r2533, r2534, r2535; +} +{ +add.f16x2 r2536, r2537, r2538; +} +{ +sub.f16x2 r2539, r2534, r2535; +} +{ +sub.f16x2 r2542, r2537, r2538; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ 
+add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2497; +} +{ +add.f16x2 r2560, r2561, r2500; +} +{ +add.f16x2 r2563, r2557, r2509; +} +{ +add.f16x2 r2566, r2560, r2512; +} +{ +add.f16x2 r2569, r2563, r2521; +} +{ +add.f16x2 r2572, r2566, r2524; +} +{ +add.f16x2 r2575, r2569, r2533; +} +{ +add.f16x2 r2578, r2572, r2536; +} +{ +add.f16x2 %0, r2575, r2545; +} +{ +add.f16x2 %1, r2578, r2548; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2589, {low, high}; +} +{ +mul.f16x2 r2590, r2497, r2589; +} +{ +add.f16x2 r2593, r2558, r2590; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2596, {low, high}; +} +{ +mul.f16x2 r2597, r2506, r2596; +} +{ +add.f16x2 r2600, r2587, r2597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2603, {low, high}; +} +{ +mul.f16x2 r2604, r2500, r2603; +} +{ +add.f16x2 r2607, r2561, r2604; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2610, {low, high}; +} +{ +mul.f16x2 r2611, r2503, r2610; +} +{ +add.f16x2 r2614, r2588, r2611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2617, {low, high}; +} +{ +mul.f16x2 r2618, r2509, r2617; +} +{ +add.f16x2 r2621, r2593, r2618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2624, {low, high}; +} +{ +mul.f16x2 r2625, r2518, r2624; +} +{ +add.f16x2 r2628, r2600, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2631, {low, high}; +} +{ +mul.f16x2 r2632, r2512, r2631; +} +{ +add.f16x2 
r2635, r2607, r2632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2638, {low, high}; +} +{ +mul.f16x2 r2639, r2515, r2638; +} +{ +add.f16x2 r2642, r2614, r2639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2645, {low, high}; +} +{ +mul.f16x2 r2646, r2521, r2645; +} +{ +add.f16x2 r2649, r2621, r2646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2530, r2652; +} +{ +add.f16x2 r2656, r2628, r2653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2659, {low, high}; +} +{ +mul.f16x2 r2660, r2524, r2659; +} +{ +add.f16x2 r2663, r2635, r2660; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2666, {low, high}; +} +{ +mul.f16x2 r2667, r2527, r2666; +} +{ +add.f16x2 r2670, r2642, r2667; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2673, {low, high}; +} +{ +mul.f16x2 r2674, r2533, r2673; +} +{ +add.f16x2 r2677, r2649, r2674; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2680, {low, high}; +} +{ +mul.f16x2 r2681, r2542, r2680; +} +{ +add.f16x2 r2684, r2656, r2681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2687, {low, high}; +} +{ +mul.f16x2 r2688, r2536, r2687; +} +{ +add.f16x2 r2691, r2663, r2688; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2694, {low, high}; +} +{ +mul.f16x2 r2695, r2539, r2694; +} +{ +add.f16x2 r2698, r2670, r2695; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2701, {low, high}; +} +{ +mul.f16x2 r2702, r2545, r2701; +} +{ +add.f16x2 r2705, r2677, r2702; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2708, 
{low, high}; +} +{ +mul.f16x2 r2709, r2554, r2708; +} +{ +add.f16x2 r2712, r2684, r2709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2715, {low, high}; +} +{ +mul.f16x2 r2716, r2548, r2715; +} +{ +add.f16x2 r2719, r2691, r2716; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r2722, {low, high}; +} +{ +mul.f16x2 r2723, r2551, r2722; +} +{ +add.f16x2 r2726, r2698, r2723; +} +{ +sub.f16x2 %2, r2705, r2712; +} +{ +add.f16x2 %3, r2719, r2726; +} +{ +add.f16x2 %20, r2705, r2712; +} +{ +sub.f16x2 %21, r2719, r2726; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2743, {low, high}; +} +{ +mul.f16x2 r2744, r2497, r2743; +} +{ +add.f16x2 r2747, r2558, r2744; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2750, {low, high}; +} +{ +mul.f16x2 r2751, r2506, r2750; +} +{ +add.f16x2 r2754, r2741, r2751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2757, {low, high}; +} +{ +mul.f16x2 r2758, r2500, r2757; +} +{ +add.f16x2 r2761, r2561, r2758; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f59; +cvt.rn.f16.f32 high, f59; +mov.b32 r2764, {low, high}; +} +{ +mul.f16x2 r2765, r2503, r2764; +} +{ +add.f16x2 r2768, r2742, r2765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2771, {low, high}; +} +{ +mul.f16x2 r2772, r2509, r2771; +} +{ +add.f16x2 r2775, r2747, r2772; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2778, {low, high}; +} +{ +mul.f16x2 r2779, r2518, r2778; +} +{ +add.f16x2 r2782, r2754, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r2785, {low, high}; +} +{ +mul.f16x2 r2786, r2512, r2785; +} +{ +add.f16x2 r2789, r2761, r2786; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r2792, {low, high}; +} +{ +mul.f16x2 r2793, r2515, r2792; +} +{ +add.f16x2 r2796, r2768, r2793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2799, {low, high}; +} +{ +mul.f16x2 r2800, r2521, r2799; +} +{ +add.f16x2 r2803, r2775, r2800; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2806, {low, high}; +} +{ +mul.f16x2 r2807, r2530, r2806; +} +{ +add.f16x2 r2810, r2782, r2807; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2524, r2813; +} +{ +add.f16x2 r2817, r2789, r2814; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2820, {low, high}; +} +{ +mul.f16x2 r2821, r2527, r2820; +} +{ +add.f16x2 r2824, r2796, r2821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2827, {low, high}; +} +{ +mul.f16x2 r2828, r2533, r2827; +} +{ +add.f16x2 r2831, r2803, r2828; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2834, {low, high}; +} +{ +mul.f16x2 r2835, r2542, r2834; +} +{ +add.f16x2 r2838, r2810, r2835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2841, {low, high}; +} +{ +mul.f16x2 r2842, r2536, r2841; +} +{ +add.f16x2 r2845, r2817, r2842; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r2848, {low, high}; +} +{ +mul.f16x2 r2849, r2539, r2848; +} +{ +add.f16x2 r2852, r2824, r2849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2855, {low, high}; +} +{ +mul.f16x2 r2856, r2545, r2855; +} +{ 
+add.f16x2 r2859, r2831, r2856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2862, {low, high}; +} +{ +mul.f16x2 r2863, r2554, r2862; +} +{ +add.f16x2 r2866, r2838, r2863; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2869, {low, high}; +} +{ +mul.f16x2 r2870, r2548, r2869; +} +{ +add.f16x2 r2873, r2845, r2870; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r2876, {low, high}; +} +{ +mul.f16x2 r2877, r2551, r2876; +} +{ +add.f16x2 r2880, r2852, r2877; +} +{ +sub.f16x2 %4, r2859, r2866; +} +{ +add.f16x2 %5, r2873, r2880; +} +{ +add.f16x2 %18, r2859, r2866; +} +{ +sub.f16x2 %19, r2873, r2880; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2895, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r2896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2897, {low, high}; +} +{ +mul.f16x2 r2898, r2497, r2897; +} +{ +add.f16x2 r2901, r2558, r2898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2904, {low, high}; +} +{ +mul.f16x2 r2905, r2506, r2904; +} +{ +add.f16x2 r2908, r2895, r2905; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r2911, {low, high}; +} +{ +mul.f16x2 r2912, r2500, r2911; +} +{ +add.f16x2 r2915, r2561, r2912; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r2918, {low, high}; +} +{ +mul.f16x2 r2919, r2503, r2918; +} +{ +add.f16x2 r2922, r2896, r2919; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2925, {low, high}; +} +{ +mul.f16x2 r2926, r2509, r2925; +} +{ +add.f16x2 r2929, r2901, r2926; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 
r2932, {low, high}; +} +{ +mul.f16x2 r2933, r2518, r2932; +} +{ +add.f16x2 r2936, r2908, r2933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r2939, {low, high}; +} +{ +mul.f16x2 r2940, r2512, r2939; +} +{ +add.f16x2 r2943, r2915, r2940; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f111; +cvt.rn.f16.f32 high, f111; +mov.b32 r2946, {low, high}; +} +{ +mul.f16x2 r2947, r2515, r2946; +} +{ +add.f16x2 r2950, r2922, r2947; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2953, {low, high}; +} +{ +mul.f16x2 r2954, r2521, r2953; +} +{ +add.f16x2 r2957, r2929, r2954; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2960, {low, high}; +} +{ +mul.f16x2 r2961, r2530, r2960; +} +{ +add.f16x2 r2964, r2936, r2961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r2967, {low, high}; +} +{ +mul.f16x2 r2968, r2524, r2967; +} +{ +add.f16x2 r2971, r2943, r2968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r2974, {low, high}; +} +{ +mul.f16x2 r2975, r2527, r2974; +} +{ +add.f16x2 r2978, r2950, r2975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2981, {low, high}; +} +{ +mul.f16x2 r2982, r2533, r2981; +} +{ +add.f16x2 r2985, r2957, r2982; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r2988, {low, high}; +} +{ +mul.f16x2 r2989, r2542, r2988; +} +{ +add.f16x2 r2992, r2964, r2989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r2995, {low, high}; +} +{ +mul.f16x2 r2996, r2536, r2995; +} +{ +add.f16x2 r2999, r2971, r2996; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3002, {low, high}; +} +{ +mul.f16x2 r3003, r2539, r3002; +} +{ +add.f16x2 r3006, r2978, r3003; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r2545, r3009; +} +{ +add.f16x2 r3013, r2985, r3010; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3016, {low, high}; +} +{ +mul.f16x2 r3017, r2554, r3016; +} +{ +add.f16x2 r3020, r2992, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3023, {low, high}; +} +{ +mul.f16x2 r3024, r2548, r3023; +} +{ +add.f16x2 r3027, r2999, r3024; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3030, {low, high}; +} +{ +mul.f16x2 r3031, r2551, r3030; +} +{ +add.f16x2 r3034, r3006, r3031; +} +{ +sub.f16x2 %6, r3013, r3020; +} +{ +add.f16x2 %7, r3027, r3034; +} +{ +add.f16x2 %16, r3013, r3020; +} +{ +sub.f16x2 %17, r3027, r3034; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3051, {low, high}; +} +{ +mul.f16x2 r3052, r2497, r3051; +} +{ +add.f16x2 r3055, r2558, r3052; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3058, {low, high}; +} +{ +mul.f16x2 r3059, r2506, r3058; +} +{ +add.f16x2 r3062, r3049, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3065, {low, high}; +} +{ +mul.f16x2 r3066, r2500, r3065; +} +{ +add.f16x2 r3069, r2561, r3066; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3072, {low, high}; +} +{ +mul.f16x2 r3073, r2503, r3072; +} +{ +add.f16x2 r3076, r3050, r3073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3079, {low, high}; +} +{ +mul.f16x2 r3080, r2509, r3079; 
+} +{ +add.f16x2 r3083, r3055, r3080; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3086, {low, high}; +} +{ +mul.f16x2 r3087, r2518, r3086; +} +{ +add.f16x2 r3090, r3062, r3087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3093, {low, high}; +} +{ +mul.f16x2 r3094, r2512, r3093; +} +{ +add.f16x2 r3097, r3069, r3094; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f155; +cvt.rn.f16.f32 high, f155; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r2515, r3100; +} +{ +add.f16x2 r3104, r3076, r3101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3107, {low, high}; +} +{ +mul.f16x2 r3108, r2521, r3107; +} +{ +add.f16x2 r3111, r3083, r3108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3114, {low, high}; +} +{ +mul.f16x2 r3115, r2530, r3114; +} +{ +add.f16x2 r3118, r3090, r3115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3121, {low, high}; +} +{ +mul.f16x2 r3122, r2524, r3121; +} +{ +add.f16x2 r3125, r3097, r3122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f163; +cvt.rn.f16.f32 high, f163; +mov.b32 r3128, {low, high}; +} +{ +mul.f16x2 r3129, r2527, r3128; +} +{ +add.f16x2 r3132, r3104, r3129; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3135, {low, high}; +} +{ +mul.f16x2 r3136, r2533, r3135; +} +{ +add.f16x2 r3139, r3111, r3136; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3142, {low, high}; +} +{ +mul.f16x2 r3143, r2542, r3142; +} +{ +add.f16x2 r3146, r3118, r3143; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3149, {low, high}; +} +{ +mul.f16x2 r3150, r2536, r3149; +} +{ +add.f16x2 r3153, r3125, r3150; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, 
f191; +mov.b32 r3156, {low, high}; +} +{ +mul.f16x2 r3157, r2539, r3156; +} +{ +add.f16x2 r3160, r3132, r3157; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3163, {low, high}; +} +{ +mul.f16x2 r3164, r2545, r3163; +} +{ +add.f16x2 r3167, r3139, r3164; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3170, {low, high}; +} +{ +mul.f16x2 r3171, r2554, r3170; +} +{ +add.f16x2 r3174, r3146, r3171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r2548, r3177; +} +{ +add.f16x2 r3181, r3153, r3178; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3184, {low, high}; +} +{ +mul.f16x2 r3185, r2551, r3184; +} +{ +add.f16x2 r3188, r3160, r3185; +} +{ +sub.f16x2 %8, r3167, r3174; +} +{ +add.f16x2 %9, r3181, r3188; +} +{ +add.f16x2 %14, r3167, r3174; +} +{ +sub.f16x2 %15, r3181, r3188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f183; +cvt.rn.f16.f32 high, f183; +mov.b32 r3204, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3205, {low, high}; +} +{ +mul.f16x2 r3206, r2497, r3205; +} +{ +add.f16x2 r3209, r2558, r3206; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3212, {low, high}; +} +{ +mul.f16x2 r3213, r2506, r3212; +} +{ +add.f16x2 r3216, r3203, r3213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f189; +cvt.rn.f16.f32 high, f189; +mov.b32 r3219, {low, high}; +} +{ +mul.f16x2 r3220, r2500, r3219; +} +{ +add.f16x2 r3223, r2561, r3220; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f191; +cvt.rn.f16.f32 high, f191; +mov.b32 r3226, {low, high}; +} +{ +mul.f16x2 r3227, r2503, r3226; +} +{ +add.f16x2 r3230, r3204, r3227; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3233, {low, high}; +} +{ +mul.f16x2 r3234, r2509, r3233; +} +{ +add.f16x2 r3237, r3209, r3234; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3240, {low, high}; +} +{ +mul.f16x2 r3241, r2518, r3240; +} +{ +add.f16x2 r3244, r3216, r3241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f197; +cvt.rn.f16.f32 high, f197; +mov.b32 r3247, {low, high}; +} +{ +mul.f16x2 r3248, r2512, r3247; +} +{ +add.f16x2 r3251, r3223, r3248; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f199; +cvt.rn.f16.f32 high, f199; +mov.b32 r3254, {low, high}; +} +{ +mul.f16x2 r3255, r2515, r3254; +} +{ +add.f16x2 r3258, r3230, r3255; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3261, {low, high}; +} +{ +mul.f16x2 r3262, r2521, r3261; +} +{ +add.f16x2 r3265, r3237, r3262; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r2530, r3268; +} +{ +add.f16x2 r3272, r3244, r3269; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f205; +cvt.rn.f16.f32 high, f205; +mov.b32 r3275, {low, high}; +} +{ +mul.f16x2 r3276, r2524, r3275; +} +{ +add.f16x2 r3279, r3251, r3276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f207; +cvt.rn.f16.f32 high, f207; +mov.b32 r3282, {low, high}; +} +{ +mul.f16x2 r3283, r2527, r3282; +} +{ +add.f16x2 r3286, r3258, r3283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3289, {low, high}; +} +{ +mul.f16x2 r3290, r2533, r3289; +} +{ +add.f16x2 r3293, r3265, r3290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3296, {low, high}; +} +{ +mul.f16x2 r3297, r2542, r3296; +} +{ +add.f16x2 r3300, r3272, r3297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f213; +cvt.rn.f16.f32 high, f213; +mov.b32 r3303, {low, high}; +} +{ +mul.f16x2 r3304, 
r2536, r3303; +} +{ +add.f16x2 r3307, r3279, r3304; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f215; +cvt.rn.f16.f32 high, f215; +mov.b32 r3310, {low, high}; +} +{ +mul.f16x2 r3311, r2539, r3310; +} +{ +add.f16x2 r3314, r3286, r3311; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3317, {low, high}; +} +{ +mul.f16x2 r3318, r2545, r3317; +} +{ +add.f16x2 r3321, r3293, r3318; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3324, {low, high}; +} +{ +mul.f16x2 r3325, r2554, r3324; +} +{ +add.f16x2 r3328, r3300, r3325; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f221; +cvt.rn.f16.f32 high, f221; +mov.b32 r3331, {low, high}; +} +{ +mul.f16x2 r3332, r2548, r3331; +} +{ +add.f16x2 r3335, r3307, r3332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f223; +cvt.rn.f16.f32 high, f223; +mov.b32 r3338, {low, high}; +} +{ +mul.f16x2 r3339, r2551, r3338; +} +{ +add.f16x2 r3342, r3314, r3339; +} +{ +sub.f16x2 %10, r3321, r3328; +} +{ +add.f16x2 %11, r3335, r3342; +} +{ +add.f16x2 %12, r3321, r3328; +} +{ +sub.f16x2 %13, r3335, r3342; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..425b5950d644e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_fwd.hpp.inc @@ -0,0 +1,1518 @@ +#ifndef CUFFTDX_FFT_1331_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_1331_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<182, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<781>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 10648, r2; +add.f32 f45, %27, %51; +add.f32 f46, %29, %52; +sub.f32 f47, %27, %51; +sub.f32 f48, %29, %52; +add.f32 f49, %30, %49; +add.f32 f50, %32, %50; +sub.f32 f51, %30, %49; +sub.f32 f52, %32, %50; +add.f32 f53, %33, %46; +add.f32 f54, %34, %48; +sub.f32 f55, %33, %46; +sub.f32 f56, %34, %48; +add.f32 f57, %35, %43; +add.f32 f58, %37, %45; +sub.f32 f59, %35, %43; +sub.f32 f60, %37, %45; +add.f32 f61, %38, %41; +add.f32 f62, %40, %42; +sub.f32 f63, %38, %41; +sub.f32 f64, %40, %42; +mov.u32 r4, %tid.x; +add.f32 f65, %25, f45; +add.f32 f66, %26, f46; +add.f32 
f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %25; +fma.rn.f32 f74, f48, 0fBF0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %26; +fma.rn.f32 f76, f47, 0fBF0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0fBF68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0fBF68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0fBF7D64F0, f78; +fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, f55, 0fBF7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0fBF4178CE, f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0fBF4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0fBE903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0fBE903F40, f88; +sub.f32 f93, f89, f90; +add.f32 f94, f92, f91; +add.f32 f95, f90, f89; +sub.f32 f96, f91, f92; +fma.rn.f32 f97, f45, 0f3ED4B147, %25; +fma.rn.f32 f98, f48, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f99, f46, 0f3ED4B147, %26; +fma.rn.f32 f100, f47, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f101, f49, 0fBF27A4F4, f97; +fma.rn.f32 f102, f52, 0fBF4178CE, f98; +fma.rn.f32 f103, f50, 0fBF27A4F4, f99; +fma.rn.f32 f104, f51, 0fBF4178CE, f100; +fma.rn.f32 f105, f53, 0fBF75A155, f101; +fma.rn.f32 f106, f56, 0f3E903F40, f102; +fma.rn.f32 f107, f54, 0fBF75A155, f103; +fma.rn.f32 f108, f55, 0f3E903F40, f104; +fma.rn.f32 f109, f57, 0fBE11BAFB, f105; +fma.rn.f32 f110, f60, 0f3F7D64F0, f106; +fma.rn.f32 f111, f58, 0fBE11BAFB, f107; +fma.rn.f32 f112, f59, 0f3F7D64F0, f108; +fma.rn.f32 f113, f61, 0f3F575C64, f109; +fma.rn.f32 f114, f64, 0f3F0A6770, f110; +fma.rn.f32 f115, f62, 0f3F575C64, f111; +fma.rn.f32 f116, f63, 0f3F0A6770, f112; +sub.f32 f117, f113, f114; +add.f32 f118, f116, f115; +add.f32 f119, f114, f113; +sub.f32 f120, f115, f116; +fma.rn.f32 f121, f45, 0fBE11BAFB, %25; 
+fma.rn.f32 f122, f48, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f123, f46, 0fBE11BAFB, %26; +fma.rn.f32 f124, f47, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f125, f49, 0fBF75A155, f121; +fma.rn.f32 f126, f52, 0f3E903F40, f122; +fma.rn.f32 f127, f50, 0fBF75A155, f123; +fma.rn.f32 f128, f51, 0f3E903F40, f124; +fma.rn.f32 f129, f53, 0f3ED4B147, f125; +fma.rn.f32 f130, f56, 0f3F68DDA4, f126; +fma.rn.f32 f131, f54, 0f3ED4B147, f127; +fma.rn.f32 f132, f55, 0f3F68DDA4, f128; +fma.rn.f32 f133, f57, 0f3F575C64, f129; +fma.rn.f32 f134, f60, 0fBF0A6770, f130; +fma.rn.f32 f135, f58, 0f3F575C64, f131; +fma.rn.f32 f136, f59, 0fBF0A6770, f132; +fma.rn.f32 f137, f61, 0fBF27A4F4, f133; +fma.rn.f32 f138, f64, 0fBF4178CE, f134; +fma.rn.f32 f139, f62, 0fBF27A4F4, f135; +fma.rn.f32 f140, f63, 0fBF4178CE, f136; +sub.f32 f141, f137, f138; +add.f32 f142, f140, f139; +add.f32 f143, f138, f137; +sub.f32 f144, f139, f140; +fma.rn.f32 f145, f45, 0fBF27A4F4, %25; +fma.rn.f32 f146, f48, 0fBF4178CE, 0f00000000; +fma.rn.f32 f147, f46, 0fBF27A4F4, %26; +fma.rn.f32 f148, f47, 0fBF4178CE, 0f00000000; +fma.rn.f32 f149, f49, 0fBE11BAFB, f145; +fma.rn.f32 f150, f52, 0f3F7D64F0, f146; +fma.rn.f32 f151, f50, 0fBE11BAFB, f147; +fma.rn.f32 f152, f51, 0f3F7D64F0, f148; +fma.rn.f32 f153, f53, 0f3F575C64, f149; +fma.rn.f32 f154, f56, 0fBF0A6770, f150; +fma.rn.f32 f155, f54, 0f3F575C64, f151; +fma.rn.f32 f156, f55, 0fBF0A6770, f152; +fma.rn.f32 f157, f57, 0fBF75A155, f153; +fma.rn.f32 f158, f60, 0fBE903F40, f154; +fma.rn.f32 f159, f58, 0fBF75A155, f155; +fma.rn.f32 f160, f59, 0fBE903F40, f156; +fma.rn.f32 f161, f61, 0f3ED4B147, f157; +fma.rn.f32 f162, f64, 0f3F68DDA4, f158; +fma.rn.f32 f163, f62, 0f3ED4B147, f159; +fma.rn.f32 f164, f63, 0f3F68DDA4, f160; +sub.f32 f165, f161, f162; +add.f32 f166, f164, f163; +add.f32 f167, f162, f161; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f45, 0fBF75A155, %25; +fma.rn.f32 f170, f48, 0fBE903F40, 0f00000000; +fma.rn.f32 f171, f46, 0fBF75A155, %26; +fma.rn.f32 f172, f47, 0fBE903F40, 
0f00000000; +fma.rn.f32 f173, f49, 0f3F575C64, f169; +fma.rn.f32 f174, f52, 0f3F0A6770, f170; +fma.rn.f32 f175, f50, 0f3F575C64, f171; +fma.rn.f32 f176, f51, 0f3F0A6770, f172; +fma.rn.f32 f177, f53, 0fBF27A4F4, f173; +fma.rn.f32 f178, f56, 0fBF4178CE, f174; +fma.rn.f32 f179, f54, 0fBF27A4F4, f175; +fma.rn.f32 f180, f55, 0fBF4178CE, f176; +fma.rn.f32 f181, f57, 0f3ED4B147, f177; +fma.rn.f32 f182, f60, 0f3F68DDA4, f178; +fma.rn.f32 f183, f58, 0f3ED4B147, f179; +fma.rn.f32 f184, f59, 0f3F68DDA4, f180; +fma.rn.f32 f185, f61, 0fBE11BAFB, f181; +fma.rn.f32 f186, f64, 0fBF7D64F0, f182; +fma.rn.f32 f187, f62, 0fBE11BAFB, f183; +fma.rn.f32 f188, f63, 0fBF7D64F0, f184; +sub.f32 f189, f185, f186; +add.f32 f190, f188, f187; +add.f32 f191, f186, f185; +sub.f32 f192, f187, f188; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 10648, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f193, f194}, [rd6]; +mul.f32 f197, f193, f93; +mul.f32 f198, f194, f94; +mul.f32 f199, f193, f94; +mul.f32 f200, f193, f193; +mul.f32 f201, f194, f194; +sub.f32 f202, f200, f201; +mul.f32 f203, f194, f193; +fma.rn.f32 f204, f194, f193, f203; +mul.f32 f205, f202, f117; +mul.f32 f206, f204, f118; +mul.f32 f207, f202, f118; +mul.f32 f208, f193, f202; +mul.f32 f209, f194, f204; +sub.f32 f210, f208, f209; +mul.f32 f211, f193, f204; +fma.rn.f32 f212, f194, f202, f211; +mul.f32 f213, f210, f141; +mul.f32 f214, f212, f142; +mul.f32 f215, f210, f142; +mul.f32 f216, f193, f210; +mul.f32 f217, f194, f212; +sub.f32 f218, f216, f217; +mul.f32 f219, f193, f212; +fma.rn.f32 f220, f194, f210, f219; +mul.f32 f221, f218, f165; +mul.f32 f222, f220, f166; +mul.f32 f223, f218, f166; +mul.f32 f224, f193, f218; +mul.f32 f225, f194, f220; +sub.f32 f226, f224, f225; +mul.f32 f227, f193, f220; +fma.rn.f32 
f228, f194, f218, f227; +mul.f32 f229, f226, f189; +mul.f32 f230, f228, f190; +mul.f32 f231, f226, f190; +mul.f32 f232, f193, f226; +mul.f32 f233, f194, f228; +sub.f32 f234, f232, f233; +mul.f32 f235, f193, f228; +fma.rn.f32 f236, f194, f226, f235; +mul.f32 f237, f234, f191; +mul.f32 f238, f236, f192; +mul.f32 f239, f234, f192; +mul.f32 f240, f193, f234; +mul.f32 f241, f194, f236; +sub.f32 f242, f240, f241; +mul.f32 f243, f193, f236; +fma.rn.f32 f244, f194, f234, f243; +mul.f32 f245, f242, f167; +mul.f32 f246, f244, f168; +mul.f32 f247, f242, f168; +mul.f32 f248, f193, f242; +mul.f32 f249, f194, f244; +sub.f32 f250, f248, f249; +mul.f32 f251, f193, f244; +fma.rn.f32 f252, f194, f242, f251; +mul.f32 f253, f250, f143; +mul.f32 f254, f252, f144; +mul.f32 f255, f250, f144; +mul.f32 f256, f193, f250; +mul.f32 f257, f194, f252; +sub.f32 f258, f256, f257; +mul.f32 f259, f193, f252; +fma.rn.f32 f260, f194, f250, f259; +mul.f32 f261, f258, f119; +mul.f32 f262, f260, f120; +mul.f32 f263, f258, f120; +mul.f32 f264, f193, f258; +mul.f32 f265, f194, f260; +sub.f32 f266, f264, f265; +mul.f32 f267, f193, f260; +fma.rn.f32 f268, f194, f258, f267; +mul.f32 f269, f266, f95; +mul.f32 f270, f268, f96; +mul.f32 f271, f266, f96; +barrier.sync 0; +mad.lo.s32 r13, r11, 88, r12; +add.f32 f272, f72, f62; +add.f32 f273, f71, f61; +st.shared.v2.f32 [r13], {f273, f272}; +fma.rn.f32 f274, f194, f93, f199; +sub.f32 f275, f197, f198; +st.shared.v2.f32 [r13+8], {f275, f274}; +fma.rn.f32 f276, f204, f117, f207; +sub.f32 f277, f205, f206; +st.shared.v2.f32 [r13+16], {f277, f276}; +sub.f32 f278, f213, f214; +fma.rn.f32 f279, f212, f141, f215; +st.shared.v2.f32 [r13+24], {f278, f279}; +fma.rn.f32 f280, f220, f165, f223; +sub.f32 f281, f221, f222; +st.shared.v2.f32 [r13+32], {f281, f280}; +fma.rn.f32 f282, f228, f189, f231; +sub.f32 f283, f229, f230; +st.shared.v2.f32 [r13+40], {f283, f282}; +fma.rn.f32 f284, f236, f191, f239; +sub.f32 f285, f237, f238; +st.shared.v2.f32 [r13+48], {f285, f284}; 
+fma.rn.f32 f286, f244, f167, f247; +sub.f32 f287, f245, f246; +st.shared.v2.f32 [r13+56], {f287, f286}; +fma.rn.f32 f288, f252, f143, f255; +sub.f32 f289, f253, f254; +st.shared.v2.f32 [r13+64], {f289, f288}; +fma.rn.f32 f290, f260, f119, f263; +sub.f32 f291, f261, f262; +st.shared.v2.f32 [r13+72], {f291, f290}; +fma.rn.f32 f292, f268, f95, f271; +sub.f32 f293, f269, f270; +st.shared.v2.f32 [r13+80], {f293, f292}; +barrier.sync 0; +mad.lo.s32 r14, r11, -80, r13; +ld.shared.v2.f32 {f294, f295}, [r14]; +ld.shared.v2.f32 {f298, f299}, [r14+968]; +ld.shared.v2.f32 {f302, f303}, [r14+1936]; +ld.shared.v2.f32 {f306, f307}, [r14+2904]; +ld.shared.v2.f32 {f310, f311}, [r14+3872]; +ld.shared.v2.f32 {f314, f315}, [r14+4840]; +ld.shared.v2.f32 {f318, f319}, [r14+5808]; +ld.shared.v2.f32 {f322, f323}, [r14+6776]; +ld.shared.v2.f32 {f326, f327}, [r14+7744]; +ld.shared.v2.f32 {f330, f331}, [r14+8712]; +ld.shared.v2.f32 {f334, f335}, [r14+9680]; +add.f32 f338, f298, f334; +add.f32 f339, f299, f335; +sub.f32 f340, f298, f334; +sub.f32 f341, f299, f335; +add.f32 f342, f302, f330; +add.f32 f343, f303, f331; +sub.f32 f344, f302, f330; +sub.f32 f345, f303, f331; +add.f32 f346, f306, f326; +add.f32 f347, f307, f327; +sub.f32 f348, f306, f326; +sub.f32 f349, f307, f327; +add.f32 f350, f310, f322; +add.f32 f351, f311, f323; +sub.f32 f352, f310, f322; +sub.f32 f353, f311, f323; +add.f32 f354, f314, f318; +add.f32 f355, f315, f319; +sub.f32 f356, f314, f318; +sub.f32 f357, f315, f319; +add.f32 f358, f294, f338; +add.f32 f359, f295, f339; +add.f32 f360, f358, f342; +add.f32 f361, f359, f343; +add.f32 f362, f360, f346; +add.f32 f363, f361, f347; +add.f32 f364, f362, f350; +add.f32 f365, f363, f351; +fma.rn.f32 f366, f338, 0f3F575C64, f294; +fma.rn.f32 f367, f341, 0fBF0A6770, 0f00000000; +fma.rn.f32 f368, f339, 0f3F575C64, f295; +fma.rn.f32 f369, f340, 0fBF0A6770, 0f00000000; +fma.rn.f32 f370, f342, 0f3ED4B147, f366; +fma.rn.f32 f371, f345, 0fBF68DDA4, f367; +fma.rn.f32 f372, f343, 
0f3ED4B147, f368; +fma.rn.f32 f373, f344, 0fBF68DDA4, f369; +fma.rn.f32 f374, f346, 0fBE11BAFB, f370; +fma.rn.f32 f375, f349, 0fBF7D64F0, f371; +fma.rn.f32 f376, f347, 0fBE11BAFB, f372; +fma.rn.f32 f377, f348, 0fBF7D64F0, f373; +fma.rn.f32 f378, f350, 0fBF27A4F4, f374; +fma.rn.f32 f379, f353, 0fBF4178CE, f375; +fma.rn.f32 f380, f351, 0fBF27A4F4, f376; +fma.rn.f32 f381, f352, 0fBF4178CE, f377; +fma.rn.f32 f382, f354, 0fBF75A155, f378; +fma.rn.f32 f383, f357, 0fBE903F40, f379; +fma.rn.f32 f384, f355, 0fBF75A155, f380; +fma.rn.f32 f385, f356, 0fBE903F40, f381; +sub.f32 f386, f382, f383; +add.f32 f387, f385, f384; +add.f32 f388, f383, f382; +sub.f32 f389, f384, f385; +fma.rn.f32 f390, f338, 0f3ED4B147, f294; +fma.rn.f32 f391, f341, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f392, f339, 0f3ED4B147, f295; +fma.rn.f32 f393, f340, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f394, f342, 0fBF27A4F4, f390; +fma.rn.f32 f395, f345, 0fBF4178CE, f391; +fma.rn.f32 f396, f343, 0fBF27A4F4, f392; +fma.rn.f32 f397, f344, 0fBF4178CE, f393; +fma.rn.f32 f398, f346, 0fBF75A155, f394; +fma.rn.f32 f399, f349, 0f3E903F40, f395; +fma.rn.f32 f400, f347, 0fBF75A155, f396; +fma.rn.f32 f401, f348, 0f3E903F40, f397; +fma.rn.f32 f402, f350, 0fBE11BAFB, f398; +fma.rn.f32 f403, f353, 0f3F7D64F0, f399; +fma.rn.f32 f404, f351, 0fBE11BAFB, f400; +fma.rn.f32 f405, f352, 0f3F7D64F0, f401; +fma.rn.f32 f406, f354, 0f3F575C64, f402; +fma.rn.f32 f407, f357, 0f3F0A6770, f403; +fma.rn.f32 f408, f355, 0f3F575C64, f404; +fma.rn.f32 f409, f356, 0f3F0A6770, f405; +sub.f32 f410, f406, f407; +add.f32 f411, f409, f408; +add.f32 f412, f407, f406; +sub.f32 f413, f408, f409; +fma.rn.f32 f414, f338, 0fBE11BAFB, f294; +fma.rn.f32 f415, f341, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f416, f339, 0fBE11BAFB, f295; +fma.rn.f32 f417, f340, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f418, f342, 0fBF75A155, f414; +fma.rn.f32 f419, f345, 0f3E903F40, f415; +fma.rn.f32 f420, f343, 0fBF75A155, f416; +fma.rn.f32 f421, f344, 0f3E903F40, f417; +fma.rn.f32 f422, 
f346, 0f3ED4B147, f418; +fma.rn.f32 f423, f349, 0f3F68DDA4, f419; +fma.rn.f32 f424, f347, 0f3ED4B147, f420; +fma.rn.f32 f425, f348, 0f3F68DDA4, f421; +fma.rn.f32 f426, f350, 0f3F575C64, f422; +fma.rn.f32 f427, f353, 0fBF0A6770, f423; +fma.rn.f32 f428, f351, 0f3F575C64, f424; +fma.rn.f32 f429, f352, 0fBF0A6770, f425; +fma.rn.f32 f430, f354, 0fBF27A4F4, f426; +fma.rn.f32 f431, f357, 0fBF4178CE, f427; +fma.rn.f32 f432, f355, 0fBF27A4F4, f428; +fma.rn.f32 f433, f356, 0fBF4178CE, f429; +sub.f32 f434, f430, f431; +add.f32 f435, f433, f432; +add.f32 f436, f431, f430; +sub.f32 f437, f432, f433; +fma.rn.f32 f438, f338, 0fBF27A4F4, f294; +fma.rn.f32 f439, f341, 0fBF4178CE, 0f00000000; +fma.rn.f32 f440, f339, 0fBF27A4F4, f295; +fma.rn.f32 f441, f340, 0fBF4178CE, 0f00000000; +fma.rn.f32 f442, f342, 0fBE11BAFB, f438; +fma.rn.f32 f443, f345, 0f3F7D64F0, f439; +fma.rn.f32 f444, f343, 0fBE11BAFB, f440; +fma.rn.f32 f445, f344, 0f3F7D64F0, f441; +fma.rn.f32 f446, f346, 0f3F575C64, f442; +fma.rn.f32 f447, f349, 0fBF0A6770, f443; +fma.rn.f32 f448, f347, 0f3F575C64, f444; +fma.rn.f32 f449, f348, 0fBF0A6770, f445; +fma.rn.f32 f450, f350, 0fBF75A155, f446; +fma.rn.f32 f451, f353, 0fBE903F40, f447; +fma.rn.f32 f452, f351, 0fBF75A155, f448; +fma.rn.f32 f453, f352, 0fBE903F40, f449; +fma.rn.f32 f454, f354, 0f3ED4B147, f450; +fma.rn.f32 f455, f357, 0f3F68DDA4, f451; +fma.rn.f32 f456, f355, 0f3ED4B147, f452; +fma.rn.f32 f457, f356, 0f3F68DDA4, f453; +sub.f32 f458, f454, f455; +add.f32 f459, f457, f456; +add.f32 f460, f455, f454; +sub.f32 f461, f456, f457; +fma.rn.f32 f462, f338, 0fBF75A155, f294; +fma.rn.f32 f463, f341, 0fBE903F40, 0f00000000; +fma.rn.f32 f464, f339, 0fBF75A155, f295; +fma.rn.f32 f465, f340, 0fBE903F40, 0f00000000; +fma.rn.f32 f466, f342, 0f3F575C64, f462; +fma.rn.f32 f467, f345, 0f3F0A6770, f463; +fma.rn.f32 f468, f343, 0f3F575C64, f464; +fma.rn.f32 f469, f344, 0f3F0A6770, f465; +fma.rn.f32 f470, f346, 0fBF27A4F4, f466; +fma.rn.f32 f471, f349, 0fBF4178CE, f467; +fma.rn.f32 
f472, f347, 0fBF27A4F4, f468; +fma.rn.f32 f473, f348, 0fBF4178CE, f469; +fma.rn.f32 f474, f350, 0f3ED4B147, f470; +fma.rn.f32 f475, f353, 0f3F68DDA4, f471; +fma.rn.f32 f476, f351, 0f3ED4B147, f472; +fma.rn.f32 f477, f352, 0f3F68DDA4, f473; +fma.rn.f32 f478, f354, 0fBE11BAFB, f474; +fma.rn.f32 f479, f357, 0fBF7D64F0, f475; +fma.rn.f32 f480, f355, 0fBE11BAFB, f476; +fma.rn.f32 f481, f356, 0fBF7D64F0, f477; +sub.f32 f482, f478, f479; +add.f32 f483, f481, f480; +add.f32 f484, f479, f478; +sub.f32 f485, f480, f481; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f486, f487}, [rd11]; +mul.f32 f490, f486, f386; +mul.f32 f491, f487, f387; +mul.f32 f492, f486, f387; +mul.f32 f493, f486, f486; +mul.f32 f494, f487, f487; +sub.f32 f495, f493, f494; +mul.f32 f496, f487, f486; +fma.rn.f32 f497, f487, f486, f496; +mul.f32 f498, f495, f410; +mul.f32 f499, f497, f411; +mul.f32 f500, f495, f411; +mul.f32 f501, f486, f495; +mul.f32 f502, f487, f497; +sub.f32 f503, f501, f502; +mul.f32 f504, f486, f497; +fma.rn.f32 f505, f487, f495, f504; +mul.f32 f506, f503, f434; +mul.f32 f507, f505, f435; +mul.f32 f508, f503, f435; +mul.f32 f509, f486, f503; +mul.f32 f510, f487, f505; +sub.f32 f511, f509, f510; +mul.f32 f512, f486, f505; +fma.rn.f32 f513, f487, f503, f512; +mul.f32 f514, f511, f458; +mul.f32 f515, f513, f459; +mul.f32 f516, f511, f459; +mul.f32 f517, f486, f511; +mul.f32 f518, f487, f513; +sub.f32 f519, f517, f518; +mul.f32 f520, f486, f513; +fma.rn.f32 f521, f487, f511, f520; +mul.f32 f522, f519, f482; +mul.f32 f523, f521, f483; +mul.f32 f524, f519, f483; +mul.f32 f525, f486, f519; +mul.f32 f526, f487, f521; +sub.f32 f527, f525, f526; +mul.f32 f528, f486, f521; +fma.rn.f32 f529, f487, f519, f528; +mul.f32 f530, f527, f484; +mul.f32 f531, f529, f485; +mul.f32 f532, f527, f485; +mul.f32 f533, f486, f527; 
+mul.f32 f534, f487, f529; +sub.f32 f535, f533, f534; +mul.f32 f536, f486, f529; +fma.rn.f32 f537, f487, f527, f536; +mul.f32 f538, f535, f460; +mul.f32 f539, f537, f461; +mul.f32 f540, f535, f461; +mul.f32 f541, f486, f535; +mul.f32 f542, f487, f537; +sub.f32 f543, f541, f542; +mul.f32 f544, f486, f537; +fma.rn.f32 f545, f487, f535, f544; +mul.f32 f546, f543, f436; +mul.f32 f547, f545, f437; +mul.f32 f548, f543, f437; +mul.f32 f549, f486, f543; +mul.f32 f550, f487, f545; +sub.f32 f551, f549, f550; +mul.f32 f552, f486, f545; +fma.rn.f32 f553, f487, f543, f552; +mul.f32 f554, f551, f412; +mul.f32 f555, f553, f413; +mul.f32 f556, f551, f413; +mul.f32 f557, f486, f551; +mul.f32 f558, f487, f553; +sub.f32 f559, f557, f558; +mul.f32 f560, f486, f553; +fma.rn.f32 f561, f487, f551, f560; +mul.f32 f562, f559, f388; +mul.f32 f563, f561, f389; +mul.f32 f564, f559, f389; +shl.b32 r18, r17, 3; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 968, r19; +add.f32 f565, f365, f355; +add.f32 f566, f364, f354; +st.shared.v2.f32 [r20], {f566, f565}; +fma.rn.f32 f567, f487, f386, f492; +sub.f32 f568, f490, f491; +st.shared.v2.f32 [r20+88], {f568, f567}; +fma.rn.f32 f569, f497, f410, f500; +sub.f32 f570, f498, f499; +st.shared.v2.f32 [r20+176], {f570, f569}; +fma.rn.f32 f571, f505, f434, f508; +sub.f32 f572, f506, f507; +st.shared.v2.f32 [r20+264], {f572, f571}; +fma.rn.f32 f573, f513, f458, f516; +sub.f32 f574, f514, f515; +st.shared.v2.f32 [r20+352], {f574, f573}; +sub.f32 f575, f522, f523; +fma.rn.f32 f576, f521, f482, f524; +st.shared.v2.f32 [r20+440], {f575, f576}; +fma.rn.f32 f577, f529, f484, f532; +sub.f32 f578, f530, f531; +st.shared.v2.f32 [r20+528], {f578, f577}; +fma.rn.f32 f579, f537, f460, f540; +sub.f32 f580, f538, f539; +st.shared.v2.f32 [r20+616], {f580, f579}; +fma.rn.f32 f581, f545, f436, f548; +sub.f32 f582, f546, f547; +st.shared.v2.f32 [r20+704], {f582, f581}; +fma.rn.f32 f583, f553, f412, f556; +sub.f32 f584, f554, f555; +st.shared.v2.f32 [r20+792], 
{f584, f583}; +fma.rn.f32 f585, f561, f388, f564; +sub.f32 f586, f562, f563; +st.shared.v2.f32 [r20+880], {f586, f585}; +barrier.sync 0; +ld.shared.v2.f32 {f587, f588}, [r14]; +ld.shared.v2.f32 {f591, f592}, [r14+968]; +ld.shared.v2.f32 {f595, f596}, [r14+1936]; +ld.shared.v2.f32 {f599, f600}, [r14+2904]; +ld.shared.v2.f32 {f603, f604}, [r14+3872]; +ld.shared.v2.f32 {f607, f608}, [r14+4840]; +ld.shared.v2.f32 {f611, f612}, [r14+5808]; +ld.shared.v2.f32 {f615, f616}, [r14+6776]; +ld.shared.v2.f32 {f619, f620}, [r14+7744]; +ld.shared.v2.f32 {f623, f624}, [r14+8712]; +ld.shared.v2.f32 {f627, f628}, [r14+9680]; +add.f32 f631, f591, f627; +add.f32 f632, f592, f628; +sub.f32 f633, f591, f627; +sub.f32 f634, f592, f628; +add.f32 f635, f595, f623; +add.f32 f636, f596, f624; +sub.f32 f637, f595, f623; +sub.f32 f638, f596, f624; +add.f32 f639, f599, f619; +add.f32 f640, f600, f620; +sub.f32 f641, f599, f619; +sub.f32 f642, f600, f620; +add.f32 f643, f603, f615; +add.f32 f644, f604, f616; +sub.f32 f645, f603, f615; +sub.f32 f646, f604, f616; +add.f32 f647, f607, f611; +add.f32 f648, f608, f612; +sub.f32 f649, f607, f611; +sub.f32 f650, f608, f612; +add.f32 f651, f587, f631; +add.f32 f652, f588, f632; +add.f32 f653, f651, f635; +add.f32 f654, f652, f636; +add.f32 f655, f653, f639; +add.f32 f656, f654, f640; +add.f32 f657, f655, f643; +add.f32 f658, f656, f644; +fma.rn.f32 f659, f631, 0f3F575C64, f587; +fma.rn.f32 f660, f634, 0fBF0A6770, 0f00000000; +fma.rn.f32 f661, f632, 0f3F575C64, f588; +fma.rn.f32 f662, f633, 0fBF0A6770, 0f00000000; +fma.rn.f32 f663, f635, 0f3ED4B147, f659; +fma.rn.f32 f664, f638, 0fBF68DDA4, f660; +fma.rn.f32 f665, f636, 0f3ED4B147, f661; +fma.rn.f32 f666, f637, 0fBF68DDA4, f662; +fma.rn.f32 f667, f639, 0fBE11BAFB, f663; +fma.rn.f32 f668, f642, 0fBF7D64F0, f664; +fma.rn.f32 f669, f640, 0fBE11BAFB, f665; +fma.rn.f32 f670, f641, 0fBF7D64F0, f666; +fma.rn.f32 f671, f643, 0fBF27A4F4, f667; +fma.rn.f32 f672, f646, 0fBF4178CE, f668; +fma.rn.f32 f673, f644, 
0fBF27A4F4, f669; +fma.rn.f32 f674, f645, 0fBF4178CE, f670; +fma.rn.f32 f675, f647, 0fBF75A155, f671; +fma.rn.f32 f676, f650, 0fBE903F40, f672; +fma.rn.f32 f677, f648, 0fBF75A155, f673; +fma.rn.f32 f678, f649, 0fBE903F40, f674; +fma.rn.f32 f679, f631, 0f3ED4B147, f587; +fma.rn.f32 f680, f634, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f681, f632, 0f3ED4B147, f588; +fma.rn.f32 f682, f633, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f683, f635, 0fBF27A4F4, f679; +fma.rn.f32 f684, f638, 0fBF4178CE, f680; +fma.rn.f32 f685, f636, 0fBF27A4F4, f681; +fma.rn.f32 f686, f637, 0fBF4178CE, f682; +fma.rn.f32 f687, f639, 0fBF75A155, f683; +fma.rn.f32 f688, f642, 0f3E903F40, f684; +fma.rn.f32 f689, f640, 0fBF75A155, f685; +fma.rn.f32 f690, f641, 0f3E903F40, f686; +fma.rn.f32 f691, f643, 0fBE11BAFB, f687; +fma.rn.f32 f692, f646, 0f3F7D64F0, f688; +fma.rn.f32 f693, f644, 0fBE11BAFB, f689; +fma.rn.f32 f694, f645, 0f3F7D64F0, f690; +fma.rn.f32 f695, f647, 0f3F575C64, f691; +fma.rn.f32 f696, f650, 0f3F0A6770, f692; +fma.rn.f32 f697, f648, 0f3F575C64, f693; +fma.rn.f32 f698, f649, 0f3F0A6770, f694; +fma.rn.f32 f699, f631, 0fBE11BAFB, f587; +fma.rn.f32 f700, f634, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f701, f632, 0fBE11BAFB, f588; +fma.rn.f32 f702, f633, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f703, f635, 0fBF75A155, f699; +fma.rn.f32 f704, f638, 0f3E903F40, f700; +fma.rn.f32 f705, f636, 0fBF75A155, f701; +fma.rn.f32 f706, f637, 0f3E903F40, f702; +fma.rn.f32 f707, f639, 0f3ED4B147, f703; +fma.rn.f32 f708, f642, 0f3F68DDA4, f704; +fma.rn.f32 f709, f640, 0f3ED4B147, f705; +fma.rn.f32 f710, f641, 0f3F68DDA4, f706; +fma.rn.f32 f711, f643, 0f3F575C64, f707; +fma.rn.f32 f712, f646, 0fBF0A6770, f708; +fma.rn.f32 f713, f644, 0f3F575C64, f709; +fma.rn.f32 f714, f645, 0fBF0A6770, f710; +fma.rn.f32 f715, f647, 0fBF27A4F4, f711; +fma.rn.f32 f716, f650, 0fBF4178CE, f712; +fma.rn.f32 f717, f648, 0fBF27A4F4, f713; +fma.rn.f32 f718, f649, 0fBF4178CE, f714; +fma.rn.f32 f719, f631, 0fBF27A4F4, f587; +fma.rn.f32 f720, f634, 
0fBF4178CE, 0f00000000; +fma.rn.f32 f721, f632, 0fBF27A4F4, f588; +fma.rn.f32 f722, f633, 0fBF4178CE, 0f00000000; +fma.rn.f32 f723, f635, 0fBE11BAFB, f719; +fma.rn.f32 f724, f638, 0f3F7D64F0, f720; +fma.rn.f32 f725, f636, 0fBE11BAFB, f721; +fma.rn.f32 f726, f637, 0f3F7D64F0, f722; +fma.rn.f32 f727, f639, 0f3F575C64, f723; +fma.rn.f32 f728, f642, 0fBF0A6770, f724; +fma.rn.f32 f729, f640, 0f3F575C64, f725; +fma.rn.f32 f730, f641, 0fBF0A6770, f726; +fma.rn.f32 f731, f643, 0fBF75A155, f727; +fma.rn.f32 f732, f646, 0fBE903F40, f728; +fma.rn.f32 f733, f644, 0fBF75A155, f729; +fma.rn.f32 f734, f645, 0fBE903F40, f730; +fma.rn.f32 f735, f647, 0f3ED4B147, f731; +fma.rn.f32 f736, f650, 0f3F68DDA4, f732; +fma.rn.f32 f737, f648, 0f3ED4B147, f733; +fma.rn.f32 f738, f649, 0f3F68DDA4, f734; +fma.rn.f32 f739, f631, 0fBF75A155, f587; +fma.rn.f32 f740, f634, 0fBE903F40, 0f00000000; +fma.rn.f32 f741, f632, 0fBF75A155, f588; +fma.rn.f32 f742, f633, 0fBE903F40, 0f00000000; +fma.rn.f32 f743, f635, 0f3F575C64, f739; +fma.rn.f32 f744, f638, 0f3F0A6770, f740; +fma.rn.f32 f745, f636, 0f3F575C64, f741; +fma.rn.f32 f746, f637, 0f3F0A6770, f742; +fma.rn.f32 f747, f639, 0fBF27A4F4, f743; +fma.rn.f32 f748, f642, 0fBF4178CE, f744; +fma.rn.f32 f749, f640, 0fBF27A4F4, f745; +fma.rn.f32 f750, f641, 0fBF4178CE, f746; +fma.rn.f32 f751, f643, 0f3ED4B147, f747; +fma.rn.f32 f752, f646, 0f3F68DDA4, f748; +fma.rn.f32 f753, f644, 0f3ED4B147, f749; +fma.rn.f32 f754, f645, 0f3F68DDA4, f750; +fma.rn.f32 f755, f647, 0fBE11BAFB, f751; +fma.rn.f32 f756, f650, 0fBF7D64F0, f752; +fma.rn.f32 f757, f648, 0fBE11BAFB, f753; +fma.rn.f32 f758, f649, 0fBF7D64F0, f754; +add.f32 %1, f658, f648; +add.f32 %0, f657, f647; +add.f32 %3, f678, f677; +sub.f32 %2, f675, f676; +add.f32 %5, f698, f697; +sub.f32 %4, f695, f696; +add.f32 %7, f718, f717; +sub.f32 %6, f715, f716; +add.f32 %9, f738, f737; +sub.f32 %8, f735, f736; +add.f32 %11, f758, f757; +sub.f32 %10, f755, f756; +sub.f32 %13, f757, f758; +add.f32 %12, f756, f755; 
+sub.f32 %15, f737, f738; +add.f32 %14, f736, f735; +sub.f32 %17, f717, f718; +add.f32 %16, f716, f715; +sub.f32 %19, f697, f698; +add.f32 %18, f696, f695; +sub.f32 %21, f677, f678; +add.f32 %20, f676, f675; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_1331), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<181, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<737>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 5324, r2; +add.f32 f45, %27, %51; +add.f32 f46, %29, %52; +sub.f32 f47, %27, %51; +sub.f32 f48, %29, %52; +add.f32 f49, %30, %49; +add.f32 f50, %32, %50; +sub.f32 f51, %30, %49; +sub.f32 f52, %32, %50; +add.f32 f53, %33, %46; +add.f32 f54, %34, %48; +sub.f32 f55, %33, %46; +sub.f32 f56, %34, %48; +add.f32 f57, %35, %43; +add.f32 f58, %37, %45; +sub.f32 f59, %35, %43; +sub.f32 f60, %37, %45; +add.f32 f61, %38, %41; +add.f32 f62, %40, %42; +sub.f32 f63, %38, %41; +sub.f32 f64, %40, %42; +mov.u32 r4, %tid.x; +add.f32 f65, %25, f45; +add.f32 f66, %26, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, 
f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +add.f32 f73, f71, f61; +add.f32 f74, f72, f62; +fma.rn.f32 f75, f45, 0f3F575C64, %25; +fma.rn.f32 f76, f48, 0fBF0A6770, 0f00000000; +fma.rn.f32 f77, f46, 0f3F575C64, %26; +fma.rn.f32 f78, f47, 0fBF0A6770, 0f00000000; +fma.rn.f32 f79, f49, 0f3ED4B147, f75; +fma.rn.f32 f80, f52, 0fBF68DDA4, f76; +fma.rn.f32 f81, f50, 0f3ED4B147, f77; +fma.rn.f32 f82, f51, 0fBF68DDA4, f78; +fma.rn.f32 f83, f53, 0fBE11BAFB, f79; +fma.rn.f32 f84, f56, 0fBF7D64F0, f80; +fma.rn.f32 f85, f54, 0fBE11BAFB, f81; +fma.rn.f32 f86, f55, 0fBF7D64F0, f82; +fma.rn.f32 f87, f57, 0fBF27A4F4, f83; +fma.rn.f32 f88, f60, 0fBF4178CE, f84; +fma.rn.f32 f89, f58, 0fBF27A4F4, f85; +fma.rn.f32 f90, f59, 0fBF4178CE, f86; +fma.rn.f32 f91, f61, 0fBF75A155, f87; +fma.rn.f32 f92, f64, 0fBE903F40, f88; +fma.rn.f32 f93, f62, 0fBF75A155, f89; +fma.rn.f32 f94, f63, 0fBE903F40, f90; +sub.f32 f95, f91, f92; +add.f32 f96, f94, f93; +add.f32 f97, f92, f91; +sub.f32 f98, f93, f94; +fma.rn.f32 f99, f45, 0f3ED4B147, %25; +fma.rn.f32 f100, f48, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f101, f46, 0f3ED4B147, %26; +fma.rn.f32 f102, f47, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f103, f49, 0fBF27A4F4, f99; +fma.rn.f32 f104, f52, 0fBF4178CE, f100; +fma.rn.f32 f105, f50, 0fBF27A4F4, f101; +fma.rn.f32 f106, f51, 0fBF4178CE, f102; +fma.rn.f32 f107, f53, 0fBF75A155, f103; +fma.rn.f32 f108, f56, 0f3E903F40, f104; +fma.rn.f32 f109, f54, 0fBF75A155, f105; +fma.rn.f32 f110, f55, 0f3E903F40, f106; +fma.rn.f32 f111, f57, 0fBE11BAFB, f107; +fma.rn.f32 f112, f60, 0f3F7D64F0, f108; +fma.rn.f32 f113, f58, 0fBE11BAFB, f109; +fma.rn.f32 f114, f59, 0f3F7D64F0, f110; +fma.rn.f32 f115, f61, 0f3F575C64, f111; +fma.rn.f32 f116, f64, 0f3F0A6770, f112; +fma.rn.f32 f117, f62, 0f3F575C64, f113; +fma.rn.f32 f118, f63, 0f3F0A6770, f114; +sub.f32 f119, f115, f116; +add.f32 f120, f118, f117; +add.f32 f121, f116, f115; +sub.f32 f122, f117, f118; +fma.rn.f32 f123, f45, 0fBE11BAFB, %25; 
+fma.rn.f32 f124, f48, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f125, f46, 0fBE11BAFB, %26; +fma.rn.f32 f126, f47, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f127, f49, 0fBF75A155, f123; +fma.rn.f32 f128, f52, 0f3E903F40, f124; +fma.rn.f32 f129, f50, 0fBF75A155, f125; +fma.rn.f32 f130, f51, 0f3E903F40, f126; +fma.rn.f32 f131, f53, 0f3ED4B147, f127; +fma.rn.f32 f132, f56, 0f3F68DDA4, f128; +fma.rn.f32 f133, f54, 0f3ED4B147, f129; +fma.rn.f32 f134, f55, 0f3F68DDA4, f130; +fma.rn.f32 f135, f57, 0f3F575C64, f131; +fma.rn.f32 f136, f60, 0fBF0A6770, f132; +fma.rn.f32 f137, f58, 0f3F575C64, f133; +fma.rn.f32 f138, f59, 0fBF0A6770, f134; +fma.rn.f32 f139, f61, 0fBF27A4F4, f135; +fma.rn.f32 f140, f64, 0fBF4178CE, f136; +fma.rn.f32 f141, f62, 0fBF27A4F4, f137; +fma.rn.f32 f142, f63, 0fBF4178CE, f138; +sub.f32 f143, f139, f140; +add.f32 f144, f142, f141; +add.f32 f145, f140, f139; +sub.f32 f146, f141, f142; +fma.rn.f32 f147, f45, 0fBF27A4F4, %25; +fma.rn.f32 f148, f48, 0fBF4178CE, 0f00000000; +fma.rn.f32 f149, f46, 0fBF27A4F4, %26; +fma.rn.f32 f150, f47, 0fBF4178CE, 0f00000000; +fma.rn.f32 f151, f49, 0fBE11BAFB, f147; +fma.rn.f32 f152, f52, 0f3F7D64F0, f148; +fma.rn.f32 f153, f50, 0fBE11BAFB, f149; +fma.rn.f32 f154, f51, 0f3F7D64F0, f150; +fma.rn.f32 f155, f53, 0f3F575C64, f151; +fma.rn.f32 f156, f56, 0fBF0A6770, f152; +fma.rn.f32 f157, f54, 0f3F575C64, f153; +fma.rn.f32 f158, f55, 0fBF0A6770, f154; +fma.rn.f32 f159, f57, 0fBF75A155, f155; +fma.rn.f32 f160, f60, 0fBE903F40, f156; +fma.rn.f32 f161, f58, 0fBF75A155, f157; +fma.rn.f32 f162, f59, 0fBE903F40, f158; +fma.rn.f32 f163, f61, 0f3ED4B147, f159; +fma.rn.f32 f164, f64, 0f3F68DDA4, f160; +fma.rn.f32 f165, f62, 0f3ED4B147, f161; +fma.rn.f32 f166, f63, 0f3F68DDA4, f162; +sub.f32 f167, f163, f164; +add.f32 f168, f166, f165; +add.f32 f169, f164, f163; +sub.f32 f170, f165, f166; +fma.rn.f32 f171, f45, 0fBF75A155, %25; +fma.rn.f32 f172, f48, 0fBE903F40, 0f00000000; +fma.rn.f32 f173, f46, 0fBF75A155, %26; +fma.rn.f32 f174, f47, 0fBE903F40, 
0f00000000; +fma.rn.f32 f175, f49, 0f3F575C64, f171; +fma.rn.f32 f176, f52, 0f3F0A6770, f172; +fma.rn.f32 f177, f50, 0f3F575C64, f173; +fma.rn.f32 f178, f51, 0f3F0A6770, f174; +fma.rn.f32 f179, f53, 0fBF27A4F4, f175; +fma.rn.f32 f180, f56, 0fBF4178CE, f176; +fma.rn.f32 f181, f54, 0fBF27A4F4, f177; +fma.rn.f32 f182, f55, 0fBF4178CE, f178; +fma.rn.f32 f183, f57, 0f3ED4B147, f179; +fma.rn.f32 f184, f60, 0f3F68DDA4, f180; +fma.rn.f32 f185, f58, 0f3ED4B147, f181; +fma.rn.f32 f186, f59, 0f3F68DDA4, f182; +fma.rn.f32 f187, f61, 0fBE11BAFB, f183; +fma.rn.f32 f188, f64, 0fBF7D64F0, f184; +fma.rn.f32 f189, f62, 0fBE11BAFB, f185; +fma.rn.f32 f190, f63, 0fBF7D64F0, f186; +sub.f32 f191, f187, f188; +add.f32 f192, f190, f189; +add.f32 f193, f188, f187; +sub.f32 f194, f189, f190; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f195, f196}, [rd6]; +mul.f32 f199, f195, f95; +mul.f32 f200, f196, f96; +sub.f32 f201, f199, f200; +mul.f32 f202, f195, f96; +fma.rn.f32 f203, f196, f95, f202; +mul.f32 f204, f195, f195; +mul.f32 f205, f196, f196; +sub.f32 f206, f204, f205; +mul.f32 f207, f196, f195; +fma.rn.f32 f208, f196, f195, f207; +mul.f32 f209, f206, f119; +mul.f32 f210, f208, f120; +sub.f32 f211, f209, f210; +mul.f32 f212, f206, f120; +fma.rn.f32 f213, f208, f119, f212; +mul.f32 f214, f195, f206; +mul.f32 f215, f196, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f195, f208; +fma.rn.f32 f218, f196, f206, f217; +mul.f32 f219, f216, f143; +mul.f32 f220, f218, f144; +sub.f32 f221, f219, f220; +mul.f32 f222, f216, f144; +fma.rn.f32 f223, f218, f143, f222; +mul.f32 f224, f195, f216; +mul.f32 f225, f196, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f195, f218; +fma.rn.f32 f228, f196, f216, f227; +mul.f32 f229, f226, f167; +mul.f32 f230, 
f228, f168; +sub.f32 f231, f229, f230; +mul.f32 f232, f226, f168; +fma.rn.f32 f233, f228, f167, f232; +mul.f32 f234, f195, f226; +mul.f32 f235, f196, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f195, f228; +fma.rn.f32 f238, f196, f226, f237; +mul.f32 f239, f236, f191; +mul.f32 f240, f238, f192; +sub.f32 f241, f239, f240; +mul.f32 f242, f236, f192; +fma.rn.f32 f243, f238, f191, f242; +mul.f32 f244, f195, f236; +mul.f32 f245, f196, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f195, f238; +fma.rn.f32 f248, f196, f236, f247; +mul.f32 f249, f246, f193; +mul.f32 f250, f248, f194; +sub.f32 f251, f249, f250; +mul.f32 f252, f246, f194; +fma.rn.f32 f253, f248, f193, f252; +mul.f32 f254, f195, f246; +mul.f32 f255, f196, f248; +sub.f32 f256, f254, f255; +mul.f32 f257, f195, f248; +fma.rn.f32 f258, f196, f246, f257; +mul.f32 f259, f256, f169; +mul.f32 f260, f258, f170; +sub.f32 f261, f259, f260; +mul.f32 f262, f256, f170; +fma.rn.f32 f263, f258, f169, f262; +mul.f32 f264, f195, f256; +mul.f32 f265, f196, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f195, f258; +fma.rn.f32 f268, f196, f256, f267; +mul.f32 f269, f266, f145; +mul.f32 f270, f268, f146; +sub.f32 f271, f269, f270; +mul.f32 f272, f266, f146; +fma.rn.f32 f273, f268, f145, f272; +mul.f32 f274, f195, f266; +mul.f32 f275, f196, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f195, f268; +fma.rn.f32 f278, f196, f266, f277; +mul.f32 f279, f276, f121; +mul.f32 f280, f278, f122; +sub.f32 f281, f279, f280; +mul.f32 f282, f276, f122; +fma.rn.f32 f283, f278, f121, f282; +mul.f32 f284, f195, f276; +mul.f32 f285, f196, f278; +sub.f32 f286, f284, f285; +mul.f32 f287, f195, f278; +fma.rn.f32 f288, f196, f276, f287; +mul.f32 f289, f286, f97; +mul.f32 f290, f288, f98; +sub.f32 f291, f289, f290; +mul.f32 f292, f286, f98; +fma.rn.f32 f293, f288, f97, f292; +mad.lo.s32 r12, r9, 5324, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 44, r12; +st.shared.f32 [r13], f73; +st.shared.f32 [r13+4], f201; +st.shared.f32 [r13+8], f211; 
+st.shared.f32 [r13+12], f221; +st.shared.f32 [r13+16], f231; +st.shared.f32 [r13+20], f241; +st.shared.f32 [r13+24], f251; +st.shared.f32 [r13+28], f261; +st.shared.f32 [r13+32], f271; +st.shared.f32 [r13+36], f281; +st.shared.f32 [r13+40], f291; +barrier.sync 0; +mad.lo.s32 r14, r11, -40, r13; +ld.shared.f32 f294, [r14]; +ld.shared.f32 f295, [r14+484]; +ld.shared.f32 f296, [r14+968]; +ld.shared.f32 f297, [r14+1452]; +ld.shared.f32 f298, [r14+1936]; +ld.shared.f32 f299, [r14+2420]; +ld.shared.f32 f300, [r14+2904]; +ld.shared.f32 f301, [r14+3388]; +ld.shared.f32 f302, [r14+3872]; +ld.shared.f32 f303, [r14+4356]; +ld.shared.f32 f304, [r14+4840]; +barrier.sync 0; +st.shared.f32 [r13], f74; +st.shared.f32 [r13+4], f203; +st.shared.f32 [r13+8], f213; +st.shared.f32 [r13+12], f223; +st.shared.f32 [r13+16], f233; +st.shared.f32 [r13+20], f243; +st.shared.f32 [r13+24], f253; +st.shared.f32 [r13+28], f263; +st.shared.f32 [r13+32], f273; +st.shared.f32 [r13+36], f283; +st.shared.f32 [r13+40], f293; +barrier.sync 0; +ld.shared.f32 f305, [r14]; +ld.shared.f32 f306, [r14+484]; +ld.shared.f32 f307, [r14+968]; +ld.shared.f32 f308, [r14+1452]; +ld.shared.f32 f309, [r14+1936]; +ld.shared.f32 f310, [r14+2420]; +ld.shared.f32 f311, [r14+2904]; +ld.shared.f32 f312, [r14+3388]; +ld.shared.f32 f313, [r14+3872]; +ld.shared.f32 f314, [r14+4356]; +ld.shared.f32 f315, [r14+4840]; +add.f32 f316, f295, f304; +add.f32 f317, f306, f315; +sub.f32 f318, f295, f304; +sub.f32 f319, f306, f315; +add.f32 f320, f296, f303; +add.f32 f321, f307, f314; +sub.f32 f322, f296, f303; +sub.f32 f323, f307, f314; +add.f32 f324, f297, f302; +add.f32 f325, f308, f313; +sub.f32 f326, f297, f302; +sub.f32 f327, f308, f313; +add.f32 f328, f298, f301; +add.f32 f329, f309, f312; +sub.f32 f330, f298, f301; +sub.f32 f331, f309, f312; +add.f32 f332, f299, f300; +add.f32 f333, f310, f311; +sub.f32 f334, f299, f300; +sub.f32 f335, f310, f311; +add.f32 f336, f294, f316; +add.f32 f337, f305, f317; +add.f32 f338, f336, f320; 
+add.f32 f339, f337, f321; +add.f32 f340, f338, f324; +add.f32 f341, f339, f325; +add.f32 f342, f340, f328; +add.f32 f343, f341, f329; +add.f32 f344, f342, f332; +add.f32 f345, f343, f333; +fma.rn.f32 f346, f316, 0f3F575C64, f294; +fma.rn.f32 f347, f319, 0fBF0A6770, 0f00000000; +fma.rn.f32 f348, f317, 0f3F575C64, f305; +fma.rn.f32 f349, f318, 0fBF0A6770, 0f00000000; +fma.rn.f32 f350, f320, 0f3ED4B147, f346; +fma.rn.f32 f351, f323, 0fBF68DDA4, f347; +fma.rn.f32 f352, f321, 0f3ED4B147, f348; +fma.rn.f32 f353, f322, 0fBF68DDA4, f349; +fma.rn.f32 f354, f324, 0fBE11BAFB, f350; +fma.rn.f32 f355, f327, 0fBF7D64F0, f351; +fma.rn.f32 f356, f325, 0fBE11BAFB, f352; +fma.rn.f32 f357, f326, 0fBF7D64F0, f353; +fma.rn.f32 f358, f328, 0fBF27A4F4, f354; +fma.rn.f32 f359, f331, 0fBF4178CE, f355; +fma.rn.f32 f360, f329, 0fBF27A4F4, f356; +fma.rn.f32 f361, f330, 0fBF4178CE, f357; +fma.rn.f32 f362, f332, 0fBF75A155, f358; +fma.rn.f32 f363, f335, 0fBE903F40, f359; +fma.rn.f32 f364, f333, 0fBF75A155, f360; +fma.rn.f32 f365, f334, 0fBE903F40, f361; +sub.f32 f366, f362, f363; +add.f32 f367, f365, f364; +add.f32 f368, f363, f362; +sub.f32 f369, f364, f365; +fma.rn.f32 f370, f316, 0f3ED4B147, f294; +fma.rn.f32 f371, f319, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f372, f317, 0f3ED4B147, f305; +fma.rn.f32 f373, f318, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f374, f320, 0fBF27A4F4, f370; +fma.rn.f32 f375, f323, 0fBF4178CE, f371; +fma.rn.f32 f376, f321, 0fBF27A4F4, f372; +fma.rn.f32 f377, f322, 0fBF4178CE, f373; +fma.rn.f32 f378, f324, 0fBF75A155, f374; +fma.rn.f32 f379, f327, 0f3E903F40, f375; +fma.rn.f32 f380, f325, 0fBF75A155, f376; +fma.rn.f32 f381, f326, 0f3E903F40, f377; +fma.rn.f32 f382, f328, 0fBE11BAFB, f378; +fma.rn.f32 f383, f331, 0f3F7D64F0, f379; +fma.rn.f32 f384, f329, 0fBE11BAFB, f380; +fma.rn.f32 f385, f330, 0f3F7D64F0, f381; +fma.rn.f32 f386, f332, 0f3F575C64, f382; +fma.rn.f32 f387, f335, 0f3F0A6770, f383; +fma.rn.f32 f388, f333, 0f3F575C64, f384; +fma.rn.f32 f389, f334, 0f3F0A6770, 
f385; +sub.f32 f390, f386, f387; +add.f32 f391, f389, f388; +add.f32 f392, f387, f386; +sub.f32 f393, f388, f389; +fma.rn.f32 f394, f316, 0fBE11BAFB, f294; +fma.rn.f32 f395, f319, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f396, f317, 0fBE11BAFB, f305; +fma.rn.f32 f397, f318, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f398, f320, 0fBF75A155, f394; +fma.rn.f32 f399, f323, 0f3E903F40, f395; +fma.rn.f32 f400, f321, 0fBF75A155, f396; +fma.rn.f32 f401, f322, 0f3E903F40, f397; +fma.rn.f32 f402, f324, 0f3ED4B147, f398; +fma.rn.f32 f403, f327, 0f3F68DDA4, f399; +fma.rn.f32 f404, f325, 0f3ED4B147, f400; +fma.rn.f32 f405, f326, 0f3F68DDA4, f401; +fma.rn.f32 f406, f328, 0f3F575C64, f402; +fma.rn.f32 f407, f331, 0fBF0A6770, f403; +fma.rn.f32 f408, f329, 0f3F575C64, f404; +fma.rn.f32 f409, f330, 0fBF0A6770, f405; +fma.rn.f32 f410, f332, 0fBF27A4F4, f406; +fma.rn.f32 f411, f335, 0fBF4178CE, f407; +fma.rn.f32 f412, f333, 0fBF27A4F4, f408; +fma.rn.f32 f413, f334, 0fBF4178CE, f409; +sub.f32 f414, f410, f411; +add.f32 f415, f413, f412; +add.f32 f416, f411, f410; +sub.f32 f417, f412, f413; +fma.rn.f32 f418, f316, 0fBF27A4F4, f294; +fma.rn.f32 f419, f319, 0fBF4178CE, 0f00000000; +fma.rn.f32 f420, f317, 0fBF27A4F4, f305; +fma.rn.f32 f421, f318, 0fBF4178CE, 0f00000000; +fma.rn.f32 f422, f320, 0fBE11BAFB, f418; +fma.rn.f32 f423, f323, 0f3F7D64F0, f419; +fma.rn.f32 f424, f321, 0fBE11BAFB, f420; +fma.rn.f32 f425, f322, 0f3F7D64F0, f421; +fma.rn.f32 f426, f324, 0f3F575C64, f422; +fma.rn.f32 f427, f327, 0fBF0A6770, f423; +fma.rn.f32 f428, f325, 0f3F575C64, f424; +fma.rn.f32 f429, f326, 0fBF0A6770, f425; +fma.rn.f32 f430, f328, 0fBF75A155, f426; +fma.rn.f32 f431, f331, 0fBE903F40, f427; +fma.rn.f32 f432, f329, 0fBF75A155, f428; +fma.rn.f32 f433, f330, 0fBE903F40, f429; +fma.rn.f32 f434, f332, 0f3ED4B147, f430; +fma.rn.f32 f435, f335, 0f3F68DDA4, f431; +fma.rn.f32 f436, f333, 0f3ED4B147, f432; +fma.rn.f32 f437, f334, 0f3F68DDA4, f433; +sub.f32 f438, f434, f435; +add.f32 f439, f437, f436; +add.f32 f440, 
f435, f434; +sub.f32 f441, f436, f437; +fma.rn.f32 f442, f316, 0fBF75A155, f294; +fma.rn.f32 f443, f319, 0fBE903F40, 0f00000000; +fma.rn.f32 f444, f317, 0fBF75A155, f305; +fma.rn.f32 f445, f318, 0fBE903F40, 0f00000000; +fma.rn.f32 f446, f320, 0f3F575C64, f442; +fma.rn.f32 f447, f323, 0f3F0A6770, f443; +fma.rn.f32 f448, f321, 0f3F575C64, f444; +fma.rn.f32 f449, f322, 0f3F0A6770, f445; +fma.rn.f32 f450, f324, 0fBF27A4F4, f446; +fma.rn.f32 f451, f327, 0fBF4178CE, f447; +fma.rn.f32 f452, f325, 0fBF27A4F4, f448; +fma.rn.f32 f453, f326, 0fBF4178CE, f449; +fma.rn.f32 f454, f328, 0f3ED4B147, f450; +fma.rn.f32 f455, f331, 0f3F68DDA4, f451; +fma.rn.f32 f456, f329, 0f3ED4B147, f452; +fma.rn.f32 f457, f330, 0f3F68DDA4, f453; +fma.rn.f32 f458, f332, 0fBE11BAFB, f454; +fma.rn.f32 f459, f335, 0fBF7D64F0, f455; +fma.rn.f32 f460, f333, 0fBE11BAFB, f456; +fma.rn.f32 f461, f334, 0fBF7D64F0, f457; +sub.f32 f462, f458, f459; +add.f32 f463, f461, f460; +add.f32 f464, f459, f458; +sub.f32 f465, f460, f461; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f466, f467}, [rd11]; +mul.f32 f470, f466, f366; +mul.f32 f471, f467, f367; +sub.f32 f472, f470, f471; +mul.f32 f473, f466, f367; +fma.rn.f32 f474, f467, f366, f473; +mul.f32 f475, f466, f466; +mul.f32 f476, f467, f467; +sub.f32 f477, f475, f476; +mul.f32 f478, f467, f466; +fma.rn.f32 f479, f467, f466, f478; +mul.f32 f480, f477, f390; +mul.f32 f481, f479, f391; +sub.f32 f482, f480, f481; +mul.f32 f483, f477, f391; +fma.rn.f32 f484, f479, f390, f483; +mul.f32 f485, f466, f477; +mul.f32 f486, f467, f479; +sub.f32 f487, f485, f486; +mul.f32 f488, f466, f479; +fma.rn.f32 f489, f467, f477, f488; +mul.f32 f490, f487, f414; +mul.f32 f491, f489, f415; +sub.f32 f492, f490, f491; +mul.f32 f493, f487, f415; +fma.rn.f32 f494, f489, f414, f493; +mul.f32 f495, f466, f487; 
+mul.f32 f496, f467, f489; +sub.f32 f497, f495, f496; +mul.f32 f498, f466, f489; +fma.rn.f32 f499, f467, f487, f498; +mul.f32 f500, f497, f438; +mul.f32 f501, f499, f439; +sub.f32 f502, f500, f501; +mul.f32 f503, f497, f439; +fma.rn.f32 f504, f499, f438, f503; +mul.f32 f505, f466, f497; +mul.f32 f506, f467, f499; +sub.f32 f507, f505, f506; +mul.f32 f508, f466, f499; +fma.rn.f32 f509, f467, f497, f508; +mul.f32 f510, f507, f462; +mul.f32 f511, f509, f463; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, f463; +fma.rn.f32 f514, f509, f462, f513; +mul.f32 f515, f466, f507; +mul.f32 f516, f467, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f466, f509; +fma.rn.f32 f519, f467, f507, f518; +mul.f32 f520, f517, f464; +mul.f32 f521, f519, f465; +sub.f32 f522, f520, f521; +mul.f32 f523, f517, f465; +fma.rn.f32 f524, f519, f464, f523; +mul.f32 f525, f466, f517; +mul.f32 f526, f467, f519; +sub.f32 f527, f525, f526; +mul.f32 f528, f466, f519; +fma.rn.f32 f529, f467, f517, f528; +mul.f32 f530, f527, f440; +mul.f32 f531, f529, f441; +sub.f32 f532, f530, f531; +mul.f32 f533, f527, f441; +fma.rn.f32 f534, f529, f440, f533; +mul.f32 f535, f466, f527; +mul.f32 f536, f467, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f466, f529; +fma.rn.f32 f539, f467, f527, f538; +mul.f32 f540, f537, f416; +mul.f32 f541, f539, f417; +sub.f32 f542, f540, f541; +mul.f32 f543, f537, f417; +fma.rn.f32 f544, f539, f416, f543; +mul.f32 f545, f466, f537; +mul.f32 f546, f467, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f466, f539; +fma.rn.f32 f549, f467, f537, f548; +mul.f32 f550, f547, f392; +mul.f32 f551, f549, f393; +sub.f32 f552, f550, f551; +mul.f32 f553, f547, f393; +fma.rn.f32 f554, f549, f392, f553; +mul.f32 f555, f466, f547; +mul.f32 f556, f467, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f466, f549; +fma.rn.f32 f559, f467, f547, f558; +mul.f32 f560, f557, f368; +mul.f32 f561, f559, f369; +sub.f32 f562, f560, f561; +mul.f32 f563, f557, f369; +fma.rn.f32 f564, f559, f368, f563; +shl.b32 
r18, r17, 2; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 484, r19; +st.shared.f32 [r20], f344; +st.shared.f32 [r20+44], f472; +st.shared.f32 [r20+88], f482; +st.shared.f32 [r20+132], f492; +st.shared.f32 [r20+176], f502; +st.shared.f32 [r20+220], f512; +st.shared.f32 [r20+264], f522; +st.shared.f32 [r20+308], f532; +st.shared.f32 [r20+352], f542; +st.shared.f32 [r20+396], f552; +st.shared.f32 [r20+440], f562; +barrier.sync 0; +ld.shared.f32 f565, [r14]; +ld.shared.f32 f566, [r14+484]; +ld.shared.f32 f567, [r14+968]; +ld.shared.f32 f568, [r14+1452]; +ld.shared.f32 f569, [r14+1936]; +ld.shared.f32 f570, [r14+2420]; +ld.shared.f32 f571, [r14+2904]; +ld.shared.f32 f572, [r14+3388]; +ld.shared.f32 f573, [r14+3872]; +ld.shared.f32 f574, [r14+4356]; +ld.shared.f32 f575, [r14+4840]; +barrier.sync 0; +st.shared.f32 [r20], f345; +st.shared.f32 [r20+44], f474; +st.shared.f32 [r20+88], f484; +st.shared.f32 [r20+132], f494; +st.shared.f32 [r20+176], f504; +st.shared.f32 [r20+220], f514; +st.shared.f32 [r20+264], f524; +st.shared.f32 [r20+308], f534; +st.shared.f32 [r20+352], f544; +st.shared.f32 [r20+396], f554; +st.shared.f32 [r20+440], f564; +barrier.sync 0; +ld.shared.f32 f576, [r14]; +ld.shared.f32 f577, [r14+484]; +ld.shared.f32 f578, [r14+968]; +ld.shared.f32 f579, [r14+1452]; +ld.shared.f32 f580, [r14+1936]; +ld.shared.f32 f581, [r14+2420]; +ld.shared.f32 f582, [r14+2904]; +ld.shared.f32 f583, [r14+3388]; +ld.shared.f32 f584, [r14+3872]; +ld.shared.f32 f585, [r14+4356]; +ld.shared.f32 f586, [r14+4840]; +add.f32 f587, f566, f575; +add.f32 f588, f577, f586; +sub.f32 f589, f566, f575; +sub.f32 f590, f577, f586; +add.f32 f591, f567, f574; +add.f32 f592, f578, f585; +sub.f32 f593, f567, f574; +sub.f32 f594, f578, f585; +add.f32 f595, f568, f573; +add.f32 f596, f579, f584; +sub.f32 f597, f568, f573; +sub.f32 f598, f579, f584; +add.f32 f599, f569, f572; +add.f32 f600, f580, f583; +sub.f32 f601, f569, f572; +sub.f32 f602, f580, f583; +add.f32 f603, f570, f571; 
+add.f32 f604, f581, f582; +sub.f32 f605, f570, f571; +sub.f32 f606, f581, f582; +add.f32 f607, f565, f587; +add.f32 f608, f576, f588; +add.f32 f609, f607, f591; +add.f32 f610, f608, f592; +add.f32 f611, f609, f595; +add.f32 f612, f610, f596; +add.f32 f613, f611, f599; +add.f32 f614, f612, f600; +fma.rn.f32 f615, f587, 0f3F575C64, f565; +fma.rn.f32 f616, f590, 0fBF0A6770, 0f00000000; +fma.rn.f32 f617, f588, 0f3F575C64, f576; +fma.rn.f32 f618, f589, 0fBF0A6770, 0f00000000; +fma.rn.f32 f619, f591, 0f3ED4B147, f615; +fma.rn.f32 f620, f594, 0fBF68DDA4, f616; +fma.rn.f32 f621, f592, 0f3ED4B147, f617; +fma.rn.f32 f622, f593, 0fBF68DDA4, f618; +fma.rn.f32 f623, f595, 0fBE11BAFB, f619; +fma.rn.f32 f624, f598, 0fBF7D64F0, f620; +fma.rn.f32 f625, f596, 0fBE11BAFB, f621; +fma.rn.f32 f626, f597, 0fBF7D64F0, f622; +fma.rn.f32 f627, f599, 0fBF27A4F4, f623; +fma.rn.f32 f628, f602, 0fBF4178CE, f624; +fma.rn.f32 f629, f600, 0fBF27A4F4, f625; +fma.rn.f32 f630, f601, 0fBF4178CE, f626; +fma.rn.f32 f631, f603, 0fBF75A155, f627; +fma.rn.f32 f632, f606, 0fBE903F40, f628; +fma.rn.f32 f633, f604, 0fBF75A155, f629; +fma.rn.f32 f634, f605, 0fBE903F40, f630; +fma.rn.f32 f635, f587, 0f3ED4B147, f565; +fma.rn.f32 f636, f590, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f637, f588, 0f3ED4B147, f576; +fma.rn.f32 f638, f589, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f639, f591, 0fBF27A4F4, f635; +fma.rn.f32 f640, f594, 0fBF4178CE, f636; +fma.rn.f32 f641, f592, 0fBF27A4F4, f637; +fma.rn.f32 f642, f593, 0fBF4178CE, f638; +fma.rn.f32 f643, f595, 0fBF75A155, f639; +fma.rn.f32 f644, f598, 0f3E903F40, f640; +fma.rn.f32 f645, f596, 0fBF75A155, f641; +fma.rn.f32 f646, f597, 0f3E903F40, f642; +fma.rn.f32 f647, f599, 0fBE11BAFB, f643; +fma.rn.f32 f648, f602, 0f3F7D64F0, f644; +fma.rn.f32 f649, f600, 0fBE11BAFB, f645; +fma.rn.f32 f650, f601, 0f3F7D64F0, f646; +fma.rn.f32 f651, f603, 0f3F575C64, f647; +fma.rn.f32 f652, f606, 0f3F0A6770, f648; +fma.rn.f32 f653, f604, 0f3F575C64, f649; +fma.rn.f32 f654, f605, 0f3F0A6770, 
f650; +fma.rn.f32 f655, f587, 0fBE11BAFB, f565; +fma.rn.f32 f656, f590, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f657, f588, 0fBE11BAFB, f576; +fma.rn.f32 f658, f589, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f659, f591, 0fBF75A155, f655; +fma.rn.f32 f660, f594, 0f3E903F40, f656; +fma.rn.f32 f661, f592, 0fBF75A155, f657; +fma.rn.f32 f662, f593, 0f3E903F40, f658; +fma.rn.f32 f663, f595, 0f3ED4B147, f659; +fma.rn.f32 f664, f598, 0f3F68DDA4, f660; +fma.rn.f32 f665, f596, 0f3ED4B147, f661; +fma.rn.f32 f666, f597, 0f3F68DDA4, f662; +fma.rn.f32 f667, f599, 0f3F575C64, f663; +fma.rn.f32 f668, f602, 0fBF0A6770, f664; +fma.rn.f32 f669, f600, 0f3F575C64, f665; +fma.rn.f32 f670, f601, 0fBF0A6770, f666; +fma.rn.f32 f671, f603, 0fBF27A4F4, f667; +fma.rn.f32 f672, f606, 0fBF4178CE, f668; +fma.rn.f32 f673, f604, 0fBF27A4F4, f669; +fma.rn.f32 f674, f605, 0fBF4178CE, f670; +fma.rn.f32 f675, f587, 0fBF27A4F4, f565; +fma.rn.f32 f676, f590, 0fBF4178CE, 0f00000000; +fma.rn.f32 f677, f588, 0fBF27A4F4, f576; +fma.rn.f32 f678, f589, 0fBF4178CE, 0f00000000; +fma.rn.f32 f679, f591, 0fBE11BAFB, f675; +fma.rn.f32 f680, f594, 0f3F7D64F0, f676; +fma.rn.f32 f681, f592, 0fBE11BAFB, f677; +fma.rn.f32 f682, f593, 0f3F7D64F0, f678; +fma.rn.f32 f683, f595, 0f3F575C64, f679; +fma.rn.f32 f684, f598, 0fBF0A6770, f680; +fma.rn.f32 f685, f596, 0f3F575C64, f681; +fma.rn.f32 f686, f597, 0fBF0A6770, f682; +fma.rn.f32 f687, f599, 0fBF75A155, f683; +fma.rn.f32 f688, f602, 0fBE903F40, f684; +fma.rn.f32 f689, f600, 0fBF75A155, f685; +fma.rn.f32 f690, f601, 0fBE903F40, f686; +fma.rn.f32 f691, f603, 0f3ED4B147, f687; +fma.rn.f32 f692, f606, 0f3F68DDA4, f688; +fma.rn.f32 f693, f604, 0f3ED4B147, f689; +fma.rn.f32 f694, f605, 0f3F68DDA4, f690; +fma.rn.f32 f695, f587, 0fBF75A155, f565; +fma.rn.f32 f696, f590, 0fBE903F40, 0f00000000; +fma.rn.f32 f697, f588, 0fBF75A155, f576; +fma.rn.f32 f698, f589, 0fBE903F40, 0f00000000; +fma.rn.f32 f699, f591, 0f3F575C64, f695; +fma.rn.f32 f700, f594, 0f3F0A6770, f696; +fma.rn.f32 f701, f592, 
0f3F575C64, f697; +fma.rn.f32 f702, f593, 0f3F0A6770, f698; +fma.rn.f32 f703, f595, 0fBF27A4F4, f699; +fma.rn.f32 f704, f598, 0fBF4178CE, f700; +fma.rn.f32 f705, f596, 0fBF27A4F4, f701; +fma.rn.f32 f706, f597, 0fBF4178CE, f702; +fma.rn.f32 f707, f599, 0f3ED4B147, f703; +fma.rn.f32 f708, f602, 0f3F68DDA4, f704; +fma.rn.f32 f709, f600, 0f3ED4B147, f705; +fma.rn.f32 f710, f601, 0f3F68DDA4, f706; +fma.rn.f32 f711, f603, 0fBE11BAFB, f707; +fma.rn.f32 f712, f606, 0fBF7D64F0, f708; +fma.rn.f32 f713, f604, 0fBE11BAFB, f709; +fma.rn.f32 f714, f605, 0fBF7D64F0, f710; +add.f32 %0, f613, f603; +add.f32 %1, f614, f604; +add.f32 %3, f634, f633; +sub.f32 %2, f631, f632; +add.f32 %5, f654, f653; +sub.f32 %4, f651, f652; +add.f32 %7, f674, f673; +sub.f32 %6, f671, f672; +add.f32 %9, f694, f693; +sub.f32 %8, f691, f692; +add.f32 %11, f714, f713; +sub.f32 %10, f711, f712; +sub.f32 %13, f713, f714; +add.f32 %12, f712, f711; +sub.f32 %15, f693, f694; +add.f32 %14, f692, f691; +sub.f32 %17, f673, f674; +add.f32 %16, f672, f671; +sub.f32 %19, f653, f654; +add.f32 %18, f652, f651; +sub.f32 %21, f633, f634; +add.f32 %20, f632, f631; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_1331), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), 
"f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..bb17b3550c650 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp32_inv.hpp.inc @@ -0,0 +1,1518 @@ +#ifndef CUFFTDX_FFT_1331_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_1331_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<384, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<781>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 10648, r2; +add.f32 f45, %27, %51; +add.f32 f46, %29, %52; +sub.f32 f47, %27, %51; +sub.f32 f48, %29, %52; +add.f32 f49, %30, %49; +add.f32 f50, %32, %50; +sub.f32 f51, %30, %49; +sub.f32 f52, %32, %50; +add.f32 f53, %33, %46; +add.f32 f54, %34, %48; +sub.f32 f55, %33, %46; +sub.f32 f56, %34, %48; +add.f32 f57, %35, %43; +add.f32 f58, %37, %45; +sub.f32 f59, %35, %43; +sub.f32 f60, %37, %45; +add.f32 f61, %38, %41; +add.f32 f62, %40, %42; +sub.f32 f63, %38, %41; +sub.f32 f64, %40, %42; +mov.u32 r4, %tid.x; +add.f32 f65, %25, f45; +add.f32 f66, %26, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +fma.rn.f32 f73, f45, 0f3F575C64, %25; +fma.rn.f32 f74, f48, 0f3F0A6770, 0f00000000; +fma.rn.f32 f75, f46, 0f3F575C64, %26; +fma.rn.f32 f76, f47, 0f3F0A6770, 0f00000000; +fma.rn.f32 f77, f49, 0f3ED4B147, f73; +fma.rn.f32 f78, f52, 0f3F68DDA4, f74; +fma.rn.f32 f79, f50, 0f3ED4B147, f75; +fma.rn.f32 f80, f51, 0f3F68DDA4, f76; +fma.rn.f32 f81, f53, 0fBE11BAFB, f77; +fma.rn.f32 f82, f56, 0f3F7D64F0, f78; 
+fma.rn.f32 f83, f54, 0fBE11BAFB, f79; +fma.rn.f32 f84, f55, 0f3F7D64F0, f80; +fma.rn.f32 f85, f57, 0fBF27A4F4, f81; +fma.rn.f32 f86, f60, 0f3F4178CE, f82; +fma.rn.f32 f87, f58, 0fBF27A4F4, f83; +fma.rn.f32 f88, f59, 0f3F4178CE, f84; +fma.rn.f32 f89, f61, 0fBF75A155, f85; +fma.rn.f32 f90, f64, 0f3E903F40, f86; +fma.rn.f32 f91, f62, 0fBF75A155, f87; +fma.rn.f32 f92, f63, 0f3E903F40, f88; +sub.f32 f93, f89, f90; +add.f32 f94, f92, f91; +add.f32 f95, f90, f89; +sub.f32 f96, f91, f92; +fma.rn.f32 f97, f45, 0f3ED4B147, %25; +fma.rn.f32 f98, f48, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f99, f46, 0f3ED4B147, %26; +fma.rn.f32 f100, f47, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f101, f49, 0fBF27A4F4, f97; +fma.rn.f32 f102, f52, 0f3F4178CE, f98; +fma.rn.f32 f103, f50, 0fBF27A4F4, f99; +fma.rn.f32 f104, f51, 0f3F4178CE, f100; +fma.rn.f32 f105, f53, 0fBF75A155, f101; +fma.rn.f32 f106, f56, 0fBE903F40, f102; +fma.rn.f32 f107, f54, 0fBF75A155, f103; +fma.rn.f32 f108, f55, 0fBE903F40, f104; +fma.rn.f32 f109, f57, 0fBE11BAFB, f105; +fma.rn.f32 f110, f60, 0fBF7D64F0, f106; +fma.rn.f32 f111, f58, 0fBE11BAFB, f107; +fma.rn.f32 f112, f59, 0fBF7D64F0, f108; +fma.rn.f32 f113, f61, 0f3F575C64, f109; +fma.rn.f32 f114, f64, 0fBF0A6770, f110; +fma.rn.f32 f115, f62, 0f3F575C64, f111; +fma.rn.f32 f116, f63, 0fBF0A6770, f112; +sub.f32 f117, f113, f114; +add.f32 f118, f116, f115; +add.f32 f119, f114, f113; +sub.f32 f120, f115, f116; +fma.rn.f32 f121, f45, 0fBE11BAFB, %25; +fma.rn.f32 f122, f48, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f123, f46, 0fBE11BAFB, %26; +fma.rn.f32 f124, f47, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f125, f49, 0fBF75A155, f121; +fma.rn.f32 f126, f52, 0fBE903F40, f122; +fma.rn.f32 f127, f50, 0fBF75A155, f123; +fma.rn.f32 f128, f51, 0fBE903F40, f124; +fma.rn.f32 f129, f53, 0f3ED4B147, f125; +fma.rn.f32 f130, f56, 0fBF68DDA4, f126; +fma.rn.f32 f131, f54, 0f3ED4B147, f127; +fma.rn.f32 f132, f55, 0fBF68DDA4, f128; +fma.rn.f32 f133, f57, 0f3F575C64, f129; +fma.rn.f32 f134, f60, 0f3F0A6770, 
f130; +fma.rn.f32 f135, f58, 0f3F575C64, f131; +fma.rn.f32 f136, f59, 0f3F0A6770, f132; +fma.rn.f32 f137, f61, 0fBF27A4F4, f133; +fma.rn.f32 f138, f64, 0f3F4178CE, f134; +fma.rn.f32 f139, f62, 0fBF27A4F4, f135; +fma.rn.f32 f140, f63, 0f3F4178CE, f136; +sub.f32 f141, f137, f138; +add.f32 f142, f140, f139; +add.f32 f143, f138, f137; +sub.f32 f144, f139, f140; +fma.rn.f32 f145, f45, 0fBF27A4F4, %25; +fma.rn.f32 f146, f48, 0f3F4178CE, 0f00000000; +fma.rn.f32 f147, f46, 0fBF27A4F4, %26; +fma.rn.f32 f148, f47, 0f3F4178CE, 0f00000000; +fma.rn.f32 f149, f49, 0fBE11BAFB, f145; +fma.rn.f32 f150, f52, 0fBF7D64F0, f146; +fma.rn.f32 f151, f50, 0fBE11BAFB, f147; +fma.rn.f32 f152, f51, 0fBF7D64F0, f148; +fma.rn.f32 f153, f53, 0f3F575C64, f149; +fma.rn.f32 f154, f56, 0f3F0A6770, f150; +fma.rn.f32 f155, f54, 0f3F575C64, f151; +fma.rn.f32 f156, f55, 0f3F0A6770, f152; +fma.rn.f32 f157, f57, 0fBF75A155, f153; +fma.rn.f32 f158, f60, 0f3E903F40, f154; +fma.rn.f32 f159, f58, 0fBF75A155, f155; +fma.rn.f32 f160, f59, 0f3E903F40, f156; +fma.rn.f32 f161, f61, 0f3ED4B147, f157; +fma.rn.f32 f162, f64, 0fBF68DDA4, f158; +fma.rn.f32 f163, f62, 0f3ED4B147, f159; +fma.rn.f32 f164, f63, 0fBF68DDA4, f160; +sub.f32 f165, f161, f162; +add.f32 f166, f164, f163; +add.f32 f167, f162, f161; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f45, 0fBF75A155, %25; +fma.rn.f32 f170, f48, 0f3E903F40, 0f00000000; +fma.rn.f32 f171, f46, 0fBF75A155, %26; +fma.rn.f32 f172, f47, 0f3E903F40, 0f00000000; +fma.rn.f32 f173, f49, 0f3F575C64, f169; +fma.rn.f32 f174, f52, 0fBF0A6770, f170; +fma.rn.f32 f175, f50, 0f3F575C64, f171; +fma.rn.f32 f176, f51, 0fBF0A6770, f172; +fma.rn.f32 f177, f53, 0fBF27A4F4, f173; +fma.rn.f32 f178, f56, 0f3F4178CE, f174; +fma.rn.f32 f179, f54, 0fBF27A4F4, f175; +fma.rn.f32 f180, f55, 0f3F4178CE, f176; +fma.rn.f32 f181, f57, 0f3ED4B147, f177; +fma.rn.f32 f182, f60, 0fBF68DDA4, f178; +fma.rn.f32 f183, f58, 0f3ED4B147, f179; +fma.rn.f32 f184, f59, 0fBF68DDA4, f180; +fma.rn.f32 f185, f61, 0fBE11BAFB, 
f181; +fma.rn.f32 f186, f64, 0f3F7D64F0, f182; +fma.rn.f32 f187, f62, 0fBE11BAFB, f183; +fma.rn.f32 f188, f63, 0f3F7D64F0, f184; +sub.f32 f189, f185, f186; +add.f32 f190, f188, f187; +add.f32 f191, f186, f185; +sub.f32 f192, f187, f188; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 10648, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f193, f194}, [rd6]; +mul.f32 f197, f94, f194; +mul.f32 f198, f93, f194; +mul.f32 f199, f193, f94; +mul.f32 f200, f193, f193; +mul.f32 f201, f194, f194; +sub.f32 f202, f200, f201; +mul.f32 f203, f194, f193; +fma.rn.f32 f204, f194, f193, f203; +mul.f32 f205, f118, f204; +mul.f32 f206, f117, f204; +mul.f32 f207, f202, f118; +mul.f32 f208, f193, f202; +mul.f32 f209, f194, f204; +sub.f32 f210, f208, f209; +mul.f32 f211, f193, f204; +fma.rn.f32 f212, f194, f202, f211; +mul.f32 f213, f142, f212; +mul.f32 f214, f141, f212; +mul.f32 f215, f210, f142; +mul.f32 f216, f193, f210; +mul.f32 f217, f194, f212; +sub.f32 f218, f216, f217; +mul.f32 f219, f193, f212; +fma.rn.f32 f220, f194, f210, f219; +mul.f32 f221, f166, f220; +mul.f32 f222, f165, f220; +mul.f32 f223, f218, f166; +mul.f32 f224, f193, f218; +mul.f32 f225, f194, f220; +sub.f32 f226, f224, f225; +mul.f32 f227, f193, f220; +fma.rn.f32 f228, f194, f218, f227; +mul.f32 f229, f190, f228; +mul.f32 f230, f189, f228; +mul.f32 f231, f226, f190; +mul.f32 f232, f193, f226; +mul.f32 f233, f194, f228; +sub.f32 f234, f232, f233; +mul.f32 f235, f193, f228; +fma.rn.f32 f236, f194, f226, f235; +mul.f32 f237, f192, f236; +mul.f32 f238, f191, f236; +mul.f32 f239, f234, f192; +mul.f32 f240, f193, f234; +mul.f32 f241, f194, f236; +sub.f32 f242, f240, f241; +mul.f32 f243, f193, f236; +fma.rn.f32 f244, f194, f234, f243; +mul.f32 f245, f168, f244; +mul.f32 f246, f167, f244; +mul.f32 
f247, f242, f168; +mul.f32 f248, f193, f242; +mul.f32 f249, f194, f244; +sub.f32 f250, f248, f249; +mul.f32 f251, f193, f244; +fma.rn.f32 f252, f194, f242, f251; +mul.f32 f253, f144, f252; +mul.f32 f254, f143, f252; +mul.f32 f255, f250, f144; +mul.f32 f256, f193, f250; +mul.f32 f257, f194, f252; +sub.f32 f258, f256, f257; +mul.f32 f259, f193, f252; +fma.rn.f32 f260, f194, f250, f259; +mul.f32 f261, f120, f260; +mul.f32 f262, f119, f260; +mul.f32 f263, f258, f120; +mul.f32 f264, f193, f258; +mul.f32 f265, f194, f260; +sub.f32 f266, f264, f265; +mul.f32 f267, f193, f260; +fma.rn.f32 f268, f194, f258, f267; +mul.f32 f269, f96, f268; +mul.f32 f270, f95, f268; +mul.f32 f271, f266, f96; +barrier.sync 0; +mad.lo.s32 r13, r11, 88, r12; +add.f32 f272, f72, f62; +add.f32 f273, f71, f61; +st.shared.v2.f32 [r13], {f273, f272}; +fma.rn.f32 f274, f193, f93, f197; +sub.f32 f275, f199, f198; +st.shared.v2.f32 [r13+8], {f274, f275}; +fma.rn.f32 f276, f202, f117, f205; +sub.f32 f277, f207, f206; +st.shared.v2.f32 [r13+16], {f276, f277}; +sub.f32 f278, f215, f214; +fma.rn.f32 f279, f210, f141, f213; +st.shared.v2.f32 [r13+24], {f279, f278}; +fma.rn.f32 f280, f218, f165, f221; +sub.f32 f281, f223, f222; +st.shared.v2.f32 [r13+32], {f280, f281}; +fma.rn.f32 f282, f226, f189, f229; +sub.f32 f283, f231, f230; +st.shared.v2.f32 [r13+40], {f282, f283}; +fma.rn.f32 f284, f234, f191, f237; +sub.f32 f285, f239, f238; +st.shared.v2.f32 [r13+48], {f284, f285}; +fma.rn.f32 f286, f242, f167, f245; +sub.f32 f287, f247, f246; +st.shared.v2.f32 [r13+56], {f286, f287}; +fma.rn.f32 f288, f250, f143, f253; +sub.f32 f289, f255, f254; +st.shared.v2.f32 [r13+64], {f288, f289}; +fma.rn.f32 f290, f258, f119, f261; +sub.f32 f291, f263, f262; +st.shared.v2.f32 [r13+72], {f290, f291}; +fma.rn.f32 f292, f266, f95, f269; +sub.f32 f293, f271, f270; +st.shared.v2.f32 [r13+80], {f292, f293}; +barrier.sync 0; +mad.lo.s32 r14, r11, -80, r13; +ld.shared.v2.f32 {f294, f295}, [r14]; +ld.shared.v2.f32 {f298, f299}, 
[r14+968]; +ld.shared.v2.f32 {f302, f303}, [r14+1936]; +ld.shared.v2.f32 {f306, f307}, [r14+2904]; +ld.shared.v2.f32 {f310, f311}, [r14+3872]; +ld.shared.v2.f32 {f314, f315}, [r14+4840]; +ld.shared.v2.f32 {f318, f319}, [r14+5808]; +ld.shared.v2.f32 {f322, f323}, [r14+6776]; +ld.shared.v2.f32 {f326, f327}, [r14+7744]; +ld.shared.v2.f32 {f330, f331}, [r14+8712]; +ld.shared.v2.f32 {f334, f335}, [r14+9680]; +add.f32 f338, f298, f334; +add.f32 f339, f299, f335; +sub.f32 f340, f298, f334; +sub.f32 f341, f299, f335; +add.f32 f342, f302, f330; +add.f32 f343, f303, f331; +sub.f32 f344, f302, f330; +sub.f32 f345, f303, f331; +add.f32 f346, f306, f326; +add.f32 f347, f307, f327; +sub.f32 f348, f306, f326; +sub.f32 f349, f307, f327; +add.f32 f350, f310, f322; +add.f32 f351, f311, f323; +sub.f32 f352, f310, f322; +sub.f32 f353, f311, f323; +add.f32 f354, f314, f318; +add.f32 f355, f315, f319; +sub.f32 f356, f314, f318; +sub.f32 f357, f315, f319; +add.f32 f358, f294, f338; +add.f32 f359, f295, f339; +add.f32 f360, f358, f342; +add.f32 f361, f359, f343; +add.f32 f362, f360, f346; +add.f32 f363, f361, f347; +add.f32 f364, f362, f350; +add.f32 f365, f363, f351; +fma.rn.f32 f366, f338, 0f3F575C64, f294; +fma.rn.f32 f367, f341, 0f3F0A6770, 0f00000000; +fma.rn.f32 f368, f339, 0f3F575C64, f295; +fma.rn.f32 f369, f340, 0f3F0A6770, 0f00000000; +fma.rn.f32 f370, f342, 0f3ED4B147, f366; +fma.rn.f32 f371, f345, 0f3F68DDA4, f367; +fma.rn.f32 f372, f343, 0f3ED4B147, f368; +fma.rn.f32 f373, f344, 0f3F68DDA4, f369; +fma.rn.f32 f374, f346, 0fBE11BAFB, f370; +fma.rn.f32 f375, f349, 0f3F7D64F0, f371; +fma.rn.f32 f376, f347, 0fBE11BAFB, f372; +fma.rn.f32 f377, f348, 0f3F7D64F0, f373; +fma.rn.f32 f378, f350, 0fBF27A4F4, f374; +fma.rn.f32 f379, f353, 0f3F4178CE, f375; +fma.rn.f32 f380, f351, 0fBF27A4F4, f376; +fma.rn.f32 f381, f352, 0f3F4178CE, f377; +fma.rn.f32 f382, f354, 0fBF75A155, f378; +fma.rn.f32 f383, f357, 0f3E903F40, f379; +fma.rn.f32 f384, f355, 0fBF75A155, f380; +fma.rn.f32 f385, f356, 
0f3E903F40, f381; +sub.f32 f386, f382, f383; +add.f32 f387, f385, f384; +add.f32 f388, f383, f382; +sub.f32 f389, f384, f385; +fma.rn.f32 f390, f338, 0f3ED4B147, f294; +fma.rn.f32 f391, f341, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f392, f339, 0f3ED4B147, f295; +fma.rn.f32 f393, f340, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f394, f342, 0fBF27A4F4, f390; +fma.rn.f32 f395, f345, 0f3F4178CE, f391; +fma.rn.f32 f396, f343, 0fBF27A4F4, f392; +fma.rn.f32 f397, f344, 0f3F4178CE, f393; +fma.rn.f32 f398, f346, 0fBF75A155, f394; +fma.rn.f32 f399, f349, 0fBE903F40, f395; +fma.rn.f32 f400, f347, 0fBF75A155, f396; +fma.rn.f32 f401, f348, 0fBE903F40, f397; +fma.rn.f32 f402, f350, 0fBE11BAFB, f398; +fma.rn.f32 f403, f353, 0fBF7D64F0, f399; +fma.rn.f32 f404, f351, 0fBE11BAFB, f400; +fma.rn.f32 f405, f352, 0fBF7D64F0, f401; +fma.rn.f32 f406, f354, 0f3F575C64, f402; +fma.rn.f32 f407, f357, 0fBF0A6770, f403; +fma.rn.f32 f408, f355, 0f3F575C64, f404; +fma.rn.f32 f409, f356, 0fBF0A6770, f405; +sub.f32 f410, f406, f407; +add.f32 f411, f409, f408; +add.f32 f412, f407, f406; +sub.f32 f413, f408, f409; +fma.rn.f32 f414, f338, 0fBE11BAFB, f294; +fma.rn.f32 f415, f341, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f416, f339, 0fBE11BAFB, f295; +fma.rn.f32 f417, f340, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f418, f342, 0fBF75A155, f414; +fma.rn.f32 f419, f345, 0fBE903F40, f415; +fma.rn.f32 f420, f343, 0fBF75A155, f416; +fma.rn.f32 f421, f344, 0fBE903F40, f417; +fma.rn.f32 f422, f346, 0f3ED4B147, f418; +fma.rn.f32 f423, f349, 0fBF68DDA4, f419; +fma.rn.f32 f424, f347, 0f3ED4B147, f420; +fma.rn.f32 f425, f348, 0fBF68DDA4, f421; +fma.rn.f32 f426, f350, 0f3F575C64, f422; +fma.rn.f32 f427, f353, 0f3F0A6770, f423; +fma.rn.f32 f428, f351, 0f3F575C64, f424; +fma.rn.f32 f429, f352, 0f3F0A6770, f425; +fma.rn.f32 f430, f354, 0fBF27A4F4, f426; +fma.rn.f32 f431, f357, 0f3F4178CE, f427; +fma.rn.f32 f432, f355, 0fBF27A4F4, f428; +fma.rn.f32 f433, f356, 0f3F4178CE, f429; +sub.f32 f434, f430, f431; +add.f32 f435, f433, f432; 
+add.f32 f436, f431, f430; +sub.f32 f437, f432, f433; +fma.rn.f32 f438, f338, 0fBF27A4F4, f294; +fma.rn.f32 f439, f341, 0f3F4178CE, 0f00000000; +fma.rn.f32 f440, f339, 0fBF27A4F4, f295; +fma.rn.f32 f441, f340, 0f3F4178CE, 0f00000000; +fma.rn.f32 f442, f342, 0fBE11BAFB, f438; +fma.rn.f32 f443, f345, 0fBF7D64F0, f439; +fma.rn.f32 f444, f343, 0fBE11BAFB, f440; +fma.rn.f32 f445, f344, 0fBF7D64F0, f441; +fma.rn.f32 f446, f346, 0f3F575C64, f442; +fma.rn.f32 f447, f349, 0f3F0A6770, f443; +fma.rn.f32 f448, f347, 0f3F575C64, f444; +fma.rn.f32 f449, f348, 0f3F0A6770, f445; +fma.rn.f32 f450, f350, 0fBF75A155, f446; +fma.rn.f32 f451, f353, 0f3E903F40, f447; +fma.rn.f32 f452, f351, 0fBF75A155, f448; +fma.rn.f32 f453, f352, 0f3E903F40, f449; +fma.rn.f32 f454, f354, 0f3ED4B147, f450; +fma.rn.f32 f455, f357, 0fBF68DDA4, f451; +fma.rn.f32 f456, f355, 0f3ED4B147, f452; +fma.rn.f32 f457, f356, 0fBF68DDA4, f453; +sub.f32 f458, f454, f455; +add.f32 f459, f457, f456; +add.f32 f460, f455, f454; +sub.f32 f461, f456, f457; +fma.rn.f32 f462, f338, 0fBF75A155, f294; +fma.rn.f32 f463, f341, 0f3E903F40, 0f00000000; +fma.rn.f32 f464, f339, 0fBF75A155, f295; +fma.rn.f32 f465, f340, 0f3E903F40, 0f00000000; +fma.rn.f32 f466, f342, 0f3F575C64, f462; +fma.rn.f32 f467, f345, 0fBF0A6770, f463; +fma.rn.f32 f468, f343, 0f3F575C64, f464; +fma.rn.f32 f469, f344, 0fBF0A6770, f465; +fma.rn.f32 f470, f346, 0fBF27A4F4, f466; +fma.rn.f32 f471, f349, 0f3F4178CE, f467; +fma.rn.f32 f472, f347, 0fBF27A4F4, f468; +fma.rn.f32 f473, f348, 0f3F4178CE, f469; +fma.rn.f32 f474, f350, 0f3ED4B147, f470; +fma.rn.f32 f475, f353, 0fBF68DDA4, f471; +fma.rn.f32 f476, f351, 0f3ED4B147, f472; +fma.rn.f32 f477, f352, 0fBF68DDA4, f473; +fma.rn.f32 f478, f354, 0fBE11BAFB, f474; +fma.rn.f32 f479, f357, 0f3F7D64F0, f475; +fma.rn.f32 f480, f355, 0fBE11BAFB, f476; +fma.rn.f32 f481, f356, 0f3F7D64F0, f477; +sub.f32 f482, f478, f479; +add.f32 f483, f481, f480; +add.f32 f484, f479, f478; +sub.f32 f485, f480, f481; +mul.wide.u32 rd7, r11, 
-1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f486, f487}, [rd11]; +mul.f32 f490, f387, f487; +mul.f32 f491, f386, f487; +mul.f32 f492, f486, f387; +mul.f32 f493, f486, f486; +mul.f32 f494, f487, f487; +sub.f32 f495, f493, f494; +mul.f32 f496, f487, f486; +fma.rn.f32 f497, f487, f486, f496; +mul.f32 f498, f411, f497; +mul.f32 f499, f410, f497; +mul.f32 f500, f495, f411; +mul.f32 f501, f486, f495; +mul.f32 f502, f487, f497; +sub.f32 f503, f501, f502; +mul.f32 f504, f486, f497; +fma.rn.f32 f505, f487, f495, f504; +mul.f32 f506, f435, f505; +mul.f32 f507, f434, f505; +mul.f32 f508, f503, f435; +mul.f32 f509, f486, f503; +mul.f32 f510, f487, f505; +sub.f32 f511, f509, f510; +mul.f32 f512, f486, f505; +fma.rn.f32 f513, f487, f503, f512; +mul.f32 f514, f459, f513; +mul.f32 f515, f458, f513; +mul.f32 f516, f511, f459; +mul.f32 f517, f486, f511; +mul.f32 f518, f487, f513; +sub.f32 f519, f517, f518; +mul.f32 f520, f486, f513; +fma.rn.f32 f521, f487, f511, f520; +mul.f32 f522, f483, f521; +mul.f32 f523, f482, f521; +mul.f32 f524, f519, f483; +mul.f32 f525, f486, f519; +mul.f32 f526, f487, f521; +sub.f32 f527, f525, f526; +mul.f32 f528, f486, f521; +fma.rn.f32 f529, f487, f519, f528; +mul.f32 f530, f485, f529; +mul.f32 f531, f484, f529; +mul.f32 f532, f527, f485; +mul.f32 f533, f486, f527; +mul.f32 f534, f487, f529; +sub.f32 f535, f533, f534; +mul.f32 f536, f486, f529; +fma.rn.f32 f537, f487, f527, f536; +mul.f32 f538, f461, f537; +mul.f32 f539, f460, f537; +mul.f32 f540, f535, f461; +mul.f32 f541, f486, f535; +mul.f32 f542, f487, f537; +sub.f32 f543, f541, f542; +mul.f32 f544, f486, f537; +fma.rn.f32 f545, f487, f535, f544; +mul.f32 f546, f437, f545; +mul.f32 f547, f436, f545; +mul.f32 f548, f543, f437; +mul.f32 f549, f486, f543; +mul.f32 f550, f487, f545; +sub.f32 f551, f549, f550; +mul.f32 f552, f486, f545; +fma.rn.f32 
f553, f487, f543, f552; +mul.f32 f554, f413, f553; +mul.f32 f555, f412, f553; +mul.f32 f556, f551, f413; +mul.f32 f557, f486, f551; +mul.f32 f558, f487, f553; +sub.f32 f559, f557, f558; +mul.f32 f560, f486, f553; +fma.rn.f32 f561, f487, f551, f560; +mul.f32 f562, f389, f561; +mul.f32 f563, f388, f561; +mul.f32 f564, f559, f389; +shl.b32 r18, r17, 3; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 968, r19; +add.f32 f565, f365, f355; +add.f32 f566, f364, f354; +st.shared.v2.f32 [r20], {f566, f565}; +fma.rn.f32 f567, f486, f386, f490; +sub.f32 f568, f492, f491; +st.shared.v2.f32 [r20+88], {f567, f568}; +fma.rn.f32 f569, f495, f410, f498; +sub.f32 f570, f500, f499; +st.shared.v2.f32 [r20+176], {f569, f570}; +fma.rn.f32 f571, f503, f434, f506; +sub.f32 f572, f508, f507; +st.shared.v2.f32 [r20+264], {f571, f572}; +fma.rn.f32 f573, f511, f458, f514; +sub.f32 f574, f516, f515; +st.shared.v2.f32 [r20+352], {f573, f574}; +sub.f32 f575, f524, f523; +fma.rn.f32 f576, f519, f482, f522; +st.shared.v2.f32 [r20+440], {f576, f575}; +fma.rn.f32 f577, f527, f484, f530; +sub.f32 f578, f532, f531; +st.shared.v2.f32 [r20+528], {f577, f578}; +fma.rn.f32 f579, f535, f460, f538; +sub.f32 f580, f540, f539; +st.shared.v2.f32 [r20+616], {f579, f580}; +fma.rn.f32 f581, f543, f436, f546; +sub.f32 f582, f548, f547; +st.shared.v2.f32 [r20+704], {f581, f582}; +fma.rn.f32 f583, f551, f412, f554; +sub.f32 f584, f556, f555; +st.shared.v2.f32 [r20+792], {f583, f584}; +fma.rn.f32 f585, f559, f388, f562; +sub.f32 f586, f564, f563; +st.shared.v2.f32 [r20+880], {f585, f586}; +barrier.sync 0; +ld.shared.v2.f32 {f587, f588}, [r14]; +ld.shared.v2.f32 {f591, f592}, [r14+968]; +ld.shared.v2.f32 {f595, f596}, [r14+1936]; +ld.shared.v2.f32 {f599, f600}, [r14+2904]; +ld.shared.v2.f32 {f603, f604}, [r14+3872]; +ld.shared.v2.f32 {f607, f608}, [r14+4840]; +ld.shared.v2.f32 {f611, f612}, [r14+5808]; +ld.shared.v2.f32 {f615, f616}, [r14+6776]; +ld.shared.v2.f32 {f619, f620}, [r14+7744]; 
+ld.shared.v2.f32 {f623, f624}, [r14+8712]; +ld.shared.v2.f32 {f627, f628}, [r14+9680]; +add.f32 f631, f591, f627; +add.f32 f632, f592, f628; +sub.f32 f633, f591, f627; +sub.f32 f634, f592, f628; +add.f32 f635, f595, f623; +add.f32 f636, f596, f624; +sub.f32 f637, f595, f623; +sub.f32 f638, f596, f624; +add.f32 f639, f599, f619; +add.f32 f640, f600, f620; +sub.f32 f641, f599, f619; +sub.f32 f642, f600, f620; +add.f32 f643, f603, f615; +add.f32 f644, f604, f616; +sub.f32 f645, f603, f615; +sub.f32 f646, f604, f616; +add.f32 f647, f607, f611; +add.f32 f648, f608, f612; +sub.f32 f649, f607, f611; +sub.f32 f650, f608, f612; +add.f32 f651, f587, f631; +add.f32 f652, f588, f632; +add.f32 f653, f651, f635; +add.f32 f654, f652, f636; +add.f32 f655, f653, f639; +add.f32 f656, f654, f640; +add.f32 f657, f655, f643; +add.f32 f658, f656, f644; +fma.rn.f32 f659, f631, 0f3F575C64, f587; +fma.rn.f32 f660, f634, 0f3F0A6770, 0f00000000; +fma.rn.f32 f661, f632, 0f3F575C64, f588; +fma.rn.f32 f662, f633, 0f3F0A6770, 0f00000000; +fma.rn.f32 f663, f635, 0f3ED4B147, f659; +fma.rn.f32 f664, f638, 0f3F68DDA4, f660; +fma.rn.f32 f665, f636, 0f3ED4B147, f661; +fma.rn.f32 f666, f637, 0f3F68DDA4, f662; +fma.rn.f32 f667, f639, 0fBE11BAFB, f663; +fma.rn.f32 f668, f642, 0f3F7D64F0, f664; +fma.rn.f32 f669, f640, 0fBE11BAFB, f665; +fma.rn.f32 f670, f641, 0f3F7D64F0, f666; +fma.rn.f32 f671, f643, 0fBF27A4F4, f667; +fma.rn.f32 f672, f646, 0f3F4178CE, f668; +fma.rn.f32 f673, f644, 0fBF27A4F4, f669; +fma.rn.f32 f674, f645, 0f3F4178CE, f670; +fma.rn.f32 f675, f647, 0fBF75A155, f671; +fma.rn.f32 f676, f650, 0f3E903F40, f672; +fma.rn.f32 f677, f648, 0fBF75A155, f673; +fma.rn.f32 f678, f649, 0f3E903F40, f674; +fma.rn.f32 f679, f631, 0f3ED4B147, f587; +fma.rn.f32 f680, f634, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f681, f632, 0f3ED4B147, f588; +fma.rn.f32 f682, f633, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f683, f635, 0fBF27A4F4, f679; +fma.rn.f32 f684, f638, 0f3F4178CE, f680; +fma.rn.f32 f685, f636, 0fBF27A4F4, 
f681; +fma.rn.f32 f686, f637, 0f3F4178CE, f682; +fma.rn.f32 f687, f639, 0fBF75A155, f683; +fma.rn.f32 f688, f642, 0fBE903F40, f684; +fma.rn.f32 f689, f640, 0fBF75A155, f685; +fma.rn.f32 f690, f641, 0fBE903F40, f686; +fma.rn.f32 f691, f643, 0fBE11BAFB, f687; +fma.rn.f32 f692, f646, 0fBF7D64F0, f688; +fma.rn.f32 f693, f644, 0fBE11BAFB, f689; +fma.rn.f32 f694, f645, 0fBF7D64F0, f690; +fma.rn.f32 f695, f647, 0f3F575C64, f691; +fma.rn.f32 f696, f650, 0fBF0A6770, f692; +fma.rn.f32 f697, f648, 0f3F575C64, f693; +fma.rn.f32 f698, f649, 0fBF0A6770, f694; +fma.rn.f32 f699, f631, 0fBE11BAFB, f587; +fma.rn.f32 f700, f634, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f701, f632, 0fBE11BAFB, f588; +fma.rn.f32 f702, f633, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f703, f635, 0fBF75A155, f699; +fma.rn.f32 f704, f638, 0fBE903F40, f700; +fma.rn.f32 f705, f636, 0fBF75A155, f701; +fma.rn.f32 f706, f637, 0fBE903F40, f702; +fma.rn.f32 f707, f639, 0f3ED4B147, f703; +fma.rn.f32 f708, f642, 0fBF68DDA4, f704; +fma.rn.f32 f709, f640, 0f3ED4B147, f705; +fma.rn.f32 f710, f641, 0fBF68DDA4, f706; +fma.rn.f32 f711, f643, 0f3F575C64, f707; +fma.rn.f32 f712, f646, 0f3F0A6770, f708; +fma.rn.f32 f713, f644, 0f3F575C64, f709; +fma.rn.f32 f714, f645, 0f3F0A6770, f710; +fma.rn.f32 f715, f647, 0fBF27A4F4, f711; +fma.rn.f32 f716, f650, 0f3F4178CE, f712; +fma.rn.f32 f717, f648, 0fBF27A4F4, f713; +fma.rn.f32 f718, f649, 0f3F4178CE, f714; +fma.rn.f32 f719, f631, 0fBF27A4F4, f587; +fma.rn.f32 f720, f634, 0f3F4178CE, 0f00000000; +fma.rn.f32 f721, f632, 0fBF27A4F4, f588; +fma.rn.f32 f722, f633, 0f3F4178CE, 0f00000000; +fma.rn.f32 f723, f635, 0fBE11BAFB, f719; +fma.rn.f32 f724, f638, 0fBF7D64F0, f720; +fma.rn.f32 f725, f636, 0fBE11BAFB, f721; +fma.rn.f32 f726, f637, 0fBF7D64F0, f722; +fma.rn.f32 f727, f639, 0f3F575C64, f723; +fma.rn.f32 f728, f642, 0f3F0A6770, f724; +fma.rn.f32 f729, f640, 0f3F575C64, f725; +fma.rn.f32 f730, f641, 0f3F0A6770, f726; +fma.rn.f32 f731, f643, 0fBF75A155, f727; +fma.rn.f32 f732, f646, 0f3E903F40, 
f728; +fma.rn.f32 f733, f644, 0fBF75A155, f729; +fma.rn.f32 f734, f645, 0f3E903F40, f730; +fma.rn.f32 f735, f647, 0f3ED4B147, f731; +fma.rn.f32 f736, f650, 0fBF68DDA4, f732; +fma.rn.f32 f737, f648, 0f3ED4B147, f733; +fma.rn.f32 f738, f649, 0fBF68DDA4, f734; +fma.rn.f32 f739, f631, 0fBF75A155, f587; +fma.rn.f32 f740, f634, 0f3E903F40, 0f00000000; +fma.rn.f32 f741, f632, 0fBF75A155, f588; +fma.rn.f32 f742, f633, 0f3E903F40, 0f00000000; +fma.rn.f32 f743, f635, 0f3F575C64, f739; +fma.rn.f32 f744, f638, 0fBF0A6770, f740; +fma.rn.f32 f745, f636, 0f3F575C64, f741; +fma.rn.f32 f746, f637, 0fBF0A6770, f742; +fma.rn.f32 f747, f639, 0fBF27A4F4, f743; +fma.rn.f32 f748, f642, 0f3F4178CE, f744; +fma.rn.f32 f749, f640, 0fBF27A4F4, f745; +fma.rn.f32 f750, f641, 0f3F4178CE, f746; +fma.rn.f32 f751, f643, 0f3ED4B147, f747; +fma.rn.f32 f752, f646, 0fBF68DDA4, f748; +fma.rn.f32 f753, f644, 0f3ED4B147, f749; +fma.rn.f32 f754, f645, 0fBF68DDA4, f750; +fma.rn.f32 f755, f647, 0fBE11BAFB, f751; +fma.rn.f32 f756, f650, 0f3F7D64F0, f752; +fma.rn.f32 f757, f648, 0fBE11BAFB, f753; +fma.rn.f32 f758, f649, 0f3F7D64F0, f754; +add.f32 %1, f658, f648; +add.f32 %0, f657, f647; +add.f32 %3, f678, f677; +sub.f32 %2, f675, f676; +add.f32 %5, f698, f697; +sub.f32 %4, f695, f696; +add.f32 %7, f718, f717; +sub.f32 %6, f715, f716; +add.f32 %9, f738, f737; +sub.f32 %8, f735, f736; +add.f32 %11, f758, f757; +sub.f32 %10, f755, f756; +sub.f32 %13, f757, f758; +add.f32 %12, f756, f755; +sub.f32 %15, f737, f738; +add.f32 %14, f736, f735; +sub.f32 %17, f717, f718; +add.f32 %16, f716, f715; +sub.f32 %19, f697, f698; +add.f32 %18, f696, f695; +sub.f32 %21, f677, f678; +add.f32 %20, f676, f675; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), 
"=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_1331), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<383, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<737>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 5324, r2; +add.f32 f45, %27, %51; +add.f32 f46, %29, %52; +sub.f32 f47, %27, %51; +sub.f32 f48, %29, %52; +add.f32 f49, %30, %49; +add.f32 f50, %32, %50; +sub.f32 f51, %30, %49; +sub.f32 f52, %32, %50; +add.f32 f53, %33, %46; +add.f32 f54, %34, %48; +sub.f32 f55, %33, %46; +sub.f32 f56, %34, %48; +add.f32 f57, %35, %43; +add.f32 f58, %37, %45; +sub.f32 f59, %35, %43; +sub.f32 f60, %37, %45; +add.f32 f61, %38, %41; +add.f32 f62, %40, %42; +sub.f32 f63, %38, %41; +sub.f32 f64, %40, %42; +mov.u32 r4, %tid.x; +add.f32 f65, %25, f45; +add.f32 f66, %26, f46; +add.f32 f67, f65, f49; +add.f32 f68, f66, f50; +add.f32 f69, f67, f53; +add.f32 f70, f68, f54; +add.f32 f71, f69, f57; +add.f32 f72, f70, f58; +add.f32 f73, f71, f61; +add.f32 f74, f72, f62; +fma.rn.f32 f75, f45, 0f3F575C64, %25; +fma.rn.f32 f76, f48, 0f3F0A6770, 0f00000000; +fma.rn.f32 f77, f46, 0f3F575C64, %26; +fma.rn.f32 f78, f47, 0f3F0A6770, 0f00000000; +fma.rn.f32 f79, f49, 0f3ED4B147, f75; +fma.rn.f32 f80, f52, 0f3F68DDA4, f76; +fma.rn.f32 f81, f50, 0f3ED4B147, f77; +fma.rn.f32 f82, f51, 0f3F68DDA4, f78; +fma.rn.f32 f83, f53, 0fBE11BAFB, f79; +fma.rn.f32 f84, f56, 
0f3F7D64F0, f80; +fma.rn.f32 f85, f54, 0fBE11BAFB, f81; +fma.rn.f32 f86, f55, 0f3F7D64F0, f82; +fma.rn.f32 f87, f57, 0fBF27A4F4, f83; +fma.rn.f32 f88, f60, 0f3F4178CE, f84; +fma.rn.f32 f89, f58, 0fBF27A4F4, f85; +fma.rn.f32 f90, f59, 0f3F4178CE, f86; +fma.rn.f32 f91, f61, 0fBF75A155, f87; +fma.rn.f32 f92, f64, 0f3E903F40, f88; +fma.rn.f32 f93, f62, 0fBF75A155, f89; +fma.rn.f32 f94, f63, 0f3E903F40, f90; +sub.f32 f95, f91, f92; +add.f32 f96, f94, f93; +add.f32 f97, f92, f91; +sub.f32 f98, f93, f94; +fma.rn.f32 f99, f45, 0f3ED4B147, %25; +fma.rn.f32 f100, f48, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f101, f46, 0f3ED4B147, %26; +fma.rn.f32 f102, f47, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f103, f49, 0fBF27A4F4, f99; +fma.rn.f32 f104, f52, 0f3F4178CE, f100; +fma.rn.f32 f105, f50, 0fBF27A4F4, f101; +fma.rn.f32 f106, f51, 0f3F4178CE, f102; +fma.rn.f32 f107, f53, 0fBF75A155, f103; +fma.rn.f32 f108, f56, 0fBE903F40, f104; +fma.rn.f32 f109, f54, 0fBF75A155, f105; +fma.rn.f32 f110, f55, 0fBE903F40, f106; +fma.rn.f32 f111, f57, 0fBE11BAFB, f107; +fma.rn.f32 f112, f60, 0fBF7D64F0, f108; +fma.rn.f32 f113, f58, 0fBE11BAFB, f109; +fma.rn.f32 f114, f59, 0fBF7D64F0, f110; +fma.rn.f32 f115, f61, 0f3F575C64, f111; +fma.rn.f32 f116, f64, 0fBF0A6770, f112; +fma.rn.f32 f117, f62, 0f3F575C64, f113; +fma.rn.f32 f118, f63, 0fBF0A6770, f114; +sub.f32 f119, f115, f116; +add.f32 f120, f118, f117; +add.f32 f121, f116, f115; +sub.f32 f122, f117, f118; +fma.rn.f32 f123, f45, 0fBE11BAFB, %25; +fma.rn.f32 f124, f48, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f125, f46, 0fBE11BAFB, %26; +fma.rn.f32 f126, f47, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f127, f49, 0fBF75A155, f123; +fma.rn.f32 f128, f52, 0fBE903F40, f124; +fma.rn.f32 f129, f50, 0fBF75A155, f125; +fma.rn.f32 f130, f51, 0fBE903F40, f126; +fma.rn.f32 f131, f53, 0f3ED4B147, f127; +fma.rn.f32 f132, f56, 0fBF68DDA4, f128; +fma.rn.f32 f133, f54, 0f3ED4B147, f129; +fma.rn.f32 f134, f55, 0fBF68DDA4, f130; +fma.rn.f32 f135, f57, 0f3F575C64, f131; +fma.rn.f32 f136, 
f60, 0f3F0A6770, f132; +fma.rn.f32 f137, f58, 0f3F575C64, f133; +fma.rn.f32 f138, f59, 0f3F0A6770, f134; +fma.rn.f32 f139, f61, 0fBF27A4F4, f135; +fma.rn.f32 f140, f64, 0f3F4178CE, f136; +fma.rn.f32 f141, f62, 0fBF27A4F4, f137; +fma.rn.f32 f142, f63, 0f3F4178CE, f138; +sub.f32 f143, f139, f140; +add.f32 f144, f142, f141; +add.f32 f145, f140, f139; +sub.f32 f146, f141, f142; +fma.rn.f32 f147, f45, 0fBF27A4F4, %25; +fma.rn.f32 f148, f48, 0f3F4178CE, 0f00000000; +fma.rn.f32 f149, f46, 0fBF27A4F4, %26; +fma.rn.f32 f150, f47, 0f3F4178CE, 0f00000000; +fma.rn.f32 f151, f49, 0fBE11BAFB, f147; +fma.rn.f32 f152, f52, 0fBF7D64F0, f148; +fma.rn.f32 f153, f50, 0fBE11BAFB, f149; +fma.rn.f32 f154, f51, 0fBF7D64F0, f150; +fma.rn.f32 f155, f53, 0f3F575C64, f151; +fma.rn.f32 f156, f56, 0f3F0A6770, f152; +fma.rn.f32 f157, f54, 0f3F575C64, f153; +fma.rn.f32 f158, f55, 0f3F0A6770, f154; +fma.rn.f32 f159, f57, 0fBF75A155, f155; +fma.rn.f32 f160, f60, 0f3E903F40, f156; +fma.rn.f32 f161, f58, 0fBF75A155, f157; +fma.rn.f32 f162, f59, 0f3E903F40, f158; +fma.rn.f32 f163, f61, 0f3ED4B147, f159; +fma.rn.f32 f164, f64, 0fBF68DDA4, f160; +fma.rn.f32 f165, f62, 0f3ED4B147, f161; +fma.rn.f32 f166, f63, 0fBF68DDA4, f162; +sub.f32 f167, f163, f164; +add.f32 f168, f166, f165; +add.f32 f169, f164, f163; +sub.f32 f170, f165, f166; +fma.rn.f32 f171, f45, 0fBF75A155, %25; +fma.rn.f32 f172, f48, 0f3E903F40, 0f00000000; +fma.rn.f32 f173, f46, 0fBF75A155, %26; +fma.rn.f32 f174, f47, 0f3E903F40, 0f00000000; +fma.rn.f32 f175, f49, 0f3F575C64, f171; +fma.rn.f32 f176, f52, 0fBF0A6770, f172; +fma.rn.f32 f177, f50, 0f3F575C64, f173; +fma.rn.f32 f178, f51, 0fBF0A6770, f174; +fma.rn.f32 f179, f53, 0fBF27A4F4, f175; +fma.rn.f32 f180, f56, 0f3F4178CE, f176; +fma.rn.f32 f181, f54, 0fBF27A4F4, f177; +fma.rn.f32 f182, f55, 0f3F4178CE, f178; +fma.rn.f32 f183, f57, 0f3ED4B147, f179; +fma.rn.f32 f184, f60, 0fBF68DDA4, f180; +fma.rn.f32 f185, f58, 0f3ED4B147, f181; +fma.rn.f32 f186, f59, 0fBF68DDA4, f182; +fma.rn.f32 f187, 
f61, 0fBE11BAFB, f183; +fma.rn.f32 f188, f64, 0f3F7D64F0, f184; +fma.rn.f32 f189, f62, 0fBE11BAFB, f185; +fma.rn.f32 f190, f63, 0f3F7D64F0, f186; +sub.f32 f191, f187, f188; +add.f32 f192, f190, f189; +add.f32 f193, f188, f187; +sub.f32 f194, f189, f190; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f195, f196}, [rd6]; +mul.f32 f199, f96, f196; +fma.rn.f32 f200, f195, f95, f199; +mul.f32 f201, f95, f196; +mul.f32 f202, f195, f96; +sub.f32 f203, f202, f201; +mul.f32 f204, f195, f195; +mul.f32 f205, f196, f196; +sub.f32 f206, f204, f205; +mul.f32 f207, f196, f195; +fma.rn.f32 f208, f196, f195, f207; +mul.f32 f209, f120, f208; +fma.rn.f32 f210, f206, f119, f209; +mul.f32 f211, f119, f208; +mul.f32 f212, f206, f120; +sub.f32 f213, f212, f211; +mul.f32 f214, f195, f206; +mul.f32 f215, f196, f208; +sub.f32 f216, f214, f215; +mul.f32 f217, f195, f208; +fma.rn.f32 f218, f196, f206, f217; +mul.f32 f219, f144, f218; +fma.rn.f32 f220, f216, f143, f219; +mul.f32 f221, f143, f218; +mul.f32 f222, f216, f144; +sub.f32 f223, f222, f221; +mul.f32 f224, f195, f216; +mul.f32 f225, f196, f218; +sub.f32 f226, f224, f225; +mul.f32 f227, f195, f218; +fma.rn.f32 f228, f196, f216, f227; +mul.f32 f229, f168, f228; +fma.rn.f32 f230, f226, f167, f229; +mul.f32 f231, f167, f228; +mul.f32 f232, f226, f168; +sub.f32 f233, f232, f231; +mul.f32 f234, f195, f226; +mul.f32 f235, f196, f228; +sub.f32 f236, f234, f235; +mul.f32 f237, f195, f228; +fma.rn.f32 f238, f196, f226, f237; +mul.f32 f239, f192, f238; +fma.rn.f32 f240, f236, f191, f239; +mul.f32 f241, f191, f238; +mul.f32 f242, f236, f192; +sub.f32 f243, f242, f241; +mul.f32 f244, f195, f236; +mul.f32 f245, f196, f238; +sub.f32 f246, f244, f245; +mul.f32 f247, f195, f238; +fma.rn.f32 f248, f196, 
f236, f247; +mul.f32 f249, f194, f248; +fma.rn.f32 f250, f246, f193, f249; +mul.f32 f251, f193, f248; +mul.f32 f252, f246, f194; +sub.f32 f253, f252, f251; +mul.f32 f254, f195, f246; +mul.f32 f255, f196, f248; +sub.f32 f256, f254, f255; +mul.f32 f257, f195, f248; +fma.rn.f32 f258, f196, f246, f257; +mul.f32 f259, f170, f258; +fma.rn.f32 f260, f256, f169, f259; +mul.f32 f261, f169, f258; +mul.f32 f262, f256, f170; +sub.f32 f263, f262, f261; +mul.f32 f264, f195, f256; +mul.f32 f265, f196, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f195, f258; +fma.rn.f32 f268, f196, f256, f267; +mul.f32 f269, f146, f268; +fma.rn.f32 f270, f266, f145, f269; +mul.f32 f271, f145, f268; +mul.f32 f272, f266, f146; +sub.f32 f273, f272, f271; +mul.f32 f274, f195, f266; +mul.f32 f275, f196, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f195, f268; +fma.rn.f32 f278, f196, f266, f277; +mul.f32 f279, f122, f278; +fma.rn.f32 f280, f276, f121, f279; +mul.f32 f281, f121, f278; +mul.f32 f282, f276, f122; +sub.f32 f283, f282, f281; +mul.f32 f284, f195, f276; +mul.f32 f285, f196, f278; +sub.f32 f286, f284, f285; +mul.f32 f287, f195, f278; +fma.rn.f32 f288, f196, f276, f287; +mul.f32 f289, f98, f288; +fma.rn.f32 f290, f286, f97, f289; +mul.f32 f291, f97, f288; +mul.f32 f292, f286, f98; +sub.f32 f293, f292, f291; +mad.lo.s32 r12, r9, 5324, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 44, r12; +st.shared.f32 [r13], f73; +st.shared.f32 [r13+4], f200; +st.shared.f32 [r13+8], f210; +st.shared.f32 [r13+12], f220; +st.shared.f32 [r13+16], f230; +st.shared.f32 [r13+20], f240; +st.shared.f32 [r13+24], f250; +st.shared.f32 [r13+28], f260; +st.shared.f32 [r13+32], f270; +st.shared.f32 [r13+36], f280; +st.shared.f32 [r13+40], f290; +barrier.sync 0; +mad.lo.s32 r14, r11, -40, r13; +ld.shared.f32 f294, [r14]; +ld.shared.f32 f295, [r14+484]; +ld.shared.f32 f296, [r14+968]; +ld.shared.f32 f297, [r14+1452]; +ld.shared.f32 f298, [r14+1936]; +ld.shared.f32 f299, [r14+2420]; +ld.shared.f32 f300, [r14+2904]; 
+ld.shared.f32 f301, [r14+3388]; +ld.shared.f32 f302, [r14+3872]; +ld.shared.f32 f303, [r14+4356]; +ld.shared.f32 f304, [r14+4840]; +barrier.sync 0; +st.shared.f32 [r13], f74; +st.shared.f32 [r13+4], f203; +st.shared.f32 [r13+8], f213; +st.shared.f32 [r13+12], f223; +st.shared.f32 [r13+16], f233; +st.shared.f32 [r13+20], f243; +st.shared.f32 [r13+24], f253; +st.shared.f32 [r13+28], f263; +st.shared.f32 [r13+32], f273; +st.shared.f32 [r13+36], f283; +st.shared.f32 [r13+40], f293; +barrier.sync 0; +ld.shared.f32 f305, [r14]; +ld.shared.f32 f306, [r14+484]; +ld.shared.f32 f307, [r14+968]; +ld.shared.f32 f308, [r14+1452]; +ld.shared.f32 f309, [r14+1936]; +ld.shared.f32 f310, [r14+2420]; +ld.shared.f32 f311, [r14+2904]; +ld.shared.f32 f312, [r14+3388]; +ld.shared.f32 f313, [r14+3872]; +ld.shared.f32 f314, [r14+4356]; +ld.shared.f32 f315, [r14+4840]; +add.f32 f316, f295, f304; +add.f32 f317, f306, f315; +sub.f32 f318, f295, f304; +sub.f32 f319, f306, f315; +add.f32 f320, f296, f303; +add.f32 f321, f307, f314; +sub.f32 f322, f296, f303; +sub.f32 f323, f307, f314; +add.f32 f324, f297, f302; +add.f32 f325, f308, f313; +sub.f32 f326, f297, f302; +sub.f32 f327, f308, f313; +add.f32 f328, f298, f301; +add.f32 f329, f309, f312; +sub.f32 f330, f298, f301; +sub.f32 f331, f309, f312; +add.f32 f332, f299, f300; +add.f32 f333, f310, f311; +sub.f32 f334, f299, f300; +sub.f32 f335, f310, f311; +add.f32 f336, f294, f316; +add.f32 f337, f305, f317; +add.f32 f338, f336, f320; +add.f32 f339, f337, f321; +add.f32 f340, f338, f324; +add.f32 f341, f339, f325; +add.f32 f342, f340, f328; +add.f32 f343, f341, f329; +add.f32 f344, f342, f332; +add.f32 f345, f343, f333; +fma.rn.f32 f346, f316, 0f3F575C64, f294; +fma.rn.f32 f347, f319, 0f3F0A6770, 0f00000000; +fma.rn.f32 f348, f317, 0f3F575C64, f305; +fma.rn.f32 f349, f318, 0f3F0A6770, 0f00000000; +fma.rn.f32 f350, f320, 0f3ED4B147, f346; +fma.rn.f32 f351, f323, 0f3F68DDA4, f347; +fma.rn.f32 f352, f321, 0f3ED4B147, f348; +fma.rn.f32 f353, f322, 
0f3F68DDA4, f349; +fma.rn.f32 f354, f324, 0fBE11BAFB, f350; +fma.rn.f32 f355, f327, 0f3F7D64F0, f351; +fma.rn.f32 f356, f325, 0fBE11BAFB, f352; +fma.rn.f32 f357, f326, 0f3F7D64F0, f353; +fma.rn.f32 f358, f328, 0fBF27A4F4, f354; +fma.rn.f32 f359, f331, 0f3F4178CE, f355; +fma.rn.f32 f360, f329, 0fBF27A4F4, f356; +fma.rn.f32 f361, f330, 0f3F4178CE, f357; +fma.rn.f32 f362, f332, 0fBF75A155, f358; +fma.rn.f32 f363, f335, 0f3E903F40, f359; +fma.rn.f32 f364, f333, 0fBF75A155, f360; +fma.rn.f32 f365, f334, 0f3E903F40, f361; +sub.f32 f366, f362, f363; +add.f32 f367, f365, f364; +add.f32 f368, f363, f362; +sub.f32 f369, f364, f365; +fma.rn.f32 f370, f316, 0f3ED4B147, f294; +fma.rn.f32 f371, f319, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f372, f317, 0f3ED4B147, f305; +fma.rn.f32 f373, f318, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f374, f320, 0fBF27A4F4, f370; +fma.rn.f32 f375, f323, 0f3F4178CE, f371; +fma.rn.f32 f376, f321, 0fBF27A4F4, f372; +fma.rn.f32 f377, f322, 0f3F4178CE, f373; +fma.rn.f32 f378, f324, 0fBF75A155, f374; +fma.rn.f32 f379, f327, 0fBE903F40, f375; +fma.rn.f32 f380, f325, 0fBF75A155, f376; +fma.rn.f32 f381, f326, 0fBE903F40, f377; +fma.rn.f32 f382, f328, 0fBE11BAFB, f378; +fma.rn.f32 f383, f331, 0fBF7D64F0, f379; +fma.rn.f32 f384, f329, 0fBE11BAFB, f380; +fma.rn.f32 f385, f330, 0fBF7D64F0, f381; +fma.rn.f32 f386, f332, 0f3F575C64, f382; +fma.rn.f32 f387, f335, 0fBF0A6770, f383; +fma.rn.f32 f388, f333, 0f3F575C64, f384; +fma.rn.f32 f389, f334, 0fBF0A6770, f385; +sub.f32 f390, f386, f387; +add.f32 f391, f389, f388; +add.f32 f392, f387, f386; +sub.f32 f393, f388, f389; +fma.rn.f32 f394, f316, 0fBE11BAFB, f294; +fma.rn.f32 f395, f319, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f396, f317, 0fBE11BAFB, f305; +fma.rn.f32 f397, f318, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f398, f320, 0fBF75A155, f394; +fma.rn.f32 f399, f323, 0fBE903F40, f395; +fma.rn.f32 f400, f321, 0fBF75A155, f396; +fma.rn.f32 f401, f322, 0fBE903F40, f397; +fma.rn.f32 f402, f324, 0f3ED4B147, f398; +fma.rn.f32 f403, 
f327, 0fBF68DDA4, f399; +fma.rn.f32 f404, f325, 0f3ED4B147, f400; +fma.rn.f32 f405, f326, 0fBF68DDA4, f401; +fma.rn.f32 f406, f328, 0f3F575C64, f402; +fma.rn.f32 f407, f331, 0f3F0A6770, f403; +fma.rn.f32 f408, f329, 0f3F575C64, f404; +fma.rn.f32 f409, f330, 0f3F0A6770, f405; +fma.rn.f32 f410, f332, 0fBF27A4F4, f406; +fma.rn.f32 f411, f335, 0f3F4178CE, f407; +fma.rn.f32 f412, f333, 0fBF27A4F4, f408; +fma.rn.f32 f413, f334, 0f3F4178CE, f409; +sub.f32 f414, f410, f411; +add.f32 f415, f413, f412; +add.f32 f416, f411, f410; +sub.f32 f417, f412, f413; +fma.rn.f32 f418, f316, 0fBF27A4F4, f294; +fma.rn.f32 f419, f319, 0f3F4178CE, 0f00000000; +fma.rn.f32 f420, f317, 0fBF27A4F4, f305; +fma.rn.f32 f421, f318, 0f3F4178CE, 0f00000000; +fma.rn.f32 f422, f320, 0fBE11BAFB, f418; +fma.rn.f32 f423, f323, 0fBF7D64F0, f419; +fma.rn.f32 f424, f321, 0fBE11BAFB, f420; +fma.rn.f32 f425, f322, 0fBF7D64F0, f421; +fma.rn.f32 f426, f324, 0f3F575C64, f422; +fma.rn.f32 f427, f327, 0f3F0A6770, f423; +fma.rn.f32 f428, f325, 0f3F575C64, f424; +fma.rn.f32 f429, f326, 0f3F0A6770, f425; +fma.rn.f32 f430, f328, 0fBF75A155, f426; +fma.rn.f32 f431, f331, 0f3E903F40, f427; +fma.rn.f32 f432, f329, 0fBF75A155, f428; +fma.rn.f32 f433, f330, 0f3E903F40, f429; +fma.rn.f32 f434, f332, 0f3ED4B147, f430; +fma.rn.f32 f435, f335, 0fBF68DDA4, f431; +fma.rn.f32 f436, f333, 0f3ED4B147, f432; +fma.rn.f32 f437, f334, 0fBF68DDA4, f433; +sub.f32 f438, f434, f435; +add.f32 f439, f437, f436; +add.f32 f440, f435, f434; +sub.f32 f441, f436, f437; +fma.rn.f32 f442, f316, 0fBF75A155, f294; +fma.rn.f32 f443, f319, 0f3E903F40, 0f00000000; +fma.rn.f32 f444, f317, 0fBF75A155, f305; +fma.rn.f32 f445, f318, 0f3E903F40, 0f00000000; +fma.rn.f32 f446, f320, 0f3F575C64, f442; +fma.rn.f32 f447, f323, 0fBF0A6770, f443; +fma.rn.f32 f448, f321, 0f3F575C64, f444; +fma.rn.f32 f449, f322, 0fBF0A6770, f445; +fma.rn.f32 f450, f324, 0fBF27A4F4, f446; +fma.rn.f32 f451, f327, 0f3F4178CE, f447; +fma.rn.f32 f452, f325, 0fBF27A4F4, f448; +fma.rn.f32 
f453, f326, 0f3F4178CE, f449; +fma.rn.f32 f454, f328, 0f3ED4B147, f450; +fma.rn.f32 f455, f331, 0fBF68DDA4, f451; +fma.rn.f32 f456, f329, 0f3ED4B147, f452; +fma.rn.f32 f457, f330, 0fBF68DDA4, f453; +fma.rn.f32 f458, f332, 0fBE11BAFB, f454; +fma.rn.f32 f459, f335, 0f3F7D64F0, f455; +fma.rn.f32 f460, f333, 0fBE11BAFB, f456; +fma.rn.f32 f461, f334, 0f3F7D64F0, f457; +sub.f32 f462, f458, f459; +add.f32 f463, f461, f460; +add.f32 f464, f459, f458; +sub.f32 f465, f460, f461; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f466, f467}, [rd11]; +mul.f32 f470, f367, f467; +fma.rn.f32 f471, f466, f366, f470; +mul.f32 f472, f366, f467; +mul.f32 f473, f466, f367; +sub.f32 f474, f473, f472; +mul.f32 f475, f466, f466; +mul.f32 f476, f467, f467; +sub.f32 f477, f475, f476; +mul.f32 f478, f467, f466; +fma.rn.f32 f479, f467, f466, f478; +mul.f32 f480, f391, f479; +fma.rn.f32 f481, f477, f390, f480; +mul.f32 f482, f390, f479; +mul.f32 f483, f477, f391; +sub.f32 f484, f483, f482; +mul.f32 f485, f466, f477; +mul.f32 f486, f467, f479; +sub.f32 f487, f485, f486; +mul.f32 f488, f466, f479; +fma.rn.f32 f489, f467, f477, f488; +mul.f32 f490, f415, f489; +fma.rn.f32 f491, f487, f414, f490; +mul.f32 f492, f414, f489; +mul.f32 f493, f487, f415; +sub.f32 f494, f493, f492; +mul.f32 f495, f466, f487; +mul.f32 f496, f467, f489; +sub.f32 f497, f495, f496; +mul.f32 f498, f466, f489; +fma.rn.f32 f499, f467, f487, f498; +mul.f32 f500, f439, f499; +fma.rn.f32 f501, f497, f438, f500; +mul.f32 f502, f438, f499; +mul.f32 f503, f497, f439; +sub.f32 f504, f503, f502; +mul.f32 f505, f466, f497; +mul.f32 f506, f467, f499; +sub.f32 f507, f505, f506; +mul.f32 f508, f466, f499; +fma.rn.f32 f509, f467, f497, f508; +mul.f32 f510, f463, f509; +fma.rn.f32 f511, f507, f462, f510; +mul.f32 f512, f462, f509; +mul.f32 f513, f507, f463; +sub.f32 
f514, f513, f512; +mul.f32 f515, f466, f507; +mul.f32 f516, f467, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f466, f509; +fma.rn.f32 f519, f467, f507, f518; +mul.f32 f520, f465, f519; +fma.rn.f32 f521, f517, f464, f520; +mul.f32 f522, f464, f519; +mul.f32 f523, f517, f465; +sub.f32 f524, f523, f522; +mul.f32 f525, f466, f517; +mul.f32 f526, f467, f519; +sub.f32 f527, f525, f526; +mul.f32 f528, f466, f519; +fma.rn.f32 f529, f467, f517, f528; +mul.f32 f530, f441, f529; +fma.rn.f32 f531, f527, f440, f530; +mul.f32 f532, f440, f529; +mul.f32 f533, f527, f441; +sub.f32 f534, f533, f532; +mul.f32 f535, f466, f527; +mul.f32 f536, f467, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f466, f529; +fma.rn.f32 f539, f467, f527, f538; +mul.f32 f540, f417, f539; +fma.rn.f32 f541, f537, f416, f540; +mul.f32 f542, f416, f539; +mul.f32 f543, f537, f417; +sub.f32 f544, f543, f542; +mul.f32 f545, f466, f537; +mul.f32 f546, f467, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f466, f539; +fma.rn.f32 f549, f467, f537, f548; +mul.f32 f550, f393, f549; +fma.rn.f32 f551, f547, f392, f550; +mul.f32 f552, f392, f549; +mul.f32 f553, f547, f393; +sub.f32 f554, f553, f552; +mul.f32 f555, f466, f547; +mul.f32 f556, f467, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f466, f549; +fma.rn.f32 f559, f467, f547, f558; +mul.f32 f560, f369, f559; +fma.rn.f32 f561, f557, f368, f560; +mul.f32 f562, f368, f559; +mul.f32 f563, f557, f369; +sub.f32 f564, f563, f562; +shl.b32 r18, r17, 2; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 484, r19; +st.shared.f32 [r20], f344; +st.shared.f32 [r20+44], f471; +st.shared.f32 [r20+88], f481; +st.shared.f32 [r20+132], f491; +st.shared.f32 [r20+176], f501; +st.shared.f32 [r20+220], f511; +st.shared.f32 [r20+264], f521; +st.shared.f32 [r20+308], f531; +st.shared.f32 [r20+352], f541; +st.shared.f32 [r20+396], f551; +st.shared.f32 [r20+440], f561; +barrier.sync 0; +ld.shared.f32 f565, [r14]; +ld.shared.f32 f566, [r14+484]; +ld.shared.f32 f567, 
[r14+968]; +ld.shared.f32 f568, [r14+1452]; +ld.shared.f32 f569, [r14+1936]; +ld.shared.f32 f570, [r14+2420]; +ld.shared.f32 f571, [r14+2904]; +ld.shared.f32 f572, [r14+3388]; +ld.shared.f32 f573, [r14+3872]; +ld.shared.f32 f574, [r14+4356]; +ld.shared.f32 f575, [r14+4840]; +barrier.sync 0; +st.shared.f32 [r20], f345; +st.shared.f32 [r20+44], f474; +st.shared.f32 [r20+88], f484; +st.shared.f32 [r20+132], f494; +st.shared.f32 [r20+176], f504; +st.shared.f32 [r20+220], f514; +st.shared.f32 [r20+264], f524; +st.shared.f32 [r20+308], f534; +st.shared.f32 [r20+352], f544; +st.shared.f32 [r20+396], f554; +st.shared.f32 [r20+440], f564; +barrier.sync 0; +ld.shared.f32 f576, [r14]; +ld.shared.f32 f577, [r14+484]; +ld.shared.f32 f578, [r14+968]; +ld.shared.f32 f579, [r14+1452]; +ld.shared.f32 f580, [r14+1936]; +ld.shared.f32 f581, [r14+2420]; +ld.shared.f32 f582, [r14+2904]; +ld.shared.f32 f583, [r14+3388]; +ld.shared.f32 f584, [r14+3872]; +ld.shared.f32 f585, [r14+4356]; +ld.shared.f32 f586, [r14+4840]; +add.f32 f587, f566, f575; +add.f32 f588, f577, f586; +sub.f32 f589, f566, f575; +sub.f32 f590, f577, f586; +add.f32 f591, f567, f574; +add.f32 f592, f578, f585; +sub.f32 f593, f567, f574; +sub.f32 f594, f578, f585; +add.f32 f595, f568, f573; +add.f32 f596, f579, f584; +sub.f32 f597, f568, f573; +sub.f32 f598, f579, f584; +add.f32 f599, f569, f572; +add.f32 f600, f580, f583; +sub.f32 f601, f569, f572; +sub.f32 f602, f580, f583; +add.f32 f603, f570, f571; +add.f32 f604, f581, f582; +sub.f32 f605, f570, f571; +sub.f32 f606, f581, f582; +add.f32 f607, f565, f587; +add.f32 f608, f576, f588; +add.f32 f609, f607, f591; +add.f32 f610, f608, f592; +add.f32 f611, f609, f595; +add.f32 f612, f610, f596; +add.f32 f613, f611, f599; +add.f32 f614, f612, f600; +fma.rn.f32 f615, f587, 0f3F575C64, f565; +fma.rn.f32 f616, f590, 0f3F0A6770, 0f00000000; +fma.rn.f32 f617, f588, 0f3F575C64, f576; +fma.rn.f32 f618, f589, 0f3F0A6770, 0f00000000; +fma.rn.f32 f619, f591, 0f3ED4B147, f615; 
+fma.rn.f32 f620, f594, 0f3F68DDA4, f616; +fma.rn.f32 f621, f592, 0f3ED4B147, f617; +fma.rn.f32 f622, f593, 0f3F68DDA4, f618; +fma.rn.f32 f623, f595, 0fBE11BAFB, f619; +fma.rn.f32 f624, f598, 0f3F7D64F0, f620; +fma.rn.f32 f625, f596, 0fBE11BAFB, f621; +fma.rn.f32 f626, f597, 0f3F7D64F0, f622; +fma.rn.f32 f627, f599, 0fBF27A4F4, f623; +fma.rn.f32 f628, f602, 0f3F4178CE, f624; +fma.rn.f32 f629, f600, 0fBF27A4F4, f625; +fma.rn.f32 f630, f601, 0f3F4178CE, f626; +fma.rn.f32 f631, f603, 0fBF75A155, f627; +fma.rn.f32 f632, f606, 0f3E903F40, f628; +fma.rn.f32 f633, f604, 0fBF75A155, f629; +fma.rn.f32 f634, f605, 0f3E903F40, f630; +fma.rn.f32 f635, f587, 0f3ED4B147, f565; +fma.rn.f32 f636, f590, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f637, f588, 0f3ED4B147, f576; +fma.rn.f32 f638, f589, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f639, f591, 0fBF27A4F4, f635; +fma.rn.f32 f640, f594, 0f3F4178CE, f636; +fma.rn.f32 f641, f592, 0fBF27A4F4, f637; +fma.rn.f32 f642, f593, 0f3F4178CE, f638; +fma.rn.f32 f643, f595, 0fBF75A155, f639; +fma.rn.f32 f644, f598, 0fBE903F40, f640; +fma.rn.f32 f645, f596, 0fBF75A155, f641; +fma.rn.f32 f646, f597, 0fBE903F40, f642; +fma.rn.f32 f647, f599, 0fBE11BAFB, f643; +fma.rn.f32 f648, f602, 0fBF7D64F0, f644; +fma.rn.f32 f649, f600, 0fBE11BAFB, f645; +fma.rn.f32 f650, f601, 0fBF7D64F0, f646; +fma.rn.f32 f651, f603, 0f3F575C64, f647; +fma.rn.f32 f652, f606, 0fBF0A6770, f648; +fma.rn.f32 f653, f604, 0f3F575C64, f649; +fma.rn.f32 f654, f605, 0fBF0A6770, f650; +fma.rn.f32 f655, f587, 0fBE11BAFB, f565; +fma.rn.f32 f656, f590, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f657, f588, 0fBE11BAFB, f576; +fma.rn.f32 f658, f589, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f659, f591, 0fBF75A155, f655; +fma.rn.f32 f660, f594, 0fBE903F40, f656; +fma.rn.f32 f661, f592, 0fBF75A155, f657; +fma.rn.f32 f662, f593, 0fBE903F40, f658; +fma.rn.f32 f663, f595, 0f3ED4B147, f659; +fma.rn.f32 f664, f598, 0fBF68DDA4, f660; +fma.rn.f32 f665, f596, 0f3ED4B147, f661; +fma.rn.f32 f666, f597, 0fBF68DDA4, f662; 
+fma.rn.f32 f667, f599, 0f3F575C64, f663; +fma.rn.f32 f668, f602, 0f3F0A6770, f664; +fma.rn.f32 f669, f600, 0f3F575C64, f665; +fma.rn.f32 f670, f601, 0f3F0A6770, f666; +fma.rn.f32 f671, f603, 0fBF27A4F4, f667; +fma.rn.f32 f672, f606, 0f3F4178CE, f668; +fma.rn.f32 f673, f604, 0fBF27A4F4, f669; +fma.rn.f32 f674, f605, 0f3F4178CE, f670; +fma.rn.f32 f675, f587, 0fBF27A4F4, f565; +fma.rn.f32 f676, f590, 0f3F4178CE, 0f00000000; +fma.rn.f32 f677, f588, 0fBF27A4F4, f576; +fma.rn.f32 f678, f589, 0f3F4178CE, 0f00000000; +fma.rn.f32 f679, f591, 0fBE11BAFB, f675; +fma.rn.f32 f680, f594, 0fBF7D64F0, f676; +fma.rn.f32 f681, f592, 0fBE11BAFB, f677; +fma.rn.f32 f682, f593, 0fBF7D64F0, f678; +fma.rn.f32 f683, f595, 0f3F575C64, f679; +fma.rn.f32 f684, f598, 0f3F0A6770, f680; +fma.rn.f32 f685, f596, 0f3F575C64, f681; +fma.rn.f32 f686, f597, 0f3F0A6770, f682; +fma.rn.f32 f687, f599, 0fBF75A155, f683; +fma.rn.f32 f688, f602, 0f3E903F40, f684; +fma.rn.f32 f689, f600, 0fBF75A155, f685; +fma.rn.f32 f690, f601, 0f3E903F40, f686; +fma.rn.f32 f691, f603, 0f3ED4B147, f687; +fma.rn.f32 f692, f606, 0fBF68DDA4, f688; +fma.rn.f32 f693, f604, 0f3ED4B147, f689; +fma.rn.f32 f694, f605, 0fBF68DDA4, f690; +fma.rn.f32 f695, f587, 0fBF75A155, f565; +fma.rn.f32 f696, f590, 0f3E903F40, 0f00000000; +fma.rn.f32 f697, f588, 0fBF75A155, f576; +fma.rn.f32 f698, f589, 0f3E903F40, 0f00000000; +fma.rn.f32 f699, f591, 0f3F575C64, f695; +fma.rn.f32 f700, f594, 0fBF0A6770, f696; +fma.rn.f32 f701, f592, 0f3F575C64, f697; +fma.rn.f32 f702, f593, 0fBF0A6770, f698; +fma.rn.f32 f703, f595, 0fBF27A4F4, f699; +fma.rn.f32 f704, f598, 0f3F4178CE, f700; +fma.rn.f32 f705, f596, 0fBF27A4F4, f701; +fma.rn.f32 f706, f597, 0f3F4178CE, f702; +fma.rn.f32 f707, f599, 0f3ED4B147, f703; +fma.rn.f32 f708, f602, 0fBF68DDA4, f704; +fma.rn.f32 f709, f600, 0f3ED4B147, f705; +fma.rn.f32 f710, f601, 0fBF68DDA4, f706; +fma.rn.f32 f711, f603, 0fBE11BAFB, f707; +fma.rn.f32 f712, f606, 0f3F7D64F0, f708; +fma.rn.f32 f713, f604, 0fBE11BAFB, f709; 
+fma.rn.f32 f714, f605, 0f3F7D64F0, f710; +add.f32 %0, f613, f603; +add.f32 %1, f614, f604; +add.f32 %3, f634, f633; +sub.f32 %2, f631, f632; +add.f32 %5, f654, f653; +sub.f32 %4, f651, f652; +add.f32 %7, f674, f673; +sub.f32 %6, f671, f672; +add.f32 %9, f694, f693; +sub.f32 %8, f691, f692; +add.f32 %11, f714, f713; +sub.f32 %10, f711, f712; +sub.f32 %13, f713, f714; +add.f32 %12, f712, f711; +sub.f32 %15, f693, f694; +add.f32 %14, f692, f691; +sub.f32 %17, f673, f674; +add.f32 %16, f672, f671; +sub.f32 %19, f653, f654; +add.f32 %18, f652, f651; +sub.f32 %21, f633, f634; +add.f32 %20, f632, f631; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y): "r"(smem), "l"(lut_sp_11_1331), "l"(lut_sp_11_121), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..3f2be2f13c6f1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_fwd.hpp.inc @@ -0,0 +1,1502 @@ 
+#ifndef CUFFTDX_FFT_1331_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_1331_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<556, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<735>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 10648, r2; +add.f64 fd45, %27, %51; +add.f64 fd46, %29, %52; +sub.f64 fd47, %27, %51; +sub.f64 fd48, %29, %52; +add.f64 fd49, %30, %49; +add.f64 fd50, %32, %50; +sub.f64 fd51, %30, %49; +sub.f64 fd52, %32, %50; +add.f64 fd53, %33, %46; +add.f64 fd54, %34, %48; +sub.f64 fd55, %33, %46; +sub.f64 fd56, %34, %48; +add.f64 fd57, %35, %43; +add.f64 fd58, %37, %45; +sub.f64 fd59, %35, %43; +sub.f64 fd60, %37, %45; +add.f64 fd61, %38, %41; +add.f64 fd62, %40, %42; +sub.f64 fd63, %38, %41; +sub.f64 fd64, %40, %42; +mov.u32 r4, %tid.x; +add.f64 fd65, %25, fd45; +add.f64 fd66, %26, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +add.f64 fd73, fd71, fd61; +add.f64 fd74, fd72, fd62; +fma.rn.f64 fd75, fd45, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd76, fd48, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd46, 0d3FEAEB8C8764F0BA, %26; +fma.rn.f64 fd78, fd47, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd79, fd49, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd52, 0dBFED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd50, 0d3FDA9628D9C712B6, fd77; +fma.rn.f64 fd82, fd51, 0dBFED1BB48EEE2C13, fd78; +fma.rn.f64 fd83, fd53, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd56, 0dBFEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd54, 0dBFC2375F640F44DB, fd81; +fma.rn.f64 fd86, fd55, 0dBFEFAC9E043842EF, fd82; +fma.rn.f64 fd87, fd57, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd60, 0dBFE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd58, 0dBFE4F49E7F775887, fd85; +fma.rn.f64 fd90, fd59, 0dBFE82F19BB3A28A1, fd86; 
+fma.rn.f64 fd91, fd61, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd64, 0dBFD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd62, 0dBFEEB42A9BCD5057, fd89; +fma.rn.f64 fd94, fd63, 0dBFD207E7FD768DBF, fd90; +sub.f64 fd95, fd91, fd92; +add.f64 fd96, fd94, fd93; +add.f64 fd97, fd92, fd91; +sub.f64 fd98, fd93, fd94; +fma.rn.f64 fd99, fd45, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd100, fd48, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd46, 0d3FDA9628D9C712B6, %26; +fma.rn.f64 fd102, fd47, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd103, fd49, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd52, 0dBFE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd50, 0dBFE4F49E7F775887, fd101; +fma.rn.f64 fd106, fd51, 0dBFE82F19BB3A28A1, fd102; +fma.rn.f64 fd107, fd53, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd56, 0d3FD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd54, 0dBFEEB42A9BCD5057, fd105; +fma.rn.f64 fd110, fd55, 0d3FD207E7FD768DBF, fd106; +fma.rn.f64 fd111, fd57, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd60, 0d3FEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd58, 0dBFC2375F640F44DB, fd109; +fma.rn.f64 fd114, fd59, 0d3FEFAC9E043842EF, fd110; +fma.rn.f64 fd115, fd61, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd64, 0d3FE14CEDF8BB580B, fd112; +fma.rn.f64 fd117, fd62, 0d3FEAEB8C8764F0BA, fd113; +fma.rn.f64 fd118, fd63, 0d3FE14CEDF8BB580B, fd114; +sub.f64 fd119, fd115, fd116; +add.f64 fd120, fd118, fd117; +add.f64 fd121, fd116, fd115; +sub.f64 fd122, fd117, fd118; +fma.rn.f64 fd123, fd45, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd124, fd48, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd46, 0dBFC2375F640F44DB, %26; +fma.rn.f64 fd126, fd47, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd127, fd49, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd52, 0d3FD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd50, 0dBFEEB42A9BCD5057, fd125; +fma.rn.f64 fd130, fd51, 0d3FD207E7FD768DBF, fd126; +fma.rn.f64 fd131, fd53, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, 
fd56, 0d3FED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd54, 0d3FDA9628D9C712B6, fd129; +fma.rn.f64 fd134, fd55, 0d3FED1BB48EEE2C13, fd130; +fma.rn.f64 fd135, fd57, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd60, 0dBFE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd58, 0d3FEAEB8C8764F0BA, fd133; +fma.rn.f64 fd138, fd59, 0dBFE14CEDF8BB580B, fd134; +fma.rn.f64 fd139, fd61, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd64, 0dBFE82F19BB3A28A1, fd136; +fma.rn.f64 fd141, fd62, 0dBFE4F49E7F775887, fd137; +fma.rn.f64 fd142, fd63, 0dBFE82F19BB3A28A1, fd138; +sub.f64 fd143, fd139, fd140; +add.f64 fd144, fd142, fd141; +add.f64 fd145, fd140, fd139; +sub.f64 fd146, fd141, fd142; +fma.rn.f64 fd147, fd45, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd148, fd48, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd46, 0dBFE4F49E7F775887, %26; +fma.rn.f64 fd150, fd47, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd151, fd49, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd52, 0d3FEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd50, 0dBFC2375F640F44DB, fd149; +fma.rn.f64 fd154, fd51, 0d3FEFAC9E043842EF, fd150; +fma.rn.f64 fd155, fd53, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd56, 0dBFE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd54, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd55, 0dBFE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd57, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd60, 0dBFD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd58, 0dBFEEB42A9BCD5057, fd157; +fma.rn.f64 fd162, fd59, 0dBFD207E7FD768DBF, fd158; +fma.rn.f64 fd163, fd61, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd64, 0d3FED1BB48EEE2C13, fd160; +fma.rn.f64 fd165, fd62, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd63, 0d3FED1BB48EEE2C13, fd162; +sub.f64 fd167, fd163, fd164; +add.f64 fd168, fd166, fd165; +add.f64 fd169, fd164, fd163; +sub.f64 fd170, fd165, fd166; +fma.rn.f64 fd171, fd45, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd172, fd48, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd46, 
0dBFEEB42A9BCD5057, %26; +fma.rn.f64 fd174, fd47, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd175, fd49, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd52, 0d3FE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd50, 0d3FEAEB8C8764F0BA, fd173; +fma.rn.f64 fd178, fd51, 0d3FE14CEDF8BB580B, fd174; +fma.rn.f64 fd179, fd53, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd56, 0dBFE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd54, 0dBFE4F49E7F775887, fd177; +fma.rn.f64 fd182, fd55, 0dBFE82F19BB3A28A1, fd178; +fma.rn.f64 fd183, fd57, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd60, 0d3FED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd58, 0d3FDA9628D9C712B6, fd181; +fma.rn.f64 fd186, fd59, 0d3FED1BB48EEE2C13, fd182; +fma.rn.f64 fd187, fd61, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd64, 0dBFEFAC9E043842EF, fd184; +fma.rn.f64 fd189, fd62, 0dBFC2375F640F44DB, fd185; +fma.rn.f64 fd190, fd63, 0dBFEFAC9E043842EF, fd186; +sub.f64 fd191, fd187, fd188; +add.f64 fd192, fd190, fd189; +add.f64 fd193, fd188, fd187; +sub.f64 fd194, fd189, fd190; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd195, fd196}, [rd6]; +mul.f64 fd199, fd195, fd95; +mul.f64 fd200, fd196, fd96; +sub.f64 fd201, fd199, fd200; +mul.f64 fd202, fd195, fd96; +fma.rn.f64 fd203, fd196, fd95, fd202; +mul.f64 fd204, fd195, fd195; +mul.f64 fd205, fd196, fd196; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd196, fd195; +fma.rn.f64 fd208, fd196, fd195, fd207; +mul.f64 fd209, fd206, fd119; +mul.f64 fd210, fd208, fd120; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd206, fd120; +fma.rn.f64 fd213, fd208, fd119, fd212; +mul.f64 fd214, fd195, fd206; +mul.f64 fd215, fd196, fd208; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd195, fd208; +fma.rn.f64 fd218, fd196, fd206, fd217; 
+mul.f64 fd219, fd216, fd143; +mul.f64 fd220, fd218, fd144; +sub.f64 fd221, fd219, fd220; +mul.f64 fd222, fd216, fd144; +fma.rn.f64 fd223, fd218, fd143, fd222; +mul.f64 fd224, fd195, fd216; +mul.f64 fd225, fd196, fd218; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd195, fd218; +fma.rn.f64 fd228, fd196, fd216, fd227; +mul.f64 fd229, fd226, fd167; +mul.f64 fd230, fd228, fd168; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd226, fd168; +fma.rn.f64 fd233, fd228, fd167, fd232; +mul.f64 fd234, fd195, fd226; +mul.f64 fd235, fd196, fd228; +sub.f64 fd236, fd234, fd235; +mul.f64 fd237, fd195, fd228; +fma.rn.f64 fd238, fd196, fd226, fd237; +mul.f64 fd239, fd236, fd191; +mul.f64 fd240, fd238, fd192; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd236, fd192; +fma.rn.f64 fd243, fd238, fd191, fd242; +ld.global.v2.f64 {fd244, fd245}, [rd6+1936]; +mul.f64 fd248, fd244, fd193; +mul.f64 fd249, fd245, fd194; +sub.f64 fd250, fd248, fd249; +mul.f64 fd251, fd244, fd194; +fma.rn.f64 fd252, fd245, fd193, fd251; +mul.f64 fd253, fd195, fd244; +mul.f64 fd254, fd196, fd245; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd195, fd245; +fma.rn.f64 fd257, fd196, fd244, fd256; +mul.f64 fd258, fd255, fd169; +mul.f64 fd259, fd257, fd170; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd255, fd170; +fma.rn.f64 fd262, fd257, fd169, fd261; +mul.f64 fd263, fd195, fd255; +mul.f64 fd264, fd196, fd257; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd195, fd257; +fma.rn.f64 fd267, fd196, fd255, fd266; +mul.f64 fd268, fd265, fd145; +mul.f64 fd269, fd267, fd146; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd265, fd146; +fma.rn.f64 fd272, fd267, fd145, fd271; +mul.f64 fd273, fd195, fd265; +mul.f64 fd274, fd196, fd267; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd195, fd267; +fma.rn.f64 fd277, fd196, fd265, fd276; +mul.f64 fd278, fd275, fd121; +mul.f64 fd279, fd277, fd122; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd275, fd122; +fma.rn.f64 fd282, fd277, fd121, fd281; +mul.f64 fd283, fd195, fd275; 
+mul.f64 fd284, fd196, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd195, fd277; +fma.rn.f64 fd287, fd196, fd275, fd286; +mul.f64 fd288, fd285, fd97; +mul.f64 fd289, fd287, fd98; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd285, fd98; +fma.rn.f64 fd292, fd287, fd97, fd291; +mad.lo.s32 r12, r9, 10648, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 88, r12; +st.shared.f64 [r13], fd73; +st.shared.f64 [r13+8], fd201; +st.shared.f64 [r13+16], fd211; +st.shared.f64 [r13+24], fd221; +st.shared.f64 [r13+32], fd231; +st.shared.f64 [r13+40], fd241; +st.shared.f64 [r13+48], fd250; +st.shared.f64 [r13+56], fd260; +st.shared.f64 [r13+64], fd270; +st.shared.f64 [r13+72], fd280; +st.shared.f64 [r13+80], fd290; +barrier.sync 0; +mad.lo.s32 r14, r11, -80, r13; +ld.shared.f64 fd293, [r14]; +ld.shared.f64 fd294, [r14+968]; +ld.shared.f64 fd295, [r14+1936]; +ld.shared.f64 fd296, [r14+2904]; +ld.shared.f64 fd297, [r14+3872]; +ld.shared.f64 fd298, [r14+4840]; +ld.shared.f64 fd299, [r14+5808]; +ld.shared.f64 fd300, [r14+6776]; +ld.shared.f64 fd301, [r14+7744]; +ld.shared.f64 fd302, [r14+8712]; +ld.shared.f64 fd303, [r14+9680]; +barrier.sync 0; +st.shared.f64 [r13], fd74; +st.shared.f64 [r13+8], fd203; +st.shared.f64 [r13+16], fd213; +st.shared.f64 [r13+24], fd223; +st.shared.f64 [r13+32], fd233; +st.shared.f64 [r13+40], fd243; +st.shared.f64 [r13+48], fd252; +st.shared.f64 [r13+56], fd262; +st.shared.f64 [r13+64], fd272; +st.shared.f64 [r13+72], fd282; +st.shared.f64 [r13+80], fd292; +barrier.sync 0; +ld.shared.f64 fd304, [r14]; +ld.shared.f64 fd305, [r14+968]; +ld.shared.f64 fd306, [r14+1936]; +ld.shared.f64 fd307, [r14+2904]; +ld.shared.f64 fd308, [r14+3872]; +ld.shared.f64 fd309, [r14+4840]; +ld.shared.f64 fd310, [r14+5808]; +ld.shared.f64 fd311, [r14+6776]; +ld.shared.f64 fd312, [r14+7744]; +ld.shared.f64 fd313, [r14+8712]; +ld.shared.f64 fd314, [r14+9680]; +add.f64 fd315, fd294, fd303; +add.f64 fd316, fd305, fd314; +sub.f64 fd317, fd294, fd303; +sub.f64 fd318, fd305, fd314; 
+add.f64 fd319, fd295, fd302; +add.f64 fd320, fd306, fd313; +sub.f64 fd321, fd295, fd302; +sub.f64 fd322, fd306, fd313; +add.f64 fd323, fd296, fd301; +add.f64 fd324, fd307, fd312; +sub.f64 fd325, fd296, fd301; +sub.f64 fd326, fd307, fd312; +add.f64 fd327, fd297, fd300; +add.f64 fd328, fd308, fd311; +sub.f64 fd329, fd297, fd300; +sub.f64 fd330, fd308, fd311; +add.f64 fd331, fd298, fd299; +add.f64 fd332, fd309, fd310; +sub.f64 fd333, fd298, fd299; +sub.f64 fd334, fd309, fd310; +add.f64 fd335, fd293, fd315; +add.f64 fd336, fd304, fd316; +add.f64 fd337, fd335, fd319; +add.f64 fd338, fd336, fd320; +add.f64 fd339, fd337, fd323; +add.f64 fd340, fd338, fd324; +add.f64 fd341, fd339, fd327; +add.f64 fd342, fd340, fd328; +add.f64 fd343, fd341, fd331; +add.f64 fd344, fd342, fd332; +fma.rn.f64 fd345, fd315, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd346, fd318, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd347, fd316, 0d3FEAEB8C8764F0BA, fd304; +fma.rn.f64 fd348, fd317, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd349, fd319, 0d3FDA9628D9C712B6, fd345; +fma.rn.f64 fd350, fd322, 0dBFED1BB48EEE2C13, fd346; +fma.rn.f64 fd351, fd320, 0d3FDA9628D9C712B6, fd347; +fma.rn.f64 fd352, fd321, 0dBFED1BB48EEE2C13, fd348; +fma.rn.f64 fd353, fd323, 0dBFC2375F640F44DB, fd349; +fma.rn.f64 fd354, fd326, 0dBFEFAC9E043842EF, fd350; +fma.rn.f64 fd355, fd324, 0dBFC2375F640F44DB, fd351; +fma.rn.f64 fd356, fd325, 0dBFEFAC9E043842EF, fd352; +fma.rn.f64 fd357, fd327, 0dBFE4F49E7F775887, fd353; +fma.rn.f64 fd358, fd330, 0dBFE82F19BB3A28A1, fd354; +fma.rn.f64 fd359, fd328, 0dBFE4F49E7F775887, fd355; +fma.rn.f64 fd360, fd329, 0dBFE82F19BB3A28A1, fd356; +fma.rn.f64 fd361, fd331, 0dBFEEB42A9BCD5057, fd357; +fma.rn.f64 fd362, fd334, 0dBFD207E7FD768DBF, fd358; +fma.rn.f64 fd363, fd332, 0dBFEEB42A9BCD5057, fd359; +fma.rn.f64 fd364, fd333, 0dBFD207E7FD768DBF, fd360; +sub.f64 fd365, fd361, fd362; +add.f64 fd366, fd364, fd363; +add.f64 fd367, fd362, fd361; +sub.f64 fd368, fd363, fd364; +fma.rn.f64 
fd369, fd315, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd370, fd318, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd371, fd316, 0d3FDA9628D9C712B6, fd304; +fma.rn.f64 fd372, fd317, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd373, fd319, 0dBFE4F49E7F775887, fd369; +fma.rn.f64 fd374, fd322, 0dBFE82F19BB3A28A1, fd370; +fma.rn.f64 fd375, fd320, 0dBFE4F49E7F775887, fd371; +fma.rn.f64 fd376, fd321, 0dBFE82F19BB3A28A1, fd372; +fma.rn.f64 fd377, fd323, 0dBFEEB42A9BCD5057, fd373; +fma.rn.f64 fd378, fd326, 0d3FD207E7FD768DBF, fd374; +fma.rn.f64 fd379, fd324, 0dBFEEB42A9BCD5057, fd375; +fma.rn.f64 fd380, fd325, 0d3FD207E7FD768DBF, fd376; +fma.rn.f64 fd381, fd327, 0dBFC2375F640F44DB, fd377; +fma.rn.f64 fd382, fd330, 0d3FEFAC9E043842EF, fd378; +fma.rn.f64 fd383, fd328, 0dBFC2375F640F44DB, fd379; +fma.rn.f64 fd384, fd329, 0d3FEFAC9E043842EF, fd380; +fma.rn.f64 fd385, fd331, 0d3FEAEB8C8764F0BA, fd381; +fma.rn.f64 fd386, fd334, 0d3FE14CEDF8BB580B, fd382; +fma.rn.f64 fd387, fd332, 0d3FEAEB8C8764F0BA, fd383; +fma.rn.f64 fd388, fd333, 0d3FE14CEDF8BB580B, fd384; +sub.f64 fd389, fd385, fd386; +add.f64 fd390, fd388, fd387; +add.f64 fd391, fd386, fd385; +sub.f64 fd392, fd387, fd388; +fma.rn.f64 fd393, fd315, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd394, fd318, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd395, fd316, 0dBFC2375F640F44DB, fd304; +fma.rn.f64 fd396, fd317, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd397, fd319, 0dBFEEB42A9BCD5057, fd393; +fma.rn.f64 fd398, fd322, 0d3FD207E7FD768DBF, fd394; +fma.rn.f64 fd399, fd320, 0dBFEEB42A9BCD5057, fd395; +fma.rn.f64 fd400, fd321, 0d3FD207E7FD768DBF, fd396; +fma.rn.f64 fd401, fd323, 0d3FDA9628D9C712B6, fd397; +fma.rn.f64 fd402, fd326, 0d3FED1BB48EEE2C13, fd398; +fma.rn.f64 fd403, fd324, 0d3FDA9628D9C712B6, fd399; +fma.rn.f64 fd404, fd325, 0d3FED1BB48EEE2C13, fd400; +fma.rn.f64 fd405, fd327, 0d3FEAEB8C8764F0BA, fd401; +fma.rn.f64 fd406, fd330, 0dBFE14CEDF8BB580B, fd402; +fma.rn.f64 fd407, fd328, 
0d3FEAEB8C8764F0BA, fd403; +fma.rn.f64 fd408, fd329, 0dBFE14CEDF8BB580B, fd404; +fma.rn.f64 fd409, fd331, 0dBFE4F49E7F775887, fd405; +fma.rn.f64 fd410, fd334, 0dBFE82F19BB3A28A1, fd406; +fma.rn.f64 fd411, fd332, 0dBFE4F49E7F775887, fd407; +fma.rn.f64 fd412, fd333, 0dBFE82F19BB3A28A1, fd408; +sub.f64 fd413, fd409, fd410; +add.f64 fd414, fd412, fd411; +add.f64 fd415, fd410, fd409; +sub.f64 fd416, fd411, fd412; +fma.rn.f64 fd417, fd315, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd418, fd318, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd419, fd316, 0dBFE4F49E7F775887, fd304; +fma.rn.f64 fd420, fd317, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd421, fd319, 0dBFC2375F640F44DB, fd417; +fma.rn.f64 fd422, fd322, 0d3FEFAC9E043842EF, fd418; +fma.rn.f64 fd423, fd320, 0dBFC2375F640F44DB, fd419; +fma.rn.f64 fd424, fd321, 0d3FEFAC9E043842EF, fd420; +fma.rn.f64 fd425, fd323, 0d3FEAEB8C8764F0BA, fd421; +fma.rn.f64 fd426, fd326, 0dBFE14CEDF8BB580B, fd422; +fma.rn.f64 fd427, fd324, 0d3FEAEB8C8764F0BA, fd423; +fma.rn.f64 fd428, fd325, 0dBFE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd327, 0dBFEEB42A9BCD5057, fd425; +fma.rn.f64 fd430, fd330, 0dBFD207E7FD768DBF, fd426; +fma.rn.f64 fd431, fd328, 0dBFEEB42A9BCD5057, fd427; +fma.rn.f64 fd432, fd329, 0dBFD207E7FD768DBF, fd428; +fma.rn.f64 fd433, fd331, 0d3FDA9628D9C712B6, fd429; +fma.rn.f64 fd434, fd334, 0d3FED1BB48EEE2C13, fd430; +fma.rn.f64 fd435, fd332, 0d3FDA9628D9C712B6, fd431; +fma.rn.f64 fd436, fd333, 0d3FED1BB48EEE2C13, fd432; +sub.f64 fd437, fd433, fd434; +add.f64 fd438, fd436, fd435; +add.f64 fd439, fd434, fd433; +sub.f64 fd440, fd435, fd436; +fma.rn.f64 fd441, fd315, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd442, fd318, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd443, fd316, 0dBFEEB42A9BCD5057, fd304; +fma.rn.f64 fd444, fd317, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd445, fd319, 0d3FEAEB8C8764F0BA, fd441; +fma.rn.f64 fd446, fd322, 0d3FE14CEDF8BB580B, fd442; +fma.rn.f64 fd447, fd320, 
0d3FEAEB8C8764F0BA, fd443; +fma.rn.f64 fd448, fd321, 0d3FE14CEDF8BB580B, fd444; +fma.rn.f64 fd449, fd323, 0dBFE4F49E7F775887, fd445; +fma.rn.f64 fd450, fd326, 0dBFE82F19BB3A28A1, fd446; +fma.rn.f64 fd451, fd324, 0dBFE4F49E7F775887, fd447; +fma.rn.f64 fd452, fd325, 0dBFE82F19BB3A28A1, fd448; +fma.rn.f64 fd453, fd327, 0d3FDA9628D9C712B6, fd449; +fma.rn.f64 fd454, fd330, 0d3FED1BB48EEE2C13, fd450; +fma.rn.f64 fd455, fd328, 0d3FDA9628D9C712B6, fd451; +fma.rn.f64 fd456, fd329, 0d3FED1BB48EEE2C13, fd452; +fma.rn.f64 fd457, fd331, 0dBFC2375F640F44DB, fd453; +fma.rn.f64 fd458, fd334, 0dBFEFAC9E043842EF, fd454; +fma.rn.f64 fd459, fd332, 0dBFC2375F640F44DB, fd455; +fma.rn.f64 fd460, fd333, 0dBFEFAC9E043842EF, fd456; +sub.f64 fd461, fd457, fd458; +add.f64 fd462, fd460, fd459; +add.f64 fd463, fd458, fd457; +sub.f64 fd464, fd459, fd460; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd465, fd466}, [rd11]; +mul.f64 fd469, fd465, fd365; +mul.f64 fd470, fd466, fd366; +sub.f64 fd471, fd469, fd470; +mul.f64 fd472, fd465, fd366; +fma.rn.f64 fd473, fd466, fd365, fd472; +mul.f64 fd474, fd465, fd465; +mul.f64 fd475, fd466, fd466; +sub.f64 fd476, fd474, fd475; +mul.f64 fd477, fd466, fd465; +fma.rn.f64 fd478, fd466, fd465, fd477; +mul.f64 fd479, fd476, fd389; +mul.f64 fd480, fd478, fd390; +sub.f64 fd481, fd479, fd480; +mul.f64 fd482, fd476, fd390; +fma.rn.f64 fd483, fd478, fd389, fd482; +mul.f64 fd484, fd465, fd476; +mul.f64 fd485, fd466, fd478; +sub.f64 fd486, fd484, fd485; +mul.f64 fd487, fd465, fd478; +fma.rn.f64 fd488, fd466, fd476, fd487; +mul.f64 fd489, fd486, fd413; +mul.f64 fd490, fd488, fd414; +sub.f64 fd491, fd489, fd490; +mul.f64 fd492, fd486, fd414; +fma.rn.f64 fd493, fd488, fd413, fd492; +mul.f64 fd494, fd465, fd486; +mul.f64 fd495, fd466, fd488; +sub.f64 fd496, fd494, fd495; +mul.f64 fd497, fd465, 
fd488; +fma.rn.f64 fd498, fd466, fd486, fd497; +mul.f64 fd499, fd496, fd437; +mul.f64 fd500, fd498, fd438; +sub.f64 fd501, fd499, fd500; +mul.f64 fd502, fd496, fd438; +fma.rn.f64 fd503, fd498, fd437, fd502; +mul.f64 fd504, fd465, fd496; +mul.f64 fd505, fd466, fd498; +sub.f64 fd506, fd504, fd505; +mul.f64 fd507, fd465, fd498; +fma.rn.f64 fd508, fd466, fd496, fd507; +mul.f64 fd509, fd506, fd461; +mul.f64 fd510, fd508, fd462; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, fd462; +fma.rn.f64 fd513, fd508, fd461, fd512; +ld.global.v2.f64 {fd514, fd515}, [rd11+176]; +mul.f64 fd518, fd514, fd463; +mul.f64 fd519, fd515, fd464; +sub.f64 fd520, fd518, fd519; +mul.f64 fd521, fd514, fd464; +fma.rn.f64 fd522, fd515, fd463, fd521; +mul.f64 fd523, fd465, fd514; +mul.f64 fd524, fd466, fd515; +sub.f64 fd525, fd523, fd524; +mul.f64 fd526, fd465, fd515; +fma.rn.f64 fd527, fd466, fd514, fd526; +mul.f64 fd528, fd525, fd439; +mul.f64 fd529, fd527, fd440; +sub.f64 fd530, fd528, fd529; +mul.f64 fd531, fd525, fd440; +fma.rn.f64 fd532, fd527, fd439, fd531; +mul.f64 fd533, fd465, fd525; +mul.f64 fd534, fd466, fd527; +sub.f64 fd535, fd533, fd534; +mul.f64 fd536, fd465, fd527; +fma.rn.f64 fd537, fd466, fd525, fd536; +mul.f64 fd538, fd535, fd415; +mul.f64 fd539, fd537, fd416; +sub.f64 fd540, fd538, fd539; +mul.f64 fd541, fd535, fd416; +fma.rn.f64 fd542, fd537, fd415, fd541; +mul.f64 fd543, fd465, fd535; +mul.f64 fd544, fd466, fd537; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd465, fd537; +fma.rn.f64 fd547, fd466, fd535, fd546; +mul.f64 fd548, fd545, fd391; +mul.f64 fd549, fd547, fd392; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd545, fd392; +fma.rn.f64 fd552, fd547, fd391, fd551; +mul.f64 fd553, fd465, fd545; +mul.f64 fd554, fd466, fd547; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd465, fd547; +fma.rn.f64 fd557, fd466, fd545, fd556; +mul.f64 fd558, fd555, fd367; +mul.f64 fd559, fd557, fd368; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd555, fd368; +fma.rn.f64 fd562, fd557, 
fd367, fd561; +shl.b32 r18, r17, 3; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 968, r19; +st.shared.f64 [r20], fd343; +st.shared.f64 [r20+88], fd471; +st.shared.f64 [r20+176], fd481; +st.shared.f64 [r20+264], fd491; +st.shared.f64 [r20+352], fd501; +st.shared.f64 [r20+440], fd511; +st.shared.f64 [r20+528], fd520; +st.shared.f64 [r20+616], fd530; +st.shared.f64 [r20+704], fd540; +st.shared.f64 [r20+792], fd550; +st.shared.f64 [r20+880], fd560; +barrier.sync 0; +ld.shared.f64 fd563, [r14]; +ld.shared.f64 fd564, [r14+968]; +ld.shared.f64 fd565, [r14+1936]; +ld.shared.f64 fd566, [r14+2904]; +ld.shared.f64 fd567, [r14+3872]; +ld.shared.f64 fd568, [r14+4840]; +ld.shared.f64 fd569, [r14+5808]; +ld.shared.f64 fd570, [r14+6776]; +ld.shared.f64 fd571, [r14+7744]; +ld.shared.f64 fd572, [r14+8712]; +ld.shared.f64 fd573, [r14+9680]; +barrier.sync 0; +st.shared.f64 [r20], fd344; +st.shared.f64 [r20+88], fd473; +st.shared.f64 [r20+176], fd483; +st.shared.f64 [r20+264], fd493; +st.shared.f64 [r20+352], fd503; +st.shared.f64 [r20+440], fd513; +st.shared.f64 [r20+528], fd522; +st.shared.f64 [r20+616], fd532; +st.shared.f64 [r20+704], fd542; +st.shared.f64 [r20+792], fd552; +st.shared.f64 [r20+880], fd562; +barrier.sync 0; +ld.shared.f64 fd574, [r14]; +ld.shared.f64 fd575, [r14+968]; +ld.shared.f64 fd576, [r14+1936]; +ld.shared.f64 fd577, [r14+2904]; +ld.shared.f64 fd578, [r14+3872]; +ld.shared.f64 fd579, [r14+4840]; +ld.shared.f64 fd580, [r14+5808]; +ld.shared.f64 fd581, [r14+6776]; +ld.shared.f64 fd582, [r14+7744]; +ld.shared.f64 fd583, [r14+8712]; +ld.shared.f64 fd584, [r14+9680]; +add.f64 fd585, fd564, fd573; +add.f64 fd586, fd575, fd584; +sub.f64 fd587, fd564, fd573; +sub.f64 fd588, fd575, fd584; +add.f64 fd589, fd565, fd572; +add.f64 fd590, fd576, fd583; +sub.f64 fd591, fd565, fd572; +sub.f64 fd592, fd576, fd583; +add.f64 fd593, fd566, fd571; +add.f64 fd594, fd577, fd582; +sub.f64 fd595, fd566, fd571; +sub.f64 fd596, fd577, fd582; +add.f64 fd597, fd567, 
fd570; +add.f64 fd598, fd578, fd581; +sub.f64 fd599, fd567, fd570; +sub.f64 fd600, fd578, fd581; +add.f64 fd601, fd568, fd569; +add.f64 fd602, fd579, fd580; +sub.f64 fd603, fd568, fd569; +sub.f64 fd604, fd579, fd580; +add.f64 fd605, fd563, fd585; +add.f64 fd606, fd574, fd586; +add.f64 fd607, fd605, fd589; +add.f64 fd608, fd606, fd590; +add.f64 fd609, fd607, fd593; +add.f64 fd610, fd608, fd594; +add.f64 fd611, fd609, fd597; +add.f64 fd612, fd610, fd598; +fma.rn.f64 fd613, fd585, 0d3FEAEB8C8764F0BA, fd563; +fma.rn.f64 fd614, fd588, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd615, fd586, 0d3FEAEB8C8764F0BA, fd574; +fma.rn.f64 fd616, fd587, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd617, fd589, 0d3FDA9628D9C712B6, fd613; +fma.rn.f64 fd618, fd592, 0dBFED1BB48EEE2C13, fd614; +fma.rn.f64 fd619, fd590, 0d3FDA9628D9C712B6, fd615; +fma.rn.f64 fd620, fd591, 0dBFED1BB48EEE2C13, fd616; +fma.rn.f64 fd621, fd593, 0dBFC2375F640F44DB, fd617; +fma.rn.f64 fd622, fd596, 0dBFEFAC9E043842EF, fd618; +fma.rn.f64 fd623, fd594, 0dBFC2375F640F44DB, fd619; +fma.rn.f64 fd624, fd595, 0dBFEFAC9E043842EF, fd620; +fma.rn.f64 fd625, fd597, 0dBFE4F49E7F775887, fd621; +fma.rn.f64 fd626, fd600, 0dBFE82F19BB3A28A1, fd622; +fma.rn.f64 fd627, fd598, 0dBFE4F49E7F775887, fd623; +fma.rn.f64 fd628, fd599, 0dBFE82F19BB3A28A1, fd624; +fma.rn.f64 fd629, fd601, 0dBFEEB42A9BCD5057, fd625; +fma.rn.f64 fd630, fd604, 0dBFD207E7FD768DBF, fd626; +fma.rn.f64 fd631, fd602, 0dBFEEB42A9BCD5057, fd627; +fma.rn.f64 fd632, fd603, 0dBFD207E7FD768DBF, fd628; +fma.rn.f64 fd633, fd585, 0d3FDA9628D9C712B6, fd563; +fma.rn.f64 fd634, fd588, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd635, fd586, 0d3FDA9628D9C712B6, fd574; +fma.rn.f64 fd636, fd587, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd637, fd589, 0dBFE4F49E7F775887, fd633; +fma.rn.f64 fd638, fd592, 0dBFE82F19BB3A28A1, fd634; +fma.rn.f64 fd639, fd590, 0dBFE4F49E7F775887, fd635; +fma.rn.f64 fd640, fd591, 0dBFE82F19BB3A28A1, fd636; 
+fma.rn.f64 fd641, fd593, 0dBFEEB42A9BCD5057, fd637; +fma.rn.f64 fd642, fd596, 0d3FD207E7FD768DBF, fd638; +fma.rn.f64 fd643, fd594, 0dBFEEB42A9BCD5057, fd639; +fma.rn.f64 fd644, fd595, 0d3FD207E7FD768DBF, fd640; +fma.rn.f64 fd645, fd597, 0dBFC2375F640F44DB, fd641; +fma.rn.f64 fd646, fd600, 0d3FEFAC9E043842EF, fd642; +fma.rn.f64 fd647, fd598, 0dBFC2375F640F44DB, fd643; +fma.rn.f64 fd648, fd599, 0d3FEFAC9E043842EF, fd644; +fma.rn.f64 fd649, fd601, 0d3FEAEB8C8764F0BA, fd645; +fma.rn.f64 fd650, fd604, 0d3FE14CEDF8BB580B, fd646; +fma.rn.f64 fd651, fd602, 0d3FEAEB8C8764F0BA, fd647; +fma.rn.f64 fd652, fd603, 0d3FE14CEDF8BB580B, fd648; +fma.rn.f64 fd653, fd585, 0dBFC2375F640F44DB, fd563; +fma.rn.f64 fd654, fd588, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd655, fd586, 0dBFC2375F640F44DB, fd574; +fma.rn.f64 fd656, fd587, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd657, fd589, 0dBFEEB42A9BCD5057, fd653; +fma.rn.f64 fd658, fd592, 0d3FD207E7FD768DBF, fd654; +fma.rn.f64 fd659, fd590, 0dBFEEB42A9BCD5057, fd655; +fma.rn.f64 fd660, fd591, 0d3FD207E7FD768DBF, fd656; +fma.rn.f64 fd661, fd593, 0d3FDA9628D9C712B6, fd657; +fma.rn.f64 fd662, fd596, 0d3FED1BB48EEE2C13, fd658; +fma.rn.f64 fd663, fd594, 0d3FDA9628D9C712B6, fd659; +fma.rn.f64 fd664, fd595, 0d3FED1BB48EEE2C13, fd660; +fma.rn.f64 fd665, fd597, 0d3FEAEB8C8764F0BA, fd661; +fma.rn.f64 fd666, fd600, 0dBFE14CEDF8BB580B, fd662; +fma.rn.f64 fd667, fd598, 0d3FEAEB8C8764F0BA, fd663; +fma.rn.f64 fd668, fd599, 0dBFE14CEDF8BB580B, fd664; +fma.rn.f64 fd669, fd601, 0dBFE4F49E7F775887, fd665; +fma.rn.f64 fd670, fd604, 0dBFE82F19BB3A28A1, fd666; +fma.rn.f64 fd671, fd602, 0dBFE4F49E7F775887, fd667; +fma.rn.f64 fd672, fd603, 0dBFE82F19BB3A28A1, fd668; +fma.rn.f64 fd673, fd585, 0dBFE4F49E7F775887, fd563; +fma.rn.f64 fd674, fd588, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd675, fd586, 0dBFE4F49E7F775887, fd574; +fma.rn.f64 fd676, fd587, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd677, fd589, 
0dBFC2375F640F44DB, fd673; +fma.rn.f64 fd678, fd592, 0d3FEFAC9E043842EF, fd674; +fma.rn.f64 fd679, fd590, 0dBFC2375F640F44DB, fd675; +fma.rn.f64 fd680, fd591, 0d3FEFAC9E043842EF, fd676; +fma.rn.f64 fd681, fd593, 0d3FEAEB8C8764F0BA, fd677; +fma.rn.f64 fd682, fd596, 0dBFE14CEDF8BB580B, fd678; +fma.rn.f64 fd683, fd594, 0d3FEAEB8C8764F0BA, fd679; +fma.rn.f64 fd684, fd595, 0dBFE14CEDF8BB580B, fd680; +fma.rn.f64 fd685, fd597, 0dBFEEB42A9BCD5057, fd681; +fma.rn.f64 fd686, fd600, 0dBFD207E7FD768DBF, fd682; +fma.rn.f64 fd687, fd598, 0dBFEEB42A9BCD5057, fd683; +fma.rn.f64 fd688, fd599, 0dBFD207E7FD768DBF, fd684; +fma.rn.f64 fd689, fd601, 0d3FDA9628D9C712B6, fd685; +fma.rn.f64 fd690, fd604, 0d3FED1BB48EEE2C13, fd686; +fma.rn.f64 fd691, fd602, 0d3FDA9628D9C712B6, fd687; +fma.rn.f64 fd692, fd603, 0d3FED1BB48EEE2C13, fd688; +fma.rn.f64 fd693, fd585, 0dBFEEB42A9BCD5057, fd563; +fma.rn.f64 fd694, fd588, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd695, fd586, 0dBFEEB42A9BCD5057, fd574; +fma.rn.f64 fd696, fd587, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd697, fd589, 0d3FEAEB8C8764F0BA, fd693; +fma.rn.f64 fd698, fd592, 0d3FE14CEDF8BB580B, fd694; +fma.rn.f64 fd699, fd590, 0d3FEAEB8C8764F0BA, fd695; +fma.rn.f64 fd700, fd591, 0d3FE14CEDF8BB580B, fd696; +fma.rn.f64 fd701, fd593, 0dBFE4F49E7F775887, fd697; +fma.rn.f64 fd702, fd596, 0dBFE82F19BB3A28A1, fd698; +fma.rn.f64 fd703, fd594, 0dBFE4F49E7F775887, fd699; +fma.rn.f64 fd704, fd595, 0dBFE82F19BB3A28A1, fd700; +fma.rn.f64 fd705, fd597, 0d3FDA9628D9C712B6, fd701; +fma.rn.f64 fd706, fd600, 0d3FED1BB48EEE2C13, fd702; +fma.rn.f64 fd707, fd598, 0d3FDA9628D9C712B6, fd703; +fma.rn.f64 fd708, fd599, 0d3FED1BB48EEE2C13, fd704; +fma.rn.f64 fd709, fd601, 0dBFC2375F640F44DB, fd705; +fma.rn.f64 fd710, fd604, 0dBFEFAC9E043842EF, fd706; +fma.rn.f64 fd711, fd602, 0dBFC2375F640F44DB, fd707; +fma.rn.f64 fd712, fd603, 0dBFEFAC9E043842EF, fd708; +add.f64 %0, fd611, fd601; +add.f64 %1, fd612, fd602; +add.f64 %3, fd632, fd631; +sub.f64 
%2, fd629, fd630; +add.f64 %5, fd652, fd651; +sub.f64 %4, fd649, fd650; +add.f64 %7, fd672, fd671; +sub.f64 %6, fd669, fd670; +add.f64 %9, fd692, fd691; +sub.f64 %8, fd689, fd690; +add.f64 %11, fd712, fd711; +sub.f64 %10, fd709, fd710; +sub.f64 %13, fd711, fd712; +add.f64 %12, fd710, fd709; +sub.f64 %15, fd691, fd692; +add.f64 %14, fd690, fd689; +sub.f64 %17, fd671, fd672; +add.f64 %16, fd670, fd669; +sub.f64 %19, fd651, fd652; +add.f64 %18, fd650, fd649; +sub.f64 %21, fd631, fd632; +add.f64 %20, fd630, fd629; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_1331), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<557, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<779>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 21296, r2; +add.f64 fd45, %27, %51; +add.f64 fd46, %29, %52; +sub.f64 fd47, %27, %51; +sub.f64 fd48, %29, %52; +add.f64 fd49, %30, %49; +add.f64 fd50, %32, %50; +sub.f64 fd51, %30, %49; +sub.f64 fd52, %32, %50; +add.f64 fd53, %33, %46; +add.f64 fd54, %34, %48; +sub.f64 fd55, %33, %46; +sub.f64 fd56, %34, %48; 
+add.f64 fd57, %35, %43; +add.f64 fd58, %37, %45; +sub.f64 fd59, %35, %43; +sub.f64 fd60, %37, %45; +add.f64 fd61, %38, %41; +add.f64 fd62, %40, %42; +sub.f64 fd63, %38, %41; +sub.f64 fd64, %40, %42; +mov.u32 r4, %tid.x; +add.f64 fd65, %25, fd45; +add.f64 fd66, %26, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd74, fd48, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %26; +fma.rn.f64 fd76, fd47, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 0dBFED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0dBFED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0dBFEFAC9E043842EF, fd78; +fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0dBFEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, fd60, 0dBFE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0dBFE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0dBFD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0dBFD207E7FD768DBF, fd88; +sub.f64 fd93, fd89, fd90; +add.f64 fd94, fd92, fd91; +add.f64 fd95, fd90, fd89; +sub.f64 fd96, fd91, fd92; +fma.rn.f64 fd97, fd45, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd98, fd48, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd99, fd46, 0d3FDA9628D9C712B6, %26; +fma.rn.f64 fd100, fd47, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd49, 0dBFE4F49E7F775887, fd97; +fma.rn.f64 fd102, fd52, 0dBFE82F19BB3A28A1, fd98; +fma.rn.f64 fd103, fd50, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd51, 0dBFE82F19BB3A28A1, 
fd100; +fma.rn.f64 fd105, fd53, 0dBFEEB42A9BCD5057, fd101; +fma.rn.f64 fd106, fd56, 0d3FD207E7FD768DBF, fd102; +fma.rn.f64 fd107, fd54, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd55, 0d3FD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd57, 0dBFC2375F640F44DB, fd105; +fma.rn.f64 fd110, fd60, 0d3FEFAC9E043842EF, fd106; +fma.rn.f64 fd111, fd58, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd59, 0d3FEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd61, 0d3FEAEB8C8764F0BA, fd109; +fma.rn.f64 fd114, fd64, 0d3FE14CEDF8BB580B, fd110; +fma.rn.f64 fd115, fd62, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd63, 0d3FE14CEDF8BB580B, fd112; +sub.f64 fd117, fd113, fd114; +add.f64 fd118, fd116, fd115; +add.f64 fd119, fd114, fd113; +sub.f64 fd120, fd115, fd116; +fma.rn.f64 fd121, fd45, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd122, fd48, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd123, fd46, 0dBFC2375F640F44DB, %26; +fma.rn.f64 fd124, fd47, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd49, 0dBFEEB42A9BCD5057, fd121; +fma.rn.f64 fd126, fd52, 0d3FD207E7FD768DBF, fd122; +fma.rn.f64 fd127, fd50, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd51, 0d3FD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd53, 0d3FDA9628D9C712B6, fd125; +fma.rn.f64 fd130, fd56, 0d3FED1BB48EEE2C13, fd126; +fma.rn.f64 fd131, fd54, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd55, 0d3FED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd57, 0d3FEAEB8C8764F0BA, fd129; +fma.rn.f64 fd134, fd60, 0dBFE14CEDF8BB580B, fd130; +fma.rn.f64 fd135, fd58, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd59, 0dBFE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd61, 0dBFE4F49E7F775887, fd133; +fma.rn.f64 fd138, fd64, 0dBFE82F19BB3A28A1, fd134; +fma.rn.f64 fd139, fd62, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd63, 0dBFE82F19BB3A28A1, fd136; +sub.f64 fd141, fd137, fd138; +add.f64 fd142, fd140, fd139; +add.f64 fd143, fd138, fd137; +sub.f64 fd144, fd139, fd140; +fma.rn.f64 fd145, fd45, 0dBFE4F49E7F775887, %25; +fma.rn.f64 
fd146, fd48, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd147, fd46, 0dBFE4F49E7F775887, %26; +fma.rn.f64 fd148, fd47, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd49, 0dBFC2375F640F44DB, fd145; +fma.rn.f64 fd150, fd52, 0d3FEFAC9E043842EF, fd146; +fma.rn.f64 fd151, fd50, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd51, 0d3FEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd53, 0d3FEAEB8C8764F0BA, fd149; +fma.rn.f64 fd154, fd56, 0dBFE14CEDF8BB580B, fd150; +fma.rn.f64 fd155, fd54, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd55, 0dBFE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd57, 0dBFEEB42A9BCD5057, fd153; +fma.rn.f64 fd158, fd60, 0dBFD207E7FD768DBF, fd154; +fma.rn.f64 fd159, fd58, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd59, 0dBFD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd61, 0d3FDA9628D9C712B6, fd157; +fma.rn.f64 fd162, fd64, 0d3FED1BB48EEE2C13, fd158; +fma.rn.f64 fd163, fd62, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd63, 0d3FED1BB48EEE2C13, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd166, fd164, fd163; +add.f64 fd167, fd162, fd161; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 fd169, fd45, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd170, fd48, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd171, fd46, 0dBFEEB42A9BCD5057, %26; +fma.rn.f64 fd172, fd47, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd49, 0d3FEAEB8C8764F0BA, fd169; +fma.rn.f64 fd174, fd52, 0d3FE14CEDF8BB580B, fd170; +fma.rn.f64 fd175, fd50, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd51, 0d3FE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd53, 0dBFE4F49E7F775887, fd173; +fma.rn.f64 fd178, fd56, 0dBFE82F19BB3A28A1, fd174; +fma.rn.f64 fd179, fd54, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd55, 0dBFE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd57, 0d3FDA9628D9C712B6, fd177; +fma.rn.f64 fd182, fd60, 0d3FED1BB48EEE2C13, fd178; +fma.rn.f64 fd183, fd58, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd59, 0d3FED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd61, 
0dBFC2375F640F44DB, fd181; +fma.rn.f64 fd186, fd64, 0dBFEFAC9E043842EF, fd182; +fma.rn.f64 fd187, fd62, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd63, 0dBFEFAC9E043842EF, fd184; +sub.f64 fd189, fd185, fd186; +add.f64 fd190, fd188, fd187; +add.f64 fd191, fd186, fd185; +sub.f64 fd192, fd187, fd188; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 21296, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd193, fd194}, [rd6]; +mul.f64 fd197, fd193, fd93; +mul.f64 fd198, fd194, fd94; +mul.f64 fd199, fd193, fd94; +mul.f64 fd200, fd193, fd193; +mul.f64 fd201, fd194, fd194; +sub.f64 fd202, fd200, fd201; +mul.f64 fd203, fd194, fd193; +fma.rn.f64 fd204, fd194, fd193, fd203; +mul.f64 fd205, fd202, fd117; +mul.f64 fd206, fd204, fd118; +mul.f64 fd207, fd202, fd118; +mul.f64 fd208, fd193, fd202; +mul.f64 fd209, fd194, fd204; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd193, fd204; +fma.rn.f64 fd212, fd194, fd202, fd211; +mul.f64 fd213, fd210, fd141; +mul.f64 fd214, fd212, fd142; +mul.f64 fd215, fd210, fd142; +mul.f64 fd216, fd193, fd210; +mul.f64 fd217, fd194, fd212; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd193, fd212; +fma.rn.f64 fd220, fd194, fd210, fd219; +mul.f64 fd221, fd218, fd165; +mul.f64 fd222, fd220, fd166; +mul.f64 fd223, fd218, fd166; +mul.f64 fd224, fd193, fd218; +mul.f64 fd225, fd194, fd220; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd193, fd220; +fma.rn.f64 fd228, fd194, fd218, fd227; +mul.f64 fd229, fd226, fd189; +mul.f64 fd230, fd228, fd190; +mul.f64 fd231, fd226, fd190; +ld.global.v2.f64 {fd232, fd233}, [rd6+1936]; +mul.f64 fd236, fd232, fd191; +mul.f64 fd237, fd233, fd192; +mul.f64 fd238, fd232, fd192; +mul.f64 fd239, fd193, fd232; +mul.f64 fd240, fd194, fd233; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd193, fd233; 
+fma.rn.f64 fd243, fd194, fd232, fd242; +mul.f64 fd244, fd241, fd167; +mul.f64 fd245, fd243, fd168; +mul.f64 fd246, fd241, fd168; +mul.f64 fd247, fd193, fd241; +mul.f64 fd248, fd194, fd243; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd193, fd243; +fma.rn.f64 fd251, fd194, fd241, fd250; +mul.f64 fd252, fd249, fd143; +mul.f64 fd253, fd251, fd144; +mul.f64 fd254, fd249, fd144; +mul.f64 fd255, fd193, fd249; +mul.f64 fd256, fd194, fd251; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd193, fd251; +fma.rn.f64 fd259, fd194, fd249, fd258; +mul.f64 fd260, fd257, fd119; +mul.f64 fd261, fd259, fd120; +mul.f64 fd262, fd257, fd120; +mul.f64 fd263, fd193, fd257; +mul.f64 fd264, fd194, fd259; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd193, fd259; +fma.rn.f64 fd267, fd194, fd257, fd266; +mul.f64 fd268, fd265, fd95; +mul.f64 fd269, fd267, fd96; +mul.f64 fd270, fd265, fd96; +barrier.sync 0; +mad.lo.s32 r13, r11, 176, r12; +add.f64 fd271, fd72, fd62; +add.f64 fd272, fd71, fd61; +st.shared.v2.f64 [r13], {fd272, fd271}; +fma.rn.f64 fd273, fd194, fd93, fd199; +sub.f64 fd274, fd197, fd198; +st.shared.v2.f64 [r13+16], {fd274, fd273}; +fma.rn.f64 fd275, fd204, fd117, fd207; +sub.f64 fd276, fd205, fd206; +st.shared.v2.f64 [r13+32], {fd276, fd275}; +sub.f64 fd277, fd213, fd214; +fma.rn.f64 fd278, fd212, fd141, fd215; +st.shared.v2.f64 [r13+48], {fd277, fd278}; +fma.rn.f64 fd279, fd220, fd165, fd223; +sub.f64 fd280, fd221, fd222; +st.shared.v2.f64 [r13+64], {fd280, fd279}; +fma.rn.f64 fd281, fd228, fd189, fd231; +sub.f64 fd282, fd229, fd230; +st.shared.v2.f64 [r13+80], {fd282, fd281}; +fma.rn.f64 fd283, fd233, fd191, fd238; +sub.f64 fd284, fd236, fd237; +st.shared.v2.f64 [r13+96], {fd284, fd283}; +fma.rn.f64 fd285, fd243, fd167, fd246; +sub.f64 fd286, fd244, fd245; +st.shared.v2.f64 [r13+112], {fd286, fd285}; +sub.f64 fd287, fd252, fd253; +fma.rn.f64 fd288, fd251, fd143, fd254; +st.shared.v2.f64 [r13+128], {fd287, fd288}; +fma.rn.f64 fd289, fd259, fd119, fd262; +sub.f64 fd290, fd260, 
fd261; +st.shared.v2.f64 [r13+144], {fd290, fd289}; +fma.rn.f64 fd291, fd267, fd95, fd270; +sub.f64 fd292, fd268, fd269; +st.shared.v2.f64 [r13+160], {fd292, fd291}; +barrier.sync 0; +mad.lo.s32 r14, r11, -160, r13; +ld.shared.v2.f64 {fd293, fd294}, [r14]; +ld.shared.v2.f64 {fd297, fd298}, [r14+1936]; +ld.shared.v2.f64 {fd301, fd302}, [r14+3872]; +ld.shared.v2.f64 {fd305, fd306}, [r14+5808]; +ld.shared.v2.f64 {fd309, fd310}, [r14+7744]; +ld.shared.v2.f64 {fd313, fd314}, [r14+9680]; +ld.shared.v2.f64 {fd317, fd318}, [r14+11616]; +ld.shared.v2.f64 {fd321, fd322}, [r14+13552]; +ld.shared.v2.f64 {fd325, fd326}, [r14+15488]; +ld.shared.v2.f64 {fd329, fd330}, [r14+17424]; +ld.shared.v2.f64 {fd333, fd334}, [r14+19360]; +add.f64 fd337, fd297, fd333; +add.f64 fd338, fd298, fd334; +sub.f64 fd339, fd297, fd333; +sub.f64 fd340, fd298, fd334; +add.f64 fd341, fd301, fd329; +add.f64 fd342, fd302, fd330; +sub.f64 fd343, fd301, fd329; +sub.f64 fd344, fd302, fd330; +add.f64 fd345, fd305, fd325; +add.f64 fd346, fd306, fd326; +sub.f64 fd347, fd305, fd325; +sub.f64 fd348, fd306, fd326; +add.f64 fd349, fd309, fd321; +add.f64 fd350, fd310, fd322; +sub.f64 fd351, fd309, fd321; +sub.f64 fd352, fd310, fd322; +add.f64 fd353, fd313, fd317; +add.f64 fd354, fd314, fd318; +sub.f64 fd355, fd313, fd317; +sub.f64 fd356, fd314, fd318; +add.f64 fd357, fd293, fd337; +add.f64 fd358, fd294, fd338; +add.f64 fd359, fd357, fd341; +add.f64 fd360, fd358, fd342; +add.f64 fd361, fd359, fd345; +add.f64 fd362, fd360, fd346; +add.f64 fd363, fd361, fd349; +add.f64 fd364, fd362, fd350; +fma.rn.f64 fd365, fd337, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd366, fd340, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd367, fd338, 0d3FEAEB8C8764F0BA, fd294; +fma.rn.f64 fd368, fd339, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd369, fd341, 0d3FDA9628D9C712B6, fd365; +fma.rn.f64 fd370, fd344, 0dBFED1BB48EEE2C13, fd366; +fma.rn.f64 fd371, fd342, 0d3FDA9628D9C712B6, fd367; +fma.rn.f64 fd372, fd343, 
0dBFED1BB48EEE2C13, fd368; +fma.rn.f64 fd373, fd345, 0dBFC2375F640F44DB, fd369; +fma.rn.f64 fd374, fd348, 0dBFEFAC9E043842EF, fd370; +fma.rn.f64 fd375, fd346, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd347, 0dBFEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd349, 0dBFE4F49E7F775887, fd373; +fma.rn.f64 fd378, fd352, 0dBFE82F19BB3A28A1, fd374; +fma.rn.f64 fd379, fd350, 0dBFE4F49E7F775887, fd375; +fma.rn.f64 fd380, fd351, 0dBFE82F19BB3A28A1, fd376; +fma.rn.f64 fd381, fd353, 0dBFEEB42A9BCD5057, fd377; +fma.rn.f64 fd382, fd356, 0dBFD207E7FD768DBF, fd378; +fma.rn.f64 fd383, fd354, 0dBFEEB42A9BCD5057, fd379; +fma.rn.f64 fd384, fd355, 0dBFD207E7FD768DBF, fd380; +sub.f64 fd385, fd381, fd382; +add.f64 fd386, fd384, fd383; +add.f64 fd387, fd382, fd381; +sub.f64 fd388, fd383, fd384; +fma.rn.f64 fd389, fd337, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd390, fd340, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd391, fd338, 0d3FDA9628D9C712B6, fd294; +fma.rn.f64 fd392, fd339, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd393, fd341, 0dBFE4F49E7F775887, fd389; +fma.rn.f64 fd394, fd344, 0dBFE82F19BB3A28A1, fd390; +fma.rn.f64 fd395, fd342, 0dBFE4F49E7F775887, fd391; +fma.rn.f64 fd396, fd343, 0dBFE82F19BB3A28A1, fd392; +fma.rn.f64 fd397, fd345, 0dBFEEB42A9BCD5057, fd393; +fma.rn.f64 fd398, fd348, 0d3FD207E7FD768DBF, fd394; +fma.rn.f64 fd399, fd346, 0dBFEEB42A9BCD5057, fd395; +fma.rn.f64 fd400, fd347, 0d3FD207E7FD768DBF, fd396; +fma.rn.f64 fd401, fd349, 0dBFC2375F640F44DB, fd397; +fma.rn.f64 fd402, fd352, 0d3FEFAC9E043842EF, fd398; +fma.rn.f64 fd403, fd350, 0dBFC2375F640F44DB, fd399; +fma.rn.f64 fd404, fd351, 0d3FEFAC9E043842EF, fd400; +fma.rn.f64 fd405, fd353, 0d3FEAEB8C8764F0BA, fd401; +fma.rn.f64 fd406, fd356, 0d3FE14CEDF8BB580B, fd402; +fma.rn.f64 fd407, fd354, 0d3FEAEB8C8764F0BA, fd403; +fma.rn.f64 fd408, fd355, 0d3FE14CEDF8BB580B, fd404; +sub.f64 fd409, fd405, fd406; +add.f64 fd410, fd408, fd407; +add.f64 fd411, fd406, fd405; +sub.f64 fd412, fd407, fd408; 
+fma.rn.f64 fd413, fd337, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd414, fd340, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd415, fd338, 0dBFC2375F640F44DB, fd294; +fma.rn.f64 fd416, fd339, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd417, fd341, 0dBFEEB42A9BCD5057, fd413; +fma.rn.f64 fd418, fd344, 0d3FD207E7FD768DBF, fd414; +fma.rn.f64 fd419, fd342, 0dBFEEB42A9BCD5057, fd415; +fma.rn.f64 fd420, fd343, 0d3FD207E7FD768DBF, fd416; +fma.rn.f64 fd421, fd345, 0d3FDA9628D9C712B6, fd417; +fma.rn.f64 fd422, fd348, 0d3FED1BB48EEE2C13, fd418; +fma.rn.f64 fd423, fd346, 0d3FDA9628D9C712B6, fd419; +fma.rn.f64 fd424, fd347, 0d3FED1BB48EEE2C13, fd420; +fma.rn.f64 fd425, fd349, 0d3FEAEB8C8764F0BA, fd421; +fma.rn.f64 fd426, fd352, 0dBFE14CEDF8BB580B, fd422; +fma.rn.f64 fd427, fd350, 0d3FEAEB8C8764F0BA, fd423; +fma.rn.f64 fd428, fd351, 0dBFE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd353, 0dBFE4F49E7F775887, fd425; +fma.rn.f64 fd430, fd356, 0dBFE82F19BB3A28A1, fd426; +fma.rn.f64 fd431, fd354, 0dBFE4F49E7F775887, fd427; +fma.rn.f64 fd432, fd355, 0dBFE82F19BB3A28A1, fd428; +sub.f64 fd433, fd429, fd430; +add.f64 fd434, fd432, fd431; +add.f64 fd435, fd430, fd429; +sub.f64 fd436, fd431, fd432; +fma.rn.f64 fd437, fd337, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd438, fd340, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd439, fd338, 0dBFE4F49E7F775887, fd294; +fma.rn.f64 fd440, fd339, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd441, fd341, 0dBFC2375F640F44DB, fd437; +fma.rn.f64 fd442, fd344, 0d3FEFAC9E043842EF, fd438; +fma.rn.f64 fd443, fd342, 0dBFC2375F640F44DB, fd439; +fma.rn.f64 fd444, fd343, 0d3FEFAC9E043842EF, fd440; +fma.rn.f64 fd445, fd345, 0d3FEAEB8C8764F0BA, fd441; +fma.rn.f64 fd446, fd348, 0dBFE14CEDF8BB580B, fd442; +fma.rn.f64 fd447, fd346, 0d3FEAEB8C8764F0BA, fd443; +fma.rn.f64 fd448, fd347, 0dBFE14CEDF8BB580B, fd444; +fma.rn.f64 fd449, fd349, 0dBFEEB42A9BCD5057, fd445; +fma.rn.f64 fd450, fd352, 0dBFD207E7FD768DBF, fd446; +fma.rn.f64 fd451, fd350, 
0dBFEEB42A9BCD5057, fd447; +fma.rn.f64 fd452, fd351, 0dBFD207E7FD768DBF, fd448; +fma.rn.f64 fd453, fd353, 0d3FDA9628D9C712B6, fd449; +fma.rn.f64 fd454, fd356, 0d3FED1BB48EEE2C13, fd450; +fma.rn.f64 fd455, fd354, 0d3FDA9628D9C712B6, fd451; +fma.rn.f64 fd456, fd355, 0d3FED1BB48EEE2C13, fd452; +sub.f64 fd457, fd453, fd454; +add.f64 fd458, fd456, fd455; +add.f64 fd459, fd454, fd453; +sub.f64 fd460, fd455, fd456; +fma.rn.f64 fd461, fd337, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd462, fd340, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd463, fd338, 0dBFEEB42A9BCD5057, fd294; +fma.rn.f64 fd464, fd339, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd465, fd341, 0d3FEAEB8C8764F0BA, fd461; +fma.rn.f64 fd466, fd344, 0d3FE14CEDF8BB580B, fd462; +fma.rn.f64 fd467, fd342, 0d3FEAEB8C8764F0BA, fd463; +fma.rn.f64 fd468, fd343, 0d3FE14CEDF8BB580B, fd464; +fma.rn.f64 fd469, fd345, 0dBFE4F49E7F775887, fd465; +fma.rn.f64 fd470, fd348, 0dBFE82F19BB3A28A1, fd466; +fma.rn.f64 fd471, fd346, 0dBFE4F49E7F775887, fd467; +fma.rn.f64 fd472, fd347, 0dBFE82F19BB3A28A1, fd468; +fma.rn.f64 fd473, fd349, 0d3FDA9628D9C712B6, fd469; +fma.rn.f64 fd474, fd352, 0d3FED1BB48EEE2C13, fd470; +fma.rn.f64 fd475, fd350, 0d3FDA9628D9C712B6, fd471; +fma.rn.f64 fd476, fd351, 0d3FED1BB48EEE2C13, fd472; +fma.rn.f64 fd477, fd353, 0dBFC2375F640F44DB, fd473; +fma.rn.f64 fd478, fd356, 0dBFEFAC9E043842EF, fd474; +fma.rn.f64 fd479, fd354, 0dBFC2375F640F44DB, fd475; +fma.rn.f64 fd480, fd355, 0dBFEFAC9E043842EF, fd476; +sub.f64 fd481, fd477, fd478; +add.f64 fd482, fd480, fd479; +add.f64 fd483, fd478, fd477; +sub.f64 fd484, fd479, fd480; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd485, fd486}, [rd11]; +mul.f64 fd489, fd485, fd385; +mul.f64 fd490, fd486, fd386; +mul.f64 fd491, fd485, fd386; +mul.f64 fd492, fd485, fd485; +mul.f64 
fd493, fd486, fd486; +sub.f64 fd494, fd492, fd493; +mul.f64 fd495, fd486, fd485; +fma.rn.f64 fd496, fd486, fd485, fd495; +mul.f64 fd497, fd494, fd409; +mul.f64 fd498, fd496, fd410; +mul.f64 fd499, fd494, fd410; +mul.f64 fd500, fd485, fd494; +mul.f64 fd501, fd486, fd496; +sub.f64 fd502, fd500, fd501; +mul.f64 fd503, fd485, fd496; +fma.rn.f64 fd504, fd486, fd494, fd503; +mul.f64 fd505, fd502, fd433; +mul.f64 fd506, fd504, fd434; +mul.f64 fd507, fd502, fd434; +mul.f64 fd508, fd485, fd502; +mul.f64 fd509, fd486, fd504; +sub.f64 fd510, fd508, fd509; +mul.f64 fd511, fd485, fd504; +fma.rn.f64 fd512, fd486, fd502, fd511; +mul.f64 fd513, fd510, fd457; +mul.f64 fd514, fd512, fd458; +mul.f64 fd515, fd510, fd458; +mul.f64 fd516, fd485, fd510; +mul.f64 fd517, fd486, fd512; +sub.f64 fd518, fd516, fd517; +mul.f64 fd519, fd485, fd512; +fma.rn.f64 fd520, fd486, fd510, fd519; +mul.f64 fd521, fd518, fd481; +mul.f64 fd522, fd520, fd482; +mul.f64 fd523, fd518, fd482; +ld.global.v2.f64 {fd524, fd525}, [rd11+176]; +mul.f64 fd528, fd524, fd483; +mul.f64 fd529, fd525, fd484; +mul.f64 fd530, fd524, fd484; +mul.f64 fd531, fd485, fd524; +mul.f64 fd532, fd486, fd525; +sub.f64 fd533, fd531, fd532; +mul.f64 fd534, fd485, fd525; +fma.rn.f64 fd535, fd486, fd524, fd534; +mul.f64 fd536, fd533, fd459; +mul.f64 fd537, fd535, fd460; +mul.f64 fd538, fd533, fd460; +mul.f64 fd539, fd485, fd533; +mul.f64 fd540, fd486, fd535; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd485, fd535; +fma.rn.f64 fd543, fd486, fd533, fd542; +mul.f64 fd544, fd541, fd435; +mul.f64 fd545, fd543, fd436; +mul.f64 fd546, fd541, fd436; +mul.f64 fd547, fd485, fd541; +mul.f64 fd548, fd486, fd543; +sub.f64 fd549, fd547, fd548; +mul.f64 fd550, fd485, fd543; +fma.rn.f64 fd551, fd486, fd541, fd550; +mul.f64 fd552, fd549, fd411; +mul.f64 fd553, fd551, fd412; +mul.f64 fd554, fd549, fd412; +mul.f64 fd555, fd485, fd549; +mul.f64 fd556, fd486, fd551; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd485, fd551; +fma.rn.f64 fd559, fd486, fd549, 
fd558; +mul.f64 fd560, fd557, fd387; +mul.f64 fd561, fd559, fd388; +mul.f64 fd562, fd557, fd388; +shl.b32 r18, r17, 4; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 1936, r19; +add.f64 fd563, fd364, fd354; +add.f64 fd564, fd363, fd353; +st.shared.v2.f64 [r20], {fd564, fd563}; +fma.rn.f64 fd565, fd486, fd385, fd491; +sub.f64 fd566, fd489, fd490; +st.shared.v2.f64 [r20+176], {fd566, fd565}; +fma.rn.f64 fd567, fd496, fd409, fd499; +sub.f64 fd568, fd497, fd498; +st.shared.v2.f64 [r20+352], {fd568, fd567}; +fma.rn.f64 fd569, fd504, fd433, fd507; +sub.f64 fd570, fd505, fd506; +st.shared.v2.f64 [r20+528], {fd570, fd569}; +fma.rn.f64 fd571, fd512, fd457, fd515; +sub.f64 fd572, fd513, fd514; +st.shared.v2.f64 [r20+704], {fd572, fd571}; +sub.f64 fd573, fd521, fd522; +fma.rn.f64 fd574, fd520, fd481, fd523; +st.shared.v2.f64 [r20+880], {fd573, fd574}; +fma.rn.f64 fd575, fd525, fd483, fd530; +sub.f64 fd576, fd528, fd529; +st.shared.v2.f64 [r20+1056], {fd576, fd575}; +fma.rn.f64 fd577, fd535, fd459, fd538; +sub.f64 fd578, fd536, fd537; +st.shared.v2.f64 [r20+1232], {fd578, fd577}; +fma.rn.f64 fd579, fd543, fd435, fd546; +sub.f64 fd580, fd544, fd545; +st.shared.v2.f64 [r20+1408], {fd580, fd579}; +fma.rn.f64 fd581, fd551, fd411, fd554; +sub.f64 fd582, fd552, fd553; +st.shared.v2.f64 [r20+1584], {fd582, fd581}; +sub.f64 fd583, fd560, fd561; +fma.rn.f64 fd584, fd559, fd387, fd562; +st.shared.v2.f64 [r20+1760], {fd583, fd584}; +barrier.sync 0; +ld.shared.v2.f64 {fd585, fd586}, [r14]; +ld.shared.v2.f64 {fd589, fd590}, [r14+1936]; +ld.shared.v2.f64 {fd593, fd594}, [r14+3872]; +ld.shared.v2.f64 {fd597, fd598}, [r14+5808]; +ld.shared.v2.f64 {fd601, fd602}, [r14+7744]; +ld.shared.v2.f64 {fd605, fd606}, [r14+9680]; +ld.shared.v2.f64 {fd609, fd610}, [r14+11616]; +ld.shared.v2.f64 {fd613, fd614}, [r14+13552]; +ld.shared.v2.f64 {fd617, fd618}, [r14+15488]; +ld.shared.v2.f64 {fd621, fd622}, [r14+17424]; +ld.shared.v2.f64 {fd625, fd626}, [r14+19360]; +add.f64 fd629, fd589, 
fd625; +add.f64 fd630, fd590, fd626; +sub.f64 fd631, fd589, fd625; +sub.f64 fd632, fd590, fd626; +add.f64 fd633, fd593, fd621; +add.f64 fd634, fd594, fd622; +sub.f64 fd635, fd593, fd621; +sub.f64 fd636, fd594, fd622; +add.f64 fd637, fd597, fd617; +add.f64 fd638, fd598, fd618; +sub.f64 fd639, fd597, fd617; +sub.f64 fd640, fd598, fd618; +add.f64 fd641, fd601, fd613; +add.f64 fd642, fd602, fd614; +sub.f64 fd643, fd601, fd613; +sub.f64 fd644, fd602, fd614; +add.f64 fd645, fd605, fd609; +add.f64 fd646, fd606, fd610; +sub.f64 fd647, fd605, fd609; +sub.f64 fd648, fd606, fd610; +add.f64 fd649, fd585, fd629; +add.f64 fd650, fd586, fd630; +add.f64 fd651, fd649, fd633; +add.f64 fd652, fd650, fd634; +add.f64 fd653, fd651, fd637; +add.f64 fd654, fd652, fd638; +add.f64 fd655, fd653, fd641; +add.f64 fd656, fd654, fd642; +fma.rn.f64 fd657, fd629, 0d3FEAEB8C8764F0BA, fd585; +fma.rn.f64 fd658, fd632, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd659, fd630, 0d3FEAEB8C8764F0BA, fd586; +fma.rn.f64 fd660, fd631, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd661, fd633, 0d3FDA9628D9C712B6, fd657; +fma.rn.f64 fd662, fd636, 0dBFED1BB48EEE2C13, fd658; +fma.rn.f64 fd663, fd634, 0d3FDA9628D9C712B6, fd659; +fma.rn.f64 fd664, fd635, 0dBFED1BB48EEE2C13, fd660; +fma.rn.f64 fd665, fd637, 0dBFC2375F640F44DB, fd661; +fma.rn.f64 fd666, fd640, 0dBFEFAC9E043842EF, fd662; +fma.rn.f64 fd667, fd638, 0dBFC2375F640F44DB, fd663; +fma.rn.f64 fd668, fd639, 0dBFEFAC9E043842EF, fd664; +fma.rn.f64 fd669, fd641, 0dBFE4F49E7F775887, fd665; +fma.rn.f64 fd670, fd644, 0dBFE82F19BB3A28A1, fd666; +fma.rn.f64 fd671, fd642, 0dBFE4F49E7F775887, fd667; +fma.rn.f64 fd672, fd643, 0dBFE82F19BB3A28A1, fd668; +fma.rn.f64 fd673, fd645, 0dBFEEB42A9BCD5057, fd669; +fma.rn.f64 fd674, fd648, 0dBFD207E7FD768DBF, fd670; +fma.rn.f64 fd675, fd646, 0dBFEEB42A9BCD5057, fd671; +fma.rn.f64 fd676, fd647, 0dBFD207E7FD768DBF, fd672; +fma.rn.f64 fd677, fd629, 0d3FDA9628D9C712B6, fd585; +fma.rn.f64 fd678, fd632, 
0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd679, fd630, 0d3FDA9628D9C712B6, fd586; +fma.rn.f64 fd680, fd631, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd681, fd633, 0dBFE4F49E7F775887, fd677; +fma.rn.f64 fd682, fd636, 0dBFE82F19BB3A28A1, fd678; +fma.rn.f64 fd683, fd634, 0dBFE4F49E7F775887, fd679; +fma.rn.f64 fd684, fd635, 0dBFE82F19BB3A28A1, fd680; +fma.rn.f64 fd685, fd637, 0dBFEEB42A9BCD5057, fd681; +fma.rn.f64 fd686, fd640, 0d3FD207E7FD768DBF, fd682; +fma.rn.f64 fd687, fd638, 0dBFEEB42A9BCD5057, fd683; +fma.rn.f64 fd688, fd639, 0d3FD207E7FD768DBF, fd684; +fma.rn.f64 fd689, fd641, 0dBFC2375F640F44DB, fd685; +fma.rn.f64 fd690, fd644, 0d3FEFAC9E043842EF, fd686; +fma.rn.f64 fd691, fd642, 0dBFC2375F640F44DB, fd687; +fma.rn.f64 fd692, fd643, 0d3FEFAC9E043842EF, fd688; +fma.rn.f64 fd693, fd645, 0d3FEAEB8C8764F0BA, fd689; +fma.rn.f64 fd694, fd648, 0d3FE14CEDF8BB580B, fd690; +fma.rn.f64 fd695, fd646, 0d3FEAEB8C8764F0BA, fd691; +fma.rn.f64 fd696, fd647, 0d3FE14CEDF8BB580B, fd692; +fma.rn.f64 fd697, fd629, 0dBFC2375F640F44DB, fd585; +fma.rn.f64 fd698, fd632, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd699, fd630, 0dBFC2375F640F44DB, fd586; +fma.rn.f64 fd700, fd631, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd701, fd633, 0dBFEEB42A9BCD5057, fd697; +fma.rn.f64 fd702, fd636, 0d3FD207E7FD768DBF, fd698; +fma.rn.f64 fd703, fd634, 0dBFEEB42A9BCD5057, fd699; +fma.rn.f64 fd704, fd635, 0d3FD207E7FD768DBF, fd700; +fma.rn.f64 fd705, fd637, 0d3FDA9628D9C712B6, fd701; +fma.rn.f64 fd706, fd640, 0d3FED1BB48EEE2C13, fd702; +fma.rn.f64 fd707, fd638, 0d3FDA9628D9C712B6, fd703; +fma.rn.f64 fd708, fd639, 0d3FED1BB48EEE2C13, fd704; +fma.rn.f64 fd709, fd641, 0d3FEAEB8C8764F0BA, fd705; +fma.rn.f64 fd710, fd644, 0dBFE14CEDF8BB580B, fd706; +fma.rn.f64 fd711, fd642, 0d3FEAEB8C8764F0BA, fd707; +fma.rn.f64 fd712, fd643, 0dBFE14CEDF8BB580B, fd708; +fma.rn.f64 fd713, fd645, 0dBFE4F49E7F775887, fd709; +fma.rn.f64 fd714, fd648, 0dBFE82F19BB3A28A1, fd710; +fma.rn.f64 
fd715, fd646, 0dBFE4F49E7F775887, fd711; +fma.rn.f64 fd716, fd647, 0dBFE82F19BB3A28A1, fd712; +fma.rn.f64 fd717, fd629, 0dBFE4F49E7F775887, fd585; +fma.rn.f64 fd718, fd632, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd719, fd630, 0dBFE4F49E7F775887, fd586; +fma.rn.f64 fd720, fd631, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd721, fd633, 0dBFC2375F640F44DB, fd717; +fma.rn.f64 fd722, fd636, 0d3FEFAC9E043842EF, fd718; +fma.rn.f64 fd723, fd634, 0dBFC2375F640F44DB, fd719; +fma.rn.f64 fd724, fd635, 0d3FEFAC9E043842EF, fd720; +fma.rn.f64 fd725, fd637, 0d3FEAEB8C8764F0BA, fd721; +fma.rn.f64 fd726, fd640, 0dBFE14CEDF8BB580B, fd722; +fma.rn.f64 fd727, fd638, 0d3FEAEB8C8764F0BA, fd723; +fma.rn.f64 fd728, fd639, 0dBFE14CEDF8BB580B, fd724; +fma.rn.f64 fd729, fd641, 0dBFEEB42A9BCD5057, fd725; +fma.rn.f64 fd730, fd644, 0dBFD207E7FD768DBF, fd726; +fma.rn.f64 fd731, fd642, 0dBFEEB42A9BCD5057, fd727; +fma.rn.f64 fd732, fd643, 0dBFD207E7FD768DBF, fd728; +fma.rn.f64 fd733, fd645, 0d3FDA9628D9C712B6, fd729; +fma.rn.f64 fd734, fd648, 0d3FED1BB48EEE2C13, fd730; +fma.rn.f64 fd735, fd646, 0d3FDA9628D9C712B6, fd731; +fma.rn.f64 fd736, fd647, 0d3FED1BB48EEE2C13, fd732; +fma.rn.f64 fd737, fd629, 0dBFEEB42A9BCD5057, fd585; +fma.rn.f64 fd738, fd632, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd739, fd630, 0dBFEEB42A9BCD5057, fd586; +fma.rn.f64 fd740, fd631, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd741, fd633, 0d3FEAEB8C8764F0BA, fd737; +fma.rn.f64 fd742, fd636, 0d3FE14CEDF8BB580B, fd738; +fma.rn.f64 fd743, fd634, 0d3FEAEB8C8764F0BA, fd739; +fma.rn.f64 fd744, fd635, 0d3FE14CEDF8BB580B, fd740; +fma.rn.f64 fd745, fd637, 0dBFE4F49E7F775887, fd741; +fma.rn.f64 fd746, fd640, 0dBFE82F19BB3A28A1, fd742; +fma.rn.f64 fd747, fd638, 0dBFE4F49E7F775887, fd743; +fma.rn.f64 fd748, fd639, 0dBFE82F19BB3A28A1, fd744; +fma.rn.f64 fd749, fd641, 0d3FDA9628D9C712B6, fd745; +fma.rn.f64 fd750, fd644, 0d3FED1BB48EEE2C13, fd746; +fma.rn.f64 fd751, fd642, 0d3FDA9628D9C712B6, 
fd747; +fma.rn.f64 fd752, fd643, 0d3FED1BB48EEE2C13, fd748; +fma.rn.f64 fd753, fd645, 0dBFC2375F640F44DB, fd749; +fma.rn.f64 fd754, fd648, 0dBFEFAC9E043842EF, fd750; +fma.rn.f64 fd755, fd646, 0dBFC2375F640F44DB, fd751; +fma.rn.f64 fd756, fd647, 0dBFEFAC9E043842EF, fd752; +add.f64 %1, fd656, fd646; +add.f64 %0, fd655, fd645; +add.f64 %3, fd676, fd675; +sub.f64 %2, fd673, fd674; +add.f64 %5, fd696, fd695; +sub.f64 %4, fd693, fd694; +add.f64 %7, fd716, fd715; +sub.f64 %6, fd713, fd714; +add.f64 %9, fd736, fd735; +sub.f64 %8, fd733, fd734; +add.f64 %11, fd756, fd755; +sub.f64 %10, fd753, fd754; +sub.f64 %13, fd755, fd756; +add.f64 %12, fd754, fd753; +sub.f64 %15, fd735, fd736; +add.f64 %14, fd734, fd733; +sub.f64 %17, fd715, fd716; +add.f64 %16, fd714, fd713; +sub.f64 %19, fd695, fd696; +add.f64 %18, fd694, fd693; +sub.f64 %21, fd675, fd676; +add.f64 %20, fd674, fd673; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_1331), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..520c8a94c28c3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1331_fp64_inv.hpp.inc @@ -0,0 +1,1502 @@ +#ifndef CUFFTDX_FFT_1331_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_1331_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<727, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<735>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 10648, r2; +add.f64 fd45, %27, %51; +add.f64 fd46, %29, %52; +sub.f64 fd47, %27, %51; +sub.f64 fd48, %29, %52; +add.f64 fd49, %30, %49; +add.f64 fd50, %32, %50; +sub.f64 fd51, %30, %49; +sub.f64 fd52, %32, %50; +add.f64 fd53, %33, %46; +add.f64 fd54, %34, %48; +sub.f64 fd55, %33, %46; +sub.f64 fd56, %34, %48; +add.f64 fd57, %35, %43; +add.f64 fd58, %37, %45; +sub.f64 fd59, %35, %43; +sub.f64 fd60, %37, %45; +add.f64 fd61, %38, %41; +add.f64 fd62, %40, %42; +sub.f64 fd63, %38, %41; +sub.f64 fd64, %40, %42; +mov.u32 r4, %tid.x; +add.f64 fd65, %25, fd45; +add.f64 fd66, %26, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +add.f64 fd73, fd71, fd61; +add.f64 fd74, fd72, fd62; +fma.rn.f64 fd75, fd45, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd76, fd48, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd46, 0d3FEAEB8C8764F0BA, %26; +fma.rn.f64 fd78, fd47, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd79, fd49, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd52, 0d3FED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd50, 0d3FDA9628D9C712B6, fd77; +fma.rn.f64 fd82, fd51, 0d3FED1BB48EEE2C13, fd78; +fma.rn.f64 fd83, fd53, 0dBFC2375F640F44DB, fd79; 
+fma.rn.f64 fd84, fd56, 0d3FEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd54, 0dBFC2375F640F44DB, fd81; +fma.rn.f64 fd86, fd55, 0d3FEFAC9E043842EF, fd82; +fma.rn.f64 fd87, fd57, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd60, 0d3FE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd58, 0dBFE4F49E7F775887, fd85; +fma.rn.f64 fd90, fd59, 0d3FE82F19BB3A28A1, fd86; +fma.rn.f64 fd91, fd61, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd64, 0d3FD207E7FD768DBF, fd88; +fma.rn.f64 fd93, fd62, 0dBFEEB42A9BCD5057, fd89; +fma.rn.f64 fd94, fd63, 0d3FD207E7FD768DBF, fd90; +sub.f64 fd95, fd91, fd92; +add.f64 fd96, fd94, fd93; +add.f64 fd97, fd92, fd91; +sub.f64 fd98, fd93, fd94; +fma.rn.f64 fd99, fd45, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd100, fd48, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd46, 0d3FDA9628D9C712B6, %26; +fma.rn.f64 fd102, fd47, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd103, fd49, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd52, 0d3FE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd50, 0dBFE4F49E7F775887, fd101; +fma.rn.f64 fd106, fd51, 0d3FE82F19BB3A28A1, fd102; +fma.rn.f64 fd107, fd53, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd56, 0dBFD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd54, 0dBFEEB42A9BCD5057, fd105; +fma.rn.f64 fd110, fd55, 0dBFD207E7FD768DBF, fd106; +fma.rn.f64 fd111, fd57, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd60, 0dBFEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd58, 0dBFC2375F640F44DB, fd109; +fma.rn.f64 fd114, fd59, 0dBFEFAC9E043842EF, fd110; +fma.rn.f64 fd115, fd61, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd64, 0dBFE14CEDF8BB580B, fd112; +fma.rn.f64 fd117, fd62, 0d3FEAEB8C8764F0BA, fd113; +fma.rn.f64 fd118, fd63, 0dBFE14CEDF8BB580B, fd114; +sub.f64 fd119, fd115, fd116; +add.f64 fd120, fd118, fd117; +add.f64 fd121, fd116, fd115; +sub.f64 fd122, fd117, fd118; +fma.rn.f64 fd123, fd45, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd124, fd48, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd46, 0dBFC2375F640F44DB, 
%26; +fma.rn.f64 fd126, fd47, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd127, fd49, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd52, 0dBFD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd50, 0dBFEEB42A9BCD5057, fd125; +fma.rn.f64 fd130, fd51, 0dBFD207E7FD768DBF, fd126; +fma.rn.f64 fd131, fd53, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd56, 0dBFED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd54, 0d3FDA9628D9C712B6, fd129; +fma.rn.f64 fd134, fd55, 0dBFED1BB48EEE2C13, fd130; +fma.rn.f64 fd135, fd57, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd60, 0d3FE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd58, 0d3FEAEB8C8764F0BA, fd133; +fma.rn.f64 fd138, fd59, 0d3FE14CEDF8BB580B, fd134; +fma.rn.f64 fd139, fd61, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd64, 0d3FE82F19BB3A28A1, fd136; +fma.rn.f64 fd141, fd62, 0dBFE4F49E7F775887, fd137; +fma.rn.f64 fd142, fd63, 0d3FE82F19BB3A28A1, fd138; +sub.f64 fd143, fd139, fd140; +add.f64 fd144, fd142, fd141; +add.f64 fd145, fd140, fd139; +sub.f64 fd146, fd141, fd142; +fma.rn.f64 fd147, fd45, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd148, fd48, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd46, 0dBFE4F49E7F775887, %26; +fma.rn.f64 fd150, fd47, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd151, fd49, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd52, 0dBFEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd50, 0dBFC2375F640F44DB, fd149; +fma.rn.f64 fd154, fd51, 0dBFEFAC9E043842EF, fd150; +fma.rn.f64 fd155, fd53, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd56, 0d3FE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd54, 0d3FEAEB8C8764F0BA, fd153; +fma.rn.f64 fd158, fd55, 0d3FE14CEDF8BB580B, fd154; +fma.rn.f64 fd159, fd57, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd60, 0d3FD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd58, 0dBFEEB42A9BCD5057, fd157; +fma.rn.f64 fd162, fd59, 0d3FD207E7FD768DBF, fd158; +fma.rn.f64 fd163, fd61, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd64, 0dBFED1BB48EEE2C13, fd160; +fma.rn.f64 fd165, 
fd62, 0d3FDA9628D9C712B6, fd161; +fma.rn.f64 fd166, fd63, 0dBFED1BB48EEE2C13, fd162; +sub.f64 fd167, fd163, fd164; +add.f64 fd168, fd166, fd165; +add.f64 fd169, fd164, fd163; +sub.f64 fd170, fd165, fd166; +fma.rn.f64 fd171, fd45, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd172, fd48, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd46, 0dBFEEB42A9BCD5057, %26; +fma.rn.f64 fd174, fd47, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd175, fd49, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd52, 0dBFE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd50, 0d3FEAEB8C8764F0BA, fd173; +fma.rn.f64 fd178, fd51, 0dBFE14CEDF8BB580B, fd174; +fma.rn.f64 fd179, fd53, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd56, 0d3FE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd54, 0dBFE4F49E7F775887, fd177; +fma.rn.f64 fd182, fd55, 0d3FE82F19BB3A28A1, fd178; +fma.rn.f64 fd183, fd57, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd60, 0dBFED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd58, 0d3FDA9628D9C712B6, fd181; +fma.rn.f64 fd186, fd59, 0dBFED1BB48EEE2C13, fd182; +fma.rn.f64 fd187, fd61, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd64, 0d3FEFAC9E043842EF, fd184; +fma.rn.f64 fd189, fd62, 0dBFC2375F640F44DB, fd185; +fma.rn.f64 fd190, fd63, 0d3FEFAC9E043842EF, fd186; +sub.f64 fd191, fd187, fd188; +add.f64 fd192, fd190, fd189; +add.f64 fd193, fd188, fd187; +sub.f64 fd194, fd189, fd190; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd195, fd196}, [rd6]; +mul.f64 fd199, fd96, fd196; +fma.rn.f64 fd200, fd195, fd95, fd199; +mul.f64 fd201, fd95, fd196; +mul.f64 fd202, fd195, fd96; +sub.f64 fd203, fd202, fd201; +mul.f64 fd204, fd195, fd195; +mul.f64 fd205, fd196, fd196; +sub.f64 fd206, fd204, fd205; +mul.f64 fd207, fd196, fd195; +fma.rn.f64 
fd208, fd196, fd195, fd207; +mul.f64 fd209, fd120, fd208; +fma.rn.f64 fd210, fd206, fd119, fd209; +mul.f64 fd211, fd119, fd208; +mul.f64 fd212, fd206, fd120; +sub.f64 fd213, fd212, fd211; +mul.f64 fd214, fd195, fd206; +mul.f64 fd215, fd196, fd208; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd195, fd208; +fma.rn.f64 fd218, fd196, fd206, fd217; +mul.f64 fd219, fd144, fd218; +fma.rn.f64 fd220, fd216, fd143, fd219; +mul.f64 fd221, fd143, fd218; +mul.f64 fd222, fd216, fd144; +sub.f64 fd223, fd222, fd221; +mul.f64 fd224, fd195, fd216; +mul.f64 fd225, fd196, fd218; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd195, fd218; +fma.rn.f64 fd228, fd196, fd216, fd227; +mul.f64 fd229, fd168, fd228; +fma.rn.f64 fd230, fd226, fd167, fd229; +mul.f64 fd231, fd167, fd228; +mul.f64 fd232, fd226, fd168; +sub.f64 fd233, fd232, fd231; +mul.f64 fd234, fd195, fd226; +mul.f64 fd235, fd196, fd228; +sub.f64 fd236, fd234, fd235; +mul.f64 fd237, fd195, fd228; +fma.rn.f64 fd238, fd196, fd226, fd237; +mul.f64 fd239, fd192, fd238; +fma.rn.f64 fd240, fd236, fd191, fd239; +mul.f64 fd241, fd191, fd238; +mul.f64 fd242, fd236, fd192; +sub.f64 fd243, fd242, fd241; +ld.global.v2.f64 {fd244, fd245}, [rd6+1936]; +mul.f64 fd248, fd194, fd245; +fma.rn.f64 fd249, fd244, fd193, fd248; +mul.f64 fd250, fd193, fd245; +mul.f64 fd251, fd244, fd194; +sub.f64 fd252, fd251, fd250; +mul.f64 fd253, fd195, fd244; +mul.f64 fd254, fd196, fd245; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd195, fd245; +fma.rn.f64 fd257, fd196, fd244, fd256; +mul.f64 fd258, fd170, fd257; +fma.rn.f64 fd259, fd255, fd169, fd258; +mul.f64 fd260, fd169, fd257; +mul.f64 fd261, fd255, fd170; +sub.f64 fd262, fd261, fd260; +mul.f64 fd263, fd195, fd255; +mul.f64 fd264, fd196, fd257; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd195, fd257; +fma.rn.f64 fd267, fd196, fd255, fd266; +mul.f64 fd268, fd146, fd267; +fma.rn.f64 fd269, fd265, fd145, fd268; +mul.f64 fd270, fd145, fd267; +mul.f64 fd271, fd265, fd146; +sub.f64 fd272, fd271, fd270; 
+mul.f64 fd273, fd195, fd265; +mul.f64 fd274, fd196, fd267; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd195, fd267; +fma.rn.f64 fd277, fd196, fd265, fd276; +mul.f64 fd278, fd122, fd277; +fma.rn.f64 fd279, fd275, fd121, fd278; +mul.f64 fd280, fd121, fd277; +mul.f64 fd281, fd275, fd122; +sub.f64 fd282, fd281, fd280; +mul.f64 fd283, fd195, fd275; +mul.f64 fd284, fd196, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd195, fd277; +fma.rn.f64 fd287, fd196, fd275, fd286; +mul.f64 fd288, fd98, fd287; +fma.rn.f64 fd289, fd285, fd97, fd288; +mul.f64 fd290, fd97, fd287; +mul.f64 fd291, fd285, fd98; +sub.f64 fd292, fd291, fd290; +mad.lo.s32 r12, r9, 10648, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 88, r12; +st.shared.f64 [r13], fd73; +st.shared.f64 [r13+8], fd200; +st.shared.f64 [r13+16], fd210; +st.shared.f64 [r13+24], fd220; +st.shared.f64 [r13+32], fd230; +st.shared.f64 [r13+40], fd240; +st.shared.f64 [r13+48], fd249; +st.shared.f64 [r13+56], fd259; +st.shared.f64 [r13+64], fd269; +st.shared.f64 [r13+72], fd279; +st.shared.f64 [r13+80], fd289; +barrier.sync 0; +mad.lo.s32 r14, r11, -80, r13; +ld.shared.f64 fd293, [r14]; +ld.shared.f64 fd294, [r14+968]; +ld.shared.f64 fd295, [r14+1936]; +ld.shared.f64 fd296, [r14+2904]; +ld.shared.f64 fd297, [r14+3872]; +ld.shared.f64 fd298, [r14+4840]; +ld.shared.f64 fd299, [r14+5808]; +ld.shared.f64 fd300, [r14+6776]; +ld.shared.f64 fd301, [r14+7744]; +ld.shared.f64 fd302, [r14+8712]; +ld.shared.f64 fd303, [r14+9680]; +barrier.sync 0; +st.shared.f64 [r13], fd74; +st.shared.f64 [r13+8], fd203; +st.shared.f64 [r13+16], fd213; +st.shared.f64 [r13+24], fd223; +st.shared.f64 [r13+32], fd233; +st.shared.f64 [r13+40], fd243; +st.shared.f64 [r13+48], fd252; +st.shared.f64 [r13+56], fd262; +st.shared.f64 [r13+64], fd272; +st.shared.f64 [r13+72], fd282; +st.shared.f64 [r13+80], fd292; +barrier.sync 0; +ld.shared.f64 fd304, [r14]; +ld.shared.f64 fd305, [r14+968]; +ld.shared.f64 fd306, [r14+1936]; +ld.shared.f64 fd307, [r14+2904]; 
+ld.shared.f64 fd308, [r14+3872]; +ld.shared.f64 fd309, [r14+4840]; +ld.shared.f64 fd310, [r14+5808]; +ld.shared.f64 fd311, [r14+6776]; +ld.shared.f64 fd312, [r14+7744]; +ld.shared.f64 fd313, [r14+8712]; +ld.shared.f64 fd314, [r14+9680]; +add.f64 fd315, fd294, fd303; +add.f64 fd316, fd305, fd314; +sub.f64 fd317, fd294, fd303; +sub.f64 fd318, fd305, fd314; +add.f64 fd319, fd295, fd302; +add.f64 fd320, fd306, fd313; +sub.f64 fd321, fd295, fd302; +sub.f64 fd322, fd306, fd313; +add.f64 fd323, fd296, fd301; +add.f64 fd324, fd307, fd312; +sub.f64 fd325, fd296, fd301; +sub.f64 fd326, fd307, fd312; +add.f64 fd327, fd297, fd300; +add.f64 fd328, fd308, fd311; +sub.f64 fd329, fd297, fd300; +sub.f64 fd330, fd308, fd311; +add.f64 fd331, fd298, fd299; +add.f64 fd332, fd309, fd310; +sub.f64 fd333, fd298, fd299; +sub.f64 fd334, fd309, fd310; +add.f64 fd335, fd293, fd315; +add.f64 fd336, fd304, fd316; +add.f64 fd337, fd335, fd319; +add.f64 fd338, fd336, fd320; +add.f64 fd339, fd337, fd323; +add.f64 fd340, fd338, fd324; +add.f64 fd341, fd339, fd327; +add.f64 fd342, fd340, fd328; +add.f64 fd343, fd341, fd331; +add.f64 fd344, fd342, fd332; +fma.rn.f64 fd345, fd315, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 fd346, fd318, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd347, fd316, 0d3FEAEB8C8764F0BA, fd304; +fma.rn.f64 fd348, fd317, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd349, fd319, 0d3FDA9628D9C712B6, fd345; +fma.rn.f64 fd350, fd322, 0d3FED1BB48EEE2C13, fd346; +fma.rn.f64 fd351, fd320, 0d3FDA9628D9C712B6, fd347; +fma.rn.f64 fd352, fd321, 0d3FED1BB48EEE2C13, fd348; +fma.rn.f64 fd353, fd323, 0dBFC2375F640F44DB, fd349; +fma.rn.f64 fd354, fd326, 0d3FEFAC9E043842EF, fd350; +fma.rn.f64 fd355, fd324, 0dBFC2375F640F44DB, fd351; +fma.rn.f64 fd356, fd325, 0d3FEFAC9E043842EF, fd352; +fma.rn.f64 fd357, fd327, 0dBFE4F49E7F775887, fd353; +fma.rn.f64 fd358, fd330, 0d3FE82F19BB3A28A1, fd354; +fma.rn.f64 fd359, fd328, 0dBFE4F49E7F775887, fd355; +fma.rn.f64 fd360, fd329, 
0d3FE82F19BB3A28A1, fd356; +fma.rn.f64 fd361, fd331, 0dBFEEB42A9BCD5057, fd357; +fma.rn.f64 fd362, fd334, 0d3FD207E7FD768DBF, fd358; +fma.rn.f64 fd363, fd332, 0dBFEEB42A9BCD5057, fd359; +fma.rn.f64 fd364, fd333, 0d3FD207E7FD768DBF, fd360; +sub.f64 fd365, fd361, fd362; +add.f64 fd366, fd364, fd363; +add.f64 fd367, fd362, fd361; +sub.f64 fd368, fd363, fd364; +fma.rn.f64 fd369, fd315, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd370, fd318, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd371, fd316, 0d3FDA9628D9C712B6, fd304; +fma.rn.f64 fd372, fd317, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd373, fd319, 0dBFE4F49E7F775887, fd369; +fma.rn.f64 fd374, fd322, 0d3FE82F19BB3A28A1, fd370; +fma.rn.f64 fd375, fd320, 0dBFE4F49E7F775887, fd371; +fma.rn.f64 fd376, fd321, 0d3FE82F19BB3A28A1, fd372; +fma.rn.f64 fd377, fd323, 0dBFEEB42A9BCD5057, fd373; +fma.rn.f64 fd378, fd326, 0dBFD207E7FD768DBF, fd374; +fma.rn.f64 fd379, fd324, 0dBFEEB42A9BCD5057, fd375; +fma.rn.f64 fd380, fd325, 0dBFD207E7FD768DBF, fd376; +fma.rn.f64 fd381, fd327, 0dBFC2375F640F44DB, fd377; +fma.rn.f64 fd382, fd330, 0dBFEFAC9E043842EF, fd378; +fma.rn.f64 fd383, fd328, 0dBFC2375F640F44DB, fd379; +fma.rn.f64 fd384, fd329, 0dBFEFAC9E043842EF, fd380; +fma.rn.f64 fd385, fd331, 0d3FEAEB8C8764F0BA, fd381; +fma.rn.f64 fd386, fd334, 0dBFE14CEDF8BB580B, fd382; +fma.rn.f64 fd387, fd332, 0d3FEAEB8C8764F0BA, fd383; +fma.rn.f64 fd388, fd333, 0dBFE14CEDF8BB580B, fd384; +sub.f64 fd389, fd385, fd386; +add.f64 fd390, fd388, fd387; +add.f64 fd391, fd386, fd385; +sub.f64 fd392, fd387, fd388; +fma.rn.f64 fd393, fd315, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd394, fd318, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd395, fd316, 0dBFC2375F640F44DB, fd304; +fma.rn.f64 fd396, fd317, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd397, fd319, 0dBFEEB42A9BCD5057, fd393; +fma.rn.f64 fd398, fd322, 0dBFD207E7FD768DBF, fd394; +fma.rn.f64 fd399, fd320, 0dBFEEB42A9BCD5057, fd395; +fma.rn.f64 fd400, fd321, 
0dBFD207E7FD768DBF, fd396; +fma.rn.f64 fd401, fd323, 0d3FDA9628D9C712B6, fd397; +fma.rn.f64 fd402, fd326, 0dBFED1BB48EEE2C13, fd398; +fma.rn.f64 fd403, fd324, 0d3FDA9628D9C712B6, fd399; +fma.rn.f64 fd404, fd325, 0dBFED1BB48EEE2C13, fd400; +fma.rn.f64 fd405, fd327, 0d3FEAEB8C8764F0BA, fd401; +fma.rn.f64 fd406, fd330, 0d3FE14CEDF8BB580B, fd402; +fma.rn.f64 fd407, fd328, 0d3FEAEB8C8764F0BA, fd403; +fma.rn.f64 fd408, fd329, 0d3FE14CEDF8BB580B, fd404; +fma.rn.f64 fd409, fd331, 0dBFE4F49E7F775887, fd405; +fma.rn.f64 fd410, fd334, 0d3FE82F19BB3A28A1, fd406; +fma.rn.f64 fd411, fd332, 0dBFE4F49E7F775887, fd407; +fma.rn.f64 fd412, fd333, 0d3FE82F19BB3A28A1, fd408; +sub.f64 fd413, fd409, fd410; +add.f64 fd414, fd412, fd411; +add.f64 fd415, fd410, fd409; +sub.f64 fd416, fd411, fd412; +fma.rn.f64 fd417, fd315, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd418, fd318, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd419, fd316, 0dBFE4F49E7F775887, fd304; +fma.rn.f64 fd420, fd317, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd421, fd319, 0dBFC2375F640F44DB, fd417; +fma.rn.f64 fd422, fd322, 0dBFEFAC9E043842EF, fd418; +fma.rn.f64 fd423, fd320, 0dBFC2375F640F44DB, fd419; +fma.rn.f64 fd424, fd321, 0dBFEFAC9E043842EF, fd420; +fma.rn.f64 fd425, fd323, 0d3FEAEB8C8764F0BA, fd421; +fma.rn.f64 fd426, fd326, 0d3FE14CEDF8BB580B, fd422; +fma.rn.f64 fd427, fd324, 0d3FEAEB8C8764F0BA, fd423; +fma.rn.f64 fd428, fd325, 0d3FE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd327, 0dBFEEB42A9BCD5057, fd425; +fma.rn.f64 fd430, fd330, 0d3FD207E7FD768DBF, fd426; +fma.rn.f64 fd431, fd328, 0dBFEEB42A9BCD5057, fd427; +fma.rn.f64 fd432, fd329, 0d3FD207E7FD768DBF, fd428; +fma.rn.f64 fd433, fd331, 0d3FDA9628D9C712B6, fd429; +fma.rn.f64 fd434, fd334, 0dBFED1BB48EEE2C13, fd430; +fma.rn.f64 fd435, fd332, 0d3FDA9628D9C712B6, fd431; +fma.rn.f64 fd436, fd333, 0dBFED1BB48EEE2C13, fd432; +sub.f64 fd437, fd433, fd434; +add.f64 fd438, fd436, fd435; +add.f64 fd439, fd434, fd433; +sub.f64 fd440, fd435, fd436; 
+fma.rn.f64 fd441, fd315, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd442, fd318, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd443, fd316, 0dBFEEB42A9BCD5057, fd304; +fma.rn.f64 fd444, fd317, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd445, fd319, 0d3FEAEB8C8764F0BA, fd441; +fma.rn.f64 fd446, fd322, 0dBFE14CEDF8BB580B, fd442; +fma.rn.f64 fd447, fd320, 0d3FEAEB8C8764F0BA, fd443; +fma.rn.f64 fd448, fd321, 0dBFE14CEDF8BB580B, fd444; +fma.rn.f64 fd449, fd323, 0dBFE4F49E7F775887, fd445; +fma.rn.f64 fd450, fd326, 0d3FE82F19BB3A28A1, fd446; +fma.rn.f64 fd451, fd324, 0dBFE4F49E7F775887, fd447; +fma.rn.f64 fd452, fd325, 0d3FE82F19BB3A28A1, fd448; +fma.rn.f64 fd453, fd327, 0d3FDA9628D9C712B6, fd449; +fma.rn.f64 fd454, fd330, 0dBFED1BB48EEE2C13, fd450; +fma.rn.f64 fd455, fd328, 0d3FDA9628D9C712B6, fd451; +fma.rn.f64 fd456, fd329, 0dBFED1BB48EEE2C13, fd452; +fma.rn.f64 fd457, fd331, 0dBFC2375F640F44DB, fd453; +fma.rn.f64 fd458, fd334, 0d3FEFAC9E043842EF, fd454; +fma.rn.f64 fd459, fd332, 0dBFC2375F640F44DB, fd455; +fma.rn.f64 fd460, fd333, 0d3FEFAC9E043842EF, fd456; +sub.f64 fd461, fd457, fd458; +add.f64 fd462, fd460, fd459; +add.f64 fd463, fd458, fd457; +sub.f64 fd464, fd459, fd460; +mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd465, fd466}, [rd11]; +mul.f64 fd469, fd366, fd466; +fma.rn.f64 fd470, fd465, fd365, fd469; +mul.f64 fd471, fd365, fd466; +mul.f64 fd472, fd465, fd366; +sub.f64 fd473, fd472, fd471; +mul.f64 fd474, fd465, fd465; +mul.f64 fd475, fd466, fd466; +sub.f64 fd476, fd474, fd475; +mul.f64 fd477, fd466, fd465; +fma.rn.f64 fd478, fd466, fd465, fd477; +mul.f64 fd479, fd390, fd478; +fma.rn.f64 fd480, fd476, fd389, fd479; +mul.f64 fd481, fd389, fd478; +mul.f64 fd482, fd476, fd390; +sub.f64 fd483, fd482, fd481; +mul.f64 fd484, fd465, fd476; +mul.f64 fd485, fd466, fd478; 
+sub.f64 fd486, fd484, fd485; +mul.f64 fd487, fd465, fd478; +fma.rn.f64 fd488, fd466, fd476, fd487; +mul.f64 fd489, fd414, fd488; +fma.rn.f64 fd490, fd486, fd413, fd489; +mul.f64 fd491, fd413, fd488; +mul.f64 fd492, fd486, fd414; +sub.f64 fd493, fd492, fd491; +mul.f64 fd494, fd465, fd486; +mul.f64 fd495, fd466, fd488; +sub.f64 fd496, fd494, fd495; +mul.f64 fd497, fd465, fd488; +fma.rn.f64 fd498, fd466, fd486, fd497; +mul.f64 fd499, fd438, fd498; +fma.rn.f64 fd500, fd496, fd437, fd499; +mul.f64 fd501, fd437, fd498; +mul.f64 fd502, fd496, fd438; +sub.f64 fd503, fd502, fd501; +mul.f64 fd504, fd465, fd496; +mul.f64 fd505, fd466, fd498; +sub.f64 fd506, fd504, fd505; +mul.f64 fd507, fd465, fd498; +fma.rn.f64 fd508, fd466, fd496, fd507; +mul.f64 fd509, fd462, fd508; +fma.rn.f64 fd510, fd506, fd461, fd509; +mul.f64 fd511, fd461, fd508; +mul.f64 fd512, fd506, fd462; +sub.f64 fd513, fd512, fd511; +ld.global.v2.f64 {fd514, fd515}, [rd11+176]; +mul.f64 fd518, fd464, fd515; +fma.rn.f64 fd519, fd514, fd463, fd518; +mul.f64 fd520, fd463, fd515; +mul.f64 fd521, fd514, fd464; +sub.f64 fd522, fd521, fd520; +mul.f64 fd523, fd465, fd514; +mul.f64 fd524, fd466, fd515; +sub.f64 fd525, fd523, fd524; +mul.f64 fd526, fd465, fd515; +fma.rn.f64 fd527, fd466, fd514, fd526; +mul.f64 fd528, fd440, fd527; +fma.rn.f64 fd529, fd525, fd439, fd528; +mul.f64 fd530, fd439, fd527; +mul.f64 fd531, fd525, fd440; +sub.f64 fd532, fd531, fd530; +mul.f64 fd533, fd465, fd525; +mul.f64 fd534, fd466, fd527; +sub.f64 fd535, fd533, fd534; +mul.f64 fd536, fd465, fd527; +fma.rn.f64 fd537, fd466, fd525, fd536; +mul.f64 fd538, fd416, fd537; +fma.rn.f64 fd539, fd535, fd415, fd538; +mul.f64 fd540, fd415, fd537; +mul.f64 fd541, fd535, fd416; +sub.f64 fd542, fd541, fd540; +mul.f64 fd543, fd465, fd535; +mul.f64 fd544, fd466, fd537; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd465, fd537; +fma.rn.f64 fd547, fd466, fd535, fd546; +mul.f64 fd548, fd392, fd547; +fma.rn.f64 fd549, fd545, fd391, fd548; +mul.f64 fd550, fd391, 
fd547; +mul.f64 fd551, fd545, fd392; +sub.f64 fd552, fd551, fd550; +mul.f64 fd553, fd465, fd545; +mul.f64 fd554, fd466, fd547; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd465, fd547; +fma.rn.f64 fd557, fd466, fd545, fd556; +mul.f64 fd558, fd368, fd557; +fma.rn.f64 fd559, fd555, fd367, fd558; +mul.f64 fd560, fd367, fd557; +mul.f64 fd561, fd555, fd368; +sub.f64 fd562, fd561, fd560; +shl.b32 r18, r17, 3; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 968, r19; +st.shared.f64 [r20], fd343; +st.shared.f64 [r20+88], fd470; +st.shared.f64 [r20+176], fd480; +st.shared.f64 [r20+264], fd490; +st.shared.f64 [r20+352], fd500; +st.shared.f64 [r20+440], fd510; +st.shared.f64 [r20+528], fd519; +st.shared.f64 [r20+616], fd529; +st.shared.f64 [r20+704], fd539; +st.shared.f64 [r20+792], fd549; +st.shared.f64 [r20+880], fd559; +barrier.sync 0; +ld.shared.f64 fd563, [r14]; +ld.shared.f64 fd564, [r14+968]; +ld.shared.f64 fd565, [r14+1936]; +ld.shared.f64 fd566, [r14+2904]; +ld.shared.f64 fd567, [r14+3872]; +ld.shared.f64 fd568, [r14+4840]; +ld.shared.f64 fd569, [r14+5808]; +ld.shared.f64 fd570, [r14+6776]; +ld.shared.f64 fd571, [r14+7744]; +ld.shared.f64 fd572, [r14+8712]; +ld.shared.f64 fd573, [r14+9680]; +barrier.sync 0; +st.shared.f64 [r20], fd344; +st.shared.f64 [r20+88], fd473; +st.shared.f64 [r20+176], fd483; +st.shared.f64 [r20+264], fd493; +st.shared.f64 [r20+352], fd503; +st.shared.f64 [r20+440], fd513; +st.shared.f64 [r20+528], fd522; +st.shared.f64 [r20+616], fd532; +st.shared.f64 [r20+704], fd542; +st.shared.f64 [r20+792], fd552; +st.shared.f64 [r20+880], fd562; +barrier.sync 0; +ld.shared.f64 fd574, [r14]; +ld.shared.f64 fd575, [r14+968]; +ld.shared.f64 fd576, [r14+1936]; +ld.shared.f64 fd577, [r14+2904]; +ld.shared.f64 fd578, [r14+3872]; +ld.shared.f64 fd579, [r14+4840]; +ld.shared.f64 fd580, [r14+5808]; +ld.shared.f64 fd581, [r14+6776]; +ld.shared.f64 fd582, [r14+7744]; +ld.shared.f64 fd583, [r14+8712]; +ld.shared.f64 fd584, [r14+9680]; +add.f64 
fd585, fd564, fd573; +add.f64 fd586, fd575, fd584; +sub.f64 fd587, fd564, fd573; +sub.f64 fd588, fd575, fd584; +add.f64 fd589, fd565, fd572; +add.f64 fd590, fd576, fd583; +sub.f64 fd591, fd565, fd572; +sub.f64 fd592, fd576, fd583; +add.f64 fd593, fd566, fd571; +add.f64 fd594, fd577, fd582; +sub.f64 fd595, fd566, fd571; +sub.f64 fd596, fd577, fd582; +add.f64 fd597, fd567, fd570; +add.f64 fd598, fd578, fd581; +sub.f64 fd599, fd567, fd570; +sub.f64 fd600, fd578, fd581; +add.f64 fd601, fd568, fd569; +add.f64 fd602, fd579, fd580; +sub.f64 fd603, fd568, fd569; +sub.f64 fd604, fd579, fd580; +add.f64 fd605, fd563, fd585; +add.f64 fd606, fd574, fd586; +add.f64 fd607, fd605, fd589; +add.f64 fd608, fd606, fd590; +add.f64 fd609, fd607, fd593; +add.f64 fd610, fd608, fd594; +add.f64 fd611, fd609, fd597; +add.f64 fd612, fd610, fd598; +fma.rn.f64 fd613, fd585, 0d3FEAEB8C8764F0BA, fd563; +fma.rn.f64 fd614, fd588, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd615, fd586, 0d3FEAEB8C8764F0BA, fd574; +fma.rn.f64 fd616, fd587, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd617, fd589, 0d3FDA9628D9C712B6, fd613; +fma.rn.f64 fd618, fd592, 0d3FED1BB48EEE2C13, fd614; +fma.rn.f64 fd619, fd590, 0d3FDA9628D9C712B6, fd615; +fma.rn.f64 fd620, fd591, 0d3FED1BB48EEE2C13, fd616; +fma.rn.f64 fd621, fd593, 0dBFC2375F640F44DB, fd617; +fma.rn.f64 fd622, fd596, 0d3FEFAC9E043842EF, fd618; +fma.rn.f64 fd623, fd594, 0dBFC2375F640F44DB, fd619; +fma.rn.f64 fd624, fd595, 0d3FEFAC9E043842EF, fd620; +fma.rn.f64 fd625, fd597, 0dBFE4F49E7F775887, fd621; +fma.rn.f64 fd626, fd600, 0d3FE82F19BB3A28A1, fd622; +fma.rn.f64 fd627, fd598, 0dBFE4F49E7F775887, fd623; +fma.rn.f64 fd628, fd599, 0d3FE82F19BB3A28A1, fd624; +fma.rn.f64 fd629, fd601, 0dBFEEB42A9BCD5057, fd625; +fma.rn.f64 fd630, fd604, 0d3FD207E7FD768DBF, fd626; +fma.rn.f64 fd631, fd602, 0dBFEEB42A9BCD5057, fd627; +fma.rn.f64 fd632, fd603, 0d3FD207E7FD768DBF, fd628; +fma.rn.f64 fd633, fd585, 0d3FDA9628D9C712B6, fd563; +fma.rn.f64 fd634, fd588, 
0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd635, fd586, 0d3FDA9628D9C712B6, fd574; +fma.rn.f64 fd636, fd587, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd637, fd589, 0dBFE4F49E7F775887, fd633; +fma.rn.f64 fd638, fd592, 0d3FE82F19BB3A28A1, fd634; +fma.rn.f64 fd639, fd590, 0dBFE4F49E7F775887, fd635; +fma.rn.f64 fd640, fd591, 0d3FE82F19BB3A28A1, fd636; +fma.rn.f64 fd641, fd593, 0dBFEEB42A9BCD5057, fd637; +fma.rn.f64 fd642, fd596, 0dBFD207E7FD768DBF, fd638; +fma.rn.f64 fd643, fd594, 0dBFEEB42A9BCD5057, fd639; +fma.rn.f64 fd644, fd595, 0dBFD207E7FD768DBF, fd640; +fma.rn.f64 fd645, fd597, 0dBFC2375F640F44DB, fd641; +fma.rn.f64 fd646, fd600, 0dBFEFAC9E043842EF, fd642; +fma.rn.f64 fd647, fd598, 0dBFC2375F640F44DB, fd643; +fma.rn.f64 fd648, fd599, 0dBFEFAC9E043842EF, fd644; +fma.rn.f64 fd649, fd601, 0d3FEAEB8C8764F0BA, fd645; +fma.rn.f64 fd650, fd604, 0dBFE14CEDF8BB580B, fd646; +fma.rn.f64 fd651, fd602, 0d3FEAEB8C8764F0BA, fd647; +fma.rn.f64 fd652, fd603, 0dBFE14CEDF8BB580B, fd648; +fma.rn.f64 fd653, fd585, 0dBFC2375F640F44DB, fd563; +fma.rn.f64 fd654, fd588, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd655, fd586, 0dBFC2375F640F44DB, fd574; +fma.rn.f64 fd656, fd587, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd657, fd589, 0dBFEEB42A9BCD5057, fd653; +fma.rn.f64 fd658, fd592, 0dBFD207E7FD768DBF, fd654; +fma.rn.f64 fd659, fd590, 0dBFEEB42A9BCD5057, fd655; +fma.rn.f64 fd660, fd591, 0dBFD207E7FD768DBF, fd656; +fma.rn.f64 fd661, fd593, 0d3FDA9628D9C712B6, fd657; +fma.rn.f64 fd662, fd596, 0dBFED1BB48EEE2C13, fd658; +fma.rn.f64 fd663, fd594, 0d3FDA9628D9C712B6, fd659; +fma.rn.f64 fd664, fd595, 0dBFED1BB48EEE2C13, fd660; +fma.rn.f64 fd665, fd597, 0d3FEAEB8C8764F0BA, fd661; +fma.rn.f64 fd666, fd600, 0d3FE14CEDF8BB580B, fd662; +fma.rn.f64 fd667, fd598, 0d3FEAEB8C8764F0BA, fd663; +fma.rn.f64 fd668, fd599, 0d3FE14CEDF8BB580B, fd664; +fma.rn.f64 fd669, fd601, 0dBFE4F49E7F775887, fd665; +fma.rn.f64 fd670, fd604, 0d3FE82F19BB3A28A1, fd666; +fma.rn.f64 
fd671, fd602, 0dBFE4F49E7F775887, fd667; +fma.rn.f64 fd672, fd603, 0d3FE82F19BB3A28A1, fd668; +fma.rn.f64 fd673, fd585, 0dBFE4F49E7F775887, fd563; +fma.rn.f64 fd674, fd588, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd675, fd586, 0dBFE4F49E7F775887, fd574; +fma.rn.f64 fd676, fd587, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd677, fd589, 0dBFC2375F640F44DB, fd673; +fma.rn.f64 fd678, fd592, 0dBFEFAC9E043842EF, fd674; +fma.rn.f64 fd679, fd590, 0dBFC2375F640F44DB, fd675; +fma.rn.f64 fd680, fd591, 0dBFEFAC9E043842EF, fd676; +fma.rn.f64 fd681, fd593, 0d3FEAEB8C8764F0BA, fd677; +fma.rn.f64 fd682, fd596, 0d3FE14CEDF8BB580B, fd678; +fma.rn.f64 fd683, fd594, 0d3FEAEB8C8764F0BA, fd679; +fma.rn.f64 fd684, fd595, 0d3FE14CEDF8BB580B, fd680; +fma.rn.f64 fd685, fd597, 0dBFEEB42A9BCD5057, fd681; +fma.rn.f64 fd686, fd600, 0d3FD207E7FD768DBF, fd682; +fma.rn.f64 fd687, fd598, 0dBFEEB42A9BCD5057, fd683; +fma.rn.f64 fd688, fd599, 0d3FD207E7FD768DBF, fd684; +fma.rn.f64 fd689, fd601, 0d3FDA9628D9C712B6, fd685; +fma.rn.f64 fd690, fd604, 0dBFED1BB48EEE2C13, fd686; +fma.rn.f64 fd691, fd602, 0d3FDA9628D9C712B6, fd687; +fma.rn.f64 fd692, fd603, 0dBFED1BB48EEE2C13, fd688; +fma.rn.f64 fd693, fd585, 0dBFEEB42A9BCD5057, fd563; +fma.rn.f64 fd694, fd588, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd695, fd586, 0dBFEEB42A9BCD5057, fd574; +fma.rn.f64 fd696, fd587, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd697, fd589, 0d3FEAEB8C8764F0BA, fd693; +fma.rn.f64 fd698, fd592, 0dBFE14CEDF8BB580B, fd694; +fma.rn.f64 fd699, fd590, 0d3FEAEB8C8764F0BA, fd695; +fma.rn.f64 fd700, fd591, 0dBFE14CEDF8BB580B, fd696; +fma.rn.f64 fd701, fd593, 0dBFE4F49E7F775887, fd697; +fma.rn.f64 fd702, fd596, 0d3FE82F19BB3A28A1, fd698; +fma.rn.f64 fd703, fd594, 0dBFE4F49E7F775887, fd699; +fma.rn.f64 fd704, fd595, 0d3FE82F19BB3A28A1, fd700; +fma.rn.f64 fd705, fd597, 0d3FDA9628D9C712B6, fd701; +fma.rn.f64 fd706, fd600, 0dBFED1BB48EEE2C13, fd702; +fma.rn.f64 fd707, fd598, 0d3FDA9628D9C712B6, 
fd703; +fma.rn.f64 fd708, fd599, 0dBFED1BB48EEE2C13, fd704; +fma.rn.f64 fd709, fd601, 0dBFC2375F640F44DB, fd705; +fma.rn.f64 fd710, fd604, 0d3FEFAC9E043842EF, fd706; +fma.rn.f64 fd711, fd602, 0dBFC2375F640F44DB, fd707; +fma.rn.f64 fd712, fd603, 0d3FEFAC9E043842EF, fd708; +add.f64 %0, fd611, fd601; +add.f64 %1, fd612, fd602; +add.f64 %3, fd632, fd631; +sub.f64 %2, fd629, fd630; +add.f64 %5, fd652, fd651; +sub.f64 %4, fd649, fd650; +add.f64 %7, fd672, fd671; +sub.f64 %6, fd669, fd670; +add.f64 %9, fd692, fd691; +sub.f64 %8, fd689, fd690; +add.f64 %11, fd712, fd711; +sub.f64 %10, fd709, fd710; +sub.f64 %13, fd711, fd712; +add.f64 %12, fd710, fd709; +sub.f64 %15, fd691, fd692; +add.f64 %14, fd690, fd689; +sub.f64 %17, fd671, fd672; +add.f64 %16, fd670, fd669; +sub.f64 %19, fd651, fd652; +add.f64 %18, fd650, fd649; +sub.f64 %21, fd631, fd632; +add.f64 %20, fd630, fd629; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_1331), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<728, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<779>; +.reg .b64 rd<12>; +mov.u32 
r1, %tid.y; +mov.u32 r2, %22; +mad.lo.s32 r3, r1, 21296, r2; +add.f64 fd45, %27, %51; +add.f64 fd46, %29, %52; +sub.f64 fd47, %27, %51; +sub.f64 fd48, %29, %52; +add.f64 fd49, %30, %49; +add.f64 fd50, %32, %50; +sub.f64 fd51, %30, %49; +sub.f64 fd52, %32, %50; +add.f64 fd53, %33, %46; +add.f64 fd54, %34, %48; +sub.f64 fd55, %33, %46; +sub.f64 fd56, %34, %48; +add.f64 fd57, %35, %43; +add.f64 fd58, %37, %45; +sub.f64 fd59, %35, %43; +sub.f64 fd60, %37, %45; +add.f64 fd61, %38, %41; +add.f64 fd62, %40, %42; +sub.f64 fd63, %38, %41; +sub.f64 fd64, %40, %42; +mov.u32 r4, %tid.x; +add.f64 fd65, %25, fd45; +add.f64 fd66, %26, fd46; +add.f64 fd67, fd65, fd49; +add.f64 fd68, fd66, fd50; +add.f64 fd69, fd67, fd53; +add.f64 fd70, fd68, fd54; +add.f64 fd71, fd69, fd57; +add.f64 fd72, fd70, fd58; +fma.rn.f64 fd73, fd45, 0d3FEAEB8C8764F0BA, %25; +fma.rn.f64 fd74, fd48, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd75, fd46, 0d3FEAEB8C8764F0BA, %26; +fma.rn.f64 fd76, fd47, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd77, fd49, 0d3FDA9628D9C712B6, fd73; +fma.rn.f64 fd78, fd52, 0d3FED1BB48EEE2C13, fd74; +fma.rn.f64 fd79, fd50, 0d3FDA9628D9C712B6, fd75; +fma.rn.f64 fd80, fd51, 0d3FED1BB48EEE2C13, fd76; +fma.rn.f64 fd81, fd53, 0dBFC2375F640F44DB, fd77; +fma.rn.f64 fd82, fd56, 0d3FEFAC9E043842EF, fd78; +fma.rn.f64 fd83, fd54, 0dBFC2375F640F44DB, fd79; +fma.rn.f64 fd84, fd55, 0d3FEFAC9E043842EF, fd80; +fma.rn.f64 fd85, fd57, 0dBFE4F49E7F775887, fd81; +fma.rn.f64 fd86, fd60, 0d3FE82F19BB3A28A1, fd82; +fma.rn.f64 fd87, fd58, 0dBFE4F49E7F775887, fd83; +fma.rn.f64 fd88, fd59, 0d3FE82F19BB3A28A1, fd84; +fma.rn.f64 fd89, fd61, 0dBFEEB42A9BCD5057, fd85; +fma.rn.f64 fd90, fd64, 0d3FD207E7FD768DBF, fd86; +fma.rn.f64 fd91, fd62, 0dBFEEB42A9BCD5057, fd87; +fma.rn.f64 fd92, fd63, 0d3FD207E7FD768DBF, fd88; +sub.f64 fd93, fd89, fd90; +add.f64 fd94, fd92, fd91; +add.f64 fd95, fd90, fd89; +sub.f64 fd96, fd91, fd92; +fma.rn.f64 fd97, fd45, 0d3FDA9628D9C712B6, %25; +fma.rn.f64 fd98, 
fd48, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd99, fd46, 0d3FDA9628D9C712B6, %26; +fma.rn.f64 fd100, fd47, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd101, fd49, 0dBFE4F49E7F775887, fd97; +fma.rn.f64 fd102, fd52, 0d3FE82F19BB3A28A1, fd98; +fma.rn.f64 fd103, fd50, 0dBFE4F49E7F775887, fd99; +fma.rn.f64 fd104, fd51, 0d3FE82F19BB3A28A1, fd100; +fma.rn.f64 fd105, fd53, 0dBFEEB42A9BCD5057, fd101; +fma.rn.f64 fd106, fd56, 0dBFD207E7FD768DBF, fd102; +fma.rn.f64 fd107, fd54, 0dBFEEB42A9BCD5057, fd103; +fma.rn.f64 fd108, fd55, 0dBFD207E7FD768DBF, fd104; +fma.rn.f64 fd109, fd57, 0dBFC2375F640F44DB, fd105; +fma.rn.f64 fd110, fd60, 0dBFEFAC9E043842EF, fd106; +fma.rn.f64 fd111, fd58, 0dBFC2375F640F44DB, fd107; +fma.rn.f64 fd112, fd59, 0dBFEFAC9E043842EF, fd108; +fma.rn.f64 fd113, fd61, 0d3FEAEB8C8764F0BA, fd109; +fma.rn.f64 fd114, fd64, 0dBFE14CEDF8BB580B, fd110; +fma.rn.f64 fd115, fd62, 0d3FEAEB8C8764F0BA, fd111; +fma.rn.f64 fd116, fd63, 0dBFE14CEDF8BB580B, fd112; +sub.f64 fd117, fd113, fd114; +add.f64 fd118, fd116, fd115; +add.f64 fd119, fd114, fd113; +sub.f64 fd120, fd115, fd116; +fma.rn.f64 fd121, fd45, 0dBFC2375F640F44DB, %25; +fma.rn.f64 fd122, fd48, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd123, fd46, 0dBFC2375F640F44DB, %26; +fma.rn.f64 fd124, fd47, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd125, fd49, 0dBFEEB42A9BCD5057, fd121; +fma.rn.f64 fd126, fd52, 0dBFD207E7FD768DBF, fd122; +fma.rn.f64 fd127, fd50, 0dBFEEB42A9BCD5057, fd123; +fma.rn.f64 fd128, fd51, 0dBFD207E7FD768DBF, fd124; +fma.rn.f64 fd129, fd53, 0d3FDA9628D9C712B6, fd125; +fma.rn.f64 fd130, fd56, 0dBFED1BB48EEE2C13, fd126; +fma.rn.f64 fd131, fd54, 0d3FDA9628D9C712B6, fd127; +fma.rn.f64 fd132, fd55, 0dBFED1BB48EEE2C13, fd128; +fma.rn.f64 fd133, fd57, 0d3FEAEB8C8764F0BA, fd129; +fma.rn.f64 fd134, fd60, 0d3FE14CEDF8BB580B, fd130; +fma.rn.f64 fd135, fd58, 0d3FEAEB8C8764F0BA, fd131; +fma.rn.f64 fd136, fd59, 0d3FE14CEDF8BB580B, fd132; +fma.rn.f64 fd137, fd61, 
0dBFE4F49E7F775887, fd133; +fma.rn.f64 fd138, fd64, 0d3FE82F19BB3A28A1, fd134; +fma.rn.f64 fd139, fd62, 0dBFE4F49E7F775887, fd135; +fma.rn.f64 fd140, fd63, 0d3FE82F19BB3A28A1, fd136; +sub.f64 fd141, fd137, fd138; +add.f64 fd142, fd140, fd139; +add.f64 fd143, fd138, fd137; +sub.f64 fd144, fd139, fd140; +fma.rn.f64 fd145, fd45, 0dBFE4F49E7F775887, %25; +fma.rn.f64 fd146, fd48, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd147, fd46, 0dBFE4F49E7F775887, %26; +fma.rn.f64 fd148, fd47, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd149, fd49, 0dBFC2375F640F44DB, fd145; +fma.rn.f64 fd150, fd52, 0dBFEFAC9E043842EF, fd146; +fma.rn.f64 fd151, fd50, 0dBFC2375F640F44DB, fd147; +fma.rn.f64 fd152, fd51, 0dBFEFAC9E043842EF, fd148; +fma.rn.f64 fd153, fd53, 0d3FEAEB8C8764F0BA, fd149; +fma.rn.f64 fd154, fd56, 0d3FE14CEDF8BB580B, fd150; +fma.rn.f64 fd155, fd54, 0d3FEAEB8C8764F0BA, fd151; +fma.rn.f64 fd156, fd55, 0d3FE14CEDF8BB580B, fd152; +fma.rn.f64 fd157, fd57, 0dBFEEB42A9BCD5057, fd153; +fma.rn.f64 fd158, fd60, 0d3FD207E7FD768DBF, fd154; +fma.rn.f64 fd159, fd58, 0dBFEEB42A9BCD5057, fd155; +fma.rn.f64 fd160, fd59, 0d3FD207E7FD768DBF, fd156; +fma.rn.f64 fd161, fd61, 0d3FDA9628D9C712B6, fd157; +fma.rn.f64 fd162, fd64, 0dBFED1BB48EEE2C13, fd158; +fma.rn.f64 fd163, fd62, 0d3FDA9628D9C712B6, fd159; +fma.rn.f64 fd164, fd63, 0dBFED1BB48EEE2C13, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd166, fd164, fd163; +add.f64 fd167, fd162, fd161; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 fd169, fd45, 0dBFEEB42A9BCD5057, %25; +fma.rn.f64 fd170, fd48, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd171, fd46, 0dBFEEB42A9BCD5057, %26; +fma.rn.f64 fd172, fd47, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd173, fd49, 0d3FEAEB8C8764F0BA, fd169; +fma.rn.f64 fd174, fd52, 0dBFE14CEDF8BB580B, fd170; +fma.rn.f64 fd175, fd50, 0d3FEAEB8C8764F0BA, fd171; +fma.rn.f64 fd176, fd51, 0dBFE14CEDF8BB580B, fd172; +fma.rn.f64 fd177, fd53, 0dBFE4F49E7F775887, fd173; +fma.rn.f64 fd178, fd56, 
0d3FE82F19BB3A28A1, fd174; +fma.rn.f64 fd179, fd54, 0dBFE4F49E7F775887, fd175; +fma.rn.f64 fd180, fd55, 0d3FE82F19BB3A28A1, fd176; +fma.rn.f64 fd181, fd57, 0d3FDA9628D9C712B6, fd177; +fma.rn.f64 fd182, fd60, 0dBFED1BB48EEE2C13, fd178; +fma.rn.f64 fd183, fd58, 0d3FDA9628D9C712B6, fd179; +fma.rn.f64 fd184, fd59, 0dBFED1BB48EEE2C13, fd180; +fma.rn.f64 fd185, fd61, 0dBFC2375F640F44DB, fd181; +fma.rn.f64 fd186, fd64, 0d3FEFAC9E043842EF, fd182; +fma.rn.f64 fd187, fd62, 0dBFC2375F640F44DB, fd183; +fma.rn.f64 fd188, fd63, 0d3FEFAC9E043842EF, fd184; +sub.f64 fd189, fd185, fd186; +add.f64 fd190, fd188, fd187; +add.f64 fd191, fd186, fd185; +sub.f64 fd192, fd187, fd188; +mul.wide.u32 rd2, r4, 248469183; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 6; +mul.lo.s32 r10, r9, 121; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 21296, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %23; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd193, fd194}, [rd6]; +mul.f64 fd197, fd94, fd194; +mul.f64 fd198, fd93, fd194; +mul.f64 fd199, fd193, fd94; +mul.f64 fd200, fd193, fd193; +mul.f64 fd201, fd194, fd194; +sub.f64 fd202, fd200, fd201; +mul.f64 fd203, fd194, fd193; +fma.rn.f64 fd204, fd194, fd193, fd203; +mul.f64 fd205, fd118, fd204; +mul.f64 fd206, fd117, fd204; +mul.f64 fd207, fd202, fd118; +mul.f64 fd208, fd193, fd202; +mul.f64 fd209, fd194, fd204; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd193, fd204; +fma.rn.f64 fd212, fd194, fd202, fd211; +mul.f64 fd213, fd142, fd212; +mul.f64 fd214, fd141, fd212; +mul.f64 fd215, fd210, fd142; +mul.f64 fd216, fd193, fd210; +mul.f64 fd217, fd194, fd212; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd193, fd212; +fma.rn.f64 fd220, fd194, fd210, fd219; +mul.f64 fd221, fd166, fd220; +mul.f64 fd222, fd165, fd220; +mul.f64 fd223, fd218, fd166; +mul.f64 fd224, fd193, fd218; +mul.f64 fd225, fd194, fd220; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd193, fd220; +fma.rn.f64 fd228, 
fd194, fd218, fd227; +mul.f64 fd229, fd190, fd228; +mul.f64 fd230, fd189, fd228; +mul.f64 fd231, fd226, fd190; +ld.global.v2.f64 {fd232, fd233}, [rd6+1936]; +mul.f64 fd236, fd192, fd233; +mul.f64 fd237, fd191, fd233; +mul.f64 fd238, fd232, fd192; +mul.f64 fd239, fd193, fd232; +mul.f64 fd240, fd194, fd233; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd193, fd233; +fma.rn.f64 fd243, fd194, fd232, fd242; +mul.f64 fd244, fd168, fd243; +mul.f64 fd245, fd167, fd243; +mul.f64 fd246, fd241, fd168; +mul.f64 fd247, fd193, fd241; +mul.f64 fd248, fd194, fd243; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd193, fd243; +fma.rn.f64 fd251, fd194, fd241, fd250; +mul.f64 fd252, fd144, fd251; +mul.f64 fd253, fd143, fd251; +mul.f64 fd254, fd249, fd144; +mul.f64 fd255, fd193, fd249; +mul.f64 fd256, fd194, fd251; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd193, fd251; +fma.rn.f64 fd259, fd194, fd249, fd258; +mul.f64 fd260, fd120, fd259; +mul.f64 fd261, fd119, fd259; +mul.f64 fd262, fd257, fd120; +mul.f64 fd263, fd193, fd257; +mul.f64 fd264, fd194, fd259; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd193, fd259; +fma.rn.f64 fd267, fd194, fd257, fd266; +mul.f64 fd268, fd96, fd267; +mul.f64 fd269, fd95, fd267; +mul.f64 fd270, fd265, fd96; +barrier.sync 0; +mad.lo.s32 r13, r11, 176, r12; +add.f64 fd271, fd72, fd62; +add.f64 fd272, fd71, fd61; +st.shared.v2.f64 [r13], {fd272, fd271}; +fma.rn.f64 fd273, fd193, fd93, fd197; +sub.f64 fd274, fd199, fd198; +st.shared.v2.f64 [r13+16], {fd273, fd274}; +fma.rn.f64 fd275, fd202, fd117, fd205; +sub.f64 fd276, fd207, fd206; +st.shared.v2.f64 [r13+32], {fd275, fd276}; +sub.f64 fd277, fd215, fd214; +fma.rn.f64 fd278, fd210, fd141, fd213; +st.shared.v2.f64 [r13+48], {fd278, fd277}; +fma.rn.f64 fd279, fd218, fd165, fd221; +sub.f64 fd280, fd223, fd222; +st.shared.v2.f64 [r13+64], {fd279, fd280}; +fma.rn.f64 fd281, fd226, fd189, fd229; +sub.f64 fd282, fd231, fd230; +st.shared.v2.f64 [r13+80], {fd281, fd282}; +fma.rn.f64 fd283, fd232, fd191, fd236; 
+sub.f64 fd284, fd238, fd237; +st.shared.v2.f64 [r13+96], {fd283, fd284}; +fma.rn.f64 fd285, fd241, fd167, fd244; +sub.f64 fd286, fd246, fd245; +st.shared.v2.f64 [r13+112], {fd285, fd286}; +sub.f64 fd287, fd254, fd253; +fma.rn.f64 fd288, fd249, fd143, fd252; +st.shared.v2.f64 [r13+128], {fd288, fd287}; +fma.rn.f64 fd289, fd257, fd119, fd260; +sub.f64 fd290, fd262, fd261; +st.shared.v2.f64 [r13+144], {fd289, fd290}; +fma.rn.f64 fd291, fd265, fd95, fd268; +sub.f64 fd292, fd270, fd269; +st.shared.v2.f64 [r13+160], {fd291, fd292}; +barrier.sync 0; +mad.lo.s32 r14, r11, -160, r13; +ld.shared.v2.f64 {fd293, fd294}, [r14]; +ld.shared.v2.f64 {fd297, fd298}, [r14+1936]; +ld.shared.v2.f64 {fd301, fd302}, [r14+3872]; +ld.shared.v2.f64 {fd305, fd306}, [r14+5808]; +ld.shared.v2.f64 {fd309, fd310}, [r14+7744]; +ld.shared.v2.f64 {fd313, fd314}, [r14+9680]; +ld.shared.v2.f64 {fd317, fd318}, [r14+11616]; +ld.shared.v2.f64 {fd321, fd322}, [r14+13552]; +ld.shared.v2.f64 {fd325, fd326}, [r14+15488]; +ld.shared.v2.f64 {fd329, fd330}, [r14+17424]; +ld.shared.v2.f64 {fd333, fd334}, [r14+19360]; +add.f64 fd337, fd297, fd333; +add.f64 fd338, fd298, fd334; +sub.f64 fd339, fd297, fd333; +sub.f64 fd340, fd298, fd334; +add.f64 fd341, fd301, fd329; +add.f64 fd342, fd302, fd330; +sub.f64 fd343, fd301, fd329; +sub.f64 fd344, fd302, fd330; +add.f64 fd345, fd305, fd325; +add.f64 fd346, fd306, fd326; +sub.f64 fd347, fd305, fd325; +sub.f64 fd348, fd306, fd326; +add.f64 fd349, fd309, fd321; +add.f64 fd350, fd310, fd322; +sub.f64 fd351, fd309, fd321; +sub.f64 fd352, fd310, fd322; +add.f64 fd353, fd313, fd317; +add.f64 fd354, fd314, fd318; +sub.f64 fd355, fd313, fd317; +sub.f64 fd356, fd314, fd318; +add.f64 fd357, fd293, fd337; +add.f64 fd358, fd294, fd338; +add.f64 fd359, fd357, fd341; +add.f64 fd360, fd358, fd342; +add.f64 fd361, fd359, fd345; +add.f64 fd362, fd360, fd346; +add.f64 fd363, fd361, fd349; +add.f64 fd364, fd362, fd350; +fma.rn.f64 fd365, fd337, 0d3FEAEB8C8764F0BA, fd293; +fma.rn.f64 
fd366, fd340, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd367, fd338, 0d3FEAEB8C8764F0BA, fd294; +fma.rn.f64 fd368, fd339, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd369, fd341, 0d3FDA9628D9C712B6, fd365; +fma.rn.f64 fd370, fd344, 0d3FED1BB48EEE2C13, fd366; +fma.rn.f64 fd371, fd342, 0d3FDA9628D9C712B6, fd367; +fma.rn.f64 fd372, fd343, 0d3FED1BB48EEE2C13, fd368; +fma.rn.f64 fd373, fd345, 0dBFC2375F640F44DB, fd369; +fma.rn.f64 fd374, fd348, 0d3FEFAC9E043842EF, fd370; +fma.rn.f64 fd375, fd346, 0dBFC2375F640F44DB, fd371; +fma.rn.f64 fd376, fd347, 0d3FEFAC9E043842EF, fd372; +fma.rn.f64 fd377, fd349, 0dBFE4F49E7F775887, fd373; +fma.rn.f64 fd378, fd352, 0d3FE82F19BB3A28A1, fd374; +fma.rn.f64 fd379, fd350, 0dBFE4F49E7F775887, fd375; +fma.rn.f64 fd380, fd351, 0d3FE82F19BB3A28A1, fd376; +fma.rn.f64 fd381, fd353, 0dBFEEB42A9BCD5057, fd377; +fma.rn.f64 fd382, fd356, 0d3FD207E7FD768DBF, fd378; +fma.rn.f64 fd383, fd354, 0dBFEEB42A9BCD5057, fd379; +fma.rn.f64 fd384, fd355, 0d3FD207E7FD768DBF, fd380; +sub.f64 fd385, fd381, fd382; +add.f64 fd386, fd384, fd383; +add.f64 fd387, fd382, fd381; +sub.f64 fd388, fd383, fd384; +fma.rn.f64 fd389, fd337, 0d3FDA9628D9C712B6, fd293; +fma.rn.f64 fd390, fd340, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd391, fd338, 0d3FDA9628D9C712B6, fd294; +fma.rn.f64 fd392, fd339, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd393, fd341, 0dBFE4F49E7F775887, fd389; +fma.rn.f64 fd394, fd344, 0d3FE82F19BB3A28A1, fd390; +fma.rn.f64 fd395, fd342, 0dBFE4F49E7F775887, fd391; +fma.rn.f64 fd396, fd343, 0d3FE82F19BB3A28A1, fd392; +fma.rn.f64 fd397, fd345, 0dBFEEB42A9BCD5057, fd393; +fma.rn.f64 fd398, fd348, 0dBFD207E7FD768DBF, fd394; +fma.rn.f64 fd399, fd346, 0dBFEEB42A9BCD5057, fd395; +fma.rn.f64 fd400, fd347, 0dBFD207E7FD768DBF, fd396; +fma.rn.f64 fd401, fd349, 0dBFC2375F640F44DB, fd397; +fma.rn.f64 fd402, fd352, 0dBFEFAC9E043842EF, fd398; +fma.rn.f64 fd403, fd350, 0dBFC2375F640F44DB, fd399; +fma.rn.f64 fd404, fd351, 
0dBFEFAC9E043842EF, fd400; +fma.rn.f64 fd405, fd353, 0d3FEAEB8C8764F0BA, fd401; +fma.rn.f64 fd406, fd356, 0dBFE14CEDF8BB580B, fd402; +fma.rn.f64 fd407, fd354, 0d3FEAEB8C8764F0BA, fd403; +fma.rn.f64 fd408, fd355, 0dBFE14CEDF8BB580B, fd404; +sub.f64 fd409, fd405, fd406; +add.f64 fd410, fd408, fd407; +add.f64 fd411, fd406, fd405; +sub.f64 fd412, fd407, fd408; +fma.rn.f64 fd413, fd337, 0dBFC2375F640F44DB, fd293; +fma.rn.f64 fd414, fd340, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd415, fd338, 0dBFC2375F640F44DB, fd294; +fma.rn.f64 fd416, fd339, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd417, fd341, 0dBFEEB42A9BCD5057, fd413; +fma.rn.f64 fd418, fd344, 0dBFD207E7FD768DBF, fd414; +fma.rn.f64 fd419, fd342, 0dBFEEB42A9BCD5057, fd415; +fma.rn.f64 fd420, fd343, 0dBFD207E7FD768DBF, fd416; +fma.rn.f64 fd421, fd345, 0d3FDA9628D9C712B6, fd417; +fma.rn.f64 fd422, fd348, 0dBFED1BB48EEE2C13, fd418; +fma.rn.f64 fd423, fd346, 0d3FDA9628D9C712B6, fd419; +fma.rn.f64 fd424, fd347, 0dBFED1BB48EEE2C13, fd420; +fma.rn.f64 fd425, fd349, 0d3FEAEB8C8764F0BA, fd421; +fma.rn.f64 fd426, fd352, 0d3FE14CEDF8BB580B, fd422; +fma.rn.f64 fd427, fd350, 0d3FEAEB8C8764F0BA, fd423; +fma.rn.f64 fd428, fd351, 0d3FE14CEDF8BB580B, fd424; +fma.rn.f64 fd429, fd353, 0dBFE4F49E7F775887, fd425; +fma.rn.f64 fd430, fd356, 0d3FE82F19BB3A28A1, fd426; +fma.rn.f64 fd431, fd354, 0dBFE4F49E7F775887, fd427; +fma.rn.f64 fd432, fd355, 0d3FE82F19BB3A28A1, fd428; +sub.f64 fd433, fd429, fd430; +add.f64 fd434, fd432, fd431; +add.f64 fd435, fd430, fd429; +sub.f64 fd436, fd431, fd432; +fma.rn.f64 fd437, fd337, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd438, fd340, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd439, fd338, 0dBFE4F49E7F775887, fd294; +fma.rn.f64 fd440, fd339, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd441, fd341, 0dBFC2375F640F44DB, fd437; +fma.rn.f64 fd442, fd344, 0dBFEFAC9E043842EF, fd438; +fma.rn.f64 fd443, fd342, 0dBFC2375F640F44DB, fd439; +fma.rn.f64 fd444, fd343, 
0dBFEFAC9E043842EF, fd440; +fma.rn.f64 fd445, fd345, 0d3FEAEB8C8764F0BA, fd441; +fma.rn.f64 fd446, fd348, 0d3FE14CEDF8BB580B, fd442; +fma.rn.f64 fd447, fd346, 0d3FEAEB8C8764F0BA, fd443; +fma.rn.f64 fd448, fd347, 0d3FE14CEDF8BB580B, fd444; +fma.rn.f64 fd449, fd349, 0dBFEEB42A9BCD5057, fd445; +fma.rn.f64 fd450, fd352, 0d3FD207E7FD768DBF, fd446; +fma.rn.f64 fd451, fd350, 0dBFEEB42A9BCD5057, fd447; +fma.rn.f64 fd452, fd351, 0d3FD207E7FD768DBF, fd448; +fma.rn.f64 fd453, fd353, 0d3FDA9628D9C712B6, fd449; +fma.rn.f64 fd454, fd356, 0dBFED1BB48EEE2C13, fd450; +fma.rn.f64 fd455, fd354, 0d3FDA9628D9C712B6, fd451; +fma.rn.f64 fd456, fd355, 0dBFED1BB48EEE2C13, fd452; +sub.f64 fd457, fd453, fd454; +add.f64 fd458, fd456, fd455; +add.f64 fd459, fd454, fd453; +sub.f64 fd460, fd455, fd456; +fma.rn.f64 fd461, fd337, 0dBFEEB42A9BCD5057, fd293; +fma.rn.f64 fd462, fd340, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd463, fd338, 0dBFEEB42A9BCD5057, fd294; +fma.rn.f64 fd464, fd339, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd465, fd341, 0d3FEAEB8C8764F0BA, fd461; +fma.rn.f64 fd466, fd344, 0dBFE14CEDF8BB580B, fd462; +fma.rn.f64 fd467, fd342, 0d3FEAEB8C8764F0BA, fd463; +fma.rn.f64 fd468, fd343, 0dBFE14CEDF8BB580B, fd464; +fma.rn.f64 fd469, fd345, 0dBFE4F49E7F775887, fd465; +fma.rn.f64 fd470, fd348, 0d3FE82F19BB3A28A1, fd466; +fma.rn.f64 fd471, fd346, 0dBFE4F49E7F775887, fd467; +fma.rn.f64 fd472, fd347, 0d3FE82F19BB3A28A1, fd468; +fma.rn.f64 fd473, fd349, 0d3FDA9628D9C712B6, fd469; +fma.rn.f64 fd474, fd352, 0dBFED1BB48EEE2C13, fd470; +fma.rn.f64 fd475, fd350, 0d3FDA9628D9C712B6, fd471; +fma.rn.f64 fd476, fd351, 0dBFED1BB48EEE2C13, fd472; +fma.rn.f64 fd477, fd353, 0dBFC2375F640F44DB, fd473; +fma.rn.f64 fd478, fd356, 0d3FEFAC9E043842EF, fd474; +fma.rn.f64 fd479, fd354, 0dBFC2375F640F44DB, fd475; +fma.rn.f64 fd480, fd355, 0d3FEFAC9E043842EF, fd476; +sub.f64 fd481, fd477, fd478; +add.f64 fd482, fd480, fd479; +add.f64 fd483, fd478, fd477; +sub.f64 fd484, fd479, fd480; 
+mul.wide.u32 rd7, r11, -1171354717; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r15, rd8; +mul.lo.s32 r16, r15, 11; +sub.s32 r17, r11, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %24; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd485, fd486}, [rd11]; +mul.f64 fd489, fd386, fd486; +mul.f64 fd490, fd385, fd486; +mul.f64 fd491, fd485, fd386; +mul.f64 fd492, fd485, fd485; +mul.f64 fd493, fd486, fd486; +sub.f64 fd494, fd492, fd493; +mul.f64 fd495, fd486, fd485; +fma.rn.f64 fd496, fd486, fd485, fd495; +mul.f64 fd497, fd410, fd496; +mul.f64 fd498, fd409, fd496; +mul.f64 fd499, fd494, fd410; +mul.f64 fd500, fd485, fd494; +mul.f64 fd501, fd486, fd496; +sub.f64 fd502, fd500, fd501; +mul.f64 fd503, fd485, fd496; +fma.rn.f64 fd504, fd486, fd494, fd503; +mul.f64 fd505, fd434, fd504; +mul.f64 fd506, fd433, fd504; +mul.f64 fd507, fd502, fd434; +mul.f64 fd508, fd485, fd502; +mul.f64 fd509, fd486, fd504; +sub.f64 fd510, fd508, fd509; +mul.f64 fd511, fd485, fd504; +fma.rn.f64 fd512, fd486, fd502, fd511; +mul.f64 fd513, fd458, fd512; +mul.f64 fd514, fd457, fd512; +mul.f64 fd515, fd510, fd458; +mul.f64 fd516, fd485, fd510; +mul.f64 fd517, fd486, fd512; +sub.f64 fd518, fd516, fd517; +mul.f64 fd519, fd485, fd512; +fma.rn.f64 fd520, fd486, fd510, fd519; +mul.f64 fd521, fd482, fd520; +mul.f64 fd522, fd481, fd520; +mul.f64 fd523, fd518, fd482; +ld.global.v2.f64 {fd524, fd525}, [rd11+176]; +mul.f64 fd528, fd484, fd525; +mul.f64 fd529, fd483, fd525; +mul.f64 fd530, fd524, fd484; +mul.f64 fd531, fd485, fd524; +mul.f64 fd532, fd486, fd525; +sub.f64 fd533, fd531, fd532; +mul.f64 fd534, fd485, fd525; +fma.rn.f64 fd535, fd486, fd524, fd534; +mul.f64 fd536, fd460, fd535; +mul.f64 fd537, fd459, fd535; +mul.f64 fd538, fd533, fd460; +mul.f64 fd539, fd485, fd533; +mul.f64 fd540, fd486, fd535; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd485, fd535; +fma.rn.f64 fd543, fd486, fd533, fd542; +mul.f64 fd544, fd436, fd543; +mul.f64 fd545, fd435, fd543; +mul.f64 fd546, fd541, fd436; +mul.f64 fd547, fd485, 
fd541; +mul.f64 fd548, fd486, fd543; +sub.f64 fd549, fd547, fd548; +mul.f64 fd550, fd485, fd543; +fma.rn.f64 fd551, fd486, fd541, fd550; +mul.f64 fd552, fd412, fd551; +mul.f64 fd553, fd411, fd551; +mul.f64 fd554, fd549, fd412; +mul.f64 fd555, fd485, fd549; +mul.f64 fd556, fd486, fd551; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd485, fd551; +fma.rn.f64 fd559, fd486, fd549, fd558; +mul.f64 fd560, fd388, fd559; +mul.f64 fd561, fd387, fd559; +mul.f64 fd562, fd557, fd388; +shl.b32 r18, r17, 4; +add.s32 r19, r12, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 1936, r19; +add.f64 fd563, fd364, fd354; +add.f64 fd564, fd363, fd353; +st.shared.v2.f64 [r20], {fd564, fd563}; +fma.rn.f64 fd565, fd485, fd385, fd489; +sub.f64 fd566, fd491, fd490; +st.shared.v2.f64 [r20+176], {fd565, fd566}; +fma.rn.f64 fd567, fd494, fd409, fd497; +sub.f64 fd568, fd499, fd498; +st.shared.v2.f64 [r20+352], {fd567, fd568}; +fma.rn.f64 fd569, fd502, fd433, fd505; +sub.f64 fd570, fd507, fd506; +st.shared.v2.f64 [r20+528], {fd569, fd570}; +fma.rn.f64 fd571, fd510, fd457, fd513; +sub.f64 fd572, fd515, fd514; +st.shared.v2.f64 [r20+704], {fd571, fd572}; +sub.f64 fd573, fd523, fd522; +fma.rn.f64 fd574, fd518, fd481, fd521; +st.shared.v2.f64 [r20+880], {fd574, fd573}; +fma.rn.f64 fd575, fd524, fd483, fd528; +sub.f64 fd576, fd530, fd529; +st.shared.v2.f64 [r20+1056], {fd575, fd576}; +fma.rn.f64 fd577, fd533, fd459, fd536; +sub.f64 fd578, fd538, fd537; +st.shared.v2.f64 [r20+1232], {fd577, fd578}; +fma.rn.f64 fd579, fd541, fd435, fd544; +sub.f64 fd580, fd546, fd545; +st.shared.v2.f64 [r20+1408], {fd579, fd580}; +fma.rn.f64 fd581, fd549, fd411, fd552; +sub.f64 fd582, fd554, fd553; +st.shared.v2.f64 [r20+1584], {fd581, fd582}; +sub.f64 fd583, fd562, fd561; +fma.rn.f64 fd584, fd557, fd387, fd560; +st.shared.v2.f64 [r20+1760], {fd584, fd583}; +barrier.sync 0; +ld.shared.v2.f64 {fd585, fd586}, [r14]; +ld.shared.v2.f64 {fd589, fd590}, [r14+1936]; +ld.shared.v2.f64 {fd593, fd594}, [r14+3872]; +ld.shared.v2.f64 
{fd597, fd598}, [r14+5808]; +ld.shared.v2.f64 {fd601, fd602}, [r14+7744]; +ld.shared.v2.f64 {fd605, fd606}, [r14+9680]; +ld.shared.v2.f64 {fd609, fd610}, [r14+11616]; +ld.shared.v2.f64 {fd613, fd614}, [r14+13552]; +ld.shared.v2.f64 {fd617, fd618}, [r14+15488]; +ld.shared.v2.f64 {fd621, fd622}, [r14+17424]; +ld.shared.v2.f64 {fd625, fd626}, [r14+19360]; +add.f64 fd629, fd589, fd625; +add.f64 fd630, fd590, fd626; +sub.f64 fd631, fd589, fd625; +sub.f64 fd632, fd590, fd626; +add.f64 fd633, fd593, fd621; +add.f64 fd634, fd594, fd622; +sub.f64 fd635, fd593, fd621; +sub.f64 fd636, fd594, fd622; +add.f64 fd637, fd597, fd617; +add.f64 fd638, fd598, fd618; +sub.f64 fd639, fd597, fd617; +sub.f64 fd640, fd598, fd618; +add.f64 fd641, fd601, fd613; +add.f64 fd642, fd602, fd614; +sub.f64 fd643, fd601, fd613; +sub.f64 fd644, fd602, fd614; +add.f64 fd645, fd605, fd609; +add.f64 fd646, fd606, fd610; +sub.f64 fd647, fd605, fd609; +sub.f64 fd648, fd606, fd610; +add.f64 fd649, fd585, fd629; +add.f64 fd650, fd586, fd630; +add.f64 fd651, fd649, fd633; +add.f64 fd652, fd650, fd634; +add.f64 fd653, fd651, fd637; +add.f64 fd654, fd652, fd638; +add.f64 fd655, fd653, fd641; +add.f64 fd656, fd654, fd642; +fma.rn.f64 fd657, fd629, 0d3FEAEB8C8764F0BA, fd585; +fma.rn.f64 fd658, fd632, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd659, fd630, 0d3FEAEB8C8764F0BA, fd586; +fma.rn.f64 fd660, fd631, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd661, fd633, 0d3FDA9628D9C712B6, fd657; +fma.rn.f64 fd662, fd636, 0d3FED1BB48EEE2C13, fd658; +fma.rn.f64 fd663, fd634, 0d3FDA9628D9C712B6, fd659; +fma.rn.f64 fd664, fd635, 0d3FED1BB48EEE2C13, fd660; +fma.rn.f64 fd665, fd637, 0dBFC2375F640F44DB, fd661; +fma.rn.f64 fd666, fd640, 0d3FEFAC9E043842EF, fd662; +fma.rn.f64 fd667, fd638, 0dBFC2375F640F44DB, fd663; +fma.rn.f64 fd668, fd639, 0d3FEFAC9E043842EF, fd664; +fma.rn.f64 fd669, fd641, 0dBFE4F49E7F775887, fd665; +fma.rn.f64 fd670, fd644, 0d3FE82F19BB3A28A1, fd666; +fma.rn.f64 fd671, fd642, 
0dBFE4F49E7F775887, fd667; +fma.rn.f64 fd672, fd643, 0d3FE82F19BB3A28A1, fd668; +fma.rn.f64 fd673, fd645, 0dBFEEB42A9BCD5057, fd669; +fma.rn.f64 fd674, fd648, 0d3FD207E7FD768DBF, fd670; +fma.rn.f64 fd675, fd646, 0dBFEEB42A9BCD5057, fd671; +fma.rn.f64 fd676, fd647, 0d3FD207E7FD768DBF, fd672; +fma.rn.f64 fd677, fd629, 0d3FDA9628D9C712B6, fd585; +fma.rn.f64 fd678, fd632, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd679, fd630, 0d3FDA9628D9C712B6, fd586; +fma.rn.f64 fd680, fd631, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd681, fd633, 0dBFE4F49E7F775887, fd677; +fma.rn.f64 fd682, fd636, 0d3FE82F19BB3A28A1, fd678; +fma.rn.f64 fd683, fd634, 0dBFE4F49E7F775887, fd679; +fma.rn.f64 fd684, fd635, 0d3FE82F19BB3A28A1, fd680; +fma.rn.f64 fd685, fd637, 0dBFEEB42A9BCD5057, fd681; +fma.rn.f64 fd686, fd640, 0dBFD207E7FD768DBF, fd682; +fma.rn.f64 fd687, fd638, 0dBFEEB42A9BCD5057, fd683; +fma.rn.f64 fd688, fd639, 0dBFD207E7FD768DBF, fd684; +fma.rn.f64 fd689, fd641, 0dBFC2375F640F44DB, fd685; +fma.rn.f64 fd690, fd644, 0dBFEFAC9E043842EF, fd686; +fma.rn.f64 fd691, fd642, 0dBFC2375F640F44DB, fd687; +fma.rn.f64 fd692, fd643, 0dBFEFAC9E043842EF, fd688; +fma.rn.f64 fd693, fd645, 0d3FEAEB8C8764F0BA, fd689; +fma.rn.f64 fd694, fd648, 0dBFE14CEDF8BB580B, fd690; +fma.rn.f64 fd695, fd646, 0d3FEAEB8C8764F0BA, fd691; +fma.rn.f64 fd696, fd647, 0dBFE14CEDF8BB580B, fd692; +fma.rn.f64 fd697, fd629, 0dBFC2375F640F44DB, fd585; +fma.rn.f64 fd698, fd632, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd699, fd630, 0dBFC2375F640F44DB, fd586; +fma.rn.f64 fd700, fd631, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd701, fd633, 0dBFEEB42A9BCD5057, fd697; +fma.rn.f64 fd702, fd636, 0dBFD207E7FD768DBF, fd698; +fma.rn.f64 fd703, fd634, 0dBFEEB42A9BCD5057, fd699; +fma.rn.f64 fd704, fd635, 0dBFD207E7FD768DBF, fd700; +fma.rn.f64 fd705, fd637, 0d3FDA9628D9C712B6, fd701; +fma.rn.f64 fd706, fd640, 0dBFED1BB48EEE2C13, fd702; +fma.rn.f64 fd707, fd638, 0d3FDA9628D9C712B6, fd703; +fma.rn.f64 
fd708, fd639, 0dBFED1BB48EEE2C13, fd704; +fma.rn.f64 fd709, fd641, 0d3FEAEB8C8764F0BA, fd705; +fma.rn.f64 fd710, fd644, 0d3FE14CEDF8BB580B, fd706; +fma.rn.f64 fd711, fd642, 0d3FEAEB8C8764F0BA, fd707; +fma.rn.f64 fd712, fd643, 0d3FE14CEDF8BB580B, fd708; +fma.rn.f64 fd713, fd645, 0dBFE4F49E7F775887, fd709; +fma.rn.f64 fd714, fd648, 0d3FE82F19BB3A28A1, fd710; +fma.rn.f64 fd715, fd646, 0dBFE4F49E7F775887, fd711; +fma.rn.f64 fd716, fd647, 0d3FE82F19BB3A28A1, fd712; +fma.rn.f64 fd717, fd629, 0dBFE4F49E7F775887, fd585; +fma.rn.f64 fd718, fd632, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd719, fd630, 0dBFE4F49E7F775887, fd586; +fma.rn.f64 fd720, fd631, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd721, fd633, 0dBFC2375F640F44DB, fd717; +fma.rn.f64 fd722, fd636, 0dBFEFAC9E043842EF, fd718; +fma.rn.f64 fd723, fd634, 0dBFC2375F640F44DB, fd719; +fma.rn.f64 fd724, fd635, 0dBFEFAC9E043842EF, fd720; +fma.rn.f64 fd725, fd637, 0d3FEAEB8C8764F0BA, fd721; +fma.rn.f64 fd726, fd640, 0d3FE14CEDF8BB580B, fd722; +fma.rn.f64 fd727, fd638, 0d3FEAEB8C8764F0BA, fd723; +fma.rn.f64 fd728, fd639, 0d3FE14CEDF8BB580B, fd724; +fma.rn.f64 fd729, fd641, 0dBFEEB42A9BCD5057, fd725; +fma.rn.f64 fd730, fd644, 0d3FD207E7FD768DBF, fd726; +fma.rn.f64 fd731, fd642, 0dBFEEB42A9BCD5057, fd727; +fma.rn.f64 fd732, fd643, 0d3FD207E7FD768DBF, fd728; +fma.rn.f64 fd733, fd645, 0d3FDA9628D9C712B6, fd729; +fma.rn.f64 fd734, fd648, 0dBFED1BB48EEE2C13, fd730; +fma.rn.f64 fd735, fd646, 0d3FDA9628D9C712B6, fd731; +fma.rn.f64 fd736, fd647, 0dBFED1BB48EEE2C13, fd732; +fma.rn.f64 fd737, fd629, 0dBFEEB42A9BCD5057, fd585; +fma.rn.f64 fd738, fd632, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd739, fd630, 0dBFEEB42A9BCD5057, fd586; +fma.rn.f64 fd740, fd631, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd741, fd633, 0d3FEAEB8C8764F0BA, fd737; +fma.rn.f64 fd742, fd636, 0dBFE14CEDF8BB580B, fd738; +fma.rn.f64 fd743, fd634, 0d3FEAEB8C8764F0BA, fd739; +fma.rn.f64 fd744, fd635, 0dBFE14CEDF8BB580B, 
fd740; +fma.rn.f64 fd745, fd637, 0dBFE4F49E7F775887, fd741; +fma.rn.f64 fd746, fd640, 0d3FE82F19BB3A28A1, fd742; +fma.rn.f64 fd747, fd638, 0dBFE4F49E7F775887, fd743; +fma.rn.f64 fd748, fd639, 0d3FE82F19BB3A28A1, fd744; +fma.rn.f64 fd749, fd641, 0d3FDA9628D9C712B6, fd745; +fma.rn.f64 fd750, fd644, 0dBFED1BB48EEE2C13, fd746; +fma.rn.f64 fd751, fd642, 0d3FDA9628D9C712B6, fd747; +fma.rn.f64 fd752, fd643, 0dBFED1BB48EEE2C13, fd748; +fma.rn.f64 fd753, fd645, 0dBFC2375F640F44DB, fd749; +fma.rn.f64 fd754, fd648, 0d3FEFAC9E043842EF, fd750; +fma.rn.f64 fd755, fd646, 0dBFC2375F640F44DB, fd751; +fma.rn.f64 fd756, fd647, 0d3FEFAC9E043842EF, fd752; +add.f64 %1, fd656, fd646; +add.f64 %0, fd655, fd645; +add.f64 %3, fd676, fd675; +sub.f64 %2, fd673, fd674; +add.f64 %5, fd696, fd695; +sub.f64 %4, fd693, fd694; +add.f64 %7, fd716, fd715; +sub.f64 %6, fd713, fd714; +add.f64 %9, fd736, fd735; +sub.f64 %8, fd733, fd734; +add.f64 %11, fd756, fd755; +sub.f64 %10, fd753, fd754; +sub.f64 %13, fd755, fd756; +add.f64 %12, fd754, fd753; +sub.f64 %15, fd735, fd736; +add.f64 %14, fd734, fd733; +sub.f64 %17, fd715, fd716; +add.f64 %16, fd714, fd713; +sub.f64 %19, fd695, fd696; +add.f64 %18, fd694, fd693; +sub.f64 %21, fd675, fd676; +add.f64 %20, fd674, fd673; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y): "r"(smem), "l"(lut_dp_11_1331), "l"(lut_dp_11_121), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), 
"d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..b408aa997e3a4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_fwd.hpp.inc @@ -0,0 +1,1681 @@ +#ifndef CUFFTDX_FFT_13_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_13_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<746, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<157>; +.reg .b32 r<1057>; +.reg .f64 fd<145>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %28, %50; +} +{ +add.f16x2 r4, %29, %51; +} +{ +sub.f16x2 r7, %28, %50; +} +{ +sub.f16x2 r10, %29, %51; +} +{ +add.f16x2 r13, %30, %48; +} +{ +add.f16x2 r16, %31, %49; +} +{ +sub.f16x2 r19, %30, %48; +} +{ +sub.f16x2 r22, %31, %49; +} +{ +add.f16x2 r25, %32, %46; +} +{ +add.f16x2 r28, %33, %47; +} +{ +sub.f16x2 r31, %32, %46; +} +{ +sub.f16x2 r34, %33, %47; +} +{ +add.f16x2 r37, %34, %44; +} +{ +add.f16x2 r40, %35, %45; +} +{ +sub.f16x2 r43, %34, %44; +} +{ +sub.f16x2 r46, %35, %45; +} +{ +add.f16x2 r49, %36, %42; +} +{ +add.f16x2 r52, %37, %43; +} +{ +sub.f16x2 r55, %36, %42; +} +{ +sub.f16x2 r58, %37, %43; +} +{ +add.f16x2 r61, %38, %40; +} +{ +add.f16x2 r64, %39, %41; +} +{ +sub.f16x2 r67, %38, %40; +} +{ +sub.f16x2 r70, %39, %41; +} +{ +add.f16x2 r73, %26, r1; +} +{ +add.f16x2 r76, %27, r4; +} +{ +add.f16x2 r79, r73, r13; +} +{ +add.f16x2 r82, r76, r16; +} +{ +add.f16x2 r85, r79, r25; +} +{ +add.f16x2 r88, r82, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} 
+{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 %0, r97, r61; +} +{ +add.f16x2 %1, r100, r64; +} +mov.u32 r900, 0; +cvt.rn.f16.s32 rs1, r900; +mov.b32 r121, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r900; +mov.b32 r133, {rs2, rs2}; +mov.f64 fd127, 0d3FEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs3, fd127; +} +mov.b32 r113, {rs3, rs3}; +{ +mul.f16x2 r111, r1, r113; +} +{ +add.f16x2 r114, %26, r111; +} +mov.f64 fd4, 0dBFDDBE064267C47C; +{ +cvt.rn.f16.f64 rs4, fd4; +} +mov.b32 r119, {rs4, rs4}; +{ +mul.f16x2 r117, r10, r119; +} +{ +add.f16x2 r120, r121, r117; +} +{ +cvt.rn.f16.f64 rs5, fd127; +} +mov.b32 r125, {rs5, rs5}; +{ +mul.f16x2 r123, r4, r125; +} +{ +add.f16x2 r126, %27, r123; +} +{ +cvt.rn.f16.f64 rs6, fd4; +} +mov.b32 r131, {rs6, rs6}; +{ +mul.f16x2 r129, r7, r131; +} +{ +add.f16x2 r132, r133, r129; +} +mov.f64 fd135, 0d3FE22D961EA71119; +{ +cvt.rn.f16.f64 rs7, fd135; +} +mov.b32 r137, {rs7, rs7}; +{ +mul.f16x2 r135, r13, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd108, 0dBFEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs8, fd108; +} +mov.b32 r143, {rs8, rs8}; +{ +mul.f16x2 r141, r22, r143; +} +{ +add.f16x2 r144, r120, r141; +} +{ +cvt.rn.f16.f64 rs9, fd135; +} +mov.b32 r149, {rs9, rs9}; +{ +mul.f16x2 r147, r16, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs10, fd108; +} +mov.b32 r155, {rs10, rs10}; +{ +mul.f16x2 r153, r19, r155; +} +{ +add.f16x2 r156, r132, r153; +} +mov.f64 fd143, 0d3FBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs11, fd143; +} +mov.b32 r161, {rs11, rs11}; +{ +mul.f16x2 r159, r25, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd88, 0dBFEFC44566966769; +{ +cvt.rn.f16.f64 rs12, fd88; +} +mov.b32 r167, {rs12, rs12}; +{ +mul.f16x2 r165, r34, r167; +} +{ +add.f16x2 r168, r144, r165; +} +{ +cvt.rn.f16.f64 rs13, fd143; +} +mov.b32 r173, {rs13, rs13}; +{ +mul.f16x2 r171, r28, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs14, fd88; +} +mov.b32 r179, {rs14, rs14}; +{ +mul.f16x2 r177, r31, r179; +} +{ 
+add.f16x2 r180, r156, r177; +} +mov.f64 fd139, 0dBFD6B1D8B2365DA1; +{ +cvt.rn.f16.f64 rs15, fd139; +} +mov.b32 r185, {rs15, rs15}; +{ +mul.f16x2 r183, r37, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd140, 0dBFEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs16, fd140; +} +mov.b32 r191, {rs16, rs16}; +{ +mul.f16x2 r189, r46, r191; +} +{ +add.f16x2 r192, r168, r189; +} +{ +cvt.rn.f16.f64 rs17, fd139; +} +mov.b32 r197, {rs17, rs17}; +{ +mul.f16x2 r195, r40, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs18, fd140; +} +mov.b32 r203, {rs18, rs18}; +{ +mul.f16x2 r201, r43, r203; +} +{ +add.f16x2 r204, r180, r201; +} +mov.f64 fd131, 0dBFE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs19, fd131; +} +mov.b32 r209, {rs19, rs19}; +{ +mul.f16x2 r207, r49, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd132, 0dBFE5384D024C2F84; +{ +cvt.rn.f16.f64 rs20, fd132; +} +mov.b32 r215, {rs20, rs20}; +{ +mul.f16x2 r213, r58, r215; +} +{ +add.f16x2 r216, r192, r213; +} +{ +cvt.rn.f16.f64 rs21, fd131; +} +mov.b32 r221, {rs21, rs21}; +{ +mul.f16x2 r219, r52, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs22, fd132; +} +mov.b32 r227, {rs22, rs22}; +{ +mul.f16x2 r225, r55, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd123, 0dBFEF11F493053D00; +{ +cvt.rn.f16.f64 rs23, fd123; +} +mov.b32 r233, {rs23, rs23}; +{ +mul.f16x2 r231, r61, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd124, 0dBFCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs24, fd124; +} +mov.b32 r239, {rs24, rs24}; +{ +mul.f16x2 r237, r70, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs25, fd123; +} +mov.b32 r245, {rs25, rs25}; +{ +mul.f16x2 r243, r64, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs26, fd124; +} +mov.b32 r251, {rs26, rs26}; +{ +mul.f16x2 r249, r67, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +sub.f16x2 %2, r234, r240; +} +{ +add.f16x2 %3, r246, r252; +} +{ +add.f16x2 %24, r234, r240; +} +{ +sub.f16x2 %25, r246, r252; +} +cvt.rn.f16.s32 rs27, 
r900; +mov.b32 r279, {rs27, rs27}; +cvt.rn.f16.s32 rs28, r900; +mov.b32 r291, {rs28, rs28}; +{ +cvt.rn.f16.f64 rs29, fd135; +} +mov.b32 r271, {rs29, rs29}; +{ +mul.f16x2 r269, r1, r271; +} +{ +add.f16x2 r272, %26, r269; +} +{ +cvt.rn.f16.f64 rs30, fd108; +} +mov.b32 r277, {rs30, rs30}; +{ +mul.f16x2 r275, r10, r277; +} +{ +add.f16x2 r278, r279, r275; +} +{ +cvt.rn.f16.f64 rs31, fd135; +} +mov.b32 r283, {rs31, rs31}; +{ +mul.f16x2 r281, r4, r283; +} +{ +add.f16x2 r284, %27, r281; +} +{ +cvt.rn.f16.f64 rs32, fd108; +} +mov.b32 r289, {rs32, rs32}; +{ +mul.f16x2 r287, r7, r289; +} +{ +add.f16x2 r290, r291, r287; +} +{ +cvt.rn.f16.f64 rs33, fd139; +} +mov.b32 r295, {rs33, rs33}; +{ +mul.f16x2 r293, r13, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs34, fd140; +} +mov.b32 r301, {rs34, rs34}; +{ +mul.f16x2 r299, r22, r301; +} +{ +add.f16x2 r302, r278, r299; +} +{ +cvt.rn.f16.f64 rs35, fd139; +} +mov.b32 r307, {rs35, rs35}; +{ +mul.f16x2 r305, r16, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs36, fd140; +} +mov.b32 r313, {rs36, rs36}; +{ +mul.f16x2 r311, r19, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs37, fd123; +} +mov.b32 r319, {rs37, rs37}; +{ +mul.f16x2 r317, r25, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs38, fd124; +} +mov.b32 r325, {rs38, rs38}; +{ +mul.f16x2 r323, r34, r325; +} +{ +add.f16x2 r326, r302, r323; +} +{ +cvt.rn.f16.f64 rs39, fd123; +} +mov.b32 r331, {rs39, rs39}; +{ +mul.f16x2 r329, r28, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs40, fd124; +} +mov.b32 r337, {rs40, rs40}; +{ +mul.f16x2 r335, r31, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs41, fd131; +} +mov.b32 r343, {rs41, rs41}; +{ +mul.f16x2 r341, r37, r343; +} +{ +add.f16x2 r344, r320, r341; +} +mov.f64 fd80, 0d3FE5384D024C2F84; +{ +cvt.rn.f16.f64 rs42, fd80; +} +mov.b32 r349, {rs42, rs42}; +{ +mul.f16x2 r347, r46, r349; +} +{ +add.f16x2 r350, r326, r347; +} +{ 
+cvt.rn.f16.f64 rs43, fd131; +} +mov.b32 r355, {rs43, rs43}; +{ +mul.f16x2 r353, r40, r355; +} +{ +add.f16x2 r356, r332, r353; +} +{ +cvt.rn.f16.f64 rs44, fd80; +} +mov.b32 r361, {rs44, rs44}; +{ +mul.f16x2 r359, r43, r361; +} +{ +add.f16x2 r362, r338, r359; +} +{ +cvt.rn.f16.f64 rs45, fd143; +} +mov.b32 r367, {rs45, rs45}; +{ +mul.f16x2 r365, r49, r367; +} +{ +add.f16x2 r368, r344, r365; +} +mov.f64 fd144, 0d3FEFC44566966769; +{ +cvt.rn.f16.f64 rs46, fd144; +} +mov.b32 r373, {rs46, rs46}; +{ +mul.f16x2 r371, r58, r373; +} +{ +add.f16x2 r374, r350, r371; +} +{ +cvt.rn.f16.f64 rs47, fd143; +} +mov.b32 r379, {rs47, rs47}; +{ +mul.f16x2 r377, r52, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs48, fd144; +} +mov.b32 r385, {rs48, rs48}; +{ +mul.f16x2 r383, r55, r385; +} +{ +add.f16x2 r386, r362, r383; +} +{ +cvt.rn.f16.f64 rs49, fd127; +} +mov.b32 r391, {rs49, rs49}; +{ +mul.f16x2 r389, r61, r391; +} +{ +add.f16x2 r392, r368, r389; +} +mov.f64 fd128, 0d3FDDBE064267C47C; +{ +cvt.rn.f16.f64 rs50, fd128; +} +mov.b32 r397, {rs50, rs50}; +{ +mul.f16x2 r395, r70, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs51, fd127; +} +mov.b32 r403, {rs51, rs51}; +{ +mul.f16x2 r401, r64, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs52, fd128; +} +mov.b32 r409, {rs52, rs52}; +{ +mul.f16x2 r407, r67, r409; +} +{ +add.f16x2 r410, r386, r407; +} +{ +sub.f16x2 %4, r392, r398; +} +{ +add.f16x2 %5, r404, r410; +} +{ +add.f16x2 %22, r392, r398; +} +{ +sub.f16x2 %23, r404, r410; +} +cvt.rn.f16.s32 rs53, r900; +mov.b32 r437, {rs53, rs53}; +cvt.rn.f16.s32 rs54, r900; +mov.b32 r449, {rs54, rs54}; +{ +cvt.rn.f16.f64 rs55, fd143; +} +mov.b32 r429, {rs55, rs55}; +{ +mul.f16x2 r427, r1, r429; +} +{ +add.f16x2 r430, %26, r427; +} +{ +cvt.rn.f16.f64 rs56, fd88; +} +mov.b32 r435, {rs56, rs56}; +{ +mul.f16x2 r433, r10, r435; +} +{ +add.f16x2 r436, r437, r433; +} +{ +cvt.rn.f16.f64 rs57, fd143; +} +mov.b32 r441, {rs57, rs57}; +{ +mul.f16x2 r439, r4, 
r441; +} +{ +add.f16x2 r442, %27, r439; +} +{ +cvt.rn.f16.f64 rs58, fd88; +} +mov.b32 r447, {rs58, rs58}; +{ +mul.f16x2 r445, r7, r447; +} +{ +add.f16x2 r448, r449, r445; +} +{ +cvt.rn.f16.f64 rs59, fd123; +} +mov.b32 r453, {rs59, rs59}; +{ +mul.f16x2 r451, r13, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs60, fd124; +} +mov.b32 r459, {rs60, rs60}; +{ +mul.f16x2 r457, r22, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs61, fd123; +} +mov.b32 r465, {rs61, rs61}; +{ +mul.f16x2 r463, r16, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs62, fd124; +} +mov.b32 r471, {rs62, rs62}; +{ +mul.f16x2 r469, r19, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs63, fd139; +} +mov.b32 r477, {rs63, rs63}; +{ +mul.f16x2 r475, r25, r477; +} +{ +add.f16x2 r478, r454, r475; +} +mov.f64 fd60, 0d3FEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs64, fd60; +} +mov.b32 r483, {rs64, rs64}; +{ +mul.f16x2 r481, r34, r483; +} +{ +add.f16x2 r484, r460, r481; +} +{ +cvt.rn.f16.f64 rs65, fd139; +} +mov.b32 r489, {rs65, rs65}; +{ +mul.f16x2 r487, r28, r489; +} +{ +add.f16x2 r490, r466, r487; +} +{ +cvt.rn.f16.f64 rs66, fd60; +} +mov.b32 r495, {rs66, rs66}; +{ +mul.f16x2 r493, r31, r495; +} +{ +add.f16x2 r496, r472, r493; +} +{ +cvt.rn.f16.f64 rs67, fd127; +} +mov.b32 r501, {rs67, rs67}; +{ +mul.f16x2 r499, r37, r501; +} +{ +add.f16x2 r502, r478, r499; +} +{ +cvt.rn.f16.f64 rs68, fd128; +} +mov.b32 r507, {rs68, rs68}; +{ +mul.f16x2 r505, r46, r507; +} +{ +add.f16x2 r508, r484, r505; +} +{ +cvt.rn.f16.f64 rs69, fd127; +} +mov.b32 r513, {rs69, rs69}; +{ +mul.f16x2 r511, r40, r513; +} +{ +add.f16x2 r514, r490, r511; +} +{ +cvt.rn.f16.f64 rs70, fd128; +} +mov.b32 r519, {rs70, rs70}; +{ +mul.f16x2 r517, r43, r519; +} +{ +add.f16x2 r520, r496, r517; +} +{ +cvt.rn.f16.f64 rs71, fd135; +} +mov.b32 r525, {rs71, rs71}; +{ +mul.f16x2 r523, r49, r525; +} +{ +add.f16x2 r526, r502, r523; +} +{ +cvt.rn.f16.f64 rs72, fd108; +} +mov.b32 r531, {rs72, rs72}; 
+{ +mul.f16x2 r529, r58, r531; +} +{ +add.f16x2 r532, r508, r529; +} +{ +cvt.rn.f16.f64 rs73, fd135; +} +mov.b32 r537, {rs73, rs73}; +{ +mul.f16x2 r535, r52, r537; +} +{ +add.f16x2 r538, r514, r535; +} +{ +cvt.rn.f16.f64 rs74, fd108; +} +mov.b32 r543, {rs74, rs74}; +{ +mul.f16x2 r541, r55, r543; +} +{ +add.f16x2 r544, r520, r541; +} +{ +cvt.rn.f16.f64 rs75, fd131; +} +mov.b32 r549, {rs75, rs75}; +{ +mul.f16x2 r547, r61, r549; +} +{ +add.f16x2 r550, r526, r547; +} +{ +cvt.rn.f16.f64 rs76, fd132; +} +mov.b32 r555, {rs76, rs76}; +{ +mul.f16x2 r553, r70, r555; +} +{ +add.f16x2 r556, r532, r553; +} +{ +cvt.rn.f16.f64 rs77, fd131; +} +mov.b32 r561, {rs77, rs77}; +{ +mul.f16x2 r559, r64, r561; +} +{ +add.f16x2 r562, r538, r559; +} +{ +cvt.rn.f16.f64 rs78, fd132; +} +mov.b32 r567, {rs78, rs78}; +{ +mul.f16x2 r565, r67, r567; +} +{ +add.f16x2 r568, r544, r565; +} +{ +sub.f16x2 %6, r550, r556; +} +{ +add.f16x2 %7, r562, r568; +} +{ +add.f16x2 %20, r550, r556; +} +{ +sub.f16x2 %21, r562, r568; +} +cvt.rn.f16.s32 rs79, r900; +mov.b32 r595, {rs79, rs79}; +cvt.rn.f16.s32 rs80, r900; +mov.b32 r607, {rs80, rs80}; +{ +cvt.rn.f16.f64 rs81, fd139; +} +mov.b32 r587, {rs81, rs81}; +{ +mul.f16x2 r585, r1, r587; +} +{ +add.f16x2 r588, %26, r585; +} +{ +cvt.rn.f16.f64 rs82, fd140; +} +mov.b32 r593, {rs82, rs82}; +{ +mul.f16x2 r591, r10, r593; +} +{ +add.f16x2 r594, r595, r591; +} +{ +cvt.rn.f16.f64 rs83, fd139; +} +mov.b32 r599, {rs83, rs83}; +{ +mul.f16x2 r597, r4, r599; +} +{ +add.f16x2 r600, %27, r597; +} +{ +cvt.rn.f16.f64 rs84, fd140; +} +mov.b32 r605, {rs84, rs84}; +{ +mul.f16x2 r603, r7, r605; +} +{ +add.f16x2 r606, r607, r603; +} +{ +cvt.rn.f16.f64 rs85, fd131; +} +mov.b32 r611, {rs85, rs85}; +{ +mul.f16x2 r609, r13, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +cvt.rn.f16.f64 rs86, fd80; +} +mov.b32 r617, {rs86, rs86}; +{ +mul.f16x2 r615, r22, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs87, fd131; +} +mov.b32 r623, {rs87, rs87}; +{ +mul.f16x2 r621, r16, 
r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ +cvt.rn.f16.f64 rs88, fd80; +} +mov.b32 r629, {rs88, rs88}; +{ +mul.f16x2 r627, r19, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +cvt.rn.f16.f64 rs89, fd127; +} +mov.b32 r635, {rs89, rs89}; +{ +mul.f16x2 r633, r25, r635; +} +{ +add.f16x2 r636, r612, r633; +} +{ +cvt.rn.f16.f64 rs90, fd128; +} +mov.b32 r641, {rs90, rs90}; +{ +mul.f16x2 r639, r34, r641; +} +{ +add.f16x2 r642, r618, r639; +} +{ +cvt.rn.f16.f64 rs91, fd127; +} +mov.b32 r647, {rs91, rs91}; +{ +mul.f16x2 r645, r28, r647; +} +{ +add.f16x2 r648, r624, r645; +} +{ +cvt.rn.f16.f64 rs92, fd128; +} +mov.b32 r653, {rs92, rs92}; +{ +mul.f16x2 r651, r31, r653; +} +{ +add.f16x2 r654, r630, r651; +} +{ +cvt.rn.f16.f64 rs93, fd143; +} +mov.b32 r659, {rs93, rs93}; +{ +mul.f16x2 r657, r37, r659; +} +{ +add.f16x2 r660, r636, r657; +} +{ +cvt.rn.f16.f64 rs94, fd88; +} +mov.b32 r665, {rs94, rs94}; +{ +mul.f16x2 r663, r46, r665; +} +{ +add.f16x2 r666, r642, r663; +} +{ +cvt.rn.f16.f64 rs95, fd143; +} +mov.b32 r671, {rs95, rs95}; +{ +mul.f16x2 r669, r40, r671; +} +{ +add.f16x2 r672, r648, r669; +} +{ +cvt.rn.f16.f64 rs96, fd88; +} +mov.b32 r677, {rs96, rs96}; +{ +mul.f16x2 r675, r43, r677; +} +{ +add.f16x2 r678, r654, r675; +} +{ +cvt.rn.f16.f64 rs97, fd123; +} +mov.b32 r683, {rs97, rs97}; +{ +mul.f16x2 r681, r49, r683; +} +{ +add.f16x2 r684, r660, r681; +} +mov.f64 fd112, 0d3FCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs98, fd112; +} +mov.b32 r689, {rs98, rs98}; +{ +mul.f16x2 r687, r58, r689; +} +{ +add.f16x2 r690, r666, r687; +} +{ +cvt.rn.f16.f64 rs99, fd123; +} +mov.b32 r695, {rs99, rs99}; +{ +mul.f16x2 r693, r52, r695; +} +{ +add.f16x2 r696, r672, r693; +} +{ +cvt.rn.f16.f64 rs100, fd112; +} +mov.b32 r701, {rs100, rs100}; +{ +mul.f16x2 r699, r55, r701; +} +{ +add.f16x2 r702, r678, r699; +} +{ +cvt.rn.f16.f64 rs101, fd135; +} +mov.b32 r707, {rs101, rs101}; +{ +mul.f16x2 r705, r61, r707; +} +{ +add.f16x2 r708, r684, r705; +} +mov.f64 fd136, 0d3FEA55E242A4C3D2; +{ +cvt.rn.f16.f64 
rs102, fd136; +} +mov.b32 r713, {rs102, rs102}; +{ +mul.f16x2 r711, r70, r713; +} +{ +add.f16x2 r714, r690, r711; +} +{ +cvt.rn.f16.f64 rs103, fd135; +} +mov.b32 r719, {rs103, rs103}; +{ +mul.f16x2 r717, r64, r719; +} +{ +add.f16x2 r720, r696, r717; +} +{ +cvt.rn.f16.f64 rs104, fd136; +} +mov.b32 r725, {rs104, rs104}; +{ +mul.f16x2 r723, r67, r725; +} +{ +add.f16x2 r726, r702, r723; +} +{ +sub.f16x2 %8, r708, r714; +} +{ +add.f16x2 %9, r720, r726; +} +{ +add.f16x2 %18, r708, r714; +} +{ +sub.f16x2 %19, r720, r726; +} +cvt.rn.f16.s32 rs105, r900; +mov.b32 r753, {rs105, rs105}; +cvt.rn.f16.s32 rs106, r900; +mov.b32 r765, {rs106, rs106}; +{ +cvt.rn.f16.f64 rs107, fd131; +} +mov.b32 r745, {rs107, rs107}; +{ +mul.f16x2 r743, r1, r745; +} +{ +add.f16x2 r746, %26, r743; +} +{ +cvt.rn.f16.f64 rs108, fd132; +} +mov.b32 r751, {rs108, rs108}; +{ +mul.f16x2 r749, r10, r751; +} +{ +add.f16x2 r752, r753, r749; +} +{ +cvt.rn.f16.f64 rs109, fd131; +} +mov.b32 r757, {rs109, rs109}; +{ +mul.f16x2 r755, r4, r757; +} +{ +add.f16x2 r758, %27, r755; +} +{ +cvt.rn.f16.f64 rs110, fd132; +} +mov.b32 r763, {rs110, rs110}; +{ +mul.f16x2 r761, r7, r763; +} +{ +add.f16x2 r764, r765, r761; +} +{ +cvt.rn.f16.f64 rs111, fd143; +} +mov.b32 r769, {rs111, rs111}; +{ +mul.f16x2 r767, r13, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs112, fd144; +} +mov.b32 r775, {rs112, rs112}; +{ +mul.f16x2 r773, r22, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs113, fd143; +} +mov.b32 r781, {rs113, rs113}; +{ +mul.f16x2 r779, r16, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs114, fd144; +} +mov.b32 r787, {rs114, rs114}; +{ +mul.f16x2 r785, r19, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs115, fd135; +} +mov.b32 r793, {rs115, rs115}; +{ +mul.f16x2 r791, r25, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs116, fd108; +} +mov.b32 r799, {rs116, rs116}; +{ +mul.f16x2 r797, r34, r799; +} +{ +add.f16x2 r800, r776, r797; +} 
+{ +cvt.rn.f16.f64 rs117, fd135; +} +mov.b32 r805, {rs117, rs117}; +{ +mul.f16x2 r803, r28, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs118, fd108; +} +mov.b32 r811, {rs118, rs118}; +{ +mul.f16x2 r809, r31, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs119, fd123; +} +mov.b32 r817, {rs119, rs119}; +{ +mul.f16x2 r815, r37, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs120, fd112; +} +mov.b32 r823, {rs120, rs120}; +{ +mul.f16x2 r821, r46, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs121, fd123; +} +mov.b32 r829, {rs121, rs121}; +{ +mul.f16x2 r827, r40, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs122, fd112; +} +mov.b32 r835, {rs122, rs122}; +{ +mul.f16x2 r833, r43, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs123, fd127; +} +mov.b32 r841, {rs123, rs123}; +{ +mul.f16x2 r839, r49, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs124, fd128; +} +mov.b32 r847, {rs124, rs124}; +{ +mul.f16x2 r845, r58, r847; +} +{ +add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs125, fd127; +} +mov.b32 r853, {rs125, rs125}; +{ +mul.f16x2 r851, r52, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs126, fd128; +} +mov.b32 r859, {rs126, rs126}; +{ +mul.f16x2 r857, r55, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs127, fd139; +} +mov.b32 r865, {rs127, rs127}; +{ +mul.f16x2 r863, r61, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs128, fd140; +} +mov.b32 r871, {rs128, rs128}; +{ +mul.f16x2 r869, r70, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs129, fd139; +} +mov.b32 r877, {rs129, rs129}; +{ +mul.f16x2 r875, r64, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs130, fd140; +} +mov.b32 r883, {rs130, rs130}; +{ +mul.f16x2 r881, r67, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +sub.f16x2 %10, r866, r872; +} +{ +add.f16x2 %11, r878, r884; +} +{ +add.f16x2 %16, r866, 
r872; +} +{ +sub.f16x2 %17, r878, r884; +} +cvt.rn.f16.s32 rs131, r900; +mov.b32 r911, {rs131, rs131}; +cvt.rn.f16.s32 rs132, r900; +mov.b32 r923, {rs132, rs132}; +{ +cvt.rn.f16.f64 rs133, fd123; +} +mov.b32 r903, {rs133, rs133}; +{ +mul.f16x2 r901, r1, r903; +} +{ +add.f16x2 r904, %26, r901; +} +{ +cvt.rn.f16.f64 rs134, fd124; +} +mov.b32 r909, {rs134, rs134}; +{ +mul.f16x2 r907, r10, r909; +} +{ +add.f16x2 r910, r911, r907; +} +{ +cvt.rn.f16.f64 rs135, fd123; +} +mov.b32 r915, {rs135, rs135}; +{ +mul.f16x2 r913, r4, r915; +} +{ +add.f16x2 r916, %27, r913; +} +{ +cvt.rn.f16.f64 rs136, fd124; +} +mov.b32 r921, {rs136, rs136}; +{ +mul.f16x2 r919, r7, r921; +} +{ +add.f16x2 r922, r923, r919; +} +{ +cvt.rn.f16.f64 rs137, fd127; +} +mov.b32 r927, {rs137, rs137}; +{ +mul.f16x2 r925, r13, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs138, fd128; +} +mov.b32 r933, {rs138, rs138}; +{ +mul.f16x2 r931, r22, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs139, fd127; +} +mov.b32 r939, {rs139, rs139}; +{ +mul.f16x2 r937, r16, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs140, fd128; +} +mov.b32 r945, {rs140, rs140}; +{ +mul.f16x2 r943, r19, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs141, fd131; +} +mov.b32 r951, {rs141, rs141}; +{ +mul.f16x2 r949, r25, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs142, fd132; +} +mov.b32 r957, {rs142, rs142}; +{ +mul.f16x2 r955, r34, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs143, fd131; +} +mov.b32 r963, {rs143, rs143}; +{ +mul.f16x2 r961, r28, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs144, fd132; +} +mov.b32 r969, {rs144, rs144}; +{ +mul.f16x2 r967, r31, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs145, fd135; +} +mov.b32 r975, {rs145, rs145}; +{ +mul.f16x2 r973, r37, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs146, fd136; +} +mov.b32 r981, {rs146, rs146}; +{ 
+mul.f16x2 r979, r46, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs147, fd135; +} +mov.b32 r987, {rs147, rs147}; +{ +mul.f16x2 r985, r40, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs148, fd136; +} +mov.b32 r993, {rs148, rs148}; +{ +mul.f16x2 r991, r43, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs149, fd139; +} +mov.b32 r999, {rs149, rs149}; +{ +mul.f16x2 r997, r49, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs150, fd140; +} +mov.b32 r1005, {rs150, rs150}; +{ +mul.f16x2 r1003, r58, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs151, fd139; +} +mov.b32 r1011, {rs151, rs151}; +{ +mul.f16x2 r1009, r52, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs152, fd140; +} +mov.b32 r1017, {rs152, rs152}; +{ +mul.f16x2 r1015, r55, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs153, fd143; +} +mov.b32 r1023, {rs153, rs153}; +{ +mul.f16x2 r1021, r61, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs154, fd144; +} +mov.b32 r1029, {rs154, rs154}; +{ +mul.f16x2 r1027, r70, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 rs155, fd143; +} +mov.b32 r1035, {rs155, rs155}; +{ +mul.f16x2 r1033, r64, r1035; +} +{ +add.f16x2 r1036, r1012, r1033; +} +{ +cvt.rn.f16.f64 rs156, fd144; +} +mov.b32 r1041, {rs156, rs156}; +{ +mul.f16x2 r1039, r67, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +sub.f16x2 %12, r1024, r1030; +} +{ +add.f16x2 %13, r1036, r1042; +} +{ +add.f16x2 %14, r1024, r1030; +} +{ +sub.f16x2 %15, r1036, r1042; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..81ac3f0b9c35c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp16_inv.hpp.inc @@ -0,0 +1,1681 @@ +#ifndef CUFFTDX_FFT_13_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_13_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<948, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<157>; +.reg .b32 r<1057>; +.reg .f64 fd<145>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %28, %50; +} +{ +add.f16x2 r4, %29, %51; +} +{ +sub.f16x2 r7, %28, %50; +} +{ +sub.f16x2 r10, %29, %51; +} +{ +add.f16x2 r13, %30, %48; +} +{ +add.f16x2 r16, %31, %49; +} +{ +sub.f16x2 r19, %30, %48; +} +{ +sub.f16x2 r22, %31, %49; +} +{ +add.f16x2 r25, %32, %46; +} +{ +add.f16x2 r28, %33, %47; +} +{ +sub.f16x2 r31, %32, %46; +} +{ +sub.f16x2 r34, %33, %47; +} +{ +add.f16x2 r37, %34, %44; +} +{ +add.f16x2 r40, %35, %45; +} +{ +sub.f16x2 r43, %34, %44; +} +{ +sub.f16x2 r46, %35, %45; +} +{ +add.f16x2 r49, %36, %42; +} +{ +add.f16x2 r52, %37, %43; +} +{ +sub.f16x2 r55, %36, %42; +} +{ +sub.f16x2 r58, %37, %43; +} +{ +add.f16x2 r61, %38, %40; +} +{ +add.f16x2 r64, %39, %41; +} +{ +sub.f16x2 r67, %38, %40; +} +{ +sub.f16x2 r70, %39, %41; +} +{ +add.f16x2 r73, %26, r1; +} +{ +add.f16x2 r76, %27, r4; +} +{ +add.f16x2 r79, r73, r13; +} +{ +add.f16x2 r82, r76, r16; +} +{ +add.f16x2 r85, r79, r25; +} +{ +add.f16x2 r88, r82, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 %0, r97, r61; +} +{ +add.f16x2 %1, r100, r64; +} +mov.u32 r900, 0; +cvt.rn.f16.s32 rs1, r900; +mov.b32 r121, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r900; +mov.b32 r133, {rs2, rs2}; +mov.f64 fd127, 0d3FEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs3, fd127; +} +mov.b32 r113, {rs3, rs3}; +{ +mul.f16x2 r111, r1, r113; +} +{ +add.f16x2 r114, %26, r111; +} +mov.f64 fd4, 0d3FDDBE064267C47C; +{ +cvt.rn.f16.f64 rs4, fd4; +} +mov.b32 r119, {rs4, rs4}; +{ +mul.f16x2 r117, r10, r119; +} +{ +add.f16x2 r120, r121, r117; +} +{ +cvt.rn.f16.f64 rs5, fd127; +} +mov.b32 r125, {rs5, rs5}; +{ +mul.f16x2 r123, r4, r125; +} +{ +add.f16x2 r126, %27, r123; +} +{ +cvt.rn.f16.f64 rs6, fd4; +} +mov.b32 r131, {rs6, rs6}; +{ +mul.f16x2 r129, r7, r131; +} +{ +add.f16x2 r132, 
r133, r129; +} +mov.f64 fd135, 0d3FE22D961EA71119; +{ +cvt.rn.f16.f64 rs7, fd135; +} +mov.b32 r137, {rs7, rs7}; +{ +mul.f16x2 r135, r13, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd108, 0d3FEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs8, fd108; +} +mov.b32 r143, {rs8, rs8}; +{ +mul.f16x2 r141, r22, r143; +} +{ +add.f16x2 r144, r120, r141; +} +{ +cvt.rn.f16.f64 rs9, fd135; +} +mov.b32 r149, {rs9, rs9}; +{ +mul.f16x2 r147, r16, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs10, fd108; +} +mov.b32 r155, {rs10, rs10}; +{ +mul.f16x2 r153, r19, r155; +} +{ +add.f16x2 r156, r132, r153; +} +mov.f64 fd143, 0d3FBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs11, fd143; +} +mov.b32 r161, {rs11, rs11}; +{ +mul.f16x2 r159, r25, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd88, 0d3FEFC44566966769; +{ +cvt.rn.f16.f64 rs12, fd88; +} +mov.b32 r167, {rs12, rs12}; +{ +mul.f16x2 r165, r34, r167; +} +{ +add.f16x2 r168, r144, r165; +} +{ +cvt.rn.f16.f64 rs13, fd143; +} +mov.b32 r173, {rs13, rs13}; +{ +mul.f16x2 r171, r28, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs14, fd88; +} +mov.b32 r179, {rs14, rs14}; +{ +mul.f16x2 r177, r31, r179; +} +{ +add.f16x2 r180, r156, r177; +} +mov.f64 fd139, 0dBFD6B1D8B2365DA1; +{ +cvt.rn.f16.f64 rs15, fd139; +} +mov.b32 r185, {rs15, rs15}; +{ +mul.f16x2 r183, r37, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd140, 0d3FEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs16, fd140; +} +mov.b32 r191, {rs16, rs16}; +{ +mul.f16x2 r189, r46, r191; +} +{ +add.f16x2 r192, r168, r189; +} +{ +cvt.rn.f16.f64 rs17, fd139; +} +mov.b32 r197, {rs17, rs17}; +{ +mul.f16x2 r195, r40, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs18, fd140; +} +mov.b32 r203, {rs18, rs18}; +{ +mul.f16x2 r201, r43, r203; +} +{ +add.f16x2 r204, r180, r201; +} +mov.f64 fd131, 0dBFE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs19, fd131; +} +mov.b32 r209, {rs19, rs19}; +{ +mul.f16x2 r207, r49, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd132, 
0d3FE5384D024C2F84; +{ +cvt.rn.f16.f64 rs20, fd132; +} +mov.b32 r215, {rs20, rs20}; +{ +mul.f16x2 r213, r58, r215; +} +{ +add.f16x2 r216, r192, r213; +} +{ +cvt.rn.f16.f64 rs21, fd131; +} +mov.b32 r221, {rs21, rs21}; +{ +mul.f16x2 r219, r52, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs22, fd132; +} +mov.b32 r227, {rs22, rs22}; +{ +mul.f16x2 r225, r55, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd123, 0dBFEF11F493053D00; +{ +cvt.rn.f16.f64 rs23, fd123; +} +mov.b32 r233, {rs23, rs23}; +{ +mul.f16x2 r231, r61, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd124, 0d3FCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs24, fd124; +} +mov.b32 r239, {rs24, rs24}; +{ +mul.f16x2 r237, r70, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs25, fd123; +} +mov.b32 r245, {rs25, rs25}; +{ +mul.f16x2 r243, r64, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs26, fd124; +} +mov.b32 r251, {rs26, rs26}; +{ +mul.f16x2 r249, r67, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +sub.f16x2 %2, r234, r240; +} +{ +add.f16x2 %3, r246, r252; +} +{ +add.f16x2 %24, r234, r240; +} +{ +sub.f16x2 %25, r246, r252; +} +cvt.rn.f16.s32 rs27, r900; +mov.b32 r279, {rs27, rs27}; +cvt.rn.f16.s32 rs28, r900; +mov.b32 r291, {rs28, rs28}; +{ +cvt.rn.f16.f64 rs29, fd135; +} +mov.b32 r271, {rs29, rs29}; +{ +mul.f16x2 r269, r1, r271; +} +{ +add.f16x2 r272, %26, r269; +} +{ +cvt.rn.f16.f64 rs30, fd108; +} +mov.b32 r277, {rs30, rs30}; +{ +mul.f16x2 r275, r10, r277; +} +{ +add.f16x2 r278, r279, r275; +} +{ +cvt.rn.f16.f64 rs31, fd135; +} +mov.b32 r283, {rs31, rs31}; +{ +mul.f16x2 r281, r4, r283; +} +{ +add.f16x2 r284, %27, r281; +} +{ +cvt.rn.f16.f64 rs32, fd108; +} +mov.b32 r289, {rs32, rs32}; +{ +mul.f16x2 r287, r7, r289; +} +{ +add.f16x2 r290, r291, r287; +} +{ +cvt.rn.f16.f64 rs33, fd139; +} +mov.b32 r295, {rs33, rs33}; +{ +mul.f16x2 r293, r13, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs34, fd140; +} +mov.b32 r301, {rs34, rs34}; +{ 
+mul.f16x2 r299, r22, r301; +} +{ +add.f16x2 r302, r278, r299; +} +{ +cvt.rn.f16.f64 rs35, fd139; +} +mov.b32 r307, {rs35, rs35}; +{ +mul.f16x2 r305, r16, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs36, fd140; +} +mov.b32 r313, {rs36, rs36}; +{ +mul.f16x2 r311, r19, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs37, fd123; +} +mov.b32 r319, {rs37, rs37}; +{ +mul.f16x2 r317, r25, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs38, fd124; +} +mov.b32 r325, {rs38, rs38}; +{ +mul.f16x2 r323, r34, r325; +} +{ +add.f16x2 r326, r302, r323; +} +{ +cvt.rn.f16.f64 rs39, fd123; +} +mov.b32 r331, {rs39, rs39}; +{ +mul.f16x2 r329, r28, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs40, fd124; +} +mov.b32 r337, {rs40, rs40}; +{ +mul.f16x2 r335, r31, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs41, fd131; +} +mov.b32 r343, {rs41, rs41}; +{ +mul.f16x2 r341, r37, r343; +} +{ +add.f16x2 r344, r320, r341; +} +mov.f64 fd80, 0dBFE5384D024C2F84; +{ +cvt.rn.f16.f64 rs42, fd80; +} +mov.b32 r349, {rs42, rs42}; +{ +mul.f16x2 r347, r46, r349; +} +{ +add.f16x2 r350, r326, r347; +} +{ +cvt.rn.f16.f64 rs43, fd131; +} +mov.b32 r355, {rs43, rs43}; +{ +mul.f16x2 r353, r40, r355; +} +{ +add.f16x2 r356, r332, r353; +} +{ +cvt.rn.f16.f64 rs44, fd80; +} +mov.b32 r361, {rs44, rs44}; +{ +mul.f16x2 r359, r43, r361; +} +{ +add.f16x2 r362, r338, r359; +} +{ +cvt.rn.f16.f64 rs45, fd143; +} +mov.b32 r367, {rs45, rs45}; +{ +mul.f16x2 r365, r49, r367; +} +{ +add.f16x2 r368, r344, r365; +} +mov.f64 fd144, 0dBFEFC44566966769; +{ +cvt.rn.f16.f64 rs46, fd144; +} +mov.b32 r373, {rs46, rs46}; +{ +mul.f16x2 r371, r58, r373; +} +{ +add.f16x2 r374, r350, r371; +} +{ +cvt.rn.f16.f64 rs47, fd143; +} +mov.b32 r379, {rs47, rs47}; +{ +mul.f16x2 r377, r52, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs48, fd144; +} +mov.b32 r385, {rs48, rs48}; +{ +mul.f16x2 r383, r55, r385; +} +{ +add.f16x2 r386, r362, r383; +} +{ 
+cvt.rn.f16.f64 rs49, fd127; +} +mov.b32 r391, {rs49, rs49}; +{ +mul.f16x2 r389, r61, r391; +} +{ +add.f16x2 r392, r368, r389; +} +mov.f64 fd128, 0dBFDDBE064267C47C; +{ +cvt.rn.f16.f64 rs50, fd128; +} +mov.b32 r397, {rs50, rs50}; +{ +mul.f16x2 r395, r70, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs51, fd127; +} +mov.b32 r403, {rs51, rs51}; +{ +mul.f16x2 r401, r64, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs52, fd128; +} +mov.b32 r409, {rs52, rs52}; +{ +mul.f16x2 r407, r67, r409; +} +{ +add.f16x2 r410, r386, r407; +} +{ +sub.f16x2 %4, r392, r398; +} +{ +add.f16x2 %5, r404, r410; +} +{ +add.f16x2 %22, r392, r398; +} +{ +sub.f16x2 %23, r404, r410; +} +cvt.rn.f16.s32 rs53, r900; +mov.b32 r437, {rs53, rs53}; +cvt.rn.f16.s32 rs54, r900; +mov.b32 r449, {rs54, rs54}; +{ +cvt.rn.f16.f64 rs55, fd143; +} +mov.b32 r429, {rs55, rs55}; +{ +mul.f16x2 r427, r1, r429; +} +{ +add.f16x2 r430, %26, r427; +} +{ +cvt.rn.f16.f64 rs56, fd88; +} +mov.b32 r435, {rs56, rs56}; +{ +mul.f16x2 r433, r10, r435; +} +{ +add.f16x2 r436, r437, r433; +} +{ +cvt.rn.f16.f64 rs57, fd143; +} +mov.b32 r441, {rs57, rs57}; +{ +mul.f16x2 r439, r4, r441; +} +{ +add.f16x2 r442, %27, r439; +} +{ +cvt.rn.f16.f64 rs58, fd88; +} +mov.b32 r447, {rs58, rs58}; +{ +mul.f16x2 r445, r7, r447; +} +{ +add.f16x2 r448, r449, r445; +} +{ +cvt.rn.f16.f64 rs59, fd123; +} +mov.b32 r453, {rs59, rs59}; +{ +mul.f16x2 r451, r13, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs60, fd124; +} +mov.b32 r459, {rs60, rs60}; +{ +mul.f16x2 r457, r22, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs61, fd123; +} +mov.b32 r465, {rs61, rs61}; +{ +mul.f16x2 r463, r16, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs62, fd124; +} +mov.b32 r471, {rs62, rs62}; +{ +mul.f16x2 r469, r19, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs63, fd139; +} +mov.b32 r477, {rs63, rs63}; +{ +mul.f16x2 r475, r25, r477; +} +{ +add.f16x2 r478, r454, r475; 
+} +mov.f64 fd60, 0dBFEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs64, fd60; +} +mov.b32 r483, {rs64, rs64}; +{ +mul.f16x2 r481, r34, r483; +} +{ +add.f16x2 r484, r460, r481; +} +{ +cvt.rn.f16.f64 rs65, fd139; +} +mov.b32 r489, {rs65, rs65}; +{ +mul.f16x2 r487, r28, r489; +} +{ +add.f16x2 r490, r466, r487; +} +{ +cvt.rn.f16.f64 rs66, fd60; +} +mov.b32 r495, {rs66, rs66}; +{ +mul.f16x2 r493, r31, r495; +} +{ +add.f16x2 r496, r472, r493; +} +{ +cvt.rn.f16.f64 rs67, fd127; +} +mov.b32 r501, {rs67, rs67}; +{ +mul.f16x2 r499, r37, r501; +} +{ +add.f16x2 r502, r478, r499; +} +{ +cvt.rn.f16.f64 rs68, fd128; +} +mov.b32 r507, {rs68, rs68}; +{ +mul.f16x2 r505, r46, r507; +} +{ +add.f16x2 r508, r484, r505; +} +{ +cvt.rn.f16.f64 rs69, fd127; +} +mov.b32 r513, {rs69, rs69}; +{ +mul.f16x2 r511, r40, r513; +} +{ +add.f16x2 r514, r490, r511; +} +{ +cvt.rn.f16.f64 rs70, fd128; +} +mov.b32 r519, {rs70, rs70}; +{ +mul.f16x2 r517, r43, r519; +} +{ +add.f16x2 r520, r496, r517; +} +{ +cvt.rn.f16.f64 rs71, fd135; +} +mov.b32 r525, {rs71, rs71}; +{ +mul.f16x2 r523, r49, r525; +} +{ +add.f16x2 r526, r502, r523; +} +{ +cvt.rn.f16.f64 rs72, fd108; +} +mov.b32 r531, {rs72, rs72}; +{ +mul.f16x2 r529, r58, r531; +} +{ +add.f16x2 r532, r508, r529; +} +{ +cvt.rn.f16.f64 rs73, fd135; +} +mov.b32 r537, {rs73, rs73}; +{ +mul.f16x2 r535, r52, r537; +} +{ +add.f16x2 r538, r514, r535; +} +{ +cvt.rn.f16.f64 rs74, fd108; +} +mov.b32 r543, {rs74, rs74}; +{ +mul.f16x2 r541, r55, r543; +} +{ +add.f16x2 r544, r520, r541; +} +{ +cvt.rn.f16.f64 rs75, fd131; +} +mov.b32 r549, {rs75, rs75}; +{ +mul.f16x2 r547, r61, r549; +} +{ +add.f16x2 r550, r526, r547; +} +{ +cvt.rn.f16.f64 rs76, fd132; +} +mov.b32 r555, {rs76, rs76}; +{ +mul.f16x2 r553, r70, r555; +} +{ +add.f16x2 r556, r532, r553; +} +{ +cvt.rn.f16.f64 rs77, fd131; +} +mov.b32 r561, {rs77, rs77}; +{ +mul.f16x2 r559, r64, r561; +} +{ +add.f16x2 r562, r538, r559; +} +{ +cvt.rn.f16.f64 rs78, fd132; +} +mov.b32 r567, {rs78, rs78}; +{ +mul.f16x2 r565, r67, r567; +} +{ 
+add.f16x2 r568, r544, r565; +} +{ +sub.f16x2 %6, r550, r556; +} +{ +add.f16x2 %7, r562, r568; +} +{ +add.f16x2 %20, r550, r556; +} +{ +sub.f16x2 %21, r562, r568; +} +cvt.rn.f16.s32 rs79, r900; +mov.b32 r595, {rs79, rs79}; +cvt.rn.f16.s32 rs80, r900; +mov.b32 r607, {rs80, rs80}; +{ +cvt.rn.f16.f64 rs81, fd139; +} +mov.b32 r587, {rs81, rs81}; +{ +mul.f16x2 r585, r1, r587; +} +{ +add.f16x2 r588, %26, r585; +} +{ +cvt.rn.f16.f64 rs82, fd140; +} +mov.b32 r593, {rs82, rs82}; +{ +mul.f16x2 r591, r10, r593; +} +{ +add.f16x2 r594, r595, r591; +} +{ +cvt.rn.f16.f64 rs83, fd139; +} +mov.b32 r599, {rs83, rs83}; +{ +mul.f16x2 r597, r4, r599; +} +{ +add.f16x2 r600, %27, r597; +} +{ +cvt.rn.f16.f64 rs84, fd140; +} +mov.b32 r605, {rs84, rs84}; +{ +mul.f16x2 r603, r7, r605; +} +{ +add.f16x2 r606, r607, r603; +} +{ +cvt.rn.f16.f64 rs85, fd131; +} +mov.b32 r611, {rs85, rs85}; +{ +mul.f16x2 r609, r13, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +cvt.rn.f16.f64 rs86, fd80; +} +mov.b32 r617, {rs86, rs86}; +{ +mul.f16x2 r615, r22, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs87, fd131; +} +mov.b32 r623, {rs87, rs87}; +{ +mul.f16x2 r621, r16, r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ +cvt.rn.f16.f64 rs88, fd80; +} +mov.b32 r629, {rs88, rs88}; +{ +mul.f16x2 r627, r19, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +cvt.rn.f16.f64 rs89, fd127; +} +mov.b32 r635, {rs89, rs89}; +{ +mul.f16x2 r633, r25, r635; +} +{ +add.f16x2 r636, r612, r633; +} +{ +cvt.rn.f16.f64 rs90, fd128; +} +mov.b32 r641, {rs90, rs90}; +{ +mul.f16x2 r639, r34, r641; +} +{ +add.f16x2 r642, r618, r639; +} +{ +cvt.rn.f16.f64 rs91, fd127; +} +mov.b32 r647, {rs91, rs91}; +{ +mul.f16x2 r645, r28, r647; +} +{ +add.f16x2 r648, r624, r645; +} +{ +cvt.rn.f16.f64 rs92, fd128; +} +mov.b32 r653, {rs92, rs92}; +{ +mul.f16x2 r651, r31, r653; +} +{ +add.f16x2 r654, r630, r651; +} +{ +cvt.rn.f16.f64 rs93, fd143; +} +mov.b32 r659, {rs93, rs93}; +{ +mul.f16x2 r657, r37, r659; +} +{ +add.f16x2 r660, r636, r657; 
+} +{ +cvt.rn.f16.f64 rs94, fd88; +} +mov.b32 r665, {rs94, rs94}; +{ +mul.f16x2 r663, r46, r665; +} +{ +add.f16x2 r666, r642, r663; +} +{ +cvt.rn.f16.f64 rs95, fd143; +} +mov.b32 r671, {rs95, rs95}; +{ +mul.f16x2 r669, r40, r671; +} +{ +add.f16x2 r672, r648, r669; +} +{ +cvt.rn.f16.f64 rs96, fd88; +} +mov.b32 r677, {rs96, rs96}; +{ +mul.f16x2 r675, r43, r677; +} +{ +add.f16x2 r678, r654, r675; +} +{ +cvt.rn.f16.f64 rs97, fd123; +} +mov.b32 r683, {rs97, rs97}; +{ +mul.f16x2 r681, r49, r683; +} +{ +add.f16x2 r684, r660, r681; +} +mov.f64 fd112, 0dBFCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs98, fd112; +} +mov.b32 r689, {rs98, rs98}; +{ +mul.f16x2 r687, r58, r689; +} +{ +add.f16x2 r690, r666, r687; +} +{ +cvt.rn.f16.f64 rs99, fd123; +} +mov.b32 r695, {rs99, rs99}; +{ +mul.f16x2 r693, r52, r695; +} +{ +add.f16x2 r696, r672, r693; +} +{ +cvt.rn.f16.f64 rs100, fd112; +} +mov.b32 r701, {rs100, rs100}; +{ +mul.f16x2 r699, r55, r701; +} +{ +add.f16x2 r702, r678, r699; +} +{ +cvt.rn.f16.f64 rs101, fd135; +} +mov.b32 r707, {rs101, rs101}; +{ +mul.f16x2 r705, r61, r707; +} +{ +add.f16x2 r708, r684, r705; +} +mov.f64 fd136, 0dBFEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs102, fd136; +} +mov.b32 r713, {rs102, rs102}; +{ +mul.f16x2 r711, r70, r713; +} +{ +add.f16x2 r714, r690, r711; +} +{ +cvt.rn.f16.f64 rs103, fd135; +} +mov.b32 r719, {rs103, rs103}; +{ +mul.f16x2 r717, r64, r719; +} +{ +add.f16x2 r720, r696, r717; +} +{ +cvt.rn.f16.f64 rs104, fd136; +} +mov.b32 r725, {rs104, rs104}; +{ +mul.f16x2 r723, r67, r725; +} +{ +add.f16x2 r726, r702, r723; +} +{ +sub.f16x2 %8, r708, r714; +} +{ +add.f16x2 %9, r720, r726; +} +{ +add.f16x2 %18, r708, r714; +} +{ +sub.f16x2 %19, r720, r726; +} +cvt.rn.f16.s32 rs105, r900; +mov.b32 r753, {rs105, rs105}; +cvt.rn.f16.s32 rs106, r900; +mov.b32 r765, {rs106, rs106}; +{ +cvt.rn.f16.f64 rs107, fd131; +} +mov.b32 r745, {rs107, rs107}; +{ +mul.f16x2 r743, r1, r745; +} +{ +add.f16x2 r746, %26, r743; +} +{ +cvt.rn.f16.f64 rs108, fd132; +} +mov.b32 r751, {rs108, 
rs108}; +{ +mul.f16x2 r749, r10, r751; +} +{ +add.f16x2 r752, r753, r749; +} +{ +cvt.rn.f16.f64 rs109, fd131; +} +mov.b32 r757, {rs109, rs109}; +{ +mul.f16x2 r755, r4, r757; +} +{ +add.f16x2 r758, %27, r755; +} +{ +cvt.rn.f16.f64 rs110, fd132; +} +mov.b32 r763, {rs110, rs110}; +{ +mul.f16x2 r761, r7, r763; +} +{ +add.f16x2 r764, r765, r761; +} +{ +cvt.rn.f16.f64 rs111, fd143; +} +mov.b32 r769, {rs111, rs111}; +{ +mul.f16x2 r767, r13, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs112, fd144; +} +mov.b32 r775, {rs112, rs112}; +{ +mul.f16x2 r773, r22, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs113, fd143; +} +mov.b32 r781, {rs113, rs113}; +{ +mul.f16x2 r779, r16, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs114, fd144; +} +mov.b32 r787, {rs114, rs114}; +{ +mul.f16x2 r785, r19, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs115, fd135; +} +mov.b32 r793, {rs115, rs115}; +{ +mul.f16x2 r791, r25, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs116, fd108; +} +mov.b32 r799, {rs116, rs116}; +{ +mul.f16x2 r797, r34, r799; +} +{ +add.f16x2 r800, r776, r797; +} +{ +cvt.rn.f16.f64 rs117, fd135; +} +mov.b32 r805, {rs117, rs117}; +{ +mul.f16x2 r803, r28, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs118, fd108; +} +mov.b32 r811, {rs118, rs118}; +{ +mul.f16x2 r809, r31, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs119, fd123; +} +mov.b32 r817, {rs119, rs119}; +{ +mul.f16x2 r815, r37, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs120, fd112; +} +mov.b32 r823, {rs120, rs120}; +{ +mul.f16x2 r821, r46, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs121, fd123; +} +mov.b32 r829, {rs121, rs121}; +{ +mul.f16x2 r827, r40, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs122, fd112; +} +mov.b32 r835, {rs122, rs122}; +{ +mul.f16x2 r833, r43, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 
rs123, fd127; +} +mov.b32 r841, {rs123, rs123}; +{ +mul.f16x2 r839, r49, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs124, fd128; +} +mov.b32 r847, {rs124, rs124}; +{ +mul.f16x2 r845, r58, r847; +} +{ +add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs125, fd127; +} +mov.b32 r853, {rs125, rs125}; +{ +mul.f16x2 r851, r52, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs126, fd128; +} +mov.b32 r859, {rs126, rs126}; +{ +mul.f16x2 r857, r55, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs127, fd139; +} +mov.b32 r865, {rs127, rs127}; +{ +mul.f16x2 r863, r61, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs128, fd140; +} +mov.b32 r871, {rs128, rs128}; +{ +mul.f16x2 r869, r70, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs129, fd139; +} +mov.b32 r877, {rs129, rs129}; +{ +mul.f16x2 r875, r64, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs130, fd140; +} +mov.b32 r883, {rs130, rs130}; +{ +mul.f16x2 r881, r67, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +sub.f16x2 %10, r866, r872; +} +{ +add.f16x2 %11, r878, r884; +} +{ +add.f16x2 %16, r866, r872; +} +{ +sub.f16x2 %17, r878, r884; +} +cvt.rn.f16.s32 rs131, r900; +mov.b32 r911, {rs131, rs131}; +cvt.rn.f16.s32 rs132, r900; +mov.b32 r923, {rs132, rs132}; +{ +cvt.rn.f16.f64 rs133, fd123; +} +mov.b32 r903, {rs133, rs133}; +{ +mul.f16x2 r901, r1, r903; +} +{ +add.f16x2 r904, %26, r901; +} +{ +cvt.rn.f16.f64 rs134, fd124; +} +mov.b32 r909, {rs134, rs134}; +{ +mul.f16x2 r907, r10, r909; +} +{ +add.f16x2 r910, r911, r907; +} +{ +cvt.rn.f16.f64 rs135, fd123; +} +mov.b32 r915, {rs135, rs135}; +{ +mul.f16x2 r913, r4, r915; +} +{ +add.f16x2 r916, %27, r913; +} +{ +cvt.rn.f16.f64 rs136, fd124; +} +mov.b32 r921, {rs136, rs136}; +{ +mul.f16x2 r919, r7, r921; +} +{ +add.f16x2 r922, r923, r919; +} +{ +cvt.rn.f16.f64 rs137, fd127; +} +mov.b32 r927, {rs137, rs137}; +{ +mul.f16x2 r925, r13, r927; +} +{ +add.f16x2 r928, r904, r925; +} 
+{ +cvt.rn.f16.f64 rs138, fd128; +} +mov.b32 r933, {rs138, rs138}; +{ +mul.f16x2 r931, r22, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs139, fd127; +} +mov.b32 r939, {rs139, rs139}; +{ +mul.f16x2 r937, r16, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs140, fd128; +} +mov.b32 r945, {rs140, rs140}; +{ +mul.f16x2 r943, r19, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs141, fd131; +} +mov.b32 r951, {rs141, rs141}; +{ +mul.f16x2 r949, r25, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs142, fd132; +} +mov.b32 r957, {rs142, rs142}; +{ +mul.f16x2 r955, r34, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs143, fd131; +} +mov.b32 r963, {rs143, rs143}; +{ +mul.f16x2 r961, r28, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs144, fd132; +} +mov.b32 r969, {rs144, rs144}; +{ +mul.f16x2 r967, r31, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs145, fd135; +} +mov.b32 r975, {rs145, rs145}; +{ +mul.f16x2 r973, r37, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs146, fd136; +} +mov.b32 r981, {rs146, rs146}; +{ +mul.f16x2 r979, r46, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs147, fd135; +} +mov.b32 r987, {rs147, rs147}; +{ +mul.f16x2 r985, r40, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs148, fd136; +} +mov.b32 r993, {rs148, rs148}; +{ +mul.f16x2 r991, r43, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs149, fd139; +} +mov.b32 r999, {rs149, rs149}; +{ +mul.f16x2 r997, r49, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs150, fd140; +} +mov.b32 r1005, {rs150, rs150}; +{ +mul.f16x2 r1003, r58, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs151, fd139; +} +mov.b32 r1011, {rs151, rs151}; +{ +mul.f16x2 r1009, r52, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs152, fd140; +} +mov.b32 r1017, {rs152, rs152}; +{ +mul.f16x2 
r1015, r55, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs153, fd143; +} +mov.b32 r1023, {rs153, rs153}; +{ +mul.f16x2 r1021, r61, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs154, fd144; +} +mov.b32 r1029, {rs154, rs154}; +{ +mul.f16x2 r1027, r70, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 rs155, fd143; +} +mov.b32 r1035, {rs155, rs155}; +{ +mul.f16x2 r1033, r64, r1035; +} +{ +add.f16x2 r1036, r1012, r1033; +} +{ +cvt.rn.f16.f64 rs156, fd144; +} +mov.b32 r1041, {rs156, rs156}; +{ +mul.f16x2 r1039, r67, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +sub.f16x2 %12, r1024, r1030; +} +{ +add.f16x2 %13, r1036, r1042; +} +{ +add.f16x2 %14, r1024, r1030; +} +{ +sub.f16x2 %15, r1036, r1042; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..a48610e246256 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_fwd.hpp.inc @@ -0,0 +1,220 @@ +#ifndef CUFFTDX_FFT_13_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_13_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<0, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<257>; +.reg .b64 rd<2>; +add.f32 f53, %28, %58; +add.f32 f54, %30, %59; +sub.f32 f55, %28, %58; +sub.f32 f56, %30, %59; +add.f32 f57, %31, %55; +add.f32 f58, %33, %57; +sub.f32 f59, %31, %55; +sub.f32 f60, %33, %57; +add.f32 f61, %34, %52; +add.f32 f62, %35, %54; +sub.f32 f63, %34, %52; +sub.f32 f64, %35, %54; +add.f32 f65, %36, %50; +add.f32 f66, %38, %51; +sub.f32 f67, %36, %50; +sub.f32 f68, %38, %51; +add.f32 f69, %39, %47; +add.f32 f70, %41, %49; +sub.f32 f71, %39, %47; +sub.f32 f72, %41, %49; +add.f32 f73, %42, %44; +add.f32 f74, %43, %46; +sub.f32 f75, %42, %44; +sub.f32 f76, %43, %46; +add.f32 f77, %26, f53; +add.f32 f78, %27, f54; +add.f32 f79, f77, f57; +add.f32 f80, f78, f58; +add.f32 f81, f79, f61; +add.f32 f82, f80, f62; +add.f32 f83, f81, f65; +add.f32 f84, f82, f66; +add.f32 f85, 
f83, f69; +add.f32 f86, f84, f70; +fma.rn.f32 f87, f53, 0f3F62AD3F, %26; +fma.rn.f32 f88, f56, 0fBEEDF032, 0f00000000; +fma.rn.f32 f89, f54, 0f3F62AD3F, %27; +fma.rn.f32 f90, f55, 0fBEEDF032, 0f00000000; +fma.rn.f32 f91, f57, 0f3F116CB1, f87; +fma.rn.f32 f92, f60, 0fBF52AF12, f88; +fma.rn.f32 f93, f58, 0f3F116CB1, f89; +fma.rn.f32 f94, f59, 0fBF52AF12, f90; +fma.rn.f32 f95, f61, 0f3DF6DBEF, f91; +fma.rn.f32 f96, f64, 0fBF7E222B, f92; +fma.rn.f32 f97, f62, 0f3DF6DBEF, f93; +fma.rn.f32 f98, f63, 0fBF7E222B, f94; +fma.rn.f32 f99, f65, 0fBEB58EC6, f95; +fma.rn.f32 f100, f68, 0fBF6F5D39, f96; +fma.rn.f32 f101, f66, 0fBEB58EC6, f97; +fma.rn.f32 f102, f67, 0fBF6F5D39, f98; +fma.rn.f32 f103, f69, 0fBF3F9E67, f99; +fma.rn.f32 f104, f72, 0fBF29C268, f100; +fma.rn.f32 f105, f70, 0fBF3F9E67, f101; +fma.rn.f32 f106, f71, 0fBF29C268, f102; +fma.rn.f32 f107, f73, 0fBF788FA5, f103; +fma.rn.f32 f108, f76, 0fBE750F2A, f104; +fma.rn.f32 f109, f74, 0fBF788FA5, f105; +fma.rn.f32 f110, f75, 0fBE750F2A, f106; +fma.rn.f32 f111, f53, 0f3F116CB1, %26; +fma.rn.f32 f112, f56, 0fBF52AF12, 0f00000000; +fma.rn.f32 f113, f54, 0f3F116CB1, %27; +fma.rn.f32 f114, f55, 0fBF52AF12, 0f00000000; +fma.rn.f32 f115, f57, 0fBEB58EC6, f111; +fma.rn.f32 f116, f60, 0fBF6F5D39, f112; +fma.rn.f32 f117, f58, 0fBEB58EC6, f113; +fma.rn.f32 f118, f59, 0fBF6F5D39, f114; +fma.rn.f32 f119, f61, 0fBF788FA5, f115; +fma.rn.f32 f120, f64, 0fBE750F2A, f116; +fma.rn.f32 f121, f62, 0fBF788FA5, f117; +fma.rn.f32 f122, f63, 0fBE750F2A, f118; +fma.rn.f32 f123, f65, 0fBF3F9E67, f119; +fma.rn.f32 f124, f68, 0f3F29C268, f120; +fma.rn.f32 f125, f66, 0fBF3F9E67, f121; +fma.rn.f32 f126, f67, 0f3F29C268, f122; +fma.rn.f32 f127, f69, 0f3DF6DBEF, f123; +fma.rn.f32 f128, f72, 0f3F7E222B, f124; +fma.rn.f32 f129, f70, 0f3DF6DBEF, f125; +fma.rn.f32 f130, f71, 0f3F7E222B, f126; +fma.rn.f32 f131, f73, 0f3F62AD3F, f127; +fma.rn.f32 f132, f76, 0f3EEDF032, f128; +fma.rn.f32 f133, f74, 0f3F62AD3F, f129; +fma.rn.f32 f134, f75, 0f3EEDF032, f130; 
+fma.rn.f32 f135, f53, 0f3DF6DBEF, %26; +fma.rn.f32 f136, f56, 0fBF7E222B, 0f00000000; +fma.rn.f32 f137, f54, 0f3DF6DBEF, %27; +fma.rn.f32 f138, f55, 0fBF7E222B, 0f00000000; +fma.rn.f32 f139, f57, 0fBF788FA5, f135; +fma.rn.f32 f140, f60, 0fBE750F2A, f136; +fma.rn.f32 f141, f58, 0fBF788FA5, f137; +fma.rn.f32 f142, f59, 0fBE750F2A, f138; +fma.rn.f32 f143, f61, 0fBEB58EC6, f139; +fma.rn.f32 f144, f64, 0f3F6F5D39, f140; +fma.rn.f32 f145, f62, 0fBEB58EC6, f141; +fma.rn.f32 f146, f63, 0f3F6F5D39, f142; +fma.rn.f32 f147, f65, 0f3F62AD3F, f143; +fma.rn.f32 f148, f68, 0f3EEDF032, f144; +fma.rn.f32 f149, f66, 0f3F62AD3F, f145; +fma.rn.f32 f150, f67, 0f3EEDF032, f146; +fma.rn.f32 f151, f69, 0f3F116CB1, f147; +fma.rn.f32 f152, f72, 0fBF52AF12, f148; +fma.rn.f32 f153, f70, 0f3F116CB1, f149; +fma.rn.f32 f154, f71, 0fBF52AF12, f150; +fma.rn.f32 f155, f73, 0fBF3F9E67, f151; +fma.rn.f32 f156, f76, 0fBF29C268, f152; +fma.rn.f32 f157, f74, 0fBF3F9E67, f153; +fma.rn.f32 f158, f75, 0fBF29C268, f154; +fma.rn.f32 f159, f53, 0fBEB58EC6, %26; +fma.rn.f32 f160, f56, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f161, f54, 0fBEB58EC6, %27; +fma.rn.f32 f162, f55, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f163, f57, 0fBF3F9E67, f159; +fma.rn.f32 f164, f60, 0f3F29C268, f160; +fma.rn.f32 f165, f58, 0fBF3F9E67, f161; +fma.rn.f32 f166, f59, 0f3F29C268, f162; +fma.rn.f32 f167, f61, 0f3F62AD3F, f163; +fma.rn.f32 f168, f64, 0f3EEDF032, f164; +fma.rn.f32 f169, f62, 0f3F62AD3F, f165; +fma.rn.f32 f170, f63, 0f3EEDF032, f166; +fma.rn.f32 f171, f65, 0f3DF6DBEF, f167; +fma.rn.f32 f172, f68, 0fBF7E222B, f168; +fma.rn.f32 f173, f66, 0f3DF6DBEF, f169; +fma.rn.f32 f174, f67, 0fBF7E222B, f170; +fma.rn.f32 f175, f69, 0fBF788FA5, f171; +fma.rn.f32 f176, f72, 0f3E750F2A, f172; +fma.rn.f32 f177, f70, 0fBF788FA5, f173; +fma.rn.f32 f178, f71, 0f3E750F2A, f174; +fma.rn.f32 f179, f73, 0f3F116CB1, f175; +fma.rn.f32 f180, f76, 0f3F52AF12, f176; +fma.rn.f32 f181, f74, 0f3F116CB1, f177; +fma.rn.f32 f182, f75, 0f3F52AF12, f178; +fma.rn.f32 
f183, f53, 0fBF3F9E67, %26; +fma.rn.f32 f184, f56, 0fBF29C268, 0f00000000; +fma.rn.f32 f185, f54, 0fBF3F9E67, %27; +fma.rn.f32 f186, f55, 0fBF29C268, 0f00000000; +fma.rn.f32 f187, f57, 0f3DF6DBEF, f183; +fma.rn.f32 f188, f60, 0f3F7E222B, f184; +fma.rn.f32 f189, f58, 0f3DF6DBEF, f185; +fma.rn.f32 f190, f59, 0f3F7E222B, f186; +fma.rn.f32 f191, f61, 0f3F116CB1, f187; +fma.rn.f32 f192, f64, 0fBF52AF12, f188; +fma.rn.f32 f193, f62, 0f3F116CB1, f189; +fma.rn.f32 f194, f63, 0fBF52AF12, f190; +fma.rn.f32 f195, f65, 0fBF788FA5, f191; +fma.rn.f32 f196, f68, 0f3E750F2A, f192; +fma.rn.f32 f197, f66, 0fBF788FA5, f193; +fma.rn.f32 f198, f67, 0f3E750F2A, f194; +fma.rn.f32 f199, f69, 0f3F62AD3F, f195; +fma.rn.f32 f200, f72, 0f3EEDF032, f196; +fma.rn.f32 f201, f70, 0f3F62AD3F, f197; +fma.rn.f32 f202, f71, 0f3EEDF032, f198; +fma.rn.f32 f203, f73, 0fBEB58EC6, f199; +fma.rn.f32 f204, f76, 0fBF6F5D39, f200; +fma.rn.f32 f205, f74, 0fBEB58EC6, f201; +fma.rn.f32 f206, f75, 0fBF6F5D39, f202; +fma.rn.f32 f207, f53, 0fBF788FA5, %26; +fma.rn.f32 f208, f56, 0fBE750F2A, 0f00000000; +fma.rn.f32 f209, f54, 0fBF788FA5, %27; +fma.rn.f32 f210, f55, 0fBE750F2A, 0f00000000; +fma.rn.f32 f211, f57, 0f3F62AD3F, f207; +fma.rn.f32 f212, f60, 0f3EEDF032, f208; +fma.rn.f32 f213, f58, 0f3F62AD3F, f209; +fma.rn.f32 f214, f59, 0f3EEDF032, f210; +fma.rn.f32 f215, f61, 0fBF3F9E67, f211; +fma.rn.f32 f216, f64, 0fBF29C268, f212; +fma.rn.f32 f217, f62, 0fBF3F9E67, f213; +fma.rn.f32 f218, f63, 0fBF29C268, f214; +fma.rn.f32 f219, f65, 0f3F116CB1, f215; +fma.rn.f32 f220, f68, 0f3F52AF12, f216; +fma.rn.f32 f221, f66, 0f3F116CB1, f217; +fma.rn.f32 f222, f67, 0f3F52AF12, f218; +fma.rn.f32 f223, f69, 0fBEB58EC6, f219; +fma.rn.f32 f224, f72, 0fBF6F5D39, f220; +fma.rn.f32 f225, f70, 0fBEB58EC6, f221; +fma.rn.f32 f226, f71, 0fBF6F5D39, f222; +fma.rn.f32 f227, f73, 0f3DF6DBEF, f223; +fma.rn.f32 f228, f76, 0f3F7E222B, f224; +fma.rn.f32 f229, f74, 0f3DF6DBEF, f225; +fma.rn.f32 f230, f75, 0f3F7E222B, f226; +add.f32 %1, f86, f74; 
+add.f32 %0, f85, f73; +add.f32 %3, f109, f110; +sub.f32 %2, f107, f108; +add.f32 %5, f133, f134; +sub.f32 %4, f131, f132; +add.f32 %7, f157, f158; +sub.f32 %6, f155, f156; +add.f32 %9, f181, f182; +sub.f32 %8, f179, f180; +add.f32 %11, f205, f206; +sub.f32 %10, f203, f204; +add.f32 %13, f229, f230; +sub.f32 %12, f227, f228; +sub.f32 %15, f229, f230; +add.f32 %14, f227, f228; +sub.f32 %17, f205, f206; +add.f32 %16, f203, f204; +sub.f32 %19, f181, f182; +add.f32 %18, f179, f180; +sub.f32 %21, f157, f158; +add.f32 %20, f155, f156; +sub.f32 %23, f133, f134; +add.f32 %22, f131, f132; +sub.f32 %25, f109, f110; +add.f32 %24, f107, f108; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..cffabfde4aa17 --- /dev/null 
+++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp32_inv.hpp.inc @@ -0,0 +1,220 @@ +#ifndef CUFFTDX_FFT_13_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_13_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<202, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<257>; +.reg .b64 rd<2>; +add.f32 f53, %28, %58; +add.f32 f54, %30, %59; +sub.f32 f55, %28, %58; +sub.f32 f56, %30, %59; +add.f32 f57, %31, %55; +add.f32 f58, %33, %57; +sub.f32 f59, %31, %55; +sub.f32 f60, %33, %57; +add.f32 f61, %34, %52; +add.f32 f62, %35, %54; +sub.f32 f63, %34, %52; +sub.f32 f64, %35, %54; +add.f32 f65, %36, %50; +add.f32 f66, %38, %51; +sub.f32 f67, %36, %50; +sub.f32 f68, %38, %51; +add.f32 f69, %39, %47; +add.f32 f70, %41, %49; +sub.f32 f71, %39, %47; +sub.f32 f72, %41, %49; +add.f32 f73, %42, %44; +add.f32 f74, %43, %46; +sub.f32 f75, %42, %44; +sub.f32 f76, %43, %46; +add.f32 f77, %26, f53; +add.f32 f78, %27, f54; +add.f32 f79, f77, f57; +add.f32 f80, f78, f58; +add.f32 f81, f79, f61; +add.f32 f82, f80, f62; +add.f32 f83, f81, f65; +add.f32 f84, f82, f66; +add.f32 f85, f83, f69; +add.f32 f86, f84, f70; +fma.rn.f32 f87, f53, 0f3F62AD3F, %26; +fma.rn.f32 f88, f56, 0f3EEDF032, 0f00000000; +fma.rn.f32 f89, f54, 0f3F62AD3F, %27; +fma.rn.f32 f90, f55, 0f3EEDF032, 0f00000000; +fma.rn.f32 f91, f57, 0f3F116CB1, f87; +fma.rn.f32 f92, f60, 0f3F52AF12, f88; +fma.rn.f32 f93, f58, 0f3F116CB1, f89; +fma.rn.f32 f94, f59, 0f3F52AF12, f90; +fma.rn.f32 f95, f61, 0f3DF6DBEF, f91; +fma.rn.f32 f96, f64, 0f3F7E222B, f92; +fma.rn.f32 f97, f62, 0f3DF6DBEF, f93; +fma.rn.f32 f98, f63, 0f3F7E222B, f94; +fma.rn.f32 f99, f65, 0fBEB58EC6, f95; +fma.rn.f32 f100, f68, 0f3F6F5D39, f96; +fma.rn.f32 f101, f66, 0fBEB58EC6, f97; +fma.rn.f32 f102, f67, 0f3F6F5D39, f98; +fma.rn.f32 f103, f69, 0fBF3F9E67, f99; +fma.rn.f32 f104, f72, 0f3F29C268, f100; +fma.rn.f32 f105, f70, 
0fBF3F9E67, f101; +fma.rn.f32 f106, f71, 0f3F29C268, f102; +fma.rn.f32 f107, f73, 0fBF788FA5, f103; +fma.rn.f32 f108, f76, 0f3E750F2A, f104; +fma.rn.f32 f109, f74, 0fBF788FA5, f105; +fma.rn.f32 f110, f75, 0f3E750F2A, f106; +fma.rn.f32 f111, f53, 0f3F116CB1, %26; +fma.rn.f32 f112, f56, 0f3F52AF12, 0f00000000; +fma.rn.f32 f113, f54, 0f3F116CB1, %27; +fma.rn.f32 f114, f55, 0f3F52AF12, 0f00000000; +fma.rn.f32 f115, f57, 0fBEB58EC6, f111; +fma.rn.f32 f116, f60, 0f3F6F5D39, f112; +fma.rn.f32 f117, f58, 0fBEB58EC6, f113; +fma.rn.f32 f118, f59, 0f3F6F5D39, f114; +fma.rn.f32 f119, f61, 0fBF788FA5, f115; +fma.rn.f32 f120, f64, 0f3E750F2A, f116; +fma.rn.f32 f121, f62, 0fBF788FA5, f117; +fma.rn.f32 f122, f63, 0f3E750F2A, f118; +fma.rn.f32 f123, f65, 0fBF3F9E67, f119; +fma.rn.f32 f124, f68, 0fBF29C268, f120; +fma.rn.f32 f125, f66, 0fBF3F9E67, f121; +fma.rn.f32 f126, f67, 0fBF29C268, f122; +fma.rn.f32 f127, f69, 0f3DF6DBEF, f123; +fma.rn.f32 f128, f72, 0fBF7E222B, f124; +fma.rn.f32 f129, f70, 0f3DF6DBEF, f125; +fma.rn.f32 f130, f71, 0fBF7E222B, f126; +fma.rn.f32 f131, f73, 0f3F62AD3F, f127; +fma.rn.f32 f132, f76, 0fBEEDF032, f128; +fma.rn.f32 f133, f74, 0f3F62AD3F, f129; +fma.rn.f32 f134, f75, 0fBEEDF032, f130; +fma.rn.f32 f135, f53, 0f3DF6DBEF, %26; +fma.rn.f32 f136, f56, 0f3F7E222B, 0f00000000; +fma.rn.f32 f137, f54, 0f3DF6DBEF, %27; +fma.rn.f32 f138, f55, 0f3F7E222B, 0f00000000; +fma.rn.f32 f139, f57, 0fBF788FA5, f135; +fma.rn.f32 f140, f60, 0f3E750F2A, f136; +fma.rn.f32 f141, f58, 0fBF788FA5, f137; +fma.rn.f32 f142, f59, 0f3E750F2A, f138; +fma.rn.f32 f143, f61, 0fBEB58EC6, f139; +fma.rn.f32 f144, f64, 0fBF6F5D39, f140; +fma.rn.f32 f145, f62, 0fBEB58EC6, f141; +fma.rn.f32 f146, f63, 0fBF6F5D39, f142; +fma.rn.f32 f147, f65, 0f3F62AD3F, f143; +fma.rn.f32 f148, f68, 0fBEEDF032, f144; +fma.rn.f32 f149, f66, 0f3F62AD3F, f145; +fma.rn.f32 f150, f67, 0fBEEDF032, f146; +fma.rn.f32 f151, f69, 0f3F116CB1, f147; +fma.rn.f32 f152, f72, 0f3F52AF12, f148; +fma.rn.f32 f153, f70, 0f3F116CB1, 
f149; +fma.rn.f32 f154, f71, 0f3F52AF12, f150; +fma.rn.f32 f155, f73, 0fBF3F9E67, f151; +fma.rn.f32 f156, f76, 0f3F29C268, f152; +fma.rn.f32 f157, f74, 0fBF3F9E67, f153; +fma.rn.f32 f158, f75, 0f3F29C268, f154; +fma.rn.f32 f159, f53, 0fBEB58EC6, %26; +fma.rn.f32 f160, f56, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f161, f54, 0fBEB58EC6, %27; +fma.rn.f32 f162, f55, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f163, f57, 0fBF3F9E67, f159; +fma.rn.f32 f164, f60, 0fBF29C268, f160; +fma.rn.f32 f165, f58, 0fBF3F9E67, f161; +fma.rn.f32 f166, f59, 0fBF29C268, f162; +fma.rn.f32 f167, f61, 0f3F62AD3F, f163; +fma.rn.f32 f168, f64, 0fBEEDF032, f164; +fma.rn.f32 f169, f62, 0f3F62AD3F, f165; +fma.rn.f32 f170, f63, 0fBEEDF032, f166; +fma.rn.f32 f171, f65, 0f3DF6DBEF, f167; +fma.rn.f32 f172, f68, 0f3F7E222B, f168; +fma.rn.f32 f173, f66, 0f3DF6DBEF, f169; +fma.rn.f32 f174, f67, 0f3F7E222B, f170; +fma.rn.f32 f175, f69, 0fBF788FA5, f171; +fma.rn.f32 f176, f72, 0fBE750F2A, f172; +fma.rn.f32 f177, f70, 0fBF788FA5, f173; +fma.rn.f32 f178, f71, 0fBE750F2A, f174; +fma.rn.f32 f179, f73, 0f3F116CB1, f175; +fma.rn.f32 f180, f76, 0fBF52AF12, f176; +fma.rn.f32 f181, f74, 0f3F116CB1, f177; +fma.rn.f32 f182, f75, 0fBF52AF12, f178; +fma.rn.f32 f183, f53, 0fBF3F9E67, %26; +fma.rn.f32 f184, f56, 0f3F29C268, 0f00000000; +fma.rn.f32 f185, f54, 0fBF3F9E67, %27; +fma.rn.f32 f186, f55, 0f3F29C268, 0f00000000; +fma.rn.f32 f187, f57, 0f3DF6DBEF, f183; +fma.rn.f32 f188, f60, 0fBF7E222B, f184; +fma.rn.f32 f189, f58, 0f3DF6DBEF, f185; +fma.rn.f32 f190, f59, 0fBF7E222B, f186; +fma.rn.f32 f191, f61, 0f3F116CB1, f187; +fma.rn.f32 f192, f64, 0f3F52AF12, f188; +fma.rn.f32 f193, f62, 0f3F116CB1, f189; +fma.rn.f32 f194, f63, 0f3F52AF12, f190; +fma.rn.f32 f195, f65, 0fBF788FA5, f191; +fma.rn.f32 f196, f68, 0fBE750F2A, f192; +fma.rn.f32 f197, f66, 0fBF788FA5, f193; +fma.rn.f32 f198, f67, 0fBE750F2A, f194; +fma.rn.f32 f199, f69, 0f3F62AD3F, f195; +fma.rn.f32 f200, f72, 0fBEEDF032, f196; +fma.rn.f32 f201, f70, 0f3F62AD3F, f197; 
+fma.rn.f32 f202, f71, 0fBEEDF032, f198; +fma.rn.f32 f203, f73, 0fBEB58EC6, f199; +fma.rn.f32 f204, f76, 0f3F6F5D39, f200; +fma.rn.f32 f205, f74, 0fBEB58EC6, f201; +fma.rn.f32 f206, f75, 0f3F6F5D39, f202; +fma.rn.f32 f207, f53, 0fBF788FA5, %26; +fma.rn.f32 f208, f56, 0f3E750F2A, 0f00000000; +fma.rn.f32 f209, f54, 0fBF788FA5, %27; +fma.rn.f32 f210, f55, 0f3E750F2A, 0f00000000; +fma.rn.f32 f211, f57, 0f3F62AD3F, f207; +fma.rn.f32 f212, f60, 0fBEEDF032, f208; +fma.rn.f32 f213, f58, 0f3F62AD3F, f209; +fma.rn.f32 f214, f59, 0fBEEDF032, f210; +fma.rn.f32 f215, f61, 0fBF3F9E67, f211; +fma.rn.f32 f216, f64, 0f3F29C268, f212; +fma.rn.f32 f217, f62, 0fBF3F9E67, f213; +fma.rn.f32 f218, f63, 0f3F29C268, f214; +fma.rn.f32 f219, f65, 0f3F116CB1, f215; +fma.rn.f32 f220, f68, 0fBF52AF12, f216; +fma.rn.f32 f221, f66, 0f3F116CB1, f217; +fma.rn.f32 f222, f67, 0fBF52AF12, f218; +fma.rn.f32 f223, f69, 0fBEB58EC6, f219; +fma.rn.f32 f224, f72, 0f3F6F5D39, f220; +fma.rn.f32 f225, f70, 0fBEB58EC6, f221; +fma.rn.f32 f226, f71, 0f3F6F5D39, f222; +fma.rn.f32 f227, f73, 0f3DF6DBEF, f223; +fma.rn.f32 f228, f76, 0fBF7E222B, f224; +fma.rn.f32 f229, f74, 0f3DF6DBEF, f225; +fma.rn.f32 f230, f75, 0fBF7E222B, f226; +add.f32 %1, f86, f74; +add.f32 %0, f85, f73; +add.f32 %3, f109, f110; +sub.f32 %2, f107, f108; +add.f32 %5, f133, f134; +sub.f32 %4, f131, f132; +add.f32 %7, f157, f158; +sub.f32 %6, f155, f156; +add.f32 %9, f181, f182; +sub.f32 %8, f179, f180; +add.f32 %11, f205, f206; +sub.f32 %10, f203, f204; +add.f32 %13, f229, f230; +sub.f32 %12, f227, f228; +sub.f32 %15, f229, f230; +add.f32 %14, f227, f228; +sub.f32 %17, f205, f206; +add.f32 %16, f203, f204; +sub.f32 %19, f181, f182; +add.f32 %18, f179, f180; +sub.f32 %21, f157, f158; +add.f32 %20, f155, f156; +sub.f32 %23, f133, f134; +add.f32 %22, f131, f132; +sub.f32 %25, f109, f110; +add.f32 %24, f107, f108; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), 
"=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..db2a6cf8127da --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_fwd.hpp.inc @@ -0,0 +1,220 @@ +#ifndef CUFFTDX_FFT_13_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_13_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<404, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<257>; +.reg .b64 rd<2>; +add.f64 fd53, %28, %58; +add.f64 fd54, %30, %59; +sub.f64 fd55, %28, %58; +sub.f64 fd56, %30, %59; +add.f64 fd57, %31, %55; +add.f64 fd58, %33, %57; +sub.f64 fd59, %31, %55; +sub.f64 fd60, %33, %57; +add.f64 fd61, %34, %52; +add.f64 fd62, %35, %54; +sub.f64 fd63, %34, %52; +sub.f64 fd64, %35, %54; +add.f64 fd65, %36, %50; +add.f64 
fd66, %38, %51; +sub.f64 fd67, %36, %50; +sub.f64 fd68, %38, %51; +add.f64 fd69, %39, %47; +add.f64 fd70, %41, %49; +sub.f64 fd71, %39, %47; +sub.f64 fd72, %41, %49; +add.f64 fd73, %42, %44; +add.f64 fd74, %43, %46; +sub.f64 fd75, %42, %44; +sub.f64 fd76, %43, %46; +add.f64 fd77, %26, fd53; +add.f64 fd78, %27, fd54; +add.f64 fd79, fd77, fd57; +add.f64 fd80, fd78, fd58; +add.f64 fd81, fd79, fd61; +add.f64 fd82, fd80, fd62; +add.f64 fd83, fd81, fd65; +add.f64 fd84, fd82, fd66; +add.f64 fd85, fd83, fd69; +add.f64 fd86, fd84, fd70; +fma.rn.f64 fd87, fd53, 0d3FEC55A7E00740E9, %26; +fma.rn.f64 fd88, fd56, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd89, fd54, 0d3FEC55A7E00740E9, %27; +fma.rn.f64 fd90, fd55, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd91, fd57, 0d3FE22D961EA71119, fd87; +fma.rn.f64 fd92, fd60, 0dBFEA55E242A4C3D2, fd88; +fma.rn.f64 fd93, fd58, 0d3FE22D961EA71119, fd89; +fma.rn.f64 fd94, fd59, 0dBFEA55E242A4C3D2, fd90; +fma.rn.f64 fd95, fd61, 0d3FBEDB7DEBAA3ED8, fd91; +fma.rn.f64 fd96, fd64, 0dBFEFC44566966769, fd92; +fma.rn.f64 fd97, fd62, 0d3FBEDB7DEBAA3ED8, fd93; +fma.rn.f64 fd98, fd63, 0dBFEFC44566966769, fd94; +fma.rn.f64 fd99, fd65, 0dBFD6B1D8B2365DA1, fd95; +fma.rn.f64 fd100, fd68, 0dBFEDEBA72EF20147, fd96; +fma.rn.f64 fd101, fd66, 0dBFD6B1D8B2365DA1, fd97; +fma.rn.f64 fd102, fd67, 0dBFEDEBA72EF20147, fd98; +fma.rn.f64 fd103, fd69, 0dBFE7F3CCD0032E0C, fd99; +fma.rn.f64 fd104, fd72, 0dBFE5384D024C2F84, fd100; +fma.rn.f64 fd105, fd70, 0dBFE7F3CCD0032E0C, fd101; +fma.rn.f64 fd106, fd71, 0dBFE5384D024C2F84, fd102; +fma.rn.f64 fd107, fd73, 0dBFEF11F493053D00, fd103; +fma.rn.f64 fd108, fd76, 0dBFCEA1E54BC48DBF, fd104; +fma.rn.f64 fd109, fd74, 0dBFEF11F493053D00, fd105; +fma.rn.f64 fd110, fd75, 0dBFCEA1E54BC48DBF, fd106; +fma.rn.f64 fd111, fd53, 0d3FE22D961EA71119, %26; +fma.rn.f64 fd112, fd56, 0dBFEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd113, fd54, 0d3FE22D961EA71119, %27; +fma.rn.f64 fd114, fd55, 0dBFEA55E242A4C3D2, 
0d0000000000000000; +fma.rn.f64 fd115, fd57, 0dBFD6B1D8B2365DA1, fd111; +fma.rn.f64 fd116, fd60, 0dBFEDEBA72EF20147, fd112; +fma.rn.f64 fd117, fd58, 0dBFD6B1D8B2365DA1, fd113; +fma.rn.f64 fd118, fd59, 0dBFEDEBA72EF20147, fd114; +fma.rn.f64 fd119, fd61, 0dBFEF11F493053D00, fd115; +fma.rn.f64 fd120, fd64, 0dBFCEA1E54BC48DBF, fd116; +fma.rn.f64 fd121, fd62, 0dBFEF11F493053D00, fd117; +fma.rn.f64 fd122, fd63, 0dBFCEA1E54BC48DBF, fd118; +fma.rn.f64 fd123, fd65, 0dBFE7F3CCD0032E0C, fd119; +fma.rn.f64 fd124, fd68, 0d3FE5384D024C2F84, fd120; +fma.rn.f64 fd125, fd66, 0dBFE7F3CCD0032E0C, fd121; +fma.rn.f64 fd126, fd67, 0d3FE5384D024C2F84, fd122; +fma.rn.f64 fd127, fd69, 0d3FBEDB7DEBAA3ED8, fd123; +fma.rn.f64 fd128, fd72, 0d3FEFC44566966769, fd124; +fma.rn.f64 fd129, fd70, 0d3FBEDB7DEBAA3ED8, fd125; +fma.rn.f64 fd130, fd71, 0d3FEFC44566966769, fd126; +fma.rn.f64 fd131, fd73, 0d3FEC55A7E00740E9, fd127; +fma.rn.f64 fd132, fd76, 0d3FDDBE064267C47C, fd128; +fma.rn.f64 fd133, fd74, 0d3FEC55A7E00740E9, fd129; +fma.rn.f64 fd134, fd75, 0d3FDDBE064267C47C, fd130; +fma.rn.f64 fd135, fd53, 0d3FBEDB7DEBAA3ED8, %26; +fma.rn.f64 fd136, fd56, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd137, fd54, 0d3FBEDB7DEBAA3ED8, %27; +fma.rn.f64 fd138, fd55, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd139, fd57, 0dBFEF11F493053D00, fd135; +fma.rn.f64 fd140, fd60, 0dBFCEA1E54BC48DBF, fd136; +fma.rn.f64 fd141, fd58, 0dBFEF11F493053D00, fd137; +fma.rn.f64 fd142, fd59, 0dBFCEA1E54BC48DBF, fd138; +fma.rn.f64 fd143, fd61, 0dBFD6B1D8B2365DA1, fd139; +fma.rn.f64 fd144, fd64, 0d3FEDEBA72EF20147, fd140; +fma.rn.f64 fd145, fd62, 0dBFD6B1D8B2365DA1, fd141; +fma.rn.f64 fd146, fd63, 0d3FEDEBA72EF20147, fd142; +fma.rn.f64 fd147, fd65, 0d3FEC55A7E00740E9, fd143; +fma.rn.f64 fd148, fd68, 0d3FDDBE064267C47C, fd144; +fma.rn.f64 fd149, fd66, 0d3FEC55A7E00740E9, fd145; +fma.rn.f64 fd150, fd67, 0d3FDDBE064267C47C, fd146; +fma.rn.f64 fd151, fd69, 0d3FE22D961EA71119, fd147; +fma.rn.f64 fd152, fd72, 
0dBFEA55E242A4C3D2, fd148; +fma.rn.f64 fd153, fd70, 0d3FE22D961EA71119, fd149; +fma.rn.f64 fd154, fd71, 0dBFEA55E242A4C3D2, fd150; +fma.rn.f64 fd155, fd73, 0dBFE7F3CCD0032E0C, fd151; +fma.rn.f64 fd156, fd76, 0dBFE5384D024C2F84, fd152; +fma.rn.f64 fd157, fd74, 0dBFE7F3CCD0032E0C, fd153; +fma.rn.f64 fd158, fd75, 0dBFE5384D024C2F84, fd154; +fma.rn.f64 fd159, fd53, 0dBFD6B1D8B2365DA1, %26; +fma.rn.f64 fd160, fd56, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd161, fd54, 0dBFD6B1D8B2365DA1, %27; +fma.rn.f64 fd162, fd55, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd163, fd57, 0dBFE7F3CCD0032E0C, fd159; +fma.rn.f64 fd164, fd60, 0d3FE5384D024C2F84, fd160; +fma.rn.f64 fd165, fd58, 0dBFE7F3CCD0032E0C, fd161; +fma.rn.f64 fd166, fd59, 0d3FE5384D024C2F84, fd162; +fma.rn.f64 fd167, fd61, 0d3FEC55A7E00740E9, fd163; +fma.rn.f64 fd168, fd64, 0d3FDDBE064267C47C, fd164; +fma.rn.f64 fd169, fd62, 0d3FEC55A7E00740E9, fd165; +fma.rn.f64 fd170, fd63, 0d3FDDBE064267C47C, fd166; +fma.rn.f64 fd171, fd65, 0d3FBEDB7DEBAA3ED8, fd167; +fma.rn.f64 fd172, fd68, 0dBFEFC44566966769, fd168; +fma.rn.f64 fd173, fd66, 0d3FBEDB7DEBAA3ED8, fd169; +fma.rn.f64 fd174, fd67, 0dBFEFC44566966769, fd170; +fma.rn.f64 fd175, fd69, 0dBFEF11F493053D00, fd171; +fma.rn.f64 fd176, fd72, 0d3FCEA1E54BC48DBF, fd172; +fma.rn.f64 fd177, fd70, 0dBFEF11F493053D00, fd173; +fma.rn.f64 fd178, fd71, 0d3FCEA1E54BC48DBF, fd174; +fma.rn.f64 fd179, fd73, 0d3FE22D961EA71119, fd175; +fma.rn.f64 fd180, fd76, 0d3FEA55E242A4C3D2, fd176; +fma.rn.f64 fd181, fd74, 0d3FE22D961EA71119, fd177; +fma.rn.f64 fd182, fd75, 0d3FEA55E242A4C3D2, fd178; +fma.rn.f64 fd183, fd53, 0dBFE7F3CCD0032E0C, %26; +fma.rn.f64 fd184, fd56, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd185, fd54, 0dBFE7F3CCD0032E0C, %27; +fma.rn.f64 fd186, fd55, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd187, fd57, 0d3FBEDB7DEBAA3ED8, fd183; +fma.rn.f64 fd188, fd60, 0d3FEFC44566966769, fd184; +fma.rn.f64 fd189, fd58, 0d3FBEDB7DEBAA3ED8, fd185; 
+fma.rn.f64 fd190, fd59, 0d3FEFC44566966769, fd186; +fma.rn.f64 fd191, fd61, 0d3FE22D961EA71119, fd187; +fma.rn.f64 fd192, fd64, 0dBFEA55E242A4C3D2, fd188; +fma.rn.f64 fd193, fd62, 0d3FE22D961EA71119, fd189; +fma.rn.f64 fd194, fd63, 0dBFEA55E242A4C3D2, fd190; +fma.rn.f64 fd195, fd65, 0dBFEF11F493053D00, fd191; +fma.rn.f64 fd196, fd68, 0d3FCEA1E54BC48DBF, fd192; +fma.rn.f64 fd197, fd66, 0dBFEF11F493053D00, fd193; +fma.rn.f64 fd198, fd67, 0d3FCEA1E54BC48DBF, fd194; +fma.rn.f64 fd199, fd69, 0d3FEC55A7E00740E9, fd195; +fma.rn.f64 fd200, fd72, 0d3FDDBE064267C47C, fd196; +fma.rn.f64 fd201, fd70, 0d3FEC55A7E00740E9, fd197; +fma.rn.f64 fd202, fd71, 0d3FDDBE064267C47C, fd198; +fma.rn.f64 fd203, fd73, 0dBFD6B1D8B2365DA1, fd199; +fma.rn.f64 fd204, fd76, 0dBFEDEBA72EF20147, fd200; +fma.rn.f64 fd205, fd74, 0dBFD6B1D8B2365DA1, fd201; +fma.rn.f64 fd206, fd75, 0dBFEDEBA72EF20147, fd202; +fma.rn.f64 fd207, fd53, 0dBFEF11F493053D00, %26; +fma.rn.f64 fd208, fd56, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd209, fd54, 0dBFEF11F493053D00, %27; +fma.rn.f64 fd210, fd55, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd211, fd57, 0d3FEC55A7E00740E9, fd207; +fma.rn.f64 fd212, fd60, 0d3FDDBE064267C47C, fd208; +fma.rn.f64 fd213, fd58, 0d3FEC55A7E00740E9, fd209; +fma.rn.f64 fd214, fd59, 0d3FDDBE064267C47C, fd210; +fma.rn.f64 fd215, fd61, 0dBFE7F3CCD0032E0C, fd211; +fma.rn.f64 fd216, fd64, 0dBFE5384D024C2F84, fd212; +fma.rn.f64 fd217, fd62, 0dBFE7F3CCD0032E0C, fd213; +fma.rn.f64 fd218, fd63, 0dBFE5384D024C2F84, fd214; +fma.rn.f64 fd219, fd65, 0d3FE22D961EA71119, fd215; +fma.rn.f64 fd220, fd68, 0d3FEA55E242A4C3D2, fd216; +fma.rn.f64 fd221, fd66, 0d3FE22D961EA71119, fd217; +fma.rn.f64 fd222, fd67, 0d3FEA55E242A4C3D2, fd218; +fma.rn.f64 fd223, fd69, 0dBFD6B1D8B2365DA1, fd219; +fma.rn.f64 fd224, fd72, 0dBFEDEBA72EF20147, fd220; +fma.rn.f64 fd225, fd70, 0dBFD6B1D8B2365DA1, fd221; +fma.rn.f64 fd226, fd71, 0dBFEDEBA72EF20147, fd222; +fma.rn.f64 fd227, fd73, 0d3FBEDB7DEBAA3ED8, fd223; 
+fma.rn.f64 fd228, fd76, 0d3FEFC44566966769, fd224; +fma.rn.f64 fd229, fd74, 0d3FBEDB7DEBAA3ED8, fd225; +fma.rn.f64 fd230, fd75, 0d3FEFC44566966769, fd226; +add.f64 %1, fd86, fd74; +add.f64 %0, fd85, fd73; +add.f64 %3, fd109, fd110; +sub.f64 %2, fd107, fd108; +add.f64 %5, fd133, fd134; +sub.f64 %4, fd131, fd132; +add.f64 %7, fd157, fd158; +sub.f64 %6, fd155, fd156; +add.f64 %9, fd181, fd182; +sub.f64 %8, fd179, fd180; +add.f64 %11, fd205, fd206; +sub.f64 %10, fd203, fd204; +add.f64 %13, fd229, fd230; +sub.f64 %12, fd227, fd228; +sub.f64 %15, fd229, fd230; +add.f64 %14, fd227, fd228; +sub.f64 %17, fd205, fd206; +add.f64 %16, fd203, fd204; +sub.f64 %19, fd181, fd182; +add.f64 %18, fd179, fd180; +sub.f64 %21, fd157, fd158; +add.f64 %20, fd155, fd156; +sub.f64 %23, fd133, fd134; +add.f64 %22, fd131, fd132; +sub.f64 %25, fd109, fd110; +add.f64 %24, fd107, fd108; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..632a8a0ffc732 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_13_fp64_inv.hpp.inc @@ -0,0 +1,220 @@ +#ifndef CUFFTDX_FFT_13_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_13_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<575, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<257>; +.reg .b64 rd<2>; +add.f64 fd53, %28, %58; +add.f64 fd54, %30, %59; +sub.f64 fd55, %28, %58; +sub.f64 fd56, %30, %59; +add.f64 fd57, %31, %55; +add.f64 fd58, %33, %57; +sub.f64 fd59, %31, %55; +sub.f64 fd60, %33, %57; +add.f64 fd61, %34, %52; +add.f64 fd62, %35, %54; +sub.f64 fd63, %34, %52; +sub.f64 fd64, %35, %54; +add.f64 fd65, %36, %50; +add.f64 fd66, %38, %51; +sub.f64 fd67, %36, %50; +sub.f64 fd68, %38, %51; +add.f64 fd69, %39, %47; +add.f64 fd70, %41, %49; +sub.f64 fd71, %39, %47; +sub.f64 fd72, %41, %49; +add.f64 fd73, %42, %44; +add.f64 fd74, %43, %46; +sub.f64 fd75, %42, %44; +sub.f64 fd76, %43, %46; +add.f64 fd77, %26, fd53; +add.f64 fd78, %27, fd54; +add.f64 fd79, fd77, fd57; +add.f64 fd80, fd78, fd58; +add.f64 fd81, fd79, fd61; +add.f64 fd82, fd80, fd62; +add.f64 fd83, fd81, fd65; +add.f64 fd84, fd82, fd66; +add.f64 fd85, fd83, fd69; +add.f64 fd86, fd84, fd70; +fma.rn.f64 fd87, fd53, 0d3FEC55A7E00740E9, %26; +fma.rn.f64 fd88, fd56, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd89, fd54, 0d3FEC55A7E00740E9, %27; +fma.rn.f64 fd90, fd55, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd91, fd57, 0d3FE22D961EA71119, fd87; +fma.rn.f64 fd92, fd60, 0d3FEA55E242A4C3D2, fd88; +fma.rn.f64 fd93, fd58, 0d3FE22D961EA71119, fd89; 
+fma.rn.f64 fd94, fd59, 0d3FEA55E242A4C3D2, fd90; +fma.rn.f64 fd95, fd61, 0d3FBEDB7DEBAA3ED8, fd91; +fma.rn.f64 fd96, fd64, 0d3FEFC44566966769, fd92; +fma.rn.f64 fd97, fd62, 0d3FBEDB7DEBAA3ED8, fd93; +fma.rn.f64 fd98, fd63, 0d3FEFC44566966769, fd94; +fma.rn.f64 fd99, fd65, 0dBFD6B1D8B2365DA1, fd95; +fma.rn.f64 fd100, fd68, 0d3FEDEBA72EF20147, fd96; +fma.rn.f64 fd101, fd66, 0dBFD6B1D8B2365DA1, fd97; +fma.rn.f64 fd102, fd67, 0d3FEDEBA72EF20147, fd98; +fma.rn.f64 fd103, fd69, 0dBFE7F3CCD0032E0C, fd99; +fma.rn.f64 fd104, fd72, 0d3FE5384D024C2F84, fd100; +fma.rn.f64 fd105, fd70, 0dBFE7F3CCD0032E0C, fd101; +fma.rn.f64 fd106, fd71, 0d3FE5384D024C2F84, fd102; +fma.rn.f64 fd107, fd73, 0dBFEF11F493053D00, fd103; +fma.rn.f64 fd108, fd76, 0d3FCEA1E54BC48DBF, fd104; +fma.rn.f64 fd109, fd74, 0dBFEF11F493053D00, fd105; +fma.rn.f64 fd110, fd75, 0d3FCEA1E54BC48DBF, fd106; +fma.rn.f64 fd111, fd53, 0d3FE22D961EA71119, %26; +fma.rn.f64 fd112, fd56, 0d3FEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd113, fd54, 0d3FE22D961EA71119, %27; +fma.rn.f64 fd114, fd55, 0d3FEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd115, fd57, 0dBFD6B1D8B2365DA1, fd111; +fma.rn.f64 fd116, fd60, 0d3FEDEBA72EF20147, fd112; +fma.rn.f64 fd117, fd58, 0dBFD6B1D8B2365DA1, fd113; +fma.rn.f64 fd118, fd59, 0d3FEDEBA72EF20147, fd114; +fma.rn.f64 fd119, fd61, 0dBFEF11F493053D00, fd115; +fma.rn.f64 fd120, fd64, 0d3FCEA1E54BC48DBF, fd116; +fma.rn.f64 fd121, fd62, 0dBFEF11F493053D00, fd117; +fma.rn.f64 fd122, fd63, 0d3FCEA1E54BC48DBF, fd118; +fma.rn.f64 fd123, fd65, 0dBFE7F3CCD0032E0C, fd119; +fma.rn.f64 fd124, fd68, 0dBFE5384D024C2F84, fd120; +fma.rn.f64 fd125, fd66, 0dBFE7F3CCD0032E0C, fd121; +fma.rn.f64 fd126, fd67, 0dBFE5384D024C2F84, fd122; +fma.rn.f64 fd127, fd69, 0d3FBEDB7DEBAA3ED8, fd123; +fma.rn.f64 fd128, fd72, 0dBFEFC44566966769, fd124; +fma.rn.f64 fd129, fd70, 0d3FBEDB7DEBAA3ED8, fd125; +fma.rn.f64 fd130, fd71, 0dBFEFC44566966769, fd126; +fma.rn.f64 fd131, fd73, 0d3FEC55A7E00740E9, fd127; +fma.rn.f64 
fd132, fd76, 0dBFDDBE064267C47C, fd128; +fma.rn.f64 fd133, fd74, 0d3FEC55A7E00740E9, fd129; +fma.rn.f64 fd134, fd75, 0dBFDDBE064267C47C, fd130; +fma.rn.f64 fd135, fd53, 0d3FBEDB7DEBAA3ED8, %26; +fma.rn.f64 fd136, fd56, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd137, fd54, 0d3FBEDB7DEBAA3ED8, %27; +fma.rn.f64 fd138, fd55, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd139, fd57, 0dBFEF11F493053D00, fd135; +fma.rn.f64 fd140, fd60, 0d3FCEA1E54BC48DBF, fd136; +fma.rn.f64 fd141, fd58, 0dBFEF11F493053D00, fd137; +fma.rn.f64 fd142, fd59, 0d3FCEA1E54BC48DBF, fd138; +fma.rn.f64 fd143, fd61, 0dBFD6B1D8B2365DA1, fd139; +fma.rn.f64 fd144, fd64, 0dBFEDEBA72EF20147, fd140; +fma.rn.f64 fd145, fd62, 0dBFD6B1D8B2365DA1, fd141; +fma.rn.f64 fd146, fd63, 0dBFEDEBA72EF20147, fd142; +fma.rn.f64 fd147, fd65, 0d3FEC55A7E00740E9, fd143; +fma.rn.f64 fd148, fd68, 0dBFDDBE064267C47C, fd144; +fma.rn.f64 fd149, fd66, 0d3FEC55A7E00740E9, fd145; +fma.rn.f64 fd150, fd67, 0dBFDDBE064267C47C, fd146; +fma.rn.f64 fd151, fd69, 0d3FE22D961EA71119, fd147; +fma.rn.f64 fd152, fd72, 0d3FEA55E242A4C3D2, fd148; +fma.rn.f64 fd153, fd70, 0d3FE22D961EA71119, fd149; +fma.rn.f64 fd154, fd71, 0d3FEA55E242A4C3D2, fd150; +fma.rn.f64 fd155, fd73, 0dBFE7F3CCD0032E0C, fd151; +fma.rn.f64 fd156, fd76, 0d3FE5384D024C2F84, fd152; +fma.rn.f64 fd157, fd74, 0dBFE7F3CCD0032E0C, fd153; +fma.rn.f64 fd158, fd75, 0d3FE5384D024C2F84, fd154; +fma.rn.f64 fd159, fd53, 0dBFD6B1D8B2365DA1, %26; +fma.rn.f64 fd160, fd56, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd161, fd54, 0dBFD6B1D8B2365DA1, %27; +fma.rn.f64 fd162, fd55, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd163, fd57, 0dBFE7F3CCD0032E0C, fd159; +fma.rn.f64 fd164, fd60, 0dBFE5384D024C2F84, fd160; +fma.rn.f64 fd165, fd58, 0dBFE7F3CCD0032E0C, fd161; +fma.rn.f64 fd166, fd59, 0dBFE5384D024C2F84, fd162; +fma.rn.f64 fd167, fd61, 0d3FEC55A7E00740E9, fd163; +fma.rn.f64 fd168, fd64, 0dBFDDBE064267C47C, fd164; +fma.rn.f64 fd169, fd62, 
0d3FEC55A7E00740E9, fd165; +fma.rn.f64 fd170, fd63, 0dBFDDBE064267C47C, fd166; +fma.rn.f64 fd171, fd65, 0d3FBEDB7DEBAA3ED8, fd167; +fma.rn.f64 fd172, fd68, 0d3FEFC44566966769, fd168; +fma.rn.f64 fd173, fd66, 0d3FBEDB7DEBAA3ED8, fd169; +fma.rn.f64 fd174, fd67, 0d3FEFC44566966769, fd170; +fma.rn.f64 fd175, fd69, 0dBFEF11F493053D00, fd171; +fma.rn.f64 fd176, fd72, 0dBFCEA1E54BC48DBF, fd172; +fma.rn.f64 fd177, fd70, 0dBFEF11F493053D00, fd173; +fma.rn.f64 fd178, fd71, 0dBFCEA1E54BC48DBF, fd174; +fma.rn.f64 fd179, fd73, 0d3FE22D961EA71119, fd175; +fma.rn.f64 fd180, fd76, 0dBFEA55E242A4C3D2, fd176; +fma.rn.f64 fd181, fd74, 0d3FE22D961EA71119, fd177; +fma.rn.f64 fd182, fd75, 0dBFEA55E242A4C3D2, fd178; +fma.rn.f64 fd183, fd53, 0dBFE7F3CCD0032E0C, %26; +fma.rn.f64 fd184, fd56, 0d3FE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd185, fd54, 0dBFE7F3CCD0032E0C, %27; +fma.rn.f64 fd186, fd55, 0d3FE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd187, fd57, 0d3FBEDB7DEBAA3ED8, fd183; +fma.rn.f64 fd188, fd60, 0dBFEFC44566966769, fd184; +fma.rn.f64 fd189, fd58, 0d3FBEDB7DEBAA3ED8, fd185; +fma.rn.f64 fd190, fd59, 0dBFEFC44566966769, fd186; +fma.rn.f64 fd191, fd61, 0d3FE22D961EA71119, fd187; +fma.rn.f64 fd192, fd64, 0d3FEA55E242A4C3D2, fd188; +fma.rn.f64 fd193, fd62, 0d3FE22D961EA71119, fd189; +fma.rn.f64 fd194, fd63, 0d3FEA55E242A4C3D2, fd190; +fma.rn.f64 fd195, fd65, 0dBFEF11F493053D00, fd191; +fma.rn.f64 fd196, fd68, 0dBFCEA1E54BC48DBF, fd192; +fma.rn.f64 fd197, fd66, 0dBFEF11F493053D00, fd193; +fma.rn.f64 fd198, fd67, 0dBFCEA1E54BC48DBF, fd194; +fma.rn.f64 fd199, fd69, 0d3FEC55A7E00740E9, fd195; +fma.rn.f64 fd200, fd72, 0dBFDDBE064267C47C, fd196; +fma.rn.f64 fd201, fd70, 0d3FEC55A7E00740E9, fd197; +fma.rn.f64 fd202, fd71, 0dBFDDBE064267C47C, fd198; +fma.rn.f64 fd203, fd73, 0dBFD6B1D8B2365DA1, fd199; +fma.rn.f64 fd204, fd76, 0d3FEDEBA72EF20147, fd200; +fma.rn.f64 fd205, fd74, 0dBFD6B1D8B2365DA1, fd201; +fma.rn.f64 fd206, fd75, 0d3FEDEBA72EF20147, fd202; +fma.rn.f64 fd207, fd53, 
0dBFEF11F493053D00, %26; +fma.rn.f64 fd208, fd56, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd209, fd54, 0dBFEF11F493053D00, %27; +fma.rn.f64 fd210, fd55, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd211, fd57, 0d3FEC55A7E00740E9, fd207; +fma.rn.f64 fd212, fd60, 0dBFDDBE064267C47C, fd208; +fma.rn.f64 fd213, fd58, 0d3FEC55A7E00740E9, fd209; +fma.rn.f64 fd214, fd59, 0dBFDDBE064267C47C, fd210; +fma.rn.f64 fd215, fd61, 0dBFE7F3CCD0032E0C, fd211; +fma.rn.f64 fd216, fd64, 0d3FE5384D024C2F84, fd212; +fma.rn.f64 fd217, fd62, 0dBFE7F3CCD0032E0C, fd213; +fma.rn.f64 fd218, fd63, 0d3FE5384D024C2F84, fd214; +fma.rn.f64 fd219, fd65, 0d3FE22D961EA71119, fd215; +fma.rn.f64 fd220, fd68, 0dBFEA55E242A4C3D2, fd216; +fma.rn.f64 fd221, fd66, 0d3FE22D961EA71119, fd217; +fma.rn.f64 fd222, fd67, 0dBFEA55E242A4C3D2, fd218; +fma.rn.f64 fd223, fd69, 0dBFD6B1D8B2365DA1, fd219; +fma.rn.f64 fd224, fd72, 0d3FEDEBA72EF20147, fd220; +fma.rn.f64 fd225, fd70, 0dBFD6B1D8B2365DA1, fd221; +fma.rn.f64 fd226, fd71, 0d3FEDEBA72EF20147, fd222; +fma.rn.f64 fd227, fd73, 0d3FBEDB7DEBAA3ED8, fd223; +fma.rn.f64 fd228, fd76, 0dBFEFC44566966769, fd224; +fma.rn.f64 fd229, fd74, 0d3FBEDB7DEBAA3ED8, fd225; +fma.rn.f64 fd230, fd75, 0dBFEFC44566966769, fd226; +add.f64 %1, fd86, fd74; +add.f64 %0, fd85, fd73; +add.f64 %3, fd109, fd110; +sub.f64 %2, fd107, fd108; +add.f64 %5, fd133, fd134; +sub.f64 %4, fd131, fd132; +add.f64 %7, fd157, fd158; +sub.f64 %6, fd155, fd156; +add.f64 %9, fd181, fd182; +sub.f64 %8, fd179, fd180; +add.f64 %11, fd205, fd206; +sub.f64 %10, fd203, fd204; +add.f64 %13, fd229, fd230; +sub.f64 %12, fd227, fd228; +sub.f64 %15, fd229, fd230; +add.f64 %14, fd227, fd228; +sub.f64 %17, fd205, fd206; +add.f64 %16, fd203, fd204; +sub.f64 %19, fd181, fd182; +add.f64 %18, fd179, fd180; +sub.f64 %21, fd157, fd158; +add.f64 %20, fd155, fd156; +sub.f64 %23, fd133, fd134; +add.f64 %22, fd131, fd132; +sub.f64 %25, fd109, fd110; +add.f64 %24, fd107, fd108; +})" + : "=d"(rmem[0].x), 
"=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..63473f2cf33a1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_fwd.hpp.inc @@ -0,0 +1,4344 @@ +#ifndef CUFFTDX_FFT_144_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_144_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<944, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<230>; +.reg .b32 r<1760>; +.reg .b64 rd<5>; +mov.u32 r1747, %tid.y; +shl.b32 r1748, r1747, 1; +mov.u32 r1749, %24; +mad.lo.s32 r1750, r1748, 576, r1749; +mov.u32 r1751, %tid.x; +mov.f32 f202, 0fBF000000; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %33, %41; +} +{ +add.f16x2 r8, %25, r5; +} +{ +add.f16x2 r11, %34, %42; +} +{ +add.f16x2 r14, %26, r11; +} +{ +add.f16x2 r17, %33, %41; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %25, r20; +} +{ +sub.f16x2 r26, %34, %42; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %33, %41; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %25, r38; +} +{ +sub.f16x2 r44, %34, %42; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %34, %42; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %26, r56; +} +{ +sub.f16x2 r62, %33, %41; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %34, %42; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %26, r74; +} +{ +sub.f16x2 r80, %33, %41; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %37, %45; +} +{ +add.f16x2 r96, %29, r93; +} +{ +add.f16x2 r99, %38, %46; +} +{ +add.f16x2 r102, %30, r99; +} +{ +add.f16x2 r105, %37, %45; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %29, r108; +} +{ +sub.f16x2 r114, %38, %46; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %37, %45; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %29, r126; +} +{ +sub.f16x2 r132, %38, %46; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %38, %46; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, 
%30, r144; +} +{ +sub.f16x2 r150, %37, %45; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %38, %46; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %30, r162; +} +{ +sub.f16x2 r168, %37, %45; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f188, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r180, {low, high}; +} +mov.f32 f123, 0fBF800000; +mov.f32 f184, 0f3F5DB3D7; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r256, {low, high}; +} +{ +neg.f16x2 r257, r256; +} +{ +add.f16x2 r259, %35, %43; +} +{ +add.f16x2 r262, %27, r259; +} +{ +add.f16x2 r265, %36, %44; +} +{ +add.f16x2 r268, %28, r265; 
+} +{ +add.f16x2 r271, %35, %43; +} +{ +mul.f16x2 r274, r271, r255; +} +{ +add.f16x2 r277, %27, r274; +} +{ +sub.f16x2 r280, %36, %44; +} +{ +mul.f16x2 r283, r280, r257; +} +{ +add.f16x2 r286, r277, r283; +} +{ +add.f16x2 r289, %35, %43; +} +{ +mul.f16x2 r292, r289, r255; +} +{ +add.f16x2 r295, %27, r292; +} +{ +sub.f16x2 r298, %36, %44; +} +{ +mul.f16x2 r301, r298, r257; +} +{ +sub.f16x2 r304, r295, r301; +} +{ +add.f16x2 r307, %36, %44; +} +{ +mul.f16x2 r310, r307, r255; +} +{ +add.f16x2 r313, %28, r310; +} +{ +sub.f16x2 r316, %35, %43; +} +{ +mul.f16x2 r319, r316, r257; +} +{ +sub.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %36, %44; +} +{ +mul.f16x2 r328, r325, r255; +} +{ +add.f16x2 r331, %28, r328; +} +{ +sub.f16x2 r334, %35, %43; +} +{ +mul.f16x2 r337, r334, r257; +} +{ +add.f16x2 r340, r331, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r344, {low, high}; +} +{ +neg.f16x2 r345, r344; +} +{ +add.f16x2 r347, %39, %47; +} +{ +add.f16x2 r350, %31, r347; +} +{ +add.f16x2 r353, %40, %48; +} +{ +add.f16x2 r356, %32, r353; +} +{ +add.f16x2 r359, %39, %47; +} +{ +mul.f16x2 r362, r359, r343; +} +{ +add.f16x2 r365, %31, r362; +} +{ +sub.f16x2 r368, %40, %48; +} +{ +mul.f16x2 r371, r368, r345; +} +{ +add.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, %39, %47; +} +{ +mul.f16x2 r380, r377, r343; +} +{ +add.f16x2 r383, %31, r380; +} +{ +sub.f16x2 r386, %40, %48; +} +{ +mul.f16x2 r389, r386, r345; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %40, %48; +} +{ +mul.f16x2 r398, r395, r343; +} +{ +add.f16x2 r401, %32, r398; +} +{ +sub.f16x2 r404, %39, %47; +} +{ +mul.f16x2 r407, r404, r345; +} +{ +sub.f16x2 r410, r401, r407; +} +{ +add.f16x2 r413, %40, %48; +} +{ +mul.f16x2 r416, r413, r343; +} +{ +add.f16x2 r419, %32, r416; +} +{ +sub.f16x2 r422, %39, %47; +} +{ +mul.f16x2 r425, r422, r345; +} +{ 
+add.f16x2 r428, r419, r425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r434, {low, high}; +} +{ +mul.f16x2 r441, r374, r431; +} +{ +mul.f16x2 r444, r410, r432; +} +{ +sub.f16x2 r447, r441, r444; +} +{ +mul.f16x2 r450, r374, r432; +} +{ +fma.rn.f16x2 r453, r410, r431, r450; +} +{ +mul.f16x2 r457, r392, r433; +} +{ +mul.f16x2 r460, r428, r434; +} +{ +sub.f16x2 r463, r457, r460; +} +{ +mul.f16x2 r466, r392, r434; +} +{ +fma.rn.f16x2 r469, r428, r433, r466; +} +{ +add.f16x2 r473, r262, r350; +} +{ +add.f16x2 r476, r268, r356; +} +{ +sub.f16x2 r479, r262, r350; +} +{ +sub.f16x2 r482, r268, r356; +} +{ +add.f16x2 r485, r286, r447; +} +{ +add.f16x2 r488, r322, r453; +} +{ +sub.f16x2 r491, r286, r447; +} +{ +sub.f16x2 r494, r322, r453; +} +{ +add.f16x2 r497, r304, r463; +} +{ +add.f16x2 r500, r340, r469; +} +{ +sub.f16x2 r503, r304, r463; +} +{ +sub.f16x2 r506, r340, r469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r512, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r516, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 
high, f200; +mov.b32 r517, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r518, {low, high}; +} +mov.f32 f124, 0f3F800000; +{ +mul.f16x2 r531, r485, r509; +} +{ +mul.f16x2 r534, r488, r510; +} +{ +sub.f16x2 r537, r531, r534; +} +{ +mul.f16x2 r540, r485, r510; +} +{ +fma.rn.f16x2 r543, r488, r509, r540; +} +{ +mul.f16x2 r547, r497, r511; +} +{ +mul.f16x2 r550, r500, r512; +} +{ +sub.f16x2 r553, r547, r550; +} +{ +mul.f16x2 r556, r497, r512; +} +{ +fma.rn.f16x2 r559, r500, r511, r556; +} +{ +neg.f16x2 r563, r479; +} +{ +mul.f16x2 r565, r491, r515; +} +{ +mul.f16x2 r568, r494, r516; +} +{ +sub.f16x2 r571, r565, r568; +} +{ +mul.f16x2 r574, r491, r516; +} +{ +fma.rn.f16x2 r577, r494, r515, r574; +} +{ +mul.f16x2 r581, r503, r517; +} +{ +mul.f16x2 r584, r506, r518; +} +{ +sub.f16x2 r587, r581, r584; +} +{ +mul.f16x2 r590, r503, r518; +} +{ +fma.rn.f16x2 r593, r506, r517, r590; +} +{ +add.f16x2 r597, r219, r473; +} +{ +add.f16x2 r600, r222, r476; +} +{ +sub.f16x2 r603, r219, r473; +} +{ +sub.f16x2 r606, r222, r476; +} +{ +add.f16x2 r609, r231, r537; +} +{ +add.f16x2 r612, r234, r543; +} +{ +sub.f16x2 r615, r231, r537; +} +{ +sub.f16x2 r618, r234, r543; +} +{ +add.f16x2 r621, r243, r553; +} +{ +add.f16x2 r624, r246, r559; +} +{ +sub.f16x2 r627, r243, r553; +} +{ +sub.f16x2 r630, r246, r559; +} +{ +add.f16x2 r633, r225, r482; +} +{ +add.f16x2 r636, r228, r563; +} +{ +sub.f16x2 r639, r225, r482; +} +{ +sub.f16x2 r642, r228, r563; +} +{ +add.f16x2 r645, r237, r571; +} +{ +add.f16x2 r648, r240, r577; +} +{ +sub.f16x2 r651, r237, r571; +} +{ +sub.f16x2 r654, r240, r577; +} +{ +add.f16x2 r657, r249, r587; +} +{ +add.f16x2 r660, r252, r593; +} +{ +sub.f16x2 r663, r249, r587; +} +{ +sub.f16x2 r666, r252, r593; +} +mul.wide.u32 rd2, r1751, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1752, rd3; +mul.lo.s32 r1753, r1752, 12; +sub.s32 r1754, r1751, r1753; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1755, rd4; +and.b32 r1756, 
r1755, 1073741822; +mad.lo.s32 r1757, r1756, 576, r1750; +cvt.rn.f32.u32 f227, r1754; +mul.f32 f228, f227, 0f3D32B8C2; +cos.approx.f32 f101, f228; +sin.approx.f32 f229, f228; +neg.f32 f102, f229; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r669, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r674, {high, high}; +} +{ +mul.f16x2 r676, r612, r674; +} +{ +neg.f16x2 r679, r676; +} +{ +fma.rn.f16x2 r681, r609, r672, r679; +} +{ +mul.f16x2 r685, r609, r674; +} +{ +fma.rn.f16x2 r688, r612, r672, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r694, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r696, {low, high}; +} +{ +mul.f16x2 r697, r694, r696; +} +{ +mul.f16x2 r700, r669, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r703, {high, low}; +} +{ +fma.rn.f16x2 r705, r697, r703, r700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r711, {high, high}; +} +{ +mul.f16x2 r713, r624, r711; +} +{ +neg.f16x2 r716, r713; +} +{ +fma.rn.f16x2 r718, r621, r709, r716; +} +{ +mul.f16x2 r722, r621, r711; +} +{ +fma.rn.f16x2 r725, r624, r709, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r731, r733; +} +{ +mul.f16x2 r737, r705, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r740, {high, low}; +} +{ +fma.rn.f16x2 r742, r734, r740, r737; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r742; +mov.b32 r746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r748, {high, high}; +} +{ +mul.f16x2 r750, r636, r748; +} +{ +neg.f16x2 r753, r750; +} +{ +fma.rn.f16x2 r755, r633, r746, r753; +} +{ +mul.f16x2 r759, r633, r748; +} +{ +fma.rn.f16x2 r762, r636, r746, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r770, {low, high}; +} +{ +mul.f16x2 r771, r768, r770; +} +{ +mul.f16x2 r774, r742, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r777, {high, low}; +} +{ +fma.rn.f16x2 r779, r771, r777, r774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r785, {high, high}; +} +{ +mul.f16x2 r787, r648, r785; +} +{ +neg.f16x2 r790, r787; +} +{ +fma.rn.f16x2 r792, r645, r783, r790; +} +{ +mul.f16x2 r796, r645, r785; +} +{ +fma.rn.f16x2 r799, r648, r783, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r805, r807; +} +{ +mul.f16x2 r811, r779, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r814, {high, low}; +} +{ +fma.rn.f16x2 r816, r808, r814, r811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r822, {high, high}; +} +{ +mul.f16x2 r824, r660, r822; +} +{ +neg.f16x2 r827, r824; +} +{ +fma.rn.f16x2 r829, r657, r820, r827; +} +{ +mul.f16x2 r833, r657, r822; +} +{ +fma.rn.f16x2 r836, r660, r820, r833; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r844, {low, high}; +} +{ +mul.f16x2 r845, r842, r844; +} +{ +mul.f16x2 r848, r816, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r851, {high, low}; +} +{ +fma.rn.f16x2 r853, r845, r851, r848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r859, {high, high}; +} +{ +mul.f16x2 r861, r606, r859; +} +{ +neg.f16x2 r864, r861; +} +{ +fma.rn.f16x2 r866, r603, r857, r864; +} +{ +mul.f16x2 r870, r603, r859; +} +{ +fma.rn.f16x2 r873, r606, r857, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r881, {low, high}; +} +{ +mul.f16x2 r882, r879, r881; +} +{ +mul.f16x2 r885, r853, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r888, {high, low}; +} +{ +fma.rn.f16x2 r890, r882, r888, r885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r618, r896; +} +{ +neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r615, r894, r901; +} +{ +mul.f16x2 r907, r615, r896; +} +{ +fma.rn.f16x2 r910, r618, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, 
r890, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r630, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r627, r931, r938; +} +{ +mul.f16x2 r944, r627, r933; +} +{ +fma.rn.f16x2 r947, r630, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r642, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r639, r968, r975; +} +{ +mul.f16x2 r981, r639, r970; +} +{ +fma.rn.f16x2 r984, r642, r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 
r1009, r654, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r651, r1005, r1012; +} +{ +mul.f16x2 r1018, r651, r1007; +} +{ +fma.rn.f16x2 r1021, r654, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r666, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r663, r1042, r1049; +} +{ +mul.f16x2 r1055, r663, r1044; +} +{ +fma.rn.f16x2 r1058, r666, r1042, r1055; +} +barrier.sync 0; +mad.lo.s32 r1758, r1754, 96, r1757; +st.shared.v4.f32 [r1758], {r597, r600, r681, r688}; +st.shared.v4.f32 [r1758+16], {r718, r725, r755, r762}; +st.shared.v4.f32 [r1758+32], {r792, r799, r829, r836}; +st.shared.v4.f32 [r1758+48], {r866, r873, r903, r910}; +st.shared.v4.f32 [r1758+64], {r940, r947, r977, r984}; +st.shared.v4.f32 [r1758+80], {r1014, r1021, r1051, r1058}; +barrier.sync 0; +mad.lo.s32 r1759, r1754, -88, r1758; +ld.shared.u32 r1087, [r1759]; +ld.shared.u32 r1093, [r1759+4]; +ld.shared.u32 r1341, [r1759+96]; +ld.shared.u32 r1347, [r1759+100]; +ld.shared.u32 r1175, [r1759+192]; +ld.shared.u32 r1181, [r1759+196]; +ld.shared.u32 r1429, [r1759+288]; +ld.shared.u32 r1435, [r1759+292]; +ld.shared.u32 r1084, [r1759+384]; +ld.shared.u32 r1090, [r1759+388]; +ld.shared.u32 r1338, [r1759+480]; +ld.shared.u32 r1344, [r1759+484]; +ld.shared.u32 r1172, [r1759+576]; +ld.shared.u32 r1178, [r1759+580]; +ld.shared.u32 r1426, 
[r1759+672]; +ld.shared.u32 r1432, [r1759+676]; +ld.shared.u32 r1085, [r1759+768]; +ld.shared.u32 r1091, [r1759+772]; +ld.shared.u32 r1339, [r1759+864]; +ld.shared.u32 r1345, [r1759+868]; +ld.shared.u32 r1173, [r1759+960]; +ld.shared.u32 r1179, [r1759+964]; +ld.shared.u32 r1427, [r1759+1056]; +ld.shared.u32 r1433, [r1759+1060]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1080, {low, high}; +} +{ +neg.f16x2 r1081, r1080; +} +{ +add.f16x2 r1083, r1084, r1085; +} +{ +add.f16x2 r1086, r1087, r1083; +} +{ +add.f16x2 r1089, r1090, r1091; +} +{ +add.f16x2 r1092, r1093, r1089; +} +{ +add.f16x2 r1095, r1084, r1085; +} +{ +mul.f16x2 r1098, r1095, r1079; +} +{ +add.f16x2 r1101, r1087, r1098; +} +{ +sub.f16x2 r1104, r1090, r1091; +} +{ +mul.f16x2 r1107, r1104, r1081; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r1084, r1085; +} +{ +mul.f16x2 r1116, r1113, r1079; +} +{ +add.f16x2 r1119, r1087, r1116; +} +{ +sub.f16x2 r1122, r1090, r1091; +} +{ +mul.f16x2 r1125, r1122, r1081; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r1090, r1091; +} +{ +mul.f16x2 r1134, r1131, r1079; +} +{ +add.f16x2 r1137, r1093, r1134; +} +{ +sub.f16x2 r1140, r1084, r1085; +} +{ +mul.f16x2 r1143, r1140, r1081; +} +{ +sub.f16x2 r1146, r1137, r1143; +} +{ +add.f16x2 r1149, r1090, r1091; +} +{ +mul.f16x2 r1152, r1149, r1079; +} +{ +add.f16x2 r1155, r1093, r1152; +} +{ +sub.f16x2 r1158, r1084, r1085; +} +{ +mul.f16x2 r1161, r1158, r1081; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1167, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1168, {low, high}; +} +{ +neg.f16x2 r1169, r1168; +} +{ +add.f16x2 r1171, r1172, r1173; +} +{ +add.f16x2 r1174, r1175, r1171; +} +{ +add.f16x2 
r1177, r1178, r1179; +} +{ +add.f16x2 r1180, r1181, r1177; +} +{ +add.f16x2 r1183, r1172, r1173; +} +{ +mul.f16x2 r1186, r1183, r1167; +} +{ +add.f16x2 r1189, r1175, r1186; +} +{ +sub.f16x2 r1192, r1178, r1179; +} +{ +mul.f16x2 r1195, r1192, r1169; +} +{ +add.f16x2 r1198, r1189, r1195; +} +{ +add.f16x2 r1201, r1172, r1173; +} +{ +mul.f16x2 r1204, r1201, r1167; +} +{ +add.f16x2 r1207, r1175, r1204; +} +{ +sub.f16x2 r1210, r1178, r1179; +} +{ +mul.f16x2 r1213, r1210, r1169; +} +{ +sub.f16x2 r1216, r1207, r1213; +} +{ +add.f16x2 r1219, r1178, r1179; +} +{ +mul.f16x2 r1222, r1219, r1167; +} +{ +add.f16x2 r1225, r1181, r1222; +} +{ +sub.f16x2 r1228, r1172, r1173; +} +{ +mul.f16x2 r1231, r1228, r1169; +} +{ +sub.f16x2 r1234, r1225, r1231; +} +{ +add.f16x2 r1237, r1178, r1179; +} +{ +mul.f16x2 r1240, r1237, r1167; +} +{ +add.f16x2 r1243, r1181, r1240; +} +{ +sub.f16x2 r1246, r1172, r1173; +} +{ +mul.f16x2 r1249, r1246, r1169; +} +{ +add.f16x2 r1252, r1243, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1258, {low, high}; +} +{ +mul.f16x2 r1265, r1198, r1255; +} +{ +mul.f16x2 r1268, r1234, r1256; +} +{ +sub.f16x2 r1271, r1265, r1268; +} +{ +mul.f16x2 r1274, r1198, r1256; +} +{ +fma.rn.f16x2 r1277, r1234, r1255, r1274; +} +{ +mul.f16x2 r1281, r1216, r1257; +} +{ +mul.f16x2 r1284, r1252, r1258; +} +{ +sub.f16x2 r1287, r1281, r1284; +} +{ +mul.f16x2 r1290, r1216, r1258; +} +{ +fma.rn.f16x2 r1293, r1252, r1257, r1290; +} +{ +add.f16x2 r1297, r1086, r1174; +} +{ +add.f16x2 r1300, r1092, r1180; +} +{ +sub.f16x2 r1303, r1086, r1174; +} +{ +sub.f16x2 r1306, r1092, r1180; +} +{ +add.f16x2 r1309, 
r1110, r1271; +} +{ +add.f16x2 r1312, r1146, r1277; +} +{ +sub.f16x2 r1315, r1110, r1271; +} +{ +sub.f16x2 r1318, r1146, r1277; +} +{ +add.f16x2 r1321, r1128, r1287; +} +{ +add.f16x2 r1324, r1164, r1293; +} +{ +sub.f16x2 r1327, r1128, r1287; +} +{ +sub.f16x2 r1330, r1164, r1293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1334, {low, high}; +} +{ +neg.f16x2 r1335, r1334; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1344, r1345; +} +{ +add.f16x2 r1346, r1347, r1343; +} +{ +add.f16x2 r1349, r1338, r1339; +} +{ +mul.f16x2 r1352, r1349, r1333; +} +{ +add.f16x2 r1355, r1341, r1352; +} +{ +sub.f16x2 r1358, r1344, r1345; +} +{ +mul.f16x2 r1361, r1358, r1335; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +add.f16x2 r1367, r1338, r1339; +} +{ +mul.f16x2 r1370, r1367, r1333; +} +{ +add.f16x2 r1373, r1341, r1370; +} +{ +sub.f16x2 r1376, r1344, r1345; +} +{ +mul.f16x2 r1379, r1376, r1335; +} +{ +sub.f16x2 r1382, r1373, r1379; +} +{ +add.f16x2 r1385, r1344, r1345; +} +{ +mul.f16x2 r1388, r1385, r1333; +} +{ +add.f16x2 r1391, r1347, r1388; +} +{ +sub.f16x2 r1394, r1338, r1339; +} +{ +mul.f16x2 r1397, r1394, r1335; +} +{ +sub.f16x2 r1400, r1391, r1397; +} +{ +add.f16x2 r1403, r1344, r1345; +} +{ +mul.f16x2 r1406, r1403, r1333; +} +{ +add.f16x2 r1409, r1347, r1406; +} +{ +sub.f16x2 r1412, r1338, r1339; +} +{ +mul.f16x2 r1415, r1412, r1335; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1421, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1422, {low, high}; +} +{ +neg.f16x2 r1423, r1422; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1432, r1433; +} +{ +add.f16x2 r1434, r1435, 
r1431; +} +{ +add.f16x2 r1437, r1426, r1427; +} +{ +mul.f16x2 r1440, r1437, r1421; +} +{ +add.f16x2 r1443, r1429, r1440; +} +{ +sub.f16x2 r1446, r1432, r1433; +} +{ +mul.f16x2 r1449, r1446, r1423; +} +{ +add.f16x2 r1452, r1443, r1449; +} +{ +add.f16x2 r1455, r1426, r1427; +} +{ +mul.f16x2 r1458, r1455, r1421; +} +{ +add.f16x2 r1461, r1429, r1458; +} +{ +sub.f16x2 r1464, r1432, r1433; +} +{ +mul.f16x2 r1467, r1464, r1423; +} +{ +sub.f16x2 r1470, r1461, r1467; +} +{ +add.f16x2 r1473, r1432, r1433; +} +{ +mul.f16x2 r1476, r1473, r1421; +} +{ +add.f16x2 r1479, r1435, r1476; +} +{ +sub.f16x2 r1482, r1426, r1427; +} +{ +mul.f16x2 r1485, r1482, r1423; +} +{ +sub.f16x2 r1488, r1479, r1485; +} +{ +add.f16x2 r1491, r1432, r1433; +} +{ +mul.f16x2 r1494, r1491, r1421; +} +{ +add.f16x2 r1497, r1435, r1494; +} +{ +sub.f16x2 r1500, r1426, r1427; +} +{ +mul.f16x2 r1503, r1500, r1423; +} +{ +add.f16x2 r1506, r1497, r1503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1512, {low, high}; +} +{ +mul.f16x2 r1519, r1452, r1509; +} +{ +mul.f16x2 r1522, r1488, r1510; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1452, r1510; +} +{ +fma.rn.f16x2 r1531, r1488, r1509, r1528; +} +{ +mul.f16x2 r1535, r1470, r1511; +} +{ +mul.f16x2 r1538, r1506, r1512; +} +{ +sub.f16x2 r1541, r1535, r1538; +} +{ +mul.f16x2 r1544, r1470, r1512; +} +{ +fma.rn.f16x2 r1547, r1506, r1511, r1544; +} +{ +add.f16x2 r1551, r1340, r1428; +} +{ +add.f16x2 r1554, r1346, r1434; +} +{ +sub.f16x2 r1557, r1340, r1428; +} +{ +sub.f16x2 r1560, r1346, r1434; +} +{ +add.f16x2 r1563, r1364, r1525; +} +{ +add.f16x2 r1566, r1400, r1531; +} 
+{ +sub.f16x2 r1569, r1364, r1525; +} +{ +sub.f16x2 r1572, r1400, r1531; +} +{ +add.f16x2 r1575, r1382, r1541; +} +{ +add.f16x2 r1578, r1418, r1547; +} +{ +sub.f16x2 r1581, r1382, r1541; +} +{ +sub.f16x2 r1584, r1418, r1547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1590, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1593, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1595, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1596, {low, high}; +} +{ +mul.f16x2 r1609, r1563, r1587; +} +{ +mul.f16x2 r1612, r1566, r1588; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1563, r1588; +} +{ +fma.rn.f16x2 r1621, r1566, r1587, r1618; +} +{ +mul.f16x2 r1625, r1575, r1589; +} +{ +mul.f16x2 r1628, r1578, r1590; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r1575, r1590; +} +{ +fma.rn.f16x2 r1637, r1578, r1589, r1634; +} +{ +neg.f16x2 r1641, r1557; +} +{ +mul.f16x2 r1643, r1569, r1593; +} +{ +mul.f16x2 r1646, r1572, r1594; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1569, r1594; +} +{ +fma.rn.f16x2 r1655, r1572, r1593, r1652; +} +{ +mul.f16x2 r1659, r1581, r1595; +} +{ +mul.f16x2 r1662, r1584, r1596; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r1581, r1596; +} +{ +fma.rn.f16x2 r1671, r1584, r1595, r1668; +} +{ +add.f16x2 %0, r1297, r1551; +} +{ 
+add.f16x2 %1, r1300, r1554; +} +{ +sub.f16x2 %12, r1297, r1551; +} +{ +sub.f16x2 %13, r1300, r1554; +} +{ +add.f16x2 %2, r1309, r1615; +} +{ +add.f16x2 %3, r1312, r1621; +} +{ +sub.f16x2 %14, r1309, r1615; +} +{ +sub.f16x2 %15, r1312, r1621; +} +{ +add.f16x2 %4, r1321, r1631; +} +{ +add.f16x2 %5, r1324, r1637; +} +{ +sub.f16x2 %16, r1321, r1631; +} +{ +sub.f16x2 %17, r1324, r1637; +} +{ +add.f16x2 %6, r1303, r1560; +} +{ +add.f16x2 %7, r1306, r1641; +} +{ +sub.f16x2 %18, r1303, r1560; +} +{ +sub.f16x2 %19, r1306, r1641; +} +{ +add.f16x2 %8, r1315, r1649; +} +{ +add.f16x2 %9, r1318, r1655; +} +{ +sub.f16x2 %20, r1315, r1649; +} +{ +sub.f16x2 %21, r1318, r1655; +} +{ +add.f16x2 %10, r1327, r1665; +} +{ +add.f16x2 %11, r1330, r1671; +} +{ +sub.f16x2 %22, r1327, r1665; +} +{ +sub.f16x2 %23, r1330, r1671; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<945, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<230>; +.reg .b32 r<1757>; +.reg .b64 rd<4>; +mov.u32 r1747, %tid.y; +mov.u32 r1748, %24; +mad.lo.s32 r1749, r1747, 576, r1748; +mov.u32 r1750, %tid.x; +mov.f32 f202, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %33, %41; +} +{ +add.f16x2 r8, %25, r5; +} +{ +add.f16x2 r11, %34, %42; +} +{ +add.f16x2 r14, %26, r11; +} +{ +add.f16x2 r17, %33, %41; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %25, r20; +} +{ +sub.f16x2 r26, %34, %42; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %33, %41; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %25, r38; +} +{ +sub.f16x2 r44, %34, %42; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %34, %42; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %26, r56; +} +{ +sub.f16x2 r62, %33, %41; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %34, %42; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %26, r74; +} +{ +sub.f16x2 r80, %33, %41; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r89, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %37, %45; +} +{ +add.f16x2 r96, %29, r93; +} +{ +add.f16x2 r99, %38, %46; +} +{ +add.f16x2 r102, %30, r99; +} +{ +add.f16x2 r105, %37, %45; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %29, r108; +} +{ +sub.f16x2 r114, %38, %46; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %37, %45; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %29, r126; +} +{ +sub.f16x2 r132, %38, %46; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %38, %46; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %30, r144; +} +{ +sub.f16x2 r150, %37, %45; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %38, %46; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %30, r162; +} +{ +sub.f16x2 r168, %37, %45; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f188, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r180, {low, high}; +} +mov.f32 f123, 0fBF800000; +mov.f32 f184, 0f3F5DB3D7; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 
r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r256, {low, high}; +} +{ +neg.f16x2 r257, r256; +} +{ +add.f16x2 r259, %35, %43; +} +{ +add.f16x2 r262, %27, r259; +} +{ +add.f16x2 r265, %36, %44; +} +{ +add.f16x2 r268, %28, r265; +} +{ +add.f16x2 r271, %35, %43; +} +{ +mul.f16x2 r274, r271, r255; +} +{ +add.f16x2 r277, %27, r274; +} +{ +sub.f16x2 r280, %36, %44; +} +{ +mul.f16x2 r283, r280, r257; +} +{ +add.f16x2 r286, r277, r283; +} +{ +add.f16x2 r289, %35, %43; +} +{ +mul.f16x2 r292, r289, r255; +} +{ +add.f16x2 r295, %27, r292; +} +{ +sub.f16x2 r298, %36, %44; +} +{ +mul.f16x2 r301, r298, r257; +} +{ +sub.f16x2 r304, r295, r301; +} +{ +add.f16x2 r307, %36, %44; +} +{ +mul.f16x2 r310, r307, r255; +} +{ +add.f16x2 r313, %28, r310; +} +{ +sub.f16x2 r316, %35, %43; +} +{ +mul.f16x2 r319, r316, r257; +} +{ +sub.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %36, %44; +} +{ +mul.f16x2 r328, r325, r255; +} +{ +add.f16x2 r331, %28, r328; +} +{ +sub.f16x2 r334, %35, %43; +} +{ +mul.f16x2 r337, r334, r257; +} +{ +add.f16x2 r340, r331, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r344, {low, high}; +} +{ +neg.f16x2 r345, r344; +} +{ +add.f16x2 r347, %39, %47; +} +{ +add.f16x2 r350, %31, r347; +} +{ +add.f16x2 r353, %40, %48; +} +{ +add.f16x2 r356, %32, r353; +} +{ +add.f16x2 r359, 
%39, %47; +} +{ +mul.f16x2 r362, r359, r343; +} +{ +add.f16x2 r365, %31, r362; +} +{ +sub.f16x2 r368, %40, %48; +} +{ +mul.f16x2 r371, r368, r345; +} +{ +add.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, %39, %47; +} +{ +mul.f16x2 r380, r377, r343; +} +{ +add.f16x2 r383, %31, r380; +} +{ +sub.f16x2 r386, %40, %48; +} +{ +mul.f16x2 r389, r386, r345; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %40, %48; +} +{ +mul.f16x2 r398, r395, r343; +} +{ +add.f16x2 r401, %32, r398; +} +{ +sub.f16x2 r404, %39, %47; +} +{ +mul.f16x2 r407, r404, r345; +} +{ +sub.f16x2 r410, r401, r407; +} +{ +add.f16x2 r413, %40, %48; +} +{ +mul.f16x2 r416, r413, r343; +} +{ +add.f16x2 r419, %32, r416; +} +{ +sub.f16x2 r422, %39, %47; +} +{ +mul.f16x2 r425, r422, r345; +} +{ +add.f16x2 r428, r419, r425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r434, {low, high}; +} +{ +mul.f16x2 r441, r374, r431; +} +{ +mul.f16x2 r444, r410, r432; +} +{ +sub.f16x2 r447, r441, r444; +} +{ +mul.f16x2 r450, r374, r432; +} +{ +fma.rn.f16x2 r453, r410, r431, r450; +} +{ +mul.f16x2 r457, r392, r433; +} +{ +mul.f16x2 r460, r428, r434; +} +{ +sub.f16x2 r463, r457, r460; +} +{ +mul.f16x2 r466, r392, r434; +} +{ +fma.rn.f16x2 r469, r428, r433, r466; +} +{ +add.f16x2 r473, r262, r350; +} +{ +add.f16x2 r476, r268, r356; +} +{ +sub.f16x2 r479, r262, r350; +} +{ +sub.f16x2 r482, r268, r356; +} +{ +add.f16x2 r485, r286, r447; +} +{ +add.f16x2 r488, r322, r453; +} +{ +sub.f16x2 r491, r286, r447; +} +{ +sub.f16x2 r494, r322, r453; +} +{ +add.f16x2 r497, r304, r463; +} +{ +add.f16x2 r500, r340, r469; +} +{ +sub.f16x2 r503, r304, r463; +} +{ 
+sub.f16x2 r506, r340, r469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r512, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r516, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r517, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r518, {low, high}; +} +mov.f32 f124, 0f3F800000; +{ +mul.f16x2 r531, r485, r509; +} +{ +mul.f16x2 r534, r488, r510; +} +{ +sub.f16x2 r537, r531, r534; +} +{ +mul.f16x2 r540, r485, r510; +} +{ +fma.rn.f16x2 r543, r488, r509, r540; +} +{ +mul.f16x2 r547, r497, r511; +} +{ +mul.f16x2 r550, r500, r512; +} +{ +sub.f16x2 r553, r547, r550; +} +{ +mul.f16x2 r556, r497, r512; +} +{ +fma.rn.f16x2 r559, r500, r511, r556; +} +{ +neg.f16x2 r563, r479; +} +{ +mul.f16x2 r565, r491, r515; +} +{ +mul.f16x2 r568, r494, r516; +} +{ +sub.f16x2 r571, r565, r568; +} +{ +mul.f16x2 r574, r491, r516; +} +{ +fma.rn.f16x2 r577, r494, r515, r574; +} +{ +mul.f16x2 r581, r503, r517; +} +{ +mul.f16x2 r584, r506, r518; +} +{ +sub.f16x2 r587, r581, r584; +} +{ +mul.f16x2 r590, r503, r518; +} +{ +fma.rn.f16x2 r593, r506, r517, r590; +} +{ +add.f16x2 r597, r219, r473; +} +{ +add.f16x2 r600, r222, r476; +} +{ +sub.f16x2 r603, r219, r473; +} +{ +sub.f16x2 r606, r222, r476; +} +{ +add.f16x2 r609, r231, r537; +} +{ +add.f16x2 r612, r234, r543; +} +{ +sub.f16x2 r615, r231, r537; +} +{ +sub.f16x2 r618, r234, r543; +} +{ 
+add.f16x2 r621, r243, r553; +} +{ +add.f16x2 r624, r246, r559; +} +{ +sub.f16x2 r627, r243, r553; +} +{ +sub.f16x2 r630, r246, r559; +} +{ +add.f16x2 r633, r225, r482; +} +{ +add.f16x2 r636, r228, r563; +} +{ +sub.f16x2 r639, r225, r482; +} +{ +sub.f16x2 r642, r228, r563; +} +{ +add.f16x2 r645, r237, r571; +} +{ +add.f16x2 r648, r240, r577; +} +{ +sub.f16x2 r651, r237, r571; +} +{ +sub.f16x2 r654, r240, r577; +} +{ +add.f16x2 r657, r249, r587; +} +{ +add.f16x2 r660, r252, r593; +} +{ +sub.f16x2 r663, r249, r587; +} +{ +sub.f16x2 r666, r252, r593; +} +mul.wide.u32 rd2, r1750, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1751, rd3; +mul.lo.s32 r1752, r1751, 12; +sub.s32 r1753, r1750, r1752; +mad.lo.s32 r1754, r1751, 576, r1749; +cvt.rn.f32.u32 f227, r1753; +mul.f32 f228, f227, 0f3D32B8C2; +cos.approx.f32 f101, f228; +sin.approx.f32 f229, f228; +neg.f32 f102, f229; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r669, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r674, {high, high}; +} +{ +mul.f16x2 r676, r612, r674; +} +{ +neg.f16x2 r679, r676; +} +{ +fma.rn.f16x2 r681, r609, r672, r679; +} +{ +mul.f16x2 r685, r609, r674; +} +{ +fma.rn.f16x2 r688, r612, r672, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r694, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r696, {low, high}; +} +{ +mul.f16x2 r697, r694, r696; +} +{ +mul.f16x2 r700, r669, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r703, {high, low}; +} +{ +fma.rn.f16x2 r705, r697, r703, r700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r711, {high, high}; +} +{ 
+mul.f16x2 r713, r624, r711; +} +{ +neg.f16x2 r716, r713; +} +{ +fma.rn.f16x2 r718, r621, r709, r716; +} +{ +mul.f16x2 r722, r621, r711; +} +{ +fma.rn.f16x2 r725, r624, r709, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r731, r733; +} +{ +mul.f16x2 r737, r705, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r740, {high, low}; +} +{ +fma.rn.f16x2 r742, r734, r740, r737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r748, {high, high}; +} +{ +mul.f16x2 r750, r636, r748; +} +{ +neg.f16x2 r753, r750; +} +{ +fma.rn.f16x2 r755, r633, r746, r753; +} +{ +mul.f16x2 r759, r633, r748; +} +{ +fma.rn.f16x2 r762, r636, r746, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r770, {low, high}; +} +{ +mul.f16x2 r771, r768, r770; +} +{ +mul.f16x2 r774, r742, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r777, {high, low}; +} +{ +fma.rn.f16x2 r779, r771, r777, r774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r785, {high, high}; +} +{ +mul.f16x2 r787, r648, r785; +} +{ +neg.f16x2 r790, r787; +} +{ +fma.rn.f16x2 r792, r645, r783, r790; +} +{ +mul.f16x2 r796, r645, r785; +} +{ +fma.rn.f16x2 r799, r648, r783, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r805, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r805, r807; +} +{ +mul.f16x2 r811, r779, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r814, {high, low}; +} +{ +fma.rn.f16x2 r816, r808, r814, r811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r822, {high, high}; +} +{ +mul.f16x2 r824, r660, r822; +} +{ +neg.f16x2 r827, r824; +} +{ +fma.rn.f16x2 r829, r657, r820, r827; +} +{ +mul.f16x2 r833, r657, r822; +} +{ +fma.rn.f16x2 r836, r660, r820, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r844, {low, high}; +} +{ +mul.f16x2 r845, r842, r844; +} +{ +mul.f16x2 r848, r816, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r851, {high, low}; +} +{ +fma.rn.f16x2 r853, r845, r851, r848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r859, {high, high}; +} +{ +mul.f16x2 r861, r606, r859; +} +{ +neg.f16x2 r864, r861; +} +{ +fma.rn.f16x2 r866, r603, r857, r864; +} +{ +mul.f16x2 r870, r603, r859; +} +{ +fma.rn.f16x2 r873, r606, r857, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r881, {low, high}; +} +{ +mul.f16x2 r882, r879, r881; +} +{ +mul.f16x2 r885, r853, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r888, {high, low}; +} +{ +fma.rn.f16x2 r890, r882, r888, r885; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r618, r896; +} +{ +neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r615, r894, r901; +} +{ +mul.f16x2 r907, r615, r896; +} +{ +fma.rn.f16x2 r910, r618, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r890, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r630, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r627, r931, r938; +} +{ +mul.f16x2 r944, r627, r933; +} +{ +fma.rn.f16x2 r947, r630, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r642, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r639, r968, r975; +} +{ +mul.f16x2 r981, r639, r970; +} +{ +fma.rn.f16x2 r984, r642, 
r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r654, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r651, r1005, r1012; +} +{ +mul.f16x2 r1018, r651, r1007; +} +{ +fma.rn.f16x2 r1021, r654, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r666, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r663, r1042, r1049; +} +{ +mul.f16x2 r1055, r663, r1044; +} +{ +fma.rn.f16x2 r1058, r666, r1042, r1055; +} +barrier.sync 0; +mad.lo.s32 r1755, r1753, 48, r1754; +st.shared.v4.f32 [r1755], {r597, r681, r718, r755}; +st.shared.v4.f32 [r1755+16], {r792, r829, r866, r903}; +st.shared.v4.f32 [r1755+32], {r940, r977, r1014, r1051}; +barrier.sync 0; +mad.lo.s32 r1756, r1753, -44, 
r1755; +ld.shared.u32 r1087, [r1756]; +ld.shared.u32 r1341, [r1756+48]; +ld.shared.u32 r1175, [r1756+96]; +ld.shared.u32 r1429, [r1756+144]; +ld.shared.u32 r1084, [r1756+192]; +ld.shared.u32 r1338, [r1756+240]; +ld.shared.u32 r1172, [r1756+288]; +ld.shared.u32 r1426, [r1756+336]; +ld.shared.u32 r1085, [r1756+384]; +ld.shared.u32 r1339, [r1756+432]; +ld.shared.u32 r1173, [r1756+480]; +ld.shared.u32 r1427, [r1756+528]; +barrier.sync 0; +st.shared.v4.f32 [r1755], {r600, r688, r725, r762}; +st.shared.v4.f32 [r1755+16], {r799, r836, r873, r910}; +st.shared.v4.f32 [r1755+32], {r947, r984, r1021, r1058}; +barrier.sync 0; +ld.shared.u32 r1093, [r1756]; +ld.shared.u32 r1347, [r1756+48]; +ld.shared.u32 r1181, [r1756+96]; +ld.shared.u32 r1435, [r1756+144]; +ld.shared.u32 r1090, [r1756+192]; +ld.shared.u32 r1344, [r1756+240]; +ld.shared.u32 r1178, [r1756+288]; +ld.shared.u32 r1432, [r1756+336]; +ld.shared.u32 r1091, [r1756+384]; +ld.shared.u32 r1345, [r1756+432]; +ld.shared.u32 r1179, [r1756+480]; +ld.shared.u32 r1433, [r1756+528]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1080, {low, high}; +} +{ +neg.f16x2 r1081, r1080; +} +{ +add.f16x2 r1083, r1084, r1085; +} +{ +add.f16x2 r1086, r1087, r1083; +} +{ +add.f16x2 r1089, r1090, r1091; +} +{ +add.f16x2 r1092, r1093, r1089; +} +{ +add.f16x2 r1095, r1084, r1085; +} +{ +mul.f16x2 r1098, r1095, r1079; +} +{ +add.f16x2 r1101, r1087, r1098; +} +{ +sub.f16x2 r1104, r1090, r1091; +} +{ +mul.f16x2 r1107, r1104, r1081; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r1084, r1085; +} +{ +mul.f16x2 r1116, r1113, r1079; +} +{ +add.f16x2 r1119, r1087, r1116; +} +{ +sub.f16x2 r1122, r1090, r1091; +} +{ +mul.f16x2 r1125, r1122, r1081; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r1090, r1091; +} +{ +mul.f16x2 r1134, r1131, r1079; +} +{ +add.f16x2 r1137, 
r1093, r1134; +} +{ +sub.f16x2 r1140, r1084, r1085; +} +{ +mul.f16x2 r1143, r1140, r1081; +} +{ +sub.f16x2 r1146, r1137, r1143; +} +{ +add.f16x2 r1149, r1090, r1091; +} +{ +mul.f16x2 r1152, r1149, r1079; +} +{ +add.f16x2 r1155, r1093, r1152; +} +{ +sub.f16x2 r1158, r1084, r1085; +} +{ +mul.f16x2 r1161, r1158, r1081; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1167, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1168, {low, high}; +} +{ +neg.f16x2 r1169, r1168; +} +{ +add.f16x2 r1171, r1172, r1173; +} +{ +add.f16x2 r1174, r1175, r1171; +} +{ +add.f16x2 r1177, r1178, r1179; +} +{ +add.f16x2 r1180, r1181, r1177; +} +{ +add.f16x2 r1183, r1172, r1173; +} +{ +mul.f16x2 r1186, r1183, r1167; +} +{ +add.f16x2 r1189, r1175, r1186; +} +{ +sub.f16x2 r1192, r1178, r1179; +} +{ +mul.f16x2 r1195, r1192, r1169; +} +{ +add.f16x2 r1198, r1189, r1195; +} +{ +add.f16x2 r1201, r1172, r1173; +} +{ +mul.f16x2 r1204, r1201, r1167; +} +{ +add.f16x2 r1207, r1175, r1204; +} +{ +sub.f16x2 r1210, r1178, r1179; +} +{ +mul.f16x2 r1213, r1210, r1169; +} +{ +sub.f16x2 r1216, r1207, r1213; +} +{ +add.f16x2 r1219, r1178, r1179; +} +{ +mul.f16x2 r1222, r1219, r1167; +} +{ +add.f16x2 r1225, r1181, r1222; +} +{ +sub.f16x2 r1228, r1172, r1173; +} +{ +mul.f16x2 r1231, r1228, r1169; +} +{ +sub.f16x2 r1234, r1225, r1231; +} +{ +add.f16x2 r1237, r1178, r1179; +} +{ +mul.f16x2 r1240, r1237, r1167; +} +{ +add.f16x2 r1243, r1181, r1240; +} +{ +sub.f16x2 r1246, r1172, r1173; +} +{ +mul.f16x2 r1249, r1246, r1169; +} +{ +add.f16x2 r1252, r1243, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1257, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1258, {low, high}; +} +{ +mul.f16x2 r1265, r1198, r1255; +} +{ +mul.f16x2 r1268, r1234, r1256; +} +{ +sub.f16x2 r1271, r1265, r1268; +} +{ +mul.f16x2 r1274, r1198, r1256; +} +{ +fma.rn.f16x2 r1277, r1234, r1255, r1274; +} +{ +mul.f16x2 r1281, r1216, r1257; +} +{ +mul.f16x2 r1284, r1252, r1258; +} +{ +sub.f16x2 r1287, r1281, r1284; +} +{ +mul.f16x2 r1290, r1216, r1258; +} +{ +fma.rn.f16x2 r1293, r1252, r1257, r1290; +} +{ +add.f16x2 r1297, r1086, r1174; +} +{ +add.f16x2 r1300, r1092, r1180; +} +{ +sub.f16x2 r1303, r1086, r1174; +} +{ +sub.f16x2 r1306, r1092, r1180; +} +{ +add.f16x2 r1309, r1110, r1271; +} +{ +add.f16x2 r1312, r1146, r1277; +} +{ +sub.f16x2 r1315, r1110, r1271; +} +{ +sub.f16x2 r1318, r1146, r1277; +} +{ +add.f16x2 r1321, r1128, r1287; +} +{ +add.f16x2 r1324, r1164, r1293; +} +{ +sub.f16x2 r1327, r1128, r1287; +} +{ +sub.f16x2 r1330, r1164, r1293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1334, {low, high}; +} +{ +neg.f16x2 r1335, r1334; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1344, r1345; +} +{ +add.f16x2 r1346, r1347, r1343; +} +{ +add.f16x2 r1349, r1338, r1339; +} +{ +mul.f16x2 r1352, r1349, r1333; +} +{ +add.f16x2 r1355, r1341, r1352; +} +{ +sub.f16x2 r1358, r1344, r1345; +} +{ +mul.f16x2 r1361, r1358, r1335; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +add.f16x2 r1367, r1338, r1339; +} +{ +mul.f16x2 r1370, r1367, r1333; +} +{ +add.f16x2 r1373, r1341, r1370; +} +{ +sub.f16x2 r1376, r1344, r1345; +} +{ +mul.f16x2 r1379, r1376, r1335; +} +{ +sub.f16x2 r1382, r1373, r1379; +} +{ +add.f16x2 r1385, r1344, r1345; +} +{ +mul.f16x2 r1388, r1385, r1333; +} +{ +add.f16x2 r1391, r1347, r1388; +} +{ +sub.f16x2 r1394, r1338, r1339; +} 
+{ +mul.f16x2 r1397, r1394, r1335; +} +{ +sub.f16x2 r1400, r1391, r1397; +} +{ +add.f16x2 r1403, r1344, r1345; +} +{ +mul.f16x2 r1406, r1403, r1333; +} +{ +add.f16x2 r1409, r1347, r1406; +} +{ +sub.f16x2 r1412, r1338, r1339; +} +{ +mul.f16x2 r1415, r1412, r1335; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1421, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1422, {low, high}; +} +{ +neg.f16x2 r1423, r1422; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1432, r1433; +} +{ +add.f16x2 r1434, r1435, r1431; +} +{ +add.f16x2 r1437, r1426, r1427; +} +{ +mul.f16x2 r1440, r1437, r1421; +} +{ +add.f16x2 r1443, r1429, r1440; +} +{ +sub.f16x2 r1446, r1432, r1433; +} +{ +mul.f16x2 r1449, r1446, r1423; +} +{ +add.f16x2 r1452, r1443, r1449; +} +{ +add.f16x2 r1455, r1426, r1427; +} +{ +mul.f16x2 r1458, r1455, r1421; +} +{ +add.f16x2 r1461, r1429, r1458; +} +{ +sub.f16x2 r1464, r1432, r1433; +} +{ +mul.f16x2 r1467, r1464, r1423; +} +{ +sub.f16x2 r1470, r1461, r1467; +} +{ +add.f16x2 r1473, r1432, r1433; +} +{ +mul.f16x2 r1476, r1473, r1421; +} +{ +add.f16x2 r1479, r1435, r1476; +} +{ +sub.f16x2 r1482, r1426, r1427; +} +{ +mul.f16x2 r1485, r1482, r1423; +} +{ +sub.f16x2 r1488, r1479, r1485; +} +{ +add.f16x2 r1491, r1432, r1433; +} +{ +mul.f16x2 r1494, r1491, r1421; +} +{ +add.f16x2 r1497, r1435, r1494; +} +{ +sub.f16x2 r1500, r1426, r1427; +} +{ +mul.f16x2 r1503, r1500, r1423; +} +{ +add.f16x2 r1506, r1497, r1503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1512, {low, high}; +} +{ +mul.f16x2 r1519, r1452, r1509; +} +{ +mul.f16x2 r1522, r1488, r1510; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1452, r1510; +} +{ +fma.rn.f16x2 r1531, r1488, r1509, r1528; +} +{ +mul.f16x2 r1535, r1470, r1511; +} +{ +mul.f16x2 r1538, r1506, r1512; +} +{ +sub.f16x2 r1541, r1535, r1538; +} +{ +mul.f16x2 r1544, r1470, r1512; +} +{ +fma.rn.f16x2 r1547, r1506, r1511, r1544; +} +{ +add.f16x2 r1551, r1340, r1428; +} +{ +add.f16x2 r1554, r1346, r1434; +} +{ +sub.f16x2 r1557, r1340, r1428; +} +{ +sub.f16x2 r1560, r1346, r1434; +} +{ +add.f16x2 r1563, r1364, r1525; +} +{ +add.f16x2 r1566, r1400, r1531; +} +{ +sub.f16x2 r1569, r1364, r1525; +} +{ +sub.f16x2 r1572, r1400, r1531; +} +{ +add.f16x2 r1575, r1382, r1541; +} +{ +add.f16x2 r1578, r1418, r1547; +} +{ +sub.f16x2 r1581, r1382, r1541; +} +{ +sub.f16x2 r1584, r1418, r1547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1590, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1593, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1595, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1596, {low, high}; +} +{ +mul.f16x2 r1609, r1563, r1587; +} +{ +mul.f16x2 r1612, r1566, r1588; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1563, r1588; +} +{ +fma.rn.f16x2 r1621, 
r1566, r1587, r1618; +} +{ +mul.f16x2 r1625, r1575, r1589; +} +{ +mul.f16x2 r1628, r1578, r1590; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r1575, r1590; +} +{ +fma.rn.f16x2 r1637, r1578, r1589, r1634; +} +{ +neg.f16x2 r1641, r1557; +} +{ +mul.f16x2 r1643, r1569, r1593; +} +{ +mul.f16x2 r1646, r1572, r1594; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1569, r1594; +} +{ +fma.rn.f16x2 r1655, r1572, r1593, r1652; +} +{ +mul.f16x2 r1659, r1581, r1595; +} +{ +mul.f16x2 r1662, r1584, r1596; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r1581, r1596; +} +{ +fma.rn.f16x2 r1671, r1584, r1595, r1668; +} +{ +add.f16x2 %0, r1297, r1551; +} +{ +add.f16x2 %1, r1300, r1554; +} +{ +sub.f16x2 %12, r1297, r1551; +} +{ +sub.f16x2 %13, r1300, r1554; +} +{ +add.f16x2 %2, r1309, r1615; +} +{ +add.f16x2 %3, r1312, r1621; +} +{ +sub.f16x2 %14, r1309, r1615; +} +{ +sub.f16x2 %15, r1312, r1621; +} +{ +add.f16x2 %4, r1321, r1631; +} +{ +add.f16x2 %5, r1324, r1637; +} +{ +sub.f16x2 %16, r1321, r1631; +} +{ +sub.f16x2 %17, r1324, r1637; +} +{ +add.f16x2 %6, r1303, r1560; +} +{ +add.f16x2 %7, r1306, r1641; +} +{ +sub.f16x2 %18, r1303, r1560; +} +{ +sub.f16x2 %19, r1306, r1641; +} +{ +add.f16x2 %8, r1315, r1649; +} +{ +add.f16x2 %9, r1318, r1655; +} +{ +sub.f16x2 %20, r1315, r1649; +} +{ +sub.f16x2 %21, r1318, r1655; +} +{ +add.f16x2 %10, r1327, r1665; +} +{ +add.f16x2 %11, r1330, r1671; +} +{ +sub.f16x2 %22, r1327, r1665; +} +{ +sub.f16x2 %23, r1330, r1671; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), 
"=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..a6c9fe99241d4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp16_inv.hpp.inc @@ -0,0 +1,4296 @@ +#ifndef CUFFTDX_FFT_144_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_144_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1146, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<230>; +.reg .b32 r<1744>; +.reg .b64 rd<5>; +mov.u32 r1731, %tid.y; +shl.b32 r1732, r1731, 1; +mov.u32 r1733, %24; +mad.lo.s32 r1734, r1732, 576, r1733; +mov.u32 r1735, 
%tid.x; +mov.f32 f196, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %33, %41; +} +{ +add.f16x2 r6, %25, r3; +} +{ +add.f16x2 r9, %34, %42; +} +{ +add.f16x2 r12, %26, r9; +} +{ +add.f16x2 r15, %33, %41; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %25, r18; +} +{ +sub.f16x2 r24, %34, %42; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %33, %41; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %25, r36; +} +{ +sub.f16x2 r42, %34, %42; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %34, %42; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %26, r54; +} +{ +sub.f16x2 r60, %33, %41; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %34, %42; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %26, r72; +} +{ +sub.f16x2 r78, %33, %41; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %37, %45; +} +{ +add.f16x2 r92, %29, r89; +} +{ +add.f16x2 r95, %38, %46; +} +{ +add.f16x2 r98, %30, r95; +} +{ +add.f16x2 r101, %37, %45; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %29, r104; +} +{ +sub.f16x2 r110, %38, %46; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %37, %45; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %29, r122; +} +{ +sub.f16x2 r128, %38, %46; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %38, %46; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 
r143, %30, r140; +} +{ +sub.f16x2 r146, %37, %45; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %38, %46; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %30, r158; +} +{ +sub.f16x2 r164, %37, %45; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f202, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r173, {low, high}; +} +mov.f32 f198, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r176, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r252, {low, high}; +} +{ +add.f16x2 r253, %35, %43; +} +{ +add.f16x2 r256, %27, r253; +} +{ +add.f16x2 r259, %36, %44; +} +{ +add.f16x2 r262, %28, r259; +} +{ +add.f16x2 r265, 
%35, %43; +} +{ +mul.f16x2 r268, r265, r251; +} +{ +add.f16x2 r271, %27, r268; +} +{ +sub.f16x2 r274, %36, %44; +} +{ +mul.f16x2 r277, r274, r252; +} +{ +add.f16x2 r280, r271, r277; +} +{ +add.f16x2 r283, %35, %43; +} +{ +mul.f16x2 r286, r283, r251; +} +{ +add.f16x2 r289, %27, r286; +} +{ +sub.f16x2 r292, %36, %44; +} +{ +mul.f16x2 r295, r292, r252; +} +{ +sub.f16x2 r298, r289, r295; +} +{ +add.f16x2 r301, %36, %44; +} +{ +mul.f16x2 r304, r301, r251; +} +{ +add.f16x2 r307, %28, r304; +} +{ +sub.f16x2 r310, %35, %43; +} +{ +mul.f16x2 r313, r310, r252; +} +{ +sub.f16x2 r316, r307, r313; +} +{ +add.f16x2 r319, %36, %44; +} +{ +mul.f16x2 r322, r319, r251; +} +{ +add.f16x2 r325, %28, r322; +} +{ +sub.f16x2 r328, %35, %43; +} +{ +mul.f16x2 r331, r328, r252; +} +{ +add.f16x2 r334, r325, r331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r337, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r338, {low, high}; +} +{ +add.f16x2 r339, %39, %47; +} +{ +add.f16x2 r342, %31, r339; +} +{ +add.f16x2 r345, %40, %48; +} +{ +add.f16x2 r348, %32, r345; +} +{ +add.f16x2 r351, %39, %47; +} +{ +mul.f16x2 r354, r351, r337; +} +{ +add.f16x2 r357, %31, r354; +} +{ +sub.f16x2 r360, %40, %48; +} +{ +mul.f16x2 r363, r360, r338; +} +{ +add.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r337; +} +{ +add.f16x2 r375, %31, r372; +} +{ +sub.f16x2 r378, %40, %48; +} +{ +mul.f16x2 r381, r378, r338; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %40, %48; +} +{ +mul.f16x2 r390, r387, r337; +} +{ +add.f16x2 r393, %32, r390; +} +{ +sub.f16x2 r396, %39, %47; +} +{ +mul.f16x2 r399, r396, r338; +} +{ +sub.f16x2 r402, r393, r399; +} +{ +add.f16x2 r405, %40, %48; +} +{ +mul.f16x2 r408, r405, r337; +} +{ +add.f16x2 r411, %32, r408; +} +{ +sub.f16x2 r414, %39, %47; +} +{ +mul.f16x2 r417, r414, r338; +} +{ +add.f16x2 r420, r411, r417; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r426, {low, high}; +} +{ +mul.f16x2 r433, r366, r423; +} +{ +mul.f16x2 r436, r402, r424; +} +{ +sub.f16x2 r439, r433, r436; +} +{ +mul.f16x2 r442, r366, r424; +} +{ +fma.rn.f16x2 r445, r402, r423, r442; +} +{ +mul.f16x2 r449, r384, r425; +} +{ +mul.f16x2 r452, r420, r426; +} +{ +sub.f16x2 r455, r449, r452; +} +{ +mul.f16x2 r458, r384, r426; +} +{ +fma.rn.f16x2 r461, r420, r425, r458; +} +{ +add.f16x2 r465, r256, r342; +} +{ +add.f16x2 r468, r262, r348; +} +{ +sub.f16x2 r471, r256, r342; +} +{ +sub.f16x2 r474, r262, r348; +} +{ +add.f16x2 r477, r280, r439; +} +{ +add.f16x2 r480, r316, r445; +} +{ +sub.f16x2 r483, r280, r439; +} +{ +sub.f16x2 r486, r316, r445; +} +{ +add.f16x2 r489, r298, r455; +} +{ +add.f16x2 r492, r334, r461; +} +{ +sub.f16x2 r495, r298, r455; +} +{ +sub.f16x2 r498, r334, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r501, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r502, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r504, {low, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r507, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 
r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r510, {low, high}; +} +{ +mul.f16x2 r523, r477, r501; +} +{ +mul.f16x2 r526, r480, r502; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r477, r502; +} +{ +fma.rn.f16x2 r535, r480, r501, r532; +} +{ +mul.f16x2 r539, r489, r503; +} +{ +mul.f16x2 r542, r492, r504; +} +{ +sub.f16x2 r545, r539, r542; +} +{ +mul.f16x2 r548, r489, r504; +} +{ +fma.rn.f16x2 r551, r492, r503, r548; +} +{ +neg.f16x2 r555, r474; +} +{ +mul.f16x2 r557, r483, r507; +} +{ +mul.f16x2 r560, r486, r508; +} +{ +sub.f16x2 r563, r557, r560; +} +{ +mul.f16x2 r566, r483, r508; +} +{ +fma.rn.f16x2 r569, r486, r507, r566; +} +{ +mul.f16x2 r573, r495, r509; +} +{ +mul.f16x2 r576, r498, r510; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r495, r510; +} +{ +fma.rn.f16x2 r585, r498, r509, r582; +} +{ +add.f16x2 r589, r215, r465; +} +{ +add.f16x2 r592, r218, r468; +} +{ +sub.f16x2 r595, r215, r465; +} +{ +sub.f16x2 r598, r218, r468; +} +{ +add.f16x2 r601, r227, r529; +} +{ +add.f16x2 r604, r230, r535; +} +{ +sub.f16x2 r607, r227, r529; +} +{ +sub.f16x2 r610, r230, r535; +} +{ +add.f16x2 r613, r239, r545; +} +{ +add.f16x2 r616, r242, r551; +} +{ +sub.f16x2 r619, r239, r545; +} +{ +sub.f16x2 r622, r242, r551; +} +{ +add.f16x2 r625, r221, r555; +} +{ +add.f16x2 r628, r224, r471; +} +{ +sub.f16x2 r631, r221, r555; +} +{ +sub.f16x2 r634, r224, r471; +} +{ +add.f16x2 r637, r233, r563; +} +{ +add.f16x2 r640, r236, r569; +} +{ +sub.f16x2 r643, r233, r563; +} +{ +sub.f16x2 r646, r236, r569; +} +{ +add.f16x2 r649, r245, r579; +} +{ +add.f16x2 r652, r248, r585; +} +{ +sub.f16x2 r655, r245, r579; +} +{ +sub.f16x2 r658, r248, r585; +} +mul.wide.u32 rd2, r1735, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1736, rd3; +mul.lo.s32 r1737, r1736, 12; +sub.s32 r1738, r1735, r1737; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1739, rd4; +and.b32 r1740, r1739, 1073741822; +mad.lo.s32 r1741, r1740, 576, 
r1734; +cvt.rn.f32.u32 f227, r1738; +mul.f32 f228, f227, 0f3D32B8C2; +cos.approx.f32 f101, f228; +sin.approx.f32 f229, f228; +neg.f32 f102, f229; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r664, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r666, {high, high}; +} +{ +mul.f16x2 r668, r604, r666; +} +{ +fma.rn.f16x2 r671, r601, r664, r668; +} +{ +mul.f16x2 r675, r601, r666; +} +{ +neg.f16x2 r678, r675; +} +{ +fma.rn.f16x2 r680, r604, r664, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r684, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r686, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r686, r688; +} +{ +mul.f16x2 r692, r661, r684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r695, {high, low}; +} +{ +fma.rn.f16x2 r697, r689, r695, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r701, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r703, {high, high}; +} +{ +mul.f16x2 r705, r616, r703; +} +{ +fma.rn.f16x2 r708, r613, r701, r705; +} +{ +mul.f16x2 r712, r613, r703; +} +{ +neg.f16x2 r715, r712; +} +{ +fma.rn.f16x2 r717, r616, r701, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r721, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r723, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r725, {low, high}; +} +{ +mul.f16x2 r726, r723, r725; +} +{ +mul.f16x2 r729, r697, r721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r732, {high, low}; +} +{ +fma.rn.f16x2 r734, r726, r732, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r738, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r740, {high, high}; +} +{ +mul.f16x2 r742, r628, r740; +} +{ +fma.rn.f16x2 r745, r625, r738, r742; +} +{ +mul.f16x2 r749, r625, r740; +} +{ +neg.f16x2 r752, r749; +} +{ +fma.rn.f16x2 r754, r628, r738, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r758, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r760, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r762, {low, high}; +} +{ +mul.f16x2 r763, r760, r762; +} +{ +mul.f16x2 r766, r734, r758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r769, {high, low}; +} +{ +fma.rn.f16x2 r771, r763, r769, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r775, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r777, {high, high}; +} +{ +mul.f16x2 r779, r640, r777; +} +{ +fma.rn.f16x2 r782, r637, r775, r779; +} +{ +mul.f16x2 r786, r637, r777; +} +{ +neg.f16x2 r789, r786; +} +{ +fma.rn.f16x2 r791, r640, r775, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r797, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r799, {low, high}; +} +{ +mul.f16x2 r800, r797, r799; +} +{ +mul.f16x2 r803, r771, r795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r806, {high, low}; +} +{ +fma.rn.f16x2 r808, r800, r806, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r812, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r814, {high, high}; +} +{ +mul.f16x2 r816, r652, r814; +} +{ +fma.rn.f16x2 r819, r649, r812, r816; +} +{ +mul.f16x2 r823, r649, r814; +} +{ +neg.f16x2 r826, r823; +} +{ +fma.rn.f16x2 r828, r652, r812, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; 
+mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r834, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r836, {low, high}; +} +{ +mul.f16x2 r837, r834, r836; +} +{ +mul.f16x2 r840, r808, r832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r843, {high, low}; +} +{ +fma.rn.f16x2 r845, r837, r843, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r849, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r851, {high, high}; +} +{ +mul.f16x2 r853, r598, r851; +} +{ +fma.rn.f16x2 r856, r595, r849, r853; +} +{ +mul.f16x2 r860, r595, r851; +} +{ +neg.f16x2 r863, r860; +} +{ +fma.rn.f16x2 r865, r598, r849, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r871, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r873, {low, high}; +} +{ +mul.f16x2 r874, r871, r873; +} +{ +mul.f16x2 r877, r845, r869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r880, {high, low}; +} +{ +fma.rn.f16x2 r882, r874, r880, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r886, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r888, {high, high}; +} +{ +mul.f16x2 r890, r610, r888; +} +{ +fma.rn.f16x2 r893, r607, r886, r890; +} +{ +mul.f16x2 r897, r607, r888; +} +{ +neg.f16x2 r900, r897; +} +{ +fma.rn.f16x2 r902, r610, r886, r900; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r908, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r910, {low, high}; +} +{ +mul.f16x2 r911, r908, r910; +} +{ +mul.f16x2 r914, r882, r906; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r882; +mov.b32 r917, {high, low}; +} +{ +fma.rn.f16x2 r919, r911, r917, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r925, {high, high}; +} +{ +mul.f16x2 r927, r622, r925; +} +{ +fma.rn.f16x2 r930, r619, r923, r927; +} +{ +mul.f16x2 r934, r619, r925; +} +{ +neg.f16x2 r937, r934; +} +{ +fma.rn.f16x2 r939, r622, r923, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r945, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r947, {low, high}; +} +{ +mul.f16x2 r948, r945, r947; +} +{ +mul.f16x2 r951, r919, r943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r954, {high, low}; +} +{ +fma.rn.f16x2 r956, r948, r954, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r962, {high, high}; +} +{ +mul.f16x2 r964, r634, r962; +} +{ +fma.rn.f16x2 r967, r631, r960, r964; +} +{ +mul.f16x2 r971, r631, r962; +} +{ +neg.f16x2 r974, r971; +} +{ +fma.rn.f16x2 r976, r634, r960, r974; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r980, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r982, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r984, {low, high}; +} +{ +mul.f16x2 r985, r982, r984; +} +{ +mul.f16x2 r988, r956, r980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r991, {high, low}; +} +{ +fma.rn.f16x2 r993, r985, r991, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r999, {high, high}; +} +{ +mul.f16x2 r1001, r646, r999; +} +{ +fma.rn.f16x2 r1004, r643, r997, r1001; 
+} +{ +mul.f16x2 r1008, r643, r999; +} +{ +neg.f16x2 r1011, r1008; +} +{ +fma.rn.f16x2 r1013, r646, r997, r1011; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1019, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1021, {low, high}; +} +{ +mul.f16x2 r1022, r1019, r1021; +} +{ +mul.f16x2 r1025, r993, r1017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r1028, {high, low}; +} +{ +fma.rn.f16x2 r1030, r1022, r1028, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1036, {high, high}; +} +{ +mul.f16x2 r1038, r658, r1036; +} +{ +fma.rn.f16x2 r1041, r655, r1034, r1038; +} +{ +mul.f16x2 r1045, r655, r1036; +} +{ +neg.f16x2 r1048, r1045; +} +{ +fma.rn.f16x2 r1050, r658, r1034, r1048; +} +barrier.sync 0; +mad.lo.s32 r1742, r1738, 96, r1741; +st.shared.v4.f32 [r1742], {r589, r592, r671, r680}; +st.shared.v4.f32 [r1742+16], {r708, r717, r745, r754}; +st.shared.v4.f32 [r1742+32], {r782, r791, r819, r828}; +st.shared.v4.f32 [r1742+48], {r856, r865, r893, r902}; +st.shared.v4.f32 [r1742+64], {r930, r939, r967, r976}; +st.shared.v4.f32 [r1742+80], {r1004, r1013, r1041, r1050}; +barrier.sync 0; +mad.lo.s32 r1743, r1738, -88, r1742; +ld.shared.u32 r1077, [r1743]; +ld.shared.u32 r1083, [r1743+4]; +ld.shared.u32 r1327, [r1743+96]; +ld.shared.u32 r1333, [r1743+100]; +ld.shared.u32 r1163, [r1743+192]; +ld.shared.u32 r1169, [r1743+196]; +ld.shared.u32 r1413, [r1743+288]; +ld.shared.u32 r1419, [r1743+292]; +ld.shared.u32 r1074, [r1743+384]; +ld.shared.u32 r1080, [r1743+388]; +ld.shared.u32 r1324, [r1743+480]; +ld.shared.u32 r1330, [r1743+484]; +ld.shared.u32 r1160, [r1743+576]; +ld.shared.u32 r1166, [r1743+580]; +ld.shared.u32 r1410, [r1743+672]; +ld.shared.u32 r1416, [r1743+676]; +ld.shared.u32 r1075, 
[r1743+768]; +ld.shared.u32 r1081, [r1743+772]; +ld.shared.u32 r1325, [r1743+864]; +ld.shared.u32 r1331, [r1743+868]; +ld.shared.u32 r1161, [r1743+960]; +ld.shared.u32 r1167, [r1743+964]; +ld.shared.u32 r1411, [r1743+1056]; +ld.shared.u32 r1417, [r1743+1060]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1072, {low, high}; +} +{ +add.f16x2 r1073, r1074, r1075; +} +{ +add.f16x2 r1076, r1077, r1073; +} +{ +add.f16x2 r1079, r1080, r1081; +} +{ +add.f16x2 r1082, r1083, r1079; +} +{ +add.f16x2 r1085, r1074, r1075; +} +{ +mul.f16x2 r1088, r1085, r1071; +} +{ +add.f16x2 r1091, r1077, r1088; +} +{ +sub.f16x2 r1094, r1080, r1081; +} +{ +mul.f16x2 r1097, r1094, r1072; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +add.f16x2 r1103, r1074, r1075; +} +{ +mul.f16x2 r1106, r1103, r1071; +} +{ +add.f16x2 r1109, r1077, r1106; +} +{ +sub.f16x2 r1112, r1080, r1081; +} +{ +mul.f16x2 r1115, r1112, r1072; +} +{ +sub.f16x2 r1118, r1109, r1115; +} +{ +add.f16x2 r1121, r1080, r1081; +} +{ +mul.f16x2 r1124, r1121, r1071; +} +{ +add.f16x2 r1127, r1083, r1124; +} +{ +sub.f16x2 r1130, r1074, r1075; +} +{ +mul.f16x2 r1133, r1130, r1072; +} +{ +sub.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r1080, r1081; +} +{ +mul.f16x2 r1142, r1139, r1071; +} +{ +add.f16x2 r1145, r1083, r1142; +} +{ +sub.f16x2 r1148, r1074, r1075; +} +{ +mul.f16x2 r1151, r1148, r1072; +} +{ +add.f16x2 r1154, r1145, r1151; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1158, {low, high}; +} +{ +add.f16x2 r1159, r1160, r1161; +} +{ +add.f16x2 r1162, r1163, r1159; +} +{ +add.f16x2 r1165, r1166, r1167; +} +{ +add.f16x2 r1168, r1169, r1165; +} +{ +add.f16x2 r1171, r1160, r1161; +} +{ +mul.f16x2 r1174, r1171, r1157; +} 
+{ +add.f16x2 r1177, r1163, r1174; +} +{ +sub.f16x2 r1180, r1166, r1167; +} +{ +mul.f16x2 r1183, r1180, r1158; +} +{ +add.f16x2 r1186, r1177, r1183; +} +{ +add.f16x2 r1189, r1160, r1161; +} +{ +mul.f16x2 r1192, r1189, r1157; +} +{ +add.f16x2 r1195, r1163, r1192; +} +{ +sub.f16x2 r1198, r1166, r1167; +} +{ +mul.f16x2 r1201, r1198, r1158; +} +{ +sub.f16x2 r1204, r1195, r1201; +} +{ +add.f16x2 r1207, r1166, r1167; +} +{ +mul.f16x2 r1210, r1207, r1157; +} +{ +add.f16x2 r1213, r1169, r1210; +} +{ +sub.f16x2 r1216, r1160, r1161; +} +{ +mul.f16x2 r1219, r1216, r1158; +} +{ +sub.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, r1166, r1167; +} +{ +mul.f16x2 r1228, r1225, r1157; +} +{ +add.f16x2 r1231, r1169, r1228; +} +{ +sub.f16x2 r1234, r1160, r1161; +} +{ +mul.f16x2 r1237, r1234, r1158; +} +{ +add.f16x2 r1240, r1231, r1237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1246, {low, high}; +} +{ +mul.f16x2 r1253, r1186, r1243; +} +{ +mul.f16x2 r1256, r1222, r1244; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ +mul.f16x2 r1262, r1186, r1244; +} +{ +fma.rn.f16x2 r1265, r1222, r1243, r1262; +} +{ +mul.f16x2 r1269, r1204, r1245; +} +{ +mul.f16x2 r1272, r1240, r1246; +} +{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1204, r1246; +} +{ +fma.rn.f16x2 r1281, r1240, r1245, r1278; +} +{ +add.f16x2 r1285, r1076, r1162; +} +{ +add.f16x2 r1288, r1082, r1168; +} +{ +sub.f16x2 r1291, r1076, r1162; +} +{ +sub.f16x2 r1294, r1082, r1168; +} +{ +add.f16x2 r1297, r1100, r1259; +} +{ +add.f16x2 r1300, r1136, r1265; +} +{ +sub.f16x2 r1303, r1100, r1259; +} +{ +sub.f16x2 r1306, r1136, r1265; +} +{ 
+add.f16x2 r1309, r1118, r1275; +} +{ +add.f16x2 r1312, r1154, r1281; +} +{ +sub.f16x2 r1315, r1118, r1275; +} +{ +sub.f16x2 r1318, r1154, r1281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1322, {low, high}; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1330, r1331; +} +{ +add.f16x2 r1332, r1333, r1329; +} +{ +add.f16x2 r1335, r1324, r1325; +} +{ +mul.f16x2 r1338, r1335, r1321; +} +{ +add.f16x2 r1341, r1327, r1338; +} +{ +sub.f16x2 r1344, r1330, r1331; +} +{ +mul.f16x2 r1347, r1344, r1322; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1324, r1325; +} +{ +mul.f16x2 r1356, r1353, r1321; +} +{ +add.f16x2 r1359, r1327, r1356; +} +{ +sub.f16x2 r1362, r1330, r1331; +} +{ +mul.f16x2 r1365, r1362, r1322; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, r1330, r1331; +} +{ +mul.f16x2 r1374, r1371, r1321; +} +{ +add.f16x2 r1377, r1333, r1374; +} +{ +sub.f16x2 r1380, r1324, r1325; +} +{ +mul.f16x2 r1383, r1380, r1322; +} +{ +sub.f16x2 r1386, r1377, r1383; +} +{ +add.f16x2 r1389, r1330, r1331; +} +{ +mul.f16x2 r1392, r1389, r1321; +} +{ +add.f16x2 r1395, r1333, r1392; +} +{ +sub.f16x2 r1398, r1324, r1325; +} +{ +mul.f16x2 r1401, r1398, r1322; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1407, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1408, {low, high}; +} +{ +add.f16x2 r1409, r1410, r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1416, r1417; +} +{ +add.f16x2 r1418, r1419, r1415; +} +{ +add.f16x2 r1421, r1410, r1411; +} +{ +mul.f16x2 r1424, r1421, r1407; +} +{ +add.f16x2 r1427, r1413, r1424; +} +{ +sub.f16x2 r1430, r1416, r1417; +} +{ +mul.f16x2 r1433, r1430, r1408; 
+} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +add.f16x2 r1439, r1410, r1411; +} +{ +mul.f16x2 r1442, r1439, r1407; +} +{ +add.f16x2 r1445, r1413, r1442; +} +{ +sub.f16x2 r1448, r1416, r1417; +} +{ +mul.f16x2 r1451, r1448, r1408; +} +{ +sub.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1416, r1417; +} +{ +mul.f16x2 r1460, r1457, r1407; +} +{ +add.f16x2 r1463, r1419, r1460; +} +{ +sub.f16x2 r1466, r1410, r1411; +} +{ +mul.f16x2 r1469, r1466, r1408; +} +{ +sub.f16x2 r1472, r1463, r1469; +} +{ +add.f16x2 r1475, r1416, r1417; +} +{ +mul.f16x2 r1478, r1475, r1407; +} +{ +add.f16x2 r1481, r1419, r1478; +} +{ +sub.f16x2 r1484, r1410, r1411; +} +{ +mul.f16x2 r1487, r1484, r1408; +} +{ +add.f16x2 r1490, r1481, r1487; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1493, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1495, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1496, {low, high}; +} +{ +mul.f16x2 r1503, r1436, r1493; +} +{ +mul.f16x2 r1506, r1472, r1494; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1436, r1494; +} +{ +fma.rn.f16x2 r1515, r1472, r1493, r1512; +} +{ +mul.f16x2 r1519, r1454, r1495; +} +{ +mul.f16x2 r1522, r1490, r1496; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1454, r1496; +} +{ +fma.rn.f16x2 r1531, r1490, r1495, r1528; +} +{ +add.f16x2 r1535, r1326, r1412; +} +{ +add.f16x2 r1538, r1332, r1418; +} +{ +sub.f16x2 r1541, r1326, r1412; +} +{ +sub.f16x2 r1544, r1332, r1418; +} +{ +add.f16x2 r1547, r1350, r1509; +} +{ +add.f16x2 r1550, r1386, r1515; +} +{ +sub.f16x2 r1553, r1350, r1509; +} +{ +sub.f16x2 r1556, r1386, r1515; +} +{ +add.f16x2 r1559, r1368, r1525; +} +{ +add.f16x2 r1562, r1404, r1531; +} +{ +sub.f16x2 r1565, r1368, r1525; +} +{ 
+sub.f16x2 r1568, r1404, r1531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1574, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1593, r1547, r1571; +} +{ +mul.f16x2 r1596, r1550, r1572; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r1547, r1572; +} +{ +fma.rn.f16x2 r1605, r1550, r1571, r1602; +} +{ +mul.f16x2 r1609, r1559, r1573; +} +{ +mul.f16x2 r1612, r1562, r1574; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1559, r1574; +} +{ +fma.rn.f16x2 r1621, r1562, r1573, r1618; +} +{ +neg.f16x2 r1625, r1544; +} +{ +mul.f16x2 r1627, r1553, r1577; +} +{ +mul.f16x2 r1630, r1556, r1578; +} +{ +sub.f16x2 r1633, r1627, r1630; +} +{ +mul.f16x2 r1636, r1553, r1578; +} +{ +fma.rn.f16x2 r1639, r1556, r1577, r1636; +} +{ +mul.f16x2 r1643, r1565, r1579; +} +{ +mul.f16x2 r1646, r1568, r1580; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1565, r1580; +} +{ +fma.rn.f16x2 r1655, r1568, r1579, r1652; +} +{ +add.f16x2 %0, r1285, r1535; +} +{ +add.f16x2 %1, r1288, r1538; +} +{ +sub.f16x2 %12, r1285, r1535; +} +{ +sub.f16x2 %13, r1288, r1538; +} +{ +add.f16x2 %2, r1297, r1599; +} +{ +add.f16x2 %3, r1300, r1605; +} +{ +sub.f16x2 %14, r1297, 
r1599; +} +{ +sub.f16x2 %15, r1300, r1605; +} +{ +add.f16x2 %4, r1309, r1615; +} +{ +add.f16x2 %5, r1312, r1621; +} +{ +sub.f16x2 %16, r1309, r1615; +} +{ +sub.f16x2 %17, r1312, r1621; +} +{ +add.f16x2 %6, r1291, r1625; +} +{ +add.f16x2 %7, r1294, r1541; +} +{ +sub.f16x2 %18, r1291, r1625; +} +{ +sub.f16x2 %19, r1294, r1541; +} +{ +add.f16x2 %8, r1303, r1633; +} +{ +add.f16x2 %9, r1306, r1639; +} +{ +sub.f16x2 %20, r1303, r1633; +} +{ +sub.f16x2 %21, r1306, r1639; +} +{ +add.f16x2 %10, r1315, r1649; +} +{ +add.f16x2 %11, r1318, r1655; +} +{ +sub.f16x2 %22, r1315, r1649; +} +{ +sub.f16x2 %23, r1318, r1655; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1147, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<230>; +.reg .b32 r<1741>; +.reg .b64 rd<4>; +mov.u32 r1731, %tid.y; +mov.u32 r1732, %24; +mad.lo.s32 r1733, r1731, 576, r1732; +mov.u32 r1734, %tid.x; +mov.f32 f196, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1, {low, high}; +} +mov.f32 f200, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %33, %41; +} +{ +add.f16x2 r6, %25, r3; +} +{ +add.f16x2 r9, %34, %42; +} +{ +add.f16x2 r12, %26, r9; +} +{ +add.f16x2 r15, %33, %41; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %25, r18; +} +{ +sub.f16x2 r24, %34, %42; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %33, %41; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %25, r36; +} +{ +sub.f16x2 r42, %34, %42; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %34, %42; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %26, r54; +} +{ +sub.f16x2 r60, %33, %41; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %34, %42; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %26, r72; +} +{ +sub.f16x2 r78, %33, %41; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %37, %45; +} +{ +add.f16x2 r92, %29, r89; +} +{ +add.f16x2 r95, %38, %46; +} +{ +add.f16x2 r98, %30, 
r95; +} +{ +add.f16x2 r101, %37, %45; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %29, r104; +} +{ +sub.f16x2 r110, %38, %46; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %37, %45; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %29, r122; +} +{ +sub.f16x2 r128, %38, %46; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %38, %46; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %30, r140; +} +{ +sub.f16x2 r146, %37, %45; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %38, %46; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %30, r158; +} +{ +sub.f16x2 r164, %37, %45; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f202, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r173, {low, high}; +} +mov.f32 f198, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r176, {low, high}; +} +mov.f32 f123, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, 
r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r252, {low, high}; +} +{ +add.f16x2 r253, %35, %43; +} +{ +add.f16x2 r256, %27, r253; +} +{ +add.f16x2 r259, %36, %44; +} +{ +add.f16x2 r262, %28, r259; +} +{ +add.f16x2 r265, %35, %43; +} +{ +mul.f16x2 r268, r265, r251; +} +{ +add.f16x2 r271, %27, r268; +} +{ +sub.f16x2 r274, %36, %44; +} +{ +mul.f16x2 r277, r274, r252; +} +{ +add.f16x2 r280, r271, r277; +} +{ +add.f16x2 r283, %35, %43; +} +{ +mul.f16x2 r286, r283, r251; +} +{ +add.f16x2 r289, %27, r286; +} +{ +sub.f16x2 r292, %36, %44; +} +{ +mul.f16x2 r295, r292, r252; +} +{ +sub.f16x2 r298, r289, r295; +} +{ +add.f16x2 r301, %36, %44; +} +{ +mul.f16x2 r304, r301, r251; +} +{ +add.f16x2 r307, %28, r304; +} +{ +sub.f16x2 r310, %35, %43; +} +{ +mul.f16x2 r313, r310, r252; +} +{ +sub.f16x2 r316, r307, r313; +} +{ +add.f16x2 r319, %36, %44; +} +{ +mul.f16x2 r322, r319, r251; +} +{ +add.f16x2 r325, %28, r322; +} +{ +sub.f16x2 r328, %35, %43; +} +{ +mul.f16x2 r331, r328, r252; +} +{ +add.f16x2 r334, r325, r331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r337, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r338, {low, high}; +} +{ +add.f16x2 r339, %39, %47; +} +{ +add.f16x2 r342, %31, r339; +} +{ +add.f16x2 r345, %40, %48; +} +{ +add.f16x2 r348, %32, r345; +} +{ +add.f16x2 r351, %39, %47; +} +{ +mul.f16x2 r354, r351, r337; +} +{ +add.f16x2 r357, %31, r354; +} +{ +sub.f16x2 r360, %40, %48; +} +{ +mul.f16x2 r363, r360, r338; +} +{ +add.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r337; +} +{ +add.f16x2 r375, %31, r372; +} +{ +sub.f16x2 r378, %40, %48; +} +{ 
+mul.f16x2 r381, r378, r338; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %40, %48; +} +{ +mul.f16x2 r390, r387, r337; +} +{ +add.f16x2 r393, %32, r390; +} +{ +sub.f16x2 r396, %39, %47; +} +{ +mul.f16x2 r399, r396, r338; +} +{ +sub.f16x2 r402, r393, r399; +} +{ +add.f16x2 r405, %40, %48; +} +{ +mul.f16x2 r408, r405, r337; +} +{ +add.f16x2 r411, %32, r408; +} +{ +sub.f16x2 r414, %39, %47; +} +{ +mul.f16x2 r417, r414, r338; +} +{ +add.f16x2 r420, r411, r417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r426, {low, high}; +} +{ +mul.f16x2 r433, r366, r423; +} +{ +mul.f16x2 r436, r402, r424; +} +{ +sub.f16x2 r439, r433, r436; +} +{ +mul.f16x2 r442, r366, r424; +} +{ +fma.rn.f16x2 r445, r402, r423, r442; +} +{ +mul.f16x2 r449, r384, r425; +} +{ +mul.f16x2 r452, r420, r426; +} +{ +sub.f16x2 r455, r449, r452; +} +{ +mul.f16x2 r458, r384, r426; +} +{ +fma.rn.f16x2 r461, r420, r425, r458; +} +{ +add.f16x2 r465, r256, r342; +} +{ +add.f16x2 r468, r262, r348; +} +{ +sub.f16x2 r471, r256, r342; +} +{ +sub.f16x2 r474, r262, r348; +} +{ +add.f16x2 r477, r280, r439; +} +{ +add.f16x2 r480, r316, r445; +} +{ +sub.f16x2 r483, r280, r439; +} +{ +sub.f16x2 r486, r316, r445; +} +{ +add.f16x2 r489, r298, r455; +} +{ +add.f16x2 r492, r334, r461; +} +{ +sub.f16x2 r495, r298, r455; +} +{ +sub.f16x2 r498, r334, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r501, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r502, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 
high, f202; +mov.b32 r503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r504, {low, high}; +} +mov.f32 f124, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r507, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r510, {low, high}; +} +{ +mul.f16x2 r523, r477, r501; +} +{ +mul.f16x2 r526, r480, r502; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r477, r502; +} +{ +fma.rn.f16x2 r535, r480, r501, r532; +} +{ +mul.f16x2 r539, r489, r503; +} +{ +mul.f16x2 r542, r492, r504; +} +{ +sub.f16x2 r545, r539, r542; +} +{ +mul.f16x2 r548, r489, r504; +} +{ +fma.rn.f16x2 r551, r492, r503, r548; +} +{ +neg.f16x2 r555, r474; +} +{ +mul.f16x2 r557, r483, r507; +} +{ +mul.f16x2 r560, r486, r508; +} +{ +sub.f16x2 r563, r557, r560; +} +{ +mul.f16x2 r566, r483, r508; +} +{ +fma.rn.f16x2 r569, r486, r507, r566; +} +{ +mul.f16x2 r573, r495, r509; +} +{ +mul.f16x2 r576, r498, r510; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r495, r510; +} +{ +fma.rn.f16x2 r585, r498, r509, r582; +} +{ +add.f16x2 r589, r215, r465; +} +{ +add.f16x2 r592, r218, r468; +} +{ +sub.f16x2 r595, r215, r465; +} +{ +sub.f16x2 r598, r218, r468; +} +{ +add.f16x2 r601, r227, r529; +} +{ +add.f16x2 r604, r230, r535; +} +{ +sub.f16x2 r607, r227, r529; +} +{ +sub.f16x2 r610, r230, r535; +} +{ +add.f16x2 r613, r239, r545; +} +{ +add.f16x2 r616, r242, r551; +} +{ +sub.f16x2 r619, r239, r545; +} +{ +sub.f16x2 r622, r242, r551; +} +{ +add.f16x2 r625, r221, r555; +} +{ +add.f16x2 r628, r224, r471; +} +{ +sub.f16x2 r631, r221, r555; +} +{ +sub.f16x2 r634, r224, r471; +} +{ +add.f16x2 r637, r233, r563; +} +{ +add.f16x2 
r640, r236, r569; +} +{ +sub.f16x2 r643, r233, r563; +} +{ +sub.f16x2 r646, r236, r569; +} +{ +add.f16x2 r649, r245, r579; +} +{ +add.f16x2 r652, r248, r585; +} +{ +sub.f16x2 r655, r245, r579; +} +{ +sub.f16x2 r658, r248, r585; +} +mul.wide.u32 rd2, r1734, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1735, rd3; +mul.lo.s32 r1736, r1735, 12; +sub.s32 r1737, r1734, r1736; +mad.lo.s32 r1738, r1735, 576, r1733; +cvt.rn.f32.u32 f227, r1737; +mul.f32 f228, f227, 0f3D32B8C2; +cos.approx.f32 f101, f228; +sin.approx.f32 f229, f228; +neg.f32 f102, f229; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r664, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r666, {high, high}; +} +{ +mul.f16x2 r668, r604, r666; +} +{ +fma.rn.f16x2 r671, r601, r664, r668; +} +{ +mul.f16x2 r675, r601, r666; +} +{ +neg.f16x2 r678, r675; +} +{ +fma.rn.f16x2 r680, r604, r664, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r684, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r686, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r686, r688; +} +{ +mul.f16x2 r692, r661, r684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r695, {high, low}; +} +{ +fma.rn.f16x2 r697, r689, r695, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r701, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r703, {high, high}; +} +{ +mul.f16x2 r705, r616, r703; +} +{ +fma.rn.f16x2 r708, r613, r701, r705; +} +{ +mul.f16x2 r712, r613, r703; +} +{ +neg.f16x2 r715, r712; +} +{ +fma.rn.f16x2 r717, r616, r701, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r721, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 
r723, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r725, {low, high}; +} +{ +mul.f16x2 r726, r723, r725; +} +{ +mul.f16x2 r729, r697, r721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r732, {high, low}; +} +{ +fma.rn.f16x2 r734, r726, r732, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r738, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r740, {high, high}; +} +{ +mul.f16x2 r742, r628, r740; +} +{ +fma.rn.f16x2 r745, r625, r738, r742; +} +{ +mul.f16x2 r749, r625, r740; +} +{ +neg.f16x2 r752, r749; +} +{ +fma.rn.f16x2 r754, r628, r738, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r758, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r760, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r762, {low, high}; +} +{ +mul.f16x2 r763, r760, r762; +} +{ +mul.f16x2 r766, r734, r758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r769, {high, low}; +} +{ +fma.rn.f16x2 r771, r763, r769, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r775, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r777, {high, high}; +} +{ +mul.f16x2 r779, r640, r777; +} +{ +fma.rn.f16x2 r782, r637, r775, r779; +} +{ +mul.f16x2 r786, r637, r777; +} +{ +neg.f16x2 r789, r786; +} +{ +fma.rn.f16x2 r791, r640, r775, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r797, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r799, {low, high}; +} +{ +mul.f16x2 r800, r797, r799; +} +{ +mul.f16x2 r803, r771, r795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r806, {high, low}; +} +{ +fma.rn.f16x2 r808, r800, r806, r803; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r812, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r814, {high, high}; +} +{ +mul.f16x2 r816, r652, r814; +} +{ +fma.rn.f16x2 r819, r649, r812, r816; +} +{ +mul.f16x2 r823, r649, r814; +} +{ +neg.f16x2 r826, r823; +} +{ +fma.rn.f16x2 r828, r652, r812, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r834, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r836, {low, high}; +} +{ +mul.f16x2 r837, r834, r836; +} +{ +mul.f16x2 r840, r808, r832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r843, {high, low}; +} +{ +fma.rn.f16x2 r845, r837, r843, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r849, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r851, {high, high}; +} +{ +mul.f16x2 r853, r598, r851; +} +{ +fma.rn.f16x2 r856, r595, r849, r853; +} +{ +mul.f16x2 r860, r595, r851; +} +{ +neg.f16x2 r863, r860; +} +{ +fma.rn.f16x2 r865, r598, r849, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r871, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r873, {low, high}; +} +{ +mul.f16x2 r874, r871, r873; +} +{ +mul.f16x2 r877, r845, r869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r880, {high, low}; +} +{ +fma.rn.f16x2 r882, r874, r880, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r886, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r888, {high, high}; +} +{ +mul.f16x2 r890, r610, r888; +} +{ +fma.rn.f16x2 r893, r607, r886, r890; +} +{ +mul.f16x2 r897, r607, r888; +} +{ +neg.f16x2 r900, r897; +} +{ +fma.rn.f16x2 r902, 
r610, r886, r900; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r908, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r910, {low, high}; +} +{ +mul.f16x2 r911, r908, r910; +} +{ +mul.f16x2 r914, r882, r906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r917, {high, low}; +} +{ +fma.rn.f16x2 r919, r911, r917, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r925, {high, high}; +} +{ +mul.f16x2 r927, r622, r925; +} +{ +fma.rn.f16x2 r930, r619, r923, r927; +} +{ +mul.f16x2 r934, r619, r925; +} +{ +neg.f16x2 r937, r934; +} +{ +fma.rn.f16x2 r939, r622, r923, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r945, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r947, {low, high}; +} +{ +mul.f16x2 r948, r945, r947; +} +{ +mul.f16x2 r951, r919, r943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r954, {high, low}; +} +{ +fma.rn.f16x2 r956, r948, r954, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r962, {high, high}; +} +{ +mul.f16x2 r964, r634, r962; +} +{ +fma.rn.f16x2 r967, r631, r960, r964; +} +{ +mul.f16x2 r971, r631, r962; +} +{ +neg.f16x2 r974, r971; +} +{ +fma.rn.f16x2 r976, r634, r960, r974; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r980, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r982, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r984, {low, high}; +} +{ +mul.f16x2 r985, r982, r984; +} 
+{ +mul.f16x2 r988, r956, r980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r991, {high, low}; +} +{ +fma.rn.f16x2 r993, r985, r991, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r999, {high, high}; +} +{ +mul.f16x2 r1001, r646, r999; +} +{ +fma.rn.f16x2 r1004, r643, r997, r1001; +} +{ +mul.f16x2 r1008, r643, r999; +} +{ +neg.f16x2 r1011, r1008; +} +{ +fma.rn.f16x2 r1013, r646, r997, r1011; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1019, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f123; +cvt.rn.f16.f32 high, f124; +mov.b32 r1021, {low, high}; +} +{ +mul.f16x2 r1022, r1019, r1021; +} +{ +mul.f16x2 r1025, r993, r1017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r1028, {high, low}; +} +{ +fma.rn.f16x2 r1030, r1022, r1028, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1036, {high, high}; +} +{ +mul.f16x2 r1038, r658, r1036; +} +{ +fma.rn.f16x2 r1041, r655, r1034, r1038; +} +{ +mul.f16x2 r1045, r655, r1036; +} +{ +neg.f16x2 r1048, r1045; +} +{ +fma.rn.f16x2 r1050, r658, r1034, r1048; +} +barrier.sync 0; +mad.lo.s32 r1739, r1737, 48, r1738; +st.shared.v4.f32 [r1739], {r589, r671, r708, r745}; +st.shared.v4.f32 [r1739+16], {r782, r819, r856, r893}; +st.shared.v4.f32 [r1739+32], {r930, r967, r1004, r1041}; +barrier.sync 0; +mad.lo.s32 r1740, r1737, -44, r1739; +ld.shared.u32 r1077, [r1740]; +ld.shared.u32 r1327, [r1740+48]; +ld.shared.u32 r1163, [r1740+96]; +ld.shared.u32 r1413, [r1740+144]; +ld.shared.u32 r1074, [r1740+192]; +ld.shared.u32 r1324, [r1740+240]; +ld.shared.u32 r1160, [r1740+288]; +ld.shared.u32 r1410, [r1740+336]; +ld.shared.u32 r1075, [r1740+384]; +ld.shared.u32 r1325, 
[r1740+432]; +ld.shared.u32 r1161, [r1740+480]; +ld.shared.u32 r1411, [r1740+528]; +barrier.sync 0; +st.shared.v4.f32 [r1739], {r592, r680, r717, r754}; +st.shared.v4.f32 [r1739+16], {r791, r828, r865, r902}; +st.shared.v4.f32 [r1739+32], {r939, r976, r1013, r1050}; +barrier.sync 0; +ld.shared.u32 r1083, [r1740]; +ld.shared.u32 r1333, [r1740+48]; +ld.shared.u32 r1169, [r1740+96]; +ld.shared.u32 r1419, [r1740+144]; +ld.shared.u32 r1080, [r1740+192]; +ld.shared.u32 r1330, [r1740+240]; +ld.shared.u32 r1166, [r1740+288]; +ld.shared.u32 r1416, [r1740+336]; +ld.shared.u32 r1081, [r1740+384]; +ld.shared.u32 r1331, [r1740+432]; +ld.shared.u32 r1167, [r1740+480]; +ld.shared.u32 r1417, [r1740+528]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1072, {low, high}; +} +{ +add.f16x2 r1073, r1074, r1075; +} +{ +add.f16x2 r1076, r1077, r1073; +} +{ +add.f16x2 r1079, r1080, r1081; +} +{ +add.f16x2 r1082, r1083, r1079; +} +{ +add.f16x2 r1085, r1074, r1075; +} +{ +mul.f16x2 r1088, r1085, r1071; +} +{ +add.f16x2 r1091, r1077, r1088; +} +{ +sub.f16x2 r1094, r1080, r1081; +} +{ +mul.f16x2 r1097, r1094, r1072; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +add.f16x2 r1103, r1074, r1075; +} +{ +mul.f16x2 r1106, r1103, r1071; +} +{ +add.f16x2 r1109, r1077, r1106; +} +{ +sub.f16x2 r1112, r1080, r1081; +} +{ +mul.f16x2 r1115, r1112, r1072; +} +{ +sub.f16x2 r1118, r1109, r1115; +} +{ +add.f16x2 r1121, r1080, r1081; +} +{ +mul.f16x2 r1124, r1121, r1071; +} +{ +add.f16x2 r1127, r1083, r1124; +} +{ +sub.f16x2 r1130, r1074, r1075; +} +{ +mul.f16x2 r1133, r1130, r1072; +} +{ +sub.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r1080, r1081; +} +{ +mul.f16x2 r1142, r1139, r1071; +} +{ +add.f16x2 r1145, r1083, r1142; +} +{ +sub.f16x2 r1148, r1074, r1075; +} +{ +mul.f16x2 r1151, r1148, r1072; +} +{ +add.f16x2 r1154, r1145, r1151; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1158, {low, high}; +} +{ +add.f16x2 r1159, r1160, r1161; +} +{ +add.f16x2 r1162, r1163, r1159; +} +{ +add.f16x2 r1165, r1166, r1167; +} +{ +add.f16x2 r1168, r1169, r1165; +} +{ +add.f16x2 r1171, r1160, r1161; +} +{ +mul.f16x2 r1174, r1171, r1157; +} +{ +add.f16x2 r1177, r1163, r1174; +} +{ +sub.f16x2 r1180, r1166, r1167; +} +{ +mul.f16x2 r1183, r1180, r1158; +} +{ +add.f16x2 r1186, r1177, r1183; +} +{ +add.f16x2 r1189, r1160, r1161; +} +{ +mul.f16x2 r1192, r1189, r1157; +} +{ +add.f16x2 r1195, r1163, r1192; +} +{ +sub.f16x2 r1198, r1166, r1167; +} +{ +mul.f16x2 r1201, r1198, r1158; +} +{ +sub.f16x2 r1204, r1195, r1201; +} +{ +add.f16x2 r1207, r1166, r1167; +} +{ +mul.f16x2 r1210, r1207, r1157; +} +{ +add.f16x2 r1213, r1169, r1210; +} +{ +sub.f16x2 r1216, r1160, r1161; +} +{ +mul.f16x2 r1219, r1216, r1158; +} +{ +sub.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, r1166, r1167; +} +{ +mul.f16x2 r1228, r1225, r1157; +} +{ +add.f16x2 r1231, r1169, r1228; +} +{ +sub.f16x2 r1234, r1160, r1161; +} +{ +mul.f16x2 r1237, r1234, r1158; +} +{ +add.f16x2 r1240, r1231, r1237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1246, {low, high}; +} +{ +mul.f16x2 r1253, r1186, r1243; +} +{ +mul.f16x2 r1256, r1222, r1244; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ +mul.f16x2 r1262, r1186, r1244; +} +{ +fma.rn.f16x2 r1265, r1222, r1243, r1262; +} +{ +mul.f16x2 r1269, r1204, r1245; +} +{ +mul.f16x2 r1272, r1240, r1246; +} 
+{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1204, r1246; +} +{ +fma.rn.f16x2 r1281, r1240, r1245, r1278; +} +{ +add.f16x2 r1285, r1076, r1162; +} +{ +add.f16x2 r1288, r1082, r1168; +} +{ +sub.f16x2 r1291, r1076, r1162; +} +{ +sub.f16x2 r1294, r1082, r1168; +} +{ +add.f16x2 r1297, r1100, r1259; +} +{ +add.f16x2 r1300, r1136, r1265; +} +{ +sub.f16x2 r1303, r1100, r1259; +} +{ +sub.f16x2 r1306, r1136, r1265; +} +{ +add.f16x2 r1309, r1118, r1275; +} +{ +add.f16x2 r1312, r1154, r1281; +} +{ +sub.f16x2 r1315, r1118, r1275; +} +{ +sub.f16x2 r1318, r1154, r1281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1322, {low, high}; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1330, r1331; +} +{ +add.f16x2 r1332, r1333, r1329; +} +{ +add.f16x2 r1335, r1324, r1325; +} +{ +mul.f16x2 r1338, r1335, r1321; +} +{ +add.f16x2 r1341, r1327, r1338; +} +{ +sub.f16x2 r1344, r1330, r1331; +} +{ +mul.f16x2 r1347, r1344, r1322; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1324, r1325; +} +{ +mul.f16x2 r1356, r1353, r1321; +} +{ +add.f16x2 r1359, r1327, r1356; +} +{ +sub.f16x2 r1362, r1330, r1331; +} +{ +mul.f16x2 r1365, r1362, r1322; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, r1330, r1331; +} +{ +mul.f16x2 r1374, r1371, r1321; +} +{ +add.f16x2 r1377, r1333, r1374; +} +{ +sub.f16x2 r1380, r1324, r1325; +} +{ +mul.f16x2 r1383, r1380, r1322; +} +{ +sub.f16x2 r1386, r1377, r1383; +} +{ +add.f16x2 r1389, r1330, r1331; +} +{ +mul.f16x2 r1392, r1389, r1321; +} +{ +add.f16x2 r1395, r1333, r1392; +} +{ +sub.f16x2 r1398, r1324, r1325; +} +{ +mul.f16x2 r1401, r1398, r1322; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1407, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1408, {low, high}; +} +{ +add.f16x2 r1409, r1410, r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1416, r1417; +} +{ +add.f16x2 r1418, r1419, r1415; +} +{ +add.f16x2 r1421, r1410, r1411; +} +{ +mul.f16x2 r1424, r1421, r1407; +} +{ +add.f16x2 r1427, r1413, r1424; +} +{ +sub.f16x2 r1430, r1416, r1417; +} +{ +mul.f16x2 r1433, r1430, r1408; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +add.f16x2 r1439, r1410, r1411; +} +{ +mul.f16x2 r1442, r1439, r1407; +} +{ +add.f16x2 r1445, r1413, r1442; +} +{ +sub.f16x2 r1448, r1416, r1417; +} +{ +mul.f16x2 r1451, r1448, r1408; +} +{ +sub.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1416, r1417; +} +{ +mul.f16x2 r1460, r1457, r1407; +} +{ +add.f16x2 r1463, r1419, r1460; +} +{ +sub.f16x2 r1466, r1410, r1411; +} +{ +mul.f16x2 r1469, r1466, r1408; +} +{ +sub.f16x2 r1472, r1463, r1469; +} +{ +add.f16x2 r1475, r1416, r1417; +} +{ +mul.f16x2 r1478, r1475, r1407; +} +{ +add.f16x2 r1481, r1419, r1478; +} +{ +sub.f16x2 r1484, r1410, r1411; +} +{ +mul.f16x2 r1487, r1484, r1408; +} +{ +add.f16x2 r1490, r1481, r1487; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1493, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1495, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1496, {low, high}; +} +{ +mul.f16x2 r1503, r1436, r1493; +} +{ +mul.f16x2 r1506, r1472, r1494; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1436, r1494; +} +{ +fma.rn.f16x2 r1515, r1472, r1493, r1512; +} +{ +mul.f16x2 r1519, r1454, r1495; +} +{ +mul.f16x2 r1522, r1490, r1496; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1454, r1496; +} +{ +fma.rn.f16x2 r1531, r1490, r1495, 
r1528; +} +{ +add.f16x2 r1535, r1326, r1412; +} +{ +add.f16x2 r1538, r1332, r1418; +} +{ +sub.f16x2 r1541, r1326, r1412; +} +{ +sub.f16x2 r1544, r1332, r1418; +} +{ +add.f16x2 r1547, r1350, r1509; +} +{ +add.f16x2 r1550, r1386, r1515; +} +{ +sub.f16x2 r1553, r1350, r1509; +} +{ +sub.f16x2 r1556, r1386, r1515; +} +{ +add.f16x2 r1559, r1368, r1525; +} +{ +add.f16x2 r1562, r1404, r1531; +} +{ +sub.f16x2 r1565, r1368, r1525; +} +{ +sub.f16x2 r1568, r1404, r1531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1574, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f202; +cvt.rn.f16.f32 high, f202; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1593, r1547, r1571; +} +{ +mul.f16x2 r1596, r1550, r1572; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r1547, r1572; +} +{ +fma.rn.f16x2 r1605, r1550, r1571, r1602; +} +{ +mul.f16x2 r1609, r1559, r1573; +} +{ +mul.f16x2 r1612, r1562, r1574; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1559, r1574; +} +{ +fma.rn.f16x2 r1621, r1562, r1573, r1618; +} +{ +neg.f16x2 r1625, r1544; +} +{ +mul.f16x2 r1627, r1553, r1577; +} +{ +mul.f16x2 r1630, r1556, r1578; +} +{ +sub.f16x2 r1633, r1627, r1630; +} +{ +mul.f16x2 r1636, r1553, r1578; +} +{ +fma.rn.f16x2 r1639, r1556, r1577, r1636; +} +{ 
+mul.f16x2 r1643, r1565, r1579; +} +{ +mul.f16x2 r1646, r1568, r1580; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1565, r1580; +} +{ +fma.rn.f16x2 r1655, r1568, r1579, r1652; +} +{ +add.f16x2 %0, r1285, r1535; +} +{ +add.f16x2 %1, r1288, r1538; +} +{ +sub.f16x2 %12, r1285, r1535; +} +{ +sub.f16x2 %13, r1288, r1538; +} +{ +add.f16x2 %2, r1297, r1599; +} +{ +add.f16x2 %3, r1300, r1605; +} +{ +sub.f16x2 %14, r1297, r1599; +} +{ +sub.f16x2 %15, r1300, r1605; +} +{ +add.f16x2 %4, r1309, r1615; +} +{ +add.f16x2 %5, r1312, r1621; +} +{ +sub.f16x2 %16, r1309, r1615; +} +{ +sub.f16x2 %17, r1312, r1621; +} +{ +add.f16x2 %6, r1291, r1625; +} +{ +add.f16x2 %7, r1294, r1541; +} +{ +sub.f16x2 %18, r1291, r1625; +} +{ +sub.f16x2 %19, r1294, r1541; +} +{ +add.f16x2 %8, r1303, r1633; +} +{ +add.f16x2 %9, r1306, r1639; +} +{ +sub.f16x2 %20, r1303, r1633; +} +{ +sub.f16x2 %21, r1306, r1639; +} +{ +add.f16x2 %10, r1315, r1649; +} +{ +add.f16x2 %11, r1318, r1655; +} +{ +sub.f16x2 %22, r1315, r1649; +} +{ +sub.f16x2 %23, r1318, r1655; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..637f537adda13 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_fwd.hpp.inc @@ -0,0 +1,934 @@ +#ifndef CUFFTDX_FFT_144_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_144_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<198, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<510>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 1152, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %36, %47; +add.f32 f50, %26, f49; +add.f32 f51, %38, %49; +add.f32 f52, %27, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %26, f53; +sub.f32 f55, %38, %49; +mul.f32 f56, f55, 0f3F5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %27, f59; +sub.f32 f61, %36, %47; +mul.f32 f62, f61, 0f3F5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %42, %52; +add.f32 f66, %31, f65; +add.f32 f67, %43, %54; +add.f32 f68, %33, f67; 
+mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %31, f69; +sub.f32 f71, %43, %54; +mul.f32 f72, f71, 0f3F5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %33, f75; +sub.f32 f77, %42, %52; +mul.f32 f78, f77, 0f3F5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0fBF5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0fBF5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0fBF5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0fBF5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %39, %50; +add.f32 f104, %28, f103; +add.f32 f105, %41, %51; +add.f32 f106, %30, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %28, f107; +sub.f32 f109, %41, %51; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %30, f113; +sub.f32 f115, %39, %50; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %44, %55; +add.f32 f120, %34, f119; +add.f32 f121, %46, %56; +add.f32 f122, %35, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %34, f123; +sub.f32 f125, %46, %56; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %35, f129; +sub.f32 f131, %44, %55; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0fBF5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0fBF5DB3D7, f138; +mul.f32 f140, f128, 
0fBF000000; +mul.f32 f141, f134, 0fBF5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0fBF5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0fBF000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0fBF000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0fBF5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0fBF5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0fBF5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0fBF5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0fBF000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0fBF000000, f175; +sub.f32 f177, f91, f145; +sub.f32 f178, f92, f146; +add.f32 f179, f95, f159; +add.f32 f180, f96, f161; +sub.f32 f181, f95, f159; +sub.f32 f182, f96, f161; +add.f32 f183, f99, f164; +add.f32 f184, f100, f166; +sub.f32 f185, f99, f164; +sub.f32 f186, f100, f166; +add.f32 f187, f93, f148; +sub.f32 f188, f94, f147; +sub.f32 f189, f93, f148; +add.f32 f190, f94, f147; +add.f32 f191, f97, f169; +add.f32 f192, f98, f171; +sub.f32 f193, f97, f169; +sub.f32 f194, f98, f171; +add.f32 f195, f101, f174; +add.f32 f196, f102, f176; +sub.f32 f197, f101, f174; +sub.f32 f198, f102, f176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1152, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f199, f200}, [rd6]; +mul.f32 f203, 
f199, f179; +mul.f32 f204, f200, f180; +mul.f32 f205, f199, f180; +mul.f32 f206, f199, f199; +mul.f32 f207, f200, f200; +sub.f32 f208, f206, f207; +mul.f32 f209, f200, f199; +fma.rn.f32 f210, f200, f199, f209; +mul.f32 f211, f208, f183; +mul.f32 f212, f210, f184; +mul.f32 f213, f208, f184; +mul.f32 f214, f199, f208; +mul.f32 f215, f200, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f199, f210; +fma.rn.f32 f218, f200, f208, f217; +mul.f32 f219, f216, f187; +mul.f32 f220, f218, f188; +mul.f32 f221, f216, f188; +mul.f32 f222, f199, f216; +mul.f32 f223, f200, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f199, f218; +fma.rn.f32 f226, f200, f216, f225; +mul.f32 f227, f224, f191; +mul.f32 f228, f226, f192; +mul.f32 f229, f224, f192; +mul.f32 f230, f199, f224; +mul.f32 f231, f200, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f199, f226; +fma.rn.f32 f234, f200, f224, f233; +mul.f32 f235, f232, f195; +mul.f32 f236, f234, f196; +mul.f32 f237, f232, f196; +mul.f32 f238, f199, f232; +mul.f32 f239, f200, f234; +sub.f32 f240, f238, f239; +mul.f32 f241, f199, f234; +fma.rn.f32 f242, f200, f232, f241; +mul.f32 f243, f240, f177; +mul.f32 f244, f242, f178; +mul.f32 f245, f240, f178; +mul.f32 f246, f199, f240; +mul.f32 f247, f200, f242; +sub.f32 f248, f246, f247; +mul.f32 f249, f199, f242; +fma.rn.f32 f250, f200, f240, f249; +mul.f32 f251, f248, f181; +mul.f32 f252, f250, f182; +mul.f32 f253, f248, f182; +mul.f32 f254, f199, f248; +mul.f32 f255, f200, f250; +sub.f32 f256, f254, f255; +mul.f32 f257, f199, f250; +fma.rn.f32 f258, f200, f248, f257; +mul.f32 f259, f256, f185; +mul.f32 f260, f258, f186; +mul.f32 f261, f256, f186; +mul.f32 f262, f199, f256; +mul.f32 f263, f200, f258; +sub.f32 f264, f262, f263; +mul.f32 f265, f199, f258; +fma.rn.f32 f266, f200, f256, f265; +mul.f32 f267, f264, f189; +mul.f32 f268, f266, f190; +mul.f32 f269, f264, f190; +mul.f32 f270, f199, f264; +mul.f32 f271, f200, f266; +sub.f32 f272, f270, f271; +mul.f32 f273, f199, f266; +fma.rn.f32 f274, f200, 
f264, f273; +mul.f32 f275, f272, f193; +mul.f32 f276, f274, f194; +mul.f32 f277, f272, f194; +mul.f32 f278, f199, f272; +mul.f32 f279, f200, f274; +sub.f32 f280, f278, f279; +mul.f32 f281, f199, f274; +fma.rn.f32 f282, f200, f272, f281; +mul.f32 f283, f280, f197; +mul.f32 f284, f282, f198; +mul.f32 f285, f280, f198; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f32 f286, f92, f146; +add.f32 f287, f91, f145; +fma.rn.f32 f288, f200, f179, f205; +sub.f32 f289, f203, f204; +st.shared.v4.f32 [r9], {f287, f286, f289, f288}; +fma.rn.f32 f290, f210, f183, f213; +sub.f32 f291, f211, f212; +sub.f32 f292, f219, f220; +fma.rn.f32 f293, f218, f187, f221; +st.shared.v4.f32 [r9+16], {f291, f290, f292, f293}; +sub.f32 f294, f227, f228; +fma.rn.f32 f295, f226, f191, f229; +fma.rn.f32 f296, f234, f195, f237; +sub.f32 f297, f235, f236; +st.shared.v4.f32 [r9+32], {f294, f295, f297, f296}; +fma.rn.f32 f298, f242, f177, f245; +sub.f32 f299, f243, f244; +fma.rn.f32 f300, f250, f181, f253; +sub.f32 f301, f251, f252; +st.shared.v4.f32 [r9+48], {f299, f298, f301, f300}; +fma.rn.f32 f302, f258, f185, f261; +sub.f32 f303, f259, f260; +fma.rn.f32 f304, f266, f189, f269; +sub.f32 f305, f267, f268; +st.shared.v4.f32 [r9+64], {f303, f302, f305, f304}; +fma.rn.f32 f306, f274, f193, f277; +sub.f32 f307, f275, f276; +fma.rn.f32 f308, f282, f197, f285; +sub.f32 f309, f283, f284; +st.shared.v4.f32 [r9+80], {f307, f306, f309, f308}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.v2.f32 {f310, f311}, [r10]; +ld.shared.v2.f32 {f314, f315}, [r10+96]; +ld.shared.v2.f32 {f318, f319}, [r10+192]; +ld.shared.v2.f32 {f322, f323}, [r10+288]; +ld.shared.v2.f32 {f326, f327}, [r10+384]; +ld.shared.v2.f32 {f330, f331}, [r10+480]; +ld.shared.v2.f32 {f334, f335}, [r10+576]; +ld.shared.v2.f32 {f338, f339}, [r10+672]; +ld.shared.v2.f32 {f342, f343}, [r10+768]; +ld.shared.v2.f32 {f346, f347}, [r10+864]; +ld.shared.v2.f32 {f350, f351}, [r10+960]; +ld.shared.v2.f32 {f354, f355}, [r10+1056]; +add.f32 f358, 
f326, f342; +add.f32 f359, f310, f358; +add.f32 f360, f327, f343; +add.f32 f361, f311, f360; +mul.f32 f362, f358, 0f3F000000; +sub.f32 f363, f310, f362; +sub.f32 f364, f327, f343; +mul.f32 f365, f364, 0f3F5DB3D7; +add.f32 f366, f365, f363; +sub.f32 f367, f363, f365; +mul.f32 f368, f360, 0f3F000000; +sub.f32 f369, f311, f368; +sub.f32 f370, f326, f342; +mul.f32 f371, f370, 0f3F5DB3D7; +sub.f32 f372, f369, f371; +add.f32 f373, f371, f369; +add.f32 f374, f334, f350; +add.f32 f375, f318, f374; +add.f32 f376, f335, f351; +add.f32 f377, f319, f376; +mul.f32 f378, f374, 0f3F000000; +sub.f32 f379, f318, f378; +sub.f32 f380, f335, f351; +mul.f32 f381, f380, 0f3F5DB3D7; +add.f32 f382, f381, f379; +sub.f32 f383, f379, f381; +mul.f32 f384, f376, 0f3F000000; +sub.f32 f385, f319, f384; +sub.f32 f386, f334, f350; +mul.f32 f387, f386, 0f3F5DB3D7; +sub.f32 f388, f385, f387; +add.f32 f389, f387, f385; +mul.f32 f390, f382, 0f3F000000; +mul.f32 f391, f388, 0fBF5DB3D7; +sub.f32 f392, f390, f391; +mul.f32 f393, f388, 0f3F000000; +fma.rn.f32 f394, f382, 0fBF5DB3D7, f393; +mul.f32 f395, f383, 0fBF000000; +mul.f32 f396, f389, 0fBF5DB3D7; +sub.f32 f397, f395, f396; +mul.f32 f398, f389, 0fBF000000; +fma.rn.f32 f399, f383, 0fBF5DB3D7, f398; +add.f32 f400, f359, f375; +add.f32 f401, f361, f377; +sub.f32 f402, f359, f375; +sub.f32 f403, f361, f377; +add.f32 f404, f366, f392; +add.f32 f405, f372, f394; +sub.f32 f406, f366, f392; +sub.f32 f407, f372, f394; +add.f32 f408, f367, f397; +add.f32 f409, f373, f399; +sub.f32 f410, f367, f397; +sub.f32 f411, f373, f399; +add.f32 f412, f330, f346; +add.f32 f413, f314, f412; +add.f32 f414, f331, f347; +add.f32 f415, f315, f414; +mul.f32 f416, f412, 0f3F000000; +sub.f32 f417, f314, f416; +sub.f32 f418, f331, f347; +mul.f32 f419, f418, 0f3F5DB3D7; +add.f32 f420, f419, f417; +sub.f32 f421, f417, f419; +mul.f32 f422, f414, 0f3F000000; +sub.f32 f423, f315, f422; +sub.f32 f424, f330, f346; +mul.f32 f425, f424, 0f3F5DB3D7; +sub.f32 f426, f423, f425; +add.f32 
f427, f425, f423; +add.f32 f428, f338, f354; +add.f32 f429, f322, f428; +add.f32 f430, f339, f355; +add.f32 f431, f323, f430; +mul.f32 f432, f428, 0f3F000000; +sub.f32 f433, f322, f432; +sub.f32 f434, f339, f355; +mul.f32 f435, f434, 0f3F5DB3D7; +add.f32 f436, f435, f433; +sub.f32 f437, f433, f435; +mul.f32 f438, f430, 0f3F000000; +sub.f32 f439, f323, f438; +sub.f32 f440, f338, f354; +mul.f32 f441, f440, 0f3F5DB3D7; +sub.f32 f442, f439, f441; +add.f32 f443, f441, f439; +mul.f32 f444, f436, 0f3F000000; +mul.f32 f445, f442, 0fBF5DB3D7; +sub.f32 f446, f444, f445; +mul.f32 f447, f442, 0f3F000000; +fma.rn.f32 f448, f436, 0fBF5DB3D7, f447; +mul.f32 f449, f437, 0fBF000000; +mul.f32 f450, f443, 0fBF5DB3D7; +sub.f32 f451, f449, f450; +mul.f32 f452, f443, 0fBF000000; +fma.rn.f32 f453, f437, 0fBF5DB3D7, f452; +add.f32 f454, f413, f429; +add.f32 f455, f415, f431; +sub.f32 f456, f413, f429; +sub.f32 f457, f415, f431; +add.f32 f458, f420, f446; +add.f32 f459, f426, f448; +sub.f32 f460, f420, f446; +sub.f32 f461, f426, f448; +add.f32 f462, f421, f451; +add.f32 f463, f427, f453; +sub.f32 f464, f421, f451; +sub.f32 f465, f427, f453; +mul.f32 f466, f458, 0f3F5DB3D7; +mul.f32 f467, f459, 0fBF000000; +sub.f32 f468, f466, f467; +mul.f32 f469, f459, 0f3F5DB3D7; +fma.rn.f32 f470, f458, 0fBF000000, f469; +mul.f32 f471, f462, 0f3F000000; +mul.f32 f472, f463, 0fBF5DB3D7; +sub.f32 f473, f471, f472; +mul.f32 f474, f463, 0f3F000000; +fma.rn.f32 f475, f462, 0fBF5DB3D7, f474; +mul.f32 f476, f460, 0fBF000000; +mul.f32 f477, f461, 0fBF5DB3D7; +sub.f32 f478, f476, f477; +mul.f32 f479, f461, 0fBF000000; +fma.rn.f32 f480, f460, 0fBF5DB3D7, f479; +mul.f32 f481, f464, 0fBF5DB3D7; +mul.f32 f482, f465, 0fBF000000; +sub.f32 f483, f481, f482; +mul.f32 f484, f465, 0fBF5DB3D7; +fma.rn.f32 f485, f464, 0fBF000000, f484; +add.f32 %1, f401, f455; +add.f32 %0, f400, f454; +add.f32 %3, f405, f470; +add.f32 %2, f404, f468; +add.f32 %5, f409, f475; +add.f32 %4, f408, f473; +sub.f32 %7, f403, f456; +add.f32 %6, f402, 
f457; +add.f32 %9, f407, f480; +add.f32 %8, f406, f478; +add.f32 %11, f411, f485; +add.f32 %10, f410, f483; +sub.f32 %13, f401, f455; +sub.f32 %12, f400, f454; +sub.f32 %15, f405, f470; +sub.f32 %14, f404, f468; +sub.f32 %17, f409, f475; +sub.f32 %16, f408, f473; +add.f32 %19, f403, f456; +sub.f32 %18, f402, f457; +sub.f32 %21, f407, f480; +sub.f32 %20, f406, f478; +sub.f32 %23, f411, f485; +sub.f32 %22, f410, f483; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<199, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<486>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 576, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %36, %47; +add.f32 f50, %26, f49; +add.f32 f51, %38, %49; +add.f32 f52, %27, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %26, f53; +sub.f32 f55, %38, %49; +mul.f32 f56, f55, 0f3F5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %27, f59; 
+sub.f32 f61, %36, %47; +mul.f32 f62, f61, 0f3F5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %42, %52; +add.f32 f66, %31, f65; +add.f32 f67, %43, %54; +add.f32 f68, %33, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %31, f69; +sub.f32 f71, %43, %54; +mul.f32 f72, f71, 0f3F5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %33, f75; +sub.f32 f77, %42, %52; +mul.f32 f78, f77, 0f3F5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0fBF5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0fBF5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0fBF5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0fBF5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %39, %50; +add.f32 f104, %28, f103; +add.f32 f105, %41, %51; +add.f32 f106, %30, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %28, f107; +sub.f32 f109, %41, %51; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %30, f113; +sub.f32 f115, %39, %50; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %44, %55; +add.f32 f120, %34, f119; +add.f32 f121, %46, %56; +add.f32 f122, %35, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %34, f123; +sub.f32 f125, %46, %56; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %35, f129; +sub.f32 f131, %44, %55; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, 
f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0fBF5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0fBF5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0fBF5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0fBF5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0fBF000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0fBF000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0fBF5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0fBF5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0fBF5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0fBF5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0fBF000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0fBF000000, f175; +add.f32 f177, f91, f145; +add.f32 f178, f92, f146; +sub.f32 f179, f91, f145; +sub.f32 f180, f92, f146; +add.f32 f181, f95, f159; +add.f32 f182, f96, f161; +sub.f32 f183, f95, f159; +sub.f32 f184, f96, f161; +add.f32 f185, f99, f164; +add.f32 f186, f100, f166; +sub.f32 f187, f99, f164; +sub.f32 f188, f100, f166; +add.f32 f189, f93, f148; +sub.f32 f190, f94, f147; +sub.f32 f191, f93, f148; +add.f32 f192, f94, f147; +add.f32 f193, f97, f169; +add.f32 f194, f98, f171; +sub.f32 f195, f97, f169; +sub.f32 f196, f98, f171; +add.f32 f197, f101, f174; +add.f32 f198, f102, f176; +sub.f32 f199, f101, f174; +sub.f32 f200, f102, f176; +mul.wide.u32 rd2, r4, 
-1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f201, f202}, [rd6]; +mul.f32 f205, f201, f181; +mul.f32 f206, f202, f182; +sub.f32 f207, f205, f206; +mul.f32 f208, f201, f182; +fma.rn.f32 f209, f202, f181, f208; +mul.f32 f210, f201, f201; +mul.f32 f211, f202, f202; +sub.f32 f212, f210, f211; +mul.f32 f213, f202, f201; +fma.rn.f32 f214, f202, f201, f213; +mul.f32 f215, f212, f185; +mul.f32 f216, f214, f186; +sub.f32 f217, f215, f216; +mul.f32 f218, f212, f186; +fma.rn.f32 f219, f214, f185, f218; +mul.f32 f220, f201, f212; +mul.f32 f221, f202, f214; +sub.f32 f222, f220, f221; +mul.f32 f223, f201, f214; +fma.rn.f32 f224, f202, f212, f223; +mul.f32 f225, f222, f189; +mul.f32 f226, f224, f190; +sub.f32 f227, f225, f226; +mul.f32 f228, f222, f190; +fma.rn.f32 f229, f224, f189, f228; +mul.f32 f230, f201, f222; +mul.f32 f231, f202, f224; +sub.f32 f232, f230, f231; +mul.f32 f233, f201, f224; +fma.rn.f32 f234, f202, f222, f233; +mul.f32 f235, f232, f193; +mul.f32 f236, f234, f194; +sub.f32 f237, f235, f236; +mul.f32 f238, f232, f194; +fma.rn.f32 f239, f234, f193, f238; +mul.f32 f240, f201, f232; +mul.f32 f241, f202, f234; +sub.f32 f242, f240, f241; +mul.f32 f243, f201, f234; +fma.rn.f32 f244, f202, f232, f243; +mul.f32 f245, f242, f197; +mul.f32 f246, f244, f198; +sub.f32 f247, f245, f246; +mul.f32 f248, f242, f198; +fma.rn.f32 f249, f244, f197, f248; +mul.f32 f250, f201, f242; +mul.f32 f251, f202, f244; +sub.f32 f252, f250, f251; +mul.f32 f253, f201, f244; +fma.rn.f32 f254, f202, f242, f253; +mul.f32 f255, f252, f179; +mul.f32 f256, f254, f180; +sub.f32 f257, f255, f256; +mul.f32 f258, f252, f180; +fma.rn.f32 f259, f254, f179, f258; +mul.f32 f260, f201, f252; +mul.f32 f261, f202, f254; +sub.f32 f262, f260, f261; +mul.f32 f263, f201, f254; +fma.rn.f32 f264, f202, f252, f263; +mul.f32 f265, f262, f183; +mul.f32 f266, f264, f184; 
+sub.f32 f267, f265, f266; +mul.f32 f268, f262, f184; +fma.rn.f32 f269, f264, f183, f268; +mul.f32 f270, f201, f262; +mul.f32 f271, f202, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f201, f264; +fma.rn.f32 f274, f202, f262, f273; +mul.f32 f275, f272, f187; +mul.f32 f276, f274, f188; +sub.f32 f277, f275, f276; +mul.f32 f278, f272, f188; +fma.rn.f32 f279, f274, f187, f278; +mul.f32 f280, f201, f272; +mul.f32 f281, f202, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f201, f274; +fma.rn.f32 f284, f202, f272, f283; +mul.f32 f285, f282, f191; +mul.f32 f286, f284, f192; +sub.f32 f287, f285, f286; +mul.f32 f288, f282, f192; +fma.rn.f32 f289, f284, f191, f288; +mul.f32 f290, f201, f282; +mul.f32 f291, f202, f284; +sub.f32 f292, f290, f291; +mul.f32 f293, f201, f284; +fma.rn.f32 f294, f202, f282, f293; +mul.f32 f295, f292, f195; +mul.f32 f296, f294, f196; +sub.f32 f297, f295, f296; +mul.f32 f298, f292, f196; +fma.rn.f32 f299, f294, f195, f298; +mul.f32 f300, f201, f292; +mul.f32 f301, f202, f294; +sub.f32 f302, f300, f301; +mul.f32 f303, f201, f294; +fma.rn.f32 f304, f202, f292, f303; +mul.f32 f305, f302, f199; +mul.f32 f306, f304, f200; +sub.f32 f307, f305, f306; +mul.f32 f308, f302, f200; +fma.rn.f32 f309, f304, f199, f308; +mad.lo.s32 r8, r5, 576, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v4.f32 [r9], {f177, f207, f217, f227}; +st.shared.v4.f32 [r9+16], {f237, f247, f257, f267}; +st.shared.v4.f32 [r9+32], {f277, f287, f297, f307}; +barrier.sync 0; +mad.lo.s32 r10, r7, -44, r9; +ld.shared.f32 f310, [r10]; +ld.shared.f32 f311, [r10+48]; +ld.shared.f32 f312, [r10+96]; +ld.shared.f32 f313, [r10+144]; +ld.shared.f32 f314, [r10+192]; +ld.shared.f32 f315, [r10+240]; +ld.shared.f32 f316, [r10+288]; +ld.shared.f32 f317, [r10+336]; +ld.shared.f32 f318, [r10+384]; +ld.shared.f32 f319, [r10+432]; +ld.shared.f32 f320, [r10+480]; +ld.shared.f32 f321, [r10+528]; +barrier.sync 0; +st.shared.v4.f32 [r9], {f178, f209, f219, f229}; +st.shared.v4.f32 [r9+16], {f239, 
f249, f259, f269}; +st.shared.v4.f32 [r9+32], {f279, f289, f299, f309}; +barrier.sync 0; +ld.shared.f32 f322, [r10]; +ld.shared.f32 f323, [r10+48]; +ld.shared.f32 f324, [r10+96]; +ld.shared.f32 f325, [r10+144]; +ld.shared.f32 f326, [r10+192]; +ld.shared.f32 f327, [r10+240]; +ld.shared.f32 f328, [r10+288]; +ld.shared.f32 f329, [r10+336]; +ld.shared.f32 f330, [r10+384]; +ld.shared.f32 f331, [r10+432]; +ld.shared.f32 f332, [r10+480]; +ld.shared.f32 f333, [r10+528]; +add.f32 f334, f314, f318; +add.f32 f335, f310, f334; +add.f32 f336, f326, f330; +add.f32 f337, f322, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f310, f338; +sub.f32 f340, f326, f330; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f322, f344; +sub.f32 f346, f314, f318; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f316, f320; +add.f32 f351, f312, f350; +add.f32 f352, f328, f332; +add.f32 f353, f324, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f312, f354; +sub.f32 f356, f328, f332; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f324, f360; +sub.f32 f362, f316, f320; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.f32 f366, f358, 0f3F000000; +mul.f32 f367, f364, 0fBF5DB3D7; +sub.f32 f368, f366, f367; +mul.f32 f369, f364, 0f3F000000; +fma.rn.f32 f370, f358, 0fBF5DB3D7, f369; +mul.f32 f371, f359, 0fBF000000; +mul.f32 f372, f365, 0fBF5DB3D7; +sub.f32 f373, f371, f372; +mul.f32 f374, f365, 0fBF000000; +fma.rn.f32 f375, f359, 0fBF5DB3D7, f374; +add.f32 f376, f335, f351; +add.f32 f377, f337, f353; +sub.f32 f378, f335, f351; +sub.f32 f379, f337, f353; +add.f32 f380, f342, f368; +add.f32 f381, f348, f370; +sub.f32 f382, f342, f368; +sub.f32 f383, f348, f370; +add.f32 f384, f343, f373; +add.f32 f385, f349, f375; +sub.f32 f386, 
f343, f373; +sub.f32 f387, f349, f375; +add.f32 f388, f315, f319; +add.f32 f389, f311, f388; +add.f32 f390, f327, f331; +add.f32 f391, f323, f390; +mul.f32 f392, f388, 0f3F000000; +sub.f32 f393, f311, f392; +sub.f32 f394, f327, f331; +mul.f32 f395, f394, 0f3F5DB3D7; +add.f32 f396, f395, f393; +sub.f32 f397, f393, f395; +mul.f32 f398, f390, 0f3F000000; +sub.f32 f399, f323, f398; +sub.f32 f400, f315, f319; +mul.f32 f401, f400, 0f3F5DB3D7; +sub.f32 f402, f399, f401; +add.f32 f403, f401, f399; +add.f32 f404, f317, f321; +add.f32 f405, f313, f404; +add.f32 f406, f329, f333; +add.f32 f407, f325, f406; +mul.f32 f408, f404, 0f3F000000; +sub.f32 f409, f313, f408; +sub.f32 f410, f329, f333; +mul.f32 f411, f410, 0f3F5DB3D7; +add.f32 f412, f411, f409; +sub.f32 f413, f409, f411; +mul.f32 f414, f406, 0f3F000000; +sub.f32 f415, f325, f414; +sub.f32 f416, f317, f321; +mul.f32 f417, f416, 0f3F5DB3D7; +sub.f32 f418, f415, f417; +add.f32 f419, f417, f415; +mul.f32 f420, f412, 0f3F000000; +mul.f32 f421, f418, 0fBF5DB3D7; +sub.f32 f422, f420, f421; +mul.f32 f423, f418, 0f3F000000; +fma.rn.f32 f424, f412, 0fBF5DB3D7, f423; +mul.f32 f425, f413, 0fBF000000; +mul.f32 f426, f419, 0fBF5DB3D7; +sub.f32 f427, f425, f426; +mul.f32 f428, f419, 0fBF000000; +fma.rn.f32 f429, f413, 0fBF5DB3D7, f428; +add.f32 f430, f389, f405; +add.f32 f431, f391, f407; +sub.f32 f432, f389, f405; +sub.f32 f433, f391, f407; +add.f32 f434, f396, f422; +add.f32 f435, f402, f424; +sub.f32 f436, f396, f422; +sub.f32 f437, f402, f424; +add.f32 f438, f397, f427; +add.f32 f439, f403, f429; +sub.f32 f440, f397, f427; +sub.f32 f441, f403, f429; +mul.f32 f442, f434, 0f3F5DB3D7; +mul.f32 f443, f435, 0fBF000000; +sub.f32 f444, f442, f443; +mul.f32 f445, f435, 0f3F5DB3D7; +fma.rn.f32 f446, f434, 0fBF000000, f445; +mul.f32 f447, f438, 0f3F000000; +mul.f32 f448, f439, 0fBF5DB3D7; +sub.f32 f449, f447, f448; +mul.f32 f450, f439, 0f3F000000; +fma.rn.f32 f451, f438, 0fBF5DB3D7, f450; +mul.f32 f452, f436, 0fBF000000; +mul.f32 f453, 
f437, 0fBF5DB3D7; +sub.f32 f454, f452, f453; +mul.f32 f455, f437, 0fBF000000; +fma.rn.f32 f456, f436, 0fBF5DB3D7, f455; +mul.f32 f457, f440, 0fBF5DB3D7; +mul.f32 f458, f441, 0fBF000000; +sub.f32 f459, f457, f458; +mul.f32 f460, f441, 0fBF5DB3D7; +fma.rn.f32 f461, f440, 0fBF000000, f460; +add.f32 %0, f376, f430; +add.f32 %1, f377, f431; +add.f32 %3, f381, f446; +add.f32 %2, f380, f444; +add.f32 %5, f385, f451; +add.f32 %4, f384, f449; +sub.f32 %7, f379, f432; +add.f32 %6, f378, f433; +add.f32 %9, f383, f456; +add.f32 %8, f382, f454; +add.f32 %11, f387, f461; +add.f32 %10, f386, f459; +sub.f32 %12, f376, f430; +sub.f32 %13, f377, f431; +sub.f32 %15, f381, f446; +sub.f32 %14, f380, f444; +sub.f32 %17, f385, f451; +sub.f32 %16, f384, f449; +add.f32 %19, f379, f432; +sub.f32 %18, f378, f433; +sub.f32 %21, f383, f456; +sub.f32 %20, f382, f454; +sub.f32 %23, f387, f461; +sub.f32 %22, f386, f459; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..4393596145ba7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp32_inv.hpp.inc @@ -0,0 +1,934 @@ +#ifndef CUFFTDX_FFT_144_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_144_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<400, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<510>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 1152, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %36, %47; +add.f32 f50, %26, f49; +add.f32 f51, %38, %49; +add.f32 f52, %27, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %26, f53; +sub.f32 f55, %38, %49; +mul.f32 f56, f55, 0fBF5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %27, f59; +sub.f32 f61, %36, %47; +mul.f32 f62, f61, 0fBF5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %42, %52; +add.f32 f66, %31, f65; +add.f32 f67, %43, %54; +add.f32 f68, %33, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %31, f69; +sub.f32 f71, %43, %54; +mul.f32 f72, f71, 0fBF5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %33, f75; +sub.f32 f77, %42, %52; +mul.f32 f78, f77, 0fBF5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0f3F5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0f3F5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0f3F5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0f3F5DB3D7, 
f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %39, %50; +add.f32 f104, %28, f103; +add.f32 f105, %41, %51; +add.f32 f106, %30, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %28, f107; +sub.f32 f109, %41, %51; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %30, f113; +sub.f32 f115, %39, %50; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %44, %55; +add.f32 f120, %34, f119; +add.f32 f121, %46, %56; +add.f32 f122, %35, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %34, f123; +sub.f32 f125, %46, %56; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %35, f129; +sub.f32 f131, %44, %55; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0f3F5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0f3F5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0f3F5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0f3F5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0f3F000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 
0f3F000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0f3F5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0f3F5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0f3F5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0f3F5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0f3F000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0f3F000000, f175; +sub.f32 f177, f91, f145; +sub.f32 f178, f92, f146; +add.f32 f179, f95, f159; +add.f32 f180, f96, f161; +sub.f32 f181, f95, f159; +sub.f32 f182, f96, f161; +add.f32 f183, f99, f164; +add.f32 f184, f100, f166; +sub.f32 f185, f99, f164; +sub.f32 f186, f100, f166; +sub.f32 f187, f93, f148; +add.f32 f188, f94, f147; +add.f32 f189, f93, f148; +sub.f32 f190, f94, f147; +add.f32 f191, f97, f169; +add.f32 f192, f98, f171; +sub.f32 f193, f97, f169; +sub.f32 f194, f98, f171; +add.f32 f195, f101, f174; +add.f32 f196, f102, f176; +sub.f32 f197, f101, f174; +sub.f32 f198, f102, f176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1152, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f199, f200}, [rd6]; +mul.f32 f203, f180, f200; +mul.f32 f204, f179, f200; +mul.f32 f205, f199, f180; +mul.f32 f206, f199, f199; +mul.f32 f207, f200, f200; +sub.f32 f208, f206, f207; +mul.f32 f209, f200, f199; +fma.rn.f32 f210, f200, f199, f209; +mul.f32 f211, f184, f210; +mul.f32 f212, f183, f210; +mul.f32 f213, f208, f184; +mul.f32 f214, f199, f208; +mul.f32 f215, f200, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f199, f210; +fma.rn.f32 f218, f200, f208, f217; +mul.f32 f219, f188, f218; +mul.f32 f220, f187, f218; +mul.f32 f221, f216, f188; +mul.f32 f222, f199, f216; +mul.f32 f223, f200, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f199, 
f218; +fma.rn.f32 f226, f200, f216, f225; +mul.f32 f227, f192, f226; +mul.f32 f228, f191, f226; +mul.f32 f229, f224, f192; +mul.f32 f230, f199, f224; +mul.f32 f231, f200, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f199, f226; +fma.rn.f32 f234, f200, f224, f233; +mul.f32 f235, f196, f234; +mul.f32 f236, f195, f234; +mul.f32 f237, f232, f196; +mul.f32 f238, f199, f232; +mul.f32 f239, f200, f234; +sub.f32 f240, f238, f239; +mul.f32 f241, f199, f234; +fma.rn.f32 f242, f200, f232, f241; +mul.f32 f243, f178, f242; +mul.f32 f244, f177, f242; +mul.f32 f245, f240, f178; +mul.f32 f246, f199, f240; +mul.f32 f247, f200, f242; +sub.f32 f248, f246, f247; +mul.f32 f249, f199, f242; +fma.rn.f32 f250, f200, f240, f249; +mul.f32 f251, f182, f250; +mul.f32 f252, f181, f250; +mul.f32 f253, f248, f182; +mul.f32 f254, f199, f248; +mul.f32 f255, f200, f250; +sub.f32 f256, f254, f255; +mul.f32 f257, f199, f250; +fma.rn.f32 f258, f200, f248, f257; +mul.f32 f259, f186, f258; +mul.f32 f260, f185, f258; +mul.f32 f261, f256, f186; +mul.f32 f262, f199, f256; +mul.f32 f263, f200, f258; +sub.f32 f264, f262, f263; +mul.f32 f265, f199, f258; +fma.rn.f32 f266, f200, f256, f265; +mul.f32 f267, f190, f266; +mul.f32 f268, f189, f266; +mul.f32 f269, f264, f190; +mul.f32 f270, f199, f264; +mul.f32 f271, f200, f266; +sub.f32 f272, f270, f271; +mul.f32 f273, f199, f266; +fma.rn.f32 f274, f200, f264, f273; +mul.f32 f275, f194, f274; +mul.f32 f276, f193, f274; +mul.f32 f277, f272, f194; +mul.f32 f278, f199, f272; +mul.f32 f279, f200, f274; +sub.f32 f280, f278, f279; +mul.f32 f281, f199, f274; +fma.rn.f32 f282, f200, f272, f281; +mul.f32 f283, f198, f282; +mul.f32 f284, f197, f282; +mul.f32 f285, f280, f198; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f32 f286, f92, f146; +add.f32 f287, f91, f145; +fma.rn.f32 f288, f199, f179, f203; +sub.f32 f289, f205, f204; +st.shared.v4.f32 [r9], {f287, f286, f288, f289}; +fma.rn.f32 f290, f208, f183, f211; +sub.f32 f291, f213, f212; +sub.f32 f292, f221, f220; 
+fma.rn.f32 f293, f216, f187, f219; +st.shared.v4.f32 [r9+16], {f290, f291, f293, f292}; +sub.f32 f294, f229, f228; +fma.rn.f32 f295, f224, f191, f227; +fma.rn.f32 f296, f232, f195, f235; +sub.f32 f297, f237, f236; +st.shared.v4.f32 [r9+32], {f295, f294, f296, f297}; +fma.rn.f32 f298, f240, f177, f243; +sub.f32 f299, f245, f244; +fma.rn.f32 f300, f248, f181, f251; +sub.f32 f301, f253, f252; +st.shared.v4.f32 [r9+48], {f298, f299, f300, f301}; +fma.rn.f32 f302, f256, f185, f259; +sub.f32 f303, f261, f260; +fma.rn.f32 f304, f264, f189, f267; +sub.f32 f305, f269, f268; +st.shared.v4.f32 [r9+64], {f302, f303, f304, f305}; +fma.rn.f32 f306, f272, f193, f275; +sub.f32 f307, f277, f276; +fma.rn.f32 f308, f280, f197, f283; +sub.f32 f309, f285, f284; +st.shared.v4.f32 [r9+80], {f306, f307, f308, f309}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.v2.f32 {f310, f311}, [r10]; +ld.shared.v2.f32 {f314, f315}, [r10+96]; +ld.shared.v2.f32 {f318, f319}, [r10+192]; +ld.shared.v2.f32 {f322, f323}, [r10+288]; +ld.shared.v2.f32 {f326, f327}, [r10+384]; +ld.shared.v2.f32 {f330, f331}, [r10+480]; +ld.shared.v2.f32 {f334, f335}, [r10+576]; +ld.shared.v2.f32 {f338, f339}, [r10+672]; +ld.shared.v2.f32 {f342, f343}, [r10+768]; +ld.shared.v2.f32 {f346, f347}, [r10+864]; +ld.shared.v2.f32 {f350, f351}, [r10+960]; +ld.shared.v2.f32 {f354, f355}, [r10+1056]; +add.f32 f358, f326, f342; +add.f32 f359, f310, f358; +add.f32 f360, f327, f343; +add.f32 f361, f311, f360; +mul.f32 f362, f358, 0f3F000000; +sub.f32 f363, f310, f362; +sub.f32 f364, f327, f343; +mul.f32 f365, f364, 0fBF5DB3D7; +add.f32 f366, f365, f363; +sub.f32 f367, f363, f365; +mul.f32 f368, f360, 0f3F000000; +sub.f32 f369, f311, f368; +sub.f32 f370, f326, f342; +mul.f32 f371, f370, 0fBF5DB3D7; +sub.f32 f372, f369, f371; +add.f32 f373, f371, f369; +add.f32 f374, f334, f350; +add.f32 f375, f318, f374; +add.f32 f376, f335, f351; +add.f32 f377, f319, f376; +mul.f32 f378, f374, 0f3F000000; +sub.f32 f379, f318, f378; +sub.f32 
f380, f335, f351; +mul.f32 f381, f380, 0fBF5DB3D7; +add.f32 f382, f381, f379; +sub.f32 f383, f379, f381; +mul.f32 f384, f376, 0f3F000000; +sub.f32 f385, f319, f384; +sub.f32 f386, f334, f350; +mul.f32 f387, f386, 0fBF5DB3D7; +sub.f32 f388, f385, f387; +add.f32 f389, f387, f385; +mul.f32 f390, f382, 0f3F000000; +mul.f32 f391, f388, 0f3F5DB3D7; +sub.f32 f392, f390, f391; +mul.f32 f393, f388, 0f3F000000; +fma.rn.f32 f394, f382, 0f3F5DB3D7, f393; +mul.f32 f395, f383, 0fBF000000; +mul.f32 f396, f389, 0f3F5DB3D7; +sub.f32 f397, f395, f396; +mul.f32 f398, f389, 0fBF000000; +fma.rn.f32 f399, f383, 0f3F5DB3D7, f398; +add.f32 f400, f359, f375; +add.f32 f401, f361, f377; +sub.f32 f402, f359, f375; +sub.f32 f403, f361, f377; +add.f32 f404, f366, f392; +add.f32 f405, f372, f394; +sub.f32 f406, f366, f392; +sub.f32 f407, f372, f394; +add.f32 f408, f367, f397; +add.f32 f409, f373, f399; +sub.f32 f410, f367, f397; +sub.f32 f411, f373, f399; +add.f32 f412, f330, f346; +add.f32 f413, f314, f412; +add.f32 f414, f331, f347; +add.f32 f415, f315, f414; +mul.f32 f416, f412, 0f3F000000; +sub.f32 f417, f314, f416; +sub.f32 f418, f331, f347; +mul.f32 f419, f418, 0fBF5DB3D7; +add.f32 f420, f419, f417; +sub.f32 f421, f417, f419; +mul.f32 f422, f414, 0f3F000000; +sub.f32 f423, f315, f422; +sub.f32 f424, f330, f346; +mul.f32 f425, f424, 0fBF5DB3D7; +sub.f32 f426, f423, f425; +add.f32 f427, f425, f423; +add.f32 f428, f338, f354; +add.f32 f429, f322, f428; +add.f32 f430, f339, f355; +add.f32 f431, f323, f430; +mul.f32 f432, f428, 0f3F000000; +sub.f32 f433, f322, f432; +sub.f32 f434, f339, f355; +mul.f32 f435, f434, 0fBF5DB3D7; +add.f32 f436, f435, f433; +sub.f32 f437, f433, f435; +mul.f32 f438, f430, 0f3F000000; +sub.f32 f439, f323, f438; +sub.f32 f440, f338, f354; +mul.f32 f441, f440, 0fBF5DB3D7; +sub.f32 f442, f439, f441; +add.f32 f443, f441, f439; +mul.f32 f444, f436, 0f3F000000; +mul.f32 f445, f442, 0f3F5DB3D7; +sub.f32 f446, f444, f445; +mul.f32 f447, f442, 0f3F000000; +fma.rn.f32 f448, 
f436, 0f3F5DB3D7, f447; +mul.f32 f449, f437, 0fBF000000; +mul.f32 f450, f443, 0f3F5DB3D7; +sub.f32 f451, f449, f450; +mul.f32 f452, f443, 0fBF000000; +fma.rn.f32 f453, f437, 0f3F5DB3D7, f452; +add.f32 f454, f413, f429; +add.f32 f455, f415, f431; +sub.f32 f456, f413, f429; +sub.f32 f457, f415, f431; +add.f32 f458, f420, f446; +add.f32 f459, f426, f448; +sub.f32 f460, f420, f446; +sub.f32 f461, f426, f448; +add.f32 f462, f421, f451; +add.f32 f463, f427, f453; +sub.f32 f464, f421, f451; +sub.f32 f465, f427, f453; +mul.f32 f466, f458, 0f3F5DB3D7; +mul.f32 f467, f459, 0f3F000000; +sub.f32 f468, f466, f467; +mul.f32 f469, f459, 0f3F5DB3D7; +fma.rn.f32 f470, f458, 0f3F000000, f469; +mul.f32 f471, f462, 0f3F000000; +mul.f32 f472, f463, 0f3F5DB3D7; +sub.f32 f473, f471, f472; +mul.f32 f474, f463, 0f3F000000; +fma.rn.f32 f475, f462, 0f3F5DB3D7, f474; +mul.f32 f476, f460, 0fBF000000; +mul.f32 f477, f461, 0f3F5DB3D7; +sub.f32 f478, f476, f477; +mul.f32 f479, f461, 0fBF000000; +fma.rn.f32 f480, f460, 0f3F5DB3D7, f479; +mul.f32 f481, f464, 0fBF5DB3D7; +mul.f32 f482, f465, 0f3F000000; +sub.f32 f483, f481, f482; +mul.f32 f484, f465, 0fBF5DB3D7; +fma.rn.f32 f485, f464, 0f3F000000, f484; +add.f32 %1, f401, f455; +add.f32 %0, f400, f454; +add.f32 %3, f405, f470; +add.f32 %2, f404, f468; +add.f32 %5, f409, f475; +add.f32 %4, f408, f473; +add.f32 %7, f403, f456; +sub.f32 %6, f402, f457; +add.f32 %9, f407, f480; +add.f32 %8, f406, f478; +add.f32 %11, f411, f485; +add.f32 %10, f410, f483; +sub.f32 %13, f401, f455; +sub.f32 %12, f400, f454; +sub.f32 %15, f405, f470; +sub.f32 %14, f404, f468; +sub.f32 %17, f409, f475; +sub.f32 %16, f408, f473; +sub.f32 %19, f403, f456; +add.f32 %18, f402, f457; +sub.f32 %21, f407, f480; +sub.f32 %20, f406, f478; +sub.f32 %23, f411, f485; +sub.f32 %22, f410, f483; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<401, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<486>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 576, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %36, %47; +add.f32 f50, %26, f49; +add.f32 f51, %38, %49; +add.f32 f52, %27, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %26, f53; +sub.f32 f55, %38, %49; +mul.f32 f56, f55, 0fBF5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %27, f59; +sub.f32 f61, %36, %47; +mul.f32 f62, f61, 0fBF5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %42, %52; +add.f32 f66, %31, f65; +add.f32 f67, %43, %54; +add.f32 f68, %33, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %31, f69; +sub.f32 f71, %43, %54; +mul.f32 f72, f71, 0fBF5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %33, f75; +sub.f32 f77, %42, %52; +mul.f32 f78, f77, 0fBF5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0f3F5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 
0f3F000000; +fma.rn.f32 f85, f73, 0f3F5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0f3F5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0f3F5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %39, %50; +add.f32 f104, %28, f103; +add.f32 f105, %41, %51; +add.f32 f106, %30, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %28, f107; +sub.f32 f109, %41, %51; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %30, f113; +sub.f32 f115, %39, %50; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %44, %55; +add.f32 f120, %34, f119; +add.f32 f121, %46, %56; +add.f32 f122, %35, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %34, f123; +sub.f32 f125, %46, %56; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %35, f129; +sub.f32 f131, %44, %55; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0f3F5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0f3F5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0f3F5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0f3F5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; 
+sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0f3F000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0f3F000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0f3F5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0f3F5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0f3F5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0f3F5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0f3F000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0f3F000000, f175; +add.f32 f177, f91, f145; +add.f32 f178, f92, f146; +sub.f32 f179, f91, f145; +sub.f32 f180, f92, f146; +add.f32 f181, f95, f159; +add.f32 f182, f96, f161; +sub.f32 f183, f95, f159; +sub.f32 f184, f96, f161; +add.f32 f185, f99, f164; +add.f32 f186, f100, f166; +sub.f32 f187, f99, f164; +sub.f32 f188, f100, f166; +sub.f32 f189, f93, f148; +add.f32 f190, f94, f147; +add.f32 f191, f93, f148; +sub.f32 f192, f94, f147; +add.f32 f193, f97, f169; +add.f32 f194, f98, f171; +sub.f32 f195, f97, f169; +sub.f32 f196, f98, f171; +add.f32 f197, f101, f174; +add.f32 f198, f102, f176; +sub.f32 f199, f101, f174; +sub.f32 f200, f102, f176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f201, f202}, [rd6]; +mul.f32 f205, f182, f202; +fma.rn.f32 f206, f201, f181, f205; +mul.f32 f207, f181, f202; +mul.f32 f208, f201, f182; +sub.f32 f209, f208, f207; +mul.f32 f210, f201, f201; +mul.f32 f211, f202, f202; +sub.f32 f212, f210, f211; +mul.f32 f213, f202, f201; +fma.rn.f32 f214, f202, f201, f213; +mul.f32 f215, f186, f214; +fma.rn.f32 f216, f212, f185, f215; +mul.f32 f217, f185, f214; +mul.f32 f218, f212, f186; 
+sub.f32 f219, f218, f217; +mul.f32 f220, f201, f212; +mul.f32 f221, f202, f214; +sub.f32 f222, f220, f221; +mul.f32 f223, f201, f214; +fma.rn.f32 f224, f202, f212, f223; +mul.f32 f225, f190, f224; +fma.rn.f32 f226, f222, f189, f225; +mul.f32 f227, f189, f224; +mul.f32 f228, f222, f190; +sub.f32 f229, f228, f227; +mul.f32 f230, f201, f222; +mul.f32 f231, f202, f224; +sub.f32 f232, f230, f231; +mul.f32 f233, f201, f224; +fma.rn.f32 f234, f202, f222, f233; +mul.f32 f235, f194, f234; +fma.rn.f32 f236, f232, f193, f235; +mul.f32 f237, f193, f234; +mul.f32 f238, f232, f194; +sub.f32 f239, f238, f237; +mul.f32 f240, f201, f232; +mul.f32 f241, f202, f234; +sub.f32 f242, f240, f241; +mul.f32 f243, f201, f234; +fma.rn.f32 f244, f202, f232, f243; +mul.f32 f245, f198, f244; +fma.rn.f32 f246, f242, f197, f245; +mul.f32 f247, f197, f244; +mul.f32 f248, f242, f198; +sub.f32 f249, f248, f247; +mul.f32 f250, f201, f242; +mul.f32 f251, f202, f244; +sub.f32 f252, f250, f251; +mul.f32 f253, f201, f244; +fma.rn.f32 f254, f202, f242, f253; +mul.f32 f255, f180, f254; +fma.rn.f32 f256, f252, f179, f255; +mul.f32 f257, f179, f254; +mul.f32 f258, f252, f180; +sub.f32 f259, f258, f257; +mul.f32 f260, f201, f252; +mul.f32 f261, f202, f254; +sub.f32 f262, f260, f261; +mul.f32 f263, f201, f254; +fma.rn.f32 f264, f202, f252, f263; +mul.f32 f265, f184, f264; +fma.rn.f32 f266, f262, f183, f265; +mul.f32 f267, f183, f264; +mul.f32 f268, f262, f184; +sub.f32 f269, f268, f267; +mul.f32 f270, f201, f262; +mul.f32 f271, f202, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f201, f264; +fma.rn.f32 f274, f202, f262, f273; +mul.f32 f275, f188, f274; +fma.rn.f32 f276, f272, f187, f275; +mul.f32 f277, f187, f274; +mul.f32 f278, f272, f188; +sub.f32 f279, f278, f277; +mul.f32 f280, f201, f272; +mul.f32 f281, f202, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f201, f274; +fma.rn.f32 f284, f202, f272, f283; +mul.f32 f285, f192, f284; +fma.rn.f32 f286, f282, f191, f285; +mul.f32 f287, f191, f284; +mul.f32 
f288, f282, f192; +sub.f32 f289, f288, f287; +mul.f32 f290, f201, f282; +mul.f32 f291, f202, f284; +sub.f32 f292, f290, f291; +mul.f32 f293, f201, f284; +fma.rn.f32 f294, f202, f282, f293; +mul.f32 f295, f196, f294; +fma.rn.f32 f296, f292, f195, f295; +mul.f32 f297, f195, f294; +mul.f32 f298, f292, f196; +sub.f32 f299, f298, f297; +mul.f32 f300, f201, f292; +mul.f32 f301, f202, f294; +sub.f32 f302, f300, f301; +mul.f32 f303, f201, f294; +fma.rn.f32 f304, f202, f292, f303; +mul.f32 f305, f200, f304; +fma.rn.f32 f306, f302, f199, f305; +mul.f32 f307, f199, f304; +mul.f32 f308, f302, f200; +sub.f32 f309, f308, f307; +mad.lo.s32 r8, r5, 576, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v4.f32 [r9], {f177, f206, f216, f226}; +st.shared.v4.f32 [r9+16], {f236, f246, f256, f266}; +st.shared.v4.f32 [r9+32], {f276, f286, f296, f306}; +barrier.sync 0; +mad.lo.s32 r10, r7, -44, r9; +ld.shared.f32 f310, [r10]; +ld.shared.f32 f311, [r10+48]; +ld.shared.f32 f312, [r10+96]; +ld.shared.f32 f313, [r10+144]; +ld.shared.f32 f314, [r10+192]; +ld.shared.f32 f315, [r10+240]; +ld.shared.f32 f316, [r10+288]; +ld.shared.f32 f317, [r10+336]; +ld.shared.f32 f318, [r10+384]; +ld.shared.f32 f319, [r10+432]; +ld.shared.f32 f320, [r10+480]; +ld.shared.f32 f321, [r10+528]; +barrier.sync 0; +st.shared.v4.f32 [r9], {f178, f209, f219, f229}; +st.shared.v4.f32 [r9+16], {f239, f249, f259, f269}; +st.shared.v4.f32 [r9+32], {f279, f289, f299, f309}; +barrier.sync 0; +ld.shared.f32 f322, [r10]; +ld.shared.f32 f323, [r10+48]; +ld.shared.f32 f324, [r10+96]; +ld.shared.f32 f325, [r10+144]; +ld.shared.f32 f326, [r10+192]; +ld.shared.f32 f327, [r10+240]; +ld.shared.f32 f328, [r10+288]; +ld.shared.f32 f329, [r10+336]; +ld.shared.f32 f330, [r10+384]; +ld.shared.f32 f331, [r10+432]; +ld.shared.f32 f332, [r10+480]; +ld.shared.f32 f333, [r10+528]; +add.f32 f334, f314, f318; +add.f32 f335, f310, f334; +add.f32 f336, f326, f330; +add.f32 f337, f322, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, 
f310, f338; +sub.f32 f340, f326, f330; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f322, f344; +sub.f32 f346, f314, f318; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f316, f320; +add.f32 f351, f312, f350; +add.f32 f352, f328, f332; +add.f32 f353, f324, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f312, f354; +sub.f32 f356, f328, f332; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f324, f360; +sub.f32 f362, f316, f320; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.f32 f366, f358, 0f3F000000; +mul.f32 f367, f364, 0f3F5DB3D7; +sub.f32 f368, f366, f367; +mul.f32 f369, f364, 0f3F000000; +fma.rn.f32 f370, f358, 0f3F5DB3D7, f369; +mul.f32 f371, f359, 0fBF000000; +mul.f32 f372, f365, 0f3F5DB3D7; +sub.f32 f373, f371, f372; +mul.f32 f374, f365, 0fBF000000; +fma.rn.f32 f375, f359, 0f3F5DB3D7, f374; +add.f32 f376, f335, f351; +add.f32 f377, f337, f353; +sub.f32 f378, f335, f351; +sub.f32 f379, f337, f353; +add.f32 f380, f342, f368; +add.f32 f381, f348, f370; +sub.f32 f382, f342, f368; +sub.f32 f383, f348, f370; +add.f32 f384, f343, f373; +add.f32 f385, f349, f375; +sub.f32 f386, f343, f373; +sub.f32 f387, f349, f375; +add.f32 f388, f315, f319; +add.f32 f389, f311, f388; +add.f32 f390, f327, f331; +add.f32 f391, f323, f390; +mul.f32 f392, f388, 0f3F000000; +sub.f32 f393, f311, f392; +sub.f32 f394, f327, f331; +mul.f32 f395, f394, 0fBF5DB3D7; +add.f32 f396, f395, f393; +sub.f32 f397, f393, f395; +mul.f32 f398, f390, 0f3F000000; +sub.f32 f399, f323, f398; +sub.f32 f400, f315, f319; +mul.f32 f401, f400, 0fBF5DB3D7; +sub.f32 f402, f399, f401; +add.f32 f403, f401, f399; +add.f32 f404, f317, f321; +add.f32 f405, f313, f404; +add.f32 f406, f329, f333; +add.f32 f407, f325, f406; +mul.f32 f408, 
f404, 0f3F000000; +sub.f32 f409, f313, f408; +sub.f32 f410, f329, f333; +mul.f32 f411, f410, 0fBF5DB3D7; +add.f32 f412, f411, f409; +sub.f32 f413, f409, f411; +mul.f32 f414, f406, 0f3F000000; +sub.f32 f415, f325, f414; +sub.f32 f416, f317, f321; +mul.f32 f417, f416, 0fBF5DB3D7; +sub.f32 f418, f415, f417; +add.f32 f419, f417, f415; +mul.f32 f420, f412, 0f3F000000; +mul.f32 f421, f418, 0f3F5DB3D7; +sub.f32 f422, f420, f421; +mul.f32 f423, f418, 0f3F000000; +fma.rn.f32 f424, f412, 0f3F5DB3D7, f423; +mul.f32 f425, f413, 0fBF000000; +mul.f32 f426, f419, 0f3F5DB3D7; +sub.f32 f427, f425, f426; +mul.f32 f428, f419, 0fBF000000; +fma.rn.f32 f429, f413, 0f3F5DB3D7, f428; +add.f32 f430, f389, f405; +add.f32 f431, f391, f407; +sub.f32 f432, f389, f405; +sub.f32 f433, f391, f407; +add.f32 f434, f396, f422; +add.f32 f435, f402, f424; +sub.f32 f436, f396, f422; +sub.f32 f437, f402, f424; +add.f32 f438, f397, f427; +add.f32 f439, f403, f429; +sub.f32 f440, f397, f427; +sub.f32 f441, f403, f429; +mul.f32 f442, f434, 0f3F5DB3D7; +mul.f32 f443, f435, 0f3F000000; +sub.f32 f444, f442, f443; +mul.f32 f445, f435, 0f3F5DB3D7; +fma.rn.f32 f446, f434, 0f3F000000, f445; +mul.f32 f447, f438, 0f3F000000; +mul.f32 f448, f439, 0f3F5DB3D7; +sub.f32 f449, f447, f448; +mul.f32 f450, f439, 0f3F000000; +fma.rn.f32 f451, f438, 0f3F5DB3D7, f450; +mul.f32 f452, f436, 0fBF000000; +mul.f32 f453, f437, 0f3F5DB3D7; +sub.f32 f454, f452, f453; +mul.f32 f455, f437, 0fBF000000; +fma.rn.f32 f456, f436, 0f3F5DB3D7, f455; +mul.f32 f457, f440, 0fBF5DB3D7; +mul.f32 f458, f441, 0f3F000000; +sub.f32 f459, f457, f458; +mul.f32 f460, f441, 0fBF5DB3D7; +fma.rn.f32 f461, f440, 0f3F000000, f460; +add.f32 %0, f376, f430; +add.f32 %1, f377, f431; +add.f32 %3, f381, f446; +add.f32 %2, f380, f444; +add.f32 %5, f385, f451; +add.f32 %4, f384, f449; +add.f32 %7, f379, f432; +sub.f32 %6, f378, f433; +add.f32 %9, f383, f456; +add.f32 %8, f382, f454; +add.f32 %11, f387, f461; +add.f32 %10, f386, f459; +sub.f32 %12, f376, f430; 
+sub.f32 %13, f377, f431; +sub.f32 %15, f381, f446; +sub.f32 %14, f380, f444; +sub.f32 %17, f385, f451; +sub.f32 %16, f384, f449; +sub.f32 %19, f379, f432; +add.f32 %18, f378, f433; +sub.f32 %21, f383, f456; +sub.f32 %20, f382, f454; +sub.f32 %23, f387, f461; +sub.f32 %22, f386, f459; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..ae199c49cc3ec --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_fwd.hpp.inc @@ -0,0 +1,938 @@ +#ifndef CUFFTDX_FFT_144_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_144_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<571, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 
r<11>; +.reg .f64 fd<485>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 1152, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %36, %47; +add.f64 fd50, %26, fd49; +add.f64 fd51, %38, %49; +add.f64 fd52, %27, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %26, fd53; +sub.f64 fd55, %38, %49; +mul.f64 fd56, fd55, 0d3FEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %27, fd59; +sub.f64 fd61, %36, %47; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %42, %52; +add.f64 fd66, %31, fd65; +add.f64 fd67, %43, %54; +add.f64 fd68, %33, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %31, fd69; +sub.f64 fd71, %43, %54; +mul.f64 fd72, fd71, 0d3FEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %33, fd75; +sub.f64 fd77, %42, %52; +mul.f64 fd78, fd77, 0d3FEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0dBFEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0dBFEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0dBFEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %39, %50; +add.f64 fd104, %28, fd103; +add.f64 fd105, %41, %51; +add.f64 fd106, %30, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %28, fd107; 
+sub.f64 fd109, %41, %51; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %30, fd113; +sub.f64 fd115, %39, %50; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %44, %55; +add.f64 fd120, %34, fd119; +add.f64 fd121, %46, %56; +add.f64 fd122, %35, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %34, fd123; +sub.f64 fd125, %46, %56; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %35, fd129; +sub.f64 fd131, %44, %55; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0dBFEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0dBFEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0dBFEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0dBFE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0dBFE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0dBFEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; 
+fma.rn.f64 fd166, fd153, 0dBFEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0dBFEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0dBFE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0dBFE0000000000000, fd175; +add.f64 fd177, fd91, fd145; +add.f64 fd178, fd92, fd146; +sub.f64 fd179, fd91, fd145; +sub.f64 fd180, fd92, fd146; +add.f64 fd181, fd95, fd159; +add.f64 fd182, fd96, fd161; +sub.f64 fd183, fd95, fd159; +sub.f64 fd184, fd96, fd161; +add.f64 fd185, fd99, fd164; +add.f64 fd186, fd100, fd166; +sub.f64 fd187, fd99, fd164; +sub.f64 fd188, fd100, fd166; +add.f64 fd189, fd93, fd148; +sub.f64 fd190, fd94, fd147; +sub.f64 fd191, fd93, fd148; +add.f64 fd192, fd94, fd147; +add.f64 fd193, fd97, fd169; +add.f64 fd194, fd98, fd171; +sub.f64 fd195, fd97, fd169; +sub.f64 fd196, fd98, fd171; +add.f64 fd197, fd101, fd174; +add.f64 fd198, fd102, fd176; +sub.f64 fd199, fd101, fd174; +sub.f64 fd200, fd102, fd176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd201, fd202}, [rd6]; +mul.f64 fd205, fd201, fd181; +mul.f64 fd206, fd202, fd182; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd201, fd182; +fma.rn.f64 fd209, fd202, fd181, fd208; +mul.f64 fd210, fd201, fd201; +mul.f64 fd211, fd202, fd202; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, fd201; +fma.rn.f64 fd214, fd202, fd201, fd213; +mul.f64 fd215, fd212, fd185; +mul.f64 fd216, fd214, fd186; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd212, fd186; +fma.rn.f64 fd219, fd214, fd185, fd218; +mul.f64 fd220, fd201, fd212; +mul.f64 fd221, fd202, fd214; +sub.f64 fd222, fd220, fd221; +mul.f64 
fd223, fd201, fd214; +fma.rn.f64 fd224, fd202, fd212, fd223; +mul.f64 fd225, fd222, fd189; +mul.f64 fd226, fd224, fd190; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd222, fd190; +fma.rn.f64 fd229, fd224, fd189, fd228; +mul.f64 fd230, fd201, fd222; +mul.f64 fd231, fd202, fd224; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd201, fd224; +fma.rn.f64 fd234, fd202, fd222, fd233; +mul.f64 fd235, fd232, fd193; +mul.f64 fd236, fd234, fd194; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd232, fd194; +fma.rn.f64 fd239, fd234, fd193, fd238; +mul.f64 fd240, fd201, fd232; +mul.f64 fd241, fd202, fd234; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd201, fd234; +fma.rn.f64 fd244, fd202, fd232, fd243; +mul.f64 fd245, fd242, fd197; +mul.f64 fd246, fd244, fd198; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd198; +fma.rn.f64 fd249, fd244, fd197, fd248; +mul.f64 fd250, fd201, fd242; +mul.f64 fd251, fd202, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd201, fd244; +fma.rn.f64 fd254, fd202, fd242, fd253; +mul.f64 fd255, fd252, fd179; +mul.f64 fd256, fd254, fd180; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd180; +fma.rn.f64 fd259, fd254, fd179, fd258; +ld.global.v2.f64 {fd260, fd261}, [rd6+192]; +mul.f64 fd264, fd260, fd183; +mul.f64 fd265, fd261, fd184; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd260, fd184; +fma.rn.f64 fd268, fd261, fd183, fd267; +mul.f64 fd269, fd201, fd260; +mul.f64 fd270, fd202, fd261; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd201, fd261; +fma.rn.f64 fd273, fd202, fd260, fd272; +mul.f64 fd274, fd271, fd187; +mul.f64 fd275, fd273, fd188; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd188; +fma.rn.f64 fd278, fd273, fd187, fd277; +mul.f64 fd279, fd201, fd271; +mul.f64 fd280, fd202, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd201, fd273; +fma.rn.f64 fd283, fd202, fd271, fd282; +mul.f64 fd284, fd281, fd191; +mul.f64 fd285, fd283, fd192; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd192; +fma.rn.f64 
fd288, fd283, fd191, fd287; +mul.f64 fd289, fd201, fd281; +mul.f64 fd290, fd202, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd201, fd283; +fma.rn.f64 fd293, fd202, fd281, fd292; +mul.f64 fd294, fd291, fd195; +mul.f64 fd295, fd293, fd196; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd196; +fma.rn.f64 fd298, fd293, fd195, fd297; +mul.f64 fd299, fd201, fd291; +mul.f64 fd300, fd202, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd201, fd293; +fma.rn.f64 fd303, fd202, fd291, fd302; +mul.f64 fd304, fd301, fd199; +mul.f64 fd305, fd303, fd200; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd200; +fma.rn.f64 fd308, fd303, fd199, fd307; +mad.lo.s32 r8, r5, 1152, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +st.shared.v2.f64 [r9], {fd177, fd207}; +st.shared.v2.f64 [r9+16], {fd217, fd227}; +st.shared.v2.f64 [r9+32], {fd237, fd247}; +st.shared.v2.f64 [r9+48], {fd257, fd266}; +st.shared.v2.f64 [r9+64], {fd276, fd286}; +st.shared.v2.f64 [r9+80], {fd296, fd306}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.f64 fd309, [r10]; +ld.shared.f64 fd310, [r10+96]; +ld.shared.f64 fd311, [r10+192]; +ld.shared.f64 fd312, [r10+288]; +ld.shared.f64 fd313, [r10+384]; +ld.shared.f64 fd314, [r10+480]; +ld.shared.f64 fd315, [r10+576]; +ld.shared.f64 fd316, [r10+672]; +ld.shared.f64 fd317, [r10+768]; +ld.shared.f64 fd318, [r10+864]; +ld.shared.f64 fd319, [r10+960]; +ld.shared.f64 fd320, [r10+1056]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd178, fd209}; +st.shared.v2.f64 [r9+16], {fd219, fd229}; +st.shared.v2.f64 [r9+32], {fd239, fd249}; +st.shared.v2.f64 [r9+48], {fd259, fd268}; +st.shared.v2.f64 [r9+64], {fd278, fd288}; +st.shared.v2.f64 [r9+80], {fd298, fd308}; +barrier.sync 0; +ld.shared.f64 fd321, [r10]; +ld.shared.f64 fd322, [r10+96]; +ld.shared.f64 fd323, [r10+192]; +ld.shared.f64 fd324, [r10+288]; +ld.shared.f64 fd325, [r10+384]; +ld.shared.f64 fd326, [r10+480]; +ld.shared.f64 fd327, [r10+576]; +ld.shared.f64 fd328, [r10+672]; +ld.shared.f64 
fd329, [r10+768]; +ld.shared.f64 fd330, [r10+864]; +ld.shared.f64 fd331, [r10+960]; +ld.shared.f64 fd332, [r10+1056]; +add.f64 fd333, fd313, fd317; +add.f64 fd334, fd309, fd333; +add.f64 fd335, fd325, fd329; +add.f64 fd336, fd321, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd309, fd337; +sub.f64 fd339, fd325, fd329; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd321, fd343; +sub.f64 fd345, fd313, fd317; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd315, fd319; +add.f64 fd350, fd311, fd349; +add.f64 fd351, fd327, fd331; +add.f64 fd352, fd323, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd311, fd353; +sub.f64 fd355, fd327, fd331; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd323, fd359; +sub.f64 fd361, fd315, fd319; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.f64 fd365, fd357, 0d3FE0000000000000; +mul.f64 fd366, fd363, 0dBFEBB67AE8584CAA; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd363, 0d3FE0000000000000; +fma.rn.f64 fd369, fd357, 0dBFEBB67AE8584CAA, fd368; +mul.f64 fd370, fd358, 0dBFE0000000000000; +mul.f64 fd371, fd364, 0dBFEBB67AE8584CAA; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd364, 0dBFE0000000000000; +fma.rn.f64 fd374, fd358, 0dBFEBB67AE8584CAA, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd336, fd352; +sub.f64 fd377, fd334, fd350; +sub.f64 fd378, fd336, fd352; +add.f64 fd379, fd341, fd367; +add.f64 fd380, fd347, fd369; +sub.f64 fd381, fd341, fd367; +sub.f64 fd382, fd347, fd369; +add.f64 fd383, fd342, fd372; +add.f64 fd384, fd348, fd374; +sub.f64 fd385, fd342, fd372; +sub.f64 fd386, fd348, fd374; +add.f64 fd387, fd314, fd318; 
+add.f64 fd388, fd310, fd387; +add.f64 fd389, fd326, fd330; +add.f64 fd390, fd322, fd389; +mul.f64 fd391, fd387, 0d3FE0000000000000; +sub.f64 fd392, fd310, fd391; +sub.f64 fd393, fd326, fd330; +mul.f64 fd394, fd393, 0d3FEBB67AE8584CAA; +add.f64 fd395, fd394, fd392; +sub.f64 fd396, fd392, fd394; +mul.f64 fd397, fd389, 0d3FE0000000000000; +sub.f64 fd398, fd322, fd397; +sub.f64 fd399, fd314, fd318; +mul.f64 fd400, fd399, 0d3FEBB67AE8584CAA; +sub.f64 fd401, fd398, fd400; +add.f64 fd402, fd400, fd398; +add.f64 fd403, fd316, fd320; +add.f64 fd404, fd312, fd403; +add.f64 fd405, fd328, fd332; +add.f64 fd406, fd324, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd312, fd407; +sub.f64 fd409, fd328, fd332; +mul.f64 fd410, fd409, 0d3FEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 0d3FE0000000000000; +sub.f64 fd414, fd324, fd413; +sub.f64 fd415, fd316, fd320; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +mul.f64 fd419, fd411, 0d3FE0000000000000; +mul.f64 fd420, fd417, 0dBFEBB67AE8584CAA; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd417, 0d3FE0000000000000; +fma.rn.f64 fd423, fd411, 0dBFEBB67AE8584CAA, fd422; +mul.f64 fd424, fd412, 0dBFE0000000000000; +mul.f64 fd425, fd418, 0dBFEBB67AE8584CAA; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd418, 0dBFE0000000000000; +fma.rn.f64 fd428, fd412, 0dBFEBB67AE8584CAA, fd427; +add.f64 fd429, fd388, fd404; +add.f64 fd430, fd390, fd406; +sub.f64 fd431, fd388, fd404; +sub.f64 fd432, fd390, fd406; +add.f64 fd433, fd395, fd421; +add.f64 fd434, fd401, fd423; +sub.f64 fd435, fd395, fd421; +sub.f64 fd436, fd401, fd423; +add.f64 fd437, fd396, fd426; +add.f64 fd438, fd402, fd428; +sub.f64 fd439, fd396, fd426; +sub.f64 fd440, fd402, fd428; +mul.f64 fd441, fd433, 0d3FEBB67AE8584CAA; +mul.f64 fd442, fd434, 0dBFE0000000000000; +sub.f64 fd443, fd441, fd442; +mul.f64 fd444, fd434, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd445, 
fd433, 0dBFE0000000000000, fd444; +mul.f64 fd446, fd437, 0d3FE0000000000000; +mul.f64 fd447, fd438, 0dBFEBB67AE8584CAA; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd438, 0d3FE0000000000000; +fma.rn.f64 fd450, fd437, 0dBFEBB67AE8584CAA, fd449; +mul.f64 fd451, fd435, 0dBFE0000000000000; +mul.f64 fd452, fd436, 0dBFEBB67AE8584CAA; +sub.f64 fd453, fd451, fd452; +mul.f64 fd454, fd436, 0dBFE0000000000000; +fma.rn.f64 fd455, fd435, 0dBFEBB67AE8584CAA, fd454; +mul.f64 fd456, fd439, 0dBFEBB67AE8584CAA; +mul.f64 fd457, fd440, 0dBFE0000000000000; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd440, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd460, fd439, 0dBFE0000000000000, fd459; +add.f64 %0, fd375, fd429; +add.f64 %1, fd376, fd430; +add.f64 %3, fd380, fd445; +add.f64 %2, fd379, fd443; +add.f64 %5, fd384, fd450; +add.f64 %4, fd383, fd448; +sub.f64 %7, fd378, fd431; +add.f64 %6, fd377, fd432; +add.f64 %9, fd382, fd455; +add.f64 %8, fd381, fd453; +add.f64 %11, fd386, fd460; +add.f64 %10, fd385, fd458; +sub.f64 %12, fd375, fd429; +sub.f64 %13, fd376, fd430; +sub.f64 %15, fd380, fd445; +sub.f64 %14, fd379, fd443; +sub.f64 %17, fd384, fd450; +sub.f64 %16, fd383, fd448; +add.f64 %19, fd378, fd431; +sub.f64 %18, fd377, fd432; +sub.f64 %21, fd382, fd455; +sub.f64 %20, fd381, fd453; +sub.f64 %23, fd386, fd460; +sub.f64 %22, fd385, fd458; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), 
"d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + + +/* cufftdx_private_function<572, double, 1>: appears to be machine-generated inline PTX; prefer regenerating over hand edits. Operands: rmem holds 12 complex doubles per thread and is both read (%26..%56) and written (%0..%23); smem (%24) is the shared-memory base offset; lut_dp_12_144 (%25) is a table read with ld.global at [lut + 16*(tid.x % 12)] and at +192 from there. Several .y inputs are listed twice in the operand list; the PTX references only one operand of each duplicate pair (e.g. %30, never %29). Flow: register butterflies using 0d3FE0000000000000 (0.5) and 0d3FEBB67AE8584CAA (about 0.866), multiplication by the loaded table values and products of them, a single shared-memory exchange that stores and reloads both components of each value together with v2.f64 accesses (barrier.sync 0 before the stores and before the loads), then a second set of butterflies. Shared-memory addressing: 2304 bytes per tid.y and per (tid.x / 12) group, 192 bytes per (tid.x % 12) lane on store, and a 16-byte lane offset with 192-byte stride on load. NOTE(review): presumably a forward 144-point FFT variant that uses a larger shared-memory footprint to exchange data in one round - confirm against the cuFFTDx documentation. */ +template<> __forceinline__ __device__ void cufftdx_private_function<572, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<509>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 2304, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %36, %47; +add.f64 fd50, %26, fd49; +add.f64 fd51, %38, %49; +add.f64 fd52, %27, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %26, fd53; +sub.f64 fd55, %38, %49; +mul.f64 fd56, fd55, 0d3FEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %27, fd59; +sub.f64 fd61, %36, %47; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %42, %52; +add.f64 fd66, %31, fd65; +add.f64 fd67, %43, %54; +add.f64 fd68, %33, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %31, fd69; +sub.f64 fd71, %43, %54; +mul.f64 fd72, fd71, 0d3FEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %33, fd75; +sub.f64 fd77, %42, %52; +mul.f64 fd78, fd77, 0d3FEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0dBFEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0dBFEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0dBFEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64
fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %39, %50; +add.f64 fd104, %28, fd103; +add.f64 fd105, %41, %51; +add.f64 fd106, %30, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %28, fd107; +sub.f64 fd109, %41, %51; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %30, fd113; +sub.f64 fd115, %39, %50; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %44, %55; +add.f64 fd120, %34, fd119; +add.f64 fd121, %46, %56; +add.f64 fd122, %35, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %34, fd123; +sub.f64 fd125, %46, %56; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %35, fd129; +sub.f64 fd131, %44, %55; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0dBFEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0dBFEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0dBFEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, 
fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0dBFE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0dBFE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0dBFEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0dBFEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0dBFEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0dBFE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0dBFE0000000000000, fd175; +sub.f64 fd177, fd91, fd145; +sub.f64 fd178, fd92, fd146; +add.f64 fd179, fd95, fd159; +add.f64 fd180, fd96, fd161; +sub.f64 fd181, fd95, fd159; +sub.f64 fd182, fd96, fd161; +add.f64 fd183, fd99, fd164; +add.f64 fd184, fd100, fd166; +sub.f64 fd185, fd99, fd164; +sub.f64 fd186, fd100, fd166; +add.f64 fd187, fd93, fd148; +sub.f64 fd188, fd94, fd147; +sub.f64 fd189, fd93, fd148; +add.f64 fd190, fd94, fd147; +add.f64 fd191, fd97, fd169; +add.f64 fd192, fd98, fd171; +sub.f64 fd193, fd97, fd169; +sub.f64 fd194, fd98, fd171; +add.f64 fd195, fd101, fd174; +add.f64 fd196, fd102, fd176; +sub.f64 fd197, fd101, fd174; +sub.f64 fd198, fd102, fd176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2304, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd199, fd200}, [rd6]; +mul.f64 fd203, fd199, fd179; +mul.f64 fd204, fd200, fd180; +mul.f64 fd205, fd199, fd180; +mul.f64 fd206, fd199, fd199; +mul.f64 fd207, fd200, 
fd200; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd200, fd199; +fma.rn.f64 fd210, fd200, fd199, fd209; +mul.f64 fd211, fd208, fd183; +mul.f64 fd212, fd210, fd184; +mul.f64 fd213, fd208, fd184; +mul.f64 fd214, fd199, fd208; +mul.f64 fd215, fd200, fd210; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd199, fd210; +fma.rn.f64 fd218, fd200, fd208, fd217; +mul.f64 fd219, fd216, fd187; +mul.f64 fd220, fd218, fd188; +mul.f64 fd221, fd216, fd188; +mul.f64 fd222, fd199, fd216; +mul.f64 fd223, fd200, fd218; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd199, fd218; +fma.rn.f64 fd226, fd200, fd216, fd225; +mul.f64 fd227, fd224, fd191; +mul.f64 fd228, fd226, fd192; +mul.f64 fd229, fd224, fd192; +mul.f64 fd230, fd199, fd224; +mul.f64 fd231, fd200, fd226; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd199, fd226; +fma.rn.f64 fd234, fd200, fd224, fd233; +mul.f64 fd235, fd232, fd195; +mul.f64 fd236, fd234, fd196; +mul.f64 fd237, fd232, fd196; +mul.f64 fd238, fd199, fd232; +mul.f64 fd239, fd200, fd234; +sub.f64 fd240, fd238, fd239; +mul.f64 fd241, fd199, fd234; +fma.rn.f64 fd242, fd200, fd232, fd241; +mul.f64 fd243, fd240, fd177; +mul.f64 fd244, fd242, fd178; +mul.f64 fd245, fd240, fd178; +ld.global.v2.f64 {fd246, fd247}, [rd6+192]; +mul.f64 fd250, fd246, fd181; +mul.f64 fd251, fd247, fd182; +mul.f64 fd252, fd246, fd182; +mul.f64 fd253, fd199, fd246; +mul.f64 fd254, fd200, fd247; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd199, fd247; +fma.rn.f64 fd257, fd200, fd246, fd256; +mul.f64 fd258, fd255, fd185; +mul.f64 fd259, fd257, fd186; +mul.f64 fd260, fd255, fd186; +mul.f64 fd261, fd199, fd255; +mul.f64 fd262, fd200, fd257; +sub.f64 fd263, fd261, fd262; +mul.f64 fd264, fd199, fd257; +fma.rn.f64 fd265, fd200, fd255, fd264; +mul.f64 fd266, fd263, fd189; +mul.f64 fd267, fd265, fd190; +mul.f64 fd268, fd263, fd190; +mul.f64 fd269, fd199, fd263; +mul.f64 fd270, fd200, fd265; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd199, fd265; +fma.rn.f64 fd273, fd200, fd263, fd272; 
+mul.f64 fd274, fd271, fd193; +mul.f64 fd275, fd273, fd194; +mul.f64 fd276, fd271, fd194; +mul.f64 fd277, fd199, fd271; +mul.f64 fd278, fd200, fd273; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd199, fd273; +fma.rn.f64 fd281, fd200, fd271, fd280; +mul.f64 fd282, fd279, fd197; +mul.f64 fd283, fd281, fd198; +mul.f64 fd284, fd279, fd198; +barrier.sync 0; +mad.lo.s32 r9, r7, 192, r8; +add.f64 fd285, fd92, fd146; +add.f64 fd286, fd91, fd145; +st.shared.v2.f64 [r9], {fd286, fd285}; +fma.rn.f64 fd287, fd200, fd179, fd205; +sub.f64 fd288, fd203, fd204; +st.shared.v2.f64 [r9+16], {fd288, fd287}; +fma.rn.f64 fd289, fd210, fd183, fd213; +sub.f64 fd290, fd211, fd212; +st.shared.v2.f64 [r9+32], {fd290, fd289}; +sub.f64 fd291, fd219, fd220; +fma.rn.f64 fd292, fd218, fd187, fd221; +st.shared.v2.f64 [r9+48], {fd291, fd292}; +sub.f64 fd293, fd227, fd228; +fma.rn.f64 fd294, fd226, fd191, fd229; +st.shared.v2.f64 [r9+64], {fd293, fd294}; +fma.rn.f64 fd295, fd234, fd195, fd237; +sub.f64 fd296, fd235, fd236; +st.shared.v2.f64 [r9+80], {fd296, fd295}; +fma.rn.f64 fd297, fd242, fd177, fd245; +sub.f64 fd298, fd243, fd244; +st.shared.v2.f64 [r9+96], {fd298, fd297}; +fma.rn.f64 fd299, fd247, fd181, fd252; +sub.f64 fd300, fd250, fd251; +st.shared.v2.f64 [r9+112], {fd300, fd299}; +fma.rn.f64 fd301, fd257, fd185, fd260; +sub.f64 fd302, fd258, fd259; +st.shared.v2.f64 [r9+128], {fd302, fd301}; +sub.f64 fd303, fd266, fd267; +fma.rn.f64 fd304, fd265, fd189, fd268; +st.shared.v2.f64 [r9+144], {fd303, fd304}; +sub.f64 fd305, fd274, fd275; +fma.rn.f64 fd306, fd273, fd193, fd276; +st.shared.v2.f64 [r9+160], {fd305, fd306}; +fma.rn.f64 fd307, fd281, fd197, fd284; +sub.f64 fd308, fd282, fd283; +st.shared.v2.f64 [r9+176], {fd308, fd307}; +barrier.sync 0; +mad.lo.s32 r10, r7, -176, r9; +ld.shared.v2.f64 {fd309, fd310}, [r10]; +ld.shared.v2.f64 {fd313, fd314}, [r10+192]; +ld.shared.v2.f64 {fd317, fd318}, [r10+384]; +ld.shared.v2.f64 {fd321, fd322}, [r10+576]; +ld.shared.v2.f64 {fd325, fd326}, 
[r10+768]; +ld.shared.v2.f64 {fd329, fd330}, [r10+960]; +ld.shared.v2.f64 {fd333, fd334}, [r10+1152]; +ld.shared.v2.f64 {fd337, fd338}, [r10+1344]; +ld.shared.v2.f64 {fd341, fd342}, [r10+1536]; +ld.shared.v2.f64 {fd345, fd346}, [r10+1728]; +ld.shared.v2.f64 {fd349, fd350}, [r10+1920]; +ld.shared.v2.f64 {fd353, fd354}, [r10+2112]; +add.f64 fd357, fd325, fd341; +add.f64 fd358, fd309, fd357; +add.f64 fd359, fd326, fd342; +add.f64 fd360, fd310, fd359; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, fd309, fd361; +sub.f64 fd363, fd326, fd342; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +mul.f64 fd367, fd359, 0d3FE0000000000000; +sub.f64 fd368, fd310, fd367; +sub.f64 fd369, fd325, fd341; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, fd333, fd349; +add.f64 fd374, fd317, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd318, fd375; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, fd317, fd377; +sub.f64 fd379, fd334, fd350; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +mul.f64 fd383, fd375, 0d3FE0000000000000; +sub.f64 fd384, fd318, fd383; +sub.f64 fd385, fd333, fd349; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd389, fd381, 0d3FE0000000000000; +mul.f64 fd390, fd387, 0dBFEBB67AE8584CAA; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd387, 0d3FE0000000000000; +fma.rn.f64 fd393, fd381, 0dBFEBB67AE8584CAA, fd392; +mul.f64 fd394, fd382, 0dBFE0000000000000; +mul.f64 fd395, fd388, 0dBFEBB67AE8584CAA; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd388, 0dBFE0000000000000; +fma.rn.f64 fd398, fd382, 0dBFEBB67AE8584CAA, fd397; +add.f64 fd399, fd358, fd374; +add.f64 fd400, fd360, fd376; +sub.f64 fd401, fd358, fd374; +sub.f64 fd402, fd360, fd376; +add.f64 fd403, fd365, fd391; +add.f64 fd404, fd371, fd393; 
+sub.f64 fd405, fd365, fd391; +sub.f64 fd406, fd371, fd393; +add.f64 fd407, fd366, fd396; +add.f64 fd408, fd372, fd398; +sub.f64 fd409, fd366, fd396; +sub.f64 fd410, fd372, fd398; +add.f64 fd411, fd329, fd345; +add.f64 fd412, fd313, fd411; +add.f64 fd413, fd330, fd346; +add.f64 fd414, fd314, fd413; +mul.f64 fd415, fd411, 0d3FE0000000000000; +sub.f64 fd416, fd313, fd415; +sub.f64 fd417, fd330, fd346; +mul.f64 fd418, fd417, 0d3FEBB67AE8584CAA; +add.f64 fd419, fd418, fd416; +sub.f64 fd420, fd416, fd418; +mul.f64 fd421, fd413, 0d3FE0000000000000; +sub.f64 fd422, fd314, fd421; +sub.f64 fd423, fd329, fd345; +mul.f64 fd424, fd423, 0d3FEBB67AE8584CAA; +sub.f64 fd425, fd422, fd424; +add.f64 fd426, fd424, fd422; +add.f64 fd427, fd337, fd353; +add.f64 fd428, fd321, fd427; +add.f64 fd429, fd338, fd354; +add.f64 fd430, fd322, fd429; +mul.f64 fd431, fd427, 0d3FE0000000000000; +sub.f64 fd432, fd321, fd431; +sub.f64 fd433, fd338, fd354; +mul.f64 fd434, fd433, 0d3FEBB67AE8584CAA; +add.f64 fd435, fd434, fd432; +sub.f64 fd436, fd432, fd434; +mul.f64 fd437, fd429, 0d3FE0000000000000; +sub.f64 fd438, fd322, fd437; +sub.f64 fd439, fd337, fd353; +mul.f64 fd440, fd439, 0d3FEBB67AE8584CAA; +sub.f64 fd441, fd438, fd440; +add.f64 fd442, fd440, fd438; +mul.f64 fd443, fd435, 0d3FE0000000000000; +mul.f64 fd444, fd441, 0dBFEBB67AE8584CAA; +sub.f64 fd445, fd443, fd444; +mul.f64 fd446, fd441, 0d3FE0000000000000; +fma.rn.f64 fd447, fd435, 0dBFEBB67AE8584CAA, fd446; +mul.f64 fd448, fd436, 0dBFE0000000000000; +mul.f64 fd449, fd442, 0dBFEBB67AE8584CAA; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd442, 0dBFE0000000000000; +fma.rn.f64 fd452, fd436, 0dBFEBB67AE8584CAA, fd451; +add.f64 fd453, fd412, fd428; +add.f64 fd454, fd414, fd430; +sub.f64 fd455, fd412, fd428; +sub.f64 fd456, fd414, fd430; +add.f64 fd457, fd419, fd445; +add.f64 fd458, fd425, fd447; +sub.f64 fd459, fd419, fd445; +sub.f64 fd460, fd425, fd447; +add.f64 fd461, fd420, fd450; +add.f64 fd462, fd426, fd452; +sub.f64 fd463, fd420, fd450; 
+sub.f64 fd464, fd426, fd452; +mul.f64 fd465, fd457, 0d3FEBB67AE8584CAA; +mul.f64 fd466, fd458, 0dBFE0000000000000; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd458, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd469, fd457, 0dBFE0000000000000, fd468; +mul.f64 fd470, fd461, 0d3FE0000000000000; +mul.f64 fd471, fd462, 0dBFEBB67AE8584CAA; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd462, 0d3FE0000000000000; +fma.rn.f64 fd474, fd461, 0dBFEBB67AE8584CAA, fd473; +mul.f64 fd475, fd459, 0dBFE0000000000000; +mul.f64 fd476, fd460, 0dBFEBB67AE8584CAA; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd460, 0dBFE0000000000000; +fma.rn.f64 fd479, fd459, 0dBFEBB67AE8584CAA, fd478; +mul.f64 fd480, fd463, 0dBFEBB67AE8584CAA; +mul.f64 fd481, fd464, 0dBFE0000000000000; +sub.f64 fd482, fd480, fd481; +mul.f64 fd483, fd464, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd484, fd463, 0dBFE0000000000000, fd483; +add.f64 %1, fd400, fd454; +add.f64 %0, fd399, fd453; +add.f64 %3, fd404, fd469; +add.f64 %2, fd403, fd467; +add.f64 %5, fd408, fd474; +add.f64 %4, fd407, fd472; +sub.f64 %7, fd402, fd455; +add.f64 %6, fd401, fd456; +add.f64 %9, fd406, fd479; +add.f64 %8, fd405, fd477; +add.f64 %11, fd410, fd484; +add.f64 %10, fd409, fd482; +sub.f64 %13, fd400, fd454; +sub.f64 %12, fd399, fd453; +sub.f64 %15, fd404, fd469; +sub.f64 %14, fd403, fd467; +sub.f64 %17, fd408, fd474; +sub.f64 %16, fd407, fd472; +add.f64 %19, fd402, fd455; +sub.f64 %18, fd401, fd456; +sub.f64 %21, fd406, fd479; +sub.f64 %20, fd405, fd477; +sub.f64 %23, fd410, fd484; +sub.f64 %22, fd409, fd482; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_144), 
"d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..b19f69833dfcc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_144_fp64_inv.hpp.inc @@ -0,0 +1,938 @@ +#ifndef CUFFTDX_FFT_144_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_144_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<742, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<485>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 1152, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %36, %47; +add.f64 fd50, %26, fd49; +add.f64 fd51, %38, %49; +add.f64 fd52, %27, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %26, fd53; +sub.f64 fd55, %38, %49; +mul.f64 fd56, fd55, 0dBFEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %27, fd59; +sub.f64 fd61, %36, %47; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %42, %52; +add.f64 fd66, %31, fd65; +add.f64 fd67, %43, %54; +add.f64 fd68, %33, fd67; +mul.f64 fd69, fd65, 
0d3FE0000000000000; +sub.f64 fd70, %31, fd69; +sub.f64 fd71, %43, %54; +mul.f64 fd72, fd71, 0dBFEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %33, fd75; +sub.f64 fd77, %42, %52; +mul.f64 fd78, fd77, 0dBFEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0d3FEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0d3FEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0d3FEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %39, %50; +add.f64 fd104, %28, fd103; +add.f64 fd105, %41, %51; +add.f64 fd106, %30, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %28, fd107; +sub.f64 fd109, %41, %51; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %30, fd113; +sub.f64 fd115, %39, %50; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %44, %55; +add.f64 fd120, %34, fd119; +add.f64 fd121, %46, %56; +add.f64 fd122, %35, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %34, fd123; +sub.f64 fd125, %46, %56; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %35, fd129; +sub.f64 fd131, %44, %55; 
+mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0d3FEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0d3FEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0d3FEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0d3FE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0d3FE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0d3FEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0d3FEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0d3FEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0d3FE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0d3FE0000000000000, fd175; +add.f64 fd177, fd91, fd145; +add.f64 fd178, fd92, fd146; +sub.f64 fd179, fd91, fd145; +sub.f64 fd180, fd92, fd146; +add.f64 fd181, fd95, fd159; +add.f64 fd182, fd96, fd161; +sub.f64 fd183, fd95, fd159; +sub.f64 fd184, fd96, fd161; +add.f64 fd185, fd99, fd164; 
+add.f64 fd186, fd100, fd166; +sub.f64 fd187, fd99, fd164; +sub.f64 fd188, fd100, fd166; +sub.f64 fd189, fd93, fd148; +add.f64 fd190, fd94, fd147; +add.f64 fd191, fd93, fd148; +sub.f64 fd192, fd94, fd147; +add.f64 fd193, fd97, fd169; +add.f64 fd194, fd98, fd171; +sub.f64 fd195, fd97, fd169; +sub.f64 fd196, fd98, fd171; +add.f64 fd197, fd101, fd174; +add.f64 fd198, fd102, fd176; +sub.f64 fd199, fd101, fd174; +sub.f64 fd200, fd102, fd176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd201, fd202}, [rd6]; +mul.f64 fd205, fd182, fd202; +fma.rn.f64 fd206, fd201, fd181, fd205; +mul.f64 fd207, fd181, fd202; +mul.f64 fd208, fd201, fd182; +sub.f64 fd209, fd208, fd207; +mul.f64 fd210, fd201, fd201; +mul.f64 fd211, fd202, fd202; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, fd201; +fma.rn.f64 fd214, fd202, fd201, fd213; +mul.f64 fd215, fd186, fd214; +fma.rn.f64 fd216, fd212, fd185, fd215; +mul.f64 fd217, fd185, fd214; +mul.f64 fd218, fd212, fd186; +sub.f64 fd219, fd218, fd217; +mul.f64 fd220, fd201, fd212; +mul.f64 fd221, fd202, fd214; +sub.f64 fd222, fd220, fd221; +mul.f64 fd223, fd201, fd214; +fma.rn.f64 fd224, fd202, fd212, fd223; +mul.f64 fd225, fd190, fd224; +fma.rn.f64 fd226, fd222, fd189, fd225; +mul.f64 fd227, fd189, fd224; +mul.f64 fd228, fd222, fd190; +sub.f64 fd229, fd228, fd227; +mul.f64 fd230, fd201, fd222; +mul.f64 fd231, fd202, fd224; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd201, fd224; +fma.rn.f64 fd234, fd202, fd222, fd233; +mul.f64 fd235, fd194, fd234; +fma.rn.f64 fd236, fd232, fd193, fd235; +mul.f64 fd237, fd193, fd234; +mul.f64 fd238, fd232, fd194; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd201, fd232; +mul.f64 fd241, fd202, fd234; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd201, fd234; +fma.rn.f64 fd244, fd202, fd232, fd243; +mul.f64 fd245, fd198, fd244; +fma.rn.f64 
fd246, fd242, fd197, fd245; +mul.f64 fd247, fd197, fd244; +mul.f64 fd248, fd242, fd198; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd201, fd242; +mul.f64 fd251, fd202, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd201, fd244; +fma.rn.f64 fd254, fd202, fd242, fd253; +mul.f64 fd255, fd180, fd254; +fma.rn.f64 fd256, fd252, fd179, fd255; +mul.f64 fd257, fd179, fd254; +mul.f64 fd258, fd252, fd180; +sub.f64 fd259, fd258, fd257; +ld.global.v2.f64 {fd260, fd261}, [rd6+192]; +mul.f64 fd264, fd184, fd261; +fma.rn.f64 fd265, fd260, fd183, fd264; +mul.f64 fd266, fd183, fd261; +mul.f64 fd267, fd260, fd184; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd201, fd260; +mul.f64 fd270, fd202, fd261; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd201, fd261; +fma.rn.f64 fd273, fd202, fd260, fd272; +mul.f64 fd274, fd188, fd273; +fma.rn.f64 fd275, fd271, fd187, fd274; +mul.f64 fd276, fd187, fd273; +mul.f64 fd277, fd271, fd188; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd201, fd271; +mul.f64 fd280, fd202, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd201, fd273; +fma.rn.f64 fd283, fd202, fd271, fd282; +mul.f64 fd284, fd192, fd283; +fma.rn.f64 fd285, fd281, fd191, fd284; +mul.f64 fd286, fd191, fd283; +mul.f64 fd287, fd281, fd192; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd201, fd281; +mul.f64 fd290, fd202, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd201, fd283; +fma.rn.f64 fd293, fd202, fd281, fd292; +mul.f64 fd294, fd196, fd293; +fma.rn.f64 fd295, fd291, fd195, fd294; +mul.f64 fd296, fd195, fd293; +mul.f64 fd297, fd291, fd196; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd201, fd291; +mul.f64 fd300, fd202, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd201, fd293; +fma.rn.f64 fd303, fd202, fd291, fd302; +mul.f64 fd304, fd200, fd303; +fma.rn.f64 fd305, fd301, fd199, fd304; +mul.f64 fd306, fd199, fd303; +mul.f64 fd307, fd301, fd200; +sub.f64 fd308, fd307, fd306; +mad.lo.s32 r8, r5, 1152, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; 
+st.shared.v2.f64 [r9], {fd177, fd206}; +st.shared.v2.f64 [r9+16], {fd216, fd226}; +st.shared.v2.f64 [r9+32], {fd236, fd246}; +st.shared.v2.f64 [r9+48], {fd256, fd265}; +st.shared.v2.f64 [r9+64], {fd275, fd285}; +st.shared.v2.f64 [r9+80], {fd295, fd305}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.f64 fd309, [r10]; +ld.shared.f64 fd310, [r10+96]; +ld.shared.f64 fd311, [r10+192]; +ld.shared.f64 fd312, [r10+288]; +ld.shared.f64 fd313, [r10+384]; +ld.shared.f64 fd314, [r10+480]; +ld.shared.f64 fd315, [r10+576]; +ld.shared.f64 fd316, [r10+672]; +ld.shared.f64 fd317, [r10+768]; +ld.shared.f64 fd318, [r10+864]; +ld.shared.f64 fd319, [r10+960]; +ld.shared.f64 fd320, [r10+1056]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd178, fd209}; +st.shared.v2.f64 [r9+16], {fd219, fd229}; +st.shared.v2.f64 [r9+32], {fd239, fd249}; +st.shared.v2.f64 [r9+48], {fd259, fd268}; +st.shared.v2.f64 [r9+64], {fd278, fd288}; +st.shared.v2.f64 [r9+80], {fd298, fd308}; +barrier.sync 0; +ld.shared.f64 fd321, [r10]; +ld.shared.f64 fd322, [r10+96]; +ld.shared.f64 fd323, [r10+192]; +ld.shared.f64 fd324, [r10+288]; +ld.shared.f64 fd325, [r10+384]; +ld.shared.f64 fd326, [r10+480]; +ld.shared.f64 fd327, [r10+576]; +ld.shared.f64 fd328, [r10+672]; +ld.shared.f64 fd329, [r10+768]; +ld.shared.f64 fd330, [r10+864]; +ld.shared.f64 fd331, [r10+960]; +ld.shared.f64 fd332, [r10+1056]; +add.f64 fd333, fd313, fd317; +add.f64 fd334, fd309, fd333; +add.f64 fd335, fd325, fd329; +add.f64 fd336, fd321, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd309, fd337; +sub.f64 fd339, fd325, fd329; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd321, fd343; +sub.f64 fd345, fd313, fd317; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd315, fd319; +add.f64 fd350, fd311, fd349; +add.f64 fd351, fd327, fd331; 
+add.f64 fd352, fd323, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd311, fd353; +sub.f64 fd355, fd327, fd331; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd323, fd359; +sub.f64 fd361, fd315, fd319; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.f64 fd365, fd357, 0d3FE0000000000000; +mul.f64 fd366, fd363, 0d3FEBB67AE8584CAA; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd363, 0d3FE0000000000000; +fma.rn.f64 fd369, fd357, 0d3FEBB67AE8584CAA, fd368; +mul.f64 fd370, fd358, 0dBFE0000000000000; +mul.f64 fd371, fd364, 0d3FEBB67AE8584CAA; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd364, 0dBFE0000000000000; +fma.rn.f64 fd374, fd358, 0d3FEBB67AE8584CAA, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd336, fd352; +sub.f64 fd377, fd334, fd350; +sub.f64 fd378, fd336, fd352; +add.f64 fd379, fd341, fd367; +add.f64 fd380, fd347, fd369; +sub.f64 fd381, fd341, fd367; +sub.f64 fd382, fd347, fd369; +add.f64 fd383, fd342, fd372; +add.f64 fd384, fd348, fd374; +sub.f64 fd385, fd342, fd372; +sub.f64 fd386, fd348, fd374; +add.f64 fd387, fd314, fd318; +add.f64 fd388, fd310, fd387; +add.f64 fd389, fd326, fd330; +add.f64 fd390, fd322, fd389; +mul.f64 fd391, fd387, 0d3FE0000000000000; +sub.f64 fd392, fd310, fd391; +sub.f64 fd393, fd326, fd330; +mul.f64 fd394, fd393, 0dBFEBB67AE8584CAA; +add.f64 fd395, fd394, fd392; +sub.f64 fd396, fd392, fd394; +mul.f64 fd397, fd389, 0d3FE0000000000000; +sub.f64 fd398, fd322, fd397; +sub.f64 fd399, fd314, fd318; +mul.f64 fd400, fd399, 0dBFEBB67AE8584CAA; +sub.f64 fd401, fd398, fd400; +add.f64 fd402, fd400, fd398; +add.f64 fd403, fd316, fd320; +add.f64 fd404, fd312, fd403; +add.f64 fd405, fd328, fd332; +add.f64 fd406, fd324, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd312, fd407; +sub.f64 fd409, fd328, fd332; +mul.f64 fd410, 
fd409, 0dBFEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 0d3FE0000000000000; +sub.f64 fd414, fd324, fd413; +sub.f64 fd415, fd316, fd320; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +mul.f64 fd419, fd411, 0d3FE0000000000000; +mul.f64 fd420, fd417, 0d3FEBB67AE8584CAA; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd417, 0d3FE0000000000000; +fma.rn.f64 fd423, fd411, 0d3FEBB67AE8584CAA, fd422; +mul.f64 fd424, fd412, 0dBFE0000000000000; +mul.f64 fd425, fd418, 0d3FEBB67AE8584CAA; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd418, 0dBFE0000000000000; +fma.rn.f64 fd428, fd412, 0d3FEBB67AE8584CAA, fd427; +add.f64 fd429, fd388, fd404; +add.f64 fd430, fd390, fd406; +sub.f64 fd431, fd388, fd404; +sub.f64 fd432, fd390, fd406; +add.f64 fd433, fd395, fd421; +add.f64 fd434, fd401, fd423; +sub.f64 fd435, fd395, fd421; +sub.f64 fd436, fd401, fd423; +add.f64 fd437, fd396, fd426; +add.f64 fd438, fd402, fd428; +sub.f64 fd439, fd396, fd426; +sub.f64 fd440, fd402, fd428; +mul.f64 fd441, fd433, 0d3FEBB67AE8584CAA; +mul.f64 fd442, fd434, 0d3FE0000000000000; +sub.f64 fd443, fd441, fd442; +mul.f64 fd444, fd434, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd445, fd433, 0d3FE0000000000000, fd444; +mul.f64 fd446, fd437, 0d3FE0000000000000; +mul.f64 fd447, fd438, 0d3FEBB67AE8584CAA; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd438, 0d3FE0000000000000; +fma.rn.f64 fd450, fd437, 0d3FEBB67AE8584CAA, fd449; +mul.f64 fd451, fd435, 0dBFE0000000000000; +mul.f64 fd452, fd436, 0d3FEBB67AE8584CAA; +sub.f64 fd453, fd451, fd452; +mul.f64 fd454, fd436, 0dBFE0000000000000; +fma.rn.f64 fd455, fd435, 0d3FEBB67AE8584CAA, fd454; +mul.f64 fd456, fd439, 0dBFEBB67AE8584CAA; +mul.f64 fd457, fd440, 0d3FE0000000000000; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd440, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd460, fd439, 0d3FE0000000000000, fd459; +add.f64 %0, fd375, fd429; +add.f64 %1, fd376, fd430; +add.f64 %3, fd380, 
fd445; +add.f64 %2, fd379, fd443; +add.f64 %5, fd384, fd450; +add.f64 %4, fd383, fd448; +add.f64 %7, fd378, fd431; +sub.f64 %6, fd377, fd432; +add.f64 %9, fd382, fd455; +add.f64 %8, fd381, fd453; +add.f64 %11, fd386, fd460; +add.f64 %10, fd385, fd458; +sub.f64 %12, fd375, fd429; +sub.f64 %13, fd376, fd430; +sub.f64 %15, fd380, fd445; +sub.f64 %14, fd379, fd443; +sub.f64 %17, fd384, fd450; +sub.f64 %16, fd383, fd448; +sub.f64 %19, fd378, fd431; +add.f64 %18, fd377, fd432; +sub.f64 %21, fd382, fd455; +sub.f64 %20, fd381, fd453; +sub.f64 %23, fd386, fd460; +sub.f64 %22, fd385, fd458; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<743, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<509>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 2304, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %36, %47; +add.f64 fd50, %26, fd49; +add.f64 fd51, %38, %49; +add.f64 fd52, %27, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; 
+sub.f64 fd54, %26, fd53; +sub.f64 fd55, %38, %49; +mul.f64 fd56, fd55, 0dBFEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %27, fd59; +sub.f64 fd61, %36, %47; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %42, %52; +add.f64 fd66, %31, fd65; +add.f64 fd67, %43, %54; +add.f64 fd68, %33, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %31, fd69; +sub.f64 fd71, %43, %54; +mul.f64 fd72, fd71, 0dBFEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %33, fd75; +sub.f64 fd77, %42, %52; +mul.f64 fd78, fd77, 0dBFEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0d3FEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0d3FEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0d3FEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %39, %50; +add.f64 fd104, %28, fd103; +add.f64 fd105, %41, %51; +add.f64 fd106, %30, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %28, fd107; +sub.f64 fd109, %41, %51; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %30, fd113; +sub.f64 fd115, %39, %50; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, 
fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %44, %55; +add.f64 fd120, %34, fd119; +add.f64 fd121, %46, %56; +add.f64 fd122, %35, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %34, fd123; +sub.f64 fd125, %46, %56; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %35, fd129; +sub.f64 fd131, %44, %55; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0d3FEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0d3FEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0d3FEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0d3FE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0d3FE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0d3FEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0d3FEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0d3FEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 
0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0d3FE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0d3FE0000000000000, fd175; +sub.f64 fd177, fd91, fd145; +sub.f64 fd178, fd92, fd146; +add.f64 fd179, fd95, fd159; +add.f64 fd180, fd96, fd161; +sub.f64 fd181, fd95, fd159; +sub.f64 fd182, fd96, fd161; +add.f64 fd183, fd99, fd164; +add.f64 fd184, fd100, fd166; +sub.f64 fd185, fd99, fd164; +sub.f64 fd186, fd100, fd166; +sub.f64 fd187, fd93, fd148; +add.f64 fd188, fd94, fd147; +add.f64 fd189, fd93, fd148; +sub.f64 fd190, fd94, fd147; +add.f64 fd191, fd97, fd169; +add.f64 fd192, fd98, fd171; +sub.f64 fd193, fd97, fd169; +sub.f64 fd194, fd98, fd171; +add.f64 fd195, fd101, fd174; +add.f64 fd196, fd102, fd176; +sub.f64 fd197, fd101, fd174; +sub.f64 fd198, fd102, fd176; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 12; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2304, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd199, fd200}, [rd6]; +mul.f64 fd203, fd180, fd200; +mul.f64 fd204, fd179, fd200; +mul.f64 fd205, fd199, fd180; +mul.f64 fd206, fd199, fd199; +mul.f64 fd207, fd200, fd200; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd200, fd199; +fma.rn.f64 fd210, fd200, fd199, fd209; +mul.f64 fd211, fd184, fd210; +mul.f64 fd212, fd183, fd210; +mul.f64 fd213, fd208, fd184; +mul.f64 fd214, fd199, fd208; +mul.f64 fd215, fd200, fd210; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd199, fd210; +fma.rn.f64 fd218, fd200, fd208, fd217; +mul.f64 fd219, fd188, fd218; +mul.f64 fd220, fd187, fd218; +mul.f64 fd221, fd216, fd188; +mul.f64 fd222, fd199, fd216; +mul.f64 fd223, fd200, fd218; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd199, fd218; +fma.rn.f64 fd226, fd200, fd216, fd225; +mul.f64 fd227, fd192, fd226; +mul.f64 fd228, fd191, fd226; +mul.f64 fd229, fd224, fd192; +mul.f64 fd230, fd199, fd224; +mul.f64 fd231, fd200, fd226; 
+sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd199, fd226; +fma.rn.f64 fd234, fd200, fd224, fd233; +mul.f64 fd235, fd196, fd234; +mul.f64 fd236, fd195, fd234; +mul.f64 fd237, fd232, fd196; +mul.f64 fd238, fd199, fd232; +mul.f64 fd239, fd200, fd234; +sub.f64 fd240, fd238, fd239; +mul.f64 fd241, fd199, fd234; +fma.rn.f64 fd242, fd200, fd232, fd241; +mul.f64 fd243, fd178, fd242; +mul.f64 fd244, fd177, fd242; +mul.f64 fd245, fd240, fd178; +ld.global.v2.f64 {fd246, fd247}, [rd6+192]; +mul.f64 fd250, fd182, fd247; +mul.f64 fd251, fd181, fd247; +mul.f64 fd252, fd246, fd182; +mul.f64 fd253, fd199, fd246; +mul.f64 fd254, fd200, fd247; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd199, fd247; +fma.rn.f64 fd257, fd200, fd246, fd256; +mul.f64 fd258, fd186, fd257; +mul.f64 fd259, fd185, fd257; +mul.f64 fd260, fd255, fd186; +mul.f64 fd261, fd199, fd255; +mul.f64 fd262, fd200, fd257; +sub.f64 fd263, fd261, fd262; +mul.f64 fd264, fd199, fd257; +fma.rn.f64 fd265, fd200, fd255, fd264; +mul.f64 fd266, fd190, fd265; +mul.f64 fd267, fd189, fd265; +mul.f64 fd268, fd263, fd190; +mul.f64 fd269, fd199, fd263; +mul.f64 fd270, fd200, fd265; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd199, fd265; +fma.rn.f64 fd273, fd200, fd263, fd272; +mul.f64 fd274, fd194, fd273; +mul.f64 fd275, fd193, fd273; +mul.f64 fd276, fd271, fd194; +mul.f64 fd277, fd199, fd271; +mul.f64 fd278, fd200, fd273; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd199, fd273; +fma.rn.f64 fd281, fd200, fd271, fd280; +mul.f64 fd282, fd198, fd281; +mul.f64 fd283, fd197, fd281; +mul.f64 fd284, fd279, fd198; +barrier.sync 0; +mad.lo.s32 r9, r7, 192, r8; +add.f64 fd285, fd92, fd146; +add.f64 fd286, fd91, fd145; +st.shared.v2.f64 [r9], {fd286, fd285}; +fma.rn.f64 fd287, fd199, fd179, fd203; +sub.f64 fd288, fd205, fd204; +st.shared.v2.f64 [r9+16], {fd287, fd288}; +fma.rn.f64 fd289, fd208, fd183, fd211; +sub.f64 fd290, fd213, fd212; +st.shared.v2.f64 [r9+32], {fd289, fd290}; +sub.f64 fd291, fd221, fd220; +fma.rn.f64 fd292, 
fd216, fd187, fd219; +st.shared.v2.f64 [r9+48], {fd292, fd291}; +sub.f64 fd293, fd229, fd228; +fma.rn.f64 fd294, fd224, fd191, fd227; +st.shared.v2.f64 [r9+64], {fd294, fd293}; +fma.rn.f64 fd295, fd232, fd195, fd235; +sub.f64 fd296, fd237, fd236; +st.shared.v2.f64 [r9+80], {fd295, fd296}; +fma.rn.f64 fd297, fd240, fd177, fd243; +sub.f64 fd298, fd245, fd244; +st.shared.v2.f64 [r9+96], {fd297, fd298}; +fma.rn.f64 fd299, fd246, fd181, fd250; +sub.f64 fd300, fd252, fd251; +st.shared.v2.f64 [r9+112], {fd299, fd300}; +fma.rn.f64 fd301, fd255, fd185, fd258; +sub.f64 fd302, fd260, fd259; +st.shared.v2.f64 [r9+128], {fd301, fd302}; +sub.f64 fd303, fd268, fd267; +fma.rn.f64 fd304, fd263, fd189, fd266; +st.shared.v2.f64 [r9+144], {fd304, fd303}; +sub.f64 fd305, fd276, fd275; +fma.rn.f64 fd306, fd271, fd193, fd274; +st.shared.v2.f64 [r9+160], {fd306, fd305}; +fma.rn.f64 fd307, fd279, fd197, fd282; +sub.f64 fd308, fd284, fd283; +st.shared.v2.f64 [r9+176], {fd307, fd308}; +barrier.sync 0; +mad.lo.s32 r10, r7, -176, r9; +ld.shared.v2.f64 {fd309, fd310}, [r10]; +ld.shared.v2.f64 {fd313, fd314}, [r10+192]; +ld.shared.v2.f64 {fd317, fd318}, [r10+384]; +ld.shared.v2.f64 {fd321, fd322}, [r10+576]; +ld.shared.v2.f64 {fd325, fd326}, [r10+768]; +ld.shared.v2.f64 {fd329, fd330}, [r10+960]; +ld.shared.v2.f64 {fd333, fd334}, [r10+1152]; +ld.shared.v2.f64 {fd337, fd338}, [r10+1344]; +ld.shared.v2.f64 {fd341, fd342}, [r10+1536]; +ld.shared.v2.f64 {fd345, fd346}, [r10+1728]; +ld.shared.v2.f64 {fd349, fd350}, [r10+1920]; +ld.shared.v2.f64 {fd353, fd354}, [r10+2112]; +add.f64 fd357, fd325, fd341; +add.f64 fd358, fd309, fd357; +add.f64 fd359, fd326, fd342; +add.f64 fd360, fd310, fd359; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, fd309, fd361; +sub.f64 fd363, fd326, fd342; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +mul.f64 fd367, fd359, 0d3FE0000000000000; +sub.f64 fd368, fd310, fd367; +sub.f64 fd369, fd325, fd341; 
+mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, fd333, fd349; +add.f64 fd374, fd317, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd318, fd375; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, fd317, fd377; +sub.f64 fd379, fd334, fd350; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +mul.f64 fd383, fd375, 0d3FE0000000000000; +sub.f64 fd384, fd318, fd383; +sub.f64 fd385, fd333, fd349; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd389, fd381, 0d3FE0000000000000; +mul.f64 fd390, fd387, 0d3FEBB67AE8584CAA; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd387, 0d3FE0000000000000; +fma.rn.f64 fd393, fd381, 0d3FEBB67AE8584CAA, fd392; +mul.f64 fd394, fd382, 0dBFE0000000000000; +mul.f64 fd395, fd388, 0d3FEBB67AE8584CAA; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd388, 0dBFE0000000000000; +fma.rn.f64 fd398, fd382, 0d3FEBB67AE8584CAA, fd397; +add.f64 fd399, fd358, fd374; +add.f64 fd400, fd360, fd376; +sub.f64 fd401, fd358, fd374; +sub.f64 fd402, fd360, fd376; +add.f64 fd403, fd365, fd391; +add.f64 fd404, fd371, fd393; +sub.f64 fd405, fd365, fd391; +sub.f64 fd406, fd371, fd393; +add.f64 fd407, fd366, fd396; +add.f64 fd408, fd372, fd398; +sub.f64 fd409, fd366, fd396; +sub.f64 fd410, fd372, fd398; +add.f64 fd411, fd329, fd345; +add.f64 fd412, fd313, fd411; +add.f64 fd413, fd330, fd346; +add.f64 fd414, fd314, fd413; +mul.f64 fd415, fd411, 0d3FE0000000000000; +sub.f64 fd416, fd313, fd415; +sub.f64 fd417, fd330, fd346; +mul.f64 fd418, fd417, 0dBFEBB67AE8584CAA; +add.f64 fd419, fd418, fd416; +sub.f64 fd420, fd416, fd418; +mul.f64 fd421, fd413, 0d3FE0000000000000; +sub.f64 fd422, fd314, fd421; +sub.f64 fd423, fd329, fd345; +mul.f64 fd424, fd423, 0dBFEBB67AE8584CAA; +sub.f64 fd425, fd422, fd424; +add.f64 fd426, fd424, fd422; +add.f64 fd427, fd337, fd353; +add.f64 fd428, 
fd321, fd427; +add.f64 fd429, fd338, fd354; +add.f64 fd430, fd322, fd429; +mul.f64 fd431, fd427, 0d3FE0000000000000; +sub.f64 fd432, fd321, fd431; +sub.f64 fd433, fd338, fd354; +mul.f64 fd434, fd433, 0dBFEBB67AE8584CAA; +add.f64 fd435, fd434, fd432; +sub.f64 fd436, fd432, fd434; +mul.f64 fd437, fd429, 0d3FE0000000000000; +sub.f64 fd438, fd322, fd437; +sub.f64 fd439, fd337, fd353; +mul.f64 fd440, fd439, 0dBFEBB67AE8584CAA; +sub.f64 fd441, fd438, fd440; +add.f64 fd442, fd440, fd438; +mul.f64 fd443, fd435, 0d3FE0000000000000; +mul.f64 fd444, fd441, 0d3FEBB67AE8584CAA; +sub.f64 fd445, fd443, fd444; +mul.f64 fd446, fd441, 0d3FE0000000000000; +fma.rn.f64 fd447, fd435, 0d3FEBB67AE8584CAA, fd446; +mul.f64 fd448, fd436, 0dBFE0000000000000; +mul.f64 fd449, fd442, 0d3FEBB67AE8584CAA; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd442, 0dBFE0000000000000; +fma.rn.f64 fd452, fd436, 0d3FEBB67AE8584CAA, fd451; +add.f64 fd453, fd412, fd428; +add.f64 fd454, fd414, fd430; +sub.f64 fd455, fd412, fd428; +sub.f64 fd456, fd414, fd430; +add.f64 fd457, fd419, fd445; +add.f64 fd458, fd425, fd447; +sub.f64 fd459, fd419, fd445; +sub.f64 fd460, fd425, fd447; +add.f64 fd461, fd420, fd450; +add.f64 fd462, fd426, fd452; +sub.f64 fd463, fd420, fd450; +sub.f64 fd464, fd426, fd452; +mul.f64 fd465, fd457, 0d3FEBB67AE8584CAA; +mul.f64 fd466, fd458, 0d3FE0000000000000; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd458, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd469, fd457, 0d3FE0000000000000, fd468; +mul.f64 fd470, fd461, 0d3FE0000000000000; +mul.f64 fd471, fd462, 0d3FEBB67AE8584CAA; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd462, 0d3FE0000000000000; +fma.rn.f64 fd474, fd461, 0d3FEBB67AE8584CAA, fd473; +mul.f64 fd475, fd459, 0dBFE0000000000000; +mul.f64 fd476, fd460, 0d3FEBB67AE8584CAA; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd460, 0dBFE0000000000000; +fma.rn.f64 fd479, fd459, 0d3FEBB67AE8584CAA, fd478; +mul.f64 fd480, fd463, 0dBFEBB67AE8584CAA; +mul.f64 fd481, fd464, 0d3FE0000000000000; 
+sub.f64 fd482, fd480, fd481; +mul.f64 fd483, fd464, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd484, fd463, 0d3FE0000000000000, fd483; +add.f64 %1, fd400, fd454; +add.f64 %0, fd399, fd453; +add.f64 %3, fd404, fd469; +add.f64 %2, fd403, fd467; +add.f64 %5, fd408, fd474; +add.f64 %4, fd407, fd472; +add.f64 %7, fd402, fd455; +sub.f64 %6, fd401, fd456; +add.f64 %9, fd406, fd479; +add.f64 %8, fd405, fd477; +add.f64 %11, fd410, fd484; +add.f64 %10, fd409, fd482; +sub.f64 %13, fd400, fd454; +sub.f64 %12, fd399, fd453; +sub.f64 %15, fd404, fd469; +sub.f64 %14, fd403, fd467; +sub.f64 %17, fd408, fd474; +sub.f64 %16, fd407, fd472; +sub.f64 %19, fd402, fd455; +add.f64 %18, fd401, fd456; +sub.f64 %21, fd406, fd479; +sub.f64 %20, fd405, fd477; +sub.f64 %23, fd410, fd484; +sub.f64 %22, fd409, fd482; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..adc009fbfeb36 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_fwd.hpp.inc @@ -0,0 +1,1709 @@ +#ifndef CUFFTDX_FFT_14_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_14_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<747, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<55>; +.reg .b32 r<1549>; +.reg .f64 fd<47>; +.reg .b64 rd<2>; +mov.f64 fd23, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd23; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd30, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd30; +} +mov.b32 r447, {rs2, rs2}; +mov.f64 fd27, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs3, fd27; +} +mov.b32 r654, {rs3, rs3}; +mov.f64 fd28, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs4, fd28; +} +mov.b32 r678, {rs4, rs4}; +mov.f64 fd31, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs5, fd31; +} +mov.b32 r636, {rs5, rs5}; +mov.f64 fd32, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs6, fd32; +} +mov.b32 r663, {rs6, rs6}; +{ +cvt.rn.f16.f64 rs7, fd31; +} +mov.b32 r537, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs8, fd32; +} +{ +neg.f16 rs9, rs8; +} +mov.b32 r561, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs11, fd23; +} +mov.b32 r645, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd30; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r669, {rs13, rs13}; +{ +add.f16x2 r1, %32, %52; +} +{ +add.f16x2 r4, %28, r1; +} +{ +add.f16x2 r7, %36, %48; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %40, %44; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %33, %53; +} +{ +add.f16x2 r22, %29, r19; +} +{ +add.f16x2 r25, %37, %49; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %41, %45; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %32, %52; +} +{ +mul.f16x2 r40, r37, r420; +} +{ 
+add.f16x2 r43, %28, r40; +} +{ +add.f16x2 r46, %36, %48; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %40, %44; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %33, %53; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %37, %49; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %41, %45; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %32, %52; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %28, r94; +} +{ +add.f16x2 r100, %36, %48; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %40, %44; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %33, %53; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %37, %49; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %41, %45; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %32, %52; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %28, r148; +} +{ +add.f16x2 r154, %36, %48; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %40, %44; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %33, %53; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %37, %49; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %41, %45; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %32, %52; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %28, r202; +} +{ +add.f16x2 r208, %36, %48; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %40, %44; +} +{ +mul.f16x2 r220, 
r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %33, %53; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %37, %49; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %41, %45; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %32, %52; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %28, r256; +} +{ +add.f16x2 r262, %36, %48; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %40, %44; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %33, %53; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %37, %49; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %41, %45; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %32, %52; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %28, r310; +} +{ +add.f16x2 r316, %36, %48; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %40, %44; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %33, %53; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %37, %49; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %41, %45; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %33, %53; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %29, r364; +} +{ +add.f16x2 r370, %37, %49; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %41, %45; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %32, %52; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %36, %48; 
+} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %40, %44; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %33, %53; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %29, r418; +} +{ +add.f16x2 r424, %37, %49; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %41, %45; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %32, %52; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %36, %48; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %40, %44; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ +add.f16x2 r469, %33, %53; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %29, r472; +} +{ +add.f16x2 r478, %37, %49; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %41, %45; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %32, %52; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %36, %48; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %40, %44; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %33, %53; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %29, r526; +} +{ +add.f16x2 r532, %37, %49; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %41, %45; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %32, %52; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %36, %48; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %40, %44; +} +{ +mul.f16x2 r568, r565, r669; +} +{ 
+add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %33, %53; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %29, r580; +} +{ +add.f16x2 r586, %37, %49; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %41, %45; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %32, %52; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %36, %48; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %40, %44; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %33, %53; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %29, r634; +} +{ +add.f16x2 r640, %37, %49; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %41, %45; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %32, %52; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %36, %48; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %40, %44; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs15, fd23; +} +mov.b32 r1104, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd30; +} +mov.b32 r1131, {rs16, rs16}; +{ +cvt.rn.f16.f64 rs17, fd27; +} +mov.b32 r1338, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd28; +} +mov.b32 r1362, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd31; +} +mov.b32 r1320, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd32; +} +mov.b32 r1347, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd31; +} +mov.b32 r1221, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd32; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1245, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd23; +} +mov.b32 r1329, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd30; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1353, {rs27, rs27}; +{ 
+add.f16x2 r685, %34, %54; +} +{ +add.f16x2 r688, %30, r685; +} +{ +add.f16x2 r691, %38, %50; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 r697, %42, %46; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %35, %55; +} +{ +add.f16x2 r706, %31, r703; +} +{ +add.f16x2 r709, %39, %51; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %43, %47; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %34, %54; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %30, r724; +} +{ +add.f16x2 r730, %38, %50; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %42, %46; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %35, %55; +} +{ +mul.f16x2 r751, r748, r1131; +} +{ +sub.f16x2 r754, %39, %51; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %43, %47; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %34, %54; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %30, r778; +} +{ +add.f16x2 r784, %38, %50; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %42, %46; +} +{ +mul.f16x2 r796, r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %35, %55; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %39, %51; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %43, %47; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %34, %54; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %30, r832; +} +{ +add.f16x2 r838, %38, %50; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, r835, r841; +} +{ +add.f16x2 r847, %42, %46; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %35, %55; +} +{ 
+mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %39, %51; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %43, %47; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %34, %54; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %30, r886; +} +{ +add.f16x2 r892, %38, %50; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %42, %46; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %35, %55; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %39, %51; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %43, %47; +} +{ +mul.f16x2 r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %34, %54; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %30, r940; +} +{ +add.f16x2 r946, %38, %50; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %42, %46; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %35, %55; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ +sub.f16x2 r970, %39, %51; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %43, %47; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %34, %54; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %30, r994; +} +{ +add.f16x2 r1000, %38, %50; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %42, %46; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} +{ +sub.f16x2 r1018, %35, %55; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %39, %51; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ 
+add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %43, %47; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ +add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %35, %55; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %31, r1048; +} +{ +add.f16x2 r1054, %39, %51; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %43, %47; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %34, %54; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %38, %50; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %42, %46; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, r1090; +} +{ +add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %35, %55; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %31, r1102; +} +{ +add.f16x2 r1108, %39, %51; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %43, %47; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %34, %54; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %38, %50; +} +{ +mul.f16x2 r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %42, %46; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %35, %55; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %31, r1156; +} +{ +add.f16x2 r1162, %39, %51; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %43, %47; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %34, %54; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %38, %50; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, 
r1189; +} +{ +sub.f16x2 r1195, %42, %46; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, r1201; +} +{ +add.f16x2 r1207, %35, %55; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %31, r1210; +} +{ +add.f16x2 r1216, %39, %51; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %43, %47; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %34, %54; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %38, %50; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %42, %46; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 r1258, r1231, r1255; +} +{ +add.f16x2 r1261, %35, %55; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %31, r1264; +} +{ +add.f16x2 r1270, %39, %51; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %43, %47; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %34, %54; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %38, %50; +} +{ +mul.f16x2 r1297, r1294, r1353; +} +{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %42, %46; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %35, %55; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %31, r1318; +} +{ +add.f16x2 r1324, %39, %51; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %43, %47; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %34, %54; +} +{ +mul.f16x2 r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %38, %50; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 
r1357, %42, %46; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +mov.f64 fd21, 0d3FECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs29, fd21; +} +{ +cvt.rn.f16.f64 rs30, fd32; +} +{ +cvt.rn.f16.f64 rs31, fd23; +} +{ +cvt.rn.f16.f64 rs32, fd30; +} +mov.f64 fd25, 0d3FCC7B90E3024582; +{ +cvt.rn.f16.f64 rs33, fd25; +} +{ +cvt.rn.f16.f64 rs34, fd28; +} +{ +cvt.rn.f16.f64 rs35, fd27; +} +{ +cvt.rn.f16.f64 rs36, fd28; +} +mov.f64 fd29, 0dBFE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs37, fd29; +} +{ +cvt.rn.f16.f64 rs38, fd30; +} +{ +cvt.rn.f16.f64 rs39, fd31; +} +{ +cvt.rn.f16.f64 rs40, fd32; +} +mov.b32 r1383, {rs29, rs29}; +{ +mul.f16x2 r1369, r772, r1383; +} +mov.b32 r1380, {rs30, rs30}; +{ +mul.f16x2 r1372, r1096, r1380; +} +{ +sub.f16x2 r1375, r1369, r1372; +} +{ +mul.f16x2 r1378, r772, r1380; +} +{ +fma.rn.f16x2 r1381, r1096, r1383, r1378; +} +mov.b32 r1399, {rs31, rs31}; +{ +mul.f16x2 r1385, r880, r1399; +} +mov.b32 r1396, {rs32, rs32}; +{ +mul.f16x2 r1388, r1204, r1396; +} +{ +sub.f16x2 r1391, r1385, r1388; +} +{ +mul.f16x2 r1394, r880, r1396; +} +{ +fma.rn.f16x2 r1397, r1204, r1399, r1394; +} +mov.b32 r1415, {rs33, rs33}; +{ +mul.f16x2 r1401, r988, r1415; +} +mov.b32 r1412, {rs34, rs34}; +{ +mul.f16x2 r1404, r1312, r1412; +} +{ +sub.f16x2 r1407, r1401, r1404; +} +{ +mul.f16x2 r1410, r988, r1412; +} +{ +fma.rn.f16x2 r1413, r1312, r1415, r1410; +} +mov.b32 r1431, {rs35, rs35}; +{ +mul.f16x2 r1417, r1042, r1431; +} +mov.b32 r1428, {rs36, rs36}; +{ +mul.f16x2 r1420, r1366, r1428; +} +{ +sub.f16x2 r1423, r1417, r1420; +} +{ +mul.f16x2 r1426, r1042, r1428; +} +{ +fma.rn.f16x2 r1429, r1366, r1431, r1426; +} +mov.b32 r1447, {rs37, rs37}; +{ +mul.f16x2 r1433, r934, r1447; +} +mov.b32 r1444, {rs38, rs38}; +{ +mul.f16x2 r1436, r1258, r1444; +} +{ +sub.f16x2 r1439, r1433, r1436; +} +{ +mul.f16x2 r1442, r934, r1444; +} +{ +fma.rn.f16x2 r1445, r1258, r1447, r1442; +} +mov.b32 r1463, {rs39, rs39}; +{ +mul.f16x2 r1449, r826, 
r1463; +} +mov.b32 r1460, {rs40, rs40}; +{ +mul.f16x2 r1452, r1150, r1460; +} +{ +sub.f16x2 r1455, r1449, r1452; +} +{ +mul.f16x2 r1458, r826, r1460; +} +{ +fma.rn.f16x2 r1461, r1150, r1463, r1458; +} +{ +add.f16x2 %0, r16, r700; +} +{ +add.f16x2 %1, r34, r718; +} +{ +sub.f16x2 %14, r16, r700; +} +{ +sub.f16x2 %15, r34, r718; +} +{ +add.f16x2 %2, r88, r1375; +} +{ +add.f16x2 %3, r412, r1381; +} +{ +sub.f16x2 %16, r88, r1375; +} +{ +sub.f16x2 %17, r412, r1381; +} +{ +add.f16x2 %4, r196, r1391; +} +{ +add.f16x2 %5, r520, r1397; +} +{ +sub.f16x2 %18, r196, r1391; +} +{ +sub.f16x2 %19, r520, r1397; +} +{ +add.f16x2 %6, r304, r1407; +} +{ +add.f16x2 %7, r628, r1413; +} +{ +sub.f16x2 %20, r304, r1407; +} +{ +sub.f16x2 %21, r628, r1413; +} +{ +add.f16x2 %8, r358, r1423; +} +{ +add.f16x2 %9, r682, r1429; +} +{ +sub.f16x2 %22, r358, r1423; +} +{ +sub.f16x2 %23, r682, r1429; +} +{ +add.f16x2 %10, r250, r1439; +} +{ +add.f16x2 %11, r574, r1445; +} +{ +sub.f16x2 %24, r250, r1439; +} +{ +sub.f16x2 %25, r574, r1445; +} +{ +add.f16x2 %12, r142, r1455; +} +{ +add.f16x2 %13, r466, r1461; +} +{ +sub.f16x2 %26, r142, r1455; +} +{ +sub.f16x2 %27, r466, r1461; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), 
"=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..b20e3316cbbd4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp16_inv.hpp.inc @@ -0,0 +1,1718 @@ +#ifndef CUFFTDX_FFT_14_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_14_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<949, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<59>; +.reg .b32 r<1549>; +.reg .f64 fd<47>; +.reg .b64 rd<2>; +mov.f64 fd23, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd23; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd20, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd20; +} +{ +neg.f16 rs3, rs2; +} 
+mov.b32 r447, {rs3, rs3}; +mov.f64 fd27, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs5, fd27; +} +mov.b32 r654, {rs5, rs5}; +mov.f64 fd14, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs6, fd14; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r678, {rs7, rs7}; +mov.f64 fd31, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs9, fd31; +} +mov.b32 r636, {rs9, rs9}; +mov.f64 fd18, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs10, fd18; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r663, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs13, fd31; +} +mov.b32 r537, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd18; +} +mov.b32 r561, {rs14, rs14}; +{ +cvt.rn.f16.f64 rs15, fd23; +} +mov.b32 r645, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd20; +} +mov.b32 r669, {rs16, rs16}; +{ +add.f16x2 r1, %32, %52; +} +{ +add.f16x2 r4, %28, r1; +} +{ +add.f16x2 r7, %36, %48; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %40, %44; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %33, %53; +} +{ +add.f16x2 r22, %29, r19; +} +{ +add.f16x2 r25, %37, %49; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %41, %45; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %32, %52; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %28, r40; +} +{ +add.f16x2 r46, %36, %48; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %40, %44; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %33, %53; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %37, %49; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %41, %45; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %32, %52; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %28, r94; +} +{ +add.f16x2 r100, %36, %48; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %40, %44; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, 
%33, %53; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %37, %49; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %41, %45; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %32, %52; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %28, r148; +} +{ +add.f16x2 r154, %36, %48; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %40, %44; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %33, %53; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %37, %49; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %41, %45; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %32, %52; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %28, r202; +} +{ +add.f16x2 r208, %36, %48; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %40, %44; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %33, %53; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %37, %49; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %41, %45; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %32, %52; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %28, r256; +} +{ +add.f16x2 r262, %36, %48; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %40, %44; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %33, %53; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %37, %49; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; 
+} +{ +sub.f16x2 r295, %41, %45; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %32, %52; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %28, r310; +} +{ +add.f16x2 r316, %36, %48; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %40, %44; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %33, %53; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %37, %49; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %41, %45; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %33, %53; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %29, r364; +} +{ +add.f16x2 r370, %37, %49; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %41, %45; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %32, %52; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %36, %48; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %40, %44; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %33, %53; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %29, r418; +} +{ +add.f16x2 r424, %37, %49; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %41, %45; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %32, %52; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %36, %48; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %40, %44; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ 
+add.f16x2 r469, %33, %53; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %29, r472; +} +{ +add.f16x2 r478, %37, %49; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %41, %45; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %32, %52; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %36, %48; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %40, %44; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %33, %53; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %29, r526; +} +{ +add.f16x2 r532, %37, %49; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %41, %45; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %32, %52; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %36, %48; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %40, %44; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %33, %53; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %29, r580; +} +{ +add.f16x2 r586, %37, %49; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %41, %45; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %32, %52; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %36, %48; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %40, %44; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %33, %53; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %29, r634; +} +{ +add.f16x2 r640, %37, %49; +} +{ +mul.f16x2 
r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %41, %45; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %32, %52; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %36, %48; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %40, %44; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs17, fd23; +} +mov.b32 r1104, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd20; +} +{ +neg.f16 rs19, rs18; +} +mov.b32 r1131, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs21, fd27; +} +mov.b32 r1338, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd14; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1362, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd31; +} +mov.b32 r1320, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd18; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1347, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd31; +} +mov.b32 r1221, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd18; +} +mov.b32 r1245, {rs30, rs30}; +{ +cvt.rn.f16.f64 rs31, fd23; +} +mov.b32 r1329, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd20; +} +mov.b32 r1353, {rs32, rs32}; +{ +add.f16x2 r685, %34, %54; +} +{ +add.f16x2 r688, %30, r685; +} +{ +add.f16x2 r691, %38, %50; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 r697, %42, %46; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %35, %55; +} +{ +add.f16x2 r706, %31, r703; +} +{ +add.f16x2 r709, %39, %51; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %43, %47; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %34, %54; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %30, r724; +} +{ +add.f16x2 r730, %38, %50; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %42, %46; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %35, %55; +} +{ +mul.f16x2 r751, r748, r1131; +} +{ +sub.f16x2 r754, %39, 
%51; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %43, %47; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %34, %54; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %30, r778; +} +{ +add.f16x2 r784, %38, %50; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %42, %46; +} +{ +mul.f16x2 r796, r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %35, %55; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %39, %51; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %43, %47; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %34, %54; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %30, r832; +} +{ +add.f16x2 r838, %38, %50; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, r835, r841; +} +{ +add.f16x2 r847, %42, %46; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %35, %55; +} +{ +mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %39, %51; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %43, %47; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %34, %54; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %30, r886; +} +{ +add.f16x2 r892, %38, %50; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %42, %46; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %35, %55; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %39, %51; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %43, %47; +} +{ +mul.f16x2 
r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %34, %54; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %30, r940; +} +{ +add.f16x2 r946, %38, %50; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %42, %46; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %35, %55; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ +sub.f16x2 r970, %39, %51; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %43, %47; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %34, %54; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %30, r994; +} +{ +add.f16x2 r1000, %38, %50; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %42, %46; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} +{ +sub.f16x2 r1018, %35, %55; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %39, %51; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %43, %47; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ +add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %35, %55; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %31, r1048; +} +{ +add.f16x2 r1054, %39, %51; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %43, %47; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %34, %54; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %38, %50; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %42, %46; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, r1090; +} +{ 
+add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %35, %55; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %31, r1102; +} +{ +add.f16x2 r1108, %39, %51; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %43, %47; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %34, %54; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %38, %50; +} +{ +mul.f16x2 r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %42, %46; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %35, %55; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %31, r1156; +} +{ +add.f16x2 r1162, %39, %51; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %43, %47; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %34, %54; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %38, %50; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %42, %46; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, r1201; +} +{ +add.f16x2 r1207, %35, %55; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %31, r1210; +} +{ +add.f16x2 r1216, %39, %51; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %43, %47; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %34, %54; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %38, %50; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %42, %46; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 r1258, r1231, 
r1255; +} +{ +add.f16x2 r1261, %35, %55; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %31, r1264; +} +{ +add.f16x2 r1270, %39, %51; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %43, %47; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %34, %54; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %38, %50; +} +{ +mul.f16x2 r1297, r1294, r1353; +} +{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %42, %46; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %35, %55; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %31, r1318; +} +{ +add.f16x2 r1324, %39, %51; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %43, %47; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %34, %54; +} +{ +mul.f16x2 r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %38, %50; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 r1357, %42, %46; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +mov.f64 fd21, 0d3FECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs33, fd21; +} +mov.f64 fd32, 0d3FDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs34, fd32; +} +{ +cvt.rn.f16.f64 rs35, fd23; +} +mov.f64 fd30, 0d3FE904C37505DE4B; +{ +cvt.rn.f16.f64 rs36, fd30; +} +mov.f64 fd25, 0d3FCC7B90E3024582; +{ +cvt.rn.f16.f64 rs37, fd25; +} +mov.f64 fd28, 0d3FEF329C0558E969; +{ +cvt.rn.f16.f64 rs38, fd28; +} +{ +cvt.rn.f16.f64 rs39, fd27; +} +{ +cvt.rn.f16.f64 rs40, fd28; +} +mov.f64 fd29, 0dBFE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs41, fd29; +} +{ +cvt.rn.f16.f64 rs42, fd30; +} +{ +cvt.rn.f16.f64 rs43, fd31; +} +{ +cvt.rn.f16.f64 rs44, fd32; +} +mov.b32 r1383, {rs33, rs33}; +{ +mul.f16x2 r1369, 
r772, r1383; +} +mov.b32 r1380, {rs34, rs34}; +{ +mul.f16x2 r1372, r1096, r1380; +} +{ +sub.f16x2 r1375, r1369, r1372; +} +{ +mul.f16x2 r1378, r772, r1380; +} +{ +fma.rn.f16x2 r1381, r1096, r1383, r1378; +} +mov.b32 r1399, {rs35, rs35}; +{ +mul.f16x2 r1385, r880, r1399; +} +mov.b32 r1396, {rs36, rs36}; +{ +mul.f16x2 r1388, r1204, r1396; +} +{ +sub.f16x2 r1391, r1385, r1388; +} +{ +mul.f16x2 r1394, r880, r1396; +} +{ +fma.rn.f16x2 r1397, r1204, r1399, r1394; +} +mov.b32 r1415, {rs37, rs37}; +{ +mul.f16x2 r1401, r988, r1415; +} +mov.b32 r1412, {rs38, rs38}; +{ +mul.f16x2 r1404, r1312, r1412; +} +{ +sub.f16x2 r1407, r1401, r1404; +} +{ +mul.f16x2 r1410, r988, r1412; +} +{ +fma.rn.f16x2 r1413, r1312, r1415, r1410; +} +mov.b32 r1431, {rs39, rs39}; +{ +mul.f16x2 r1417, r1042, r1431; +} +mov.b32 r1428, {rs40, rs40}; +{ +mul.f16x2 r1420, r1366, r1428; +} +{ +sub.f16x2 r1423, r1417, r1420; +} +{ +mul.f16x2 r1426, r1042, r1428; +} +{ +fma.rn.f16x2 r1429, r1366, r1431, r1426; +} +mov.b32 r1447, {rs41, rs41}; +{ +mul.f16x2 r1433, r934, r1447; +} +mov.b32 r1444, {rs42, rs42}; +{ +mul.f16x2 r1436, r1258, r1444; +} +{ +sub.f16x2 r1439, r1433, r1436; +} +{ +mul.f16x2 r1442, r934, r1444; +} +{ +fma.rn.f16x2 r1445, r1258, r1447, r1442; +} +mov.b32 r1463, {rs43, rs43}; +{ +mul.f16x2 r1449, r826, r1463; +} +mov.b32 r1460, {rs44, rs44}; +{ +mul.f16x2 r1452, r1150, r1460; +} +{ +sub.f16x2 r1455, r1449, r1452; +} +{ +mul.f16x2 r1458, r826, r1460; +} +{ +fma.rn.f16x2 r1461, r1150, r1463, r1458; +} +{ +add.f16x2 %0, r16, r700; +} +{ +add.f16x2 %1, r34, r718; +} +{ +sub.f16x2 %14, r16, r700; +} +{ +sub.f16x2 %15, r34, r718; +} +{ +add.f16x2 %2, r88, r1375; +} +{ +add.f16x2 %3, r412, r1381; +} +{ +sub.f16x2 %16, r88, r1375; +} +{ +sub.f16x2 %17, r412, r1381; +} +{ +add.f16x2 %4, r196, r1391; +} +{ +add.f16x2 %5, r520, r1397; +} +{ +sub.f16x2 %18, r196, r1391; +} +{ +sub.f16x2 %19, r520, r1397; +} +{ +add.f16x2 %6, r304, r1407; +} +{ +add.f16x2 %7, r628, r1413; +} +{ +sub.f16x2 %20, r304, 
r1407; +} +{ +sub.f16x2 %21, r628, r1413; +} +{ +add.f16x2 %8, r358, r1423; +} +{ +add.f16x2 %9, r682, r1429; +} +{ +sub.f16x2 %22, r358, r1423; +} +{ +sub.f16x2 %23, r682, r1429; +} +{ +add.f16x2 %10, r250, r1439; +} +{ +add.f16x2 %11, r574, r1445; +} +{ +sub.f16x2 %24, r250, r1439; +} +{ +sub.f16x2 %25, r574, r1445; +} +{ +add.f16x2 %12, r142, r1455; +} +{ +add.f16x2 %13, r466, r1461; +} +{ +sub.f16x2 %26, r142, r1455; +} +{ +sub.f16x2 %27, r466, r1461; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), 
"r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..1f7e4a75eb051 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_fwd.hpp.inc @@ -0,0 +1,250 @@ +#ifndef CUFFTDX_FFT_14_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_14_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<291>; +.reg .b64 rd<2>; +add.f32 f57, %33, %60; +add.f32 f58, %28, f57; +add.f32 f59, %38, %54; +add.f32 f60, f59, f58; +add.f32 f61, %44, %49; +add.f32 f62, f61, f60; +add.f32 f63, %35, %61; +add.f32 f64, %29, f63; +add.f32 f65, %40, %56; +add.f32 f66, f65, f64; +add.f32 f67, %45, %51; +add.f32 f68, f67, f66; +fma.rn.f32 f69, f57, 0f3F1F9D07, %28; +mul.f32 f70, f59, 0f3E63DC87; +sub.f32 f71, f69, f70; +mul.f32 f72, f61, 0f3F66A5E5; +sub.f32 f73, f71, f72; +sub.f32 f74, %35, %61; +mul.f32 f75, f74, 0f3F48261C; +sub.f32 f76, %40, %56; +mul.f32 f77, f76, 0fBF7994E0; +sub.f32 f78, f77, f75; +sub.f32 f79, %45, %51; +mul.f32 f80, f79, 0f3EDE2602; +sub.f32 f81, f78, f80; +sub.f32 f82, f73, f81; +add.f32 f83, f81, f73; +mul.f32 f84, f57, 0f3E63DC87; +sub.f32 f85, %28, f84; +mul.f32 f86, f59, 0f3F66A5E5; +sub.f32 f87, f85, f86; +fma.rn.f32 f88, f61, 0f3F1F9D07, f87; +mul.f32 f89, f74, 0f3F7994E0; +mul.f32 f90, f76, 0f3EDE2602; +sub.f32 f91, f90, f89; +fma.rn.f32 f92, f79, 
0f3F48261C, f91; +sub.f32 f93, f88, f92; +add.f32 f94, f92, f88; +mul.f32 f95, f57, 0f3F66A5E5; +sub.f32 f96, %28, f95; +fma.rn.f32 f97, f59, 0f3F1F9D07, f96; +mul.f32 f98, f61, 0f3E63DC87; +sub.f32 f99, f97, f98; +mul.f32 f100, f74, 0f3EDE2602; +mul.f32 f101, f76, 0f3F48261C; +sub.f32 f102, f101, f100; +mul.f32 f103, f79, 0f3F7994E0; +sub.f32 f104, f102, f103; +sub.f32 f105, f99, f104; +add.f32 f106, f104, f99; +fma.rn.f32 f107, f63, 0f3F1F9D07, %29; +mul.f32 f108, f65, 0f3E63DC87; +sub.f32 f109, f107, f108; +mul.f32 f110, f67, 0f3F66A5E5; +sub.f32 f111, f109, f110; +sub.f32 f112, %33, %60; +mul.f32 f113, f112, 0f3F48261C; +sub.f32 f114, %38, %54; +mul.f32 f115, f114, 0fBF7994E0; +sub.f32 f116, f115, f113; +sub.f32 f117, %44, %49; +mul.f32 f118, f117, 0f3EDE2602; +sub.f32 f119, f116, f118; +add.f32 f120, f119, f111; +sub.f32 f121, f111, f119; +mul.f32 f122, f63, 0f3E63DC87; +sub.f32 f123, %29, f122; +mul.f32 f124, f65, 0f3F66A5E5; +sub.f32 f125, f123, f124; +fma.rn.f32 f126, f67, 0f3F1F9D07, f125; +mul.f32 f127, f112, 0f3F7994E0; +mul.f32 f128, f114, 0f3EDE2602; +sub.f32 f129, f128, f127; +fma.rn.f32 f130, f117, 0f3F48261C, f129; +add.f32 f131, f130, f126; +sub.f32 f132, f126, f130; +mul.f32 f133, f63, 0f3F66A5E5; +sub.f32 f134, %29, f133; +fma.rn.f32 f135, f65, 0f3F1F9D07, f134; +mul.f32 f136, f67, 0f3E63DC87; +sub.f32 f137, f135, f136; +mul.f32 f138, f112, 0f3EDE2602; +mul.f32 f139, f114, 0f3F48261C; +sub.f32 f140, f139, f138; +mul.f32 f141, f117, 0f3F7994E0; +sub.f32 f142, f140, f141; +add.f32 f143, f142, f137; +sub.f32 f144, f137, f142; +add.f32 f145, %36, %62; +add.f32 f146, %30, f145; +add.f32 f147, %41, %57; +add.f32 f148, f147, f146; +add.f32 f149, %46, %52; +add.f32 f150, f149, f148; +add.f32 f151, %37, %63; +add.f32 f152, %32, f151; +add.f32 f153, %43, %59; +add.f32 f154, f153, f152; +add.f32 f155, %48, %53; +add.f32 f156, f155, f154; +fma.rn.f32 f157, f145, 0f3F1F9D07, %30; +mul.f32 f158, f147, 0f3E63DC87; +sub.f32 f159, f157, f158; +mul.f32 f160, f149, 
0f3F66A5E5; +sub.f32 f161, f159, f160; +sub.f32 f162, %37, %63; +mul.f32 f163, f162, 0f3F48261C; +sub.f32 f164, %43, %59; +mul.f32 f165, f164, 0fBF7994E0; +sub.f32 f166, f165, f163; +sub.f32 f167, %48, %53; +mul.f32 f168, f167, 0f3EDE2602; +sub.f32 f169, f166, f168; +sub.f32 f170, f161, f169; +add.f32 f171, f169, f161; +mul.f32 f172, f145, 0f3E63DC87; +sub.f32 f173, %30, f172; +mul.f32 f174, f147, 0f3F66A5E5; +sub.f32 f175, f173, f174; +fma.rn.f32 f176, f149, 0f3F1F9D07, f175; +mul.f32 f177, f162, 0f3F7994E0; +mul.f32 f178, f164, 0f3EDE2602; +sub.f32 f179, f178, f177; +fma.rn.f32 f180, f167, 0f3F48261C, f179; +sub.f32 f181, f176, f180; +add.f32 f182, f180, f176; +mul.f32 f183, f145, 0f3F66A5E5; +sub.f32 f184, %30, f183; +fma.rn.f32 f185, f147, 0f3F1F9D07, f184; +mul.f32 f186, f149, 0f3E63DC87; +sub.f32 f187, f185, f186; +mul.f32 f188, f162, 0f3EDE2602; +mul.f32 f189, f164, 0f3F48261C; +sub.f32 f190, f189, f188; +mul.f32 f191, f167, 0f3F7994E0; +sub.f32 f192, f190, f191; +sub.f32 f193, f187, f192; +add.f32 f194, f192, f187; +fma.rn.f32 f195, f151, 0f3F1F9D07, %32; +mul.f32 f196, f153, 0f3E63DC87; +sub.f32 f197, f195, f196; +mul.f32 f198, f155, 0f3F66A5E5; +sub.f32 f199, f197, f198; +sub.f32 f200, %36, %62; +mul.f32 f201, f200, 0f3F48261C; +sub.f32 f202, %41, %57; +mul.f32 f203, f202, 0fBF7994E0; +sub.f32 f204, f203, f201; +sub.f32 f205, %46, %52; +mul.f32 f206, f205, 0f3EDE2602; +sub.f32 f207, f204, f206; +add.f32 f208, f207, f199; +sub.f32 f209, f199, f207; +mul.f32 f210, f151, 0f3E63DC87; +sub.f32 f211, %32, f210; +mul.f32 f212, f153, 0f3F66A5E5; +sub.f32 f213, f211, f212; +fma.rn.f32 f214, f155, 0f3F1F9D07, f213; +mul.f32 f215, f200, 0f3F7994E0; +mul.f32 f216, f202, 0f3EDE2602; +sub.f32 f217, f216, f215; +fma.rn.f32 f218, f205, 0f3F48261C, f217; +add.f32 f219, f218, f214; +sub.f32 f220, f214, f218; +mul.f32 f221, f151, 0f3F66A5E5; +sub.f32 f222, %32, f221; +fma.rn.f32 f223, f153, 0f3F1F9D07, f222; +mul.f32 f224, f155, 0f3E63DC87; +sub.f32 f225, f223, f224; 
+mul.f32 f226, f200, 0f3EDE2602; +mul.f32 f227, f202, 0f3F48261C; +sub.f32 f228, f227, f226; +mul.f32 f229, f205, 0f3F7994E0; +sub.f32 f230, f228, f229; +add.f32 f231, f230, f225; +sub.f32 f232, f225, f230; +mul.f32 f233, f170, 0f3F66A5E5; +mul.f32 f234, f208, 0fBEDE2602; +sub.f32 f235, f233, f234; +mul.f32 f236, f208, 0f3F66A5E5; +fma.rn.f32 f237, f170, 0fBEDE2602, f236; +mul.f32 f238, f181, 0f3F1F9D07; +mul.f32 f239, f219, 0fBF48261C; +sub.f32 f240, f238, f239; +mul.f32 f241, f219, 0f3F1F9D07; +fma.rn.f32 f242, f181, 0fBF48261C, f241; +mul.f32 f243, f193, 0f3E63DC87; +mul.f32 f244, f231, 0fBF7994E0; +sub.f32 f245, f243, f244; +mul.f32 f246, f231, 0f3E63DC87; +fma.rn.f32 f247, f193, 0fBF7994E0, f246; +mul.f32 f248, f194, 0fBE63DC87; +mul.f32 f249, f232, 0fBF7994E0; +sub.f32 f250, f248, f249; +mul.f32 f251, f232, 0fBE63DC87; +fma.rn.f32 f252, f194, 0fBF7994E0, f251; +mul.f32 f253, f182, 0fBF1F9D07; +mul.f32 f254, f220, 0fBF48261C; +sub.f32 f255, f253, f254; +mul.f32 f256, f220, 0fBF1F9D07; +fma.rn.f32 f257, f182, 0fBF48261C, f256; +mul.f32 f258, f171, 0fBF66A5E5; +mul.f32 f259, f209, 0fBEDE2602; +sub.f32 f260, f258, f259; +mul.f32 f261, f209, 0fBF66A5E5; +fma.rn.f32 f262, f171, 0fBEDE2602, f261; +add.f32 %1, f68, f156; +add.f32 %0, f62, f150; +add.f32 %3, f120, f237; +add.f32 %2, f82, f235; +add.f32 %5, f131, f242; +add.f32 %4, f93, f240; +add.f32 %7, f143, f247; +add.f32 %6, f105, f245; +add.f32 %9, f144, f252; +add.f32 %8, f106, f250; +add.f32 %11, f132, f257; +add.f32 %10, f94, f255; +add.f32 %13, f121, f262; +add.f32 %12, f83, f260; +sub.f32 %15, f68, f156; +sub.f32 %14, f62, f150; +sub.f32 %17, f120, f237; +sub.f32 %16, f82, f235; +sub.f32 %19, f131, f242; +sub.f32 %18, f93, f240; +sub.f32 %21, f143, f247; +sub.f32 %20, f105, f245; +sub.f32 %23, f144, f252; +sub.f32 %22, f106, f250; +sub.f32 %25, f132, f257; +sub.f32 %24, f94, f255; +sub.f32 %27, f121, f262; +sub.f32 %26, f83, f260; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), 
"=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..51f950df81b71 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp32_inv.hpp.inc @@ -0,0 +1,242 @@ +#ifndef CUFFTDX_FFT_14_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_14_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<203, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<283>; +.reg .b64 rd<2>; +add.f32 f57, %33, %60; +add.f32 f58, %28, f57; +add.f32 f59, %38, %54; +add.f32 f60, f59, f58; +add.f32 f61, %44, %49; +add.f32 f62, f61, f60; +add.f32 f63, %35, %61; +add.f32 f64, %29, f63; +add.f32 f65, %40, %56; 
+add.f32 f66, f65, f64; +add.f32 f67, %45, %51; +add.f32 f68, f67, f66; +fma.rn.f32 f69, f57, 0f3F1F9D07, %28; +mul.f32 f70, f59, 0f3E63DC87; +sub.f32 f71, f69, f70; +mul.f32 f72, f61, 0f3F66A5E5; +sub.f32 f73, f71, f72; +sub.f32 f74, %35, %61; +mul.f32 f75, f74, 0f3F48261C; +sub.f32 f76, %40, %56; +fma.rn.f32 f77, f76, 0f3F7994E0, f75; +sub.f32 f78, %45, %51; +fma.rn.f32 f79, f78, 0f3EDE2602, f77; +sub.f32 f80, f73, f79; +add.f32 f81, f79, f73; +mul.f32 f82, f57, 0f3E63DC87; +sub.f32 f83, %28, f82; +mul.f32 f84, f59, 0f3F66A5E5; +sub.f32 f85, f83, f84; +fma.rn.f32 f86, f61, 0f3F1F9D07, f85; +mul.f32 f87, f74, 0f3F7994E0; +mul.f32 f88, f76, 0f3EDE2602; +sub.f32 f89, f87, f88; +mul.f32 f90, f78, 0f3F48261C; +sub.f32 f91, f89, f90; +sub.f32 f92, f86, f91; +add.f32 f93, f91, f86; +mul.f32 f94, f57, 0f3F66A5E5; +sub.f32 f95, %28, f94; +fma.rn.f32 f96, f59, 0f3F1F9D07, f95; +mul.f32 f97, f61, 0f3E63DC87; +sub.f32 f98, f96, f97; +mul.f32 f99, f74, 0f3EDE2602; +mul.f32 f100, f76, 0f3F48261C; +sub.f32 f101, f99, f100; +fma.rn.f32 f102, f78, 0f3F7994E0, f101; +sub.f32 f103, f98, f102; +add.f32 f104, f102, f98; +fma.rn.f32 f105, f63, 0f3F1F9D07, %29; +mul.f32 f106, f65, 0f3E63DC87; +sub.f32 f107, f105, f106; +mul.f32 f108, f67, 0f3F66A5E5; +sub.f32 f109, f107, f108; +sub.f32 f110, %33, %60; +mul.f32 f111, f110, 0f3F48261C; +sub.f32 f112, %38, %54; +fma.rn.f32 f113, f112, 0f3F7994E0, f111; +sub.f32 f114, %44, %49; +fma.rn.f32 f115, f114, 0f3EDE2602, f113; +add.f32 f116, f115, f109; +sub.f32 f117, f109, f115; +mul.f32 f118, f63, 0f3E63DC87; +sub.f32 f119, %29, f118; +mul.f32 f120, f65, 0f3F66A5E5; +sub.f32 f121, f119, f120; +fma.rn.f32 f122, f67, 0f3F1F9D07, f121; +mul.f32 f123, f110, 0f3F7994E0; +mul.f32 f124, f112, 0f3EDE2602; +sub.f32 f125, f123, f124; +mul.f32 f126, f114, 0f3F48261C; +sub.f32 f127, f125, f126; +add.f32 f128, f127, f122; +sub.f32 f129, f122, f127; +mul.f32 f130, f63, 0f3F66A5E5; +sub.f32 f131, %29, f130; +fma.rn.f32 f132, f65, 0f3F1F9D07, f131; +mul.f32 
f133, f67, 0f3E63DC87; +sub.f32 f134, f132, f133; +mul.f32 f135, f110, 0f3EDE2602; +mul.f32 f136, f112, 0f3F48261C; +sub.f32 f137, f135, f136; +fma.rn.f32 f138, f114, 0f3F7994E0, f137; +add.f32 f139, f138, f134; +sub.f32 f140, f134, f138; +add.f32 f141, %36, %62; +add.f32 f142, %30, f141; +add.f32 f143, %41, %57; +add.f32 f144, f143, f142; +add.f32 f145, %46, %52; +add.f32 f146, f145, f144; +add.f32 f147, %37, %63; +add.f32 f148, %32, f147; +add.f32 f149, %43, %59; +add.f32 f150, f149, f148; +add.f32 f151, %48, %53; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f141, 0f3F1F9D07, %30; +mul.f32 f154, f143, 0f3E63DC87; +sub.f32 f155, f153, f154; +mul.f32 f156, f145, 0f3F66A5E5; +sub.f32 f157, f155, f156; +sub.f32 f158, %37, %63; +mul.f32 f159, f158, 0f3F48261C; +sub.f32 f160, %43, %59; +fma.rn.f32 f161, f160, 0f3F7994E0, f159; +sub.f32 f162, %48, %53; +fma.rn.f32 f163, f162, 0f3EDE2602, f161; +sub.f32 f164, f157, f163; +add.f32 f165, f163, f157; +mul.f32 f166, f141, 0f3E63DC87; +sub.f32 f167, %30, f166; +mul.f32 f168, f143, 0f3F66A5E5; +sub.f32 f169, f167, f168; +fma.rn.f32 f170, f145, 0f3F1F9D07, f169; +mul.f32 f171, f158, 0f3F7994E0; +mul.f32 f172, f160, 0f3EDE2602; +sub.f32 f173, f171, f172; +mul.f32 f174, f162, 0f3F48261C; +sub.f32 f175, f173, f174; +sub.f32 f176, f170, f175; +add.f32 f177, f175, f170; +mul.f32 f178, f141, 0f3F66A5E5; +sub.f32 f179, %30, f178; +fma.rn.f32 f180, f143, 0f3F1F9D07, f179; +mul.f32 f181, f145, 0f3E63DC87; +sub.f32 f182, f180, f181; +mul.f32 f183, f158, 0f3EDE2602; +mul.f32 f184, f160, 0f3F48261C; +sub.f32 f185, f183, f184; +fma.rn.f32 f186, f162, 0f3F7994E0, f185; +sub.f32 f187, f182, f186; +add.f32 f188, f186, f182; +fma.rn.f32 f189, f147, 0f3F1F9D07, %32; +mul.f32 f190, f149, 0f3E63DC87; +sub.f32 f191, f189, f190; +mul.f32 f192, f151, 0f3F66A5E5; +sub.f32 f193, f191, f192; +sub.f32 f194, %36, %62; +mul.f32 f195, f194, 0f3F48261C; +sub.f32 f196, %41, %57; +fma.rn.f32 f197, f196, 0f3F7994E0, f195; +sub.f32 f198, %46, %52; +fma.rn.f32 
f199, f198, 0f3EDE2602, f197; +add.f32 f200, f199, f193; +sub.f32 f201, f193, f199; +mul.f32 f202, f147, 0f3E63DC87; +sub.f32 f203, %32, f202; +mul.f32 f204, f149, 0f3F66A5E5; +sub.f32 f205, f203, f204; +fma.rn.f32 f206, f151, 0f3F1F9D07, f205; +mul.f32 f207, f194, 0f3F7994E0; +mul.f32 f208, f196, 0f3EDE2602; +sub.f32 f209, f207, f208; +mul.f32 f210, f198, 0f3F48261C; +sub.f32 f211, f209, f210; +add.f32 f212, f211, f206; +sub.f32 f213, f206, f211; +mul.f32 f214, f147, 0f3F66A5E5; +sub.f32 f215, %32, f214; +fma.rn.f32 f216, f149, 0f3F1F9D07, f215; +mul.f32 f217, f151, 0f3E63DC87; +sub.f32 f218, f216, f217; +mul.f32 f219, f194, 0f3EDE2602; +mul.f32 f220, f196, 0f3F48261C; +sub.f32 f221, f219, f220; +fma.rn.f32 f222, f198, 0f3F7994E0, f221; +add.f32 f223, f222, f218; +sub.f32 f224, f218, f222; +mul.f32 f225, f164, 0f3F66A5E5; +mul.f32 f226, f200, 0f3EDE2602; +sub.f32 f227, f225, f226; +mul.f32 f228, f200, 0f3F66A5E5; +fma.rn.f32 f229, f164, 0f3EDE2602, f228; +mul.f32 f230, f176, 0f3F1F9D07; +mul.f32 f231, f212, 0f3F48261C; +sub.f32 f232, f230, f231; +mul.f32 f233, f212, 0f3F1F9D07; +fma.rn.f32 f234, f176, 0f3F48261C, f233; +mul.f32 f235, f187, 0f3E63DC87; +mul.f32 f236, f223, 0f3F7994E0; +sub.f32 f237, f235, f236; +mul.f32 f238, f223, 0f3E63DC87; +fma.rn.f32 f239, f187, 0f3F7994E0, f238; +mul.f32 f240, f188, 0fBE63DC87; +mul.f32 f241, f224, 0f3F7994E0; +sub.f32 f242, f240, f241; +mul.f32 f243, f224, 0fBE63DC87; +fma.rn.f32 f244, f188, 0f3F7994E0, f243; +mul.f32 f245, f177, 0fBF1F9D07; +mul.f32 f246, f213, 0f3F48261C; +sub.f32 f247, f245, f246; +mul.f32 f248, f213, 0fBF1F9D07; +fma.rn.f32 f249, f177, 0f3F48261C, f248; +mul.f32 f250, f165, 0fBF66A5E5; +mul.f32 f251, f201, 0f3EDE2602; +sub.f32 f252, f250, f251; +mul.f32 f253, f201, 0fBF66A5E5; +fma.rn.f32 f254, f165, 0f3EDE2602, f253; +add.f32 %1, f68, f152; +add.f32 %0, f62, f146; +add.f32 %3, f116, f229; +add.f32 %2, f80, f227; +add.f32 %5, f128, f234; +add.f32 %4, f92, f232; +add.f32 %7, f139, f239; +add.f32 %6, f103, 
f237; +add.f32 %9, f140, f244; +add.f32 %8, f104, f242; +add.f32 %11, f129, f249; +add.f32 %10, f93, f247; +add.f32 %13, f117, f254; +add.f32 %12, f81, f252; +sub.f32 %15, f68, f152; +sub.f32 %14, f62, f146; +sub.f32 %17, f116, f229; +sub.f32 %16, f80, f227; +sub.f32 %19, f128, f234; +sub.f32 %18, f92, f232; +sub.f32 %21, f139, f239; +sub.f32 %20, f103, f237; +sub.f32 %23, f140, f244; +sub.f32 %22, f104, f242; +sub.f32 %25, f129, f249; +sub.f32 %24, f93, f247; +sub.f32 %27, f117, f254; +sub.f32 %26, f81, f252; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..5229b9b487eaa --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_fwd.hpp.inc @@ -0,0 +1,250 @@ +#ifndef CUFFTDX_FFT_14_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_14_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<405, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<291>; +.reg .b64 rd<2>; +add.f64 fd57, %33, %60; +add.f64 fd58, %28, fd57; +add.f64 fd59, %38, %54; +add.f64 fd60, fd59, fd58; +add.f64 fd61, %44, %49; +add.f64 fd62, fd61, fd60; +add.f64 fd63, %35, %61; +add.f64 fd64, %29, fd63; +add.f64 fd65, %40, %56; +add.f64 fd66, fd65, fd64; +add.f64 fd67, %45, %51; +add.f64 fd68, fd67, fd66; +fma.rn.f64 fd69, fd57, 0d3FE3F3A0E28BEDD1, %28; +mul.f64 fd70, fd59, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd61, 0d3FECD4BCA9CB5C71; +sub.f64 fd73, fd71, fd72; +sub.f64 fd74, %35, %61; +mul.f64 fd75, fd74, 0d3FE904C37505DE4B; +sub.f64 fd76, %40, %56; +mul.f64 fd77, fd76, 0dBFEF329C0558E969; +sub.f64 fd78, fd77, fd75; +sub.f64 fd79, %45, %51; +mul.f64 fd80, fd79, 0d3FDBC4C04D71ABC1; +sub.f64 fd81, fd78, fd80; +sub.f64 fd82, fd73, fd81; +add.f64 fd83, fd81, fd73; +mul.f64 fd84, fd57, 0d3FCC7B90E3024582; +sub.f64 fd85, %28, fd84; +mul.f64 fd86, fd59, 0d3FECD4BCA9CB5C71; +sub.f64 fd87, fd85, fd86; +fma.rn.f64 fd88, fd61, 0d3FE3F3A0E28BEDD1, fd87; +mul.f64 fd89, fd74, 0d3FEF329C0558E969; +mul.f64 fd90, fd76, 0d3FDBC4C04D71ABC1; +sub.f64 fd91, fd90, fd89; +fma.rn.f64 fd92, fd79, 0d3FE904C37505DE4B, fd91; +sub.f64 fd93, fd88, fd92; +add.f64 fd94, fd92, fd88; +mul.f64 fd95, fd57, 0d3FECD4BCA9CB5C71; +sub.f64 fd96, %28, fd95; +fma.rn.f64 fd97, fd59, 0d3FE3F3A0E28BEDD1, fd96; +mul.f64 fd98, fd61, 0d3FCC7B90E3024582; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd74, 0d3FDBC4C04D71ABC1; +mul.f64 fd101, fd76, 0d3FE904C37505DE4B; +sub.f64 fd102, fd101, fd100; +mul.f64 fd103, fd79, 0d3FEF329C0558E969; +sub.f64 fd104, fd102, fd103; 
+sub.f64 fd105, fd99, fd104; +add.f64 fd106, fd104, fd99; +fma.rn.f64 fd107, fd63, 0d3FE3F3A0E28BEDD1, %29; +mul.f64 fd108, fd65, 0d3FCC7B90E3024582; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd67, 0d3FECD4BCA9CB5C71; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %33, %60; +mul.f64 fd113, fd112, 0d3FE904C37505DE4B; +sub.f64 fd114, %38, %54; +mul.f64 fd115, fd114, 0dBFEF329C0558E969; +sub.f64 fd116, fd115, fd113; +sub.f64 fd117, %44, %49; +mul.f64 fd118, fd117, 0d3FDBC4C04D71ABC1; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd119, fd111; +sub.f64 fd121, fd111, fd119; +mul.f64 fd122, fd63, 0d3FCC7B90E3024582; +sub.f64 fd123, %29, fd122; +mul.f64 fd124, fd65, 0d3FECD4BCA9CB5C71; +sub.f64 fd125, fd123, fd124; +fma.rn.f64 fd126, fd67, 0d3FE3F3A0E28BEDD1, fd125; +mul.f64 fd127, fd112, 0d3FEF329C0558E969; +mul.f64 fd128, fd114, 0d3FDBC4C04D71ABC1; +sub.f64 fd129, fd128, fd127; +fma.rn.f64 fd130, fd117, 0d3FE904C37505DE4B, fd129; +add.f64 fd131, fd130, fd126; +sub.f64 fd132, fd126, fd130; +mul.f64 fd133, fd63, 0d3FECD4BCA9CB5C71; +sub.f64 fd134, %29, fd133; +fma.rn.f64 fd135, fd65, 0d3FE3F3A0E28BEDD1, fd134; +mul.f64 fd136, fd67, 0d3FCC7B90E3024582; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd112, 0d3FDBC4C04D71ABC1; +mul.f64 fd139, fd114, 0d3FE904C37505DE4B; +sub.f64 fd140, fd139, fd138; +mul.f64 fd141, fd117, 0d3FEF329C0558E969; +sub.f64 fd142, fd140, fd141; +add.f64 fd143, fd142, fd137; +sub.f64 fd144, fd137, fd142; +add.f64 fd145, %36, %62; +add.f64 fd146, %30, fd145; +add.f64 fd147, %41, %57; +add.f64 fd148, fd147, fd146; +add.f64 fd149, %46, %52; +add.f64 fd150, fd149, fd148; +add.f64 fd151, %37, %63; +add.f64 fd152, %32, fd151; +add.f64 fd153, %43, %59; +add.f64 fd154, fd153, fd152; +add.f64 fd155, %48, %53; +add.f64 fd156, fd155, fd154; +fma.rn.f64 fd157, fd145, 0d3FE3F3A0E28BEDD1, %30; +mul.f64 fd158, fd147, 0d3FCC7B90E3024582; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd149, 0d3FECD4BCA9CB5C71; +sub.f64 fd161, fd159, fd160; +sub.f64 fd162, %37, %63; 
+mul.f64 fd163, fd162, 0d3FE904C37505DE4B; +sub.f64 fd164, %43, %59; +mul.f64 fd165, fd164, 0dBFEF329C0558E969; +sub.f64 fd166, fd165, fd163; +sub.f64 fd167, %48, %53; +mul.f64 fd168, fd167, 0d3FDBC4C04D71ABC1; +sub.f64 fd169, fd166, fd168; +sub.f64 fd170, fd161, fd169; +add.f64 fd171, fd169, fd161; +mul.f64 fd172, fd145, 0d3FCC7B90E3024582; +sub.f64 fd173, %30, fd172; +mul.f64 fd174, fd147, 0d3FECD4BCA9CB5C71; +sub.f64 fd175, fd173, fd174; +fma.rn.f64 fd176, fd149, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd177, fd162, 0d3FEF329C0558E969; +mul.f64 fd178, fd164, 0d3FDBC4C04D71ABC1; +sub.f64 fd179, fd178, fd177; +fma.rn.f64 fd180, fd167, 0d3FE904C37505DE4B, fd179; +sub.f64 fd181, fd176, fd180; +add.f64 fd182, fd180, fd176; +mul.f64 fd183, fd145, 0d3FECD4BCA9CB5C71; +sub.f64 fd184, %30, fd183; +fma.rn.f64 fd185, fd147, 0d3FE3F3A0E28BEDD1, fd184; +mul.f64 fd186, fd149, 0d3FCC7B90E3024582; +sub.f64 fd187, fd185, fd186; +mul.f64 fd188, fd162, 0d3FDBC4C04D71ABC1; +mul.f64 fd189, fd164, 0d3FE904C37505DE4B; +sub.f64 fd190, fd189, fd188; +mul.f64 fd191, fd167, 0d3FEF329C0558E969; +sub.f64 fd192, fd190, fd191; +sub.f64 fd193, fd187, fd192; +add.f64 fd194, fd192, fd187; +fma.rn.f64 fd195, fd151, 0d3FE3F3A0E28BEDD1, %32; +mul.f64 fd196, fd153, 0d3FCC7B90E3024582; +sub.f64 fd197, fd195, fd196; +mul.f64 fd198, fd155, 0d3FECD4BCA9CB5C71; +sub.f64 fd199, fd197, fd198; +sub.f64 fd200, %36, %62; +mul.f64 fd201, fd200, 0d3FE904C37505DE4B; +sub.f64 fd202, %41, %57; +mul.f64 fd203, fd202, 0dBFEF329C0558E969; +sub.f64 fd204, fd203, fd201; +sub.f64 fd205, %46, %52; +mul.f64 fd206, fd205, 0d3FDBC4C04D71ABC1; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd207, fd199; +sub.f64 fd209, fd199, fd207; +mul.f64 fd210, fd151, 0d3FCC7B90E3024582; +sub.f64 fd211, %32, fd210; +mul.f64 fd212, fd153, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +fma.rn.f64 fd214, fd155, 0d3FE3F3A0E28BEDD1, fd213; +mul.f64 fd215, fd200, 0d3FEF329C0558E969; +mul.f64 fd216, fd202, 0d3FDBC4C04D71ABC1; +sub.f64 fd217, 
fd216, fd215; +fma.rn.f64 fd218, fd205, 0d3FE904C37505DE4B, fd217; +add.f64 fd219, fd218, fd214; +sub.f64 fd220, fd214, fd218; +mul.f64 fd221, fd151, 0d3FECD4BCA9CB5C71; +sub.f64 fd222, %32, fd221; +fma.rn.f64 fd223, fd153, 0d3FE3F3A0E28BEDD1, fd222; +mul.f64 fd224, fd155, 0d3FCC7B90E3024582; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd200, 0d3FDBC4C04D71ABC1; +mul.f64 fd227, fd202, 0d3FE904C37505DE4B; +sub.f64 fd228, fd227, fd226; +mul.f64 fd229, fd205, 0d3FEF329C0558E969; +sub.f64 fd230, fd228, fd229; +add.f64 fd231, fd230, fd225; +sub.f64 fd232, fd225, fd230; +mul.f64 fd233, fd170, 0d3FECD4BCA9CB5C71; +mul.f64 fd234, fd208, 0dBFDBC4C04D71ABC1; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd208, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd237, fd170, 0dBFDBC4C04D71ABC1, fd236; +mul.f64 fd238, fd181, 0d3FE3F3A0E28BEDD1; +mul.f64 fd239, fd219, 0dBFE904C37505DE4B; +sub.f64 fd240, fd238, fd239; +mul.f64 fd241, fd219, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd242, fd181, 0dBFE904C37505DE4B, fd241; +mul.f64 fd243, fd193, 0d3FCC7B90E3024582; +mul.f64 fd244, fd231, 0dBFEF329C0558E969; +sub.f64 fd245, fd243, fd244; +mul.f64 fd246, fd231, 0d3FCC7B90E3024582; +fma.rn.f64 fd247, fd193, 0dBFEF329C0558E969, fd246; +mul.f64 fd248, fd194, 0dBFCC7B90E3024582; +mul.f64 fd249, fd232, 0dBFEF329C0558E969; +sub.f64 fd250, fd248, fd249; +mul.f64 fd251, fd232, 0dBFCC7B90E3024582; +fma.rn.f64 fd252, fd194, 0dBFEF329C0558E969, fd251; +mul.f64 fd253, fd182, 0dBFE3F3A0E28BEDD1; +mul.f64 fd254, fd220, 0dBFE904C37505DE4B; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd220, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd257, fd182, 0dBFE904C37505DE4B, fd256; +mul.f64 fd258, fd171, 0dBFECD4BCA9CB5C71; +mul.f64 fd259, fd209, 0dBFDBC4C04D71ABC1; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd209, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd262, fd171, 0dBFDBC4C04D71ABC1, fd261; +add.f64 %1, fd68, fd156; +add.f64 %0, fd62, fd150; +add.f64 %3, fd120, fd237; +add.f64 %2, fd82, fd235; +add.f64 %5, fd131, fd242; +add.f64 %4, fd93, 
fd240; +add.f64 %7, fd143, fd247; +add.f64 %6, fd105, fd245; +add.f64 %9, fd144, fd252; +add.f64 %8, fd106, fd250; +add.f64 %11, fd132, fd257; +add.f64 %10, fd94, fd255; +add.f64 %13, fd121, fd262; +add.f64 %12, fd83, fd260; +sub.f64 %15, fd68, fd156; +sub.f64 %14, fd62, fd150; +sub.f64 %17, fd120, fd237; +sub.f64 %16, fd82, fd235; +sub.f64 %19, fd131, fd242; +sub.f64 %18, fd93, fd240; +sub.f64 %21, fd143, fd247; +sub.f64 %20, fd105, fd245; +sub.f64 %23, fd144, fd252; +sub.f64 %22, fd106, fd250; +sub.f64 %25, fd132, fd257; +sub.f64 %24, fd94, fd255; +sub.f64 %27, fd121, fd262; +sub.f64 %26, fd83, fd260; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_inv.hpp.inc new file mode 100644 index 
0000000000000..eea681283f3a5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_14_fp64_inv.hpp.inc @@ -0,0 +1,242 @@ +#ifndef CUFFTDX_FFT_14_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_14_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<576, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<283>; +.reg .b64 rd<2>; +add.f64 fd57, %33, %60; +add.f64 fd58, %28, fd57; +add.f64 fd59, %38, %54; +add.f64 fd60, fd59, fd58; +add.f64 fd61, %44, %49; +add.f64 fd62, fd61, fd60; +add.f64 fd63, %35, %61; +add.f64 fd64, %29, fd63; +add.f64 fd65, %40, %56; +add.f64 fd66, fd65, fd64; +add.f64 fd67, %45, %51; +add.f64 fd68, fd67, fd66; +fma.rn.f64 fd69, fd57, 0d3FE3F3A0E28BEDD1, %28; +mul.f64 fd70, fd59, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd61, 0d3FECD4BCA9CB5C71; +sub.f64 fd73, fd71, fd72; +sub.f64 fd74, %35, %61; +mul.f64 fd75, fd74, 0d3FE904C37505DE4B; +sub.f64 fd76, %40, %56; +fma.rn.f64 fd77, fd76, 0d3FEF329C0558E969, fd75; +sub.f64 fd78, %45, %51; +fma.rn.f64 fd79, fd78, 0d3FDBC4C04D71ABC1, fd77; +sub.f64 fd80, fd73, fd79; +add.f64 fd81, fd79, fd73; +mul.f64 fd82, fd57, 0d3FCC7B90E3024582; +sub.f64 fd83, %28, fd82; +mul.f64 fd84, fd59, 0d3FECD4BCA9CB5C71; +sub.f64 fd85, fd83, fd84; +fma.rn.f64 fd86, fd61, 0d3FE3F3A0E28BEDD1, fd85; +mul.f64 fd87, fd74, 0d3FEF329C0558E969; +mul.f64 fd88, fd76, 0d3FDBC4C04D71ABC1; +sub.f64 fd89, fd87, fd88; +mul.f64 fd90, fd78, 0d3FE904C37505DE4B; +sub.f64 fd91, fd89, fd90; +sub.f64 fd92, fd86, fd91; +add.f64 fd93, fd91, fd86; +mul.f64 fd94, fd57, 0d3FECD4BCA9CB5C71; +sub.f64 fd95, %28, fd94; +fma.rn.f64 fd96, fd59, 0d3FE3F3A0E28BEDD1, fd95; +mul.f64 fd97, fd61, 0d3FCC7B90E3024582; +sub.f64 fd98, fd96, fd97; +mul.f64 fd99, fd74, 0d3FDBC4C04D71ABC1; +mul.f64 fd100, fd76, 0d3FE904C37505DE4B; +sub.f64 fd101, fd99, fd100; +fma.rn.f64 fd102, fd78, 0d3FEF329C0558E969, 
fd101; +sub.f64 fd103, fd98, fd102; +add.f64 fd104, fd102, fd98; +fma.rn.f64 fd105, fd63, 0d3FE3F3A0E28BEDD1, %29; +mul.f64 fd106, fd65, 0d3FCC7B90E3024582; +sub.f64 fd107, fd105, fd106; +mul.f64 fd108, fd67, 0d3FECD4BCA9CB5C71; +sub.f64 fd109, fd107, fd108; +sub.f64 fd110, %33, %60; +mul.f64 fd111, fd110, 0d3FE904C37505DE4B; +sub.f64 fd112, %38, %54; +fma.rn.f64 fd113, fd112, 0d3FEF329C0558E969, fd111; +sub.f64 fd114, %44, %49; +fma.rn.f64 fd115, fd114, 0d3FDBC4C04D71ABC1, fd113; +add.f64 fd116, fd115, fd109; +sub.f64 fd117, fd109, fd115; +mul.f64 fd118, fd63, 0d3FCC7B90E3024582; +sub.f64 fd119, %29, fd118; +mul.f64 fd120, fd65, 0d3FECD4BCA9CB5C71; +sub.f64 fd121, fd119, fd120; +fma.rn.f64 fd122, fd67, 0d3FE3F3A0E28BEDD1, fd121; +mul.f64 fd123, fd110, 0d3FEF329C0558E969; +mul.f64 fd124, fd112, 0d3FDBC4C04D71ABC1; +sub.f64 fd125, fd123, fd124; +mul.f64 fd126, fd114, 0d3FE904C37505DE4B; +sub.f64 fd127, fd125, fd126; +add.f64 fd128, fd127, fd122; +sub.f64 fd129, fd122, fd127; +mul.f64 fd130, fd63, 0d3FECD4BCA9CB5C71; +sub.f64 fd131, %29, fd130; +fma.rn.f64 fd132, fd65, 0d3FE3F3A0E28BEDD1, fd131; +mul.f64 fd133, fd67, 0d3FCC7B90E3024582; +sub.f64 fd134, fd132, fd133; +mul.f64 fd135, fd110, 0d3FDBC4C04D71ABC1; +mul.f64 fd136, fd112, 0d3FE904C37505DE4B; +sub.f64 fd137, fd135, fd136; +fma.rn.f64 fd138, fd114, 0d3FEF329C0558E969, fd137; +add.f64 fd139, fd138, fd134; +sub.f64 fd140, fd134, fd138; +add.f64 fd141, %36, %62; +add.f64 fd142, %30, fd141; +add.f64 fd143, %41, %57; +add.f64 fd144, fd143, fd142; +add.f64 fd145, %46, %52; +add.f64 fd146, fd145, fd144; +add.f64 fd147, %37, %63; +add.f64 fd148, %32, fd147; +add.f64 fd149, %43, %59; +add.f64 fd150, fd149, fd148; +add.f64 fd151, %48, %53; +add.f64 fd152, fd151, fd150; +fma.rn.f64 fd153, fd141, 0d3FE3F3A0E28BEDD1, %30; +mul.f64 fd154, fd143, 0d3FCC7B90E3024582; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd145, 0d3FECD4BCA9CB5C71; +sub.f64 fd157, fd155, fd156; +sub.f64 fd158, %37, %63; +mul.f64 fd159, fd158, 
0d3FE904C37505DE4B; +sub.f64 fd160, %43, %59; +fma.rn.f64 fd161, fd160, 0d3FEF329C0558E969, fd159; +sub.f64 fd162, %48, %53; +fma.rn.f64 fd163, fd162, 0d3FDBC4C04D71ABC1, fd161; +sub.f64 fd164, fd157, fd163; +add.f64 fd165, fd163, fd157; +mul.f64 fd166, fd141, 0d3FCC7B90E3024582; +sub.f64 fd167, %30, fd166; +mul.f64 fd168, fd143, 0d3FECD4BCA9CB5C71; +sub.f64 fd169, fd167, fd168; +fma.rn.f64 fd170, fd145, 0d3FE3F3A0E28BEDD1, fd169; +mul.f64 fd171, fd158, 0d3FEF329C0558E969; +mul.f64 fd172, fd160, 0d3FDBC4C04D71ABC1; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd162, 0d3FE904C37505DE4B; +sub.f64 fd175, fd173, fd174; +sub.f64 fd176, fd170, fd175; +add.f64 fd177, fd175, fd170; +mul.f64 fd178, fd141, 0d3FECD4BCA9CB5C71; +sub.f64 fd179, %30, fd178; +fma.rn.f64 fd180, fd143, 0d3FE3F3A0E28BEDD1, fd179; +mul.f64 fd181, fd145, 0d3FCC7B90E3024582; +sub.f64 fd182, fd180, fd181; +mul.f64 fd183, fd158, 0d3FDBC4C04D71ABC1; +mul.f64 fd184, fd160, 0d3FE904C37505DE4B; +sub.f64 fd185, fd183, fd184; +fma.rn.f64 fd186, fd162, 0d3FEF329C0558E969, fd185; +sub.f64 fd187, fd182, fd186; +add.f64 fd188, fd186, fd182; +fma.rn.f64 fd189, fd147, 0d3FE3F3A0E28BEDD1, %32; +mul.f64 fd190, fd149, 0d3FCC7B90E3024582; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, 0d3FECD4BCA9CB5C71; +sub.f64 fd193, fd191, fd192; +sub.f64 fd194, %36, %62; +mul.f64 fd195, fd194, 0d3FE904C37505DE4B; +sub.f64 fd196, %41, %57; +fma.rn.f64 fd197, fd196, 0d3FEF329C0558E969, fd195; +sub.f64 fd198, %46, %52; +fma.rn.f64 fd199, fd198, 0d3FDBC4C04D71ABC1, fd197; +add.f64 fd200, fd199, fd193; +sub.f64 fd201, fd193, fd199; +mul.f64 fd202, fd147, 0d3FCC7B90E3024582; +sub.f64 fd203, %32, fd202; +mul.f64 fd204, fd149, 0d3FECD4BCA9CB5C71; +sub.f64 fd205, fd203, fd204; +fma.rn.f64 fd206, fd151, 0d3FE3F3A0E28BEDD1, fd205; +mul.f64 fd207, fd194, 0d3FEF329C0558E969; +mul.f64 fd208, fd196, 0d3FDBC4C04D71ABC1; +sub.f64 fd209, fd207, fd208; +mul.f64 fd210, fd198, 0d3FE904C37505DE4B; +sub.f64 fd211, fd209, fd210; +add.f64 fd212, 
fd211, fd206; +sub.f64 fd213, fd206, fd211; +mul.f64 fd214, fd147, 0d3FECD4BCA9CB5C71; +sub.f64 fd215, %32, fd214; +fma.rn.f64 fd216, fd149, 0d3FE3F3A0E28BEDD1, fd215; +mul.f64 fd217, fd151, 0d3FCC7B90E3024582; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd194, 0d3FDBC4C04D71ABC1; +mul.f64 fd220, fd196, 0d3FE904C37505DE4B; +sub.f64 fd221, fd219, fd220; +fma.rn.f64 fd222, fd198, 0d3FEF329C0558E969, fd221; +add.f64 fd223, fd222, fd218; +sub.f64 fd224, fd218, fd222; +mul.f64 fd225, fd164, 0d3FECD4BCA9CB5C71; +mul.f64 fd226, fd200, 0d3FDBC4C04D71ABC1; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd200, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd229, fd164, 0d3FDBC4C04D71ABC1, fd228; +mul.f64 fd230, fd176, 0d3FE3F3A0E28BEDD1; +mul.f64 fd231, fd212, 0d3FE904C37505DE4B; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd212, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd234, fd176, 0d3FE904C37505DE4B, fd233; +mul.f64 fd235, fd187, 0d3FCC7B90E3024582; +mul.f64 fd236, fd223, 0d3FEF329C0558E969; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd223, 0d3FCC7B90E3024582; +fma.rn.f64 fd239, fd187, 0d3FEF329C0558E969, fd238; +mul.f64 fd240, fd188, 0dBFCC7B90E3024582; +mul.f64 fd241, fd224, 0d3FEF329C0558E969; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd224, 0dBFCC7B90E3024582; +fma.rn.f64 fd244, fd188, 0d3FEF329C0558E969, fd243; +mul.f64 fd245, fd177, 0dBFE3F3A0E28BEDD1; +mul.f64 fd246, fd213, 0d3FE904C37505DE4B; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd213, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd249, fd177, 0d3FE904C37505DE4B, fd248; +mul.f64 fd250, fd165, 0dBFECD4BCA9CB5C71; +mul.f64 fd251, fd201, 0d3FDBC4C04D71ABC1; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd201, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd254, fd165, 0d3FDBC4C04D71ABC1, fd253; +add.f64 %1, fd68, fd152; +add.f64 %0, fd62, fd146; +add.f64 %3, fd116, fd229; +add.f64 %2, fd80, fd227; +add.f64 %5, fd128, fd234; +add.f64 %4, fd92, fd232; +add.f64 %7, fd139, fd239; +add.f64 %6, fd103, fd237; +add.f64 %9, fd140, fd244; +add.f64 %8, 
fd104, fd242; +add.f64 %11, fd129, fd249; +add.f64 %10, fd93, fd247; +add.f64 %13, fd117, fd254; +add.f64 %12, fd81, fd252; +sub.f64 %15, fd68, fd152; +sub.f64 %14, fd62, fd146; +sub.f64 %17, fd116, fd229; +sub.f64 %16, fd80, fd227; +sub.f64 %19, fd128, fd234; +sub.f64 %18, fd92, fd232; +sub.f64 %21, fd139, fd239; +sub.f64 %20, fd103, fd237; +sub.f64 %23, fd140, fd244; +sub.f64 %22, fd104, fd242; +sub.f64 %25, fd129, fd249; +sub.f64 %24, fd93, fd247; +sub.f64 %27, fd117, fd254; +sub.f64 %26, fd81, fd252; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..94ba078e6ca41 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_fwd.hpp.inc @@ -0,0 +1,25718 @@ +#ifndef CUFFTDX_FFT_15625_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_15625_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1179, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.local .align 8 .b8 __local_depot0[200]; +.reg .b64 SP; +.reg .b64 SPL; +.reg .pred p<3>; +.reg .f32 f<695>; +.reg .b32 r<11354>; +.reg .b64 rd<21>; +mov.u64 SPL, __local_depot0; +add.u64 rd3, SPL, 0; +mov.u32 r3531, %tid.y; +mul.lo.s32 r1, r3531, 15625; +add.s64 rd4, rd3, 4; +mov.f32 f214, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r23, {low, high}; +} +mov.f32 f216, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r24, {low, high}; +} +mov.f32 f210, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r25, {low, high}; +} +mov.f32 f212, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r26, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r27, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r28, {low, high}; +} +{ +neg.f16x2 r29, r28; +} +{ +add.f16x2 r31, %61, %91; +} +{ +add.f16x2 r34, %51, r31; +} +{ +add.f16x2 r37, %71, %81; +} +{ +add.f16x2 r40, r34, r37; +} +{ +add.f16x2 r43, %62, %92; +} +{ +add.f16x2 r46, %52, r43; +} +{ +add.f16x2 r49, %72, %82; +} +{ +add.f16x2 r52, r46, r49; +} +{ +add.f16x2 r55, %61, %91; +} +{ +mul.f16x2 r58, r55, r23; +} +{ +add.f16x2 r61, %51, r58; +} +{ +add.f16x2 r64, %71, %81; +} +{ +mul.f16x2 r67, r64, r25; +} +{ +add.f16x2 r70, r61, r67; +} +{ +sub.f16x2 r73, %62, %92; +} +{ +mul.f16x2 r76, r73, r24; +} +{ 
+sub.f16x2 r79, %72, %82; +} +{ +mul.f16x2 r82, r79, r26; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r70, r85; +} +{ +add.f16x2 r91, %61, %91; +} +{ +mul.f16x2 r94, r91, r23; +} +{ +add.f16x2 r97, %51, r94; +} +{ +add.f16x2 r100, %71, %81; +} +{ +mul.f16x2 r103, r100, r25; +} +{ +add.f16x2 r106, r97, r103; +} +{ +sub.f16x2 r109, %62, %92; +} +{ +mul.f16x2 r112, r109, r24; +} +{ +sub.f16x2 r115, %72, %82; +} +{ +mul.f16x2 r118, r115, r26; +} +{ +add.f16x2 r121, r112, r118; +} +{ +add.f16x2 r124, r106, r121; +} +{ +add.f16x2 r127, %61, %91; +} +{ +mul.f16x2 r130, r127, r25; +} +{ +add.f16x2 r133, %51, r130; +} +{ +add.f16x2 r136, %71, %81; +} +{ +mul.f16x2 r139, r136, r27; +} +{ +add.f16x2 r142, r133, r139; +} +{ +sub.f16x2 r145, %62, %92; +} +{ +mul.f16x2 r148, r145, r26; +} +{ +sub.f16x2 r151, %72, %82; +} +{ +mul.f16x2 r154, r151, r29; +} +{ +add.f16x2 r157, r148, r154; +} +{ +sub.f16x2 r160, r142, r157; +} +{ +add.f16x2 r163, %61, %91; +} +{ +mul.f16x2 r166, r163, r25; +} +{ +add.f16x2 r169, %51, r166; +} +{ +add.f16x2 r172, %71, %81; +} +{ +mul.f16x2 r175, r172, r27; +} +{ +add.f16x2 r178, r169, r175; +} +{ +sub.f16x2 r181, %62, %92; +} +{ +mul.f16x2 r184, r181, r26; +} +{ +sub.f16x2 r187, %72, %82; +} +{ +mul.f16x2 r190, r187, r29; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r178, r193; +} +{ +add.f16x2 r199, %62, %92; +} +{ +mul.f16x2 r202, r199, r23; +} +{ +add.f16x2 r205, %52, r202; +} +{ +add.f16x2 r208, %72, %82; +} +{ +mul.f16x2 r211, r208, r25; +} +{ +add.f16x2 r214, r205, r211; +} +{ +sub.f16x2 r217, %61, %91; +} +{ +mul.f16x2 r220, r217, r24; +} +{ +sub.f16x2 r223, %71, %81; +} +{ +mul.f16x2 r226, r223, r26; +} +{ +add.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r214, r229; +} +{ +add.f16x2 r235, %62, %92; +} +{ +mul.f16x2 r238, r235, r23; +} +{ +add.f16x2 r241, %52, r238; +} +{ +add.f16x2 r244, %72, %82; +} +{ +mul.f16x2 r247, r244, r25; +} +{ +add.f16x2 r250, r241, r247; +} +{ +sub.f16x2 r253, %61, %91; +} +{ +mul.f16x2 
r256, r253, r24; +} +{ +sub.f16x2 r259, %71, %81; +} +{ +mul.f16x2 r262, r259, r26; +} +{ +add.f16x2 r265, r256, r262; +} +{ +sub.f16x2 r268, r250, r265; +} +{ +add.f16x2 r271, %62, %92; +} +{ +mul.f16x2 r274, r271, r25; +} +{ +add.f16x2 r277, %52, r274; +} +{ +add.f16x2 r280, %72, %82; +} +{ +mul.f16x2 r283, r280, r27; +} +{ +add.f16x2 r286, r277, r283; +} +{ +sub.f16x2 r289, %61, %91; +} +{ +mul.f16x2 r292, r289, r26; +} +{ +sub.f16x2 r295, %71, %81; +} +{ +mul.f16x2 r298, r295, r29; +} +{ +add.f16x2 r301, r292, r298; +} +{ +add.f16x2 r304, r286, r301; +} +{ +add.f16x2 r307, %62, %92; +} +{ +mul.f16x2 r310, r307, r25; +} +{ +add.f16x2 r313, %52, r310; +} +{ +add.f16x2 r316, %72, %82; +} +{ +mul.f16x2 r319, r316, r27; +} +{ +add.f16x2 r322, r313, r319; +} +{ +sub.f16x2 r325, %61, %91; +} +{ +mul.f16x2 r328, r325, r26; +} +{ +sub.f16x2 r331, %71, %81; +} +{ +mul.f16x2 r334, r331, r29; +} +{ +add.f16x2 r337, r328, r334; +} +{ +sub.f16x2 r340, r322, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r344, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r347, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r348, {low, high}; +} +{ +neg.f16x2 r349, r348; +} +{ +add.f16x2 r351, %63, %93; +} +{ +add.f16x2 r354, %53, r351; +} +{ +add.f16x2 r357, %73, %83; +} +{ +add.f16x2 r360, r354, r357; +} +{ +add.f16x2 r363, %64, %94; +} +{ +add.f16x2 r366, %54, r363; +} +{ +add.f16x2 r369, %74, %84; +} +{ +add.f16x2 r372, r366, r369; +} +{ +add.f16x2 r375, %63, %93; +} +{ +mul.f16x2 r378, r375, 
r343; +} +{ +add.f16x2 r381, %53, r378; +} +{ +add.f16x2 r384, %73, %83; +} +{ +mul.f16x2 r387, r384, r345; +} +{ +add.f16x2 r390, r381, r387; +} +{ +sub.f16x2 r393, %64, %94; +} +{ +mul.f16x2 r396, r393, r344; +} +{ +sub.f16x2 r399, %74, %84; +} +{ +mul.f16x2 r402, r399, r346; +} +{ +add.f16x2 r405, r396, r402; +} +{ +sub.f16x2 r408, r390, r405; +} +{ +add.f16x2 r411, %63, %93; +} +{ +mul.f16x2 r414, r411, r343; +} +{ +add.f16x2 r417, %53, r414; +} +{ +add.f16x2 r420, %73, %83; +} +{ +mul.f16x2 r423, r420, r345; +} +{ +add.f16x2 r426, r417, r423; +} +{ +sub.f16x2 r429, %64, %94; +} +{ +mul.f16x2 r432, r429, r344; +} +{ +sub.f16x2 r435, %74, %84; +} +{ +mul.f16x2 r438, r435, r346; +} +{ +add.f16x2 r441, r432, r438; +} +{ +add.f16x2 r444, r426, r441; +} +{ +add.f16x2 r447, %63, %93; +} +{ +mul.f16x2 r450, r447, r345; +} +{ +add.f16x2 r453, %53, r450; +} +{ +add.f16x2 r456, %73, %83; +} +{ +mul.f16x2 r459, r456, r347; +} +{ +add.f16x2 r462, r453, r459; +} +{ +sub.f16x2 r465, %64, %94; +} +{ +mul.f16x2 r468, r465, r346; +} +{ +sub.f16x2 r471, %74, %84; +} +{ +mul.f16x2 r474, r471, r349; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r462, r477; +} +{ +add.f16x2 r483, %63, %93; +} +{ +mul.f16x2 r486, r483, r345; +} +{ +add.f16x2 r489, %53, r486; +} +{ +add.f16x2 r492, %73, %83; +} +{ +mul.f16x2 r495, r492, r347; +} +{ +add.f16x2 r498, r489, r495; +} +{ +sub.f16x2 r501, %64, %94; +} +{ +mul.f16x2 r504, r501, r346; +} +{ +sub.f16x2 r507, %74, %84; +} +{ +mul.f16x2 r510, r507, r349; +} +{ +add.f16x2 r513, r504, r510; +} +{ +add.f16x2 r516, r498, r513; +} +{ +add.f16x2 r519, %64, %94; +} +{ +mul.f16x2 r522, r519, r343; +} +{ +add.f16x2 r525, %54, r522; +} +{ +add.f16x2 r528, %74, %84; +} +{ +mul.f16x2 r531, r528, r345; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, %63, %93; +} +{ +mul.f16x2 r540, r537, r344; +} +{ +sub.f16x2 r543, %73, %83; +} +{ +mul.f16x2 r546, r543, r346; +} +{ +add.f16x2 r549, r540, r546; +} +{ +add.f16x2 r552, r534, r549; +} +{ 
+add.f16x2 r555, %64, %94; +} +{ +mul.f16x2 r558, r555, r343; +} +{ +add.f16x2 r561, %54, r558; +} +{ +add.f16x2 r564, %74, %84; +} +{ +mul.f16x2 r567, r564, r345; +} +{ +add.f16x2 r570, r561, r567; +} +{ +sub.f16x2 r573, %63, %93; +} +{ +mul.f16x2 r576, r573, r344; +} +{ +sub.f16x2 r579, %73, %83; +} +{ +mul.f16x2 r582, r579, r346; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r570, r585; +} +{ +add.f16x2 r591, %64, %94; +} +{ +mul.f16x2 r594, r591, r345; +} +{ +add.f16x2 r597, %54, r594; +} +{ +add.f16x2 r600, %74, %84; +} +{ +mul.f16x2 r603, r600, r347; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, %63, %93; +} +{ +mul.f16x2 r612, r609, r346; +} +{ +sub.f16x2 r615, %73, %83; +} +{ +mul.f16x2 r618, r615, r349; +} +{ +add.f16x2 r621, r612, r618; +} +{ +add.f16x2 r624, r606, r621; +} +{ +add.f16x2 r627, %64, %94; +} +{ +mul.f16x2 r630, r627, r345; +} +{ +add.f16x2 r633, %54, r630; +} +{ +add.f16x2 r636, %74, %84; +} +{ +mul.f16x2 r639, r636, r347; +} +{ +add.f16x2 r642, r633, r639; +} +{ +sub.f16x2 r645, %63, %93; +} +{ +mul.f16x2 r648, r645, r346; +} +{ +sub.f16x2 r651, %73, %83; +} +{ +mul.f16x2 r654, r651, r349; +} +{ +add.f16x2 r657, r648, r654; +} +{ +sub.f16x2 r660, r642, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r664, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r665, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r666, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r668, {low, high}; +} +{ +neg.f16x2 r669, r668; +} +{ +add.f16x2 r671, %65, %95; +} +{ +add.f16x2 r674, %55, r671; +} +{ 
+add.f16x2 r677, %75, %85; +} +{ +add.f16x2 r680, r674, r677; +} +{ +add.f16x2 r683, %66, %96; +} +{ +add.f16x2 r686, %56, r683; +} +{ +add.f16x2 r689, %76, %86; +} +{ +add.f16x2 r692, r686, r689; +} +{ +add.f16x2 r695, %65, %95; +} +{ +mul.f16x2 r698, r695, r663; +} +{ +add.f16x2 r701, %55, r698; +} +{ +add.f16x2 r704, %75, %85; +} +{ +mul.f16x2 r707, r704, r665; +} +{ +add.f16x2 r710, r701, r707; +} +{ +sub.f16x2 r713, %66, %96; +} +{ +mul.f16x2 r716, r713, r664; +} +{ +sub.f16x2 r719, %76, %86; +} +{ +mul.f16x2 r722, r719, r666; +} +{ +add.f16x2 r725, r716, r722; +} +{ +sub.f16x2 r728, r710, r725; +} +{ +add.f16x2 r731, %65, %95; +} +{ +mul.f16x2 r734, r731, r663; +} +{ +add.f16x2 r737, %55, r734; +} +{ +add.f16x2 r740, %75, %85; +} +{ +mul.f16x2 r743, r740, r665; +} +{ +add.f16x2 r746, r737, r743; +} +{ +sub.f16x2 r749, %66, %96; +} +{ +mul.f16x2 r752, r749, r664; +} +{ +sub.f16x2 r755, %76, %86; +} +{ +mul.f16x2 r758, r755, r666; +} +{ +add.f16x2 r761, r752, r758; +} +{ +add.f16x2 r764, r746, r761; +} +{ +add.f16x2 r767, %65, %95; +} +{ +mul.f16x2 r770, r767, r665; +} +{ +add.f16x2 r773, %55, r770; +} +{ +add.f16x2 r776, %75, %85; +} +{ +mul.f16x2 r779, r776, r667; +} +{ +add.f16x2 r782, r773, r779; +} +{ +sub.f16x2 r785, %66, %96; +} +{ +mul.f16x2 r788, r785, r666; +} +{ +sub.f16x2 r791, %76, %86; +} +{ +mul.f16x2 r794, r791, r669; +} +{ +add.f16x2 r797, r788, r794; +} +{ +sub.f16x2 r800, r782, r797; +} +{ +add.f16x2 r803, %65, %95; +} +{ +mul.f16x2 r806, r803, r665; +} +{ +add.f16x2 r809, %55, r806; +} +{ +add.f16x2 r812, %75, %85; +} +{ +mul.f16x2 r815, r812, r667; +} +{ +add.f16x2 r818, r809, r815; +} +{ +sub.f16x2 r821, %66, %96; +} +{ +mul.f16x2 r824, r821, r666; +} +{ +sub.f16x2 r827, %76, %86; +} +{ +mul.f16x2 r830, r827, r669; +} +{ +add.f16x2 r833, r824, r830; +} +{ +add.f16x2 r836, r818, r833; +} +{ +add.f16x2 r839, %66, %96; +} +{ +mul.f16x2 r842, r839, r663; +} +{ +add.f16x2 r845, %56, r842; +} +{ +add.f16x2 r848, %76, %86; +} +{ +mul.f16x2 r851, 
r848, r665; +} +{ +add.f16x2 r854, r845, r851; +} +{ +sub.f16x2 r857, %65, %95; +} +{ +mul.f16x2 r860, r857, r664; +} +{ +sub.f16x2 r863, %75, %85; +} +{ +mul.f16x2 r866, r863, r666; +} +{ +add.f16x2 r869, r860, r866; +} +{ +add.f16x2 r872, r854, r869; +} +{ +add.f16x2 r875, %66, %96; +} +{ +mul.f16x2 r878, r875, r663; +} +{ +add.f16x2 r881, %56, r878; +} +{ +add.f16x2 r884, %76, %86; +} +{ +mul.f16x2 r887, r884, r665; +} +{ +add.f16x2 r890, r881, r887; +} +{ +sub.f16x2 r893, %65, %95; +} +{ +mul.f16x2 r896, r893, r664; +} +{ +sub.f16x2 r899, %75, %85; +} +{ +mul.f16x2 r902, r899, r666; +} +{ +add.f16x2 r905, r896, r902; +} +{ +sub.f16x2 r908, r890, r905; +} +{ +add.f16x2 r911, %66, %96; +} +{ +mul.f16x2 r914, r911, r665; +} +{ +add.f16x2 r917, %56, r914; +} +{ +add.f16x2 r920, %76, %86; +} +{ +mul.f16x2 r923, r920, r667; +} +{ +add.f16x2 r926, r917, r923; +} +{ +sub.f16x2 r929, %65, %95; +} +{ +mul.f16x2 r932, r929, r666; +} +{ +sub.f16x2 r935, %75, %85; +} +{ +mul.f16x2 r938, r935, r669; +} +{ +add.f16x2 r941, r932, r938; +} +{ +add.f16x2 r944, r926, r941; +} +{ +add.f16x2 r947, %66, %96; +} +{ +mul.f16x2 r950, r947, r665; +} +{ +add.f16x2 r953, %56, r950; +} +{ +add.f16x2 r956, %76, %86; +} +{ +mul.f16x2 r959, r956, r667; +} +{ +add.f16x2 r962, r953, r959; +} +{ +sub.f16x2 r965, %65, %95; +} +{ +mul.f16x2 r968, r965, r666; +} +{ +sub.f16x2 r971, %75, %85; +} +{ +mul.f16x2 r974, r971, r669; +} +{ +add.f16x2 r977, r968, r974; +} +{ +sub.f16x2 r980, r962, r977; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r983, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r984, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r985, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r986, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r987, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r988, {low, high}; +} +{ +neg.f16x2 r989, r988; +} +{ +add.f16x2 r991, %67, %97; +} +{ +add.f16x2 r994, %57, r991; +} +{ +add.f16x2 r997, %77, %87; +} +{ +add.f16x2 r1000, r994, r997; +} +{ +add.f16x2 r1003, %68, %98; +} +{ +add.f16x2 r1006, %58, r1003; +} +{ +add.f16x2 r1009, %78, %88; +} +{ +add.f16x2 r1012, r1006, r1009; +} +{ +add.f16x2 r1015, %67, %97; +} +{ +mul.f16x2 r1018, r1015, r983; +} +{ +add.f16x2 r1021, %57, r1018; +} +{ +add.f16x2 r1024, %77, %87; +} +{ +mul.f16x2 r1027, r1024, r985; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %68, %98; +} +{ +mul.f16x2 r1036, r1033, r984; +} +{ +sub.f16x2 r1039, %78, %88; +} +{ +mul.f16x2 r1042, r1039, r986; +} +{ +add.f16x2 r1045, r1036, r1042; +} +{ +sub.f16x2 r1048, r1030, r1045; +} +{ +add.f16x2 r1051, %67, %97; +} +{ +mul.f16x2 r1054, r1051, r983; +} +{ +add.f16x2 r1057, %57, r1054; +} +{ +add.f16x2 r1060, %77, %87; +} +{ +mul.f16x2 r1063, r1060, r985; +} +{ +add.f16x2 r1066, r1057, r1063; +} +{ +sub.f16x2 r1069, %68, %98; +} +{ +mul.f16x2 r1072, r1069, r984; +} +{ +sub.f16x2 r1075, %78, %88; +} +{ +mul.f16x2 r1078, r1075, r986; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +add.f16x2 r1084, r1066, r1081; +} +{ +add.f16x2 r1087, %67, %97; +} +{ +mul.f16x2 r1090, r1087, r985; +} +{ +add.f16x2 r1093, %57, r1090; +} +{ +add.f16x2 r1096, %77, %87; +} +{ +mul.f16x2 r1099, r1096, r987; +} +{ +add.f16x2 r1102, r1093, r1099; +} +{ +sub.f16x2 r1105, %68, %98; +} +{ +mul.f16x2 r1108, r1105, r986; +} +{ +sub.f16x2 r1111, %78, %88; +} +{ +mul.f16x2 r1114, r1111, r989; +} +{ +add.f16x2 r1117, r1108, r1114; +} +{ +sub.f16x2 r1120, r1102, r1117; +} +{ +add.f16x2 r1123, %67, %97; +} +{ +mul.f16x2 r1126, r1123, r985; +} +{ +add.f16x2 r1129, %57, r1126; +} +{ +add.f16x2 r1132, %77, %87; +} +{ +mul.f16x2 r1135, r1132, r987; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ 
+sub.f16x2 r1141, %68, %98; +} +{ +mul.f16x2 r1144, r1141, r986; +} +{ +sub.f16x2 r1147, %78, %88; +} +{ +mul.f16x2 r1150, r1147, r989; +} +{ +add.f16x2 r1153, r1144, r1150; +} +{ +add.f16x2 r1156, r1138, r1153; +} +{ +add.f16x2 r1159, %68, %98; +} +{ +mul.f16x2 r1162, r1159, r983; +} +{ +add.f16x2 r1165, %58, r1162; +} +{ +add.f16x2 r1168, %78, %88; +} +{ +mul.f16x2 r1171, r1168, r985; +} +{ +add.f16x2 r1174, r1165, r1171; +} +{ +sub.f16x2 r1177, %67, %97; +} +{ +mul.f16x2 r1180, r1177, r984; +} +{ +sub.f16x2 r1183, %77, %87; +} +{ +mul.f16x2 r1186, r1183, r986; +} +{ +add.f16x2 r1189, r1180, r1186; +} +{ +add.f16x2 r1192, r1174, r1189; +} +{ +add.f16x2 r1195, %68, %98; +} +{ +mul.f16x2 r1198, r1195, r983; +} +{ +add.f16x2 r1201, %58, r1198; +} +{ +add.f16x2 r1204, %78, %88; +} +{ +mul.f16x2 r1207, r1204, r985; +} +{ +add.f16x2 r1210, r1201, r1207; +} +{ +sub.f16x2 r1213, %67, %97; +} +{ +mul.f16x2 r1216, r1213, r984; +} +{ +sub.f16x2 r1219, %77, %87; +} +{ +mul.f16x2 r1222, r1219, r986; +} +{ +add.f16x2 r1225, r1216, r1222; +} +{ +sub.f16x2 r1228, r1210, r1225; +} +{ +add.f16x2 r1231, %68, %98; +} +{ +mul.f16x2 r1234, r1231, r985; +} +{ +add.f16x2 r1237, %58, r1234; +} +{ +add.f16x2 r1240, %78, %88; +} +{ +mul.f16x2 r1243, r1240, r987; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %67, %97; +} +{ +mul.f16x2 r1252, r1249, r986; +} +{ +sub.f16x2 r1255, %77, %87; +} +{ +mul.f16x2 r1258, r1255, r989; +} +{ +add.f16x2 r1261, r1252, r1258; +} +{ +add.f16x2 r1264, r1246, r1261; +} +{ +add.f16x2 r1267, %68, %98; +} +{ +mul.f16x2 r1270, r1267, r985; +} +{ +add.f16x2 r1273, %58, r1270; +} +{ +add.f16x2 r1276, %78, %88; +} +{ +mul.f16x2 r1279, r1276, r987; +} +{ +add.f16x2 r1282, r1273, r1279; +} +{ +sub.f16x2 r1285, %67, %97; +} +{ +mul.f16x2 r1288, r1285, r986; +} +{ +sub.f16x2 r1291, %77, %87; +} +{ +mul.f16x2 r1294, r1291, r989; +} +{ +add.f16x2 r1297, r1288, r1294; +} +{ +sub.f16x2 r1300, r1282, r1297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1306, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1307, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1308, {low, high}; +} +{ +neg.f16x2 r1309, r1308; +} +{ +add.f16x2 r1311, %69, %99; +} +{ +add.f16x2 r1314, %59, r1311; +} +{ +add.f16x2 r1317, %79, %89; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %70, %100; +} +{ +add.f16x2 r1326, %60, r1323; +} +{ +add.f16x2 r1329, %80, %90; +} +{ +add.f16x2 r1332, r1326, r1329; +} +{ +add.f16x2 r1335, %69, %99; +} +{ +mul.f16x2 r1338, r1335, r1303; +} +{ +add.f16x2 r1341, %59, r1338; +} +{ +add.f16x2 r1344, %79, %89; +} +{ +mul.f16x2 r1347, r1344, r1305; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +sub.f16x2 r1353, %70, %100; +} +{ +mul.f16x2 r1356, r1353, r1304; +} +{ +sub.f16x2 r1359, %80, %90; +} +{ +mul.f16x2 r1362, r1359, r1306; +} +{ +add.f16x2 r1365, r1356, r1362; +} +{ +sub.f16x2 r1368, r1350, r1365; +} +{ +add.f16x2 r1371, %69, %99; +} +{ +mul.f16x2 r1374, r1371, r1303; +} +{ +add.f16x2 r1377, %59, r1374; +} +{ +add.f16x2 r1380, %79, %89; +} +{ +mul.f16x2 r1383, r1380, r1305; +} +{ +add.f16x2 r1386, r1377, r1383; +} +{ +sub.f16x2 r1389, %70, %100; +} +{ +mul.f16x2 r1392, r1389, r1304; +} +{ +sub.f16x2 r1395, %80, %90; +} +{ +mul.f16x2 r1398, r1395, r1306; +} +{ +add.f16x2 r1401, r1392, r1398; +} +{ +add.f16x2 r1404, r1386, r1401; +} +{ +add.f16x2 r1407, %69, %99; +} +{ +mul.f16x2 r1410, r1407, r1305; +} +{ +add.f16x2 r1413, %59, r1410; +} +{ +add.f16x2 r1416, %79, %89; +} +{ +mul.f16x2 r1419, r1416, r1307; +} 
+{ +add.f16x2 r1422, r1413, r1419; +} +{ +sub.f16x2 r1425, %70, %100; +} +{ +mul.f16x2 r1428, r1425, r1306; +} +{ +sub.f16x2 r1431, %80, %90; +} +{ +mul.f16x2 r1434, r1431, r1309; +} +{ +add.f16x2 r1437, r1428, r1434; +} +{ +sub.f16x2 r1440, r1422, r1437; +} +{ +add.f16x2 r1443, %69, %99; +} +{ +mul.f16x2 r1446, r1443, r1305; +} +{ +add.f16x2 r1449, %59, r1446; +} +{ +add.f16x2 r1452, %79, %89; +} +{ +mul.f16x2 r1455, r1452, r1307; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +sub.f16x2 r1461, %70, %100; +} +{ +mul.f16x2 r1464, r1461, r1306; +} +{ +sub.f16x2 r1467, %80, %90; +} +{ +mul.f16x2 r1470, r1467, r1309; +} +{ +add.f16x2 r1473, r1464, r1470; +} +{ +add.f16x2 r1476, r1458, r1473; +} +{ +add.f16x2 r1479, %70, %100; +} +{ +mul.f16x2 r1482, r1479, r1303; +} +{ +add.f16x2 r1485, %60, r1482; +} +{ +add.f16x2 r1488, %80, %90; +} +{ +mul.f16x2 r1491, r1488, r1305; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, %69, %99; +} +{ +mul.f16x2 r1500, r1497, r1304; +} +{ +sub.f16x2 r1503, %79, %89; +} +{ +mul.f16x2 r1506, r1503, r1306; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +add.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, %70, %100; +} +{ +mul.f16x2 r1518, r1515, r1303; +} +{ +add.f16x2 r1521, %60, r1518; +} +{ +add.f16x2 r1524, %80, %90; +} +{ +mul.f16x2 r1527, r1524, r1305; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, %69, %99; +} +{ +mul.f16x2 r1536, r1533, r1304; +} +{ +sub.f16x2 r1539, %79, %89; +} +{ +mul.f16x2 r1542, r1539, r1306; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +sub.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, %70, %100; +} +{ +mul.f16x2 r1554, r1551, r1305; +} +{ +add.f16x2 r1557, %60, r1554; +} +{ +add.f16x2 r1560, %80, %90; +} +{ +mul.f16x2 r1563, r1560, r1307; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, %69, %99; +} +{ +mul.f16x2 r1572, r1569, r1306; +} +{ +sub.f16x2 r1575, %79, %89; +} +{ +mul.f16x2 r1578, r1575, r1309; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +add.f16x2 r1584, 
r1566, r1581; +} +{ +add.f16x2 r1587, %70, %100; +} +{ +mul.f16x2 r1590, r1587, r1305; +} +{ +add.f16x2 r1593, %60, r1590; +} +{ +add.f16x2 r1596, %80, %90; +} +{ +mul.f16x2 r1599, r1596, r1307; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, %69, %99; +} +{ +mul.f16x2 r1608, r1605, r1306; +} +{ +sub.f16x2 r1611, %79, %89; +} +{ +mul.f16x2 r1614, r1611, r1309; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +sub.f16x2 r1620, r1602, r1617; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1623, {low, high}; +} +mov.f32 f64, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1624, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1625, {low, high}; +} +mov.f32 f68, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1626, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1627, {low, high}; +} +mov.f32 f72, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1628, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1629, {low, high}; +} +mov.f32 f76, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1630, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1633, {low, high}; +} +mov.f32 f84, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1634, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1637, {low, high}; +} +mov.f32 f92, 0fBF67A2BF; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1638, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1639, {low, high}; +} +mov.f32 f96, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1640, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1645, {low, high}; +} +mov.f32 f108, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1653, {low, high}; +} +mov.f32 f124, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1671, r408, r1623; +} +{ +mul.f16x2 r1674, r552, r1624; +} +{ +sub.f16x2 r1677, r1671, r1674; +} +{ +mul.f16x2 r1680, r408, r1624; +} +{ +fma.rn.f16x2 r1683, r552, r1623, r1680; +} +{ +mul.f16x2 r1687, r728, r1625; +} +{ +mul.f16x2 r1690, r872, r1626; +} +{ +sub.f16x2 r1693, r1687, r1690; +} +{ +mul.f16x2 r1696, r728, r1626; +} +{ +fma.rn.f16x2 r1699, r872, r1625, r1696; +} +{ +mul.f16x2 r1703, r1048, r1627; +} +{ +mul.f16x2 r1706, r1192, r1628; +} +{ +sub.f16x2 r1709, r1703, r1706; +} +{ +mul.f16x2 r1712, r1048, r1628; +} +{ +fma.rn.f16x2 r1715, r1192, r1627, r1712; +} +{ +mul.f16x2 r1719, r1368, r1629; +} +{ +mul.f16x2 r1722, r1512, r1630; +} +{ +sub.f16x2 r1725, r1719, r1722; +} +{ +mul.f16x2 r1728, r1368, r1630; +} +{ +fma.rn.f16x2 r1731, r1512, r1629, r1728; +} +{ +mul.f16x2 r1735, r480, r1625; +} +{ +mul.f16x2 r1738, r624, r1626; +} +{ +sub.f16x2 r1741, r1735, r1738; +} +{ +mul.f16x2 r1744, r480, r1626; +} +{ +fma.rn.f16x2 r1747, r624, r1625, r1744; +} +{ +mul.f16x2 r1751, r800, r1629; +} +{ +mul.f16x2 r1754, r944, r1630; +} +{ +sub.f16x2 r1757, r1751, r1754; +} +{ 
+mul.f16x2 r1760, r800, r1630; +} +{ +fma.rn.f16x2 r1763, r944, r1629, r1760; +} +{ +mul.f16x2 r1767, r1120, r1633; +} +{ +mul.f16x2 r1770, r1264, r1634; +} +{ +sub.f16x2 r1773, r1767, r1770; +} +{ +mul.f16x2 r1776, r1120, r1634; +} +{ +fma.rn.f16x2 r1779, r1264, r1633, r1776; +} +{ +mul.f16x2 r1783, r1440, r1637; +} +{ +mul.f16x2 r1786, r1584, r1638; +} +{ +sub.f16x2 r1789, r1783, r1786; +} +{ +mul.f16x2 r1792, r1440, r1638; +} +{ +fma.rn.f16x2 r1795, r1584, r1637, r1792; +} +{ +mul.f16x2 r1799, r516, r1627; +} +{ +mul.f16x2 r1802, r660, r1628; +} +{ +sub.f16x2 r1805, r1799, r1802; +} +{ +mul.f16x2 r1808, r516, r1628; +} +{ +fma.rn.f16x2 r1811, r660, r1627, r1808; +} +{ +mul.f16x2 r1815, r836, r1633; +} +{ +mul.f16x2 r1818, r980, r1634; +} +{ +sub.f16x2 r1821, r1815, r1818; +} +{ +mul.f16x2 r1824, r836, r1634; +} +{ +fma.rn.f16x2 r1827, r980, r1633, r1824; +} +{ +mul.f16x2 r1831, r1156, r1639; +} +{ +mul.f16x2 r1834, r1300, r1640; +} +{ +sub.f16x2 r1837, r1831, r1834; +} +{ +mul.f16x2 r1840, r1156, r1640; +} +{ +fma.rn.f16x2 r1843, r1300, r1639, r1840; +} +{ +mul.f16x2 r1847, r1476, r1645; +} +{ +mul.f16x2 r1850, r1620, r1646; +} +{ +sub.f16x2 r1853, r1847, r1850; +} +{ +mul.f16x2 r1856, r1476, r1646; +} +{ +fma.rn.f16x2 r1859, r1620, r1645, r1856; +} +{ +mul.f16x2 r1863, r444, r1629; +} +{ +mul.f16x2 r1866, r588, r1630; +} +{ +sub.f16x2 r1869, r1863, r1866; +} +{ +mul.f16x2 r1872, r444, r1630; +} +{ +fma.rn.f16x2 r1875, r588, r1629, r1872; +} +{ +mul.f16x2 r1879, r764, r1637; +} +{ +mul.f16x2 r1882, r908, r1638; +} +{ +sub.f16x2 r1885, r1879, r1882; +} +{ +mul.f16x2 r1888, r764, r1638; +} +{ +fma.rn.f16x2 r1891, r908, r1637, r1888; +} +{ +mul.f16x2 r1895, r1084, r1645; +} +{ +mul.f16x2 r1898, r1228, r1646; +} +{ +sub.f16x2 r1901, r1895, r1898; +} +{ +mul.f16x2 r1904, r1084, r1646; +} +{ +fma.rn.f16x2 r1907, r1228, r1645, r1904; +} +{ +mul.f16x2 r1911, r1404, r1653; +} +{ +mul.f16x2 r1914, r1548, r1654; +} +{ +sub.f16x2 r1917, r1911, r1914; +} +{ +mul.f16x2 r1920, 
r1404, r1654; +} +{ +fma.rn.f16x2 r1923, r1548, r1653, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1932, {low, high}; +} +{ +neg.f16x2 r1933, r1932; +} +{ +add.f16x2 r1935, r360, r1320; +} +{ +add.f16x2 r1938, r40, r1935; +} +{ +add.f16x2 r1941, r680, r1000; +} +{ +add.f16x2 r1944, r1938, r1941; +} +st.local.u32 [rd3], r1944; +{ +add.f16x2 r1947, r372, r1332; +} +{ +add.f16x2 r1950, r52, r1947; +} +{ +add.f16x2 r1953, r692, r1012; +} +{ +add.f16x2 r1956, r1950, r1953; +} +st.local.u32 [rd3+4], r1956; +{ +add.f16x2 r1959, r360, r1320; +} +{ +mul.f16x2 r1962, r1959, r1927; +} +{ +add.f16x2 r1965, r40, r1962; +} +{ +add.f16x2 r1968, r680, r1000; +} +{ +mul.f16x2 r1971, r1968, r1929; +} +{ +add.f16x2 r1974, r1965, r1971; +} +{ +sub.f16x2 r1977, r372, r1332; +} +{ +mul.f16x2 r1980, r1977, r1928; +} +{ +sub.f16x2 r1983, r692, r1012; +} +{ +mul.f16x2 r1986, r1983, r1930; +} +{ +add.f16x2 r1989, r1980, r1986; +} +{ +sub.f16x2 r1992, r1974, r1989; +} +st.local.u32 [rd3+40], r1992; +{ +add.f16x2 r1995, r360, r1320; +} +{ +mul.f16x2 r1998, r1995, r1927; +} +{ +add.f16x2 r2001, r40, r1998; +} +{ +add.f16x2 r2004, r680, r1000; +} +{ +mul.f16x2 r2007, r2004, r1929; +} +{ +add.f16x2 r2010, r2001, r2007; +} +{ +sub.f16x2 r2013, r372, r1332; +} +{ +mul.f16x2 r2016, r2013, r1928; +} +{ +sub.f16x2 r2019, r692, r1012; +} +{ +mul.f16x2 r2022, r2019, r1930; +} +{ +add.f16x2 r2025, 
r2016, r2022; +} +{ +add.f16x2 r2028, r2010, r2025; +} +st.local.u32 [rd3+160], r2028; +{ +add.f16x2 r2031, r360, r1320; +} +{ +mul.f16x2 r2034, r2031, r1929; +} +{ +add.f16x2 r2037, r40, r2034; +} +{ +add.f16x2 r2040, r680, r1000; +} +{ +mul.f16x2 r2043, r2040, r1931; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +sub.f16x2 r2049, r372, r1332; +} +{ +mul.f16x2 r2052, r2049, r1930; +} +{ +sub.f16x2 r2055, r692, r1012; +} +{ +mul.f16x2 r2058, r2055, r1933; +} +{ +add.f16x2 r2061, r2052, r2058; +} +{ +sub.f16x2 r2064, r2046, r2061; +} +st.local.u32 [rd3+80], r2064; +{ +add.f16x2 r2067, r360, r1320; +} +{ +mul.f16x2 r2070, r2067, r1929; +} +{ +add.f16x2 r2073, r40, r2070; +} +{ +add.f16x2 r2076, r680, r1000; +} +{ +mul.f16x2 r2079, r2076, r1931; +} +{ +add.f16x2 r2082, r2073, r2079; +} +{ +sub.f16x2 r2085, r372, r1332; +} +{ +mul.f16x2 r2088, r2085, r1930; +} +{ +sub.f16x2 r2091, r692, r1012; +} +{ +mul.f16x2 r2094, r2091, r1933; +} +{ +add.f16x2 r2097, r2088, r2094; +} +{ +add.f16x2 r2100, r2082, r2097; +} +st.local.u32 [rd3+120], r2100; +{ +add.f16x2 r2103, r372, r1332; +} +{ +mul.f16x2 r2106, r2103, r1927; +} +{ +add.f16x2 r2109, r52, r2106; +} +{ +add.f16x2 r2112, r692, r1012; +} +{ +mul.f16x2 r2115, r2112, r1929; +} +{ +add.f16x2 r2118, r2109, r2115; +} +{ +sub.f16x2 r2121, r360, r1320; +} +{ +mul.f16x2 r2124, r2121, r1928; +} +{ +sub.f16x2 r2127, r680, r1000; +} +{ +mul.f16x2 r2130, r2127, r1930; +} +{ +add.f16x2 r2133, r2124, r2130; +} +{ +add.f16x2 r2136, r2118, r2133; +} +st.local.u32 [rd3+44], r2136; +{ +add.f16x2 r2139, r372, r1332; +} +{ +mul.f16x2 r2142, r2139, r1927; +} +{ +add.f16x2 r2145, r52, r2142; +} +{ +add.f16x2 r2148, r692, r1012; +} +{ +mul.f16x2 r2151, r2148, r1929; +} +{ +add.f16x2 r2154, r2145, r2151; +} +{ +sub.f16x2 r2157, r360, r1320; +} +{ +mul.f16x2 r2160, r2157, r1928; +} +{ +sub.f16x2 r2163, r680, r1000; +} +{ +mul.f16x2 r2166, r2163, r1930; +} +{ +add.f16x2 r2169, r2160, r2166; +} +{ +sub.f16x2 r2172, r2154, r2169; +} +st.local.u32 
[rd3+164], r2172; +{ +add.f16x2 r2175, r372, r1332; +} +{ +mul.f16x2 r2178, r2175, r1929; +} +{ +add.f16x2 r2181, r52, r2178; +} +{ +add.f16x2 r2184, r692, r1012; +} +{ +mul.f16x2 r2187, r2184, r1931; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +sub.f16x2 r2193, r360, r1320; +} +{ +mul.f16x2 r2196, r2193, r1930; +} +{ +sub.f16x2 r2199, r680, r1000; +} +{ +mul.f16x2 r2202, r2199, r1933; +} +{ +add.f16x2 r2205, r2196, r2202; +} +{ +add.f16x2 r2208, r2190, r2205; +} +st.local.u32 [rd3+84], r2208; +{ +add.f16x2 r2211, r372, r1332; +} +{ +mul.f16x2 r2214, r2211, r1929; +} +{ +add.f16x2 r2217, r52, r2214; +} +{ +add.f16x2 r2220, r692, r1012; +} +{ +mul.f16x2 r2223, r2220, r1931; +} +{ +add.f16x2 r2226, r2217, r2223; +} +{ +sub.f16x2 r2229, r360, r1320; +} +{ +mul.f16x2 r2232, r2229, r1930; +} +{ +sub.f16x2 r2235, r680, r1000; +} +{ +mul.f16x2 r2238, r2235, r1933; +} +{ +add.f16x2 r2241, r2232, r2238; +} +{ +sub.f16x2 r2244, r2226, r2241; +} +st.local.u32 [rd3+124], r2244; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2248, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2252, {low, high}; +} +{ +neg.f16x2 r2253, r2252; +} +{ +add.f16x2 r2255, r1677, r1725; +} +{ +add.f16x2 r2258, r88, r2255; +} +{ +add.f16x2 r2261, r1693, r1709; +} +{ +add.f16x2 r11348, r2258, r2261; +} +st.local.u32 [rd3+8], r11348; +{ +add.f16x2 r2267, r1683, r1731; +} +{ +add.f16x2 r2270, r232, r2267; +} +{ +add.f16x2 r2273, r1699, r1715; +} +{ +add.f16x2 r2276, 
r2270, r2273; +} +st.local.u32 [rd3+12], r2276; +{ +add.f16x2 r2279, r1677, r1725; +} +{ +mul.f16x2 r2282, r2279, r2247; +} +{ +add.f16x2 r2285, r88, r2282; +} +{ +add.f16x2 r2288, r1693, r1709; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r1683, r1731; +} +{ +mul.f16x2 r2300, r2297, r2248; +} +{ +sub.f16x2 r2303, r1699, r1715; +} +{ +mul.f16x2 r2306, r2303, r2250; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +st.local.u32 [rd3+48], r2312; +{ +add.f16x2 r2315, r1677, r1725; +} +{ +mul.f16x2 r2318, r2315, r2247; +} +{ +add.f16x2 r2321, r88, r2318; +} +{ +add.f16x2 r2324, r1693, r1709; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r1683, r1731; +} +{ +mul.f16x2 r2336, r2333, r2248; +} +{ +sub.f16x2 r2339, r1699, r1715; +} +{ +mul.f16x2 r2342, r2339, r2250; +} +{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +st.local.u32 [rd3+168], r2348; +{ +add.f16x2 r2351, r1677, r1725; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r88, r2354; +} +{ +add.f16x2 r2360, r1693, r1709; +} +{ +mul.f16x2 r2363, r2360, r2251; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r1683, r1731; +} +{ +mul.f16x2 r2372, r2369, r2250; +} +{ +sub.f16x2 r2375, r1699, r1715; +} +{ +mul.f16x2 r2378, r2375, r2253; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +st.local.u32 [rd3+88], r2384; +{ +add.f16x2 r2387, r1677, r1725; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r88, r2390; +} +{ +add.f16x2 r2396, r1693, r1709; +} +{ +mul.f16x2 r2399, r2396, r2251; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r1683, r1731; +} +{ +mul.f16x2 r2408, r2405, r2250; +} +{ +sub.f16x2 r2411, r1699, r1715; +} +{ +mul.f16x2 r2414, r2411, r2253; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +st.local.u32 [rd3+128], r2420; +{ 
+add.f16x2 r2423, r1683, r1731; +} +{ +mul.f16x2 r2426, r2423, r2247; +} +{ +add.f16x2 r2429, r232, r2426; +} +{ +add.f16x2 r2432, r1699, r1715; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 r2441, r1677, r1725; +} +{ +mul.f16x2 r2444, r2441, r2248; +} +{ +sub.f16x2 r2447, r1693, r1709; +} +{ +mul.f16x2 r2450, r2447, r2250; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +st.local.u32 [rd3+52], r2456; +{ +add.f16x2 r2459, r1683, r1731; +} +{ +mul.f16x2 r2462, r2459, r2247; +} +{ +add.f16x2 r2465, r232, r2462; +} +{ +add.f16x2 r2468, r1699, r1715; +} +{ +mul.f16x2 r2471, r2468, r2249; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r1677, r1725; +} +{ +mul.f16x2 r2480, r2477, r2248; +} +{ +sub.f16x2 r2483, r1693, r1709; +} +{ +mul.f16x2 r2486, r2483, r2250; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +st.local.u32 [rd3+172], r2492; +{ +add.f16x2 r2495, r1683, r1731; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, r232, r2498; +} +{ +add.f16x2 r2504, r1699, r1715; +} +{ +mul.f16x2 r2507, r2504, r2251; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r1677, r1725; +} +{ +mul.f16x2 r2516, r2513, r2250; +} +{ +sub.f16x2 r2519, r1693, r1709; +} +{ +mul.f16x2 r2522, r2519, r2253; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +st.local.u32 [rd3+92], r2528; +{ +add.f16x2 r2531, r1683, r1731; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r232, r2534; +} +{ +add.f16x2 r2540, r1699, r1715; +} +{ +mul.f16x2 r2543, r2540, r2251; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r1677, r1725; +} +{ +mul.f16x2 r2552, r2549, r2250; +} +{ +sub.f16x2 r2555, r1693, r1709; +} +{ +mul.f16x2 r2558, r2555, r2253; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +st.local.u32 [rd3+132], r2564; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +add.f16x2 r2575, r1741, r1789; +} +{ +add.f16x2 r2578, r160, r2575; +} +{ +add.f16x2 r2581, r1757, r1773; +} +{ +add.f16x2 r2584, r2578, r2581; +} +st.local.u32 [rd3+16], r2584; +{ +add.f16x2 r2587, r1747, r1795; +} +{ +add.f16x2 r2590, r304, r2587; +} +{ +add.f16x2 r2593, r1763, r1779; +} +{ +add.f16x2 r2596, r2590, r2593; +} +st.local.u32 [rd3+20], r2596; +{ +add.f16x2 r2599, r1741, r1789; +} +{ +mul.f16x2 r2602, r2599, r2567; +} +{ +add.f16x2 r2605, r160, r2602; +} +{ +add.f16x2 r2608, r1757, r1773; +} +{ +mul.f16x2 r2611, r2608, r2569; +} +{ +add.f16x2 r2614, r2605, r2611; +} +{ +sub.f16x2 r2617, r1747, r1795; +} +{ +mul.f16x2 r2620, r2617, r2568; +} +{ +sub.f16x2 r2623, r1763, r1779; +} +{ +mul.f16x2 r2626, r2623, r2570; +} +{ +add.f16x2 r2629, r2620, r2626; +} +{ +sub.f16x2 r2632, r2614, r2629; +} +st.local.u32 [rd3+56], r2632; +{ +add.f16x2 r2635, r1741, r1789; +} +{ +mul.f16x2 r2638, r2635, r2567; +} +{ +add.f16x2 r2641, r160, r2638; +} +{ +add.f16x2 r2644, r1757, r1773; +} +{ +mul.f16x2 r2647, r2644, r2569; +} +{ +add.f16x2 r2650, r2641, r2647; +} +{ +sub.f16x2 r2653, r1747, r1795; +} +{ +mul.f16x2 r2656, r2653, r2568; +} +{ +sub.f16x2 r2659, r1763, r1779; +} +{ +mul.f16x2 r2662, r2659, r2570; +} +{ +add.f16x2 r2665, r2656, r2662; +} +{ +add.f16x2 r2668, r2650, r2665; +} +st.local.u32 [rd3+176], r2668; +{ 
+add.f16x2 r2671, r1741, r1789; +} +{ +mul.f16x2 r2674, r2671, r2569; +} +{ +add.f16x2 r2677, r160, r2674; +} +{ +add.f16x2 r2680, r1757, r1773; +} +{ +mul.f16x2 r2683, r2680, r2571; +} +{ +add.f16x2 r2686, r2677, r2683; +} +{ +sub.f16x2 r2689, r1747, r1795; +} +{ +mul.f16x2 r2692, r2689, r2570; +} +{ +sub.f16x2 r2695, r1763, r1779; +} +{ +mul.f16x2 r2698, r2695, r2573; +} +{ +add.f16x2 r2701, r2692, r2698; +} +{ +sub.f16x2 r2704, r2686, r2701; +} +st.local.u32 [rd3+96], r2704; +{ +add.f16x2 r2707, r1741, r1789; +} +{ +mul.f16x2 r2710, r2707, r2569; +} +{ +add.f16x2 r2713, r160, r2710; +} +{ +add.f16x2 r2716, r1757, r1773; +} +{ +mul.f16x2 r2719, r2716, r2571; +} +{ +add.f16x2 r2722, r2713, r2719; +} +{ +sub.f16x2 r2725, r1747, r1795; +} +{ +mul.f16x2 r2728, r2725, r2570; +} +{ +sub.f16x2 r2731, r1763, r1779; +} +{ +mul.f16x2 r2734, r2731, r2573; +} +{ +add.f16x2 r2737, r2728, r2734; +} +{ +add.f16x2 r2740, r2722, r2737; +} +st.local.u32 [rd3+136], r2740; +{ +add.f16x2 r2743, r1747, r1795; +} +{ +mul.f16x2 r2746, r2743, r2567; +} +{ +add.f16x2 r2749, r304, r2746; +} +{ +add.f16x2 r2752, r1763, r1779; +} +{ +mul.f16x2 r2755, r2752, r2569; +} +{ +add.f16x2 r2758, r2749, r2755; +} +{ +sub.f16x2 r2761, r1741, r1789; +} +{ +mul.f16x2 r2764, r2761, r2568; +} +{ +sub.f16x2 r2767, r1757, r1773; +} +{ +mul.f16x2 r2770, r2767, r2570; +} +{ +add.f16x2 r2773, r2764, r2770; +} +{ +add.f16x2 r2776, r2758, r2773; +} +st.local.u32 [rd3+60], r2776; +{ +add.f16x2 r2779, r1747, r1795; +} +{ +mul.f16x2 r2782, r2779, r2567; +} +{ +add.f16x2 r2785, r304, r2782; +} +{ +add.f16x2 r2788, r1763, r1779; +} +{ +mul.f16x2 r2791, r2788, r2569; +} +{ +add.f16x2 r2794, r2785, r2791; +} +{ +sub.f16x2 r2797, r1741, r1789; +} +{ +mul.f16x2 r2800, r2797, r2568; +} +{ +sub.f16x2 r2803, r1757, r1773; +} +{ +mul.f16x2 r2806, r2803, r2570; +} +{ +add.f16x2 r2809, r2800, r2806; +} +{ +sub.f16x2 r2812, r2794, r2809; +} +st.local.u32 [rd3+180], r2812; +{ +add.f16x2 r2815, r1747, r1795; +} +{ +mul.f16x2 
r2818, r2815, r2569; +} +{ +add.f16x2 r2821, r304, r2818; +} +{ +add.f16x2 r2824, r1763, r1779; +} +{ +mul.f16x2 r2827, r2824, r2571; +} +{ +add.f16x2 r2830, r2821, r2827; +} +{ +sub.f16x2 r2833, r1741, r1789; +} +{ +mul.f16x2 r2836, r2833, r2570; +} +{ +sub.f16x2 r2839, r1757, r1773; +} +{ +mul.f16x2 r2842, r2839, r2573; +} +{ +add.f16x2 r2845, r2836, r2842; +} +{ +add.f16x2 r2848, r2830, r2845; +} +st.local.u32 [rd3+100], r2848; +{ +add.f16x2 r2851, r1747, r1795; +} +{ +mul.f16x2 r2854, r2851, r2569; +} +{ +add.f16x2 r2857, r304, r2854; +} +{ +add.f16x2 r2860, r1763, r1779; +} +{ +mul.f16x2 r2863, r2860, r2571; +} +{ +add.f16x2 r2866, r2857, r2863; +} +{ +sub.f16x2 r2869, r1741, r1789; +} +{ +mul.f16x2 r2872, r2869, r2570; +} +{ +sub.f16x2 r2875, r1757, r1773; +} +{ +mul.f16x2 r2878, r2875, r2573; +} +{ +add.f16x2 r2881, r2872, r2878; +} +{ +sub.f16x2 r2884, r2866, r2881; +} +st.local.u32 [rd3+140], r2884; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2887, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2888, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2892, {low, high}; +} +{ +neg.f16x2 r2893, r2892; +} +{ +add.f16x2 r2895, r1805, r1853; +} +{ +add.f16x2 r2898, r196, r2895; +} +{ +add.f16x2 r2901, r1821, r1837; +} +{ +add.f16x2 r2904, r2898, r2901; +} +st.local.u32 [rd3+24], r2904; +{ +add.f16x2 r2907, r1811, r1859; +} +{ +add.f16x2 r2910, r340, r2907; +} +{ +add.f16x2 r2913, r1827, r1843; +} +{ +add.f16x2 r2916, r2910, r2913; +} +st.local.u32 [rd3+28], r2916; +{ 
+add.f16x2 r2919, r1805, r1853; +} +{ +mul.f16x2 r2922, r2919, r2887; +} +{ +add.f16x2 r2925, r196, r2922; +} +{ +add.f16x2 r2928, r1821, r1837; +} +{ +mul.f16x2 r2931, r2928, r2889; +} +{ +add.f16x2 r2934, r2925, r2931; +} +{ +sub.f16x2 r2937, r1811, r1859; +} +{ +mul.f16x2 r2940, r2937, r2888; +} +{ +sub.f16x2 r2943, r1827, r1843; +} +{ +mul.f16x2 r2946, r2943, r2890; +} +{ +add.f16x2 r2949, r2940, r2946; +} +{ +sub.f16x2 r2952, r2934, r2949; +} +st.local.u32 [rd3+64], r2952; +{ +add.f16x2 r2955, r1805, r1853; +} +{ +mul.f16x2 r2958, r2955, r2887; +} +{ +add.f16x2 r2961, r196, r2958; +} +{ +add.f16x2 r2964, r1821, r1837; +} +{ +mul.f16x2 r2967, r2964, r2889; +} +{ +add.f16x2 r2970, r2961, r2967; +} +{ +sub.f16x2 r2973, r1811, r1859; +} +{ +mul.f16x2 r2976, r2973, r2888; +} +{ +sub.f16x2 r2979, r1827, r1843; +} +{ +mul.f16x2 r2982, r2979, r2890; +} +{ +add.f16x2 r2985, r2976, r2982; +} +{ +add.f16x2 r2988, r2970, r2985; +} +st.local.u32 [rd3+184], r2988; +{ +add.f16x2 r2991, r1805, r1853; +} +{ +mul.f16x2 r2994, r2991, r2889; +} +{ +add.f16x2 r2997, r196, r2994; +} +{ +add.f16x2 r3000, r1821, r1837; +} +{ +mul.f16x2 r3003, r3000, r2891; +} +{ +add.f16x2 r3006, r2997, r3003; +} +{ +sub.f16x2 r3009, r1811, r1859; +} +{ +mul.f16x2 r3012, r3009, r2890; +} +{ +sub.f16x2 r3015, r1827, r1843; +} +{ +mul.f16x2 r3018, r3015, r2893; +} +{ +add.f16x2 r3021, r3012, r3018; +} +{ +sub.f16x2 r3024, r3006, r3021; +} +st.local.u32 [rd3+104], r3024; +{ +add.f16x2 r3027, r1805, r1853; +} +{ +mul.f16x2 r3030, r3027, r2889; +} +{ +add.f16x2 r3033, r196, r3030; +} +{ +add.f16x2 r3036, r1821, r1837; +} +{ +mul.f16x2 r3039, r3036, r2891; +} +{ +add.f16x2 r3042, r3033, r3039; +} +{ +sub.f16x2 r3045, r1811, r1859; +} +{ +mul.f16x2 r3048, r3045, r2890; +} +{ +sub.f16x2 r3051, r1827, r1843; +} +{ +mul.f16x2 r3054, r3051, r2893; +} +{ +add.f16x2 r3057, r3048, r3054; +} +{ +add.f16x2 r3060, r3042, r3057; +} +st.local.u32 [rd3+144], r3060; +{ +add.f16x2 r3063, r1811, r1859; +} +{ +mul.f16x2 
r3066, r3063, r2887; +} +{ +add.f16x2 r3069, r340, r3066; +} +{ +add.f16x2 r3072, r1827, r1843; +} +{ +mul.f16x2 r3075, r3072, r2889; +} +{ +add.f16x2 r3078, r3069, r3075; +} +{ +sub.f16x2 r3081, r1805, r1853; +} +{ +mul.f16x2 r3084, r3081, r2888; +} +{ +sub.f16x2 r3087, r1821, r1837; +} +{ +mul.f16x2 r3090, r3087, r2890; +} +{ +add.f16x2 r3093, r3084, r3090; +} +{ +add.f16x2 r3096, r3078, r3093; +} +st.local.u32 [rd3+68], r3096; +{ +add.f16x2 r3099, r1811, r1859; +} +{ +mul.f16x2 r3102, r3099, r2887; +} +{ +add.f16x2 r3105, r340, r3102; +} +{ +add.f16x2 r3108, r1827, r1843; +} +{ +mul.f16x2 r3111, r3108, r2889; +} +{ +add.f16x2 r3114, r3105, r3111; +} +{ +sub.f16x2 r3117, r1805, r1853; +} +{ +mul.f16x2 r3120, r3117, r2888; +} +{ +sub.f16x2 r3123, r1821, r1837; +} +{ +mul.f16x2 r3126, r3123, r2890; +} +{ +add.f16x2 r3129, r3120, r3126; +} +{ +sub.f16x2 r3132, r3114, r3129; +} +st.local.u32 [rd3+188], r3132; +{ +add.f16x2 r3135, r1811, r1859; +} +{ +mul.f16x2 r3138, r3135, r2889; +} +{ +add.f16x2 r3141, r340, r3138; +} +{ +add.f16x2 r3144, r1827, r1843; +} +{ +mul.f16x2 r3147, r3144, r2891; +} +{ +add.f16x2 r3150, r3141, r3147; +} +{ +sub.f16x2 r3153, r1805, r1853; +} +{ +mul.f16x2 r3156, r3153, r2890; +} +{ +sub.f16x2 r3159, r1821, r1837; +} +{ +mul.f16x2 r3162, r3159, r2893; +} +{ +add.f16x2 r3165, r3156, r3162; +} +{ +add.f16x2 r3168, r3150, r3165; +} +st.local.u32 [rd3+108], r3168; +{ +add.f16x2 r3171, r1811, r1859; +} +{ +mul.f16x2 r3174, r3171, r2889; +} +{ +add.f16x2 r3177, r340, r3174; +} +{ +add.f16x2 r3180, r1827, r1843; +} +{ +mul.f16x2 r3183, r3180, r2891; +} +{ +add.f16x2 r3186, r3177, r3183; +} +{ +sub.f16x2 r3189, r1805, r1853; +} +{ +mul.f16x2 r3192, r3189, r2890; +} +{ +sub.f16x2 r3195, r1821, r1837; +} +{ +mul.f16x2 r3198, r3195, r2893; +} +{ +add.f16x2 r3201, r3192, r3198; +} +{ +sub.f16x2 r3204, r3186, r3201; +} +st.local.u32 [rd3+148], r3204; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3207, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3208, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3209, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3210, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3212, {low, high}; +} +{ +neg.f16x2 r3213, r3212; +} +{ +add.f16x2 r3215, r1869, r1917; +} +{ +add.f16x2 r3218, r124, r3215; +} +{ +add.f16x2 r3221, r1885, r1901; +} +{ +add.f16x2 r3224, r3218, r3221; +} +st.local.u32 [rd3+32], r3224; +{ +add.f16x2 r3227, r1875, r1923; +} +{ +add.f16x2 r3230, r268, r3227; +} +{ +add.f16x2 r3233, r1891, r1907; +} +{ +add.f16x2 r3236, r3230, r3233; +} +st.local.u32 [rd3+36], r3236; +{ +add.f16x2 r3239, r1869, r1917; +} +{ +mul.f16x2 r3242, r3239, r3207; +} +{ +add.f16x2 r3245, r124, r3242; +} +{ +add.f16x2 r3248, r1885, r1901; +} +{ +mul.f16x2 r3251, r3248, r3209; +} +{ +add.f16x2 r3254, r3245, r3251; +} +{ +sub.f16x2 r3257, r1875, r1923; +} +{ +mul.f16x2 r3260, r3257, r3208; +} +{ +sub.f16x2 r3263, r1891, r1907; +} +{ +mul.f16x2 r3266, r3263, r3210; +} +{ +add.f16x2 r3269, r3260, r3266; +} +{ +sub.f16x2 r3272, r3254, r3269; +} +st.local.u32 [rd3+72], r3272; +{ +add.f16x2 r3275, r1869, r1917; +} +{ +mul.f16x2 r3278, r3275, r3207; +} +{ +add.f16x2 r3281, r124, r3278; +} +{ +add.f16x2 r3284, r1885, r1901; +} +{ +mul.f16x2 r3287, r3284, r3209; +} +{ +add.f16x2 r3290, r3281, r3287; +} +{ +sub.f16x2 r3293, r1875, r1923; +} +{ +mul.f16x2 r3296, r3293, r3208; +} +{ +sub.f16x2 r3299, r1891, r1907; +} +{ +mul.f16x2 r3302, r3299, r3210; +} +{ +add.f16x2 r3305, r3296, r3302; +} +{ +add.f16x2 r3308, r3290, r3305; +} +st.local.u32 [rd3+192], r3308; +{ +add.f16x2 r3311, r1869, r1917; +} +{ +mul.f16x2 r3314, 
r3311, r3209; +} +{ +add.f16x2 r3317, r124, r3314; +} +{ +add.f16x2 r3320, r1885, r1901; +} +{ +mul.f16x2 r3323, r3320, r3211; +} +{ +add.f16x2 r3326, r3317, r3323; +} +{ +sub.f16x2 r3329, r1875, r1923; +} +{ +mul.f16x2 r3332, r3329, r3210; +} +{ +sub.f16x2 r3335, r1891, r1907; +} +{ +mul.f16x2 r3338, r3335, r3213; +} +{ +add.f16x2 r3341, r3332, r3338; +} +{ +sub.f16x2 r3344, r3326, r3341; +} +st.local.u32 [rd3+112], r3344; +{ +add.f16x2 r3347, r1869, r1917; +} +{ +mul.f16x2 r3350, r3347, r3209; +} +{ +add.f16x2 r3353, r124, r3350; +} +{ +add.f16x2 r3356, r1885, r1901; +} +{ +mul.f16x2 r3359, r3356, r3211; +} +{ +add.f16x2 r3362, r3353, r3359; +} +{ +sub.f16x2 r3365, r1875, r1923; +} +{ +mul.f16x2 r3368, r3365, r3210; +} +{ +sub.f16x2 r3371, r1891, r1907; +} +{ +mul.f16x2 r3374, r3371, r3213; +} +{ +add.f16x2 r3377, r3368, r3374; +} +{ +add.f16x2 r3380, r3362, r3377; +} +st.local.u32 [rd3+152], r3380; +{ +add.f16x2 r3383, r1875, r1923; +} +{ +mul.f16x2 r3386, r3383, r3207; +} +{ +add.f16x2 r3389, r268, r3386; +} +{ +add.f16x2 r3392, r1891, r1907; +} +{ +mul.f16x2 r3395, r3392, r3209; +} +{ +add.f16x2 r3398, r3389, r3395; +} +{ +sub.f16x2 r3401, r1869, r1917; +} +{ +mul.f16x2 r3404, r3401, r3208; +} +{ +sub.f16x2 r3407, r1885, r1901; +} +{ +mul.f16x2 r3410, r3407, r3210; +} +{ +add.f16x2 r3413, r3404, r3410; +} +{ +add.f16x2 r3416, r3398, r3413; +} +st.local.u32 [rd3+76], r3416; +{ +add.f16x2 r3419, r1875, r1923; +} +{ +mul.f16x2 r3422, r3419, r3207; +} +{ +add.f16x2 r3425, r268, r3422; +} +{ +add.f16x2 r3428, r1891, r1907; +} +{ +mul.f16x2 r3431, r3428, r3209; +} +{ +add.f16x2 r3434, r3425, r3431; +} +{ +sub.f16x2 r3437, r1869, r1917; +} +{ +mul.f16x2 r3440, r3437, r3208; +} +{ +sub.f16x2 r3443, r1885, r1901; +} +{ +mul.f16x2 r3446, r3443, r3210; +} +{ +add.f16x2 r3449, r3440, r3446; +} +{ +sub.f16x2 r3452, r3434, r3449; +} +st.local.u32 [rd3+196], r3452; +{ +add.f16x2 r3455, r1875, r1923; +} +{ +mul.f16x2 r3458, r3455, r3209; +} +{ +add.f16x2 r3461, r268, r3458; 
+} +{ +add.f16x2 r3464, r1891, r1907; +} +{ +mul.f16x2 r3467, r3464, r3211; +} +{ +add.f16x2 r3470, r3461, r3467; +} +{ +sub.f16x2 r3473, r1869, r1917; +} +{ +mul.f16x2 r3476, r3473, r3210; +} +{ +sub.f16x2 r3479, r1885, r1901; +} +{ +mul.f16x2 r3482, r3479, r3213; +} +{ +add.f16x2 r3485, r3476, r3482; +} +{ +add.f16x2 r3488, r3470, r3485; +} +st.local.u32 [rd3+116], r3488; +{ +add.f16x2 r3491, r1875, r1923; +} +{ +mul.f16x2 r3494, r3491, r3209; +} +{ +add.f16x2 r3497, r268, r3494; +} +{ +add.f16x2 r3500, r1891, r1907; +} +{ +mul.f16x2 r3503, r3500, r3211; +} +{ +add.f16x2 r3506, r3497, r3503; +} +{ +sub.f16x2 r3509, r1869, r1917; +} +{ +mul.f16x2 r3512, r3509, r3210; +} +{ +sub.f16x2 r3515, r1885, r1901; +} +{ +mul.f16x2 r3518, r3515, r3213; +} +{ +add.f16x2 r3521, r3512, r3518; +} +{ +sub.f16x2 r3524, r3506, r3521; +} +st.local.u32 [rd3+156], r3524; +mov.u32 r3532, %tid.x; +mul.wide.u32 rd11, r3532, -776530087; +shr.u64 rd12, rd11, 41; +cvt.u32.u64 r4, rd12; +mul.lo.s32 r3533, r4, 625; +sub.s32 r3, r3532, r3533; +cvt.rn.f32.u32 f221, r3; +mul.f32 f222, f221, 0f39D2D427; +cos.approx.f32 f217, f222; +sin.approx.f32 f223, f222; +neg.f32 f218, f223; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r11350, {low, high}; +} +mov.u32 r11349, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11350; +mov.b32 r3554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11350; +mov.b32 r3556, {high, high}; +} +bra.uni LBB0_1; +LBB0_2: +ld.local.u32 r11348, [rd5+60]; +LBB0_1: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11350; +mov.b32 r3534, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11350; +mov.b32 r3536, {high, high}; +} +mul.wide.u32 rd13, r11349, 8; +add.s64 rd14, rd3, rd13; +add.s64 rd5, rd14, 4; +ld.local.u32 r3539, [rd14+4]; +{ +mul.f16x2 r3538, r3539, r3536; +} +{ +neg.f16x2 r3541, r3538; +} +{ +fma.rn.f16x2 r3543, r11348, r3534, r3541; +} +st.local.u32 [rd14], r3543; +{ +mul.f16x2 r3547, 
r11348, r3536; +} +{ +fma.rn.f16x2 r3550, r3539, r3534, r3547; +} +st.local.u32 [rd14+4], r3550; +mov.f32 f238, 0fBF800000; +mov.f32 f239, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3558, {low, high}; +} +{ +mul.f16x2 r3559, r3556, r3558; +} +{ +mul.f16x2 r3562, r11350, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11350; +mov.b32 r3565, {high, low}; +} +{ +fma.rn.f16x2 r3567, r3559, r3565, r3562; +} +ld.local.u32 r3585, [rd14+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3571, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3573, {high, high}; +} +ld.local.u32 r3588, [rd14+12]; +{ +mul.f16x2 r3575, r3588, r3573; +} +{ +neg.f16x2 r3578, r3575; +} +{ +fma.rn.f16x2 r3580, r3585, r3571, r3578; +} +st.local.u32 [rd14+8], r3580; +{ +mul.f16x2 r3584, r3585, r3573; +} +{ +fma.rn.f16x2 r3587, r3588, r3571, r3584; +} +st.local.u32 [rd14+12], r3587; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3595, {low, high}; +} +{ +mul.f16x2 r3596, r3556, r3595; +} +{ +mul.f16x2 r3599, r3567, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3602, {high, low}; +} +{ +fma.rn.f16x2 r3604, r3596, r3602, r3599; +} +ld.local.u32 r3622, [rd14+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3608, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3610, {high, high}; +} +ld.local.u32 r3625, [rd14+20]; +{ +mul.f16x2 r3612, r3625, r3610; +} +{ +neg.f16x2 r3615, r3612; +} +{ +fma.rn.f16x2 r3617, r3622, r3608, r3615; +} +st.local.u32 [rd14+16], r3617; +{ +mul.f16x2 r3621, r3622, r3610; +} +{ +fma.rn.f16x2 r3624, r3625, r3608, r3621; +} +st.local.u32 [rd14+20], r3624; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3632, {low, high}; +} +{ +mul.f16x2 r3633, r3556, r3632; +} +{ +mul.f16x2 r3636, r3604, r3554; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3639, {high, low}; +} +{ +fma.rn.f16x2 r3641, r3633, r3639, r3636; +} +ld.local.u32 r3659, [rd14+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3647, {high, high}; +} +ld.local.u32 r3662, [rd14+28]; +{ +mul.f16x2 r3649, r3662, r3647; +} +{ +neg.f16x2 r3652, r3649; +} +{ +fma.rn.f16x2 r3654, r3659, r3645, r3652; +} +st.local.u32 [rd14+24], r3654; +{ +mul.f16x2 r3658, r3659, r3647; +} +{ +fma.rn.f16x2 r3661, r3662, r3645, r3658; +} +st.local.u32 [rd14+28], r3661; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3669, {low, high}; +} +{ +mul.f16x2 r3670, r3556, r3669; +} +{ +mul.f16x2 r3673, r3641, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3676, {high, low}; +} +{ +fma.rn.f16x2 r3678, r3670, r3676, r3673; +} +ld.local.u32 r3692, [rd14+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3684, {high, high}; +} +ld.local.u32 r3699, [rd14+36]; +{ +mul.f16x2 r3686, r3699, r3684; +} +{ +neg.f16x2 r3689, r3686; +} +{ +fma.rn.f16x2 r3691, r3692, r3682, r3689; +} +st.local.u32 [rd14+32], r3691; +{ +mul.f16x2 r3695, r3692, r3684; +} +{ +fma.rn.f16x2 r3698, r3699, r3682, r3695; +} +st.local.u32 [rd14+36], r3698; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3706, {low, high}; +} +{ +mul.f16x2 r3707, r3556, r3706; +} +{ +mul.f16x2 r3710, r3678, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3713, {high, low}; +} +{ +fma.rn.f16x2 r3715, r3707, r3713, r3710; +} +ld.local.u32 r3729, [rd14+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3721, {high, high}; +} +ld.local.u32 r3724, 
[rd14+44]; +{ +mul.f16x2 r3723, r3724, r3721; +} +{ +neg.f16x2 r3726, r3723; +} +{ +fma.rn.f16x2 r3728, r3729, r3719, r3726; +} +st.local.u32 [rd14+40], r3728; +{ +mul.f16x2 r3732, r3729, r3721; +} +{ +fma.rn.f16x2 r3735, r3724, r3719, r3732; +} +st.local.u32 [rd14+44], r3735; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3743, {low, high}; +} +{ +mul.f16x2 r3744, r3556, r3743; +} +{ +mul.f16x2 r3747, r3715, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3750, {high, low}; +} +{ +fma.rn.f16x2 r3752, r3744, r3750, r3747; +} +ld.local.u32 r3766, [rd14+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3758, {high, high}; +} +ld.local.u32 r3761, [rd14+52]; +{ +mul.f16x2 r3760, r3761, r3758; +} +{ +neg.f16x2 r3763, r3760; +} +{ +fma.rn.f16x2 r3765, r3766, r3756, r3763; +} +st.local.u32 [rd14+48], r3765; +{ +mul.f16x2 r3769, r3766, r3758; +} +{ +fma.rn.f16x2 r3772, r3761, r3756, r3769; +} +st.local.u32 [rd14+52], r3772; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3781, r3556, r3780; +} +{ +mul.f16x2 r3784, r3752, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3787, {high, low}; +} +{ +fma.rn.f16x2 r3789, r3781, r3787, r3784; +} +ld.local.u32 r3803, [rd14+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3795, {high, high}; +} +ld.local.u32 r3798, [rd14+60]; +{ +mul.f16x2 r3797, r3798, r3795; +} +{ +neg.f16x2 r3800, r3797; +} +{ +fma.rn.f16x2 r3802, r3803, r3793, r3800; +} +st.local.u32 [rd14+56], r3802; +{ +mul.f16x2 r3806, r3803, r3795; +} +{ +fma.rn.f16x2 r3809, r3798, r3793, r3806; +} +st.local.u32 [rd14+60], r3809; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 
high, f239; +mov.b32 r3817, {low, high}; +} +{ +mul.f16x2 r3818, r3556, r3817; +} +{ +mul.f16x2 r3821, r3789, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3824, {high, low}; +} +{ +fma.rn.f16x2 r11350, r3818, r3824, r3821; +} +add.s32 r11349, r11349, 8; +setp.eq.s32 p1, r11349, 25; +@p1 bra LBB0_3; +bra.uni LBB0_2; +LBB0_3: +shl.b32 r7338, r1, 3; +mov.u32 r7339, %50; +add.s32 r7340, r7339, r7338; +mad.lo.s32 r12, r4, 125000, r7340; +barrier.sync 0; +mad.lo.s32 r7341, r3, 200, r12; +ld.local.v2.u32 {r7342, r7343}, [rd3]; +st.shared.v2.f32 [r7341], {r7342, r7343}; +ld.local.v2.u32 {r7346, r7347}, [rd4+4]; +st.shared.v2.f32 [r7341+8], {r7346, r7347}; +ld.local.v2.u32 {r7350, r7351}, [rd4+12]; +st.shared.v2.f32 [r7341+16], {r7350, r7351}; +ld.local.v2.u32 {r7354, r7355}, [rd4+20]; +st.shared.v2.f32 [r7341+24], {r7354, r7355}; +ld.local.v2.u32 {r7358, r7359}, [rd4+28]; +st.shared.v2.f32 [r7341+32], {r7358, r7359}; +ld.local.v2.u32 {r7362, r7363}, [rd4+36]; +st.shared.v2.f32 [r7341+40], {r7362, r7363}; +ld.local.v2.u32 {r7366, r7367}, [rd4+44]; +st.shared.v2.f32 [r7341+48], {r7366, r7367}; +ld.local.v2.u32 {r7370, r7371}, [rd4+52]; +st.shared.v2.f32 [r7341+56], {r7370, r7371}; +ld.local.v2.u32 {r7374, r7375}, [rd4+60]; +st.shared.v2.f32 [r7341+64], {r7374, r7375}; +ld.local.v2.u32 {r7378, r7379}, [rd4+68]; +st.shared.v2.f32 [r7341+72], {r7378, r7379}; +ld.local.v2.u32 {r7382, r7383}, [rd4+76]; +st.shared.v2.f32 [r7341+80], {r7382, r7383}; +ld.local.v2.u32 {r7386, r7387}, [rd4+84]; +st.shared.v2.f32 [r7341+88], {r7386, r7387}; +ld.local.v2.u32 {r7390, r7391}, [rd4+92]; +st.shared.v2.f32 [r7341+96], {r7390, r7391}; +ld.local.v2.u32 {r7394, r7395}, [rd4+100]; +st.shared.v2.f32 [r7341+104], {r7394, r7395}; +ld.local.v2.u32 {r7398, r7399}, [rd4+108]; +st.shared.v2.f32 [r7341+112], {r7398, r7399}; +ld.local.v2.u32 {r7402, r7403}, [rd4+116]; +st.shared.v2.f32 [r7341+120], {r7402, r7403}; +ld.local.v2.u32 {r7406, r7407}, [rd4+124]; +st.shared.v2.f32 
[r7341+128], {r7406, r7407}; +ld.local.v2.u32 {r7410, r7411}, [rd4+132]; +st.shared.v2.f32 [r7341+136], {r7410, r7411}; +ld.local.v2.u32 {r7414, r7415}, [rd4+140]; +st.shared.v2.f32 [r7341+144], {r7414, r7415}; +ld.local.v2.u32 {r7418, r7419}, [rd4+148]; +st.shared.v2.f32 [r7341+152], {r7418, r7419}; +ld.local.v2.u32 {r7422, r7423}, [rd4+156]; +st.shared.v2.f32 [r7341+160], {r7422, r7423}; +ld.local.v2.u32 {r7426, r7427}, [rd4+164]; +st.shared.v2.f32 [r7341+168], {r7426, r7427}; +ld.local.v2.u32 {r7430, r7431}, [rd4+172]; +st.shared.v2.f32 [r7341+176], {r7430, r7431}; +ld.local.v2.u32 {r7434, r7435}, [rd4+180]; +st.shared.v2.f32 [r7341+184], {r7434, r7435}; +ld.local.v2.u32 {r7438, r7439}, [rd4+188]; +st.shared.v2.f32 [r7341+192], {r7438, r7439}; +barrier.sync 0; +mad.lo.s32 r13, r3, -192, r7341; +ld.shared.u32 r3842, [r13]; +ld.shared.u32 r3854, [r13+4]; +ld.shared.u32 r4162, [r13+5000]; +ld.shared.u32 r4174, [r13+5004]; +ld.shared.u32 r4482, [r13+10000]; +ld.shared.u32 r4494, [r13+10004]; +ld.shared.u32 r4802, [r13+15000]; +ld.shared.u32 r4814, [r13+15004]; +ld.shared.u32 r5122, [r13+20000]; +ld.shared.u32 r5134, [r13+20004]; +ld.shared.u32 r3839, [r13+25000]; +ld.shared.u32 r3851, [r13+25004]; +ld.shared.u32 r4159, [r13+30000]; +ld.shared.u32 r4171, [r13+30004]; +ld.shared.u32 r4479, [r13+35000]; +ld.shared.u32 r4491, [r13+35004]; +ld.shared.u32 r4799, [r13+40000]; +ld.shared.u32 r4811, [r13+40004]; +ld.shared.u32 r5119, [r13+45000]; +ld.shared.u32 r5131, [r13+45004]; +ld.shared.u32 r3845, [r13+50000]; +ld.shared.u32 r3857, [r13+50004]; +ld.shared.u32 r4165, [r13+55000]; +ld.shared.u32 r4177, [r13+55004]; +ld.shared.u32 r4485, [r13+60000]; +ld.shared.u32 r4497, [r13+60004]; +ld.shared.u32 r4805, [r13+65000]; +ld.shared.u32 r4817, [r13+65004]; +ld.shared.u32 r5125, [r13+70000]; +ld.shared.u32 r5137, [r13+70004]; +ld.shared.u32 r3846, [r13+75000]; +ld.shared.u32 r3858, [r13+75004]; +ld.shared.u32 r4166, [r13+80000]; +ld.shared.u32 r4178, [r13+80004]; 
+ld.shared.u32 r4486, [r13+85000]; +ld.shared.u32 r4498, [r13+85004]; +ld.shared.u32 r4806, [r13+90000]; +ld.shared.u32 r4818, [r13+90004]; +ld.shared.u32 r5126, [r13+95000]; +ld.shared.u32 r5138, [r13+95004]; +ld.shared.u32 r3840, [r13+100000]; +ld.shared.u32 r3852, [r13+100004]; +ld.shared.u32 r4160, [r13+105000]; +ld.shared.u32 r4172, [r13+105004]; +ld.shared.u32 r4480, [r13+110000]; +ld.shared.u32 r4492, [r13+110004]; +ld.shared.u32 r4800, [r13+115000]; +ld.shared.u32 r4812, [r13+115004]; +ld.shared.u32 r5120, [r13+120000]; +ld.shared.u32 r5132, [r13+120004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3830, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3831, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3832, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3833, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3835, {low, high}; +} +{ +neg.f16x2 r3836, r3835; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3841, r3844; +} +{ +add.f16x2 r3850, r3851, r3852; +} +{ +add.f16x2 r3853, r3854, r3850; +} +{ +add.f16x2 r3856, r3857, r3858; +} +{ +add.f16x2 r3859, r3853, r3856; +} +{ +add.f16x2 r3862, r3839, r3840; +} +{ +mul.f16x2 r3865, r3862, r3830; +} +{ +add.f16x2 r3868, r3842, r3865; +} +{ +add.f16x2 r3871, r3845, r3846; +} +{ +mul.f16x2 r3874, r3871, r3832; +} +{ +add.f16x2 r3877, r3868, r3874; +} +{ +sub.f16x2 r3880, r3851, r3852; +} +{ +mul.f16x2 r3883, r3880, r3831; +} +{ +sub.f16x2 r3886, r3857, r3858; +} +{ +mul.f16x2 r3889, r3886, r3833; +} +{ +add.f16x2 r3892, r3883, r3889; +} +{ 
+sub.f16x2 r3895, r3877, r3892; +} +{ +add.f16x2 r3898, r3839, r3840; +} +{ +mul.f16x2 r3901, r3898, r3830; +} +{ +add.f16x2 r3904, r3842, r3901; +} +{ +add.f16x2 r3907, r3845, r3846; +} +{ +mul.f16x2 r3910, r3907, r3832; +} +{ +add.f16x2 r3913, r3904, r3910; +} +{ +sub.f16x2 r3916, r3851, r3852; +} +{ +mul.f16x2 r3919, r3916, r3831; +} +{ +sub.f16x2 r3922, r3857, r3858; +} +{ +mul.f16x2 r3925, r3922, r3833; +} +{ +add.f16x2 r3928, r3919, r3925; +} +{ +add.f16x2 r3931, r3913, r3928; +} +{ +add.f16x2 r3934, r3839, r3840; +} +{ +mul.f16x2 r3937, r3934, r3832; +} +{ +add.f16x2 r3940, r3842, r3937; +} +{ +add.f16x2 r3943, r3845, r3846; +} +{ +mul.f16x2 r3946, r3943, r3834; +} +{ +add.f16x2 r3949, r3940, r3946; +} +{ +sub.f16x2 r3952, r3851, r3852; +} +{ +mul.f16x2 r3955, r3952, r3833; +} +{ +sub.f16x2 r3958, r3857, r3858; +} +{ +mul.f16x2 r3961, r3958, r3836; +} +{ +add.f16x2 r3964, r3955, r3961; +} +{ +sub.f16x2 r3967, r3949, r3964; +} +{ +add.f16x2 r3970, r3839, r3840; +} +{ +mul.f16x2 r3973, r3970, r3832; +} +{ +add.f16x2 r3976, r3842, r3973; +} +{ +add.f16x2 r3979, r3845, r3846; +} +{ +mul.f16x2 r3982, r3979, r3834; +} +{ +add.f16x2 r3985, r3976, r3982; +} +{ +sub.f16x2 r3988, r3851, r3852; +} +{ +mul.f16x2 r3991, r3988, r3833; +} +{ +sub.f16x2 r3994, r3857, r3858; +} +{ +mul.f16x2 r3997, r3994, r3836; +} +{ +add.f16x2 r4000, r3991, r3997; +} +{ +add.f16x2 r4003, r3985, r4000; +} +{ +add.f16x2 r4006, r3851, r3852; +} +{ +mul.f16x2 r4009, r4006, r3830; +} +{ +add.f16x2 r4012, r3854, r4009; +} +{ +add.f16x2 r4015, r3857, r3858; +} +{ +mul.f16x2 r4018, r4015, r3832; +} +{ +add.f16x2 r4021, r4012, r4018; +} +{ +sub.f16x2 r4024, r3839, r3840; +} +{ +mul.f16x2 r4027, r4024, r3831; +} +{ +sub.f16x2 r4030, r3845, r3846; +} +{ +mul.f16x2 r4033, r4030, r3833; +} +{ +add.f16x2 r4036, r4027, r4033; +} +{ +add.f16x2 r4039, r4021, r4036; +} +{ +add.f16x2 r4042, r3851, r3852; +} +{ +mul.f16x2 r4045, r4042, r3830; +} +{ +add.f16x2 r4048, r3854, r4045; +} +{ +add.f16x2 r4051, 
r3857, r3858; +} +{ +mul.f16x2 r4054, r4051, r3832; +} +{ +add.f16x2 r4057, r4048, r4054; +} +{ +sub.f16x2 r4060, r3839, r3840; +} +{ +mul.f16x2 r4063, r4060, r3831; +} +{ +sub.f16x2 r4066, r3845, r3846; +} +{ +mul.f16x2 r4069, r4066, r3833; +} +{ +add.f16x2 r4072, r4063, r4069; +} +{ +sub.f16x2 r4075, r4057, r4072; +} +{ +add.f16x2 r4078, r3851, r3852; +} +{ +mul.f16x2 r4081, r4078, r3832; +} +{ +add.f16x2 r4084, r3854, r4081; +} +{ +add.f16x2 r4087, r3857, r3858; +} +{ +mul.f16x2 r4090, r4087, r3834; +} +{ +add.f16x2 r4093, r4084, r4090; +} +{ +sub.f16x2 r4096, r3839, r3840; +} +{ +mul.f16x2 r4099, r4096, r3833; +} +{ +sub.f16x2 r4102, r3845, r3846; +} +{ +mul.f16x2 r4105, r4102, r3836; +} +{ +add.f16x2 r4108, r4099, r4105; +} +{ +add.f16x2 r4111, r4093, r4108; +} +{ +add.f16x2 r4114, r3851, r3852; +} +{ +mul.f16x2 r4117, r4114, r3832; +} +{ +add.f16x2 r4120, r3854, r4117; +} +{ +add.f16x2 r4123, r3857, r3858; +} +{ +mul.f16x2 r4126, r4123, r3834; +} +{ +add.f16x2 r4129, r4120, r4126; +} +{ +sub.f16x2 r4132, r3839, r3840; +} +{ +mul.f16x2 r4135, r4132, r3833; +} +{ +sub.f16x2 r4138, r3845, r3846; +} +{ +mul.f16x2 r4141, r4138, r3836; +} +{ +add.f16x2 r4144, r4135, r4141; +} +{ +sub.f16x2 r4147, r4129, r4144; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4150, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4151, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4152, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4153, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4155, {low, high}; +} +{ +neg.f16x2 r4156, r4155; +} +{ +add.f16x2 r4158, r4159, r4160; +} +{ +add.f16x2 r4161, 
r4162, r4158; +} +{ +add.f16x2 r4164, r4165, r4166; +} +{ +add.f16x2 r4167, r4161, r4164; +} +{ +add.f16x2 r4170, r4171, r4172; +} +{ +add.f16x2 r4173, r4174, r4170; +} +{ +add.f16x2 r4176, r4177, r4178; +} +{ +add.f16x2 r4179, r4173, r4176; +} +{ +add.f16x2 r4182, r4159, r4160; +} +{ +mul.f16x2 r4185, r4182, r4150; +} +{ +add.f16x2 r4188, r4162, r4185; +} +{ +add.f16x2 r4191, r4165, r4166; +} +{ +mul.f16x2 r4194, r4191, r4152; +} +{ +add.f16x2 r4197, r4188, r4194; +} +{ +sub.f16x2 r4200, r4171, r4172; +} +{ +mul.f16x2 r4203, r4200, r4151; +} +{ +sub.f16x2 r4206, r4177, r4178; +} +{ +mul.f16x2 r4209, r4206, r4153; +} +{ +add.f16x2 r4212, r4203, r4209; +} +{ +sub.f16x2 r4215, r4197, r4212; +} +{ +add.f16x2 r4218, r4159, r4160; +} +{ +mul.f16x2 r4221, r4218, r4150; +} +{ +add.f16x2 r4224, r4162, r4221; +} +{ +add.f16x2 r4227, r4165, r4166; +} +{ +mul.f16x2 r4230, r4227, r4152; +} +{ +add.f16x2 r4233, r4224, r4230; +} +{ +sub.f16x2 r4236, r4171, r4172; +} +{ +mul.f16x2 r4239, r4236, r4151; +} +{ +sub.f16x2 r4242, r4177, r4178; +} +{ +mul.f16x2 r4245, r4242, r4153; +} +{ +add.f16x2 r4248, r4239, r4245; +} +{ +add.f16x2 r4251, r4233, r4248; +} +{ +add.f16x2 r4254, r4159, r4160; +} +{ +mul.f16x2 r4257, r4254, r4152; +} +{ +add.f16x2 r4260, r4162, r4257; +} +{ +add.f16x2 r4263, r4165, r4166; +} +{ +mul.f16x2 r4266, r4263, r4154; +} +{ +add.f16x2 r4269, r4260, r4266; +} +{ +sub.f16x2 r4272, r4171, r4172; +} +{ +mul.f16x2 r4275, r4272, r4153; +} +{ +sub.f16x2 r4278, r4177, r4178; +} +{ +mul.f16x2 r4281, r4278, r4156; +} +{ +add.f16x2 r4284, r4275, r4281; +} +{ +sub.f16x2 r4287, r4269, r4284; +} +{ +add.f16x2 r4290, r4159, r4160; +} +{ +mul.f16x2 r4293, r4290, r4152; +} +{ +add.f16x2 r4296, r4162, r4293; +} +{ +add.f16x2 r4299, r4165, r4166; +} +{ +mul.f16x2 r4302, r4299, r4154; +} +{ +add.f16x2 r4305, r4296, r4302; +} +{ +sub.f16x2 r4308, r4171, r4172; +} +{ +mul.f16x2 r4311, r4308, r4153; +} +{ +sub.f16x2 r4314, r4177, r4178; +} +{ +mul.f16x2 r4317, r4314, r4156; +} +{ 
+add.f16x2 r4320, r4311, r4317; +} +{ +add.f16x2 r4323, r4305, r4320; +} +{ +add.f16x2 r4326, r4171, r4172; +} +{ +mul.f16x2 r4329, r4326, r4150; +} +{ +add.f16x2 r4332, r4174, r4329; +} +{ +add.f16x2 r4335, r4177, r4178; +} +{ +mul.f16x2 r4338, r4335, r4152; +} +{ +add.f16x2 r4341, r4332, r4338; +} +{ +sub.f16x2 r4344, r4159, r4160; +} +{ +mul.f16x2 r4347, r4344, r4151; +} +{ +sub.f16x2 r4350, r4165, r4166; +} +{ +mul.f16x2 r4353, r4350, r4153; +} +{ +add.f16x2 r4356, r4347, r4353; +} +{ +add.f16x2 r4359, r4341, r4356; +} +{ +add.f16x2 r4362, r4171, r4172; +} +{ +mul.f16x2 r4365, r4362, r4150; +} +{ +add.f16x2 r4368, r4174, r4365; +} +{ +add.f16x2 r4371, r4177, r4178; +} +{ +mul.f16x2 r4374, r4371, r4152; +} +{ +add.f16x2 r4377, r4368, r4374; +} +{ +sub.f16x2 r4380, r4159, r4160; +} +{ +mul.f16x2 r4383, r4380, r4151; +} +{ +sub.f16x2 r4386, r4165, r4166; +} +{ +mul.f16x2 r4389, r4386, r4153; +} +{ +add.f16x2 r4392, r4383, r4389; +} +{ +sub.f16x2 r4395, r4377, r4392; +} +{ +add.f16x2 r4398, r4171, r4172; +} +{ +mul.f16x2 r4401, r4398, r4152; +} +{ +add.f16x2 r4404, r4174, r4401; +} +{ +add.f16x2 r4407, r4177, r4178; +} +{ +mul.f16x2 r4410, r4407, r4154; +} +{ +add.f16x2 r4413, r4404, r4410; +} +{ +sub.f16x2 r4416, r4159, r4160; +} +{ +mul.f16x2 r4419, r4416, r4153; +} +{ +sub.f16x2 r4422, r4165, r4166; +} +{ +mul.f16x2 r4425, r4422, r4156; +} +{ +add.f16x2 r4428, r4419, r4425; +} +{ +add.f16x2 r4431, r4413, r4428; +} +{ +add.f16x2 r4434, r4171, r4172; +} +{ +mul.f16x2 r4437, r4434, r4152; +} +{ +add.f16x2 r4440, r4174, r4437; +} +{ +add.f16x2 r4443, r4177, r4178; +} +{ +mul.f16x2 r4446, r4443, r4154; +} +{ +add.f16x2 r4449, r4440, r4446; +} +{ +sub.f16x2 r4452, r4159, r4160; +} +{ +mul.f16x2 r4455, r4452, r4153; +} +{ +sub.f16x2 r4458, r4165, r4166; +} +{ +mul.f16x2 r4461, r4458, r4156; +} +{ +add.f16x2 r4464, r4455, r4461; +} +{ +sub.f16x2 r4467, r4449, r4464; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4470, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4475, {low, high}; +} +{ +neg.f16x2 r4476, r4475; +} +{ +add.f16x2 r4478, r4479, r4480; +} +{ +add.f16x2 r4481, r4482, r4478; +} +{ +add.f16x2 r4484, r4485, r4486; +} +{ +add.f16x2 r4487, r4481, r4484; +} +{ +add.f16x2 r4490, r4491, r4492; +} +{ +add.f16x2 r4493, r4494, r4490; +} +{ +add.f16x2 r4496, r4497, r4498; +} +{ +add.f16x2 r4499, r4493, r4496; +} +{ +add.f16x2 r4502, r4479, r4480; +} +{ +mul.f16x2 r4505, r4502, r4470; +} +{ +add.f16x2 r4508, r4482, r4505; +} +{ +add.f16x2 r4511, r4485, r4486; +} +{ +mul.f16x2 r4514, r4511, r4472; +} +{ +add.f16x2 r4517, r4508, r4514; +} +{ +sub.f16x2 r4520, r4491, r4492; +} +{ +mul.f16x2 r4523, r4520, r4471; +} +{ +sub.f16x2 r4526, r4497, r4498; +} +{ +mul.f16x2 r4529, r4526, r4473; +} +{ +add.f16x2 r4532, r4523, r4529; +} +{ +sub.f16x2 r4535, r4517, r4532; +} +{ +add.f16x2 r4538, r4479, r4480; +} +{ +mul.f16x2 r4541, r4538, r4470; +} +{ +add.f16x2 r4544, r4482, r4541; +} +{ +add.f16x2 r4547, r4485, r4486; +} +{ +mul.f16x2 r4550, r4547, r4472; +} +{ +add.f16x2 r4553, r4544, r4550; +} +{ +sub.f16x2 r4556, r4491, r4492; +} +{ +mul.f16x2 r4559, r4556, r4471; +} +{ +sub.f16x2 r4562, r4497, r4498; +} +{ +mul.f16x2 r4565, r4562, r4473; +} +{ +add.f16x2 r4568, r4559, r4565; +} +{ +add.f16x2 r4571, r4553, r4568; +} +{ +add.f16x2 r4574, r4479, r4480; +} +{ +mul.f16x2 r4577, r4574, r4472; +} +{ +add.f16x2 r4580, r4482, r4577; +} +{ +add.f16x2 r4583, r4485, r4486; +} +{ +mul.f16x2 r4586, r4583, 
r4474; +} +{ +add.f16x2 r4589, r4580, r4586; +} +{ +sub.f16x2 r4592, r4491, r4492; +} +{ +mul.f16x2 r4595, r4592, r4473; +} +{ +sub.f16x2 r4598, r4497, r4498; +} +{ +mul.f16x2 r4601, r4598, r4476; +} +{ +add.f16x2 r4604, r4595, r4601; +} +{ +sub.f16x2 r4607, r4589, r4604; +} +{ +add.f16x2 r4610, r4479, r4480; +} +{ +mul.f16x2 r4613, r4610, r4472; +} +{ +add.f16x2 r4616, r4482, r4613; +} +{ +add.f16x2 r4619, r4485, r4486; +} +{ +mul.f16x2 r4622, r4619, r4474; +} +{ +add.f16x2 r4625, r4616, r4622; +} +{ +sub.f16x2 r4628, r4491, r4492; +} +{ +mul.f16x2 r4631, r4628, r4473; +} +{ +sub.f16x2 r4634, r4497, r4498; +} +{ +mul.f16x2 r4637, r4634, r4476; +} +{ +add.f16x2 r4640, r4631, r4637; +} +{ +add.f16x2 r4643, r4625, r4640; +} +{ +add.f16x2 r4646, r4491, r4492; +} +{ +mul.f16x2 r4649, r4646, r4470; +} +{ +add.f16x2 r4652, r4494, r4649; +} +{ +add.f16x2 r4655, r4497, r4498; +} +{ +mul.f16x2 r4658, r4655, r4472; +} +{ +add.f16x2 r4661, r4652, r4658; +} +{ +sub.f16x2 r4664, r4479, r4480; +} +{ +mul.f16x2 r4667, r4664, r4471; +} +{ +sub.f16x2 r4670, r4485, r4486; +} +{ +mul.f16x2 r4673, r4670, r4473; +} +{ +add.f16x2 r4676, r4667, r4673; +} +{ +add.f16x2 r4679, r4661, r4676; +} +{ +add.f16x2 r4682, r4491, r4492; +} +{ +mul.f16x2 r4685, r4682, r4470; +} +{ +add.f16x2 r4688, r4494, r4685; +} +{ +add.f16x2 r4691, r4497, r4498; +} +{ +mul.f16x2 r4694, r4691, r4472; +} +{ +add.f16x2 r4697, r4688, r4694; +} +{ +sub.f16x2 r4700, r4479, r4480; +} +{ +mul.f16x2 r4703, r4700, r4471; +} +{ +sub.f16x2 r4706, r4485, r4486; +} +{ +mul.f16x2 r4709, r4706, r4473; +} +{ +add.f16x2 r4712, r4703, r4709; +} +{ +sub.f16x2 r4715, r4697, r4712; +} +{ +add.f16x2 r4718, r4491, r4492; +} +{ +mul.f16x2 r4721, r4718, r4472; +} +{ +add.f16x2 r4724, r4494, r4721; +} +{ +add.f16x2 r4727, r4497, r4498; +} +{ +mul.f16x2 r4730, r4727, r4474; +} +{ +add.f16x2 r4733, r4724, r4730; +} +{ +sub.f16x2 r4736, r4479, r4480; +} +{ +mul.f16x2 r4739, r4736, r4473; +} +{ +sub.f16x2 r4742, r4485, r4486; +} +{ +mul.f16x2 
r4745, r4742, r4476; +} +{ +add.f16x2 r4748, r4739, r4745; +} +{ +add.f16x2 r4751, r4733, r4748; +} +{ +add.f16x2 r4754, r4491, r4492; +} +{ +mul.f16x2 r4757, r4754, r4472; +} +{ +add.f16x2 r4760, r4494, r4757; +} +{ +add.f16x2 r4763, r4497, r4498; +} +{ +mul.f16x2 r4766, r4763, r4474; +} +{ +add.f16x2 r4769, r4760, r4766; +} +{ +sub.f16x2 r4772, r4479, r4480; +} +{ +mul.f16x2 r4775, r4772, r4473; +} +{ +sub.f16x2 r4778, r4485, r4486; +} +{ +mul.f16x2 r4781, r4778, r4476; +} +{ +add.f16x2 r4784, r4775, r4781; +} +{ +sub.f16x2 r4787, r4769, r4784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4790, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4794, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4795, {low, high}; +} +{ +neg.f16x2 r4796, r4795; +} +{ +add.f16x2 r4798, r4799, r4800; +} +{ +add.f16x2 r4801, r4802, r4798; +} +{ +add.f16x2 r4804, r4805, r4806; +} +{ +add.f16x2 r4807, r4801, r4804; +} +{ +add.f16x2 r4810, r4811, r4812; +} +{ +add.f16x2 r4813, r4814, r4810; +} +{ +add.f16x2 r4816, r4817, r4818; +} +{ +add.f16x2 r4819, r4813, r4816; +} +{ +add.f16x2 r4822, r4799, r4800; +} +{ +mul.f16x2 r4825, r4822, r4790; +} +{ +add.f16x2 r4828, r4802, r4825; +} +{ +add.f16x2 r4831, r4805, r4806; +} +{ +mul.f16x2 r4834, r4831, r4792; +} +{ +add.f16x2 r4837, r4828, r4834; +} +{ +sub.f16x2 r4840, r4811, r4812; +} +{ +mul.f16x2 r4843, r4840, r4791; +} +{ +sub.f16x2 r4846, r4817, r4818; +} +{ +mul.f16x2 r4849, r4846, r4793; +} +{ +add.f16x2 r4852, r4843, r4849; +} +{ +sub.f16x2 
r4855, r4837, r4852; +} +{ +add.f16x2 r4858, r4799, r4800; +} +{ +mul.f16x2 r4861, r4858, r4790; +} +{ +add.f16x2 r4864, r4802, r4861; +} +{ +add.f16x2 r4867, r4805, r4806; +} +{ +mul.f16x2 r4870, r4867, r4792; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +sub.f16x2 r4876, r4811, r4812; +} +{ +mul.f16x2 r4879, r4876, r4791; +} +{ +sub.f16x2 r4882, r4817, r4818; +} +{ +mul.f16x2 r4885, r4882, r4793; +} +{ +add.f16x2 r4888, r4879, r4885; +} +{ +add.f16x2 r4891, r4873, r4888; +} +{ +add.f16x2 r4894, r4799, r4800; +} +{ +mul.f16x2 r4897, r4894, r4792; +} +{ +add.f16x2 r4900, r4802, r4897; +} +{ +add.f16x2 r4903, r4805, r4806; +} +{ +mul.f16x2 r4906, r4903, r4794; +} +{ +add.f16x2 r4909, r4900, r4906; +} +{ +sub.f16x2 r4912, r4811, r4812; +} +{ +mul.f16x2 r4915, r4912, r4793; +} +{ +sub.f16x2 r4918, r4817, r4818; +} +{ +mul.f16x2 r4921, r4918, r4796; +} +{ +add.f16x2 r4924, r4915, r4921; +} +{ +sub.f16x2 r4927, r4909, r4924; +} +{ +add.f16x2 r4930, r4799, r4800; +} +{ +mul.f16x2 r4933, r4930, r4792; +} +{ +add.f16x2 r4936, r4802, r4933; +} +{ +add.f16x2 r4939, r4805, r4806; +} +{ +mul.f16x2 r4942, r4939, r4794; +} +{ +add.f16x2 r4945, r4936, r4942; +} +{ +sub.f16x2 r4948, r4811, r4812; +} +{ +mul.f16x2 r4951, r4948, r4793; +} +{ +sub.f16x2 r4954, r4817, r4818; +} +{ +mul.f16x2 r4957, r4954, r4796; +} +{ +add.f16x2 r4960, r4951, r4957; +} +{ +add.f16x2 r4963, r4945, r4960; +} +{ +add.f16x2 r4966, r4811, r4812; +} +{ +mul.f16x2 r4969, r4966, r4790; +} +{ +add.f16x2 r4972, r4814, r4969; +} +{ +add.f16x2 r4975, r4817, r4818; +} +{ +mul.f16x2 r4978, r4975, r4792; +} +{ +add.f16x2 r4981, r4972, r4978; +} +{ +sub.f16x2 r4984, r4799, r4800; +} +{ +mul.f16x2 r4987, r4984, r4791; +} +{ +sub.f16x2 r4990, r4805, r4806; +} +{ +mul.f16x2 r4993, r4990, r4793; +} +{ +add.f16x2 r4996, r4987, r4993; +} +{ +add.f16x2 r4999, r4981, r4996; +} +{ +add.f16x2 r5002, r4811, r4812; +} +{ +mul.f16x2 r5005, r5002, r4790; +} +{ +add.f16x2 r5008, r4814, r5005; +} +{ +add.f16x2 r5011, r4817, r4818; +} 
+{ +mul.f16x2 r5014, r5011, r4792; +} +{ +add.f16x2 r5017, r5008, r5014; +} +{ +sub.f16x2 r5020, r4799, r4800; +} +{ +mul.f16x2 r5023, r5020, r4791; +} +{ +sub.f16x2 r5026, r4805, r4806; +} +{ +mul.f16x2 r5029, r5026, r4793; +} +{ +add.f16x2 r5032, r5023, r5029; +} +{ +sub.f16x2 r5035, r5017, r5032; +} +{ +add.f16x2 r5038, r4811, r4812; +} +{ +mul.f16x2 r5041, r5038, r4792; +} +{ +add.f16x2 r5044, r4814, r5041; +} +{ +add.f16x2 r5047, r4817, r4818; +} +{ +mul.f16x2 r5050, r5047, r4794; +} +{ +add.f16x2 r5053, r5044, r5050; +} +{ +sub.f16x2 r5056, r4799, r4800; +} +{ +mul.f16x2 r5059, r5056, r4793; +} +{ +sub.f16x2 r5062, r4805, r4806; +} +{ +mul.f16x2 r5065, r5062, r4796; +} +{ +add.f16x2 r5068, r5059, r5065; +} +{ +add.f16x2 r5071, r5053, r5068; +} +{ +add.f16x2 r5074, r4811, r4812; +} +{ +mul.f16x2 r5077, r5074, r4792; +} +{ +add.f16x2 r5080, r4814, r5077; +} +{ +add.f16x2 r5083, r4817, r4818; +} +{ +mul.f16x2 r5086, r5083, r4794; +} +{ +add.f16x2 r5089, r5080, r5086; +} +{ +sub.f16x2 r5092, r4799, r4800; +} +{ +mul.f16x2 r5095, r5092, r4793; +} +{ +sub.f16x2 r5098, r4805, r4806; +} +{ +mul.f16x2 r5101, r5098, r4796; +} +{ +add.f16x2 r5104, r5095, r5101; +} +{ +sub.f16x2 r5107, r5089, r5104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5115, {low, high}; +} +{ +neg.f16x2 r5116, r5115; +} +{ +add.f16x2 r5118, r5119, r5120; +} +{ +add.f16x2 r5121, r5122, r5118; +} +{ 
+add.f16x2 r5124, r5125, r5126; +} +{ +add.f16x2 r5127, r5121, r5124; +} +{ +add.f16x2 r5130, r5131, r5132; +} +{ +add.f16x2 r5133, r5134, r5130; +} +{ +add.f16x2 r5136, r5137, r5138; +} +{ +add.f16x2 r5139, r5133, r5136; +} +{ +add.f16x2 r5142, r5119, r5120; +} +{ +mul.f16x2 r5145, r5142, r5110; +} +{ +add.f16x2 r5148, r5122, r5145; +} +{ +add.f16x2 r5151, r5125, r5126; +} +{ +mul.f16x2 r5154, r5151, r5112; +} +{ +add.f16x2 r5157, r5148, r5154; +} +{ +sub.f16x2 r5160, r5131, r5132; +} +{ +mul.f16x2 r5163, r5160, r5111; +} +{ +sub.f16x2 r5166, r5137, r5138; +} +{ +mul.f16x2 r5169, r5166, r5113; +} +{ +add.f16x2 r5172, r5163, r5169; +} +{ +sub.f16x2 r5175, r5157, r5172; +} +{ +add.f16x2 r5178, r5119, r5120; +} +{ +mul.f16x2 r5181, r5178, r5110; +} +{ +add.f16x2 r5184, r5122, r5181; +} +{ +add.f16x2 r5187, r5125, r5126; +} +{ +mul.f16x2 r5190, r5187, r5112; +} +{ +add.f16x2 r5193, r5184, r5190; +} +{ +sub.f16x2 r5196, r5131, r5132; +} +{ +mul.f16x2 r5199, r5196, r5111; +} +{ +sub.f16x2 r5202, r5137, r5138; +} +{ +mul.f16x2 r5205, r5202, r5113; +} +{ +add.f16x2 r5208, r5199, r5205; +} +{ +add.f16x2 r5211, r5193, r5208; +} +{ +add.f16x2 r5214, r5119, r5120; +} +{ +mul.f16x2 r5217, r5214, r5112; +} +{ +add.f16x2 r5220, r5122, r5217; +} +{ +add.f16x2 r5223, r5125, r5126; +} +{ +mul.f16x2 r5226, r5223, r5114; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +sub.f16x2 r5232, r5131, r5132; +} +{ +mul.f16x2 r5235, r5232, r5113; +} +{ +sub.f16x2 r5238, r5137, r5138; +} +{ +mul.f16x2 r5241, r5238, r5116; +} +{ +add.f16x2 r5244, r5235, r5241; +} +{ +sub.f16x2 r5247, r5229, r5244; +} +{ +add.f16x2 r5250, r5119, r5120; +} +{ +mul.f16x2 r5253, r5250, r5112; +} +{ +add.f16x2 r5256, r5122, r5253; +} +{ +add.f16x2 r5259, r5125, r5126; +} +{ +mul.f16x2 r5262, r5259, r5114; +} +{ +add.f16x2 r5265, r5256, r5262; +} +{ +sub.f16x2 r5268, r5131, r5132; +} +{ +mul.f16x2 r5271, r5268, r5113; +} +{ +sub.f16x2 r5274, r5137, r5138; +} +{ +mul.f16x2 r5277, r5274, r5116; +} +{ +add.f16x2 r5280, 
r5271, r5277; +} +{ +add.f16x2 r5283, r5265, r5280; +} +{ +add.f16x2 r5286, r5131, r5132; +} +{ +mul.f16x2 r5289, r5286, r5110; +} +{ +add.f16x2 r5292, r5134, r5289; +} +{ +add.f16x2 r5295, r5137, r5138; +} +{ +mul.f16x2 r5298, r5295, r5112; +} +{ +add.f16x2 r5301, r5292, r5298; +} +{ +sub.f16x2 r5304, r5119, r5120; +} +{ +mul.f16x2 r5307, r5304, r5111; +} +{ +sub.f16x2 r5310, r5125, r5126; +} +{ +mul.f16x2 r5313, r5310, r5113; +} +{ +add.f16x2 r5316, r5307, r5313; +} +{ +add.f16x2 r5319, r5301, r5316; +} +{ +add.f16x2 r5322, r5131, r5132; +} +{ +mul.f16x2 r5325, r5322, r5110; +} +{ +add.f16x2 r5328, r5134, r5325; +} +{ +add.f16x2 r5331, r5137, r5138; +} +{ +mul.f16x2 r5334, r5331, r5112; +} +{ +add.f16x2 r5337, r5328, r5334; +} +{ +sub.f16x2 r5340, r5119, r5120; +} +{ +mul.f16x2 r5343, r5340, r5111; +} +{ +sub.f16x2 r5346, r5125, r5126; +} +{ +mul.f16x2 r5349, r5346, r5113; +} +{ +add.f16x2 r5352, r5343, r5349; +} +{ +sub.f16x2 r5355, r5337, r5352; +} +{ +add.f16x2 r5358, r5131, r5132; +} +{ +mul.f16x2 r5361, r5358, r5112; +} +{ +add.f16x2 r5364, r5134, r5361; +} +{ +add.f16x2 r5367, r5137, r5138; +} +{ +mul.f16x2 r5370, r5367, r5114; +} +{ +add.f16x2 r5373, r5364, r5370; +} +{ +sub.f16x2 r5376, r5119, r5120; +} +{ +mul.f16x2 r5379, r5376, r5113; +} +{ +sub.f16x2 r5382, r5125, r5126; +} +{ +mul.f16x2 r5385, r5382, r5116; +} +{ +add.f16x2 r5388, r5379, r5385; +} +{ +add.f16x2 r5391, r5373, r5388; +} +{ +add.f16x2 r5394, r5131, r5132; +} +{ +mul.f16x2 r5397, r5394, r5112; +} +{ +add.f16x2 r5400, r5134, r5397; +} +{ +add.f16x2 r5403, r5137, r5138; +} +{ +mul.f16x2 r5406, r5403, r5114; +} +{ +add.f16x2 r5409, r5400, r5406; +} +{ +sub.f16x2 r5412, r5119, r5120; +} +{ +mul.f16x2 r5415, r5412, r5113; +} +{ +sub.f16x2 r5418, r5125, r5126; +} +{ +mul.f16x2 r5421, r5418, r5116; +} +{ +add.f16x2 r5424, r5415, r5421; +} +{ +sub.f16x2 r5427, r5409, r5424; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r5430, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r5431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r5432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r5433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r5434, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r5435, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r5436, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r5437, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r5441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r5444, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r5445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5446, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r5447, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r5453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r5461, {low, high}; +} +{ +mul.f16x2 r5478, r4215, r5430; +} +{ +mul.f16x2 r5481, r4359, r5431; +} +{ +sub.f16x2 r5484, r5478, r5481; +} +{ +mul.f16x2 
r5487, r4215, r5431; +} +{ +fma.rn.f16x2 r5490, r4359, r5430, r5487; +} +{ +mul.f16x2 r5494, r4535, r5432; +} +{ +mul.f16x2 r5497, r4679, r5433; +} +{ +sub.f16x2 r5500, r5494, r5497; +} +{ +mul.f16x2 r5503, r4535, r5433; +} +{ +fma.rn.f16x2 r5506, r4679, r5432, r5503; +} +{ +mul.f16x2 r5510, r4855, r5434; +} +{ +mul.f16x2 r5513, r4999, r5435; +} +{ +sub.f16x2 r5516, r5510, r5513; +} +{ +mul.f16x2 r5519, r4855, r5435; +} +{ +fma.rn.f16x2 r5522, r4999, r5434, r5519; +} +{ +mul.f16x2 r5526, r5175, r5436; +} +{ +mul.f16x2 r5529, r5319, r5437; +} +{ +sub.f16x2 r5532, r5526, r5529; +} +{ +mul.f16x2 r5535, r5175, r5437; +} +{ +fma.rn.f16x2 r5538, r5319, r5436, r5535; +} +{ +mul.f16x2 r5542, r4287, r5432; +} +{ +mul.f16x2 r5545, r4431, r5433; +} +{ +sub.f16x2 r5548, r5542, r5545; +} +{ +mul.f16x2 r5551, r4287, r5433; +} +{ +fma.rn.f16x2 r5554, r4431, r5432, r5551; +} +{ +mul.f16x2 r5558, r4607, r5436; +} +{ +mul.f16x2 r5561, r4751, r5437; +} +{ +sub.f16x2 r5564, r5558, r5561; +} +{ +mul.f16x2 r5567, r4607, r5437; +} +{ +fma.rn.f16x2 r5570, r4751, r5436, r5567; +} +{ +mul.f16x2 r5574, r4927, r5440; +} +{ +mul.f16x2 r5577, r5071, r5441; +} +{ +sub.f16x2 r5580, r5574, r5577; +} +{ +mul.f16x2 r5583, r4927, r5441; +} +{ +fma.rn.f16x2 r5586, r5071, r5440, r5583; +} +{ +mul.f16x2 r5590, r5247, r5444; +} +{ +mul.f16x2 r5593, r5391, r5445; +} +{ +sub.f16x2 r5596, r5590, r5593; +} +{ +mul.f16x2 r5599, r5247, r5445; +} +{ +fma.rn.f16x2 r5602, r5391, r5444, r5599; +} +{ +mul.f16x2 r5606, r4323, r5434; +} +{ +mul.f16x2 r5609, r4467, r5435; +} +{ +sub.f16x2 r5612, r5606, r5609; +} +{ +mul.f16x2 r5615, r4323, r5435; +} +{ +fma.rn.f16x2 r5618, r4467, r5434, r5615; +} +{ +mul.f16x2 r5622, r4643, r5440; +} +{ +mul.f16x2 r5625, r4787, r5441; +} +{ +sub.f16x2 r5628, r5622, r5625; +} +{ +mul.f16x2 r5631, r4643, r5441; +} +{ +fma.rn.f16x2 r5634, r4787, r5440, r5631; +} +{ +mul.f16x2 r5638, r4963, r5446; +} +{ +mul.f16x2 r5641, r5107, r5447; +} +{ +sub.f16x2 r5644, r5638, r5641; +} +{ +mul.f16x2 
r5647, r4963, r5447; +} +{ +fma.rn.f16x2 r5650, r5107, r5446, r5647; +} +{ +mul.f16x2 r5654, r5283, r5452; +} +{ +mul.f16x2 r5657, r5427, r5453; +} +{ +sub.f16x2 r5660, r5654, r5657; +} +{ +mul.f16x2 r5663, r5283, r5453; +} +{ +fma.rn.f16x2 r5666, r5427, r5452, r5663; +} +{ +mul.f16x2 r5670, r4251, r5436; +} +{ +mul.f16x2 r5673, r4395, r5437; +} +{ +sub.f16x2 r5676, r5670, r5673; +} +{ +mul.f16x2 r5679, r4251, r5437; +} +{ +fma.rn.f16x2 r5682, r4395, r5436, r5679; +} +{ +mul.f16x2 r5686, r4571, r5444; +} +{ +mul.f16x2 r5689, r4715, r5445; +} +{ +sub.f16x2 r5692, r5686, r5689; +} +{ +mul.f16x2 r5695, r4571, r5445; +} +{ +fma.rn.f16x2 r5698, r4715, r5444, r5695; +} +{ +mul.f16x2 r5702, r4891, r5452; +} +{ +mul.f16x2 r5705, r5035, r5453; +} +{ +sub.f16x2 r5708, r5702, r5705; +} +{ +mul.f16x2 r5711, r4891, r5453; +} +{ +fma.rn.f16x2 r5714, r5035, r5452, r5711; +} +{ +mul.f16x2 r5718, r5211, r5460; +} +{ +mul.f16x2 r5721, r5355, r5461; +} +{ +sub.f16x2 r5724, r5718, r5721; +} +{ +mul.f16x2 r5727, r5211, r5461; +} +{ +fma.rn.f16x2 r5730, r5355, r5460, r5727; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5739, {low, high}; +} +{ +neg.f16x2 r5740, r5739; +} +{ +add.f16x2 r5742, r4167, r5127; +} +{ +add.f16x2 r5745, r3847, r5742; +} +{ +add.f16x2 r5748, r4487, r4807; +} +{ +add.f16x2 r5751, r5745, r5748; +} +st.local.u32 [rd3], r5751; +{ +add.f16x2 r5754, r4179, r5139; +} +{ 
+add.f16x2 r5757, r3859, r5754; +} +{ +add.f16x2 r5760, r4499, r4819; +} +{ +add.f16x2 r5763, r5757, r5760; +} +st.local.u32 [rd4], r5763; +{ +add.f16x2 r5766, r4167, r5127; +} +{ +mul.f16x2 r5769, r5766, r5734; +} +{ +add.f16x2 r5772, r3847, r5769; +} +{ +add.f16x2 r5775, r4487, r4807; +} +{ +mul.f16x2 r5778, r5775, r5736; +} +{ +add.f16x2 r5781, r5772, r5778; +} +{ +sub.f16x2 r5784, r4179, r5139; +} +{ +mul.f16x2 r5787, r5784, r5735; +} +{ +sub.f16x2 r5790, r4499, r4819; +} +{ +mul.f16x2 r5793, r5790, r5737; +} +{ +add.f16x2 r5796, r5787, r5793; +} +{ +sub.f16x2 r5799, r5781, r5796; +} +st.local.u32 [rd4+36], r5799; +{ +add.f16x2 r5802, r4167, r5127; +} +{ +mul.f16x2 r5805, r5802, r5734; +} +{ +add.f16x2 r5808, r3847, r5805; +} +{ +add.f16x2 r5811, r4487, r4807; +} +{ +mul.f16x2 r5814, r5811, r5736; +} +{ +add.f16x2 r5817, r5808, r5814; +} +{ +sub.f16x2 r5820, r4179, r5139; +} +{ +mul.f16x2 r5823, r5820, r5735; +} +{ +sub.f16x2 r5826, r4499, r4819; +} +{ +mul.f16x2 r5829, r5826, r5737; +} +{ +add.f16x2 r5832, r5823, r5829; +} +{ +add.f16x2 r5835, r5817, r5832; +} +st.local.u32 [rd4+156], r5835; +{ +add.f16x2 r5838, r4167, r5127; +} +{ +mul.f16x2 r5841, r5838, r5736; +} +{ +add.f16x2 r5844, r3847, r5841; +} +{ +add.f16x2 r5847, r4487, r4807; +} +{ +mul.f16x2 r5850, r5847, r5738; +} +{ +add.f16x2 r5853, r5844, r5850; +} +{ +sub.f16x2 r5856, r4179, r5139; +} +{ +mul.f16x2 r5859, r5856, r5737; +} +{ +sub.f16x2 r5862, r4499, r4819; +} +{ +mul.f16x2 r5865, r5862, r5740; +} +{ +add.f16x2 r5868, r5859, r5865; +} +{ +sub.f16x2 r5871, r5853, r5868; +} +st.local.u32 [rd4+76], r5871; +{ +add.f16x2 r5874, r4167, r5127; +} +{ +mul.f16x2 r5877, r5874, r5736; +} +{ +add.f16x2 r5880, r3847, r5877; +} +{ +add.f16x2 r5883, r4487, r4807; +} +{ +mul.f16x2 r5886, r5883, r5738; +} +{ +add.f16x2 r5889, r5880, r5886; +} +{ +sub.f16x2 r5892, r4179, r5139; +} +{ +mul.f16x2 r5895, r5892, r5737; +} +{ +sub.f16x2 r5898, r4499, r4819; +} +{ +mul.f16x2 r5901, r5898, r5740; +} +{ +add.f16x2 
r5904, r5895, r5901; +} +{ +add.f16x2 r5907, r5889, r5904; +} +st.local.u32 [rd4+116], r5907; +{ +add.f16x2 r5910, r4179, r5139; +} +{ +mul.f16x2 r5913, r5910, r5734; +} +{ +add.f16x2 r5916, r3859, r5913; +} +{ +add.f16x2 r5919, r4499, r4819; +} +{ +mul.f16x2 r5922, r5919, r5736; +} +{ +add.f16x2 r5925, r5916, r5922; +} +{ +sub.f16x2 r5928, r4167, r5127; +} +{ +mul.f16x2 r5931, r5928, r5735; +} +{ +sub.f16x2 r5934, r4487, r4807; +} +{ +mul.f16x2 r5937, r5934, r5737; +} +{ +add.f16x2 r5940, r5931, r5937; +} +{ +add.f16x2 r5943, r5925, r5940; +} +st.local.u32 [rd4+40], r5943; +{ +add.f16x2 r5946, r4179, r5139; +} +{ +mul.f16x2 r5949, r5946, r5734; +} +{ +add.f16x2 r5952, r3859, r5949; +} +{ +add.f16x2 r5955, r4499, r4819; +} +{ +mul.f16x2 r5958, r5955, r5736; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +sub.f16x2 r5964, r4167, r5127; +} +{ +mul.f16x2 r5967, r5964, r5735; +} +{ +sub.f16x2 r5970, r4487, r4807; +} +{ +mul.f16x2 r5973, r5970, r5737; +} +{ +add.f16x2 r5976, r5967, r5973; +} +{ +sub.f16x2 r5979, r5961, r5976; +} +st.local.u32 [rd4+160], r5979; +{ +add.f16x2 r5982, r4179, r5139; +} +{ +mul.f16x2 r5985, r5982, r5736; +} +{ +add.f16x2 r5988, r3859, r5985; +} +{ +add.f16x2 r5991, r4499, r4819; +} +{ +mul.f16x2 r5994, r5991, r5738; +} +{ +add.f16x2 r5997, r5988, r5994; +} +{ +sub.f16x2 r6000, r4167, r5127; +} +{ +mul.f16x2 r6003, r6000, r5737; +} +{ +sub.f16x2 r6006, r4487, r4807; +} +{ +mul.f16x2 r6009, r6006, r5740; +} +{ +add.f16x2 r6012, r6003, r6009; +} +{ +add.f16x2 r6015, r5997, r6012; +} +st.local.u32 [rd4+80], r6015; +{ +add.f16x2 r6018, r4179, r5139; +} +{ +mul.f16x2 r6021, r6018, r5736; +} +{ +add.f16x2 r6024, r3859, r6021; +} +{ +add.f16x2 r6027, r4499, r4819; +} +{ +mul.f16x2 r6030, r6027, r5738; +} +{ +add.f16x2 r6033, r6024, r6030; +} +{ +sub.f16x2 r6036, r4167, r5127; +} +{ +mul.f16x2 r6039, r6036, r5737; +} +{ +sub.f16x2 r6042, r4487, r4807; +} +{ +mul.f16x2 r6045, r6042, r5740; +} +{ +add.f16x2 r6048, r6039, r6045; +} +{ +sub.f16x2 r6051, 
r6033, r6048; +} +st.local.u32 [rd4+120], r6051; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6054, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6057, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6058, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6059, {low, high}; +} +{ +neg.f16x2 r6060, r6059; +} +{ +add.f16x2 r6062, r5484, r5532; +} +{ +add.f16x2 r6065, r3895, r6062; +} +{ +add.f16x2 r6068, r5500, r5516; +} +{ +add.f16x2 r11352, r6065, r6068; +} +st.local.u32 [rd4+4], r11352; +{ +add.f16x2 r6074, r5490, r5538; +} +{ +add.f16x2 r6077, r4039, r6074; +} +{ +add.f16x2 r6080, r5506, r5522; +} +{ +add.f16x2 r6083, r6077, r6080; +} +st.local.u32 [rd4+8], r6083; +{ +add.f16x2 r6086, r5484, r5532; +} +{ +mul.f16x2 r6089, r6086, r6054; +} +{ +add.f16x2 r6092, r3895, r6089; +} +{ +add.f16x2 r6095, r5500, r5516; +} +{ +mul.f16x2 r6098, r6095, r6056; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +sub.f16x2 r6104, r5490, r5538; +} +{ +mul.f16x2 r6107, r6104, r6055; +} +{ +sub.f16x2 r6110, r5506, r5522; +} +{ +mul.f16x2 r6113, r6110, r6057; +} +{ +add.f16x2 r6116, r6107, r6113; +} +{ +sub.f16x2 r6119, r6101, r6116; +} +st.local.u32 [rd4+44], r6119; +{ +add.f16x2 r6122, r5484, r5532; +} +{ +mul.f16x2 r6125, r6122, r6054; +} +{ +add.f16x2 r6128, r3895, r6125; +} +{ +add.f16x2 r6131, r5500, r5516; +} +{ +mul.f16x2 r6134, r6131, r6056; +} +{ +add.f16x2 r6137, r6128, r6134; +} +{ +sub.f16x2 r6140, r5490, r5538; +} +{ +mul.f16x2 r6143, r6140, r6055; +} +{ +sub.f16x2 r6146, r5506, r5522; +} +{ +mul.f16x2 r6149, r6146, r6057; +} +{ +add.f16x2 
r6152, r6143, r6149; +} +{ +add.f16x2 r6155, r6137, r6152; +} +st.local.u32 [rd4+164], r6155; +{ +add.f16x2 r6158, r5484, r5532; +} +{ +mul.f16x2 r6161, r6158, r6056; +} +{ +add.f16x2 r6164, r3895, r6161; +} +{ +add.f16x2 r6167, r5500, r5516; +} +{ +mul.f16x2 r6170, r6167, r6058; +} +{ +add.f16x2 r6173, r6164, r6170; +} +{ +sub.f16x2 r6176, r5490, r5538; +} +{ +mul.f16x2 r6179, r6176, r6057; +} +{ +sub.f16x2 r6182, r5506, r5522; +} +{ +mul.f16x2 r6185, r6182, r6060; +} +{ +add.f16x2 r6188, r6179, r6185; +} +{ +sub.f16x2 r6191, r6173, r6188; +} +st.local.u32 [rd4+84], r6191; +{ +add.f16x2 r6194, r5484, r5532; +} +{ +mul.f16x2 r6197, r6194, r6056; +} +{ +add.f16x2 r6200, r3895, r6197; +} +{ +add.f16x2 r6203, r5500, r5516; +} +{ +mul.f16x2 r6206, r6203, r6058; +} +{ +add.f16x2 r6209, r6200, r6206; +} +{ +sub.f16x2 r6212, r5490, r5538; +} +{ +mul.f16x2 r6215, r6212, r6057; +} +{ +sub.f16x2 r6218, r5506, r5522; +} +{ +mul.f16x2 r6221, r6218, r6060; +} +{ +add.f16x2 r6224, r6215, r6221; +} +{ +add.f16x2 r6227, r6209, r6224; +} +st.local.u32 [rd4+124], r6227; +{ +add.f16x2 r6230, r5490, r5538; +} +{ +mul.f16x2 r6233, r6230, r6054; +} +{ +add.f16x2 r6236, r4039, r6233; +} +{ +add.f16x2 r6239, r5506, r5522; +} +{ +mul.f16x2 r6242, r6239, r6056; +} +{ +add.f16x2 r6245, r6236, r6242; +} +{ +sub.f16x2 r6248, r5484, r5532; +} +{ +mul.f16x2 r6251, r6248, r6055; +} +{ +sub.f16x2 r6254, r5500, r5516; +} +{ +mul.f16x2 r6257, r6254, r6057; +} +{ +add.f16x2 r6260, r6251, r6257; +} +{ +add.f16x2 r6263, r6245, r6260; +} +st.local.u32 [rd4+48], r6263; +{ +add.f16x2 r6266, r5490, r5538; +} +{ +mul.f16x2 r6269, r6266, r6054; +} +{ +add.f16x2 r6272, r4039, r6269; +} +{ +add.f16x2 r6275, r5506, r5522; +} +{ +mul.f16x2 r6278, r6275, r6056; +} +{ +add.f16x2 r6281, r6272, r6278; +} +{ +sub.f16x2 r6284, r5484, r5532; +} +{ +mul.f16x2 r6287, r6284, r6055; +} +{ +sub.f16x2 r6290, r5500, r5516; +} +{ +mul.f16x2 r6293, r6290, r6057; +} +{ +add.f16x2 r6296, r6287, r6293; +} +{ +sub.f16x2 r6299, 
r6281, r6296; +} +st.local.u32 [rd4+168], r6299; +{ +add.f16x2 r6302, r5490, r5538; +} +{ +mul.f16x2 r6305, r6302, r6056; +} +{ +add.f16x2 r6308, r4039, r6305; +} +{ +add.f16x2 r6311, r5506, r5522; +} +{ +mul.f16x2 r6314, r6311, r6058; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +sub.f16x2 r6320, r5484, r5532; +} +{ +mul.f16x2 r6323, r6320, r6057; +} +{ +sub.f16x2 r6326, r5500, r5516; +} +{ +mul.f16x2 r6329, r6326, r6060; +} +{ +add.f16x2 r6332, r6323, r6329; +} +{ +add.f16x2 r6335, r6317, r6332; +} +st.local.u32 [rd4+88], r6335; +{ +add.f16x2 r6338, r5490, r5538; +} +{ +mul.f16x2 r6341, r6338, r6056; +} +{ +add.f16x2 r6344, r4039, r6341; +} +{ +add.f16x2 r6347, r5506, r5522; +} +{ +mul.f16x2 r6350, r6347, r6058; +} +{ +add.f16x2 r6353, r6344, r6350; +} +{ +sub.f16x2 r6356, r5484, r5532; +} +{ +mul.f16x2 r6359, r6356, r6057; +} +{ +sub.f16x2 r6362, r5500, r5516; +} +{ +mul.f16x2 r6365, r6362, r6060; +} +{ +add.f16x2 r6368, r6359, r6365; +} +{ +sub.f16x2 r6371, r6353, r6368; +} +st.local.u32 [rd4+128], r6371; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6375, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6377, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6378, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6379, {low, high}; +} +{ +neg.f16x2 r6380, r6379; +} +{ +add.f16x2 r6382, r5548, r5596; +} +{ +add.f16x2 r6385, r3967, r6382; +} +{ +add.f16x2 r6388, r5564, r5580; +} +{ +add.f16x2 r6391, r6385, r6388; +} +st.local.u32 [rd4+12], r6391; +{ +add.f16x2 r6394, r5554, r5602; +} +{ +add.f16x2 r6397, r4111, r6394; +} +{ +add.f16x2 
r6400, r5570, r5586; +} +{ +add.f16x2 r6403, r6397, r6400; +} +st.local.u32 [rd4+16], r6403; +{ +add.f16x2 r6406, r5548, r5596; +} +{ +mul.f16x2 r6409, r6406, r6374; +} +{ +add.f16x2 r6412, r3967, r6409; +} +{ +add.f16x2 r6415, r5564, r5580; +} +{ +mul.f16x2 r6418, r6415, r6376; +} +{ +add.f16x2 r6421, r6412, r6418; +} +{ +sub.f16x2 r6424, r5554, r5602; +} +{ +mul.f16x2 r6427, r6424, r6375; +} +{ +sub.f16x2 r6430, r5570, r5586; +} +{ +mul.f16x2 r6433, r6430, r6377; +} +{ +add.f16x2 r6436, r6427, r6433; +} +{ +sub.f16x2 r6439, r6421, r6436; +} +st.local.u32 [rd4+52], r6439; +{ +add.f16x2 r6442, r5548, r5596; +} +{ +mul.f16x2 r6445, r6442, r6374; +} +{ +add.f16x2 r6448, r3967, r6445; +} +{ +add.f16x2 r6451, r5564, r5580; +} +{ +mul.f16x2 r6454, r6451, r6376; +} +{ +add.f16x2 r6457, r6448, r6454; +} +{ +sub.f16x2 r6460, r5554, r5602; +} +{ +mul.f16x2 r6463, r6460, r6375; +} +{ +sub.f16x2 r6466, r5570, r5586; +} +{ +mul.f16x2 r6469, r6466, r6377; +} +{ +add.f16x2 r6472, r6463, r6469; +} +{ +add.f16x2 r6475, r6457, r6472; +} +st.local.u32 [rd4+172], r6475; +{ +add.f16x2 r6478, r5548, r5596; +} +{ +mul.f16x2 r6481, r6478, r6376; +} +{ +add.f16x2 r6484, r3967, r6481; +} +{ +add.f16x2 r6487, r5564, r5580; +} +{ +mul.f16x2 r6490, r6487, r6378; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +sub.f16x2 r6496, r5554, r5602; +} +{ +mul.f16x2 r6499, r6496, r6377; +} +{ +sub.f16x2 r6502, r5570, r5586; +} +{ +mul.f16x2 r6505, r6502, r6380; +} +{ +add.f16x2 r6508, r6499, r6505; +} +{ +sub.f16x2 r6511, r6493, r6508; +} +st.local.u32 [rd4+92], r6511; +{ +add.f16x2 r6514, r5548, r5596; +} +{ +mul.f16x2 r6517, r6514, r6376; +} +{ +add.f16x2 r6520, r3967, r6517; +} +{ +add.f16x2 r6523, r5564, r5580; +} +{ +mul.f16x2 r6526, r6523, r6378; +} +{ +add.f16x2 r6529, r6520, r6526; +} +{ +sub.f16x2 r6532, r5554, r5602; +} +{ +mul.f16x2 r6535, r6532, r6377; +} +{ +sub.f16x2 r6538, r5570, r5586; +} +{ +mul.f16x2 r6541, r6538, r6380; +} +{ +add.f16x2 r6544, r6535, r6541; +} +{ +add.f16x2 r6547, 
r6529, r6544; +} +st.local.u32 [rd4+132], r6547; +{ +add.f16x2 r6550, r5554, r5602; +} +{ +mul.f16x2 r6553, r6550, r6374; +} +{ +add.f16x2 r6556, r4111, r6553; +} +{ +add.f16x2 r6559, r5570, r5586; +} +{ +mul.f16x2 r6562, r6559, r6376; +} +{ +add.f16x2 r6565, r6556, r6562; +} +{ +sub.f16x2 r6568, r5548, r5596; +} +{ +mul.f16x2 r6571, r6568, r6375; +} +{ +sub.f16x2 r6574, r5564, r5580; +} +{ +mul.f16x2 r6577, r6574, r6377; +} +{ +add.f16x2 r6580, r6571, r6577; +} +{ +add.f16x2 r6583, r6565, r6580; +} +st.local.u32 [rd4+56], r6583; +{ +add.f16x2 r6586, r5554, r5602; +} +{ +mul.f16x2 r6589, r6586, r6374; +} +{ +add.f16x2 r6592, r4111, r6589; +} +{ +add.f16x2 r6595, r5570, r5586; +} +{ +mul.f16x2 r6598, r6595, r6376; +} +{ +add.f16x2 r6601, r6592, r6598; +} +{ +sub.f16x2 r6604, r5548, r5596; +} +{ +mul.f16x2 r6607, r6604, r6375; +} +{ +sub.f16x2 r6610, r5564, r5580; +} +{ +mul.f16x2 r6613, r6610, r6377; +} +{ +add.f16x2 r6616, r6607, r6613; +} +{ +sub.f16x2 r6619, r6601, r6616; +} +st.local.u32 [rd4+176], r6619; +{ +add.f16x2 r6622, r5554, r5602; +} +{ +mul.f16x2 r6625, r6622, r6376; +} +{ +add.f16x2 r6628, r4111, r6625; +} +{ +add.f16x2 r6631, r5570, r5586; +} +{ +mul.f16x2 r6634, r6631, r6378; +} +{ +add.f16x2 r6637, r6628, r6634; +} +{ +sub.f16x2 r6640, r5548, r5596; +} +{ +mul.f16x2 r6643, r6640, r6377; +} +{ +sub.f16x2 r6646, r5564, r5580; +} +{ +mul.f16x2 r6649, r6646, r6380; +} +{ +add.f16x2 r6652, r6643, r6649; +} +{ +add.f16x2 r6655, r6637, r6652; +} +st.local.u32 [rd4+96], r6655; +{ +add.f16x2 r6658, r5554, r5602; +} +{ +mul.f16x2 r6661, r6658, r6376; +} +{ +add.f16x2 r6664, r4111, r6661; +} +{ +add.f16x2 r6667, r5570, r5586; +} +{ +mul.f16x2 r6670, r6667, r6378; +} +{ +add.f16x2 r6673, r6664, r6670; +} +{ +sub.f16x2 r6676, r5548, r5596; +} +{ +mul.f16x2 r6679, r6676, r6377; +} +{ +sub.f16x2 r6682, r5564, r5580; +} +{ +mul.f16x2 r6685, r6682, r6380; +} +{ +add.f16x2 r6688, r6679, r6685; +} +{ +sub.f16x2 r6691, r6673, r6688; +} +st.local.u32 [rd4+136], r6691; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6694, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6695, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6698, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6699, {low, high}; +} +{ +neg.f16x2 r6700, r6699; +} +{ +add.f16x2 r6702, r5612, r5660; +} +{ +add.f16x2 r6705, r4003, r6702; +} +{ +add.f16x2 r6708, r5628, r5644; +} +{ +add.f16x2 r6711, r6705, r6708; +} +st.local.u32 [rd4+20], r6711; +{ +add.f16x2 r6714, r5618, r5666; +} +{ +add.f16x2 r6717, r4147, r6714; +} +{ +add.f16x2 r6720, r5634, r5650; +} +{ +add.f16x2 r6723, r6717, r6720; +} +st.local.u32 [rd4+24], r6723; +{ +add.f16x2 r6726, r5612, r5660; +} +{ +mul.f16x2 r6729, r6726, r6694; +} +{ +add.f16x2 r6732, r4003, r6729; +} +{ +add.f16x2 r6735, r5628, r5644; +} +{ +mul.f16x2 r6738, r6735, r6696; +} +{ +add.f16x2 r6741, r6732, r6738; +} +{ +sub.f16x2 r6744, r5618, r5666; +} +{ +mul.f16x2 r6747, r6744, r6695; +} +{ +sub.f16x2 r6750, r5634, r5650; +} +{ +mul.f16x2 r6753, r6750, r6697; +} +{ +add.f16x2 r6756, r6747, r6753; +} +{ +sub.f16x2 r6759, r6741, r6756; +} +st.local.u32 [rd4+60], r6759; +{ +add.f16x2 r6762, r5612, r5660; +} +{ +mul.f16x2 r6765, r6762, r6694; +} +{ +add.f16x2 r6768, r4003, r6765; +} +{ +add.f16x2 r6771, r5628, r5644; +} +{ +mul.f16x2 r6774, r6771, r6696; +} +{ +add.f16x2 r6777, r6768, r6774; +} +{ +sub.f16x2 r6780, r5618, r5666; +} +{ +mul.f16x2 r6783, r6780, r6695; +} +{ +sub.f16x2 r6786, r5634, r5650; +} +{ +mul.f16x2 r6789, r6786, r6697; +} +{ +add.f16x2 r6792, r6783, r6789; +} +{ +add.f16x2 r6795, 
r6777, r6792; +} +st.local.u32 [rd4+180], r6795; +{ +add.f16x2 r6798, r5612, r5660; +} +{ +mul.f16x2 r6801, r6798, r6696; +} +{ +add.f16x2 r6804, r4003, r6801; +} +{ +add.f16x2 r6807, r5628, r5644; +} +{ +mul.f16x2 r6810, r6807, r6698; +} +{ +add.f16x2 r6813, r6804, r6810; +} +{ +sub.f16x2 r6816, r5618, r5666; +} +{ +mul.f16x2 r6819, r6816, r6697; +} +{ +sub.f16x2 r6822, r5634, r5650; +} +{ +mul.f16x2 r6825, r6822, r6700; +} +{ +add.f16x2 r6828, r6819, r6825; +} +{ +sub.f16x2 r6831, r6813, r6828; +} +st.local.u32 [rd4+100], r6831; +{ +add.f16x2 r6834, r5612, r5660; +} +{ +mul.f16x2 r6837, r6834, r6696; +} +{ +add.f16x2 r6840, r4003, r6837; +} +{ +add.f16x2 r6843, r5628, r5644; +} +{ +mul.f16x2 r6846, r6843, r6698; +} +{ +add.f16x2 r6849, r6840, r6846; +} +{ +sub.f16x2 r6852, r5618, r5666; +} +{ +mul.f16x2 r6855, r6852, r6697; +} +{ +sub.f16x2 r6858, r5634, r5650; +} +{ +mul.f16x2 r6861, r6858, r6700; +} +{ +add.f16x2 r6864, r6855, r6861; +} +{ +add.f16x2 r6867, r6849, r6864; +} +st.local.u32 [rd4+140], r6867; +{ +add.f16x2 r6870, r5618, r5666; +} +{ +mul.f16x2 r6873, r6870, r6694; +} +{ +add.f16x2 r6876, r4147, r6873; +} +{ +add.f16x2 r6879, r5634, r5650; +} +{ +mul.f16x2 r6882, r6879, r6696; +} +{ +add.f16x2 r6885, r6876, r6882; +} +{ +sub.f16x2 r6888, r5612, r5660; +} +{ +mul.f16x2 r6891, r6888, r6695; +} +{ +sub.f16x2 r6894, r5628, r5644; +} +{ +mul.f16x2 r6897, r6894, r6697; +} +{ +add.f16x2 r6900, r6891, r6897; +} +{ +add.f16x2 r6903, r6885, r6900; +} +st.local.u32 [rd4+64], r6903; +{ +add.f16x2 r6906, r5618, r5666; +} +{ +mul.f16x2 r6909, r6906, r6694; +} +{ +add.f16x2 r6912, r4147, r6909; +} +{ +add.f16x2 r6915, r5634, r5650; +} +{ +mul.f16x2 r6918, r6915, r6696; +} +{ +add.f16x2 r6921, r6912, r6918; +} +{ +sub.f16x2 r6924, r5612, r5660; +} +{ +mul.f16x2 r6927, r6924, r6695; +} +{ +sub.f16x2 r6930, r5628, r5644; +} +{ +mul.f16x2 r6933, r6930, r6697; +} +{ +add.f16x2 r6936, r6927, r6933; +} +{ +sub.f16x2 r6939, r6921, r6936; +} +st.local.u32 [rd4+184], r6939; 
+{ +add.f16x2 r6942, r5618, r5666; +} +{ +mul.f16x2 r6945, r6942, r6696; +} +{ +add.f16x2 r6948, r4147, r6945; +} +{ +add.f16x2 r6951, r5634, r5650; +} +{ +mul.f16x2 r6954, r6951, r6698; +} +{ +add.f16x2 r6957, r6948, r6954; +} +{ +sub.f16x2 r6960, r5612, r5660; +} +{ +mul.f16x2 r6963, r6960, r6697; +} +{ +sub.f16x2 r6966, r5628, r5644; +} +{ +mul.f16x2 r6969, r6966, r6700; +} +{ +add.f16x2 r6972, r6963, r6969; +} +{ +add.f16x2 r6975, r6957, r6972; +} +st.local.u32 [rd4+104], r6975; +{ +add.f16x2 r6978, r5618, r5666; +} +{ +mul.f16x2 r6981, r6978, r6696; +} +{ +add.f16x2 r6984, r4147, r6981; +} +{ +add.f16x2 r6987, r5634, r5650; +} +{ +mul.f16x2 r6990, r6987, r6698; +} +{ +add.f16x2 r6993, r6984, r6990; +} +{ +sub.f16x2 r6996, r5612, r5660; +} +{ +mul.f16x2 r6999, r6996, r6697; +} +{ +sub.f16x2 r7002, r5628, r5644; +} +{ +mul.f16x2 r7005, r7002, r6700; +} +{ +add.f16x2 r7008, r6999, r7005; +} +{ +sub.f16x2 r7011, r6993, r7008; +} +st.local.u32 [rd4+144], r7011; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7016, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7019, {low, high}; +} +{ +neg.f16x2 r7020, r7019; +} +{ +add.f16x2 r7022, r5676, r5724; +} +{ +add.f16x2 r7025, r3931, r7022; +} +{ +add.f16x2 r7028, r5692, r5708; +} +{ +add.f16x2 r7031, r7025, r7028; +} +st.local.u32 [rd4+28], r7031; +{ +add.f16x2 r7034, r5682, r5730; +} +{ +add.f16x2 r7037, r4075, r7034; +} +{ +add.f16x2 r7040, r5698, r5714; +} +{ +add.f16x2 r7043, 
r7037, r7040; +} +st.local.u32 [rd4+32], r7043; +{ +add.f16x2 r7046, r5676, r5724; +} +{ +mul.f16x2 r7049, r7046, r7014; +} +{ +add.f16x2 r7052, r3931, r7049; +} +{ +add.f16x2 r7055, r5692, r5708; +} +{ +mul.f16x2 r7058, r7055, r7016; +} +{ +add.f16x2 r7061, r7052, r7058; +} +{ +sub.f16x2 r7064, r5682, r5730; +} +{ +mul.f16x2 r7067, r7064, r7015; +} +{ +sub.f16x2 r7070, r5698, r5714; +} +{ +mul.f16x2 r7073, r7070, r7017; +} +{ +add.f16x2 r7076, r7067, r7073; +} +{ +sub.f16x2 r7079, r7061, r7076; +} +st.local.u32 [rd4+68], r7079; +{ +add.f16x2 r7082, r5676, r5724; +} +{ +mul.f16x2 r7085, r7082, r7014; +} +{ +add.f16x2 r7088, r3931, r7085; +} +{ +add.f16x2 r7091, r5692, r5708; +} +{ +mul.f16x2 r7094, r7091, r7016; +} +{ +add.f16x2 r7097, r7088, r7094; +} +{ +sub.f16x2 r7100, r5682, r5730; +} +{ +mul.f16x2 r7103, r7100, r7015; +} +{ +sub.f16x2 r7106, r5698, r5714; +} +{ +mul.f16x2 r7109, r7106, r7017; +} +{ +add.f16x2 r7112, r7103, r7109; +} +{ +add.f16x2 r7115, r7097, r7112; +} +st.local.u32 [rd4+188], r7115; +{ +add.f16x2 r7118, r5676, r5724; +} +{ +mul.f16x2 r7121, r7118, r7016; +} +{ +add.f16x2 r7124, r3931, r7121; +} +{ +add.f16x2 r7127, r5692, r5708; +} +{ +mul.f16x2 r7130, r7127, r7018; +} +{ +add.f16x2 r7133, r7124, r7130; +} +{ +sub.f16x2 r7136, r5682, r5730; +} +{ +mul.f16x2 r7139, r7136, r7017; +} +{ +sub.f16x2 r7142, r5698, r5714; +} +{ +mul.f16x2 r7145, r7142, r7020; +} +{ +add.f16x2 r7148, r7139, r7145; +} +{ +sub.f16x2 r7151, r7133, r7148; +} +st.local.u32 [rd4+108], r7151; +{ +add.f16x2 r7154, r5676, r5724; +} +{ +mul.f16x2 r7157, r7154, r7016; +} +{ +add.f16x2 r7160, r3931, r7157; +} +{ +add.f16x2 r7163, r5692, r5708; +} +{ +mul.f16x2 r7166, r7163, r7018; +} +{ +add.f16x2 r7169, r7160, r7166; +} +{ +sub.f16x2 r7172, r5682, r5730; +} +{ +mul.f16x2 r7175, r7172, r7017; +} +{ +sub.f16x2 r7178, r5698, r5714; +} +{ +mul.f16x2 r7181, r7178, r7020; +} +{ +add.f16x2 r7184, r7175, r7181; +} +{ +add.f16x2 r7187, r7169, r7184; +} +st.local.u32 [rd4+148], r7187; 
+{ +add.f16x2 r7190, r5682, r5730; +} +{ +mul.f16x2 r7193, r7190, r7014; +} +{ +add.f16x2 r7196, r4075, r7193; +} +{ +add.f16x2 r7199, r5698, r5714; +} +{ +mul.f16x2 r7202, r7199, r7016; +} +{ +add.f16x2 r7205, r7196, r7202; +} +{ +sub.f16x2 r7208, r5676, r5724; +} +{ +mul.f16x2 r7211, r7208, r7015; +} +{ +sub.f16x2 r7214, r5692, r5708; +} +{ +mul.f16x2 r7217, r7214, r7017; +} +{ +add.f16x2 r7220, r7211, r7217; +} +{ +add.f16x2 r7223, r7205, r7220; +} +st.local.u32 [rd4+72], r7223; +{ +add.f16x2 r7226, r5682, r5730; +} +{ +mul.f16x2 r7229, r7226, r7014; +} +{ +add.f16x2 r7232, r4075, r7229; +} +{ +add.f16x2 r7235, r5698, r5714; +} +{ +mul.f16x2 r7238, r7235, r7016; +} +{ +add.f16x2 r7241, r7232, r7238; +} +{ +sub.f16x2 r7244, r5676, r5724; +} +{ +mul.f16x2 r7247, r7244, r7015; +} +{ +sub.f16x2 r7250, r5692, r5708; +} +{ +mul.f16x2 r7253, r7250, r7017; +} +{ +add.f16x2 r7256, r7247, r7253; +} +{ +sub.f16x2 r7259, r7241, r7256; +} +st.local.u32 [rd4+192], r7259; +{ +add.f16x2 r7262, r5682, r5730; +} +{ +mul.f16x2 r7265, r7262, r7016; +} +{ +add.f16x2 r7268, r4075, r7265; +} +{ +add.f16x2 r7271, r5698, r5714; +} +{ +mul.f16x2 r7274, r7271, r7018; +} +{ +add.f16x2 r7277, r7268, r7274; +} +{ +sub.f16x2 r7280, r5676, r5724; +} +{ +mul.f16x2 r7283, r7280, r7017; +} +{ +sub.f16x2 r7286, r5692, r5708; +} +{ +mul.f16x2 r7289, r7286, r7020; +} +{ +add.f16x2 r7292, r7283, r7289; +} +{ +add.f16x2 r7295, r7277, r7292; +} +st.local.u32 [rd4+112], r7295; +{ +add.f16x2 r7298, r5682, r5730; +} +{ +mul.f16x2 r7301, r7298, r7016; +} +{ +add.f16x2 r7304, r4075, r7301; +} +{ +add.f16x2 r7307, r5698, r5714; +} +{ +mul.f16x2 r7310, r7307, r7018; +} +{ +add.f16x2 r7313, r7304, r7310; +} +{ +sub.f16x2 r7316, r5676, r5724; +} +{ +mul.f16x2 r7319, r7316, r7017; +} +{ +sub.f16x2 r7322, r5692, r5708; +} +{ +mul.f16x2 r7325, r7322, r7020; +} +{ +add.f16x2 r7328, r7319, r7325; +} +{ +sub.f16x2 r7331, r7313, r7328; +} +st.local.u32 [rd4+152], r7331; +mul.wide.u32 rd16, r3, 1374389535; +shr.u64 
rd17, rd16, 35; +cvt.u32.u64 r15, rd17; +cvt.rn.f32.u32 f460, r15; +mul.f32 f461, f460, 0f3C24B5BE; +cos.approx.f32 f456, f461; +sin.approx.f32 f462, f461; +neg.f32 f457, f462; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f457; +mov.b32 r11353, {low, high}; +} +mov.u32 r11351, -8; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11353; +mov.b32 r7462, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11353; +mov.b32 r7464, {high, high}; +} +mov.u64 rd20, rd3; +bra.uni LBB0_4; +LBB0_5: +ld.local.u32 r11352, [rd20+72]; +add.s64 rd20, rd20, 64; +LBB0_4: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11353; +mov.b32 r7442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11353; +mov.b32 r7444, {high, high}; +} +ld.local.u32 r7447, [rd20+12]; +{ +mul.f16x2 r7446, r7447, r7444; +} +{ +neg.f16x2 r7449, r7446; +} +{ +fma.rn.f16x2 r7451, r11352, r7442, r7449; +} +st.local.u32 [rd20+8], r7451; +{ +mul.f16x2 r7455, r11352, r7444; +} +{ +fma.rn.f16x2 r7458, r7447, r7442, r7455; +} +st.local.u32 [rd20+12], r7458; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7466, {low, high}; +} +{ +mul.f16x2 r7467, r7464, r7466; +} +{ +mul.f16x2 r7470, r11353, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11353; +mov.b32 r7473, {high, low}; +} +{ +fma.rn.f16x2 r7475, r7467, r7473, r7470; +} +ld.local.u32 r7493, [rd20+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7475; +mov.b32 r7479, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7475; +mov.b32 r7481, {high, high}; +} +ld.local.u32 r7496, [rd20+20]; +{ +mul.f16x2 r7483, r7496, r7481; +} +{ +neg.f16x2 r7486, r7483; +} +{ +fma.rn.f16x2 r7488, r7493, r7479, r7486; +} +st.local.u32 [rd20+16], r7488; +{ +mul.f16x2 r7492, r7493, r7481; +} +{ +fma.rn.f16x2 r7495, r7496, r7479, r7492; +} +st.local.u32 [rd20+20], r7495; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7503, 
{low, high}; +} +{ +mul.f16x2 r7504, r7464, r7503; +} +{ +mul.f16x2 r7507, r7475, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7475; +mov.b32 r7510, {high, low}; +} +{ +fma.rn.f16x2 r7512, r7504, r7510, r7507; +} +ld.local.u32 r7530, [rd20+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7512; +mov.b32 r7516, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7512; +mov.b32 r7518, {high, high}; +} +ld.local.u32 r7533, [rd20+28]; +{ +mul.f16x2 r7520, r7533, r7518; +} +{ +neg.f16x2 r7523, r7520; +} +{ +fma.rn.f16x2 r7525, r7530, r7516, r7523; +} +st.local.u32 [rd20+24], r7525; +{ +mul.f16x2 r7529, r7530, r7518; +} +{ +fma.rn.f16x2 r7532, r7533, r7516, r7529; +} +st.local.u32 [rd20+28], r7532; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7540, {low, high}; +} +{ +mul.f16x2 r7541, r7464, r7540; +} +{ +mul.f16x2 r7544, r7512, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7512; +mov.b32 r7547, {high, low}; +} +{ +fma.rn.f16x2 r7549, r7541, r7547, r7544; +} +ld.local.u32 r7567, [rd20+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7549; +mov.b32 r7553, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7549; +mov.b32 r7555, {high, high}; +} +ld.local.u32 r7570, [rd20+36]; +{ +mul.f16x2 r7557, r7570, r7555; +} +{ +neg.f16x2 r7560, r7557; +} +{ +fma.rn.f16x2 r7562, r7567, r7553, r7560; +} +st.local.u32 [rd20+32], r7562; +{ +mul.f16x2 r7566, r7567, r7555; +} +{ +fma.rn.f16x2 r7569, r7570, r7553, r7566; +} +st.local.u32 [rd20+36], r7569; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7577, {low, high}; +} +{ +mul.f16x2 r7578, r7464, r7577; +} +{ +mul.f16x2 r7581, r7549, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7549; +mov.b32 r7584, {high, low}; +} +{ +fma.rn.f16x2 r7586, r7578, r7584, r7581; +} +ld.local.u32 r7600, [rd20+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7586; +mov.b32 r7590, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r7586; +mov.b32 r7592, {high, high}; +} +ld.local.u32 r7607, [rd20+44]; +{ +mul.f16x2 r7594, r7607, r7592; +} +{ +neg.f16x2 r7597, r7594; +} +{ +fma.rn.f16x2 r7599, r7600, r7590, r7597; +} +st.local.u32 [rd20+40], r7599; +{ +mul.f16x2 r7603, r7600, r7592; +} +{ +fma.rn.f16x2 r7606, r7607, r7590, r7603; +} +st.local.u32 [rd20+44], r7606; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7614, {low, high}; +} +{ +mul.f16x2 r7615, r7464, r7614; +} +{ +mul.f16x2 r7618, r7586, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7586; +mov.b32 r7621, {high, low}; +} +{ +fma.rn.f16x2 r7623, r7615, r7621, r7618; +} +ld.local.u32 r7637, [rd20+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7623; +mov.b32 r7627, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7623; +mov.b32 r7629, {high, high}; +} +ld.local.u32 r7632, [rd20+52]; +{ +mul.f16x2 r7631, r7632, r7629; +} +{ +neg.f16x2 r7634, r7631; +} +{ +fma.rn.f16x2 r7636, r7637, r7627, r7634; +} +st.local.u32 [rd20+48], r7636; +{ +mul.f16x2 r7640, r7637, r7629; +} +{ +fma.rn.f16x2 r7643, r7632, r7627, r7640; +} +st.local.u32 [rd20+52], r7643; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7651, {low, high}; +} +{ +mul.f16x2 r7652, r7464, r7651; +} +{ +mul.f16x2 r7655, r7623, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7623; +mov.b32 r7658, {high, low}; +} +{ +fma.rn.f16x2 r7660, r7652, r7658, r7655; +} +ld.local.u32 r7674, [rd20+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7660; +mov.b32 r7664, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7660; +mov.b32 r7666, {high, high}; +} +ld.local.u32 r7669, [rd20+60]; +{ +mul.f16x2 r7668, r7669, r7666; +} +{ +neg.f16x2 r7671, r7668; +} +{ +fma.rn.f16x2 r7673, r7674, r7664, r7671; +} +st.local.u32 [rd20+56], r7673; +{ +mul.f16x2 r7677, r7674, r7666; +} +{ +fma.rn.f16x2 r7680, r7669, r7664, r7677; 
+} +st.local.u32 [rd20+60], r7680; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7688, {low, high}; +} +{ +mul.f16x2 r7689, r7464, r7688; +} +{ +mul.f16x2 r7692, r7660, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7660; +mov.b32 r7695, {high, low}; +} +{ +fma.rn.f16x2 r7697, r7689, r7695, r7692; +} +ld.local.u32 r7711, [rd20+64]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7697; +mov.b32 r7701, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7697; +mov.b32 r7703, {high, high}; +} +ld.local.u32 r7706, [rd20+68]; +{ +mul.f16x2 r7705, r7706, r7703; +} +{ +neg.f16x2 r7708, r7705; +} +{ +fma.rn.f16x2 r7710, r7711, r7701, r7708; +} +st.local.u32 [rd20+64], r7710; +{ +mul.f16x2 r7714, r7711, r7703; +} +{ +fma.rn.f16x2 r7717, r7706, r7701, r7714; +} +st.local.u32 [rd20+68], r7717; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7725, {low, high}; +} +{ +mul.f16x2 r7726, r7464, r7725; +} +{ +mul.f16x2 r7729, r7697, r7462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7697; +mov.b32 r7732, {high, low}; +} +{ +fma.rn.f16x2 r11353, r7726, r7732, r7729; +} +add.s32 r11351, r11351, 8; +setp.eq.s32 p2, r11351, 16; +@p2 bra LBB0_6; +bra.uni LBB0_5; +LBB0_6: +mul.lo.s32 r11243, r15, 25; +sub.s32 r11244, r3, r11243; +shl.b32 r11245, r11244, 3; +add.s32 r11246, r12, r11245; +barrier.sync 0; +mad.lo.s32 r11247, r15, 5000, r11246; +ld.local.v2.u32 {r11248, r11249}, [rd3]; +st.shared.u32 [r11247], r11248; +st.shared.u32 [r11247+4], r11249; +ld.local.v2.u32 {r11252, r11253}, [rd4+4]; +st.shared.u32 [r11247+200], r11252; +st.shared.u32 [r11247+204], r11253; +ld.local.v2.u32 {r11256, r11257}, [rd4+12]; +st.shared.u32 [r11247+400], r11256; +st.shared.u32 [r11247+404], r11257; +ld.local.v2.u32 {r11260, r11261}, [rd4+20]; +st.shared.u32 [r11247+600], r11260; +st.shared.u32 [r11247+604], r11261; +ld.local.v2.u32 {r11264, r11265}, [rd4+28]; +st.shared.u32 [r11247+800], 
r11264; +st.shared.u32 [r11247+804], r11265; +ld.local.v2.u32 {r11268, r11269}, [rd4+36]; +st.shared.u32 [r11247+1000], r11268; +st.shared.u32 [r11247+1004], r11269; +ld.local.v2.u32 {r11272, r11273}, [rd4+44]; +st.shared.u32 [r11247+1200], r11272; +st.shared.u32 [r11247+1204], r11273; +ld.local.v2.u32 {r11276, r11277}, [rd4+52]; +st.shared.u32 [r11247+1400], r11276; +st.shared.u32 [r11247+1404], r11277; +ld.local.v2.u32 {r11280, r11281}, [rd4+60]; +st.shared.u32 [r11247+1600], r11280; +st.shared.u32 [r11247+1604], r11281; +ld.local.v2.u32 {r11284, r11285}, [rd4+68]; +st.shared.u32 [r11247+1800], r11284; +st.shared.u32 [r11247+1804], r11285; +ld.local.v2.u32 {r11288, r11289}, [rd4+76]; +st.shared.u32 [r11247+2000], r11288; +st.shared.u32 [r11247+2004], r11289; +ld.local.v2.u32 {r11292, r11293}, [rd4+84]; +st.shared.u32 [r11247+2200], r11292; +st.shared.u32 [r11247+2204], r11293; +ld.local.v2.u32 {r11296, r11297}, [rd4+92]; +st.shared.u32 [r11247+2400], r11296; +st.shared.u32 [r11247+2404], r11297; +ld.local.v2.u32 {r11300, r11301}, [rd4+100]; +st.shared.u32 [r11247+2600], r11300; +st.shared.u32 [r11247+2604], r11301; +ld.local.v2.u32 {r11304, r11305}, [rd4+108]; +st.shared.u32 [r11247+2800], r11304; +st.shared.u32 [r11247+2804], r11305; +ld.local.v2.u32 {r11308, r11309}, [rd4+116]; +st.shared.u32 [r11247+3000], r11308; +st.shared.u32 [r11247+3004], r11309; +ld.local.v2.u32 {r11312, r11313}, [rd4+124]; +st.shared.u32 [r11247+3200], r11312; +st.shared.u32 [r11247+3204], r11313; +ld.local.v2.u32 {r11316, r11317}, [rd4+132]; +st.shared.u32 [r11247+3400], r11316; +st.shared.u32 [r11247+3404], r11317; +ld.local.v2.u32 {r11320, r11321}, [rd4+140]; +st.shared.u32 [r11247+3600], r11320; +st.shared.u32 [r11247+3604], r11321; +ld.local.v2.u32 {r11324, r11325}, [rd4+148]; +st.shared.u32 [r11247+3800], r11324; +st.shared.u32 [r11247+3804], r11325; +ld.local.v2.u32 {r11328, r11329}, [rd4+156]; +st.shared.u32 [r11247+4000], r11328; +st.shared.u32 [r11247+4004], r11329; 
+ld.local.v2.u32 {r11332, r11333}, [rd4+164]; +st.shared.u32 [r11247+4200], r11332; +st.shared.u32 [r11247+4204], r11333; +ld.local.v2.u32 {r11336, r11337}, [rd4+172]; +st.shared.u32 [r11247+4400], r11336; +st.shared.u32 [r11247+4404], r11337; +ld.local.v2.u32 {r11340, r11341}, [rd4+180]; +st.shared.u32 [r11247+4600], r11340; +st.shared.u32 [r11247+4604], r11341; +ld.local.v2.u32 {r11344, r11345}, [rd4+188]; +st.shared.u32 [r11247+4800], r11344; +st.shared.u32 [r11247+4804], r11345; +barrier.sync 0; +ld.shared.u32 r7750, [r13]; +ld.shared.u32 r7762, [r13+4]; +ld.shared.u32 r8070, [r13+5000]; +ld.shared.u32 r8082, [r13+5004]; +ld.shared.u32 r8390, [r13+10000]; +ld.shared.u32 r8402, [r13+10004]; +ld.shared.u32 r8710, [r13+15000]; +ld.shared.u32 r8722, [r13+15004]; +ld.shared.u32 r9030, [r13+20000]; +ld.shared.u32 r9042, [r13+20004]; +ld.shared.u32 r7747, [r13+25000]; +ld.shared.u32 r7759, [r13+25004]; +ld.shared.u32 r8067, [r13+30000]; +ld.shared.u32 r8079, [r13+30004]; +ld.shared.u32 r8387, [r13+35000]; +ld.shared.u32 r8399, [r13+35004]; +ld.shared.u32 r8707, [r13+40000]; +ld.shared.u32 r8719, [r13+40004]; +ld.shared.u32 r9027, [r13+45000]; +ld.shared.u32 r9039, [r13+45004]; +ld.shared.u32 r7753, [r13+50000]; +ld.shared.u32 r7765, [r13+50004]; +ld.shared.u32 r8073, [r13+55000]; +ld.shared.u32 r8085, [r13+55004]; +ld.shared.u32 r8393, [r13+60000]; +ld.shared.u32 r8405, [r13+60004]; +ld.shared.u32 r8713, [r13+65000]; +ld.shared.u32 r8725, [r13+65004]; +ld.shared.u32 r9033, [r13+70000]; +ld.shared.u32 r9045, [r13+70004]; +ld.shared.u32 r7754, [r13+75000]; +ld.shared.u32 r7766, [r13+75004]; +ld.shared.u32 r8074, [r13+80000]; +ld.shared.u32 r8086, [r13+80004]; +ld.shared.u32 r8394, [r13+85000]; +ld.shared.u32 r8406, [r13+85004]; +ld.shared.u32 r8714, [r13+90000]; +ld.shared.u32 r8726, [r13+90004]; +ld.shared.u32 r9034, [r13+95000]; +ld.shared.u32 r9046, [r13+95004]; +ld.shared.u32 r7748, [r13+100000]; +ld.shared.u32 r7760, [r13+100004]; +ld.shared.u32 r8068, 
[r13+105000]; +ld.shared.u32 r8080, [r13+105004]; +ld.shared.u32 r8388, [r13+110000]; +ld.shared.u32 r8400, [r13+110004]; +ld.shared.u32 r8708, [r13+115000]; +ld.shared.u32 r8720, [r13+115004]; +ld.shared.u32 r9028, [r13+120000]; +ld.shared.u32 r9040, [r13+120004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7739, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7743, {low, high}; +} +{ +neg.f16x2 r7744, r7743; +} +{ +add.f16x2 r7746, r7747, r7748; +} +{ +add.f16x2 r7749, r7750, r7746; +} +{ +add.f16x2 r7752, r7753, r7754; +} +{ +add.f16x2 r7755, r7749, r7752; +} +{ +add.f16x2 r7758, r7759, r7760; +} +{ +add.f16x2 r7761, r7762, r7758; +} +{ +add.f16x2 r7764, r7765, r7766; +} +{ +add.f16x2 r7767, r7761, r7764; +} +{ +add.f16x2 r7770, r7747, r7748; +} +{ +mul.f16x2 r7773, r7770, r7738; +} +{ +add.f16x2 r7776, r7750, r7773; +} +{ +add.f16x2 r7779, r7753, r7754; +} +{ +mul.f16x2 r7782, r7779, r7740; +} +{ +add.f16x2 r7785, r7776, r7782; +} +{ +sub.f16x2 r7788, r7759, r7760; +} +{ +mul.f16x2 r7791, r7788, r7739; +} +{ +sub.f16x2 r7794, r7765, r7766; +} +{ +mul.f16x2 r7797, r7794, r7741; +} +{ +add.f16x2 r7800, r7791, r7797; +} +{ +sub.f16x2 r7803, r7785, r7800; +} +{ +add.f16x2 r7806, r7747, r7748; +} +{ +mul.f16x2 r7809, r7806, r7738; +} +{ +add.f16x2 r7812, r7750, r7809; +} +{ +add.f16x2 r7815, r7753, r7754; +} +{ +mul.f16x2 r7818, r7815, r7740; +} +{ +add.f16x2 r7821, r7812, r7818; +} +{ +sub.f16x2 r7824, r7759, r7760; +} +{ 
+mul.f16x2 r7827, r7824, r7739; +} +{ +sub.f16x2 r7830, r7765, r7766; +} +{ +mul.f16x2 r7833, r7830, r7741; +} +{ +add.f16x2 r7836, r7827, r7833; +} +{ +add.f16x2 r7839, r7821, r7836; +} +{ +add.f16x2 r7842, r7747, r7748; +} +{ +mul.f16x2 r7845, r7842, r7740; +} +{ +add.f16x2 r7848, r7750, r7845; +} +{ +add.f16x2 r7851, r7753, r7754; +} +{ +mul.f16x2 r7854, r7851, r7742; +} +{ +add.f16x2 r7857, r7848, r7854; +} +{ +sub.f16x2 r7860, r7759, r7760; +} +{ +mul.f16x2 r7863, r7860, r7741; +} +{ +sub.f16x2 r7866, r7765, r7766; +} +{ +mul.f16x2 r7869, r7866, r7744; +} +{ +add.f16x2 r7872, r7863, r7869; +} +{ +sub.f16x2 r7875, r7857, r7872; +} +{ +add.f16x2 r7878, r7747, r7748; +} +{ +mul.f16x2 r7881, r7878, r7740; +} +{ +add.f16x2 r7884, r7750, r7881; +} +{ +add.f16x2 r7887, r7753, r7754; +} +{ +mul.f16x2 r7890, r7887, r7742; +} +{ +add.f16x2 r7893, r7884, r7890; +} +{ +sub.f16x2 r7896, r7759, r7760; +} +{ +mul.f16x2 r7899, r7896, r7741; +} +{ +sub.f16x2 r7902, r7765, r7766; +} +{ +mul.f16x2 r7905, r7902, r7744; +} +{ +add.f16x2 r7908, r7899, r7905; +} +{ +add.f16x2 r7911, r7893, r7908; +} +{ +add.f16x2 r7914, r7759, r7760; +} +{ +mul.f16x2 r7917, r7914, r7738; +} +{ +add.f16x2 r7920, r7762, r7917; +} +{ +add.f16x2 r7923, r7765, r7766; +} +{ +mul.f16x2 r7926, r7923, r7740; +} +{ +add.f16x2 r7929, r7920, r7926; +} +{ +sub.f16x2 r7932, r7747, r7748; +} +{ +mul.f16x2 r7935, r7932, r7739; +} +{ +sub.f16x2 r7938, r7753, r7754; +} +{ +mul.f16x2 r7941, r7938, r7741; +} +{ +add.f16x2 r7944, r7935, r7941; +} +{ +add.f16x2 r7947, r7929, r7944; +} +{ +add.f16x2 r7950, r7759, r7760; +} +{ +mul.f16x2 r7953, r7950, r7738; +} +{ +add.f16x2 r7956, r7762, r7953; +} +{ +add.f16x2 r7959, r7765, r7766; +} +{ +mul.f16x2 r7962, r7959, r7740; +} +{ +add.f16x2 r7965, r7956, r7962; +} +{ +sub.f16x2 r7968, r7747, r7748; +} +{ +mul.f16x2 r7971, r7968, r7739; +} +{ +sub.f16x2 r7974, r7753, r7754; +} +{ +mul.f16x2 r7977, r7974, r7741; +} +{ +add.f16x2 r7980, r7971, r7977; +} +{ +sub.f16x2 r7983, 
r7965, r7980; +} +{ +add.f16x2 r7986, r7759, r7760; +} +{ +mul.f16x2 r7989, r7986, r7740; +} +{ +add.f16x2 r7992, r7762, r7989; +} +{ +add.f16x2 r7995, r7765, r7766; +} +{ +mul.f16x2 r7998, r7995, r7742; +} +{ +add.f16x2 r8001, r7992, r7998; +} +{ +sub.f16x2 r8004, r7747, r7748; +} +{ +mul.f16x2 r8007, r8004, r7741; +} +{ +sub.f16x2 r8010, r7753, r7754; +} +{ +mul.f16x2 r8013, r8010, r7744; +} +{ +add.f16x2 r8016, r8007, r8013; +} +{ +add.f16x2 r8019, r8001, r8016; +} +{ +add.f16x2 r8022, r7759, r7760; +} +{ +mul.f16x2 r8025, r8022, r7740; +} +{ +add.f16x2 r8028, r7762, r8025; +} +{ +add.f16x2 r8031, r7765, r7766; +} +{ +mul.f16x2 r8034, r8031, r7742; +} +{ +add.f16x2 r8037, r8028, r8034; +} +{ +sub.f16x2 r8040, r7747, r7748; +} +{ +mul.f16x2 r8043, r8040, r7741; +} +{ +sub.f16x2 r8046, r7753, r7754; +} +{ +mul.f16x2 r8049, r8046, r7744; +} +{ +add.f16x2 r8052, r8043, r8049; +} +{ +sub.f16x2 r8055, r8037, r8052; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8058, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8059, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8061, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8062, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8063, {low, high}; +} +{ +neg.f16x2 r8064, r8063; +} +{ +add.f16x2 r8066, r8067, r8068; +} +{ +add.f16x2 r8069, r8070, r8066; +} +{ +add.f16x2 r8072, r8073, r8074; +} +{ +add.f16x2 r8075, r8069, r8072; +} +{ +add.f16x2 r8078, r8079, r8080; +} +{ +add.f16x2 r8081, r8082, r8078; +} +{ +add.f16x2 r8084, r8085, r8086; +} +{ +add.f16x2 r8087, r8081, r8084; +} +{ +add.f16x2 r8090, r8067, r8068; +} +{ +mul.f16x2 r8093, 
r8090, r8058; +} +{ +add.f16x2 r8096, r8070, r8093; +} +{ +add.f16x2 r8099, r8073, r8074; +} +{ +mul.f16x2 r8102, r8099, r8060; +} +{ +add.f16x2 r8105, r8096, r8102; +} +{ +sub.f16x2 r8108, r8079, r8080; +} +{ +mul.f16x2 r8111, r8108, r8059; +} +{ +sub.f16x2 r8114, r8085, r8086; +} +{ +mul.f16x2 r8117, r8114, r8061; +} +{ +add.f16x2 r8120, r8111, r8117; +} +{ +sub.f16x2 r8123, r8105, r8120; +} +{ +add.f16x2 r8126, r8067, r8068; +} +{ +mul.f16x2 r8129, r8126, r8058; +} +{ +add.f16x2 r8132, r8070, r8129; +} +{ +add.f16x2 r8135, r8073, r8074; +} +{ +mul.f16x2 r8138, r8135, r8060; +} +{ +add.f16x2 r8141, r8132, r8138; +} +{ +sub.f16x2 r8144, r8079, r8080; +} +{ +mul.f16x2 r8147, r8144, r8059; +} +{ +sub.f16x2 r8150, r8085, r8086; +} +{ +mul.f16x2 r8153, r8150, r8061; +} +{ +add.f16x2 r8156, r8147, r8153; +} +{ +add.f16x2 r8159, r8141, r8156; +} +{ +add.f16x2 r8162, r8067, r8068; +} +{ +mul.f16x2 r8165, r8162, r8060; +} +{ +add.f16x2 r8168, r8070, r8165; +} +{ +add.f16x2 r8171, r8073, r8074; +} +{ +mul.f16x2 r8174, r8171, r8062; +} +{ +add.f16x2 r8177, r8168, r8174; +} +{ +sub.f16x2 r8180, r8079, r8080; +} +{ +mul.f16x2 r8183, r8180, r8061; +} +{ +sub.f16x2 r8186, r8085, r8086; +} +{ +mul.f16x2 r8189, r8186, r8064; +} +{ +add.f16x2 r8192, r8183, r8189; +} +{ +sub.f16x2 r8195, r8177, r8192; +} +{ +add.f16x2 r8198, r8067, r8068; +} +{ +mul.f16x2 r8201, r8198, r8060; +} +{ +add.f16x2 r8204, r8070, r8201; +} +{ +add.f16x2 r8207, r8073, r8074; +} +{ +mul.f16x2 r8210, r8207, r8062; +} +{ +add.f16x2 r8213, r8204, r8210; +} +{ +sub.f16x2 r8216, r8079, r8080; +} +{ +mul.f16x2 r8219, r8216, r8061; +} +{ +sub.f16x2 r8222, r8085, r8086; +} +{ +mul.f16x2 r8225, r8222, r8064; +} +{ +add.f16x2 r8228, r8219, r8225; +} +{ +add.f16x2 r8231, r8213, r8228; +} +{ +add.f16x2 r8234, r8079, r8080; +} +{ +mul.f16x2 r8237, r8234, r8058; +} +{ +add.f16x2 r8240, r8082, r8237; +} +{ +add.f16x2 r8243, r8085, r8086; +} +{ +mul.f16x2 r8246, r8243, r8060; +} +{ +add.f16x2 r8249, r8240, r8246; +} +{ 
+sub.f16x2 r8252, r8067, r8068; +} +{ +mul.f16x2 r8255, r8252, r8059; +} +{ +sub.f16x2 r8258, r8073, r8074; +} +{ +mul.f16x2 r8261, r8258, r8061; +} +{ +add.f16x2 r8264, r8255, r8261; +} +{ +add.f16x2 r8267, r8249, r8264; +} +{ +add.f16x2 r8270, r8079, r8080; +} +{ +mul.f16x2 r8273, r8270, r8058; +} +{ +add.f16x2 r8276, r8082, r8273; +} +{ +add.f16x2 r8279, r8085, r8086; +} +{ +mul.f16x2 r8282, r8279, r8060; +} +{ +add.f16x2 r8285, r8276, r8282; +} +{ +sub.f16x2 r8288, r8067, r8068; +} +{ +mul.f16x2 r8291, r8288, r8059; +} +{ +sub.f16x2 r8294, r8073, r8074; +} +{ +mul.f16x2 r8297, r8294, r8061; +} +{ +add.f16x2 r8300, r8291, r8297; +} +{ +sub.f16x2 r8303, r8285, r8300; +} +{ +add.f16x2 r8306, r8079, r8080; +} +{ +mul.f16x2 r8309, r8306, r8060; +} +{ +add.f16x2 r8312, r8082, r8309; +} +{ +add.f16x2 r8315, r8085, r8086; +} +{ +mul.f16x2 r8318, r8315, r8062; +} +{ +add.f16x2 r8321, r8312, r8318; +} +{ +sub.f16x2 r8324, r8067, r8068; +} +{ +mul.f16x2 r8327, r8324, r8061; +} +{ +sub.f16x2 r8330, r8073, r8074; +} +{ +mul.f16x2 r8333, r8330, r8064; +} +{ +add.f16x2 r8336, r8327, r8333; +} +{ +add.f16x2 r8339, r8321, r8336; +} +{ +add.f16x2 r8342, r8079, r8080; +} +{ +mul.f16x2 r8345, r8342, r8060; +} +{ +add.f16x2 r8348, r8082, r8345; +} +{ +add.f16x2 r8351, r8085, r8086; +} +{ +mul.f16x2 r8354, r8351, r8062; +} +{ +add.f16x2 r8357, r8348, r8354; +} +{ +sub.f16x2 r8360, r8067, r8068; +} +{ +mul.f16x2 r8363, r8360, r8061; +} +{ +sub.f16x2 r8366, r8073, r8074; +} +{ +mul.f16x2 r8369, r8366, r8064; +} +{ +add.f16x2 r8372, r8363, r8369; +} +{ +sub.f16x2 r8375, r8357, r8372; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8378, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8379, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8380, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 
high, f212; +mov.b32 r8381, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8383, {low, high}; +} +{ +neg.f16x2 r8384, r8383; +} +{ +add.f16x2 r8386, r8387, r8388; +} +{ +add.f16x2 r8389, r8390, r8386; +} +{ +add.f16x2 r8392, r8393, r8394; +} +{ +add.f16x2 r8395, r8389, r8392; +} +{ +add.f16x2 r8398, r8399, r8400; +} +{ +add.f16x2 r8401, r8402, r8398; +} +{ +add.f16x2 r8404, r8405, r8406; +} +{ +add.f16x2 r8407, r8401, r8404; +} +{ +add.f16x2 r8410, r8387, r8388; +} +{ +mul.f16x2 r8413, r8410, r8378; +} +{ +add.f16x2 r8416, r8390, r8413; +} +{ +add.f16x2 r8419, r8393, r8394; +} +{ +mul.f16x2 r8422, r8419, r8380; +} +{ +add.f16x2 r8425, r8416, r8422; +} +{ +sub.f16x2 r8428, r8399, r8400; +} +{ +mul.f16x2 r8431, r8428, r8379; +} +{ +sub.f16x2 r8434, r8405, r8406; +} +{ +mul.f16x2 r8437, r8434, r8381; +} +{ +add.f16x2 r8440, r8431, r8437; +} +{ +sub.f16x2 r8443, r8425, r8440; +} +{ +add.f16x2 r8446, r8387, r8388; +} +{ +mul.f16x2 r8449, r8446, r8378; +} +{ +add.f16x2 r8452, r8390, r8449; +} +{ +add.f16x2 r8455, r8393, r8394; +} +{ +mul.f16x2 r8458, r8455, r8380; +} +{ +add.f16x2 r8461, r8452, r8458; +} +{ +sub.f16x2 r8464, r8399, r8400; +} +{ +mul.f16x2 r8467, r8464, r8379; +} +{ +sub.f16x2 r8470, r8405, r8406; +} +{ +mul.f16x2 r8473, r8470, r8381; +} +{ +add.f16x2 r8476, r8467, r8473; +} +{ +add.f16x2 r8479, r8461, r8476; +} +{ +add.f16x2 r8482, r8387, r8388; +} +{ +mul.f16x2 r8485, r8482, r8380; +} +{ +add.f16x2 r8488, r8390, r8485; +} +{ +add.f16x2 r8491, r8393, r8394; +} +{ +mul.f16x2 r8494, r8491, r8382; +} +{ +add.f16x2 r8497, r8488, r8494; +} +{ +sub.f16x2 r8500, r8399, r8400; +} +{ +mul.f16x2 r8503, r8500, r8381; +} +{ +sub.f16x2 r8506, r8405, r8406; +} +{ +mul.f16x2 r8509, r8506, r8384; +} +{ +add.f16x2 r8512, r8503, r8509; +} +{ +sub.f16x2 r8515, r8497, r8512; +} +{ +add.f16x2 r8518, r8387, 
r8388; +} +{ +mul.f16x2 r8521, r8518, r8380; +} +{ +add.f16x2 r8524, r8390, r8521; +} +{ +add.f16x2 r8527, r8393, r8394; +} +{ +mul.f16x2 r8530, r8527, r8382; +} +{ +add.f16x2 r8533, r8524, r8530; +} +{ +sub.f16x2 r8536, r8399, r8400; +} +{ +mul.f16x2 r8539, r8536, r8381; +} +{ +sub.f16x2 r8542, r8405, r8406; +} +{ +mul.f16x2 r8545, r8542, r8384; +} +{ +add.f16x2 r8548, r8539, r8545; +} +{ +add.f16x2 r8551, r8533, r8548; +} +{ +add.f16x2 r8554, r8399, r8400; +} +{ +mul.f16x2 r8557, r8554, r8378; +} +{ +add.f16x2 r8560, r8402, r8557; +} +{ +add.f16x2 r8563, r8405, r8406; +} +{ +mul.f16x2 r8566, r8563, r8380; +} +{ +add.f16x2 r8569, r8560, r8566; +} +{ +sub.f16x2 r8572, r8387, r8388; +} +{ +mul.f16x2 r8575, r8572, r8379; +} +{ +sub.f16x2 r8578, r8393, r8394; +} +{ +mul.f16x2 r8581, r8578, r8381; +} +{ +add.f16x2 r8584, r8575, r8581; +} +{ +add.f16x2 r8587, r8569, r8584; +} +{ +add.f16x2 r8590, r8399, r8400; +} +{ +mul.f16x2 r8593, r8590, r8378; +} +{ +add.f16x2 r8596, r8402, r8593; +} +{ +add.f16x2 r8599, r8405, r8406; +} +{ +mul.f16x2 r8602, r8599, r8380; +} +{ +add.f16x2 r8605, r8596, r8602; +} +{ +sub.f16x2 r8608, r8387, r8388; +} +{ +mul.f16x2 r8611, r8608, r8379; +} +{ +sub.f16x2 r8614, r8393, r8394; +} +{ +mul.f16x2 r8617, r8614, r8381; +} +{ +add.f16x2 r8620, r8611, r8617; +} +{ +sub.f16x2 r8623, r8605, r8620; +} +{ +add.f16x2 r8626, r8399, r8400; +} +{ +mul.f16x2 r8629, r8626, r8380; +} +{ +add.f16x2 r8632, r8402, r8629; +} +{ +add.f16x2 r8635, r8405, r8406; +} +{ +mul.f16x2 r8638, r8635, r8382; +} +{ +add.f16x2 r8641, r8632, r8638; +} +{ +sub.f16x2 r8644, r8387, r8388; +} +{ +mul.f16x2 r8647, r8644, r8381; +} +{ +sub.f16x2 r8650, r8393, r8394; +} +{ +mul.f16x2 r8653, r8650, r8384; +} +{ +add.f16x2 r8656, r8647, r8653; +} +{ +add.f16x2 r8659, r8641, r8656; +} +{ +add.f16x2 r8662, r8399, r8400; +} +{ +mul.f16x2 r8665, r8662, r8380; +} +{ +add.f16x2 r8668, r8402, r8665; +} +{ +add.f16x2 r8671, r8405, r8406; +} +{ +mul.f16x2 r8674, r8671, r8382; +} +{ +add.f16x2 
r8677, r8668, r8674; +} +{ +sub.f16x2 r8680, r8387, r8388; +} +{ +mul.f16x2 r8683, r8680, r8381; +} +{ +sub.f16x2 r8686, r8393, r8394; +} +{ +mul.f16x2 r8689, r8686, r8384; +} +{ +add.f16x2 r8692, r8683, r8689; +} +{ +sub.f16x2 r8695, r8677, r8692; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8698, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8700, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8701, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8702, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8703, {low, high}; +} +{ +neg.f16x2 r8704, r8703; +} +{ +add.f16x2 r8706, r8707, r8708; +} +{ +add.f16x2 r8709, r8710, r8706; +} +{ +add.f16x2 r8712, r8713, r8714; +} +{ +add.f16x2 r8715, r8709, r8712; +} +{ +add.f16x2 r8718, r8719, r8720; +} +{ +add.f16x2 r8721, r8722, r8718; +} +{ +add.f16x2 r8724, r8725, r8726; +} +{ +add.f16x2 r8727, r8721, r8724; +} +{ +add.f16x2 r8730, r8707, r8708; +} +{ +mul.f16x2 r8733, r8730, r8698; +} +{ +add.f16x2 r8736, r8710, r8733; +} +{ +add.f16x2 r8739, r8713, r8714; +} +{ +mul.f16x2 r8742, r8739, r8700; +} +{ +add.f16x2 r8745, r8736, r8742; +} +{ +sub.f16x2 r8748, r8719, r8720; +} +{ +mul.f16x2 r8751, r8748, r8699; +} +{ +sub.f16x2 r8754, r8725, r8726; +} +{ +mul.f16x2 r8757, r8754, r8701; +} +{ +add.f16x2 r8760, r8751, r8757; +} +{ +sub.f16x2 r8763, r8745, r8760; +} +{ +add.f16x2 r8766, r8707, r8708; +} +{ +mul.f16x2 r8769, r8766, r8698; +} +{ +add.f16x2 r8772, r8710, r8769; +} +{ +add.f16x2 r8775, r8713, r8714; +} +{ +mul.f16x2 r8778, r8775, r8700; +} +{ +add.f16x2 r8781, r8772, r8778; +} +{ +sub.f16x2 r8784, r8719, r8720; +} +{ +mul.f16x2 
r8787, r8784, r8699; +} +{ +sub.f16x2 r8790, r8725, r8726; +} +{ +mul.f16x2 r8793, r8790, r8701; +} +{ +add.f16x2 r8796, r8787, r8793; +} +{ +add.f16x2 r8799, r8781, r8796; +} +{ +add.f16x2 r8802, r8707, r8708; +} +{ +mul.f16x2 r8805, r8802, r8700; +} +{ +add.f16x2 r8808, r8710, r8805; +} +{ +add.f16x2 r8811, r8713, r8714; +} +{ +mul.f16x2 r8814, r8811, r8702; +} +{ +add.f16x2 r8817, r8808, r8814; +} +{ +sub.f16x2 r8820, r8719, r8720; +} +{ +mul.f16x2 r8823, r8820, r8701; +} +{ +sub.f16x2 r8826, r8725, r8726; +} +{ +mul.f16x2 r8829, r8826, r8704; +} +{ +add.f16x2 r8832, r8823, r8829; +} +{ +sub.f16x2 r8835, r8817, r8832; +} +{ +add.f16x2 r8838, r8707, r8708; +} +{ +mul.f16x2 r8841, r8838, r8700; +} +{ +add.f16x2 r8844, r8710, r8841; +} +{ +add.f16x2 r8847, r8713, r8714; +} +{ +mul.f16x2 r8850, r8847, r8702; +} +{ +add.f16x2 r8853, r8844, r8850; +} +{ +sub.f16x2 r8856, r8719, r8720; +} +{ +mul.f16x2 r8859, r8856, r8701; +} +{ +sub.f16x2 r8862, r8725, r8726; +} +{ +mul.f16x2 r8865, r8862, r8704; +} +{ +add.f16x2 r8868, r8859, r8865; +} +{ +add.f16x2 r8871, r8853, r8868; +} +{ +add.f16x2 r8874, r8719, r8720; +} +{ +mul.f16x2 r8877, r8874, r8698; +} +{ +add.f16x2 r8880, r8722, r8877; +} +{ +add.f16x2 r8883, r8725, r8726; +} +{ +mul.f16x2 r8886, r8883, r8700; +} +{ +add.f16x2 r8889, r8880, r8886; +} +{ +sub.f16x2 r8892, r8707, r8708; +} +{ +mul.f16x2 r8895, r8892, r8699; +} +{ +sub.f16x2 r8898, r8713, r8714; +} +{ +mul.f16x2 r8901, r8898, r8701; +} +{ +add.f16x2 r8904, r8895, r8901; +} +{ +add.f16x2 r8907, r8889, r8904; +} +{ +add.f16x2 r8910, r8719, r8720; +} +{ +mul.f16x2 r8913, r8910, r8698; +} +{ +add.f16x2 r8916, r8722, r8913; +} +{ +add.f16x2 r8919, r8725, r8726; +} +{ +mul.f16x2 r8922, r8919, r8700; +} +{ +add.f16x2 r8925, r8916, r8922; +} +{ +sub.f16x2 r8928, r8707, r8708; +} +{ +mul.f16x2 r8931, r8928, r8699; +} +{ +sub.f16x2 r8934, r8713, r8714; +} +{ +mul.f16x2 r8937, r8934, r8701; +} +{ +add.f16x2 r8940, r8931, r8937; +} +{ +sub.f16x2 r8943, r8925, r8940; +} 
+{ +add.f16x2 r8946, r8719, r8720; +} +{ +mul.f16x2 r8949, r8946, r8700; +} +{ +add.f16x2 r8952, r8722, r8949; +} +{ +add.f16x2 r8955, r8725, r8726; +} +{ +mul.f16x2 r8958, r8955, r8702; +} +{ +add.f16x2 r8961, r8952, r8958; +} +{ +sub.f16x2 r8964, r8707, r8708; +} +{ +mul.f16x2 r8967, r8964, r8701; +} +{ +sub.f16x2 r8970, r8713, r8714; +} +{ +mul.f16x2 r8973, r8970, r8704; +} +{ +add.f16x2 r8976, r8967, r8973; +} +{ +add.f16x2 r8979, r8961, r8976; +} +{ +add.f16x2 r8982, r8719, r8720; +} +{ +mul.f16x2 r8985, r8982, r8700; +} +{ +add.f16x2 r8988, r8722, r8985; +} +{ +add.f16x2 r8991, r8725, r8726; +} +{ +mul.f16x2 r8994, r8991, r8702; +} +{ +add.f16x2 r8997, r8988, r8994; +} +{ +sub.f16x2 r9000, r8707, r8708; +} +{ +mul.f16x2 r9003, r9000, r8701; +} +{ +sub.f16x2 r9006, r8713, r8714; +} +{ +mul.f16x2 r9009, r9006, r8704; +} +{ +add.f16x2 r9012, r9003, r9009; +} +{ +sub.f16x2 r9015, r8997, r9012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9023, {low, high}; +} +{ +neg.f16x2 r9024, r9023; +} +{ +add.f16x2 r9026, r9027, r9028; +} +{ +add.f16x2 r9029, r9030, r9026; +} +{ +add.f16x2 r9032, r9033, r9034; +} +{ +add.f16x2 r9035, r9029, r9032; +} +{ +add.f16x2 r9038, r9039, r9040; +} +{ +add.f16x2 r9041, r9042, r9038; +} +{ +add.f16x2 r9044, r9045, r9046; +} +{ +add.f16x2 r9047, r9041, r9044; +} +{ +add.f16x2 r9050, r9027, r9028; +} +{ +mul.f16x2 r9053, r9050, r9018; +} +{ 
+add.f16x2 r9056, r9030, r9053; +} +{ +add.f16x2 r9059, r9033, r9034; +} +{ +mul.f16x2 r9062, r9059, r9020; +} +{ +add.f16x2 r9065, r9056, r9062; +} +{ +sub.f16x2 r9068, r9039, r9040; +} +{ +mul.f16x2 r9071, r9068, r9019; +} +{ +sub.f16x2 r9074, r9045, r9046; +} +{ +mul.f16x2 r9077, r9074, r9021; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +sub.f16x2 r9083, r9065, r9080; +} +{ +add.f16x2 r9086, r9027, r9028; +} +{ +mul.f16x2 r9089, r9086, r9018; +} +{ +add.f16x2 r9092, r9030, r9089; +} +{ +add.f16x2 r9095, r9033, r9034; +} +{ +mul.f16x2 r9098, r9095, r9020; +} +{ +add.f16x2 r9101, r9092, r9098; +} +{ +sub.f16x2 r9104, r9039, r9040; +} +{ +mul.f16x2 r9107, r9104, r9019; +} +{ +sub.f16x2 r9110, r9045, r9046; +} +{ +mul.f16x2 r9113, r9110, r9021; +} +{ +add.f16x2 r9116, r9107, r9113; +} +{ +add.f16x2 r9119, r9101, r9116; +} +{ +add.f16x2 r9122, r9027, r9028; +} +{ +mul.f16x2 r9125, r9122, r9020; +} +{ +add.f16x2 r9128, r9030, r9125; +} +{ +add.f16x2 r9131, r9033, r9034; +} +{ +mul.f16x2 r9134, r9131, r9022; +} +{ +add.f16x2 r9137, r9128, r9134; +} +{ +sub.f16x2 r9140, r9039, r9040; +} +{ +mul.f16x2 r9143, r9140, r9021; +} +{ +sub.f16x2 r9146, r9045, r9046; +} +{ +mul.f16x2 r9149, r9146, r9024; +} +{ +add.f16x2 r9152, r9143, r9149; +} +{ +sub.f16x2 r9155, r9137, r9152; +} +{ +add.f16x2 r9158, r9027, r9028; +} +{ +mul.f16x2 r9161, r9158, r9020; +} +{ +add.f16x2 r9164, r9030, r9161; +} +{ +add.f16x2 r9167, r9033, r9034; +} +{ +mul.f16x2 r9170, r9167, r9022; +} +{ +add.f16x2 r9173, r9164, r9170; +} +{ +sub.f16x2 r9176, r9039, r9040; +} +{ +mul.f16x2 r9179, r9176, r9021; +} +{ +sub.f16x2 r9182, r9045, r9046; +} +{ +mul.f16x2 r9185, r9182, r9024; +} +{ +add.f16x2 r9188, r9179, r9185; +} +{ +add.f16x2 r9191, r9173, r9188; +} +{ +add.f16x2 r9194, r9039, r9040; +} +{ +mul.f16x2 r9197, r9194, r9018; +} +{ +add.f16x2 r9200, r9042, r9197; +} +{ +add.f16x2 r9203, r9045, r9046; +} +{ +mul.f16x2 r9206, r9203, r9020; +} +{ +add.f16x2 r9209, r9200, r9206; +} +{ +sub.f16x2 r9212, 
r9027, r9028; +} +{ +mul.f16x2 r9215, r9212, r9019; +} +{ +sub.f16x2 r9218, r9033, r9034; +} +{ +mul.f16x2 r9221, r9218, r9021; +} +{ +add.f16x2 r9224, r9215, r9221; +} +{ +add.f16x2 r9227, r9209, r9224; +} +{ +add.f16x2 r9230, r9039, r9040; +} +{ +mul.f16x2 r9233, r9230, r9018; +} +{ +add.f16x2 r9236, r9042, r9233; +} +{ +add.f16x2 r9239, r9045, r9046; +} +{ +mul.f16x2 r9242, r9239, r9020; +} +{ +add.f16x2 r9245, r9236, r9242; +} +{ +sub.f16x2 r9248, r9027, r9028; +} +{ +mul.f16x2 r9251, r9248, r9019; +} +{ +sub.f16x2 r9254, r9033, r9034; +} +{ +mul.f16x2 r9257, r9254, r9021; +} +{ +add.f16x2 r9260, r9251, r9257; +} +{ +sub.f16x2 r9263, r9245, r9260; +} +{ +add.f16x2 r9266, r9039, r9040; +} +{ +mul.f16x2 r9269, r9266, r9020; +} +{ +add.f16x2 r9272, r9042, r9269; +} +{ +add.f16x2 r9275, r9045, r9046; +} +{ +mul.f16x2 r9278, r9275, r9022; +} +{ +add.f16x2 r9281, r9272, r9278; +} +{ +sub.f16x2 r9284, r9027, r9028; +} +{ +mul.f16x2 r9287, r9284, r9021; +} +{ +sub.f16x2 r9290, r9033, r9034; +} +{ +mul.f16x2 r9293, r9290, r9024; +} +{ +add.f16x2 r9296, r9287, r9293; +} +{ +add.f16x2 r9299, r9281, r9296; +} +{ +add.f16x2 r9302, r9039, r9040; +} +{ +mul.f16x2 r9305, r9302, r9020; +} +{ +add.f16x2 r9308, r9042, r9305; +} +{ +add.f16x2 r9311, r9045, r9046; +} +{ +mul.f16x2 r9314, r9311, r9022; +} +{ +add.f16x2 r9317, r9308, r9314; +} +{ +sub.f16x2 r9320, r9027, r9028; +} +{ +mul.f16x2 r9323, r9320, r9021; +} +{ +sub.f16x2 r9326, r9033, r9034; +} +{ +mul.f16x2 r9329, r9326, r9024; +} +{ +add.f16x2 r9332, r9323, r9329; +} +{ +sub.f16x2 r9335, r9317, r9332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r9338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r9340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r9341, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r9342, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r9343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r9344, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r9345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r9348, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r9349, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r9352, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r9353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r9355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r9361, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9368, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r9369, {low, high}; +} +{ +mul.f16x2 r9386, r8123, r9338; +} +{ +mul.f16x2 r9389, r8267, r9339; +} +{ +sub.f16x2 r9392, r9386, r9389; +} +{ +mul.f16x2 r9395, r8123, r9339; +} +{ +fma.rn.f16x2 r9398, r8267, r9338, r9395; +} +{ +mul.f16x2 r9402, r8443, r9340; +} +{ +mul.f16x2 r9405, r8587, r9341; +} +{ +sub.f16x2 r9408, r9402, r9405; +} +{ +mul.f16x2 r9411, r8443, r9341; +} +{ +fma.rn.f16x2 r9414, r8587, r9340, r9411; +} +{ +mul.f16x2 r9418, r8763, 
r9342; +} +{ +mul.f16x2 r9421, r8907, r9343; +} +{ +sub.f16x2 r9424, r9418, r9421; +} +{ +mul.f16x2 r9427, r8763, r9343; +} +{ +fma.rn.f16x2 r9430, r8907, r9342, r9427; +} +{ +mul.f16x2 r9434, r9083, r9344; +} +{ +mul.f16x2 r9437, r9227, r9345; +} +{ +sub.f16x2 r9440, r9434, r9437; +} +{ +mul.f16x2 r9443, r9083, r9345; +} +{ +fma.rn.f16x2 r9446, r9227, r9344, r9443; +} +{ +mul.f16x2 r9450, r8195, r9340; +} +{ +mul.f16x2 r9453, r8339, r9341; +} +{ +sub.f16x2 r9456, r9450, r9453; +} +{ +mul.f16x2 r9459, r8195, r9341; +} +{ +fma.rn.f16x2 r9462, r8339, r9340, r9459; +} +{ +mul.f16x2 r9466, r8515, r9344; +} +{ +mul.f16x2 r9469, r8659, r9345; +} +{ +sub.f16x2 r9472, r9466, r9469; +} +{ +mul.f16x2 r9475, r8515, r9345; +} +{ +fma.rn.f16x2 r9478, r8659, r9344, r9475; +} +{ +mul.f16x2 r9482, r8835, r9348; +} +{ +mul.f16x2 r9485, r8979, r9349; +} +{ +sub.f16x2 r9488, r9482, r9485; +} +{ +mul.f16x2 r9491, r8835, r9349; +} +{ +fma.rn.f16x2 r9494, r8979, r9348, r9491; +} +{ +mul.f16x2 r9498, r9155, r9352; +} +{ +mul.f16x2 r9501, r9299, r9353; +} +{ +sub.f16x2 r9504, r9498, r9501; +} +{ +mul.f16x2 r9507, r9155, r9353; +} +{ +fma.rn.f16x2 r9510, r9299, r9352, r9507; +} +{ +mul.f16x2 r9514, r8231, r9342; +} +{ +mul.f16x2 r9517, r8375, r9343; +} +{ +sub.f16x2 r9520, r9514, r9517; +} +{ +mul.f16x2 r9523, r8231, r9343; +} +{ +fma.rn.f16x2 r9526, r8375, r9342, r9523; +} +{ +mul.f16x2 r9530, r8551, r9348; +} +{ +mul.f16x2 r9533, r8695, r9349; +} +{ +sub.f16x2 r9536, r9530, r9533; +} +{ +mul.f16x2 r9539, r8551, r9349; +} +{ +fma.rn.f16x2 r9542, r8695, r9348, r9539; +} +{ +mul.f16x2 r9546, r8871, r9354; +} +{ +mul.f16x2 r9549, r9015, r9355; +} +{ +sub.f16x2 r9552, r9546, r9549; +} +{ +mul.f16x2 r9555, r8871, r9355; +} +{ +fma.rn.f16x2 r9558, r9015, r9354, r9555; +} +{ +mul.f16x2 r9562, r9191, r9360; +} +{ +mul.f16x2 r9565, r9335, r9361; +} +{ +sub.f16x2 r9568, r9562, r9565; +} +{ +mul.f16x2 r9571, r9191, r9361; +} +{ +fma.rn.f16x2 r9574, r9335, r9360, r9571; +} +{ +mul.f16x2 r9578, r8159, 
r9344; +} +{ +mul.f16x2 r9581, r8303, r9345; +} +{ +sub.f16x2 r9584, r9578, r9581; +} +{ +mul.f16x2 r9587, r8159, r9345; +} +{ +fma.rn.f16x2 r9590, r8303, r9344, r9587; +} +{ +mul.f16x2 r9594, r8479, r9352; +} +{ +mul.f16x2 r9597, r8623, r9353; +} +{ +sub.f16x2 r9600, r9594, r9597; +} +{ +mul.f16x2 r9603, r8479, r9353; +} +{ +fma.rn.f16x2 r9606, r8623, r9352, r9603; +} +{ +mul.f16x2 r9610, r8799, r9360; +} +{ +mul.f16x2 r9613, r8943, r9361; +} +{ +sub.f16x2 r9616, r9610, r9613; +} +{ +mul.f16x2 r9619, r8799, r9361; +} +{ +fma.rn.f16x2 r9622, r8943, r9360, r9619; +} +{ +mul.f16x2 r9626, r9119, r9368; +} +{ +mul.f16x2 r9629, r9263, r9369; +} +{ +sub.f16x2 r9632, r9626, r9629; +} +{ +mul.f16x2 r9635, r9119, r9369; +} +{ +fma.rn.f16x2 r9638, r9263, r9368, r9635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9647, {low, high}; +} +{ +neg.f16x2 r9648, r9647; +} +{ +add.f16x2 r9650, r8075, r9035; +} +{ +add.f16x2 r9653, r7755, r9650; +} +{ +add.f16x2 r9656, r8395, r8715; +} +{ +add.f16x2 %0, r9653, r9656; +} +{ +add.f16x2 r9662, r8087, r9047; +} +{ +add.f16x2 r9665, r7767, r9662; +} +{ +add.f16x2 r9668, r8407, r8727; +} +{ +add.f16x2 %1, r9665, r9668; +} +{ +add.f16x2 r9674, r8075, r9035; +} +{ +mul.f16x2 r9677, r9674, r9642; +} +{ +add.f16x2 r9680, r7755, r9677; +} +{ +add.f16x2 r9683, r8395, r8715; +} +{ +mul.f16x2 r9686, r9683, r9644; +} +{ +add.f16x2 r9689, r9680, r9686; 
+} +{ +sub.f16x2 r9692, r8087, r9047; +} +{ +mul.f16x2 r9695, r9692, r9643; +} +{ +sub.f16x2 r9698, r8407, r8727; +} +{ +mul.f16x2 r9701, r9698, r9645; +} +{ +add.f16x2 r9704, r9695, r9701; +} +{ +sub.f16x2 %10, r9689, r9704; +} +{ +add.f16x2 r9710, r8075, r9035; +} +{ +mul.f16x2 r9713, r9710, r9642; +} +{ +add.f16x2 r9716, r7755, r9713; +} +{ +add.f16x2 r9719, r8395, r8715; +} +{ +mul.f16x2 r9722, r9719, r9644; +} +{ +add.f16x2 r9725, r9716, r9722; +} +{ +sub.f16x2 r9728, r8087, r9047; +} +{ +mul.f16x2 r9731, r9728, r9643; +} +{ +sub.f16x2 r9734, r8407, r8727; +} +{ +mul.f16x2 r9737, r9734, r9645; +} +{ +add.f16x2 r9740, r9731, r9737; +} +{ +add.f16x2 %40, r9725, r9740; +} +{ +add.f16x2 r9746, r8075, r9035; +} +{ +mul.f16x2 r9749, r9746, r9644; +} +{ +add.f16x2 r9752, r7755, r9749; +} +{ +add.f16x2 r9755, r8395, r8715; +} +{ +mul.f16x2 r9758, r9755, r9646; +} +{ +add.f16x2 r9761, r9752, r9758; +} +{ +sub.f16x2 r9764, r8087, r9047; +} +{ +mul.f16x2 r9767, r9764, r9645; +} +{ +sub.f16x2 r9770, r8407, r8727; +} +{ +mul.f16x2 r9773, r9770, r9648; +} +{ +add.f16x2 r9776, r9767, r9773; +} +{ +sub.f16x2 %20, r9761, r9776; +} +{ +add.f16x2 r9782, r8075, r9035; +} +{ +mul.f16x2 r9785, r9782, r9644; +} +{ +add.f16x2 r9788, r7755, r9785; +} +{ +add.f16x2 r9791, r8395, r8715; +} +{ +mul.f16x2 r9794, r9791, r9646; +} +{ +add.f16x2 r9797, r9788, r9794; +} +{ +sub.f16x2 r9800, r8087, r9047; +} +{ +mul.f16x2 r9803, r9800, r9645; +} +{ +sub.f16x2 r9806, r8407, r8727; +} +{ +mul.f16x2 r9809, r9806, r9648; +} +{ +add.f16x2 r9812, r9803, r9809; +} +{ +add.f16x2 %30, r9797, r9812; +} +{ +add.f16x2 r9818, r8087, r9047; +} +{ +mul.f16x2 r9821, r9818, r9642; +} +{ +add.f16x2 r9824, r7767, r9821; +} +{ +add.f16x2 r9827, r8407, r8727; +} +{ +mul.f16x2 r9830, r9827, r9644; +} +{ +add.f16x2 r9833, r9824, r9830; +} +{ +sub.f16x2 r9836, r8075, r9035; +} +{ +mul.f16x2 r9839, r9836, r9643; +} +{ +sub.f16x2 r9842, r8395, r8715; +} +{ +mul.f16x2 r9845, r9842, r9645; +} +{ +add.f16x2 r9848, r9839, 
r9845; +} +{ +add.f16x2 %11, r9833, r9848; +} +{ +add.f16x2 r9854, r8087, r9047; +} +{ +mul.f16x2 r9857, r9854, r9642; +} +{ +add.f16x2 r9860, r7767, r9857; +} +{ +add.f16x2 r9863, r8407, r8727; +} +{ +mul.f16x2 r9866, r9863, r9644; +} +{ +add.f16x2 r9869, r9860, r9866; +} +{ +sub.f16x2 r9872, r8075, r9035; +} +{ +mul.f16x2 r9875, r9872, r9643; +} +{ +sub.f16x2 r9878, r8395, r8715; +} +{ +mul.f16x2 r9881, r9878, r9645; +} +{ +add.f16x2 r9884, r9875, r9881; +} +{ +sub.f16x2 %41, r9869, r9884; +} +{ +add.f16x2 r9890, r8087, r9047; +} +{ +mul.f16x2 r9893, r9890, r9644; +} +{ +add.f16x2 r9896, r7767, r9893; +} +{ +add.f16x2 r9899, r8407, r8727; +} +{ +mul.f16x2 r9902, r9899, r9646; +} +{ +add.f16x2 r9905, r9896, r9902; +} +{ +sub.f16x2 r9908, r8075, r9035; +} +{ +mul.f16x2 r9911, r9908, r9645; +} +{ +sub.f16x2 r9914, r8395, r8715; +} +{ +mul.f16x2 r9917, r9914, r9648; +} +{ +add.f16x2 r9920, r9911, r9917; +} +{ +add.f16x2 %21, r9905, r9920; +} +{ +add.f16x2 r9926, r8087, r9047; +} +{ +mul.f16x2 r9929, r9926, r9644; +} +{ +add.f16x2 r9932, r7767, r9929; +} +{ +add.f16x2 r9935, r8407, r8727; +} +{ +mul.f16x2 r9938, r9935, r9646; +} +{ +add.f16x2 r9941, r9932, r9938; +} +{ +sub.f16x2 r9944, r8075, r9035; +} +{ +mul.f16x2 r9947, r9944, r9645; +} +{ +sub.f16x2 r9950, r8395, r8715; +} +{ +mul.f16x2 r9953, r9950, r9648; +} +{ +add.f16x2 r9956, r9947, r9953; +} +{ +sub.f16x2 %31, r9941, r9956; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9966, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9967, {low, high}; +} +{ +neg.f16x2 r9968, r9967; +} +{ +add.f16x2 r9970, r9392, r9440; +} +{ +add.f16x2 r9973, r7803, r9970; +} +{ +add.f16x2 r9976, r9408, r9424; +} +{ +add.f16x2 %2, r9973, r9976; +} +{ +add.f16x2 r9982, r9398, r9446; +} +{ +add.f16x2 r9985, r7947, r9982; +} +{ +add.f16x2 r9988, r9414, r9430; +} +{ +add.f16x2 %3, r9985, r9988; +} +{ +add.f16x2 r9994, r9392, r9440; +} +{ +mul.f16x2 r9997, r9994, r9962; +} +{ +add.f16x2 r10000, r7803, r9997; +} +{ +add.f16x2 r10003, r9408, r9424; +} +{ +mul.f16x2 r10006, r10003, r9964; +} +{ +add.f16x2 r10009, r10000, r10006; +} +{ +sub.f16x2 r10012, r9398, r9446; +} +{ +mul.f16x2 r10015, r10012, r9963; +} +{ +sub.f16x2 r10018, r9414, r9430; +} +{ +mul.f16x2 r10021, r10018, r9965; +} +{ +add.f16x2 r10024, r10015, r10021; +} +{ +sub.f16x2 %12, r10009, r10024; +} +{ +add.f16x2 r10030, r9392, r9440; +} +{ +mul.f16x2 r10033, r10030, r9962; +} +{ +add.f16x2 r10036, r7803, r10033; +} +{ +add.f16x2 r10039, r9408, r9424; +} +{ +mul.f16x2 r10042, r10039, r9964; +} +{ +add.f16x2 r10045, r10036, r10042; +} +{ +sub.f16x2 r10048, r9398, r9446; +} +{ +mul.f16x2 r10051, r10048, r9963; +} +{ +sub.f16x2 r10054, r9414, r9430; +} +{ +mul.f16x2 r10057, r10054, r9965; +} +{ +add.f16x2 r10060, r10051, r10057; +} +{ +add.f16x2 %42, r10045, r10060; +} +{ +add.f16x2 r10066, r9392, r9440; +} +{ +mul.f16x2 r10069, r10066, r9964; +} +{ +add.f16x2 r10072, r7803, r10069; +} +{ +add.f16x2 r10075, r9408, r9424; +} +{ +mul.f16x2 r10078, r10075, r9966; +} +{ +add.f16x2 r10081, r10072, r10078; +} +{ +sub.f16x2 r10084, r9398, r9446; +} +{ +mul.f16x2 r10087, r10084, r9965; +} +{ +sub.f16x2 r10090, r9414, r9430; +} +{ +mul.f16x2 r10093, r10090, r9968; +} +{ +add.f16x2 r10096, r10087, r10093; +} +{ +sub.f16x2 %22, r10081, r10096; +} +{ +add.f16x2 r10102, r9392, r9440; +} +{ +mul.f16x2 r10105, r10102, r9964; +} +{ +add.f16x2 r10108, r7803, r10105; +} +{ +add.f16x2 r10111, r9408, r9424; +} 
+{ +mul.f16x2 r10114, r10111, r9966; +} +{ +add.f16x2 r10117, r10108, r10114; +} +{ +sub.f16x2 r10120, r9398, r9446; +} +{ +mul.f16x2 r10123, r10120, r9965; +} +{ +sub.f16x2 r10126, r9414, r9430; +} +{ +mul.f16x2 r10129, r10126, r9968; +} +{ +add.f16x2 r10132, r10123, r10129; +} +{ +add.f16x2 %32, r10117, r10132; +} +{ +add.f16x2 r10138, r9398, r9446; +} +{ +mul.f16x2 r10141, r10138, r9962; +} +{ +add.f16x2 r10144, r7947, r10141; +} +{ +add.f16x2 r10147, r9414, r9430; +} +{ +mul.f16x2 r10150, r10147, r9964; +} +{ +add.f16x2 r10153, r10144, r10150; +} +{ +sub.f16x2 r10156, r9392, r9440; +} +{ +mul.f16x2 r10159, r10156, r9963; +} +{ +sub.f16x2 r10162, r9408, r9424; +} +{ +mul.f16x2 r10165, r10162, r9965; +} +{ +add.f16x2 r10168, r10159, r10165; +} +{ +add.f16x2 %13, r10153, r10168; +} +{ +add.f16x2 r10174, r9398, r9446; +} +{ +mul.f16x2 r10177, r10174, r9962; +} +{ +add.f16x2 r10180, r7947, r10177; +} +{ +add.f16x2 r10183, r9414, r9430; +} +{ +mul.f16x2 r10186, r10183, r9964; +} +{ +add.f16x2 r10189, r10180, r10186; +} +{ +sub.f16x2 r10192, r9392, r9440; +} +{ +mul.f16x2 r10195, r10192, r9963; +} +{ +sub.f16x2 r10198, r9408, r9424; +} +{ +mul.f16x2 r10201, r10198, r9965; +} +{ +add.f16x2 r10204, r10195, r10201; +} +{ +sub.f16x2 %43, r10189, r10204; +} +{ +add.f16x2 r10210, r9398, r9446; +} +{ +mul.f16x2 r10213, r10210, r9964; +} +{ +add.f16x2 r10216, r7947, r10213; +} +{ +add.f16x2 r10219, r9414, r9430; +} +{ +mul.f16x2 r10222, r10219, r9966; +} +{ +add.f16x2 r10225, r10216, r10222; +} +{ +sub.f16x2 r10228, r9392, r9440; +} +{ +mul.f16x2 r10231, r10228, r9965; +} +{ +sub.f16x2 r10234, r9408, r9424; +} +{ +mul.f16x2 r10237, r10234, r9968; +} +{ +add.f16x2 r10240, r10231, r10237; +} +{ +add.f16x2 %23, r10225, r10240; +} +{ +add.f16x2 r10246, r9398, r9446; +} +{ +mul.f16x2 r10249, r10246, r9964; +} +{ +add.f16x2 r10252, r7947, r10249; +} +{ +add.f16x2 r10255, r9414, r9430; +} +{ +mul.f16x2 r10258, r10255, r9966; +} +{ +add.f16x2 r10261, r10252, r10258; +} +{ +sub.f16x2 
r10264, r9392, r9440; +} +{ +mul.f16x2 r10267, r10264, r9965; +} +{ +sub.f16x2 r10270, r9408, r9424; +} +{ +mul.f16x2 r10273, r10270, r9968; +} +{ +add.f16x2 r10276, r10267, r10273; +} +{ +sub.f16x2 %33, r10261, r10276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10287, {low, high}; +} +{ +neg.f16x2 r10288, r10287; +} +{ +add.f16x2 r10290, r9456, r9504; +} +{ +add.f16x2 r10293, r7875, r10290; +} +{ +add.f16x2 r10296, r9472, r9488; +} +{ +add.f16x2 %4, r10293, r10296; +} +{ +add.f16x2 r10302, r9462, r9510; +} +{ +add.f16x2 r10305, r8019, r10302; +} +{ +add.f16x2 r10308, r9478, r9494; +} +{ +add.f16x2 %5, r10305, r10308; +} +{ +add.f16x2 r10314, r9456, r9504; +} +{ +mul.f16x2 r10317, r10314, r10282; +} +{ +add.f16x2 r10320, r7875, r10317; +} +{ +add.f16x2 r10323, r9472, r9488; +} +{ +mul.f16x2 r10326, r10323, r10284; +} +{ +add.f16x2 r10329, r10320, r10326; +} +{ +sub.f16x2 r10332, r9462, r9510; +} +{ +mul.f16x2 r10335, r10332, r10283; +} +{ +sub.f16x2 r10338, r9478, r9494; +} +{ +mul.f16x2 r10341, r10338, r10285; +} +{ +add.f16x2 r10344, r10335, r10341; +} +{ +sub.f16x2 %14, r10329, r10344; +} +{ +add.f16x2 r10350, r9456, r9504; +} +{ +mul.f16x2 r10353, r10350, r10282; +} +{ +add.f16x2 r10356, r7875, r10353; +} +{ +add.f16x2 r10359, r9472, r9488; +} +{ +mul.f16x2 r10362, r10359, r10284; +} +{ +add.f16x2 r10365, r10356, r10362; +} +{ +sub.f16x2 r10368, r9462, 
r9510; +} +{ +mul.f16x2 r10371, r10368, r10283; +} +{ +sub.f16x2 r10374, r9478, r9494; +} +{ +mul.f16x2 r10377, r10374, r10285; +} +{ +add.f16x2 r10380, r10371, r10377; +} +{ +add.f16x2 %44, r10365, r10380; +} +{ +add.f16x2 r10386, r9456, r9504; +} +{ +mul.f16x2 r10389, r10386, r10284; +} +{ +add.f16x2 r10392, r7875, r10389; +} +{ +add.f16x2 r10395, r9472, r9488; +} +{ +mul.f16x2 r10398, r10395, r10286; +} +{ +add.f16x2 r10401, r10392, r10398; +} +{ +sub.f16x2 r10404, r9462, r9510; +} +{ +mul.f16x2 r10407, r10404, r10285; +} +{ +sub.f16x2 r10410, r9478, r9494; +} +{ +mul.f16x2 r10413, r10410, r10288; +} +{ +add.f16x2 r10416, r10407, r10413; +} +{ +sub.f16x2 %24, r10401, r10416; +} +{ +add.f16x2 r10422, r9456, r9504; +} +{ +mul.f16x2 r10425, r10422, r10284; +} +{ +add.f16x2 r10428, r7875, r10425; +} +{ +add.f16x2 r10431, r9472, r9488; +} +{ +mul.f16x2 r10434, r10431, r10286; +} +{ +add.f16x2 r10437, r10428, r10434; +} +{ +sub.f16x2 r10440, r9462, r9510; +} +{ +mul.f16x2 r10443, r10440, r10285; +} +{ +sub.f16x2 r10446, r9478, r9494; +} +{ +mul.f16x2 r10449, r10446, r10288; +} +{ +add.f16x2 r10452, r10443, r10449; +} +{ +add.f16x2 %34, r10437, r10452; +} +{ +add.f16x2 r10458, r9462, r9510; +} +{ +mul.f16x2 r10461, r10458, r10282; +} +{ +add.f16x2 r10464, r8019, r10461; +} +{ +add.f16x2 r10467, r9478, r9494; +} +{ +mul.f16x2 r10470, r10467, r10284; +} +{ +add.f16x2 r10473, r10464, r10470; +} +{ +sub.f16x2 r10476, r9456, r9504; +} +{ +mul.f16x2 r10479, r10476, r10283; +} +{ +sub.f16x2 r10482, r9472, r9488; +} +{ +mul.f16x2 r10485, r10482, r10285; +} +{ +add.f16x2 r10488, r10479, r10485; +} +{ +add.f16x2 %15, r10473, r10488; +} +{ +add.f16x2 r10494, r9462, r9510; +} +{ +mul.f16x2 r10497, r10494, r10282; +} +{ +add.f16x2 r10500, r8019, r10497; +} +{ +add.f16x2 r10503, r9478, r9494; +} +{ +mul.f16x2 r10506, r10503, r10284; +} +{ +add.f16x2 r10509, r10500, r10506; +} +{ +sub.f16x2 r10512, r9456, r9504; +} +{ +mul.f16x2 r10515, r10512, r10283; +} +{ +sub.f16x2 r10518, r9472, 
r9488; +} +{ +mul.f16x2 r10521, r10518, r10285; +} +{ +add.f16x2 r10524, r10515, r10521; +} +{ +sub.f16x2 %45, r10509, r10524; +} +{ +add.f16x2 r10530, r9462, r9510; +} +{ +mul.f16x2 r10533, r10530, r10284; +} +{ +add.f16x2 r10536, r8019, r10533; +} +{ +add.f16x2 r10539, r9478, r9494; +} +{ +mul.f16x2 r10542, r10539, r10286; +} +{ +add.f16x2 r10545, r10536, r10542; +} +{ +sub.f16x2 r10548, r9456, r9504; +} +{ +mul.f16x2 r10551, r10548, r10285; +} +{ +sub.f16x2 r10554, r9472, r9488; +} +{ +mul.f16x2 r10557, r10554, r10288; +} +{ +add.f16x2 r10560, r10551, r10557; +} +{ +add.f16x2 %25, r10545, r10560; +} +{ +add.f16x2 r10566, r9462, r9510; +} +{ +mul.f16x2 r10569, r10566, r10284; +} +{ +add.f16x2 r10572, r8019, r10569; +} +{ +add.f16x2 r10575, r9478, r9494; +} +{ +mul.f16x2 r10578, r10575, r10286; +} +{ +add.f16x2 r10581, r10572, r10578; +} +{ +sub.f16x2 r10584, r9456, r9504; +} +{ +mul.f16x2 r10587, r10584, r10285; +} +{ +sub.f16x2 r10590, r9472, r9488; +} +{ +mul.f16x2 r10593, r10590, r10288; +} +{ +add.f16x2 r10596, r10587, r10593; +} +{ +sub.f16x2 %35, r10581, r10596; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10602, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10603, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10606, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10607, {low, high}; +} +{ +neg.f16x2 r10608, r10607; +} +{ +add.f16x2 r10610, r9520, r9568; +} +{ +add.f16x2 r10613, r7911, r10610; +} +{ +add.f16x2 r10616, r9536, r9552; +} +{ +add.f16x2 %6, r10613, r10616; +} +{ +add.f16x2 r10622, r9526, r9574; +} +{ 
+add.f16x2 r10625, r8055, r10622; +} +{ +add.f16x2 r10628, r9542, r9558; +} +{ +add.f16x2 %7, r10625, r10628; +} +{ +add.f16x2 r10634, r9520, r9568; +} +{ +mul.f16x2 r10637, r10634, r10602; +} +{ +add.f16x2 r10640, r7911, r10637; +} +{ +add.f16x2 r10643, r9536, r9552; +} +{ +mul.f16x2 r10646, r10643, r10604; +} +{ +add.f16x2 r10649, r10640, r10646; +} +{ +sub.f16x2 r10652, r9526, r9574; +} +{ +mul.f16x2 r10655, r10652, r10603; +} +{ +sub.f16x2 r10658, r9542, r9558; +} +{ +mul.f16x2 r10661, r10658, r10605; +} +{ +add.f16x2 r10664, r10655, r10661; +} +{ +sub.f16x2 %16, r10649, r10664; +} +{ +add.f16x2 r10670, r9520, r9568; +} +{ +mul.f16x2 r10673, r10670, r10602; +} +{ +add.f16x2 r10676, r7911, r10673; +} +{ +add.f16x2 r10679, r9536, r9552; +} +{ +mul.f16x2 r10682, r10679, r10604; +} +{ +add.f16x2 r10685, r10676, r10682; +} +{ +sub.f16x2 r10688, r9526, r9574; +} +{ +mul.f16x2 r10691, r10688, r10603; +} +{ +sub.f16x2 r10694, r9542, r9558; +} +{ +mul.f16x2 r10697, r10694, r10605; +} +{ +add.f16x2 r10700, r10691, r10697; +} +{ +add.f16x2 %46, r10685, r10700; +} +{ +add.f16x2 r10706, r9520, r9568; +} +{ +mul.f16x2 r10709, r10706, r10604; +} +{ +add.f16x2 r10712, r7911, r10709; +} +{ +add.f16x2 r10715, r9536, r9552; +} +{ +mul.f16x2 r10718, r10715, r10606; +} +{ +add.f16x2 r10721, r10712, r10718; +} +{ +sub.f16x2 r10724, r9526, r9574; +} +{ +mul.f16x2 r10727, r10724, r10605; +} +{ +sub.f16x2 r10730, r9542, r9558; +} +{ +mul.f16x2 r10733, r10730, r10608; +} +{ +add.f16x2 r10736, r10727, r10733; +} +{ +sub.f16x2 %26, r10721, r10736; +} +{ +add.f16x2 r10742, r9520, r9568; +} +{ +mul.f16x2 r10745, r10742, r10604; +} +{ +add.f16x2 r10748, r7911, r10745; +} +{ +add.f16x2 r10751, r9536, r9552; +} +{ +mul.f16x2 r10754, r10751, r10606; +} +{ +add.f16x2 r10757, r10748, r10754; +} +{ +sub.f16x2 r10760, r9526, r9574; +} +{ +mul.f16x2 r10763, r10760, r10605; +} +{ +sub.f16x2 r10766, r9542, r9558; +} +{ +mul.f16x2 r10769, r10766, r10608; +} +{ +add.f16x2 r10772, r10763, r10769; +} +{ 
+add.f16x2 %36, r10757, r10772; +} +{ +add.f16x2 r10778, r9526, r9574; +} +{ +mul.f16x2 r10781, r10778, r10602; +} +{ +add.f16x2 r10784, r8055, r10781; +} +{ +add.f16x2 r10787, r9542, r9558; +} +{ +mul.f16x2 r10790, r10787, r10604; +} +{ +add.f16x2 r10793, r10784, r10790; +} +{ +sub.f16x2 r10796, r9520, r9568; +} +{ +mul.f16x2 r10799, r10796, r10603; +} +{ +sub.f16x2 r10802, r9536, r9552; +} +{ +mul.f16x2 r10805, r10802, r10605; +} +{ +add.f16x2 r10808, r10799, r10805; +} +{ +add.f16x2 %17, r10793, r10808; +} +{ +add.f16x2 r10814, r9526, r9574; +} +{ +mul.f16x2 r10817, r10814, r10602; +} +{ +add.f16x2 r10820, r8055, r10817; +} +{ +add.f16x2 r10823, r9542, r9558; +} +{ +mul.f16x2 r10826, r10823, r10604; +} +{ +add.f16x2 r10829, r10820, r10826; +} +{ +sub.f16x2 r10832, r9520, r9568; +} +{ +mul.f16x2 r10835, r10832, r10603; +} +{ +sub.f16x2 r10838, r9536, r9552; +} +{ +mul.f16x2 r10841, r10838, r10605; +} +{ +add.f16x2 r10844, r10835, r10841; +} +{ +sub.f16x2 %47, r10829, r10844; +} +{ +add.f16x2 r10850, r9526, r9574; +} +{ +mul.f16x2 r10853, r10850, r10604; +} +{ +add.f16x2 r10856, r8055, r10853; +} +{ +add.f16x2 r10859, r9542, r9558; +} +{ +mul.f16x2 r10862, r10859, r10606; +} +{ +add.f16x2 r10865, r10856, r10862; +} +{ +sub.f16x2 r10868, r9520, r9568; +} +{ +mul.f16x2 r10871, r10868, r10605; +} +{ +sub.f16x2 r10874, r9536, r9552; +} +{ +mul.f16x2 r10877, r10874, r10608; +} +{ +add.f16x2 r10880, r10871, r10877; +} +{ +add.f16x2 %27, r10865, r10880; +} +{ +add.f16x2 r10886, r9526, r9574; +} +{ +mul.f16x2 r10889, r10886, r10604; +} +{ +add.f16x2 r10892, r8055, r10889; +} +{ +add.f16x2 r10895, r9542, r9558; +} +{ +mul.f16x2 r10898, r10895, r10606; +} +{ +add.f16x2 r10901, r10892, r10898; +} +{ +sub.f16x2 r10904, r9520, r9568; +} +{ +mul.f16x2 r10907, r10904, r10605; +} +{ +sub.f16x2 r10910, r9536, r9552; +} +{ +mul.f16x2 r10913, r10910, r10608; +} +{ +add.f16x2 r10916, r10907, r10913; +} +{ +sub.f16x2 %37, r10901, r10916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10927, {low, high}; +} +{ +neg.f16x2 r10928, r10927; +} +{ +add.f16x2 r10930, r9584, r9632; +} +{ +add.f16x2 r10933, r7839, r10930; +} +{ +add.f16x2 r10936, r9600, r9616; +} +{ +add.f16x2 %8, r10933, r10936; +} +{ +add.f16x2 r10942, r9590, r9638; +} +{ +add.f16x2 r10945, r7983, r10942; +} +{ +add.f16x2 r10948, r9606, r9622; +} +{ +add.f16x2 %9, r10945, r10948; +} +{ +add.f16x2 r10954, r9584, r9632; +} +{ +mul.f16x2 r10957, r10954, r10922; +} +{ +add.f16x2 r10960, r7839, r10957; +} +{ +add.f16x2 r10963, r9600, r9616; +} +{ +mul.f16x2 r10966, r10963, r10924; +} +{ +add.f16x2 r10969, r10960, r10966; +} +{ +sub.f16x2 r10972, r9590, r9638; +} +{ +mul.f16x2 r10975, r10972, r10923; +} +{ +sub.f16x2 r10978, r9606, r9622; +} +{ +mul.f16x2 r10981, r10978, r10925; +} +{ +add.f16x2 r10984, r10975, r10981; +} +{ +sub.f16x2 %18, r10969, r10984; +} +{ +add.f16x2 r10990, r9584, r9632; +} +{ +mul.f16x2 r10993, r10990, r10922; +} +{ +add.f16x2 r10996, r7839, r10993; +} +{ +add.f16x2 r10999, r9600, r9616; +} +{ +mul.f16x2 r11002, r10999, r10924; +} +{ +add.f16x2 r11005, r10996, r11002; +} +{ +sub.f16x2 r11008, r9590, r9638; +} +{ +mul.f16x2 r11011, r11008, r10923; +} +{ +sub.f16x2 r11014, r9606, r9622; +} +{ +mul.f16x2 r11017, r11014, r10925; +} +{ +add.f16x2 r11020, r11011, r11017; +} +{ +add.f16x2 %48, r11005, r11020; +} +{ +add.f16x2 r11026, r9584, r9632; +} +{ +mul.f16x2 
r11029, r11026, r10924; +} +{ +add.f16x2 r11032, r7839, r11029; +} +{ +add.f16x2 r11035, r9600, r9616; +} +{ +mul.f16x2 r11038, r11035, r10926; +} +{ +add.f16x2 r11041, r11032, r11038; +} +{ +sub.f16x2 r11044, r9590, r9638; +} +{ +mul.f16x2 r11047, r11044, r10925; +} +{ +sub.f16x2 r11050, r9606, r9622; +} +{ +mul.f16x2 r11053, r11050, r10928; +} +{ +add.f16x2 r11056, r11047, r11053; +} +{ +sub.f16x2 %28, r11041, r11056; +} +{ +add.f16x2 r11062, r9584, r9632; +} +{ +mul.f16x2 r11065, r11062, r10924; +} +{ +add.f16x2 r11068, r7839, r11065; +} +{ +add.f16x2 r11071, r9600, r9616; +} +{ +mul.f16x2 r11074, r11071, r10926; +} +{ +add.f16x2 r11077, r11068, r11074; +} +{ +sub.f16x2 r11080, r9590, r9638; +} +{ +mul.f16x2 r11083, r11080, r10925; +} +{ +sub.f16x2 r11086, r9606, r9622; +} +{ +mul.f16x2 r11089, r11086, r10928; +} +{ +add.f16x2 r11092, r11083, r11089; +} +{ +add.f16x2 %38, r11077, r11092; +} +{ +add.f16x2 r11098, r9590, r9638; +} +{ +mul.f16x2 r11101, r11098, r10922; +} +{ +add.f16x2 r11104, r7983, r11101; +} +{ +add.f16x2 r11107, r9606, r9622; +} +{ +mul.f16x2 r11110, r11107, r10924; +} +{ +add.f16x2 r11113, r11104, r11110; +} +{ +sub.f16x2 r11116, r9584, r9632; +} +{ +mul.f16x2 r11119, r11116, r10923; +} +{ +sub.f16x2 r11122, r9600, r9616; +} +{ +mul.f16x2 r11125, r11122, r10925; +} +{ +add.f16x2 r11128, r11119, r11125; +} +{ +add.f16x2 %19, r11113, r11128; +} +{ +add.f16x2 r11134, r9590, r9638; +} +{ +mul.f16x2 r11137, r11134, r10922; +} +{ +add.f16x2 r11140, r7983, r11137; +} +{ +add.f16x2 r11143, r9606, r9622; +} +{ +mul.f16x2 r11146, r11143, r10924; +} +{ +add.f16x2 r11149, r11140, r11146; +} +{ +sub.f16x2 r11152, r9584, r9632; +} +{ +mul.f16x2 r11155, r11152, r10923; +} +{ +sub.f16x2 r11158, r9600, r9616; +} +{ +mul.f16x2 r11161, r11158, r10925; +} +{ +add.f16x2 r11164, r11155, r11161; +} +{ +sub.f16x2 %49, r11149, r11164; +} +{ +add.f16x2 r11170, r9590, r9638; +} +{ +mul.f16x2 r11173, r11170, r10924; +} +{ +add.f16x2 r11176, r7983, r11173; +} +{ 
+add.f16x2 r11179, r9606, r9622; +} +{ +mul.f16x2 r11182, r11179, r10926; +} +{ +add.f16x2 r11185, r11176, r11182; +} +{ +sub.f16x2 r11188, r9584, r9632; +} +{ +mul.f16x2 r11191, r11188, r10925; +} +{ +sub.f16x2 r11194, r9600, r9616; +} +{ +mul.f16x2 r11197, r11194, r10928; +} +{ +add.f16x2 r11200, r11191, r11197; +} +{ +add.f16x2 %29, r11185, r11200; +} +{ +add.f16x2 r11206, r9590, r9638; +} +{ +mul.f16x2 r11209, r11206, r10924; +} +{ +add.f16x2 r11212, r7983, r11209; +} +{ +add.f16x2 r11215, r9606, r9622; +} +{ +mul.f16x2 r11218, r11215, r10926; +} +{ +add.f16x2 r11221, r11212, r11218; +} +{ +sub.f16x2 r11224, r9584, r9632; +} +{ +mul.f16x2 r11227, r11224, r10925; +} +{ +sub.f16x2 r11230, r9600, r9616; +} +{ +mul.f16x2 r11233, r11230, r10928; +} +{ +add.f16x2 r11236, r11227, r11233; +} +{ +sub.f16x2 %39, r11221, r11236; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), 
"=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), 
"r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<916, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.local .align 8 .b8 __local_depot1[200]; +.reg .b64 SP; +.reg .b64 SPL; +.reg .pred p<3>; +.reg .f32 f<695>; +.reg .b32 r<11254>; +.reg .b64 rd<19>; +mov.u64 SPL, __local_depot1; +add.u64 rd3, SPL, 0; +mov.u32 r3531, %tid.y; +mul.lo.s32 r1, r3531, 15625; +add.s64 rd4, rd3, 4; +mov.f32 f214, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r23, {low, high}; +} +mov.f32 f216, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r24, {low, high}; +} +mov.f32 f210, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r25, {low, high}; +} +mov.f32 f212, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r26, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r27, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r28, {low, high}; +} +{ +neg.f16x2 r29, r28; +} +{ +add.f16x2 r31, %61, %91; +} +{ +add.f16x2 r34, %51, r31; +} +{ +add.f16x2 r37, %71, %81; +} +{ +add.f16x2 r40, r34, r37; +} +{ +add.f16x2 r43, %62, %92; +} +{ +add.f16x2 r46, %52, r43; +} +{ +add.f16x2 r49, %72, %82; +} +{ +add.f16x2 r52, r46, r49; +} +{ +add.f16x2 r55, %61, %91; +} +{ +mul.f16x2 r58, r55, r23; +} +{ +add.f16x2 r61, %51, r58; +} +{ +add.f16x2 r64, %71, %81; +} +{ +mul.f16x2 r67, r64, r25; +} +{ +add.f16x2 r70, r61, r67; +} +{ +sub.f16x2 r73, %62, %92; +} +{ +mul.f16x2 r76, r73, r24; +} +{ +sub.f16x2 r79, %72, %82; +} +{ +mul.f16x2 r82, r79, r26; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r70, r85; +} +{ 
+add.f16x2 r91, %61, %91; +} +{ +mul.f16x2 r94, r91, r23; +} +{ +add.f16x2 r97, %51, r94; +} +{ +add.f16x2 r100, %71, %81; +} +{ +mul.f16x2 r103, r100, r25; +} +{ +add.f16x2 r106, r97, r103; +} +{ +sub.f16x2 r109, %62, %92; +} +{ +mul.f16x2 r112, r109, r24; +} +{ +sub.f16x2 r115, %72, %82; +} +{ +mul.f16x2 r118, r115, r26; +} +{ +add.f16x2 r121, r112, r118; +} +{ +add.f16x2 r124, r106, r121; +} +{ +add.f16x2 r127, %61, %91; +} +{ +mul.f16x2 r130, r127, r25; +} +{ +add.f16x2 r133, %51, r130; +} +{ +add.f16x2 r136, %71, %81; +} +{ +mul.f16x2 r139, r136, r27; +} +{ +add.f16x2 r142, r133, r139; +} +{ +sub.f16x2 r145, %62, %92; +} +{ +mul.f16x2 r148, r145, r26; +} +{ +sub.f16x2 r151, %72, %82; +} +{ +mul.f16x2 r154, r151, r29; +} +{ +add.f16x2 r157, r148, r154; +} +{ +sub.f16x2 r160, r142, r157; +} +{ +add.f16x2 r163, %61, %91; +} +{ +mul.f16x2 r166, r163, r25; +} +{ +add.f16x2 r169, %51, r166; +} +{ +add.f16x2 r172, %71, %81; +} +{ +mul.f16x2 r175, r172, r27; +} +{ +add.f16x2 r178, r169, r175; +} +{ +sub.f16x2 r181, %62, %92; +} +{ +mul.f16x2 r184, r181, r26; +} +{ +sub.f16x2 r187, %72, %82; +} +{ +mul.f16x2 r190, r187, r29; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r178, r193; +} +{ +add.f16x2 r199, %62, %92; +} +{ +mul.f16x2 r202, r199, r23; +} +{ +add.f16x2 r205, %52, r202; +} +{ +add.f16x2 r208, %72, %82; +} +{ +mul.f16x2 r211, r208, r25; +} +{ +add.f16x2 r214, r205, r211; +} +{ +sub.f16x2 r217, %61, %91; +} +{ +mul.f16x2 r220, r217, r24; +} +{ +sub.f16x2 r223, %71, %81; +} +{ +mul.f16x2 r226, r223, r26; +} +{ +add.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r214, r229; +} +{ +add.f16x2 r235, %62, %92; +} +{ +mul.f16x2 r238, r235, r23; +} +{ +add.f16x2 r241, %52, r238; +} +{ +add.f16x2 r244, %72, %82; +} +{ +mul.f16x2 r247, r244, r25; +} +{ +add.f16x2 r250, r241, r247; +} +{ +sub.f16x2 r253, %61, %91; +} +{ +mul.f16x2 r256, r253, r24; +} +{ +sub.f16x2 r259, %71, %81; +} +{ +mul.f16x2 r262, r259, r26; +} +{ +add.f16x2 r265, r256, r262; +} +{ 
+sub.f16x2 r268, r250, r265; +} +{ +add.f16x2 r271, %62, %92; +} +{ +mul.f16x2 r274, r271, r25; +} +{ +add.f16x2 r277, %52, r274; +} +{ +add.f16x2 r280, %72, %82; +} +{ +mul.f16x2 r283, r280, r27; +} +{ +add.f16x2 r286, r277, r283; +} +{ +sub.f16x2 r289, %61, %91; +} +{ +mul.f16x2 r292, r289, r26; +} +{ +sub.f16x2 r295, %71, %81; +} +{ +mul.f16x2 r298, r295, r29; +} +{ +add.f16x2 r301, r292, r298; +} +{ +add.f16x2 r304, r286, r301; +} +{ +add.f16x2 r307, %62, %92; +} +{ +mul.f16x2 r310, r307, r25; +} +{ +add.f16x2 r313, %52, r310; +} +{ +add.f16x2 r316, %72, %82; +} +{ +mul.f16x2 r319, r316, r27; +} +{ +add.f16x2 r322, r313, r319; +} +{ +sub.f16x2 r325, %61, %91; +} +{ +mul.f16x2 r328, r325, r26; +} +{ +sub.f16x2 r331, %71, %81; +} +{ +mul.f16x2 r334, r331, r29; +} +{ +add.f16x2 r337, r328, r334; +} +{ +sub.f16x2 r340, r322, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r344, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r347, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r348, {low, high}; +} +{ +neg.f16x2 r349, r348; +} +{ +add.f16x2 r351, %63, %93; +} +{ +add.f16x2 r354, %53, r351; +} +{ +add.f16x2 r357, %73, %83; +} +{ +add.f16x2 r360, r354, r357; +} +{ +add.f16x2 r363, %64, %94; +} +{ +add.f16x2 r366, %54, r363; +} +{ +add.f16x2 r369, %74, %84; +} +{ +add.f16x2 r372, r366, r369; +} +{ +add.f16x2 r375, %63, %93; +} +{ +mul.f16x2 r378, r375, r343; +} +{ +add.f16x2 r381, %53, r378; +} +{ +add.f16x2 r384, %73, %83; +} +{ +mul.f16x2 r387, r384, r345; +} +{ +add.f16x2 
r390, r381, r387; +} +{ +sub.f16x2 r393, %64, %94; +} +{ +mul.f16x2 r396, r393, r344; +} +{ +sub.f16x2 r399, %74, %84; +} +{ +mul.f16x2 r402, r399, r346; +} +{ +add.f16x2 r405, r396, r402; +} +{ +sub.f16x2 r408, r390, r405; +} +{ +add.f16x2 r411, %63, %93; +} +{ +mul.f16x2 r414, r411, r343; +} +{ +add.f16x2 r417, %53, r414; +} +{ +add.f16x2 r420, %73, %83; +} +{ +mul.f16x2 r423, r420, r345; +} +{ +add.f16x2 r426, r417, r423; +} +{ +sub.f16x2 r429, %64, %94; +} +{ +mul.f16x2 r432, r429, r344; +} +{ +sub.f16x2 r435, %74, %84; +} +{ +mul.f16x2 r438, r435, r346; +} +{ +add.f16x2 r441, r432, r438; +} +{ +add.f16x2 r444, r426, r441; +} +{ +add.f16x2 r447, %63, %93; +} +{ +mul.f16x2 r450, r447, r345; +} +{ +add.f16x2 r453, %53, r450; +} +{ +add.f16x2 r456, %73, %83; +} +{ +mul.f16x2 r459, r456, r347; +} +{ +add.f16x2 r462, r453, r459; +} +{ +sub.f16x2 r465, %64, %94; +} +{ +mul.f16x2 r468, r465, r346; +} +{ +sub.f16x2 r471, %74, %84; +} +{ +mul.f16x2 r474, r471, r349; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r462, r477; +} +{ +add.f16x2 r483, %63, %93; +} +{ +mul.f16x2 r486, r483, r345; +} +{ +add.f16x2 r489, %53, r486; +} +{ +add.f16x2 r492, %73, %83; +} +{ +mul.f16x2 r495, r492, r347; +} +{ +add.f16x2 r498, r489, r495; +} +{ +sub.f16x2 r501, %64, %94; +} +{ +mul.f16x2 r504, r501, r346; +} +{ +sub.f16x2 r507, %74, %84; +} +{ +mul.f16x2 r510, r507, r349; +} +{ +add.f16x2 r513, r504, r510; +} +{ +add.f16x2 r516, r498, r513; +} +{ +add.f16x2 r519, %64, %94; +} +{ +mul.f16x2 r522, r519, r343; +} +{ +add.f16x2 r525, %54, r522; +} +{ +add.f16x2 r528, %74, %84; +} +{ +mul.f16x2 r531, r528, r345; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, %63, %93; +} +{ +mul.f16x2 r540, r537, r344; +} +{ +sub.f16x2 r543, %73, %83; +} +{ +mul.f16x2 r546, r543, r346; +} +{ +add.f16x2 r549, r540, r546; +} +{ +add.f16x2 r552, r534, r549; +} +{ +add.f16x2 r555, %64, %94; +} +{ +mul.f16x2 r558, r555, r343; +} +{ +add.f16x2 r561, %54, r558; +} +{ +add.f16x2 r564, %74, 
%84; +} +{ +mul.f16x2 r567, r564, r345; +} +{ +add.f16x2 r570, r561, r567; +} +{ +sub.f16x2 r573, %63, %93; +} +{ +mul.f16x2 r576, r573, r344; +} +{ +sub.f16x2 r579, %73, %83; +} +{ +mul.f16x2 r582, r579, r346; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r570, r585; +} +{ +add.f16x2 r591, %64, %94; +} +{ +mul.f16x2 r594, r591, r345; +} +{ +add.f16x2 r597, %54, r594; +} +{ +add.f16x2 r600, %74, %84; +} +{ +mul.f16x2 r603, r600, r347; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, %63, %93; +} +{ +mul.f16x2 r612, r609, r346; +} +{ +sub.f16x2 r615, %73, %83; +} +{ +mul.f16x2 r618, r615, r349; +} +{ +add.f16x2 r621, r612, r618; +} +{ +add.f16x2 r624, r606, r621; +} +{ +add.f16x2 r627, %64, %94; +} +{ +mul.f16x2 r630, r627, r345; +} +{ +add.f16x2 r633, %54, r630; +} +{ +add.f16x2 r636, %74, %84; +} +{ +mul.f16x2 r639, r636, r347; +} +{ +add.f16x2 r642, r633, r639; +} +{ +sub.f16x2 r645, %63, %93; +} +{ +mul.f16x2 r648, r645, r346; +} +{ +sub.f16x2 r651, %73, %83; +} +{ +mul.f16x2 r654, r651, r349; +} +{ +add.f16x2 r657, r648, r654; +} +{ +sub.f16x2 r660, r642, r657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r663, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r664, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r665, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r666, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r668, {low, high}; +} +{ +neg.f16x2 r669, r668; +} +{ +add.f16x2 r671, %65, %95; +} +{ +add.f16x2 r674, %55, r671; +} +{ +add.f16x2 r677, %75, %85; +} +{ +add.f16x2 r680, r674, r677; +} +{ +add.f16x2 r683, %66, %96; +} +{ +add.f16x2 r686, %56, r683; 
+} +{ +add.f16x2 r689, %76, %86; +} +{ +add.f16x2 r692, r686, r689; +} +{ +add.f16x2 r695, %65, %95; +} +{ +mul.f16x2 r698, r695, r663; +} +{ +add.f16x2 r701, %55, r698; +} +{ +add.f16x2 r704, %75, %85; +} +{ +mul.f16x2 r707, r704, r665; +} +{ +add.f16x2 r710, r701, r707; +} +{ +sub.f16x2 r713, %66, %96; +} +{ +mul.f16x2 r716, r713, r664; +} +{ +sub.f16x2 r719, %76, %86; +} +{ +mul.f16x2 r722, r719, r666; +} +{ +add.f16x2 r725, r716, r722; +} +{ +sub.f16x2 r728, r710, r725; +} +{ +add.f16x2 r731, %65, %95; +} +{ +mul.f16x2 r734, r731, r663; +} +{ +add.f16x2 r737, %55, r734; +} +{ +add.f16x2 r740, %75, %85; +} +{ +mul.f16x2 r743, r740, r665; +} +{ +add.f16x2 r746, r737, r743; +} +{ +sub.f16x2 r749, %66, %96; +} +{ +mul.f16x2 r752, r749, r664; +} +{ +sub.f16x2 r755, %76, %86; +} +{ +mul.f16x2 r758, r755, r666; +} +{ +add.f16x2 r761, r752, r758; +} +{ +add.f16x2 r764, r746, r761; +} +{ +add.f16x2 r767, %65, %95; +} +{ +mul.f16x2 r770, r767, r665; +} +{ +add.f16x2 r773, %55, r770; +} +{ +add.f16x2 r776, %75, %85; +} +{ +mul.f16x2 r779, r776, r667; +} +{ +add.f16x2 r782, r773, r779; +} +{ +sub.f16x2 r785, %66, %96; +} +{ +mul.f16x2 r788, r785, r666; +} +{ +sub.f16x2 r791, %76, %86; +} +{ +mul.f16x2 r794, r791, r669; +} +{ +add.f16x2 r797, r788, r794; +} +{ +sub.f16x2 r800, r782, r797; +} +{ +add.f16x2 r803, %65, %95; +} +{ +mul.f16x2 r806, r803, r665; +} +{ +add.f16x2 r809, %55, r806; +} +{ +add.f16x2 r812, %75, %85; +} +{ +mul.f16x2 r815, r812, r667; +} +{ +add.f16x2 r818, r809, r815; +} +{ +sub.f16x2 r821, %66, %96; +} +{ +mul.f16x2 r824, r821, r666; +} +{ +sub.f16x2 r827, %76, %86; +} +{ +mul.f16x2 r830, r827, r669; +} +{ +add.f16x2 r833, r824, r830; +} +{ +add.f16x2 r836, r818, r833; +} +{ +add.f16x2 r839, %66, %96; +} +{ +mul.f16x2 r842, r839, r663; +} +{ +add.f16x2 r845, %56, r842; +} +{ +add.f16x2 r848, %76, %86; +} +{ +mul.f16x2 r851, r848, r665; +} +{ +add.f16x2 r854, r845, r851; +} +{ +sub.f16x2 r857, %65, %95; +} +{ +mul.f16x2 r860, r857, r664; +} +{ 
+sub.f16x2 r863, %75, %85; +} +{ +mul.f16x2 r866, r863, r666; +} +{ +add.f16x2 r869, r860, r866; +} +{ +add.f16x2 r872, r854, r869; +} +{ +add.f16x2 r875, %66, %96; +} +{ +mul.f16x2 r878, r875, r663; +} +{ +add.f16x2 r881, %56, r878; +} +{ +add.f16x2 r884, %76, %86; +} +{ +mul.f16x2 r887, r884, r665; +} +{ +add.f16x2 r890, r881, r887; +} +{ +sub.f16x2 r893, %65, %95; +} +{ +mul.f16x2 r896, r893, r664; +} +{ +sub.f16x2 r899, %75, %85; +} +{ +mul.f16x2 r902, r899, r666; +} +{ +add.f16x2 r905, r896, r902; +} +{ +sub.f16x2 r908, r890, r905; +} +{ +add.f16x2 r911, %66, %96; +} +{ +mul.f16x2 r914, r911, r665; +} +{ +add.f16x2 r917, %56, r914; +} +{ +add.f16x2 r920, %76, %86; +} +{ +mul.f16x2 r923, r920, r667; +} +{ +add.f16x2 r926, r917, r923; +} +{ +sub.f16x2 r929, %65, %95; +} +{ +mul.f16x2 r932, r929, r666; +} +{ +sub.f16x2 r935, %75, %85; +} +{ +mul.f16x2 r938, r935, r669; +} +{ +add.f16x2 r941, r932, r938; +} +{ +add.f16x2 r944, r926, r941; +} +{ +add.f16x2 r947, %66, %96; +} +{ +mul.f16x2 r950, r947, r665; +} +{ +add.f16x2 r953, %56, r950; +} +{ +add.f16x2 r956, %76, %86; +} +{ +mul.f16x2 r959, r956, r667; +} +{ +add.f16x2 r962, r953, r959; +} +{ +sub.f16x2 r965, %65, %95; +} +{ +mul.f16x2 r968, r965, r666; +} +{ +sub.f16x2 r971, %75, %85; +} +{ +mul.f16x2 r974, r971, r669; +} +{ +add.f16x2 r977, r968, r974; +} +{ +sub.f16x2 r980, r962, r977; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r983, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r984, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r985, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r986, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r987, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, 
f216; +mov.b32 r988, {low, high}; +} +{ +neg.f16x2 r989, r988; +} +{ +add.f16x2 r991, %67, %97; +} +{ +add.f16x2 r994, %57, r991; +} +{ +add.f16x2 r997, %77, %87; +} +{ +add.f16x2 r1000, r994, r997; +} +{ +add.f16x2 r1003, %68, %98; +} +{ +add.f16x2 r1006, %58, r1003; +} +{ +add.f16x2 r1009, %78, %88; +} +{ +add.f16x2 r1012, r1006, r1009; +} +{ +add.f16x2 r1015, %67, %97; +} +{ +mul.f16x2 r1018, r1015, r983; +} +{ +add.f16x2 r1021, %57, r1018; +} +{ +add.f16x2 r1024, %77, %87; +} +{ +mul.f16x2 r1027, r1024, r985; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %68, %98; +} +{ +mul.f16x2 r1036, r1033, r984; +} +{ +sub.f16x2 r1039, %78, %88; +} +{ +mul.f16x2 r1042, r1039, r986; +} +{ +add.f16x2 r1045, r1036, r1042; +} +{ +sub.f16x2 r1048, r1030, r1045; +} +{ +add.f16x2 r1051, %67, %97; +} +{ +mul.f16x2 r1054, r1051, r983; +} +{ +add.f16x2 r1057, %57, r1054; +} +{ +add.f16x2 r1060, %77, %87; +} +{ +mul.f16x2 r1063, r1060, r985; +} +{ +add.f16x2 r1066, r1057, r1063; +} +{ +sub.f16x2 r1069, %68, %98; +} +{ +mul.f16x2 r1072, r1069, r984; +} +{ +sub.f16x2 r1075, %78, %88; +} +{ +mul.f16x2 r1078, r1075, r986; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +add.f16x2 r1084, r1066, r1081; +} +{ +add.f16x2 r1087, %67, %97; +} +{ +mul.f16x2 r1090, r1087, r985; +} +{ +add.f16x2 r1093, %57, r1090; +} +{ +add.f16x2 r1096, %77, %87; +} +{ +mul.f16x2 r1099, r1096, r987; +} +{ +add.f16x2 r1102, r1093, r1099; +} +{ +sub.f16x2 r1105, %68, %98; +} +{ +mul.f16x2 r1108, r1105, r986; +} +{ +sub.f16x2 r1111, %78, %88; +} +{ +mul.f16x2 r1114, r1111, r989; +} +{ +add.f16x2 r1117, r1108, r1114; +} +{ +sub.f16x2 r1120, r1102, r1117; +} +{ +add.f16x2 r1123, %67, %97; +} +{ +mul.f16x2 r1126, r1123, r985; +} +{ +add.f16x2 r1129, %57, r1126; +} +{ +add.f16x2 r1132, %77, %87; +} +{ +mul.f16x2 r1135, r1132, r987; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %68, %98; +} +{ +mul.f16x2 r1144, r1141, r986; +} +{ +sub.f16x2 r1147, %78, %88; +} +{ +mul.f16x2 r1150, r1147, 
r989; +} +{ +add.f16x2 r1153, r1144, r1150; +} +{ +add.f16x2 r1156, r1138, r1153; +} +{ +add.f16x2 r1159, %68, %98; +} +{ +mul.f16x2 r1162, r1159, r983; +} +{ +add.f16x2 r1165, %58, r1162; +} +{ +add.f16x2 r1168, %78, %88; +} +{ +mul.f16x2 r1171, r1168, r985; +} +{ +add.f16x2 r1174, r1165, r1171; +} +{ +sub.f16x2 r1177, %67, %97; +} +{ +mul.f16x2 r1180, r1177, r984; +} +{ +sub.f16x2 r1183, %77, %87; +} +{ +mul.f16x2 r1186, r1183, r986; +} +{ +add.f16x2 r1189, r1180, r1186; +} +{ +add.f16x2 r1192, r1174, r1189; +} +{ +add.f16x2 r1195, %68, %98; +} +{ +mul.f16x2 r1198, r1195, r983; +} +{ +add.f16x2 r1201, %58, r1198; +} +{ +add.f16x2 r1204, %78, %88; +} +{ +mul.f16x2 r1207, r1204, r985; +} +{ +add.f16x2 r1210, r1201, r1207; +} +{ +sub.f16x2 r1213, %67, %97; +} +{ +mul.f16x2 r1216, r1213, r984; +} +{ +sub.f16x2 r1219, %77, %87; +} +{ +mul.f16x2 r1222, r1219, r986; +} +{ +add.f16x2 r1225, r1216, r1222; +} +{ +sub.f16x2 r1228, r1210, r1225; +} +{ +add.f16x2 r1231, %68, %98; +} +{ +mul.f16x2 r1234, r1231, r985; +} +{ +add.f16x2 r1237, %58, r1234; +} +{ +add.f16x2 r1240, %78, %88; +} +{ +mul.f16x2 r1243, r1240, r987; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %67, %97; +} +{ +mul.f16x2 r1252, r1249, r986; +} +{ +sub.f16x2 r1255, %77, %87; +} +{ +mul.f16x2 r1258, r1255, r989; +} +{ +add.f16x2 r1261, r1252, r1258; +} +{ +add.f16x2 r1264, r1246, r1261; +} +{ +add.f16x2 r1267, %68, %98; +} +{ +mul.f16x2 r1270, r1267, r985; +} +{ +add.f16x2 r1273, %58, r1270; +} +{ +add.f16x2 r1276, %78, %88; +} +{ +mul.f16x2 r1279, r1276, r987; +} +{ +add.f16x2 r1282, r1273, r1279; +} +{ +sub.f16x2 r1285, %67, %97; +} +{ +mul.f16x2 r1288, r1285, r986; +} +{ +sub.f16x2 r1291, %77, %87; +} +{ +mul.f16x2 r1294, r1291, r989; +} +{ +add.f16x2 r1297, r1288, r1294; +} +{ +sub.f16x2 r1300, r1282, r1297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; 
+cvt.rn.f16.f32 high, f216; +mov.b32 r1304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1306, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1307, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1308, {low, high}; +} +{ +neg.f16x2 r1309, r1308; +} +{ +add.f16x2 r1311, %69, %99; +} +{ +add.f16x2 r1314, %59, r1311; +} +{ +add.f16x2 r1317, %79, %89; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %70, %100; +} +{ +add.f16x2 r1326, %60, r1323; +} +{ +add.f16x2 r1329, %80, %90; +} +{ +add.f16x2 r1332, r1326, r1329; +} +{ +add.f16x2 r1335, %69, %99; +} +{ +mul.f16x2 r1338, r1335, r1303; +} +{ +add.f16x2 r1341, %59, r1338; +} +{ +add.f16x2 r1344, %79, %89; +} +{ +mul.f16x2 r1347, r1344, r1305; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +sub.f16x2 r1353, %70, %100; +} +{ +mul.f16x2 r1356, r1353, r1304; +} +{ +sub.f16x2 r1359, %80, %90; +} +{ +mul.f16x2 r1362, r1359, r1306; +} +{ +add.f16x2 r1365, r1356, r1362; +} +{ +sub.f16x2 r1368, r1350, r1365; +} +{ +add.f16x2 r1371, %69, %99; +} +{ +mul.f16x2 r1374, r1371, r1303; +} +{ +add.f16x2 r1377, %59, r1374; +} +{ +add.f16x2 r1380, %79, %89; +} +{ +mul.f16x2 r1383, r1380, r1305; +} +{ +add.f16x2 r1386, r1377, r1383; +} +{ +sub.f16x2 r1389, %70, %100; +} +{ +mul.f16x2 r1392, r1389, r1304; +} +{ +sub.f16x2 r1395, %80, %90; +} +{ +mul.f16x2 r1398, r1395, r1306; +} +{ +add.f16x2 r1401, r1392, r1398; +} +{ +add.f16x2 r1404, r1386, r1401; +} +{ +add.f16x2 r1407, %69, %99; +} +{ +mul.f16x2 r1410, r1407, r1305; +} +{ +add.f16x2 r1413, %59, r1410; +} +{ +add.f16x2 r1416, %79, %89; +} +{ +mul.f16x2 r1419, r1416, r1307; +} +{ +add.f16x2 r1422, r1413, r1419; +} +{ +sub.f16x2 r1425, %70, %100; +} +{ +mul.f16x2 r1428, r1425, r1306; +} +{ 
+sub.f16x2 r1431, %80, %90; +} +{ +mul.f16x2 r1434, r1431, r1309; +} +{ +add.f16x2 r1437, r1428, r1434; +} +{ +sub.f16x2 r1440, r1422, r1437; +} +{ +add.f16x2 r1443, %69, %99; +} +{ +mul.f16x2 r1446, r1443, r1305; +} +{ +add.f16x2 r1449, %59, r1446; +} +{ +add.f16x2 r1452, %79, %89; +} +{ +mul.f16x2 r1455, r1452, r1307; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +sub.f16x2 r1461, %70, %100; +} +{ +mul.f16x2 r1464, r1461, r1306; +} +{ +sub.f16x2 r1467, %80, %90; +} +{ +mul.f16x2 r1470, r1467, r1309; +} +{ +add.f16x2 r1473, r1464, r1470; +} +{ +add.f16x2 r1476, r1458, r1473; +} +{ +add.f16x2 r1479, %70, %100; +} +{ +mul.f16x2 r1482, r1479, r1303; +} +{ +add.f16x2 r1485, %60, r1482; +} +{ +add.f16x2 r1488, %80, %90; +} +{ +mul.f16x2 r1491, r1488, r1305; +} +{ +add.f16x2 r1494, r1485, r1491; +} +{ +sub.f16x2 r1497, %69, %99; +} +{ +mul.f16x2 r1500, r1497, r1304; +} +{ +sub.f16x2 r1503, %79, %89; +} +{ +mul.f16x2 r1506, r1503, r1306; +} +{ +add.f16x2 r1509, r1500, r1506; +} +{ +add.f16x2 r1512, r1494, r1509; +} +{ +add.f16x2 r1515, %70, %100; +} +{ +mul.f16x2 r1518, r1515, r1303; +} +{ +add.f16x2 r1521, %60, r1518; +} +{ +add.f16x2 r1524, %80, %90; +} +{ +mul.f16x2 r1527, r1524, r1305; +} +{ +add.f16x2 r1530, r1521, r1527; +} +{ +sub.f16x2 r1533, %69, %99; +} +{ +mul.f16x2 r1536, r1533, r1304; +} +{ +sub.f16x2 r1539, %79, %89; +} +{ +mul.f16x2 r1542, r1539, r1306; +} +{ +add.f16x2 r1545, r1536, r1542; +} +{ +sub.f16x2 r1548, r1530, r1545; +} +{ +add.f16x2 r1551, %70, %100; +} +{ +mul.f16x2 r1554, r1551, r1305; +} +{ +add.f16x2 r1557, %60, r1554; +} +{ +add.f16x2 r1560, %80, %90; +} +{ +mul.f16x2 r1563, r1560, r1307; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +sub.f16x2 r1569, %69, %99; +} +{ +mul.f16x2 r1572, r1569, r1306; +} +{ +sub.f16x2 r1575, %79, %89; +} +{ +mul.f16x2 r1578, r1575, r1309; +} +{ +add.f16x2 r1581, r1572, r1578; +} +{ +add.f16x2 r1584, r1566, r1581; +} +{ +add.f16x2 r1587, %70, %100; +} +{ +mul.f16x2 r1590, r1587, r1305; +} +{ +add.f16x2 r1593, %60, 
r1590; +} +{ +add.f16x2 r1596, %80, %90; +} +{ +mul.f16x2 r1599, r1596, r1307; +} +{ +add.f16x2 r1602, r1593, r1599; +} +{ +sub.f16x2 r1605, %69, %99; +} +{ +mul.f16x2 r1608, r1605, r1306; +} +{ +sub.f16x2 r1611, %79, %89; +} +{ +mul.f16x2 r1614, r1611, r1309; +} +{ +add.f16x2 r1617, r1608, r1614; +} +{ +sub.f16x2 r1620, r1602, r1617; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1623, {low, high}; +} +mov.f32 f64, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1624, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1625, {low, high}; +} +mov.f32 f68, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1626, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1627, {low, high}; +} +mov.f32 f72, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1628, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1629, {low, high}; +} +mov.f32 f76, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1630, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1633, {low, high}; +} +mov.f32 f84, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1634, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1637, {low, high}; +} +mov.f32 f92, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1638, {low, high}; +} +mov.f32 f122, 0fBF232E38; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1639, {low, high}; +} +mov.f32 f96, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1640, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1645, {low, high}; +} +mov.f32 f108, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1653, {low, high}; +} +mov.f32 f124, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1671, r408, r1623; +} +{ +mul.f16x2 r1674, r552, r1624; +} +{ +sub.f16x2 r1677, r1671, r1674; +} +{ +mul.f16x2 r1680, r408, r1624; +} +{ +fma.rn.f16x2 r1683, r552, r1623, r1680; +} +{ +mul.f16x2 r1687, r728, r1625; +} +{ +mul.f16x2 r1690, r872, r1626; +} +{ +sub.f16x2 r1693, r1687, r1690; +} +{ +mul.f16x2 r1696, r728, r1626; +} +{ +fma.rn.f16x2 r1699, r872, r1625, r1696; +} +{ +mul.f16x2 r1703, r1048, r1627; +} +{ +mul.f16x2 r1706, r1192, r1628; +} +{ +sub.f16x2 r1709, r1703, r1706; +} +{ +mul.f16x2 r1712, r1048, r1628; +} +{ +fma.rn.f16x2 r1715, r1192, r1627, r1712; +} +{ +mul.f16x2 r1719, r1368, r1629; +} +{ +mul.f16x2 r1722, r1512, r1630; +} +{ +sub.f16x2 r1725, r1719, r1722; +} +{ +mul.f16x2 r1728, r1368, r1630; +} +{ +fma.rn.f16x2 r1731, r1512, r1629, r1728; +} +{ +mul.f16x2 r1735, r480, r1625; +} +{ +mul.f16x2 r1738, r624, r1626; +} +{ +sub.f16x2 r1741, r1735, r1738; +} +{ +mul.f16x2 r1744, r480, r1626; +} +{ +fma.rn.f16x2 r1747, r624, r1625, r1744; +} +{ +mul.f16x2 r1751, r800, r1629; +} +{ +mul.f16x2 r1754, r944, r1630; +} +{ +sub.f16x2 r1757, r1751, r1754; +} +{ +mul.f16x2 r1760, r800, r1630; +} +{ +fma.rn.f16x2 r1763, r944, r1629, r1760; +} +{ +mul.f16x2 r1767, r1120, r1633; +} 
+{ +mul.f16x2 r1770, r1264, r1634; +} +{ +sub.f16x2 r1773, r1767, r1770; +} +{ +mul.f16x2 r1776, r1120, r1634; +} +{ +fma.rn.f16x2 r1779, r1264, r1633, r1776; +} +{ +mul.f16x2 r1783, r1440, r1637; +} +{ +mul.f16x2 r1786, r1584, r1638; +} +{ +sub.f16x2 r1789, r1783, r1786; +} +{ +mul.f16x2 r1792, r1440, r1638; +} +{ +fma.rn.f16x2 r1795, r1584, r1637, r1792; +} +{ +mul.f16x2 r1799, r516, r1627; +} +{ +mul.f16x2 r1802, r660, r1628; +} +{ +sub.f16x2 r1805, r1799, r1802; +} +{ +mul.f16x2 r1808, r516, r1628; +} +{ +fma.rn.f16x2 r1811, r660, r1627, r1808; +} +{ +mul.f16x2 r1815, r836, r1633; +} +{ +mul.f16x2 r1818, r980, r1634; +} +{ +sub.f16x2 r1821, r1815, r1818; +} +{ +mul.f16x2 r1824, r836, r1634; +} +{ +fma.rn.f16x2 r1827, r980, r1633, r1824; +} +{ +mul.f16x2 r1831, r1156, r1639; +} +{ +mul.f16x2 r1834, r1300, r1640; +} +{ +sub.f16x2 r1837, r1831, r1834; +} +{ +mul.f16x2 r1840, r1156, r1640; +} +{ +fma.rn.f16x2 r1843, r1300, r1639, r1840; +} +{ +mul.f16x2 r1847, r1476, r1645; +} +{ +mul.f16x2 r1850, r1620, r1646; +} +{ +sub.f16x2 r1853, r1847, r1850; +} +{ +mul.f16x2 r1856, r1476, r1646; +} +{ +fma.rn.f16x2 r1859, r1620, r1645, r1856; +} +{ +mul.f16x2 r1863, r444, r1629; +} +{ +mul.f16x2 r1866, r588, r1630; +} +{ +sub.f16x2 r1869, r1863, r1866; +} +{ +mul.f16x2 r1872, r444, r1630; +} +{ +fma.rn.f16x2 r1875, r588, r1629, r1872; +} +{ +mul.f16x2 r1879, r764, r1637; +} +{ +mul.f16x2 r1882, r908, r1638; +} +{ +sub.f16x2 r1885, r1879, r1882; +} +{ +mul.f16x2 r1888, r764, r1638; +} +{ +fma.rn.f16x2 r1891, r908, r1637, r1888; +} +{ +mul.f16x2 r1895, r1084, r1645; +} +{ +mul.f16x2 r1898, r1228, r1646; +} +{ +sub.f16x2 r1901, r1895, r1898; +} +{ +mul.f16x2 r1904, r1084, r1646; +} +{ +fma.rn.f16x2 r1907, r1228, r1645, r1904; +} +{ +mul.f16x2 r1911, r1404, r1653; +} +{ +mul.f16x2 r1914, r1548, r1654; +} +{ +sub.f16x2 r1917, r1911, r1914; +} +{ +mul.f16x2 r1920, r1404, r1654; +} +{ +fma.rn.f16x2 r1923, r1548, r1653, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r1927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1932, {low, high}; +} +{ +neg.f16x2 r1933, r1932; +} +{ +add.f16x2 r1935, r360, r1320; +} +{ +add.f16x2 r1938, r40, r1935; +} +{ +add.f16x2 r1941, r680, r1000; +} +{ +add.f16x2 r1944, r1938, r1941; +} +st.local.u32 [rd3], r1944; +{ +add.f16x2 r1947, r372, r1332; +} +{ +add.f16x2 r1950, r52, r1947; +} +{ +add.f16x2 r1953, r692, r1012; +} +{ +add.f16x2 r1956, r1950, r1953; +} +st.local.u32 [rd3+4], r1956; +{ +add.f16x2 r1959, r360, r1320; +} +{ +mul.f16x2 r1962, r1959, r1927; +} +{ +add.f16x2 r1965, r40, r1962; +} +{ +add.f16x2 r1968, r680, r1000; +} +{ +mul.f16x2 r1971, r1968, r1929; +} +{ +add.f16x2 r1974, r1965, r1971; +} +{ +sub.f16x2 r1977, r372, r1332; +} +{ +mul.f16x2 r1980, r1977, r1928; +} +{ +sub.f16x2 r1983, r692, r1012; +} +{ +mul.f16x2 r1986, r1983, r1930; +} +{ +add.f16x2 r1989, r1980, r1986; +} +{ +sub.f16x2 r1992, r1974, r1989; +} +st.local.u32 [rd3+40], r1992; +{ +add.f16x2 r1995, r360, r1320; +} +{ +mul.f16x2 r1998, r1995, r1927; +} +{ +add.f16x2 r2001, r40, r1998; +} +{ +add.f16x2 r2004, r680, r1000; +} +{ +mul.f16x2 r2007, r2004, r1929; +} +{ +add.f16x2 r2010, r2001, r2007; +} +{ +sub.f16x2 r2013, r372, r1332; +} +{ +mul.f16x2 r2016, r2013, r1928; +} +{ +sub.f16x2 r2019, r692, r1012; +} +{ +mul.f16x2 r2022, r2019, r1930; +} +{ +add.f16x2 r2025, r2016, r2022; +} +{ +add.f16x2 r2028, r2010, r2025; +} +st.local.u32 [rd3+160], r2028; +{ +add.f16x2 r2031, r360, 
r1320; +} +{ +mul.f16x2 r2034, r2031, r1929; +} +{ +add.f16x2 r2037, r40, r2034; +} +{ +add.f16x2 r2040, r680, r1000; +} +{ +mul.f16x2 r2043, r2040, r1931; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +sub.f16x2 r2049, r372, r1332; +} +{ +mul.f16x2 r2052, r2049, r1930; +} +{ +sub.f16x2 r2055, r692, r1012; +} +{ +mul.f16x2 r2058, r2055, r1933; +} +{ +add.f16x2 r2061, r2052, r2058; +} +{ +sub.f16x2 r2064, r2046, r2061; +} +st.local.u32 [rd3+80], r2064; +{ +add.f16x2 r2067, r360, r1320; +} +{ +mul.f16x2 r2070, r2067, r1929; +} +{ +add.f16x2 r2073, r40, r2070; +} +{ +add.f16x2 r2076, r680, r1000; +} +{ +mul.f16x2 r2079, r2076, r1931; +} +{ +add.f16x2 r2082, r2073, r2079; +} +{ +sub.f16x2 r2085, r372, r1332; +} +{ +mul.f16x2 r2088, r2085, r1930; +} +{ +sub.f16x2 r2091, r692, r1012; +} +{ +mul.f16x2 r2094, r2091, r1933; +} +{ +add.f16x2 r2097, r2088, r2094; +} +{ +add.f16x2 r2100, r2082, r2097; +} +st.local.u32 [rd3+120], r2100; +{ +add.f16x2 r2103, r372, r1332; +} +{ +mul.f16x2 r2106, r2103, r1927; +} +{ +add.f16x2 r2109, r52, r2106; +} +{ +add.f16x2 r2112, r692, r1012; +} +{ +mul.f16x2 r2115, r2112, r1929; +} +{ +add.f16x2 r2118, r2109, r2115; +} +{ +sub.f16x2 r2121, r360, r1320; +} +{ +mul.f16x2 r2124, r2121, r1928; +} +{ +sub.f16x2 r2127, r680, r1000; +} +{ +mul.f16x2 r2130, r2127, r1930; +} +{ +add.f16x2 r2133, r2124, r2130; +} +{ +add.f16x2 r2136, r2118, r2133; +} +st.local.u32 [rd3+44], r2136; +{ +add.f16x2 r2139, r372, r1332; +} +{ +mul.f16x2 r2142, r2139, r1927; +} +{ +add.f16x2 r2145, r52, r2142; +} +{ +add.f16x2 r2148, r692, r1012; +} +{ +mul.f16x2 r2151, r2148, r1929; +} +{ +add.f16x2 r2154, r2145, r2151; +} +{ +sub.f16x2 r2157, r360, r1320; +} +{ +mul.f16x2 r2160, r2157, r1928; +} +{ +sub.f16x2 r2163, r680, r1000; +} +{ +mul.f16x2 r2166, r2163, r1930; +} +{ +add.f16x2 r2169, r2160, r2166; +} +{ +sub.f16x2 r2172, r2154, r2169; +} +st.local.u32 [rd3+164], r2172; +{ +add.f16x2 r2175, r372, r1332; +} +{ +mul.f16x2 r2178, r2175, r1929; +} +{ +add.f16x2 r2181, r52, 
r2178; +} +{ +add.f16x2 r2184, r692, r1012; +} +{ +mul.f16x2 r2187, r2184, r1931; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +sub.f16x2 r2193, r360, r1320; +} +{ +mul.f16x2 r2196, r2193, r1930; +} +{ +sub.f16x2 r2199, r680, r1000; +} +{ +mul.f16x2 r2202, r2199, r1933; +} +{ +add.f16x2 r2205, r2196, r2202; +} +{ +add.f16x2 r2208, r2190, r2205; +} +st.local.u32 [rd3+84], r2208; +{ +add.f16x2 r2211, r372, r1332; +} +{ +mul.f16x2 r2214, r2211, r1929; +} +{ +add.f16x2 r2217, r52, r2214; +} +{ +add.f16x2 r2220, r692, r1012; +} +{ +mul.f16x2 r2223, r2220, r1931; +} +{ +add.f16x2 r2226, r2217, r2223; +} +{ +sub.f16x2 r2229, r360, r1320; +} +{ +mul.f16x2 r2232, r2229, r1930; +} +{ +sub.f16x2 r2235, r680, r1000; +} +{ +mul.f16x2 r2238, r2235, r1933; +} +{ +add.f16x2 r2241, r2232, r2238; +} +{ +sub.f16x2 r2244, r2226, r2241; +} +st.local.u32 [rd3+124], r2244; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2248, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2252, {low, high}; +} +{ +neg.f16x2 r2253, r2252; +} +{ +add.f16x2 r2255, r1677, r1725; +} +{ +add.f16x2 r2258, r88, r2255; +} +{ +add.f16x2 r2261, r1693, r1709; +} +{ +add.f16x2 r11248, r2258, r2261; +} +st.local.u32 [rd3+8], r11248; +{ +add.f16x2 r2267, r1683, r1731; +} +{ +add.f16x2 r2270, r232, r2267; +} +{ +add.f16x2 r2273, r1699, r1715; +} +{ +add.f16x2 r2276, r2270, r2273; +} +st.local.u32 [rd3+12], r2276; +{ +add.f16x2 r2279, r1677, r1725; +} +{ +mul.f16x2 r2282, r2279, 
r2247; +} +{ +add.f16x2 r2285, r88, r2282; +} +{ +add.f16x2 r2288, r1693, r1709; +} +{ +mul.f16x2 r2291, r2288, r2249; +} +{ +add.f16x2 r2294, r2285, r2291; +} +{ +sub.f16x2 r2297, r1683, r1731; +} +{ +mul.f16x2 r2300, r2297, r2248; +} +{ +sub.f16x2 r2303, r1699, r1715; +} +{ +mul.f16x2 r2306, r2303, r2250; +} +{ +add.f16x2 r2309, r2300, r2306; +} +{ +sub.f16x2 r2312, r2294, r2309; +} +st.local.u32 [rd3+48], r2312; +{ +add.f16x2 r2315, r1677, r1725; +} +{ +mul.f16x2 r2318, r2315, r2247; +} +{ +add.f16x2 r2321, r88, r2318; +} +{ +add.f16x2 r2324, r1693, r1709; +} +{ +mul.f16x2 r2327, r2324, r2249; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +sub.f16x2 r2333, r1683, r1731; +} +{ +mul.f16x2 r2336, r2333, r2248; +} +{ +sub.f16x2 r2339, r1699, r1715; +} +{ +mul.f16x2 r2342, r2339, r2250; +} +{ +add.f16x2 r2345, r2336, r2342; +} +{ +add.f16x2 r2348, r2330, r2345; +} +st.local.u32 [rd3+168], r2348; +{ +add.f16x2 r2351, r1677, r1725; +} +{ +mul.f16x2 r2354, r2351, r2249; +} +{ +add.f16x2 r2357, r88, r2354; +} +{ +add.f16x2 r2360, r1693, r1709; +} +{ +mul.f16x2 r2363, r2360, r2251; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +sub.f16x2 r2369, r1683, r1731; +} +{ +mul.f16x2 r2372, r2369, r2250; +} +{ +sub.f16x2 r2375, r1699, r1715; +} +{ +mul.f16x2 r2378, r2375, r2253; +} +{ +add.f16x2 r2381, r2372, r2378; +} +{ +sub.f16x2 r2384, r2366, r2381; +} +st.local.u32 [rd3+88], r2384; +{ +add.f16x2 r2387, r1677, r1725; +} +{ +mul.f16x2 r2390, r2387, r2249; +} +{ +add.f16x2 r2393, r88, r2390; +} +{ +add.f16x2 r2396, r1693, r1709; +} +{ +mul.f16x2 r2399, r2396, r2251; +} +{ +add.f16x2 r2402, r2393, r2399; +} +{ +sub.f16x2 r2405, r1683, r1731; +} +{ +mul.f16x2 r2408, r2405, r2250; +} +{ +sub.f16x2 r2411, r1699, r1715; +} +{ +mul.f16x2 r2414, r2411, r2253; +} +{ +add.f16x2 r2417, r2408, r2414; +} +{ +add.f16x2 r2420, r2402, r2417; +} +st.local.u32 [rd3+128], r2420; +{ +add.f16x2 r2423, r1683, r1731; +} +{ +mul.f16x2 r2426, r2423, r2247; +} +{ +add.f16x2 r2429, r232, r2426; +} +{ 
+add.f16x2 r2432, r1699, r1715; +} +{ +mul.f16x2 r2435, r2432, r2249; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +sub.f16x2 r2441, r1677, r1725; +} +{ +mul.f16x2 r2444, r2441, r2248; +} +{ +sub.f16x2 r2447, r1693, r1709; +} +{ +mul.f16x2 r2450, r2447, r2250; +} +{ +add.f16x2 r2453, r2444, r2450; +} +{ +add.f16x2 r2456, r2438, r2453; +} +st.local.u32 [rd3+52], r2456; +{ +add.f16x2 r2459, r1683, r1731; +} +{ +mul.f16x2 r2462, r2459, r2247; +} +{ +add.f16x2 r2465, r232, r2462; +} +{ +add.f16x2 r2468, r1699, r1715; +} +{ +mul.f16x2 r2471, r2468, r2249; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +sub.f16x2 r2477, r1677, r1725; +} +{ +mul.f16x2 r2480, r2477, r2248; +} +{ +sub.f16x2 r2483, r1693, r1709; +} +{ +mul.f16x2 r2486, r2483, r2250; +} +{ +add.f16x2 r2489, r2480, r2486; +} +{ +sub.f16x2 r2492, r2474, r2489; +} +st.local.u32 [rd3+172], r2492; +{ +add.f16x2 r2495, r1683, r1731; +} +{ +mul.f16x2 r2498, r2495, r2249; +} +{ +add.f16x2 r2501, r232, r2498; +} +{ +add.f16x2 r2504, r1699, r1715; +} +{ +mul.f16x2 r2507, r2504, r2251; +} +{ +add.f16x2 r2510, r2501, r2507; +} +{ +sub.f16x2 r2513, r1677, r1725; +} +{ +mul.f16x2 r2516, r2513, r2250; +} +{ +sub.f16x2 r2519, r1693, r1709; +} +{ +mul.f16x2 r2522, r2519, r2253; +} +{ +add.f16x2 r2525, r2516, r2522; +} +{ +add.f16x2 r2528, r2510, r2525; +} +st.local.u32 [rd3+92], r2528; +{ +add.f16x2 r2531, r1683, r1731; +} +{ +mul.f16x2 r2534, r2531, r2249; +} +{ +add.f16x2 r2537, r232, r2534; +} +{ +add.f16x2 r2540, r1699, r1715; +} +{ +mul.f16x2 r2543, r2540, r2251; +} +{ +add.f16x2 r2546, r2537, r2543; +} +{ +sub.f16x2 r2549, r1677, r1725; +} +{ +mul.f16x2 r2552, r2549, r2250; +} +{ +sub.f16x2 r2555, r1693, r1709; +} +{ +mul.f16x2 r2558, r2555, r2253; +} +{ +add.f16x2 r2561, r2552, r2558; +} +{ +sub.f16x2 r2564, r2546, r2561; +} +st.local.u32 [rd3+132], r2564; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; 
+cvt.rn.f16.f32 high, f216; +mov.b32 r2568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2572, {low, high}; +} +{ +neg.f16x2 r2573, r2572; +} +{ +add.f16x2 r2575, r1741, r1789; +} +{ +add.f16x2 r2578, r160, r2575; +} +{ +add.f16x2 r2581, r1757, r1773; +} +{ +add.f16x2 r2584, r2578, r2581; +} +st.local.u32 [rd3+16], r2584; +{ +add.f16x2 r2587, r1747, r1795; +} +{ +add.f16x2 r2590, r304, r2587; +} +{ +add.f16x2 r2593, r1763, r1779; +} +{ +add.f16x2 r2596, r2590, r2593; +} +st.local.u32 [rd3+20], r2596; +{ +add.f16x2 r2599, r1741, r1789; +} +{ +mul.f16x2 r2602, r2599, r2567; +} +{ +add.f16x2 r2605, r160, r2602; +} +{ +add.f16x2 r2608, r1757, r1773; +} +{ +mul.f16x2 r2611, r2608, r2569; +} +{ +add.f16x2 r2614, r2605, r2611; +} +{ +sub.f16x2 r2617, r1747, r1795; +} +{ +mul.f16x2 r2620, r2617, r2568; +} +{ +sub.f16x2 r2623, r1763, r1779; +} +{ +mul.f16x2 r2626, r2623, r2570; +} +{ +add.f16x2 r2629, r2620, r2626; +} +{ +sub.f16x2 r2632, r2614, r2629; +} +st.local.u32 [rd3+56], r2632; +{ +add.f16x2 r2635, r1741, r1789; +} +{ +mul.f16x2 r2638, r2635, r2567; +} +{ +add.f16x2 r2641, r160, r2638; +} +{ +add.f16x2 r2644, r1757, r1773; +} +{ +mul.f16x2 r2647, r2644, r2569; +} +{ +add.f16x2 r2650, r2641, r2647; +} +{ +sub.f16x2 r2653, r1747, r1795; +} +{ +mul.f16x2 r2656, r2653, r2568; +} +{ +sub.f16x2 r2659, r1763, r1779; +} +{ +mul.f16x2 r2662, r2659, r2570; +} +{ +add.f16x2 r2665, r2656, r2662; +} +{ +add.f16x2 r2668, r2650, r2665; +} +st.local.u32 [rd3+176], r2668; +{ +add.f16x2 r2671, r1741, r1789; +} +{ +mul.f16x2 r2674, r2671, r2569; +} +{ +add.f16x2 r2677, r160, r2674; +} +{ 
+add.f16x2 r2680, r1757, r1773; +} +{ +mul.f16x2 r2683, r2680, r2571; +} +{ +add.f16x2 r2686, r2677, r2683; +} +{ +sub.f16x2 r2689, r1747, r1795; +} +{ +mul.f16x2 r2692, r2689, r2570; +} +{ +sub.f16x2 r2695, r1763, r1779; +} +{ +mul.f16x2 r2698, r2695, r2573; +} +{ +add.f16x2 r2701, r2692, r2698; +} +{ +sub.f16x2 r2704, r2686, r2701; +} +st.local.u32 [rd3+96], r2704; +{ +add.f16x2 r2707, r1741, r1789; +} +{ +mul.f16x2 r2710, r2707, r2569; +} +{ +add.f16x2 r2713, r160, r2710; +} +{ +add.f16x2 r2716, r1757, r1773; +} +{ +mul.f16x2 r2719, r2716, r2571; +} +{ +add.f16x2 r2722, r2713, r2719; +} +{ +sub.f16x2 r2725, r1747, r1795; +} +{ +mul.f16x2 r2728, r2725, r2570; +} +{ +sub.f16x2 r2731, r1763, r1779; +} +{ +mul.f16x2 r2734, r2731, r2573; +} +{ +add.f16x2 r2737, r2728, r2734; +} +{ +add.f16x2 r2740, r2722, r2737; +} +st.local.u32 [rd3+136], r2740; +{ +add.f16x2 r2743, r1747, r1795; +} +{ +mul.f16x2 r2746, r2743, r2567; +} +{ +add.f16x2 r2749, r304, r2746; +} +{ +add.f16x2 r2752, r1763, r1779; +} +{ +mul.f16x2 r2755, r2752, r2569; +} +{ +add.f16x2 r2758, r2749, r2755; +} +{ +sub.f16x2 r2761, r1741, r1789; +} +{ +mul.f16x2 r2764, r2761, r2568; +} +{ +sub.f16x2 r2767, r1757, r1773; +} +{ +mul.f16x2 r2770, r2767, r2570; +} +{ +add.f16x2 r2773, r2764, r2770; +} +{ +add.f16x2 r2776, r2758, r2773; +} +st.local.u32 [rd3+60], r2776; +{ +add.f16x2 r2779, r1747, r1795; +} +{ +mul.f16x2 r2782, r2779, r2567; +} +{ +add.f16x2 r2785, r304, r2782; +} +{ +add.f16x2 r2788, r1763, r1779; +} +{ +mul.f16x2 r2791, r2788, r2569; +} +{ +add.f16x2 r2794, r2785, r2791; +} +{ +sub.f16x2 r2797, r1741, r1789; +} +{ +mul.f16x2 r2800, r2797, r2568; +} +{ +sub.f16x2 r2803, r1757, r1773; +} +{ +mul.f16x2 r2806, r2803, r2570; +} +{ +add.f16x2 r2809, r2800, r2806; +} +{ +sub.f16x2 r2812, r2794, r2809; +} +st.local.u32 [rd3+180], r2812; +{ +add.f16x2 r2815, r1747, r1795; +} +{ +mul.f16x2 r2818, r2815, r2569; +} +{ +add.f16x2 r2821, r304, r2818; +} +{ +add.f16x2 r2824, r1763, r1779; +} +{ +mul.f16x2 
r2827, r2824, r2571; +} +{ +add.f16x2 r2830, r2821, r2827; +} +{ +sub.f16x2 r2833, r1741, r1789; +} +{ +mul.f16x2 r2836, r2833, r2570; +} +{ +sub.f16x2 r2839, r1757, r1773; +} +{ +mul.f16x2 r2842, r2839, r2573; +} +{ +add.f16x2 r2845, r2836, r2842; +} +{ +add.f16x2 r2848, r2830, r2845; +} +st.local.u32 [rd3+100], r2848; +{ +add.f16x2 r2851, r1747, r1795; +} +{ +mul.f16x2 r2854, r2851, r2569; +} +{ +add.f16x2 r2857, r304, r2854; +} +{ +add.f16x2 r2860, r1763, r1779; +} +{ +mul.f16x2 r2863, r2860, r2571; +} +{ +add.f16x2 r2866, r2857, r2863; +} +{ +sub.f16x2 r2869, r1741, r1789; +} +{ +mul.f16x2 r2872, r2869, r2570; +} +{ +sub.f16x2 r2875, r1757, r1773; +} +{ +mul.f16x2 r2878, r2875, r2573; +} +{ +add.f16x2 r2881, r2872, r2878; +} +{ +sub.f16x2 r2884, r2866, r2881; +} +st.local.u32 [rd3+140], r2884; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2887, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2888, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2891, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2892, {low, high}; +} +{ +neg.f16x2 r2893, r2892; +} +{ +add.f16x2 r2895, r1805, r1853; +} +{ +add.f16x2 r2898, r196, r2895; +} +{ +add.f16x2 r2901, r1821, r1837; +} +{ +add.f16x2 r2904, r2898, r2901; +} +st.local.u32 [rd3+24], r2904; +{ +add.f16x2 r2907, r1811, r1859; +} +{ +add.f16x2 r2910, r340, r2907; +} +{ +add.f16x2 r2913, r1827, r1843; +} +{ +add.f16x2 r2916, r2910, r2913; +} +st.local.u32 [rd3+28], r2916; +{ +add.f16x2 r2919, r1805, r1853; +} +{ +mul.f16x2 r2922, r2919, r2887; +} +{ +add.f16x2 r2925, r196, r2922; +} +{ 
+add.f16x2 r2928, r1821, r1837; +} +{ +mul.f16x2 r2931, r2928, r2889; +} +{ +add.f16x2 r2934, r2925, r2931; +} +{ +sub.f16x2 r2937, r1811, r1859; +} +{ +mul.f16x2 r2940, r2937, r2888; +} +{ +sub.f16x2 r2943, r1827, r1843; +} +{ +mul.f16x2 r2946, r2943, r2890; +} +{ +add.f16x2 r2949, r2940, r2946; +} +{ +sub.f16x2 r2952, r2934, r2949; +} +st.local.u32 [rd3+64], r2952; +{ +add.f16x2 r2955, r1805, r1853; +} +{ +mul.f16x2 r2958, r2955, r2887; +} +{ +add.f16x2 r2961, r196, r2958; +} +{ +add.f16x2 r2964, r1821, r1837; +} +{ +mul.f16x2 r2967, r2964, r2889; +} +{ +add.f16x2 r2970, r2961, r2967; +} +{ +sub.f16x2 r2973, r1811, r1859; +} +{ +mul.f16x2 r2976, r2973, r2888; +} +{ +sub.f16x2 r2979, r1827, r1843; +} +{ +mul.f16x2 r2982, r2979, r2890; +} +{ +add.f16x2 r2985, r2976, r2982; +} +{ +add.f16x2 r2988, r2970, r2985; +} +st.local.u32 [rd3+184], r2988; +{ +add.f16x2 r2991, r1805, r1853; +} +{ +mul.f16x2 r2994, r2991, r2889; +} +{ +add.f16x2 r2997, r196, r2994; +} +{ +add.f16x2 r3000, r1821, r1837; +} +{ +mul.f16x2 r3003, r3000, r2891; +} +{ +add.f16x2 r3006, r2997, r3003; +} +{ +sub.f16x2 r3009, r1811, r1859; +} +{ +mul.f16x2 r3012, r3009, r2890; +} +{ +sub.f16x2 r3015, r1827, r1843; +} +{ +mul.f16x2 r3018, r3015, r2893; +} +{ +add.f16x2 r3021, r3012, r3018; +} +{ +sub.f16x2 r3024, r3006, r3021; +} +st.local.u32 [rd3+104], r3024; +{ +add.f16x2 r3027, r1805, r1853; +} +{ +mul.f16x2 r3030, r3027, r2889; +} +{ +add.f16x2 r3033, r196, r3030; +} +{ +add.f16x2 r3036, r1821, r1837; +} +{ +mul.f16x2 r3039, r3036, r2891; +} +{ +add.f16x2 r3042, r3033, r3039; +} +{ +sub.f16x2 r3045, r1811, r1859; +} +{ +mul.f16x2 r3048, r3045, r2890; +} +{ +sub.f16x2 r3051, r1827, r1843; +} +{ +mul.f16x2 r3054, r3051, r2893; +} +{ +add.f16x2 r3057, r3048, r3054; +} +{ +add.f16x2 r3060, r3042, r3057; +} +st.local.u32 [rd3+144], r3060; +{ +add.f16x2 r3063, r1811, r1859; +} +{ +mul.f16x2 r3066, r3063, r2887; +} +{ +add.f16x2 r3069, r340, r3066; +} +{ +add.f16x2 r3072, r1827, r1843; +} +{ +mul.f16x2 
r3075, r3072, r2889; +} +{ +add.f16x2 r3078, r3069, r3075; +} +{ +sub.f16x2 r3081, r1805, r1853; +} +{ +mul.f16x2 r3084, r3081, r2888; +} +{ +sub.f16x2 r3087, r1821, r1837; +} +{ +mul.f16x2 r3090, r3087, r2890; +} +{ +add.f16x2 r3093, r3084, r3090; +} +{ +add.f16x2 r3096, r3078, r3093; +} +st.local.u32 [rd3+68], r3096; +{ +add.f16x2 r3099, r1811, r1859; +} +{ +mul.f16x2 r3102, r3099, r2887; +} +{ +add.f16x2 r3105, r340, r3102; +} +{ +add.f16x2 r3108, r1827, r1843; +} +{ +mul.f16x2 r3111, r3108, r2889; +} +{ +add.f16x2 r3114, r3105, r3111; +} +{ +sub.f16x2 r3117, r1805, r1853; +} +{ +mul.f16x2 r3120, r3117, r2888; +} +{ +sub.f16x2 r3123, r1821, r1837; +} +{ +mul.f16x2 r3126, r3123, r2890; +} +{ +add.f16x2 r3129, r3120, r3126; +} +{ +sub.f16x2 r3132, r3114, r3129; +} +st.local.u32 [rd3+188], r3132; +{ +add.f16x2 r3135, r1811, r1859; +} +{ +mul.f16x2 r3138, r3135, r2889; +} +{ +add.f16x2 r3141, r340, r3138; +} +{ +add.f16x2 r3144, r1827, r1843; +} +{ +mul.f16x2 r3147, r3144, r2891; +} +{ +add.f16x2 r3150, r3141, r3147; +} +{ +sub.f16x2 r3153, r1805, r1853; +} +{ +mul.f16x2 r3156, r3153, r2890; +} +{ +sub.f16x2 r3159, r1821, r1837; +} +{ +mul.f16x2 r3162, r3159, r2893; +} +{ +add.f16x2 r3165, r3156, r3162; +} +{ +add.f16x2 r3168, r3150, r3165; +} +st.local.u32 [rd3+108], r3168; +{ +add.f16x2 r3171, r1811, r1859; +} +{ +mul.f16x2 r3174, r3171, r2889; +} +{ +add.f16x2 r3177, r340, r3174; +} +{ +add.f16x2 r3180, r1827, r1843; +} +{ +mul.f16x2 r3183, r3180, r2891; +} +{ +add.f16x2 r3186, r3177, r3183; +} +{ +sub.f16x2 r3189, r1805, r1853; +} +{ +mul.f16x2 r3192, r3189, r2890; +} +{ +sub.f16x2 r3195, r1821, r1837; +} +{ +mul.f16x2 r3198, r3195, r2893; +} +{ +add.f16x2 r3201, r3192, r3198; +} +{ +sub.f16x2 r3204, r3186, r3201; +} +st.local.u32 [rd3+148], r3204; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3208, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3209, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3210, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3212, {low, high}; +} +{ +neg.f16x2 r3213, r3212; +} +{ +add.f16x2 r3215, r1869, r1917; +} +{ +add.f16x2 r3218, r124, r3215; +} +{ +add.f16x2 r3221, r1885, r1901; +} +{ +add.f16x2 r3224, r3218, r3221; +} +st.local.u32 [rd3+32], r3224; +{ +add.f16x2 r3227, r1875, r1923; +} +{ +add.f16x2 r3230, r268, r3227; +} +{ +add.f16x2 r3233, r1891, r1907; +} +{ +add.f16x2 r3236, r3230, r3233; +} +st.local.u32 [rd3+36], r3236; +{ +add.f16x2 r3239, r1869, r1917; +} +{ +mul.f16x2 r3242, r3239, r3207; +} +{ +add.f16x2 r3245, r124, r3242; +} +{ +add.f16x2 r3248, r1885, r1901; +} +{ +mul.f16x2 r3251, r3248, r3209; +} +{ +add.f16x2 r3254, r3245, r3251; +} +{ +sub.f16x2 r3257, r1875, r1923; +} +{ +mul.f16x2 r3260, r3257, r3208; +} +{ +sub.f16x2 r3263, r1891, r1907; +} +{ +mul.f16x2 r3266, r3263, r3210; +} +{ +add.f16x2 r3269, r3260, r3266; +} +{ +sub.f16x2 r3272, r3254, r3269; +} +st.local.u32 [rd3+72], r3272; +{ +add.f16x2 r3275, r1869, r1917; +} +{ +mul.f16x2 r3278, r3275, r3207; +} +{ +add.f16x2 r3281, r124, r3278; +} +{ +add.f16x2 r3284, r1885, r1901; +} +{ +mul.f16x2 r3287, r3284, r3209; +} +{ +add.f16x2 r3290, r3281, r3287; +} +{ +sub.f16x2 r3293, r1875, r1923; +} +{ +mul.f16x2 r3296, r3293, r3208; +} +{ +sub.f16x2 r3299, r1891, r1907; +} +{ +mul.f16x2 r3302, r3299, r3210; +} +{ +add.f16x2 r3305, r3296, r3302; +} +{ +add.f16x2 r3308, r3290, r3305; +} +st.local.u32 [rd3+192], r3308; +{ +add.f16x2 r3311, r1869, r1917; +} +{ +mul.f16x2 r3314, r3311, r3209; +} +{ +add.f16x2 r3317, r124, r3314; +} +{ +add.f16x2 r3320, r1885, r1901; +} +{ +mul.f16x2 r3323, 
r3320, r3211; +} +{ +add.f16x2 r3326, r3317, r3323; +} +{ +sub.f16x2 r3329, r1875, r1923; +} +{ +mul.f16x2 r3332, r3329, r3210; +} +{ +sub.f16x2 r3335, r1891, r1907; +} +{ +mul.f16x2 r3338, r3335, r3213; +} +{ +add.f16x2 r3341, r3332, r3338; +} +{ +sub.f16x2 r3344, r3326, r3341; +} +st.local.u32 [rd3+112], r3344; +{ +add.f16x2 r3347, r1869, r1917; +} +{ +mul.f16x2 r3350, r3347, r3209; +} +{ +add.f16x2 r3353, r124, r3350; +} +{ +add.f16x2 r3356, r1885, r1901; +} +{ +mul.f16x2 r3359, r3356, r3211; +} +{ +add.f16x2 r3362, r3353, r3359; +} +{ +sub.f16x2 r3365, r1875, r1923; +} +{ +mul.f16x2 r3368, r3365, r3210; +} +{ +sub.f16x2 r3371, r1891, r1907; +} +{ +mul.f16x2 r3374, r3371, r3213; +} +{ +add.f16x2 r3377, r3368, r3374; +} +{ +add.f16x2 r3380, r3362, r3377; +} +st.local.u32 [rd3+152], r3380; +{ +add.f16x2 r3383, r1875, r1923; +} +{ +mul.f16x2 r3386, r3383, r3207; +} +{ +add.f16x2 r3389, r268, r3386; +} +{ +add.f16x2 r3392, r1891, r1907; +} +{ +mul.f16x2 r3395, r3392, r3209; +} +{ +add.f16x2 r3398, r3389, r3395; +} +{ +sub.f16x2 r3401, r1869, r1917; +} +{ +mul.f16x2 r3404, r3401, r3208; +} +{ +sub.f16x2 r3407, r1885, r1901; +} +{ +mul.f16x2 r3410, r3407, r3210; +} +{ +add.f16x2 r3413, r3404, r3410; +} +{ +add.f16x2 r3416, r3398, r3413; +} +st.local.u32 [rd3+76], r3416; +{ +add.f16x2 r3419, r1875, r1923; +} +{ +mul.f16x2 r3422, r3419, r3207; +} +{ +add.f16x2 r3425, r268, r3422; +} +{ +add.f16x2 r3428, r1891, r1907; +} +{ +mul.f16x2 r3431, r3428, r3209; +} +{ +add.f16x2 r3434, r3425, r3431; +} +{ +sub.f16x2 r3437, r1869, r1917; +} +{ +mul.f16x2 r3440, r3437, r3208; +} +{ +sub.f16x2 r3443, r1885, r1901; +} +{ +mul.f16x2 r3446, r3443, r3210; +} +{ +add.f16x2 r3449, r3440, r3446; +} +{ +sub.f16x2 r3452, r3434, r3449; +} +st.local.u32 [rd3+196], r3452; +{ +add.f16x2 r3455, r1875, r1923; +} +{ +mul.f16x2 r3458, r3455, r3209; +} +{ +add.f16x2 r3461, r268, r3458; +} +{ +add.f16x2 r3464, r1891, r1907; +} +{ +mul.f16x2 r3467, r3464, r3211; +} +{ +add.f16x2 r3470, r3461, r3467; 
+} +{ +sub.f16x2 r3473, r1869, r1917; +} +{ +mul.f16x2 r3476, r3473, r3210; +} +{ +sub.f16x2 r3479, r1885, r1901; +} +{ +mul.f16x2 r3482, r3479, r3213; +} +{ +add.f16x2 r3485, r3476, r3482; +} +{ +add.f16x2 r3488, r3470, r3485; +} +st.local.u32 [rd3+116], r3488; +{ +add.f16x2 r3491, r1875, r1923; +} +{ +mul.f16x2 r3494, r3491, r3209; +} +{ +add.f16x2 r3497, r268, r3494; +} +{ +add.f16x2 r3500, r1891, r1907; +} +{ +mul.f16x2 r3503, r3500, r3211; +} +{ +add.f16x2 r3506, r3497, r3503; +} +{ +sub.f16x2 r3509, r1869, r1917; +} +{ +mul.f16x2 r3512, r3509, r3210; +} +{ +sub.f16x2 r3515, r1885, r1901; +} +{ +mul.f16x2 r3518, r3515, r3213; +} +{ +add.f16x2 r3521, r3512, r3518; +} +{ +sub.f16x2 r3524, r3506, r3521; +} +st.local.u32 [rd3+156], r3524; +mov.u32 r3532, %tid.x; +mul.wide.u32 rd9, r3532, -776530087; +shr.u64 rd10, rd9, 41; +cvt.u32.u64 r4, rd10; +mul.lo.s32 r3533, r4, 625; +sub.s32 r3, r3532, r3533; +cvt.rn.f32.u32 f221, r3; +mul.f32 f222, f221, 0f39D2D427; +cos.approx.f32 f217, f222; +sin.approx.f32 f223, f222; +neg.f32 f218, f223; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r11250, {low, high}; +} +mov.u32 r11249, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11250; +mov.b32 r3554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11250; +mov.b32 r3556, {high, high}; +} +bra.uni LBB1_1; +LBB1_2: +ld.local.u32 r11248, [rd5+60]; +LBB1_1: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11250; +mov.b32 r3534, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11250; +mov.b32 r3536, {high, high}; +} +mul.wide.u32 rd11, r11249, 8; +add.s64 rd12, rd3, rd11; +add.s64 rd5, rd12, 4; +ld.local.u32 r3539, [rd12+4]; +{ +mul.f16x2 r3538, r3539, r3536; +} +{ +neg.f16x2 r3541, r3538; +} +{ +fma.rn.f16x2 r3543, r11248, r3534, r3541; +} +st.local.u32 [rd12], r3543; +{ +mul.f16x2 r3547, r11248, r3536; +} +{ +fma.rn.f16x2 r3550, r3539, r3534, r3547; +} +st.local.u32 [rd12+4], r3550; +mov.f32 f238, 
0fBF800000; +mov.f32 f239, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3558, {low, high}; +} +{ +mul.f16x2 r3559, r3556, r3558; +} +{ +mul.f16x2 r3562, r11250, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11250; +mov.b32 r3565, {high, low}; +} +{ +fma.rn.f16x2 r3567, r3559, r3565, r3562; +} +ld.local.u32 r3585, [rd12+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3571, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3573, {high, high}; +} +ld.local.u32 r3588, [rd12+12]; +{ +mul.f16x2 r3575, r3588, r3573; +} +{ +neg.f16x2 r3578, r3575; +} +{ +fma.rn.f16x2 r3580, r3585, r3571, r3578; +} +st.local.u32 [rd12+8], r3580; +{ +mul.f16x2 r3584, r3585, r3573; +} +{ +fma.rn.f16x2 r3587, r3588, r3571, r3584; +} +st.local.u32 [rd12+12], r3587; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3595, {low, high}; +} +{ +mul.f16x2 r3596, r3556, r3595; +} +{ +mul.f16x2 r3599, r3567, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3567; +mov.b32 r3602, {high, low}; +} +{ +fma.rn.f16x2 r3604, r3596, r3602, r3599; +} +ld.local.u32 r3622, [rd12+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3608, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3610, {high, high}; +} +ld.local.u32 r3625, [rd12+20]; +{ +mul.f16x2 r3612, r3625, r3610; +} +{ +neg.f16x2 r3615, r3612; +} +{ +fma.rn.f16x2 r3617, r3622, r3608, r3615; +} +st.local.u32 [rd12+16], r3617; +{ +mul.f16x2 r3621, r3622, r3610; +} +{ +fma.rn.f16x2 r3624, r3625, r3608, r3621; +} +st.local.u32 [rd12+20], r3624; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3632, {low, high}; +} +{ +mul.f16x2 r3633, r3556, r3632; +} +{ +mul.f16x2 r3636, r3604, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3604; +mov.b32 r3639, {high, low}; +} +{ +fma.rn.f16x2 r3641, r3633, 
r3639, r3636; +} +ld.local.u32 r3659, [rd12+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3647, {high, high}; +} +ld.local.u32 r3662, [rd12+28]; +{ +mul.f16x2 r3649, r3662, r3647; +} +{ +neg.f16x2 r3652, r3649; +} +{ +fma.rn.f16x2 r3654, r3659, r3645, r3652; +} +st.local.u32 [rd12+24], r3654; +{ +mul.f16x2 r3658, r3659, r3647; +} +{ +fma.rn.f16x2 r3661, r3662, r3645, r3658; +} +st.local.u32 [rd12+28], r3661; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3669, {low, high}; +} +{ +mul.f16x2 r3670, r3556, r3669; +} +{ +mul.f16x2 r3673, r3641, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3641; +mov.b32 r3676, {high, low}; +} +{ +fma.rn.f16x2 r3678, r3670, r3676, r3673; +} +ld.local.u32 r3692, [rd12+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3684, {high, high}; +} +ld.local.u32 r3699, [rd12+36]; +{ +mul.f16x2 r3686, r3699, r3684; +} +{ +neg.f16x2 r3689, r3686; +} +{ +fma.rn.f16x2 r3691, r3692, r3682, r3689; +} +st.local.u32 [rd12+32], r3691; +{ +mul.f16x2 r3695, r3692, r3684; +} +{ +fma.rn.f16x2 r3698, r3699, r3682, r3695; +} +st.local.u32 [rd12+36], r3698; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3706, {low, high}; +} +{ +mul.f16x2 r3707, r3556, r3706; +} +{ +mul.f16x2 r3710, r3678, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3678; +mov.b32 r3713, {high, low}; +} +{ +fma.rn.f16x2 r3715, r3707, r3713, r3710; +} +ld.local.u32 r3729, [rd12+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3721, {high, high}; +} +ld.local.u32 r3724, [rd12+44]; +{ +mul.f16x2 r3723, r3724, r3721; +} +{ +neg.f16x2 r3726, r3723; +} +{ +fma.rn.f16x2 r3728, 
r3729, r3719, r3726; +} +st.local.u32 [rd12+40], r3728; +{ +mul.f16x2 r3732, r3729, r3721; +} +{ +fma.rn.f16x2 r3735, r3724, r3719, r3732; +} +st.local.u32 [rd12+44], r3735; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3743, {low, high}; +} +{ +mul.f16x2 r3744, r3556, r3743; +} +{ +mul.f16x2 r3747, r3715, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3715; +mov.b32 r3750, {high, low}; +} +{ +fma.rn.f16x2 r3752, r3744, r3750, r3747; +} +ld.local.u32 r3766, [rd12+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3758, {high, high}; +} +ld.local.u32 r3761, [rd12+52]; +{ +mul.f16x2 r3760, r3761, r3758; +} +{ +neg.f16x2 r3763, r3760; +} +{ +fma.rn.f16x2 r3765, r3766, r3756, r3763; +} +st.local.u32 [rd12+48], r3765; +{ +mul.f16x2 r3769, r3766, r3758; +} +{ +fma.rn.f16x2 r3772, r3761, r3756, r3769; +} +st.local.u32 [rd12+52], r3772; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3781, r3556, r3780; +} +{ +mul.f16x2 r3784, r3752, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3752; +mov.b32 r3787, {high, low}; +} +{ +fma.rn.f16x2 r3789, r3781, r3787, r3784; +} +ld.local.u32 r3803, [rd12+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3795, {high, high}; +} +ld.local.u32 r3798, [rd12+60]; +{ +mul.f16x2 r3797, r3798, r3795; +} +{ +neg.f16x2 r3800, r3797; +} +{ +fma.rn.f16x2 r3802, r3803, r3793, r3800; +} +st.local.u32 [rd12+56], r3802; +{ +mul.f16x2 r3806, r3803, r3795; +} +{ +fma.rn.f16x2 r3809, r3798, r3793, r3806; +} +st.local.u32 [rd12+60], r3809; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3817, {low, high}; +} +{ +mul.f16x2 r3818, r3556, r3817; +} +{ +mul.f16x2 r3821, 
r3789, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3789; +mov.b32 r3824, {high, low}; +} +{ +fma.rn.f16x2 r11250, r3818, r3824, r3821; +} +add.s32 r11249, r11249, 8; +setp.eq.s32 p1, r11249, 25; +@p1 bra LBB1_3; +bra.uni LBB1_2; +LBB1_3: +shl.b32 r7338, r1, 2; +mov.u32 r7339, %50; +add.s32 r7340, r7339, r7338; +mad.lo.s32 r12, r4, 62500, r7340; +barrier.sync 0; +mad.lo.s32 r7341, r3, 100, r12; +ld.local.u32 r7342, [rd3]; +st.shared.u32 [r7341], r7342; +ld.local.u32 r7343, [rd4+4]; +st.shared.u32 [r7341+4], r7343; +ld.local.u32 r7344, [rd4+12]; +st.shared.u32 [r7341+8], r7344; +ld.local.u32 r7345, [rd4+20]; +st.shared.u32 [r7341+12], r7345; +ld.local.u32 r7346, [rd4+28]; +st.shared.u32 [r7341+16], r7346; +ld.local.u32 r7347, [rd4+36]; +st.shared.u32 [r7341+20], r7347; +ld.local.u32 r7348, [rd4+44]; +st.shared.u32 [r7341+24], r7348; +ld.local.u32 r7349, [rd4+52]; +st.shared.u32 [r7341+28], r7349; +ld.local.u32 r7350, [rd4+60]; +st.shared.u32 [r7341+32], r7350; +ld.local.u32 r7351, [rd4+68]; +st.shared.u32 [r7341+36], r7351; +ld.local.u32 r7352, [rd4+76]; +st.shared.u32 [r7341+40], r7352; +ld.local.u32 r7353, [rd4+84]; +st.shared.u32 [r7341+44], r7353; +ld.local.u32 r7354, [rd4+92]; +st.shared.u32 [r7341+48], r7354; +ld.local.u32 r7355, [rd4+100]; +st.shared.u32 [r7341+52], r7355; +ld.local.u32 r7356, [rd4+108]; +st.shared.u32 [r7341+56], r7356; +ld.local.u32 r7357, [rd4+116]; +st.shared.u32 [r7341+60], r7357; +ld.local.u32 r7358, [rd4+124]; +st.shared.u32 [r7341+64], r7358; +ld.local.u32 r7359, [rd4+132]; +st.shared.u32 [r7341+68], r7359; +ld.local.u32 r7360, [rd4+140]; +st.shared.u32 [r7341+72], r7360; +ld.local.u32 r7361, [rd4+148]; +st.shared.u32 [r7341+76], r7361; +ld.local.u32 r7362, [rd4+156]; +st.shared.u32 [r7341+80], r7362; +ld.local.u32 r7363, [rd4+164]; +st.shared.u32 [r7341+84], r7363; +ld.local.u32 r7364, [rd4+172]; +st.shared.u32 [r7341+88], r7364; +ld.local.u32 r7365, [rd4+180]; +st.shared.u32 [r7341+92], r7365; +ld.local.u32 r7366, 
[rd4+188]; +st.shared.u32 [r7341+96], r7366; +barrier.sync 0; +mad.lo.s32 r13, r3, -96, r7341; +ld.shared.u32 r3842, [r13]; +ld.shared.u32 r4162, [r13+2500]; +ld.shared.u32 r4482, [r13+5000]; +ld.shared.u32 r4802, [r13+7500]; +ld.shared.u32 r5122, [r13+10000]; +ld.shared.u32 r3839, [r13+12500]; +ld.shared.u32 r4159, [r13+15000]; +ld.shared.u32 r4479, [r13+17500]; +ld.shared.u32 r4799, [r13+20000]; +ld.shared.u32 r5119, [r13+22500]; +ld.shared.u32 r3845, [r13+25000]; +ld.shared.u32 r4165, [r13+27500]; +ld.shared.u32 r4485, [r13+30000]; +ld.shared.u32 r4805, [r13+32500]; +ld.shared.u32 r5125, [r13+35000]; +ld.shared.u32 r3846, [r13+37500]; +ld.shared.u32 r4166, [r13+40000]; +ld.shared.u32 r4486, [r13+42500]; +ld.shared.u32 r4806, [r13+45000]; +ld.shared.u32 r5126, [r13+47500]; +ld.shared.u32 r3840, [r13+50000]; +ld.shared.u32 r4160, [r13+52500]; +ld.shared.u32 r4480, [r13+55000]; +ld.shared.u32 r4800, [r13+57500]; +ld.shared.u32 r5120, [r13+60000]; +barrier.sync 0; +ld.local.u32 r7367, [rd4]; +st.shared.u32 [r7341], r7367; +ld.local.u32 r7368, [rd4+8]; +st.shared.u32 [r7341+4], r7368; +ld.local.u32 r7369, [rd4+16]; +st.shared.u32 [r7341+8], r7369; +ld.local.u32 r7370, [rd4+24]; +st.shared.u32 [r7341+12], r7370; +ld.local.u32 r7371, [rd4+32]; +st.shared.u32 [r7341+16], r7371; +ld.local.u32 r7372, [rd4+40]; +st.shared.u32 [r7341+20], r7372; +ld.local.u32 r7373, [rd4+48]; +st.shared.u32 [r7341+24], r7373; +ld.local.u32 r7374, [rd4+56]; +st.shared.u32 [r7341+28], r7374; +ld.local.u32 r7375, [rd4+64]; +st.shared.u32 [r7341+32], r7375; +ld.local.u32 r7376, [rd4+72]; +st.shared.u32 [r7341+36], r7376; +ld.local.u32 r7377, [rd4+80]; +st.shared.u32 [r7341+40], r7377; +ld.local.u32 r7378, [rd4+88]; +st.shared.u32 [r7341+44], r7378; +ld.local.u32 r7379, [rd4+96]; +st.shared.u32 [r7341+48], r7379; +ld.local.u32 r7380, [rd4+104]; +st.shared.u32 [r7341+52], r7380; +ld.local.u32 r7381, [rd4+112]; +st.shared.u32 [r7341+56], r7381; +ld.local.u32 r7382, [rd4+120]; +st.shared.u32 
[r7341+60], r7382; +ld.local.u32 r7383, [rd4+128]; +st.shared.u32 [r7341+64], r7383; +ld.local.u32 r7384, [rd4+136]; +st.shared.u32 [r7341+68], r7384; +ld.local.u32 r7385, [rd4+144]; +st.shared.u32 [r7341+72], r7385; +ld.local.u32 r7386, [rd4+152]; +st.shared.u32 [r7341+76], r7386; +ld.local.u32 r7387, [rd4+160]; +st.shared.u32 [r7341+80], r7387; +ld.local.u32 r7388, [rd4+168]; +st.shared.u32 [r7341+84], r7388; +ld.local.u32 r7389, [rd4+176]; +st.shared.u32 [r7341+88], r7389; +ld.local.u32 r7390, [rd4+184]; +st.shared.u32 [r7341+92], r7390; +ld.local.u32 r7391, [rd4+192]; +st.shared.u32 [r7341+96], r7391; +barrier.sync 0; +ld.shared.u32 r3854, [r13]; +ld.shared.u32 r4174, [r13+2500]; +ld.shared.u32 r4494, [r13+5000]; +ld.shared.u32 r4814, [r13+7500]; +ld.shared.u32 r5134, [r13+10000]; +ld.shared.u32 r3851, [r13+12500]; +ld.shared.u32 r4171, [r13+15000]; +ld.shared.u32 r4491, [r13+17500]; +ld.shared.u32 r4811, [r13+20000]; +ld.shared.u32 r5131, [r13+22500]; +ld.shared.u32 r3857, [r13+25000]; +ld.shared.u32 r4177, [r13+27500]; +ld.shared.u32 r4497, [r13+30000]; +ld.shared.u32 r4817, [r13+32500]; +ld.shared.u32 r5137, [r13+35000]; +ld.shared.u32 r3858, [r13+37500]; +ld.shared.u32 r4178, [r13+40000]; +ld.shared.u32 r4498, [r13+42500]; +ld.shared.u32 r4818, [r13+45000]; +ld.shared.u32 r5138, [r13+47500]; +ld.shared.u32 r3852, [r13+50000]; +ld.shared.u32 r4172, [r13+52500]; +ld.shared.u32 r4492, [r13+55000]; +ld.shared.u32 r4812, [r13+57500]; +ld.shared.u32 r5132, [r13+60000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3830, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3831, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3832, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3833, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r3834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3835, {low, high}; +} +{ +neg.f16x2 r3836, r3835; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3841, r3844; +} +{ +add.f16x2 r3850, r3851, r3852; +} +{ +add.f16x2 r3853, r3854, r3850; +} +{ +add.f16x2 r3856, r3857, r3858; +} +{ +add.f16x2 r3859, r3853, r3856; +} +{ +add.f16x2 r3862, r3839, r3840; +} +{ +mul.f16x2 r3865, r3862, r3830; +} +{ +add.f16x2 r3868, r3842, r3865; +} +{ +add.f16x2 r3871, r3845, r3846; +} +{ +mul.f16x2 r3874, r3871, r3832; +} +{ +add.f16x2 r3877, r3868, r3874; +} +{ +sub.f16x2 r3880, r3851, r3852; +} +{ +mul.f16x2 r3883, r3880, r3831; +} +{ +sub.f16x2 r3886, r3857, r3858; +} +{ +mul.f16x2 r3889, r3886, r3833; +} +{ +add.f16x2 r3892, r3883, r3889; +} +{ +sub.f16x2 r3895, r3877, r3892; +} +{ +add.f16x2 r3898, r3839, r3840; +} +{ +mul.f16x2 r3901, r3898, r3830; +} +{ +add.f16x2 r3904, r3842, r3901; +} +{ +add.f16x2 r3907, r3845, r3846; +} +{ +mul.f16x2 r3910, r3907, r3832; +} +{ +add.f16x2 r3913, r3904, r3910; +} +{ +sub.f16x2 r3916, r3851, r3852; +} +{ +mul.f16x2 r3919, r3916, r3831; +} +{ +sub.f16x2 r3922, r3857, r3858; +} +{ +mul.f16x2 r3925, r3922, r3833; +} +{ +add.f16x2 r3928, r3919, r3925; +} +{ +add.f16x2 r3931, r3913, r3928; +} +{ +add.f16x2 r3934, r3839, r3840; +} +{ +mul.f16x2 r3937, r3934, r3832; +} +{ +add.f16x2 r3940, r3842, r3937; +} +{ +add.f16x2 r3943, r3845, r3846; +} +{ +mul.f16x2 r3946, r3943, r3834; +} +{ +add.f16x2 r3949, r3940, r3946; +} +{ +sub.f16x2 r3952, r3851, r3852; +} +{ +mul.f16x2 r3955, r3952, r3833; +} +{ +sub.f16x2 r3958, r3857, r3858; +} +{ +mul.f16x2 r3961, r3958, r3836; +} +{ +add.f16x2 r3964, r3955, r3961; +} +{ +sub.f16x2 r3967, r3949, r3964; +} +{ +add.f16x2 r3970, r3839, r3840; +} +{ +mul.f16x2 r3973, r3970, r3832; +} +{ +add.f16x2 r3976, r3842, r3973; +} +{ 
+add.f16x2 r3979, r3845, r3846; +} +{ +mul.f16x2 r3982, r3979, r3834; +} +{ +add.f16x2 r3985, r3976, r3982; +} +{ +sub.f16x2 r3988, r3851, r3852; +} +{ +mul.f16x2 r3991, r3988, r3833; +} +{ +sub.f16x2 r3994, r3857, r3858; +} +{ +mul.f16x2 r3997, r3994, r3836; +} +{ +add.f16x2 r4000, r3991, r3997; +} +{ +add.f16x2 r4003, r3985, r4000; +} +{ +add.f16x2 r4006, r3851, r3852; +} +{ +mul.f16x2 r4009, r4006, r3830; +} +{ +add.f16x2 r4012, r3854, r4009; +} +{ +add.f16x2 r4015, r3857, r3858; +} +{ +mul.f16x2 r4018, r4015, r3832; +} +{ +add.f16x2 r4021, r4012, r4018; +} +{ +sub.f16x2 r4024, r3839, r3840; +} +{ +mul.f16x2 r4027, r4024, r3831; +} +{ +sub.f16x2 r4030, r3845, r3846; +} +{ +mul.f16x2 r4033, r4030, r3833; +} +{ +add.f16x2 r4036, r4027, r4033; +} +{ +add.f16x2 r4039, r4021, r4036; +} +{ +add.f16x2 r4042, r3851, r3852; +} +{ +mul.f16x2 r4045, r4042, r3830; +} +{ +add.f16x2 r4048, r3854, r4045; +} +{ +add.f16x2 r4051, r3857, r3858; +} +{ +mul.f16x2 r4054, r4051, r3832; +} +{ +add.f16x2 r4057, r4048, r4054; +} +{ +sub.f16x2 r4060, r3839, r3840; +} +{ +mul.f16x2 r4063, r4060, r3831; +} +{ +sub.f16x2 r4066, r3845, r3846; +} +{ +mul.f16x2 r4069, r4066, r3833; +} +{ +add.f16x2 r4072, r4063, r4069; +} +{ +sub.f16x2 r4075, r4057, r4072; +} +{ +add.f16x2 r4078, r3851, r3852; +} +{ +mul.f16x2 r4081, r4078, r3832; +} +{ +add.f16x2 r4084, r3854, r4081; +} +{ +add.f16x2 r4087, r3857, r3858; +} +{ +mul.f16x2 r4090, r4087, r3834; +} +{ +add.f16x2 r4093, r4084, r4090; +} +{ +sub.f16x2 r4096, r3839, r3840; +} +{ +mul.f16x2 r4099, r4096, r3833; +} +{ +sub.f16x2 r4102, r3845, r3846; +} +{ +mul.f16x2 r4105, r4102, r3836; +} +{ +add.f16x2 r4108, r4099, r4105; +} +{ +add.f16x2 r4111, r4093, r4108; +} +{ +add.f16x2 r4114, r3851, r3852; +} +{ +mul.f16x2 r4117, r4114, r3832; +} +{ +add.f16x2 r4120, r3854, r4117; +} +{ +add.f16x2 r4123, r3857, r3858; +} +{ +mul.f16x2 r4126, r4123, r3834; +} +{ +add.f16x2 r4129, r4120, r4126; +} +{ +sub.f16x2 r4132, r3839, r3840; +} +{ +mul.f16x2 r4135, 
r4132, r3833; +} +{ +sub.f16x2 r4138, r3845, r3846; +} +{ +mul.f16x2 r4141, r4138, r3836; +} +{ +add.f16x2 r4144, r4135, r4141; +} +{ +sub.f16x2 r4147, r4129, r4144; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4150, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4151, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4152, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4153, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4155, {low, high}; +} +{ +neg.f16x2 r4156, r4155; +} +{ +add.f16x2 r4158, r4159, r4160; +} +{ +add.f16x2 r4161, r4162, r4158; +} +{ +add.f16x2 r4164, r4165, r4166; +} +{ +add.f16x2 r4167, r4161, r4164; +} +{ +add.f16x2 r4170, r4171, r4172; +} +{ +add.f16x2 r4173, r4174, r4170; +} +{ +add.f16x2 r4176, r4177, r4178; +} +{ +add.f16x2 r4179, r4173, r4176; +} +{ +add.f16x2 r4182, r4159, r4160; +} +{ +mul.f16x2 r4185, r4182, r4150; +} +{ +add.f16x2 r4188, r4162, r4185; +} +{ +add.f16x2 r4191, r4165, r4166; +} +{ +mul.f16x2 r4194, r4191, r4152; +} +{ +add.f16x2 r4197, r4188, r4194; +} +{ +sub.f16x2 r4200, r4171, r4172; +} +{ +mul.f16x2 r4203, r4200, r4151; +} +{ +sub.f16x2 r4206, r4177, r4178; +} +{ +mul.f16x2 r4209, r4206, r4153; +} +{ +add.f16x2 r4212, r4203, r4209; +} +{ +sub.f16x2 r4215, r4197, r4212; +} +{ +add.f16x2 r4218, r4159, r4160; +} +{ +mul.f16x2 r4221, r4218, r4150; +} +{ +add.f16x2 r4224, r4162, r4221; +} +{ +add.f16x2 r4227, r4165, r4166; +} +{ +mul.f16x2 r4230, r4227, r4152; +} +{ +add.f16x2 r4233, r4224, r4230; +} +{ +sub.f16x2 r4236, r4171, r4172; +} +{ +mul.f16x2 r4239, r4236, r4151; +} +{ +sub.f16x2 r4242, r4177, r4178; +} +{ +mul.f16x2 r4245, 
r4242, r4153; +} +{ +add.f16x2 r4248, r4239, r4245; +} +{ +add.f16x2 r4251, r4233, r4248; +} +{ +add.f16x2 r4254, r4159, r4160; +} +{ +mul.f16x2 r4257, r4254, r4152; +} +{ +add.f16x2 r4260, r4162, r4257; +} +{ +add.f16x2 r4263, r4165, r4166; +} +{ +mul.f16x2 r4266, r4263, r4154; +} +{ +add.f16x2 r4269, r4260, r4266; +} +{ +sub.f16x2 r4272, r4171, r4172; +} +{ +mul.f16x2 r4275, r4272, r4153; +} +{ +sub.f16x2 r4278, r4177, r4178; +} +{ +mul.f16x2 r4281, r4278, r4156; +} +{ +add.f16x2 r4284, r4275, r4281; +} +{ +sub.f16x2 r4287, r4269, r4284; +} +{ +add.f16x2 r4290, r4159, r4160; +} +{ +mul.f16x2 r4293, r4290, r4152; +} +{ +add.f16x2 r4296, r4162, r4293; +} +{ +add.f16x2 r4299, r4165, r4166; +} +{ +mul.f16x2 r4302, r4299, r4154; +} +{ +add.f16x2 r4305, r4296, r4302; +} +{ +sub.f16x2 r4308, r4171, r4172; +} +{ +mul.f16x2 r4311, r4308, r4153; +} +{ +sub.f16x2 r4314, r4177, r4178; +} +{ +mul.f16x2 r4317, r4314, r4156; +} +{ +add.f16x2 r4320, r4311, r4317; +} +{ +add.f16x2 r4323, r4305, r4320; +} +{ +add.f16x2 r4326, r4171, r4172; +} +{ +mul.f16x2 r4329, r4326, r4150; +} +{ +add.f16x2 r4332, r4174, r4329; +} +{ +add.f16x2 r4335, r4177, r4178; +} +{ +mul.f16x2 r4338, r4335, r4152; +} +{ +add.f16x2 r4341, r4332, r4338; +} +{ +sub.f16x2 r4344, r4159, r4160; +} +{ +mul.f16x2 r4347, r4344, r4151; +} +{ +sub.f16x2 r4350, r4165, r4166; +} +{ +mul.f16x2 r4353, r4350, r4153; +} +{ +add.f16x2 r4356, r4347, r4353; +} +{ +add.f16x2 r4359, r4341, r4356; +} +{ +add.f16x2 r4362, r4171, r4172; +} +{ +mul.f16x2 r4365, r4362, r4150; +} +{ +add.f16x2 r4368, r4174, r4365; +} +{ +add.f16x2 r4371, r4177, r4178; +} +{ +mul.f16x2 r4374, r4371, r4152; +} +{ +add.f16x2 r4377, r4368, r4374; +} +{ +sub.f16x2 r4380, r4159, r4160; +} +{ +mul.f16x2 r4383, r4380, r4151; +} +{ +sub.f16x2 r4386, r4165, r4166; +} +{ +mul.f16x2 r4389, r4386, r4153; +} +{ +add.f16x2 r4392, r4383, r4389; +} +{ +sub.f16x2 r4395, r4377, r4392; +} +{ +add.f16x2 r4398, r4171, r4172; +} +{ +mul.f16x2 r4401, r4398, r4152; +} +{ 
+add.f16x2 r4404, r4174, r4401; +} +{ +add.f16x2 r4407, r4177, r4178; +} +{ +mul.f16x2 r4410, r4407, r4154; +} +{ +add.f16x2 r4413, r4404, r4410; +} +{ +sub.f16x2 r4416, r4159, r4160; +} +{ +mul.f16x2 r4419, r4416, r4153; +} +{ +sub.f16x2 r4422, r4165, r4166; +} +{ +mul.f16x2 r4425, r4422, r4156; +} +{ +add.f16x2 r4428, r4419, r4425; +} +{ +add.f16x2 r4431, r4413, r4428; +} +{ +add.f16x2 r4434, r4171, r4172; +} +{ +mul.f16x2 r4437, r4434, r4152; +} +{ +add.f16x2 r4440, r4174, r4437; +} +{ +add.f16x2 r4443, r4177, r4178; +} +{ +mul.f16x2 r4446, r4443, r4154; +} +{ +add.f16x2 r4449, r4440, r4446; +} +{ +sub.f16x2 r4452, r4159, r4160; +} +{ +mul.f16x2 r4455, r4452, r4153; +} +{ +sub.f16x2 r4458, r4165, r4166; +} +{ +mul.f16x2 r4461, r4458, r4156; +} +{ +add.f16x2 r4464, r4455, r4461; +} +{ +sub.f16x2 r4467, r4449, r4464; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4470, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4475, {low, high}; +} +{ +neg.f16x2 r4476, r4475; +} +{ +add.f16x2 r4478, r4479, r4480; +} +{ +add.f16x2 r4481, r4482, r4478; +} +{ +add.f16x2 r4484, r4485, r4486; +} +{ +add.f16x2 r4487, r4481, r4484; +} +{ +add.f16x2 r4490, r4491, r4492; +} +{ +add.f16x2 r4493, r4494, r4490; +} +{ +add.f16x2 r4496, r4497, r4498; +} +{ +add.f16x2 r4499, r4493, r4496; +} +{ +add.f16x2 r4502, r4479, r4480; +} +{ +mul.f16x2 r4505, r4502, r4470; +} +{ +add.f16x2 r4508, r4482, r4505; +} +{ +add.f16x2 r4511, r4485, r4486; +} +{ 
+mul.f16x2 r4514, r4511, r4472; +} +{ +add.f16x2 r4517, r4508, r4514; +} +{ +sub.f16x2 r4520, r4491, r4492; +} +{ +mul.f16x2 r4523, r4520, r4471; +} +{ +sub.f16x2 r4526, r4497, r4498; +} +{ +mul.f16x2 r4529, r4526, r4473; +} +{ +add.f16x2 r4532, r4523, r4529; +} +{ +sub.f16x2 r4535, r4517, r4532; +} +{ +add.f16x2 r4538, r4479, r4480; +} +{ +mul.f16x2 r4541, r4538, r4470; +} +{ +add.f16x2 r4544, r4482, r4541; +} +{ +add.f16x2 r4547, r4485, r4486; +} +{ +mul.f16x2 r4550, r4547, r4472; +} +{ +add.f16x2 r4553, r4544, r4550; +} +{ +sub.f16x2 r4556, r4491, r4492; +} +{ +mul.f16x2 r4559, r4556, r4471; +} +{ +sub.f16x2 r4562, r4497, r4498; +} +{ +mul.f16x2 r4565, r4562, r4473; +} +{ +add.f16x2 r4568, r4559, r4565; +} +{ +add.f16x2 r4571, r4553, r4568; +} +{ +add.f16x2 r4574, r4479, r4480; +} +{ +mul.f16x2 r4577, r4574, r4472; +} +{ +add.f16x2 r4580, r4482, r4577; +} +{ +add.f16x2 r4583, r4485, r4486; +} +{ +mul.f16x2 r4586, r4583, r4474; +} +{ +add.f16x2 r4589, r4580, r4586; +} +{ +sub.f16x2 r4592, r4491, r4492; +} +{ +mul.f16x2 r4595, r4592, r4473; +} +{ +sub.f16x2 r4598, r4497, r4498; +} +{ +mul.f16x2 r4601, r4598, r4476; +} +{ +add.f16x2 r4604, r4595, r4601; +} +{ +sub.f16x2 r4607, r4589, r4604; +} +{ +add.f16x2 r4610, r4479, r4480; +} +{ +mul.f16x2 r4613, r4610, r4472; +} +{ +add.f16x2 r4616, r4482, r4613; +} +{ +add.f16x2 r4619, r4485, r4486; +} +{ +mul.f16x2 r4622, r4619, r4474; +} +{ +add.f16x2 r4625, r4616, r4622; +} +{ +sub.f16x2 r4628, r4491, r4492; +} +{ +mul.f16x2 r4631, r4628, r4473; +} +{ +sub.f16x2 r4634, r4497, r4498; +} +{ +mul.f16x2 r4637, r4634, r4476; +} +{ +add.f16x2 r4640, r4631, r4637; +} +{ +add.f16x2 r4643, r4625, r4640; +} +{ +add.f16x2 r4646, r4491, r4492; +} +{ +mul.f16x2 r4649, r4646, r4470; +} +{ +add.f16x2 r4652, r4494, r4649; +} +{ +add.f16x2 r4655, r4497, r4498; +} +{ +mul.f16x2 r4658, r4655, r4472; +} +{ +add.f16x2 r4661, r4652, r4658; +} +{ +sub.f16x2 r4664, r4479, r4480; +} +{ +mul.f16x2 r4667, r4664, r4471; +} +{ +sub.f16x2 r4670, 
r4485, r4486; +} +{ +mul.f16x2 r4673, r4670, r4473; +} +{ +add.f16x2 r4676, r4667, r4673; +} +{ +add.f16x2 r4679, r4661, r4676; +} +{ +add.f16x2 r4682, r4491, r4492; +} +{ +mul.f16x2 r4685, r4682, r4470; +} +{ +add.f16x2 r4688, r4494, r4685; +} +{ +add.f16x2 r4691, r4497, r4498; +} +{ +mul.f16x2 r4694, r4691, r4472; +} +{ +add.f16x2 r4697, r4688, r4694; +} +{ +sub.f16x2 r4700, r4479, r4480; +} +{ +mul.f16x2 r4703, r4700, r4471; +} +{ +sub.f16x2 r4706, r4485, r4486; +} +{ +mul.f16x2 r4709, r4706, r4473; +} +{ +add.f16x2 r4712, r4703, r4709; +} +{ +sub.f16x2 r4715, r4697, r4712; +} +{ +add.f16x2 r4718, r4491, r4492; +} +{ +mul.f16x2 r4721, r4718, r4472; +} +{ +add.f16x2 r4724, r4494, r4721; +} +{ +add.f16x2 r4727, r4497, r4498; +} +{ +mul.f16x2 r4730, r4727, r4474; +} +{ +add.f16x2 r4733, r4724, r4730; +} +{ +sub.f16x2 r4736, r4479, r4480; +} +{ +mul.f16x2 r4739, r4736, r4473; +} +{ +sub.f16x2 r4742, r4485, r4486; +} +{ +mul.f16x2 r4745, r4742, r4476; +} +{ +add.f16x2 r4748, r4739, r4745; +} +{ +add.f16x2 r4751, r4733, r4748; +} +{ +add.f16x2 r4754, r4491, r4492; +} +{ +mul.f16x2 r4757, r4754, r4472; +} +{ +add.f16x2 r4760, r4494, r4757; +} +{ +add.f16x2 r4763, r4497, r4498; +} +{ +mul.f16x2 r4766, r4763, r4474; +} +{ +add.f16x2 r4769, r4760, r4766; +} +{ +sub.f16x2 r4772, r4479, r4480; +} +{ +mul.f16x2 r4775, r4772, r4473; +} +{ +sub.f16x2 r4778, r4485, r4486; +} +{ +mul.f16x2 r4781, r4778, r4476; +} +{ +add.f16x2 r4784, r4775, r4781; +} +{ +sub.f16x2 r4787, r4769, r4784; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4790, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r4794, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4795, {low, high}; +} +{ +neg.f16x2 r4796, r4795; +} +{ +add.f16x2 r4798, r4799, r4800; +} +{ +add.f16x2 r4801, r4802, r4798; +} +{ +add.f16x2 r4804, r4805, r4806; +} +{ +add.f16x2 r4807, r4801, r4804; +} +{ +add.f16x2 r4810, r4811, r4812; +} +{ +add.f16x2 r4813, r4814, r4810; +} +{ +add.f16x2 r4816, r4817, r4818; +} +{ +add.f16x2 r4819, r4813, r4816; +} +{ +add.f16x2 r4822, r4799, r4800; +} +{ +mul.f16x2 r4825, r4822, r4790; +} +{ +add.f16x2 r4828, r4802, r4825; +} +{ +add.f16x2 r4831, r4805, r4806; +} +{ +mul.f16x2 r4834, r4831, r4792; +} +{ +add.f16x2 r4837, r4828, r4834; +} +{ +sub.f16x2 r4840, r4811, r4812; +} +{ +mul.f16x2 r4843, r4840, r4791; +} +{ +sub.f16x2 r4846, r4817, r4818; +} +{ +mul.f16x2 r4849, r4846, r4793; +} +{ +add.f16x2 r4852, r4843, r4849; +} +{ +sub.f16x2 r4855, r4837, r4852; +} +{ +add.f16x2 r4858, r4799, r4800; +} +{ +mul.f16x2 r4861, r4858, r4790; +} +{ +add.f16x2 r4864, r4802, r4861; +} +{ +add.f16x2 r4867, r4805, r4806; +} +{ +mul.f16x2 r4870, r4867, r4792; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +sub.f16x2 r4876, r4811, r4812; +} +{ +mul.f16x2 r4879, r4876, r4791; +} +{ +sub.f16x2 r4882, r4817, r4818; +} +{ +mul.f16x2 r4885, r4882, r4793; +} +{ +add.f16x2 r4888, r4879, r4885; +} +{ +add.f16x2 r4891, r4873, r4888; +} +{ +add.f16x2 r4894, r4799, r4800; +} +{ +mul.f16x2 r4897, r4894, r4792; +} +{ +add.f16x2 r4900, r4802, r4897; +} +{ +add.f16x2 r4903, r4805, r4806; +} +{ +mul.f16x2 r4906, r4903, r4794; +} +{ +add.f16x2 r4909, r4900, r4906; +} +{ +sub.f16x2 r4912, r4811, r4812; +} +{ +mul.f16x2 r4915, r4912, r4793; +} +{ +sub.f16x2 r4918, r4817, r4818; +} +{ +mul.f16x2 r4921, r4918, r4796; +} +{ +add.f16x2 r4924, r4915, r4921; +} +{ +sub.f16x2 r4927, r4909, r4924; +} +{ +add.f16x2 r4930, r4799, r4800; +} +{ +mul.f16x2 r4933, r4930, r4792; +} +{ +add.f16x2 r4936, r4802, r4933; +} +{ 
+add.f16x2 r4939, r4805, r4806; +} +{ +mul.f16x2 r4942, r4939, r4794; +} +{ +add.f16x2 r4945, r4936, r4942; +} +{ +sub.f16x2 r4948, r4811, r4812; +} +{ +mul.f16x2 r4951, r4948, r4793; +} +{ +sub.f16x2 r4954, r4817, r4818; +} +{ +mul.f16x2 r4957, r4954, r4796; +} +{ +add.f16x2 r4960, r4951, r4957; +} +{ +add.f16x2 r4963, r4945, r4960; +} +{ +add.f16x2 r4966, r4811, r4812; +} +{ +mul.f16x2 r4969, r4966, r4790; +} +{ +add.f16x2 r4972, r4814, r4969; +} +{ +add.f16x2 r4975, r4817, r4818; +} +{ +mul.f16x2 r4978, r4975, r4792; +} +{ +add.f16x2 r4981, r4972, r4978; +} +{ +sub.f16x2 r4984, r4799, r4800; +} +{ +mul.f16x2 r4987, r4984, r4791; +} +{ +sub.f16x2 r4990, r4805, r4806; +} +{ +mul.f16x2 r4993, r4990, r4793; +} +{ +add.f16x2 r4996, r4987, r4993; +} +{ +add.f16x2 r4999, r4981, r4996; +} +{ +add.f16x2 r5002, r4811, r4812; +} +{ +mul.f16x2 r5005, r5002, r4790; +} +{ +add.f16x2 r5008, r4814, r5005; +} +{ +add.f16x2 r5011, r4817, r4818; +} +{ +mul.f16x2 r5014, r5011, r4792; +} +{ +add.f16x2 r5017, r5008, r5014; +} +{ +sub.f16x2 r5020, r4799, r4800; +} +{ +mul.f16x2 r5023, r5020, r4791; +} +{ +sub.f16x2 r5026, r4805, r4806; +} +{ +mul.f16x2 r5029, r5026, r4793; +} +{ +add.f16x2 r5032, r5023, r5029; +} +{ +sub.f16x2 r5035, r5017, r5032; +} +{ +add.f16x2 r5038, r4811, r4812; +} +{ +mul.f16x2 r5041, r5038, r4792; +} +{ +add.f16x2 r5044, r4814, r5041; +} +{ +add.f16x2 r5047, r4817, r4818; +} +{ +mul.f16x2 r5050, r5047, r4794; +} +{ +add.f16x2 r5053, r5044, r5050; +} +{ +sub.f16x2 r5056, r4799, r4800; +} +{ +mul.f16x2 r5059, r5056, r4793; +} +{ +sub.f16x2 r5062, r4805, r4806; +} +{ +mul.f16x2 r5065, r5062, r4796; +} +{ +add.f16x2 r5068, r5059, r5065; +} +{ +add.f16x2 r5071, r5053, r5068; +} +{ +add.f16x2 r5074, r4811, r4812; +} +{ +mul.f16x2 r5077, r5074, r4792; +} +{ +add.f16x2 r5080, r4814, r5077; +} +{ +add.f16x2 r5083, r4817, r4818; +} +{ +mul.f16x2 r5086, r5083, r4794; +} +{ +add.f16x2 r5089, r5080, r5086; +} +{ +sub.f16x2 r5092, r4799, r4800; +} +{ +mul.f16x2 r5095, 
r5092, r4793; +} +{ +sub.f16x2 r5098, r4805, r4806; +} +{ +mul.f16x2 r5101, r5098, r4796; +} +{ +add.f16x2 r5104, r5095, r5101; +} +{ +sub.f16x2 r5107, r5089, r5104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5115, {low, high}; +} +{ +neg.f16x2 r5116, r5115; +} +{ +add.f16x2 r5118, r5119, r5120; +} +{ +add.f16x2 r5121, r5122, r5118; +} +{ +add.f16x2 r5124, r5125, r5126; +} +{ +add.f16x2 r5127, r5121, r5124; +} +{ +add.f16x2 r5130, r5131, r5132; +} +{ +add.f16x2 r5133, r5134, r5130; +} +{ +add.f16x2 r5136, r5137, r5138; +} +{ +add.f16x2 r5139, r5133, r5136; +} +{ +add.f16x2 r5142, r5119, r5120; +} +{ +mul.f16x2 r5145, r5142, r5110; +} +{ +add.f16x2 r5148, r5122, r5145; +} +{ +add.f16x2 r5151, r5125, r5126; +} +{ +mul.f16x2 r5154, r5151, r5112; +} +{ +add.f16x2 r5157, r5148, r5154; +} +{ +sub.f16x2 r5160, r5131, r5132; +} +{ +mul.f16x2 r5163, r5160, r5111; +} +{ +sub.f16x2 r5166, r5137, r5138; +} +{ +mul.f16x2 r5169, r5166, r5113; +} +{ +add.f16x2 r5172, r5163, r5169; +} +{ +sub.f16x2 r5175, r5157, r5172; +} +{ +add.f16x2 r5178, r5119, r5120; +} +{ +mul.f16x2 r5181, r5178, r5110; +} +{ +add.f16x2 r5184, r5122, r5181; +} +{ +add.f16x2 r5187, r5125, r5126; +} +{ +mul.f16x2 r5190, r5187, r5112; +} +{ +add.f16x2 r5193, r5184, r5190; +} +{ +sub.f16x2 r5196, r5131, r5132; +} +{ +mul.f16x2 r5199, r5196, r5111; +} +{ +sub.f16x2 r5202, r5137, r5138; +} +{ +mul.f16x2 r5205, 
r5202, r5113; +} +{ +add.f16x2 r5208, r5199, r5205; +} +{ +add.f16x2 r5211, r5193, r5208; +} +{ +add.f16x2 r5214, r5119, r5120; +} +{ +mul.f16x2 r5217, r5214, r5112; +} +{ +add.f16x2 r5220, r5122, r5217; +} +{ +add.f16x2 r5223, r5125, r5126; +} +{ +mul.f16x2 r5226, r5223, r5114; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +sub.f16x2 r5232, r5131, r5132; +} +{ +mul.f16x2 r5235, r5232, r5113; +} +{ +sub.f16x2 r5238, r5137, r5138; +} +{ +mul.f16x2 r5241, r5238, r5116; +} +{ +add.f16x2 r5244, r5235, r5241; +} +{ +sub.f16x2 r5247, r5229, r5244; +} +{ +add.f16x2 r5250, r5119, r5120; +} +{ +mul.f16x2 r5253, r5250, r5112; +} +{ +add.f16x2 r5256, r5122, r5253; +} +{ +add.f16x2 r5259, r5125, r5126; +} +{ +mul.f16x2 r5262, r5259, r5114; +} +{ +add.f16x2 r5265, r5256, r5262; +} +{ +sub.f16x2 r5268, r5131, r5132; +} +{ +mul.f16x2 r5271, r5268, r5113; +} +{ +sub.f16x2 r5274, r5137, r5138; +} +{ +mul.f16x2 r5277, r5274, r5116; +} +{ +add.f16x2 r5280, r5271, r5277; +} +{ +add.f16x2 r5283, r5265, r5280; +} +{ +add.f16x2 r5286, r5131, r5132; +} +{ +mul.f16x2 r5289, r5286, r5110; +} +{ +add.f16x2 r5292, r5134, r5289; +} +{ +add.f16x2 r5295, r5137, r5138; +} +{ +mul.f16x2 r5298, r5295, r5112; +} +{ +add.f16x2 r5301, r5292, r5298; +} +{ +sub.f16x2 r5304, r5119, r5120; +} +{ +mul.f16x2 r5307, r5304, r5111; +} +{ +sub.f16x2 r5310, r5125, r5126; +} +{ +mul.f16x2 r5313, r5310, r5113; +} +{ +add.f16x2 r5316, r5307, r5313; +} +{ +add.f16x2 r5319, r5301, r5316; +} +{ +add.f16x2 r5322, r5131, r5132; +} +{ +mul.f16x2 r5325, r5322, r5110; +} +{ +add.f16x2 r5328, r5134, r5325; +} +{ +add.f16x2 r5331, r5137, r5138; +} +{ +mul.f16x2 r5334, r5331, r5112; +} +{ +add.f16x2 r5337, r5328, r5334; +} +{ +sub.f16x2 r5340, r5119, r5120; +} +{ +mul.f16x2 r5343, r5340, r5111; +} +{ +sub.f16x2 r5346, r5125, r5126; +} +{ +mul.f16x2 r5349, r5346, r5113; +} +{ +add.f16x2 r5352, r5343, r5349; +} +{ +sub.f16x2 r5355, r5337, r5352; +} +{ +add.f16x2 r5358, r5131, r5132; +} +{ +mul.f16x2 r5361, r5358, r5112; +} +{ 
+add.f16x2 r5364, r5134, r5361; +} +{ +add.f16x2 r5367, r5137, r5138; +} +{ +mul.f16x2 r5370, r5367, r5114; +} +{ +add.f16x2 r5373, r5364, r5370; +} +{ +sub.f16x2 r5376, r5119, r5120; +} +{ +mul.f16x2 r5379, r5376, r5113; +} +{ +sub.f16x2 r5382, r5125, r5126; +} +{ +mul.f16x2 r5385, r5382, r5116; +} +{ +add.f16x2 r5388, r5379, r5385; +} +{ +add.f16x2 r5391, r5373, r5388; +} +{ +add.f16x2 r5394, r5131, r5132; +} +{ +mul.f16x2 r5397, r5394, r5112; +} +{ +add.f16x2 r5400, r5134, r5397; +} +{ +add.f16x2 r5403, r5137, r5138; +} +{ +mul.f16x2 r5406, r5403, r5114; +} +{ +add.f16x2 r5409, r5400, r5406; +} +{ +sub.f16x2 r5412, r5119, r5120; +} +{ +mul.f16x2 r5415, r5412, r5113; +} +{ +sub.f16x2 r5418, r5125, r5126; +} +{ +mul.f16x2 r5421, r5418, r5116; +} +{ +add.f16x2 r5424, r5415, r5421; +} +{ +sub.f16x2 r5427, r5409, r5424; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r5430, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r5431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r5432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r5433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r5434, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r5435, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r5436, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r5437, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5440, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r5441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 
high, f90; +mov.b32 r5444, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r5445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5446, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r5447, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r5453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r5461, {low, high}; +} +{ +mul.f16x2 r5478, r4215, r5430; +} +{ +mul.f16x2 r5481, r4359, r5431; +} +{ +sub.f16x2 r5484, r5478, r5481; +} +{ +mul.f16x2 r5487, r4215, r5431; +} +{ +fma.rn.f16x2 r5490, r4359, r5430, r5487; +} +{ +mul.f16x2 r5494, r4535, r5432; +} +{ +mul.f16x2 r5497, r4679, r5433; +} +{ +sub.f16x2 r5500, r5494, r5497; +} +{ +mul.f16x2 r5503, r4535, r5433; +} +{ +fma.rn.f16x2 r5506, r4679, r5432, r5503; +} +{ +mul.f16x2 r5510, r4855, r5434; +} +{ +mul.f16x2 r5513, r4999, r5435; +} +{ +sub.f16x2 r5516, r5510, r5513; +} +{ +mul.f16x2 r5519, r4855, r5435; +} +{ +fma.rn.f16x2 r5522, r4999, r5434, r5519; +} +{ +mul.f16x2 r5526, r5175, r5436; +} +{ +mul.f16x2 r5529, r5319, r5437; +} +{ +sub.f16x2 r5532, r5526, r5529; +} +{ +mul.f16x2 r5535, r5175, r5437; +} +{ +fma.rn.f16x2 r5538, r5319, r5436, r5535; +} +{ +mul.f16x2 r5542, r4287, r5432; +} +{ +mul.f16x2 r5545, r4431, r5433; +} +{ +sub.f16x2 r5548, r5542, r5545; +} +{ +mul.f16x2 r5551, r4287, r5433; +} +{ +fma.rn.f16x2 r5554, r4431, r5432, r5551; +} +{ +mul.f16x2 r5558, r4607, r5436; +} +{ +mul.f16x2 r5561, r4751, r5437; +} +{ +sub.f16x2 r5564, r5558, r5561; +} +{ +mul.f16x2 r5567, r4607, r5437; +} +{ +fma.rn.f16x2 r5570, 
r4751, r5436, r5567; +} +{ +mul.f16x2 r5574, r4927, r5440; +} +{ +mul.f16x2 r5577, r5071, r5441; +} +{ +sub.f16x2 r5580, r5574, r5577; +} +{ +mul.f16x2 r5583, r4927, r5441; +} +{ +fma.rn.f16x2 r5586, r5071, r5440, r5583; +} +{ +mul.f16x2 r5590, r5247, r5444; +} +{ +mul.f16x2 r5593, r5391, r5445; +} +{ +sub.f16x2 r5596, r5590, r5593; +} +{ +mul.f16x2 r5599, r5247, r5445; +} +{ +fma.rn.f16x2 r5602, r5391, r5444, r5599; +} +{ +mul.f16x2 r5606, r4323, r5434; +} +{ +mul.f16x2 r5609, r4467, r5435; +} +{ +sub.f16x2 r5612, r5606, r5609; +} +{ +mul.f16x2 r5615, r4323, r5435; +} +{ +fma.rn.f16x2 r5618, r4467, r5434, r5615; +} +{ +mul.f16x2 r5622, r4643, r5440; +} +{ +mul.f16x2 r5625, r4787, r5441; +} +{ +sub.f16x2 r5628, r5622, r5625; +} +{ +mul.f16x2 r5631, r4643, r5441; +} +{ +fma.rn.f16x2 r5634, r4787, r5440, r5631; +} +{ +mul.f16x2 r5638, r4963, r5446; +} +{ +mul.f16x2 r5641, r5107, r5447; +} +{ +sub.f16x2 r5644, r5638, r5641; +} +{ +mul.f16x2 r5647, r4963, r5447; +} +{ +fma.rn.f16x2 r5650, r5107, r5446, r5647; +} +{ +mul.f16x2 r5654, r5283, r5452; +} +{ +mul.f16x2 r5657, r5427, r5453; +} +{ +sub.f16x2 r5660, r5654, r5657; +} +{ +mul.f16x2 r5663, r5283, r5453; +} +{ +fma.rn.f16x2 r5666, r5427, r5452, r5663; +} +{ +mul.f16x2 r5670, r4251, r5436; +} +{ +mul.f16x2 r5673, r4395, r5437; +} +{ +sub.f16x2 r5676, r5670, r5673; +} +{ +mul.f16x2 r5679, r4251, r5437; +} +{ +fma.rn.f16x2 r5682, r4395, r5436, r5679; +} +{ +mul.f16x2 r5686, r4571, r5444; +} +{ +mul.f16x2 r5689, r4715, r5445; +} +{ +sub.f16x2 r5692, r5686, r5689; +} +{ +mul.f16x2 r5695, r4571, r5445; +} +{ +fma.rn.f16x2 r5698, r4715, r5444, r5695; +} +{ +mul.f16x2 r5702, r4891, r5452; +} +{ +mul.f16x2 r5705, r5035, r5453; +} +{ +sub.f16x2 r5708, r5702, r5705; +} +{ +mul.f16x2 r5711, r4891, r5453; +} +{ +fma.rn.f16x2 r5714, r5035, r5452, r5711; +} +{ +mul.f16x2 r5718, r5211, r5460; +} +{ +mul.f16x2 r5721, r5355, r5461; +} +{ +sub.f16x2 r5724, r5718, r5721; +} +{ +mul.f16x2 r5727, r5211, r5461; +} +{ +fma.rn.f16x2 r5730, 
r5355, r5460, r5727; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5739, {low, high}; +} +{ +neg.f16x2 r5740, r5739; +} +{ +add.f16x2 r5742, r4167, r5127; +} +{ +add.f16x2 r5745, r3847, r5742; +} +{ +add.f16x2 r5748, r4487, r4807; +} +{ +add.f16x2 r5751, r5745, r5748; +} +st.local.u32 [rd3], r5751; +{ +add.f16x2 r5754, r4179, r5139; +} +{ +add.f16x2 r5757, r3859, r5754; +} +{ +add.f16x2 r5760, r4499, r4819; +} +{ +add.f16x2 r5763, r5757, r5760; +} +st.local.u32 [rd4], r5763; +{ +add.f16x2 r5766, r4167, r5127; +} +{ +mul.f16x2 r5769, r5766, r5734; +} +{ +add.f16x2 r5772, r3847, r5769; +} +{ +add.f16x2 r5775, r4487, r4807; +} +{ +mul.f16x2 r5778, r5775, r5736; +} +{ +add.f16x2 r5781, r5772, r5778; +} +{ +sub.f16x2 r5784, r4179, r5139; +} +{ +mul.f16x2 r5787, r5784, r5735; +} +{ +sub.f16x2 r5790, r4499, r4819; +} +{ +mul.f16x2 r5793, r5790, r5737; +} +{ +add.f16x2 r5796, r5787, r5793; +} +{ +sub.f16x2 r5799, r5781, r5796; +} +st.local.u32 [rd4+36], r5799; +{ +add.f16x2 r5802, r4167, r5127; +} +{ +mul.f16x2 r5805, r5802, r5734; +} +{ +add.f16x2 r5808, r3847, r5805; +} +{ +add.f16x2 r5811, r4487, r4807; +} +{ +mul.f16x2 r5814, r5811, r5736; +} +{ +add.f16x2 r5817, r5808, r5814; +} +{ +sub.f16x2 r5820, r4179, r5139; +} +{ +mul.f16x2 r5823, r5820, r5735; +} +{ +sub.f16x2 r5826, r4499, r4819; +} +{ +mul.f16x2 r5829, r5826, r5737; +} +{ +add.f16x2 r5832, r5823, r5829; +} +{ 
+add.f16x2 r5835, r5817, r5832; +} +st.local.u32 [rd4+156], r5835; +{ +add.f16x2 r5838, r4167, r5127; +} +{ +mul.f16x2 r5841, r5838, r5736; +} +{ +add.f16x2 r5844, r3847, r5841; +} +{ +add.f16x2 r5847, r4487, r4807; +} +{ +mul.f16x2 r5850, r5847, r5738; +} +{ +add.f16x2 r5853, r5844, r5850; +} +{ +sub.f16x2 r5856, r4179, r5139; +} +{ +mul.f16x2 r5859, r5856, r5737; +} +{ +sub.f16x2 r5862, r4499, r4819; +} +{ +mul.f16x2 r5865, r5862, r5740; +} +{ +add.f16x2 r5868, r5859, r5865; +} +{ +sub.f16x2 r5871, r5853, r5868; +} +st.local.u32 [rd4+76], r5871; +{ +add.f16x2 r5874, r4167, r5127; +} +{ +mul.f16x2 r5877, r5874, r5736; +} +{ +add.f16x2 r5880, r3847, r5877; +} +{ +add.f16x2 r5883, r4487, r4807; +} +{ +mul.f16x2 r5886, r5883, r5738; +} +{ +add.f16x2 r5889, r5880, r5886; +} +{ +sub.f16x2 r5892, r4179, r5139; +} +{ +mul.f16x2 r5895, r5892, r5737; +} +{ +sub.f16x2 r5898, r4499, r4819; +} +{ +mul.f16x2 r5901, r5898, r5740; +} +{ +add.f16x2 r5904, r5895, r5901; +} +{ +add.f16x2 r5907, r5889, r5904; +} +st.local.u32 [rd4+116], r5907; +{ +add.f16x2 r5910, r4179, r5139; +} +{ +mul.f16x2 r5913, r5910, r5734; +} +{ +add.f16x2 r5916, r3859, r5913; +} +{ +add.f16x2 r5919, r4499, r4819; +} +{ +mul.f16x2 r5922, r5919, r5736; +} +{ +add.f16x2 r5925, r5916, r5922; +} +{ +sub.f16x2 r5928, r4167, r5127; +} +{ +mul.f16x2 r5931, r5928, r5735; +} +{ +sub.f16x2 r5934, r4487, r4807; +} +{ +mul.f16x2 r5937, r5934, r5737; +} +{ +add.f16x2 r5940, r5931, r5937; +} +{ +add.f16x2 r5943, r5925, r5940; +} +st.local.u32 [rd4+40], r5943; +{ +add.f16x2 r5946, r4179, r5139; +} +{ +mul.f16x2 r5949, r5946, r5734; +} +{ +add.f16x2 r5952, r3859, r5949; +} +{ +add.f16x2 r5955, r4499, r4819; +} +{ +mul.f16x2 r5958, r5955, r5736; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +sub.f16x2 r5964, r4167, r5127; +} +{ +mul.f16x2 r5967, r5964, r5735; +} +{ +sub.f16x2 r5970, r4487, r4807; +} +{ +mul.f16x2 r5973, r5970, r5737; +} +{ +add.f16x2 r5976, r5967, r5973; +} +{ +sub.f16x2 r5979, r5961, r5976; +} +st.local.u32 
[rd4+160], r5979; +{ +add.f16x2 r5982, r4179, r5139; +} +{ +mul.f16x2 r5985, r5982, r5736; +} +{ +add.f16x2 r5988, r3859, r5985; +} +{ +add.f16x2 r5991, r4499, r4819; +} +{ +mul.f16x2 r5994, r5991, r5738; +} +{ +add.f16x2 r5997, r5988, r5994; +} +{ +sub.f16x2 r6000, r4167, r5127; +} +{ +mul.f16x2 r6003, r6000, r5737; +} +{ +sub.f16x2 r6006, r4487, r4807; +} +{ +mul.f16x2 r6009, r6006, r5740; +} +{ +add.f16x2 r6012, r6003, r6009; +} +{ +add.f16x2 r6015, r5997, r6012; +} +st.local.u32 [rd4+80], r6015; +{ +add.f16x2 r6018, r4179, r5139; +} +{ +mul.f16x2 r6021, r6018, r5736; +} +{ +add.f16x2 r6024, r3859, r6021; +} +{ +add.f16x2 r6027, r4499, r4819; +} +{ +mul.f16x2 r6030, r6027, r5738; +} +{ +add.f16x2 r6033, r6024, r6030; +} +{ +sub.f16x2 r6036, r4167, r5127; +} +{ +mul.f16x2 r6039, r6036, r5737; +} +{ +sub.f16x2 r6042, r4487, r4807; +} +{ +mul.f16x2 r6045, r6042, r5740; +} +{ +add.f16x2 r6048, r6039, r6045; +} +{ +sub.f16x2 r6051, r6033, r6048; +} +st.local.u32 [rd4+120], r6051; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6054, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6057, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6058, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6059, {low, high}; +} +{ +neg.f16x2 r6060, r6059; +} +{ +add.f16x2 r6062, r5484, r5532; +} +{ +add.f16x2 r6065, r3895, r6062; +} +{ +add.f16x2 r6068, r5500, r5516; +} +{ +add.f16x2 r11251, r6065, r6068; +} +st.local.u32 [rd4+4], r11251; +{ +add.f16x2 r6074, r5490, r5538; +} +{ +add.f16x2 r6077, r4039, r6074; +} +{ +add.f16x2 r6080, r5506, r5522; +} +{ 
+add.f16x2 r6083, r6077, r6080; +} +st.local.u32 [rd4+8], r6083; +{ +add.f16x2 r6086, r5484, r5532; +} +{ +mul.f16x2 r6089, r6086, r6054; +} +{ +add.f16x2 r6092, r3895, r6089; +} +{ +add.f16x2 r6095, r5500, r5516; +} +{ +mul.f16x2 r6098, r6095, r6056; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +sub.f16x2 r6104, r5490, r5538; +} +{ +mul.f16x2 r6107, r6104, r6055; +} +{ +sub.f16x2 r6110, r5506, r5522; +} +{ +mul.f16x2 r6113, r6110, r6057; +} +{ +add.f16x2 r6116, r6107, r6113; +} +{ +sub.f16x2 r6119, r6101, r6116; +} +st.local.u32 [rd4+44], r6119; +{ +add.f16x2 r6122, r5484, r5532; +} +{ +mul.f16x2 r6125, r6122, r6054; +} +{ +add.f16x2 r6128, r3895, r6125; +} +{ +add.f16x2 r6131, r5500, r5516; +} +{ +mul.f16x2 r6134, r6131, r6056; +} +{ +add.f16x2 r6137, r6128, r6134; +} +{ +sub.f16x2 r6140, r5490, r5538; +} +{ +mul.f16x2 r6143, r6140, r6055; +} +{ +sub.f16x2 r6146, r5506, r5522; +} +{ +mul.f16x2 r6149, r6146, r6057; +} +{ +add.f16x2 r6152, r6143, r6149; +} +{ +add.f16x2 r6155, r6137, r6152; +} +st.local.u32 [rd4+164], r6155; +{ +add.f16x2 r6158, r5484, r5532; +} +{ +mul.f16x2 r6161, r6158, r6056; +} +{ +add.f16x2 r6164, r3895, r6161; +} +{ +add.f16x2 r6167, r5500, r5516; +} +{ +mul.f16x2 r6170, r6167, r6058; +} +{ +add.f16x2 r6173, r6164, r6170; +} +{ +sub.f16x2 r6176, r5490, r5538; +} +{ +mul.f16x2 r6179, r6176, r6057; +} +{ +sub.f16x2 r6182, r5506, r5522; +} +{ +mul.f16x2 r6185, r6182, r6060; +} +{ +add.f16x2 r6188, r6179, r6185; +} +{ +sub.f16x2 r6191, r6173, r6188; +} +st.local.u32 [rd4+84], r6191; +{ +add.f16x2 r6194, r5484, r5532; +} +{ +mul.f16x2 r6197, r6194, r6056; +} +{ +add.f16x2 r6200, r3895, r6197; +} +{ +add.f16x2 r6203, r5500, r5516; +} +{ +mul.f16x2 r6206, r6203, r6058; +} +{ +add.f16x2 r6209, r6200, r6206; +} +{ +sub.f16x2 r6212, r5490, r5538; +} +{ +mul.f16x2 r6215, r6212, r6057; +} +{ +sub.f16x2 r6218, r5506, r5522; +} +{ +mul.f16x2 r6221, r6218, r6060; +} +{ +add.f16x2 r6224, r6215, r6221; +} +{ +add.f16x2 r6227, r6209, r6224; +} +st.local.u32 
[rd4+124], r6227; +{ +add.f16x2 r6230, r5490, r5538; +} +{ +mul.f16x2 r6233, r6230, r6054; +} +{ +add.f16x2 r6236, r4039, r6233; +} +{ +add.f16x2 r6239, r5506, r5522; +} +{ +mul.f16x2 r6242, r6239, r6056; +} +{ +add.f16x2 r6245, r6236, r6242; +} +{ +sub.f16x2 r6248, r5484, r5532; +} +{ +mul.f16x2 r6251, r6248, r6055; +} +{ +sub.f16x2 r6254, r5500, r5516; +} +{ +mul.f16x2 r6257, r6254, r6057; +} +{ +add.f16x2 r6260, r6251, r6257; +} +{ +add.f16x2 r6263, r6245, r6260; +} +st.local.u32 [rd4+48], r6263; +{ +add.f16x2 r6266, r5490, r5538; +} +{ +mul.f16x2 r6269, r6266, r6054; +} +{ +add.f16x2 r6272, r4039, r6269; +} +{ +add.f16x2 r6275, r5506, r5522; +} +{ +mul.f16x2 r6278, r6275, r6056; +} +{ +add.f16x2 r6281, r6272, r6278; +} +{ +sub.f16x2 r6284, r5484, r5532; +} +{ +mul.f16x2 r6287, r6284, r6055; +} +{ +sub.f16x2 r6290, r5500, r5516; +} +{ +mul.f16x2 r6293, r6290, r6057; +} +{ +add.f16x2 r6296, r6287, r6293; +} +{ +sub.f16x2 r6299, r6281, r6296; +} +st.local.u32 [rd4+168], r6299; +{ +add.f16x2 r6302, r5490, r5538; +} +{ +mul.f16x2 r6305, r6302, r6056; +} +{ +add.f16x2 r6308, r4039, r6305; +} +{ +add.f16x2 r6311, r5506, r5522; +} +{ +mul.f16x2 r6314, r6311, r6058; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +sub.f16x2 r6320, r5484, r5532; +} +{ +mul.f16x2 r6323, r6320, r6057; +} +{ +sub.f16x2 r6326, r5500, r5516; +} +{ +mul.f16x2 r6329, r6326, r6060; +} +{ +add.f16x2 r6332, r6323, r6329; +} +{ +add.f16x2 r6335, r6317, r6332; +} +st.local.u32 [rd4+88], r6335; +{ +add.f16x2 r6338, r5490, r5538; +} +{ +mul.f16x2 r6341, r6338, r6056; +} +{ +add.f16x2 r6344, r4039, r6341; +} +{ +add.f16x2 r6347, r5506, r5522; +} +{ +mul.f16x2 r6350, r6347, r6058; +} +{ +add.f16x2 r6353, r6344, r6350; +} +{ +sub.f16x2 r6356, r5484, r5532; +} +{ +mul.f16x2 r6359, r6356, r6057; +} +{ +sub.f16x2 r6362, r5500, r5516; +} +{ +mul.f16x2 r6365, r6362, r6060; +} +{ +add.f16x2 r6368, r6359, r6365; +} +{ +sub.f16x2 r6371, r6353, r6368; +} +st.local.u32 [rd4+128], r6371; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6375, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6377, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6378, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6379, {low, high}; +} +{ +neg.f16x2 r6380, r6379; +} +{ +add.f16x2 r6382, r5548, r5596; +} +{ +add.f16x2 r6385, r3967, r6382; +} +{ +add.f16x2 r6388, r5564, r5580; +} +{ +add.f16x2 r6391, r6385, r6388; +} +st.local.u32 [rd4+12], r6391; +{ +add.f16x2 r6394, r5554, r5602; +} +{ +add.f16x2 r6397, r4111, r6394; +} +{ +add.f16x2 r6400, r5570, r5586; +} +{ +add.f16x2 r6403, r6397, r6400; +} +st.local.u32 [rd4+16], r6403; +{ +add.f16x2 r6406, r5548, r5596; +} +{ +mul.f16x2 r6409, r6406, r6374; +} +{ +add.f16x2 r6412, r3967, r6409; +} +{ +add.f16x2 r6415, r5564, r5580; +} +{ +mul.f16x2 r6418, r6415, r6376; +} +{ +add.f16x2 r6421, r6412, r6418; +} +{ +sub.f16x2 r6424, r5554, r5602; +} +{ +mul.f16x2 r6427, r6424, r6375; +} +{ +sub.f16x2 r6430, r5570, r5586; +} +{ +mul.f16x2 r6433, r6430, r6377; +} +{ +add.f16x2 r6436, r6427, r6433; +} +{ +sub.f16x2 r6439, r6421, r6436; +} +st.local.u32 [rd4+52], r6439; +{ +add.f16x2 r6442, r5548, r5596; +} +{ +mul.f16x2 r6445, r6442, r6374; +} +{ +add.f16x2 r6448, r3967, r6445; +} +{ +add.f16x2 r6451, r5564, r5580; +} +{ +mul.f16x2 r6454, r6451, r6376; +} +{ +add.f16x2 r6457, r6448, r6454; +} +{ +sub.f16x2 r6460, r5554, r5602; +} +{ +mul.f16x2 r6463, r6460, r6375; +} +{ +sub.f16x2 r6466, r5570, r5586; +} +{ +mul.f16x2 r6469, r6466, r6377; +} +{ +add.f16x2 r6472, r6463, r6469; +} +{ +add.f16x2 r6475, r6457, r6472; +} 
+st.local.u32 [rd4+172], r6475; +{ +add.f16x2 r6478, r5548, r5596; +} +{ +mul.f16x2 r6481, r6478, r6376; +} +{ +add.f16x2 r6484, r3967, r6481; +} +{ +add.f16x2 r6487, r5564, r5580; +} +{ +mul.f16x2 r6490, r6487, r6378; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +sub.f16x2 r6496, r5554, r5602; +} +{ +mul.f16x2 r6499, r6496, r6377; +} +{ +sub.f16x2 r6502, r5570, r5586; +} +{ +mul.f16x2 r6505, r6502, r6380; +} +{ +add.f16x2 r6508, r6499, r6505; +} +{ +sub.f16x2 r6511, r6493, r6508; +} +st.local.u32 [rd4+92], r6511; +{ +add.f16x2 r6514, r5548, r5596; +} +{ +mul.f16x2 r6517, r6514, r6376; +} +{ +add.f16x2 r6520, r3967, r6517; +} +{ +add.f16x2 r6523, r5564, r5580; +} +{ +mul.f16x2 r6526, r6523, r6378; +} +{ +add.f16x2 r6529, r6520, r6526; +} +{ +sub.f16x2 r6532, r5554, r5602; +} +{ +mul.f16x2 r6535, r6532, r6377; +} +{ +sub.f16x2 r6538, r5570, r5586; +} +{ +mul.f16x2 r6541, r6538, r6380; +} +{ +add.f16x2 r6544, r6535, r6541; +} +{ +add.f16x2 r6547, r6529, r6544; +} +st.local.u32 [rd4+132], r6547; +{ +add.f16x2 r6550, r5554, r5602; +} +{ +mul.f16x2 r6553, r6550, r6374; +} +{ +add.f16x2 r6556, r4111, r6553; +} +{ +add.f16x2 r6559, r5570, r5586; +} +{ +mul.f16x2 r6562, r6559, r6376; +} +{ +add.f16x2 r6565, r6556, r6562; +} +{ +sub.f16x2 r6568, r5548, r5596; +} +{ +mul.f16x2 r6571, r6568, r6375; +} +{ +sub.f16x2 r6574, r5564, r5580; +} +{ +mul.f16x2 r6577, r6574, r6377; +} +{ +add.f16x2 r6580, r6571, r6577; +} +{ +add.f16x2 r6583, r6565, r6580; +} +st.local.u32 [rd4+56], r6583; +{ +add.f16x2 r6586, r5554, r5602; +} +{ +mul.f16x2 r6589, r6586, r6374; +} +{ +add.f16x2 r6592, r4111, r6589; +} +{ +add.f16x2 r6595, r5570, r5586; +} +{ +mul.f16x2 r6598, r6595, r6376; +} +{ +add.f16x2 r6601, r6592, r6598; +} +{ +sub.f16x2 r6604, r5548, r5596; +} +{ +mul.f16x2 r6607, r6604, r6375; +} +{ +sub.f16x2 r6610, r5564, r5580; +} +{ +mul.f16x2 r6613, r6610, r6377; +} +{ +add.f16x2 r6616, r6607, r6613; +} +{ +sub.f16x2 r6619, r6601, r6616; +} +st.local.u32 [rd4+176], r6619; +{ +add.f16x2 
r6622, r5554, r5602; +} +{ +mul.f16x2 r6625, r6622, r6376; +} +{ +add.f16x2 r6628, r4111, r6625; +} +{ +add.f16x2 r6631, r5570, r5586; +} +{ +mul.f16x2 r6634, r6631, r6378; +} +{ +add.f16x2 r6637, r6628, r6634; +} +{ +sub.f16x2 r6640, r5548, r5596; +} +{ +mul.f16x2 r6643, r6640, r6377; +} +{ +sub.f16x2 r6646, r5564, r5580; +} +{ +mul.f16x2 r6649, r6646, r6380; +} +{ +add.f16x2 r6652, r6643, r6649; +} +{ +add.f16x2 r6655, r6637, r6652; +} +st.local.u32 [rd4+96], r6655; +{ +add.f16x2 r6658, r5554, r5602; +} +{ +mul.f16x2 r6661, r6658, r6376; +} +{ +add.f16x2 r6664, r4111, r6661; +} +{ +add.f16x2 r6667, r5570, r5586; +} +{ +mul.f16x2 r6670, r6667, r6378; +} +{ +add.f16x2 r6673, r6664, r6670; +} +{ +sub.f16x2 r6676, r5548, r5596; +} +{ +mul.f16x2 r6679, r6676, r6377; +} +{ +sub.f16x2 r6682, r5564, r5580; +} +{ +mul.f16x2 r6685, r6682, r6380; +} +{ +add.f16x2 r6688, r6679, r6685; +} +{ +sub.f16x2 r6691, r6673, r6688; +} +st.local.u32 [rd4+136], r6691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6694, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6695, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6698, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6699, {low, high}; +} +{ +neg.f16x2 r6700, r6699; +} +{ +add.f16x2 r6702, r5612, r5660; +} +{ +add.f16x2 r6705, r4003, r6702; +} +{ +add.f16x2 r6708, r5628, r5644; +} +{ +add.f16x2 r6711, r6705, r6708; +} +st.local.u32 [rd4+20], r6711; +{ +add.f16x2 r6714, r5618, r5666; +} +{ +add.f16x2 r6717, r4147, r6714; +} +{ +add.f16x2 r6720, r5634, r5650; +} +{ +add.f16x2 r6723, r6717, r6720; +} 
+st.local.u32 [rd4+24], r6723; +{ +add.f16x2 r6726, r5612, r5660; +} +{ +mul.f16x2 r6729, r6726, r6694; +} +{ +add.f16x2 r6732, r4003, r6729; +} +{ +add.f16x2 r6735, r5628, r5644; +} +{ +mul.f16x2 r6738, r6735, r6696; +} +{ +add.f16x2 r6741, r6732, r6738; +} +{ +sub.f16x2 r6744, r5618, r5666; +} +{ +mul.f16x2 r6747, r6744, r6695; +} +{ +sub.f16x2 r6750, r5634, r5650; +} +{ +mul.f16x2 r6753, r6750, r6697; +} +{ +add.f16x2 r6756, r6747, r6753; +} +{ +sub.f16x2 r6759, r6741, r6756; +} +st.local.u32 [rd4+60], r6759; +{ +add.f16x2 r6762, r5612, r5660; +} +{ +mul.f16x2 r6765, r6762, r6694; +} +{ +add.f16x2 r6768, r4003, r6765; +} +{ +add.f16x2 r6771, r5628, r5644; +} +{ +mul.f16x2 r6774, r6771, r6696; +} +{ +add.f16x2 r6777, r6768, r6774; +} +{ +sub.f16x2 r6780, r5618, r5666; +} +{ +mul.f16x2 r6783, r6780, r6695; +} +{ +sub.f16x2 r6786, r5634, r5650; +} +{ +mul.f16x2 r6789, r6786, r6697; +} +{ +add.f16x2 r6792, r6783, r6789; +} +{ +add.f16x2 r6795, r6777, r6792; +} +st.local.u32 [rd4+180], r6795; +{ +add.f16x2 r6798, r5612, r5660; +} +{ +mul.f16x2 r6801, r6798, r6696; +} +{ +add.f16x2 r6804, r4003, r6801; +} +{ +add.f16x2 r6807, r5628, r5644; +} +{ +mul.f16x2 r6810, r6807, r6698; +} +{ +add.f16x2 r6813, r6804, r6810; +} +{ +sub.f16x2 r6816, r5618, r5666; +} +{ +mul.f16x2 r6819, r6816, r6697; +} +{ +sub.f16x2 r6822, r5634, r5650; +} +{ +mul.f16x2 r6825, r6822, r6700; +} +{ +add.f16x2 r6828, r6819, r6825; +} +{ +sub.f16x2 r6831, r6813, r6828; +} +st.local.u32 [rd4+100], r6831; +{ +add.f16x2 r6834, r5612, r5660; +} +{ +mul.f16x2 r6837, r6834, r6696; +} +{ +add.f16x2 r6840, r4003, r6837; +} +{ +add.f16x2 r6843, r5628, r5644; +} +{ +mul.f16x2 r6846, r6843, r6698; +} +{ +add.f16x2 r6849, r6840, r6846; +} +{ +sub.f16x2 r6852, r5618, r5666; +} +{ +mul.f16x2 r6855, r6852, r6697; +} +{ +sub.f16x2 r6858, r5634, r5650; +} +{ +mul.f16x2 r6861, r6858, r6700; +} +{ +add.f16x2 r6864, r6855, r6861; +} +{ +add.f16x2 r6867, r6849, r6864; +} +st.local.u32 [rd4+140], r6867; +{ +add.f16x2 
r6870, r5618, r5666; +} +{ +mul.f16x2 r6873, r6870, r6694; +} +{ +add.f16x2 r6876, r4147, r6873; +} +{ +add.f16x2 r6879, r5634, r5650; +} +{ +mul.f16x2 r6882, r6879, r6696; +} +{ +add.f16x2 r6885, r6876, r6882; +} +{ +sub.f16x2 r6888, r5612, r5660; +} +{ +mul.f16x2 r6891, r6888, r6695; +} +{ +sub.f16x2 r6894, r5628, r5644; +} +{ +mul.f16x2 r6897, r6894, r6697; +} +{ +add.f16x2 r6900, r6891, r6897; +} +{ +add.f16x2 r6903, r6885, r6900; +} +st.local.u32 [rd4+64], r6903; +{ +add.f16x2 r6906, r5618, r5666; +} +{ +mul.f16x2 r6909, r6906, r6694; +} +{ +add.f16x2 r6912, r4147, r6909; +} +{ +add.f16x2 r6915, r5634, r5650; +} +{ +mul.f16x2 r6918, r6915, r6696; +} +{ +add.f16x2 r6921, r6912, r6918; +} +{ +sub.f16x2 r6924, r5612, r5660; +} +{ +mul.f16x2 r6927, r6924, r6695; +} +{ +sub.f16x2 r6930, r5628, r5644; +} +{ +mul.f16x2 r6933, r6930, r6697; +} +{ +add.f16x2 r6936, r6927, r6933; +} +{ +sub.f16x2 r6939, r6921, r6936; +} +st.local.u32 [rd4+184], r6939; +{ +add.f16x2 r6942, r5618, r5666; +} +{ +mul.f16x2 r6945, r6942, r6696; +} +{ +add.f16x2 r6948, r4147, r6945; +} +{ +add.f16x2 r6951, r5634, r5650; +} +{ +mul.f16x2 r6954, r6951, r6698; +} +{ +add.f16x2 r6957, r6948, r6954; +} +{ +sub.f16x2 r6960, r5612, r5660; +} +{ +mul.f16x2 r6963, r6960, r6697; +} +{ +sub.f16x2 r6966, r5628, r5644; +} +{ +mul.f16x2 r6969, r6966, r6700; +} +{ +add.f16x2 r6972, r6963, r6969; +} +{ +add.f16x2 r6975, r6957, r6972; +} +st.local.u32 [rd4+104], r6975; +{ +add.f16x2 r6978, r5618, r5666; +} +{ +mul.f16x2 r6981, r6978, r6696; +} +{ +add.f16x2 r6984, r4147, r6981; +} +{ +add.f16x2 r6987, r5634, r5650; +} +{ +mul.f16x2 r6990, r6987, r6698; +} +{ +add.f16x2 r6993, r6984, r6990; +} +{ +sub.f16x2 r6996, r5612, r5660; +} +{ +mul.f16x2 r6999, r6996, r6697; +} +{ +sub.f16x2 r7002, r5628, r5644; +} +{ +mul.f16x2 r7005, r7002, r6700; +} +{ +add.f16x2 r7008, r6999, r7005; +} +{ +sub.f16x2 r7011, r6993, r7008; +} +st.local.u32 [rd4+144], r7011; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r7014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7016, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7019, {low, high}; +} +{ +neg.f16x2 r7020, r7019; +} +{ +add.f16x2 r7022, r5676, r5724; +} +{ +add.f16x2 r7025, r3931, r7022; +} +{ +add.f16x2 r7028, r5692, r5708; +} +{ +add.f16x2 r7031, r7025, r7028; +} +st.local.u32 [rd4+28], r7031; +{ +add.f16x2 r7034, r5682, r5730; +} +{ +add.f16x2 r7037, r4075, r7034; +} +{ +add.f16x2 r7040, r5698, r5714; +} +{ +add.f16x2 r7043, r7037, r7040; +} +st.local.u32 [rd4+32], r7043; +{ +add.f16x2 r7046, r5676, r5724; +} +{ +mul.f16x2 r7049, r7046, r7014; +} +{ +add.f16x2 r7052, r3931, r7049; +} +{ +add.f16x2 r7055, r5692, r5708; +} +{ +mul.f16x2 r7058, r7055, r7016; +} +{ +add.f16x2 r7061, r7052, r7058; +} +{ +sub.f16x2 r7064, r5682, r5730; +} +{ +mul.f16x2 r7067, r7064, r7015; +} +{ +sub.f16x2 r7070, r5698, r5714; +} +{ +mul.f16x2 r7073, r7070, r7017; +} +{ +add.f16x2 r7076, r7067, r7073; +} +{ +sub.f16x2 r7079, r7061, r7076; +} +st.local.u32 [rd4+68], r7079; +{ +add.f16x2 r7082, r5676, r5724; +} +{ +mul.f16x2 r7085, r7082, r7014; +} +{ +add.f16x2 r7088, r3931, r7085; +} +{ +add.f16x2 r7091, r5692, r5708; +} +{ +mul.f16x2 r7094, r7091, r7016; +} +{ +add.f16x2 r7097, r7088, r7094; +} +{ +sub.f16x2 r7100, r5682, r5730; +} +{ +mul.f16x2 r7103, r7100, r7015; +} +{ +sub.f16x2 r7106, r5698, r5714; +} +{ +mul.f16x2 r7109, r7106, r7017; +} +{ +add.f16x2 r7112, r7103, r7109; +} +{ +add.f16x2 r7115, r7097, r7112; +} +st.local.u32 [rd4+188], r7115; +{ 
+add.f16x2 r7118, r5676, r5724; +} +{ +mul.f16x2 r7121, r7118, r7016; +} +{ +add.f16x2 r7124, r3931, r7121; +} +{ +add.f16x2 r7127, r5692, r5708; +} +{ +mul.f16x2 r7130, r7127, r7018; +} +{ +add.f16x2 r7133, r7124, r7130; +} +{ +sub.f16x2 r7136, r5682, r5730; +} +{ +mul.f16x2 r7139, r7136, r7017; +} +{ +sub.f16x2 r7142, r5698, r5714; +} +{ +mul.f16x2 r7145, r7142, r7020; +} +{ +add.f16x2 r7148, r7139, r7145; +} +{ +sub.f16x2 r7151, r7133, r7148; +} +st.local.u32 [rd4+108], r7151; +{ +add.f16x2 r7154, r5676, r5724; +} +{ +mul.f16x2 r7157, r7154, r7016; +} +{ +add.f16x2 r7160, r3931, r7157; +} +{ +add.f16x2 r7163, r5692, r5708; +} +{ +mul.f16x2 r7166, r7163, r7018; +} +{ +add.f16x2 r7169, r7160, r7166; +} +{ +sub.f16x2 r7172, r5682, r5730; +} +{ +mul.f16x2 r7175, r7172, r7017; +} +{ +sub.f16x2 r7178, r5698, r5714; +} +{ +mul.f16x2 r7181, r7178, r7020; +} +{ +add.f16x2 r7184, r7175, r7181; +} +{ +add.f16x2 r7187, r7169, r7184; +} +st.local.u32 [rd4+148], r7187; +{ +add.f16x2 r7190, r5682, r5730; +} +{ +mul.f16x2 r7193, r7190, r7014; +} +{ +add.f16x2 r7196, r4075, r7193; +} +{ +add.f16x2 r7199, r5698, r5714; +} +{ +mul.f16x2 r7202, r7199, r7016; +} +{ +add.f16x2 r7205, r7196, r7202; +} +{ +sub.f16x2 r7208, r5676, r5724; +} +{ +mul.f16x2 r7211, r7208, r7015; +} +{ +sub.f16x2 r7214, r5692, r5708; +} +{ +mul.f16x2 r7217, r7214, r7017; +} +{ +add.f16x2 r7220, r7211, r7217; +} +{ +add.f16x2 r7223, r7205, r7220; +} +st.local.u32 [rd4+72], r7223; +{ +add.f16x2 r7226, r5682, r5730; +} +{ +mul.f16x2 r7229, r7226, r7014; +} +{ +add.f16x2 r7232, r4075, r7229; +} +{ +add.f16x2 r7235, r5698, r5714; +} +{ +mul.f16x2 r7238, r7235, r7016; +} +{ +add.f16x2 r7241, r7232, r7238; +} +{ +sub.f16x2 r7244, r5676, r5724; +} +{ +mul.f16x2 r7247, r7244, r7015; +} +{ +sub.f16x2 r7250, r5692, r5708; +} +{ +mul.f16x2 r7253, r7250, r7017; +} +{ +add.f16x2 r7256, r7247, r7253; +} +{ +sub.f16x2 r7259, r7241, r7256; +} +st.local.u32 [rd4+192], r7259; +{ +add.f16x2 r7262, r5682, r5730; +} +{ +mul.f16x2 
r7265, r7262, r7016; +} +{ +add.f16x2 r7268, r4075, r7265; +} +{ +add.f16x2 r7271, r5698, r5714; +} +{ +mul.f16x2 r7274, r7271, r7018; +} +{ +add.f16x2 r7277, r7268, r7274; +} +{ +sub.f16x2 r7280, r5676, r5724; +} +{ +mul.f16x2 r7283, r7280, r7017; +} +{ +sub.f16x2 r7286, r5692, r5708; +} +{ +mul.f16x2 r7289, r7286, r7020; +} +{ +add.f16x2 r7292, r7283, r7289; +} +{ +add.f16x2 r7295, r7277, r7292; +} +st.local.u32 [rd4+112], r7295; +{ +add.f16x2 r7298, r5682, r5730; +} +{ +mul.f16x2 r7301, r7298, r7016; +} +{ +add.f16x2 r7304, r4075, r7301; +} +{ +add.f16x2 r7307, r5698, r5714; +} +{ +mul.f16x2 r7310, r7307, r7018; +} +{ +add.f16x2 r7313, r7304, r7310; +} +{ +sub.f16x2 r7316, r5676, r5724; +} +{ +mul.f16x2 r7319, r7316, r7017; +} +{ +sub.f16x2 r7322, r5692, r5708; +} +{ +mul.f16x2 r7325, r7322, r7020; +} +{ +add.f16x2 r7328, r7319, r7325; +} +{ +sub.f16x2 r7331, r7313, r7328; +} +st.local.u32 [rd4+152], r7331; +mul.wide.u32 rd13, r3, 1374389535; +shr.u64 rd14, rd13, 35; +cvt.u32.u64 r15, rd14; +cvt.rn.f32.u32 f460, r15; +mul.f32 f461, f460, 0f3C24B5BE; +cos.approx.f32 f456, f461; +sin.approx.f32 f462, f461; +neg.f32 f457, f462; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f457; +mov.b32 r11253, {low, high}; +} +mov.u32 r11252, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11253; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11253; +mov.b32 r7414, {high, high}; +} +bra.uni LBB1_4; +LBB1_5: +ld.local.u32 r11251, [rd7+60]; +LBB1_4: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11253; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11253; +mov.b32 r7394, {high, high}; +} +mul.wide.u32 rd15, r11252, 8; +add.s64 rd16, rd3, rd15; +add.s64 rd7, rd16, 4; +ld.local.u32 r7397, [rd16+4]; +{ +mul.f16x2 r7396, r7397, r7394; +} +{ +neg.f16x2 r7399, r7396; +} +{ +fma.rn.f16x2 r7401, r11251, r7392, r7399; +} +st.local.u32 [rd16], r7401; +{ +mul.f16x2 r7405, r11251, r7394; +} +{ 
+fma.rn.f16x2 r7408, r7397, r7392, r7405; +} +st.local.u32 [rd16+4], r7408; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7416, {low, high}; +} +{ +mul.f16x2 r7417, r7414, r7416; +} +{ +mul.f16x2 r7420, r11253, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11253; +mov.b32 r7423, {high, low}; +} +{ +fma.rn.f16x2 r7425, r7417, r7423, r7420; +} +ld.local.u32 r7443, [rd16+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7425; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7425; +mov.b32 r7431, {high, high}; +} +ld.local.u32 r7446, [rd16+12]; +{ +mul.f16x2 r7433, r7446, r7431; +} +{ +neg.f16x2 r7436, r7433; +} +{ +fma.rn.f16x2 r7438, r7443, r7429, r7436; +} +st.local.u32 [rd16+8], r7438; +{ +mul.f16x2 r7442, r7443, r7431; +} +{ +fma.rn.f16x2 r7445, r7446, r7429, r7442; +} +st.local.u32 [rd16+12], r7445; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7453, {low, high}; +} +{ +mul.f16x2 r7454, r7414, r7453; +} +{ +mul.f16x2 r7457, r7425, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7425; +mov.b32 r7460, {high, low}; +} +{ +fma.rn.f16x2 r7462, r7454, r7460, r7457; +} +ld.local.u32 r7480, [rd16+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7462; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7462; +mov.b32 r7468, {high, high}; +} +ld.local.u32 r7483, [rd16+20]; +{ +mul.f16x2 r7470, r7483, r7468; +} +{ +neg.f16x2 r7473, r7470; +} +{ +fma.rn.f16x2 r7475, r7480, r7466, r7473; +} +st.local.u32 [rd16+16], r7475; +{ +mul.f16x2 r7479, r7480, r7468; +} +{ +fma.rn.f16x2 r7482, r7483, r7466, r7479; +} +st.local.u32 [rd16+20], r7482; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7490, {low, high}; +} +{ +mul.f16x2 r7491, r7414, r7490; +} +{ +mul.f16x2 r7494, r7462, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7462; +mov.b32 r7497, {high, low}; +} 
+{ +fma.rn.f16x2 r7499, r7491, r7497, r7494; +} +ld.local.u32 r7517, [rd16+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7499; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7499; +mov.b32 r7505, {high, high}; +} +ld.local.u32 r7520, [rd16+28]; +{ +mul.f16x2 r7507, r7520, r7505; +} +{ +neg.f16x2 r7510, r7507; +} +{ +fma.rn.f16x2 r7512, r7517, r7503, r7510; +} +st.local.u32 [rd16+24], r7512; +{ +mul.f16x2 r7516, r7517, r7505; +} +{ +fma.rn.f16x2 r7519, r7520, r7503, r7516; +} +st.local.u32 [rd16+28], r7519; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7527, {low, high}; +} +{ +mul.f16x2 r7528, r7414, r7527; +} +{ +mul.f16x2 r7531, r7499, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7499; +mov.b32 r7534, {high, low}; +} +{ +fma.rn.f16x2 r7536, r7528, r7534, r7531; +} +ld.local.u32 r7550, [rd16+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7536; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7536; +mov.b32 r7542, {high, high}; +} +ld.local.u32 r7557, [rd16+36]; +{ +mul.f16x2 r7544, r7557, r7542; +} +{ +neg.f16x2 r7547, r7544; +} +{ +fma.rn.f16x2 r7549, r7550, r7540, r7547; +} +st.local.u32 [rd16+32], r7549; +{ +mul.f16x2 r7553, r7550, r7542; +} +{ +fma.rn.f16x2 r7556, r7557, r7540, r7553; +} +st.local.u32 [rd16+36], r7556; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7564, {low, high}; +} +{ +mul.f16x2 r7565, r7414, r7564; +} +{ +mul.f16x2 r7568, r7536, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7536; +mov.b32 r7571, {high, low}; +} +{ +fma.rn.f16x2 r7573, r7565, r7571, r7568; +} +ld.local.u32 r7587, [rd16+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7573; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7573; +mov.b32 r7579, {high, high}; +} +ld.local.u32 r7582, [rd16+44]; +{ +mul.f16x2 r7581, r7582, r7579; +} +{ +neg.f16x2 r7584, r7581; 
+} +{ +fma.rn.f16x2 r7586, r7587, r7577, r7584; +} +st.local.u32 [rd16+40], r7586; +{ +mul.f16x2 r7590, r7587, r7579; +} +{ +fma.rn.f16x2 r7593, r7582, r7577, r7590; +} +st.local.u32 [rd16+44], r7593; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7601, {low, high}; +} +{ +mul.f16x2 r7602, r7414, r7601; +} +{ +mul.f16x2 r7605, r7573, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7573; +mov.b32 r7608, {high, low}; +} +{ +fma.rn.f16x2 r7610, r7602, r7608, r7605; +} +ld.local.u32 r7624, [rd16+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7610; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7610; +mov.b32 r7616, {high, high}; +} +ld.local.u32 r7619, [rd16+52]; +{ +mul.f16x2 r7618, r7619, r7616; +} +{ +neg.f16x2 r7621, r7618; +} +{ +fma.rn.f16x2 r7623, r7624, r7614, r7621; +} +st.local.u32 [rd16+48], r7623; +{ +mul.f16x2 r7627, r7624, r7616; +} +{ +fma.rn.f16x2 r7630, r7619, r7614, r7627; +} +st.local.u32 [rd16+52], r7630; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7638, {low, high}; +} +{ +mul.f16x2 r7639, r7414, r7638; +} +{ +mul.f16x2 r7642, r7610, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7610; +mov.b32 r7645, {high, low}; +} +{ +fma.rn.f16x2 r7647, r7639, r7645, r7642; +} +ld.local.u32 r7661, [rd16+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7647; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7647; +mov.b32 r7653, {high, high}; +} +ld.local.u32 r7656, [rd16+60]; +{ +mul.f16x2 r7655, r7656, r7653; +} +{ +neg.f16x2 r7658, r7655; +} +{ +fma.rn.f16x2 r7660, r7661, r7651, r7658; +} +st.local.u32 [rd16+56], r7660; +{ +mul.f16x2 r7664, r7661, r7653; +} +{ +fma.rn.f16x2 r7667, r7656, r7651, r7664; +} +st.local.u32 [rd16+60], r7667; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7675, {low, high}; +} +{ +mul.f16x2 r7676, r7414, r7675; 
+} +{ +mul.f16x2 r7679, r7647, r7412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7647; +mov.b32 r7682, {high, low}; +} +{ +fma.rn.f16x2 r11253, r7676, r7682, r7679; +} +add.s32 r11252, r11252, 8; +setp.eq.s32 p2, r11252, 25; +@p2 bra LBB1_6; +bra.uni LBB1_5; +LBB1_6: +mul.lo.s32 r11193, r15, 25; +sub.s32 r11194, r3, r11193; +shl.b32 r11195, r11194, 2; +add.s32 r11196, r12, r11195; +barrier.sync 0; +mad.lo.s32 r11197, r15, 2500, r11196; +ld.local.u32 r11198, [rd3]; +st.shared.u32 [r11197], r11198; +ld.local.u32 r11199, [rd4+4]; +st.shared.u32 [r11197+100], r11199; +ld.local.u32 r11200, [rd4+12]; +st.shared.u32 [r11197+200], r11200; +ld.local.u32 r11201, [rd4+20]; +st.shared.u32 [r11197+300], r11201; +ld.local.u32 r11202, [rd4+28]; +st.shared.u32 [r11197+400], r11202; +ld.local.u32 r11203, [rd4+36]; +st.shared.u32 [r11197+500], r11203; +ld.local.u32 r11204, [rd4+44]; +st.shared.u32 [r11197+600], r11204; +ld.local.u32 r11205, [rd4+52]; +st.shared.u32 [r11197+700], r11205; +ld.local.u32 r11206, [rd4+60]; +st.shared.u32 [r11197+800], r11206; +ld.local.u32 r11207, [rd4+68]; +st.shared.u32 [r11197+900], r11207; +ld.local.u32 r11208, [rd4+76]; +st.shared.u32 [r11197+1000], r11208; +ld.local.u32 r11209, [rd4+84]; +st.shared.u32 [r11197+1100], r11209; +ld.local.u32 r11210, [rd4+92]; +st.shared.u32 [r11197+1200], r11210; +ld.local.u32 r11211, [rd4+100]; +st.shared.u32 [r11197+1300], r11211; +ld.local.u32 r11212, [rd4+108]; +st.shared.u32 [r11197+1400], r11212; +ld.local.u32 r11213, [rd4+116]; +st.shared.u32 [r11197+1500], r11213; +ld.local.u32 r11214, [rd4+124]; +st.shared.u32 [r11197+1600], r11214; +ld.local.u32 r11215, [rd4+132]; +st.shared.u32 [r11197+1700], r11215; +ld.local.u32 r11216, [rd4+140]; +st.shared.u32 [r11197+1800], r11216; +ld.local.u32 r11217, [rd4+148]; +st.shared.u32 [r11197+1900], r11217; +ld.local.u32 r11218, [rd4+156]; +st.shared.u32 [r11197+2000], r11218; +ld.local.u32 r11219, [rd4+164]; +st.shared.u32 [r11197+2100], r11219; +ld.local.u32 
r11220, [rd4+172]; +st.shared.u32 [r11197+2200], r11220; +ld.local.u32 r11221, [rd4+180]; +st.shared.u32 [r11197+2300], r11221; +ld.local.u32 r11222, [rd4+188]; +st.shared.u32 [r11197+2400], r11222; +barrier.sync 0; +ld.shared.u32 r7700, [r13]; +ld.shared.u32 r8020, [r13+2500]; +ld.shared.u32 r8340, [r13+5000]; +ld.shared.u32 r8660, [r13+7500]; +ld.shared.u32 r8980, [r13+10000]; +ld.shared.u32 r7697, [r13+12500]; +ld.shared.u32 r8017, [r13+15000]; +ld.shared.u32 r8337, [r13+17500]; +ld.shared.u32 r8657, [r13+20000]; +ld.shared.u32 r8977, [r13+22500]; +ld.shared.u32 r7703, [r13+25000]; +ld.shared.u32 r8023, [r13+27500]; +ld.shared.u32 r8343, [r13+30000]; +ld.shared.u32 r8663, [r13+32500]; +ld.shared.u32 r8983, [r13+35000]; +ld.shared.u32 r7704, [r13+37500]; +ld.shared.u32 r8024, [r13+40000]; +ld.shared.u32 r8344, [r13+42500]; +ld.shared.u32 r8664, [r13+45000]; +ld.shared.u32 r8984, [r13+47500]; +ld.shared.u32 r7698, [r13+50000]; +ld.shared.u32 r8018, [r13+52500]; +ld.shared.u32 r8338, [r13+55000]; +ld.shared.u32 r8658, [r13+57500]; +ld.shared.u32 r8978, [r13+60000]; +barrier.sync 0; +ld.local.u32 r11223, [rd4]; +st.shared.u32 [r11197], r11223; +ld.local.u32 r11224, [rd4+8]; +st.shared.u32 [r11197+100], r11224; +ld.local.u32 r11225, [rd4+16]; +st.shared.u32 [r11197+200], r11225; +ld.local.u32 r11226, [rd4+24]; +st.shared.u32 [r11197+300], r11226; +ld.local.u32 r11227, [rd4+32]; +st.shared.u32 [r11197+400], r11227; +ld.local.u32 r11228, [rd4+40]; +st.shared.u32 [r11197+500], r11228; +ld.local.u32 r11229, [rd4+48]; +st.shared.u32 [r11197+600], r11229; +ld.local.u32 r11230, [rd4+56]; +st.shared.u32 [r11197+700], r11230; +ld.local.u32 r11231, [rd4+64]; +st.shared.u32 [r11197+800], r11231; +ld.local.u32 r11232, [rd4+72]; +st.shared.u32 [r11197+900], r11232; +ld.local.u32 r11233, [rd4+80]; +st.shared.u32 [r11197+1000], r11233; +ld.local.u32 r11234, [rd4+88]; +st.shared.u32 [r11197+1100], r11234; +ld.local.u32 r11235, [rd4+96]; +st.shared.u32 [r11197+1200], r11235; 
+ld.local.u32 r11236, [rd4+104]; +st.shared.u32 [r11197+1300], r11236; +ld.local.u32 r11237, [rd4+112]; +st.shared.u32 [r11197+1400], r11237; +ld.local.u32 r11238, [rd4+120]; +st.shared.u32 [r11197+1500], r11238; +ld.local.u32 r11239, [rd4+128]; +st.shared.u32 [r11197+1600], r11239; +ld.local.u32 r11240, [rd4+136]; +st.shared.u32 [r11197+1700], r11240; +ld.local.u32 r11241, [rd4+144]; +st.shared.u32 [r11197+1800], r11241; +ld.local.u32 r11242, [rd4+152]; +st.shared.u32 [r11197+1900], r11242; +ld.local.u32 r11243, [rd4+160]; +st.shared.u32 [r11197+2000], r11243; +ld.local.u32 r11244, [rd4+168]; +st.shared.u32 [r11197+2100], r11244; +ld.local.u32 r11245, [rd4+176]; +st.shared.u32 [r11197+2200], r11245; +ld.local.u32 r11246, [rd4+184]; +st.shared.u32 [r11197+2300], r11246; +ld.local.u32 r11247, [rd4+192]; +st.shared.u32 [r11197+2400], r11247; +barrier.sync 0; +ld.shared.u32 r7712, [r13]; +ld.shared.u32 r8032, [r13+2500]; +ld.shared.u32 r8352, [r13+5000]; +ld.shared.u32 r8672, [r13+7500]; +ld.shared.u32 r8992, [r13+10000]; +ld.shared.u32 r7709, [r13+12500]; +ld.shared.u32 r8029, [r13+15000]; +ld.shared.u32 r8349, [r13+17500]; +ld.shared.u32 r8669, [r13+20000]; +ld.shared.u32 r8989, [r13+22500]; +ld.shared.u32 r7715, [r13+25000]; +ld.shared.u32 r8035, [r13+27500]; +ld.shared.u32 r8355, [r13+30000]; +ld.shared.u32 r8675, [r13+32500]; +ld.shared.u32 r8995, [r13+35000]; +ld.shared.u32 r7716, [r13+37500]; +ld.shared.u32 r8036, [r13+40000]; +ld.shared.u32 r8356, [r13+42500]; +ld.shared.u32 r8676, [r13+45000]; +ld.shared.u32 r8996, [r13+47500]; +ld.shared.u32 r7710, [r13+50000]; +ld.shared.u32 r8030, [r13+52500]; +ld.shared.u32 r8350, [r13+55000]; +ld.shared.u32 r8670, [r13+57500]; +ld.shared.u32 r8990, [r13+60000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7688, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7689, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7693, {low, high}; +} +{ +neg.f16x2 r7694, r7693; +} +{ +add.f16x2 r7696, r7697, r7698; +} +{ +add.f16x2 r7699, r7700, r7696; +} +{ +add.f16x2 r7702, r7703, r7704; +} +{ +add.f16x2 r7705, r7699, r7702; +} +{ +add.f16x2 r7708, r7709, r7710; +} +{ +add.f16x2 r7711, r7712, r7708; +} +{ +add.f16x2 r7714, r7715, r7716; +} +{ +add.f16x2 r7717, r7711, r7714; +} +{ +add.f16x2 r7720, r7697, r7698; +} +{ +mul.f16x2 r7723, r7720, r7688; +} +{ +add.f16x2 r7726, r7700, r7723; +} +{ +add.f16x2 r7729, r7703, r7704; +} +{ +mul.f16x2 r7732, r7729, r7690; +} +{ +add.f16x2 r7735, r7726, r7732; +} +{ +sub.f16x2 r7738, r7709, r7710; +} +{ +mul.f16x2 r7741, r7738, r7689; +} +{ +sub.f16x2 r7744, r7715, r7716; +} +{ +mul.f16x2 r7747, r7744, r7691; +} +{ +add.f16x2 r7750, r7741, r7747; +} +{ +sub.f16x2 r7753, r7735, r7750; +} +{ +add.f16x2 r7756, r7697, r7698; +} +{ +mul.f16x2 r7759, r7756, r7688; +} +{ +add.f16x2 r7762, r7700, r7759; +} +{ +add.f16x2 r7765, r7703, r7704; +} +{ +mul.f16x2 r7768, r7765, r7690; +} +{ +add.f16x2 r7771, r7762, r7768; +} +{ +sub.f16x2 r7774, r7709, r7710; +} +{ +mul.f16x2 r7777, r7774, r7689; +} +{ +sub.f16x2 r7780, r7715, r7716; +} +{ +mul.f16x2 r7783, r7780, r7691; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7771, r7786; +} +{ +add.f16x2 r7792, r7697, r7698; +} +{ +mul.f16x2 r7795, r7792, r7690; +} +{ +add.f16x2 r7798, r7700, r7795; +} +{ +add.f16x2 r7801, r7703, r7704; +} +{ +mul.f16x2 r7804, r7801, r7692; +} +{ +add.f16x2 r7807, r7798, r7804; +} +{ +sub.f16x2 r7810, r7709, r7710; +} +{ +mul.f16x2 r7813, r7810, r7691; +} +{ +sub.f16x2 r7816, r7715, r7716; +} +{ 
+mul.f16x2 r7819, r7816, r7694; +} +{ +add.f16x2 r7822, r7813, r7819; +} +{ +sub.f16x2 r7825, r7807, r7822; +} +{ +add.f16x2 r7828, r7697, r7698; +} +{ +mul.f16x2 r7831, r7828, r7690; +} +{ +add.f16x2 r7834, r7700, r7831; +} +{ +add.f16x2 r7837, r7703, r7704; +} +{ +mul.f16x2 r7840, r7837, r7692; +} +{ +add.f16x2 r7843, r7834, r7840; +} +{ +sub.f16x2 r7846, r7709, r7710; +} +{ +mul.f16x2 r7849, r7846, r7691; +} +{ +sub.f16x2 r7852, r7715, r7716; +} +{ +mul.f16x2 r7855, r7852, r7694; +} +{ +add.f16x2 r7858, r7849, r7855; +} +{ +add.f16x2 r7861, r7843, r7858; +} +{ +add.f16x2 r7864, r7709, r7710; +} +{ +mul.f16x2 r7867, r7864, r7688; +} +{ +add.f16x2 r7870, r7712, r7867; +} +{ +add.f16x2 r7873, r7715, r7716; +} +{ +mul.f16x2 r7876, r7873, r7690; +} +{ +add.f16x2 r7879, r7870, r7876; +} +{ +sub.f16x2 r7882, r7697, r7698; +} +{ +mul.f16x2 r7885, r7882, r7689; +} +{ +sub.f16x2 r7888, r7703, r7704; +} +{ +mul.f16x2 r7891, r7888, r7691; +} +{ +add.f16x2 r7894, r7885, r7891; +} +{ +add.f16x2 r7897, r7879, r7894; +} +{ +add.f16x2 r7900, r7709, r7710; +} +{ +mul.f16x2 r7903, r7900, r7688; +} +{ +add.f16x2 r7906, r7712, r7903; +} +{ +add.f16x2 r7909, r7715, r7716; +} +{ +mul.f16x2 r7912, r7909, r7690; +} +{ +add.f16x2 r7915, r7906, r7912; +} +{ +sub.f16x2 r7918, r7697, r7698; +} +{ +mul.f16x2 r7921, r7918, r7689; +} +{ +sub.f16x2 r7924, r7703, r7704; +} +{ +mul.f16x2 r7927, r7924, r7691; +} +{ +add.f16x2 r7930, r7921, r7927; +} +{ +sub.f16x2 r7933, r7915, r7930; +} +{ +add.f16x2 r7936, r7709, r7710; +} +{ +mul.f16x2 r7939, r7936, r7690; +} +{ +add.f16x2 r7942, r7712, r7939; +} +{ +add.f16x2 r7945, r7715, r7716; +} +{ +mul.f16x2 r7948, r7945, r7692; +} +{ +add.f16x2 r7951, r7942, r7948; +} +{ +sub.f16x2 r7954, r7697, r7698; +} +{ +mul.f16x2 r7957, r7954, r7691; +} +{ +sub.f16x2 r7960, r7703, r7704; +} +{ +mul.f16x2 r7963, r7960, r7694; +} +{ +add.f16x2 r7966, r7957, r7963; +} +{ +add.f16x2 r7969, r7951, r7966; +} +{ +add.f16x2 r7972, r7709, r7710; +} +{ +mul.f16x2 r7975, 
r7972, r7690; +} +{ +add.f16x2 r7978, r7712, r7975; +} +{ +add.f16x2 r7981, r7715, r7716; +} +{ +mul.f16x2 r7984, r7981, r7692; +} +{ +add.f16x2 r7987, r7978, r7984; +} +{ +sub.f16x2 r7990, r7697, r7698; +} +{ +mul.f16x2 r7993, r7990, r7691; +} +{ +sub.f16x2 r7996, r7703, r7704; +} +{ +mul.f16x2 r7999, r7996, r7694; +} +{ +add.f16x2 r8002, r7993, r7999; +} +{ +sub.f16x2 r8005, r7987, r8002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8013, {low, high}; +} +{ +neg.f16x2 r8014, r8013; +} +{ +add.f16x2 r8016, r8017, r8018; +} +{ +add.f16x2 r8019, r8020, r8016; +} +{ +add.f16x2 r8022, r8023, r8024; +} +{ +add.f16x2 r8025, r8019, r8022; +} +{ +add.f16x2 r8028, r8029, r8030; +} +{ +add.f16x2 r8031, r8032, r8028; +} +{ +add.f16x2 r8034, r8035, r8036; +} +{ +add.f16x2 r8037, r8031, r8034; +} +{ +add.f16x2 r8040, r8017, r8018; +} +{ +mul.f16x2 r8043, r8040, r8008; +} +{ +add.f16x2 r8046, r8020, r8043; +} +{ +add.f16x2 r8049, r8023, r8024; +} +{ +mul.f16x2 r8052, r8049, r8010; +} +{ +add.f16x2 r8055, r8046, r8052; +} +{ +sub.f16x2 r8058, r8029, r8030; +} +{ +mul.f16x2 r8061, r8058, r8009; +} +{ +sub.f16x2 r8064, r8035, r8036; +} +{ +mul.f16x2 r8067, r8064, r8011; +} +{ +add.f16x2 r8070, r8061, r8067; +} +{ +sub.f16x2 r8073, r8055, r8070; +} +{ +add.f16x2 r8076, r8017, r8018; +} +{ +mul.f16x2 r8079, r8076, r8008; +} +{ +add.f16x2 r8082, r8020, r8079; +} +{ +add.f16x2 r8085, 
r8023, r8024; +} +{ +mul.f16x2 r8088, r8085, r8010; +} +{ +add.f16x2 r8091, r8082, r8088; +} +{ +sub.f16x2 r8094, r8029, r8030; +} +{ +mul.f16x2 r8097, r8094, r8009; +} +{ +sub.f16x2 r8100, r8035, r8036; +} +{ +mul.f16x2 r8103, r8100, r8011; +} +{ +add.f16x2 r8106, r8097, r8103; +} +{ +add.f16x2 r8109, r8091, r8106; +} +{ +add.f16x2 r8112, r8017, r8018; +} +{ +mul.f16x2 r8115, r8112, r8010; +} +{ +add.f16x2 r8118, r8020, r8115; +} +{ +add.f16x2 r8121, r8023, r8024; +} +{ +mul.f16x2 r8124, r8121, r8012; +} +{ +add.f16x2 r8127, r8118, r8124; +} +{ +sub.f16x2 r8130, r8029, r8030; +} +{ +mul.f16x2 r8133, r8130, r8011; +} +{ +sub.f16x2 r8136, r8035, r8036; +} +{ +mul.f16x2 r8139, r8136, r8014; +} +{ +add.f16x2 r8142, r8133, r8139; +} +{ +sub.f16x2 r8145, r8127, r8142; +} +{ +add.f16x2 r8148, r8017, r8018; +} +{ +mul.f16x2 r8151, r8148, r8010; +} +{ +add.f16x2 r8154, r8020, r8151; +} +{ +add.f16x2 r8157, r8023, r8024; +} +{ +mul.f16x2 r8160, r8157, r8012; +} +{ +add.f16x2 r8163, r8154, r8160; +} +{ +sub.f16x2 r8166, r8029, r8030; +} +{ +mul.f16x2 r8169, r8166, r8011; +} +{ +sub.f16x2 r8172, r8035, r8036; +} +{ +mul.f16x2 r8175, r8172, r8014; +} +{ +add.f16x2 r8178, r8169, r8175; +} +{ +add.f16x2 r8181, r8163, r8178; +} +{ +add.f16x2 r8184, r8029, r8030; +} +{ +mul.f16x2 r8187, r8184, r8008; +} +{ +add.f16x2 r8190, r8032, r8187; +} +{ +add.f16x2 r8193, r8035, r8036; +} +{ +mul.f16x2 r8196, r8193, r8010; +} +{ +add.f16x2 r8199, r8190, r8196; +} +{ +sub.f16x2 r8202, r8017, r8018; +} +{ +mul.f16x2 r8205, r8202, r8009; +} +{ +sub.f16x2 r8208, r8023, r8024; +} +{ +mul.f16x2 r8211, r8208, r8011; +} +{ +add.f16x2 r8214, r8205, r8211; +} +{ +add.f16x2 r8217, r8199, r8214; +} +{ +add.f16x2 r8220, r8029, r8030; +} +{ +mul.f16x2 r8223, r8220, r8008; +} +{ +add.f16x2 r8226, r8032, r8223; +} +{ +add.f16x2 r8229, r8035, r8036; +} +{ +mul.f16x2 r8232, r8229, r8010; +} +{ +add.f16x2 r8235, r8226, r8232; +} +{ +sub.f16x2 r8238, r8017, r8018; +} +{ +mul.f16x2 r8241, r8238, r8009; +} +{ 
+sub.f16x2 r8244, r8023, r8024; +} +{ +mul.f16x2 r8247, r8244, r8011; +} +{ +add.f16x2 r8250, r8241, r8247; +} +{ +sub.f16x2 r8253, r8235, r8250; +} +{ +add.f16x2 r8256, r8029, r8030; +} +{ +mul.f16x2 r8259, r8256, r8010; +} +{ +add.f16x2 r8262, r8032, r8259; +} +{ +add.f16x2 r8265, r8035, r8036; +} +{ +mul.f16x2 r8268, r8265, r8012; +} +{ +add.f16x2 r8271, r8262, r8268; +} +{ +sub.f16x2 r8274, r8017, r8018; +} +{ +mul.f16x2 r8277, r8274, r8011; +} +{ +sub.f16x2 r8280, r8023, r8024; +} +{ +mul.f16x2 r8283, r8280, r8014; +} +{ +add.f16x2 r8286, r8277, r8283; +} +{ +add.f16x2 r8289, r8271, r8286; +} +{ +add.f16x2 r8292, r8029, r8030; +} +{ +mul.f16x2 r8295, r8292, r8010; +} +{ +add.f16x2 r8298, r8032, r8295; +} +{ +add.f16x2 r8301, r8035, r8036; +} +{ +mul.f16x2 r8304, r8301, r8012; +} +{ +add.f16x2 r8307, r8298, r8304; +} +{ +sub.f16x2 r8310, r8017, r8018; +} +{ +mul.f16x2 r8313, r8310, r8011; +} +{ +sub.f16x2 r8316, r8023, r8024; +} +{ +mul.f16x2 r8319, r8316, r8014; +} +{ +add.f16x2 r8322, r8313, r8319; +} +{ +sub.f16x2 r8325, r8307, r8322; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8328, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8329, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8332, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8333, {low, high}; +} +{ +neg.f16x2 r8334, r8333; +} +{ +add.f16x2 r8336, r8337, r8338; +} +{ +add.f16x2 r8339, r8340, r8336; +} +{ +add.f16x2 r8342, r8343, r8344; +} +{ +add.f16x2 r8345, r8339, r8342; +} +{ +add.f16x2 r8348, r8349, r8350; +} +{ +add.f16x2 r8351, r8352, r8348; +} +{ 
+add.f16x2 r8354, r8355, r8356; +} +{ +add.f16x2 r8357, r8351, r8354; +} +{ +add.f16x2 r8360, r8337, r8338; +} +{ +mul.f16x2 r8363, r8360, r8328; +} +{ +add.f16x2 r8366, r8340, r8363; +} +{ +add.f16x2 r8369, r8343, r8344; +} +{ +mul.f16x2 r8372, r8369, r8330; +} +{ +add.f16x2 r8375, r8366, r8372; +} +{ +sub.f16x2 r8378, r8349, r8350; +} +{ +mul.f16x2 r8381, r8378, r8329; +} +{ +sub.f16x2 r8384, r8355, r8356; +} +{ +mul.f16x2 r8387, r8384, r8331; +} +{ +add.f16x2 r8390, r8381, r8387; +} +{ +sub.f16x2 r8393, r8375, r8390; +} +{ +add.f16x2 r8396, r8337, r8338; +} +{ +mul.f16x2 r8399, r8396, r8328; +} +{ +add.f16x2 r8402, r8340, r8399; +} +{ +add.f16x2 r8405, r8343, r8344; +} +{ +mul.f16x2 r8408, r8405, r8330; +} +{ +add.f16x2 r8411, r8402, r8408; +} +{ +sub.f16x2 r8414, r8349, r8350; +} +{ +mul.f16x2 r8417, r8414, r8329; +} +{ +sub.f16x2 r8420, r8355, r8356; +} +{ +mul.f16x2 r8423, r8420, r8331; +} +{ +add.f16x2 r8426, r8417, r8423; +} +{ +add.f16x2 r8429, r8411, r8426; +} +{ +add.f16x2 r8432, r8337, r8338; +} +{ +mul.f16x2 r8435, r8432, r8330; +} +{ +add.f16x2 r8438, r8340, r8435; +} +{ +add.f16x2 r8441, r8343, r8344; +} +{ +mul.f16x2 r8444, r8441, r8332; +} +{ +add.f16x2 r8447, r8438, r8444; +} +{ +sub.f16x2 r8450, r8349, r8350; +} +{ +mul.f16x2 r8453, r8450, r8331; +} +{ +sub.f16x2 r8456, r8355, r8356; +} +{ +mul.f16x2 r8459, r8456, r8334; +} +{ +add.f16x2 r8462, r8453, r8459; +} +{ +sub.f16x2 r8465, r8447, r8462; +} +{ +add.f16x2 r8468, r8337, r8338; +} +{ +mul.f16x2 r8471, r8468, r8330; +} +{ +add.f16x2 r8474, r8340, r8471; +} +{ +add.f16x2 r8477, r8343, r8344; +} +{ +mul.f16x2 r8480, r8477, r8332; +} +{ +add.f16x2 r8483, r8474, r8480; +} +{ +sub.f16x2 r8486, r8349, r8350; +} +{ +mul.f16x2 r8489, r8486, r8331; +} +{ +sub.f16x2 r8492, r8355, r8356; +} +{ +mul.f16x2 r8495, r8492, r8334; +} +{ +add.f16x2 r8498, r8489, r8495; +} +{ +add.f16x2 r8501, r8483, r8498; +} +{ +add.f16x2 r8504, r8349, r8350; +} +{ +mul.f16x2 r8507, r8504, r8328; +} +{ +add.f16x2 r8510, 
r8352, r8507; +} +{ +add.f16x2 r8513, r8355, r8356; +} +{ +mul.f16x2 r8516, r8513, r8330; +} +{ +add.f16x2 r8519, r8510, r8516; +} +{ +sub.f16x2 r8522, r8337, r8338; +} +{ +mul.f16x2 r8525, r8522, r8329; +} +{ +sub.f16x2 r8528, r8343, r8344; +} +{ +mul.f16x2 r8531, r8528, r8331; +} +{ +add.f16x2 r8534, r8525, r8531; +} +{ +add.f16x2 r8537, r8519, r8534; +} +{ +add.f16x2 r8540, r8349, r8350; +} +{ +mul.f16x2 r8543, r8540, r8328; +} +{ +add.f16x2 r8546, r8352, r8543; +} +{ +add.f16x2 r8549, r8355, r8356; +} +{ +mul.f16x2 r8552, r8549, r8330; +} +{ +add.f16x2 r8555, r8546, r8552; +} +{ +sub.f16x2 r8558, r8337, r8338; +} +{ +mul.f16x2 r8561, r8558, r8329; +} +{ +sub.f16x2 r8564, r8343, r8344; +} +{ +mul.f16x2 r8567, r8564, r8331; +} +{ +add.f16x2 r8570, r8561, r8567; +} +{ +sub.f16x2 r8573, r8555, r8570; +} +{ +add.f16x2 r8576, r8349, r8350; +} +{ +mul.f16x2 r8579, r8576, r8330; +} +{ +add.f16x2 r8582, r8352, r8579; +} +{ +add.f16x2 r8585, r8355, r8356; +} +{ +mul.f16x2 r8588, r8585, r8332; +} +{ +add.f16x2 r8591, r8582, r8588; +} +{ +sub.f16x2 r8594, r8337, r8338; +} +{ +mul.f16x2 r8597, r8594, r8331; +} +{ +sub.f16x2 r8600, r8343, r8344; +} +{ +mul.f16x2 r8603, r8600, r8334; +} +{ +add.f16x2 r8606, r8597, r8603; +} +{ +add.f16x2 r8609, r8591, r8606; +} +{ +add.f16x2 r8612, r8349, r8350; +} +{ +mul.f16x2 r8615, r8612, r8330; +} +{ +add.f16x2 r8618, r8352, r8615; +} +{ +add.f16x2 r8621, r8355, r8356; +} +{ +mul.f16x2 r8624, r8621, r8332; +} +{ +add.f16x2 r8627, r8618, r8624; +} +{ +sub.f16x2 r8630, r8337, r8338; +} +{ +mul.f16x2 r8633, r8630, r8331; +} +{ +sub.f16x2 r8636, r8343, r8344; +} +{ +mul.f16x2 r8639, r8636, r8334; +} +{ +add.f16x2 r8642, r8633, r8639; +} +{ +sub.f16x2 r8645, r8627, r8642; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8648, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8653, {low, high}; +} +{ +neg.f16x2 r8654, r8653; +} +{ +add.f16x2 r8656, r8657, r8658; +} +{ +add.f16x2 r8659, r8660, r8656; +} +{ +add.f16x2 r8662, r8663, r8664; +} +{ +add.f16x2 r8665, r8659, r8662; +} +{ +add.f16x2 r8668, r8669, r8670; +} +{ +add.f16x2 r8671, r8672, r8668; +} +{ +add.f16x2 r8674, r8675, r8676; +} +{ +add.f16x2 r8677, r8671, r8674; +} +{ +add.f16x2 r8680, r8657, r8658; +} +{ +mul.f16x2 r8683, r8680, r8648; +} +{ +add.f16x2 r8686, r8660, r8683; +} +{ +add.f16x2 r8689, r8663, r8664; +} +{ +mul.f16x2 r8692, r8689, r8650; +} +{ +add.f16x2 r8695, r8686, r8692; +} +{ +sub.f16x2 r8698, r8669, r8670; +} +{ +mul.f16x2 r8701, r8698, r8649; +} +{ +sub.f16x2 r8704, r8675, r8676; +} +{ +mul.f16x2 r8707, r8704, r8651; +} +{ +add.f16x2 r8710, r8701, r8707; +} +{ +sub.f16x2 r8713, r8695, r8710; +} +{ +add.f16x2 r8716, r8657, r8658; +} +{ +mul.f16x2 r8719, r8716, r8648; +} +{ +add.f16x2 r8722, r8660, r8719; +} +{ +add.f16x2 r8725, r8663, r8664; +} +{ +mul.f16x2 r8728, r8725, r8650; +} +{ +add.f16x2 r8731, r8722, r8728; +} +{ +sub.f16x2 r8734, r8669, r8670; +} +{ +mul.f16x2 r8737, r8734, r8649; +} +{ +sub.f16x2 r8740, r8675, r8676; +} +{ +mul.f16x2 r8743, r8740, r8651; +} +{ +add.f16x2 r8746, r8737, r8743; +} +{ +add.f16x2 r8749, r8731, r8746; +} +{ +add.f16x2 r8752, r8657, r8658; +} +{ +mul.f16x2 r8755, r8752, r8650; +} +{ +add.f16x2 r8758, r8660, r8755; +} +{ +add.f16x2 r8761, r8663, r8664; +} +{ +mul.f16x2 r8764, r8761, r8652; +} +{ +add.f16x2 r8767, r8758, r8764; +} +{ +sub.f16x2 r8770, r8669, r8670; +} +{ +mul.f16x2 r8773, r8770, r8651; +} +{ +sub.f16x2 r8776, r8675, r8676; +} +{ 
+mul.f16x2 r8779, r8776, r8654; +} +{ +add.f16x2 r8782, r8773, r8779; +} +{ +sub.f16x2 r8785, r8767, r8782; +} +{ +add.f16x2 r8788, r8657, r8658; +} +{ +mul.f16x2 r8791, r8788, r8650; +} +{ +add.f16x2 r8794, r8660, r8791; +} +{ +add.f16x2 r8797, r8663, r8664; +} +{ +mul.f16x2 r8800, r8797, r8652; +} +{ +add.f16x2 r8803, r8794, r8800; +} +{ +sub.f16x2 r8806, r8669, r8670; +} +{ +mul.f16x2 r8809, r8806, r8651; +} +{ +sub.f16x2 r8812, r8675, r8676; +} +{ +mul.f16x2 r8815, r8812, r8654; +} +{ +add.f16x2 r8818, r8809, r8815; +} +{ +add.f16x2 r8821, r8803, r8818; +} +{ +add.f16x2 r8824, r8669, r8670; +} +{ +mul.f16x2 r8827, r8824, r8648; +} +{ +add.f16x2 r8830, r8672, r8827; +} +{ +add.f16x2 r8833, r8675, r8676; +} +{ +mul.f16x2 r8836, r8833, r8650; +} +{ +add.f16x2 r8839, r8830, r8836; +} +{ +sub.f16x2 r8842, r8657, r8658; +} +{ +mul.f16x2 r8845, r8842, r8649; +} +{ +sub.f16x2 r8848, r8663, r8664; +} +{ +mul.f16x2 r8851, r8848, r8651; +} +{ +add.f16x2 r8854, r8845, r8851; +} +{ +add.f16x2 r8857, r8839, r8854; +} +{ +add.f16x2 r8860, r8669, r8670; +} +{ +mul.f16x2 r8863, r8860, r8648; +} +{ +add.f16x2 r8866, r8672, r8863; +} +{ +add.f16x2 r8869, r8675, r8676; +} +{ +mul.f16x2 r8872, r8869, r8650; +} +{ +add.f16x2 r8875, r8866, r8872; +} +{ +sub.f16x2 r8878, r8657, r8658; +} +{ +mul.f16x2 r8881, r8878, r8649; +} +{ +sub.f16x2 r8884, r8663, r8664; +} +{ +mul.f16x2 r8887, r8884, r8651; +} +{ +add.f16x2 r8890, r8881, r8887; +} +{ +sub.f16x2 r8893, r8875, r8890; +} +{ +add.f16x2 r8896, r8669, r8670; +} +{ +mul.f16x2 r8899, r8896, r8650; +} +{ +add.f16x2 r8902, r8672, r8899; +} +{ +add.f16x2 r8905, r8675, r8676; +} +{ +mul.f16x2 r8908, r8905, r8652; +} +{ +add.f16x2 r8911, r8902, r8908; +} +{ +sub.f16x2 r8914, r8657, r8658; +} +{ +mul.f16x2 r8917, r8914, r8651; +} +{ +sub.f16x2 r8920, r8663, r8664; +} +{ +mul.f16x2 r8923, r8920, r8654; +} +{ +add.f16x2 r8926, r8917, r8923; +} +{ +add.f16x2 r8929, r8911, r8926; +} +{ +add.f16x2 r8932, r8669, r8670; +} +{ +mul.f16x2 r8935, 
r8932, r8650; +} +{ +add.f16x2 r8938, r8672, r8935; +} +{ +add.f16x2 r8941, r8675, r8676; +} +{ +mul.f16x2 r8944, r8941, r8652; +} +{ +add.f16x2 r8947, r8938, r8944; +} +{ +sub.f16x2 r8950, r8657, r8658; +} +{ +mul.f16x2 r8953, r8950, r8651; +} +{ +sub.f16x2 r8956, r8663, r8664; +} +{ +mul.f16x2 r8959, r8956, r8654; +} +{ +add.f16x2 r8962, r8953, r8959; +} +{ +sub.f16x2 r8965, r8947, r8962; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8968, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8969, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8972, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8973, {low, high}; +} +{ +neg.f16x2 r8974, r8973; +} +{ +add.f16x2 r8976, r8977, r8978; +} +{ +add.f16x2 r8979, r8980, r8976; +} +{ +add.f16x2 r8982, r8983, r8984; +} +{ +add.f16x2 r8985, r8979, r8982; +} +{ +add.f16x2 r8988, r8989, r8990; +} +{ +add.f16x2 r8991, r8992, r8988; +} +{ +add.f16x2 r8994, r8995, r8996; +} +{ +add.f16x2 r8997, r8991, r8994; +} +{ +add.f16x2 r9000, r8977, r8978; +} +{ +mul.f16x2 r9003, r9000, r8968; +} +{ +add.f16x2 r9006, r8980, r9003; +} +{ +add.f16x2 r9009, r8983, r8984; +} +{ +mul.f16x2 r9012, r9009, r8970; +} +{ +add.f16x2 r9015, r9006, r9012; +} +{ +sub.f16x2 r9018, r8989, r8990; +} +{ +mul.f16x2 r9021, r9018, r8969; +} +{ +sub.f16x2 r9024, r8995, r8996; +} +{ +mul.f16x2 r9027, r9024, r8971; +} +{ +add.f16x2 r9030, r9021, r9027; +} +{ +sub.f16x2 r9033, r9015, r9030; +} +{ +add.f16x2 r9036, r8977, r8978; +} +{ +mul.f16x2 r9039, r9036, r8968; +} +{ +add.f16x2 r9042, r8980, r9039; +} +{ +add.f16x2 r9045, 
r8983, r8984; +} +{ +mul.f16x2 r9048, r9045, r8970; +} +{ +add.f16x2 r9051, r9042, r9048; +} +{ +sub.f16x2 r9054, r8989, r8990; +} +{ +mul.f16x2 r9057, r9054, r8969; +} +{ +sub.f16x2 r9060, r8995, r8996; +} +{ +mul.f16x2 r9063, r9060, r8971; +} +{ +add.f16x2 r9066, r9057, r9063; +} +{ +add.f16x2 r9069, r9051, r9066; +} +{ +add.f16x2 r9072, r8977, r8978; +} +{ +mul.f16x2 r9075, r9072, r8970; +} +{ +add.f16x2 r9078, r8980, r9075; +} +{ +add.f16x2 r9081, r8983, r8984; +} +{ +mul.f16x2 r9084, r9081, r8972; +} +{ +add.f16x2 r9087, r9078, r9084; +} +{ +sub.f16x2 r9090, r8989, r8990; +} +{ +mul.f16x2 r9093, r9090, r8971; +} +{ +sub.f16x2 r9096, r8995, r8996; +} +{ +mul.f16x2 r9099, r9096, r8974; +} +{ +add.f16x2 r9102, r9093, r9099; +} +{ +sub.f16x2 r9105, r9087, r9102; +} +{ +add.f16x2 r9108, r8977, r8978; +} +{ +mul.f16x2 r9111, r9108, r8970; +} +{ +add.f16x2 r9114, r8980, r9111; +} +{ +add.f16x2 r9117, r8983, r8984; +} +{ +mul.f16x2 r9120, r9117, r8972; +} +{ +add.f16x2 r9123, r9114, r9120; +} +{ +sub.f16x2 r9126, r8989, r8990; +} +{ +mul.f16x2 r9129, r9126, r8971; +} +{ +sub.f16x2 r9132, r8995, r8996; +} +{ +mul.f16x2 r9135, r9132, r8974; +} +{ +add.f16x2 r9138, r9129, r9135; +} +{ +add.f16x2 r9141, r9123, r9138; +} +{ +add.f16x2 r9144, r8989, r8990; +} +{ +mul.f16x2 r9147, r9144, r8968; +} +{ +add.f16x2 r9150, r8992, r9147; +} +{ +add.f16x2 r9153, r8995, r8996; +} +{ +mul.f16x2 r9156, r9153, r8970; +} +{ +add.f16x2 r9159, r9150, r9156; +} +{ +sub.f16x2 r9162, r8977, r8978; +} +{ +mul.f16x2 r9165, r9162, r8969; +} +{ +sub.f16x2 r9168, r8983, r8984; +} +{ +mul.f16x2 r9171, r9168, r8971; +} +{ +add.f16x2 r9174, r9165, r9171; +} +{ +add.f16x2 r9177, r9159, r9174; +} +{ +add.f16x2 r9180, r8989, r8990; +} +{ +mul.f16x2 r9183, r9180, r8968; +} +{ +add.f16x2 r9186, r8992, r9183; +} +{ +add.f16x2 r9189, r8995, r8996; +} +{ +mul.f16x2 r9192, r9189, r8970; +} +{ +add.f16x2 r9195, r9186, r9192; +} +{ +sub.f16x2 r9198, r8977, r8978; +} +{ +mul.f16x2 r9201, r9198, r8969; +} +{ 
+sub.f16x2 r9204, r8983, r8984; +} +{ +mul.f16x2 r9207, r9204, r8971; +} +{ +add.f16x2 r9210, r9201, r9207; +} +{ +sub.f16x2 r9213, r9195, r9210; +} +{ +add.f16x2 r9216, r8989, r8990; +} +{ +mul.f16x2 r9219, r9216, r8970; +} +{ +add.f16x2 r9222, r8992, r9219; +} +{ +add.f16x2 r9225, r8995, r8996; +} +{ +mul.f16x2 r9228, r9225, r8972; +} +{ +add.f16x2 r9231, r9222, r9228; +} +{ +sub.f16x2 r9234, r8977, r8978; +} +{ +mul.f16x2 r9237, r9234, r8971; +} +{ +sub.f16x2 r9240, r8983, r8984; +} +{ +mul.f16x2 r9243, r9240, r8974; +} +{ +add.f16x2 r9246, r9237, r9243; +} +{ +add.f16x2 r9249, r9231, r9246; +} +{ +add.f16x2 r9252, r8989, r8990; +} +{ +mul.f16x2 r9255, r9252, r8970; +} +{ +add.f16x2 r9258, r8992, r9255; +} +{ +add.f16x2 r9261, r8995, r8996; +} +{ +mul.f16x2 r9264, r9261, r8972; +} +{ +add.f16x2 r9267, r9258, r9264; +} +{ +sub.f16x2 r9270, r8977, r8978; +} +{ +mul.f16x2 r9273, r9270, r8971; +} +{ +sub.f16x2 r9276, r8983, r8984; +} +{ +mul.f16x2 r9279, r9276, r8974; +} +{ +add.f16x2 r9282, r9273, r9279; +} +{ +sub.f16x2 r9285, r9267, r9282; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r9288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r9289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r9290, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r9291, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r9292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r9293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r9294, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r9295, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; 
+cvt.rn.f16.f32 high, f82; +mov.b32 r9298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r9299, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r9302, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r9303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r9305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9310, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r9311, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9318, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r9319, {low, high}; +} +{ +mul.f16x2 r9336, r8073, r9288; +} +{ +mul.f16x2 r9339, r8217, r9289; +} +{ +sub.f16x2 r9342, r9336, r9339; +} +{ +mul.f16x2 r9345, r8073, r9289; +} +{ +fma.rn.f16x2 r9348, r8217, r9288, r9345; +} +{ +mul.f16x2 r9352, r8393, r9290; +} +{ +mul.f16x2 r9355, r8537, r9291; +} +{ +sub.f16x2 r9358, r9352, r9355; +} +{ +mul.f16x2 r9361, r8393, r9291; +} +{ +fma.rn.f16x2 r9364, r8537, r9290, r9361; +} +{ +mul.f16x2 r9368, r8713, r9292; +} +{ +mul.f16x2 r9371, r8857, r9293; +} +{ +sub.f16x2 r9374, r9368, r9371; +} +{ +mul.f16x2 r9377, r8713, r9293; +} +{ +fma.rn.f16x2 r9380, r8857, r9292, r9377; +} +{ +mul.f16x2 r9384, r9033, r9294; +} +{ +mul.f16x2 r9387, r9177, r9295; +} +{ +sub.f16x2 r9390, r9384, r9387; +} +{ +mul.f16x2 r9393, r9033, r9295; +} +{ +fma.rn.f16x2 r9396, r9177, r9294, r9393; +} +{ +mul.f16x2 r9400, r8145, r9290; +} +{ +mul.f16x2 r9403, r8289, r9291; +} +{ +sub.f16x2 r9406, r9400, r9403; +} +{ +mul.f16x2 r9409, 
r8145, r9291; +} +{ +fma.rn.f16x2 r9412, r8289, r9290, r9409; +} +{ +mul.f16x2 r9416, r8465, r9294; +} +{ +mul.f16x2 r9419, r8609, r9295; +} +{ +sub.f16x2 r9422, r9416, r9419; +} +{ +mul.f16x2 r9425, r8465, r9295; +} +{ +fma.rn.f16x2 r9428, r8609, r9294, r9425; +} +{ +mul.f16x2 r9432, r8785, r9298; +} +{ +mul.f16x2 r9435, r8929, r9299; +} +{ +sub.f16x2 r9438, r9432, r9435; +} +{ +mul.f16x2 r9441, r8785, r9299; +} +{ +fma.rn.f16x2 r9444, r8929, r9298, r9441; +} +{ +mul.f16x2 r9448, r9105, r9302; +} +{ +mul.f16x2 r9451, r9249, r9303; +} +{ +sub.f16x2 r9454, r9448, r9451; +} +{ +mul.f16x2 r9457, r9105, r9303; +} +{ +fma.rn.f16x2 r9460, r9249, r9302, r9457; +} +{ +mul.f16x2 r9464, r8181, r9292; +} +{ +mul.f16x2 r9467, r8325, r9293; +} +{ +sub.f16x2 r9470, r9464, r9467; +} +{ +mul.f16x2 r9473, r8181, r9293; +} +{ +fma.rn.f16x2 r9476, r8325, r9292, r9473; +} +{ +mul.f16x2 r9480, r8501, r9298; +} +{ +mul.f16x2 r9483, r8645, r9299; +} +{ +sub.f16x2 r9486, r9480, r9483; +} +{ +mul.f16x2 r9489, r8501, r9299; +} +{ +fma.rn.f16x2 r9492, r8645, r9298, r9489; +} +{ +mul.f16x2 r9496, r8821, r9304; +} +{ +mul.f16x2 r9499, r8965, r9305; +} +{ +sub.f16x2 r9502, r9496, r9499; +} +{ +mul.f16x2 r9505, r8821, r9305; +} +{ +fma.rn.f16x2 r9508, r8965, r9304, r9505; +} +{ +mul.f16x2 r9512, r9141, r9310; +} +{ +mul.f16x2 r9515, r9285, r9311; +} +{ +sub.f16x2 r9518, r9512, r9515; +} +{ +mul.f16x2 r9521, r9141, r9311; +} +{ +fma.rn.f16x2 r9524, r9285, r9310, r9521; +} +{ +mul.f16x2 r9528, r8109, r9294; +} +{ +mul.f16x2 r9531, r8253, r9295; +} +{ +sub.f16x2 r9534, r9528, r9531; +} +{ +mul.f16x2 r9537, r8109, r9295; +} +{ +fma.rn.f16x2 r9540, r8253, r9294, r9537; +} +{ +mul.f16x2 r9544, r8429, r9302; +} +{ +mul.f16x2 r9547, r8573, r9303; +} +{ +sub.f16x2 r9550, r9544, r9547; +} +{ +mul.f16x2 r9553, r8429, r9303; +} +{ +fma.rn.f16x2 r9556, r8573, r9302, r9553; +} +{ +mul.f16x2 r9560, r8749, r9310; +} +{ +mul.f16x2 r9563, r8893, r9311; +} +{ +sub.f16x2 r9566, r9560, r9563; +} +{ +mul.f16x2 r9569, 
r8749, r9311; +} +{ +fma.rn.f16x2 r9572, r8893, r9310, r9569; +} +{ +mul.f16x2 r9576, r9069, r9318; +} +{ +mul.f16x2 r9579, r9213, r9319; +} +{ +sub.f16x2 r9582, r9576, r9579; +} +{ +mul.f16x2 r9585, r9069, r9319; +} +{ +fma.rn.f16x2 r9588, r9213, r9318, r9585; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9592, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9593, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9595, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9596, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9597, {low, high}; +} +{ +neg.f16x2 r9598, r9597; +} +{ +add.f16x2 r9600, r8025, r8985; +} +{ +add.f16x2 r9603, r7705, r9600; +} +{ +add.f16x2 r9606, r8345, r8665; +} +{ +add.f16x2 %0, r9603, r9606; +} +{ +add.f16x2 r9612, r8037, r8997; +} +{ +add.f16x2 r9615, r7717, r9612; +} +{ +add.f16x2 r9618, r8357, r8677; +} +{ +add.f16x2 %1, r9615, r9618; +} +{ +add.f16x2 r9624, r8025, r8985; +} +{ +mul.f16x2 r9627, r9624, r9592; +} +{ +add.f16x2 r9630, r7705, r9627; +} +{ +add.f16x2 r9633, r8345, r8665; +} +{ +mul.f16x2 r9636, r9633, r9594; +} +{ +add.f16x2 r9639, r9630, r9636; +} +{ +sub.f16x2 r9642, r8037, r8997; +} +{ +mul.f16x2 r9645, r9642, r9593; +} +{ +sub.f16x2 r9648, r8357, r8677; +} +{ +mul.f16x2 r9651, r9648, r9595; +} +{ +add.f16x2 r9654, r9645, r9651; +} +{ +sub.f16x2 %10, r9639, r9654; +} +{ +add.f16x2 r9660, r8025, r8985; +} +{ +mul.f16x2 r9663, r9660, r9592; +} +{ +add.f16x2 r9666, r7705, r9663; +} +{ +add.f16x2 r9669, r8345, r8665; +} +{ +mul.f16x2 r9672, r9669, r9594; +} +{ +add.f16x2 r9675, r9666, r9672; +} +{ +sub.f16x2 r9678, r8037, r8997; +} +{ +mul.f16x2 
r9681, r9678, r9593; +} +{ +sub.f16x2 r9684, r8357, r8677; +} +{ +mul.f16x2 r9687, r9684, r9595; +} +{ +add.f16x2 r9690, r9681, r9687; +} +{ +add.f16x2 %40, r9675, r9690; +} +{ +add.f16x2 r9696, r8025, r8985; +} +{ +mul.f16x2 r9699, r9696, r9594; +} +{ +add.f16x2 r9702, r7705, r9699; +} +{ +add.f16x2 r9705, r8345, r8665; +} +{ +mul.f16x2 r9708, r9705, r9596; +} +{ +add.f16x2 r9711, r9702, r9708; +} +{ +sub.f16x2 r9714, r8037, r8997; +} +{ +mul.f16x2 r9717, r9714, r9595; +} +{ +sub.f16x2 r9720, r8357, r8677; +} +{ +mul.f16x2 r9723, r9720, r9598; +} +{ +add.f16x2 r9726, r9717, r9723; +} +{ +sub.f16x2 %20, r9711, r9726; +} +{ +add.f16x2 r9732, r8025, r8985; +} +{ +mul.f16x2 r9735, r9732, r9594; +} +{ +add.f16x2 r9738, r7705, r9735; +} +{ +add.f16x2 r9741, r8345, r8665; +} +{ +mul.f16x2 r9744, r9741, r9596; +} +{ +add.f16x2 r9747, r9738, r9744; +} +{ +sub.f16x2 r9750, r8037, r8997; +} +{ +mul.f16x2 r9753, r9750, r9595; +} +{ +sub.f16x2 r9756, r8357, r8677; +} +{ +mul.f16x2 r9759, r9756, r9598; +} +{ +add.f16x2 r9762, r9753, r9759; +} +{ +add.f16x2 %30, r9747, r9762; +} +{ +add.f16x2 r9768, r8037, r8997; +} +{ +mul.f16x2 r9771, r9768, r9592; +} +{ +add.f16x2 r9774, r7717, r9771; +} +{ +add.f16x2 r9777, r8357, r8677; +} +{ +mul.f16x2 r9780, r9777, r9594; +} +{ +add.f16x2 r9783, r9774, r9780; +} +{ +sub.f16x2 r9786, r8025, r8985; +} +{ +mul.f16x2 r9789, r9786, r9593; +} +{ +sub.f16x2 r9792, r8345, r8665; +} +{ +mul.f16x2 r9795, r9792, r9595; +} +{ +add.f16x2 r9798, r9789, r9795; +} +{ +add.f16x2 %11, r9783, r9798; +} +{ +add.f16x2 r9804, r8037, r8997; +} +{ +mul.f16x2 r9807, r9804, r9592; +} +{ +add.f16x2 r9810, r7717, r9807; +} +{ +add.f16x2 r9813, r8357, r8677; +} +{ +mul.f16x2 r9816, r9813, r9594; +} +{ +add.f16x2 r9819, r9810, r9816; +} +{ +sub.f16x2 r9822, r8025, r8985; +} +{ +mul.f16x2 r9825, r9822, r9593; +} +{ +sub.f16x2 r9828, r8345, r8665; +} +{ +mul.f16x2 r9831, r9828, r9595; +} +{ +add.f16x2 r9834, r9825, r9831; +} +{ +sub.f16x2 %41, r9819, r9834; +} +{ 
+add.f16x2 r9840, r8037, r8997; +} +{ +mul.f16x2 r9843, r9840, r9594; +} +{ +add.f16x2 r9846, r7717, r9843; +} +{ +add.f16x2 r9849, r8357, r8677; +} +{ +mul.f16x2 r9852, r9849, r9596; +} +{ +add.f16x2 r9855, r9846, r9852; +} +{ +sub.f16x2 r9858, r8025, r8985; +} +{ +mul.f16x2 r9861, r9858, r9595; +} +{ +sub.f16x2 r9864, r8345, r8665; +} +{ +mul.f16x2 r9867, r9864, r9598; +} +{ +add.f16x2 r9870, r9861, r9867; +} +{ +add.f16x2 %21, r9855, r9870; +} +{ +add.f16x2 r9876, r8037, r8997; +} +{ +mul.f16x2 r9879, r9876, r9594; +} +{ +add.f16x2 r9882, r7717, r9879; +} +{ +add.f16x2 r9885, r8357, r8677; +} +{ +mul.f16x2 r9888, r9885, r9596; +} +{ +add.f16x2 r9891, r9882, r9888; +} +{ +sub.f16x2 r9894, r8025, r8985; +} +{ +mul.f16x2 r9897, r9894, r9595; +} +{ +sub.f16x2 r9900, r8345, r8665; +} +{ +mul.f16x2 r9903, r9900, r9598; +} +{ +add.f16x2 r9906, r9897, r9903; +} +{ +sub.f16x2 %31, r9891, r9906; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9912, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9916, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9917, {low, high}; +} +{ +neg.f16x2 r9918, r9917; +} +{ +add.f16x2 r9920, r9342, r9390; +} +{ +add.f16x2 r9923, r7753, r9920; +} +{ +add.f16x2 r9926, r9358, r9374; +} +{ +add.f16x2 %2, r9923, r9926; +} +{ +add.f16x2 r9932, r9348, r9396; +} +{ +add.f16x2 r9935, r7897, r9932; +} +{ +add.f16x2 r9938, r9364, r9380; +} +{ +add.f16x2 %3, r9935, r9938; +} +{ +add.f16x2 r9944, r9342, r9390; +} +{ +mul.f16x2 r9947, r9944, r9912; +} +{ +add.f16x2 
r9950, r7753, r9947; +} +{ +add.f16x2 r9953, r9358, r9374; +} +{ +mul.f16x2 r9956, r9953, r9914; +} +{ +add.f16x2 r9959, r9950, r9956; +} +{ +sub.f16x2 r9962, r9348, r9396; +} +{ +mul.f16x2 r9965, r9962, r9913; +} +{ +sub.f16x2 r9968, r9364, r9380; +} +{ +mul.f16x2 r9971, r9968, r9915; +} +{ +add.f16x2 r9974, r9965, r9971; +} +{ +sub.f16x2 %12, r9959, r9974; +} +{ +add.f16x2 r9980, r9342, r9390; +} +{ +mul.f16x2 r9983, r9980, r9912; +} +{ +add.f16x2 r9986, r7753, r9983; +} +{ +add.f16x2 r9989, r9358, r9374; +} +{ +mul.f16x2 r9992, r9989, r9914; +} +{ +add.f16x2 r9995, r9986, r9992; +} +{ +sub.f16x2 r9998, r9348, r9396; +} +{ +mul.f16x2 r10001, r9998, r9913; +} +{ +sub.f16x2 r10004, r9364, r9380; +} +{ +mul.f16x2 r10007, r10004, r9915; +} +{ +add.f16x2 r10010, r10001, r10007; +} +{ +add.f16x2 %42, r9995, r10010; +} +{ +add.f16x2 r10016, r9342, r9390; +} +{ +mul.f16x2 r10019, r10016, r9914; +} +{ +add.f16x2 r10022, r7753, r10019; +} +{ +add.f16x2 r10025, r9358, r9374; +} +{ +mul.f16x2 r10028, r10025, r9916; +} +{ +add.f16x2 r10031, r10022, r10028; +} +{ +sub.f16x2 r10034, r9348, r9396; +} +{ +mul.f16x2 r10037, r10034, r9915; +} +{ +sub.f16x2 r10040, r9364, r9380; +} +{ +mul.f16x2 r10043, r10040, r9918; +} +{ +add.f16x2 r10046, r10037, r10043; +} +{ +sub.f16x2 %22, r10031, r10046; +} +{ +add.f16x2 r10052, r9342, r9390; +} +{ +mul.f16x2 r10055, r10052, r9914; +} +{ +add.f16x2 r10058, r7753, r10055; +} +{ +add.f16x2 r10061, r9358, r9374; +} +{ +mul.f16x2 r10064, r10061, r9916; +} +{ +add.f16x2 r10067, r10058, r10064; +} +{ +sub.f16x2 r10070, r9348, r9396; +} +{ +mul.f16x2 r10073, r10070, r9915; +} +{ +sub.f16x2 r10076, r9364, r9380; +} +{ +mul.f16x2 r10079, r10076, r9918; +} +{ +add.f16x2 r10082, r10073, r10079; +} +{ +add.f16x2 %32, r10067, r10082; +} +{ +add.f16x2 r10088, r9348, r9396; +} +{ +mul.f16x2 r10091, r10088, r9912; +} +{ +add.f16x2 r10094, r7897, r10091; +} +{ +add.f16x2 r10097, r9364, r9380; +} +{ +mul.f16x2 r10100, r10097, r9914; +} +{ +add.f16x2 r10103, 
r10094, r10100; +} +{ +sub.f16x2 r10106, r9342, r9390; +} +{ +mul.f16x2 r10109, r10106, r9913; +} +{ +sub.f16x2 r10112, r9358, r9374; +} +{ +mul.f16x2 r10115, r10112, r9915; +} +{ +add.f16x2 r10118, r10109, r10115; +} +{ +add.f16x2 %13, r10103, r10118; +} +{ +add.f16x2 r10124, r9348, r9396; +} +{ +mul.f16x2 r10127, r10124, r9912; +} +{ +add.f16x2 r10130, r7897, r10127; +} +{ +add.f16x2 r10133, r9364, r9380; +} +{ +mul.f16x2 r10136, r10133, r9914; +} +{ +add.f16x2 r10139, r10130, r10136; +} +{ +sub.f16x2 r10142, r9342, r9390; +} +{ +mul.f16x2 r10145, r10142, r9913; +} +{ +sub.f16x2 r10148, r9358, r9374; +} +{ +mul.f16x2 r10151, r10148, r9915; +} +{ +add.f16x2 r10154, r10145, r10151; +} +{ +sub.f16x2 %43, r10139, r10154; +} +{ +add.f16x2 r10160, r9348, r9396; +} +{ +mul.f16x2 r10163, r10160, r9914; +} +{ +add.f16x2 r10166, r7897, r10163; +} +{ +add.f16x2 r10169, r9364, r9380; +} +{ +mul.f16x2 r10172, r10169, r9916; +} +{ +add.f16x2 r10175, r10166, r10172; +} +{ +sub.f16x2 r10178, r9342, r9390; +} +{ +mul.f16x2 r10181, r10178, r9915; +} +{ +sub.f16x2 r10184, r9358, r9374; +} +{ +mul.f16x2 r10187, r10184, r9918; +} +{ +add.f16x2 r10190, r10181, r10187; +} +{ +add.f16x2 %23, r10175, r10190; +} +{ +add.f16x2 r10196, r9348, r9396; +} +{ +mul.f16x2 r10199, r10196, r9914; +} +{ +add.f16x2 r10202, r7897, r10199; +} +{ +add.f16x2 r10205, r9364, r9380; +} +{ +mul.f16x2 r10208, r10205, r9916; +} +{ +add.f16x2 r10211, r10202, r10208; +} +{ +sub.f16x2 r10214, r9342, r9390; +} +{ +mul.f16x2 r10217, r10214, r9915; +} +{ +sub.f16x2 r10220, r9358, r9374; +} +{ +mul.f16x2 r10223, r10220, r9918; +} +{ +add.f16x2 r10226, r10217, r10223; +} +{ +sub.f16x2 %33, r10211, r10226; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10232, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10233, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 
r10234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10236, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10237, {low, high}; +} +{ +neg.f16x2 r10238, r10237; +} +{ +add.f16x2 r10240, r9406, r9454; +} +{ +add.f16x2 r10243, r7825, r10240; +} +{ +add.f16x2 r10246, r9422, r9438; +} +{ +add.f16x2 %4, r10243, r10246; +} +{ +add.f16x2 r10252, r9412, r9460; +} +{ +add.f16x2 r10255, r7969, r10252; +} +{ +add.f16x2 r10258, r9428, r9444; +} +{ +add.f16x2 %5, r10255, r10258; +} +{ +add.f16x2 r10264, r9406, r9454; +} +{ +mul.f16x2 r10267, r10264, r10232; +} +{ +add.f16x2 r10270, r7825, r10267; +} +{ +add.f16x2 r10273, r9422, r9438; +} +{ +mul.f16x2 r10276, r10273, r10234; +} +{ +add.f16x2 r10279, r10270, r10276; +} +{ +sub.f16x2 r10282, r9412, r9460; +} +{ +mul.f16x2 r10285, r10282, r10233; +} +{ +sub.f16x2 r10288, r9428, r9444; +} +{ +mul.f16x2 r10291, r10288, r10235; +} +{ +add.f16x2 r10294, r10285, r10291; +} +{ +sub.f16x2 %14, r10279, r10294; +} +{ +add.f16x2 r10300, r9406, r9454; +} +{ +mul.f16x2 r10303, r10300, r10232; +} +{ +add.f16x2 r10306, r7825, r10303; +} +{ +add.f16x2 r10309, r9422, r9438; +} +{ +mul.f16x2 r10312, r10309, r10234; +} +{ +add.f16x2 r10315, r10306, r10312; +} +{ +sub.f16x2 r10318, r9412, r9460; +} +{ +mul.f16x2 r10321, r10318, r10233; +} +{ +sub.f16x2 r10324, r9428, r9444; +} +{ +mul.f16x2 r10327, r10324, r10235; +} +{ +add.f16x2 r10330, r10321, r10327; +} +{ +add.f16x2 %44, r10315, r10330; +} +{ +add.f16x2 r10336, r9406, r9454; +} +{ +mul.f16x2 r10339, r10336, r10234; +} +{ +add.f16x2 r10342, r7825, r10339; +} +{ +add.f16x2 r10345, r9422, r9438; +} +{ +mul.f16x2 r10348, r10345, r10236; +} +{ +add.f16x2 r10351, r10342, r10348; +} +{ +sub.f16x2 r10354, r9412, r9460; +} +{ +mul.f16x2 r10357, r10354, r10235; +} +{ 
+sub.f16x2 r10360, r9428, r9444; +} +{ +mul.f16x2 r10363, r10360, r10238; +} +{ +add.f16x2 r10366, r10357, r10363; +} +{ +sub.f16x2 %24, r10351, r10366; +} +{ +add.f16x2 r10372, r9406, r9454; +} +{ +mul.f16x2 r10375, r10372, r10234; +} +{ +add.f16x2 r10378, r7825, r10375; +} +{ +add.f16x2 r10381, r9422, r9438; +} +{ +mul.f16x2 r10384, r10381, r10236; +} +{ +add.f16x2 r10387, r10378, r10384; +} +{ +sub.f16x2 r10390, r9412, r9460; +} +{ +mul.f16x2 r10393, r10390, r10235; +} +{ +sub.f16x2 r10396, r9428, r9444; +} +{ +mul.f16x2 r10399, r10396, r10238; +} +{ +add.f16x2 r10402, r10393, r10399; +} +{ +add.f16x2 %34, r10387, r10402; +} +{ +add.f16x2 r10408, r9412, r9460; +} +{ +mul.f16x2 r10411, r10408, r10232; +} +{ +add.f16x2 r10414, r7969, r10411; +} +{ +add.f16x2 r10417, r9428, r9444; +} +{ +mul.f16x2 r10420, r10417, r10234; +} +{ +add.f16x2 r10423, r10414, r10420; +} +{ +sub.f16x2 r10426, r9406, r9454; +} +{ +mul.f16x2 r10429, r10426, r10233; +} +{ +sub.f16x2 r10432, r9422, r9438; +} +{ +mul.f16x2 r10435, r10432, r10235; +} +{ +add.f16x2 r10438, r10429, r10435; +} +{ +add.f16x2 %15, r10423, r10438; +} +{ +add.f16x2 r10444, r9412, r9460; +} +{ +mul.f16x2 r10447, r10444, r10232; +} +{ +add.f16x2 r10450, r7969, r10447; +} +{ +add.f16x2 r10453, r9428, r9444; +} +{ +mul.f16x2 r10456, r10453, r10234; +} +{ +add.f16x2 r10459, r10450, r10456; +} +{ +sub.f16x2 r10462, r9406, r9454; +} +{ +mul.f16x2 r10465, r10462, r10233; +} +{ +sub.f16x2 r10468, r9422, r9438; +} +{ +mul.f16x2 r10471, r10468, r10235; +} +{ +add.f16x2 r10474, r10465, r10471; +} +{ +sub.f16x2 %45, r10459, r10474; +} +{ +add.f16x2 r10480, r9412, r9460; +} +{ +mul.f16x2 r10483, r10480, r10234; +} +{ +add.f16x2 r10486, r7969, r10483; +} +{ +add.f16x2 r10489, r9428, r9444; +} +{ +mul.f16x2 r10492, r10489, r10236; +} +{ +add.f16x2 r10495, r10486, r10492; +} +{ +sub.f16x2 r10498, r9406, r9454; +} +{ +mul.f16x2 r10501, r10498, r10235; +} +{ +sub.f16x2 r10504, r9422, r9438; +} +{ +mul.f16x2 r10507, r10504, r10238; +} +{ 
+add.f16x2 r10510, r10501, r10507; +} +{ +add.f16x2 %25, r10495, r10510; +} +{ +add.f16x2 r10516, r9412, r9460; +} +{ +mul.f16x2 r10519, r10516, r10234; +} +{ +add.f16x2 r10522, r7969, r10519; +} +{ +add.f16x2 r10525, r9428, r9444; +} +{ +mul.f16x2 r10528, r10525, r10236; +} +{ +add.f16x2 r10531, r10522, r10528; +} +{ +sub.f16x2 r10534, r9406, r9454; +} +{ +mul.f16x2 r10537, r10534, r10235; +} +{ +sub.f16x2 r10540, r9422, r9438; +} +{ +mul.f16x2 r10543, r10540, r10238; +} +{ +add.f16x2 r10546, r10537, r10543; +} +{ +sub.f16x2 %35, r10531, r10546; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10552, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10553, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10554, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10555, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10556, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10557, {low, high}; +} +{ +neg.f16x2 r10558, r10557; +} +{ +add.f16x2 r10560, r9470, r9518; +} +{ +add.f16x2 r10563, r7861, r10560; +} +{ +add.f16x2 r10566, r9486, r9502; +} +{ +add.f16x2 %6, r10563, r10566; +} +{ +add.f16x2 r10572, r9476, r9524; +} +{ +add.f16x2 r10575, r8005, r10572; +} +{ +add.f16x2 r10578, r9492, r9508; +} +{ +add.f16x2 %7, r10575, r10578; +} +{ +add.f16x2 r10584, r9470, r9518; +} +{ +mul.f16x2 r10587, r10584, r10552; +} +{ +add.f16x2 r10590, r7861, r10587; +} +{ +add.f16x2 r10593, r9486, r9502; +} +{ +mul.f16x2 r10596, r10593, r10554; +} +{ +add.f16x2 r10599, r10590, r10596; +} +{ +sub.f16x2 r10602, r9476, r9524; +} +{ +mul.f16x2 r10605, r10602, r10553; +} +{ +sub.f16x2 r10608, r9492, r9508; +} +{ +mul.f16x2 r10611, r10608, r10555; +} +{ +add.f16x2 r10614, 
r10605, r10611; +} +{ +sub.f16x2 %16, r10599, r10614; +} +{ +add.f16x2 r10620, r9470, r9518; +} +{ +mul.f16x2 r10623, r10620, r10552; +} +{ +add.f16x2 r10626, r7861, r10623; +} +{ +add.f16x2 r10629, r9486, r9502; +} +{ +mul.f16x2 r10632, r10629, r10554; +} +{ +add.f16x2 r10635, r10626, r10632; +} +{ +sub.f16x2 r10638, r9476, r9524; +} +{ +mul.f16x2 r10641, r10638, r10553; +} +{ +sub.f16x2 r10644, r9492, r9508; +} +{ +mul.f16x2 r10647, r10644, r10555; +} +{ +add.f16x2 r10650, r10641, r10647; +} +{ +add.f16x2 %46, r10635, r10650; +} +{ +add.f16x2 r10656, r9470, r9518; +} +{ +mul.f16x2 r10659, r10656, r10554; +} +{ +add.f16x2 r10662, r7861, r10659; +} +{ +add.f16x2 r10665, r9486, r9502; +} +{ +mul.f16x2 r10668, r10665, r10556; +} +{ +add.f16x2 r10671, r10662, r10668; +} +{ +sub.f16x2 r10674, r9476, r9524; +} +{ +mul.f16x2 r10677, r10674, r10555; +} +{ +sub.f16x2 r10680, r9492, r9508; +} +{ +mul.f16x2 r10683, r10680, r10558; +} +{ +add.f16x2 r10686, r10677, r10683; +} +{ +sub.f16x2 %26, r10671, r10686; +} +{ +add.f16x2 r10692, r9470, r9518; +} +{ +mul.f16x2 r10695, r10692, r10554; +} +{ +add.f16x2 r10698, r7861, r10695; +} +{ +add.f16x2 r10701, r9486, r9502; +} +{ +mul.f16x2 r10704, r10701, r10556; +} +{ +add.f16x2 r10707, r10698, r10704; +} +{ +sub.f16x2 r10710, r9476, r9524; +} +{ +mul.f16x2 r10713, r10710, r10555; +} +{ +sub.f16x2 r10716, r9492, r9508; +} +{ +mul.f16x2 r10719, r10716, r10558; +} +{ +add.f16x2 r10722, r10713, r10719; +} +{ +add.f16x2 %36, r10707, r10722; +} +{ +add.f16x2 r10728, r9476, r9524; +} +{ +mul.f16x2 r10731, r10728, r10552; +} +{ +add.f16x2 r10734, r8005, r10731; +} +{ +add.f16x2 r10737, r9492, r9508; +} +{ +mul.f16x2 r10740, r10737, r10554; +} +{ +add.f16x2 r10743, r10734, r10740; +} +{ +sub.f16x2 r10746, r9470, r9518; +} +{ +mul.f16x2 r10749, r10746, r10553; +} +{ +sub.f16x2 r10752, r9486, r9502; +} +{ +mul.f16x2 r10755, r10752, r10555; +} +{ +add.f16x2 r10758, r10749, r10755; +} +{ +add.f16x2 %17, r10743, r10758; +} +{ +add.f16x2 r10764, 
r9476, r9524; +} +{ +mul.f16x2 r10767, r10764, r10552; +} +{ +add.f16x2 r10770, r8005, r10767; +} +{ +add.f16x2 r10773, r9492, r9508; +} +{ +mul.f16x2 r10776, r10773, r10554; +} +{ +add.f16x2 r10779, r10770, r10776; +} +{ +sub.f16x2 r10782, r9470, r9518; +} +{ +mul.f16x2 r10785, r10782, r10553; +} +{ +sub.f16x2 r10788, r9486, r9502; +} +{ +mul.f16x2 r10791, r10788, r10555; +} +{ +add.f16x2 r10794, r10785, r10791; +} +{ +sub.f16x2 %47, r10779, r10794; +} +{ +add.f16x2 r10800, r9476, r9524; +} +{ +mul.f16x2 r10803, r10800, r10554; +} +{ +add.f16x2 r10806, r8005, r10803; +} +{ +add.f16x2 r10809, r9492, r9508; +} +{ +mul.f16x2 r10812, r10809, r10556; +} +{ +add.f16x2 r10815, r10806, r10812; +} +{ +sub.f16x2 r10818, r9470, r9518; +} +{ +mul.f16x2 r10821, r10818, r10555; +} +{ +sub.f16x2 r10824, r9486, r9502; +} +{ +mul.f16x2 r10827, r10824, r10558; +} +{ +add.f16x2 r10830, r10821, r10827; +} +{ +add.f16x2 %27, r10815, r10830; +} +{ +add.f16x2 r10836, r9476, r9524; +} +{ +mul.f16x2 r10839, r10836, r10554; +} +{ +add.f16x2 r10842, r8005, r10839; +} +{ +add.f16x2 r10845, r9492, r9508; +} +{ +mul.f16x2 r10848, r10845, r10556; +} +{ +add.f16x2 r10851, r10842, r10848; +} +{ +sub.f16x2 r10854, r9470, r9518; +} +{ +mul.f16x2 r10857, r10854, r10555; +} +{ +sub.f16x2 r10860, r9486, r9502; +} +{ +mul.f16x2 r10863, r10860, r10558; +} +{ +add.f16x2 r10866, r10857, r10863; +} +{ +sub.f16x2 %37, r10851, r10866; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10876, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10877, {low, high}; +} +{ +neg.f16x2 r10878, r10877; +} +{ +add.f16x2 r10880, r9534, r9582; +} +{ +add.f16x2 r10883, r7789, r10880; +} +{ +add.f16x2 r10886, r9550, r9566; +} +{ +add.f16x2 %8, r10883, r10886; +} +{ +add.f16x2 r10892, r9540, r9588; +} +{ +add.f16x2 r10895, r7933, r10892; +} +{ +add.f16x2 r10898, r9556, r9572; +} +{ +add.f16x2 %9, r10895, r10898; +} +{ +add.f16x2 r10904, r9534, r9582; +} +{ +mul.f16x2 r10907, r10904, r10872; +} +{ +add.f16x2 r10910, r7789, r10907; +} +{ +add.f16x2 r10913, r9550, r9566; +} +{ +mul.f16x2 r10916, r10913, r10874; +} +{ +add.f16x2 r10919, r10910, r10916; +} +{ +sub.f16x2 r10922, r9540, r9588; +} +{ +mul.f16x2 r10925, r10922, r10873; +} +{ +sub.f16x2 r10928, r9556, r9572; +} +{ +mul.f16x2 r10931, r10928, r10875; +} +{ +add.f16x2 r10934, r10925, r10931; +} +{ +sub.f16x2 %18, r10919, r10934; +} +{ +add.f16x2 r10940, r9534, r9582; +} +{ +mul.f16x2 r10943, r10940, r10872; +} +{ +add.f16x2 r10946, r7789, r10943; +} +{ +add.f16x2 r10949, r9550, r9566; +} +{ +mul.f16x2 r10952, r10949, r10874; +} +{ +add.f16x2 r10955, r10946, r10952; +} +{ +sub.f16x2 r10958, r9540, r9588; +} +{ +mul.f16x2 r10961, r10958, r10873; +} +{ +sub.f16x2 r10964, r9556, r9572; +} +{ +mul.f16x2 r10967, r10964, r10875; +} +{ +add.f16x2 r10970, r10961, r10967; +} +{ +add.f16x2 %48, r10955, r10970; +} +{ +add.f16x2 r10976, r9534, r9582; +} +{ +mul.f16x2 r10979, r10976, r10874; +} +{ +add.f16x2 r10982, r7789, r10979; +} +{ +add.f16x2 r10985, r9550, r9566; +} +{ +mul.f16x2 r10988, r10985, r10876; +} +{ +add.f16x2 r10991, r10982, r10988; +} +{ +sub.f16x2 r10994, r9540, r9588; +} +{ +mul.f16x2 r10997, r10994, r10875; +} +{ +sub.f16x2 r11000, r9556, r9572; +} +{ +mul.f16x2 r11003, r11000, r10878; +} +{ +add.f16x2 r11006, r10997, r11003; +} +{ +sub.f16x2 %28, r10991, r11006; +} +{ +add.f16x2 r11012, r9534, r9582; +} +{ +mul.f16x2 r11015, r11012, r10874; +} +{ +add.f16x2 r11018, r7789, r11015; 
+} +{ +add.f16x2 r11021, r9550, r9566; +} +{ +mul.f16x2 r11024, r11021, r10876; +} +{ +add.f16x2 r11027, r11018, r11024; +} +{ +sub.f16x2 r11030, r9540, r9588; +} +{ +mul.f16x2 r11033, r11030, r10875; +} +{ +sub.f16x2 r11036, r9556, r9572; +} +{ +mul.f16x2 r11039, r11036, r10878; +} +{ +add.f16x2 r11042, r11033, r11039; +} +{ +add.f16x2 %38, r11027, r11042; +} +{ +add.f16x2 r11048, r9540, r9588; +} +{ +mul.f16x2 r11051, r11048, r10872; +} +{ +add.f16x2 r11054, r7933, r11051; +} +{ +add.f16x2 r11057, r9556, r9572; +} +{ +mul.f16x2 r11060, r11057, r10874; +} +{ +add.f16x2 r11063, r11054, r11060; +} +{ +sub.f16x2 r11066, r9534, r9582; +} +{ +mul.f16x2 r11069, r11066, r10873; +} +{ +sub.f16x2 r11072, r9550, r9566; +} +{ +mul.f16x2 r11075, r11072, r10875; +} +{ +add.f16x2 r11078, r11069, r11075; +} +{ +add.f16x2 %19, r11063, r11078; +} +{ +add.f16x2 r11084, r9540, r9588; +} +{ +mul.f16x2 r11087, r11084, r10872; +} +{ +add.f16x2 r11090, r7933, r11087; +} +{ +add.f16x2 r11093, r9556, r9572; +} +{ +mul.f16x2 r11096, r11093, r10874; +} +{ +add.f16x2 r11099, r11090, r11096; +} +{ +sub.f16x2 r11102, r9534, r9582; +} +{ +mul.f16x2 r11105, r11102, r10873; +} +{ +sub.f16x2 r11108, r9550, r9566; +} +{ +mul.f16x2 r11111, r11108, r10875; +} +{ +add.f16x2 r11114, r11105, r11111; +} +{ +sub.f16x2 %49, r11099, r11114; +} +{ +add.f16x2 r11120, r9540, r9588; +} +{ +mul.f16x2 r11123, r11120, r10874; +} +{ +add.f16x2 r11126, r7933, r11123; +} +{ +add.f16x2 r11129, r9556, r9572; +} +{ +mul.f16x2 r11132, r11129, r10876; +} +{ +add.f16x2 r11135, r11126, r11132; +} +{ +sub.f16x2 r11138, r9534, r9582; +} +{ +mul.f16x2 r11141, r11138, r10875; +} +{ +sub.f16x2 r11144, r9550, r9566; +} +{ +mul.f16x2 r11147, r11144, r10878; +} +{ +add.f16x2 r11150, r11141, r11147; +} +{ +add.f16x2 %29, r11135, r11150; +} +{ +add.f16x2 r11156, r9540, r9588; +} +{ +mul.f16x2 r11159, r11156, r10874; +} +{ +add.f16x2 r11162, r7933, r11159; +} +{ +add.f16x2 r11165, r9556, r9572; +} +{ +mul.f16x2 r11168, r11165, r10876; 
+} +{ +add.f16x2 r11171, r11162, r11168; +} +{ +sub.f16x2 r11174, r9534, r9582; +} +{ +mul.f16x2 r11177, r11174, r10875; +} +{ +sub.f16x2 r11180, r9550, r9566; +} +{ +mul.f16x2 r11183, r11180, r10878; +} +{ +add.f16x2 r11186, r11177, r11183; +} +{ +sub.f16x2 %39, r11171, r11186; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..bd40a66a9114f --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp16_inv.hpp.inc @@ -0,0 +1,25898 @@ +#ifndef CUFFTDX_FFT_15625_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_15625_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1185, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.local .align 8 .b8 __local_depot0[200]; +.reg .b64 SP; +.reg .b64 SPL; +.reg .pred p<3>; +.reg .f32 f<695>; +.reg .b32 r<11414>; +.reg .b64 rd<21>; +mov.u64 SPL, __local_depot0; +add.u64 rd3, SPL, 0; +mov.u32 r3551, %tid.y; +mul.lo.s32 r1, r3551, 15625; +add.s64 rd4, rd3, 4; +mov.f32 f214, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r23, {low, high}; +} +mov.f32 f216, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r24, {low, high}; +} +{ +neg.f16x2 r25, r24; +} +mov.f32 f210, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r27, {low, high}; +} +mov.f32 f212, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r28, {low, high}; +} +{ +neg.f16x2 r29, r28; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r31, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r32, {low, high}; +} +{ +add.f16x2 r33, %61, %91; +} +{ +add.f16x2 r36, %51, r33; +} +{ +add.f16x2 r39, %71, %81; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %62, %92; +} +{ +add.f16x2 r48, %52, r45; +} +{ +add.f16x2 r51, %72, %82; +} +{ +add.f16x2 r54, r48, r51; +} +{ +add.f16x2 r57, %61, %91; +} +{ +mul.f16x2 r60, r57, r23; +} +{ +add.f16x2 r63, %51, r60; +} +{ +add.f16x2 r66, %71, %81; +} +{ +mul.f16x2 r69, r66, r27; +} +{ +add.f16x2 r72, r63, r69; +} +{ +sub.f16x2 r75, %62, %92; +} +{ 
+mul.f16x2 r78, r75, r25; +} +{ +sub.f16x2 r81, %72, %82; +} +{ +mul.f16x2 r84, r81, r29; +} +{ +add.f16x2 r87, r78, r84; +} +{ +sub.f16x2 r90, r72, r87; +} +{ +add.f16x2 r93, %61, %91; +} +{ +mul.f16x2 r96, r93, r23; +} +{ +add.f16x2 r99, %51, r96; +} +{ +add.f16x2 r102, %71, %81; +} +{ +mul.f16x2 r105, r102, r27; +} +{ +add.f16x2 r108, r99, r105; +} +{ +sub.f16x2 r111, %62, %92; +} +{ +mul.f16x2 r114, r111, r25; +} +{ +sub.f16x2 r117, %72, %82; +} +{ +mul.f16x2 r120, r117, r29; +} +{ +add.f16x2 r123, r114, r120; +} +{ +add.f16x2 r126, r108, r123; +} +{ +add.f16x2 r129, %61, %91; +} +{ +mul.f16x2 r132, r129, r27; +} +{ +add.f16x2 r135, %51, r132; +} +{ +add.f16x2 r138, %71, %81; +} +{ +mul.f16x2 r141, r138, r31; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %62, %92; +} +{ +mul.f16x2 r150, r147, r29; +} +{ +sub.f16x2 r153, %72, %82; +} +{ +mul.f16x2 r156, r153, r32; +} +{ +add.f16x2 r159, r150, r156; +} +{ +sub.f16x2 r162, r144, r159; +} +{ +add.f16x2 r165, %61, %91; +} +{ +mul.f16x2 r168, r165, r27; +} +{ +add.f16x2 r171, %51, r168; +} +{ +add.f16x2 r174, %71, %81; +} +{ +mul.f16x2 r177, r174, r31; +} +{ +add.f16x2 r180, r171, r177; +} +{ +sub.f16x2 r183, %62, %92; +} +{ +mul.f16x2 r186, r183, r29; +} +{ +sub.f16x2 r189, %72, %82; +} +{ +mul.f16x2 r192, r189, r32; +} +{ +add.f16x2 r195, r186, r192; +} +{ +add.f16x2 r198, r180, r195; +} +{ +add.f16x2 r201, %62, %92; +} +{ +mul.f16x2 r204, r201, r23; +} +{ +add.f16x2 r207, %52, r204; +} +{ +add.f16x2 r210, %72, %82; +} +{ +mul.f16x2 r213, r210, r27; +} +{ +add.f16x2 r216, r207, r213; +} +{ +sub.f16x2 r219, %61, %91; +} +{ +mul.f16x2 r222, r219, r25; +} +{ +sub.f16x2 r225, %71, %81; +} +{ +mul.f16x2 r228, r225, r29; +} +{ +add.f16x2 r231, r222, r228; +} +{ +add.f16x2 r234, r216, r231; +} +{ +add.f16x2 r237, %62, %92; +} +{ +mul.f16x2 r240, r237, r23; +} +{ +add.f16x2 r243, %52, r240; +} +{ +add.f16x2 r246, %72, %82; +} +{ +mul.f16x2 r249, r246, r27; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 
r255, %61, %91; +} +{ +mul.f16x2 r258, r255, r25; +} +{ +sub.f16x2 r261, %71, %81; +} +{ +mul.f16x2 r264, r261, r29; +} +{ +add.f16x2 r267, r258, r264; +} +{ +sub.f16x2 r270, r252, r267; +} +{ +add.f16x2 r273, %62, %92; +} +{ +mul.f16x2 r276, r273, r27; +} +{ +add.f16x2 r279, %52, r276; +} +{ +add.f16x2 r282, %72, %82; +} +{ +mul.f16x2 r285, r282, r31; +} +{ +add.f16x2 r288, r279, r285; +} +{ +sub.f16x2 r291, %61, %91; +} +{ +mul.f16x2 r294, r291, r29; +} +{ +sub.f16x2 r297, %71, %81; +} +{ +mul.f16x2 r300, r297, r32; +} +{ +add.f16x2 r303, r294, r300; +} +{ +add.f16x2 r306, r288, r303; +} +{ +add.f16x2 r309, %62, %92; +} +{ +mul.f16x2 r312, r309, r27; +} +{ +add.f16x2 r315, %52, r312; +} +{ +add.f16x2 r318, %72, %82; +} +{ +mul.f16x2 r321, r318, r31; +} +{ +add.f16x2 r324, r315, r321; +} +{ +sub.f16x2 r327, %61, %91; +} +{ +mul.f16x2 r330, r327, r29; +} +{ +sub.f16x2 r333, %71, %81; +} +{ +mul.f16x2 r336, r333, r32; +} +{ +add.f16x2 r339, r330, r336; +} +{ +sub.f16x2 r342, r324, r339; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r349, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r350, {low, high}; +} +{ +neg.f16x2 r351, r350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r354, {low, high}; +} +{ +add.f16x2 r355, %63, %93; +} +{ +add.f16x2 r358, %53, r355; +} +{ +add.f16x2 r361, %73, %83; +} +{ +add.f16x2 r364, r358, r361; +} +{ +add.f16x2 r367, %64, %94; +} +{ +add.f16x2 r370, %54, r367; +} +{ +add.f16x2 r373, %74, %84; +} +{ +add.f16x2 r376, r370, r373; +} 
+{ +add.f16x2 r379, %63, %93; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, %53, r382; +} +{ +add.f16x2 r388, %73, %83; +} +{ +mul.f16x2 r391, r388, r349; +} +{ +add.f16x2 r394, r385, r391; +} +{ +sub.f16x2 r397, %64, %94; +} +{ +mul.f16x2 r400, r397, r347; +} +{ +sub.f16x2 r403, %74, %84; +} +{ +mul.f16x2 r406, r403, r351; +} +{ +add.f16x2 r409, r400, r406; +} +{ +sub.f16x2 r412, r394, r409; +} +{ +add.f16x2 r415, %63, %93; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, %53, r418; +} +{ +add.f16x2 r424, %73, %83; +} +{ +mul.f16x2 r427, r424, r349; +} +{ +add.f16x2 r430, r421, r427; +} +{ +sub.f16x2 r433, %64, %94; +} +{ +mul.f16x2 r436, r433, r347; +} +{ +sub.f16x2 r439, %74, %84; +} +{ +mul.f16x2 r442, r439, r351; +} +{ +add.f16x2 r445, r436, r442; +} +{ +add.f16x2 r448, r430, r445; +} +{ +add.f16x2 r451, %63, %93; +} +{ +mul.f16x2 r454, r451, r349; +} +{ +add.f16x2 r457, %53, r454; +} +{ +add.f16x2 r460, %73, %83; +} +{ +mul.f16x2 r463, r460, r353; +} +{ +add.f16x2 r466, r457, r463; +} +{ +sub.f16x2 r469, %64, %94; +} +{ +mul.f16x2 r472, r469, r351; +} +{ +sub.f16x2 r475, %74, %84; +} +{ +mul.f16x2 r478, r475, r354; +} +{ +add.f16x2 r481, r472, r478; +} +{ +sub.f16x2 r484, r466, r481; +} +{ +add.f16x2 r487, %63, %93; +} +{ +mul.f16x2 r490, r487, r349; +} +{ +add.f16x2 r493, %53, r490; +} +{ +add.f16x2 r496, %73, %83; +} +{ +mul.f16x2 r499, r496, r353; +} +{ +add.f16x2 r502, r493, r499; +} +{ +sub.f16x2 r505, %64, %94; +} +{ +mul.f16x2 r508, r505, r351; +} +{ +sub.f16x2 r511, %74, %84; +} +{ +mul.f16x2 r514, r511, r354; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r502, r517; +} +{ +add.f16x2 r523, %64, %94; +} +{ +mul.f16x2 r526, r523, r345; +} +{ +add.f16x2 r529, %54, r526; +} +{ +add.f16x2 r532, %74, %84; +} +{ +mul.f16x2 r535, r532, r349; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, %63, %93; +} +{ +mul.f16x2 r544, r541, r347; +} +{ +sub.f16x2 r547, %73, %83; +} +{ +mul.f16x2 r550, r547, r351; +} +{ +add.f16x2 
r553, r544, r550; +} +{ +add.f16x2 r556, r538, r553; +} +{ +add.f16x2 r559, %64, %94; +} +{ +mul.f16x2 r562, r559, r345; +} +{ +add.f16x2 r565, %54, r562; +} +{ +add.f16x2 r568, %74, %84; +} +{ +mul.f16x2 r571, r568, r349; +} +{ +add.f16x2 r574, r565, r571; +} +{ +sub.f16x2 r577, %63, %93; +} +{ +mul.f16x2 r580, r577, r347; +} +{ +sub.f16x2 r583, %73, %83; +} +{ +mul.f16x2 r586, r583, r351; +} +{ +add.f16x2 r589, r580, r586; +} +{ +sub.f16x2 r592, r574, r589; +} +{ +add.f16x2 r595, %64, %94; +} +{ +mul.f16x2 r598, r595, r349; +} +{ +add.f16x2 r601, %54, r598; +} +{ +add.f16x2 r604, %74, %84; +} +{ +mul.f16x2 r607, r604, r353; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, %63, %93; +} +{ +mul.f16x2 r616, r613, r351; +} +{ +sub.f16x2 r619, %73, %83; +} +{ +mul.f16x2 r622, r619, r354; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r610, r625; +} +{ +add.f16x2 r631, %64, %94; +} +{ +mul.f16x2 r634, r631, r349; +} +{ +add.f16x2 r637, %54, r634; +} +{ +add.f16x2 r640, %74, %84; +} +{ +mul.f16x2 r643, r640, r353; +} +{ +add.f16x2 r646, r637, r643; +} +{ +sub.f16x2 r649, %63, %93; +} +{ +mul.f16x2 r652, r649, r351; +} +{ +sub.f16x2 r655, %73, %83; +} +{ +mul.f16x2 r658, r655, r354; +} +{ +add.f16x2 r661, r652, r658; +} +{ +sub.f16x2 r664, r646, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r668, {low, high}; +} +{ +neg.f16x2 r669, r668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r672, {low, high}; +} +{ +neg.f16x2 r673, r672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r675, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r676, 
{low, high}; +} +{ +add.f16x2 r677, %65, %95; +} +{ +add.f16x2 r680, %55, r677; +} +{ +add.f16x2 r683, %75, %85; +} +{ +add.f16x2 r686, r680, r683; +} +{ +add.f16x2 r689, %66, %96; +} +{ +add.f16x2 r692, %56, r689; +} +{ +add.f16x2 r695, %76, %86; +} +{ +add.f16x2 r698, r692, r695; +} +{ +add.f16x2 r701, %65, %95; +} +{ +mul.f16x2 r704, r701, r667; +} +{ +add.f16x2 r707, %55, r704; +} +{ +add.f16x2 r710, %75, %85; +} +{ +mul.f16x2 r713, r710, r671; +} +{ +add.f16x2 r716, r707, r713; +} +{ +sub.f16x2 r719, %66, %96; +} +{ +mul.f16x2 r722, r719, r669; +} +{ +sub.f16x2 r725, %76, %86; +} +{ +mul.f16x2 r728, r725, r673; +} +{ +add.f16x2 r731, r722, r728; +} +{ +sub.f16x2 r734, r716, r731; +} +{ +add.f16x2 r737, %65, %95; +} +{ +mul.f16x2 r740, r737, r667; +} +{ +add.f16x2 r743, %55, r740; +} +{ +add.f16x2 r746, %75, %85; +} +{ +mul.f16x2 r749, r746, r671; +} +{ +add.f16x2 r752, r743, r749; +} +{ +sub.f16x2 r755, %66, %96; +} +{ +mul.f16x2 r758, r755, r669; +} +{ +sub.f16x2 r761, %76, %86; +} +{ +mul.f16x2 r764, r761, r673; +} +{ +add.f16x2 r767, r758, r764; +} +{ +add.f16x2 r770, r752, r767; +} +{ +add.f16x2 r773, %65, %95; +} +{ +mul.f16x2 r776, r773, r671; +} +{ +add.f16x2 r779, %55, r776; +} +{ +add.f16x2 r782, %75, %85; +} +{ +mul.f16x2 r785, r782, r675; +} +{ +add.f16x2 r788, r779, r785; +} +{ +sub.f16x2 r791, %66, %96; +} +{ +mul.f16x2 r794, r791, r673; +} +{ +sub.f16x2 r797, %76, %86; +} +{ +mul.f16x2 r800, r797, r676; +} +{ +add.f16x2 r803, r794, r800; +} +{ +sub.f16x2 r806, r788, r803; +} +{ +add.f16x2 r809, %65, %95; +} +{ +mul.f16x2 r812, r809, r671; +} +{ +add.f16x2 r815, %55, r812; +} +{ +add.f16x2 r818, %75, %85; +} +{ +mul.f16x2 r821, r818, r675; +} +{ +add.f16x2 r824, r815, r821; +} +{ +sub.f16x2 r827, %66, %96; +} +{ +mul.f16x2 r830, r827, r673; +} +{ +sub.f16x2 r833, %76, %86; +} +{ +mul.f16x2 r836, r833, r676; +} +{ +add.f16x2 r839, r830, r836; +} +{ +add.f16x2 r842, r824, r839; +} +{ +add.f16x2 r845, %66, %96; +} +{ +mul.f16x2 r848, r845, r667; +} 
+{ +add.f16x2 r851, %56, r848; +} +{ +add.f16x2 r854, %76, %86; +} +{ +mul.f16x2 r857, r854, r671; +} +{ +add.f16x2 r860, r851, r857; +} +{ +sub.f16x2 r863, %65, %95; +} +{ +mul.f16x2 r866, r863, r669; +} +{ +sub.f16x2 r869, %75, %85; +} +{ +mul.f16x2 r872, r869, r673; +} +{ +add.f16x2 r875, r866, r872; +} +{ +add.f16x2 r878, r860, r875; +} +{ +add.f16x2 r881, %66, %96; +} +{ +mul.f16x2 r884, r881, r667; +} +{ +add.f16x2 r887, %56, r884; +} +{ +add.f16x2 r890, %76, %86; +} +{ +mul.f16x2 r893, r890, r671; +} +{ +add.f16x2 r896, r887, r893; +} +{ +sub.f16x2 r899, %65, %95; +} +{ +mul.f16x2 r902, r899, r669; +} +{ +sub.f16x2 r905, %75, %85; +} +{ +mul.f16x2 r908, r905, r673; +} +{ +add.f16x2 r911, r902, r908; +} +{ +sub.f16x2 r914, r896, r911; +} +{ +add.f16x2 r917, %66, %96; +} +{ +mul.f16x2 r920, r917, r671; +} +{ +add.f16x2 r923, %56, r920; +} +{ +add.f16x2 r926, %76, %86; +} +{ +mul.f16x2 r929, r926, r675; +} +{ +add.f16x2 r932, r923, r929; +} +{ +sub.f16x2 r935, %65, %95; +} +{ +mul.f16x2 r938, r935, r673; +} +{ +sub.f16x2 r941, %75, %85; +} +{ +mul.f16x2 r944, r941, r676; +} +{ +add.f16x2 r947, r938, r944; +} +{ +add.f16x2 r950, r932, r947; +} +{ +add.f16x2 r953, %66, %96; +} +{ +mul.f16x2 r956, r953, r671; +} +{ +add.f16x2 r959, %56, r956; +} +{ +add.f16x2 r962, %76, %86; +} +{ +mul.f16x2 r965, r962, r675; +} +{ +add.f16x2 r968, r959, r965; +} +{ +sub.f16x2 r971, %65, %95; +} +{ +mul.f16x2 r974, r971, r673; +} +{ +sub.f16x2 r977, %75, %85; +} +{ +mul.f16x2 r980, r977, r676; +} +{ +add.f16x2 r983, r974, r980; +} +{ +sub.f16x2 r986, r968, r983; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r989, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r990, {low, high}; +} +{ +neg.f16x2 r991, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r993, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; 
+cvt.rn.f16.f32 high, f212; +mov.b32 r994, {low, high}; +} +{ +neg.f16x2 r995, r994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r998, {low, high}; +} +{ +add.f16x2 r999, %67, %97; +} +{ +add.f16x2 r1002, %57, r999; +} +{ +add.f16x2 r1005, %77, %87; +} +{ +add.f16x2 r1008, r1002, r1005; +} +{ +add.f16x2 r1011, %68, %98; +} +{ +add.f16x2 r1014, %58, r1011; +} +{ +add.f16x2 r1017, %78, %88; +} +{ +add.f16x2 r1020, r1014, r1017; +} +{ +add.f16x2 r1023, %67, %97; +} +{ +mul.f16x2 r1026, r1023, r989; +} +{ +add.f16x2 r1029, %57, r1026; +} +{ +add.f16x2 r1032, %77, %87; +} +{ +mul.f16x2 r1035, r1032, r993; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +sub.f16x2 r1041, %68, %98; +} +{ +mul.f16x2 r1044, r1041, r991; +} +{ +sub.f16x2 r1047, %78, %88; +} +{ +mul.f16x2 r1050, r1047, r995; +} +{ +add.f16x2 r1053, r1044, r1050; +} +{ +sub.f16x2 r1056, r1038, r1053; +} +{ +add.f16x2 r1059, %67, %97; +} +{ +mul.f16x2 r1062, r1059, r989; +} +{ +add.f16x2 r1065, %57, r1062; +} +{ +add.f16x2 r1068, %77, %87; +} +{ +mul.f16x2 r1071, r1068, r993; +} +{ +add.f16x2 r1074, r1065, r1071; +} +{ +sub.f16x2 r1077, %68, %98; +} +{ +mul.f16x2 r1080, r1077, r991; +} +{ +sub.f16x2 r1083, %78, %88; +} +{ +mul.f16x2 r1086, r1083, r995; +} +{ +add.f16x2 r1089, r1080, r1086; +} +{ +add.f16x2 r1092, r1074, r1089; +} +{ +add.f16x2 r1095, %67, %97; +} +{ +mul.f16x2 r1098, r1095, r993; +} +{ +add.f16x2 r1101, %57, r1098; +} +{ +add.f16x2 r1104, %77, %87; +} +{ +mul.f16x2 r1107, r1104, r997; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, %68, %98; +} +{ +mul.f16x2 r1116, r1113, r995; +} +{ +sub.f16x2 r1119, %78, %88; +} +{ +mul.f16x2 r1122, r1119, r998; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r1110, r1125; +} +{ +add.f16x2 r1131, %67, %97; +} +{ +mul.f16x2 r1134, r1131, r993; +} +{ +add.f16x2 r1137, %57, r1134; 
+} +{ +add.f16x2 r1140, %77, %87; +} +{ +mul.f16x2 r1143, r1140, r997; +} +{ +add.f16x2 r1146, r1137, r1143; +} +{ +sub.f16x2 r1149, %68, %98; +} +{ +mul.f16x2 r1152, r1149, r995; +} +{ +sub.f16x2 r1155, %78, %88; +} +{ +mul.f16x2 r1158, r1155, r998; +} +{ +add.f16x2 r1161, r1152, r1158; +} +{ +add.f16x2 r1164, r1146, r1161; +} +{ +add.f16x2 r1167, %68, %98; +} +{ +mul.f16x2 r1170, r1167, r989; +} +{ +add.f16x2 r1173, %58, r1170; +} +{ +add.f16x2 r1176, %78, %88; +} +{ +mul.f16x2 r1179, r1176, r993; +} +{ +add.f16x2 r1182, r1173, r1179; +} +{ +sub.f16x2 r1185, %67, %97; +} +{ +mul.f16x2 r1188, r1185, r991; +} +{ +sub.f16x2 r1191, %77, %87; +} +{ +mul.f16x2 r1194, r1191, r995; +} +{ +add.f16x2 r1197, r1188, r1194; +} +{ +add.f16x2 r1200, r1182, r1197; +} +{ +add.f16x2 r1203, %68, %98; +} +{ +mul.f16x2 r1206, r1203, r989; +} +{ +add.f16x2 r1209, %58, r1206; +} +{ +add.f16x2 r1212, %78, %88; +} +{ +mul.f16x2 r1215, r1212, r993; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, %67, %97; +} +{ +mul.f16x2 r1224, r1221, r991; +} +{ +sub.f16x2 r1227, %77, %87; +} +{ +mul.f16x2 r1230, r1227, r995; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r1218, r1233; +} +{ +add.f16x2 r1239, %68, %98; +} +{ +mul.f16x2 r1242, r1239, r993; +} +{ +add.f16x2 r1245, %58, r1242; +} +{ +add.f16x2 r1248, %78, %88; +} +{ +mul.f16x2 r1251, r1248, r997; +} +{ +add.f16x2 r1254, r1245, r1251; +} +{ +sub.f16x2 r1257, %67, %97; +} +{ +mul.f16x2 r1260, r1257, r995; +} +{ +sub.f16x2 r1263, %77, %87; +} +{ +mul.f16x2 r1266, r1263, r998; +} +{ +add.f16x2 r1269, r1260, r1266; +} +{ +add.f16x2 r1272, r1254, r1269; +} +{ +add.f16x2 r1275, %68, %98; +} +{ +mul.f16x2 r1278, r1275, r993; +} +{ +add.f16x2 r1281, %58, r1278; +} +{ +add.f16x2 r1284, %78, %88; +} +{ +mul.f16x2 r1287, r1284, r997; +} +{ +add.f16x2 r1290, r1281, r1287; +} +{ +sub.f16x2 r1293, %67, %97; +} +{ +mul.f16x2 r1296, r1293, r995; +} +{ +sub.f16x2 r1299, %77, %87; +} +{ +mul.f16x2 r1302, r1299, r998; +} +{ 
+add.f16x2 r1305, r1296, r1302; +} +{ +sub.f16x2 r1308, r1290, r1305; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1311, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1312, {low, high}; +} +{ +neg.f16x2 r1313, r1312; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1316, {low, high}; +} +{ +neg.f16x2 r1317, r1316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1319, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1320, {low, high}; +} +{ +add.f16x2 r1321, %69, %99; +} +{ +add.f16x2 r1324, %59, r1321; +} +{ +add.f16x2 r1327, %79, %89; +} +{ +add.f16x2 r1330, r1324, r1327; +} +{ +add.f16x2 r1333, %70, %100; +} +{ +add.f16x2 r1336, %60, r1333; +} +{ +add.f16x2 r1339, %80, %90; +} +{ +add.f16x2 r1342, r1336, r1339; +} +{ +add.f16x2 r1345, %69, %99; +} +{ +mul.f16x2 r1348, r1345, r1311; +} +{ +add.f16x2 r1351, %59, r1348; +} +{ +add.f16x2 r1354, %79, %89; +} +{ +mul.f16x2 r1357, r1354, r1315; +} +{ +add.f16x2 r1360, r1351, r1357; +} +{ +sub.f16x2 r1363, %70, %100; +} +{ +mul.f16x2 r1366, r1363, r1313; +} +{ +sub.f16x2 r1369, %80, %90; +} +{ +mul.f16x2 r1372, r1369, r1317; +} +{ +add.f16x2 r1375, r1366, r1372; +} +{ +sub.f16x2 r1378, r1360, r1375; +} +{ +add.f16x2 r1381, %69, %99; +} +{ +mul.f16x2 r1384, r1381, r1311; +} +{ +add.f16x2 r1387, %59, r1384; +} +{ +add.f16x2 r1390, %79, %89; +} +{ +mul.f16x2 r1393, r1390, r1315; +} +{ +add.f16x2 r1396, r1387, r1393; +} +{ +sub.f16x2 r1399, %70, %100; +} +{ +mul.f16x2 r1402, r1399, r1313; +} +{ +sub.f16x2 r1405, %80, %90; +} +{ +mul.f16x2 r1408, r1405, r1317; +} +{ +add.f16x2 r1411, r1402, r1408; +} +{ +add.f16x2 r1414, r1396, r1411; +} +{ +add.f16x2 r1417, %69, 
%99; +} +{ +mul.f16x2 r1420, r1417, r1315; +} +{ +add.f16x2 r1423, %59, r1420; +} +{ +add.f16x2 r1426, %79, %89; +} +{ +mul.f16x2 r1429, r1426, r1319; +} +{ +add.f16x2 r1432, r1423, r1429; +} +{ +sub.f16x2 r1435, %70, %100; +} +{ +mul.f16x2 r1438, r1435, r1317; +} +{ +sub.f16x2 r1441, %80, %90; +} +{ +mul.f16x2 r1444, r1441, r1320; +} +{ +add.f16x2 r1447, r1438, r1444; +} +{ +sub.f16x2 r1450, r1432, r1447; +} +{ +add.f16x2 r1453, %69, %99; +} +{ +mul.f16x2 r1456, r1453, r1315; +} +{ +add.f16x2 r1459, %59, r1456; +} +{ +add.f16x2 r1462, %79, %89; +} +{ +mul.f16x2 r1465, r1462, r1319; +} +{ +add.f16x2 r1468, r1459, r1465; +} +{ +sub.f16x2 r1471, %70, %100; +} +{ +mul.f16x2 r1474, r1471, r1317; +} +{ +sub.f16x2 r1477, %80, %90; +} +{ +mul.f16x2 r1480, r1477, r1320; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +add.f16x2 r1486, r1468, r1483; +} +{ +add.f16x2 r1489, %70, %100; +} +{ +mul.f16x2 r1492, r1489, r1311; +} +{ +add.f16x2 r1495, %60, r1492; +} +{ +add.f16x2 r1498, %80, %90; +} +{ +mul.f16x2 r1501, r1498, r1315; +} +{ +add.f16x2 r1504, r1495, r1501; +} +{ +sub.f16x2 r1507, %69, %99; +} +{ +mul.f16x2 r1510, r1507, r1313; +} +{ +sub.f16x2 r1513, %79, %89; +} +{ +mul.f16x2 r1516, r1513, r1317; +} +{ +add.f16x2 r1519, r1510, r1516; +} +{ +add.f16x2 r1522, r1504, r1519; +} +{ +add.f16x2 r1525, %70, %100; +} +{ +mul.f16x2 r1528, r1525, r1311; +} +{ +add.f16x2 r1531, %60, r1528; +} +{ +add.f16x2 r1534, %80, %90; +} +{ +mul.f16x2 r1537, r1534, r1315; +} +{ +add.f16x2 r1540, r1531, r1537; +} +{ +sub.f16x2 r1543, %69, %99; +} +{ +mul.f16x2 r1546, r1543, r1313; +} +{ +sub.f16x2 r1549, %79, %89; +} +{ +mul.f16x2 r1552, r1549, r1317; +} +{ +add.f16x2 r1555, r1546, r1552; +} +{ +sub.f16x2 r1558, r1540, r1555; +} +{ +add.f16x2 r1561, %70, %100; +} +{ +mul.f16x2 r1564, r1561, r1315; +} +{ +add.f16x2 r1567, %60, r1564; +} +{ +add.f16x2 r1570, %80, %90; +} +{ +mul.f16x2 r1573, r1570, r1319; +} +{ +add.f16x2 r1576, r1567, r1573; +} +{ +sub.f16x2 r1579, %69, %99; +} +{ +mul.f16x2 
r1582, r1579, r1317; +} +{ +sub.f16x2 r1585, %79, %89; +} +{ +mul.f16x2 r1588, r1585, r1320; +} +{ +add.f16x2 r1591, r1582, r1588; +} +{ +add.f16x2 r1594, r1576, r1591; +} +{ +add.f16x2 r1597, %70, %100; +} +{ +mul.f16x2 r1600, r1597, r1315; +} +{ +add.f16x2 r1603, %60, r1600; +} +{ +add.f16x2 r1606, %80, %90; +} +{ +mul.f16x2 r1609, r1606, r1319; +} +{ +add.f16x2 r1612, r1603, r1609; +} +{ +sub.f16x2 r1615, %69, %99; +} +{ +mul.f16x2 r1618, r1615, r1317; +} +{ +sub.f16x2 r1621, %79, %89; +} +{ +mul.f16x2 r1624, r1621, r1320; +} +{ +add.f16x2 r1627, r1618, r1624; +} +{ +sub.f16x2 r1630, r1612, r1627; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1633, {low, high}; +} +mov.f32 f64, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1634, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1635, {low, high}; +} +mov.f32 f68, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1636, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1637, {low, high}; +} +mov.f32 f72, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1638, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1639, {low, high}; +} +mov.f32 f76, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1640, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1643, {low, high}; +} +mov.f32 f84, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1644, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1647, {low, high}; +} +mov.f32 f92, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1648, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1649, {low, high}; +} +mov.f32 f96, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1650, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1655, {low, high}; +} +mov.f32 f108, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1663, {low, high}; +} +mov.f32 f124, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1681, r412, r1633; +} +{ +mul.f16x2 r1684, r556, r1634; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r412, r1634; +} +{ +fma.rn.f16x2 r1693, r556, r1633, r1690; +} +{ +mul.f16x2 r1697, r734, r1635; +} +{ +mul.f16x2 r1700, r878, r1636; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r734, r1636; +} +{ +fma.rn.f16x2 r1709, r878, r1635, r1706; +} +{ +mul.f16x2 r1713, r1056, r1637; +} +{ +mul.f16x2 r1716, r1200, r1638; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r1056, r1638; +} +{ +fma.rn.f16x2 r1725, r1200, r1637, r1722; +} +{ +mul.f16x2 r1729, r1378, r1639; +} +{ +mul.f16x2 r1732, r1522, r1640; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r1378, r1640; +} +{ +fma.rn.f16x2 r1741, r1522, r1639, r1738; +} +{ +mul.f16x2 r1745, r484, r1635; +} +{ +mul.f16x2 r1748, r628, r1636; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r484, r1636; +} +{ 
+fma.rn.f16x2 r1757, r628, r1635, r1754; +} +{ +mul.f16x2 r1761, r806, r1639; +} +{ +mul.f16x2 r1764, r950, r1640; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r806, r1640; +} +{ +fma.rn.f16x2 r1773, r950, r1639, r1770; +} +{ +mul.f16x2 r1777, r1128, r1643; +} +{ +mul.f16x2 r1780, r1272, r1644; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1128, r1644; +} +{ +fma.rn.f16x2 r1789, r1272, r1643, r1786; +} +{ +mul.f16x2 r1793, r1450, r1647; +} +{ +mul.f16x2 r1796, r1594, r1648; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1450, r1648; +} +{ +fma.rn.f16x2 r1805, r1594, r1647, r1802; +} +{ +mul.f16x2 r1809, r520, r1637; +} +{ +mul.f16x2 r1812, r664, r1638; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r520, r1638; +} +{ +fma.rn.f16x2 r1821, r664, r1637, r1818; +} +{ +mul.f16x2 r1825, r842, r1643; +} +{ +mul.f16x2 r1828, r986, r1644; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r842, r1644; +} +{ +fma.rn.f16x2 r1837, r986, r1643, r1834; +} +{ +mul.f16x2 r1841, r1164, r1649; +} +{ +mul.f16x2 r1844, r1308, r1650; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1164, r1650; +} +{ +fma.rn.f16x2 r1853, r1308, r1649, r1850; +} +{ +mul.f16x2 r1857, r1486, r1655; +} +{ +mul.f16x2 r1860, r1630, r1656; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1486, r1656; +} +{ +fma.rn.f16x2 r1869, r1630, r1655, r1866; +} +{ +mul.f16x2 r1873, r448, r1639; +} +{ +mul.f16x2 r1876, r592, r1640; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r448, r1640; +} +{ +fma.rn.f16x2 r1885, r592, r1639, r1882; +} +{ +mul.f16x2 r1889, r770, r1647; +} +{ +mul.f16x2 r1892, r914, r1648; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r770, r1648; +} +{ +fma.rn.f16x2 r1901, r914, r1647, r1898; +} +{ +mul.f16x2 r1905, r1092, r1655; +} +{ +mul.f16x2 r1908, r1236, r1656; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r1092, r1656; +} +{ +fma.rn.f16x2 r1917, 
r1236, r1655, r1914; +} +{ +mul.f16x2 r1921, r1414, r1663; +} +{ +mul.f16x2 r1924, r1558, r1664; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1414, r1664; +} +{ +fma.rn.f16x2 r1933, r1558, r1663, r1930; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1937, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1938, {low, high}; +} +{ +neg.f16x2 r1939, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1942, {low, high}; +} +{ +neg.f16x2 r1943, r1942; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1946, {low, high}; +} +{ +add.f16x2 r1947, r364, r1330; +} +{ +add.f16x2 r1950, r42, r1947; +} +{ +add.f16x2 r1953, r686, r1008; +} +{ +add.f16x2 r1956, r1950, r1953; +} +st.local.u32 [rd3], r1956; +{ +add.f16x2 r1959, r376, r1342; +} +{ +add.f16x2 r1962, r54, r1959; +} +{ +add.f16x2 r1965, r698, r1020; +} +{ +add.f16x2 r1968, r1962, r1965; +} +st.local.u32 [rd3+4], r1968; +{ +add.f16x2 r1971, r364, r1330; +} +{ +mul.f16x2 r1974, r1971, r1937; +} +{ +add.f16x2 r1977, r42, r1974; +} +{ +add.f16x2 r1980, r686, r1008; +} +{ +mul.f16x2 r1983, r1980, r1941; +} +{ +add.f16x2 r1986, r1977, r1983; +} +{ +sub.f16x2 r1989, r376, r1342; +} +{ +mul.f16x2 r1992, r1989, r1939; +} +{ +sub.f16x2 r1995, r698, r1020; +} +{ +mul.f16x2 r1998, r1995, r1943; +} +{ +add.f16x2 r2001, r1992, r1998; +} +{ +sub.f16x2 r2004, r1986, r2001; +} +st.local.u32 [rd3+40], r2004; +{ +add.f16x2 r2007, r364, r1330; +} +{ +mul.f16x2 r2010, r2007, r1937; +} +{ +add.f16x2 r2013, r42, r2010; +} +{ +add.f16x2 r2016, r686, r1008; +} +{ +mul.f16x2 r2019, r2016, r1941; +} +{ +add.f16x2 r2022, 
r2013, r2019; +} +{ +sub.f16x2 r2025, r376, r1342; +} +{ +mul.f16x2 r2028, r2025, r1939; +} +{ +sub.f16x2 r2031, r698, r1020; +} +{ +mul.f16x2 r2034, r2031, r1943; +} +{ +add.f16x2 r2037, r2028, r2034; +} +{ +add.f16x2 r2040, r2022, r2037; +} +st.local.u32 [rd3+160], r2040; +{ +add.f16x2 r2043, r364, r1330; +} +{ +mul.f16x2 r2046, r2043, r1941; +} +{ +add.f16x2 r2049, r42, r2046; +} +{ +add.f16x2 r2052, r686, r1008; +} +{ +mul.f16x2 r2055, r2052, r1945; +} +{ +add.f16x2 r2058, r2049, r2055; +} +{ +sub.f16x2 r2061, r376, r1342; +} +{ +mul.f16x2 r2064, r2061, r1943; +} +{ +sub.f16x2 r2067, r698, r1020; +} +{ +mul.f16x2 r2070, r2067, r1946; +} +{ +add.f16x2 r2073, r2064, r2070; +} +{ +sub.f16x2 r2076, r2058, r2073; +} +st.local.u32 [rd3+80], r2076; +{ +add.f16x2 r2079, r364, r1330; +} +{ +mul.f16x2 r2082, r2079, r1941; +} +{ +add.f16x2 r2085, r42, r2082; +} +{ +add.f16x2 r2088, r686, r1008; +} +{ +mul.f16x2 r2091, r2088, r1945; +} +{ +add.f16x2 r2094, r2085, r2091; +} +{ +sub.f16x2 r2097, r376, r1342; +} +{ +mul.f16x2 r2100, r2097, r1943; +} +{ +sub.f16x2 r2103, r698, r1020; +} +{ +mul.f16x2 r2106, r2103, r1946; +} +{ +add.f16x2 r2109, r2100, r2106; +} +{ +add.f16x2 r2112, r2094, r2109; +} +st.local.u32 [rd3+120], r2112; +{ +add.f16x2 r2115, r376, r1342; +} +{ +mul.f16x2 r2118, r2115, r1937; +} +{ +add.f16x2 r2121, r54, r2118; +} +{ +add.f16x2 r2124, r698, r1020; +} +{ +mul.f16x2 r2127, r2124, r1941; +} +{ +add.f16x2 r2130, r2121, r2127; +} +{ +sub.f16x2 r2133, r364, r1330; +} +{ +mul.f16x2 r2136, r2133, r1939; +} +{ +sub.f16x2 r2139, r686, r1008; +} +{ +mul.f16x2 r2142, r2139, r1943; +} +{ +add.f16x2 r2145, r2136, r2142; +} +{ +add.f16x2 r2148, r2130, r2145; +} +st.local.u32 [rd3+44], r2148; +{ +add.f16x2 r2151, r376, r1342; +} +{ +mul.f16x2 r2154, r2151, r1937; +} +{ +add.f16x2 r2157, r54, r2154; +} +{ +add.f16x2 r2160, r698, r1020; +} +{ +mul.f16x2 r2163, r2160, r1941; +} +{ +add.f16x2 r2166, r2157, r2163; +} +{ +sub.f16x2 r2169, r364, r1330; +} +{ +mul.f16x2 
r2172, r2169, r1939; +} +{ +sub.f16x2 r2175, r686, r1008; +} +{ +mul.f16x2 r2178, r2175, r1943; +} +{ +add.f16x2 r2181, r2172, r2178; +} +{ +sub.f16x2 r2184, r2166, r2181; +} +st.local.u32 [rd3+164], r2184; +{ +add.f16x2 r2187, r376, r1342; +} +{ +mul.f16x2 r2190, r2187, r1941; +} +{ +add.f16x2 r2193, r54, r2190; +} +{ +add.f16x2 r2196, r698, r1020; +} +{ +mul.f16x2 r2199, r2196, r1945; +} +{ +add.f16x2 r2202, r2193, r2199; +} +{ +sub.f16x2 r2205, r364, r1330; +} +{ +mul.f16x2 r2208, r2205, r1943; +} +{ +sub.f16x2 r2211, r686, r1008; +} +{ +mul.f16x2 r2214, r2211, r1946; +} +{ +add.f16x2 r2217, r2208, r2214; +} +{ +add.f16x2 r2220, r2202, r2217; +} +st.local.u32 [rd3+84], r2220; +{ +add.f16x2 r2223, r376, r1342; +} +{ +mul.f16x2 r2226, r2223, r1941; +} +{ +add.f16x2 r2229, r54, r2226; +} +{ +add.f16x2 r2232, r698, r1020; +} +{ +mul.f16x2 r2235, r2232, r1945; +} +{ +add.f16x2 r2238, r2229, r2235; +} +{ +sub.f16x2 r2241, r364, r1330; +} +{ +mul.f16x2 r2244, r2241, r1943; +} +{ +sub.f16x2 r2247, r686, r1008; +} +{ +mul.f16x2 r2250, r2247, r1946; +} +{ +add.f16x2 r2253, r2244, r2250; +} +{ +sub.f16x2 r2256, r2238, r2253; +} +st.local.u32 [rd3+124], r2256; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2260, {low, high}; +} +{ +neg.f16x2 r2261, r2260; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2264, {low, high}; +} +{ +neg.f16x2 r2265, r2264; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2268, {low, high}; +} +{ +add.f16x2 r2269, r1687, r1735; +} +{ +add.f16x2 r2272, r90, r2269; +} +{ +add.f16x2 r2275, 
r1703, r1719; +} +{ +add.f16x2 r11408, r2272, r2275; +} +st.local.u32 [rd3+8], r11408; +{ +add.f16x2 r2281, r1693, r1741; +} +{ +add.f16x2 r2284, r234, r2281; +} +{ +add.f16x2 r2287, r1709, r1725; +} +{ +add.f16x2 r2290, r2284, r2287; +} +st.local.u32 [rd3+12], r2290; +{ +add.f16x2 r2293, r1687, r1735; +} +{ +mul.f16x2 r2296, r2293, r2259; +} +{ +add.f16x2 r2299, r90, r2296; +} +{ +add.f16x2 r2302, r1703, r1719; +} +{ +mul.f16x2 r2305, r2302, r2263; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1693, r1741; +} +{ +mul.f16x2 r2314, r2311, r2261; +} +{ +sub.f16x2 r2317, r1709, r1725; +} +{ +mul.f16x2 r2320, r2317, r2265; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 r2326, r2308, r2323; +} +st.local.u32 [rd3+48], r2326; +{ +add.f16x2 r2329, r1687, r1735; +} +{ +mul.f16x2 r2332, r2329, r2259; +} +{ +add.f16x2 r2335, r90, r2332; +} +{ +add.f16x2 r2338, r1703, r1719; +} +{ +mul.f16x2 r2341, r2338, r2263; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1693, r1741; +} +{ +mul.f16x2 r2350, r2347, r2261; +} +{ +sub.f16x2 r2353, r1709, r1725; +} +{ +mul.f16x2 r2356, r2353, r2265; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +add.f16x2 r2362, r2344, r2359; +} +st.local.u32 [rd3+168], r2362; +{ +add.f16x2 r2365, r1687, r1735; +} +{ +mul.f16x2 r2368, r2365, r2263; +} +{ +add.f16x2 r2371, r90, r2368; +} +{ +add.f16x2 r2374, r1703, r1719; +} +{ +mul.f16x2 r2377, r2374, r2267; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1693, r1741; +} +{ +mul.f16x2 r2386, r2383, r2265; +} +{ +sub.f16x2 r2389, r1709, r1725; +} +{ +mul.f16x2 r2392, r2389, r2268; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +sub.f16x2 r2398, r2380, r2395; +} +st.local.u32 [rd3+88], r2398; +{ +add.f16x2 r2401, r1687, r1735; +} +{ +mul.f16x2 r2404, r2401, r2263; +} +{ +add.f16x2 r2407, r90, r2404; +} +{ +add.f16x2 r2410, r1703, r1719; +} +{ +mul.f16x2 r2413, r2410, r2267; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1693, r1741; +} +{ 
+mul.f16x2 r2422, r2419, r2265; +} +{ +sub.f16x2 r2425, r1709, r1725; +} +{ +mul.f16x2 r2428, r2425, r2268; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +st.local.u32 [rd3+128], r2434; +{ +add.f16x2 r2437, r1693, r1741; +} +{ +mul.f16x2 r2440, r2437, r2259; +} +{ +add.f16x2 r2443, r234, r2440; +} +{ +add.f16x2 r2446, r1709, r1725; +} +{ +mul.f16x2 r2449, r2446, r2263; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1687, r1735; +} +{ +mul.f16x2 r2458, r2455, r2261; +} +{ +sub.f16x2 r2461, r1703, r1719; +} +{ +mul.f16x2 r2464, r2461, r2265; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +add.f16x2 r2470, r2452, r2467; +} +st.local.u32 [rd3+52], r2470; +{ +add.f16x2 r2473, r1693, r1741; +} +{ +mul.f16x2 r2476, r2473, r2259; +} +{ +add.f16x2 r2479, r234, r2476; +} +{ +add.f16x2 r2482, r1709, r1725; +} +{ +mul.f16x2 r2485, r2482, r2263; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1687, r1735; +} +{ +mul.f16x2 r2494, r2491, r2261; +} +{ +sub.f16x2 r2497, r1703, r1719; +} +{ +mul.f16x2 r2500, r2497, r2265; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +sub.f16x2 r2506, r2488, r2503; +} +st.local.u32 [rd3+172], r2506; +{ +add.f16x2 r2509, r1693, r1741; +} +{ +mul.f16x2 r2512, r2509, r2263; +} +{ +add.f16x2 r2515, r234, r2512; +} +{ +add.f16x2 r2518, r1709, r1725; +} +{ +mul.f16x2 r2521, r2518, r2267; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1687, r1735; +} +{ +mul.f16x2 r2530, r2527, r2265; +} +{ +sub.f16x2 r2533, r1703, r1719; +} +{ +mul.f16x2 r2536, r2533, r2268; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +add.f16x2 r2542, r2524, r2539; +} +st.local.u32 [rd3+92], r2542; +{ +add.f16x2 r2545, r1693, r1741; +} +{ +mul.f16x2 r2548, r2545, r2263; +} +{ +add.f16x2 r2551, r234, r2548; +} +{ +add.f16x2 r2554, r1709, r1725; +} +{ +mul.f16x2 r2557, r2554, r2267; +} +{ +add.f16x2 r2560, r2551, r2557; +} +{ +sub.f16x2 r2563, r1687, r1735; +} +{ +mul.f16x2 r2566, r2563, r2265; +} +{ +sub.f16x2 
r2569, r1703, r1719; +} +{ +mul.f16x2 r2572, r2569, r2268; +} +{ +add.f16x2 r2575, r2566, r2572; +} +{ +sub.f16x2 r2578, r2560, r2575; +} +st.local.u32 [rd3+132], r2578; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2582, {low, high}; +} +{ +neg.f16x2 r2583, r2582; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2585, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2586, {low, high}; +} +{ +neg.f16x2 r2587, r2586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2590, {low, high}; +} +{ +add.f16x2 r2591, r1751, r1799; +} +{ +add.f16x2 r2594, r162, r2591; +} +{ +add.f16x2 r2597, r1767, r1783; +} +{ +add.f16x2 r2600, r2594, r2597; +} +st.local.u32 [rd3+16], r2600; +{ +add.f16x2 r2603, r1757, r1805; +} +{ +add.f16x2 r2606, r306, r2603; +} +{ +add.f16x2 r2609, r1773, r1789; +} +{ +add.f16x2 r2612, r2606, r2609; +} +st.local.u32 [rd3+20], r2612; +{ +add.f16x2 r2615, r1751, r1799; +} +{ +mul.f16x2 r2618, r2615, r2581; +} +{ +add.f16x2 r2621, r162, r2618; +} +{ +add.f16x2 r2624, r1767, r1783; +} +{ +mul.f16x2 r2627, r2624, r2585; +} +{ +add.f16x2 r2630, r2621, r2627; +} +{ +sub.f16x2 r2633, r1757, r1805; +} +{ +mul.f16x2 r2636, r2633, r2583; +} +{ +sub.f16x2 r2639, r1773, r1789; +} +{ +mul.f16x2 r2642, r2639, r2587; +} +{ +add.f16x2 r2645, r2636, r2642; +} +{ +sub.f16x2 r2648, r2630, r2645; +} +st.local.u32 [rd3+56], r2648; +{ +add.f16x2 r2651, r1751, r1799; +} +{ +mul.f16x2 r2654, r2651, r2581; +} +{ +add.f16x2 r2657, r162, r2654; +} +{ +add.f16x2 r2660, r1767, r1783; +} +{ +mul.f16x2 r2663, r2660, r2585; +} +{ +add.f16x2 r2666, r2657, r2663; +} +{ +sub.f16x2 
r2669, r1757, r1805; +} +{ +mul.f16x2 r2672, r2669, r2583; +} +{ +sub.f16x2 r2675, r1773, r1789; +} +{ +mul.f16x2 r2678, r2675, r2587; +} +{ +add.f16x2 r2681, r2672, r2678; +} +{ +add.f16x2 r2684, r2666, r2681; +} +st.local.u32 [rd3+176], r2684; +{ +add.f16x2 r2687, r1751, r1799; +} +{ +mul.f16x2 r2690, r2687, r2585; +} +{ +add.f16x2 r2693, r162, r2690; +} +{ +add.f16x2 r2696, r1767, r1783; +} +{ +mul.f16x2 r2699, r2696, r2589; +} +{ +add.f16x2 r2702, r2693, r2699; +} +{ +sub.f16x2 r2705, r1757, r1805; +} +{ +mul.f16x2 r2708, r2705, r2587; +} +{ +sub.f16x2 r2711, r1773, r1789; +} +{ +mul.f16x2 r2714, r2711, r2590; +} +{ +add.f16x2 r2717, r2708, r2714; +} +{ +sub.f16x2 r2720, r2702, r2717; +} +st.local.u32 [rd3+96], r2720; +{ +add.f16x2 r2723, r1751, r1799; +} +{ +mul.f16x2 r2726, r2723, r2585; +} +{ +add.f16x2 r2729, r162, r2726; +} +{ +add.f16x2 r2732, r1767, r1783; +} +{ +mul.f16x2 r2735, r2732, r2589; +} +{ +add.f16x2 r2738, r2729, r2735; +} +{ +sub.f16x2 r2741, r1757, r1805; +} +{ +mul.f16x2 r2744, r2741, r2587; +} +{ +sub.f16x2 r2747, r1773, r1789; +} +{ +mul.f16x2 r2750, r2747, r2590; +} +{ +add.f16x2 r2753, r2744, r2750; +} +{ +add.f16x2 r2756, r2738, r2753; +} +st.local.u32 [rd3+136], r2756; +{ +add.f16x2 r2759, r1757, r1805; +} +{ +mul.f16x2 r2762, r2759, r2581; +} +{ +add.f16x2 r2765, r306, r2762; +} +{ +add.f16x2 r2768, r1773, r1789; +} +{ +mul.f16x2 r2771, r2768, r2585; +} +{ +add.f16x2 r2774, r2765, r2771; +} +{ +sub.f16x2 r2777, r1751, r1799; +} +{ +mul.f16x2 r2780, r2777, r2583; +} +{ +sub.f16x2 r2783, r1767, r1783; +} +{ +mul.f16x2 r2786, r2783, r2587; +} +{ +add.f16x2 r2789, r2780, r2786; +} +{ +add.f16x2 r2792, r2774, r2789; +} +st.local.u32 [rd3+60], r2792; +{ +add.f16x2 r2795, r1757, r1805; +} +{ +mul.f16x2 r2798, r2795, r2581; +} +{ +add.f16x2 r2801, r306, r2798; +} +{ +add.f16x2 r2804, r1773, r1789; +} +{ +mul.f16x2 r2807, r2804, r2585; +} +{ +add.f16x2 r2810, r2801, r2807; +} +{ +sub.f16x2 r2813, r1751, r1799; +} +{ +mul.f16x2 r2816, r2813, 
r2583; +} +{ +sub.f16x2 r2819, r1767, r1783; +} +{ +mul.f16x2 r2822, r2819, r2587; +} +{ +add.f16x2 r2825, r2816, r2822; +} +{ +sub.f16x2 r2828, r2810, r2825; +} +st.local.u32 [rd3+180], r2828; +{ +add.f16x2 r2831, r1757, r1805; +} +{ +mul.f16x2 r2834, r2831, r2585; +} +{ +add.f16x2 r2837, r306, r2834; +} +{ +add.f16x2 r2840, r1773, r1789; +} +{ +mul.f16x2 r2843, r2840, r2589; +} +{ +add.f16x2 r2846, r2837, r2843; +} +{ +sub.f16x2 r2849, r1751, r1799; +} +{ +mul.f16x2 r2852, r2849, r2587; +} +{ +sub.f16x2 r2855, r1767, r1783; +} +{ +mul.f16x2 r2858, r2855, r2590; +} +{ +add.f16x2 r2861, r2852, r2858; +} +{ +add.f16x2 r2864, r2846, r2861; +} +st.local.u32 [rd3+100], r2864; +{ +add.f16x2 r2867, r1757, r1805; +} +{ +mul.f16x2 r2870, r2867, r2585; +} +{ +add.f16x2 r2873, r306, r2870; +} +{ +add.f16x2 r2876, r1773, r1789; +} +{ +mul.f16x2 r2879, r2876, r2589; +} +{ +add.f16x2 r2882, r2873, r2879; +} +{ +sub.f16x2 r2885, r1751, r1799; +} +{ +mul.f16x2 r2888, r2885, r2587; +} +{ +sub.f16x2 r2891, r1767, r1783; +} +{ +mul.f16x2 r2894, r2891, r2590; +} +{ +add.f16x2 r2897, r2888, r2894; +} +{ +sub.f16x2 r2900, r2882, r2897; +} +st.local.u32 [rd3+140], r2900; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2904, {low, high}; +} +{ +neg.f16x2 r2905, r2904; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2908, {low, high}; +} +{ +neg.f16x2 r2909, r2908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2911, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2912, {low, high}; +} +{ +add.f16x2 r2913, r1815, r1863; +} +{ +add.f16x2 r2916, r198, r2913; +} +{ +add.f16x2 r2919, 
r1831, r1847; +} +{ +add.f16x2 r2922, r2916, r2919; +} +st.local.u32 [rd3+24], r2922; +{ +add.f16x2 r2925, r1821, r1869; +} +{ +add.f16x2 r2928, r342, r2925; +} +{ +add.f16x2 r2931, r1837, r1853; +} +{ +add.f16x2 r2934, r2928, r2931; +} +st.local.u32 [rd3+28], r2934; +{ +add.f16x2 r2937, r1815, r1863; +} +{ +mul.f16x2 r2940, r2937, r2903; +} +{ +add.f16x2 r2943, r198, r2940; +} +{ +add.f16x2 r2946, r1831, r1847; +} +{ +mul.f16x2 r2949, r2946, r2907; +} +{ +add.f16x2 r2952, r2943, r2949; +} +{ +sub.f16x2 r2955, r1821, r1869; +} +{ +mul.f16x2 r2958, r2955, r2905; +} +{ +sub.f16x2 r2961, r1837, r1853; +} +{ +mul.f16x2 r2964, r2961, r2909; +} +{ +add.f16x2 r2967, r2958, r2964; +} +{ +sub.f16x2 r2970, r2952, r2967; +} +st.local.u32 [rd3+64], r2970; +{ +add.f16x2 r2973, r1815, r1863; +} +{ +mul.f16x2 r2976, r2973, r2903; +} +{ +add.f16x2 r2979, r198, r2976; +} +{ +add.f16x2 r2982, r1831, r1847; +} +{ +mul.f16x2 r2985, r2982, r2907; +} +{ +add.f16x2 r2988, r2979, r2985; +} +{ +sub.f16x2 r2991, r1821, r1869; +} +{ +mul.f16x2 r2994, r2991, r2905; +} +{ +sub.f16x2 r2997, r1837, r1853; +} +{ +mul.f16x2 r3000, r2997, r2909; +} +{ +add.f16x2 r3003, r2994, r3000; +} +{ +add.f16x2 r3006, r2988, r3003; +} +st.local.u32 [rd3+184], r3006; +{ +add.f16x2 r3009, r1815, r1863; +} +{ +mul.f16x2 r3012, r3009, r2907; +} +{ +add.f16x2 r3015, r198, r3012; +} +{ +add.f16x2 r3018, r1831, r1847; +} +{ +mul.f16x2 r3021, r3018, r2911; +} +{ +add.f16x2 r3024, r3015, r3021; +} +{ +sub.f16x2 r3027, r1821, r1869; +} +{ +mul.f16x2 r3030, r3027, r2909; +} +{ +sub.f16x2 r3033, r1837, r1853; +} +{ +mul.f16x2 r3036, r3033, r2912; +} +{ +add.f16x2 r3039, r3030, r3036; +} +{ +sub.f16x2 r3042, r3024, r3039; +} +st.local.u32 [rd3+104], r3042; +{ +add.f16x2 r3045, r1815, r1863; +} +{ +mul.f16x2 r3048, r3045, r2907; +} +{ +add.f16x2 r3051, r198, r3048; +} +{ +add.f16x2 r3054, r1831, r1847; +} +{ +mul.f16x2 r3057, r3054, r2911; +} +{ +add.f16x2 r3060, r3051, r3057; +} +{ +sub.f16x2 r3063, r1821, r1869; +} +{ 
+mul.f16x2 r3066, r3063, r2909; +} +{ +sub.f16x2 r3069, r1837, r1853; +} +{ +mul.f16x2 r3072, r3069, r2912; +} +{ +add.f16x2 r3075, r3066, r3072; +} +{ +add.f16x2 r3078, r3060, r3075; +} +st.local.u32 [rd3+144], r3078; +{ +add.f16x2 r3081, r1821, r1869; +} +{ +mul.f16x2 r3084, r3081, r2903; +} +{ +add.f16x2 r3087, r342, r3084; +} +{ +add.f16x2 r3090, r1837, r1853; +} +{ +mul.f16x2 r3093, r3090, r2907; +} +{ +add.f16x2 r3096, r3087, r3093; +} +{ +sub.f16x2 r3099, r1815, r1863; +} +{ +mul.f16x2 r3102, r3099, r2905; +} +{ +sub.f16x2 r3105, r1831, r1847; +} +{ +mul.f16x2 r3108, r3105, r2909; +} +{ +add.f16x2 r3111, r3102, r3108; +} +{ +add.f16x2 r3114, r3096, r3111; +} +st.local.u32 [rd3+68], r3114; +{ +add.f16x2 r3117, r1821, r1869; +} +{ +mul.f16x2 r3120, r3117, r2903; +} +{ +add.f16x2 r3123, r342, r3120; +} +{ +add.f16x2 r3126, r1837, r1853; +} +{ +mul.f16x2 r3129, r3126, r2907; +} +{ +add.f16x2 r3132, r3123, r3129; +} +{ +sub.f16x2 r3135, r1815, r1863; +} +{ +mul.f16x2 r3138, r3135, r2905; +} +{ +sub.f16x2 r3141, r1831, r1847; +} +{ +mul.f16x2 r3144, r3141, r2909; +} +{ +add.f16x2 r3147, r3138, r3144; +} +{ +sub.f16x2 r3150, r3132, r3147; +} +st.local.u32 [rd3+188], r3150; +{ +add.f16x2 r3153, r1821, r1869; +} +{ +mul.f16x2 r3156, r3153, r2907; +} +{ +add.f16x2 r3159, r342, r3156; +} +{ +add.f16x2 r3162, r1837, r1853; +} +{ +mul.f16x2 r3165, r3162, r2911; +} +{ +add.f16x2 r3168, r3159, r3165; +} +{ +sub.f16x2 r3171, r1815, r1863; +} +{ +mul.f16x2 r3174, r3171, r2909; +} +{ +sub.f16x2 r3177, r1831, r1847; +} +{ +mul.f16x2 r3180, r3177, r2912; +} +{ +add.f16x2 r3183, r3174, r3180; +} +{ +add.f16x2 r3186, r3168, r3183; +} +st.local.u32 [rd3+108], r3186; +{ +add.f16x2 r3189, r1821, r1869; +} +{ +mul.f16x2 r3192, r3189, r2907; +} +{ +add.f16x2 r3195, r342, r3192; +} +{ +add.f16x2 r3198, r1837, r1853; +} +{ +mul.f16x2 r3201, r3198, r2911; +} +{ +add.f16x2 r3204, r3195, r3201; +} +{ +sub.f16x2 r3207, r1815, r1863; +} +{ +mul.f16x2 r3210, r3207, r2909; +} +{ +sub.f16x2 
r3213, r1831, r1847; +} +{ +mul.f16x2 r3216, r3213, r2912; +} +{ +add.f16x2 r3219, r3210, r3216; +} +{ +sub.f16x2 r3222, r3204, r3219; +} +st.local.u32 [rd3+148], r3222; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3226, {low, high}; +} +{ +neg.f16x2 r3227, r3226; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3230, {low, high}; +} +{ +neg.f16x2 r3231, r3230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3233, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3234, {low, high}; +} +{ +add.f16x2 r3235, r1879, r1927; +} +{ +add.f16x2 r3238, r126, r3235; +} +{ +add.f16x2 r3241, r1895, r1911; +} +{ +add.f16x2 r3244, r3238, r3241; +} +st.local.u32 [rd3+32], r3244; +{ +add.f16x2 r3247, r1885, r1933; +} +{ +add.f16x2 r3250, r270, r3247; +} +{ +add.f16x2 r3253, r1901, r1917; +} +{ +add.f16x2 r3256, r3250, r3253; +} +st.local.u32 [rd3+36], r3256; +{ +add.f16x2 r3259, r1879, r1927; +} +{ +mul.f16x2 r3262, r3259, r3225; +} +{ +add.f16x2 r3265, r126, r3262; +} +{ +add.f16x2 r3268, r1895, r1911; +} +{ +mul.f16x2 r3271, r3268, r3229; +} +{ +add.f16x2 r3274, r3265, r3271; +} +{ +sub.f16x2 r3277, r1885, r1933; +} +{ +mul.f16x2 r3280, r3277, r3227; +} +{ +sub.f16x2 r3283, r1901, r1917; +} +{ +mul.f16x2 r3286, r3283, r3231; +} +{ +add.f16x2 r3289, r3280, r3286; +} +{ +sub.f16x2 r3292, r3274, r3289; +} +st.local.u32 [rd3+72], r3292; +{ +add.f16x2 r3295, r1879, r1927; +} +{ +mul.f16x2 r3298, r3295, r3225; +} +{ +add.f16x2 r3301, r126, r3298; +} +{ +add.f16x2 r3304, r1895, r1911; +} +{ +mul.f16x2 r3307, r3304, r3229; +} +{ +add.f16x2 r3310, r3301, r3307; +} +{ +sub.f16x2 
r3313, r1885, r1933; +} +{ +mul.f16x2 r3316, r3313, r3227; +} +{ +sub.f16x2 r3319, r1901, r1917; +} +{ +mul.f16x2 r3322, r3319, r3231; +} +{ +add.f16x2 r3325, r3316, r3322; +} +{ +add.f16x2 r3328, r3310, r3325; +} +st.local.u32 [rd3+192], r3328; +{ +add.f16x2 r3331, r1879, r1927; +} +{ +mul.f16x2 r3334, r3331, r3229; +} +{ +add.f16x2 r3337, r126, r3334; +} +{ +add.f16x2 r3340, r1895, r1911; +} +{ +mul.f16x2 r3343, r3340, r3233; +} +{ +add.f16x2 r3346, r3337, r3343; +} +{ +sub.f16x2 r3349, r1885, r1933; +} +{ +mul.f16x2 r3352, r3349, r3231; +} +{ +sub.f16x2 r3355, r1901, r1917; +} +{ +mul.f16x2 r3358, r3355, r3234; +} +{ +add.f16x2 r3361, r3352, r3358; +} +{ +sub.f16x2 r3364, r3346, r3361; +} +st.local.u32 [rd3+112], r3364; +{ +add.f16x2 r3367, r1879, r1927; +} +{ +mul.f16x2 r3370, r3367, r3229; +} +{ +add.f16x2 r3373, r126, r3370; +} +{ +add.f16x2 r3376, r1895, r1911; +} +{ +mul.f16x2 r3379, r3376, r3233; +} +{ +add.f16x2 r3382, r3373, r3379; +} +{ +sub.f16x2 r3385, r1885, r1933; +} +{ +mul.f16x2 r3388, r3385, r3231; +} +{ +sub.f16x2 r3391, r1901, r1917; +} +{ +mul.f16x2 r3394, r3391, r3234; +} +{ +add.f16x2 r3397, r3388, r3394; +} +{ +add.f16x2 r3400, r3382, r3397; +} +st.local.u32 [rd3+152], r3400; +{ +add.f16x2 r3403, r1885, r1933; +} +{ +mul.f16x2 r3406, r3403, r3225; +} +{ +add.f16x2 r3409, r270, r3406; +} +{ +add.f16x2 r3412, r1901, r1917; +} +{ +mul.f16x2 r3415, r3412, r3229; +} +{ +add.f16x2 r3418, r3409, r3415; +} +{ +sub.f16x2 r3421, r1879, r1927; +} +{ +mul.f16x2 r3424, r3421, r3227; +} +{ +sub.f16x2 r3427, r1895, r1911; +} +{ +mul.f16x2 r3430, r3427, r3231; +} +{ +add.f16x2 r3433, r3424, r3430; +} +{ +add.f16x2 r3436, r3418, r3433; +} +st.local.u32 [rd3+76], r3436; +{ +add.f16x2 r3439, r1885, r1933; +} +{ +mul.f16x2 r3442, r3439, r3225; +} +{ +add.f16x2 r3445, r270, r3442; +} +{ +add.f16x2 r3448, r1901, r1917; +} +{ +mul.f16x2 r3451, r3448, r3229; +} +{ +add.f16x2 r3454, r3445, r3451; +} +{ +sub.f16x2 r3457, r1879, r1927; +} +{ +mul.f16x2 r3460, r3457, 
r3227; +} +{ +sub.f16x2 r3463, r1895, r1911; +} +{ +mul.f16x2 r3466, r3463, r3231; +} +{ +add.f16x2 r3469, r3460, r3466; +} +{ +sub.f16x2 r3472, r3454, r3469; +} +st.local.u32 [rd3+196], r3472; +{ +add.f16x2 r3475, r1885, r1933; +} +{ +mul.f16x2 r3478, r3475, r3229; +} +{ +add.f16x2 r3481, r270, r3478; +} +{ +add.f16x2 r3484, r1901, r1917; +} +{ +mul.f16x2 r3487, r3484, r3233; +} +{ +add.f16x2 r3490, r3481, r3487; +} +{ +sub.f16x2 r3493, r1879, r1927; +} +{ +mul.f16x2 r3496, r3493, r3231; +} +{ +sub.f16x2 r3499, r1895, r1911; +} +{ +mul.f16x2 r3502, r3499, r3234; +} +{ +add.f16x2 r3505, r3496, r3502; +} +{ +add.f16x2 r3508, r3490, r3505; +} +st.local.u32 [rd3+116], r3508; +{ +add.f16x2 r3511, r1885, r1933; +} +{ +mul.f16x2 r3514, r3511, r3229; +} +{ +add.f16x2 r3517, r270, r3514; +} +{ +add.f16x2 r3520, r1901, r1917; +} +{ +mul.f16x2 r3523, r3520, r3233; +} +{ +add.f16x2 r3526, r3517, r3523; +} +{ +sub.f16x2 r3529, r1879, r1927; +} +{ +mul.f16x2 r3532, r3529, r3231; +} +{ +sub.f16x2 r3535, r1895, r1911; +} +{ +mul.f16x2 r3538, r3535, r3234; +} +{ +add.f16x2 r3541, r3532, r3538; +} +{ +sub.f16x2 r3544, r3526, r3541; +} +st.local.u32 [rd3+156], r3544; +mov.u32 r3552, %tid.x; +mul.wide.u32 rd11, r3552, -776530087; +shr.u64 rd12, rd11, 41; +cvt.u32.u64 r4, rd12; +mul.lo.s32 r3553, r4, 625; +sub.s32 r3, r3552, r3553; +cvt.rn.f32.u32 f221, r3; +mul.f32 f222, f221, 0f39D2D427; +cos.approx.f32 f217, f222; +sin.approx.f32 f223, f222; +neg.f32 f218, f223; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r11410, {low, high}; +} +mov.u32 r11409, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11410; +mov.b32 r3574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11410; +mov.b32 r3576, {high, high}; +} +bra.uni LBB0_1; +LBB0_2: +ld.local.u32 r11408, [rd5+60]; +LBB0_1: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11410; +mov.b32 r3554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11410; +mov.b32 
r3556, {high, high}; +} +mul.wide.u32 rd13, r11409, 8; +add.s64 rd14, rd3, rd13; +add.s64 rd5, rd14, 4; +ld.local.u32 r3559, [rd14+4]; +{ +mul.f16x2 r3558, r3559, r3556; +} +{ +fma.rn.f16x2 r3561, r11408, r3554, r3558; +} +st.local.u32 [rd14], r3561; +{ +mul.f16x2 r3565, r11408, r3556; +} +{ +neg.f16x2 r3568, r3565; +} +{ +fma.rn.f16x2 r3570, r3559, r3554, r3568; +} +st.local.u32 [rd14+4], r3570; +mov.f32 f238, 0fBF800000; +mov.f32 f239, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3578, {low, high}; +} +{ +mul.f16x2 r3579, r3576, r3578; +} +{ +mul.f16x2 r3582, r11410, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11410; +mov.b32 r3585, {high, low}; +} +{ +fma.rn.f16x2 r3587, r3579, r3585, r3582; +} +ld.local.u32 r3603, [rd14+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3591, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3593, {high, high}; +} +ld.local.u32 r3608, [rd14+12]; +{ +mul.f16x2 r3595, r3608, r3593; +} +{ +fma.rn.f16x2 r3598, r3603, r3591, r3595; +} +st.local.u32 [rd14+8], r3598; +{ +mul.f16x2 r3602, r3603, r3593; +} +{ +neg.f16x2 r3605, r3602; +} +{ +fma.rn.f16x2 r3607, r3608, r3591, r3605; +} +st.local.u32 [rd14+12], r3607; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3615, {low, high}; +} +{ +mul.f16x2 r3616, r3576, r3615; +} +{ +mul.f16x2 r3619, r3587, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3622, {high, low}; +} +{ +fma.rn.f16x2 r3624, r3616, r3622, r3619; +} +ld.local.u32 r3640, [rd14+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3628, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3630, {high, high}; +} +ld.local.u32 r3645, [rd14+20]; +{ +mul.f16x2 r3632, r3645, r3630; +} +{ +fma.rn.f16x2 r3635, r3640, r3628, r3632; +} +st.local.u32 [rd14+16], r3635; +{ +mul.f16x2 r3639, r3640, r3630; +} +{ 
+neg.f16x2 r3642, r3639; +} +{ +fma.rn.f16x2 r3644, r3645, r3628, r3642; +} +st.local.u32 [rd14+20], r3644; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3652, {low, high}; +} +{ +mul.f16x2 r3653, r3576, r3652; +} +{ +mul.f16x2 r3656, r3624, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3659, {high, low}; +} +{ +fma.rn.f16x2 r3661, r3653, r3659, r3656; +} +ld.local.u32 r3677, [rd14+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3665, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3667, {high, high}; +} +ld.local.u32 r3682, [rd14+28]; +{ +mul.f16x2 r3669, r3682, r3667; +} +{ +fma.rn.f16x2 r3672, r3677, r3665, r3669; +} +st.local.u32 [rd14+24], r3672; +{ +mul.f16x2 r3676, r3677, r3667; +} +{ +neg.f16x2 r3679, r3676; +} +{ +fma.rn.f16x2 r3681, r3682, r3665, r3679; +} +st.local.u32 [rd14+28], r3681; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3689, {low, high}; +} +{ +mul.f16x2 r3690, r3576, r3689; +} +{ +mul.f16x2 r3693, r3661, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3696, {high, low}; +} +{ +fma.rn.f16x2 r3698, r3690, r3696, r3693; +} +ld.local.u32 r3710, [rd14+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; +mov.b32 r3702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; +mov.b32 r3704, {high, high}; +} +ld.local.u32 r3719, [rd14+36]; +{ +mul.f16x2 r3706, r3719, r3704; +} +{ +fma.rn.f16x2 r3709, r3710, r3702, r3706; +} +st.local.u32 [rd14+32], r3709; +{ +mul.f16x2 r3713, r3710, r3704; +} +{ +neg.f16x2 r3716, r3713; +} +{ +fma.rn.f16x2 r3718, r3719, r3702, r3716; +} +st.local.u32 [rd14+36], r3718; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3726, {low, high}; +} +{ +mul.f16x2 r3727, r3576, r3726; +} +{ +mul.f16x2 r3730, r3698, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; 
+mov.b32 r3733, {high, low}; +} +{ +fma.rn.f16x2 r3735, r3727, r3733, r3730; +} +ld.local.u32 r3747, [rd14+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3741, {high, high}; +} +ld.local.u32 r3744, [rd14+44]; +{ +mul.f16x2 r3743, r3744, r3741; +} +{ +fma.rn.f16x2 r3746, r3747, r3739, r3743; +} +st.local.u32 [rd14+40], r3746; +{ +mul.f16x2 r3750, r3747, r3741; +} +{ +neg.f16x2 r3753, r3750; +} +{ +fma.rn.f16x2 r3755, r3744, r3739, r3753; +} +st.local.u32 [rd14+44], r3755; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3763, {low, high}; +} +{ +mul.f16x2 r3764, r3576, r3763; +} +{ +mul.f16x2 r3767, r3735, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3770, {high, low}; +} +{ +fma.rn.f16x2 r3772, r3764, r3770, r3767; +} +ld.local.u32 r3784, [rd14+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3778, {high, high}; +} +ld.local.u32 r3781, [rd14+52]; +{ +mul.f16x2 r3780, r3781, r3778; +} +{ +fma.rn.f16x2 r3783, r3784, r3776, r3780; +} +st.local.u32 [rd14+48], r3783; +{ +mul.f16x2 r3787, r3784, r3778; +} +{ +neg.f16x2 r3790, r3787; +} +{ +fma.rn.f16x2 r3792, r3781, r3776, r3790; +} +st.local.u32 [rd14+52], r3792; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3800, {low, high}; +} +{ +mul.f16x2 r3801, r3576, r3800; +} +{ +mul.f16x2 r3804, r3772, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3807, {high, low}; +} +{ +fma.rn.f16x2 r3809, r3801, r3807, r3804; +} +ld.local.u32 r3821, [rd14+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3815, {high, high}; +} +ld.local.u32 r3818, [rd14+60]; +{ +mul.f16x2 r3817, r3818, 
r3815; +} +{ +fma.rn.f16x2 r3820, r3821, r3813, r3817; +} +st.local.u32 [rd14+56], r3820; +{ +mul.f16x2 r3824, r3821, r3815; +} +{ +neg.f16x2 r3827, r3824; +} +{ +fma.rn.f16x2 r3829, r3818, r3813, r3827; +} +st.local.u32 [rd14+60], r3829; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3837, {low, high}; +} +{ +mul.f16x2 r3838, r3576, r3837; +} +{ +mul.f16x2 r3841, r3809, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3844, {high, low}; +} +{ +fma.rn.f16x2 r11410, r3838, r3844, r3841; +} +add.s32 r11409, r11409, 8; +setp.eq.s32 p1, r11409, 25; +@p1 bra LBB0_3; +bra.uni LBB0_2; +LBB0_3: +shl.b32 r7378, r1, 3; +mov.u32 r7379, %50; +add.s32 r7380, r7379, r7378; +mad.lo.s32 r12, r4, 125000, r7380; +barrier.sync 0; +mad.lo.s32 r7381, r3, 200, r12; +ld.local.v2.u32 {r7382, r7383}, [rd3]; +st.shared.v2.f32 [r7381], {r7382, r7383}; +ld.local.v2.u32 {r7386, r7387}, [rd4+4]; +st.shared.v2.f32 [r7381+8], {r7386, r7387}; +ld.local.v2.u32 {r7390, r7391}, [rd4+12]; +st.shared.v2.f32 [r7381+16], {r7390, r7391}; +ld.local.v2.u32 {r7394, r7395}, [rd4+20]; +st.shared.v2.f32 [r7381+24], {r7394, r7395}; +ld.local.v2.u32 {r7398, r7399}, [rd4+28]; +st.shared.v2.f32 [r7381+32], {r7398, r7399}; +ld.local.v2.u32 {r7402, r7403}, [rd4+36]; +st.shared.v2.f32 [r7381+40], {r7402, r7403}; +ld.local.v2.u32 {r7406, r7407}, [rd4+44]; +st.shared.v2.f32 [r7381+48], {r7406, r7407}; +ld.local.v2.u32 {r7410, r7411}, [rd4+52]; +st.shared.v2.f32 [r7381+56], {r7410, r7411}; +ld.local.v2.u32 {r7414, r7415}, [rd4+60]; +st.shared.v2.f32 [r7381+64], {r7414, r7415}; +ld.local.v2.u32 {r7418, r7419}, [rd4+68]; +st.shared.v2.f32 [r7381+72], {r7418, r7419}; +ld.local.v2.u32 {r7422, r7423}, [rd4+76]; +st.shared.v2.f32 [r7381+80], {r7422, r7423}; +ld.local.v2.u32 {r7426, r7427}, [rd4+84]; +st.shared.v2.f32 [r7381+88], {r7426, r7427}; +ld.local.v2.u32 {r7430, r7431}, [rd4+92]; +st.shared.v2.f32 [r7381+96], {r7430, r7431}; +ld.local.v2.u32 {r7434, 
r7435}, [rd4+100]; +st.shared.v2.f32 [r7381+104], {r7434, r7435}; +ld.local.v2.u32 {r7438, r7439}, [rd4+108]; +st.shared.v2.f32 [r7381+112], {r7438, r7439}; +ld.local.v2.u32 {r7442, r7443}, [rd4+116]; +st.shared.v2.f32 [r7381+120], {r7442, r7443}; +ld.local.v2.u32 {r7446, r7447}, [rd4+124]; +st.shared.v2.f32 [r7381+128], {r7446, r7447}; +ld.local.v2.u32 {r7450, r7451}, [rd4+132]; +st.shared.v2.f32 [r7381+136], {r7450, r7451}; +ld.local.v2.u32 {r7454, r7455}, [rd4+140]; +st.shared.v2.f32 [r7381+144], {r7454, r7455}; +ld.local.v2.u32 {r7458, r7459}, [rd4+148]; +st.shared.v2.f32 [r7381+152], {r7458, r7459}; +ld.local.v2.u32 {r7462, r7463}, [rd4+156]; +st.shared.v2.f32 [r7381+160], {r7462, r7463}; +ld.local.v2.u32 {r7466, r7467}, [rd4+164]; +st.shared.v2.f32 [r7381+168], {r7466, r7467}; +ld.local.v2.u32 {r7470, r7471}, [rd4+172]; +st.shared.v2.f32 [r7381+176], {r7470, r7471}; +ld.local.v2.u32 {r7474, r7475}, [rd4+180]; +st.shared.v2.f32 [r7381+184], {r7474, r7475}; +ld.local.v2.u32 {r7478, r7479}, [rd4+188]; +st.shared.v2.f32 [r7381+192], {r7478, r7479}; +barrier.sync 0; +mad.lo.s32 r13, r3, -192, r7381; +ld.shared.u32 r3864, [r13]; +ld.shared.u32 r3876, [r13+4]; +ld.shared.u32 r4186, [r13+5000]; +ld.shared.u32 r4198, [r13+5004]; +ld.shared.u32 r4508, [r13+10000]; +ld.shared.u32 r4520, [r13+10004]; +ld.shared.u32 r4830, [r13+15000]; +ld.shared.u32 r4842, [r13+15004]; +ld.shared.u32 r5152, [r13+20000]; +ld.shared.u32 r5164, [r13+20004]; +ld.shared.u32 r3861, [r13+25000]; +ld.shared.u32 r3873, [r13+25004]; +ld.shared.u32 r4183, [r13+30000]; +ld.shared.u32 r4195, [r13+30004]; +ld.shared.u32 r4505, [r13+35000]; +ld.shared.u32 r4517, [r13+35004]; +ld.shared.u32 r4827, [r13+40000]; +ld.shared.u32 r4839, [r13+40004]; +ld.shared.u32 r5149, [r13+45000]; +ld.shared.u32 r5161, [r13+45004]; +ld.shared.u32 r3867, [r13+50000]; +ld.shared.u32 r3879, [r13+50004]; +ld.shared.u32 r4189, [r13+55000]; +ld.shared.u32 r4201, [r13+55004]; +ld.shared.u32 r4511, [r13+60000]; +ld.shared.u32 
r4523, [r13+60004]; +ld.shared.u32 r4833, [r13+65000]; +ld.shared.u32 r4845, [r13+65004]; +ld.shared.u32 r5155, [r13+70000]; +ld.shared.u32 r5167, [r13+70004]; +ld.shared.u32 r3868, [r13+75000]; +ld.shared.u32 r3880, [r13+75004]; +ld.shared.u32 r4190, [r13+80000]; +ld.shared.u32 r4202, [r13+80004]; +ld.shared.u32 r4512, [r13+85000]; +ld.shared.u32 r4524, [r13+85004]; +ld.shared.u32 r4834, [r13+90000]; +ld.shared.u32 r4846, [r13+90004]; +ld.shared.u32 r5156, [r13+95000]; +ld.shared.u32 r5168, [r13+95004]; +ld.shared.u32 r3862, [r13+100000]; +ld.shared.u32 r3874, [r13+100004]; +ld.shared.u32 r4184, [r13+105000]; +ld.shared.u32 r4196, [r13+105004]; +ld.shared.u32 r4506, [r13+110000]; +ld.shared.u32 r4518, [r13+110004]; +ld.shared.u32 r4828, [r13+115000]; +ld.shared.u32 r4840, [r13+115004]; +ld.shared.u32 r5150, [r13+120000]; +ld.shared.u32 r5162, [r13+120004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3851, {low, high}; +} +{ +neg.f16x2 r3852, r3851; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3854, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3855, {low, high}; +} +{ +neg.f16x2 r3856, r3855; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3859, {low, high}; +} +{ +add.f16x2 r3860, r3861, r3862; +} +{ +add.f16x2 r3863, r3864, r3860; +} +{ +add.f16x2 r3866, r3867, r3868; +} +{ +add.f16x2 r3869, r3863, r3866; +} +{ +add.f16x2 r3872, r3873, r3874; +} +{ +add.f16x2 r3875, r3876, r3872; +} +{ +add.f16x2 r3878, r3879, r3880; +} +{ +add.f16x2 r3881, r3875, r3878; +} +{ +add.f16x2 r3884, r3861, r3862; +} +{ +mul.f16x2 r3887, r3884, r3850; +} +{ +add.f16x2 
r3890, r3864, r3887; +} +{ +add.f16x2 r3893, r3867, r3868; +} +{ +mul.f16x2 r3896, r3893, r3854; +} +{ +add.f16x2 r3899, r3890, r3896; +} +{ +sub.f16x2 r3902, r3873, r3874; +} +{ +mul.f16x2 r3905, r3902, r3852; +} +{ +sub.f16x2 r3908, r3879, r3880; +} +{ +mul.f16x2 r3911, r3908, r3856; +} +{ +add.f16x2 r3914, r3905, r3911; +} +{ +sub.f16x2 r3917, r3899, r3914; +} +{ +add.f16x2 r3920, r3861, r3862; +} +{ +mul.f16x2 r3923, r3920, r3850; +} +{ +add.f16x2 r3926, r3864, r3923; +} +{ +add.f16x2 r3929, r3867, r3868; +} +{ +mul.f16x2 r3932, r3929, r3854; +} +{ +add.f16x2 r3935, r3926, r3932; +} +{ +sub.f16x2 r3938, r3873, r3874; +} +{ +mul.f16x2 r3941, r3938, r3852; +} +{ +sub.f16x2 r3944, r3879, r3880; +} +{ +mul.f16x2 r3947, r3944, r3856; +} +{ +add.f16x2 r3950, r3941, r3947; +} +{ +add.f16x2 r3953, r3935, r3950; +} +{ +add.f16x2 r3956, r3861, r3862; +} +{ +mul.f16x2 r3959, r3956, r3854; +} +{ +add.f16x2 r3962, r3864, r3959; +} +{ +add.f16x2 r3965, r3867, r3868; +} +{ +mul.f16x2 r3968, r3965, r3858; +} +{ +add.f16x2 r3971, r3962, r3968; +} +{ +sub.f16x2 r3974, r3873, r3874; +} +{ +mul.f16x2 r3977, r3974, r3856; +} +{ +sub.f16x2 r3980, r3879, r3880; +} +{ +mul.f16x2 r3983, r3980, r3859; +} +{ +add.f16x2 r3986, r3977, r3983; +} +{ +sub.f16x2 r3989, r3971, r3986; +} +{ +add.f16x2 r3992, r3861, r3862; +} +{ +mul.f16x2 r3995, r3992, r3854; +} +{ +add.f16x2 r3998, r3864, r3995; +} +{ +add.f16x2 r4001, r3867, r3868; +} +{ +mul.f16x2 r4004, r4001, r3858; +} +{ +add.f16x2 r4007, r3998, r4004; +} +{ +sub.f16x2 r4010, r3873, r3874; +} +{ +mul.f16x2 r4013, r4010, r3856; +} +{ +sub.f16x2 r4016, r3879, r3880; +} +{ +mul.f16x2 r4019, r4016, r3859; +} +{ +add.f16x2 r4022, r4013, r4019; +} +{ +add.f16x2 r4025, r4007, r4022; +} +{ +add.f16x2 r4028, r3873, r3874; +} +{ +mul.f16x2 r4031, r4028, r3850; +} +{ +add.f16x2 r4034, r3876, r4031; +} +{ +add.f16x2 r4037, r3879, r3880; +} +{ +mul.f16x2 r4040, r4037, r3854; +} +{ +add.f16x2 r4043, r4034, r4040; +} +{ +sub.f16x2 r4046, r3861, r3862; +} 
+{ +mul.f16x2 r4049, r4046, r3852; +} +{ +sub.f16x2 r4052, r3867, r3868; +} +{ +mul.f16x2 r4055, r4052, r3856; +} +{ +add.f16x2 r4058, r4049, r4055; +} +{ +add.f16x2 r4061, r4043, r4058; +} +{ +add.f16x2 r4064, r3873, r3874; +} +{ +mul.f16x2 r4067, r4064, r3850; +} +{ +add.f16x2 r4070, r3876, r4067; +} +{ +add.f16x2 r4073, r3879, r3880; +} +{ +mul.f16x2 r4076, r4073, r3854; +} +{ +add.f16x2 r4079, r4070, r4076; +} +{ +sub.f16x2 r4082, r3861, r3862; +} +{ +mul.f16x2 r4085, r4082, r3852; +} +{ +sub.f16x2 r4088, r3867, r3868; +} +{ +mul.f16x2 r4091, r4088, r3856; +} +{ +add.f16x2 r4094, r4085, r4091; +} +{ +sub.f16x2 r4097, r4079, r4094; +} +{ +add.f16x2 r4100, r3873, r3874; +} +{ +mul.f16x2 r4103, r4100, r3854; +} +{ +add.f16x2 r4106, r3876, r4103; +} +{ +add.f16x2 r4109, r3879, r3880; +} +{ +mul.f16x2 r4112, r4109, r3858; +} +{ +add.f16x2 r4115, r4106, r4112; +} +{ +sub.f16x2 r4118, r3861, r3862; +} +{ +mul.f16x2 r4121, r4118, r3856; +} +{ +sub.f16x2 r4124, r3867, r3868; +} +{ +mul.f16x2 r4127, r4124, r3859; +} +{ +add.f16x2 r4130, r4121, r4127; +} +{ +add.f16x2 r4133, r4115, r4130; +} +{ +add.f16x2 r4136, r3873, r3874; +} +{ +mul.f16x2 r4139, r4136, r3854; +} +{ +add.f16x2 r4142, r3876, r4139; +} +{ +add.f16x2 r4145, r3879, r3880; +} +{ +mul.f16x2 r4148, r4145, r3858; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +sub.f16x2 r4154, r3861, r3862; +} +{ +mul.f16x2 r4157, r4154, r3856; +} +{ +sub.f16x2 r4160, r3867, r3868; +} +{ +mul.f16x2 r4163, r4160, r3859; +} +{ +add.f16x2 r4166, r4157, r4163; +} +{ +sub.f16x2 r4169, r4151, r4166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4173, {low, high}; +} +{ +neg.f16x2 r4174, r4173; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4176, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, 
f212; +mov.b32 r4177, {low, high}; +} +{ +neg.f16x2 r4178, r4177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4180, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4181, {low, high}; +} +{ +add.f16x2 r4182, r4183, r4184; +} +{ +add.f16x2 r4185, r4186, r4182; +} +{ +add.f16x2 r4188, r4189, r4190; +} +{ +add.f16x2 r4191, r4185, r4188; +} +{ +add.f16x2 r4194, r4195, r4196; +} +{ +add.f16x2 r4197, r4198, r4194; +} +{ +add.f16x2 r4200, r4201, r4202; +} +{ +add.f16x2 r4203, r4197, r4200; +} +{ +add.f16x2 r4206, r4183, r4184; +} +{ +mul.f16x2 r4209, r4206, r4172; +} +{ +add.f16x2 r4212, r4186, r4209; +} +{ +add.f16x2 r4215, r4189, r4190; +} +{ +mul.f16x2 r4218, r4215, r4176; +} +{ +add.f16x2 r4221, r4212, r4218; +} +{ +sub.f16x2 r4224, r4195, r4196; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +sub.f16x2 r4230, r4201, r4202; +} +{ +mul.f16x2 r4233, r4230, r4178; +} +{ +add.f16x2 r4236, r4227, r4233; +} +{ +sub.f16x2 r4239, r4221, r4236; +} +{ +add.f16x2 r4242, r4183, r4184; +} +{ +mul.f16x2 r4245, r4242, r4172; +} +{ +add.f16x2 r4248, r4186, r4245; +} +{ +add.f16x2 r4251, r4189, r4190; +} +{ +mul.f16x2 r4254, r4251, r4176; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +sub.f16x2 r4260, r4195, r4196; +} +{ +mul.f16x2 r4263, r4260, r4174; +} +{ +sub.f16x2 r4266, r4201, r4202; +} +{ +mul.f16x2 r4269, r4266, r4178; +} +{ +add.f16x2 r4272, r4263, r4269; +} +{ +add.f16x2 r4275, r4257, r4272; +} +{ +add.f16x2 r4278, r4183, r4184; +} +{ +mul.f16x2 r4281, r4278, r4176; +} +{ +add.f16x2 r4284, r4186, r4281; +} +{ +add.f16x2 r4287, r4189, r4190; +} +{ +mul.f16x2 r4290, r4287, r4180; +} +{ +add.f16x2 r4293, r4284, r4290; +} +{ +sub.f16x2 r4296, r4195, r4196; +} +{ +mul.f16x2 r4299, r4296, r4178; +} +{ +sub.f16x2 r4302, r4201, r4202; +} +{ +mul.f16x2 r4305, r4302, r4181; +} +{ +add.f16x2 r4308, r4299, r4305; +} +{ +sub.f16x2 r4311, r4293, r4308; +} +{ +add.f16x2 r4314, r4183, r4184; 
+} +{ +mul.f16x2 r4317, r4314, r4176; +} +{ +add.f16x2 r4320, r4186, r4317; +} +{ +add.f16x2 r4323, r4189, r4190; +} +{ +mul.f16x2 r4326, r4323, r4180; +} +{ +add.f16x2 r4329, r4320, r4326; +} +{ +sub.f16x2 r4332, r4195, r4196; +} +{ +mul.f16x2 r4335, r4332, r4178; +} +{ +sub.f16x2 r4338, r4201, r4202; +} +{ +mul.f16x2 r4341, r4338, r4181; +} +{ +add.f16x2 r4344, r4335, r4341; +} +{ +add.f16x2 r4347, r4329, r4344; +} +{ +add.f16x2 r4350, r4195, r4196; +} +{ +mul.f16x2 r4353, r4350, r4172; +} +{ +add.f16x2 r4356, r4198, r4353; +} +{ +add.f16x2 r4359, r4201, r4202; +} +{ +mul.f16x2 r4362, r4359, r4176; +} +{ +add.f16x2 r4365, r4356, r4362; +} +{ +sub.f16x2 r4368, r4183, r4184; +} +{ +mul.f16x2 r4371, r4368, r4174; +} +{ +sub.f16x2 r4374, r4189, r4190; +} +{ +mul.f16x2 r4377, r4374, r4178; +} +{ +add.f16x2 r4380, r4371, r4377; +} +{ +add.f16x2 r4383, r4365, r4380; +} +{ +add.f16x2 r4386, r4195, r4196; +} +{ +mul.f16x2 r4389, r4386, r4172; +} +{ +add.f16x2 r4392, r4198, r4389; +} +{ +add.f16x2 r4395, r4201, r4202; +} +{ +mul.f16x2 r4398, r4395, r4176; +} +{ +add.f16x2 r4401, r4392, r4398; +} +{ +sub.f16x2 r4404, r4183, r4184; +} +{ +mul.f16x2 r4407, r4404, r4174; +} +{ +sub.f16x2 r4410, r4189, r4190; +} +{ +mul.f16x2 r4413, r4410, r4178; +} +{ +add.f16x2 r4416, r4407, r4413; +} +{ +sub.f16x2 r4419, r4401, r4416; +} +{ +add.f16x2 r4422, r4195, r4196; +} +{ +mul.f16x2 r4425, r4422, r4176; +} +{ +add.f16x2 r4428, r4198, r4425; +} +{ +add.f16x2 r4431, r4201, r4202; +} +{ +mul.f16x2 r4434, r4431, r4180; +} +{ +add.f16x2 r4437, r4428, r4434; +} +{ +sub.f16x2 r4440, r4183, r4184; +} +{ +mul.f16x2 r4443, r4440, r4178; +} +{ +sub.f16x2 r4446, r4189, r4190; +} +{ +mul.f16x2 r4449, r4446, r4181; +} +{ +add.f16x2 r4452, r4443, r4449; +} +{ +add.f16x2 r4455, r4437, r4452; +} +{ +add.f16x2 r4458, r4195, r4196; +} +{ +mul.f16x2 r4461, r4458, r4176; +} +{ +add.f16x2 r4464, r4198, r4461; +} +{ +add.f16x2 r4467, r4201, r4202; +} +{ +mul.f16x2 r4470, r4467, r4180; +} +{ +add.f16x2 r4473, 
r4464, r4470; +} +{ +sub.f16x2 r4476, r4183, r4184; +} +{ +mul.f16x2 r4479, r4476, r4178; +} +{ +sub.f16x2 r4482, r4189, r4190; +} +{ +mul.f16x2 r4485, r4482, r4181; +} +{ +add.f16x2 r4488, r4479, r4485; +} +{ +sub.f16x2 r4491, r4473, r4488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4495, {low, high}; +} +{ +neg.f16x2 r4496, r4495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4502, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4503, {low, high}; +} +{ +add.f16x2 r4504, r4505, r4506; +} +{ +add.f16x2 r4507, r4508, r4504; +} +{ +add.f16x2 r4510, r4511, r4512; +} +{ +add.f16x2 r4513, r4507, r4510; +} +{ +add.f16x2 r4516, r4517, r4518; +} +{ +add.f16x2 r4519, r4520, r4516; +} +{ +add.f16x2 r4522, r4523, r4524; +} +{ +add.f16x2 r4525, r4519, r4522; +} +{ +add.f16x2 r4528, r4505, r4506; +} +{ +mul.f16x2 r4531, r4528, r4494; +} +{ +add.f16x2 r4534, r4508, r4531; +} +{ +add.f16x2 r4537, r4511, r4512; +} +{ +mul.f16x2 r4540, r4537, r4498; +} +{ +add.f16x2 r4543, r4534, r4540; +} +{ +sub.f16x2 r4546, r4517, r4518; +} +{ +mul.f16x2 r4549, r4546, r4496; +} +{ +sub.f16x2 r4552, r4523, r4524; +} +{ +mul.f16x2 r4555, r4552, r4500; +} +{ +add.f16x2 r4558, r4549, r4555; +} +{ +sub.f16x2 r4561, r4543, r4558; +} +{ +add.f16x2 r4564, r4505, r4506; +} +{ +mul.f16x2 r4567, r4564, r4494; +} +{ +add.f16x2 r4570, r4508, r4567; +} +{ +add.f16x2 r4573, r4511, r4512; +} +{ +mul.f16x2 r4576, r4573, r4498; +} +{ +add.f16x2 r4579, r4570, r4576; +} +{ +sub.f16x2 r4582, r4517, 
r4518; +} +{ +mul.f16x2 r4585, r4582, r4496; +} +{ +sub.f16x2 r4588, r4523, r4524; +} +{ +mul.f16x2 r4591, r4588, r4500; +} +{ +add.f16x2 r4594, r4585, r4591; +} +{ +add.f16x2 r4597, r4579, r4594; +} +{ +add.f16x2 r4600, r4505, r4506; +} +{ +mul.f16x2 r4603, r4600, r4498; +} +{ +add.f16x2 r4606, r4508, r4603; +} +{ +add.f16x2 r4609, r4511, r4512; +} +{ +mul.f16x2 r4612, r4609, r4502; +} +{ +add.f16x2 r4615, r4606, r4612; +} +{ +sub.f16x2 r4618, r4517, r4518; +} +{ +mul.f16x2 r4621, r4618, r4500; +} +{ +sub.f16x2 r4624, r4523, r4524; +} +{ +mul.f16x2 r4627, r4624, r4503; +} +{ +add.f16x2 r4630, r4621, r4627; +} +{ +sub.f16x2 r4633, r4615, r4630; +} +{ +add.f16x2 r4636, r4505, r4506; +} +{ +mul.f16x2 r4639, r4636, r4498; +} +{ +add.f16x2 r4642, r4508, r4639; +} +{ +add.f16x2 r4645, r4511, r4512; +} +{ +mul.f16x2 r4648, r4645, r4502; +} +{ +add.f16x2 r4651, r4642, r4648; +} +{ +sub.f16x2 r4654, r4517, r4518; +} +{ +mul.f16x2 r4657, r4654, r4500; +} +{ +sub.f16x2 r4660, r4523, r4524; +} +{ +mul.f16x2 r4663, r4660, r4503; +} +{ +add.f16x2 r4666, r4657, r4663; +} +{ +add.f16x2 r4669, r4651, r4666; +} +{ +add.f16x2 r4672, r4517, r4518; +} +{ +mul.f16x2 r4675, r4672, r4494; +} +{ +add.f16x2 r4678, r4520, r4675; +} +{ +add.f16x2 r4681, r4523, r4524; +} +{ +mul.f16x2 r4684, r4681, r4498; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +sub.f16x2 r4690, r4505, r4506; +} +{ +mul.f16x2 r4693, r4690, r4496; +} +{ +sub.f16x2 r4696, r4511, r4512; +} +{ +mul.f16x2 r4699, r4696, r4500; +} +{ +add.f16x2 r4702, r4693, r4699; +} +{ +add.f16x2 r4705, r4687, r4702; +} +{ +add.f16x2 r4708, r4517, r4518; +} +{ +mul.f16x2 r4711, r4708, r4494; +} +{ +add.f16x2 r4714, r4520, r4711; +} +{ +add.f16x2 r4717, r4523, r4524; +} +{ +mul.f16x2 r4720, r4717, r4498; +} +{ +add.f16x2 r4723, r4714, r4720; +} +{ +sub.f16x2 r4726, r4505, r4506; +} +{ +mul.f16x2 r4729, r4726, r4496; +} +{ +sub.f16x2 r4732, r4511, r4512; +} +{ +mul.f16x2 r4735, r4732, r4500; +} +{ +add.f16x2 r4738, r4729, r4735; +} +{ +sub.f16x2 
r4741, r4723, r4738; +} +{ +add.f16x2 r4744, r4517, r4518; +} +{ +mul.f16x2 r4747, r4744, r4498; +} +{ +add.f16x2 r4750, r4520, r4747; +} +{ +add.f16x2 r4753, r4523, r4524; +} +{ +mul.f16x2 r4756, r4753, r4502; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +sub.f16x2 r4762, r4505, r4506; +} +{ +mul.f16x2 r4765, r4762, r4500; +} +{ +sub.f16x2 r4768, r4511, r4512; +} +{ +mul.f16x2 r4771, r4768, r4503; +} +{ +add.f16x2 r4774, r4765, r4771; +} +{ +add.f16x2 r4777, r4759, r4774; +} +{ +add.f16x2 r4780, r4517, r4518; +} +{ +mul.f16x2 r4783, r4780, r4498; +} +{ +add.f16x2 r4786, r4520, r4783; +} +{ +add.f16x2 r4789, r4523, r4524; +} +{ +mul.f16x2 r4792, r4789, r4502; +} +{ +add.f16x2 r4795, r4786, r4792; +} +{ +sub.f16x2 r4798, r4505, r4506; +} +{ +mul.f16x2 r4801, r4798, r4500; +} +{ +sub.f16x2 r4804, r4511, r4512; +} +{ +mul.f16x2 r4807, r4804, r4503; +} +{ +add.f16x2 r4810, r4801, r4807; +} +{ +sub.f16x2 r4813, r4795, r4810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4817, {low, high}; +} +{ +neg.f16x2 r4818, r4817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4820, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4821, {low, high}; +} +{ +neg.f16x2 r4822, r4821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4824, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4825, {low, high}; +} +{ +add.f16x2 r4826, r4827, r4828; +} +{ +add.f16x2 r4829, r4830, r4826; +} +{ +add.f16x2 r4832, r4833, r4834; +} +{ +add.f16x2 r4835, r4829, r4832; +} +{ +add.f16x2 r4838, r4839, r4840; +} +{ +add.f16x2 r4841, r4842, r4838; +} +{ +add.f16x2 r4844, r4845, r4846; +} +{ +add.f16x2 r4847, r4841, r4844; +} +{ +add.f16x2 r4850, 
r4827, r4828; +} +{ +mul.f16x2 r4853, r4850, r4816; +} +{ +add.f16x2 r4856, r4830, r4853; +} +{ +add.f16x2 r4859, r4833, r4834; +} +{ +mul.f16x2 r4862, r4859, r4820; +} +{ +add.f16x2 r4865, r4856, r4862; +} +{ +sub.f16x2 r4868, r4839, r4840; +} +{ +mul.f16x2 r4871, r4868, r4818; +} +{ +sub.f16x2 r4874, r4845, r4846; +} +{ +mul.f16x2 r4877, r4874, r4822; +} +{ +add.f16x2 r4880, r4871, r4877; +} +{ +sub.f16x2 r4883, r4865, r4880; +} +{ +add.f16x2 r4886, r4827, r4828; +} +{ +mul.f16x2 r4889, r4886, r4816; +} +{ +add.f16x2 r4892, r4830, r4889; +} +{ +add.f16x2 r4895, r4833, r4834; +} +{ +mul.f16x2 r4898, r4895, r4820; +} +{ +add.f16x2 r4901, r4892, r4898; +} +{ +sub.f16x2 r4904, r4839, r4840; +} +{ +mul.f16x2 r4907, r4904, r4818; +} +{ +sub.f16x2 r4910, r4845, r4846; +} +{ +mul.f16x2 r4913, r4910, r4822; +} +{ +add.f16x2 r4916, r4907, r4913; +} +{ +add.f16x2 r4919, r4901, r4916; +} +{ +add.f16x2 r4922, r4827, r4828; +} +{ +mul.f16x2 r4925, r4922, r4820; +} +{ +add.f16x2 r4928, r4830, r4925; +} +{ +add.f16x2 r4931, r4833, r4834; +} +{ +mul.f16x2 r4934, r4931, r4824; +} +{ +add.f16x2 r4937, r4928, r4934; +} +{ +sub.f16x2 r4940, r4839, r4840; +} +{ +mul.f16x2 r4943, r4940, r4822; +} +{ +sub.f16x2 r4946, r4845, r4846; +} +{ +mul.f16x2 r4949, r4946, r4825; +} +{ +add.f16x2 r4952, r4943, r4949; +} +{ +sub.f16x2 r4955, r4937, r4952; +} +{ +add.f16x2 r4958, r4827, r4828; +} +{ +mul.f16x2 r4961, r4958, r4820; +} +{ +add.f16x2 r4964, r4830, r4961; +} +{ +add.f16x2 r4967, r4833, r4834; +} +{ +mul.f16x2 r4970, r4967, r4824; +} +{ +add.f16x2 r4973, r4964, r4970; +} +{ +sub.f16x2 r4976, r4839, r4840; +} +{ +mul.f16x2 r4979, r4976, r4822; +} +{ +sub.f16x2 r4982, r4845, r4846; +} +{ +mul.f16x2 r4985, r4982, r4825; +} +{ +add.f16x2 r4988, r4979, r4985; +} +{ +add.f16x2 r4991, r4973, r4988; +} +{ +add.f16x2 r4994, r4839, r4840; +} +{ +mul.f16x2 r4997, r4994, r4816; +} +{ +add.f16x2 r5000, r4842, r4997; +} +{ +add.f16x2 r5003, r4845, r4846; +} +{ +mul.f16x2 r5006, r5003, r4820; +} +{ 
+add.f16x2 r5009, r5000, r5006; +} +{ +sub.f16x2 r5012, r4827, r4828; +} +{ +mul.f16x2 r5015, r5012, r4818; +} +{ +sub.f16x2 r5018, r4833, r4834; +} +{ +mul.f16x2 r5021, r5018, r4822; +} +{ +add.f16x2 r5024, r5015, r5021; +} +{ +add.f16x2 r5027, r5009, r5024; +} +{ +add.f16x2 r5030, r4839, r4840; +} +{ +mul.f16x2 r5033, r5030, r4816; +} +{ +add.f16x2 r5036, r4842, r5033; +} +{ +add.f16x2 r5039, r4845, r4846; +} +{ +mul.f16x2 r5042, r5039, r4820; +} +{ +add.f16x2 r5045, r5036, r5042; +} +{ +sub.f16x2 r5048, r4827, r4828; +} +{ +mul.f16x2 r5051, r5048, r4818; +} +{ +sub.f16x2 r5054, r4833, r4834; +} +{ +mul.f16x2 r5057, r5054, r4822; +} +{ +add.f16x2 r5060, r5051, r5057; +} +{ +sub.f16x2 r5063, r5045, r5060; +} +{ +add.f16x2 r5066, r4839, r4840; +} +{ +mul.f16x2 r5069, r5066, r4820; +} +{ +add.f16x2 r5072, r4842, r5069; +} +{ +add.f16x2 r5075, r4845, r4846; +} +{ +mul.f16x2 r5078, r5075, r4824; +} +{ +add.f16x2 r5081, r5072, r5078; +} +{ +sub.f16x2 r5084, r4827, r4828; +} +{ +mul.f16x2 r5087, r5084, r4822; +} +{ +sub.f16x2 r5090, r4833, r4834; +} +{ +mul.f16x2 r5093, r5090, r4825; +} +{ +add.f16x2 r5096, r5087, r5093; +} +{ +add.f16x2 r5099, r5081, r5096; +} +{ +add.f16x2 r5102, r4839, r4840; +} +{ +mul.f16x2 r5105, r5102, r4820; +} +{ +add.f16x2 r5108, r4842, r5105; +} +{ +add.f16x2 r5111, r4845, r4846; +} +{ +mul.f16x2 r5114, r5111, r4824; +} +{ +add.f16x2 r5117, r5108, r5114; +} +{ +sub.f16x2 r5120, r4827, r4828; +} +{ +mul.f16x2 r5123, r5120, r4822; +} +{ +sub.f16x2 r5126, r4833, r4834; +} +{ +mul.f16x2 r5129, r5126, r4825; +} +{ +add.f16x2 r5132, r5123, r5129; +} +{ +sub.f16x2 r5135, r5117, r5132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5138, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5139, {low, high}; +} +{ +neg.f16x2 r5140, r5139; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5142, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5143, {low, high}; +} +{ +neg.f16x2 r5144, r5143; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5146, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5147, {low, high}; +} +{ +add.f16x2 r5148, r5149, r5150; +} +{ +add.f16x2 r5151, r5152, r5148; +} +{ +add.f16x2 r5154, r5155, r5156; +} +{ +add.f16x2 r5157, r5151, r5154; +} +{ +add.f16x2 r5160, r5161, r5162; +} +{ +add.f16x2 r5163, r5164, r5160; +} +{ +add.f16x2 r5166, r5167, r5168; +} +{ +add.f16x2 r5169, r5163, r5166; +} +{ +add.f16x2 r5172, r5149, r5150; +} +{ +mul.f16x2 r5175, r5172, r5138; +} +{ +add.f16x2 r5178, r5152, r5175; +} +{ +add.f16x2 r5181, r5155, r5156; +} +{ +mul.f16x2 r5184, r5181, r5142; +} +{ +add.f16x2 r5187, r5178, r5184; +} +{ +sub.f16x2 r5190, r5161, r5162; +} +{ +mul.f16x2 r5193, r5190, r5140; +} +{ +sub.f16x2 r5196, r5167, r5168; +} +{ +mul.f16x2 r5199, r5196, r5144; +} +{ +add.f16x2 r5202, r5193, r5199; +} +{ +sub.f16x2 r5205, r5187, r5202; +} +{ +add.f16x2 r5208, r5149, r5150; +} +{ +mul.f16x2 r5211, r5208, r5138; +} +{ +add.f16x2 r5214, r5152, r5211; +} +{ +add.f16x2 r5217, r5155, r5156; +} +{ +mul.f16x2 r5220, r5217, r5142; +} +{ +add.f16x2 r5223, r5214, r5220; +} +{ +sub.f16x2 r5226, r5161, r5162; +} +{ +mul.f16x2 r5229, r5226, r5140; +} +{ +sub.f16x2 r5232, r5167, r5168; +} +{ +mul.f16x2 r5235, r5232, r5144; +} +{ +add.f16x2 r5238, r5229, r5235; +} +{ +add.f16x2 r5241, r5223, r5238; +} +{ +add.f16x2 r5244, r5149, r5150; +} +{ +mul.f16x2 r5247, r5244, r5142; +} +{ +add.f16x2 r5250, r5152, r5247; +} +{ +add.f16x2 r5253, r5155, r5156; +} +{ +mul.f16x2 r5256, r5253, r5146; +} +{ +add.f16x2 r5259, r5250, r5256; +} +{ +sub.f16x2 r5262, r5161, r5162; +} +{ +mul.f16x2 r5265, r5262, r5144; +} +{ +sub.f16x2 r5268, r5167, r5168; +} +{ +mul.f16x2 r5271, r5268, r5147; +} +{ +add.f16x2 r5274, r5265, r5271; 
+} +{ +sub.f16x2 r5277, r5259, r5274; +} +{ +add.f16x2 r5280, r5149, r5150; +} +{ +mul.f16x2 r5283, r5280, r5142; +} +{ +add.f16x2 r5286, r5152, r5283; +} +{ +add.f16x2 r5289, r5155, r5156; +} +{ +mul.f16x2 r5292, r5289, r5146; +} +{ +add.f16x2 r5295, r5286, r5292; +} +{ +sub.f16x2 r5298, r5161, r5162; +} +{ +mul.f16x2 r5301, r5298, r5144; +} +{ +sub.f16x2 r5304, r5167, r5168; +} +{ +mul.f16x2 r5307, r5304, r5147; +} +{ +add.f16x2 r5310, r5301, r5307; +} +{ +add.f16x2 r5313, r5295, r5310; +} +{ +add.f16x2 r5316, r5161, r5162; +} +{ +mul.f16x2 r5319, r5316, r5138; +} +{ +add.f16x2 r5322, r5164, r5319; +} +{ +add.f16x2 r5325, r5167, r5168; +} +{ +mul.f16x2 r5328, r5325, r5142; +} +{ +add.f16x2 r5331, r5322, r5328; +} +{ +sub.f16x2 r5334, r5149, r5150; +} +{ +mul.f16x2 r5337, r5334, r5140; +} +{ +sub.f16x2 r5340, r5155, r5156; +} +{ +mul.f16x2 r5343, r5340, r5144; +} +{ +add.f16x2 r5346, r5337, r5343; +} +{ +add.f16x2 r5349, r5331, r5346; +} +{ +add.f16x2 r5352, r5161, r5162; +} +{ +mul.f16x2 r5355, r5352, r5138; +} +{ +add.f16x2 r5358, r5164, r5355; +} +{ +add.f16x2 r5361, r5167, r5168; +} +{ +mul.f16x2 r5364, r5361, r5142; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +sub.f16x2 r5370, r5149, r5150; +} +{ +mul.f16x2 r5373, r5370, r5140; +} +{ +sub.f16x2 r5376, r5155, r5156; +} +{ +mul.f16x2 r5379, r5376, r5144; +} +{ +add.f16x2 r5382, r5373, r5379; +} +{ +sub.f16x2 r5385, r5367, r5382; +} +{ +add.f16x2 r5388, r5161, r5162; +} +{ +mul.f16x2 r5391, r5388, r5142; +} +{ +add.f16x2 r5394, r5164, r5391; +} +{ +add.f16x2 r5397, r5167, r5168; +} +{ +mul.f16x2 r5400, r5397, r5146; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5149, r5150; +} +{ +mul.f16x2 r5409, r5406, r5144; +} +{ +sub.f16x2 r5412, r5155, r5156; +} +{ +mul.f16x2 r5415, r5412, r5147; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +add.f16x2 r5421, r5403, r5418; +} +{ +add.f16x2 r5424, r5161, r5162; +} +{ +mul.f16x2 r5427, r5424, r5142; +} +{ +add.f16x2 r5430, r5164, r5427; +} +{ +add.f16x2 r5433, 
r5167, r5168; +} +{ +mul.f16x2 r5436, r5433, r5146; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5149, r5150; +} +{ +mul.f16x2 r5445, r5442, r5144; +} +{ +sub.f16x2 r5448, r5155, r5156; +} +{ +mul.f16x2 r5451, r5448, r5147; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +sub.f16x2 r5457, r5439, r5454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r5460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r5461, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r5462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r5463, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r5464, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r5465, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r5466, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r5467, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5470, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r5471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r5474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r5475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r5477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5482, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r5483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r5491, {low, high}; +} +{ +mul.f16x2 r5508, r4239, r5460; +} +{ +mul.f16x2 r5511, r4383, r5461; +} +{ +sub.f16x2 r5514, r5508, r5511; +} +{ +mul.f16x2 r5517, r4239, r5461; +} +{ +fma.rn.f16x2 r5520, r4383, r5460, r5517; +} +{ +mul.f16x2 r5524, r4561, r5462; +} +{ +mul.f16x2 r5527, r4705, r5463; +} +{ +sub.f16x2 r5530, r5524, r5527; +} +{ +mul.f16x2 r5533, r4561, r5463; +} +{ +fma.rn.f16x2 r5536, r4705, r5462, r5533; +} +{ +mul.f16x2 r5540, r4883, r5464; +} +{ +mul.f16x2 r5543, r5027, r5465; +} +{ +sub.f16x2 r5546, r5540, r5543; +} +{ +mul.f16x2 r5549, r4883, r5465; +} +{ +fma.rn.f16x2 r5552, r5027, r5464, r5549; +} +{ +mul.f16x2 r5556, r5205, r5466; +} +{ +mul.f16x2 r5559, r5349, r5467; +} +{ +sub.f16x2 r5562, r5556, r5559; +} +{ +mul.f16x2 r5565, r5205, r5467; +} +{ +fma.rn.f16x2 r5568, r5349, r5466, r5565; +} +{ +mul.f16x2 r5572, r4311, r5462; +} +{ +mul.f16x2 r5575, r4455, r5463; +} +{ +sub.f16x2 r5578, r5572, r5575; +} +{ +mul.f16x2 r5581, r4311, r5463; +} +{ +fma.rn.f16x2 r5584, r4455, r5462, r5581; +} +{ +mul.f16x2 r5588, r4633, r5466; +} +{ +mul.f16x2 r5591, r4777, r5467; +} +{ +sub.f16x2 r5594, r5588, r5591; +} +{ +mul.f16x2 r5597, r4633, r5467; +} +{ +fma.rn.f16x2 r5600, r4777, r5466, r5597; +} +{ +mul.f16x2 r5604, r4955, r5470; +} +{ +mul.f16x2 r5607, r5099, r5471; +} +{ +sub.f16x2 r5610, r5604, r5607; +} +{ +mul.f16x2 r5613, r4955, r5471; +} +{ +fma.rn.f16x2 r5616, r5099, r5470, r5613; +} +{ +mul.f16x2 r5620, r5277, r5474; +} +{ +mul.f16x2 r5623, r5421, r5475; +} +{ +sub.f16x2 r5626, r5620, r5623; +} +{ +mul.f16x2 r5629, r5277, r5475; +} +{ +fma.rn.f16x2 r5632, r5421, r5474, r5629; +} +{ +mul.f16x2 r5636, r4347, r5464; +} +{ +mul.f16x2 r5639, r4491, r5465; +} +{ 
+sub.f16x2 r5642, r5636, r5639; +} +{ +mul.f16x2 r5645, r4347, r5465; +} +{ +fma.rn.f16x2 r5648, r4491, r5464, r5645; +} +{ +mul.f16x2 r5652, r4669, r5470; +} +{ +mul.f16x2 r5655, r4813, r5471; +} +{ +sub.f16x2 r5658, r5652, r5655; +} +{ +mul.f16x2 r5661, r4669, r5471; +} +{ +fma.rn.f16x2 r5664, r4813, r5470, r5661; +} +{ +mul.f16x2 r5668, r4991, r5476; +} +{ +mul.f16x2 r5671, r5135, r5477; +} +{ +sub.f16x2 r5674, r5668, r5671; +} +{ +mul.f16x2 r5677, r4991, r5477; +} +{ +fma.rn.f16x2 r5680, r5135, r5476, r5677; +} +{ +mul.f16x2 r5684, r5313, r5482; +} +{ +mul.f16x2 r5687, r5457, r5483; +} +{ +sub.f16x2 r5690, r5684, r5687; +} +{ +mul.f16x2 r5693, r5313, r5483; +} +{ +fma.rn.f16x2 r5696, r5457, r5482, r5693; +} +{ +mul.f16x2 r5700, r4275, r5466; +} +{ +mul.f16x2 r5703, r4419, r5467; +} +{ +sub.f16x2 r5706, r5700, r5703; +} +{ +mul.f16x2 r5709, r4275, r5467; +} +{ +fma.rn.f16x2 r5712, r4419, r5466, r5709; +} +{ +mul.f16x2 r5716, r4597, r5474; +} +{ +mul.f16x2 r5719, r4741, r5475; +} +{ +sub.f16x2 r5722, r5716, r5719; +} +{ +mul.f16x2 r5725, r4597, r5475; +} +{ +fma.rn.f16x2 r5728, r4741, r5474, r5725; +} +{ +mul.f16x2 r5732, r4919, r5482; +} +{ +mul.f16x2 r5735, r5063, r5483; +} +{ +sub.f16x2 r5738, r5732, r5735; +} +{ +mul.f16x2 r5741, r4919, r5483; +} +{ +fma.rn.f16x2 r5744, r5063, r5482, r5741; +} +{ +mul.f16x2 r5748, r5241, r5490; +} +{ +mul.f16x2 r5751, r5385, r5491; +} +{ +sub.f16x2 r5754, r5748, r5751; +} +{ +mul.f16x2 r5757, r5241, r5491; +} +{ +fma.rn.f16x2 r5760, r5385, r5490, r5757; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5765, {low, high}; +} +{ +neg.f16x2 r5766, r5765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5769, {low, high}; +} 
+{ +neg.f16x2 r5770, r5769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5773, {low, high}; +} +{ +add.f16x2 r5774, r4191, r5157; +} +{ +add.f16x2 r5777, r3869, r5774; +} +{ +add.f16x2 r5780, r4513, r4835; +} +{ +add.f16x2 r5783, r5777, r5780; +} +st.local.u32 [rd3], r5783; +{ +add.f16x2 r5786, r4203, r5169; +} +{ +add.f16x2 r5789, r3881, r5786; +} +{ +add.f16x2 r5792, r4525, r4847; +} +{ +add.f16x2 r5795, r5789, r5792; +} +st.local.u32 [rd4], r5795; +{ +add.f16x2 r5798, r4191, r5157; +} +{ +mul.f16x2 r5801, r5798, r5764; +} +{ +add.f16x2 r5804, r3869, r5801; +} +{ +add.f16x2 r5807, r4513, r4835; +} +{ +mul.f16x2 r5810, r5807, r5768; +} +{ +add.f16x2 r5813, r5804, r5810; +} +{ +sub.f16x2 r5816, r4203, r5169; +} +{ +mul.f16x2 r5819, r5816, r5766; +} +{ +sub.f16x2 r5822, r4525, r4847; +} +{ +mul.f16x2 r5825, r5822, r5770; +} +{ +add.f16x2 r5828, r5819, r5825; +} +{ +sub.f16x2 r5831, r5813, r5828; +} +st.local.u32 [rd4+36], r5831; +{ +add.f16x2 r5834, r4191, r5157; +} +{ +mul.f16x2 r5837, r5834, r5764; +} +{ +add.f16x2 r5840, r3869, r5837; +} +{ +add.f16x2 r5843, r4513, r4835; +} +{ +mul.f16x2 r5846, r5843, r5768; +} +{ +add.f16x2 r5849, r5840, r5846; +} +{ +sub.f16x2 r5852, r4203, r5169; +} +{ +mul.f16x2 r5855, r5852, r5766; +} +{ +sub.f16x2 r5858, r4525, r4847; +} +{ +mul.f16x2 r5861, r5858, r5770; +} +{ +add.f16x2 r5864, r5855, r5861; +} +{ +add.f16x2 r5867, r5849, r5864; +} +st.local.u32 [rd4+156], r5867; +{ +add.f16x2 r5870, r4191, r5157; +} +{ +mul.f16x2 r5873, r5870, r5768; +} +{ +add.f16x2 r5876, r3869, r5873; +} +{ +add.f16x2 r5879, r4513, r4835; +} +{ +mul.f16x2 r5882, r5879, r5772; +} +{ +add.f16x2 r5885, r5876, r5882; +} +{ +sub.f16x2 r5888, r4203, r5169; +} +{ +mul.f16x2 r5891, r5888, r5770; +} +{ +sub.f16x2 r5894, r4525, r4847; +} +{ +mul.f16x2 r5897, r5894, r5773; +} +{ +add.f16x2 r5900, r5891, 
r5897; +} +{ +sub.f16x2 r5903, r5885, r5900; +} +st.local.u32 [rd4+76], r5903; +{ +add.f16x2 r5906, r4191, r5157; +} +{ +mul.f16x2 r5909, r5906, r5768; +} +{ +add.f16x2 r5912, r3869, r5909; +} +{ +add.f16x2 r5915, r4513, r4835; +} +{ +mul.f16x2 r5918, r5915, r5772; +} +{ +add.f16x2 r5921, r5912, r5918; +} +{ +sub.f16x2 r5924, r4203, r5169; +} +{ +mul.f16x2 r5927, r5924, r5770; +} +{ +sub.f16x2 r5930, r4525, r4847; +} +{ +mul.f16x2 r5933, r5930, r5773; +} +{ +add.f16x2 r5936, r5927, r5933; +} +{ +add.f16x2 r5939, r5921, r5936; +} +st.local.u32 [rd4+116], r5939; +{ +add.f16x2 r5942, r4203, r5169; +} +{ +mul.f16x2 r5945, r5942, r5764; +} +{ +add.f16x2 r5948, r3881, r5945; +} +{ +add.f16x2 r5951, r4525, r4847; +} +{ +mul.f16x2 r5954, r5951, r5768; +} +{ +add.f16x2 r5957, r5948, r5954; +} +{ +sub.f16x2 r5960, r4191, r5157; +} +{ +mul.f16x2 r5963, r5960, r5766; +} +{ +sub.f16x2 r5966, r4513, r4835; +} +{ +mul.f16x2 r5969, r5966, r5770; +} +{ +add.f16x2 r5972, r5963, r5969; +} +{ +add.f16x2 r5975, r5957, r5972; +} +st.local.u32 [rd4+40], r5975; +{ +add.f16x2 r5978, r4203, r5169; +} +{ +mul.f16x2 r5981, r5978, r5764; +} +{ +add.f16x2 r5984, r3881, r5981; +} +{ +add.f16x2 r5987, r4525, r4847; +} +{ +mul.f16x2 r5990, r5987, r5768; +} +{ +add.f16x2 r5993, r5984, r5990; +} +{ +sub.f16x2 r5996, r4191, r5157; +} +{ +mul.f16x2 r5999, r5996, r5766; +} +{ +sub.f16x2 r6002, r4513, r4835; +} +{ +mul.f16x2 r6005, r6002, r5770; +} +{ +add.f16x2 r6008, r5999, r6005; +} +{ +sub.f16x2 r6011, r5993, r6008; +} +st.local.u32 [rd4+160], r6011; +{ +add.f16x2 r6014, r4203, r5169; +} +{ +mul.f16x2 r6017, r6014, r5768; +} +{ +add.f16x2 r6020, r3881, r6017; +} +{ +add.f16x2 r6023, r4525, r4847; +} +{ +mul.f16x2 r6026, r6023, r5772; +} +{ +add.f16x2 r6029, r6020, r6026; +} +{ +sub.f16x2 r6032, r4191, r5157; +} +{ +mul.f16x2 r6035, r6032, r5770; +} +{ +sub.f16x2 r6038, r4513, r4835; +} +{ +mul.f16x2 r6041, r6038, r5773; +} +{ +add.f16x2 r6044, r6035, r6041; +} +{ +add.f16x2 r6047, r6029, r6044; +} 
+st.local.u32 [rd4+80], r6047; +{ +add.f16x2 r6050, r4203, r5169; +} +{ +mul.f16x2 r6053, r6050, r5768; +} +{ +add.f16x2 r6056, r3881, r6053; +} +{ +add.f16x2 r6059, r4525, r4847; +} +{ +mul.f16x2 r6062, r6059, r5772; +} +{ +add.f16x2 r6065, r6056, r6062; +} +{ +sub.f16x2 r6068, r4191, r5157; +} +{ +mul.f16x2 r6071, r6068, r5770; +} +{ +sub.f16x2 r6074, r4513, r4835; +} +{ +mul.f16x2 r6077, r6074, r5773; +} +{ +add.f16x2 r6080, r6071, r6077; +} +{ +sub.f16x2 r6083, r6065, r6080; +} +st.local.u32 [rd4+120], r6083; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6086, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6087, {low, high}; +} +{ +neg.f16x2 r6088, r6087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6090, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6091, {low, high}; +} +{ +neg.f16x2 r6092, r6091; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6095, {low, high}; +} +{ +add.f16x2 r6096, r5514, r5562; +} +{ +add.f16x2 r6099, r3917, r6096; +} +{ +add.f16x2 r6102, r5530, r5546; +} +{ +add.f16x2 r11412, r6099, r6102; +} +st.local.u32 [rd4+4], r11412; +{ +add.f16x2 r6108, r5520, r5568; +} +{ +add.f16x2 r6111, r4061, r6108; +} +{ +add.f16x2 r6114, r5536, r5552; +} +{ +add.f16x2 r6117, r6111, r6114; +} +st.local.u32 [rd4+8], r6117; +{ +add.f16x2 r6120, r5514, r5562; +} +{ +mul.f16x2 r6123, r6120, r6086; +} +{ +add.f16x2 r6126, r3917, r6123; +} +{ +add.f16x2 r6129, r5530, r5546; +} +{ +mul.f16x2 r6132, r6129, r6090; +} +{ +add.f16x2 r6135, r6126, r6132; +} +{ +sub.f16x2 r6138, r5520, r5568; +} +{ +mul.f16x2 r6141, r6138, r6088; +} +{ +sub.f16x2 r6144, r5536, r5552; +} +{ +mul.f16x2 r6147, r6144, r6092; +} 
+{ +add.f16x2 r6150, r6141, r6147; +} +{ +sub.f16x2 r6153, r6135, r6150; +} +st.local.u32 [rd4+44], r6153; +{ +add.f16x2 r6156, r5514, r5562; +} +{ +mul.f16x2 r6159, r6156, r6086; +} +{ +add.f16x2 r6162, r3917, r6159; +} +{ +add.f16x2 r6165, r5530, r5546; +} +{ +mul.f16x2 r6168, r6165, r6090; +} +{ +add.f16x2 r6171, r6162, r6168; +} +{ +sub.f16x2 r6174, r5520, r5568; +} +{ +mul.f16x2 r6177, r6174, r6088; +} +{ +sub.f16x2 r6180, r5536, r5552; +} +{ +mul.f16x2 r6183, r6180, r6092; +} +{ +add.f16x2 r6186, r6177, r6183; +} +{ +add.f16x2 r6189, r6171, r6186; +} +st.local.u32 [rd4+164], r6189; +{ +add.f16x2 r6192, r5514, r5562; +} +{ +mul.f16x2 r6195, r6192, r6090; +} +{ +add.f16x2 r6198, r3917, r6195; +} +{ +add.f16x2 r6201, r5530, r5546; +} +{ +mul.f16x2 r6204, r6201, r6094; +} +{ +add.f16x2 r6207, r6198, r6204; +} +{ +sub.f16x2 r6210, r5520, r5568; +} +{ +mul.f16x2 r6213, r6210, r6092; +} +{ +sub.f16x2 r6216, r5536, r5552; +} +{ +mul.f16x2 r6219, r6216, r6095; +} +{ +add.f16x2 r6222, r6213, r6219; +} +{ +sub.f16x2 r6225, r6207, r6222; +} +st.local.u32 [rd4+84], r6225; +{ +add.f16x2 r6228, r5514, r5562; +} +{ +mul.f16x2 r6231, r6228, r6090; +} +{ +add.f16x2 r6234, r3917, r6231; +} +{ +add.f16x2 r6237, r5530, r5546; +} +{ +mul.f16x2 r6240, r6237, r6094; +} +{ +add.f16x2 r6243, r6234, r6240; +} +{ +sub.f16x2 r6246, r5520, r5568; +} +{ +mul.f16x2 r6249, r6246, r6092; +} +{ +sub.f16x2 r6252, r5536, r5552; +} +{ +mul.f16x2 r6255, r6252, r6095; +} +{ +add.f16x2 r6258, r6249, r6255; +} +{ +add.f16x2 r6261, r6243, r6258; +} +st.local.u32 [rd4+124], r6261; +{ +add.f16x2 r6264, r5520, r5568; +} +{ +mul.f16x2 r6267, r6264, r6086; +} +{ +add.f16x2 r6270, r4061, r6267; +} +{ +add.f16x2 r6273, r5536, r5552; +} +{ +mul.f16x2 r6276, r6273, r6090; +} +{ +add.f16x2 r6279, r6270, r6276; +} +{ +sub.f16x2 r6282, r5514, r5562; +} +{ +mul.f16x2 r6285, r6282, r6088; +} +{ +sub.f16x2 r6288, r5530, r5546; +} +{ +mul.f16x2 r6291, r6288, r6092; +} +{ +add.f16x2 r6294, r6285, r6291; +} +{ 
+add.f16x2 r6297, r6279, r6294; +} +st.local.u32 [rd4+48], r6297; +{ +add.f16x2 r6300, r5520, r5568; +} +{ +mul.f16x2 r6303, r6300, r6086; +} +{ +add.f16x2 r6306, r4061, r6303; +} +{ +add.f16x2 r6309, r5536, r5552; +} +{ +mul.f16x2 r6312, r6309, r6090; +} +{ +add.f16x2 r6315, r6306, r6312; +} +{ +sub.f16x2 r6318, r5514, r5562; +} +{ +mul.f16x2 r6321, r6318, r6088; +} +{ +sub.f16x2 r6324, r5530, r5546; +} +{ +mul.f16x2 r6327, r6324, r6092; +} +{ +add.f16x2 r6330, r6321, r6327; +} +{ +sub.f16x2 r6333, r6315, r6330; +} +st.local.u32 [rd4+168], r6333; +{ +add.f16x2 r6336, r5520, r5568; +} +{ +mul.f16x2 r6339, r6336, r6090; +} +{ +add.f16x2 r6342, r4061, r6339; +} +{ +add.f16x2 r6345, r5536, r5552; +} +{ +mul.f16x2 r6348, r6345, r6094; +} +{ +add.f16x2 r6351, r6342, r6348; +} +{ +sub.f16x2 r6354, r5514, r5562; +} +{ +mul.f16x2 r6357, r6354, r6092; +} +{ +sub.f16x2 r6360, r5530, r5546; +} +{ +mul.f16x2 r6363, r6360, r6095; +} +{ +add.f16x2 r6366, r6357, r6363; +} +{ +add.f16x2 r6369, r6351, r6366; +} +st.local.u32 [rd4+88], r6369; +{ +add.f16x2 r6372, r5520, r5568; +} +{ +mul.f16x2 r6375, r6372, r6090; +} +{ +add.f16x2 r6378, r4061, r6375; +} +{ +add.f16x2 r6381, r5536, r5552; +} +{ +mul.f16x2 r6384, r6381, r6094; +} +{ +add.f16x2 r6387, r6378, r6384; +} +{ +sub.f16x2 r6390, r5514, r5562; +} +{ +mul.f16x2 r6393, r6390, r6092; +} +{ +sub.f16x2 r6396, r5530, r5546; +} +{ +mul.f16x2 r6399, r6396, r6095; +} +{ +add.f16x2 r6402, r6393, r6399; +} +{ +sub.f16x2 r6405, r6387, r6402; +} +st.local.u32 [rd4+128], r6405; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6408, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6409, {low, high}; +} +{ +neg.f16x2 r6410, r6409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6412, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6413, {low, 
high}; +} +{ +neg.f16x2 r6414, r6413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6417, {low, high}; +} +{ +add.f16x2 r6418, r5578, r5626; +} +{ +add.f16x2 r6421, r3989, r6418; +} +{ +add.f16x2 r6424, r5594, r5610; +} +{ +add.f16x2 r6427, r6421, r6424; +} +st.local.u32 [rd4+12], r6427; +{ +add.f16x2 r6430, r5584, r5632; +} +{ +add.f16x2 r6433, r4133, r6430; +} +{ +add.f16x2 r6436, r5600, r5616; +} +{ +add.f16x2 r6439, r6433, r6436; +} +st.local.u32 [rd4+16], r6439; +{ +add.f16x2 r6442, r5578, r5626; +} +{ +mul.f16x2 r6445, r6442, r6408; +} +{ +add.f16x2 r6448, r3989, r6445; +} +{ +add.f16x2 r6451, r5594, r5610; +} +{ +mul.f16x2 r6454, r6451, r6412; +} +{ +add.f16x2 r6457, r6448, r6454; +} +{ +sub.f16x2 r6460, r5584, r5632; +} +{ +mul.f16x2 r6463, r6460, r6410; +} +{ +sub.f16x2 r6466, r5600, r5616; +} +{ +mul.f16x2 r6469, r6466, r6414; +} +{ +add.f16x2 r6472, r6463, r6469; +} +{ +sub.f16x2 r6475, r6457, r6472; +} +st.local.u32 [rd4+52], r6475; +{ +add.f16x2 r6478, r5578, r5626; +} +{ +mul.f16x2 r6481, r6478, r6408; +} +{ +add.f16x2 r6484, r3989, r6481; +} +{ +add.f16x2 r6487, r5594, r5610; +} +{ +mul.f16x2 r6490, r6487, r6412; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +sub.f16x2 r6496, r5584, r5632; +} +{ +mul.f16x2 r6499, r6496, r6410; +} +{ +sub.f16x2 r6502, r5600, r5616; +} +{ +mul.f16x2 r6505, r6502, r6414; +} +{ +add.f16x2 r6508, r6499, r6505; +} +{ +add.f16x2 r6511, r6493, r6508; +} +st.local.u32 [rd4+172], r6511; +{ +add.f16x2 r6514, r5578, r5626; +} +{ +mul.f16x2 r6517, r6514, r6412; +} +{ +add.f16x2 r6520, r3989, r6517; +} +{ +add.f16x2 r6523, r5594, r5610; +} +{ +mul.f16x2 r6526, r6523, r6416; +} +{ +add.f16x2 r6529, r6520, r6526; +} +{ +sub.f16x2 r6532, r5584, r5632; +} +{ +mul.f16x2 r6535, r6532, r6414; +} +{ +sub.f16x2 r6538, r5600, r5616; +} +{ +mul.f16x2 r6541, r6538, r6417; +} +{ +add.f16x2 
r6544, r6535, r6541; +} +{ +sub.f16x2 r6547, r6529, r6544; +} +st.local.u32 [rd4+92], r6547; +{ +add.f16x2 r6550, r5578, r5626; +} +{ +mul.f16x2 r6553, r6550, r6412; +} +{ +add.f16x2 r6556, r3989, r6553; +} +{ +add.f16x2 r6559, r5594, r5610; +} +{ +mul.f16x2 r6562, r6559, r6416; +} +{ +add.f16x2 r6565, r6556, r6562; +} +{ +sub.f16x2 r6568, r5584, r5632; +} +{ +mul.f16x2 r6571, r6568, r6414; +} +{ +sub.f16x2 r6574, r5600, r5616; +} +{ +mul.f16x2 r6577, r6574, r6417; +} +{ +add.f16x2 r6580, r6571, r6577; +} +{ +add.f16x2 r6583, r6565, r6580; +} +st.local.u32 [rd4+132], r6583; +{ +add.f16x2 r6586, r5584, r5632; +} +{ +mul.f16x2 r6589, r6586, r6408; +} +{ +add.f16x2 r6592, r4133, r6589; +} +{ +add.f16x2 r6595, r5600, r5616; +} +{ +mul.f16x2 r6598, r6595, r6412; +} +{ +add.f16x2 r6601, r6592, r6598; +} +{ +sub.f16x2 r6604, r5578, r5626; +} +{ +mul.f16x2 r6607, r6604, r6410; +} +{ +sub.f16x2 r6610, r5594, r5610; +} +{ +mul.f16x2 r6613, r6610, r6414; +} +{ +add.f16x2 r6616, r6607, r6613; +} +{ +add.f16x2 r6619, r6601, r6616; +} +st.local.u32 [rd4+56], r6619; +{ +add.f16x2 r6622, r5584, r5632; +} +{ +mul.f16x2 r6625, r6622, r6408; +} +{ +add.f16x2 r6628, r4133, r6625; +} +{ +add.f16x2 r6631, r5600, r5616; +} +{ +mul.f16x2 r6634, r6631, r6412; +} +{ +add.f16x2 r6637, r6628, r6634; +} +{ +sub.f16x2 r6640, r5578, r5626; +} +{ +mul.f16x2 r6643, r6640, r6410; +} +{ +sub.f16x2 r6646, r5594, r5610; +} +{ +mul.f16x2 r6649, r6646, r6414; +} +{ +add.f16x2 r6652, r6643, r6649; +} +{ +sub.f16x2 r6655, r6637, r6652; +} +st.local.u32 [rd4+176], r6655; +{ +add.f16x2 r6658, r5584, r5632; +} +{ +mul.f16x2 r6661, r6658, r6412; +} +{ +add.f16x2 r6664, r4133, r6661; +} +{ +add.f16x2 r6667, r5600, r5616; +} +{ +mul.f16x2 r6670, r6667, r6416; +} +{ +add.f16x2 r6673, r6664, r6670; +} +{ +sub.f16x2 r6676, r5578, r5626; +} +{ +mul.f16x2 r6679, r6676, r6414; +} +{ +sub.f16x2 r6682, r5594, r5610; +} +{ +mul.f16x2 r6685, r6682, r6417; +} +{ +add.f16x2 r6688, r6679, r6685; +} +{ +add.f16x2 r6691, 
r6673, r6688; +} +st.local.u32 [rd4+96], r6691; +{ +add.f16x2 r6694, r5584, r5632; +} +{ +mul.f16x2 r6697, r6694, r6412; +} +{ +add.f16x2 r6700, r4133, r6697; +} +{ +add.f16x2 r6703, r5600, r5616; +} +{ +mul.f16x2 r6706, r6703, r6416; +} +{ +add.f16x2 r6709, r6700, r6706; +} +{ +sub.f16x2 r6712, r5578, r5626; +} +{ +mul.f16x2 r6715, r6712, r6414; +} +{ +sub.f16x2 r6718, r5594, r5610; +} +{ +mul.f16x2 r6721, r6718, r6417; +} +{ +add.f16x2 r6724, r6715, r6721; +} +{ +sub.f16x2 r6727, r6709, r6724; +} +st.local.u32 [rd4+136], r6727; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6730, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6731, {low, high}; +} +{ +neg.f16x2 r6732, r6731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6735, {low, high}; +} +{ +neg.f16x2 r6736, r6735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6739, {low, high}; +} +{ +add.f16x2 r6740, r5642, r5690; +} +{ +add.f16x2 r6743, r4025, r6740; +} +{ +add.f16x2 r6746, r5658, r5674; +} +{ +add.f16x2 r6749, r6743, r6746; +} +st.local.u32 [rd4+20], r6749; +{ +add.f16x2 r6752, r5648, r5696; +} +{ +add.f16x2 r6755, r4169, r6752; +} +{ +add.f16x2 r6758, r5664, r5680; +} +{ +add.f16x2 r6761, r6755, r6758; +} +st.local.u32 [rd4+24], r6761; +{ +add.f16x2 r6764, r5642, r5690; +} +{ +mul.f16x2 r6767, r6764, r6730; +} +{ +add.f16x2 r6770, r4025, r6767; +} +{ +add.f16x2 r6773, r5658, r5674; +} +{ +mul.f16x2 r6776, r6773, r6734; +} +{ +add.f16x2 r6779, r6770, r6776; +} +{ +sub.f16x2 r6782, r5648, r5696; +} +{ +mul.f16x2 r6785, r6782, r6732; +} +{ +sub.f16x2 r6788, r5664, r5680; +} +{ +mul.f16x2 r6791, 
r6788, r6736; +} +{ +add.f16x2 r6794, r6785, r6791; +} +{ +sub.f16x2 r6797, r6779, r6794; +} +st.local.u32 [rd4+60], r6797; +{ +add.f16x2 r6800, r5642, r5690; +} +{ +mul.f16x2 r6803, r6800, r6730; +} +{ +add.f16x2 r6806, r4025, r6803; +} +{ +add.f16x2 r6809, r5658, r5674; +} +{ +mul.f16x2 r6812, r6809, r6734; +} +{ +add.f16x2 r6815, r6806, r6812; +} +{ +sub.f16x2 r6818, r5648, r5696; +} +{ +mul.f16x2 r6821, r6818, r6732; +} +{ +sub.f16x2 r6824, r5664, r5680; +} +{ +mul.f16x2 r6827, r6824, r6736; +} +{ +add.f16x2 r6830, r6821, r6827; +} +{ +add.f16x2 r6833, r6815, r6830; +} +st.local.u32 [rd4+180], r6833; +{ +add.f16x2 r6836, r5642, r5690; +} +{ +mul.f16x2 r6839, r6836, r6734; +} +{ +add.f16x2 r6842, r4025, r6839; +} +{ +add.f16x2 r6845, r5658, r5674; +} +{ +mul.f16x2 r6848, r6845, r6738; +} +{ +add.f16x2 r6851, r6842, r6848; +} +{ +sub.f16x2 r6854, r5648, r5696; +} +{ +mul.f16x2 r6857, r6854, r6736; +} +{ +sub.f16x2 r6860, r5664, r5680; +} +{ +mul.f16x2 r6863, r6860, r6739; +} +{ +add.f16x2 r6866, r6857, r6863; +} +{ +sub.f16x2 r6869, r6851, r6866; +} +st.local.u32 [rd4+100], r6869; +{ +add.f16x2 r6872, r5642, r5690; +} +{ +mul.f16x2 r6875, r6872, r6734; +} +{ +add.f16x2 r6878, r4025, r6875; +} +{ +add.f16x2 r6881, r5658, r5674; +} +{ +mul.f16x2 r6884, r6881, r6738; +} +{ +add.f16x2 r6887, r6878, r6884; +} +{ +sub.f16x2 r6890, r5648, r5696; +} +{ +mul.f16x2 r6893, r6890, r6736; +} +{ +sub.f16x2 r6896, r5664, r5680; +} +{ +mul.f16x2 r6899, r6896, r6739; +} +{ +add.f16x2 r6902, r6893, r6899; +} +{ +add.f16x2 r6905, r6887, r6902; +} +st.local.u32 [rd4+140], r6905; +{ +add.f16x2 r6908, r5648, r5696; +} +{ +mul.f16x2 r6911, r6908, r6730; +} +{ +add.f16x2 r6914, r4169, r6911; +} +{ +add.f16x2 r6917, r5664, r5680; +} +{ +mul.f16x2 r6920, r6917, r6734; +} +{ +add.f16x2 r6923, r6914, r6920; +} +{ +sub.f16x2 r6926, r5642, r5690; +} +{ +mul.f16x2 r6929, r6926, r6732; +} +{ +sub.f16x2 r6932, r5658, r5674; +} +{ +mul.f16x2 r6935, r6932, r6736; +} +{ +add.f16x2 r6938, r6929, 
r6935; +} +{ +add.f16x2 r6941, r6923, r6938; +} +st.local.u32 [rd4+64], r6941; +{ +add.f16x2 r6944, r5648, r5696; +} +{ +mul.f16x2 r6947, r6944, r6730; +} +{ +add.f16x2 r6950, r4169, r6947; +} +{ +add.f16x2 r6953, r5664, r5680; +} +{ +mul.f16x2 r6956, r6953, r6734; +} +{ +add.f16x2 r6959, r6950, r6956; +} +{ +sub.f16x2 r6962, r5642, r5690; +} +{ +mul.f16x2 r6965, r6962, r6732; +} +{ +sub.f16x2 r6968, r5658, r5674; +} +{ +mul.f16x2 r6971, r6968, r6736; +} +{ +add.f16x2 r6974, r6965, r6971; +} +{ +sub.f16x2 r6977, r6959, r6974; +} +st.local.u32 [rd4+184], r6977; +{ +add.f16x2 r6980, r5648, r5696; +} +{ +mul.f16x2 r6983, r6980, r6734; +} +{ +add.f16x2 r6986, r4169, r6983; +} +{ +add.f16x2 r6989, r5664, r5680; +} +{ +mul.f16x2 r6992, r6989, r6738; +} +{ +add.f16x2 r6995, r6986, r6992; +} +{ +sub.f16x2 r6998, r5642, r5690; +} +{ +mul.f16x2 r7001, r6998, r6736; +} +{ +sub.f16x2 r7004, r5658, r5674; +} +{ +mul.f16x2 r7007, r7004, r6739; +} +{ +add.f16x2 r7010, r7001, r7007; +} +{ +add.f16x2 r7013, r6995, r7010; +} +st.local.u32 [rd4+104], r7013; +{ +add.f16x2 r7016, r5648, r5696; +} +{ +mul.f16x2 r7019, r7016, r6734; +} +{ +add.f16x2 r7022, r4169, r7019; +} +{ +add.f16x2 r7025, r5664, r5680; +} +{ +mul.f16x2 r7028, r7025, r6738; +} +{ +add.f16x2 r7031, r7022, r7028; +} +{ +sub.f16x2 r7034, r5642, r5690; +} +{ +mul.f16x2 r7037, r7034, r6736; +} +{ +sub.f16x2 r7040, r5658, r5674; +} +{ +mul.f16x2 r7043, r7040, r6739; +} +{ +add.f16x2 r7046, r7037, r7043; +} +{ +sub.f16x2 r7049, r7031, r7046; +} +st.local.u32 [rd4+144], r7049; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7053, {low, high}; +} +{ +neg.f16x2 r7054, r7053; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 
r7057, {low, high}; +} +{ +neg.f16x2 r7058, r7057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7061, {low, high}; +} +{ +add.f16x2 r7062, r5706, r5754; +} +{ +add.f16x2 r7065, r3953, r7062; +} +{ +add.f16x2 r7068, r5722, r5738; +} +{ +add.f16x2 r7071, r7065, r7068; +} +st.local.u32 [rd4+28], r7071; +{ +add.f16x2 r7074, r5712, r5760; +} +{ +add.f16x2 r7077, r4097, r7074; +} +{ +add.f16x2 r7080, r5728, r5744; +} +{ +add.f16x2 r7083, r7077, r7080; +} +st.local.u32 [rd4+32], r7083; +{ +add.f16x2 r7086, r5706, r5754; +} +{ +mul.f16x2 r7089, r7086, r7052; +} +{ +add.f16x2 r7092, r3953, r7089; +} +{ +add.f16x2 r7095, r5722, r5738; +} +{ +mul.f16x2 r7098, r7095, r7056; +} +{ +add.f16x2 r7101, r7092, r7098; +} +{ +sub.f16x2 r7104, r5712, r5760; +} +{ +mul.f16x2 r7107, r7104, r7054; +} +{ +sub.f16x2 r7110, r5728, r5744; +} +{ +mul.f16x2 r7113, r7110, r7058; +} +{ +add.f16x2 r7116, r7107, r7113; +} +{ +sub.f16x2 r7119, r7101, r7116; +} +st.local.u32 [rd4+68], r7119; +{ +add.f16x2 r7122, r5706, r5754; +} +{ +mul.f16x2 r7125, r7122, r7052; +} +{ +add.f16x2 r7128, r3953, r7125; +} +{ +add.f16x2 r7131, r5722, r5738; +} +{ +mul.f16x2 r7134, r7131, r7056; +} +{ +add.f16x2 r7137, r7128, r7134; +} +{ +sub.f16x2 r7140, r5712, r5760; +} +{ +mul.f16x2 r7143, r7140, r7054; +} +{ +sub.f16x2 r7146, r5728, r5744; +} +{ +mul.f16x2 r7149, r7146, r7058; +} +{ +add.f16x2 r7152, r7143, r7149; +} +{ +add.f16x2 r7155, r7137, r7152; +} +st.local.u32 [rd4+188], r7155; +{ +add.f16x2 r7158, r5706, r5754; +} +{ +mul.f16x2 r7161, r7158, r7056; +} +{ +add.f16x2 r7164, r3953, r7161; +} +{ +add.f16x2 r7167, r5722, r5738; +} +{ +mul.f16x2 r7170, r7167, r7060; +} +{ +add.f16x2 r7173, r7164, r7170; +} +{ +sub.f16x2 r7176, r5712, r5760; +} +{ +mul.f16x2 r7179, r7176, r7058; +} +{ +sub.f16x2 r7182, r5728, r5744; +} +{ +mul.f16x2 r7185, r7182, r7061; +} 
+{ +add.f16x2 r7188, r7179, r7185; +} +{ +sub.f16x2 r7191, r7173, r7188; +} +st.local.u32 [rd4+108], r7191; +{ +add.f16x2 r7194, r5706, r5754; +} +{ +mul.f16x2 r7197, r7194, r7056; +} +{ +add.f16x2 r7200, r3953, r7197; +} +{ +add.f16x2 r7203, r5722, r5738; +} +{ +mul.f16x2 r7206, r7203, r7060; +} +{ +add.f16x2 r7209, r7200, r7206; +} +{ +sub.f16x2 r7212, r5712, r5760; +} +{ +mul.f16x2 r7215, r7212, r7058; +} +{ +sub.f16x2 r7218, r5728, r5744; +} +{ +mul.f16x2 r7221, r7218, r7061; +} +{ +add.f16x2 r7224, r7215, r7221; +} +{ +add.f16x2 r7227, r7209, r7224; +} +st.local.u32 [rd4+148], r7227; +{ +add.f16x2 r7230, r5712, r5760; +} +{ +mul.f16x2 r7233, r7230, r7052; +} +{ +add.f16x2 r7236, r4097, r7233; +} +{ +add.f16x2 r7239, r5728, r5744; +} +{ +mul.f16x2 r7242, r7239, r7056; +} +{ +add.f16x2 r7245, r7236, r7242; +} +{ +sub.f16x2 r7248, r5706, r5754; +} +{ +mul.f16x2 r7251, r7248, r7054; +} +{ +sub.f16x2 r7254, r5722, r5738; +} +{ +mul.f16x2 r7257, r7254, r7058; +} +{ +add.f16x2 r7260, r7251, r7257; +} +{ +add.f16x2 r7263, r7245, r7260; +} +st.local.u32 [rd4+72], r7263; +{ +add.f16x2 r7266, r5712, r5760; +} +{ +mul.f16x2 r7269, r7266, r7052; +} +{ +add.f16x2 r7272, r4097, r7269; +} +{ +add.f16x2 r7275, r5728, r5744; +} +{ +mul.f16x2 r7278, r7275, r7056; +} +{ +add.f16x2 r7281, r7272, r7278; +} +{ +sub.f16x2 r7284, r5706, r5754; +} +{ +mul.f16x2 r7287, r7284, r7054; +} +{ +sub.f16x2 r7290, r5722, r5738; +} +{ +mul.f16x2 r7293, r7290, r7058; +} +{ +add.f16x2 r7296, r7287, r7293; +} +{ +sub.f16x2 r7299, r7281, r7296; +} +st.local.u32 [rd4+192], r7299; +{ +add.f16x2 r7302, r5712, r5760; +} +{ +mul.f16x2 r7305, r7302, r7056; +} +{ +add.f16x2 r7308, r4097, r7305; +} +{ +add.f16x2 r7311, r5728, r5744; +} +{ +mul.f16x2 r7314, r7311, r7060; +} +{ +add.f16x2 r7317, r7308, r7314; +} +{ +sub.f16x2 r7320, r5706, r5754; +} +{ +mul.f16x2 r7323, r7320, r7058; +} +{ +sub.f16x2 r7326, r5722, r5738; +} +{ +mul.f16x2 r7329, r7326, r7061; +} +{ +add.f16x2 r7332, r7323, r7329; +} +{ 
+add.f16x2 r7335, r7317, r7332; +} +st.local.u32 [rd4+112], r7335; +{ +add.f16x2 r7338, r5712, r5760; +} +{ +mul.f16x2 r7341, r7338, r7056; +} +{ +add.f16x2 r7344, r4097, r7341; +} +{ +add.f16x2 r7347, r5728, r5744; +} +{ +mul.f16x2 r7350, r7347, r7060; +} +{ +add.f16x2 r7353, r7344, r7350; +} +{ +sub.f16x2 r7356, r5706, r5754; +} +{ +mul.f16x2 r7359, r7356, r7058; +} +{ +sub.f16x2 r7362, r5722, r5738; +} +{ +mul.f16x2 r7365, r7362, r7061; +} +{ +add.f16x2 r7368, r7359, r7365; +} +{ +sub.f16x2 r7371, r7353, r7368; +} +st.local.u32 [rd4+152], r7371; +mul.wide.u32 rd16, r3, 1374389535; +shr.u64 rd17, rd16, 35; +cvt.u32.u64 r15, rd17; +cvt.rn.f32.u32 f460, r15; +mul.f32 f461, f460, 0f3C24B5BE; +cos.approx.f32 f456, f461; +sin.approx.f32 f462, f461; +neg.f32 f457, f462; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f457; +mov.b32 r11413, {low, high}; +} +mov.u32 r11411, -8; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11413; +mov.b32 r7502, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11413; +mov.b32 r7504, {high, high}; +} +mov.u64 rd20, rd3; +bra.uni LBB0_4; +LBB0_5: +ld.local.u32 r11412, [rd20+72]; +add.s64 rd20, rd20, 64; +LBB0_4: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11413; +mov.b32 r7482, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11413; +mov.b32 r7484, {high, high}; +} +ld.local.u32 r7487, [rd20+12]; +{ +mul.f16x2 r7486, r7487, r7484; +} +{ +fma.rn.f16x2 r7489, r11412, r7482, r7486; +} +st.local.u32 [rd20+8], r7489; +{ +mul.f16x2 r7493, r11412, r7484; +} +{ +neg.f16x2 r7496, r7493; +} +{ +fma.rn.f16x2 r7498, r7487, r7482, r7496; +} +st.local.u32 [rd20+12], r7498; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7506, {low, high}; +} +{ +mul.f16x2 r7507, r7504, r7506; +} +{ +mul.f16x2 r7510, r11413, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11413; +mov.b32 r7513, {high, low}; +} +{ +fma.rn.f16x2 r7515, r7507, r7513, r7510; +} 
+ld.local.u32 r7531, [rd20+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7515; +mov.b32 r7519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7515; +mov.b32 r7521, {high, high}; +} +ld.local.u32 r7536, [rd20+20]; +{ +mul.f16x2 r7523, r7536, r7521; +} +{ +fma.rn.f16x2 r7526, r7531, r7519, r7523; +} +st.local.u32 [rd20+16], r7526; +{ +mul.f16x2 r7530, r7531, r7521; +} +{ +neg.f16x2 r7533, r7530; +} +{ +fma.rn.f16x2 r7535, r7536, r7519, r7533; +} +st.local.u32 [rd20+20], r7535; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7543, {low, high}; +} +{ +mul.f16x2 r7544, r7504, r7543; +} +{ +mul.f16x2 r7547, r7515, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7515; +mov.b32 r7550, {high, low}; +} +{ +fma.rn.f16x2 r7552, r7544, r7550, r7547; +} +ld.local.u32 r7568, [rd20+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7552; +mov.b32 r7556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7552; +mov.b32 r7558, {high, high}; +} +ld.local.u32 r7573, [rd20+28]; +{ +mul.f16x2 r7560, r7573, r7558; +} +{ +fma.rn.f16x2 r7563, r7568, r7556, r7560; +} +st.local.u32 [rd20+24], r7563; +{ +mul.f16x2 r7567, r7568, r7558; +} +{ +neg.f16x2 r7570, r7567; +} +{ +fma.rn.f16x2 r7572, r7573, r7556, r7570; +} +st.local.u32 [rd20+28], r7572; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7580, {low, high}; +} +{ +mul.f16x2 r7581, r7504, r7580; +} +{ +mul.f16x2 r7584, r7552, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7552; +mov.b32 r7587, {high, low}; +} +{ +fma.rn.f16x2 r7589, r7581, r7587, r7584; +} +ld.local.u32 r7605, [rd20+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7589; +mov.b32 r7593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7589; +mov.b32 r7595, {high, high}; +} +ld.local.u32 r7610, [rd20+36]; +{ +mul.f16x2 r7597, r7610, r7595; +} +{ +fma.rn.f16x2 r7600, r7605, r7593, r7597; +} +st.local.u32 [rd20+32], 
r7600; +{ +mul.f16x2 r7604, r7605, r7595; +} +{ +neg.f16x2 r7607, r7604; +} +{ +fma.rn.f16x2 r7609, r7610, r7593, r7607; +} +st.local.u32 [rd20+36], r7609; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7617, {low, high}; +} +{ +mul.f16x2 r7618, r7504, r7617; +} +{ +mul.f16x2 r7621, r7589, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7589; +mov.b32 r7624, {high, low}; +} +{ +fma.rn.f16x2 r7626, r7618, r7624, r7621; +} +ld.local.u32 r7638, [rd20+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7626; +mov.b32 r7630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7626; +mov.b32 r7632, {high, high}; +} +ld.local.u32 r7647, [rd20+44]; +{ +mul.f16x2 r7634, r7647, r7632; +} +{ +fma.rn.f16x2 r7637, r7638, r7630, r7634; +} +st.local.u32 [rd20+40], r7637; +{ +mul.f16x2 r7641, r7638, r7632; +} +{ +neg.f16x2 r7644, r7641; +} +{ +fma.rn.f16x2 r7646, r7647, r7630, r7644; +} +st.local.u32 [rd20+44], r7646; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7654, {low, high}; +} +{ +mul.f16x2 r7655, r7504, r7654; +} +{ +mul.f16x2 r7658, r7626, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7626; +mov.b32 r7661, {high, low}; +} +{ +fma.rn.f16x2 r7663, r7655, r7661, r7658; +} +ld.local.u32 r7675, [rd20+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7663; +mov.b32 r7667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7663; +mov.b32 r7669, {high, high}; +} +ld.local.u32 r7672, [rd20+52]; +{ +mul.f16x2 r7671, r7672, r7669; +} +{ +fma.rn.f16x2 r7674, r7675, r7667, r7671; +} +st.local.u32 [rd20+48], r7674; +{ +mul.f16x2 r7678, r7675, r7669; +} +{ +neg.f16x2 r7681, r7678; +} +{ +fma.rn.f16x2 r7683, r7672, r7667, r7681; +} +st.local.u32 [rd20+52], r7683; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7691, {low, high}; +} +{ +mul.f16x2 r7692, r7504, r7691; +} +{ +mul.f16x2 r7695, r7663, r7502; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r7663; +mov.b32 r7698, {high, low}; +} +{ +fma.rn.f16x2 r7700, r7692, r7698, r7695; +} +ld.local.u32 r7712, [rd20+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7700; +mov.b32 r7704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7700; +mov.b32 r7706, {high, high}; +} +ld.local.u32 r7709, [rd20+60]; +{ +mul.f16x2 r7708, r7709, r7706; +} +{ +fma.rn.f16x2 r7711, r7712, r7704, r7708; +} +st.local.u32 [rd20+56], r7711; +{ +mul.f16x2 r7715, r7712, r7706; +} +{ +neg.f16x2 r7718, r7715; +} +{ +fma.rn.f16x2 r7720, r7709, r7704, r7718; +} +st.local.u32 [rd20+60], r7720; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7728, {low, high}; +} +{ +mul.f16x2 r7729, r7504, r7728; +} +{ +mul.f16x2 r7732, r7700, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7700; +mov.b32 r7735, {high, low}; +} +{ +fma.rn.f16x2 r7737, r7729, r7735, r7732; +} +ld.local.u32 r7749, [rd20+64]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7737; +mov.b32 r7741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7737; +mov.b32 r7743, {high, high}; +} +ld.local.u32 r7746, [rd20+68]; +{ +mul.f16x2 r7745, r7746, r7743; +} +{ +fma.rn.f16x2 r7748, r7749, r7741, r7745; +} +st.local.u32 [rd20+64], r7748; +{ +mul.f16x2 r7752, r7749, r7743; +} +{ +neg.f16x2 r7755, r7752; +} +{ +fma.rn.f16x2 r7757, r7746, r7741, r7755; +} +st.local.u32 [rd20+68], r7757; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7765, {low, high}; +} +{ +mul.f16x2 r7766, r7504, r7765; +} +{ +mul.f16x2 r7769, r7737, r7502; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7737; +mov.b32 r7772, {high, low}; +} +{ +fma.rn.f16x2 r11413, r7766, r7772, r7769; +} +add.s32 r11411, r11411, 8; +setp.eq.s32 p2, r11411, 16; +@p2 bra LBB0_6; +bra.uni LBB0_5; +LBB0_6: +mul.lo.s32 r11303, r15, 25; +sub.s32 r11304, r3, r11303; +shl.b32 r11305, r11304, 3; +add.s32 r11306, r12, r11305; 
+barrier.sync 0; +mad.lo.s32 r11307, r15, 5000, r11306; +ld.local.v2.u32 {r11308, r11309}, [rd3]; +st.shared.u32 [r11307], r11308; +st.shared.u32 [r11307+4], r11309; +ld.local.v2.u32 {r11312, r11313}, [rd4+4]; +st.shared.u32 [r11307+200], r11312; +st.shared.u32 [r11307+204], r11313; +ld.local.v2.u32 {r11316, r11317}, [rd4+12]; +st.shared.u32 [r11307+400], r11316; +st.shared.u32 [r11307+404], r11317; +ld.local.v2.u32 {r11320, r11321}, [rd4+20]; +st.shared.u32 [r11307+600], r11320; +st.shared.u32 [r11307+604], r11321; +ld.local.v2.u32 {r11324, r11325}, [rd4+28]; +st.shared.u32 [r11307+800], r11324; +st.shared.u32 [r11307+804], r11325; +ld.local.v2.u32 {r11328, r11329}, [rd4+36]; +st.shared.u32 [r11307+1000], r11328; +st.shared.u32 [r11307+1004], r11329; +ld.local.v2.u32 {r11332, r11333}, [rd4+44]; +st.shared.u32 [r11307+1200], r11332; +st.shared.u32 [r11307+1204], r11333; +ld.local.v2.u32 {r11336, r11337}, [rd4+52]; +st.shared.u32 [r11307+1400], r11336; +st.shared.u32 [r11307+1404], r11337; +ld.local.v2.u32 {r11340, r11341}, [rd4+60]; +st.shared.u32 [r11307+1600], r11340; +st.shared.u32 [r11307+1604], r11341; +ld.local.v2.u32 {r11344, r11345}, [rd4+68]; +st.shared.u32 [r11307+1800], r11344; +st.shared.u32 [r11307+1804], r11345; +ld.local.v2.u32 {r11348, r11349}, [rd4+76]; +st.shared.u32 [r11307+2000], r11348; +st.shared.u32 [r11307+2004], r11349; +ld.local.v2.u32 {r11352, r11353}, [rd4+84]; +st.shared.u32 [r11307+2200], r11352; +st.shared.u32 [r11307+2204], r11353; +ld.local.v2.u32 {r11356, r11357}, [rd4+92]; +st.shared.u32 [r11307+2400], r11356; +st.shared.u32 [r11307+2404], r11357; +ld.local.v2.u32 {r11360, r11361}, [rd4+100]; +st.shared.u32 [r11307+2600], r11360; +st.shared.u32 [r11307+2604], r11361; +ld.local.v2.u32 {r11364, r11365}, [rd4+108]; +st.shared.u32 [r11307+2800], r11364; +st.shared.u32 [r11307+2804], r11365; +ld.local.v2.u32 {r11368, r11369}, [rd4+116]; +st.shared.u32 [r11307+3000], r11368; +st.shared.u32 [r11307+3004], r11369; +ld.local.v2.u32 
{r11372, r11373}, [rd4+124]; +st.shared.u32 [r11307+3200], r11372; +st.shared.u32 [r11307+3204], r11373; +ld.local.v2.u32 {r11376, r11377}, [rd4+132]; +st.shared.u32 [r11307+3400], r11376; +st.shared.u32 [r11307+3404], r11377; +ld.local.v2.u32 {r11380, r11381}, [rd4+140]; +st.shared.u32 [r11307+3600], r11380; +st.shared.u32 [r11307+3604], r11381; +ld.local.v2.u32 {r11384, r11385}, [rd4+148]; +st.shared.u32 [r11307+3800], r11384; +st.shared.u32 [r11307+3804], r11385; +ld.local.v2.u32 {r11388, r11389}, [rd4+156]; +st.shared.u32 [r11307+4000], r11388; +st.shared.u32 [r11307+4004], r11389; +ld.local.v2.u32 {r11392, r11393}, [rd4+164]; +st.shared.u32 [r11307+4200], r11392; +st.shared.u32 [r11307+4204], r11393; +ld.local.v2.u32 {r11396, r11397}, [rd4+172]; +st.shared.u32 [r11307+4400], r11396; +st.shared.u32 [r11307+4404], r11397; +ld.local.v2.u32 {r11400, r11401}, [rd4+180]; +st.shared.u32 [r11307+4600], r11400; +st.shared.u32 [r11307+4604], r11401; +ld.local.v2.u32 {r11404, r11405}, [rd4+188]; +st.shared.u32 [r11307+4800], r11404; +st.shared.u32 [r11307+4804], r11405; +barrier.sync 0; +ld.shared.u32 r7792, [r13]; +ld.shared.u32 r7804, [r13+4]; +ld.shared.u32 r8114, [r13+5000]; +ld.shared.u32 r8126, [r13+5004]; +ld.shared.u32 r8436, [r13+10000]; +ld.shared.u32 r8448, [r13+10004]; +ld.shared.u32 r8758, [r13+15000]; +ld.shared.u32 r8770, [r13+15004]; +ld.shared.u32 r9080, [r13+20000]; +ld.shared.u32 r9092, [r13+20004]; +ld.shared.u32 r7789, [r13+25000]; +ld.shared.u32 r7801, [r13+25004]; +ld.shared.u32 r8111, [r13+30000]; +ld.shared.u32 r8123, [r13+30004]; +ld.shared.u32 r8433, [r13+35000]; +ld.shared.u32 r8445, [r13+35004]; +ld.shared.u32 r8755, [r13+40000]; +ld.shared.u32 r8767, [r13+40004]; +ld.shared.u32 r9077, [r13+45000]; +ld.shared.u32 r9089, [r13+45004]; +ld.shared.u32 r7795, [r13+50000]; +ld.shared.u32 r7807, [r13+50004]; +ld.shared.u32 r8117, [r13+55000]; +ld.shared.u32 r8129, [r13+55004]; +ld.shared.u32 r8439, [r13+60000]; +ld.shared.u32 r8451, [r13+60004]; 
+ld.shared.u32 r8761, [r13+65000]; +ld.shared.u32 r8773, [r13+65004]; +ld.shared.u32 r9083, [r13+70000]; +ld.shared.u32 r9095, [r13+70004]; +ld.shared.u32 r7796, [r13+75000]; +ld.shared.u32 r7808, [r13+75004]; +ld.shared.u32 r8118, [r13+80000]; +ld.shared.u32 r8130, [r13+80004]; +ld.shared.u32 r8440, [r13+85000]; +ld.shared.u32 r8452, [r13+85004]; +ld.shared.u32 r8762, [r13+90000]; +ld.shared.u32 r8774, [r13+90004]; +ld.shared.u32 r9084, [r13+95000]; +ld.shared.u32 r9096, [r13+95004]; +ld.shared.u32 r7790, [r13+100000]; +ld.shared.u32 r7802, [r13+100004]; +ld.shared.u32 r8112, [r13+105000]; +ld.shared.u32 r8124, [r13+105004]; +ld.shared.u32 r8434, [r13+110000]; +ld.shared.u32 r8446, [r13+110004]; +ld.shared.u32 r8756, [r13+115000]; +ld.shared.u32 r8768, [r13+115004]; +ld.shared.u32 r9078, [r13+120000]; +ld.shared.u32 r9090, [r13+120004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7778, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7779, {low, high}; +} +{ +neg.f16x2 r7780, r7779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7782, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7783, {low, high}; +} +{ +neg.f16x2 r7784, r7783; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7786, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7787, {low, high}; +} +{ +add.f16x2 r7788, r7789, r7790; +} +{ +add.f16x2 r7791, r7792, r7788; +} +{ +add.f16x2 r7794, r7795, r7796; +} +{ +add.f16x2 r7797, r7791, r7794; +} +{ +add.f16x2 r7800, r7801, r7802; +} +{ +add.f16x2 r7803, r7804, r7800; +} +{ +add.f16x2 r7806, r7807, r7808; +} +{ +add.f16x2 r7809, r7803, r7806; +} +{ +add.f16x2 r7812, r7789, r7790; +} +{ +mul.f16x2 r7815, r7812, r7778; +} +{ +add.f16x2 r7818, r7792, r7815; 
+} +{ +add.f16x2 r7821, r7795, r7796; +} +{ +mul.f16x2 r7824, r7821, r7782; +} +{ +add.f16x2 r7827, r7818, r7824; +} +{ +sub.f16x2 r7830, r7801, r7802; +} +{ +mul.f16x2 r7833, r7830, r7780; +} +{ +sub.f16x2 r7836, r7807, r7808; +} +{ +mul.f16x2 r7839, r7836, r7784; +} +{ +add.f16x2 r7842, r7833, r7839; +} +{ +sub.f16x2 r7845, r7827, r7842; +} +{ +add.f16x2 r7848, r7789, r7790; +} +{ +mul.f16x2 r7851, r7848, r7778; +} +{ +add.f16x2 r7854, r7792, r7851; +} +{ +add.f16x2 r7857, r7795, r7796; +} +{ +mul.f16x2 r7860, r7857, r7782; +} +{ +add.f16x2 r7863, r7854, r7860; +} +{ +sub.f16x2 r7866, r7801, r7802; +} +{ +mul.f16x2 r7869, r7866, r7780; +} +{ +sub.f16x2 r7872, r7807, r7808; +} +{ +mul.f16x2 r7875, r7872, r7784; +} +{ +add.f16x2 r7878, r7869, r7875; +} +{ +add.f16x2 r7881, r7863, r7878; +} +{ +add.f16x2 r7884, r7789, r7790; +} +{ +mul.f16x2 r7887, r7884, r7782; +} +{ +add.f16x2 r7890, r7792, r7887; +} +{ +add.f16x2 r7893, r7795, r7796; +} +{ +mul.f16x2 r7896, r7893, r7786; +} +{ +add.f16x2 r7899, r7890, r7896; +} +{ +sub.f16x2 r7902, r7801, r7802; +} +{ +mul.f16x2 r7905, r7902, r7784; +} +{ +sub.f16x2 r7908, r7807, r7808; +} +{ +mul.f16x2 r7911, r7908, r7787; +} +{ +add.f16x2 r7914, r7905, r7911; +} +{ +sub.f16x2 r7917, r7899, r7914; +} +{ +add.f16x2 r7920, r7789, r7790; +} +{ +mul.f16x2 r7923, r7920, r7782; +} +{ +add.f16x2 r7926, r7792, r7923; +} +{ +add.f16x2 r7929, r7795, r7796; +} +{ +mul.f16x2 r7932, r7929, r7786; +} +{ +add.f16x2 r7935, r7926, r7932; +} +{ +sub.f16x2 r7938, r7801, r7802; +} +{ +mul.f16x2 r7941, r7938, r7784; +} +{ +sub.f16x2 r7944, r7807, r7808; +} +{ +mul.f16x2 r7947, r7944, r7787; +} +{ +add.f16x2 r7950, r7941, r7947; +} +{ +add.f16x2 r7953, r7935, r7950; +} +{ +add.f16x2 r7956, r7801, r7802; +} +{ +mul.f16x2 r7959, r7956, r7778; +} +{ +add.f16x2 r7962, r7804, r7959; +} +{ +add.f16x2 r7965, r7807, r7808; +} +{ +mul.f16x2 r7968, r7965, r7782; +} +{ +add.f16x2 r7971, r7962, r7968; +} +{ +sub.f16x2 r7974, r7789, r7790; +} +{ +mul.f16x2 r7977, 
r7974, r7780; +} +{ +sub.f16x2 r7980, r7795, r7796; +} +{ +mul.f16x2 r7983, r7980, r7784; +} +{ +add.f16x2 r7986, r7977, r7983; +} +{ +add.f16x2 r7989, r7971, r7986; +} +{ +add.f16x2 r7992, r7801, r7802; +} +{ +mul.f16x2 r7995, r7992, r7778; +} +{ +add.f16x2 r7998, r7804, r7995; +} +{ +add.f16x2 r8001, r7807, r7808; +} +{ +mul.f16x2 r8004, r8001, r7782; +} +{ +add.f16x2 r8007, r7998, r8004; +} +{ +sub.f16x2 r8010, r7789, r7790; +} +{ +mul.f16x2 r8013, r8010, r7780; +} +{ +sub.f16x2 r8016, r7795, r7796; +} +{ +mul.f16x2 r8019, r8016, r7784; +} +{ +add.f16x2 r8022, r8013, r8019; +} +{ +sub.f16x2 r8025, r8007, r8022; +} +{ +add.f16x2 r8028, r7801, r7802; +} +{ +mul.f16x2 r8031, r8028, r7782; +} +{ +add.f16x2 r8034, r7804, r8031; +} +{ +add.f16x2 r8037, r7807, r7808; +} +{ +mul.f16x2 r8040, r8037, r7786; +} +{ +add.f16x2 r8043, r8034, r8040; +} +{ +sub.f16x2 r8046, r7789, r7790; +} +{ +mul.f16x2 r8049, r8046, r7784; +} +{ +sub.f16x2 r8052, r7795, r7796; +} +{ +mul.f16x2 r8055, r8052, r7787; +} +{ +add.f16x2 r8058, r8049, r8055; +} +{ +add.f16x2 r8061, r8043, r8058; +} +{ +add.f16x2 r8064, r7801, r7802; +} +{ +mul.f16x2 r8067, r8064, r7782; +} +{ +add.f16x2 r8070, r7804, r8067; +} +{ +add.f16x2 r8073, r7807, r7808; +} +{ +mul.f16x2 r8076, r8073, r7786; +} +{ +add.f16x2 r8079, r8070, r8076; +} +{ +sub.f16x2 r8082, r7789, r7790; +} +{ +mul.f16x2 r8085, r8082, r7784; +} +{ +sub.f16x2 r8088, r7795, r7796; +} +{ +mul.f16x2 r8091, r8088, r7787; +} +{ +add.f16x2 r8094, r8085, r8091; +} +{ +sub.f16x2 r8097, r8079, r8094; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8101, {low, high}; +} +{ +neg.f16x2 r8102, r8101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8105, 
{low, high}; +} +{ +neg.f16x2 r8106, r8105; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8109, {low, high}; +} +{ +add.f16x2 r8110, r8111, r8112; +} +{ +add.f16x2 r8113, r8114, r8110; +} +{ +add.f16x2 r8116, r8117, r8118; +} +{ +add.f16x2 r8119, r8113, r8116; +} +{ +add.f16x2 r8122, r8123, r8124; +} +{ +add.f16x2 r8125, r8126, r8122; +} +{ +add.f16x2 r8128, r8129, r8130; +} +{ +add.f16x2 r8131, r8125, r8128; +} +{ +add.f16x2 r8134, r8111, r8112; +} +{ +mul.f16x2 r8137, r8134, r8100; +} +{ +add.f16x2 r8140, r8114, r8137; +} +{ +add.f16x2 r8143, r8117, r8118; +} +{ +mul.f16x2 r8146, r8143, r8104; +} +{ +add.f16x2 r8149, r8140, r8146; +} +{ +sub.f16x2 r8152, r8123, r8124; +} +{ +mul.f16x2 r8155, r8152, r8102; +} +{ +sub.f16x2 r8158, r8129, r8130; +} +{ +mul.f16x2 r8161, r8158, r8106; +} +{ +add.f16x2 r8164, r8155, r8161; +} +{ +sub.f16x2 r8167, r8149, r8164; +} +{ +add.f16x2 r8170, r8111, r8112; +} +{ +mul.f16x2 r8173, r8170, r8100; +} +{ +add.f16x2 r8176, r8114, r8173; +} +{ +add.f16x2 r8179, r8117, r8118; +} +{ +mul.f16x2 r8182, r8179, r8104; +} +{ +add.f16x2 r8185, r8176, r8182; +} +{ +sub.f16x2 r8188, r8123, r8124; +} +{ +mul.f16x2 r8191, r8188, r8102; +} +{ +sub.f16x2 r8194, r8129, r8130; +} +{ +mul.f16x2 r8197, r8194, r8106; +} +{ +add.f16x2 r8200, r8191, r8197; +} +{ +add.f16x2 r8203, r8185, r8200; +} +{ +add.f16x2 r8206, r8111, r8112; +} +{ +mul.f16x2 r8209, r8206, r8104; +} +{ +add.f16x2 r8212, r8114, r8209; +} +{ +add.f16x2 r8215, r8117, r8118; +} +{ +mul.f16x2 r8218, r8215, r8108; +} +{ +add.f16x2 r8221, r8212, r8218; +} +{ +sub.f16x2 r8224, r8123, r8124; +} +{ +mul.f16x2 r8227, r8224, r8106; +} +{ +sub.f16x2 r8230, r8129, r8130; +} +{ +mul.f16x2 r8233, r8230, r8109; +} +{ +add.f16x2 r8236, r8227, r8233; +} +{ +sub.f16x2 r8239, r8221, r8236; +} +{ +add.f16x2 r8242, r8111, r8112; +} +{ +mul.f16x2 
r8245, r8242, r8104; +} +{ +add.f16x2 r8248, r8114, r8245; +} +{ +add.f16x2 r8251, r8117, r8118; +} +{ +mul.f16x2 r8254, r8251, r8108; +} +{ +add.f16x2 r8257, r8248, r8254; +} +{ +sub.f16x2 r8260, r8123, r8124; +} +{ +mul.f16x2 r8263, r8260, r8106; +} +{ +sub.f16x2 r8266, r8129, r8130; +} +{ +mul.f16x2 r8269, r8266, r8109; +} +{ +add.f16x2 r8272, r8263, r8269; +} +{ +add.f16x2 r8275, r8257, r8272; +} +{ +add.f16x2 r8278, r8123, r8124; +} +{ +mul.f16x2 r8281, r8278, r8100; +} +{ +add.f16x2 r8284, r8126, r8281; +} +{ +add.f16x2 r8287, r8129, r8130; +} +{ +mul.f16x2 r8290, r8287, r8104; +} +{ +add.f16x2 r8293, r8284, r8290; +} +{ +sub.f16x2 r8296, r8111, r8112; +} +{ +mul.f16x2 r8299, r8296, r8102; +} +{ +sub.f16x2 r8302, r8117, r8118; +} +{ +mul.f16x2 r8305, r8302, r8106; +} +{ +add.f16x2 r8308, r8299, r8305; +} +{ +add.f16x2 r8311, r8293, r8308; +} +{ +add.f16x2 r8314, r8123, r8124; +} +{ +mul.f16x2 r8317, r8314, r8100; +} +{ +add.f16x2 r8320, r8126, r8317; +} +{ +add.f16x2 r8323, r8129, r8130; +} +{ +mul.f16x2 r8326, r8323, r8104; +} +{ +add.f16x2 r8329, r8320, r8326; +} +{ +sub.f16x2 r8332, r8111, r8112; +} +{ +mul.f16x2 r8335, r8332, r8102; +} +{ +sub.f16x2 r8338, r8117, r8118; +} +{ +mul.f16x2 r8341, r8338, r8106; +} +{ +add.f16x2 r8344, r8335, r8341; +} +{ +sub.f16x2 r8347, r8329, r8344; +} +{ +add.f16x2 r8350, r8123, r8124; +} +{ +mul.f16x2 r8353, r8350, r8104; +} +{ +add.f16x2 r8356, r8126, r8353; +} +{ +add.f16x2 r8359, r8129, r8130; +} +{ +mul.f16x2 r8362, r8359, r8108; +} +{ +add.f16x2 r8365, r8356, r8362; +} +{ +sub.f16x2 r8368, r8111, r8112; +} +{ +mul.f16x2 r8371, r8368, r8106; +} +{ +sub.f16x2 r8374, r8117, r8118; +} +{ +mul.f16x2 r8377, r8374, r8109; +} +{ +add.f16x2 r8380, r8371, r8377; +} +{ +add.f16x2 r8383, r8365, r8380; +} +{ +add.f16x2 r8386, r8123, r8124; +} +{ +mul.f16x2 r8389, r8386, r8104; +} +{ +add.f16x2 r8392, r8126, r8389; +} +{ +add.f16x2 r8395, r8129, r8130; +} +{ +mul.f16x2 r8398, r8395, r8108; +} +{ +add.f16x2 r8401, r8392, r8398; +} 
+{ +sub.f16x2 r8404, r8111, r8112; +} +{ +mul.f16x2 r8407, r8404, r8106; +} +{ +sub.f16x2 r8410, r8117, r8118; +} +{ +mul.f16x2 r8413, r8410, r8109; +} +{ +add.f16x2 r8416, r8407, r8413; +} +{ +sub.f16x2 r8419, r8401, r8416; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8422, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8423, {low, high}; +} +{ +neg.f16x2 r8424, r8423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8426, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8427, {low, high}; +} +{ +neg.f16x2 r8428, r8427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8430, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8431, {low, high}; +} +{ +add.f16x2 r8432, r8433, r8434; +} +{ +add.f16x2 r8435, r8436, r8432; +} +{ +add.f16x2 r8438, r8439, r8440; +} +{ +add.f16x2 r8441, r8435, r8438; +} +{ +add.f16x2 r8444, r8445, r8446; +} +{ +add.f16x2 r8447, r8448, r8444; +} +{ +add.f16x2 r8450, r8451, r8452; +} +{ +add.f16x2 r8453, r8447, r8450; +} +{ +add.f16x2 r8456, r8433, r8434; +} +{ +mul.f16x2 r8459, r8456, r8422; +} +{ +add.f16x2 r8462, r8436, r8459; +} +{ +add.f16x2 r8465, r8439, r8440; +} +{ +mul.f16x2 r8468, r8465, r8426; +} +{ +add.f16x2 r8471, r8462, r8468; +} +{ +sub.f16x2 r8474, r8445, r8446; +} +{ +mul.f16x2 r8477, r8474, r8424; +} +{ +sub.f16x2 r8480, r8451, r8452; +} +{ +mul.f16x2 r8483, r8480, r8428; +} +{ +add.f16x2 r8486, r8477, r8483; +} +{ +sub.f16x2 r8489, r8471, r8486; +} +{ +add.f16x2 r8492, r8433, r8434; +} +{ +mul.f16x2 r8495, r8492, r8422; +} +{ +add.f16x2 r8498, r8436, r8495; +} +{ +add.f16x2 r8501, r8439, r8440; +} +{ +mul.f16x2 r8504, r8501, r8426; +} +{ +add.f16x2 r8507, r8498, r8504; +} +{ +sub.f16x2 r8510, r8445, r8446; +} +{ 
+mul.f16x2 r8513, r8510, r8424; +} +{ +sub.f16x2 r8516, r8451, r8452; +} +{ +mul.f16x2 r8519, r8516, r8428; +} +{ +add.f16x2 r8522, r8513, r8519; +} +{ +add.f16x2 r8525, r8507, r8522; +} +{ +add.f16x2 r8528, r8433, r8434; +} +{ +mul.f16x2 r8531, r8528, r8426; +} +{ +add.f16x2 r8534, r8436, r8531; +} +{ +add.f16x2 r8537, r8439, r8440; +} +{ +mul.f16x2 r8540, r8537, r8430; +} +{ +add.f16x2 r8543, r8534, r8540; +} +{ +sub.f16x2 r8546, r8445, r8446; +} +{ +mul.f16x2 r8549, r8546, r8428; +} +{ +sub.f16x2 r8552, r8451, r8452; +} +{ +mul.f16x2 r8555, r8552, r8431; +} +{ +add.f16x2 r8558, r8549, r8555; +} +{ +sub.f16x2 r8561, r8543, r8558; +} +{ +add.f16x2 r8564, r8433, r8434; +} +{ +mul.f16x2 r8567, r8564, r8426; +} +{ +add.f16x2 r8570, r8436, r8567; +} +{ +add.f16x2 r8573, r8439, r8440; +} +{ +mul.f16x2 r8576, r8573, r8430; +} +{ +add.f16x2 r8579, r8570, r8576; +} +{ +sub.f16x2 r8582, r8445, r8446; +} +{ +mul.f16x2 r8585, r8582, r8428; +} +{ +sub.f16x2 r8588, r8451, r8452; +} +{ +mul.f16x2 r8591, r8588, r8431; +} +{ +add.f16x2 r8594, r8585, r8591; +} +{ +add.f16x2 r8597, r8579, r8594; +} +{ +add.f16x2 r8600, r8445, r8446; +} +{ +mul.f16x2 r8603, r8600, r8422; +} +{ +add.f16x2 r8606, r8448, r8603; +} +{ +add.f16x2 r8609, r8451, r8452; +} +{ +mul.f16x2 r8612, r8609, r8426; +} +{ +add.f16x2 r8615, r8606, r8612; +} +{ +sub.f16x2 r8618, r8433, r8434; +} +{ +mul.f16x2 r8621, r8618, r8424; +} +{ +sub.f16x2 r8624, r8439, r8440; +} +{ +mul.f16x2 r8627, r8624, r8428; +} +{ +add.f16x2 r8630, r8621, r8627; +} +{ +add.f16x2 r8633, r8615, r8630; +} +{ +add.f16x2 r8636, r8445, r8446; +} +{ +mul.f16x2 r8639, r8636, r8422; +} +{ +add.f16x2 r8642, r8448, r8639; +} +{ +add.f16x2 r8645, r8451, r8452; +} +{ +mul.f16x2 r8648, r8645, r8426; +} +{ +add.f16x2 r8651, r8642, r8648; +} +{ +sub.f16x2 r8654, r8433, r8434; +} +{ +mul.f16x2 r8657, r8654, r8424; +} +{ +sub.f16x2 r8660, r8439, r8440; +} +{ +mul.f16x2 r8663, r8660, r8428; +} +{ +add.f16x2 r8666, r8657, r8663; +} +{ +sub.f16x2 r8669, 
r8651, r8666; +} +{ +add.f16x2 r8672, r8445, r8446; +} +{ +mul.f16x2 r8675, r8672, r8426; +} +{ +add.f16x2 r8678, r8448, r8675; +} +{ +add.f16x2 r8681, r8451, r8452; +} +{ +mul.f16x2 r8684, r8681, r8430; +} +{ +add.f16x2 r8687, r8678, r8684; +} +{ +sub.f16x2 r8690, r8433, r8434; +} +{ +mul.f16x2 r8693, r8690, r8428; +} +{ +sub.f16x2 r8696, r8439, r8440; +} +{ +mul.f16x2 r8699, r8696, r8431; +} +{ +add.f16x2 r8702, r8693, r8699; +} +{ +add.f16x2 r8705, r8687, r8702; +} +{ +add.f16x2 r8708, r8445, r8446; +} +{ +mul.f16x2 r8711, r8708, r8426; +} +{ +add.f16x2 r8714, r8448, r8711; +} +{ +add.f16x2 r8717, r8451, r8452; +} +{ +mul.f16x2 r8720, r8717, r8430; +} +{ +add.f16x2 r8723, r8714, r8720; +} +{ +sub.f16x2 r8726, r8433, r8434; +} +{ +mul.f16x2 r8729, r8726, r8428; +} +{ +sub.f16x2 r8732, r8439, r8440; +} +{ +mul.f16x2 r8735, r8732, r8431; +} +{ +add.f16x2 r8738, r8729, r8735; +} +{ +sub.f16x2 r8741, r8723, r8738; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8745, {low, high}; +} +{ +neg.f16x2 r8746, r8745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8748, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8749, {low, high}; +} +{ +neg.f16x2 r8750, r8749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8753, {low, high}; +} +{ +add.f16x2 r8754, r8755, r8756; +} +{ +add.f16x2 r8757, r8758, r8754; +} +{ +add.f16x2 r8760, r8761, r8762; +} +{ +add.f16x2 r8763, r8757, r8760; +} +{ +add.f16x2 r8766, r8767, r8768; +} +{ +add.f16x2 r8769, r8770, r8766; +} +{ +add.f16x2 r8772, r8773, r8774; +} +{ +add.f16x2 r8775, r8769, r8772; +} +{ +add.f16x2 r8778, r8755, 
r8756; +} +{ +mul.f16x2 r8781, r8778, r8744; +} +{ +add.f16x2 r8784, r8758, r8781; +} +{ +add.f16x2 r8787, r8761, r8762; +} +{ +mul.f16x2 r8790, r8787, r8748; +} +{ +add.f16x2 r8793, r8784, r8790; +} +{ +sub.f16x2 r8796, r8767, r8768; +} +{ +mul.f16x2 r8799, r8796, r8746; +} +{ +sub.f16x2 r8802, r8773, r8774; +} +{ +mul.f16x2 r8805, r8802, r8750; +} +{ +add.f16x2 r8808, r8799, r8805; +} +{ +sub.f16x2 r8811, r8793, r8808; +} +{ +add.f16x2 r8814, r8755, r8756; +} +{ +mul.f16x2 r8817, r8814, r8744; +} +{ +add.f16x2 r8820, r8758, r8817; +} +{ +add.f16x2 r8823, r8761, r8762; +} +{ +mul.f16x2 r8826, r8823, r8748; +} +{ +add.f16x2 r8829, r8820, r8826; +} +{ +sub.f16x2 r8832, r8767, r8768; +} +{ +mul.f16x2 r8835, r8832, r8746; +} +{ +sub.f16x2 r8838, r8773, r8774; +} +{ +mul.f16x2 r8841, r8838, r8750; +} +{ +add.f16x2 r8844, r8835, r8841; +} +{ +add.f16x2 r8847, r8829, r8844; +} +{ +add.f16x2 r8850, r8755, r8756; +} +{ +mul.f16x2 r8853, r8850, r8748; +} +{ +add.f16x2 r8856, r8758, r8853; +} +{ +add.f16x2 r8859, r8761, r8762; +} +{ +mul.f16x2 r8862, r8859, r8752; +} +{ +add.f16x2 r8865, r8856, r8862; +} +{ +sub.f16x2 r8868, r8767, r8768; +} +{ +mul.f16x2 r8871, r8868, r8750; +} +{ +sub.f16x2 r8874, r8773, r8774; +} +{ +mul.f16x2 r8877, r8874, r8753; +} +{ +add.f16x2 r8880, r8871, r8877; +} +{ +sub.f16x2 r8883, r8865, r8880; +} +{ +add.f16x2 r8886, r8755, r8756; +} +{ +mul.f16x2 r8889, r8886, r8748; +} +{ +add.f16x2 r8892, r8758, r8889; +} +{ +add.f16x2 r8895, r8761, r8762; +} +{ +mul.f16x2 r8898, r8895, r8752; +} +{ +add.f16x2 r8901, r8892, r8898; +} +{ +sub.f16x2 r8904, r8767, r8768; +} +{ +mul.f16x2 r8907, r8904, r8750; +} +{ +sub.f16x2 r8910, r8773, r8774; +} +{ +mul.f16x2 r8913, r8910, r8753; +} +{ +add.f16x2 r8916, r8907, r8913; +} +{ +add.f16x2 r8919, r8901, r8916; +} +{ +add.f16x2 r8922, r8767, r8768; +} +{ +mul.f16x2 r8925, r8922, r8744; +} +{ +add.f16x2 r8928, r8770, r8925; +} +{ +add.f16x2 r8931, r8773, r8774; +} +{ +mul.f16x2 r8934, r8931, r8748; +} +{ +add.f16x2 
r8937, r8928, r8934; +} +{ +sub.f16x2 r8940, r8755, r8756; +} +{ +mul.f16x2 r8943, r8940, r8746; +} +{ +sub.f16x2 r8946, r8761, r8762; +} +{ +mul.f16x2 r8949, r8946, r8750; +} +{ +add.f16x2 r8952, r8943, r8949; +} +{ +add.f16x2 r8955, r8937, r8952; +} +{ +add.f16x2 r8958, r8767, r8768; +} +{ +mul.f16x2 r8961, r8958, r8744; +} +{ +add.f16x2 r8964, r8770, r8961; +} +{ +add.f16x2 r8967, r8773, r8774; +} +{ +mul.f16x2 r8970, r8967, r8748; +} +{ +add.f16x2 r8973, r8964, r8970; +} +{ +sub.f16x2 r8976, r8755, r8756; +} +{ +mul.f16x2 r8979, r8976, r8746; +} +{ +sub.f16x2 r8982, r8761, r8762; +} +{ +mul.f16x2 r8985, r8982, r8750; +} +{ +add.f16x2 r8988, r8979, r8985; +} +{ +sub.f16x2 r8991, r8973, r8988; +} +{ +add.f16x2 r8994, r8767, r8768; +} +{ +mul.f16x2 r8997, r8994, r8748; +} +{ +add.f16x2 r9000, r8770, r8997; +} +{ +add.f16x2 r9003, r8773, r8774; +} +{ +mul.f16x2 r9006, r9003, r8752; +} +{ +add.f16x2 r9009, r9000, r9006; +} +{ +sub.f16x2 r9012, r8755, r8756; +} +{ +mul.f16x2 r9015, r9012, r8750; +} +{ +sub.f16x2 r9018, r8761, r8762; +} +{ +mul.f16x2 r9021, r9018, r8753; +} +{ +add.f16x2 r9024, r9015, r9021; +} +{ +add.f16x2 r9027, r9009, r9024; +} +{ +add.f16x2 r9030, r8767, r8768; +} +{ +mul.f16x2 r9033, r9030, r8748; +} +{ +add.f16x2 r9036, r8770, r9033; +} +{ +add.f16x2 r9039, r8773, r8774; +} +{ +mul.f16x2 r9042, r9039, r8752; +} +{ +add.f16x2 r9045, r9036, r9042; +} +{ +sub.f16x2 r9048, r8755, r8756; +} +{ +mul.f16x2 r9051, r9048, r8750; +} +{ +sub.f16x2 r9054, r8761, r8762; +} +{ +mul.f16x2 r9057, r9054, r8753; +} +{ +add.f16x2 r9060, r9051, r9057; +} +{ +sub.f16x2 r9063, r9045, r9060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9067, {low, high}; +} +{ +neg.f16x2 r9068, r9067; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9070, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9071, {low, high}; +} +{ +neg.f16x2 r9072, r9071; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9074, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9075, {low, high}; +} +{ +add.f16x2 r9076, r9077, r9078; +} +{ +add.f16x2 r9079, r9080, r9076; +} +{ +add.f16x2 r9082, r9083, r9084; +} +{ +add.f16x2 r9085, r9079, r9082; +} +{ +add.f16x2 r9088, r9089, r9090; +} +{ +add.f16x2 r9091, r9092, r9088; +} +{ +add.f16x2 r9094, r9095, r9096; +} +{ +add.f16x2 r9097, r9091, r9094; +} +{ +add.f16x2 r9100, r9077, r9078; +} +{ +mul.f16x2 r9103, r9100, r9066; +} +{ +add.f16x2 r9106, r9080, r9103; +} +{ +add.f16x2 r9109, r9083, r9084; +} +{ +mul.f16x2 r9112, r9109, r9070; +} +{ +add.f16x2 r9115, r9106, r9112; +} +{ +sub.f16x2 r9118, r9089, r9090; +} +{ +mul.f16x2 r9121, r9118, r9068; +} +{ +sub.f16x2 r9124, r9095, r9096; +} +{ +mul.f16x2 r9127, r9124, r9072; +} +{ +add.f16x2 r9130, r9121, r9127; +} +{ +sub.f16x2 r9133, r9115, r9130; +} +{ +add.f16x2 r9136, r9077, r9078; +} +{ +mul.f16x2 r9139, r9136, r9066; +} +{ +add.f16x2 r9142, r9080, r9139; +} +{ +add.f16x2 r9145, r9083, r9084; +} +{ +mul.f16x2 r9148, r9145, r9070; +} +{ +add.f16x2 r9151, r9142, r9148; +} +{ +sub.f16x2 r9154, r9089, r9090; +} +{ +mul.f16x2 r9157, r9154, r9068; +} +{ +sub.f16x2 r9160, r9095, r9096; +} +{ +mul.f16x2 r9163, r9160, r9072; +} +{ +add.f16x2 r9166, r9157, r9163; +} +{ +add.f16x2 r9169, r9151, r9166; +} +{ +add.f16x2 r9172, r9077, r9078; +} +{ +mul.f16x2 r9175, r9172, r9070; +} +{ +add.f16x2 r9178, r9080, r9175; +} +{ +add.f16x2 r9181, r9083, r9084; +} +{ +mul.f16x2 r9184, r9181, r9074; +} +{ +add.f16x2 r9187, r9178, r9184; +} +{ +sub.f16x2 r9190, r9089, r9090; +} +{ +mul.f16x2 r9193, r9190, r9072; +} +{ +sub.f16x2 r9196, r9095, r9096; +} +{ +mul.f16x2 r9199, r9196, r9075; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ 
+sub.f16x2 r9205, r9187, r9202; +} +{ +add.f16x2 r9208, r9077, r9078; +} +{ +mul.f16x2 r9211, r9208, r9070; +} +{ +add.f16x2 r9214, r9080, r9211; +} +{ +add.f16x2 r9217, r9083, r9084; +} +{ +mul.f16x2 r9220, r9217, r9074; +} +{ +add.f16x2 r9223, r9214, r9220; +} +{ +sub.f16x2 r9226, r9089, r9090; +} +{ +mul.f16x2 r9229, r9226, r9072; +} +{ +sub.f16x2 r9232, r9095, r9096; +} +{ +mul.f16x2 r9235, r9232, r9075; +} +{ +add.f16x2 r9238, r9229, r9235; +} +{ +add.f16x2 r9241, r9223, r9238; +} +{ +add.f16x2 r9244, r9089, r9090; +} +{ +mul.f16x2 r9247, r9244, r9066; +} +{ +add.f16x2 r9250, r9092, r9247; +} +{ +add.f16x2 r9253, r9095, r9096; +} +{ +mul.f16x2 r9256, r9253, r9070; +} +{ +add.f16x2 r9259, r9250, r9256; +} +{ +sub.f16x2 r9262, r9077, r9078; +} +{ +mul.f16x2 r9265, r9262, r9068; +} +{ +sub.f16x2 r9268, r9083, r9084; +} +{ +mul.f16x2 r9271, r9268, r9072; +} +{ +add.f16x2 r9274, r9265, r9271; +} +{ +add.f16x2 r9277, r9259, r9274; +} +{ +add.f16x2 r9280, r9089, r9090; +} +{ +mul.f16x2 r9283, r9280, r9066; +} +{ +add.f16x2 r9286, r9092, r9283; +} +{ +add.f16x2 r9289, r9095, r9096; +} +{ +mul.f16x2 r9292, r9289, r9070; +} +{ +add.f16x2 r9295, r9286, r9292; +} +{ +sub.f16x2 r9298, r9077, r9078; +} +{ +mul.f16x2 r9301, r9298, r9068; +} +{ +sub.f16x2 r9304, r9083, r9084; +} +{ +mul.f16x2 r9307, r9304, r9072; +} +{ +add.f16x2 r9310, r9301, r9307; +} +{ +sub.f16x2 r9313, r9295, r9310; +} +{ +add.f16x2 r9316, r9089, r9090; +} +{ +mul.f16x2 r9319, r9316, r9070; +} +{ +add.f16x2 r9322, r9092, r9319; +} +{ +add.f16x2 r9325, r9095, r9096; +} +{ +mul.f16x2 r9328, r9325, r9074; +} +{ +add.f16x2 r9331, r9322, r9328; +} +{ +sub.f16x2 r9334, r9077, r9078; +} +{ +mul.f16x2 r9337, r9334, r9072; +} +{ +sub.f16x2 r9340, r9083, r9084; +} +{ +mul.f16x2 r9343, r9340, r9075; +} +{ +add.f16x2 r9346, r9337, r9343; +} +{ +add.f16x2 r9349, r9331, r9346; +} +{ +add.f16x2 r9352, r9089, r9090; +} +{ +mul.f16x2 r9355, r9352, r9070; +} +{ +add.f16x2 r9358, r9092, r9355; +} +{ +add.f16x2 r9361, 
r9095, r9096; +} +{ +mul.f16x2 r9364, r9361, r9074; +} +{ +add.f16x2 r9367, r9358, r9364; +} +{ +sub.f16x2 r9370, r9077, r9078; +} +{ +mul.f16x2 r9373, r9370, r9072; +} +{ +sub.f16x2 r9376, r9083, r9084; +} +{ +mul.f16x2 r9379, r9376, r9075; +} +{ +add.f16x2 r9382, r9373, r9379; +} +{ +sub.f16x2 r9385, r9367, r9382; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r9388, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r9389, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r9390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r9391, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r9392, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r9393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r9394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r9395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r9398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r9399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r9402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r9403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r9405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9410, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r9411, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r9419, {low, high}; +} +{ +mul.f16x2 r9436, r8167, r9388; +} +{ +mul.f16x2 r9439, r8311, r9389; +} +{ +sub.f16x2 r9442, r9436, r9439; +} +{ +mul.f16x2 r9445, r8167, r9389; +} +{ +fma.rn.f16x2 r9448, r8311, r9388, r9445; +} +{ +mul.f16x2 r9452, r8489, r9390; +} +{ +mul.f16x2 r9455, r8633, r9391; +} +{ +sub.f16x2 r9458, r9452, r9455; +} +{ +mul.f16x2 r9461, r8489, r9391; +} +{ +fma.rn.f16x2 r9464, r8633, r9390, r9461; +} +{ +mul.f16x2 r9468, r8811, r9392; +} +{ +mul.f16x2 r9471, r8955, r9393; +} +{ +sub.f16x2 r9474, r9468, r9471; +} +{ +mul.f16x2 r9477, r8811, r9393; +} +{ +fma.rn.f16x2 r9480, r8955, r9392, r9477; +} +{ +mul.f16x2 r9484, r9133, r9394; +} +{ +mul.f16x2 r9487, r9277, r9395; +} +{ +sub.f16x2 r9490, r9484, r9487; +} +{ +mul.f16x2 r9493, r9133, r9395; +} +{ +fma.rn.f16x2 r9496, r9277, r9394, r9493; +} +{ +mul.f16x2 r9500, r8239, r9390; +} +{ +mul.f16x2 r9503, r8383, r9391; +} +{ +sub.f16x2 r9506, r9500, r9503; +} +{ +mul.f16x2 r9509, r8239, r9391; +} +{ +fma.rn.f16x2 r9512, r8383, r9390, r9509; +} +{ +mul.f16x2 r9516, r8561, r9394; +} +{ +mul.f16x2 r9519, r8705, r9395; +} +{ +sub.f16x2 r9522, r9516, r9519; +} +{ +mul.f16x2 r9525, r8561, r9395; +} +{ +fma.rn.f16x2 r9528, r8705, r9394, r9525; +} +{ +mul.f16x2 r9532, r8883, r9398; +} +{ +mul.f16x2 r9535, r9027, r9399; +} +{ +sub.f16x2 r9538, r9532, r9535; +} +{ +mul.f16x2 r9541, r8883, r9399; +} +{ +fma.rn.f16x2 r9544, r9027, r9398, r9541; +} +{ +mul.f16x2 r9548, r9205, r9402; +} +{ +mul.f16x2 r9551, r9349, r9403; +} +{ +sub.f16x2 r9554, r9548, r9551; +} +{ +mul.f16x2 r9557, r9205, r9403; +} +{ +fma.rn.f16x2 r9560, r9349, r9402, r9557; +} +{ +mul.f16x2 r9564, r8275, r9392; +} +{ +mul.f16x2 r9567, r8419, r9393; +} +{ 
+sub.f16x2 r9570, r9564, r9567; +} +{ +mul.f16x2 r9573, r8275, r9393; +} +{ +fma.rn.f16x2 r9576, r8419, r9392, r9573; +} +{ +mul.f16x2 r9580, r8597, r9398; +} +{ +mul.f16x2 r9583, r8741, r9399; +} +{ +sub.f16x2 r9586, r9580, r9583; +} +{ +mul.f16x2 r9589, r8597, r9399; +} +{ +fma.rn.f16x2 r9592, r8741, r9398, r9589; +} +{ +mul.f16x2 r9596, r8919, r9404; +} +{ +mul.f16x2 r9599, r9063, r9405; +} +{ +sub.f16x2 r9602, r9596, r9599; +} +{ +mul.f16x2 r9605, r8919, r9405; +} +{ +fma.rn.f16x2 r9608, r9063, r9404, r9605; +} +{ +mul.f16x2 r9612, r9241, r9410; +} +{ +mul.f16x2 r9615, r9385, r9411; +} +{ +sub.f16x2 r9618, r9612, r9615; +} +{ +mul.f16x2 r9621, r9241, r9411; +} +{ +fma.rn.f16x2 r9624, r9385, r9410, r9621; +} +{ +mul.f16x2 r9628, r8203, r9394; +} +{ +mul.f16x2 r9631, r8347, r9395; +} +{ +sub.f16x2 r9634, r9628, r9631; +} +{ +mul.f16x2 r9637, r8203, r9395; +} +{ +fma.rn.f16x2 r9640, r8347, r9394, r9637; +} +{ +mul.f16x2 r9644, r8525, r9402; +} +{ +mul.f16x2 r9647, r8669, r9403; +} +{ +sub.f16x2 r9650, r9644, r9647; +} +{ +mul.f16x2 r9653, r8525, r9403; +} +{ +fma.rn.f16x2 r9656, r8669, r9402, r9653; +} +{ +mul.f16x2 r9660, r8847, r9410; +} +{ +mul.f16x2 r9663, r8991, r9411; +} +{ +sub.f16x2 r9666, r9660, r9663; +} +{ +mul.f16x2 r9669, r8847, r9411; +} +{ +fma.rn.f16x2 r9672, r8991, r9410, r9669; +} +{ +mul.f16x2 r9676, r9169, r9418; +} +{ +mul.f16x2 r9679, r9313, r9419; +} +{ +sub.f16x2 r9682, r9676, r9679; +} +{ +mul.f16x2 r9685, r9169, r9419; +} +{ +fma.rn.f16x2 r9688, r9313, r9418, r9685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9693, {low, high}; +} +{ +neg.f16x2 r9694, r9693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9697, {low, high}; +} 
+{ +neg.f16x2 r9698, r9697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9700, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9701, {low, high}; +} +{ +add.f16x2 r9702, r8119, r9085; +} +{ +add.f16x2 r9705, r7797, r9702; +} +{ +add.f16x2 r9708, r8441, r8763; +} +{ +add.f16x2 %0, r9705, r9708; +} +{ +add.f16x2 r9714, r8131, r9097; +} +{ +add.f16x2 r9717, r7809, r9714; +} +{ +add.f16x2 r9720, r8453, r8775; +} +{ +add.f16x2 %1, r9717, r9720; +} +{ +add.f16x2 r9726, r8119, r9085; +} +{ +mul.f16x2 r9729, r9726, r9692; +} +{ +add.f16x2 r9732, r7797, r9729; +} +{ +add.f16x2 r9735, r8441, r8763; +} +{ +mul.f16x2 r9738, r9735, r9696; +} +{ +add.f16x2 r9741, r9732, r9738; +} +{ +sub.f16x2 r9744, r8131, r9097; +} +{ +mul.f16x2 r9747, r9744, r9694; +} +{ +sub.f16x2 r9750, r8453, r8775; +} +{ +mul.f16x2 r9753, r9750, r9698; +} +{ +add.f16x2 r9756, r9747, r9753; +} +{ +sub.f16x2 %10, r9741, r9756; +} +{ +add.f16x2 r9762, r8119, r9085; +} +{ +mul.f16x2 r9765, r9762, r9692; +} +{ +add.f16x2 r9768, r7797, r9765; +} +{ +add.f16x2 r9771, r8441, r8763; +} +{ +mul.f16x2 r9774, r9771, r9696; +} +{ +add.f16x2 r9777, r9768, r9774; +} +{ +sub.f16x2 r9780, r8131, r9097; +} +{ +mul.f16x2 r9783, r9780, r9694; +} +{ +sub.f16x2 r9786, r8453, r8775; +} +{ +mul.f16x2 r9789, r9786, r9698; +} +{ +add.f16x2 r9792, r9783, r9789; +} +{ +add.f16x2 %40, r9777, r9792; +} +{ +add.f16x2 r9798, r8119, r9085; +} +{ +mul.f16x2 r9801, r9798, r9696; +} +{ +add.f16x2 r9804, r7797, r9801; +} +{ +add.f16x2 r9807, r8441, r8763; +} +{ +mul.f16x2 r9810, r9807, r9700; +} +{ +add.f16x2 r9813, r9804, r9810; +} +{ +sub.f16x2 r9816, r8131, r9097; +} +{ +mul.f16x2 r9819, r9816, r9698; +} +{ +sub.f16x2 r9822, r8453, r8775; +} +{ +mul.f16x2 r9825, r9822, r9701; +} +{ +add.f16x2 r9828, r9819, r9825; +} +{ +sub.f16x2 %20, r9813, r9828; +} +{ +add.f16x2 r9834, r8119, r9085; +} +{ +mul.f16x2 r9837, r9834, r9696; +} +{ 
+add.f16x2 r9840, r7797, r9837; +} +{ +add.f16x2 r9843, r8441, r8763; +} +{ +mul.f16x2 r9846, r9843, r9700; +} +{ +add.f16x2 r9849, r9840, r9846; +} +{ +sub.f16x2 r9852, r8131, r9097; +} +{ +mul.f16x2 r9855, r9852, r9698; +} +{ +sub.f16x2 r9858, r8453, r8775; +} +{ +mul.f16x2 r9861, r9858, r9701; +} +{ +add.f16x2 r9864, r9855, r9861; +} +{ +add.f16x2 %30, r9849, r9864; +} +{ +add.f16x2 r9870, r8131, r9097; +} +{ +mul.f16x2 r9873, r9870, r9692; +} +{ +add.f16x2 r9876, r7809, r9873; +} +{ +add.f16x2 r9879, r8453, r8775; +} +{ +mul.f16x2 r9882, r9879, r9696; +} +{ +add.f16x2 r9885, r9876, r9882; +} +{ +sub.f16x2 r9888, r8119, r9085; +} +{ +mul.f16x2 r9891, r9888, r9694; +} +{ +sub.f16x2 r9894, r8441, r8763; +} +{ +mul.f16x2 r9897, r9894, r9698; +} +{ +add.f16x2 r9900, r9891, r9897; +} +{ +add.f16x2 %11, r9885, r9900; +} +{ +add.f16x2 r9906, r8131, r9097; +} +{ +mul.f16x2 r9909, r9906, r9692; +} +{ +add.f16x2 r9912, r7809, r9909; +} +{ +add.f16x2 r9915, r8453, r8775; +} +{ +mul.f16x2 r9918, r9915, r9696; +} +{ +add.f16x2 r9921, r9912, r9918; +} +{ +sub.f16x2 r9924, r8119, r9085; +} +{ +mul.f16x2 r9927, r9924, r9694; +} +{ +sub.f16x2 r9930, r8441, r8763; +} +{ +mul.f16x2 r9933, r9930, r9698; +} +{ +add.f16x2 r9936, r9927, r9933; +} +{ +sub.f16x2 %41, r9921, r9936; +} +{ +add.f16x2 r9942, r8131, r9097; +} +{ +mul.f16x2 r9945, r9942, r9696; +} +{ +add.f16x2 r9948, r7809, r9945; +} +{ +add.f16x2 r9951, r8453, r8775; +} +{ +mul.f16x2 r9954, r9951, r9700; +} +{ +add.f16x2 r9957, r9948, r9954; +} +{ +sub.f16x2 r9960, r8119, r9085; +} +{ +mul.f16x2 r9963, r9960, r9698; +} +{ +sub.f16x2 r9966, r8441, r8763; +} +{ +mul.f16x2 r9969, r9966, r9701; +} +{ +add.f16x2 r9972, r9963, r9969; +} +{ +add.f16x2 %21, r9957, r9972; +} +{ +add.f16x2 r9978, r8131, r9097; +} +{ +mul.f16x2 r9981, r9978, r9696; +} +{ +add.f16x2 r9984, r7809, r9981; +} +{ +add.f16x2 r9987, r8453, r8775; +} +{ +mul.f16x2 r9990, r9987, r9700; +} +{ +add.f16x2 r9993, r9984, r9990; +} +{ +sub.f16x2 r9996, r8119, r9085; 
+} +{ +mul.f16x2 r9999, r9996, r9698; +} +{ +sub.f16x2 r10002, r8441, r8763; +} +{ +mul.f16x2 r10005, r10002, r9701; +} +{ +add.f16x2 r10008, r9999, r10005; +} +{ +sub.f16x2 %31, r9993, r10008; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10015, {low, high}; +} +{ +neg.f16x2 r10016, r10015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10019, {low, high}; +} +{ +neg.f16x2 r10020, r10019; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10023, {low, high}; +} +{ +add.f16x2 r10024, r9442, r9490; +} +{ +add.f16x2 r10027, r7845, r10024; +} +{ +add.f16x2 r10030, r9458, r9474; +} +{ +add.f16x2 %2, r10027, r10030; +} +{ +add.f16x2 r10036, r9448, r9496; +} +{ +add.f16x2 r10039, r7989, r10036; +} +{ +add.f16x2 r10042, r9464, r9480; +} +{ +add.f16x2 %3, r10039, r10042; +} +{ +add.f16x2 r10048, r9442, r9490; +} +{ +mul.f16x2 r10051, r10048, r10014; +} +{ +add.f16x2 r10054, r7845, r10051; +} +{ +add.f16x2 r10057, r9458, r9474; +} +{ +mul.f16x2 r10060, r10057, r10018; +} +{ +add.f16x2 r10063, r10054, r10060; +} +{ +sub.f16x2 r10066, r9448, r9496; +} +{ +mul.f16x2 r10069, r10066, r10016; +} +{ +sub.f16x2 r10072, r9464, r9480; +} +{ +mul.f16x2 r10075, r10072, r10020; +} +{ +add.f16x2 r10078, r10069, r10075; +} +{ +sub.f16x2 %12, r10063, r10078; +} +{ +add.f16x2 r10084, r9442, r9490; +} +{ +mul.f16x2 r10087, r10084, r10014; +} +{ +add.f16x2 r10090, r7845, r10087; +} +{ +add.f16x2 r10093, r9458, r9474; +} +{ +mul.f16x2 r10096, r10093, r10018; +} +{ +add.f16x2 r10099, r10090, r10096; +} +{ +sub.f16x2 r10102, 
r9448, r9496; +} +{ +mul.f16x2 r10105, r10102, r10016; +} +{ +sub.f16x2 r10108, r9464, r9480; +} +{ +mul.f16x2 r10111, r10108, r10020; +} +{ +add.f16x2 r10114, r10105, r10111; +} +{ +add.f16x2 %42, r10099, r10114; +} +{ +add.f16x2 r10120, r9442, r9490; +} +{ +mul.f16x2 r10123, r10120, r10018; +} +{ +add.f16x2 r10126, r7845, r10123; +} +{ +add.f16x2 r10129, r9458, r9474; +} +{ +mul.f16x2 r10132, r10129, r10022; +} +{ +add.f16x2 r10135, r10126, r10132; +} +{ +sub.f16x2 r10138, r9448, r9496; +} +{ +mul.f16x2 r10141, r10138, r10020; +} +{ +sub.f16x2 r10144, r9464, r9480; +} +{ +mul.f16x2 r10147, r10144, r10023; +} +{ +add.f16x2 r10150, r10141, r10147; +} +{ +sub.f16x2 %22, r10135, r10150; +} +{ +add.f16x2 r10156, r9442, r9490; +} +{ +mul.f16x2 r10159, r10156, r10018; +} +{ +add.f16x2 r10162, r7845, r10159; +} +{ +add.f16x2 r10165, r9458, r9474; +} +{ +mul.f16x2 r10168, r10165, r10022; +} +{ +add.f16x2 r10171, r10162, r10168; +} +{ +sub.f16x2 r10174, r9448, r9496; +} +{ +mul.f16x2 r10177, r10174, r10020; +} +{ +sub.f16x2 r10180, r9464, r9480; +} +{ +mul.f16x2 r10183, r10180, r10023; +} +{ +add.f16x2 r10186, r10177, r10183; +} +{ +add.f16x2 %32, r10171, r10186; +} +{ +add.f16x2 r10192, r9448, r9496; +} +{ +mul.f16x2 r10195, r10192, r10014; +} +{ +add.f16x2 r10198, r7989, r10195; +} +{ +add.f16x2 r10201, r9464, r9480; +} +{ +mul.f16x2 r10204, r10201, r10018; +} +{ +add.f16x2 r10207, r10198, r10204; +} +{ +sub.f16x2 r10210, r9442, r9490; +} +{ +mul.f16x2 r10213, r10210, r10016; +} +{ +sub.f16x2 r10216, r9458, r9474; +} +{ +mul.f16x2 r10219, r10216, r10020; +} +{ +add.f16x2 r10222, r10213, r10219; +} +{ +add.f16x2 %13, r10207, r10222; +} +{ +add.f16x2 r10228, r9448, r9496; +} +{ +mul.f16x2 r10231, r10228, r10014; +} +{ +add.f16x2 r10234, r7989, r10231; +} +{ +add.f16x2 r10237, r9464, r9480; +} +{ +mul.f16x2 r10240, r10237, r10018; +} +{ +add.f16x2 r10243, r10234, r10240; +} +{ +sub.f16x2 r10246, r9442, r9490; +} +{ +mul.f16x2 r10249, r10246, r10016; +} +{ +sub.f16x2 r10252, 
r9458, r9474; +} +{ +mul.f16x2 r10255, r10252, r10020; +} +{ +add.f16x2 r10258, r10249, r10255; +} +{ +sub.f16x2 %43, r10243, r10258; +} +{ +add.f16x2 r10264, r9448, r9496; +} +{ +mul.f16x2 r10267, r10264, r10018; +} +{ +add.f16x2 r10270, r7989, r10267; +} +{ +add.f16x2 r10273, r9464, r9480; +} +{ +mul.f16x2 r10276, r10273, r10022; +} +{ +add.f16x2 r10279, r10270, r10276; +} +{ +sub.f16x2 r10282, r9442, r9490; +} +{ +mul.f16x2 r10285, r10282, r10020; +} +{ +sub.f16x2 r10288, r9458, r9474; +} +{ +mul.f16x2 r10291, r10288, r10023; +} +{ +add.f16x2 r10294, r10285, r10291; +} +{ +add.f16x2 %23, r10279, r10294; +} +{ +add.f16x2 r10300, r9448, r9496; +} +{ +mul.f16x2 r10303, r10300, r10018; +} +{ +add.f16x2 r10306, r7989, r10303; +} +{ +add.f16x2 r10309, r9464, r9480; +} +{ +mul.f16x2 r10312, r10309, r10022; +} +{ +add.f16x2 r10315, r10306, r10312; +} +{ +sub.f16x2 r10318, r9442, r9490; +} +{ +mul.f16x2 r10321, r10318, r10020; +} +{ +sub.f16x2 r10324, r9458, r9474; +} +{ +mul.f16x2 r10327, r10324, r10023; +} +{ +add.f16x2 r10330, r10321, r10327; +} +{ +sub.f16x2 %33, r10315, r10330; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10336, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10337, {low, high}; +} +{ +neg.f16x2 r10338, r10337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10341, {low, high}; +} +{ +neg.f16x2 r10342, r10341; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10344, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10345, {low, high}; +} +{ +add.f16x2 r10346, r9506, r9554; +} +{ +add.f16x2 r10349, r7917, r10346; +} +{ +add.f16x2 r10352, r9522, r9538; +} +{ +add.f16x2 %4, r10349, r10352; +} +{ 
+add.f16x2 r10358, r9512, r9560; +} +{ +add.f16x2 r10361, r8061, r10358; +} +{ +add.f16x2 r10364, r9528, r9544; +} +{ +add.f16x2 %5, r10361, r10364; +} +{ +add.f16x2 r10370, r9506, r9554; +} +{ +mul.f16x2 r10373, r10370, r10336; +} +{ +add.f16x2 r10376, r7917, r10373; +} +{ +add.f16x2 r10379, r9522, r9538; +} +{ +mul.f16x2 r10382, r10379, r10340; +} +{ +add.f16x2 r10385, r10376, r10382; +} +{ +sub.f16x2 r10388, r9512, r9560; +} +{ +mul.f16x2 r10391, r10388, r10338; +} +{ +sub.f16x2 r10394, r9528, r9544; +} +{ +mul.f16x2 r10397, r10394, r10342; +} +{ +add.f16x2 r10400, r10391, r10397; +} +{ +sub.f16x2 %14, r10385, r10400; +} +{ +add.f16x2 r10406, r9506, r9554; +} +{ +mul.f16x2 r10409, r10406, r10336; +} +{ +add.f16x2 r10412, r7917, r10409; +} +{ +add.f16x2 r10415, r9522, r9538; +} +{ +mul.f16x2 r10418, r10415, r10340; +} +{ +add.f16x2 r10421, r10412, r10418; +} +{ +sub.f16x2 r10424, r9512, r9560; +} +{ +mul.f16x2 r10427, r10424, r10338; +} +{ +sub.f16x2 r10430, r9528, r9544; +} +{ +mul.f16x2 r10433, r10430, r10342; +} +{ +add.f16x2 r10436, r10427, r10433; +} +{ +add.f16x2 %44, r10421, r10436; +} +{ +add.f16x2 r10442, r9506, r9554; +} +{ +mul.f16x2 r10445, r10442, r10340; +} +{ +add.f16x2 r10448, r7917, r10445; +} +{ +add.f16x2 r10451, r9522, r9538; +} +{ +mul.f16x2 r10454, r10451, r10344; +} +{ +add.f16x2 r10457, r10448, r10454; +} +{ +sub.f16x2 r10460, r9512, r9560; +} +{ +mul.f16x2 r10463, r10460, r10342; +} +{ +sub.f16x2 r10466, r9528, r9544; +} +{ +mul.f16x2 r10469, r10466, r10345; +} +{ +add.f16x2 r10472, r10463, r10469; +} +{ +sub.f16x2 %24, r10457, r10472; +} +{ +add.f16x2 r10478, r9506, r9554; +} +{ +mul.f16x2 r10481, r10478, r10340; +} +{ +add.f16x2 r10484, r7917, r10481; +} +{ +add.f16x2 r10487, r9522, r9538; +} +{ +mul.f16x2 r10490, r10487, r10344; +} +{ +add.f16x2 r10493, r10484, r10490; +} +{ +sub.f16x2 r10496, r9512, r9560; +} +{ +mul.f16x2 r10499, r10496, r10342; +} +{ +sub.f16x2 r10502, r9528, r9544; +} +{ +mul.f16x2 r10505, r10502, r10345; +} +{ 
+add.f16x2 r10508, r10499, r10505; +} +{ +add.f16x2 %34, r10493, r10508; +} +{ +add.f16x2 r10514, r9512, r9560; +} +{ +mul.f16x2 r10517, r10514, r10336; +} +{ +add.f16x2 r10520, r8061, r10517; +} +{ +add.f16x2 r10523, r9528, r9544; +} +{ +mul.f16x2 r10526, r10523, r10340; +} +{ +add.f16x2 r10529, r10520, r10526; +} +{ +sub.f16x2 r10532, r9506, r9554; +} +{ +mul.f16x2 r10535, r10532, r10338; +} +{ +sub.f16x2 r10538, r9522, r9538; +} +{ +mul.f16x2 r10541, r10538, r10342; +} +{ +add.f16x2 r10544, r10535, r10541; +} +{ +add.f16x2 %15, r10529, r10544; +} +{ +add.f16x2 r10550, r9512, r9560; +} +{ +mul.f16x2 r10553, r10550, r10336; +} +{ +add.f16x2 r10556, r8061, r10553; +} +{ +add.f16x2 r10559, r9528, r9544; +} +{ +mul.f16x2 r10562, r10559, r10340; +} +{ +add.f16x2 r10565, r10556, r10562; +} +{ +sub.f16x2 r10568, r9506, r9554; +} +{ +mul.f16x2 r10571, r10568, r10338; +} +{ +sub.f16x2 r10574, r9522, r9538; +} +{ +mul.f16x2 r10577, r10574, r10342; +} +{ +add.f16x2 r10580, r10571, r10577; +} +{ +sub.f16x2 %45, r10565, r10580; +} +{ +add.f16x2 r10586, r9512, r9560; +} +{ +mul.f16x2 r10589, r10586, r10340; +} +{ +add.f16x2 r10592, r8061, r10589; +} +{ +add.f16x2 r10595, r9528, r9544; +} +{ +mul.f16x2 r10598, r10595, r10344; +} +{ +add.f16x2 r10601, r10592, r10598; +} +{ +sub.f16x2 r10604, r9506, r9554; +} +{ +mul.f16x2 r10607, r10604, r10342; +} +{ +sub.f16x2 r10610, r9522, r9538; +} +{ +mul.f16x2 r10613, r10610, r10345; +} +{ +add.f16x2 r10616, r10607, r10613; +} +{ +add.f16x2 %25, r10601, r10616; +} +{ +add.f16x2 r10622, r9512, r9560; +} +{ +mul.f16x2 r10625, r10622, r10340; +} +{ +add.f16x2 r10628, r8061, r10625; +} +{ +add.f16x2 r10631, r9528, r9544; +} +{ +mul.f16x2 r10634, r10631, r10344; +} +{ +add.f16x2 r10637, r10628, r10634; +} +{ +sub.f16x2 r10640, r9506, r9554; +} +{ +mul.f16x2 r10643, r10640, r10342; +} +{ +sub.f16x2 r10646, r9522, r9538; +} +{ +mul.f16x2 r10649, r10646, r10345; +} +{ +add.f16x2 r10652, r10643, r10649; +} +{ +sub.f16x2 %35, r10637, r10652; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10658, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10659, {low, high}; +} +{ +neg.f16x2 r10660, r10659; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10662, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10663, {low, high}; +} +{ +neg.f16x2 r10664, r10663; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10666, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10667, {low, high}; +} +{ +add.f16x2 r10668, r9570, r9618; +} +{ +add.f16x2 r10671, r7953, r10668; +} +{ +add.f16x2 r10674, r9586, r9602; +} +{ +add.f16x2 %6, r10671, r10674; +} +{ +add.f16x2 r10680, r9576, r9624; +} +{ +add.f16x2 r10683, r8097, r10680; +} +{ +add.f16x2 r10686, r9592, r9608; +} +{ +add.f16x2 %7, r10683, r10686; +} +{ +add.f16x2 r10692, r9570, r9618; +} +{ +mul.f16x2 r10695, r10692, r10658; +} +{ +add.f16x2 r10698, r7953, r10695; +} +{ +add.f16x2 r10701, r9586, r9602; +} +{ +mul.f16x2 r10704, r10701, r10662; +} +{ +add.f16x2 r10707, r10698, r10704; +} +{ +sub.f16x2 r10710, r9576, r9624; +} +{ +mul.f16x2 r10713, r10710, r10660; +} +{ +sub.f16x2 r10716, r9592, r9608; +} +{ +mul.f16x2 r10719, r10716, r10664; +} +{ +add.f16x2 r10722, r10713, r10719; +} +{ +sub.f16x2 %16, r10707, r10722; +} +{ +add.f16x2 r10728, r9570, r9618; +} +{ +mul.f16x2 r10731, r10728, r10658; +} +{ +add.f16x2 r10734, r7953, r10731; +} +{ +add.f16x2 r10737, r9586, r9602; +} +{ +mul.f16x2 r10740, r10737, r10662; +} +{ +add.f16x2 r10743, r10734, r10740; +} +{ +sub.f16x2 r10746, r9576, r9624; +} +{ +mul.f16x2 r10749, r10746, r10660; +} +{ +sub.f16x2 r10752, r9592, r9608; +} +{ +mul.f16x2 r10755, r10752, r10664; +} +{ +add.f16x2 r10758, r10749, r10755; +} +{ +add.f16x2 %46, r10743, 
r10758; +} +{ +add.f16x2 r10764, r9570, r9618; +} +{ +mul.f16x2 r10767, r10764, r10662; +} +{ +add.f16x2 r10770, r7953, r10767; +} +{ +add.f16x2 r10773, r9586, r9602; +} +{ +mul.f16x2 r10776, r10773, r10666; +} +{ +add.f16x2 r10779, r10770, r10776; +} +{ +sub.f16x2 r10782, r9576, r9624; +} +{ +mul.f16x2 r10785, r10782, r10664; +} +{ +sub.f16x2 r10788, r9592, r9608; +} +{ +mul.f16x2 r10791, r10788, r10667; +} +{ +add.f16x2 r10794, r10785, r10791; +} +{ +sub.f16x2 %26, r10779, r10794; +} +{ +add.f16x2 r10800, r9570, r9618; +} +{ +mul.f16x2 r10803, r10800, r10662; +} +{ +add.f16x2 r10806, r7953, r10803; +} +{ +add.f16x2 r10809, r9586, r9602; +} +{ +mul.f16x2 r10812, r10809, r10666; +} +{ +add.f16x2 r10815, r10806, r10812; +} +{ +sub.f16x2 r10818, r9576, r9624; +} +{ +mul.f16x2 r10821, r10818, r10664; +} +{ +sub.f16x2 r10824, r9592, r9608; +} +{ +mul.f16x2 r10827, r10824, r10667; +} +{ +add.f16x2 r10830, r10821, r10827; +} +{ +add.f16x2 %36, r10815, r10830; +} +{ +add.f16x2 r10836, r9576, r9624; +} +{ +mul.f16x2 r10839, r10836, r10658; +} +{ +add.f16x2 r10842, r8097, r10839; +} +{ +add.f16x2 r10845, r9592, r9608; +} +{ +mul.f16x2 r10848, r10845, r10662; +} +{ +add.f16x2 r10851, r10842, r10848; +} +{ +sub.f16x2 r10854, r9570, r9618; +} +{ +mul.f16x2 r10857, r10854, r10660; +} +{ +sub.f16x2 r10860, r9586, r9602; +} +{ +mul.f16x2 r10863, r10860, r10664; +} +{ +add.f16x2 r10866, r10857, r10863; +} +{ +add.f16x2 %17, r10851, r10866; +} +{ +add.f16x2 r10872, r9576, r9624; +} +{ +mul.f16x2 r10875, r10872, r10658; +} +{ +add.f16x2 r10878, r8097, r10875; +} +{ +add.f16x2 r10881, r9592, r9608; +} +{ +mul.f16x2 r10884, r10881, r10662; +} +{ +add.f16x2 r10887, r10878, r10884; +} +{ +sub.f16x2 r10890, r9570, r9618; +} +{ +mul.f16x2 r10893, r10890, r10660; +} +{ +sub.f16x2 r10896, r9586, r9602; +} +{ +mul.f16x2 r10899, r10896, r10664; +} +{ +add.f16x2 r10902, r10893, r10899; +} +{ +sub.f16x2 %47, r10887, r10902; +} +{ +add.f16x2 r10908, r9576, r9624; +} +{ +mul.f16x2 r10911, r10908, 
r10662; +} +{ +add.f16x2 r10914, r8097, r10911; +} +{ +add.f16x2 r10917, r9592, r9608; +} +{ +mul.f16x2 r10920, r10917, r10666; +} +{ +add.f16x2 r10923, r10914, r10920; +} +{ +sub.f16x2 r10926, r9570, r9618; +} +{ +mul.f16x2 r10929, r10926, r10664; +} +{ +sub.f16x2 r10932, r9586, r9602; +} +{ +mul.f16x2 r10935, r10932, r10667; +} +{ +add.f16x2 r10938, r10929, r10935; +} +{ +add.f16x2 %27, r10923, r10938; +} +{ +add.f16x2 r10944, r9576, r9624; +} +{ +mul.f16x2 r10947, r10944, r10662; +} +{ +add.f16x2 r10950, r8097, r10947; +} +{ +add.f16x2 r10953, r9592, r9608; +} +{ +mul.f16x2 r10956, r10953, r10666; +} +{ +add.f16x2 r10959, r10950, r10956; +} +{ +sub.f16x2 r10962, r9570, r9618; +} +{ +mul.f16x2 r10965, r10962, r10664; +} +{ +sub.f16x2 r10968, r9586, r9602; +} +{ +mul.f16x2 r10971, r10968, r10667; +} +{ +add.f16x2 r10974, r10965, r10971; +} +{ +sub.f16x2 %37, r10959, r10974; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10980, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10981, {low, high}; +} +{ +neg.f16x2 r10982, r10981; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10984, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10985, {low, high}; +} +{ +neg.f16x2 r10986, r10985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10988, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10989, {low, high}; +} +{ +add.f16x2 r10990, r9634, r9682; +} +{ +add.f16x2 r10993, r7881, r10990; +} +{ +add.f16x2 r10996, r9650, r9666; +} +{ +add.f16x2 %8, r10993, r10996; +} +{ +add.f16x2 r11002, r9640, r9688; +} +{ +add.f16x2 r11005, r8025, r11002; +} +{ +add.f16x2 r11008, r9656, r9672; +} +{ +add.f16x2 %9, r11005, r11008; +} +{ +add.f16x2 r11014, r9634, r9682; +} +{ +mul.f16x2 
r11017, r11014, r10980; +} +{ +add.f16x2 r11020, r7881, r11017; +} +{ +add.f16x2 r11023, r9650, r9666; +} +{ +mul.f16x2 r11026, r11023, r10984; +} +{ +add.f16x2 r11029, r11020, r11026; +} +{ +sub.f16x2 r11032, r9640, r9688; +} +{ +mul.f16x2 r11035, r11032, r10982; +} +{ +sub.f16x2 r11038, r9656, r9672; +} +{ +mul.f16x2 r11041, r11038, r10986; +} +{ +add.f16x2 r11044, r11035, r11041; +} +{ +sub.f16x2 %18, r11029, r11044; +} +{ +add.f16x2 r11050, r9634, r9682; +} +{ +mul.f16x2 r11053, r11050, r10980; +} +{ +add.f16x2 r11056, r7881, r11053; +} +{ +add.f16x2 r11059, r9650, r9666; +} +{ +mul.f16x2 r11062, r11059, r10984; +} +{ +add.f16x2 r11065, r11056, r11062; +} +{ +sub.f16x2 r11068, r9640, r9688; +} +{ +mul.f16x2 r11071, r11068, r10982; +} +{ +sub.f16x2 r11074, r9656, r9672; +} +{ +mul.f16x2 r11077, r11074, r10986; +} +{ +add.f16x2 r11080, r11071, r11077; +} +{ +add.f16x2 %48, r11065, r11080; +} +{ +add.f16x2 r11086, r9634, r9682; +} +{ +mul.f16x2 r11089, r11086, r10984; +} +{ +add.f16x2 r11092, r7881, r11089; +} +{ +add.f16x2 r11095, r9650, r9666; +} +{ +mul.f16x2 r11098, r11095, r10988; +} +{ +add.f16x2 r11101, r11092, r11098; +} +{ +sub.f16x2 r11104, r9640, r9688; +} +{ +mul.f16x2 r11107, r11104, r10986; +} +{ +sub.f16x2 r11110, r9656, r9672; +} +{ +mul.f16x2 r11113, r11110, r10989; +} +{ +add.f16x2 r11116, r11107, r11113; +} +{ +sub.f16x2 %28, r11101, r11116; +} +{ +add.f16x2 r11122, r9634, r9682; +} +{ +mul.f16x2 r11125, r11122, r10984; +} +{ +add.f16x2 r11128, r7881, r11125; +} +{ +add.f16x2 r11131, r9650, r9666; +} +{ +mul.f16x2 r11134, r11131, r10988; +} +{ +add.f16x2 r11137, r11128, r11134; +} +{ +sub.f16x2 r11140, r9640, r9688; +} +{ +mul.f16x2 r11143, r11140, r10986; +} +{ +sub.f16x2 r11146, r9656, r9672; +} +{ +mul.f16x2 r11149, r11146, r10989; +} +{ +add.f16x2 r11152, r11143, r11149; +} +{ +add.f16x2 %38, r11137, r11152; +} +{ +add.f16x2 r11158, r9640, r9688; +} +{ +mul.f16x2 r11161, r11158, r10980; +} +{ +add.f16x2 r11164, r8025, r11161; +} +{ 
+add.f16x2 r11167, r9656, r9672; +} +{ +mul.f16x2 r11170, r11167, r10984; +} +{ +add.f16x2 r11173, r11164, r11170; +} +{ +sub.f16x2 r11176, r9634, r9682; +} +{ +mul.f16x2 r11179, r11176, r10982; +} +{ +sub.f16x2 r11182, r9650, r9666; +} +{ +mul.f16x2 r11185, r11182, r10986; +} +{ +add.f16x2 r11188, r11179, r11185; +} +{ +add.f16x2 %19, r11173, r11188; +} +{ +add.f16x2 r11194, r9640, r9688; +} +{ +mul.f16x2 r11197, r11194, r10980; +} +{ +add.f16x2 r11200, r8025, r11197; +} +{ +add.f16x2 r11203, r9656, r9672; +} +{ +mul.f16x2 r11206, r11203, r10984; +} +{ +add.f16x2 r11209, r11200, r11206; +} +{ +sub.f16x2 r11212, r9634, r9682; +} +{ +mul.f16x2 r11215, r11212, r10982; +} +{ +sub.f16x2 r11218, r9650, r9666; +} +{ +mul.f16x2 r11221, r11218, r10986; +} +{ +add.f16x2 r11224, r11215, r11221; +} +{ +sub.f16x2 %49, r11209, r11224; +} +{ +add.f16x2 r11230, r9640, r9688; +} +{ +mul.f16x2 r11233, r11230, r10984; +} +{ +add.f16x2 r11236, r8025, r11233; +} +{ +add.f16x2 r11239, r9656, r9672; +} +{ +mul.f16x2 r11242, r11239, r10988; +} +{ +add.f16x2 r11245, r11236, r11242; +} +{ +sub.f16x2 r11248, r9634, r9682; +} +{ +mul.f16x2 r11251, r11248, r10986; +} +{ +sub.f16x2 r11254, r9650, r9666; +} +{ +mul.f16x2 r11257, r11254, r10989; +} +{ +add.f16x2 r11260, r11251, r11257; +} +{ +add.f16x2 %29, r11245, r11260; +} +{ +add.f16x2 r11266, r9640, r9688; +} +{ +mul.f16x2 r11269, r11266, r10984; +} +{ +add.f16x2 r11272, r8025, r11269; +} +{ +add.f16x2 r11275, r9656, r9672; +} +{ +mul.f16x2 r11278, r11275, r10988; +} +{ +add.f16x2 r11281, r11272, r11278; +} +{ +sub.f16x2 r11284, r9634, r9682; +} +{ +mul.f16x2 r11287, r11284, r10986; +} +{ +sub.f16x2 r11290, r9650, r9666; +} +{ +mul.f16x2 r11293, r11290, r10989; +} +{ +add.f16x2 r11296, r11287, r11293; +} +{ +sub.f16x2 %39, r11281, r11296; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), 
"=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), 
"r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1118, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.local .align 8 .b8 __local_depot1[200]; +.reg .b64 SP; +.reg .b64 SPL; +.reg .pred p<3>; +.reg .f32 f<695>; +.reg .b32 r<11314>; +.reg .b64 rd<19>; +mov.u64 SPL, __local_depot1; +add.u64 rd3, SPL, 0; +mov.u32 r3551, %tid.y; +mul.lo.s32 r1, r3551, 15625; +add.s64 rd4, rd3, 4; +mov.f32 f214, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r23, {low, high}; +} +mov.f32 f216, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r24, {low, high}; +} +{ +neg.f16x2 r25, r24; +} +mov.f32 f210, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r27, {low, high}; +} +mov.f32 f212, 0fBF167918; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r28, {low, high}; +} +{ +neg.f16x2 r29, r28; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r31, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r32, {low, high}; +} +{ +add.f16x2 r33, %61, %91; +} +{ +add.f16x2 r36, %51, r33; +} +{ +add.f16x2 r39, %71, %81; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %62, %92; +} +{ +add.f16x2 r48, %52, r45; +} +{ +add.f16x2 r51, %72, %82; +} +{ +add.f16x2 r54, r48, r51; +} +{ +add.f16x2 r57, %61, %91; +} +{ +mul.f16x2 r60, r57, r23; +} +{ +add.f16x2 r63, %51, r60; +} +{ +add.f16x2 r66, %71, %81; +} +{ +mul.f16x2 r69, r66, r27; +} +{ +add.f16x2 r72, r63, r69; +} +{ +sub.f16x2 r75, %62, %92; +} +{ +mul.f16x2 r78, r75, r25; +} +{ +sub.f16x2 r81, %72, %82; +} +{ +mul.f16x2 r84, r81, r29; +} +{ +add.f16x2 r87, r78, r84; +} +{ +sub.f16x2 r90, r72, r87; +} +{ +add.f16x2 r93, %61, %91; +} +{ +mul.f16x2 r96, r93, r23; +} +{ +add.f16x2 r99, %51, r96; +} +{ +add.f16x2 r102, %71, %81; +} +{ +mul.f16x2 r105, r102, r27; +} +{ +add.f16x2 r108, r99, r105; +} +{ +sub.f16x2 r111, %62, %92; +} +{ +mul.f16x2 r114, r111, r25; +} +{ +sub.f16x2 r117, %72, %82; +} +{ +mul.f16x2 r120, r117, r29; +} +{ +add.f16x2 r123, r114, r120; +} +{ +add.f16x2 r126, r108, r123; +} +{ +add.f16x2 r129, %61, %91; +} +{ +mul.f16x2 r132, r129, r27; +} +{ +add.f16x2 r135, %51, r132; +} +{ +add.f16x2 r138, %71, %81; +} +{ +mul.f16x2 r141, r138, r31; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %62, %92; +} +{ +mul.f16x2 r150, r147, r29; +} +{ +sub.f16x2 r153, %72, %82; +} +{ +mul.f16x2 r156, r153, r32; +} +{ +add.f16x2 r159, r150, r156; +} +{ +sub.f16x2 r162, r144, r159; +} +{ +add.f16x2 r165, %61, %91; +} +{ +mul.f16x2 r168, r165, r27; +} +{ +add.f16x2 r171, %51, r168; +} +{ +add.f16x2 r174, %71, %81; +} +{ +mul.f16x2 r177, r174, r31; +} +{ +add.f16x2 r180, r171, r177; +} +{ +sub.f16x2 
r183, %62, %92; +} +{ +mul.f16x2 r186, r183, r29; +} +{ +sub.f16x2 r189, %72, %82; +} +{ +mul.f16x2 r192, r189, r32; +} +{ +add.f16x2 r195, r186, r192; +} +{ +add.f16x2 r198, r180, r195; +} +{ +add.f16x2 r201, %62, %92; +} +{ +mul.f16x2 r204, r201, r23; +} +{ +add.f16x2 r207, %52, r204; +} +{ +add.f16x2 r210, %72, %82; +} +{ +mul.f16x2 r213, r210, r27; +} +{ +add.f16x2 r216, r207, r213; +} +{ +sub.f16x2 r219, %61, %91; +} +{ +mul.f16x2 r222, r219, r25; +} +{ +sub.f16x2 r225, %71, %81; +} +{ +mul.f16x2 r228, r225, r29; +} +{ +add.f16x2 r231, r222, r228; +} +{ +add.f16x2 r234, r216, r231; +} +{ +add.f16x2 r237, %62, %92; +} +{ +mul.f16x2 r240, r237, r23; +} +{ +add.f16x2 r243, %52, r240; +} +{ +add.f16x2 r246, %72, %82; +} +{ +mul.f16x2 r249, r246, r27; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %61, %91; +} +{ +mul.f16x2 r258, r255, r25; +} +{ +sub.f16x2 r261, %71, %81; +} +{ +mul.f16x2 r264, r261, r29; +} +{ +add.f16x2 r267, r258, r264; +} +{ +sub.f16x2 r270, r252, r267; +} +{ +add.f16x2 r273, %62, %92; +} +{ +mul.f16x2 r276, r273, r27; +} +{ +add.f16x2 r279, %52, r276; +} +{ +add.f16x2 r282, %72, %82; +} +{ +mul.f16x2 r285, r282, r31; +} +{ +add.f16x2 r288, r279, r285; +} +{ +sub.f16x2 r291, %61, %91; +} +{ +mul.f16x2 r294, r291, r29; +} +{ +sub.f16x2 r297, %71, %81; +} +{ +mul.f16x2 r300, r297, r32; +} +{ +add.f16x2 r303, r294, r300; +} +{ +add.f16x2 r306, r288, r303; +} +{ +add.f16x2 r309, %62, %92; +} +{ +mul.f16x2 r312, r309, r27; +} +{ +add.f16x2 r315, %52, r312; +} +{ +add.f16x2 r318, %72, %82; +} +{ +mul.f16x2 r321, r318, r31; +} +{ +add.f16x2 r324, r315, r321; +} +{ +sub.f16x2 r327, %61, %91; +} +{ +mul.f16x2 r330, r327, r29; +} +{ +sub.f16x2 r333, %71, %81; +} +{ +mul.f16x2 r336, r333, r32; +} +{ +add.f16x2 r339, r330, r336; +} +{ +sub.f16x2 r342, r324, r339; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 
high, f216; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r349, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r350, {low, high}; +} +{ +neg.f16x2 r351, r350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r354, {low, high}; +} +{ +add.f16x2 r355, %63, %93; +} +{ +add.f16x2 r358, %53, r355; +} +{ +add.f16x2 r361, %73, %83; +} +{ +add.f16x2 r364, r358, r361; +} +{ +add.f16x2 r367, %64, %94; +} +{ +add.f16x2 r370, %54, r367; +} +{ +add.f16x2 r373, %74, %84; +} +{ +add.f16x2 r376, r370, r373; +} +{ +add.f16x2 r379, %63, %93; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, %53, r382; +} +{ +add.f16x2 r388, %73, %83; +} +{ +mul.f16x2 r391, r388, r349; +} +{ +add.f16x2 r394, r385, r391; +} +{ +sub.f16x2 r397, %64, %94; +} +{ +mul.f16x2 r400, r397, r347; +} +{ +sub.f16x2 r403, %74, %84; +} +{ +mul.f16x2 r406, r403, r351; +} +{ +add.f16x2 r409, r400, r406; +} +{ +sub.f16x2 r412, r394, r409; +} +{ +add.f16x2 r415, %63, %93; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, %53, r418; +} +{ +add.f16x2 r424, %73, %83; +} +{ +mul.f16x2 r427, r424, r349; +} +{ +add.f16x2 r430, r421, r427; +} +{ +sub.f16x2 r433, %64, %94; +} +{ +mul.f16x2 r436, r433, r347; +} +{ +sub.f16x2 r439, %74, %84; +} +{ +mul.f16x2 r442, r439, r351; +} +{ +add.f16x2 r445, r436, r442; +} +{ +add.f16x2 r448, r430, r445; +} +{ +add.f16x2 r451, %63, %93; +} +{ +mul.f16x2 r454, r451, r349; +} +{ +add.f16x2 r457, %53, r454; +} +{ +add.f16x2 r460, %73, %83; +} +{ +mul.f16x2 r463, r460, r353; +} +{ +add.f16x2 r466, r457, r463; +} +{ +sub.f16x2 r469, %64, %94; +} +{ +mul.f16x2 r472, r469, r351; +} +{ +sub.f16x2 r475, %74, %84; +} +{ +mul.f16x2 r478, r475, r354; +} +{ +add.f16x2 r481, 
r472, r478; +} +{ +sub.f16x2 r484, r466, r481; +} +{ +add.f16x2 r487, %63, %93; +} +{ +mul.f16x2 r490, r487, r349; +} +{ +add.f16x2 r493, %53, r490; +} +{ +add.f16x2 r496, %73, %83; +} +{ +mul.f16x2 r499, r496, r353; +} +{ +add.f16x2 r502, r493, r499; +} +{ +sub.f16x2 r505, %64, %94; +} +{ +mul.f16x2 r508, r505, r351; +} +{ +sub.f16x2 r511, %74, %84; +} +{ +mul.f16x2 r514, r511, r354; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r502, r517; +} +{ +add.f16x2 r523, %64, %94; +} +{ +mul.f16x2 r526, r523, r345; +} +{ +add.f16x2 r529, %54, r526; +} +{ +add.f16x2 r532, %74, %84; +} +{ +mul.f16x2 r535, r532, r349; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, %63, %93; +} +{ +mul.f16x2 r544, r541, r347; +} +{ +sub.f16x2 r547, %73, %83; +} +{ +mul.f16x2 r550, r547, r351; +} +{ +add.f16x2 r553, r544, r550; +} +{ +add.f16x2 r556, r538, r553; +} +{ +add.f16x2 r559, %64, %94; +} +{ +mul.f16x2 r562, r559, r345; +} +{ +add.f16x2 r565, %54, r562; +} +{ +add.f16x2 r568, %74, %84; +} +{ +mul.f16x2 r571, r568, r349; +} +{ +add.f16x2 r574, r565, r571; +} +{ +sub.f16x2 r577, %63, %93; +} +{ +mul.f16x2 r580, r577, r347; +} +{ +sub.f16x2 r583, %73, %83; +} +{ +mul.f16x2 r586, r583, r351; +} +{ +add.f16x2 r589, r580, r586; +} +{ +sub.f16x2 r592, r574, r589; +} +{ +add.f16x2 r595, %64, %94; +} +{ +mul.f16x2 r598, r595, r349; +} +{ +add.f16x2 r601, %54, r598; +} +{ +add.f16x2 r604, %74, %84; +} +{ +mul.f16x2 r607, r604, r353; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, %63, %93; +} +{ +mul.f16x2 r616, r613, r351; +} +{ +sub.f16x2 r619, %73, %83; +} +{ +mul.f16x2 r622, r619, r354; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r610, r625; +} +{ +add.f16x2 r631, %64, %94; +} +{ +mul.f16x2 r634, r631, r349; +} +{ +add.f16x2 r637, %54, r634; +} +{ +add.f16x2 r640, %74, %84; +} +{ +mul.f16x2 r643, r640, r353; +} +{ +add.f16x2 r646, r637, r643; +} +{ +sub.f16x2 r649, %63, %93; +} +{ +mul.f16x2 r652, r649, r351; +} +{ +sub.f16x2 r655, %73, %83; +} 
+{ +mul.f16x2 r658, r655, r354; +} +{ +add.f16x2 r661, r652, r658; +} +{ +sub.f16x2 r664, r646, r661; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r668, {low, high}; +} +{ +neg.f16x2 r669, r668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r672, {low, high}; +} +{ +neg.f16x2 r673, r672; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r675, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r676, {low, high}; +} +{ +add.f16x2 r677, %65, %95; +} +{ +add.f16x2 r680, %55, r677; +} +{ +add.f16x2 r683, %75, %85; +} +{ +add.f16x2 r686, r680, r683; +} +{ +add.f16x2 r689, %66, %96; +} +{ +add.f16x2 r692, %56, r689; +} +{ +add.f16x2 r695, %76, %86; +} +{ +add.f16x2 r698, r692, r695; +} +{ +add.f16x2 r701, %65, %95; +} +{ +mul.f16x2 r704, r701, r667; +} +{ +add.f16x2 r707, %55, r704; +} +{ +add.f16x2 r710, %75, %85; +} +{ +mul.f16x2 r713, r710, r671; +} +{ +add.f16x2 r716, r707, r713; +} +{ +sub.f16x2 r719, %66, %96; +} +{ +mul.f16x2 r722, r719, r669; +} +{ +sub.f16x2 r725, %76, %86; +} +{ +mul.f16x2 r728, r725, r673; +} +{ +add.f16x2 r731, r722, r728; +} +{ +sub.f16x2 r734, r716, r731; +} +{ +add.f16x2 r737, %65, %95; +} +{ +mul.f16x2 r740, r737, r667; +} +{ +add.f16x2 r743, %55, r740; +} +{ +add.f16x2 r746, %75, %85; +} +{ +mul.f16x2 r749, r746, r671; +} +{ +add.f16x2 r752, r743, r749; +} +{ +sub.f16x2 r755, %66, %96; +} +{ +mul.f16x2 r758, r755, r669; +} +{ +sub.f16x2 r761, %76, %86; +} +{ +mul.f16x2 r764, r761, r673; +} +{ +add.f16x2 r767, r758, r764; +} +{ +add.f16x2 r770, r752, r767; +} +{ +add.f16x2 r773, %65, %95; +} +{ +mul.f16x2 r776, r773, r671; +} +{ 
+add.f16x2 r779, %55, r776; +} +{ +add.f16x2 r782, %75, %85; +} +{ +mul.f16x2 r785, r782, r675; +} +{ +add.f16x2 r788, r779, r785; +} +{ +sub.f16x2 r791, %66, %96; +} +{ +mul.f16x2 r794, r791, r673; +} +{ +sub.f16x2 r797, %76, %86; +} +{ +mul.f16x2 r800, r797, r676; +} +{ +add.f16x2 r803, r794, r800; +} +{ +sub.f16x2 r806, r788, r803; +} +{ +add.f16x2 r809, %65, %95; +} +{ +mul.f16x2 r812, r809, r671; +} +{ +add.f16x2 r815, %55, r812; +} +{ +add.f16x2 r818, %75, %85; +} +{ +mul.f16x2 r821, r818, r675; +} +{ +add.f16x2 r824, r815, r821; +} +{ +sub.f16x2 r827, %66, %96; +} +{ +mul.f16x2 r830, r827, r673; +} +{ +sub.f16x2 r833, %76, %86; +} +{ +mul.f16x2 r836, r833, r676; +} +{ +add.f16x2 r839, r830, r836; +} +{ +add.f16x2 r842, r824, r839; +} +{ +add.f16x2 r845, %66, %96; +} +{ +mul.f16x2 r848, r845, r667; +} +{ +add.f16x2 r851, %56, r848; +} +{ +add.f16x2 r854, %76, %86; +} +{ +mul.f16x2 r857, r854, r671; +} +{ +add.f16x2 r860, r851, r857; +} +{ +sub.f16x2 r863, %65, %95; +} +{ +mul.f16x2 r866, r863, r669; +} +{ +sub.f16x2 r869, %75, %85; +} +{ +mul.f16x2 r872, r869, r673; +} +{ +add.f16x2 r875, r866, r872; +} +{ +add.f16x2 r878, r860, r875; +} +{ +add.f16x2 r881, %66, %96; +} +{ +mul.f16x2 r884, r881, r667; +} +{ +add.f16x2 r887, %56, r884; +} +{ +add.f16x2 r890, %76, %86; +} +{ +mul.f16x2 r893, r890, r671; +} +{ +add.f16x2 r896, r887, r893; +} +{ +sub.f16x2 r899, %65, %95; +} +{ +mul.f16x2 r902, r899, r669; +} +{ +sub.f16x2 r905, %75, %85; +} +{ +mul.f16x2 r908, r905, r673; +} +{ +add.f16x2 r911, r902, r908; +} +{ +sub.f16x2 r914, r896, r911; +} +{ +add.f16x2 r917, %66, %96; +} +{ +mul.f16x2 r920, r917, r671; +} +{ +add.f16x2 r923, %56, r920; +} +{ +add.f16x2 r926, %76, %86; +} +{ +mul.f16x2 r929, r926, r675; +} +{ +add.f16x2 r932, r923, r929; +} +{ +sub.f16x2 r935, %65, %95; +} +{ +mul.f16x2 r938, r935, r673; +} +{ +sub.f16x2 r941, %75, %85; +} +{ +mul.f16x2 r944, r941, r676; +} +{ +add.f16x2 r947, r938, r944; +} +{ +add.f16x2 r950, r932, r947; +} +{ +add.f16x2 
r953, %66, %96; +} +{ +mul.f16x2 r956, r953, r671; +} +{ +add.f16x2 r959, %56, r956; +} +{ +add.f16x2 r962, %76, %86; +} +{ +mul.f16x2 r965, r962, r675; +} +{ +add.f16x2 r968, r959, r965; +} +{ +sub.f16x2 r971, %65, %95; +} +{ +mul.f16x2 r974, r971, r673; +} +{ +sub.f16x2 r977, %75, %85; +} +{ +mul.f16x2 r980, r977, r676; +} +{ +add.f16x2 r983, r974, r980; +} +{ +sub.f16x2 r986, r968, r983; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r989, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r990, {low, high}; +} +{ +neg.f16x2 r991, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r993, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r994, {low, high}; +} +{ +neg.f16x2 r995, r994; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r998, {low, high}; +} +{ +add.f16x2 r999, %67, %97; +} +{ +add.f16x2 r1002, %57, r999; +} +{ +add.f16x2 r1005, %77, %87; +} +{ +add.f16x2 r1008, r1002, r1005; +} +{ +add.f16x2 r1011, %68, %98; +} +{ +add.f16x2 r1014, %58, r1011; +} +{ +add.f16x2 r1017, %78, %88; +} +{ +add.f16x2 r1020, r1014, r1017; +} +{ +add.f16x2 r1023, %67, %97; +} +{ +mul.f16x2 r1026, r1023, r989; +} +{ +add.f16x2 r1029, %57, r1026; +} +{ +add.f16x2 r1032, %77, %87; +} +{ +mul.f16x2 r1035, r1032, r993; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +sub.f16x2 r1041, %68, %98; +} +{ +mul.f16x2 r1044, r1041, r991; +} +{ +sub.f16x2 r1047, %78, %88; +} +{ +mul.f16x2 r1050, r1047, r995; +} +{ +add.f16x2 r1053, r1044, r1050; +} +{ +sub.f16x2 r1056, r1038, r1053; +} +{ +add.f16x2 r1059, %67, %97; +} +{ +mul.f16x2 r1062, r1059, r989; +} +{ +add.f16x2 r1065, %57, r1062; +} +{ +add.f16x2 r1068, %77, %87; +} +{ +mul.f16x2 r1071, 
r1068, r993; +} +{ +add.f16x2 r1074, r1065, r1071; +} +{ +sub.f16x2 r1077, %68, %98; +} +{ +mul.f16x2 r1080, r1077, r991; +} +{ +sub.f16x2 r1083, %78, %88; +} +{ +mul.f16x2 r1086, r1083, r995; +} +{ +add.f16x2 r1089, r1080, r1086; +} +{ +add.f16x2 r1092, r1074, r1089; +} +{ +add.f16x2 r1095, %67, %97; +} +{ +mul.f16x2 r1098, r1095, r993; +} +{ +add.f16x2 r1101, %57, r1098; +} +{ +add.f16x2 r1104, %77, %87; +} +{ +mul.f16x2 r1107, r1104, r997; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, %68, %98; +} +{ +mul.f16x2 r1116, r1113, r995; +} +{ +sub.f16x2 r1119, %78, %88; +} +{ +mul.f16x2 r1122, r1119, r998; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r1110, r1125; +} +{ +add.f16x2 r1131, %67, %97; +} +{ +mul.f16x2 r1134, r1131, r993; +} +{ +add.f16x2 r1137, %57, r1134; +} +{ +add.f16x2 r1140, %77, %87; +} +{ +mul.f16x2 r1143, r1140, r997; +} +{ +add.f16x2 r1146, r1137, r1143; +} +{ +sub.f16x2 r1149, %68, %98; +} +{ +mul.f16x2 r1152, r1149, r995; +} +{ +sub.f16x2 r1155, %78, %88; +} +{ +mul.f16x2 r1158, r1155, r998; +} +{ +add.f16x2 r1161, r1152, r1158; +} +{ +add.f16x2 r1164, r1146, r1161; +} +{ +add.f16x2 r1167, %68, %98; +} +{ +mul.f16x2 r1170, r1167, r989; +} +{ +add.f16x2 r1173, %58, r1170; +} +{ +add.f16x2 r1176, %78, %88; +} +{ +mul.f16x2 r1179, r1176, r993; +} +{ +add.f16x2 r1182, r1173, r1179; +} +{ +sub.f16x2 r1185, %67, %97; +} +{ +mul.f16x2 r1188, r1185, r991; +} +{ +sub.f16x2 r1191, %77, %87; +} +{ +mul.f16x2 r1194, r1191, r995; +} +{ +add.f16x2 r1197, r1188, r1194; +} +{ +add.f16x2 r1200, r1182, r1197; +} +{ +add.f16x2 r1203, %68, %98; +} +{ +mul.f16x2 r1206, r1203, r989; +} +{ +add.f16x2 r1209, %58, r1206; +} +{ +add.f16x2 r1212, %78, %88; +} +{ +mul.f16x2 r1215, r1212, r993; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, %67, %97; +} +{ +mul.f16x2 r1224, r1221, r991; +} +{ +sub.f16x2 r1227, %77, %87; +} +{ +mul.f16x2 r1230, r1227, r995; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r1218, 
r1233; +} +{ +add.f16x2 r1239, %68, %98; +} +{ +mul.f16x2 r1242, r1239, r993; +} +{ +add.f16x2 r1245, %58, r1242; +} +{ +add.f16x2 r1248, %78, %88; +} +{ +mul.f16x2 r1251, r1248, r997; +} +{ +add.f16x2 r1254, r1245, r1251; +} +{ +sub.f16x2 r1257, %67, %97; +} +{ +mul.f16x2 r1260, r1257, r995; +} +{ +sub.f16x2 r1263, %77, %87; +} +{ +mul.f16x2 r1266, r1263, r998; +} +{ +add.f16x2 r1269, r1260, r1266; +} +{ +add.f16x2 r1272, r1254, r1269; +} +{ +add.f16x2 r1275, %68, %98; +} +{ +mul.f16x2 r1278, r1275, r993; +} +{ +add.f16x2 r1281, %58, r1278; +} +{ +add.f16x2 r1284, %78, %88; +} +{ +mul.f16x2 r1287, r1284, r997; +} +{ +add.f16x2 r1290, r1281, r1287; +} +{ +sub.f16x2 r1293, %67, %97; +} +{ +mul.f16x2 r1296, r1293, r995; +} +{ +sub.f16x2 r1299, %77, %87; +} +{ +mul.f16x2 r1302, r1299, r998; +} +{ +add.f16x2 r1305, r1296, r1302; +} +{ +sub.f16x2 r1308, r1290, r1305; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1311, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1312, {low, high}; +} +{ +neg.f16x2 r1313, r1312; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1316, {low, high}; +} +{ +neg.f16x2 r1317, r1316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1319, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1320, {low, high}; +} +{ +add.f16x2 r1321, %69, %99; +} +{ +add.f16x2 r1324, %59, r1321; +} +{ +add.f16x2 r1327, %79, %89; +} +{ +add.f16x2 r1330, r1324, r1327; +} +{ +add.f16x2 r1333, %70, %100; +} +{ +add.f16x2 r1336, %60, r1333; +} +{ +add.f16x2 r1339, %80, %90; +} +{ +add.f16x2 r1342, r1336, r1339; +} +{ +add.f16x2 r1345, %69, %99; +} +{ +mul.f16x2 r1348, r1345, r1311; +} +{ +add.f16x2 r1351, %59, 
r1348; +} +{ +add.f16x2 r1354, %79, %89; +} +{ +mul.f16x2 r1357, r1354, r1315; +} +{ +add.f16x2 r1360, r1351, r1357; +} +{ +sub.f16x2 r1363, %70, %100; +} +{ +mul.f16x2 r1366, r1363, r1313; +} +{ +sub.f16x2 r1369, %80, %90; +} +{ +mul.f16x2 r1372, r1369, r1317; +} +{ +add.f16x2 r1375, r1366, r1372; +} +{ +sub.f16x2 r1378, r1360, r1375; +} +{ +add.f16x2 r1381, %69, %99; +} +{ +mul.f16x2 r1384, r1381, r1311; +} +{ +add.f16x2 r1387, %59, r1384; +} +{ +add.f16x2 r1390, %79, %89; +} +{ +mul.f16x2 r1393, r1390, r1315; +} +{ +add.f16x2 r1396, r1387, r1393; +} +{ +sub.f16x2 r1399, %70, %100; +} +{ +mul.f16x2 r1402, r1399, r1313; +} +{ +sub.f16x2 r1405, %80, %90; +} +{ +mul.f16x2 r1408, r1405, r1317; +} +{ +add.f16x2 r1411, r1402, r1408; +} +{ +add.f16x2 r1414, r1396, r1411; +} +{ +add.f16x2 r1417, %69, %99; +} +{ +mul.f16x2 r1420, r1417, r1315; +} +{ +add.f16x2 r1423, %59, r1420; +} +{ +add.f16x2 r1426, %79, %89; +} +{ +mul.f16x2 r1429, r1426, r1319; +} +{ +add.f16x2 r1432, r1423, r1429; +} +{ +sub.f16x2 r1435, %70, %100; +} +{ +mul.f16x2 r1438, r1435, r1317; +} +{ +sub.f16x2 r1441, %80, %90; +} +{ +mul.f16x2 r1444, r1441, r1320; +} +{ +add.f16x2 r1447, r1438, r1444; +} +{ +sub.f16x2 r1450, r1432, r1447; +} +{ +add.f16x2 r1453, %69, %99; +} +{ +mul.f16x2 r1456, r1453, r1315; +} +{ +add.f16x2 r1459, %59, r1456; +} +{ +add.f16x2 r1462, %79, %89; +} +{ +mul.f16x2 r1465, r1462, r1319; +} +{ +add.f16x2 r1468, r1459, r1465; +} +{ +sub.f16x2 r1471, %70, %100; +} +{ +mul.f16x2 r1474, r1471, r1317; +} +{ +sub.f16x2 r1477, %80, %90; +} +{ +mul.f16x2 r1480, r1477, r1320; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +add.f16x2 r1486, r1468, r1483; +} +{ +add.f16x2 r1489, %70, %100; +} +{ +mul.f16x2 r1492, r1489, r1311; +} +{ +add.f16x2 r1495, %60, r1492; +} +{ +add.f16x2 r1498, %80, %90; +} +{ +mul.f16x2 r1501, r1498, r1315; +} +{ +add.f16x2 r1504, r1495, r1501; +} +{ +sub.f16x2 r1507, %69, %99; +} +{ +mul.f16x2 r1510, r1507, r1313; +} +{ +sub.f16x2 r1513, %79, %89; +} +{ +mul.f16x2 
r1516, r1513, r1317; +} +{ +add.f16x2 r1519, r1510, r1516; +} +{ +add.f16x2 r1522, r1504, r1519; +} +{ +add.f16x2 r1525, %70, %100; +} +{ +mul.f16x2 r1528, r1525, r1311; +} +{ +add.f16x2 r1531, %60, r1528; +} +{ +add.f16x2 r1534, %80, %90; +} +{ +mul.f16x2 r1537, r1534, r1315; +} +{ +add.f16x2 r1540, r1531, r1537; +} +{ +sub.f16x2 r1543, %69, %99; +} +{ +mul.f16x2 r1546, r1543, r1313; +} +{ +sub.f16x2 r1549, %79, %89; +} +{ +mul.f16x2 r1552, r1549, r1317; +} +{ +add.f16x2 r1555, r1546, r1552; +} +{ +sub.f16x2 r1558, r1540, r1555; +} +{ +add.f16x2 r1561, %70, %100; +} +{ +mul.f16x2 r1564, r1561, r1315; +} +{ +add.f16x2 r1567, %60, r1564; +} +{ +add.f16x2 r1570, %80, %90; +} +{ +mul.f16x2 r1573, r1570, r1319; +} +{ +add.f16x2 r1576, r1567, r1573; +} +{ +sub.f16x2 r1579, %69, %99; +} +{ +mul.f16x2 r1582, r1579, r1317; +} +{ +sub.f16x2 r1585, %79, %89; +} +{ +mul.f16x2 r1588, r1585, r1320; +} +{ +add.f16x2 r1591, r1582, r1588; +} +{ +add.f16x2 r1594, r1576, r1591; +} +{ +add.f16x2 r1597, %70, %100; +} +{ +mul.f16x2 r1600, r1597, r1315; +} +{ +add.f16x2 r1603, %60, r1600; +} +{ +add.f16x2 r1606, %80, %90; +} +{ +mul.f16x2 r1609, r1606, r1319; +} +{ +add.f16x2 r1612, r1603, r1609; +} +{ +sub.f16x2 r1615, %69, %99; +} +{ +mul.f16x2 r1618, r1615, r1317; +} +{ +sub.f16x2 r1621, %79, %89; +} +{ +mul.f16x2 r1624, r1621, r1320; +} +{ +add.f16x2 r1627, r1618, r1624; +} +{ +sub.f16x2 r1630, r1612, r1627; +} +mov.f32 f62, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1633, {low, high}; +} +mov.f32 f64, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r1634, {low, high}; +} +mov.f32 f66, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r1635, {low, high}; +} +mov.f32 f68, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r1636, {low, high}; +} +mov.f32 f70, 0f3F3A9DB0; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r1637, {low, high}; +} +mov.f32 f72, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r1638, {low, high}; +} +mov.f32 f74, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1639, {low, high}; +} +mov.f32 f76, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r1640, {low, high}; +} +mov.f32 f82, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1643, {low, high}; +} +mov.f32 f84, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1644, {low, high}; +} +mov.f32 f90, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1647, {low, high}; +} +mov.f32 f92, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1648, {low, high}; +} +mov.f32 f122, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1649, {low, high}; +} +mov.f32 f96, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1650, {low, high}; +} +mov.f32 f106, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1655, {low, high}; +} +mov.f32 f108, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1663, {low, high}; +} +mov.f32 f124, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1664, {low, high}; +} +{ +mul.f16x2 r1681, r412, r1633; +} +{ +mul.f16x2 r1684, r556, r1634; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r412, 
r1634; +} +{ +fma.rn.f16x2 r1693, r556, r1633, r1690; +} +{ +mul.f16x2 r1697, r734, r1635; +} +{ +mul.f16x2 r1700, r878, r1636; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r734, r1636; +} +{ +fma.rn.f16x2 r1709, r878, r1635, r1706; +} +{ +mul.f16x2 r1713, r1056, r1637; +} +{ +mul.f16x2 r1716, r1200, r1638; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r1056, r1638; +} +{ +fma.rn.f16x2 r1725, r1200, r1637, r1722; +} +{ +mul.f16x2 r1729, r1378, r1639; +} +{ +mul.f16x2 r1732, r1522, r1640; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r1378, r1640; +} +{ +fma.rn.f16x2 r1741, r1522, r1639, r1738; +} +{ +mul.f16x2 r1745, r484, r1635; +} +{ +mul.f16x2 r1748, r628, r1636; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r484, r1636; +} +{ +fma.rn.f16x2 r1757, r628, r1635, r1754; +} +{ +mul.f16x2 r1761, r806, r1639; +} +{ +mul.f16x2 r1764, r950, r1640; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r806, r1640; +} +{ +fma.rn.f16x2 r1773, r950, r1639, r1770; +} +{ +mul.f16x2 r1777, r1128, r1643; +} +{ +mul.f16x2 r1780, r1272, r1644; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r1128, r1644; +} +{ +fma.rn.f16x2 r1789, r1272, r1643, r1786; +} +{ +mul.f16x2 r1793, r1450, r1647; +} +{ +mul.f16x2 r1796, r1594, r1648; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r1450, r1648; +} +{ +fma.rn.f16x2 r1805, r1594, r1647, r1802; +} +{ +mul.f16x2 r1809, r520, r1637; +} +{ +mul.f16x2 r1812, r664, r1638; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r520, r1638; +} +{ +fma.rn.f16x2 r1821, r664, r1637, r1818; +} +{ +mul.f16x2 r1825, r842, r1643; +} +{ +mul.f16x2 r1828, r986, r1644; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r842, r1644; +} +{ +fma.rn.f16x2 r1837, r986, r1643, r1834; +} +{ +mul.f16x2 r1841, r1164, r1649; +} +{ +mul.f16x2 r1844, r1308, r1650; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1164, r1650; +} +{ 
+fma.rn.f16x2 r1853, r1308, r1649, r1850; +} +{ +mul.f16x2 r1857, r1486, r1655; +} +{ +mul.f16x2 r1860, r1630, r1656; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1486, r1656; +} +{ +fma.rn.f16x2 r1869, r1630, r1655, r1866; +} +{ +mul.f16x2 r1873, r448, r1639; +} +{ +mul.f16x2 r1876, r592, r1640; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r448, r1640; +} +{ +fma.rn.f16x2 r1885, r592, r1639, r1882; +} +{ +mul.f16x2 r1889, r770, r1647; +} +{ +mul.f16x2 r1892, r914, r1648; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r770, r1648; +} +{ +fma.rn.f16x2 r1901, r914, r1647, r1898; +} +{ +mul.f16x2 r1905, r1092, r1655; +} +{ +mul.f16x2 r1908, r1236, r1656; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r1092, r1656; +} +{ +fma.rn.f16x2 r1917, r1236, r1655, r1914; +} +{ +mul.f16x2 r1921, r1414, r1663; +} +{ +mul.f16x2 r1924, r1558, r1664; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1414, r1664; +} +{ +fma.rn.f16x2 r1933, r1558, r1663, r1930; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1937, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1938, {low, high}; +} +{ +neg.f16x2 r1939, r1938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r1941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1942, {low, high}; +} +{ +neg.f16x2 r1943, r1942; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1946, {low, high}; +} +{ +add.f16x2 r1947, r364, r1330; +} +{ +add.f16x2 r1950, r42, r1947; +} +{ +add.f16x2 r1953, r686, r1008; +} +{ +add.f16x2 r1956, r1950, r1953; +} +st.local.u32 [rd3], r1956; +{ +add.f16x2 r1959, r376, r1342; +} +{ +add.f16x2 
r1962, r54, r1959; +} +{ +add.f16x2 r1965, r698, r1020; +} +{ +add.f16x2 r1968, r1962, r1965; +} +st.local.u32 [rd3+4], r1968; +{ +add.f16x2 r1971, r364, r1330; +} +{ +mul.f16x2 r1974, r1971, r1937; +} +{ +add.f16x2 r1977, r42, r1974; +} +{ +add.f16x2 r1980, r686, r1008; +} +{ +mul.f16x2 r1983, r1980, r1941; +} +{ +add.f16x2 r1986, r1977, r1983; +} +{ +sub.f16x2 r1989, r376, r1342; +} +{ +mul.f16x2 r1992, r1989, r1939; +} +{ +sub.f16x2 r1995, r698, r1020; +} +{ +mul.f16x2 r1998, r1995, r1943; +} +{ +add.f16x2 r2001, r1992, r1998; +} +{ +sub.f16x2 r2004, r1986, r2001; +} +st.local.u32 [rd3+40], r2004; +{ +add.f16x2 r2007, r364, r1330; +} +{ +mul.f16x2 r2010, r2007, r1937; +} +{ +add.f16x2 r2013, r42, r2010; +} +{ +add.f16x2 r2016, r686, r1008; +} +{ +mul.f16x2 r2019, r2016, r1941; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +sub.f16x2 r2025, r376, r1342; +} +{ +mul.f16x2 r2028, r2025, r1939; +} +{ +sub.f16x2 r2031, r698, r1020; +} +{ +mul.f16x2 r2034, r2031, r1943; +} +{ +add.f16x2 r2037, r2028, r2034; +} +{ +add.f16x2 r2040, r2022, r2037; +} +st.local.u32 [rd3+160], r2040; +{ +add.f16x2 r2043, r364, r1330; +} +{ +mul.f16x2 r2046, r2043, r1941; +} +{ +add.f16x2 r2049, r42, r2046; +} +{ +add.f16x2 r2052, r686, r1008; +} +{ +mul.f16x2 r2055, r2052, r1945; +} +{ +add.f16x2 r2058, r2049, r2055; +} +{ +sub.f16x2 r2061, r376, r1342; +} +{ +mul.f16x2 r2064, r2061, r1943; +} +{ +sub.f16x2 r2067, r698, r1020; +} +{ +mul.f16x2 r2070, r2067, r1946; +} +{ +add.f16x2 r2073, r2064, r2070; +} +{ +sub.f16x2 r2076, r2058, r2073; +} +st.local.u32 [rd3+80], r2076; +{ +add.f16x2 r2079, r364, r1330; +} +{ +mul.f16x2 r2082, r2079, r1941; +} +{ +add.f16x2 r2085, r42, r2082; +} +{ +add.f16x2 r2088, r686, r1008; +} +{ +mul.f16x2 r2091, r2088, r1945; +} +{ +add.f16x2 r2094, r2085, r2091; +} +{ +sub.f16x2 r2097, r376, r1342; +} +{ +mul.f16x2 r2100, r2097, r1943; +} +{ +sub.f16x2 r2103, r698, r1020; +} +{ +mul.f16x2 r2106, r2103, r1946; +} +{ +add.f16x2 r2109, r2100, r2106; +} +{ +add.f16x2 
r2112, r2094, r2109; +} +st.local.u32 [rd3+120], r2112; +{ +add.f16x2 r2115, r376, r1342; +} +{ +mul.f16x2 r2118, r2115, r1937; +} +{ +add.f16x2 r2121, r54, r2118; +} +{ +add.f16x2 r2124, r698, r1020; +} +{ +mul.f16x2 r2127, r2124, r1941; +} +{ +add.f16x2 r2130, r2121, r2127; +} +{ +sub.f16x2 r2133, r364, r1330; +} +{ +mul.f16x2 r2136, r2133, r1939; +} +{ +sub.f16x2 r2139, r686, r1008; +} +{ +mul.f16x2 r2142, r2139, r1943; +} +{ +add.f16x2 r2145, r2136, r2142; +} +{ +add.f16x2 r2148, r2130, r2145; +} +st.local.u32 [rd3+44], r2148; +{ +add.f16x2 r2151, r376, r1342; +} +{ +mul.f16x2 r2154, r2151, r1937; +} +{ +add.f16x2 r2157, r54, r2154; +} +{ +add.f16x2 r2160, r698, r1020; +} +{ +mul.f16x2 r2163, r2160, r1941; +} +{ +add.f16x2 r2166, r2157, r2163; +} +{ +sub.f16x2 r2169, r364, r1330; +} +{ +mul.f16x2 r2172, r2169, r1939; +} +{ +sub.f16x2 r2175, r686, r1008; +} +{ +mul.f16x2 r2178, r2175, r1943; +} +{ +add.f16x2 r2181, r2172, r2178; +} +{ +sub.f16x2 r2184, r2166, r2181; +} +st.local.u32 [rd3+164], r2184; +{ +add.f16x2 r2187, r376, r1342; +} +{ +mul.f16x2 r2190, r2187, r1941; +} +{ +add.f16x2 r2193, r54, r2190; +} +{ +add.f16x2 r2196, r698, r1020; +} +{ +mul.f16x2 r2199, r2196, r1945; +} +{ +add.f16x2 r2202, r2193, r2199; +} +{ +sub.f16x2 r2205, r364, r1330; +} +{ +mul.f16x2 r2208, r2205, r1943; +} +{ +sub.f16x2 r2211, r686, r1008; +} +{ +mul.f16x2 r2214, r2211, r1946; +} +{ +add.f16x2 r2217, r2208, r2214; +} +{ +add.f16x2 r2220, r2202, r2217; +} +st.local.u32 [rd3+84], r2220; +{ +add.f16x2 r2223, r376, r1342; +} +{ +mul.f16x2 r2226, r2223, r1941; +} +{ +add.f16x2 r2229, r54, r2226; +} +{ +add.f16x2 r2232, r698, r1020; +} +{ +mul.f16x2 r2235, r2232, r1945; +} +{ +add.f16x2 r2238, r2229, r2235; +} +{ +sub.f16x2 r2241, r364, r1330; +} +{ +mul.f16x2 r2244, r2241, r1943; +} +{ +sub.f16x2 r2247, r686, r1008; +} +{ +mul.f16x2 r2250, r2247, r1946; +} +{ +add.f16x2 r2253, r2244, r2250; +} +{ +sub.f16x2 r2256, r2238, r2253; +} +st.local.u32 [rd3+124], r2256; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2260, {low, high}; +} +{ +neg.f16x2 r2261, r2260; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2264, {low, high}; +} +{ +neg.f16x2 r2265, r2264; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2268, {low, high}; +} +{ +add.f16x2 r2269, r1687, r1735; +} +{ +add.f16x2 r2272, r90, r2269; +} +{ +add.f16x2 r2275, r1703, r1719; +} +{ +add.f16x2 r11308, r2272, r2275; +} +st.local.u32 [rd3+8], r11308; +{ +add.f16x2 r2281, r1693, r1741; +} +{ +add.f16x2 r2284, r234, r2281; +} +{ +add.f16x2 r2287, r1709, r1725; +} +{ +add.f16x2 r2290, r2284, r2287; +} +st.local.u32 [rd3+12], r2290; +{ +add.f16x2 r2293, r1687, r1735; +} +{ +mul.f16x2 r2296, r2293, r2259; +} +{ +add.f16x2 r2299, r90, r2296; +} +{ +add.f16x2 r2302, r1703, r1719; +} +{ +mul.f16x2 r2305, r2302, r2263; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1693, r1741; +} +{ +mul.f16x2 r2314, r2311, r2261; +} +{ +sub.f16x2 r2317, r1709, r1725; +} +{ +mul.f16x2 r2320, r2317, r2265; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 r2326, r2308, r2323; +} +st.local.u32 [rd3+48], r2326; +{ +add.f16x2 r2329, r1687, r1735; +} +{ +mul.f16x2 r2332, r2329, r2259; +} +{ +add.f16x2 r2335, r90, r2332; +} +{ +add.f16x2 r2338, r1703, r1719; +} +{ +mul.f16x2 r2341, r2338, r2263; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1693, r1741; +} +{ +mul.f16x2 r2350, r2347, r2261; +} +{ +sub.f16x2 r2353, r1709, r1725; +} +{ +mul.f16x2 r2356, r2353, r2265; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +add.f16x2 
r2362, r2344, r2359; +} +st.local.u32 [rd3+168], r2362; +{ +add.f16x2 r2365, r1687, r1735; +} +{ +mul.f16x2 r2368, r2365, r2263; +} +{ +add.f16x2 r2371, r90, r2368; +} +{ +add.f16x2 r2374, r1703, r1719; +} +{ +mul.f16x2 r2377, r2374, r2267; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1693, r1741; +} +{ +mul.f16x2 r2386, r2383, r2265; +} +{ +sub.f16x2 r2389, r1709, r1725; +} +{ +mul.f16x2 r2392, r2389, r2268; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +sub.f16x2 r2398, r2380, r2395; +} +st.local.u32 [rd3+88], r2398; +{ +add.f16x2 r2401, r1687, r1735; +} +{ +mul.f16x2 r2404, r2401, r2263; +} +{ +add.f16x2 r2407, r90, r2404; +} +{ +add.f16x2 r2410, r1703, r1719; +} +{ +mul.f16x2 r2413, r2410, r2267; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1693, r1741; +} +{ +mul.f16x2 r2422, r2419, r2265; +} +{ +sub.f16x2 r2425, r1709, r1725; +} +{ +mul.f16x2 r2428, r2425, r2268; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +st.local.u32 [rd3+128], r2434; +{ +add.f16x2 r2437, r1693, r1741; +} +{ +mul.f16x2 r2440, r2437, r2259; +} +{ +add.f16x2 r2443, r234, r2440; +} +{ +add.f16x2 r2446, r1709, r1725; +} +{ +mul.f16x2 r2449, r2446, r2263; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1687, r1735; +} +{ +mul.f16x2 r2458, r2455, r2261; +} +{ +sub.f16x2 r2461, r1703, r1719; +} +{ +mul.f16x2 r2464, r2461, r2265; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +add.f16x2 r2470, r2452, r2467; +} +st.local.u32 [rd3+52], r2470; +{ +add.f16x2 r2473, r1693, r1741; +} +{ +mul.f16x2 r2476, r2473, r2259; +} +{ +add.f16x2 r2479, r234, r2476; +} +{ +add.f16x2 r2482, r1709, r1725; +} +{ +mul.f16x2 r2485, r2482, r2263; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1687, r1735; +} +{ +mul.f16x2 r2494, r2491, r2261; +} +{ +sub.f16x2 r2497, r1703, r1719; +} +{ +mul.f16x2 r2500, r2497, r2265; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +sub.f16x2 r2506, r2488, r2503; +} +st.local.u32 [rd3+172], r2506; 
+{ +add.f16x2 r2509, r1693, r1741; +} +{ +mul.f16x2 r2512, r2509, r2263; +} +{ +add.f16x2 r2515, r234, r2512; +} +{ +add.f16x2 r2518, r1709, r1725; +} +{ +mul.f16x2 r2521, r2518, r2267; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1687, r1735; +} +{ +mul.f16x2 r2530, r2527, r2265; +} +{ +sub.f16x2 r2533, r1703, r1719; +} +{ +mul.f16x2 r2536, r2533, r2268; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +add.f16x2 r2542, r2524, r2539; +} +st.local.u32 [rd3+92], r2542; +{ +add.f16x2 r2545, r1693, r1741; +} +{ +mul.f16x2 r2548, r2545, r2263; +} +{ +add.f16x2 r2551, r234, r2548; +} +{ +add.f16x2 r2554, r1709, r1725; +} +{ +mul.f16x2 r2557, r2554, r2267; +} +{ +add.f16x2 r2560, r2551, r2557; +} +{ +sub.f16x2 r2563, r1687, r1735; +} +{ +mul.f16x2 r2566, r2563, r2265; +} +{ +sub.f16x2 r2569, r1703, r1719; +} +{ +mul.f16x2 r2572, r2569, r2268; +} +{ +add.f16x2 r2575, r2566, r2572; +} +{ +sub.f16x2 r2578, r2560, r2575; +} +st.local.u32 [rd3+132], r2578; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2582, {low, high}; +} +{ +neg.f16x2 r2583, r2582; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2585, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2586, {low, high}; +} +{ +neg.f16x2 r2587, r2586; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2590, {low, high}; +} +{ +add.f16x2 r2591, r1751, r1799; +} +{ +add.f16x2 r2594, r162, r2591; +} +{ +add.f16x2 r2597, r1767, r1783; +} +{ +add.f16x2 r2600, r2594, r2597; +} +st.local.u32 [rd3+16], r2600; +{ +add.f16x2 r2603, r1757, r1805; +} +{ +add.f16x2 r2606, r306, r2603; +} +{ +add.f16x2 r2609, r1773, r1789; 
+} +{ +add.f16x2 r2612, r2606, r2609; +} +st.local.u32 [rd3+20], r2612; +{ +add.f16x2 r2615, r1751, r1799; +} +{ +mul.f16x2 r2618, r2615, r2581; +} +{ +add.f16x2 r2621, r162, r2618; +} +{ +add.f16x2 r2624, r1767, r1783; +} +{ +mul.f16x2 r2627, r2624, r2585; +} +{ +add.f16x2 r2630, r2621, r2627; +} +{ +sub.f16x2 r2633, r1757, r1805; +} +{ +mul.f16x2 r2636, r2633, r2583; +} +{ +sub.f16x2 r2639, r1773, r1789; +} +{ +mul.f16x2 r2642, r2639, r2587; +} +{ +add.f16x2 r2645, r2636, r2642; +} +{ +sub.f16x2 r2648, r2630, r2645; +} +st.local.u32 [rd3+56], r2648; +{ +add.f16x2 r2651, r1751, r1799; +} +{ +mul.f16x2 r2654, r2651, r2581; +} +{ +add.f16x2 r2657, r162, r2654; +} +{ +add.f16x2 r2660, r1767, r1783; +} +{ +mul.f16x2 r2663, r2660, r2585; +} +{ +add.f16x2 r2666, r2657, r2663; +} +{ +sub.f16x2 r2669, r1757, r1805; +} +{ +mul.f16x2 r2672, r2669, r2583; +} +{ +sub.f16x2 r2675, r1773, r1789; +} +{ +mul.f16x2 r2678, r2675, r2587; +} +{ +add.f16x2 r2681, r2672, r2678; +} +{ +add.f16x2 r2684, r2666, r2681; +} +st.local.u32 [rd3+176], r2684; +{ +add.f16x2 r2687, r1751, r1799; +} +{ +mul.f16x2 r2690, r2687, r2585; +} +{ +add.f16x2 r2693, r162, r2690; +} +{ +add.f16x2 r2696, r1767, r1783; +} +{ +mul.f16x2 r2699, r2696, r2589; +} +{ +add.f16x2 r2702, r2693, r2699; +} +{ +sub.f16x2 r2705, r1757, r1805; +} +{ +mul.f16x2 r2708, r2705, r2587; +} +{ +sub.f16x2 r2711, r1773, r1789; +} +{ +mul.f16x2 r2714, r2711, r2590; +} +{ +add.f16x2 r2717, r2708, r2714; +} +{ +sub.f16x2 r2720, r2702, r2717; +} +st.local.u32 [rd3+96], r2720; +{ +add.f16x2 r2723, r1751, r1799; +} +{ +mul.f16x2 r2726, r2723, r2585; +} +{ +add.f16x2 r2729, r162, r2726; +} +{ +add.f16x2 r2732, r1767, r1783; +} +{ +mul.f16x2 r2735, r2732, r2589; +} +{ +add.f16x2 r2738, r2729, r2735; +} +{ +sub.f16x2 r2741, r1757, r1805; +} +{ +mul.f16x2 r2744, r2741, r2587; +} +{ +sub.f16x2 r2747, r1773, r1789; +} +{ +mul.f16x2 r2750, r2747, r2590; +} +{ +add.f16x2 r2753, r2744, r2750; +} +{ +add.f16x2 r2756, r2738, r2753; +} +st.local.u32 
[rd3+136], r2756; +{ +add.f16x2 r2759, r1757, r1805; +} +{ +mul.f16x2 r2762, r2759, r2581; +} +{ +add.f16x2 r2765, r306, r2762; +} +{ +add.f16x2 r2768, r1773, r1789; +} +{ +mul.f16x2 r2771, r2768, r2585; +} +{ +add.f16x2 r2774, r2765, r2771; +} +{ +sub.f16x2 r2777, r1751, r1799; +} +{ +mul.f16x2 r2780, r2777, r2583; +} +{ +sub.f16x2 r2783, r1767, r1783; +} +{ +mul.f16x2 r2786, r2783, r2587; +} +{ +add.f16x2 r2789, r2780, r2786; +} +{ +add.f16x2 r2792, r2774, r2789; +} +st.local.u32 [rd3+60], r2792; +{ +add.f16x2 r2795, r1757, r1805; +} +{ +mul.f16x2 r2798, r2795, r2581; +} +{ +add.f16x2 r2801, r306, r2798; +} +{ +add.f16x2 r2804, r1773, r1789; +} +{ +mul.f16x2 r2807, r2804, r2585; +} +{ +add.f16x2 r2810, r2801, r2807; +} +{ +sub.f16x2 r2813, r1751, r1799; +} +{ +mul.f16x2 r2816, r2813, r2583; +} +{ +sub.f16x2 r2819, r1767, r1783; +} +{ +mul.f16x2 r2822, r2819, r2587; +} +{ +add.f16x2 r2825, r2816, r2822; +} +{ +sub.f16x2 r2828, r2810, r2825; +} +st.local.u32 [rd3+180], r2828; +{ +add.f16x2 r2831, r1757, r1805; +} +{ +mul.f16x2 r2834, r2831, r2585; +} +{ +add.f16x2 r2837, r306, r2834; +} +{ +add.f16x2 r2840, r1773, r1789; +} +{ +mul.f16x2 r2843, r2840, r2589; +} +{ +add.f16x2 r2846, r2837, r2843; +} +{ +sub.f16x2 r2849, r1751, r1799; +} +{ +mul.f16x2 r2852, r2849, r2587; +} +{ +sub.f16x2 r2855, r1767, r1783; +} +{ +mul.f16x2 r2858, r2855, r2590; +} +{ +add.f16x2 r2861, r2852, r2858; +} +{ +add.f16x2 r2864, r2846, r2861; +} +st.local.u32 [rd3+100], r2864; +{ +add.f16x2 r2867, r1757, r1805; +} +{ +mul.f16x2 r2870, r2867, r2585; +} +{ +add.f16x2 r2873, r306, r2870; +} +{ +add.f16x2 r2876, r1773, r1789; +} +{ +mul.f16x2 r2879, r2876, r2589; +} +{ +add.f16x2 r2882, r2873, r2879; +} +{ +sub.f16x2 r2885, r1751, r1799; +} +{ +mul.f16x2 r2888, r2885, r2587; +} +{ +sub.f16x2 r2891, r1767, r1783; +} +{ +mul.f16x2 r2894, r2891, r2590; +} +{ +add.f16x2 r2897, r2888, r2894; +} +{ +sub.f16x2 r2900, r2882, r2897; +} +st.local.u32 [rd3+140], r2900; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2904, {low, high}; +} +{ +neg.f16x2 r2905, r2904; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r2907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r2908, {low, high}; +} +{ +neg.f16x2 r2909, r2908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r2911, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r2912, {low, high}; +} +{ +add.f16x2 r2913, r1815, r1863; +} +{ +add.f16x2 r2916, r198, r2913; +} +{ +add.f16x2 r2919, r1831, r1847; +} +{ +add.f16x2 r2922, r2916, r2919; +} +st.local.u32 [rd3+24], r2922; +{ +add.f16x2 r2925, r1821, r1869; +} +{ +add.f16x2 r2928, r342, r2925; +} +{ +add.f16x2 r2931, r1837, r1853; +} +{ +add.f16x2 r2934, r2928, r2931; +} +st.local.u32 [rd3+28], r2934; +{ +add.f16x2 r2937, r1815, r1863; +} +{ +mul.f16x2 r2940, r2937, r2903; +} +{ +add.f16x2 r2943, r198, r2940; +} +{ +add.f16x2 r2946, r1831, r1847; +} +{ +mul.f16x2 r2949, r2946, r2907; +} +{ +add.f16x2 r2952, r2943, r2949; +} +{ +sub.f16x2 r2955, r1821, r1869; +} +{ +mul.f16x2 r2958, r2955, r2905; +} +{ +sub.f16x2 r2961, r1837, r1853; +} +{ +mul.f16x2 r2964, r2961, r2909; +} +{ +add.f16x2 r2967, r2958, r2964; +} +{ +sub.f16x2 r2970, r2952, r2967; +} +st.local.u32 [rd3+64], r2970; +{ +add.f16x2 r2973, r1815, r1863; +} +{ +mul.f16x2 r2976, r2973, r2903; +} +{ +add.f16x2 r2979, r198, r2976; +} +{ +add.f16x2 r2982, r1831, r1847; +} +{ +mul.f16x2 r2985, r2982, r2907; +} +{ +add.f16x2 r2988, r2979, r2985; +} +{ +sub.f16x2 r2991, r1821, r1869; +} +{ +mul.f16x2 r2994, r2991, r2905; +} +{ +sub.f16x2 r2997, r1837, r1853; +} +{ +mul.f16x2 r3000, r2997, r2909; +} +{ +add.f16x2 r3003, r2994, r3000; +} +{ +add.f16x2 r3006, 
r2988, r3003; +} +st.local.u32 [rd3+184], r3006; +{ +add.f16x2 r3009, r1815, r1863; +} +{ +mul.f16x2 r3012, r3009, r2907; +} +{ +add.f16x2 r3015, r198, r3012; +} +{ +add.f16x2 r3018, r1831, r1847; +} +{ +mul.f16x2 r3021, r3018, r2911; +} +{ +add.f16x2 r3024, r3015, r3021; +} +{ +sub.f16x2 r3027, r1821, r1869; +} +{ +mul.f16x2 r3030, r3027, r2909; +} +{ +sub.f16x2 r3033, r1837, r1853; +} +{ +mul.f16x2 r3036, r3033, r2912; +} +{ +add.f16x2 r3039, r3030, r3036; +} +{ +sub.f16x2 r3042, r3024, r3039; +} +st.local.u32 [rd3+104], r3042; +{ +add.f16x2 r3045, r1815, r1863; +} +{ +mul.f16x2 r3048, r3045, r2907; +} +{ +add.f16x2 r3051, r198, r3048; +} +{ +add.f16x2 r3054, r1831, r1847; +} +{ +mul.f16x2 r3057, r3054, r2911; +} +{ +add.f16x2 r3060, r3051, r3057; +} +{ +sub.f16x2 r3063, r1821, r1869; +} +{ +mul.f16x2 r3066, r3063, r2909; +} +{ +sub.f16x2 r3069, r1837, r1853; +} +{ +mul.f16x2 r3072, r3069, r2912; +} +{ +add.f16x2 r3075, r3066, r3072; +} +{ +add.f16x2 r3078, r3060, r3075; +} +st.local.u32 [rd3+144], r3078; +{ +add.f16x2 r3081, r1821, r1869; +} +{ +mul.f16x2 r3084, r3081, r2903; +} +{ +add.f16x2 r3087, r342, r3084; +} +{ +add.f16x2 r3090, r1837, r1853; +} +{ +mul.f16x2 r3093, r3090, r2907; +} +{ +add.f16x2 r3096, r3087, r3093; +} +{ +sub.f16x2 r3099, r1815, r1863; +} +{ +mul.f16x2 r3102, r3099, r2905; +} +{ +sub.f16x2 r3105, r1831, r1847; +} +{ +mul.f16x2 r3108, r3105, r2909; +} +{ +add.f16x2 r3111, r3102, r3108; +} +{ +add.f16x2 r3114, r3096, r3111; +} +st.local.u32 [rd3+68], r3114; +{ +add.f16x2 r3117, r1821, r1869; +} +{ +mul.f16x2 r3120, r3117, r2903; +} +{ +add.f16x2 r3123, r342, r3120; +} +{ +add.f16x2 r3126, r1837, r1853; +} +{ +mul.f16x2 r3129, r3126, r2907; +} +{ +add.f16x2 r3132, r3123, r3129; +} +{ +sub.f16x2 r3135, r1815, r1863; +} +{ +mul.f16x2 r3138, r3135, r2905; +} +{ +sub.f16x2 r3141, r1831, r1847; +} +{ +mul.f16x2 r3144, r3141, r2909; +} +{ +add.f16x2 r3147, r3138, r3144; +} +{ +sub.f16x2 r3150, r3132, r3147; +} +st.local.u32 [rd3+188], r3150; +{ 
+add.f16x2 r3153, r1821, r1869; +} +{ +mul.f16x2 r3156, r3153, r2907; +} +{ +add.f16x2 r3159, r342, r3156; +} +{ +add.f16x2 r3162, r1837, r1853; +} +{ +mul.f16x2 r3165, r3162, r2911; +} +{ +add.f16x2 r3168, r3159, r3165; +} +{ +sub.f16x2 r3171, r1815, r1863; +} +{ +mul.f16x2 r3174, r3171, r2909; +} +{ +sub.f16x2 r3177, r1831, r1847; +} +{ +mul.f16x2 r3180, r3177, r2912; +} +{ +add.f16x2 r3183, r3174, r3180; +} +{ +add.f16x2 r3186, r3168, r3183; +} +st.local.u32 [rd3+108], r3186; +{ +add.f16x2 r3189, r1821, r1869; +} +{ +mul.f16x2 r3192, r3189, r2907; +} +{ +add.f16x2 r3195, r342, r3192; +} +{ +add.f16x2 r3198, r1837, r1853; +} +{ +mul.f16x2 r3201, r3198, r2911; +} +{ +add.f16x2 r3204, r3195, r3201; +} +{ +sub.f16x2 r3207, r1815, r1863; +} +{ +mul.f16x2 r3210, r3207, r2909; +} +{ +sub.f16x2 r3213, r1831, r1847; +} +{ +mul.f16x2 r3216, r3213, r2912; +} +{ +add.f16x2 r3219, r3210, r3216; +} +{ +sub.f16x2 r3222, r3204, r3219; +} +st.local.u32 [rd3+148], r3222; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3226, {low, high}; +} +{ +neg.f16x2 r3227, r3226; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3230, {low, high}; +} +{ +neg.f16x2 r3231, r3230; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3233, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3234, {low, high}; +} +{ +add.f16x2 r3235, r1879, r1927; +} +{ +add.f16x2 r3238, r126, r3235; +} +{ +add.f16x2 r3241, r1895, r1911; +} +{ +add.f16x2 r3244, r3238, r3241; +} +st.local.u32 [rd3+32], r3244; +{ +add.f16x2 r3247, r1885, r1933; +} +{ +add.f16x2 r3250, r270, r3247; +} +{ +add.f16x2 r3253, r1901, r1917; +} 
+{ +add.f16x2 r3256, r3250, r3253; +} +st.local.u32 [rd3+36], r3256; +{ +add.f16x2 r3259, r1879, r1927; +} +{ +mul.f16x2 r3262, r3259, r3225; +} +{ +add.f16x2 r3265, r126, r3262; +} +{ +add.f16x2 r3268, r1895, r1911; +} +{ +mul.f16x2 r3271, r3268, r3229; +} +{ +add.f16x2 r3274, r3265, r3271; +} +{ +sub.f16x2 r3277, r1885, r1933; +} +{ +mul.f16x2 r3280, r3277, r3227; +} +{ +sub.f16x2 r3283, r1901, r1917; +} +{ +mul.f16x2 r3286, r3283, r3231; +} +{ +add.f16x2 r3289, r3280, r3286; +} +{ +sub.f16x2 r3292, r3274, r3289; +} +st.local.u32 [rd3+72], r3292; +{ +add.f16x2 r3295, r1879, r1927; +} +{ +mul.f16x2 r3298, r3295, r3225; +} +{ +add.f16x2 r3301, r126, r3298; +} +{ +add.f16x2 r3304, r1895, r1911; +} +{ +mul.f16x2 r3307, r3304, r3229; +} +{ +add.f16x2 r3310, r3301, r3307; +} +{ +sub.f16x2 r3313, r1885, r1933; +} +{ +mul.f16x2 r3316, r3313, r3227; +} +{ +sub.f16x2 r3319, r1901, r1917; +} +{ +mul.f16x2 r3322, r3319, r3231; +} +{ +add.f16x2 r3325, r3316, r3322; +} +{ +add.f16x2 r3328, r3310, r3325; +} +st.local.u32 [rd3+192], r3328; +{ +add.f16x2 r3331, r1879, r1927; +} +{ +mul.f16x2 r3334, r3331, r3229; +} +{ +add.f16x2 r3337, r126, r3334; +} +{ +add.f16x2 r3340, r1895, r1911; +} +{ +mul.f16x2 r3343, r3340, r3233; +} +{ +add.f16x2 r3346, r3337, r3343; +} +{ +sub.f16x2 r3349, r1885, r1933; +} +{ +mul.f16x2 r3352, r3349, r3231; +} +{ +sub.f16x2 r3355, r1901, r1917; +} +{ +mul.f16x2 r3358, r3355, r3234; +} +{ +add.f16x2 r3361, r3352, r3358; +} +{ +sub.f16x2 r3364, r3346, r3361; +} +st.local.u32 [rd3+112], r3364; +{ +add.f16x2 r3367, r1879, r1927; +} +{ +mul.f16x2 r3370, r3367, r3229; +} +{ +add.f16x2 r3373, r126, r3370; +} +{ +add.f16x2 r3376, r1895, r1911; +} +{ +mul.f16x2 r3379, r3376, r3233; +} +{ +add.f16x2 r3382, r3373, r3379; +} +{ +sub.f16x2 r3385, r1885, r1933; +} +{ +mul.f16x2 r3388, r3385, r3231; +} +{ +sub.f16x2 r3391, r1901, r1917; +} +{ +mul.f16x2 r3394, r3391, r3234; +} +{ +add.f16x2 r3397, r3388, r3394; +} +{ +add.f16x2 r3400, r3382, r3397; +} +st.local.u32 
[rd3+152], r3400; +{ +add.f16x2 r3403, r1885, r1933; +} +{ +mul.f16x2 r3406, r3403, r3225; +} +{ +add.f16x2 r3409, r270, r3406; +} +{ +add.f16x2 r3412, r1901, r1917; +} +{ +mul.f16x2 r3415, r3412, r3229; +} +{ +add.f16x2 r3418, r3409, r3415; +} +{ +sub.f16x2 r3421, r1879, r1927; +} +{ +mul.f16x2 r3424, r3421, r3227; +} +{ +sub.f16x2 r3427, r1895, r1911; +} +{ +mul.f16x2 r3430, r3427, r3231; +} +{ +add.f16x2 r3433, r3424, r3430; +} +{ +add.f16x2 r3436, r3418, r3433; +} +st.local.u32 [rd3+76], r3436; +{ +add.f16x2 r3439, r1885, r1933; +} +{ +mul.f16x2 r3442, r3439, r3225; +} +{ +add.f16x2 r3445, r270, r3442; +} +{ +add.f16x2 r3448, r1901, r1917; +} +{ +mul.f16x2 r3451, r3448, r3229; +} +{ +add.f16x2 r3454, r3445, r3451; +} +{ +sub.f16x2 r3457, r1879, r1927; +} +{ +mul.f16x2 r3460, r3457, r3227; +} +{ +sub.f16x2 r3463, r1895, r1911; +} +{ +mul.f16x2 r3466, r3463, r3231; +} +{ +add.f16x2 r3469, r3460, r3466; +} +{ +sub.f16x2 r3472, r3454, r3469; +} +st.local.u32 [rd3+196], r3472; +{ +add.f16x2 r3475, r1885, r1933; +} +{ +mul.f16x2 r3478, r3475, r3229; +} +{ +add.f16x2 r3481, r270, r3478; +} +{ +add.f16x2 r3484, r1901, r1917; +} +{ +mul.f16x2 r3487, r3484, r3233; +} +{ +add.f16x2 r3490, r3481, r3487; +} +{ +sub.f16x2 r3493, r1879, r1927; +} +{ +mul.f16x2 r3496, r3493, r3231; +} +{ +sub.f16x2 r3499, r1895, r1911; +} +{ +mul.f16x2 r3502, r3499, r3234; +} +{ +add.f16x2 r3505, r3496, r3502; +} +{ +add.f16x2 r3508, r3490, r3505; +} +st.local.u32 [rd3+116], r3508; +{ +add.f16x2 r3511, r1885, r1933; +} +{ +mul.f16x2 r3514, r3511, r3229; +} +{ +add.f16x2 r3517, r270, r3514; +} +{ +add.f16x2 r3520, r1901, r1917; +} +{ +mul.f16x2 r3523, r3520, r3233; +} +{ +add.f16x2 r3526, r3517, r3523; +} +{ +sub.f16x2 r3529, r1879, r1927; +} +{ +mul.f16x2 r3532, r3529, r3231; +} +{ +sub.f16x2 r3535, r1895, r1911; +} +{ +mul.f16x2 r3538, r3535, r3234; +} +{ +add.f16x2 r3541, r3532, r3538; +} +{ +sub.f16x2 r3544, r3526, r3541; +} +st.local.u32 [rd3+156], r3544; +mov.u32 r3552, %tid.x; 
+mul.wide.u32 rd9, r3552, -776530087; +shr.u64 rd10, rd9, 41; +cvt.u32.u64 r4, rd10; +mul.lo.s32 r3553, r4, 625; +sub.s32 r3, r3552, r3553; +cvt.rn.f32.u32 f221, r3; +mul.f32 f222, f221, 0f39D2D427; +cos.approx.f32 f217, f222; +sin.approx.f32 f223, f222; +neg.f32 f218, f223; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r11310, {low, high}; +} +mov.u32 r11309, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11310; +mov.b32 r3574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11310; +mov.b32 r3576, {high, high}; +} +bra.uni LBB1_1; +LBB1_2: +ld.local.u32 r11308, [rd5+60]; +LBB1_1: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11310; +mov.b32 r3554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11310; +mov.b32 r3556, {high, high}; +} +mul.wide.u32 rd11, r11309, 8; +add.s64 rd12, rd3, rd11; +add.s64 rd5, rd12, 4; +ld.local.u32 r3559, [rd12+4]; +{ +mul.f16x2 r3558, r3559, r3556; +} +{ +fma.rn.f16x2 r3561, r11308, r3554, r3558; +} +st.local.u32 [rd12], r3561; +{ +mul.f16x2 r3565, r11308, r3556; +} +{ +neg.f16x2 r3568, r3565; +} +{ +fma.rn.f16x2 r3570, r3559, r3554, r3568; +} +st.local.u32 [rd12+4], r3570; +mov.f32 f238, 0fBF800000; +mov.f32 f239, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3578, {low, high}; +} +{ +mul.f16x2 r3579, r3576, r3578; +} +{ +mul.f16x2 r3582, r11310, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11310; +mov.b32 r3585, {high, low}; +} +{ +fma.rn.f16x2 r3587, r3579, r3585, r3582; +} +ld.local.u32 r3603, [rd12+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3591, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3593, {high, high}; +} +ld.local.u32 r3608, [rd12+12]; +{ +mul.f16x2 r3595, r3608, r3593; +} +{ +fma.rn.f16x2 r3598, r3603, r3591, r3595; +} +st.local.u32 [rd12+8], r3598; +{ +mul.f16x2 r3602, r3603, r3593; +} +{ +neg.f16x2 r3605, r3602; +} 
+{ +fma.rn.f16x2 r3607, r3608, r3591, r3605; +} +st.local.u32 [rd12+12], r3607; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3615, {low, high}; +} +{ +mul.f16x2 r3616, r3576, r3615; +} +{ +mul.f16x2 r3619, r3587, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3587; +mov.b32 r3622, {high, low}; +} +{ +fma.rn.f16x2 r3624, r3616, r3622, r3619; +} +ld.local.u32 r3640, [rd12+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3628, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3630, {high, high}; +} +ld.local.u32 r3645, [rd12+20]; +{ +mul.f16x2 r3632, r3645, r3630; +} +{ +fma.rn.f16x2 r3635, r3640, r3628, r3632; +} +st.local.u32 [rd12+16], r3635; +{ +mul.f16x2 r3639, r3640, r3630; +} +{ +neg.f16x2 r3642, r3639; +} +{ +fma.rn.f16x2 r3644, r3645, r3628, r3642; +} +st.local.u32 [rd12+20], r3644; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3652, {low, high}; +} +{ +mul.f16x2 r3653, r3576, r3652; +} +{ +mul.f16x2 r3656, r3624, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3624; +mov.b32 r3659, {high, low}; +} +{ +fma.rn.f16x2 r3661, r3653, r3659, r3656; +} +ld.local.u32 r3677, [rd12+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3665, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3667, {high, high}; +} +ld.local.u32 r3682, [rd12+28]; +{ +mul.f16x2 r3669, r3682, r3667; +} +{ +fma.rn.f16x2 r3672, r3677, r3665, r3669; +} +st.local.u32 [rd12+24], r3672; +{ +mul.f16x2 r3676, r3677, r3667; +} +{ +neg.f16x2 r3679, r3676; +} +{ +fma.rn.f16x2 r3681, r3682, r3665, r3679; +} +st.local.u32 [rd12+28], r3681; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3689, {low, high}; +} +{ +mul.f16x2 r3690, r3576, r3689; +} +{ +mul.f16x2 r3693, r3661, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3661; +mov.b32 r3696, {high, 
low}; +} +{ +fma.rn.f16x2 r3698, r3690, r3696, r3693; +} +ld.local.u32 r3710, [rd12+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; +mov.b32 r3702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; +mov.b32 r3704, {high, high}; +} +ld.local.u32 r3719, [rd12+36]; +{ +mul.f16x2 r3706, r3719, r3704; +} +{ +fma.rn.f16x2 r3709, r3710, r3702, r3706; +} +st.local.u32 [rd12+32], r3709; +{ +mul.f16x2 r3713, r3710, r3704; +} +{ +neg.f16x2 r3716, r3713; +} +{ +fma.rn.f16x2 r3718, r3719, r3702, r3716; +} +st.local.u32 [rd12+36], r3718; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3726, {low, high}; +} +{ +mul.f16x2 r3727, r3576, r3726; +} +{ +mul.f16x2 r3730, r3698, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3698; +mov.b32 r3733, {high, low}; +} +{ +fma.rn.f16x2 r3735, r3727, r3733, r3730; +} +ld.local.u32 r3747, [rd12+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3741, {high, high}; +} +ld.local.u32 r3744, [rd12+44]; +{ +mul.f16x2 r3743, r3744, r3741; +} +{ +fma.rn.f16x2 r3746, r3747, r3739, r3743; +} +st.local.u32 [rd12+40], r3746; +{ +mul.f16x2 r3750, r3747, r3741; +} +{ +neg.f16x2 r3753, r3750; +} +{ +fma.rn.f16x2 r3755, r3744, r3739, r3753; +} +st.local.u32 [rd12+44], r3755; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3763, {low, high}; +} +{ +mul.f16x2 r3764, r3576, r3763; +} +{ +mul.f16x2 r3767, r3735, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3735; +mov.b32 r3770, {high, low}; +} +{ +fma.rn.f16x2 r3772, r3764, r3770, r3767; +} +ld.local.u32 r3784, [rd12+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3778, {high, high}; +} +ld.local.u32 r3781, [rd12+52]; +{ +mul.f16x2 r3780, r3781, r3778; +} +{ +fma.rn.f16x2 
r3783, r3784, r3776, r3780; +} +st.local.u32 [rd12+48], r3783; +{ +mul.f16x2 r3787, r3784, r3778; +} +{ +neg.f16x2 r3790, r3787; +} +{ +fma.rn.f16x2 r3792, r3781, r3776, r3790; +} +st.local.u32 [rd12+52], r3792; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3800, {low, high}; +} +{ +mul.f16x2 r3801, r3576, r3800; +} +{ +mul.f16x2 r3804, r3772, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3772; +mov.b32 r3807, {high, low}; +} +{ +fma.rn.f16x2 r3809, r3801, r3807, r3804; +} +ld.local.u32 r3821, [rd12+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3815, {high, high}; +} +ld.local.u32 r3818, [rd12+60]; +{ +mul.f16x2 r3817, r3818, r3815; +} +{ +fma.rn.f16x2 r3820, r3821, r3813, r3817; +} +st.local.u32 [rd12+56], r3820; +{ +mul.f16x2 r3824, r3821, r3815; +} +{ +neg.f16x2 r3827, r3824; +} +{ +fma.rn.f16x2 r3829, r3818, r3813, r3827; +} +st.local.u32 [rd12+60], r3829; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r3837, {low, high}; +} +{ +mul.f16x2 r3838, r3576, r3837; +} +{ +mul.f16x2 r3841, r3809, r3574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3809; +mov.b32 r3844, {high, low}; +} +{ +fma.rn.f16x2 r11310, r3838, r3844, r3841; +} +add.s32 r11309, r11309, 8; +setp.eq.s32 p1, r11309, 25; +@p1 bra LBB1_3; +bra.uni LBB1_2; +LBB1_3: +shl.b32 r7378, r1, 2; +mov.u32 r7379, %50; +add.s32 r7380, r7379, r7378; +mad.lo.s32 r12, r4, 62500, r7380; +barrier.sync 0; +mad.lo.s32 r7381, r3, 100, r12; +ld.local.u32 r7382, [rd3]; +st.shared.u32 [r7381], r7382; +ld.local.u32 r7383, [rd4+4]; +st.shared.u32 [r7381+4], r7383; +ld.local.u32 r7384, [rd4+12]; +st.shared.u32 [r7381+8], r7384; +ld.local.u32 r7385, [rd4+20]; +st.shared.u32 [r7381+12], r7385; +ld.local.u32 r7386, [rd4+28]; +st.shared.u32 [r7381+16], r7386; +ld.local.u32 r7387, [rd4+36]; +st.shared.u32 [r7381+20], r7387; 
+ld.local.u32 r7388, [rd4+44]; +st.shared.u32 [r7381+24], r7388; +ld.local.u32 r7389, [rd4+52]; +st.shared.u32 [r7381+28], r7389; +ld.local.u32 r7390, [rd4+60]; +st.shared.u32 [r7381+32], r7390; +ld.local.u32 r7391, [rd4+68]; +st.shared.u32 [r7381+36], r7391; +ld.local.u32 r7392, [rd4+76]; +st.shared.u32 [r7381+40], r7392; +ld.local.u32 r7393, [rd4+84]; +st.shared.u32 [r7381+44], r7393; +ld.local.u32 r7394, [rd4+92]; +st.shared.u32 [r7381+48], r7394; +ld.local.u32 r7395, [rd4+100]; +st.shared.u32 [r7381+52], r7395; +ld.local.u32 r7396, [rd4+108]; +st.shared.u32 [r7381+56], r7396; +ld.local.u32 r7397, [rd4+116]; +st.shared.u32 [r7381+60], r7397; +ld.local.u32 r7398, [rd4+124]; +st.shared.u32 [r7381+64], r7398; +ld.local.u32 r7399, [rd4+132]; +st.shared.u32 [r7381+68], r7399; +ld.local.u32 r7400, [rd4+140]; +st.shared.u32 [r7381+72], r7400; +ld.local.u32 r7401, [rd4+148]; +st.shared.u32 [r7381+76], r7401; +ld.local.u32 r7402, [rd4+156]; +st.shared.u32 [r7381+80], r7402; +ld.local.u32 r7403, [rd4+164]; +st.shared.u32 [r7381+84], r7403; +ld.local.u32 r7404, [rd4+172]; +st.shared.u32 [r7381+88], r7404; +ld.local.u32 r7405, [rd4+180]; +st.shared.u32 [r7381+92], r7405; +ld.local.u32 r7406, [rd4+188]; +st.shared.u32 [r7381+96], r7406; +barrier.sync 0; +mad.lo.s32 r13, r3, -96, r7381; +ld.shared.u32 r3864, [r13]; +ld.shared.u32 r4186, [r13+2500]; +ld.shared.u32 r4508, [r13+5000]; +ld.shared.u32 r4830, [r13+7500]; +ld.shared.u32 r5152, [r13+10000]; +ld.shared.u32 r3861, [r13+12500]; +ld.shared.u32 r4183, [r13+15000]; +ld.shared.u32 r4505, [r13+17500]; +ld.shared.u32 r4827, [r13+20000]; +ld.shared.u32 r5149, [r13+22500]; +ld.shared.u32 r3867, [r13+25000]; +ld.shared.u32 r4189, [r13+27500]; +ld.shared.u32 r4511, [r13+30000]; +ld.shared.u32 r4833, [r13+32500]; +ld.shared.u32 r5155, [r13+35000]; +ld.shared.u32 r3868, [r13+37500]; +ld.shared.u32 r4190, [r13+40000]; +ld.shared.u32 r4512, [r13+42500]; +ld.shared.u32 r4834, [r13+45000]; +ld.shared.u32 r5156, [r13+47500]; 
+ld.shared.u32 r3862, [r13+50000]; +ld.shared.u32 r4184, [r13+52500]; +ld.shared.u32 r4506, [r13+55000]; +ld.shared.u32 r4828, [r13+57500]; +ld.shared.u32 r5150, [r13+60000]; +barrier.sync 0; +ld.local.u32 r7407, [rd4]; +st.shared.u32 [r7381], r7407; +ld.local.u32 r7408, [rd4+8]; +st.shared.u32 [r7381+4], r7408; +ld.local.u32 r7409, [rd4+16]; +st.shared.u32 [r7381+8], r7409; +ld.local.u32 r7410, [rd4+24]; +st.shared.u32 [r7381+12], r7410; +ld.local.u32 r7411, [rd4+32]; +st.shared.u32 [r7381+16], r7411; +ld.local.u32 r7412, [rd4+40]; +st.shared.u32 [r7381+20], r7412; +ld.local.u32 r7413, [rd4+48]; +st.shared.u32 [r7381+24], r7413; +ld.local.u32 r7414, [rd4+56]; +st.shared.u32 [r7381+28], r7414; +ld.local.u32 r7415, [rd4+64]; +st.shared.u32 [r7381+32], r7415; +ld.local.u32 r7416, [rd4+72]; +st.shared.u32 [r7381+36], r7416; +ld.local.u32 r7417, [rd4+80]; +st.shared.u32 [r7381+40], r7417; +ld.local.u32 r7418, [rd4+88]; +st.shared.u32 [r7381+44], r7418; +ld.local.u32 r7419, [rd4+96]; +st.shared.u32 [r7381+48], r7419; +ld.local.u32 r7420, [rd4+104]; +st.shared.u32 [r7381+52], r7420; +ld.local.u32 r7421, [rd4+112]; +st.shared.u32 [r7381+56], r7421; +ld.local.u32 r7422, [rd4+120]; +st.shared.u32 [r7381+60], r7422; +ld.local.u32 r7423, [rd4+128]; +st.shared.u32 [r7381+64], r7423; +ld.local.u32 r7424, [rd4+136]; +st.shared.u32 [r7381+68], r7424; +ld.local.u32 r7425, [rd4+144]; +st.shared.u32 [r7381+72], r7425; +ld.local.u32 r7426, [rd4+152]; +st.shared.u32 [r7381+76], r7426; +ld.local.u32 r7427, [rd4+160]; +st.shared.u32 [r7381+80], r7427; +ld.local.u32 r7428, [rd4+168]; +st.shared.u32 [r7381+84], r7428; +ld.local.u32 r7429, [rd4+176]; +st.shared.u32 [r7381+88], r7429; +ld.local.u32 r7430, [rd4+184]; +st.shared.u32 [r7381+92], r7430; +ld.local.u32 r7431, [rd4+192]; +st.shared.u32 [r7381+96], r7431; +barrier.sync 0; +ld.shared.u32 r3876, [r13]; +ld.shared.u32 r4198, [r13+2500]; +ld.shared.u32 r4520, [r13+5000]; +ld.shared.u32 r4842, [r13+7500]; +ld.shared.u32 r5164, 
[r13+10000]; +ld.shared.u32 r3873, [r13+12500]; +ld.shared.u32 r4195, [r13+15000]; +ld.shared.u32 r4517, [r13+17500]; +ld.shared.u32 r4839, [r13+20000]; +ld.shared.u32 r5161, [r13+22500]; +ld.shared.u32 r3879, [r13+25000]; +ld.shared.u32 r4201, [r13+27500]; +ld.shared.u32 r4523, [r13+30000]; +ld.shared.u32 r4845, [r13+32500]; +ld.shared.u32 r5167, [r13+35000]; +ld.shared.u32 r3880, [r13+37500]; +ld.shared.u32 r4202, [r13+40000]; +ld.shared.u32 r4524, [r13+42500]; +ld.shared.u32 r4846, [r13+45000]; +ld.shared.u32 r5168, [r13+47500]; +ld.shared.u32 r3874, [r13+50000]; +ld.shared.u32 r4196, [r13+52500]; +ld.shared.u32 r4518, [r13+55000]; +ld.shared.u32 r4840, [r13+57500]; +ld.shared.u32 r5162, [r13+60000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3851, {low, high}; +} +{ +neg.f16x2 r3852, r3851; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r3854, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r3855, {low, high}; +} +{ +neg.f16x2 r3856, r3855; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r3858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r3859, {low, high}; +} +{ +add.f16x2 r3860, r3861, r3862; +} +{ +add.f16x2 r3863, r3864, r3860; +} +{ +add.f16x2 r3866, r3867, r3868; +} +{ +add.f16x2 r3869, r3863, r3866; +} +{ +add.f16x2 r3872, r3873, r3874; +} +{ +add.f16x2 r3875, r3876, r3872; +} +{ +add.f16x2 r3878, r3879, r3880; +} +{ +add.f16x2 r3881, r3875, r3878; +} +{ +add.f16x2 r3884, r3861, r3862; +} +{ +mul.f16x2 r3887, r3884, r3850; +} +{ +add.f16x2 r3890, r3864, r3887; +} +{ +add.f16x2 r3893, r3867, r3868; +} +{ +mul.f16x2 r3896, r3893, r3854; +} +{ +add.f16x2 r3899, r3890, r3896; +} +{ +sub.f16x2 r3902, 
r3873, r3874; +} +{ +mul.f16x2 r3905, r3902, r3852; +} +{ +sub.f16x2 r3908, r3879, r3880; +} +{ +mul.f16x2 r3911, r3908, r3856; +} +{ +add.f16x2 r3914, r3905, r3911; +} +{ +sub.f16x2 r3917, r3899, r3914; +} +{ +add.f16x2 r3920, r3861, r3862; +} +{ +mul.f16x2 r3923, r3920, r3850; +} +{ +add.f16x2 r3926, r3864, r3923; +} +{ +add.f16x2 r3929, r3867, r3868; +} +{ +mul.f16x2 r3932, r3929, r3854; +} +{ +add.f16x2 r3935, r3926, r3932; +} +{ +sub.f16x2 r3938, r3873, r3874; +} +{ +mul.f16x2 r3941, r3938, r3852; +} +{ +sub.f16x2 r3944, r3879, r3880; +} +{ +mul.f16x2 r3947, r3944, r3856; +} +{ +add.f16x2 r3950, r3941, r3947; +} +{ +add.f16x2 r3953, r3935, r3950; +} +{ +add.f16x2 r3956, r3861, r3862; +} +{ +mul.f16x2 r3959, r3956, r3854; +} +{ +add.f16x2 r3962, r3864, r3959; +} +{ +add.f16x2 r3965, r3867, r3868; +} +{ +mul.f16x2 r3968, r3965, r3858; +} +{ +add.f16x2 r3971, r3962, r3968; +} +{ +sub.f16x2 r3974, r3873, r3874; +} +{ +mul.f16x2 r3977, r3974, r3856; +} +{ +sub.f16x2 r3980, r3879, r3880; +} +{ +mul.f16x2 r3983, r3980, r3859; +} +{ +add.f16x2 r3986, r3977, r3983; +} +{ +sub.f16x2 r3989, r3971, r3986; +} +{ +add.f16x2 r3992, r3861, r3862; +} +{ +mul.f16x2 r3995, r3992, r3854; +} +{ +add.f16x2 r3998, r3864, r3995; +} +{ +add.f16x2 r4001, r3867, r3868; +} +{ +mul.f16x2 r4004, r4001, r3858; +} +{ +add.f16x2 r4007, r3998, r4004; +} +{ +sub.f16x2 r4010, r3873, r3874; +} +{ +mul.f16x2 r4013, r4010, r3856; +} +{ +sub.f16x2 r4016, r3879, r3880; +} +{ +mul.f16x2 r4019, r4016, r3859; +} +{ +add.f16x2 r4022, r4013, r4019; +} +{ +add.f16x2 r4025, r4007, r4022; +} +{ +add.f16x2 r4028, r3873, r3874; +} +{ +mul.f16x2 r4031, r4028, r3850; +} +{ +add.f16x2 r4034, r3876, r4031; +} +{ +add.f16x2 r4037, r3879, r3880; +} +{ +mul.f16x2 r4040, r4037, r3854; +} +{ +add.f16x2 r4043, r4034, r4040; +} +{ +sub.f16x2 r4046, r3861, r3862; +} +{ +mul.f16x2 r4049, r4046, r3852; +} +{ +sub.f16x2 r4052, r3867, r3868; +} +{ +mul.f16x2 r4055, r4052, r3856; +} +{ +add.f16x2 r4058, r4049, r4055; +} +{ 
+add.f16x2 r4061, r4043, r4058; +} +{ +add.f16x2 r4064, r3873, r3874; +} +{ +mul.f16x2 r4067, r4064, r3850; +} +{ +add.f16x2 r4070, r3876, r4067; +} +{ +add.f16x2 r4073, r3879, r3880; +} +{ +mul.f16x2 r4076, r4073, r3854; +} +{ +add.f16x2 r4079, r4070, r4076; +} +{ +sub.f16x2 r4082, r3861, r3862; +} +{ +mul.f16x2 r4085, r4082, r3852; +} +{ +sub.f16x2 r4088, r3867, r3868; +} +{ +mul.f16x2 r4091, r4088, r3856; +} +{ +add.f16x2 r4094, r4085, r4091; +} +{ +sub.f16x2 r4097, r4079, r4094; +} +{ +add.f16x2 r4100, r3873, r3874; +} +{ +mul.f16x2 r4103, r4100, r3854; +} +{ +add.f16x2 r4106, r3876, r4103; +} +{ +add.f16x2 r4109, r3879, r3880; +} +{ +mul.f16x2 r4112, r4109, r3858; +} +{ +add.f16x2 r4115, r4106, r4112; +} +{ +sub.f16x2 r4118, r3861, r3862; +} +{ +mul.f16x2 r4121, r4118, r3856; +} +{ +sub.f16x2 r4124, r3867, r3868; +} +{ +mul.f16x2 r4127, r4124, r3859; +} +{ +add.f16x2 r4130, r4121, r4127; +} +{ +add.f16x2 r4133, r4115, r4130; +} +{ +add.f16x2 r4136, r3873, r3874; +} +{ +mul.f16x2 r4139, r4136, r3854; +} +{ +add.f16x2 r4142, r3876, r4139; +} +{ +add.f16x2 r4145, r3879, r3880; +} +{ +mul.f16x2 r4148, r4145, r3858; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +sub.f16x2 r4154, r3861, r3862; +} +{ +mul.f16x2 r4157, r4154, r3856; +} +{ +sub.f16x2 r4160, r3867, r3868; +} +{ +mul.f16x2 r4163, r4160, r3859; +} +{ +add.f16x2 r4166, r4157, r4163; +} +{ +sub.f16x2 r4169, r4151, r4166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4173, {low, high}; +} +{ +neg.f16x2 r4174, r4173; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4176, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4177, {low, high}; +} +{ +neg.f16x2 r4178, r4177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; 
+mov.b32 r4180, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4181, {low, high}; +} +{ +add.f16x2 r4182, r4183, r4184; +} +{ +add.f16x2 r4185, r4186, r4182; +} +{ +add.f16x2 r4188, r4189, r4190; +} +{ +add.f16x2 r4191, r4185, r4188; +} +{ +add.f16x2 r4194, r4195, r4196; +} +{ +add.f16x2 r4197, r4198, r4194; +} +{ +add.f16x2 r4200, r4201, r4202; +} +{ +add.f16x2 r4203, r4197, r4200; +} +{ +add.f16x2 r4206, r4183, r4184; +} +{ +mul.f16x2 r4209, r4206, r4172; +} +{ +add.f16x2 r4212, r4186, r4209; +} +{ +add.f16x2 r4215, r4189, r4190; +} +{ +mul.f16x2 r4218, r4215, r4176; +} +{ +add.f16x2 r4221, r4212, r4218; +} +{ +sub.f16x2 r4224, r4195, r4196; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +sub.f16x2 r4230, r4201, r4202; +} +{ +mul.f16x2 r4233, r4230, r4178; +} +{ +add.f16x2 r4236, r4227, r4233; +} +{ +sub.f16x2 r4239, r4221, r4236; +} +{ +add.f16x2 r4242, r4183, r4184; +} +{ +mul.f16x2 r4245, r4242, r4172; +} +{ +add.f16x2 r4248, r4186, r4245; +} +{ +add.f16x2 r4251, r4189, r4190; +} +{ +mul.f16x2 r4254, r4251, r4176; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +sub.f16x2 r4260, r4195, r4196; +} +{ +mul.f16x2 r4263, r4260, r4174; +} +{ +sub.f16x2 r4266, r4201, r4202; +} +{ +mul.f16x2 r4269, r4266, r4178; +} +{ +add.f16x2 r4272, r4263, r4269; +} +{ +add.f16x2 r4275, r4257, r4272; +} +{ +add.f16x2 r4278, r4183, r4184; +} +{ +mul.f16x2 r4281, r4278, r4176; +} +{ +add.f16x2 r4284, r4186, r4281; +} +{ +add.f16x2 r4287, r4189, r4190; +} +{ +mul.f16x2 r4290, r4287, r4180; +} +{ +add.f16x2 r4293, r4284, r4290; +} +{ +sub.f16x2 r4296, r4195, r4196; +} +{ +mul.f16x2 r4299, r4296, r4178; +} +{ +sub.f16x2 r4302, r4201, r4202; +} +{ +mul.f16x2 r4305, r4302, r4181; +} +{ +add.f16x2 r4308, r4299, r4305; +} +{ +sub.f16x2 r4311, r4293, r4308; +} +{ +add.f16x2 r4314, r4183, r4184; +} +{ +mul.f16x2 r4317, r4314, r4176; +} +{ +add.f16x2 r4320, r4186, r4317; +} +{ +add.f16x2 r4323, r4189, r4190; +} +{ +mul.f16x2 r4326, r4323, 
r4180; +} +{ +add.f16x2 r4329, r4320, r4326; +} +{ +sub.f16x2 r4332, r4195, r4196; +} +{ +mul.f16x2 r4335, r4332, r4178; +} +{ +sub.f16x2 r4338, r4201, r4202; +} +{ +mul.f16x2 r4341, r4338, r4181; +} +{ +add.f16x2 r4344, r4335, r4341; +} +{ +add.f16x2 r4347, r4329, r4344; +} +{ +add.f16x2 r4350, r4195, r4196; +} +{ +mul.f16x2 r4353, r4350, r4172; +} +{ +add.f16x2 r4356, r4198, r4353; +} +{ +add.f16x2 r4359, r4201, r4202; +} +{ +mul.f16x2 r4362, r4359, r4176; +} +{ +add.f16x2 r4365, r4356, r4362; +} +{ +sub.f16x2 r4368, r4183, r4184; +} +{ +mul.f16x2 r4371, r4368, r4174; +} +{ +sub.f16x2 r4374, r4189, r4190; +} +{ +mul.f16x2 r4377, r4374, r4178; +} +{ +add.f16x2 r4380, r4371, r4377; +} +{ +add.f16x2 r4383, r4365, r4380; +} +{ +add.f16x2 r4386, r4195, r4196; +} +{ +mul.f16x2 r4389, r4386, r4172; +} +{ +add.f16x2 r4392, r4198, r4389; +} +{ +add.f16x2 r4395, r4201, r4202; +} +{ +mul.f16x2 r4398, r4395, r4176; +} +{ +add.f16x2 r4401, r4392, r4398; +} +{ +sub.f16x2 r4404, r4183, r4184; +} +{ +mul.f16x2 r4407, r4404, r4174; +} +{ +sub.f16x2 r4410, r4189, r4190; +} +{ +mul.f16x2 r4413, r4410, r4178; +} +{ +add.f16x2 r4416, r4407, r4413; +} +{ +sub.f16x2 r4419, r4401, r4416; +} +{ +add.f16x2 r4422, r4195, r4196; +} +{ +mul.f16x2 r4425, r4422, r4176; +} +{ +add.f16x2 r4428, r4198, r4425; +} +{ +add.f16x2 r4431, r4201, r4202; +} +{ +mul.f16x2 r4434, r4431, r4180; +} +{ +add.f16x2 r4437, r4428, r4434; +} +{ +sub.f16x2 r4440, r4183, r4184; +} +{ +mul.f16x2 r4443, r4440, r4178; +} +{ +sub.f16x2 r4446, r4189, r4190; +} +{ +mul.f16x2 r4449, r4446, r4181; +} +{ +add.f16x2 r4452, r4443, r4449; +} +{ +add.f16x2 r4455, r4437, r4452; +} +{ +add.f16x2 r4458, r4195, r4196; +} +{ +mul.f16x2 r4461, r4458, r4176; +} +{ +add.f16x2 r4464, r4198, r4461; +} +{ +add.f16x2 r4467, r4201, r4202; +} +{ +mul.f16x2 r4470, r4467, r4180; +} +{ +add.f16x2 r4473, r4464, r4470; +} +{ +sub.f16x2 r4476, r4183, r4184; +} +{ +mul.f16x2 r4479, r4476, r4178; +} +{ +sub.f16x2 r4482, r4189, r4190; +} +{ +mul.f16x2 
r4485, r4482, r4181; +} +{ +add.f16x2 r4488, r4479, r4485; +} +{ +sub.f16x2 r4491, r4473, r4488; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4495, {low, high}; +} +{ +neg.f16x2 r4496, r4495; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4502, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4503, {low, high}; +} +{ +add.f16x2 r4504, r4505, r4506; +} +{ +add.f16x2 r4507, r4508, r4504; +} +{ +add.f16x2 r4510, r4511, r4512; +} +{ +add.f16x2 r4513, r4507, r4510; +} +{ +add.f16x2 r4516, r4517, r4518; +} +{ +add.f16x2 r4519, r4520, r4516; +} +{ +add.f16x2 r4522, r4523, r4524; +} +{ +add.f16x2 r4525, r4519, r4522; +} +{ +add.f16x2 r4528, r4505, r4506; +} +{ +mul.f16x2 r4531, r4528, r4494; +} +{ +add.f16x2 r4534, r4508, r4531; +} +{ +add.f16x2 r4537, r4511, r4512; +} +{ +mul.f16x2 r4540, r4537, r4498; +} +{ +add.f16x2 r4543, r4534, r4540; +} +{ +sub.f16x2 r4546, r4517, r4518; +} +{ +mul.f16x2 r4549, r4546, r4496; +} +{ +sub.f16x2 r4552, r4523, r4524; +} +{ +mul.f16x2 r4555, r4552, r4500; +} +{ +add.f16x2 r4558, r4549, r4555; +} +{ +sub.f16x2 r4561, r4543, r4558; +} +{ +add.f16x2 r4564, r4505, r4506; +} +{ +mul.f16x2 r4567, r4564, r4494; +} +{ +add.f16x2 r4570, r4508, r4567; +} +{ +add.f16x2 r4573, r4511, r4512; +} +{ +mul.f16x2 r4576, r4573, r4498; +} +{ +add.f16x2 r4579, r4570, r4576; +} +{ +sub.f16x2 r4582, r4517, r4518; +} +{ +mul.f16x2 r4585, r4582, r4496; +} +{ +sub.f16x2 r4588, r4523, r4524; +} +{ +mul.f16x2 r4591, r4588, r4500; +} +{ +add.f16x2 r4594, 
r4585, r4591; +} +{ +add.f16x2 r4597, r4579, r4594; +} +{ +add.f16x2 r4600, r4505, r4506; +} +{ +mul.f16x2 r4603, r4600, r4498; +} +{ +add.f16x2 r4606, r4508, r4603; +} +{ +add.f16x2 r4609, r4511, r4512; +} +{ +mul.f16x2 r4612, r4609, r4502; +} +{ +add.f16x2 r4615, r4606, r4612; +} +{ +sub.f16x2 r4618, r4517, r4518; +} +{ +mul.f16x2 r4621, r4618, r4500; +} +{ +sub.f16x2 r4624, r4523, r4524; +} +{ +mul.f16x2 r4627, r4624, r4503; +} +{ +add.f16x2 r4630, r4621, r4627; +} +{ +sub.f16x2 r4633, r4615, r4630; +} +{ +add.f16x2 r4636, r4505, r4506; +} +{ +mul.f16x2 r4639, r4636, r4498; +} +{ +add.f16x2 r4642, r4508, r4639; +} +{ +add.f16x2 r4645, r4511, r4512; +} +{ +mul.f16x2 r4648, r4645, r4502; +} +{ +add.f16x2 r4651, r4642, r4648; +} +{ +sub.f16x2 r4654, r4517, r4518; +} +{ +mul.f16x2 r4657, r4654, r4500; +} +{ +sub.f16x2 r4660, r4523, r4524; +} +{ +mul.f16x2 r4663, r4660, r4503; +} +{ +add.f16x2 r4666, r4657, r4663; +} +{ +add.f16x2 r4669, r4651, r4666; +} +{ +add.f16x2 r4672, r4517, r4518; +} +{ +mul.f16x2 r4675, r4672, r4494; +} +{ +add.f16x2 r4678, r4520, r4675; +} +{ +add.f16x2 r4681, r4523, r4524; +} +{ +mul.f16x2 r4684, r4681, r4498; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +sub.f16x2 r4690, r4505, r4506; +} +{ +mul.f16x2 r4693, r4690, r4496; +} +{ +sub.f16x2 r4696, r4511, r4512; +} +{ +mul.f16x2 r4699, r4696, r4500; +} +{ +add.f16x2 r4702, r4693, r4699; +} +{ +add.f16x2 r4705, r4687, r4702; +} +{ +add.f16x2 r4708, r4517, r4518; +} +{ +mul.f16x2 r4711, r4708, r4494; +} +{ +add.f16x2 r4714, r4520, r4711; +} +{ +add.f16x2 r4717, r4523, r4524; +} +{ +mul.f16x2 r4720, r4717, r4498; +} +{ +add.f16x2 r4723, r4714, r4720; +} +{ +sub.f16x2 r4726, r4505, r4506; +} +{ +mul.f16x2 r4729, r4726, r4496; +} +{ +sub.f16x2 r4732, r4511, r4512; +} +{ +mul.f16x2 r4735, r4732, r4500; +} +{ +add.f16x2 r4738, r4729, r4735; +} +{ +sub.f16x2 r4741, r4723, r4738; +} +{ +add.f16x2 r4744, r4517, r4518; +} +{ +mul.f16x2 r4747, r4744, r4498; +} +{ +add.f16x2 r4750, r4520, r4747; +} +{ 
+add.f16x2 r4753, r4523, r4524; +} +{ +mul.f16x2 r4756, r4753, r4502; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +sub.f16x2 r4762, r4505, r4506; +} +{ +mul.f16x2 r4765, r4762, r4500; +} +{ +sub.f16x2 r4768, r4511, r4512; +} +{ +mul.f16x2 r4771, r4768, r4503; +} +{ +add.f16x2 r4774, r4765, r4771; +} +{ +add.f16x2 r4777, r4759, r4774; +} +{ +add.f16x2 r4780, r4517, r4518; +} +{ +mul.f16x2 r4783, r4780, r4498; +} +{ +add.f16x2 r4786, r4520, r4783; +} +{ +add.f16x2 r4789, r4523, r4524; +} +{ +mul.f16x2 r4792, r4789, r4502; +} +{ +add.f16x2 r4795, r4786, r4792; +} +{ +sub.f16x2 r4798, r4505, r4506; +} +{ +mul.f16x2 r4801, r4798, r4500; +} +{ +sub.f16x2 r4804, r4511, r4512; +} +{ +mul.f16x2 r4807, r4804, r4503; +} +{ +add.f16x2 r4810, r4801, r4807; +} +{ +sub.f16x2 r4813, r4795, r4810; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4817, {low, high}; +} +{ +neg.f16x2 r4818, r4817; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r4820, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r4821, {low, high}; +} +{ +neg.f16x2 r4822, r4821; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r4824, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r4825, {low, high}; +} +{ +add.f16x2 r4826, r4827, r4828; +} +{ +add.f16x2 r4829, r4830, r4826; +} +{ +add.f16x2 r4832, r4833, r4834; +} +{ +add.f16x2 r4835, r4829, r4832; +} +{ +add.f16x2 r4838, r4839, r4840; +} +{ +add.f16x2 r4841, r4842, r4838; +} +{ +add.f16x2 r4844, r4845, r4846; +} +{ +add.f16x2 r4847, r4841, r4844; +} +{ +add.f16x2 r4850, r4827, r4828; +} +{ +mul.f16x2 r4853, r4850, r4816; +} +{ +add.f16x2 r4856, r4830, r4853; +} +{ +add.f16x2 r4859, r4833, r4834; +} +{ +mul.f16x2 
r4862, r4859, r4820; +} +{ +add.f16x2 r4865, r4856, r4862; +} +{ +sub.f16x2 r4868, r4839, r4840; +} +{ +mul.f16x2 r4871, r4868, r4818; +} +{ +sub.f16x2 r4874, r4845, r4846; +} +{ +mul.f16x2 r4877, r4874, r4822; +} +{ +add.f16x2 r4880, r4871, r4877; +} +{ +sub.f16x2 r4883, r4865, r4880; +} +{ +add.f16x2 r4886, r4827, r4828; +} +{ +mul.f16x2 r4889, r4886, r4816; +} +{ +add.f16x2 r4892, r4830, r4889; +} +{ +add.f16x2 r4895, r4833, r4834; +} +{ +mul.f16x2 r4898, r4895, r4820; +} +{ +add.f16x2 r4901, r4892, r4898; +} +{ +sub.f16x2 r4904, r4839, r4840; +} +{ +mul.f16x2 r4907, r4904, r4818; +} +{ +sub.f16x2 r4910, r4845, r4846; +} +{ +mul.f16x2 r4913, r4910, r4822; +} +{ +add.f16x2 r4916, r4907, r4913; +} +{ +add.f16x2 r4919, r4901, r4916; +} +{ +add.f16x2 r4922, r4827, r4828; +} +{ +mul.f16x2 r4925, r4922, r4820; +} +{ +add.f16x2 r4928, r4830, r4925; +} +{ +add.f16x2 r4931, r4833, r4834; +} +{ +mul.f16x2 r4934, r4931, r4824; +} +{ +add.f16x2 r4937, r4928, r4934; +} +{ +sub.f16x2 r4940, r4839, r4840; +} +{ +mul.f16x2 r4943, r4940, r4822; +} +{ +sub.f16x2 r4946, r4845, r4846; +} +{ +mul.f16x2 r4949, r4946, r4825; +} +{ +add.f16x2 r4952, r4943, r4949; +} +{ +sub.f16x2 r4955, r4937, r4952; +} +{ +add.f16x2 r4958, r4827, r4828; +} +{ +mul.f16x2 r4961, r4958, r4820; +} +{ +add.f16x2 r4964, r4830, r4961; +} +{ +add.f16x2 r4967, r4833, r4834; +} +{ +mul.f16x2 r4970, r4967, r4824; +} +{ +add.f16x2 r4973, r4964, r4970; +} +{ +sub.f16x2 r4976, r4839, r4840; +} +{ +mul.f16x2 r4979, r4976, r4822; +} +{ +sub.f16x2 r4982, r4845, r4846; +} +{ +mul.f16x2 r4985, r4982, r4825; +} +{ +add.f16x2 r4988, r4979, r4985; +} +{ +add.f16x2 r4991, r4973, r4988; +} +{ +add.f16x2 r4994, r4839, r4840; +} +{ +mul.f16x2 r4997, r4994, r4816; +} +{ +add.f16x2 r5000, r4842, r4997; +} +{ +add.f16x2 r5003, r4845, r4846; +} +{ +mul.f16x2 r5006, r5003, r4820; +} +{ +add.f16x2 r5009, r5000, r5006; +} +{ +sub.f16x2 r5012, r4827, r4828; +} +{ +mul.f16x2 r5015, r5012, r4818; +} +{ +sub.f16x2 r5018, r4833, r4834; +} 
+{ +mul.f16x2 r5021, r5018, r4822; +} +{ +add.f16x2 r5024, r5015, r5021; +} +{ +add.f16x2 r5027, r5009, r5024; +} +{ +add.f16x2 r5030, r4839, r4840; +} +{ +mul.f16x2 r5033, r5030, r4816; +} +{ +add.f16x2 r5036, r4842, r5033; +} +{ +add.f16x2 r5039, r4845, r4846; +} +{ +mul.f16x2 r5042, r5039, r4820; +} +{ +add.f16x2 r5045, r5036, r5042; +} +{ +sub.f16x2 r5048, r4827, r4828; +} +{ +mul.f16x2 r5051, r5048, r4818; +} +{ +sub.f16x2 r5054, r4833, r4834; +} +{ +mul.f16x2 r5057, r5054, r4822; +} +{ +add.f16x2 r5060, r5051, r5057; +} +{ +sub.f16x2 r5063, r5045, r5060; +} +{ +add.f16x2 r5066, r4839, r4840; +} +{ +mul.f16x2 r5069, r5066, r4820; +} +{ +add.f16x2 r5072, r4842, r5069; +} +{ +add.f16x2 r5075, r4845, r4846; +} +{ +mul.f16x2 r5078, r5075, r4824; +} +{ +add.f16x2 r5081, r5072, r5078; +} +{ +sub.f16x2 r5084, r4827, r4828; +} +{ +mul.f16x2 r5087, r5084, r4822; +} +{ +sub.f16x2 r5090, r4833, r4834; +} +{ +mul.f16x2 r5093, r5090, r4825; +} +{ +add.f16x2 r5096, r5087, r5093; +} +{ +add.f16x2 r5099, r5081, r5096; +} +{ +add.f16x2 r5102, r4839, r4840; +} +{ +mul.f16x2 r5105, r5102, r4820; +} +{ +add.f16x2 r5108, r4842, r5105; +} +{ +add.f16x2 r5111, r4845, r4846; +} +{ +mul.f16x2 r5114, r5111, r4824; +} +{ +add.f16x2 r5117, r5108, r5114; +} +{ +sub.f16x2 r5120, r4827, r4828; +} +{ +mul.f16x2 r5123, r5120, r4822; +} +{ +sub.f16x2 r5126, r4833, r4834; +} +{ +mul.f16x2 r5129, r5126, r4825; +} +{ +add.f16x2 r5132, r5123, r5129; +} +{ +sub.f16x2 r5135, r5117, r5132; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5138, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5139, {low, high}; +} +{ +neg.f16x2 r5140, r5139; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5142, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5143, {low, high}; +} +{ +neg.f16x2 r5144, r5143; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5146, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5147, {low, high}; +} +{ +add.f16x2 r5148, r5149, r5150; +} +{ +add.f16x2 r5151, r5152, r5148; +} +{ +add.f16x2 r5154, r5155, r5156; +} +{ +add.f16x2 r5157, r5151, r5154; +} +{ +add.f16x2 r5160, r5161, r5162; +} +{ +add.f16x2 r5163, r5164, r5160; +} +{ +add.f16x2 r5166, r5167, r5168; +} +{ +add.f16x2 r5169, r5163, r5166; +} +{ +add.f16x2 r5172, r5149, r5150; +} +{ +mul.f16x2 r5175, r5172, r5138; +} +{ +add.f16x2 r5178, r5152, r5175; +} +{ +add.f16x2 r5181, r5155, r5156; +} +{ +mul.f16x2 r5184, r5181, r5142; +} +{ +add.f16x2 r5187, r5178, r5184; +} +{ +sub.f16x2 r5190, r5161, r5162; +} +{ +mul.f16x2 r5193, r5190, r5140; +} +{ +sub.f16x2 r5196, r5167, r5168; +} +{ +mul.f16x2 r5199, r5196, r5144; +} +{ +add.f16x2 r5202, r5193, r5199; +} +{ +sub.f16x2 r5205, r5187, r5202; +} +{ +add.f16x2 r5208, r5149, r5150; +} +{ +mul.f16x2 r5211, r5208, r5138; +} +{ +add.f16x2 r5214, r5152, r5211; +} +{ +add.f16x2 r5217, r5155, r5156; +} +{ +mul.f16x2 r5220, r5217, r5142; +} +{ +add.f16x2 r5223, r5214, r5220; +} +{ +sub.f16x2 r5226, r5161, r5162; +} +{ +mul.f16x2 r5229, r5226, r5140; +} +{ +sub.f16x2 r5232, r5167, r5168; +} +{ +mul.f16x2 r5235, r5232, r5144; +} +{ +add.f16x2 r5238, r5229, r5235; +} +{ +add.f16x2 r5241, r5223, r5238; +} +{ +add.f16x2 r5244, r5149, r5150; +} +{ +mul.f16x2 r5247, r5244, r5142; +} +{ +add.f16x2 r5250, r5152, r5247; +} +{ +add.f16x2 r5253, r5155, r5156; +} +{ +mul.f16x2 r5256, r5253, r5146; +} +{ +add.f16x2 r5259, r5250, r5256; +} +{ +sub.f16x2 r5262, r5161, r5162; +} +{ +mul.f16x2 r5265, r5262, r5144; +} +{ +sub.f16x2 r5268, r5167, r5168; +} +{ +mul.f16x2 r5271, r5268, r5147; +} +{ +add.f16x2 r5274, r5265, r5271; +} +{ +sub.f16x2 r5277, r5259, r5274; +} +{ +add.f16x2 r5280, r5149, r5150; +} +{ +mul.f16x2 r5283, r5280, r5142; +} +{ +add.f16x2 r5286, r5152, 
r5283; +} +{ +add.f16x2 r5289, r5155, r5156; +} +{ +mul.f16x2 r5292, r5289, r5146; +} +{ +add.f16x2 r5295, r5286, r5292; +} +{ +sub.f16x2 r5298, r5161, r5162; +} +{ +mul.f16x2 r5301, r5298, r5144; +} +{ +sub.f16x2 r5304, r5167, r5168; +} +{ +mul.f16x2 r5307, r5304, r5147; +} +{ +add.f16x2 r5310, r5301, r5307; +} +{ +add.f16x2 r5313, r5295, r5310; +} +{ +add.f16x2 r5316, r5161, r5162; +} +{ +mul.f16x2 r5319, r5316, r5138; +} +{ +add.f16x2 r5322, r5164, r5319; +} +{ +add.f16x2 r5325, r5167, r5168; +} +{ +mul.f16x2 r5328, r5325, r5142; +} +{ +add.f16x2 r5331, r5322, r5328; +} +{ +sub.f16x2 r5334, r5149, r5150; +} +{ +mul.f16x2 r5337, r5334, r5140; +} +{ +sub.f16x2 r5340, r5155, r5156; +} +{ +mul.f16x2 r5343, r5340, r5144; +} +{ +add.f16x2 r5346, r5337, r5343; +} +{ +add.f16x2 r5349, r5331, r5346; +} +{ +add.f16x2 r5352, r5161, r5162; +} +{ +mul.f16x2 r5355, r5352, r5138; +} +{ +add.f16x2 r5358, r5164, r5355; +} +{ +add.f16x2 r5361, r5167, r5168; +} +{ +mul.f16x2 r5364, r5361, r5142; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +sub.f16x2 r5370, r5149, r5150; +} +{ +mul.f16x2 r5373, r5370, r5140; +} +{ +sub.f16x2 r5376, r5155, r5156; +} +{ +mul.f16x2 r5379, r5376, r5144; +} +{ +add.f16x2 r5382, r5373, r5379; +} +{ +sub.f16x2 r5385, r5367, r5382; +} +{ +add.f16x2 r5388, r5161, r5162; +} +{ +mul.f16x2 r5391, r5388, r5142; +} +{ +add.f16x2 r5394, r5164, r5391; +} +{ +add.f16x2 r5397, r5167, r5168; +} +{ +mul.f16x2 r5400, r5397, r5146; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5149, r5150; +} +{ +mul.f16x2 r5409, r5406, r5144; +} +{ +sub.f16x2 r5412, r5155, r5156; +} +{ +mul.f16x2 r5415, r5412, r5147; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +add.f16x2 r5421, r5403, r5418; +} +{ +add.f16x2 r5424, r5161, r5162; +} +{ +mul.f16x2 r5427, r5424, r5142; +} +{ +add.f16x2 r5430, r5164, r5427; +} +{ +add.f16x2 r5433, r5167, r5168; +} +{ +mul.f16x2 r5436, r5433, r5146; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5149, r5150; +} +{ +mul.f16x2 
r5445, r5442, r5144; +} +{ +sub.f16x2 r5448, r5155, r5156; +} +{ +mul.f16x2 r5451, r5448, r5147; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +sub.f16x2 r5457, r5439, r5454; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r5460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r5461, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r5462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r5463, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r5464, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r5465, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r5466, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r5467, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5470, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r5471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r5474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r5475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r5477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r5483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; 
+cvt.rn.f16.f32 high, f122; +mov.b32 r5490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r5491, {low, high}; +} +{ +mul.f16x2 r5508, r4239, r5460; +} +{ +mul.f16x2 r5511, r4383, r5461; +} +{ +sub.f16x2 r5514, r5508, r5511; +} +{ +mul.f16x2 r5517, r4239, r5461; +} +{ +fma.rn.f16x2 r5520, r4383, r5460, r5517; +} +{ +mul.f16x2 r5524, r4561, r5462; +} +{ +mul.f16x2 r5527, r4705, r5463; +} +{ +sub.f16x2 r5530, r5524, r5527; +} +{ +mul.f16x2 r5533, r4561, r5463; +} +{ +fma.rn.f16x2 r5536, r4705, r5462, r5533; +} +{ +mul.f16x2 r5540, r4883, r5464; +} +{ +mul.f16x2 r5543, r5027, r5465; +} +{ +sub.f16x2 r5546, r5540, r5543; +} +{ +mul.f16x2 r5549, r4883, r5465; +} +{ +fma.rn.f16x2 r5552, r5027, r5464, r5549; +} +{ +mul.f16x2 r5556, r5205, r5466; +} +{ +mul.f16x2 r5559, r5349, r5467; +} +{ +sub.f16x2 r5562, r5556, r5559; +} +{ +mul.f16x2 r5565, r5205, r5467; +} +{ +fma.rn.f16x2 r5568, r5349, r5466, r5565; +} +{ +mul.f16x2 r5572, r4311, r5462; +} +{ +mul.f16x2 r5575, r4455, r5463; +} +{ +sub.f16x2 r5578, r5572, r5575; +} +{ +mul.f16x2 r5581, r4311, r5463; +} +{ +fma.rn.f16x2 r5584, r4455, r5462, r5581; +} +{ +mul.f16x2 r5588, r4633, r5466; +} +{ +mul.f16x2 r5591, r4777, r5467; +} +{ +sub.f16x2 r5594, r5588, r5591; +} +{ +mul.f16x2 r5597, r4633, r5467; +} +{ +fma.rn.f16x2 r5600, r4777, r5466, r5597; +} +{ +mul.f16x2 r5604, r4955, r5470; +} +{ +mul.f16x2 r5607, r5099, r5471; +} +{ +sub.f16x2 r5610, r5604, r5607; +} +{ +mul.f16x2 r5613, r4955, r5471; +} +{ +fma.rn.f16x2 r5616, r5099, r5470, r5613; +} +{ +mul.f16x2 r5620, r5277, r5474; +} +{ +mul.f16x2 r5623, r5421, r5475; +} +{ +sub.f16x2 r5626, r5620, r5623; +} +{ +mul.f16x2 r5629, r5277, r5475; +} +{ +fma.rn.f16x2 r5632, r5421, r5474, r5629; +} +{ +mul.f16x2 r5636, r4347, r5464; +} +{ +mul.f16x2 r5639, r4491, r5465; +} +{ +sub.f16x2 r5642, r5636, r5639; +} +{ +mul.f16x2 r5645, r4347, r5465; +} +{ +fma.rn.f16x2 r5648, r4491, r5464, r5645; +} +{ +mul.f16x2 r5652, r4669, 
r5470; +} +{ +mul.f16x2 r5655, r4813, r5471; +} +{ +sub.f16x2 r5658, r5652, r5655; +} +{ +mul.f16x2 r5661, r4669, r5471; +} +{ +fma.rn.f16x2 r5664, r4813, r5470, r5661; +} +{ +mul.f16x2 r5668, r4991, r5476; +} +{ +mul.f16x2 r5671, r5135, r5477; +} +{ +sub.f16x2 r5674, r5668, r5671; +} +{ +mul.f16x2 r5677, r4991, r5477; +} +{ +fma.rn.f16x2 r5680, r5135, r5476, r5677; +} +{ +mul.f16x2 r5684, r5313, r5482; +} +{ +mul.f16x2 r5687, r5457, r5483; +} +{ +sub.f16x2 r5690, r5684, r5687; +} +{ +mul.f16x2 r5693, r5313, r5483; +} +{ +fma.rn.f16x2 r5696, r5457, r5482, r5693; +} +{ +mul.f16x2 r5700, r4275, r5466; +} +{ +mul.f16x2 r5703, r4419, r5467; +} +{ +sub.f16x2 r5706, r5700, r5703; +} +{ +mul.f16x2 r5709, r4275, r5467; +} +{ +fma.rn.f16x2 r5712, r4419, r5466, r5709; +} +{ +mul.f16x2 r5716, r4597, r5474; +} +{ +mul.f16x2 r5719, r4741, r5475; +} +{ +sub.f16x2 r5722, r5716, r5719; +} +{ +mul.f16x2 r5725, r4597, r5475; +} +{ +fma.rn.f16x2 r5728, r4741, r5474, r5725; +} +{ +mul.f16x2 r5732, r4919, r5482; +} +{ +mul.f16x2 r5735, r5063, r5483; +} +{ +sub.f16x2 r5738, r5732, r5735; +} +{ +mul.f16x2 r5741, r4919, r5483; +} +{ +fma.rn.f16x2 r5744, r5063, r5482, r5741; +} +{ +mul.f16x2 r5748, r5241, r5490; +} +{ +mul.f16x2 r5751, r5385, r5491; +} +{ +sub.f16x2 r5754, r5748, r5751; +} +{ +mul.f16x2 r5757, r5241, r5491; +} +{ +fma.rn.f16x2 r5760, r5385, r5490, r5757; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5765, {low, high}; +} +{ +neg.f16x2 r5766, r5765; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r5768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r5769, {low, high}; +} +{ +neg.f16x2 r5770, r5769; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r5772, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r5773, {low, high}; +} +{ +add.f16x2 r5774, r4191, r5157; +} +{ +add.f16x2 r5777, r3869, r5774; +} +{ +add.f16x2 r5780, r4513, r4835; +} +{ +add.f16x2 r5783, r5777, r5780; +} +st.local.u32 [rd3], r5783; +{ +add.f16x2 r5786, r4203, r5169; +} +{ +add.f16x2 r5789, r3881, r5786; +} +{ +add.f16x2 r5792, r4525, r4847; +} +{ +add.f16x2 r5795, r5789, r5792; +} +st.local.u32 [rd4], r5795; +{ +add.f16x2 r5798, r4191, r5157; +} +{ +mul.f16x2 r5801, r5798, r5764; +} +{ +add.f16x2 r5804, r3869, r5801; +} +{ +add.f16x2 r5807, r4513, r4835; +} +{ +mul.f16x2 r5810, r5807, r5768; +} +{ +add.f16x2 r5813, r5804, r5810; +} +{ +sub.f16x2 r5816, r4203, r5169; +} +{ +mul.f16x2 r5819, r5816, r5766; +} +{ +sub.f16x2 r5822, r4525, r4847; +} +{ +mul.f16x2 r5825, r5822, r5770; +} +{ +add.f16x2 r5828, r5819, r5825; +} +{ +sub.f16x2 r5831, r5813, r5828; +} +st.local.u32 [rd4+36], r5831; +{ +add.f16x2 r5834, r4191, r5157; +} +{ +mul.f16x2 r5837, r5834, r5764; +} +{ +add.f16x2 r5840, r3869, r5837; +} +{ +add.f16x2 r5843, r4513, r4835; +} +{ +mul.f16x2 r5846, r5843, r5768; +} +{ +add.f16x2 r5849, r5840, r5846; +} +{ +sub.f16x2 r5852, r4203, r5169; +} +{ +mul.f16x2 r5855, r5852, r5766; +} +{ +sub.f16x2 r5858, r4525, r4847; +} +{ +mul.f16x2 r5861, r5858, r5770; +} +{ +add.f16x2 r5864, r5855, r5861; +} +{ +add.f16x2 r5867, r5849, r5864; +} +st.local.u32 [rd4+156], r5867; +{ +add.f16x2 r5870, r4191, r5157; +} +{ +mul.f16x2 r5873, r5870, r5768; +} +{ +add.f16x2 r5876, r3869, r5873; +} +{ +add.f16x2 r5879, r4513, r4835; +} +{ +mul.f16x2 r5882, r5879, r5772; +} +{ +add.f16x2 r5885, r5876, r5882; +} +{ +sub.f16x2 r5888, r4203, r5169; +} +{ +mul.f16x2 r5891, r5888, r5770; +} +{ +sub.f16x2 r5894, r4525, r4847; +} +{ +mul.f16x2 r5897, r5894, r5773; +} +{ +add.f16x2 r5900, r5891, r5897; +} +{ +sub.f16x2 r5903, r5885, r5900; +} +st.local.u32 [rd4+76], r5903; +{ +add.f16x2 r5906, r4191, r5157; +} +{ +mul.f16x2 r5909, r5906, 
r5768; +} +{ +add.f16x2 r5912, r3869, r5909; +} +{ +add.f16x2 r5915, r4513, r4835; +} +{ +mul.f16x2 r5918, r5915, r5772; +} +{ +add.f16x2 r5921, r5912, r5918; +} +{ +sub.f16x2 r5924, r4203, r5169; +} +{ +mul.f16x2 r5927, r5924, r5770; +} +{ +sub.f16x2 r5930, r4525, r4847; +} +{ +mul.f16x2 r5933, r5930, r5773; +} +{ +add.f16x2 r5936, r5927, r5933; +} +{ +add.f16x2 r5939, r5921, r5936; +} +st.local.u32 [rd4+116], r5939; +{ +add.f16x2 r5942, r4203, r5169; +} +{ +mul.f16x2 r5945, r5942, r5764; +} +{ +add.f16x2 r5948, r3881, r5945; +} +{ +add.f16x2 r5951, r4525, r4847; +} +{ +mul.f16x2 r5954, r5951, r5768; +} +{ +add.f16x2 r5957, r5948, r5954; +} +{ +sub.f16x2 r5960, r4191, r5157; +} +{ +mul.f16x2 r5963, r5960, r5766; +} +{ +sub.f16x2 r5966, r4513, r4835; +} +{ +mul.f16x2 r5969, r5966, r5770; +} +{ +add.f16x2 r5972, r5963, r5969; +} +{ +add.f16x2 r5975, r5957, r5972; +} +st.local.u32 [rd4+40], r5975; +{ +add.f16x2 r5978, r4203, r5169; +} +{ +mul.f16x2 r5981, r5978, r5764; +} +{ +add.f16x2 r5984, r3881, r5981; +} +{ +add.f16x2 r5987, r4525, r4847; +} +{ +mul.f16x2 r5990, r5987, r5768; +} +{ +add.f16x2 r5993, r5984, r5990; +} +{ +sub.f16x2 r5996, r4191, r5157; +} +{ +mul.f16x2 r5999, r5996, r5766; +} +{ +sub.f16x2 r6002, r4513, r4835; +} +{ +mul.f16x2 r6005, r6002, r5770; +} +{ +add.f16x2 r6008, r5999, r6005; +} +{ +sub.f16x2 r6011, r5993, r6008; +} +st.local.u32 [rd4+160], r6011; +{ +add.f16x2 r6014, r4203, r5169; +} +{ +mul.f16x2 r6017, r6014, r5768; +} +{ +add.f16x2 r6020, r3881, r6017; +} +{ +add.f16x2 r6023, r4525, r4847; +} +{ +mul.f16x2 r6026, r6023, r5772; +} +{ +add.f16x2 r6029, r6020, r6026; +} +{ +sub.f16x2 r6032, r4191, r5157; +} +{ +mul.f16x2 r6035, r6032, r5770; +} +{ +sub.f16x2 r6038, r4513, r4835; +} +{ +mul.f16x2 r6041, r6038, r5773; +} +{ +add.f16x2 r6044, r6035, r6041; +} +{ +add.f16x2 r6047, r6029, r6044; +} +st.local.u32 [rd4+80], r6047; +{ +add.f16x2 r6050, r4203, r5169; +} +{ +mul.f16x2 r6053, r6050, r5768; +} +{ +add.f16x2 r6056, r3881, r6053; +} 
+{ +add.f16x2 r6059, r4525, r4847; +} +{ +mul.f16x2 r6062, r6059, r5772; +} +{ +add.f16x2 r6065, r6056, r6062; +} +{ +sub.f16x2 r6068, r4191, r5157; +} +{ +mul.f16x2 r6071, r6068, r5770; +} +{ +sub.f16x2 r6074, r4513, r4835; +} +{ +mul.f16x2 r6077, r6074, r5773; +} +{ +add.f16x2 r6080, r6071, r6077; +} +{ +sub.f16x2 r6083, r6065, r6080; +} +st.local.u32 [rd4+120], r6083; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6086, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6087, {low, high}; +} +{ +neg.f16x2 r6088, r6087; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6090, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6091, {low, high}; +} +{ +neg.f16x2 r6092, r6091; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6095, {low, high}; +} +{ +add.f16x2 r6096, r5514, r5562; +} +{ +add.f16x2 r6099, r3917, r6096; +} +{ +add.f16x2 r6102, r5530, r5546; +} +{ +add.f16x2 r11311, r6099, r6102; +} +st.local.u32 [rd4+4], r11311; +{ +add.f16x2 r6108, r5520, r5568; +} +{ +add.f16x2 r6111, r4061, r6108; +} +{ +add.f16x2 r6114, r5536, r5552; +} +{ +add.f16x2 r6117, r6111, r6114; +} +st.local.u32 [rd4+8], r6117; +{ +add.f16x2 r6120, r5514, r5562; +} +{ +mul.f16x2 r6123, r6120, r6086; +} +{ +add.f16x2 r6126, r3917, r6123; +} +{ +add.f16x2 r6129, r5530, r5546; +} +{ +mul.f16x2 r6132, r6129, r6090; +} +{ +add.f16x2 r6135, r6126, r6132; +} +{ +sub.f16x2 r6138, r5520, r5568; +} +{ +mul.f16x2 r6141, r6138, r6088; +} +{ +sub.f16x2 r6144, r5536, r5552; +} +{ +mul.f16x2 r6147, r6144, r6092; +} +{ +add.f16x2 r6150, r6141, r6147; +} +{ +sub.f16x2 r6153, r6135, r6150; +} +st.local.u32 [rd4+44], r6153; +{ +add.f16x2 r6156, r5514, r5562; +} 
+{ +mul.f16x2 r6159, r6156, r6086; +} +{ +add.f16x2 r6162, r3917, r6159; +} +{ +add.f16x2 r6165, r5530, r5546; +} +{ +mul.f16x2 r6168, r6165, r6090; +} +{ +add.f16x2 r6171, r6162, r6168; +} +{ +sub.f16x2 r6174, r5520, r5568; +} +{ +mul.f16x2 r6177, r6174, r6088; +} +{ +sub.f16x2 r6180, r5536, r5552; +} +{ +mul.f16x2 r6183, r6180, r6092; +} +{ +add.f16x2 r6186, r6177, r6183; +} +{ +add.f16x2 r6189, r6171, r6186; +} +st.local.u32 [rd4+164], r6189; +{ +add.f16x2 r6192, r5514, r5562; +} +{ +mul.f16x2 r6195, r6192, r6090; +} +{ +add.f16x2 r6198, r3917, r6195; +} +{ +add.f16x2 r6201, r5530, r5546; +} +{ +mul.f16x2 r6204, r6201, r6094; +} +{ +add.f16x2 r6207, r6198, r6204; +} +{ +sub.f16x2 r6210, r5520, r5568; +} +{ +mul.f16x2 r6213, r6210, r6092; +} +{ +sub.f16x2 r6216, r5536, r5552; +} +{ +mul.f16x2 r6219, r6216, r6095; +} +{ +add.f16x2 r6222, r6213, r6219; +} +{ +sub.f16x2 r6225, r6207, r6222; +} +st.local.u32 [rd4+84], r6225; +{ +add.f16x2 r6228, r5514, r5562; +} +{ +mul.f16x2 r6231, r6228, r6090; +} +{ +add.f16x2 r6234, r3917, r6231; +} +{ +add.f16x2 r6237, r5530, r5546; +} +{ +mul.f16x2 r6240, r6237, r6094; +} +{ +add.f16x2 r6243, r6234, r6240; +} +{ +sub.f16x2 r6246, r5520, r5568; +} +{ +mul.f16x2 r6249, r6246, r6092; +} +{ +sub.f16x2 r6252, r5536, r5552; +} +{ +mul.f16x2 r6255, r6252, r6095; +} +{ +add.f16x2 r6258, r6249, r6255; +} +{ +add.f16x2 r6261, r6243, r6258; +} +st.local.u32 [rd4+124], r6261; +{ +add.f16x2 r6264, r5520, r5568; +} +{ +mul.f16x2 r6267, r6264, r6086; +} +{ +add.f16x2 r6270, r4061, r6267; +} +{ +add.f16x2 r6273, r5536, r5552; +} +{ +mul.f16x2 r6276, r6273, r6090; +} +{ +add.f16x2 r6279, r6270, r6276; +} +{ +sub.f16x2 r6282, r5514, r5562; +} +{ +mul.f16x2 r6285, r6282, r6088; +} +{ +sub.f16x2 r6288, r5530, r5546; +} +{ +mul.f16x2 r6291, r6288, r6092; +} +{ +add.f16x2 r6294, r6285, r6291; +} +{ +add.f16x2 r6297, r6279, r6294; +} +st.local.u32 [rd4+48], r6297; +{ +add.f16x2 r6300, r5520, r5568; +} +{ +mul.f16x2 r6303, r6300, r6086; +} +{ 
+add.f16x2 r6306, r4061, r6303; +} +{ +add.f16x2 r6309, r5536, r5552; +} +{ +mul.f16x2 r6312, r6309, r6090; +} +{ +add.f16x2 r6315, r6306, r6312; +} +{ +sub.f16x2 r6318, r5514, r5562; +} +{ +mul.f16x2 r6321, r6318, r6088; +} +{ +sub.f16x2 r6324, r5530, r5546; +} +{ +mul.f16x2 r6327, r6324, r6092; +} +{ +add.f16x2 r6330, r6321, r6327; +} +{ +sub.f16x2 r6333, r6315, r6330; +} +st.local.u32 [rd4+168], r6333; +{ +add.f16x2 r6336, r5520, r5568; +} +{ +mul.f16x2 r6339, r6336, r6090; +} +{ +add.f16x2 r6342, r4061, r6339; +} +{ +add.f16x2 r6345, r5536, r5552; +} +{ +mul.f16x2 r6348, r6345, r6094; +} +{ +add.f16x2 r6351, r6342, r6348; +} +{ +sub.f16x2 r6354, r5514, r5562; +} +{ +mul.f16x2 r6357, r6354, r6092; +} +{ +sub.f16x2 r6360, r5530, r5546; +} +{ +mul.f16x2 r6363, r6360, r6095; +} +{ +add.f16x2 r6366, r6357, r6363; +} +{ +add.f16x2 r6369, r6351, r6366; +} +st.local.u32 [rd4+88], r6369; +{ +add.f16x2 r6372, r5520, r5568; +} +{ +mul.f16x2 r6375, r6372, r6090; +} +{ +add.f16x2 r6378, r4061, r6375; +} +{ +add.f16x2 r6381, r5536, r5552; +} +{ +mul.f16x2 r6384, r6381, r6094; +} +{ +add.f16x2 r6387, r6378, r6384; +} +{ +sub.f16x2 r6390, r5514, r5562; +} +{ +mul.f16x2 r6393, r6390, r6092; +} +{ +sub.f16x2 r6396, r5530, r5546; +} +{ +mul.f16x2 r6399, r6396, r6095; +} +{ +add.f16x2 r6402, r6393, r6399; +} +{ +sub.f16x2 r6405, r6387, r6402; +} +st.local.u32 [rd4+128], r6405; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6408, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6409, {low, high}; +} +{ +neg.f16x2 r6410, r6409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6412, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6413, {low, high}; +} +{ +neg.f16x2 r6414, r6413; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6416, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6417, {low, high}; +} +{ +add.f16x2 r6418, r5578, r5626; +} +{ +add.f16x2 r6421, r3989, r6418; +} +{ +add.f16x2 r6424, r5594, r5610; +} +{ +add.f16x2 r6427, r6421, r6424; +} +st.local.u32 [rd4+12], r6427; +{ +add.f16x2 r6430, r5584, r5632; +} +{ +add.f16x2 r6433, r4133, r6430; +} +{ +add.f16x2 r6436, r5600, r5616; +} +{ +add.f16x2 r6439, r6433, r6436; +} +st.local.u32 [rd4+16], r6439; +{ +add.f16x2 r6442, r5578, r5626; +} +{ +mul.f16x2 r6445, r6442, r6408; +} +{ +add.f16x2 r6448, r3989, r6445; +} +{ +add.f16x2 r6451, r5594, r5610; +} +{ +mul.f16x2 r6454, r6451, r6412; +} +{ +add.f16x2 r6457, r6448, r6454; +} +{ +sub.f16x2 r6460, r5584, r5632; +} +{ +mul.f16x2 r6463, r6460, r6410; +} +{ +sub.f16x2 r6466, r5600, r5616; +} +{ +mul.f16x2 r6469, r6466, r6414; +} +{ +add.f16x2 r6472, r6463, r6469; +} +{ +sub.f16x2 r6475, r6457, r6472; +} +st.local.u32 [rd4+52], r6475; +{ +add.f16x2 r6478, r5578, r5626; +} +{ +mul.f16x2 r6481, r6478, r6408; +} +{ +add.f16x2 r6484, r3989, r6481; +} +{ +add.f16x2 r6487, r5594, r5610; +} +{ +mul.f16x2 r6490, r6487, r6412; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +sub.f16x2 r6496, r5584, r5632; +} +{ +mul.f16x2 r6499, r6496, r6410; +} +{ +sub.f16x2 r6502, r5600, r5616; +} +{ +mul.f16x2 r6505, r6502, r6414; +} +{ +add.f16x2 r6508, r6499, r6505; +} +{ +add.f16x2 r6511, r6493, r6508; +} +st.local.u32 [rd4+172], r6511; +{ +add.f16x2 r6514, r5578, r5626; +} +{ +mul.f16x2 r6517, r6514, r6412; +} +{ +add.f16x2 r6520, r3989, r6517; +} +{ +add.f16x2 r6523, r5594, r5610; +} +{ +mul.f16x2 r6526, r6523, r6416; +} +{ +add.f16x2 r6529, r6520, r6526; +} +{ +sub.f16x2 r6532, r5584, r5632; +} +{ +mul.f16x2 r6535, r6532, r6414; +} +{ +sub.f16x2 r6538, r5600, r5616; +} +{ +mul.f16x2 r6541, r6538, r6417; +} +{ +add.f16x2 r6544, r6535, r6541; +} +{ +sub.f16x2 r6547, r6529, r6544; +} +st.local.u32 [rd4+92], r6547; +{ +add.f16x2 r6550, r5578, r5626; +} +{ 
+mul.f16x2 r6553, r6550, r6412; +} +{ +add.f16x2 r6556, r3989, r6553; +} +{ +add.f16x2 r6559, r5594, r5610; +} +{ +mul.f16x2 r6562, r6559, r6416; +} +{ +add.f16x2 r6565, r6556, r6562; +} +{ +sub.f16x2 r6568, r5584, r5632; +} +{ +mul.f16x2 r6571, r6568, r6414; +} +{ +sub.f16x2 r6574, r5600, r5616; +} +{ +mul.f16x2 r6577, r6574, r6417; +} +{ +add.f16x2 r6580, r6571, r6577; +} +{ +add.f16x2 r6583, r6565, r6580; +} +st.local.u32 [rd4+132], r6583; +{ +add.f16x2 r6586, r5584, r5632; +} +{ +mul.f16x2 r6589, r6586, r6408; +} +{ +add.f16x2 r6592, r4133, r6589; +} +{ +add.f16x2 r6595, r5600, r5616; +} +{ +mul.f16x2 r6598, r6595, r6412; +} +{ +add.f16x2 r6601, r6592, r6598; +} +{ +sub.f16x2 r6604, r5578, r5626; +} +{ +mul.f16x2 r6607, r6604, r6410; +} +{ +sub.f16x2 r6610, r5594, r5610; +} +{ +mul.f16x2 r6613, r6610, r6414; +} +{ +add.f16x2 r6616, r6607, r6613; +} +{ +add.f16x2 r6619, r6601, r6616; +} +st.local.u32 [rd4+56], r6619; +{ +add.f16x2 r6622, r5584, r5632; +} +{ +mul.f16x2 r6625, r6622, r6408; +} +{ +add.f16x2 r6628, r4133, r6625; +} +{ +add.f16x2 r6631, r5600, r5616; +} +{ +mul.f16x2 r6634, r6631, r6412; +} +{ +add.f16x2 r6637, r6628, r6634; +} +{ +sub.f16x2 r6640, r5578, r5626; +} +{ +mul.f16x2 r6643, r6640, r6410; +} +{ +sub.f16x2 r6646, r5594, r5610; +} +{ +mul.f16x2 r6649, r6646, r6414; +} +{ +add.f16x2 r6652, r6643, r6649; +} +{ +sub.f16x2 r6655, r6637, r6652; +} +st.local.u32 [rd4+176], r6655; +{ +add.f16x2 r6658, r5584, r5632; +} +{ +mul.f16x2 r6661, r6658, r6412; +} +{ +add.f16x2 r6664, r4133, r6661; +} +{ +add.f16x2 r6667, r5600, r5616; +} +{ +mul.f16x2 r6670, r6667, r6416; +} +{ +add.f16x2 r6673, r6664, r6670; +} +{ +sub.f16x2 r6676, r5578, r5626; +} +{ +mul.f16x2 r6679, r6676, r6414; +} +{ +sub.f16x2 r6682, r5594, r5610; +} +{ +mul.f16x2 r6685, r6682, r6417; +} +{ +add.f16x2 r6688, r6679, r6685; +} +{ +add.f16x2 r6691, r6673, r6688; +} +st.local.u32 [rd4+96], r6691; +{ +add.f16x2 r6694, r5584, r5632; +} +{ +mul.f16x2 r6697, r6694, r6412; +} +{ +add.f16x2 
r6700, r4133, r6697; +} +{ +add.f16x2 r6703, r5600, r5616; +} +{ +mul.f16x2 r6706, r6703, r6416; +} +{ +add.f16x2 r6709, r6700, r6706; +} +{ +sub.f16x2 r6712, r5578, r5626; +} +{ +mul.f16x2 r6715, r6712, r6414; +} +{ +sub.f16x2 r6718, r5594, r5610; +} +{ +mul.f16x2 r6721, r6718, r6417; +} +{ +add.f16x2 r6724, r6715, r6721; +} +{ +sub.f16x2 r6727, r6709, r6724; +} +st.local.u32 [rd4+136], r6727; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6730, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6731, {low, high}; +} +{ +neg.f16x2 r6732, r6731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r6735, {low, high}; +} +{ +neg.f16x2 r6736, r6735; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r6739, {low, high}; +} +{ +add.f16x2 r6740, r5642, r5690; +} +{ +add.f16x2 r6743, r4025, r6740; +} +{ +add.f16x2 r6746, r5658, r5674; +} +{ +add.f16x2 r6749, r6743, r6746; +} +st.local.u32 [rd4+20], r6749; +{ +add.f16x2 r6752, r5648, r5696; +} +{ +add.f16x2 r6755, r4169, r6752; +} +{ +add.f16x2 r6758, r5664, r5680; +} +{ +add.f16x2 r6761, r6755, r6758; +} +st.local.u32 [rd4+24], r6761; +{ +add.f16x2 r6764, r5642, r5690; +} +{ +mul.f16x2 r6767, r6764, r6730; +} +{ +add.f16x2 r6770, r4025, r6767; +} +{ +add.f16x2 r6773, r5658, r5674; +} +{ +mul.f16x2 r6776, r6773, r6734; +} +{ +add.f16x2 r6779, r6770, r6776; +} +{ +sub.f16x2 r6782, r5648, r5696; +} +{ +mul.f16x2 r6785, r6782, r6732; +} +{ +sub.f16x2 r6788, r5664, r5680; +} +{ +mul.f16x2 r6791, r6788, r6736; +} +{ +add.f16x2 r6794, r6785, r6791; +} +{ +sub.f16x2 r6797, r6779, r6794; +} +st.local.u32 [rd4+60], r6797; +{ +add.f16x2 
r6800, r5642, r5690; +} +{ +mul.f16x2 r6803, r6800, r6730; +} +{ +add.f16x2 r6806, r4025, r6803; +} +{ +add.f16x2 r6809, r5658, r5674; +} +{ +mul.f16x2 r6812, r6809, r6734; +} +{ +add.f16x2 r6815, r6806, r6812; +} +{ +sub.f16x2 r6818, r5648, r5696; +} +{ +mul.f16x2 r6821, r6818, r6732; +} +{ +sub.f16x2 r6824, r5664, r5680; +} +{ +mul.f16x2 r6827, r6824, r6736; +} +{ +add.f16x2 r6830, r6821, r6827; +} +{ +add.f16x2 r6833, r6815, r6830; +} +st.local.u32 [rd4+180], r6833; +{ +add.f16x2 r6836, r5642, r5690; +} +{ +mul.f16x2 r6839, r6836, r6734; +} +{ +add.f16x2 r6842, r4025, r6839; +} +{ +add.f16x2 r6845, r5658, r5674; +} +{ +mul.f16x2 r6848, r6845, r6738; +} +{ +add.f16x2 r6851, r6842, r6848; +} +{ +sub.f16x2 r6854, r5648, r5696; +} +{ +mul.f16x2 r6857, r6854, r6736; +} +{ +sub.f16x2 r6860, r5664, r5680; +} +{ +mul.f16x2 r6863, r6860, r6739; +} +{ +add.f16x2 r6866, r6857, r6863; +} +{ +sub.f16x2 r6869, r6851, r6866; +} +st.local.u32 [rd4+100], r6869; +{ +add.f16x2 r6872, r5642, r5690; +} +{ +mul.f16x2 r6875, r6872, r6734; +} +{ +add.f16x2 r6878, r4025, r6875; +} +{ +add.f16x2 r6881, r5658, r5674; +} +{ +mul.f16x2 r6884, r6881, r6738; +} +{ +add.f16x2 r6887, r6878, r6884; +} +{ +sub.f16x2 r6890, r5648, r5696; +} +{ +mul.f16x2 r6893, r6890, r6736; +} +{ +sub.f16x2 r6896, r5664, r5680; +} +{ +mul.f16x2 r6899, r6896, r6739; +} +{ +add.f16x2 r6902, r6893, r6899; +} +{ +add.f16x2 r6905, r6887, r6902; +} +st.local.u32 [rd4+140], r6905; +{ +add.f16x2 r6908, r5648, r5696; +} +{ +mul.f16x2 r6911, r6908, r6730; +} +{ +add.f16x2 r6914, r4169, r6911; +} +{ +add.f16x2 r6917, r5664, r5680; +} +{ +mul.f16x2 r6920, r6917, r6734; +} +{ +add.f16x2 r6923, r6914, r6920; +} +{ +sub.f16x2 r6926, r5642, r5690; +} +{ +mul.f16x2 r6929, r6926, r6732; +} +{ +sub.f16x2 r6932, r5658, r5674; +} +{ +mul.f16x2 r6935, r6932, r6736; +} +{ +add.f16x2 r6938, r6929, r6935; +} +{ +add.f16x2 r6941, r6923, r6938; +} +st.local.u32 [rd4+64], r6941; +{ +add.f16x2 r6944, r5648, r5696; +} +{ +mul.f16x2 r6947, 
r6944, r6730; +} +{ +add.f16x2 r6950, r4169, r6947; +} +{ +add.f16x2 r6953, r5664, r5680; +} +{ +mul.f16x2 r6956, r6953, r6734; +} +{ +add.f16x2 r6959, r6950, r6956; +} +{ +sub.f16x2 r6962, r5642, r5690; +} +{ +mul.f16x2 r6965, r6962, r6732; +} +{ +sub.f16x2 r6968, r5658, r5674; +} +{ +mul.f16x2 r6971, r6968, r6736; +} +{ +add.f16x2 r6974, r6965, r6971; +} +{ +sub.f16x2 r6977, r6959, r6974; +} +st.local.u32 [rd4+184], r6977; +{ +add.f16x2 r6980, r5648, r5696; +} +{ +mul.f16x2 r6983, r6980, r6734; +} +{ +add.f16x2 r6986, r4169, r6983; +} +{ +add.f16x2 r6989, r5664, r5680; +} +{ +mul.f16x2 r6992, r6989, r6738; +} +{ +add.f16x2 r6995, r6986, r6992; +} +{ +sub.f16x2 r6998, r5642, r5690; +} +{ +mul.f16x2 r7001, r6998, r6736; +} +{ +sub.f16x2 r7004, r5658, r5674; +} +{ +mul.f16x2 r7007, r7004, r6739; +} +{ +add.f16x2 r7010, r7001, r7007; +} +{ +add.f16x2 r7013, r6995, r7010; +} +st.local.u32 [rd4+104], r7013; +{ +add.f16x2 r7016, r5648, r5696; +} +{ +mul.f16x2 r7019, r7016, r6734; +} +{ +add.f16x2 r7022, r4169, r7019; +} +{ +add.f16x2 r7025, r5664, r5680; +} +{ +mul.f16x2 r7028, r7025, r6738; +} +{ +add.f16x2 r7031, r7022, r7028; +} +{ +sub.f16x2 r7034, r5642, r5690; +} +{ +mul.f16x2 r7037, r7034, r6736; +} +{ +sub.f16x2 r7040, r5658, r5674; +} +{ +mul.f16x2 r7043, r7040, r6739; +} +{ +add.f16x2 r7046, r7037, r7043; +} +{ +sub.f16x2 r7049, r7031, r7046; +} +st.local.u32 [rd4+144], r7049; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7053, {low, high}; +} +{ +neg.f16x2 r7054, r7053; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7057, {low, high}; +} +{ +neg.f16x2 r7058, r7057; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; 
+mov.b32 r7060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7061, {low, high}; +} +{ +add.f16x2 r7062, r5706, r5754; +} +{ +add.f16x2 r7065, r3953, r7062; +} +{ +add.f16x2 r7068, r5722, r5738; +} +{ +add.f16x2 r7071, r7065, r7068; +} +st.local.u32 [rd4+28], r7071; +{ +add.f16x2 r7074, r5712, r5760; +} +{ +add.f16x2 r7077, r4097, r7074; +} +{ +add.f16x2 r7080, r5728, r5744; +} +{ +add.f16x2 r7083, r7077, r7080; +} +st.local.u32 [rd4+32], r7083; +{ +add.f16x2 r7086, r5706, r5754; +} +{ +mul.f16x2 r7089, r7086, r7052; +} +{ +add.f16x2 r7092, r3953, r7089; +} +{ +add.f16x2 r7095, r5722, r5738; +} +{ +mul.f16x2 r7098, r7095, r7056; +} +{ +add.f16x2 r7101, r7092, r7098; +} +{ +sub.f16x2 r7104, r5712, r5760; +} +{ +mul.f16x2 r7107, r7104, r7054; +} +{ +sub.f16x2 r7110, r5728, r5744; +} +{ +mul.f16x2 r7113, r7110, r7058; +} +{ +add.f16x2 r7116, r7107, r7113; +} +{ +sub.f16x2 r7119, r7101, r7116; +} +st.local.u32 [rd4+68], r7119; +{ +add.f16x2 r7122, r5706, r5754; +} +{ +mul.f16x2 r7125, r7122, r7052; +} +{ +add.f16x2 r7128, r3953, r7125; +} +{ +add.f16x2 r7131, r5722, r5738; +} +{ +mul.f16x2 r7134, r7131, r7056; +} +{ +add.f16x2 r7137, r7128, r7134; +} +{ +sub.f16x2 r7140, r5712, r5760; +} +{ +mul.f16x2 r7143, r7140, r7054; +} +{ +sub.f16x2 r7146, r5728, r5744; +} +{ +mul.f16x2 r7149, r7146, r7058; +} +{ +add.f16x2 r7152, r7143, r7149; +} +{ +add.f16x2 r7155, r7137, r7152; +} +st.local.u32 [rd4+188], r7155; +{ +add.f16x2 r7158, r5706, r5754; +} +{ +mul.f16x2 r7161, r7158, r7056; +} +{ +add.f16x2 r7164, r3953, r7161; +} +{ +add.f16x2 r7167, r5722, r5738; +} +{ +mul.f16x2 r7170, r7167, r7060; +} +{ +add.f16x2 r7173, r7164, r7170; +} +{ +sub.f16x2 r7176, r5712, r5760; +} +{ +mul.f16x2 r7179, r7176, r7058; +} +{ +sub.f16x2 r7182, r5728, r5744; +} +{ +mul.f16x2 r7185, r7182, r7061; +} +{ +add.f16x2 r7188, r7179, r7185; +} +{ +sub.f16x2 r7191, r7173, r7188; +} +st.local.u32 [rd4+108], r7191; +{ +add.f16x2 r7194, 
r5706, r5754; +} +{ +mul.f16x2 r7197, r7194, r7056; +} +{ +add.f16x2 r7200, r3953, r7197; +} +{ +add.f16x2 r7203, r5722, r5738; +} +{ +mul.f16x2 r7206, r7203, r7060; +} +{ +add.f16x2 r7209, r7200, r7206; +} +{ +sub.f16x2 r7212, r5712, r5760; +} +{ +mul.f16x2 r7215, r7212, r7058; +} +{ +sub.f16x2 r7218, r5728, r5744; +} +{ +mul.f16x2 r7221, r7218, r7061; +} +{ +add.f16x2 r7224, r7215, r7221; +} +{ +add.f16x2 r7227, r7209, r7224; +} +st.local.u32 [rd4+148], r7227; +{ +add.f16x2 r7230, r5712, r5760; +} +{ +mul.f16x2 r7233, r7230, r7052; +} +{ +add.f16x2 r7236, r4097, r7233; +} +{ +add.f16x2 r7239, r5728, r5744; +} +{ +mul.f16x2 r7242, r7239, r7056; +} +{ +add.f16x2 r7245, r7236, r7242; +} +{ +sub.f16x2 r7248, r5706, r5754; +} +{ +mul.f16x2 r7251, r7248, r7054; +} +{ +sub.f16x2 r7254, r5722, r5738; +} +{ +mul.f16x2 r7257, r7254, r7058; +} +{ +add.f16x2 r7260, r7251, r7257; +} +{ +add.f16x2 r7263, r7245, r7260; +} +st.local.u32 [rd4+72], r7263; +{ +add.f16x2 r7266, r5712, r5760; +} +{ +mul.f16x2 r7269, r7266, r7052; +} +{ +add.f16x2 r7272, r4097, r7269; +} +{ +add.f16x2 r7275, r5728, r5744; +} +{ +mul.f16x2 r7278, r7275, r7056; +} +{ +add.f16x2 r7281, r7272, r7278; +} +{ +sub.f16x2 r7284, r5706, r5754; +} +{ +mul.f16x2 r7287, r7284, r7054; +} +{ +sub.f16x2 r7290, r5722, r5738; +} +{ +mul.f16x2 r7293, r7290, r7058; +} +{ +add.f16x2 r7296, r7287, r7293; +} +{ +sub.f16x2 r7299, r7281, r7296; +} +st.local.u32 [rd4+192], r7299; +{ +add.f16x2 r7302, r5712, r5760; +} +{ +mul.f16x2 r7305, r7302, r7056; +} +{ +add.f16x2 r7308, r4097, r7305; +} +{ +add.f16x2 r7311, r5728, r5744; +} +{ +mul.f16x2 r7314, r7311, r7060; +} +{ +add.f16x2 r7317, r7308, r7314; +} +{ +sub.f16x2 r7320, r5706, r5754; +} +{ +mul.f16x2 r7323, r7320, r7058; +} +{ +sub.f16x2 r7326, r5722, r5738; +} +{ +mul.f16x2 r7329, r7326, r7061; +} +{ +add.f16x2 r7332, r7323, r7329; +} +{ +add.f16x2 r7335, r7317, r7332; +} +st.local.u32 [rd4+112], r7335; +{ +add.f16x2 r7338, r5712, r5760; +} +{ +mul.f16x2 r7341, r7338, 
r7056; +} +{ +add.f16x2 r7344, r4097, r7341; +} +{ +add.f16x2 r7347, r5728, r5744; +} +{ +mul.f16x2 r7350, r7347, r7060; +} +{ +add.f16x2 r7353, r7344, r7350; +} +{ +sub.f16x2 r7356, r5706, r5754; +} +{ +mul.f16x2 r7359, r7356, r7058; +} +{ +sub.f16x2 r7362, r5722, r5738; +} +{ +mul.f16x2 r7365, r7362, r7061; +} +{ +add.f16x2 r7368, r7359, r7365; +} +{ +sub.f16x2 r7371, r7353, r7368; +} +st.local.u32 [rd4+152], r7371; +mul.wide.u32 rd13, r3, 1374389535; +shr.u64 rd14, rd13, 35; +cvt.u32.u64 r15, rd14; +cvt.rn.f32.u32 f460, r15; +mul.f32 f461, f460, 0f3C24B5BE; +cos.approx.f32 f456, f461; +sin.approx.f32 f462, f461; +neg.f32 f457, f462; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f456; +cvt.rn.f16.f32 high, f457; +mov.b32 r11313, {low, high}; +} +mov.u32 r11312, 1; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11313; +mov.b32 r7452, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11313; +mov.b32 r7454, {high, high}; +} +bra.uni LBB1_4; +LBB1_5: +ld.local.u32 r11311, [rd7+60]; +LBB1_4: +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11313; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11313; +mov.b32 r7434, {high, high}; +} +mul.wide.u32 rd15, r11312, 8; +add.s64 rd16, rd3, rd15; +add.s64 rd7, rd16, 4; +ld.local.u32 r7437, [rd16+4]; +{ +mul.f16x2 r7436, r7437, r7434; +} +{ +fma.rn.f16x2 r7439, r11311, r7432, r7436; +} +st.local.u32 [rd16], r7439; +{ +mul.f16x2 r7443, r11311, r7434; +} +{ +neg.f16x2 r7446, r7443; +} +{ +fma.rn.f16x2 r7448, r7437, r7432, r7446; +} +st.local.u32 [rd16+4], r7448; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7456, {low, high}; +} +{ +mul.f16x2 r7457, r7454, r7456; +} +{ +mul.f16x2 r7460, r11313, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r11313; +mov.b32 r7463, {high, low}; +} +{ +fma.rn.f16x2 r7465, r7457, r7463, r7460; +} +ld.local.u32 r7481, [rd16+8]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7465; +mov.b32 r7469, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7465; +mov.b32 r7471, {high, high}; +} +ld.local.u32 r7486, [rd16+12]; +{ +mul.f16x2 r7473, r7486, r7471; +} +{ +fma.rn.f16x2 r7476, r7481, r7469, r7473; +} +st.local.u32 [rd16+8], r7476; +{ +mul.f16x2 r7480, r7481, r7471; +} +{ +neg.f16x2 r7483, r7480; +} +{ +fma.rn.f16x2 r7485, r7486, r7469, r7483; +} +st.local.u32 [rd16+12], r7485; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7493, {low, high}; +} +{ +mul.f16x2 r7494, r7454, r7493; +} +{ +mul.f16x2 r7497, r7465, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7465; +mov.b32 r7500, {high, low}; +} +{ +fma.rn.f16x2 r7502, r7494, r7500, r7497; +} +ld.local.u32 r7518, [rd16+16]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7502; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7502; +mov.b32 r7508, {high, high}; +} +ld.local.u32 r7523, [rd16+20]; +{ +mul.f16x2 r7510, r7523, r7508; +} +{ +fma.rn.f16x2 r7513, r7518, r7506, r7510; +} +st.local.u32 [rd16+16], r7513; +{ +mul.f16x2 r7517, r7518, r7508; +} +{ +neg.f16x2 r7520, r7517; +} +{ +fma.rn.f16x2 r7522, r7523, r7506, r7520; +} +st.local.u32 [rd16+20], r7522; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7530, {low, high}; +} +{ +mul.f16x2 r7531, r7454, r7530; +} +{ +mul.f16x2 r7534, r7502, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7502; +mov.b32 r7537, {high, low}; +} +{ +fma.rn.f16x2 r7539, r7531, r7537, r7534; +} +ld.local.u32 r7555, [rd16+24]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7539; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7539; +mov.b32 r7545, {high, high}; +} +ld.local.u32 r7560, [rd16+28]; +{ +mul.f16x2 r7547, r7560, r7545; +} +{ +fma.rn.f16x2 r7550, r7555, r7543, r7547; +} +st.local.u32 [rd16+24], r7550; +{ +mul.f16x2 r7554, r7555, r7545; +} +{ +neg.f16x2 r7557, r7554; +} +{ +fma.rn.f16x2 r7559, r7560, 
r7543, r7557; +} +st.local.u32 [rd16+28], r7559; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7567, {low, high}; +} +{ +mul.f16x2 r7568, r7454, r7567; +} +{ +mul.f16x2 r7571, r7539, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7539; +mov.b32 r7574, {high, low}; +} +{ +fma.rn.f16x2 r7576, r7568, r7574, r7571; +} +ld.local.u32 r7588, [rd16+32]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7576; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7576; +mov.b32 r7582, {high, high}; +} +ld.local.u32 r7597, [rd16+36]; +{ +mul.f16x2 r7584, r7597, r7582; +} +{ +fma.rn.f16x2 r7587, r7588, r7580, r7584; +} +st.local.u32 [rd16+32], r7587; +{ +mul.f16x2 r7591, r7588, r7582; +} +{ +neg.f16x2 r7594, r7591; +} +{ +fma.rn.f16x2 r7596, r7597, r7580, r7594; +} +st.local.u32 [rd16+36], r7596; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7604, {low, high}; +} +{ +mul.f16x2 r7605, r7454, r7604; +} +{ +mul.f16x2 r7608, r7576, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7576; +mov.b32 r7611, {high, low}; +} +{ +fma.rn.f16x2 r7613, r7605, r7611, r7608; +} +ld.local.u32 r7625, [rd16+40]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7613; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7613; +mov.b32 r7619, {high, high}; +} +ld.local.u32 r7622, [rd16+44]; +{ +mul.f16x2 r7621, r7622, r7619; +} +{ +fma.rn.f16x2 r7624, r7625, r7617, r7621; +} +st.local.u32 [rd16+40], r7624; +{ +mul.f16x2 r7628, r7625, r7619; +} +{ +neg.f16x2 r7631, r7628; +} +{ +fma.rn.f16x2 r7633, r7622, r7617, r7631; +} +st.local.u32 [rd16+44], r7633; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7641, {low, high}; +} +{ +mul.f16x2 r7642, r7454, r7641; +} +{ +mul.f16x2 r7645, r7613, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7613; +mov.b32 r7648, {high, low}; +} +{ +fma.rn.f16x2 r7650, 
r7642, r7648, r7645; +} +ld.local.u32 r7662, [rd16+48]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7650; +mov.b32 r7654, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7650; +mov.b32 r7656, {high, high}; +} +ld.local.u32 r7659, [rd16+52]; +{ +mul.f16x2 r7658, r7659, r7656; +} +{ +fma.rn.f16x2 r7661, r7662, r7654, r7658; +} +st.local.u32 [rd16+48], r7661; +{ +mul.f16x2 r7665, r7662, r7656; +} +{ +neg.f16x2 r7668, r7665; +} +{ +fma.rn.f16x2 r7670, r7659, r7654, r7668; +} +st.local.u32 [rd16+52], r7670; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7678, {low, high}; +} +{ +mul.f16x2 r7679, r7454, r7678; +} +{ +mul.f16x2 r7682, r7650, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7650; +mov.b32 r7685, {high, low}; +} +{ +fma.rn.f16x2 r7687, r7679, r7685, r7682; +} +ld.local.u32 r7699, [rd16+56]; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7687; +mov.b32 r7691, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7687; +mov.b32 r7693, {high, high}; +} +ld.local.u32 r7696, [rd16+60]; +{ +mul.f16x2 r7695, r7696, r7693; +} +{ +fma.rn.f16x2 r7698, r7699, r7691, r7695; +} +st.local.u32 [rd16+56], r7698; +{ +mul.f16x2 r7702, r7699, r7693; +} +{ +neg.f16x2 r7705, r7702; +} +{ +fma.rn.f16x2 r7707, r7696, r7691, r7705; +} +st.local.u32 [rd16+60], r7707; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f239; +mov.b32 r7715, {low, high}; +} +{ +mul.f16x2 r7716, r7454, r7715; +} +{ +mul.f16x2 r7719, r7687, r7452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7687; +mov.b32 r7722, {high, low}; +} +{ +fma.rn.f16x2 r11313, r7716, r7722, r7719; +} +add.s32 r11312, r11312, 8; +setp.eq.s32 p2, r11312, 25; +@p2 bra LBB1_6; +bra.uni LBB1_5; +LBB1_6: +mul.lo.s32 r11253, r15, 25; +sub.s32 r11254, r3, r11253; +shl.b32 r11255, r11254, 2; +add.s32 r11256, r12, r11255; +barrier.sync 0; +mad.lo.s32 r11257, r15, 2500, r11256; +ld.local.u32 r11258, [rd3]; +st.shared.u32 
[r11257], r11258; +ld.local.u32 r11259, [rd4+4]; +st.shared.u32 [r11257+100], r11259; +ld.local.u32 r11260, [rd4+12]; +st.shared.u32 [r11257+200], r11260; +ld.local.u32 r11261, [rd4+20]; +st.shared.u32 [r11257+300], r11261; +ld.local.u32 r11262, [rd4+28]; +st.shared.u32 [r11257+400], r11262; +ld.local.u32 r11263, [rd4+36]; +st.shared.u32 [r11257+500], r11263; +ld.local.u32 r11264, [rd4+44]; +st.shared.u32 [r11257+600], r11264; +ld.local.u32 r11265, [rd4+52]; +st.shared.u32 [r11257+700], r11265; +ld.local.u32 r11266, [rd4+60]; +st.shared.u32 [r11257+800], r11266; +ld.local.u32 r11267, [rd4+68]; +st.shared.u32 [r11257+900], r11267; +ld.local.u32 r11268, [rd4+76]; +st.shared.u32 [r11257+1000], r11268; +ld.local.u32 r11269, [rd4+84]; +st.shared.u32 [r11257+1100], r11269; +ld.local.u32 r11270, [rd4+92]; +st.shared.u32 [r11257+1200], r11270; +ld.local.u32 r11271, [rd4+100]; +st.shared.u32 [r11257+1300], r11271; +ld.local.u32 r11272, [rd4+108]; +st.shared.u32 [r11257+1400], r11272; +ld.local.u32 r11273, [rd4+116]; +st.shared.u32 [r11257+1500], r11273; +ld.local.u32 r11274, [rd4+124]; +st.shared.u32 [r11257+1600], r11274; +ld.local.u32 r11275, [rd4+132]; +st.shared.u32 [r11257+1700], r11275; +ld.local.u32 r11276, [rd4+140]; +st.shared.u32 [r11257+1800], r11276; +ld.local.u32 r11277, [rd4+148]; +st.shared.u32 [r11257+1900], r11277; +ld.local.u32 r11278, [rd4+156]; +st.shared.u32 [r11257+2000], r11278; +ld.local.u32 r11279, [rd4+164]; +st.shared.u32 [r11257+2100], r11279; +ld.local.u32 r11280, [rd4+172]; +st.shared.u32 [r11257+2200], r11280; +ld.local.u32 r11281, [rd4+180]; +st.shared.u32 [r11257+2300], r11281; +ld.local.u32 r11282, [rd4+188]; +st.shared.u32 [r11257+2400], r11282; +barrier.sync 0; +ld.shared.u32 r7742, [r13]; +ld.shared.u32 r8064, [r13+2500]; +ld.shared.u32 r8386, [r13+5000]; +ld.shared.u32 r8708, [r13+7500]; +ld.shared.u32 r9030, [r13+10000]; +ld.shared.u32 r7739, [r13+12500]; +ld.shared.u32 r8061, [r13+15000]; +ld.shared.u32 r8383, [r13+17500]; 
+ld.shared.u32 r8705, [r13+20000]; +ld.shared.u32 r9027, [r13+22500]; +ld.shared.u32 r7745, [r13+25000]; +ld.shared.u32 r8067, [r13+27500]; +ld.shared.u32 r8389, [r13+30000]; +ld.shared.u32 r8711, [r13+32500]; +ld.shared.u32 r9033, [r13+35000]; +ld.shared.u32 r7746, [r13+37500]; +ld.shared.u32 r8068, [r13+40000]; +ld.shared.u32 r8390, [r13+42500]; +ld.shared.u32 r8712, [r13+45000]; +ld.shared.u32 r9034, [r13+47500]; +ld.shared.u32 r7740, [r13+50000]; +ld.shared.u32 r8062, [r13+52500]; +ld.shared.u32 r8384, [r13+55000]; +ld.shared.u32 r8706, [r13+57500]; +ld.shared.u32 r9028, [r13+60000]; +barrier.sync 0; +ld.local.u32 r11283, [rd4]; +st.shared.u32 [r11257], r11283; +ld.local.u32 r11284, [rd4+8]; +st.shared.u32 [r11257+100], r11284; +ld.local.u32 r11285, [rd4+16]; +st.shared.u32 [r11257+200], r11285; +ld.local.u32 r11286, [rd4+24]; +st.shared.u32 [r11257+300], r11286; +ld.local.u32 r11287, [rd4+32]; +st.shared.u32 [r11257+400], r11287; +ld.local.u32 r11288, [rd4+40]; +st.shared.u32 [r11257+500], r11288; +ld.local.u32 r11289, [rd4+48]; +st.shared.u32 [r11257+600], r11289; +ld.local.u32 r11290, [rd4+56]; +st.shared.u32 [r11257+700], r11290; +ld.local.u32 r11291, [rd4+64]; +st.shared.u32 [r11257+800], r11291; +ld.local.u32 r11292, [rd4+72]; +st.shared.u32 [r11257+900], r11292; +ld.local.u32 r11293, [rd4+80]; +st.shared.u32 [r11257+1000], r11293; +ld.local.u32 r11294, [rd4+88]; +st.shared.u32 [r11257+1100], r11294; +ld.local.u32 r11295, [rd4+96]; +st.shared.u32 [r11257+1200], r11295; +ld.local.u32 r11296, [rd4+104]; +st.shared.u32 [r11257+1300], r11296; +ld.local.u32 r11297, [rd4+112]; +st.shared.u32 [r11257+1400], r11297; +ld.local.u32 r11298, [rd4+120]; +st.shared.u32 [r11257+1500], r11298; +ld.local.u32 r11299, [rd4+128]; +st.shared.u32 [r11257+1600], r11299; +ld.local.u32 r11300, [rd4+136]; +st.shared.u32 [r11257+1700], r11300; +ld.local.u32 r11301, [rd4+144]; +st.shared.u32 [r11257+1800], r11301; +ld.local.u32 r11302, [rd4+152]; +st.shared.u32 [r11257+1900], 
r11302; +ld.local.u32 r11303, [rd4+160]; +st.shared.u32 [r11257+2000], r11303; +ld.local.u32 r11304, [rd4+168]; +st.shared.u32 [r11257+2100], r11304; +ld.local.u32 r11305, [rd4+176]; +st.shared.u32 [r11257+2200], r11305; +ld.local.u32 r11306, [rd4+184]; +st.shared.u32 [r11257+2300], r11306; +ld.local.u32 r11307, [rd4+192]; +st.shared.u32 [r11257+2400], r11307; +barrier.sync 0; +ld.shared.u32 r7754, [r13]; +ld.shared.u32 r8076, [r13+2500]; +ld.shared.u32 r8398, [r13+5000]; +ld.shared.u32 r8720, [r13+7500]; +ld.shared.u32 r9042, [r13+10000]; +ld.shared.u32 r7751, [r13+12500]; +ld.shared.u32 r8073, [r13+15000]; +ld.shared.u32 r8395, [r13+17500]; +ld.shared.u32 r8717, [r13+20000]; +ld.shared.u32 r9039, [r13+22500]; +ld.shared.u32 r7757, [r13+25000]; +ld.shared.u32 r8079, [r13+27500]; +ld.shared.u32 r8401, [r13+30000]; +ld.shared.u32 r8723, [r13+32500]; +ld.shared.u32 r9045, [r13+35000]; +ld.shared.u32 r7758, [r13+37500]; +ld.shared.u32 r8080, [r13+40000]; +ld.shared.u32 r8402, [r13+42500]; +ld.shared.u32 r8724, [r13+45000]; +ld.shared.u32 r9046, [r13+47500]; +ld.shared.u32 r7752, [r13+50000]; +ld.shared.u32 r8074, [r13+52500]; +ld.shared.u32 r8396, [r13+55000]; +ld.shared.u32 r8718, [r13+57500]; +ld.shared.u32 r9040, [r13+60000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7729, {low, high}; +} +{ +neg.f16x2 r7730, r7729; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r7732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r7733, {low, high}; +} +{ +neg.f16x2 r7734, r7733; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r7736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r7737, {low, high}; +} +{ +add.f16x2 
r7738, r7739, r7740; +} +{ +add.f16x2 r7741, r7742, r7738; +} +{ +add.f16x2 r7744, r7745, r7746; +} +{ +add.f16x2 r7747, r7741, r7744; +} +{ +add.f16x2 r7750, r7751, r7752; +} +{ +add.f16x2 r7753, r7754, r7750; +} +{ +add.f16x2 r7756, r7757, r7758; +} +{ +add.f16x2 r7759, r7753, r7756; +} +{ +add.f16x2 r7762, r7739, r7740; +} +{ +mul.f16x2 r7765, r7762, r7728; +} +{ +add.f16x2 r7768, r7742, r7765; +} +{ +add.f16x2 r7771, r7745, r7746; +} +{ +mul.f16x2 r7774, r7771, r7732; +} +{ +add.f16x2 r7777, r7768, r7774; +} +{ +sub.f16x2 r7780, r7751, r7752; +} +{ +mul.f16x2 r7783, r7780, r7730; +} +{ +sub.f16x2 r7786, r7757, r7758; +} +{ +mul.f16x2 r7789, r7786, r7734; +} +{ +add.f16x2 r7792, r7783, r7789; +} +{ +sub.f16x2 r7795, r7777, r7792; +} +{ +add.f16x2 r7798, r7739, r7740; +} +{ +mul.f16x2 r7801, r7798, r7728; +} +{ +add.f16x2 r7804, r7742, r7801; +} +{ +add.f16x2 r7807, r7745, r7746; +} +{ +mul.f16x2 r7810, r7807, r7732; +} +{ +add.f16x2 r7813, r7804, r7810; +} +{ +sub.f16x2 r7816, r7751, r7752; +} +{ +mul.f16x2 r7819, r7816, r7730; +} +{ +sub.f16x2 r7822, r7757, r7758; +} +{ +mul.f16x2 r7825, r7822, r7734; +} +{ +add.f16x2 r7828, r7819, r7825; +} +{ +add.f16x2 r7831, r7813, r7828; +} +{ +add.f16x2 r7834, r7739, r7740; +} +{ +mul.f16x2 r7837, r7834, r7732; +} +{ +add.f16x2 r7840, r7742, r7837; +} +{ +add.f16x2 r7843, r7745, r7746; +} +{ +mul.f16x2 r7846, r7843, r7736; +} +{ +add.f16x2 r7849, r7840, r7846; +} +{ +sub.f16x2 r7852, r7751, r7752; +} +{ +mul.f16x2 r7855, r7852, r7734; +} +{ +sub.f16x2 r7858, r7757, r7758; +} +{ +mul.f16x2 r7861, r7858, r7737; +} +{ +add.f16x2 r7864, r7855, r7861; +} +{ +sub.f16x2 r7867, r7849, r7864; +} +{ +add.f16x2 r7870, r7739, r7740; +} +{ +mul.f16x2 r7873, r7870, r7732; +} +{ +add.f16x2 r7876, r7742, r7873; +} +{ +add.f16x2 r7879, r7745, r7746; +} +{ +mul.f16x2 r7882, r7879, r7736; +} +{ +add.f16x2 r7885, r7876, r7882; +} +{ +sub.f16x2 r7888, r7751, r7752; +} +{ +mul.f16x2 r7891, r7888, r7734; +} +{ +sub.f16x2 r7894, r7757, r7758; +} 
+{ +mul.f16x2 r7897, r7894, r7737; +} +{ +add.f16x2 r7900, r7891, r7897; +} +{ +add.f16x2 r7903, r7885, r7900; +} +{ +add.f16x2 r7906, r7751, r7752; +} +{ +mul.f16x2 r7909, r7906, r7728; +} +{ +add.f16x2 r7912, r7754, r7909; +} +{ +add.f16x2 r7915, r7757, r7758; +} +{ +mul.f16x2 r7918, r7915, r7732; +} +{ +add.f16x2 r7921, r7912, r7918; +} +{ +sub.f16x2 r7924, r7739, r7740; +} +{ +mul.f16x2 r7927, r7924, r7730; +} +{ +sub.f16x2 r7930, r7745, r7746; +} +{ +mul.f16x2 r7933, r7930, r7734; +} +{ +add.f16x2 r7936, r7927, r7933; +} +{ +add.f16x2 r7939, r7921, r7936; +} +{ +add.f16x2 r7942, r7751, r7752; +} +{ +mul.f16x2 r7945, r7942, r7728; +} +{ +add.f16x2 r7948, r7754, r7945; +} +{ +add.f16x2 r7951, r7757, r7758; +} +{ +mul.f16x2 r7954, r7951, r7732; +} +{ +add.f16x2 r7957, r7948, r7954; +} +{ +sub.f16x2 r7960, r7739, r7740; +} +{ +mul.f16x2 r7963, r7960, r7730; +} +{ +sub.f16x2 r7966, r7745, r7746; +} +{ +mul.f16x2 r7969, r7966, r7734; +} +{ +add.f16x2 r7972, r7963, r7969; +} +{ +sub.f16x2 r7975, r7957, r7972; +} +{ +add.f16x2 r7978, r7751, r7752; +} +{ +mul.f16x2 r7981, r7978, r7732; +} +{ +add.f16x2 r7984, r7754, r7981; +} +{ +add.f16x2 r7987, r7757, r7758; +} +{ +mul.f16x2 r7990, r7987, r7736; +} +{ +add.f16x2 r7993, r7984, r7990; +} +{ +sub.f16x2 r7996, r7739, r7740; +} +{ +mul.f16x2 r7999, r7996, r7734; +} +{ +sub.f16x2 r8002, r7745, r7746; +} +{ +mul.f16x2 r8005, r8002, r7737; +} +{ +add.f16x2 r8008, r7999, r8005; +} +{ +add.f16x2 r8011, r7993, r8008; +} +{ +add.f16x2 r8014, r7751, r7752; +} +{ +mul.f16x2 r8017, r8014, r7732; +} +{ +add.f16x2 r8020, r7754, r8017; +} +{ +add.f16x2 r8023, r7757, r7758; +} +{ +mul.f16x2 r8026, r8023, r7736; +} +{ +add.f16x2 r8029, r8020, r8026; +} +{ +sub.f16x2 r8032, r7739, r7740; +} +{ +mul.f16x2 r8035, r8032, r7734; +} +{ +sub.f16x2 r8038, r7745, r7746; +} +{ +mul.f16x2 r8041, r8038, r7737; +} +{ +add.f16x2 r8044, r8035, r8041; +} +{ +sub.f16x2 r8047, r8029, r8044; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; 
+cvt.rn.f16.f32 high, f214; +mov.b32 r8050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8051, {low, high}; +} +{ +neg.f16x2 r8052, r8051; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8054, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8055, {low, high}; +} +{ +neg.f16x2 r8056, r8055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8058, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8059, {low, high}; +} +{ +add.f16x2 r8060, r8061, r8062; +} +{ +add.f16x2 r8063, r8064, r8060; +} +{ +add.f16x2 r8066, r8067, r8068; +} +{ +add.f16x2 r8069, r8063, r8066; +} +{ +add.f16x2 r8072, r8073, r8074; +} +{ +add.f16x2 r8075, r8076, r8072; +} +{ +add.f16x2 r8078, r8079, r8080; +} +{ +add.f16x2 r8081, r8075, r8078; +} +{ +add.f16x2 r8084, r8061, r8062; +} +{ +mul.f16x2 r8087, r8084, r8050; +} +{ +add.f16x2 r8090, r8064, r8087; +} +{ +add.f16x2 r8093, r8067, r8068; +} +{ +mul.f16x2 r8096, r8093, r8054; +} +{ +add.f16x2 r8099, r8090, r8096; +} +{ +sub.f16x2 r8102, r8073, r8074; +} +{ +mul.f16x2 r8105, r8102, r8052; +} +{ +sub.f16x2 r8108, r8079, r8080; +} +{ +mul.f16x2 r8111, r8108, r8056; +} +{ +add.f16x2 r8114, r8105, r8111; +} +{ +sub.f16x2 r8117, r8099, r8114; +} +{ +add.f16x2 r8120, r8061, r8062; +} +{ +mul.f16x2 r8123, r8120, r8050; +} +{ +add.f16x2 r8126, r8064, r8123; +} +{ +add.f16x2 r8129, r8067, r8068; +} +{ +mul.f16x2 r8132, r8129, r8054; +} +{ +add.f16x2 r8135, r8126, r8132; +} +{ +sub.f16x2 r8138, r8073, r8074; +} +{ +mul.f16x2 r8141, r8138, r8052; +} +{ +sub.f16x2 r8144, r8079, r8080; +} +{ +mul.f16x2 r8147, r8144, r8056; +} +{ +add.f16x2 r8150, r8141, r8147; +} +{ +add.f16x2 r8153, r8135, r8150; +} +{ +add.f16x2 r8156, r8061, r8062; +} +{ +mul.f16x2 r8159, r8156, r8054; +} +{ +add.f16x2 r8162, 
r8064, r8159; +} +{ +add.f16x2 r8165, r8067, r8068; +} +{ +mul.f16x2 r8168, r8165, r8058; +} +{ +add.f16x2 r8171, r8162, r8168; +} +{ +sub.f16x2 r8174, r8073, r8074; +} +{ +mul.f16x2 r8177, r8174, r8056; +} +{ +sub.f16x2 r8180, r8079, r8080; +} +{ +mul.f16x2 r8183, r8180, r8059; +} +{ +add.f16x2 r8186, r8177, r8183; +} +{ +sub.f16x2 r8189, r8171, r8186; +} +{ +add.f16x2 r8192, r8061, r8062; +} +{ +mul.f16x2 r8195, r8192, r8054; +} +{ +add.f16x2 r8198, r8064, r8195; +} +{ +add.f16x2 r8201, r8067, r8068; +} +{ +mul.f16x2 r8204, r8201, r8058; +} +{ +add.f16x2 r8207, r8198, r8204; +} +{ +sub.f16x2 r8210, r8073, r8074; +} +{ +mul.f16x2 r8213, r8210, r8056; +} +{ +sub.f16x2 r8216, r8079, r8080; +} +{ +mul.f16x2 r8219, r8216, r8059; +} +{ +add.f16x2 r8222, r8213, r8219; +} +{ +add.f16x2 r8225, r8207, r8222; +} +{ +add.f16x2 r8228, r8073, r8074; +} +{ +mul.f16x2 r8231, r8228, r8050; +} +{ +add.f16x2 r8234, r8076, r8231; +} +{ +add.f16x2 r8237, r8079, r8080; +} +{ +mul.f16x2 r8240, r8237, r8054; +} +{ +add.f16x2 r8243, r8234, r8240; +} +{ +sub.f16x2 r8246, r8061, r8062; +} +{ +mul.f16x2 r8249, r8246, r8052; +} +{ +sub.f16x2 r8252, r8067, r8068; +} +{ +mul.f16x2 r8255, r8252, r8056; +} +{ +add.f16x2 r8258, r8249, r8255; +} +{ +add.f16x2 r8261, r8243, r8258; +} +{ +add.f16x2 r8264, r8073, r8074; +} +{ +mul.f16x2 r8267, r8264, r8050; +} +{ +add.f16x2 r8270, r8076, r8267; +} +{ +add.f16x2 r8273, r8079, r8080; +} +{ +mul.f16x2 r8276, r8273, r8054; +} +{ +add.f16x2 r8279, r8270, r8276; +} +{ +sub.f16x2 r8282, r8061, r8062; +} +{ +mul.f16x2 r8285, r8282, r8052; +} +{ +sub.f16x2 r8288, r8067, r8068; +} +{ +mul.f16x2 r8291, r8288, r8056; +} +{ +add.f16x2 r8294, r8285, r8291; +} +{ +sub.f16x2 r8297, r8279, r8294; +} +{ +add.f16x2 r8300, r8073, r8074; +} +{ +mul.f16x2 r8303, r8300, r8054; +} +{ +add.f16x2 r8306, r8076, r8303; +} +{ +add.f16x2 r8309, r8079, r8080; +} +{ +mul.f16x2 r8312, r8309, r8058; +} +{ +add.f16x2 r8315, r8306, r8312; +} +{ +sub.f16x2 r8318, r8061, r8062; +} +{ 
+mul.f16x2 r8321, r8318, r8056; +} +{ +sub.f16x2 r8324, r8067, r8068; +} +{ +mul.f16x2 r8327, r8324, r8059; +} +{ +add.f16x2 r8330, r8321, r8327; +} +{ +add.f16x2 r8333, r8315, r8330; +} +{ +add.f16x2 r8336, r8073, r8074; +} +{ +mul.f16x2 r8339, r8336, r8054; +} +{ +add.f16x2 r8342, r8076, r8339; +} +{ +add.f16x2 r8345, r8079, r8080; +} +{ +mul.f16x2 r8348, r8345, r8058; +} +{ +add.f16x2 r8351, r8342, r8348; +} +{ +sub.f16x2 r8354, r8061, r8062; +} +{ +mul.f16x2 r8357, r8354, r8056; +} +{ +sub.f16x2 r8360, r8067, r8068; +} +{ +mul.f16x2 r8363, r8360, r8059; +} +{ +add.f16x2 r8366, r8357, r8363; +} +{ +sub.f16x2 r8369, r8351, r8366; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8373, {low, high}; +} +{ +neg.f16x2 r8374, r8373; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8377, {low, high}; +} +{ +neg.f16x2 r8378, r8377; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8380, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8381, {low, high}; +} +{ +add.f16x2 r8382, r8383, r8384; +} +{ +add.f16x2 r8385, r8386, r8382; +} +{ +add.f16x2 r8388, r8389, r8390; +} +{ +add.f16x2 r8391, r8385, r8388; +} +{ +add.f16x2 r8394, r8395, r8396; +} +{ +add.f16x2 r8397, r8398, r8394; +} +{ +add.f16x2 r8400, r8401, r8402; +} +{ +add.f16x2 r8403, r8397, r8400; +} +{ +add.f16x2 r8406, r8383, r8384; +} +{ +mul.f16x2 r8409, r8406, r8372; +} +{ +add.f16x2 r8412, r8386, r8409; +} +{ +add.f16x2 r8415, r8389, r8390; +} +{ +mul.f16x2 r8418, r8415, r8376; +} +{ +add.f16x2 r8421, r8412, r8418; +} +{ +sub.f16x2 r8424, r8395, r8396; +} +{ +mul.f16x2 r8427, r8424, r8374; +} +{ +sub.f16x2 
r8430, r8401, r8402; +} +{ +mul.f16x2 r8433, r8430, r8378; +} +{ +add.f16x2 r8436, r8427, r8433; +} +{ +sub.f16x2 r8439, r8421, r8436; +} +{ +add.f16x2 r8442, r8383, r8384; +} +{ +mul.f16x2 r8445, r8442, r8372; +} +{ +add.f16x2 r8448, r8386, r8445; +} +{ +add.f16x2 r8451, r8389, r8390; +} +{ +mul.f16x2 r8454, r8451, r8376; +} +{ +add.f16x2 r8457, r8448, r8454; +} +{ +sub.f16x2 r8460, r8395, r8396; +} +{ +mul.f16x2 r8463, r8460, r8374; +} +{ +sub.f16x2 r8466, r8401, r8402; +} +{ +mul.f16x2 r8469, r8466, r8378; +} +{ +add.f16x2 r8472, r8463, r8469; +} +{ +add.f16x2 r8475, r8457, r8472; +} +{ +add.f16x2 r8478, r8383, r8384; +} +{ +mul.f16x2 r8481, r8478, r8376; +} +{ +add.f16x2 r8484, r8386, r8481; +} +{ +add.f16x2 r8487, r8389, r8390; +} +{ +mul.f16x2 r8490, r8487, r8380; +} +{ +add.f16x2 r8493, r8484, r8490; +} +{ +sub.f16x2 r8496, r8395, r8396; +} +{ +mul.f16x2 r8499, r8496, r8378; +} +{ +sub.f16x2 r8502, r8401, r8402; +} +{ +mul.f16x2 r8505, r8502, r8381; +} +{ +add.f16x2 r8508, r8499, r8505; +} +{ +sub.f16x2 r8511, r8493, r8508; +} +{ +add.f16x2 r8514, r8383, r8384; +} +{ +mul.f16x2 r8517, r8514, r8376; +} +{ +add.f16x2 r8520, r8386, r8517; +} +{ +add.f16x2 r8523, r8389, r8390; +} +{ +mul.f16x2 r8526, r8523, r8380; +} +{ +add.f16x2 r8529, r8520, r8526; +} +{ +sub.f16x2 r8532, r8395, r8396; +} +{ +mul.f16x2 r8535, r8532, r8378; +} +{ +sub.f16x2 r8538, r8401, r8402; +} +{ +mul.f16x2 r8541, r8538, r8381; +} +{ +add.f16x2 r8544, r8535, r8541; +} +{ +add.f16x2 r8547, r8529, r8544; +} +{ +add.f16x2 r8550, r8395, r8396; +} +{ +mul.f16x2 r8553, r8550, r8372; +} +{ +add.f16x2 r8556, r8398, r8553; +} +{ +add.f16x2 r8559, r8401, r8402; +} +{ +mul.f16x2 r8562, r8559, r8376; +} +{ +add.f16x2 r8565, r8556, r8562; +} +{ +sub.f16x2 r8568, r8383, r8384; +} +{ +mul.f16x2 r8571, r8568, r8374; +} +{ +sub.f16x2 r8574, r8389, r8390; +} +{ +mul.f16x2 r8577, r8574, r8378; +} +{ +add.f16x2 r8580, r8571, r8577; +} +{ +add.f16x2 r8583, r8565, r8580; +} +{ +add.f16x2 r8586, r8395, r8396; +} 
+{ +mul.f16x2 r8589, r8586, r8372; +} +{ +add.f16x2 r8592, r8398, r8589; +} +{ +add.f16x2 r8595, r8401, r8402; +} +{ +mul.f16x2 r8598, r8595, r8376; +} +{ +add.f16x2 r8601, r8592, r8598; +} +{ +sub.f16x2 r8604, r8383, r8384; +} +{ +mul.f16x2 r8607, r8604, r8374; +} +{ +sub.f16x2 r8610, r8389, r8390; +} +{ +mul.f16x2 r8613, r8610, r8378; +} +{ +add.f16x2 r8616, r8607, r8613; +} +{ +sub.f16x2 r8619, r8601, r8616; +} +{ +add.f16x2 r8622, r8395, r8396; +} +{ +mul.f16x2 r8625, r8622, r8376; +} +{ +add.f16x2 r8628, r8398, r8625; +} +{ +add.f16x2 r8631, r8401, r8402; +} +{ +mul.f16x2 r8634, r8631, r8380; +} +{ +add.f16x2 r8637, r8628, r8634; +} +{ +sub.f16x2 r8640, r8383, r8384; +} +{ +mul.f16x2 r8643, r8640, r8378; +} +{ +sub.f16x2 r8646, r8389, r8390; +} +{ +mul.f16x2 r8649, r8646, r8381; +} +{ +add.f16x2 r8652, r8643, r8649; +} +{ +add.f16x2 r8655, r8637, r8652; +} +{ +add.f16x2 r8658, r8395, r8396; +} +{ +mul.f16x2 r8661, r8658, r8376; +} +{ +add.f16x2 r8664, r8398, r8661; +} +{ +add.f16x2 r8667, r8401, r8402; +} +{ +mul.f16x2 r8670, r8667, r8380; +} +{ +add.f16x2 r8673, r8664, r8670; +} +{ +sub.f16x2 r8676, r8383, r8384; +} +{ +mul.f16x2 r8679, r8676, r8378; +} +{ +sub.f16x2 r8682, r8389, r8390; +} +{ +mul.f16x2 r8685, r8682, r8381; +} +{ +add.f16x2 r8688, r8679, r8685; +} +{ +sub.f16x2 r8691, r8673, r8688; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8694, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8695, {low, high}; +} +{ +neg.f16x2 r8696, r8695; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r8698, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r8699, {low, high}; +} +{ +neg.f16x2 r8700, r8699; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r8702, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r8703, {low, high}; +} +{ +add.f16x2 r8704, r8705, r8706; +} +{ +add.f16x2 r8707, r8708, r8704; +} +{ +add.f16x2 r8710, r8711, r8712; +} +{ +add.f16x2 r8713, r8707, r8710; +} +{ +add.f16x2 r8716, r8717, r8718; +} +{ +add.f16x2 r8719, r8720, r8716; +} +{ +add.f16x2 r8722, r8723, r8724; +} +{ +add.f16x2 r8725, r8719, r8722; +} +{ +add.f16x2 r8728, r8705, r8706; +} +{ +mul.f16x2 r8731, r8728, r8694; +} +{ +add.f16x2 r8734, r8708, r8731; +} +{ +add.f16x2 r8737, r8711, r8712; +} +{ +mul.f16x2 r8740, r8737, r8698; +} +{ +add.f16x2 r8743, r8734, r8740; +} +{ +sub.f16x2 r8746, r8717, r8718; +} +{ +mul.f16x2 r8749, r8746, r8696; +} +{ +sub.f16x2 r8752, r8723, r8724; +} +{ +mul.f16x2 r8755, r8752, r8700; +} +{ +add.f16x2 r8758, r8749, r8755; +} +{ +sub.f16x2 r8761, r8743, r8758; +} +{ +add.f16x2 r8764, r8705, r8706; +} +{ +mul.f16x2 r8767, r8764, r8694; +} +{ +add.f16x2 r8770, r8708, r8767; +} +{ +add.f16x2 r8773, r8711, r8712; +} +{ +mul.f16x2 r8776, r8773, r8698; +} +{ +add.f16x2 r8779, r8770, r8776; +} +{ +sub.f16x2 r8782, r8717, r8718; +} +{ +mul.f16x2 r8785, r8782, r8696; +} +{ +sub.f16x2 r8788, r8723, r8724; +} +{ +mul.f16x2 r8791, r8788, r8700; +} +{ +add.f16x2 r8794, r8785, r8791; +} +{ +add.f16x2 r8797, r8779, r8794; +} +{ +add.f16x2 r8800, r8705, r8706; +} +{ +mul.f16x2 r8803, r8800, r8698; +} +{ +add.f16x2 r8806, r8708, r8803; +} +{ +add.f16x2 r8809, r8711, r8712; +} +{ +mul.f16x2 r8812, r8809, r8702; +} +{ +add.f16x2 r8815, r8806, r8812; +} +{ +sub.f16x2 r8818, r8717, r8718; +} +{ +mul.f16x2 r8821, r8818, r8700; +} +{ +sub.f16x2 r8824, r8723, r8724; +} +{ +mul.f16x2 r8827, r8824, r8703; +} +{ +add.f16x2 r8830, r8821, r8827; +} +{ +sub.f16x2 r8833, r8815, r8830; +} +{ +add.f16x2 r8836, r8705, r8706; +} +{ +mul.f16x2 r8839, r8836, r8698; +} +{ +add.f16x2 r8842, r8708, r8839; +} +{ +add.f16x2 r8845, r8711, r8712; +} +{ +mul.f16x2 r8848, r8845, r8702; +} +{ +add.f16x2 r8851, r8842, r8848; +} +{ +sub.f16x2 r8854, r8717, 
r8718; +} +{ +mul.f16x2 r8857, r8854, r8700; +} +{ +sub.f16x2 r8860, r8723, r8724; +} +{ +mul.f16x2 r8863, r8860, r8703; +} +{ +add.f16x2 r8866, r8857, r8863; +} +{ +add.f16x2 r8869, r8851, r8866; +} +{ +add.f16x2 r8872, r8717, r8718; +} +{ +mul.f16x2 r8875, r8872, r8694; +} +{ +add.f16x2 r8878, r8720, r8875; +} +{ +add.f16x2 r8881, r8723, r8724; +} +{ +mul.f16x2 r8884, r8881, r8698; +} +{ +add.f16x2 r8887, r8878, r8884; +} +{ +sub.f16x2 r8890, r8705, r8706; +} +{ +mul.f16x2 r8893, r8890, r8696; +} +{ +sub.f16x2 r8896, r8711, r8712; +} +{ +mul.f16x2 r8899, r8896, r8700; +} +{ +add.f16x2 r8902, r8893, r8899; +} +{ +add.f16x2 r8905, r8887, r8902; +} +{ +add.f16x2 r8908, r8717, r8718; +} +{ +mul.f16x2 r8911, r8908, r8694; +} +{ +add.f16x2 r8914, r8720, r8911; +} +{ +add.f16x2 r8917, r8723, r8724; +} +{ +mul.f16x2 r8920, r8917, r8698; +} +{ +add.f16x2 r8923, r8914, r8920; +} +{ +sub.f16x2 r8926, r8705, r8706; +} +{ +mul.f16x2 r8929, r8926, r8696; +} +{ +sub.f16x2 r8932, r8711, r8712; +} +{ +mul.f16x2 r8935, r8932, r8700; +} +{ +add.f16x2 r8938, r8929, r8935; +} +{ +sub.f16x2 r8941, r8923, r8938; +} +{ +add.f16x2 r8944, r8717, r8718; +} +{ +mul.f16x2 r8947, r8944, r8698; +} +{ +add.f16x2 r8950, r8720, r8947; +} +{ +add.f16x2 r8953, r8723, r8724; +} +{ +mul.f16x2 r8956, r8953, r8702; +} +{ +add.f16x2 r8959, r8950, r8956; +} +{ +sub.f16x2 r8962, r8705, r8706; +} +{ +mul.f16x2 r8965, r8962, r8700; +} +{ +sub.f16x2 r8968, r8711, r8712; +} +{ +mul.f16x2 r8971, r8968, r8703; +} +{ +add.f16x2 r8974, r8965, r8971; +} +{ +add.f16x2 r8977, r8959, r8974; +} +{ +add.f16x2 r8980, r8717, r8718; +} +{ +mul.f16x2 r8983, r8980, r8698; +} +{ +add.f16x2 r8986, r8720, r8983; +} +{ +add.f16x2 r8989, r8723, r8724; +} +{ +mul.f16x2 r8992, r8989, r8702; +} +{ +add.f16x2 r8995, r8986, r8992; +} +{ +sub.f16x2 r8998, r8705, r8706; +} +{ +mul.f16x2 r9001, r8998, r8700; +} +{ +sub.f16x2 r9004, r8711, r8712; +} +{ +mul.f16x2 r9007, r9004, r8703; +} +{ +add.f16x2 r9010, r9001, r9007; +} +{ +sub.f16x2 
r9013, r8995, r9010; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9016, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9017, {low, high}; +} +{ +neg.f16x2 r9018, r9017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9021, {low, high}; +} +{ +neg.f16x2 r9022, r9021; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9024, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9025, {low, high}; +} +{ +add.f16x2 r9026, r9027, r9028; +} +{ +add.f16x2 r9029, r9030, r9026; +} +{ +add.f16x2 r9032, r9033, r9034; +} +{ +add.f16x2 r9035, r9029, r9032; +} +{ +add.f16x2 r9038, r9039, r9040; +} +{ +add.f16x2 r9041, r9042, r9038; +} +{ +add.f16x2 r9044, r9045, r9046; +} +{ +add.f16x2 r9047, r9041, r9044; +} +{ +add.f16x2 r9050, r9027, r9028; +} +{ +mul.f16x2 r9053, r9050, r9016; +} +{ +add.f16x2 r9056, r9030, r9053; +} +{ +add.f16x2 r9059, r9033, r9034; +} +{ +mul.f16x2 r9062, r9059, r9020; +} +{ +add.f16x2 r9065, r9056, r9062; +} +{ +sub.f16x2 r9068, r9039, r9040; +} +{ +mul.f16x2 r9071, r9068, r9018; +} +{ +sub.f16x2 r9074, r9045, r9046; +} +{ +mul.f16x2 r9077, r9074, r9022; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +sub.f16x2 r9083, r9065, r9080; +} +{ +add.f16x2 r9086, r9027, r9028; +} +{ +mul.f16x2 r9089, r9086, r9016; +} +{ +add.f16x2 r9092, r9030, r9089; +} +{ +add.f16x2 r9095, r9033, r9034; +} +{ +mul.f16x2 r9098, r9095, r9020; +} +{ +add.f16x2 r9101, r9092, r9098; +} +{ +sub.f16x2 r9104, r9039, r9040; +} +{ +mul.f16x2 r9107, r9104, r9018; +} +{ +sub.f16x2 r9110, r9045, r9046; +} +{ +mul.f16x2 r9113, r9110, r9022; +} +{ +add.f16x2 r9116, r9107, r9113; +} +{ +add.f16x2 r9119, r9101, r9116; +} +{ +add.f16x2 r9122, 
r9027, r9028; +} +{ +mul.f16x2 r9125, r9122, r9020; +} +{ +add.f16x2 r9128, r9030, r9125; +} +{ +add.f16x2 r9131, r9033, r9034; +} +{ +mul.f16x2 r9134, r9131, r9024; +} +{ +add.f16x2 r9137, r9128, r9134; +} +{ +sub.f16x2 r9140, r9039, r9040; +} +{ +mul.f16x2 r9143, r9140, r9022; +} +{ +sub.f16x2 r9146, r9045, r9046; +} +{ +mul.f16x2 r9149, r9146, r9025; +} +{ +add.f16x2 r9152, r9143, r9149; +} +{ +sub.f16x2 r9155, r9137, r9152; +} +{ +add.f16x2 r9158, r9027, r9028; +} +{ +mul.f16x2 r9161, r9158, r9020; +} +{ +add.f16x2 r9164, r9030, r9161; +} +{ +add.f16x2 r9167, r9033, r9034; +} +{ +mul.f16x2 r9170, r9167, r9024; +} +{ +add.f16x2 r9173, r9164, r9170; +} +{ +sub.f16x2 r9176, r9039, r9040; +} +{ +mul.f16x2 r9179, r9176, r9022; +} +{ +sub.f16x2 r9182, r9045, r9046; +} +{ +mul.f16x2 r9185, r9182, r9025; +} +{ +add.f16x2 r9188, r9179, r9185; +} +{ +add.f16x2 r9191, r9173, r9188; +} +{ +add.f16x2 r9194, r9039, r9040; +} +{ +mul.f16x2 r9197, r9194, r9016; +} +{ +add.f16x2 r9200, r9042, r9197; +} +{ +add.f16x2 r9203, r9045, r9046; +} +{ +mul.f16x2 r9206, r9203, r9020; +} +{ +add.f16x2 r9209, r9200, r9206; +} +{ +sub.f16x2 r9212, r9027, r9028; +} +{ +mul.f16x2 r9215, r9212, r9018; +} +{ +sub.f16x2 r9218, r9033, r9034; +} +{ +mul.f16x2 r9221, r9218, r9022; +} +{ +add.f16x2 r9224, r9215, r9221; +} +{ +add.f16x2 r9227, r9209, r9224; +} +{ +add.f16x2 r9230, r9039, r9040; +} +{ +mul.f16x2 r9233, r9230, r9016; +} +{ +add.f16x2 r9236, r9042, r9233; +} +{ +add.f16x2 r9239, r9045, r9046; +} +{ +mul.f16x2 r9242, r9239, r9020; +} +{ +add.f16x2 r9245, r9236, r9242; +} +{ +sub.f16x2 r9248, r9027, r9028; +} +{ +mul.f16x2 r9251, r9248, r9018; +} +{ +sub.f16x2 r9254, r9033, r9034; +} +{ +mul.f16x2 r9257, r9254, r9022; +} +{ +add.f16x2 r9260, r9251, r9257; +} +{ +sub.f16x2 r9263, r9245, r9260; +} +{ +add.f16x2 r9266, r9039, r9040; +} +{ +mul.f16x2 r9269, r9266, r9020; +} +{ +add.f16x2 r9272, r9042, r9269; +} +{ +add.f16x2 r9275, r9045, r9046; +} +{ +mul.f16x2 r9278, r9275, r9024; +} +{ 
+add.f16x2 r9281, r9272, r9278; +} +{ +sub.f16x2 r9284, r9027, r9028; +} +{ +mul.f16x2 r9287, r9284, r9022; +} +{ +sub.f16x2 r9290, r9033, r9034; +} +{ +mul.f16x2 r9293, r9290, r9025; +} +{ +add.f16x2 r9296, r9287, r9293; +} +{ +add.f16x2 r9299, r9281, r9296; +} +{ +add.f16x2 r9302, r9039, r9040; +} +{ +mul.f16x2 r9305, r9302, r9020; +} +{ +add.f16x2 r9308, r9042, r9305; +} +{ +add.f16x2 r9311, r9045, r9046; +} +{ +mul.f16x2 r9314, r9311, r9024; +} +{ +add.f16x2 r9317, r9308, r9314; +} +{ +sub.f16x2 r9320, r9027, r9028; +} +{ +mul.f16x2 r9323, r9320, r9022; +} +{ +sub.f16x2 r9326, r9033, r9034; +} +{ +mul.f16x2 r9329, r9326, r9025; +} +{ +add.f16x2 r9332, r9323, r9329; +} +{ +sub.f16x2 r9335, r9317, r9332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r9338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r9340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f68; +cvt.rn.f16.f32 high, f68; +mov.b32 r9341, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f70; +mov.b32 r9342, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f72; +mov.b32 r9343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r9344, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r9345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r9348, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r9349, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r9352, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 
high, f92; +mov.b32 r9353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9354, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r9355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r9361, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9368, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r9369, {low, high}; +} +{ +mul.f16x2 r9386, r8117, r9338; +} +{ +mul.f16x2 r9389, r8261, r9339; +} +{ +sub.f16x2 r9392, r9386, r9389; +} +{ +mul.f16x2 r9395, r8117, r9339; +} +{ +fma.rn.f16x2 r9398, r8261, r9338, r9395; +} +{ +mul.f16x2 r9402, r8439, r9340; +} +{ +mul.f16x2 r9405, r8583, r9341; +} +{ +sub.f16x2 r9408, r9402, r9405; +} +{ +mul.f16x2 r9411, r8439, r9341; +} +{ +fma.rn.f16x2 r9414, r8583, r9340, r9411; +} +{ +mul.f16x2 r9418, r8761, r9342; +} +{ +mul.f16x2 r9421, r8905, r9343; +} +{ +sub.f16x2 r9424, r9418, r9421; +} +{ +mul.f16x2 r9427, r8761, r9343; +} +{ +fma.rn.f16x2 r9430, r8905, r9342, r9427; +} +{ +mul.f16x2 r9434, r9083, r9344; +} +{ +mul.f16x2 r9437, r9227, r9345; +} +{ +sub.f16x2 r9440, r9434, r9437; +} +{ +mul.f16x2 r9443, r9083, r9345; +} +{ +fma.rn.f16x2 r9446, r9227, r9344, r9443; +} +{ +mul.f16x2 r9450, r8189, r9340; +} +{ +mul.f16x2 r9453, r8333, r9341; +} +{ +sub.f16x2 r9456, r9450, r9453; +} +{ +mul.f16x2 r9459, r8189, r9341; +} +{ +fma.rn.f16x2 r9462, r8333, r9340, r9459; +} +{ +mul.f16x2 r9466, r8511, r9344; +} +{ +mul.f16x2 r9469, r8655, r9345; +} +{ +sub.f16x2 r9472, r9466, r9469; +} +{ +mul.f16x2 r9475, r8511, r9345; +} +{ +fma.rn.f16x2 r9478, r8655, r9344, r9475; +} +{ +mul.f16x2 r9482, r8833, r9348; +} +{ +mul.f16x2 r9485, r8977, r9349; +} +{ 
+sub.f16x2 r9488, r9482, r9485; +} +{ +mul.f16x2 r9491, r8833, r9349; +} +{ +fma.rn.f16x2 r9494, r8977, r9348, r9491; +} +{ +mul.f16x2 r9498, r9155, r9352; +} +{ +mul.f16x2 r9501, r9299, r9353; +} +{ +sub.f16x2 r9504, r9498, r9501; +} +{ +mul.f16x2 r9507, r9155, r9353; +} +{ +fma.rn.f16x2 r9510, r9299, r9352, r9507; +} +{ +mul.f16x2 r9514, r8225, r9342; +} +{ +mul.f16x2 r9517, r8369, r9343; +} +{ +sub.f16x2 r9520, r9514, r9517; +} +{ +mul.f16x2 r9523, r8225, r9343; +} +{ +fma.rn.f16x2 r9526, r8369, r9342, r9523; +} +{ +mul.f16x2 r9530, r8547, r9348; +} +{ +mul.f16x2 r9533, r8691, r9349; +} +{ +sub.f16x2 r9536, r9530, r9533; +} +{ +mul.f16x2 r9539, r8547, r9349; +} +{ +fma.rn.f16x2 r9542, r8691, r9348, r9539; +} +{ +mul.f16x2 r9546, r8869, r9354; +} +{ +mul.f16x2 r9549, r9013, r9355; +} +{ +sub.f16x2 r9552, r9546, r9549; +} +{ +mul.f16x2 r9555, r8869, r9355; +} +{ +fma.rn.f16x2 r9558, r9013, r9354, r9555; +} +{ +mul.f16x2 r9562, r9191, r9360; +} +{ +mul.f16x2 r9565, r9335, r9361; +} +{ +sub.f16x2 r9568, r9562, r9565; +} +{ +mul.f16x2 r9571, r9191, r9361; +} +{ +fma.rn.f16x2 r9574, r9335, r9360, r9571; +} +{ +mul.f16x2 r9578, r8153, r9344; +} +{ +mul.f16x2 r9581, r8297, r9345; +} +{ +sub.f16x2 r9584, r9578, r9581; +} +{ +mul.f16x2 r9587, r8153, r9345; +} +{ +fma.rn.f16x2 r9590, r8297, r9344, r9587; +} +{ +mul.f16x2 r9594, r8475, r9352; +} +{ +mul.f16x2 r9597, r8619, r9353; +} +{ +sub.f16x2 r9600, r9594, r9597; +} +{ +mul.f16x2 r9603, r8475, r9353; +} +{ +fma.rn.f16x2 r9606, r8619, r9352, r9603; +} +{ +mul.f16x2 r9610, r8797, r9360; +} +{ +mul.f16x2 r9613, r8941, r9361; +} +{ +sub.f16x2 r9616, r9610, r9613; +} +{ +mul.f16x2 r9619, r8797, r9361; +} +{ +fma.rn.f16x2 r9622, r8941, r9360, r9619; +} +{ +mul.f16x2 r9626, r9119, r9368; +} +{ +mul.f16x2 r9629, r9263, r9369; +} +{ +sub.f16x2 r9632, r9626, r9629; +} +{ +mul.f16x2 r9635, r9119, r9369; +} +{ +fma.rn.f16x2 r9638, r9263, r9368, r9635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, 
f214; +mov.b32 r9642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9643, {low, high}; +} +{ +neg.f16x2 r9644, r9643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9647, {low, high}; +} +{ +neg.f16x2 r9648, r9647; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9651, {low, high}; +} +{ +add.f16x2 r9652, r8069, r9035; +} +{ +add.f16x2 r9655, r7747, r9652; +} +{ +add.f16x2 r9658, r8391, r8713; +} +{ +add.f16x2 %0, r9655, r9658; +} +{ +add.f16x2 r9664, r8081, r9047; +} +{ +add.f16x2 r9667, r7759, r9664; +} +{ +add.f16x2 r9670, r8403, r8725; +} +{ +add.f16x2 %1, r9667, r9670; +} +{ +add.f16x2 r9676, r8069, r9035; +} +{ +mul.f16x2 r9679, r9676, r9642; +} +{ +add.f16x2 r9682, r7747, r9679; +} +{ +add.f16x2 r9685, r8391, r8713; +} +{ +mul.f16x2 r9688, r9685, r9646; +} +{ +add.f16x2 r9691, r9682, r9688; +} +{ +sub.f16x2 r9694, r8081, r9047; +} +{ +mul.f16x2 r9697, r9694, r9644; +} +{ +sub.f16x2 r9700, r8403, r8725; +} +{ +mul.f16x2 r9703, r9700, r9648; +} +{ +add.f16x2 r9706, r9697, r9703; +} +{ +sub.f16x2 %10, r9691, r9706; +} +{ +add.f16x2 r9712, r8069, r9035; +} +{ +mul.f16x2 r9715, r9712, r9642; +} +{ +add.f16x2 r9718, r7747, r9715; +} +{ +add.f16x2 r9721, r8391, r8713; +} +{ +mul.f16x2 r9724, r9721, r9646; +} +{ +add.f16x2 r9727, r9718, r9724; +} +{ +sub.f16x2 r9730, r8081, r9047; +} +{ +mul.f16x2 r9733, r9730, r9644; +} +{ +sub.f16x2 r9736, r8403, r8725; +} +{ +mul.f16x2 r9739, r9736, r9648; +} +{ +add.f16x2 r9742, r9733, r9739; +} +{ +add.f16x2 %40, r9727, r9742; +} +{ +add.f16x2 r9748, r8069, r9035; +} +{ +mul.f16x2 r9751, r9748, r9646; +} +{ +add.f16x2 r9754, r7747, r9751; +} +{ +add.f16x2 
r9757, r8391, r8713; +} +{ +mul.f16x2 r9760, r9757, r9650; +} +{ +add.f16x2 r9763, r9754, r9760; +} +{ +sub.f16x2 r9766, r8081, r9047; +} +{ +mul.f16x2 r9769, r9766, r9648; +} +{ +sub.f16x2 r9772, r8403, r8725; +} +{ +mul.f16x2 r9775, r9772, r9651; +} +{ +add.f16x2 r9778, r9769, r9775; +} +{ +sub.f16x2 %20, r9763, r9778; +} +{ +add.f16x2 r9784, r8069, r9035; +} +{ +mul.f16x2 r9787, r9784, r9646; +} +{ +add.f16x2 r9790, r7747, r9787; +} +{ +add.f16x2 r9793, r8391, r8713; +} +{ +mul.f16x2 r9796, r9793, r9650; +} +{ +add.f16x2 r9799, r9790, r9796; +} +{ +sub.f16x2 r9802, r8081, r9047; +} +{ +mul.f16x2 r9805, r9802, r9648; +} +{ +sub.f16x2 r9808, r8403, r8725; +} +{ +mul.f16x2 r9811, r9808, r9651; +} +{ +add.f16x2 r9814, r9805, r9811; +} +{ +add.f16x2 %30, r9799, r9814; +} +{ +add.f16x2 r9820, r8081, r9047; +} +{ +mul.f16x2 r9823, r9820, r9642; +} +{ +add.f16x2 r9826, r7759, r9823; +} +{ +add.f16x2 r9829, r8403, r8725; +} +{ +mul.f16x2 r9832, r9829, r9646; +} +{ +add.f16x2 r9835, r9826, r9832; +} +{ +sub.f16x2 r9838, r8069, r9035; +} +{ +mul.f16x2 r9841, r9838, r9644; +} +{ +sub.f16x2 r9844, r8391, r8713; +} +{ +mul.f16x2 r9847, r9844, r9648; +} +{ +add.f16x2 r9850, r9841, r9847; +} +{ +add.f16x2 %11, r9835, r9850; +} +{ +add.f16x2 r9856, r8081, r9047; +} +{ +mul.f16x2 r9859, r9856, r9642; +} +{ +add.f16x2 r9862, r7759, r9859; +} +{ +add.f16x2 r9865, r8403, r8725; +} +{ +mul.f16x2 r9868, r9865, r9646; +} +{ +add.f16x2 r9871, r9862, r9868; +} +{ +sub.f16x2 r9874, r8069, r9035; +} +{ +mul.f16x2 r9877, r9874, r9644; +} +{ +sub.f16x2 r9880, r8391, r8713; +} +{ +mul.f16x2 r9883, r9880, r9648; +} +{ +add.f16x2 r9886, r9877, r9883; +} +{ +sub.f16x2 %41, r9871, r9886; +} +{ +add.f16x2 r9892, r8081, r9047; +} +{ +mul.f16x2 r9895, r9892, r9646; +} +{ +add.f16x2 r9898, r7759, r9895; +} +{ +add.f16x2 r9901, r8403, r8725; +} +{ +mul.f16x2 r9904, r9901, r9650; +} +{ +add.f16x2 r9907, r9898, r9904; +} +{ +sub.f16x2 r9910, r8069, r9035; +} +{ +mul.f16x2 r9913, r9910, r9648; +} +{ 
+sub.f16x2 r9916, r8391, r8713; +} +{ +mul.f16x2 r9919, r9916, r9651; +} +{ +add.f16x2 r9922, r9913, r9919; +} +{ +add.f16x2 %21, r9907, r9922; +} +{ +add.f16x2 r9928, r8081, r9047; +} +{ +mul.f16x2 r9931, r9928, r9646; +} +{ +add.f16x2 r9934, r7759, r9931; +} +{ +add.f16x2 r9937, r8403, r8725; +} +{ +mul.f16x2 r9940, r9937, r9650; +} +{ +add.f16x2 r9943, r9934, r9940; +} +{ +sub.f16x2 r9946, r8069, r9035; +} +{ +mul.f16x2 r9949, r9946, r9648; +} +{ +sub.f16x2 r9952, r8391, r8713; +} +{ +mul.f16x2 r9955, r9952, r9651; +} +{ +add.f16x2 r9958, r9949, r9955; +} +{ +sub.f16x2 %31, r9943, r9958; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9965, {low, high}; +} +{ +neg.f16x2 r9966, r9965; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r9968, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r9969, {low, high}; +} +{ +neg.f16x2 r9970, r9969; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r9972, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r9973, {low, high}; +} +{ +add.f16x2 r9974, r9392, r9440; +} +{ +add.f16x2 r9977, r7795, r9974; +} +{ +add.f16x2 r9980, r9408, r9424; +} +{ +add.f16x2 %2, r9977, r9980; +} +{ +add.f16x2 r9986, r9398, r9446; +} +{ +add.f16x2 r9989, r7939, r9986; +} +{ +add.f16x2 r9992, r9414, r9430; +} +{ +add.f16x2 %3, r9989, r9992; +} +{ +add.f16x2 r9998, r9392, r9440; +} +{ +mul.f16x2 r10001, r9998, r9964; +} +{ +add.f16x2 r10004, r7795, r10001; +} +{ +add.f16x2 r10007, r9408, r9424; +} +{ +mul.f16x2 r10010, r10007, r9968; +} +{ +add.f16x2 r10013, r10004, r10010; +} +{ +sub.f16x2 r10016, r9398, r9446; +} +{ +mul.f16x2 r10019, r10016, r9966; +} +{ +sub.f16x2 r10022, r9414, r9430; +} +{ 
+mul.f16x2 r10025, r10022, r9970; +} +{ +add.f16x2 r10028, r10019, r10025; +} +{ +sub.f16x2 %12, r10013, r10028; +} +{ +add.f16x2 r10034, r9392, r9440; +} +{ +mul.f16x2 r10037, r10034, r9964; +} +{ +add.f16x2 r10040, r7795, r10037; +} +{ +add.f16x2 r10043, r9408, r9424; +} +{ +mul.f16x2 r10046, r10043, r9968; +} +{ +add.f16x2 r10049, r10040, r10046; +} +{ +sub.f16x2 r10052, r9398, r9446; +} +{ +mul.f16x2 r10055, r10052, r9966; +} +{ +sub.f16x2 r10058, r9414, r9430; +} +{ +mul.f16x2 r10061, r10058, r9970; +} +{ +add.f16x2 r10064, r10055, r10061; +} +{ +add.f16x2 %42, r10049, r10064; +} +{ +add.f16x2 r10070, r9392, r9440; +} +{ +mul.f16x2 r10073, r10070, r9968; +} +{ +add.f16x2 r10076, r7795, r10073; +} +{ +add.f16x2 r10079, r9408, r9424; +} +{ +mul.f16x2 r10082, r10079, r9972; +} +{ +add.f16x2 r10085, r10076, r10082; +} +{ +sub.f16x2 r10088, r9398, r9446; +} +{ +mul.f16x2 r10091, r10088, r9970; +} +{ +sub.f16x2 r10094, r9414, r9430; +} +{ +mul.f16x2 r10097, r10094, r9973; +} +{ +add.f16x2 r10100, r10091, r10097; +} +{ +sub.f16x2 %22, r10085, r10100; +} +{ +add.f16x2 r10106, r9392, r9440; +} +{ +mul.f16x2 r10109, r10106, r9968; +} +{ +add.f16x2 r10112, r7795, r10109; +} +{ +add.f16x2 r10115, r9408, r9424; +} +{ +mul.f16x2 r10118, r10115, r9972; +} +{ +add.f16x2 r10121, r10112, r10118; +} +{ +sub.f16x2 r10124, r9398, r9446; +} +{ +mul.f16x2 r10127, r10124, r9970; +} +{ +sub.f16x2 r10130, r9414, r9430; +} +{ +mul.f16x2 r10133, r10130, r9973; +} +{ +add.f16x2 r10136, r10127, r10133; +} +{ +add.f16x2 %32, r10121, r10136; +} +{ +add.f16x2 r10142, r9398, r9446; +} +{ +mul.f16x2 r10145, r10142, r9964; +} +{ +add.f16x2 r10148, r7939, r10145; +} +{ +add.f16x2 r10151, r9414, r9430; +} +{ +mul.f16x2 r10154, r10151, r9968; +} +{ +add.f16x2 r10157, r10148, r10154; +} +{ +sub.f16x2 r10160, r9392, r9440; +} +{ +mul.f16x2 r10163, r10160, r9966; +} +{ +sub.f16x2 r10166, r9408, r9424; +} +{ +mul.f16x2 r10169, r10166, r9970; +} +{ +add.f16x2 r10172, r10163, r10169; +} +{ +add.f16x2 
%13, r10157, r10172; +} +{ +add.f16x2 r10178, r9398, r9446; +} +{ +mul.f16x2 r10181, r10178, r9964; +} +{ +add.f16x2 r10184, r7939, r10181; +} +{ +add.f16x2 r10187, r9414, r9430; +} +{ +mul.f16x2 r10190, r10187, r9968; +} +{ +add.f16x2 r10193, r10184, r10190; +} +{ +sub.f16x2 r10196, r9392, r9440; +} +{ +mul.f16x2 r10199, r10196, r9966; +} +{ +sub.f16x2 r10202, r9408, r9424; +} +{ +mul.f16x2 r10205, r10202, r9970; +} +{ +add.f16x2 r10208, r10199, r10205; +} +{ +sub.f16x2 %43, r10193, r10208; +} +{ +add.f16x2 r10214, r9398, r9446; +} +{ +mul.f16x2 r10217, r10214, r9968; +} +{ +add.f16x2 r10220, r7939, r10217; +} +{ +add.f16x2 r10223, r9414, r9430; +} +{ +mul.f16x2 r10226, r10223, r9972; +} +{ +add.f16x2 r10229, r10220, r10226; +} +{ +sub.f16x2 r10232, r9392, r9440; +} +{ +mul.f16x2 r10235, r10232, r9970; +} +{ +sub.f16x2 r10238, r9408, r9424; +} +{ +mul.f16x2 r10241, r10238, r9973; +} +{ +add.f16x2 r10244, r10235, r10241; +} +{ +add.f16x2 %23, r10229, r10244; +} +{ +add.f16x2 r10250, r9398, r9446; +} +{ +mul.f16x2 r10253, r10250, r9968; +} +{ +add.f16x2 r10256, r7939, r10253; +} +{ +add.f16x2 r10259, r9414, r9430; +} +{ +mul.f16x2 r10262, r10259, r9972; +} +{ +add.f16x2 r10265, r10256, r10262; +} +{ +sub.f16x2 r10268, r9392, r9440; +} +{ +mul.f16x2 r10271, r10268, r9970; +} +{ +sub.f16x2 r10274, r9408, r9424; +} +{ +mul.f16x2 r10277, r10274, r9973; +} +{ +add.f16x2 r10280, r10271, r10277; +} +{ +sub.f16x2 %33, r10265, r10280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10287, {low, high}; +} +{ +neg.f16x2 r10288, r10287; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10290, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10291, {low, high}; +} +{ +neg.f16x2 r10292, r10291; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10294, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10295, {low, high}; +} +{ +add.f16x2 r10296, r9456, r9504; +} +{ +add.f16x2 r10299, r7867, r10296; +} +{ +add.f16x2 r10302, r9472, r9488; +} +{ +add.f16x2 %4, r10299, r10302; +} +{ +add.f16x2 r10308, r9462, r9510; +} +{ +add.f16x2 r10311, r8011, r10308; +} +{ +add.f16x2 r10314, r9478, r9494; +} +{ +add.f16x2 %5, r10311, r10314; +} +{ +add.f16x2 r10320, r9456, r9504; +} +{ +mul.f16x2 r10323, r10320, r10286; +} +{ +add.f16x2 r10326, r7867, r10323; +} +{ +add.f16x2 r10329, r9472, r9488; +} +{ +mul.f16x2 r10332, r10329, r10290; +} +{ +add.f16x2 r10335, r10326, r10332; +} +{ +sub.f16x2 r10338, r9462, r9510; +} +{ +mul.f16x2 r10341, r10338, r10288; +} +{ +sub.f16x2 r10344, r9478, r9494; +} +{ +mul.f16x2 r10347, r10344, r10292; +} +{ +add.f16x2 r10350, r10341, r10347; +} +{ +sub.f16x2 %14, r10335, r10350; +} +{ +add.f16x2 r10356, r9456, r9504; +} +{ +mul.f16x2 r10359, r10356, r10286; +} +{ +add.f16x2 r10362, r7867, r10359; +} +{ +add.f16x2 r10365, r9472, r9488; +} +{ +mul.f16x2 r10368, r10365, r10290; +} +{ +add.f16x2 r10371, r10362, r10368; +} +{ +sub.f16x2 r10374, r9462, r9510; +} +{ +mul.f16x2 r10377, r10374, r10288; +} +{ +sub.f16x2 r10380, r9478, r9494; +} +{ +mul.f16x2 r10383, r10380, r10292; +} +{ +add.f16x2 r10386, r10377, r10383; +} +{ +add.f16x2 %44, r10371, r10386; +} +{ +add.f16x2 r10392, r9456, r9504; +} +{ +mul.f16x2 r10395, r10392, r10290; +} +{ +add.f16x2 r10398, r7867, r10395; +} +{ +add.f16x2 r10401, r9472, r9488; +} +{ +mul.f16x2 r10404, r10401, r10294; +} +{ +add.f16x2 r10407, r10398, r10404; +} +{ +sub.f16x2 r10410, r9462, r9510; +} +{ +mul.f16x2 r10413, r10410, r10292; +} +{ +sub.f16x2 r10416, r9478, r9494; +} +{ +mul.f16x2 r10419, r10416, r10295; +} +{ +add.f16x2 r10422, r10413, r10419; +} +{ +sub.f16x2 %24, r10407, r10422; +} +{ +add.f16x2 r10428, r9456, r9504; +} +{ 
+mul.f16x2 r10431, r10428, r10290; +} +{ +add.f16x2 r10434, r7867, r10431; +} +{ +add.f16x2 r10437, r9472, r9488; +} +{ +mul.f16x2 r10440, r10437, r10294; +} +{ +add.f16x2 r10443, r10434, r10440; +} +{ +sub.f16x2 r10446, r9462, r9510; +} +{ +mul.f16x2 r10449, r10446, r10292; +} +{ +sub.f16x2 r10452, r9478, r9494; +} +{ +mul.f16x2 r10455, r10452, r10295; +} +{ +add.f16x2 r10458, r10449, r10455; +} +{ +add.f16x2 %34, r10443, r10458; +} +{ +add.f16x2 r10464, r9462, r9510; +} +{ +mul.f16x2 r10467, r10464, r10286; +} +{ +add.f16x2 r10470, r8011, r10467; +} +{ +add.f16x2 r10473, r9478, r9494; +} +{ +mul.f16x2 r10476, r10473, r10290; +} +{ +add.f16x2 r10479, r10470, r10476; +} +{ +sub.f16x2 r10482, r9456, r9504; +} +{ +mul.f16x2 r10485, r10482, r10288; +} +{ +sub.f16x2 r10488, r9472, r9488; +} +{ +mul.f16x2 r10491, r10488, r10292; +} +{ +add.f16x2 r10494, r10485, r10491; +} +{ +add.f16x2 %15, r10479, r10494; +} +{ +add.f16x2 r10500, r9462, r9510; +} +{ +mul.f16x2 r10503, r10500, r10286; +} +{ +add.f16x2 r10506, r8011, r10503; +} +{ +add.f16x2 r10509, r9478, r9494; +} +{ +mul.f16x2 r10512, r10509, r10290; +} +{ +add.f16x2 r10515, r10506, r10512; +} +{ +sub.f16x2 r10518, r9456, r9504; +} +{ +mul.f16x2 r10521, r10518, r10288; +} +{ +sub.f16x2 r10524, r9472, r9488; +} +{ +mul.f16x2 r10527, r10524, r10292; +} +{ +add.f16x2 r10530, r10521, r10527; +} +{ +sub.f16x2 %45, r10515, r10530; +} +{ +add.f16x2 r10536, r9462, r9510; +} +{ +mul.f16x2 r10539, r10536, r10290; +} +{ +add.f16x2 r10542, r8011, r10539; +} +{ +add.f16x2 r10545, r9478, r9494; +} +{ +mul.f16x2 r10548, r10545, r10294; +} +{ +add.f16x2 r10551, r10542, r10548; +} +{ +sub.f16x2 r10554, r9456, r9504; +} +{ +mul.f16x2 r10557, r10554, r10292; +} +{ +sub.f16x2 r10560, r9472, r9488; +} +{ +mul.f16x2 r10563, r10560, r10295; +} +{ +add.f16x2 r10566, r10557, r10563; +} +{ +add.f16x2 %25, r10551, r10566; +} +{ +add.f16x2 r10572, r9462, r9510; +} +{ +mul.f16x2 r10575, r10572, r10290; +} +{ +add.f16x2 r10578, r8011, r10575; +} 
+{ +add.f16x2 r10581, r9478, r9494; +} +{ +mul.f16x2 r10584, r10581, r10294; +} +{ +add.f16x2 r10587, r10578, r10584; +} +{ +sub.f16x2 r10590, r9456, r9504; +} +{ +mul.f16x2 r10593, r10590, r10292; +} +{ +sub.f16x2 r10596, r9472, r9488; +} +{ +mul.f16x2 r10599, r10596, r10295; +} +{ +add.f16x2 r10602, r10593, r10599; +} +{ +sub.f16x2 %35, r10587, r10602; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10608, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10609, {low, high}; +} +{ +neg.f16x2 r10610, r10609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10613, {low, high}; +} +{ +neg.f16x2 r10614, r10613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10616, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10617, {low, high}; +} +{ +add.f16x2 r10618, r9520, r9568; +} +{ +add.f16x2 r10621, r7903, r10618; +} +{ +add.f16x2 r10624, r9536, r9552; +} +{ +add.f16x2 %6, r10621, r10624; +} +{ +add.f16x2 r10630, r9526, r9574; +} +{ +add.f16x2 r10633, r8047, r10630; +} +{ +add.f16x2 r10636, r9542, r9558; +} +{ +add.f16x2 %7, r10633, r10636; +} +{ +add.f16x2 r10642, r9520, r9568; +} +{ +mul.f16x2 r10645, r10642, r10608; +} +{ +add.f16x2 r10648, r7903, r10645; +} +{ +add.f16x2 r10651, r9536, r9552; +} +{ +mul.f16x2 r10654, r10651, r10612; +} +{ +add.f16x2 r10657, r10648, r10654; +} +{ +sub.f16x2 r10660, r9526, r9574; +} +{ +mul.f16x2 r10663, r10660, r10610; +} +{ +sub.f16x2 r10666, r9542, r9558; +} +{ +mul.f16x2 r10669, r10666, r10614; +} +{ +add.f16x2 r10672, r10663, r10669; +} +{ +sub.f16x2 %16, r10657, r10672; +} +{ +add.f16x2 r10678, r9520, r9568; +} +{ +mul.f16x2 r10681, r10678, r10608; +} +{ +add.f16x2 r10684, 
r7903, r10681; +} +{ +add.f16x2 r10687, r9536, r9552; +} +{ +mul.f16x2 r10690, r10687, r10612; +} +{ +add.f16x2 r10693, r10684, r10690; +} +{ +sub.f16x2 r10696, r9526, r9574; +} +{ +mul.f16x2 r10699, r10696, r10610; +} +{ +sub.f16x2 r10702, r9542, r9558; +} +{ +mul.f16x2 r10705, r10702, r10614; +} +{ +add.f16x2 r10708, r10699, r10705; +} +{ +add.f16x2 %46, r10693, r10708; +} +{ +add.f16x2 r10714, r9520, r9568; +} +{ +mul.f16x2 r10717, r10714, r10612; +} +{ +add.f16x2 r10720, r7903, r10717; +} +{ +add.f16x2 r10723, r9536, r9552; +} +{ +mul.f16x2 r10726, r10723, r10616; +} +{ +add.f16x2 r10729, r10720, r10726; +} +{ +sub.f16x2 r10732, r9526, r9574; +} +{ +mul.f16x2 r10735, r10732, r10614; +} +{ +sub.f16x2 r10738, r9542, r9558; +} +{ +mul.f16x2 r10741, r10738, r10617; +} +{ +add.f16x2 r10744, r10735, r10741; +} +{ +sub.f16x2 %26, r10729, r10744; +} +{ +add.f16x2 r10750, r9520, r9568; +} +{ +mul.f16x2 r10753, r10750, r10612; +} +{ +add.f16x2 r10756, r7903, r10753; +} +{ +add.f16x2 r10759, r9536, r9552; +} +{ +mul.f16x2 r10762, r10759, r10616; +} +{ +add.f16x2 r10765, r10756, r10762; +} +{ +sub.f16x2 r10768, r9526, r9574; +} +{ +mul.f16x2 r10771, r10768, r10614; +} +{ +sub.f16x2 r10774, r9542, r9558; +} +{ +mul.f16x2 r10777, r10774, r10617; +} +{ +add.f16x2 r10780, r10771, r10777; +} +{ +add.f16x2 %36, r10765, r10780; +} +{ +add.f16x2 r10786, r9526, r9574; +} +{ +mul.f16x2 r10789, r10786, r10608; +} +{ +add.f16x2 r10792, r8047, r10789; +} +{ +add.f16x2 r10795, r9542, r9558; +} +{ +mul.f16x2 r10798, r10795, r10612; +} +{ +add.f16x2 r10801, r10792, r10798; +} +{ +sub.f16x2 r10804, r9520, r9568; +} +{ +mul.f16x2 r10807, r10804, r10610; +} +{ +sub.f16x2 r10810, r9536, r9552; +} +{ +mul.f16x2 r10813, r10810, r10614; +} +{ +add.f16x2 r10816, r10807, r10813; +} +{ +add.f16x2 %17, r10801, r10816; +} +{ +add.f16x2 r10822, r9526, r9574; +} +{ +mul.f16x2 r10825, r10822, r10608; +} +{ +add.f16x2 r10828, r8047, r10825; +} +{ +add.f16x2 r10831, r9542, r9558; +} +{ +mul.f16x2 r10834, 
r10831, r10612; +} +{ +add.f16x2 r10837, r10828, r10834; +} +{ +sub.f16x2 r10840, r9520, r9568; +} +{ +mul.f16x2 r10843, r10840, r10610; +} +{ +sub.f16x2 r10846, r9536, r9552; +} +{ +mul.f16x2 r10849, r10846, r10614; +} +{ +add.f16x2 r10852, r10843, r10849; +} +{ +sub.f16x2 %47, r10837, r10852; +} +{ +add.f16x2 r10858, r9526, r9574; +} +{ +mul.f16x2 r10861, r10858, r10612; +} +{ +add.f16x2 r10864, r8047, r10861; +} +{ +add.f16x2 r10867, r9542, r9558; +} +{ +mul.f16x2 r10870, r10867, r10616; +} +{ +add.f16x2 r10873, r10864, r10870; +} +{ +sub.f16x2 r10876, r9520, r9568; +} +{ +mul.f16x2 r10879, r10876, r10614; +} +{ +sub.f16x2 r10882, r9536, r9552; +} +{ +mul.f16x2 r10885, r10882, r10617; +} +{ +add.f16x2 r10888, r10879, r10885; +} +{ +add.f16x2 %27, r10873, r10888; +} +{ +add.f16x2 r10894, r9526, r9574; +} +{ +mul.f16x2 r10897, r10894, r10612; +} +{ +add.f16x2 r10900, r8047, r10897; +} +{ +add.f16x2 r10903, r9542, r9558; +} +{ +mul.f16x2 r10906, r10903, r10616; +} +{ +add.f16x2 r10909, r10900, r10906; +} +{ +sub.f16x2 r10912, r9520, r9568; +} +{ +mul.f16x2 r10915, r10912, r10614; +} +{ +sub.f16x2 r10918, r9536, r9552; +} +{ +mul.f16x2 r10921, r10918, r10617; +} +{ +add.f16x2 r10924, r10915, r10921; +} +{ +sub.f16x2 %37, r10909, r10924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10931, {low, high}; +} +{ +neg.f16x2 r10932, r10931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f210; +cvt.rn.f16.f32 high, f210; +mov.b32 r10934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r10935, {low, high}; +} +{ +neg.f16x2 r10936, r10935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r10938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r10939, {low, high}; 
+} +{ +add.f16x2 r10940, r9584, r9632; +} +{ +add.f16x2 r10943, r7831, r10940; +} +{ +add.f16x2 r10946, r9600, r9616; +} +{ +add.f16x2 %8, r10943, r10946; +} +{ +add.f16x2 r10952, r9590, r9638; +} +{ +add.f16x2 r10955, r7975, r10952; +} +{ +add.f16x2 r10958, r9606, r9622; +} +{ +add.f16x2 %9, r10955, r10958; +} +{ +add.f16x2 r10964, r9584, r9632; +} +{ +mul.f16x2 r10967, r10964, r10930; +} +{ +add.f16x2 r10970, r7831, r10967; +} +{ +add.f16x2 r10973, r9600, r9616; +} +{ +mul.f16x2 r10976, r10973, r10934; +} +{ +add.f16x2 r10979, r10970, r10976; +} +{ +sub.f16x2 r10982, r9590, r9638; +} +{ +mul.f16x2 r10985, r10982, r10932; +} +{ +sub.f16x2 r10988, r9606, r9622; +} +{ +mul.f16x2 r10991, r10988, r10936; +} +{ +add.f16x2 r10994, r10985, r10991; +} +{ +sub.f16x2 %18, r10979, r10994; +} +{ +add.f16x2 r11000, r9584, r9632; +} +{ +mul.f16x2 r11003, r11000, r10930; +} +{ +add.f16x2 r11006, r7831, r11003; +} +{ +add.f16x2 r11009, r9600, r9616; +} +{ +mul.f16x2 r11012, r11009, r10934; +} +{ +add.f16x2 r11015, r11006, r11012; +} +{ +sub.f16x2 r11018, r9590, r9638; +} +{ +mul.f16x2 r11021, r11018, r10932; +} +{ +sub.f16x2 r11024, r9606, r9622; +} +{ +mul.f16x2 r11027, r11024, r10936; +} +{ +add.f16x2 r11030, r11021, r11027; +} +{ +add.f16x2 %48, r11015, r11030; +} +{ +add.f16x2 r11036, r9584, r9632; +} +{ +mul.f16x2 r11039, r11036, r10934; +} +{ +add.f16x2 r11042, r7831, r11039; +} +{ +add.f16x2 r11045, r9600, r9616; +} +{ +mul.f16x2 r11048, r11045, r10938; +} +{ +add.f16x2 r11051, r11042, r11048; +} +{ +sub.f16x2 r11054, r9590, r9638; +} +{ +mul.f16x2 r11057, r11054, r10936; +} +{ +sub.f16x2 r11060, r9606, r9622; +} +{ +mul.f16x2 r11063, r11060, r10939; +} +{ +add.f16x2 r11066, r11057, r11063; +} +{ +sub.f16x2 %28, r11051, r11066; +} +{ +add.f16x2 r11072, r9584, r9632; +} +{ +mul.f16x2 r11075, r11072, r10934; +} +{ +add.f16x2 r11078, r7831, r11075; +} +{ +add.f16x2 r11081, r9600, r9616; +} +{ +mul.f16x2 r11084, r11081, r10938; +} +{ +add.f16x2 r11087, r11078, r11084; +} +{ 
+sub.f16x2 r11090, r9590, r9638; +} +{ +mul.f16x2 r11093, r11090, r10936; +} +{ +sub.f16x2 r11096, r9606, r9622; +} +{ +mul.f16x2 r11099, r11096, r10939; +} +{ +add.f16x2 r11102, r11093, r11099; +} +{ +add.f16x2 %38, r11087, r11102; +} +{ +add.f16x2 r11108, r9590, r9638; +} +{ +mul.f16x2 r11111, r11108, r10930; +} +{ +add.f16x2 r11114, r7975, r11111; +} +{ +add.f16x2 r11117, r9606, r9622; +} +{ +mul.f16x2 r11120, r11117, r10934; +} +{ +add.f16x2 r11123, r11114, r11120; +} +{ +sub.f16x2 r11126, r9584, r9632; +} +{ +mul.f16x2 r11129, r11126, r10932; +} +{ +sub.f16x2 r11132, r9600, r9616; +} +{ +mul.f16x2 r11135, r11132, r10936; +} +{ +add.f16x2 r11138, r11129, r11135; +} +{ +add.f16x2 %19, r11123, r11138; +} +{ +add.f16x2 r11144, r9590, r9638; +} +{ +mul.f16x2 r11147, r11144, r10930; +} +{ +add.f16x2 r11150, r7975, r11147; +} +{ +add.f16x2 r11153, r9606, r9622; +} +{ +mul.f16x2 r11156, r11153, r10934; +} +{ +add.f16x2 r11159, r11150, r11156; +} +{ +sub.f16x2 r11162, r9584, r9632; +} +{ +mul.f16x2 r11165, r11162, r10932; +} +{ +sub.f16x2 r11168, r9600, r9616; +} +{ +mul.f16x2 r11171, r11168, r10936; +} +{ +add.f16x2 r11174, r11165, r11171; +} +{ +sub.f16x2 %49, r11159, r11174; +} +{ +add.f16x2 r11180, r9590, r9638; +} +{ +mul.f16x2 r11183, r11180, r10934; +} +{ +add.f16x2 r11186, r7975, r11183; +} +{ +add.f16x2 r11189, r9606, r9622; +} +{ +mul.f16x2 r11192, r11189, r10938; +} +{ +add.f16x2 r11195, r11186, r11192; +} +{ +sub.f16x2 r11198, r9584, r9632; +} +{ +mul.f16x2 r11201, r11198, r10936; +} +{ +sub.f16x2 r11204, r9600, r9616; +} +{ +mul.f16x2 r11207, r11204, r10939; +} +{ +add.f16x2 r11210, r11201, r11207; +} +{ +add.f16x2 %29, r11195, r11210; +} +{ +add.f16x2 r11216, r9590, r9638; +} +{ +mul.f16x2 r11219, r11216, r10934; +} +{ +add.f16x2 r11222, r7975, r11219; +} +{ +add.f16x2 r11225, r9606, r9622; +} +{ +mul.f16x2 r11228, r11225, r10938; +} +{ +add.f16x2 r11231, r11222, r11228; +} +{ +sub.f16x2 r11234, r9584, r9632; +} +{ +mul.f16x2 r11237, r11234, r10936; +} +{ 
+sub.f16x2 r11240, r9600, r9616; +} +{ +mul.f16x2 r11243, r11240, r10939; +} +{ +add.f16x2 r11246, r11237, r11243; +} +{ +sub.f16x2 %39, r11231, r11246; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), 
"r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..58476e2ec1d8c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_fwd.hpp.inc @@ -0,0 +1,4458 @@ +#ifndef CUFFTDX_FFT_15625_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_15625_FP32_FWD_PTX_HPP + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<1155, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2811>; +.reg .b32 r<21>; +.reg .b64 rd<15>; +mov.u32 r19, %tid.y; +mov.u32 r20, %50; +mad.lo.s32 r3, r19, 125000, r20; +add.f32 f101, %63, %93; +add.f32 f103, %73, %83; +add.f32 f2810, %53, f101; +add.f32 f104, f103, f2810; +add.f32 f105, %103, %105; +add.f32 f107, %104, %84; +add.f32 f2806, %54, f105; +add.f32 f108, f107, f2806; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f2805, f101, 0f3E9E377A, %53; +sub.f32 f111, f2805, f110; +sub.f32 f112, %103, %105; +sub.f32 f114, %104, %84; +mul.f32 f2803, f112, 0f3F737871; +mul.f32 f2804, f114, 0fBF167918; +sub.f32 f116, f2804, f2803; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %53, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f2801, f105, 0f3E9E377A, %54; +mul.f32 f2802, f107, 0f3F4F1BBD; +sub.f32 f129, f2801, f2802; +sub.f32 f130, %63, %93; +sub.f32 f132, %73, %83; +mul.f32 f2799, f130, 0f3F737871; +mul.f32 f2800, f132, 0fBF167918; +sub.f32 f134, f2800, f2799; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %54, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %65, %95; +add.f32 f147, %75, %85; +add.f32 f2798, %55, f145; +add.f32 f148, f147, f2798; +add.f32 f149, %66, %96; +add.f32 f151, %106, %107; +add.f32 f2794, %108, f149; +add.f32 f152, f151, f2794; +fma.rn.f32 f2792, f145, 0f3E9E377A, %55; +mul.f32 f2793, f147, 0f3F4F1BBD; +sub.f32 f155, f2792, f2793; +sub.f32 f156, %66, %96; +sub.f32 f158, %106, %107; +mul.f32 f2790, 
f156, 0f3F737871; +mul.f32 f2791, f158, 0fBF167918; +sub.f32 f160, f2791, f2790; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %55, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +mul.f32 f172, f151, 0f3F4F1BBD; +fma.rn.f32 f2789, f149, 0f3E9E377A, %108; +sub.f32 f173, f2789, f172; +sub.f32 f174, %65, %95; +sub.f32 f176, %75, %85; +mul.f32 f177, f176, 0fBF167918; +mul.f32 f2788, f174, 0f3F737871; +sub.f32 f178, f177, f2788; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %108, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %67, %97; +add.f32 f191, %77, %87; +add.f32 f2787, %57, f189; +add.f32 f192, f191, f2787; +add.f32 f193, %112, %111; +add.f32 f195, %78, %109; +add.f32 f2782, %110, f193; +add.f32 f196, f195, f2782; +mul.f32 f198, f191, 0f3F4F1BBD; +fma.rn.f32 f2781, f189, 0f3E9E377A, %57; +sub.f32 f199, f2781, f198; +sub.f32 f200, %112, %111; +sub.f32 f202, %78, %109; +mul.f32 f203, f202, 0fBF167918; +mul.f32 f2780, f200, 0f3F737871; +sub.f32 f204, f203, f2780; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %57, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f2778, f193, 0f3E9E377A, %110; +mul.f32 f2779, f195, 0f3F4F1BBD; +sub.f32 f217, f2778, f2779; +sub.f32 f218, %67, %97; +sub.f32 f220, %77, %87; +mul.f32 f2776, f218, 0f3F737871; +mul.f32 f2777, f220, 0fBF167918; +sub.f32 f222, f2777, f2776; +add.f32 f223, f222, f217; +sub.f32 f224, 
f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %110, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %69, %99; +add.f32 f235, %79, %89; +add.f32 f2775, %59, f233; +add.f32 f236, f235, f2775; +add.f32 f237, %114, %113; +add.f32 f239, %115, %90; +add.f32 f2771, %60, f237; +add.f32 f240, f239, f2771; +fma.rn.f32 f2769, f233, 0f3E9E377A, %59; +mul.f32 f2770, f235, 0f3F4F1BBD; +sub.f32 f243, f2769, f2770; +sub.f32 f244, %114, %113; +sub.f32 f246, %115, %90; +mul.f32 f2767, f244, 0f3F737871; +mul.f32 f2768, f246, 0fBF167918; +sub.f32 f248, f2768, f2767; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %59, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +mul.f32 f260, f239, 0f3F4F1BBD; +fma.rn.f32 f2766, f237, 0f3E9E377A, %60; +sub.f32 f261, f2766, f260; +sub.f32 f262, %69, %99; +sub.f32 f264, %79, %89; +mul.f32 f2764, f262, 0f3F737871; +mul.f32 f2765, f264, 0fBF167918; +sub.f32 f266, f2765, f2764; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %60, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %71, %101; +add.f32 f279, %81, %91; +add.f32 f2763, %61, f277; +add.f32 f280, f279, f2763; +add.f32 f281, %72, %102; +add.f32 f283, %118, %116; +add.f32 f2759, %117, f281; +add.f32 f284, f283, f2759; +mul.f32 f286, f279, 0f3F4F1BBD; +fma.rn.f32 f2758, f277, 0f3E9E377A, %61; +sub.f32 f287, f2758, f286; +sub.f32 f288, %72, %102; +sub.f32 f290, %118, %116; +mul.f32 f2756, f288, 0f3F737871; +mul.f32 
f2757, f290, 0fBF167918; +sub.f32 f292, f2757, f2756; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %61, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +mul.f32 f304, f283, 0f3F4F1BBD; +fma.rn.f32 f2755, f281, 0f3E9E377A, %117; +sub.f32 f305, f2755, f304; +sub.f32 f306, %71, %101; +sub.f32 f308, %81, %91; +mul.f32 f2753, f306, 0f3F737871; +mul.f32 f2754, f308, 0fBF167918; +sub.f32 f310, f2754, f2753; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %117, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mul.f32 f322, f179, 0fBE7EA890; +mul.f32 f2752, f161, 0f3F77F511; +sub.f32 f323, f2752, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f327, f223, 0fBEF6A86B; +mul.f32 f2751, f205, 0f3F6055A2; +sub.f32 f328, f2751, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f332, f267, 0fBF2F3E7B; +mul.f32 f2750, f249, 0f3F3A9DB0; +sub.f32 f333, f2750, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f337, f311, 0fBF5825E0; +mul.f32 f2749, f293, 0f3F092BF2; +sub.f32 f338, f2749, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f342, f187, 0fBEF6A86B; +mul.f32 f2748, f169, 0f3F6055A2; +sub.f32 f343, f2748, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f2746, f213, 0f3F092BF2; +mul.f32 f2747, f231, 0fBF5825E0; +sub.f32 f348, f2746, f2747; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f2744, f257, 0f3D809851; +mul.f32 f2745, f275, 
0fBF7F7EAE; +sub.f32 f353, f2744, f2745; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f2742, f301, 0fBED9FFBE; +mul.f32 f2743, f319, 0fBF67A2BF; +sub.f32 f358, f2742, f2743; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f2740, f170, 0f3F3A9DB0; +mul.f32 f2741, f188, 0fBF2F3E7B; +sub.f32 f363, f2740, f2741; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f367, f232, 0fBF7F7EAE; +mul.f32 f2739, f214, 0f3D809851; +sub.f32 f368, f2739, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f372, f276, 0fBF45405B; +mul.f32 f2738, f258, 0fBF232E38; +sub.f32 f373, f2738, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f377, f320, 0fBE00575B; +mul.f32 f2737, f302, 0fBF7DFB3B; +sub.f32 f378, f2737, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f382, f180, 0fBF5825E0; +mul.f32 f2736, f162, 0f3F092BF2; +sub.f32 f383, f2736, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f387, f224, 0fBF67A2BF; +mul.f32 f2735, f206, 0fBED9FFBE; +sub.f32 f388, f2735, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f2733, f250, 0fBF7DFB3B; +mul.f32 f2734, f268, 0fBE00575B; +sub.f32 f393, f2733, f2734; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f2731, f294, 0fBF232E38; +mul.f32 f2732, f312, 0f3F45405B; +sub.f32 f398, f2731, f2732; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f403, f192, f236; +mul.f32 f408, f403, 0f3F4F1BBD; +fma.rn.f32 f2730, f401, 0f3E9E377A, f104; +sub.f32 f409, f2730, f408; +add.f32 f2729, f152, f284; +sub.f32 f410, f152, f284; +add.f32 f2728, f196, f240; +sub.f32 f412, f196, f240; +mul.f32 f413, f412, 0fBF167918; +mul.f32 f2727, f410, 0f3F737871; +sub.f32 
f414, f413, f2727; +sub.f32 f415, f409, f414; +add.f32 f416, f414, f409; +add.f32 f2726, f104, f401; +mul.f32 f417, f401, 0f3F4F1BBD; +sub.f32 f418, f104, f417; +fma.rn.f32 f419, f403, 0f3E9E377A, f418; +mul.f32 f420, f410, 0f3F167918; +mul.f32 f421, f412, 0f3F737871; +sub.f32 f422, f421, f420; +sub.f32 f423, f419, f422; +add.f32 f424, f422, f419; +fma.rn.f32 f2724, f2729, 0f3E9E377A, f108; +mul.f32 f2725, f2728, 0f3F4F1BBD; +sub.f32 f427, f2724, f2725; +sub.f32 f428, f148, f280; +sub.f32 f430, f192, f236; +mul.f32 f2722, f428, 0f3F737871; +mul.f32 f2723, f430, 0fBF167918; +sub.f32 f432, f2723, f2722; +add.f32 f433, f432, f427; +sub.f32 f434, f427, f432; +add.f32 f2721, f108, f2729; +mul.f32 f435, f2729, 0f3F4F1BBD; +sub.f32 f436, f108, f435; +fma.rn.f32 f437, f2728, 0f3E9E377A, f436; +mul.f32 f438, f428, 0f3F167918; +mul.f32 f439, f430, 0f3F737871; +sub.f32 f440, f439, f438; +add.f32 f441, f440, f437; +sub.f32 f442, f437, f440; +add.f32 f443, f323, f338; +add.f32 f445, f328, f333; +add.f32 f2720, f117, f443; +add.f32 f446, f445, f2720; +add.f32 f447, f325, f340; +add.f32 f449, f330, f335; +add.f32 f2719, f135, f447; +add.f32 f450, f449, f2719; +fma.rn.f32 f2717, f443, 0f3E9E377A, f117; +mul.f32 f2718, f445, 0f3F4F1BBD; +sub.f32 f453, f2717, f2718; +sub.f32 f454, f325, f340; +sub.f32 f456, f330, f335; +mul.f32 f2715, f454, 0f3F737871; +mul.f32 f2716, f456, 0fBF167918; +sub.f32 f458, f2716, f2715; +sub.f32 f459, f453, f458; +add.f32 f460, f458, f453; +mul.f32 f461, f443, 0f3F4F1BBD; +sub.f32 f462, f117, f461; +fma.rn.f32 f463, f445, 0f3E9E377A, f462; +mul.f32 f464, f454, 0f3F167918; +mul.f32 f465, f456, 0f3F737871; +sub.f32 f466, f465, f464; +sub.f32 f467, f463, f466; +add.f32 f468, f466, f463; +mul.f32 f470, f449, 0f3F4F1BBD; +fma.rn.f32 f2714, f447, 0f3E9E377A, f135; +sub.f32 f471, f2714, f470; +sub.f32 f472, f323, f338; +sub.f32 f474, f328, f333; +mul.f32 f2712, f472, 0f3F737871; +mul.f32 f2713, f474, 0fBF167918; +sub.f32 f476, f2713, f2712; +add.f32 f477, f476, 
f471; +sub.f32 f478, f471, f476; +mul.f32 f479, f447, 0f3F4F1BBD; +sub.f32 f480, f135, f479; +fma.rn.f32 f481, f449, 0f3E9E377A, f480; +mul.f32 f482, f472, 0f3F167918; +mul.f32 f483, f474, 0f3F737871; +sub.f32 f484, f483, f482; +add.f32 f485, f484, f481; +sub.f32 f486, f481, f484; +add.f32 f487, f343, f358; +add.f32 f489, f348, f353; +add.f32 f2711, f125, f487; +add.f32 f490, f489, f2711; +add.f32 f491, f345, f360; +add.f32 f493, f350, f355; +add.f32 f2710, f143, f491; +add.f32 f494, f493, f2710; +mul.f32 f496, f489, 0f3F4F1BBD; +fma.rn.f32 f2709, f487, 0f3E9E377A, f125; +sub.f32 f497, f2709, f496; +sub.f32 f498, f345, f360; +sub.f32 f500, f350, f355; +mul.f32 f2707, f498, 0f3F737871; +mul.f32 f2708, f500, 0fBF167918; +sub.f32 f502, f2708, f2707; +sub.f32 f503, f497, f502; +add.f32 f504, f502, f497; +mul.f32 f505, f487, 0f3F4F1BBD; +sub.f32 f506, f125, f505; +fma.rn.f32 f507, f489, 0f3E9E377A, f506; +mul.f32 f508, f498, 0f3F167918; +mul.f32 f509, f500, 0f3F737871; +sub.f32 f510, f509, f508; +sub.f32 f511, f507, f510; +add.f32 f512, f510, f507; +mul.f32 f514, f493, 0f3F4F1BBD; +fma.rn.f32 f2706, f491, 0f3E9E377A, f143; +sub.f32 f515, f2706, f514; +sub.f32 f516, f343, f358; +sub.f32 f518, f348, f353; +mul.f32 f2704, f516, 0f3F737871; +mul.f32 f2705, f518, 0fBF167918; +sub.f32 f520, f2705, f2704; +add.f32 f521, f520, f515; +sub.f32 f522, f515, f520; +mul.f32 f523, f491, 0f3F4F1BBD; +sub.f32 f524, f143, f523; +fma.rn.f32 f525, f493, 0f3E9E377A, f524; +mul.f32 f526, f516, 0f3F167918; +mul.f32 f527, f518, 0f3F737871; +sub.f32 f528, f527, f526; +add.f32 f529, f528, f525; +sub.f32 f530, f525, f528; +add.f32 f531, f363, f378; +add.f32 f533, f368, f373; +add.f32 f2703, f126, f531; +add.f32 f534, f533, f2703; +add.f32 f535, f365, f380; +add.f32 f537, f370, f375; +add.f32 f2702, f144, f535; +add.f32 f538, f537, f2702; +mul.f32 f540, f533, 0f3F4F1BBD; +fma.rn.f32 f2701, f531, 0f3E9E377A, f126; +sub.f32 f541, f2701, f540; +sub.f32 f542, f365, f380; +sub.f32 f544, f370, f375; 
+mul.f32 f2699, f542, 0f3F737871; +mul.f32 f2700, f544, 0fBF167918; +sub.f32 f546, f2700, f2699; +sub.f32 f547, f541, f546; +add.f32 f548, f546, f541; +mul.f32 f549, f531, 0f3F4F1BBD; +sub.f32 f550, f126, f549; +fma.rn.f32 f551, f533, 0f3E9E377A, f550; +mul.f32 f552, f542, 0f3F167918; +mul.f32 f553, f544, 0f3F737871; +sub.f32 f554, f553, f552; +sub.f32 f555, f551, f554; +add.f32 f556, f554, f551; +fma.rn.f32 f2697, f535, 0f3E9E377A, f144; +mul.f32 f2698, f537, 0f3F4F1BBD; +sub.f32 f559, f2697, f2698; +sub.f32 f560, f363, f378; +sub.f32 f562, f368, f373; +mul.f32 f2695, f560, 0f3F737871; +mul.f32 f2696, f562, 0fBF167918; +sub.f32 f564, f2696, f2695; +add.f32 f565, f564, f559; +sub.f32 f566, f559, f564; +mul.f32 f567, f535, 0f3F4F1BBD; +sub.f32 f568, f144, f567; +fma.rn.f32 f569, f537, 0f3E9E377A, f568; +mul.f32 f570, f560, 0f3F167918; +mul.f32 f571, f562, 0f3F737871; +sub.f32 f572, f571, f570; +add.f32 f573, f572, f569; +sub.f32 f574, f569, f572; +add.f32 f575, f383, f398; +add.f32 f577, f388, f393; +add.f32 f2694, f118, f575; +add.f32 f578, f577, f2694; +add.f32 f579, f385, f400; +add.f32 f581, f390, f395; +add.f32 f2693, f136, f579; +add.f32 f582, f581, f2693; +fma.rn.f32 f2691, f575, 0f3E9E377A, f118; +mul.f32 f2692, f577, 0f3F4F1BBD; +sub.f32 f585, f2691, f2692; +sub.f32 f586, f385, f400; +sub.f32 f588, f390, f395; +mul.f32 f2689, f586, 0f3F737871; +mul.f32 f2690, f588, 0fBF167918; +sub.f32 f590, f2690, f2689; +sub.f32 f591, f585, f590; +add.f32 f592, f590, f585; +mul.f32 f593, f575, 0f3F4F1BBD; +sub.f32 f594, f118, f593; +fma.rn.f32 f595, f577, 0f3E9E377A, f594; +mul.f32 f596, f586, 0f3F167918; +mul.f32 f597, f588, 0f3F737871; +sub.f32 f598, f597, f596; +sub.f32 f599, f595, f598; +add.f32 f600, f598, f595; +mul.f32 f602, f581, 0f3F4F1BBD; +fma.rn.f32 f2688, f579, 0f3E9E377A, f136; +sub.f32 f603, f2688, f602; +sub.f32 f604, f383, f398; +sub.f32 f606, f388, f393; +mul.f32 f2686, f604, 0f3F737871; +mul.f32 f2687, f606, 0fBF167918; +sub.f32 f608, f2687, f2686; 
+add.f32 f609, f608, f603; +sub.f32 f610, f603, f608; +mul.f32 f611, f579, 0f3F4F1BBD; +sub.f32 f612, f136, f611; +fma.rn.f32 f613, f581, 0f3E9E377A, f612; +mul.f32 f614, f604, 0f3F167918; +mul.f32 f615, f606, 0f3F737871; +sub.f32 f616, f615, f614; +add.f32 f617, f616, f613; +sub.f32 f618, f613, f616; +mov.u32 r18, %tid.x; +mul.wide.u32 rd2, r18, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r18, r6; +mad.lo.s32 r8, r5, 125000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f619, f620}, [rd6]; +mul.f32 f624, f620, f450; +mul.f32 f625, f619, f450; +mul.f32 f627, f620, f620; +mul.f32 f2685, f619, f619; +sub.f32 f628, f2685, f627; +mul.f32 f629, f620, f619; +fma.rn.f32 f630, f620, f619, f629; +mul.f32 f632, f630, f494; +mul.f32 f633, f628, f494; +mul.f32 f2683, f619, f628; +mul.f32 f2684, f620, f630; +sub.f32 f636, f2683, f2684; +mul.f32 f2682, f628, f490; +mul.f32 f637, f619, f630; +fma.rn.f32 f638, f620, f628, f637; +mul.f32 f640, f638, f538; +mul.f32 f641, f636, f538; +mul.f32 f643, f620, f638; +mul.f32 f2681, f619, f636; +sub.f32 f644, f2681, f643; +mul.f32 f2680, f636, f534; +mul.f32 f645, f619, f638; +fma.rn.f32 f646, f620, f636, f645; +mul.f32 f648, f646, f582; +mul.f32 f649, f644, f582; +mul.f32 f2678, f619, f644; +mul.f32 f2679, f620, f646; +sub.f32 f652, f2678, f2679; +mul.f32 f2677, f644, f578; +mul.f32 f653, f619, f646; +fma.rn.f32 f654, f620, f644, f653; +mul.f32 f656, f654, f433; +mul.f32 f657, f652, f433; +mul.f32 f659, f620, f654; +mul.f32 f2676, f619, f652; +sub.f32 f660, f2676, f659; +mul.f32 f2675, f652, f415; +mul.f32 f661, f619, f654; +fma.rn.f32 f662, f620, f652, f661; +mul.f32 f664, f662, f477; +mul.f32 f665, f660, f477; +mul.f32 f667, f620, f662; +mul.f32 f2674, f619, f660; +sub.f32 f668, f2674, f667; +mul.f32 f2673, f660, f459; +mul.f32 f669, f619, f662; +fma.rn.f32 f670, f620, f660, f669; +mul.f32 f672, f670, f521; +mul.f32 f673, f668, f521; 
+mul.f32 f2671, f619, f668; +mul.f32 f2672, f620, f670; +sub.f32 f676, f2671, f2672; +mul.f32 f2670, f668, f503; +mul.f32 f677, f619, f670; +fma.rn.f32 f678, f620, f668, f677; +mul.f32 f680, f678, f565; +mul.f32 f681, f676, f565; +mul.f32 f683, f620, f678; +mul.f32 f2669, f619, f676; +sub.f32 f684, f2669, f683; +mul.f32 f2668, f676, f547; +mul.f32 f685, f619, f678; +fma.rn.f32 f686, f620, f676, f685; +mul.f32 f688, f686, f609; +mul.f32 f689, f684, f609; +mul.f32 f691, f620, f686; +mul.f32 f2667, f619, f684; +sub.f32 f692, f2667, f691; +mul.f32 f2666, f684, f591; +mul.f32 f693, f619, f686; +fma.rn.f32 f694, f620, f684, f693; +mul.f32 f696, f694, f441; +mul.f32 f697, f692, f441; +mul.f32 f2664, f619, f692; +mul.f32 f2665, f620, f694; +sub.f32 f700, f2664, f2665; +mul.f32 f2663, f692, f423; +mul.f32 f701, f619, f694; +fma.rn.f32 f702, f620, f692, f701; +mul.f32 f704, f702, f485; +mul.f32 f705, f700, f485; +mul.f32 f707, f620, f702; +mul.f32 f2662, f619, f700; +sub.f32 f708, f2662, f707; +mul.f32 f2661, f700, f467; +mul.f32 f709, f619, f702; +fma.rn.f32 f710, f620, f700, f709; +mul.f32 f712, f710, f529; +mul.f32 f713, f708, f529; +mul.f32 f2659, f619, f708; +mul.f32 f2660, f620, f710; +sub.f32 f716, f2659, f2660; +mul.f32 f2658, f708, f511; +mul.f32 f717, f619, f710; +fma.rn.f32 f718, f620, f708, f717; +mul.f32 f720, f718, f573; +mul.f32 f721, f716, f573; +mul.f32 f723, f620, f718; +mul.f32 f2657, f619, f716; +sub.f32 f724, f2657, f723; +mul.f32 f2656, f716, f555; +mul.f32 f725, f619, f718; +fma.rn.f32 f726, f620, f716, f725; +mul.f32 f728, f726, f617; +mul.f32 f729, f724, f617; +mul.f32 f731, f620, f726; +mul.f32 f2655, f619, f724; +sub.f32 f732, f2655, f731; +mul.f32 f2654, f724, f599; +mul.f32 f733, f619, f726; +fma.rn.f32 f734, f620, f724, f733; +mul.f32 f736, f734, f442; +mul.f32 f737, f732, f442; +mul.f32 f2652, f619, f732; +mul.f32 f2653, f620, f734; +sub.f32 f740, f2652, f2653; +mul.f32 f2651, f732, f424; +mul.f32 f741, f619, f734; +fma.rn.f32 f742, f620, f732, 
f741; +mul.f32 f744, f742, f486; +mul.f32 f745, f740, f486; +mul.f32 f747, f620, f742; +mul.f32 f2650, f619, f740; +sub.f32 f748, f2650, f747; +mul.f32 f2649, f740, f468; +mul.f32 f749, f619, f742; +fma.rn.f32 f750, f620, f740, f749; +mul.f32 f752, f750, f530; +mul.f32 f753, f748, f530; +mul.f32 f755, f620, f750; +mul.f32 f2648, f619, f748; +sub.f32 f756, f2648, f755; +mul.f32 f2647, f748, f512; +mul.f32 f757, f619, f750; +fma.rn.f32 f758, f620, f748, f757; +mul.f32 f760, f758, f574; +mul.f32 f761, f756, f574; +mul.f32 f2645, f619, f756; +mul.f32 f2646, f620, f758; +sub.f32 f764, f2645, f2646; +mul.f32 f2644, f756, f556; +mul.f32 f765, f619, f758; +fma.rn.f32 f766, f620, f756, f765; +mul.f32 f768, f766, f618; +mul.f32 f769, f764, f618; +mul.f32 f771, f620, f766; +mul.f32 f2643, f619, f764; +sub.f32 f772, f2643, f771; +mul.f32 f2642, f764, f600; +mul.f32 f773, f619, f766; +fma.rn.f32 f774, f620, f764, f773; +mul.f32 f776, f774, f434; +mul.f32 f777, f772, f434; +mul.f32 f2640, f619, f772; +mul.f32 f2641, f620, f774; +sub.f32 f780, f2640, f2641; +mul.f32 f2639, f772, f416; +mul.f32 f781, f619, f774; +fma.rn.f32 f782, f620, f772, f781; +mul.f32 f784, f782, f478; +mul.f32 f785, f780, f478; +mul.f32 f787, f620, f782; +mul.f32 f2638, f619, f780; +sub.f32 f788, f2638, f787; +mul.f32 f2637, f780, f460; +mul.f32 f789, f619, f782; +fma.rn.f32 f790, f620, f780, f789; +mul.f32 f792, f790, f522; +mul.f32 f793, f788, f522; +mul.f32 f795, f620, f790; +mul.f32 f2636, f619, f788; +sub.f32 f796, f2636, f795; +mul.f32 f2635, f788, f504; +mul.f32 f797, f619, f790; +fma.rn.f32 f798, f620, f788, f797; +mul.f32 f800, f798, f566; +mul.f32 f801, f796, f566; +mul.f32 f2633, f619, f796; +mul.f32 f2634, f620, f798; +sub.f32 f804, f2633, f2634; +mul.f32 f2632, f619, f446; +mul.f32 f805, f619, f798; +mul.f32 f2631, f796, f548; +fma.rn.f32 f806, f620, f796, f805; +mul.f32 f807, f804, f592; +mul.f32 f808, f806, f610; +mul.f32 f809, f804, f610; +barrier.sync 0; +add.f32 f810, f2728, f2721; +add.f32 
f811, f403, f2726; +mad.lo.s32 r17, r7, 200, r8; +st.shared.v2.f32 [r17], {f811, f810}; +fma.rn.f32 f812, f620, f446, f625; +sub.f32 f813, f2632, f624; +st.shared.v2.f32 [r17+8], {f813, f812}; +fma.rn.f32 f814, f630, f490, f633; +sub.f32 f815, f2682, f632; +st.shared.v2.f32 [r17+16], {f815, f814}; +fma.rn.f32 f816, f638, f534, f641; +sub.f32 f817, f2680, f640; +st.shared.v2.f32 [r17+24], {f817, f816}; +fma.rn.f32 f818, f646, f578, f649; +sub.f32 f819, f2677, f648; +st.shared.v2.f32 [r17+32], {f819, f818}; +sub.f32 f820, f2675, f656; +fma.rn.f32 f821, f654, f415, f657; +st.shared.v2.f32 [r17+40], {f820, f821}; +fma.rn.f32 f822, f662, f459, f665; +sub.f32 f823, f2673, f664; +st.shared.v2.f32 [r17+48], {f823, f822}; +sub.f32 f824, f2670, f672; +fma.rn.f32 f825, f670, f503, f673; +st.shared.v2.f32 [r17+56], {f824, f825}; +fma.rn.f32 f826, f678, f547, f681; +sub.f32 f827, f2668, f680; +st.shared.v2.f32 [r17+64], {f827, f826}; +fma.rn.f32 f828, f686, f591, f689; +sub.f32 f829, f2666, f688; +st.shared.v2.f32 [r17+72], {f829, f828}; +fma.rn.f32 f830, f694, f423, f697; +sub.f32 f831, f2663, f696; +st.shared.v2.f32 [r17+80], {f831, f830}; +fma.rn.f32 f832, f702, f467, f705; +sub.f32 f833, f2661, f704; +st.shared.v2.f32 [r17+88], {f833, f832}; +fma.rn.f32 f834, f710, f511, f713; +sub.f32 f835, f2658, f712; +st.shared.v2.f32 [r17+96], {f835, f834}; +fma.rn.f32 f836, f718, f555, f721; +sub.f32 f837, f2656, f720; +st.shared.v2.f32 [r17+104], {f837, f836}; +fma.rn.f32 f838, f726, f599, f729; +sub.f32 f839, f2654, f728; +st.shared.v2.f32 [r17+112], {f839, f838}; +fma.rn.f32 f840, f734, f424, f737; +sub.f32 f841, f2651, f736; +st.shared.v2.f32 [r17+120], {f841, f840}; +fma.rn.f32 f842, f742, f468, f745; +sub.f32 f843, f2649, f744; +st.shared.v2.f32 [r17+128], {f843, f842}; +fma.rn.f32 f844, f750, f512, f753; +sub.f32 f845, f2647, f752; +st.shared.v2.f32 [r17+136], {f845, f844}; +fma.rn.f32 f846, f758, f556, f761; +sub.f32 f847, f2644, f760; +st.shared.v2.f32 [r17+144], {f847, 
f846}; +fma.rn.f32 f848, f766, f600, f769; +sub.f32 f849, f2642, f768; +st.shared.v2.f32 [r17+152], {f849, f848}; +fma.rn.f32 f850, f774, f416, f777; +sub.f32 f851, f2639, f776; +st.shared.v2.f32 [r17+160], {f851, f850}; +fma.rn.f32 f852, f782, f460, f785; +sub.f32 f853, f2637, f784; +st.shared.v2.f32 [r17+168], {f853, f852}; +fma.rn.f32 f854, f790, f504, f793; +sub.f32 f855, f2635, f792; +st.shared.v2.f32 [r17+176], {f855, f854}; +fma.rn.f32 f856, f798, f548, f801; +sub.f32 f857, f2631, f800; +st.shared.v2.f32 [r17+184], {f857, f856}; +fma.rn.f32 f858, f806, f592, f809; +sub.f32 f859, f807, f808; +st.shared.v2.f32 [r17+192], {f859, f858}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r17; +ld.shared.v2.f32 {f860, f861}, [r10]; +ld.shared.v2.f32 {f864, f865}, [r10+5000]; +ld.shared.v2.f32 {f868, f869}, [r10+10000]; +ld.shared.v2.f32 {f872, f873}, [r10+15000]; +ld.shared.v2.f32 {f876, f877}, [r10+20000]; +ld.shared.v2.f32 {f880, f881}, [r10+25000]; +ld.shared.v2.f32 {f884, f885}, [r10+30000]; +ld.shared.v2.f32 {f888, f889}, [r10+35000]; +ld.shared.v2.f32 {f892, f893}, [r10+40000]; +ld.shared.v2.f32 {f896, f897}, [r10+45000]; +ld.shared.v2.f32 {f900, f901}, [r10+50000]; +ld.shared.v2.f32 {f904, f905}, [r10+55000]; +ld.shared.v2.f32 {f908, f909}, [r10+60000]; +ld.shared.v2.f32 {f912, f913}, [r10+65000]; +ld.shared.v2.f32 {f916, f917}, [r10+70000]; +ld.shared.v2.f32 {f920, f921}, [r10+75000]; +ld.shared.v2.f32 {f924, f925}, [r10+80000]; +ld.shared.v2.f32 {f928, f929}, [r10+85000]; +ld.shared.v2.f32 {f932, f933}, [r10+90000]; +ld.shared.v2.f32 {f936, f937}, [r10+95000]; +ld.shared.v2.f32 {f940, f941}, [r10+100000]; +ld.shared.v2.f32 {f944, f945}, [r10+105000]; +ld.shared.v2.f32 {f948, f949}, [r10+110000]; +ld.shared.v2.f32 {f952, f953}, [r10+115000]; +ld.shared.v2.f32 {f956, f957}, [r10+120000]; +add.f32 f960, f880, f940; +add.f32 f962, f900, f920; +add.f32 f2630, f860, f960; +add.f32 f963, f962, f2630; +add.f32 f964, f881, f941; +add.f32 f966, f901, f921; +add.f32 f2629, 
f861, f964; +add.f32 f967, f966, f2629; +fma.rn.f32 f2627, f960, 0f3E9E377A, f860; +mul.f32 f2628, f962, 0f3F4F1BBD; +sub.f32 f970, f2627, f2628; +sub.f32 f971, f881, f941; +sub.f32 f973, f901, f921; +mul.f32 f2625, f971, 0f3F737871; +mul.f32 f2626, f973, 0fBF167918; +sub.f32 f975, f2626, f2625; +sub.f32 f976, f970, f975; +add.f32 f977, f975, f970; +mul.f32 f978, f960, 0f3F4F1BBD; +sub.f32 f979, f860, f978; +fma.rn.f32 f980, f962, 0f3E9E377A, f979; +mul.f32 f981, f971, 0f3F167918; +mul.f32 f982, f973, 0f3F737871; +sub.f32 f983, f982, f981; +sub.f32 f984, f980, f983; +add.f32 f985, f983, f980; +mul.f32 f987, f966, 0f3F4F1BBD; +fma.rn.f32 f2624, f964, 0f3E9E377A, f861; +sub.f32 f988, f2624, f987; +sub.f32 f989, f880, f940; +sub.f32 f991, f900, f920; +mul.f32 f2622, f989, 0f3F737871; +mul.f32 f2623, f991, 0fBF167918; +sub.f32 f993, f2623, f2622; +add.f32 f994, f993, f988; +sub.f32 f995, f988, f993; +mul.f32 f996, f964, 0f3F4F1BBD; +sub.f32 f997, f861, f996; +fma.rn.f32 f998, f966, 0f3E9E377A, f997; +mul.f32 f999, f989, 0f3F167918; +mul.f32 f1000, f991, 0f3F737871; +sub.f32 f1001, f1000, f999; +add.f32 f1002, f1001, f998; +sub.f32 f1003, f998, f1001; +add.f32 f1004, f884, f944; +add.f32 f1006, f904, f924; +add.f32 f2621, f864, f1004; +add.f32 f1007, f1006, f2621; +add.f32 f1008, f885, f945; +add.f32 f1010, f905, f925; +add.f32 f2620, f865, f1008; +add.f32 f1011, f1010, f2620; +mul.f32 f1013, f1006, 0f3F4F1BBD; +fma.rn.f32 f2619, f1004, 0f3E9E377A, f864; +sub.f32 f1014, f2619, f1013; +sub.f32 f1015, f885, f945; +sub.f32 f1017, f905, f925; +mul.f32 f2617, f1015, 0f3F737871; +mul.f32 f2618, f1017, 0fBF167918; +sub.f32 f1019, f2618, f2617; +sub.f32 f1020, f1014, f1019; +add.f32 f1021, f1019, f1014; +mul.f32 f1022, f1004, 0f3F4F1BBD; +sub.f32 f1023, f864, f1022; +fma.rn.f32 f1024, f1006, 0f3E9E377A, f1023; +mul.f32 f1025, f1015, 0f3F167918; +mul.f32 f1026, f1017, 0f3F737871; +sub.f32 f1027, f1026, f1025; +sub.f32 f1028, f1024, f1027; +add.f32 f1029, f1027, f1024; 
+fma.rn.f32 f2615, f1008, 0f3E9E377A, f865; +mul.f32 f2616, f1010, 0f3F4F1BBD; +sub.f32 f1032, f2615, f2616; +sub.f32 f1033, f884, f944; +sub.f32 f1035, f904, f924; +mul.f32 f2613, f1033, 0f3F737871; +mul.f32 f2614, f1035, 0fBF167918; +sub.f32 f1037, f2614, f2613; +add.f32 f1038, f1037, f1032; +sub.f32 f1039, f1032, f1037; +mul.f32 f1040, f1008, 0f3F4F1BBD; +sub.f32 f1041, f865, f1040; +fma.rn.f32 f1042, f1010, 0f3E9E377A, f1041; +mul.f32 f1043, f1033, 0f3F167918; +mul.f32 f1044, f1035, 0f3F737871; +sub.f32 f1045, f1044, f1043; +add.f32 f1046, f1045, f1042; +sub.f32 f1047, f1042, f1045; +add.f32 f1048, f888, f948; +add.f32 f1050, f908, f928; +add.f32 f2612, f868, f1048; +add.f32 f1051, f1050, f2612; +add.f32 f1052, f889, f949; +add.f32 f1054, f909, f929; +add.f32 f2611, f869, f1052; +add.f32 f1055, f1054, f2611; +fma.rn.f32 f2609, f1048, 0f3E9E377A, f868; +mul.f32 f2610, f1050, 0f3F4F1BBD; +sub.f32 f1058, f2609, f2610; +sub.f32 f1059, f889, f949; +sub.f32 f1061, f909, f929; +mul.f32 f2607, f1059, 0f3F737871; +mul.f32 f2608, f1061, 0fBF167918; +sub.f32 f1063, f2608, f2607; +sub.f32 f1064, f1058, f1063; +add.f32 f1065, f1063, f1058; +mul.f32 f1066, f1048, 0f3F4F1BBD; +sub.f32 f1067, f868, f1066; +fma.rn.f32 f1068, f1050, 0f3E9E377A, f1067; +mul.f32 f1069, f1059, 0f3F167918; +mul.f32 f1070, f1061, 0f3F737871; +sub.f32 f1071, f1070, f1069; +sub.f32 f1072, f1068, f1071; +add.f32 f1073, f1071, f1068; +mul.f32 f1075, f1054, 0f3F4F1BBD; +fma.rn.f32 f2606, f1052, 0f3E9E377A, f869; +sub.f32 f1076, f2606, f1075; +sub.f32 f1077, f888, f948; +sub.f32 f1079, f908, f928; +mul.f32 f1080, f1079, 0fBF167918; +mul.f32 f2605, f1077, 0f3F737871; +sub.f32 f1081, f1080, f2605; +add.f32 f1082, f1081, f1076; +sub.f32 f1083, f1076, f1081; +mul.f32 f1084, f1052, 0f3F4F1BBD; +sub.f32 f1085, f869, f1084; +fma.rn.f32 f1086, f1054, 0f3E9E377A, f1085; +mul.f32 f1087, f1077, 0f3F167918; +mul.f32 f1088, f1079, 0f3F737871; +sub.f32 f1089, f1088, f1087; +add.f32 f1090, f1089, f1086; +sub.f32 f1091, 
f1086, f1089; +add.f32 f1092, f892, f952; +add.f32 f1094, f912, f932; +add.f32 f2604, f872, f1092; +add.f32 f1095, f1094, f2604; +add.f32 f1096, f893, f953; +add.f32 f1098, f913, f933; +add.f32 f2603, f873, f1096; +add.f32 f1099, f1098, f2603; +mul.f32 f1101, f1094, 0f3F4F1BBD; +fma.rn.f32 f2602, f1092, 0f3E9E377A, f872; +sub.f32 f1102, f2602, f1101; +sub.f32 f1103, f893, f953; +sub.f32 f1105, f913, f933; +mul.f32 f1106, f1105, 0fBF167918; +mul.f32 f2601, f1103, 0f3F737871; +sub.f32 f1107, f1106, f2601; +sub.f32 f1108, f1102, f1107; +add.f32 f1109, f1107, f1102; +mul.f32 f1110, f1092, 0f3F4F1BBD; +sub.f32 f1111, f872, f1110; +fma.rn.f32 f1112, f1094, 0f3E9E377A, f1111; +mul.f32 f1113, f1103, 0f3F167918; +mul.f32 f1114, f1105, 0f3F737871; +sub.f32 f1115, f1114, f1113; +sub.f32 f1116, f1112, f1115; +add.f32 f1117, f1115, f1112; +fma.rn.f32 f2599, f1096, 0f3E9E377A, f873; +mul.f32 f2600, f1098, 0f3F4F1BBD; +sub.f32 f1120, f2599, f2600; +sub.f32 f1121, f892, f952; +sub.f32 f1123, f912, f932; +mul.f32 f2597, f1121, 0f3F737871; +mul.f32 f2598, f1123, 0fBF167918; +sub.f32 f1125, f2598, f2597; +add.f32 f1126, f1125, f1120; +sub.f32 f1127, f1120, f1125; +mul.f32 f1128, f1096, 0f3F4F1BBD; +sub.f32 f1129, f873, f1128; +fma.rn.f32 f1130, f1098, 0f3E9E377A, f1129; +mul.f32 f1131, f1121, 0f3F167918; +mul.f32 f1132, f1123, 0f3F737871; +sub.f32 f1133, f1132, f1131; +add.f32 f1134, f1133, f1130; +sub.f32 f1135, f1130, f1133; +add.f32 f1136, f896, f956; +add.f32 f1138, f916, f936; +add.f32 f2596, f876, f1136; +add.f32 f1139, f1138, f2596; +add.f32 f1140, f897, f957; +add.f32 f1142, f917, f937; +add.f32 f2595, f877, f1140; +add.f32 f1143, f1142, f2595; +fma.rn.f32 f2593, f1136, 0f3E9E377A, f876; +mul.f32 f2594, f1138, 0f3F4F1BBD; +sub.f32 f1146, f2593, f2594; +sub.f32 f1147, f897, f957; +sub.f32 f1149, f917, f937; +mul.f32 f2591, f1147, 0f3F737871; +mul.f32 f2592, f1149, 0fBF167918; +sub.f32 f1151, f2592, f2591; +sub.f32 f1152, f1146, f1151; +add.f32 f1153, f1151, f1146; +mul.f32 
f1154, f1136, 0f3F4F1BBD; +sub.f32 f1155, f876, f1154; +fma.rn.f32 f1156, f1138, 0f3E9E377A, f1155; +mul.f32 f1157, f1147, 0f3F167918; +mul.f32 f1158, f1149, 0f3F737871; +sub.f32 f1159, f1158, f1157; +sub.f32 f1160, f1156, f1159; +add.f32 f1161, f1159, f1156; +mul.f32 f1163, f1142, 0f3F4F1BBD; +fma.rn.f32 f2590, f1140, 0f3E9E377A, f877; +sub.f32 f1164, f2590, f1163; +sub.f32 f1165, f896, f956; +sub.f32 f1167, f916, f936; +mul.f32 f2588, f1165, 0f3F737871; +mul.f32 f2589, f1167, 0fBF167918; +sub.f32 f1169, f2589, f2588; +add.f32 f1170, f1169, f1164; +sub.f32 f1171, f1164, f1169; +mul.f32 f1172, f1140, 0f3F4F1BBD; +sub.f32 f1173, f877, f1172; +fma.rn.f32 f1174, f1142, 0f3E9E377A, f1173; +mul.f32 f1175, f1165, 0f3F167918; +mul.f32 f1176, f1167, 0f3F737871; +sub.f32 f1177, f1176, f1175; +add.f32 f1178, f1177, f1174; +sub.f32 f1179, f1174, f1177; +mul.f32 f1181, f1038, 0fBE7EA890; +mul.f32 f2587, f1020, 0f3F77F511; +sub.f32 f1182, f2587, f1181; +mul.f32 f1183, f1038, 0f3F77F511; +fma.rn.f32 f1184, f1020, 0fBE7EA890, f1183; +mul.f32 f1186, f1082, 0fBEF6A86B; +mul.f32 f2586, f1064, 0f3F6055A2; +sub.f32 f1187, f2586, f1186; +mul.f32 f1188, f1082, 0f3F6055A2; +fma.rn.f32 f1189, f1064, 0fBEF6A86B, f1188; +mul.f32 f1191, f1126, 0fBF2F3E7B; +mul.f32 f2585, f1108, 0f3F3A9DB0; +sub.f32 f1192, f2585, f1191; +mul.f32 f1193, f1126, 0f3F3A9DB0; +fma.rn.f32 f1194, f1108, 0fBF2F3E7B, f1193; +mul.f32 f1196, f1170, 0fBF5825E0; +mul.f32 f2584, f1152, 0f3F092BF2; +sub.f32 f1197, f2584, f1196; +mul.f32 f1198, f1170, 0f3F092BF2; +fma.rn.f32 f1199, f1152, 0fBF5825E0, f1198; +mul.f32 f1201, f1046, 0fBEF6A86B; +mul.f32 f2583, f1028, 0f3F6055A2; +sub.f32 f1202, f2583, f1201; +mul.f32 f1203, f1046, 0f3F6055A2; +fma.rn.f32 f1204, f1028, 0fBEF6A86B, f1203; +mul.f32 f2581, f1072, 0f3F092BF2; +mul.f32 f2582, f1090, 0fBF5825E0; +sub.f32 f1207, f2581, f2582; +mul.f32 f1208, f1090, 0f3F092BF2; +fma.rn.f32 f1209, f1072, 0fBF5825E0, f1208; +mul.f32 f2579, f1116, 0f3D809851; +mul.f32 f2580, f1134, 
0fBF7F7EAE; +sub.f32 f1212, f2579, f2580; +mul.f32 f1213, f1134, 0f3D809851; +fma.rn.f32 f1214, f1116, 0fBF7F7EAE, f1213; +mul.f32 f2577, f1160, 0fBED9FFBE; +mul.f32 f2578, f1178, 0fBF67A2BF; +sub.f32 f1217, f2577, f2578; +mul.f32 f1218, f1178, 0fBED9FFBE; +fma.rn.f32 f1219, f1160, 0fBF67A2BF, f1218; +mul.f32 f1221, f1047, 0fBF2F3E7B; +mul.f32 f2576, f1029, 0f3F3A9DB0; +sub.f32 f1222, f2576, f1221; +mul.f32 f1223, f1047, 0f3F3A9DB0; +fma.rn.f32 f1224, f1029, 0fBF2F3E7B, f1223; +mul.f32 f1226, f1091, 0fBF7F7EAE; +mul.f32 f2575, f1073, 0f3D809851; +sub.f32 f1227, f2575, f1226; +mul.f32 f1228, f1091, 0f3D809851; +fma.rn.f32 f1229, f1073, 0fBF7F7EAE, f1228; +mul.f32 f1231, f1135, 0fBF45405B; +mul.f32 f2574, f1117, 0fBF232E38; +sub.f32 f1232, f2574, f1231; +mul.f32 f1233, f1135, 0fBF232E38; +fma.rn.f32 f1234, f1117, 0fBF45405B, f1233; +mul.f32 f1236, f1179, 0fBE00575B; +mul.f32 f2573, f1161, 0fBF7DFB3B; +sub.f32 f1237, f2573, f1236; +mul.f32 f1238, f1179, 0fBF7DFB3B; +fma.rn.f32 f1239, f1161, 0fBE00575B, f1238; +mul.f32 f1241, f1039, 0fBF5825E0; +mul.f32 f2572, f1021, 0f3F092BF2; +sub.f32 f1242, f2572, f1241; +mul.f32 f1243, f1039, 0f3F092BF2; +fma.rn.f32 f1244, f1021, 0fBF5825E0, f1243; +mul.f32 f1246, f1083, 0fBF67A2BF; +mul.f32 f2571, f1065, 0fBED9FFBE; +sub.f32 f1247, f2571, f1246; +mul.f32 f1248, f1083, 0fBED9FFBE; +fma.rn.f32 f1249, f1065, 0fBF67A2BF, f1248; +mul.f32 f2569, f1109, 0fBF7DFB3B; +mul.f32 f2570, f1127, 0fBE00575B; +sub.f32 f1252, f2569, f2570; +mul.f32 f1253, f1127, 0fBF7DFB3B; +fma.rn.f32 f1254, f1109, 0fBE00575B, f1253; +mul.f32 f2567, f1153, 0fBF232E38; +mul.f32 f2568, f1171, 0f3F45405B; +sub.f32 f1257, f2567, f2568; +mul.f32 f1258, f1171, 0fBF232E38; +fma.rn.f32 f1259, f1153, 0f3F45405B, f1258; +add.f32 f1260, f1007, f1139; +add.f32 f1262, f1051, f1095; +mul.f32 f1267, f1262, 0f3F4F1BBD; +fma.rn.f32 f2566, f1260, 0f3E9E377A, f963; +sub.f32 f1268, f2566, f1267; +add.f32 f2565, f1011, f1143; +sub.f32 f1269, f1011, f1143; +add.f32 f2564, f1055, 
f1099; +sub.f32 f1271, f1055, f1099; +mul.f32 f1272, f1271, 0fBF167918; +mul.f32 f2563, f1269, 0f3F737871; +sub.f32 f1273, f1272, f2563; +sub.f32 f1274, f1268, f1273; +add.f32 f1275, f1273, f1268; +add.f32 f2562, f963, f1260; +mul.f32 f1276, f1260, 0f3F4F1BBD; +sub.f32 f1277, f963, f1276; +fma.rn.f32 f1278, f1262, 0f3E9E377A, f1277; +mul.f32 f1279, f1269, 0f3F167918; +mul.f32 f1280, f1271, 0f3F737871; +sub.f32 f1281, f1280, f1279; +sub.f32 f1282, f1278, f1281; +add.f32 f1283, f1281, f1278; +mul.f32 f1285, f2564, 0f3F4F1BBD; +fma.rn.f32 f2561, f2565, 0f3E9E377A, f967; +sub.f32 f1286, f2561, f1285; +sub.f32 f1287, f1007, f1139; +sub.f32 f1289, f1051, f1095; +mul.f32 f2559, f1287, 0f3F737871; +mul.f32 f2560, f1289, 0fBF167918; +sub.f32 f1291, f2560, f2559; +add.f32 f1292, f1291, f1286; +sub.f32 f1293, f1286, f1291; +add.f32 f2558, f967, f2565; +mul.f32 f1294, f2565, 0f3F4F1BBD; +sub.f32 f1295, f967, f1294; +fma.rn.f32 f1296, f2564, 0f3E9E377A, f1295; +mul.f32 f1297, f1287, 0f3F167918; +mul.f32 f1298, f1289, 0f3F737871; +sub.f32 f1299, f1298, f1297; +add.f32 f1300, f1299, f1296; +sub.f32 f1301, f1296, f1299; +add.f32 f1302, f1182, f1197; +add.f32 f1304, f1187, f1192; +add.f32 f2557, f976, f1302; +add.f32 f1305, f1304, f2557; +add.f32 f1306, f1184, f1199; +add.f32 f1308, f1189, f1194; +add.f32 f2556, f994, f1306; +add.f32 f1309, f1308, f2556; +fma.rn.f32 f2554, f1302, 0f3E9E377A, f976; +mul.f32 f2555, f1304, 0f3F4F1BBD; +sub.f32 f1312, f2554, f2555; +sub.f32 f1313, f1184, f1199; +sub.f32 f1315, f1189, f1194; +mul.f32 f2552, f1313, 0f3F737871; +mul.f32 f2553, f1315, 0fBF167918; +sub.f32 f1317, f2553, f2552; +sub.f32 f1318, f1312, f1317; +add.f32 f1319, f1317, f1312; +mul.f32 f1320, f1302, 0f3F4F1BBD; +sub.f32 f1321, f976, f1320; +fma.rn.f32 f1322, f1304, 0f3E9E377A, f1321; +mul.f32 f1323, f1313, 0f3F167918; +mul.f32 f1324, f1315, 0f3F737871; +sub.f32 f1325, f1324, f1323; +sub.f32 f1326, f1322, f1325; +add.f32 f1327, f1325, f1322; +mul.f32 f1329, f1308, 0f3F4F1BBD; 
+fma.rn.f32 f2551, f1306, 0f3E9E377A, f994; +sub.f32 f1330, f2551, f1329; +sub.f32 f1331, f1182, f1197; +sub.f32 f1333, f1187, f1192; +mul.f32 f2549, f1331, 0f3F737871; +mul.f32 f2550, f1333, 0fBF167918; +sub.f32 f1335, f2550, f2549; +add.f32 f1336, f1335, f1330; +sub.f32 f1337, f1330, f1335; +mul.f32 f1338, f1306, 0f3F4F1BBD; +sub.f32 f1339, f994, f1338; +fma.rn.f32 f1340, f1308, 0f3E9E377A, f1339; +mul.f32 f1341, f1331, 0f3F167918; +mul.f32 f1342, f1333, 0f3F737871; +sub.f32 f1343, f1342, f1341; +add.f32 f1344, f1343, f1340; +sub.f32 f1345, f1340, f1343; +add.f32 f1346, f1202, f1217; +add.f32 f1348, f1207, f1212; +add.f32 f2548, f984, f1346; +add.f32 f1349, f1348, f2548; +add.f32 f1350, f1204, f1219; +add.f32 f1352, f1209, f1214; +add.f32 f2547, f1002, f1350; +add.f32 f1353, f1352, f2547; +mul.f32 f1355, f1348, 0f3F4F1BBD; +fma.rn.f32 f2546, f1346, 0f3E9E377A, f984; +sub.f32 f1356, f2546, f1355; +sub.f32 f1357, f1204, f1219; +sub.f32 f1359, f1209, f1214; +mul.f32 f2544, f1357, 0f3F737871; +mul.f32 f2545, f1359, 0fBF167918; +sub.f32 f1361, f2545, f2544; +sub.f32 f1362, f1356, f1361; +add.f32 f1363, f1361, f1356; +mul.f32 f1364, f1346, 0f3F4F1BBD; +sub.f32 f1365, f984, f1364; +fma.rn.f32 f1366, f1348, 0f3E9E377A, f1365; +mul.f32 f1367, f1357, 0f3F167918; +mul.f32 f1368, f1359, 0f3F737871; +sub.f32 f1369, f1368, f1367; +sub.f32 f1370, f1366, f1369; +add.f32 f1371, f1369, f1366; +mul.f32 f1373, f1352, 0f3F4F1BBD; +fma.rn.f32 f2543, f1350, 0f3E9E377A, f1002; +sub.f32 f1374, f2543, f1373; +sub.f32 f1375, f1202, f1217; +sub.f32 f1377, f1207, f1212; +mul.f32 f2541, f1375, 0f3F737871; +mul.f32 f2542, f1377, 0fBF167918; +sub.f32 f1379, f2542, f2541; +add.f32 f1380, f1379, f1374; +sub.f32 f1381, f1374, f1379; +mul.f32 f1382, f1350, 0f3F4F1BBD; +sub.f32 f1383, f1002, f1382; +fma.rn.f32 f1384, f1352, 0f3E9E377A, f1383; +mul.f32 f1385, f1375, 0f3F167918; +mul.f32 f1386, f1377, 0f3F737871; +sub.f32 f1387, f1386, f1385; +add.f32 f1388, f1387, f1384; +sub.f32 f1389, f1384, f1387; 
+add.f32 f1390, f1222, f1237; +add.f32 f1392, f1227, f1232; +add.f32 f2540, f985, f1390; +add.f32 f1393, f1392, f2540; +add.f32 f1394, f1224, f1239; +add.f32 f1396, f1229, f1234; +add.f32 f2539, f1003, f1394; +add.f32 f1397, f1396, f2539; +mul.f32 f1399, f1392, 0f3F4F1BBD; +fma.rn.f32 f2538, f1390, 0f3E9E377A, f985; +sub.f32 f1400, f2538, f1399; +sub.f32 f1401, f1224, f1239; +sub.f32 f1403, f1229, f1234; +mul.f32 f2536, f1401, 0f3F737871; +mul.f32 f2537, f1403, 0fBF167918; +sub.f32 f1405, f2537, f2536; +sub.f32 f1406, f1400, f1405; +add.f32 f1407, f1405, f1400; +mul.f32 f1408, f1390, 0f3F4F1BBD; +sub.f32 f1409, f985, f1408; +fma.rn.f32 f1410, f1392, 0f3E9E377A, f1409; +mul.f32 f1411, f1401, 0f3F167918; +mul.f32 f1412, f1403, 0f3F737871; +sub.f32 f1413, f1412, f1411; +sub.f32 f1414, f1410, f1413; +add.f32 f1415, f1413, f1410; +fma.rn.f32 f2534, f1394, 0f3E9E377A, f1003; +mul.f32 f2535, f1396, 0f3F4F1BBD; +sub.f32 f1418, f2534, f2535; +sub.f32 f1419, f1222, f1237; +sub.f32 f1421, f1227, f1232; +mul.f32 f2532, f1419, 0f3F737871; +mul.f32 f2533, f1421, 0fBF167918; +sub.f32 f1423, f2533, f2532; +add.f32 f1424, f1423, f1418; +sub.f32 f1425, f1418, f1423; +mul.f32 f1426, f1394, 0f3F4F1BBD; +sub.f32 f1427, f1003, f1426; +fma.rn.f32 f1428, f1396, 0f3E9E377A, f1427; +mul.f32 f1429, f1419, 0f3F167918; +mul.f32 f1430, f1421, 0f3F737871; +sub.f32 f1431, f1430, f1429; +add.f32 f1432, f1431, f1428; +sub.f32 f1433, f1428, f1431; +add.f32 f1434, f1242, f1257; +add.f32 f1436, f1247, f1252; +add.f32 f2531, f977, f1434; +add.f32 f1437, f1436, f2531; +add.f32 f1438, f1244, f1259; +add.f32 f1440, f1249, f1254; +add.f32 f2530, f995, f1438; +add.f32 f1441, f1440, f2530; +fma.rn.f32 f2528, f1434, 0f3E9E377A, f977; +mul.f32 f2529, f1436, 0f3F4F1BBD; +sub.f32 f1444, f2528, f2529; +sub.f32 f1445, f1244, f1259; +sub.f32 f1447, f1249, f1254; +mul.f32 f2526, f1445, 0f3F737871; +mul.f32 f2527, f1447, 0fBF167918; +sub.f32 f1449, f2527, f2526; +sub.f32 f1450, f1444, f1449; +add.f32 f1451, f1449, 
f1444; +mul.f32 f1452, f1434, 0f3F4F1BBD; +sub.f32 f1453, f977, f1452; +fma.rn.f32 f1454, f1436, 0f3E9E377A, f1453; +mul.f32 f1455, f1445, 0f3F167918; +mul.f32 f1456, f1447, 0f3F737871; +sub.f32 f1457, f1456, f1455; +sub.f32 f1458, f1454, f1457; +add.f32 f1459, f1457, f1454; +mul.f32 f1461, f1440, 0f3F4F1BBD; +fma.rn.f32 f2525, f1438, 0f3E9E377A, f995; +sub.f32 f1462, f2525, f1461; +sub.f32 f1463, f1242, f1257; +sub.f32 f1465, f1247, f1252; +mul.f32 f2523, f1463, 0f3F737871; +mul.f32 f2524, f1465, 0fBF167918; +sub.f32 f1467, f2524, f2523; +add.f32 f1468, f1467, f1462; +sub.f32 f1469, f1462, f1467; +mul.f32 f1470, f1438, 0f3F4F1BBD; +sub.f32 f1471, f995, f1470; +fma.rn.f32 f1472, f1440, 0f3E9E377A, f1471; +mul.f32 f1473, f1463, 0f3F167918; +mul.f32 f1474, f1465, 0f3F737871; +sub.f32 f1475, f1474, f1473; +add.f32 f1476, f1475, f1472; +sub.f32 f1477, f1472, f1475; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd12, r11, 8; +mov.u64 rd13, %52; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1478, f1479}, [rd11]; +mul.f32 f1483, f1479, f1309; +mul.f32 f1484, f1478, f1309; +mul.f32 f1486, f1479, f1479; +mul.f32 f2522, f1478, f1478; +sub.f32 f1487, f2522, f1486; +mul.f32 f1488, f1479, f1478; +fma.rn.f32 f1489, f1479, f1478, f1488; +mul.f32 f1491, f1489, f1353; +mul.f32 f1492, f1487, f1353; +mul.f32 f1494, f1479, f1489; +mul.f32 f2521, f1478, f1487; +sub.f32 f1495, f2521, f1494; +mul.f32 f2520, f1487, f1349; +mul.f32 f1496, f1478, f1489; +fma.rn.f32 f1497, f1479, f1487, f1496; +mul.f32 f1499, f1497, f1397; +mul.f32 f1500, f1495, f1397; +mul.f32 f2518, f1478, f1495; +mul.f32 f2519, f1479, f1497; +sub.f32 f1503, f2518, f2519; +mul.f32 f2517, f1495, f1393; +mul.f32 f1504, f1478, f1497; +fma.rn.f32 f1505, f1479, f1495, f1504; +mul.f32 f1507, f1505, f1441; +mul.f32 f1508, f1503, f1441; +mul.f32 f1510, f1479, f1505; +mul.f32 f2516, f1478, f1503; +sub.f32 f1511, f2516, f1510; +mul.f32 
f2515, f1503, f1437; +mul.f32 f1512, f1478, f1505; +fma.rn.f32 f1513, f1479, f1503, f1512; +mul.f32 f1515, f1513, f1292; +mul.f32 f1516, f1511, f1292; +mul.f32 f2513, f1478, f1511; +mul.f32 f2514, f1479, f1513; +sub.f32 f1519, f2513, f2514; +mul.f32 f2512, f1511, f1274; +mul.f32 f1520, f1478, f1513; +fma.rn.f32 f1521, f1479, f1511, f1520; +mul.f32 f1523, f1521, f1336; +mul.f32 f1524, f1519, f1336; +mul.f32 f1526, f1479, f1521; +mul.f32 f2511, f1478, f1519; +sub.f32 f1527, f2511, f1526; +mul.f32 f2510, f1519, f1318; +mul.f32 f1528, f1478, f1521; +fma.rn.f32 f1529, f1479, f1519, f1528; +mul.f32 f1531, f1529, f1380; +mul.f32 f1532, f1527, f1380; +mul.f32 f1534, f1479, f1529; +mul.f32 f2509, f1478, f1527; +sub.f32 f1535, f2509, f1534; +mul.f32 f2508, f1527, f1362; +mul.f32 f1536, f1478, f1529; +fma.rn.f32 f1537, f1479, f1527, f1536; +mul.f32 f1539, f1537, f1424; +mul.f32 f1540, f1535, f1424; +mul.f32 f2506, f1478, f1535; +mul.f32 f2507, f1479, f1537; +sub.f32 f1543, f2506, f2507; +mul.f32 f2505, f1535, f1406; +mul.f32 f1544, f1478, f1537; +fma.rn.f32 f1545, f1479, f1535, f1544; +mul.f32 f1547, f1545, f1468; +mul.f32 f1548, f1543, f1468; +mul.f32 f1550, f1479, f1545; +mul.f32 f2504, f1478, f1543; +sub.f32 f1551, f2504, f1550; +mul.f32 f2503, f1543, f1450; +mul.f32 f1552, f1478, f1545; +fma.rn.f32 f1553, f1479, f1543, f1552; +mul.f32 f1555, f1553, f1300; +mul.f32 f1556, f1551, f1300; +mul.f32 f1558, f1479, f1553; +mul.f32 f2502, f1478, f1551; +sub.f32 f1559, f2502, f1558; +mul.f32 f2501, f1551, f1282; +mul.f32 f1560, f1478, f1553; +fma.rn.f32 f1561, f1479, f1551, f1560; +mul.f32 f1563, f1561, f1344; +mul.f32 f1564, f1559, f1344; +mul.f32 f2499, f1478, f1559; +mul.f32 f2500, f1479, f1561; +sub.f32 f1567, f2499, f2500; +mul.f32 f2498, f1559, f1326; +mul.f32 f1568, f1478, f1561; +fma.rn.f32 f1569, f1479, f1559, f1568; +mul.f32 f1571, f1569, f1388; +mul.f32 f1572, f1567, f1388; +mul.f32 f1574, f1479, f1569; +mul.f32 f2497, f1478, f1567; +sub.f32 f1575, f2497, f1574; +mul.f32 
f2496, f1567, f1370; +mul.f32 f1576, f1478, f1569; +fma.rn.f32 f1577, f1479, f1567, f1576; +mul.f32 f1579, f1577, f1432; +mul.f32 f1580, f1575, f1432; +mul.f32 f2494, f1478, f1575; +mul.f32 f2495, f1479, f1577; +sub.f32 f1583, f2494, f2495; +mul.f32 f2493, f1575, f1414; +mul.f32 f1584, f1478, f1577; +fma.rn.f32 f1585, f1479, f1575, f1584; +mul.f32 f1587, f1585, f1476; +mul.f32 f1588, f1583, f1476; +mul.f32 f1590, f1479, f1585; +mul.f32 f2492, f1478, f1583; +sub.f32 f1591, f2492, f1590; +mul.f32 f2491, f1583, f1458; +mul.f32 f1592, f1478, f1585; +fma.rn.f32 f1593, f1479, f1583, f1592; +mul.f32 f1595, f1593, f1301; +mul.f32 f1596, f1591, f1301; +mul.f32 f1598, f1479, f1593; +mul.f32 f2490, f1478, f1591; +sub.f32 f1599, f2490, f1598; +mul.f32 f2489, f1591, f1283; +mul.f32 f1600, f1478, f1593; +fma.rn.f32 f1601, f1479, f1591, f1600; +mul.f32 f1603, f1601, f1345; +mul.f32 f1604, f1599, f1345; +mul.f32 f2487, f1478, f1599; +mul.f32 f2488, f1479, f1601; +sub.f32 f1607, f2487, f2488; +mul.f32 f2486, f1599, f1327; +mul.f32 f1608, f1478, f1601; +fma.rn.f32 f1609, f1479, f1599, f1608; +mul.f32 f1611, f1609, f1389; +mul.f32 f1612, f1607, f1389; +mul.f32 f1614, f1479, f1609; +mul.f32 f2485, f1478, f1607; +sub.f32 f1615, f2485, f1614; +mul.f32 f2484, f1607, f1371; +mul.f32 f1616, f1478, f1609; +fma.rn.f32 f1617, f1479, f1607, f1616; +mul.f32 f1619, f1617, f1433; +mul.f32 f1620, f1615, f1433; +mul.f32 f1622, f1479, f1617; +mul.f32 f2483, f1478, f1615; +sub.f32 f1623, f2483, f1622; +mul.f32 f2482, f1615, f1415; +mul.f32 f1624, f1478, f1617; +fma.rn.f32 f1625, f1479, f1615, f1624; +mul.f32 f1627, f1625, f1477; +mul.f32 f1628, f1623, f1477; +mul.f32 f2480, f1478, f1623; +mul.f32 f2481, f1479, f1625; +sub.f32 f1631, f2480, f2481; +mul.f32 f2479, f1623, f1459; +mul.f32 f1632, f1478, f1625; +fma.rn.f32 f1633, f1479, f1623, f1632; +mul.f32 f1635, f1633, f1293; +mul.f32 f1636, f1631, f1293; +mul.f32 f1638, f1479, f1633; +mul.f32 f2478, f1478, f1631; +sub.f32 f1639, f2478, f1638; +mul.f32 
f2477, f1631, f1275; +mul.f32 f1640, f1478, f1633; +fma.rn.f32 f1641, f1479, f1631, f1640; +mul.f32 f1643, f1641, f1337; +mul.f32 f1644, f1639, f1337; +mul.f32 f2475, f1478, f1639; +mul.f32 f2476, f1479, f1641; +sub.f32 f1647, f2475, f2476; +mul.f32 f2474, f1639, f1319; +mul.f32 f1648, f1478, f1641; +fma.rn.f32 f1649, f1479, f1639, f1648; +mul.f32 f1651, f1649, f1381; +mul.f32 f1652, f1647, f1381; +mul.f32 f1654, f1479, f1649; +mul.f32 f2473, f1478, f1647; +sub.f32 f1655, f2473, f1654; +mul.f32 f2472, f1647, f1363; +mul.f32 f1656, f1478, f1649; +fma.rn.f32 f1657, f1479, f1647, f1656; +mul.f32 f1659, f1657, f1425; +mul.f32 f1660, f1655, f1425; +mul.f32 f1662, f1479, f1657; +mul.f32 f2471, f1478, f1655; +sub.f32 f1663, f2471, f1662; +mul.f32 f2470, f1478, f1305; +mul.f32 f1664, f1478, f1657; +mul.f32 f2469, f1655, f1407; +fma.rn.f32 f1665, f1479, f1655, f1664; +mul.f32 f1666, f1663, f1451; +mul.f32 f1667, f1665, f1469; +mul.f32 f1668, f1663, f1469; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 5000, r15; +add.f32 f1669, f2564, f2558; +add.f32 f1670, f1262, f2562; +st.shared.v2.f32 [r16], {f1670, f1669}; +fma.rn.f32 f1671, f1479, f1305, f1484; +sub.f32 f1672, f2470, f1483; +st.shared.v2.f32 [r16+200], {f1672, f1671}; +fma.rn.f32 f1673, f1489, f1349, f1492; +sub.f32 f1674, f2520, f1491; +st.shared.v2.f32 [r16+400], {f1674, f1673}; +fma.rn.f32 f1675, f1497, f1393, f1500; +sub.f32 f1676, f2517, f1499; +st.shared.v2.f32 [r16+600], {f1676, f1675}; +fma.rn.f32 f1677, f1505, f1437, f1508; +sub.f32 f1678, f2515, f1507; +st.shared.v2.f32 [r16+800], {f1678, f1677}; +fma.rn.f32 f1679, f1513, f1274, f1516; +sub.f32 f1680, f2512, f1515; +st.shared.v2.f32 [r16+1000], {f1680, f1679}; +fma.rn.f32 f1681, f1521, f1318, f1524; +sub.f32 f1682, f2510, f1523; +st.shared.v2.f32 [r16+1200], {f1682, f1681}; +fma.rn.f32 f1683, f1529, f1362, f1532; +sub.f32 f1684, f2508, f1531; +st.shared.v2.f32 [r16+1400], {f1684, f1683}; +fma.rn.f32 f1685, f1537, f1406, 
f1540; +sub.f32 f1686, f2505, f1539; +st.shared.v2.f32 [r16+1600], {f1686, f1685}; +sub.f32 f1687, f2503, f1547; +fma.rn.f32 f1688, f1545, f1450, f1548; +st.shared.v2.f32 [r16+1800], {f1687, f1688}; +fma.rn.f32 f1689, f1553, f1282, f1556; +sub.f32 f1690, f2501, f1555; +st.shared.v2.f32 [r16+2000], {f1690, f1689}; +fma.rn.f32 f1691, f1561, f1326, f1564; +sub.f32 f1692, f2498, f1563; +st.shared.v2.f32 [r16+2200], {f1692, f1691}; +fma.rn.f32 f1693, f1569, f1370, f1572; +sub.f32 f1694, f2496, f1571; +st.shared.v2.f32 [r16+2400], {f1694, f1693}; +fma.rn.f32 f1695, f1577, f1414, f1580; +sub.f32 f1696, f2493, f1579; +st.shared.v2.f32 [r16+2600], {f1696, f1695}; +fma.rn.f32 f1697, f1585, f1458, f1588; +sub.f32 f1698, f2491, f1587; +st.shared.v2.f32 [r16+2800], {f1698, f1697}; +fma.rn.f32 f1699, f1593, f1283, f1596; +sub.f32 f1700, f2489, f1595; +st.shared.v2.f32 [r16+3000], {f1700, f1699}; +fma.rn.f32 f1701, f1601, f1327, f1604; +sub.f32 f1702, f2486, f1603; +st.shared.v2.f32 [r16+3200], {f1702, f1701}; +fma.rn.f32 f1703, f1609, f1371, f1612; +sub.f32 f1704, f2484, f1611; +st.shared.v2.f32 [r16+3400], {f1704, f1703}; +fma.rn.f32 f1705, f1617, f1415, f1620; +sub.f32 f1706, f2482, f1619; +st.shared.v2.f32 [r16+3600], {f1706, f1705}; +fma.rn.f32 f1707, f1625, f1459, f1628; +sub.f32 f1708, f2479, f1627; +st.shared.v2.f32 [r16+3800], {f1708, f1707}; +fma.rn.f32 f1709, f1633, f1275, f1636; +sub.f32 f1710, f2477, f1635; +st.shared.v2.f32 [r16+4000], {f1710, f1709}; +fma.rn.f32 f1711, f1641, f1319, f1644; +sub.f32 f1712, f2474, f1643; +st.shared.v2.f32 [r16+4200], {f1712, f1711}; +fma.rn.f32 f1713, f1649, f1363, f1652; +sub.f32 f1714, f2472, f1651; +st.shared.v2.f32 [r16+4400], {f1714, f1713}; +fma.rn.f32 f1715, f1657, f1407, f1660; +sub.f32 f1716, f2469, f1659; +st.shared.v2.f32 [r16+4600], {f1716, f1715}; +fma.rn.f32 f1717, f1665, f1451, f1668; +sub.f32 f1718, f1666, f1667; +st.shared.v2.f32 [r16+4800], {f1718, f1717}; +barrier.sync 0; +ld.shared.v2.f32 {f1719, f1720}, [r10]; 
+ld.shared.v2.f32 {f1723, f1724}, [r10+5000]; +ld.shared.v2.f32 {f1727, f1728}, [r10+10000]; +ld.shared.v2.f32 {f1731, f1732}, [r10+15000]; +ld.shared.v2.f32 {f1735, f1736}, [r10+20000]; +ld.shared.v2.f32 {f1739, f1740}, [r10+25000]; +ld.shared.v2.f32 {f1743, f1744}, [r10+30000]; +ld.shared.v2.f32 {f1747, f1748}, [r10+35000]; +ld.shared.v2.f32 {f1751, f1752}, [r10+40000]; +ld.shared.v2.f32 {f1755, f1756}, [r10+45000]; +ld.shared.v2.f32 {f1759, f1760}, [r10+50000]; +ld.shared.v2.f32 {f1763, f1764}, [r10+55000]; +ld.shared.v2.f32 {f1767, f1768}, [r10+60000]; +ld.shared.v2.f32 {f1771, f1772}, [r10+65000]; +ld.shared.v2.f32 {f1775, f1776}, [r10+70000]; +ld.shared.v2.f32 {f1779, f1780}, [r10+75000]; +ld.shared.v2.f32 {f1783, f1784}, [r10+80000]; +ld.shared.v2.f32 {f1787, f1788}, [r10+85000]; +ld.shared.v2.f32 {f1791, f1792}, [r10+90000]; +ld.shared.v2.f32 {f1795, f1796}, [r10+95000]; +ld.shared.v2.f32 {f1799, f1800}, [r10+100000]; +ld.shared.v2.f32 {f1803, f1804}, [r10+105000]; +ld.shared.v2.f32 {f1807, f1808}, [r10+110000]; +ld.shared.v2.f32 {f1811, f1812}, [r10+115000]; +ld.shared.v2.f32 {f1815, f1816}, [r10+120000]; +add.f32 f1819, f1739, f1799; +add.f32 f1821, f1759, f1779; +add.f32 f2468, f1719, f1819; +add.f32 f1822, f1821, f2468; +add.f32 f1823, f1740, f1800; +add.f32 f1825, f1760, f1780; +add.f32 f2467, f1720, f1823; +add.f32 f1826, f1825, f2467; +fma.rn.f32 f2465, f1819, 0f3E9E377A, f1719; +mul.f32 f2466, f1821, 0f3F4F1BBD; +sub.f32 f1829, f2465, f2466; +sub.f32 f1830, f1740, f1800; +sub.f32 f1832, f1760, f1780; +mul.f32 f2463, f1830, 0f3F737871; +mul.f32 f2464, f1832, 0fBF167918; +sub.f32 f1834, f2464, f2463; +sub.f32 f1835, f1829, f1834; +add.f32 f1836, f1834, f1829; +mul.f32 f1837, f1819, 0f3F4F1BBD; +sub.f32 f1838, f1719, f1837; +fma.rn.f32 f1839, f1821, 0f3E9E377A, f1838; +mul.f32 f1840, f1830, 0f3F167918; +mul.f32 f1841, f1832, 0f3F737871; +sub.f32 f1842, f1841, f1840; +sub.f32 f1843, f1839, f1842; +add.f32 f1844, f1842, f1839; +mul.f32 f1846, f1825, 
0f3F4F1BBD; +fma.rn.f32 f2462, f1823, 0f3E9E377A, f1720; +sub.f32 f1847, f2462, f1846; +sub.f32 f1848, f1739, f1799; +sub.f32 f1850, f1759, f1779; +mul.f32 f1851, f1850, 0fBF167918; +mul.f32 f2461, f1848, 0f3F737871; +sub.f32 f1852, f1851, f2461; +add.f32 f1853, f1852, f1847; +sub.f32 f1854, f1847, f1852; +mul.f32 f1855, f1823, 0f3F4F1BBD; +sub.f32 f1856, f1720, f1855; +fma.rn.f32 f1857, f1825, 0f3E9E377A, f1856; +mul.f32 f1858, f1848, 0f3F167918; +mul.f32 f1859, f1850, 0f3F737871; +sub.f32 f1860, f1859, f1858; +add.f32 f1861, f1860, f1857; +sub.f32 f1862, f1857, f1860; +add.f32 f1863, f1743, f1803; +add.f32 f1865, f1763, f1783; +add.f32 f2460, f1723, f1863; +add.f32 f1866, f1865, f2460; +add.f32 f1867, f1744, f1804; +add.f32 f1869, f1764, f1784; +add.f32 f2459, f1724, f1867; +add.f32 f1870, f1869, f2459; +mul.f32 f1872, f1865, 0f3F4F1BBD; +fma.rn.f32 f2458, f1863, 0f3E9E377A, f1723; +sub.f32 f1873, f2458, f1872; +sub.f32 f1874, f1744, f1804; +sub.f32 f1876, f1764, f1784; +mul.f32 f2456, f1874, 0f3F737871; +mul.f32 f2457, f1876, 0fBF167918; +sub.f32 f1878, f2457, f2456; +sub.f32 f1879, f1873, f1878; +add.f32 f1880, f1878, f1873; +mul.f32 f1881, f1863, 0f3F4F1BBD; +sub.f32 f1882, f1723, f1881; +fma.rn.f32 f1883, f1865, 0f3E9E377A, f1882; +mul.f32 f1884, f1874, 0f3F167918; +mul.f32 f1885, f1876, 0f3F737871; +sub.f32 f1886, f1885, f1884; +sub.f32 f1887, f1883, f1886; +add.f32 f1888, f1886, f1883; +fma.rn.f32 f2454, f1867, 0f3E9E377A, f1724; +mul.f32 f2455, f1869, 0f3F4F1BBD; +sub.f32 f1891, f2454, f2455; +sub.f32 f1892, f1743, f1803; +sub.f32 f1894, f1763, f1783; +mul.f32 f2452, f1892, 0f3F737871; +mul.f32 f2453, f1894, 0fBF167918; +sub.f32 f1896, f2453, f2452; +add.f32 f1897, f1896, f1891; +sub.f32 f1898, f1891, f1896; +mul.f32 f1899, f1867, 0f3F4F1BBD; +sub.f32 f1900, f1724, f1899; +fma.rn.f32 f1901, f1869, 0f3E9E377A, f1900; +mul.f32 f1902, f1892, 0f3F167918; +mul.f32 f1903, f1894, 0f3F737871; +sub.f32 f1904, f1903, f1902; +add.f32 f1905, f1904, f1901; +sub.f32 
f1906, f1901, f1904; +add.f32 f1907, f1747, f1807; +add.f32 f1909, f1767, f1787; +add.f32 f2451, f1727, f1907; +add.f32 f1910, f1909, f2451; +add.f32 f1911, f1748, f1808; +add.f32 f1913, f1768, f1788; +add.f32 f2450, f1728, f1911; +add.f32 f1914, f1913, f2450; +fma.rn.f32 f2448, f1907, 0f3E9E377A, f1727; +mul.f32 f2449, f1909, 0f3F4F1BBD; +sub.f32 f1917, f2448, f2449; +sub.f32 f1918, f1748, f1808; +sub.f32 f1920, f1768, f1788; +mul.f32 f2446, f1918, 0f3F737871; +mul.f32 f2447, f1920, 0fBF167918; +sub.f32 f1922, f2447, f2446; +sub.f32 f1923, f1917, f1922; +add.f32 f1924, f1922, f1917; +mul.f32 f1925, f1907, 0f3F4F1BBD; +sub.f32 f1926, f1727, f1925; +fma.rn.f32 f1927, f1909, 0f3E9E377A, f1926; +mul.f32 f1928, f1918, 0f3F167918; +mul.f32 f1929, f1920, 0f3F737871; +sub.f32 f1930, f1929, f1928; +sub.f32 f1931, f1927, f1930; +add.f32 f1932, f1930, f1927; +mul.f32 f1934, f1913, 0f3F4F1BBD; +fma.rn.f32 f2445, f1911, 0f3E9E377A, f1728; +sub.f32 f1935, f2445, f1934; +sub.f32 f1936, f1747, f1807; +sub.f32 f1938, f1767, f1787; +mul.f32 f1939, f1938, 0fBF167918; +mul.f32 f2444, f1936, 0f3F737871; +sub.f32 f1940, f1939, f2444; +add.f32 f1941, f1940, f1935; +sub.f32 f1942, f1935, f1940; +mul.f32 f1943, f1911, 0f3F4F1BBD; +sub.f32 f1944, f1728, f1943; +fma.rn.f32 f1945, f1913, 0f3E9E377A, f1944; +mul.f32 f1946, f1936, 0f3F167918; +mul.f32 f1947, f1938, 0f3F737871; +sub.f32 f1948, f1947, f1946; +add.f32 f1949, f1948, f1945; +sub.f32 f1950, f1945, f1948; +add.f32 f1951, f1751, f1811; +add.f32 f1953, f1771, f1791; +add.f32 f2443, f1731, f1951; +add.f32 f1954, f1953, f2443; +add.f32 f1955, f1752, f1812; +add.f32 f1957, f1772, f1792; +add.f32 f2442, f1732, f1955; +add.f32 f1958, f1957, f2442; +mul.f32 f1960, f1953, 0f3F4F1BBD; +fma.rn.f32 f2441, f1951, 0f3E9E377A, f1731; +sub.f32 f1961, f2441, f1960; +sub.f32 f1962, f1752, f1812; +sub.f32 f1964, f1772, f1792; +mul.f32 f1965, f1964, 0fBF167918; +mul.f32 f2440, f1962, 0f3F737871; +sub.f32 f1966, f1965, f2440; +sub.f32 f1967, f1961, 
f1966; +add.f32 f1968, f1966, f1961; +mul.f32 f1969, f1951, 0f3F4F1BBD; +sub.f32 f1970, f1731, f1969; +fma.rn.f32 f1971, f1953, 0f3E9E377A, f1970; +mul.f32 f1972, f1962, 0f3F167918; +mul.f32 f1973, f1964, 0f3F737871; +sub.f32 f1974, f1973, f1972; +sub.f32 f1975, f1971, f1974; +add.f32 f1976, f1974, f1971; +mul.f32 f1978, f1957, 0f3F4F1BBD; +fma.rn.f32 f2439, f1955, 0f3E9E377A, f1732; +sub.f32 f1979, f2439, f1978; +sub.f32 f1980, f1751, f1811; +sub.f32 f1982, f1771, f1791; +mul.f32 f2437, f1980, 0f3F737871; +mul.f32 f2438, f1982, 0fBF167918; +sub.f32 f1984, f2438, f2437; +add.f32 f1985, f1984, f1979; +sub.f32 f1986, f1979, f1984; +mul.f32 f1987, f1955, 0f3F4F1BBD; +sub.f32 f1988, f1732, f1987; +fma.rn.f32 f1989, f1957, 0f3E9E377A, f1988; +mul.f32 f1990, f1980, 0f3F167918; +mul.f32 f1991, f1982, 0f3F737871; +sub.f32 f1992, f1991, f1990; +add.f32 f1993, f1992, f1989; +sub.f32 f1994, f1989, f1992; +add.f32 f1995, f1755, f1815; +add.f32 f1997, f1775, f1795; +add.f32 f2436, f1735, f1995; +add.f32 f1998, f1997, f2436; +add.f32 f1999, f1756, f1816; +add.f32 f2001, f1776, f1796; +add.f32 f2435, f1736, f1999; +add.f32 f2002, f2001, f2435; +fma.rn.f32 f2433, f1995, 0f3E9E377A, f1735; +mul.f32 f2434, f1997, 0f3F4F1BBD; +sub.f32 f2005, f2433, f2434; +sub.f32 f2006, f1756, f1816; +sub.f32 f2008, f1776, f1796; +mul.f32 f2431, f2006, 0f3F737871; +mul.f32 f2432, f2008, 0fBF167918; +sub.f32 f2010, f2432, f2431; +sub.f32 f2011, f2005, f2010; +add.f32 f2012, f2010, f2005; +mul.f32 f2013, f1995, 0f3F4F1BBD; +sub.f32 f2014, f1735, f2013; +fma.rn.f32 f2015, f1997, 0f3E9E377A, f2014; +mul.f32 f2016, f2006, 0f3F167918; +mul.f32 f2017, f2008, 0f3F737871; +sub.f32 f2018, f2017, f2016; +sub.f32 f2019, f2015, f2018; +add.f32 f2020, f2018, f2015; +mul.f32 f2022, f2001, 0f3F4F1BBD; +fma.rn.f32 f2430, f1999, 0f3E9E377A, f1736; +sub.f32 f2023, f2430, f2022; +sub.f32 f2024, f1755, f1815; +sub.f32 f2026, f1775, f1795; +mul.f32 f2428, f2024, 0f3F737871; +mul.f32 f2429, f2026, 0fBF167918; +sub.f32 
f2028, f2429, f2428; +add.f32 f2029, f2028, f2023; +sub.f32 f2030, f2023, f2028; +mul.f32 f2031, f1999, 0f3F4F1BBD; +sub.f32 f2032, f1736, f2031; +fma.rn.f32 f2033, f2001, 0f3E9E377A, f2032; +mul.f32 f2034, f2024, 0f3F167918; +mul.f32 f2035, f2026, 0f3F737871; +sub.f32 f2036, f2035, f2034; +add.f32 f2037, f2036, f2033; +sub.f32 f2038, f2033, f2036; +mul.f32 f2040, f1897, 0fBE7EA890; +mul.f32 f2427, f1879, 0f3F77F511; +sub.f32 f2041, f2427, f2040; +mul.f32 f2042, f1897, 0f3F77F511; +fma.rn.f32 f2043, f1879, 0fBE7EA890, f2042; +mul.f32 f2045, f1941, 0fBEF6A86B; +mul.f32 f2426, f1923, 0f3F6055A2; +sub.f32 f2046, f2426, f2045; +mul.f32 f2047, f1941, 0f3F6055A2; +fma.rn.f32 f2048, f1923, 0fBEF6A86B, f2047; +mul.f32 f2050, f1985, 0fBF2F3E7B; +mul.f32 f2425, f1967, 0f3F3A9DB0; +sub.f32 f2051, f2425, f2050; +mul.f32 f2052, f1985, 0f3F3A9DB0; +fma.rn.f32 f2053, f1967, 0fBF2F3E7B, f2052; +mul.f32 f2055, f2029, 0fBF5825E0; +mul.f32 f2424, f2011, 0f3F092BF2; +sub.f32 f2056, f2424, f2055; +mul.f32 f2057, f2029, 0f3F092BF2; +fma.rn.f32 f2058, f2011, 0fBF5825E0, f2057; +mul.f32 f2422, f1887, 0f3F6055A2; +mul.f32 f2423, f1905, 0fBEF6A86B; +sub.f32 f2061, f2422, f2423; +mul.f32 f2062, f1905, 0f3F6055A2; +fma.rn.f32 f2063, f1887, 0fBEF6A86B, f2062; +mul.f32 f2420, f1931, 0f3F092BF2; +mul.f32 f2421, f1949, 0fBF5825E0; +sub.f32 f2066, f2420, f2421; +mul.f32 f2067, f1949, 0f3F092BF2; +fma.rn.f32 f2068, f1931, 0fBF5825E0, f2067; +mul.f32 f2418, f1975, 0f3D809851; +mul.f32 f2419, f1993, 0fBF7F7EAE; +sub.f32 f2071, f2418, f2419; +mul.f32 f2072, f1993, 0f3D809851; +fma.rn.f32 f2073, f1975, 0fBF7F7EAE, f2072; +mul.f32 f2416, f2019, 0fBED9FFBE; +mul.f32 f2417, f2037, 0fBF67A2BF; +sub.f32 f2076, f2416, f2417; +mul.f32 f2077, f2037, 0fBED9FFBE; +fma.rn.f32 f2078, f2019, 0fBF67A2BF, f2077; +mul.f32 f2080, f1906, 0fBF2F3E7B; +mul.f32 f2415, f1888, 0f3F3A9DB0; +sub.f32 f2081, f2415, f2080; +mul.f32 f2082, f1906, 0f3F3A9DB0; +fma.rn.f32 f2083, f1888, 0fBF2F3E7B, f2082; +mul.f32 f2085, f1950, 
0fBF7F7EAE; +mul.f32 f2414, f1932, 0f3D809851; +sub.f32 f2086, f2414, f2085; +mul.f32 f2087, f1950, 0f3D809851; +fma.rn.f32 f2088, f1932, 0fBF7F7EAE, f2087; +mul.f32 f2090, f1994, 0fBF45405B; +mul.f32 f2413, f1976, 0fBF232E38; +sub.f32 f2091, f2413, f2090; +mul.f32 f2092, f1994, 0fBF232E38; +fma.rn.f32 f2093, f1976, 0fBF45405B, f2092; +mul.f32 f2095, f2038, 0fBE00575B; +mul.f32 f2412, f2020, 0fBF7DFB3B; +sub.f32 f2096, f2412, f2095; +mul.f32 f2097, f2038, 0fBF7DFB3B; +fma.rn.f32 f2098, f2020, 0fBE00575B, f2097; +mul.f32 f2100, f1898, 0fBF5825E0; +mul.f32 f2411, f1880, 0f3F092BF2; +sub.f32 f2101, f2411, f2100; +mul.f32 f2102, f1898, 0f3F092BF2; +fma.rn.f32 f2103, f1880, 0fBF5825E0, f2102; +mul.f32 f2409, f1924, 0fBED9FFBE; +mul.f32 f2410, f1942, 0fBF67A2BF; +sub.f32 f2106, f2409, f2410; +mul.f32 f2107, f1942, 0fBED9FFBE; +fma.rn.f32 f2108, f1924, 0fBF67A2BF, f2107; +mul.f32 f2407, f1968, 0fBF7DFB3B; +mul.f32 f2408, f1986, 0fBE00575B; +sub.f32 f2111, f2407, f2408; +mul.f32 f2112, f1986, 0fBF7DFB3B; +fma.rn.f32 f2113, f1968, 0fBE00575B, f2112; +mul.f32 f2405, f2012, 0fBF232E38; +mul.f32 f2406, f2030, 0f3F45405B; +sub.f32 f2116, f2405, f2406; +mul.f32 f2117, f2030, 0fBF232E38; +fma.rn.f32 f2118, f2012, 0f3F45405B, f2117; +add.f32 f2119, f1866, f1998; +add.f32 f2121, f1910, f1954; +mul.f32 f2126, f2121, 0f3F4F1BBD; +fma.rn.f32 f2404, f2119, 0f3E9E377A, f1822; +sub.f32 f2127, f2404, f2126; +add.f32 f2403, f1870, f2002; +sub.f32 f2128, f1870, f2002; +add.f32 f2402, f1914, f1958; +sub.f32 f2130, f1914, f1958; +mul.f32 f2131, f2130, 0fBF167918; +mul.f32 f2401, f2128, 0f3F737871; +sub.f32 f2132, f2131, f2401; +add.f32 f2400, f1822, f2119; +mul.f32 f2133, f2119, 0f3F4F1BBD; +sub.f32 f2134, f1822, f2133; +fma.rn.f32 f2135, f2121, 0f3E9E377A, f2134; +mul.f32 f2136, f2128, 0f3F167918; +mul.f32 f2137, f2130, 0f3F737871; +sub.f32 f2138, f2137, f2136; +mul.f32 f2140, f2402, 0f3F4F1BBD; +fma.rn.f32 f2399, f2403, 0f3E9E377A, f1826; +sub.f32 f2141, f2399, f2140; +sub.f32 f2142, f1866, 
f1998; +sub.f32 f2144, f1910, f1954; +mul.f32 f2145, f2144, 0fBF167918; +mul.f32 f2398, f2142, 0f3F737871; +sub.f32 f2146, f2145, f2398; +add.f32 f2397, f1826, f2403; +mul.f32 f2147, f2403, 0f3F4F1BBD; +sub.f32 f2148, f1826, f2147; +fma.rn.f32 f2149, f2402, 0f3E9E377A, f2148; +mul.f32 f2150, f2142, 0f3F167918; +mul.f32 f2151, f2144, 0f3F737871; +sub.f32 f2152, f2151, f2150; +add.f32 f2153, f2041, f2056; +add.f32 f2155, f2046, f2051; +fma.rn.f32 f2395, f2153, 0f3E9E377A, f1835; +mul.f32 f2396, f2155, 0f3F4F1BBD; +sub.f32 f2161, f2395, f2396; +add.f32 f2394, f2043, f2058; +sub.f32 f2162, f2043, f2058; +add.f32 f2393, f2048, f2053; +sub.f32 f2164, f2048, f2053; +mul.f32 f2391, f2162, 0f3F737871; +mul.f32 f2392, f2164, 0fBF167918; +sub.f32 f2166, f2392, f2391; +add.f32 f2390, f1835, f2153; +mul.f32 f2167, f2153, 0f3F4F1BBD; +sub.f32 f2168, f1835, f2167; +fma.rn.f32 f2169, f2155, 0f3E9E377A, f2168; +mul.f32 f2170, f2162, 0f3F167918; +mul.f32 f2171, f2164, 0f3F737871; +sub.f32 f2172, f2171, f2170; +fma.rn.f32 f2388, f2394, 0f3E9E377A, f1853; +mul.f32 f2389, f2393, 0f3F4F1BBD; +sub.f32 f2175, f2388, f2389; +sub.f32 f2176, f2041, f2056; +sub.f32 f2178, f2046, f2051; +mul.f32 f2386, f2176, 0f3F737871; +mul.f32 f2387, f2178, 0fBF167918; +sub.f32 f2180, f2387, f2386; +add.f32 f2385, f1853, f2394; +mul.f32 f2181, f2394, 0f3F4F1BBD; +sub.f32 f2182, f1853, f2181; +fma.rn.f32 f2183, f2393, 0f3E9E377A, f2182; +mul.f32 f2184, f2176, 0f3F167918; +mul.f32 f2185, f2178, 0f3F737871; +sub.f32 f2186, f2185, f2184; +add.f32 f2187, f2061, f2076; +add.f32 f2189, f2066, f2071; +mul.f32 f2194, f2189, 0f3F4F1BBD; +fma.rn.f32 f2384, f2187, 0f3E9E377A, f1843; +sub.f32 f2195, f2384, f2194; +add.f32 f2383, f2063, f2078; +sub.f32 f2196, f2063, f2078; +add.f32 f2382, f2068, f2073; +sub.f32 f2198, f2068, f2073; +mul.f32 f2380, f2196, 0f3F737871; +mul.f32 f2381, f2198, 0fBF167918; +sub.f32 f2200, f2381, f2380; +add.f32 f2379, f1843, f2187; +mul.f32 f2201, f2187, 0f3F4F1BBD; +sub.f32 f2202, f1843, 
f2201; +fma.rn.f32 f2203, f2189, 0f3E9E377A, f2202; +mul.f32 f2204, f2196, 0f3F167918; +mul.f32 f2205, f2198, 0f3F737871; +sub.f32 f2206, f2205, f2204; +mul.f32 f2208, f2382, 0f3F4F1BBD; +fma.rn.f32 f2378, f2383, 0f3E9E377A, f1861; +sub.f32 f2209, f2378, f2208; +sub.f32 f2210, f2061, f2076; +sub.f32 f2212, f2066, f2071; +mul.f32 f2376, f2210, 0f3F737871; +mul.f32 f2377, f2212, 0fBF167918; +sub.f32 f2214, f2377, f2376; +add.f32 f2375, f1861, f2383; +mul.f32 f2215, f2383, 0f3F4F1BBD; +sub.f32 f2216, f1861, f2215; +fma.rn.f32 f2217, f2382, 0f3E9E377A, f2216; +mul.f32 f2218, f2210, 0f3F167918; +mul.f32 f2219, f2212, 0f3F737871; +sub.f32 f2220, f2219, f2218; +add.f32 f2221, f2081, f2096; +add.f32 f2223, f2086, f2091; +mul.f32 f2228, f2223, 0f3F4F1BBD; +fma.rn.f32 f2374, f2221, 0f3E9E377A, f1844; +sub.f32 f2229, f2374, f2228; +add.f32 f2373, f2083, f2098; +sub.f32 f2230, f2083, f2098; +add.f32 f2372, f2088, f2093; +sub.f32 f2232, f2088, f2093; +mul.f32 f2370, f2230, 0f3F737871; +mul.f32 f2371, f2232, 0fBF167918; +sub.f32 f2234, f2371, f2370; +add.f32 f2369, f1844, f2221; +mul.f32 f2235, f2221, 0f3F4F1BBD; +sub.f32 f2236, f1844, f2235; +fma.rn.f32 f2237, f2223, 0f3E9E377A, f2236; +mul.f32 f2238, f2230, 0f3F167918; +mul.f32 f2239, f2232, 0f3F737871; +sub.f32 f2240, f2239, f2238; +mul.f32 f2242, f2372, 0f3F4F1BBD; +fma.rn.f32 f2368, f2373, 0f3E9E377A, f1862; +sub.f32 f2243, f2368, f2242; +sub.f32 f2244, f2081, f2096; +sub.f32 f2246, f2086, f2091; +mul.f32 f2247, f2246, 0fBF167918; +mul.f32 f2367, f2244, 0f3F737871; +sub.f32 f2248, f2247, f2367; +add.f32 f2366, f1862, f2373; +mul.f32 f2249, f2373, 0f3F4F1BBD; +sub.f32 f2250, f1862, f2249; +fma.rn.f32 f2251, f2372, 0f3E9E377A, f2250; +mul.f32 f2252, f2244, 0f3F167918; +mul.f32 f2253, f2246, 0f3F737871; +sub.f32 f2254, f2253, f2252; +add.f32 f2255, f2101, f2116; +add.f32 f2257, f2106, f2111; +fma.rn.f32 f2364, f2255, 0f3E9E377A, f1836; +mul.f32 f2365, f2257, 0f3F4F1BBD; +sub.f32 f2263, f2364, f2365; +add.f32 f2363, f2103, 
f2118; +sub.f32 f2264, f2103, f2118; +add.f32 f2362, f2108, f2113; +sub.f32 f2266, f2108, f2113; +mul.f32 f2360, f2264, 0f3F737871; +mul.f32 f2361, f2266, 0fBF167918; +sub.f32 f2268, f2361, f2360; +add.f32 f2359, f1836, f2255; +mul.f32 f2269, f2255, 0f3F4F1BBD; +sub.f32 f2270, f1836, f2269; +fma.rn.f32 f2271, f2257, 0f3E9E377A, f2270; +mul.f32 f2272, f2264, 0f3F167918; +mul.f32 f2273, f2266, 0f3F737871; +sub.f32 f2274, f2273, f2272; +fma.rn.f32 f2357, f2363, 0f3E9E377A, f1854; +mul.f32 f2358, f2362, 0f3F4F1BBD; +sub.f32 f2277, f2357, f2358; +sub.f32 f2278, f2101, f2116; +sub.f32 f2280, f2106, f2111; +mul.f32 f2355, f2278, 0f3F737871; +mul.f32 f2356, f2280, 0fBF167918; +sub.f32 f2282, f2356, f2355; +add.f32 f2354, f1854, f2363; +mul.f32 f2283, f2363, 0f3F4F1BBD; +sub.f32 f2284, f1854, f2283; +fma.rn.f32 f2285, f2362, 0f3E9E377A, f2284; +mul.f32 f2286, f2278, 0f3F167918; +mul.f32 f2287, f2280, 0f3F737871; +sub.f32 f2288, f2287, f2286; +add.f32 %1, f2402, f2397; +add.f32 %0, f2121, f2400; +add.f32 %3, f2393, f2385; +add.f32 %2, f2155, f2390; +add.f32 %5, f2382, f2375; +add.f32 %4, f2189, f2379; +add.f32 %7, f2372, f2366; +add.f32 %6, f2223, f2369; +add.f32 %9, f2362, f2354; +add.f32 %8, f2257, f2359; +add.f32 %11, f2146, f2141; +sub.f32 %10, f2127, f2132; +add.f32 %13, f2180, f2175; +sub.f32 %12, f2161, f2166; +sub.f32 %14, f2195, f2200; +add.f32 %15, f2214, f2209; +sub.f32 %16, f2229, f2234; +add.f32 %17, f2248, f2243; +sub.f32 %18, f2263, f2268; +add.f32 %19, f2282, f2277; +sub.f32 %20, f2135, f2138; +add.f32 %21, f2152, f2149; +add.f32 %23, f2186, f2183; +sub.f32 %22, f2169, f2172; +add.f32 %25, f2220, f2217; +sub.f32 %24, f2203, f2206; +add.f32 %27, f2254, f2251; +sub.f32 %26, f2237, f2240; +sub.f32 %28, f2271, f2274; +add.f32 %29, f2288, f2285; +sub.f32 %31, f2149, f2152; +add.f32 %30, f2138, f2135; +sub.f32 %33, f2183, f2186; +add.f32 %32, f2172, f2169; +sub.f32 %35, f2217, f2220; +add.f32 %34, f2206, f2203; +sub.f32 %37, f2251, f2254; +add.f32 %36, f2240, 
f2237; +sub.f32 %39, f2285, f2288; +add.f32 %38, f2274, f2271; +sub.f32 %41, f2141, f2146; +add.f32 %40, f2132, f2127; +sub.f32 %43, f2175, f2180; +add.f32 %42, f2166, f2161; +sub.f32 %45, f2209, f2214; +add.f32 %44, f2200, f2195; +sub.f32 %47, f2243, f2248; +add.f32 %46, f2234, f2229; +sub.f32 %49, f2277, f2282; +add.f32 %48, f2268, f2263; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_15625), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), 
"f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[11].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[17].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<170, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2239>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 62500, r2; +add.f32 f101, %66, %106; +add.f32 f102, %53, f101; +add.f32 f103, %79, %93; +add.f32 f104, f103, f102; +add.f32 f105, %68, %108; +add.f32 f106, %54, f105; +add.f32 f107, %81, %94; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %53; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %68, %108; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %81, %94; +mul.f32 f115, f114, 0fBF167918; +sub.f32 f116, f115, f113; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %53, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f127, f105, 0f3E9E377A, %54; +mul.f32 f128, f107, 0f3F4F1BBD; +sub.f32 f129, f127, f128; +sub.f32 f130, %66, %106; +mul.f32 f131, f130, 0f3F737871; +sub.f32 f132, %79, %93; +mul.f32 f133, f132, 0fBF167918; +sub.f32 f134, f133, f131; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %54, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, 
%69, %109; +add.f32 f146, %55, f145; +add.f32 f147, %82, %95; +add.f32 f148, f147, f146; +add.f32 f149, %70, %110; +add.f32 f150, %57, f149; +add.f32 f151, %84, %97; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %55; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %70, %110; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %84, %97; +mul.f32 f159, f158, 0fBF167918; +sub.f32 f160, f159, f157; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %55, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +fma.rn.f32 f171, f149, 0f3E9E377A, %57; +mul.f32 f172, f151, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %69, %109; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %82, %95; +mul.f32 f177, f176, 0fBF167918; +sub.f32 f178, f177, f175; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %57, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %71, %111; +add.f32 f190, %58, f189; +add.f32 f191, %85, %98; +add.f32 f192, f191, f190; +add.f32 f193, %73, %113; +add.f32 f194, %60, f193; +add.f32 f195, %86, %100; +add.f32 f196, f195, f194; +fma.rn.f32 f197, f189, 0f3E9E377A, %58; +mul.f32 f198, f191, 0f3F4F1BBD; +sub.f32 f199, f197, f198; +sub.f32 f200, %73, %113; +mul.f32 f201, f200, 0f3F737871; +sub.f32 f202, %86, %100; +mul.f32 f203, f202, 0fBF167918; +sub.f32 f204, f203, f201; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %58, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 
f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f215, f193, 0f3E9E377A, %60; +mul.f32 f216, f195, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, %71, %111; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, %85, %98; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %60, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %74, %114; +add.f32 f234, %61, f233; +add.f32 f235, %87, %101; +add.f32 f236, f235, f234; +add.f32 f237, %76, %116; +add.f32 f238, %62, f237; +add.f32 f239, %89, %102; +add.f32 f240, f239, f238; +fma.rn.f32 f241, f233, 0f3E9E377A, %61; +mul.f32 f242, f235, 0f3F4F1BBD; +sub.f32 f243, f241, f242; +sub.f32 f244, %76, %116; +mul.f32 f245, f244, 0f3F737871; +sub.f32 f246, %89, %102; +mul.f32 f247, f246, 0fBF167918; +sub.f32 f248, f247, f245; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %61, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +fma.rn.f32 f259, f237, 0f3E9E377A, %62; +mul.f32 f260, f239, 0f3F4F1BBD; +sub.f32 f261, f259, f260; +sub.f32 f262, %74, %114; +mul.f32 f263, f262, 0f3F737871; +sub.f32 f264, %87, %101; +mul.f32 f265, f264, 0fBF167918; +sub.f32 f266, f265, f263; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %62, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %77, %117; +add.f32 f278, %63, f277; +add.f32 f279, %90, %103; +add.f32 f280, f279, 
f278; +add.f32 f281, %78, %118; +add.f32 f282, %65, f281; +add.f32 f283, %92, %105; +add.f32 f284, f283, f282; +fma.rn.f32 f285, f277, 0f3E9E377A, %63; +mul.f32 f286, f279, 0f3F4F1BBD; +sub.f32 f287, f285, f286; +sub.f32 f288, %78, %118; +mul.f32 f289, f288, 0f3F737871; +sub.f32 f290, %92, %105; +mul.f32 f291, f290, 0fBF167918; +sub.f32 f292, f291, f289; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %63, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +fma.rn.f32 f303, f281, 0f3E9E377A, %65; +mul.f32 f304, f283, 0f3F4F1BBD; +sub.f32 f305, f303, f304; +sub.f32 f306, %77, %117; +mul.f32 f307, f306, 0f3F737871; +sub.f32 f308, %90, %103; +mul.f32 f309, f308, 0fBF167918; +sub.f32 f310, f309, f307; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %65, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mov.u32 r4, %tid.x; +mul.f32 f321, f161, 0f3F77F511; +mul.f32 f322, f179, 0fBE7EA890; +sub.f32 f323, f321, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f326, f205, 0f3F6055A2; +mul.f32 f327, f223, 0fBEF6A86B; +sub.f32 f328, f326, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f331, f249, 0f3F3A9DB0; +mul.f32 f332, f267, 0fBF2F3E7B; +sub.f32 f333, f331, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f336, f293, 0f3F092BF2; +mul.f32 f337, f311, 0fBF5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f341, f169, 0f3F6055A2; +mul.f32 f342, f187, 0fBEF6A86B; +sub.f32 f343, f341, f342; 
+mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f346, f213, 0f3F092BF2; +mul.f32 f347, f231, 0fBF5825E0; +sub.f32 f348, f346, f347; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f351, f257, 0f3D809851; +mul.f32 f352, f275, 0fBF7F7EAE; +sub.f32 f353, f351, f352; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f356, f301, 0fBED9FFBE; +mul.f32 f357, f319, 0fBF67A2BF; +sub.f32 f358, f356, f357; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f361, f170, 0f3F3A9DB0; +mul.f32 f362, f188, 0fBF2F3E7B; +sub.f32 f363, f361, f362; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f366, f214, 0f3D809851; +mul.f32 f367, f232, 0fBF7F7EAE; +sub.f32 f368, f366, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f371, f258, 0fBF232E38; +mul.f32 f372, f276, 0fBF45405B; +sub.f32 f373, f371, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f376, f302, 0fBF7DFB3B; +mul.f32 f377, f320, 0fBE00575B; +sub.f32 f378, f376, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f381, f162, 0f3F092BF2; +mul.f32 f382, f180, 0fBF5825E0; +sub.f32 f383, f381, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f386, f206, 0fBED9FFBE; +mul.f32 f387, f224, 0fBF67A2BF; +sub.f32 f388, f386, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f391, f250, 0fBF7DFB3B; +mul.f32 f392, f268, 0fBE00575B; +sub.f32 f393, f391, f392; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f396, f294, 0fBF232E38; +mul.f32 f397, f312, 0f3F45405B; +sub.f32 f398, f396, f397; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f402, f104, f401; +add.f32 f403, f192, 
f236; +add.f32 f404, f403, f402; +add.f32 f405, f152, f284; +add.f32 f406, f108, f405; +add.f32 f407, f196, f240; +add.f32 f408, f407, f406; +fma.rn.f32 f409, f401, 0f3E9E377A, f104; +mul.f32 f410, f403, 0f3F4F1BBD; +sub.f32 f411, f409, f410; +sub.f32 f412, f152, f284; +mul.f32 f413, f412, 0f3F737871; +sub.f32 f414, f196, f240; +mul.f32 f415, f414, 0fBF167918; +sub.f32 f416, f415, f413; +sub.f32 f417, f411, f416; +add.f32 f418, f416, f411; +mul.f32 f419, f401, 0f3F4F1BBD; +sub.f32 f420, f104, f419; +fma.rn.f32 f421, f403, 0f3E9E377A, f420; +mul.f32 f422, f412, 0f3F167918; +mul.f32 f423, f414, 0f3F737871; +sub.f32 f424, f423, f422; +sub.f32 f425, f421, f424; +add.f32 f426, f424, f421; +fma.rn.f32 f427, f405, 0f3E9E377A, f108; +mul.f32 f428, f407, 0f3F4F1BBD; +sub.f32 f429, f427, f428; +sub.f32 f430, f148, f280; +mul.f32 f431, f430, 0f3F737871; +sub.f32 f432, f192, f236; +mul.f32 f433, f432, 0fBF167918; +sub.f32 f434, f433, f431; +add.f32 f435, f434, f429; +sub.f32 f436, f429, f434; +mul.f32 f437, f405, 0f3F4F1BBD; +sub.f32 f438, f108, f437; +fma.rn.f32 f439, f407, 0f3E9E377A, f438; +mul.f32 f440, f430, 0f3F167918; +mul.f32 f441, f432, 0f3F737871; +sub.f32 f442, f441, f440; +add.f32 f443, f442, f439; +sub.f32 f444, f439, f442; +add.f32 f445, f323, f338; +add.f32 f446, f117, f445; +add.f32 f447, f328, f333; +add.f32 f448, f447, f446; +add.f32 f449, f325, f340; +add.f32 f450, f135, f449; +add.f32 f451, f330, f335; +add.f32 f452, f451, f450; +fma.rn.f32 f453, f445, 0f3E9E377A, f117; +mul.f32 f454, f447, 0f3F4F1BBD; +sub.f32 f455, f453, f454; +sub.f32 f456, f325, f340; +mul.f32 f457, f456, 0f3F737871; +sub.f32 f458, f330, f335; +mul.f32 f459, f458, 0fBF167918; +sub.f32 f460, f459, f457; +sub.f32 f461, f455, f460; +add.f32 f462, f460, f455; +mul.f32 f463, f445, 0f3F4F1BBD; +sub.f32 f464, f117, f463; +fma.rn.f32 f465, f447, 0f3E9E377A, f464; +mul.f32 f466, f456, 0f3F167918; +mul.f32 f467, f458, 0f3F737871; +sub.f32 f468, f467, f466; +sub.f32 f469, f465, f468; +add.f32 
f470, f468, f465; +fma.rn.f32 f471, f449, 0f3E9E377A, f135; +mul.f32 f472, f451, 0f3F4F1BBD; +sub.f32 f473, f471, f472; +sub.f32 f474, f323, f338; +mul.f32 f475, f474, 0f3F737871; +sub.f32 f476, f328, f333; +mul.f32 f477, f476, 0fBF167918; +sub.f32 f478, f477, f475; +add.f32 f479, f478, f473; +sub.f32 f480, f473, f478; +mul.f32 f481, f449, 0f3F4F1BBD; +sub.f32 f482, f135, f481; +fma.rn.f32 f483, f451, 0f3E9E377A, f482; +mul.f32 f484, f474, 0f3F167918; +mul.f32 f485, f476, 0f3F737871; +sub.f32 f486, f485, f484; +add.f32 f487, f486, f483; +sub.f32 f488, f483, f486; +add.f32 f489, f343, f358; +add.f32 f490, f125, f489; +add.f32 f491, f348, f353; +add.f32 f492, f491, f490; +add.f32 f493, f345, f360; +add.f32 f494, f143, f493; +add.f32 f495, f350, f355; +add.f32 f496, f495, f494; +fma.rn.f32 f497, f489, 0f3E9E377A, f125; +mul.f32 f498, f491, 0f3F4F1BBD; +sub.f32 f499, f497, f498; +sub.f32 f500, f345, f360; +mul.f32 f501, f500, 0f3F737871; +sub.f32 f502, f350, f355; +mul.f32 f503, f502, 0fBF167918; +sub.f32 f504, f503, f501; +sub.f32 f505, f499, f504; +add.f32 f506, f504, f499; +mul.f32 f507, f489, 0f3F4F1BBD; +sub.f32 f508, f125, f507; +fma.rn.f32 f509, f491, 0f3E9E377A, f508; +mul.f32 f510, f500, 0f3F167918; +mul.f32 f511, f502, 0f3F737871; +sub.f32 f512, f511, f510; +sub.f32 f513, f509, f512; +add.f32 f514, f512, f509; +fma.rn.f32 f515, f493, 0f3E9E377A, f143; +mul.f32 f516, f495, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f343, f358; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f348, f353; +mul.f32 f521, f520, 0fBF167918; +sub.f32 f522, f521, f519; +add.f32 f523, f522, f517; +sub.f32 f524, f517, f522; +mul.f32 f525, f493, 0f3F4F1BBD; +sub.f32 f526, f143, f525; +fma.rn.f32 f527, f495, 0f3E9E377A, f526; +mul.f32 f528, f518, 0f3F167918; +mul.f32 f529, f520, 0f3F737871; +sub.f32 f530, f529, f528; +add.f32 f531, f530, f527; +sub.f32 f532, f527, f530; +add.f32 f533, f363, f378; +add.f32 f534, f126, f533; +add.f32 f535, f368, f373; +add.f32 f536, f535, f534; 
+add.f32 f537, f365, f380; +add.f32 f538, f144, f537; +add.f32 f539, f370, f375; +add.f32 f540, f539, f538; +fma.rn.f32 f541, f533, 0f3E9E377A, f126; +mul.f32 f542, f535, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f365, f380; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f370, f375; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +sub.f32 f549, f543, f548; +add.f32 f550, f548, f543; +mul.f32 f551, f533, 0f3F4F1BBD; +sub.f32 f552, f126, f551; +fma.rn.f32 f553, f535, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +sub.f32 f557, f553, f556; +add.f32 f558, f556, f553; +fma.rn.f32 f559, f537, 0f3E9E377A, f144; +mul.f32 f560, f539, 0f3F4F1BBD; +sub.f32 f561, f559, f560; +sub.f32 f562, f363, f378; +mul.f32 f563, f562, 0f3F737871; +sub.f32 f564, f368, f373; +mul.f32 f565, f564, 0fBF167918; +sub.f32 f566, f565, f563; +add.f32 f567, f566, f561; +sub.f32 f568, f561, f566; +mul.f32 f569, f537, 0f3F4F1BBD; +sub.f32 f570, f144, f569; +fma.rn.f32 f571, f539, 0f3E9E377A, f570; +mul.f32 f572, f562, 0f3F167918; +mul.f32 f573, f564, 0f3F737871; +sub.f32 f574, f573, f572; +add.f32 f575, f574, f571; +sub.f32 f576, f571, f574; +add.f32 f577, f383, f398; +add.f32 f578, f118, f577; +add.f32 f579, f388, f393; +add.f32 f580, f579, f578; +add.f32 f581, f385, f400; +add.f32 f582, f136, f581; +add.f32 f583, f390, f395; +add.f32 f584, f583, f582; +fma.rn.f32 f585, f577, 0f3E9E377A, f118; +mul.f32 f586, f579, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f385, f400; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f390, f395; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +sub.f32 f593, f587, f592; +add.f32 f594, f592, f587; +mul.f32 f595, f577, 0f3F4F1BBD; +sub.f32 f596, f118, f595; +fma.rn.f32 f597, f579, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +sub.f32 f601, f597, f600; +add.f32 f602, f600, f597; +fma.rn.f32 f603, 
f581, 0f3E9E377A, f136; +mul.f32 f604, f583, 0f3F4F1BBD; +sub.f32 f605, f603, f604; +sub.f32 f606, f383, f398; +mul.f32 f607, f606, 0f3F737871; +sub.f32 f608, f388, f393; +mul.f32 f609, f608, 0fBF167918; +sub.f32 f610, f609, f607; +add.f32 f611, f610, f605; +sub.f32 f612, f605, f610; +mul.f32 f613, f581, 0f3F4F1BBD; +sub.f32 f614, f136, f613; +fma.rn.f32 f615, f583, 0f3E9E377A, f614; +mul.f32 f616, f606, 0f3F167918; +mul.f32 f617, f608, 0f3F737871; +sub.f32 f618, f617, f616; +add.f32 f619, f618, f615; +sub.f32 f620, f615, f618; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f621, f622}, [rd6]; +mul.f32 f625, f621, f448; +mul.f32 f626, f622, f452; +sub.f32 f627, f625, f626; +mul.f32 f628, f621, f452; +fma.rn.f32 f629, f622, f448, f628; +mul.f32 f630, f621, f621; +mul.f32 f631, f622, f622; +sub.f32 f632, f630, f631; +mul.f32 f633, f622, f621; +fma.rn.f32 f634, f622, f621, f633; +mul.f32 f635, f632, f492; +mul.f32 f636, f634, f496; +sub.f32 f637, f635, f636; +mul.f32 f638, f632, f496; +fma.rn.f32 f639, f634, f492, f638; +mul.f32 f640, f621, f632; +mul.f32 f641, f622, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f621, f634; +fma.rn.f32 f644, f622, f632, f643; +mul.f32 f645, f642, f536; +mul.f32 f646, f644, f540; +sub.f32 f647, f645, f646; +mul.f32 f648, f642, f540; +fma.rn.f32 f649, f644, f536, f648; +mul.f32 f650, f621, f642; +mul.f32 f651, f622, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f621, f644; +fma.rn.f32 f654, f622, f642, f653; +mul.f32 f655, f652, f580; +mul.f32 f656, f654, f584; +sub.f32 f657, f655, f656; +mul.f32 f658, f652, f584; +fma.rn.f32 f659, f654, f580, f658; +mul.f32 f660, f621, f652; +mul.f32 f661, f622, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f621, f654; +fma.rn.f32 f664, f622, f652, f663; +mul.f32 f665, f662, f417; +mul.f32 f666, f664, f435; +sub.f32 f667, f665, 
f666; +mul.f32 f668, f662, f435; +fma.rn.f32 f669, f664, f417, f668; +mul.f32 f670, f621, f662; +mul.f32 f671, f622, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f621, f664; +fma.rn.f32 f674, f622, f662, f673; +mul.f32 f675, f672, f461; +mul.f32 f676, f674, f479; +sub.f32 f677, f675, f676; +mul.f32 f678, f672, f479; +fma.rn.f32 f679, f674, f461, f678; +mul.f32 f680, f621, f672; +mul.f32 f681, f622, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f621, f674; +fma.rn.f32 f684, f622, f672, f683; +mul.f32 f685, f682, f505; +mul.f32 f686, f684, f523; +sub.f32 f687, f685, f686; +mul.f32 f688, f682, f523; +fma.rn.f32 f689, f684, f505, f688; +mul.f32 f690, f621, f682; +mul.f32 f691, f622, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f621, f684; +fma.rn.f32 f694, f622, f682, f693; +mul.f32 f695, f692, f549; +mul.f32 f696, f694, f567; +sub.f32 f697, f695, f696; +mul.f32 f698, f692, f567; +fma.rn.f32 f699, f694, f549, f698; +mul.f32 f700, f621, f692; +mul.f32 f701, f622, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f621, f694; +fma.rn.f32 f704, f622, f692, f703; +mul.f32 f705, f702, f593; +mul.f32 f706, f704, f611; +sub.f32 f707, f705, f706; +mul.f32 f708, f702, f611; +fma.rn.f32 f709, f704, f593, f708; +mul.f32 f710, f621, f702; +mul.f32 f711, f622, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f621, f704; +fma.rn.f32 f714, f622, f702, f713; +mul.f32 f715, f712, f425; +mul.f32 f716, f714, f443; +sub.f32 f717, f715, f716; +mul.f32 f718, f712, f443; +fma.rn.f32 f719, f714, f425, f718; +mul.f32 f720, f621, f712; +mul.f32 f721, f622, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f621, f714; +fma.rn.f32 f724, f622, f712, f723; +mul.f32 f725, f722, f469; +mul.f32 f726, f724, f487; +sub.f32 f727, f725, f726; +mul.f32 f728, f722, f487; +fma.rn.f32 f729, f724, f469, f728; +mul.f32 f730, f621, f722; +mul.f32 f731, f622, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f621, f724; +fma.rn.f32 f734, f622, f722, f733; +mul.f32 f735, f732, f513; +mul.f32 f736, f734, f531; 
+sub.f32 f737, f735, f736; +mul.f32 f738, f732, f531; +fma.rn.f32 f739, f734, f513, f738; +mul.f32 f740, f621, f732; +mul.f32 f741, f622, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f621, f734; +fma.rn.f32 f744, f622, f732, f743; +mul.f32 f745, f742, f557; +mul.f32 f746, f744, f575; +sub.f32 f747, f745, f746; +mul.f32 f748, f742, f575; +fma.rn.f32 f749, f744, f557, f748; +mul.f32 f750, f621, f742; +mul.f32 f751, f622, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f621, f744; +fma.rn.f32 f754, f622, f742, f753; +mul.f32 f755, f752, f601; +mul.f32 f756, f754, f619; +sub.f32 f757, f755, f756; +mul.f32 f758, f752, f619; +fma.rn.f32 f759, f754, f601, f758; +mul.f32 f760, f621, f752; +mul.f32 f761, f622, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f621, f754; +fma.rn.f32 f764, f622, f752, f763; +mul.f32 f765, f762, f426; +mul.f32 f766, f764, f444; +sub.f32 f767, f765, f766; +mul.f32 f768, f762, f444; +fma.rn.f32 f769, f764, f426, f768; +mul.f32 f770, f621, f762; +mul.f32 f771, f622, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f621, f764; +fma.rn.f32 f774, f622, f762, f773; +mul.f32 f775, f772, f470; +mul.f32 f776, f774, f488; +sub.f32 f777, f775, f776; +mul.f32 f778, f772, f488; +fma.rn.f32 f779, f774, f470, f778; +mul.f32 f780, f621, f772; +mul.f32 f781, f622, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f621, f774; +fma.rn.f32 f784, f622, f772, f783; +mul.f32 f785, f782, f514; +mul.f32 f786, f784, f532; +sub.f32 f787, f785, f786; +mul.f32 f788, f782, f532; +fma.rn.f32 f789, f784, f514, f788; +mul.f32 f790, f621, f782; +mul.f32 f791, f622, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f621, f784; +fma.rn.f32 f794, f622, f782, f793; +mul.f32 f795, f792, f558; +mul.f32 f796, f794, f576; +sub.f32 f797, f795, f796; +mul.f32 f798, f792, f576; +fma.rn.f32 f799, f794, f558, f798; +mul.f32 f800, f621, f792; +mul.f32 f801, f622, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f621, f794; +fma.rn.f32 f804, f622, f792, f803; +mul.f32 f805, f802, f602; +mul.f32 
f806, f804, f620; +sub.f32 f807, f805, f806; +mul.f32 f808, f802, f620; +fma.rn.f32 f809, f804, f602, f808; +mul.f32 f810, f621, f802; +mul.f32 f811, f622, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f621, f804; +fma.rn.f32 f814, f622, f802, f813; +mul.f32 f815, f812, f418; +mul.f32 f816, f814, f436; +sub.f32 f817, f815, f816; +mul.f32 f818, f812, f436; +fma.rn.f32 f819, f814, f418, f818; +mul.f32 f820, f621, f812; +mul.f32 f821, f622, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f621, f814; +fma.rn.f32 f824, f622, f812, f823; +mul.f32 f825, f822, f462; +mul.f32 f826, f824, f480; +sub.f32 f827, f825, f826; +mul.f32 f828, f822, f480; +fma.rn.f32 f829, f824, f462, f828; +mul.f32 f830, f621, f822; +mul.f32 f831, f622, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f621, f824; +fma.rn.f32 f834, f622, f822, f833; +mul.f32 f835, f832, f506; +mul.f32 f836, f834, f524; +sub.f32 f837, f835, f836; +mul.f32 f838, f832, f524; +fma.rn.f32 f839, f834, f506, f838; +mul.f32 f840, f621, f832; +mul.f32 f841, f622, f834; +sub.f32 f842, f840, f841; +mul.f32 f843, f621, f834; +fma.rn.f32 f844, f622, f832, f843; +mul.f32 f845, f842, f550; +mul.f32 f846, f844, f568; +sub.f32 f847, f845, f846; +mul.f32 f848, f842, f568; +fma.rn.f32 f849, f844, f550, f848; +mul.f32 f850, f621, f842; +mul.f32 f851, f622, f844; +sub.f32 f852, f850, f851; +mul.f32 f853, f621, f844; +fma.rn.f32 f854, f622, f842, f853; +mul.f32 f855, f852, f594; +mul.f32 f856, f854, f612; +sub.f32 f857, f855, f856; +mul.f32 f858, f852, f612; +fma.rn.f32 f859, f854, f594, f858; +mad.lo.s32 r8, r5, 62500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f404; +st.shared.f32 [r9+4], f627; +st.shared.f32 [r9+8], f637; +st.shared.f32 [r9+12], f647; +st.shared.f32 [r9+16], f657; +st.shared.f32 [r9+20], f667; +st.shared.f32 [r9+24], f677; +st.shared.f32 [r9+28], f687; +st.shared.f32 [r9+32], f697; +st.shared.f32 [r9+36], f707; +st.shared.f32 [r9+40], f717; +st.shared.f32 [r9+44], f727; +st.shared.f32 
[r9+48], f737; +st.shared.f32 [r9+52], f747; +st.shared.f32 [r9+56], f757; +st.shared.f32 [r9+60], f767; +st.shared.f32 [r9+64], f777; +st.shared.f32 [r9+68], f787; +st.shared.f32 [r9+72], f797; +st.shared.f32 [r9+76], f807; +st.shared.f32 [r9+80], f817; +st.shared.f32 [r9+84], f827; +st.shared.f32 [r9+88], f837; +st.shared.f32 [r9+92], f847; +st.shared.f32 [r9+96], f857; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f860, [r10]; +ld.shared.f32 f861, [r10+2500]; +ld.shared.f32 f862, [r10+5000]; +ld.shared.f32 f863, [r10+7500]; +ld.shared.f32 f864, [r10+10000]; +ld.shared.f32 f865, [r10+12500]; +ld.shared.f32 f866, [r10+15000]; +ld.shared.f32 f867, [r10+17500]; +ld.shared.f32 f868, [r10+20000]; +ld.shared.f32 f869, [r10+22500]; +ld.shared.f32 f870, [r10+25000]; +ld.shared.f32 f871, [r10+27500]; +ld.shared.f32 f872, [r10+30000]; +ld.shared.f32 f873, [r10+32500]; +ld.shared.f32 f874, [r10+35000]; +ld.shared.f32 f875, [r10+37500]; +ld.shared.f32 f876, [r10+40000]; +ld.shared.f32 f877, [r10+42500]; +ld.shared.f32 f878, [r10+45000]; +ld.shared.f32 f879, [r10+47500]; +ld.shared.f32 f880, [r10+50000]; +ld.shared.f32 f881, [r10+52500]; +ld.shared.f32 f882, [r10+55000]; +ld.shared.f32 f883, [r10+57500]; +ld.shared.f32 f884, [r10+60000]; +barrier.sync 0; +st.shared.f32 [r9], f408; +st.shared.f32 [r9+4], f629; +st.shared.f32 [r9+8], f639; +st.shared.f32 [r9+12], f649; +st.shared.f32 [r9+16], f659; +st.shared.f32 [r9+20], f669; +st.shared.f32 [r9+24], f679; +st.shared.f32 [r9+28], f689; +st.shared.f32 [r9+32], f699; +st.shared.f32 [r9+36], f709; +st.shared.f32 [r9+40], f719; +st.shared.f32 [r9+44], f729; +st.shared.f32 [r9+48], f739; +st.shared.f32 [r9+52], f749; +st.shared.f32 [r9+56], f759; +st.shared.f32 [r9+60], f769; +st.shared.f32 [r9+64], f779; +st.shared.f32 [r9+68], f789; +st.shared.f32 [r9+72], f799; +st.shared.f32 [r9+76], f809; +st.shared.f32 [r9+80], f819; +st.shared.f32 [r9+84], f829; +st.shared.f32 [r9+88], f839; +st.shared.f32 [r9+92], f849; 
+st.shared.f32 [r9+96], f859; +barrier.sync 0; +ld.shared.f32 f885, [r10]; +ld.shared.f32 f886, [r10+2500]; +ld.shared.f32 f887, [r10+5000]; +ld.shared.f32 f888, [r10+7500]; +ld.shared.f32 f889, [r10+10000]; +ld.shared.f32 f890, [r10+12500]; +ld.shared.f32 f891, [r10+15000]; +ld.shared.f32 f892, [r10+17500]; +ld.shared.f32 f893, [r10+20000]; +ld.shared.f32 f894, [r10+22500]; +ld.shared.f32 f895, [r10+25000]; +ld.shared.f32 f896, [r10+27500]; +ld.shared.f32 f897, [r10+30000]; +ld.shared.f32 f898, [r10+32500]; +ld.shared.f32 f899, [r10+35000]; +ld.shared.f32 f900, [r10+37500]; +ld.shared.f32 f901, [r10+40000]; +ld.shared.f32 f902, [r10+42500]; +ld.shared.f32 f903, [r10+45000]; +ld.shared.f32 f904, [r10+47500]; +ld.shared.f32 f905, [r10+50000]; +ld.shared.f32 f906, [r10+52500]; +ld.shared.f32 f907, [r10+55000]; +ld.shared.f32 f908, [r10+57500]; +ld.shared.f32 f909, [r10+60000]; +add.f32 f910, f865, f880; +add.f32 f911, f860, f910; +add.f32 f912, f870, f875; +add.f32 f913, f912, f911; +add.f32 f914, f890, f905; +add.f32 f915, f885, f914; +add.f32 f916, f895, f900; +add.f32 f917, f916, f915; +fma.rn.f32 f918, f910, 0f3E9E377A, f860; +mul.f32 f919, f912, 0f3F4F1BBD; +sub.f32 f920, f918, f919; +sub.f32 f921, f890, f905; +mul.f32 f922, f921, 0f3F737871; +sub.f32 f923, f895, f900; +mul.f32 f924, f923, 0fBF167918; +sub.f32 f925, f924, f922; +sub.f32 f926, f920, f925; +add.f32 f927, f925, f920; +mul.f32 f928, f910, 0f3F4F1BBD; +sub.f32 f929, f860, f928; +fma.rn.f32 f930, f912, 0f3E9E377A, f929; +mul.f32 f931, f921, 0f3F167918; +mul.f32 f932, f923, 0f3F737871; +sub.f32 f933, f932, f931; +sub.f32 f934, f930, f933; +add.f32 f935, f933, f930; +fma.rn.f32 f936, f914, 0f3E9E377A, f885; +mul.f32 f937, f916, 0f3F4F1BBD; +sub.f32 f938, f936, f937; +sub.f32 f939, f865, f880; +mul.f32 f940, f939, 0f3F737871; +sub.f32 f941, f870, f875; +mul.f32 f942, f941, 0fBF167918; +sub.f32 f943, f942, f940; +add.f32 f944, f943, f938; +sub.f32 f945, f938, f943; +mul.f32 f946, f914, 0f3F4F1BBD; 
+sub.f32 f947, f885, f946; +fma.rn.f32 f948, f916, 0f3E9E377A, f947; +mul.f32 f949, f939, 0f3F167918; +mul.f32 f950, f941, 0f3F737871; +sub.f32 f951, f950, f949; +add.f32 f952, f951, f948; +sub.f32 f953, f948, f951; +add.f32 f954, f866, f881; +add.f32 f955, f861, f954; +add.f32 f956, f871, f876; +add.f32 f957, f956, f955; +add.f32 f958, f891, f906; +add.f32 f959, f886, f958; +add.f32 f960, f896, f901; +add.f32 f961, f960, f959; +fma.rn.f32 f962, f954, 0f3E9E377A, f861; +mul.f32 f963, f956, 0f3F4F1BBD; +sub.f32 f964, f962, f963; +sub.f32 f965, f891, f906; +mul.f32 f966, f965, 0f3F737871; +sub.f32 f967, f896, f901; +mul.f32 f968, f967, 0fBF167918; +sub.f32 f969, f968, f966; +sub.f32 f970, f964, f969; +add.f32 f971, f969, f964; +mul.f32 f972, f954, 0f3F4F1BBD; +sub.f32 f973, f861, f972; +fma.rn.f32 f974, f956, 0f3E9E377A, f973; +mul.f32 f975, f965, 0f3F167918; +mul.f32 f976, f967, 0f3F737871; +sub.f32 f977, f976, f975; +sub.f32 f978, f974, f977; +add.f32 f979, f977, f974; +fma.rn.f32 f980, f958, 0f3E9E377A, f886; +mul.f32 f981, f960, 0f3F4F1BBD; +sub.f32 f982, f980, f981; +sub.f32 f983, f866, f881; +mul.f32 f984, f983, 0f3F737871; +sub.f32 f985, f871, f876; +mul.f32 f986, f985, 0fBF167918; +sub.f32 f987, f986, f984; +add.f32 f988, f987, f982; +sub.f32 f989, f982, f987; +mul.f32 f990, f958, 0f3F4F1BBD; +sub.f32 f991, f886, f990; +fma.rn.f32 f992, f960, 0f3E9E377A, f991; +mul.f32 f993, f983, 0f3F167918; +mul.f32 f994, f985, 0f3F737871; +sub.f32 f995, f994, f993; +add.f32 f996, f995, f992; +sub.f32 f997, f992, f995; +add.f32 f998, f867, f882; +add.f32 f999, f862, f998; +add.f32 f1000, f872, f877; +add.f32 f1001, f1000, f999; +add.f32 f1002, f892, f907; +add.f32 f1003, f887, f1002; +add.f32 f1004, f897, f902; +add.f32 f1005, f1004, f1003; +fma.rn.f32 f1006, f998, 0f3E9E377A, f862; +mul.f32 f1007, f1000, 0f3F4F1BBD; +sub.f32 f1008, f1006, f1007; +sub.f32 f1009, f892, f907; +mul.f32 f1010, f1009, 0f3F737871; +sub.f32 f1011, f897, f902; +mul.f32 f1012, f1011, 0fBF167918; 
+sub.f32 f1013, f1012, f1010; +sub.f32 f1014, f1008, f1013; +add.f32 f1015, f1013, f1008; +mul.f32 f1016, f998, 0f3F4F1BBD; +sub.f32 f1017, f862, f1016; +fma.rn.f32 f1018, f1000, 0f3E9E377A, f1017; +mul.f32 f1019, f1009, 0f3F167918; +mul.f32 f1020, f1011, 0f3F737871; +sub.f32 f1021, f1020, f1019; +sub.f32 f1022, f1018, f1021; +add.f32 f1023, f1021, f1018; +fma.rn.f32 f1024, f1002, 0f3E9E377A, f887; +mul.f32 f1025, f1004, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f867, f882; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f872, f877; +mul.f32 f1030, f1029, 0fBF167918; +sub.f32 f1031, f1030, f1028; +add.f32 f1032, f1031, f1026; +sub.f32 f1033, f1026, f1031; +mul.f32 f1034, f1002, 0f3F4F1BBD; +sub.f32 f1035, f887, f1034; +fma.rn.f32 f1036, f1004, 0f3E9E377A, f1035; +mul.f32 f1037, f1027, 0f3F167918; +mul.f32 f1038, f1029, 0f3F737871; +sub.f32 f1039, f1038, f1037; +add.f32 f1040, f1039, f1036; +sub.f32 f1041, f1036, f1039; +add.f32 f1042, f868, f883; +add.f32 f1043, f863, f1042; +add.f32 f1044, f873, f878; +add.f32 f1045, f1044, f1043; +add.f32 f1046, f893, f908; +add.f32 f1047, f888, f1046; +add.f32 f1048, f898, f903; +add.f32 f1049, f1048, f1047; +fma.rn.f32 f1050, f1042, 0f3E9E377A, f863; +mul.f32 f1051, f1044, 0f3F4F1BBD; +sub.f32 f1052, f1050, f1051; +sub.f32 f1053, f893, f908; +mul.f32 f1054, f1053, 0f3F737871; +sub.f32 f1055, f898, f903; +mul.f32 f1056, f1055, 0fBF167918; +sub.f32 f1057, f1056, f1054; +sub.f32 f1058, f1052, f1057; +add.f32 f1059, f1057, f1052; +mul.f32 f1060, f1042, 0f3F4F1BBD; +sub.f32 f1061, f863, f1060; +fma.rn.f32 f1062, f1044, 0f3E9E377A, f1061; +mul.f32 f1063, f1053, 0f3F167918; +mul.f32 f1064, f1055, 0f3F737871; +sub.f32 f1065, f1064, f1063; +sub.f32 f1066, f1062, f1065; +add.f32 f1067, f1065, f1062; +fma.rn.f32 f1068, f1046, 0f3E9E377A, f888; +mul.f32 f1069, f1048, 0f3F4F1BBD; +sub.f32 f1070, f1068, f1069; +sub.f32 f1071, f868, f883; +mul.f32 f1072, f1071, 0f3F737871; +sub.f32 f1073, f873, f878; +mul.f32 f1074, f1073, 
0fBF167918; +sub.f32 f1075, f1074, f1072; +add.f32 f1076, f1075, f1070; +sub.f32 f1077, f1070, f1075; +mul.f32 f1078, f1046, 0f3F4F1BBD; +sub.f32 f1079, f888, f1078; +fma.rn.f32 f1080, f1048, 0f3E9E377A, f1079; +mul.f32 f1081, f1071, 0f3F167918; +mul.f32 f1082, f1073, 0f3F737871; +sub.f32 f1083, f1082, f1081; +add.f32 f1084, f1083, f1080; +sub.f32 f1085, f1080, f1083; +add.f32 f1086, f869, f884; +add.f32 f1087, f864, f1086; +add.f32 f1088, f874, f879; +add.f32 f1089, f1088, f1087; +add.f32 f1090, f894, f909; +add.f32 f1091, f889, f1090; +add.f32 f1092, f899, f904; +add.f32 f1093, f1092, f1091; +fma.rn.f32 f1094, f1086, 0f3E9E377A, f864; +mul.f32 f1095, f1088, 0f3F4F1BBD; +sub.f32 f1096, f1094, f1095; +sub.f32 f1097, f894, f909; +mul.f32 f1098, f1097, 0f3F737871; +sub.f32 f1099, f899, f904; +mul.f32 f1100, f1099, 0fBF167918; +sub.f32 f1101, f1100, f1098; +sub.f32 f1102, f1096, f1101; +add.f32 f1103, f1101, f1096; +mul.f32 f1104, f1086, 0f3F4F1BBD; +sub.f32 f1105, f864, f1104; +fma.rn.f32 f1106, f1088, 0f3E9E377A, f1105; +mul.f32 f1107, f1097, 0f3F167918; +mul.f32 f1108, f1099, 0f3F737871; +sub.f32 f1109, f1108, f1107; +sub.f32 f1110, f1106, f1109; +add.f32 f1111, f1109, f1106; +fma.rn.f32 f1112, f1090, 0f3E9E377A, f889; +mul.f32 f1113, f1092, 0f3F4F1BBD; +sub.f32 f1114, f1112, f1113; +sub.f32 f1115, f869, f884; +mul.f32 f1116, f1115, 0f3F737871; +sub.f32 f1117, f874, f879; +mul.f32 f1118, f1117, 0fBF167918; +sub.f32 f1119, f1118, f1116; +add.f32 f1120, f1119, f1114; +sub.f32 f1121, f1114, f1119; +mul.f32 f1122, f1090, 0f3F4F1BBD; +sub.f32 f1123, f889, f1122; +fma.rn.f32 f1124, f1092, 0f3E9E377A, f1123; +mul.f32 f1125, f1115, 0f3F167918; +mul.f32 f1126, f1117, 0f3F737871; +sub.f32 f1127, f1126, f1125; +add.f32 f1128, f1127, f1124; +sub.f32 f1129, f1124, f1127; +mul.f32 f1130, f970, 0f3F77F511; +mul.f32 f1131, f988, 0fBE7EA890; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f988, 0f3F77F511; +fma.rn.f32 f1134, f970, 0fBE7EA890, f1133; +mul.f32 f1135, f1014, 0f3F6055A2; 
+mul.f32 f1136, f1032, 0fBEF6A86B; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1032, 0f3F6055A2; +fma.rn.f32 f1139, f1014, 0fBEF6A86B, f1138; +mul.f32 f1140, f1058, 0f3F3A9DB0; +mul.f32 f1141, f1076, 0fBF2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f1076, 0f3F3A9DB0; +fma.rn.f32 f1144, f1058, 0fBF2F3E7B, f1143; +mul.f32 f1145, f1102, 0f3F092BF2; +mul.f32 f1146, f1120, 0fBF5825E0; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1120, 0f3F092BF2; +fma.rn.f32 f1149, f1102, 0fBF5825E0, f1148; +mul.f32 f1150, f978, 0f3F6055A2; +mul.f32 f1151, f996, 0fBEF6A86B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f996, 0f3F6055A2; +fma.rn.f32 f1154, f978, 0fBEF6A86B, f1153; +mul.f32 f1155, f1022, 0f3F092BF2; +mul.f32 f1156, f1040, 0fBF5825E0; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1040, 0f3F092BF2; +fma.rn.f32 f1159, f1022, 0fBF5825E0, f1158; +mul.f32 f1160, f1066, 0f3D809851; +mul.f32 f1161, f1084, 0fBF7F7EAE; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f1084, 0f3D809851; +fma.rn.f32 f1164, f1066, 0fBF7F7EAE, f1163; +mul.f32 f1165, f1110, 0fBED9FFBE; +mul.f32 f1166, f1128, 0fBF67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1128, 0fBED9FFBE; +fma.rn.f32 f1169, f1110, 0fBF67A2BF, f1168; +mul.f32 f1170, f979, 0f3F3A9DB0; +mul.f32 f1171, f997, 0fBF2F3E7B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f997, 0f3F3A9DB0; +fma.rn.f32 f1174, f979, 0fBF2F3E7B, f1173; +mul.f32 f1175, f1023, 0f3D809851; +mul.f32 f1176, f1041, 0fBF7F7EAE; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1041, 0f3D809851; +fma.rn.f32 f1179, f1023, 0fBF7F7EAE, f1178; +mul.f32 f1180, f1067, 0fBF232E38; +mul.f32 f1181, f1085, 0fBF45405B; +sub.f32 f1182, f1180, f1181; +mul.f32 f1183, f1085, 0fBF232E38; +fma.rn.f32 f1184, f1067, 0fBF45405B, f1183; +mul.f32 f1185, f1111, 0fBF7DFB3B; +mul.f32 f1186, f1129, 0fBE00575B; +sub.f32 f1187, f1185, f1186; +mul.f32 f1188, f1129, 0fBF7DFB3B; +fma.rn.f32 f1189, f1111, 0fBE00575B, f1188; +mul.f32 f1190, f971, 0f3F092BF2; +mul.f32 f1191, f989, 
0fBF5825E0; +sub.f32 f1192, f1190, f1191; +mul.f32 f1193, f989, 0f3F092BF2; +fma.rn.f32 f1194, f971, 0fBF5825E0, f1193; +mul.f32 f1195, f1015, 0fBED9FFBE; +mul.f32 f1196, f1033, 0fBF67A2BF; +sub.f32 f1197, f1195, f1196; +mul.f32 f1198, f1033, 0fBED9FFBE; +fma.rn.f32 f1199, f1015, 0fBF67A2BF, f1198; +mul.f32 f1200, f1059, 0fBF7DFB3B; +mul.f32 f1201, f1077, 0fBE00575B; +sub.f32 f1202, f1200, f1201; +mul.f32 f1203, f1077, 0fBF7DFB3B; +fma.rn.f32 f1204, f1059, 0fBE00575B, f1203; +mul.f32 f1205, f1103, 0fBF232E38; +mul.f32 f1206, f1121, 0f3F45405B; +sub.f32 f1207, f1205, f1206; +mul.f32 f1208, f1121, 0fBF232E38; +fma.rn.f32 f1209, f1103, 0f3F45405B, f1208; +add.f32 f1210, f957, f1089; +add.f32 f1211, f913, f1210; +add.f32 f1212, f1001, f1045; +add.f32 f1213, f1212, f1211; +add.f32 f1214, f961, f1093; +add.f32 f1215, f917, f1214; +add.f32 f1216, f1005, f1049; +add.f32 f1217, f1216, f1215; +fma.rn.f32 f1218, f1210, 0f3E9E377A, f913; +mul.f32 f1219, f1212, 0f3F4F1BBD; +sub.f32 f1220, f1218, f1219; +sub.f32 f1221, f961, f1093; +mul.f32 f1222, f1221, 0f3F737871; +sub.f32 f1223, f1005, f1049; +mul.f32 f1224, f1223, 0fBF167918; +sub.f32 f1225, f1224, f1222; +sub.f32 f1226, f1220, f1225; +add.f32 f1227, f1225, f1220; +mul.f32 f1228, f1210, 0f3F4F1BBD; +sub.f32 f1229, f913, f1228; +fma.rn.f32 f1230, f1212, 0f3E9E377A, f1229; +mul.f32 f1231, f1221, 0f3F167918; +mul.f32 f1232, f1223, 0f3F737871; +sub.f32 f1233, f1232, f1231; +sub.f32 f1234, f1230, f1233; +add.f32 f1235, f1233, f1230; +fma.rn.f32 f1236, f1214, 0f3E9E377A, f917; +mul.f32 f1237, f1216, 0f3F4F1BBD; +sub.f32 f1238, f1236, f1237; +sub.f32 f1239, f957, f1089; +mul.f32 f1240, f1239, 0f3F737871; +sub.f32 f1241, f1001, f1045; +mul.f32 f1242, f1241, 0fBF167918; +sub.f32 f1243, f1242, f1240; +add.f32 f1244, f1243, f1238; +sub.f32 f1245, f1238, f1243; +mul.f32 f1246, f1214, 0f3F4F1BBD; +sub.f32 f1247, f917, f1246; +fma.rn.f32 f1248, f1216, 0f3E9E377A, f1247; +mul.f32 f1249, f1239, 0f3F167918; +mul.f32 f1250, f1241, 0f3F737871; 
+sub.f32 f1251, f1250, f1249; +add.f32 f1252, f1251, f1248; +sub.f32 f1253, f1248, f1251; +add.f32 f1254, f1132, f1147; +add.f32 f1255, f926, f1254; +add.f32 f1256, f1137, f1142; +add.f32 f1257, f1256, f1255; +add.f32 f1258, f1134, f1149; +add.f32 f1259, f944, f1258; +add.f32 f1260, f1139, f1144; +add.f32 f1261, f1260, f1259; +fma.rn.f32 f1262, f1254, 0f3E9E377A, f926; +mul.f32 f1263, f1256, 0f3F4F1BBD; +sub.f32 f1264, f1262, f1263; +sub.f32 f1265, f1134, f1149; +mul.f32 f1266, f1265, 0f3F737871; +sub.f32 f1267, f1139, f1144; +mul.f32 f1268, f1267, 0fBF167918; +sub.f32 f1269, f1268, f1266; +sub.f32 f1270, f1264, f1269; +add.f32 f1271, f1269, f1264; +mul.f32 f1272, f1254, 0f3F4F1BBD; +sub.f32 f1273, f926, f1272; +fma.rn.f32 f1274, f1256, 0f3E9E377A, f1273; +mul.f32 f1275, f1265, 0f3F167918; +mul.f32 f1276, f1267, 0f3F737871; +sub.f32 f1277, f1276, f1275; +sub.f32 f1278, f1274, f1277; +add.f32 f1279, f1277, f1274; +fma.rn.f32 f1280, f1258, 0f3E9E377A, f944; +mul.f32 f1281, f1260, 0f3F4F1BBD; +sub.f32 f1282, f1280, f1281; +sub.f32 f1283, f1132, f1147; +mul.f32 f1284, f1283, 0f3F737871; +sub.f32 f1285, f1137, f1142; +mul.f32 f1286, f1285, 0fBF167918; +sub.f32 f1287, f1286, f1284; +add.f32 f1288, f1287, f1282; +sub.f32 f1289, f1282, f1287; +mul.f32 f1290, f1258, 0f3F4F1BBD; +sub.f32 f1291, f944, f1290; +fma.rn.f32 f1292, f1260, 0f3E9E377A, f1291; +mul.f32 f1293, f1283, 0f3F167918; +mul.f32 f1294, f1285, 0f3F737871; +sub.f32 f1295, f1294, f1293; +add.f32 f1296, f1295, f1292; +sub.f32 f1297, f1292, f1295; +add.f32 f1298, f1152, f1167; +add.f32 f1299, f934, f1298; +add.f32 f1300, f1157, f1162; +add.f32 f1301, f1300, f1299; +add.f32 f1302, f1154, f1169; +add.f32 f1303, f952, f1302; +add.f32 f1304, f1159, f1164; +add.f32 f1305, f1304, f1303; +fma.rn.f32 f1306, f1298, 0f3E9E377A, f934; +mul.f32 f1307, f1300, 0f3F4F1BBD; +sub.f32 f1308, f1306, f1307; +sub.f32 f1309, f1154, f1169; +mul.f32 f1310, f1309, 0f3F737871; +sub.f32 f1311, f1159, f1164; +mul.f32 f1312, f1311, 
0fBF167918; +sub.f32 f1313, f1312, f1310; +sub.f32 f1314, f1308, f1313; +add.f32 f1315, f1313, f1308; +mul.f32 f1316, f1298, 0f3F4F1BBD; +sub.f32 f1317, f934, f1316; +fma.rn.f32 f1318, f1300, 0f3E9E377A, f1317; +mul.f32 f1319, f1309, 0f3F167918; +mul.f32 f1320, f1311, 0f3F737871; +sub.f32 f1321, f1320, f1319; +sub.f32 f1322, f1318, f1321; +add.f32 f1323, f1321, f1318; +fma.rn.f32 f1324, f1302, 0f3E9E377A, f952; +mul.f32 f1325, f1304, 0f3F4F1BBD; +sub.f32 f1326, f1324, f1325; +sub.f32 f1327, f1152, f1167; +mul.f32 f1328, f1327, 0f3F737871; +sub.f32 f1329, f1157, f1162; +mul.f32 f1330, f1329, 0fBF167918; +sub.f32 f1331, f1330, f1328; +add.f32 f1332, f1331, f1326; +sub.f32 f1333, f1326, f1331; +mul.f32 f1334, f1302, 0f3F4F1BBD; +sub.f32 f1335, f952, f1334; +fma.rn.f32 f1336, f1304, 0f3E9E377A, f1335; +mul.f32 f1337, f1327, 0f3F167918; +mul.f32 f1338, f1329, 0f3F737871; +sub.f32 f1339, f1338, f1337; +add.f32 f1340, f1339, f1336; +sub.f32 f1341, f1336, f1339; +add.f32 f1342, f1172, f1187; +add.f32 f1343, f935, f1342; +add.f32 f1344, f1177, f1182; +add.f32 f1345, f1344, f1343; +add.f32 f1346, f1174, f1189; +add.f32 f1347, f953, f1346; +add.f32 f1348, f1179, f1184; +add.f32 f1349, f1348, f1347; +fma.rn.f32 f1350, f1342, 0f3E9E377A, f935; +mul.f32 f1351, f1344, 0f3F4F1BBD; +sub.f32 f1352, f1350, f1351; +sub.f32 f1353, f1174, f1189; +mul.f32 f1354, f1353, 0f3F737871; +sub.f32 f1355, f1179, f1184; +mul.f32 f1356, f1355, 0fBF167918; +sub.f32 f1357, f1356, f1354; +sub.f32 f1358, f1352, f1357; +add.f32 f1359, f1357, f1352; +mul.f32 f1360, f1342, 0f3F4F1BBD; +sub.f32 f1361, f935, f1360; +fma.rn.f32 f1362, f1344, 0f3E9E377A, f1361; +mul.f32 f1363, f1353, 0f3F167918; +mul.f32 f1364, f1355, 0f3F737871; +sub.f32 f1365, f1364, f1363; +sub.f32 f1366, f1362, f1365; +add.f32 f1367, f1365, f1362; +fma.rn.f32 f1368, f1346, 0f3E9E377A, f953; +mul.f32 f1369, f1348, 0f3F4F1BBD; +sub.f32 f1370, f1368, f1369; +sub.f32 f1371, f1172, f1187; +mul.f32 f1372, f1371, 0f3F737871; +sub.f32 f1373, 
f1177, f1182; +mul.f32 f1374, f1373, 0fBF167918; +sub.f32 f1375, f1374, f1372; +add.f32 f1376, f1375, f1370; +sub.f32 f1377, f1370, f1375; +mul.f32 f1378, f1346, 0f3F4F1BBD; +sub.f32 f1379, f953, f1378; +fma.rn.f32 f1380, f1348, 0f3E9E377A, f1379; +mul.f32 f1381, f1371, 0f3F167918; +mul.f32 f1382, f1373, 0f3F737871; +sub.f32 f1383, f1382, f1381; +add.f32 f1384, f1383, f1380; +sub.f32 f1385, f1380, f1383; +add.f32 f1386, f1192, f1207; +add.f32 f1387, f927, f1386; +add.f32 f1388, f1197, f1202; +add.f32 f1389, f1388, f1387; +add.f32 f1390, f1194, f1209; +add.f32 f1391, f945, f1390; +add.f32 f1392, f1199, f1204; +add.f32 f1393, f1392, f1391; +fma.rn.f32 f1394, f1386, 0f3E9E377A, f927; +mul.f32 f1395, f1388, 0f3F4F1BBD; +sub.f32 f1396, f1394, f1395; +sub.f32 f1397, f1194, f1209; +mul.f32 f1398, f1397, 0f3F737871; +sub.f32 f1399, f1199, f1204; +mul.f32 f1400, f1399, 0fBF167918; +sub.f32 f1401, f1400, f1398; +sub.f32 f1402, f1396, f1401; +add.f32 f1403, f1401, f1396; +mul.f32 f1404, f1386, 0f3F4F1BBD; +sub.f32 f1405, f927, f1404; +fma.rn.f32 f1406, f1388, 0f3E9E377A, f1405; +mul.f32 f1407, f1397, 0f3F167918; +mul.f32 f1408, f1399, 0f3F737871; +sub.f32 f1409, f1408, f1407; +sub.f32 f1410, f1406, f1409; +add.f32 f1411, f1409, f1406; +fma.rn.f32 f1412, f1390, 0f3E9E377A, f945; +mul.f32 f1413, f1392, 0f3F4F1BBD; +sub.f32 f1414, f1412, f1413; +sub.f32 f1415, f1192, f1207; +mul.f32 f1416, f1415, 0f3F737871; +sub.f32 f1417, f1197, f1202; +mul.f32 f1418, f1417, 0fBF167918; +sub.f32 f1419, f1418, f1416; +add.f32 f1420, f1419, f1414; +sub.f32 f1421, f1414, f1419; +mul.f32 f1422, f1390, 0f3F4F1BBD; +sub.f32 f1423, f945, f1422; +fma.rn.f32 f1424, f1392, 0f3E9E377A, f1423; +mul.f32 f1425, f1415, 0f3F167918; +mul.f32 f1426, f1417, 0f3F737871; +sub.f32 f1427, f1426, f1425; +add.f32 f1428, f1427, f1424; +sub.f32 f1429, f1424, f1427; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; 
+mov.u64 rd10, %52; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1430, f1431}, [rd11]; +mul.f32 f1434, f1430, f1257; +mul.f32 f1435, f1431, f1261; +sub.f32 f1436, f1434, f1435; +mul.f32 f1437, f1430, f1261; +fma.rn.f32 f1438, f1431, f1257, f1437; +mul.f32 f1439, f1430, f1430; +mul.f32 f1440, f1431, f1431; +sub.f32 f1441, f1439, f1440; +mul.f32 f1442, f1431, f1430; +fma.rn.f32 f1443, f1431, f1430, f1442; +mul.f32 f1444, f1441, f1301; +mul.f32 f1445, f1443, f1305; +sub.f32 f1446, f1444, f1445; +mul.f32 f1447, f1441, f1305; +fma.rn.f32 f1448, f1443, f1301, f1447; +mul.f32 f1449, f1430, f1441; +mul.f32 f1450, f1431, f1443; +sub.f32 f1451, f1449, f1450; +mul.f32 f1452, f1430, f1443; +fma.rn.f32 f1453, f1431, f1441, f1452; +mul.f32 f1454, f1451, f1345; +mul.f32 f1455, f1453, f1349; +sub.f32 f1456, f1454, f1455; +mul.f32 f1457, f1451, f1349; +fma.rn.f32 f1458, f1453, f1345, f1457; +mul.f32 f1459, f1430, f1451; +mul.f32 f1460, f1431, f1453; +sub.f32 f1461, f1459, f1460; +mul.f32 f1462, f1430, f1453; +fma.rn.f32 f1463, f1431, f1451, f1462; +mul.f32 f1464, f1461, f1389; +mul.f32 f1465, f1463, f1393; +sub.f32 f1466, f1464, f1465; +mul.f32 f1467, f1461, f1393; +fma.rn.f32 f1468, f1463, f1389, f1467; +mul.f32 f1469, f1430, f1461; +mul.f32 f1470, f1431, f1463; +sub.f32 f1471, f1469, f1470; +mul.f32 f1472, f1430, f1463; +fma.rn.f32 f1473, f1431, f1461, f1472; +mul.f32 f1474, f1471, f1226; +mul.f32 f1475, f1473, f1244; +sub.f32 f1476, f1474, f1475; +mul.f32 f1477, f1471, f1244; +fma.rn.f32 f1478, f1473, f1226, f1477; +mul.f32 f1479, f1430, f1471; +mul.f32 f1480, f1431, f1473; +sub.f32 f1481, f1479, f1480; +mul.f32 f1482, f1430, f1473; +fma.rn.f32 f1483, f1431, f1471, f1482; +mul.f32 f1484, f1481, f1270; +mul.f32 f1485, f1483, f1288; +sub.f32 f1486, f1484, f1485; +mul.f32 f1487, f1481, f1288; +fma.rn.f32 f1488, f1483, f1270, f1487; +mul.f32 f1489, f1430, f1481; +mul.f32 f1490, f1431, f1483; +sub.f32 f1491, f1489, f1490; +mul.f32 f1492, f1430, f1483; +fma.rn.f32 f1493, f1431, 
f1481, f1492; +mul.f32 f1494, f1491, f1314; +mul.f32 f1495, f1493, f1332; +sub.f32 f1496, f1494, f1495; +mul.f32 f1497, f1491, f1332; +fma.rn.f32 f1498, f1493, f1314, f1497; +mul.f32 f1499, f1430, f1491; +mul.f32 f1500, f1431, f1493; +sub.f32 f1501, f1499, f1500; +mul.f32 f1502, f1430, f1493; +fma.rn.f32 f1503, f1431, f1491, f1502; +mul.f32 f1504, f1501, f1358; +mul.f32 f1505, f1503, f1376; +sub.f32 f1506, f1504, f1505; +mul.f32 f1507, f1501, f1376; +fma.rn.f32 f1508, f1503, f1358, f1507; +mul.f32 f1509, f1430, f1501; +mul.f32 f1510, f1431, f1503; +sub.f32 f1511, f1509, f1510; +mul.f32 f1512, f1430, f1503; +fma.rn.f32 f1513, f1431, f1501, f1512; +mul.f32 f1514, f1511, f1402; +mul.f32 f1515, f1513, f1420; +sub.f32 f1516, f1514, f1515; +mul.f32 f1517, f1511, f1420; +fma.rn.f32 f1518, f1513, f1402, f1517; +mul.f32 f1519, f1430, f1511; +mul.f32 f1520, f1431, f1513; +sub.f32 f1521, f1519, f1520; +mul.f32 f1522, f1430, f1513; +fma.rn.f32 f1523, f1431, f1511, f1522; +mul.f32 f1524, f1521, f1234; +mul.f32 f1525, f1523, f1252; +sub.f32 f1526, f1524, f1525; +mul.f32 f1527, f1521, f1252; +fma.rn.f32 f1528, f1523, f1234, f1527; +mul.f32 f1529, f1430, f1521; +mul.f32 f1530, f1431, f1523; +sub.f32 f1531, f1529, f1530; +mul.f32 f1532, f1430, f1523; +fma.rn.f32 f1533, f1431, f1521, f1532; +mul.f32 f1534, f1531, f1278; +mul.f32 f1535, f1533, f1296; +sub.f32 f1536, f1534, f1535; +mul.f32 f1537, f1531, f1296; +fma.rn.f32 f1538, f1533, f1278, f1537; +mul.f32 f1539, f1430, f1531; +mul.f32 f1540, f1431, f1533; +sub.f32 f1541, f1539, f1540; +mul.f32 f1542, f1430, f1533; +fma.rn.f32 f1543, f1431, f1531, f1542; +mul.f32 f1544, f1541, f1322; +mul.f32 f1545, f1543, f1340; +sub.f32 f1546, f1544, f1545; +mul.f32 f1547, f1541, f1340; +fma.rn.f32 f1548, f1543, f1322, f1547; +mul.f32 f1549, f1430, f1541; +mul.f32 f1550, f1431, f1543; +sub.f32 f1551, f1549, f1550; +mul.f32 f1552, f1430, f1543; +fma.rn.f32 f1553, f1431, f1541, f1552; +mul.f32 f1554, f1551, f1366; +mul.f32 f1555, f1553, f1384; 
+sub.f32 f1556, f1554, f1555; +mul.f32 f1557, f1551, f1384; +fma.rn.f32 f1558, f1553, f1366, f1557; +mul.f32 f1559, f1430, f1551; +mul.f32 f1560, f1431, f1553; +sub.f32 f1561, f1559, f1560; +mul.f32 f1562, f1430, f1553; +fma.rn.f32 f1563, f1431, f1551, f1562; +mul.f32 f1564, f1561, f1410; +mul.f32 f1565, f1563, f1428; +sub.f32 f1566, f1564, f1565; +mul.f32 f1567, f1561, f1428; +fma.rn.f32 f1568, f1563, f1410, f1567; +mul.f32 f1569, f1430, f1561; +mul.f32 f1570, f1431, f1563; +sub.f32 f1571, f1569, f1570; +mul.f32 f1572, f1430, f1563; +fma.rn.f32 f1573, f1431, f1561, f1572; +mul.f32 f1574, f1571, f1235; +mul.f32 f1575, f1573, f1253; +sub.f32 f1576, f1574, f1575; +mul.f32 f1577, f1571, f1253; +fma.rn.f32 f1578, f1573, f1235, f1577; +mul.f32 f1579, f1430, f1571; +mul.f32 f1580, f1431, f1573; +sub.f32 f1581, f1579, f1580; +mul.f32 f1582, f1430, f1573; +fma.rn.f32 f1583, f1431, f1571, f1582; +mul.f32 f1584, f1581, f1279; +mul.f32 f1585, f1583, f1297; +sub.f32 f1586, f1584, f1585; +mul.f32 f1587, f1581, f1297; +fma.rn.f32 f1588, f1583, f1279, f1587; +mul.f32 f1589, f1430, f1581; +mul.f32 f1590, f1431, f1583; +sub.f32 f1591, f1589, f1590; +mul.f32 f1592, f1430, f1583; +fma.rn.f32 f1593, f1431, f1581, f1592; +mul.f32 f1594, f1591, f1323; +mul.f32 f1595, f1593, f1341; +sub.f32 f1596, f1594, f1595; +mul.f32 f1597, f1591, f1341; +fma.rn.f32 f1598, f1593, f1323, f1597; +mul.f32 f1599, f1430, f1591; +mul.f32 f1600, f1431, f1593; +sub.f32 f1601, f1599, f1600; +mul.f32 f1602, f1430, f1593; +fma.rn.f32 f1603, f1431, f1591, f1602; +mul.f32 f1604, f1601, f1367; +mul.f32 f1605, f1603, f1385; +sub.f32 f1606, f1604, f1605; +mul.f32 f1607, f1601, f1385; +fma.rn.f32 f1608, f1603, f1367, f1607; +mul.f32 f1609, f1430, f1601; +mul.f32 f1610, f1431, f1603; +sub.f32 f1611, f1609, f1610; +mul.f32 f1612, f1430, f1603; +fma.rn.f32 f1613, f1431, f1601, f1612; +mul.f32 f1614, f1611, f1411; +mul.f32 f1615, f1613, f1429; +sub.f32 f1616, f1614, f1615; +mul.f32 f1617, f1611, f1429; +fma.rn.f32 f1618, 
f1613, f1411, f1617; +mul.f32 f1619, f1430, f1611; +mul.f32 f1620, f1431, f1613; +sub.f32 f1621, f1619, f1620; +mul.f32 f1622, f1430, f1613; +fma.rn.f32 f1623, f1431, f1611, f1622; +mul.f32 f1624, f1621, f1227; +mul.f32 f1625, f1623, f1245; +sub.f32 f1626, f1624, f1625; +mul.f32 f1627, f1621, f1245; +fma.rn.f32 f1628, f1623, f1227, f1627; +mul.f32 f1629, f1430, f1621; +mul.f32 f1630, f1431, f1623; +sub.f32 f1631, f1629, f1630; +mul.f32 f1632, f1430, f1623; +fma.rn.f32 f1633, f1431, f1621, f1632; +mul.f32 f1634, f1631, f1271; +mul.f32 f1635, f1633, f1289; +sub.f32 f1636, f1634, f1635; +mul.f32 f1637, f1631, f1289; +fma.rn.f32 f1638, f1633, f1271, f1637; +mul.f32 f1639, f1430, f1631; +mul.f32 f1640, f1431, f1633; +sub.f32 f1641, f1639, f1640; +mul.f32 f1642, f1430, f1633; +fma.rn.f32 f1643, f1431, f1631, f1642; +mul.f32 f1644, f1641, f1315; +mul.f32 f1645, f1643, f1333; +sub.f32 f1646, f1644, f1645; +mul.f32 f1647, f1641, f1333; +fma.rn.f32 f1648, f1643, f1315, f1647; +mul.f32 f1649, f1430, f1641; +mul.f32 f1650, f1431, f1643; +sub.f32 f1651, f1649, f1650; +mul.f32 f1652, f1430, f1643; +fma.rn.f32 f1653, f1431, f1641, f1652; +mul.f32 f1654, f1651, f1359; +mul.f32 f1655, f1653, f1377; +sub.f32 f1656, f1654, f1655; +mul.f32 f1657, f1651, f1377; +fma.rn.f32 f1658, f1653, f1359, f1657; +mul.f32 f1659, f1430, f1651; +mul.f32 f1660, f1431, f1653; +sub.f32 f1661, f1659, f1660; +mul.f32 f1662, f1430, f1653; +fma.rn.f32 f1663, f1431, f1651, f1662; +mul.f32 f1664, f1661, f1403; +mul.f32 f1665, f1663, f1421; +sub.f32 f1666, f1664, f1665; +mul.f32 f1667, f1661, f1421; +fma.rn.f32 f1668, f1663, f1403, f1667; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2500, r15; +st.shared.f32 [r16], f1213; +st.shared.f32 [r16+100], f1436; +st.shared.f32 [r16+200], f1446; +st.shared.f32 [r16+300], f1456; +st.shared.f32 [r16+400], f1466; +st.shared.f32 [r16+500], f1476; +st.shared.f32 [r16+600], f1486; +st.shared.f32 [r16+700], f1496; +st.shared.f32 
[r16+800], f1506; +st.shared.f32 [r16+900], f1516; +st.shared.f32 [r16+1000], f1526; +st.shared.f32 [r16+1100], f1536; +st.shared.f32 [r16+1200], f1546; +st.shared.f32 [r16+1300], f1556; +st.shared.f32 [r16+1400], f1566; +st.shared.f32 [r16+1500], f1576; +st.shared.f32 [r16+1600], f1586; +st.shared.f32 [r16+1700], f1596; +st.shared.f32 [r16+1800], f1606; +st.shared.f32 [r16+1900], f1616; +st.shared.f32 [r16+2000], f1626; +st.shared.f32 [r16+2100], f1636; +st.shared.f32 [r16+2200], f1646; +st.shared.f32 [r16+2300], f1656; +st.shared.f32 [r16+2400], f1666; +barrier.sync 0; +ld.shared.f32 f1669, [r10]; +ld.shared.f32 f1670, [r10+2500]; +ld.shared.f32 f1671, [r10+5000]; +ld.shared.f32 f1672, [r10+7500]; +ld.shared.f32 f1673, [r10+10000]; +ld.shared.f32 f1674, [r10+12500]; +ld.shared.f32 f1675, [r10+15000]; +ld.shared.f32 f1676, [r10+17500]; +ld.shared.f32 f1677, [r10+20000]; +ld.shared.f32 f1678, [r10+22500]; +ld.shared.f32 f1679, [r10+25000]; +ld.shared.f32 f1680, [r10+27500]; +ld.shared.f32 f1681, [r10+30000]; +ld.shared.f32 f1682, [r10+32500]; +ld.shared.f32 f1683, [r10+35000]; +ld.shared.f32 f1684, [r10+37500]; +ld.shared.f32 f1685, [r10+40000]; +ld.shared.f32 f1686, [r10+42500]; +ld.shared.f32 f1687, [r10+45000]; +ld.shared.f32 f1688, [r10+47500]; +ld.shared.f32 f1689, [r10+50000]; +ld.shared.f32 f1690, [r10+52500]; +ld.shared.f32 f1691, [r10+55000]; +ld.shared.f32 f1692, [r10+57500]; +ld.shared.f32 f1693, [r10+60000]; +barrier.sync 0; +st.shared.f32 [r16], f1217; +st.shared.f32 [r16+100], f1438; +st.shared.f32 [r16+200], f1448; +st.shared.f32 [r16+300], f1458; +st.shared.f32 [r16+400], f1468; +st.shared.f32 [r16+500], f1478; +st.shared.f32 [r16+600], f1488; +st.shared.f32 [r16+700], f1498; +st.shared.f32 [r16+800], f1508; +st.shared.f32 [r16+900], f1518; +st.shared.f32 [r16+1000], f1528; +st.shared.f32 [r16+1100], f1538; +st.shared.f32 [r16+1200], f1548; +st.shared.f32 [r16+1300], f1558; +st.shared.f32 [r16+1400], f1568; +st.shared.f32 [r16+1500], f1578; 
+st.shared.f32 [r16+1600], f1588; +st.shared.f32 [r16+1700], f1598; +st.shared.f32 [r16+1800], f1608; +st.shared.f32 [r16+1900], f1618; +st.shared.f32 [r16+2000], f1628; +st.shared.f32 [r16+2100], f1638; +st.shared.f32 [r16+2200], f1648; +st.shared.f32 [r16+2300], f1658; +st.shared.f32 [r16+2400], f1668; +barrier.sync 0; +ld.shared.f32 f1694, [r10]; +ld.shared.f32 f1695, [r10+2500]; +ld.shared.f32 f1696, [r10+5000]; +ld.shared.f32 f1697, [r10+7500]; +ld.shared.f32 f1698, [r10+10000]; +ld.shared.f32 f1699, [r10+12500]; +ld.shared.f32 f1700, [r10+15000]; +ld.shared.f32 f1701, [r10+17500]; +ld.shared.f32 f1702, [r10+20000]; +ld.shared.f32 f1703, [r10+22500]; +ld.shared.f32 f1704, [r10+25000]; +ld.shared.f32 f1705, [r10+27500]; +ld.shared.f32 f1706, [r10+30000]; +ld.shared.f32 f1707, [r10+32500]; +ld.shared.f32 f1708, [r10+35000]; +ld.shared.f32 f1709, [r10+37500]; +ld.shared.f32 f1710, [r10+40000]; +ld.shared.f32 f1711, [r10+42500]; +ld.shared.f32 f1712, [r10+45000]; +ld.shared.f32 f1713, [r10+47500]; +ld.shared.f32 f1714, [r10+50000]; +ld.shared.f32 f1715, [r10+52500]; +ld.shared.f32 f1716, [r10+55000]; +ld.shared.f32 f1717, [r10+57500]; +ld.shared.f32 f1718, [r10+60000]; +add.f32 f1719, f1674, f1689; +add.f32 f1720, f1669, f1719; +add.f32 f1721, f1679, f1684; +add.f32 f1722, f1721, f1720; +add.f32 f1723, f1699, f1714; +add.f32 f1724, f1694, f1723; +add.f32 f1725, f1704, f1709; +add.f32 f1726, f1725, f1724; +fma.rn.f32 f1727, f1719, 0f3E9E377A, f1669; +mul.f32 f1728, f1721, 0f3F4F1BBD; +sub.f32 f1729, f1727, f1728; +sub.f32 f1730, f1699, f1714; +mul.f32 f1731, f1730, 0f3F737871; +sub.f32 f1732, f1704, f1709; +mul.f32 f1733, f1732, 0fBF167918; +sub.f32 f1734, f1733, f1731; +sub.f32 f1735, f1729, f1734; +add.f32 f1736, f1734, f1729; +mul.f32 f1737, f1719, 0f3F4F1BBD; +sub.f32 f1738, f1669, f1737; +fma.rn.f32 f1739, f1721, 0f3E9E377A, f1738; +mul.f32 f1740, f1730, 0f3F167918; +mul.f32 f1741, f1732, 0f3F737871; +sub.f32 f1742, f1741, f1740; +sub.f32 f1743, f1739, f1742; 
+add.f32 f1744, f1742, f1739; +fma.rn.f32 f1745, f1723, 0f3E9E377A, f1694; +mul.f32 f1746, f1725, 0f3F4F1BBD; +sub.f32 f1747, f1745, f1746; +sub.f32 f1748, f1674, f1689; +mul.f32 f1749, f1748, 0f3F737871; +sub.f32 f1750, f1679, f1684; +mul.f32 f1751, f1750, 0fBF167918; +sub.f32 f1752, f1751, f1749; +add.f32 f1753, f1752, f1747; +sub.f32 f1754, f1747, f1752; +mul.f32 f1755, f1723, 0f3F4F1BBD; +sub.f32 f1756, f1694, f1755; +fma.rn.f32 f1757, f1725, 0f3E9E377A, f1756; +mul.f32 f1758, f1748, 0f3F167918; +mul.f32 f1759, f1750, 0f3F737871; +sub.f32 f1760, f1759, f1758; +add.f32 f1761, f1760, f1757; +sub.f32 f1762, f1757, f1760; +add.f32 f1763, f1675, f1690; +add.f32 f1764, f1670, f1763; +add.f32 f1765, f1680, f1685; +add.f32 f1766, f1765, f1764; +add.f32 f1767, f1700, f1715; +add.f32 f1768, f1695, f1767; +add.f32 f1769, f1705, f1710; +add.f32 f1770, f1769, f1768; +fma.rn.f32 f1771, f1763, 0f3E9E377A, f1670; +mul.f32 f1772, f1765, 0f3F4F1BBD; +sub.f32 f1773, f1771, f1772; +sub.f32 f1774, f1700, f1715; +mul.f32 f1775, f1774, 0f3F737871; +sub.f32 f1776, f1705, f1710; +mul.f32 f1777, f1776, 0fBF167918; +sub.f32 f1778, f1777, f1775; +sub.f32 f1779, f1773, f1778; +add.f32 f1780, f1778, f1773; +mul.f32 f1781, f1763, 0f3F4F1BBD; +sub.f32 f1782, f1670, f1781; +fma.rn.f32 f1783, f1765, 0f3E9E377A, f1782; +mul.f32 f1784, f1774, 0f3F167918; +mul.f32 f1785, f1776, 0f3F737871; +sub.f32 f1786, f1785, f1784; +sub.f32 f1787, f1783, f1786; +add.f32 f1788, f1786, f1783; +fma.rn.f32 f1789, f1767, 0f3E9E377A, f1695; +mul.f32 f1790, f1769, 0f3F4F1BBD; +sub.f32 f1791, f1789, f1790; +sub.f32 f1792, f1675, f1690; +mul.f32 f1793, f1792, 0f3F737871; +sub.f32 f1794, f1680, f1685; +mul.f32 f1795, f1794, 0fBF167918; +sub.f32 f1796, f1795, f1793; +add.f32 f1797, f1796, f1791; +sub.f32 f1798, f1791, f1796; +mul.f32 f1799, f1767, 0f3F4F1BBD; +sub.f32 f1800, f1695, f1799; +fma.rn.f32 f1801, f1769, 0f3E9E377A, f1800; +mul.f32 f1802, f1792, 0f3F167918; +mul.f32 f1803, f1794, 0f3F737871; +sub.f32 f1804, 
f1803, f1802; +add.f32 f1805, f1804, f1801; +sub.f32 f1806, f1801, f1804; +add.f32 f1807, f1676, f1691; +add.f32 f1808, f1671, f1807; +add.f32 f1809, f1681, f1686; +add.f32 f1810, f1809, f1808; +add.f32 f1811, f1701, f1716; +add.f32 f1812, f1696, f1811; +add.f32 f1813, f1706, f1711; +add.f32 f1814, f1813, f1812; +fma.rn.f32 f1815, f1807, 0f3E9E377A, f1671; +mul.f32 f1816, f1809, 0f3F4F1BBD; +sub.f32 f1817, f1815, f1816; +sub.f32 f1818, f1701, f1716; +mul.f32 f1819, f1818, 0f3F737871; +sub.f32 f1820, f1706, f1711; +mul.f32 f1821, f1820, 0fBF167918; +sub.f32 f1822, f1821, f1819; +sub.f32 f1823, f1817, f1822; +add.f32 f1824, f1822, f1817; +mul.f32 f1825, f1807, 0f3F4F1BBD; +sub.f32 f1826, f1671, f1825; +fma.rn.f32 f1827, f1809, 0f3E9E377A, f1826; +mul.f32 f1828, f1818, 0f3F167918; +mul.f32 f1829, f1820, 0f3F737871; +sub.f32 f1830, f1829, f1828; +sub.f32 f1831, f1827, f1830; +add.f32 f1832, f1830, f1827; +fma.rn.f32 f1833, f1811, 0f3E9E377A, f1696; +mul.f32 f1834, f1813, 0f3F4F1BBD; +sub.f32 f1835, f1833, f1834; +sub.f32 f1836, f1676, f1691; +mul.f32 f1837, f1836, 0f3F737871; +sub.f32 f1838, f1681, f1686; +mul.f32 f1839, f1838, 0fBF167918; +sub.f32 f1840, f1839, f1837; +add.f32 f1841, f1840, f1835; +sub.f32 f1842, f1835, f1840; +mul.f32 f1843, f1811, 0f3F4F1BBD; +sub.f32 f1844, f1696, f1843; +fma.rn.f32 f1845, f1813, 0f3E9E377A, f1844; +mul.f32 f1846, f1836, 0f3F167918; +mul.f32 f1847, f1838, 0f3F737871; +sub.f32 f1848, f1847, f1846; +add.f32 f1849, f1848, f1845; +sub.f32 f1850, f1845, f1848; +add.f32 f1851, f1677, f1692; +add.f32 f1852, f1672, f1851; +add.f32 f1853, f1682, f1687; +add.f32 f1854, f1853, f1852; +add.f32 f1855, f1702, f1717; +add.f32 f1856, f1697, f1855; +add.f32 f1857, f1707, f1712; +add.f32 f1858, f1857, f1856; +fma.rn.f32 f1859, f1851, 0f3E9E377A, f1672; +mul.f32 f1860, f1853, 0f3F4F1BBD; +sub.f32 f1861, f1859, f1860; +sub.f32 f1862, f1702, f1717; +mul.f32 f1863, f1862, 0f3F737871; +sub.f32 f1864, f1707, f1712; +mul.f32 f1865, f1864, 0fBF167918; 
+sub.f32 f1866, f1865, f1863; +sub.f32 f1867, f1861, f1866; +add.f32 f1868, f1866, f1861; +mul.f32 f1869, f1851, 0f3F4F1BBD; +sub.f32 f1870, f1672, f1869; +fma.rn.f32 f1871, f1853, 0f3E9E377A, f1870; +mul.f32 f1872, f1862, 0f3F167918; +mul.f32 f1873, f1864, 0f3F737871; +sub.f32 f1874, f1873, f1872; +sub.f32 f1875, f1871, f1874; +add.f32 f1876, f1874, f1871; +fma.rn.f32 f1877, f1855, 0f3E9E377A, f1697; +mul.f32 f1878, f1857, 0f3F4F1BBD; +sub.f32 f1879, f1877, f1878; +sub.f32 f1880, f1677, f1692; +mul.f32 f1881, f1880, 0f3F737871; +sub.f32 f1882, f1682, f1687; +mul.f32 f1883, f1882, 0fBF167918; +sub.f32 f1884, f1883, f1881; +add.f32 f1885, f1884, f1879; +sub.f32 f1886, f1879, f1884; +mul.f32 f1887, f1855, 0f3F4F1BBD; +sub.f32 f1888, f1697, f1887; +fma.rn.f32 f1889, f1857, 0f3E9E377A, f1888; +mul.f32 f1890, f1880, 0f3F167918; +mul.f32 f1891, f1882, 0f3F737871; +sub.f32 f1892, f1891, f1890; +add.f32 f1893, f1892, f1889; +sub.f32 f1894, f1889, f1892; +add.f32 f1895, f1678, f1693; +add.f32 f1896, f1673, f1895; +add.f32 f1897, f1683, f1688; +add.f32 f1898, f1897, f1896; +add.f32 f1899, f1703, f1718; +add.f32 f1900, f1698, f1899; +add.f32 f1901, f1708, f1713; +add.f32 f1902, f1901, f1900; +fma.rn.f32 f1903, f1895, 0f3E9E377A, f1673; +mul.f32 f1904, f1897, 0f3F4F1BBD; +sub.f32 f1905, f1903, f1904; +sub.f32 f1906, f1703, f1718; +mul.f32 f1907, f1906, 0f3F737871; +sub.f32 f1908, f1708, f1713; +mul.f32 f1909, f1908, 0fBF167918; +sub.f32 f1910, f1909, f1907; +sub.f32 f1911, f1905, f1910; +add.f32 f1912, f1910, f1905; +mul.f32 f1913, f1895, 0f3F4F1BBD; +sub.f32 f1914, f1673, f1913; +fma.rn.f32 f1915, f1897, 0f3E9E377A, f1914; +mul.f32 f1916, f1906, 0f3F167918; +mul.f32 f1917, f1908, 0f3F737871; +sub.f32 f1918, f1917, f1916; +sub.f32 f1919, f1915, f1918; +add.f32 f1920, f1918, f1915; +fma.rn.f32 f1921, f1899, 0f3E9E377A, f1698; +mul.f32 f1922, f1901, 0f3F4F1BBD; +sub.f32 f1923, f1921, f1922; +sub.f32 f1924, f1678, f1693; +mul.f32 f1925, f1924, 0f3F737871; +sub.f32 f1926, f1683, 
f1688; +mul.f32 f1927, f1926, 0fBF167918; +sub.f32 f1928, f1927, f1925; +add.f32 f1929, f1928, f1923; +sub.f32 f1930, f1923, f1928; +mul.f32 f1931, f1899, 0f3F4F1BBD; +sub.f32 f1932, f1698, f1931; +fma.rn.f32 f1933, f1901, 0f3E9E377A, f1932; +mul.f32 f1934, f1924, 0f3F167918; +mul.f32 f1935, f1926, 0f3F737871; +sub.f32 f1936, f1935, f1934; +add.f32 f1937, f1936, f1933; +sub.f32 f1938, f1933, f1936; +mul.f32 f1939, f1779, 0f3F77F511; +mul.f32 f1940, f1797, 0fBE7EA890; +sub.f32 f1941, f1939, f1940; +mul.f32 f1942, f1797, 0f3F77F511; +fma.rn.f32 f1943, f1779, 0fBE7EA890, f1942; +mul.f32 f1944, f1823, 0f3F6055A2; +mul.f32 f1945, f1841, 0fBEF6A86B; +sub.f32 f1946, f1944, f1945; +mul.f32 f1947, f1841, 0f3F6055A2; +fma.rn.f32 f1948, f1823, 0fBEF6A86B, f1947; +mul.f32 f1949, f1867, 0f3F3A9DB0; +mul.f32 f1950, f1885, 0fBF2F3E7B; +sub.f32 f1951, f1949, f1950; +mul.f32 f1952, f1885, 0f3F3A9DB0; +fma.rn.f32 f1953, f1867, 0fBF2F3E7B, f1952; +mul.f32 f1954, f1911, 0f3F092BF2; +mul.f32 f1955, f1929, 0fBF5825E0; +sub.f32 f1956, f1954, f1955; +mul.f32 f1957, f1929, 0f3F092BF2; +fma.rn.f32 f1958, f1911, 0fBF5825E0, f1957; +mul.f32 f1959, f1787, 0f3F6055A2; +mul.f32 f1960, f1805, 0fBEF6A86B; +sub.f32 f1961, f1959, f1960; +mul.f32 f1962, f1805, 0f3F6055A2; +fma.rn.f32 f1963, f1787, 0fBEF6A86B, f1962; +mul.f32 f1964, f1831, 0f3F092BF2; +mul.f32 f1965, f1849, 0fBF5825E0; +sub.f32 f1966, f1964, f1965; +mul.f32 f1967, f1849, 0f3F092BF2; +fma.rn.f32 f1968, f1831, 0fBF5825E0, f1967; +mul.f32 f1969, f1875, 0f3D809851; +mul.f32 f1970, f1893, 0fBF7F7EAE; +sub.f32 f1971, f1969, f1970; +mul.f32 f1972, f1893, 0f3D809851; +fma.rn.f32 f1973, f1875, 0fBF7F7EAE, f1972; +mul.f32 f1974, f1919, 0fBED9FFBE; +mul.f32 f1975, f1937, 0fBF67A2BF; +sub.f32 f1976, f1974, f1975; +mul.f32 f1977, f1937, 0fBED9FFBE; +fma.rn.f32 f1978, f1919, 0fBF67A2BF, f1977; +mul.f32 f1979, f1788, 0f3F3A9DB0; +mul.f32 f1980, f1806, 0fBF2F3E7B; +sub.f32 f1981, f1979, f1980; +mul.f32 f1982, f1806, 0f3F3A9DB0; +fma.rn.f32 f1983, 
f1788, 0fBF2F3E7B, f1982; +mul.f32 f1984, f1832, 0f3D809851; +mul.f32 f1985, f1850, 0fBF7F7EAE; +sub.f32 f1986, f1984, f1985; +mul.f32 f1987, f1850, 0f3D809851; +fma.rn.f32 f1988, f1832, 0fBF7F7EAE, f1987; +mul.f32 f1989, f1876, 0fBF232E38; +mul.f32 f1990, f1894, 0fBF45405B; +sub.f32 f1991, f1989, f1990; +mul.f32 f1992, f1894, 0fBF232E38; +fma.rn.f32 f1993, f1876, 0fBF45405B, f1992; +mul.f32 f1994, f1920, 0fBF7DFB3B; +mul.f32 f1995, f1938, 0fBE00575B; +sub.f32 f1996, f1994, f1995; +mul.f32 f1997, f1938, 0fBF7DFB3B; +fma.rn.f32 f1998, f1920, 0fBE00575B, f1997; +mul.f32 f1999, f1780, 0f3F092BF2; +mul.f32 f2000, f1798, 0fBF5825E0; +sub.f32 f2001, f1999, f2000; +mul.f32 f2002, f1798, 0f3F092BF2; +fma.rn.f32 f2003, f1780, 0fBF5825E0, f2002; +mul.f32 f2004, f1824, 0fBED9FFBE; +mul.f32 f2005, f1842, 0fBF67A2BF; +sub.f32 f2006, f2004, f2005; +mul.f32 f2007, f1842, 0fBED9FFBE; +fma.rn.f32 f2008, f1824, 0fBF67A2BF, f2007; +mul.f32 f2009, f1868, 0fBF7DFB3B; +mul.f32 f2010, f1886, 0fBE00575B; +sub.f32 f2011, f2009, f2010; +mul.f32 f2012, f1886, 0fBF7DFB3B; +fma.rn.f32 f2013, f1868, 0fBE00575B, f2012; +mul.f32 f2014, f1912, 0fBF232E38; +mul.f32 f2015, f1930, 0f3F45405B; +sub.f32 f2016, f2014, f2015; +mul.f32 f2017, f1930, 0fBF232E38; +fma.rn.f32 f2018, f1912, 0f3F45405B, f2017; +add.f32 f2019, f1766, f1898; +add.f32 f2020, f1722, f2019; +add.f32 f2021, f1810, f1854; +add.f32 f2022, f1770, f1902; +add.f32 f2023, f1726, f2022; +add.f32 f2024, f1814, f1858; +fma.rn.f32 f2025, f2019, 0f3E9E377A, f1722; +mul.f32 f2026, f2021, 0f3F4F1BBD; +sub.f32 f2027, f2025, f2026; +sub.f32 f2028, f1770, f1902; +mul.f32 f2029, f2028, 0f3F737871; +sub.f32 f2030, f1814, f1858; +mul.f32 f2031, f2030, 0fBF167918; +sub.f32 f2032, f2031, f2029; +mul.f32 f2033, f2019, 0f3F4F1BBD; +sub.f32 f2034, f1722, f2033; +fma.rn.f32 f2035, f2021, 0f3E9E377A, f2034; +mul.f32 f2036, f2028, 0f3F167918; +mul.f32 f2037, f2030, 0f3F737871; +sub.f32 f2038, f2037, f2036; +fma.rn.f32 f2039, f2022, 0f3E9E377A, f1726; +mul.f32 
f2040, f2024, 0f3F4F1BBD; +sub.f32 f2041, f2039, f2040; +sub.f32 f2042, f1766, f1898; +mul.f32 f2043, f2042, 0f3F737871; +sub.f32 f2044, f1810, f1854; +mul.f32 f2045, f2044, 0fBF167918; +sub.f32 f2046, f2045, f2043; +mul.f32 f2047, f2022, 0f3F4F1BBD; +sub.f32 f2048, f1726, f2047; +fma.rn.f32 f2049, f2024, 0f3E9E377A, f2048; +mul.f32 f2050, f2042, 0f3F167918; +mul.f32 f2051, f2044, 0f3F737871; +sub.f32 f2052, f2051, f2050; +add.f32 f2053, f1941, f1956; +add.f32 f2054, f1735, f2053; +add.f32 f2055, f1946, f1951; +add.f32 f2056, f1943, f1958; +add.f32 f2057, f1753, f2056; +add.f32 f2058, f1948, f1953; +fma.rn.f32 f2059, f2053, 0f3E9E377A, f1735; +mul.f32 f2060, f2055, 0f3F4F1BBD; +sub.f32 f2061, f2059, f2060; +sub.f32 f2062, f1943, f1958; +mul.f32 f2063, f2062, 0f3F737871; +sub.f32 f2064, f1948, f1953; +mul.f32 f2065, f2064, 0fBF167918; +sub.f32 f2066, f2065, f2063; +mul.f32 f2067, f2053, 0f3F4F1BBD; +sub.f32 f2068, f1735, f2067; +fma.rn.f32 f2069, f2055, 0f3E9E377A, f2068; +mul.f32 f2070, f2062, 0f3F167918; +mul.f32 f2071, f2064, 0f3F737871; +sub.f32 f2072, f2071, f2070; +fma.rn.f32 f2073, f2056, 0f3E9E377A, f1753; +mul.f32 f2074, f2058, 0f3F4F1BBD; +sub.f32 f2075, f2073, f2074; +sub.f32 f2076, f1941, f1956; +mul.f32 f2077, f2076, 0f3F737871; +sub.f32 f2078, f1946, f1951; +mul.f32 f2079, f2078, 0fBF167918; +sub.f32 f2080, f2079, f2077; +mul.f32 f2081, f2056, 0f3F4F1BBD; +sub.f32 f2082, f1753, f2081; +fma.rn.f32 f2083, f2058, 0f3E9E377A, f2082; +mul.f32 f2084, f2076, 0f3F167918; +mul.f32 f2085, f2078, 0f3F737871; +sub.f32 f2086, f2085, f2084; +add.f32 f2087, f1961, f1976; +add.f32 f2088, f1743, f2087; +add.f32 f2089, f1966, f1971; +add.f32 f2090, f1963, f1978; +add.f32 f2091, f1761, f2090; +add.f32 f2092, f1968, f1973; +fma.rn.f32 f2093, f2087, 0f3E9E377A, f1743; +mul.f32 f2094, f2089, 0f3F4F1BBD; +sub.f32 f2095, f2093, f2094; +sub.f32 f2096, f1963, f1978; +mul.f32 f2097, f2096, 0f3F737871; +sub.f32 f2098, f1968, f1973; +mul.f32 f2099, f2098, 0fBF167918; +sub.f32 
f2100, f2099, f2097; +mul.f32 f2101, f2087, 0f3F4F1BBD; +sub.f32 f2102, f1743, f2101; +fma.rn.f32 f2103, f2089, 0f3E9E377A, f2102; +mul.f32 f2104, f2096, 0f3F167918; +mul.f32 f2105, f2098, 0f3F737871; +sub.f32 f2106, f2105, f2104; +fma.rn.f32 f2107, f2090, 0f3E9E377A, f1761; +mul.f32 f2108, f2092, 0f3F4F1BBD; +sub.f32 f2109, f2107, f2108; +sub.f32 f2110, f1961, f1976; +mul.f32 f2111, f2110, 0f3F737871; +sub.f32 f2112, f1966, f1971; +mul.f32 f2113, f2112, 0fBF167918; +sub.f32 f2114, f2113, f2111; +mul.f32 f2115, f2090, 0f3F4F1BBD; +sub.f32 f2116, f1761, f2115; +fma.rn.f32 f2117, f2092, 0f3E9E377A, f2116; +mul.f32 f2118, f2110, 0f3F167918; +mul.f32 f2119, f2112, 0f3F737871; +sub.f32 f2120, f2119, f2118; +add.f32 f2121, f1981, f1996; +add.f32 f2122, f1744, f2121; +add.f32 f2123, f1986, f1991; +add.f32 f2124, f1983, f1998; +add.f32 f2125, f1762, f2124; +add.f32 f2126, f1988, f1993; +fma.rn.f32 f2127, f2121, 0f3E9E377A, f1744; +mul.f32 f2128, f2123, 0f3F4F1BBD; +sub.f32 f2129, f2127, f2128; +sub.f32 f2130, f1983, f1998; +mul.f32 f2131, f2130, 0f3F737871; +sub.f32 f2132, f1988, f1993; +mul.f32 f2133, f2132, 0fBF167918; +sub.f32 f2134, f2133, f2131; +mul.f32 f2135, f2121, 0f3F4F1BBD; +sub.f32 f2136, f1744, f2135; +fma.rn.f32 f2137, f2123, 0f3E9E377A, f2136; +mul.f32 f2138, f2130, 0f3F167918; +mul.f32 f2139, f2132, 0f3F737871; +sub.f32 f2140, f2139, f2138; +fma.rn.f32 f2141, f2124, 0f3E9E377A, f1762; +mul.f32 f2142, f2126, 0f3F4F1BBD; +sub.f32 f2143, f2141, f2142; +sub.f32 f2144, f1981, f1996; +mul.f32 f2145, f2144, 0f3F737871; +sub.f32 f2146, f1986, f1991; +mul.f32 f2147, f2146, 0fBF167918; +sub.f32 f2148, f2147, f2145; +mul.f32 f2149, f2124, 0f3F4F1BBD; +sub.f32 f2150, f1762, f2149; +fma.rn.f32 f2151, f2126, 0f3E9E377A, f2150; +mul.f32 f2152, f2144, 0f3F167918; +mul.f32 f2153, f2146, 0f3F737871; +sub.f32 f2154, f2153, f2152; +add.f32 f2155, f2001, f2016; +add.f32 f2156, f1736, f2155; +add.f32 f2157, f2006, f2011; +add.f32 f2158, f2003, f2018; +add.f32 f2159, f1754, 
f2158; +add.f32 f2160, f2008, f2013; +fma.rn.f32 f2161, f2155, 0f3E9E377A, f1736; +mul.f32 f2162, f2157, 0f3F4F1BBD; +sub.f32 f2163, f2161, f2162; +sub.f32 f2164, f2003, f2018; +mul.f32 f2165, f2164, 0f3F737871; +sub.f32 f2166, f2008, f2013; +mul.f32 f2167, f2166, 0fBF167918; +sub.f32 f2168, f2167, f2165; +mul.f32 f2169, f2155, 0f3F4F1BBD; +sub.f32 f2170, f1736, f2169; +fma.rn.f32 f2171, f2157, 0f3E9E377A, f2170; +mul.f32 f2172, f2164, 0f3F167918; +mul.f32 f2173, f2166, 0f3F737871; +sub.f32 f2174, f2173, f2172; +fma.rn.f32 f2175, f2158, 0f3E9E377A, f1754; +mul.f32 f2176, f2160, 0f3F4F1BBD; +sub.f32 f2177, f2175, f2176; +sub.f32 f2178, f2001, f2016; +mul.f32 f2179, f2178, 0f3F737871; +sub.f32 f2180, f2006, f2011; +mul.f32 f2181, f2180, 0fBF167918; +sub.f32 f2182, f2181, f2179; +mul.f32 f2183, f2158, 0f3F4F1BBD; +sub.f32 f2184, f1754, f2183; +fma.rn.f32 f2185, f2160, 0f3E9E377A, f2184; +mul.f32 f2186, f2178, 0f3F167918; +mul.f32 f2187, f2180, 0f3F737871; +sub.f32 f2188, f2187, f2186; +add.f32 %0, f2021, f2020; +add.f32 %1, f2024, f2023; +add.f32 %3, f2058, f2057; +add.f32 %2, f2055, f2054; +add.f32 %5, f2092, f2091; +add.f32 %4, f2089, f2088; +add.f32 %7, f2126, f2125; +add.f32 %6, f2123, f2122; +add.f32 %9, f2160, f2159; +add.f32 %8, f2157, f2156; +add.f32 %11, f2046, f2041; +sub.f32 %10, f2027, f2032; +add.f32 %13, f2080, f2075; +sub.f32 %12, f2061, f2066; +add.f32 %15, f2114, f2109; +sub.f32 %14, f2095, f2100; +add.f32 %17, f2148, f2143; +sub.f32 %16, f2129, f2134; +add.f32 %19, f2182, f2177; +sub.f32 %18, f2163, f2168; +sub.f32 %20, f2035, f2038; +add.f32 %21, f2052, f2049; +add.f32 %23, f2086, f2083; +sub.f32 %22, f2069, f2072; +add.f32 %25, f2120, f2117; +sub.f32 %24, f2103, f2106; +add.f32 %27, f2154, f2151; +sub.f32 %26, f2137, f2140; +add.f32 %29, f2188, f2185; +sub.f32 %28, f2171, f2174; +add.f32 %30, f2038, f2035; +sub.f32 %31, f2049, f2052; +sub.f32 %33, f2083, f2086; +add.f32 %32, f2072, f2069; +sub.f32 %35, f2117, f2120; +add.f32 %34, f2106, f2103; 
+sub.f32 %37, f2151, f2154; +add.f32 %36, f2140, f2137; +sub.f32 %39, f2185, f2188; +add.f32 %38, f2174, f2171; +sub.f32 %41, f2041, f2046; +add.f32 %40, f2032, f2027; +sub.f32 %43, f2075, f2080; +add.f32 %42, f2066, f2061; +sub.f32 %45, f2109, f2114; +add.f32 %44, f2100, f2095; +sub.f32 %47, f2143, f2148; +add.f32 %46, f2134, f2129; +sub.f32 %49, f2177, f2182; +add.f32 %48, f2168, f2163; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_15625), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), 
"f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..e28af28b7133f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15625_fp32_inv.hpp.inc @@ -0,0 +1,4338 @@ +#ifndef CUFFTDX_FFT_15625_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_15625_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1161, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2702>; +.reg .b32 r<21>; +.reg .b64 rd<15>; +mov.u32 r19, %tid.y; +mov.u32 r20, %50; +mad.lo.s32 r3, r19, 125000, r20; +add.f32 f101, %63, %93; +add.f32 f103, %73, %83; +add.f32 f2701, %53, f101; +add.f32 f104, f103, f2701; +add.f32 f105, %103, %105; +add.f32 f107, %104, %84; +add.f32 f2697, %54, f105; +add.f32 f108, f107, f2697; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f2696, f101, 0f3E9E377A, %53; +sub.f32 f111, f2696, f110; +sub.f32 f112, %103, %105; +sub.f32 f114, %104, %84; +mul.f32 f2695, f112, 0f3F737871; +fma.rn.f32 f115, f114, 0f3F167918, f2695; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %53, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 
0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +mul.f32 f127, f107, 0f3F4F1BBD; +fma.rn.f32 f2694, f105, 0f3E9E377A, %54; +sub.f32 f128, f2694, f127; +sub.f32 f129, %63, %93; +sub.f32 f131, %73, %83; +mul.f32 f2693, f129, 0f3F737871; +fma.rn.f32 f132, f131, 0f3F167918, f2693; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %54, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %65, %95; +add.f32 f145, %75, %85; +add.f32 f2692, %55, f143; +add.f32 f146, f145, f2692; +add.f32 f147, %66, %96; +add.f32 f149, %108, %106; +add.f32 f2688, %107, f147; +add.f32 f150, f149, f2688; +fma.rn.f32 f2686, f143, 0f3E9E377A, %55; +mul.f32 f2687, f145, 0f3F4F1BBD; +sub.f32 f153, f2686, f2687; +sub.f32 f154, %66, %96; +sub.f32 f156, %108, %106; +mul.f32 f2685, f154, 0f3F737871; +fma.rn.f32 f157, f156, 0f3F167918, f2685; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %55, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +mul.f32 f169, f149, 0f3F4F1BBD; +fma.rn.f32 f2684, f147, 0f3E9E377A, %107; +sub.f32 f170, f2684, f169; +sub.f32 f171, %65, %95; +sub.f32 f173, %75, %85; +mul.f32 f2683, f171, 0f3F737871; +fma.rn.f32 f174, f173, 0f3F167918, f2683; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %107, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %67, %97; +add.f32 f187, %77, %87; +add.f32 f2682, %57, f185; +add.f32 f188, 
f187, f2682; +add.f32 f189, %111, %110; +add.f32 f191, %78, %112; +add.f32 f2677, %109, f189; +add.f32 f192, f191, f2677; +fma.rn.f32 f2675, f185, 0f3E9E377A, %57; +mul.f32 f2676, f187, 0f3F4F1BBD; +sub.f32 f195, f2675, f2676; +sub.f32 f196, %111, %110; +sub.f32 f198, %78, %112; +mul.f32 f2674, f196, 0f3F737871; +fma.rn.f32 f199, f198, 0f3F167918, f2674; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %57, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f2672, f189, 0f3E9E377A, %109; +mul.f32 f2673, f191, 0f3F4F1BBD; +sub.f32 f212, f2672, f2673; +sub.f32 f213, %67, %97; +sub.f32 f215, %77, %87; +mul.f32 f2671, f213, 0f3F737871; +fma.rn.f32 f216, f215, 0f3F167918, f2671; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %109, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %69, %99; +add.f32 f229, %79, %89; +add.f32 f2670, %59, f227; +add.f32 f230, f229, f2670; +add.f32 f231, %114, %113; +add.f32 f233, %115, %90; +add.f32 f2666, %60, f231; +add.f32 f234, f233, f2666; +mul.f32 f236, f229, 0f3F4F1BBD; +fma.rn.f32 f2665, f227, 0f3E9E377A, %59; +sub.f32 f237, f2665, f236; +sub.f32 f238, %114, %113; +sub.f32 f240, %115, %90; +mul.f32 f2664, f238, 0f3F737871; +fma.rn.f32 f241, f240, 0f3F167918, f2664; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %59, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +mul.f32 f253, f233, 0f3F4F1BBD; +fma.rn.f32 f2663, 
f231, 0f3E9E377A, %60; +sub.f32 f254, f2663, f253; +sub.f32 f255, %69, %99; +sub.f32 f257, %79, %89; +mul.f32 f2662, f255, 0f3F737871; +fma.rn.f32 f258, f257, 0f3F167918, f2662; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %60, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %71, %101; +add.f32 f271, %81, %91; +add.f32 f2661, %61, f269; +add.f32 f272, f271, f2661; +add.f32 f273, %72, %102; +add.f32 f275, %118, %116; +add.f32 f2657, %117, f273; +add.f32 f276, f275, f2657; +mul.f32 f278, f271, 0f3F4F1BBD; +fma.rn.f32 f2656, f269, 0f3E9E377A, %61; +sub.f32 f279, f2656, f278; +sub.f32 f280, %72, %102; +sub.f32 f282, %118, %116; +mul.f32 f2655, f280, 0f3F737871; +fma.rn.f32 f283, f282, 0f3F167918, f2655; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %61, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +mul.f32 f295, f275, 0f3F4F1BBD; +fma.rn.f32 f2654, f273, 0f3E9E377A, %117; +sub.f32 f296, f2654, f295; +sub.f32 f297, %71, %101; +sub.f32 f299, %81, %91; +mul.f32 f2653, f297, 0f3F737871; +fma.rn.f32 f300, f299, 0f3F167918, f2653; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %117, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mul.f32 f2651, f158, 0f3F77F511; +mul.f32 f2652, f175, 0f3E7EA890; +sub.f32 f313, f2651, f2652; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f2649, f200, 0f3F6055A2; +mul.f32 f2650, f217, 
0f3EF6A86B; +sub.f32 f318, f2649, f2650; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f322, f259, 0f3F2F3E7B; +mul.f32 f2648, f242, 0f3F3A9DB0; +sub.f32 f323, f2648, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f327, f301, 0f3F5825E0; +mul.f32 f2647, f284, 0f3F092BF2; +sub.f32 f328, f2647, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f332, f183, 0f3EF6A86B; +mul.f32 f2646, f166, 0f3F6055A2; +sub.f32 f333, f2646, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f337, f225, 0f3F5825E0; +mul.f32 f2645, f208, 0f3F092BF2; +sub.f32 f338, f2645, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f342, f267, 0f3F7F7EAE; +mul.f32 f2644, f250, 0f3D809851; +sub.f32 f343, f2644, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f2642, f292, 0fBED9FFBE; +mul.f32 f2643, f309, 0f3F67A2BF; +sub.f32 f348, f2642, f2643; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f2640, f167, 0f3F3A9DB0; +mul.f32 f2641, f184, 0f3F2F3E7B; +sub.f32 f353, f2640, f2641; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f2638, f209, 0f3D809851; +mul.f32 f2639, f226, 0f3F7F7EAE; +sub.f32 f358, f2638, f2639; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f2636, f251, 0fBF232E38; +mul.f32 f2637, f268, 0f3F45405B; +sub.f32 f363, f2636, f2637; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f367, f310, 0f3E00575B; +mul.f32 f2635, f293, 0fBF7DFB3B; +sub.f32 f368, f2635, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f372, f176, 0f3F5825E0; +mul.f32 f2634, f159, 0f3F092BF2; +sub.f32 f373, f2634, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; 
+mul.f32 f377, f218, 0f3F67A2BF; +mul.f32 f2633, f201, 0fBED9FFBE; +sub.f32 f378, f2633, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f382, f260, 0f3E00575B; +mul.f32 f2632, f243, 0fBF7DFB3B; +sub.f32 f383, f2632, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f387, f302, 0fBF45405B; +mul.f32 f2631, f285, 0fBF232E38; +sub.f32 f388, f2631, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f393, f188, f230; +mul.f32 f398, f393, 0f3F4F1BBD; +fma.rn.f32 f2630, f391, 0f3E9E377A, f104; +sub.f32 f399, f2630, f398; +add.f32 f2629, f150, f276; +sub.f32 f400, f150, f276; +add.f32 f2628, f192, f234; +sub.f32 f402, f192, f234; +mul.f32 f2627, f400, 0f3F737871; +fma.rn.f32 f403, f402, 0f3F167918, f2627; +sub.f32 f404, f399, f403; +add.f32 f405, f403, f399; +add.f32 f2626, f104, f391; +mul.f32 f406, f391, 0f3F4F1BBD; +sub.f32 f407, f104, f406; +fma.rn.f32 f408, f393, 0f3E9E377A, f407; +mul.f32 f409, f400, 0f3F167918; +mul.f32 f410, f402, 0f3F737871; +sub.f32 f411, f409, f410; +sub.f32 f412, f408, f411; +add.f32 f413, f411, f408; +mul.f32 f415, f2628, 0f3F4F1BBD; +fma.rn.f32 f2625, f2629, 0f3E9E377A, f108; +sub.f32 f416, f2625, f415; +sub.f32 f417, f146, f272; +sub.f32 f419, f188, f230; +mul.f32 f2624, f417, 0f3F737871; +fma.rn.f32 f420, f419, 0f3F167918, f2624; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +add.f32 f2623, f108, f2629; +mul.f32 f423, f2629, 0f3F4F1BBD; +sub.f32 f424, f108, f423; +fma.rn.f32 f425, f2628, 0f3E9E377A, f424; +mul.f32 f426, f417, 0f3F167918; +mul.f32 f427, f419, 0f3F737871; +sub.f32 f428, f426, f427; +add.f32 f429, f428, f425; +sub.f32 f430, f425, f428; +add.f32 f431, f313, f328; +add.f32 f433, f318, f323; +add.f32 f2622, f116, f431; +add.f32 f434, f433, f2622; +add.f32 f435, f315, f330; +add.f32 f437, f320, f325; +add.f32 f2621, f133, f435; +add.f32 f438, f437, f2621; +fma.rn.f32 f2619, 
f431, 0f3E9E377A, f116; +mul.f32 f2620, f433, 0f3F4F1BBD; +sub.f32 f441, f2619, f2620; +sub.f32 f442, f315, f330; +sub.f32 f444, f320, f325; +mul.f32 f2618, f442, 0f3F737871; +fma.rn.f32 f445, f444, 0f3F167918, f2618; +sub.f32 f446, f441, f445; +add.f32 f447, f445, f441; +mul.f32 f448, f431, 0f3F4F1BBD; +sub.f32 f449, f116, f448; +fma.rn.f32 f450, f433, 0f3E9E377A, f449; +mul.f32 f451, f442, 0f3F167918; +mul.f32 f452, f444, 0f3F737871; +sub.f32 f453, f451, f452; +sub.f32 f454, f450, f453; +add.f32 f455, f453, f450; +mul.f32 f457, f437, 0f3F4F1BBD; +fma.rn.f32 f2617, f435, 0f3E9E377A, f133; +sub.f32 f458, f2617, f457; +sub.f32 f459, f313, f328; +sub.f32 f461, f318, f323; +mul.f32 f2616, f459, 0f3F737871; +fma.rn.f32 f462, f461, 0f3F167918, f2616; +add.f32 f463, f462, f458; +sub.f32 f464, f458, f462; +mul.f32 f465, f435, 0f3F4F1BBD; +sub.f32 f466, f133, f465; +fma.rn.f32 f467, f437, 0f3E9E377A, f466; +mul.f32 f468, f459, 0f3F167918; +mul.f32 f469, f461, 0f3F737871; +sub.f32 f470, f468, f469; +add.f32 f471, f470, f467; +sub.f32 f472, f467, f470; +add.f32 f473, f333, f348; +add.f32 f475, f338, f343; +add.f32 f2615, f124, f473; +add.f32 f476, f475, f2615; +add.f32 f477, f335, f350; +add.f32 f479, f340, f345; +add.f32 f2614, f141, f477; +add.f32 f480, f479, f2614; +fma.rn.f32 f2612, f473, 0f3E9E377A, f124; +mul.f32 f2613, f475, 0f3F4F1BBD; +sub.f32 f483, f2612, f2613; +sub.f32 f484, f335, f350; +sub.f32 f486, f340, f345; +mul.f32 f2611, f484, 0f3F737871; +fma.rn.f32 f487, f486, 0f3F167918, f2611; +sub.f32 f488, f483, f487; +add.f32 f489, f487, f483; +mul.f32 f490, f473, 0f3F4F1BBD; +sub.f32 f491, f124, f490; +fma.rn.f32 f492, f475, 0f3E9E377A, f491; +mul.f32 f493, f484, 0f3F167918; +mul.f32 f494, f486, 0f3F737871; +sub.f32 f495, f493, f494; +sub.f32 f496, f492, f495; +add.f32 f497, f495, f492; +fma.rn.f32 f2609, f477, 0f3E9E377A, f141; +mul.f32 f2610, f479, 0f3F4F1BBD; +sub.f32 f500, f2609, f2610; +sub.f32 f501, f333, f348; +sub.f32 f503, f338, f343; +mul.f32 f2608, 
f501, 0f3F737871; +fma.rn.f32 f504, f503, 0f3F167918, f2608; +add.f32 f505, f504, f500; +sub.f32 f506, f500, f504; +mul.f32 f507, f477, 0f3F4F1BBD; +sub.f32 f508, f141, f507; +fma.rn.f32 f509, f479, 0f3E9E377A, f508; +mul.f32 f510, f501, 0f3F167918; +mul.f32 f511, f503, 0f3F737871; +sub.f32 f512, f510, f511; +add.f32 f513, f512, f509; +sub.f32 f514, f509, f512; +add.f32 f515, f353, f368; +add.f32 f517, f358, f363; +add.f32 f2607, f125, f515; +add.f32 f518, f517, f2607; +add.f32 f519, f355, f370; +add.f32 f521, f360, f365; +add.f32 f2606, f142, f519; +add.f32 f522, f521, f2606; +mul.f32 f524, f517, 0f3F4F1BBD; +fma.rn.f32 f2605, f515, 0f3E9E377A, f125; +sub.f32 f525, f2605, f524; +sub.f32 f526, f355, f370; +sub.f32 f528, f360, f365; +mul.f32 f2604, f526, 0f3F737871; +fma.rn.f32 f529, f528, 0f3F167918, f2604; +sub.f32 f530, f525, f529; +add.f32 f531, f529, f525; +mul.f32 f532, f515, 0f3F4F1BBD; +sub.f32 f533, f125, f532; +fma.rn.f32 f534, f517, 0f3E9E377A, f533; +mul.f32 f535, f526, 0f3F167918; +mul.f32 f536, f528, 0f3F737871; +sub.f32 f537, f535, f536; +sub.f32 f538, f534, f537; +add.f32 f539, f537, f534; +mul.f32 f541, f521, 0f3F4F1BBD; +fma.rn.f32 f2603, f519, 0f3E9E377A, f142; +sub.f32 f542, f2603, f541; +sub.f32 f543, f353, f368; +sub.f32 f545, f358, f363; +mul.f32 f2602, f543, 0f3F737871; +fma.rn.f32 f546, f545, 0f3F167918, f2602; +add.f32 f547, f546, f542; +sub.f32 f548, f542, f546; +mul.f32 f549, f519, 0f3F4F1BBD; +sub.f32 f550, f142, f549; +fma.rn.f32 f551, f521, 0f3E9E377A, f550; +mul.f32 f552, f543, 0f3F167918; +mul.f32 f553, f545, 0f3F737871; +sub.f32 f554, f552, f553; +add.f32 f555, f554, f551; +sub.f32 f556, f551, f554; +add.f32 f557, f373, f388; +add.f32 f559, f378, f383; +add.f32 f2601, f117, f557; +add.f32 f560, f559, f2601; +add.f32 f561, f375, f390; +add.f32 f563, f380, f385; +add.f32 f2600, f134, f561; +add.f32 f564, f563, f2600; +mul.f32 f566, f559, 0f3F4F1BBD; +fma.rn.f32 f2599, f557, 0f3E9E377A, f117; +sub.f32 f567, f2599, f566; +sub.f32 f568, 
f375, f390; +sub.f32 f570, f380, f385; +mul.f32 f2598, f568, 0f3F737871; +fma.rn.f32 f571, f570, 0f3F167918, f2598; +sub.f32 f572, f567, f571; +add.f32 f573, f571, f567; +mul.f32 f574, f557, 0f3F4F1BBD; +sub.f32 f575, f117, f574; +fma.rn.f32 f576, f559, 0f3E9E377A, f575; +mul.f32 f577, f568, 0f3F167918; +mul.f32 f578, f570, 0f3F737871; +sub.f32 f579, f577, f578; +sub.f32 f580, f576, f579; +add.f32 f581, f579, f576; +mul.f32 f583, f563, 0f3F4F1BBD; +fma.rn.f32 f2597, f561, 0f3E9E377A, f134; +sub.f32 f584, f2597, f583; +sub.f32 f585, f373, f388; +sub.f32 f587, f378, f383; +mul.f32 f2596, f585, 0f3F737871; +fma.rn.f32 f588, f587, 0f3F167918, f2596; +add.f32 f589, f588, f584; +sub.f32 f590, f584, f588; +mul.f32 f591, f561, 0f3F4F1BBD; +sub.f32 f592, f134, f591; +fma.rn.f32 f593, f563, 0f3E9E377A, f592; +mul.f32 f594, f585, 0f3F167918; +mul.f32 f595, f587, 0f3F737871; +sub.f32 f596, f594, f595; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +mov.u32 r18, %tid.x; +mul.wide.u32 rd2, r18, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r18, r6; +mad.lo.s32 r8, r5, 125000, r3; +mul.wide.u32 rd13, r7, 8; +mov.u64 rd14, %51; +add.s64 rd6, rd14, rd13; +ld.global.v2.f32 {f599, f600}, [rd6]; +mul.f32 f603, f438, f600; +mul.f32 f605, f599, f438; +mul.f32 f607, f600, f600; +mul.f32 f2595, f599, f599; +sub.f32 f608, f2595, f607; +mul.f32 f609, f600, f599; +fma.rn.f32 f610, f600, f599, f609; +mul.f32 f611, f480, f610; +mul.f32 f613, f608, f480; +mul.f32 f615, f600, f610; +mul.f32 f2594, f599, f608; +sub.f32 f616, f2594, f615; +mul.f32 f2593, f476, f610; +mul.f32 f617, f599, f610; +fma.rn.f32 f618, f600, f608, f617; +mul.f32 f619, f522, f618; +mul.f32 f621, f616, f522; +mul.f32 f2591, f599, f616; +mul.f32 f2592, f600, f618; +sub.f32 f624, f2591, f2592; +mul.f32 f2590, f518, f618; +mul.f32 f625, f599, f618; +fma.rn.f32 f626, f600, f616, f625; +mul.f32 f627, f564, f626; +mul.f32 f629, f624, f564; +mul.f32 f631, f600, f626; +mul.f32 
f2589, f599, f624; +sub.f32 f632, f2589, f631; +mul.f32 f2588, f560, f626; +mul.f32 f633, f599, f626; +fma.rn.f32 f634, f600, f624, f633; +mul.f32 f635, f421, f634; +mul.f32 f637, f632, f421; +mul.f32 f2586, f599, f632; +mul.f32 f2587, f600, f634; +sub.f32 f640, f2586, f2587; +mul.f32 f2585, f404, f634; +mul.f32 f641, f599, f634; +fma.rn.f32 f642, f600, f632, f641; +mul.f32 f643, f463, f642; +mul.f32 f645, f640, f463; +mul.f32 f647, f600, f642; +mul.f32 f2584, f599, f640; +sub.f32 f648, f2584, f647; +mul.f32 f2583, f446, f642; +mul.f32 f649, f599, f642; +fma.rn.f32 f650, f600, f640, f649; +mul.f32 f651, f505, f650; +mul.f32 f653, f648, f505; +mul.f32 f655, f600, f650; +mul.f32 f2582, f599, f648; +sub.f32 f656, f2582, f655; +mul.f32 f2581, f488, f650; +mul.f32 f657, f599, f650; +fma.rn.f32 f658, f600, f648, f657; +mul.f32 f659, f547, f658; +mul.f32 f661, f656, f547; +mul.f32 f2579, f599, f656; +mul.f32 f2580, f600, f658; +sub.f32 f664, f2579, f2580; +mul.f32 f2578, f530, f658; +mul.f32 f665, f599, f658; +fma.rn.f32 f666, f600, f656, f665; +mul.f32 f667, f589, f666; +mul.f32 f669, f664, f589; +mul.f32 f671, f600, f666; +mul.f32 f2577, f599, f664; +sub.f32 f672, f2577, f671; +mul.f32 f2576, f572, f666; +mul.f32 f673, f599, f666; +fma.rn.f32 f674, f600, f664, f673; +mul.f32 f675, f429, f674; +mul.f32 f677, f672, f429; +mul.f32 f679, f600, f674; +mul.f32 f2575, f599, f672; +sub.f32 f680, f2575, f679; +mul.f32 f2574, f412, f674; +mul.f32 f681, f599, f674; +fma.rn.f32 f682, f600, f672, f681; +mul.f32 f683, f471, f682; +mul.f32 f685, f680, f471; +mul.f32 f2572, f599, f680; +mul.f32 f2573, f600, f682; +sub.f32 f688, f2572, f2573; +mul.f32 f2571, f454, f682; +mul.f32 f689, f599, f682; +fma.rn.f32 f690, f600, f680, f689; +mul.f32 f691, f513, f690; +mul.f32 f693, f688, f513; +mul.f32 f695, f600, f690; +mul.f32 f2570, f599, f688; +sub.f32 f696, f2570, f695; +mul.f32 f2569, f496, f690; +mul.f32 f697, f599, f690; +fma.rn.f32 f698, f600, f688, f697; +mul.f32 f699, f555, f698; 
+mul.f32 f701, f696, f555; +mul.f32 f2567, f599, f696; +mul.f32 f2568, f600, f698; +sub.f32 f704, f2567, f2568; +mul.f32 f2566, f538, f698; +mul.f32 f705, f599, f698; +fma.rn.f32 f706, f600, f696, f705; +mul.f32 f707, f597, f706; +mul.f32 f709, f704, f597; +mul.f32 f711, f600, f706; +mul.f32 f2565, f599, f704; +sub.f32 f712, f2565, f711; +mul.f32 f2564, f580, f706; +mul.f32 f713, f599, f706; +fma.rn.f32 f714, f600, f704, f713; +mul.f32 f715, f430, f714; +mul.f32 f717, f712, f430; +mul.f32 f719, f600, f714; +mul.f32 f2563, f599, f712; +sub.f32 f720, f2563, f719; +mul.f32 f2562, f413, f714; +mul.f32 f721, f599, f714; +fma.rn.f32 f722, f600, f712, f721; +mul.f32 f723, f472, f722; +mul.f32 f725, f720, f472; +mul.f32 f2560, f599, f720; +mul.f32 f2561, f600, f722; +sub.f32 f728, f2560, f2561; +mul.f32 f2559, f455, f722; +mul.f32 f729, f599, f722; +fma.rn.f32 f730, f600, f720, f729; +mul.f32 f731, f514, f730; +mul.f32 f733, f728, f514; +mul.f32 f735, f600, f730; +mul.f32 f2558, f599, f728; +sub.f32 f736, f2558, f735; +mul.f32 f2557, f497, f730; +mul.f32 f737, f599, f730; +fma.rn.f32 f738, f600, f728, f737; +mul.f32 f739, f556, f738; +mul.f32 f741, f736, f556; +mul.f32 f743, f600, f738; +mul.f32 f2556, f599, f736; +sub.f32 f744, f2556, f743; +mul.f32 f2555, f539, f738; +mul.f32 f745, f599, f738; +fma.rn.f32 f746, f600, f736, f745; +mul.f32 f747, f598, f746; +mul.f32 f749, f744, f598; +mul.f32 f2553, f599, f744; +mul.f32 f2554, f600, f746; +sub.f32 f752, f2553, f2554; +mul.f32 f2552, f581, f746; +mul.f32 f753, f599, f746; +fma.rn.f32 f754, f600, f744, f753; +mul.f32 f755, f422, f754; +mul.f32 f757, f752, f422; +mul.f32 f759, f600, f754; +mul.f32 f2551, f599, f752; +sub.f32 f760, f2551, f759; +mul.f32 f2550, f405, f754; +mul.f32 f761, f599, f754; +fma.rn.f32 f762, f600, f752, f761; +mul.f32 f763, f464, f762; +mul.f32 f765, f760, f464; +mul.f32 f2548, f599, f760; +mul.f32 f2549, f600, f762; +sub.f32 f768, f2548, f2549; +mul.f32 f2547, f447, f762; +mul.f32 f769, f599, f762; 
+fma.rn.f32 f770, f600, f760, f769; +mul.f32 f771, f506, f770; +mul.f32 f773, f768, f506; +mul.f32 f775, f600, f770; +mul.f32 f2546, f599, f768; +sub.f32 f776, f2546, f775; +mul.f32 f2545, f489, f770; +mul.f32 f777, f599, f770; +fma.rn.f32 f778, f600, f768, f777; +mul.f32 f779, f548, f778; +mul.f32 f781, f776, f548; +mul.f32 f783, f600, f778; +mul.f32 f2544, f599, f776; +sub.f32 f784, f2544, f783; +mul.f32 f2543, f531, f778; +mul.f32 f785, f599, f778; +mul.f32 f2542, f434, f600; +fma.rn.f32 f786, f600, f776, f785; +mul.f32 f787, f590, f786; +mul.f32 f788, f573, f786; +mul.f32 f789, f784, f590; +barrier.sync 0; +add.f32 f790, f2628, f2623; +add.f32 f791, f393, f2626; +mad.lo.s32 r17, r7, 200, r8; +st.shared.v2.f32 [r17], {f791, f790}; +fma.rn.f32 f792, f599, f434, f603; +sub.f32 f793, f605, f2542; +st.shared.v2.f32 [r17+8], {f792, f793}; +fma.rn.f32 f794, f608, f476, f611; +sub.f32 f795, f613, f2593; +st.shared.v2.f32 [r17+16], {f794, f795}; +fma.rn.f32 f796, f616, f518, f619; +sub.f32 f797, f621, f2590; +st.shared.v2.f32 [r17+24], {f796, f797}; +fma.rn.f32 f798, f624, f560, f627; +sub.f32 f799, f629, f2588; +st.shared.v2.f32 [r17+32], {f798, f799}; +sub.f32 f800, f637, f2585; +fma.rn.f32 f801, f632, f404, f635; +st.shared.v2.f32 [r17+40], {f801, f800}; +fma.rn.f32 f802, f640, f446, f643; +sub.f32 f803, f645, f2583; +st.shared.v2.f32 [r17+48], {f802, f803}; +sub.f32 f804, f653, f2581; +fma.rn.f32 f805, f648, f488, f651; +st.shared.v2.f32 [r17+56], {f805, f804}; +fma.rn.f32 f806, f656, f530, f659; +sub.f32 f807, f661, f2578; +st.shared.v2.f32 [r17+64], {f806, f807}; +fma.rn.f32 f808, f664, f572, f667; +sub.f32 f809, f669, f2576; +st.shared.v2.f32 [r17+72], {f808, f809}; +fma.rn.f32 f810, f672, f412, f675; +sub.f32 f811, f677, f2574; +st.shared.v2.f32 [r17+80], {f810, f811}; +fma.rn.f32 f812, f680, f454, f683; +sub.f32 f813, f685, f2571; +st.shared.v2.f32 [r17+88], {f812, f813}; +fma.rn.f32 f814, f688, f496, f691; +sub.f32 f815, f693, f2569; +st.shared.v2.f32 
[r17+96], {f814, f815}; +fma.rn.f32 f816, f696, f538, f699; +sub.f32 f817, f701, f2566; +st.shared.v2.f32 [r17+104], {f816, f817}; +fma.rn.f32 f818, f704, f580, f707; +sub.f32 f819, f709, f2564; +st.shared.v2.f32 [r17+112], {f818, f819}; +fma.rn.f32 f820, f712, f413, f715; +sub.f32 f821, f717, f2562; +st.shared.v2.f32 [r17+120], {f820, f821}; +fma.rn.f32 f822, f720, f455, f723; +sub.f32 f823, f725, f2559; +st.shared.v2.f32 [r17+128], {f822, f823}; +fma.rn.f32 f824, f728, f497, f731; +sub.f32 f825, f733, f2557; +st.shared.v2.f32 [r17+136], {f824, f825}; +fma.rn.f32 f826, f736, f539, f739; +sub.f32 f827, f741, f2555; +st.shared.v2.f32 [r17+144], {f826, f827}; +fma.rn.f32 f828, f744, f581, f747; +sub.f32 f829, f749, f2552; +st.shared.v2.f32 [r17+152], {f828, f829}; +fma.rn.f32 f830, f752, f405, f755; +sub.f32 f831, f757, f2550; +st.shared.v2.f32 [r17+160], {f830, f831}; +fma.rn.f32 f832, f760, f447, f763; +sub.f32 f833, f765, f2547; +st.shared.v2.f32 [r17+168], {f832, f833}; +fma.rn.f32 f834, f768, f489, f771; +sub.f32 f835, f773, f2545; +st.shared.v2.f32 [r17+176], {f834, f835}; +fma.rn.f32 f836, f776, f531, f779; +sub.f32 f837, f781, f2543; +st.shared.v2.f32 [r17+184], {f836, f837}; +fma.rn.f32 f838, f784, f573, f787; +sub.f32 f839, f789, f788; +st.shared.v2.f32 [r17+192], {f838, f839}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r17; +ld.shared.v2.f32 {f840, f841}, [r10]; +ld.shared.v2.f32 {f844, f845}, [r10+5000]; +ld.shared.v2.f32 {f848, f849}, [r10+10000]; +ld.shared.v2.f32 {f852, f853}, [r10+15000]; +ld.shared.v2.f32 {f856, f857}, [r10+20000]; +ld.shared.v2.f32 {f860, f861}, [r10+25000]; +ld.shared.v2.f32 {f864, f865}, [r10+30000]; +ld.shared.v2.f32 {f868, f869}, [r10+35000]; +ld.shared.v2.f32 {f872, f873}, [r10+40000]; +ld.shared.v2.f32 {f876, f877}, [r10+45000]; +ld.shared.v2.f32 {f880, f881}, [r10+50000]; +ld.shared.v2.f32 {f884, f885}, [r10+55000]; +ld.shared.v2.f32 {f888, f889}, [r10+60000]; +ld.shared.v2.f32 {f892, f893}, [r10+65000]; +ld.shared.v2.f32 
{f896, f897}, [r10+70000]; +ld.shared.v2.f32 {f900, f901}, [r10+75000]; +ld.shared.v2.f32 {f904, f905}, [r10+80000]; +ld.shared.v2.f32 {f908, f909}, [r10+85000]; +ld.shared.v2.f32 {f912, f913}, [r10+90000]; +ld.shared.v2.f32 {f916, f917}, [r10+95000]; +ld.shared.v2.f32 {f920, f921}, [r10+100000]; +ld.shared.v2.f32 {f924, f925}, [r10+105000]; +ld.shared.v2.f32 {f928, f929}, [r10+110000]; +ld.shared.v2.f32 {f932, f933}, [r10+115000]; +ld.shared.v2.f32 {f936, f937}, [r10+120000]; +add.f32 f940, f860, f920; +add.f32 f942, f880, f900; +add.f32 f2541, f840, f940; +add.f32 f943, f942, f2541; +add.f32 f944, f861, f921; +add.f32 f946, f881, f901; +add.f32 f2540, f841, f944; +add.f32 f947, f946, f2540; +mul.f32 f949, f942, 0f3F4F1BBD; +fma.rn.f32 f2539, f940, 0f3E9E377A, f840; +sub.f32 f950, f2539, f949; +sub.f32 f951, f861, f921; +sub.f32 f953, f881, f901; +mul.f32 f2538, f951, 0f3F737871; +fma.rn.f32 f954, f953, 0f3F167918, f2538; +sub.f32 f955, f950, f954; +add.f32 f956, f954, f950; +mul.f32 f957, f940, 0f3F4F1BBD; +sub.f32 f958, f840, f957; +fma.rn.f32 f959, f942, 0f3E9E377A, f958; +mul.f32 f960, f951, 0f3F167918; +mul.f32 f961, f953, 0f3F737871; +sub.f32 f962, f960, f961; +sub.f32 f963, f959, f962; +add.f32 f964, f962, f959; +mul.f32 f966, f946, 0f3F4F1BBD; +fma.rn.f32 f2537, f944, 0f3E9E377A, f841; +sub.f32 f967, f2537, f966; +sub.f32 f968, f860, f920; +sub.f32 f970, f880, f900; +mul.f32 f2536, f968, 0f3F737871; +fma.rn.f32 f971, f970, 0f3F167918, f2536; +add.f32 f972, f971, f967; +sub.f32 f973, f967, f971; +mul.f32 f974, f944, 0f3F4F1BBD; +sub.f32 f975, f841, f974; +fma.rn.f32 f976, f946, 0f3E9E377A, f975; +mul.f32 f977, f968, 0f3F167918; +mul.f32 f978, f970, 0f3F737871; +sub.f32 f979, f977, f978; +add.f32 f980, f979, f976; +sub.f32 f981, f976, f979; +add.f32 f982, f864, f924; +add.f32 f984, f884, f904; +add.f32 f2535, f844, f982; +add.f32 f985, f984, f2535; +add.f32 f986, f865, f925; +add.f32 f988, f885, f905; +add.f32 f2534, f845, f986; +add.f32 f989, f988, f2534; 
+fma.rn.f32 f2532, f982, 0f3E9E377A, f844; +mul.f32 f2533, f984, 0f3F4F1BBD; +sub.f32 f992, f2532, f2533; +sub.f32 f993, f865, f925; +sub.f32 f995, f885, f905; +mul.f32 f2531, f993, 0f3F737871; +fma.rn.f32 f996, f995, 0f3F167918, f2531; +sub.f32 f997, f992, f996; +add.f32 f998, f996, f992; +mul.f32 f999, f982, 0f3F4F1BBD; +sub.f32 f1000, f844, f999; +fma.rn.f32 f1001, f984, 0f3E9E377A, f1000; +mul.f32 f1002, f993, 0f3F167918; +mul.f32 f1003, f995, 0f3F737871; +sub.f32 f1004, f1002, f1003; +sub.f32 f1005, f1001, f1004; +add.f32 f1006, f1004, f1001; +mul.f32 f1008, f988, 0f3F4F1BBD; +fma.rn.f32 f2530, f986, 0f3E9E377A, f845; +sub.f32 f1009, f2530, f1008; +sub.f32 f1010, f864, f924; +sub.f32 f1012, f884, f904; +mul.f32 f2529, f1010, 0f3F737871; +fma.rn.f32 f1013, f1012, 0f3F167918, f2529; +add.f32 f1014, f1013, f1009; +sub.f32 f1015, f1009, f1013; +mul.f32 f1016, f986, 0f3F4F1BBD; +sub.f32 f1017, f845, f1016; +fma.rn.f32 f1018, f988, 0f3E9E377A, f1017; +mul.f32 f1019, f1010, 0f3F167918; +mul.f32 f1020, f1012, 0f3F737871; +sub.f32 f1021, f1019, f1020; +add.f32 f1022, f1021, f1018; +sub.f32 f1023, f1018, f1021; +add.f32 f1024, f868, f928; +add.f32 f1026, f888, f908; +add.f32 f2528, f848, f1024; +add.f32 f1027, f1026, f2528; +add.f32 f1028, f869, f929; +add.f32 f1030, f889, f909; +add.f32 f2527, f849, f1028; +add.f32 f1031, f1030, f2527; +fma.rn.f32 f2525, f1024, 0f3E9E377A, f848; +mul.f32 f2526, f1026, 0f3F4F1BBD; +sub.f32 f1034, f2525, f2526; +sub.f32 f1035, f869, f929; +sub.f32 f1037, f889, f909; +mul.f32 f2524, f1035, 0f3F737871; +fma.rn.f32 f1038, f1037, 0f3F167918, f2524; +sub.f32 f1039, f1034, f1038; +add.f32 f1040, f1038, f1034; +mul.f32 f1041, f1024, 0f3F4F1BBD; +sub.f32 f1042, f848, f1041; +fma.rn.f32 f1043, f1026, 0f3E9E377A, f1042; +mul.f32 f1044, f1035, 0f3F167918; +mul.f32 f1045, f1037, 0f3F737871; +sub.f32 f1046, f1044, f1045; +sub.f32 f1047, f1043, f1046; +add.f32 f1048, f1046, f1043; +fma.rn.f32 f2522, f1028, 0f3E9E377A, f849; +mul.f32 f2523, f1030, 
0f3F4F1BBD; +sub.f32 f1051, f2522, f2523; +sub.f32 f1052, f868, f928; +sub.f32 f1054, f888, f908; +mul.f32 f2521, f1052, 0f3F737871; +fma.rn.f32 f1055, f1054, 0f3F167918, f2521; +add.f32 f1056, f1055, f1051; +sub.f32 f1057, f1051, f1055; +mul.f32 f1058, f1028, 0f3F4F1BBD; +sub.f32 f1059, f849, f1058; +fma.rn.f32 f1060, f1030, 0f3E9E377A, f1059; +mul.f32 f1061, f1052, 0f3F167918; +mul.f32 f1062, f1054, 0f3F737871; +sub.f32 f1063, f1061, f1062; +add.f32 f1064, f1063, f1060; +sub.f32 f1065, f1060, f1063; +add.f32 f1066, f872, f932; +add.f32 f1068, f892, f912; +add.f32 f2520, f852, f1066; +add.f32 f1069, f1068, f2520; +add.f32 f1070, f873, f933; +add.f32 f1072, f893, f913; +add.f32 f2519, f853, f1070; +add.f32 f1073, f1072, f2519; +mul.f32 f1075, f1068, 0f3F4F1BBD; +fma.rn.f32 f2518, f1066, 0f3E9E377A, f852; +sub.f32 f1076, f2518, f1075; +sub.f32 f1077, f873, f933; +sub.f32 f1079, f893, f913; +mul.f32 f2517, f1077, 0f3F737871; +fma.rn.f32 f1080, f1079, 0f3F167918, f2517; +sub.f32 f1081, f1076, f1080; +add.f32 f1082, f1080, f1076; +mul.f32 f1083, f1066, 0f3F4F1BBD; +sub.f32 f1084, f852, f1083; +fma.rn.f32 f1085, f1068, 0f3E9E377A, f1084; +mul.f32 f1086, f1077, 0f3F167918; +mul.f32 f1087, f1079, 0f3F737871; +sub.f32 f1088, f1086, f1087; +sub.f32 f1089, f1085, f1088; +add.f32 f1090, f1088, f1085; +fma.rn.f32 f2515, f1070, 0f3E9E377A, f853; +mul.f32 f2516, f1072, 0f3F4F1BBD; +sub.f32 f1093, f2515, f2516; +sub.f32 f1094, f872, f932; +sub.f32 f1096, f892, f912; +mul.f32 f2514, f1094, 0f3F737871; +fma.rn.f32 f1097, f1096, 0f3F167918, f2514; +add.f32 f1098, f1097, f1093; +sub.f32 f1099, f1093, f1097; +mul.f32 f1100, f1070, 0f3F4F1BBD; +sub.f32 f1101, f853, f1100; +fma.rn.f32 f1102, f1072, 0f3E9E377A, f1101; +mul.f32 f1103, f1094, 0f3F167918; +mul.f32 f1104, f1096, 0f3F737871; +sub.f32 f1105, f1103, f1104; +add.f32 f1106, f1105, f1102; +sub.f32 f1107, f1102, f1105; +add.f32 f1108, f876, f936; +add.f32 f1110, f896, f916; +add.f32 f2513, f856, f1108; +add.f32 f1111, f1110, f2513; 
+add.f32 f1112, f877, f937; +add.f32 f1114, f897, f917; +add.f32 f2512, f857, f1112; +add.f32 f1115, f1114, f2512; +mul.f32 f1117, f1110, 0f3F4F1BBD; +fma.rn.f32 f2511, f1108, 0f3E9E377A, f856; +sub.f32 f1118, f2511, f1117; +sub.f32 f1119, f877, f937; +sub.f32 f1121, f897, f917; +mul.f32 f2510, f1119, 0f3F737871; +fma.rn.f32 f1122, f1121, 0f3F167918, f2510; +sub.f32 f1123, f1118, f1122; +add.f32 f1124, f1122, f1118; +mul.f32 f1125, f1108, 0f3F4F1BBD; +sub.f32 f1126, f856, f1125; +fma.rn.f32 f1127, f1110, 0f3E9E377A, f1126; +mul.f32 f1128, f1119, 0f3F167918; +mul.f32 f1129, f1121, 0f3F737871; +sub.f32 f1130, f1128, f1129; +sub.f32 f1131, f1127, f1130; +add.f32 f1132, f1130, f1127; +mul.f32 f1134, f1114, 0f3F4F1BBD; +fma.rn.f32 f2509, f1112, 0f3E9E377A, f857; +sub.f32 f1135, f2509, f1134; +sub.f32 f1136, f876, f936; +sub.f32 f1138, f896, f916; +mul.f32 f2508, f1136, 0f3F737871; +fma.rn.f32 f1139, f1138, 0f3F167918, f2508; +add.f32 f1140, f1139, f1135; +sub.f32 f1141, f1135, f1139; +mul.f32 f1142, f1112, 0f3F4F1BBD; +sub.f32 f1143, f857, f1142; +fma.rn.f32 f1144, f1114, 0f3E9E377A, f1143; +mul.f32 f1145, f1136, 0f3F167918; +mul.f32 f1146, f1138, 0f3F737871; +sub.f32 f1147, f1145, f1146; +add.f32 f1148, f1147, f1144; +sub.f32 f1149, f1144, f1147; +mul.f32 f1151, f1014, 0f3E7EA890; +mul.f32 f2507, f997, 0f3F77F511; +sub.f32 f1152, f2507, f1151; +mul.f32 f1153, f1014, 0f3F77F511; +fma.rn.f32 f1154, f997, 0f3E7EA890, f1153; +mul.f32 f2505, f1039, 0f3F6055A2; +mul.f32 f2506, f1056, 0f3EF6A86B; +sub.f32 f1157, f2505, f2506; +mul.f32 f1158, f1056, 0f3F6055A2; +fma.rn.f32 f1159, f1039, 0f3EF6A86B, f1158; +mul.f32 f2503, f1081, 0f3F3A9DB0; +mul.f32 f2504, f1098, 0f3F2F3E7B; +sub.f32 f1162, f2503, f2504; +mul.f32 f1163, f1098, 0f3F3A9DB0; +fma.rn.f32 f1164, f1081, 0f3F2F3E7B, f1163; +mul.f32 f2501, f1123, 0f3F092BF2; +mul.f32 f2502, f1140, 0f3F5825E0; +sub.f32 f1167, f2501, f2502; +mul.f32 f1168, f1140, 0f3F092BF2; +fma.rn.f32 f1169, f1123, 0f3F5825E0, f1168; +mul.f32 f2499, 
f1005, 0f3F6055A2; +mul.f32 f2500, f1022, 0f3EF6A86B; +sub.f32 f1172, f2499, f2500; +mul.f32 f1173, f1022, 0f3F6055A2; +fma.rn.f32 f1174, f1005, 0f3EF6A86B, f1173; +mul.f32 f1176, f1064, 0f3F5825E0; +mul.f32 f2498, f1047, 0f3F092BF2; +sub.f32 f1177, f2498, f1176; +mul.f32 f1178, f1064, 0f3F092BF2; +fma.rn.f32 f1179, f1047, 0f3F5825E0, f1178; +mul.f32 f1181, f1106, 0f3F7F7EAE; +mul.f32 f2497, f1089, 0f3D809851; +sub.f32 f1182, f2497, f1181; +mul.f32 f1183, f1106, 0f3D809851; +fma.rn.f32 f1184, f1089, 0f3F7F7EAE, f1183; +mul.f32 f1186, f1148, 0f3F67A2BF; +mul.f32 f2496, f1131, 0fBED9FFBE; +sub.f32 f1187, f2496, f1186; +mul.f32 f1188, f1148, 0fBED9FFBE; +fma.rn.f32 f1189, f1131, 0f3F67A2BF, f1188; +mul.f32 f1191, f1023, 0f3F2F3E7B; +mul.f32 f2495, f1006, 0f3F3A9DB0; +sub.f32 f1192, f2495, f1191; +mul.f32 f1193, f1023, 0f3F3A9DB0; +fma.rn.f32 f1194, f1006, 0f3F2F3E7B, f1193; +mul.f32 f1196, f1065, 0f3F7F7EAE; +mul.f32 f2494, f1048, 0f3D809851; +sub.f32 f1197, f2494, f1196; +mul.f32 f1198, f1065, 0f3D809851; +fma.rn.f32 f1199, f1048, 0f3F7F7EAE, f1198; +mul.f32 f1201, f1107, 0f3F45405B; +mul.f32 f2493, f1090, 0fBF232E38; +sub.f32 f1202, f2493, f1201; +mul.f32 f1203, f1107, 0fBF232E38; +fma.rn.f32 f1204, f1090, 0f3F45405B, f1203; +mul.f32 f2491, f1132, 0fBF7DFB3B; +mul.f32 f2492, f1149, 0f3E00575B; +sub.f32 f1207, f2491, f2492; +mul.f32 f1208, f1149, 0fBF7DFB3B; +fma.rn.f32 f1209, f1132, 0f3E00575B, f1208; +mul.f32 f2489, f998, 0f3F092BF2; +mul.f32 f2490, f1015, 0f3F5825E0; +sub.f32 f1212, f2489, f2490; +mul.f32 f1213, f1015, 0f3F092BF2; +fma.rn.f32 f1214, f998, 0f3F5825E0, f1213; +mul.f32 f2487, f1040, 0fBED9FFBE; +mul.f32 f2488, f1057, 0f3F67A2BF; +sub.f32 f1217, f2487, f2488; +mul.f32 f1218, f1057, 0fBED9FFBE; +fma.rn.f32 f1219, f1040, 0f3F67A2BF, f1218; +mul.f32 f1221, f1099, 0f3E00575B; +mul.f32 f2486, f1082, 0fBF7DFB3B; +sub.f32 f1222, f2486, f1221; +mul.f32 f1223, f1099, 0fBF7DFB3B; +fma.rn.f32 f1224, f1082, 0f3E00575B, f1223; +mul.f32 f1226, f1141, 0fBF45405B; 
+mul.f32 f2485, f1124, 0fBF232E38; +sub.f32 f1227, f2485, f1226; +mul.f32 f1228, f1141, 0fBF232E38; +fma.rn.f32 f1229, f1124, 0fBF45405B, f1228; +add.f32 f1230, f985, f1111; +add.f32 f1232, f1027, f1069; +fma.rn.f32 f2483, f1230, 0f3E9E377A, f943; +mul.f32 f2484, f1232, 0f3F4F1BBD; +sub.f32 f1238, f2483, f2484; +add.f32 f2482, f989, f1115; +sub.f32 f1239, f989, f1115; +add.f32 f2481, f1031, f1073; +sub.f32 f1241, f1031, f1073; +mul.f32 f2480, f1239, 0f3F737871; +fma.rn.f32 f1242, f1241, 0f3F167918, f2480; +sub.f32 f1243, f1238, f1242; +add.f32 f1244, f1242, f1238; +add.f32 f2479, f943, f1230; +mul.f32 f1245, f1230, 0f3F4F1BBD; +sub.f32 f1246, f943, f1245; +fma.rn.f32 f1247, f1232, 0f3E9E377A, f1246; +mul.f32 f1248, f1239, 0f3F167918; +mul.f32 f1249, f1241, 0f3F737871; +sub.f32 f1250, f1248, f1249; +sub.f32 f1251, f1247, f1250; +add.f32 f1252, f1250, f1247; +fma.rn.f32 f2477, f2482, 0f3E9E377A, f947; +mul.f32 f2478, f2481, 0f3F4F1BBD; +sub.f32 f1255, f2477, f2478; +sub.f32 f1256, f985, f1111; +sub.f32 f1258, f1027, f1069; +mul.f32 f2476, f1256, 0f3F737871; +fma.rn.f32 f1259, f1258, 0f3F167918, f2476; +add.f32 f1260, f1259, f1255; +sub.f32 f1261, f1255, f1259; +add.f32 f2475, f947, f2482; +mul.f32 f1262, f2482, 0f3F4F1BBD; +sub.f32 f1263, f947, f1262; +fma.rn.f32 f1264, f2481, 0f3E9E377A, f1263; +mul.f32 f1265, f1256, 0f3F167918; +mul.f32 f1266, f1258, 0f3F737871; +sub.f32 f1267, f1265, f1266; +add.f32 f1268, f1267, f1264; +sub.f32 f1269, f1264, f1267; +add.f32 f1270, f1152, f1167; +add.f32 f1272, f1157, f1162; +add.f32 f2474, f955, f1270; +add.f32 f1273, f1272, f2474; +add.f32 f1274, f1154, f1169; +add.f32 f1276, f1159, f1164; +add.f32 f2473, f972, f1274; +add.f32 f1277, f1276, f2473; +mul.f32 f1279, f1272, 0f3F4F1BBD; +fma.rn.f32 f2472, f1270, 0f3E9E377A, f955; +sub.f32 f1280, f2472, f1279; +sub.f32 f1281, f1154, f1169; +sub.f32 f1283, f1159, f1164; +mul.f32 f2471, f1281, 0f3F737871; +fma.rn.f32 f1284, f1283, 0f3F167918, f2471; +sub.f32 f1285, f1280, f1284; 
+add.f32 f1286, f1284, f1280; +mul.f32 f1287, f1270, 0f3F4F1BBD; +sub.f32 f1288, f955, f1287; +fma.rn.f32 f1289, f1272, 0f3E9E377A, f1288; +mul.f32 f1290, f1281, 0f3F167918; +mul.f32 f1291, f1283, 0f3F737871; +sub.f32 f1292, f1290, f1291; +sub.f32 f1293, f1289, f1292; +add.f32 f1294, f1292, f1289; +fma.rn.f32 f2469, f1274, 0f3E9E377A, f972; +mul.f32 f2470, f1276, 0f3F4F1BBD; +sub.f32 f1297, f2469, f2470; +sub.f32 f1298, f1152, f1167; +sub.f32 f1300, f1157, f1162; +mul.f32 f2468, f1298, 0f3F737871; +fma.rn.f32 f1301, f1300, 0f3F167918, f2468; +add.f32 f1302, f1301, f1297; +sub.f32 f1303, f1297, f1301; +mul.f32 f1304, f1274, 0f3F4F1BBD; +sub.f32 f1305, f972, f1304; +fma.rn.f32 f1306, f1276, 0f3E9E377A, f1305; +mul.f32 f1307, f1298, 0f3F167918; +mul.f32 f1308, f1300, 0f3F737871; +sub.f32 f1309, f1307, f1308; +add.f32 f1310, f1309, f1306; +sub.f32 f1311, f1306, f1309; +add.f32 f1312, f1172, f1187; +add.f32 f1314, f1177, f1182; +add.f32 f2467, f963, f1312; +add.f32 f1315, f1314, f2467; +add.f32 f1316, f1174, f1189; +add.f32 f1318, f1179, f1184; +add.f32 f2466, f980, f1316; +add.f32 f1319, f1318, f2466; +mul.f32 f1321, f1314, 0f3F4F1BBD; +fma.rn.f32 f2465, f1312, 0f3E9E377A, f963; +sub.f32 f1322, f2465, f1321; +sub.f32 f1323, f1174, f1189; +sub.f32 f1325, f1179, f1184; +mul.f32 f2464, f1323, 0f3F737871; +fma.rn.f32 f1326, f1325, 0f3F167918, f2464; +sub.f32 f1327, f1322, f1326; +add.f32 f1328, f1326, f1322; +mul.f32 f1329, f1312, 0f3F4F1BBD; +sub.f32 f1330, f963, f1329; +fma.rn.f32 f1331, f1314, 0f3E9E377A, f1330; +mul.f32 f1332, f1323, 0f3F167918; +mul.f32 f1333, f1325, 0f3F737871; +sub.f32 f1334, f1332, f1333; +sub.f32 f1335, f1331, f1334; +add.f32 f1336, f1334, f1331; +mul.f32 f1338, f1318, 0f3F4F1BBD; +fma.rn.f32 f2463, f1316, 0f3E9E377A, f980; +sub.f32 f1339, f2463, f1338; +sub.f32 f1340, f1172, f1187; +sub.f32 f1342, f1177, f1182; +mul.f32 f2462, f1340, 0f3F737871; +fma.rn.f32 f1343, f1342, 0f3F167918, f2462; +add.f32 f1344, f1343, f1339; +sub.f32 f1345, f1339, 
f1343; +mul.f32 f1346, f1316, 0f3F4F1BBD; +sub.f32 f1347, f980, f1346; +fma.rn.f32 f1348, f1318, 0f3E9E377A, f1347; +mul.f32 f1349, f1340, 0f3F167918; +mul.f32 f1350, f1342, 0f3F737871; +sub.f32 f1351, f1349, f1350; +add.f32 f1352, f1351, f1348; +sub.f32 f1353, f1348, f1351; +add.f32 f1354, f1192, f1207; +add.f32 f1356, f1197, f1202; +add.f32 f2461, f964, f1354; +add.f32 f1357, f1356, f2461; +add.f32 f1358, f1194, f1209; +add.f32 f1360, f1199, f1204; +add.f32 f2460, f981, f1358; +add.f32 f1361, f1360, f2460; +fma.rn.f32 f2458, f1354, 0f3E9E377A, f964; +mul.f32 f2459, f1356, 0f3F4F1BBD; +sub.f32 f1364, f2458, f2459; +sub.f32 f1365, f1194, f1209; +sub.f32 f1367, f1199, f1204; +mul.f32 f2457, f1365, 0f3F737871; +fma.rn.f32 f1368, f1367, 0f3F167918, f2457; +sub.f32 f1369, f1364, f1368; +add.f32 f1370, f1368, f1364; +mul.f32 f1371, f1354, 0f3F4F1BBD; +sub.f32 f1372, f964, f1371; +fma.rn.f32 f1373, f1356, 0f3E9E377A, f1372; +mul.f32 f1374, f1365, 0f3F167918; +mul.f32 f1375, f1367, 0f3F737871; +sub.f32 f1376, f1374, f1375; +sub.f32 f1377, f1373, f1376; +add.f32 f1378, f1376, f1373; +mul.f32 f1380, f1360, 0f3F4F1BBD; +fma.rn.f32 f2456, f1358, 0f3E9E377A, f981; +sub.f32 f1381, f2456, f1380; +sub.f32 f1382, f1192, f1207; +sub.f32 f1384, f1197, f1202; +mul.f32 f2455, f1382, 0f3F737871; +fma.rn.f32 f1385, f1384, 0f3F167918, f2455; +add.f32 f1386, f1385, f1381; +sub.f32 f1387, f1381, f1385; +mul.f32 f1388, f1358, 0f3F4F1BBD; +sub.f32 f1389, f981, f1388; +fma.rn.f32 f1390, f1360, 0f3E9E377A, f1389; +mul.f32 f1391, f1382, 0f3F167918; +mul.f32 f1392, f1384, 0f3F737871; +sub.f32 f1393, f1391, f1392; +add.f32 f1394, f1393, f1390; +sub.f32 f1395, f1390, f1393; +add.f32 f1396, f1212, f1227; +add.f32 f1398, f1217, f1222; +add.f32 f2454, f956, f1396; +add.f32 f1399, f1398, f2454; +add.f32 f1400, f1214, f1229; +add.f32 f1402, f1219, f1224; +add.f32 f2453, f973, f1400; +add.f32 f1403, f1402, f2453; +fma.rn.f32 f2451, f1396, 0f3E9E377A, f956; +mul.f32 f2452, f1398, 0f3F4F1BBD; +sub.f32 
f1406, f2451, f2452; +sub.f32 f1407, f1214, f1229; +sub.f32 f1409, f1219, f1224; +mul.f32 f2450, f1407, 0f3F737871; +fma.rn.f32 f1410, f1409, 0f3F167918, f2450; +sub.f32 f1411, f1406, f1410; +add.f32 f1412, f1410, f1406; +mul.f32 f1413, f1396, 0f3F4F1BBD; +sub.f32 f1414, f956, f1413; +fma.rn.f32 f1415, f1398, 0f3E9E377A, f1414; +mul.f32 f1416, f1407, 0f3F167918; +mul.f32 f1417, f1409, 0f3F737871; +sub.f32 f1418, f1416, f1417; +sub.f32 f1419, f1415, f1418; +add.f32 f1420, f1418, f1415; +fma.rn.f32 f2448, f1400, 0f3E9E377A, f973; +mul.f32 f2449, f1402, 0f3F4F1BBD; +sub.f32 f1423, f2448, f2449; +sub.f32 f1424, f1212, f1227; +sub.f32 f1426, f1217, f1222; +mul.f32 f2447, f1424, 0f3F737871; +fma.rn.f32 f1427, f1426, 0f3F167918, f2447; +add.f32 f1428, f1427, f1423; +sub.f32 f1429, f1423, f1427; +mul.f32 f1430, f1400, 0f3F4F1BBD; +sub.f32 f1431, f973, f1430; +fma.rn.f32 f1432, f1402, 0f3E9E377A, f1431; +mul.f32 f1433, f1424, 0f3F167918; +mul.f32 f1434, f1426, 0f3F737871; +sub.f32 f1435, f1433, f1434; +add.f32 f1436, f1435, f1432; +sub.f32 f1437, f1432, f1435; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mov.u64 rd10, %52; +mul.wide.u32 rd12, r11, 8; +add.s64 rd11, rd10, rd12; +ld.global.v2.f32 {f1438, f1439}, [rd11]; +mul.f32 f1442, f1277, f1439; +mul.f32 f1444, f1438, f1277; +mul.f32 f1446, f1439, f1439; +mul.f32 f2446, f1438, f1438; +sub.f32 f1447, f2446, f1446; +mul.f32 f1448, f1439, f1438; +fma.rn.f32 f1449, f1439, f1438, f1448; +mul.f32 f1450, f1319, f1449; +mul.f32 f1452, f1447, f1319; +mul.f32 f2444, f1438, f1447; +mul.f32 f2445, f1439, f1449; +sub.f32 f1455, f2444, f2445; +mul.f32 f2443, f1315, f1449; +mul.f32 f1456, f1438, f1449; +fma.rn.f32 f1457, f1439, f1447, f1456; +mul.f32 f1458, f1361, f1457; +mul.f32 f1460, f1455, f1361; +mul.f32 f1462, f1439, f1457; +mul.f32 f2442, f1438, f1455; +sub.f32 f1463, f2442, f1462; +mul.f32 f2441, f1357, f1457; +mul.f32 f1464, f1438, f1457; 
+fma.rn.f32 f1465, f1439, f1455, f1464; +mul.f32 f1466, f1403, f1465; +mul.f32 f1468, f1463, f1403; +mul.f32 f1470, f1439, f1465; +mul.f32 f2440, f1438, f1463; +sub.f32 f1471, f2440, f1470; +mul.f32 f2439, f1399, f1465; +mul.f32 f1472, f1438, f1465; +fma.rn.f32 f1473, f1439, f1463, f1472; +mul.f32 f1474, f1260, f1473; +mul.f32 f1476, f1471, f1260; +mul.f32 f2437, f1438, f1471; +mul.f32 f2438, f1439, f1473; +sub.f32 f1479, f2437, f2438; +mul.f32 f2436, f1243, f1473; +mul.f32 f1480, f1438, f1473; +fma.rn.f32 f1481, f1439, f1471, f1480; +mul.f32 f1482, f1302, f1481; +mul.f32 f1484, f1479, f1302; +mul.f32 f1486, f1439, f1481; +mul.f32 f2435, f1438, f1479; +sub.f32 f1487, f2435, f1486; +mul.f32 f2434, f1285, f1481; +mul.f32 f1488, f1438, f1481; +fma.rn.f32 f1489, f1439, f1479, f1488; +mul.f32 f1490, f1344, f1489; +mul.f32 f1492, f1487, f1344; +mul.f32 f1494, f1439, f1489; +mul.f32 f2433, f1438, f1487; +sub.f32 f1495, f2433, f1494; +mul.f32 f2432, f1327, f1489; +mul.f32 f1496, f1438, f1489; +fma.rn.f32 f1497, f1439, f1487, f1496; +mul.f32 f1498, f1386, f1497; +mul.f32 f1500, f1495, f1386; +mul.f32 f2430, f1438, f1495; +mul.f32 f2431, f1439, f1497; +sub.f32 f1503, f2430, f2431; +mul.f32 f2429, f1369, f1497; +mul.f32 f1504, f1438, f1497; +fma.rn.f32 f1505, f1439, f1495, f1504; +mul.f32 f1506, f1428, f1505; +mul.f32 f1508, f1503, f1428; +mul.f32 f1510, f1439, f1505; +mul.f32 f2428, f1438, f1503; +sub.f32 f1511, f2428, f1510; +mul.f32 f2427, f1411, f1505; +mul.f32 f1512, f1438, f1505; +fma.rn.f32 f1513, f1439, f1503, f1512; +mul.f32 f1514, f1268, f1513; +mul.f32 f1516, f1511, f1268; +mul.f32 f2425, f1438, f1511; +mul.f32 f2426, f1439, f1513; +sub.f32 f1519, f2425, f2426; +mul.f32 f2424, f1251, f1513; +mul.f32 f1520, f1438, f1513; +fma.rn.f32 f1521, f1439, f1511, f1520; +mul.f32 f1522, f1310, f1521; +mul.f32 f1524, f1519, f1310; +mul.f32 f1526, f1439, f1521; +mul.f32 f2423, f1438, f1519; +sub.f32 f1527, f2423, f1526; +mul.f32 f2422, f1293, f1521; +mul.f32 f1528, f1438, f1521; 
+fma.rn.f32 f1529, f1439, f1519, f1528; +mul.f32 f1530, f1352, f1529; +mul.f32 f1532, f1527, f1352; +mul.f32 f1534, f1439, f1529; +mul.f32 f2421, f1438, f1527; +sub.f32 f1535, f2421, f1534; +mul.f32 f2420, f1335, f1529; +mul.f32 f1536, f1438, f1529; +fma.rn.f32 f1537, f1439, f1527, f1536; +mul.f32 f1538, f1394, f1537; +mul.f32 f1540, f1535, f1394; +mul.f32 f2418, f1438, f1535; +mul.f32 f2419, f1439, f1537; +sub.f32 f1543, f2418, f2419; +mul.f32 f2417, f1377, f1537; +mul.f32 f1544, f1438, f1537; +fma.rn.f32 f1545, f1439, f1535, f1544; +mul.f32 f1546, f1436, f1545; +mul.f32 f1548, f1543, f1436; +mul.f32 f1550, f1439, f1545; +mul.f32 f2416, f1438, f1543; +sub.f32 f1551, f2416, f1550; +mul.f32 f2415, f1419, f1545; +mul.f32 f1552, f1438, f1545; +fma.rn.f32 f1553, f1439, f1543, f1552; +mul.f32 f1554, f1269, f1553; +mul.f32 f1556, f1551, f1269; +mul.f32 f1558, f1439, f1553; +mul.f32 f2414, f1438, f1551; +sub.f32 f1559, f2414, f1558; +mul.f32 f2413, f1252, f1553; +mul.f32 f1560, f1438, f1553; +fma.rn.f32 f1561, f1439, f1551, f1560; +mul.f32 f1562, f1311, f1561; +mul.f32 f1564, f1559, f1311; +mul.f32 f2411, f1438, f1559; +mul.f32 f2412, f1439, f1561; +sub.f32 f1567, f2411, f2412; +mul.f32 f2410, f1294, f1561; +mul.f32 f1568, f1438, f1561; +fma.rn.f32 f1569, f1439, f1559, f1568; +mul.f32 f1570, f1353, f1569; +mul.f32 f1572, f1567, f1353; +mul.f32 f1574, f1439, f1569; +mul.f32 f2409, f1438, f1567; +sub.f32 f1575, f2409, f1574; +mul.f32 f2408, f1336, f1569; +mul.f32 f1576, f1438, f1569; +fma.rn.f32 f1577, f1439, f1567, f1576; +mul.f32 f1578, f1395, f1577; +mul.f32 f1580, f1575, f1395; +mul.f32 f2406, f1438, f1575; +mul.f32 f2407, f1439, f1577; +sub.f32 f1583, f2406, f2407; +mul.f32 f2405, f1378, f1577; +mul.f32 f1584, f1438, f1577; +fma.rn.f32 f1585, f1439, f1575, f1584; +mul.f32 f1586, f1437, f1585; +mul.f32 f1588, f1583, f1437; +mul.f32 f1590, f1439, f1585; +mul.f32 f2404, f1438, f1583; +sub.f32 f1591, f2404, f1590; +mul.f32 f2403, f1420, f1585; +mul.f32 f1592, f1438, f1585; 
+fma.rn.f32 f1593, f1439, f1583, f1592; +mul.f32 f1594, f1261, f1593; +mul.f32 f1596, f1591, f1261; +mul.f32 f1598, f1439, f1593; +mul.f32 f2402, f1438, f1591; +sub.f32 f1599, f2402, f1598; +mul.f32 f2401, f1244, f1593; +mul.f32 f1600, f1438, f1593; +fma.rn.f32 f1601, f1439, f1591, f1600; +mul.f32 f1602, f1303, f1601; +mul.f32 f1604, f1599, f1303; +mul.f32 f2399, f1438, f1599; +mul.f32 f2400, f1439, f1601; +sub.f32 f1607, f2399, f2400; +mul.f32 f2398, f1286, f1601; +mul.f32 f1608, f1438, f1601; +fma.rn.f32 f1609, f1439, f1599, f1608; +mul.f32 f1610, f1345, f1609; +mul.f32 f1612, f1607, f1345; +mul.f32 f1614, f1439, f1609; +mul.f32 f2397, f1438, f1607; +sub.f32 f1615, f2397, f1614; +mul.f32 f2396, f1328, f1609; +mul.f32 f1616, f1438, f1609; +fma.rn.f32 f1617, f1439, f1607, f1616; +mul.f32 f1618, f1387, f1617; +mul.f32 f1620, f1615, f1387; +mul.f32 f1622, f1439, f1617; +mul.f32 f2395, f1438, f1615; +sub.f32 f1623, f2395, f1622; +mul.f32 f2394, f1370, f1617; +mul.f32 f1624, f1438, f1617; +mul.f32 f2393, f1273, f1439; +fma.rn.f32 f1625, f1439, f1615, f1624; +mul.f32 f1626, f1429, f1625; +mul.f32 f1627, f1412, f1625; +mul.f32 f1628, f1623, f1429; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 5000, r15; +add.f32 f1629, f2481, f2475; +add.f32 f1630, f1232, f2479; +st.shared.v2.f32 [r16], {f1630, f1629}; +fma.rn.f32 f1631, f1438, f1273, f1442; +sub.f32 f1632, f1444, f2393; +st.shared.v2.f32 [r16+200], {f1631, f1632}; +fma.rn.f32 f1633, f1447, f1315, f1450; +sub.f32 f1634, f1452, f2443; +st.shared.v2.f32 [r16+400], {f1633, f1634}; +fma.rn.f32 f1635, f1455, f1357, f1458; +sub.f32 f1636, f1460, f2441; +st.shared.v2.f32 [r16+600], {f1635, f1636}; +fma.rn.f32 f1637, f1463, f1399, f1466; +sub.f32 f1638, f1468, f2439; +st.shared.v2.f32 [r16+800], {f1637, f1638}; +fma.rn.f32 f1639, f1471, f1243, f1474; +sub.f32 f1640, f1476, f2436; +st.shared.v2.f32 [r16+1000], {f1639, f1640}; +fma.rn.f32 f1641, f1479, f1285, f1482; +sub.f32 f1642, f1484, 
f2434; +st.shared.v2.f32 [r16+1200], {f1641, f1642}; +fma.rn.f32 f1643, f1487, f1327, f1490; +sub.f32 f1644, f1492, f2432; +st.shared.v2.f32 [r16+1400], {f1643, f1644}; +fma.rn.f32 f1645, f1495, f1369, f1498; +sub.f32 f1646, f1500, f2429; +st.shared.v2.f32 [r16+1600], {f1645, f1646}; +sub.f32 f1647, f1508, f2427; +fma.rn.f32 f1648, f1503, f1411, f1506; +st.shared.v2.f32 [r16+1800], {f1648, f1647}; +fma.rn.f32 f1649, f1511, f1251, f1514; +sub.f32 f1650, f1516, f2424; +st.shared.v2.f32 [r16+2000], {f1649, f1650}; +fma.rn.f32 f1651, f1519, f1293, f1522; +sub.f32 f1652, f1524, f2422; +st.shared.v2.f32 [r16+2200], {f1651, f1652}; +fma.rn.f32 f1653, f1527, f1335, f1530; +sub.f32 f1654, f1532, f2420; +st.shared.v2.f32 [r16+2400], {f1653, f1654}; +fma.rn.f32 f1655, f1535, f1377, f1538; +sub.f32 f1656, f1540, f2417; +st.shared.v2.f32 [r16+2600], {f1655, f1656}; +fma.rn.f32 f1657, f1543, f1419, f1546; +sub.f32 f1658, f1548, f2415; +st.shared.v2.f32 [r16+2800], {f1657, f1658}; +fma.rn.f32 f1659, f1551, f1252, f1554; +sub.f32 f1660, f1556, f2413; +st.shared.v2.f32 [r16+3000], {f1659, f1660}; +fma.rn.f32 f1661, f1559, f1294, f1562; +sub.f32 f1662, f1564, f2410; +st.shared.v2.f32 [r16+3200], {f1661, f1662}; +fma.rn.f32 f1663, f1567, f1336, f1570; +sub.f32 f1664, f1572, f2408; +st.shared.v2.f32 [r16+3400], {f1663, f1664}; +fma.rn.f32 f1665, f1575, f1378, f1578; +sub.f32 f1666, f1580, f2405; +st.shared.v2.f32 [r16+3600], {f1665, f1666}; +fma.rn.f32 f1667, f1583, f1420, f1586; +sub.f32 f1668, f1588, f2403; +st.shared.v2.f32 [r16+3800], {f1667, f1668}; +fma.rn.f32 f1669, f1591, f1244, f1594; +sub.f32 f1670, f1596, f2401; +st.shared.v2.f32 [r16+4000], {f1669, f1670}; +fma.rn.f32 f1671, f1599, f1286, f1602; +sub.f32 f1672, f1604, f2398; +st.shared.v2.f32 [r16+4200], {f1671, f1672}; +fma.rn.f32 f1673, f1607, f1328, f1610; +sub.f32 f1674, f1612, f2396; +st.shared.v2.f32 [r16+4400], {f1673, f1674}; +fma.rn.f32 f1675, f1615, f1370, f1618; +sub.f32 f1676, f1620, f2394; +st.shared.v2.f32 
[r16+4600], {f1675, f1676}; +fma.rn.f32 f1677, f1623, f1412, f1626; +sub.f32 f1678, f1628, f1627; +st.shared.v2.f32 [r16+4800], {f1677, f1678}; +barrier.sync 0; +ld.shared.v2.f32 {f1679, f1680}, [r10]; +ld.shared.v2.f32 {f1683, f1684}, [r10+5000]; +ld.shared.v2.f32 {f1687, f1688}, [r10+10000]; +ld.shared.v2.f32 {f1691, f1692}, [r10+15000]; +ld.shared.v2.f32 {f1695, f1696}, [r10+20000]; +ld.shared.v2.f32 {f1699, f1700}, [r10+25000]; +ld.shared.v2.f32 {f1703, f1704}, [r10+30000]; +ld.shared.v2.f32 {f1707, f1708}, [r10+35000]; +ld.shared.v2.f32 {f1711, f1712}, [r10+40000]; +ld.shared.v2.f32 {f1715, f1716}, [r10+45000]; +ld.shared.v2.f32 {f1719, f1720}, [r10+50000]; +ld.shared.v2.f32 {f1723, f1724}, [r10+55000]; +ld.shared.v2.f32 {f1727, f1728}, [r10+60000]; +ld.shared.v2.f32 {f1731, f1732}, [r10+65000]; +ld.shared.v2.f32 {f1735, f1736}, [r10+70000]; +ld.shared.v2.f32 {f1739, f1740}, [r10+75000]; +ld.shared.v2.f32 {f1743, f1744}, [r10+80000]; +ld.shared.v2.f32 {f1747, f1748}, [r10+85000]; +ld.shared.v2.f32 {f1751, f1752}, [r10+90000]; +ld.shared.v2.f32 {f1755, f1756}, [r10+95000]; +ld.shared.v2.f32 {f1759, f1760}, [r10+100000]; +ld.shared.v2.f32 {f1763, f1764}, [r10+105000]; +ld.shared.v2.f32 {f1767, f1768}, [r10+110000]; +ld.shared.v2.f32 {f1771, f1772}, [r10+115000]; +ld.shared.v2.f32 {f1775, f1776}, [r10+120000]; +add.f32 f1779, f1699, f1759; +add.f32 f1781, f1719, f1739; +add.f32 f2392, f1679, f1779; +add.f32 f1782, f1781, f2392; +add.f32 f1783, f1700, f1760; +add.f32 f1785, f1720, f1740; +add.f32 f2391, f1680, f1783; +add.f32 f1786, f1785, f2391; +fma.rn.f32 f2389, f1779, 0f3E9E377A, f1679; +mul.f32 f2390, f1781, 0f3F4F1BBD; +sub.f32 f1789, f2389, f2390; +sub.f32 f1790, f1700, f1760; +sub.f32 f1792, f1720, f1740; +mul.f32 f2388, f1790, 0f3F737871; +fma.rn.f32 f1793, f1792, 0f3F167918, f2388; +sub.f32 f1794, f1789, f1793; +add.f32 f1795, f1793, f1789; +mul.f32 f1796, f1779, 0f3F4F1BBD; +sub.f32 f1797, f1679, f1796; +fma.rn.f32 f1798, f1781, 0f3E9E377A, f1797; 
+mul.f32 f1799, f1790, 0f3F167918; +mul.f32 f1800, f1792, 0f3F737871; +sub.f32 f1801, f1799, f1800; +sub.f32 f1802, f1798, f1801; +add.f32 f1803, f1801, f1798; +mul.f32 f1805, f1785, 0f3F4F1BBD; +fma.rn.f32 f2387, f1783, 0f3E9E377A, f1680; +sub.f32 f1806, f2387, f1805; +sub.f32 f1807, f1699, f1759; +sub.f32 f1809, f1719, f1739; +mul.f32 f2386, f1807, 0f3F737871; +fma.rn.f32 f1810, f1809, 0f3F167918, f2386; +add.f32 f1811, f1810, f1806; +sub.f32 f1812, f1806, f1810; +mul.f32 f1813, f1783, 0f3F4F1BBD; +sub.f32 f1814, f1680, f1813; +fma.rn.f32 f1815, f1785, 0f3E9E377A, f1814; +mul.f32 f1816, f1807, 0f3F167918; +mul.f32 f1817, f1809, 0f3F737871; +sub.f32 f1818, f1816, f1817; +add.f32 f1819, f1818, f1815; +sub.f32 f1820, f1815, f1818; +add.f32 f1821, f1703, f1763; +add.f32 f1823, f1723, f1743; +add.f32 f2385, f1683, f1821; +add.f32 f1824, f1823, f2385; +add.f32 f1825, f1704, f1764; +add.f32 f1827, f1724, f1744; +add.f32 f2384, f1684, f1825; +add.f32 f1828, f1827, f2384; +mul.f32 f1830, f1823, 0f3F4F1BBD; +fma.rn.f32 f2383, f1821, 0f3E9E377A, f1683; +sub.f32 f1831, f2383, f1830; +sub.f32 f1832, f1704, f1764; +sub.f32 f1834, f1724, f1744; +mul.f32 f2382, f1832, 0f3F737871; +fma.rn.f32 f1835, f1834, 0f3F167918, f2382; +sub.f32 f1836, f1831, f1835; +add.f32 f1837, f1835, f1831; +mul.f32 f1838, f1821, 0f3F4F1BBD; +sub.f32 f1839, f1683, f1838; +fma.rn.f32 f1840, f1823, 0f3E9E377A, f1839; +mul.f32 f1841, f1832, 0f3F167918; +mul.f32 f1842, f1834, 0f3F737871; +sub.f32 f1843, f1841, f1842; +sub.f32 f1844, f1840, f1843; +add.f32 f1845, f1843, f1840; +fma.rn.f32 f2380, f1825, 0f3E9E377A, f1684; +mul.f32 f2381, f1827, 0f3F4F1BBD; +sub.f32 f1848, f2380, f2381; +sub.f32 f1849, f1703, f1763; +sub.f32 f1851, f1723, f1743; +mul.f32 f2379, f1849, 0f3F737871; +fma.rn.f32 f1852, f1851, 0f3F167918, f2379; +add.f32 f1853, f1852, f1848; +sub.f32 f1854, f1848, f1852; +mul.f32 f1855, f1825, 0f3F4F1BBD; +sub.f32 f1856, f1684, f1855; +fma.rn.f32 f1857, f1827, 0f3E9E377A, f1856; +mul.f32 f1858, 
f1849, 0f3F167918; +mul.f32 f1859, f1851, 0f3F737871; +sub.f32 f1860, f1858, f1859; +add.f32 f1861, f1860, f1857; +sub.f32 f1862, f1857, f1860; +add.f32 f1863, f1707, f1767; +add.f32 f1865, f1727, f1747; +add.f32 f2378, f1687, f1863; +add.f32 f1866, f1865, f2378; +add.f32 f1867, f1708, f1768; +add.f32 f1869, f1728, f1748; +add.f32 f2377, f1688, f1867; +add.f32 f1870, f1869, f2377; +mul.f32 f1872, f1865, 0f3F4F1BBD; +fma.rn.f32 f2376, f1863, 0f3E9E377A, f1687; +sub.f32 f1873, f2376, f1872; +sub.f32 f1874, f1708, f1768; +sub.f32 f1876, f1728, f1748; +mul.f32 f2375, f1874, 0f3F737871; +fma.rn.f32 f1877, f1876, 0f3F167918, f2375; +sub.f32 f1878, f1873, f1877; +add.f32 f1879, f1877, f1873; +mul.f32 f1880, f1863, 0f3F4F1BBD; +sub.f32 f1881, f1687, f1880; +fma.rn.f32 f1882, f1865, 0f3E9E377A, f1881; +mul.f32 f1883, f1874, 0f3F167918; +mul.f32 f1884, f1876, 0f3F737871; +sub.f32 f1885, f1883, f1884; +sub.f32 f1886, f1882, f1885; +add.f32 f1887, f1885, f1882; +mul.f32 f1889, f1869, 0f3F4F1BBD; +fma.rn.f32 f2374, f1867, 0f3E9E377A, f1688; +sub.f32 f1890, f2374, f1889; +sub.f32 f1891, f1707, f1767; +sub.f32 f1893, f1727, f1747; +mul.f32 f2373, f1891, 0f3F737871; +fma.rn.f32 f1894, f1893, 0f3F167918, f2373; +add.f32 f1895, f1894, f1890; +sub.f32 f1896, f1890, f1894; +mul.f32 f1897, f1867, 0f3F4F1BBD; +sub.f32 f1898, f1688, f1897; +fma.rn.f32 f1899, f1869, 0f3E9E377A, f1898; +mul.f32 f1900, f1891, 0f3F167918; +mul.f32 f1901, f1893, 0f3F737871; +sub.f32 f1902, f1900, f1901; +add.f32 f1903, f1902, f1899; +sub.f32 f1904, f1899, f1902; +add.f32 f1905, f1711, f1771; +add.f32 f1907, f1731, f1751; +add.f32 f2372, f1691, f1905; +add.f32 f1908, f1907, f2372; +add.f32 f1909, f1712, f1772; +add.f32 f1911, f1732, f1752; +add.f32 f2371, f1692, f1909; +add.f32 f1912, f1911, f2371; +mul.f32 f1914, f1907, 0f3F4F1BBD; +fma.rn.f32 f2370, f1905, 0f3E9E377A, f1691; +sub.f32 f1915, f2370, f1914; +sub.f32 f1916, f1712, f1772; +sub.f32 f1918, f1732, f1752; +mul.f32 f2369, f1916, 0f3F737871; 
+fma.rn.f32 f1919, f1918, 0f3F167918, f2369; +sub.f32 f1920, f1915, f1919; +add.f32 f1921, f1919, f1915; +mul.f32 f1922, f1905, 0f3F4F1BBD; +sub.f32 f1923, f1691, f1922; +fma.rn.f32 f1924, f1907, 0f3E9E377A, f1923; +mul.f32 f1925, f1916, 0f3F167918; +mul.f32 f1926, f1918, 0f3F737871; +sub.f32 f1927, f1925, f1926; +sub.f32 f1928, f1924, f1927; +add.f32 f1929, f1927, f1924; +mul.f32 f1931, f1911, 0f3F4F1BBD; +fma.rn.f32 f2368, f1909, 0f3E9E377A, f1692; +sub.f32 f1932, f2368, f1931; +sub.f32 f1933, f1711, f1771; +sub.f32 f1935, f1731, f1751; +mul.f32 f2367, f1933, 0f3F737871; +fma.rn.f32 f1936, f1935, 0f3F167918, f2367; +add.f32 f1937, f1936, f1932; +sub.f32 f1938, f1932, f1936; +mul.f32 f1939, f1909, 0f3F4F1BBD; +sub.f32 f1940, f1692, f1939; +fma.rn.f32 f1941, f1911, 0f3E9E377A, f1940; +mul.f32 f1942, f1933, 0f3F167918; +mul.f32 f1943, f1935, 0f3F737871; +sub.f32 f1944, f1942, f1943; +add.f32 f1945, f1944, f1941; +sub.f32 f1946, f1941, f1944; +add.f32 f1947, f1715, f1775; +add.f32 f1949, f1735, f1755; +add.f32 f2366, f1695, f1947; +add.f32 f1950, f1949, f2366; +add.f32 f1951, f1716, f1776; +add.f32 f1953, f1736, f1756; +add.f32 f2365, f1696, f1951; +add.f32 f1954, f1953, f2365; +fma.rn.f32 f2363, f1947, 0f3E9E377A, f1695; +mul.f32 f2364, f1949, 0f3F4F1BBD; +sub.f32 f1957, f2363, f2364; +sub.f32 f1958, f1716, f1776; +sub.f32 f1960, f1736, f1756; +mul.f32 f2362, f1958, 0f3F737871; +fma.rn.f32 f1961, f1960, 0f3F167918, f2362; +sub.f32 f1962, f1957, f1961; +add.f32 f1963, f1961, f1957; +mul.f32 f1964, f1947, 0f3F4F1BBD; +sub.f32 f1965, f1695, f1964; +fma.rn.f32 f1966, f1949, 0f3E9E377A, f1965; +mul.f32 f1967, f1958, 0f3F167918; +mul.f32 f1968, f1960, 0f3F737871; +sub.f32 f1969, f1967, f1968; +sub.f32 f1970, f1966, f1969; +add.f32 f1971, f1969, f1966; +fma.rn.f32 f2360, f1951, 0f3E9E377A, f1696; +mul.f32 f2361, f1953, 0f3F4F1BBD; +sub.f32 f1974, f2360, f2361; +sub.f32 f1975, f1715, f1775; +sub.f32 f1977, f1735, f1755; +mul.f32 f2359, f1975, 0f3F737871; +fma.rn.f32 f1978, 
f1977, 0f3F167918, f2359; +add.f32 f1979, f1978, f1974; +sub.f32 f1980, f1974, f1978; +mul.f32 f1981, f1951, 0f3F4F1BBD; +sub.f32 f1982, f1696, f1981; +fma.rn.f32 f1983, f1953, 0f3E9E377A, f1982; +mul.f32 f1984, f1975, 0f3F167918; +mul.f32 f1985, f1977, 0f3F737871; +sub.f32 f1986, f1984, f1985; +add.f32 f1987, f1986, f1983; +sub.f32 f1988, f1983, f1986; +mul.f32 f1990, f1853, 0f3E7EA890; +mul.f32 f2358, f1836, 0f3F77F511; +sub.f32 f1991, f2358, f1990; +mul.f32 f1992, f1853, 0f3F77F511; +fma.rn.f32 f1993, f1836, 0f3E7EA890, f1992; +mul.f32 f1995, f1895, 0f3EF6A86B; +mul.f32 f2357, f1878, 0f3F6055A2; +sub.f32 f1996, f2357, f1995; +mul.f32 f1997, f1895, 0f3F6055A2; +fma.rn.f32 f1998, f1878, 0f3EF6A86B, f1997; +mul.f32 f2000, f1937, 0f3F2F3E7B; +mul.f32 f2356, f1920, 0f3F3A9DB0; +sub.f32 f2001, f2356, f2000; +mul.f32 f2002, f1937, 0f3F3A9DB0; +fma.rn.f32 f2003, f1920, 0f3F2F3E7B, f2002; +mul.f32 f2005, f1979, 0f3F5825E0; +mul.f32 f2355, f1962, 0f3F092BF2; +sub.f32 f2006, f2355, f2005; +mul.f32 f2007, f1979, 0f3F092BF2; +fma.rn.f32 f2008, f1962, 0f3F5825E0, f2007; +mul.f32 f2010, f1861, 0f3EF6A86B; +mul.f32 f2354, f1844, 0f3F6055A2; +sub.f32 f2011, f2354, f2010; +mul.f32 f2012, f1861, 0f3F6055A2; +fma.rn.f32 f2013, f1844, 0f3EF6A86B, f2012; +mul.f32 f2352, f1886, 0f3F092BF2; +mul.f32 f2353, f1903, 0f3F5825E0; +sub.f32 f2016, f2352, f2353; +mul.f32 f2017, f1903, 0f3F092BF2; +fma.rn.f32 f2018, f1886, 0f3F5825E0, f2017; +mul.f32 f2350, f1928, 0f3D809851; +mul.f32 f2351, f1945, 0f3F7F7EAE; +sub.f32 f2021, f2350, f2351; +mul.f32 f2022, f1945, 0f3D809851; +fma.rn.f32 f2023, f1928, 0f3F7F7EAE, f2022; +mul.f32 f2348, f1970, 0fBED9FFBE; +mul.f32 f2349, f1987, 0f3F67A2BF; +sub.f32 f2026, f2348, f2349; +mul.f32 f2027, f1987, 0fBED9FFBE; +fma.rn.f32 f2028, f1970, 0f3F67A2BF, f2027; +mul.f32 f2346, f1845, 0f3F3A9DB0; +mul.f32 f2347, f1862, 0f3F2F3E7B; +sub.f32 f2031, f2346, f2347; +mul.f32 f2032, f1862, 0f3F3A9DB0; +fma.rn.f32 f2033, f1845, 0f3F2F3E7B, f2032; +mul.f32 f2035, f1904, 
0f3F7F7EAE; +mul.f32 f2345, f1887, 0f3D809851; +sub.f32 f2036, f2345, f2035; +mul.f32 f2037, f1904, 0f3D809851; +fma.rn.f32 f2038, f1887, 0f3F7F7EAE, f2037; +mul.f32 f2040, f1946, 0f3F45405B; +mul.f32 f2344, f1929, 0fBF232E38; +sub.f32 f2041, f2344, f2040; +mul.f32 f2042, f1946, 0fBF232E38; +fma.rn.f32 f2043, f1929, 0f3F45405B, f2042; +mul.f32 f2045, f1988, 0f3E00575B; +mul.f32 f2343, f1971, 0fBF7DFB3B; +sub.f32 f2046, f2343, f2045; +mul.f32 f2047, f1988, 0fBF7DFB3B; +fma.rn.f32 f2048, f1971, 0f3E00575B, f2047; +mul.f32 f2050, f1854, 0f3F5825E0; +mul.f32 f2342, f1837, 0f3F092BF2; +sub.f32 f2051, f2342, f2050; +mul.f32 f2052, f1854, 0f3F092BF2; +fma.rn.f32 f2053, f1837, 0f3F5825E0, f2052; +mul.f32 f2055, f1896, 0f3F67A2BF; +mul.f32 f2341, f1879, 0fBED9FFBE; +sub.f32 f2056, f2341, f2055; +mul.f32 f2057, f1896, 0fBED9FFBE; +fma.rn.f32 f2058, f1879, 0f3F67A2BF, f2057; +mul.f32 f2339, f1921, 0fBF7DFB3B; +mul.f32 f2340, f1938, 0f3E00575B; +sub.f32 f2061, f2339, f2340; +mul.f32 f2062, f1938, 0fBF7DFB3B; +fma.rn.f32 f2063, f1921, 0f3E00575B, f2062; +mul.f32 f2337, f1963, 0fBF232E38; +mul.f32 f2338, f1980, 0fBF45405B; +sub.f32 f2066, f2337, f2338; +mul.f32 f2067, f1980, 0fBF232E38; +fma.rn.f32 f2068, f1963, 0fBF45405B, f2067; +add.f32 f2069, f1824, f1950; +add.f32 f2071, f1866, f1908; +mul.f32 f2076, f2071, 0f3F4F1BBD; +fma.rn.f32 f2336, f2069, 0f3E9E377A, f1782; +sub.f32 f2077, f2336, f2076; +add.f32 f2335, f1828, f1954; +sub.f32 f2078, f1828, f1954; +add.f32 f2334, f1870, f1912; +sub.f32 f2080, f1870, f1912; +mul.f32 f2333, f2078, 0f3F737871; +fma.rn.f32 f2081, f2080, 0f3F167918, f2333; +add.f32 f2332, f1782, f2069; +mul.f32 f2082, f2069, 0f3F4F1BBD; +sub.f32 f2083, f1782, f2082; +fma.rn.f32 f2084, f2071, 0f3E9E377A, f2083; +mul.f32 f2085, f2078, 0f3F167918; +mul.f32 f2086, f2080, 0f3F737871; +sub.f32 f2087, f2085, f2086; +fma.rn.f32 f2330, f2335, 0f3E9E377A, f1786; +mul.f32 f2331, f2334, 0f3F4F1BBD; +sub.f32 f2090, f2330, f2331; +sub.f32 f2091, f1824, f1950; +sub.f32 
f2093, f1866, f1908; +mul.f32 f2329, f2091, 0f3F737871; +fma.rn.f32 f2094, f2093, 0f3F167918, f2329; +add.f32 f2328, f1786, f2335; +mul.f32 f2095, f2335, 0f3F4F1BBD; +sub.f32 f2096, f1786, f2095; +fma.rn.f32 f2097, f2334, 0f3E9E377A, f2096; +mul.f32 f2098, f2091, 0f3F167918; +mul.f32 f2099, f2093, 0f3F737871; +sub.f32 f2100, f2098, f2099; +add.f32 f2101, f1991, f2006; +add.f32 f2103, f1996, f2001; +fma.rn.f32 f2326, f2101, 0f3E9E377A, f1794; +mul.f32 f2327, f2103, 0f3F4F1BBD; +sub.f32 f2109, f2326, f2327; +add.f32 f2325, f1993, f2008; +sub.f32 f2110, f1993, f2008; +add.f32 f2324, f1998, f2003; +sub.f32 f2112, f1998, f2003; +mul.f32 f2323, f2110, 0f3F737871; +fma.rn.f32 f2113, f2112, 0f3F167918, f2323; +add.f32 f2322, f1794, f2101; +mul.f32 f2114, f2101, 0f3F4F1BBD; +sub.f32 f2115, f1794, f2114; +fma.rn.f32 f2116, f2103, 0f3E9E377A, f2115; +mul.f32 f2117, f2110, 0f3F167918; +mul.f32 f2118, f2112, 0f3F737871; +sub.f32 f2119, f2117, f2118; +mul.f32 f2121, f2324, 0f3F4F1BBD; +fma.rn.f32 f2321, f2325, 0f3E9E377A, f1811; +sub.f32 f2122, f2321, f2121; +sub.f32 f2123, f1991, f2006; +sub.f32 f2125, f1996, f2001; +mul.f32 f2320, f2123, 0f3F737871; +fma.rn.f32 f2126, f2125, 0f3F167918, f2320; +add.f32 f2319, f1811, f2325; +mul.f32 f2127, f2325, 0f3F4F1BBD; +sub.f32 f2128, f1811, f2127; +fma.rn.f32 f2129, f2324, 0f3E9E377A, f2128; +mul.f32 f2130, f2123, 0f3F167918; +mul.f32 f2131, f2125, 0f3F737871; +sub.f32 f2132, f2130, f2131; +add.f32 f2133, f2011, f2026; +add.f32 f2135, f2016, f2021; +mul.f32 f2140, f2135, 0f3F4F1BBD; +fma.rn.f32 f2318, f2133, 0f3E9E377A, f1802; +sub.f32 f2141, f2318, f2140; +add.f32 f2317, f2013, f2028; +sub.f32 f2142, f2013, f2028; +add.f32 f2316, f2018, f2023; +sub.f32 f2144, f2018, f2023; +mul.f32 f2315, f2142, 0f3F737871; +fma.rn.f32 f2145, f2144, 0f3F167918, f2315; +add.f32 f2314, f1802, f2133; +mul.f32 f2146, f2133, 0f3F4F1BBD; +sub.f32 f2147, f1802, f2146; +fma.rn.f32 f2148, f2135, 0f3E9E377A, f2147; +mul.f32 f2149, f2142, 0f3F167918; +mul.f32 
f2150, f2144, 0f3F737871; +sub.f32 f2151, f2149, f2150; +fma.rn.f32 f2312, f2317, 0f3E9E377A, f1819; +mul.f32 f2313, f2316, 0f3F4F1BBD; +sub.f32 f2154, f2312, f2313; +sub.f32 f2155, f2011, f2026; +sub.f32 f2157, f2016, f2021; +mul.f32 f2311, f2155, 0f3F737871; +fma.rn.f32 f2158, f2157, 0f3F167918, f2311; +add.f32 f2310, f1819, f2317; +mul.f32 f2159, f2317, 0f3F4F1BBD; +sub.f32 f2160, f1819, f2159; +fma.rn.f32 f2161, f2316, 0f3E9E377A, f2160; +mul.f32 f2162, f2155, 0f3F167918; +mul.f32 f2163, f2157, 0f3F737871; +sub.f32 f2164, f2162, f2163; +add.f32 f2165, f2031, f2046; +add.f32 f2167, f2036, f2041; +fma.rn.f32 f2308, f2165, 0f3E9E377A, f1803; +mul.f32 f2309, f2167, 0f3F4F1BBD; +sub.f32 f2173, f2308, f2309; +add.f32 f2307, f2033, f2048; +sub.f32 f2174, f2033, f2048; +add.f32 f2306, f2038, f2043; +sub.f32 f2176, f2038, f2043; +mul.f32 f2305, f2174, 0f3F737871; +fma.rn.f32 f2177, f2176, 0f3F167918, f2305; +add.f32 f2304, f1803, f2165; +mul.f32 f2178, f2165, 0f3F4F1BBD; +sub.f32 f2179, f1803, f2178; +fma.rn.f32 f2180, f2167, 0f3E9E377A, f2179; +mul.f32 f2181, f2174, 0f3F167918; +mul.f32 f2182, f2176, 0f3F737871; +sub.f32 f2183, f2181, f2182; +mul.f32 f2185, f2306, 0f3F4F1BBD; +fma.rn.f32 f2303, f2307, 0f3E9E377A, f1820; +sub.f32 f2186, f2303, f2185; +sub.f32 f2187, f2031, f2046; +sub.f32 f2189, f2036, f2041; +mul.f32 f2302, f2187, 0f3F737871; +fma.rn.f32 f2190, f2189, 0f3F167918, f2302; +add.f32 f2301, f1820, f2307; +mul.f32 f2191, f2307, 0f3F4F1BBD; +sub.f32 f2192, f1820, f2191; +fma.rn.f32 f2193, f2306, 0f3E9E377A, f2192; +mul.f32 f2194, f2187, 0f3F167918; +mul.f32 f2195, f2189, 0f3F737871; +sub.f32 f2196, f2194, f2195; +add.f32 f2197, f2051, f2066; +add.f32 f2199, f2056, f2061; +mul.f32 f2204, f2199, 0f3F4F1BBD; +fma.rn.f32 f2300, f2197, 0f3E9E377A, f1795; +sub.f32 f2205, f2300, f2204; +add.f32 f2299, f2053, f2068; +sub.f32 f2206, f2053, f2068; +add.f32 f2298, f2058, f2063; +sub.f32 f2208, f2058, f2063; +mul.f32 f2297, f2206, 0f3F737871; +fma.rn.f32 f2209, f2208, 
0f3F167918, f2297; +add.f32 f2296, f1795, f2197; +mul.f32 f2210, f2197, 0f3F4F1BBD; +sub.f32 f2211, f1795, f2210; +fma.rn.f32 f2212, f2199, 0f3E9E377A, f2211; +mul.f32 f2213, f2206, 0f3F167918; +mul.f32 f2214, f2208, 0f3F737871; +sub.f32 f2215, f2213, f2214; +fma.rn.f32 f2294, f2299, 0f3E9E377A, f1812; +mul.f32 f2295, f2298, 0f3F4F1BBD; +sub.f32 f2218, f2294, f2295; +sub.f32 f2219, f2051, f2066; +sub.f32 f2221, f2056, f2061; +mul.f32 f2293, f2219, 0f3F737871; +fma.rn.f32 f2222, f2221, 0f3F167918, f2293; +add.f32 f2292, f1812, f2299; +mul.f32 f2223, f2299, 0f3F4F1BBD; +sub.f32 f2224, f1812, f2223; +fma.rn.f32 f2225, f2298, 0f3E9E377A, f2224; +mul.f32 f2226, f2219, 0f3F167918; +mul.f32 f2227, f2221, 0f3F737871; +sub.f32 f2228, f2226, f2227; +add.f32 %1, f2334, f2328; +add.f32 %0, f2071, f2332; +add.f32 %3, f2324, f2319; +add.f32 %2, f2103, f2322; +add.f32 %5, f2316, f2310; +add.f32 %4, f2135, f2314; +add.f32 %7, f2306, f2301; +add.f32 %6, f2167, f2304; +add.f32 %9, f2298, f2292; +add.f32 %8, f2199, f2296; +sub.f32 %10, f2077, f2081; +add.f32 %11, f2094, f2090; +sub.f32 %12, f2109, f2113; +add.f32 %13, f2126, f2122; +sub.f32 %14, f2141, f2145; +add.f32 %15, f2158, f2154; +sub.f32 %16, f2173, f2177; +add.f32 %17, f2190, f2186; +add.f32 %19, f2222, f2218; +sub.f32 %18, f2205, f2209; +add.f32 %21, f2100, f2097; +sub.f32 %20, f2084, f2087; +add.f32 %23, f2132, f2129; +sub.f32 %22, f2116, f2119; +sub.f32 %24, f2148, f2151; +add.f32 %25, f2164, f2161; +sub.f32 %26, f2180, f2183; +add.f32 %27, f2196, f2193; +sub.f32 %28, f2212, f2215; +add.f32 %29, f2228, f2225; +sub.f32 %31, f2097, f2100; +add.f32 %30, f2087, f2084; +sub.f32 %33, f2129, f2132; +add.f32 %32, f2119, f2116; +sub.f32 %35, f2161, f2164; +add.f32 %34, f2151, f2148; +sub.f32 %37, f2193, f2196; +add.f32 %36, f2183, f2180; +sub.f32 %39, f2225, f2228; +add.f32 %38, f2215, f2212; +sub.f32 %41, f2090, f2094; +add.f32 %40, f2081, f2077; +sub.f32 %43, f2122, f2126; +add.f32 %42, f2113, f2109; +sub.f32 %45, f2154, f2158; 
+add.f32 %44, f2145, f2141; +sub.f32 %47, f2186, f2190; +add.f32 %46, f2177, f2173; +sub.f32 %49, f2218, f2222; +add.f32 %48, f2209, f2205; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_15625), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), 
"f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<372, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2179>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 62500, r2; +add.f32 f101, %66, %106; +add.f32 f102, %53, f101; +add.f32 f103, %79, %93; +add.f32 f104, f103, f102; +add.f32 f105, %68, %108; +add.f32 f106, %54, f105; +add.f32 f107, %81, %94; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %53; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %68, %108; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %81, %94; +fma.rn.f32 f115, f114, 0f3F167918, f113; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %53, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +fma.rn.f32 f126, f105, 0f3E9E377A, %54; +mul.f32 f127, f107, 0f3F4F1BBD; +sub.f32 f128, f126, f127; +sub.f32 f129, %66, %106; +mul.f32 f130, f129, 0f3F737871; +sub.f32 f131, %79, %93; +fma.rn.f32 f132, f131, 0f3F167918, f130; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %54, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %69, %109; +add.f32 f144, %55, f143; +add.f32 f145, %82, %95; +add.f32 f146, f145, f144; +add.f32 f147, %70, %110; +add.f32 f148, %57, f147; +add.f32 f149, %84, %97; +add.f32 f150, f149, f148; +fma.rn.f32 f151, f143, 0f3E9E377A, %55; 
+mul.f32 f152, f145, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %70, %110; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %84, %97; +fma.rn.f32 f157, f156, 0f3F167918, f155; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %55, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +fma.rn.f32 f168, f147, 0f3E9E377A, %57; +mul.f32 f169, f149, 0f3F4F1BBD; +sub.f32 f170, f168, f169; +sub.f32 f171, %69, %109; +mul.f32 f172, f171, 0f3F737871; +sub.f32 f173, %82, %95; +fma.rn.f32 f174, f173, 0f3F167918, f172; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %57, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %71, %111; +add.f32 f186, %58, f185; +add.f32 f187, %85, %98; +add.f32 f188, f187, f186; +add.f32 f189, %73, %113; +add.f32 f190, %60, f189; +add.f32 f191, %86, %100; +add.f32 f192, f191, f190; +fma.rn.f32 f193, f185, 0f3E9E377A, %58; +mul.f32 f194, f187, 0f3F4F1BBD; +sub.f32 f195, f193, f194; +sub.f32 f196, %73, %113; +mul.f32 f197, f196, 0f3F737871; +sub.f32 f198, %86, %100; +fma.rn.f32 f199, f198, 0f3F167918, f197; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %58, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f210, f189, 0f3E9E377A, %60; +mul.f32 f211, f191, 0f3F4F1BBD; +sub.f32 f212, f210, f211; +sub.f32 f213, %71, %111; +mul.f32 f214, f213, 0f3F737871; +sub.f32 f215, %85, %98; +fma.rn.f32 f216, f215, 0f3F167918, f214; +add.f32 f217, 
f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %60, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %74, %114; +add.f32 f228, %61, f227; +add.f32 f229, %87, %101; +add.f32 f230, f229, f228; +add.f32 f231, %76, %116; +add.f32 f232, %62, f231; +add.f32 f233, %89, %102; +add.f32 f234, f233, f232; +fma.rn.f32 f235, f227, 0f3E9E377A, %61; +mul.f32 f236, f229, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %76, %116; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %89, %102; +fma.rn.f32 f241, f240, 0f3F167918, f239; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %61, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +fma.rn.f32 f252, f231, 0f3E9E377A, %62; +mul.f32 f253, f233, 0f3F4F1BBD; +sub.f32 f254, f252, f253; +sub.f32 f255, %74, %114; +mul.f32 f256, f255, 0f3F737871; +sub.f32 f257, %87, %101; +fma.rn.f32 f258, f257, 0f3F167918, f256; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %62, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %77, %117; +add.f32 f270, %63, f269; +add.f32 f271, %90, %103; +add.f32 f272, f271, f270; +add.f32 f273, %78, %118; +add.f32 f274, %65, f273; +add.f32 f275, %92, %105; +add.f32 f276, f275, f274; +fma.rn.f32 f277, f269, 0f3E9E377A, %63; +mul.f32 f278, f271, 0f3F4F1BBD; +sub.f32 f279, f277, f278; +sub.f32 f280, %78, %118; +mul.f32 f281, f280, 0f3F737871; +sub.f32 f282, %92, %105; +fma.rn.f32 f283, f282, 0f3F167918, f281; 
+sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %63, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +fma.rn.f32 f294, f273, 0f3E9E377A, %65; +mul.f32 f295, f275, 0f3F4F1BBD; +sub.f32 f296, f294, f295; +sub.f32 f297, %77, %117; +mul.f32 f298, f297, 0f3F737871; +sub.f32 f299, %90, %103; +fma.rn.f32 f300, f299, 0f3F167918, f298; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %65, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mov.u32 r4, %tid.x; +mul.f32 f311, f158, 0f3F77F511; +mul.f32 f312, f175, 0f3E7EA890; +sub.f32 f313, f311, f312; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f316, f200, 0f3F6055A2; +mul.f32 f317, f217, 0f3EF6A86B; +sub.f32 f318, f316, f317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f321, f242, 0f3F3A9DB0; +mul.f32 f322, f259, 0f3F2F3E7B; +sub.f32 f323, f321, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f326, f284, 0f3F092BF2; +mul.f32 f327, f301, 0f3F5825E0; +sub.f32 f328, f326, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f331, f166, 0f3F6055A2; +mul.f32 f332, f183, 0f3EF6A86B; +sub.f32 f333, f331, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f336, f208, 0f3F092BF2; +mul.f32 f337, f225, 0f3F5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f341, f250, 0f3D809851; +mul.f32 f342, f267, 0f3F7F7EAE; +sub.f32 f343, f341, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 
f345, f250, 0f3F7F7EAE, f344; +mul.f32 f346, f292, 0fBED9FFBE; +mul.f32 f347, f309, 0f3F67A2BF; +sub.f32 f348, f346, f347; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f351, f167, 0f3F3A9DB0; +mul.f32 f352, f184, 0f3F2F3E7B; +sub.f32 f353, f351, f352; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f356, f209, 0f3D809851; +mul.f32 f357, f226, 0f3F7F7EAE; +sub.f32 f358, f356, f357; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f361, f251, 0fBF232E38; +mul.f32 f362, f268, 0f3F45405B; +sub.f32 f363, f361, f362; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f366, f293, 0fBF7DFB3B; +mul.f32 f367, f310, 0f3E00575B; +sub.f32 f368, f366, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f371, f159, 0f3F092BF2; +mul.f32 f372, f176, 0f3F5825E0; +sub.f32 f373, f371, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f376, f201, 0fBED9FFBE; +mul.f32 f377, f218, 0f3F67A2BF; +sub.f32 f378, f376, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f381, f243, 0fBF7DFB3B; +mul.f32 f382, f260, 0f3E00575B; +sub.f32 f383, f381, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f386, f285, 0fBF232E38; +mul.f32 f387, f302, 0fBF45405B; +sub.f32 f388, f386, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f392, f104, f391; +add.f32 f393, f188, f230; +add.f32 f394, f393, f392; +add.f32 f395, f150, f276; +add.f32 f396, f108, f395; +add.f32 f397, f192, f234; +add.f32 f398, f397, f396; +fma.rn.f32 f399, f391, 0f3E9E377A, f104; +mul.f32 f400, f393, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, f150, f276; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, f192, f234; +fma.rn.f32 f405, f404, 0f3F167918, f403; +sub.f32 
f406, f401, f405; +add.f32 f407, f405, f401; +mul.f32 f408, f391, 0f3F4F1BBD; +sub.f32 f409, f104, f408; +fma.rn.f32 f410, f393, 0f3E9E377A, f409; +mul.f32 f411, f402, 0f3F167918; +mul.f32 f412, f404, 0f3F737871; +sub.f32 f413, f411, f412; +sub.f32 f414, f410, f413; +add.f32 f415, f413, f410; +fma.rn.f32 f416, f395, 0f3E9E377A, f108; +mul.f32 f417, f397, 0f3F4F1BBD; +sub.f32 f418, f416, f417; +sub.f32 f419, f146, f272; +mul.f32 f420, f419, 0f3F737871; +sub.f32 f421, f188, f230; +fma.rn.f32 f422, f421, 0f3F167918, f420; +add.f32 f423, f422, f418; +sub.f32 f424, f418, f422; +mul.f32 f425, f395, 0f3F4F1BBD; +sub.f32 f426, f108, f425; +fma.rn.f32 f427, f397, 0f3E9E377A, f426; +mul.f32 f428, f419, 0f3F167918; +mul.f32 f429, f421, 0f3F737871; +sub.f32 f430, f428, f429; +add.f32 f431, f430, f427; +sub.f32 f432, f427, f430; +add.f32 f433, f313, f328; +add.f32 f434, f116, f433; +add.f32 f435, f318, f323; +add.f32 f436, f435, f434; +add.f32 f437, f315, f330; +add.f32 f438, f133, f437; +add.f32 f439, f320, f325; +add.f32 f440, f439, f438; +fma.rn.f32 f441, f433, 0f3E9E377A, f116; +mul.f32 f442, f435, 0f3F4F1BBD; +sub.f32 f443, f441, f442; +sub.f32 f444, f315, f330; +mul.f32 f445, f444, 0f3F737871; +sub.f32 f446, f320, f325; +fma.rn.f32 f447, f446, 0f3F167918, f445; +sub.f32 f448, f443, f447; +add.f32 f449, f447, f443; +mul.f32 f450, f433, 0f3F4F1BBD; +sub.f32 f451, f116, f450; +fma.rn.f32 f452, f435, 0f3E9E377A, f451; +mul.f32 f453, f444, 0f3F167918; +mul.f32 f454, f446, 0f3F737871; +sub.f32 f455, f453, f454; +sub.f32 f456, f452, f455; +add.f32 f457, f455, f452; +fma.rn.f32 f458, f437, 0f3E9E377A, f133; +mul.f32 f459, f439, 0f3F4F1BBD; +sub.f32 f460, f458, f459; +sub.f32 f461, f313, f328; +mul.f32 f462, f461, 0f3F737871; +sub.f32 f463, f318, f323; +fma.rn.f32 f464, f463, 0f3F167918, f462; +add.f32 f465, f464, f460; +sub.f32 f466, f460, f464; +mul.f32 f467, f437, 0f3F4F1BBD; +sub.f32 f468, f133, f467; +fma.rn.f32 f469, f439, 0f3E9E377A, f468; +mul.f32 f470, f461, 0f3F167918; 
+mul.f32 f471, f463, 0f3F737871; +sub.f32 f472, f470, f471; +add.f32 f473, f472, f469; +sub.f32 f474, f469, f472; +add.f32 f475, f333, f348; +add.f32 f476, f124, f475; +add.f32 f477, f338, f343; +add.f32 f478, f477, f476; +add.f32 f479, f335, f350; +add.f32 f480, f141, f479; +add.f32 f481, f340, f345; +add.f32 f482, f481, f480; +fma.rn.f32 f483, f475, 0f3E9E377A, f124; +mul.f32 f484, f477, 0f3F4F1BBD; +sub.f32 f485, f483, f484; +sub.f32 f486, f335, f350; +mul.f32 f487, f486, 0f3F737871; +sub.f32 f488, f340, f345; +fma.rn.f32 f489, f488, 0f3F167918, f487; +sub.f32 f490, f485, f489; +add.f32 f491, f489, f485; +mul.f32 f492, f475, 0f3F4F1BBD; +sub.f32 f493, f124, f492; +fma.rn.f32 f494, f477, 0f3E9E377A, f493; +mul.f32 f495, f486, 0f3F167918; +mul.f32 f496, f488, 0f3F737871; +sub.f32 f497, f495, f496; +sub.f32 f498, f494, f497; +add.f32 f499, f497, f494; +fma.rn.f32 f500, f479, 0f3E9E377A, f141; +mul.f32 f501, f481, 0f3F4F1BBD; +sub.f32 f502, f500, f501; +sub.f32 f503, f333, f348; +mul.f32 f504, f503, 0f3F737871; +sub.f32 f505, f338, f343; +fma.rn.f32 f506, f505, 0f3F167918, f504; +add.f32 f507, f506, f502; +sub.f32 f508, f502, f506; +mul.f32 f509, f479, 0f3F4F1BBD; +sub.f32 f510, f141, f509; +fma.rn.f32 f511, f481, 0f3E9E377A, f510; +mul.f32 f512, f503, 0f3F167918; +mul.f32 f513, f505, 0f3F737871; +sub.f32 f514, f512, f513; +add.f32 f515, f514, f511; +sub.f32 f516, f511, f514; +add.f32 f517, f353, f368; +add.f32 f518, f125, f517; +add.f32 f519, f358, f363; +add.f32 f520, f519, f518; +add.f32 f521, f355, f370; +add.f32 f522, f142, f521; +add.f32 f523, f360, f365; +add.f32 f524, f523, f522; +fma.rn.f32 f525, f517, 0f3E9E377A, f125; +mul.f32 f526, f519, 0f3F4F1BBD; +sub.f32 f527, f525, f526; +sub.f32 f528, f355, f370; +mul.f32 f529, f528, 0f3F737871; +sub.f32 f530, f360, f365; +fma.rn.f32 f531, f530, 0f3F167918, f529; +sub.f32 f532, f527, f531; +add.f32 f533, f531, f527; +mul.f32 f534, f517, 0f3F4F1BBD; +sub.f32 f535, f125, f534; +fma.rn.f32 f536, f519, 0f3E9E377A, 
f535; +mul.f32 f537, f528, 0f3F167918; +mul.f32 f538, f530, 0f3F737871; +sub.f32 f539, f537, f538; +sub.f32 f540, f536, f539; +add.f32 f541, f539, f536; +fma.rn.f32 f542, f521, 0f3E9E377A, f142; +mul.f32 f543, f523, 0f3F4F1BBD; +sub.f32 f544, f542, f543; +sub.f32 f545, f353, f368; +mul.f32 f546, f545, 0f3F737871; +sub.f32 f547, f358, f363; +fma.rn.f32 f548, f547, 0f3F167918, f546; +add.f32 f549, f548, f544; +sub.f32 f550, f544, f548; +mul.f32 f551, f521, 0f3F4F1BBD; +sub.f32 f552, f142, f551; +fma.rn.f32 f553, f523, 0f3E9E377A, f552; +mul.f32 f554, f545, 0f3F167918; +mul.f32 f555, f547, 0f3F737871; +sub.f32 f556, f554, f555; +add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f373, f388; +add.f32 f560, f117, f559; +add.f32 f561, f378, f383; +add.f32 f562, f561, f560; +add.f32 f563, f375, f390; +add.f32 f564, f134, f563; +add.f32 f565, f380, f385; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f117; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f375, f390; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f380, f385; +fma.rn.f32 f573, f572, 0f3F167918, f571; +sub.f32 f574, f569, f573; +add.f32 f575, f573, f569; +mul.f32 f576, f559, 0f3F4F1BBD; +sub.f32 f577, f117, f576; +fma.rn.f32 f578, f561, 0f3E9E377A, f577; +mul.f32 f579, f570, 0f3F167918; +mul.f32 f580, f572, 0f3F737871; +sub.f32 f581, f579, f580; +sub.f32 f582, f578, f581; +add.f32 f583, f581, f578; +fma.rn.f32 f584, f563, 0f3E9E377A, f134; +mul.f32 f585, f565, 0f3F4F1BBD; +sub.f32 f586, f584, f585; +sub.f32 f587, f373, f388; +mul.f32 f588, f587, 0f3F737871; +sub.f32 f589, f378, f383; +fma.rn.f32 f590, f589, 0f3F167918, f588; +add.f32 f591, f590, f586; +sub.f32 f592, f586, f590; +mul.f32 f593, f563, 0f3F4F1BBD; +sub.f32 f594, f134, f593; +fma.rn.f32 f595, f565, 0f3E9E377A, f594; +mul.f32 f596, f587, 0f3F167918; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +add.f32 f599, f598, f595; +sub.f32 f600, f595, f598; +mul.wide.u32 rd2, r4, 
-776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f601, f602}, [rd6]; +mul.f32 f605, f440, f602; +fma.rn.f32 f606, f601, f436, f605; +mul.f32 f607, f436, f602; +mul.f32 f608, f601, f440; +sub.f32 f609, f608, f607; +mul.f32 f610, f601, f601; +mul.f32 f611, f602, f602; +sub.f32 f612, f610, f611; +mul.f32 f613, f602, f601; +fma.rn.f32 f614, f602, f601, f613; +mul.f32 f615, f482, f614; +fma.rn.f32 f616, f612, f478, f615; +mul.f32 f617, f478, f614; +mul.f32 f618, f612, f482; +sub.f32 f619, f618, f617; +mul.f32 f620, f601, f612; +mul.f32 f621, f602, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f601, f614; +fma.rn.f32 f624, f602, f612, f623; +mul.f32 f625, f524, f624; +fma.rn.f32 f626, f622, f520, f625; +mul.f32 f627, f520, f624; +mul.f32 f628, f622, f524; +sub.f32 f629, f628, f627; +mul.f32 f630, f601, f622; +mul.f32 f631, f602, f624; +sub.f32 f632, f630, f631; +mul.f32 f633, f601, f624; +fma.rn.f32 f634, f602, f622, f633; +mul.f32 f635, f566, f634; +fma.rn.f32 f636, f632, f562, f635; +mul.f32 f637, f562, f634; +mul.f32 f638, f632, f566; +sub.f32 f639, f638, f637; +mul.f32 f640, f601, f632; +mul.f32 f641, f602, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f601, f634; +fma.rn.f32 f644, f602, f632, f643; +mul.f32 f645, f423, f644; +fma.rn.f32 f646, f642, f406, f645; +mul.f32 f647, f406, f644; +mul.f32 f648, f642, f423; +sub.f32 f649, f648, f647; +mul.f32 f650, f601, f642; +mul.f32 f651, f602, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f601, f644; +fma.rn.f32 f654, f602, f642, f653; +mul.f32 f655, f465, f654; +fma.rn.f32 f656, f652, f448, f655; +mul.f32 f657, f448, f654; +mul.f32 f658, f652, f465; +sub.f32 f659, f658, f657; +mul.f32 f660, f601, f652; +mul.f32 f661, f602, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f601, f654; +fma.rn.f32 f664, f602, f652, f663; +mul.f32 f665, f507, f664; +fma.rn.f32 f666, f662, f490, 
f665; +mul.f32 f667, f490, f664; +mul.f32 f668, f662, f507; +sub.f32 f669, f668, f667; +mul.f32 f670, f601, f662; +mul.f32 f671, f602, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f601, f664; +fma.rn.f32 f674, f602, f662, f673; +mul.f32 f675, f549, f674; +fma.rn.f32 f676, f672, f532, f675; +mul.f32 f677, f532, f674; +mul.f32 f678, f672, f549; +sub.f32 f679, f678, f677; +mul.f32 f680, f601, f672; +mul.f32 f681, f602, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f601, f674; +fma.rn.f32 f684, f602, f672, f683; +mul.f32 f685, f591, f684; +fma.rn.f32 f686, f682, f574, f685; +mul.f32 f687, f574, f684; +mul.f32 f688, f682, f591; +sub.f32 f689, f688, f687; +mul.f32 f690, f601, f682; +mul.f32 f691, f602, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f601, f684; +fma.rn.f32 f694, f602, f682, f693; +mul.f32 f695, f431, f694; +fma.rn.f32 f696, f692, f414, f695; +mul.f32 f697, f414, f694; +mul.f32 f698, f692, f431; +sub.f32 f699, f698, f697; +mul.f32 f700, f601, f692; +mul.f32 f701, f602, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f601, f694; +fma.rn.f32 f704, f602, f692, f703; +mul.f32 f705, f473, f704; +fma.rn.f32 f706, f702, f456, f705; +mul.f32 f707, f456, f704; +mul.f32 f708, f702, f473; +sub.f32 f709, f708, f707; +mul.f32 f710, f601, f702; +mul.f32 f711, f602, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f601, f704; +fma.rn.f32 f714, f602, f702, f713; +mul.f32 f715, f515, f714; +fma.rn.f32 f716, f712, f498, f715; +mul.f32 f717, f498, f714; +mul.f32 f718, f712, f515; +sub.f32 f719, f718, f717; +mul.f32 f720, f601, f712; +mul.f32 f721, f602, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f601, f714; +fma.rn.f32 f724, f602, f712, f723; +mul.f32 f725, f557, f724; +fma.rn.f32 f726, f722, f540, f725; +mul.f32 f727, f540, f724; +mul.f32 f728, f722, f557; +sub.f32 f729, f728, f727; +mul.f32 f730, f601, f722; +mul.f32 f731, f602, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f601, f724; +fma.rn.f32 f734, f602, f722, f733; +mul.f32 f735, f599, f734; +fma.rn.f32 
f736, f732, f582, f735; +mul.f32 f737, f582, f734; +mul.f32 f738, f732, f599; +sub.f32 f739, f738, f737; +mul.f32 f740, f601, f732; +mul.f32 f741, f602, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f601, f734; +fma.rn.f32 f744, f602, f732, f743; +mul.f32 f745, f432, f744; +fma.rn.f32 f746, f742, f415, f745; +mul.f32 f747, f415, f744; +mul.f32 f748, f742, f432; +sub.f32 f749, f748, f747; +mul.f32 f750, f601, f742; +mul.f32 f751, f602, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f601, f744; +fma.rn.f32 f754, f602, f742, f753; +mul.f32 f755, f474, f754; +fma.rn.f32 f756, f752, f457, f755; +mul.f32 f757, f457, f754; +mul.f32 f758, f752, f474; +sub.f32 f759, f758, f757; +mul.f32 f760, f601, f752; +mul.f32 f761, f602, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f601, f754; +fma.rn.f32 f764, f602, f752, f763; +mul.f32 f765, f516, f764; +fma.rn.f32 f766, f762, f499, f765; +mul.f32 f767, f499, f764; +mul.f32 f768, f762, f516; +sub.f32 f769, f768, f767; +mul.f32 f770, f601, f762; +mul.f32 f771, f602, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f601, f764; +fma.rn.f32 f774, f602, f762, f773; +mul.f32 f775, f558, f774; +fma.rn.f32 f776, f772, f541, f775; +mul.f32 f777, f541, f774; +mul.f32 f778, f772, f558; +sub.f32 f779, f778, f777; +mul.f32 f780, f601, f772; +mul.f32 f781, f602, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f601, f774; +fma.rn.f32 f784, f602, f772, f783; +mul.f32 f785, f600, f784; +fma.rn.f32 f786, f782, f583, f785; +mul.f32 f787, f583, f784; +mul.f32 f788, f782, f600; +sub.f32 f789, f788, f787; +mul.f32 f790, f601, f782; +mul.f32 f791, f602, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f601, f784; +fma.rn.f32 f794, f602, f782, f793; +mul.f32 f795, f424, f794; +fma.rn.f32 f796, f792, f407, f795; +mul.f32 f797, f407, f794; +mul.f32 f798, f792, f424; +sub.f32 f799, f798, f797; +mul.f32 f800, f601, f792; +mul.f32 f801, f602, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f601, f794; +fma.rn.f32 f804, f602, f792, f803; +mul.f32 f805, f466, 
f804; +fma.rn.f32 f806, f802, f449, f805; +mul.f32 f807, f449, f804; +mul.f32 f808, f802, f466; +sub.f32 f809, f808, f807; +mul.f32 f810, f601, f802; +mul.f32 f811, f602, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f601, f804; +fma.rn.f32 f814, f602, f802, f813; +mul.f32 f815, f508, f814; +fma.rn.f32 f816, f812, f491, f815; +mul.f32 f817, f491, f814; +mul.f32 f818, f812, f508; +sub.f32 f819, f818, f817; +mul.f32 f820, f601, f812; +mul.f32 f821, f602, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f601, f814; +fma.rn.f32 f824, f602, f812, f823; +mul.f32 f825, f550, f824; +fma.rn.f32 f826, f822, f533, f825; +mul.f32 f827, f533, f824; +mul.f32 f828, f822, f550; +sub.f32 f829, f828, f827; +mul.f32 f830, f601, f822; +mul.f32 f831, f602, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f601, f824; +fma.rn.f32 f834, f602, f822, f833; +mul.f32 f835, f592, f834; +fma.rn.f32 f836, f832, f575, f835; +mul.f32 f837, f575, f834; +mul.f32 f838, f832, f592; +sub.f32 f839, f838, f837; +mad.lo.s32 r8, r5, 62500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f394; +st.shared.f32 [r9+4], f606; +st.shared.f32 [r9+8], f616; +st.shared.f32 [r9+12], f626; +st.shared.f32 [r9+16], f636; +st.shared.f32 [r9+20], f646; +st.shared.f32 [r9+24], f656; +st.shared.f32 [r9+28], f666; +st.shared.f32 [r9+32], f676; +st.shared.f32 [r9+36], f686; +st.shared.f32 [r9+40], f696; +st.shared.f32 [r9+44], f706; +st.shared.f32 [r9+48], f716; +st.shared.f32 [r9+52], f726; +st.shared.f32 [r9+56], f736; +st.shared.f32 [r9+60], f746; +st.shared.f32 [r9+64], f756; +st.shared.f32 [r9+68], f766; +st.shared.f32 [r9+72], f776; +st.shared.f32 [r9+76], f786; +st.shared.f32 [r9+80], f796; +st.shared.f32 [r9+84], f806; +st.shared.f32 [r9+88], f816; +st.shared.f32 [r9+92], f826; +st.shared.f32 [r9+96], f836; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f840, [r10]; +ld.shared.f32 f841, [r10+2500]; +ld.shared.f32 f842, [r10+5000]; +ld.shared.f32 f843, [r10+7500]; +ld.shared.f32 
f844, [r10+10000]; +ld.shared.f32 f845, [r10+12500]; +ld.shared.f32 f846, [r10+15000]; +ld.shared.f32 f847, [r10+17500]; +ld.shared.f32 f848, [r10+20000]; +ld.shared.f32 f849, [r10+22500]; +ld.shared.f32 f850, [r10+25000]; +ld.shared.f32 f851, [r10+27500]; +ld.shared.f32 f852, [r10+30000]; +ld.shared.f32 f853, [r10+32500]; +ld.shared.f32 f854, [r10+35000]; +ld.shared.f32 f855, [r10+37500]; +ld.shared.f32 f856, [r10+40000]; +ld.shared.f32 f857, [r10+42500]; +ld.shared.f32 f858, [r10+45000]; +ld.shared.f32 f859, [r10+47500]; +ld.shared.f32 f860, [r10+50000]; +ld.shared.f32 f861, [r10+52500]; +ld.shared.f32 f862, [r10+55000]; +ld.shared.f32 f863, [r10+57500]; +ld.shared.f32 f864, [r10+60000]; +barrier.sync 0; +st.shared.f32 [r9], f398; +st.shared.f32 [r9+4], f609; +st.shared.f32 [r9+8], f619; +st.shared.f32 [r9+12], f629; +st.shared.f32 [r9+16], f639; +st.shared.f32 [r9+20], f649; +st.shared.f32 [r9+24], f659; +st.shared.f32 [r9+28], f669; +st.shared.f32 [r9+32], f679; +st.shared.f32 [r9+36], f689; +st.shared.f32 [r9+40], f699; +st.shared.f32 [r9+44], f709; +st.shared.f32 [r9+48], f719; +st.shared.f32 [r9+52], f729; +st.shared.f32 [r9+56], f739; +st.shared.f32 [r9+60], f749; +st.shared.f32 [r9+64], f759; +st.shared.f32 [r9+68], f769; +st.shared.f32 [r9+72], f779; +st.shared.f32 [r9+76], f789; +st.shared.f32 [r9+80], f799; +st.shared.f32 [r9+84], f809; +st.shared.f32 [r9+88], f819; +st.shared.f32 [r9+92], f829; +st.shared.f32 [r9+96], f839; +barrier.sync 0; +ld.shared.f32 f865, [r10]; +ld.shared.f32 f866, [r10+2500]; +ld.shared.f32 f867, [r10+5000]; +ld.shared.f32 f868, [r10+7500]; +ld.shared.f32 f869, [r10+10000]; +ld.shared.f32 f870, [r10+12500]; +ld.shared.f32 f871, [r10+15000]; +ld.shared.f32 f872, [r10+17500]; +ld.shared.f32 f873, [r10+20000]; +ld.shared.f32 f874, [r10+22500]; +ld.shared.f32 f875, [r10+25000]; +ld.shared.f32 f876, [r10+27500]; +ld.shared.f32 f877, [r10+30000]; +ld.shared.f32 f878, [r10+32500]; +ld.shared.f32 f879, [r10+35000]; +ld.shared.f32 f880, 
[r10+37500]; +ld.shared.f32 f881, [r10+40000]; +ld.shared.f32 f882, [r10+42500]; +ld.shared.f32 f883, [r10+45000]; +ld.shared.f32 f884, [r10+47500]; +ld.shared.f32 f885, [r10+50000]; +ld.shared.f32 f886, [r10+52500]; +ld.shared.f32 f887, [r10+55000]; +ld.shared.f32 f888, [r10+57500]; +ld.shared.f32 f889, [r10+60000]; +add.f32 f890, f845, f860; +add.f32 f891, f840, f890; +add.f32 f892, f850, f855; +add.f32 f893, f892, f891; +add.f32 f894, f870, f885; +add.f32 f895, f865, f894; +add.f32 f896, f875, f880; +add.f32 f897, f896, f895; +fma.rn.f32 f898, f890, 0f3E9E377A, f840; +mul.f32 f899, f892, 0f3F4F1BBD; +sub.f32 f900, f898, f899; +sub.f32 f901, f870, f885; +mul.f32 f902, f901, 0f3F737871; +sub.f32 f903, f875, f880; +fma.rn.f32 f904, f903, 0f3F167918, f902; +sub.f32 f905, f900, f904; +add.f32 f906, f904, f900; +mul.f32 f907, f890, 0f3F4F1BBD; +sub.f32 f908, f840, f907; +fma.rn.f32 f909, f892, 0f3E9E377A, f908; +mul.f32 f910, f901, 0f3F167918; +mul.f32 f911, f903, 0f3F737871; +sub.f32 f912, f910, f911; +sub.f32 f913, f909, f912; +add.f32 f914, f912, f909; +fma.rn.f32 f915, f894, 0f3E9E377A, f865; +mul.f32 f916, f896, 0f3F4F1BBD; +sub.f32 f917, f915, f916; +sub.f32 f918, f845, f860; +mul.f32 f919, f918, 0f3F737871; +sub.f32 f920, f850, f855; +fma.rn.f32 f921, f920, 0f3F167918, f919; +add.f32 f922, f921, f917; +sub.f32 f923, f917, f921; +mul.f32 f924, f894, 0f3F4F1BBD; +sub.f32 f925, f865, f924; +fma.rn.f32 f926, f896, 0f3E9E377A, f925; +mul.f32 f927, f918, 0f3F167918; +mul.f32 f928, f920, 0f3F737871; +sub.f32 f929, f927, f928; +add.f32 f930, f929, f926; +sub.f32 f931, f926, f929; +add.f32 f932, f846, f861; +add.f32 f933, f841, f932; +add.f32 f934, f851, f856; +add.f32 f935, f934, f933; +add.f32 f936, f871, f886; +add.f32 f937, f866, f936; +add.f32 f938, f876, f881; +add.f32 f939, f938, f937; +fma.rn.f32 f940, f932, 0f3E9E377A, f841; +mul.f32 f941, f934, 0f3F4F1BBD; +sub.f32 f942, f940, f941; +sub.f32 f943, f871, f886; +mul.f32 f944, f943, 0f3F737871; +sub.f32 f945, 
f876, f881; +fma.rn.f32 f946, f945, 0f3F167918, f944; +sub.f32 f947, f942, f946; +add.f32 f948, f946, f942; +mul.f32 f949, f932, 0f3F4F1BBD; +sub.f32 f950, f841, f949; +fma.rn.f32 f951, f934, 0f3E9E377A, f950; +mul.f32 f952, f943, 0f3F167918; +mul.f32 f953, f945, 0f3F737871; +sub.f32 f954, f952, f953; +sub.f32 f955, f951, f954; +add.f32 f956, f954, f951; +fma.rn.f32 f957, f936, 0f3E9E377A, f866; +mul.f32 f958, f938, 0f3F4F1BBD; +sub.f32 f959, f957, f958; +sub.f32 f960, f846, f861; +mul.f32 f961, f960, 0f3F737871; +sub.f32 f962, f851, f856; +fma.rn.f32 f963, f962, 0f3F167918, f961; +add.f32 f964, f963, f959; +sub.f32 f965, f959, f963; +mul.f32 f966, f936, 0f3F4F1BBD; +sub.f32 f967, f866, f966; +fma.rn.f32 f968, f938, 0f3E9E377A, f967; +mul.f32 f969, f960, 0f3F167918; +mul.f32 f970, f962, 0f3F737871; +sub.f32 f971, f969, f970; +add.f32 f972, f971, f968; +sub.f32 f973, f968, f971; +add.f32 f974, f847, f862; +add.f32 f975, f842, f974; +add.f32 f976, f852, f857; +add.f32 f977, f976, f975; +add.f32 f978, f872, f887; +add.f32 f979, f867, f978; +add.f32 f980, f877, f882; +add.f32 f981, f980, f979; +fma.rn.f32 f982, f974, 0f3E9E377A, f842; +mul.f32 f983, f976, 0f3F4F1BBD; +sub.f32 f984, f982, f983; +sub.f32 f985, f872, f887; +mul.f32 f986, f985, 0f3F737871; +sub.f32 f987, f877, f882; +fma.rn.f32 f988, f987, 0f3F167918, f986; +sub.f32 f989, f984, f988; +add.f32 f990, f988, f984; +mul.f32 f991, f974, 0f3F4F1BBD; +sub.f32 f992, f842, f991; +fma.rn.f32 f993, f976, 0f3E9E377A, f992; +mul.f32 f994, f985, 0f3F167918; +mul.f32 f995, f987, 0f3F737871; +sub.f32 f996, f994, f995; +sub.f32 f997, f993, f996; +add.f32 f998, f996, f993; +fma.rn.f32 f999, f978, 0f3E9E377A, f867; +mul.f32 f1000, f980, 0f3F4F1BBD; +sub.f32 f1001, f999, f1000; +sub.f32 f1002, f847, f862; +mul.f32 f1003, f1002, 0f3F737871; +sub.f32 f1004, f852, f857; +fma.rn.f32 f1005, f1004, 0f3F167918, f1003; +add.f32 f1006, f1005, f1001; +sub.f32 f1007, f1001, f1005; +mul.f32 f1008, f978, 0f3F4F1BBD; +sub.f32 f1009, f867, 
f1008; +fma.rn.f32 f1010, f980, 0f3E9E377A, f1009; +mul.f32 f1011, f1002, 0f3F167918; +mul.f32 f1012, f1004, 0f3F737871; +sub.f32 f1013, f1011, f1012; +add.f32 f1014, f1013, f1010; +sub.f32 f1015, f1010, f1013; +add.f32 f1016, f848, f863; +add.f32 f1017, f843, f1016; +add.f32 f1018, f853, f858; +add.f32 f1019, f1018, f1017; +add.f32 f1020, f873, f888; +add.f32 f1021, f868, f1020; +add.f32 f1022, f878, f883; +add.f32 f1023, f1022, f1021; +fma.rn.f32 f1024, f1016, 0f3E9E377A, f843; +mul.f32 f1025, f1018, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f873, f888; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f878, f883; +fma.rn.f32 f1030, f1029, 0f3F167918, f1028; +sub.f32 f1031, f1026, f1030; +add.f32 f1032, f1030, f1026; +mul.f32 f1033, f1016, 0f3F4F1BBD; +sub.f32 f1034, f843, f1033; +fma.rn.f32 f1035, f1018, 0f3E9E377A, f1034; +mul.f32 f1036, f1027, 0f3F167918; +mul.f32 f1037, f1029, 0f3F737871; +sub.f32 f1038, f1036, f1037; +sub.f32 f1039, f1035, f1038; +add.f32 f1040, f1038, f1035; +fma.rn.f32 f1041, f1020, 0f3E9E377A, f868; +mul.f32 f1042, f1022, 0f3F4F1BBD; +sub.f32 f1043, f1041, f1042; +sub.f32 f1044, f848, f863; +mul.f32 f1045, f1044, 0f3F737871; +sub.f32 f1046, f853, f858; +fma.rn.f32 f1047, f1046, 0f3F167918, f1045; +add.f32 f1048, f1047, f1043; +sub.f32 f1049, f1043, f1047; +mul.f32 f1050, f1020, 0f3F4F1BBD; +sub.f32 f1051, f868, f1050; +fma.rn.f32 f1052, f1022, 0f3E9E377A, f1051; +mul.f32 f1053, f1044, 0f3F167918; +mul.f32 f1054, f1046, 0f3F737871; +sub.f32 f1055, f1053, f1054; +add.f32 f1056, f1055, f1052; +sub.f32 f1057, f1052, f1055; +add.f32 f1058, f849, f864; +add.f32 f1059, f844, f1058; +add.f32 f1060, f854, f859; +add.f32 f1061, f1060, f1059; +add.f32 f1062, f874, f889; +add.f32 f1063, f869, f1062; +add.f32 f1064, f879, f884; +add.f32 f1065, f1064, f1063; +fma.rn.f32 f1066, f1058, 0f3E9E377A, f844; +mul.f32 f1067, f1060, 0f3F4F1BBD; +sub.f32 f1068, f1066, f1067; +sub.f32 f1069, f874, f889; +mul.f32 f1070, f1069, 0f3F737871; +sub.f32 
f1071, f879, f884; +fma.rn.f32 f1072, f1071, 0f3F167918, f1070; +sub.f32 f1073, f1068, f1072; +add.f32 f1074, f1072, f1068; +mul.f32 f1075, f1058, 0f3F4F1BBD; +sub.f32 f1076, f844, f1075; +fma.rn.f32 f1077, f1060, 0f3E9E377A, f1076; +mul.f32 f1078, f1069, 0f3F167918; +mul.f32 f1079, f1071, 0f3F737871; +sub.f32 f1080, f1078, f1079; +sub.f32 f1081, f1077, f1080; +add.f32 f1082, f1080, f1077; +fma.rn.f32 f1083, f1062, 0f3E9E377A, f869; +mul.f32 f1084, f1064, 0f3F4F1BBD; +sub.f32 f1085, f1083, f1084; +sub.f32 f1086, f849, f864; +mul.f32 f1087, f1086, 0f3F737871; +sub.f32 f1088, f854, f859; +fma.rn.f32 f1089, f1088, 0f3F167918, f1087; +add.f32 f1090, f1089, f1085; +sub.f32 f1091, f1085, f1089; +mul.f32 f1092, f1062, 0f3F4F1BBD; +sub.f32 f1093, f869, f1092; +fma.rn.f32 f1094, f1064, 0f3E9E377A, f1093; +mul.f32 f1095, f1086, 0f3F167918; +mul.f32 f1096, f1088, 0f3F737871; +sub.f32 f1097, f1095, f1096; +add.f32 f1098, f1097, f1094; +sub.f32 f1099, f1094, f1097; +mul.f32 f1100, f947, 0f3F77F511; +mul.f32 f1101, f964, 0f3E7EA890; +sub.f32 f1102, f1100, f1101; +mul.f32 f1103, f964, 0f3F77F511; +fma.rn.f32 f1104, f947, 0f3E7EA890, f1103; +mul.f32 f1105, f989, 0f3F6055A2; +mul.f32 f1106, f1006, 0f3EF6A86B; +sub.f32 f1107, f1105, f1106; +mul.f32 f1108, f1006, 0f3F6055A2; +fma.rn.f32 f1109, f989, 0f3EF6A86B, f1108; +mul.f32 f1110, f1031, 0f3F3A9DB0; +mul.f32 f1111, f1048, 0f3F2F3E7B; +sub.f32 f1112, f1110, f1111; +mul.f32 f1113, f1048, 0f3F3A9DB0; +fma.rn.f32 f1114, f1031, 0f3F2F3E7B, f1113; +mul.f32 f1115, f1073, 0f3F092BF2; +mul.f32 f1116, f1090, 0f3F5825E0; +sub.f32 f1117, f1115, f1116; +mul.f32 f1118, f1090, 0f3F092BF2; +fma.rn.f32 f1119, f1073, 0f3F5825E0, f1118; +mul.f32 f1120, f955, 0f3F6055A2; +mul.f32 f1121, f972, 0f3EF6A86B; +sub.f32 f1122, f1120, f1121; +mul.f32 f1123, f972, 0f3F6055A2; +fma.rn.f32 f1124, f955, 0f3EF6A86B, f1123; +mul.f32 f1125, f997, 0f3F092BF2; +mul.f32 f1126, f1014, 0f3F5825E0; +sub.f32 f1127, f1125, f1126; +mul.f32 f1128, f1014, 0f3F092BF2; 
+fma.rn.f32 f1129, f997, 0f3F5825E0, f1128; +mul.f32 f1130, f1039, 0f3D809851; +mul.f32 f1131, f1056, 0f3F7F7EAE; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f1056, 0f3D809851; +fma.rn.f32 f1134, f1039, 0f3F7F7EAE, f1133; +mul.f32 f1135, f1081, 0fBED9FFBE; +mul.f32 f1136, f1098, 0f3F67A2BF; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1098, 0fBED9FFBE; +fma.rn.f32 f1139, f1081, 0f3F67A2BF, f1138; +mul.f32 f1140, f956, 0f3F3A9DB0; +mul.f32 f1141, f973, 0f3F2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f973, 0f3F3A9DB0; +fma.rn.f32 f1144, f956, 0f3F2F3E7B, f1143; +mul.f32 f1145, f998, 0f3D809851; +mul.f32 f1146, f1015, 0f3F7F7EAE; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1015, 0f3D809851; +fma.rn.f32 f1149, f998, 0f3F7F7EAE, f1148; +mul.f32 f1150, f1040, 0fBF232E38; +mul.f32 f1151, f1057, 0f3F45405B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f1057, 0fBF232E38; +fma.rn.f32 f1154, f1040, 0f3F45405B, f1153; +mul.f32 f1155, f1082, 0fBF7DFB3B; +mul.f32 f1156, f1099, 0f3E00575B; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1099, 0fBF7DFB3B; +fma.rn.f32 f1159, f1082, 0f3E00575B, f1158; +mul.f32 f1160, f948, 0f3F092BF2; +mul.f32 f1161, f965, 0f3F5825E0; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f965, 0f3F092BF2; +fma.rn.f32 f1164, f948, 0f3F5825E0, f1163; +mul.f32 f1165, f990, 0fBED9FFBE; +mul.f32 f1166, f1007, 0f3F67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1007, 0fBED9FFBE; +fma.rn.f32 f1169, f990, 0f3F67A2BF, f1168; +mul.f32 f1170, f1032, 0fBF7DFB3B; +mul.f32 f1171, f1049, 0f3E00575B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f1049, 0fBF7DFB3B; +fma.rn.f32 f1174, f1032, 0f3E00575B, f1173; +mul.f32 f1175, f1074, 0fBF232E38; +mul.f32 f1176, f1091, 0fBF45405B; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1091, 0fBF232E38; +fma.rn.f32 f1179, f1074, 0fBF45405B, f1178; +add.f32 f1180, f935, f1061; +add.f32 f1181, f893, f1180; +add.f32 f1182, f977, f1019; +add.f32 f1183, f1182, f1181; +add.f32 f1184, f939, f1065; +add.f32 f1185, f897, 
f1184; +add.f32 f1186, f981, f1023; +add.f32 f1187, f1186, f1185; +fma.rn.f32 f1188, f1180, 0f3E9E377A, f893; +mul.f32 f1189, f1182, 0f3F4F1BBD; +sub.f32 f1190, f1188, f1189; +sub.f32 f1191, f939, f1065; +mul.f32 f1192, f1191, 0f3F737871; +sub.f32 f1193, f981, f1023; +fma.rn.f32 f1194, f1193, 0f3F167918, f1192; +sub.f32 f1195, f1190, f1194; +add.f32 f1196, f1194, f1190; +mul.f32 f1197, f1180, 0f3F4F1BBD; +sub.f32 f1198, f893, f1197; +fma.rn.f32 f1199, f1182, 0f3E9E377A, f1198; +mul.f32 f1200, f1191, 0f3F167918; +mul.f32 f1201, f1193, 0f3F737871; +sub.f32 f1202, f1200, f1201; +sub.f32 f1203, f1199, f1202; +add.f32 f1204, f1202, f1199; +fma.rn.f32 f1205, f1184, 0f3E9E377A, f897; +mul.f32 f1206, f1186, 0f3F4F1BBD; +sub.f32 f1207, f1205, f1206; +sub.f32 f1208, f935, f1061; +mul.f32 f1209, f1208, 0f3F737871; +sub.f32 f1210, f977, f1019; +fma.rn.f32 f1211, f1210, 0f3F167918, f1209; +add.f32 f1212, f1211, f1207; +sub.f32 f1213, f1207, f1211; +mul.f32 f1214, f1184, 0f3F4F1BBD; +sub.f32 f1215, f897, f1214; +fma.rn.f32 f1216, f1186, 0f3E9E377A, f1215; +mul.f32 f1217, f1208, 0f3F167918; +mul.f32 f1218, f1210, 0f3F737871; +sub.f32 f1219, f1217, f1218; +add.f32 f1220, f1219, f1216; +sub.f32 f1221, f1216, f1219; +add.f32 f1222, f1102, f1117; +add.f32 f1223, f905, f1222; +add.f32 f1224, f1107, f1112; +add.f32 f1225, f1224, f1223; +add.f32 f1226, f1104, f1119; +add.f32 f1227, f922, f1226; +add.f32 f1228, f1109, f1114; +add.f32 f1229, f1228, f1227; +fma.rn.f32 f1230, f1222, 0f3E9E377A, f905; +mul.f32 f1231, f1224, 0f3F4F1BBD; +sub.f32 f1232, f1230, f1231; +sub.f32 f1233, f1104, f1119; +mul.f32 f1234, f1233, 0f3F737871; +sub.f32 f1235, f1109, f1114; +fma.rn.f32 f1236, f1235, 0f3F167918, f1234; +sub.f32 f1237, f1232, f1236; +add.f32 f1238, f1236, f1232; +mul.f32 f1239, f1222, 0f3F4F1BBD; +sub.f32 f1240, f905, f1239; +fma.rn.f32 f1241, f1224, 0f3E9E377A, f1240; +mul.f32 f1242, f1233, 0f3F167918; +mul.f32 f1243, f1235, 0f3F737871; +sub.f32 f1244, f1242, f1243; +sub.f32 f1245, f1241, 
f1244; +add.f32 f1246, f1244, f1241; +fma.rn.f32 f1247, f1226, 0f3E9E377A, f922; +mul.f32 f1248, f1228, 0f3F4F1BBD; +sub.f32 f1249, f1247, f1248; +sub.f32 f1250, f1102, f1117; +mul.f32 f1251, f1250, 0f3F737871; +sub.f32 f1252, f1107, f1112; +fma.rn.f32 f1253, f1252, 0f3F167918, f1251; +add.f32 f1254, f1253, f1249; +sub.f32 f1255, f1249, f1253; +mul.f32 f1256, f1226, 0f3F4F1BBD; +sub.f32 f1257, f922, f1256; +fma.rn.f32 f1258, f1228, 0f3E9E377A, f1257; +mul.f32 f1259, f1250, 0f3F167918; +mul.f32 f1260, f1252, 0f3F737871; +sub.f32 f1261, f1259, f1260; +add.f32 f1262, f1261, f1258; +sub.f32 f1263, f1258, f1261; +add.f32 f1264, f1122, f1137; +add.f32 f1265, f913, f1264; +add.f32 f1266, f1127, f1132; +add.f32 f1267, f1266, f1265; +add.f32 f1268, f1124, f1139; +add.f32 f1269, f930, f1268; +add.f32 f1270, f1129, f1134; +add.f32 f1271, f1270, f1269; +fma.rn.f32 f1272, f1264, 0f3E9E377A, f913; +mul.f32 f1273, f1266, 0f3F4F1BBD; +sub.f32 f1274, f1272, f1273; +sub.f32 f1275, f1124, f1139; +mul.f32 f1276, f1275, 0f3F737871; +sub.f32 f1277, f1129, f1134; +fma.rn.f32 f1278, f1277, 0f3F167918, f1276; +sub.f32 f1279, f1274, f1278; +add.f32 f1280, f1278, f1274; +mul.f32 f1281, f1264, 0f3F4F1BBD; +sub.f32 f1282, f913, f1281; +fma.rn.f32 f1283, f1266, 0f3E9E377A, f1282; +mul.f32 f1284, f1275, 0f3F167918; +mul.f32 f1285, f1277, 0f3F737871; +sub.f32 f1286, f1284, f1285; +sub.f32 f1287, f1283, f1286; +add.f32 f1288, f1286, f1283; +fma.rn.f32 f1289, f1268, 0f3E9E377A, f930; +mul.f32 f1290, f1270, 0f3F4F1BBD; +sub.f32 f1291, f1289, f1290; +sub.f32 f1292, f1122, f1137; +mul.f32 f1293, f1292, 0f3F737871; +sub.f32 f1294, f1127, f1132; +fma.rn.f32 f1295, f1294, 0f3F167918, f1293; +add.f32 f1296, f1295, f1291; +sub.f32 f1297, f1291, f1295; +mul.f32 f1298, f1268, 0f3F4F1BBD; +sub.f32 f1299, f930, f1298; +fma.rn.f32 f1300, f1270, 0f3E9E377A, f1299; +mul.f32 f1301, f1292, 0f3F167918; +mul.f32 f1302, f1294, 0f3F737871; +sub.f32 f1303, f1301, f1302; +add.f32 f1304, f1303, f1300; +sub.f32 f1305, 
f1300, f1303; +add.f32 f1306, f1142, f1157; +add.f32 f1307, f914, f1306; +add.f32 f1308, f1147, f1152; +add.f32 f1309, f1308, f1307; +add.f32 f1310, f1144, f1159; +add.f32 f1311, f931, f1310; +add.f32 f1312, f1149, f1154; +add.f32 f1313, f1312, f1311; +fma.rn.f32 f1314, f1306, 0f3E9E377A, f914; +mul.f32 f1315, f1308, 0f3F4F1BBD; +sub.f32 f1316, f1314, f1315; +sub.f32 f1317, f1144, f1159; +mul.f32 f1318, f1317, 0f3F737871; +sub.f32 f1319, f1149, f1154; +fma.rn.f32 f1320, f1319, 0f3F167918, f1318; +sub.f32 f1321, f1316, f1320; +add.f32 f1322, f1320, f1316; +mul.f32 f1323, f1306, 0f3F4F1BBD; +sub.f32 f1324, f914, f1323; +fma.rn.f32 f1325, f1308, 0f3E9E377A, f1324; +mul.f32 f1326, f1317, 0f3F167918; +mul.f32 f1327, f1319, 0f3F737871; +sub.f32 f1328, f1326, f1327; +sub.f32 f1329, f1325, f1328; +add.f32 f1330, f1328, f1325; +fma.rn.f32 f1331, f1310, 0f3E9E377A, f931; +mul.f32 f1332, f1312, 0f3F4F1BBD; +sub.f32 f1333, f1331, f1332; +sub.f32 f1334, f1142, f1157; +mul.f32 f1335, f1334, 0f3F737871; +sub.f32 f1336, f1147, f1152; +fma.rn.f32 f1337, f1336, 0f3F167918, f1335; +add.f32 f1338, f1337, f1333; +sub.f32 f1339, f1333, f1337; +mul.f32 f1340, f1310, 0f3F4F1BBD; +sub.f32 f1341, f931, f1340; +fma.rn.f32 f1342, f1312, 0f3E9E377A, f1341; +mul.f32 f1343, f1334, 0f3F167918; +mul.f32 f1344, f1336, 0f3F737871; +sub.f32 f1345, f1343, f1344; +add.f32 f1346, f1345, f1342; +sub.f32 f1347, f1342, f1345; +add.f32 f1348, f1162, f1177; +add.f32 f1349, f906, f1348; +add.f32 f1350, f1167, f1172; +add.f32 f1351, f1350, f1349; +add.f32 f1352, f1164, f1179; +add.f32 f1353, f923, f1352; +add.f32 f1354, f1169, f1174; +add.f32 f1355, f1354, f1353; +fma.rn.f32 f1356, f1348, 0f3E9E377A, f906; +mul.f32 f1357, f1350, 0f3F4F1BBD; +sub.f32 f1358, f1356, f1357; +sub.f32 f1359, f1164, f1179; +mul.f32 f1360, f1359, 0f3F737871; +sub.f32 f1361, f1169, f1174; +fma.rn.f32 f1362, f1361, 0f3F167918, f1360; +sub.f32 f1363, f1358, f1362; +add.f32 f1364, f1362, f1358; +mul.f32 f1365, f1348, 0f3F4F1BBD; +sub.f32 
f1366, f906, f1365; +fma.rn.f32 f1367, f1350, 0f3E9E377A, f1366; +mul.f32 f1368, f1359, 0f3F167918; +mul.f32 f1369, f1361, 0f3F737871; +sub.f32 f1370, f1368, f1369; +sub.f32 f1371, f1367, f1370; +add.f32 f1372, f1370, f1367; +fma.rn.f32 f1373, f1352, 0f3E9E377A, f923; +mul.f32 f1374, f1354, 0f3F4F1BBD; +sub.f32 f1375, f1373, f1374; +sub.f32 f1376, f1162, f1177; +mul.f32 f1377, f1376, 0f3F737871; +sub.f32 f1378, f1167, f1172; +fma.rn.f32 f1379, f1378, 0f3F167918, f1377; +add.f32 f1380, f1379, f1375; +sub.f32 f1381, f1375, f1379; +mul.f32 f1382, f1352, 0f3F4F1BBD; +sub.f32 f1383, f923, f1382; +fma.rn.f32 f1384, f1354, 0f3E9E377A, f1383; +mul.f32 f1385, f1376, 0f3F167918; +mul.f32 f1386, f1378, 0f3F737871; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1387, f1384; +sub.f32 f1389, f1384, f1387; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %52; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1390, f1391}, [rd11]; +mul.f32 f1394, f1229, f1391; +fma.rn.f32 f1395, f1390, f1225, f1394; +mul.f32 f1396, f1225, f1391; +mul.f32 f1397, f1390, f1229; +sub.f32 f1398, f1397, f1396; +mul.f32 f1399, f1390, f1390; +mul.f32 f1400, f1391, f1391; +sub.f32 f1401, f1399, f1400; +mul.f32 f1402, f1391, f1390; +fma.rn.f32 f1403, f1391, f1390, f1402; +mul.f32 f1404, f1271, f1403; +fma.rn.f32 f1405, f1401, f1267, f1404; +mul.f32 f1406, f1267, f1403; +mul.f32 f1407, f1401, f1271; +sub.f32 f1408, f1407, f1406; +mul.f32 f1409, f1390, f1401; +mul.f32 f1410, f1391, f1403; +sub.f32 f1411, f1409, f1410; +mul.f32 f1412, f1390, f1403; +fma.rn.f32 f1413, f1391, f1401, f1412; +mul.f32 f1414, f1313, f1413; +fma.rn.f32 f1415, f1411, f1309, f1414; +mul.f32 f1416, f1309, f1413; +mul.f32 f1417, f1411, f1313; +sub.f32 f1418, f1417, f1416; +mul.f32 f1419, f1390, f1411; +mul.f32 f1420, f1391, f1413; +sub.f32 f1421, f1419, f1420; +mul.f32 f1422, f1390, f1413; +fma.rn.f32 f1423, f1391, 
f1411, f1422; +mul.f32 f1424, f1355, f1423; +fma.rn.f32 f1425, f1421, f1351, f1424; +mul.f32 f1426, f1351, f1423; +mul.f32 f1427, f1421, f1355; +sub.f32 f1428, f1427, f1426; +mul.f32 f1429, f1390, f1421; +mul.f32 f1430, f1391, f1423; +sub.f32 f1431, f1429, f1430; +mul.f32 f1432, f1390, f1423; +fma.rn.f32 f1433, f1391, f1421, f1432; +mul.f32 f1434, f1212, f1433; +fma.rn.f32 f1435, f1431, f1195, f1434; +mul.f32 f1436, f1195, f1433; +mul.f32 f1437, f1431, f1212; +sub.f32 f1438, f1437, f1436; +mul.f32 f1439, f1390, f1431; +mul.f32 f1440, f1391, f1433; +sub.f32 f1441, f1439, f1440; +mul.f32 f1442, f1390, f1433; +fma.rn.f32 f1443, f1391, f1431, f1442; +mul.f32 f1444, f1254, f1443; +fma.rn.f32 f1445, f1441, f1237, f1444; +mul.f32 f1446, f1237, f1443; +mul.f32 f1447, f1441, f1254; +sub.f32 f1448, f1447, f1446; +mul.f32 f1449, f1390, f1441; +mul.f32 f1450, f1391, f1443; +sub.f32 f1451, f1449, f1450; +mul.f32 f1452, f1390, f1443; +fma.rn.f32 f1453, f1391, f1441, f1452; +mul.f32 f1454, f1296, f1453; +fma.rn.f32 f1455, f1451, f1279, f1454; +mul.f32 f1456, f1279, f1453; +mul.f32 f1457, f1451, f1296; +sub.f32 f1458, f1457, f1456; +mul.f32 f1459, f1390, f1451; +mul.f32 f1460, f1391, f1453; +sub.f32 f1461, f1459, f1460; +mul.f32 f1462, f1390, f1453; +fma.rn.f32 f1463, f1391, f1451, f1462; +mul.f32 f1464, f1338, f1463; +fma.rn.f32 f1465, f1461, f1321, f1464; +mul.f32 f1466, f1321, f1463; +mul.f32 f1467, f1461, f1338; +sub.f32 f1468, f1467, f1466; +mul.f32 f1469, f1390, f1461; +mul.f32 f1470, f1391, f1463; +sub.f32 f1471, f1469, f1470; +mul.f32 f1472, f1390, f1463; +fma.rn.f32 f1473, f1391, f1461, f1472; +mul.f32 f1474, f1380, f1473; +fma.rn.f32 f1475, f1471, f1363, f1474; +mul.f32 f1476, f1363, f1473; +mul.f32 f1477, f1471, f1380; +sub.f32 f1478, f1477, f1476; +mul.f32 f1479, f1390, f1471; +mul.f32 f1480, f1391, f1473; +sub.f32 f1481, f1479, f1480; +mul.f32 f1482, f1390, f1473; +fma.rn.f32 f1483, f1391, f1471, f1482; +mul.f32 f1484, f1220, f1483; +fma.rn.f32 f1485, f1481, f1203, 
f1484; +mul.f32 f1486, f1203, f1483; +mul.f32 f1487, f1481, f1220; +sub.f32 f1488, f1487, f1486; +mul.f32 f1489, f1390, f1481; +mul.f32 f1490, f1391, f1483; +sub.f32 f1491, f1489, f1490; +mul.f32 f1492, f1390, f1483; +fma.rn.f32 f1493, f1391, f1481, f1492; +mul.f32 f1494, f1262, f1493; +fma.rn.f32 f1495, f1491, f1245, f1494; +mul.f32 f1496, f1245, f1493; +mul.f32 f1497, f1491, f1262; +sub.f32 f1498, f1497, f1496; +mul.f32 f1499, f1390, f1491; +mul.f32 f1500, f1391, f1493; +sub.f32 f1501, f1499, f1500; +mul.f32 f1502, f1390, f1493; +fma.rn.f32 f1503, f1391, f1491, f1502; +mul.f32 f1504, f1304, f1503; +fma.rn.f32 f1505, f1501, f1287, f1504; +mul.f32 f1506, f1287, f1503; +mul.f32 f1507, f1501, f1304; +sub.f32 f1508, f1507, f1506; +mul.f32 f1509, f1390, f1501; +mul.f32 f1510, f1391, f1503; +sub.f32 f1511, f1509, f1510; +mul.f32 f1512, f1390, f1503; +fma.rn.f32 f1513, f1391, f1501, f1512; +mul.f32 f1514, f1346, f1513; +fma.rn.f32 f1515, f1511, f1329, f1514; +mul.f32 f1516, f1329, f1513; +mul.f32 f1517, f1511, f1346; +sub.f32 f1518, f1517, f1516; +mul.f32 f1519, f1390, f1511; +mul.f32 f1520, f1391, f1513; +sub.f32 f1521, f1519, f1520; +mul.f32 f1522, f1390, f1513; +fma.rn.f32 f1523, f1391, f1511, f1522; +mul.f32 f1524, f1388, f1523; +fma.rn.f32 f1525, f1521, f1371, f1524; +mul.f32 f1526, f1371, f1523; +mul.f32 f1527, f1521, f1388; +sub.f32 f1528, f1527, f1526; +mul.f32 f1529, f1390, f1521; +mul.f32 f1530, f1391, f1523; +sub.f32 f1531, f1529, f1530; +mul.f32 f1532, f1390, f1523; +fma.rn.f32 f1533, f1391, f1521, f1532; +mul.f32 f1534, f1221, f1533; +fma.rn.f32 f1535, f1531, f1204, f1534; +mul.f32 f1536, f1204, f1533; +mul.f32 f1537, f1531, f1221; +sub.f32 f1538, f1537, f1536; +mul.f32 f1539, f1390, f1531; +mul.f32 f1540, f1391, f1533; +sub.f32 f1541, f1539, f1540; +mul.f32 f1542, f1390, f1533; +fma.rn.f32 f1543, f1391, f1531, f1542; +mul.f32 f1544, f1263, f1543; +fma.rn.f32 f1545, f1541, f1246, f1544; +mul.f32 f1546, f1246, f1543; +mul.f32 f1547, f1541, f1263; +sub.f32 
f1548, f1547, f1546; +mul.f32 f1549, f1390, f1541; +mul.f32 f1550, f1391, f1543; +sub.f32 f1551, f1549, f1550; +mul.f32 f1552, f1390, f1543; +fma.rn.f32 f1553, f1391, f1541, f1552; +mul.f32 f1554, f1305, f1553; +fma.rn.f32 f1555, f1551, f1288, f1554; +mul.f32 f1556, f1288, f1553; +mul.f32 f1557, f1551, f1305; +sub.f32 f1558, f1557, f1556; +mul.f32 f1559, f1390, f1551; +mul.f32 f1560, f1391, f1553; +sub.f32 f1561, f1559, f1560; +mul.f32 f1562, f1390, f1553; +fma.rn.f32 f1563, f1391, f1551, f1562; +mul.f32 f1564, f1347, f1563; +fma.rn.f32 f1565, f1561, f1330, f1564; +mul.f32 f1566, f1330, f1563; +mul.f32 f1567, f1561, f1347; +sub.f32 f1568, f1567, f1566; +mul.f32 f1569, f1390, f1561; +mul.f32 f1570, f1391, f1563; +sub.f32 f1571, f1569, f1570; +mul.f32 f1572, f1390, f1563; +fma.rn.f32 f1573, f1391, f1561, f1572; +mul.f32 f1574, f1389, f1573; +fma.rn.f32 f1575, f1571, f1372, f1574; +mul.f32 f1576, f1372, f1573; +mul.f32 f1577, f1571, f1389; +sub.f32 f1578, f1577, f1576; +mul.f32 f1579, f1390, f1571; +mul.f32 f1580, f1391, f1573; +sub.f32 f1581, f1579, f1580; +mul.f32 f1582, f1390, f1573; +fma.rn.f32 f1583, f1391, f1571, f1582; +mul.f32 f1584, f1213, f1583; +fma.rn.f32 f1585, f1581, f1196, f1584; +mul.f32 f1586, f1196, f1583; +mul.f32 f1587, f1581, f1213; +sub.f32 f1588, f1587, f1586; +mul.f32 f1589, f1390, f1581; +mul.f32 f1590, f1391, f1583; +sub.f32 f1591, f1589, f1590; +mul.f32 f1592, f1390, f1583; +fma.rn.f32 f1593, f1391, f1581, f1592; +mul.f32 f1594, f1255, f1593; +fma.rn.f32 f1595, f1591, f1238, f1594; +mul.f32 f1596, f1238, f1593; +mul.f32 f1597, f1591, f1255; +sub.f32 f1598, f1597, f1596; +mul.f32 f1599, f1390, f1591; +mul.f32 f1600, f1391, f1593; +sub.f32 f1601, f1599, f1600; +mul.f32 f1602, f1390, f1593; +fma.rn.f32 f1603, f1391, f1591, f1602; +mul.f32 f1604, f1297, f1603; +fma.rn.f32 f1605, f1601, f1280, f1604; +mul.f32 f1606, f1280, f1603; +mul.f32 f1607, f1601, f1297; +sub.f32 f1608, f1607, f1606; +mul.f32 f1609, f1390, f1601; +mul.f32 f1610, f1391, 
f1603; +sub.f32 f1611, f1609, f1610; +mul.f32 f1612, f1390, f1603; +fma.rn.f32 f1613, f1391, f1601, f1612; +mul.f32 f1614, f1339, f1613; +fma.rn.f32 f1615, f1611, f1322, f1614; +mul.f32 f1616, f1322, f1613; +mul.f32 f1617, f1611, f1339; +sub.f32 f1618, f1617, f1616; +mul.f32 f1619, f1390, f1611; +mul.f32 f1620, f1391, f1613; +sub.f32 f1621, f1619, f1620; +mul.f32 f1622, f1390, f1613; +fma.rn.f32 f1623, f1391, f1611, f1622; +mul.f32 f1624, f1381, f1623; +fma.rn.f32 f1625, f1621, f1364, f1624; +mul.f32 f1626, f1364, f1623; +mul.f32 f1627, f1621, f1381; +sub.f32 f1628, f1627, f1626; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2500, r15; +st.shared.f32 [r16], f1183; +st.shared.f32 [r16+100], f1395; +st.shared.f32 [r16+200], f1405; +st.shared.f32 [r16+300], f1415; +st.shared.f32 [r16+400], f1425; +st.shared.f32 [r16+500], f1435; +st.shared.f32 [r16+600], f1445; +st.shared.f32 [r16+700], f1455; +st.shared.f32 [r16+800], f1465; +st.shared.f32 [r16+900], f1475; +st.shared.f32 [r16+1000], f1485; +st.shared.f32 [r16+1100], f1495; +st.shared.f32 [r16+1200], f1505; +st.shared.f32 [r16+1300], f1515; +st.shared.f32 [r16+1400], f1525; +st.shared.f32 [r16+1500], f1535; +st.shared.f32 [r16+1600], f1545; +st.shared.f32 [r16+1700], f1555; +st.shared.f32 [r16+1800], f1565; +st.shared.f32 [r16+1900], f1575; +st.shared.f32 [r16+2000], f1585; +st.shared.f32 [r16+2100], f1595; +st.shared.f32 [r16+2200], f1605; +st.shared.f32 [r16+2300], f1615; +st.shared.f32 [r16+2400], f1625; +barrier.sync 0; +ld.shared.f32 f1629, [r10]; +ld.shared.f32 f1630, [r10+2500]; +ld.shared.f32 f1631, [r10+5000]; +ld.shared.f32 f1632, [r10+7500]; +ld.shared.f32 f1633, [r10+10000]; +ld.shared.f32 f1634, [r10+12500]; +ld.shared.f32 f1635, [r10+15000]; +ld.shared.f32 f1636, [r10+17500]; +ld.shared.f32 f1637, [r10+20000]; +ld.shared.f32 f1638, [r10+22500]; +ld.shared.f32 f1639, [r10+25000]; +ld.shared.f32 f1640, [r10+27500]; +ld.shared.f32 f1641, [r10+30000]; +ld.shared.f32 
f1642, [r10+32500]; +ld.shared.f32 f1643, [r10+35000]; +ld.shared.f32 f1644, [r10+37500]; +ld.shared.f32 f1645, [r10+40000]; +ld.shared.f32 f1646, [r10+42500]; +ld.shared.f32 f1647, [r10+45000]; +ld.shared.f32 f1648, [r10+47500]; +ld.shared.f32 f1649, [r10+50000]; +ld.shared.f32 f1650, [r10+52500]; +ld.shared.f32 f1651, [r10+55000]; +ld.shared.f32 f1652, [r10+57500]; +ld.shared.f32 f1653, [r10+60000]; +barrier.sync 0; +st.shared.f32 [r16], f1187; +st.shared.f32 [r16+100], f1398; +st.shared.f32 [r16+200], f1408; +st.shared.f32 [r16+300], f1418; +st.shared.f32 [r16+400], f1428; +st.shared.f32 [r16+500], f1438; +st.shared.f32 [r16+600], f1448; +st.shared.f32 [r16+700], f1458; +st.shared.f32 [r16+800], f1468; +st.shared.f32 [r16+900], f1478; +st.shared.f32 [r16+1000], f1488; +st.shared.f32 [r16+1100], f1498; +st.shared.f32 [r16+1200], f1508; +st.shared.f32 [r16+1300], f1518; +st.shared.f32 [r16+1400], f1528; +st.shared.f32 [r16+1500], f1538; +st.shared.f32 [r16+1600], f1548; +st.shared.f32 [r16+1700], f1558; +st.shared.f32 [r16+1800], f1568; +st.shared.f32 [r16+1900], f1578; +st.shared.f32 [r16+2000], f1588; +st.shared.f32 [r16+2100], f1598; +st.shared.f32 [r16+2200], f1608; +st.shared.f32 [r16+2300], f1618; +st.shared.f32 [r16+2400], f1628; +barrier.sync 0; +ld.shared.f32 f1654, [r10]; +ld.shared.f32 f1655, [r10+2500]; +ld.shared.f32 f1656, [r10+5000]; +ld.shared.f32 f1657, [r10+7500]; +ld.shared.f32 f1658, [r10+10000]; +ld.shared.f32 f1659, [r10+12500]; +ld.shared.f32 f1660, [r10+15000]; +ld.shared.f32 f1661, [r10+17500]; +ld.shared.f32 f1662, [r10+20000]; +ld.shared.f32 f1663, [r10+22500]; +ld.shared.f32 f1664, [r10+25000]; +ld.shared.f32 f1665, [r10+27500]; +ld.shared.f32 f1666, [r10+30000]; +ld.shared.f32 f1667, [r10+32500]; +ld.shared.f32 f1668, [r10+35000]; +ld.shared.f32 f1669, [r10+37500]; +ld.shared.f32 f1670, [r10+40000]; +ld.shared.f32 f1671, [r10+42500]; +ld.shared.f32 f1672, [r10+45000]; +ld.shared.f32 f1673, [r10+47500]; +ld.shared.f32 f1674, 
[r10+50000]; +ld.shared.f32 f1675, [r10+52500]; +ld.shared.f32 f1676, [r10+55000]; +ld.shared.f32 f1677, [r10+57500]; +ld.shared.f32 f1678, [r10+60000]; +add.f32 f1679, f1634, f1649; +add.f32 f1680, f1629, f1679; +add.f32 f1681, f1639, f1644; +add.f32 f1682, f1681, f1680; +add.f32 f1683, f1659, f1674; +add.f32 f1684, f1654, f1683; +add.f32 f1685, f1664, f1669; +add.f32 f1686, f1685, f1684; +fma.rn.f32 f1687, f1679, 0f3E9E377A, f1629; +mul.f32 f1688, f1681, 0f3F4F1BBD; +sub.f32 f1689, f1687, f1688; +sub.f32 f1690, f1659, f1674; +mul.f32 f1691, f1690, 0f3F737871; +sub.f32 f1692, f1664, f1669; +fma.rn.f32 f1693, f1692, 0f3F167918, f1691; +sub.f32 f1694, f1689, f1693; +add.f32 f1695, f1693, f1689; +mul.f32 f1696, f1679, 0f3F4F1BBD; +sub.f32 f1697, f1629, f1696; +fma.rn.f32 f1698, f1681, 0f3E9E377A, f1697; +mul.f32 f1699, f1690, 0f3F167918; +mul.f32 f1700, f1692, 0f3F737871; +sub.f32 f1701, f1699, f1700; +sub.f32 f1702, f1698, f1701; +add.f32 f1703, f1701, f1698; +fma.rn.f32 f1704, f1683, 0f3E9E377A, f1654; +mul.f32 f1705, f1685, 0f3F4F1BBD; +sub.f32 f1706, f1704, f1705; +sub.f32 f1707, f1634, f1649; +mul.f32 f1708, f1707, 0f3F737871; +sub.f32 f1709, f1639, f1644; +fma.rn.f32 f1710, f1709, 0f3F167918, f1708; +add.f32 f1711, f1710, f1706; +sub.f32 f1712, f1706, f1710; +mul.f32 f1713, f1683, 0f3F4F1BBD; +sub.f32 f1714, f1654, f1713; +fma.rn.f32 f1715, f1685, 0f3E9E377A, f1714; +mul.f32 f1716, f1707, 0f3F167918; +mul.f32 f1717, f1709, 0f3F737871; +sub.f32 f1718, f1716, f1717; +add.f32 f1719, f1718, f1715; +sub.f32 f1720, f1715, f1718; +add.f32 f1721, f1635, f1650; +add.f32 f1722, f1630, f1721; +add.f32 f1723, f1640, f1645; +add.f32 f1724, f1723, f1722; +add.f32 f1725, f1660, f1675; +add.f32 f1726, f1655, f1725; +add.f32 f1727, f1665, f1670; +add.f32 f1728, f1727, f1726; +fma.rn.f32 f1729, f1721, 0f3E9E377A, f1630; +mul.f32 f1730, f1723, 0f3F4F1BBD; +sub.f32 f1731, f1729, f1730; +sub.f32 f1732, f1660, f1675; +mul.f32 f1733, f1732, 0f3F737871; +sub.f32 f1734, f1665, f1670; 
+fma.rn.f32 f1735, f1734, 0f3F167918, f1733; +sub.f32 f1736, f1731, f1735; +add.f32 f1737, f1735, f1731; +mul.f32 f1738, f1721, 0f3F4F1BBD; +sub.f32 f1739, f1630, f1738; +fma.rn.f32 f1740, f1723, 0f3E9E377A, f1739; +mul.f32 f1741, f1732, 0f3F167918; +mul.f32 f1742, f1734, 0f3F737871; +sub.f32 f1743, f1741, f1742; +sub.f32 f1744, f1740, f1743; +add.f32 f1745, f1743, f1740; +fma.rn.f32 f1746, f1725, 0f3E9E377A, f1655; +mul.f32 f1747, f1727, 0f3F4F1BBD; +sub.f32 f1748, f1746, f1747; +sub.f32 f1749, f1635, f1650; +mul.f32 f1750, f1749, 0f3F737871; +sub.f32 f1751, f1640, f1645; +fma.rn.f32 f1752, f1751, 0f3F167918, f1750; +add.f32 f1753, f1752, f1748; +sub.f32 f1754, f1748, f1752; +mul.f32 f1755, f1725, 0f3F4F1BBD; +sub.f32 f1756, f1655, f1755; +fma.rn.f32 f1757, f1727, 0f3E9E377A, f1756; +mul.f32 f1758, f1749, 0f3F167918; +mul.f32 f1759, f1751, 0f3F737871; +sub.f32 f1760, f1758, f1759; +add.f32 f1761, f1760, f1757; +sub.f32 f1762, f1757, f1760; +add.f32 f1763, f1636, f1651; +add.f32 f1764, f1631, f1763; +add.f32 f1765, f1641, f1646; +add.f32 f1766, f1765, f1764; +add.f32 f1767, f1661, f1676; +add.f32 f1768, f1656, f1767; +add.f32 f1769, f1666, f1671; +add.f32 f1770, f1769, f1768; +fma.rn.f32 f1771, f1763, 0f3E9E377A, f1631; +mul.f32 f1772, f1765, 0f3F4F1BBD; +sub.f32 f1773, f1771, f1772; +sub.f32 f1774, f1661, f1676; +mul.f32 f1775, f1774, 0f3F737871; +sub.f32 f1776, f1666, f1671; +fma.rn.f32 f1777, f1776, 0f3F167918, f1775; +sub.f32 f1778, f1773, f1777; +add.f32 f1779, f1777, f1773; +mul.f32 f1780, f1763, 0f3F4F1BBD; +sub.f32 f1781, f1631, f1780; +fma.rn.f32 f1782, f1765, 0f3E9E377A, f1781; +mul.f32 f1783, f1774, 0f3F167918; +mul.f32 f1784, f1776, 0f3F737871; +sub.f32 f1785, f1783, f1784; +sub.f32 f1786, f1782, f1785; +add.f32 f1787, f1785, f1782; +fma.rn.f32 f1788, f1767, 0f3E9E377A, f1656; +mul.f32 f1789, f1769, 0f3F4F1BBD; +sub.f32 f1790, f1788, f1789; +sub.f32 f1791, f1636, f1651; +mul.f32 f1792, f1791, 0f3F737871; +sub.f32 f1793, f1641, f1646; +fma.rn.f32 f1794, 
f1793, 0f3F167918, f1792; +add.f32 f1795, f1794, f1790; +sub.f32 f1796, f1790, f1794; +mul.f32 f1797, f1767, 0f3F4F1BBD; +sub.f32 f1798, f1656, f1797; +fma.rn.f32 f1799, f1769, 0f3E9E377A, f1798; +mul.f32 f1800, f1791, 0f3F167918; +mul.f32 f1801, f1793, 0f3F737871; +sub.f32 f1802, f1800, f1801; +add.f32 f1803, f1802, f1799; +sub.f32 f1804, f1799, f1802; +add.f32 f1805, f1637, f1652; +add.f32 f1806, f1632, f1805; +add.f32 f1807, f1642, f1647; +add.f32 f1808, f1807, f1806; +add.f32 f1809, f1662, f1677; +add.f32 f1810, f1657, f1809; +add.f32 f1811, f1667, f1672; +add.f32 f1812, f1811, f1810; +fma.rn.f32 f1813, f1805, 0f3E9E377A, f1632; +mul.f32 f1814, f1807, 0f3F4F1BBD; +sub.f32 f1815, f1813, f1814; +sub.f32 f1816, f1662, f1677; +mul.f32 f1817, f1816, 0f3F737871; +sub.f32 f1818, f1667, f1672; +fma.rn.f32 f1819, f1818, 0f3F167918, f1817; +sub.f32 f1820, f1815, f1819; +add.f32 f1821, f1819, f1815; +mul.f32 f1822, f1805, 0f3F4F1BBD; +sub.f32 f1823, f1632, f1822; +fma.rn.f32 f1824, f1807, 0f3E9E377A, f1823; +mul.f32 f1825, f1816, 0f3F167918; +mul.f32 f1826, f1818, 0f3F737871; +sub.f32 f1827, f1825, f1826; +sub.f32 f1828, f1824, f1827; +add.f32 f1829, f1827, f1824; +fma.rn.f32 f1830, f1809, 0f3E9E377A, f1657; +mul.f32 f1831, f1811, 0f3F4F1BBD; +sub.f32 f1832, f1830, f1831; +sub.f32 f1833, f1637, f1652; +mul.f32 f1834, f1833, 0f3F737871; +sub.f32 f1835, f1642, f1647; +fma.rn.f32 f1836, f1835, 0f3F167918, f1834; +add.f32 f1837, f1836, f1832; +sub.f32 f1838, f1832, f1836; +mul.f32 f1839, f1809, 0f3F4F1BBD; +sub.f32 f1840, f1657, f1839; +fma.rn.f32 f1841, f1811, 0f3E9E377A, f1840; +mul.f32 f1842, f1833, 0f3F167918; +mul.f32 f1843, f1835, 0f3F737871; +sub.f32 f1844, f1842, f1843; +add.f32 f1845, f1844, f1841; +sub.f32 f1846, f1841, f1844; +add.f32 f1847, f1638, f1653; +add.f32 f1848, f1633, f1847; +add.f32 f1849, f1643, f1648; +add.f32 f1850, f1849, f1848; +add.f32 f1851, f1663, f1678; +add.f32 f1852, f1658, f1851; +add.f32 f1853, f1668, f1673; +add.f32 f1854, f1853, f1852; 
+fma.rn.f32 f1855, f1847, 0f3E9E377A, f1633; +mul.f32 f1856, f1849, 0f3F4F1BBD; +sub.f32 f1857, f1855, f1856; +sub.f32 f1858, f1663, f1678; +mul.f32 f1859, f1858, 0f3F737871; +sub.f32 f1860, f1668, f1673; +fma.rn.f32 f1861, f1860, 0f3F167918, f1859; +sub.f32 f1862, f1857, f1861; +add.f32 f1863, f1861, f1857; +mul.f32 f1864, f1847, 0f3F4F1BBD; +sub.f32 f1865, f1633, f1864; +fma.rn.f32 f1866, f1849, 0f3E9E377A, f1865; +mul.f32 f1867, f1858, 0f3F167918; +mul.f32 f1868, f1860, 0f3F737871; +sub.f32 f1869, f1867, f1868; +sub.f32 f1870, f1866, f1869; +add.f32 f1871, f1869, f1866; +fma.rn.f32 f1872, f1851, 0f3E9E377A, f1658; +mul.f32 f1873, f1853, 0f3F4F1BBD; +sub.f32 f1874, f1872, f1873; +sub.f32 f1875, f1638, f1653; +mul.f32 f1876, f1875, 0f3F737871; +sub.f32 f1877, f1643, f1648; +fma.rn.f32 f1878, f1877, 0f3F167918, f1876; +add.f32 f1879, f1878, f1874; +sub.f32 f1880, f1874, f1878; +mul.f32 f1881, f1851, 0f3F4F1BBD; +sub.f32 f1882, f1658, f1881; +fma.rn.f32 f1883, f1853, 0f3E9E377A, f1882; +mul.f32 f1884, f1875, 0f3F167918; +mul.f32 f1885, f1877, 0f3F737871; +sub.f32 f1886, f1884, f1885; +add.f32 f1887, f1886, f1883; +sub.f32 f1888, f1883, f1886; +mul.f32 f1889, f1736, 0f3F77F511; +mul.f32 f1890, f1753, 0f3E7EA890; +sub.f32 f1891, f1889, f1890; +mul.f32 f1892, f1753, 0f3F77F511; +fma.rn.f32 f1893, f1736, 0f3E7EA890, f1892; +mul.f32 f1894, f1778, 0f3F6055A2; +mul.f32 f1895, f1795, 0f3EF6A86B; +sub.f32 f1896, f1894, f1895; +mul.f32 f1897, f1795, 0f3F6055A2; +fma.rn.f32 f1898, f1778, 0f3EF6A86B, f1897; +mul.f32 f1899, f1820, 0f3F3A9DB0; +mul.f32 f1900, f1837, 0f3F2F3E7B; +sub.f32 f1901, f1899, f1900; +mul.f32 f1902, f1837, 0f3F3A9DB0; +fma.rn.f32 f1903, f1820, 0f3F2F3E7B, f1902; +mul.f32 f1904, f1862, 0f3F092BF2; +mul.f32 f1905, f1879, 0f3F5825E0; +sub.f32 f1906, f1904, f1905; +mul.f32 f1907, f1879, 0f3F092BF2; +fma.rn.f32 f1908, f1862, 0f3F5825E0, f1907; +mul.f32 f1909, f1744, 0f3F6055A2; +mul.f32 f1910, f1761, 0f3EF6A86B; +sub.f32 f1911, f1909, f1910; +mul.f32 f1912, 
f1761, 0f3F6055A2; +fma.rn.f32 f1913, f1744, 0f3EF6A86B, f1912; +mul.f32 f1914, f1786, 0f3F092BF2; +mul.f32 f1915, f1803, 0f3F5825E0; +sub.f32 f1916, f1914, f1915; +mul.f32 f1917, f1803, 0f3F092BF2; +fma.rn.f32 f1918, f1786, 0f3F5825E0, f1917; +mul.f32 f1919, f1828, 0f3D809851; +mul.f32 f1920, f1845, 0f3F7F7EAE; +sub.f32 f1921, f1919, f1920; +mul.f32 f1922, f1845, 0f3D809851; +fma.rn.f32 f1923, f1828, 0f3F7F7EAE, f1922; +mul.f32 f1924, f1870, 0fBED9FFBE; +mul.f32 f1925, f1887, 0f3F67A2BF; +sub.f32 f1926, f1924, f1925; +mul.f32 f1927, f1887, 0fBED9FFBE; +fma.rn.f32 f1928, f1870, 0f3F67A2BF, f1927; +mul.f32 f1929, f1745, 0f3F3A9DB0; +mul.f32 f1930, f1762, 0f3F2F3E7B; +sub.f32 f1931, f1929, f1930; +mul.f32 f1932, f1762, 0f3F3A9DB0; +fma.rn.f32 f1933, f1745, 0f3F2F3E7B, f1932; +mul.f32 f1934, f1787, 0f3D809851; +mul.f32 f1935, f1804, 0f3F7F7EAE; +sub.f32 f1936, f1934, f1935; +mul.f32 f1937, f1804, 0f3D809851; +fma.rn.f32 f1938, f1787, 0f3F7F7EAE, f1937; +mul.f32 f1939, f1829, 0fBF232E38; +mul.f32 f1940, f1846, 0f3F45405B; +sub.f32 f1941, f1939, f1940; +mul.f32 f1942, f1846, 0fBF232E38; +fma.rn.f32 f1943, f1829, 0f3F45405B, f1942; +mul.f32 f1944, f1871, 0fBF7DFB3B; +mul.f32 f1945, f1888, 0f3E00575B; +sub.f32 f1946, f1944, f1945; +mul.f32 f1947, f1888, 0fBF7DFB3B; +fma.rn.f32 f1948, f1871, 0f3E00575B, f1947; +mul.f32 f1949, f1737, 0f3F092BF2; +mul.f32 f1950, f1754, 0f3F5825E0; +sub.f32 f1951, f1949, f1950; +mul.f32 f1952, f1754, 0f3F092BF2; +fma.rn.f32 f1953, f1737, 0f3F5825E0, f1952; +mul.f32 f1954, f1779, 0fBED9FFBE; +mul.f32 f1955, f1796, 0f3F67A2BF; +sub.f32 f1956, f1954, f1955; +mul.f32 f1957, f1796, 0fBED9FFBE; +fma.rn.f32 f1958, f1779, 0f3F67A2BF, f1957; +mul.f32 f1959, f1821, 0fBF7DFB3B; +mul.f32 f1960, f1838, 0f3E00575B; +sub.f32 f1961, f1959, f1960; +mul.f32 f1962, f1838, 0fBF7DFB3B; +fma.rn.f32 f1963, f1821, 0f3E00575B, f1962; +mul.f32 f1964, f1863, 0fBF232E38; +mul.f32 f1965, f1880, 0fBF45405B; +sub.f32 f1966, f1964, f1965; +mul.f32 f1967, f1880, 0fBF232E38; 
+fma.rn.f32 f1968, f1863, 0fBF45405B, f1967; +add.f32 f1969, f1724, f1850; +add.f32 f1970, f1682, f1969; +add.f32 f1971, f1766, f1808; +add.f32 f1972, f1728, f1854; +add.f32 f1973, f1686, f1972; +add.f32 f1974, f1770, f1812; +fma.rn.f32 f1975, f1969, 0f3E9E377A, f1682; +mul.f32 f1976, f1971, 0f3F4F1BBD; +sub.f32 f1977, f1975, f1976; +sub.f32 f1978, f1728, f1854; +mul.f32 f1979, f1978, 0f3F737871; +sub.f32 f1980, f1770, f1812; +fma.rn.f32 f1981, f1980, 0f3F167918, f1979; +mul.f32 f1982, f1969, 0f3F4F1BBD; +sub.f32 f1983, f1682, f1982; +fma.rn.f32 f1984, f1971, 0f3E9E377A, f1983; +mul.f32 f1985, f1978, 0f3F167918; +mul.f32 f1986, f1980, 0f3F737871; +sub.f32 f1987, f1985, f1986; +fma.rn.f32 f1988, f1972, 0f3E9E377A, f1686; +mul.f32 f1989, f1974, 0f3F4F1BBD; +sub.f32 f1990, f1988, f1989; +sub.f32 f1991, f1724, f1850; +mul.f32 f1992, f1991, 0f3F737871; +sub.f32 f1993, f1766, f1808; +fma.rn.f32 f1994, f1993, 0f3F167918, f1992; +mul.f32 f1995, f1972, 0f3F4F1BBD; +sub.f32 f1996, f1686, f1995; +fma.rn.f32 f1997, f1974, 0f3E9E377A, f1996; +mul.f32 f1998, f1991, 0f3F167918; +mul.f32 f1999, f1993, 0f3F737871; +sub.f32 f2000, f1998, f1999; +add.f32 f2001, f1891, f1906; +add.f32 f2002, f1694, f2001; +add.f32 f2003, f1896, f1901; +add.f32 f2004, f1893, f1908; +add.f32 f2005, f1711, f2004; +add.f32 f2006, f1898, f1903; +fma.rn.f32 f2007, f2001, 0f3E9E377A, f1694; +mul.f32 f2008, f2003, 0f3F4F1BBD; +sub.f32 f2009, f2007, f2008; +sub.f32 f2010, f1893, f1908; +mul.f32 f2011, f2010, 0f3F737871; +sub.f32 f2012, f1898, f1903; +fma.rn.f32 f2013, f2012, 0f3F167918, f2011; +mul.f32 f2014, f2001, 0f3F4F1BBD; +sub.f32 f2015, f1694, f2014; +fma.rn.f32 f2016, f2003, 0f3E9E377A, f2015; +mul.f32 f2017, f2010, 0f3F167918; +mul.f32 f2018, f2012, 0f3F737871; +sub.f32 f2019, f2017, f2018; +fma.rn.f32 f2020, f2004, 0f3E9E377A, f1711; +mul.f32 f2021, f2006, 0f3F4F1BBD; +sub.f32 f2022, f2020, f2021; +sub.f32 f2023, f1891, f1906; +mul.f32 f2024, f2023, 0f3F737871; +sub.f32 f2025, f1896, f1901; 
+fma.rn.f32 f2026, f2025, 0f3F167918, f2024; +mul.f32 f2027, f2004, 0f3F4F1BBD; +sub.f32 f2028, f1711, f2027; +fma.rn.f32 f2029, f2006, 0f3E9E377A, f2028; +mul.f32 f2030, f2023, 0f3F167918; +mul.f32 f2031, f2025, 0f3F737871; +sub.f32 f2032, f2030, f2031; +add.f32 f2033, f1911, f1926; +add.f32 f2034, f1702, f2033; +add.f32 f2035, f1916, f1921; +add.f32 f2036, f1913, f1928; +add.f32 f2037, f1719, f2036; +add.f32 f2038, f1918, f1923; +fma.rn.f32 f2039, f2033, 0f3E9E377A, f1702; +mul.f32 f2040, f2035, 0f3F4F1BBD; +sub.f32 f2041, f2039, f2040; +sub.f32 f2042, f1913, f1928; +mul.f32 f2043, f2042, 0f3F737871; +sub.f32 f2044, f1918, f1923; +fma.rn.f32 f2045, f2044, 0f3F167918, f2043; +mul.f32 f2046, f2033, 0f3F4F1BBD; +sub.f32 f2047, f1702, f2046; +fma.rn.f32 f2048, f2035, 0f3E9E377A, f2047; +mul.f32 f2049, f2042, 0f3F167918; +mul.f32 f2050, f2044, 0f3F737871; +sub.f32 f2051, f2049, f2050; +fma.rn.f32 f2052, f2036, 0f3E9E377A, f1719; +mul.f32 f2053, f2038, 0f3F4F1BBD; +sub.f32 f2054, f2052, f2053; +sub.f32 f2055, f1911, f1926; +mul.f32 f2056, f2055, 0f3F737871; +sub.f32 f2057, f1916, f1921; +fma.rn.f32 f2058, f2057, 0f3F167918, f2056; +mul.f32 f2059, f2036, 0f3F4F1BBD; +sub.f32 f2060, f1719, f2059; +fma.rn.f32 f2061, f2038, 0f3E9E377A, f2060; +mul.f32 f2062, f2055, 0f3F167918; +mul.f32 f2063, f2057, 0f3F737871; +sub.f32 f2064, f2062, f2063; +add.f32 f2065, f1931, f1946; +add.f32 f2066, f1703, f2065; +add.f32 f2067, f1936, f1941; +add.f32 f2068, f1933, f1948; +add.f32 f2069, f1720, f2068; +add.f32 f2070, f1938, f1943; +fma.rn.f32 f2071, f2065, 0f3E9E377A, f1703; +mul.f32 f2072, f2067, 0f3F4F1BBD; +sub.f32 f2073, f2071, f2072; +sub.f32 f2074, f1933, f1948; +mul.f32 f2075, f2074, 0f3F737871; +sub.f32 f2076, f1938, f1943; +fma.rn.f32 f2077, f2076, 0f3F167918, f2075; +mul.f32 f2078, f2065, 0f3F4F1BBD; +sub.f32 f2079, f1703, f2078; +fma.rn.f32 f2080, f2067, 0f3E9E377A, f2079; +mul.f32 f2081, f2074, 0f3F167918; +mul.f32 f2082, f2076, 0f3F737871; +sub.f32 f2083, f2081, f2082; 
+fma.rn.f32 f2084, f2068, 0f3E9E377A, f1720; +mul.f32 f2085, f2070, 0f3F4F1BBD; +sub.f32 f2086, f2084, f2085; +sub.f32 f2087, f1931, f1946; +mul.f32 f2088, f2087, 0f3F737871; +sub.f32 f2089, f1936, f1941; +fma.rn.f32 f2090, f2089, 0f3F167918, f2088; +mul.f32 f2091, f2068, 0f3F4F1BBD; +sub.f32 f2092, f1720, f2091; +fma.rn.f32 f2093, f2070, 0f3E9E377A, f2092; +mul.f32 f2094, f2087, 0f3F167918; +mul.f32 f2095, f2089, 0f3F737871; +sub.f32 f2096, f2094, f2095; +add.f32 f2097, f1951, f1966; +add.f32 f2098, f1695, f2097; +add.f32 f2099, f1956, f1961; +add.f32 f2100, f1953, f1968; +add.f32 f2101, f1712, f2100; +add.f32 f2102, f1958, f1963; +fma.rn.f32 f2103, f2097, 0f3E9E377A, f1695; +mul.f32 f2104, f2099, 0f3F4F1BBD; +sub.f32 f2105, f2103, f2104; +sub.f32 f2106, f1953, f1968; +mul.f32 f2107, f2106, 0f3F737871; +sub.f32 f2108, f1958, f1963; +fma.rn.f32 f2109, f2108, 0f3F167918, f2107; +mul.f32 f2110, f2097, 0f3F4F1BBD; +sub.f32 f2111, f1695, f2110; +fma.rn.f32 f2112, f2099, 0f3E9E377A, f2111; +mul.f32 f2113, f2106, 0f3F167918; +mul.f32 f2114, f2108, 0f3F737871; +sub.f32 f2115, f2113, f2114; +fma.rn.f32 f2116, f2100, 0f3E9E377A, f1712; +mul.f32 f2117, f2102, 0f3F4F1BBD; +sub.f32 f2118, f2116, f2117; +sub.f32 f2119, f1951, f1966; +mul.f32 f2120, f2119, 0f3F737871; +sub.f32 f2121, f1956, f1961; +fma.rn.f32 f2122, f2121, 0f3F167918, f2120; +mul.f32 f2123, f2100, 0f3F4F1BBD; +sub.f32 f2124, f1712, f2123; +fma.rn.f32 f2125, f2102, 0f3E9E377A, f2124; +mul.f32 f2126, f2119, 0f3F167918; +mul.f32 f2127, f2121, 0f3F737871; +sub.f32 f2128, f2126, f2127; +add.f32 %0, f1971, f1970; +add.f32 %1, f1974, f1973; +add.f32 %3, f2006, f2005; +add.f32 %2, f2003, f2002; +add.f32 %5, f2038, f2037; +add.f32 %4, f2035, f2034; +add.f32 %7, f2070, f2069; +add.f32 %6, f2067, f2066; +add.f32 %9, f2102, f2101; +add.f32 %8, f2099, f2098; +add.f32 %11, f1994, f1990; +sub.f32 %10, f1977, f1981; +add.f32 %13, f2026, f2022; +sub.f32 %12, f2009, f2013; +add.f32 %15, f2058, f2054; +sub.f32 %14, f2041, f2045; 
+add.f32 %17, f2090, f2086; +sub.f32 %16, f2073, f2077; +add.f32 %19, f2122, f2118; +sub.f32 %18, f2105, f2109; +sub.f32 %20, f1984, f1987; +add.f32 %21, f2000, f1997; +add.f32 %23, f2032, f2029; +sub.f32 %22, f2016, f2019; +add.f32 %25, f2064, f2061; +sub.f32 %24, f2048, f2051; +add.f32 %27, f2096, f2093; +sub.f32 %26, f2080, f2083; +add.f32 %29, f2128, f2125; +sub.f32 %28, f2112, f2115; +add.f32 %30, f1987, f1984; +sub.f32 %31, f1997, f2000; +sub.f32 %33, f2029, f2032; +add.f32 %32, f2019, f2016; +sub.f32 %35, f2061, f2064; +add.f32 %34, f2051, f2048; +sub.f32 %37, f2093, f2096; +add.f32 %36, f2083, f2080; +sub.f32 %39, f2125, f2128; +add.f32 %38, f2115, f2112; +sub.f32 %41, f1990, f1994; +add.f32 %40, f1981, f1977; +sub.f32 %43, f2022, f2026; +add.f32 %42, f2013, f2009; +sub.f32 %45, f2054, f2058; +add.f32 %44, f2045, f2041; +sub.f32 %47, f2086, f2090; +add.f32 %46, f2077, f2073; +sub.f32 %49, f2118, f2122; +add.f32 %48, f2109, f2105; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_15625), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), 
"f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..cd14a4ea4467e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_fwd.hpp.inc @@ -0,0 +1,1692 @@ +#ifndef CUFFTDX_FFT_15_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_15_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<748, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<73>; +.reg .b32 r<1485>; +.reg .f64 fd<57>; +.reg .b64 rd<2>; +mov.f64 fd23, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd23; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd24, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd24; +} 
+mov.b32 r228, {rs2, rs2}; +mov.f64 fd29, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd29; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd30, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd30; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd23; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd24; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %36, %54; +} +{ +add.f16x2 r4, %30, r1; +} +{ +add.f16x2 r7, %42, %48; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %37, %55; +} +{ +add.f16x2 r16, %31, r13; +} +{ +add.f16x2 r19, %43, %49; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %36, %54; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %30, r28; +} +{ +add.f16x2 r34, %42, %48; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %37, %55; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %43, %49; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %36, %54; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %30, r64; +} +{ +add.f16x2 r70, %42, %48; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %37, %55; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %43, %49; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %36, %54; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %30, r100; +} +{ +add.f16x2 r106, %42, %48; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %37, %55; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %43, %49; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %36, %54; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %30, r136; +} +{ +add.f16x2 r142, %42, %48; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; 
+} +{ +sub.f16x2 r151, %37, %55; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %43, %49; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %37, %55; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %31, r172; +} +{ +add.f16x2 r178, %43, %49; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %36, %54; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %42, %48; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %37, %55; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %31, r208; +} +{ +add.f16x2 r214, %43, %49; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %36, %54; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %42, %48; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %37, %55; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %31, r244; +} +{ +add.f16x2 r250, %43, %49; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %36, %54; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %42, %48; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %37, %55; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %31, r280; +} +{ +add.f16x2 r286, %43, %49; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %36, %54; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %42, %48; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs9, fd23; +} +mov.b32 r522, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd24; +} +mov.b32 r540, {rs10, rs10}; +{ +cvt.rn.f16.f64 
rs11, fd29; +} +mov.b32 r594, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd30; +} +mov.b32 r612, {rs12, rs12}; +{ +cvt.rn.f16.f64 rs13, fd23; +} +mov.b32 r603, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd24; +} +{ +neg.f16 rs15, rs14; +} +mov.b32 r618, {rs15, rs15}; +{ +add.f16x2 r313, %38, %56; +} +{ +add.f16x2 r316, %32, r313; +} +{ +add.f16x2 r319, %44, %50; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %39, %57; +} +{ +add.f16x2 r328, %33, r325; +} +{ +add.f16x2 r331, %45, %51; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %38, %56; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %32, r340; +} +{ +add.f16x2 r346, %44, %50; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %39, %57; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %45, %51; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %38, %56; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %32, r376; +} +{ +add.f16x2 r382, %44, %50; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %39, %57; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %45, %51; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %38, %56; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %32, r412; +} +{ +add.f16x2 r418, %44, %50; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %39, %57; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %45, %51; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %38, %56; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %32, r448; +} +{ +add.f16x2 r454, %44, %50; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %39, %57; 
+} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %45, %51; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %39, %57; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %33, r484; +} +{ +add.f16x2 r490, %45, %51; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %38, %56; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %44, %50; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %39, %57; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %33, r520; +} +{ +add.f16x2 r526, %45, %51; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %38, %56; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %44, %50; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %39, %57; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %33, r556; +} +{ +add.f16x2 r562, %45, %51; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %38, %56; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %44, %50; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %39, %57; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %33, r592; +} +{ +add.f16x2 r598, %45, %51; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %38, %56; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %44, %50; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs17, fd23; +} +mov.b32 r834, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd24; +} +mov.b32 r852, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd29; +} +mov.b32 r906, 
{rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd30; +} +mov.b32 r924, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd23; +} +mov.b32 r915, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd24; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r930, {rs23, rs23}; +{ +add.f16x2 r625, %40, %58; +} +{ +add.f16x2 r628, %34, r625; +} +{ +add.f16x2 r631, %46, %52; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %41, %59; +} +{ +add.f16x2 r640, %35, r637; +} +{ +add.f16x2 r643, %47, %53; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %40, %58; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %34, r652; +} +{ +add.f16x2 r658, %46, %52; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %41, %59; +} +{ +mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %47, %53; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %40, %58; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %34, r688; +} +{ +add.f16x2 r694, %46, %52; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %41, %59; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %47, %53; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 r721, %40, %58; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %34, r724; +} +{ +add.f16x2 r730, %46, %52; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %41, %59; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %47, %53; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %40, %58; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %34, r760; +} +{ +add.f16x2 r766, %46, %52; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %41, %59; +} +{ +mul.f16x2 r778, r775, 
r924; +} +{ +sub.f16x2 r781, %47, %53; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %41, %59; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %35, r796; +} +{ +add.f16x2 r802, %47, %53; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %40, %58; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %46, %52; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %41, %59; +} +{ +mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %35, r832; +} +{ +add.f16x2 r838, %47, %53; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 r844, r835, r841; +} +{ +sub.f16x2 r847, %40, %58; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %46, %52; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %41, %59; +} +{ +mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %35, r868; +} +{ +add.f16x2 r874, %47, %53; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %40, %58; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %46, %52; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %41, %59; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %35, r904; +} +{ +add.f16x2 r910, %47, %53; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %40, %58; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %46, %52; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +mov.f64 fd19, 0d3FED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs25, fd19; +} +mov.f64 fd20, 0dBFDA07F921061AD1; +{ +cvt.rn.f16.f64 rs26, fd20; +} +mov.f64 fd21, 0d3FE5698496E20BD8; +{ +cvt.rn.f16.f64 rs27, fd21; +} 
+mov.f64 fd22, 0dBFE7C7D7A833BEC2; +{ +cvt.rn.f16.f64 rs28, fd22; +} +{ +cvt.rn.f16.f64 rs29, fd23; +} +{ +cvt.rn.f16.f64 rs30, fd24; +} +mov.f64 fd25, 0dBFBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs31, fd25; +} +mov.f64 fd26, 0dBFEFD31F94F867C6; +{ +cvt.rn.f16.f64 rs32, fd26; +} +mov.f64 fd55, 0dBFE0000000000000; +mov.f64 fd56, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs35, fd29; +} +{ +cvt.rn.f16.f64 rs36, fd30; +} +mov.f64 fd33, 0dBFEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs39, fd33; +} +mov.f64 fd34, 0d3FCA9CD9AC4258F6; +{ +cvt.rn.f16.f64 rs40, fd34; +} +mov.b32 r951, {rs25, rs25}; +{ +mul.f16x2 r937, r370, r951; +} +mov.b32 r948, {rs26, rs26}; +{ +mul.f16x2 r940, r514, r948; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r370, r948; +} +{ +fma.rn.f16x2 r949, r514, r951, r946; +} +mov.b32 r983, {rs27, rs27}; +{ +mul.f16x2 r953, r682, r983; +} +mov.b32 r980, {rs28, rs28}; +{ +mul.f16x2 r956, r826, r980; +} +{ +sub.f16x2 r959, r953, r956; +} +{ +mul.f16x2 r962, r682, r980; +} +{ +fma.rn.f16x2 r965, r826, r983, r962; +} +{ +mul.f16x2 r969, r442, r983; +} +{ +mul.f16x2 r972, r586, r980; +} +{ +sub.f16x2 r975, r969, r972; +} +{ +mul.f16x2 r978, r442, r980; +} +{ +fma.rn.f16x2 r981, r586, r983, r978; +} +mov.b32 r1047, {rs31, rs31}; +{ +mul.f16x2 r985, r754, r1047; +} +mov.b32 r1044, {rs32, rs32}; +{ +mul.f16x2 r988, r898, r1044; +} +{ +sub.f16x2 r991, r985, r988; +} +{ +mul.f16x2 r994, r754, r1044; +} +{ +fma.rn.f16x2 r997, r898, r1047, r994; +} +mov.b32 r1015, {rs29, rs29}; +{ +mul.f16x2 r1001, r478, r1015; +} +mov.b32 r1012, {rs30, rs30}; +{ +mul.f16x2 r1004, r622, r1012; +} +{ +sub.f16x2 r1007, r1001, r1004; +} +{ +mul.f16x2 r1010, r478, r1012; +} +{ +fma.rn.f16x2 r1013, r622, r1015, r1010; +} +mov.b32 r1031, {rs35, rs35}; +{ +mul.f16x2 r1017, r790, r1031; +} +mov.b32 r1028, {rs36, rs36}; +{ +mul.f16x2 r1020, r934, r1028; +} +{ +sub.f16x2 r1023, r1017, r1020; +} +{ +mul.f16x2 r1026, r790, r1028; +} +{ +fma.rn.f16x2 r1029, r934, r1031, r1026; +} +{ +mul.f16x2 r1033, 
r406, r1047; +} +{ +mul.f16x2 r1036, r550, r1044; +} +{ +sub.f16x2 r1039, r1033, r1036; +} +{ +mul.f16x2 r1042, r406, r1044; +} +{ +fma.rn.f16x2 r1045, r550, r1047, r1042; +} +mov.b32 r1063, {rs39, rs39}; +{ +mul.f16x2 r1049, r718, r1063; +} +mov.b32 r1060, {rs40, rs40}; +{ +mul.f16x2 r1052, r862, r1060; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r718, r1060; +} +{ +fma.rn.f16x2 r1061, r862, r1063, r1058; +} +{ +cvt.rn.f16.f64 rs53, fd55; +} +mov.b32 r1136, {rs53, rs53}; +{ +cvt.rn.f16.f64 rs54, fd56; +} +{ +neg.f16 rs55, rs54; +} +mov.b32 r1145, {rs55, rs55}; +{ +add.f16x2 r1065, r322, r634; +} +{ +add.f16x2 %0, r10, r1065; +} +{ +add.f16x2 r1071, r334, r646; +} +{ +add.f16x2 %1, r22, r1071; +} +{ +add.f16x2 r1077, r322, r634; +} +{ +mul.f16x2 r1080, r1077, r1136; +} +{ +add.f16x2 r1083, r10, r1080; +} +{ +sub.f16x2 r1086, r334, r646; +} +{ +mul.f16x2 r1089, r1086, r1145; +} +{ +add.f16x2 %10, r1083, r1089; +} +{ +add.f16x2 r1095, r322, r634; +} +{ +mul.f16x2 r1098, r1095, r1136; +} +{ +add.f16x2 r1101, r10, r1098; +} +{ +sub.f16x2 r1104, r334, r646; +} +{ +mul.f16x2 r1107, r1104, r1145; +} +{ +sub.f16x2 %20, r1101, r1107; +} +{ +add.f16x2 r1113, r334, r646; +} +{ +mul.f16x2 r1116, r1113, r1136; +} +{ +add.f16x2 r1119, r22, r1116; +} +{ +sub.f16x2 r1122, r322, r634; +} +{ +mul.f16x2 r1125, r1122, r1145; +} +{ +sub.f16x2 %11, r1119, r1125; +} +{ +add.f16x2 r1131, r334, r646; +} +{ +mul.f16x2 r1134, r1131, r1136; +} +{ +add.f16x2 r1137, r22, r1134; +} +{ +sub.f16x2 r1140, r322, r634; +} +{ +mul.f16x2 r1143, r1140, r1145; +} +{ +add.f16x2 %21, r1137, r1143; +} +{ +cvt.rn.f16.f64 rs57, fd55; +} +mov.b32 r1220, {rs57, rs57}; +{ +cvt.rn.f16.f64 rs58, fd56; +} +{ +neg.f16 rs59, rs58; +} +mov.b32 r1229, {rs59, rs59}; +{ +add.f16x2 r1149, r943, r959; +} +{ +add.f16x2 %2, r58, r1149; +} +{ +add.f16x2 r1155, r949, r965; +} +{ +add.f16x2 %3, r202, r1155; +} +{ +add.f16x2 r1161, r943, r959; +} +{ +mul.f16x2 r1164, r1161, r1220; +} +{ +add.f16x2 r1167, r58, 
r1164; +} +{ +sub.f16x2 r1170, r949, r965; +} +{ +mul.f16x2 r1173, r1170, r1229; +} +{ +add.f16x2 %12, r1167, r1173; +} +{ +add.f16x2 r1179, r943, r959; +} +{ +mul.f16x2 r1182, r1179, r1220; +} +{ +add.f16x2 r1185, r58, r1182; +} +{ +sub.f16x2 r1188, r949, r965; +} +{ +mul.f16x2 r1191, r1188, r1229; +} +{ +sub.f16x2 %22, r1185, r1191; +} +{ +add.f16x2 r1197, r949, r965; +} +{ +mul.f16x2 r1200, r1197, r1220; +} +{ +add.f16x2 r1203, r202, r1200; +} +{ +sub.f16x2 r1206, r943, r959; +} +{ +mul.f16x2 r1209, r1206, r1229; +} +{ +sub.f16x2 %13, r1203, r1209; +} +{ +add.f16x2 r1215, r949, r965; +} +{ +mul.f16x2 r1218, r1215, r1220; +} +{ +add.f16x2 r1221, r202, r1218; +} +{ +sub.f16x2 r1224, r943, r959; +} +{ +mul.f16x2 r1227, r1224, r1229; +} +{ +add.f16x2 %23, r1221, r1227; +} +{ +cvt.rn.f16.f64 rs61, fd55; +} +mov.b32 r1304, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd56; +} +{ +neg.f16 rs63, rs62; +} +mov.b32 r1313, {rs63, rs63}; +{ +add.f16x2 r1233, r975, r991; +} +{ +add.f16x2 %4, r130, r1233; +} +{ +add.f16x2 r1239, r981, r997; +} +{ +add.f16x2 %5, r274, r1239; +} +{ +add.f16x2 r1245, r975, r991; +} +{ +mul.f16x2 r1248, r1245, r1304; +} +{ +add.f16x2 r1251, r130, r1248; +} +{ +sub.f16x2 r1254, r981, r997; +} +{ +mul.f16x2 r1257, r1254, r1313; +} +{ +add.f16x2 %14, r1251, r1257; +} +{ +add.f16x2 r1263, r975, r991; +} +{ +mul.f16x2 r1266, r1263, r1304; +} +{ +add.f16x2 r1269, r130, r1266; +} +{ +sub.f16x2 r1272, r981, r997; +} +{ +mul.f16x2 r1275, r1272, r1313; +} +{ +sub.f16x2 %24, r1269, r1275; +} +{ +add.f16x2 r1281, r981, r997; +} +{ +mul.f16x2 r1284, r1281, r1304; +} +{ +add.f16x2 r1287, r274, r1284; +} +{ +sub.f16x2 r1290, r975, r991; +} +{ +mul.f16x2 r1293, r1290, r1313; +} +{ +sub.f16x2 %15, r1287, r1293; +} +{ +add.f16x2 r1299, r981, r997; +} +{ +mul.f16x2 r1302, r1299, r1304; +} +{ +add.f16x2 r1305, r274, r1302; +} +{ +sub.f16x2 r1308, r975, r991; +} +{ +mul.f16x2 r1311, r1308, r1313; +} +{ +add.f16x2 %25, r1305, r1311; +} +{ +cvt.rn.f16.f64 rs65, fd55; +} 
+mov.b32 r1388, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs66, fd56; +} +{ +neg.f16 rs67, rs66; +} +mov.b32 r1397, {rs67, rs67}; +{ +add.f16x2 r1317, r1007, r1023; +} +{ +add.f16x2 %6, r166, r1317; +} +{ +add.f16x2 r1323, r1013, r1029; +} +{ +add.f16x2 %7, r310, r1323; +} +{ +add.f16x2 r1329, r1007, r1023; +} +{ +mul.f16x2 r1332, r1329, r1388; +} +{ +add.f16x2 r1335, r166, r1332; +} +{ +sub.f16x2 r1338, r1013, r1029; +} +{ +mul.f16x2 r1341, r1338, r1397; +} +{ +add.f16x2 %16, r1335, r1341; +} +{ +add.f16x2 r1347, r1007, r1023; +} +{ +mul.f16x2 r1350, r1347, r1388; +} +{ +add.f16x2 r1353, r166, r1350; +} +{ +sub.f16x2 r1356, r1013, r1029; +} +{ +mul.f16x2 r1359, r1356, r1397; +} +{ +sub.f16x2 %26, r1353, r1359; +} +{ +add.f16x2 r1365, r1013, r1029; +} +{ +mul.f16x2 r1368, r1365, r1388; +} +{ +add.f16x2 r1371, r310, r1368; +} +{ +sub.f16x2 r1374, r1007, r1023; +} +{ +mul.f16x2 r1377, r1374, r1397; +} +{ +sub.f16x2 %17, r1371, r1377; +} +{ +add.f16x2 r1383, r1013, r1029; +} +{ +mul.f16x2 r1386, r1383, r1388; +} +{ +add.f16x2 r1389, r310, r1386; +} +{ +sub.f16x2 r1392, r1007, r1023; +} +{ +mul.f16x2 r1395, r1392, r1397; +} +{ +add.f16x2 %27, r1389, r1395; +} +{ +cvt.rn.f16.f64 rs69, fd55; +} +mov.b32 r1472, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs70, fd56; +} +{ +neg.f16 rs71, rs70; +} +mov.b32 r1481, {rs71, rs71}; +{ +add.f16x2 r1401, r1039, r1055; +} +{ +add.f16x2 %8, r94, r1401; +} +{ +add.f16x2 r1407, r1045, r1061; +} +{ +add.f16x2 %9, r238, r1407; +} +{ +add.f16x2 r1413, r1039, r1055; +} +{ +mul.f16x2 r1416, r1413, r1472; +} +{ +add.f16x2 r1419, r94, r1416; +} +{ +sub.f16x2 r1422, r1045, r1061; +} +{ +mul.f16x2 r1425, r1422, r1481; +} +{ +add.f16x2 %18, r1419, r1425; +} +{ +add.f16x2 r1431, r1039, r1055; +} +{ +mul.f16x2 r1434, r1431, r1472; +} +{ +add.f16x2 r1437, r94, r1434; +} +{ +sub.f16x2 r1440, r1045, r1061; +} +{ +mul.f16x2 r1443, r1440, r1481; +} +{ +sub.f16x2 %28, r1437, r1443; +} +{ +add.f16x2 r1449, r1045, r1061; +} +{ +mul.f16x2 r1452, r1449, r1472; +} +{ 
+add.f16x2 r1455, r238, r1452; +} +{ +sub.f16x2 r1458, r1039, r1055; +} +{ +mul.f16x2 r1461, r1458, r1481; +} +{ +sub.f16x2 %19, r1455, r1461; +} +{ +add.f16x2 r1467, r1045, r1061; +} +{ +mul.f16x2 r1470, r1467, r1472; +} +{ +add.f16x2 r1473, r238, r1470; +} +{ +sub.f16x2 r1476, r1039, r1055; +} +{ +mul.f16x2 r1479, r1476, r1481; +} +{ +add.f16x2 %29, r1473, r1479; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), 
"r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..6196d668ae6b2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp16_inv.hpp.inc @@ -0,0 +1,1688 @@ +#ifndef CUFFTDX_FFT_15_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_15_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<950, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<69>; +.reg .b32 r<1485>; +.reg .f64 fd<57>; +.reg .b64 rd<2>; +mov.f64 fd23, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd23; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd18, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd18; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd29, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd29; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd16, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd16; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd23; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd18; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %36, %54; +} +{ +add.f16x2 r4, %30, r1; +} +{ +add.f16x2 r7, %42, %48; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %37, %55; +} +{ +add.f16x2 r16, %31, r13; +} +{ +add.f16x2 r19, %43, %49; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %36, %54; +} +{ +mul.f16x2 
r28, r25, r210; +} +{ +add.f16x2 r31, %30, r28; +} +{ +add.f16x2 r34, %42, %48; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %37, %55; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %43, %49; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %36, %54; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %30, r64; +} +{ +add.f16x2 r70, %42, %48; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %37, %55; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %43, %49; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %36, %54; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %30, r100; +} +{ +add.f16x2 r106, %42, %48; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %37, %55; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %43, %49; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %36, %54; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %30, r136; +} +{ +add.f16x2 r142, %42, %48; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %37, %55; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %43, %49; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %37, %55; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %31, r172; +} +{ +add.f16x2 r178, %43, %49; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %36, %54; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %42, %48; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %37, %55; +} +{ 
+mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %31, r208; +} +{ +add.f16x2 r214, %43, %49; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %36, %54; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %42, %48; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %37, %55; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %31, r244; +} +{ +add.f16x2 r250, %43, %49; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %36, %54; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %42, %48; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %37, %55; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %31, r280; +} +{ +add.f16x2 r286, %43, %49; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %36, %54; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %42, %48; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs11, fd23; +} +mov.b32 r522, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd18; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r540, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs15, fd29; +} +mov.b32 r594, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd16; +} +{ +neg.f16 rs17, rs16; +} +mov.b32 r612, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs19, fd23; +} +mov.b32 r603, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd18; +} +mov.b32 r618, {rs20, rs20}; +{ +add.f16x2 r313, %38, %56; +} +{ +add.f16x2 r316, %32, r313; +} +{ +add.f16x2 r319, %44, %50; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %39, %57; +} +{ +add.f16x2 r328, %33, r325; +} +{ +add.f16x2 r331, %45, %51; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %38, %56; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %32, r340; 
+} +{ +add.f16x2 r346, %44, %50; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %39, %57; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %45, %51; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %38, %56; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %32, r376; +} +{ +add.f16x2 r382, %44, %50; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %39, %57; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %45, %51; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %38, %56; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %32, r412; +} +{ +add.f16x2 r418, %44, %50; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %39, %57; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %45, %51; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %38, %56; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %32, r448; +} +{ +add.f16x2 r454, %44, %50; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %39, %57; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %45, %51; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %39, %57; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %33, r484; +} +{ +add.f16x2 r490, %45, %51; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %38, %56; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %44, %50; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %39, %57; +} +{ 
+mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %33, r520; +} +{ +add.f16x2 r526, %45, %51; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %38, %56; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %44, %50; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %39, %57; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %33, r556; +} +{ +add.f16x2 r562, %45, %51; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %38, %56; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %44, %50; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %39, %57; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %33, r592; +} +{ +add.f16x2 r598, %45, %51; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %38, %56; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %44, %50; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs21, fd23; +} +mov.b32 r834, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd18; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r852, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd29; +} +mov.b32 r906, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd16; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r924, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd23; +} +mov.b32 r915, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd18; +} +mov.b32 r930, {rs30, rs30}; +{ +add.f16x2 r625, %40, %58; +} +{ +add.f16x2 r628, %34, r625; +} +{ +add.f16x2 r631, %46, %52; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %41, %59; +} +{ +add.f16x2 r640, %35, r637; +} +{ +add.f16x2 r643, %47, %53; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %40, %58; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %34, r652; 
+} +{ +add.f16x2 r658, %46, %52; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %41, %59; +} +{ +mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %47, %53; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %40, %58; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %34, r688; +} +{ +add.f16x2 r694, %46, %52; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %41, %59; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %47, %53; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 r721, %40, %58; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %34, r724; +} +{ +add.f16x2 r730, %46, %52; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %41, %59; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %47, %53; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %40, %58; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %34, r760; +} +{ +add.f16x2 r766, %46, %52; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %41, %59; +} +{ +mul.f16x2 r778, r775, r924; +} +{ +sub.f16x2 r781, %47, %53; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %41, %59; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %35, r796; +} +{ +add.f16x2 r802, %47, %53; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %40, %58; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %46, %52; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %41, %59; +} +{ 
+mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %35, r832; +} +{ +add.f16x2 r838, %47, %53; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 r844, r835, r841; +} +{ +sub.f16x2 r847, %40, %58; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %46, %52; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %41, %59; +} +{ +mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %35, r868; +} +{ +add.f16x2 r874, %47, %53; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %40, %58; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %46, %52; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %41, %59; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %35, r904; +} +{ +add.f16x2 r910, %47, %53; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %40, %58; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %46, %52; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +mov.f64 fd19, 0d3FED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs31, fd19; +} +mov.f64 fd20, 0d3FDA07F921061AD1; +{ +cvt.rn.f16.f64 rs32, fd20; +} +mov.f64 fd21, 0d3FE5698496E20BD8; +{ +cvt.rn.f16.f64 rs33, fd21; +} +mov.f64 fd22, 0d3FE7C7D7A833BEC2; +{ +cvt.rn.f16.f64 rs34, fd22; +} +{ +cvt.rn.f16.f64 rs35, fd23; +} +mov.f64 fd24, 0d3FEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs36, fd24; +} +mov.f64 fd25, 0dBFBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs37, fd25; +} +mov.f64 fd26, 0d3FEFD31F94F867C6; +{ +cvt.rn.f16.f64 rs38, fd26; +} +mov.f64 fd55, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs41, fd29; +} +mov.f64 fd30, 0d3FE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs42, fd30; +} +mov.f64 fd33, 0dBFEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs45, fd33; +} +mov.f64 fd34, 0dBFCA9CD9AC4258F6; +{ +cvt.rn.f16.f64 rs46, fd34; +} +mov.f64 
fd56, 0dBFEBB67AE8584CAA; +mov.b32 r951, {rs31, rs31}; +{ +mul.f16x2 r937, r370, r951; +} +mov.b32 r948, {rs32, rs32}; +{ +mul.f16x2 r940, r514, r948; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r370, r948; +} +{ +fma.rn.f16x2 r949, r514, r951, r946; +} +mov.b32 r983, {rs33, rs33}; +{ +mul.f16x2 r953, r682, r983; +} +mov.b32 r980, {rs34, rs34}; +{ +mul.f16x2 r956, r826, r980; +} +{ +sub.f16x2 r959, r953, r956; +} +{ +mul.f16x2 r962, r682, r980; +} +{ +fma.rn.f16x2 r965, r826, r983, r962; +} +{ +mul.f16x2 r969, r442, r983; +} +{ +mul.f16x2 r972, r586, r980; +} +{ +sub.f16x2 r975, r969, r972; +} +{ +mul.f16x2 r978, r442, r980; +} +{ +fma.rn.f16x2 r981, r586, r983, r978; +} +mov.b32 r1047, {rs37, rs37}; +{ +mul.f16x2 r985, r754, r1047; +} +mov.b32 r1044, {rs38, rs38}; +{ +mul.f16x2 r988, r898, r1044; +} +{ +sub.f16x2 r991, r985, r988; +} +{ +mul.f16x2 r994, r754, r1044; +} +{ +fma.rn.f16x2 r997, r898, r1047, r994; +} +mov.b32 r1015, {rs35, rs35}; +{ +mul.f16x2 r1001, r478, r1015; +} +mov.b32 r1012, {rs36, rs36}; +{ +mul.f16x2 r1004, r622, r1012; +} +{ +sub.f16x2 r1007, r1001, r1004; +} +{ +mul.f16x2 r1010, r478, r1012; +} +{ +fma.rn.f16x2 r1013, r622, r1015, r1010; +} +mov.b32 r1031, {rs41, rs41}; +{ +mul.f16x2 r1017, r790, r1031; +} +mov.b32 r1028, {rs42, rs42}; +{ +mul.f16x2 r1020, r934, r1028; +} +{ +sub.f16x2 r1023, r1017, r1020; +} +{ +mul.f16x2 r1026, r790, r1028; +} +{ +fma.rn.f16x2 r1029, r934, r1031, r1026; +} +{ +mul.f16x2 r1033, r406, r1047; +} +{ +mul.f16x2 r1036, r550, r1044; +} +{ +sub.f16x2 r1039, r1033, r1036; +} +{ +mul.f16x2 r1042, r406, r1044; +} +{ +fma.rn.f16x2 r1045, r550, r1047, r1042; +} +mov.b32 r1063, {rs45, rs45}; +{ +mul.f16x2 r1049, r718, r1063; +} +mov.b32 r1060, {rs46, rs46}; +{ +mul.f16x2 r1052, r862, r1060; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r718, r1060; +} +{ +fma.rn.f16x2 r1061, r862, r1063, r1058; +} +{ +cvt.rn.f16.f64 rs59, fd55; +} +mov.b32 r1136, {rs59, rs59}; +{ +cvt.rn.f16.f64 rs60, 
fd56; +} +mov.b32 r1145, {rs60, rs60}; +{ +add.f16x2 r1065, r322, r634; +} +{ +add.f16x2 %0, r10, r1065; +} +{ +add.f16x2 r1071, r334, r646; +} +{ +add.f16x2 %1, r22, r1071; +} +{ +add.f16x2 r1077, r322, r634; +} +{ +mul.f16x2 r1080, r1077, r1136; +} +{ +add.f16x2 r1083, r10, r1080; +} +{ +sub.f16x2 r1086, r334, r646; +} +{ +mul.f16x2 r1089, r1086, r1145; +} +{ +add.f16x2 %10, r1083, r1089; +} +{ +add.f16x2 r1095, r322, r634; +} +{ +mul.f16x2 r1098, r1095, r1136; +} +{ +add.f16x2 r1101, r10, r1098; +} +{ +sub.f16x2 r1104, r334, r646; +} +{ +mul.f16x2 r1107, r1104, r1145; +} +{ +sub.f16x2 %20, r1101, r1107; +} +{ +add.f16x2 r1113, r334, r646; +} +{ +mul.f16x2 r1116, r1113, r1136; +} +{ +add.f16x2 r1119, r22, r1116; +} +{ +sub.f16x2 r1122, r322, r634; +} +{ +mul.f16x2 r1125, r1122, r1145; +} +{ +sub.f16x2 %11, r1119, r1125; +} +{ +add.f16x2 r1131, r334, r646; +} +{ +mul.f16x2 r1134, r1131, r1136; +} +{ +add.f16x2 r1137, r22, r1134; +} +{ +sub.f16x2 r1140, r322, r634; +} +{ +mul.f16x2 r1143, r1140, r1145; +} +{ +add.f16x2 %21, r1137, r1143; +} +{ +cvt.rn.f16.f64 rs61, fd55; +} +mov.b32 r1220, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd56; +} +mov.b32 r1229, {rs62, rs62}; +{ +add.f16x2 r1149, r943, r959; +} +{ +add.f16x2 %2, r58, r1149; +} +{ +add.f16x2 r1155, r949, r965; +} +{ +add.f16x2 %3, r202, r1155; +} +{ +add.f16x2 r1161, r943, r959; +} +{ +mul.f16x2 r1164, r1161, r1220; +} +{ +add.f16x2 r1167, r58, r1164; +} +{ +sub.f16x2 r1170, r949, r965; +} +{ +mul.f16x2 r1173, r1170, r1229; +} +{ +add.f16x2 %12, r1167, r1173; +} +{ +add.f16x2 r1179, r943, r959; +} +{ +mul.f16x2 r1182, r1179, r1220; +} +{ +add.f16x2 r1185, r58, r1182; +} +{ +sub.f16x2 r1188, r949, r965; +} +{ +mul.f16x2 r1191, r1188, r1229; +} +{ +sub.f16x2 %22, r1185, r1191; +} +{ +add.f16x2 r1197, r949, r965; +} +{ +mul.f16x2 r1200, r1197, r1220; +} +{ +add.f16x2 r1203, r202, r1200; +} +{ +sub.f16x2 r1206, r943, r959; +} +{ +mul.f16x2 r1209, r1206, r1229; +} +{ +sub.f16x2 %13, r1203, r1209; +} +{ +add.f16x2 
r1215, r949, r965; +} +{ +mul.f16x2 r1218, r1215, r1220; +} +{ +add.f16x2 r1221, r202, r1218; +} +{ +sub.f16x2 r1224, r943, r959; +} +{ +mul.f16x2 r1227, r1224, r1229; +} +{ +add.f16x2 %23, r1221, r1227; +} +{ +cvt.rn.f16.f64 rs63, fd55; +} +mov.b32 r1304, {rs63, rs63}; +{ +cvt.rn.f16.f64 rs64, fd56; +} +mov.b32 r1313, {rs64, rs64}; +{ +add.f16x2 r1233, r975, r991; +} +{ +add.f16x2 %4, r130, r1233; +} +{ +add.f16x2 r1239, r981, r997; +} +{ +add.f16x2 %5, r274, r1239; +} +{ +add.f16x2 r1245, r975, r991; +} +{ +mul.f16x2 r1248, r1245, r1304; +} +{ +add.f16x2 r1251, r130, r1248; +} +{ +sub.f16x2 r1254, r981, r997; +} +{ +mul.f16x2 r1257, r1254, r1313; +} +{ +add.f16x2 %14, r1251, r1257; +} +{ +add.f16x2 r1263, r975, r991; +} +{ +mul.f16x2 r1266, r1263, r1304; +} +{ +add.f16x2 r1269, r130, r1266; +} +{ +sub.f16x2 r1272, r981, r997; +} +{ +mul.f16x2 r1275, r1272, r1313; +} +{ +sub.f16x2 %24, r1269, r1275; +} +{ +add.f16x2 r1281, r981, r997; +} +{ +mul.f16x2 r1284, r1281, r1304; +} +{ +add.f16x2 r1287, r274, r1284; +} +{ +sub.f16x2 r1290, r975, r991; +} +{ +mul.f16x2 r1293, r1290, r1313; +} +{ +sub.f16x2 %15, r1287, r1293; +} +{ +add.f16x2 r1299, r981, r997; +} +{ +mul.f16x2 r1302, r1299, r1304; +} +{ +add.f16x2 r1305, r274, r1302; +} +{ +sub.f16x2 r1308, r975, r991; +} +{ +mul.f16x2 r1311, r1308, r1313; +} +{ +add.f16x2 %25, r1305, r1311; +} +{ +cvt.rn.f16.f64 rs65, fd55; +} +mov.b32 r1388, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs66, fd56; +} +mov.b32 r1397, {rs66, rs66}; +{ +add.f16x2 r1317, r1007, r1023; +} +{ +add.f16x2 %6, r166, r1317; +} +{ +add.f16x2 r1323, r1013, r1029; +} +{ +add.f16x2 %7, r310, r1323; +} +{ +add.f16x2 r1329, r1007, r1023; +} +{ +mul.f16x2 r1332, r1329, r1388; +} +{ +add.f16x2 r1335, r166, r1332; +} +{ +sub.f16x2 r1338, r1013, r1029; +} +{ +mul.f16x2 r1341, r1338, r1397; +} +{ +add.f16x2 %16, r1335, r1341; +} +{ +add.f16x2 r1347, r1007, r1023; +} +{ +mul.f16x2 r1350, r1347, r1388; +} +{ +add.f16x2 r1353, r166, r1350; +} +{ +sub.f16x2 r1356, r1013, 
r1029; +} +{ +mul.f16x2 r1359, r1356, r1397; +} +{ +sub.f16x2 %26, r1353, r1359; +} +{ +add.f16x2 r1365, r1013, r1029; +} +{ +mul.f16x2 r1368, r1365, r1388; +} +{ +add.f16x2 r1371, r310, r1368; +} +{ +sub.f16x2 r1374, r1007, r1023; +} +{ +mul.f16x2 r1377, r1374, r1397; +} +{ +sub.f16x2 %17, r1371, r1377; +} +{ +add.f16x2 r1383, r1013, r1029; +} +{ +mul.f16x2 r1386, r1383, r1388; +} +{ +add.f16x2 r1389, r310, r1386; +} +{ +sub.f16x2 r1392, r1007, r1023; +} +{ +mul.f16x2 r1395, r1392, r1397; +} +{ +add.f16x2 %27, r1389, r1395; +} +{ +cvt.rn.f16.f64 rs67, fd55; +} +mov.b32 r1472, {rs67, rs67}; +{ +cvt.rn.f16.f64 rs68, fd56; +} +mov.b32 r1481, {rs68, rs68}; +{ +add.f16x2 r1401, r1039, r1055; +} +{ +add.f16x2 %8, r94, r1401; +} +{ +add.f16x2 r1407, r1045, r1061; +} +{ +add.f16x2 %9, r238, r1407; +} +{ +add.f16x2 r1413, r1039, r1055; +} +{ +mul.f16x2 r1416, r1413, r1472; +} +{ +add.f16x2 r1419, r94, r1416; +} +{ +sub.f16x2 r1422, r1045, r1061; +} +{ +mul.f16x2 r1425, r1422, r1481; +} +{ +add.f16x2 %18, r1419, r1425; +} +{ +add.f16x2 r1431, r1039, r1055; +} +{ +mul.f16x2 r1434, r1431, r1472; +} +{ +add.f16x2 r1437, r94, r1434; +} +{ +sub.f16x2 r1440, r1045, r1061; +} +{ +mul.f16x2 r1443, r1440, r1481; +} +{ +sub.f16x2 %28, r1437, r1443; +} +{ +add.f16x2 r1449, r1045, r1061; +} +{ +mul.f16x2 r1452, r1449, r1472; +} +{ +add.f16x2 r1455, r238, r1452; +} +{ +sub.f16x2 r1458, r1039, r1055; +} +{ +mul.f16x2 r1461, r1458, r1481; +} +{ +sub.f16x2 %19, r1455, r1461; +} +{ +add.f16x2 r1467, r1045, r1061; +} +{ +mul.f16x2 r1470, r1467, r1472; +} +{ +add.f16x2 r1473, r238, r1470; +} +{ +sub.f16x2 r1476, r1039, r1055; +} +{ +mul.f16x2 r1479, r1476, r1481; +} +{ +add.f16x2 %29, r1473, r1479; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), 
"=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..92123a506d1de --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_fwd.hpp.inc @@ -0,0 +1,268 @@ +#ifndef CUFFTDX_FFT_15_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_15_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<2, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<313>; +.reg .b64 rd<2>; +add.f32 f61, %38, %62; +add.f32 f62, %30, f61; +add.f32 f63, %46, %54; +add.f32 f64, f63, f62; +add.f32 f65, %39, %63; +add.f32 f66, %31, f65; +add.f32 f67, %47, %55; +add.f32 f68, f67, f66; +fma.rn.f32 f69, f61, 0f3E9E377A, %30; +mul.f32 f70, f63, 0f3F4F1BBD; +sub.f32 f71, f69, f70; +sub.f32 f72, %39, %63; +mul.f32 f73, f72, 0f3F737871; +sub.f32 f74, %47, %55; +mul.f32 f75, f74, 0fBF167918; +sub.f32 f76, f75, f73; +sub.f32 f77, f71, f76; +add.f32 f78, f76, f71; +mul.f32 f79, f61, 0f3F4F1BBD; +sub.f32 f80, %30, f79; +fma.rn.f32 f81, f63, 0f3E9E377A, f80; +mul.f32 f82, f72, 0f3F167918; +mul.f32 f83, f74, 0f3F737871; +sub.f32 f84, f83, f82; +sub.f32 f85, f81, f84; +add.f32 f86, f84, f81; +fma.rn.f32 f87, f65, 0f3E9E377A, %31; +mul.f32 f88, f67, 0f3F4F1BBD; +sub.f32 f89, f87, f88; +sub.f32 f90, %38, %62; +mul.f32 f91, f90, 0f3F737871; +sub.f32 f92, %46, %54; +mul.f32 f93, f92, 0fBF167918; +sub.f32 f94, f93, f91; +add.f32 f95, f94, f89; +sub.f32 f96, f89, f94; +mul.f32 f97, f65, 0f3F4F1BBD; +sub.f32 f98, %31, f97; +fma.rn.f32 f99, f67, 0f3E9E377A, f98; +mul.f32 f100, f90, 0f3F167918; +mul.f32 f101, f92, 0f3F737871; +sub.f32 f102, f101, f100; +add.f32 f103, f102, f99; +sub.f32 f104, f99, f102; +add.f32 f105, %40, %64; +add.f32 f106, %32, f105; +add.f32 f107, %48, %56; +add.f32 f108, f107, f106; +add.f32 f109, %42, %66; +add.f32 f110, %34, f109; +add.f32 f111, %50, %58; +add.f32 f112, f111, f110; +fma.rn.f32 f113, f105, 0f3E9E377A, %32; +mul.f32 f114, f107, 0f3F4F1BBD; +sub.f32 f115, f113, f114; +sub.f32 f116, %42, %66; +mul.f32 f117, f116, 0f3F737871; 
+sub.f32 f118, %50, %58; +mul.f32 f119, f118, 0fBF167918; +sub.f32 f120, f119, f117; +sub.f32 f121, f115, f120; +add.f32 f122, f120, f115; +mul.f32 f123, f105, 0f3F4F1BBD; +sub.f32 f124, %32, f123; +fma.rn.f32 f125, f107, 0f3E9E377A, f124; +mul.f32 f126, f116, 0f3F167918; +mul.f32 f127, f118, 0f3F737871; +sub.f32 f128, f127, f126; +sub.f32 f129, f125, f128; +add.f32 f130, f128, f125; +fma.rn.f32 f131, f109, 0f3E9E377A, %34; +mul.f32 f132, f111, 0f3F4F1BBD; +sub.f32 f133, f131, f132; +sub.f32 f134, %40, %64; +mul.f32 f135, f134, 0f3F737871; +sub.f32 f136, %48, %56; +mul.f32 f137, f136, 0fBF167918; +sub.f32 f138, f137, f135; +add.f32 f139, f138, f133; +sub.f32 f140, f133, f138; +mul.f32 f141, f109, 0f3F4F1BBD; +sub.f32 f142, %34, f141; +fma.rn.f32 f143, f111, 0f3E9E377A, f142; +mul.f32 f144, f134, 0f3F167918; +mul.f32 f145, f136, 0f3F737871; +sub.f32 f146, f145, f144; +add.f32 f147, f146, f143; +sub.f32 f148, f143, f146; +add.f32 f149, %43, %67; +add.f32 f150, %35, f149; +add.f32 f151, %51, %59; +add.f32 f152, f151, f150; +add.f32 f153, %45, %68; +add.f32 f154, %37, f153; +add.f32 f155, %53, %61; +add.f32 f156, f155, f154; +fma.rn.f32 f157, f149, 0f3E9E377A, %35; +mul.f32 f158, f151, 0f3F4F1BBD; +sub.f32 f159, f157, f158; +sub.f32 f160, %45, %68; +mul.f32 f161, f160, 0f3F737871; +sub.f32 f162, %53, %61; +mul.f32 f163, f162, 0fBF167918; +sub.f32 f164, f163, f161; +sub.f32 f165, f159, f164; +add.f32 f166, f164, f159; +mul.f32 f167, f149, 0f3F4F1BBD; +sub.f32 f168, %35, f167; +fma.rn.f32 f169, f151, 0f3E9E377A, f168; +mul.f32 f170, f160, 0f3F167918; +mul.f32 f171, f162, 0f3F737871; +sub.f32 f172, f171, f170; +sub.f32 f173, f169, f172; +add.f32 f174, f172, f169; +fma.rn.f32 f175, f153, 0f3E9E377A, %37; +mul.f32 f176, f155, 0f3F4F1BBD; +sub.f32 f177, f175, f176; +sub.f32 f178, %43, %67; +mul.f32 f179, f178, 0f3F737871; +sub.f32 f180, %51, %59; +mul.f32 f181, f180, 0fBF167918; +sub.f32 f182, f181, f179; +add.f32 f183, f182, f177; +sub.f32 f184, f177, f182; +mul.f32 f185, 
f153, 0f3F4F1BBD; +sub.f32 f186, %37, f185; +fma.rn.f32 f187, f155, 0f3E9E377A, f186; +mul.f32 f188, f178, 0f3F167918; +mul.f32 f189, f180, 0f3F737871; +sub.f32 f190, f189, f188; +add.f32 f191, f190, f187; +sub.f32 f192, f187, f190; +mul.f32 f193, f121, 0f3F69DE1D; +mul.f32 f194, f139, 0fBED03FC9; +sub.f32 f195, f193, f194; +mul.f32 f196, f139, 0f3F69DE1D; +fma.rn.f32 f197, f121, 0fBED03FC9, f196; +mul.f32 f198, f165, 0f3F2B4C25; +mul.f32 f199, f183, 0fBF3E3EBD; +sub.f32 f200, f198, f199; +mul.f32 f201, f183, 0f3F2B4C25; +fma.rn.f32 f202, f165, 0fBF3E3EBD, f201; +mul.f32 f203, f129, 0f3F2B4C25; +mul.f32 f204, f147, 0fBF3E3EBD; +sub.f32 f205, f203, f204; +mul.f32 f206, f147, 0f3F2B4C25; +fma.rn.f32 f207, f129, 0fBF3E3EBD, f206; +mul.f32 f208, f173, 0fBDD61305; +mul.f32 f209, f191, 0fBF7E98FD; +sub.f32 f210, f208, f209; +mul.f32 f211, f191, 0fBDD61305; +fma.rn.f32 f212, f173, 0fBF7E98FD, f211; +mul.f32 f213, f130, 0f3E9E377A; +mul.f32 f214, f148, 0fBF737871; +sub.f32 f215, f213, f214; +mul.f32 f216, f148, 0f3E9E377A; +fma.rn.f32 f217, f130, 0fBF737871, f216; +mul.f32 f218, f174, 0fBF4F1BBD; +mul.f32 f219, f192, 0fBF167918; +sub.f32 f220, f218, f219; +mul.f32 f221, f192, 0fBF4F1BBD; +fma.rn.f32 f222, f174, 0fBF167918, f221; +mul.f32 f223, f122, 0fBDD61305; +mul.f32 f224, f140, 0fBF7E98FD; +sub.f32 f225, f223, f224; +mul.f32 f226, f140, 0fBDD61305; +fma.rn.f32 f227, f122, 0fBF7E98FD, f226; +mul.f32 f228, f166, 0fBF7A67E2; +mul.f32 f229, f184, 0f3E54E6CD; +sub.f32 f230, f228, f229; +mul.f32 f231, f184, 0fBF7A67E2; +fma.rn.f32 f232, f166, 0f3E54E6CD, f231; +add.f32 f233, f108, f152; +add.f32 f234, f112, f156; +mul.f32 f235, f233, 0f3F000000; +sub.f32 f236, f64, f235; +sub.f32 f237, f112, f156; +mul.f32 f238, f237, 0f3F5DB3D7; +mul.f32 f239, f234, 0f3F000000; +sub.f32 f240, f68, f239; +sub.f32 f241, f108, f152; +mul.f32 f242, f241, 0f3F5DB3D7; +add.f32 f243, f195, f200; +add.f32 f244, f197, f202; +mul.f32 f245, f243, 0f3F000000; +sub.f32 f246, f77, f245; +sub.f32 f247, 
f197, f202; +mul.f32 f248, f247, 0f3F5DB3D7; +mul.f32 f249, f244, 0f3F000000; +sub.f32 f250, f95, f249; +sub.f32 f251, f195, f200; +mul.f32 f252, f251, 0f3F5DB3D7; +add.f32 f253, f205, f210; +add.f32 f254, f207, f212; +mul.f32 f255, f253, 0f3F000000; +sub.f32 f256, f85, f255; +sub.f32 f257, f207, f212; +mul.f32 f258, f257, 0f3F5DB3D7; +mul.f32 f259, f254, 0f3F000000; +sub.f32 f260, f103, f259; +sub.f32 f261, f205, f210; +mul.f32 f262, f261, 0f3F5DB3D7; +add.f32 f263, f215, f220; +add.f32 f264, f217, f222; +mul.f32 f265, f263, 0f3F000000; +sub.f32 f266, f86, f265; +sub.f32 f267, f217, f222; +mul.f32 f268, f267, 0f3F5DB3D7; +mul.f32 f269, f264, 0f3F000000; +sub.f32 f270, f104, f269; +sub.f32 f271, f215, f220; +mul.f32 f272, f271, 0f3F5DB3D7; +add.f32 f273, f225, f230; +add.f32 f274, f227, f232; +mul.f32 f275, f273, 0f3F000000; +sub.f32 f276, f78, f275; +sub.f32 f277, f227, f232; +mul.f32 f278, f277, 0f3F5DB3D7; +mul.f32 f279, f274, 0f3F000000; +sub.f32 f280, f96, f279; +sub.f32 f281, f225, f230; +mul.f32 f282, f281, 0f3F5DB3D7; +add.f32 %1, f68, f234; +add.f32 %0, f64, f233; +add.f32 %3, f95, f244; +add.f32 %2, f77, f243; +add.f32 %5, f103, f254; +add.f32 %4, f85, f253; +add.f32 %7, f104, f264; +add.f32 %6, f86, f263; +add.f32 %9, f96, f274; +add.f32 %8, f78, f273; +sub.f32 %11, f240, f242; +add.f32 %10, f238, f236; +sub.f32 %13, f250, f252; +add.f32 %12, f248, f246; +sub.f32 %15, f260, f262; +add.f32 %14, f258, f256; +sub.f32 %17, f270, f272; +add.f32 %16, f268, f266; +sub.f32 %19, f280, f282; +add.f32 %18, f278, f276; +add.f32 %21, f242, f240; +sub.f32 %20, f236, f238; +add.f32 %23, f252, f250; +sub.f32 %22, f246, f248; +add.f32 %25, f262, f260; +sub.f32 %24, f256, f258; +add.f32 %27, f272, f270; +sub.f32 %26, f266, f268; +add.f32 %29, f282, f280; +sub.f32 %28, f276, f278; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..f94eaaf5b0c47 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp32_inv.hpp.inc @@ -0,0 +1,262 @@ +#ifndef CUFFTDX_FFT_15_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_15_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<204, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b64 rd<2>; +add.f32 f61, %38, %62; +add.f32 f62, %30, f61; +add.f32 f63, %46, %54; +add.f32 f64, f63, f62; +add.f32 f65, %39, %63; +add.f32 f66, %31, f65; +add.f32 f67, %47, %55; +add.f32 f68, f67, f66; +fma.rn.f32 f69, f61, 0f3E9E377A, %30; +mul.f32 f70, f63, 
0f3F4F1BBD; +sub.f32 f71, f69, f70; +sub.f32 f72, %39, %63; +mul.f32 f73, f72, 0f3F737871; +sub.f32 f74, %47, %55; +fma.rn.f32 f75, f74, 0f3F167918, f73; +sub.f32 f76, f71, f75; +add.f32 f77, f75, f71; +mul.f32 f78, f61, 0f3F4F1BBD; +sub.f32 f79, %30, f78; +fma.rn.f32 f80, f63, 0f3E9E377A, f79; +mul.f32 f81, f72, 0f3F167918; +mul.f32 f82, f74, 0f3F737871; +sub.f32 f83, f81, f82; +sub.f32 f84, f80, f83; +add.f32 f85, f83, f80; +fma.rn.f32 f86, f65, 0f3E9E377A, %31; +mul.f32 f87, f67, 0f3F4F1BBD; +sub.f32 f88, f86, f87; +sub.f32 f89, %38, %62; +mul.f32 f90, f89, 0f3F737871; +sub.f32 f91, %46, %54; +fma.rn.f32 f92, f91, 0f3F167918, f90; +add.f32 f93, f92, f88; +sub.f32 f94, f88, f92; +mul.f32 f95, f65, 0f3F4F1BBD; +sub.f32 f96, %31, f95; +fma.rn.f32 f97, f67, 0f3E9E377A, f96; +mul.f32 f98, f89, 0f3F167918; +mul.f32 f99, f91, 0f3F737871; +sub.f32 f100, f98, f99; +add.f32 f101, f100, f97; +sub.f32 f102, f97, f100; +add.f32 f103, %40, %64; +add.f32 f104, %32, f103; +add.f32 f105, %48, %56; +add.f32 f106, f105, f104; +add.f32 f107, %42, %66; +add.f32 f108, %34, f107; +add.f32 f109, %50, %58; +add.f32 f110, f109, f108; +fma.rn.f32 f111, f103, 0f3E9E377A, %32; +mul.f32 f112, f105, 0f3F4F1BBD; +sub.f32 f113, f111, f112; +sub.f32 f114, %42, %66; +mul.f32 f115, f114, 0f3F737871; +sub.f32 f116, %50, %58; +fma.rn.f32 f117, f116, 0f3F167918, f115; +sub.f32 f118, f113, f117; +add.f32 f119, f117, f113; +mul.f32 f120, f103, 0f3F4F1BBD; +sub.f32 f121, %32, f120; +fma.rn.f32 f122, f105, 0f3E9E377A, f121; +mul.f32 f123, f114, 0f3F167918; +mul.f32 f124, f116, 0f3F737871; +sub.f32 f125, f123, f124; +sub.f32 f126, f122, f125; +add.f32 f127, f125, f122; +fma.rn.f32 f128, f107, 0f3E9E377A, %34; +mul.f32 f129, f109, 0f3F4F1BBD; +sub.f32 f130, f128, f129; +sub.f32 f131, %40, %64; +mul.f32 f132, f131, 0f3F737871; +sub.f32 f133, %48, %56; +fma.rn.f32 f134, f133, 0f3F167918, f132; +add.f32 f135, f134, f130; +sub.f32 f136, f130, f134; +mul.f32 f137, f107, 0f3F4F1BBD; +sub.f32 f138, %34, f137; 
+fma.rn.f32 f139, f109, 0f3E9E377A, f138; +mul.f32 f140, f131, 0f3F167918; +mul.f32 f141, f133, 0f3F737871; +sub.f32 f142, f140, f141; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %43, %67; +add.f32 f146, %35, f145; +add.f32 f147, %51, %59; +add.f32 f148, f147, f146; +add.f32 f149, %45, %68; +add.f32 f150, %37, f149; +add.f32 f151, %53, %61; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %35; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %45, %68; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %53, %61; +fma.rn.f32 f159, f158, 0f3F167918, f157; +sub.f32 f160, f155, f159; +add.f32 f161, f159, f155; +mul.f32 f162, f145, 0f3F4F1BBD; +sub.f32 f163, %35, f162; +fma.rn.f32 f164, f147, 0f3E9E377A, f163; +mul.f32 f165, f156, 0f3F167918; +mul.f32 f166, f158, 0f3F737871; +sub.f32 f167, f165, f166; +sub.f32 f168, f164, f167; +add.f32 f169, f167, f164; +fma.rn.f32 f170, f149, 0f3E9E377A, %37; +mul.f32 f171, f151, 0f3F4F1BBD; +sub.f32 f172, f170, f171; +sub.f32 f173, %43, %67; +mul.f32 f174, f173, 0f3F737871; +sub.f32 f175, %51, %59; +fma.rn.f32 f176, f175, 0f3F167918, f174; +add.f32 f177, f176, f172; +sub.f32 f178, f172, f176; +mul.f32 f179, f149, 0f3F4F1BBD; +sub.f32 f180, %37, f179; +fma.rn.f32 f181, f151, 0f3E9E377A, f180; +mul.f32 f182, f173, 0f3F167918; +mul.f32 f183, f175, 0f3F737871; +sub.f32 f184, f182, f183; +add.f32 f185, f184, f181; +sub.f32 f186, f181, f184; +mul.f32 f187, f118, 0f3F69DE1D; +mul.f32 f188, f135, 0f3ED03FC9; +sub.f32 f189, f187, f188; +mul.f32 f190, f135, 0f3F69DE1D; +fma.rn.f32 f191, f118, 0f3ED03FC9, f190; +mul.f32 f192, f160, 0f3F2B4C25; +mul.f32 f193, f177, 0f3F3E3EBD; +sub.f32 f194, f192, f193; +mul.f32 f195, f177, 0f3F2B4C25; +fma.rn.f32 f196, f160, 0f3F3E3EBD, f195; +mul.f32 f197, f126, 0f3F2B4C25; +mul.f32 f198, f143, 0f3F3E3EBD; +sub.f32 f199, f197, f198; +mul.f32 f200, f143, 0f3F2B4C25; +fma.rn.f32 f201, f126, 0f3F3E3EBD, f200; +mul.f32 f202, f168, 0fBDD61305; +mul.f32 
f203, f185, 0f3F7E98FD; +sub.f32 f204, f202, f203; +mul.f32 f205, f185, 0fBDD61305; +fma.rn.f32 f206, f168, 0f3F7E98FD, f205; +mul.f32 f207, f127, 0f3E9E377A; +mul.f32 f208, f144, 0f3F737871; +sub.f32 f209, f207, f208; +mul.f32 f210, f144, 0f3E9E377A; +fma.rn.f32 f211, f127, 0f3F737871, f210; +mul.f32 f212, f169, 0fBF4F1BBD; +mul.f32 f213, f186, 0f3F167918; +sub.f32 f214, f212, f213; +mul.f32 f215, f186, 0fBF4F1BBD; +fma.rn.f32 f216, f169, 0f3F167918, f215; +mul.f32 f217, f119, 0fBDD61305; +mul.f32 f218, f136, 0f3F7E98FD; +sub.f32 f219, f217, f218; +mul.f32 f220, f136, 0fBDD61305; +fma.rn.f32 f221, f119, 0f3F7E98FD, f220; +mul.f32 f222, f161, 0fBF7A67E2; +mul.f32 f223, f178, 0fBE54E6CD; +sub.f32 f224, f222, f223; +mul.f32 f225, f178, 0fBF7A67E2; +fma.rn.f32 f226, f161, 0fBE54E6CD, f225; +add.f32 f227, f106, f148; +add.f32 f228, f110, f152; +mul.f32 f229, f227, 0f3F000000; +sub.f32 f230, f64, f229; +sub.f32 f231, f110, f152; +mul.f32 f232, f231, 0fBF5DB3D7; +mul.f32 f233, f228, 0f3F000000; +sub.f32 f234, f68, f233; +sub.f32 f235, f106, f148; +mul.f32 f236, f235, 0fBF5DB3D7; +add.f32 f237, f189, f194; +add.f32 f238, f191, f196; +mul.f32 f239, f237, 0f3F000000; +sub.f32 f240, f76, f239; +sub.f32 f241, f191, f196; +mul.f32 f242, f241, 0fBF5DB3D7; +mul.f32 f243, f238, 0f3F000000; +sub.f32 f244, f93, f243; +sub.f32 f245, f189, f194; +mul.f32 f246, f245, 0fBF5DB3D7; +add.f32 f247, f199, f204; +add.f32 f248, f201, f206; +mul.f32 f249, f247, 0f3F000000; +sub.f32 f250, f84, f249; +sub.f32 f251, f201, f206; +mul.f32 f252, f251, 0fBF5DB3D7; +mul.f32 f253, f248, 0f3F000000; +sub.f32 f254, f101, f253; +sub.f32 f255, f199, f204; +mul.f32 f256, f255, 0fBF5DB3D7; +add.f32 f257, f209, f214; +add.f32 f258, f211, f216; +mul.f32 f259, f257, 0f3F000000; +sub.f32 f260, f85, f259; +sub.f32 f261, f211, f216; +mul.f32 f262, f261, 0fBF5DB3D7; +mul.f32 f263, f258, 0f3F000000; +sub.f32 f264, f102, f263; +sub.f32 f265, f209, f214; +mul.f32 f266, f265, 0fBF5DB3D7; +add.f32 f267, f219, f224; 
+add.f32 f268, f221, f226; +mul.f32 f269, f267, 0f3F000000; +sub.f32 f270, f77, f269; +sub.f32 f271, f221, f226; +mul.f32 f272, f271, 0fBF5DB3D7; +mul.f32 f273, f268, 0f3F000000; +sub.f32 f274, f94, f273; +sub.f32 f275, f219, f224; +mul.f32 f276, f275, 0fBF5DB3D7; +add.f32 %1, f68, f228; +add.f32 %0, f64, f227; +add.f32 %3, f93, f238; +add.f32 %2, f76, f237; +add.f32 %5, f101, f248; +add.f32 %4, f84, f247; +add.f32 %7, f102, f258; +add.f32 %6, f85, f257; +add.f32 %9, f94, f268; +add.f32 %8, f77, f267; +sub.f32 %11, f234, f236; +add.f32 %10, f232, f230; +sub.f32 %13, f244, f246; +add.f32 %12, f242, f240; +sub.f32 %15, f254, f256; +add.f32 %14, f252, f250; +sub.f32 %17, f264, f266; +add.f32 %16, f262, f260; +sub.f32 %19, f274, f276; +add.f32 %18, f272, f270; +add.f32 %21, f236, f234; +sub.f32 %20, f230, f232; +add.f32 %23, f246, f244; +sub.f32 %22, f240, f242; +add.f32 %25, f256, f254; +sub.f32 %24, f250, f252; +add.f32 %27, f266, f264; +sub.f32 %26, f260, f262; +add.f32 %29, f276, f274; +sub.f32 %28, f270, f272; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), 
"f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..f81cf7f94f339 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_fwd.hpp.inc @@ -0,0 +1,268 @@ +#ifndef CUFFTDX_FFT_15_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_15_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<406, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<313>; +.reg .b64 rd<2>; +add.f64 fd61, %38, %62; +add.f64 fd62, %30, fd61; +add.f64 fd63, %46, %54; +add.f64 fd64, fd63, fd62; +add.f64 fd65, %39, %63; +add.f64 fd66, %31, fd65; +add.f64 fd67, %47, %55; +add.f64 fd68, fd67, fd66; +fma.rn.f64 fd69, fd61, 0d3FD3C6EF372FE950, %30; +mul.f64 fd70, fd63, 0d3FE9E3779B97F4A8; +sub.f64 fd71, fd69, fd70; +sub.f64 fd72, %39, %63; +mul.f64 fd73, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd74, %47, %55; +mul.f64 fd75, fd74, 0dBFE2CF2304755A5E; +sub.f64 fd76, fd75, fd73; +sub.f64 fd77, fd71, fd76; +add.f64 fd78, fd76, fd71; +mul.f64 fd79, fd61, 0d3FE9E3779B97F4A8; +sub.f64 fd80, %30, fd79; +fma.rn.f64 fd81, fd63, 0d3FD3C6EF372FE950, fd80; +mul.f64 fd82, fd72, 0d3FE2CF2304755A5E; +mul.f64 fd83, fd74, 0d3FEE6F0E134454FF; +sub.f64 fd84, fd83, fd82; +sub.f64 fd85, fd81, fd84; +add.f64 fd86, fd84, fd81; +fma.rn.f64 fd87, fd65, 0d3FD3C6EF372FE950, %31; +mul.f64 fd88, fd67, 0d3FE9E3779B97F4A8; +sub.f64 fd89, fd87, fd88; +sub.f64 fd90, %38, %62; +mul.f64 fd91, fd90, 0d3FEE6F0E134454FF; +sub.f64 fd92, 
%46, %54; +mul.f64 fd93, fd92, 0dBFE2CF2304755A5E; +sub.f64 fd94, fd93, fd91; +add.f64 fd95, fd94, fd89; +sub.f64 fd96, fd89, fd94; +mul.f64 fd97, fd65, 0d3FE9E3779B97F4A8; +sub.f64 fd98, %31, fd97; +fma.rn.f64 fd99, fd67, 0d3FD3C6EF372FE950, fd98; +mul.f64 fd100, fd90, 0d3FE2CF2304755A5E; +mul.f64 fd101, fd92, 0d3FEE6F0E134454FF; +sub.f64 fd102, fd101, fd100; +add.f64 fd103, fd102, fd99; +sub.f64 fd104, fd99, fd102; +add.f64 fd105, %40, %64; +add.f64 fd106, %32, fd105; +add.f64 fd107, %48, %56; +add.f64 fd108, fd107, fd106; +add.f64 fd109, %42, %66; +add.f64 fd110, %34, fd109; +add.f64 fd111, %50, %58; +add.f64 fd112, fd111, fd110; +fma.rn.f64 fd113, fd105, 0d3FD3C6EF372FE950, %32; +mul.f64 fd114, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd115, fd113, fd114; +sub.f64 fd116, %42, %66; +mul.f64 fd117, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd118, %50, %58; +mul.f64 fd119, fd118, 0dBFE2CF2304755A5E; +sub.f64 fd120, fd119, fd117; +sub.f64 fd121, fd115, fd120; +add.f64 fd122, fd120, fd115; +mul.f64 fd123, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd124, %32, fd123; +fma.rn.f64 fd125, fd107, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd126, fd116, 0d3FE2CF2304755A5E; +mul.f64 fd127, fd118, 0d3FEE6F0E134454FF; +sub.f64 fd128, fd127, fd126; +sub.f64 fd129, fd125, fd128; +add.f64 fd130, fd128, fd125; +fma.rn.f64 fd131, fd109, 0d3FD3C6EF372FE950, %34; +mul.f64 fd132, fd111, 0d3FE9E3779B97F4A8; +sub.f64 fd133, fd131, fd132; +sub.f64 fd134, %40, %64; +mul.f64 fd135, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd136, %48, %56; +mul.f64 fd137, fd136, 0dBFE2CF2304755A5E; +sub.f64 fd138, fd137, fd135; +add.f64 fd139, fd138, fd133; +sub.f64 fd140, fd133, fd138; +mul.f64 fd141, fd109, 0d3FE9E3779B97F4A8; +sub.f64 fd142, %34, fd141; +fma.rn.f64 fd143, fd111, 0d3FD3C6EF372FE950, fd142; +mul.f64 fd144, fd134, 0d3FE2CF2304755A5E; +mul.f64 fd145, fd136, 0d3FEE6F0E134454FF; +sub.f64 fd146, fd145, fd144; +add.f64 fd147, fd146, fd143; +sub.f64 fd148, fd143, fd146; +add.f64 fd149, %43, %67; +add.f64 fd150, %35, fd149; 
+add.f64 fd151, %51, %59; +add.f64 fd152, fd151, fd150; +add.f64 fd153, %45, %68; +add.f64 fd154, %37, fd153; +add.f64 fd155, %53, %61; +add.f64 fd156, fd155, fd154; +fma.rn.f64 fd157, fd149, 0d3FD3C6EF372FE950, %35; +mul.f64 fd158, fd151, 0d3FE9E3779B97F4A8; +sub.f64 fd159, fd157, fd158; +sub.f64 fd160, %45, %68; +mul.f64 fd161, fd160, 0d3FEE6F0E134454FF; +sub.f64 fd162, %53, %61; +mul.f64 fd163, fd162, 0dBFE2CF2304755A5E; +sub.f64 fd164, fd163, fd161; +sub.f64 fd165, fd159, fd164; +add.f64 fd166, fd164, fd159; +mul.f64 fd167, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd168, %35, fd167; +fma.rn.f64 fd169, fd151, 0d3FD3C6EF372FE950, fd168; +mul.f64 fd170, fd160, 0d3FE2CF2304755A5E; +mul.f64 fd171, fd162, 0d3FEE6F0E134454FF; +sub.f64 fd172, fd171, fd170; +sub.f64 fd173, fd169, fd172; +add.f64 fd174, fd172, fd169; +fma.rn.f64 fd175, fd153, 0d3FD3C6EF372FE950, %37; +mul.f64 fd176, fd155, 0d3FE9E3779B97F4A8; +sub.f64 fd177, fd175, fd176; +sub.f64 fd178, %43, %67; +mul.f64 fd179, fd178, 0d3FEE6F0E134454FF; +sub.f64 fd180, %51, %59; +mul.f64 fd181, fd180, 0dBFE2CF2304755A5E; +sub.f64 fd182, fd181, fd179; +add.f64 fd183, fd182, fd177; +sub.f64 fd184, fd177, fd182; +mul.f64 fd185, fd153, 0d3FE9E3779B97F4A8; +sub.f64 fd186, %37, fd185; +fma.rn.f64 fd187, fd155, 0d3FD3C6EF372FE950, fd186; +mul.f64 fd188, fd178, 0d3FE2CF2304755A5E; +mul.f64 fd189, fd180, 0d3FEE6F0E134454FF; +sub.f64 fd190, fd189, fd188; +add.f64 fd191, fd190, fd187; +sub.f64 fd192, fd187, fd190; +mul.f64 fd193, fd121, 0d3FED3BC3AEFF7F95; +mul.f64 fd194, fd139, 0dBFDA07F921061AD1; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd139, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd197, fd121, 0dBFDA07F921061AD1, fd196; +mul.f64 fd198, fd165, 0d3FE5698496E20BD8; +mul.f64 fd199, fd183, 0dBFE7C7D7A833BEC2; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd183, 0d3FE5698496E20BD8; +fma.rn.f64 fd202, fd165, 0dBFE7C7D7A833BEC2, fd201; +mul.f64 fd203, fd129, 0d3FE5698496E20BD8; +mul.f64 fd204, fd147, 0dBFE7C7D7A833BEC2; +sub.f64 fd205, 
fd203, fd204; +mul.f64 fd206, fd147, 0d3FE5698496E20BD8; +fma.rn.f64 fd207, fd129, 0dBFE7C7D7A833BEC2, fd206; +mul.f64 fd208, fd173, 0dBFBAC2609B3C576C; +mul.f64 fd209, fd191, 0dBFEFD31F94F867C6; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd191, 0dBFBAC2609B3C576C; +fma.rn.f64 fd212, fd173, 0dBFEFD31F94F867C6, fd211; +mul.f64 fd213, fd130, 0d3FD3C6EF372FE950; +mul.f64 fd214, fd148, 0dBFEE6F0E134454FF; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd148, 0d3FD3C6EF372FE950; +fma.rn.f64 fd217, fd130, 0dBFEE6F0E134454FF, fd216; +mul.f64 fd218, fd174, 0dBFE9E3779B97F4A8; +mul.f64 fd219, fd192, 0dBFE2CF2304755A5E; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd192, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd222, fd174, 0dBFE2CF2304755A5E, fd221; +mul.f64 fd223, fd122, 0dBFBAC2609B3C576C; +mul.f64 fd224, fd140, 0dBFEFD31F94F867C6; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd140, 0dBFBAC2609B3C576C; +fma.rn.f64 fd227, fd122, 0dBFEFD31F94F867C6, fd226; +mul.f64 fd228, fd166, 0dBFEF4CFC327A0080; +mul.f64 fd229, fd184, 0d3FCA9CD9AC4258F6; +sub.f64 fd230, fd228, fd229; +mul.f64 fd231, fd184, 0dBFEF4CFC327A0080; +fma.rn.f64 fd232, fd166, 0d3FCA9CD9AC4258F6, fd231; +add.f64 fd233, fd108, fd152; +add.f64 fd234, fd112, fd156; +mul.f64 fd235, fd233, 0d3FE0000000000000; +sub.f64 fd236, fd64, fd235; +sub.f64 fd237, fd112, fd156; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +mul.f64 fd239, fd234, 0d3FE0000000000000; +sub.f64 fd240, fd68, fd239; +sub.f64 fd241, fd108, fd152; +mul.f64 fd242, fd241, 0d3FEBB67AE8584CAA; +add.f64 fd243, fd195, fd200; +add.f64 fd244, fd197, fd202; +mul.f64 fd245, fd243, 0d3FE0000000000000; +sub.f64 fd246, fd77, fd245; +sub.f64 fd247, fd197, fd202; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +mul.f64 fd249, fd244, 0d3FE0000000000000; +sub.f64 fd250, fd95, fd249; +sub.f64 fd251, fd195, fd200; +mul.f64 fd252, fd251, 0d3FEBB67AE8584CAA; +add.f64 fd253, fd205, fd210; +add.f64 fd254, fd207, fd212; +mul.f64 fd255, fd253, 0d3FE0000000000000; +sub.f64 fd256, fd85, 
fd255; +sub.f64 fd257, fd207, fd212; +mul.f64 fd258, fd257, 0d3FEBB67AE8584CAA; +mul.f64 fd259, fd254, 0d3FE0000000000000; +sub.f64 fd260, fd103, fd259; +sub.f64 fd261, fd205, fd210; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +add.f64 fd263, fd215, fd220; +add.f64 fd264, fd217, fd222; +mul.f64 fd265, fd263, 0d3FE0000000000000; +sub.f64 fd266, fd86, fd265; +sub.f64 fd267, fd217, fd222; +mul.f64 fd268, fd267, 0d3FEBB67AE8584CAA; +mul.f64 fd269, fd264, 0d3FE0000000000000; +sub.f64 fd270, fd104, fd269; +sub.f64 fd271, fd215, fd220; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd225, fd230; +add.f64 fd274, fd227, fd232; +mul.f64 fd275, fd273, 0d3FE0000000000000; +sub.f64 fd276, fd78, fd275; +sub.f64 fd277, fd227, fd232; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +mul.f64 fd279, fd274, 0d3FE0000000000000; +sub.f64 fd280, fd96, fd279; +sub.f64 fd281, fd225, fd230; +mul.f64 fd282, fd281, 0d3FEBB67AE8584CAA; +add.f64 %1, fd68, fd234; +add.f64 %0, fd64, fd233; +add.f64 %3, fd95, fd244; +add.f64 %2, fd77, fd243; +add.f64 %5, fd103, fd254; +add.f64 %4, fd85, fd253; +add.f64 %7, fd104, fd264; +add.f64 %6, fd86, fd263; +add.f64 %9, fd96, fd274; +add.f64 %8, fd78, fd273; +sub.f64 %11, fd240, fd242; +add.f64 %10, fd238, fd236; +sub.f64 %13, fd250, fd252; +add.f64 %12, fd248, fd246; +sub.f64 %15, fd260, fd262; +add.f64 %14, fd258, fd256; +sub.f64 %17, fd270, fd272; +add.f64 %16, fd268, fd266; +sub.f64 %19, fd280, fd282; +add.f64 %18, fd278, fd276; +add.f64 %21, fd242, fd240; +sub.f64 %20, fd236, fd238; +add.f64 %23, fd252, fd250; +sub.f64 %22, fd246, fd248; +add.f64 %25, fd262, fd260; +sub.f64 %24, fd256, fd258; +add.f64 %27, fd272, fd270; +sub.f64 %26, fd266, fd268; +add.f64 %29, fd282, fd280; +sub.f64 %28, fd276, fd278; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), 
"=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..64a1c23d446b5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_15_fp64_inv.hpp.inc @@ -0,0 +1,262 @@ +#ifndef CUFFTDX_FFT_15_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_15_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<577, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<307>; +.reg .b64 rd<2>; +add.f64 fd61, %38, %62; +add.f64 fd62, %30, fd61; +add.f64 fd63, %46, %54; +add.f64 fd64, fd63, fd62; +add.f64 fd65, %39, %63; +add.f64 fd66, %31, fd65; +add.f64 fd67, %47, %55; +add.f64 fd68, fd67, fd66; +fma.rn.f64 fd69, fd61, 0d3FD3C6EF372FE950, %30; +mul.f64 fd70, fd63, 0d3FE9E3779B97F4A8; 
+sub.f64 fd71, fd69, fd70; +sub.f64 fd72, %39, %63; +mul.f64 fd73, fd72, 0d3FEE6F0E134454FF; +sub.f64 fd74, %47, %55; +fma.rn.f64 fd75, fd74, 0d3FE2CF2304755A5E, fd73; +sub.f64 fd76, fd71, fd75; +add.f64 fd77, fd75, fd71; +mul.f64 fd78, fd61, 0d3FE9E3779B97F4A8; +sub.f64 fd79, %30, fd78; +fma.rn.f64 fd80, fd63, 0d3FD3C6EF372FE950, fd79; +mul.f64 fd81, fd72, 0d3FE2CF2304755A5E; +mul.f64 fd82, fd74, 0d3FEE6F0E134454FF; +sub.f64 fd83, fd81, fd82; +sub.f64 fd84, fd80, fd83; +add.f64 fd85, fd83, fd80; +fma.rn.f64 fd86, fd65, 0d3FD3C6EF372FE950, %31; +mul.f64 fd87, fd67, 0d3FE9E3779B97F4A8; +sub.f64 fd88, fd86, fd87; +sub.f64 fd89, %38, %62; +mul.f64 fd90, fd89, 0d3FEE6F0E134454FF; +sub.f64 fd91, %46, %54; +fma.rn.f64 fd92, fd91, 0d3FE2CF2304755A5E, fd90; +add.f64 fd93, fd92, fd88; +sub.f64 fd94, fd88, fd92; +mul.f64 fd95, fd65, 0d3FE9E3779B97F4A8; +sub.f64 fd96, %31, fd95; +fma.rn.f64 fd97, fd67, 0d3FD3C6EF372FE950, fd96; +mul.f64 fd98, fd89, 0d3FE2CF2304755A5E; +mul.f64 fd99, fd91, 0d3FEE6F0E134454FF; +sub.f64 fd100, fd98, fd99; +add.f64 fd101, fd100, fd97; +sub.f64 fd102, fd97, fd100; +add.f64 fd103, %40, %64; +add.f64 fd104, %32, fd103; +add.f64 fd105, %48, %56; +add.f64 fd106, fd105, fd104; +add.f64 fd107, %42, %66; +add.f64 fd108, %34, fd107; +add.f64 fd109, %50, %58; +add.f64 fd110, fd109, fd108; +fma.rn.f64 fd111, fd103, 0d3FD3C6EF372FE950, %32; +mul.f64 fd112, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd113, fd111, fd112; +sub.f64 fd114, %42, %66; +mul.f64 fd115, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd116, %50, %58; +fma.rn.f64 fd117, fd116, 0d3FE2CF2304755A5E, fd115; +sub.f64 fd118, fd113, fd117; +add.f64 fd119, fd117, fd113; +mul.f64 fd120, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd121, %32, fd120; +fma.rn.f64 fd122, fd105, 0d3FD3C6EF372FE950, fd121; +mul.f64 fd123, fd114, 0d3FE2CF2304755A5E; +mul.f64 fd124, fd116, 0d3FEE6F0E134454FF; +sub.f64 fd125, fd123, fd124; +sub.f64 fd126, fd122, fd125; +add.f64 fd127, fd125, fd122; +fma.rn.f64 fd128, fd107, 0d3FD3C6EF372FE950, 
%34; +mul.f64 fd129, fd109, 0d3FE9E3779B97F4A8; +sub.f64 fd130, fd128, fd129; +sub.f64 fd131, %40, %64; +mul.f64 fd132, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd133, %48, %56; +fma.rn.f64 fd134, fd133, 0d3FE2CF2304755A5E, fd132; +add.f64 fd135, fd134, fd130; +sub.f64 fd136, fd130, fd134; +mul.f64 fd137, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %34, fd137; +fma.rn.f64 fd139, fd109, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd131, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd133, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd140, fd141; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %43, %67; +add.f64 fd146, %35, fd145; +add.f64 fd147, %51, %59; +add.f64 fd148, fd147, fd146; +add.f64 fd149, %45, %68; +add.f64 fd150, %37, fd149; +add.f64 fd151, %53, %61; +add.f64 fd152, fd151, fd150; +fma.rn.f64 fd153, fd145, 0d3FD3C6EF372FE950, %35; +mul.f64 fd154, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd153, fd154; +sub.f64 fd156, %45, %68; +mul.f64 fd157, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd158, %53, %61; +fma.rn.f64 fd159, fd158, 0d3FE2CF2304755A5E, fd157; +sub.f64 fd160, fd155, fd159; +add.f64 fd161, fd159, fd155; +mul.f64 fd162, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd163, %35, fd162; +fma.rn.f64 fd164, fd147, 0d3FD3C6EF372FE950, fd163; +mul.f64 fd165, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd166, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd167, fd165, fd166; +sub.f64 fd168, fd164, fd167; +add.f64 fd169, fd167, fd164; +fma.rn.f64 fd170, fd149, 0d3FD3C6EF372FE950, %37; +mul.f64 fd171, fd151, 0d3FE9E3779B97F4A8; +sub.f64 fd172, fd170, fd171; +sub.f64 fd173, %43, %67; +mul.f64 fd174, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd175, %51, %59; +fma.rn.f64 fd176, fd175, 0d3FE2CF2304755A5E, fd174; +add.f64 fd177, fd176, fd172; +sub.f64 fd178, fd172, fd176; +mul.f64 fd179, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd180, %37, fd179; +fma.rn.f64 fd181, fd151, 0d3FD3C6EF372FE950, fd180; +mul.f64 fd182, fd173, 0d3FE2CF2304755A5E; +mul.f64 fd183, fd175, 0d3FEE6F0E134454FF; +sub.f64 fd184, 
fd182, fd183; +add.f64 fd185, fd184, fd181; +sub.f64 fd186, fd181, fd184; +mul.f64 fd187, fd118, 0d3FED3BC3AEFF7F95; +mul.f64 fd188, fd135, 0d3FDA07F921061AD1; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd135, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd191, fd118, 0d3FDA07F921061AD1, fd190; +mul.f64 fd192, fd160, 0d3FE5698496E20BD8; +mul.f64 fd193, fd177, 0d3FE7C7D7A833BEC2; +sub.f64 fd194, fd192, fd193; +mul.f64 fd195, fd177, 0d3FE5698496E20BD8; +fma.rn.f64 fd196, fd160, 0d3FE7C7D7A833BEC2, fd195; +mul.f64 fd197, fd126, 0d3FE5698496E20BD8; +mul.f64 fd198, fd143, 0d3FE7C7D7A833BEC2; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd143, 0d3FE5698496E20BD8; +fma.rn.f64 fd201, fd126, 0d3FE7C7D7A833BEC2, fd200; +mul.f64 fd202, fd168, 0dBFBAC2609B3C576C; +mul.f64 fd203, fd185, 0d3FEFD31F94F867C6; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd185, 0dBFBAC2609B3C576C; +fma.rn.f64 fd206, fd168, 0d3FEFD31F94F867C6, fd205; +mul.f64 fd207, fd127, 0d3FD3C6EF372FE950; +mul.f64 fd208, fd144, 0d3FEE6F0E134454FF; +sub.f64 fd209, fd207, fd208; +mul.f64 fd210, fd144, 0d3FD3C6EF372FE950; +fma.rn.f64 fd211, fd127, 0d3FEE6F0E134454FF, fd210; +mul.f64 fd212, fd169, 0dBFE9E3779B97F4A8; +mul.f64 fd213, fd186, 0d3FE2CF2304755A5E; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd186, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd216, fd169, 0d3FE2CF2304755A5E, fd215; +mul.f64 fd217, fd119, 0dBFBAC2609B3C576C; +mul.f64 fd218, fd136, 0d3FEFD31F94F867C6; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd136, 0dBFBAC2609B3C576C; +fma.rn.f64 fd221, fd119, 0d3FEFD31F94F867C6, fd220; +mul.f64 fd222, fd161, 0dBFEF4CFC327A0080; +mul.f64 fd223, fd178, 0dBFCA9CD9AC4258F6; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd178, 0dBFEF4CFC327A0080; +fma.rn.f64 fd226, fd161, 0dBFCA9CD9AC4258F6, fd225; +add.f64 fd227, fd106, fd148; +add.f64 fd228, fd110, fd152; +mul.f64 fd229, fd227, 0d3FE0000000000000; +sub.f64 fd230, fd64, fd229; +sub.f64 fd231, fd110, fd152; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +mul.f64 fd233, fd228, 
0d3FE0000000000000; +sub.f64 fd234, fd68, fd233; +sub.f64 fd235, fd106, fd148; +mul.f64 fd236, fd235, 0dBFEBB67AE8584CAA; +add.f64 fd237, fd189, fd194; +add.f64 fd238, fd191, fd196; +mul.f64 fd239, fd237, 0d3FE0000000000000; +sub.f64 fd240, fd76, fd239; +sub.f64 fd241, fd191, fd196; +mul.f64 fd242, fd241, 0dBFEBB67AE8584CAA; +mul.f64 fd243, fd238, 0d3FE0000000000000; +sub.f64 fd244, fd93, fd243; +sub.f64 fd245, fd189, fd194; +mul.f64 fd246, fd245, 0dBFEBB67AE8584CAA; +add.f64 fd247, fd199, fd204; +add.f64 fd248, fd201, fd206; +mul.f64 fd249, fd247, 0d3FE0000000000000; +sub.f64 fd250, fd84, fd249; +sub.f64 fd251, fd201, fd206; +mul.f64 fd252, fd251, 0dBFEBB67AE8584CAA; +mul.f64 fd253, fd248, 0d3FE0000000000000; +sub.f64 fd254, fd101, fd253; +sub.f64 fd255, fd199, fd204; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd209, fd214; +add.f64 fd258, fd211, fd216; +mul.f64 fd259, fd257, 0d3FE0000000000000; +sub.f64 fd260, fd85, fd259; +sub.f64 fd261, fd211, fd216; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +mul.f64 fd263, fd258, 0d3FE0000000000000; +sub.f64 fd264, fd102, fd263; +sub.f64 fd265, fd209, fd214; +mul.f64 fd266, fd265, 0dBFEBB67AE8584CAA; +add.f64 fd267, fd219, fd224; +add.f64 fd268, fd221, fd226; +mul.f64 fd269, fd267, 0d3FE0000000000000; +sub.f64 fd270, fd77, fd269; +sub.f64 fd271, fd221, fd226; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +mul.f64 fd273, fd268, 0d3FE0000000000000; +sub.f64 fd274, fd94, fd273; +sub.f64 fd275, fd219, fd224; +mul.f64 fd276, fd275, 0dBFEBB67AE8584CAA; +add.f64 %1, fd68, fd228; +add.f64 %0, fd64, fd227; +add.f64 %3, fd93, fd238; +add.f64 %2, fd76, fd237; +add.f64 %5, fd101, fd248; +add.f64 %4, fd84, fd247; +add.f64 %7, fd102, fd258; +add.f64 %6, fd85, fd257; +add.f64 %9, fd94, fd268; +add.f64 %8, fd77, fd267; +sub.f64 %11, fd234, fd236; +add.f64 %10, fd232, fd230; +sub.f64 %13, fd244, fd246; +add.f64 %12, fd242, fd240; +sub.f64 %15, fd254, fd256; +add.f64 %14, fd252, fd250; +sub.f64 %17, fd264, fd266; +add.f64 %16, 
fd262, fd260; +sub.f64 %19, fd274, fd276; +add.f64 %18, fd272, fd270; +add.f64 %21, fd236, fd234; +sub.f64 %20, fd230, fd232; +add.f64 %23, fd246, fd244; +sub.f64 %22, fd240, fd242; +add.f64 %25, fd256, fd254; +sub.f64 %24, fd250, fd252; +add.f64 %27, fd266, fd264; +sub.f64 %26, fd260, fd262; +add.f64 %29, fd276, fd274; +sub.f64 %28, fd270, fd272; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..f58ce7d38b97e --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_fwd.hpp.inc @@ -0,0 +1,27410 @@ +#ifndef CUFFTDX_FFT_16384_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_16384_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1174, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1085>; +.reg .b32 r<7043>; +.reg .b64 rd<3>; +mov.u32 r6957, %tid.y; +shl.b32 r6958, r6957, 17; +mov.u32 r6959, %64; +add.s32 r6960, r6959, r6958; +mov.u32 r6961, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f1022, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r101, {low, high}; +} +mov.f32 f1040, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, 
r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1018, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r393, {low, high}; +} +mov.f32 f1044, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r396, {low, high}; +} +mov.f32 f1026, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r397, {low, high}; +} +mov.f32 f1042, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; 
+} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; 
+} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ 
+add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, 
r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, 
r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} 
+{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 
f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 
r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, 
r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 
r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6963, r6961, 8; +and.b32 r6964, r6963, -131072; +add.s32 r6965, r6960, r6964; +and.b32 r6978, r6961, 511; +cvt.rn.f32.u32 f1077, r6978; +mul.f32 f1078, f1077, 0f39C90FDB; +cos.approx.f32 f357, f1078; +sin.approx.f32 f1079, f1078; +neg.f32 f358, f1079; +mov.f32 f1084, 0f3F800000; +mov.f32 f1083, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1740, {low, high}; 
+} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ 
+mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, 
{high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, 
r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ 
+mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 
r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; 
+cvt.rn.f16.f32 high, f1084; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, 
r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} 
+{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6966, r6963, 130816; +add.s32 r6967, r6965, r6966; +st.shared.v4.f32 [r6967], {r1521, r1524, 
r1725, r1732}; +st.shared.v4.f32 [r6967+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r6967+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r6967+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r6967+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r6967+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r6967+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r6967+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r6967+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r6967+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r6967+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r6967+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r6967+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r6967+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r6967+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r6967+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r6968, r6978, -248, r6967; +ld.shared.u32 r2864, [r6968]; +ld.shared.u32 r2867, [r6968+4]; +ld.shared.u32 r3480, [r6968+4096]; +ld.shared.u32 r3483, [r6968+4100]; +ld.shared.u32 r3060, [r6968+8192]; +ld.shared.u32 r3063, [r6968+8196]; +ld.shared.u32 r3676, [r6968+12288]; +ld.shared.u32 r3679, [r6968+12292]; +ld.shared.u32 r2914, [r6968+16384]; +ld.shared.u32 r2917, [r6968+16388]; +ld.shared.u32 r3530, [r6968+20480]; +ld.shared.u32 r3533, [r6968+20484]; +ld.shared.u32 r3110, [r6968+24576]; +ld.shared.u32 r3113, [r6968+24580]; +ld.shared.u32 r3726, [r6968+28672]; +ld.shared.u32 r3729, [r6968+28676]; +ld.shared.u32 r2876, [r6968+32768]; +ld.shared.u32 r2879, [r6968+32772]; +ld.shared.u32 r3492, [r6968+36864]; +ld.shared.u32 r3495, [r6968+36868]; +ld.shared.u32 r3072, [r6968+40960]; +ld.shared.u32 r3075, [r6968+40964]; +ld.shared.u32 r3688, [r6968+45056]; +ld.shared.u32 r3691, [r6968+45060]; +ld.shared.u32 r2926, [r6968+49152]; +ld.shared.u32 r2929, [r6968+49156]; +ld.shared.u32 r3542, [r6968+53248]; +ld.shared.u32 r3545, 
[r6968+53252]; +ld.shared.u32 r3122, [r6968+57344]; +ld.shared.u32 r3125, [r6968+57348]; +ld.shared.u32 r3738, [r6968+61440]; +ld.shared.u32 r3741, [r6968+61444]; +ld.shared.u32 r2865, [r6968+65536]; +ld.shared.u32 r2868, [r6968+65540]; +ld.shared.u32 r3481, [r6968+69632]; +ld.shared.u32 r3484, [r6968+69636]; +ld.shared.u32 r3061, [r6968+73728]; +ld.shared.u32 r3064, [r6968+73732]; +ld.shared.u32 r3677, [r6968+77824]; +ld.shared.u32 r3680, [r6968+77828]; +ld.shared.u32 r2915, [r6968+81920]; +ld.shared.u32 r2918, [r6968+81924]; +ld.shared.u32 r3531, [r6968+86016]; +ld.shared.u32 r3534, [r6968+86020]; +ld.shared.u32 r3111, [r6968+90112]; +ld.shared.u32 r3114, [r6968+90116]; +ld.shared.u32 r3727, [r6968+94208]; +ld.shared.u32 r3730, [r6968+94212]; +ld.shared.u32 r2877, [r6968+98304]; +ld.shared.u32 r2880, [r6968+98308]; +ld.shared.u32 r3493, [r6968+102400]; +ld.shared.u32 r3496, [r6968+102404]; +ld.shared.u32 r3073, [r6968+106496]; +ld.shared.u32 r3076, [r6968+106500]; +ld.shared.u32 r3689, [r6968+110592]; +ld.shared.u32 r3692, [r6968+110596]; +ld.shared.u32 r2927, [r6968+114688]; +ld.shared.u32 r2930, [r6968+114692]; +ld.shared.u32 r3543, [r6968+118784]; +ld.shared.u32 r3546, [r6968+118788]; +ld.shared.u32 r3123, [r6968+122880]; +ld.shared.u32 r3126, [r6968+122884]; +ld.shared.u32 r3739, [r6968+126976]; +ld.shared.u32 r3742, [r6968+126980]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} 
+{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 
r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 
r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; 
+cvt.rn.f16.f32 high, f1040; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, 
r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, 
r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} 
+{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 
r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, 
r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ 
+add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; 
+mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ 
+mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ 
+sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 
r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6970, r6961, 5, 4; +cvt.rn.f32.u32 f1080, r6970; +mul.f32 f1081, f1080, 0f3C490FDB; +cos.approx.f32 f779, f1081; +sin.approx.f32 f1082, f1081; +neg.f32 f780, f1082; +and.b32 r6977, r6961, 480; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 
r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, 
{high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5046, {low, high}; +} +{ 
+mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 
r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, 
low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, 
r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 
r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +shl.b32 r6971, r6961, 3; +and.b32 r6972, r6971, 248; +add.s32 r6973, r6965, r6972; +barrier.sync 0; +and.b32 r6974, r6963, 122880; +add.s32 r6975, r6973, r6974; +st.shared.u32 [r6975], r4383; +st.shared.u32 [r6975+4], r4386; +st.shared.u32 [r6975+256], r4587; +st.shared.u32 [r6975+260], r4594; +st.shared.u32 [r6975+512], r4624; +st.shared.u32 [r6975+516], r4631; +st.shared.u32 [r6975+768], r4661; +st.shared.u32 [r6975+772], r4668; +st.shared.u32 [r6975+1024], r4698; +st.shared.u32 [r6975+1028], r4705; +st.shared.u32 [r6975+1280], r4735; +st.shared.u32 [r6975+1284], r4742; +st.shared.u32 
[r6975+1536], r4772; +st.shared.u32 [r6975+1540], r4779; +st.shared.u32 [r6975+1792], r4809; +st.shared.u32 [r6975+1796], r4816; +st.shared.u32 [r6975+2048], r4846; +st.shared.u32 [r6975+2052], r4853; +st.shared.u32 [r6975+2304], r4883; +st.shared.u32 [r6975+2308], r4890; +st.shared.u32 [r6975+2560], r4920; +st.shared.u32 [r6975+2564], r4927; +st.shared.u32 [r6975+2816], r4957; +st.shared.u32 [r6975+2820], r4964; +st.shared.u32 [r6975+3072], r4994; +st.shared.u32 [r6975+3076], r5001; +st.shared.u32 [r6975+3328], r5031; +st.shared.u32 [r6975+3332], r5038; +st.shared.u32 [r6975+3584], r5068; +st.shared.u32 [r6975+3588], r5075; +st.shared.u32 [r6975+3840], r5105; +st.shared.u32 [r6975+3844], r5112; +st.shared.u32 [r6975+4096], r5142; +st.shared.u32 [r6975+4100], r5149; +st.shared.u32 [r6975+4352], r5179; +st.shared.u32 [r6975+4356], r5186; +st.shared.u32 [r6975+4608], r5216; +st.shared.u32 [r6975+4612], r5223; +st.shared.u32 [r6975+4864], r5253; +st.shared.u32 [r6975+4868], r5260; +st.shared.u32 [r6975+5120], r5290; +st.shared.u32 [r6975+5124], r5297; +st.shared.u32 [r6975+5376], r5327; +st.shared.u32 [r6975+5380], r5334; +st.shared.u32 [r6975+5632], r5364; +st.shared.u32 [r6975+5636], r5371; +st.shared.u32 [r6975+5888], r5401; +st.shared.u32 [r6975+5892], r5408; +st.shared.u32 [r6975+6144], r5438; +st.shared.u32 [r6975+6148], r5445; +st.shared.u32 [r6975+6400], r5475; +st.shared.u32 [r6975+6404], r5482; +st.shared.u32 [r6975+6656], r5512; +st.shared.u32 [r6975+6660], r5519; +st.shared.u32 [r6975+6912], r5549; +st.shared.u32 [r6975+6916], r5556; +st.shared.u32 [r6975+7168], r5586; +st.shared.u32 [r6975+7172], r5593; +st.shared.u32 [r6975+7424], r5623; +st.shared.u32 [r6975+7428], r5630; +st.shared.u32 [r6975+7680], r5660; +st.shared.u32 [r6975+7684], r5667; +st.shared.u32 [r6975+7936], r5697; +st.shared.u32 [r6975+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6976, r6977, -248, r6975; +ld.shared.u32 r5726, [r6976]; +ld.shared.u32 r5729, [r6976+4]; +ld.shared.u32 r6342, 
[r6976+4096]; +ld.shared.u32 r6345, [r6976+4100]; +ld.shared.u32 r5922, [r6976+8192]; +ld.shared.u32 r5925, [r6976+8196]; +ld.shared.u32 r6538, [r6976+12288]; +ld.shared.u32 r6541, [r6976+12292]; +ld.shared.u32 r5776, [r6976+16384]; +ld.shared.u32 r5779, [r6976+16388]; +ld.shared.u32 r6392, [r6976+20480]; +ld.shared.u32 r6395, [r6976+20484]; +ld.shared.u32 r5972, [r6976+24576]; +ld.shared.u32 r5975, [r6976+24580]; +ld.shared.u32 r6588, [r6976+28672]; +ld.shared.u32 r6591, [r6976+28676]; +ld.shared.u32 r5738, [r6976+32768]; +ld.shared.u32 r5741, [r6976+32772]; +ld.shared.u32 r6354, [r6976+36864]; +ld.shared.u32 r6357, [r6976+36868]; +ld.shared.u32 r5934, [r6976+40960]; +ld.shared.u32 r5937, [r6976+40964]; +ld.shared.u32 r6550, [r6976+45056]; +ld.shared.u32 r6553, [r6976+45060]; +ld.shared.u32 r5788, [r6976+49152]; +ld.shared.u32 r5791, [r6976+49156]; +ld.shared.u32 r6404, [r6976+53248]; +ld.shared.u32 r6407, [r6976+53252]; +ld.shared.u32 r5984, [r6976+57344]; +ld.shared.u32 r5987, [r6976+57348]; +ld.shared.u32 r6600, [r6976+61440]; +ld.shared.u32 r6603, [r6976+61444]; +ld.shared.u32 r5727, [r6976+65536]; +ld.shared.u32 r5730, [r6976+65540]; +ld.shared.u32 r6343, [r6976+69632]; +ld.shared.u32 r6346, [r6976+69636]; +ld.shared.u32 r5923, [r6976+73728]; +ld.shared.u32 r5926, [r6976+73732]; +ld.shared.u32 r6539, [r6976+77824]; +ld.shared.u32 r6542, [r6976+77828]; +ld.shared.u32 r5777, [r6976+81920]; +ld.shared.u32 r5780, [r6976+81924]; +ld.shared.u32 r6393, [r6976+86016]; +ld.shared.u32 r6396, [r6976+86020]; +ld.shared.u32 r5973, [r6976+90112]; +ld.shared.u32 r5976, [r6976+90116]; +ld.shared.u32 r6589, [r6976+94208]; +ld.shared.u32 r6592, [r6976+94212]; +ld.shared.u32 r5739, [r6976+98304]; +ld.shared.u32 r5742, [r6976+98308]; +ld.shared.u32 r6355, [r6976+102400]; +ld.shared.u32 r6358, [r6976+102404]; +ld.shared.u32 r5935, [r6976+106496]; +ld.shared.u32 r5938, [r6976+106500]; +ld.shared.u32 r6551, [r6976+110592]; +ld.shared.u32 r6554, [r6976+110596]; +ld.shared.u32 r5789, 
[r6976+114688]; +ld.shared.u32 r5792, [r6976+114692]; +ld.shared.u32 r6405, [r6976+118784]; +ld.shared.u32 r6408, [r6976+118788]; +ld.shared.u32 r5985, [r6976+122880]; +ld.shared.u32 r5988, [r6976+122884]; +ld.shared.u32 r6601, [r6976+126976]; +ld.shared.u32 r6604, [r6976+126980]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5746; +} +{ +add.f16x2 r5766, r5734, r5749; +} +{ +sub.f16x2 r5769, r5731, r5746; +} +{ +sub.f16x2 r5772, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5796; +} +{ +add.f16x2 r5816, r5784, r5799; +} +{ +sub.f16x2 r5819, r5781, r5796; +} +{ +sub.f16x2 r5822, r5784, r5799; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5830, 
{low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5807; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} +{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5810; +} +{ +add.f16x2 r5900, r5760, r5855; +} +{ +sub.f16x2 r5903, r5757, r5810; +} +{ +sub.f16x2 r5906, r5760, r5855; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5939; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5942; +} +{ +add.f16x2 r5962, r5930, r5945; +} +{ +sub.f16x2 r5965, r5927, r5942; +} +{ +sub.f16x2 r5968, r5930, r5945; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} 
+{ +neg.f16x2 r5995, r5989; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5992; +} +{ +add.f16x2 r6012, r5980, r5995; +} +{ +sub.f16x2 r6015, r5977, r5992; +} +{ +sub.f16x2 r6018, r5980, r5995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6003; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ +sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6006; +} +{ +add.f16x2 r6096, r5956, r6051; +} +{ +sub.f16x2 r6099, r5953, r6006; +} +{ +sub.f16x2 r6102, r5956, r6051; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; 
+cvt.rn.f16.f32 high, f1044; +mov.b32 r6118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ +fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6075; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ 
+fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 %0, r5873, r6069; +} +{ +add.f16x2 %1, r5876, r6072; +} +{ +sub.f16x2 %32, r5873, r6069; +} +{ +sub.f16x2 %33, r5876, r6072; +} +{ +add.f16x2 %4, r5885, r6153; +} +{ +add.f16x2 %5, r5888, r6159; +} +{ +sub.f16x2 %36, r5885, r6153; +} +{ +sub.f16x2 %37, r5888, r6159; +} +{ +add.f16x2 %8, r5897, r6169; +} +{ +add.f16x2 %9, r5900, r6175; +} +{ +sub.f16x2 %40, r5897, r6169; +} +{ +sub.f16x2 %41, r5900, r6175; +} +{ +add.f16x2 %12, r5909, r6185; +} +{ +add.f16x2 %13, r5912, r6191; +} +{ +sub.f16x2 %44, r5909, r6185; +} +{ +sub.f16x2 %45, r5912, r6191; +} +{ +add.f16x2 %16, r5879, r6078; +} +{ +add.f16x2 %17, r5882, r6195; +} +{ +sub.f16x2 %48, r5879, r6078; +} +{ +sub.f16x2 %49, r5882, r6195; +} +{ +add.f16x2 %20, r5891, r6203; +} +{ +add.f16x2 %21, r5894, r6209; +} +{ +sub.f16x2 %52, r5891, r6203; +} +{ +sub.f16x2 %53, r5894, r6209; +} +{ +add.f16x2 %24, r5903, r6219; +} +{ +add.f16x2 %25, r5906, r6225; +} +{ +sub.f16x2 %56, r5903, r6219; +} +{ +sub.f16x2 %57, r5906, r6225; +} +{ +add.f16x2 %28, r5915, r6235; +} +{ +add.f16x2 %29, r5918, r6241; +} +{ +sub.f16x2 %60, r5915, r6235; +} +{ +sub.f16x2 %61, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6359; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; 
+} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, r6356; +} +{ +add.f16x2 r6379, r6347, r6362; +} +{ +add.f16x2 r6382, r6350, r6365; +} +{ +sub.f16x2 r6385, r6347, r6362; +} +{ +sub.f16x2 r6388, r6350, r6365; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6409; +} +{ +add.f16x2 r6417, r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6412; +} +{ +add.f16x2 r6432, r6400, r6415; +} +{ +sub.f16x2 r6435, r6397, r6412; +} +{ +sub.f16x2 r6438, r6400, r6415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6423; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 
r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ +add.f16x2 r6513, r6373, r6426; +} +{ +add.f16x2 r6516, r6376, r6471; +} +{ +sub.f16x2 r6519, r6373, r6426; +} +{ +sub.f16x2 r6522, r6376, r6471; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, r6553, r6554; +} +{ +neg.f16x2 r6561, r6555; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6558; +} +{ +add.f16x2 r6578, r6546, r6561; +} +{ +sub.f16x2 r6581, r6543, r6558; +} +{ +sub.f16x2 r6584, r6546, r6561; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6605; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6608; +} +{ +add.f16x2 r6628, r6596, r6611; +} +{ +sub.f16x2 r6631, r6593, r6608; +} +{ +sub.f16x2 r6634, r6596, r6611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6619; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ +sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6622; +} +{ +add.f16x2 r6712, r6572, r6667; +} +{ +sub.f16x2 r6715, r6569, r6622; +} +{ +sub.f16x2 r6718, r6572, r6667; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; 
+cvt.rn.f16.f32 high, f1042; +mov.b32 r6742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} +{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6691; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 %2, r6489, r6685; +} +{ +add.f16x2 %3, r6492, r6688; +} +{ +sub.f16x2 %34, r6489, r6685; +} +{ +sub.f16x2 %35, r6492, r6688; +} +{ +add.f16x2 %6, r6501, r6769; +} +{ +add.f16x2 %7, r6504, r6775; +} +{ +sub.f16x2 %38, r6501, r6769; +} +{ 
+sub.f16x2 %39, r6504, r6775; +} +{ +add.f16x2 %10, r6513, r6785; +} +{ +add.f16x2 %11, r6516, r6791; +} +{ +sub.f16x2 %42, r6513, r6785; +} +{ +sub.f16x2 %43, r6516, r6791; +} +{ +add.f16x2 %14, r6525, r6801; +} +{ +add.f16x2 %15, r6528, r6807; +} +{ +sub.f16x2 %46, r6525, r6801; +} +{ +sub.f16x2 %47, r6528, r6807; +} +{ +add.f16x2 %18, r6495, r6694; +} +{ +add.f16x2 %19, r6498, r6811; +} +{ +sub.f16x2 %50, r6495, r6694; +} +{ +sub.f16x2 %51, r6498, r6811; +} +{ +add.f16x2 %22, r6507, r6819; +} +{ +add.f16x2 %23, r6510, r6825; +} +{ +sub.f16x2 %54, r6507, r6819; +} +{ +sub.f16x2 %55, r6510, r6825; +} +{ +add.f16x2 %26, r6519, r6835; +} +{ +add.f16x2 %27, r6522, r6841; +} +{ +sub.f16x2 %58, r6519, r6835; +} +{ +sub.f16x2 %59, r6522, r6841; +} +{ +add.f16x2 %30, r6531, r6851; +} +{ +add.f16x2 %31, r6534, r6857; +} +{ +sub.f16x2 %62, r6531, r6851; +} +{ +sub.f16x2 %63, r6534, r6857; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), 
"=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<861, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3750>; +.reg .b64 rd<2>; +mov.u32 r3723, %tid.y; +shl.b32 r3724, r3723, 16; +mov.u32 r3725, %32; +add.s32 r3726, r3725, r3724; +mov.u32 r3727, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; 
+} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f448, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ 
+add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 
r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 
r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ 
+add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3728, r3727, 1023; +shl.b32 r3729, r3727, 6; +and.b32 r3730, r3729, -65536; +add.s32 r3731, r3726, r3730; +cvt.rn.f32.u32 f451, r3728; +mul.f32 f452, f451, 0f39C90FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, 
r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r3732, r3729, 65472; +add.s32 r3733, r3731, r3732; +st.shared.v4.f32 [r3733], {r521, r629, r666, r703}; +st.shared.v4.f32 [r3733+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r3733+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r3733+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r3734, r3728, -60, r3733; +ld.shared.u32 r1176, [r3734]; +ld.shared.u32 r1372, [r3734+4096]; +ld.shared.u32 r1226, [r3734+8192]; +ld.shared.u32 r1422, [r3734+12288]; +ld.shared.u32 r1188, [r3734+16384]; +ld.shared.u32 r1384, [r3734+20480]; +ld.shared.u32 r1238, [r3734+24576]; +ld.shared.u32 r1434, [r3734+28672]; +ld.shared.u32 r1177, [r3734+32768]; +ld.shared.u32 r1373, [r3734+36864]; +ld.shared.u32 r1227, [r3734+40960]; +ld.shared.u32 r1423, [r3734+45056]; +ld.shared.u32 r1189, [r3734+49152]; +ld.shared.u32 r1385, [r3734+53248]; +ld.shared.u32 r1239, [r3734+57344]; +ld.shared.u32 r1435, [r3734+61440]; +barrier.sync 0; +st.shared.v4.f32 [r3733], {r524, r636, r673, r710}; +st.shared.v4.f32 [r3733+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r3733+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r3733+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r3734]; 
+ld.shared.u32 r1375, [r3734+4096]; +ld.shared.u32 r1229, [r3734+8192]; +ld.shared.u32 r1425, [r3734+12288]; +ld.shared.u32 r1191, [r3734+16384]; +ld.shared.u32 r1387, [r3734+20480]; +ld.shared.u32 r1241, [r3734+24576]; +ld.shared.u32 r1437, [r3734+28672]; +ld.shared.u32 r1180, [r3734+32768]; +ld.shared.u32 r1376, [r3734+36864]; +ld.shared.u32 r1230, [r3734+40960]; +ld.shared.u32 r1426, [r3734+45056]; +ld.shared.u32 r1192, [r3734+49152]; +ld.shared.u32 r1388, [r3734+53248]; +ld.shared.u32 r1242, [r3734+57344]; +ld.shared.u32 r1438, [r3734+61440]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; 
+cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} 
+{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, 
r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ 
+fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3735, r3727, 1008; +bfe.u32 r3736, r3727, 4, 6; +shl.b32 r3737, r3727, 2; +and.b32 r3738, r3737, 
60; +add.s32 r3739, r3731, r3738; +cvt.rn.f32.u32 f454, r3736; +mul.f32 f455, f454, 0f3BC90FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, 
r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; 
+} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 
r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r3740, r3729, 64512; +add.s32 r3741, r3739, r3740; +st.shared.u32 [r3741], r1695; +st.shared.u32 [r3741+64], r1803; +st.shared.u32 [r3741+128], r1840; +st.shared.u32 [r3741+192], r1877; +st.shared.u32 [r3741+256], r1914; +st.shared.u32 [r3741+320], r1951; +st.shared.u32 [r3741+384], r1988; +st.shared.u32 [r3741+448], r2025; +st.shared.u32 [r3741+512], r2062; +st.shared.u32 [r3741+576], r2099; +st.shared.u32 [r3741+640], r2136; +st.shared.u32 [r3741+704], r2173; +st.shared.u32 [r3741+768], r2210; +st.shared.u32 [r3741+832], r2247; +st.shared.u32 [r3741+896], r2284; +st.shared.u32 [r3741+960], r2321; +barrier.sync 0; +mad.lo.s32 r3742, r3735, -60, r3741; +ld.shared.u32 r2350, [r3742]; +ld.shared.u32 r2546, [r3742+4096]; +ld.shared.u32 r2400, [r3742+8192]; +ld.shared.u32 r2596, [r3742+12288]; +ld.shared.u32 r2362, [r3742+16384]; +ld.shared.u32 r2558, [r3742+20480]; +ld.shared.u32 r2412, [r3742+24576]; +ld.shared.u32 r2608, [r3742+28672]; +ld.shared.u32 r2351, [r3742+32768]; +ld.shared.u32 r2547, [r3742+36864]; +ld.shared.u32 r2401, [r3742+40960]; +ld.shared.u32 r2597, [r3742+45056]; +ld.shared.u32 r2363, [r3742+49152]; 
+ld.shared.u32 r2559, [r3742+53248]; +ld.shared.u32 r2413, [r3742+57344]; +ld.shared.u32 r2609, [r3742+61440]; +barrier.sync 0; +st.shared.u32 [r3741], r1698; +st.shared.u32 [r3741+64], r1810; +st.shared.u32 [r3741+128], r1847; +st.shared.u32 [r3741+192], r1884; +st.shared.u32 [r3741+256], r1921; +st.shared.u32 [r3741+320], r1958; +st.shared.u32 [r3741+384], r1995; +st.shared.u32 [r3741+448], r2032; +st.shared.u32 [r3741+512], r2069; +st.shared.u32 [r3741+576], r2106; +st.shared.u32 [r3741+640], r2143; +st.shared.u32 [r3741+704], r2180; +st.shared.u32 [r3741+768], r2217; +st.shared.u32 [r3741+832], r2254; +st.shared.u32 [r3741+896], r2291; +st.shared.u32 [r3741+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r3742]; +ld.shared.u32 r2549, [r3742+4096]; +ld.shared.u32 r2403, [r3742+8192]; +ld.shared.u32 r2599, [r3742+12288]; +ld.shared.u32 r2365, [r3742+16384]; +ld.shared.u32 r2561, [r3742+20480]; +ld.shared.u32 r2415, [r3742+24576]; +ld.shared.u32 r2611, [r3742+28672]; +ld.shared.u32 r2354, [r3742+32768]; +ld.shared.u32 r2550, [r3742+36864]; +ld.shared.u32 r2404, [r3742+40960]; +ld.shared.u32 r2600, [r3742+45056]; +ld.shared.u32 r2366, [r3742+49152]; +ld.shared.u32 r2562, [r3742+53248]; +ld.shared.u32 r2416, [r3742+57344]; +ld.shared.u32 r2612, [r3742+61440]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} 
+{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, 
r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 
r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; 
+cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, 
r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2702; +} +{ +add.f16x2 r2920, r2506, r2819; +} +{ +sub.f16x2 r2923, r2503, r2702; +} +{ +sub.f16x2 r2926, r2506, r2819; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3743, r3727, 768; +bfe.u32 r3744, r3727, 8, 2; +and.b32 r3745, r3737, 1020; +add.s32 r3746, r3731, r3745; +cvt.rn.f32.u32 f457, r3744; +mul.f32 f458, f457, 0f3DC90FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +neg.f16x2 r2975, r2972; +} +{ +fma.rn.f16x2 r2977, r2881, r2968, r2975; +} +{ +mul.f16x2 r2981, r2881, r2970; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; 
+mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +neg.f16x2 r3012, r3009; +} +{ +fma.rn.f16x2 r3014, r2893, r3005, r3012; +} +{ +mul.f16x2 r3018, r2893, r3007; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +neg.f16x2 r3049, r3046; +} +{ +fma.rn.f16x2 r3051, r2905, r3042, r3049; +} +{ +mul.f16x2 r3055, r2905, r3044; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +neg.f16x2 r3086, r3083; +} +{ +fma.rn.f16x2 r3088, r2917, r3079, r3086; +} +{ +mul.f16x2 r3092, r2917, r3081; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3092; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +neg.f16x2 r3123, r3120; +} +{ +fma.rn.f16x2 r3125, r2929, r3116, r3123; +} +{ +mul.f16x2 r3129, r2929, r3118; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +neg.f16x2 r3160, r3157; +} +{ +fma.rn.f16x2 r3162, r2941, r3153, r3160; +} +{ +mul.f16x2 r3166, r2941, r3155; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ 
+mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +neg.f16x2 r3197, r3194; +} +{ +fma.rn.f16x2 r3199, r2953, r3190, r3197; +} +{ +mul.f16x2 r3203, r2953, r3192; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +neg.f16x2 r3234, r3231; +} +{ +fma.rn.f16x2 r3236, r2875, r3227, r3234; +} +{ +mul.f16x2 r3240, r2875, r3229; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; 
+mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +neg.f16x2 r3271, r3268; +} +{ +fma.rn.f16x2 r3273, r2887, r3264, r3271; +} +{ +mul.f16x2 r3277, r2887, r3266; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +neg.f16x2 r3308, r3305; +} +{ +fma.rn.f16x2 r3310, r2899, r3301, r3308; +} +{ +mul.f16x2 r3314, r2899, r3303; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +neg.f16x2 r3345, r3342; +} +{ +fma.rn.f16x2 r3347, r2911, r3338, r3345; +} +{ +mul.f16x2 r3351, 
r2911, r3340; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3351; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +neg.f16x2 r3382, r3379; +} +{ +fma.rn.f16x2 r3384, r2923, r3375, r3382; +} +{ +mul.f16x2 r3388, r2923, r3377; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3388; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +neg.f16x2 r3419, r3416; +} +{ +fma.rn.f16x2 r3421, r2935, r3412, r3419; +} +{ +mul.f16x2 r3425, r2935, r3414; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3425; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +neg.f16x2 r3456, r3453; +} +{ +fma.rn.f16x2 r3458, r2947, r3449, r3456; +} +{ +mul.f16x2 r3462, r2947, r3451; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +neg.f16x2 r3493, r3490; +} +{ +fma.rn.f16x2 r3495, r2959, r3486, r3493; +} +{ +mul.f16x2 r3499, r2959, r3488; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3499; +} +barrier.sync 0; +and.b32 r3747, r3729, 49152; +add.s32 r3748, r3746, r3747; +st.shared.u32 [r3748], r2869; +st.shared.u32 [r3748+1024], r2977; +st.shared.u32 [r3748+2048], r3014; +st.shared.u32 [r3748+3072], r3051; +st.shared.u32 [r3748+4096], r3088; +st.shared.u32 [r3748+5120], r3125; +st.shared.u32 [r3748+6144], r3162; +st.shared.u32 [r3748+7168], r3199; +st.shared.u32 [r3748+8192], r3236; +st.shared.u32 [r3748+9216], r3273; +st.shared.u32 
[r3748+10240], r3310; +st.shared.u32 [r3748+11264], r3347; +st.shared.u32 [r3748+12288], r3384; +st.shared.u32 [r3748+13312], r3421; +st.shared.u32 [r3748+14336], r3458; +st.shared.u32 [r3748+15360], r3495; +barrier.sync 0; +mad.lo.s32 r3749, r3743, -60, r3748; +ld.shared.u32 r3524, [r3749]; +ld.shared.u32 r3574, [r3749+4096]; +ld.shared.u32 r3624, [r3749+8192]; +ld.shared.u32 r3674, [r3749+12288]; +ld.shared.u32 r3536, [r3749+16384]; +ld.shared.u32 r3586, [r3749+20480]; +ld.shared.u32 r3636, [r3749+24576]; +ld.shared.u32 r3686, [r3749+28672]; +ld.shared.u32 r3525, [r3749+32768]; +ld.shared.u32 r3575, [r3749+36864]; +ld.shared.u32 r3625, [r3749+40960]; +ld.shared.u32 r3675, [r3749+45056]; +ld.shared.u32 r3537, [r3749+49152]; +ld.shared.u32 r3587, [r3749+53248]; +ld.shared.u32 r3637, [r3749+57344]; +ld.shared.u32 r3687, [r3749+61440]; +barrier.sync 0; +st.shared.u32 [r3748], r2872; +st.shared.u32 [r3748+1024], r2984; +st.shared.u32 [r3748+2048], r3021; +st.shared.u32 [r3748+3072], r3058; +st.shared.u32 [r3748+4096], r3095; +st.shared.u32 [r3748+5120], r3132; +st.shared.u32 [r3748+6144], r3169; +st.shared.u32 [r3748+7168], r3206; +st.shared.u32 [r3748+8192], r3243; +st.shared.u32 [r3748+9216], r3280; +st.shared.u32 [r3748+10240], r3317; +st.shared.u32 [r3748+11264], r3354; +st.shared.u32 [r3748+12288], r3391; +st.shared.u32 [r3748+13312], r3428; +st.shared.u32 [r3748+14336], r3465; +st.shared.u32 [r3748+15360], r3502; +barrier.sync 0; +ld.shared.u32 r3527, [r3749]; +ld.shared.u32 r3577, [r3749+4096]; +ld.shared.u32 r3627, [r3749+8192]; +ld.shared.u32 r3677, [r3749+12288]; +ld.shared.u32 r3539, [r3749+16384]; +ld.shared.u32 r3589, [r3749+20480]; +ld.shared.u32 r3639, [r3749+24576]; +ld.shared.u32 r3689, [r3749+28672]; +ld.shared.u32 r3528, [r3749+32768]; +ld.shared.u32 r3578, [r3749+36864]; +ld.shared.u32 r3628, [r3749+40960]; +ld.shared.u32 r3678, [r3749+45056]; +ld.shared.u32 r3540, [r3749+49152]; +ld.shared.u32 r3590, [r3749+53248]; +ld.shared.u32 r3640, 
[r3749+57344]; +ld.shared.u32 r3690, [r3749+61440]; +{ +add.f16x2 r3523, r3524, r3525; +} +{ +add.f16x2 r3526, r3527, r3528; +} +{ +sub.f16x2 r3529, r3524, r3525; +} +{ +sub.f16x2 r3532, r3527, r3528; +} +{ +add.f16x2 r3535, r3536, r3537; +} +{ +add.f16x2 r3538, r3539, r3540; +} +{ +sub.f16x2 r3541, r3536, r3537; +} +{ +sub.f16x2 r3544, r3539, r3540; +} +{ +neg.f16x2 r3547, r3541; +} +{ +add.f16x2 %0, r3523, r3535; +} +{ +add.f16x2 %1, r3526, r3538; +} +{ +sub.f16x2 %16, r3523, r3535; +} +{ +sub.f16x2 %17, r3526, r3538; +} +{ +add.f16x2 %8, r3529, r3544; +} +{ +add.f16x2 %9, r3532, r3547; +} +{ +sub.f16x2 %24, r3529, r3544; +} +{ +sub.f16x2 %25, r3532, r3547; +} +{ +add.f16x2 r3573, r3574, r3575; +} +{ +add.f16x2 r3576, r3577, r3578; +} +{ +sub.f16x2 r3579, r3574, r3575; +} +{ +sub.f16x2 r3582, r3577, r3578; +} +{ +add.f16x2 r3585, r3586, r3587; +} +{ +add.f16x2 r3588, r3589, r3590; +} +{ +sub.f16x2 r3591, r3586, r3587; +} +{ +sub.f16x2 r3594, r3589, r3590; +} +{ +neg.f16x2 r3597, r3591; +} +{ +add.f16x2 %2, r3573, r3585; +} +{ +add.f16x2 %3, r3576, r3588; +} +{ +sub.f16x2 %18, r3573, r3585; +} +{ +sub.f16x2 %19, r3576, r3588; +} +{ +add.f16x2 %10, r3579, r3594; +} +{ +add.f16x2 %11, r3582, r3597; +} +{ +sub.f16x2 %26, r3579, r3594; +} +{ +sub.f16x2 %27, r3582, r3597; +} +{ +add.f16x2 r3623, r3624, r3625; +} +{ +add.f16x2 r3626, r3627, r3628; +} +{ +sub.f16x2 r3629, r3624, r3625; +} +{ +sub.f16x2 r3632, r3627, r3628; +} +{ +add.f16x2 r3635, r3636, r3637; +} +{ +add.f16x2 r3638, r3639, r3640; +} +{ +sub.f16x2 r3641, r3636, r3637; +} +{ +sub.f16x2 r3644, r3639, r3640; +} +{ +neg.f16x2 r3647, r3641; +} +{ +add.f16x2 %4, r3623, r3635; +} +{ +add.f16x2 %5, r3626, r3638; +} +{ +sub.f16x2 %20, r3623, r3635; +} +{ +sub.f16x2 %21, r3626, r3638; +} +{ +add.f16x2 %12, r3629, r3644; +} +{ +add.f16x2 %13, r3632, r3647; +} +{ +sub.f16x2 %28, r3629, r3644; +} +{ +sub.f16x2 %29, r3632, r3647; +} +{ +add.f16x2 r3673, r3674, r3675; +} +{ +add.f16x2 r3676, r3677, r3678; +} +{ 
+sub.f16x2 r3679, r3674, r3675; +} +{ +sub.f16x2 r3682, r3677, r3678; +} +{ +add.f16x2 r3685, r3686, r3687; +} +{ +add.f16x2 r3688, r3689, r3690; +} +{ +sub.f16x2 r3691, r3686, r3687; +} +{ +sub.f16x2 r3694, r3689, r3690; +} +{ +neg.f16x2 r3697, r3691; +} +{ +add.f16x2 %6, r3673, r3685; +} +{ +add.f16x2 %7, r3676, r3688; +} +{ +sub.f16x2 %22, r3673, r3685; +} +{ +sub.f16x2 %23, r3676, r3688; +} +{ +add.f16x2 %14, r3679, r3694; +} +{ +add.f16x2 %15, r3682, r3697; +} +{ +sub.f16x2 %30, r3679, r3694; +} +{ +sub.f16x2 %31, r3682, r3697; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1175, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3750>; +.reg .b64 rd<2>; +mov.u32 r3723, %tid.y; +shl.b32 r3724, r3723, 17; +mov.u32 r3725, %32; +add.s32 r3726, r3725, r3724; +mov.u32 r3727, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} 
+{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f448, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, 
r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ 
+add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, 
r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3728, r3727, 1023; +shl.b32 r3729, r3727, 7; +and.b32 r3730, r3729, -131072; +add.s32 r3731, r3726, r3730; +cvt.rn.f32.u32 f451, r3728; +mul.f32 f452, f451, 0f39C90FDB; +cos.approx.f32 f117, 
f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} 
+{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 
r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, 
r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} 
+{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 
r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r3732, r3729, 130944; +add.s32 r3733, r3731, r3732; +st.shared.v4.f32 [r3733], {r521, r524, r629, r636}; +st.shared.v4.f32 [r3733+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r3733+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r3733+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r3733+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r3733+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r3733+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r3733+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r3734, r3728, -120, r3733; +ld.shared.u32 r1176, [r3734]; +ld.shared.u32 r1179, [r3734+4]; +ld.shared.u32 r1372, [r3734+8192]; +ld.shared.u32 r1375, [r3734+8196]; +ld.shared.u32 r1226, [r3734+16384]; +ld.shared.u32 r1229, [r3734+16388]; +ld.shared.u32 r1422, [r3734+24576]; +ld.shared.u32 r1425, [r3734+24580]; +ld.shared.u32 r1188, [r3734+32768]; +ld.shared.u32 r1191, [r3734+32772]; +ld.shared.u32 r1384, [r3734+40960]; +ld.shared.u32 r1387, [r3734+40964]; +ld.shared.u32 r1238, [r3734+49152]; +ld.shared.u32 r1241, [r3734+49156]; +ld.shared.u32 r1434, [r3734+57344]; +ld.shared.u32 r1437, [r3734+57348]; +ld.shared.u32 r1177, [r3734+65536]; +ld.shared.u32 r1180, [r3734+65540]; +ld.shared.u32 r1373, [r3734+73728]; +ld.shared.u32 r1376, [r3734+73732]; +ld.shared.u32 r1227, [r3734+81920]; +ld.shared.u32 r1230, [r3734+81924]; +ld.shared.u32 r1423, [r3734+90112]; +ld.shared.u32 r1426, [r3734+90116]; +ld.shared.u32 r1189, [r3734+98304]; +ld.shared.u32 r1192, [r3734+98308]; +ld.shared.u32 r1385, [r3734+106496]; +ld.shared.u32 r1388, [r3734+106500]; +ld.shared.u32 r1239, [r3734+114688]; +ld.shared.u32 r1242, [r3734+114692]; 
+ld.shared.u32 r1435, [r3734+122880]; +ld.shared.u32 r1438, [r3734+122884]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; 
+} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, 
r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; 
+cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ 
+mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3735, r3727, 1008; +bfe.u32 r3736, r3727, 4, 6; +cvt.rn.f32.u32 f454, r3736; +mul.f32 f455, f454, 0f3BC90FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, 
r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; 
+mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ 
+mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; 
+mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, 
r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +shl.b32 r3737, r3727, 3; +and.b32 r3738, r3737, 120; +add.s32 r3739, r3731, r3738; +barrier.sync 0; +and.b32 r3740, r3729, 129024; +add.s32 r3741, r3739, r3740; +st.shared.u32 [r3741], r1695; +st.shared.u32 [r3741+4], r1698; +st.shared.u32 [r3741+128], r1803; +st.shared.u32 [r3741+132], r1810; +st.shared.u32 [r3741+256], r1840; +st.shared.u32 [r3741+260], r1847; +st.shared.u32 [r3741+384], r1877; +st.shared.u32 [r3741+388], r1884; +st.shared.u32 [r3741+512], r1914; +st.shared.u32 [r3741+516], r1921; +st.shared.u32 [r3741+640], r1951; +st.shared.u32 [r3741+644], r1958; +st.shared.u32 [r3741+768], r1988; +st.shared.u32 [r3741+772], r1995; +st.shared.u32 [r3741+896], r2025; +st.shared.u32 [r3741+900], r2032; +st.shared.u32 [r3741+1024], r2062; +st.shared.u32 [r3741+1028], r2069; +st.shared.u32 [r3741+1152], r2099; +st.shared.u32 [r3741+1156], r2106; +st.shared.u32 [r3741+1280], r2136; +st.shared.u32 [r3741+1284], r2143; +st.shared.u32 [r3741+1408], r2173; +st.shared.u32 [r3741+1412], r2180; +st.shared.u32 [r3741+1536], r2210; +st.shared.u32 [r3741+1540], r2217; +st.shared.u32 [r3741+1664], r2247; +st.shared.u32 [r3741+1668], r2254; +st.shared.u32 [r3741+1792], r2284; +st.shared.u32 [r3741+1796], r2291; +st.shared.u32 [r3741+1920], r2321; +st.shared.u32 [r3741+1924], r2328; +barrier.sync 0; +mad.lo.s32 r3742, r3735, -120, r3741; +ld.shared.u32 r2350, [r3742]; +ld.shared.u32 r2353, [r3742+4]; +ld.shared.u32 r2546, [r3742+8192]; +ld.shared.u32 r2549, [r3742+8196]; +ld.shared.u32 r2400, [r3742+16384]; +ld.shared.u32 r2403, [r3742+16388]; +ld.shared.u32 r2596, [r3742+24576]; +ld.shared.u32 r2599, [r3742+24580]; +ld.shared.u32 r2362, [r3742+32768]; +ld.shared.u32 r2365, 
[r3742+32772]; +ld.shared.u32 r2558, [r3742+40960]; +ld.shared.u32 r2561, [r3742+40964]; +ld.shared.u32 r2412, [r3742+49152]; +ld.shared.u32 r2415, [r3742+49156]; +ld.shared.u32 r2608, [r3742+57344]; +ld.shared.u32 r2611, [r3742+57348]; +ld.shared.u32 r2351, [r3742+65536]; +ld.shared.u32 r2354, [r3742+65540]; +ld.shared.u32 r2547, [r3742+73728]; +ld.shared.u32 r2550, [r3742+73732]; +ld.shared.u32 r2401, [r3742+81920]; +ld.shared.u32 r2404, [r3742+81924]; +ld.shared.u32 r2597, [r3742+90112]; +ld.shared.u32 r2600, [r3742+90116]; +ld.shared.u32 r2363, [r3742+98304]; +ld.shared.u32 r2366, [r3742+98308]; +ld.shared.u32 r2559, [r3742+106496]; +ld.shared.u32 r2562, [r3742+106500]; +ld.shared.u32 r2413, [r3742+114688]; +ld.shared.u32 r2416, [r3742+114692]; +ld.shared.u32 r2609, [r3742+122880]; +ld.shared.u32 r2612, [r3742+122884]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} 
+{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, 
r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ 
+sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ 
+sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2702; +} +{ +add.f16x2 r2920, r2506, r2819; +} +{ +sub.f16x2 r2923, r2503, r2702; +} +{ +sub.f16x2 r2926, r2506, r2819; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, 
r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3743, r3727, 768; +bfe.u32 r3744, r3727, 8, 2; +cvt.rn.f32.u32 f457, r3744; +mul.f32 f458, f457, 0f3DC90FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +neg.f16x2 r2975, r2972; +} +{ +fma.rn.f16x2 r2977, r2881, r2968, r2975; +} +{ +mul.f16x2 r2981, r2881, r2970; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +neg.f16x2 r3012, r3009; +} +{ +fma.rn.f16x2 r3014, r2893, r3005, r3012; +} +{ +mul.f16x2 r3018, r2893, r3007; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, 
high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +neg.f16x2 r3049, r3046; +} +{ +fma.rn.f16x2 r3051, r2905, r3042, r3049; +} +{ +mul.f16x2 r3055, r2905, r3044; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +neg.f16x2 r3086, r3083; +} +{ +fma.rn.f16x2 r3088, r2917, r3079, r3086; +} +{ +mul.f16x2 r3092, r2917, r3081; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +neg.f16x2 r3123, r3120; +} +{ +fma.rn.f16x2 r3125, r2929, r3116, r3123; +} +{ +mul.f16x2 r3129, r2929, r3118; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +neg.f16x2 r3160, r3157; +} +{ +fma.rn.f16x2 r3162, r2941, r3153, r3160; +} +{ +mul.f16x2 r3166, r2941, r3155; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +neg.f16x2 r3197, r3194; +} +{ +fma.rn.f16x2 r3199, r2953, r3190, r3197; +} +{ 
+mul.f16x2 r3203, r2953, r3192; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +neg.f16x2 r3234, r3231; +} +{ +fma.rn.f16x2 r3236, r2875, r3227, r3234; +} +{ +mul.f16x2 r3240, r2875, r3229; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +neg.f16x2 r3271, r3268; +} +{ +fma.rn.f16x2 r3273, r2887, r3264, r3271; +} +{ +mul.f16x2 r3277, r2887, r3266; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +neg.f16x2 r3308, r3305; +} +{ +fma.rn.f16x2 r3310, r2899, r3301, r3308; +} +{ +mul.f16x2 r3314, r2899, r3303; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +neg.f16x2 r3345, r3342; +} +{ +fma.rn.f16x2 r3347, r2911, r3338, r3345; +} +{ +mul.f16x2 r3351, r2911, r3340; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3351; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, 
low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +neg.f16x2 r3382, r3379; +} +{ +fma.rn.f16x2 r3384, r2923, r3375, r3382; +} +{ +mul.f16x2 r3388, r2923, r3377; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3388; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +neg.f16x2 r3419, r3416; +} +{ +fma.rn.f16x2 r3421, r2935, r3412, r3419; +} +{ +mul.f16x2 r3425, r2935, r3414; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3425; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, 
r3451; +} +{ +neg.f16x2 r3456, r3453; +} +{ +fma.rn.f16x2 r3458, r2947, r3449, r3456; +} +{ +mul.f16x2 r3462, r2947, r3451; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +neg.f16x2 r3493, r3490; +} +{ +fma.rn.f16x2 r3495, r2959, r3486, r3493; +} +{ +mul.f16x2 r3499, r2959, r3488; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3499; +} +and.b32 r3745, r3737, 2040; +add.s32 r3746, r3731, r3745; +barrier.sync 0; +and.b32 r3747, r3729, 98304; +add.s32 r3748, r3746, r3747; +st.shared.u32 [r3748], r2869; +st.shared.u32 [r3748+4], r2872; +st.shared.u32 [r3748+2048], r2977; +st.shared.u32 [r3748+2052], r2984; +st.shared.u32 [r3748+4096], r3014; +st.shared.u32 [r3748+4100], r3021; +st.shared.u32 [r3748+6144], r3051; +st.shared.u32 [r3748+6148], r3058; +st.shared.u32 [r3748+8192], r3088; +st.shared.u32 [r3748+8196], r3095; +st.shared.u32 [r3748+10240], r3125; +st.shared.u32 [r3748+10244], r3132; +st.shared.u32 [r3748+12288], r3162; +st.shared.u32 [r3748+12292], r3169; +st.shared.u32 [r3748+14336], r3199; +st.shared.u32 [r3748+14340], r3206; +st.shared.u32 [r3748+16384], r3236; +st.shared.u32 [r3748+16388], r3243; +st.shared.u32 [r3748+18432], r3273; +st.shared.u32 [r3748+18436], r3280; +st.shared.u32 [r3748+20480], r3310; +st.shared.u32 [r3748+20484], r3317; +st.shared.u32 
[r3748+22528], r3347; +st.shared.u32 [r3748+22532], r3354; +st.shared.u32 [r3748+24576], r3384; +st.shared.u32 [r3748+24580], r3391; +st.shared.u32 [r3748+26624], r3421; +st.shared.u32 [r3748+26628], r3428; +st.shared.u32 [r3748+28672], r3458; +st.shared.u32 [r3748+28676], r3465; +st.shared.u32 [r3748+30720], r3495; +st.shared.u32 [r3748+30724], r3502; +barrier.sync 0; +mad.lo.s32 r3749, r3743, -120, r3748; +ld.shared.u32 r3524, [r3749]; +ld.shared.u32 r3527, [r3749+4]; +ld.shared.u32 r3574, [r3749+8192]; +ld.shared.u32 r3577, [r3749+8196]; +ld.shared.u32 r3624, [r3749+16384]; +ld.shared.u32 r3627, [r3749+16388]; +ld.shared.u32 r3674, [r3749+24576]; +ld.shared.u32 r3677, [r3749+24580]; +ld.shared.u32 r3536, [r3749+32768]; +ld.shared.u32 r3539, [r3749+32772]; +ld.shared.u32 r3586, [r3749+40960]; +ld.shared.u32 r3589, [r3749+40964]; +ld.shared.u32 r3636, [r3749+49152]; +ld.shared.u32 r3639, [r3749+49156]; +ld.shared.u32 r3686, [r3749+57344]; +ld.shared.u32 r3689, [r3749+57348]; +ld.shared.u32 r3525, [r3749+65536]; +ld.shared.u32 r3528, [r3749+65540]; +ld.shared.u32 r3575, [r3749+73728]; +ld.shared.u32 r3578, [r3749+73732]; +ld.shared.u32 r3625, [r3749+81920]; +ld.shared.u32 r3628, [r3749+81924]; +ld.shared.u32 r3675, [r3749+90112]; +ld.shared.u32 r3678, [r3749+90116]; +ld.shared.u32 r3537, [r3749+98304]; +ld.shared.u32 r3540, [r3749+98308]; +ld.shared.u32 r3587, [r3749+106496]; +ld.shared.u32 r3590, [r3749+106500]; +ld.shared.u32 r3637, [r3749+114688]; +ld.shared.u32 r3640, [r3749+114692]; +ld.shared.u32 r3687, [r3749+122880]; +ld.shared.u32 r3690, [r3749+122884]; +{ +add.f16x2 r3523, r3524, r3525; +} +{ +add.f16x2 r3526, r3527, r3528; +} +{ +sub.f16x2 r3529, r3524, r3525; +} +{ +sub.f16x2 r3532, r3527, r3528; +} +{ +add.f16x2 r3535, r3536, r3537; +} +{ +add.f16x2 r3538, r3539, r3540; +} +{ +sub.f16x2 r3541, r3536, r3537; +} +{ +sub.f16x2 r3544, r3539, r3540; +} +{ +neg.f16x2 r3547, r3541; +} +{ +add.f16x2 %0, r3523, r3535; +} +{ +add.f16x2 %1, r3526, r3538; +} +{ 
+sub.f16x2 %16, r3523, r3535; +} +{ +sub.f16x2 %17, r3526, r3538; +} +{ +add.f16x2 %8, r3529, r3544; +} +{ +add.f16x2 %9, r3532, r3547; +} +{ +sub.f16x2 %24, r3529, r3544; +} +{ +sub.f16x2 %25, r3532, r3547; +} +{ +add.f16x2 r3573, r3574, r3575; +} +{ +add.f16x2 r3576, r3577, r3578; +} +{ +sub.f16x2 r3579, r3574, r3575; +} +{ +sub.f16x2 r3582, r3577, r3578; +} +{ +add.f16x2 r3585, r3586, r3587; +} +{ +add.f16x2 r3588, r3589, r3590; +} +{ +sub.f16x2 r3591, r3586, r3587; +} +{ +sub.f16x2 r3594, r3589, r3590; +} +{ +neg.f16x2 r3597, r3591; +} +{ +add.f16x2 %2, r3573, r3585; +} +{ +add.f16x2 %3, r3576, r3588; +} +{ +sub.f16x2 %18, r3573, r3585; +} +{ +sub.f16x2 %19, r3576, r3588; +} +{ +add.f16x2 %10, r3579, r3594; +} +{ +add.f16x2 %11, r3582, r3597; +} +{ +sub.f16x2 %26, r3579, r3594; +} +{ +sub.f16x2 %27, r3582, r3597; +} +{ +add.f16x2 r3623, r3624, r3625; +} +{ +add.f16x2 r3626, r3627, r3628; +} +{ +sub.f16x2 r3629, r3624, r3625; +} +{ +sub.f16x2 r3632, r3627, r3628; +} +{ +add.f16x2 r3635, r3636, r3637; +} +{ +add.f16x2 r3638, r3639, r3640; +} +{ +sub.f16x2 r3641, r3636, r3637; +} +{ +sub.f16x2 r3644, r3639, r3640; +} +{ +neg.f16x2 r3647, r3641; +} +{ +add.f16x2 %4, r3623, r3635; +} +{ +add.f16x2 %5, r3626, r3638; +} +{ +sub.f16x2 %20, r3623, r3635; +} +{ +sub.f16x2 %21, r3626, r3638; +} +{ +add.f16x2 %12, r3629, r3644; +} +{ +add.f16x2 %13, r3632, r3647; +} +{ +sub.f16x2 %28, r3629, r3644; +} +{ +sub.f16x2 %29, r3632, r3647; +} +{ +add.f16x2 r3673, r3674, r3675; +} +{ +add.f16x2 r3676, r3677, r3678; +} +{ +sub.f16x2 r3679, r3674, r3675; +} +{ +sub.f16x2 r3682, r3677, r3678; +} +{ +add.f16x2 r3685, r3686, r3687; +} +{ +add.f16x2 r3688, r3689, r3690; +} +{ +sub.f16x2 r3691, r3686, r3687; +} +{ +sub.f16x2 r3694, r3689, r3690; +} +{ +neg.f16x2 r3697, r3691; +} +{ +add.f16x2 %6, r3673, r3685; +} +{ +add.f16x2 %7, r3676, r3688; +} +{ +sub.f16x2 %22, r3673, r3685; +} +{ +sub.f16x2 %23, r3676, r3688; +} +{ +add.f16x2 %14, r3679, r3694; +} +{ +add.f16x2 %15, r3682, r3697; 
+} +{ +sub.f16x2 %30, r3679, r3694; +} +{ +sub.f16x2 %31, r3682, r3697; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), 
"r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<862, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1085>; +.reg .b32 r<7042>; +.reg .b64 rd<3>; +mov.u32 r6957, %tid.y; +shl.b32 r6958, r6957, 16; +mov.u32 r6959, %64; +add.s32 r6960, r6959, r6958; +mov.u32 r6961, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f1022, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r101, {low, high}; +} +mov.f32 f1040, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r105, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ 
+sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1018, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r393, {low, high}; +} +mov.f32 f1044, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r395, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r396, {low, high}; +} +mov.f32 f1026, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r397, {low, high}; +} +mov.f32 f1042, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ 
+fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ 
+sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ 
+sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, 
r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 
r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, 
r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} 
+{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ 
+mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} 
+{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6963, r6961, 7; +and.b32 r6964, r6963, -65536; +add.s32 r6965, r6960, r6964; +and.b32 r6977, r6961, 511; +cvt.rn.f32.u32 f1077, r6977; +mul.f32 f1078, f1077, 0f39C90FDB; +cos.approx.f32 f357, f1078; +sin.approx.f32 f1079, f1078; +neg.f32 f358, f1079; +mov.f32 f1084, 0f3F800000; +mov.f32 f1083, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, 
r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, 
r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 
high, f1084; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 
r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, 
{high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2591, {low, high}; +} +{ 
+mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 
r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6966, r6963, 65408; +add.s32 r6967, r6965, r6966; +st.shared.v4.f32 [r6967], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r6967+16], {r1836, r1873, r1910, 
r1947}; +st.shared.v4.f32 [r6967+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r6967+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r6967+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r6967+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r6967+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r6967+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r6968, r6977, -124, r6967; +ld.shared.u32 r2864, [r6968]; +ld.shared.u32 r3480, [r6968+2048]; +ld.shared.u32 r3060, [r6968+4096]; +ld.shared.u32 r3676, [r6968+6144]; +ld.shared.u32 r2914, [r6968+8192]; +ld.shared.u32 r3530, [r6968+10240]; +ld.shared.u32 r3110, [r6968+12288]; +ld.shared.u32 r3726, [r6968+14336]; +ld.shared.u32 r2876, [r6968+16384]; +ld.shared.u32 r3492, [r6968+18432]; +ld.shared.u32 r3072, [r6968+20480]; +ld.shared.u32 r3688, [r6968+22528]; +ld.shared.u32 r2926, [r6968+24576]; +ld.shared.u32 r3542, [r6968+26624]; +ld.shared.u32 r3122, [r6968+28672]; +ld.shared.u32 r3738, [r6968+30720]; +ld.shared.u32 r2865, [r6968+32768]; +ld.shared.u32 r3481, [r6968+34816]; +ld.shared.u32 r3061, [r6968+36864]; +ld.shared.u32 r3677, [r6968+38912]; +ld.shared.u32 r2915, [r6968+40960]; +ld.shared.u32 r3531, [r6968+43008]; +ld.shared.u32 r3111, [r6968+45056]; +ld.shared.u32 r3727, [r6968+47104]; +ld.shared.u32 r2877, [r6968+49152]; +ld.shared.u32 r3493, [r6968+51200]; +ld.shared.u32 r3073, [r6968+53248]; +ld.shared.u32 r3689, [r6968+55296]; +ld.shared.u32 r2927, [r6968+57344]; +ld.shared.u32 r3543, [r6968+59392]; +ld.shared.u32 r3123, [r6968+61440]; +ld.shared.u32 r3739, [r6968+63488]; +barrier.sync 0; +st.shared.v4.f32 [r6967], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6967+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6967+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6967+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6967+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6967+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 
[r6967+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6967+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6968]; +ld.shared.u32 r3483, [r6968+2048]; +ld.shared.u32 r3063, [r6968+4096]; +ld.shared.u32 r3679, [r6968+6144]; +ld.shared.u32 r2917, [r6968+8192]; +ld.shared.u32 r3533, [r6968+10240]; +ld.shared.u32 r3113, [r6968+12288]; +ld.shared.u32 r3729, [r6968+14336]; +ld.shared.u32 r2879, [r6968+16384]; +ld.shared.u32 r3495, [r6968+18432]; +ld.shared.u32 r3075, [r6968+20480]; +ld.shared.u32 r3691, [r6968+22528]; +ld.shared.u32 r2929, [r6968+24576]; +ld.shared.u32 r3545, [r6968+26624]; +ld.shared.u32 r3125, [r6968+28672]; +ld.shared.u32 r3741, [r6968+30720]; +ld.shared.u32 r2868, [r6968+32768]; +ld.shared.u32 r3484, [r6968+34816]; +ld.shared.u32 r3064, [r6968+36864]; +ld.shared.u32 r3680, [r6968+38912]; +ld.shared.u32 r2918, [r6968+40960]; +ld.shared.u32 r3534, [r6968+43008]; +ld.shared.u32 r3114, [r6968+45056]; +ld.shared.u32 r3730, [r6968+47104]; +ld.shared.u32 r2880, [r6968+49152]; +ld.shared.u32 r3496, [r6968+51200]; +ld.shared.u32 r3076, [r6968+53248]; +ld.shared.u32 r3692, [r6968+55296]; +ld.shared.u32 r2930, [r6968+57344]; +ld.shared.u32 r3546, [r6968+59392]; +ld.shared.u32 r3126, [r6968+61440]; +ld.shared.u32 r3742, [r6968+63488]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, 
r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ 
+sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, 
r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3266, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 
r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, 
r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} 
+{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ 
+mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, 
r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, 
r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} 
+{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 
r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6969, r6961, 480; +bfe.u32 r6970, r6961, 5, 4; +shl.b32 r6971, r6961, 2; +and.b32 r6972, r6971, 124; +add.s32 r6973, r6965, r6972; +cvt.rn.f32.u32 f1080, r6970; +mul.f32 f1081, f1080, 0f3C490FDB; +cos.approx.f32 f779, f1081; +sin.approx.f32 f1082, f1081; +neg.f32 f780, f1082; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ 
+fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; 
+mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5046, {low, 
high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ 
+mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, 
{high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, 
r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ 
+mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +barrier.sync 0; +and.b32 r6974, r6963, 61440; +add.s32 r6975, r6973, r6974; +st.shared.u32 [r6975], r4383; +st.shared.u32 [r6975+128], r4587; +st.shared.u32 [r6975+256], r4624; +st.shared.u32 [r6975+384], r4661; +st.shared.u32 [r6975+512], r4698; +st.shared.u32 [r6975+640], r4735; +st.shared.u32 [r6975+768], r4772; +st.shared.u32 [r6975+896], r4809; +st.shared.u32 [r6975+1024], r4846; +st.shared.u32 [r6975+1152], r4883; +st.shared.u32 [r6975+1280], r4920; +st.shared.u32 [r6975+1408], r4957; +st.shared.u32 [r6975+1536], r4994; +st.shared.u32 [r6975+1664], r5031; +st.shared.u32 
[r6975+1792], r5068; +st.shared.u32 [r6975+1920], r5105; +st.shared.u32 [r6975+2048], r5142; +st.shared.u32 [r6975+2176], r5179; +st.shared.u32 [r6975+2304], r5216; +st.shared.u32 [r6975+2432], r5253; +st.shared.u32 [r6975+2560], r5290; +st.shared.u32 [r6975+2688], r5327; +st.shared.u32 [r6975+2816], r5364; +st.shared.u32 [r6975+2944], r5401; +st.shared.u32 [r6975+3072], r5438; +st.shared.u32 [r6975+3200], r5475; +st.shared.u32 [r6975+3328], r5512; +st.shared.u32 [r6975+3456], r5549; +st.shared.u32 [r6975+3584], r5586; +st.shared.u32 [r6975+3712], r5623; +st.shared.u32 [r6975+3840], r5660; +st.shared.u32 [r6975+3968], r5697; +barrier.sync 0; +mad.lo.s32 r6976, r6969, -124, r6975; +ld.shared.u32 r5726, [r6976]; +ld.shared.u32 r6342, [r6976+2048]; +ld.shared.u32 r5922, [r6976+4096]; +ld.shared.u32 r6538, [r6976+6144]; +ld.shared.u32 r5776, [r6976+8192]; +ld.shared.u32 r6392, [r6976+10240]; +ld.shared.u32 r5972, [r6976+12288]; +ld.shared.u32 r6588, [r6976+14336]; +ld.shared.u32 r5738, [r6976+16384]; +ld.shared.u32 r6354, [r6976+18432]; +ld.shared.u32 r5934, [r6976+20480]; +ld.shared.u32 r6550, [r6976+22528]; +ld.shared.u32 r5788, [r6976+24576]; +ld.shared.u32 r6404, [r6976+26624]; +ld.shared.u32 r5984, [r6976+28672]; +ld.shared.u32 r6600, [r6976+30720]; +ld.shared.u32 r5727, [r6976+32768]; +ld.shared.u32 r6343, [r6976+34816]; +ld.shared.u32 r5923, [r6976+36864]; +ld.shared.u32 r6539, [r6976+38912]; +ld.shared.u32 r5777, [r6976+40960]; +ld.shared.u32 r6393, [r6976+43008]; +ld.shared.u32 r5973, [r6976+45056]; +ld.shared.u32 r6589, [r6976+47104]; +ld.shared.u32 r5739, [r6976+49152]; +ld.shared.u32 r6355, [r6976+51200]; +ld.shared.u32 r5935, [r6976+53248]; +ld.shared.u32 r6551, [r6976+55296]; +ld.shared.u32 r5789, [r6976+57344]; +ld.shared.u32 r6405, [r6976+59392]; +ld.shared.u32 r5985, [r6976+61440]; +ld.shared.u32 r6601, [r6976+63488]; +barrier.sync 0; +st.shared.u32 [r6975], r4386; +st.shared.u32 [r6975+128], r4594; +st.shared.u32 [r6975+256], r4631; +st.shared.u32 
[r6975+384], r4668; +st.shared.u32 [r6975+512], r4705; +st.shared.u32 [r6975+640], r4742; +st.shared.u32 [r6975+768], r4779; +st.shared.u32 [r6975+896], r4816; +st.shared.u32 [r6975+1024], r4853; +st.shared.u32 [r6975+1152], r4890; +st.shared.u32 [r6975+1280], r4927; +st.shared.u32 [r6975+1408], r4964; +st.shared.u32 [r6975+1536], r5001; +st.shared.u32 [r6975+1664], r5038; +st.shared.u32 [r6975+1792], r5075; +st.shared.u32 [r6975+1920], r5112; +st.shared.u32 [r6975+2048], r5149; +st.shared.u32 [r6975+2176], r5186; +st.shared.u32 [r6975+2304], r5223; +st.shared.u32 [r6975+2432], r5260; +st.shared.u32 [r6975+2560], r5297; +st.shared.u32 [r6975+2688], r5334; +st.shared.u32 [r6975+2816], r5371; +st.shared.u32 [r6975+2944], r5408; +st.shared.u32 [r6975+3072], r5445; +st.shared.u32 [r6975+3200], r5482; +st.shared.u32 [r6975+3328], r5519; +st.shared.u32 [r6975+3456], r5556; +st.shared.u32 [r6975+3584], r5593; +st.shared.u32 [r6975+3712], r5630; +st.shared.u32 [r6975+3840], r5667; +st.shared.u32 [r6975+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6976]; +ld.shared.u32 r6345, [r6976+2048]; +ld.shared.u32 r5925, [r6976+4096]; +ld.shared.u32 r6541, [r6976+6144]; +ld.shared.u32 r5779, [r6976+8192]; +ld.shared.u32 r6395, [r6976+10240]; +ld.shared.u32 r5975, [r6976+12288]; +ld.shared.u32 r6591, [r6976+14336]; +ld.shared.u32 r5741, [r6976+16384]; +ld.shared.u32 r6357, [r6976+18432]; +ld.shared.u32 r5937, [r6976+20480]; +ld.shared.u32 r6553, [r6976+22528]; +ld.shared.u32 r5791, [r6976+24576]; +ld.shared.u32 r6407, [r6976+26624]; +ld.shared.u32 r5987, [r6976+28672]; +ld.shared.u32 r6603, [r6976+30720]; +ld.shared.u32 r5730, [r6976+32768]; +ld.shared.u32 r6346, [r6976+34816]; +ld.shared.u32 r5926, [r6976+36864]; +ld.shared.u32 r6542, [r6976+38912]; +ld.shared.u32 r5780, [r6976+40960]; +ld.shared.u32 r6396, [r6976+43008]; +ld.shared.u32 r5976, [r6976+45056]; +ld.shared.u32 r6592, [r6976+47104]; +ld.shared.u32 r5742, [r6976+49152]; +ld.shared.u32 r6358, [r6976+51200]; 
+ld.shared.u32 r5938, [r6976+53248]; +ld.shared.u32 r6554, [r6976+55296]; +ld.shared.u32 r5792, [r6976+57344]; +ld.shared.u32 r6408, [r6976+59392]; +ld.shared.u32 r5988, [r6976+61440]; +ld.shared.u32 r6604, [r6976+63488]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5746; +} +{ +add.f16x2 r5766, r5734, r5749; +} +{ +sub.f16x2 r5769, r5731, r5746; +} +{ +sub.f16x2 r5772, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5796; +} +{ +add.f16x2 r5816, r5784, r5799; +} +{ +sub.f16x2 r5819, r5781, r5796; +} +{ +sub.f16x2 r5822, r5784, r5799; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ 
+mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5807; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} +{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5810; +} +{ +add.f16x2 r5900, r5760, r5855; +} +{ +sub.f16x2 r5903, r5757, r5810; +} +{ +sub.f16x2 r5906, r5760, r5855; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5939; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5942; +} +{ +add.f16x2 r5962, r5930, r5945; +} +{ +sub.f16x2 r5965, r5927, r5942; +} +{ +sub.f16x2 r5968, r5930, r5945; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5989; +} +{ +add.f16x2 r5997, r5971, 
r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5992; +} +{ +add.f16x2 r6012, r5980, r5995; +} +{ +sub.f16x2 r6015, r5977, r5992; +} +{ +sub.f16x2 r6018, r5980, r5995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6003; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ +sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6006; +} +{ +add.f16x2 r6096, r5956, r6051; +} +{ +sub.f16x2 r6099, r5953, r6006; +} +{ +sub.f16x2 r6102, r5956, r6051; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6118, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ +fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6075; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ +fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, 
r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 %0, r5873, r6069; +} +{ +add.f16x2 %1, r5876, r6072; +} +{ +sub.f16x2 %32, r5873, r6069; +} +{ +sub.f16x2 %33, r5876, r6072; +} +{ +add.f16x2 %4, r5885, r6153; +} +{ +add.f16x2 %5, r5888, r6159; +} +{ +sub.f16x2 %36, r5885, r6153; +} +{ +sub.f16x2 %37, r5888, r6159; +} +{ +add.f16x2 %8, r5897, r6169; +} +{ +add.f16x2 %9, r5900, r6175; +} +{ +sub.f16x2 %40, r5897, r6169; +} +{ +sub.f16x2 %41, r5900, r6175; +} +{ +add.f16x2 %12, r5909, r6185; +} +{ +add.f16x2 %13, r5912, r6191; +} +{ +sub.f16x2 %44, r5909, r6185; +} +{ +sub.f16x2 %45, r5912, r6191; +} +{ +add.f16x2 %16, r5879, r6078; +} +{ +add.f16x2 %17, r5882, r6195; +} +{ +sub.f16x2 %48, r5879, r6078; +} +{ +sub.f16x2 %49, r5882, r6195; +} +{ +add.f16x2 %20, r5891, r6203; +} +{ +add.f16x2 %21, r5894, r6209; +} +{ +sub.f16x2 %52, r5891, r6203; +} +{ +sub.f16x2 %53, r5894, r6209; +} +{ +add.f16x2 %24, r5903, r6219; +} +{ +add.f16x2 %25, r5906, r6225; +} +{ +sub.f16x2 %56, r5903, r6219; +} +{ +sub.f16x2 %57, r5906, r6225; +} +{ +add.f16x2 %28, r5915, r6235; +} +{ +add.f16x2 %29, r5918, r6241; +} +{ +sub.f16x2 %60, r5915, r6235; +} +{ +sub.f16x2 %61, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6359; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; +} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, 
r6356; +} +{ +add.f16x2 r6379, r6347, r6362; +} +{ +add.f16x2 r6382, r6350, r6365; +} +{ +sub.f16x2 r6385, r6347, r6362; +} +{ +sub.f16x2 r6388, r6350, r6365; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6409; +} +{ +add.f16x2 r6417, r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6412; +} +{ +add.f16x2 r6432, r6400, r6415; +} +{ +sub.f16x2 r6435, r6397, r6412; +} +{ +sub.f16x2 r6438, r6400, r6415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6423; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ 
+add.f16x2 r6513, r6373, r6426; +} +{ +add.f16x2 r6516, r6376, r6471; +} +{ +sub.f16x2 r6519, r6373, r6426; +} +{ +sub.f16x2 r6522, r6376, r6471; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, r6553, r6554; +} +{ +neg.f16x2 r6561, r6555; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6558; +} +{ +add.f16x2 r6578, r6546, r6561; +} +{ +sub.f16x2 r6581, r6543, r6558; +} +{ +sub.f16x2 r6584, r6546, r6561; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6605; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6608; +} +{ +add.f16x2 r6628, r6596, r6611; +} +{ +sub.f16x2 r6631, r6593, r6608; +} +{ +sub.f16x2 r6634, r6596, r6611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, 
f1040; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6619; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ +sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6622; +} +{ +add.f16x2 r6712, r6572, r6667; +} +{ +sub.f16x2 r6715, r6569, r6622; +} +{ +sub.f16x2 r6718, r6572, r6667; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1018; +cvt.rn.f16.f32 high, f1018; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1022; +cvt.rn.f16.f32 high, f1022; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1026; +cvt.rn.f16.f32 high, f1026; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6742, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} +{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6691; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 %2, r6489, r6685; +} +{ +add.f16x2 %3, r6492, r6688; +} +{ +sub.f16x2 %34, r6489, r6685; +} +{ +sub.f16x2 %35, r6492, r6688; +} +{ +add.f16x2 %6, r6501, r6769; +} +{ +add.f16x2 %7, r6504, r6775; +} +{ +sub.f16x2 %38, r6501, r6769; +} +{ +sub.f16x2 %39, r6504, r6775; +} +{ +add.f16x2 %10, r6513, r6785; +} 
+{ +add.f16x2 %11, r6516, r6791; +} +{ +sub.f16x2 %42, r6513, r6785; +} +{ +sub.f16x2 %43, r6516, r6791; +} +{ +add.f16x2 %14, r6525, r6801; +} +{ +add.f16x2 %15, r6528, r6807; +} +{ +sub.f16x2 %46, r6525, r6801; +} +{ +sub.f16x2 %47, r6528, r6807; +} +{ +add.f16x2 %18, r6495, r6694; +} +{ +add.f16x2 %19, r6498, r6811; +} +{ +sub.f16x2 %50, r6495, r6694; +} +{ +sub.f16x2 %51, r6498, r6811; +} +{ +add.f16x2 %22, r6507, r6819; +} +{ +add.f16x2 %23, r6510, r6825; +} +{ +sub.f16x2 %54, r6507, r6819; +} +{ +sub.f16x2 %55, r6510, r6825; +} +{ +add.f16x2 %26, r6519, r6835; +} +{ +add.f16x2 %27, r6522, r6841; +} +{ +sub.f16x2 %58, r6519, r6835; +} +{ +sub.f16x2 %59, r6522, r6841; +} +{ +add.f16x2 %30, r6531, r6851; +} +{ +add.f16x2 %31, r6534, r6857; +} +{ +sub.f16x2 %62, r6531, r6851; +} +{ +sub.f16x2 %63, r6534, r6857; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), 
"=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), 
"r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..126672d39de5c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp16_inv.hpp.inc @@ -0,0 +1,27410 @@ +#ifndef CUFFTDX_FFT_16384_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_16384_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1180, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1085>; +.reg .b32 r<7043>; +.reg .b64 rd<3>; +mov.u32 r6957, %tid.y; +shl.b32 r6958, r6957, 17; +mov.u32 r6959, %64; +add.s32 r6960, r6959, r6958; +mov.u32 r6961, %tid.x; +{ +add.f16x2 r1, %119, 
%111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %100; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %100; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f1040, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r102, {low, high}; +} +mov.f32 f1038, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ 
+add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r301, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1036, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r393, {low, high}; +} +mov.f32 f1044, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r398, {low, high}; +} +mov.f32 f1034, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; 
+cvt.rn.f16.f32 high, f1036; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r404, {low, high}; +} +mov.f32 f1042, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ 
+add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %101; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %101; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} 
+{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 
r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; 
+mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, 
r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, 
r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, 
r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6963, r6961, 8; +and.b32 r6964, r6963, -131072; +add.s32 r6965, r6960, r6964; +and.b32 r6978, r6961, 511; +cvt.rn.f32.u32 f1077, r6978; +mul.f32 f1078, f1077, 0f39C90FDB; +cos.approx.f32 f357, f1078; +sin.approx.f32 f1079, f1078; +neg.f32 f358, f1079; +mov.f32 f1084, 0f3F800000; +mov.f32 f1083, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1851, 
{low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, 
r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; 
+mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ 
+mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, 
r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 
r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, 
r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ 
+fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6966, r6963, 130816; +add.s32 r6967, r6965, r6966; +st.shared.v4.f32 [r6967], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r6967+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r6967+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r6967+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r6967+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r6967+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r6967+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r6967+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r6967+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r6967+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r6967+160], 
{r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r6967+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r6967+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r6967+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r6967+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r6967+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r6968, r6978, -248, r6967; +ld.shared.u32 r2864, [r6968]; +ld.shared.u32 r2867, [r6968+4]; +ld.shared.u32 r3480, [r6968+4096]; +ld.shared.u32 r3483, [r6968+4100]; +ld.shared.u32 r3060, [r6968+8192]; +ld.shared.u32 r3063, [r6968+8196]; +ld.shared.u32 r3676, [r6968+12288]; +ld.shared.u32 r3679, [r6968+12292]; +ld.shared.u32 r2914, [r6968+16384]; +ld.shared.u32 r2917, [r6968+16388]; +ld.shared.u32 r3530, [r6968+20480]; +ld.shared.u32 r3533, [r6968+20484]; +ld.shared.u32 r3110, [r6968+24576]; +ld.shared.u32 r3113, [r6968+24580]; +ld.shared.u32 r3726, [r6968+28672]; +ld.shared.u32 r3729, [r6968+28676]; +ld.shared.u32 r2876, [r6968+32768]; +ld.shared.u32 r2879, [r6968+32772]; +ld.shared.u32 r3492, [r6968+36864]; +ld.shared.u32 r3495, [r6968+36868]; +ld.shared.u32 r3072, [r6968+40960]; +ld.shared.u32 r3075, [r6968+40964]; +ld.shared.u32 r3688, [r6968+45056]; +ld.shared.u32 r3691, [r6968+45060]; +ld.shared.u32 r2926, [r6968+49152]; +ld.shared.u32 r2929, [r6968+49156]; +ld.shared.u32 r3542, [r6968+53248]; +ld.shared.u32 r3545, [r6968+53252]; +ld.shared.u32 r3122, [r6968+57344]; +ld.shared.u32 r3125, [r6968+57348]; +ld.shared.u32 r3738, [r6968+61440]; +ld.shared.u32 r3741, [r6968+61444]; +ld.shared.u32 r2865, [r6968+65536]; +ld.shared.u32 r2868, [r6968+65540]; +ld.shared.u32 r3481, [r6968+69632]; +ld.shared.u32 r3484, [r6968+69636]; +ld.shared.u32 r3061, [r6968+73728]; +ld.shared.u32 r3064, [r6968+73732]; +ld.shared.u32 r3677, [r6968+77824]; +ld.shared.u32 r3680, [r6968+77828]; +ld.shared.u32 r2915, [r6968+81920]; +ld.shared.u32 r2918, [r6968+81924]; +ld.shared.u32 r3531, [r6968+86016]; +ld.shared.u32 r3534, 
[r6968+86020]; +ld.shared.u32 r3111, [r6968+90112]; +ld.shared.u32 r3114, [r6968+90116]; +ld.shared.u32 r3727, [r6968+94208]; +ld.shared.u32 r3730, [r6968+94212]; +ld.shared.u32 r2877, [r6968+98304]; +ld.shared.u32 r2880, [r6968+98308]; +ld.shared.u32 r3493, [r6968+102400]; +ld.shared.u32 r3496, [r6968+102404]; +ld.shared.u32 r3073, [r6968+106496]; +ld.shared.u32 r3076, [r6968+106500]; +ld.shared.u32 r3689, [r6968+110592]; +ld.shared.u32 r3692, [r6968+110596]; +ld.shared.u32 r2927, [r6968+114688]; +ld.shared.u32 r2930, [r6968+114692]; +ld.shared.u32 r3543, [r6968+118784]; +ld.shared.u32 r3546, [r6968+118788]; +ld.shared.u32 r3123, [r6968+122880]; +ld.shared.u32 r3126, [r6968+122884]; +ld.shared.u32 r3739, [r6968+126976]; +ld.shared.u32 r3742, [r6968+126980]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, 
r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ 
+sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, 
r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; 
+} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 
r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 
r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} 
+{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; 
+mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; 
+} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4118, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 
r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ 
+add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6970, r6961, 5, 4; +cvt.rn.f32.u32 f1080, r6970; 
+mul.f32 f1081, f1080, 0f3C490FDB; +cos.approx.f32 f779, f1081; +sin.approx.f32 f1082, f1081; +neg.f32 f780, f1082; +and.b32 r6977, r6961, 480; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} 
+{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, 
{high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5157, {low, high}; +} +{ 
+mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ 
+neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, 
low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, 
r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +shl.b32 r6971, r6961, 3; +and.b32 r6972, r6971, 248; +add.s32 r6973, r6965, r6972; +barrier.sync 0; +and.b32 r6974, r6963, 122880; +add.s32 r6975, r6973, r6974; +st.shared.u32 [r6975], r4383; +st.shared.u32 [r6975+4], r4386; +st.shared.u32 [r6975+256], r4585; +st.shared.u32 [r6975+260], r4594; +st.shared.u32 [r6975+512], r4622; +st.shared.u32 [r6975+516], r4631; +st.shared.u32 [r6975+768], r4659; +st.shared.u32 [r6975+772], r4668; +st.shared.u32 [r6975+1024], r4696; +st.shared.u32 [r6975+1028], r4705; +st.shared.u32 [r6975+1280], r4733; +st.shared.u32 [r6975+1284], r4742; +st.shared.u32 [r6975+1536], r4770; +st.shared.u32 [r6975+1540], r4779; +st.shared.u32 [r6975+1792], r4807; +st.shared.u32 [r6975+1796], r4816; +st.shared.u32 [r6975+2048], r4844; +st.shared.u32 [r6975+2052], r4853; +st.shared.u32 [r6975+2304], r4881; +st.shared.u32 [r6975+2308], r4890; +st.shared.u32 [r6975+2560], r4918; +st.shared.u32 [r6975+2564], r4927; +st.shared.u32 [r6975+2816], r4955; +st.shared.u32 [r6975+2820], r4964; +st.shared.u32 [r6975+3072], r4992; +st.shared.u32 [r6975+3076], r5001; +st.shared.u32 [r6975+3328], r5029; +st.shared.u32 [r6975+3332], r5038; +st.shared.u32 [r6975+3584], r5066; +st.shared.u32 
[r6975+3588], r5075; +st.shared.u32 [r6975+3840], r5103; +st.shared.u32 [r6975+3844], r5112; +st.shared.u32 [r6975+4096], r5140; +st.shared.u32 [r6975+4100], r5149; +st.shared.u32 [r6975+4352], r5177; +st.shared.u32 [r6975+4356], r5186; +st.shared.u32 [r6975+4608], r5214; +st.shared.u32 [r6975+4612], r5223; +st.shared.u32 [r6975+4864], r5251; +st.shared.u32 [r6975+4868], r5260; +st.shared.u32 [r6975+5120], r5288; +st.shared.u32 [r6975+5124], r5297; +st.shared.u32 [r6975+5376], r5325; +st.shared.u32 [r6975+5380], r5334; +st.shared.u32 [r6975+5632], r5362; +st.shared.u32 [r6975+5636], r5371; +st.shared.u32 [r6975+5888], r5399; +st.shared.u32 [r6975+5892], r5408; +st.shared.u32 [r6975+6144], r5436; +st.shared.u32 [r6975+6148], r5445; +st.shared.u32 [r6975+6400], r5473; +st.shared.u32 [r6975+6404], r5482; +st.shared.u32 [r6975+6656], r5510; +st.shared.u32 [r6975+6660], r5519; +st.shared.u32 [r6975+6912], r5547; +st.shared.u32 [r6975+6916], r5556; +st.shared.u32 [r6975+7168], r5584; +st.shared.u32 [r6975+7172], r5593; +st.shared.u32 [r6975+7424], r5621; +st.shared.u32 [r6975+7428], r5630; +st.shared.u32 [r6975+7680], r5658; +st.shared.u32 [r6975+7684], r5667; +st.shared.u32 [r6975+7936], r5695; +st.shared.u32 [r6975+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6976, r6977, -248, r6975; +ld.shared.u32 r5726, [r6976]; +ld.shared.u32 r5729, [r6976+4]; +ld.shared.u32 r6342, [r6976+4096]; +ld.shared.u32 r6345, [r6976+4100]; +ld.shared.u32 r5922, [r6976+8192]; +ld.shared.u32 r5925, [r6976+8196]; +ld.shared.u32 r6538, [r6976+12288]; +ld.shared.u32 r6541, [r6976+12292]; +ld.shared.u32 r5776, [r6976+16384]; +ld.shared.u32 r5779, [r6976+16388]; +ld.shared.u32 r6392, [r6976+20480]; +ld.shared.u32 r6395, [r6976+20484]; +ld.shared.u32 r5972, [r6976+24576]; +ld.shared.u32 r5975, [r6976+24580]; +ld.shared.u32 r6588, [r6976+28672]; +ld.shared.u32 r6591, [r6976+28676]; +ld.shared.u32 r5738, [r6976+32768]; +ld.shared.u32 r5741, [r6976+32772]; +ld.shared.u32 r6354, [r6976+36864]; 
+ld.shared.u32 r6357, [r6976+36868]; +ld.shared.u32 r5934, [r6976+40960]; +ld.shared.u32 r5937, [r6976+40964]; +ld.shared.u32 r6550, [r6976+45056]; +ld.shared.u32 r6553, [r6976+45060]; +ld.shared.u32 r5788, [r6976+49152]; +ld.shared.u32 r5791, [r6976+49156]; +ld.shared.u32 r6404, [r6976+53248]; +ld.shared.u32 r6407, [r6976+53252]; +ld.shared.u32 r5984, [r6976+57344]; +ld.shared.u32 r5987, [r6976+57348]; +ld.shared.u32 r6600, [r6976+61440]; +ld.shared.u32 r6603, [r6976+61444]; +ld.shared.u32 r5727, [r6976+65536]; +ld.shared.u32 r5730, [r6976+65540]; +ld.shared.u32 r6343, [r6976+69632]; +ld.shared.u32 r6346, [r6976+69636]; +ld.shared.u32 r5923, [r6976+73728]; +ld.shared.u32 r5926, [r6976+73732]; +ld.shared.u32 r6539, [r6976+77824]; +ld.shared.u32 r6542, [r6976+77828]; +ld.shared.u32 r5777, [r6976+81920]; +ld.shared.u32 r5780, [r6976+81924]; +ld.shared.u32 r6393, [r6976+86016]; +ld.shared.u32 r6396, [r6976+86020]; +ld.shared.u32 r5973, [r6976+90112]; +ld.shared.u32 r5976, [r6976+90116]; +ld.shared.u32 r6589, [r6976+94208]; +ld.shared.u32 r6592, [r6976+94212]; +ld.shared.u32 r5739, [r6976+98304]; +ld.shared.u32 r5742, [r6976+98308]; +ld.shared.u32 r6355, [r6976+102400]; +ld.shared.u32 r6358, [r6976+102404]; +ld.shared.u32 r5935, [r6976+106496]; +ld.shared.u32 r5938, [r6976+106500]; +ld.shared.u32 r6551, [r6976+110592]; +ld.shared.u32 r6554, [r6976+110596]; +ld.shared.u32 r5789, [r6976+114688]; +ld.shared.u32 r5792, [r6976+114692]; +ld.shared.u32 r6405, [r6976+118784]; +ld.shared.u32 r6408, [r6976+118788]; +ld.shared.u32 r5985, [r6976+122880]; +ld.shared.u32 r5988, [r6976+122884]; +ld.shared.u32 r6601, [r6976+126976]; +ld.shared.u32 r6604, [r6976+126980]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 
r5749, r5746; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5749; +} +{ +add.f16x2 r5766, r5734, r5743; +} +{ +sub.f16x2 r5769, r5731, r5749; +} +{ +sub.f16x2 r5772, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5799; +} +{ +add.f16x2 r5816, r5784, r5793; +} +{ +sub.f16x2 r5819, r5781, r5799; +} +{ +sub.f16x2 r5822, r5784, r5793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5810; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} 
+{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5855; +} +{ +add.f16x2 r5900, r5760, r5807; +} +{ +sub.f16x2 r5903, r5757, r5855; +} +{ +sub.f16x2 r5906, r5760, r5807; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5942; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5945; +} +{ +add.f16x2 r5962, r5930, r5939; +} +{ +sub.f16x2 r5965, r5927, r5945; +} +{ +sub.f16x2 r5968, r5930, r5939; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5992; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5995; +} +{ +add.f16x2 r6012, r5980, r5989; +} +{ +sub.f16x2 r6015, r5977, r5995; +} +{ +sub.f16x2 r6018, r5980, r5989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6006; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ +sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6051; +} +{ +add.f16x2 r6096, r5956, r6003; +} +{ +sub.f16x2 r6099, r5953, r6051; +} +{ +sub.f16x2 r6102, r5956, r6003; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 
high, f1034; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ +fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6078; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ +fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 %0, r5873, r6069; +} +{ +add.f16x2 %1, r5876, r6072; +} +{ +sub.f16x2 %32, r5873, r6069; +} +{ +sub.f16x2 %33, r5876, r6072; +} +{ 
+add.f16x2 %4, r5885, r6153; +} +{ +add.f16x2 %5, r5888, r6159; +} +{ +sub.f16x2 %36, r5885, r6153; +} +{ +sub.f16x2 %37, r5888, r6159; +} +{ +add.f16x2 %8, r5897, r6169; +} +{ +add.f16x2 %9, r5900, r6175; +} +{ +sub.f16x2 %40, r5897, r6169; +} +{ +sub.f16x2 %41, r5900, r6175; +} +{ +add.f16x2 %12, r5909, r6185; +} +{ +add.f16x2 %13, r5912, r6191; +} +{ +sub.f16x2 %44, r5909, r6185; +} +{ +sub.f16x2 %45, r5912, r6191; +} +{ +add.f16x2 %16, r5879, r6195; +} +{ +add.f16x2 %17, r5882, r6075; +} +{ +sub.f16x2 %48, r5879, r6195; +} +{ +sub.f16x2 %49, r5882, r6075; +} +{ +add.f16x2 %20, r5891, r6203; +} +{ +add.f16x2 %21, r5894, r6209; +} +{ +sub.f16x2 %52, r5891, r6203; +} +{ +sub.f16x2 %53, r5894, r6209; +} +{ +add.f16x2 %24, r5903, r6219; +} +{ +add.f16x2 %25, r5906, r6225; +} +{ +sub.f16x2 %56, r5903, r6219; +} +{ +sub.f16x2 %57, r5906, r6225; +} +{ +add.f16x2 %28, r5915, r6235; +} +{ +add.f16x2 %29, r5918, r6241; +} +{ +sub.f16x2 %60, r5915, r6235; +} +{ +sub.f16x2 %61, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6362; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; +} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, r6356; +} +{ +add.f16x2 r6379, r6347, r6365; +} +{ +add.f16x2 r6382, r6350, r6359; +} +{ +sub.f16x2 r6385, r6347, r6365; +} +{ +sub.f16x2 r6388, r6350, r6359; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6412; +} +{ +add.f16x2 r6417, 
r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6415; +} +{ +add.f16x2 r6432, r6400, r6409; +} +{ +sub.f16x2 r6435, r6397, r6415; +} +{ +sub.f16x2 r6438, r6400, r6409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6426; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ +add.f16x2 r6513, r6373, r6471; +} +{ +add.f16x2 r6516, r6376, r6423; +} +{ +sub.f16x2 r6519, r6373, r6471; +} +{ +sub.f16x2 r6522, r6376, r6423; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, 
r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, r6553, r6554; +} +{ +neg.f16x2 r6561, r6558; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6561; +} +{ +add.f16x2 r6578, r6546, r6555; +} +{ +sub.f16x2 r6581, r6543, r6561; +} +{ +sub.f16x2 r6584, r6546, r6555; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6608; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6611; +} +{ +add.f16x2 r6628, r6596, r6605; +} +{ +sub.f16x2 r6631, r6593, r6611; +} +{ +sub.f16x2 r6634, r6596, r6605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6622; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 
r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ +sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6667; +} +{ +add.f16x2 r6712, r6572, r6619; +} +{ +sub.f16x2 r6715, r6569, r6667; +} +{ +sub.f16x2 r6718, r6572, r6619; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} 
+{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6694; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 %2, r6489, r6685; +} +{ +add.f16x2 %3, r6492, r6688; +} +{ +sub.f16x2 %34, r6489, r6685; +} +{ +sub.f16x2 %35, r6492, r6688; +} +{ +add.f16x2 %6, r6501, r6769; +} +{ +add.f16x2 %7, r6504, r6775; +} +{ +sub.f16x2 %38, r6501, r6769; +} +{ +sub.f16x2 %39, r6504, r6775; +} +{ +add.f16x2 %10, r6513, r6785; +} +{ +add.f16x2 %11, r6516, r6791; +} +{ +sub.f16x2 %42, r6513, r6785; +} +{ +sub.f16x2 %43, r6516, r6791; +} +{ +add.f16x2 %14, r6525, r6801; +} +{ +add.f16x2 %15, r6528, r6807; +} +{ +sub.f16x2 %46, r6525, r6801; +} +{ +sub.f16x2 %47, r6528, r6807; +} +{ +add.f16x2 %18, r6495, r6811; +} +{ +add.f16x2 %19, r6498, r6691; +} +{ +sub.f16x2 %50, r6495, r6811; +} +{ +sub.f16x2 %51, r6498, r6691; +} +{ +add.f16x2 %22, r6507, r6819; +} +{ +add.f16x2 %23, r6510, r6825; +} +{ +sub.f16x2 %54, r6507, r6819; +} +{ +sub.f16x2 %55, 
r6510, r6825; +} +{ +add.f16x2 %26, r6519, r6835; +} +{ +add.f16x2 %27, r6522, r6841; +} +{ +sub.f16x2 %58, r6519, r6835; +} +{ +sub.f16x2 %59, r6522, r6841; +} +{ +add.f16x2 %30, r6531, r6851; +} +{ +add.f16x2 %31, r6534, r6857; +} +{ +sub.f16x2 %62, r6531, r6851; +} +{ +sub.f16x2 %63, r6534, r6857; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), 
"=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), 
"r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1063, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3750>; +.reg .b64 rd<2>; +mov.u32 r3723, %tid.y; +shl.b32 r3724, r3723, 16; +mov.u32 r3725, %32; +add.s32 r3726, r3725, r3724; +mov.u32 r3727, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; 
+cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f448, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, 
%55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, 
r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3728, r3727, 1023; +shl.b32 r3729, r3727, 6; +and.b32 r3730, r3729, -65536; +add.s32 r3731, r3726, r3730; +cvt.rn.f32.u32 f451, r3728; +mul.f32 f452, f451, 0f39C90FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 
r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ 
+fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 
r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, 
{high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, 
{high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} 
+{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r3732, r3729, 65472; +add.s32 r3733, r3731, r3732; +st.shared.v4.f32 [r3733], {r521, r627, r664, r701}; +st.shared.v4.f32 [r3733+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r3733+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r3733+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r3734, r3728, -60, r3733; +ld.shared.u32 r1176, [r3734]; +ld.shared.u32 r1372, [r3734+4096]; +ld.shared.u32 r1226, [r3734+8192]; +ld.shared.u32 r1422, [r3734+12288]; +ld.shared.u32 r1188, [r3734+16384]; +ld.shared.u32 r1384, [r3734+20480]; +ld.shared.u32 r1238, [r3734+24576]; +ld.shared.u32 r1434, [r3734+28672]; +ld.shared.u32 r1177, [r3734+32768]; +ld.shared.u32 r1373, [r3734+36864]; +ld.shared.u32 r1227, [r3734+40960]; +ld.shared.u32 r1423, [r3734+45056]; +ld.shared.u32 r1189, [r3734+49152]; +ld.shared.u32 r1385, [r3734+53248]; +ld.shared.u32 r1239, [r3734+57344]; +ld.shared.u32 r1435, [r3734+61440]; +barrier.sync 0; +st.shared.v4.f32 [r3733], {r524, r636, r673, r710}; +st.shared.v4.f32 [r3733+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r3733+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r3733+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r3734]; +ld.shared.u32 r1375, [r3734+4096]; +ld.shared.u32 r1229, [r3734+8192]; +ld.shared.u32 r1425, [r3734+12288]; +ld.shared.u32 r1191, [r3734+16384]; +ld.shared.u32 r1387, [r3734+20480]; +ld.shared.u32 r1241, [r3734+24576]; +ld.shared.u32 r1437, [r3734+28672]; +ld.shared.u32 r1180, [r3734+32768]; +ld.shared.u32 r1376, [r3734+36864]; +ld.shared.u32 r1230, [r3734+40960]; +ld.shared.u32 r1426, [r3734+45056]; +ld.shared.u32 r1192, [r3734+49152]; +ld.shared.u32 r1388, [r3734+53248]; +ld.shared.u32 r1242, [r3734+57344]; +ld.shared.u32 r1438, [r3734+61440]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 
r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 
r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 
high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, 
r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3735, r3727, 1008; +bfe.u32 r3736, r3727, 4, 6; +shl.b32 r3737, r3727, 2; +and.b32 r3738, r3737, 60; +add.s32 r3739, r3731, r3738; +cvt.rn.f32.u32 f454, r3736; +mul.f32 f455, f454, 0f3BC90FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} 
+{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, 
r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, 
r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; 
+mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +barrier.sync 0; +and.b32 r3740, r3729, 64512; +add.s32 r3741, r3739, r3740; +st.shared.u32 [r3741], r1695; +st.shared.u32 [r3741+64], r1801; +st.shared.u32 [r3741+128], r1838; +st.shared.u32 [r3741+192], r1875; +st.shared.u32 [r3741+256], r1912; +st.shared.u32 [r3741+320], r1949; +st.shared.u32 [r3741+384], r1986; +st.shared.u32 [r3741+448], r2023; +st.shared.u32 [r3741+512], r2060; +st.shared.u32 [r3741+576], r2097; +st.shared.u32 [r3741+640], r2134; +st.shared.u32 [r3741+704], r2171; +st.shared.u32 [r3741+768], r2208; +st.shared.u32 [r3741+832], r2245; +st.shared.u32 [r3741+896], r2282; +st.shared.u32 [r3741+960], r2319; +barrier.sync 0; +mad.lo.s32 r3742, r3735, -60, r3741; +ld.shared.u32 r2350, [r3742]; +ld.shared.u32 r2546, [r3742+4096]; +ld.shared.u32 r2400, [r3742+8192]; +ld.shared.u32 r2596, [r3742+12288]; +ld.shared.u32 r2362, [r3742+16384]; +ld.shared.u32 r2558, [r3742+20480]; +ld.shared.u32 r2412, [r3742+24576]; +ld.shared.u32 r2608, [r3742+28672]; +ld.shared.u32 r2351, [r3742+32768]; +ld.shared.u32 r2547, [r3742+36864]; +ld.shared.u32 r2401, [r3742+40960]; +ld.shared.u32 r2597, [r3742+45056]; +ld.shared.u32 r2363, [r3742+49152]; +ld.shared.u32 r2559, [r3742+53248]; +ld.shared.u32 r2413, [r3742+57344]; +ld.shared.u32 r2609, [r3742+61440]; +barrier.sync 0; +st.shared.u32 [r3741], r1698; +st.shared.u32 [r3741+64], r1810; +st.shared.u32 [r3741+128], r1847; +st.shared.u32 [r3741+192], r1884; +st.shared.u32 [r3741+256], r1921; +st.shared.u32 [r3741+320], r1958; +st.shared.u32 [r3741+384], r1995; +st.shared.u32 [r3741+448], r2032; +st.shared.u32 [r3741+512], r2069; +st.shared.u32 [r3741+576], r2106; +st.shared.u32 [r3741+640], r2143; +st.shared.u32 [r3741+704], r2180; +st.shared.u32 [r3741+768], r2217; +st.shared.u32 
[r3741+832], r2254; +st.shared.u32 [r3741+896], r2291; +st.shared.u32 [r3741+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r3742]; +ld.shared.u32 r2549, [r3742+4096]; +ld.shared.u32 r2403, [r3742+8192]; +ld.shared.u32 r2599, [r3742+12288]; +ld.shared.u32 r2365, [r3742+16384]; +ld.shared.u32 r2561, [r3742+20480]; +ld.shared.u32 r2415, [r3742+24576]; +ld.shared.u32 r2611, [r3742+28672]; +ld.shared.u32 r2354, [r3742+32768]; +ld.shared.u32 r2550, [r3742+36864]; +ld.shared.u32 r2404, [r3742+40960]; +ld.shared.u32 r2600, [r3742+45056]; +ld.shared.u32 r2366, [r3742+49152]; +ld.shared.u32 r2562, [r3742+53248]; +ld.shared.u32 r2416, [r3742+57344]; +ld.shared.u32 r2612, [r3742+61440]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ 
+add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, 
r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 
r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2819; +} +{ +add.f16x2 r2920, r2506, r2699; +} +{ +sub.f16x2 r2923, r2503, r2819; +} +{ +sub.f16x2 r2926, r2506, r2699; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ 
+sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3743, r3727, 768; +bfe.u32 r3744, r3727, 8, 2; +and.b32 r3745, r3737, 1020; +add.s32 r3746, r3731, r3745; +cvt.rn.f32.u32 f457, r3744; +mul.f32 f458, f457, 0f3DC90FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +fma.rn.f16x2 r2975, r2881, r2968, r2972; +} +{ +mul.f16x2 r2979, r2881, r2970; +} +{ +neg.f16x2 r2982, r2979; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +fma.rn.f16x2 r3012, r2893, r3005, r3009; +} +{ +mul.f16x2 r3016, r2893, r3007; +} +{ +neg.f16x2 r3019, r3016; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, 
r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +fma.rn.f16x2 r3049, r2905, r3042, r3046; +} +{ +mul.f16x2 r3053, r2905, r3044; +} +{ +neg.f16x2 r3056, r3053; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +fma.rn.f16x2 r3086, r2917, r3079, r3083; +} +{ +mul.f16x2 r3090, r2917, r3081; +} +{ +neg.f16x2 r3093, r3090; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +fma.rn.f16x2 r3123, r2929, r3116, r3120; +} +{ +mul.f16x2 r3127, r2929, r3118; +} +{ +neg.f16x2 r3130, r3127; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +fma.rn.f16x2 r3160, r2941, r3153, r3157; +} +{ +mul.f16x2 r3164, r2941, r3155; +} +{ +neg.f16x2 r3167, r3164; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +fma.rn.f16x2 r3197, r2953, r3190, r3194; +} +{ +mul.f16x2 r3201, r2953, r3192; +} +{ +neg.f16x2 r3204, r3201; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3204; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +fma.rn.f16x2 r3234, r2875, r3227, r3231; +} +{ +mul.f16x2 r3238, r2875, r3229; +} +{ +neg.f16x2 r3241, r3238; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +fma.rn.f16x2 r3271, r2887, r3264, r3268; +} +{ +mul.f16x2 r3275, r2887, r3266; +} +{ +neg.f16x2 r3278, r3275; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; 
+mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +fma.rn.f16x2 r3308, r2899, r3301, r3305; +} +{ +mul.f16x2 r3312, r2899, r3303; +} +{ +neg.f16x2 r3315, r3312; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +fma.rn.f16x2 r3345, r2911, r3338, r3342; +} +{ +mul.f16x2 r3349, r2911, r3340; +} +{ +neg.f16x2 r3352, r3349; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +fma.rn.f16x2 r3382, r2923, r3375, r3379; +} +{ +mul.f16x2 r3386, r2923, r3377; +} +{ +neg.f16x2 r3389, r3386; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +fma.rn.f16x2 r3419, r2935, r3412, r3416; +} +{ +mul.f16x2 r3423, r2935, r3414; +} +{ +neg.f16x2 r3426, r3423; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +fma.rn.f16x2 r3456, r2947, r3449, r3453; +} +{ +mul.f16x2 r3460, 
r2947, r3451; +} +{ +neg.f16x2 r3463, r3460; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +fma.rn.f16x2 r3493, r2959, r3486, r3490; +} +{ +mul.f16x2 r3497, r2959, r3488; +} +{ +neg.f16x2 r3500, r3497; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3500; +} +barrier.sync 0; +and.b32 r3747, r3729, 49152; +add.s32 r3748, r3746, r3747; +st.shared.u32 [r3748], r2869; +st.shared.u32 [r3748+1024], r2975; +st.shared.u32 [r3748+2048], r3012; +st.shared.u32 [r3748+3072], r3049; +st.shared.u32 [r3748+4096], r3086; +st.shared.u32 [r3748+5120], r3123; +st.shared.u32 [r3748+6144], r3160; +st.shared.u32 [r3748+7168], r3197; +st.shared.u32 [r3748+8192], r3234; +st.shared.u32 [r3748+9216], r3271; +st.shared.u32 [r3748+10240], r3308; +st.shared.u32 [r3748+11264], r3345; +st.shared.u32 [r3748+12288], r3382; +st.shared.u32 [r3748+13312], r3419; +st.shared.u32 [r3748+14336], r3456; +st.shared.u32 [r3748+15360], r3493; +barrier.sync 0; +mad.lo.s32 r3749, r3743, -60, r3748; +ld.shared.u32 r3524, [r3749]; +ld.shared.u32 r3574, [r3749+4096]; +ld.shared.u32 r3624, [r3749+8192]; +ld.shared.u32 r3674, [r3749+12288]; +ld.shared.u32 r3536, [r3749+16384]; +ld.shared.u32 r3586, [r3749+20480]; +ld.shared.u32 r3636, [r3749+24576]; +ld.shared.u32 r3686, [r3749+28672]; +ld.shared.u32 r3525, [r3749+32768]; 
+ld.shared.u32 r3575, [r3749+36864]; +ld.shared.u32 r3625, [r3749+40960]; +ld.shared.u32 r3675, [r3749+45056]; +ld.shared.u32 r3537, [r3749+49152]; +ld.shared.u32 r3587, [r3749+53248]; +ld.shared.u32 r3637, [r3749+57344]; +ld.shared.u32 r3687, [r3749+61440]; +barrier.sync 0; +st.shared.u32 [r3748], r2872; +st.shared.u32 [r3748+1024], r2984; +st.shared.u32 [r3748+2048], r3021; +st.shared.u32 [r3748+3072], r3058; +st.shared.u32 [r3748+4096], r3095; +st.shared.u32 [r3748+5120], r3132; +st.shared.u32 [r3748+6144], r3169; +st.shared.u32 [r3748+7168], r3206; +st.shared.u32 [r3748+8192], r3243; +st.shared.u32 [r3748+9216], r3280; +st.shared.u32 [r3748+10240], r3317; +st.shared.u32 [r3748+11264], r3354; +st.shared.u32 [r3748+12288], r3391; +st.shared.u32 [r3748+13312], r3428; +st.shared.u32 [r3748+14336], r3465; +st.shared.u32 [r3748+15360], r3502; +barrier.sync 0; +ld.shared.u32 r3527, [r3749]; +ld.shared.u32 r3577, [r3749+4096]; +ld.shared.u32 r3627, [r3749+8192]; +ld.shared.u32 r3677, [r3749+12288]; +ld.shared.u32 r3539, [r3749+16384]; +ld.shared.u32 r3589, [r3749+20480]; +ld.shared.u32 r3639, [r3749+24576]; +ld.shared.u32 r3689, [r3749+28672]; +ld.shared.u32 r3528, [r3749+32768]; +ld.shared.u32 r3578, [r3749+36864]; +ld.shared.u32 r3628, [r3749+40960]; +ld.shared.u32 r3678, [r3749+45056]; +ld.shared.u32 r3540, [r3749+49152]; +ld.shared.u32 r3590, [r3749+53248]; +ld.shared.u32 r3640, [r3749+57344]; +ld.shared.u32 r3690, [r3749+61440]; +{ +add.f16x2 r3523, r3524, r3525; +} +{ +add.f16x2 r3526, r3527, r3528; +} +{ +sub.f16x2 r3529, r3524, r3525; +} +{ +sub.f16x2 r3532, r3527, r3528; +} +{ +add.f16x2 r3535, r3536, r3537; +} +{ +add.f16x2 r3538, r3539, r3540; +} +{ +sub.f16x2 r3541, r3536, r3537; +} +{ +sub.f16x2 r3544, r3539, r3540; +} +{ +neg.f16x2 r3547, r3544; +} +{ +add.f16x2 %0, r3523, r3535; +} +{ +add.f16x2 %1, r3526, r3538; +} +{ +sub.f16x2 %16, r3523, r3535; +} +{ +sub.f16x2 %17, r3526, r3538; +} +{ +add.f16x2 %8, r3529, r3547; +} +{ +add.f16x2 %9, r3532, r3541; 
+} +{ +sub.f16x2 %24, r3529, r3547; +} +{ +sub.f16x2 %25, r3532, r3541; +} +{ +add.f16x2 r3573, r3574, r3575; +} +{ +add.f16x2 r3576, r3577, r3578; +} +{ +sub.f16x2 r3579, r3574, r3575; +} +{ +sub.f16x2 r3582, r3577, r3578; +} +{ +add.f16x2 r3585, r3586, r3587; +} +{ +add.f16x2 r3588, r3589, r3590; +} +{ +sub.f16x2 r3591, r3586, r3587; +} +{ +sub.f16x2 r3594, r3589, r3590; +} +{ +neg.f16x2 r3597, r3594; +} +{ +add.f16x2 %2, r3573, r3585; +} +{ +add.f16x2 %3, r3576, r3588; +} +{ +sub.f16x2 %18, r3573, r3585; +} +{ +sub.f16x2 %19, r3576, r3588; +} +{ +add.f16x2 %10, r3579, r3597; +} +{ +add.f16x2 %11, r3582, r3591; +} +{ +sub.f16x2 %26, r3579, r3597; +} +{ +sub.f16x2 %27, r3582, r3591; +} +{ +add.f16x2 r3623, r3624, r3625; +} +{ +add.f16x2 r3626, r3627, r3628; +} +{ +sub.f16x2 r3629, r3624, r3625; +} +{ +sub.f16x2 r3632, r3627, r3628; +} +{ +add.f16x2 r3635, r3636, r3637; +} +{ +add.f16x2 r3638, r3639, r3640; +} +{ +sub.f16x2 r3641, r3636, r3637; +} +{ +sub.f16x2 r3644, r3639, r3640; +} +{ +neg.f16x2 r3647, r3644; +} +{ +add.f16x2 %4, r3623, r3635; +} +{ +add.f16x2 %5, r3626, r3638; +} +{ +sub.f16x2 %20, r3623, r3635; +} +{ +sub.f16x2 %21, r3626, r3638; +} +{ +add.f16x2 %12, r3629, r3647; +} +{ +add.f16x2 %13, r3632, r3641; +} +{ +sub.f16x2 %28, r3629, r3647; +} +{ +sub.f16x2 %29, r3632, r3641; +} +{ +add.f16x2 r3673, r3674, r3675; +} +{ +add.f16x2 r3676, r3677, r3678; +} +{ +sub.f16x2 r3679, r3674, r3675; +} +{ +sub.f16x2 r3682, r3677, r3678; +} +{ +add.f16x2 r3685, r3686, r3687; +} +{ +add.f16x2 r3688, r3689, r3690; +} +{ +sub.f16x2 r3691, r3686, r3687; +} +{ +sub.f16x2 r3694, r3689, r3690; +} +{ +neg.f16x2 r3697, r3694; +} +{ +add.f16x2 %6, r3673, r3685; +} +{ +add.f16x2 %7, r3676, r3688; +} +{ +sub.f16x2 %22, r3673, r3685; +} +{ +sub.f16x2 %23, r3676, r3688; +} +{ +add.f16x2 %14, r3679, r3697; +} +{ +add.f16x2 %15, r3682, r3691; +} +{ +sub.f16x2 %30, r3679, r3697; +} +{ +sub.f16x2 %31, r3682, r3691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), 
"r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1181, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3750>; +.reg .b64 rd<2>; +mov.u32 r3723, %tid.y; +shl.b32 r3724, r3723, 17; +mov.u32 r3725, %32; +add.s32 r3726, r3725, r3724; +mov.u32 r3727, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f448, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 
f447, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 
r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ 
+mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3728, r3727, 1023; +shl.b32 r3729, r3727, 7; +and.b32 r3730, r3729, -131072; +add.s32 r3731, r3726, r3730; +cvt.rn.f32.u32 f451, r3728; +mul.f32 f452, f451, 0f39C90FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 
r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, 
r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; 
+mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r3732, r3729, 130944; +add.s32 r3733, r3731, r3732; +st.shared.v4.f32 [r3733], {r521, r524, r627, r636}; +st.shared.v4.f32 [r3733+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r3733+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r3733+48], {r812, r821, 
r849, r858}; +st.shared.v4.f32 [r3733+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r3733+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r3733+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r3733+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r3734, r3728, -120, r3733; +ld.shared.u32 r1176, [r3734]; +ld.shared.u32 r1179, [r3734+4]; +ld.shared.u32 r1372, [r3734+8192]; +ld.shared.u32 r1375, [r3734+8196]; +ld.shared.u32 r1226, [r3734+16384]; +ld.shared.u32 r1229, [r3734+16388]; +ld.shared.u32 r1422, [r3734+24576]; +ld.shared.u32 r1425, [r3734+24580]; +ld.shared.u32 r1188, [r3734+32768]; +ld.shared.u32 r1191, [r3734+32772]; +ld.shared.u32 r1384, [r3734+40960]; +ld.shared.u32 r1387, [r3734+40964]; +ld.shared.u32 r1238, [r3734+49152]; +ld.shared.u32 r1241, [r3734+49156]; +ld.shared.u32 r1434, [r3734+57344]; +ld.shared.u32 r1437, [r3734+57348]; +ld.shared.u32 r1177, [r3734+65536]; +ld.shared.u32 r1180, [r3734+65540]; +ld.shared.u32 r1373, [r3734+73728]; +ld.shared.u32 r1376, [r3734+73732]; +ld.shared.u32 r1227, [r3734+81920]; +ld.shared.u32 r1230, [r3734+81924]; +ld.shared.u32 r1423, [r3734+90112]; +ld.shared.u32 r1426, [r3734+90116]; +ld.shared.u32 r1189, [r3734+98304]; +ld.shared.u32 r1192, [r3734+98308]; +ld.shared.u32 r1385, [r3734+106496]; +ld.shared.u32 r1388, [r3734+106500]; +ld.shared.u32 r1239, [r3734+114688]; +ld.shared.u32 r1242, [r3734+114692]; +ld.shared.u32 r1435, [r3734+122880]; +ld.shared.u32 r1438, [r3734+122884]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, 
r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 
r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, 
r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; 
+mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, 
r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3735, r3727, 1008; +bfe.u32 r3736, r3727, 4, 6; +cvt.rn.f32.u32 f454, r3736; +mul.f32 f455, f454, 0f3BC90FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ 
+mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 
r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ 
+mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; 
+mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r3737, r3727, 3; +and.b32 r3738, r3737, 120; +add.s32 r3739, r3731, r3738; +barrier.sync 0; +and.b32 r3740, r3729, 129024; +add.s32 r3741, r3739, r3740; +st.shared.u32 [r3741], r1695; +st.shared.u32 [r3741+4], r1698; +st.shared.u32 [r3741+128], r1801; +st.shared.u32 [r3741+132], r1810; +st.shared.u32 
[r3741+256], r1838; +st.shared.u32 [r3741+260], r1847; +st.shared.u32 [r3741+384], r1875; +st.shared.u32 [r3741+388], r1884; +st.shared.u32 [r3741+512], r1912; +st.shared.u32 [r3741+516], r1921; +st.shared.u32 [r3741+640], r1949; +st.shared.u32 [r3741+644], r1958; +st.shared.u32 [r3741+768], r1986; +st.shared.u32 [r3741+772], r1995; +st.shared.u32 [r3741+896], r2023; +st.shared.u32 [r3741+900], r2032; +st.shared.u32 [r3741+1024], r2060; +st.shared.u32 [r3741+1028], r2069; +st.shared.u32 [r3741+1152], r2097; +st.shared.u32 [r3741+1156], r2106; +st.shared.u32 [r3741+1280], r2134; +st.shared.u32 [r3741+1284], r2143; +st.shared.u32 [r3741+1408], r2171; +st.shared.u32 [r3741+1412], r2180; +st.shared.u32 [r3741+1536], r2208; +st.shared.u32 [r3741+1540], r2217; +st.shared.u32 [r3741+1664], r2245; +st.shared.u32 [r3741+1668], r2254; +st.shared.u32 [r3741+1792], r2282; +st.shared.u32 [r3741+1796], r2291; +st.shared.u32 [r3741+1920], r2319; +st.shared.u32 [r3741+1924], r2328; +barrier.sync 0; +mad.lo.s32 r3742, r3735, -120, r3741; +ld.shared.u32 r2350, [r3742]; +ld.shared.u32 r2353, [r3742+4]; +ld.shared.u32 r2546, [r3742+8192]; +ld.shared.u32 r2549, [r3742+8196]; +ld.shared.u32 r2400, [r3742+16384]; +ld.shared.u32 r2403, [r3742+16388]; +ld.shared.u32 r2596, [r3742+24576]; +ld.shared.u32 r2599, [r3742+24580]; +ld.shared.u32 r2362, [r3742+32768]; +ld.shared.u32 r2365, [r3742+32772]; +ld.shared.u32 r2558, [r3742+40960]; +ld.shared.u32 r2561, [r3742+40964]; +ld.shared.u32 r2412, [r3742+49152]; +ld.shared.u32 r2415, [r3742+49156]; +ld.shared.u32 r2608, [r3742+57344]; +ld.shared.u32 r2611, [r3742+57348]; +ld.shared.u32 r2351, [r3742+65536]; +ld.shared.u32 r2354, [r3742+65540]; +ld.shared.u32 r2547, [r3742+73728]; +ld.shared.u32 r2550, [r3742+73732]; +ld.shared.u32 r2401, [r3742+81920]; +ld.shared.u32 r2404, [r3742+81924]; +ld.shared.u32 r2597, [r3742+90112]; +ld.shared.u32 r2600, [r3742+90116]; +ld.shared.u32 r2363, [r3742+98304]; +ld.shared.u32 r2366, [r3742+98308]; 
+ld.shared.u32 r2559, [r3742+106496]; +ld.shared.u32 r2562, [r3742+106500]; +ld.shared.u32 r2413, [r3742+114688]; +ld.shared.u32 r2416, [r3742+114692]; +ld.shared.u32 r2609, [r3742+122880]; +ld.shared.u32 r2612, [r3742+122884]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ 
+mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, 
r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 
r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2819; +} +{ +add.f16x2 r2920, r2506, r2699; +} +{ +sub.f16x2 r2923, r2503, r2819; +} +{ +sub.f16x2 r2926, r2506, r2699; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3743, r3727, 768; +bfe.u32 r3744, r3727, 8, 2; +cvt.rn.f32.u32 f457, r3744; +mul.f32 f458, f457, 0f3DC90FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +fma.rn.f16x2 r2975, r2881, r2968, r2972; +} +{ +mul.f16x2 r2979, r2881, r2970; +} +{ +neg.f16x2 r2982, r2979; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +fma.rn.f16x2 r3012, r2893, r3005, r3009; +} +{ +mul.f16x2 r3016, r2893, r3007; +} +{ +neg.f16x2 r3019, r3016; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +fma.rn.f16x2 r3049, r2905, r3042, r3046; +} +{ +mul.f16x2 r3053, r2905, r3044; +} +{ +neg.f16x2 r3056, r3053; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3056; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +fma.rn.f16x2 r3086, r2917, r3079, r3083; +} +{ +mul.f16x2 r3090, r2917, r3081; +} +{ +neg.f16x2 r3093, r3090; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +fma.rn.f16x2 r3123, r2929, r3116, r3120; +} +{ +mul.f16x2 r3127, r2929, r3118; +} +{ +neg.f16x2 r3130, r3127; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, 
{low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +fma.rn.f16x2 r3160, r2941, r3153, r3157; +} +{ +mul.f16x2 r3164, r2941, r3155; +} +{ +neg.f16x2 r3167, r3164; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +fma.rn.f16x2 r3197, r2953, r3190, r3194; +} +{ +mul.f16x2 r3201, r2953, r3192; +} +{ +neg.f16x2 r3204, r3201; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +fma.rn.f16x2 r3234, r2875, r3227, r3231; +} +{ +mul.f16x2 r3238, r2875, r3229; +} +{ +neg.f16x2 r3241, r3238; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +fma.rn.f16x2 r3271, r2887, r3264, r3268; +} +{ +mul.f16x2 r3275, r2887, r3266; +} +{ +neg.f16x2 r3278, r3275; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +fma.rn.f16x2 r3308, r2899, r3301, r3305; +} +{ +mul.f16x2 r3312, r2899, r3303; +} 
+{ +neg.f16x2 r3315, r3312; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +fma.rn.f16x2 r3345, r2911, r3338, r3342; +} +{ +mul.f16x2 r3349, r2911, r3340; +} +{ +neg.f16x2 r3352, r3349; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +fma.rn.f16x2 r3382, r2923, r3375, r3379; +} +{ +mul.f16x2 r3386, r2923, r3377; +} +{ +neg.f16x2 r3389, r3386; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +fma.rn.f16x2 r3419, r2935, r3412, r3416; +} +{ +mul.f16x2 r3423, r2935, r3414; +} +{ +neg.f16x2 r3426, r3423; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +fma.rn.f16x2 r3456, r2947, r3449, r3453; +} +{ +mul.f16x2 r3460, r2947, r3451; +} +{ +neg.f16x2 r3463, r3460; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; 
+} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +fma.rn.f16x2 r3493, r2959, r3486, r3490; +} +{ +mul.f16x2 r3497, r2959, r3488; +} +{ +neg.f16x2 r3500, r3497; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3500; +} +and.b32 r3745, r3737, 2040; +add.s32 r3746, r3731, r3745; +barrier.sync 0; +and.b32 r3747, r3729, 98304; +add.s32 r3748, r3746, r3747; +st.shared.u32 [r3748], r2869; +st.shared.u32 [r3748+4], r2872; +st.shared.u32 [r3748+2048], r2975; +st.shared.u32 [r3748+2052], r2984; +st.shared.u32 [r3748+4096], r3012; +st.shared.u32 [r3748+4100], r3021; +st.shared.u32 [r3748+6144], r3049; +st.shared.u32 [r3748+6148], r3058; +st.shared.u32 [r3748+8192], r3086; +st.shared.u32 [r3748+8196], r3095; +st.shared.u32 [r3748+10240], r3123; +st.shared.u32 [r3748+10244], r3132; +st.shared.u32 [r3748+12288], r3160; +st.shared.u32 [r3748+12292], r3169; +st.shared.u32 [r3748+14336], r3197; +st.shared.u32 [r3748+14340], r3206; +st.shared.u32 [r3748+16384], r3234; +st.shared.u32 [r3748+16388], r3243; +st.shared.u32 [r3748+18432], r3271; +st.shared.u32 [r3748+18436], r3280; +st.shared.u32 [r3748+20480], r3308; +st.shared.u32 [r3748+20484], r3317; +st.shared.u32 [r3748+22528], r3345; +st.shared.u32 [r3748+22532], r3354; +st.shared.u32 [r3748+24576], r3382; +st.shared.u32 [r3748+24580], r3391; +st.shared.u32 [r3748+26624], r3419; +st.shared.u32 [r3748+26628], r3428; +st.shared.u32 [r3748+28672], r3456; +st.shared.u32 [r3748+28676], r3465; +st.shared.u32 [r3748+30720], r3493; +st.shared.u32 [r3748+30724], r3502; +barrier.sync 0; +mad.lo.s32 r3749, r3743, -120, r3748; +ld.shared.u32 r3524, [r3749]; +ld.shared.u32 r3527, [r3749+4]; +ld.shared.u32 r3574, [r3749+8192]; +ld.shared.u32 r3577, [r3749+8196]; +ld.shared.u32 r3624, [r3749+16384]; +ld.shared.u32 r3627, [r3749+16388]; 
+ld.shared.u32 r3674, [r3749+24576]; +ld.shared.u32 r3677, [r3749+24580]; +ld.shared.u32 r3536, [r3749+32768]; +ld.shared.u32 r3539, [r3749+32772]; +ld.shared.u32 r3586, [r3749+40960]; +ld.shared.u32 r3589, [r3749+40964]; +ld.shared.u32 r3636, [r3749+49152]; +ld.shared.u32 r3639, [r3749+49156]; +ld.shared.u32 r3686, [r3749+57344]; +ld.shared.u32 r3689, [r3749+57348]; +ld.shared.u32 r3525, [r3749+65536]; +ld.shared.u32 r3528, [r3749+65540]; +ld.shared.u32 r3575, [r3749+73728]; +ld.shared.u32 r3578, [r3749+73732]; +ld.shared.u32 r3625, [r3749+81920]; +ld.shared.u32 r3628, [r3749+81924]; +ld.shared.u32 r3675, [r3749+90112]; +ld.shared.u32 r3678, [r3749+90116]; +ld.shared.u32 r3537, [r3749+98304]; +ld.shared.u32 r3540, [r3749+98308]; +ld.shared.u32 r3587, [r3749+106496]; +ld.shared.u32 r3590, [r3749+106500]; +ld.shared.u32 r3637, [r3749+114688]; +ld.shared.u32 r3640, [r3749+114692]; +ld.shared.u32 r3687, [r3749+122880]; +ld.shared.u32 r3690, [r3749+122884]; +{ +add.f16x2 r3523, r3524, r3525; +} +{ +add.f16x2 r3526, r3527, r3528; +} +{ +sub.f16x2 r3529, r3524, r3525; +} +{ +sub.f16x2 r3532, r3527, r3528; +} +{ +add.f16x2 r3535, r3536, r3537; +} +{ +add.f16x2 r3538, r3539, r3540; +} +{ +sub.f16x2 r3541, r3536, r3537; +} +{ +sub.f16x2 r3544, r3539, r3540; +} +{ +neg.f16x2 r3547, r3544; +} +{ +add.f16x2 %0, r3523, r3535; +} +{ +add.f16x2 %1, r3526, r3538; +} +{ +sub.f16x2 %16, r3523, r3535; +} +{ +sub.f16x2 %17, r3526, r3538; +} +{ +add.f16x2 %8, r3529, r3547; +} +{ +add.f16x2 %9, r3532, r3541; +} +{ +sub.f16x2 %24, r3529, r3547; +} +{ +sub.f16x2 %25, r3532, r3541; +} +{ +add.f16x2 r3573, r3574, r3575; +} +{ +add.f16x2 r3576, r3577, r3578; +} +{ +sub.f16x2 r3579, r3574, r3575; +} +{ +sub.f16x2 r3582, r3577, r3578; +} +{ +add.f16x2 r3585, r3586, r3587; +} +{ +add.f16x2 r3588, r3589, r3590; +} +{ +sub.f16x2 r3591, r3586, r3587; +} +{ +sub.f16x2 r3594, r3589, r3590; +} +{ +neg.f16x2 r3597, r3594; +} +{ +add.f16x2 %2, r3573, r3585; +} +{ +add.f16x2 %3, r3576, r3588; +} +{ 
+sub.f16x2 %18, r3573, r3585; +} +{ +sub.f16x2 %19, r3576, r3588; +} +{ +add.f16x2 %10, r3579, r3597; +} +{ +add.f16x2 %11, r3582, r3591; +} +{ +sub.f16x2 %26, r3579, r3597; +} +{ +sub.f16x2 %27, r3582, r3591; +} +{ +add.f16x2 r3623, r3624, r3625; +} +{ +add.f16x2 r3626, r3627, r3628; +} +{ +sub.f16x2 r3629, r3624, r3625; +} +{ +sub.f16x2 r3632, r3627, r3628; +} +{ +add.f16x2 r3635, r3636, r3637; +} +{ +add.f16x2 r3638, r3639, r3640; +} +{ +sub.f16x2 r3641, r3636, r3637; +} +{ +sub.f16x2 r3644, r3639, r3640; +} +{ +neg.f16x2 r3647, r3644; +} +{ +add.f16x2 %4, r3623, r3635; +} +{ +add.f16x2 %5, r3626, r3638; +} +{ +sub.f16x2 %20, r3623, r3635; +} +{ +sub.f16x2 %21, r3626, r3638; +} +{ +add.f16x2 %12, r3629, r3647; +} +{ +add.f16x2 %13, r3632, r3641; +} +{ +sub.f16x2 %28, r3629, r3647; +} +{ +sub.f16x2 %29, r3632, r3641; +} +{ +add.f16x2 r3673, r3674, r3675; +} +{ +add.f16x2 r3676, r3677, r3678; +} +{ +sub.f16x2 r3679, r3674, r3675; +} +{ +sub.f16x2 r3682, r3677, r3678; +} +{ +add.f16x2 r3685, r3686, r3687; +} +{ +add.f16x2 r3688, r3689, r3690; +} +{ +sub.f16x2 r3691, r3686, r3687; +} +{ +sub.f16x2 r3694, r3689, r3690; +} +{ +neg.f16x2 r3697, r3694; +} +{ +add.f16x2 %6, r3673, r3685; +} +{ +add.f16x2 %7, r3676, r3688; +} +{ +sub.f16x2 %22, r3673, r3685; +} +{ +sub.f16x2 %23, r3676, r3688; +} +{ +add.f16x2 %14, r3679, r3697; +} +{ +add.f16x2 %15, r3682, r3691; +} +{ +sub.f16x2 %30, r3679, r3697; +} +{ +sub.f16x2 %31, r3682, r3691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), 
"=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1064, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1085>; +.reg .b32 r<7042>; +.reg .b64 rd<3>; +mov.u32 r6957, %tid.y; +shl.b32 r6958, r6957, 16; +mov.u32 r6959, %64; +add.s32 r6960, r6959, r6958; +mov.u32 r6961, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} 
+{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %100; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %100; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f1040, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r102, {low, high}; +} +mov.f32 f1038, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 
r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, 
f1040; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1036, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r393, {low, high}; +} +mov.f32 f1044, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r398, {low, high}; +} +mov.f32 f1034, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r404, {low, high}; +} +mov.f32 f1042, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 
r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r717, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %101; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %101; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, 
%120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, 
r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 
r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; 
+cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, 
r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; 
+} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; 
+} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6963, r6961, 7; +and.b32 r6964, r6963, -65536; +add.s32 r6965, r6960, r6964; +and.b32 r6977, r6961, 511; +cvt.rn.f32.u32 f1077, r6977; +mul.f32 f1078, f1077, 0f39C90FDB; +cos.approx.f32 f357, f1078; +sin.approx.f32 f1079, f1078; +neg.f32 f358, f1079; +mov.f32 f1084, 0f3F800000; +mov.f32 f1083, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, 
r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, 
r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, 
f1084; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ 
+mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, 
{high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2702, {low, high}; +} +{ 
+mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6966, r6963, 65408; +add.s32 r6967, r6965, r6966; +st.shared.v4.f32 [r6967], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r6967+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r6967+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r6967+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r6967+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r6967+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r6967+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r6967+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r6968, r6977, -124, r6967; +ld.shared.u32 r2864, [r6968]; +ld.shared.u32 r3480, [r6968+2048]; +ld.shared.u32 r3060, [r6968+4096]; +ld.shared.u32 r3676, [r6968+6144]; +ld.shared.u32 r2914, [r6968+8192]; +ld.shared.u32 
r3530, [r6968+10240]; +ld.shared.u32 r3110, [r6968+12288]; +ld.shared.u32 r3726, [r6968+14336]; +ld.shared.u32 r2876, [r6968+16384]; +ld.shared.u32 r3492, [r6968+18432]; +ld.shared.u32 r3072, [r6968+20480]; +ld.shared.u32 r3688, [r6968+22528]; +ld.shared.u32 r2926, [r6968+24576]; +ld.shared.u32 r3542, [r6968+26624]; +ld.shared.u32 r3122, [r6968+28672]; +ld.shared.u32 r3738, [r6968+30720]; +ld.shared.u32 r2865, [r6968+32768]; +ld.shared.u32 r3481, [r6968+34816]; +ld.shared.u32 r3061, [r6968+36864]; +ld.shared.u32 r3677, [r6968+38912]; +ld.shared.u32 r2915, [r6968+40960]; +ld.shared.u32 r3531, [r6968+43008]; +ld.shared.u32 r3111, [r6968+45056]; +ld.shared.u32 r3727, [r6968+47104]; +ld.shared.u32 r2877, [r6968+49152]; +ld.shared.u32 r3493, [r6968+51200]; +ld.shared.u32 r3073, [r6968+53248]; +ld.shared.u32 r3689, [r6968+55296]; +ld.shared.u32 r2927, [r6968+57344]; +ld.shared.u32 r3543, [r6968+59392]; +ld.shared.u32 r3123, [r6968+61440]; +ld.shared.u32 r3739, [r6968+63488]; +barrier.sync 0; +st.shared.v4.f32 [r6967], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6967+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6967+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6967+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6967+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6967+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r6967+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6967+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6968]; +ld.shared.u32 r3483, [r6968+2048]; +ld.shared.u32 r3063, [r6968+4096]; +ld.shared.u32 r3679, [r6968+6144]; +ld.shared.u32 r2917, [r6968+8192]; +ld.shared.u32 r3533, [r6968+10240]; +ld.shared.u32 r3113, [r6968+12288]; +ld.shared.u32 r3729, [r6968+14336]; +ld.shared.u32 r2879, [r6968+16384]; +ld.shared.u32 r3495, [r6968+18432]; +ld.shared.u32 r3075, [r6968+20480]; +ld.shared.u32 r3691, [r6968+22528]; +ld.shared.u32 r2929, [r6968+24576]; +ld.shared.u32 r3545, 
[r6968+26624]; +ld.shared.u32 r3125, [r6968+28672]; +ld.shared.u32 r3741, [r6968+30720]; +ld.shared.u32 r2868, [r6968+32768]; +ld.shared.u32 r3484, [r6968+34816]; +ld.shared.u32 r3064, [r6968+36864]; +ld.shared.u32 r3680, [r6968+38912]; +ld.shared.u32 r2918, [r6968+40960]; +ld.shared.u32 r3534, [r6968+43008]; +ld.shared.u32 r3114, [r6968+45056]; +ld.shared.u32 r3730, [r6968+47104]; +ld.shared.u32 r2880, [r6968+49152]; +ld.shared.u32 r3496, [r6968+51200]; +ld.shared.u32 r3076, [r6968+53248]; +ld.shared.u32 r3692, [r6968+55296]; +ld.shared.u32 r2930, [r6968+57344]; +ld.shared.u32 r3546, [r6968+59392]; +ld.shared.u32 r3126, [r6968+61440]; +ld.shared.u32 r3742, [r6968+63488]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, 
r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} 
+{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ 
+mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, 
r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 
r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ 
+add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, 
r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; 
+mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ 
+neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} 
+{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6969, r6961, 480; +bfe.u32 r6970, r6961, 5, 4; +shl.b32 r6971, r6961, 2; +and.b32 r6972, r6971, 124; +add.s32 r6973, r6965, 
r6972; +cvt.rn.f32.u32 f1080, r6970; +mul.f32 f1081, f1080, 0f3C490FDB; +cos.approx.f32 f779, f1081; +sin.approx.f32 f1082, f1081; +neg.f32 f780, f1082; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, 
r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, 
r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; 
+mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5157, {low, 
high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, 
r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 
r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 
r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 
r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1083; +cvt.rn.f16.f32 high, f1084; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +barrier.sync 0; +and.b32 r6974, r6963, 61440; +add.s32 r6975, r6973, r6974; +st.shared.u32 [r6975], r4383; +st.shared.u32 [r6975+128], r4585; +st.shared.u32 [r6975+256], r4622; +st.shared.u32 [r6975+384], r4659; +st.shared.u32 [r6975+512], r4696; +st.shared.u32 [r6975+640], r4733; +st.shared.u32 [r6975+768], r4770; +st.shared.u32 [r6975+896], r4807; +st.shared.u32 [r6975+1024], r4844; +st.shared.u32 [r6975+1152], r4881; +st.shared.u32 [r6975+1280], r4918; +st.shared.u32 [r6975+1408], r4955; +st.shared.u32 [r6975+1536], r4992; +st.shared.u32 [r6975+1664], r5029; +st.shared.u32 [r6975+1792], r5066; +st.shared.u32 [r6975+1920], r5103; +st.shared.u32 [r6975+2048], r5140; +st.shared.u32 [r6975+2176], r5177; +st.shared.u32 [r6975+2304], r5214; +st.shared.u32 [r6975+2432], r5251; +st.shared.u32 [r6975+2560], r5288; +st.shared.u32 [r6975+2688], r5325; +st.shared.u32 [r6975+2816], r5362; +st.shared.u32 [r6975+2944], r5399; +st.shared.u32 [r6975+3072], r5436; +st.shared.u32 [r6975+3200], r5473; +st.shared.u32 [r6975+3328], r5510; +st.shared.u32 [r6975+3456], r5547; +st.shared.u32 [r6975+3584], r5584; +st.shared.u32 [r6975+3712], r5621; +st.shared.u32 [r6975+3840], r5658; 
+st.shared.u32 [r6975+3968], r5695; +barrier.sync 0; +mad.lo.s32 r6976, r6969, -124, r6975; +ld.shared.u32 r5726, [r6976]; +ld.shared.u32 r6342, [r6976+2048]; +ld.shared.u32 r5922, [r6976+4096]; +ld.shared.u32 r6538, [r6976+6144]; +ld.shared.u32 r5776, [r6976+8192]; +ld.shared.u32 r6392, [r6976+10240]; +ld.shared.u32 r5972, [r6976+12288]; +ld.shared.u32 r6588, [r6976+14336]; +ld.shared.u32 r5738, [r6976+16384]; +ld.shared.u32 r6354, [r6976+18432]; +ld.shared.u32 r5934, [r6976+20480]; +ld.shared.u32 r6550, [r6976+22528]; +ld.shared.u32 r5788, [r6976+24576]; +ld.shared.u32 r6404, [r6976+26624]; +ld.shared.u32 r5984, [r6976+28672]; +ld.shared.u32 r6600, [r6976+30720]; +ld.shared.u32 r5727, [r6976+32768]; +ld.shared.u32 r6343, [r6976+34816]; +ld.shared.u32 r5923, [r6976+36864]; +ld.shared.u32 r6539, [r6976+38912]; +ld.shared.u32 r5777, [r6976+40960]; +ld.shared.u32 r6393, [r6976+43008]; +ld.shared.u32 r5973, [r6976+45056]; +ld.shared.u32 r6589, [r6976+47104]; +ld.shared.u32 r5739, [r6976+49152]; +ld.shared.u32 r6355, [r6976+51200]; +ld.shared.u32 r5935, [r6976+53248]; +ld.shared.u32 r6551, [r6976+55296]; +ld.shared.u32 r5789, [r6976+57344]; +ld.shared.u32 r6405, [r6976+59392]; +ld.shared.u32 r5985, [r6976+61440]; +ld.shared.u32 r6601, [r6976+63488]; +barrier.sync 0; +st.shared.u32 [r6975], r4386; +st.shared.u32 [r6975+128], r4594; +st.shared.u32 [r6975+256], r4631; +st.shared.u32 [r6975+384], r4668; +st.shared.u32 [r6975+512], r4705; +st.shared.u32 [r6975+640], r4742; +st.shared.u32 [r6975+768], r4779; +st.shared.u32 [r6975+896], r4816; +st.shared.u32 [r6975+1024], r4853; +st.shared.u32 [r6975+1152], r4890; +st.shared.u32 [r6975+1280], r4927; +st.shared.u32 [r6975+1408], r4964; +st.shared.u32 [r6975+1536], r5001; +st.shared.u32 [r6975+1664], r5038; +st.shared.u32 [r6975+1792], r5075; +st.shared.u32 [r6975+1920], r5112; +st.shared.u32 [r6975+2048], r5149; +st.shared.u32 [r6975+2176], r5186; +st.shared.u32 [r6975+2304], r5223; +st.shared.u32 [r6975+2432], r5260; 
+st.shared.u32 [r6975+2560], r5297; +st.shared.u32 [r6975+2688], r5334; +st.shared.u32 [r6975+2816], r5371; +st.shared.u32 [r6975+2944], r5408; +st.shared.u32 [r6975+3072], r5445; +st.shared.u32 [r6975+3200], r5482; +st.shared.u32 [r6975+3328], r5519; +st.shared.u32 [r6975+3456], r5556; +st.shared.u32 [r6975+3584], r5593; +st.shared.u32 [r6975+3712], r5630; +st.shared.u32 [r6975+3840], r5667; +st.shared.u32 [r6975+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6976]; +ld.shared.u32 r6345, [r6976+2048]; +ld.shared.u32 r5925, [r6976+4096]; +ld.shared.u32 r6541, [r6976+6144]; +ld.shared.u32 r5779, [r6976+8192]; +ld.shared.u32 r6395, [r6976+10240]; +ld.shared.u32 r5975, [r6976+12288]; +ld.shared.u32 r6591, [r6976+14336]; +ld.shared.u32 r5741, [r6976+16384]; +ld.shared.u32 r6357, [r6976+18432]; +ld.shared.u32 r5937, [r6976+20480]; +ld.shared.u32 r6553, [r6976+22528]; +ld.shared.u32 r5791, [r6976+24576]; +ld.shared.u32 r6407, [r6976+26624]; +ld.shared.u32 r5987, [r6976+28672]; +ld.shared.u32 r6603, [r6976+30720]; +ld.shared.u32 r5730, [r6976+32768]; +ld.shared.u32 r6346, [r6976+34816]; +ld.shared.u32 r5926, [r6976+36864]; +ld.shared.u32 r6542, [r6976+38912]; +ld.shared.u32 r5780, [r6976+40960]; +ld.shared.u32 r6396, [r6976+43008]; +ld.shared.u32 r5976, [r6976+45056]; +ld.shared.u32 r6592, [r6976+47104]; +ld.shared.u32 r5742, [r6976+49152]; +ld.shared.u32 r6358, [r6976+51200]; +ld.shared.u32 r5938, [r6976+53248]; +ld.shared.u32 r6554, [r6976+55296]; +ld.shared.u32 r5792, [r6976+57344]; +ld.shared.u32 r6408, [r6976+59392]; +ld.shared.u32 r5988, [r6976+61440]; +ld.shared.u32 r6604, [r6976+63488]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ 
+add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5749; +} +{ +add.f16x2 r5766, r5734, r5743; +} +{ +sub.f16x2 r5769, r5731, r5749; +} +{ +sub.f16x2 r5772, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5799; +} +{ +add.f16x2 r5816, r5784, r5793; +} +{ +sub.f16x2 r5819, r5781, r5799; +} +{ +sub.f16x2 r5822, r5784, r5793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5810; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} +{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, 
r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5855; +} +{ +add.f16x2 r5900, r5760, r5807; +} +{ +sub.f16x2 r5903, r5757, r5855; +} +{ +sub.f16x2 r5906, r5760, r5807; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5942; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5945; +} +{ +add.f16x2 r5962, r5930, r5939; +} +{ +sub.f16x2 r5965, r5927, r5945; +} +{ +sub.f16x2 r5968, r5930, r5939; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5992; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5995; +} +{ +add.f16x2 r6012, r5980, r5989; +} +{ +sub.f16x2 r6015, r5977, r5995; +} +{ +sub.f16x2 r6018, r5980, r5989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 
r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6006; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ +sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6051; +} +{ +add.f16x2 r6096, r5956, r6003; +} +{ +sub.f16x2 r6099, r5953, r6051; +} +{ +sub.f16x2 r6102, r5956, r6003; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ +fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6078; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ +fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 %0, r5873, r6069; +} +{ +add.f16x2 %1, r5876, r6072; +} +{ +sub.f16x2 %32, r5873, r6069; +} +{ +sub.f16x2 %33, r5876, r6072; +} +{ +add.f16x2 %4, r5885, r6153; +} +{ +add.f16x2 %5, r5888, r6159; +} +{ 
+sub.f16x2 %36, r5885, r6153; +} +{ +sub.f16x2 %37, r5888, r6159; +} +{ +add.f16x2 %8, r5897, r6169; +} +{ +add.f16x2 %9, r5900, r6175; +} +{ +sub.f16x2 %40, r5897, r6169; +} +{ +sub.f16x2 %41, r5900, r6175; +} +{ +add.f16x2 %12, r5909, r6185; +} +{ +add.f16x2 %13, r5912, r6191; +} +{ +sub.f16x2 %44, r5909, r6185; +} +{ +sub.f16x2 %45, r5912, r6191; +} +{ +add.f16x2 %16, r5879, r6195; +} +{ +add.f16x2 %17, r5882, r6075; +} +{ +sub.f16x2 %48, r5879, r6195; +} +{ +sub.f16x2 %49, r5882, r6075; +} +{ +add.f16x2 %20, r5891, r6203; +} +{ +add.f16x2 %21, r5894, r6209; +} +{ +sub.f16x2 %52, r5891, r6203; +} +{ +sub.f16x2 %53, r5894, r6209; +} +{ +add.f16x2 %24, r5903, r6219; +} +{ +add.f16x2 %25, r5906, r6225; +} +{ +sub.f16x2 %56, r5903, r6219; +} +{ +sub.f16x2 %57, r5906, r6225; +} +{ +add.f16x2 %28, r5915, r6235; +} +{ +add.f16x2 %29, r5918, r6241; +} +{ +sub.f16x2 %60, r5915, r6235; +} +{ +sub.f16x2 %61, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6362; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; +} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, r6356; +} +{ +add.f16x2 r6379, r6347, r6365; +} +{ +add.f16x2 r6382, r6350, r6359; +} +{ +sub.f16x2 r6385, r6347, r6365; +} +{ +sub.f16x2 r6388, r6350, r6359; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6412; +} +{ +add.f16x2 r6417, r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 
r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6415; +} +{ +add.f16x2 r6432, r6400, r6409; +} +{ +sub.f16x2 r6435, r6397, r6415; +} +{ +sub.f16x2 r6438, r6400, r6409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6426; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ +add.f16x2 r6513, r6373, r6471; +} +{ +add.f16x2 r6516, r6376, r6423; +} +{ +sub.f16x2 r6519, r6373, r6471; +} +{ +sub.f16x2 r6522, r6376, r6423; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, 
r6553, r6554; +} +{ +neg.f16x2 r6561, r6558; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6561; +} +{ +add.f16x2 r6578, r6546, r6555; +} +{ +sub.f16x2 r6581, r6543, r6561; +} +{ +sub.f16x2 r6584, r6546, r6555; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6608; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6611; +} +{ +add.f16x2 r6628, r6596, r6605; +} +{ +sub.f16x2 r6631, r6593, r6611; +} +{ +sub.f16x2 r6634, r6596, r6605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6622; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ 
+sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6667; +} +{ +add.f16x2 r6712, r6572, r6619; +} +{ +sub.f16x2 r6715, r6569, r6667; +} +{ +sub.f16x2 r6718, r6572, r6619; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1038; +cvt.rn.f16.f32 high, f1038; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1040; +cvt.rn.f16.f32 high, f1040; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1042; +cvt.rn.f16.f32 high, f1042; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1044; +cvt.rn.f16.f32 high, f1044; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} +{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, 
r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6694; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 %2, r6489, r6685; +} +{ +add.f16x2 %3, r6492, r6688; +} +{ +sub.f16x2 %34, r6489, r6685; +} +{ +sub.f16x2 %35, r6492, r6688; +} +{ +add.f16x2 %6, r6501, r6769; +} +{ +add.f16x2 %7, r6504, r6775; +} +{ +sub.f16x2 %38, r6501, r6769; +} +{ +sub.f16x2 %39, r6504, r6775; +} +{ +add.f16x2 %10, r6513, r6785; +} +{ +add.f16x2 %11, r6516, r6791; +} +{ +sub.f16x2 %42, r6513, r6785; +} +{ +sub.f16x2 %43, r6516, r6791; +} +{ +add.f16x2 %14, r6525, r6801; +} +{ +add.f16x2 %15, r6528, r6807; +} +{ +sub.f16x2 %46, r6525, r6801; +} +{ +sub.f16x2 %47, r6528, r6807; +} +{ +add.f16x2 %18, r6495, r6811; +} +{ +add.f16x2 %19, r6498, r6691; +} +{ +sub.f16x2 %50, r6495, r6811; +} +{ +sub.f16x2 %51, r6498, r6691; +} +{ +add.f16x2 %22, r6507, r6819; +} +{ +add.f16x2 %23, r6510, r6825; +} +{ +sub.f16x2 %54, r6507, r6819; +} +{ +sub.f16x2 %55, r6510, r6825; +} +{ +add.f16x2 %26, r6519, r6835; +} +{ 
+add.f16x2 %27, r6522, r6841; +} +{ +sub.f16x2 %58, r6519, r6835; +} +{ +sub.f16x2 %59, r6522, r6841; +} +{ +add.f16x2 %30, r6531, r6851; +} +{ +add.f16x2 %31, r6534, r6857; +} +{ +sub.f16x2 %62, r6531, r6851; +} +{ +sub.f16x2 %63, r6534, r6857; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), 
"=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), 
"r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..1e88f6652aad0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_fwd.hpp.inc @@ -0,0 +1,6745 @@ +#ifndef CUFFTDX_FFT_16384_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_16384_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1150, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2821>; +.reg .b32 r<38>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2813, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2811, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2810, f2813, f2811; +sub.f32 f140, f2813, f2811; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2809, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2806, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2804, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, 
f145, f149; +add.f32 f2803, f2806, f2804; +sub.f32 f156, f2806, f2804; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2802, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2802, 0fBF3504F3; +mul.f32 f2801, f157, 0f3F3504F3; +sub.f32 f163, f2801, f162; +mul.f32 f164, f2802, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2800, f2810, f2803; +sub.f32 f173, f2810, f2803; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2799, f2809, f165; +sub.f32 f177, f2809, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2798, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2797, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2795, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2792, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2791, f2795, f2792; +sub.f32 f197, f2795, f2792; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2790, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2788, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2786, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2785, f2788, f2786; +sub.f32 f213, f2788, f2786; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2784, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2784, 0fBF3504F3; +mul.f32 f2783, f214, 0f3F3504F3; +sub.f32 f220, f2783, f219; +mul.f32 f221, f2784, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 
f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2782, f2791, f2785; +sub.f32 f230, f2791, f2785; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2781, f2790, f222; +sub.f32 f234, f2790, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2780, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2779, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2777, f231, 0f3F6C835E; +mul.f32 f2778, f2781, 0fBEC3EF15; +sub.f32 f245, f2777, f2778; +mul.f32 f246, f2781, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2775, f235, 0f3F3504F3; +mul.f32 f2776, f2780, 0fBF3504F3; +sub.f32 f250, f2775, f2776; +mul.f32 f251, f2780, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2773, f239, 0f3EC3EF15; +mul.f32 f2774, f2779, 0fBF6C835E; +sub.f32 f255, f2773, f2774; +mul.f32 f256, f2779, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2771, f233, 0fBEC3EF15; +mul.f32 f2772, f234, 0fBF6C835E; +sub.f32 f260, f2771, f2772; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2769, f241, 0fBF6C835E; +mul.f32 f2770, f242, 0fBEC3EF15; +sub.f32 f269, f2769, f2770; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2768, f2800, f2782; +sub.f32 f275, f2800, f2782; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2767, f2799, f247; +sub.f32 f279, f2799, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2766, f2798, f252; +sub.f32 f283, f2798, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2765, f2797, f257; +sub.f32 f287, f2797, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, 
f230; +sub.f32 f2764, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2763, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2762, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2761, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2758, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2756, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2755, f2758, f2756; +sub.f32 f315, f2758, f2756; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2754, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2752, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2749, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2748, f2752, f2749; +sub.f32 f331, f2752, f2749; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2747, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2745, f332, 0f3F3504F3; +mul.f32 f2746, f2747, 0fBF3504F3; +sub.f32 f338, f2745, f2746; +mul.f32 f339, f2747, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2744, f2755, f2748; +sub.f32 f348, f2755, f2748; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2743, f2754, f340; +sub.f32 f352, f2754, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2742, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2741, f319, f344; +sub.f32 f360, f319, f344; 
+add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2739, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2737, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2736, f2739, f2737; +sub.f32 f372, f2739, f2737; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2735, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2732, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2731, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2730, f2732, f2731; +sub.f32 f388, f2732, f2731; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2729, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2727, f389, 0f3F3504F3; +mul.f32 f2728, f2729, 0fBF3504F3; +sub.f32 f395, f2727, f2728; +mul.f32 f396, f2729, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2726, f2736, f2730; +sub.f32 f405, f2736, f2730; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2725, f2735, f397; +sub.f32 f409, f2735, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2724, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2723, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2725, 0fBEC3EF15; +mul.f32 f2722, f406, 0f3F6C835E; +sub.f32 f420, f2722, f419; +mul.f32 f421, f2725, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2724, 0fBF3504F3; +mul.f32 f2721, f410, 0f3F3504F3; +sub.f32 f425, f2721, f424; +mul.f32 f426, f2724, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2719, f414, 0f3EC3EF15; +mul.f32 f2720, 
f2723, 0fBF6C835E; +sub.f32 f430, f2719, f2720; +mul.f32 f431, f2723, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2717, f408, 0fBEC3EF15; +mul.f32 f2718, f409, 0fBF6C835E; +sub.f32 f435, f2717, f2718; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2716, f416, 0fBF6C835E; +sub.f32 f444, f2716, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2715, f2744, f2726; +sub.f32 f450, f2744, f2726; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2714, f2743, f422; +sub.f32 f454, f2743, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2713, f2742, f427; +sub.f32 f458, f2742, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2712, f2741, f432; +sub.f32 f462, f2741, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2711, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2710, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2709, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2708, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2714, 0fBE47C5C2; +mul.f32 f2707, f451, 0f3F7B14BE; +sub.f32 f481, f2707, f480; +mul.f32 f482, f2714, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2713, 0fBEC3EF15; +mul.f32 f2706, f455, 0f3F6C835E; +sub.f32 f486, f2706, f485; +mul.f32 f487, f2713, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2712, 0fBF0E39DA; +mul.f32 f2705, f459, 0f3F54DB31; +sub.f32 f491, f2705, f490; +mul.f32 f492, f2712, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, 
f2711, 0fBF3504F3; +mul.f32 f2704, f463, 0f3F3504F3; +sub.f32 f496, f2704, f495; +mul.f32 f497, f2711, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2702, f467, 0f3F0E39DA; +mul.f32 f2703, f2710, 0fBF54DB31; +sub.f32 f501, f2702, f2703; +mul.f32 f502, f2710, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2700, f471, 0f3EC3EF15; +mul.f32 f2701, f2709, 0fBF6C835E; +sub.f32 f506, f2700, f2701; +mul.f32 f507, f2709, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2698, f475, 0f3E47C5C2; +mul.f32 f2699, f2708, 0fBF7B14BE; +sub.f32 f511, f2698, f2699; +mul.f32 f512, f2708, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2696, f453, 0fBE47C5C2; +mul.f32 f2697, f454, 0fBF7B14BE; +sub.f32 f516, f2696, f2697; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2695, f457, 0fBEC3EF15; +sub.f32 f521, f2695, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2694, f461, 0fBF0E39DA; +sub.f32 f526, f2694, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2692, f469, 0fBF54DB31; +mul.f32 f2693, f470, 0fBF0E39DA; +sub.f32 f535, f2692, f2693; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2691, f473, 0fBF6C835E; +sub.f32 f540, f2691, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2690, f477, 0fBF7B14BE; +sub.f32 f545, f2690, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2689, f2767, f483; +sub.f32 f553, f2767, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 
f2688, f2766, f488; +sub.f32 f557, f2766, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2687, f2765, f493; +sub.f32 f561, f2765, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2686, f2764, f498; +sub.f32 f565, f2764, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f2685, f2763, f503; +sub.f32 f569, f2763, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f2684, f2762, f508; +sub.f32 f573, f2762, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f2683, f2761, f513; +sub.f32 f577, f2761, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f2682, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f2681, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f2680, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f2679, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f2678, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2677, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2676, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2675, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f2689; +mul.f32 f616, f610, f2689; +mul.f32 f618, f611, f611; +mul.f32 f2674, f610, f610; +sub.f32 f619, f2674, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f2688; +mul.f32 f624, f619, f2688; +mul.f32 f626, f611, 
f621; +mul.f32 f2673, f610, f619; +sub.f32 f627, f2673, f626; +mul.f32 f2672, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f2687; +mul.f32 f632, f627, f2687; +mul.f32 f2670, f610, f627; +mul.f32 f2671, f611, f629; +sub.f32 f635, f2670, f2671; +mul.f32 f2669, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f2686; +mul.f32 f640, f635, f2686; +mul.f32 f642, f611, f637; +mul.f32 f2668, f610, f635; +sub.f32 f643, f2668, f642; +mul.f32 f2667, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f2685; +mul.f32 f648, f643, f2685; +mul.f32 f2665, f610, f643; +mul.f32 f2666, f611, f645; +sub.f32 f651, f2665, f2666; +mul.f32 f2664, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f2684; +mul.f32 f656, f651, f2684; +mul.f32 f658, f611, f653; +mul.f32 f2663, f610, f651; +sub.f32 f659, f2663, f658; +mul.f32 f2662, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f2683; +mul.f32 f664, f659, f2683; +mul.f32 f666, f611, f661; +mul.f32 f2661, f610, f659; +sub.f32 f667, f2661, f666; +mul.f32 f2660, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f2682; +mul.f32 f672, f667, f2682; +mul.f32 f2658, f610, f667; +mul.f32 f2659, f611, f669; +sub.f32 f675, f2658, f2659; +mul.f32 f2657, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f2681; +mul.f32 f680, f675, f2681; +mul.f32 f682, f611, f677; +mul.f32 f2656, f610, f675; +sub.f32 f683, f2656, f682; +mul.f32 f2655, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f2680; +mul.f32 f688, f683, f2680; +mul.f32 f690, f611, f685; +mul.f32 f2654, f610, f683; +sub.f32 f691, f2654, f690; +mul.f32 f2653, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; 
+mul.f32 f695, f693, f2679; +mul.f32 f696, f691, f2679; +mul.f32 f2651, f610, f691; +mul.f32 f2652, f611, f693; +sub.f32 f699, f2651, f2652; +mul.f32 f2650, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f2678; +mul.f32 f704, f699, f2678; +mul.f32 f706, f611, f701; +mul.f32 f2649, f610, f699; +sub.f32 f707, f2649, f706; +mul.f32 f2648, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f2677; +mul.f32 f712, f707, f2677; +mul.f32 f2646, f610, f707; +mul.f32 f2647, f611, f709; +sub.f32 f715, f2646, f2647; +mul.f32 f2645, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f2676; +mul.f32 f720, f715, f2676; +mul.f32 f722, f611, f717; +mul.f32 f2644, f610, f715; +sub.f32 f723, f2644, f722; +mul.f32 f2643, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f2675; +mul.f32 f728, f723, f2675; +mul.f32 f730, f611, f725; +mul.f32 f2642, f610, f723; +sub.f32 f731, f2642, f730; +mul.f32 f2641, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2640, f2768, f2715; +mul.f32 f735, f733, f2640; +mul.f32 f736, f731, f2640; +mul.f32 f2638, f610, f731; +mul.f32 f2639, f611, f733; +sub.f32 f739, f2638, f2639; +sub.f32 f2637, f272, f447; +mul.f32 f2636, f731, f2637; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2635, f610, f739; +sub.f32 f747, f2635, f746; +mul.f32 f2634, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2633, f610, f747; +sub.f32 f755, f2633, f754; +mul.f32 f2632, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f2630, f610, f755; +mul.f32 f2631, 
f611, f757; +sub.f32 f763, f2630, f2631; +mul.f32 f2629, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2628, f610, f763; +sub.f32 f771, f2628, f770; +mul.f32 f2627, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f2625, f610, f771; +mul.f32 f2626, f611, f773; +sub.f32 f779, f2625, f2626; +mul.f32 f2624, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2623, f610, f779; +sub.f32 f787, f2623, f786; +mul.f32 f2622, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2621, f610, f787; +sub.f32 f795, f2621, f794; +mul.f32 f2620, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f2618, f610, f795; +mul.f32 f2619, f611, f797; +sub.f32 f803, f2618, f2619; +mul.f32 f2617, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2616, f610, f803; +sub.f32 f811, f2616, f810; +mul.f32 f2615, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2614, f610, f811; +sub.f32 f819, f2614, f818; +mul.f32 f2613, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f2611, f610, f819; +mul.f32 f2612, f611, f821; +sub.f32 f827, f2611, f2612; +mul.f32 f2610, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 
f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2609, f610, f827; +sub.f32 f835, f2609, f834; +mul.f32 f2608, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f2606, f610, f835; +mul.f32 f2607, f611, f837; +sub.f32 f843, f2606, f2607; +mul.f32 f2605, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2604, f610, f843; +sub.f32 f851, f2604, f850; +mul.f32 f2603, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f2602, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +sub.f32 f2820, f2768, f2715; +mul.f32 f2819, f733, f2820; +shl.b32 r27, r22, 8; +barrier.sync 0; +and.b32 r11, r27, 130816; +add.s32 r12, r9, r11; +add.f32 f857, f2768, f2715; +sub.f32 f2817, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r37, %tid.x; +shl.b32 r31, r37, 3; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f2603, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f2672, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f2669, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f2667, f639; +sub.f32 f867, f2664, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f2662, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f2660, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f2657, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f2655, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f2653, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f2650, 
f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f2648, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f2645, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f2643, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f2641, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f2817, f736; +sub.f32 f890, f2636, f2819; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f2634, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f2632, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f2629, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f2627, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f2624, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f2622, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f2620, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f2617, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f2615, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f2613, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f2610, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f2608, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f2605, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f2602, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +and.b32 r21, r37, 511; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, 
[r13]; +ld.shared.v2.f32 {f925, f926}, [r13+4096]; +ld.shared.v2.f32 {f929, f930}, [r13+8192]; +ld.shared.v2.f32 {f933, f934}, [r13+12288]; +ld.shared.v2.f32 {f937, f938}, [r13+16384]; +ld.shared.v2.f32 {f941, f942}, [r13+20480]; +ld.shared.v2.f32 {f945, f946}, [r13+24576]; +ld.shared.v2.f32 {f949, f950}, [r13+28672]; +ld.shared.v2.f32 {f953, f954}, [r13+32768]; +ld.shared.v2.f32 {f957, f958}, [r13+36864]; +ld.shared.v2.f32 {f961, f962}, [r13+40960]; +ld.shared.v2.f32 {f965, f966}, [r13+45056]; +ld.shared.v2.f32 {f969, f970}, [r13+49152]; +ld.shared.v2.f32 {f973, f974}, [r13+53248]; +ld.shared.v2.f32 {f977, f978}, [r13+57344]; +ld.shared.v2.f32 {f981, f982}, [r13+61440]; +ld.shared.v2.f32 {f985, f986}, [r13+65536]; +ld.shared.v2.f32 {f989, f990}, [r13+69632]; +ld.shared.v2.f32 {f993, f994}, [r13+73728]; +ld.shared.v2.f32 {f997, f998}, [r13+77824]; +ld.shared.v2.f32 {f1001, f1002}, [r13+81920]; +ld.shared.v2.f32 {f1005, f1006}, [r13+86016]; +ld.shared.v2.f32 {f1009, f1010}, [r13+90112]; +ld.shared.v2.f32 {f1013, f1014}, [r13+94208]; +ld.shared.v2.f32 {f1017, f1018}, [r13+98304]; +ld.shared.v2.f32 {f1021, f1022}, [r13+102400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+106496]; +ld.shared.v2.f32 {f1029, f1030}, [r13+110592]; +ld.shared.v2.f32 {f1033, f1034}, [r13+114688]; +ld.shared.v2.f32 {f1037, f1038}, [r13+118784]; +ld.shared.v2.f32 {f1041, f1042}, [r13+122880]; +ld.shared.v2.f32 {f1045, f1046}, [r13+126976]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2601, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2600, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2599, f2601, f2600; +sub.f32 f1060, f2601, f2600; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f2598, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2597, f938, f1002; +sub.f32 
f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2596, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2595, f2597, f2596; +sub.f32 f1076, f2597, f2596; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f2594, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f2594, 0fBF3504F3; +mul.f32 f2593, f1077, 0f3F3504F3; +sub.f32 f1083, f2593, f1082; +mul.f32 f1084, f2594, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2592, f2599, f2595; +sub.f32 f1093, f2599, f2595; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2591, f2598, f1085; +sub.f32 f1097, f2598, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f2590, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f2589, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2588, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2587, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2586, f2588, f2587; +sub.f32 f1117, f2588, f2587; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f2585, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2584, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2583, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2582, f2584, f2583; +sub.f32 f1133, f2584, f2583; +add.f32 
f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f2581, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f2581, 0fBF3504F3; +mul.f32 f2580, f1134, 0f3F3504F3; +sub.f32 f1140, f2580, f1139; +mul.f32 f1141, f2581, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2579, f2586, f2582; +sub.f32 f1150, f2586, f2582; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2578, f2585, f1142; +sub.f32 f1154, f2585, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f2577, f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f2576, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2574, f1151, 0f3F6C835E; +mul.f32 f2575, f2578, 0fBEC3EF15; +sub.f32 f1165, f2574, f2575; +mul.f32 f1166, f2578, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f2572, f1155, 0f3F3504F3; +mul.f32 f2573, f2577, 0fBF3504F3; +sub.f32 f1170, f2572, f2573; +mul.f32 f1171, f2577, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f2576, 0fBF6C835E; +mul.f32 f2571, f1159, 0f3EC3EF15; +sub.f32 f1175, f2571, f1174; +mul.f32 f1176, f2576, 0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f2570, f1153, 0fBEC3EF15; +sub.f32 f1180, f2570, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f2568, f1161, 0fBF6C835E; +mul.f32 f2569, f1162, 0fBEC3EF15; +sub.f32 f1189, f2568, f2569; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 
f2567, f2592, f2579; +sub.f32 f1195, f2592, f2579; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2566, f2591, f1167; +sub.f32 f1199, f2591, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2565, f2590, f1172; +sub.f32 f1203, f2590, f1172; +add.f32 f1204, f1102, f1175; +sub.f32 f1206, f1102, f1175; +add.f32 f2564, f2589, f1177; +sub.f32 f1207, f2589, f1177; +add.f32 f1208, f1092, f1150; +sub.f32 f1210, f1092, f1150; +sub.f32 f2563, f1093, f1149; +add.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1180; +sub.f32 f1214, f1096, f1180; +add.f32 f2562, f1097, f1182; +sub.f32 f1215, f1097, f1182; +add.f32 f1216, f1100, f1185; +sub.f32 f1218, f1100, f1185; +add.f32 f2561, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2560, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2559, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2558, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2557, f2559, f2558; +sub.f32 f1235, f2559, f2558; +add.f32 f1236, f1226, f1231; +sub.f32 f1238, f1226, f1231; +sub.f32 f2556, f1227, f1230; +add.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2555, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2554, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2553, f2555, f2554; +sub.f32 f1251, f2555, f2554; +add.f32 f1252, f1242, f1247; +sub.f32 f1254, f1242, f1247; +sub.f32 f2552, f1243, f1246; +add.f32 f1255, f1243, f1246; +mul.f32 f1257, f2552, 0fBF3504F3; +mul.f32 f2551, f1252, 0f3F3504F3; +sub.f32 f1258, f2551, f1257; +mul.f32 f1259, f2552, 0f3F3504F3; +fma.rn.f32 f1260, f1252, 
0fBF3504F3, f1259; +mul.f32 f1261, f1254, 0fBF3504F3; +mul.f32 f1262, f1255, 0fBF3504F3; +sub.f32 f1263, f1261, f1262; +add.f32 f1264, f1261, f1262; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2550, f2557, f2553; +sub.f32 f1268, f2557, f2553; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2549, f2556, f1260; +sub.f32 f1272, f2556, f1260; +add.f32 f1273, f1234, f1251; +sub.f32 f1275, f1234, f1251; +sub.f32 f2548, f1235, f1250; +add.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1263; +sub.f32 f1279, f1238, f1263; +add.f32 f2547, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2546, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2545, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2544, f2546, f2545; +sub.f32 f1292, f2546, f2545; +add.f32 f1293, f1283, f1288; +sub.f32 f1295, f1283, f1288; +sub.f32 f2543, f1284, f1287; +add.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2542, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2541, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2540, f2542, f2541; +sub.f32 f1308, f2542, f2541; +add.f32 f1309, f1299, f1304; +sub.f32 f1311, f1299, f1304; +sub.f32 f2539, f1300, f1303; +add.f32 f1312, f1300, f1303; +mul.f32 f1314, f2539, 0fBF3504F3; +mul.f32 f2538, f1309, 0f3F3504F3; +sub.f32 f1315, f2538, f1314; +mul.f32 f1316, f2539, 0f3F3504F3; +fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316; +mul.f32 f1318, f1311, 0fBF3504F3; +mul.f32 f1319, f1312, 0fBF3504F3; +sub.f32 f1320, f1318, f1319; +add.f32 f1321, f1318, f1319; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2537, f2544, f2540; +sub.f32 f1325, f2544, f2540; 
+add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2536, f2543, f1317; +sub.f32 f1329, f2543, f1317; +add.f32 f1330, f1291, f1308; +sub.f32 f1332, f1291, f1308; +sub.f32 f2535, f1292, f1307; +add.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1320; +sub.f32 f1336, f1295, f1320; +add.f32 f2534, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2532, f1326, 0f3F6C835E; +mul.f32 f2533, f2536, 0fBEC3EF15; +sub.f32 f1340, f2532, f2533; +mul.f32 f1341, f2536, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341; +mul.f32 f2530, f1330, 0f3F3504F3; +mul.f32 f2531, f2535, 0fBF3504F3; +sub.f32 f1345, f2530, f2531; +mul.f32 f1346, f2535, 0f3F3504F3; +fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346; +mul.f32 f2528, f1334, 0f3EC3EF15; +mul.f32 f2529, f2534, 0fBF6C835E; +sub.f32 f1350, f2528, f2529; +mul.f32 f1351, f2534, 0f3EC3EF15; +fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351; +mul.f32 f2526, f1328, 0fBEC3EF15; +mul.f32 f2527, f1329, 0fBF6C835E; +sub.f32 f1355, f2526, f2527; +mul.f32 f1356, f1329, 0fBEC3EF15; +fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356; +mul.f32 f1358, f1332, 0fBF3504F3; +mul.f32 f1359, f1333, 0fBF3504F3; +sub.f32 f1360, f1358, f1359; +add.f32 f1361, f1358, f1359; +mul.f32 f2524, f1336, 0fBF6C835E; +mul.f32 f2525, f1337, 0fBEC3EF15; +sub.f32 f1364, f2524, f2525; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2523, f2550, f2537; +sub.f32 f1370, f2550, f2537; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2522, f2549, f1342; +sub.f32 f1374, f2549, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2521, f2548, f1347; +sub.f32 f1378, f2548, f1347; +add.f32 f1379, f1277, f1350; +sub.f32 f1381, f1277, f1350; +add.f32 f2520, f2547, f1352; +sub.f32 f1382, f2547, f1352; +add.f32 f1383, f1267, f1325; +sub.f32 f1385, f1267, f1325; +sub.f32 f2519, f1268, f1324; +add.f32 f1386, f1268, f1324; +add.f32 
f1387, f1271, f1355; +sub.f32 f1389, f1271, f1355; +add.f32 f2518, f1272, f1357; +sub.f32 f1390, f1272, f1357; +add.f32 f1391, f1275, f1360; +sub.f32 f1393, f1275, f1360; +add.f32 f2517, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2516, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2522, 0fBE47C5C2; +mul.f32 f2515, f1371, 0f3F7B14BE; +sub.f32 f1401, f2515, f1400; +mul.f32 f1402, f2522, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402; +mul.f32 f1405, f2521, 0fBEC3EF15; +mul.f32 f2514, f1375, 0f3F6C835E; +sub.f32 f1406, f2514, f1405; +mul.f32 f1407, f2521, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407; +mul.f32 f2512, f1379, 0f3F54DB31; +mul.f32 f2513, f2520, 0fBF0E39DA; +sub.f32 f1411, f2512, f2513; +mul.f32 f1412, f2520, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412; +mul.f32 f2510, f1383, 0f3F3504F3; +mul.f32 f2511, f2519, 0fBF3504F3; +sub.f32 f1416, f2510, f2511; +mul.f32 f1417, f2519, 0f3F3504F3; +fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417; +mul.f32 f2508, f1387, 0f3F0E39DA; +mul.f32 f2509, f2518, 0fBF54DB31; +sub.f32 f1421, f2508, f2509; +mul.f32 f1422, f2518, 0f3F0E39DA; +fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422; +mul.f32 f2506, f1391, 0f3EC3EF15; +mul.f32 f2507, f2517, 0fBF6C835E; +sub.f32 f1426, f2506, f2507; +mul.f32 f1427, f2517, 0f3EC3EF15; +fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427; +mul.f32 f1430, f2516, 0fBF7B14BE; +mul.f32 f2505, f1395, 0f3E47C5C2; +sub.f32 f1431, f2505, f1430; +mul.f32 f1432, f2516, 0f3E47C5C2; +fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432; +mul.f32 f1435, f1374, 0fBF7B14BE; +mul.f32 f2504, f1373, 0fBE47C5C2; +sub.f32 f1436, f2504, f1435; +mul.f32 f1437, f1374, 0fBE47C5C2; +fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437; +mul.f32 f1440, f1378, 0fBF6C835E; +mul.f32 f2503, f1377, 0fBEC3EF15; +sub.f32 f1441, f2503, f1440; +mul.f32 f1442, f1378, 0fBEC3EF15; +fma.rn.f32 f1443, f1377, 0fBF6C835E, f1442; +mul.f32 f1445, f1382, 
0fBF54DB31; +mul.f32 f2502, f1381, 0fBF0E39DA; +sub.f32 f1446, f2502, f1445; +mul.f32 f1447, f1382, 0fBF0E39DA; +fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447; +mul.f32 f1449, f1385, 0fBF3504F3; +mul.f32 f1450, f1386, 0fBF3504F3; +sub.f32 f1451, f1449, f1450; +add.f32 f1452, f1449, f1450; +mul.f32 f1454, f1390, 0fBF0E39DA; +mul.f32 f2501, f1389, 0fBF54DB31; +sub.f32 f1455, f2501, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456; +mul.f32 f1459, f1394, 0fBEC3EF15; +mul.f32 f2500, f1393, 0fBF6C835E; +sub.f32 f1460, f2500, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461; +mul.f32 f1464, f1398, 0fBE47C5C2; +mul.f32 f2499, f1397, 0fBF7B14BE; +sub.f32 f1465, f2499, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2498, f2566, f1403; +sub.f32 f1473, f2566, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2497, f2565, f1408; +sub.f32 f1477, f2565, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2496, f2564, f1413; +sub.f32 f1481, f2564, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2495, f2563, f1418; +sub.f32 f1485, f2563, f1418; +add.f32 f1486, f1212, f1421; +sub.f32 f1488, f1212, f1421; +add.f32 f2494, f2562, f1423; +sub.f32 f1489, f2562, f1423; +add.f32 f1490, f1216, f1426; +sub.f32 f1492, f1216, f1426; +add.f32 f2493, f2561, f1428; +sub.f32 f1493, f2561, f1428; +add.f32 f1494, f1220, f1431; +sub.f32 f1496, f1220, f1431; +add.f32 f2492, f2560, f1433; +sub.f32 f1497, f2560, f1433; +add.f32 f1498, f1194, f1370; +sub.f32 f1500, f1194, f1370; +sub.f32 f2491, f1195, f1369; +add.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1436; +sub.f32 f1504, f1198, f1436; +add.f32 f2490, f1199, f1438; +sub.f32 f1505, f1199, f1438; +add.f32 f1506, f1202, f1441; +sub.f32 f1508, f1202, f1441; +add.f32 f2489, f1203, f1443; 
+sub.f32 f1509, f1203, f1443; +add.f32 f1510, f1206, f1446; +sub.f32 f1512, f1206, f1446; +add.f32 f2488, f1207, f1448; +sub.f32 f1513, f1207, f1448; +add.f32 f1514, f1210, f1451; +sub.f32 f1516, f1210, f1451; +add.f32 f2487, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2486, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2485, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2484, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r37, 480; +bfe.u32 r15, r37, 5, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1535, f1531, f2498; +mul.f32 f1536, f1530, f2498; +mul.f32 f2482, f1530, f1530; +mul.f32 f2483, f1531, f1531; +sub.f32 f1539, f2482, f2483; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1543, f1541, f2497; +mul.f32 f1544, f1539, f2497; +mul.f32 f1546, f1531, f1541; +mul.f32 f2481, f1530, f1539; +sub.f32 f1547, f2481, f1546; +mul.f32 f2480, f1539, f1474; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1551, f1549, f2496; +mul.f32 f1552, f1547, f2496; +mul.f32 f1554, f1531, f1549; +mul.f32 f2479, f1530, f1547; +sub.f32 f1555, f2479, f1554; +mul.f32 f2478, f1547, f1478; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1559, f1557, f2495; +mul.f32 f1560, f1555, f2495; +mul.f32 f2476, f1530, f1555; +mul.f32 f2477, f1531, f1557; +sub.f32 f1563, f2476, f2477; +mul.f32 f2475, f1555, f1482; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1567, f1565, f2494; +mul.f32 f1568, f1563, f2494; +mul.f32 f1570, f1531, f1565; +mul.f32 f2474, f1530, f1563; +sub.f32 f1571, f2474, f1570; +mul.f32 f2473, f1563, f1486; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, 
f1563, f1572; +mul.f32 f1575, f1573, f2493; +mul.f32 f1576, f1571, f2493; +mul.f32 f1578, f1531, f1573; +mul.f32 f2472, f1530, f1571; +sub.f32 f1579, f2472, f1578; +mul.f32 f2471, f1571, f1490; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1583, f1581, f2492; +mul.f32 f1584, f1579, f2492; +mul.f32 f2469, f1530, f1579; +mul.f32 f2470, f1531, f1581; +sub.f32 f1587, f2469, f2470; +mul.f32 f2468, f1579, f1494; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1591, f1589, f2491; +mul.f32 f1592, f1587, f2491; +mul.f32 f1594, f1531, f1589; +mul.f32 f2467, f1530, f1587; +sub.f32 f1595, f2467, f1594; +mul.f32 f2466, f1587, f1498; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1599, f1597, f2490; +mul.f32 f1600, f1595, f2490; +mul.f32 f2464, f1530, f1595; +mul.f32 f2465, f1531, f1597; +sub.f32 f1603, f2464, f2465; +mul.f32 f2463, f1595, f1502; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1607, f1605, f2489; +mul.f32 f1608, f1603, f2489; +mul.f32 f1610, f1531, f1605; +mul.f32 f2462, f1530, f1603; +sub.f32 f1611, f2462, f1610; +mul.f32 f2461, f1603, f1506; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1615, f1613, f2488; +mul.f32 f1616, f1611, f2488; +mul.f32 f1618, f1531, f1613; +mul.f32 f2460, f1530, f1611; +sub.f32 f1619, f2460, f1618; +mul.f32 f2459, f1611, f1510; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1623, f1621, f2487; +mul.f32 f1624, f1619, f2487; +mul.f32 f2457, f1530, f1619; +mul.f32 f2458, f1531, f1621; +sub.f32 f1627, f2457, f2458; +mul.f32 f2456, f1619, f1514; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1631, f1629, f2486; +mul.f32 f1632, f1627, f2486; +mul.f32 f1634, f1531, f1629; +mul.f32 f2455, f1530, f1627; +sub.f32 f1635, f2455, f1634; +mul.f32 f2454, f1627, f1518; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, 
f1627, f1636; +mul.f32 f1639, f1637, f2485; +mul.f32 f1640, f1635, f2485; +mul.f32 f1642, f1531, f1637; +mul.f32 f2453, f1530, f1635; +sub.f32 f1643, f2453, f1642; +mul.f32 f2452, f1635, f1522; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1647, f1645, f2484; +mul.f32 f1648, f1643, f2484; +mul.f32 f2450, f1530, f1643; +mul.f32 f2451, f1531, f1645; +sub.f32 f1651, f2450, f2451; +mul.f32 f2449, f1643, f1526; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2448, f2567, f2523; +mul.f32 f1655, f1653, f2448; +mul.f32 f1656, f1651, f2448; +mul.f32 f1658, f1531, f1653; +mul.f32 f2447, f1530, f1651; +sub.f32 f1659, f2447, f1658; +sub.f32 f2446, f1192, f1367; +mul.f32 f2445, f1651, f2446; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1663, f1661, f1473; +mul.f32 f1664, f1659, f1473; +mul.f32 f2443, f1530, f1659; +mul.f32 f2444, f1531, f1661; +sub.f32 f1667, f2443, f2444; +mul.f32 f2442, f1659, f1472; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1671, f1669, f1477; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2441, f1530, f1667; +sub.f32 f1675, f2441, f1674; +mul.f32 f2440, f1667, f1476; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1679, f1677, f1481; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2439, f1530, f1675; +sub.f32 f1683, f2439, f1682; +mul.f32 f2438, f1675, f1480; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1687, f1685, f1485; +mul.f32 f1688, f1683, f1485; +mul.f32 f2436, f1530, f1683; +mul.f32 f2437, f1531, f1685; +sub.f32 f1691, f2436, f2437; +mul.f32 f2435, f1683, f1484; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1695, f1693, f1489; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2434, f1530, f1691; +sub.f32 f1699, f2434, f1698; +mul.f32 f2433, f1691, 
f1488; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1703, f1701, f1493; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2432, f1530, f1699; +sub.f32 f1707, f2432, f1706; +mul.f32 f2431, f1699, f1492; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1711, f1709, f1497; +mul.f32 f1712, f1707, f1497; +mul.f32 f2429, f1530, f1707; +mul.f32 f2430, f1531, f1709; +sub.f32 f1715, f2429, f2430; +mul.f32 f2428, f1707, f1496; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1719, f1717, f1501; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2427, f1530, f1715; +sub.f32 f1723, f2427, f1722; +mul.f32 f2426, f1715, f1500; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1727, f1725, f1505; +mul.f32 f1728, f1723, f1505; +mul.f32 f2424, f1530, f1723; +mul.f32 f2425, f1531, f1725; +sub.f32 f1731, f2424, f2425; +mul.f32 f2423, f1723, f1504; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1735, f1733, f1509; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2422, f1530, f1731; +sub.f32 f1739, f2422, f1738; +mul.f32 f2421, f1731, f1508; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1743, f1741, f1513; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2420, f1530, f1739; +sub.f32 f1747, f2420, f1746; +mul.f32 f2419, f1739, f1512; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1751, f1749, f1517; +mul.f32 f1752, f1747, f1517; +mul.f32 f2417, f1530, f1747; +mul.f32 f2418, f1531, f1749; +sub.f32 f1755, f2417, f2418; +mul.f32 f2416, f1747, f1516; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1759, f1757, f1521; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2415, f1530, f1755; +sub.f32 f1763, f2415, f1762; +mul.f32 f2414, f1755, 
f1520; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1767, f1765, f1525; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2413, f1530, f1763; +sub.f32 f1771, f2413, f1770; +mul.f32 f2412, f1530, f1470; +mul.f32 f1772, f1530, f1765; +mul.f32 f2411, f1763, f1524; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1771, f1528; +mul.f32 f1775, f1773, f1529; +mul.f32 f1776, f1771, f1529; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 8; +and.b32 r16, r31, 248; +add.s32 r17, r9, r16; +mov.u32 r26, %tid.x; +shl.b32 r25, r26, 8; +barrier.sync 0; +and.b32 r18, r25, 122880; +add.s32 r19, r17, r18; +sub.f32 f2816, f2567, f2523; +mul.f32 f2815, f1653, f2816; +add.f32 f1777, f2567, f2523; +sub.f32 f2818, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r30, %tid.x; +and.b32 r29, r30, 480; +fma.rn.f32 f1779, f1531, f1470, f1536; +sub.f32 f1780, f2412, f1535; +st.shared.v2.f32 [r19+256], {f1780, f1779}; +fma.rn.f32 f1781, f1541, f1474, f1544; +sub.f32 f1782, f2480, f1543; +st.shared.v2.f32 [r19+512], {f1782, f1781}; +fma.rn.f32 f1783, f1549, f1478, f1552; +sub.f32 f1784, f2478, f1551; +st.shared.v2.f32 [r19+768], {f1784, f1783}; +fma.rn.f32 f1785, f1557, f1482, f1560; +sub.f32 f1786, f2475, f1559; +st.shared.v2.f32 [r19+1024], {f1786, f1785}; +fma.rn.f32 f1787, f1565, f1486, f1568; +sub.f32 f1788, f2473, f1567; +st.shared.v2.f32 [r19+1280], {f1788, f1787}; +sub.f32 f1789, f2471, f1575; +fma.rn.f32 f1790, f1573, f1490, f1576; +st.shared.v2.f32 [r19+1536], {f1789, f1790}; +fma.rn.f32 f1791, f1581, f1494, f1584; +sub.f32 f1792, f2468, f1583; +st.shared.v2.f32 [r19+1792], {f1792, f1791}; +fma.rn.f32 f1793, f1589, f1498, f1592; +sub.f32 f1794, f2466, f1591; +st.shared.v2.f32 [r19+2048], {f1794, f1793}; +fma.rn.f32 f1795, f1597, f1502, f1600; +sub.f32 f1796, f2463, f1599; +st.shared.v2.f32 [r19+2304], {f1796, f1795}; +fma.rn.f32 f1797, f1605, f1506, f1608; +sub.f32 f1798, f2461, f1607; 
+st.shared.v2.f32 [r19+2560], {f1798, f1797}; +fma.rn.f32 f1799, f1613, f1510, f1616; +sub.f32 f1800, f2459, f1615; +st.shared.v2.f32 [r19+2816], {f1800, f1799}; +fma.rn.f32 f1801, f1621, f1514, f1624; +sub.f32 f1802, f2456, f1623; +st.shared.v2.f32 [r19+3072], {f1802, f1801}; +fma.rn.f32 f1803, f1629, f1518, f1632; +sub.f32 f1804, f2454, f1631; +st.shared.v2.f32 [r19+3328], {f1804, f1803}; +fma.rn.f32 f1805, f1637, f1522, f1640; +sub.f32 f1806, f2452, f1639; +st.shared.v2.f32 [r19+3584], {f1806, f1805}; +fma.rn.f32 f1807, f1645, f1526, f1648; +sub.f32 f1808, f2449, f1647; +st.shared.v2.f32 [r19+3840], {f1808, f1807}; +fma.rn.f32 f1809, f1653, f2818, f1656; +sub.f32 f1810, f2445, f2815; +st.shared.v2.f32 [r19+4096], {f1810, f1809}; +fma.rn.f32 f1811, f1661, f1472, f1664; +sub.f32 f1812, f2442, f1663; +st.shared.v2.f32 [r19+4352], {f1812, f1811}; +fma.rn.f32 f1813, f1669, f1476, f1672; +sub.f32 f1814, f2440, f1671; +st.shared.v2.f32 [r19+4608], {f1814, f1813}; +fma.rn.f32 f1815, f1677, f1480, f1680; +sub.f32 f1816, f2438, f1679; +st.shared.v2.f32 [r19+4864], {f1816, f1815}; +fma.rn.f32 f1817, f1685, f1484, f1688; +sub.f32 f1818, f2435, f1687; +st.shared.v2.f32 [r19+5120], {f1818, f1817}; +fma.rn.f32 f1819, f1693, f1488, f1696; +sub.f32 f1820, f2433, f1695; +st.shared.v2.f32 [r19+5376], {f1820, f1819}; +fma.rn.f32 f1821, f1701, f1492, f1704; +sub.f32 f1822, f2431, f1703; +st.shared.v2.f32 [r19+5632], {f1822, f1821}; +fma.rn.f32 f1823, f1709, f1496, f1712; +sub.f32 f1824, f2428, f1711; +st.shared.v2.f32 [r19+5888], {f1824, f1823}; +fma.rn.f32 f1825, f1717, f1500, f1720; +sub.f32 f1826, f2426, f1719; +st.shared.v2.f32 [r19+6144], {f1826, f1825}; +fma.rn.f32 f1827, f1725, f1504, f1728; +sub.f32 f1828, f2423, f1727; +st.shared.v2.f32 [r19+6400], {f1828, f1827}; +fma.rn.f32 f1829, f1733, f1508, f1736; +sub.f32 f1830, f2421, f1735; +st.shared.v2.f32 [r19+6656], {f1830, f1829}; +fma.rn.f32 f1831, f1741, f1512, f1744; +sub.f32 f1832, f2419, f1743; +st.shared.v2.f32 
[r19+6912], {f1832, f1831}; +fma.rn.f32 f1833, f1749, f1516, f1752; +sub.f32 f1834, f2416, f1751; +st.shared.v2.f32 [r19+7168], {f1834, f1833}; +fma.rn.f32 f1835, f1757, f1520, f1760; +sub.f32 f1836, f2414, f1759; +st.shared.v2.f32 [r19+7424], {f1836, f1835}; +fma.rn.f32 f1837, f1765, f1524, f1768; +sub.f32 f1838, f2411, f1767; +st.shared.v2.f32 [r19+7680], {f1838, f1837}; +fma.rn.f32 f1839, f1773, f1528, f1776; +sub.f32 f1840, f1774, f1775; +st.shared.v2.f32 [r19+7936], {f1840, f1839}; +barrier.sync 0; +mad.lo.s32 r20, r29, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+4096]; +ld.shared.v2.f32 {f1849, f1850}, [r20+8192]; +ld.shared.v2.f32 {f1853, f1854}, [r20+12288]; +ld.shared.v2.f32 {f1857, f1858}, [r20+16384]; +ld.shared.v2.f32 {f1861, f1862}, [r20+20480]; +ld.shared.v2.f32 {f1865, f1866}, [r20+24576]; +ld.shared.v2.f32 {f1869, f1870}, [r20+28672]; +ld.shared.v2.f32 {f1873, f1874}, [r20+32768]; +ld.shared.v2.f32 {f1877, f1878}, [r20+36864]; +ld.shared.v2.f32 {f1881, f1882}, [r20+40960]; +ld.shared.v2.f32 {f1885, f1886}, [r20+45056]; +ld.shared.v2.f32 {f1889, f1890}, [r20+49152]; +ld.shared.v2.f32 {f1893, f1894}, [r20+53248]; +ld.shared.v2.f32 {f1897, f1898}, [r20+57344]; +ld.shared.v2.f32 {f1901, f1902}, [r20+61440]; +ld.shared.v2.f32 {f1905, f1906}, [r20+65536]; +ld.shared.v2.f32 {f1909, f1910}, [r20+69632]; +ld.shared.v2.f32 {f1913, f1914}, [r20+73728]; +ld.shared.v2.f32 {f1917, f1918}, [r20+77824]; +ld.shared.v2.f32 {f1921, f1922}, [r20+81920]; +ld.shared.v2.f32 {f1925, f1926}, [r20+86016]; +ld.shared.v2.f32 {f1929, f1930}, [r20+90112]; +ld.shared.v2.f32 {f1933, f1934}, [r20+94208]; +ld.shared.v2.f32 {f1937, f1938}, [r20+98304]; +ld.shared.v2.f32 {f1941, f1942}, [r20+102400]; +ld.shared.v2.f32 {f1945, f1946}, [r20+106496]; +ld.shared.v2.f32 {f1949, f1950}, [r20+110592]; +ld.shared.v2.f32 {f1953, f1954}, [r20+114688]; +ld.shared.v2.f32 {f1957, f1958}, [r20+118784]; +ld.shared.v2.f32 {f1961, f1962}, [r20+122880]; 
+ld.shared.v2.f32 {f1965, f1966}, [r20+126976]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2410, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2409, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1969, f1973; +sub.f32 f1979, f1969, f1973; +add.f32 f2408, f2410, f2409; +sub.f32 f1980, f2410, f2409; +add.f32 f1981, f1971, f1976; +sub.f32 f1983, f1971, f1976; +sub.f32 f2407, f1972, f1975; +add.f32 f1984, f1972, f1975; +add.f32 f1985, f1857, f1921; +sub.f32 f1987, f1857, f1921; +add.f32 f2406, f1858, f1922; +sub.f32 f1988, f1858, f1922; +add.f32 f1989, f1889, f1953; +sub.f32 f1991, f1889, f1953; +add.f32 f2405, f1890, f1954; +sub.f32 f1992, f1890, f1954; +add.f32 f1993, f1985, f1989; +sub.f32 f1995, f1985, f1989; +add.f32 f2404, f2406, f2405; +sub.f32 f1996, f2406, f2405; +add.f32 f1997, f1987, f1992; +sub.f32 f1999, f1987, f1992; +sub.f32 f2403, f1988, f1991; +add.f32 f2000, f1988, f1991; +mul.f32 f2002, f2403, 0fBF3504F3; +mul.f32 f2402, f1997, 0f3F3504F3; +sub.f32 f2003, f2402, f2002; +mul.f32 f2004, f2403, 0f3F3504F3; +fma.rn.f32 f2005, f1997, 0fBF3504F3, f2004; +mul.f32 f2006, f1999, 0fBF3504F3; +mul.f32 f2007, f2000, 0fBF3504F3; +sub.f32 f2008, f2006, f2007; +add.f32 f2009, f2006, f2007; +add.f32 f2010, f1977, f1993; +sub.f32 f2012, f1977, f1993; +add.f32 f2401, f2408, f2404; +sub.f32 f2013, f2408, f2404; +add.f32 f2014, f1981, f2003; +sub.f32 f2016, f1981, f2003; +add.f32 f2400, f2407, f2005; +sub.f32 f2017, f2407, f2005; +add.f32 f2018, f1979, f1996; +sub.f32 f2020, f1979, f1996; +sub.f32 f2399, f1980, f1995; +add.f32 f2021, f1980, f1995; +add.f32 f2022, f1983, f2008; +sub.f32 f2024, f1983, f2008; +add.f32 f2398, f1984, f2009; +sub.f32 f2025, f1984, f2009; +add.f32 f2026, f1849, f1913; +sub.f32 f2028, f1849, f1913; +add.f32 f2397, f1850, f1914; +sub.f32 f2029, f1850, f1914; +add.f32 f2030, f1881, f1945; +sub.f32 f2032, f1881, f1945; +add.f32 f2396, 
f1882, f1946; +sub.f32 f2033, f1882, f1946; +add.f32 f2034, f2026, f2030; +sub.f32 f2036, f2026, f2030; +add.f32 f2395, f2397, f2396; +sub.f32 f2037, f2397, f2396; +add.f32 f2038, f2028, f2033; +sub.f32 f2040, f2028, f2033; +sub.f32 f2394, f2029, f2032; +add.f32 f2041, f2029, f2032; +add.f32 f2042, f1865, f1929; +sub.f32 f2044, f1865, f1929; +add.f32 f2393, f1866, f1930; +sub.f32 f2045, f1866, f1930; +add.f32 f2046, f1897, f1961; +sub.f32 f2048, f1897, f1961; +add.f32 f2392, f1898, f1962; +sub.f32 f2049, f1898, f1962; +add.f32 f2050, f2042, f2046; +sub.f32 f2052, f2042, f2046; +add.f32 f2391, f2393, f2392; +sub.f32 f2053, f2393, f2392; +add.f32 f2054, f2044, f2049; +sub.f32 f2056, f2044, f2049; +sub.f32 f2390, f2045, f2048; +add.f32 f2057, f2045, f2048; +mul.f32 f2388, f2054, 0f3F3504F3; +mul.f32 f2389, f2390, 0fBF3504F3; +sub.f32 f2060, f2388, f2389; +mul.f32 f2061, f2390, 0f3F3504F3; +fma.rn.f32 f2062, f2054, 0fBF3504F3, f2061; +mul.f32 f2063, f2056, 0fBF3504F3; +mul.f32 f2064, f2057, 0fBF3504F3; +sub.f32 f2065, f2063, f2064; +add.f32 f2066, f2063, f2064; +add.f32 f2067, f2034, f2050; +sub.f32 f2069, f2034, f2050; +add.f32 f2387, f2395, f2391; +sub.f32 f2070, f2395, f2391; +add.f32 f2071, f2038, f2060; +sub.f32 f2073, f2038, f2060; +add.f32 f2386, f2394, f2062; +sub.f32 f2074, f2394, f2062; +add.f32 f2075, f2036, f2053; +sub.f32 f2077, f2036, f2053; +sub.f32 f2385, f2037, f2052; +add.f32 f2078, f2037, f2052; +add.f32 f2079, f2040, f2065; +sub.f32 f2081, f2040, f2065; +add.f32 f2384, f2041, f2066; +sub.f32 f2082, f2041, f2066; +mul.f32 f2084, f2386, 0fBEC3EF15; +mul.f32 f2383, f2071, 0f3F6C835E; +sub.f32 f2085, f2383, f2084; +mul.f32 f2086, f2386, 0f3F6C835E; +fma.rn.f32 f2087, f2071, 0fBEC3EF15, f2086; +mul.f32 f2089, f2385, 0fBF3504F3; +mul.f32 f2382, f2075, 0f3F3504F3; +sub.f32 f2090, f2382, f2089; +mul.f32 f2091, f2385, 0f3F3504F3; +fma.rn.f32 f2092, f2075, 0fBF3504F3, f2091; +mul.f32 f2094, f2384, 0fBF6C835E; +mul.f32 f2381, f2079, 0f3EC3EF15; +sub.f32 f2095, 
f2381, f2094; +mul.f32 f2096, f2384, 0f3EC3EF15; +fma.rn.f32 f2097, f2079, 0fBF6C835E, f2096; +mul.f32 f2099, f2074, 0fBF6C835E; +mul.f32 f2380, f2073, 0fBEC3EF15; +sub.f32 f2100, f2380, f2099; +mul.f32 f2101, f2074, 0fBEC3EF15; +fma.rn.f32 f2102, f2073, 0fBF6C835E, f2101; +mul.f32 f2103, f2077, 0fBF3504F3; +mul.f32 f2104, f2078, 0fBF3504F3; +sub.f32 f2105, f2103, f2104; +add.f32 f2106, f2103, f2104; +mul.f32 f2108, f2082, 0fBEC3EF15; +mul.f32 f2379, f2081, 0fBF6C835E; +sub.f32 f2109, f2379, f2108; +mul.f32 f2110, f2082, 0fBF6C835E; +fma.rn.f32 f2111, f2081, 0fBEC3EF15, f2110; +add.f32 f2112, f1845, f1909; +sub.f32 f2114, f1845, f1909; +add.f32 f2378, f1846, f1910; +sub.f32 f2115, f1846, f1910; +add.f32 f2116, f1877, f1941; +sub.f32 f2118, f1877, f1941; +add.f32 f2377, f1878, f1942; +sub.f32 f2119, f1878, f1942; +add.f32 f2120, f2112, f2116; +sub.f32 f2122, f2112, f2116; +add.f32 f2376, f2378, f2377; +sub.f32 f2123, f2378, f2377; +add.f32 f2124, f2114, f2119; +sub.f32 f2126, f2114, f2119; +sub.f32 f2375, f2115, f2118; +add.f32 f2127, f2115, f2118; +add.f32 f2128, f1861, f1925; +sub.f32 f2130, f1861, f1925; +add.f32 f2374, f1862, f1926; +sub.f32 f2131, f1862, f1926; +add.f32 f2132, f1893, f1957; +sub.f32 f2134, f1893, f1957; +add.f32 f2373, f1894, f1958; +sub.f32 f2135, f1894, f1958; +add.f32 f2136, f2128, f2132; +sub.f32 f2138, f2128, f2132; +add.f32 f2372, f2374, f2373; +sub.f32 f2139, f2374, f2373; +add.f32 f2140, f2130, f2135; +sub.f32 f2142, f2130, f2135; +sub.f32 f2371, f2131, f2134; +add.f32 f2143, f2131, f2134; +mul.f32 f2369, f2140, 0f3F3504F3; +mul.f32 f2370, f2371, 0fBF3504F3; +sub.f32 f2146, f2369, f2370; +mul.f32 f2147, f2371, 0f3F3504F3; +fma.rn.f32 f2148, f2140, 0fBF3504F3, f2147; +mul.f32 f2149, f2142, 0fBF3504F3; +mul.f32 f2150, f2143, 0fBF3504F3; +sub.f32 f2151, f2149, f2150; +add.f32 f2152, f2149, f2150; +add.f32 f2153, f2120, f2136; +sub.f32 f2155, f2120, f2136; +add.f32 f2368, f2376, f2372; +sub.f32 f2156, f2376, f2372; +add.f32 f2157, f2124, 
f2146; +sub.f32 f2159, f2124, f2146; +add.f32 f2367, f2375, f2148; +sub.f32 f2160, f2375, f2148; +add.f32 f2161, f2122, f2139; +sub.f32 f2163, f2122, f2139; +sub.f32 f2366, f2123, f2138; +add.f32 f2164, f2123, f2138; +add.f32 f2165, f2126, f2151; +sub.f32 f2167, f2126, f2151; +add.f32 f2365, f2127, f2152; +sub.f32 f2168, f2127, f2152; +add.f32 f2169, f1853, f1917; +sub.f32 f2171, f1853, f1917; +add.f32 f2364, f1854, f1918; +sub.f32 f2172, f1854, f1918; +add.f32 f2173, f1885, f1949; +sub.f32 f2175, f1885, f1949; +add.f32 f2363, f1886, f1950; +sub.f32 f2176, f1886, f1950; +add.f32 f2177, f2169, f2173; +sub.f32 f2179, f2169, f2173; +add.f32 f2362, f2364, f2363; +sub.f32 f2180, f2364, f2363; +add.f32 f2181, f2171, f2176; +sub.f32 f2183, f2171, f2176; +sub.f32 f2361, f2172, f2175; +add.f32 f2184, f2172, f2175; +add.f32 f2185, f1869, f1933; +sub.f32 f2187, f1869, f1933; +add.f32 f2360, f1870, f1934; +sub.f32 f2188, f1870, f1934; +add.f32 f2189, f1901, f1965; +sub.f32 f2191, f1901, f1965; +add.f32 f2359, f1902, f1966; +sub.f32 f2192, f1902, f1966; +add.f32 f2193, f2185, f2189; +sub.f32 f2195, f2185, f2189; +add.f32 f2358, f2360, f2359; +sub.f32 f2196, f2360, f2359; +add.f32 f2197, f2187, f2192; +sub.f32 f2199, f2187, f2192; +sub.f32 f2357, f2188, f2191; +add.f32 f2200, f2188, f2191; +mul.f32 f2355, f2197, 0f3F3504F3; +mul.f32 f2356, f2357, 0fBF3504F3; +sub.f32 f2203, f2355, f2356; +mul.f32 f2204, f2357, 0f3F3504F3; +fma.rn.f32 f2205, f2197, 0fBF3504F3, f2204; +mul.f32 f2206, f2199, 0fBF3504F3; +mul.f32 f2207, f2200, 0fBF3504F3; +sub.f32 f2208, f2206, f2207; +add.f32 f2209, f2206, f2207; +add.f32 f2210, f2177, f2193; +sub.f32 f2212, f2177, f2193; +add.f32 f2354, f2362, f2358; +sub.f32 f2213, f2362, f2358; +add.f32 f2214, f2181, f2203; +sub.f32 f2216, f2181, f2203; +add.f32 f2353, f2361, f2205; +sub.f32 f2217, f2361, f2205; +add.f32 f2218, f2179, f2196; +sub.f32 f2220, f2179, f2196; +sub.f32 f2352, f2180, f2195; +add.f32 f2221, f2180, f2195; +add.f32 f2222, f2183, f2208; 
+sub.f32 f2224, f2183, f2208; +add.f32 f2351, f2184, f2209; +sub.f32 f2225, f2184, f2209; +mul.f32 f2227, f2353, 0fBEC3EF15; +mul.f32 f2350, f2214, 0f3F6C835E; +sub.f32 f2228, f2350, f2227; +mul.f32 f2229, f2353, 0f3F6C835E; +fma.rn.f32 f2230, f2214, 0fBEC3EF15, f2229; +mul.f32 f2348, f2218, 0f3F3504F3; +mul.f32 f2349, f2352, 0fBF3504F3; +sub.f32 f2233, f2348, f2349; +mul.f32 f2234, f2352, 0f3F3504F3; +fma.rn.f32 f2235, f2218, 0fBF3504F3, f2234; +mul.f32 f2346, f2222, 0f3EC3EF15; +mul.f32 f2347, f2351, 0fBF6C835E; +sub.f32 f2238, f2346, f2347; +mul.f32 f2239, f2351, 0f3EC3EF15; +fma.rn.f32 f2240, f2222, 0fBF6C835E, f2239; +mul.f32 f2344, f2216, 0fBEC3EF15; +mul.f32 f2345, f2217, 0fBF6C835E; +sub.f32 f2243, f2344, f2345; +mul.f32 f2244, f2217, 0fBEC3EF15; +fma.rn.f32 f2245, f2216, 0fBF6C835E, f2244; +mul.f32 f2246, f2220, 0fBF3504F3; +mul.f32 f2247, f2221, 0fBF3504F3; +sub.f32 f2248, f2246, f2247; +add.f32 f2249, f2246, f2247; +mul.f32 f2342, f2224, 0fBF6C835E; +mul.f32 f2343, f2225, 0fBEC3EF15; +sub.f32 f2252, f2342, f2343; +mul.f32 f2253, f2225, 0fBF6C835E; +fma.rn.f32 f2254, f2224, 0fBEC3EF15, f2253; +add.f32 %1, f2401, f2387; +add.f32 %0, f2010, f2067; +add.f32 %2, f2153, f2210; +add.f32 %3, f2368, f2354; +add.f32 %4, f2014, f2085; +add.f32 %5, f2400, f2087; +add.f32 %6, f2157, f2228; +add.f32 %7, f2367, f2230; +add.f32 %8, f2018, f2090; +add.f32 %9, f2399, f2092; +add.f32 %11, f2366, f2235; +add.f32 %10, f2161, f2233; +add.f32 %13, f2398, f2097; +add.f32 %12, f2022, f2095; +add.f32 %15, f2365, f2240; +add.f32 %14, f2165, f2238; +add.f32 %16, f2012, f2070; +sub.f32 %17, f2013, f2069; +add.f32 %18, f2155, f2213; +sub.f32 %19, f2156, f2212; +add.f32 %20, f2016, f2100; +add.f32 %21, f2017, f2102; +add.f32 %23, f2160, f2245; +add.f32 %22, f2159, f2243; +add.f32 %25, f2021, f2106; +add.f32 %24, f2020, f2105; +add.f32 %27, f2164, f2249; +add.f32 %26, f2163, f2248; +add.f32 %28, f2024, f2109; +add.f32 %29, f2025, f2111; +add.f32 %30, f2167, f2252; +add.f32 %31, f2168, 
f2254; +sub.f32 %33, f2401, f2387; +sub.f32 %32, f2010, f2067; +sub.f32 %35, f2368, f2354; +sub.f32 %34, f2153, f2210; +sub.f32 %37, f2400, f2087; +sub.f32 %36, f2014, f2085; +sub.f32 %39, f2367, f2230; +sub.f32 %38, f2157, f2228; +sub.f32 %41, f2399, f2092; +sub.f32 %40, f2018, f2090; +sub.f32 %43, f2366, f2235; +sub.f32 %42, f2161, f2233; +sub.f32 %45, f2398, f2097; +sub.f32 %44, f2022, f2095; +sub.f32 %47, f2365, f2240; +sub.f32 %46, f2165, f2238; +add.f32 %49, f2013, f2069; +sub.f32 %48, f2012, f2070; +add.f32 %51, f2156, f2212; +sub.f32 %50, f2155, f2213; +sub.f32 %53, f2017, f2102; +sub.f32 %52, f2016, f2100; +sub.f32 %55, f2160, f2245; +sub.f32 %54, f2159, f2243; +sub.f32 %57, f2021, f2106; +sub.f32 %56, f2020, f2105; +sub.f32 %59, f2164, f2249; +sub.f32 %58, f2163, f2248; +sub.f32 %61, f2025, f2111; +sub.f32 %60, f2024, f2109; +sub.f32 %63, f2168, f2254; +sub.f32 %62, f2167, f2252; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), 
"=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_16384), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<115, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1197>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %36, %57; +add.f32 f66, %37, %59; +sub.f32 f67, %36, %57; +sub.f32 f68, %37, %59; +add.f32 f69, %46, %68; +add.f32 f70, %48, %69; +sub.f32 f71, %46, 
%68; +sub.f32 f72, %48, %69; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %41, %62; +add.f32 f82, %43, %64; +sub.f32 f83, %41, %62; +sub.f32 f84, %43, %64; +add.f32 f85, %52, %73; +add.f32 f86, %53, %75; +sub.f32 f87, %52, %73; +sub.f32 f88, %53, %75; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %60; +add.f32 f123, %40, %61; +sub.f32 f124, %38, %60; +sub.f32 f125, %40, %61; +add.f32 f126, %49, %70; +add.f32 f127, %51, %72; +sub.f32 f128, %49, %70; +sub.f32 f129, %51, %72; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %44, %65; +add.f32 f139, %45, %67; +sub.f32 f140, %44, %65; +sub.f32 f141, %45, %67; +add.f32 f142, %54, %76; +add.f32 f143, %56, %77; +sub.f32 f144, %54, %76; +sub.f32 f145, %56, %77; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, 
f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 
f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 
f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, 
f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65472; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+4096]; +ld.shared.f32 f391, [r13+8192]; +ld.shared.f32 f392, [r13+12288]; +ld.shared.f32 f393, [r13+16384]; +ld.shared.f32 f394, [r13+20480]; +ld.shared.f32 f395, [r13+24576]; +ld.shared.f32 f396, [r13+28672]; +ld.shared.f32 f397, [r13+32768]; +ld.shared.f32 f398, [r13+36864]; +ld.shared.f32 f399, [r13+40960]; +ld.shared.f32 f400, [r13+45056]; +ld.shared.f32 f401, [r13+49152]; +ld.shared.f32 f402, [r13+53248]; +ld.shared.f32 f403, [r13+57344]; +ld.shared.f32 f404, [r13+61440]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, 
f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+4096]; +ld.shared.f32 f407, [r13+8192]; +ld.shared.f32 f408, [r13+12288]; +ld.shared.f32 f409, [r13+16384]; +ld.shared.f32 f410, [r13+20480]; +ld.shared.f32 f411, [r13+24576]; +ld.shared.f32 f412, [r13+28672]; +ld.shared.f32 f413, [r13+32768]; +ld.shared.f32 f414, [r13+36864]; +ld.shared.f32 f415, [r13+40960]; +ld.shared.f32 f416, [r13+45056]; +ld.shared.f32 f417, [r13+49152]; +ld.shared.f32 f418, [r13+53248]; +ld.shared.f32 f419, [r13+57344]; +ld.shared.f32 f420, [r13+61440]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; 
+add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, 
f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; +add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; +add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 1008; +bfe.u32 r15, r5, 4, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, 
f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; 
+mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; +mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 
f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 64512; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+4096]; +ld.shared.f32 f747, [r21+8192]; +ld.shared.f32 f748, [r21+12288]; +ld.shared.f32 f749, [r21+16384]; +ld.shared.f32 f750, [r21+20480]; +ld.shared.f32 f751, [r21+24576]; +ld.shared.f32 f752, [r21+28672]; +ld.shared.f32 f753, [r21+32768]; +ld.shared.f32 f754, [r21+36864]; +ld.shared.f32 f755, [r21+40960]; +ld.shared.f32 f756, [r21+45056]; +ld.shared.f32 f757, [r21+49152]; +ld.shared.f32 f758, [r21+53248]; +ld.shared.f32 f759, [r21+57344]; +ld.shared.f32 f760, [r21+61440]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+4096]; +ld.shared.f32 f763, [r21+8192]; +ld.shared.f32 f764, [r21+12288]; +ld.shared.f32 f765, 
[r21+16384]; +ld.shared.f32 f766, [r21+20480]; +ld.shared.f32 f767, [r21+24576]; +ld.shared.f32 f768, [r21+28672]; +ld.shared.f32 f769, [r21+32768]; +ld.shared.f32 f770, [r21+36864]; +ld.shared.f32 f771, [r21+40960]; +ld.shared.f32 f772, [r21+45056]; +ld.shared.f32 f773, [r21+49152]; +ld.shared.f32 f774, [r21+53248]; +ld.shared.f32 f775, [r21+57344]; +ld.shared.f32 f776, [r21+61440]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +add.f32 f789, f779, f784; +sub.f32 f790, f780, f783; +sub.f32 f791, f779, f784; +add.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +add.f32 f805, f795, f800; +sub.f32 f806, f796, f799; +sub.f32 f807, f795, f800; +add.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0fBF3504F3; +sub.f32 f811, f809, f810; +mul.f32 f812, f806, 0f3F3504F3; +fma.rn.f32 f813, f805, 0fBF3504F3, f812; +mul.f32 f814, f807, 0fBF3504F3; +mul.f32 f815, f808, 0fBF3504F3; +sub.f32 f816, f814, f815; +add.f32 f817, f814, f815; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; +add.f32 f823, f790, f813; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f813; +add.f32 f826, f787, f804; +sub.f32 f827, f788, f803; +sub.f32 f828, f787, f804; +add.f32 f829, f788, f803; +add.f32 f830, f791, f816; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f816; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; 
+add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +add.f32 f846, f836, f841; +sub.f32 f847, f837, f840; +sub.f32 f848, f836, f841; +add.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +add.f32 f862, f852, f857; +sub.f32 f863, f853, f856; +sub.f32 f864, f852, f857; +add.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0fBF3504F3; +sub.f32 f868, f866, f867; +mul.f32 f869, f863, 0f3F3504F3; +fma.rn.f32 f870, f862, 0fBF3504F3, f869; +mul.f32 f871, f864, 0fBF3504F3; +mul.f32 f872, f865, 0fBF3504F3; +sub.f32 f873, f871, f872; +add.f32 f874, f871, f872; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, f846, f868; +add.f32 f880, f847, f870; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f870; +add.f32 f883, f844, f861; +sub.f32 f884, f845, f860; +sub.f32 f885, f844, f861; +add.f32 f886, f845, f860; +add.f32 f887, f848, f873; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f873; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0fBEC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 0f3F6C835E; +fma.rn.f32 f895, f879, 0fBEC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0fBF3504F3; +sub.f32 f898, f896, f897; +mul.f32 f899, f884, 0f3F3504F3; +fma.rn.f32 f900, f883, 0fBF3504F3, f899; +mul.f32 f901, f887, 0f3EC3EF15; +mul.f32 f902, f888, 0fBF6C835E; +sub.f32 f903, f901, f902; +mul.f32 
f904, f888, 0f3EC3EF15; +fma.rn.f32 f905, f887, 0fBF6C835E, f904; +mul.f32 f906, f881, 0fBEC3EF15; +mul.f32 f907, f882, 0fBF6C835E; +sub.f32 f908, f906, f907; +mul.f32 f909, f882, 0fBEC3EF15; +fma.rn.f32 f910, f881, 0fBF6C835E, f909; +mul.f32 f911, f885, 0fBF3504F3; +mul.f32 f912, f886, 0fBF3504F3; +sub.f32 f913, f911, f912; +add.f32 f914, f911, f912; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0fBEC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0fBEC3EF15, f918; +add.f32 f920, f818, f875; +add.f32 f921, f819, f876; +sub.f32 f922, f818, f875; +sub.f32 f923, f819, f876; +add.f32 f924, f822, f893; +add.f32 f925, f823, f895; +sub.f32 f926, f822, f893; +sub.f32 f927, f823, f895; +add.f32 f928, f826, f898; +add.f32 f929, f827, f900; +sub.f32 f930, f826, f898; +sub.f32 f931, f827, f900; +add.f32 f932, f830, f903; +add.f32 f933, f831, f905; +sub.f32 f934, f830, f903; +sub.f32 f935, f831, f905; +add.f32 f936, f820, f878; +sub.f32 f937, f821, f877; +sub.f32 f938, f820, f878; +add.f32 f939, f821, f877; +add.f32 f940, f824, f908; +add.f32 f941, f825, f910; +sub.f32 f942, f824, f908; +sub.f32 f943, f825, f910; +add.f32 f944, f828, f913; +add.f32 f945, f829, f914; +sub.f32 f946, f828, f913; +sub.f32 f947, f829, f914; +add.f32 f948, f832, f917; +add.f32 f949, f833, f919; +sub.f32 f950, f832, f917; +sub.f32 f951, f833, f919; +and.b32 r22, r5, 768; +bfe.u32 r23, r5, 8, 2; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f952, f953}, [rd11]; +mul.f32 f956, f952, f924; +mul.f32 f957, f953, f925; +sub.f32 f958, f956, f957; +mul.f32 f959, f952, f925; +fma.rn.f32 f960, f953, f924, f959; +mul.f32 f961, f952, f952; +mul.f32 f962, f953, f953; +sub.f32 f963, f961, f962; +mul.f32 f964, f953, f952; +fma.rn.f32 f965, f953, f952, f964; +mul.f32 f966, f963, f928; +mul.f32 f967, f965, f929; +sub.f32 f968, f966, f967; +mul.f32 f969, f963, f929; +fma.rn.f32 f970, f965, f928, f969; +mul.f32 f971, f952, 
f963; +mul.f32 f972, f953, f965; +sub.f32 f973, f971, f972; +mul.f32 f974, f952, f965; +fma.rn.f32 f975, f953, f963, f974; +mul.f32 f976, f973, f932; +mul.f32 f977, f975, f933; +sub.f32 f978, f976, f977; +mul.f32 f979, f973, f933; +fma.rn.f32 f980, f975, f932, f979; +mul.f32 f981, f952, f973; +mul.f32 f982, f953, f975; +sub.f32 f983, f981, f982; +mul.f32 f984, f952, f975; +fma.rn.f32 f985, f953, f973, f984; +mul.f32 f986, f983, f936; +mul.f32 f987, f985, f937; +sub.f32 f988, f986, f987; +mul.f32 f989, f983, f937; +fma.rn.f32 f990, f985, f936, f989; +mul.f32 f991, f952, f983; +mul.f32 f992, f953, f985; +sub.f32 f993, f991, f992; +mul.f32 f994, f952, f985; +fma.rn.f32 f995, f953, f983, f994; +mul.f32 f996, f993, f940; +mul.f32 f997, f995, f941; +sub.f32 f998, f996, f997; +mul.f32 f999, f993, f941; +fma.rn.f32 f1000, f995, f940, f999; +mul.f32 f1001, f952, f993; +mul.f32 f1002, f953, f995; +sub.f32 f1003, f1001, f1002; +mul.f32 f1004, f952, f995; +fma.rn.f32 f1005, f953, f993, f1004; +mul.f32 f1006, f1003, f944; +mul.f32 f1007, f1005, f945; +sub.f32 f1008, f1006, f1007; +mul.f32 f1009, f1003, f945; +fma.rn.f32 f1010, f1005, f944, f1009; +mul.f32 f1011, f952, f1003; +mul.f32 f1012, f953, f1005; +sub.f32 f1013, f1011, f1012; +mul.f32 f1014, f952, f1005; +fma.rn.f32 f1015, f953, f1003, f1014; +mul.f32 f1016, f1013, f948; +mul.f32 f1017, f1015, f949; +sub.f32 f1018, f1016, f1017; +mul.f32 f1019, f1013, f949; +fma.rn.f32 f1020, f1015, f948, f1019; +mul.f32 f1021, f952, f1013; +mul.f32 f1022, f953, f1015; +sub.f32 f1023, f1021, f1022; +mul.f32 f1024, f952, f1015; +fma.rn.f32 f1025, f953, f1013, f1024; +mul.f32 f1026, f1023, f922; +mul.f32 f1027, f1025, f923; +sub.f32 f1028, f1026, f1027; +mul.f32 f1029, f1023, f923; +fma.rn.f32 f1030, f1025, f922, f1029; +mul.f32 f1031, f952, f1023; +mul.f32 f1032, f953, f1025; +sub.f32 f1033, f1031, f1032; +mul.f32 f1034, f952, f1025; +fma.rn.f32 f1035, f953, f1023, f1034; +mul.f32 f1036, f1033, f926; +mul.f32 f1037, f1035, f927; +sub.f32 
f1038, f1036, f1037; +mul.f32 f1039, f1033, f927; +fma.rn.f32 f1040, f1035, f926, f1039; +mul.f32 f1041, f952, f1033; +mul.f32 f1042, f953, f1035; +sub.f32 f1043, f1041, f1042; +mul.f32 f1044, f952, f1035; +fma.rn.f32 f1045, f953, f1033, f1044; +mul.f32 f1046, f1043, f930; +mul.f32 f1047, f1045, f931; +sub.f32 f1048, f1046, f1047; +mul.f32 f1049, f1043, f931; +fma.rn.f32 f1050, f1045, f930, f1049; +mul.f32 f1051, f952, f1043; +mul.f32 f1052, f953, f1045; +sub.f32 f1053, f1051, f1052; +mul.f32 f1054, f952, f1045; +fma.rn.f32 f1055, f953, f1043, f1054; +mul.f32 f1056, f1053, f934; +mul.f32 f1057, f1055, f935; +sub.f32 f1058, f1056, f1057; +mul.f32 f1059, f1053, f935; +fma.rn.f32 f1060, f1055, f934, f1059; +mul.f32 f1061, f952, f1053; +mul.f32 f1062, f953, f1055; +sub.f32 f1063, f1061, f1062; +mul.f32 f1064, f952, f1055; +fma.rn.f32 f1065, f953, f1053, f1064; +mul.f32 f1066, f1063, f938; +mul.f32 f1067, f1065, f939; +sub.f32 f1068, f1066, f1067; +mul.f32 f1069, f1063, f939; +fma.rn.f32 f1070, f1065, f938, f1069; +mul.f32 f1071, f952, f1063; +mul.f32 f1072, f953, f1065; +sub.f32 f1073, f1071, f1072; +mul.f32 f1074, f952, f1065; +fma.rn.f32 f1075, f953, f1063, f1074; +mul.f32 f1076, f1073, f942; +mul.f32 f1077, f1075, f943; +sub.f32 f1078, f1076, f1077; +mul.f32 f1079, f1073, f943; +fma.rn.f32 f1080, f1075, f942, f1079; +mul.f32 f1081, f952, f1073; +mul.f32 f1082, f953, f1075; +sub.f32 f1083, f1081, f1082; +mul.f32 f1084, f952, f1075; +fma.rn.f32 f1085, f953, f1073, f1084; +mul.f32 f1086, f1083, f946; +mul.f32 f1087, f1085, f947; +sub.f32 f1088, f1086, f1087; +mul.f32 f1089, f1083, f947; +fma.rn.f32 f1090, f1085, f946, f1089; +mul.f32 f1091, f952, f1083; +mul.f32 f1092, f953, f1085; +sub.f32 f1093, f1091, f1092; +mul.f32 f1094, f952, f1085; +fma.rn.f32 f1095, f953, f1083, f1094; +mul.f32 f1096, f1093, f950; +mul.f32 f1097, f1095, f951; +sub.f32 f1098, f1096, f1097; +mul.f32 f1099, f1093, f951; +fma.rn.f32 f1100, f1095, f950, f1099; +and.b32 r24, r16, 1020; +add.s32 r25, 
r10, r24; +barrier.sync 0; +and.b32 r26, r8, 49152; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f920; +st.shared.f32 [r27+1024], f958; +st.shared.f32 [r27+2048], f968; +st.shared.f32 [r27+3072], f978; +st.shared.f32 [r27+4096], f988; +st.shared.f32 [r27+5120], f998; +st.shared.f32 [r27+6144], f1008; +st.shared.f32 [r27+7168], f1018; +st.shared.f32 [r27+8192], f1028; +st.shared.f32 [r27+9216], f1038; +st.shared.f32 [r27+10240], f1048; +st.shared.f32 [r27+11264], f1058; +st.shared.f32 [r27+12288], f1068; +st.shared.f32 [r27+13312], f1078; +st.shared.f32 [r27+14336], f1088; +st.shared.f32 [r27+15360], f1098; +barrier.sync 0; +mad.lo.s32 r28, r22, -60, r27; +ld.shared.f32 f1101, [r28]; +ld.shared.f32 f1102, [r28+4096]; +ld.shared.f32 f1103, [r28+8192]; +ld.shared.f32 f1104, [r28+12288]; +ld.shared.f32 f1105, [r28+16384]; +ld.shared.f32 f1106, [r28+20480]; +ld.shared.f32 f1107, [r28+24576]; +ld.shared.f32 f1108, [r28+28672]; +ld.shared.f32 f1109, [r28+32768]; +ld.shared.f32 f1110, [r28+36864]; +ld.shared.f32 f1111, [r28+40960]; +ld.shared.f32 f1112, [r28+45056]; +ld.shared.f32 f1113, [r28+49152]; +ld.shared.f32 f1114, [r28+53248]; +ld.shared.f32 f1115, [r28+57344]; +ld.shared.f32 f1116, [r28+61440]; +barrier.sync 0; +st.shared.f32 [r27], f921; +st.shared.f32 [r27+1024], f960; +st.shared.f32 [r27+2048], f970; +st.shared.f32 [r27+3072], f980; +st.shared.f32 [r27+4096], f990; +st.shared.f32 [r27+5120], f1000; +st.shared.f32 [r27+6144], f1010; +st.shared.f32 [r27+7168], f1020; +st.shared.f32 [r27+8192], f1030; +st.shared.f32 [r27+9216], f1040; +st.shared.f32 [r27+10240], f1050; +st.shared.f32 [r27+11264], f1060; +st.shared.f32 [r27+12288], f1070; +st.shared.f32 [r27+13312], f1080; +st.shared.f32 [r27+14336], f1090; +st.shared.f32 [r27+15360], f1100; +barrier.sync 0; +ld.shared.f32 f1117, [r28]; +ld.shared.f32 f1118, [r28+4096]; +ld.shared.f32 f1119, [r28+8192]; +ld.shared.f32 f1120, [r28+12288]; +ld.shared.f32 f1121, [r28+16384]; +ld.shared.f32 f1122, [r28+20480]; 
+ld.shared.f32 f1123, [r28+24576]; +ld.shared.f32 f1124, [r28+28672]; +ld.shared.f32 f1125, [r28+32768]; +ld.shared.f32 f1126, [r28+36864]; +ld.shared.f32 f1127, [r28+40960]; +ld.shared.f32 f1128, [r28+45056]; +ld.shared.f32 f1129, [r28+49152]; +ld.shared.f32 f1130, [r28+53248]; +ld.shared.f32 f1131, [r28+57344]; +ld.shared.f32 f1132, [r28+61440]; +add.f32 f1133, f1101, f1109; +add.f32 f1134, f1117, f1125; +sub.f32 f1135, f1101, f1109; +sub.f32 f1136, f1117, f1125; +add.f32 f1137, f1105, f1113; +add.f32 f1138, f1121, f1129; +sub.f32 f1139, f1105, f1113; +sub.f32 f1140, f1121, f1129; +add.f32 f1141, f1102, f1110; +add.f32 f1142, f1118, f1126; +sub.f32 f1143, f1102, f1110; +sub.f32 f1144, f1118, f1126; +add.f32 f1145, f1106, f1114; +add.f32 f1146, f1122, f1130; +sub.f32 f1147, f1106, f1114; +sub.f32 f1148, f1122, f1130; +add.f32 f1149, f1103, f1111; +add.f32 f1150, f1119, f1127; +sub.f32 f1151, f1103, f1111; +sub.f32 f1152, f1119, f1127; +add.f32 f1153, f1107, f1115; +add.f32 f1154, f1123, f1131; +sub.f32 f1155, f1107, f1115; +sub.f32 f1156, f1123, f1131; +add.f32 f1157, f1104, f1112; +add.f32 f1158, f1120, f1128; +sub.f32 f1159, f1104, f1112; +sub.f32 f1160, f1120, f1128; +add.f32 f1161, f1108, f1116; +add.f32 f1162, f1124, f1132; +sub.f32 f1163, f1108, f1116; +sub.f32 f1164, f1124, f1132; +add.f32 %0, f1133, f1137; +add.f32 %1, f1134, f1138; +add.f32 %2, f1141, f1145; +add.f32 %3, f1142, f1146; +add.f32 %4, f1149, f1153; +add.f32 %5, f1150, f1154; +add.f32 %6, f1157, f1161; +add.f32 %7, f1158, f1162; +sub.f32 %9, f1136, f1139; +add.f32 %8, f1135, f1140; +sub.f32 %11, f1144, f1147; +add.f32 %10, f1143, f1148; +sub.f32 %13, f1152, f1155; +add.f32 %12, f1151, f1156; +sub.f32 %15, f1160, f1163; +add.f32 %14, f1159, f1164; +sub.f32 %16, f1133, f1137; +sub.f32 %17, f1134, f1138; +sub.f32 %18, f1141, f1145; +sub.f32 %19, f1142, f1146; +sub.f32 %20, f1149, f1153; +sub.f32 %21, f1150, f1154; +sub.f32 %22, f1157, f1161; +sub.f32 %23, f1158, f1162; +add.f32 %25, f1136, f1139; 
+sub.f32 %24, f1135, f1140; +add.f32 %27, f1144, f1147; +sub.f32 %26, f1143, f1148; +add.f32 %29, f1152, f1155; +sub.f32 %28, f1151, f1156; +add.f32 %31, f1160, f1163; +sub.f32 %30, f1159, f1164; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_16384), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1151, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1567>; +.reg .b32 r<49>; +.reg .b64 rd<16>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %36, %52; +sub.f32 f67, %36, %52; +add.f32 f1554, %37, %68; +sub.f32 f68, %37, %68; +add.f32 f69, %44, %60; +sub.f32 f71, %44, %60; +add.f32 f1552, %69, %61; 
+sub.f32 f72, %69, %61; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1551, f1554, f1552; +sub.f32 f76, f1554, f1552; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1550, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %40, %56; +sub.f32 f83, %40, %56; +add.f32 f1547, %71, %70; +sub.f32 f84, %71, %70; +add.f32 f85, %48, %64; +sub.f32 f87, %48, %64; +add.f32 f1545, %49, %72; +sub.f32 f88, %49, %72; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1544, f1547, f1545; +sub.f32 f92, f1547, f1545; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1543, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1543, 0fBF3504F3; +mul.f32 f1542, f93, 0f3F3504F3; +sub.f32 f99, f1542, f98; +mul.f32 f100, f1543, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1541, f1551, f1544; +sub.f32 f109, f1551, f1544; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1540, f1550, f101; +sub.f32 f113, f1550, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f1539, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f1538, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %54; +sub.f32 f124, %38, %54; +add.f32 f1536, %73, %55; +sub.f32 f125, %73, %55; +add.f32 f126, %46, %62; +sub.f32 f128, %46, %62; +add.f32 f1533, %74, %75; +sub.f32 f129, %74, %75; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1532, f1536, f1533; +sub.f32 f133, f1536, f1533; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1531, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %42, %58; +sub.f32 f140, %42, %58; +add.f32 f1529, %43, %76; +sub.f32 f141, %43, %76; +add.f32 f142, %50, %66; +sub.f32 f144, %50, %66; +add.f32 f1527, %77, %67; +sub.f32 f145, %77, %67; +add.f32 f146, f138, f142; +sub.f32 
f148, f138, f142; +add.f32 f1526, f1529, f1527; +sub.f32 f149, f1529, f1527; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f1525, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1525, 0fBF3504F3; +mul.f32 f1524, f150, 0f3F3504F3; +sub.f32 f156, f1524, f155; +mul.f32 f157, f1525, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1523, f1532, f1526; +sub.f32 f166, f1532, f1526; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1522, f1531, f158; +sub.f32 f170, f1531, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1521, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1520, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1518, f167, 0f3F6C835E; +mul.f32 f1519, f1522, 0fBEC3EF15; +sub.f32 f181, f1518, f1519; +mul.f32 f182, f1522, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1516, f171, 0f3F3504F3; +mul.f32 f1517, f1521, 0fBF3504F3; +sub.f32 f186, f1516, f1517; +mul.f32 f187, f1521, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1514, f175, 0f3EC3EF15; +mul.f32 f1515, f1520, 0fBF6C835E; +sub.f32 f191, f1514, f1515; +mul.f32 f192, f1520, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1512, f169, 0fBEC3EF15; +mul.f32 f1513, f170, 0fBF6C835E; +sub.f32 f196, f1512, f1513; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1510, f177, 0fBF6C835E; +mul.f32 f1511, f178, 0fBEC3EF15; +sub.f32 f205, f1510, f1511; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1509, f1540, f183; +sub.f32 
f213, f1540, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1508, f1539, f188; +sub.f32 f217, f1539, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f1507, f1538, f193; +sub.f32 f221, f1538, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f1506, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f1505, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f1504, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1503, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r30, %tid.x; +shl.b32 r7, r30, 7; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r30, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1509; +mul.f32 f244, f238, f1509; +mul.f32 f246, f239, f239; +mul.f32 f1502, f238, f238; +sub.f32 f247, f1502, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1508; +mul.f32 f252, f247, f1508; +mul.f32 f1500, f238, f247; +mul.f32 f1501, f239, f249; +sub.f32 f255, f1500, f1501; +mul.f32 f1499, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f1507; +mul.f32 f260, f255, f1507; +mul.f32 f262, f239, f257; +mul.f32 f1498, f238, f255; +sub.f32 f263, f1498, f262; +mul.f32 f1497, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f1506; +mul.f32 f268, f263, f1506; +mul.f32 f270, f239, f265; +mul.f32 f1496, f238, f263; +sub.f32 f271, f1496, f270; +mul.f32 f1495, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f1505; +mul.f32 f276, f271, f1505; +mul.f32 f1493, f238, f271; +mul.f32 f1494, f239, f273; +sub.f32 f279, f1493, f1494; +mul.f32 f1492, f271, f226; +mul.f32 f280, 
f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f1504; +mul.f32 f284, f279, f1504; +mul.f32 f286, f239, f281; +mul.f32 f1491, f238, f279; +sub.f32 f287, f1491, f286; +mul.f32 f1490, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f1503; +mul.f32 f292, f287, f1503; +mul.f32 f294, f239, f289; +mul.f32 f1489, f238, f287; +sub.f32 f295, f1489, f294; +mul.f32 f1488, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1487, f1541, f1523; +mul.f32 f299, f297, f1487; +mul.f32 f300, f295, f1487; +mul.f32 f1485, f238, f295; +mul.f32 f1486, f239, f297; +sub.f32 f303, f1485, f1486; +sub.f32 f1484, f106, f163; +mul.f32 f1483, f295, f1484; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1482, f238, f303; +sub.f32 f311, f1482, f310; +mul.f32 f1481, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f1479, f238, f311; +mul.f32 f1480, f239, f313; +sub.f32 f319, f1479, f1480; +mul.f32 f1478, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1477, f238, f319; +sub.f32 f327, f1477, f326; +mul.f32 f1476, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1475, f238, f327; +sub.f32 f335, f1475, f334; +mul.f32 f1474, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f1472, f238, f335; +mul.f32 f1473, f239, f337; +sub.f32 f343, f1472, f1473; +mul.f32 f1471, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; 
+mul.f32 f350, f239, f345; +mul.f32 f1470, f238, f343; +sub.f32 f351, f1470, f350; +mul.f32 f1469, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f1468, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1562, f1541, f1523; +mul.f32 f1561, f297, f1562; +barrier.sync 0; +and.b32 r11, r7, 130944; +add.s32 r12, r9, r11; +add.f32 f357, f1541, f1523; +sub.f32 f1557, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r45, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f1469, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f1499, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f1497, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f1495, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f1492, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f1490, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f1488, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1557, f300; +sub.f32 f374, f1483, f1561; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f1481, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f1478, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f1476, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f1474, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f1471, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f1468, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r29, r45, 1023; +mad.lo.s32 r13, r29, -120, r12; +ld.shared.v2.f32 
{f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+8192]; +ld.shared.v2.f32 {f397, f398}, [r13+16384]; +ld.shared.v2.f32 {f401, f402}, [r13+24576]; +ld.shared.v2.f32 {f405, f406}, [r13+32768]; +ld.shared.v2.f32 {f409, f410}, [r13+40960]; +ld.shared.v2.f32 {f413, f414}, [r13+49152]; +ld.shared.v2.f32 {f417, f418}, [r13+57344]; +ld.shared.v2.f32 {f421, f422}, [r13+65536]; +ld.shared.v2.f32 {f425, f426}, [r13+73728]; +ld.shared.v2.f32 {f429, f430}, [r13+81920]; +ld.shared.v2.f32 {f433, f434}, [r13+90112]; +ld.shared.v2.f32 {f437, f438}, [r13+98304]; +ld.shared.v2.f32 {f441, f442}, [r13+106496]; +ld.shared.v2.f32 {f445, f446}, [r13+114688]; +ld.shared.v2.f32 {f449, f450}, [r13+122880]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1467, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1466, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1465, f1467, f1466; +sub.f32 f464, f1467, f1466; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f1464, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1463, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1462, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1461, f1463, f1462; +sub.f32 f480, f1463, f1462; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f1460, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f1458, f481, 0f3F3504F3; +mul.f32 f1459, f1460, 0fBF3504F3; +sub.f32 f487, f1458, f1459; +mul.f32 f488, f1460, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1457, f1465, f1461; +sub.f32 f497, f1465, f1461; +add.f32 f498, 
f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1456, f1464, f489; +sub.f32 f501, f1464, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f1455, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f1454, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1453, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1452, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1451, f1453, f1452; +sub.f32 f521, f1453, f1452; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f1450, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1449, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1448, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1447, f1449, f1448; +sub.f32 f537, f1449, f1448; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f1446, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f1444, f538, 0f3F3504F3; +mul.f32 f1445, f1446, 0fBF3504F3; +sub.f32 f544, f1444, f1445; +mul.f32 f545, f1446, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1443, f1451, f1447; +sub.f32 f554, f1451, f1447; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1442, f1450, f546; +sub.f32 f558, f1450, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f1441, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f1440, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1442, 0fBEC3EF15; +mul.f32 f1439, f555, 0f3F6C835E; 
+sub.f32 f569, f1439, f568; +mul.f32 f570, f1442, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f1441, 0fBF3504F3; +mul.f32 f1438, f559, 0f3F3504F3; +sub.f32 f574, f1438, f573; +mul.f32 f575, f1441, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f1436, f563, 0f3EC3EF15; +mul.f32 f1437, f1440, 0fBF6C835E; +sub.f32 f579, f1436, f1437; +mul.f32 f580, f1440, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f1434, f557, 0fBEC3EF15; +mul.f32 f1435, f558, 0fBF6C835E; +sub.f32 f584, f1434, f1435; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f1433, f565, 0fBF6C835E; +sub.f32 f593, f1433, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1432, f1456, f571; +sub.f32 f601, f1456, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1431, f1455, f576; +sub.f32 f605, f1455, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f1430, f1454, f581; +sub.f32 f609, f1454, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, f554; +sub.f32 f1429, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 f616, f500, f584; +add.f32 f1428, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f1427, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1426, f509, f595; +sub.f32 f625, f509, f595; +bfe.u32 r15, r45, 4, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f1432; +mul.f32 f632, f626, f1432; +mul.f32 f634, f627, f627; +mul.f32 f1425, f626, f626; +sub.f32 f635, f1425, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, 
f627, f626, f636; +mul.f32 f639, f637, f1431; +mul.f32 f640, f635, f1431; +mul.f32 f1423, f626, f635; +mul.f32 f1424, f627, f637; +sub.f32 f643, f1423, f1424; +mul.f32 f1422, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f1430; +mul.f32 f648, f643, f1430; +mul.f32 f650, f627, f645; +mul.f32 f1421, f626, f643; +sub.f32 f651, f1421, f650; +mul.f32 f1420, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f1429; +mul.f32 f656, f651, f1429; +mul.f32 f658, f627, f653; +mul.f32 f1419, f626, f651; +sub.f32 f659, f1419, f658; +mul.f32 f1418, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f1428; +mul.f32 f664, f659, f1428; +mul.f32 f1416, f626, f659; +mul.f32 f1417, f627, f661; +sub.f32 f667, f1416, f1417; +mul.f32 f1415, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f1427; +mul.f32 f672, f667, f1427; +mul.f32 f674, f627, f669; +mul.f32 f1414, f626, f667; +sub.f32 f675, f1414, f674; +mul.f32 f1413, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f1426; +mul.f32 f680, f675, f1426; +mul.f32 f682, f627, f677; +mul.f32 f1412, f626, f675; +sub.f32 f683, f1412, f682; +mul.f32 f1411, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1410, f1457, f1443; +mul.f32 f687, f685, f1410; +mul.f32 f688, f683, f1410; +mul.f32 f1408, f626, f683; +mul.f32 f1409, f627, f685; +sub.f32 f691, f1408, f1409; +sub.f32 f1407, f494, f551; +mul.f32 f1406, f683, f1407; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1405, f626, f691; +sub.f32 f699, f1405, f698; +mul.f32 f1404, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f1402, f626, 
f699; +mul.f32 f1403, f627, f701; +sub.f32 f707, f1402, f1403; +mul.f32 f1401, f699, f604; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1400, f626, f707; +sub.f32 f715, f1400, f714; +mul.f32 f1399, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1398, f626, f715; +sub.f32 f723, f1398, f722; +mul.f32 f1397, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f1395, f626, f723; +mul.f32 f1396, f627, f725; +sub.f32 f731, f1395, f1396; +mul.f32 f1394, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1393, f626, f731; +sub.f32 f739, f1393, f738; +mul.f32 f1392, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f1391, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +shl.b32 r39, r45, 7; +shl.b32 r35, r45, 3; +and.b32 r16, r35, 120; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r39, 129024; +add.s32 r19, r17, r18; +mov.u32 r38, %tid.x; +shl.b32 r34, r38, 7; +shl.b32 r31, r38, 3; +sub.f32 f1564, f1457, f1443; +mul.f32 f1563, f685, f1564; +add.f32 f745, f1457, f1443; +sub.f32 f1559, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r48, %tid.x; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f1392, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f1422, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f1420, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f1418, f655; 
+st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f1415, f663; +fma.rn.f32 f756, f661, f614, f664; +st.shared.v2.f32 [r19+640], {f755, f756}; +fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f1413, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f1411, f679; +st.shared.v2.f32 [r19+896], {f760, f759}; +fma.rn.f32 f761, f685, f1559, f688; +sub.f32 f762, f1406, f1563; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f1404, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f1401, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f1399, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f1397, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f1394, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f1391, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +and.b32 r28, r48, 1008; +mad.lo.s32 r20, r28, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+8192]; +ld.shared.v2.f32 {f785, f786}, [r20+16384]; +ld.shared.v2.f32 {f789, f790}, [r20+24576]; +ld.shared.v2.f32 {f793, f794}, [r20+32768]; +ld.shared.v2.f32 {f797, f798}, [r20+40960]; +ld.shared.v2.f32 {f801, f802}, [r20+49152]; +ld.shared.v2.f32 {f805, f806}, [r20+57344]; +ld.shared.v2.f32 {f809, f810}, [r20+65536]; +ld.shared.v2.f32 {f813, f814}, [r20+73728]; +ld.shared.v2.f32 {f817, f818}, [r20+81920]; +ld.shared.v2.f32 {f821, f822}, [r20+90112]; +ld.shared.v2.f32 {f825, f826}, [r20+98304]; +ld.shared.v2.f32 {f829, f830}, [r20+106496]; +ld.shared.v2.f32 {f833, f834}, [r20+114688]; +ld.shared.v2.f32 {f837, 
f838}, [r20+122880]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f1390, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1389, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1388, f1390, f1389; +sub.f32 f852, f1390, f1389; +add.f32 f853, f843, f848; +sub.f32 f855, f843, f848; +sub.f32 f1387, f844, f847; +add.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1386, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1385, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1384, f1386, f1385; +sub.f32 f868, f1386, f1385; +add.f32 f869, f859, f864; +sub.f32 f871, f859, f864; +sub.f32 f1383, f860, f863; +add.f32 f872, f860, f863; +mul.f32 f1381, f869, 0f3F3504F3; +mul.f32 f1382, f1383, 0fBF3504F3; +sub.f32 f875, f1381, f1382; +mul.f32 f876, f1383, 0f3F3504F3; +fma.rn.f32 f877, f869, 0fBF3504F3, f876; +mul.f32 f878, f871, 0fBF3504F3; +mul.f32 f879, f872, 0fBF3504F3; +sub.f32 f880, f878, f879; +add.f32 f881, f878, f879; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1380, f1388, f1384; +sub.f32 f885, f1388, f1384; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1379, f1387, f877; +sub.f32 f889, f1387, f877; +add.f32 f890, f851, f868; +sub.f32 f892, f851, f868; +sub.f32 f1378, f852, f867; +add.f32 f893, f852, f867; +add.f32 f894, f855, f880; +sub.f32 f896, f855, f880; +add.f32 f1377, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1376, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1375, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1374, f1376, f1375; +sub.f32 f909, f1376, f1375; +add.f32 f910, f900, f905; 
+sub.f32 f912, f900, f905; +sub.f32 f1373, f901, f904; +add.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; +add.f32 f1372, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1371, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; +add.f32 f1370, f1372, f1371; +sub.f32 f925, f1372, f1371; +add.f32 f926, f916, f921; +sub.f32 f928, f916, f921; +sub.f32 f1369, f917, f920; +add.f32 f929, f917, f920; +mul.f32 f1367, f926, 0f3F3504F3; +mul.f32 f1368, f1369, 0fBF3504F3; +sub.f32 f932, f1367, f1368; +mul.f32 f933, f1369, 0f3F3504F3; +fma.rn.f32 f934, f926, 0fBF3504F3, f933; +mul.f32 f935, f928, 0fBF3504F3; +mul.f32 f936, f929, 0fBF3504F3; +sub.f32 f937, f935, f936; +add.f32 f938, f935, f936; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1366, f1374, f1370; +sub.f32 f942, f1374, f1370; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1365, f1373, f934; +sub.f32 f946, f1373, f934; +add.f32 f947, f908, f925; +sub.f32 f949, f908, f925; +sub.f32 f1364, f909, f924; +add.f32 f950, f909, f924; +add.f32 f951, f912, f937; +sub.f32 f953, f912, f937; +add.f32 f1363, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1365, 0fBEC3EF15; +mul.f32 f1362, f943, 0f3F6C835E; +sub.f32 f957, f1362, f956; +mul.f32 f958, f1365, 0f3F6C835E; +fma.rn.f32 f959, f943, 0fBEC3EF15, f958; +mul.f32 f961, f1364, 0fBF3504F3; +mul.f32 f1361, f947, 0f3F3504F3; +sub.f32 f962, f1361, f961; +mul.f32 f963, f1364, 0f3F3504F3; +fma.rn.f32 f964, f947, 0fBF3504F3, f963; +mul.f32 f966, f1363, 0fBF6C835E; +mul.f32 f1360, f951, 0f3EC3EF15; +sub.f32 f967, f1360, f966; +mul.f32 f968, f1363, 0f3EC3EF15; +fma.rn.f32 f969, f951, 0fBF6C835E, f968; +mul.f32 f1358, f945, 0fBEC3EF15; +mul.f32 f1359, f946, 0fBF6C835E; +sub.f32 f972, f1358, f1359; +mul.f32 f973, f946, 0fBEC3EF15; +fma.rn.f32 f974, f945, 0fBF6C835E, f973; +mul.f32 f975, f949, 0fBF3504F3; +mul.f32 f976, f950, 
0fBF3504F3; +sub.f32 f977, f975, f976; +add.f32 f978, f975, f976; +mul.f32 f980, f954, 0fBEC3EF15; +mul.f32 f1357, f953, 0fBF6C835E; +sub.f32 f981, f1357, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0fBEC3EF15, f982; +add.f32 f986, f886, f957; +sub.f32 f988, f886, f957; +add.f32 f1356, f1379, f959; +sub.f32 f989, f1379, f959; +add.f32 f990, f890, f962; +sub.f32 f992, f890, f962; +add.f32 f1355, f1378, f964; +sub.f32 f993, f1378, f964; +add.f32 f994, f894, f967; +sub.f32 f996, f894, f967; +add.f32 f1354, f1377, f969; +sub.f32 f997, f1377, f969; +add.f32 f998, f884, f942; +sub.f32 f1000, f884, f942; +sub.f32 f1353, f885, f941; +add.f32 f1001, f885, f941; +add.f32 f1002, f888, f972; +sub.f32 f1004, f888, f972; +add.f32 f1352, f889, f974; +sub.f32 f1005, f889, f974; +add.f32 f1006, f892, f977; +sub.f32 f1008, f892, f977; +add.f32 f1351, f893, f978; +sub.f32 f1009, f893, f978; +add.f32 f1010, f896, f981; +sub.f32 f1012, f896, f981; +add.f32 f1350, f897, f983; +sub.f32 f1013, f897, f983; +and.b32 r21, r48, 768; +bfe.u32 r22, r48, 8, 2; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1014, f1015}, [rd11]; +mul.f32 f1019, f1015, f1356; +mul.f32 f1020, f1014, f1356; +mul.f32 f1022, f1015, f1015; +mul.f32 f1349, f1014, f1014; +sub.f32 f1023, f1349, f1022; +mul.f32 f1024, f1015, f1014; +fma.rn.f32 f1025, f1015, f1014, f1024; +mul.f32 f1027, f1025, f1355; +mul.f32 f1028, f1023, f1355; +mul.f32 f1347, f1014, f1023; +mul.f32 f1348, f1015, f1025; +sub.f32 f1031, f1347, f1348; +mul.f32 f1346, f1023, f990; +mul.f32 f1032, f1014, f1025; +fma.rn.f32 f1033, f1015, f1023, f1032; +mul.f32 f1035, f1033, f1354; +mul.f32 f1036, f1031, f1354; +mul.f32 f1344, f1014, f1031; +mul.f32 f1345, f1015, f1033; +sub.f32 f1039, f1344, f1345; +mul.f32 f1343, f1031, f994; +mul.f32 f1040, f1014, f1033; +fma.rn.f32 f1041, f1015, f1031, f1040; +mul.f32 f1043, f1041, f1353; +mul.f32 f1044, f1039, f1353; +mul.f32 f1046, f1015, f1041; +mul.f32 
f1342, f1014, f1039; +sub.f32 f1047, f1342, f1046; +mul.f32 f1341, f1039, f998; +mul.f32 f1048, f1014, f1041; +fma.rn.f32 f1049, f1015, f1039, f1048; +mul.f32 f1051, f1049, f1352; +mul.f32 f1052, f1047, f1352; +mul.f32 f1339, f1014, f1047; +mul.f32 f1340, f1015, f1049; +sub.f32 f1055, f1339, f1340; +mul.f32 f1338, f1047, f1002; +mul.f32 f1056, f1014, f1049; +fma.rn.f32 f1057, f1015, f1047, f1056; +mul.f32 f1059, f1057, f1351; +mul.f32 f1060, f1055, f1351; +mul.f32 f1062, f1015, f1057; +mul.f32 f1337, f1014, f1055; +sub.f32 f1063, f1337, f1062; +mul.f32 f1336, f1055, f1006; +mul.f32 f1064, f1014, f1057; +fma.rn.f32 f1065, f1015, f1055, f1064; +mul.f32 f1067, f1065, f1350; +mul.f32 f1068, f1063, f1350; +mul.f32 f1070, f1015, f1065; +mul.f32 f1335, f1014, f1063; +sub.f32 f1071, f1335, f1070; +mul.f32 f1334, f1063, f1010; +mul.f32 f1072, f1014, f1065; +fma.rn.f32 f1073, f1015, f1063, f1072; +sub.f32 f1333, f1380, f1366; +mul.f32 f1075, f1073, f1333; +mul.f32 f1076, f1071, f1333; +sub.f32 f1332, f882, f939; +mul.f32 f1330, f1014, f1071; +mul.f32 f1331, f1015, f1073; +sub.f32 f1079, f1330, f1331; +mul.f32 f1329, f1071, f1332; +mul.f32 f1080, f1014, f1073; +fma.rn.f32 f1081, f1015, f1071, f1080; +mul.f32 f1083, f1081, f989; +mul.f32 f1084, f1079, f989; +mul.f32 f1086, f1015, f1081; +mul.f32 f1328, f1014, f1079; +sub.f32 f1087, f1328, f1086; +mul.f32 f1327, f1079, f988; +mul.f32 f1088, f1014, f1081; +fma.rn.f32 f1089, f1015, f1079, f1088; +mul.f32 f1091, f1089, f993; +mul.f32 f1092, f1087, f993; +mul.f32 f1325, f1014, f1087; +mul.f32 f1326, f1015, f1089; +sub.f32 f1095, f1325, f1326; +mul.f32 f1324, f1087, f992; +mul.f32 f1096, f1014, f1089; +fma.rn.f32 f1097, f1015, f1087, f1096; +mul.f32 f1099, f1097, f997; +mul.f32 f1100, f1095, f997; +mul.f32 f1322, f1014, f1095; +mul.f32 f1323, f1015, f1097; +sub.f32 f1103, f1322, f1323; +mul.f32 f1321, f1095, f996; +mul.f32 f1104, f1014, f1097; +fma.rn.f32 f1105, f1015, f1095, f1104; +mul.f32 f1107, f1105, f1001; +mul.f32 f1108, 
f1103, f1001; +mul.f32 f1110, f1015, f1105; +mul.f32 f1320, f1014, f1103; +sub.f32 f1111, f1320, f1110; +mul.f32 f1319, f1103, f1000; +mul.f32 f1112, f1014, f1105; +fma.rn.f32 f1113, f1015, f1103, f1112; +mul.f32 f1115, f1113, f1005; +mul.f32 f1116, f1111, f1005; +mul.f32 f1317, f1014, f1111; +mul.f32 f1318, f1015, f1113; +sub.f32 f1119, f1317, f1318; +mul.f32 f1316, f1111, f1004; +mul.f32 f1120, f1014, f1113; +fma.rn.f32 f1121, f1015, f1111, f1120; +mul.f32 f1123, f1121, f1009; +mul.f32 f1124, f1119, f1009; +mul.f32 f1126, f1015, f1121; +mul.f32 f1315, f1014, f1119; +sub.f32 f1127, f1315, f1126; +mul.f32 f1314, f1014, f986; +mul.f32 f1128, f1014, f1121; +mul.f32 f1313, f1119, f1008; +fma.rn.f32 f1129, f1015, f1119, f1128; +mul.f32 f1130, f1127, f1012; +mul.f32 f1131, f1129, f1013; +mul.f32 f1132, f1127, f1013; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 3; +and.b32 r23, r32, 2040; +add.s32 r24, r9, r23; +mov.u32 r37, %tid.x; +shl.b32 r36, r37, 7; +barrier.sync 0; +and.b32 r25, r36, 98304; +add.s32 r26, r24, r25; +mov.u32 r41, %tid.x; +and.b32 r40, r41, 768; +add.f32 f1133, f1380, f1366; +sub.f32 f1560, f882, f939; +add.f32 f1134, f882, f939; +st.shared.v2.f32 [r26], {f1134, f1133}; +mov.u32 r44, %tid.x; +and.b32 r43, r44, 768; +fma.rn.f32 f1135, f1015, f986, f1020; +sub.f32 f1136, f1314, f1019; +st.shared.v2.f32 [r26+2048], {f1136, f1135}; +fma.rn.f32 f1137, f1025, f990, f1028; +sub.f32 f1138, f1346, f1027; +st.shared.v2.f32 [r26+4096], {f1138, f1137}; +fma.rn.f32 f1139, f1033, f994, f1036; +sub.f32 f1140, f1343, f1035; +st.shared.v2.f32 [r26+6144], {f1140, f1139}; +fma.rn.f32 f1141, f1041, f998, f1044; +sub.f32 f1142, f1341, f1043; +st.shared.v2.f32 [r26+8192], {f1142, f1141}; +sub.f32 f1143, f1338, f1051; +fma.rn.f32 f1144, f1049, f1002, f1052; +st.shared.v2.f32 [r26+10240], {f1143, f1144}; +fma.rn.f32 f1145, f1057, f1006, f1060; +sub.f32 f1146, f1336, f1059; +st.shared.v2.f32 [r26+12288], {f1146, f1145}; +fma.rn.f32 f1147, f1065, f1010, f1068; +sub.f32 f1148, 
f1334, f1067; +st.shared.v2.f32 [r26+14336], {f1148, f1147}; +fma.rn.f32 f1149, f1073, f1560, f1076; +sub.f32 f1150, f1329, f1075; +st.shared.v2.f32 [r26+16384], {f1150, f1149}; +fma.rn.f32 f1151, f1081, f988, f1084; +sub.f32 f1152, f1327, f1083; +st.shared.v2.f32 [r26+18432], {f1152, f1151}; +fma.rn.f32 f1153, f1089, f992, f1092; +sub.f32 f1154, f1324, f1091; +st.shared.v2.f32 [r26+20480], {f1154, f1153}; +fma.rn.f32 f1155, f1097, f996, f1100; +sub.f32 f1156, f1321, f1099; +st.shared.v2.f32 [r26+22528], {f1156, f1155}; +fma.rn.f32 f1157, f1105, f1000, f1108; +sub.f32 f1158, f1319, f1107; +st.shared.v2.f32 [r26+24576], {f1158, f1157}; +fma.rn.f32 f1159, f1113, f1004, f1116; +sub.f32 f1160, f1316, f1115; +st.shared.v2.f32 [r26+26624], {f1160, f1159}; +fma.rn.f32 f1161, f1121, f1008, f1124; +sub.f32 f1162, f1313, f1123; +st.shared.v2.f32 [r26+28672], {f1162, f1161}; +fma.rn.f32 f1163, f1129, f1012, f1132; +sub.f32 f1164, f1130, f1131; +st.shared.v2.f32 [r26+30720], {f1164, f1163}; +barrier.sync 0; +mad.lo.s32 r27, r43, -120, r26; +ld.shared.v2.f32 {f1165, f1166}, [r27]; +ld.shared.v2.f32 {f1169, f1170}, [r27+8192]; +ld.shared.v2.f32 {f1173, f1174}, [r27+16384]; +ld.shared.v2.f32 {f1177, f1178}, [r27+24576]; +ld.shared.v2.f32 {f1181, f1182}, [r27+32768]; +ld.shared.v2.f32 {f1185, f1186}, [r27+40960]; +ld.shared.v2.f32 {f1189, f1190}, [r27+49152]; +ld.shared.v2.f32 {f1193, f1194}, [r27+57344]; +ld.shared.v2.f32 {f1197, f1198}, [r27+65536]; +ld.shared.v2.f32 {f1201, f1202}, [r27+73728]; +ld.shared.v2.f32 {f1205, f1206}, [r27+81920]; +ld.shared.v2.f32 {f1209, f1210}, [r27+90112]; +ld.shared.v2.f32 {f1213, f1214}, [r27+98304]; +ld.shared.v2.f32 {f1217, f1218}, [r27+106496]; +ld.shared.v2.f32 {f1221, f1222}, [r27+114688]; +ld.shared.v2.f32 {f1225, f1226}, [r27+122880]; +add.f32 f1229, f1165, f1197; +sub.f32 f1231, f1165, f1197; +add.f32 f1312, f1166, f1198; +sub.f32 f1232, f1166, f1198; +add.f32 f1233, f1181, f1213; +sub.f32 f1235, f1181, f1213; +add.f32 f1311, f1182, 
f1214; +sub.f32 f1236, f1182, f1214; +add.f32 f1237, f1169, f1201; +sub.f32 f1239, f1169, f1201; +add.f32 f1310, f1170, f1202; +sub.f32 f1240, f1170, f1202; +add.f32 f1241, f1185, f1217; +sub.f32 f1243, f1185, f1217; +add.f32 f1309, f1186, f1218; +sub.f32 f1244, f1186, f1218; +add.f32 f1245, f1173, f1205; +sub.f32 f1247, f1173, f1205; +add.f32 f1308, f1174, f1206; +sub.f32 f1248, f1174, f1206; +add.f32 f1249, f1189, f1221; +sub.f32 f1251, f1189, f1221; +add.f32 f1307, f1190, f1222; +sub.f32 f1252, f1190, f1222; +add.f32 f1253, f1177, f1209; +sub.f32 f1255, f1177, f1209; +add.f32 f1306, f1178, f1210; +sub.f32 f1256, f1178, f1210; +add.f32 f1257, f1193, f1225; +sub.f32 f1259, f1193, f1225; +add.f32 f1305, f1194, f1226; +sub.f32 f1260, f1194, f1226; +add.f32 %1, f1312, f1311; +add.f32 %0, f1229, f1233; +add.f32 %2, f1237, f1241; +add.f32 %3, f1310, f1309; +add.f32 %4, f1245, f1249; +add.f32 %5, f1308, f1307; +add.f32 %6, f1253, f1257; +add.f32 %7, f1306, f1305; +sub.f32 %9, f1232, f1235; +add.f32 %8, f1231, f1236; +sub.f32 %11, f1240, f1243; +add.f32 %10, f1239, f1244; +sub.f32 %13, f1248, f1251; +add.f32 %12, f1247, f1252; +add.f32 %14, f1255, f1260; +sub.f32 %15, f1256, f1259; +sub.f32 %17, f1312, f1311; +sub.f32 %16, f1229, f1233; +sub.f32 %19, f1310, f1309; +sub.f32 %18, f1237, f1241; +sub.f32 %21, f1308, f1307; +sub.f32 %20, f1245, f1249; +sub.f32 %23, f1306, f1305; +sub.f32 %22, f1253, f1257; +add.f32 %25, f1232, f1235; +sub.f32 %24, f1231, f1236; +add.f32 %27, f1240, f1243; +sub.f32 %26, f1239, f1244; +add.f32 %29, f1248, f1251; +sub.f32 %28, f1247, f1252; +add.f32 %31, f1256, f1259; +sub.f32 %30, f1255, f1260; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), 
"=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_16384), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<116, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2717>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2715, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2713, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2712, f2715, f2713; +sub.f32 f140, f2715, f2713; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2711, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2708, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2706, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2705, 
f2708, f2706; +sub.f32 f156, f2708, f2706; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2704, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2704, 0fBF3504F3; +mul.f32 f2703, f157, 0f3F3504F3; +sub.f32 f163, f2703, f162; +mul.f32 f164, f2704, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2702, f2712, f2705; +sub.f32 f173, f2712, f2705; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2701, f2711, f165; +sub.f32 f177, f2711, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2700, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2699, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2697, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2694, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2693, f2697, f2694; +sub.f32 f197, f2697, f2694; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2692, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2690, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2688, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2687, f2690, f2688; +sub.f32 f213, f2690, f2688; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2686, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2686, 0fBF3504F3; +mul.f32 f2685, f214, 0f3F3504F3; +sub.f32 f220, f2685, f219; +mul.f32 f221, f2686, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; 
+sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2684, f2693, f2687; +sub.f32 f230, f2693, f2687; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2683, f2692, f222; +sub.f32 f234, f2692, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2682, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2681, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2679, f231, 0f3F6C835E; +mul.f32 f2680, f2683, 0fBEC3EF15; +sub.f32 f245, f2679, f2680; +mul.f32 f246, f2683, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2677, f235, 0f3F3504F3; +mul.f32 f2678, f2682, 0fBF3504F3; +sub.f32 f250, f2677, f2678; +mul.f32 f251, f2682, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2675, f239, 0f3EC3EF15; +mul.f32 f2676, f2681, 0fBF6C835E; +sub.f32 f255, f2675, f2676; +mul.f32 f256, f2681, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2673, f233, 0fBEC3EF15; +mul.f32 f2674, f234, 0fBF6C835E; +sub.f32 f260, f2673, f2674; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2671, f241, 0fBF6C835E; +mul.f32 f2672, f242, 0fBEC3EF15; +sub.f32 f269, f2671, f2672; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2670, f2702, f2684; +sub.f32 f275, f2702, f2684; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2669, f2701, f247; +sub.f32 f279, f2701, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2668, f2700, f252; +sub.f32 f283, f2700, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2667, f2699, f257; +sub.f32 f287, f2699, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2666, 
f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2665, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2664, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2663, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2660, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2658, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2657, f2660, f2658; +sub.f32 f315, f2660, f2658; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2656, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2654, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2651, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2650, f2654, f2651; +sub.f32 f331, f2654, f2651; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2649, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2647, f332, 0f3F3504F3; +mul.f32 f2648, f2649, 0fBF3504F3; +sub.f32 f338, f2647, f2648; +mul.f32 f339, f2649, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2646, f2657, f2650; +sub.f32 f348, f2657, f2650; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2645, f2656, f340; +sub.f32 f352, f2656, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2644, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2643, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; 
+sub.f32 f363, %73, %105; +add.f32 f2641, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2639, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2638, f2641, f2639; +sub.f32 f372, f2641, f2639; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2637, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2634, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2633, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2632, f2634, f2633; +sub.f32 f388, f2634, f2633; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2631, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2629, f389, 0f3F3504F3; +mul.f32 f2630, f2631, 0fBF3504F3; +sub.f32 f395, f2629, f2630; +mul.f32 f396, f2631, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2628, f2638, f2632; +sub.f32 f405, f2638, f2632; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2627, f2637, f397; +sub.f32 f409, f2637, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2626, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2625, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2627, 0fBEC3EF15; +mul.f32 f2624, f406, 0f3F6C835E; +sub.f32 f420, f2624, f419; +mul.f32 f421, f2627, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2626, 0fBF3504F3; +mul.f32 f2623, f410, 0f3F3504F3; +sub.f32 f425, f2623, f424; +mul.f32 f426, f2626, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2621, f414, 0f3EC3EF15; +mul.f32 f2622, f2625, 0fBF6C835E; +sub.f32 
f430, f2621, f2622; +mul.f32 f431, f2625, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2619, f408, 0fBEC3EF15; +mul.f32 f2620, f409, 0fBF6C835E; +sub.f32 f435, f2619, f2620; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2618, f416, 0fBF6C835E; +sub.f32 f444, f2618, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2617, f2646, f2628; +sub.f32 f450, f2646, f2628; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2616, f2645, f422; +sub.f32 f454, f2645, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2615, f2644, f427; +sub.f32 f458, f2644, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2614, f2643, f432; +sub.f32 f462, f2643, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2613, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2612, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2611, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2610, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2616, 0fBE47C5C2; +mul.f32 f2609, f451, 0f3F7B14BE; +sub.f32 f481, f2609, f480; +mul.f32 f482, f2616, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2615, 0fBEC3EF15; +mul.f32 f2608, f455, 0f3F6C835E; +sub.f32 f486, f2608, f485; +mul.f32 f487, f2615, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2614, 0fBF0E39DA; +mul.f32 f2607, f459, 0f3F54DB31; +sub.f32 f491, f2607, f490; +mul.f32 f492, f2614, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2613, 0fBF3504F3; +mul.f32 
f2606, f463, 0f3F3504F3; +sub.f32 f496, f2606, f495; +mul.f32 f497, f2613, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2604, f467, 0f3F0E39DA; +mul.f32 f2605, f2612, 0fBF54DB31; +sub.f32 f501, f2604, f2605; +mul.f32 f502, f2612, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2602, f471, 0f3EC3EF15; +mul.f32 f2603, f2611, 0fBF6C835E; +sub.f32 f506, f2602, f2603; +mul.f32 f507, f2611, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2600, f475, 0f3E47C5C2; +mul.f32 f2601, f2610, 0fBF7B14BE; +sub.f32 f511, f2600, f2601; +mul.f32 f512, f2610, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2598, f453, 0fBE47C5C2; +mul.f32 f2599, f454, 0fBF7B14BE; +sub.f32 f516, f2598, f2599; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2597, f457, 0fBEC3EF15; +sub.f32 f521, f2597, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2596, f461, 0fBF0E39DA; +sub.f32 f526, f2596, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2594, f469, 0fBF54DB31; +mul.f32 f2595, f470, 0fBF0E39DA; +sub.f32 f535, f2594, f2595; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2593, f473, 0fBF6C835E; +sub.f32 f540, f2593, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2592, f477, 0fBF7B14BE; +sub.f32 f545, f2592, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2591, f2670, f2617; +sub.f32 f551, f2670, f2617; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2590, f2669, f483; +sub.f32 
f555, f2669, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2589, f2668, f488; +sub.f32 f559, f2668, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2588, f2667, f493; +sub.f32 f563, f2667, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2587, f2666, f498; +sub.f32 f567, f2666, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f2586, f2665, f503; +sub.f32 f571, f2665, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f2585, f2664, f508; +sub.f32 f575, f2664, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f2584, f2663, f513; +sub.f32 f579, f2663, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f2583, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f2582, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f2581, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f2580, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f2579, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2578, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2577, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2576, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f2590; +mul.f32 f2575, f612, f552; +sub.f32 f618, f2575, f617; +mul.f32 f619, f612, f2590; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f2573, f612, f612; +mul.f32 f2574, f613, f613; +sub.f32 f623, f2573, f2574; +mul.f32 f624, f613, f612; 
+fma.rn.f32 f625, f613, f612, f624; +mul.f32 f2571, f623, f556; +mul.f32 f2572, f625, f2589; +sub.f32 f628, f2571, f2572; +mul.f32 f629, f623, f2589; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f2569, f612, f623; +mul.f32 f2570, f613, f625; +sub.f32 f633, f2569, f2570; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f2567, f633, f560; +mul.f32 f2568, f635, f2588; +sub.f32 f638, f2567, f2568; +mul.f32 f639, f633, f2588; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f2566, f612, f633; +sub.f32 f643, f2566, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f2587; +mul.f32 f2565, f643, f564; +sub.f32 f648, f2565, f647; +mul.f32 f649, f643, f2587; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f2564, f612, f643; +sub.f32 f653, f2564, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f2586; +mul.f32 f2563, f653, f568; +sub.f32 f658, f2563, f657; +mul.f32 f659, f653, f2586; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f2562, f612, f653; +sub.f32 f663, f2562, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f2560, f663, f572; +mul.f32 f2561, f665, f2585; +sub.f32 f668, f2560, f2561; +mul.f32 f669, f663, f2585; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f2558, f612, f663; +mul.f32 f2559, f613, f665; +sub.f32 f673, f2558, f2559; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f2556, f673, f576; +mul.f32 f2557, f675, f2584; +sub.f32 f678, f2556, f2557; +mul.f32 f679, f673, f2584; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f2554, f612, f673; +mul.f32 f2555, f613, f675; +sub.f32 f683, f2554, f2555; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f2583; +mul.f32 f2553, f683, f580; +sub.f32 f688, f2553, f687; +mul.f32 f689, f683, f2583; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 
f2552, f612, f683; +sub.f32 f693, f2552, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f2582; +mul.f32 f2551, f693, f584; +sub.f32 f698, f2551, f697; +mul.f32 f699, f693, f2582; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f2550, f612, f693; +sub.f32 f703, f2550, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f2581; +mul.f32 f2549, f703, f588; +sub.f32 f708, f2549, f707; +mul.f32 f709, f703, f2581; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f2547, f612, f703; +mul.f32 f2548, f613, f705; +sub.f32 f713, f2547, f2548; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f2545, f713, f592; +mul.f32 f2546, f715, f2580; +sub.f32 f718, f2545, f2546; +mul.f32 f719, f713, f2580; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f2543, f612, f713; +mul.f32 f2544, f613, f715; +sub.f32 f723, f2543, f2544; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f2541, f723, f596; +mul.f32 f2542, f725, f2579; +sub.f32 f728, f2541, f2542; +mul.f32 f729, f723, f2579; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f2540, f612, f723; +sub.f32 f733, f2540, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f2578; +mul.f32 f2539, f733, f600; +sub.f32 f738, f2539, f737; +mul.f32 f739, f733, f2578; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f2538, f612, f733; +sub.f32 f743, f2538, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f2577; +mul.f32 f2537, f743, f604; +sub.f32 f748, f2537, f747; +mul.f32 f749, f743, f2577; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f2536, f612, f743; +sub.f32 f753, f2536, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f2576; +mul.f32 f2535, f753, f608; +sub.f32 f758, f2535, f757; +mul.f32 f759, f753, f2576; 
+fma.rn.f32 f760, f755, f608, f759; +mul.f32 f2533, f612, f753; +mul.f32 f2534, f613, f755; +sub.f32 f763, f2533, f2534; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f2531, f763, f550; +mul.f32 f2532, f765, f551; +sub.f32 f768, f2531, f2532; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f2529, f612, f763; +mul.f32 f2530, f613, f765; +sub.f32 f773, f2529, f2530; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f2528, f773, f554; +sub.f32 f778, f2528, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f2527, f612, f773; +sub.f32 f783, f2527, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f2526, f783, f558; +sub.f32 f788, f2526, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f2525, f612, f783; +sub.f32 f793, f2525, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f2524, f793, f562; +sub.f32 f798, f2524, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f2523, f612, f793; +sub.f32 f803, f2523, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f2521, f803, f566; +mul.f32 f2522, f805, f567; +sub.f32 f808, f2521, f2522; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f2519, f612, f803; +mul.f32 f2520, f613, f805; +sub.f32 f813, f2519, f2520; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f2517, f813, f570; +mul.f32 f2518, f815, f571; +sub.f32 f818, f2517, f2518; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f2515, f612, f813; +mul.f32 f2516, f613, f815; +sub.f32 f823, f2515, f2516; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f2514, f823, 
f574; +sub.f32 f828, f2514, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f2513, f612, f823; +sub.f32 f833, f2513, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f2512, f833, f578; +sub.f32 f838, f2512, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f2511, f612, f833; +sub.f32 f843, f2511, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f2510, f843, f582; +sub.f32 f848, f2510, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f2508, f612, f843; +mul.f32 f2509, f613, f845; +sub.f32 f853, f2508, f2509; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f2506, f853, f586; +mul.f32 f2507, f855, f587; +sub.f32 f858, f2506, f2507; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f2504, f612, f853; +mul.f32 f2505, f613, f855; +sub.f32 f863, f2504, f2505; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f2502, f863, f590; +mul.f32 f2503, f865, f591; +sub.f32 f868, f2502, f2503; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f2501, f612, f863; +sub.f32 f873, f2501, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f2500, f873, f594; +sub.f32 f878, f2500, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f2499, f612, f873; +sub.f32 f883, f2499, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f2498, f883, f598; +sub.f32 f888, f2498, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f2497, f612, f883; +sub.f32 f893, f2497, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, 
f894; +mul.f32 f2495, f893, f602; +mul.f32 f2496, f895, f603; +sub.f32 f898, f2495, f2496; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f2493, f612, f893; +mul.f32 f2494, f613, f895; +sub.f32 f903, f2493, f2494; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f2491, f903, f606; +mul.f32 f2492, f905, f607; +sub.f32 f908, f2491, f2492; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f2489, f612, f903; +mul.f32 f2490, f613, f905; +sub.f32 f913, f2489, f2490; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f2488, f913, f610; +sub.f32 f918, f2488, f917; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65408; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +and.b32 r23, r32, 511; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+2048]; +ld.shared.f32 f923, [r13+4096]; +ld.shared.f32 f924, [r13+6144]; +ld.shared.f32 f925, [r13+8192]; +ld.shared.f32 f926, [r13+10240]; +ld.shared.f32 f927, [r13+12288]; +ld.shared.f32 f928, [r13+14336]; +ld.shared.f32 f929, [r13+16384]; +ld.shared.f32 f930, [r13+18432]; +ld.shared.f32 f931, [r13+20480]; +ld.shared.f32 f932, [r13+22528]; +ld.shared.f32 f933, [r13+24576]; +ld.shared.f32 f934, [r13+26624]; +ld.shared.f32 f935, [r13+28672]; +ld.shared.f32 f936, [r13+30720]; +ld.shared.f32 f937, [r13+32768]; +ld.shared.f32 f938, 
[r13+34816]; +ld.shared.f32 f939, [r13+36864]; +ld.shared.f32 f940, [r13+38912]; +ld.shared.f32 f941, [r13+40960]; +ld.shared.f32 f942, [r13+43008]; +ld.shared.f32 f943, [r13+45056]; +ld.shared.f32 f944, [r13+47104]; +ld.shared.f32 f945, [r13+49152]; +ld.shared.f32 f946, [r13+51200]; +ld.shared.f32 f947, [r13+53248]; +ld.shared.f32 f948, [r13+55296]; +ld.shared.f32 f949, [r13+57344]; +ld.shared.f32 f950, [r13+59392]; +ld.shared.f32 f951, [r13+61440]; +ld.shared.f32 f952, [r13+63488]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2591, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+2048]; +ld.shared.f32 f955, [r13+4096]; +ld.shared.f32 f956, [r13+6144]; +ld.shared.f32 f957, [r13+8192]; +ld.shared.f32 f958, [r13+10240]; +ld.shared.f32 f959, [r13+12288]; +ld.shared.f32 f960, [r13+14336]; +ld.shared.f32 f961, [r13+16384]; +ld.shared.f32 f962, [r13+18432]; +ld.shared.f32 f963, [r13+20480]; +ld.shared.f32 f964, [r13+22528]; +ld.shared.f32 f965, [r13+24576]; +ld.shared.f32 f966, [r13+26624]; +ld.shared.f32 f967, [r13+28672]; +ld.shared.f32 f968, [r13+30720]; +ld.shared.f32 f969, [r13+32768]; +ld.shared.f32 f970, [r13+34816]; +ld.shared.f32 f971, [r13+36864]; +ld.shared.f32 f972, [r13+38912]; +ld.shared.f32 f973, [r13+40960]; +ld.shared.f32 f974, [r13+43008]; +ld.shared.f32 f975, [r13+45056]; +ld.shared.f32 f976, [r13+47104]; +ld.shared.f32 f977, [r13+49152]; +ld.shared.f32 f978, [r13+51200]; +ld.shared.f32 f979, [r13+53248]; +ld.shared.f32 f980, [r13+55296]; +ld.shared.f32 f981, [r13+57344]; +ld.shared.f32 f982, [r13+59392]; +ld.shared.f32 f983, [r13+61440]; 
+ld.shared.f32 f984, [r13+63488]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2487, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2486, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2485, f2487, f2486; +sub.f32 f996, f2487, f2486; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f2484, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2483, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2482, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2481, f2483, f2482; +sub.f32 f1012, f2483, f2482; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f2480, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f2480, 0fBF3504F3; +mul.f32 f2479, f1013, 0f3F3504F3; +sub.f32 f1019, f2479, f1018; +mul.f32 f1020, f2480, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2478, f2485, f2481; +sub.f32 f1029, f2485, f2481; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2477, f2484, f1021; +sub.f32 f1033, f2484, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f2476, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f2475, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2474, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2473, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, 
f1046; +add.f32 f2472, f2474, f2473; +sub.f32 f1053, f2474, f2473; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f2471, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2470, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2469, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2468, f2470, f2469; +sub.f32 f1069, f2470, f2469; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f2467, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f2467, 0fBF3504F3; +mul.f32 f2466, f1070, 0f3F3504F3; +sub.f32 f1076, f2466, f1075; +mul.f32 f1077, f2467, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2465, f2472, f2468; +sub.f32 f1086, f2472, f2468; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2464, f2471, f1078; +sub.f32 f1090, f2471, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f2463, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f2462, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2460, f1087, 0f3F6C835E; +mul.f32 f2461, f2464, 0fBEC3EF15; +sub.f32 f1101, f2460, f2461; +mul.f32 f1102, f2464, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f2458, f1091, 0f3F3504F3; +mul.f32 f2459, f2463, 0fBF3504F3; +sub.f32 f1106, f2458, f2459; +mul.f32 f1107, f2463, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f2462, 0fBF6C835E; +mul.f32 f2457, f1095, 0f3EC3EF15; +sub.f32 f1111, f2457, f1110; +mul.f32 f1112, f2462, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, 
f1090, 0fBF6C835E; +mul.f32 f2456, f1089, 0fBEC3EF15; +sub.f32 f1116, f2456, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f2454, f1097, 0fBF6C835E; +mul.f32 f2455, f1098, 0fBEC3EF15; +sub.f32 f1125, f2454, f2455; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2453, f2478, f2465; +sub.f32 f1131, f2478, f2465; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2452, f2477, f1103; +sub.f32 f1135, f2477, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2451, f2476, f1108; +sub.f32 f1139, f2476, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f2450, f2475, f1113; +sub.f32 f1143, f2475, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f2449, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f2448, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f2447, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2446, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2445, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2444, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2443, f2445, f2444; +sub.f32 f1171, f2445, f2444; +add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f2442, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2441, f958, f974; +sub.f32 f1179, 
f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2440, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2439, f2441, f2440; +sub.f32 f1187, f2441, f2440; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f2438, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f2438, 0fBF3504F3; +mul.f32 f2437, f1188, 0f3F3504F3; +sub.f32 f1194, f2437, f1193; +mul.f32 f1195, f2438, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 f1198, f1191, 0fBF3504F3; +sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2436, f2443, f2439; +sub.f32 f1204, f2443, f2439; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2435, f2442, f1196; +sub.f32 f1208, f2442, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f2434, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f2433, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2432, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2431, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2430, f2432, f2431; +sub.f32 f1228, f2432, f2431; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f2429, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2428, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2427, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2426, f2428, f2427; +sub.f32 f1244, f2428, f2427; +add.f32 f1245, f1235, f1240; 
+sub.f32 f1247, f1235, f1240; +sub.f32 f2425, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f2425, 0fBF3504F3; +mul.f32 f2424, f1245, 0f3F3504F3; +sub.f32 f1251, f2424, f1250; +mul.f32 f1252, f2425, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2423, f2430, f2426; +sub.f32 f1261, f2430, f2426; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2422, f2429, f1253; +sub.f32 f1265, f2429, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f2421, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f2420, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2418, f1262, 0f3F6C835E; +mul.f32 f2419, f2422, 0fBEC3EF15; +sub.f32 f1276, f2418, f2419; +mul.f32 f1277, f2422, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f2416, f1266, 0f3F3504F3; +mul.f32 f2417, f2421, 0fBF3504F3; +sub.f32 f1281, f2416, f2417; +mul.f32 f1282, f2421, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f2414, f1270, 0f3EC3EF15; +mul.f32 f2415, f2420, 0fBF6C835E; +sub.f32 f1286, f2414, f2415; +mul.f32 f1287, f2420, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f2412, f1264, 0fBEC3EF15; +mul.f32 f2413, f1265, 0fBF6C835E; +sub.f32 f1291, f2412, f2413; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f2410, f1272, 0fBF6C835E; +mul.f32 f2411, f1273, 0fBEC3EF15; +sub.f32 f1300, f2410, f2411; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2409, f2436, f2423; 
+sub.f32 f1306, f2436, f2423; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2408, f2435, f1278; +sub.f32 f1310, f2435, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2407, f2434, f1283; +sub.f32 f1314, f2434, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f2406, f2433, f1288; +sub.f32 f1318, f2433, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f2405, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f2404, f1208, f1293; +sub.f32 f1326, f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f2403, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2402, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2408, 0fBE47C5C2; +mul.f32 f2401, f1307, 0f3F7B14BE; +sub.f32 f1337, f2401, f1336; +mul.f32 f1338, f2408, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f2407, 0fBEC3EF15; +mul.f32 f2400, f1311, 0f3F6C835E; +sub.f32 f1342, f2400, f1341; +mul.f32 f1343, f2407, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f2398, f1315, 0f3F54DB31; +mul.f32 f2399, f2406, 0fBF0E39DA; +sub.f32 f1347, f2398, f2399; +mul.f32 f1348, f2406, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f2396, f1319, 0f3F3504F3; +mul.f32 f2397, f2405, 0fBF3504F3; +sub.f32 f1352, f2396, f2397; +mul.f32 f1353, f2405, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f2394, f1323, 0f3F0E39DA; +mul.f32 f2395, f2404, 0fBF54DB31; +sub.f32 f1357, f2394, f2395; +mul.f32 f1358, f2404, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f2392, f1327, 0f3EC3EF15; +mul.f32 f2393, f2403, 0fBF6C835E; +sub.f32 f1362, f2392, f2393; +mul.f32 f1363, f2403, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f2402, 0fBF7B14BE; +mul.f32 
f2391, f1331, 0f3E47C5C2; +sub.f32 f1367, f2391, f1366; +mul.f32 f1368, f2402, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f2390, f1309, 0fBE47C5C2; +sub.f32 f1372, f2390, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f2389, f1313, 0fBEC3EF15; +sub.f32 f1377, f2389, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f2388, f1317, 0fBF0E39DA; +sub.f32 f1382, f2388, f1381; +mul.f32 f1383, f1318, 0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f2387, f1325, 0fBF54DB31; +sub.f32 f1391, f2387, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f2386, f1329, 0fBF6C835E; +sub.f32 f1396, f2386, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f2385, f1333, 0fBF7B14BE; +sub.f32 f1401, f2385, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2384, f2453, f2409; +sub.f32 f1407, f2453, f2409; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2383, f2452, f1339; +sub.f32 f1411, f2452, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2382, f2451, f1344; +sub.f32 f1415, f2451, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2381, f2450, f1349; +sub.f32 f1419, f2450, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2380, f2449, f1354; +sub.f32 f1423, f2449, f1354; +add.f32 f1424, f1148, f1357; +sub.f32 f1426, f1148, 
f1357; +add.f32 f2379, f2448, f1359; +sub.f32 f1427, f2448, f1359; +add.f32 f1428, f1152, f1362; +sub.f32 f1430, f1152, f1362; +add.f32 f2378, f2447, f1364; +sub.f32 f1431, f2447, f1364; +add.f32 f1432, f1156, f1367; +sub.f32 f1434, f1156, f1367; +add.f32 f2377, f2446, f1369; +sub.f32 f1435, f2446, f1369; +add.f32 f1436, f1130, f1306; +sub.f32 f1438, f1130, f1306; +sub.f32 f2376, f1131, f1305; +add.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1372; +sub.f32 f1442, f1134, f1372; +add.f32 f2375, f1135, f1374; +sub.f32 f1443, f1135, f1374; +add.f32 f1444, f1138, f1377; +sub.f32 f1446, f1138, f1377; +add.f32 f2374, f1139, f1379; +sub.f32 f1447, f1139, f1379; +add.f32 f1448, f1142, f1382; +sub.f32 f1450, f1142, f1382; +add.f32 f2373, f1143, f1384; +sub.f32 f1451, f1143, f1384; +add.f32 f1452, f1146, f1387; +sub.f32 f1454, f1146, f1387; +add.f32 f2372, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2371, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2370, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2369, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1473, f1469, f2383; +mul.f32 f2368, f1468, f1408; +sub.f32 f1474, f2368, f1473; +mul.f32 f1475, f1468, f2383; +fma.rn.f32 f1476, f1469, f1408, f1475; +mul.f32 f1478, f1469, f1469; +mul.f32 f2367, f1468, f1468; +sub.f32 f1479, f2367, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1483, f1481, f2382; +mul.f32 f2366, f1479, f1412; +sub.f32 f1484, f2366, f1483; +mul.f32 f1485, f1479, f2382; +fma.rn.f32 f1486, f1481, f1412, f1485; +mul.f32 f2364, f1468, f1479; +mul.f32 f2365, f1469, f1481; +sub.f32 f1489, f2364, f2365; +mul.f32 f1490, f1468, 
f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f2362, f1489, f1416; +mul.f32 f2363, f1491, f2381; +sub.f32 f1494, f2362, f2363; +mul.f32 f1495, f1489, f2381; +fma.rn.f32 f1496, f1491, f1416, f1495; +mul.f32 f2360, f1468, f1489; +mul.f32 f2361, f1469, f1491; +sub.f32 f1499, f2360, f2361; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f2358, f1499, f1420; +mul.f32 f2359, f1501, f2380; +sub.f32 f1504, f2358, f2359; +mul.f32 f1505, f1499, f2380; +fma.rn.f32 f1506, f1501, f1420, f1505; +mul.f32 f1508, f1469, f1501; +mul.f32 f2357, f1468, f1499; +sub.f32 f1509, f2357, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1513, f1511, f2379; +mul.f32 f2356, f1509, f1424; +sub.f32 f1514, f2356, f1513; +mul.f32 f1515, f1509, f2379; +fma.rn.f32 f1516, f1511, f1424, f1515; +mul.f32 f1518, f1469, f1511; +mul.f32 f2355, f1468, f1509; +sub.f32 f1519, f2355, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1523, f1521, f2378; +mul.f32 f2354, f1519, f1428; +sub.f32 f1524, f2354, f1523; +mul.f32 f1525, f1519, f2378; +fma.rn.f32 f1526, f1521, f1428, f1525; +mul.f32 f1528, f1469, f1521; +mul.f32 f2353, f1468, f1519; +sub.f32 f1529, f2353, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f2351, f1529, f1432; +mul.f32 f2352, f1531, f2377; +sub.f32 f1534, f2351, f2352; +mul.f32 f1535, f1529, f2377; +fma.rn.f32 f1536, f1531, f1432, f1535; +mul.f32 f2349, f1468, f1529; +mul.f32 f2350, f1469, f1531; +sub.f32 f1539, f2349, f2350; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f2347, f1539, f1436; +mul.f32 f2348, f1541, f2376; +sub.f32 f1544, f2347, f2348; +mul.f32 f1545, f1539, f2376; +fma.rn.f32 f1546, f1541, f1436, f1545; +mul.f32 f2345, f1468, f1539; +mul.f32 f2346, f1469, f1541; +sub.f32 f1549, f2345, f2346; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1553, f1551, f2375; 
+mul.f32 f2344, f1549, f1440; +sub.f32 f1554, f2344, f1553; +mul.f32 f1555, f1549, f2375; +fma.rn.f32 f1556, f1551, f1440, f1555; +mul.f32 f1558, f1469, f1551; +mul.f32 f2343, f1468, f1549; +sub.f32 f1559, f2343, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1563, f1561, f2374; +mul.f32 f2342, f1559, f1444; +sub.f32 f1564, f2342, f1563; +mul.f32 f1565, f1559, f2374; +fma.rn.f32 f1566, f1561, f1444, f1565; +mul.f32 f1568, f1469, f1561; +mul.f32 f2341, f1468, f1559; +sub.f32 f1569, f2341, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1573, f1571, f2373; +mul.f32 f2340, f1569, f1448; +sub.f32 f1574, f2340, f1573; +mul.f32 f1575, f1569, f2373; +fma.rn.f32 f1576, f1571, f1448, f1575; +mul.f32 f1578, f1469, f1571; +mul.f32 f2339, f1468, f1569; +sub.f32 f1579, f2339, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f2337, f1579, f1452; +mul.f32 f2338, f1581, f2372; +sub.f32 f1584, f2337, f2338; +mul.f32 f1585, f1579, f2372; +fma.rn.f32 f1586, f1581, f1452, f1585; +mul.f32 f2335, f1468, f1579; +mul.f32 f2336, f1469, f1581; +sub.f32 f1589, f2335, f2336; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f2333, f1589, f1456; +mul.f32 f2334, f1591, f2371; +sub.f32 f1594, f2333, f2334; +mul.f32 f1595, f1589, f2371; +fma.rn.f32 f1596, f1591, f1456, f1595; +mul.f32 f1598, f1469, f1591; +mul.f32 f2332, f1468, f1589; +sub.f32 f1599, f2332, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1603, f1601, f2370; +mul.f32 f2331, f1599, f1460; +sub.f32 f1604, f2331, f1603; +mul.f32 f1605, f1599, f2370; +fma.rn.f32 f1606, f1601, f1460, f1605; +mul.f32 f1608, f1469, f1601; +mul.f32 f2330, f1468, f1599; +sub.f32 f1609, f2330, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1613, f1611, f2369; +mul.f32 f2329, f1609, f1464; +sub.f32 f1614, f2329, f1613; +mul.f32 f1615, 
f1609, f2369; +fma.rn.f32 f1616, f1611, f1464, f1615; +mul.f32 f1618, f1469, f1611; +mul.f32 f2328, f1468, f1609; +sub.f32 f1619, f2328, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1623, f1621, f1407; +mul.f32 f2327, f1619, f1406; +sub.f32 f1624, f2327, f1623; +mul.f32 f1625, f1619, f1407; +fma.rn.f32 f1626, f1621, f1406, f1625; +mul.f32 f2325, f1468, f1619; +mul.f32 f2326, f1469, f1621; +sub.f32 f1629, f2325, f2326; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f2323, f1629, f1410; +mul.f32 f2324, f1631, f1411; +sub.f32 f1634, f2323, f2324; +mul.f32 f1635, f1629, f1411; +fma.rn.f32 f1636, f1631, f1410, f1635; +mul.f32 f2321, f1468, f1629; +mul.f32 f2322, f1469, f1631; +sub.f32 f1639, f2321, f2322; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f2319, f1639, f1414; +mul.f32 f2320, f1641, f1415; +sub.f32 f1644, f2319, f2320; +mul.f32 f1645, f1639, f1415; +fma.rn.f32 f1646, f1641, f1414, f1645; +mul.f32 f1648, f1469, f1641; +mul.f32 f2318, f1468, f1639; +sub.f32 f1649, f2318, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1653, f1651, f1419; +mul.f32 f2317, f1649, f1418; +sub.f32 f1654, f2317, f1653; +mul.f32 f1655, f1649, f1419; +fma.rn.f32 f1656, f1651, f1418, f1655; +mul.f32 f1658, f1469, f1651; +mul.f32 f2316, f1468, f1649; +sub.f32 f1659, f2316, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1663, f1661, f1423; +mul.f32 f2315, f1659, f1422; +sub.f32 f1664, f2315, f1663; +mul.f32 f1665, f1659, f1423; +fma.rn.f32 f1666, f1661, f1422, f1665; +mul.f32 f1668, f1469, f1661; +mul.f32 f2314, f1468, f1659; +sub.f32 f1669, f2314, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f2312, f1669, f1426; +mul.f32 f2313, f1671, f1427; +sub.f32 f1674, f2312, f2313; +mul.f32 f1675, f1669, f1427; +fma.rn.f32 f1676, f1671, f1426, f1675; +mul.f32 f2310, f1468, 
f1669; +mul.f32 f2311, f1469, f1671; +sub.f32 f1679, f2310, f2311; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f2308, f1679, f1430; +mul.f32 f2309, f1681, f1431; +sub.f32 f1684, f2308, f2309; +mul.f32 f1685, f1679, f1431; +fma.rn.f32 f1686, f1681, f1430, f1685; +mul.f32 f2306, f1468, f1679; +mul.f32 f2307, f1469, f1681; +sub.f32 f1689, f2306, f2307; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1693, f1691, f1435; +mul.f32 f2305, f1689, f1434; +sub.f32 f1694, f2305, f1693; +mul.f32 f1695, f1689, f1435; +fma.rn.f32 f1696, f1691, f1434, f1695; +mul.f32 f1698, f1469, f1691; +mul.f32 f2304, f1468, f1689; +sub.f32 f1699, f2304, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1703, f1701, f1439; +mul.f32 f2303, f1699, f1438; +sub.f32 f1704, f2303, f1703; +mul.f32 f1705, f1699, f1439; +fma.rn.f32 f1706, f1701, f1438, f1705; +mul.f32 f1708, f1469, f1701; +mul.f32 f2302, f1468, f1699; +sub.f32 f1709, f2302, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1713, f1711, f1443; +mul.f32 f2301, f1709, f1442; +sub.f32 f1714, f2301, f1713; +mul.f32 f1715, f1709, f1443; +fma.rn.f32 f1716, f1711, f1442, f1715; +mul.f32 f2299, f1468, f1709; +mul.f32 f2300, f1469, f1711; +sub.f32 f1719, f2299, f2300; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f2297, f1719, f1446; +mul.f32 f2298, f1721, f1447; +sub.f32 f1724, f2297, f2298; +mul.f32 f1725, f1719, f1447; +fma.rn.f32 f1726, f1721, f1446, f1725; +mul.f32 f2295, f1468, f1719; +mul.f32 f2296, f1469, f1721; +sub.f32 f1729, f2295, f2296; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f2293, f1729, f1450; +mul.f32 f2294, f1731, f1451; +sub.f32 f1734, f2293, f2294; +mul.f32 f1735, f1729, f1451; +fma.rn.f32 f1736, f1731, f1450, f1735; +mul.f32 f1738, f1469, f1731; +mul.f32 f2292, f1468, f1729; +sub.f32 f1739, f2292, f1738; +mul.f32 
f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1743, f1741, f1455; +mul.f32 f2291, f1739, f1454; +sub.f32 f1744, f2291, f1743; +mul.f32 f1745, f1739, f1455; +fma.rn.f32 f1746, f1741, f1454, f1745; +mul.f32 f1748, f1469, f1741; +mul.f32 f2290, f1468, f1739; +sub.f32 f1749, f2290, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1753, f1751, f1459; +mul.f32 f2289, f1749, f1458; +sub.f32 f1754, f2289, f1753; +mul.f32 f1755, f1749, f1459; +fma.rn.f32 f1756, f1751, f1458, f1755; +mul.f32 f1758, f1469, f1751; +mul.f32 f2288, f1468, f1749; +sub.f32 f1759, f2288, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f2286, f1759, f1462; +mul.f32 f2287, f1761, f1463; +sub.f32 f1764, f2286, f2287; +mul.f32 f1765, f1759, f1463; +fma.rn.f32 f1766, f1761, f1462, f1765; +mul.f32 f2284, f1468, f1759; +mul.f32 f2285, f1469, f1761; +sub.f32 f1769, f2284, f2285; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f2282, f1769, f1466; +mul.f32 f2283, f1771, f1467; +sub.f32 f1774, f2282, f2283; +mul.f32 f1775, f1769, f1467; +mov.u32 r33, %tid.x; +fma.rn.f32 f1776, f1771, f1466, f1775; +and.b32 r22, r33, 480; +shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 61440; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1474; +st.shared.f32 [r20+256], f1484; +st.shared.f32 [r20+384], f1494; +st.shared.f32 [r20+512], f1504; +st.shared.f32 [r20+640], f1514; +st.shared.f32 [r20+768], f1524; +st.shared.f32 [r20+896], f1534; +st.shared.f32 [r20+1024], f1544; +st.shared.f32 [r20+1152], f1554; +st.shared.f32 [r20+1280], f1564; +st.shared.f32 [r20+1408], f1574; +st.shared.f32 [r20+1536], f1584; +st.shared.f32 [r20+1664], f1594; +st.shared.f32 [r20+1792], f1604; +st.shared.f32 [r20+1920], f1614; +st.shared.f32 
[r20+2048], f1624; +st.shared.f32 [r20+2176], f1634; +st.shared.f32 [r20+2304], f1644; +st.shared.f32 [r20+2432], f1654; +st.shared.f32 [r20+2560], f1664; +st.shared.f32 [r20+2688], f1674; +st.shared.f32 [r20+2816], f1684; +st.shared.f32 [r20+2944], f1694; +st.shared.f32 [r20+3072], f1704; +st.shared.f32 [r20+3200], f1714; +st.shared.f32 [r20+3328], f1724; +st.shared.f32 [r20+3456], f1734; +st.shared.f32 [r20+3584], f1744; +st.shared.f32 [r20+3712], f1754; +st.shared.f32 [r20+3840], f1764; +st.shared.f32 [r20+3968], f1774; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+2048]; +ld.shared.f32 f1779, [r21+4096]; +ld.shared.f32 f1780, [r21+6144]; +ld.shared.f32 f1781, [r21+8192]; +ld.shared.f32 f1782, [r21+10240]; +ld.shared.f32 f1783, [r21+12288]; +ld.shared.f32 f1784, [r21+14336]; +ld.shared.f32 f1785, [r21+16384]; +ld.shared.f32 f1786, [r21+18432]; +ld.shared.f32 f1787, [r21+20480]; +ld.shared.f32 f1788, [r21+22528]; +ld.shared.f32 f1789, [r21+24576]; +ld.shared.f32 f1790, [r21+26624]; +ld.shared.f32 f1791, [r21+28672]; +ld.shared.f32 f1792, [r21+30720]; +ld.shared.f32 f1793, [r21+32768]; +ld.shared.f32 f1794, [r21+34816]; +ld.shared.f32 f1795, [r21+36864]; +ld.shared.f32 f1796, [r21+38912]; +ld.shared.f32 f1797, [r21+40960]; +ld.shared.f32 f1798, [r21+43008]; +ld.shared.f32 f1799, [r21+45056]; +ld.shared.f32 f1800, [r21+47104]; +ld.shared.f32 f1801, [r21+49152]; +ld.shared.f32 f1802, [r21+51200]; +ld.shared.f32 f1803, [r21+53248]; +ld.shared.f32 f1804, [r21+55296]; +ld.shared.f32 f1805, [r21+57344]; +ld.shared.f32 f1806, [r21+59392]; +ld.shared.f32 f1807, [r21+61440]; +ld.shared.f32 f1808, [r21+63488]; +barrier.sync 0; +st.shared.f32 [r20], f2384; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; 
+st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+2048]; +ld.shared.f32 f1811, [r21+4096]; +ld.shared.f32 f1812, [r21+6144]; +ld.shared.f32 f1813, [r21+8192]; +ld.shared.f32 f1814, [r21+10240]; +ld.shared.f32 f1815, [r21+12288]; +ld.shared.f32 f1816, [r21+14336]; +ld.shared.f32 f1817, [r21+16384]; +ld.shared.f32 f1818, [r21+18432]; +ld.shared.f32 f1819, [r21+20480]; +ld.shared.f32 f1820, [r21+22528]; +ld.shared.f32 f1821, [r21+24576]; +ld.shared.f32 f1822, [r21+26624]; +ld.shared.f32 f1823, [r21+28672]; +ld.shared.f32 f1824, [r21+30720]; +ld.shared.f32 f1825, [r21+32768]; +ld.shared.f32 f1826, [r21+34816]; +ld.shared.f32 f1827, [r21+36864]; +ld.shared.f32 f1828, [r21+38912]; +ld.shared.f32 f1829, [r21+40960]; +ld.shared.f32 f1830, [r21+43008]; +ld.shared.f32 f1831, [r21+45056]; +ld.shared.f32 f1832, [r21+47104]; +ld.shared.f32 f1833, [r21+49152]; +ld.shared.f32 f1834, [r21+51200]; +ld.shared.f32 f1835, [r21+53248]; +ld.shared.f32 f1836, [r21+55296]; +ld.shared.f32 f1837, [r21+57344]; +ld.shared.f32 f1838, [r21+59392]; +ld.shared.f32 f1839, [r21+61440]; +ld.shared.f32 f1840, [r21+63488]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2281, f1809, f1825; 
+sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2280, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2279, f2281, f2280; +sub.f32 f1852, f2281, f2280; +add.f32 f1853, f1843, f1848; +sub.f32 f1855, f1843, f1848; +sub.f32 f2278, f1844, f1847; +add.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2277, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2276, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2275, f2277, f2276; +sub.f32 f1868, f2277, f2276; +add.f32 f1869, f1859, f1864; +sub.f32 f1871, f1859, f1864; +sub.f32 f2274, f1860, f1863; +add.f32 f1872, f1860, f1863; +mul.f32 f2272, f1869, 0f3F3504F3; +mul.f32 f2273, f2274, 0fBF3504F3; +sub.f32 f1875, f2272, f2273; +mul.f32 f1876, f2274, 0f3F3504F3; +fma.rn.f32 f1877, f1869, 0fBF3504F3, f1876; +mul.f32 f1878, f1871, 0fBF3504F3; +mul.f32 f1879, f1872, 0fBF3504F3; +sub.f32 f1880, f1878, f1879; +add.f32 f1881, f1878, f1879; +add.f32 f1882, f1849, f1865; +sub.f32 f1884, f1849, f1865; +add.f32 f2271, f2279, f2275; +sub.f32 f1885, f2279, f2275; +add.f32 f1886, f1853, f1875; +sub.f32 f1888, f1853, f1875; +add.f32 f2270, f2278, f1877; +sub.f32 f1889, f2278, f1877; +add.f32 f1890, f1851, f1868; +sub.f32 f1892, f1851, f1868; +sub.f32 f2269, f1852, f1867; +add.f32 f1893, f1852, f1867; +add.f32 f1894, f1855, f1880; +sub.f32 f1896, f1855, f1880; +add.f32 f2268, f1856, f1881; +sub.f32 f1897, f1856, f1881; +add.f32 f1898, f1779, f1795; +sub.f32 f1900, f1779, f1795; +add.f32 f2267, f1811, f1827; +sub.f32 f1901, f1811, f1827; +add.f32 f1902, f1787, f1803; +sub.f32 f1904, f1787, f1803; +add.f32 f2266, f1819, f1835; +sub.f32 f1905, f1819, f1835; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2265, f2267, f2266; +sub.f32 
f1909, f2267, f2266; +add.f32 f1910, f1900, f1905; +sub.f32 f1912, f1900, f1905; +sub.f32 f2264, f1901, f1904; +add.f32 f1913, f1901, f1904; +add.f32 f1914, f1783, f1799; +sub.f32 f1916, f1783, f1799; +add.f32 f2263, f1815, f1831; +sub.f32 f1917, f1815, f1831; +add.f32 f1918, f1791, f1807; +sub.f32 f1920, f1791, f1807; +add.f32 f2262, f1823, f1839; +sub.f32 f1921, f1823, f1839; +add.f32 f1922, f1914, f1918; +sub.f32 f1924, f1914, f1918; +add.f32 f2261, f2263, f2262; +sub.f32 f1925, f2263, f2262; +add.f32 f1926, f1916, f1921; +sub.f32 f1928, f1916, f1921; +sub.f32 f2260, f1917, f1920; +add.f32 f1929, f1917, f1920; +mul.f32 f1931, f2260, 0fBF3504F3; +mul.f32 f2259, f1926, 0f3F3504F3; +sub.f32 f1932, f2259, f1931; +mul.f32 f1933, f2260, 0f3F3504F3; +fma.rn.f32 f1934, f1926, 0fBF3504F3, f1933; +mul.f32 f1935, f1928, 0fBF3504F3; +mul.f32 f1936, f1929, 0fBF3504F3; +sub.f32 f1937, f1935, f1936; +add.f32 f1938, f1935, f1936; +add.f32 f1939, f1906, f1922; +sub.f32 f1941, f1906, f1922; +add.f32 f2258, f2265, f2261; +sub.f32 f1942, f2265, f2261; +add.f32 f1943, f1910, f1932; +sub.f32 f1945, f1910, f1932; +add.f32 f2257, f2264, f1934; +sub.f32 f1946, f2264, f1934; +add.f32 f1947, f1908, f1925; +sub.f32 f1949, f1908, f1925; +sub.f32 f2256, f1909, f1924; +add.f32 f1950, f1909, f1924; +add.f32 f1951, f1912, f1937; +sub.f32 f1953, f1912, f1937; +add.f32 f2255, f1913, f1938; +sub.f32 f1954, f1913, f1938; +mul.f32 f1956, f2257, 0fBEC3EF15; +mul.f32 f2254, f1943, 0f3F6C835E; +sub.f32 f1957, f2254, f1956; +mul.f32 f1958, f2257, 0f3F6C835E; +fma.rn.f32 f1959, f1943, 0fBEC3EF15, f1958; +mul.f32 f2252, f1947, 0f3F3504F3; +mul.f32 f2253, f2256, 0fBF3504F3; +sub.f32 f1962, f2252, f2253; +mul.f32 f1963, f2256, 0f3F3504F3; +fma.rn.f32 f1964, f1947, 0fBF3504F3, f1963; +mul.f32 f2250, f1951, 0f3EC3EF15; +mul.f32 f2251, f2255, 0fBF6C835E; +sub.f32 f1967, f2250, f2251; +mul.f32 f1968, f2255, 0f3EC3EF15; +fma.rn.f32 f1969, f1951, 0fBF6C835E, f1968; +mul.f32 f2248, f1945, 0fBEC3EF15; +mul.f32 
f2249, f1946, 0fBF6C835E; +sub.f32 f1972, f2248, f2249; +mul.f32 f1973, f1946, 0fBEC3EF15; +fma.rn.f32 f1974, f1945, 0fBF6C835E, f1973; +mul.f32 f1975, f1949, 0fBF3504F3; +mul.f32 f1976, f1950, 0fBF3504F3; +sub.f32 f1977, f1975, f1976; +add.f32 f1978, f1975, f1976; +mul.f32 f2246, f1953, 0fBF6C835E; +mul.f32 f2247, f1954, 0fBEC3EF15; +sub.f32 f1981, f2246, f2247; +mul.f32 f1982, f1954, 0fBF6C835E; +fma.rn.f32 f1983, f1953, 0fBEC3EF15, f1982; +add.f32 f1984, f1778, f1794; +sub.f32 f1986, f1778, f1794; +add.f32 f2245, f1810, f1826; +sub.f32 f1987, f1810, f1826; +add.f32 f1988, f1786, f1802; +sub.f32 f1990, f1786, f1802; +add.f32 f2244, f1818, f1834; +sub.f32 f1991, f1818, f1834; +add.f32 f1992, f1984, f1988; +sub.f32 f1994, f1984, f1988; +add.f32 f2243, f2245, f2244; +sub.f32 f1995, f2245, f2244; +add.f32 f1996, f1986, f1991; +sub.f32 f1998, f1986, f1991; +sub.f32 f2242, f1987, f1990; +add.f32 f1999, f1987, f1990; +add.f32 f2000, f1782, f1798; +sub.f32 f2002, f1782, f1798; +add.f32 f2241, f1814, f1830; +sub.f32 f2003, f1814, f1830; +add.f32 f2004, f1790, f1806; +sub.f32 f2006, f1790, f1806; +add.f32 f2240, f1822, f1838; +sub.f32 f2007, f1822, f1838; +add.f32 f2008, f2000, f2004; +sub.f32 f2010, f2000, f2004; +add.f32 f2239, f2241, f2240; +sub.f32 f2011, f2241, f2240; +add.f32 f2012, f2002, f2007; +sub.f32 f2014, f2002, f2007; +sub.f32 f2238, f2003, f2006; +add.f32 f2015, f2003, f2006; +mul.f32 f2017, f2238, 0fBF3504F3; +mul.f32 f2237, f2012, 0f3F3504F3; +sub.f32 f2018, f2237, f2017; +mul.f32 f2019, f2238, 0f3F3504F3; +fma.rn.f32 f2020, f2012, 0fBF3504F3, f2019; +mul.f32 f2021, f2014, 0fBF3504F3; +mul.f32 f2022, f2015, 0fBF3504F3; +sub.f32 f2023, f2021, f2022; +add.f32 f2024, f2021, f2022; +add.f32 f2025, f1992, f2008; +sub.f32 f2027, f1992, f2008; +add.f32 f2236, f2243, f2239; +sub.f32 f2028, f2243, f2239; +add.f32 f2029, f1996, f2018; +sub.f32 f2031, f1996, f2018; +add.f32 f2235, f2242, f2020; +sub.f32 f2032, f2242, f2020; +add.f32 f2033, f1994, f2011; +sub.f32 
f2035, f1994, f2011; +sub.f32 f2234, f1995, f2010; +add.f32 f2036, f1995, f2010; +add.f32 f2037, f1998, f2023; +sub.f32 f2039, f1998, f2023; +add.f32 f2233, f1999, f2024; +sub.f32 f2040, f1999, f2024; +add.f32 f2041, f1780, f1796; +sub.f32 f2043, f1780, f1796; +add.f32 f2232, f1812, f1828; +sub.f32 f2044, f1812, f1828; +add.f32 f2045, f1788, f1804; +sub.f32 f2047, f1788, f1804; +add.f32 f2231, f1820, f1836; +sub.f32 f2048, f1820, f1836; +add.f32 f2049, f2041, f2045; +sub.f32 f2051, f2041, f2045; +add.f32 f2230, f2232, f2231; +sub.f32 f2052, f2232, f2231; +add.f32 f2053, f2043, f2048; +sub.f32 f2055, f2043, f2048; +sub.f32 f2229, f2044, f2047; +add.f32 f2056, f2044, f2047; +add.f32 f2057, f1784, f1800; +sub.f32 f2059, f1784, f1800; +add.f32 f2228, f1816, f1832; +sub.f32 f2060, f1816, f1832; +add.f32 f2061, f1792, f1808; +sub.f32 f2063, f1792, f1808; +add.f32 f2227, f1824, f1840; +sub.f32 f2064, f1824, f1840; +add.f32 f2065, f2057, f2061; +sub.f32 f2067, f2057, f2061; +add.f32 f2226, f2228, f2227; +sub.f32 f2068, f2228, f2227; +add.f32 f2069, f2059, f2064; +sub.f32 f2071, f2059, f2064; +sub.f32 f2225, f2060, f2063; +add.f32 f2072, f2060, f2063; +mul.f32 f2074, f2225, 0fBF3504F3; +mul.f32 f2224, f2069, 0f3F3504F3; +sub.f32 f2075, f2224, f2074; +mul.f32 f2076, f2225, 0f3F3504F3; +fma.rn.f32 f2077, f2069, 0fBF3504F3, f2076; +mul.f32 f2078, f2071, 0fBF3504F3; +mul.f32 f2079, f2072, 0fBF3504F3; +sub.f32 f2080, f2078, f2079; +add.f32 f2081, f2078, f2079; +add.f32 f2082, f2049, f2065; +sub.f32 f2084, f2049, f2065; +add.f32 f2223, f2230, f2226; +sub.f32 f2085, f2230, f2226; +add.f32 f2086, f2053, f2075; +sub.f32 f2088, f2053, f2075; +add.f32 f2222, f2229, f2077; +sub.f32 f2089, f2229, f2077; +add.f32 f2090, f2051, f2068; +sub.f32 f2092, f2051, f2068; +sub.f32 f2221, f2052, f2067; +add.f32 f2093, f2052, f2067; +add.f32 f2094, f2055, f2080; +sub.f32 f2096, f2055, f2080; +add.f32 f2220, f2056, f2081; +sub.f32 f2097, f2056, f2081; +mul.f32 f2218, f2086, 0f3F6C835E; +mul.f32 
f2219, f2222, 0fBEC3EF15; +sub.f32 f2100, f2218, f2219; +mul.f32 f2101, f2222, 0f3F6C835E; +fma.rn.f32 f2102, f2086, 0fBEC3EF15, f2101; +mul.f32 f2104, f2221, 0fBF3504F3; +mul.f32 f2217, f2090, 0f3F3504F3; +sub.f32 f2105, f2217, f2104; +mul.f32 f2106, f2221, 0f3F3504F3; +fma.rn.f32 f2107, f2090, 0fBF3504F3, f2106; +mul.f32 f2109, f2220, 0fBF6C835E; +mul.f32 f2216, f2094, 0f3EC3EF15; +sub.f32 f2110, f2216, f2109; +mul.f32 f2111, f2220, 0f3EC3EF15; +fma.rn.f32 f2112, f2094, 0fBF6C835E, f2111; +mul.f32 f2114, f2089, 0fBF6C835E; +mul.f32 f2215, f2088, 0fBEC3EF15; +sub.f32 f2115, f2215, f2114; +mul.f32 f2116, f2089, 0fBEC3EF15; +fma.rn.f32 f2117, f2088, 0fBF6C835E, f2116; +mul.f32 f2118, f2092, 0fBF3504F3; +mul.f32 f2119, f2093, 0fBF3504F3; +sub.f32 f2120, f2118, f2119; +add.f32 f2121, f2118, f2119; +mul.f32 f2123, f2097, 0fBEC3EF15; +mul.f32 f2214, f2096, 0fBF6C835E; +sub.f32 f2124, f2214, f2123; +mul.f32 f2125, f2097, 0fBF6C835E; +fma.rn.f32 f2126, f2096, 0fBEC3EF15, f2125; +add.f32 %1, f2271, f2258; +add.f32 %0, f1882, f1939; +add.f32 %2, f2025, f2082; +add.f32 %3, f2236, f2223; +add.f32 %5, f2270, f1959; +add.f32 %4, f1886, f1957; +add.f32 %7, f2235, f2102; +add.f32 %6, f2029, f2100; +add.f32 %8, f1890, f1962; +add.f32 %9, f2269, f1964; +add.f32 %10, f2033, f2105; +add.f32 %11, f2234, f2107; +add.f32 %12, f1894, f1967; +add.f32 %13, f2268, f1969; +add.f32 %14, f2037, f2110; +add.f32 %15, f2233, f2112; +sub.f32 %17, f1885, f1941; +add.f32 %16, f1884, f1942; +sub.f32 %19, f2028, f2084; +add.f32 %18, f2027, f2085; +add.f32 %20, f1888, f1972; +add.f32 %21, f1889, f1974; +add.f32 %22, f2031, f2115; +add.f32 %23, f2032, f2117; +add.f32 %24, f1892, f1977; +add.f32 %25, f1893, f1978; +add.f32 %26, f2035, f2120; +add.f32 %27, f2036, f2121; +add.f32 %29, f1897, f1983; +add.f32 %28, f1896, f1981; +add.f32 %31, f2040, f2126; +add.f32 %30, f2039, f2124; +sub.f32 %32, f1882, f1939; +sub.f32 %33, f2271, f2258; +sub.f32 %34, f2025, f2082; +sub.f32 %35, f2236, f2223; +sub.f32 %37, 
f2270, f1959; +sub.f32 %36, f1886, f1957; +sub.f32 %39, f2235, f2102; +sub.f32 %38, f2029, f2100; +sub.f32 %41, f2269, f1964; +sub.f32 %40, f1890, f1962; +sub.f32 %43, f2234, f2107; +sub.f32 %42, f2033, f2105; +sub.f32 %45, f2268, f1969; +sub.f32 %44, f1894, f1967; +sub.f32 %47, f2233, f2112; +sub.f32 %46, f2037, f2110; +add.f32 %49, f1885, f1941; +sub.f32 %48, f1884, f1942; +add.f32 %51, f2028, f2084; +sub.f32 %50, f2027, f2085; +sub.f32 %53, f1889, f1974; +sub.f32 %52, f1888, f1972; +sub.f32 %55, f2032, f2117; +sub.f32 %54, f2031, f2115; +sub.f32 %57, f1893, f1978; +sub.f32 %56, f1892, f1977; +sub.f32 %59, f2036, f2121; +sub.f32 %58, f2035, f2120; +sub.f32 %61, f1897, f1983; +sub.f32 %60, f1896, f1981; +sub.f32 %63, f2040, f2126; +sub.f32 %62, f2039, f2124; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_16384), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), 
"f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..27e38536d1703 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp32_inv.hpp.inc @@ -0,0 +1,6737 @@ +#ifndef CUFFTDX_FFT_16384_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_16384_FP32_INV_PTX_HPP + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<1156, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2818>; +.reg .b32 r<40>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2810, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2808, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2807, f2810, f2808; +sub.f32 f140, f2810, f2808; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2806, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2803, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2801, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2800, f2803, f2801; +sub.f32 f156, f2803, f2801; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2799, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2799, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2797, f159, 0fBF3504F3; +mul.f32 f2798, f160, 0f3F3504F3; +sub.f32 f167, f2797, f2798; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2796, f2807, f2800; +sub.f32 f173, f2807, f2800; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2795, f2806, f164; +sub.f32 f177, f2806, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2794, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2793, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2791, %136, %104; +sub.f32 f189, %136, %104; +add.f32 
f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2788, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2787, f2791, f2788; +sub.f32 f197, f2791, f2788; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2786, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2784, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2782, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2781, f2784, f2782; +sub.f32 f213, f2784, f2782; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2780, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2780, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2778, f216, 0fBF3504F3; +mul.f32 f2779, f217, 0f3F3504F3; +sub.f32 f224, f2778, f2779; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2777, f2787, f2781; +sub.f32 f230, f2787, f2781; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2776, f2786, f221; +sub.f32 f234, f2786, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2775, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2774, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2772, f231, 0f3F6C835E; +mul.f32 f2773, f2776, 0f3EC3EF15; +sub.f32 f245, f2772, f2773; +mul.f32 f246, f2776, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2775, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2774, 0f3F6C835E; +mul.f32 f2771, f239, 0f3EC3EF15; +sub.f32 f254, f2771, f253; +mul.f32 f255, f2774, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 
f2770, f233, 0fBEC3EF15; +sub.f32 f259, f2770, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2768, f237, 0fBF3504F3; +mul.f32 f2769, f238, 0f3F3504F3; +sub.f32 f264, f2768, f2769; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2766, f241, 0fBF6C835E; +mul.f32 f2767, f242, 0f3EC3EF15; +sub.f32 f269, f2766, f2767; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2765, f2796, f2777; +sub.f32 f275, f2796, f2777; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2764, f2795, f247; +sub.f32 f279, f2795, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2763, f2794, f251; +sub.f32 f283, f2794, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2762, f2793, f256; +sub.f32 f287, f2793, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2761, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2760, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2759, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2758, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2755, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2753, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2752, f2755, f2753; +sub.f32 f315, f2755, f2753; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2751, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2749, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2746, %146, %145; +sub.f32 f327, %146, 
%145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2745, f2749, f2746; +sub.f32 f331, f2749, f2746; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2744, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2744, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2743, f334, 0fBF3504F3; +sub.f32 f342, f2743, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2742, f2752, f2745; +sub.f32 f348, f2752, f2745; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2741, f2751, f339; +sub.f32 f352, f2751, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2740, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2739, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2737, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2735, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2734, f2737, f2735; +sub.f32 f372, f2737, f2735; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2733, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2730, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2729, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2728, f2730, f2729; +sub.f32 f388, f2730, f2729; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2727, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2727, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2725, f391, 0fBF3504F3; +mul.f32 f2726, 
f392, 0f3F3504F3; +sub.f32 f399, f2725, f2726; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2724, f2734, f2728; +sub.f32 f405, f2734, f2728; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2723, f2733, f396; +sub.f32 f409, f2733, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2722, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2721, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2723, 0f3EC3EF15; +mul.f32 f2720, f406, 0f3F6C835E; +sub.f32 f420, f2720, f419; +mul.f32 f421, f2723, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2722, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2721, 0f3F6C835E; +mul.f32 f2719, f414, 0f3EC3EF15; +sub.f32 f429, f2719, f428; +mul.f32 f430, f2721, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2718, f408, 0fBEC3EF15; +sub.f32 f434, f2718, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2717, f412, 0fBF3504F3; +sub.f32 f439, f2717, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2716, f416, 0fBF6C835E; +sub.f32 f444, f2716, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2715, f2742, f2724; +sub.f32 f450, f2742, f2724; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2714, f2741, f422; +sub.f32 f454, f2741, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2713, f2740, f426; +sub.f32 f458, f2740, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2712, f2739, f431; +sub.f32 f462, f2739, f431; +sub.f32 f463, 
f347, f405; +add.f32 f465, f347, f405; +add.f32 f2711, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2710, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2709, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2708, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2714, 0f3E47C5C2; +mul.f32 f2707, f451, 0f3F7B14BE; +sub.f32 f481, f2707, f480; +mul.f32 f482, f2714, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2713, 0f3EC3EF15; +mul.f32 f2706, f455, 0f3F6C835E; +sub.f32 f486, f2706, f485; +mul.f32 f487, f2713, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2712, 0f3F0E39DA; +mul.f32 f2705, f459, 0f3F54DB31; +sub.f32 f491, f2705, f490; +mul.f32 f492, f2712, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2711, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2710, 0f3F54DB31; +mul.f32 f2704, f467, 0f3F0E39DA; +sub.f32 f500, f2704, f499; +mul.f32 f501, f2710, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2709, 0f3F6C835E; +mul.f32 f2703, f471, 0f3EC3EF15; +sub.f32 f505, f2703, f504; +mul.f32 f506, f2709, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2708, 0f3F7B14BE; +mul.f32 f2702, f475, 0f3E47C5C2; +sub.f32 f510, f2702, f509; +mul.f32 f511, f2708, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2701, f453, 0fBE47C5C2; +sub.f32 f515, f2701, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2699, f457, 0fBEC3EF15; +mul.f32 f2700, f458, 0f3F6C835E; +sub.f32 f520, f2699, f2700; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2697, f461, 0fBF0E39DA; +mul.f32 f2698, f462, 0f3F54DB31; +sub.f32 f525, 
f2697, f2698; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2695, f465, 0fBF3504F3; +mul.f32 f2696, f466, 0f3F3504F3; +sub.f32 f530, f2695, f2696; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2693, f469, 0fBF54DB31; +mul.f32 f2694, f470, 0f3F0E39DA; +sub.f32 f535, f2693, f2694; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2692, f473, 0fBF6C835E; +sub.f32 f540, f2692, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2691, f477, 0fBF7B14BE; +sub.f32 f545, f2691, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2690, f2764, f483; +sub.f32 f553, f2764, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2689, f2763, f488; +sub.f32 f557, f2763, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2688, f2762, f493; +sub.f32 f561, f2762, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2687, f2761, f497; +sub.f32 f565, f2761, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f2686, f2760, f502; +sub.f32 f569, f2760, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f2685, f2759, f507; +sub.f32 f573, f2759, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f2684, f2758, f512; +sub.f32 f577, f2758, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f2683, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f2682, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f2681, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f2680, f287, f527; +sub.f32 f593, f287, f527; +add.f32 
f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f2679, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2678, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2677, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2676, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f2690, f611; +mul.f32 f616, f610, f2690; +mul.f32 f618, f611, f611; +mul.f32 f2675, f610, f610; +sub.f32 f619, f2675, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f2689, f621; +mul.f32 f624, f619, f2689; +mul.f32 f626, f611, f621; +mul.f32 f2674, f610, f619; +sub.f32 f627, f2674, f626; +mul.f32 f2673, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f2688, f629; +mul.f32 f632, f627, f2688; +mul.f32 f2671, f610, f627; +mul.f32 f2672, f611, f629; +sub.f32 f635, f2671, f2672; +mul.f32 f2670, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f2687, f637; +mul.f32 f640, f635, f2687; +mul.f32 f642, f611, f637; +mul.f32 f2669, f610, f635; +sub.f32 f643, f2669, f642; +mul.f32 f2668, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f2686, f645; +mul.f32 f648, f643, f2686; +mul.f32 f2666, f610, f643; +mul.f32 f2667, f611, f645; +sub.f32 f651, f2666, f2667; +mul.f32 f2665, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f2685, f653; +mul.f32 f656, f651, f2685; +mul.f32 f658, f611, f653; +mul.f32 f2664, f610, f651; +sub.f32 f659, f2664, f658; +mul.f32 f2663, f570, f653; +mul.f32 f660, f610, f653; 
+fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f2684, f661; +mul.f32 f664, f659, f2684; +mul.f32 f666, f611, f661; +mul.f32 f2662, f610, f659; +sub.f32 f667, f2662, f666; +mul.f32 f2661, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f2683, f669; +mul.f32 f672, f667, f2683; +mul.f32 f2659, f610, f667; +mul.f32 f2660, f611, f669; +sub.f32 f675, f2659, f2660; +mul.f32 f2658, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f2682, f677; +mul.f32 f680, f675, f2682; +mul.f32 f682, f611, f677; +mul.f32 f2657, f610, f675; +sub.f32 f683, f2657, f682; +mul.f32 f2656, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f2681, f685; +mul.f32 f688, f683, f2681; +mul.f32 f690, f611, f685; +mul.f32 f2655, f610, f683; +sub.f32 f691, f2655, f690; +mul.f32 f2654, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f2680, f693; +mul.f32 f696, f691, f2680; +mul.f32 f2652, f610, f691; +mul.f32 f2653, f611, f693; +sub.f32 f699, f2652, f2653; +mul.f32 f2651, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f2679, f701; +mul.f32 f704, f699, f2679; +mul.f32 f706, f611, f701; +mul.f32 f2650, f610, f699; +sub.f32 f707, f2650, f706; +mul.f32 f2649, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f2678, f709; +mul.f32 f712, f707, f2678; +mul.f32 f2647, f610, f707; +mul.f32 f2648, f611, f709; +sub.f32 f715, f2647, f2648; +mul.f32 f2646, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f2677, f717; +mul.f32 f720, f715, f2677; +mul.f32 f722, f611, f717; +mul.f32 f2645, f610, f715; +sub.f32 f723, f2645, f722; +mul.f32 f2644, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f2676, f725; +mul.f32 f728, f723, f2676; +mul.f32 f730, f611, f725; +mul.f32 f2643, f610, f723; 
+sub.f32 f731, f2643, f730; +mul.f32 f2642, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2641, f2765, f2715; +mul.f32 f734, f2641, f733; +mul.f32 f736, f731, f2641; +mul.f32 f2639, f610, f731; +mul.f32 f2640, f611, f733; +sub.f32 f739, f2639, f2640; +sub.f32 f2638, f272, f447; +mul.f32 f2637, f2638, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2636, f610, f739; +sub.f32 f747, f2636, f746; +mul.f32 f2635, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2634, f610, f747; +sub.f32 f755, f2634, f754; +mul.f32 f2633, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f2631, f610, f755; +mul.f32 f2632, f611, f757; +sub.f32 f763, f2631, f2632; +mul.f32 f2630, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2629, f610, f763; +sub.f32 f771, f2629, f770; +mul.f32 f2628, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f2626, f610, f771; +mul.f32 f2627, f611, f773; +sub.f32 f779, f2626, f2627; +mul.f32 f2625, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2624, f610, f779; +sub.f32 f787, f2624, f786; +mul.f32 f2623, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2622, f610, f787; +sub.f32 f795, f2622, f794; +mul.f32 f2621, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, 
f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f2619, f610, f795; +mul.f32 f2620, f611, f797; +sub.f32 f803, f2619, f2620; +mul.f32 f2618, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2617, f610, f803; +sub.f32 f811, f2617, f810; +mul.f32 f2616, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2615, f610, f811; +sub.f32 f819, f2615, f818; +mul.f32 f2614, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f2612, f610, f819; +mul.f32 f2613, f611, f821; +sub.f32 f827, f2612, f2613; +mul.f32 f2611, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2610, f610, f827; +sub.f32 f835, f2610, f834; +mul.f32 f2609, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f2607, f610, f835; +mul.f32 f2608, f611, f837; +sub.f32 f843, f2607, f2608; +mul.f32 f2606, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2605, f610, f843; +sub.f32 f851, f2605, f850; +mul.f32 f2604, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f2603, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 130816; +add.s32 r12, r9, r11; +add.f32 f857, f2765, f2715; +sub.f32 f2814, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r39, %tid.x; +shl.b32 r35, r39, 3; +shl.b32 r27, r39, 8; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, 
f2603; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f2673; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f2670; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f2668; +sub.f32 f867, f648, f2665; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f2663; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f2661; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f2658; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f2656; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f2654; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f2651; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f2649; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f2646; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f2644; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f2642; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f2814, f734; +sub.f32 f890, f736, f2637; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f2635; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f2633; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f2630; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f2628; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f2625; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f2623; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, 
f792, f2621; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f2618; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f2616; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f2614; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f2611; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f2609; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f2606; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f2604; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +and.b32 r21, r39, 511; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+4096]; +ld.shared.v2.f32 {f929, f930}, [r13+8192]; +ld.shared.v2.f32 {f933, f934}, [r13+12288]; +ld.shared.v2.f32 {f937, f938}, [r13+16384]; +ld.shared.v2.f32 {f941, f942}, [r13+20480]; +ld.shared.v2.f32 {f945, f946}, [r13+24576]; +ld.shared.v2.f32 {f949, f950}, [r13+28672]; +ld.shared.v2.f32 {f953, f954}, [r13+32768]; +ld.shared.v2.f32 {f957, f958}, [r13+36864]; +ld.shared.v2.f32 {f961, f962}, [r13+40960]; +ld.shared.v2.f32 {f965, f966}, [r13+45056]; +ld.shared.v2.f32 {f969, f970}, [r13+49152]; +ld.shared.v2.f32 {f973, f974}, [r13+53248]; +ld.shared.v2.f32 {f977, f978}, [r13+57344]; +ld.shared.v2.f32 {f981, f982}, [r13+61440]; +ld.shared.v2.f32 {f985, f986}, [r13+65536]; +ld.shared.v2.f32 {f989, f990}, [r13+69632]; +ld.shared.v2.f32 {f993, f994}, [r13+73728]; +ld.shared.v2.f32 {f997, f998}, [r13+77824]; +ld.shared.v2.f32 {f1001, f1002}, [r13+81920]; +ld.shared.v2.f32 {f1005, f1006}, [r13+86016]; +ld.shared.v2.f32 {f1009, f1010}, [r13+90112]; +ld.shared.v2.f32 {f1013, f1014}, [r13+94208]; +ld.shared.v2.f32 {f1017, f1018}, [r13+98304]; 
+ld.shared.v2.f32 {f1021, f1022}, [r13+102400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+106496]; +ld.shared.v2.f32 {f1029, f1030}, [r13+110592]; +ld.shared.v2.f32 {f1033, f1034}, [r13+114688]; +ld.shared.v2.f32 {f1037, f1038}, [r13+118784]; +ld.shared.v2.f32 {f1041, f1042}, [r13+122880]; +ld.shared.v2.f32 {f1045, f1046}, [r13+126976]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2602, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2601, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2600, f2602, f2601; +sub.f32 f1060, f2602, f2601; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f2599, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2598, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2597, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2596, f2598, f2597; +sub.f32 f1076, f2598, f2597; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f2595, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f2595, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f2593, f1079, 0fBF3504F3; +mul.f32 f2594, f1080, 0f3F3504F3; +sub.f32 f1087, f2593, f2594; +mul.f32 f1088, f1080, 0fBF3504F3; +fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2592, f2600, f2596; +sub.f32 f1093, f2600, f2596; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2591, f2599, f1084; +sub.f32 f1097, f2599, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f2590, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, 
f1087; +add.f32 f2589, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2588, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2587, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2586, f2588, f2587; +sub.f32 f1117, f2588, f2587; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f2585, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2584, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2583, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2582, f2584, f2583; +sub.f32 f1133, f2584, f2583; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f2581, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f2581, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f2579, f1136, 0fBF3504F3; +mul.f32 f2580, f1137, 0f3F3504F3; +sub.f32 f1144, f2579, f2580; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2578, f2586, f2582; +sub.f32 f1150, f2586, f2582; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2577, f2585, f1141; +sub.f32 f1154, f2585, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f2576, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f2575, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2573, f1151, 0f3F6C835E; +mul.f32 f2574, f2577, 0f3EC3EF15; +sub.f32 f1165, f2573, f2574; +mul.f32 f1166, f2577, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 
0f3F3504F3; +mul.f32 f1169, f2576, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f2571, f1159, 0f3EC3EF15; +mul.f32 f2572, f2575, 0f3F6C835E; +sub.f32 f1174, f2571, f2572; +mul.f32 f1175, f2575, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f2569, f1153, 0fBEC3EF15; +mul.f32 f2570, f1154, 0f3F6C835E; +sub.f32 f1179, f2569, f2570; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f2567, f1157, 0fBF3504F3; +mul.f32 f2568, f1158, 0f3F3504F3; +sub.f32 f1184, f2567, f2568; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f2565, f1161, 0fBF6C835E; +mul.f32 f2566, f1162, 0f3EC3EF15; +sub.f32 f1189, f2565, f2566; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2564, f2592, f2578; +sub.f32 f1195, f2592, f2578; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2563, f2591, f1167; +sub.f32 f1199, f2591, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2562, f2590, f1171; +sub.f32 f1203, f2590, f1171; +add.f32 f1204, f1102, f1174; +sub.f32 f1206, f1102, f1174; +add.f32 f2561, f2589, f1176; +sub.f32 f1207, f2589, f1176; +sub.f32 f1208, f1092, f1150; +add.f32 f1210, f1092, f1150; +add.f32 f2560, f1093, f1149; +sub.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1179; +sub.f32 f1214, f1096, f1179; +add.f32 f2559, f1097, f1181; +sub.f32 f1215, f1097, f1181; +add.f32 f1216, f1100, f1184; +sub.f32 f1218, f1100, f1184; +add.f32 f2558, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2557, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2556, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2555, f958, f1022; +sub.f32 
f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2554, f2556, f2555; +sub.f32 f1235, f2556, f2555; +sub.f32 f1236, f1226, f1231; +add.f32 f1238, f1226, f1231; +add.f32 f2553, f1227, f1230; +sub.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2552, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2551, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2550, f2552, f2551; +sub.f32 f1251, f2552, f2551; +sub.f32 f1252, f1242, f1247; +add.f32 f1254, f1242, f1247; +add.f32 f2549, f1243, f1246; +sub.f32 f1255, f1243, f1246; +mul.f32 f1256, f1252, 0f3F3504F3; +mul.f32 f1257, f2549, 0f3F3504F3; +sub.f32 f1258, f1256, f1257; +add.f32 f1259, f1256, f1257; +mul.f32 f2547, f1254, 0fBF3504F3; +mul.f32 f2548, f1255, 0f3F3504F3; +sub.f32 f1262, f2547, f2548; +mul.f32 f1263, f1255, 0fBF3504F3; +fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2546, f2554, f2550; +sub.f32 f1268, f2554, f2550; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2545, f2553, f1259; +sub.f32 f1272, f2553, f1259; +sub.f32 f1273, f1234, f1251; +add.f32 f1275, f1234, f1251; +add.f32 f2544, f1235, f1250; +sub.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1262; +sub.f32 f1279, f1238, f1262; +add.f32 f2543, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2542, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2541, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2540, f2542, f2541; +sub.f32 f1292, f2542, f2541; +sub.f32 f1293, f1283, f1288; +add.f32 f1295, f1283, f1288; +add.f32 f2539, f1284, f1287; +sub.f32 f1296, f1284, f1287; +add.f32 
f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2538, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2537, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2536, f2538, f2537; +sub.f32 f1308, f2538, f2537; +sub.f32 f1309, f1299, f1304; +add.f32 f1311, f1299, f1304; +add.f32 f2535, f1300, f1303; +sub.f32 f1312, f1300, f1303; +mul.f32 f1313, f1309, 0f3F3504F3; +mul.f32 f1314, f2535, 0f3F3504F3; +sub.f32 f1315, f1313, f1314; +add.f32 f1316, f1313, f1314; +mul.f32 f2533, f1311, 0fBF3504F3; +mul.f32 f2534, f1312, 0f3F3504F3; +sub.f32 f1319, f2533, f2534; +mul.f32 f1320, f1312, 0fBF3504F3; +fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2532, f2540, f2536; +sub.f32 f1325, f2540, f2536; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2531, f2539, f1316; +sub.f32 f1329, f2539, f1316; +sub.f32 f1330, f1291, f1308; +add.f32 f1332, f1291, f1308; +add.f32 f2530, f1292, f1307; +sub.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1319; +sub.f32 f1336, f1295, f1319; +add.f32 f2529, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2527, f1326, 0f3F6C835E; +mul.f32 f2528, f2531, 0f3EC3EF15; +sub.f32 f1340, f2527, f2528; +mul.f32 f1341, f2531, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341; +mul.f32 f1343, f1330, 0f3F3504F3; +mul.f32 f1344, f2530, 0f3F3504F3; +sub.f32 f1345, f1343, f1344; +add.f32 f1346, f1343, f1344; +mul.f32 f1348, f2529, 0f3F6C835E; +mul.f32 f2526, f1334, 0f3EC3EF15; +sub.f32 f1349, f2526, f1348; +mul.f32 f1350, f2529, 0f3EC3EF15; +fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350; +mul.f32 f1353, f1329, 0f3F6C835E; +mul.f32 f2525, f1328, 0fBEC3EF15; +sub.f32 f1354, f2525, f1353; +mul.f32 f1355, f1329, 0fBEC3EF15; +fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355; +mul.f32 f2523, f1332, 0fBF3504F3; +mul.f32 f2524, f1333, 0f3F3504F3; 
+sub.f32 f1359, f2523, f2524; +mul.f32 f1360, f1333, 0fBF3504F3; +fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360; +mul.f32 f2521, f1336, 0fBF6C835E; +mul.f32 f2522, f1337, 0f3EC3EF15; +sub.f32 f1364, f2521, f2522; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2520, f2546, f2532; +sub.f32 f1370, f2546, f2532; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2519, f2545, f1342; +sub.f32 f1374, f2545, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2518, f2544, f1346; +sub.f32 f1378, f2544, f1346; +add.f32 f1379, f1277, f1349; +sub.f32 f1381, f1277, f1349; +add.f32 f2517, f2543, f1351; +sub.f32 f1382, f2543, f1351; +sub.f32 f1383, f1267, f1325; +add.f32 f1385, f1267, f1325; +add.f32 f2516, f1268, f1324; +sub.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1354; +sub.f32 f1389, f1271, f1354; +add.f32 f2515, f1272, f1356; +sub.f32 f1390, f1272, f1356; +add.f32 f1391, f1275, f1359; +sub.f32 f1393, f1275, f1359; +add.f32 f2514, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2513, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2519, 0f3E47C5C2; +mul.f32 f2512, f1371, 0f3F7B14BE; +sub.f32 f1401, f2512, f1400; +mul.f32 f1402, f2519, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402; +mul.f32 f1405, f2518, 0f3EC3EF15; +mul.f32 f2511, f1375, 0f3F6C835E; +sub.f32 f1406, f2511, f1405; +mul.f32 f1407, f2518, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407; +mul.f32 f2509, f1379, 0f3F54DB31; +mul.f32 f2510, f2517, 0f3F0E39DA; +sub.f32 f1411, f2509, f2510; +mul.f32 f1412, f2517, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412; +mul.f32 f1414, f1383, 0f3F3504F3; +mul.f32 f1415, f2516, 0f3F3504F3; +sub.f32 f1416, f1414, f1415; +add.f32 f1417, f1414, f1415; +mul.f32 f1419, f2515, 0f3F54DB31; +mul.f32 f2508, f1387, 0f3F0E39DA; +sub.f32 
f1420, f2508, f1419; +mul.f32 f1421, f2515, 0f3F0E39DA; +fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421; +mul.f32 f1424, f2514, 0f3F6C835E; +mul.f32 f2507, f1391, 0f3EC3EF15; +sub.f32 f1425, f2507, f1424; +mul.f32 f1426, f2514, 0f3EC3EF15; +fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426; +mul.f32 f2505, f1395, 0f3E47C5C2; +mul.f32 f2506, f2513, 0f3F7B14BE; +sub.f32 f1430, f2505, f2506; +mul.f32 f1431, f2513, 0f3E47C5C2; +fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431; +mul.f32 f2503, f1373, 0fBE47C5C2; +mul.f32 f2504, f1374, 0f3F7B14BE; +sub.f32 f1435, f2503, f2504; +mul.f32 f1436, f1374, 0fBE47C5C2; +fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436; +mul.f32 f2501, f1377, 0fBEC3EF15; +mul.f32 f2502, f1378, 0f3F6C835E; +sub.f32 f1440, f2501, f2502; +mul.f32 f1441, f1378, 0fBEC3EF15; +fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441; +mul.f32 f2499, f1381, 0fBF0E39DA; +mul.f32 f2500, f1382, 0f3F54DB31; +sub.f32 f1445, f2499, f2500; +mul.f32 f1446, f1382, 0fBF0E39DA; +fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446; +mul.f32 f1449, f1386, 0f3F3504F3; +mul.f32 f2498, f1385, 0fBF3504F3; +sub.f32 f1450, f2498, f1449; +mul.f32 f1451, f1386, 0fBF3504F3; +fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451; +mul.f32 f1454, f1390, 0f3F0E39DA; +mul.f32 f2497, f1389, 0fBF54DB31; +sub.f32 f1455, f2497, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456; +mul.f32 f1459, f1394, 0f3EC3EF15; +mul.f32 f2496, f1393, 0fBF6C835E; +sub.f32 f1460, f2496, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0f3EC3EF15, f1461; +mul.f32 f1464, f1398, 0f3E47C5C2; +mul.f32 f2495, f1397, 0fBF7B14BE; +sub.f32 f1465, f2495, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2494, f2563, f1403; +sub.f32 f1473, f2563, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2493, f2562, f1408; +sub.f32 f1477, f2562, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 
f1480, f1204, f1411; +add.f32 f2492, f2561, f1413; +sub.f32 f1481, f2561, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2491, f2560, f1417; +sub.f32 f1485, f2560, f1417; +add.f32 f1486, f1212, f1420; +sub.f32 f1488, f1212, f1420; +add.f32 f2490, f2559, f1422; +sub.f32 f1489, f2559, f1422; +add.f32 f1490, f1216, f1425; +sub.f32 f1492, f1216, f1425; +add.f32 f2489, f2558, f1427; +sub.f32 f1493, f2558, f1427; +add.f32 f1494, f1220, f1430; +sub.f32 f1496, f1220, f1430; +add.f32 f2488, f2557, f1432; +sub.f32 f1497, f2557, f1432; +sub.f32 f1498, f1194, f1370; +add.f32 f1500, f1194, f1370; +add.f32 f2487, f1195, f1369; +sub.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1435; +sub.f32 f1504, f1198, f1435; +add.f32 f2486, f1199, f1437; +sub.f32 f1505, f1199, f1437; +add.f32 f1506, f1202, f1440; +sub.f32 f1508, f1202, f1440; +add.f32 f2485, f1203, f1442; +sub.f32 f1509, f1203, f1442; +add.f32 f1510, f1206, f1445; +sub.f32 f1512, f1206, f1445; +add.f32 f2484, f1207, f1447; +sub.f32 f1513, f1207, f1447; +add.f32 f1514, f1210, f1450; +sub.f32 f1516, f1210, f1450; +add.f32 f2483, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2482, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2481, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2480, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r39, 480; +bfe.u32 r15, r39, 5, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1534, f2494, f1531; +mul.f32 f1536, f1530, f2494; +mul.f32 f2478, f1530, f1530; +mul.f32 f2479, f1531, f1531; +sub.f32 f1539, f2478, f2479; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1542, f2493, f1541; +mul.f32 f1544, f1539, f2493; +mul.f32 f1546, f1531, f1541; +mul.f32 
f2477, f1530, f1539; +sub.f32 f1547, f2477, f1546; +mul.f32 f2476, f1474, f1541; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1550, f2492, f1549; +mul.f32 f1552, f1547, f2492; +mul.f32 f1554, f1531, f1549; +mul.f32 f2475, f1530, f1547; +sub.f32 f1555, f2475, f1554; +mul.f32 f2474, f1478, f1549; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1558, f2491, f1557; +mul.f32 f1560, f1555, f2491; +mul.f32 f2472, f1530, f1555; +mul.f32 f2473, f1531, f1557; +sub.f32 f1563, f2472, f2473; +mul.f32 f2471, f1482, f1557; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1566, f2490, f1565; +mul.f32 f1568, f1563, f2490; +mul.f32 f1570, f1531, f1565; +mul.f32 f2470, f1530, f1563; +sub.f32 f1571, f2470, f1570; +mul.f32 f2469, f1486, f1565; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1574, f2489, f1573; +mul.f32 f1576, f1571, f2489; +mul.f32 f1578, f1531, f1573; +mul.f32 f2468, f1530, f1571; +sub.f32 f1579, f2468, f1578; +mul.f32 f2467, f1490, f1573; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1582, f2488, f1581; +mul.f32 f1584, f1579, f2488; +mul.f32 f2465, f1530, f1579; +mul.f32 f2466, f1531, f1581; +sub.f32 f1587, f2465, f2466; +mul.f32 f2464, f1494, f1581; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1590, f2487, f1589; +mul.f32 f1592, f1587, f2487; +mul.f32 f1594, f1531, f1589; +mul.f32 f2463, f1530, f1587; +sub.f32 f1595, f2463, f1594; +mul.f32 f2462, f1498, f1589; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1598, f2486, f1597; +mul.f32 f1600, f1595, f2486; +mul.f32 f2460, f1530, f1595; +mul.f32 f2461, f1531, f1597; +sub.f32 f1603, f2460, f2461; +mul.f32 f2459, f1502, f1597; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1606, f2485, f1605; +mul.f32 f1608, f1603, f2485; +mul.f32 f1610, f1531, f1605; +mul.f32 
f2458, f1530, f1603; +sub.f32 f1611, f2458, f1610; +mul.f32 f2457, f1506, f1605; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1614, f2484, f1613; +mul.f32 f1616, f1611, f2484; +mul.f32 f1618, f1531, f1613; +mul.f32 f2456, f1530, f1611; +sub.f32 f1619, f2456, f1618; +mul.f32 f2455, f1510, f1613; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1622, f2483, f1621; +mul.f32 f1624, f1619, f2483; +mul.f32 f2453, f1530, f1619; +mul.f32 f2454, f1531, f1621; +sub.f32 f1627, f2453, f2454; +mul.f32 f2452, f1514, f1621; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1630, f2482, f1629; +mul.f32 f1632, f1627, f2482; +mul.f32 f1634, f1531, f1629; +mul.f32 f2451, f1530, f1627; +sub.f32 f1635, f2451, f1634; +mul.f32 f2450, f1518, f1629; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1638, f2481, f1637; +mul.f32 f1640, f1635, f2481; +mul.f32 f1642, f1531, f1637; +mul.f32 f2449, f1530, f1635; +sub.f32 f1643, f2449, f1642; +mul.f32 f2448, f1522, f1637; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1646, f2480, f1645; +mul.f32 f1648, f1643, f2480; +mul.f32 f2446, f1530, f1643; +mul.f32 f2447, f1531, f1645; +sub.f32 f1651, f2446, f2447; +mul.f32 f2445, f1526, f1645; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2444, f2564, f2520; +mul.f32 f1654, f2444, f1653; +mul.f32 f1656, f1651, f2444; +mul.f32 f1658, f1531, f1653; +mul.f32 f2443, f1530, f1651; +sub.f32 f1659, f2443, f1658; +sub.f32 f2442, f1192, f1367; +mul.f32 f2441, f2442, f1653; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1662, f1473, f1661; +mul.f32 f1664, f1659, f1473; +mul.f32 f2439, f1530, f1659; +mul.f32 f2440, f1531, f1661; +sub.f32 f1667, f2439, f2440; +mul.f32 f2438, f1472, f1661; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1670, f1477, f1669; +mul.f32 
f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2437, f1530, f1667; +sub.f32 f1675, f2437, f1674; +mul.f32 f2436, f1476, f1669; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1678, f1481, f1677; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2435, f1530, f1675; +sub.f32 f1683, f2435, f1682; +mul.f32 f2434, f1480, f1677; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1686, f1485, f1685; +mul.f32 f1688, f1683, f1485; +mul.f32 f2432, f1530, f1683; +mul.f32 f2433, f1531, f1685; +sub.f32 f1691, f2432, f2433; +mul.f32 f2431, f1484, f1685; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1694, f1489, f1693; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2430, f1530, f1691; +sub.f32 f1699, f2430, f1698; +mul.f32 f2429, f1488, f1693; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1702, f1493, f1701; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2428, f1530, f1699; +sub.f32 f1707, f2428, f1706; +mul.f32 f2427, f1492, f1701; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1710, f1497, f1709; +mul.f32 f1712, f1707, f1497; +mul.f32 f2425, f1530, f1707; +mul.f32 f2426, f1531, f1709; +sub.f32 f1715, f2425, f2426; +mul.f32 f2424, f1496, f1709; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1718, f1501, f1717; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2423, f1530, f1715; +sub.f32 f1723, f2423, f1722; +mul.f32 f2422, f1500, f1717; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1726, f1505, f1725; +mul.f32 f1728, f1723, f1505; +mul.f32 f2420, f1530, f1723; +mul.f32 f2421, f1531, f1725; +sub.f32 f1731, f2420, f2421; +mul.f32 f2419, f1504, f1725; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1734, f1509, f1733; +mul.f32 
f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2418, f1530, f1731; +sub.f32 f1739, f2418, f1738; +mul.f32 f2417, f1508, f1733; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1742, f1513, f1741; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2416, f1530, f1739; +sub.f32 f1747, f2416, f1746; +mul.f32 f2415, f1512, f1741; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1750, f1517, f1749; +mul.f32 f1752, f1747, f1517; +mul.f32 f2413, f1530, f1747; +mul.f32 f2414, f1531, f1749; +sub.f32 f1755, f2413, f2414; +mul.f32 f2412, f1516, f1749; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1758, f1521, f1757; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2411, f1530, f1755; +sub.f32 f1763, f2411, f1762; +mul.f32 f2410, f1520, f1757; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1766, f1525, f1765; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2409, f1530, f1763; +sub.f32 f1771, f2409, f1770; +mul.f32 f2408, f1524, f1765; +mul.f32 f1772, f1530, f1765; +mul.f32 f2407, f1470, f1531; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1529, f1773; +mul.f32 f1775, f1528, f1773; +mul.f32 f1776, f1771, f1529; +and.b32 r16, r35, 248; +add.s32 r17, r9, r16; +sub.f32 f2813, f2564, f2520; +mul.f32 f2812, f1651, f2813; +mov.u32 r34, %tid.x; +shl.b32 r33, r34, 8; +barrier.sync 0; +and.b32 r18, r33, 122880; +add.s32 r19, r17, r18; +mov.u32 r26, %tid.x; +and.b32 r25, r26, 480; +sub.f32 f2816, f2564, f2520; +mul.f32 f2815, f1651, f2816; +add.f32 f1777, f2564, f2520; +mov.u32 r30, %tid.x; +and.b32 r29, r30, 480; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r32, %tid.x; +and.b32 r31, r32, 480; +fma.rn.f32 f1779, f1530, f1470, f1534; +sub.f32 f1780, f1536, f2407; +st.shared.v2.f32 [r19+256], {f1779, f1780}; +fma.rn.f32 f1781, f1539, f1474, 
f1542; +sub.f32 f1782, f1544, f2476; +st.shared.v2.f32 [r19+512], {f1781, f1782}; +fma.rn.f32 f1783, f1547, f1478, f1550; +sub.f32 f1784, f1552, f2474; +st.shared.v2.f32 [r19+768], {f1783, f1784}; +fma.rn.f32 f1785, f1555, f1482, f1558; +sub.f32 f1786, f1560, f2471; +st.shared.v2.f32 [r19+1024], {f1785, f1786}; +fma.rn.f32 f1787, f1563, f1486, f1566; +sub.f32 f1788, f1568, f2469; +st.shared.v2.f32 [r19+1280], {f1787, f1788}; +sub.f32 f1789, f1576, f2467; +fma.rn.f32 f1790, f1571, f1490, f1574; +st.shared.v2.f32 [r19+1536], {f1790, f1789}; +fma.rn.f32 f1791, f1579, f1494, f1582; +sub.f32 f1792, f1584, f2464; +st.shared.v2.f32 [r19+1792], {f1791, f1792}; +fma.rn.f32 f1793, f1587, f1498, f1590; +sub.f32 f1794, f1592, f2462; +st.shared.v2.f32 [r19+2048], {f1793, f1794}; +fma.rn.f32 f1795, f1595, f1502, f1598; +sub.f32 f1796, f1600, f2459; +st.shared.v2.f32 [r19+2304], {f1795, f1796}; +fma.rn.f32 f1797, f1603, f1506, f1606; +sub.f32 f1798, f1608, f2457; +st.shared.v2.f32 [r19+2560], {f1797, f1798}; +fma.rn.f32 f1799, f1611, f1510, f1614; +sub.f32 f1800, f1616, f2455; +st.shared.v2.f32 [r19+2816], {f1799, f1800}; +fma.rn.f32 f1801, f1619, f1514, f1622; +sub.f32 f1802, f1624, f2452; +st.shared.v2.f32 [r19+3072], {f1801, f1802}; +fma.rn.f32 f1803, f1627, f1518, f1630; +sub.f32 f1804, f1632, f2450; +st.shared.v2.f32 [r19+3328], {f1803, f1804}; +fma.rn.f32 f1805, f1635, f1522, f1638; +sub.f32 f1806, f1640, f2448; +st.shared.v2.f32 [r19+3584], {f1805, f1806}; +fma.rn.f32 f1807, f1643, f1526, f1646; +sub.f32 f1808, f1648, f2445; +st.shared.v2.f32 [r19+3840], {f1807, f1808}; +fma.rn.f32 f1809, f1651, f2442, f1654; +sub.f32 f1810, f2815, f2441; +st.shared.v2.f32 [r19+4096], {f1809, f1810}; +fma.rn.f32 f1811, f1659, f1472, f1662; +sub.f32 f1812, f1664, f2438; +st.shared.v2.f32 [r19+4352], {f1811, f1812}; +fma.rn.f32 f1813, f1667, f1476, f1670; +sub.f32 f1814, f1672, f2436; +st.shared.v2.f32 [r19+4608], {f1813, f1814}; +fma.rn.f32 f1815, f1675, f1480, f1678; +sub.f32 f1816, f1680, 
f2434; +st.shared.v2.f32 [r19+4864], {f1815, f1816}; +fma.rn.f32 f1817, f1683, f1484, f1686; +sub.f32 f1818, f1688, f2431; +st.shared.v2.f32 [r19+5120], {f1817, f1818}; +fma.rn.f32 f1819, f1691, f1488, f1694; +sub.f32 f1820, f1696, f2429; +st.shared.v2.f32 [r19+5376], {f1819, f1820}; +fma.rn.f32 f1821, f1699, f1492, f1702; +sub.f32 f1822, f1704, f2427; +st.shared.v2.f32 [r19+5632], {f1821, f1822}; +fma.rn.f32 f1823, f1707, f1496, f1710; +sub.f32 f1824, f1712, f2424; +st.shared.v2.f32 [r19+5888], {f1823, f1824}; +fma.rn.f32 f1825, f1715, f1500, f1718; +sub.f32 f1826, f1720, f2422; +st.shared.v2.f32 [r19+6144], {f1825, f1826}; +fma.rn.f32 f1827, f1723, f1504, f1726; +sub.f32 f1828, f1728, f2419; +st.shared.v2.f32 [r19+6400], {f1827, f1828}; +fma.rn.f32 f1829, f1731, f1508, f1734; +sub.f32 f1830, f1736, f2417; +st.shared.v2.f32 [r19+6656], {f1829, f1830}; +fma.rn.f32 f1831, f1739, f1512, f1742; +sub.f32 f1832, f1744, f2415; +st.shared.v2.f32 [r19+6912], {f1831, f1832}; +fma.rn.f32 f1833, f1747, f1516, f1750; +sub.f32 f1834, f1752, f2412; +st.shared.v2.f32 [r19+7168], {f1833, f1834}; +fma.rn.f32 f1835, f1755, f1520, f1758; +sub.f32 f1836, f1760, f2410; +st.shared.v2.f32 [r19+7424], {f1835, f1836}; +fma.rn.f32 f1837, f1763, f1524, f1766; +sub.f32 f1838, f1768, f2408; +st.shared.v2.f32 [r19+7680], {f1837, f1838}; +fma.rn.f32 f1839, f1771, f1528, f1774; +sub.f32 f1840, f1776, f1775; +st.shared.v2.f32 [r19+7936], {f1839, f1840}; +barrier.sync 0; +mad.lo.s32 r20, r31, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+4096]; +ld.shared.v2.f32 {f1849, f1850}, [r20+8192]; +ld.shared.v2.f32 {f1853, f1854}, [r20+12288]; +ld.shared.v2.f32 {f1857, f1858}, [r20+16384]; +ld.shared.v2.f32 {f1861, f1862}, [r20+20480]; +ld.shared.v2.f32 {f1865, f1866}, [r20+24576]; +ld.shared.v2.f32 {f1869, f1870}, [r20+28672]; +ld.shared.v2.f32 {f1873, f1874}, [r20+32768]; +ld.shared.v2.f32 {f1877, f1878}, [r20+36864]; +ld.shared.v2.f32 {f1881, f1882}, 
[r20+40960]; +ld.shared.v2.f32 {f1885, f1886}, [r20+45056]; +ld.shared.v2.f32 {f1889, f1890}, [r20+49152]; +ld.shared.v2.f32 {f1893, f1894}, [r20+53248]; +ld.shared.v2.f32 {f1897, f1898}, [r20+57344]; +ld.shared.v2.f32 {f1901, f1902}, [r20+61440]; +ld.shared.v2.f32 {f1905, f1906}, [r20+65536]; +ld.shared.v2.f32 {f1909, f1910}, [r20+69632]; +ld.shared.v2.f32 {f1913, f1914}, [r20+73728]; +ld.shared.v2.f32 {f1917, f1918}, [r20+77824]; +ld.shared.v2.f32 {f1921, f1922}, [r20+81920]; +ld.shared.v2.f32 {f1925, f1926}, [r20+86016]; +ld.shared.v2.f32 {f1929, f1930}, [r20+90112]; +ld.shared.v2.f32 {f1933, f1934}, [r20+94208]; +ld.shared.v2.f32 {f1937, f1938}, [r20+98304]; +ld.shared.v2.f32 {f1941, f1942}, [r20+102400]; +ld.shared.v2.f32 {f1945, f1946}, [r20+106496]; +ld.shared.v2.f32 {f1949, f1950}, [r20+110592]; +ld.shared.v2.f32 {f1953, f1954}, [r20+114688]; +ld.shared.v2.f32 {f1957, f1958}, [r20+118784]; +ld.shared.v2.f32 {f1961, f1962}, [r20+122880]; +ld.shared.v2.f32 {f1965, f1966}, [r20+126976]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2406, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2405, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1969, f1973; +sub.f32 f1979, f1969, f1973; +add.f32 f2404, f2406, f2405; +sub.f32 f1980, f2406, f2405; +sub.f32 f1981, f1971, f1976; +add.f32 f1983, f1971, f1976; +add.f32 f2403, f1972, f1975; +sub.f32 f1984, f1972, f1975; +add.f32 f1985, f1857, f1921; +sub.f32 f1987, f1857, f1921; +add.f32 f2402, f1858, f1922; +sub.f32 f1988, f1858, f1922; +add.f32 f1989, f1889, f1953; +sub.f32 f1991, f1889, f1953; +add.f32 f2401, f1890, f1954; +sub.f32 f1992, f1890, f1954; +add.f32 f1993, f1985, f1989; +sub.f32 f1995, f1985, f1989; +add.f32 f2400, f2402, f2401; +sub.f32 f1996, f2402, f2401; +sub.f32 f1997, f1987, f1992; +add.f32 f1999, f1987, f1992; +add.f32 f2399, f1988, f1991; +sub.f32 f2000, f1988, f1991; +mul.f32 f2001, f1997, 
0f3F3504F3; +mul.f32 f2002, f2399, 0f3F3504F3; +sub.f32 f2003, f2001, f2002; +add.f32 f2004, f2001, f2002; +mul.f32 f2006, f2000, 0f3F3504F3; +mul.f32 f2398, f1999, 0fBF3504F3; +sub.f32 f2007, f2398, f2006; +mul.f32 f2008, f2000, 0fBF3504F3; +fma.rn.f32 f2009, f1999, 0f3F3504F3, f2008; +add.f32 f2010, f1977, f1993; +sub.f32 f2012, f1977, f1993; +add.f32 f2397, f2404, f2400; +sub.f32 f2013, f2404, f2400; +add.f32 f2014, f1981, f2003; +sub.f32 f2016, f1981, f2003; +add.f32 f2396, f2403, f2004; +sub.f32 f2017, f2403, f2004; +sub.f32 f2018, f1979, f1996; +add.f32 f2020, f1979, f1996; +add.f32 f2395, f1980, f1995; +sub.f32 f2021, f1980, f1995; +add.f32 f2022, f1983, f2007; +sub.f32 f2024, f1983, f2007; +add.f32 f2394, f1984, f2009; +sub.f32 f2025, f1984, f2009; +add.f32 f2026, f1849, f1913; +sub.f32 f2028, f1849, f1913; +add.f32 f2393, f1850, f1914; +sub.f32 f2029, f1850, f1914; +add.f32 f2030, f1881, f1945; +sub.f32 f2032, f1881, f1945; +add.f32 f2392, f1882, f1946; +sub.f32 f2033, f1882, f1946; +add.f32 f2034, f2026, f2030; +sub.f32 f2036, f2026, f2030; +add.f32 f2391, f2393, f2392; +sub.f32 f2037, f2393, f2392; +sub.f32 f2038, f2028, f2033; +add.f32 f2040, f2028, f2033; +add.f32 f2390, f2029, f2032; +sub.f32 f2041, f2029, f2032; +add.f32 f2042, f1865, f1929; +sub.f32 f2044, f1865, f1929; +add.f32 f2389, f1866, f1930; +sub.f32 f2045, f1866, f1930; +add.f32 f2046, f1897, f1961; +sub.f32 f2048, f1897, f1961; +add.f32 f2388, f1898, f1962; +sub.f32 f2049, f1898, f1962; +add.f32 f2050, f2042, f2046; +sub.f32 f2052, f2042, f2046; +add.f32 f2387, f2389, f2388; +sub.f32 f2053, f2389, f2388; +sub.f32 f2054, f2044, f2049; +add.f32 f2056, f2044, f2049; +add.f32 f2386, f2045, f2048; +sub.f32 f2057, f2045, f2048; +mul.f32 f2058, f2054, 0f3F3504F3; +mul.f32 f2059, f2386, 0f3F3504F3; +sub.f32 f2060, f2058, f2059; +add.f32 f2061, f2058, f2059; +mul.f32 f2063, f2057, 0f3F3504F3; +mul.f32 f2385, f2056, 0fBF3504F3; +sub.f32 f2064, f2385, f2063; +mul.f32 f2065, f2057, 0fBF3504F3; 
+fma.rn.f32 f2066, f2056, 0f3F3504F3, f2065; +add.f32 f2067, f2034, f2050; +sub.f32 f2069, f2034, f2050; +add.f32 f2384, f2391, f2387; +sub.f32 f2070, f2391, f2387; +add.f32 f2071, f2038, f2060; +sub.f32 f2073, f2038, f2060; +add.f32 f2383, f2390, f2061; +sub.f32 f2074, f2390, f2061; +sub.f32 f2075, f2036, f2053; +add.f32 f2077, f2036, f2053; +add.f32 f2382, f2037, f2052; +sub.f32 f2078, f2037, f2052; +add.f32 f2079, f2040, f2064; +sub.f32 f2081, f2040, f2064; +add.f32 f2381, f2041, f2066; +sub.f32 f2082, f2041, f2066; +mul.f32 f2084, f2383, 0f3EC3EF15; +mul.f32 f2380, f2071, 0f3F6C835E; +sub.f32 f2085, f2380, f2084; +mul.f32 f2086, f2383, 0f3F6C835E; +fma.rn.f32 f2087, f2071, 0f3EC3EF15, f2086; +mul.f32 f2088, f2075, 0f3F3504F3; +mul.f32 f2089, f2382, 0f3F3504F3; +sub.f32 f2090, f2088, f2089; +add.f32 f2091, f2088, f2089; +mul.f32 f2378, f2079, 0f3EC3EF15; +mul.f32 f2379, f2381, 0f3F6C835E; +sub.f32 f2094, f2378, f2379; +mul.f32 f2095, f2381, 0f3EC3EF15; +fma.rn.f32 f2096, f2079, 0f3F6C835E, f2095; +mul.f32 f2098, f2074, 0f3F6C835E; +mul.f32 f2377, f2073, 0fBEC3EF15; +sub.f32 f2099, f2377, f2098; +mul.f32 f2100, f2074, 0fBEC3EF15; +fma.rn.f32 f2101, f2073, 0f3F6C835E, f2100; +mul.f32 f2103, f2078, 0f3F3504F3; +mul.f32 f2376, f2077, 0fBF3504F3; +sub.f32 f2104, f2376, f2103; +mul.f32 f2105, f2078, 0fBF3504F3; +fma.rn.f32 f2106, f2077, 0f3F3504F3, f2105; +mul.f32 f2108, f2082, 0f3EC3EF15; +mul.f32 f2375, f2081, 0fBF6C835E; +sub.f32 f2109, f2375, f2108; +mul.f32 f2110, f2082, 0fBF6C835E; +fma.rn.f32 f2111, f2081, 0f3EC3EF15, f2110; +add.f32 f2112, f1845, f1909; +sub.f32 f2114, f1845, f1909; +add.f32 f2374, f1846, f1910; +sub.f32 f2115, f1846, f1910; +add.f32 f2116, f1877, f1941; +sub.f32 f2118, f1877, f1941; +add.f32 f2373, f1878, f1942; +sub.f32 f2119, f1878, f1942; +add.f32 f2120, f2112, f2116; +sub.f32 f2122, f2112, f2116; +add.f32 f2372, f2374, f2373; +sub.f32 f2123, f2374, f2373; +sub.f32 f2124, f2114, f2119; +add.f32 f2126, f2114, f2119; +add.f32 f2371, f2115, 
f2118; +sub.f32 f2127, f2115, f2118; +add.f32 f2128, f1861, f1925; +sub.f32 f2130, f1861, f1925; +add.f32 f2370, f1862, f1926; +sub.f32 f2131, f1862, f1926; +add.f32 f2132, f1893, f1957; +sub.f32 f2134, f1893, f1957; +add.f32 f2369, f1894, f1958; +sub.f32 f2135, f1894, f1958; +add.f32 f2136, f2128, f2132; +sub.f32 f2138, f2128, f2132; +add.f32 f2368, f2370, f2369; +sub.f32 f2139, f2370, f2369; +sub.f32 f2140, f2130, f2135; +add.f32 f2142, f2130, f2135; +add.f32 f2367, f2131, f2134; +sub.f32 f2143, f2131, f2134; +mul.f32 f2144, f2140, 0f3F3504F3; +mul.f32 f2145, f2367, 0f3F3504F3; +sub.f32 f2146, f2144, f2145; +add.f32 f2147, f2144, f2145; +mul.f32 f2149, f2143, 0f3F3504F3; +mul.f32 f2366, f2142, 0fBF3504F3; +sub.f32 f2150, f2366, f2149; +mul.f32 f2151, f2143, 0fBF3504F3; +fma.rn.f32 f2152, f2142, 0f3F3504F3, f2151; +add.f32 f2153, f2120, f2136; +sub.f32 f2155, f2120, f2136; +add.f32 f2365, f2372, f2368; +sub.f32 f2156, f2372, f2368; +add.f32 f2157, f2124, f2146; +sub.f32 f2159, f2124, f2146; +add.f32 f2364, f2371, f2147; +sub.f32 f2160, f2371, f2147; +sub.f32 f2161, f2122, f2139; +add.f32 f2163, f2122, f2139; +add.f32 f2363, f2123, f2138; +sub.f32 f2164, f2123, f2138; +add.f32 f2165, f2126, f2150; +sub.f32 f2167, f2126, f2150; +add.f32 f2362, f2127, f2152; +sub.f32 f2168, f2127, f2152; +add.f32 f2169, f1853, f1917; +sub.f32 f2171, f1853, f1917; +add.f32 f2361, f1854, f1918; +sub.f32 f2172, f1854, f1918; +add.f32 f2173, f1885, f1949; +sub.f32 f2175, f1885, f1949; +add.f32 f2360, f1886, f1950; +sub.f32 f2176, f1886, f1950; +add.f32 f2177, f2169, f2173; +sub.f32 f2179, f2169, f2173; +add.f32 f2359, f2361, f2360; +sub.f32 f2180, f2361, f2360; +sub.f32 f2181, f2171, f2176; +add.f32 f2183, f2171, f2176; +add.f32 f2358, f2172, f2175; +sub.f32 f2184, f2172, f2175; +add.f32 f2185, f1869, f1933; +sub.f32 f2187, f1869, f1933; +add.f32 f2357, f1870, f1934; +sub.f32 f2188, f1870, f1934; +add.f32 f2189, f1901, f1965; +sub.f32 f2191, f1901, f1965; +add.f32 f2356, f1902, f1966; 
+sub.f32 f2192, f1902, f1966; +add.f32 f2193, f2185, f2189; +sub.f32 f2195, f2185, f2189; +add.f32 f2355, f2357, f2356; +sub.f32 f2196, f2357, f2356; +sub.f32 f2197, f2187, f2192; +add.f32 f2199, f2187, f2192; +add.f32 f2354, f2188, f2191; +sub.f32 f2200, f2188, f2191; +mul.f32 f2201, f2197, 0f3F3504F3; +mul.f32 f2202, f2354, 0f3F3504F3; +sub.f32 f2203, f2201, f2202; +add.f32 f2204, f2201, f2202; +mul.f32 f2352, f2199, 0fBF3504F3; +mul.f32 f2353, f2200, 0f3F3504F3; +sub.f32 f2207, f2352, f2353; +mul.f32 f2208, f2200, 0fBF3504F3; +fma.rn.f32 f2209, f2199, 0f3F3504F3, f2208; +add.f32 f2210, f2177, f2193; +sub.f32 f2212, f2177, f2193; +add.f32 f2351, f2359, f2355; +sub.f32 f2213, f2359, f2355; +add.f32 f2214, f2181, f2203; +sub.f32 f2216, f2181, f2203; +add.f32 f2350, f2358, f2204; +sub.f32 f2217, f2358, f2204; +sub.f32 f2218, f2179, f2196; +add.f32 f2220, f2179, f2196; +add.f32 f2349, f2180, f2195; +sub.f32 f2221, f2180, f2195; +add.f32 f2222, f2183, f2207; +sub.f32 f2224, f2183, f2207; +add.f32 f2348, f2184, f2209; +sub.f32 f2225, f2184, f2209; +mul.f32 f2227, f2350, 0f3EC3EF15; +mul.f32 f2347, f2214, 0f3F6C835E; +sub.f32 f2228, f2347, f2227; +mul.f32 f2229, f2350, 0f3F6C835E; +fma.rn.f32 f2230, f2214, 0f3EC3EF15, f2229; +mul.f32 f2231, f2218, 0f3F3504F3; +mul.f32 f2232, f2349, 0f3F3504F3; +sub.f32 f2233, f2231, f2232; +add.f32 f2234, f2231, f2232; +mul.f32 f2236, f2348, 0f3F6C835E; +mul.f32 f2346, f2222, 0f3EC3EF15; +sub.f32 f2237, f2346, f2236; +mul.f32 f2238, f2348, 0f3EC3EF15; +fma.rn.f32 f2239, f2222, 0f3F6C835E, f2238; +mul.f32 f2241, f2217, 0f3F6C835E; +mul.f32 f2345, f2216, 0fBEC3EF15; +sub.f32 f2242, f2345, f2241; +mul.f32 f2243, f2217, 0fBEC3EF15; +fma.rn.f32 f2244, f2216, 0f3F6C835E, f2243; +mul.f32 f2246, f2221, 0f3F3504F3; +mul.f32 f2344, f2220, 0fBF3504F3; +sub.f32 f2247, f2344, f2246; +mul.f32 f2248, f2221, 0fBF3504F3; +fma.rn.f32 f2249, f2220, 0f3F3504F3, f2248; +mul.f32 f2342, f2224, 0fBF6C835E; +mul.f32 f2343, f2225, 0f3EC3EF15; +sub.f32 f2252, 
f2342, f2343; +mul.f32 f2253, f2225, 0fBF6C835E; +fma.rn.f32 f2254, f2224, 0f3EC3EF15, f2253; +add.f32 %1, f2397, f2384; +add.f32 %0, f2010, f2067; +add.f32 %2, f2153, f2210; +add.f32 %3, f2365, f2351; +add.f32 %4, f2014, f2085; +add.f32 %5, f2396, f2087; +add.f32 %6, f2157, f2228; +add.f32 %7, f2364, f2230; +add.f32 %8, f2018, f2090; +add.f32 %9, f2395, f2091; +add.f32 %11, f2363, f2234; +add.f32 %10, f2161, f2233; +add.f32 %13, f2394, f2096; +add.f32 %12, f2022, f2094; +add.f32 %15, f2362, f2239; +add.f32 %14, f2165, f2237; +sub.f32 %16, f2012, f2070; +add.f32 %17, f2013, f2069; +sub.f32 %18, f2155, f2213; +add.f32 %19, f2156, f2212; +add.f32 %20, f2016, f2099; +add.f32 %21, f2017, f2101; +add.f32 %23, f2160, f2244; +add.f32 %22, f2159, f2242; +add.f32 %25, f2021, f2106; +add.f32 %24, f2020, f2104; +add.f32 %27, f2164, f2249; +add.f32 %26, f2163, f2247; +add.f32 %28, f2024, f2109; +add.f32 %29, f2025, f2111; +add.f32 %30, f2167, f2252; +add.f32 %31, f2168, f2254; +sub.f32 %33, f2397, f2384; +sub.f32 %32, f2010, f2067; +sub.f32 %35, f2365, f2351; +sub.f32 %34, f2153, f2210; +sub.f32 %37, f2396, f2087; +sub.f32 %36, f2014, f2085; +sub.f32 %39, f2364, f2230; +sub.f32 %38, f2157, f2228; +sub.f32 %41, f2395, f2091; +sub.f32 %40, f2018, f2090; +sub.f32 %43, f2363, f2234; +sub.f32 %42, f2161, f2233; +sub.f32 %45, f2394, f2096; +sub.f32 %44, f2022, f2094; +sub.f32 %47, f2362, f2239; +sub.f32 %46, f2165, f2237; +sub.f32 %49, f2013, f2069; +add.f32 %48, f2012, f2070; +sub.f32 %51, f2156, f2212; +add.f32 %50, f2155, f2213; +sub.f32 %53, f2017, f2101; +sub.f32 %52, f2016, f2099; +sub.f32 %55, f2160, f2244; +sub.f32 %54, f2159, f2242; +sub.f32 %57, f2021, f2106; +sub.f32 %56, f2020, f2104; +sub.f32 %59, f2164, f2249; +sub.f32 %58, f2163, f2247; +sub.f32 %61, f2025, f2111; +sub.f32 %60, f2024, f2109; +sub.f32 %63, f2168, f2254; +sub.f32 %62, f2167, f2252; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), 
"=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_16384), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), 
"f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<317, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1197>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %36, %57; +add.f32 f66, %37, %59; +sub.f32 f67, %36, %57; +sub.f32 f68, %37, %59; +add.f32 f69, %46, %68; +add.f32 f70, %48, %69; +sub.f32 f71, %46, %68; +sub.f32 f72, %48, %69; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %41, %62; +add.f32 f82, %43, %64; +sub.f32 f83, %41, %62; +sub.f32 f84, %43, %64; +add.f32 f85, %52, %73; +add.f32 f86, %53, %75; +sub.f32 f87, %52, %73; +sub.f32 f88, %53, %75; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, 
f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %60; +add.f32 f123, %40, %61; +sub.f32 f124, %38, %60; +sub.f32 f125, %40, %61; +add.f32 f126, %49, %70; +add.f32 f127, %51, %72; +sub.f32 f128, %49, %70; +sub.f32 f129, %51, %72; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %44, %65; +add.f32 f139, %45, %67; +sub.f32 f140, %44, %65; +sub.f32 f141, %45, %67; +add.f32 f142, %54, %76; +add.f32 f143, %56, %77; +sub.f32 f144, %54, %76; +sub.f32 f145, %56, %77; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 
f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 
f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, 
f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, 
f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65472; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+4096]; +ld.shared.f32 f391, [r13+8192]; +ld.shared.f32 f392, [r13+12288]; +ld.shared.f32 f393, [r13+16384]; +ld.shared.f32 f394, [r13+20480]; +ld.shared.f32 f395, [r13+24576]; +ld.shared.f32 f396, [r13+28672]; +ld.shared.f32 f397, [r13+32768]; +ld.shared.f32 f398, [r13+36864]; +ld.shared.f32 f399, [r13+40960]; +ld.shared.f32 f400, [r13+45056]; +ld.shared.f32 f401, [r13+49152]; +ld.shared.f32 f402, [r13+53248]; +ld.shared.f32 f403, [r13+57344]; +ld.shared.f32 f404, [r13+61440]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+4096]; +ld.shared.f32 f407, [r13+8192]; +ld.shared.f32 f408, [r13+12288]; +ld.shared.f32 f409, [r13+16384]; +ld.shared.f32 f410, [r13+20480]; +ld.shared.f32 f411, [r13+24576]; +ld.shared.f32 f412, [r13+28672]; +ld.shared.f32 f413, [r13+32768]; +ld.shared.f32 f414, [r13+36864]; +ld.shared.f32 f415, [r13+40960]; +ld.shared.f32 f416, [r13+45056]; +ld.shared.f32 f417, [r13+49152]; +ld.shared.f32 f418, [r13+53248]; +ld.shared.f32 f419, [r13+57344]; +ld.shared.f32 f420, [r13+61440]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 
f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, 
f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 
f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; +sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; +sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 1008; +bfe.u32 r15, r5, 4, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; +mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; 
+fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f593, f659; +fma.rn.f32 f661, f657, f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 
f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; +fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f591, f729; +fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 64512; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+4096]; +ld.shared.f32 f747, [r21+8192]; +ld.shared.f32 f748, [r21+12288]; +ld.shared.f32 f749, [r21+16384]; +ld.shared.f32 f750, [r21+20480]; +ld.shared.f32 f751, [r21+24576]; +ld.shared.f32 
f752, [r21+28672]; +ld.shared.f32 f753, [r21+32768]; +ld.shared.f32 f754, [r21+36864]; +ld.shared.f32 f755, [r21+40960]; +ld.shared.f32 f756, [r21+45056]; +ld.shared.f32 f757, [r21+49152]; +ld.shared.f32 f758, [r21+53248]; +ld.shared.f32 f759, [r21+57344]; +ld.shared.f32 f760, [r21+61440]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+4096]; +ld.shared.f32 f763, [r21+8192]; +ld.shared.f32 f764, [r21+12288]; +ld.shared.f32 f765, [r21+16384]; +ld.shared.f32 f766, [r21+20480]; +ld.shared.f32 f767, [r21+24576]; +ld.shared.f32 f768, [r21+28672]; +ld.shared.f32 f769, [r21+32768]; +ld.shared.f32 f770, [r21+36864]; +ld.shared.f32 f771, [r21+40960]; +ld.shared.f32 f772, [r21+45056]; +ld.shared.f32 f773, [r21+49152]; +ld.shared.f32 f774, [r21+53248]; +ld.shared.f32 f775, [r21+57344]; +ld.shared.f32 f776, [r21+61440]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +sub.f32 f789, f779, f784; +add.f32 f790, f780, f783; +add.f32 f791, f779, f784; +sub.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 
f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +sub.f32 f805, f795, f800; +add.f32 f806, f796, f799; +add.f32 f807, f795, f800; +sub.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0f3F3504F3; +sub.f32 f811, f809, f810; +add.f32 f812, f809, f810; +mul.f32 f813, f807, 0fBF3504F3; +mul.f32 f814, f808, 0f3F3504F3; +sub.f32 f815, f813, f814; +mul.f32 f816, f808, 0fBF3504F3; +fma.rn.f32 f817, f807, 0f3F3504F3, f816; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; +add.f32 f823, f790, f812; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f812; +sub.f32 f826, f787, f804; +add.f32 f827, f788, f803; +add.f32 f828, f787, f804; +sub.f32 f829, f788, f803; +add.f32 f830, f791, f815; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f815; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; +add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +sub.f32 f846, f836, f841; +add.f32 f847, f837, f840; +add.f32 f848, f836, f841; +sub.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +sub.f32 f862, f852, f857; +add.f32 f863, f853, f856; +add.f32 f864, f852, f857; +sub.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0f3F3504F3; +sub.f32 f868, f866, f867; +add.f32 f869, f866, f867; +mul.f32 f870, f864, 0fBF3504F3; +mul.f32 f871, f865, 
0f3F3504F3; +sub.f32 f872, f870, f871; +mul.f32 f873, f865, 0fBF3504F3; +fma.rn.f32 f874, f864, 0f3F3504F3, f873; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, f846, f868; +add.f32 f880, f847, f869; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f869; +sub.f32 f883, f844, f861; +add.f32 f884, f845, f860; +add.f32 f885, f844, f861; +sub.f32 f886, f845, f860; +add.f32 f887, f848, f872; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f872; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0f3EC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 0f3F6C835E; +fma.rn.f32 f895, f879, 0f3EC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0f3F3504F3; +sub.f32 f898, f896, f897; +add.f32 f899, f896, f897; +mul.f32 f900, f887, 0f3EC3EF15; +mul.f32 f901, f888, 0f3F6C835E; +sub.f32 f902, f900, f901; +mul.f32 f903, f888, 0f3EC3EF15; +fma.rn.f32 f904, f887, 0f3F6C835E, f903; +mul.f32 f905, f881, 0fBEC3EF15; +mul.f32 f906, f882, 0f3F6C835E; +sub.f32 f907, f905, f906; +mul.f32 f908, f882, 0fBEC3EF15; +fma.rn.f32 f909, f881, 0f3F6C835E, f908; +mul.f32 f910, f885, 0fBF3504F3; +mul.f32 f911, f886, 0f3F3504F3; +sub.f32 f912, f910, f911; +mul.f32 f913, f886, 0fBF3504F3; +fma.rn.f32 f914, f885, 0f3F3504F3, f913; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0f3EC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0f3EC3EF15, f918; +add.f32 f920, f818, f875; +add.f32 f921, f819, f876; +sub.f32 f922, f818, f875; +sub.f32 f923, f819, f876; +add.f32 f924, f822, f893; +add.f32 f925, f823, f895; +sub.f32 f926, f822, f893; +sub.f32 f927, f823, f895; +add.f32 f928, f826, f898; +add.f32 f929, f827, f899; +sub.f32 f930, f826, f898; +sub.f32 f931, f827, f899; +add.f32 f932, f830, f902; +add.f32 f933, f831, f904; +sub.f32 f934, f830, f902; +sub.f32 f935, f831, f904; +sub.f32 f936, f820, f878; +add.f32 f937, f821, f877; 
+add.f32 f938, f820, f878; +sub.f32 f939, f821, f877; +add.f32 f940, f824, f907; +add.f32 f941, f825, f909; +sub.f32 f942, f824, f907; +sub.f32 f943, f825, f909; +add.f32 f944, f828, f912; +add.f32 f945, f829, f914; +sub.f32 f946, f828, f912; +sub.f32 f947, f829, f914; +add.f32 f948, f832, f917; +add.f32 f949, f833, f919; +sub.f32 f950, f832, f917; +sub.f32 f951, f833, f919; +and.b32 r22, r5, 768; +bfe.u32 r23, r5, 8, 2; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f952, f953}, [rd11]; +mul.f32 f956, f925, f953; +fma.rn.f32 f957, f952, f924, f956; +mul.f32 f958, f924, f953; +mul.f32 f959, f952, f925; +sub.f32 f960, f959, f958; +mul.f32 f961, f952, f952; +mul.f32 f962, f953, f953; +sub.f32 f963, f961, f962; +mul.f32 f964, f953, f952; +fma.rn.f32 f965, f953, f952, f964; +mul.f32 f966, f929, f965; +fma.rn.f32 f967, f963, f928, f966; +mul.f32 f968, f928, f965; +mul.f32 f969, f963, f929; +sub.f32 f970, f969, f968; +mul.f32 f971, f952, f963; +mul.f32 f972, f953, f965; +sub.f32 f973, f971, f972; +mul.f32 f974, f952, f965; +fma.rn.f32 f975, f953, f963, f974; +mul.f32 f976, f933, f975; +fma.rn.f32 f977, f973, f932, f976; +mul.f32 f978, f932, f975; +mul.f32 f979, f973, f933; +sub.f32 f980, f979, f978; +mul.f32 f981, f952, f973; +mul.f32 f982, f953, f975; +sub.f32 f983, f981, f982; +mul.f32 f984, f952, f975; +fma.rn.f32 f985, f953, f973, f984; +mul.f32 f986, f937, f985; +fma.rn.f32 f987, f983, f936, f986; +mul.f32 f988, f936, f985; +mul.f32 f989, f983, f937; +sub.f32 f990, f989, f988; +mul.f32 f991, f952, f983; +mul.f32 f992, f953, f985; +sub.f32 f993, f991, f992; +mul.f32 f994, f952, f985; +fma.rn.f32 f995, f953, f983, f994; +mul.f32 f996, f941, f995; +fma.rn.f32 f997, f993, f940, f996; +mul.f32 f998, f940, f995; +mul.f32 f999, f993, f941; +sub.f32 f1000, f999, f998; +mul.f32 f1001, f952, f993; +mul.f32 f1002, f953, f995; +sub.f32 f1003, f1001, f1002; +mul.f32 f1004, f952, f995; +fma.rn.f32 f1005, f953, f993, f1004; +mul.f32 
f1006, f945, f1005; +fma.rn.f32 f1007, f1003, f944, f1006; +mul.f32 f1008, f944, f1005; +mul.f32 f1009, f1003, f945; +sub.f32 f1010, f1009, f1008; +mul.f32 f1011, f952, f1003; +mul.f32 f1012, f953, f1005; +sub.f32 f1013, f1011, f1012; +mul.f32 f1014, f952, f1005; +fma.rn.f32 f1015, f953, f1003, f1014; +mul.f32 f1016, f949, f1015; +fma.rn.f32 f1017, f1013, f948, f1016; +mul.f32 f1018, f948, f1015; +mul.f32 f1019, f1013, f949; +sub.f32 f1020, f1019, f1018; +mul.f32 f1021, f952, f1013; +mul.f32 f1022, f953, f1015; +sub.f32 f1023, f1021, f1022; +mul.f32 f1024, f952, f1015; +fma.rn.f32 f1025, f953, f1013, f1024; +mul.f32 f1026, f923, f1025; +fma.rn.f32 f1027, f1023, f922, f1026; +mul.f32 f1028, f922, f1025; +mul.f32 f1029, f1023, f923; +sub.f32 f1030, f1029, f1028; +mul.f32 f1031, f952, f1023; +mul.f32 f1032, f953, f1025; +sub.f32 f1033, f1031, f1032; +mul.f32 f1034, f952, f1025; +fma.rn.f32 f1035, f953, f1023, f1034; +mul.f32 f1036, f927, f1035; +fma.rn.f32 f1037, f1033, f926, f1036; +mul.f32 f1038, f926, f1035; +mul.f32 f1039, f1033, f927; +sub.f32 f1040, f1039, f1038; +mul.f32 f1041, f952, f1033; +mul.f32 f1042, f953, f1035; +sub.f32 f1043, f1041, f1042; +mul.f32 f1044, f952, f1035; +fma.rn.f32 f1045, f953, f1033, f1044; +mul.f32 f1046, f931, f1045; +fma.rn.f32 f1047, f1043, f930, f1046; +mul.f32 f1048, f930, f1045; +mul.f32 f1049, f1043, f931; +sub.f32 f1050, f1049, f1048; +mul.f32 f1051, f952, f1043; +mul.f32 f1052, f953, f1045; +sub.f32 f1053, f1051, f1052; +mul.f32 f1054, f952, f1045; +fma.rn.f32 f1055, f953, f1043, f1054; +mul.f32 f1056, f935, f1055; +fma.rn.f32 f1057, f1053, f934, f1056; +mul.f32 f1058, f934, f1055; +mul.f32 f1059, f1053, f935; +sub.f32 f1060, f1059, f1058; +mul.f32 f1061, f952, f1053; +mul.f32 f1062, f953, f1055; +sub.f32 f1063, f1061, f1062; +mul.f32 f1064, f952, f1055; +fma.rn.f32 f1065, f953, f1053, f1064; +mul.f32 f1066, f939, f1065; +fma.rn.f32 f1067, f1063, f938, f1066; +mul.f32 f1068, f938, f1065; +mul.f32 f1069, f1063, f939; +sub.f32 
f1070, f1069, f1068; +mul.f32 f1071, f952, f1063; +mul.f32 f1072, f953, f1065; +sub.f32 f1073, f1071, f1072; +mul.f32 f1074, f952, f1065; +fma.rn.f32 f1075, f953, f1063, f1074; +mul.f32 f1076, f943, f1075; +fma.rn.f32 f1077, f1073, f942, f1076; +mul.f32 f1078, f942, f1075; +mul.f32 f1079, f1073, f943; +sub.f32 f1080, f1079, f1078; +mul.f32 f1081, f952, f1073; +mul.f32 f1082, f953, f1075; +sub.f32 f1083, f1081, f1082; +mul.f32 f1084, f952, f1075; +fma.rn.f32 f1085, f953, f1073, f1084; +mul.f32 f1086, f947, f1085; +fma.rn.f32 f1087, f1083, f946, f1086; +mul.f32 f1088, f946, f1085; +mul.f32 f1089, f1083, f947; +sub.f32 f1090, f1089, f1088; +mul.f32 f1091, f952, f1083; +mul.f32 f1092, f953, f1085; +sub.f32 f1093, f1091, f1092; +mul.f32 f1094, f952, f1085; +fma.rn.f32 f1095, f953, f1083, f1094; +mul.f32 f1096, f951, f1095; +fma.rn.f32 f1097, f1093, f950, f1096; +mul.f32 f1098, f950, f1095; +mul.f32 f1099, f1093, f951; +sub.f32 f1100, f1099, f1098; +and.b32 r24, r16, 1020; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 49152; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f920; +st.shared.f32 [r27+1024], f957; +st.shared.f32 [r27+2048], f967; +st.shared.f32 [r27+3072], f977; +st.shared.f32 [r27+4096], f987; +st.shared.f32 [r27+5120], f997; +st.shared.f32 [r27+6144], f1007; +st.shared.f32 [r27+7168], f1017; +st.shared.f32 [r27+8192], f1027; +st.shared.f32 [r27+9216], f1037; +st.shared.f32 [r27+10240], f1047; +st.shared.f32 [r27+11264], f1057; +st.shared.f32 [r27+12288], f1067; +st.shared.f32 [r27+13312], f1077; +st.shared.f32 [r27+14336], f1087; +st.shared.f32 [r27+15360], f1097; +barrier.sync 0; +mad.lo.s32 r28, r22, -60, r27; +ld.shared.f32 f1101, [r28]; +ld.shared.f32 f1102, [r28+4096]; +ld.shared.f32 f1103, [r28+8192]; +ld.shared.f32 f1104, [r28+12288]; +ld.shared.f32 f1105, [r28+16384]; +ld.shared.f32 f1106, [r28+20480]; +ld.shared.f32 f1107, [r28+24576]; +ld.shared.f32 f1108, [r28+28672]; +ld.shared.f32 f1109, [r28+32768]; +ld.shared.f32 f1110, 
[r28+36864]; +ld.shared.f32 f1111, [r28+40960]; +ld.shared.f32 f1112, [r28+45056]; +ld.shared.f32 f1113, [r28+49152]; +ld.shared.f32 f1114, [r28+53248]; +ld.shared.f32 f1115, [r28+57344]; +ld.shared.f32 f1116, [r28+61440]; +barrier.sync 0; +st.shared.f32 [r27], f921; +st.shared.f32 [r27+1024], f960; +st.shared.f32 [r27+2048], f970; +st.shared.f32 [r27+3072], f980; +st.shared.f32 [r27+4096], f990; +st.shared.f32 [r27+5120], f1000; +st.shared.f32 [r27+6144], f1010; +st.shared.f32 [r27+7168], f1020; +st.shared.f32 [r27+8192], f1030; +st.shared.f32 [r27+9216], f1040; +st.shared.f32 [r27+10240], f1050; +st.shared.f32 [r27+11264], f1060; +st.shared.f32 [r27+12288], f1070; +st.shared.f32 [r27+13312], f1080; +st.shared.f32 [r27+14336], f1090; +st.shared.f32 [r27+15360], f1100; +barrier.sync 0; +ld.shared.f32 f1117, [r28]; +ld.shared.f32 f1118, [r28+4096]; +ld.shared.f32 f1119, [r28+8192]; +ld.shared.f32 f1120, [r28+12288]; +ld.shared.f32 f1121, [r28+16384]; +ld.shared.f32 f1122, [r28+20480]; +ld.shared.f32 f1123, [r28+24576]; +ld.shared.f32 f1124, [r28+28672]; +ld.shared.f32 f1125, [r28+32768]; +ld.shared.f32 f1126, [r28+36864]; +ld.shared.f32 f1127, [r28+40960]; +ld.shared.f32 f1128, [r28+45056]; +ld.shared.f32 f1129, [r28+49152]; +ld.shared.f32 f1130, [r28+53248]; +ld.shared.f32 f1131, [r28+57344]; +ld.shared.f32 f1132, [r28+61440]; +add.f32 f1133, f1101, f1109; +add.f32 f1134, f1117, f1125; +sub.f32 f1135, f1101, f1109; +sub.f32 f1136, f1117, f1125; +add.f32 f1137, f1105, f1113; +add.f32 f1138, f1121, f1129; +sub.f32 f1139, f1105, f1113; +sub.f32 f1140, f1121, f1129; +add.f32 f1141, f1102, f1110; +add.f32 f1142, f1118, f1126; +sub.f32 f1143, f1102, f1110; +sub.f32 f1144, f1118, f1126; +add.f32 f1145, f1106, f1114; +add.f32 f1146, f1122, f1130; +sub.f32 f1147, f1106, f1114; +sub.f32 f1148, f1122, f1130; +add.f32 f1149, f1103, f1111; +add.f32 f1150, f1119, f1127; +sub.f32 f1151, f1103, f1111; +sub.f32 f1152, f1119, f1127; +add.f32 f1153, f1107, f1115; +add.f32 f1154, 
f1123, f1131; +sub.f32 f1155, f1107, f1115; +sub.f32 f1156, f1123, f1131; +add.f32 f1157, f1104, f1112; +add.f32 f1158, f1120, f1128; +sub.f32 f1159, f1104, f1112; +sub.f32 f1160, f1120, f1128; +add.f32 f1161, f1108, f1116; +add.f32 f1162, f1124, f1132; +sub.f32 f1163, f1108, f1116; +sub.f32 f1164, f1124, f1132; +add.f32 %0, f1133, f1137; +add.f32 %1, f1134, f1138; +add.f32 %2, f1141, f1145; +add.f32 %3, f1142, f1146; +add.f32 %4, f1149, f1153; +add.f32 %5, f1150, f1154; +add.f32 %6, f1157, f1161; +add.f32 %7, f1158, f1162; +add.f32 %9, f1136, f1139; +sub.f32 %8, f1135, f1140; +add.f32 %11, f1144, f1147; +sub.f32 %10, f1143, f1148; +add.f32 %13, f1152, f1155; +sub.f32 %12, f1151, f1156; +add.f32 %15, f1160, f1163; +sub.f32 %14, f1159, f1164; +sub.f32 %16, f1133, f1137; +sub.f32 %17, f1134, f1138; +sub.f32 %18, f1141, f1145; +sub.f32 %19, f1142, f1146; +sub.f32 %20, f1149, f1153; +sub.f32 %21, f1150, f1154; +sub.f32 %22, f1157, f1161; +sub.f32 %23, f1158, f1162; +sub.f32 %25, f1136, f1139; +add.f32 %24, f1135, f1140; +sub.f32 %27, f1144, f1147; +add.f32 %26, f1143, f1148; +sub.f32 %29, f1152, f1155; +add.f32 %28, f1151, f1156; +sub.f32 %31, f1160, f1163; +add.f32 %30, f1159, f1164; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_16384), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), 
"f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1157, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1558>; +.reg .b32 r<50>; +.reg .b64 rd<14>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %36, %52; +sub.f32 f67, %36, %52; +add.f32 f1547, %37, %68; +sub.f32 f68, %37, %68; +add.f32 f69, %44, %60; +sub.f32 f71, %44, %60; +add.f32 f1545, %69, %61; +sub.f32 f72, %69, %61; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1544, f1547, f1545; +sub.f32 f76, f1547, f1545; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1543, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %40, %56; +sub.f32 f83, %40, %56; +add.f32 f1540, %71, %70; +sub.f32 f84, %71, %70; +add.f32 f85, %48, %64; +sub.f32 f87, %48, %64; +add.f32 f1538, %49, %72; +sub.f32 f88, %49, %72; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1537, f1540, f1538; +sub.f32 f92, f1540, f1538; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1536, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1536, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1534, f95, 0fBF3504F3; +mul.f32 f1535, f96, 0f3F3504F3; +sub.f32 f103, f1534, f1535; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1533, f1544, f1537; 
+sub.f32 f109, f1544, f1537; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1532, f1543, f100; +sub.f32 f113, f1543, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1531, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1530, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %54; +sub.f32 f124, %38, %54; +add.f32 f1528, %73, %55; +sub.f32 f125, %73, %55; +add.f32 f126, %46, %62; +sub.f32 f128, %46, %62; +add.f32 f1525, %74, %75; +sub.f32 f129, %74, %75; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1524, f1528, f1525; +sub.f32 f133, f1528, f1525; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1523, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %58; +sub.f32 f140, %42, %58; +add.f32 f1521, %43, %76; +sub.f32 f141, %43, %76; +add.f32 f142, %50, %66; +sub.f32 f144, %50, %66; +add.f32 f1519, %77, %67; +sub.f32 f145, %77, %67; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1518, f1521, f1519; +sub.f32 f149, f1521, f1519; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1517, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f1517, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1515, f152, 0fBF3504F3; +mul.f32 f1516, f153, 0f3F3504F3; +sub.f32 f160, f1515, f1516; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1514, f1524, f1518; +sub.f32 f166, f1524, f1518; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1513, f1523, f157; +sub.f32 f170, f1523, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1512, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1511, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1509, f167, 0f3F6C835E; +mul.f32 f1510, f1513, 0f3EC3EF15; 
+sub.f32 f181, f1509, f1510; +mul.f32 f182, f1513, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1512, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1511, 0f3F6C835E; +mul.f32 f1508, f175, 0f3EC3EF15; +sub.f32 f190, f1508, f189; +mul.f32 f191, f1511, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f1507, f169, 0fBEC3EF15; +sub.f32 f195, f1507, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1505, f173, 0fBF3504F3; +mul.f32 f1506, f174, 0f3F3504F3; +sub.f32 f200, f1505, f1506; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f1503, f177, 0fBF6C835E; +mul.f32 f1504, f178, 0f3EC3EF15; +sub.f32 f205, f1503, f1504; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1502, f1532, f183; +sub.f32 f213, f1532, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1501, f1531, f187; +sub.f32 f217, f1531, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f1500, f1530, f192; +sub.f32 f221, f1530, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f1499, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f1498, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f1497, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1496, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r30, %tid.x; +shl.b32 r7, r30, 7; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r30, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f1502, f239; +mul.f32 f244, f238, f1502; +mul.f32 f246, f239, 
f239; +mul.f32 f1495, f238, f238; +sub.f32 f247, f1495, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f1501, f249; +mul.f32 f252, f247, f1501; +mul.f32 f1493, f238, f247; +mul.f32 f1494, f239, f249; +sub.f32 f255, f1493, f1494; +mul.f32 f1492, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f1500, f257; +mul.f32 f260, f255, f1500; +mul.f32 f262, f239, f257; +mul.f32 f1491, f238, f255; +sub.f32 f263, f1491, f262; +mul.f32 f1490, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f1499, f265; +mul.f32 f268, f263, f1499; +mul.f32 f270, f239, f265; +mul.f32 f1489, f238, f263; +sub.f32 f271, f1489, f270; +mul.f32 f1488, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f1498, f273; +mul.f32 f276, f271, f1498; +mul.f32 f1486, f238, f271; +mul.f32 f1487, f239, f273; +sub.f32 f279, f1486, f1487; +mul.f32 f1485, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f1497, f281; +mul.f32 f284, f279, f1497; +mul.f32 f286, f239, f281; +mul.f32 f1484, f238, f279; +sub.f32 f287, f1484, f286; +mul.f32 f1483, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f1496, f289; +mul.f32 f292, f287, f1496; +mul.f32 f294, f239, f289; +mul.f32 f1482, f238, f287; +sub.f32 f295, f1482, f294; +mul.f32 f1481, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1480, f1533, f1514; +mul.f32 f298, f1480, f297; +mul.f32 f300, f295, f1480; +mul.f32 f1478, f238, f295; +mul.f32 f1479, f239, f297; +sub.f32 f303, f1478, f1479; +sub.f32 f1477, f106, f163; +mul.f32 f1476, f1477, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1475, f238, f303; +sub.f32 f311, f1475, f310; +mul.f32 f1474, f212, f305; +mul.f32 f312, f238, f305; 
+fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f1472, f238, f311; +mul.f32 f1473, f239, f313; +sub.f32 f319, f1472, f1473; +mul.f32 f1471, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1470, f238, f319; +sub.f32 f327, f1470, f326; +mul.f32 f1469, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1468, f238, f327; +sub.f32 f335, f1468, f334; +mul.f32 f1467, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f1465, f238, f335; +mul.f32 f1466, f239, f337; +sub.f32 f343, f1465, f1466; +mul.f32 f1464, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1463, f238, f343; +sub.f32 f351, f1463, f350; +mul.f32 f1462, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f1461, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 130944; +add.s32 r12, r9, r11; +add.f32 f357, f1533, f1514; +sub.f32 f1551, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r44, %tid.x; +shl.b32 r43, r44, 7; +shl.b32 r38, r44, 3; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f1461; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f1492; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f1490; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f1488; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f1485; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 
f369, f279, f230, f282; +sub.f32 f370, f284, f1483; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f1481; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1551, f298; +sub.f32 f374, f300, f1476; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f1474; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f1471; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f1469; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f1467; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f1464; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f1462; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r29, r44, 1023; +mad.lo.s32 r13, r29, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+8192]; +ld.shared.v2.f32 {f397, f398}, [r13+16384]; +ld.shared.v2.f32 {f401, f402}, [r13+24576]; +ld.shared.v2.f32 {f405, f406}, [r13+32768]; +ld.shared.v2.f32 {f409, f410}, [r13+40960]; +ld.shared.v2.f32 {f413, f414}, [r13+49152]; +ld.shared.v2.f32 {f417, f418}, [r13+57344]; +ld.shared.v2.f32 {f421, f422}, [r13+65536]; +ld.shared.v2.f32 {f425, f426}, [r13+73728]; +ld.shared.v2.f32 {f429, f430}, [r13+81920]; +ld.shared.v2.f32 {f433, f434}, [r13+90112]; +ld.shared.v2.f32 {f437, f438}, [r13+98304]; +ld.shared.v2.f32 {f441, f442}, [r13+106496]; +ld.shared.v2.f32 {f445, f446}, [r13+114688]; +ld.shared.v2.f32 {f449, f450}, [r13+122880]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1460, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1459, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1458, f1460, f1459; 
+sub.f32 f464, f1460, f1459; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f1457, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1456, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1455, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1454, f1456, f1455; +sub.f32 f480, f1456, f1455; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f1453, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f1453, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f1452, f483, 0fBF3504F3; +sub.f32 f491, f1452, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1451, f1458, f1454; +sub.f32 f497, f1458, f1454; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1450, f1457, f488; +sub.f32 f501, f1457, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f1449, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f1448, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1447, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1446, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1445, f1447, f1446; +sub.f32 f521, f1447, f1446; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f1444, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1443, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1442, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; 
+sub.f32 f536, f526, f530; +add.f32 f1441, f1443, f1442; +sub.f32 f537, f1443, f1442; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f1440, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f1440, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f1439, f540, 0fBF3504F3; +sub.f32 f548, f1439, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1438, f1445, f1441; +sub.f32 f554, f1445, f1441; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1437, f1444, f545; +sub.f32 f558, f1444, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f1436, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f1435, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1437, 0f3EC3EF15; +mul.f32 f1434, f555, 0f3F6C835E; +sub.f32 f569, f1434, f568; +mul.f32 f570, f1437, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f1436, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f1435, 0f3F6C835E; +mul.f32 f1433, f563, 0f3EC3EF15; +sub.f32 f578, f1433, f577; +mul.f32 f579, f1435, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f1432, f557, 0fBEC3EF15; +sub.f32 f583, f1432, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f1431, f561, 0fBF3504F3; +sub.f32 f588, f1431, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f1430, f565, 0fBF6C835E; +sub.f32 f593, f1430, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1429, f1450, f571; +sub.f32 
f601, f1450, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1428, f1449, f575; +sub.f32 f605, f1449, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f1427, f1448, f580; +sub.f32 f609, f1448, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f1426, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f1425, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 f1424, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1423, f509, f595; +sub.f32 f625, f509, f595; +bfe.u32 r15, r44, 4, 6; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f1429, f627; +mul.f32 f632, f626, f1429; +mul.f32 f634, f627, f627; +mul.f32 f1422, f626, f626; +sub.f32 f635, f1422, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f638, f1428, f637; +mul.f32 f640, f635, f1428; +mul.f32 f1420, f626, f635; +mul.f32 f1421, f627, f637; +sub.f32 f643, f1420, f1421; +mul.f32 f1419, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f1427, f645; +mul.f32 f648, f643, f1427; +mul.f32 f650, f627, f645; +mul.f32 f1418, f626, f643; +sub.f32 f651, f1418, f650; +mul.f32 f1417, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f1426, f653; +mul.f32 f656, f651, f1426; +mul.f32 f658, f627, f653; +mul.f32 f1416, f626, f651; +sub.f32 f659, f1416, f658; +mul.f32 f1415, f610, f653; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f1425, f661; +mul.f32 f664, f659, f1425; +mul.f32 f1413, f626, f659; +mul.f32 f1414, f627, f661; +sub.f32 f667, f1413, f1414; +mul.f32 f1412, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f1424, f669; +mul.f32 f672, f667, f1424; 
+mul.f32 f674, f627, f669; +mul.f32 f1411, f626, f667; +sub.f32 f675, f1411, f674; +mul.f32 f1410, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f1423, f677; +mul.f32 f680, f675, f1423; +mul.f32 f682, f627, f677; +mul.f32 f1409, f626, f675; +sub.f32 f683, f1409, f682; +mul.f32 f1408, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1407, f1451, f1438; +mul.f32 f686, f1407, f685; +mul.f32 f688, f683, f1407; +mul.f32 f1405, f626, f683; +mul.f32 f1406, f627, f685; +sub.f32 f691, f1405, f1406; +sub.f32 f1404, f494, f551; +mul.f32 f1403, f1404, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1402, f626, f691; +sub.f32 f699, f1402, f698; +mul.f32 f1401, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, f605; +mul.f32 f1399, f626, f699; +mul.f32 f1400, f627, f701; +sub.f32 f707, f1399, f1400; +mul.f32 f1398, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1397, f626, f707; +sub.f32 f715, f1397, f714; +mul.f32 f1396, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1395, f626, f715; +sub.f32 f723, f1395, f722; +mul.f32 f1394, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f1392, f626, f723; +mul.f32 f1393, f627, f725; +sub.f32 f731, f1392, f1393; +mul.f32 f1391, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1390, f626, f731; +sub.f32 f739, f1390, f738; +mul.f32 f1389, f620, 
f733; +mul.f32 f740, f626, f733; +mul.f32 f1388, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r38, 120; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r43, 129024; +add.s32 r19, r17, r18; +sub.f32 f1550, f1451, f1438; +mul.f32 f1549, f683, f1550; +add.f32 f745, f1451, f1438; +sub.f32 f1557, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r49, %tid.x; +shl.b32 r36, r49, 7; +shl.b32 r35, r49, 3; +fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f1388; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f1419; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f1417; +st.shared.v2.f32 [r19+384], {f751, f752}; +fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f1415; +st.shared.v2.f32 [r19+512], {f753, f754}; +sub.f32 f755, f664, f1412; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f1410; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f1408; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1557, f686; +sub.f32 f762, f1549, f1403; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f1401; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f1398; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f1396; +st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f1394; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f1391; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, 
f1389; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +and.b32 r28, r49, 1008; +mad.lo.s32 r20, r28, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+8192]; +ld.shared.v2.f32 {f785, f786}, [r20+16384]; +ld.shared.v2.f32 {f789, f790}, [r20+24576]; +ld.shared.v2.f32 {f793, f794}, [r20+32768]; +ld.shared.v2.f32 {f797, f798}, [r20+40960]; +ld.shared.v2.f32 {f801, f802}, [r20+49152]; +ld.shared.v2.f32 {f805, f806}, [r20+57344]; +ld.shared.v2.f32 {f809, f810}, [r20+65536]; +ld.shared.v2.f32 {f813, f814}, [r20+73728]; +ld.shared.v2.f32 {f817, f818}, [r20+81920]; +ld.shared.v2.f32 {f821, f822}, [r20+90112]; +ld.shared.v2.f32 {f825, f826}, [r20+98304]; +ld.shared.v2.f32 {f829, f830}, [r20+106496]; +ld.shared.v2.f32 {f833, f834}, [r20+114688]; +ld.shared.v2.f32 {f837, f838}, [r20+122880]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f1387, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1386, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1385, f1387, f1386; +sub.f32 f852, f1387, f1386; +sub.f32 f853, f843, f848; +add.f32 f855, f843, f848; +add.f32 f1384, f844, f847; +sub.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1383, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1382, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1381, f1383, f1382; +sub.f32 f868, f1383, f1382; +sub.f32 f869, f859, f864; +add.f32 f871, f859, f864; +add.f32 f1380, f860, f863; +sub.f32 f872, f860, f863; +mul.f32 f873, f869, 0f3F3504F3; +mul.f32 f874, f1380, 0f3F3504F3; +sub.f32 f875, f873, f874; +add.f32 f876, f873, f874; +mul.f32 f878, f872, 0f3F3504F3; +mul.f32 f1379, 
f871, 0fBF3504F3; +sub.f32 f879, f1379, f878; +mul.f32 f880, f872, 0fBF3504F3; +fma.rn.f32 f881, f871, 0f3F3504F3, f880; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1378, f1385, f1381; +sub.f32 f885, f1385, f1381; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1377, f1384, f876; +sub.f32 f889, f1384, f876; +sub.f32 f890, f851, f868; +add.f32 f892, f851, f868; +add.f32 f1376, f852, f867; +sub.f32 f893, f852, f867; +add.f32 f894, f855, f879; +sub.f32 f896, f855, f879; +add.f32 f1375, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1374, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1373, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1372, f1374, f1373; +sub.f32 f909, f1374, f1373; +sub.f32 f910, f900, f905; +add.f32 f912, f900, f905; +add.f32 f1371, f901, f904; +sub.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; +add.f32 f1370, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1369, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; +add.f32 f1368, f1370, f1369; +sub.f32 f925, f1370, f1369; +sub.f32 f926, f916, f921; +add.f32 f928, f916, f921; +add.f32 f1367, f917, f920; +sub.f32 f929, f917, f920; +mul.f32 f930, f926, 0f3F3504F3; +mul.f32 f931, f1367, 0f3F3504F3; +sub.f32 f932, f930, f931; +add.f32 f933, f930, f931; +mul.f32 f935, f929, 0f3F3504F3; +mul.f32 f1366, f928, 0fBF3504F3; +sub.f32 f936, f1366, f935; +mul.f32 f937, f929, 0fBF3504F3; +fma.rn.f32 f938, f928, 0f3F3504F3, f937; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1365, f1372, f1368; +sub.f32 f942, f1372, f1368; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1364, f1371, f933; +sub.f32 f946, f1371, f933; +sub.f32 f947, f908, f925; +add.f32 f949, 
f908, f925; +add.f32 f1363, f909, f924; +sub.f32 f950, f909, f924; +add.f32 f951, f912, f936; +sub.f32 f953, f912, f936; +add.f32 f1362, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1364, 0f3EC3EF15; +mul.f32 f1361, f943, 0f3F6C835E; +sub.f32 f957, f1361, f956; +mul.f32 f958, f1364, 0f3F6C835E; +fma.rn.f32 f959, f943, 0f3EC3EF15, f958; +mul.f32 f960, f947, 0f3F3504F3; +mul.f32 f961, f1363, 0f3F3504F3; +sub.f32 f962, f960, f961; +add.f32 f963, f960, f961; +mul.f32 f965, f1362, 0f3F6C835E; +mul.f32 f1360, f951, 0f3EC3EF15; +sub.f32 f966, f1360, f965; +mul.f32 f967, f1362, 0f3EC3EF15; +fma.rn.f32 f968, f951, 0f3F6C835E, f967; +mul.f32 f970, f946, 0f3F6C835E; +mul.f32 f1359, f945, 0fBEC3EF15; +sub.f32 f971, f1359, f970; +mul.f32 f972, f946, 0fBEC3EF15; +fma.rn.f32 f973, f945, 0f3F6C835E, f972; +mul.f32 f975, f950, 0f3F3504F3; +mul.f32 f1358, f949, 0fBF3504F3; +sub.f32 f976, f1358, f975; +mul.f32 f977, f950, 0fBF3504F3; +fma.rn.f32 f978, f949, 0f3F3504F3, f977; +mul.f32 f980, f954, 0f3EC3EF15; +mul.f32 f1357, f953, 0fBF6C835E; +sub.f32 f981, f1357, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0f3EC3EF15, f982; +add.f32 f986, f886, f957; +sub.f32 f988, f886, f957; +add.f32 f1356, f1377, f959; +sub.f32 f989, f1377, f959; +add.f32 f990, f890, f962; +sub.f32 f992, f890, f962; +add.f32 f1355, f1376, f963; +sub.f32 f993, f1376, f963; +add.f32 f994, f894, f966; +sub.f32 f996, f894, f966; +add.f32 f1354, f1375, f968; +sub.f32 f997, f1375, f968; +sub.f32 f998, f884, f942; +add.f32 f1000, f884, f942; +add.f32 f1353, f885, f941; +sub.f32 f1001, f885, f941; +add.f32 f1002, f888, f971; +sub.f32 f1004, f888, f971; +add.f32 f1352, f889, f973; +sub.f32 f1005, f889, f973; +add.f32 f1006, f892, f976; +sub.f32 f1008, f892, f976; +add.f32 f1351, f893, f978; +sub.f32 f1009, f893, f978; +add.f32 f1010, f896, f981; +sub.f32 f1012, f896, f981; +add.f32 f1350, f897, f983; +sub.f32 f1013, f897, f983; +and.b32 r21, r49, 768; +bfe.u32 r22, r49, 8, 2; +mul.wide.u32 rd9, 
r22, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1014, f1015}, [rd11]; +mul.f32 f1018, f1356, f1015; +mul.f32 f1020, f1014, f1356; +mul.f32 f1022, f1015, f1015; +mul.f32 f1349, f1014, f1014; +sub.f32 f1023, f1349, f1022; +mul.f32 f1024, f1015, f1014; +fma.rn.f32 f1025, f1015, f1014, f1024; +mul.f32 f1026, f1355, f1025; +mul.f32 f1028, f1023, f1355; +mul.f32 f1347, f1014, f1023; +mul.f32 f1348, f1015, f1025; +sub.f32 f1031, f1347, f1348; +mul.f32 f1346, f990, f1025; +mul.f32 f1032, f1014, f1025; +fma.rn.f32 f1033, f1015, f1023, f1032; +mul.f32 f1034, f1354, f1033; +mul.f32 f1036, f1031, f1354; +mul.f32 f1344, f1014, f1031; +mul.f32 f1345, f1015, f1033; +sub.f32 f1039, f1344, f1345; +mul.f32 f1343, f994, f1033; +mul.f32 f1040, f1014, f1033; +fma.rn.f32 f1041, f1015, f1031, f1040; +mul.f32 f1042, f1353, f1041; +mul.f32 f1044, f1039, f1353; +mul.f32 f1046, f1015, f1041; +mul.f32 f1342, f1014, f1039; +sub.f32 f1047, f1342, f1046; +mul.f32 f1341, f998, f1041; +mul.f32 f1048, f1014, f1041; +fma.rn.f32 f1049, f1015, f1039, f1048; +mul.f32 f1050, f1352, f1049; +mul.f32 f1052, f1047, f1352; +mul.f32 f1339, f1014, f1047; +mul.f32 f1340, f1015, f1049; +sub.f32 f1055, f1339, f1340; +mul.f32 f1338, f1002, f1049; +mul.f32 f1056, f1014, f1049; +fma.rn.f32 f1057, f1015, f1047, f1056; +mul.f32 f1058, f1351, f1057; +mul.f32 f1060, f1055, f1351; +mul.f32 f1062, f1015, f1057; +mul.f32 f1337, f1014, f1055; +sub.f32 f1063, f1337, f1062; +mul.f32 f1336, f1006, f1057; +mul.f32 f1064, f1014, f1057; +fma.rn.f32 f1065, f1015, f1055, f1064; +mul.f32 f1066, f1350, f1065; +mul.f32 f1068, f1063, f1350; +mul.f32 f1070, f1015, f1065; +mul.f32 f1335, f1014, f1063; +sub.f32 f1071, f1335, f1070; +mul.f32 f1334, f1010, f1065; +mul.f32 f1072, f1014, f1065; +fma.rn.f32 f1073, f1015, f1063, f1072; +sub.f32 f1333, f1378, f1365; +mul.f32 f1074, f1333, f1073; +mul.f32 f1076, f1071, f1333; +sub.f32 f1332, f882, f939; +mul.f32 f1330, f1014, f1071; +mul.f32 f1331, f1015, f1073; +sub.f32 
f1079, f1330, f1331; +mul.f32 f1329, f1332, f1073; +mul.f32 f1080, f1014, f1073; +fma.rn.f32 f1081, f1015, f1071, f1080; +mul.f32 f1082, f989, f1081; +mul.f32 f1084, f1079, f989; +mul.f32 f1086, f1015, f1081; +mul.f32 f1328, f1014, f1079; +sub.f32 f1087, f1328, f1086; +mul.f32 f1327, f988, f1081; +mul.f32 f1088, f1014, f1081; +fma.rn.f32 f1089, f1015, f1079, f1088; +mul.f32 f1090, f993, f1089; +mul.f32 f1092, f1087, f993; +mul.f32 f1325, f1014, f1087; +mul.f32 f1326, f1015, f1089; +sub.f32 f1095, f1325, f1326; +mul.f32 f1324, f992, f1089; +mul.f32 f1096, f1014, f1089; +fma.rn.f32 f1097, f1015, f1087, f1096; +mul.f32 f1098, f997, f1097; +mul.f32 f1100, f1095, f997; +mul.f32 f1322, f1014, f1095; +mul.f32 f1323, f1015, f1097; +sub.f32 f1103, f1322, f1323; +mul.f32 f1321, f996, f1097; +mul.f32 f1104, f1014, f1097; +fma.rn.f32 f1105, f1015, f1095, f1104; +mul.f32 f1106, f1001, f1105; +mul.f32 f1108, f1103, f1001; +mul.f32 f1110, f1015, f1105; +mul.f32 f1320, f1014, f1103; +sub.f32 f1111, f1320, f1110; +mul.f32 f1319, f1000, f1105; +mul.f32 f1112, f1014, f1105; +fma.rn.f32 f1113, f1015, f1103, f1112; +mul.f32 f1114, f1005, f1113; +mul.f32 f1116, f1111, f1005; +mul.f32 f1317, f1014, f1111; +mul.f32 f1318, f1015, f1113; +sub.f32 f1119, f1317, f1318; +mul.f32 f1316, f1004, f1113; +mul.f32 f1120, f1014, f1113; +fma.rn.f32 f1121, f1015, f1111, f1120; +mul.f32 f1122, f1009, f1121; +mul.f32 f1124, f1119, f1009; +mul.f32 f1126, f1015, f1121; +mul.f32 f1315, f1014, f1119; +sub.f32 f1127, f1315, f1126; +mul.f32 f1314, f1008, f1121; +mul.f32 f1128, f1014, f1121; +mul.f32 f1313, f986, f1015; +fma.rn.f32 f1129, f1015, f1119, f1128; +mul.f32 f1130, f1013, f1129; +mul.f32 f1131, f1012, f1129; +mul.f32 f1132, f1127, f1013; +and.b32 r23, r35, 2040; +add.s32 r24, r9, r23; +sub.f32 f1553, f1378, f1365; +mul.f32 f1552, f1071, f1553; +barrier.sync 0; +and.b32 r25, r36, 98304; +add.s32 r26, r24, r25; +sub.f32 f1555, f1378, f1365; +mul.f32 f1554, f1071, f1555; +add.f32 f1133, f1378, f1365; 
+sub.f32 f1556, f882, f939; +add.f32 f1134, f882, f939; +st.shared.v2.f32 [r26], {f1134, f1133}; +mov.u32 r41, %tid.x; +and.b32 r40, r41, 768; +fma.rn.f32 f1135, f1014, f986, f1018; +sub.f32 f1136, f1020, f1313; +st.shared.v2.f32 [r26+2048], {f1135, f1136}; +fma.rn.f32 f1137, f1023, f990, f1026; +sub.f32 f1138, f1028, f1346; +st.shared.v2.f32 [r26+4096], {f1137, f1138}; +fma.rn.f32 f1139, f1031, f994, f1034; +sub.f32 f1140, f1036, f1343; +st.shared.v2.f32 [r26+6144], {f1139, f1140}; +fma.rn.f32 f1141, f1039, f998, f1042; +sub.f32 f1142, f1044, f1341; +st.shared.v2.f32 [r26+8192], {f1141, f1142}; +sub.f32 f1143, f1052, f1338; +fma.rn.f32 f1144, f1047, f1002, f1050; +st.shared.v2.f32 [r26+10240], {f1144, f1143}; +fma.rn.f32 f1145, f1055, f1006, f1058; +sub.f32 f1146, f1060, f1336; +st.shared.v2.f32 [r26+12288], {f1145, f1146}; +fma.rn.f32 f1147, f1063, f1010, f1066; +sub.f32 f1148, f1068, f1334; +st.shared.v2.f32 [r26+14336], {f1147, f1148}; +fma.rn.f32 f1149, f1071, f1556, f1074; +sub.f32 f1150, f1554, f1329; +st.shared.v2.f32 [r26+16384], {f1149, f1150}; +fma.rn.f32 f1151, f1079, f988, f1082; +sub.f32 f1152, f1084, f1327; +st.shared.v2.f32 [r26+18432], {f1151, f1152}; +fma.rn.f32 f1153, f1087, f992, f1090; +sub.f32 f1154, f1092, f1324; +st.shared.v2.f32 [r26+20480], {f1153, f1154}; +fma.rn.f32 f1155, f1095, f996, f1098; +sub.f32 f1156, f1100, f1321; +st.shared.v2.f32 [r26+22528], {f1155, f1156}; +fma.rn.f32 f1157, f1103, f1000, f1106; +sub.f32 f1158, f1108, f1319; +st.shared.v2.f32 [r26+24576], {f1157, f1158}; +fma.rn.f32 f1159, f1111, f1004, f1114; +sub.f32 f1160, f1116, f1316; +st.shared.v2.f32 [r26+26624], {f1159, f1160}; +fma.rn.f32 f1161, f1119, f1008, f1122; +sub.f32 f1162, f1124, f1314; +st.shared.v2.f32 [r26+28672], {f1161, f1162}; +fma.rn.f32 f1163, f1127, f1012, f1130; +sub.f32 f1164, f1132, f1131; +st.shared.v2.f32 [r26+30720], {f1163, f1164}; +barrier.sync 0; +mad.lo.s32 r27, r40, -120, r26; +ld.shared.v2.f32 {f1165, f1166}, [r27]; +ld.shared.v2.f32 
{f1169, f1170}, [r27+8192]; +ld.shared.v2.f32 {f1173, f1174}, [r27+16384]; +ld.shared.v2.f32 {f1177, f1178}, [r27+24576]; +ld.shared.v2.f32 {f1181, f1182}, [r27+32768]; +ld.shared.v2.f32 {f1185, f1186}, [r27+40960]; +ld.shared.v2.f32 {f1189, f1190}, [r27+49152]; +ld.shared.v2.f32 {f1193, f1194}, [r27+57344]; +ld.shared.v2.f32 {f1197, f1198}, [r27+65536]; +ld.shared.v2.f32 {f1201, f1202}, [r27+73728]; +ld.shared.v2.f32 {f1205, f1206}, [r27+81920]; +ld.shared.v2.f32 {f1209, f1210}, [r27+90112]; +ld.shared.v2.f32 {f1213, f1214}, [r27+98304]; +ld.shared.v2.f32 {f1217, f1218}, [r27+106496]; +ld.shared.v2.f32 {f1221, f1222}, [r27+114688]; +ld.shared.v2.f32 {f1225, f1226}, [r27+122880]; +add.f32 f1229, f1165, f1197; +sub.f32 f1231, f1165, f1197; +add.f32 f1312, f1166, f1198; +sub.f32 f1232, f1166, f1198; +add.f32 f1233, f1181, f1213; +sub.f32 f1235, f1181, f1213; +add.f32 f1311, f1182, f1214; +sub.f32 f1236, f1182, f1214; +add.f32 f1237, f1169, f1201; +sub.f32 f1239, f1169, f1201; +add.f32 f1310, f1170, f1202; +sub.f32 f1240, f1170, f1202; +add.f32 f1241, f1185, f1217; +sub.f32 f1243, f1185, f1217; +add.f32 f1309, f1186, f1218; +sub.f32 f1244, f1186, f1218; +add.f32 f1245, f1173, f1205; +sub.f32 f1247, f1173, f1205; +add.f32 f1308, f1174, f1206; +sub.f32 f1248, f1174, f1206; +add.f32 f1249, f1189, f1221; +sub.f32 f1251, f1189, f1221; +add.f32 f1307, f1190, f1222; +sub.f32 f1252, f1190, f1222; +add.f32 f1253, f1177, f1209; +sub.f32 f1255, f1177, f1209; +add.f32 f1306, f1178, f1210; +sub.f32 f1256, f1178, f1210; +add.f32 f1257, f1193, f1225; +sub.f32 f1259, f1193, f1225; +add.f32 f1305, f1194, f1226; +sub.f32 f1260, f1194, f1226; +add.f32 %1, f1312, f1311; +add.f32 %0, f1229, f1233; +add.f32 %2, f1237, f1241; +add.f32 %3, f1310, f1309; +add.f32 %4, f1245, f1249; +add.f32 %5, f1308, f1307; +add.f32 %6, f1253, f1257; +add.f32 %7, f1306, f1305; +add.f32 %9, f1232, f1235; +sub.f32 %8, f1231, f1236; +add.f32 %11, f1240, f1243; +sub.f32 %10, f1239, f1244; +add.f32 %13, f1248, 
f1251; +sub.f32 %12, f1247, f1252; +sub.f32 %14, f1255, f1260; +add.f32 %15, f1256, f1259; +sub.f32 %17, f1312, f1311; +sub.f32 %16, f1229, f1233; +sub.f32 %19, f1310, f1309; +sub.f32 %18, f1237, f1241; +sub.f32 %21, f1308, f1307; +sub.f32 %20, f1245, f1249; +sub.f32 %23, f1306, f1305; +sub.f32 %22, f1253, f1257; +sub.f32 %25, f1232, f1235; +add.f32 %24, f1231, f1236; +sub.f32 %27, f1240, f1243; +add.f32 %26, f1239, f1244; +sub.f32 %29, f1248, f1251; +add.f32 %28, f1247, f1252; +sub.f32 %31, f1256, f1259; +add.f32 %30, f1255, f1260; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_16384), "l"(lut_sp_16_1024), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<318, float, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2632>; +.reg .b32 r<30>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2630, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2628, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2627, f2630, f2628; +sub.f32 f140, f2630, f2628; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2626, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2623, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2621, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2620, f2623, f2621; +sub.f32 f156, f2623, f2621; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2619, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2619, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2617, f159, 0fBF3504F3; +mul.f32 f2618, f160, 0f3F3504F3; +sub.f32 f167, f2617, f2618; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2616, f2627, f2620; +sub.f32 f173, f2627, f2620; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2615, f2626, f164; +sub.f32 f177, f2626, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2614, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2613, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2611, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2608, %138, %137; +sub.f32 f193, %138, %137; +add.f32 
f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2607, f2611, f2608; +sub.f32 f197, f2611, f2608; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2606, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2604, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2602, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2601, f2604, f2602; +sub.f32 f213, f2604, f2602; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2600, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2600, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2598, f216, 0fBF3504F3; +mul.f32 f2599, f217, 0f3F3504F3; +sub.f32 f224, f2598, f2599; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2597, f2607, f2601; +sub.f32 f230, f2607, f2601; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2596, f2606, f221; +sub.f32 f234, f2606, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2595, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2594, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2592, f231, 0f3F6C835E; +mul.f32 f2593, f2596, 0f3EC3EF15; +sub.f32 f245, f2592, f2593; +mul.f32 f246, f2596, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2595, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2594, 0f3F6C835E; +mul.f32 f2591, f239, 0f3EC3EF15; +sub.f32 f254, f2591, f253; +mul.f32 f255, f2594, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2590, f233, 0fBEC3EF15; +sub.f32 f259, f2590, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, 
f233, 0f3F6C835E, f260; +mul.f32 f2588, f237, 0fBF3504F3; +mul.f32 f2589, f238, 0f3F3504F3; +sub.f32 f264, f2588, f2589; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2586, f241, 0fBF6C835E; +mul.f32 f2587, f242, 0f3EC3EF15; +sub.f32 f269, f2586, f2587; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2585, f2616, f2597; +sub.f32 f275, f2616, f2597; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2584, f2615, f247; +sub.f32 f279, f2615, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2583, f2614, f251; +sub.f32 f283, f2614, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2582, f2613, f256; +sub.f32 f287, f2613, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2581, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2580, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2579, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2578, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2575, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2573, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2572, f2575, f2573; +sub.f32 f315, f2575, f2573; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2571, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2569, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2566, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2565, f2569, f2566; +sub.f32 f331, 
f2569, f2566; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2564, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2564, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2563, f334, 0fBF3504F3; +sub.f32 f342, f2563, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2562, f2572, f2565; +sub.f32 f348, f2572, f2565; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2561, f2571, f339; +sub.f32 f352, f2571, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2560, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2559, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2557, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2555, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2554, f2557, f2555; +sub.f32 f372, f2557, f2555; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2553, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2550, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2549, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2548, f2550, f2549; +sub.f32 f388, f2550, f2549; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2547, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2547, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2545, f391, 0fBF3504F3; +mul.f32 f2546, f392, 0f3F3504F3; +sub.f32 f399, f2545, f2546; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 
0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2544, f2554, f2548; +sub.f32 f405, f2554, f2548; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2543, f2553, f396; +sub.f32 f409, f2553, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2542, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2541, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2543, 0f3EC3EF15; +mul.f32 f2540, f406, 0f3F6C835E; +sub.f32 f420, f2540, f419; +mul.f32 f421, f2543, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2542, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2541, 0f3F6C835E; +mul.f32 f2539, f414, 0f3EC3EF15; +sub.f32 f429, f2539, f428; +mul.f32 f430, f2541, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2538, f408, 0fBEC3EF15; +sub.f32 f434, f2538, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2537, f412, 0fBF3504F3; +sub.f32 f439, f2537, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2536, f416, 0fBF6C835E; +sub.f32 f444, f2536, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2535, f2562, f2544; +sub.f32 f450, f2562, f2544; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2534, f2561, f422; +sub.f32 f454, f2561, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2533, f2560, f426; +sub.f32 f458, f2560, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2532, f2559, f431; +sub.f32 f462, f2559, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2531, f348, f404; +sub.f32 f466, f348, f404; +add.f32 
f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2530, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2529, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2528, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2534, 0f3E47C5C2; +mul.f32 f2527, f451, 0f3F7B14BE; +sub.f32 f481, f2527, f480; +mul.f32 f482, f2534, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2533, 0f3EC3EF15; +mul.f32 f2526, f455, 0f3F6C835E; +sub.f32 f486, f2526, f485; +mul.f32 f487, f2533, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2532, 0f3F0E39DA; +mul.f32 f2525, f459, 0f3F54DB31; +sub.f32 f491, f2525, f490; +mul.f32 f492, f2532, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2531, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2530, 0f3F54DB31; +mul.f32 f2524, f467, 0f3F0E39DA; +sub.f32 f500, f2524, f499; +mul.f32 f501, f2530, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2529, 0f3F6C835E; +mul.f32 f2523, f471, 0f3EC3EF15; +sub.f32 f505, f2523, f504; +mul.f32 f506, f2529, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2528, 0f3F7B14BE; +mul.f32 f2522, f475, 0f3E47C5C2; +sub.f32 f510, f2522, f509; +mul.f32 f511, f2528, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2521, f453, 0fBE47C5C2; +sub.f32 f515, f2521, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2519, f457, 0fBEC3EF15; +mul.f32 f2520, f458, 0f3F6C835E; +sub.f32 f520, f2519, f2520; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2517, f461, 0fBF0E39DA; +mul.f32 f2518, f462, 0f3F54DB31; +sub.f32 f525, f2517, f2518; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 
f2515, f465, 0fBF3504F3; +mul.f32 f2516, f466, 0f3F3504F3; +sub.f32 f530, f2515, f2516; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2513, f469, 0fBF54DB31; +mul.f32 f2514, f470, 0f3F0E39DA; +sub.f32 f535, f2513, f2514; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2512, f473, 0fBF6C835E; +sub.f32 f540, f2512, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2511, f477, 0fBF7B14BE; +sub.f32 f545, f2511, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2510, f2585, f2535; +sub.f32 f551, f2585, f2535; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2509, f2584, f483; +sub.f32 f555, f2584, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2508, f2583, f488; +sub.f32 f559, f2583, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2507, f2582, f493; +sub.f32 f563, f2582, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2506, f2581, f497; +sub.f32 f567, f2581, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f2505, f2580, f502; +sub.f32 f571, f2580, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f2504, f2579, f507; +sub.f32 f575, f2579, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f2503, f2578, f512; +sub.f32 f579, f2578, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f2502, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f2501, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f2500, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f2499, f287, f527; +sub.f32 f595, f287, 
f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f2498, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2497, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2496, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2495, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f2509, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f2509; +sub.f32 f620, f619, f618; +mul.f32 f2493, f612, f612; +mul.f32 f2494, f613, f613; +sub.f32 f623, f2493, f2494; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f2508, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f2508; +sub.f32 f630, f629, f628; +mul.f32 f2491, f612, f623; +mul.f32 f2492, f613, f625; +sub.f32 f633, f2491, f2492; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f2507, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f2507; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f2490, f612, f633; +sub.f32 f643, f2490, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f2506, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f2506; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f2489, f612, f643; +sub.f32 f653, f2489, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f2505, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f2505; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f2488, f612, f653; +sub.f32 
f663, f2488, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f2504, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f2504; +sub.f32 f670, f669, f668; +mul.f32 f2486, f612, f663; +mul.f32 f2487, f613, f665; +sub.f32 f673, f2486, f2487; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f2503, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f2503; +sub.f32 f680, f679, f678; +mul.f32 f2484, f612, f673; +mul.f32 f2485, f613, f675; +sub.f32 f683, f2484, f2485; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f2502, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f2502; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f2483, f612, f683; +sub.f32 f693, f2483, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f2501, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f2501; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f2482, f612, f693; +sub.f32 f703, f2482, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f2500, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f2500; +sub.f32 f710, f709, f708; +mul.f32 f2480, f612, f703; +mul.f32 f2481, f613, f705; +sub.f32 f713, f2480, f2481; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f2499, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f2499; +sub.f32 f720, f719, f718; +mul.f32 f2478, f612, f713; +mul.f32 f2479, f613, f715; +sub.f32 f723, f2478, f2479; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f2498, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f2498; +sub.f32 f730, f729, f728; +mul.f32 
f732, f613, f725; +mul.f32 f2477, f612, f723; +sub.f32 f733, f2477, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f2497, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f2497; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f2476, f612, f733; +sub.f32 f743, f2476, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f2496, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f2496; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f2475, f612, f743; +sub.f32 f753, f2475, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f2495, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f2495; +sub.f32 f760, f759, f758; +mul.f32 f2473, f612, f753; +mul.f32 f2474, f613, f755; +sub.f32 f763, f2473, f2474; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f2471, f612, f763; +mul.f32 f2472, f613, f765; +sub.f32 f773, f2471, f2472; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f2470, f612, f773; +sub.f32 f783, f2470, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f2469, f612, f783; +sub.f32 f793, f2469, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, 
f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f2468, f612, f793; +sub.f32 f803, f2468, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f2466, f612, f803; +mul.f32 f2467, f613, f805; +sub.f32 f813, f2466, f2467; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f2464, f612, f813; +mul.f32 f2465, f613, f815; +sub.f32 f823, f2464, f2465; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f2463, f612, f823; +sub.f32 f833, f2463, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f2462, f612, f833; +sub.f32 f843, f2462, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f2460, f612, f843; +mul.f32 f2461, f613, f845; +sub.f32 f853, f2460, f2461; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f2458, f612, f853; +mul.f32 f2459, f613, f855; +sub.f32 f863, f2458, f2459; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 
f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f2457, f612, f863; +sub.f32 f873, f2457, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f2456, f612, f873; +sub.f32 f883, f2456, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f2455, f612, f883; +sub.f32 f893, f2455, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f2453, f612, f893; +mul.f32 f2454, f613, f895; +sub.f32 f903, f2453, f2454; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f2451, f612, f903; +mul.f32 f2452, f613, f905; +sub.f32 f913, f2451, f2452; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +shl.b32 r8, r24, 7; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65408; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 
[r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +and.b32 r23, r24, 511; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+2048]; +ld.shared.f32 f923, [r13+4096]; +ld.shared.f32 f924, [r13+6144]; +ld.shared.f32 f925, [r13+8192]; +ld.shared.f32 f926, [r13+10240]; +ld.shared.f32 f927, [r13+12288]; +ld.shared.f32 f928, [r13+14336]; +ld.shared.f32 f929, [r13+16384]; +ld.shared.f32 f930, [r13+18432]; +ld.shared.f32 f931, [r13+20480]; +ld.shared.f32 f932, [r13+22528]; +ld.shared.f32 f933, [r13+24576]; +ld.shared.f32 f934, [r13+26624]; +ld.shared.f32 f935, [r13+28672]; +ld.shared.f32 f936, [r13+30720]; +ld.shared.f32 f937, [r13+32768]; +ld.shared.f32 f938, [r13+34816]; +ld.shared.f32 f939, [r13+36864]; +ld.shared.f32 f940, [r13+38912]; +ld.shared.f32 f941, [r13+40960]; +ld.shared.f32 f942, [r13+43008]; +ld.shared.f32 f943, [r13+45056]; +ld.shared.f32 f944, [r13+47104]; +ld.shared.f32 f945, [r13+49152]; +ld.shared.f32 f946, [r13+51200]; +ld.shared.f32 f947, [r13+53248]; +ld.shared.f32 f948, [r13+55296]; +ld.shared.f32 f949, [r13+57344]; +ld.shared.f32 f950, [r13+59392]; +ld.shared.f32 f951, [r13+61440]; +ld.shared.f32 f952, [r13+63488]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2510, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+2048]; +ld.shared.f32 f955, [r13+4096]; +ld.shared.f32 f956, [r13+6144]; +ld.shared.f32 f957, [r13+8192]; +ld.shared.f32 f958, [r13+10240]; +ld.shared.f32 f959, [r13+12288]; +ld.shared.f32 f960, [r13+14336]; +ld.shared.f32 f961, 
[r13+16384]; +ld.shared.f32 f962, [r13+18432]; +ld.shared.f32 f963, [r13+20480]; +ld.shared.f32 f964, [r13+22528]; +ld.shared.f32 f965, [r13+24576]; +ld.shared.f32 f966, [r13+26624]; +ld.shared.f32 f967, [r13+28672]; +ld.shared.f32 f968, [r13+30720]; +ld.shared.f32 f969, [r13+32768]; +ld.shared.f32 f970, [r13+34816]; +ld.shared.f32 f971, [r13+36864]; +ld.shared.f32 f972, [r13+38912]; +ld.shared.f32 f973, [r13+40960]; +ld.shared.f32 f974, [r13+43008]; +ld.shared.f32 f975, [r13+45056]; +ld.shared.f32 f976, [r13+47104]; +ld.shared.f32 f977, [r13+49152]; +ld.shared.f32 f978, [r13+51200]; +ld.shared.f32 f979, [r13+53248]; +ld.shared.f32 f980, [r13+55296]; +ld.shared.f32 f981, [r13+57344]; +ld.shared.f32 f982, [r13+59392]; +ld.shared.f32 f983, [r13+61440]; +ld.shared.f32 f984, [r13+63488]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2450, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2449, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2448, f2450, f2449; +sub.f32 f996, f2450, f2449; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f2447, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2446, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2445, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2444, f2446, f2445; +sub.f32 f1012, f2446, f2445; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f2443, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f2443, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f2441, f1015, 0fBF3504F3; +mul.f32 f2442, f1016, 0f3F3504F3; +sub.f32 f1023, f2441, f2442; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 
0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2440, f2448, f2444; +sub.f32 f1029, f2448, f2444; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2439, f2447, f1020; +sub.f32 f1033, f2447, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f2438, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f2437, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2436, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2435, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2434, f2436, f2435; +sub.f32 f1053, f2436, f2435; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f2433, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2432, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2431, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2430, f2432, f2431; +sub.f32 f1069, f2432, f2431; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f2429, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f2429, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f2427, f1072, 0fBF3504F3; +mul.f32 f2428, f1073, 0f3F3504F3; +sub.f32 f1080, f2427, f2428; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2426, f2434, f2430; +sub.f32 f1086, f2434, f2430; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2425, f2433, f1077; +sub.f32 f1090, f2433, f1077; +sub.f32 f1091, f1052, f1069; 
+add.f32 f1093, f1052, f1069; +add.f32 f2424, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f2423, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2421, f1087, 0f3F6C835E; +mul.f32 f2422, f2425, 0f3EC3EF15; +sub.f32 f1101, f2421, f2422; +mul.f32 f1102, f2425, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f2424, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f2419, f1095, 0f3EC3EF15; +mul.f32 f2420, f2423, 0f3F6C835E; +sub.f32 f1110, f2419, f2420; +mul.f32 f1111, f2423, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f2417, f1089, 0fBEC3EF15; +mul.f32 f2418, f1090, 0f3F6C835E; +sub.f32 f1115, f2417, f2418; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f2415, f1093, 0fBF3504F3; +mul.f32 f2416, f1094, 0f3F3504F3; +sub.f32 f1120, f2415, f2416; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f2413, f1097, 0fBF6C835E; +mul.f32 f2414, f1098, 0f3EC3EF15; +sub.f32 f1125, f2413, f2414; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2412, f2440, f2426; +sub.f32 f1131, f2440, f2426; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2411, f2439, f1103; +sub.f32 f1135, f2439, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2410, f2438, f1107; +sub.f32 f1139, f2438, f1107; +add.f32 f1140, f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f2409, f2437, f1112; +sub.f32 f1143, f2437, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f2408, f1029, f1085; +sub.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f2407, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 
f1154, f1036, f1120; +add.f32 f2406, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2405, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2404, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2403, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2402, f2404, f2403; +sub.f32 f1171, f2404, f2403; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f2401, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2400, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2399, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2398, f2400, f2399; +sub.f32 f1187, f2400, f2399; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f2397, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f2397, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f2395, f1190, 0fBF3504F3; +mul.f32 f2396, f1191, 0f3F3504F3; +sub.f32 f1198, f2395, f2396; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2394, f2402, f2398; +sub.f32 f1204, f2402, f2398; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2393, f2401, f1195; +sub.f32 f1208, f2401, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f2392, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; +sub.f32 f1215, f1174, f1198; +add.f32 f2391, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2390, f956, 
f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2389, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2388, f2390, f2389; +sub.f32 f1228, f2390, f2389; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f2387, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2386, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2385, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2384, f2386, f2385; +sub.f32 f1244, f2386, f2385; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f2383, f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f2383, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f2381, f1247, 0fBF3504F3; +mul.f32 f2382, f1248, 0f3F3504F3; +sub.f32 f1255, f2381, f2382; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2380, f2388, f2384; +sub.f32 f1261, f2388, f2384; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2379, f2387, f1252; +sub.f32 f1265, f2387, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f2378, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f2377, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2375, f1262, 0f3F6C835E; +mul.f32 f2376, f2379, 0f3EC3EF15; +sub.f32 f1276, f2375, f2376; +mul.f32 f1277, f2379, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f2378, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f2377, 0f3F6C835E; +mul.f32 
f2374, f1270, 0f3EC3EF15; +sub.f32 f1285, f2374, f1284; +mul.f32 f1286, f2377, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f2373, f1264, 0fBEC3EF15; +sub.f32 f1290, f2373, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f2371, f1268, 0fBF3504F3; +mul.f32 f2372, f1269, 0f3F3504F3; +sub.f32 f1295, f2371, f2372; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296; +mul.f32 f2369, f1272, 0fBF6C835E; +mul.f32 f2370, f1273, 0f3EC3EF15; +sub.f32 f1300, f2369, f2370; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2368, f2394, f2380; +sub.f32 f1306, f2394, f2380; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2367, f2393, f1278; +sub.f32 f1310, f2393, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2366, f2392, f1282; +sub.f32 f1314, f2392, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f2365, f2391, f1287; +sub.f32 f1318, f2391, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f2364, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f2363, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f2362, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2361, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2367, 0f3E47C5C2; +mul.f32 f2360, f1307, 0f3F7B14BE; +sub.f32 f1337, f2360, f1336; +mul.f32 f1338, f2367, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, f2366, 0f3EC3EF15; +mul.f32 f2359, f1311, 0f3F6C835E; +sub.f32 f1342, f2359, f1341; +mul.f32 f1343, f2366, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; 
+mul.f32 f2357, f1315, 0f3F54DB31; +mul.f32 f2358, f2365, 0f3F0E39DA; +sub.f32 f1347, f2357, f2358; +mul.f32 f1348, f2365, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, f1319, 0f3F3504F3; +mul.f32 f1351, f2364, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f2363, 0f3F54DB31; +mul.f32 f2356, f1323, 0f3F0E39DA; +sub.f32 f1356, f2356, f1355; +mul.f32 f1357, f2363, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357; +mul.f32 f1360, f2362, 0f3F6C835E; +mul.f32 f2355, f1327, 0f3EC3EF15; +sub.f32 f1361, f2355, f1360; +mul.f32 f1362, f2362, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f2353, f1331, 0f3E47C5C2; +mul.f32 f2354, f2361, 0f3F7B14BE; +sub.f32 f1366, f2353, f2354; +mul.f32 f1367, f2361, 0f3E47C5C2; +fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f2351, f1309, 0fBE47C5C2; +mul.f32 f2352, f1310, 0f3F7B14BE; +sub.f32 f1371, f2351, f2352; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f2349, f1313, 0fBEC3EF15; +mul.f32 f2350, f1314, 0f3F6C835E; +sub.f32 f1376, f2349, f2350; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f2347, f1317, 0fBF0E39DA; +mul.f32 f2348, f1318, 0f3F54DB31; +sub.f32 f1381, f2347, f2348; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f2346, f1321, 0fBF3504F3; +sub.f32 f1386, f2346, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 f2345, f1325, 0fBF54DB31; +sub.f32 f1391, f2345, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f2344, f1329, 0fBF6C835E; +sub.f32 f1396, f2344, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f2343, f1333, 0fBF7B14BE; 
+sub.f32 f1401, f2343, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2342, f2412, f2368; +sub.f32 f1407, f2412, f2368; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2341, f2411, f1339; +sub.f32 f1411, f2411, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2340, f2410, f1344; +sub.f32 f1415, f2410, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2339, f2409, f1349; +sub.f32 f1419, f2409, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2338, f2408, f1353; +sub.f32 f1423, f2408, f1353; +add.f32 f1424, f1148, f1356; +sub.f32 f1426, f1148, f1356; +add.f32 f2337, f2407, f1358; +sub.f32 f1427, f2407, f1358; +add.f32 f1428, f1152, f1361; +sub.f32 f1430, f1152, f1361; +add.f32 f2336, f2406, f1363; +sub.f32 f1431, f2406, f1363; +add.f32 f1432, f1156, f1366; +sub.f32 f1434, f1156, f1366; +add.f32 f2335, f2405, f1368; +sub.f32 f1435, f2405, f1368; +sub.f32 f1436, f1130, f1306; +add.f32 f1438, f1130, f1306; +add.f32 f2334, f1131, f1305; +sub.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1371; +sub.f32 f1442, f1134, f1371; +add.f32 f2333, f1135, f1373; +sub.f32 f1443, f1135, f1373; +add.f32 f1444, f1138, f1376; +sub.f32 f1446, f1138, f1376; +add.f32 f2332, f1139, f1378; +sub.f32 f1447, f1139, f1378; +add.f32 f1448, f1142, f1381; +sub.f32 f1450, f1142, f1381; +add.f32 f2331, f1143, f1383; +sub.f32 f1451, f1143, f1383; +add.f32 f1452, f1146, f1386; +sub.f32 f1454, f1146, f1386; +add.f32 f2330, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2329, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2328, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2327, f1159, f1403; 
+sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r24, 5, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1472, f2341, f1469; +fma.rn.f32 f1473, f1468, f1408, f1472; +mul.f32 f1474, f1408, f1469; +mul.f32 f1475, f1468, f2341; +sub.f32 f1476, f1475, f1474; +mul.f32 f1478, f1469, f1469; +mul.f32 f2326, f1468, f1468; +sub.f32 f1479, f2326, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1482, f2340, f1481; +fma.rn.f32 f1483, f1479, f1412, f1482; +mul.f32 f1484, f1412, f1481; +mul.f32 f1485, f1479, f2340; +sub.f32 f1486, f1485, f1484; +mul.f32 f2324, f1468, f1479; +mul.f32 f2325, f1469, f1481; +sub.f32 f1489, f2324, f2325; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f1492, f2339, f1491; +fma.rn.f32 f1493, f1489, f1416, f1492; +mul.f32 f1494, f1416, f1491; +mul.f32 f1495, f1489, f2339; +sub.f32 f1496, f1495, f1494; +mul.f32 f2322, f1468, f1489; +mul.f32 f2323, f1469, f1491; +sub.f32 f1499, f2322, f2323; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f1502, f2338, f1501; +fma.rn.f32 f1503, f1499, f1420, f1502; +mul.f32 f1504, f1420, f1501; +mul.f32 f1505, f1499, f2338; +sub.f32 f1506, f1505, f1504; +mul.f32 f1508, f1469, f1501; +mul.f32 f2321, f1468, f1499; +sub.f32 f1509, f2321, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1512, f2337, f1511; +fma.rn.f32 f1513, f1509, f1424, f1512; +mul.f32 f1514, f1424, f1511; +mul.f32 f1515, f1509, f2337; +sub.f32 f1516, f1515, f1514; +mul.f32 f1518, f1469, f1511; +mul.f32 f2320, f1468, f1509; +sub.f32 f1519, f2320, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1522, f2336, f1521; +fma.rn.f32 f1523, f1519, f1428, f1522; +mul.f32 f1524, f1428, f1521; +mul.f32 f1525, f1519, f2336; +sub.f32 f1526, f1525, f1524; +mul.f32 f1528, f1469, f1521; +mul.f32 f2319, f1468, f1519; +sub.f32 
f1529, f2319, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1532, f2335, f1531; +fma.rn.f32 f1533, f1529, f1432, f1532; +mul.f32 f1534, f1432, f1531; +mul.f32 f1535, f1529, f2335; +sub.f32 f1536, f1535, f1534; +mul.f32 f2317, f1468, f1529; +mul.f32 f2318, f1469, f1531; +sub.f32 f1539, f2317, f2318; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1542, f2334, f1541; +fma.rn.f32 f1543, f1539, f1436, f1542; +mul.f32 f1544, f1436, f1541; +mul.f32 f1545, f1539, f2334; +sub.f32 f1546, f1545, f1544; +mul.f32 f2315, f1468, f1539; +mul.f32 f2316, f1469, f1541; +sub.f32 f1549, f2315, f2316; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1552, f2333, f1551; +fma.rn.f32 f1553, f1549, f1440, f1552; +mul.f32 f1554, f1440, f1551; +mul.f32 f1555, f1549, f2333; +sub.f32 f1556, f1555, f1554; +mul.f32 f1558, f1469, f1551; +mul.f32 f2314, f1468, f1549; +sub.f32 f1559, f2314, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1562, f2332, f1561; +fma.rn.f32 f1563, f1559, f1444, f1562; +mul.f32 f1564, f1444, f1561; +mul.f32 f1565, f1559, f2332; +sub.f32 f1566, f1565, f1564; +mul.f32 f1568, f1469, f1561; +mul.f32 f2313, f1468, f1559; +sub.f32 f1569, f2313, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1572, f2331, f1571; +fma.rn.f32 f1573, f1569, f1448, f1572; +mul.f32 f1574, f1448, f1571; +mul.f32 f1575, f1569, f2331; +sub.f32 f1576, f1575, f1574; +mul.f32 f1578, f1469, f1571; +mul.f32 f2312, f1468, f1569; +sub.f32 f1579, f2312, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1582, f2330, f1581; +fma.rn.f32 f1583, f1579, f1452, f1582; +mul.f32 f1584, f1452, f1581; +mul.f32 f1585, f1579, f2330; +sub.f32 f1586, f1585, f1584; +mul.f32 f2310, f1468, f1579; +mul.f32 f2311, f1469, f1581; +sub.f32 f1589, f2310, f2311; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, 
f1579, f1590; +mul.f32 f1592, f2329, f1591; +fma.rn.f32 f1593, f1589, f1456, f1592; +mul.f32 f1594, f1456, f1591; +mul.f32 f1595, f1589, f2329; +sub.f32 f1596, f1595, f1594; +mul.f32 f1598, f1469, f1591; +mul.f32 f2309, f1468, f1589; +sub.f32 f1599, f2309, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1602, f2328, f1601; +fma.rn.f32 f1603, f1599, f1460, f1602; +mul.f32 f1604, f1460, f1601; +mul.f32 f1605, f1599, f2328; +sub.f32 f1606, f1605, f1604; +mul.f32 f1608, f1469, f1601; +mul.f32 f2308, f1468, f1599; +sub.f32 f1609, f2308, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1612, f2327, f1611; +fma.rn.f32 f1613, f1609, f1464, f1612; +mul.f32 f1614, f1464, f1611; +mul.f32 f1615, f1609, f2327; +sub.f32 f1616, f1615, f1614; +mul.f32 f1618, f1469, f1611; +mul.f32 f2307, f1468, f1609; +sub.f32 f1619, f2307, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1622, f1407, f1621; +fma.rn.f32 f1623, f1619, f1406, f1622; +mul.f32 f1624, f1406, f1621; +mul.f32 f1625, f1619, f1407; +sub.f32 f1626, f1625, f1624; +mul.f32 f2305, f1468, f1619; +mul.f32 f2306, f1469, f1621; +sub.f32 f1629, f2305, f2306; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1632, f1411, f1631; +fma.rn.f32 f1633, f1629, f1410, f1632; +mul.f32 f1634, f1410, f1631; +mul.f32 f1635, f1629, f1411; +sub.f32 f1636, f1635, f1634; +mul.f32 f2303, f1468, f1629; +mul.f32 f2304, f1469, f1631; +sub.f32 f1639, f2303, f2304; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f1642, f1415, f1641; +fma.rn.f32 f1643, f1639, f1414, f1642; +mul.f32 f1644, f1414, f1641; +mul.f32 f1645, f1639, f1415; +sub.f32 f1646, f1645, f1644; +mul.f32 f1648, f1469, f1641; +mul.f32 f2302, f1468, f1639; +sub.f32 f1649, f2302, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1652, f1419, f1651; +fma.rn.f32 f1653, f1649, f1418, 
f1652; +mul.f32 f1654, f1418, f1651; +mul.f32 f1655, f1649, f1419; +sub.f32 f1656, f1655, f1654; +mul.f32 f1658, f1469, f1651; +mul.f32 f2301, f1468, f1649; +sub.f32 f1659, f2301, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1662, f1423, f1661; +fma.rn.f32 f1663, f1659, f1422, f1662; +mul.f32 f1664, f1422, f1661; +mul.f32 f1665, f1659, f1423; +sub.f32 f1666, f1665, f1664; +mul.f32 f1668, f1469, f1661; +mul.f32 f2300, f1468, f1659; +sub.f32 f1669, f2300, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1672, f1427, f1671; +fma.rn.f32 f1673, f1669, f1426, f1672; +mul.f32 f1674, f1426, f1671; +mul.f32 f1675, f1669, f1427; +sub.f32 f1676, f1675, f1674; +mul.f32 f2298, f1468, f1669; +mul.f32 f2299, f1469, f1671; +sub.f32 f1679, f2298, f2299; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1682, f1431, f1681; +fma.rn.f32 f1683, f1679, f1430, f1682; +mul.f32 f1684, f1430, f1681; +mul.f32 f1685, f1679, f1431; +sub.f32 f1686, f1685, f1684; +mul.f32 f2296, f1468, f1679; +mul.f32 f2297, f1469, f1681; +sub.f32 f1689, f2296, f2297; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1692, f1435, f1691; +fma.rn.f32 f1693, f1689, f1434, f1692; +mul.f32 f1694, f1434, f1691; +mul.f32 f1695, f1689, f1435; +sub.f32 f1696, f1695, f1694; +mul.f32 f1698, f1469, f1691; +mul.f32 f2295, f1468, f1689; +sub.f32 f1699, f2295, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1702, f1439, f1701; +fma.rn.f32 f1703, f1699, f1438, f1702; +mul.f32 f1704, f1438, f1701; +mul.f32 f1705, f1699, f1439; +sub.f32 f1706, f1705, f1704; +mul.f32 f1708, f1469, f1701; +mul.f32 f2294, f1468, f1699; +sub.f32 f1709, f2294, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1712, f1443, f1711; +fma.rn.f32 f1713, f1709, f1442, f1712; +mul.f32 f1714, f1442, f1711; +mul.f32 f1715, f1709, f1443; +sub.f32 
f1716, f1715, f1714; +mul.f32 f2292, f1468, f1709; +mul.f32 f2293, f1469, f1711; +sub.f32 f1719, f2292, f2293; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f1722, f1447, f1721; +fma.rn.f32 f1723, f1719, f1446, f1722; +mul.f32 f1724, f1446, f1721; +mul.f32 f1725, f1719, f1447; +sub.f32 f1726, f1725, f1724; +mul.f32 f2290, f1468, f1719; +mul.f32 f2291, f1469, f1721; +sub.f32 f1729, f2290, f2291; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1732, f1451, f1731; +fma.rn.f32 f1733, f1729, f1450, f1732; +mul.f32 f1734, f1450, f1731; +mul.f32 f1735, f1729, f1451; +sub.f32 f1736, f1735, f1734; +mul.f32 f1738, f1469, f1731; +mul.f32 f2289, f1468, f1729; +sub.f32 f1739, f2289, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1742, f1455, f1741; +fma.rn.f32 f1743, f1739, f1454, f1742; +mul.f32 f1744, f1454, f1741; +mul.f32 f1745, f1739, f1455; +sub.f32 f1746, f1745, f1744; +mul.f32 f1748, f1469, f1741; +mul.f32 f2288, f1468, f1739; +sub.f32 f1749, f2288, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1752, f1459, f1751; +fma.rn.f32 f1753, f1749, f1458, f1752; +mul.f32 f1754, f1458, f1751; +mul.f32 f1755, f1749, f1459; +sub.f32 f1756, f1755, f1754; +mul.f32 f1758, f1469, f1751; +mul.f32 f2287, f1468, f1749; +sub.f32 f1759, f2287, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1762, f1463, f1761; +fma.rn.f32 f1763, f1759, f1462, f1762; +mul.f32 f1764, f1462, f1761; +mul.f32 f1765, f1759, f1463; +sub.f32 f1766, f1765, f1764; +mul.f32 f2285, f1468, f1759; +mul.f32 f2286, f1469, f1761; +sub.f32 f1769, f2285, f2286; +mul.f32 f1770, f1468, f1761; +fma.rn.f32 f1771, f1469, f1759, f1770; +mov.u32 r29, %tid.x; +shl.b32 r28, r29, 7; +mul.f32 f1772, f1467, f1771; +fma.rn.f32 f1773, f1769, f1466, f1772; +mul.f32 f1774, f1466, f1771; +mul.f32 f1775, f1769, f1467; +sub.f32 f1776, f1775, f1774; +and.b32 r22, 
r29, 480; +shl.b32 r16, r29, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r28, 61440; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1473; +st.shared.f32 [r20+256], f1483; +st.shared.f32 [r20+384], f1493; +st.shared.f32 [r20+512], f1503; +st.shared.f32 [r20+640], f1513; +st.shared.f32 [r20+768], f1523; +st.shared.f32 [r20+896], f1533; +st.shared.f32 [r20+1024], f1543; +st.shared.f32 [r20+1152], f1553; +st.shared.f32 [r20+1280], f1563; +st.shared.f32 [r20+1408], f1573; +st.shared.f32 [r20+1536], f1583; +st.shared.f32 [r20+1664], f1593; +st.shared.f32 [r20+1792], f1603; +st.shared.f32 [r20+1920], f1613; +st.shared.f32 [r20+2048], f1623; +st.shared.f32 [r20+2176], f1633; +st.shared.f32 [r20+2304], f1643; +st.shared.f32 [r20+2432], f1653; +st.shared.f32 [r20+2560], f1663; +st.shared.f32 [r20+2688], f1673; +st.shared.f32 [r20+2816], f1683; +st.shared.f32 [r20+2944], f1693; +st.shared.f32 [r20+3072], f1703; +st.shared.f32 [r20+3200], f1713; +st.shared.f32 [r20+3328], f1723; +st.shared.f32 [r20+3456], f1733; +st.shared.f32 [r20+3584], f1743; +st.shared.f32 [r20+3712], f1753; +st.shared.f32 [r20+3840], f1763; +st.shared.f32 [r20+3968], f1773; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+2048]; +ld.shared.f32 f1779, [r21+4096]; +ld.shared.f32 f1780, [r21+6144]; +ld.shared.f32 f1781, [r21+8192]; +ld.shared.f32 f1782, [r21+10240]; +ld.shared.f32 f1783, [r21+12288]; +ld.shared.f32 f1784, [r21+14336]; +ld.shared.f32 f1785, [r21+16384]; +ld.shared.f32 f1786, [r21+18432]; +ld.shared.f32 f1787, [r21+20480]; +ld.shared.f32 f1788, [r21+22528]; +ld.shared.f32 f1789, [r21+24576]; +ld.shared.f32 f1790, [r21+26624]; +ld.shared.f32 f1791, [r21+28672]; +ld.shared.f32 f1792, [r21+30720]; +ld.shared.f32 f1793, [r21+32768]; +ld.shared.f32 f1794, [r21+34816]; +ld.shared.f32 f1795, [r21+36864]; +ld.shared.f32 f1796, [r21+38912]; +ld.shared.f32 f1797, [r21+40960]; 
+ld.shared.f32 f1798, [r21+43008]; +ld.shared.f32 f1799, [r21+45056]; +ld.shared.f32 f1800, [r21+47104]; +ld.shared.f32 f1801, [r21+49152]; +ld.shared.f32 f1802, [r21+51200]; +ld.shared.f32 f1803, [r21+53248]; +ld.shared.f32 f1804, [r21+55296]; +ld.shared.f32 f1805, [r21+57344]; +ld.shared.f32 f1806, [r21+59392]; +ld.shared.f32 f1807, [r21+61440]; +ld.shared.f32 f1808, [r21+63488]; +barrier.sync 0; +st.shared.f32 [r20], f2342; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+2048]; +ld.shared.f32 f1811, [r21+4096]; +ld.shared.f32 f1812, [r21+6144]; +ld.shared.f32 f1813, [r21+8192]; +ld.shared.f32 f1814, [r21+10240]; +ld.shared.f32 f1815, [r21+12288]; +ld.shared.f32 f1816, [r21+14336]; +ld.shared.f32 f1817, [r21+16384]; +ld.shared.f32 f1818, [r21+18432]; +ld.shared.f32 f1819, [r21+20480]; +ld.shared.f32 f1820, [r21+22528]; +ld.shared.f32 f1821, [r21+24576]; +ld.shared.f32 f1822, [r21+26624]; +ld.shared.f32 f1823, 
[r21+28672]; +ld.shared.f32 f1824, [r21+30720]; +ld.shared.f32 f1825, [r21+32768]; +ld.shared.f32 f1826, [r21+34816]; +ld.shared.f32 f1827, [r21+36864]; +ld.shared.f32 f1828, [r21+38912]; +ld.shared.f32 f1829, [r21+40960]; +ld.shared.f32 f1830, [r21+43008]; +ld.shared.f32 f1831, [r21+45056]; +ld.shared.f32 f1832, [r21+47104]; +ld.shared.f32 f1833, [r21+49152]; +ld.shared.f32 f1834, [r21+51200]; +ld.shared.f32 f1835, [r21+53248]; +ld.shared.f32 f1836, [r21+55296]; +ld.shared.f32 f1837, [r21+57344]; +ld.shared.f32 f1838, [r21+59392]; +ld.shared.f32 f1839, [r21+61440]; +ld.shared.f32 f1840, [r21+63488]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2284, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2283, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2282, f2284, f2283; +sub.f32 f1852, f2284, f2283; +sub.f32 f1853, f1843, f1848; +add.f32 f1855, f1843, f1848; +add.f32 f2281, f1844, f1847; +sub.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2280, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2279, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2278, f2280, f2279; +sub.f32 f1868, f2280, f2279; +sub.f32 f1869, f1859, f1864; +add.f32 f1871, f1859, f1864; +add.f32 f2277, f1860, f1863; +sub.f32 f1872, f1860, f1863; +mul.f32 f1873, f1869, 0f3F3504F3; +mul.f32 f1874, f2277, 0f3F3504F3; +sub.f32 f1875, f1873, f1874; +add.f32 f1876, f1873, f1874; +mul.f32 f2275, f1871, 0fBF3504F3; +mul.f32 f2276, f1872, 0f3F3504F3; +sub.f32 f1879, f2275, f2276; +mul.f32 f1880, f1872, 0fBF3504F3; +fma.rn.f32 f1881, f1871, 0f3F3504F3, f1880; +add.f32 f1882, f1849, f1865; +sub.f32 f1884, f1849, f1865; +add.f32 f2274, f2282, f2278; +sub.f32 f1885, f2282, f2278; 
+add.f32 f1886, f1853, f1875; +sub.f32 f1888, f1853, f1875; +add.f32 f2273, f2281, f1876; +sub.f32 f1889, f2281, f1876; +sub.f32 f1890, f1851, f1868; +add.f32 f1892, f1851, f1868; +add.f32 f2272, f1852, f1867; +sub.f32 f1893, f1852, f1867; +add.f32 f1894, f1855, f1879; +sub.f32 f1896, f1855, f1879; +add.f32 f2271, f1856, f1881; +sub.f32 f1897, f1856, f1881; +add.f32 f1898, f1779, f1795; +sub.f32 f1900, f1779, f1795; +add.f32 f2270, f1811, f1827; +sub.f32 f1901, f1811, f1827; +add.f32 f1902, f1787, f1803; +sub.f32 f1904, f1787, f1803; +add.f32 f2269, f1819, f1835; +sub.f32 f1905, f1819, f1835; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2268, f2270, f2269; +sub.f32 f1909, f2270, f2269; +sub.f32 f1910, f1900, f1905; +add.f32 f1912, f1900, f1905; +add.f32 f2267, f1901, f1904; +sub.f32 f1913, f1901, f1904; +add.f32 f1914, f1783, f1799; +sub.f32 f1916, f1783, f1799; +add.f32 f2266, f1815, f1831; +sub.f32 f1917, f1815, f1831; +add.f32 f1918, f1791, f1807; +sub.f32 f1920, f1791, f1807; +add.f32 f2265, f1823, f1839; +sub.f32 f1921, f1823, f1839; +add.f32 f1922, f1914, f1918; +sub.f32 f1924, f1914, f1918; +add.f32 f2264, f2266, f2265; +sub.f32 f1925, f2266, f2265; +sub.f32 f1926, f1916, f1921; +add.f32 f1928, f1916, f1921; +add.f32 f2263, f1917, f1920; +sub.f32 f1929, f1917, f1920; +mul.f32 f1930, f1926, 0f3F3504F3; +mul.f32 f1931, f2263, 0f3F3504F3; +sub.f32 f1932, f1930, f1931; +add.f32 f1933, f1930, f1931; +mul.f32 f2261, f1928, 0fBF3504F3; +mul.f32 f2262, f1929, 0f3F3504F3; +sub.f32 f1936, f2261, f2262; +mul.f32 f1937, f1929, 0fBF3504F3; +fma.rn.f32 f1938, f1928, 0f3F3504F3, f1937; +add.f32 f1939, f1906, f1922; +sub.f32 f1941, f1906, f1922; +add.f32 f2260, f2268, f2264; +sub.f32 f1942, f2268, f2264; +add.f32 f1943, f1910, f1932; +sub.f32 f1945, f1910, f1932; +add.f32 f2259, f2267, f1933; +sub.f32 f1946, f2267, f1933; +sub.f32 f1947, f1908, f1925; +add.f32 f1949, f1908, f1925; +add.f32 f2258, f1909, f1924; +sub.f32 f1950, f1909, f1924; +add.f32 
f1951, f1912, f1936; +sub.f32 f1953, f1912, f1936; +add.f32 f2257, f1913, f1938; +sub.f32 f1954, f1913, f1938; +mul.f32 f1956, f2259, 0f3EC3EF15; +mul.f32 f2256, f1943, 0f3F6C835E; +sub.f32 f1957, f2256, f1956; +mul.f32 f1958, f2259, 0f3F6C835E; +fma.rn.f32 f1959, f1943, 0f3EC3EF15, f1958; +mul.f32 f1960, f1947, 0f3F3504F3; +mul.f32 f1961, f2258, 0f3F3504F3; +sub.f32 f1962, f1960, f1961; +add.f32 f1963, f1960, f1961; +mul.f32 f1965, f2257, 0f3F6C835E; +mul.f32 f2255, f1951, 0f3EC3EF15; +sub.f32 f1966, f2255, f1965; +mul.f32 f1967, f2257, 0f3EC3EF15; +fma.rn.f32 f1968, f1951, 0f3F6C835E, f1967; +mul.f32 f1970, f1946, 0f3F6C835E; +mul.f32 f2254, f1945, 0fBEC3EF15; +sub.f32 f1971, f2254, f1970; +mul.f32 f1972, f1946, 0fBEC3EF15; +fma.rn.f32 f1973, f1945, 0f3F6C835E, f1972; +mul.f32 f2252, f1949, 0fBF3504F3; +mul.f32 f2253, f1950, 0f3F3504F3; +sub.f32 f1976, f2252, f2253; +mul.f32 f1977, f1950, 0fBF3504F3; +fma.rn.f32 f1978, f1949, 0f3F3504F3, f1977; +mul.f32 f2250, f1953, 0fBF6C835E; +mul.f32 f2251, f1954, 0f3EC3EF15; +sub.f32 f1981, f2250, f2251; +mul.f32 f1982, f1954, 0fBF6C835E; +fma.rn.f32 f1983, f1953, 0f3EC3EF15, f1982; +add.f32 f1984, f1778, f1794; +sub.f32 f1986, f1778, f1794; +add.f32 f2249, f1810, f1826; +sub.f32 f1987, f1810, f1826; +add.f32 f1988, f1786, f1802; +sub.f32 f1990, f1786, f1802; +add.f32 f2248, f1818, f1834; +sub.f32 f1991, f1818, f1834; +add.f32 f1992, f1984, f1988; +sub.f32 f1994, f1984, f1988; +add.f32 f2247, f2249, f2248; +sub.f32 f1995, f2249, f2248; +sub.f32 f1996, f1986, f1991; +add.f32 f1998, f1986, f1991; +add.f32 f2246, f1987, f1990; +sub.f32 f1999, f1987, f1990; +add.f32 f2000, f1782, f1798; +sub.f32 f2002, f1782, f1798; +add.f32 f2245, f1814, f1830; +sub.f32 f2003, f1814, f1830; +add.f32 f2004, f1790, f1806; +sub.f32 f2006, f1790, f1806; +add.f32 f2244, f1822, f1838; +sub.f32 f2007, f1822, f1838; +add.f32 f2008, f2000, f2004; +sub.f32 f2010, f2000, f2004; +add.f32 f2243, f2245, f2244; +sub.f32 f2011, f2245, f2244; +sub.f32 f2012, 
f2002, f2007; +add.f32 f2014, f2002, f2007; +add.f32 f2242, f2003, f2006; +sub.f32 f2015, f2003, f2006; +mul.f32 f2016, f2012, 0f3F3504F3; +mul.f32 f2017, f2242, 0f3F3504F3; +sub.f32 f2018, f2016, f2017; +add.f32 f2019, f2016, f2017; +mul.f32 f2240, f2014, 0fBF3504F3; +mul.f32 f2241, f2015, 0f3F3504F3; +sub.f32 f2022, f2240, f2241; +mul.f32 f2023, f2015, 0fBF3504F3; +fma.rn.f32 f2024, f2014, 0f3F3504F3, f2023; +add.f32 f2025, f1992, f2008; +sub.f32 f2027, f1992, f2008; +add.f32 f2239, f2247, f2243; +sub.f32 f2028, f2247, f2243; +add.f32 f2029, f1996, f2018; +sub.f32 f2031, f1996, f2018; +add.f32 f2238, f2246, f2019; +sub.f32 f2032, f2246, f2019; +sub.f32 f2033, f1994, f2011; +add.f32 f2035, f1994, f2011; +add.f32 f2237, f1995, f2010; +sub.f32 f2036, f1995, f2010; +add.f32 f2037, f1998, f2022; +sub.f32 f2039, f1998, f2022; +add.f32 f2236, f1999, f2024; +sub.f32 f2040, f1999, f2024; +add.f32 f2041, f1780, f1796; +sub.f32 f2043, f1780, f1796; +add.f32 f2235, f1812, f1828; +sub.f32 f2044, f1812, f1828; +add.f32 f2045, f1788, f1804; +sub.f32 f2047, f1788, f1804; +add.f32 f2234, f1820, f1836; +sub.f32 f2048, f1820, f1836; +add.f32 f2049, f2041, f2045; +sub.f32 f2051, f2041, f2045; +add.f32 f2233, f2235, f2234; +sub.f32 f2052, f2235, f2234; +sub.f32 f2053, f2043, f2048; +add.f32 f2055, f2043, f2048; +add.f32 f2232, f2044, f2047; +sub.f32 f2056, f2044, f2047; +add.f32 f2057, f1784, f1800; +sub.f32 f2059, f1784, f1800; +add.f32 f2231, f1816, f1832; +sub.f32 f2060, f1816, f1832; +add.f32 f2061, f1792, f1808; +sub.f32 f2063, f1792, f1808; +add.f32 f2230, f1824, f1840; +sub.f32 f2064, f1824, f1840; +add.f32 f2065, f2057, f2061; +sub.f32 f2067, f2057, f2061; +add.f32 f2229, f2231, f2230; +sub.f32 f2068, f2231, f2230; +sub.f32 f2069, f2059, f2064; +add.f32 f2071, f2059, f2064; +add.f32 f2228, f2060, f2063; +sub.f32 f2072, f2060, f2063; +mul.f32 f2073, f2069, 0f3F3504F3; +mul.f32 f2074, f2228, 0f3F3504F3; +sub.f32 f2075, f2073, f2074; +add.f32 f2076, f2073, f2074; +mul.f32 f2078, 
f2072, 0f3F3504F3; +mul.f32 f2227, f2071, 0fBF3504F3; +sub.f32 f2079, f2227, f2078; +mul.f32 f2080, f2072, 0fBF3504F3; +fma.rn.f32 f2081, f2071, 0f3F3504F3, f2080; +add.f32 f2082, f2049, f2065; +sub.f32 f2084, f2049, f2065; +add.f32 f2226, f2233, f2229; +sub.f32 f2085, f2233, f2229; +add.f32 f2086, f2053, f2075; +sub.f32 f2088, f2053, f2075; +add.f32 f2225, f2232, f2076; +sub.f32 f2089, f2232, f2076; +sub.f32 f2090, f2051, f2068; +add.f32 f2092, f2051, f2068; +add.f32 f2224, f2052, f2067; +sub.f32 f2093, f2052, f2067; +add.f32 f2094, f2055, f2079; +sub.f32 f2096, f2055, f2079; +add.f32 f2223, f2056, f2081; +sub.f32 f2097, f2056, f2081; +mul.f32 f2221, f2086, 0f3F6C835E; +mul.f32 f2222, f2225, 0f3EC3EF15; +sub.f32 f2100, f2221, f2222; +mul.f32 f2101, f2225, 0f3F6C835E; +fma.rn.f32 f2102, f2086, 0f3EC3EF15, f2101; +mul.f32 f2103, f2090, 0f3F3504F3; +mul.f32 f2104, f2224, 0f3F3504F3; +sub.f32 f2105, f2103, f2104; +add.f32 f2106, f2103, f2104; +mul.f32 f2219, f2094, 0f3EC3EF15; +mul.f32 f2220, f2223, 0f3F6C835E; +sub.f32 f2109, f2219, f2220; +mul.f32 f2110, f2223, 0f3EC3EF15; +fma.rn.f32 f2111, f2094, 0f3F6C835E, f2110; +mul.f32 f2217, f2088, 0fBEC3EF15; +mul.f32 f2218, f2089, 0f3F6C835E; +sub.f32 f2114, f2217, f2218; +mul.f32 f2115, f2089, 0fBEC3EF15; +fma.rn.f32 f2116, f2088, 0f3F6C835E, f2115; +mul.f32 f2215, f2092, 0fBF3504F3; +mul.f32 f2216, f2093, 0f3F3504F3; +sub.f32 f2119, f2215, f2216; +mul.f32 f2120, f2093, 0fBF3504F3; +fma.rn.f32 f2121, f2092, 0f3F3504F3, f2120; +mul.f32 f2123, f2097, 0f3EC3EF15; +mul.f32 f2214, f2096, 0fBF6C835E; +sub.f32 f2124, f2214, f2123; +mul.f32 f2125, f2097, 0fBF6C835E; +fma.rn.f32 f2126, f2096, 0f3EC3EF15, f2125; +add.f32 %1, f2274, f2260; +add.f32 %0, f1882, f1939; +add.f32 %2, f2025, f2082; +add.f32 %3, f2239, f2226; +add.f32 %5, f2273, f1959; +add.f32 %4, f1886, f1957; +add.f32 %7, f2238, f2102; +add.f32 %6, f2029, f2100; +add.f32 %8, f1890, f1962; +add.f32 %9, f2272, f1963; +add.f32 %10, f2033, f2105; +add.f32 %11, f2237, f2106; 
+add.f32 %12, f1894, f1966; +add.f32 %13, f2271, f1968; +add.f32 %14, f2037, f2109; +add.f32 %15, f2236, f2111; +add.f32 %17, f1885, f1941; +sub.f32 %16, f1884, f1942; +add.f32 %19, f2028, f2084; +sub.f32 %18, f2027, f2085; +add.f32 %20, f1888, f1971; +add.f32 %21, f1889, f1973; +add.f32 %22, f2031, f2114; +add.f32 %23, f2032, f2116; +add.f32 %24, f1892, f1976; +add.f32 %25, f1893, f1978; +add.f32 %26, f2035, f2119; +add.f32 %27, f2036, f2121; +add.f32 %29, f1897, f1983; +add.f32 %28, f1896, f1981; +add.f32 %31, f2040, f2126; +add.f32 %30, f2039, f2124; +sub.f32 %32, f1882, f1939; +sub.f32 %33, f2274, f2260; +sub.f32 %34, f2025, f2082; +sub.f32 %35, f2239, f2226; +sub.f32 %37, f2273, f1959; +sub.f32 %36, f1886, f1957; +sub.f32 %39, f2238, f2102; +sub.f32 %38, f2029, f2100; +sub.f32 %41, f2272, f1963; +sub.f32 %40, f1890, f1962; +sub.f32 %43, f2237, f2106; +sub.f32 %42, f2033, f2105; +sub.f32 %45, f2271, f1968; +sub.f32 %44, f1894, f1966; +sub.f32 %47, f2236, f2111; +sub.f32 %46, f2037, f2109; +sub.f32 %49, f1885, f1941; +add.f32 %48, f1884, f1942; +sub.f32 %51, f2028, f2084; +add.f32 %50, f2027, f2085; +sub.f32 %53, f1889, f1973; +sub.f32 %52, f1888, f1971; +sub.f32 %55, f2032, f2116; +sub.f32 %54, f2031, f2114; +sub.f32 %57, f1893, f1978; +sub.f32 %56, f1892, f1976; +sub.f32 %59, f2036, f2121; +sub.f32 %58, f2035, f2119; +sub.f32 %61, f1897, f1983; +sub.f32 %60, f1896, f1981; +sub.f32 %63, f2040, f2126; +sub.f32 %62, f2039, f2124; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), 
"=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_16384), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), 
"f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..29a3cd5571dbc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_fwd.hpp.inc @@ -0,0 +1,1257 @@ +#ifndef CUFFTDX_FFT_16384_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_16384_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1164, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<1194>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %36, %57; +add.f64 fd66, %37, %59; +sub.f64 fd67, %36, %57; +sub.f64 fd68, %37, %59; +add.f64 fd69, %46, %68; +add.f64 fd70, %48, %69; +sub.f64 fd71, %46, %68; +sub.f64 fd72, %48, %69; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %41, %62; +add.f64 fd82, %43, %64; +sub.f64 fd83, %41, %62; +sub.f64 fd84, %43, %64; +add.f64 fd85, %52, %73; +add.f64 fd86, %53, %75; +sub.f64 fd87, %52, %73; +sub.f64 fd88, %53, %75; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 
0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %60; +add.f64 fd123, %40, %61; +sub.f64 fd124, %38, %60; +sub.f64 fd125, %40, %61; +add.f64 fd126, %49, %70; +add.f64 fd127, %51, %72; +sub.f64 fd128, %49, %70; +sub.f64 fd129, %51, %72; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %44, %65; +add.f64 fd139, %45, %67; +sub.f64 fd140, %44, %65; +sub.f64 fd141, %45, %67; +add.f64 fd142, %54, %76; +add.f64 fd143, %56, %77; +sub.f64 fd144, %54, %76; +sub.f64 fd145, %56, %77; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; 
+add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 
fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 
fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+16384]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 
fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -131072; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 130944; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+8192]; +ld.shared.f64 fd390, [r13+16384]; +ld.shared.f64 fd391, [r13+24576]; +ld.shared.f64 fd392, [r13+32768]; +ld.shared.f64 fd393, [r13+40960]; +ld.shared.f64 fd394, 
[r13+49152]; +ld.shared.f64 fd395, [r13+57344]; +ld.shared.f64 fd396, [r13+65536]; +ld.shared.f64 fd397, [r13+73728]; +ld.shared.f64 fd398, [r13+81920]; +ld.shared.f64 fd399, [r13+90112]; +ld.shared.f64 fd400, [r13+98304]; +ld.shared.f64 fd401, [r13+106496]; +ld.shared.f64 fd402, [r13+114688]; +ld.shared.f64 fd403, [r13+122880]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+8192]; +ld.shared.f64 fd406, [r13+16384]; +ld.shared.f64 fd407, [r13+24576]; +ld.shared.f64 fd408, [r13+32768]; +ld.shared.f64 fd409, [r13+40960]; +ld.shared.f64 fd410, [r13+49152]; +ld.shared.f64 fd411, [r13+57344]; +ld.shared.f64 fd412, [r13+65536]; +ld.shared.f64 fd413, [r13+73728]; +ld.shared.f64 fd414, [r13+81920]; +ld.shared.f64 fd415, [r13+90112]; +ld.shared.f64 fd416, [r13+98304]; +ld.shared.f64 fd417, [r13+106496]; +ld.shared.f64 fd418, [r13+114688]; +ld.shared.f64 fd419, [r13+122880]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 
fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; 
+add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, 
fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 1008; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, 
fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+1024]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, 
fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 129024; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], 
fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+8192]; +ld.shared.f64 fd745, [r20+16384]; +ld.shared.f64 fd746, [r20+24576]; +ld.shared.f64 fd747, [r20+32768]; +ld.shared.f64 fd748, [r20+40960]; +ld.shared.f64 fd749, [r20+49152]; +ld.shared.f64 fd750, [r20+57344]; +ld.shared.f64 fd751, [r20+65536]; +ld.shared.f64 fd752, [r20+73728]; +ld.shared.f64 fd753, [r20+81920]; +ld.shared.f64 fd754, [r20+90112]; +ld.shared.f64 fd755, [r20+98304]; +ld.shared.f64 fd756, [r20+106496]; +ld.shared.f64 fd757, [r20+114688]; +ld.shared.f64 fd758, [r20+122880]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+8192]; +ld.shared.f64 fd761, [r20+16384]; +ld.shared.f64 fd762, [r20+24576]; +ld.shared.f64 fd763, [r20+32768]; +ld.shared.f64 fd764, [r20+40960]; +ld.shared.f64 fd765, [r20+49152]; +ld.shared.f64 fd766, [r20+57344]; +ld.shared.f64 fd767, [r20+65536]; +ld.shared.f64 fd768, [r20+73728]; +ld.shared.f64 fd769, [r20+81920]; +ld.shared.f64 fd770, [r20+90112]; +ld.shared.f64 fd771, [r20+98304]; 
+ld.shared.f64 fd772, [r20+106496]; +ld.shared.f64 fd773, [r20+114688]; +ld.shared.f64 fd774, [r20+122880]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +add.f64 fd787, fd777, fd782; +sub.f64 fd788, fd778, fd781; +sub.f64 fd789, fd777, fd782; +add.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +add.f64 fd803, fd793, fd798; +sub.f64 fd804, fd794, fd797; +sub.f64 fd805, fd793, fd798; +add.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0dBFE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +mul.f64 fd810, fd804, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd811, fd803, 0dBFE6A09E667F3BCD, fd810; +mul.f64 fd812, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd813, fd806, 0dBFE6A09E667F3BCD; +sub.f64 fd814, fd812, fd813; +add.f64 fd815, fd812, fd813; +add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd811; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd811; +add.f64 fd824, fd785, fd802; +sub.f64 fd825, fd786, fd801; +sub.f64 fd826, fd785, fd802; +add.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd814; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd814; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; 
+sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +add.f64 fd844, fd834, fd839; +sub.f64 fd845, fd835, fd838; +sub.f64 fd846, fd834, fd839; +add.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +add.f64 fd860, fd850, fd855; +sub.f64 fd861, fd851, fd854; +sub.f64 fd862, fd850, fd855; +add.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0dBFE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +mul.f64 fd867, fd861, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd868, fd860, 0dBFE6A09E667F3BCD, fd867; +mul.f64 fd869, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd870, fd863, 0dBFE6A09E667F3BCD; +sub.f64 fd871, fd869, fd870; +add.f64 fd872, fd869, fd870; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, fd868; +sub.f64 fd879, fd844, fd866; +sub.f64 fd880, fd845, fd868; +add.f64 fd881, fd842, fd859; +sub.f64 fd882, fd843, fd858; +sub.f64 fd883, fd842, fd859; +add.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd871; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd871; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0dBFD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0dBFD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 
0dBFE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +mul.f64 fd897, fd882, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd898, fd881, 0dBFE6A09E667F3BCD, fd897; +mul.f64 fd899, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd900, fd886, 0dBFED906BCF328D46; +sub.f64 fd901, fd899, fd900; +mul.f64 fd902, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd903, fd885, 0dBFED906BCF328D46, fd902; +mul.f64 fd904, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd905, fd880, 0dBFED906BCF328D46; +sub.f64 fd906, fd904, fd905; +mul.f64 fd907, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd908, fd879, 0dBFED906BCF328D46, fd907; +mul.f64 fd909, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd910, fd884, 0dBFE6A09E667F3BCD; +sub.f64 fd911, fd909, fd910; +add.f64 fd912, fd909, fd910; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0dBFD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0dBFD87DE2A6AEA963, fd916; +add.f64 fd918, fd816, fd873; +add.f64 fd919, fd817, fd874; +sub.f64 fd920, fd816, fd873; +sub.f64 fd921, fd817, fd874; +add.f64 fd922, fd820, fd891; +add.f64 fd923, fd821, fd893; +sub.f64 fd924, fd820, fd891; +sub.f64 fd925, fd821, fd893; +add.f64 fd926, fd824, fd896; +add.f64 fd927, fd825, fd898; +sub.f64 fd928, fd824, fd896; +sub.f64 fd929, fd825, fd898; +add.f64 fd930, fd828, fd901; +add.f64 fd931, fd829, fd903; +sub.f64 fd932, fd828, fd901; +sub.f64 fd933, fd829, fd903; +add.f64 fd934, fd818, fd876; +sub.f64 fd935, fd819, fd875; +sub.f64 fd936, fd818, fd876; +add.f64 fd937, fd819, fd875; +add.f64 fd938, fd822, fd906; +add.f64 fd939, fd823, fd908; +sub.f64 fd940, fd822, fd906; +sub.f64 fd941, fd823, fd908; +add.f64 fd942, fd826, fd911; +add.f64 fd943, fd827, fd912; +sub.f64 fd944, fd826, fd911; +sub.f64 fd945, fd827, fd912; +add.f64 fd946, fd830, fd915; +add.f64 fd947, fd831, fd917; +sub.f64 fd948, fd830, fd915; +sub.f64 fd949, fd831, fd917; +and.b32 r21, r5, 768; +bfe.u32 r22, r5, 8, 2; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %35; +add.s64 rd11, 
rd10, rd9; +ld.global.v2.f64 {fd950, fd951}, [rd11]; +mul.f64 fd954, fd950, fd922; +mul.f64 fd955, fd951, fd923; +sub.f64 fd956, fd954, fd955; +mul.f64 fd957, fd950, fd923; +fma.rn.f64 fd958, fd951, fd922, fd957; +mul.f64 fd959, fd950, fd950; +mul.f64 fd960, fd951, fd951; +sub.f64 fd961, fd959, fd960; +mul.f64 fd962, fd951, fd950; +fma.rn.f64 fd963, fd951, fd950, fd962; +mul.f64 fd964, fd961, fd926; +mul.f64 fd965, fd963, fd927; +sub.f64 fd966, fd964, fd965; +mul.f64 fd967, fd961, fd927; +fma.rn.f64 fd968, fd963, fd926, fd967; +mul.f64 fd969, fd950, fd961; +mul.f64 fd970, fd951, fd963; +sub.f64 fd971, fd969, fd970; +mul.f64 fd972, fd950, fd963; +fma.rn.f64 fd973, fd951, fd961, fd972; +mul.f64 fd974, fd971, fd930; +mul.f64 fd975, fd973, fd931; +sub.f64 fd976, fd974, fd975; +mul.f64 fd977, fd971, fd931; +fma.rn.f64 fd978, fd973, fd930, fd977; +mul.f64 fd979, fd950, fd971; +mul.f64 fd980, fd951, fd973; +sub.f64 fd981, fd979, fd980; +mul.f64 fd982, fd950, fd973; +fma.rn.f64 fd983, fd951, fd971, fd982; +mul.f64 fd984, fd981, fd934; +mul.f64 fd985, fd983, fd935; +sub.f64 fd986, fd984, fd985; +mul.f64 fd987, fd981, fd935; +fma.rn.f64 fd988, fd983, fd934, fd987; +mul.f64 fd989, fd950, fd981; +mul.f64 fd990, fd951, fd983; +sub.f64 fd991, fd989, fd990; +mul.f64 fd992, fd950, fd983; +fma.rn.f64 fd993, fd951, fd981, fd992; +mul.f64 fd994, fd991, fd938; +mul.f64 fd995, fd993, fd939; +sub.f64 fd996, fd994, fd995; +mul.f64 fd997, fd991, fd939; +fma.rn.f64 fd998, fd993, fd938, fd997; +mul.f64 fd999, fd950, fd991; +mul.f64 fd1000, fd951, fd993; +sub.f64 fd1001, fd999, fd1000; +mul.f64 fd1002, fd950, fd993; +fma.rn.f64 fd1003, fd951, fd991, fd1002; +mul.f64 fd1004, fd1001, fd942; +mul.f64 fd1005, fd1003, fd943; +sub.f64 fd1006, fd1004, fd1005; +mul.f64 fd1007, fd1001, fd943; +fma.rn.f64 fd1008, fd1003, fd942, fd1007; +mul.f64 fd1009, fd950, fd1001; +mul.f64 fd1010, fd951, fd1003; +sub.f64 fd1011, fd1009, fd1010; +mul.f64 fd1012, fd950, fd1003; +fma.rn.f64 fd1013, fd951, fd1001, 
fd1012; +mul.f64 fd1014, fd1011, fd946; +mul.f64 fd1015, fd1013, fd947; +sub.f64 fd1016, fd1014, fd1015; +mul.f64 fd1017, fd1011, fd947; +fma.rn.f64 fd1018, fd1013, fd946, fd1017; +mul.f64 fd1019, fd950, fd1011; +mul.f64 fd1020, fd951, fd1013; +sub.f64 fd1021, fd1019, fd1020; +mul.f64 fd1022, fd950, fd1013; +fma.rn.f64 fd1023, fd951, fd1011, fd1022; +mul.f64 fd1024, fd1021, fd920; +mul.f64 fd1025, fd1023, fd921; +sub.f64 fd1026, fd1024, fd1025; +mul.f64 fd1027, fd1021, fd921; +fma.rn.f64 fd1028, fd1023, fd920, fd1027; +ld.global.v2.f64 {fd1029, fd1030}, [rd11+64]; +mul.f64 fd1033, fd1029, fd924; +mul.f64 fd1034, fd1030, fd925; +sub.f64 fd1035, fd1033, fd1034; +mul.f64 fd1036, fd1029, fd925; +fma.rn.f64 fd1037, fd1030, fd924, fd1036; +mul.f64 fd1038, fd950, fd1029; +mul.f64 fd1039, fd951, fd1030; +sub.f64 fd1040, fd1038, fd1039; +mul.f64 fd1041, fd950, fd1030; +fma.rn.f64 fd1042, fd951, fd1029, fd1041; +mul.f64 fd1043, fd1040, fd928; +mul.f64 fd1044, fd1042, fd929; +sub.f64 fd1045, fd1043, fd1044; +mul.f64 fd1046, fd1040, fd929; +fma.rn.f64 fd1047, fd1042, fd928, fd1046; +mul.f64 fd1048, fd950, fd1040; +mul.f64 fd1049, fd951, fd1042; +sub.f64 fd1050, fd1048, fd1049; +mul.f64 fd1051, fd950, fd1042; +fma.rn.f64 fd1052, fd951, fd1040, fd1051; +mul.f64 fd1053, fd1050, fd932; +mul.f64 fd1054, fd1052, fd933; +sub.f64 fd1055, fd1053, fd1054; +mul.f64 fd1056, fd1050, fd933; +fma.rn.f64 fd1057, fd1052, fd932, fd1056; +mul.f64 fd1058, fd950, fd1050; +mul.f64 fd1059, fd951, fd1052; +sub.f64 fd1060, fd1058, fd1059; +mul.f64 fd1061, fd950, fd1052; +fma.rn.f64 fd1062, fd951, fd1050, fd1061; +mul.f64 fd1063, fd1060, fd936; +mul.f64 fd1064, fd1062, fd937; +sub.f64 fd1065, fd1063, fd1064; +mul.f64 fd1066, fd1060, fd937; +fma.rn.f64 fd1067, fd1062, fd936, fd1066; +mul.f64 fd1068, fd950, fd1060; +mul.f64 fd1069, fd951, fd1062; +sub.f64 fd1070, fd1068, fd1069; +mul.f64 fd1071, fd950, fd1062; +fma.rn.f64 fd1072, fd951, fd1060, fd1071; +mul.f64 fd1073, fd1070, fd940; +mul.f64 fd1074, 
fd1072, fd941; +sub.f64 fd1075, fd1073, fd1074; +mul.f64 fd1076, fd1070, fd941; +fma.rn.f64 fd1077, fd1072, fd940, fd1076; +mul.f64 fd1078, fd950, fd1070; +mul.f64 fd1079, fd951, fd1072; +sub.f64 fd1080, fd1078, fd1079; +mul.f64 fd1081, fd950, fd1072; +fma.rn.f64 fd1082, fd951, fd1070, fd1081; +mul.f64 fd1083, fd1080, fd944; +mul.f64 fd1084, fd1082, fd945; +sub.f64 fd1085, fd1083, fd1084; +mul.f64 fd1086, fd1080, fd945; +fma.rn.f64 fd1087, fd1082, fd944, fd1086; +mul.f64 fd1088, fd950, fd1080; +mul.f64 fd1089, fd951, fd1082; +sub.f64 fd1090, fd1088, fd1089; +mul.f64 fd1091, fd950, fd1082; +fma.rn.f64 fd1092, fd951, fd1080, fd1091; +mul.f64 fd1093, fd1090, fd948; +mul.f64 fd1094, fd1092, fd949; +sub.f64 fd1095, fd1093, fd1094; +mul.f64 fd1096, fd1090, fd949; +fma.rn.f64 fd1097, fd1092, fd948, fd1096; +and.b32 r23, r15, 2040; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 98304; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd918; +st.shared.f64 [r26+2048], fd956; +st.shared.f64 [r26+4096], fd966; +st.shared.f64 [r26+6144], fd976; +st.shared.f64 [r26+8192], fd986; +st.shared.f64 [r26+10240], fd996; +st.shared.f64 [r26+12288], fd1006; +st.shared.f64 [r26+14336], fd1016; +st.shared.f64 [r26+16384], fd1026; +st.shared.f64 [r26+18432], fd1035; +st.shared.f64 [r26+20480], fd1045; +st.shared.f64 [r26+22528], fd1055; +st.shared.f64 [r26+24576], fd1065; +st.shared.f64 [r26+26624], fd1075; +st.shared.f64 [r26+28672], fd1085; +st.shared.f64 [r26+30720], fd1095; +barrier.sync 0; +mad.lo.s32 r27, r21, -120, r26; +ld.shared.f64 fd1098, [r27]; +ld.shared.f64 fd1099, [r27+8192]; +ld.shared.f64 fd1100, [r27+16384]; +ld.shared.f64 fd1101, [r27+24576]; +ld.shared.f64 fd1102, [r27+32768]; +ld.shared.f64 fd1103, [r27+40960]; +ld.shared.f64 fd1104, [r27+49152]; +ld.shared.f64 fd1105, [r27+57344]; +ld.shared.f64 fd1106, [r27+65536]; +ld.shared.f64 fd1107, [r27+73728]; +ld.shared.f64 fd1108, [r27+81920]; +ld.shared.f64 fd1109, [r27+90112]; +ld.shared.f64 fd1110, [r27+98304]; 
+ld.shared.f64 fd1111, [r27+106496]; +ld.shared.f64 fd1112, [r27+114688]; +ld.shared.f64 fd1113, [r27+122880]; +barrier.sync 0; +st.shared.f64 [r26], fd919; +st.shared.f64 [r26+2048], fd958; +st.shared.f64 [r26+4096], fd968; +st.shared.f64 [r26+6144], fd978; +st.shared.f64 [r26+8192], fd988; +st.shared.f64 [r26+10240], fd998; +st.shared.f64 [r26+12288], fd1008; +st.shared.f64 [r26+14336], fd1018; +st.shared.f64 [r26+16384], fd1028; +st.shared.f64 [r26+18432], fd1037; +st.shared.f64 [r26+20480], fd1047; +st.shared.f64 [r26+22528], fd1057; +st.shared.f64 [r26+24576], fd1067; +st.shared.f64 [r26+26624], fd1077; +st.shared.f64 [r26+28672], fd1087; +st.shared.f64 [r26+30720], fd1097; +barrier.sync 0; +ld.shared.f64 fd1114, [r27]; +ld.shared.f64 fd1115, [r27+8192]; +ld.shared.f64 fd1116, [r27+16384]; +ld.shared.f64 fd1117, [r27+24576]; +ld.shared.f64 fd1118, [r27+32768]; +ld.shared.f64 fd1119, [r27+40960]; +ld.shared.f64 fd1120, [r27+49152]; +ld.shared.f64 fd1121, [r27+57344]; +ld.shared.f64 fd1122, [r27+65536]; +ld.shared.f64 fd1123, [r27+73728]; +ld.shared.f64 fd1124, [r27+81920]; +ld.shared.f64 fd1125, [r27+90112]; +ld.shared.f64 fd1126, [r27+98304]; +ld.shared.f64 fd1127, [r27+106496]; +ld.shared.f64 fd1128, [r27+114688]; +ld.shared.f64 fd1129, [r27+122880]; +add.f64 fd1130, fd1098, fd1106; +add.f64 fd1131, fd1114, fd1122; +sub.f64 fd1132, fd1098, fd1106; +sub.f64 fd1133, fd1114, fd1122; +add.f64 fd1134, fd1102, fd1110; +add.f64 fd1135, fd1118, fd1126; +sub.f64 fd1136, fd1102, fd1110; +sub.f64 fd1137, fd1118, fd1126; +add.f64 fd1138, fd1099, fd1107; +add.f64 fd1139, fd1115, fd1123; +sub.f64 fd1140, fd1099, fd1107; +sub.f64 fd1141, fd1115, fd1123; +add.f64 fd1142, fd1103, fd1111; +add.f64 fd1143, fd1119, fd1127; +sub.f64 fd1144, fd1103, fd1111; +sub.f64 fd1145, fd1119, fd1127; +add.f64 fd1146, fd1100, fd1108; +add.f64 fd1147, fd1116, fd1124; +sub.f64 fd1148, fd1100, fd1108; +sub.f64 fd1149, fd1116, fd1124; +add.f64 fd1150, fd1104, fd1112; +add.f64 fd1151, fd1120, 
fd1128; +sub.f64 fd1152, fd1104, fd1112; +sub.f64 fd1153, fd1120, fd1128; +add.f64 fd1154, fd1101, fd1109; +add.f64 fd1155, fd1117, fd1125; +sub.f64 fd1156, fd1101, fd1109; +sub.f64 fd1157, fd1117, fd1125; +add.f64 fd1158, fd1105, fd1113; +add.f64 fd1159, fd1121, fd1129; +sub.f64 fd1160, fd1105, fd1113; +sub.f64 fd1161, fd1121, fd1129; +add.f64 %0, fd1130, fd1134; +add.f64 %1, fd1131, fd1135; +add.f64 %2, fd1138, fd1142; +add.f64 %3, fd1139, fd1143; +add.f64 %4, fd1146, fd1150; +add.f64 %5, fd1147, fd1151; +add.f64 %6, fd1154, fd1158; +add.f64 %7, fd1155, fd1159; +sub.f64 %9, fd1133, fd1136; +add.f64 %8, fd1132, fd1137; +sub.f64 %11, fd1141, fd1144; +add.f64 %10, fd1140, fd1145; +sub.f64 %13, fd1149, fd1152; +add.f64 %12, fd1148, fd1153; +sub.f64 %15, fd1157, fd1160; +add.f64 %14, fd1156, fd1161; +sub.f64 %16, fd1130, fd1134; +sub.f64 %17, fd1131, fd1135; +sub.f64 %18, fd1138, fd1142; +sub.f64 %19, fd1139, fd1143; +sub.f64 %20, fd1146, fd1150; +sub.f64 %21, fd1147, fd1151; +sub.f64 %22, fd1154, fd1158; +sub.f64 %23, fd1155, fd1159; +add.f64 %25, fd1133, fd1136; +sub.f64 %24, fd1132, fd1137; +add.f64 %27, fd1141, fd1144; +sub.f64 %26, fd1140, fd1145; +add.f64 %29, fd1149, fd1152; +sub.f64 %28, fd1148, fd1153; +add.f64 %31, fd1157, fd1160; +sub.f64 %30, fd1156, fd1161; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_16384), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), 
"d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..d784de2638dd8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16384_fp64_inv.hpp.inc @@ -0,0 +1,1257 @@ +#ifndef CUFFTDX_FFT_16384_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_16384_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1170, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<1194>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %36, %57; +add.f64 fd66, %37, %59; +sub.f64 fd67, %36, %57; +sub.f64 fd68, %37, %59; +add.f64 fd69, %46, %68; +add.f64 fd70, %48, %69; +sub.f64 fd71, %46, %68; +sub.f64 fd72, %48, %69; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %41, %62; +add.f64 
fd82, %43, %64; +sub.f64 fd83, %41, %62; +sub.f64 fd84, %43, %64; +add.f64 fd85, %52, %73; +add.f64 fd86, %53, %75; +sub.f64 fd87, %52, %73; +sub.f64 fd88, %53, %75; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %60; +add.f64 fd123, %40, %61; +sub.f64 fd124, %38, %60; +sub.f64 fd125, %40, %61; +add.f64 fd126, %49, %70; +add.f64 fd127, %51, %72; +sub.f64 fd128, %49, %70; +sub.f64 fd129, %51, %72; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %44, %65; +add.f64 fd139, %45, %67; +sub.f64 fd140, %44, %65; +sub.f64 fd141, %45, %67; +add.f64 fd142, %54, %76; +add.f64 fd143, %56, %77; +sub.f64 fd144, %54, %76; +sub.f64 fd145, %56, %77; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 
fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; 
+sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, 
fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+16384]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; 
+fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -131072; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 130944; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 
[r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+8192]; +ld.shared.f64 fd390, [r13+16384]; +ld.shared.f64 fd391, [r13+24576]; +ld.shared.f64 fd392, [r13+32768]; +ld.shared.f64 fd393, [r13+40960]; +ld.shared.f64 fd394, [r13+49152]; +ld.shared.f64 fd395, [r13+57344]; +ld.shared.f64 fd396, [r13+65536]; +ld.shared.f64 fd397, [r13+73728]; +ld.shared.f64 fd398, [r13+81920]; +ld.shared.f64 fd399, [r13+90112]; +ld.shared.f64 fd400, [r13+98304]; +ld.shared.f64 fd401, [r13+106496]; +ld.shared.f64 fd402, [r13+114688]; +ld.shared.f64 fd403, [r13+122880]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+8192]; +ld.shared.f64 fd406, [r13+16384]; +ld.shared.f64 fd407, [r13+24576]; +ld.shared.f64 fd408, [r13+32768]; +ld.shared.f64 fd409, [r13+40960]; +ld.shared.f64 fd410, [r13+49152]; +ld.shared.f64 fd411, [r13+57344]; +ld.shared.f64 fd412, [r13+65536]; +ld.shared.f64 fd413, [r13+73728]; +ld.shared.f64 fd414, [r13+81920]; +ld.shared.f64 fd415, [r13+90112]; +ld.shared.f64 fd416, [r13+98304]; +ld.shared.f64 fd417, [r13+106496]; +ld.shared.f64 fd418, [r13+114688]; +ld.shared.f64 fd419, [r13+122880]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 
fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; 
+sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 
0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 1008; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, 
fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; 
+mul.f64 fd669, fd566, fd668; +fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+1024]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; 
+mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 129024; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; +st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+8192]; +ld.shared.f64 fd745, [r20+16384]; +ld.shared.f64 fd746, [r20+24576]; +ld.shared.f64 fd747, [r20+32768]; +ld.shared.f64 fd748, [r20+40960]; +ld.shared.f64 fd749, [r20+49152]; +ld.shared.f64 fd750, [r20+57344]; +ld.shared.f64 fd751, [r20+65536]; +ld.shared.f64 fd752, [r20+73728]; +ld.shared.f64 fd753, [r20+81920]; +ld.shared.f64 fd754, [r20+90112]; +ld.shared.f64 fd755, [r20+98304]; +ld.shared.f64 fd756, [r20+106496]; +ld.shared.f64 fd757, [r20+114688]; +ld.shared.f64 fd758, [r20+122880]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 
[r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+8192]; +ld.shared.f64 fd761, [r20+16384]; +ld.shared.f64 fd762, [r20+24576]; +ld.shared.f64 fd763, [r20+32768]; +ld.shared.f64 fd764, [r20+40960]; +ld.shared.f64 fd765, [r20+49152]; +ld.shared.f64 fd766, [r20+57344]; +ld.shared.f64 fd767, [r20+65536]; +ld.shared.f64 fd768, [r20+73728]; +ld.shared.f64 fd769, [r20+81920]; +ld.shared.f64 fd770, [r20+90112]; +ld.shared.f64 fd771, [r20+98304]; +ld.shared.f64 fd772, [r20+106496]; +ld.shared.f64 fd773, [r20+114688]; +ld.shared.f64 fd774, [r20+122880]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +sub.f64 fd787, fd777, fd782; +add.f64 fd788, fd778, fd781; +add.f64 fd789, fd777, fd782; +sub.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +sub.f64 fd803, fd793, fd798; +add.f64 fd804, fd794, fd797; +add.f64 fd805, fd793, fd798; +sub.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0d3FE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +add.f64 fd810, fd807, fd808; +mul.f64 fd811, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd812, fd806, 0d3FE6A09E667F3BCD; +sub.f64 fd813, fd811, fd812; +mul.f64 fd814, fd806, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd815, fd805, 
0d3FE6A09E667F3BCD, fd814; +add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd810; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd810; +sub.f64 fd824, fd785, fd802; +add.f64 fd825, fd786, fd801; +add.f64 fd826, fd785, fd802; +sub.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd813; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd813; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; +sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +sub.f64 fd844, fd834, fd839; +add.f64 fd845, fd835, fd838; +add.f64 fd846, fd834, fd839; +sub.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +sub.f64 fd860, fd850, fd855; +add.f64 fd861, fd851, fd854; +add.f64 fd862, fd850, fd855; +sub.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0d3FE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +add.f64 fd867, fd864, fd865; +mul.f64 fd868, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd869, fd863, 0d3FE6A09E667F3BCD; +sub.f64 fd870, fd868, fd869; +mul.f64 fd871, fd863, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd872, fd862, 0d3FE6A09E667F3BCD, fd871; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, 
fd867; +sub.f64 fd879, fd844, fd866; +sub.f64 fd880, fd845, fd867; +sub.f64 fd881, fd842, fd859; +add.f64 fd882, fd843, fd858; +add.f64 fd883, fd842, fd859; +sub.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd870; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd870; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0d3FD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0d3FD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 0d3FE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +add.f64 fd897, fd894, fd895; +mul.f64 fd898, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd899, fd886, 0d3FED906BCF328D46; +sub.f64 fd900, fd898, fd899; +mul.f64 fd901, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd902, fd885, 0d3FED906BCF328D46, fd901; +mul.f64 fd903, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd904, fd880, 0d3FED906BCF328D46; +sub.f64 fd905, fd903, fd904; +mul.f64 fd906, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd907, fd879, 0d3FED906BCF328D46, fd906; +mul.f64 fd908, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd909, fd884, 0d3FE6A09E667F3BCD; +sub.f64 fd910, fd908, fd909; +mul.f64 fd911, fd884, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd912, fd883, 0d3FE6A09E667F3BCD, fd911; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0d3FD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0d3FD87DE2A6AEA963, fd916; +add.f64 fd918, fd816, fd873; +add.f64 fd919, fd817, fd874; +sub.f64 fd920, fd816, fd873; +sub.f64 fd921, fd817, fd874; +add.f64 fd922, fd820, fd891; +add.f64 fd923, fd821, fd893; +sub.f64 fd924, fd820, fd891; +sub.f64 fd925, fd821, fd893; +add.f64 fd926, fd824, fd896; +add.f64 fd927, fd825, fd897; +sub.f64 fd928, fd824, fd896; +sub.f64 fd929, fd825, fd897; +add.f64 fd930, fd828, fd900; +add.f64 fd931, fd829, fd902; +sub.f64 fd932, fd828, fd900; +sub.f64 fd933, fd829, fd902; 
+sub.f64 fd934, fd818, fd876; +add.f64 fd935, fd819, fd875; +add.f64 fd936, fd818, fd876; +sub.f64 fd937, fd819, fd875; +add.f64 fd938, fd822, fd905; +add.f64 fd939, fd823, fd907; +sub.f64 fd940, fd822, fd905; +sub.f64 fd941, fd823, fd907; +add.f64 fd942, fd826, fd910; +add.f64 fd943, fd827, fd912; +sub.f64 fd944, fd826, fd910; +sub.f64 fd945, fd827, fd912; +add.f64 fd946, fd830, fd915; +add.f64 fd947, fd831, fd917; +sub.f64 fd948, fd830, fd915; +sub.f64 fd949, fd831, fd917; +and.b32 r21, r5, 768; +bfe.u32 r22, r5, 8, 2; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd950, fd951}, [rd11]; +mul.f64 fd954, fd923, fd951; +fma.rn.f64 fd955, fd950, fd922, fd954; +mul.f64 fd956, fd922, fd951; +mul.f64 fd957, fd950, fd923; +sub.f64 fd958, fd957, fd956; +mul.f64 fd959, fd950, fd950; +mul.f64 fd960, fd951, fd951; +sub.f64 fd961, fd959, fd960; +mul.f64 fd962, fd951, fd950; +fma.rn.f64 fd963, fd951, fd950, fd962; +mul.f64 fd964, fd927, fd963; +fma.rn.f64 fd965, fd961, fd926, fd964; +mul.f64 fd966, fd926, fd963; +mul.f64 fd967, fd961, fd927; +sub.f64 fd968, fd967, fd966; +mul.f64 fd969, fd950, fd961; +mul.f64 fd970, fd951, fd963; +sub.f64 fd971, fd969, fd970; +mul.f64 fd972, fd950, fd963; +fma.rn.f64 fd973, fd951, fd961, fd972; +mul.f64 fd974, fd931, fd973; +fma.rn.f64 fd975, fd971, fd930, fd974; +mul.f64 fd976, fd930, fd973; +mul.f64 fd977, fd971, fd931; +sub.f64 fd978, fd977, fd976; +mul.f64 fd979, fd950, fd971; +mul.f64 fd980, fd951, fd973; +sub.f64 fd981, fd979, fd980; +mul.f64 fd982, fd950, fd973; +fma.rn.f64 fd983, fd951, fd971, fd982; +mul.f64 fd984, fd935, fd983; +fma.rn.f64 fd985, fd981, fd934, fd984; +mul.f64 fd986, fd934, fd983; +mul.f64 fd987, fd981, fd935; +sub.f64 fd988, fd987, fd986; +mul.f64 fd989, fd950, fd981; +mul.f64 fd990, fd951, fd983; +sub.f64 fd991, fd989, fd990; +mul.f64 fd992, fd950, fd983; +fma.rn.f64 fd993, fd951, fd981, fd992; +mul.f64 fd994, fd939, fd993; +fma.rn.f64 fd995, fd991, fd938, fd994; 
+mul.f64 fd996, fd938, fd993; +mul.f64 fd997, fd991, fd939; +sub.f64 fd998, fd997, fd996; +mul.f64 fd999, fd950, fd991; +mul.f64 fd1000, fd951, fd993; +sub.f64 fd1001, fd999, fd1000; +mul.f64 fd1002, fd950, fd993; +fma.rn.f64 fd1003, fd951, fd991, fd1002; +mul.f64 fd1004, fd943, fd1003; +fma.rn.f64 fd1005, fd1001, fd942, fd1004; +mul.f64 fd1006, fd942, fd1003; +mul.f64 fd1007, fd1001, fd943; +sub.f64 fd1008, fd1007, fd1006; +mul.f64 fd1009, fd950, fd1001; +mul.f64 fd1010, fd951, fd1003; +sub.f64 fd1011, fd1009, fd1010; +mul.f64 fd1012, fd950, fd1003; +fma.rn.f64 fd1013, fd951, fd1001, fd1012; +mul.f64 fd1014, fd947, fd1013; +fma.rn.f64 fd1015, fd1011, fd946, fd1014; +mul.f64 fd1016, fd946, fd1013; +mul.f64 fd1017, fd1011, fd947; +sub.f64 fd1018, fd1017, fd1016; +mul.f64 fd1019, fd950, fd1011; +mul.f64 fd1020, fd951, fd1013; +sub.f64 fd1021, fd1019, fd1020; +mul.f64 fd1022, fd950, fd1013; +fma.rn.f64 fd1023, fd951, fd1011, fd1022; +mul.f64 fd1024, fd921, fd1023; +fma.rn.f64 fd1025, fd1021, fd920, fd1024; +mul.f64 fd1026, fd920, fd1023; +mul.f64 fd1027, fd1021, fd921; +sub.f64 fd1028, fd1027, fd1026; +ld.global.v2.f64 {fd1029, fd1030}, [rd11+64]; +mul.f64 fd1033, fd925, fd1030; +fma.rn.f64 fd1034, fd1029, fd924, fd1033; +mul.f64 fd1035, fd924, fd1030; +mul.f64 fd1036, fd1029, fd925; +sub.f64 fd1037, fd1036, fd1035; +mul.f64 fd1038, fd950, fd1029; +mul.f64 fd1039, fd951, fd1030; +sub.f64 fd1040, fd1038, fd1039; +mul.f64 fd1041, fd950, fd1030; +fma.rn.f64 fd1042, fd951, fd1029, fd1041; +mul.f64 fd1043, fd929, fd1042; +fma.rn.f64 fd1044, fd1040, fd928, fd1043; +mul.f64 fd1045, fd928, fd1042; +mul.f64 fd1046, fd1040, fd929; +sub.f64 fd1047, fd1046, fd1045; +mul.f64 fd1048, fd950, fd1040; +mul.f64 fd1049, fd951, fd1042; +sub.f64 fd1050, fd1048, fd1049; +mul.f64 fd1051, fd950, fd1042; +fma.rn.f64 fd1052, fd951, fd1040, fd1051; +mul.f64 fd1053, fd933, fd1052; +fma.rn.f64 fd1054, fd1050, fd932, fd1053; +mul.f64 fd1055, fd932, fd1052; +mul.f64 fd1056, fd1050, fd933; +sub.f64 
fd1057, fd1056, fd1055; +mul.f64 fd1058, fd950, fd1050; +mul.f64 fd1059, fd951, fd1052; +sub.f64 fd1060, fd1058, fd1059; +mul.f64 fd1061, fd950, fd1052; +fma.rn.f64 fd1062, fd951, fd1050, fd1061; +mul.f64 fd1063, fd937, fd1062; +fma.rn.f64 fd1064, fd1060, fd936, fd1063; +mul.f64 fd1065, fd936, fd1062; +mul.f64 fd1066, fd1060, fd937; +sub.f64 fd1067, fd1066, fd1065; +mul.f64 fd1068, fd950, fd1060; +mul.f64 fd1069, fd951, fd1062; +sub.f64 fd1070, fd1068, fd1069; +mul.f64 fd1071, fd950, fd1062; +fma.rn.f64 fd1072, fd951, fd1060, fd1071; +mul.f64 fd1073, fd941, fd1072; +fma.rn.f64 fd1074, fd1070, fd940, fd1073; +mul.f64 fd1075, fd940, fd1072; +mul.f64 fd1076, fd1070, fd941; +sub.f64 fd1077, fd1076, fd1075; +mul.f64 fd1078, fd950, fd1070; +mul.f64 fd1079, fd951, fd1072; +sub.f64 fd1080, fd1078, fd1079; +mul.f64 fd1081, fd950, fd1072; +fma.rn.f64 fd1082, fd951, fd1070, fd1081; +mul.f64 fd1083, fd945, fd1082; +fma.rn.f64 fd1084, fd1080, fd944, fd1083; +mul.f64 fd1085, fd944, fd1082; +mul.f64 fd1086, fd1080, fd945; +sub.f64 fd1087, fd1086, fd1085; +mul.f64 fd1088, fd950, fd1080; +mul.f64 fd1089, fd951, fd1082; +sub.f64 fd1090, fd1088, fd1089; +mul.f64 fd1091, fd950, fd1082; +fma.rn.f64 fd1092, fd951, fd1080, fd1091; +mul.f64 fd1093, fd949, fd1092; +fma.rn.f64 fd1094, fd1090, fd948, fd1093; +mul.f64 fd1095, fd948, fd1092; +mul.f64 fd1096, fd1090, fd949; +sub.f64 fd1097, fd1096, fd1095; +and.b32 r23, r15, 2040; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 98304; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd918; +st.shared.f64 [r26+2048], fd955; +st.shared.f64 [r26+4096], fd965; +st.shared.f64 [r26+6144], fd975; +st.shared.f64 [r26+8192], fd985; +st.shared.f64 [r26+10240], fd995; +st.shared.f64 [r26+12288], fd1005; +st.shared.f64 [r26+14336], fd1015; +st.shared.f64 [r26+16384], fd1025; +st.shared.f64 [r26+18432], fd1034; +st.shared.f64 [r26+20480], fd1044; +st.shared.f64 [r26+22528], fd1054; +st.shared.f64 [r26+24576], fd1064; +st.shared.f64 [r26+26624], 
fd1074; +st.shared.f64 [r26+28672], fd1084; +st.shared.f64 [r26+30720], fd1094; +barrier.sync 0; +mad.lo.s32 r27, r21, -120, r26; +ld.shared.f64 fd1098, [r27]; +ld.shared.f64 fd1099, [r27+8192]; +ld.shared.f64 fd1100, [r27+16384]; +ld.shared.f64 fd1101, [r27+24576]; +ld.shared.f64 fd1102, [r27+32768]; +ld.shared.f64 fd1103, [r27+40960]; +ld.shared.f64 fd1104, [r27+49152]; +ld.shared.f64 fd1105, [r27+57344]; +ld.shared.f64 fd1106, [r27+65536]; +ld.shared.f64 fd1107, [r27+73728]; +ld.shared.f64 fd1108, [r27+81920]; +ld.shared.f64 fd1109, [r27+90112]; +ld.shared.f64 fd1110, [r27+98304]; +ld.shared.f64 fd1111, [r27+106496]; +ld.shared.f64 fd1112, [r27+114688]; +ld.shared.f64 fd1113, [r27+122880]; +barrier.sync 0; +st.shared.f64 [r26], fd919; +st.shared.f64 [r26+2048], fd958; +st.shared.f64 [r26+4096], fd968; +st.shared.f64 [r26+6144], fd978; +st.shared.f64 [r26+8192], fd988; +st.shared.f64 [r26+10240], fd998; +st.shared.f64 [r26+12288], fd1008; +st.shared.f64 [r26+14336], fd1018; +st.shared.f64 [r26+16384], fd1028; +st.shared.f64 [r26+18432], fd1037; +st.shared.f64 [r26+20480], fd1047; +st.shared.f64 [r26+22528], fd1057; +st.shared.f64 [r26+24576], fd1067; +st.shared.f64 [r26+26624], fd1077; +st.shared.f64 [r26+28672], fd1087; +st.shared.f64 [r26+30720], fd1097; +barrier.sync 0; +ld.shared.f64 fd1114, [r27]; +ld.shared.f64 fd1115, [r27+8192]; +ld.shared.f64 fd1116, [r27+16384]; +ld.shared.f64 fd1117, [r27+24576]; +ld.shared.f64 fd1118, [r27+32768]; +ld.shared.f64 fd1119, [r27+40960]; +ld.shared.f64 fd1120, [r27+49152]; +ld.shared.f64 fd1121, [r27+57344]; +ld.shared.f64 fd1122, [r27+65536]; +ld.shared.f64 fd1123, [r27+73728]; +ld.shared.f64 fd1124, [r27+81920]; +ld.shared.f64 fd1125, [r27+90112]; +ld.shared.f64 fd1126, [r27+98304]; +ld.shared.f64 fd1127, [r27+106496]; +ld.shared.f64 fd1128, [r27+114688]; +ld.shared.f64 fd1129, [r27+122880]; +add.f64 fd1130, fd1098, fd1106; +add.f64 fd1131, fd1114, fd1122; +sub.f64 fd1132, fd1098, fd1106; +sub.f64 fd1133, fd1114, fd1122; 
+add.f64 fd1134, fd1102, fd1110; +add.f64 fd1135, fd1118, fd1126; +sub.f64 fd1136, fd1102, fd1110; +sub.f64 fd1137, fd1118, fd1126; +add.f64 fd1138, fd1099, fd1107; +add.f64 fd1139, fd1115, fd1123; +sub.f64 fd1140, fd1099, fd1107; +sub.f64 fd1141, fd1115, fd1123; +add.f64 fd1142, fd1103, fd1111; +add.f64 fd1143, fd1119, fd1127; +sub.f64 fd1144, fd1103, fd1111; +sub.f64 fd1145, fd1119, fd1127; +add.f64 fd1146, fd1100, fd1108; +add.f64 fd1147, fd1116, fd1124; +sub.f64 fd1148, fd1100, fd1108; +sub.f64 fd1149, fd1116, fd1124; +add.f64 fd1150, fd1104, fd1112; +add.f64 fd1151, fd1120, fd1128; +sub.f64 fd1152, fd1104, fd1112; +sub.f64 fd1153, fd1120, fd1128; +add.f64 fd1154, fd1101, fd1109; +add.f64 fd1155, fd1117, fd1125; +sub.f64 fd1156, fd1101, fd1109; +sub.f64 fd1157, fd1117, fd1125; +add.f64 fd1158, fd1105, fd1113; +add.f64 fd1159, fd1121, fd1129; +sub.f64 fd1160, fd1105, fd1113; +sub.f64 fd1161, fd1121, fd1129; +add.f64 %0, fd1130, fd1134; +add.f64 %1, fd1131, fd1135; +add.f64 %2, fd1138, fd1142; +add.f64 %3, fd1139, fd1143; +add.f64 %4, fd1146, fd1150; +add.f64 %5, fd1147, fd1151; +add.f64 %6, fd1154, fd1158; +add.f64 %7, fd1155, fd1159; +add.f64 %9, fd1133, fd1136; +sub.f64 %8, fd1132, fd1137; +add.f64 %11, fd1141, fd1144; +sub.f64 %10, fd1140, fd1145; +add.f64 %13, fd1149, fd1152; +sub.f64 %12, fd1148, fd1153; +add.f64 %15, fd1157, fd1160; +sub.f64 %14, fd1156, fd1161; +sub.f64 %16, fd1130, fd1134; +sub.f64 %17, fd1131, fd1135; +sub.f64 %18, fd1138, fd1142; +sub.f64 %19, fd1139, fd1143; +sub.f64 %20, fd1146, fd1150; +sub.f64 %21, fd1147, fd1151; +sub.f64 %22, fd1154, fd1158; +sub.f64 %23, fd1155, fd1159; +sub.f64 %25, fd1133, fd1136; +add.f64 %24, fd1132, fd1137; +sub.f64 %27, fd1141, fd1144; +add.f64 %26, fd1140, fd1145; +sub.f64 %29, fd1149, fd1152; +add.f64 %28, fd1148, fd1153; +sub.f64 %31, fd1157, fd1160; +add.f64 %30, fd1156, fd1161; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), 
"=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_16384), "l"(lut_dp_16_1024), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..d1a1fee2722b4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_fwd.hpp.inc @@ -0,0 +1,3037 @@ +#ifndef CUFFTDX_FFT_16_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_16_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<771, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<59>; +.reg .b32 
r<559>; +.reg .f64 fd<59>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %32, %48; +} +{ +add.f16x2 r4, %33, %49; +} +{ +sub.f16x2 r7, %32, %48; +} +{ +sub.f16x2 r10, %33, %49; +} +{ +add.f16x2 r13, %40, %56; +} +{ +add.f16x2 r16, %41, %57; +} +{ +sub.f16x2 r19, %40, %56; +} +{ +sub.f16x2 r22, %41, %57; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %36, %52; +} +{ +add.f16x2 r54, %37, %53; +} +{ +sub.f16x2 r57, %36, %52; +} +{ +sub.f16x2 r60, %37, %53; +} +{ +add.f16x2 r63, %44, %60; +} +{ +add.f16x2 r66, %45, %61; +} +{ +sub.f16x2 r69, %44, %60; +} +{ +sub.f16x2 r72, %45, %61; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f64 fd31, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd31; +} +mov.f64 fd40, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs2, fd40; +} +{ +cvt.rn.f16.f64 rs5, fd40; +} +{ +cvt.rn.f16.f64 rs6, fd40; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r83; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 r135, r27, r77; +} +{ +add.f16x2 r138, r30, r80; +} +{ +sub.f16x2 r141, r27, r77; +} +{ +sub.f16x2 r144, r30, r80; +} +{ +add.f16x2 r147, r39, r107; +} +{ +add.f16x2 r150, 
r42, r113; +} +{ +sub.f16x2 r153, r39, r107; +} +{ +sub.f16x2 r156, r42, r113; +} +{ +add.f16x2 r159, r33, r86; +} +{ +add.f16x2 r162, r36, r117; +} +{ +sub.f16x2 r165, r33, r86; +} +{ +sub.f16x2 r168, r36, r117; +} +{ +add.f16x2 r171, r45, r125; +} +{ +add.f16x2 r174, r48, r131; +} +{ +sub.f16x2 r177, r45, r125; +} +{ +sub.f16x2 r180, r48, r131; +} +{ +add.f16x2 r183, %34, %50; +} +{ +add.f16x2 r186, %35, %51; +} +{ +sub.f16x2 r189, %34, %50; +} +{ +sub.f16x2 r192, %35, %51; +} +{ +add.f16x2 r195, %42, %58; +} +{ +add.f16x2 r198, %43, %59; +} +{ +sub.f16x2 r201, %42, %58; +} +{ +sub.f16x2 r204, %43, %59; +} +{ +neg.f16x2 r207, r201; +} +{ +add.f16x2 r209, r183, r195; +} +{ +add.f16x2 r212, r186, r198; +} +{ +sub.f16x2 r215, r183, r195; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +add.f16x2 r221, r189, r204; +} +{ +add.f16x2 r224, r192, r207; +} +{ +sub.f16x2 r227, r189, r204; +} +{ +sub.f16x2 r230, r192, r207; +} +{ +add.f16x2 r233, %38, %54; +} +{ +add.f16x2 r236, %39, %55; +} +{ +sub.f16x2 r239, %38, %54; +} +{ +sub.f16x2 r242, %39, %55; +} +{ +add.f16x2 r245, %46, %62; +} +{ +add.f16x2 r248, %47, %63; +} +{ +sub.f16x2 r251, %46, %62; +} +{ +sub.f16x2 r254, %47, %63; +} +{ +neg.f16x2 r257, r251; +} +{ +add.f16x2 r259, r233, r245; +} +{ +add.f16x2 r262, r236, r248; +} +{ +sub.f16x2 r265, r233, r245; +} +{ +sub.f16x2 r268, r236, r248; +} +{ +add.f16x2 r271, r239, r254; +} +{ +add.f16x2 r274, r242, r257; +} +{ +sub.f16x2 r277, r239, r254; +} +{ +sub.f16x2 r280, r242, r257; +} +{ +cvt.rn.f16.f64 rs15, fd31; +} +{ +cvt.rn.f16.f64 rs16, fd40; +} +{ +cvt.rn.f16.f64 rs19, fd40; +} +{ +cvt.rn.f16.f64 rs20, fd40; +} +mov.b32 r297, {rs15, rs15}; +{ +mul.f16x2 r283, r271, r297; +} +mov.b32 r294, {rs16, rs16}; +{ +mul.f16x2 r286, r274, r294; +} +{ +sub.f16x2 r289, r283, r286; +} +{ +mul.f16x2 r292, r271, r294; +} +{ +fma.rn.f16x2 r295, r274, r297, r292; +} +{ +neg.f16x2 r299, r265; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r277, r315; +} +mov.b32 r312, {rs20, rs20}; 
+{ +mul.f16x2 r304, r280, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r277, r312; +} +{ +fma.rn.f16x2 r313, r280, r315, r310; +} +{ +add.f16x2 r317, r209, r259; +} +{ +add.f16x2 r320, r212, r262; +} +{ +sub.f16x2 r323, r209, r259; +} +{ +sub.f16x2 r326, r212, r262; +} +{ +add.f16x2 r329, r221, r289; +} +{ +add.f16x2 r332, r224, r295; +} +{ +sub.f16x2 r335, r221, r289; +} +{ +sub.f16x2 r338, r224, r295; +} +{ +add.f16x2 r341, r215, r268; +} +{ +add.f16x2 r344, r218, r299; +} +{ +sub.f16x2 r347, r215, r268; +} +{ +sub.f16x2 r350, r218, r299; +} +{ +add.f16x2 r353, r227, r307; +} +{ +add.f16x2 r356, r230, r313; +} +{ +sub.f16x2 r359, r227, r307; +} +{ +sub.f16x2 r362, r230, r313; +} +mov.f64 fd29, 0d3FED906BCF328D46; +{ +cvt.rn.f16.f64 rs29, fd29; +} +mov.f64 fd42, 0dBFD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs30, fd42; +} +{ +cvt.rn.f16.f64 rs31, fd31; +} +{ +cvt.rn.f16.f64 rs32, fd40; +} +mov.f64 fd33, 0d3FD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs33, fd33; +} +mov.f64 fd41, 0dBFED906BCF328D46; +{ +cvt.rn.f16.f64 rs34, fd41; +} +{ +cvt.rn.f16.f64 rs37, fd42; +} +{ +cvt.rn.f16.f64 rs38, fd41; +} +{ +cvt.rn.f16.f64 rs39, fd40; +} +{ +cvt.rn.f16.f64 rs40, fd40; +} +{ +cvt.rn.f16.f64 rs41, fd41; +} +{ +cvt.rn.f16.f64 rs42, fd42; +} +mov.b32 r379, {rs29, rs29}; +{ +mul.f16x2 r365, r329, r379; +} +mov.b32 r376, {rs30, rs30}; +{ +mul.f16x2 r368, r332, r376; +} +{ +sub.f16x2 r371, r365, r368; +} +{ +mul.f16x2 r374, r329, r376; +} +{ +fma.rn.f16x2 r377, r332, r379, r374; +} +mov.b32 r395, {rs31, rs31}; +{ +mul.f16x2 r381, r341, r395; +} +mov.b32 r392, {rs32, rs32}; +{ +mul.f16x2 r384, r344, r392; +} +{ +sub.f16x2 r387, r381, r384; +} +{ +mul.f16x2 r390, r341, r392; +} +{ +fma.rn.f16x2 r393, r344, r395, r390; +} +mov.b32 r411, {rs33, rs33}; +{ +mul.f16x2 r397, r353, r411; +} +mov.b32 r408, {rs34, rs34}; +{ +mul.f16x2 r400, r356, r408; +} +{ +sub.f16x2 r403, r397, r400; +} +{ +mul.f16x2 r406, r353, r408; +} +{ +fma.rn.f16x2 r409, r356, r411, r406; +} +{ +neg.f16x2 
r413, r323; +} +mov.b32 r429, {rs37, rs37}; +{ +mul.f16x2 r415, r335, r429; +} +mov.b32 r426, {rs38, rs38}; +{ +mul.f16x2 r418, r338, r426; +} +{ +sub.f16x2 r421, r415, r418; +} +{ +mul.f16x2 r424, r335, r426; +} +{ +fma.rn.f16x2 r427, r338, r429, r424; +} +mov.b32 r445, {rs39, rs39}; +{ +mul.f16x2 r431, r347, r445; +} +mov.b32 r442, {rs40, rs40}; +{ +mul.f16x2 r434, r350, r442; +} +{ +sub.f16x2 r437, r431, r434; +} +{ +mul.f16x2 r440, r347, r442; +} +{ +fma.rn.f16x2 r443, r350, r445, r440; +} +mov.b32 r461, {rs41, rs41}; +{ +mul.f16x2 r447, r359, r461; +} +mov.b32 r458, {rs42, rs42}; +{ +mul.f16x2 r450, r362, r458; +} +{ +sub.f16x2 r453, r447, r450; +} +{ +mul.f16x2 r456, r359, r458; +} +{ +fma.rn.f16x2 r459, r362, r461, r456; +} +{ +add.f16x2 %0, r135, r317; +} +{ +add.f16x2 %1, r138, r320; +} +{ +sub.f16x2 %16, r135, r317; +} +{ +sub.f16x2 %17, r138, r320; +} +{ +add.f16x2 %2, r147, r371; +} +{ +add.f16x2 %3, r150, r377; +} +{ +sub.f16x2 %18, r147, r371; +} +{ +sub.f16x2 %19, r150, r377; +} +{ +add.f16x2 %4, r159, r387; +} +{ +add.f16x2 %5, r162, r393; +} +{ +sub.f16x2 %20, r159, r387; +} +{ +sub.f16x2 %21, r162, r393; +} +{ +add.f16x2 %6, r171, r403; +} +{ +add.f16x2 %7, r174, r409; +} +{ +sub.f16x2 %22, r171, r403; +} +{ +sub.f16x2 %23, r174, r409; +} +{ +add.f16x2 %8, r141, r326; +} +{ +add.f16x2 %9, r144, r413; +} +{ +sub.f16x2 %24, r141, r326; +} +{ +sub.f16x2 %25, r144, r413; +} +{ +add.f16x2 %10, r153, r421; +} +{ +add.f16x2 %11, r156, r427; +} +{ +sub.f16x2 %26, r153, r421; +} +{ +sub.f16x2 %27, r156, r427; +} +{ +add.f16x2 %12, r165, r437; +} +{ +add.f16x2 %13, r168, r443; +} +{ +sub.f16x2 %28, r165, r437; +} +{ +sub.f16x2 %29, r168, r443; +} +{ +add.f16x2 %14, r177, r453; +} +{ +add.f16x2 %15, r180, r459; +} +{ +sub.f16x2 %30, r177, r453; +} +{ +sub.f16x2 %31, r180, r459; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<772, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<227>; +.reg .b64 rd<2>; +mov.u32 r215, %tid.y; +shl.b32 r216, r215, 6; +mov.u32 r217, %8; +add.s32 r218, r217, r216; +mov.u32 r219, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r220, r219, 3; +shl.b32 r221, r219, 4; +and.b32 r222, r221, -64; +add.s32 r223, r218, r222; +cvt.rn.f32.u32 f11, r220; +mul.f32 f12, f11, 0f3EC90FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r224, r221, 48; +add.s32 r225, r223, r224; +st.shared.v4.f32 [r225], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r226, r220, -12, r225; +ld.shared.u32 r166, [r226]; +ld.shared.u32 r178, [r226+16]; +ld.shared.u32 r167, [r226+32]; +ld.shared.u32 r179, [r226+48]; +barrier.sync 0; +st.shared.v4.f32 [r225], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r226]; +ld.shared.u32 r181, [r226+16]; +ld.shared.u32 r170, [r226+32]; +ld.shared.u32 r182, [r226+48]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 %0, r165, r177; +} +{ +add.f16x2 %1, r168, r180; +} 
+{ +sub.f16x2 %4, r165, r177; +} +{ +sub.f16x2 %5, r168, r180; +} +{ +add.f16x2 %2, r171, r186; +} +{ +add.f16x2 %3, r174, r189; +} +{ +sub.f16x2 %6, r171, r186; +} +{ +sub.f16x2 %7, r174, r189; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<773, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<227>; +.reg .b64 rd<2>; +mov.u32 r215, %tid.y; +shl.b32 r216, r215, 7; +mov.u32 r217, %8; +add.s32 r218, r217, r216; +mov.u32 r219, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r220, r219, 3; +shl.b32 r221, r219, 5; +and.b32 r222, r221, -128; +add.s32 r223, r218, r222; +cvt.rn.f32.u32 f11, r220; +mul.f32 f12, f11, 0f3EC90FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r224, r221, 96; +add.s32 r225, r223, 
r224; +st.shared.v4.f32 [r225], {r27, r30, r63, r70}; +st.shared.v4.f32 [r225+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r226, r220, -24, r225; +ld.shared.u32 r166, [r226]; +ld.shared.u32 r169, [r226+4]; +ld.shared.u32 r178, [r226+32]; +ld.shared.u32 r181, [r226+36]; +ld.shared.u32 r167, [r226+64]; +ld.shared.u32 r170, [r226+68]; +ld.shared.u32 r179, [r226+96]; +ld.shared.u32 r182, [r226+100]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 %0, r165, r177; +} +{ +add.f16x2 %1, r168, r180; +} +{ +sub.f16x2 %4, r165, r177; +} +{ +sub.f16x2 %5, r168, r180; +} +{ +add.f16x2 %2, r171, r186; +} +{ +add.f16x2 %3, r174, r189; +} +{ +sub.f16x2 %6, r171, r186; +} +{ +sub.f16x2 %7, r174, r189; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<774, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<519>; +.reg .b64 rd<2>; +mov.u32 r507, %tid.y; +shl.b32 r508, r507, 6; +mov.u32 r509, %16; +add.s32 r510, r509, r508; +mov.u32 r511, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 
r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f2, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f2; +cvt.rn.f16.f32 high, f2; +mov.b32 r101, {low, high}; +} +mov.f32 f12, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ 
+add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r512, r511, 1; +shl.b32 r513, r511, 5; +and.b32 r514, r513, -64; +add.s32 r515, r510, r514; +cvt.rn.f32.u32 f47, r512; +mul.f32 f48, f47, 0f3EC90FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; 
+mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, 
r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r516, r513, 32; +add.s32 r517, r515, r516; +st.shared.v4.f32 [r517], {r149, r209, r246, r283}; +st.shared.v4.f32 [r517+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r518, r512, -28, r517; +ld.shared.u32 r460, [r518]; +ld.shared.u32 r472, [r518+8]; +ld.shared.u32 r484, [r518+16]; +ld.shared.u32 r496, [r518+24]; +ld.shared.u32 r461, [r518+32]; +ld.shared.u32 r473, [r518+40]; +ld.shared.u32 r485, [r518+48]; +ld.shared.u32 r497, [r518+56]; +barrier.sync 0; +st.shared.v4.f32 [r517], {r152, r216, r253, r290}; +st.shared.v4.f32 [r517+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r518]; +ld.shared.u32 r475, [r518+8]; +ld.shared.u32 r487, [r518+16]; +ld.shared.u32 r499, [r518+24]; +ld.shared.u32 r464, [r518+32]; +ld.shared.u32 r476, [r518+40]; +ld.shared.u32 r488, [r518+48]; +ld.shared.u32 r500, [r518+56]; +{ +add.f16x2 %0, r460, r461; +} +{ +add.f16x2 %1, r463, r464; +} +{ +sub.f16x2 %8, r460, r461; +} +{ +sub.f16x2 %9, r463, r464; +} +{ +add.f16x2 %2, r472, r473; +} +{ +add.f16x2 %3, r475, r476; +} +{ +sub.f16x2 %10, r472, r473; +} +{ +sub.f16x2 %11, r475, r476; +} +{ +add.f16x2 %4, r484, r485; +} +{ +add.f16x2 %5, r487, r488; +} +{ +sub.f16x2 %12, r484, r485; +} +{ +sub.f16x2 %13, r487, r488; +} +{ +add.f16x2 %6, r496, r497; +} +{ +add.f16x2 %7, r499, r500; +} +{ +sub.f16x2 %14, r496, r497; +} +{ +sub.f16x2 %15, r499, r500; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), 
"=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<775, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<519>; +.reg .b64 rd<2>; +mov.u32 r507, %tid.y; +shl.b32 r508, r507, 7; +mov.u32 r509, %16; +add.s32 r510, r509, r508; +mov.u32 r511, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f2, 0f3F3504F3; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f2; +cvt.rn.f16.f32 high, f2; +mov.b32 r101, {low, high}; +} +mov.f32 f12, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r512, r511, 1; +shl.b32 r513, r511, 6; +and.b32 r514, r513, -128; +add.s32 r515, r510, r514; +cvt.rn.f32.u32 f47, r512; +mul.f32 f48, f47, 0f3EC90FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ 
+fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; 
+cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r516, r513, 64; +add.s32 r517, r515, r516; +st.shared.v4.f32 [r517], {r149, r152, r209, r216}; +st.shared.v4.f32 [r517+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r517+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r517+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r518, r512, -56, r517; +ld.shared.u32 r460, [r518]; +ld.shared.u32 r463, [r518+4]; +ld.shared.u32 r472, [r518+16]; +ld.shared.u32 r475, [r518+20]; +ld.shared.u32 r484, [r518+32]; +ld.shared.u32 r487, [r518+36]; +ld.shared.u32 r496, [r518+48]; +ld.shared.u32 r499, [r518+52]; +ld.shared.u32 r461, [r518+64]; +ld.shared.u32 r464, [r518+68]; +ld.shared.u32 r473, [r518+80]; +ld.shared.u32 r476, [r518+84]; +ld.shared.u32 r485, [r518+96]; +ld.shared.u32 r488, [r518+100]; +ld.shared.u32 r497, [r518+112]; +ld.shared.u32 r500, [r518+116]; +{ +add.f16x2 %0, r460, r461; +} +{ +add.f16x2 
%1, r463, r464; +} +{ +sub.f16x2 %8, r460, r461; +} +{ +sub.f16x2 %9, r463, r464; +} +{ +add.f16x2 %2, r472, r473; +} +{ +add.f16x2 %3, r475, r476; +} +{ +sub.f16x2 %10, r472, r473; +} +{ +sub.f16x2 %11, r475, r476; +} +{ +add.f16x2 %4, r484, r485; +} +{ +add.f16x2 %5, r487, r488; +} +{ +sub.f16x2 %12, r484, r485; +} +{ +sub.f16x2 %13, r487, r488; +} +{ +add.f16x2 %6, r496, r497; +} +{ +add.f16x2 %7, r499, r500; +} +{ +sub.f16x2 %14, r496, r497; +} +{ +sub.f16x2 %15, r499, r500; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<776, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<28>; +.reg .b32 r<197>; +.reg .b64 rd<2>; +mov.u32 r169, %tid.y; +shl.b32 r170, r169, 6; +mov.u32 r171, %4; +add.s32 r172, r171, r170; +mov.u32 r173, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r174, 
r173, 7; +shl.b32 r175, r173, 3; +and.b32 r176, r175, -64; +add.s32 r177, r172, r176; +cvt.rn.f32.u32 f19, r174; +mul.f32 f20, f19, 0f3EC90FDB; +cos.approx.f32 f1, f20; +sin.approx.f32 f21, f20; +neg.f32 f2, f21; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r178, r175, 56; +add.s32 r179, r177, r178; +st.shared.v2.f32 [r179], {r1, r25}; +barrier.sync 0; +shl.b32 r180, r173, 2; +and.b32 r181, r180, 28; +sub.s32 r182, r179, r181; +ld.shared.u32 r54, [r182]; +ld.shared.u32 r55, [r182+32]; +barrier.sync 0; +st.shared.v2.f32 [r179], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r182]; +ld.shared.u32 r58, [r182+32]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r183, r173, 1, 2; +and.b32 r184, r180, 4; +add.s32 r185, r177, r184; +cvt.rn.f32.u32 f22, r183; +mul.f32 f23, f22, 0f3F490FDB; +cos.approx.f32 f7, f23; +sin.approx.f32 f24, f23; +neg.f32 f8, f24; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r186, r175, 48; +add.s32 r187, r185, r186; +st.shared.u32 [r187], r53; +st.shared.u32 [r187+8], r77; +barrier.sync 0; +and.b32 r188, r180, 24; +sub.s32 r189, 
r187, r188; +ld.shared.u32 r106, [r189]; +ld.shared.u32 r107, [r189+32]; +barrier.sync 0; +st.shared.u32 [r187], r56; +st.shared.u32 [r187+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r189]; +ld.shared.u32 r110, [r189+32]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r190, r173, 2, 1; +and.b32 r191, r180, 12; +add.s32 r192, r177, r191; +cvt.rn.f32.u32 f25, r190; +mul.f32 f26, f25, 0f3FC90FDB; +cos.approx.f32 f13, f26; +sin.approx.f32 f27, f26; +neg.f32 f14, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r193, r175, 32; +add.s32 r194, r192, r193; +st.shared.u32 [r194], r105; +st.shared.u32 [r194+16], r129; +barrier.sync 0; +and.b32 r195, r180, 16; +sub.s32 r196, r194, r195; +ld.shared.u32 r158, [r196]; +ld.shared.u32 r159, [r196+32]; +barrier.sync 0; +st.shared.u32 [r194], r108; +st.shared.u32 [r194+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r196]; +ld.shared.u32 r162, [r196+32]; +{ +add.f16x2 %0, r158, r159; +} +{ +add.f16x2 %1, r161, r162; +} +{ +sub.f16x2 %2, r158, r159; +} +{ +sub.f16x2 %3, r161, r162; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<777, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<28>; +.reg .b32 r<197>; +.reg .b64 rd<2>; +mov.u32 r169, %tid.y; +shl.b32 r170, r169, 7; +mov.u32 r171, %4; +add.s32 r172, r171, r170; +mov.u32 r173, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r174, r173, 7; +shl.b32 r175, r173, 4; +and.b32 r176, r175, -128; +add.s32 r177, r172, r176; +cvt.rn.f32.u32 f19, r174; +mul.f32 f20, f19, 0f3EC90FDB; +cos.approx.f32 f1, f20; +sin.approx.f32 f21, f20; +neg.f32 f2, f21; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r178, r175, 112; +add.s32 r179, r177, r178; +st.shared.v2.f32 [r179], {r1, r4}; +st.shared.v2.f32 [r179+8], {r25, r32}; +barrier.sync 0; +shl.b32 r180, r173, 3; +and.b32 r181, r180, 56; +sub.s32 r182, r179, r181; +ld.shared.u32 r54, [r182]; +ld.shared.u32 r57, [r182+4]; +ld.shared.u32 r55, [r182+64]; +ld.shared.u32 r58, [r182+68]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r183, r173, 1, 2; +cvt.rn.f32.u32 f22, r183; +mul.f32 f23, f22, 0f3F490FDB; +cos.approx.f32 f7, f23; +sin.approx.f32 f24, f23; +neg.f32 f8, f24; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, 
r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r184, r180, 8; +add.s32 r185, r177, r184; +barrier.sync 0; +and.b32 r186, r175, 96; +add.s32 r187, r185, r186; +st.shared.u32 [r187], r53; +st.shared.u32 [r187+4], r56; +st.shared.u32 [r187+16], r77; +st.shared.u32 [r187+20], r84; +barrier.sync 0; +and.b32 r188, r180, 48; +sub.s32 r189, r187, r188; +ld.shared.u32 r106, [r189]; +ld.shared.u32 r109, [r189+4]; +ld.shared.u32 r107, [r189+64]; +ld.shared.u32 r110, [r189+68]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r190, r173, 2, 1; +cvt.rn.f32.u32 f25, r190; +mul.f32 f26, f25, 0f3FC90FDB; +cos.approx.f32 f13, f26; +sin.approx.f32 f27, f26; +neg.f32 f14, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r191, r180, 24; +add.s32 r192, r177, r191; +barrier.sync 0; +and.b32 r193, r175, 64; +add.s32 r194, r192, r193; +st.shared.u32 [r194], r105; +st.shared.u32 [r194+4], r108; +st.shared.u32 [r194+32], r129; +st.shared.u32 [r194+36], r136; +barrier.sync 0; +and.b32 r195, r180, 32; +sub.s32 r196, r194, r195; +ld.shared.u32 r158, [r196]; +ld.shared.u32 r161, [r196+4]; +ld.shared.u32 r159, [r196+64]; +ld.shared.u32 r162, [r196+68]; +{ +add.f16x2 %0, r158, r159; +} +{ +add.f16x2 %1, r161, r162; +} +{ +sub.f16x2 %2, r158, r159; +} +{ +sub.f16x2 %3, r161, r162; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..0483e4d5bf328 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp16_inv.hpp.inc @@ -0,0 +1,3037 @@ +#ifndef CUFFTDX_FFT_16_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_16_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<973, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<59>; +.reg .b32 r<559>; +.reg .f64 fd<59>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %32, %48; +} +{ +add.f16x2 r4, %33, %49; +} +{ +sub.f16x2 r7, %32, %48; +} +{ +sub.f16x2 r10, %33, %49; +} +{ +add.f16x2 r13, %40, %56; +} +{ +add.f16x2 r16, %41, %57; +} +{ +sub.f16x2 r19, %40, %56; +} +{ +sub.f16x2 r22, %41, %57; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %36, %52; +} +{ +add.f16x2 r54, %37, %53; +} +{ +sub.f16x2 r57, %36, %52; +} +{ +sub.f16x2 r60, %37, %53; +} +{ +add.f16x2 r63, %44, %60; +} +{ +add.f16x2 r66, %45, %61; +} +{ +sub.f16x2 r69, %44, %60; +} +{ +sub.f16x2 r72, %45, %61; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ 
+sub.f16x2 r98, r60, r69; +} +mov.f64 fd40, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd40; +} +{ +cvt.rn.f16.f64 rs2, fd40; +} +mov.f64 fd39, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs5, fd39; +} +{ +cvt.rn.f16.f64 rs6, fd40; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r86; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 r135, r27, r77; +} +{ +add.f16x2 r138, r30, r80; +} +{ +sub.f16x2 r141, r27, r77; +} +{ +sub.f16x2 r144, r30, r80; +} +{ +add.f16x2 r147, r39, r107; +} +{ +add.f16x2 r150, r42, r113; +} +{ +sub.f16x2 r153, r39, r107; +} +{ +sub.f16x2 r156, r42, r113; +} +{ +add.f16x2 r159, r33, r117; +} +{ +add.f16x2 r162, r36, r83; +} +{ +sub.f16x2 r165, r33, r117; +} +{ +sub.f16x2 r168, r36, r83; +} +{ +add.f16x2 r171, r45, r125; +} +{ +add.f16x2 r174, r48, r131; +} +{ +sub.f16x2 r177, r45, r125; +} +{ +sub.f16x2 r180, r48, r131; +} +{ +add.f16x2 r183, %34, %50; +} +{ +add.f16x2 r186, %35, %51; +} +{ +sub.f16x2 r189, %34, %50; +} +{ +sub.f16x2 r192, %35, %51; +} +{ +add.f16x2 r195, %42, %58; +} +{ +add.f16x2 r198, %43, %59; +} +{ +sub.f16x2 r201, %42, %58; +} +{ +sub.f16x2 r204, %43, %59; +} +{ +neg.f16x2 r207, r204; +} +{ +add.f16x2 r209, r183, r195; +} +{ +add.f16x2 r212, r186, r198; +} +{ +sub.f16x2 r215, r183, r195; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +add.f16x2 r221, r189, r207; +} +{ +add.f16x2 r224, r192, r201; +} +{ +sub.f16x2 r227, r189, r207; +} +{ +sub.f16x2 r230, r192, r201; +} +{ +add.f16x2 r233, %38, %54; +} +{ +add.f16x2 r236, %39, %55; +} +{ +sub.f16x2 r239, %38, %54; +} +{ +sub.f16x2 r242, %39, %55; +} +{ +add.f16x2 r245, %46, %62; 
+} +{ +add.f16x2 r248, %47, %63; +} +{ +sub.f16x2 r251, %46, %62; +} +{ +sub.f16x2 r254, %47, %63; +} +{ +neg.f16x2 r257, r254; +} +{ +add.f16x2 r259, r233, r245; +} +{ +add.f16x2 r262, r236, r248; +} +{ +sub.f16x2 r265, r233, r245; +} +{ +sub.f16x2 r268, r236, r248; +} +{ +add.f16x2 r271, r239, r257; +} +{ +add.f16x2 r274, r242, r251; +} +{ +sub.f16x2 r277, r239, r257; +} +{ +sub.f16x2 r280, r242, r251; +} +{ +cvt.rn.f16.f64 rs15, fd40; +} +{ +cvt.rn.f16.f64 rs16, fd40; +} +{ +cvt.rn.f16.f64 rs19, fd39; +} +{ +cvt.rn.f16.f64 rs20, fd40; +} +mov.b32 r297, {rs15, rs15}; +{ +mul.f16x2 r283, r271, r297; +} +mov.b32 r294, {rs16, rs16}; +{ +mul.f16x2 r286, r274, r294; +} +{ +sub.f16x2 r289, r283, r286; +} +{ +mul.f16x2 r292, r271, r294; +} +{ +fma.rn.f16x2 r295, r274, r297, r292; +} +{ +neg.f16x2 r299, r268; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r277, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r280, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r277, r312; +} +{ +fma.rn.f16x2 r313, r280, r315, r310; +} +{ +add.f16x2 r317, r209, r259; +} +{ +add.f16x2 r320, r212, r262; +} +{ +sub.f16x2 r323, r209, r259; +} +{ +sub.f16x2 r326, r212, r262; +} +{ +add.f16x2 r329, r221, r289; +} +{ +add.f16x2 r332, r224, r295; +} +{ +sub.f16x2 r335, r221, r289; +} +{ +sub.f16x2 r338, r224, r295; +} +{ +add.f16x2 r341, r215, r299; +} +{ +add.f16x2 r344, r218, r265; +} +{ +sub.f16x2 r347, r215, r299; +} +{ +sub.f16x2 r350, r218, r265; +} +{ +add.f16x2 r353, r227, r307; +} +{ +add.f16x2 r356, r230, r313; +} +{ +sub.f16x2 r359, r227, r307; +} +{ +sub.f16x2 r362, r230, r313; +} +mov.f64 fd38, 0d3FED906BCF328D46; +{ +cvt.rn.f16.f64 rs29, fd38; +} +mov.f64 fd42, 0d3FD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs30, fd42; +} +{ +cvt.rn.f16.f64 rs31, fd40; +} +{ +cvt.rn.f16.f64 rs32, fd40; +} +{ +cvt.rn.f16.f64 rs33, fd42; +} +{ +cvt.rn.f16.f64 rs34, fd38; +} +mov.f64 fd37, 0dBFD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs37, fd37; +} +{ +cvt.rn.f16.f64 rs38, fd38; +} +{ 
+cvt.rn.f16.f64 rs39, fd39; +} +{ +cvt.rn.f16.f64 rs40, fd40; +} +mov.f64 fd41, 0dBFED906BCF328D46; +{ +cvt.rn.f16.f64 rs41, fd41; +} +{ +cvt.rn.f16.f64 rs42, fd42; +} +mov.b32 r379, {rs29, rs29}; +{ +mul.f16x2 r365, r329, r379; +} +mov.b32 r376, {rs30, rs30}; +{ +mul.f16x2 r368, r332, r376; +} +{ +sub.f16x2 r371, r365, r368; +} +{ +mul.f16x2 r374, r329, r376; +} +{ +fma.rn.f16x2 r377, r332, r379, r374; +} +mov.b32 r395, {rs31, rs31}; +{ +mul.f16x2 r381, r341, r395; +} +mov.b32 r392, {rs32, rs32}; +{ +mul.f16x2 r384, r344, r392; +} +{ +sub.f16x2 r387, r381, r384; +} +{ +mul.f16x2 r390, r341, r392; +} +{ +fma.rn.f16x2 r393, r344, r395, r390; +} +mov.b32 r411, {rs33, rs33}; +{ +mul.f16x2 r397, r353, r411; +} +mov.b32 r408, {rs34, rs34}; +{ +mul.f16x2 r400, r356, r408; +} +{ +sub.f16x2 r403, r397, r400; +} +{ +mul.f16x2 r406, r353, r408; +} +{ +fma.rn.f16x2 r409, r356, r411, r406; +} +{ +neg.f16x2 r413, r326; +} +mov.b32 r429, {rs37, rs37}; +{ +mul.f16x2 r415, r335, r429; +} +mov.b32 r426, {rs38, rs38}; +{ +mul.f16x2 r418, r338, r426; +} +{ +sub.f16x2 r421, r415, r418; +} +{ +mul.f16x2 r424, r335, r426; +} +{ +fma.rn.f16x2 r427, r338, r429, r424; +} +mov.b32 r445, {rs39, rs39}; +{ +mul.f16x2 r431, r347, r445; +} +mov.b32 r442, {rs40, rs40}; +{ +mul.f16x2 r434, r350, r442; +} +{ +sub.f16x2 r437, r431, r434; +} +{ +mul.f16x2 r440, r347, r442; +} +{ +fma.rn.f16x2 r443, r350, r445, r440; +} +mov.b32 r461, {rs41, rs41}; +{ +mul.f16x2 r447, r359, r461; +} +mov.b32 r458, {rs42, rs42}; +{ +mul.f16x2 r450, r362, r458; +} +{ +sub.f16x2 r453, r447, r450; +} +{ +mul.f16x2 r456, r359, r458; +} +{ +fma.rn.f16x2 r459, r362, r461, r456; +} +{ +add.f16x2 %0, r135, r317; +} +{ +add.f16x2 %1, r138, r320; +} +{ +sub.f16x2 %16, r135, r317; +} +{ +sub.f16x2 %17, r138, r320; +} +{ +add.f16x2 %2, r147, r371; +} +{ +add.f16x2 %3, r150, r377; +} +{ +sub.f16x2 %18, r147, r371; +} +{ +sub.f16x2 %19, r150, r377; +} +{ +add.f16x2 %4, r159, r387; +} +{ +add.f16x2 %5, r162, r393; +} +{ +sub.f16x2 
%20, r159, r387; +} +{ +sub.f16x2 %21, r162, r393; +} +{ +add.f16x2 %6, r171, r403; +} +{ +add.f16x2 %7, r174, r409; +} +{ +sub.f16x2 %22, r171, r403; +} +{ +sub.f16x2 %23, r174, r409; +} +{ +add.f16x2 %8, r141, r413; +} +{ +add.f16x2 %9, r144, r323; +} +{ +sub.f16x2 %24, r141, r413; +} +{ +sub.f16x2 %25, r144, r323; +} +{ +add.f16x2 %10, r153, r421; +} +{ +add.f16x2 %11, r156, r427; +} +{ +sub.f16x2 %26, r153, r421; +} +{ +sub.f16x2 %27, r156, r427; +} +{ +add.f16x2 %12, r165, r437; +} +{ +add.f16x2 %13, r168, r443; +} +{ +sub.f16x2 %28, r165, r437; +} +{ +sub.f16x2 %29, r168, r443; +} +{ +add.f16x2 %14, r177, r453; +} +{ +add.f16x2 %15, r180, r459; +} +{ +sub.f16x2 %30, r177, r453; +} +{ +sub.f16x2 %31, r180, r459; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), 
"r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<974, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<227>; +.reg .b64 rd<2>; +mov.u32 r215, %tid.y; +shl.b32 r216, r215, 6; +mov.u32 r217, %8; +add.s32 r218, r217, r216; +mov.u32 r219, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r220, r219, 3; +shl.b32 r221, r219, 4; +and.b32 r222, r221, -64; +add.s32 r223, r218, r222; +cvt.rn.f32.u32 f11, r220; +mul.f32 f12, f11, 0f3EC90FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; 
+} +barrier.sync 0; +and.b32 r224, r221, 48; +add.s32 r225, r223, r224; +st.shared.v4.f32 [r225], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r226, r220, -12, r225; +ld.shared.u32 r166, [r226]; +ld.shared.u32 r178, [r226+16]; +ld.shared.u32 r167, [r226+32]; +ld.shared.u32 r179, [r226+48]; +barrier.sync 0; +st.shared.v4.f32 [r225], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r226]; +ld.shared.u32 r181, [r226+16]; +ld.shared.u32 r170, [r226+32]; +ld.shared.u32 r182, [r226+48]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 %0, r165, r177; +} +{ +add.f16x2 %1, r168, r180; +} +{ +sub.f16x2 %4, r165, r177; +} +{ +sub.f16x2 %5, r168, r180; +} +{ +add.f16x2 %2, r171, r189; +} +{ +add.f16x2 %3, r174, r183; +} +{ +sub.f16x2 %6, r171, r189; +} +{ +sub.f16x2 %7, r174, r183; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<975, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<227>; +.reg .b64 rd<2>; +mov.u32 r215, %tid.y; +shl.b32 r216, r215, 7; +mov.u32 r217, %8; +add.s32 r218, r217, r216; +mov.u32 r219, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 
r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r220, r219, 3; +shl.b32 r221, r219, 5; +and.b32 r222, r221, -128; +add.s32 r223, r218, r222; +cvt.rn.f32.u32 f11, r220; +mul.f32 f12, f11, 0f3EC90FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r224, r221, 96; +add.s32 r225, r223, r224; +st.shared.v4.f32 [r225], {r27, r30, r61, r70}; +st.shared.v4.f32 [r225+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r226, r220, -24, r225; +ld.shared.u32 r166, [r226]; +ld.shared.u32 r169, [r226+4]; +ld.shared.u32 r178, [r226+32]; +ld.shared.u32 r181, [r226+36]; +ld.shared.u32 r167, [r226+64]; +ld.shared.u32 r170, [r226+68]; +ld.shared.u32 r179, [r226+96]; +ld.shared.u32 r182, [r226+100]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 %0, r165, r177; +} +{ +add.f16x2 %1, r168, r180; +} +{ +sub.f16x2 %4, r165, r177; +} +{ +sub.f16x2 %5, r168, r180; +} +{ +add.f16x2 %2, r171, r189; +} +{ +add.f16x2 %3, r174, r183; +} +{ +sub.f16x2 %6, r171, r189; +} +{ +sub.f16x2 %7, r174, r183; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<976, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<519>; +.reg .b64 rd<2>; +mov.u32 r507, %tid.y; +shl.b32 r508, r507, 6; +mov.u32 r509, %16; +add.s32 r510, r509, r508; +mov.u32 r511, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f12, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f10, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f10; +cvt.rn.f16.f32 high, f10; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r512, r511, 1; +shl.b32 r513, r511, 5; +and.b32 r514, r513, -64; +add.s32 r515, r510, r514; +cvt.rn.f32.u32 f47, r512; +mul.f32 f48, f47, 0f3EC90FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, 
r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} 
+{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r516, r513, 32; +add.s32 r517, r515, r516; +st.shared.v4.f32 [r517], {r149, r207, r244, r281}; +st.shared.v4.f32 [r517+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r518, r512, -28, r517; +ld.shared.u32 r460, [r518]; +ld.shared.u32 r472, [r518+8]; +ld.shared.u32 r484, [r518+16]; +ld.shared.u32 r496, [r518+24]; +ld.shared.u32 r461, [r518+32]; +ld.shared.u32 r473, [r518+40]; +ld.shared.u32 r485, [r518+48]; +ld.shared.u32 r497, [r518+56]; +barrier.sync 0; +st.shared.v4.f32 [r517], {r152, r216, r253, r290}; +st.shared.v4.f32 [r517+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r518]; +ld.shared.u32 r475, [r518+8]; +ld.shared.u32 r487, [r518+16]; +ld.shared.u32 r499, [r518+24]; +ld.shared.u32 r464, [r518+32]; +ld.shared.u32 r476, [r518+40]; +ld.shared.u32 r488, [r518+48]; +ld.shared.u32 r500, [r518+56]; +{ +add.f16x2 %0, r460, r461; +} +{ +add.f16x2 %1, r463, r464; +} +{ +sub.f16x2 %8, r460, r461; +} +{ +sub.f16x2 %9, r463, r464; +} +{ 
+add.f16x2 %2, r472, r473; +} +{ +add.f16x2 %3, r475, r476; +} +{ +sub.f16x2 %10, r472, r473; +} +{ +sub.f16x2 %11, r475, r476; +} +{ +add.f16x2 %4, r484, r485; +} +{ +add.f16x2 %5, r487, r488; +} +{ +sub.f16x2 %12, r484, r485; +} +{ +sub.f16x2 %13, r487, r488; +} +{ +add.f16x2 %6, r496, r497; +} +{ +add.f16x2 %7, r499, r500; +} +{ +sub.f16x2 %14, r496, r497; +} +{ +sub.f16x2 %15, r499, r500; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<977, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<519>; +.reg .b64 rd<2>; +mov.u32 r507, %tid.y; +shl.b32 r508, r507, 7; +mov.u32 r509, %16; +add.s32 r510, r509, r508; +mov.u32 r511, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} 
+{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f12, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f10, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f10; +cvt.rn.f16.f32 high, f10; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, 
r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r512, r511, 1; +shl.b32 r513, r511, 6; +and.b32 r514, r513, -128; +add.s32 r515, r510, r514; +cvt.rn.f32.u32 f47, r512; +mul.f32 f48, f47, 0f3EC90FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; 
+} +barrier.sync 0; +and.b32 r516, r513, 64; +add.s32 r517, r515, r516; +st.shared.v4.f32 [r517], {r149, r152, r207, r216}; +st.shared.v4.f32 [r517+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r517+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r517+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r518, r512, -56, r517; +ld.shared.u32 r460, [r518]; +ld.shared.u32 r463, [r518+4]; +ld.shared.u32 r472, [r518+16]; +ld.shared.u32 r475, [r518+20]; +ld.shared.u32 r484, [r518+32]; +ld.shared.u32 r487, [r518+36]; +ld.shared.u32 r496, [r518+48]; +ld.shared.u32 r499, [r518+52]; +ld.shared.u32 r461, [r518+64]; +ld.shared.u32 r464, [r518+68]; +ld.shared.u32 r473, [r518+80]; +ld.shared.u32 r476, [r518+84]; +ld.shared.u32 r485, [r518+96]; +ld.shared.u32 r488, [r518+100]; +ld.shared.u32 r497, [r518+112]; +ld.shared.u32 r500, [r518+116]; +{ +add.f16x2 %0, r460, r461; +} +{ +add.f16x2 %1, r463, r464; +} +{ +sub.f16x2 %8, r460, r461; +} +{ +sub.f16x2 %9, r463, r464; +} +{ +add.f16x2 %2, r472, r473; +} +{ +add.f16x2 %3, r475, r476; +} +{ +sub.f16x2 %10, r472, r473; +} +{ +sub.f16x2 %11, r475, r476; +} +{ +add.f16x2 %4, r484, r485; +} +{ +add.f16x2 %5, r487, r488; +} +{ +sub.f16x2 %12, r484, r485; +} +{ +sub.f16x2 %13, r487, r488; +} +{ +add.f16x2 %6, r496, r497; +} +{ +add.f16x2 %7, r499, r500; +} +{ +sub.f16x2 %14, r496, r497; +} +{ +sub.f16x2 %15, r499, r500; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<978, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<28>; +.reg .b32 r<197>; +.reg .b64 rd<2>; +mov.u32 r169, %tid.y; +shl.b32 r170, r169, 6; +mov.u32 r171, %4; +add.s32 r172, r171, r170; +mov.u32 r173, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r174, r173, 7; +shl.b32 r175, r173, 3; +and.b32 r176, r175, -64; +add.s32 r177, r172, r176; +cvt.rn.f32.u32 f19, r174; +mul.f32 f20, f19, 0f3EC90FDB; +cos.approx.f32 f1, f20; +sin.approx.f32 f21, f20; +neg.f32 f2, f21; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r178, r175, 56; +add.s32 r179, r177, r178; +st.shared.v2.f32 [r179], {r1, r23}; +barrier.sync 0; +shl.b32 r180, r173, 2; +and.b32 r181, r180, 28; +sub.s32 r182, r179, r181; +ld.shared.u32 r54, [r182]; +ld.shared.u32 r55, [r182+32]; +barrier.sync 0; +st.shared.v2.f32 [r179], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r182]; +ld.shared.u32 r58, [r182+32]; +{ +add.f16x2 r53, r54, r55; +} +{ 
+add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r183, r173, 1, 2; +and.b32 r184, r180, 4; +add.s32 r185, r177, r184; +cvt.rn.f32.u32 f22, r183; +mul.f32 f23, f22, 0f3F490FDB; +cos.approx.f32 f7, f23; +sin.approx.f32 f24, f23; +neg.f32 f8, f24; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r186, r175, 48; +add.s32 r187, r185, r186; +st.shared.u32 [r187], r53; +st.shared.u32 [r187+8], r75; +barrier.sync 0; +and.b32 r188, r180, 24; +sub.s32 r189, r187, r188; +ld.shared.u32 r106, [r189]; +ld.shared.u32 r107, [r189+32]; +barrier.sync 0; +st.shared.u32 [r187], r56; +st.shared.u32 [r187+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r189]; +ld.shared.u32 r110, [r189+32]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r190, r173, 2, 1; +and.b32 r191, r180, 12; +add.s32 r192, r177, r191; +cvt.rn.f32.u32 f25, r190; +mul.f32 f26, f25, 0f3FC90FDB; +cos.approx.f32 f13, f26; +sin.approx.f32 f27, f26; +neg.f32 f14, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r193, r175, 
32; +add.s32 r194, r192, r193; +st.shared.u32 [r194], r105; +st.shared.u32 [r194+16], r127; +barrier.sync 0; +and.b32 r195, r180, 16; +sub.s32 r196, r194, r195; +ld.shared.u32 r158, [r196]; +ld.shared.u32 r159, [r196+32]; +barrier.sync 0; +st.shared.u32 [r194], r108; +st.shared.u32 [r194+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r196]; +ld.shared.u32 r162, [r196+32]; +{ +add.f16x2 %0, r158, r159; +} +{ +add.f16x2 %1, r161, r162; +} +{ +sub.f16x2 %2, r158, r159; +} +{ +sub.f16x2 %3, r161, r162; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<979, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<28>; +.reg .b32 r<197>; +.reg .b64 rd<2>; +mov.u32 r169, %tid.y; +shl.b32 r170, r169, 7; +mov.u32 r171, %4; +add.s32 r172, r171, r170; +mov.u32 r173, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r174, r173, 7; +shl.b32 r175, r173, 4; +and.b32 r176, r175, -128; +add.s32 r177, r172, r176; +cvt.rn.f32.u32 f19, r174; +mul.f32 f20, f19, 0f3EC90FDB; +cos.approx.f32 f1, f20; +sin.approx.f32 f21, f20; +neg.f32 f2, f21; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r178, r175, 112; +add.s32 r179, r177, r178; 
+st.shared.v2.f32 [r179], {r1, r4}; +st.shared.v2.f32 [r179+8], {r23, r32}; +barrier.sync 0; +shl.b32 r180, r173, 3; +and.b32 r181, r180, 56; +sub.s32 r182, r179, r181; +ld.shared.u32 r54, [r182]; +ld.shared.u32 r57, [r182+4]; +ld.shared.u32 r55, [r182+64]; +ld.shared.u32 r58, [r182+68]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r183, r173, 1, 2; +cvt.rn.f32.u32 f22, r183; +mul.f32 f23, f22, 0f3F490FDB; +cos.approx.f32 f7, f23; +sin.approx.f32 f24, f23; +neg.f32 f8, f24; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r184, r180, 8; +add.s32 r185, r177, r184; +barrier.sync 0; +and.b32 r186, r175, 96; +add.s32 r187, r185, r186; +st.shared.u32 [r187], r53; +st.shared.u32 [r187+4], r56; +st.shared.u32 [r187+16], r75; +st.shared.u32 [r187+20], r84; +barrier.sync 0; +and.b32 r188, r180, 48; +sub.s32 r189, r187, r188; +ld.shared.u32 r106, [r189]; +ld.shared.u32 r109, [r189+4]; +ld.shared.u32 r107, [r189+64]; +ld.shared.u32 r110, [r189+68]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r190, r173, 2, 1; +cvt.rn.f32.u32 f25, r190; +mul.f32 f26, f25, 0f3FC90FDB; +cos.approx.f32 f13, f26; +sin.approx.f32 f27, f26; +neg.f32 f14, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 
r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r191, r180, 24; +add.s32 r192, r177, r191; +barrier.sync 0; +and.b32 r193, r175, 64; +add.s32 r194, r192, r193; +st.shared.u32 [r194], r105; +st.shared.u32 [r194+4], r108; +st.shared.u32 [r194+32], r127; +st.shared.u32 [r194+36], r136; +barrier.sync 0; +and.b32 r195, r180, 32; +sub.s32 r196, r194, r195; +ld.shared.u32 r158, [r196]; +ld.shared.u32 r161, [r196+4]; +ld.shared.u32 r159, [r196+64]; +ld.shared.u32 r162, [r196+68]; +{ +add.f16x2 %0, r158, r159; +} +{ +add.f16x2 %1, r161, r162; +} +{ +sub.f16x2 %2, r158, r159; +} +{ +sub.f16x2 %3, r161, r162; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..2d83b8fb252b0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_fwd.hpp.inc @@ -0,0 +1,985 @@ +#ifndef CUFFTDX_FFT_16_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_16_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<25, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<240>; +.reg .b64 rd<2>; +add.f32 f65, %32, %53; +add.f32 f66, %33, %55; +sub.f32 f67, %32, %53; +sub.f32 f68, %33, %55; +add.f32 f69, %42, %64; +add.f32 f70, %44, %65; +sub.f32 f71, %42, %64; 
+sub.f32 f72, %44, %65; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %37, %58; +add.f32 f82, %39, %60; +sub.f32 f83, %37, %58; +sub.f32 f84, %39, %60; +add.f32 f85, %48, %69; +add.f32 f86, %49, %71; +sub.f32 f87, %48, %69; +sub.f32 f88, %49, %71; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %34, %56; +add.f32 f123, %36, %57; +sub.f32 f124, %34, %56; +sub.f32 f125, %36, %57; +add.f32 f126, %45, %66; +add.f32 f127, %47, %68; +sub.f32 f128, %45, %66; +sub.f32 f129, %47, %68; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %40, %61; +add.f32 f139, %41, %63; +sub.f32 f140, %40, %61; +sub.f32 f141, %41, %63; +add.f32 f142, %50, %72; +add.f32 f143, %52, %73; +sub.f32 f144, %50, %72; +sub.f32 f145, %52, %73; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, 
f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 %1, f107, f164; +add.f32 %0, f106, f163; +add.f32 %3, f111, f183; +add.f32 %2, f110, f181; +add.f32 %5, f115, f188; +add.f32 %4, f114, f186; +add.f32 %7, f119, f193; +add.f32 %6, f118, f191; 
+sub.f32 %9, f109, f165; +add.f32 %8, f108, f166; +add.f32 %11, f113, f198; +add.f32 %10, f112, f196; +add.f32 %13, f117, f202; +add.f32 %12, f116, f201; +add.f32 %15, f121, f207; +add.f32 %14, f120, f205; +sub.f32 %17, f107, f164; +sub.f32 %16, f106, f163; +sub.f32 %19, f111, f183; +sub.f32 %18, f110, f181; +sub.f32 %21, f115, f188; +sub.f32 %20, f114, f186; +sub.f32 %23, f119, f193; +sub.f32 %22, f118, f191; +add.f32 %25, f109, f165; +sub.f32 %24, f108, f166; +sub.f32 %27, f113, f198; +sub.f32 %26, f112, f196; +sub.f32 %29, f117, f202; +sub.f32 %28, f116, f201; +sub.f32 %31, f121, f207; +sub.f32 %30, f120, f205; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<26, float, 1>(cufftdx::detail::complex 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<86>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 48; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+16]; +ld.shared.f32 f64, [r13+32]; +ld.shared.f32 f65, [r13+48]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+16]; +ld.shared.f32 f68, [r13+32]; +ld.shared.f32 f69, [r13+48]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; 
+sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 %0, f70, f74; +add.f32 %1, f71, f75; +sub.f32 %3, f73, f76; +add.f32 %2, f72, f77; +sub.f32 %4, f70, f74; +sub.f32 %5, f71, f75; +add.f32 %7, f73, f76; +sub.f32 %6, f72, f77; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<27, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; 
+barrier.sync 0; +and.b32 r11, r7, 96; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+32]; +ld.shared.v2.f32 {f70, f71}, [r13+64]; +ld.shared.v2.f32 {f74, f75}, [r13+96]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +add.f32 %1, f79, f83; +add.f32 %0, f78, f82; +sub.f32 %3, f81, f84; +add.f32 %2, f80, f85; +sub.f32 %5, f79, f83; +sub.f32 %4, f78, f82; +add.f32 %7, f81, f84; +sub.f32 %6, f80, f85; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<28, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<191>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; 
+add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 
f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+8]; +ld.shared.f32 f161, [r13+16]; +ld.shared.f32 f162, [r13+24]; +ld.shared.f32 f163, [r13+32]; +ld.shared.f32 f164, [r13+40]; +ld.shared.f32 f165, [r13+48]; +ld.shared.f32 f166, [r13+56]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+8]; +ld.shared.f32 f169, [r13+16]; +ld.shared.f32 f170, [r13+24]; +ld.shared.f32 f171, [r13+32]; +ld.shared.f32 f172, [r13+40]; +ld.shared.f32 f173, [r13+48]; +ld.shared.f32 f174, [r13+56]; +add.f32 %0, f159, f163; +add.f32 %1, f167, f171; +add.f32 %2, f160, f164; +add.f32 %3, f168, 
f172; +add.f32 %4, f161, f165; +add.f32 %5, f169, f173; +add.f32 %6, f162, f166; +add.f32 %7, f170, f174; +sub.f32 %8, f159, f163; +sub.f32 %9, f167, f171; +sub.f32 %10, f160, f164; +sub.f32 %11, f168, f172; +sub.f32 %12, f161, f165; +sub.f32 %13, f169, f173; +sub.f32 %14, f162, f166; +sub.f32 %15, f170, f174; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<29, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<207>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, 
f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, 
f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 64; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+16]; +ld.shared.v2.f32 {f167, f168}, [r13+32]; +ld.shared.v2.f32 {f171, f172}, [r13+48]; +ld.shared.v2.f32 {f175, f176}, [r13+64]; +ld.shared.v2.f32 {f179, f180}, [r13+80]; +ld.shared.v2.f32 {f183, f184}, [r13+96]; +ld.shared.v2.f32 {f187, f188}, [r13+112]; +add.f32 %1, f160, f176; +add.f32 %0, f159, f175; +add.f32 %3, f164, f180; +add.f32 %2, f163, f179; +add.f32 %5, f168, f184; +add.f32 %4, f167, f183; +add.f32 %7, f172, f188; +add.f32 %6, f171, f187; +sub.f32 %9, f160, f176; +sub.f32 %8, f159, f175; +sub.f32 %11, f164, f180; +sub.f32 %10, f163, f179; +sub.f32 %13, f168, f184; +sub.f32 %12, f167, f183; +sub.f32 %15, f172, f188; +sub.f32 %14, f171, f187; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), 
"=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<30, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<64>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %8, %10; +add.f32 f10, %9, %11; +sub.f32 f11, %8, %10; +sub.f32 f12, %9, %11; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -64; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 56; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 28; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+32]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+32]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 48; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; 
+st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 24; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+32]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+32]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 32; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 16; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+32]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+32]; +add.f32 %0, f56, f57; +add.f32 %1, f58, f59; +sub.f32 %2, f56, f57; +sub.f32 %3, f58, f59; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<31, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<76>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %8, %10; +sub.f32 f10, %9, %11; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, 
f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 112; +add.s32 r11, r8, r10; +add.f32 f18, %9, %11; +add.f32 f19, %8, %10; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 56; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+64]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 96; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 48; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+64]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 64; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 32; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+64]; +add.f32 %1, f65, f69; +add.f32 %0, f64, f68; +sub.f32 %3, f65, f69; +sub.f32 %2, f64, f68; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_16), "l"(lut_sp_2_8), 
"l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..a47e52044c76f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp32_inv.hpp.inc @@ -0,0 +1,985 @@ +#ifndef CUFFTDX_FFT_16_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_16_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<227, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<240>; +.reg .b64 rd<2>; +add.f32 f65, %32, %53; +add.f32 f66, %33, %55; +sub.f32 f67, %32, %53; +sub.f32 f68, %33, %55; +add.f32 f69, %42, %64; +add.f32 f70, %44, %65; +sub.f32 f71, %42, %64; +sub.f32 f72, %44, %65; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %37, %58; +add.f32 f82, %39, %60; +sub.f32 f83, %37, %58; +sub.f32 f84, %39, %60; +add.f32 f85, %48, %69; +add.f32 f86, %49, %71; +sub.f32 f87, %48, %69; +sub.f32 f88, %49, %71; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, 
f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %34, %56; +add.f32 f123, %36, %57; +sub.f32 f124, %34, %56; +sub.f32 f125, %36, %57; +add.f32 f126, %45, %66; +add.f32 f127, %47, %68; +sub.f32 f128, %45, %66; +sub.f32 f129, %47, %68; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %40, %61; +add.f32 f139, %41, %63; +sub.f32 f140, %40, %61; +sub.f32 f141, %41, %63; +add.f32 f142, %50, %72; +add.f32 f143, %52, %73; +sub.f32 f144, %50, %72; +sub.f32 f145, %52, %73; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; 
+fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 %1, f107, f164; +add.f32 %0, f106, f163; +add.f32 %3, f111, f183; +add.f32 %2, f110, f181; +add.f32 %5, f115, f187; +add.f32 %4, f114, f186; +add.f32 %7, f119, f192; +add.f32 %6, f118, f190; +add.f32 %9, f109, f165; +sub.f32 %8, f108, f166; +add.f32 %11, f113, f197; +add.f32 %10, f112, f195; +add.f32 %13, f117, f202; +add.f32 %12, f116, f200; +add.f32 %15, f121, f207; +add.f32 %14, f120, f205; +sub.f32 %17, f107, f164; +sub.f32 %16, f106, f163; +sub.f32 %19, f111, f183; +sub.f32 %18, f110, f181; +sub.f32 %21, f115, f187; +sub.f32 %20, f114, f186; +sub.f32 %23, f119, f192; +sub.f32 %22, f118, f190; +sub.f32 %25, f109, f165; +add.f32 %24, f108, f166; +sub.f32 %27, f113, f197; +sub.f32 %26, f112, f195; +sub.f32 %29, f117, f202; +sub.f32 %28, f116, f200; +sub.f32 %31, f121, f207; +sub.f32 %30, f120, f205; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), 
"=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<228, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<86>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 
f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 48; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+16]; +ld.shared.f32 f64, [r13+32]; +ld.shared.f32 f65, [r13+48]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+16]; +ld.shared.f32 f68, [r13+32]; +ld.shared.f32 f69, [r13+48]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 %0, f70, f74; +add.f32 %1, f71, f75; +add.f32 %3, f73, f76; +sub.f32 %2, f72, f77; +sub.f32 %4, f70, f74; +sub.f32 %5, f71, f75; +sub.f32 %7, f73, f76; +add.f32 %6, f72, f77; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<229, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; 
+add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 96; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+32]; +ld.shared.v2.f32 {f70, f71}, [r13+64]; +ld.shared.v2.f32 {f74, f75}, [r13+96]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +add.f32 %1, f79, f83; +add.f32 %0, f78, f82; +add.f32 %3, f81, f84; +sub.f32 %2, f80, f85; +sub.f32 %5, f79, f83; +sub.f32 %4, f78, f82; +sub.f32 %7, f81, f84; +add.f32 %6, f80, f85; +})" 
+ : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<230, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<191>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; 
+add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, 
f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+8]; +ld.shared.f32 f161, [r13+16]; +ld.shared.f32 f162, [r13+24]; +ld.shared.f32 f163, [r13+32]; +ld.shared.f32 f164, [r13+40]; +ld.shared.f32 f165, [r13+48]; +ld.shared.f32 f166, [r13+56]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+8]; +ld.shared.f32 f169, [r13+16]; +ld.shared.f32 f170, [r13+24]; +ld.shared.f32 f171, [r13+32]; +ld.shared.f32 f172, [r13+40]; +ld.shared.f32 f173, [r13+48]; +ld.shared.f32 f174, [r13+56]; +add.f32 %0, f159, f163; +add.f32 %1, f167, f171; +add.f32 %2, f160, f164; +add.f32 %3, f168, f172; +add.f32 %4, f161, f165; +add.f32 %5, f169, f173; +add.f32 %6, f162, f166; +add.f32 %7, f170, f174; +sub.f32 %8, f159, f163; +sub.f32 %9, f167, f171; +sub.f32 %10, f160, f164; +sub.f32 %11, f168, f172; +sub.f32 %12, f161, f165; +sub.f32 %13, f169, f173; +sub.f32 %14, f162, f166; +sub.f32 %15, f170, f174; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<231, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<207>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, 
f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 64; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 
[r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+16]; +ld.shared.v2.f32 {f167, f168}, [r13+32]; +ld.shared.v2.f32 {f171, f172}, [r13+48]; +ld.shared.v2.f32 {f175, f176}, [r13+64]; +ld.shared.v2.f32 {f179, f180}, [r13+80]; +ld.shared.v2.f32 {f183, f184}, [r13+96]; +ld.shared.v2.f32 {f187, f188}, [r13+112]; +add.f32 %1, f160, f176; +add.f32 %0, f159, f175; +add.f32 %3, f164, f180; +add.f32 %2, f163, f179; +add.f32 %5, f168, f184; +add.f32 %4, f167, f183; +add.f32 %7, f172, f188; +add.f32 %6, f171, f187; +sub.f32 %9, f160, f176; +sub.f32 %8, f159, f175; +sub.f32 %11, f164, f180; +sub.f32 %10, f163, f179; +sub.f32 %13, f168, f184; +sub.f32 %12, f167, f183; +sub.f32 %15, f172, f188; +sub.f32 %14, f171, f187; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<232, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<64>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %8, %10; +add.f32 f10, %9, %11; +sub.f32 f11, %8, %10; +sub.f32 f12, %9, %11; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, 
f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -64; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 56; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 28; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+32]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+32]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 48; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 24; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+32]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+32]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 32; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 16; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+32]; +barrier.sync 0; 
+st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+32]; +add.f32 %0, f56, f57; +add.f32 %1, f58, f59; +sub.f32 %2, f56, f57; +sub.f32 %3, f58, f59; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<233, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<76>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %8, %10; +sub.f32 f10, %9, %11; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 112; +add.s32 r11, r8, r10; +add.f32 f18, %9, %11; +add.f32 f19, %8, %10; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 56; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+64]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 96; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; 
+barrier.sync 0; +and.b32 r19, r9, 48; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+64]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 64; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 32; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+64]; +add.f32 %1, f65, f69; +add.f32 %0, f64, f68; +sub.f32 %3, f65, f69; +sub.f32 %2, f64, f68; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..4cc13ce572992 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_fwd.hpp.inc @@ -0,0 +1,981 @@ +#ifndef CUFFTDX_FFT_16_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_16_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<429, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<240>; +.reg .b64 rd<2>; +add.f64 fd65, %32, %53; +add.f64 fd66, %33, %55; +sub.f64 fd67, %32, %53; +sub.f64 fd68, %33, 
%55; +add.f64 fd69, %42, %64; +add.f64 fd70, %44, %65; +sub.f64 fd71, %42, %64; +sub.f64 fd72, %44, %65; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %37, %58; +add.f64 fd82, %39, %60; +sub.f64 fd83, %37, %58; +sub.f64 fd84, %39, %60; +add.f64 fd85, %48, %69; +add.f64 fd86, %49, %71; +sub.f64 fd87, %48, %69; +sub.f64 fd88, %49, %71; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %34, %56; +add.f64 fd123, %36, %57; +sub.f64 fd124, %34, %56; +sub.f64 fd125, %36, %57; +add.f64 fd126, %45, %66; +add.f64 fd127, %47, %68; +sub.f64 fd128, %45, %66; +sub.f64 fd129, %47, %68; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 
fd138, %40, %61; +add.f64 fd139, %41, %63; +sub.f64 fd140, %40, %61; +sub.f64 fd141, %41, %63; +add.f64 fd142, %50, %72; +add.f64 fd143, %52, %73; +sub.f64 fd144, %50, %72; +sub.f64 fd145, %52, %73; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; 
+sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 %1, fd107, fd164; +add.f64 %0, fd106, fd163; +add.f64 %3, fd111, fd183; +add.f64 %2, fd110, fd181; +add.f64 %5, fd115, fd188; +add.f64 %4, fd114, fd186; +add.f64 %7, fd119, fd193; +add.f64 %6, fd118, fd191; +sub.f64 %9, fd109, fd165; +add.f64 %8, fd108, fd166; +add.f64 %11, fd113, fd198; +add.f64 %10, fd112, fd196; +add.f64 %13, fd117, fd202; +add.f64 %12, fd116, fd201; +add.f64 %15, fd121, fd207; +add.f64 %14, fd120, fd205; +sub.f64 %17, fd107, fd164; +sub.f64 %16, fd106, fd163; +sub.f64 %19, fd111, fd183; +sub.f64 %18, fd110, fd181; +sub.f64 %21, fd115, fd188; +sub.f64 %20, fd114, fd186; +sub.f64 %23, fd119, fd193; +sub.f64 %22, fd118, fd191; +add.f64 %25, fd109, fd165; +sub.f64 %24, fd108, fd166; +sub.f64 %27, fd113, fd198; +sub.f64 %26, fd112, fd196; +sub.f64 %29, fd117, fd202; +sub.f64 %28, fd116, fd201; +sub.f64 %31, fd121, fd207; +sub.f64 %30, fd120, fd205; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), 
"d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<430, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<93>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+64]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 
r11, r7, 192; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+64]; +ld.shared.v2.f64 {fd69, fd70}, [r13+128]; +ld.shared.v2.f64 {fd73, fd74}, [r13+192]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +add.f64 %1, fd78, fd82; +add.f64 %0, fd77, fd81; +sub.f64 %3, fd80, fd83; +add.f64 %2, fd79, fd84; +sub.f64 %5, fd78, fd82; +sub.f64 %4, fd77, fd81; +add.f64 %7, fd80, fd83; +sub.f64 %6, fd79, fd84; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<431, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<85>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 
fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+64]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 96; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+32]; +ld.shared.f64 fd63, [r13+64]; +ld.shared.f64 fd64, [r13+96]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+32]; +ld.shared.f64 fd67, [r13+64]; +ld.shared.f64 fd68, [r13+96]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 %0, fd69, fd73; +add.f64 %1, fd70, fd74; +sub.f64 %3, fd72, fd75; +add.f64 %2, fd71, fd76; +sub.f64 %4, fd69, fd73; +sub.f64 %5, fd70, fd74; +add.f64 %7, fd72, fd75; +sub.f64 %6, fd71, fd76; +})" + : "=d"(rmem[0].x), 
"=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<432, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<76>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %8, %10; +sub.f64 fd10, %9, %11; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 224; +add.s32 r11, r8, r10; +add.f64 fd18, %9, %11; +add.f64 fd19, %8, %10; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 112; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+128]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 192; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 96; +sub.s32 r20, 
r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+128]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 128; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 64; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+128]; +add.f64 %1, fd65, fd69; +add.f64 %0, fd64, fd68; +sub.f64 %3, fd65, fd69; +sub.f64 %2, fd64, fd68; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<434, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<206>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 
fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 
fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+32]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 128; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+32]; +ld.shared.v2.f64 {fd166, fd167}, [r13+64]; +ld.shared.v2.f64 {fd170, fd171}, [r13+96]; +ld.shared.v2.f64 {fd174, fd175}, [r13+128]; +ld.shared.v2.f64 {fd178, fd179}, [r13+160]; +ld.shared.v2.f64 {fd182, fd183}, [r13+192]; +ld.shared.v2.f64 {fd186, fd187}, [r13+224]; +add.f64 %1, 
fd159, fd175; +add.f64 %0, fd158, fd174; +add.f64 %3, fd163, fd179; +add.f64 %2, fd162, fd178; +add.f64 %5, fd167, fd183; +add.f64 %4, fd166, fd182; +add.f64 %7, fd171, fd187; +add.f64 %6, fd170, fd186; +sub.f64 %9, fd159, fd175; +sub.f64 %8, fd158, fd174; +sub.f64 %11, fd163, fd179; +sub.f64 %10, fd162, fd178; +sub.f64 %13, fd167, fd183; +sub.f64 %12, fd166, fd182; +sub.f64 %15, fd171, fd187; +sub.f64 %14, fd170, fd186; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<433, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<190>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; 
+add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, 
fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+32]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 64; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+16]; +ld.shared.f64 fd160, [r13+32]; +ld.shared.f64 fd161, [r13+48]; +ld.shared.f64 fd162, [r13+64]; +ld.shared.f64 fd163, [r13+80]; +ld.shared.f64 fd164, [r13+96]; +ld.shared.f64 fd165, [r13+112]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+16]; +ld.shared.f64 fd168, [r13+32]; +ld.shared.f64 fd169, [r13+48]; 
+ld.shared.f64 fd170, [r13+64]; +ld.shared.f64 fd171, [r13+80]; +ld.shared.f64 fd172, [r13+96]; +ld.shared.f64 fd173, [r13+112]; +add.f64 %0, fd158, fd162; +add.f64 %1, fd166, fd170; +add.f64 %2, fd159, fd163; +add.f64 %3, fd167, fd171; +add.f64 %4, fd160, fd164; +add.f64 %5, fd168, fd172; +add.f64 %6, fd161, fd165; +add.f64 %7, fd169, fd173; +sub.f64 %8, fd158, fd162; +sub.f64 %9, fd166, fd170; +sub.f64 %10, fd159, fd163; +sub.f64 %11, fd167, fd171; +sub.f64 %12, fd160, fd164; +sub.f64 %13, fd168, fd172; +sub.f64 %14, fd161, fd165; +sub.f64 %15, fd169, fd173; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<435, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<64>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %8, %10; +add.f64 fd10, %9, %11; +sub.f64 fd11, %8, %10; +sub.f64 fd12, %9, %11; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 112; +add.s32 
r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 56; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+64]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+64]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 96; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 48; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+64]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+64]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 64; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 32; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+64]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+64]; +add.f64 %0, 
fd56, fd57; +add.f64 %1, fd58, fd59; +sub.f64 %2, fd56, fd57; +sub.f64 %3, fd58, fd59; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..7ef5b8e2f9dc0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_16_fp64_inv.hpp.inc @@ -0,0 +1,981 @@ +#ifndef CUFFTDX_FFT_16_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_16_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<600, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<240>; +.reg .b64 rd<2>; +add.f64 fd65, %32, %53; +add.f64 fd66, %33, %55; +sub.f64 fd67, %32, %53; +sub.f64 fd68, %33, %55; +add.f64 fd69, %42, %64; +add.f64 fd70, %44, %65; +sub.f64 fd71, %42, %64; +sub.f64 fd72, %44, %65; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %37, %58; +add.f64 fd82, %39, %60; +sub.f64 fd83, %37, %58; +sub.f64 fd84, %39, %60; +add.f64 fd85, %48, %69; +add.f64 fd86, %49, %71; +sub.f64 fd87, %48, %69; +sub.f64 fd88, %49, %71; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 
fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %34, %56; +add.f64 fd123, %36, %57; +sub.f64 fd124, %34, %56; +sub.f64 fd125, %36, %57; +add.f64 fd126, %45, %66; +add.f64 fd127, %47, %68; +sub.f64 fd128, %45, %66; +sub.f64 fd129, %47, %68; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %61; +add.f64 fd139, %41, %63; +sub.f64 fd140, %40, %61; +sub.f64 fd141, %41, %63; +add.f64 fd142, %50, %72; +add.f64 fd143, %52, %73; +sub.f64 fd144, %50, %72; +sub.f64 fd145, %52, %73; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; 
+add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 %1, fd107, fd164; +add.f64 %0, fd106, fd163; +add.f64 %3, fd111, fd183; +add.f64 %2, fd110, fd181; +add.f64 %5, fd115, fd187; +add.f64 %4, fd114, fd186; +add.f64 %7, fd119, fd192; +add.f64 %6, fd118, fd190; +add.f64 %9, fd109, fd165; +sub.f64 %8, fd108, fd166; +add.f64 %11, fd113, fd197; +add.f64 %10, fd112, fd195; +add.f64 %13, 
fd117, fd202; +add.f64 %12, fd116, fd200; +add.f64 %15, fd121, fd207; +add.f64 %14, fd120, fd205; +sub.f64 %17, fd107, fd164; +sub.f64 %16, fd106, fd163; +sub.f64 %19, fd111, fd183; +sub.f64 %18, fd110, fd181; +sub.f64 %21, fd115, fd187; +sub.f64 %20, fd114, fd186; +sub.f64 %23, fd119, fd192; +sub.f64 %22, fd118, fd190; +sub.f64 %25, fd109, fd165; +add.f64 %24, fd108, fd166; +sub.f64 %27, fd113, fd197; +sub.f64 %26, fd112, fd195; +sub.f64 %29, fd117, fd202; +sub.f64 %28, fd116, fd200; +sub.f64 %31, fd121, fd207; +sub.f64 %30, fd120, fd205; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<601, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 
fd<93>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+64]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 192; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+64]; +ld.shared.v2.f64 {fd69, fd70}, [r13+128]; +ld.shared.v2.f64 {fd73, fd74}, [r13+192]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; 
+sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +add.f64 %1, fd78, fd82; +add.f64 %0, fd77, fd81; +add.f64 %3, fd80, fd83; +sub.f64 %2, fd79, fd84; +sub.f64 %5, fd78, fd82; +sub.f64 %4, fd77, fd81; +sub.f64 %7, fd80, fd83; +add.f64 %6, fd79, fd84; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<602, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<85>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+64]; +mul.f64 fd56, fd32, fd53; 
+fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 96; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+32]; +ld.shared.f64 fd63, [r13+64]; +ld.shared.f64 fd64, [r13+96]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+32]; +ld.shared.f64 fd67, [r13+64]; +ld.shared.f64 fd68, [r13+96]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 %0, fd69, fd73; +add.f64 %1, fd70, fd74; +add.f64 %3, fd72, fd75; +sub.f64 %2, fd71, fd76; +sub.f64 %4, fd69, fd73; +sub.f64 %5, fd70, fd74; +sub.f64 %7, fd72, fd75; +add.f64 %6, fd71, fd76; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<603, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<76>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %8, %10; +sub.f64 fd10, %9, %11; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %5; 
+add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 224; +add.s32 r11, r8, r10; +add.f64 fd18, %9, %11; +add.f64 fd19, %8, %10; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 112; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+128]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 192; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 96; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+128]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 128; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 64; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+128]; +add.f64 %1, 
fd65, fd69; +add.f64 %0, fd64, fd68; +sub.f64 %3, fd65, fd69; +sub.f64 %2, fd64, fd68; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<605, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<206>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; 
+add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+32]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 128; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; 
+sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+32]; +ld.shared.v2.f64 {fd166, fd167}, [r13+64]; +ld.shared.v2.f64 {fd170, fd171}, [r13+96]; +ld.shared.v2.f64 {fd174, fd175}, [r13+128]; +ld.shared.v2.f64 {fd178, fd179}, [r13+160]; +ld.shared.v2.f64 {fd182, fd183}, [r13+192]; +ld.shared.v2.f64 {fd186, fd187}, [r13+224]; +add.f64 %1, fd159, fd175; +add.f64 %0, fd158, fd174; +add.f64 %3, fd163, fd179; +add.f64 %2, fd162, fd178; +add.f64 %5, fd167, fd183; +add.f64 %4, fd166, fd182; +add.f64 %7, fd171, fd187; +add.f64 %6, fd170, fd186; +sub.f64 %9, fd159, fd175; +sub.f64 %8, fd158, fd174; +sub.f64 %11, fd163, fd179; +sub.f64 %10, fd162, fd178; +sub.f64 %13, fd167, fd183; +sub.f64 %12, fd166, fd182; +sub.f64 %15, fd171, fd187; +sub.f64 %14, fd170, fd186; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), 
"d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<604, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<190>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 
fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+32]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; 
+mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 64; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+16]; +ld.shared.f64 fd160, [r13+32]; +ld.shared.f64 fd161, [r13+48]; +ld.shared.f64 fd162, [r13+64]; +ld.shared.f64 fd163, [r13+80]; +ld.shared.f64 fd164, [r13+96]; +ld.shared.f64 fd165, [r13+112]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+16]; +ld.shared.f64 fd168, [r13+32]; +ld.shared.f64 fd169, [r13+48]; +ld.shared.f64 fd170, [r13+64]; +ld.shared.f64 fd171, [r13+80]; +ld.shared.f64 fd172, [r13+96]; +ld.shared.f64 fd173, [r13+112]; +add.f64 %0, fd158, fd162; +add.f64 %1, fd166, fd170; +add.f64 %2, fd159, fd163; +add.f64 %3, fd167, fd171; +add.f64 %4, fd160, fd164; +add.f64 %5, fd168, fd172; +add.f64 %6, fd161, fd165; +add.f64 %7, fd169, fd173; +sub.f64 %8, fd158, fd162; +sub.f64 %9, fd166, fd170; +sub.f64 %10, fd159, fd163; +sub.f64 %11, fd167, fd171; +sub.f64 %12, fd160, fd164; +sub.f64 %13, fd168, fd172; +sub.f64 %14, fd161, fd165; +sub.f64 %15, fd169, fd173; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), 
"=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<606, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<64>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %8, %10; +add.f64 fd10, %9, %11; +sub.f64 fd11, %8, %10; +sub.f64 fd12, %9, %11; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 112; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 56; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+64]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+64]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 2; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 96; +add.s32 r18, r16, r17; 
+st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 48; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+64]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+64]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 64; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 32; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+64]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+64]; +add.f64 %0, fd56, fd57; +add.f64 %1, fd58, fd59; +sub.f64 %2, fd56, fd57; +sub.f64 %3, fd58, fd59; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..89020adb1ff1d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_fwd.hpp.inc @@ -0,0 +1,7152 @@ +#ifndef 
CUFFTDX_FFT_1728_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_1728_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<946, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<359>; +.reg .b32 r<2844>; +.reg .b64 rd<7>; +mov.u32 r2825, %tid.y; +shl.b32 r2826, r2825, 1; +mov.u32 r2827, %24; +mad.lo.s32 r2828, r2826, 6912, r2827; +mov.u32 r2829, %tid.x; +mov.f32 f328, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1, {low, high}; +} +mov.f32 f326, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %33, %41; +} +{ +add.f16x2 r8, %25, r5; +} +{ +add.f16x2 r11, %34, %42; +} +{ +add.f16x2 r14, %26, r11; +} +{ +add.f16x2 r17, %33, %41; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %25, r20; +} +{ +sub.f16x2 r26, %34, %42; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %33, %41; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %25, r38; +} +{ +sub.f16x2 r44, %34, %42; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %34, %42; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %26, r56; +} +{ +sub.f16x2 r62, %33, %41; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %34, %42; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %26, r74; +} +{ +sub.f16x2 r80, %33, %41; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %37, %45; +} +{ +add.f16x2 r96, %29, r93; +} +{ +add.f16x2 r99, %38, %46; +} +{ +add.f16x2 r102, %30, r99; +} +{ 
+add.f16x2 r105, %37, %45; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %29, r108; +} +{ +sub.f16x2 r114, %38, %46; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %37, %45; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %29, r126; +} +{ +sub.f16x2 r132, %38, %46; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %38, %46; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %30, r144; +} +{ +sub.f16x2 r150, %37, %45; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %38, %46; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %30, r162; +} +{ +sub.f16x2 r168, %37, %45; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f314, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r180, {low, high}; +} +mov.f32 f249, 0fBF800000; +mov.f32 f310, 0f3F5DB3D7; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, 
r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r256, {low, high}; +} +{ +neg.f16x2 r257, r256; +} +{ +add.f16x2 r259, %35, %43; +} +{ +add.f16x2 r262, %27, r259; +} +{ +add.f16x2 r265, %36, %44; +} +{ +add.f16x2 r268, %28, r265; +} +{ +add.f16x2 r271, %35, %43; +} +{ +mul.f16x2 r274, r271, r255; +} +{ +add.f16x2 r277, %27, r274; +} +{ +sub.f16x2 r280, %36, %44; +} +{ +mul.f16x2 r283, r280, r257; +} +{ +add.f16x2 r286, r277, r283; +} +{ +add.f16x2 r289, %35, %43; +} +{ +mul.f16x2 r292, r289, r255; +} +{ +add.f16x2 r295, %27, r292; +} +{ +sub.f16x2 r298, %36, %44; +} +{ +mul.f16x2 r301, r298, r257; +} +{ +sub.f16x2 r304, r295, r301; +} +{ +add.f16x2 r307, %36, %44; +} +{ +mul.f16x2 r310, r307, r255; +} +{ +add.f16x2 r313, %28, r310; +} +{ +sub.f16x2 r316, %35, %43; +} +{ +mul.f16x2 r319, r316, r257; +} +{ +sub.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %36, %44; +} +{ +mul.f16x2 r328, r325, r255; +} +{ +add.f16x2 r331, %28, r328; +} +{ +sub.f16x2 r334, %35, %43; +} +{ +mul.f16x2 r337, r334, r257; +} +{ +add.f16x2 r340, r331, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r344, {low, high}; +} +{ +neg.f16x2 r345, r344; +} +{ +add.f16x2 r347, %39, %47; +} +{ +add.f16x2 r350, %31, r347; +} +{ +add.f16x2 r353, %40, %48; +} +{ +add.f16x2 r356, %32, r353; +} +{ +add.f16x2 r359, %39, %47; +} +{ +mul.f16x2 r362, r359, r343; +} +{ +add.f16x2 r365, %31, r362; +} +{ +sub.f16x2 r368, %40, %48; +} +{ +mul.f16x2 r371, r368, r345; +} +{ +add.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, %39, %47; +} +{ +mul.f16x2 r380, r377, r343; +} +{ +add.f16x2 r383, %31, 
r380; +} +{ +sub.f16x2 r386, %40, %48; +} +{ +mul.f16x2 r389, r386, r345; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %40, %48; +} +{ +mul.f16x2 r398, r395, r343; +} +{ +add.f16x2 r401, %32, r398; +} +{ +sub.f16x2 r404, %39, %47; +} +{ +mul.f16x2 r407, r404, r345; +} +{ +sub.f16x2 r410, r401, r407; +} +{ +add.f16x2 r413, %40, %48; +} +{ +mul.f16x2 r416, r413, r343; +} +{ +add.f16x2 r419, %32, r416; +} +{ +sub.f16x2 r422, %39, %47; +} +{ +mul.f16x2 r425, r422, r345; +} +{ +add.f16x2 r428, r419, r425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r434, {low, high}; +} +{ +mul.f16x2 r441, r374, r431; +} +{ +mul.f16x2 r444, r410, r432; +} +{ +sub.f16x2 r447, r441, r444; +} +{ +mul.f16x2 r450, r374, r432; +} +{ +fma.rn.f16x2 r453, r410, r431, r450; +} +{ +mul.f16x2 r457, r392, r433; +} +{ +mul.f16x2 r460, r428, r434; +} +{ +sub.f16x2 r463, r457, r460; +} +{ +mul.f16x2 r466, r392, r434; +} +{ +fma.rn.f16x2 r469, r428, r433, r466; +} +{ +add.f16x2 r473, r262, r350; +} +{ +add.f16x2 r476, r268, r356; +} +{ +sub.f16x2 r479, r262, r350; +} +{ +sub.f16x2 r482, r268, r356; +} +{ +add.f16x2 r485, r286, r447; +} +{ +add.f16x2 r488, r322, r453; +} +{ +sub.f16x2 r491, r286, r447; +} +{ +sub.f16x2 r494, r322, r453; +} +{ +add.f16x2 r497, r304, r463; +} +{ +add.f16x2 r500, r340, r469; +} +{ +sub.f16x2 r503, r304, r463; +} +{ +sub.f16x2 r506, r340, r469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r510, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r512, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r516, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r517, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r518, {low, high}; +} +mov.f32 f250, 0f3F800000; +{ +mul.f16x2 r531, r485, r509; +} +{ +mul.f16x2 r534, r488, r510; +} +{ +sub.f16x2 r537, r531, r534; +} +{ +mul.f16x2 r540, r485, r510; +} +{ +fma.rn.f16x2 r543, r488, r509, r540; +} +{ +mul.f16x2 r547, r497, r511; +} +{ +mul.f16x2 r550, r500, r512; +} +{ +sub.f16x2 r553, r547, r550; +} +{ +mul.f16x2 r556, r497, r512; +} +{ +fma.rn.f16x2 r559, r500, r511, r556; +} +{ +neg.f16x2 r563, r479; +} +{ +mul.f16x2 r565, r491, r515; +} +{ +mul.f16x2 r568, r494, r516; +} +{ +sub.f16x2 r571, r565, r568; +} +{ +mul.f16x2 r574, r491, r516; +} +{ +fma.rn.f16x2 r577, r494, r515, r574; +} +{ +mul.f16x2 r581, r503, r517; +} +{ +mul.f16x2 r584, r506, r518; +} +{ +sub.f16x2 r587, r581, r584; +} +{ +mul.f16x2 r590, r503, r518; +} +{ +fma.rn.f16x2 r593, r506, r517, r590; +} +{ +add.f16x2 r597, r219, r473; +} +{ +add.f16x2 r600, r222, r476; +} +{ +sub.f16x2 r603, r219, r473; +} +{ +sub.f16x2 r606, r222, r476; +} +{ +add.f16x2 r609, r231, r537; +} +{ +add.f16x2 r612, r234, r543; +} +{ +sub.f16x2 r615, r231, r537; +} +{ +sub.f16x2 r618, r234, r543; +} +{ +add.f16x2 r621, r243, r553; +} +{ +add.f16x2 r624, r246, r559; +} +{ +sub.f16x2 r627, r243, r553; +} +{ +sub.f16x2 r630, r246, r559; +} +{ +add.f16x2 r633, r225, r482; +} +{ +add.f16x2 r636, r228, r563; +} +{ +sub.f16x2 r639, r225, r482; +} +{ +sub.f16x2 r642, r228, r563; +} 
+{ +add.f16x2 r645, r237, r571; +} +{ +add.f16x2 r648, r240, r577; +} +{ +sub.f16x2 r651, r237, r571; +} +{ +sub.f16x2 r654, r240, r577; +} +{ +add.f16x2 r657, r249, r587; +} +{ +add.f16x2 r660, r252, r593; +} +{ +sub.f16x2 r663, r249, r587; +} +{ +sub.f16x2 r666, r252, r593; +} +mul.wide.u32 rd2, r2829, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r2830, rd3; +mul.lo.s32 r2831, r2830, 144; +sub.s32 r2832, r2829, r2831; +shr.u64 rd4, rd2, 36; +cvt.u32.u64 r2833, rd4; +and.b32 r2834, r2833, 268435454; +mad.lo.s32 r2835, r2834, 6912, r2828; +cvt.rn.f32.u32 f353, r2832; +mul.f32 f354, f353, 0f3B6E4BAE; +cos.approx.f32 f101, f354; +sin.approx.f32 f355, f354; +neg.f32 f102, f355; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r669, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r674, {high, high}; +} +{ +mul.f16x2 r676, r612, r674; +} +{ +neg.f16x2 r679, r676; +} +{ +fma.rn.f16x2 r681, r609, r672, r679; +} +{ +mul.f16x2 r685, r609, r674; +} +{ +fma.rn.f16x2 r688, r612, r672, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r694, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r696, {low, high}; +} +{ +mul.f16x2 r697, r694, r696; +} +{ +mul.f16x2 r700, r669, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r703, {high, low}; +} +{ +fma.rn.f16x2 r705, r697, r703, r700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r711, {high, high}; +} +{ +mul.f16x2 r713, r624, r711; +} +{ +neg.f16x2 r716, r713; +} +{ +fma.rn.f16x2 r718, r621, r709, r716; +} +{ +mul.f16x2 r722, r621, r711; +} +{ +fma.rn.f16x2 r725, r624, r709, r722; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r669; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r731, r733; +} +{ +mul.f16x2 r737, r705, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r740, {high, low}; +} +{ +fma.rn.f16x2 r742, r734, r740, r737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r748, {high, high}; +} +{ +mul.f16x2 r750, r636, r748; +} +{ +neg.f16x2 r753, r750; +} +{ +fma.rn.f16x2 r755, r633, r746, r753; +} +{ +mul.f16x2 r759, r633, r748; +} +{ +fma.rn.f16x2 r762, r636, r746, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r770, {low, high}; +} +{ +mul.f16x2 r771, r768, r770; +} +{ +mul.f16x2 r774, r742, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r777, {high, low}; +} +{ +fma.rn.f16x2 r779, r771, r777, r774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r785, {high, high}; +} +{ +mul.f16x2 r787, r648, r785; +} +{ +neg.f16x2 r790, r787; +} +{ +fma.rn.f16x2 r792, r645, r783, r790; +} +{ +mul.f16x2 r796, r645, r785; +} +{ +fma.rn.f16x2 r799, r648, r783, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r805, r807; +} +{ +mul.f16x2 r811, r779, r803; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r814, {high, low}; +} +{ +fma.rn.f16x2 r816, r808, r814, r811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r822, {high, high}; +} +{ +mul.f16x2 r824, r660, r822; +} +{ +neg.f16x2 r827, r824; +} +{ +fma.rn.f16x2 r829, r657, r820, r827; +} +{ +mul.f16x2 r833, r657, r822; +} +{ +fma.rn.f16x2 r836, r660, r820, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r844, {low, high}; +} +{ +mul.f16x2 r845, r842, r844; +} +{ +mul.f16x2 r848, r816, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r851, {high, low}; +} +{ +fma.rn.f16x2 r853, r845, r851, r848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r859, {high, high}; +} +{ +mul.f16x2 r861, r606, r859; +} +{ +neg.f16x2 r864, r861; +} +{ +fma.rn.f16x2 r866, r603, r857, r864; +} +{ +mul.f16x2 r870, r603, r859; +} +{ +fma.rn.f16x2 r873, r606, r857, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r881, {low, high}; +} +{ +mul.f16x2 r882, r879, r881; +} +{ +mul.f16x2 r885, r853, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r888, {high, low}; +} +{ +fma.rn.f16x2 r890, r882, r888, r885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r618, r896; +} +{ 
+neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r615, r894, r901; +} +{ +mul.f16x2 r907, r615, r896; +} +{ +fma.rn.f16x2 r910, r618, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r890, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r630, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r627, r931, r938; +} +{ +mul.f16x2 r944, r627, r933; +} +{ +fma.rn.f16x2 r947, r630, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r642, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r639, r968, r975; +} +{ +mul.f16x2 r981, r639, r970; +} +{ +fma.rn.f16x2 r984, r642, r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r654, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r651, r1005, r1012; +} +{ +mul.f16x2 r1018, r651, r1007; +} +{ +fma.rn.f16x2 r1021, r654, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r666, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r663, r1042, r1049; +} +{ +mul.f16x2 r1055, r663, r1044; +} +{ +fma.rn.f16x2 r1058, r666, r1042, r1055; +} +barrier.sync 0; +mad.lo.s32 r2836, r2832, 96, r2835; +st.shared.v4.f32 [r2836], {r597, r600, r681, r688}; +st.shared.v4.f32 [r2836+16], {r718, r725, r755, r762}; +st.shared.v4.f32 [r2836+32], {r792, r799, r829, r836}; +st.shared.v4.f32 [r2836+48], {r866, r873, r903, r910}; +st.shared.v4.f32 [r2836+64], {r940, r947, r977, r984}; +st.shared.v4.f32 [r2836+80], {r1014, r1021, r1051, r1058}; +barrier.sync 0; +mad.lo.s32 r2837, r2832, -88, r2836; +ld.shared.u32 r1087, 
[r2837]; +ld.shared.u32 r1093, [r2837+4]; +ld.shared.u32 r1341, [r2837+1152]; +ld.shared.u32 r1347, [r2837+1156]; +ld.shared.u32 r1175, [r2837+2304]; +ld.shared.u32 r1181, [r2837+2308]; +ld.shared.u32 r1429, [r2837+3456]; +ld.shared.u32 r1435, [r2837+3460]; +ld.shared.u32 r1084, [r2837+4608]; +ld.shared.u32 r1090, [r2837+4612]; +ld.shared.u32 r1338, [r2837+5760]; +ld.shared.u32 r1344, [r2837+5764]; +ld.shared.u32 r1172, [r2837+6912]; +ld.shared.u32 r1178, [r2837+6916]; +ld.shared.u32 r1426, [r2837+8064]; +ld.shared.u32 r1432, [r2837+8068]; +ld.shared.u32 r1085, [r2837+9216]; +ld.shared.u32 r1091, [r2837+9220]; +ld.shared.u32 r1339, [r2837+10368]; +ld.shared.u32 r1345, [r2837+10372]; +ld.shared.u32 r1173, [r2837+11520]; +ld.shared.u32 r1179, [r2837+11524]; +ld.shared.u32 r1427, [r2837+12672]; +ld.shared.u32 r1433, [r2837+12676]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1080, {low, high}; +} +{ +neg.f16x2 r1081, r1080; +} +{ +add.f16x2 r1083, r1084, r1085; +} +{ +add.f16x2 r1086, r1087, r1083; +} +{ +add.f16x2 r1089, r1090, r1091; +} +{ +add.f16x2 r1092, r1093, r1089; +} +{ +add.f16x2 r1095, r1084, r1085; +} +{ +mul.f16x2 r1098, r1095, r1079; +} +{ +add.f16x2 r1101, r1087, r1098; +} +{ +sub.f16x2 r1104, r1090, r1091; +} +{ +mul.f16x2 r1107, r1104, r1081; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r1084, r1085; +} +{ +mul.f16x2 r1116, r1113, r1079; +} +{ +add.f16x2 r1119, r1087, r1116; +} +{ +sub.f16x2 r1122, r1090, r1091; +} +{ +mul.f16x2 r1125, r1122, r1081; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r1090, r1091; +} +{ +mul.f16x2 r1134, r1131, r1079; +} +{ +add.f16x2 r1137, r1093, r1134; +} +{ +sub.f16x2 r1140, r1084, r1085; +} +{ +mul.f16x2 r1143, r1140, r1081; +} +{ +sub.f16x2 r1146, r1137, r1143; +} +{ +add.f16x2 r1149, r1090, r1091; +} +{ +mul.f16x2 r1152, r1149, 
r1079; +} +{ +add.f16x2 r1155, r1093, r1152; +} +{ +sub.f16x2 r1158, r1084, r1085; +} +{ +mul.f16x2 r1161, r1158, r1081; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1167, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1168, {low, high}; +} +{ +neg.f16x2 r1169, r1168; +} +{ +add.f16x2 r1171, r1172, r1173; +} +{ +add.f16x2 r1174, r1175, r1171; +} +{ +add.f16x2 r1177, r1178, r1179; +} +{ +add.f16x2 r1180, r1181, r1177; +} +{ +add.f16x2 r1183, r1172, r1173; +} +{ +mul.f16x2 r1186, r1183, r1167; +} +{ +add.f16x2 r1189, r1175, r1186; +} +{ +sub.f16x2 r1192, r1178, r1179; +} +{ +mul.f16x2 r1195, r1192, r1169; +} +{ +add.f16x2 r1198, r1189, r1195; +} +{ +add.f16x2 r1201, r1172, r1173; +} +{ +mul.f16x2 r1204, r1201, r1167; +} +{ +add.f16x2 r1207, r1175, r1204; +} +{ +sub.f16x2 r1210, r1178, r1179; +} +{ +mul.f16x2 r1213, r1210, r1169; +} +{ +sub.f16x2 r1216, r1207, r1213; +} +{ +add.f16x2 r1219, r1178, r1179; +} +{ +mul.f16x2 r1222, r1219, r1167; +} +{ +add.f16x2 r1225, r1181, r1222; +} +{ +sub.f16x2 r1228, r1172, r1173; +} +{ +mul.f16x2 r1231, r1228, r1169; +} +{ +sub.f16x2 r1234, r1225, r1231; +} +{ +add.f16x2 r1237, r1178, r1179; +} +{ +mul.f16x2 r1240, r1237, r1167; +} +{ +add.f16x2 r1243, r1181, r1240; +} +{ +sub.f16x2 r1246, r1172, r1173; +} +{ +mul.f16x2 r1249, r1246, r1169; +} +{ +add.f16x2 r1252, r1243, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1258, {low, high}; +} +{ +mul.f16x2 r1265, r1198, r1255; +} +{ +mul.f16x2 r1268, r1234, 
r1256; +} +{ +sub.f16x2 r1271, r1265, r1268; +} +{ +mul.f16x2 r1274, r1198, r1256; +} +{ +fma.rn.f16x2 r1277, r1234, r1255, r1274; +} +{ +mul.f16x2 r1281, r1216, r1257; +} +{ +mul.f16x2 r1284, r1252, r1258; +} +{ +sub.f16x2 r1287, r1281, r1284; +} +{ +mul.f16x2 r1290, r1216, r1258; +} +{ +fma.rn.f16x2 r1293, r1252, r1257, r1290; +} +{ +add.f16x2 r1297, r1086, r1174; +} +{ +add.f16x2 r1300, r1092, r1180; +} +{ +sub.f16x2 r1303, r1086, r1174; +} +{ +sub.f16x2 r1306, r1092, r1180; +} +{ +add.f16x2 r1309, r1110, r1271; +} +{ +add.f16x2 r1312, r1146, r1277; +} +{ +sub.f16x2 r1315, r1110, r1271; +} +{ +sub.f16x2 r1318, r1146, r1277; +} +{ +add.f16x2 r1321, r1128, r1287; +} +{ +add.f16x2 r1324, r1164, r1293; +} +{ +sub.f16x2 r1327, r1128, r1287; +} +{ +sub.f16x2 r1330, r1164, r1293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1334, {low, high}; +} +{ +neg.f16x2 r1335, r1334; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1344, r1345; +} +{ +add.f16x2 r1346, r1347, r1343; +} +{ +add.f16x2 r1349, r1338, r1339; +} +{ +mul.f16x2 r1352, r1349, r1333; +} +{ +add.f16x2 r1355, r1341, r1352; +} +{ +sub.f16x2 r1358, r1344, r1345; +} +{ +mul.f16x2 r1361, r1358, r1335; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +add.f16x2 r1367, r1338, r1339; +} +{ +mul.f16x2 r1370, r1367, r1333; +} +{ +add.f16x2 r1373, r1341, r1370; +} +{ +sub.f16x2 r1376, r1344, r1345; +} +{ +mul.f16x2 r1379, r1376, r1335; +} +{ +sub.f16x2 r1382, r1373, r1379; +} +{ +add.f16x2 r1385, r1344, r1345; +} +{ +mul.f16x2 r1388, r1385, r1333; +} +{ +add.f16x2 r1391, r1347, r1388; +} +{ +sub.f16x2 r1394, r1338, r1339; +} +{ +mul.f16x2 r1397, r1394, r1335; +} +{ +sub.f16x2 r1400, r1391, r1397; +} +{ +add.f16x2 r1403, r1344, r1345; +} +{ +mul.f16x2 r1406, r1403, r1333; +} +{ +add.f16x2 r1409, r1347, r1406; +} +{ 
+sub.f16x2 r1412, r1338, r1339; +} +{ +mul.f16x2 r1415, r1412, r1335; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1421, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1422, {low, high}; +} +{ +neg.f16x2 r1423, r1422; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1432, r1433; +} +{ +add.f16x2 r1434, r1435, r1431; +} +{ +add.f16x2 r1437, r1426, r1427; +} +{ +mul.f16x2 r1440, r1437, r1421; +} +{ +add.f16x2 r1443, r1429, r1440; +} +{ +sub.f16x2 r1446, r1432, r1433; +} +{ +mul.f16x2 r1449, r1446, r1423; +} +{ +add.f16x2 r1452, r1443, r1449; +} +{ +add.f16x2 r1455, r1426, r1427; +} +{ +mul.f16x2 r1458, r1455, r1421; +} +{ +add.f16x2 r1461, r1429, r1458; +} +{ +sub.f16x2 r1464, r1432, r1433; +} +{ +mul.f16x2 r1467, r1464, r1423; +} +{ +sub.f16x2 r1470, r1461, r1467; +} +{ +add.f16x2 r1473, r1432, r1433; +} +{ +mul.f16x2 r1476, r1473, r1421; +} +{ +add.f16x2 r1479, r1435, r1476; +} +{ +sub.f16x2 r1482, r1426, r1427; +} +{ +mul.f16x2 r1485, r1482, r1423; +} +{ +sub.f16x2 r1488, r1479, r1485; +} +{ +add.f16x2 r1491, r1432, r1433; +} +{ +mul.f16x2 r1494, r1491, r1421; +} +{ +add.f16x2 r1497, r1435, r1494; +} +{ +sub.f16x2 r1500, r1426, r1427; +} +{ +mul.f16x2 r1503, r1500, r1423; +} +{ +add.f16x2 r1506, r1497, r1503; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1512, {low, high}; +} +{ +mul.f16x2 r1519, r1452, r1509; +} +{ +mul.f16x2 r1522, r1488, r1510; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ 
+mul.f16x2 r1528, r1452, r1510; +} +{ +fma.rn.f16x2 r1531, r1488, r1509, r1528; +} +{ +mul.f16x2 r1535, r1470, r1511; +} +{ +mul.f16x2 r1538, r1506, r1512; +} +{ +sub.f16x2 r1541, r1535, r1538; +} +{ +mul.f16x2 r1544, r1470, r1512; +} +{ +fma.rn.f16x2 r1547, r1506, r1511, r1544; +} +{ +add.f16x2 r1551, r1340, r1428; +} +{ +add.f16x2 r1554, r1346, r1434; +} +{ +sub.f16x2 r1557, r1340, r1428; +} +{ +sub.f16x2 r1560, r1346, r1434; +} +{ +add.f16x2 r1563, r1364, r1525; +} +{ +add.f16x2 r1566, r1400, r1531; +} +{ +sub.f16x2 r1569, r1364, r1525; +} +{ +sub.f16x2 r1572, r1400, r1531; +} +{ +add.f16x2 r1575, r1382, r1541; +} +{ +add.f16x2 r1578, r1418, r1547; +} +{ +sub.f16x2 r1581, r1382, r1541; +} +{ +sub.f16x2 r1584, r1418, r1547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r1587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1590, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1593, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1595, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1596, {low, high}; +} +{ +mul.f16x2 r1609, r1563, r1587; +} +{ +mul.f16x2 r1612, r1566, r1588; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1563, r1588; +} +{ +fma.rn.f16x2 r1621, r1566, r1587, r1618; +} +{ +mul.f16x2 r1625, r1575, r1589; +} +{ +mul.f16x2 r1628, r1578, r1590; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r1575, r1590; +} +{ 
+fma.rn.f16x2 r1637, r1578, r1589, r1634; +} +{ +neg.f16x2 r1641, r1557; +} +{ +mul.f16x2 r1643, r1569, r1593; +} +{ +mul.f16x2 r1646, r1572, r1594; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1569, r1594; +} +{ +fma.rn.f16x2 r1655, r1572, r1593, r1652; +} +{ +mul.f16x2 r1659, r1581, r1595; +} +{ +mul.f16x2 r1662, r1584, r1596; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r1581, r1596; +} +{ +fma.rn.f16x2 r1671, r1584, r1595, r1668; +} +{ +add.f16x2 r1675, r1297, r1551; +} +{ +add.f16x2 r1678, r1300, r1554; +} +{ +sub.f16x2 r1681, r1297, r1551; +} +{ +sub.f16x2 r1684, r1300, r1554; +} +{ +add.f16x2 r1687, r1309, r1615; +} +{ +add.f16x2 r1690, r1312, r1621; +} +{ +sub.f16x2 r1693, r1309, r1615; +} +{ +sub.f16x2 r1696, r1312, r1621; +} +{ +add.f16x2 r1699, r1321, r1631; +} +{ +add.f16x2 r1702, r1324, r1637; +} +{ +sub.f16x2 r1705, r1321, r1631; +} +{ +sub.f16x2 r1708, r1324, r1637; +} +{ +add.f16x2 r1711, r1303, r1560; +} +{ +add.f16x2 r1714, r1306, r1641; +} +{ +sub.f16x2 r1717, r1303, r1560; +} +{ +sub.f16x2 r1720, r1306, r1641; +} +{ +add.f16x2 r1723, r1315, r1649; +} +{ +add.f16x2 r1726, r1318, r1655; +} +{ +sub.f16x2 r1729, r1315, r1649; +} +{ +sub.f16x2 r1732, r1318, r1655; +} +{ +add.f16x2 r1735, r1327, r1665; +} +{ +add.f16x2 r1738, r1330, r1671; +} +{ +sub.f16x2 r1741, r1327, r1665; +} +{ +sub.f16x2 r1744, r1330, r1671; +} +mul.wide.u32 rd5, r2832, -1431655765; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r2838, rd6; +cvt.rn.f32.u32 f356, r2838; +mul.f32 f357, f356, 0f3D32B8C2; +cos.approx.f32 f227, f357; +sin.approx.f32 f358, f357; +neg.f32 f228, f358; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f227; +cvt.rn.f16.f32 high, f228; +mov.b32 r1747, {low, high}; +} +mul.lo.s32 r2839, r2838, 12; +sub.s32 r2840, r2832, r2839; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1752, {high, high}; +} +{ +mul.f16x2 r1754, r1690, r1752; +} +{ 
+neg.f16x2 r1757, r1754; +} +{ +fma.rn.f16x2 r1759, r1687, r1750, r1757; +} +{ +mul.f16x2 r1763, r1687, r1752; +} +{ +fma.rn.f16x2 r1766, r1690, r1750, r1763; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1775, r1772, r1774; +} +{ +mul.f16x2 r1778, r1747, r1770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1781, {high, low}; +} +{ +fma.rn.f16x2 r1783, r1775, r1781, r1778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1789, {high, high}; +} +{ +mul.f16x2 r1791, r1702, r1789; +} +{ +neg.f16x2 r1794, r1791; +} +{ +fma.rn.f16x2 r1796, r1699, r1787, r1794; +} +{ +mul.f16x2 r1800, r1699, r1789; +} +{ +fma.rn.f16x2 r1803, r1702, r1787, r1800; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1811, {low, high}; +} +{ +mul.f16x2 r1812, r1809, r1811; +} +{ +mul.f16x2 r1815, r1783, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1818, {high, low}; +} +{ +fma.rn.f16x2 r1820, r1812, r1818, r1815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1826, {high, high}; +} +{ +mul.f16x2 r1828, r1714, r1826; +} +{ +neg.f16x2 r1831, r1828; +} +{ +fma.rn.f16x2 r1833, r1711, r1824, r1831; +} +{ +mul.f16x2 r1837, r1711, r1826; +} +{ +fma.rn.f16x2 r1840, r1714, r1824, r1837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1844, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1848, {low, high}; +} +{ +mul.f16x2 r1849, r1846, r1848; +} +{ +mul.f16x2 r1852, r1820, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1855, {high, low}; +} +{ +fma.rn.f16x2 r1857, r1849, r1855, r1852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1863, {high, high}; +} +{ +mul.f16x2 r1865, r1726, r1863; +} +{ +neg.f16x2 r1868, r1865; +} +{ +fma.rn.f16x2 r1870, r1723, r1861, r1868; +} +{ +mul.f16x2 r1874, r1723, r1863; +} +{ +fma.rn.f16x2 r1877, r1726, r1861, r1874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1885, {low, high}; +} +{ +mul.f16x2 r1886, r1883, r1885; +} +{ +mul.f16x2 r1889, r1857, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1892, {high, low}; +} +{ +fma.rn.f16x2 r1894, r1886, r1892, r1889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1894; +mov.b32 r1898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1894; +mov.b32 r1900, {high, high}; +} +{ +mul.f16x2 r1902, r1738, r1900; +} +{ +neg.f16x2 r1905, r1902; +} +{ +fma.rn.f16x2 r1907, r1735, r1898, r1905; +} +{ +mul.f16x2 r1911, r1735, r1900; +} +{ +fma.rn.f16x2 r1914, r1738, r1898, r1911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1922, {low, high}; +} +{ +mul.f16x2 r1923, r1920, r1922; +} +{ +mul.f16x2 r1926, r1894, r1918; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1894; +mov.b32 r1929, {high, low}; +} +{ +fma.rn.f16x2 r1931, r1923, r1929, r1926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1937, {high, high}; +} +{ +mul.f16x2 r1939, r1684, r1937; +} +{ +neg.f16x2 r1942, r1939; +} +{ +fma.rn.f16x2 r1944, r1681, r1935, r1942; +} +{ +mul.f16x2 r1948, r1681, r1937; +} +{ +fma.rn.f16x2 r1951, r1684, r1935, r1948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1959, {low, high}; +} +{ +mul.f16x2 r1960, r1957, r1959; +} +{ +mul.f16x2 r1963, r1931, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1966, {high, low}; +} +{ +fma.rn.f16x2 r1968, r1960, r1966, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r1972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r1974, {high, high}; +} +{ +mul.f16x2 r1976, r1696, r1974; +} +{ +neg.f16x2 r1979, r1976; +} +{ +fma.rn.f16x2 r1981, r1693, r1972, r1979; +} +{ +mul.f16x2 r1985, r1693, r1974; +} +{ +fma.rn.f16x2 r1988, r1696, r1972, r1985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1996, {low, high}; +} +{ +mul.f16x2 r1997, r1994, r1996; +} +{ +mul.f16x2 r2000, r1968, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r2003, {high, low}; +} +{ +fma.rn.f16x2 r2005, r1997, r2003, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2005; +mov.b32 r2009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2005; +mov.b32 r2011, {high, high}; +} +{ +mul.f16x2 r2013, r1708, r2011; +} +{ +neg.f16x2 r2016, r2013; +} +{ +fma.rn.f16x2 r2018, r1705, r2009, r2016; +} +{ +mul.f16x2 r2022, r1705, r2011; +} +{ +fma.rn.f16x2 r2025, r1708, r2009, r2022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r2031, r2033; +} +{ +mul.f16x2 r2037, r2005, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2005; +mov.b32 r2040, {high, low}; +} +{ +fma.rn.f16x2 r2042, r2034, r2040, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2048, {high, high}; +} +{ +mul.f16x2 r2050, r1720, r2048; +} +{ +neg.f16x2 r2053, r2050; +} +{ +fma.rn.f16x2 r2055, r1717, r2046, r2053; +} +{ +mul.f16x2 r2059, r1717, r2048; +} +{ +fma.rn.f16x2 r2062, r1720, r2046, r2059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r2068, r2070; +} +{ +mul.f16x2 r2074, r2042, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2077, {high, low}; +} +{ +fma.rn.f16x2 r2079, r2071, r2077, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2085, {high, high}; +} +{ +mul.f16x2 r2087, r1732, r2085; +} +{ +neg.f16x2 r2090, r2087; +} +{ +fma.rn.f16x2 r2092, r1729, r2083, r2090; +} +{ +mul.f16x2 r2096, r1729, r2085; +} +{ +fma.rn.f16x2 r2099, r1732, r2083, r2096; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2107, {low, high}; +} +{ +mul.f16x2 r2108, r2105, r2107; +} +{ +mul.f16x2 r2111, r2079, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2114, {high, low}; +} +{ +fma.rn.f16x2 r2116, r2108, r2114, r2111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2116; +mov.b32 r2120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2116; +mov.b32 r2122, {high, high}; +} +{ +mul.f16x2 r2124, r1744, r2122; +} +{ +neg.f16x2 r2127, r2124; +} +{ +fma.rn.f16x2 r2129, r1741, r2120, r2127; +} +{ +mul.f16x2 r2133, r1741, r2122; +} +{ +fma.rn.f16x2 r2136, r1744, r2120, r2133; +} +shl.b32 r2841, r2840, 3; +add.s32 r2842, r2835, r2841; +barrier.sync 0; +mad.lo.s32 r2843, r2838, 1152, r2842; +st.shared.u32 [r2843], r1675; +st.shared.u32 [r2843+4], r1678; +st.shared.u32 [r2843+96], r1759; +st.shared.u32 [r2843+100], r1766; +st.shared.u32 [r2843+192], r1796; +st.shared.u32 [r2843+196], r1803; +st.shared.u32 [r2843+288], r1833; +st.shared.u32 [r2843+292], r1840; +st.shared.u32 [r2843+384], r1870; +st.shared.u32 [r2843+388], r1877; +st.shared.u32 [r2843+480], r1907; +st.shared.u32 [r2843+484], r1914; +st.shared.u32 [r2843+576], r1944; +st.shared.u32 [r2843+580], r1951; +st.shared.u32 [r2843+672], r1981; +st.shared.u32 [r2843+676], r1988; +st.shared.u32 [r2843+768], r2018; +st.shared.u32 [r2843+772], r2025; +st.shared.u32 [r2843+864], r2055; +st.shared.u32 [r2843+868], r2062; +st.shared.u32 [r2843+960], r2092; +st.shared.u32 [r2843+964], r2099; +st.shared.u32 [r2843+1056], r2129; +st.shared.u32 [r2843+1060], r2136; +barrier.sync 0; +ld.shared.u32 r2165, [r2837]; +ld.shared.u32 r2171, [r2837+4]; +ld.shared.u32 r2419, [r2837+1152]; +ld.shared.u32 r2425, [r2837+1156]; +ld.shared.u32 r2253, [r2837+2304]; 
+ld.shared.u32 r2259, [r2837+2308]; +ld.shared.u32 r2507, [r2837+3456]; +ld.shared.u32 r2513, [r2837+3460]; +ld.shared.u32 r2162, [r2837+4608]; +ld.shared.u32 r2168, [r2837+4612]; +ld.shared.u32 r2416, [r2837+5760]; +ld.shared.u32 r2422, [r2837+5764]; +ld.shared.u32 r2250, [r2837+6912]; +ld.shared.u32 r2256, [r2837+6916]; +ld.shared.u32 r2504, [r2837+8064]; +ld.shared.u32 r2510, [r2837+8068]; +ld.shared.u32 r2163, [r2837+9216]; +ld.shared.u32 r2169, [r2837+9220]; +ld.shared.u32 r2417, [r2837+10368]; +ld.shared.u32 r2423, [r2837+10372]; +ld.shared.u32 r2251, [r2837+11520]; +ld.shared.u32 r2257, [r2837+11524]; +ld.shared.u32 r2505, [r2837+12672]; +ld.shared.u32 r2511, [r2837+12676]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2158, {low, high}; +} +{ +neg.f16x2 r2159, r2158; +} +{ +add.f16x2 r2161, r2162, r2163; +} +{ +add.f16x2 r2164, r2165, r2161; +} +{ +add.f16x2 r2167, r2168, r2169; +} +{ +add.f16x2 r2170, r2171, r2167; +} +{ +add.f16x2 r2173, r2162, r2163; +} +{ +mul.f16x2 r2176, r2173, r2157; +} +{ +add.f16x2 r2179, r2165, r2176; +} +{ +sub.f16x2 r2182, r2168, r2169; +} +{ +mul.f16x2 r2185, r2182, r2159; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r2162, r2163; +} +{ +mul.f16x2 r2194, r2191, r2157; +} +{ +add.f16x2 r2197, r2165, r2194; +} +{ +sub.f16x2 r2200, r2168, r2169; +} +{ +mul.f16x2 r2203, r2200, r2159; +} +{ +sub.f16x2 r2206, r2197, r2203; +} +{ +add.f16x2 r2209, r2168, r2169; +} +{ +mul.f16x2 r2212, r2209, r2157; +} +{ +add.f16x2 r2215, r2171, r2212; +} +{ +sub.f16x2 r2218, r2162, r2163; +} +{ +mul.f16x2 r2221, r2218, r2159; +} +{ +sub.f16x2 r2224, r2215, r2221; +} +{ +add.f16x2 r2227, r2168, r2169; +} +{ +mul.f16x2 r2230, r2227, r2157; +} +{ +add.f16x2 r2233, r2171, r2230; +} +{ +sub.f16x2 r2236, r2162, r2163; +} +{ +mul.f16x2 r2239, r2236, r2159; +} +{ +add.f16x2 r2242, r2233, 
r2239; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +add.f16x2 r2249, r2250, r2251; +} +{ +add.f16x2 r2252, r2253, r2249; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2250, r2251; +} +{ +mul.f16x2 r2264, r2261, r2245; +} +{ +add.f16x2 r2267, r2253, r2264; +} +{ +sub.f16x2 r2270, r2256, r2257; +} +{ +mul.f16x2 r2273, r2270, r2247; +} +{ +add.f16x2 r2276, r2267, r2273; +} +{ +add.f16x2 r2279, r2250, r2251; +} +{ +mul.f16x2 r2282, r2279, r2245; +} +{ +add.f16x2 r2285, r2253, r2282; +} +{ +sub.f16x2 r2288, r2256, r2257; +} +{ +mul.f16x2 r2291, r2288, r2247; +} +{ +sub.f16x2 r2294, r2285, r2291; +} +{ +add.f16x2 r2297, r2256, r2257; +} +{ +mul.f16x2 r2300, r2297, r2245; +} +{ +add.f16x2 r2303, r2259, r2300; +} +{ +sub.f16x2 r2306, r2250, r2251; +} +{ +mul.f16x2 r2309, r2306, r2247; +} +{ +sub.f16x2 r2312, r2303, r2309; +} +{ +add.f16x2 r2315, r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +sub.f16x2 r2324, r2250, r2251; +} +{ +mul.f16x2 r2327, r2324, r2247; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r2333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2343, r2276, r2333; +} +{ +mul.f16x2 r2346, r2312, r2334; +} +{ +sub.f16x2 r2349, r2343, r2346; +} +{ +mul.f16x2 r2352, r2276, r2334; +} +{ +fma.rn.f16x2 r2355, r2312, r2333, r2352; +} +{ +mul.f16x2 r2359, 
r2294, r2335; +} +{ +mul.f16x2 r2362, r2330, r2336; +} +{ +sub.f16x2 r2365, r2359, r2362; +} +{ +mul.f16x2 r2368, r2294, r2336; +} +{ +fma.rn.f16x2 r2371, r2330, r2335, r2368; +} +{ +add.f16x2 r2375, r2164, r2252; +} +{ +add.f16x2 r2378, r2170, r2258; +} +{ +sub.f16x2 r2381, r2164, r2252; +} +{ +sub.f16x2 r2384, r2170, r2258; +} +{ +add.f16x2 r2387, r2188, r2349; +} +{ +add.f16x2 r2390, r2224, r2355; +} +{ +sub.f16x2 r2393, r2188, r2349; +} +{ +sub.f16x2 r2396, r2224, r2355; +} +{ +add.f16x2 r2399, r2206, r2365; +} +{ +add.f16x2 r2402, r2242, r2371; +} +{ +sub.f16x2 r2405, r2206, r2365; +} +{ +sub.f16x2 r2408, r2242, r2371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2411, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2412, {low, high}; +} +{ +neg.f16x2 r2413, r2412; +} +{ +add.f16x2 r2415, r2416, r2417; +} +{ +add.f16x2 r2418, r2419, r2415; +} +{ +add.f16x2 r2421, r2422, r2423; +} +{ +add.f16x2 r2424, r2425, r2421; +} +{ +add.f16x2 r2427, r2416, r2417; +} +{ +mul.f16x2 r2430, r2427, r2411; +} +{ +add.f16x2 r2433, r2419, r2430; +} +{ +sub.f16x2 r2436, r2422, r2423; +} +{ +mul.f16x2 r2439, r2436, r2413; +} +{ +add.f16x2 r2442, r2433, r2439; +} +{ +add.f16x2 r2445, r2416, r2417; +} +{ +mul.f16x2 r2448, r2445, r2411; +} +{ +add.f16x2 r2451, r2419, r2448; +} +{ +sub.f16x2 r2454, r2422, r2423; +} +{ +mul.f16x2 r2457, r2454, r2413; +} +{ +sub.f16x2 r2460, r2451, r2457; +} +{ +add.f16x2 r2463, r2422, r2423; +} +{ +mul.f16x2 r2466, r2463, r2411; +} +{ +add.f16x2 r2469, r2425, r2466; +} +{ +sub.f16x2 r2472, r2416, r2417; +} +{ +mul.f16x2 r2475, r2472, r2413; +} +{ +sub.f16x2 r2478, r2469, r2475; +} +{ +add.f16x2 r2481, r2422, r2423; +} +{ +mul.f16x2 r2484, r2481, r2411; +} +{ +add.f16x2 r2487, r2425, r2484; +} +{ +sub.f16x2 r2490, r2416, r2417; +} +{ +mul.f16x2 r2493, r2490, r2413; +} +{ +add.f16x2 r2496, r2487, r2493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2499, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2500, {low, high}; +} +{ +neg.f16x2 r2501, r2500; +} +{ +add.f16x2 r2503, r2504, r2505; +} +{ +add.f16x2 r2506, r2507, r2503; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2509; +} +{ +add.f16x2 r2515, r2504, r2505; +} +{ +mul.f16x2 r2518, r2515, r2499; +} +{ +add.f16x2 r2521, r2507, r2518; +} +{ +sub.f16x2 r2524, r2510, r2511; +} +{ +mul.f16x2 r2527, r2524, r2501; +} +{ +add.f16x2 r2530, r2521, r2527; +} +{ +add.f16x2 r2533, r2504, r2505; +} +{ +mul.f16x2 r2536, r2533, r2499; +} +{ +add.f16x2 r2539, r2507, r2536; +} +{ +sub.f16x2 r2542, r2510, r2511; +} +{ +mul.f16x2 r2545, r2542, r2501; +} +{ +sub.f16x2 r2548, r2539, r2545; +} +{ +add.f16x2 r2551, r2510, r2511; +} +{ +mul.f16x2 r2554, r2551, r2499; +} +{ +add.f16x2 r2557, r2513, r2554; +} +{ +sub.f16x2 r2560, r2504, r2505; +} +{ +mul.f16x2 r2563, r2560, r2501; +} +{ +sub.f16x2 r2566, r2557, r2563; +} +{ +add.f16x2 r2569, r2510, r2511; +} +{ +mul.f16x2 r2572, r2569, r2499; +} +{ +add.f16x2 r2575, r2513, r2572; +} +{ +sub.f16x2 r2578, r2504, r2505; +} +{ +mul.f16x2 r2581, r2578, r2501; +} +{ +add.f16x2 r2584, r2575, r2581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2590, {low, high}; +} +{ +mul.f16x2 r2597, r2530, r2587; +} +{ +mul.f16x2 r2600, r2566, r2588; +} +{ +sub.f16x2 r2603, r2597, r2600; +} +{ +mul.f16x2 r2606, r2530, r2588; +} +{ +fma.rn.f16x2 r2609, r2566, r2587, r2606; +} +{ +mul.f16x2 r2613, r2548, r2589; +} +{ +mul.f16x2 r2616, r2584, r2590; +} +{ 
+sub.f16x2 r2619, r2613, r2616; +} +{ +mul.f16x2 r2622, r2548, r2590; +} +{ +fma.rn.f16x2 r2625, r2584, r2589, r2622; +} +{ +add.f16x2 r2629, r2418, r2506; +} +{ +add.f16x2 r2632, r2424, r2512; +} +{ +sub.f16x2 r2635, r2418, r2506; +} +{ +sub.f16x2 r2638, r2424, r2512; +} +{ +add.f16x2 r2641, r2442, r2603; +} +{ +add.f16x2 r2644, r2478, r2609; +} +{ +sub.f16x2 r2647, r2442, r2603; +} +{ +sub.f16x2 r2650, r2478, r2609; +} +{ +add.f16x2 r2653, r2460, r2619; +} +{ +add.f16x2 r2656, r2496, r2625; +} +{ +sub.f16x2 r2659, r2460, r2619; +} +{ +sub.f16x2 r2662, r2496, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r2665, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2666, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r2667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2668, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2672, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2673, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2674, {low, high}; +} +{ +mul.f16x2 r2687, r2641, r2665; +} +{ +mul.f16x2 r2690, r2644, r2666; +} +{ +sub.f16x2 r2693, r2687, r2690; +} +{ +mul.f16x2 r2696, r2641, r2666; +} +{ +fma.rn.f16x2 r2699, r2644, r2665, r2696; +} +{ +mul.f16x2 r2703, r2653, r2667; +} +{ +mul.f16x2 r2706, r2656, r2668; +} +{ +sub.f16x2 r2709, r2703, r2706; +} +{ +mul.f16x2 r2712, r2653, r2668; +} +{ +fma.rn.f16x2 r2715, r2656, r2667, r2712; +} +{ +neg.f16x2 r2719, r2635; +} +{ +mul.f16x2 r2721, r2647, r2671; +} +{ +mul.f16x2 r2724, r2650, r2672; +} +{ +sub.f16x2 
r2727, r2721, r2724; +} +{ +mul.f16x2 r2730, r2647, r2672; +} +{ +fma.rn.f16x2 r2733, r2650, r2671, r2730; +} +{ +mul.f16x2 r2737, r2659, r2673; +} +{ +mul.f16x2 r2740, r2662, r2674; +} +{ +sub.f16x2 r2743, r2737, r2740; +} +{ +mul.f16x2 r2746, r2659, r2674; +} +{ +fma.rn.f16x2 r2749, r2662, r2673, r2746; +} +{ +add.f16x2 %0, r2375, r2629; +} +{ +add.f16x2 %1, r2378, r2632; +} +{ +sub.f16x2 %12, r2375, r2629; +} +{ +sub.f16x2 %13, r2378, r2632; +} +{ +add.f16x2 %2, r2387, r2693; +} +{ +add.f16x2 %3, r2390, r2699; +} +{ +sub.f16x2 %14, r2387, r2693; +} +{ +sub.f16x2 %15, r2390, r2699; +} +{ +add.f16x2 %4, r2399, r2709; +} +{ +add.f16x2 %5, r2402, r2715; +} +{ +sub.f16x2 %16, r2399, r2709; +} +{ +sub.f16x2 %17, r2402, r2715; +} +{ +add.f16x2 %6, r2381, r2638; +} +{ +add.f16x2 %7, r2384, r2719; +} +{ +sub.f16x2 %18, r2381, r2638; +} +{ +sub.f16x2 %19, r2384, r2719; +} +{ +add.f16x2 %8, r2393, r2727; +} +{ +add.f16x2 %9, r2396, r2733; +} +{ +sub.f16x2 %20, r2393, r2727; +} +{ +sub.f16x2 %21, r2396, r2733; +} +{ +add.f16x2 %10, r2405, r2743; +} +{ +add.f16x2 %11, r2408, r2749; +} +{ +sub.f16x2 %22, r2405, r2743; +} +{ +sub.f16x2 %23, r2408, r2749; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<947, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<359>; +.reg .b32 r<2841>; +.reg .b64 rd<6>; +mov.u32 r2825, %tid.y; +mov.u32 r2826, %24; +mad.lo.s32 r2827, r2825, 6912, r2826; +mov.u32 r2828, %tid.x; +mov.f32 f328, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1, {low, high}; +} +mov.f32 f326, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %33, %41; +} +{ +add.f16x2 r8, %25, r5; +} +{ +add.f16x2 r11, %34, %42; +} +{ +add.f16x2 r14, %26, r11; +} +{ +add.f16x2 r17, %33, %41; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %25, r20; +} +{ +sub.f16x2 r26, %34, %42; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %33, %41; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %25, r38; +} +{ +sub.f16x2 r44, %34, %42; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %34, %42; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %26, r56; +} +{ +sub.f16x2 r62, %33, %41; +} +{ 
+mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %34, %42; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %26, r74; +} +{ +sub.f16x2 r80, %33, %41; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %37, %45; +} +{ +add.f16x2 r96, %29, r93; +} +{ +add.f16x2 r99, %38, %46; +} +{ +add.f16x2 r102, %30, r99; +} +{ +add.f16x2 r105, %37, %45; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %29, r108; +} +{ +sub.f16x2 r114, %38, %46; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %37, %45; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %29, r126; +} +{ +sub.f16x2 r132, %38, %46; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %38, %46; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %30, r144; +} +{ +sub.f16x2 r150, %37, %45; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %38, %46; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %30, r162; +} +{ +sub.f16x2 r168, %37, %45; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f314, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r180, {low, high}; +} +mov.f32 f249, 0fBF800000; +mov.f32 f310, 0f3F5DB3D7; +{ +mul.f16x2 r187, r120, r177; +} 
+{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r256, {low, high}; +} +{ +neg.f16x2 r257, r256; +} +{ +add.f16x2 r259, %35, %43; +} +{ +add.f16x2 r262, %27, r259; +} +{ +add.f16x2 r265, %36, %44; +} +{ +add.f16x2 r268, %28, r265; +} +{ +add.f16x2 r271, %35, %43; +} +{ +mul.f16x2 r274, r271, r255; +} +{ +add.f16x2 r277, %27, r274; +} +{ +sub.f16x2 r280, %36, %44; +} +{ +mul.f16x2 r283, r280, r257; +} +{ +add.f16x2 r286, r277, r283; +} +{ +add.f16x2 r289, %35, %43; +} +{ +mul.f16x2 r292, r289, r255; +} +{ +add.f16x2 r295, %27, r292; +} +{ +sub.f16x2 r298, %36, %44; +} +{ +mul.f16x2 r301, r298, r257; +} +{ +sub.f16x2 r304, r295, r301; +} +{ +add.f16x2 r307, %36, %44; +} +{ +mul.f16x2 r310, r307, r255; +} +{ +add.f16x2 r313, %28, r310; +} +{ +sub.f16x2 r316, %35, %43; +} +{ +mul.f16x2 r319, r316, r257; +} +{ +sub.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %36, %44; +} +{ +mul.f16x2 r328, r325, r255; +} +{ +add.f16x2 r331, %28, r328; +} +{ +sub.f16x2 r334, %35, %43; +} +{ +mul.f16x2 r337, r334, r257; +} +{ +add.f16x2 r340, r331, r337; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; 
+cvt.rn.f16.f32 high, f328; +mov.b32 r343, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r344, {low, high}; +} +{ +neg.f16x2 r345, r344; +} +{ +add.f16x2 r347, %39, %47; +} +{ +add.f16x2 r350, %31, r347; +} +{ +add.f16x2 r353, %40, %48; +} +{ +add.f16x2 r356, %32, r353; +} +{ +add.f16x2 r359, %39, %47; +} +{ +mul.f16x2 r362, r359, r343; +} +{ +add.f16x2 r365, %31, r362; +} +{ +sub.f16x2 r368, %40, %48; +} +{ +mul.f16x2 r371, r368, r345; +} +{ +add.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, %39, %47; +} +{ +mul.f16x2 r380, r377, r343; +} +{ +add.f16x2 r383, %31, r380; +} +{ +sub.f16x2 r386, %40, %48; +} +{ +mul.f16x2 r389, r386, r345; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %40, %48; +} +{ +mul.f16x2 r398, r395, r343; +} +{ +add.f16x2 r401, %32, r398; +} +{ +sub.f16x2 r404, %39, %47; +} +{ +mul.f16x2 r407, r404, r345; +} +{ +sub.f16x2 r410, r401, r407; +} +{ +add.f16x2 r413, %40, %48; +} +{ +mul.f16x2 r416, r413, r343; +} +{ +add.f16x2 r419, %32, r416; +} +{ +sub.f16x2 r422, %39, %47; +} +{ +mul.f16x2 r425, r422, r345; +} +{ +add.f16x2 r428, r419, r425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r434, {low, high}; +} +{ +mul.f16x2 r441, r374, r431; +} +{ +mul.f16x2 r444, r410, r432; +} +{ +sub.f16x2 r447, r441, r444; +} +{ +mul.f16x2 r450, r374, r432; +} +{ +fma.rn.f16x2 r453, r410, r431, r450; +} +{ +mul.f16x2 r457, r392, r433; +} +{ +mul.f16x2 r460, r428, r434; +} +{ +sub.f16x2 r463, r457, r460; +} +{ +mul.f16x2 r466, r392, r434; +} +{ +fma.rn.f16x2 r469, r428, r433, r466; +} +{ +add.f16x2 r473, r262, r350; +} 
+{ +add.f16x2 r476, r268, r356; +} +{ +sub.f16x2 r479, r262, r350; +} +{ +sub.f16x2 r482, r268, r356; +} +{ +add.f16x2 r485, r286, r447; +} +{ +add.f16x2 r488, r322, r453; +} +{ +sub.f16x2 r491, r286, r447; +} +{ +sub.f16x2 r494, r322, r453; +} +{ +add.f16x2 r497, r304, r463; +} +{ +add.f16x2 r500, r340, r469; +} +{ +sub.f16x2 r503, r304, r463; +} +{ +sub.f16x2 r506, r340, r469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r512, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r516, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r517, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r518, {low, high}; +} +mov.f32 f250, 0f3F800000; +{ +mul.f16x2 r531, r485, r509; +} +{ +mul.f16x2 r534, r488, r510; +} +{ +sub.f16x2 r537, r531, r534; +} +{ +mul.f16x2 r540, r485, r510; +} +{ +fma.rn.f16x2 r543, r488, r509, r540; +} +{ +mul.f16x2 r547, r497, r511; +} +{ +mul.f16x2 r550, r500, r512; +} +{ +sub.f16x2 r553, r547, r550; +} +{ +mul.f16x2 r556, r497, r512; +} +{ +fma.rn.f16x2 r559, r500, r511, r556; +} +{ +neg.f16x2 r563, r479; +} +{ +mul.f16x2 r565, r491, r515; +} +{ +mul.f16x2 r568, r494, r516; +} +{ +sub.f16x2 r571, r565, r568; +} +{ +mul.f16x2 r574, r491, r516; +} +{ +fma.rn.f16x2 r577, r494, r515, r574; +} +{ +mul.f16x2 r581, r503, r517; +} +{ +mul.f16x2 r584, r506, r518; +} +{ +sub.f16x2 r587, r581, r584; +} +{ +mul.f16x2 
r590, r503, r518; +} +{ +fma.rn.f16x2 r593, r506, r517, r590; +} +{ +add.f16x2 r597, r219, r473; +} +{ +add.f16x2 r600, r222, r476; +} +{ +sub.f16x2 r603, r219, r473; +} +{ +sub.f16x2 r606, r222, r476; +} +{ +add.f16x2 r609, r231, r537; +} +{ +add.f16x2 r612, r234, r543; +} +{ +sub.f16x2 r615, r231, r537; +} +{ +sub.f16x2 r618, r234, r543; +} +{ +add.f16x2 r621, r243, r553; +} +{ +add.f16x2 r624, r246, r559; +} +{ +sub.f16x2 r627, r243, r553; +} +{ +sub.f16x2 r630, r246, r559; +} +{ +add.f16x2 r633, r225, r482; +} +{ +add.f16x2 r636, r228, r563; +} +{ +sub.f16x2 r639, r225, r482; +} +{ +sub.f16x2 r642, r228, r563; +} +{ +add.f16x2 r645, r237, r571; +} +{ +add.f16x2 r648, r240, r577; +} +{ +sub.f16x2 r651, r237, r571; +} +{ +sub.f16x2 r654, r240, r577; +} +{ +add.f16x2 r657, r249, r587; +} +{ +add.f16x2 r660, r252, r593; +} +{ +sub.f16x2 r663, r249, r587; +} +{ +sub.f16x2 r666, r252, r593; +} +mul.wide.u32 rd2, r2828, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r2829, rd3; +mul.lo.s32 r2830, r2829, 144; +sub.s32 r2831, r2828, r2830; +mad.lo.s32 r2832, r2829, 6912, r2827; +cvt.rn.f32.u32 f353, r2831; +mul.f32 f354, f353, 0f3B6E4BAE; +cos.approx.f32 f101, f354; +sin.approx.f32 f355, f354; +neg.f32 f102, f355; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r669, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r674, {high, high}; +} +{ +mul.f16x2 r676, r612, r674; +} +{ +neg.f16x2 r679, r676; +} +{ +fma.rn.f16x2 r681, r609, r672, r679; +} +{ +mul.f16x2 r685, r609, r674; +} +{ +fma.rn.f16x2 r688, r612, r672, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r694, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r696, {low, high}; +} +{ +mul.f16x2 r697, r694, 
r696; +} +{ +mul.f16x2 r700, r669, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r703, {high, low}; +} +{ +fma.rn.f16x2 r705, r697, r703, r700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r711, {high, high}; +} +{ +mul.f16x2 r713, r624, r711; +} +{ +neg.f16x2 r716, r713; +} +{ +fma.rn.f16x2 r718, r621, r709, r716; +} +{ +mul.f16x2 r722, r621, r711; +} +{ +fma.rn.f16x2 r725, r624, r709, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r733, {low, high}; +} +{ +mul.f16x2 r734, r731, r733; +} +{ +mul.f16x2 r737, r705, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r705; +mov.b32 r740, {high, low}; +} +{ +fma.rn.f16x2 r742, r734, r740, r737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r748, {high, high}; +} +{ +mul.f16x2 r750, r636, r748; +} +{ +neg.f16x2 r753, r750; +} +{ +fma.rn.f16x2 r755, r633, r746, r753; +} +{ +mul.f16x2 r759, r633, r748; +} +{ +fma.rn.f16x2 r762, r636, r746, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r770, {low, high}; +} +{ +mul.f16x2 r771, r768, r770; +} +{ +mul.f16x2 r774, r742, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r742; +mov.b32 r777, {high, low}; +} +{ +fma.rn.f16x2 r779, r771, r777, r774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r785, {high, 
high}; +} +{ +mul.f16x2 r787, r648, r785; +} +{ +neg.f16x2 r790, r787; +} +{ +fma.rn.f16x2 r792, r645, r783, r790; +} +{ +mul.f16x2 r796, r645, r785; +} +{ +fma.rn.f16x2 r799, r648, r783, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r807, {low, high}; +} +{ +mul.f16x2 r808, r805, r807; +} +{ +mul.f16x2 r811, r779, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r779; +mov.b32 r814, {high, low}; +} +{ +fma.rn.f16x2 r816, r808, r814, r811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r822, {high, high}; +} +{ +mul.f16x2 r824, r660, r822; +} +{ +neg.f16x2 r827, r824; +} +{ +fma.rn.f16x2 r829, r657, r820, r827; +} +{ +mul.f16x2 r833, r657, r822; +} +{ +fma.rn.f16x2 r836, r660, r820, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r844, {low, high}; +} +{ +mul.f16x2 r845, r842, r844; +} +{ +mul.f16x2 r848, r816, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r816; +mov.b32 r851, {high, low}; +} +{ +fma.rn.f16x2 r853, r845, r851, r848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r859, {high, high}; +} +{ +mul.f16x2 r861, r606, r859; +} +{ +neg.f16x2 r864, r861; +} +{ +fma.rn.f16x2 r866, r603, r857, r864; +} +{ +mul.f16x2 r870, r603, r859; +} +{ +fma.rn.f16x2 r873, r606, r857, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; 
+mov.b32 r879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r881, {low, high}; +} +{ +mul.f16x2 r882, r879, r881; +} +{ +mul.f16x2 r885, r853, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r853; +mov.b32 r888, {high, low}; +} +{ +fma.rn.f16x2 r890, r882, r888, r885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r896, {high, high}; +} +{ +mul.f16x2 r898, r618, r896; +} +{ +neg.f16x2 r901, r898; +} +{ +fma.rn.f16x2 r903, r615, r894, r901; +} +{ +mul.f16x2 r907, r615, r896; +} +{ +fma.rn.f16x2 r910, r618, r894, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r919, r916, r918; +} +{ +mul.f16x2 r922, r890, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r890; +mov.b32 r925, {high, low}; +} +{ +fma.rn.f16x2 r927, r919, r925, r922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r933, {high, high}; +} +{ +mul.f16x2 r935, r630, r933; +} +{ +neg.f16x2 r938, r935; +} +{ +fma.rn.f16x2 r940, r627, r931, r938; +} +{ +mul.f16x2 r944, r627, r933; +} +{ +fma.rn.f16x2 r947, r630, r931, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r955, {low, high}; +} +{ +mul.f16x2 r956, r953, r955; +} +{ +mul.f16x2 r959, r927, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r927; +mov.b32 r962, {high, low}; +} +{ +fma.rn.f16x2 r964, r956, r962, 
r959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r970, {high, high}; +} +{ +mul.f16x2 r972, r642, r970; +} +{ +neg.f16x2 r975, r972; +} +{ +fma.rn.f16x2 r977, r639, r968, r975; +} +{ +mul.f16x2 r981, r639, r970; +} +{ +fma.rn.f16x2 r984, r642, r968, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r992, {low, high}; +} +{ +mul.f16x2 r993, r990, r992; +} +{ +mul.f16x2 r996, r964, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r964; +mov.b32 r999, {high, low}; +} +{ +fma.rn.f16x2 r1001, r993, r999, r996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1007, {high, high}; +} +{ +mul.f16x2 r1009, r654, r1007; +} +{ +neg.f16x2 r1012, r1009; +} +{ +fma.rn.f16x2 r1014, r651, r1005, r1012; +} +{ +mul.f16x2 r1018, r651, r1007; +} +{ +fma.rn.f16x2 r1021, r654, r1005, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r669; +mov.b32 r1027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1029, {low, high}; +} +{ +mul.f16x2 r1030, r1027, r1029; +} +{ +mul.f16x2 r1033, r1001, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1001; +mov.b32 r1036, {high, low}; +} +{ +fma.rn.f16x2 r1038, r1030, r1036, r1033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1038; +mov.b32 r1044, {high, high}; +} +{ +mul.f16x2 r1046, r666, r1044; +} +{ +neg.f16x2 r1049, r1046; +} +{ +fma.rn.f16x2 r1051, r663, r1042, r1049; +} +{ 
+mul.f16x2 r1055, r663, r1044; +} +{ +fma.rn.f16x2 r1058, r666, r1042, r1055; +} +barrier.sync 0; +mad.lo.s32 r2833, r2831, 48, r2832; +st.shared.v4.f32 [r2833], {r597, r681, r718, r755}; +st.shared.v4.f32 [r2833+16], {r792, r829, r866, r903}; +st.shared.v4.f32 [r2833+32], {r940, r977, r1014, r1051}; +barrier.sync 0; +mad.lo.s32 r2834, r2831, -44, r2833; +ld.shared.u32 r1087, [r2834]; +ld.shared.u32 r1341, [r2834+576]; +ld.shared.u32 r1175, [r2834+1152]; +ld.shared.u32 r1429, [r2834+1728]; +ld.shared.u32 r1084, [r2834+2304]; +ld.shared.u32 r1338, [r2834+2880]; +ld.shared.u32 r1172, [r2834+3456]; +ld.shared.u32 r1426, [r2834+4032]; +ld.shared.u32 r1085, [r2834+4608]; +ld.shared.u32 r1339, [r2834+5184]; +ld.shared.u32 r1173, [r2834+5760]; +ld.shared.u32 r1427, [r2834+6336]; +barrier.sync 0; +st.shared.v4.f32 [r2833], {r600, r688, r725, r762}; +st.shared.v4.f32 [r2833+16], {r799, r836, r873, r910}; +st.shared.v4.f32 [r2833+32], {r947, r984, r1021, r1058}; +barrier.sync 0; +ld.shared.u32 r1093, [r2834]; +ld.shared.u32 r1347, [r2834+576]; +ld.shared.u32 r1181, [r2834+1152]; +ld.shared.u32 r1435, [r2834+1728]; +ld.shared.u32 r1090, [r2834+2304]; +ld.shared.u32 r1344, [r2834+2880]; +ld.shared.u32 r1178, [r2834+3456]; +ld.shared.u32 r1432, [r2834+4032]; +ld.shared.u32 r1091, [r2834+4608]; +ld.shared.u32 r1345, [r2834+5184]; +ld.shared.u32 r1179, [r2834+5760]; +ld.shared.u32 r1433, [r2834+6336]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1080, {low, high}; +} +{ +neg.f16x2 r1081, r1080; +} +{ +add.f16x2 r1083, r1084, r1085; +} +{ +add.f16x2 r1086, r1087, r1083; +} +{ +add.f16x2 r1089, r1090, r1091; +} +{ +add.f16x2 r1092, r1093, r1089; +} +{ +add.f16x2 r1095, r1084, r1085; +} +{ +mul.f16x2 r1098, r1095, r1079; +} +{ +add.f16x2 r1101, r1087, r1098; +} +{ +sub.f16x2 r1104, r1090, r1091; +} +{ +mul.f16x2 r1107, r1104, 
r1081; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r1084, r1085; +} +{ +mul.f16x2 r1116, r1113, r1079; +} +{ +add.f16x2 r1119, r1087, r1116; +} +{ +sub.f16x2 r1122, r1090, r1091; +} +{ +mul.f16x2 r1125, r1122, r1081; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r1090, r1091; +} +{ +mul.f16x2 r1134, r1131, r1079; +} +{ +add.f16x2 r1137, r1093, r1134; +} +{ +sub.f16x2 r1140, r1084, r1085; +} +{ +mul.f16x2 r1143, r1140, r1081; +} +{ +sub.f16x2 r1146, r1137, r1143; +} +{ +add.f16x2 r1149, r1090, r1091; +} +{ +mul.f16x2 r1152, r1149, r1079; +} +{ +add.f16x2 r1155, r1093, r1152; +} +{ +sub.f16x2 r1158, r1084, r1085; +} +{ +mul.f16x2 r1161, r1158, r1081; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1167, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1168, {low, high}; +} +{ +neg.f16x2 r1169, r1168; +} +{ +add.f16x2 r1171, r1172, r1173; +} +{ +add.f16x2 r1174, r1175, r1171; +} +{ +add.f16x2 r1177, r1178, r1179; +} +{ +add.f16x2 r1180, r1181, r1177; +} +{ +add.f16x2 r1183, r1172, r1173; +} +{ +mul.f16x2 r1186, r1183, r1167; +} +{ +add.f16x2 r1189, r1175, r1186; +} +{ +sub.f16x2 r1192, r1178, r1179; +} +{ +mul.f16x2 r1195, r1192, r1169; +} +{ +add.f16x2 r1198, r1189, r1195; +} +{ +add.f16x2 r1201, r1172, r1173; +} +{ +mul.f16x2 r1204, r1201, r1167; +} +{ +add.f16x2 r1207, r1175, r1204; +} +{ +sub.f16x2 r1210, r1178, r1179; +} +{ +mul.f16x2 r1213, r1210, r1169; +} +{ +sub.f16x2 r1216, r1207, r1213; +} +{ +add.f16x2 r1219, r1178, r1179; +} +{ +mul.f16x2 r1222, r1219, r1167; +} +{ +add.f16x2 r1225, r1181, r1222; +} +{ +sub.f16x2 r1228, r1172, r1173; +} +{ +mul.f16x2 r1231, r1228, r1169; +} +{ +sub.f16x2 r1234, r1225, r1231; +} +{ +add.f16x2 r1237, r1178, r1179; +} +{ +mul.f16x2 r1240, r1237, r1167; +} +{ +add.f16x2 r1243, r1181, r1240; +} +{ +sub.f16x2 r1246, r1172, r1173; +} +{ +mul.f16x2 r1249, 
r1246, r1169; +} +{ +add.f16x2 r1252, r1243, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1258, {low, high}; +} +{ +mul.f16x2 r1265, r1198, r1255; +} +{ +mul.f16x2 r1268, r1234, r1256; +} +{ +sub.f16x2 r1271, r1265, r1268; +} +{ +mul.f16x2 r1274, r1198, r1256; +} +{ +fma.rn.f16x2 r1277, r1234, r1255, r1274; +} +{ +mul.f16x2 r1281, r1216, r1257; +} +{ +mul.f16x2 r1284, r1252, r1258; +} +{ +sub.f16x2 r1287, r1281, r1284; +} +{ +mul.f16x2 r1290, r1216, r1258; +} +{ +fma.rn.f16x2 r1293, r1252, r1257, r1290; +} +{ +add.f16x2 r1297, r1086, r1174; +} +{ +add.f16x2 r1300, r1092, r1180; +} +{ +sub.f16x2 r1303, r1086, r1174; +} +{ +sub.f16x2 r1306, r1092, r1180; +} +{ +add.f16x2 r1309, r1110, r1271; +} +{ +add.f16x2 r1312, r1146, r1277; +} +{ +sub.f16x2 r1315, r1110, r1271; +} +{ +sub.f16x2 r1318, r1146, r1277; +} +{ +add.f16x2 r1321, r1128, r1287; +} +{ +add.f16x2 r1324, r1164, r1293; +} +{ +sub.f16x2 r1327, r1128, r1287; +} +{ +sub.f16x2 r1330, r1164, r1293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1334, {low, high}; +} +{ +neg.f16x2 r1335, r1334; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1337; +} +{ +add.f16x2 r1343, r1344, r1345; +} +{ +add.f16x2 r1346, r1347, r1343; +} +{ +add.f16x2 r1349, r1338, r1339; +} +{ +mul.f16x2 r1352, r1349, r1333; +} +{ +add.f16x2 r1355, r1341, r1352; +} +{ +sub.f16x2 r1358, r1344, r1345; +} +{ +mul.f16x2 r1361, r1358, r1335; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ 
+add.f16x2 r1367, r1338, r1339; +} +{ +mul.f16x2 r1370, r1367, r1333; +} +{ +add.f16x2 r1373, r1341, r1370; +} +{ +sub.f16x2 r1376, r1344, r1345; +} +{ +mul.f16x2 r1379, r1376, r1335; +} +{ +sub.f16x2 r1382, r1373, r1379; +} +{ +add.f16x2 r1385, r1344, r1345; +} +{ +mul.f16x2 r1388, r1385, r1333; +} +{ +add.f16x2 r1391, r1347, r1388; +} +{ +sub.f16x2 r1394, r1338, r1339; +} +{ +mul.f16x2 r1397, r1394, r1335; +} +{ +sub.f16x2 r1400, r1391, r1397; +} +{ +add.f16x2 r1403, r1344, r1345; +} +{ +mul.f16x2 r1406, r1403, r1333; +} +{ +add.f16x2 r1409, r1347, r1406; +} +{ +sub.f16x2 r1412, r1338, r1339; +} +{ +mul.f16x2 r1415, r1412, r1335; +} +{ +add.f16x2 r1418, r1409, r1415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1421, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1422, {low, high}; +} +{ +neg.f16x2 r1423, r1422; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1425; +} +{ +add.f16x2 r1431, r1432, r1433; +} +{ +add.f16x2 r1434, r1435, r1431; +} +{ +add.f16x2 r1437, r1426, r1427; +} +{ +mul.f16x2 r1440, r1437, r1421; +} +{ +add.f16x2 r1443, r1429, r1440; +} +{ +sub.f16x2 r1446, r1432, r1433; +} +{ +mul.f16x2 r1449, r1446, r1423; +} +{ +add.f16x2 r1452, r1443, r1449; +} +{ +add.f16x2 r1455, r1426, r1427; +} +{ +mul.f16x2 r1458, r1455, r1421; +} +{ +add.f16x2 r1461, r1429, r1458; +} +{ +sub.f16x2 r1464, r1432, r1433; +} +{ +mul.f16x2 r1467, r1464, r1423; +} +{ +sub.f16x2 r1470, r1461, r1467; +} +{ +add.f16x2 r1473, r1432, r1433; +} +{ +mul.f16x2 r1476, r1473, r1421; +} +{ +add.f16x2 r1479, r1435, r1476; +} +{ +sub.f16x2 r1482, r1426, r1427; +} +{ +mul.f16x2 r1485, r1482, r1423; +} +{ +sub.f16x2 r1488, r1479, r1485; +} +{ +add.f16x2 r1491, r1432, r1433; +} +{ +mul.f16x2 r1494, r1491, r1421; +} +{ +add.f16x2 r1497, r1435, r1494; +} +{ +sub.f16x2 r1500, r1426, r1427; +} +{ +mul.f16x2 r1503, r1500, r1423; +} +{ +add.f16x2 r1506, r1497, r1503; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1510, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1512, {low, high}; +} +{ +mul.f16x2 r1519, r1452, r1509; +} +{ +mul.f16x2 r1522, r1488, r1510; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1452, r1510; +} +{ +fma.rn.f16x2 r1531, r1488, r1509, r1528; +} +{ +mul.f16x2 r1535, r1470, r1511; +} +{ +mul.f16x2 r1538, r1506, r1512; +} +{ +sub.f16x2 r1541, r1535, r1538; +} +{ +mul.f16x2 r1544, r1470, r1512; +} +{ +fma.rn.f16x2 r1547, r1506, r1511, r1544; +} +{ +add.f16x2 r1551, r1340, r1428; +} +{ +add.f16x2 r1554, r1346, r1434; +} +{ +sub.f16x2 r1557, r1340, r1428; +} +{ +sub.f16x2 r1560, r1346, r1434; +} +{ +add.f16x2 r1563, r1364, r1525; +} +{ +add.f16x2 r1566, r1400, r1531; +} +{ +sub.f16x2 r1569, r1364, r1525; +} +{ +sub.f16x2 r1572, r1400, r1531; +} +{ +add.f16x2 r1575, r1382, r1541; +} +{ +add.f16x2 r1578, r1418, r1547; +} +{ +sub.f16x2 r1581, r1382, r1541; +} +{ +sub.f16x2 r1584, r1418, r1547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r1587, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r1589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1590, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1593, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1594, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1595, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1596, {low, high}; +} +{ +mul.f16x2 r1609, r1563, r1587; +} +{ +mul.f16x2 r1612, r1566, r1588; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1563, r1588; +} +{ +fma.rn.f16x2 r1621, r1566, r1587, r1618; +} +{ +mul.f16x2 r1625, r1575, r1589; +} +{ +mul.f16x2 r1628, r1578, r1590; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r1575, r1590; +} +{ +fma.rn.f16x2 r1637, r1578, r1589, r1634; +} +{ +neg.f16x2 r1641, r1557; +} +{ +mul.f16x2 r1643, r1569, r1593; +} +{ +mul.f16x2 r1646, r1572, r1594; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1569, r1594; +} +{ +fma.rn.f16x2 r1655, r1572, r1593, r1652; +} +{ +mul.f16x2 r1659, r1581, r1595; +} +{ +mul.f16x2 r1662, r1584, r1596; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r1581, r1596; +} +{ +fma.rn.f16x2 r1671, r1584, r1595, r1668; +} +{ +add.f16x2 r1675, r1297, r1551; +} +{ +add.f16x2 r1678, r1300, r1554; +} +{ +sub.f16x2 r1681, r1297, r1551; +} +{ +sub.f16x2 r1684, r1300, r1554; +} +{ +add.f16x2 r1687, r1309, r1615; +} +{ +add.f16x2 r1690, r1312, r1621; +} +{ +sub.f16x2 r1693, r1309, r1615; +} +{ +sub.f16x2 r1696, r1312, r1621; +} +{ +add.f16x2 r1699, r1321, r1631; +} +{ +add.f16x2 r1702, r1324, r1637; +} +{ +sub.f16x2 r1705, r1321, r1631; +} +{ +sub.f16x2 r1708, r1324, r1637; +} +{ +add.f16x2 r1711, r1303, r1560; +} +{ +add.f16x2 r1714, r1306, r1641; +} +{ +sub.f16x2 r1717, r1303, r1560; +} +{ +sub.f16x2 r1720, r1306, r1641; +} +{ +add.f16x2 r1723, r1315, r1649; +} +{ +add.f16x2 r1726, r1318, r1655; +} +{ +sub.f16x2 r1729, r1315, r1649; +} +{ +sub.f16x2 r1732, r1318, r1655; +} +{ +add.f16x2 r1735, r1327, r1665; +} +{ +add.f16x2 r1738, r1330, r1671; +} +{ +sub.f16x2 r1741, r1327, r1665; +} +{ +sub.f16x2 r1744, r1330, r1671; +} +mul.wide.u32 rd4, r2831, -1431655765; +shr.u64 
rd5, rd4, 35; +cvt.u32.u64 r2835, rd5; +mul.lo.s32 r2836, r2835, 12; +sub.s32 r2837, r2831, r2836; +shl.b32 r2838, r2837, 2; +add.s32 r2839, r2832, r2838; +cvt.rn.f32.u32 f356, r2835; +mul.f32 f357, f356, 0f3D32B8C2; +cos.approx.f32 f227, f357; +sin.approx.f32 f358, f357; +neg.f32 f228, f358; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f227; +cvt.rn.f16.f32 high, f228; +mov.b32 r1747, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1752, {high, high}; +} +{ +mul.f16x2 r1754, r1690, r1752; +} +{ +neg.f16x2 r1757, r1754; +} +{ +fma.rn.f16x2 r1759, r1687, r1750, r1757; +} +{ +mul.f16x2 r1763, r1687, r1752; +} +{ +fma.rn.f16x2 r1766, r1690, r1750, r1763; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1774, {low, high}; +} +{ +mul.f16x2 r1775, r1772, r1774; +} +{ +mul.f16x2 r1778, r1747, r1770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1781, {high, low}; +} +{ +fma.rn.f16x2 r1783, r1775, r1781, r1778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1789, {high, high}; +} +{ +mul.f16x2 r1791, r1702, r1789; +} +{ +neg.f16x2 r1794, r1791; +} +{ +fma.rn.f16x2 r1796, r1699, r1787, r1794; +} +{ +mul.f16x2 r1800, r1699, r1789; +} +{ +fma.rn.f16x2 r1803, r1702, r1787, r1800; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1811, {low, high}; +} +{ +mul.f16x2 r1812, r1809, r1811; +} +{ +mul.f16x2 r1815, r1783, 
r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1783; +mov.b32 r1818, {high, low}; +} +{ +fma.rn.f16x2 r1820, r1812, r1818, r1815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1826, {high, high}; +} +{ +mul.f16x2 r1828, r1714, r1826; +} +{ +neg.f16x2 r1831, r1828; +} +{ +fma.rn.f16x2 r1833, r1711, r1824, r1831; +} +{ +mul.f16x2 r1837, r1711, r1826; +} +{ +fma.rn.f16x2 r1840, r1714, r1824, r1837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1848, {low, high}; +} +{ +mul.f16x2 r1849, r1846, r1848; +} +{ +mul.f16x2 r1852, r1820, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1820; +mov.b32 r1855, {high, low}; +} +{ +fma.rn.f16x2 r1857, r1849, r1855, r1852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1863, {high, high}; +} +{ +mul.f16x2 r1865, r1726, r1863; +} +{ +neg.f16x2 r1868, r1865; +} +{ +fma.rn.f16x2 r1870, r1723, r1861, r1868; +} +{ +mul.f16x2 r1874, r1723, r1863; +} +{ +fma.rn.f16x2 r1877, r1726, r1861, r1874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1885, {low, high}; +} +{ +mul.f16x2 r1886, r1883, r1885; +} +{ +mul.f16x2 r1889, r1857, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1857; +mov.b32 r1892, {high, low}; +} +{ +fma.rn.f16x2 r1894, r1886, r1892, r1889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1894; +mov.b32 r1898, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1894; +mov.b32 r1900, {high, high}; +} +{ +mul.f16x2 r1902, r1738, r1900; +} +{ +neg.f16x2 r1905, r1902; +} +{ +fma.rn.f16x2 r1907, r1735, r1898, r1905; +} +{ +mul.f16x2 r1911, r1735, r1900; +} +{ +fma.rn.f16x2 r1914, r1738, r1898, r1911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1922, {low, high}; +} +{ +mul.f16x2 r1923, r1920, r1922; +} +{ +mul.f16x2 r1926, r1894, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1894; +mov.b32 r1929, {high, low}; +} +{ +fma.rn.f16x2 r1931, r1923, r1929, r1926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1937, {high, high}; +} +{ +mul.f16x2 r1939, r1684, r1937; +} +{ +neg.f16x2 r1942, r1939; +} +{ +fma.rn.f16x2 r1944, r1681, r1935, r1942; +} +{ +mul.f16x2 r1948, r1681, r1937; +} +{ +fma.rn.f16x2 r1951, r1684, r1935, r1948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1959, {low, high}; +} +{ +mul.f16x2 r1960, r1957, r1959; +} +{ +mul.f16x2 r1963, r1931, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1931; +mov.b32 r1966, {high, low}; +} +{ +fma.rn.f16x2 r1968, r1960, r1966, r1963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r1972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r1974, {high, high}; +} +{ +mul.f16x2 r1976, r1696, r1974; +} +{ +neg.f16x2 r1979, r1976; +} +{ +fma.rn.f16x2 r1981, r1693, r1972, r1979; +} +{ +mul.f16x2 r1985, r1693, r1974; +} +{ +fma.rn.f16x2 r1988, r1696, r1972, r1985; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r1994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1996, {low, high}; +} +{ +mul.f16x2 r1997, r1994, r1996; +} +{ +mul.f16x2 r2000, r1968, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1968; +mov.b32 r2003, {high, low}; +} +{ +fma.rn.f16x2 r2005, r1997, r2003, r2000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2005; +mov.b32 r2009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2005; +mov.b32 r2011, {high, high}; +} +{ +mul.f16x2 r2013, r1708, r2011; +} +{ +neg.f16x2 r2016, r2013; +} +{ +fma.rn.f16x2 r2018, r1705, r2009, r2016; +} +{ +mul.f16x2 r2022, r1705, r2011; +} +{ +fma.rn.f16x2 r2025, r1708, r2009, r2022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2033, {low, high}; +} +{ +mul.f16x2 r2034, r2031, r2033; +} +{ +mul.f16x2 r2037, r2005, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2005; +mov.b32 r2040, {high, low}; +} +{ +fma.rn.f16x2 r2042, r2034, r2040, r2037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2048, {high, high}; +} +{ +mul.f16x2 r2050, r1720, r2048; +} +{ +neg.f16x2 r2053, r2050; +} +{ +fma.rn.f16x2 r2055, r1717, r2046, r2053; +} +{ +mul.f16x2 r2059, r1717, r2048; +} +{ +fma.rn.f16x2 r2062, r1720, r2046, r2059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; 
+mov.b32 r2070, {low, high}; +} +{ +mul.f16x2 r2071, r2068, r2070; +} +{ +mul.f16x2 r2074, r2042, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2042; +mov.b32 r2077, {high, low}; +} +{ +fma.rn.f16x2 r2079, r2071, r2077, r2074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2085, {high, high}; +} +{ +mul.f16x2 r2087, r1732, r2085; +} +{ +neg.f16x2 r2090, r2087; +} +{ +fma.rn.f16x2 r2092, r1729, r2083, r2090; +} +{ +mul.f16x2 r2096, r1729, r2085; +} +{ +fma.rn.f16x2 r2099, r1732, r2083, r2096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1747; +mov.b32 r2105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2107, {low, high}; +} +{ +mul.f16x2 r2108, r2105, r2107; +} +{ +mul.f16x2 r2111, r2079, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2079; +mov.b32 r2114, {high, low}; +} +{ +fma.rn.f16x2 r2116, r2108, r2114, r2111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2116; +mov.b32 r2120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2116; +mov.b32 r2122, {high, high}; +} +{ +mul.f16x2 r2124, r1744, r2122; +} +{ +neg.f16x2 r2127, r2124; +} +{ +fma.rn.f16x2 r2129, r1741, r2120, r2127; +} +{ +mul.f16x2 r2133, r1741, r2122; +} +{ +fma.rn.f16x2 r2136, r1744, r2120, r2133; +} +barrier.sync 0; +mad.lo.s32 r2840, r2835, 576, r2839; +st.shared.u32 [r2840], r1675; +st.shared.u32 [r2840+48], r1759; +st.shared.u32 [r2840+96], r1796; +st.shared.u32 [r2840+144], r1833; +st.shared.u32 [r2840+192], r1870; +st.shared.u32 [r2840+240], r1907; +st.shared.u32 [r2840+288], r1944; +st.shared.u32 [r2840+336], r1981; +st.shared.u32 [r2840+384], r2018; +st.shared.u32 [r2840+432], r2055; +st.shared.u32 [r2840+480], r2092; +st.shared.u32 [r2840+528], r2129; +barrier.sync 0; +ld.shared.u32 r2165, 
[r2834]; +ld.shared.u32 r2419, [r2834+576]; +ld.shared.u32 r2253, [r2834+1152]; +ld.shared.u32 r2507, [r2834+1728]; +ld.shared.u32 r2162, [r2834+2304]; +ld.shared.u32 r2416, [r2834+2880]; +ld.shared.u32 r2250, [r2834+3456]; +ld.shared.u32 r2504, [r2834+4032]; +ld.shared.u32 r2163, [r2834+4608]; +ld.shared.u32 r2417, [r2834+5184]; +ld.shared.u32 r2251, [r2834+5760]; +ld.shared.u32 r2505, [r2834+6336]; +barrier.sync 0; +st.shared.u32 [r2840], r1678; +st.shared.u32 [r2840+48], r1766; +st.shared.u32 [r2840+96], r1803; +st.shared.u32 [r2840+144], r1840; +st.shared.u32 [r2840+192], r1877; +st.shared.u32 [r2840+240], r1914; +st.shared.u32 [r2840+288], r1951; +st.shared.u32 [r2840+336], r1988; +st.shared.u32 [r2840+384], r2025; +st.shared.u32 [r2840+432], r2062; +st.shared.u32 [r2840+480], r2099; +st.shared.u32 [r2840+528], r2136; +barrier.sync 0; +ld.shared.u32 r2171, [r2834]; +ld.shared.u32 r2425, [r2834+576]; +ld.shared.u32 r2259, [r2834+1152]; +ld.shared.u32 r2513, [r2834+1728]; +ld.shared.u32 r2168, [r2834+2304]; +ld.shared.u32 r2422, [r2834+2880]; +ld.shared.u32 r2256, [r2834+3456]; +ld.shared.u32 r2510, [r2834+4032]; +ld.shared.u32 r2169, [r2834+4608]; +ld.shared.u32 r2423, [r2834+5184]; +ld.shared.u32 r2257, [r2834+5760]; +ld.shared.u32 r2511, [r2834+6336]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2158, {low, high}; +} +{ +neg.f16x2 r2159, r2158; +} +{ +add.f16x2 r2161, r2162, r2163; +} +{ +add.f16x2 r2164, r2165, r2161; +} +{ +add.f16x2 r2167, r2168, r2169; +} +{ +add.f16x2 r2170, r2171, r2167; +} +{ +add.f16x2 r2173, r2162, r2163; +} +{ +mul.f16x2 r2176, r2173, r2157; +} +{ +add.f16x2 r2179, r2165, r2176; +} +{ +sub.f16x2 r2182, r2168, r2169; +} +{ +mul.f16x2 r2185, r2182, r2159; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r2162, r2163; +} +{ +mul.f16x2 r2194, r2191, r2157; +} +{ 
+add.f16x2 r2197, r2165, r2194; +} +{ +sub.f16x2 r2200, r2168, r2169; +} +{ +mul.f16x2 r2203, r2200, r2159; +} +{ +sub.f16x2 r2206, r2197, r2203; +} +{ +add.f16x2 r2209, r2168, r2169; +} +{ +mul.f16x2 r2212, r2209, r2157; +} +{ +add.f16x2 r2215, r2171, r2212; +} +{ +sub.f16x2 r2218, r2162, r2163; +} +{ +mul.f16x2 r2221, r2218, r2159; +} +{ +sub.f16x2 r2224, r2215, r2221; +} +{ +add.f16x2 r2227, r2168, r2169; +} +{ +mul.f16x2 r2230, r2227, r2157; +} +{ +add.f16x2 r2233, r2171, r2230; +} +{ +sub.f16x2 r2236, r2162, r2163; +} +{ +mul.f16x2 r2239, r2236, r2159; +} +{ +add.f16x2 r2242, r2233, r2239; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2246, {low, high}; +} +{ +neg.f16x2 r2247, r2246; +} +{ +add.f16x2 r2249, r2250, r2251; +} +{ +add.f16x2 r2252, r2253, r2249; +} +{ +add.f16x2 r2255, r2256, r2257; +} +{ +add.f16x2 r2258, r2259, r2255; +} +{ +add.f16x2 r2261, r2250, r2251; +} +{ +mul.f16x2 r2264, r2261, r2245; +} +{ +add.f16x2 r2267, r2253, r2264; +} +{ +sub.f16x2 r2270, r2256, r2257; +} +{ +mul.f16x2 r2273, r2270, r2247; +} +{ +add.f16x2 r2276, r2267, r2273; +} +{ +add.f16x2 r2279, r2250, r2251; +} +{ +mul.f16x2 r2282, r2279, r2245; +} +{ +add.f16x2 r2285, r2253, r2282; +} +{ +sub.f16x2 r2288, r2256, r2257; +} +{ +mul.f16x2 r2291, r2288, r2247; +} +{ +sub.f16x2 r2294, r2285, r2291; +} +{ +add.f16x2 r2297, r2256, r2257; +} +{ +mul.f16x2 r2300, r2297, r2245; +} +{ +add.f16x2 r2303, r2259, r2300; +} +{ +sub.f16x2 r2306, r2250, r2251; +} +{ +mul.f16x2 r2309, r2306, r2247; +} +{ +sub.f16x2 r2312, r2303, r2309; +} +{ +add.f16x2 r2315, r2256, r2257; +} +{ +mul.f16x2 r2318, r2315, r2245; +} +{ +add.f16x2 r2321, r2259, r2318; +} +{ +sub.f16x2 r2324, r2250, r2251; +} +{ +mul.f16x2 r2327, r2324, r2247; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, 
f314; +mov.b32 r2333, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2336, {low, high}; +} +{ +mul.f16x2 r2343, r2276, r2333; +} +{ +mul.f16x2 r2346, r2312, r2334; +} +{ +sub.f16x2 r2349, r2343, r2346; +} +{ +mul.f16x2 r2352, r2276, r2334; +} +{ +fma.rn.f16x2 r2355, r2312, r2333, r2352; +} +{ +mul.f16x2 r2359, r2294, r2335; +} +{ +mul.f16x2 r2362, r2330, r2336; +} +{ +sub.f16x2 r2365, r2359, r2362; +} +{ +mul.f16x2 r2368, r2294, r2336; +} +{ +fma.rn.f16x2 r2371, r2330, r2335, r2368; +} +{ +add.f16x2 r2375, r2164, r2252; +} +{ +add.f16x2 r2378, r2170, r2258; +} +{ +sub.f16x2 r2381, r2164, r2252; +} +{ +sub.f16x2 r2384, r2170, r2258; +} +{ +add.f16x2 r2387, r2188, r2349; +} +{ +add.f16x2 r2390, r2224, r2355; +} +{ +sub.f16x2 r2393, r2188, r2349; +} +{ +sub.f16x2 r2396, r2224, r2355; +} +{ +add.f16x2 r2399, r2206, r2365; +} +{ +add.f16x2 r2402, r2242, r2371; +} +{ +sub.f16x2 r2405, r2206, r2365; +} +{ +sub.f16x2 r2408, r2242, r2371; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2411, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2412, {low, high}; +} +{ +neg.f16x2 r2413, r2412; +} +{ +add.f16x2 r2415, r2416, r2417; +} +{ +add.f16x2 r2418, r2419, r2415; +} +{ +add.f16x2 r2421, r2422, r2423; +} +{ +add.f16x2 r2424, r2425, r2421; +} +{ +add.f16x2 r2427, r2416, r2417; +} +{ +mul.f16x2 r2430, r2427, r2411; +} +{ +add.f16x2 r2433, r2419, r2430; +} +{ +sub.f16x2 r2436, r2422, r2423; +} +{ +mul.f16x2 r2439, r2436, r2413; +} +{ +add.f16x2 r2442, r2433, r2439; +} +{ +add.f16x2 r2445, r2416, r2417; +} +{ +mul.f16x2 r2448, r2445, r2411; +} +{ +add.f16x2 r2451, r2419, r2448; +} +{ +sub.f16x2 r2454, 
r2422, r2423; +} +{ +mul.f16x2 r2457, r2454, r2413; +} +{ +sub.f16x2 r2460, r2451, r2457; +} +{ +add.f16x2 r2463, r2422, r2423; +} +{ +mul.f16x2 r2466, r2463, r2411; +} +{ +add.f16x2 r2469, r2425, r2466; +} +{ +sub.f16x2 r2472, r2416, r2417; +} +{ +mul.f16x2 r2475, r2472, r2413; +} +{ +sub.f16x2 r2478, r2469, r2475; +} +{ +add.f16x2 r2481, r2422, r2423; +} +{ +mul.f16x2 r2484, r2481, r2411; +} +{ +add.f16x2 r2487, r2425, r2484; +} +{ +sub.f16x2 r2490, r2416, r2417; +} +{ +mul.f16x2 r2493, r2490, r2413; +} +{ +add.f16x2 r2496, r2487, r2493; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2499, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2500, {low, high}; +} +{ +neg.f16x2 r2501, r2500; +} +{ +add.f16x2 r2503, r2504, r2505; +} +{ +add.f16x2 r2506, r2507, r2503; +} +{ +add.f16x2 r2509, r2510, r2511; +} +{ +add.f16x2 r2512, r2513, r2509; +} +{ +add.f16x2 r2515, r2504, r2505; +} +{ +mul.f16x2 r2518, r2515, r2499; +} +{ +add.f16x2 r2521, r2507, r2518; +} +{ +sub.f16x2 r2524, r2510, r2511; +} +{ +mul.f16x2 r2527, r2524, r2501; +} +{ +add.f16x2 r2530, r2521, r2527; +} +{ +add.f16x2 r2533, r2504, r2505; +} +{ +mul.f16x2 r2536, r2533, r2499; +} +{ +add.f16x2 r2539, r2507, r2536; +} +{ +sub.f16x2 r2542, r2510, r2511; +} +{ +mul.f16x2 r2545, r2542, r2501; +} +{ +sub.f16x2 r2548, r2539, r2545; +} +{ +add.f16x2 r2551, r2510, r2511; +} +{ +mul.f16x2 r2554, r2551, r2499; +} +{ +add.f16x2 r2557, r2513, r2554; +} +{ +sub.f16x2 r2560, r2504, r2505; +} +{ +mul.f16x2 r2563, r2560, r2501; +} +{ +sub.f16x2 r2566, r2557, r2563; +} +{ +add.f16x2 r2569, r2510, r2511; +} +{ +mul.f16x2 r2572, r2569, r2499; +} +{ +add.f16x2 r2575, r2513, r2572; +} +{ +sub.f16x2 r2578, r2504, r2505; +} +{ +mul.f16x2 r2581, r2578, r2501; +} +{ +add.f16x2 r2584, r2575, r2581; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r2587, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2588, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2589, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2590, {low, high}; +} +{ +mul.f16x2 r2597, r2530, r2587; +} +{ +mul.f16x2 r2600, r2566, r2588; +} +{ +sub.f16x2 r2603, r2597, r2600; +} +{ +mul.f16x2 r2606, r2530, r2588; +} +{ +fma.rn.f16x2 r2609, r2566, r2587, r2606; +} +{ +mul.f16x2 r2613, r2548, r2589; +} +{ +mul.f16x2 r2616, r2584, r2590; +} +{ +sub.f16x2 r2619, r2613, r2616; +} +{ +mul.f16x2 r2622, r2548, r2590; +} +{ +fma.rn.f16x2 r2625, r2584, r2589, r2622; +} +{ +add.f16x2 r2629, r2418, r2506; +} +{ +add.f16x2 r2632, r2424, r2512; +} +{ +sub.f16x2 r2635, r2418, r2506; +} +{ +sub.f16x2 r2638, r2424, r2512; +} +{ +add.f16x2 r2641, r2442, r2603; +} +{ +add.f16x2 r2644, r2478, r2609; +} +{ +sub.f16x2 r2647, r2442, r2603; +} +{ +sub.f16x2 r2650, r2478, r2609; +} +{ +add.f16x2 r2653, r2460, r2619; +} +{ +add.f16x2 r2656, r2496, r2625; +} +{ +sub.f16x2 r2659, r2460, r2619; +} +{ +sub.f16x2 r2662, r2496, r2625; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f310; +cvt.rn.f16.f32 high, f310; +mov.b32 r2665, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2666, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f314; +cvt.rn.f16.f32 high, f314; +mov.b32 r2667, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2668, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2672, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2673, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2674, {low, high}; +} +{ +mul.f16x2 r2687, r2641, r2665; +} +{ +mul.f16x2 r2690, r2644, r2666; +} +{ +sub.f16x2 r2693, r2687, r2690; +} +{ +mul.f16x2 r2696, r2641, r2666; +} +{ +fma.rn.f16x2 r2699, r2644, r2665, r2696; +} +{ +mul.f16x2 r2703, r2653, r2667; +} +{ +mul.f16x2 r2706, r2656, r2668; +} +{ +sub.f16x2 r2709, r2703, r2706; +} +{ +mul.f16x2 r2712, r2653, r2668; +} +{ +fma.rn.f16x2 r2715, r2656, r2667, r2712; +} +{ +neg.f16x2 r2719, r2635; +} +{ +mul.f16x2 r2721, r2647, r2671; +} +{ +mul.f16x2 r2724, r2650, r2672; +} +{ +sub.f16x2 r2727, r2721, r2724; +} +{ +mul.f16x2 r2730, r2647, r2672; +} +{ +fma.rn.f16x2 r2733, r2650, r2671, r2730; +} +{ +mul.f16x2 r2737, r2659, r2673; +} +{ +mul.f16x2 r2740, r2662, r2674; +} +{ +sub.f16x2 r2743, r2737, r2740; +} +{ +mul.f16x2 r2746, r2659, r2674; +} +{ +fma.rn.f16x2 r2749, r2662, r2673, r2746; +} +{ +add.f16x2 %0, r2375, r2629; +} +{ +add.f16x2 %1, r2378, r2632; +} +{ +sub.f16x2 %12, r2375, r2629; +} +{ +sub.f16x2 %13, r2378, r2632; +} +{ +add.f16x2 %2, r2387, r2693; +} +{ +add.f16x2 %3, r2390, r2699; +} +{ +sub.f16x2 %14, r2387, r2693; +} +{ +sub.f16x2 %15, r2390, r2699; +} +{ +add.f16x2 %4, r2399, r2709; +} +{ +add.f16x2 %5, r2402, r2715; +} +{ +sub.f16x2 %16, r2399, r2709; +} +{ +sub.f16x2 %17, r2402, r2715; +} +{ +add.f16x2 %6, r2381, r2638; +} +{ +add.f16x2 %7, r2384, r2719; +} +{ +sub.f16x2 %18, r2381, r2638; +} +{ +sub.f16x2 %19, r2384, r2719; +} +{ +add.f16x2 %8, r2393, r2727; +} +{ +add.f16x2 %9, r2396, r2733; +} +{ +sub.f16x2 %20, r2393, r2727; +} +{ +sub.f16x2 %21, r2396, r2733; +} +{ +add.f16x2 %10, r2405, r2743; +} +{ +add.f16x2 %11, r2408, r2749; +} +{ +sub.f16x2 %22, r2405, r2743; +} +{ +sub.f16x2 %23, r2408, r2749; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), 
"=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ab94c158efa90 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp16_inv.hpp.inc @@ -0,0 +1,7080 @@ +#ifndef CUFFTDX_FFT_1728_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_1728_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1148, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<359>; +.reg .b32 r<2820>; +.reg .b64 rd<7>; +mov.u32 r2801, %tid.y; +shl.b32 r2802, r2801, 1; +mov.u32 r2803, %24; +mad.lo.s32 r2804, r2802, 6912, r2803; +mov.u32 r2805, %tid.x; +mov.f32 f322, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1, {low, high}; +} +mov.f32 f326, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %33, %41; +} +{ +add.f16x2 r6, %25, r3; +} +{ +add.f16x2 r9, %34, %42; +} +{ +add.f16x2 r12, %26, r9; +} +{ +add.f16x2 r15, %33, %41; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %25, r18; +} +{ +sub.f16x2 r24, %34, %42; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %33, %41; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %25, r36; +} +{ +sub.f16x2 r42, %34, %42; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %34, %42; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %26, r54; +} +{ +sub.f16x2 r60, %33, %41; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %34, %42; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %26, r72; +} +{ +sub.f16x2 r78, %33, %41; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %37, %45; +} +{ +add.f16x2 r92, %29, r89; +} +{ +add.f16x2 r95, %38, %46; +} +{ +add.f16x2 r98, %30, r95; +} +{ +add.f16x2 r101, %37, %45; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %29, r104; +} +{ +sub.f16x2 r110, %38, %46; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, 
%37, %45; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %29, r122; +} +{ +sub.f16x2 r128, %38, %46; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %38, %46; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %30, r140; +} +{ +sub.f16x2 r146, %37, %45; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %38, %46; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %30, r158; +} +{ +sub.f16x2 r164, %37, %45; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f328, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r173, {low, high}; +} +mov.f32 f324, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r176, {low, high}; +} +mov.f32 f249, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r251, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r252, {low, high}; +} +{ +add.f16x2 r253, %35, %43; +} +{ +add.f16x2 r256, %27, r253; +} +{ +add.f16x2 r259, %36, %44; +} +{ +add.f16x2 r262, %28, r259; +} +{ +add.f16x2 r265, %35, %43; +} +{ +mul.f16x2 r268, r265, r251; +} +{ +add.f16x2 r271, %27, r268; +} +{ +sub.f16x2 r274, %36, %44; +} +{ +mul.f16x2 r277, r274, r252; +} +{ +add.f16x2 r280, r271, r277; +} +{ +add.f16x2 r283, %35, %43; +} +{ +mul.f16x2 r286, r283, r251; +} +{ +add.f16x2 r289, %27, r286; +} +{ +sub.f16x2 r292, %36, %44; +} +{ +mul.f16x2 r295, r292, r252; +} +{ +sub.f16x2 r298, r289, r295; +} +{ +add.f16x2 r301, %36, %44; +} +{ +mul.f16x2 r304, r301, r251; +} +{ +add.f16x2 r307, %28, r304; +} +{ +sub.f16x2 r310, %35, %43; +} +{ +mul.f16x2 r313, r310, r252; +} +{ +sub.f16x2 r316, r307, r313; +} +{ +add.f16x2 r319, %36, %44; +} +{ +mul.f16x2 r322, r319, r251; +} +{ +add.f16x2 r325, %28, r322; +} +{ +sub.f16x2 r328, %35, %43; +} +{ +mul.f16x2 r331, r328, r252; +} +{ +add.f16x2 r334, r325, r331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r337, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r338, {low, high}; +} +{ +add.f16x2 r339, %39, %47; +} +{ +add.f16x2 r342, %31, r339; +} +{ +add.f16x2 r345, %40, %48; +} +{ +add.f16x2 r348, %32, r345; +} +{ +add.f16x2 r351, %39, %47; +} +{ +mul.f16x2 r354, r351, r337; +} +{ +add.f16x2 r357, %31, r354; +} +{ +sub.f16x2 r360, %40, %48; +} +{ +mul.f16x2 r363, r360, r338; +} +{ +add.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r337; +} +{ +add.f16x2 r375, %31, r372; +} +{ +sub.f16x2 r378, %40, %48; +} +{ +mul.f16x2 r381, r378, r338; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %40, %48; +} +{ +mul.f16x2 r390, r387, r337; +} +{ +add.f16x2 r393, %32, r390; +} +{ +sub.f16x2 r396, %39, %47; +} +{ +mul.f16x2 r399, r396, r338; +} 
+{ +sub.f16x2 r402, r393, r399; +} +{ +add.f16x2 r405, %40, %48; +} +{ +mul.f16x2 r408, r405, r337; +} +{ +add.f16x2 r411, %32, r408; +} +{ +sub.f16x2 r414, %39, %47; +} +{ +mul.f16x2 r417, r414, r338; +} +{ +add.f16x2 r420, r411, r417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r426, {low, high}; +} +{ +mul.f16x2 r433, r366, r423; +} +{ +mul.f16x2 r436, r402, r424; +} +{ +sub.f16x2 r439, r433, r436; +} +{ +mul.f16x2 r442, r366, r424; +} +{ +fma.rn.f16x2 r445, r402, r423, r442; +} +{ +mul.f16x2 r449, r384, r425; +} +{ +mul.f16x2 r452, r420, r426; +} +{ +sub.f16x2 r455, r449, r452; +} +{ +mul.f16x2 r458, r384, r426; +} +{ +fma.rn.f16x2 r461, r420, r425, r458; +} +{ +add.f16x2 r465, r256, r342; +} +{ +add.f16x2 r468, r262, r348; +} +{ +sub.f16x2 r471, r256, r342; +} +{ +sub.f16x2 r474, r262, r348; +} +{ +add.f16x2 r477, r280, r439; +} +{ +add.f16x2 r480, r316, r445; +} +{ +sub.f16x2 r483, r280, r439; +} +{ +sub.f16x2 r486, r316, r445; +} +{ +add.f16x2 r489, r298, r455; +} +{ +add.f16x2 r492, r334, r461; +} +{ +sub.f16x2 r495, r298, r455; +} +{ +sub.f16x2 r498, r334, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r501, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r502, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r504, {low, high}; +} +mov.f32 f250, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; 
+cvt.rn.f16.f32 high, f322; +mov.b32 r507, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r510, {low, high}; +} +{ +mul.f16x2 r523, r477, r501; +} +{ +mul.f16x2 r526, r480, r502; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r477, r502; +} +{ +fma.rn.f16x2 r535, r480, r501, r532; +} +{ +mul.f16x2 r539, r489, r503; +} +{ +mul.f16x2 r542, r492, r504; +} +{ +sub.f16x2 r545, r539, r542; +} +{ +mul.f16x2 r548, r489, r504; +} +{ +fma.rn.f16x2 r551, r492, r503, r548; +} +{ +neg.f16x2 r555, r474; +} +{ +mul.f16x2 r557, r483, r507; +} +{ +mul.f16x2 r560, r486, r508; +} +{ +sub.f16x2 r563, r557, r560; +} +{ +mul.f16x2 r566, r483, r508; +} +{ +fma.rn.f16x2 r569, r486, r507, r566; +} +{ +mul.f16x2 r573, r495, r509; +} +{ +mul.f16x2 r576, r498, r510; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r495, r510; +} +{ +fma.rn.f16x2 r585, r498, r509, r582; +} +{ +add.f16x2 r589, r215, r465; +} +{ +add.f16x2 r592, r218, r468; +} +{ +sub.f16x2 r595, r215, r465; +} +{ +sub.f16x2 r598, r218, r468; +} +{ +add.f16x2 r601, r227, r529; +} +{ +add.f16x2 r604, r230, r535; +} +{ +sub.f16x2 r607, r227, r529; +} +{ +sub.f16x2 r610, r230, r535; +} +{ +add.f16x2 r613, r239, r545; +} +{ +add.f16x2 r616, r242, r551; +} +{ +sub.f16x2 r619, r239, r545; +} +{ +sub.f16x2 r622, r242, r551; +} +{ +add.f16x2 r625, r221, r555; +} +{ +add.f16x2 r628, r224, r471; +} +{ +sub.f16x2 r631, r221, r555; +} +{ +sub.f16x2 r634, r224, r471; +} +{ +add.f16x2 r637, r233, r563; +} +{ +add.f16x2 r640, r236, r569; +} +{ +sub.f16x2 r643, r233, r563; +} +{ +sub.f16x2 r646, r236, r569; +} +{ +add.f16x2 r649, r245, r579; +} +{ +add.f16x2 r652, r248, r585; +} +{ +sub.f16x2 r655, r245, r579; +} +{ +sub.f16x2 r658, r248, r585; +} 
+mul.wide.u32 rd2, r2805, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r2806, rd3; +mul.lo.s32 r2807, r2806, 144; +sub.s32 r2808, r2805, r2807; +shr.u64 rd4, rd2, 36; +cvt.u32.u64 r2809, rd4; +and.b32 r2810, r2809, 268435454; +mad.lo.s32 r2811, r2810, 6912, r2804; +cvt.rn.f32.u32 f353, r2808; +mul.f32 f354, f353, 0f3B6E4BAE; +cos.approx.f32 f101, f354; +sin.approx.f32 f355, f354; +neg.f32 f102, f355; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r664, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r666, {high, high}; +} +{ +mul.f16x2 r668, r604, r666; +} +{ +fma.rn.f16x2 r671, r601, r664, r668; +} +{ +mul.f16x2 r675, r601, r666; +} +{ +neg.f16x2 r678, r675; +} +{ +fma.rn.f16x2 r680, r604, r664, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r684, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r686, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r686, r688; +} +{ +mul.f16x2 r692, r661, r684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r695, {high, low}; +} +{ +fma.rn.f16x2 r697, r689, r695, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r701, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r703, {high, high}; +} +{ +mul.f16x2 r705, r616, r703; +} +{ +fma.rn.f16x2 r708, r613, r701, r705; +} +{ +mul.f16x2 r712, r613, r703; +} +{ +neg.f16x2 r715, r712; +} +{ +fma.rn.f16x2 r717, r616, r701, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r721, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r723, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r725, {low, high}; +} +{ +mul.f16x2 
r726, r723, r725; +} +{ +mul.f16x2 r729, r697, r721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r732, {high, low}; +} +{ +fma.rn.f16x2 r734, r726, r732, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r738, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r740, {high, high}; +} +{ +mul.f16x2 r742, r628, r740; +} +{ +fma.rn.f16x2 r745, r625, r738, r742; +} +{ +mul.f16x2 r749, r625, r740; +} +{ +neg.f16x2 r752, r749; +} +{ +fma.rn.f16x2 r754, r628, r738, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r758, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r760, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r762, {low, high}; +} +{ +mul.f16x2 r763, r760, r762; +} +{ +mul.f16x2 r766, r734, r758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r769, {high, low}; +} +{ +fma.rn.f16x2 r771, r763, r769, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r775, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r777, {high, high}; +} +{ +mul.f16x2 r779, r640, r777; +} +{ +fma.rn.f16x2 r782, r637, r775, r779; +} +{ +mul.f16x2 r786, r637, r777; +} +{ +neg.f16x2 r789, r786; +} +{ +fma.rn.f16x2 r791, r640, r775, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r797, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r799, {low, high}; +} +{ +mul.f16x2 r800, r797, r799; +} +{ +mul.f16x2 r803, r771, r795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r806, {high, low}; +} +{ +fma.rn.f16x2 r808, r800, r806, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r812, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r814, 
{high, high}; +} +{ +mul.f16x2 r816, r652, r814; +} +{ +fma.rn.f16x2 r819, r649, r812, r816; +} +{ +mul.f16x2 r823, r649, r814; +} +{ +neg.f16x2 r826, r823; +} +{ +fma.rn.f16x2 r828, r652, r812, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r834, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r836, {low, high}; +} +{ +mul.f16x2 r837, r834, r836; +} +{ +mul.f16x2 r840, r808, r832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r843, {high, low}; +} +{ +fma.rn.f16x2 r845, r837, r843, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r849, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r851, {high, high}; +} +{ +mul.f16x2 r853, r598, r851; +} +{ +fma.rn.f16x2 r856, r595, r849, r853; +} +{ +mul.f16x2 r860, r595, r851; +} +{ +neg.f16x2 r863, r860; +} +{ +fma.rn.f16x2 r865, r598, r849, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r871, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r873, {low, high}; +} +{ +mul.f16x2 r874, r871, r873; +} +{ +mul.f16x2 r877, r845, r869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r880, {high, low}; +} +{ +fma.rn.f16x2 r882, r874, r880, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r886, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r888, {high, high}; +} +{ +mul.f16x2 r890, r610, r888; +} +{ +fma.rn.f16x2 r893, r607, r886, r890; +} +{ +mul.f16x2 r897, r607, r888; +} +{ +neg.f16x2 r900, r897; +} +{ +fma.rn.f16x2 r902, r610, r886, r900; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r661; +mov.b32 r908, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r910, {low, high}; +} +{ +mul.f16x2 r911, r908, r910; +} +{ +mul.f16x2 r914, r882, r906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r917, {high, low}; +} +{ +fma.rn.f16x2 r919, r911, r917, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r925, {high, high}; +} +{ +mul.f16x2 r927, r622, r925; +} +{ +fma.rn.f16x2 r930, r619, r923, r927; +} +{ +mul.f16x2 r934, r619, r925; +} +{ +neg.f16x2 r937, r934; +} +{ +fma.rn.f16x2 r939, r622, r923, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r945, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r947, {low, high}; +} +{ +mul.f16x2 r948, r945, r947; +} +{ +mul.f16x2 r951, r919, r943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r954, {high, low}; +} +{ +fma.rn.f16x2 r956, r948, r954, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r962, {high, high}; +} +{ +mul.f16x2 r964, r634, r962; +} +{ +fma.rn.f16x2 r967, r631, r960, r964; +} +{ +mul.f16x2 r971, r631, r962; +} +{ +neg.f16x2 r974, r971; +} +{ +fma.rn.f16x2 r976, r634, r960, r974; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r980, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r982, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r984, {low, high}; +} +{ +mul.f16x2 r985, r982, r984; +} +{ +mul.f16x2 r988, r956, r980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r991, {high, low}; +} +{ +fma.rn.f16x2 r993, r985, 
r991, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r999, {high, high}; +} +{ +mul.f16x2 r1001, r646, r999; +} +{ +fma.rn.f16x2 r1004, r643, r997, r1001; +} +{ +mul.f16x2 r1008, r643, r999; +} +{ +neg.f16x2 r1011, r1008; +} +{ +fma.rn.f16x2 r1013, r646, r997, r1011; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1019, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1021, {low, high}; +} +{ +mul.f16x2 r1022, r1019, r1021; +} +{ +mul.f16x2 r1025, r993, r1017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r1028, {high, low}; +} +{ +fma.rn.f16x2 r1030, r1022, r1028, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1036, {high, high}; +} +{ +mul.f16x2 r1038, r658, r1036; +} +{ +fma.rn.f16x2 r1041, r655, r1034, r1038; +} +{ +mul.f16x2 r1045, r655, r1036; +} +{ +neg.f16x2 r1048, r1045; +} +{ +fma.rn.f16x2 r1050, r658, r1034, r1048; +} +barrier.sync 0; +mad.lo.s32 r2812, r2808, 96, r2811; +st.shared.v4.f32 [r2812], {r589, r592, r671, r680}; +st.shared.v4.f32 [r2812+16], {r708, r717, r745, r754}; +st.shared.v4.f32 [r2812+32], {r782, r791, r819, r828}; +st.shared.v4.f32 [r2812+48], {r856, r865, r893, r902}; +st.shared.v4.f32 [r2812+64], {r930, r939, r967, r976}; +st.shared.v4.f32 [r2812+80], {r1004, r1013, r1041, r1050}; +barrier.sync 0; +mad.lo.s32 r2813, r2808, -88, r2812; +ld.shared.u32 r1077, [r2813]; +ld.shared.u32 r1083, [r2813+4]; +ld.shared.u32 r1327, [r2813+1152]; +ld.shared.u32 r1333, [r2813+1156]; +ld.shared.u32 r1163, [r2813+2304]; +ld.shared.u32 r1169, [r2813+2308]; +ld.shared.u32 r1413, [r2813+3456]; +ld.shared.u32 r1419, [r2813+3460]; +ld.shared.u32 r1074, 
[r2813+4608]; +ld.shared.u32 r1080, [r2813+4612]; +ld.shared.u32 r1324, [r2813+5760]; +ld.shared.u32 r1330, [r2813+5764]; +ld.shared.u32 r1160, [r2813+6912]; +ld.shared.u32 r1166, [r2813+6916]; +ld.shared.u32 r1410, [r2813+8064]; +ld.shared.u32 r1416, [r2813+8068]; +ld.shared.u32 r1075, [r2813+9216]; +ld.shared.u32 r1081, [r2813+9220]; +ld.shared.u32 r1325, [r2813+10368]; +ld.shared.u32 r1331, [r2813+10372]; +ld.shared.u32 r1161, [r2813+11520]; +ld.shared.u32 r1167, [r2813+11524]; +ld.shared.u32 r1411, [r2813+12672]; +ld.shared.u32 r1417, [r2813+12676]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1072, {low, high}; +} +{ +add.f16x2 r1073, r1074, r1075; +} +{ +add.f16x2 r1076, r1077, r1073; +} +{ +add.f16x2 r1079, r1080, r1081; +} +{ +add.f16x2 r1082, r1083, r1079; +} +{ +add.f16x2 r1085, r1074, r1075; +} +{ +mul.f16x2 r1088, r1085, r1071; +} +{ +add.f16x2 r1091, r1077, r1088; +} +{ +sub.f16x2 r1094, r1080, r1081; +} +{ +mul.f16x2 r1097, r1094, r1072; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +add.f16x2 r1103, r1074, r1075; +} +{ +mul.f16x2 r1106, r1103, r1071; +} +{ +add.f16x2 r1109, r1077, r1106; +} +{ +sub.f16x2 r1112, r1080, r1081; +} +{ +mul.f16x2 r1115, r1112, r1072; +} +{ +sub.f16x2 r1118, r1109, r1115; +} +{ +add.f16x2 r1121, r1080, r1081; +} +{ +mul.f16x2 r1124, r1121, r1071; +} +{ +add.f16x2 r1127, r1083, r1124; +} +{ +sub.f16x2 r1130, r1074, r1075; +} +{ +mul.f16x2 r1133, r1130, r1072; +} +{ +sub.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r1080, r1081; +} +{ +mul.f16x2 r1142, r1139, r1071; +} +{ +add.f16x2 r1145, r1083, r1142; +} +{ +sub.f16x2 r1148, r1074, r1075; +} +{ +mul.f16x2 r1151, r1148, r1072; +} +{ +add.f16x2 r1154, r1145, r1151; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1158, {low, high}; +} +{ +add.f16x2 r1159, r1160, r1161; +} +{ +add.f16x2 r1162, r1163, r1159; +} +{ +add.f16x2 r1165, r1166, r1167; +} +{ +add.f16x2 r1168, r1169, r1165; +} +{ +add.f16x2 r1171, r1160, r1161; +} +{ +mul.f16x2 r1174, r1171, r1157; +} +{ +add.f16x2 r1177, r1163, r1174; +} +{ +sub.f16x2 r1180, r1166, r1167; +} +{ +mul.f16x2 r1183, r1180, r1158; +} +{ +add.f16x2 r1186, r1177, r1183; +} +{ +add.f16x2 r1189, r1160, r1161; +} +{ +mul.f16x2 r1192, r1189, r1157; +} +{ +add.f16x2 r1195, r1163, r1192; +} +{ +sub.f16x2 r1198, r1166, r1167; +} +{ +mul.f16x2 r1201, r1198, r1158; +} +{ +sub.f16x2 r1204, r1195, r1201; +} +{ +add.f16x2 r1207, r1166, r1167; +} +{ +mul.f16x2 r1210, r1207, r1157; +} +{ +add.f16x2 r1213, r1169, r1210; +} +{ +sub.f16x2 r1216, r1160, r1161; +} +{ +mul.f16x2 r1219, r1216, r1158; +} +{ +sub.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, r1166, r1167; +} +{ +mul.f16x2 r1228, r1225, r1157; +} +{ +add.f16x2 r1231, r1169, r1228; +} +{ +sub.f16x2 r1234, r1160, r1161; +} +{ +mul.f16x2 r1237, r1234, r1158; +} +{ +add.f16x2 r1240, r1231, r1237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1246, {low, high}; +} +{ +mul.f16x2 r1253, r1186, r1243; +} +{ +mul.f16x2 r1256, r1222, r1244; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ +mul.f16x2 r1262, r1186, r1244; +} +{ +fma.rn.f16x2 r1265, r1222, r1243, r1262; +} +{ +mul.f16x2 r1269, r1204, r1245; +} +{ +mul.f16x2 r1272, r1240, r1246; +} +{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1204, r1246; +} +{ +fma.rn.f16x2 r1281, r1240, r1245, r1278; +} 
+{ +add.f16x2 r1285, r1076, r1162; +} +{ +add.f16x2 r1288, r1082, r1168; +} +{ +sub.f16x2 r1291, r1076, r1162; +} +{ +sub.f16x2 r1294, r1082, r1168; +} +{ +add.f16x2 r1297, r1100, r1259; +} +{ +add.f16x2 r1300, r1136, r1265; +} +{ +sub.f16x2 r1303, r1100, r1259; +} +{ +sub.f16x2 r1306, r1136, r1265; +} +{ +add.f16x2 r1309, r1118, r1275; +} +{ +add.f16x2 r1312, r1154, r1281; +} +{ +sub.f16x2 r1315, r1118, r1275; +} +{ +sub.f16x2 r1318, r1154, r1281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1322, {low, high}; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1330, r1331; +} +{ +add.f16x2 r1332, r1333, r1329; +} +{ +add.f16x2 r1335, r1324, r1325; +} +{ +mul.f16x2 r1338, r1335, r1321; +} +{ +add.f16x2 r1341, r1327, r1338; +} +{ +sub.f16x2 r1344, r1330, r1331; +} +{ +mul.f16x2 r1347, r1344, r1322; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1324, r1325; +} +{ +mul.f16x2 r1356, r1353, r1321; +} +{ +add.f16x2 r1359, r1327, r1356; +} +{ +sub.f16x2 r1362, r1330, r1331; +} +{ +mul.f16x2 r1365, r1362, r1322; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, r1330, r1331; +} +{ +mul.f16x2 r1374, r1371, r1321; +} +{ +add.f16x2 r1377, r1333, r1374; +} +{ +sub.f16x2 r1380, r1324, r1325; +} +{ +mul.f16x2 r1383, r1380, r1322; +} +{ +sub.f16x2 r1386, r1377, r1383; +} +{ +add.f16x2 r1389, r1330, r1331; +} +{ +mul.f16x2 r1392, r1389, r1321; +} +{ +add.f16x2 r1395, r1333, r1392; +} +{ +sub.f16x2 r1398, r1324, r1325; +} +{ +mul.f16x2 r1401, r1398, r1322; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1407, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1408, {low, high}; +} +{ +add.f16x2 r1409, r1410, 
r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1416, r1417; +} +{ +add.f16x2 r1418, r1419, r1415; +} +{ +add.f16x2 r1421, r1410, r1411; +} +{ +mul.f16x2 r1424, r1421, r1407; +} +{ +add.f16x2 r1427, r1413, r1424; +} +{ +sub.f16x2 r1430, r1416, r1417; +} +{ +mul.f16x2 r1433, r1430, r1408; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +add.f16x2 r1439, r1410, r1411; +} +{ +mul.f16x2 r1442, r1439, r1407; +} +{ +add.f16x2 r1445, r1413, r1442; +} +{ +sub.f16x2 r1448, r1416, r1417; +} +{ +mul.f16x2 r1451, r1448, r1408; +} +{ +sub.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1416, r1417; +} +{ +mul.f16x2 r1460, r1457, r1407; +} +{ +add.f16x2 r1463, r1419, r1460; +} +{ +sub.f16x2 r1466, r1410, r1411; +} +{ +mul.f16x2 r1469, r1466, r1408; +} +{ +sub.f16x2 r1472, r1463, r1469; +} +{ +add.f16x2 r1475, r1416, r1417; +} +{ +mul.f16x2 r1478, r1475, r1407; +} +{ +add.f16x2 r1481, r1419, r1478; +} +{ +sub.f16x2 r1484, r1410, r1411; +} +{ +mul.f16x2 r1487, r1484, r1408; +} +{ +add.f16x2 r1490, r1481, r1487; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1493, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1495, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1496, {low, high}; +} +{ +mul.f16x2 r1503, r1436, r1493; +} +{ +mul.f16x2 r1506, r1472, r1494; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1436, r1494; +} +{ +fma.rn.f16x2 r1515, r1472, r1493, r1512; +} +{ +mul.f16x2 r1519, r1454, r1495; +} +{ +mul.f16x2 r1522, r1490, r1496; +} +{ +sub.f16x2 r1525, r1519, r1522; +} +{ +mul.f16x2 r1528, r1454, r1496; +} +{ +fma.rn.f16x2 r1531, r1490, r1495, r1528; +} +{ +add.f16x2 r1535, r1326, r1412; +} +{ +add.f16x2 r1538, r1332, r1418; +} +{ +sub.f16x2 r1541, r1326, r1412; +} 
+{ +sub.f16x2 r1544, r1332, r1418; +} +{ +add.f16x2 r1547, r1350, r1509; +} +{ +add.f16x2 r1550, r1386, r1515; +} +{ +sub.f16x2 r1553, r1350, r1509; +} +{ +sub.f16x2 r1556, r1386, r1515; +} +{ +add.f16x2 r1559, r1368, r1525; +} +{ +add.f16x2 r1562, r1404, r1531; +} +{ +sub.f16x2 r1565, r1368, r1525; +} +{ +sub.f16x2 r1568, r1404, r1531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1574, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1593, r1547, r1571; +} +{ +mul.f16x2 r1596, r1550, r1572; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r1547, r1572; +} +{ +fma.rn.f16x2 r1605, r1550, r1571, r1602; +} +{ +mul.f16x2 r1609, r1559, r1573; +} +{ +mul.f16x2 r1612, r1562, r1574; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1559, r1574; +} +{ +fma.rn.f16x2 r1621, r1562, r1573, r1618; +} +{ +neg.f16x2 r1625, r1544; +} +{ +mul.f16x2 r1627, r1553, r1577; +} +{ +mul.f16x2 r1630, r1556, r1578; +} +{ +sub.f16x2 r1633, r1627, r1630; +} +{ +mul.f16x2 r1636, r1553, r1578; +} +{ +fma.rn.f16x2 r1639, r1556, r1577, r1636; +} +{ +mul.f16x2 r1643, r1565, r1579; +} +{ +mul.f16x2 r1646, r1568, r1580; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 
r1652, r1565, r1580; +} +{ +fma.rn.f16x2 r1655, r1568, r1579, r1652; +} +{ +add.f16x2 r1659, r1285, r1535; +} +{ +add.f16x2 r1662, r1288, r1538; +} +{ +sub.f16x2 r1665, r1285, r1535; +} +{ +sub.f16x2 r1668, r1288, r1538; +} +{ +add.f16x2 r1671, r1297, r1599; +} +{ +add.f16x2 r1674, r1300, r1605; +} +{ +sub.f16x2 r1677, r1297, r1599; +} +{ +sub.f16x2 r1680, r1300, r1605; +} +{ +add.f16x2 r1683, r1309, r1615; +} +{ +add.f16x2 r1686, r1312, r1621; +} +{ +sub.f16x2 r1689, r1309, r1615; +} +{ +sub.f16x2 r1692, r1312, r1621; +} +{ +add.f16x2 r1695, r1291, r1625; +} +{ +add.f16x2 r1698, r1294, r1541; +} +{ +sub.f16x2 r1701, r1291, r1625; +} +{ +sub.f16x2 r1704, r1294, r1541; +} +{ +add.f16x2 r1707, r1303, r1633; +} +{ +add.f16x2 r1710, r1306, r1639; +} +{ +sub.f16x2 r1713, r1303, r1633; +} +{ +sub.f16x2 r1716, r1306, r1639; +} +{ +add.f16x2 r1719, r1315, r1649; +} +{ +add.f16x2 r1722, r1318, r1655; +} +{ +sub.f16x2 r1725, r1315, r1649; +} +{ +sub.f16x2 r1728, r1318, r1655; +} +mul.wide.u32 rd5, r2808, -1431655765; +shr.u64 rd6, rd5, 35; +cvt.u32.u64 r2814, rd6; +cvt.rn.f32.u32 f356, r2814; +mul.f32 f357, f356, 0f3D32B8C2; +cos.approx.f32 f227, f357; +sin.approx.f32 f358, f357; +neg.f32 f228, f358; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f227; +cvt.rn.f16.f32 high, f228; +mov.b32 r1731, {low, high}; +} +mul.lo.s32 r2815, r2814, 12; +sub.s32 r2816, r2808, r2815; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1734, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1736, {high, high}; +} +{ +mul.f16x2 r1738, r1674, r1736; +} +{ +fma.rn.f16x2 r1741, r1671, r1734, r1738; +} +{ +mul.f16x2 r1745, r1671, r1736; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1674, r1734, r1748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1756, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 
high, f250; +mov.b32 r1758, {low, high}; +} +{ +mul.f16x2 r1759, r1756, r1758; +} +{ +mul.f16x2 r1762, r1731, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1765, {high, low}; +} +{ +fma.rn.f16x2 r1767, r1759, r1765, r1762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1771, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1773, {high, high}; +} +{ +mul.f16x2 r1775, r1686, r1773; +} +{ +fma.rn.f16x2 r1778, r1683, r1771, r1775; +} +{ +mul.f16x2 r1782, r1683, r1773; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1686, r1771, r1785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1791, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1793, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1793, r1795; +} +{ +mul.f16x2 r1799, r1767, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1802, {high, low}; +} +{ +fma.rn.f16x2 r1804, r1796, r1802, r1799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1808, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1810, {high, high}; +} +{ +mul.f16x2 r1812, r1698, r1810; +} +{ +fma.rn.f16x2 r1815, r1695, r1808, r1812; +} +{ +mul.f16x2 r1819, r1695, r1810; +} +{ +neg.f16x2 r1822, r1819; +} +{ +fma.rn.f16x2 r1824, r1698, r1808, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1828, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1830, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1830, r1832; +} +{ +mul.f16x2 r1836, r1804, r1828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1839, {high, low}; +} +{ +fma.rn.f16x2 r1841, r1833, r1839, r1836; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1845, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1847, {high, high}; +} +{ +mul.f16x2 r1849, r1710, r1847; +} +{ +fma.rn.f16x2 r1852, r1707, r1845, r1849; +} +{ +mul.f16x2 r1856, r1707, r1847; +} +{ +neg.f16x2 r1859, r1856; +} +{ +fma.rn.f16x2 r1861, r1710, r1845, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1865, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1867, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1869, {low, high}; +} +{ +mul.f16x2 r1870, r1867, r1869; +} +{ +mul.f16x2 r1873, r1841, r1865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1876, {high, low}; +} +{ +fma.rn.f16x2 r1878, r1870, r1876, r1873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1882, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1884, {high, high}; +} +{ +mul.f16x2 r1886, r1722, r1884; +} +{ +fma.rn.f16x2 r1889, r1719, r1882, r1886; +} +{ +mul.f16x2 r1893, r1719, r1884; +} +{ +neg.f16x2 r1896, r1893; +} +{ +fma.rn.f16x2 r1898, r1722, r1882, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1902, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1904, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1906, {low, high}; +} +{ +mul.f16x2 r1907, r1904, r1906; +} +{ +mul.f16x2 r1910, r1878, r1902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1913, {high, low}; +} +{ +fma.rn.f16x2 r1915, r1907, r1913, r1910; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1919, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1921, {high, high}; +} +{ +mul.f16x2 r1923, r1668, r1921; +} +{ +fma.rn.f16x2 r1926, r1665, r1919, r1923; +} +{ 
+mul.f16x2 r1930, r1665, r1921; +} +{ +neg.f16x2 r1933, r1930; +} +{ +fma.rn.f16x2 r1935, r1668, r1919, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1939, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1941, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1943, {low, high}; +} +{ +mul.f16x2 r1944, r1941, r1943; +} +{ +mul.f16x2 r1947, r1915, r1939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1950, {high, low}; +} +{ +fma.rn.f16x2 r1952, r1944, r1950, r1947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1956, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1958, {high, high}; +} +{ +mul.f16x2 r1960, r1680, r1958; +} +{ +fma.rn.f16x2 r1963, r1677, r1956, r1960; +} +{ +mul.f16x2 r1967, r1677, r1958; +} +{ +neg.f16x2 r1970, r1967; +} +{ +fma.rn.f16x2 r1972, r1680, r1956, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1976, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1978, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1980, {low, high}; +} +{ +mul.f16x2 r1981, r1978, r1980; +} +{ +mul.f16x2 r1984, r1952, r1976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1987, {high, low}; +} +{ +fma.rn.f16x2 r1989, r1981, r1987, r1984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r1993, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r1995, {high, high}; +} +{ +mul.f16x2 r1997, r1692, r1995; +} +{ +fma.rn.f16x2 r2000, r1689, r1993, r1997; +} +{ +mul.f16x2 r2004, r1689, r1995; +} +{ +neg.f16x2 r2007, r2004; +} +{ +fma.rn.f16x2 r2009, r1692, r1993, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2013, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 
r2015, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2017, {low, high}; +} +{ +mul.f16x2 r2018, r2015, r2017; +} +{ +mul.f16x2 r2021, r1989, r2013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r2024, {high, low}; +} +{ +fma.rn.f16x2 r2026, r2018, r2024, r2021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2030, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2032, {high, high}; +} +{ +mul.f16x2 r2034, r1704, r2032; +} +{ +fma.rn.f16x2 r2037, r1701, r2030, r2034; +} +{ +mul.f16x2 r2041, r1701, r2032; +} +{ +neg.f16x2 r2044, r2041; +} +{ +fma.rn.f16x2 r2046, r1704, r2030, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2050, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2052, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r2052, r2054; +} +{ +mul.f16x2 r2058, r2026, r2050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2061, {high, low}; +} +{ +fma.rn.f16x2 r2063, r2055, r2061, r2058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2063; +mov.b32 r2067, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2063; +mov.b32 r2069, {high, high}; +} +{ +mul.f16x2 r2071, r1716, r2069; +} +{ +fma.rn.f16x2 r2074, r1713, r2067, r2071; +} +{ +mul.f16x2 r2078, r1713, r2069; +} +{ +neg.f16x2 r2081, r2078; +} +{ +fma.rn.f16x2 r2083, r1716, r2067, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2087, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2089, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r2089, r2091; +} +{ +mul.f16x2 r2095, r2063, r2087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2063; +mov.b32 r2098, {high, low}; +} +{ +fma.rn.f16x2 r2100, r2092, r2098, r2095; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2100; +mov.b32 r2104, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2100; +mov.b32 r2106, {high, high}; +} +{ +mul.f16x2 r2108, r1728, r2106; +} +{ +fma.rn.f16x2 r2111, r1725, r2104, r2108; +} +{ +mul.f16x2 r2115, r1725, r2106; +} +{ +neg.f16x2 r2118, r2115; +} +{ +fma.rn.f16x2 r2120, r1728, r2104, r2118; +} +shl.b32 r2817, r2816, 3; +add.s32 r2818, r2811, r2817; +barrier.sync 0; +mad.lo.s32 r2819, r2814, 1152, r2818; +st.shared.u32 [r2819], r1659; +st.shared.u32 [r2819+4], r1662; +st.shared.u32 [r2819+96], r1741; +st.shared.u32 [r2819+100], r1750; +st.shared.u32 [r2819+192], r1778; +st.shared.u32 [r2819+196], r1787; +st.shared.u32 [r2819+288], r1815; +st.shared.u32 [r2819+292], r1824; +st.shared.u32 [r2819+384], r1852; +st.shared.u32 [r2819+388], r1861; +st.shared.u32 [r2819+480], r1889; +st.shared.u32 [r2819+484], r1898; +st.shared.u32 [r2819+576], r1926; +st.shared.u32 [r2819+580], r1935; +st.shared.u32 [r2819+672], r1963; +st.shared.u32 [r2819+676], r1972; +st.shared.u32 [r2819+768], r2000; +st.shared.u32 [r2819+772], r2009; +st.shared.u32 [r2819+864], r2037; +st.shared.u32 [r2819+868], r2046; +st.shared.u32 [r2819+960], r2074; +st.shared.u32 [r2819+964], r2083; +st.shared.u32 [r2819+1056], r2111; +st.shared.u32 [r2819+1060], r2120; +barrier.sync 0; +ld.shared.u32 r2147, [r2813]; +ld.shared.u32 r2153, [r2813+4]; +ld.shared.u32 r2397, [r2813+1152]; +ld.shared.u32 r2403, [r2813+1156]; +ld.shared.u32 r2233, [r2813+2304]; +ld.shared.u32 r2239, [r2813+2308]; +ld.shared.u32 r2483, [r2813+3456]; +ld.shared.u32 r2489, [r2813+3460]; +ld.shared.u32 r2144, [r2813+4608]; +ld.shared.u32 r2150, [r2813+4612]; +ld.shared.u32 r2394, [r2813+5760]; +ld.shared.u32 r2400, [r2813+5764]; +ld.shared.u32 r2230, [r2813+6912]; +ld.shared.u32 r2236, [r2813+6916]; +ld.shared.u32 r2480, [r2813+8064]; +ld.shared.u32 r2486, [r2813+8068]; 
+ld.shared.u32 r2145, [r2813+9216]; +ld.shared.u32 r2151, [r2813+9220]; +ld.shared.u32 r2395, [r2813+10368]; +ld.shared.u32 r2401, [r2813+10372]; +ld.shared.u32 r2231, [r2813+11520]; +ld.shared.u32 r2237, [r2813+11524]; +ld.shared.u32 r2481, [r2813+12672]; +ld.shared.u32 r2487, [r2813+12676]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2141, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2142, {low, high}; +} +{ +add.f16x2 r2143, r2144, r2145; +} +{ +add.f16x2 r2146, r2147, r2143; +} +{ +add.f16x2 r2149, r2150, r2151; +} +{ +add.f16x2 r2152, r2153, r2149; +} +{ +add.f16x2 r2155, r2144, r2145; +} +{ +mul.f16x2 r2158, r2155, r2141; +} +{ +add.f16x2 r2161, r2147, r2158; +} +{ +sub.f16x2 r2164, r2150, r2151; +} +{ +mul.f16x2 r2167, r2164, r2142; +} +{ +add.f16x2 r2170, r2161, r2167; +} +{ +add.f16x2 r2173, r2144, r2145; +} +{ +mul.f16x2 r2176, r2173, r2141; +} +{ +add.f16x2 r2179, r2147, r2176; +} +{ +sub.f16x2 r2182, r2150, r2151; +} +{ +mul.f16x2 r2185, r2182, r2142; +} +{ +sub.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r2150, r2151; +} +{ +mul.f16x2 r2194, r2191, r2141; +} +{ +add.f16x2 r2197, r2153, r2194; +} +{ +sub.f16x2 r2200, r2144, r2145; +} +{ +mul.f16x2 r2203, r2200, r2142; +} +{ +sub.f16x2 r2206, r2197, r2203; +} +{ +add.f16x2 r2209, r2150, r2151; +} +{ +mul.f16x2 r2212, r2209, r2141; +} +{ +add.f16x2 r2215, r2153, r2212; +} +{ +sub.f16x2 r2218, r2144, r2145; +} +{ +mul.f16x2 r2221, r2218, r2142; +} +{ +add.f16x2 r2224, r2215, r2221; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2228, {low, high}; +} +{ +add.f16x2 r2229, r2230, r2231; +} +{ +add.f16x2 r2232, r2233, r2229; +} +{ +add.f16x2 r2235, r2236, r2237; +} +{ +add.f16x2 r2238, r2239, r2235; +} +{ +add.f16x2 r2241, r2230, r2231; +} +{ 
+mul.f16x2 r2244, r2241, r2227; +} +{ +add.f16x2 r2247, r2233, r2244; +} +{ +sub.f16x2 r2250, r2236, r2237; +} +{ +mul.f16x2 r2253, r2250, r2228; +} +{ +add.f16x2 r2256, r2247, r2253; +} +{ +add.f16x2 r2259, r2230, r2231; +} +{ +mul.f16x2 r2262, r2259, r2227; +} +{ +add.f16x2 r2265, r2233, r2262; +} +{ +sub.f16x2 r2268, r2236, r2237; +} +{ +mul.f16x2 r2271, r2268, r2228; +} +{ +sub.f16x2 r2274, r2265, r2271; +} +{ +add.f16x2 r2277, r2236, r2237; +} +{ +mul.f16x2 r2280, r2277, r2227; +} +{ +add.f16x2 r2283, r2239, r2280; +} +{ +sub.f16x2 r2286, r2230, r2231; +} +{ +mul.f16x2 r2289, r2286, r2228; +} +{ +sub.f16x2 r2292, r2283, r2289; +} +{ +add.f16x2 r2295, r2236, r2237; +} +{ +mul.f16x2 r2298, r2295, r2227; +} +{ +add.f16x2 r2301, r2239, r2298; +} +{ +sub.f16x2 r2304, r2230, r2231; +} +{ +mul.f16x2 r2307, r2304, r2228; +} +{ +add.f16x2 r2310, r2301, r2307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2313, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2314, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2316, {low, high}; +} +{ +mul.f16x2 r2323, r2256, r2313; +} +{ +mul.f16x2 r2326, r2292, r2314; +} +{ +sub.f16x2 r2329, r2323, r2326; +} +{ +mul.f16x2 r2332, r2256, r2314; +} +{ +fma.rn.f16x2 r2335, r2292, r2313, r2332; +} +{ +mul.f16x2 r2339, r2274, r2315; +} +{ +mul.f16x2 r2342, r2310, r2316; +} +{ +sub.f16x2 r2345, r2339, r2342; +} +{ +mul.f16x2 r2348, r2274, r2316; +} +{ +fma.rn.f16x2 r2351, r2310, r2315, r2348; +} +{ +add.f16x2 r2355, r2146, r2232; +} +{ +add.f16x2 r2358, r2152, r2238; +} +{ +sub.f16x2 r2361, r2146, r2232; +} +{ +sub.f16x2 r2364, r2152, r2238; +} +{ +add.f16x2 r2367, r2170, r2329; +} +{ +add.f16x2 r2370, r2206, r2335; +} +{ +sub.f16x2 r2373, r2170, r2329; +} +{ +sub.f16x2 
r2376, r2206, r2335; +} +{ +add.f16x2 r2379, r2188, r2345; +} +{ +add.f16x2 r2382, r2224, r2351; +} +{ +sub.f16x2 r2385, r2188, r2345; +} +{ +sub.f16x2 r2388, r2224, r2351; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2391, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2392, {low, high}; +} +{ +add.f16x2 r2393, r2394, r2395; +} +{ +add.f16x2 r2396, r2397, r2393; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2399; +} +{ +add.f16x2 r2405, r2394, r2395; +} +{ +mul.f16x2 r2408, r2405, r2391; +} +{ +add.f16x2 r2411, r2397, r2408; +} +{ +sub.f16x2 r2414, r2400, r2401; +} +{ +mul.f16x2 r2417, r2414, r2392; +} +{ +add.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r2394, r2395; +} +{ +mul.f16x2 r2426, r2423, r2391; +} +{ +add.f16x2 r2429, r2397, r2426; +} +{ +sub.f16x2 r2432, r2400, r2401; +} +{ +mul.f16x2 r2435, r2432, r2392; +} +{ +sub.f16x2 r2438, r2429, r2435; +} +{ +add.f16x2 r2441, r2400, r2401; +} +{ +mul.f16x2 r2444, r2441, r2391; +} +{ +add.f16x2 r2447, r2403, r2444; +} +{ +sub.f16x2 r2450, r2394, r2395; +} +{ +mul.f16x2 r2453, r2450, r2392; +} +{ +sub.f16x2 r2456, r2447, r2453; +} +{ +add.f16x2 r2459, r2400, r2401; +} +{ +mul.f16x2 r2462, r2459, r2391; +} +{ +add.f16x2 r2465, r2403, r2462; +} +{ +sub.f16x2 r2468, r2394, r2395; +} +{ +mul.f16x2 r2471, r2468, r2392; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2478, {low, high}; +} +{ +add.f16x2 r2479, r2480, r2481; +} +{ +add.f16x2 r2482, r2483, r2479; +} +{ +add.f16x2 r2485, r2486, r2487; +} +{ +add.f16x2 r2488, r2489, r2485; +} +{ +add.f16x2 r2491, r2480, r2481; +} +{ +mul.f16x2 r2494, r2491, r2477; +} +{ +add.f16x2 r2497, r2483, r2494; +} +{ +sub.f16x2 r2500, r2486, r2487; +} +{ 
+mul.f16x2 r2503, r2500, r2478; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r2480, r2481; +} +{ +mul.f16x2 r2512, r2509, r2477; +} +{ +add.f16x2 r2515, r2483, r2512; +} +{ +sub.f16x2 r2518, r2486, r2487; +} +{ +mul.f16x2 r2521, r2518, r2478; +} +{ +sub.f16x2 r2524, r2515, r2521; +} +{ +add.f16x2 r2527, r2486, r2487; +} +{ +mul.f16x2 r2530, r2527, r2477; +} +{ +add.f16x2 r2533, r2489, r2530; +} +{ +sub.f16x2 r2536, r2480, r2481; +} +{ +mul.f16x2 r2539, r2536, r2478; +} +{ +sub.f16x2 r2542, r2533, r2539; +} +{ +add.f16x2 r2545, r2486, r2487; +} +{ +mul.f16x2 r2548, r2545, r2477; +} +{ +add.f16x2 r2551, r2489, r2548; +} +{ +sub.f16x2 r2554, r2480, r2481; +} +{ +mul.f16x2 r2557, r2554, r2478; +} +{ +add.f16x2 r2560, r2551, r2557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2564, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2565, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2566, {low, high}; +} +{ +mul.f16x2 r2573, r2506, r2563; +} +{ +mul.f16x2 r2576, r2542, r2564; +} +{ +sub.f16x2 r2579, r2573, r2576; +} +{ +mul.f16x2 r2582, r2506, r2564; +} +{ +fma.rn.f16x2 r2585, r2542, r2563, r2582; +} +{ +mul.f16x2 r2589, r2524, r2565; +} +{ +mul.f16x2 r2592, r2560, r2566; +} +{ +sub.f16x2 r2595, r2589, r2592; +} +{ +mul.f16x2 r2598, r2524, r2566; +} +{ +fma.rn.f16x2 r2601, r2560, r2565, r2598; +} +{ +add.f16x2 r2605, r2396, r2482; +} +{ +add.f16x2 r2608, r2402, r2488; +} +{ +sub.f16x2 r2611, r2396, r2482; +} +{ +sub.f16x2 r2614, r2402, r2488; +} +{ +add.f16x2 r2617, r2420, r2579; +} +{ +add.f16x2 r2620, r2456, r2585; +} +{ +sub.f16x2 r2623, r2420, r2579; +} +{ +sub.f16x2 r2626, r2456, r2585; +} +{ +add.f16x2 r2629, r2438, r2595; +} +{ +add.f16x2 r2632, r2474, r2601; +} +{ +sub.f16x2 
r2635, r2438, r2595; +} +{ +sub.f16x2 r2638, r2474, r2601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2648, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2663, r2617, r2641; +} +{ +mul.f16x2 r2666, r2620, r2642; +} +{ +sub.f16x2 r2669, r2663, r2666; +} +{ +mul.f16x2 r2672, r2617, r2642; +} +{ +fma.rn.f16x2 r2675, r2620, r2641, r2672; +} +{ +mul.f16x2 r2679, r2629, r2643; +} +{ +mul.f16x2 r2682, r2632, r2644; +} +{ +sub.f16x2 r2685, r2679, r2682; +} +{ +mul.f16x2 r2688, r2629, r2644; +} +{ +fma.rn.f16x2 r2691, r2632, r2643, r2688; +} +{ +neg.f16x2 r2695, r2614; +} +{ +mul.f16x2 r2697, r2623, r2647; +} +{ +mul.f16x2 r2700, r2626, r2648; +} +{ +sub.f16x2 r2703, r2697, r2700; +} +{ +mul.f16x2 r2706, r2623, r2648; +} +{ +fma.rn.f16x2 r2709, r2626, r2647, r2706; +} +{ +mul.f16x2 r2713, r2635, r2649; +} +{ +mul.f16x2 r2716, r2638, r2650; +} +{ +sub.f16x2 r2719, r2713, r2716; +} +{ +mul.f16x2 r2722, r2635, r2650; +} +{ +fma.rn.f16x2 r2725, r2638, r2649, r2722; +} +{ +add.f16x2 %0, r2355, r2605; +} +{ +add.f16x2 %1, r2358, r2608; +} +{ +sub.f16x2 %12, r2355, r2605; +} +{ +sub.f16x2 %13, r2358, r2608; +} +{ +add.f16x2 %2, r2367, r2669; +} +{ +add.f16x2 %3, r2370, r2675; 
+} +{ +sub.f16x2 %14, r2367, r2669; +} +{ +sub.f16x2 %15, r2370, r2675; +} +{ +add.f16x2 %4, r2379, r2685; +} +{ +add.f16x2 %5, r2382, r2691; +} +{ +sub.f16x2 %16, r2379, r2685; +} +{ +sub.f16x2 %17, r2382, r2691; +} +{ +add.f16x2 %6, r2361, r2695; +} +{ +add.f16x2 %7, r2364, r2611; +} +{ +sub.f16x2 %18, r2361, r2695; +} +{ +sub.f16x2 %19, r2364, r2611; +} +{ +add.f16x2 %8, r2373, r2703; +} +{ +add.f16x2 %9, r2376, r2709; +} +{ +sub.f16x2 %20, r2373, r2703; +} +{ +sub.f16x2 %21, r2376, r2709; +} +{ +add.f16x2 %10, r2385, r2719; +} +{ +add.f16x2 %11, r2388, r2725; +} +{ +sub.f16x2 %22, r2385, r2719; +} +{ +sub.f16x2 %23, r2388, r2725; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), 
"r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1149, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<359>; +.reg .b32 r<2817>; +.reg .b64 rd<6>; +mov.u32 r2801, %tid.y; +mov.u32 r2802, %24; +mad.lo.s32 r2803, r2801, 6912, r2802; +mov.u32 r2804, %tid.x; +mov.f32 f322, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1, {low, high}; +} +mov.f32 f326, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %33, %41; +} +{ +add.f16x2 r6, %25, r3; +} +{ +add.f16x2 r9, %34, %42; +} +{ +add.f16x2 r12, %26, r9; +} +{ +add.f16x2 r15, %33, %41; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %25, r18; +} +{ +sub.f16x2 r24, %34, %42; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %33, %41; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %25, r36; +} +{ +sub.f16x2 r42, %34, %42; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %34, %42; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %26, r54; +} +{ +sub.f16x2 r60, %33, %41; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %34, %42; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %26, r72; +} +{ +sub.f16x2 r78, %33, %41; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %37, %45; +} +{ +add.f16x2 r92, %29, r89; +} +{ +add.f16x2 r95, %38, 
%46; +} +{ +add.f16x2 r98, %30, r95; +} +{ +add.f16x2 r101, %37, %45; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %29, r104; +} +{ +sub.f16x2 r110, %38, %46; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %37, %45; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %29, r122; +} +{ +sub.f16x2 r128, %38, %46; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %38, %46; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %30, r140; +} +{ +sub.f16x2 r146, %37, %45; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %38, %46; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %30, r158; +} +{ +sub.f16x2 r164, %37, %45; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f328, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r173, {low, high}; +} +mov.f32 f324, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r176, {low, high}; +} +mov.f32 f249, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, 
r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r252, {low, high}; +} +{ +add.f16x2 r253, %35, %43; +} +{ +add.f16x2 r256, %27, r253; +} +{ +add.f16x2 r259, %36, %44; +} +{ +add.f16x2 r262, %28, r259; +} +{ +add.f16x2 r265, %35, %43; +} +{ +mul.f16x2 r268, r265, r251; +} +{ +add.f16x2 r271, %27, r268; +} +{ +sub.f16x2 r274, %36, %44; +} +{ +mul.f16x2 r277, r274, r252; +} +{ +add.f16x2 r280, r271, r277; +} +{ +add.f16x2 r283, %35, %43; +} +{ +mul.f16x2 r286, r283, r251; +} +{ +add.f16x2 r289, %27, r286; +} +{ +sub.f16x2 r292, %36, %44; +} +{ +mul.f16x2 r295, r292, r252; +} +{ +sub.f16x2 r298, r289, r295; +} +{ +add.f16x2 r301, %36, %44; +} +{ +mul.f16x2 r304, r301, r251; +} +{ +add.f16x2 r307, %28, r304; +} +{ +sub.f16x2 r310, %35, %43; +} +{ +mul.f16x2 r313, r310, r252; +} +{ +sub.f16x2 r316, r307, r313; +} +{ +add.f16x2 r319, %36, %44; +} +{ +mul.f16x2 r322, r319, r251; +} +{ +add.f16x2 r325, %28, r322; +} +{ +sub.f16x2 r328, %35, %43; +} +{ +mul.f16x2 r331, r328, r252; +} +{ +add.f16x2 r334, r325, r331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r337, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r338, {low, high}; +} +{ +add.f16x2 r339, %39, %47; +} +{ +add.f16x2 r342, %31, r339; +} +{ +add.f16x2 r345, %40, %48; +} +{ +add.f16x2 r348, %32, r345; +} +{ +add.f16x2 r351, %39, %47; +} +{ +mul.f16x2 r354, r351, r337; +} +{ +add.f16x2 r357, %31, r354; +} +{ +sub.f16x2 r360, %40, %48; +} +{ +mul.f16x2 r363, r360, r338; +} +{ +add.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %39, %47; +} +{ +mul.f16x2 r372, r369, r337; +} +{ +add.f16x2 r375, %31, r372; +} +{ 
+sub.f16x2 r378, %40, %48; +} +{ +mul.f16x2 r381, r378, r338; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %40, %48; +} +{ +mul.f16x2 r390, r387, r337; +} +{ +add.f16x2 r393, %32, r390; +} +{ +sub.f16x2 r396, %39, %47; +} +{ +mul.f16x2 r399, r396, r338; +} +{ +sub.f16x2 r402, r393, r399; +} +{ +add.f16x2 r405, %40, %48; +} +{ +mul.f16x2 r408, r405, r337; +} +{ +add.f16x2 r411, %32, r408; +} +{ +sub.f16x2 r414, %39, %47; +} +{ +mul.f16x2 r417, r414, r338; +} +{ +add.f16x2 r420, r411, r417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r423, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r426, {low, high}; +} +{ +mul.f16x2 r433, r366, r423; +} +{ +mul.f16x2 r436, r402, r424; +} +{ +sub.f16x2 r439, r433, r436; +} +{ +mul.f16x2 r442, r366, r424; +} +{ +fma.rn.f16x2 r445, r402, r423, r442; +} +{ +mul.f16x2 r449, r384, r425; +} +{ +mul.f16x2 r452, r420, r426; +} +{ +sub.f16x2 r455, r449, r452; +} +{ +mul.f16x2 r458, r384, r426; +} +{ +fma.rn.f16x2 r461, r420, r425, r458; +} +{ +add.f16x2 r465, r256, r342; +} +{ +add.f16x2 r468, r262, r348; +} +{ +sub.f16x2 r471, r256, r342; +} +{ +sub.f16x2 r474, r262, r348; +} +{ +add.f16x2 r477, r280, r439; +} +{ +add.f16x2 r480, r316, r445; +} +{ +sub.f16x2 r483, r280, r439; +} +{ +sub.f16x2 r486, r316, r445; +} +{ +add.f16x2 r489, r298, r455; +} +{ +add.f16x2 r492, r334, r461; +} +{ +sub.f16x2 r495, r298, r455; +} +{ +sub.f16x2 r498, r334, r461; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r501, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r502, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r503, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r504, {low, high}; +} +mov.f32 f250, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r507, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r509, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r510, {low, high}; +} +{ +mul.f16x2 r523, r477, r501; +} +{ +mul.f16x2 r526, r480, r502; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r477, r502; +} +{ +fma.rn.f16x2 r535, r480, r501, r532; +} +{ +mul.f16x2 r539, r489, r503; +} +{ +mul.f16x2 r542, r492, r504; +} +{ +sub.f16x2 r545, r539, r542; +} +{ +mul.f16x2 r548, r489, r504; +} +{ +fma.rn.f16x2 r551, r492, r503, r548; +} +{ +neg.f16x2 r555, r474; +} +{ +mul.f16x2 r557, r483, r507; +} +{ +mul.f16x2 r560, r486, r508; +} +{ +sub.f16x2 r563, r557, r560; +} +{ +mul.f16x2 r566, r483, r508; +} +{ +fma.rn.f16x2 r569, r486, r507, r566; +} +{ +mul.f16x2 r573, r495, r509; +} +{ +mul.f16x2 r576, r498, r510; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r495, r510; +} +{ +fma.rn.f16x2 r585, r498, r509, r582; +} +{ +add.f16x2 r589, r215, r465; +} +{ +add.f16x2 r592, r218, r468; +} +{ +sub.f16x2 r595, r215, r465; +} +{ +sub.f16x2 r598, r218, r468; +} +{ +add.f16x2 r601, r227, r529; +} +{ +add.f16x2 r604, r230, r535; +} +{ +sub.f16x2 r607, r227, r529; +} +{ +sub.f16x2 r610, r230, r535; +} +{ +add.f16x2 r613, r239, r545; +} +{ +add.f16x2 r616, r242, r551; +} +{ +sub.f16x2 r619, r239, r545; +} +{ +sub.f16x2 r622, r242, r551; +} +{ +add.f16x2 r625, r221, r555; +} +{ +add.f16x2 r628, r224, r471; +} +{ +sub.f16x2 r631, r221, r555; +} +{ +sub.f16x2 r634, r224, r471; +} +{ 
+add.f16x2 r637, r233, r563; +} +{ +add.f16x2 r640, r236, r569; +} +{ +sub.f16x2 r643, r233, r563; +} +{ +sub.f16x2 r646, r236, r569; +} +{ +add.f16x2 r649, r245, r579; +} +{ +add.f16x2 r652, r248, r585; +} +{ +sub.f16x2 r655, r245, r579; +} +{ +sub.f16x2 r658, r248, r585; +} +mul.wide.u32 rd2, r2804, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r2805, rd3; +mul.lo.s32 r2806, r2805, 144; +sub.s32 r2807, r2804, r2806; +mad.lo.s32 r2808, r2805, 6912, r2803; +cvt.rn.f32.u32 f353, r2807; +mul.f32 f354, f353, 0f3B6E4BAE; +cos.approx.f32 f101, f354; +sin.approx.f32 f355, f354; +neg.f32 f102, f355; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f101; +cvt.rn.f16.f32 high, f102; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r664, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r666, {high, high}; +} +{ +mul.f16x2 r668, r604, r666; +} +{ +fma.rn.f16x2 r671, r601, r664, r668; +} +{ +mul.f16x2 r675, r601, r666; +} +{ +neg.f16x2 r678, r675; +} +{ +fma.rn.f16x2 r680, r604, r664, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r684, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r686, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r688, {low, high}; +} +{ +mul.f16x2 r689, r686, r688; +} +{ +mul.f16x2 r692, r661, r684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r695, {high, low}; +} +{ +fma.rn.f16x2 r697, r689, r695, r692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r701, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r703, {high, high}; +} +{ +mul.f16x2 r705, r616, r703; +} +{ +fma.rn.f16x2 r708, r613, r701, r705; +} +{ +mul.f16x2 r712, r613, r703; +} +{ +neg.f16x2 r715, r712; +} +{ +fma.rn.f16x2 r717, r616, r701, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r721, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r661; +mov.b32 r723, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r725, {low, high}; +} +{ +mul.f16x2 r726, r723, r725; +} +{ +mul.f16x2 r729, r697, r721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r732, {high, low}; +} +{ +fma.rn.f16x2 r734, r726, r732, r729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r738, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r740, {high, high}; +} +{ +mul.f16x2 r742, r628, r740; +} +{ +fma.rn.f16x2 r745, r625, r738, r742; +} +{ +mul.f16x2 r749, r625, r740; +} +{ +neg.f16x2 r752, r749; +} +{ +fma.rn.f16x2 r754, r628, r738, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r758, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r760, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r762, {low, high}; +} +{ +mul.f16x2 r763, r760, r762; +} +{ +mul.f16x2 r766, r734, r758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r734; +mov.b32 r769, {high, low}; +} +{ +fma.rn.f16x2 r771, r763, r769, r766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r775, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r777, {high, high}; +} +{ +mul.f16x2 r779, r640, r777; +} +{ +fma.rn.f16x2 r782, r637, r775, r779; +} +{ +mul.f16x2 r786, r637, r777; +} +{ +neg.f16x2 r789, r786; +} +{ +fma.rn.f16x2 r791, r640, r775, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r797, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r799, {low, high}; +} +{ +mul.f16x2 r800, r797, r799; +} +{ +mul.f16x2 r803, r771, r795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r771; +mov.b32 r806, {high, low}; +} 
+{ +fma.rn.f16x2 r808, r800, r806, r803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r812, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r814, {high, high}; +} +{ +mul.f16x2 r816, r652, r814; +} +{ +fma.rn.f16x2 r819, r649, r812, r816; +} +{ +mul.f16x2 r823, r649, r814; +} +{ +neg.f16x2 r826, r823; +} +{ +fma.rn.f16x2 r828, r652, r812, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r834, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r836, {low, high}; +} +{ +mul.f16x2 r837, r834, r836; +} +{ +mul.f16x2 r840, r808, r832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r808; +mov.b32 r843, {high, low}; +} +{ +fma.rn.f16x2 r845, r837, r843, r840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r849, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r851, {high, high}; +} +{ +mul.f16x2 r853, r598, r851; +} +{ +fma.rn.f16x2 r856, r595, r849, r853; +} +{ +mul.f16x2 r860, r595, r851; +} +{ +neg.f16x2 r863, r860; +} +{ +fma.rn.f16x2 r865, r598, r849, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r871, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r873, {low, high}; +} +{ +mul.f16x2 r874, r871, r873; +} +{ +mul.f16x2 r877, r845, r869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r845; +mov.b32 r880, {high, low}; +} +{ +fma.rn.f16x2 r882, r874, r880, r877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r886, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r888, {high, high}; +} +{ +mul.f16x2 r890, r610, r888; +} +{ +fma.rn.f16x2 r893, r607, r886, r890; +} +{ +mul.f16x2 r897, r607, r888; +} +{ 
+neg.f16x2 r900, r897; +} +{ +fma.rn.f16x2 r902, r610, r886, r900; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r908, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r910, {low, high}; +} +{ +mul.f16x2 r911, r908, r910; +} +{ +mul.f16x2 r914, r882, r906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r882; +mov.b32 r917, {high, low}; +} +{ +fma.rn.f16x2 r919, r911, r917, r914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r925, {high, high}; +} +{ +mul.f16x2 r927, r622, r925; +} +{ +fma.rn.f16x2 r930, r619, r923, r927; +} +{ +mul.f16x2 r934, r619, r925; +} +{ +neg.f16x2 r937, r934; +} +{ +fma.rn.f16x2 r939, r622, r923, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r945, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r947, {low, high}; +} +{ +mul.f16x2 r948, r945, r947; +} +{ +mul.f16x2 r951, r919, r943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r919; +mov.b32 r954, {high, low}; +} +{ +fma.rn.f16x2 r956, r948, r954, r951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r962, {high, high}; +} +{ +mul.f16x2 r964, r634, r962; +} +{ +fma.rn.f16x2 r967, r631, r960, r964; +} +{ +mul.f16x2 r971, r631, r962; +} +{ +neg.f16x2 r974, r971; +} +{ +fma.rn.f16x2 r976, r634, r960, r974; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r980, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r982, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r984, 
{low, high}; +} +{ +mul.f16x2 r985, r982, r984; +} +{ +mul.f16x2 r988, r956, r980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r956; +mov.b32 r991, {high, low}; +} +{ +fma.rn.f16x2 r993, r985, r991, r988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r999, {high, high}; +} +{ +mul.f16x2 r1001, r646, r999; +} +{ +fma.rn.f16x2 r1004, r643, r997, r1001; +} +{ +mul.f16x2 r1008, r643, r999; +} +{ +neg.f16x2 r1011, r1008; +} +{ +fma.rn.f16x2 r1013, r646, r997, r1011; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r661; +mov.b32 r1019, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1021, {low, high}; +} +{ +mul.f16x2 r1022, r1019, r1021; +} +{ +mul.f16x2 r1025, r993, r1017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r993; +mov.b32 r1028, {high, low}; +} +{ +fma.rn.f16x2 r1030, r1022, r1028, r1025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1030; +mov.b32 r1036, {high, high}; +} +{ +mul.f16x2 r1038, r658, r1036; +} +{ +fma.rn.f16x2 r1041, r655, r1034, r1038; +} +{ +mul.f16x2 r1045, r655, r1036; +} +{ +neg.f16x2 r1048, r1045; +} +{ +fma.rn.f16x2 r1050, r658, r1034, r1048; +} +barrier.sync 0; +mad.lo.s32 r2809, r2807, 48, r2808; +st.shared.v4.f32 [r2809], {r589, r671, r708, r745}; +st.shared.v4.f32 [r2809+16], {r782, r819, r856, r893}; +st.shared.v4.f32 [r2809+32], {r930, r967, r1004, r1041}; +barrier.sync 0; +mad.lo.s32 r2810, r2807, -44, r2809; +ld.shared.u32 r1077, [r2810]; +ld.shared.u32 r1327, [r2810+576]; +ld.shared.u32 r1163, [r2810+1152]; +ld.shared.u32 r1413, [r2810+1728]; +ld.shared.u32 r1074, [r2810+2304]; +ld.shared.u32 r1324, [r2810+2880]; +ld.shared.u32 r1160, [r2810+3456]; +ld.shared.u32 r1410, [r2810+4032]; 
+ld.shared.u32 r1075, [r2810+4608]; +ld.shared.u32 r1325, [r2810+5184]; +ld.shared.u32 r1161, [r2810+5760]; +ld.shared.u32 r1411, [r2810+6336]; +barrier.sync 0; +st.shared.v4.f32 [r2809], {r592, r680, r717, r754}; +st.shared.v4.f32 [r2809+16], {r791, r828, r865, r902}; +st.shared.v4.f32 [r2809+32], {r939, r976, r1013, r1050}; +barrier.sync 0; +ld.shared.u32 r1083, [r2810]; +ld.shared.u32 r1333, [r2810+576]; +ld.shared.u32 r1169, [r2810+1152]; +ld.shared.u32 r1419, [r2810+1728]; +ld.shared.u32 r1080, [r2810+2304]; +ld.shared.u32 r1330, [r2810+2880]; +ld.shared.u32 r1166, [r2810+3456]; +ld.shared.u32 r1416, [r2810+4032]; +ld.shared.u32 r1081, [r2810+4608]; +ld.shared.u32 r1331, [r2810+5184]; +ld.shared.u32 r1167, [r2810+5760]; +ld.shared.u32 r1417, [r2810+6336]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1072, {low, high}; +} +{ +add.f16x2 r1073, r1074, r1075; +} +{ +add.f16x2 r1076, r1077, r1073; +} +{ +add.f16x2 r1079, r1080, r1081; +} +{ +add.f16x2 r1082, r1083, r1079; +} +{ +add.f16x2 r1085, r1074, r1075; +} +{ +mul.f16x2 r1088, r1085, r1071; +} +{ +add.f16x2 r1091, r1077, r1088; +} +{ +sub.f16x2 r1094, r1080, r1081; +} +{ +mul.f16x2 r1097, r1094, r1072; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +add.f16x2 r1103, r1074, r1075; +} +{ +mul.f16x2 r1106, r1103, r1071; +} +{ +add.f16x2 r1109, r1077, r1106; +} +{ +sub.f16x2 r1112, r1080, r1081; +} +{ +mul.f16x2 r1115, r1112, r1072; +} +{ +sub.f16x2 r1118, r1109, r1115; +} +{ +add.f16x2 r1121, r1080, r1081; +} +{ +mul.f16x2 r1124, r1121, r1071; +} +{ +add.f16x2 r1127, r1083, r1124; +} +{ +sub.f16x2 r1130, r1074, r1075; +} +{ +mul.f16x2 r1133, r1130, r1072; +} +{ +sub.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r1080, r1081; +} +{ +mul.f16x2 r1142, r1139, r1071; +} +{ +add.f16x2 r1145, r1083, r1142; +} +{ +sub.f16x2 r1148, r1074, r1075; +} +{ +mul.f16x2 
r1151, r1148, r1072; +} +{ +add.f16x2 r1154, r1145, r1151; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1158, {low, high}; +} +{ +add.f16x2 r1159, r1160, r1161; +} +{ +add.f16x2 r1162, r1163, r1159; +} +{ +add.f16x2 r1165, r1166, r1167; +} +{ +add.f16x2 r1168, r1169, r1165; +} +{ +add.f16x2 r1171, r1160, r1161; +} +{ +mul.f16x2 r1174, r1171, r1157; +} +{ +add.f16x2 r1177, r1163, r1174; +} +{ +sub.f16x2 r1180, r1166, r1167; +} +{ +mul.f16x2 r1183, r1180, r1158; +} +{ +add.f16x2 r1186, r1177, r1183; +} +{ +add.f16x2 r1189, r1160, r1161; +} +{ +mul.f16x2 r1192, r1189, r1157; +} +{ +add.f16x2 r1195, r1163, r1192; +} +{ +sub.f16x2 r1198, r1166, r1167; +} +{ +mul.f16x2 r1201, r1198, r1158; +} +{ +sub.f16x2 r1204, r1195, r1201; +} +{ +add.f16x2 r1207, r1166, r1167; +} +{ +mul.f16x2 r1210, r1207, r1157; +} +{ +add.f16x2 r1213, r1169, r1210; +} +{ +sub.f16x2 r1216, r1160, r1161; +} +{ +mul.f16x2 r1219, r1216, r1158; +} +{ +sub.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, r1166, r1167; +} +{ +mul.f16x2 r1228, r1225, r1157; +} +{ +add.f16x2 r1231, r1169, r1228; +} +{ +sub.f16x2 r1234, r1160, r1161; +} +{ +mul.f16x2 r1237, r1234, r1158; +} +{ +add.f16x2 r1240, r1231, r1237; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1246, {low, high}; +} +{ +mul.f16x2 r1253, r1186, r1243; +} +{ +mul.f16x2 r1256, r1222, r1244; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ +mul.f16x2 r1262, r1186, r1244; +} +{ +fma.rn.f16x2 r1265, r1222, r1243, r1262; +} 
+{ +mul.f16x2 r1269, r1204, r1245; +} +{ +mul.f16x2 r1272, r1240, r1246; +} +{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1204, r1246; +} +{ +fma.rn.f16x2 r1281, r1240, r1245, r1278; +} +{ +add.f16x2 r1285, r1076, r1162; +} +{ +add.f16x2 r1288, r1082, r1168; +} +{ +sub.f16x2 r1291, r1076, r1162; +} +{ +sub.f16x2 r1294, r1082, r1168; +} +{ +add.f16x2 r1297, r1100, r1259; +} +{ +add.f16x2 r1300, r1136, r1265; +} +{ +sub.f16x2 r1303, r1100, r1259; +} +{ +sub.f16x2 r1306, r1136, r1265; +} +{ +add.f16x2 r1309, r1118, r1275; +} +{ +add.f16x2 r1312, r1154, r1281; +} +{ +sub.f16x2 r1315, r1118, r1275; +} +{ +sub.f16x2 r1318, r1154, r1281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1322, {low, high}; +} +{ +add.f16x2 r1323, r1324, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +add.f16x2 r1329, r1330, r1331; +} +{ +add.f16x2 r1332, r1333, r1329; +} +{ +add.f16x2 r1335, r1324, r1325; +} +{ +mul.f16x2 r1338, r1335, r1321; +} +{ +add.f16x2 r1341, r1327, r1338; +} +{ +sub.f16x2 r1344, r1330, r1331; +} +{ +mul.f16x2 r1347, r1344, r1322; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1324, r1325; +} +{ +mul.f16x2 r1356, r1353, r1321; +} +{ +add.f16x2 r1359, r1327, r1356; +} +{ +sub.f16x2 r1362, r1330, r1331; +} +{ +mul.f16x2 r1365, r1362, r1322; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, r1330, r1331; +} +{ +mul.f16x2 r1374, r1371, r1321; +} +{ +add.f16x2 r1377, r1333, r1374; +} +{ +sub.f16x2 r1380, r1324, r1325; +} +{ +mul.f16x2 r1383, r1380, r1322; +} +{ +sub.f16x2 r1386, r1377, r1383; +} +{ +add.f16x2 r1389, r1330, r1331; +} +{ +mul.f16x2 r1392, r1389, r1321; +} +{ +add.f16x2 r1395, r1333, r1392; +} +{ +sub.f16x2 r1398, r1324, r1325; +} +{ +mul.f16x2 r1401, r1398, r1322; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; 
+cvt.rn.f16.f32 high, f322; +mov.b32 r1407, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1408, {low, high}; +} +{ +add.f16x2 r1409, r1410, r1411; +} +{ +add.f16x2 r1412, r1413, r1409; +} +{ +add.f16x2 r1415, r1416, r1417; +} +{ +add.f16x2 r1418, r1419, r1415; +} +{ +add.f16x2 r1421, r1410, r1411; +} +{ +mul.f16x2 r1424, r1421, r1407; +} +{ +add.f16x2 r1427, r1413, r1424; +} +{ +sub.f16x2 r1430, r1416, r1417; +} +{ +mul.f16x2 r1433, r1430, r1408; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +add.f16x2 r1439, r1410, r1411; +} +{ +mul.f16x2 r1442, r1439, r1407; +} +{ +add.f16x2 r1445, r1413, r1442; +} +{ +sub.f16x2 r1448, r1416, r1417; +} +{ +mul.f16x2 r1451, r1448, r1408; +} +{ +sub.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1416, r1417; +} +{ +mul.f16x2 r1460, r1457, r1407; +} +{ +add.f16x2 r1463, r1419, r1460; +} +{ +sub.f16x2 r1466, r1410, r1411; +} +{ +mul.f16x2 r1469, r1466, r1408; +} +{ +sub.f16x2 r1472, r1463, r1469; +} +{ +add.f16x2 r1475, r1416, r1417; +} +{ +mul.f16x2 r1478, r1475, r1407; +} +{ +add.f16x2 r1481, r1419, r1478; +} +{ +sub.f16x2 r1484, r1410, r1411; +} +{ +mul.f16x2 r1487, r1484, r1408; +} +{ +add.f16x2 r1490, r1481, r1487; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1493, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1494, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1495, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1496, {low, high}; +} +{ +mul.f16x2 r1503, r1436, r1493; +} +{ +mul.f16x2 r1506, r1472, r1494; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1436, r1494; +} +{ +fma.rn.f16x2 r1515, r1472, r1493, r1512; +} +{ +mul.f16x2 r1519, r1454, r1495; +} +{ +mul.f16x2 r1522, r1490, r1496; +} +{ +sub.f16x2 r1525, r1519, r1522; +} 
+{ +mul.f16x2 r1528, r1454, r1496; +} +{ +fma.rn.f16x2 r1531, r1490, r1495, r1528; +} +{ +add.f16x2 r1535, r1326, r1412; +} +{ +add.f16x2 r1538, r1332, r1418; +} +{ +sub.f16x2 r1541, r1326, r1412; +} +{ +sub.f16x2 r1544, r1332, r1418; +} +{ +add.f16x2 r1547, r1350, r1509; +} +{ +add.f16x2 r1550, r1386, r1515; +} +{ +sub.f16x2 r1553, r1350, r1509; +} +{ +sub.f16x2 r1556, r1386, r1515; +} +{ +add.f16x2 r1559, r1368, r1525; +} +{ +add.f16x2 r1562, r1404, r1531; +} +{ +sub.f16x2 r1565, r1368, r1525; +} +{ +sub.f16x2 r1568, r1404, r1531; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1574, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1593, r1547, r1571; +} +{ +mul.f16x2 r1596, r1550, r1572; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r1547, r1572; +} +{ +fma.rn.f16x2 r1605, r1550, r1571, r1602; +} +{ +mul.f16x2 r1609, r1559, r1573; +} +{ +mul.f16x2 r1612, r1562, r1574; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1559, r1574; +} +{ +fma.rn.f16x2 r1621, r1562, r1573, r1618; +} +{ +neg.f16x2 r1625, r1544; +} +{ +mul.f16x2 r1627, r1553, r1577; +} +{ +mul.f16x2 r1630, r1556, r1578; +} +{ +sub.f16x2 r1633, r1627, r1630; +} +{ +mul.f16x2 
r1636, r1553, r1578; +} +{ +fma.rn.f16x2 r1639, r1556, r1577, r1636; +} +{ +mul.f16x2 r1643, r1565, r1579; +} +{ +mul.f16x2 r1646, r1568, r1580; +} +{ +sub.f16x2 r1649, r1643, r1646; +} +{ +mul.f16x2 r1652, r1565, r1580; +} +{ +fma.rn.f16x2 r1655, r1568, r1579, r1652; +} +{ +add.f16x2 r1659, r1285, r1535; +} +{ +add.f16x2 r1662, r1288, r1538; +} +{ +sub.f16x2 r1665, r1285, r1535; +} +{ +sub.f16x2 r1668, r1288, r1538; +} +{ +add.f16x2 r1671, r1297, r1599; +} +{ +add.f16x2 r1674, r1300, r1605; +} +{ +sub.f16x2 r1677, r1297, r1599; +} +{ +sub.f16x2 r1680, r1300, r1605; +} +{ +add.f16x2 r1683, r1309, r1615; +} +{ +add.f16x2 r1686, r1312, r1621; +} +{ +sub.f16x2 r1689, r1309, r1615; +} +{ +sub.f16x2 r1692, r1312, r1621; +} +{ +add.f16x2 r1695, r1291, r1625; +} +{ +add.f16x2 r1698, r1294, r1541; +} +{ +sub.f16x2 r1701, r1291, r1625; +} +{ +sub.f16x2 r1704, r1294, r1541; +} +{ +add.f16x2 r1707, r1303, r1633; +} +{ +add.f16x2 r1710, r1306, r1639; +} +{ +sub.f16x2 r1713, r1303, r1633; +} +{ +sub.f16x2 r1716, r1306, r1639; +} +{ +add.f16x2 r1719, r1315, r1649; +} +{ +add.f16x2 r1722, r1318, r1655; +} +{ +sub.f16x2 r1725, r1315, r1649; +} +{ +sub.f16x2 r1728, r1318, r1655; +} +mul.wide.u32 rd4, r2807, -1431655765; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r2811, rd5; +mul.lo.s32 r2812, r2811, 12; +sub.s32 r2813, r2807, r2812; +shl.b32 r2814, r2813, 2; +add.s32 r2815, r2808, r2814; +cvt.rn.f32.u32 f356, r2811; +mul.f32 f357, f356, 0f3D32B8C2; +cos.approx.f32 f227, f357; +sin.approx.f32 f358, f357; +neg.f32 f228, f358; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f227; +cvt.rn.f16.f32 high, f228; +mov.b32 r1731, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1734, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1736, {high, high}; +} +{ +mul.f16x2 r1738, r1674, r1736; +} +{ +fma.rn.f16x2 r1741, r1671, r1734, r1738; +} +{ +mul.f16x2 r1745, r1671, r1736; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1674, 
r1734, r1748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1756, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1758, {low, high}; +} +{ +mul.f16x2 r1759, r1756, r1758; +} +{ +mul.f16x2 r1762, r1731, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1765, {high, low}; +} +{ +fma.rn.f16x2 r1767, r1759, r1765, r1762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1771, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1773, {high, high}; +} +{ +mul.f16x2 r1775, r1686, r1773; +} +{ +fma.rn.f16x2 r1778, r1683, r1771, r1775; +} +{ +mul.f16x2 r1782, r1683, r1773; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1686, r1771, r1785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1791, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1793, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1795, {low, high}; +} +{ +mul.f16x2 r1796, r1793, r1795; +} +{ +mul.f16x2 r1799, r1767, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1767; +mov.b32 r1802, {high, low}; +} +{ +fma.rn.f16x2 r1804, r1796, r1802, r1799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1808, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1810, {high, high}; +} +{ +mul.f16x2 r1812, r1698, r1810; +} +{ +fma.rn.f16x2 r1815, r1695, r1808, r1812; +} +{ +mul.f16x2 r1819, r1695, r1810; +} +{ +neg.f16x2 r1822, r1819; +} +{ +fma.rn.f16x2 r1824, r1698, r1808, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1828, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1830, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 
high, f250; +mov.b32 r1832, {low, high}; +} +{ +mul.f16x2 r1833, r1830, r1832; +} +{ +mul.f16x2 r1836, r1804, r1828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1804; +mov.b32 r1839, {high, low}; +} +{ +fma.rn.f16x2 r1841, r1833, r1839, r1836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1845, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1847, {high, high}; +} +{ +mul.f16x2 r1849, r1710, r1847; +} +{ +fma.rn.f16x2 r1852, r1707, r1845, r1849; +} +{ +mul.f16x2 r1856, r1707, r1847; +} +{ +neg.f16x2 r1859, r1856; +} +{ +fma.rn.f16x2 r1861, r1710, r1845, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1865, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1867, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1869, {low, high}; +} +{ +mul.f16x2 r1870, r1867, r1869; +} +{ +mul.f16x2 r1873, r1841, r1865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1841; +mov.b32 r1876, {high, low}; +} +{ +fma.rn.f16x2 r1878, r1870, r1876, r1873; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1882, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1884, {high, high}; +} +{ +mul.f16x2 r1886, r1722, r1884; +} +{ +fma.rn.f16x2 r1889, r1719, r1882, r1886; +} +{ +mul.f16x2 r1893, r1719, r1884; +} +{ +neg.f16x2 r1896, r1893; +} +{ +fma.rn.f16x2 r1898, r1722, r1882, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1902, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1904, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1906, {low, high}; +} +{ +mul.f16x2 r1907, r1904, r1906; +} +{ +mul.f16x2 r1910, r1878, r1902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1878; +mov.b32 r1913, {high, low}; +} +{ +fma.rn.f16x2 r1915, r1907, r1913, r1910; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1919, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1921, {high, high}; +} +{ +mul.f16x2 r1923, r1668, r1921; +} +{ +fma.rn.f16x2 r1926, r1665, r1919, r1923; +} +{ +mul.f16x2 r1930, r1665, r1921; +} +{ +neg.f16x2 r1933, r1930; +} +{ +fma.rn.f16x2 r1935, r1668, r1919, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1939, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1941, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1943, {low, high}; +} +{ +mul.f16x2 r1944, r1941, r1943; +} +{ +mul.f16x2 r1947, r1915, r1939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1915; +mov.b32 r1950, {high, low}; +} +{ +fma.rn.f16x2 r1952, r1944, r1950, r1947; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1956, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1958, {high, high}; +} +{ +mul.f16x2 r1960, r1680, r1958; +} +{ +fma.rn.f16x2 r1963, r1677, r1956, r1960; +} +{ +mul.f16x2 r1967, r1677, r1958; +} +{ +neg.f16x2 r1970, r1967; +} +{ +fma.rn.f16x2 r1972, r1680, r1956, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1976, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r1978, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r1980, {low, high}; +} +{ +mul.f16x2 r1981, r1978, r1980; +} +{ +mul.f16x2 r1984, r1952, r1976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1952; +mov.b32 r1987, {high, low}; +} +{ +fma.rn.f16x2 r1989, r1981, r1987, r1984; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r1993, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r1995, {high, high}; +} +{ +mul.f16x2 r1997, r1692, r1995; +} +{ +fma.rn.f16x2 r2000, r1689, r1993, r1997; +} +{ 
+mul.f16x2 r2004, r1689, r1995; +} +{ +neg.f16x2 r2007, r2004; +} +{ +fma.rn.f16x2 r2009, r1692, r1993, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2013, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2015, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2017, {low, high}; +} +{ +mul.f16x2 r2018, r2015, r2017; +} +{ +mul.f16x2 r2021, r1989, r2013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1989; +mov.b32 r2024, {high, low}; +} +{ +fma.rn.f16x2 r2026, r2018, r2024, r2021; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2030, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2032, {high, high}; +} +{ +mul.f16x2 r2034, r1704, r2032; +} +{ +fma.rn.f16x2 r2037, r1701, r2030, r2034; +} +{ +mul.f16x2 r2041, r1701, r2032; +} +{ +neg.f16x2 r2044, r2041; +} +{ +fma.rn.f16x2 r2046, r1704, r2030, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2050, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2052, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2054, {low, high}; +} +{ +mul.f16x2 r2055, r2052, r2054; +} +{ +mul.f16x2 r2058, r2026, r2050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2026; +mov.b32 r2061, {high, low}; +} +{ +fma.rn.f16x2 r2063, r2055, r2061, r2058; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2063; +mov.b32 r2067, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2063; +mov.b32 r2069, {high, high}; +} +{ +mul.f16x2 r2071, r1716, r2069; +} +{ +fma.rn.f16x2 r2074, r1713, r2067, r2071; +} +{ +mul.f16x2 r2078, r1713, r2069; +} +{ +neg.f16x2 r2081, r2078; +} +{ +fma.rn.f16x2 r2083, r1716, r2067, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 r2087, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1731; +mov.b32 
r2089, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f249; +cvt.rn.f16.f32 high, f250; +mov.b32 r2091, {low, high}; +} +{ +mul.f16x2 r2092, r2089, r2091; +} +{ +mul.f16x2 r2095, r2063, r2087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2063; +mov.b32 r2098, {high, low}; +} +{ +fma.rn.f16x2 r2100, r2092, r2098, r2095; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2100; +mov.b32 r2104, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2100; +mov.b32 r2106, {high, high}; +} +{ +mul.f16x2 r2108, r1728, r2106; +} +{ +fma.rn.f16x2 r2111, r1725, r2104, r2108; +} +{ +mul.f16x2 r2115, r1725, r2106; +} +{ +neg.f16x2 r2118, r2115; +} +{ +fma.rn.f16x2 r2120, r1728, r2104, r2118; +} +barrier.sync 0; +mad.lo.s32 r2816, r2811, 576, r2815; +st.shared.u32 [r2816], r1659; +st.shared.u32 [r2816+48], r1741; +st.shared.u32 [r2816+96], r1778; +st.shared.u32 [r2816+144], r1815; +st.shared.u32 [r2816+192], r1852; +st.shared.u32 [r2816+240], r1889; +st.shared.u32 [r2816+288], r1926; +st.shared.u32 [r2816+336], r1963; +st.shared.u32 [r2816+384], r2000; +st.shared.u32 [r2816+432], r2037; +st.shared.u32 [r2816+480], r2074; +st.shared.u32 [r2816+528], r2111; +barrier.sync 0; +ld.shared.u32 r2147, [r2810]; +ld.shared.u32 r2397, [r2810+576]; +ld.shared.u32 r2233, [r2810+1152]; +ld.shared.u32 r2483, [r2810+1728]; +ld.shared.u32 r2144, [r2810+2304]; +ld.shared.u32 r2394, [r2810+2880]; +ld.shared.u32 r2230, [r2810+3456]; +ld.shared.u32 r2480, [r2810+4032]; +ld.shared.u32 r2145, [r2810+4608]; +ld.shared.u32 r2395, [r2810+5184]; +ld.shared.u32 r2231, [r2810+5760]; +ld.shared.u32 r2481, [r2810+6336]; +barrier.sync 0; +st.shared.u32 [r2816], r1662; +st.shared.u32 [r2816+48], r1750; +st.shared.u32 [r2816+96], r1787; +st.shared.u32 [r2816+144], r1824; +st.shared.u32 [r2816+192], r1861; +st.shared.u32 [r2816+240], r1898; +st.shared.u32 [r2816+288], r1935; +st.shared.u32 [r2816+336], r1972; +st.shared.u32 [r2816+384], r2009; +st.shared.u32 [r2816+432], r2046; 
+st.shared.u32 [r2816+480], r2083; +st.shared.u32 [r2816+528], r2120; +barrier.sync 0; +ld.shared.u32 r2153, [r2810]; +ld.shared.u32 r2403, [r2810+576]; +ld.shared.u32 r2239, [r2810+1152]; +ld.shared.u32 r2489, [r2810+1728]; +ld.shared.u32 r2150, [r2810+2304]; +ld.shared.u32 r2400, [r2810+2880]; +ld.shared.u32 r2236, [r2810+3456]; +ld.shared.u32 r2486, [r2810+4032]; +ld.shared.u32 r2151, [r2810+4608]; +ld.shared.u32 r2401, [r2810+5184]; +ld.shared.u32 r2237, [r2810+5760]; +ld.shared.u32 r2487, [r2810+6336]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2141, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2142, {low, high}; +} +{ +add.f16x2 r2143, r2144, r2145; +} +{ +add.f16x2 r2146, r2147, r2143; +} +{ +add.f16x2 r2149, r2150, r2151; +} +{ +add.f16x2 r2152, r2153, r2149; +} +{ +add.f16x2 r2155, r2144, r2145; +} +{ +mul.f16x2 r2158, r2155, r2141; +} +{ +add.f16x2 r2161, r2147, r2158; +} +{ +sub.f16x2 r2164, r2150, r2151; +} +{ +mul.f16x2 r2167, r2164, r2142; +} +{ +add.f16x2 r2170, r2161, r2167; +} +{ +add.f16x2 r2173, r2144, r2145; +} +{ +mul.f16x2 r2176, r2173, r2141; +} +{ +add.f16x2 r2179, r2147, r2176; +} +{ +sub.f16x2 r2182, r2150, r2151; +} +{ +mul.f16x2 r2185, r2182, r2142; +} +{ +sub.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r2150, r2151; +} +{ +mul.f16x2 r2194, r2191, r2141; +} +{ +add.f16x2 r2197, r2153, r2194; +} +{ +sub.f16x2 r2200, r2144, r2145; +} +{ +mul.f16x2 r2203, r2200, r2142; +} +{ +sub.f16x2 r2206, r2197, r2203; +} +{ +add.f16x2 r2209, r2150, r2151; +} +{ +mul.f16x2 r2212, r2209, r2141; +} +{ +add.f16x2 r2215, r2153, r2212; +} +{ +sub.f16x2 r2218, r2144, r2145; +} +{ +mul.f16x2 r2221, r2218, r2142; +} +{ +add.f16x2 r2224, r2215, r2221; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; 
+mov.b32 r2228, {low, high}; +} +{ +add.f16x2 r2229, r2230, r2231; +} +{ +add.f16x2 r2232, r2233, r2229; +} +{ +add.f16x2 r2235, r2236, r2237; +} +{ +add.f16x2 r2238, r2239, r2235; +} +{ +add.f16x2 r2241, r2230, r2231; +} +{ +mul.f16x2 r2244, r2241, r2227; +} +{ +add.f16x2 r2247, r2233, r2244; +} +{ +sub.f16x2 r2250, r2236, r2237; +} +{ +mul.f16x2 r2253, r2250, r2228; +} +{ +add.f16x2 r2256, r2247, r2253; +} +{ +add.f16x2 r2259, r2230, r2231; +} +{ +mul.f16x2 r2262, r2259, r2227; +} +{ +add.f16x2 r2265, r2233, r2262; +} +{ +sub.f16x2 r2268, r2236, r2237; +} +{ +mul.f16x2 r2271, r2268, r2228; +} +{ +sub.f16x2 r2274, r2265, r2271; +} +{ +add.f16x2 r2277, r2236, r2237; +} +{ +mul.f16x2 r2280, r2277, r2227; +} +{ +add.f16x2 r2283, r2239, r2280; +} +{ +sub.f16x2 r2286, r2230, r2231; +} +{ +mul.f16x2 r2289, r2286, r2228; +} +{ +sub.f16x2 r2292, r2283, r2289; +} +{ +add.f16x2 r2295, r2236, r2237; +} +{ +mul.f16x2 r2298, r2295, r2227; +} +{ +add.f16x2 r2301, r2239, r2298; +} +{ +sub.f16x2 r2304, r2230, r2231; +} +{ +mul.f16x2 r2307, r2304, r2228; +} +{ +add.f16x2 r2310, r2301, r2307; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2313, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2314, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2315, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2316, {low, high}; +} +{ +mul.f16x2 r2323, r2256, r2313; +} +{ +mul.f16x2 r2326, r2292, r2314; +} +{ +sub.f16x2 r2329, r2323, r2326; +} +{ +mul.f16x2 r2332, r2256, r2314; +} +{ +fma.rn.f16x2 r2335, r2292, r2313, r2332; +} +{ +mul.f16x2 r2339, r2274, r2315; +} +{ +mul.f16x2 r2342, r2310, r2316; +} +{ +sub.f16x2 r2345, r2339, r2342; +} +{ +mul.f16x2 r2348, r2274, r2316; +} +{ +fma.rn.f16x2 r2351, r2310, r2315, r2348; +} +{ +add.f16x2 r2355, r2146, r2232; +} +{ +add.f16x2 
r2358, r2152, r2238; +} +{ +sub.f16x2 r2361, r2146, r2232; +} +{ +sub.f16x2 r2364, r2152, r2238; +} +{ +add.f16x2 r2367, r2170, r2329; +} +{ +add.f16x2 r2370, r2206, r2335; +} +{ +sub.f16x2 r2373, r2170, r2329; +} +{ +sub.f16x2 r2376, r2206, r2335; +} +{ +add.f16x2 r2379, r2188, r2345; +} +{ +add.f16x2 r2382, r2224, r2351; +} +{ +sub.f16x2 r2385, r2188, r2345; +} +{ +sub.f16x2 r2388, r2224, r2351; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2391, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2392, {low, high}; +} +{ +add.f16x2 r2393, r2394, r2395; +} +{ +add.f16x2 r2396, r2397, r2393; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2399; +} +{ +add.f16x2 r2405, r2394, r2395; +} +{ +mul.f16x2 r2408, r2405, r2391; +} +{ +add.f16x2 r2411, r2397, r2408; +} +{ +sub.f16x2 r2414, r2400, r2401; +} +{ +mul.f16x2 r2417, r2414, r2392; +} +{ +add.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r2394, r2395; +} +{ +mul.f16x2 r2426, r2423, r2391; +} +{ +add.f16x2 r2429, r2397, r2426; +} +{ +sub.f16x2 r2432, r2400, r2401; +} +{ +mul.f16x2 r2435, r2432, r2392; +} +{ +sub.f16x2 r2438, r2429, r2435; +} +{ +add.f16x2 r2441, r2400, r2401; +} +{ +mul.f16x2 r2444, r2441, r2391; +} +{ +add.f16x2 r2447, r2403, r2444; +} +{ +sub.f16x2 r2450, r2394, r2395; +} +{ +mul.f16x2 r2453, r2450, r2392; +} +{ +sub.f16x2 r2456, r2447, r2453; +} +{ +add.f16x2 r2459, r2400, r2401; +} +{ +mul.f16x2 r2462, r2459, r2391; +} +{ +add.f16x2 r2465, r2403, r2462; +} +{ +sub.f16x2 r2468, r2394, r2395; +} +{ +mul.f16x2 r2471, r2468, r2392; +} +{ +add.f16x2 r2474, r2465, r2471; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2478, {low, high}; +} +{ +add.f16x2 r2479, r2480, r2481; +} +{ +add.f16x2 r2482, r2483, r2479; +} +{ 
+add.f16x2 r2485, r2486, r2487; +} +{ +add.f16x2 r2488, r2489, r2485; +} +{ +add.f16x2 r2491, r2480, r2481; +} +{ +mul.f16x2 r2494, r2491, r2477; +} +{ +add.f16x2 r2497, r2483, r2494; +} +{ +sub.f16x2 r2500, r2486, r2487; +} +{ +mul.f16x2 r2503, r2500, r2478; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r2480, r2481; +} +{ +mul.f16x2 r2512, r2509, r2477; +} +{ +add.f16x2 r2515, r2483, r2512; +} +{ +sub.f16x2 r2518, r2486, r2487; +} +{ +mul.f16x2 r2521, r2518, r2478; +} +{ +sub.f16x2 r2524, r2515, r2521; +} +{ +add.f16x2 r2527, r2486, r2487; +} +{ +mul.f16x2 r2530, r2527, r2477; +} +{ +add.f16x2 r2533, r2489, r2530; +} +{ +sub.f16x2 r2536, r2480, r2481; +} +{ +mul.f16x2 r2539, r2536, r2478; +} +{ +sub.f16x2 r2542, r2533, r2539; +} +{ +add.f16x2 r2545, r2486, r2487; +} +{ +mul.f16x2 r2548, r2545, r2477; +} +{ +add.f16x2 r2551, r2489, r2548; +} +{ +sub.f16x2 r2554, r2480, r2481; +} +{ +mul.f16x2 r2557, r2554, r2478; +} +{ +add.f16x2 r2560, r2551, r2557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2564, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2565, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2566, {low, high}; +} +{ +mul.f16x2 r2573, r2506, r2563; +} +{ +mul.f16x2 r2576, r2542, r2564; +} +{ +sub.f16x2 r2579, r2573, r2576; +} +{ +mul.f16x2 r2582, r2506, r2564; +} +{ +fma.rn.f16x2 r2585, r2542, r2563, r2582; +} +{ +mul.f16x2 r2589, r2524, r2565; +} +{ +mul.f16x2 r2592, r2560, r2566; +} +{ +sub.f16x2 r2595, r2589, r2592; +} +{ +mul.f16x2 r2598, r2524, r2566; +} +{ +fma.rn.f16x2 r2601, r2560, r2565, r2598; +} +{ +add.f16x2 r2605, r2396, r2482; +} +{ +add.f16x2 r2608, r2402, r2488; +} +{ +sub.f16x2 r2611, r2396, r2482; +} +{ +sub.f16x2 r2614, r2402, r2488; +} +{ +add.f16x2 
r2617, r2420, r2579; +} +{ +add.f16x2 r2620, r2456, r2585; +} +{ +sub.f16x2 r2623, r2420, r2579; +} +{ +sub.f16x2 r2626, r2456, r2585; +} +{ +add.f16x2 r2629, r2438, r2595; +} +{ +add.f16x2 r2632, r2474, r2601; +} +{ +sub.f16x2 r2635, r2438, r2595; +} +{ +sub.f16x2 r2638, r2474, r2601; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f322; +cvt.rn.f16.f32 high, f322; +mov.b32 r2647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f324; +cvt.rn.f16.f32 high, f324; +mov.b32 r2648, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f326; +cvt.rn.f16.f32 high, f326; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f328; +cvt.rn.f16.f32 high, f328; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2663, r2617, r2641; +} +{ +mul.f16x2 r2666, r2620, r2642; +} +{ +sub.f16x2 r2669, r2663, r2666; +} +{ +mul.f16x2 r2672, r2617, r2642; +} +{ +fma.rn.f16x2 r2675, r2620, r2641, r2672; +} +{ +mul.f16x2 r2679, r2629, r2643; +} +{ +mul.f16x2 r2682, r2632, r2644; +} +{ +sub.f16x2 r2685, r2679, r2682; +} +{ +mul.f16x2 r2688, r2629, r2644; +} +{ +fma.rn.f16x2 r2691, r2632, r2643, r2688; +} +{ +neg.f16x2 r2695, r2614; +} +{ +mul.f16x2 r2697, r2623, r2647; +} +{ +mul.f16x2 r2700, r2626, r2648; +} +{ +sub.f16x2 r2703, r2697, r2700; +} +{ +mul.f16x2 r2706, r2623, r2648; +} +{ +fma.rn.f16x2 r2709, r2626, r2647, r2706; +} +{ +mul.f16x2 r2713, r2635, r2649; +} +{ +mul.f16x2 r2716, r2638, r2650; +} +{ +sub.f16x2 r2719, r2713, r2716; +} +{ +mul.f16x2 r2722, r2635, r2650; +} +{ +fma.rn.f16x2 r2725, r2638, 
r2649, r2722; +} +{ +add.f16x2 %0, r2355, r2605; +} +{ +add.f16x2 %1, r2358, r2608; +} +{ +sub.f16x2 %12, r2355, r2605; +} +{ +sub.f16x2 %13, r2358, r2608; +} +{ +add.f16x2 %2, r2367, r2669; +} +{ +add.f16x2 %3, r2370, r2675; +} +{ +sub.f16x2 %14, r2367, r2669; +} +{ +sub.f16x2 %15, r2370, r2675; +} +{ +add.f16x2 %4, r2379, r2685; +} +{ +add.f16x2 %5, r2382, r2691; +} +{ +sub.f16x2 %16, r2379, r2685; +} +{ +sub.f16x2 %17, r2382, r2691; +} +{ +add.f16x2 %6, r2361, r2695; +} +{ +add.f16x2 %7, r2364, r2611; +} +{ +sub.f16x2 %18, r2361, r2695; +} +{ +sub.f16x2 %19, r2364, r2611; +} +{ +add.f16x2 %8, r2373, r2703; +} +{ +add.f16x2 %9, r2376, r2709; +} +{ +sub.f16x2 %20, r2373, r2703; +} +{ +sub.f16x2 %21, r2376, r2709; +} +{ +add.f16x2 %10, r2385, r2719; +} +{ +add.f16x2 %11, r2388, r2725; +} +{ +sub.f16x2 %22, r2385, r2719; +} +{ +sub.f16x2 %23, r2388, r2725; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), 
"r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..4d48e7340c9c8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_fwd.hpp.inc @@ -0,0 +1,1550 @@ +#ifndef CUFFTDX_FFT_1728_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_1728_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<200, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<819>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 13824, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %37, %48; +add.f32 f50, %27, f49; +add.f32 f51, %39, %50; +add.f32 f52, %28, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %27, f53; +sub.f32 f55, %39, %50; +mul.f32 f56, f55, 0f3F5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %28, f59; +sub.f32 f61, %37, %48; +mul.f32 f62, f61, 0f3F5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %43, %53; +add.f32 f66, %32, f65; +add.f32 f67, %44, %55; +add.f32 f68, %34, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %32, f69; +sub.f32 f71, %44, %55; +mul.f32 f72, f71, 0f3F5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, 
f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %34, f75; +sub.f32 f77, %43, %53; +mul.f32 f78, f77, 0f3F5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0fBF5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0fBF5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0fBF5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0fBF5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %40, %51; +add.f32 f104, %29, f103; +add.f32 f105, %42, %52; +add.f32 f106, %31, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %29, f107; +sub.f32 f109, %42, %52; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %31, f113; +sub.f32 f115, %40, %51; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %45, %56; +add.f32 f120, %35, f119; +add.f32 f121, %47, %57; +add.f32 f122, %36, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %35, f123; +sub.f32 f125, %47, %57; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %36, f129; +sub.f32 f131, %45, %56; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0fBF5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0fBF5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0fBF5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0fBF5DB3D7, f143; +add.f32 
f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0fBF000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0fBF000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0fBF5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0fBF5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0fBF5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0fBF5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0fBF000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0fBF000000, f175; +sub.f32 f177, f91, f145; +sub.f32 f178, f92, f146; +add.f32 f179, f95, f159; +add.f32 f180, f96, f161; +sub.f32 f181, f95, f159; +sub.f32 f182, f96, f161; +add.f32 f183, f99, f164; +add.f32 f184, f100, f166; +sub.f32 f185, f99, f164; +sub.f32 f186, f100, f166; +add.f32 f187, f93, f148; +sub.f32 f188, f94, f147; +sub.f32 f189, f93, f148; +add.f32 f190, f94, f147; +add.f32 f191, f97, f169; +add.f32 f192, f98, f171; +sub.f32 f193, f97, f169; +sub.f32 f194, f98, f171; +add.f32 f195, f101, f174; +add.f32 f196, f102, f176; +sub.f32 f197, f101, f174; +sub.f32 f198, f102, f176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 13824, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f199, f200}, [rd6]; +mul.f32 f203, f199, f179; +mul.f32 f204, f200, f180; +mul.f32 f205, f199, f180; +mul.f32 f206, f199, f199; +mul.f32 f207, f200, f200; +sub.f32 f208, f206, f207; +mul.f32 
f209, f200, f199; +fma.rn.f32 f210, f200, f199, f209; +mul.f32 f211, f208, f183; +mul.f32 f212, f210, f184; +mul.f32 f213, f208, f184; +mul.f32 f214, f199, f208; +mul.f32 f215, f200, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f199, f210; +fma.rn.f32 f218, f200, f208, f217; +mul.f32 f219, f216, f187; +mul.f32 f220, f218, f188; +mul.f32 f221, f216, f188; +mul.f32 f222, f199, f216; +mul.f32 f223, f200, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f199, f218; +fma.rn.f32 f226, f200, f216, f225; +mul.f32 f227, f224, f191; +mul.f32 f228, f226, f192; +mul.f32 f229, f224, f192; +mul.f32 f230, f199, f224; +mul.f32 f231, f200, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f199, f226; +fma.rn.f32 f234, f200, f224, f233; +mul.f32 f235, f232, f195; +mul.f32 f236, f234, f196; +mul.f32 f237, f232, f196; +mul.f32 f238, f199, f232; +mul.f32 f239, f200, f234; +sub.f32 f240, f238, f239; +mul.f32 f241, f199, f234; +fma.rn.f32 f242, f200, f232, f241; +mul.f32 f243, f240, f177; +mul.f32 f244, f242, f178; +mul.f32 f245, f240, f178; +mul.f32 f246, f199, f240; +mul.f32 f247, f200, f242; +sub.f32 f248, f246, f247; +mul.f32 f249, f199, f242; +fma.rn.f32 f250, f200, f240, f249; +mul.f32 f251, f248, f181; +mul.f32 f252, f250, f182; +mul.f32 f253, f248, f182; +mul.f32 f254, f199, f248; +mul.f32 f255, f200, f250; +sub.f32 f256, f254, f255; +mul.f32 f257, f199, f250; +fma.rn.f32 f258, f200, f248, f257; +mul.f32 f259, f256, f185; +mul.f32 f260, f258, f186; +mul.f32 f261, f256, f186; +mul.f32 f262, f199, f256; +mul.f32 f263, f200, f258; +sub.f32 f264, f262, f263; +mul.f32 f265, f199, f258; +fma.rn.f32 f266, f200, f256, f265; +mul.f32 f267, f264, f189; +mul.f32 f268, f266, f190; +mul.f32 f269, f264, f190; +mul.f32 f270, f199, f264; +mul.f32 f271, f200, f266; +sub.f32 f272, f270, f271; +mul.f32 f273, f199, f266; +fma.rn.f32 f274, f200, f264, f273; +mul.f32 f275, f272, f193; +mul.f32 f276, f274, f194; +mul.f32 f277, f272, f194; +mul.f32 f278, f199, f272; +mul.f32 f279, f200, f274; +sub.f32 
f280, f278, f279; +mul.f32 f281, f199, f274; +fma.rn.f32 f282, f200, f272, f281; +mul.f32 f283, f280, f197; +mul.f32 f284, f282, f198; +mul.f32 f285, f280, f198; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f32 f286, f92, f146; +add.f32 f287, f91, f145; +fma.rn.f32 f288, f200, f179, f205; +sub.f32 f289, f203, f204; +st.shared.v4.f32 [r9], {f287, f286, f289, f288}; +fma.rn.f32 f290, f210, f183, f213; +sub.f32 f291, f211, f212; +sub.f32 f292, f219, f220; +fma.rn.f32 f293, f218, f187, f221; +st.shared.v4.f32 [r9+16], {f291, f290, f292, f293}; +sub.f32 f294, f227, f228; +fma.rn.f32 f295, f226, f191, f229; +fma.rn.f32 f296, f234, f195, f237; +sub.f32 f297, f235, f236; +st.shared.v4.f32 [r9+32], {f294, f295, f297, f296}; +fma.rn.f32 f298, f242, f177, f245; +sub.f32 f299, f243, f244; +fma.rn.f32 f300, f250, f181, f253; +sub.f32 f301, f251, f252; +st.shared.v4.f32 [r9+48], {f299, f298, f301, f300}; +fma.rn.f32 f302, f258, f185, f261; +sub.f32 f303, f259, f260; +fma.rn.f32 f304, f266, f189, f269; +sub.f32 f305, f267, f268; +st.shared.v4.f32 [r9+64], {f303, f302, f305, f304}; +fma.rn.f32 f306, f274, f193, f277; +sub.f32 f307, f275, f276; +fma.rn.f32 f308, f282, f197, f285; +sub.f32 f309, f283, f284; +st.shared.v4.f32 [r9+80], {f307, f306, f309, f308}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.v2.f32 {f310, f311}, [r10]; +ld.shared.v2.f32 {f314, f315}, [r10+1152]; +ld.shared.v2.f32 {f318, f319}, [r10+2304]; +ld.shared.v2.f32 {f322, f323}, [r10+3456]; +ld.shared.v2.f32 {f326, f327}, [r10+4608]; +ld.shared.v2.f32 {f330, f331}, [r10+5760]; +ld.shared.v2.f32 {f334, f335}, [r10+6912]; +ld.shared.v2.f32 {f338, f339}, [r10+8064]; +ld.shared.v2.f32 {f342, f343}, [r10+9216]; +ld.shared.v2.f32 {f346, f347}, [r10+10368]; +ld.shared.v2.f32 {f350, f351}, [r10+11520]; +ld.shared.v2.f32 {f354, f355}, [r10+12672]; +add.f32 f358, f326, f342; +add.f32 f359, f310, f358; +add.f32 f360, f327, f343; +add.f32 f361, f311, f360; +mul.f32 f362, f358, 0f3F000000; +sub.f32 f363, 
f310, f362; +sub.f32 f364, f327, f343; +mul.f32 f365, f364, 0f3F5DB3D7; +add.f32 f366, f365, f363; +sub.f32 f367, f363, f365; +mul.f32 f368, f360, 0f3F000000; +sub.f32 f369, f311, f368; +sub.f32 f370, f326, f342; +mul.f32 f371, f370, 0f3F5DB3D7; +sub.f32 f372, f369, f371; +add.f32 f373, f371, f369; +add.f32 f374, f334, f350; +add.f32 f375, f318, f374; +add.f32 f376, f335, f351; +add.f32 f377, f319, f376; +mul.f32 f378, f374, 0f3F000000; +sub.f32 f379, f318, f378; +sub.f32 f380, f335, f351; +mul.f32 f381, f380, 0f3F5DB3D7; +add.f32 f382, f381, f379; +sub.f32 f383, f379, f381; +mul.f32 f384, f376, 0f3F000000; +sub.f32 f385, f319, f384; +sub.f32 f386, f334, f350; +mul.f32 f387, f386, 0f3F5DB3D7; +sub.f32 f388, f385, f387; +add.f32 f389, f387, f385; +mul.f32 f390, f382, 0f3F000000; +mul.f32 f391, f388, 0fBF5DB3D7; +sub.f32 f392, f390, f391; +mul.f32 f393, f388, 0f3F000000; +fma.rn.f32 f394, f382, 0fBF5DB3D7, f393; +mul.f32 f395, f383, 0fBF000000; +mul.f32 f396, f389, 0fBF5DB3D7; +sub.f32 f397, f395, f396; +mul.f32 f398, f389, 0fBF000000; +fma.rn.f32 f399, f383, 0fBF5DB3D7, f398; +add.f32 f400, f359, f375; +add.f32 f401, f361, f377; +sub.f32 f402, f359, f375; +sub.f32 f403, f361, f377; +add.f32 f404, f366, f392; +add.f32 f405, f372, f394; +sub.f32 f406, f366, f392; +sub.f32 f407, f372, f394; +add.f32 f408, f367, f397; +add.f32 f409, f373, f399; +sub.f32 f410, f367, f397; +sub.f32 f411, f373, f399; +add.f32 f412, f330, f346; +add.f32 f413, f314, f412; +add.f32 f414, f331, f347; +add.f32 f415, f315, f414; +mul.f32 f416, f412, 0f3F000000; +sub.f32 f417, f314, f416; +sub.f32 f418, f331, f347; +mul.f32 f419, f418, 0f3F5DB3D7; +add.f32 f420, f419, f417; +sub.f32 f421, f417, f419; +mul.f32 f422, f414, 0f3F000000; +sub.f32 f423, f315, f422; +sub.f32 f424, f330, f346; +mul.f32 f425, f424, 0f3F5DB3D7; +sub.f32 f426, f423, f425; +add.f32 f427, f425, f423; +add.f32 f428, f338, f354; +add.f32 f429, f322, f428; +add.f32 f430, f339, f355; +add.f32 f431, f323, f430; +mul.f32 f432, 
f428, 0f3F000000; +sub.f32 f433, f322, f432; +sub.f32 f434, f339, f355; +mul.f32 f435, f434, 0f3F5DB3D7; +add.f32 f436, f435, f433; +sub.f32 f437, f433, f435; +mul.f32 f438, f430, 0f3F000000; +sub.f32 f439, f323, f438; +sub.f32 f440, f338, f354; +mul.f32 f441, f440, 0f3F5DB3D7; +sub.f32 f442, f439, f441; +add.f32 f443, f441, f439; +mul.f32 f444, f436, 0f3F000000; +mul.f32 f445, f442, 0fBF5DB3D7; +sub.f32 f446, f444, f445; +mul.f32 f447, f442, 0f3F000000; +fma.rn.f32 f448, f436, 0fBF5DB3D7, f447; +mul.f32 f449, f437, 0fBF000000; +mul.f32 f450, f443, 0fBF5DB3D7; +sub.f32 f451, f449, f450; +mul.f32 f452, f443, 0fBF000000; +fma.rn.f32 f453, f437, 0fBF5DB3D7, f452; +add.f32 f454, f413, f429; +add.f32 f455, f415, f431; +sub.f32 f456, f413, f429; +sub.f32 f457, f415, f431; +add.f32 f458, f420, f446; +add.f32 f459, f426, f448; +sub.f32 f460, f420, f446; +sub.f32 f461, f426, f448; +add.f32 f462, f421, f451; +add.f32 f463, f427, f453; +sub.f32 f464, f421, f451; +sub.f32 f465, f427, f453; +mul.f32 f466, f458, 0f3F5DB3D7; +mul.f32 f467, f459, 0fBF000000; +sub.f32 f468, f466, f467; +mul.f32 f469, f459, 0f3F5DB3D7; +fma.rn.f32 f470, f458, 0fBF000000, f469; +mul.f32 f471, f462, 0f3F000000; +mul.f32 f472, f463, 0fBF5DB3D7; +sub.f32 f473, f471, f472; +mul.f32 f474, f463, 0f3F000000; +fma.rn.f32 f475, f462, 0fBF5DB3D7, f474; +mul.f32 f476, f460, 0fBF000000; +mul.f32 f477, f461, 0fBF5DB3D7; +sub.f32 f478, f476, f477; +mul.f32 f479, f461, 0fBF000000; +fma.rn.f32 f480, f460, 0fBF5DB3D7, f479; +mul.f32 f481, f464, 0fBF5DB3D7; +mul.f32 f482, f465, 0fBF000000; +sub.f32 f483, f481, f482; +mul.f32 f484, f465, 0fBF5DB3D7; +fma.rn.f32 f485, f464, 0fBF000000, f484; +sub.f32 f486, f400, f454; +sub.f32 f487, f401, f455; +add.f32 f488, f404, f468; +add.f32 f489, f405, f470; +sub.f32 f490, f404, f468; +sub.f32 f491, f405, f470; +add.f32 f492, f408, f473; +add.f32 f493, f409, f475; +sub.f32 f494, f408, f473; +sub.f32 f495, f409, f475; +add.f32 f496, f402, f457; +sub.f32 f497, f403, f456; +sub.f32 
f498, f402, f457; +add.f32 f499, f403, f456; +add.f32 f500, f406, f478; +add.f32 f501, f407, f480; +sub.f32 f502, f406, f478; +sub.f32 f503, f407, f480; +add.f32 f504, f410, f483; +add.f32 f505, f411, f485; +sub.f32 f506, f410, f483; +sub.f32 f507, f411, f485; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f508, f509}, [rd11]; +mul.f32 f512, f508, f488; +mul.f32 f513, f509, f489; +mul.f32 f514, f508, f489; +mul.f32 f515, f508, f508; +mul.f32 f516, f509, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f509, f508; +fma.rn.f32 f519, f509, f508, f518; +mul.f32 f520, f517, f492; +mul.f32 f521, f519, f493; +mul.f32 f522, f517, f493; +mul.f32 f523, f508, f517; +mul.f32 f524, f509, f519; +sub.f32 f525, f523, f524; +mul.f32 f526, f508, f519; +fma.rn.f32 f527, f509, f517, f526; +mul.f32 f528, f525, f496; +mul.f32 f529, f527, f497; +mul.f32 f530, f525, f497; +mul.f32 f531, f508, f525; +mul.f32 f532, f509, f527; +sub.f32 f533, f531, f532; +mul.f32 f534, f508, f527; +fma.rn.f32 f535, f509, f525, f534; +mul.f32 f536, f533, f500; +mul.f32 f537, f535, f501; +mul.f32 f538, f533, f501; +mul.f32 f539, f508, f533; +mul.f32 f540, f509, f535; +sub.f32 f541, f539, f540; +mul.f32 f542, f508, f535; +fma.rn.f32 f543, f509, f533, f542; +mul.f32 f544, f541, f504; +mul.f32 f545, f543, f505; +mul.f32 f546, f541, f505; +mul.f32 f547, f508, f541; +mul.f32 f548, f509, f543; +sub.f32 f549, f547, f548; +mul.f32 f550, f508, f543; +fma.rn.f32 f551, f509, f541, f550; +mul.f32 f552, f549, f486; +mul.f32 f553, f551, f487; +mul.f32 f554, f549, f487; +mul.f32 f555, f508, f549; +mul.f32 f556, f509, f551; +sub.f32 f557, f555, f556; +mul.f32 f558, f508, f551; +fma.rn.f32 f559, f509, f549, f558; +mul.f32 f560, f557, f490; +mul.f32 f561, f559, f491; +mul.f32 f562, f557, f491; +mul.f32 f563, f508, f557; +mul.f32 f564, f509, f559; +sub.f32 
f565, f563, f564; +mul.f32 f566, f508, f559; +fma.rn.f32 f567, f509, f557, f566; +mul.f32 f568, f565, f494; +mul.f32 f569, f567, f495; +mul.f32 f570, f565, f495; +mul.f32 f571, f508, f565; +mul.f32 f572, f509, f567; +sub.f32 f573, f571, f572; +mul.f32 f574, f508, f567; +fma.rn.f32 f575, f509, f565, f574; +mul.f32 f576, f573, f498; +mul.f32 f577, f575, f499; +mul.f32 f578, f573, f499; +mul.f32 f579, f508, f573; +mul.f32 f580, f509, f575; +sub.f32 f581, f579, f580; +mul.f32 f582, f508, f575; +fma.rn.f32 f583, f509, f573, f582; +mul.f32 f584, f581, f502; +mul.f32 f585, f583, f503; +mul.f32 f586, f581, f503; +mul.f32 f587, f508, f581; +mul.f32 f588, f509, f583; +sub.f32 f589, f587, f588; +mul.f32 f590, f508, f583; +fma.rn.f32 f591, f509, f581, f590; +mul.f32 f592, f589, f506; +mul.f32 f593, f591, f507; +mul.f32 f594, f589, f507; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1152, r15; +add.f32 f595, f401, f455; +add.f32 f596, f400, f454; +st.shared.v2.f32 [r16], {f596, f595}; +fma.rn.f32 f597, f509, f488, f514; +sub.f32 f598, f512, f513; +st.shared.v2.f32 [r16+96], {f598, f597}; +fma.rn.f32 f599, f519, f492, f522; +sub.f32 f600, f520, f521; +st.shared.v2.f32 [r16+192], {f600, f599}; +fma.rn.f32 f601, f527, f496, f530; +sub.f32 f602, f528, f529; +st.shared.v2.f32 [r16+288], {f602, f601}; +fma.rn.f32 f603, f535, f500, f538; +sub.f32 f604, f536, f537; +st.shared.v2.f32 [r16+384], {f604, f603}; +fma.rn.f32 f605, f543, f504, f546; +sub.f32 f606, f544, f545; +st.shared.v2.f32 [r16+480], {f606, f605}; +fma.rn.f32 f607, f551, f486, f554; +sub.f32 f608, f552, f553; +st.shared.v2.f32 [r16+576], {f608, f607}; +fma.rn.f32 f609, f559, f490, f562; +sub.f32 f610, f560, f561; +st.shared.v2.f32 [r16+672], {f610, f609}; +fma.rn.f32 f611, f567, f494, f570; +sub.f32 f612, f568, f569; +st.shared.v2.f32 [r16+768], {f612, f611}; +fma.rn.f32 f613, f575, f498, f578; +sub.f32 f614, f576, f577; +st.shared.v2.f32 [r16+864], {f614, f613}; +fma.rn.f32 f615, 
f583, f502, f586; +sub.f32 f616, f584, f585; +st.shared.v2.f32 [r16+960], {f616, f615}; +fma.rn.f32 f617, f591, f506, f594; +sub.f32 f618, f592, f593; +st.shared.v2.f32 [r16+1056], {f618, f617}; +barrier.sync 0; +ld.shared.v2.f32 {f619, f620}, [r10]; +ld.shared.v2.f32 {f623, f624}, [r10+1152]; +ld.shared.v2.f32 {f627, f628}, [r10+2304]; +ld.shared.v2.f32 {f631, f632}, [r10+3456]; +ld.shared.v2.f32 {f635, f636}, [r10+4608]; +ld.shared.v2.f32 {f639, f640}, [r10+5760]; +ld.shared.v2.f32 {f643, f644}, [r10+6912]; +ld.shared.v2.f32 {f647, f648}, [r10+8064]; +ld.shared.v2.f32 {f651, f652}, [r10+9216]; +ld.shared.v2.f32 {f655, f656}, [r10+10368]; +ld.shared.v2.f32 {f659, f660}, [r10+11520]; +ld.shared.v2.f32 {f663, f664}, [r10+12672]; +add.f32 f667, f635, f651; +add.f32 f668, f619, f667; +add.f32 f669, f636, f652; +add.f32 f670, f620, f669; +mul.f32 f671, f667, 0f3F000000; +sub.f32 f672, f619, f671; +sub.f32 f673, f636, f652; +mul.f32 f674, f673, 0f3F5DB3D7; +add.f32 f675, f674, f672; +sub.f32 f676, f672, f674; +mul.f32 f677, f669, 0f3F000000; +sub.f32 f678, f620, f677; +sub.f32 f679, f635, f651; +mul.f32 f680, f679, 0f3F5DB3D7; +sub.f32 f681, f678, f680; +add.f32 f682, f680, f678; +add.f32 f683, f643, f659; +add.f32 f684, f627, f683; +add.f32 f685, f644, f660; +add.f32 f686, f628, f685; +mul.f32 f687, f683, 0f3F000000; +sub.f32 f688, f627, f687; +sub.f32 f689, f644, f660; +mul.f32 f690, f689, 0f3F5DB3D7; +add.f32 f691, f690, f688; +sub.f32 f692, f688, f690; +mul.f32 f693, f685, 0f3F000000; +sub.f32 f694, f628, f693; +sub.f32 f695, f643, f659; +mul.f32 f696, f695, 0f3F5DB3D7; +sub.f32 f697, f694, f696; +add.f32 f698, f696, f694; +mul.f32 f699, f691, 0f3F000000; +mul.f32 f700, f697, 0fBF5DB3D7; +sub.f32 f701, f699, f700; +mul.f32 f702, f697, 0f3F000000; +fma.rn.f32 f703, f691, 0fBF5DB3D7, f702; +mul.f32 f704, f692, 0fBF000000; +mul.f32 f705, f698, 0fBF5DB3D7; +sub.f32 f706, f704, f705; +mul.f32 f707, f698, 0fBF000000; +fma.rn.f32 f708, f692, 0fBF5DB3D7, f707; +add.f32 
f709, f668, f684; +add.f32 f710, f670, f686; +sub.f32 f711, f668, f684; +sub.f32 f712, f670, f686; +add.f32 f713, f675, f701; +add.f32 f714, f681, f703; +sub.f32 f715, f675, f701; +sub.f32 f716, f681, f703; +add.f32 f717, f676, f706; +add.f32 f718, f682, f708; +sub.f32 f719, f676, f706; +sub.f32 f720, f682, f708; +add.f32 f721, f639, f655; +add.f32 f722, f623, f721; +add.f32 f723, f640, f656; +add.f32 f724, f624, f723; +mul.f32 f725, f721, 0f3F000000; +sub.f32 f726, f623, f725; +sub.f32 f727, f640, f656; +mul.f32 f728, f727, 0f3F5DB3D7; +add.f32 f729, f728, f726; +sub.f32 f730, f726, f728; +mul.f32 f731, f723, 0f3F000000; +sub.f32 f732, f624, f731; +sub.f32 f733, f639, f655; +mul.f32 f734, f733, 0f3F5DB3D7; +sub.f32 f735, f732, f734; +add.f32 f736, f734, f732; +add.f32 f737, f647, f663; +add.f32 f738, f631, f737; +add.f32 f739, f648, f664; +add.f32 f740, f632, f739; +mul.f32 f741, f737, 0f3F000000; +sub.f32 f742, f631, f741; +sub.f32 f743, f648, f664; +mul.f32 f744, f743, 0f3F5DB3D7; +add.f32 f745, f744, f742; +sub.f32 f746, f742, f744; +mul.f32 f747, f739, 0f3F000000; +sub.f32 f748, f632, f747; +sub.f32 f749, f647, f663; +mul.f32 f750, f749, 0f3F5DB3D7; +sub.f32 f751, f748, f750; +add.f32 f752, f750, f748; +mul.f32 f753, f745, 0f3F000000; +mul.f32 f754, f751, 0fBF5DB3D7; +sub.f32 f755, f753, f754; +mul.f32 f756, f751, 0f3F000000; +fma.rn.f32 f757, f745, 0fBF5DB3D7, f756; +mul.f32 f758, f746, 0fBF000000; +mul.f32 f759, f752, 0fBF5DB3D7; +sub.f32 f760, f758, f759; +mul.f32 f761, f752, 0fBF000000; +fma.rn.f32 f762, f746, 0fBF5DB3D7, f761; +add.f32 f763, f722, f738; +add.f32 f764, f724, f740; +sub.f32 f765, f722, f738; +sub.f32 f766, f724, f740; +add.f32 f767, f729, f755; +add.f32 f768, f735, f757; +sub.f32 f769, f729, f755; +sub.f32 f770, f735, f757; +add.f32 f771, f730, f760; +add.f32 f772, f736, f762; +sub.f32 f773, f730, f760; +sub.f32 f774, f736, f762; +mul.f32 f775, f767, 0f3F5DB3D7; +mul.f32 f776, f768, 0fBF000000; +sub.f32 f777, f775, f776; +mul.f32 f778, 
f768, 0f3F5DB3D7; +fma.rn.f32 f779, f767, 0fBF000000, f778; +mul.f32 f780, f771, 0f3F000000; +mul.f32 f781, f772, 0fBF5DB3D7; +sub.f32 f782, f780, f781; +mul.f32 f783, f772, 0f3F000000; +fma.rn.f32 f784, f771, 0fBF5DB3D7, f783; +mul.f32 f785, f769, 0fBF000000; +mul.f32 f786, f770, 0fBF5DB3D7; +sub.f32 f787, f785, f786; +mul.f32 f788, f770, 0fBF000000; +fma.rn.f32 f789, f769, 0fBF5DB3D7, f788; +mul.f32 f790, f773, 0fBF5DB3D7; +mul.f32 f791, f774, 0fBF000000; +sub.f32 f792, f790, f791; +mul.f32 f793, f774, 0fBF5DB3D7; +fma.rn.f32 f794, f773, 0fBF000000, f793; +add.f32 %1, f710, f764; +add.f32 %0, f709, f763; +add.f32 %3, f714, f779; +add.f32 %2, f713, f777; +add.f32 %5, f718, f784; +add.f32 %4, f717, f782; +sub.f32 %7, f712, f765; +add.f32 %6, f711, f766; +add.f32 %9, f716, f789; +add.f32 %8, f715, f787; +add.f32 %11, f720, f794; +add.f32 %10, f719, f792; +sub.f32 %13, f710, f764; +sub.f32 %12, f709, f763; +sub.f32 %15, f714, f779; +sub.f32 %14, f713, f777; +sub.f32 %17, f718, f784; +sub.f32 %16, f717, f782; +add.f32 %19, f712, f765; +sub.f32 %18, f711, f766; +sub.f32 %21, f716, f789; +sub.f32 %20, f715, f787; +sub.f32 %23, f720, f794; +sub.f32 %22, f719, f792; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), 
"f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<201, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<771>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 6912, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %37, %48; +add.f32 f50, %27, f49; +add.f32 f51, %39, %50; +add.f32 f52, %28, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %27, f53; +sub.f32 f55, %39, %50; +mul.f32 f56, f55, 0f3F5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %28, f59; +sub.f32 f61, %37, %48; +mul.f32 f62, f61, 0f3F5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %43, %53; +add.f32 f66, %32, f65; +add.f32 f67, %44, %55; +add.f32 f68, %34, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %32, f69; +sub.f32 f71, %44, %55; +mul.f32 f72, f71, 0f3F5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %34, f75; +sub.f32 f77, %43, %53; +mul.f32 f78, f77, 0f3F5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0fBF5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0fBF5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0fBF5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0fBF5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %40, %51; +add.f32 f104, %29, f103; +add.f32 f105, %42, %52; +add.f32 f106, 
%31, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %29, f107; +sub.f32 f109, %42, %52; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %31, f113; +sub.f32 f115, %40, %51; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %45, %56; +add.f32 f120, %35, f119; +add.f32 f121, %47, %57; +add.f32 f122, %36, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %35, f123; +sub.f32 f125, %47, %57; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %36, f129; +sub.f32 f131, %45, %56; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0fBF5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0fBF5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0fBF5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0fBF5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0fBF000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0fBF000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0fBF5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0fBF5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0fBF5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0fBF5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; 
+mul.f32 f173, f156, 0fBF000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0fBF000000, f175; +add.f32 f177, f91, f145; +add.f32 f178, f92, f146; +sub.f32 f179, f91, f145; +sub.f32 f180, f92, f146; +add.f32 f181, f95, f159; +add.f32 f182, f96, f161; +sub.f32 f183, f95, f159; +sub.f32 f184, f96, f161; +add.f32 f185, f99, f164; +add.f32 f186, f100, f166; +sub.f32 f187, f99, f164; +sub.f32 f188, f100, f166; +add.f32 f189, f93, f148; +sub.f32 f190, f94, f147; +sub.f32 f191, f93, f148; +add.f32 f192, f94, f147; +add.f32 f193, f97, f169; +add.f32 f194, f98, f171; +sub.f32 f195, f97, f169; +sub.f32 f196, f98, f171; +add.f32 f197, f101, f174; +add.f32 f198, f102, f176; +sub.f32 f199, f101, f174; +sub.f32 f200, f102, f176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f201, f202}, [rd6]; +mul.f32 f205, f201, f181; +mul.f32 f206, f202, f182; +sub.f32 f207, f205, f206; +mul.f32 f208, f201, f182; +fma.rn.f32 f209, f202, f181, f208; +mul.f32 f210, f201, f201; +mul.f32 f211, f202, f202; +sub.f32 f212, f210, f211; +mul.f32 f213, f202, f201; +fma.rn.f32 f214, f202, f201, f213; +mul.f32 f215, f212, f185; +mul.f32 f216, f214, f186; +sub.f32 f217, f215, f216; +mul.f32 f218, f212, f186; +fma.rn.f32 f219, f214, f185, f218; +mul.f32 f220, f201, f212; +mul.f32 f221, f202, f214; +sub.f32 f222, f220, f221; +mul.f32 f223, f201, f214; +fma.rn.f32 f224, f202, f212, f223; +mul.f32 f225, f222, f189; +mul.f32 f226, f224, f190; +sub.f32 f227, f225, f226; +mul.f32 f228, f222, f190; +fma.rn.f32 f229, f224, f189, f228; +mul.f32 f230, f201, f222; +mul.f32 f231, f202, f224; +sub.f32 f232, f230, f231; +mul.f32 f233, f201, f224; +fma.rn.f32 f234, f202, f222, f233; +mul.f32 f235, f232, f193; +mul.f32 f236, f234, f194; +sub.f32 f237, f235, f236; +mul.f32 f238, f232, f194; +fma.rn.f32 f239, f234, f193, 
f238; +mul.f32 f240, f201, f232; +mul.f32 f241, f202, f234; +sub.f32 f242, f240, f241; +mul.f32 f243, f201, f234; +fma.rn.f32 f244, f202, f232, f243; +mul.f32 f245, f242, f197; +mul.f32 f246, f244, f198; +sub.f32 f247, f245, f246; +mul.f32 f248, f242, f198; +fma.rn.f32 f249, f244, f197, f248; +mul.f32 f250, f201, f242; +mul.f32 f251, f202, f244; +sub.f32 f252, f250, f251; +mul.f32 f253, f201, f244; +fma.rn.f32 f254, f202, f242, f253; +mul.f32 f255, f252, f179; +mul.f32 f256, f254, f180; +sub.f32 f257, f255, f256; +mul.f32 f258, f252, f180; +fma.rn.f32 f259, f254, f179, f258; +mul.f32 f260, f201, f252; +mul.f32 f261, f202, f254; +sub.f32 f262, f260, f261; +mul.f32 f263, f201, f254; +fma.rn.f32 f264, f202, f252, f263; +mul.f32 f265, f262, f183; +mul.f32 f266, f264, f184; +sub.f32 f267, f265, f266; +mul.f32 f268, f262, f184; +fma.rn.f32 f269, f264, f183, f268; +mul.f32 f270, f201, f262; +mul.f32 f271, f202, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f201, f264; +fma.rn.f32 f274, f202, f262, f273; +mul.f32 f275, f272, f187; +mul.f32 f276, f274, f188; +sub.f32 f277, f275, f276; +mul.f32 f278, f272, f188; +fma.rn.f32 f279, f274, f187, f278; +mul.f32 f280, f201, f272; +mul.f32 f281, f202, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f201, f274; +fma.rn.f32 f284, f202, f272, f283; +mul.f32 f285, f282, f191; +mul.f32 f286, f284, f192; +sub.f32 f287, f285, f286; +mul.f32 f288, f282, f192; +fma.rn.f32 f289, f284, f191, f288; +mul.f32 f290, f201, f282; +mul.f32 f291, f202, f284; +sub.f32 f292, f290, f291; +mul.f32 f293, f201, f284; +fma.rn.f32 f294, f202, f282, f293; +mul.f32 f295, f292, f195; +mul.f32 f296, f294, f196; +sub.f32 f297, f295, f296; +mul.f32 f298, f292, f196; +fma.rn.f32 f299, f294, f195, f298; +mul.f32 f300, f201, f292; +mul.f32 f301, f202, f294; +sub.f32 f302, f300, f301; +mul.f32 f303, f201, f294; +fma.rn.f32 f304, f202, f292, f303; +mul.f32 f305, f302, f199; +mul.f32 f306, f304, f200; +sub.f32 f307, f305, f306; +mul.f32 f308, f302, f200; +fma.rn.f32 
f309, f304, f199, f308; +mad.lo.s32 r8, r5, 6912, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v4.f32 [r9], {f177, f207, f217, f227}; +st.shared.v4.f32 [r9+16], {f237, f247, f257, f267}; +st.shared.v4.f32 [r9+32], {f277, f287, f297, f307}; +barrier.sync 0; +mad.lo.s32 r10, r7, -44, r9; +ld.shared.f32 f310, [r10]; +ld.shared.f32 f311, [r10+576]; +ld.shared.f32 f312, [r10+1152]; +ld.shared.f32 f313, [r10+1728]; +ld.shared.f32 f314, [r10+2304]; +ld.shared.f32 f315, [r10+2880]; +ld.shared.f32 f316, [r10+3456]; +ld.shared.f32 f317, [r10+4032]; +ld.shared.f32 f318, [r10+4608]; +ld.shared.f32 f319, [r10+5184]; +ld.shared.f32 f320, [r10+5760]; +ld.shared.f32 f321, [r10+6336]; +barrier.sync 0; +st.shared.v4.f32 [r9], {f178, f209, f219, f229}; +st.shared.v4.f32 [r9+16], {f239, f249, f259, f269}; +st.shared.v4.f32 [r9+32], {f279, f289, f299, f309}; +barrier.sync 0; +ld.shared.f32 f322, [r10]; +ld.shared.f32 f323, [r10+576]; +ld.shared.f32 f324, [r10+1152]; +ld.shared.f32 f325, [r10+1728]; +ld.shared.f32 f326, [r10+2304]; +ld.shared.f32 f327, [r10+2880]; +ld.shared.f32 f328, [r10+3456]; +ld.shared.f32 f329, [r10+4032]; +ld.shared.f32 f330, [r10+4608]; +ld.shared.f32 f331, [r10+5184]; +ld.shared.f32 f332, [r10+5760]; +ld.shared.f32 f333, [r10+6336]; +add.f32 f334, f314, f318; +add.f32 f335, f310, f334; +add.f32 f336, f326, f330; +add.f32 f337, f322, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f310, f338; +sub.f32 f340, f326, f330; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f322, f344; +sub.f32 f346, f314, f318; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f316, f320; +add.f32 f351, f312, f350; +add.f32 f352, f328, f332; +add.f32 f353, f324, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f312, f354; +sub.f32 f356, f328, f332; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, 
f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f324, f360; +sub.f32 f362, f316, f320; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.f32 f366, f358, 0f3F000000; +mul.f32 f367, f364, 0fBF5DB3D7; +sub.f32 f368, f366, f367; +mul.f32 f369, f364, 0f3F000000; +fma.rn.f32 f370, f358, 0fBF5DB3D7, f369; +mul.f32 f371, f359, 0fBF000000; +mul.f32 f372, f365, 0fBF5DB3D7; +sub.f32 f373, f371, f372; +mul.f32 f374, f365, 0fBF000000; +fma.rn.f32 f375, f359, 0fBF5DB3D7, f374; +add.f32 f376, f335, f351; +add.f32 f377, f337, f353; +sub.f32 f378, f335, f351; +sub.f32 f379, f337, f353; +add.f32 f380, f342, f368; +add.f32 f381, f348, f370; +sub.f32 f382, f342, f368; +sub.f32 f383, f348, f370; +add.f32 f384, f343, f373; +add.f32 f385, f349, f375; +sub.f32 f386, f343, f373; +sub.f32 f387, f349, f375; +add.f32 f388, f315, f319; +add.f32 f389, f311, f388; +add.f32 f390, f327, f331; +add.f32 f391, f323, f390; +mul.f32 f392, f388, 0f3F000000; +sub.f32 f393, f311, f392; +sub.f32 f394, f327, f331; +mul.f32 f395, f394, 0f3F5DB3D7; +add.f32 f396, f395, f393; +sub.f32 f397, f393, f395; +mul.f32 f398, f390, 0f3F000000; +sub.f32 f399, f323, f398; +sub.f32 f400, f315, f319; +mul.f32 f401, f400, 0f3F5DB3D7; +sub.f32 f402, f399, f401; +add.f32 f403, f401, f399; +add.f32 f404, f317, f321; +add.f32 f405, f313, f404; +add.f32 f406, f329, f333; +add.f32 f407, f325, f406; +mul.f32 f408, f404, 0f3F000000; +sub.f32 f409, f313, f408; +sub.f32 f410, f329, f333; +mul.f32 f411, f410, 0f3F5DB3D7; +add.f32 f412, f411, f409; +sub.f32 f413, f409, f411; +mul.f32 f414, f406, 0f3F000000; +sub.f32 f415, f325, f414; +sub.f32 f416, f317, f321; +mul.f32 f417, f416, 0f3F5DB3D7; +sub.f32 f418, f415, f417; +add.f32 f419, f417, f415; +mul.f32 f420, f412, 0f3F000000; +mul.f32 f421, f418, 0fBF5DB3D7; +sub.f32 f422, f420, f421; +mul.f32 f423, f418, 0f3F000000; +fma.rn.f32 f424, f412, 0fBF5DB3D7, f423; +mul.f32 f425, f413, 0fBF000000; +mul.f32 f426, f419, 0fBF5DB3D7; 
+sub.f32 f427, f425, f426; +mul.f32 f428, f419, 0fBF000000; +fma.rn.f32 f429, f413, 0fBF5DB3D7, f428; +add.f32 f430, f389, f405; +add.f32 f431, f391, f407; +sub.f32 f432, f389, f405; +sub.f32 f433, f391, f407; +add.f32 f434, f396, f422; +add.f32 f435, f402, f424; +sub.f32 f436, f396, f422; +sub.f32 f437, f402, f424; +add.f32 f438, f397, f427; +add.f32 f439, f403, f429; +sub.f32 f440, f397, f427; +sub.f32 f441, f403, f429; +mul.f32 f442, f434, 0f3F5DB3D7; +mul.f32 f443, f435, 0fBF000000; +sub.f32 f444, f442, f443; +mul.f32 f445, f435, 0f3F5DB3D7; +fma.rn.f32 f446, f434, 0fBF000000, f445; +mul.f32 f447, f438, 0f3F000000; +mul.f32 f448, f439, 0fBF5DB3D7; +sub.f32 f449, f447, f448; +mul.f32 f450, f439, 0f3F000000; +fma.rn.f32 f451, f438, 0fBF5DB3D7, f450; +mul.f32 f452, f436, 0fBF000000; +mul.f32 f453, f437, 0fBF5DB3D7; +sub.f32 f454, f452, f453; +mul.f32 f455, f437, 0fBF000000; +fma.rn.f32 f456, f436, 0fBF5DB3D7, f455; +mul.f32 f457, f440, 0fBF5DB3D7; +mul.f32 f458, f441, 0fBF000000; +sub.f32 f459, f457, f458; +mul.f32 f460, f441, 0fBF5DB3D7; +fma.rn.f32 f461, f440, 0fBF000000, f460; +add.f32 f462, f376, f430; +add.f32 f463, f377, f431; +sub.f32 f464, f376, f430; +sub.f32 f465, f377, f431; +add.f32 f466, f380, f444; +add.f32 f467, f381, f446; +sub.f32 f468, f380, f444; +sub.f32 f469, f381, f446; +add.f32 f470, f384, f449; +add.f32 f471, f385, f451; +sub.f32 f472, f384, f449; +sub.f32 f473, f385, f451; +add.f32 f474, f378, f433; +sub.f32 f475, f379, f432; +sub.f32 f476, f378, f433; +add.f32 f477, f379, f432; +add.f32 f478, f382, f454; +add.f32 f479, f383, f456; +sub.f32 f480, f382, f454; +sub.f32 f481, f383, f456; +add.f32 f482, f386, f459; +add.f32 f483, f387, f461; +sub.f32 f484, f386, f459; +sub.f32 f485, f387, f461; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f486, f487}, [rd11]; +mul.f32 
f490, f486, f466; +mul.f32 f491, f487, f467; +sub.f32 f492, f490, f491; +mul.f32 f493, f486, f467; +fma.rn.f32 f494, f487, f466, f493; +mul.f32 f495, f486, f486; +mul.f32 f496, f487, f487; +sub.f32 f497, f495, f496; +mul.f32 f498, f487, f486; +fma.rn.f32 f499, f487, f486, f498; +mul.f32 f500, f497, f470; +mul.f32 f501, f499, f471; +sub.f32 f502, f500, f501; +mul.f32 f503, f497, f471; +fma.rn.f32 f504, f499, f470, f503; +mul.f32 f505, f486, f497; +mul.f32 f506, f487, f499; +sub.f32 f507, f505, f506; +mul.f32 f508, f486, f499; +fma.rn.f32 f509, f487, f497, f508; +mul.f32 f510, f507, f474; +mul.f32 f511, f509, f475; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, f475; +fma.rn.f32 f514, f509, f474, f513; +mul.f32 f515, f486, f507; +mul.f32 f516, f487, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f486, f509; +fma.rn.f32 f519, f487, f507, f518; +mul.f32 f520, f517, f478; +mul.f32 f521, f519, f479; +sub.f32 f522, f520, f521; +mul.f32 f523, f517, f479; +fma.rn.f32 f524, f519, f478, f523; +mul.f32 f525, f486, f517; +mul.f32 f526, f487, f519; +sub.f32 f527, f525, f526; +mul.f32 f528, f486, f519; +fma.rn.f32 f529, f487, f517, f528; +mul.f32 f530, f527, f482; +mul.f32 f531, f529, f483; +sub.f32 f532, f530, f531; +mul.f32 f533, f527, f483; +fma.rn.f32 f534, f529, f482, f533; +mul.f32 f535, f486, f527; +mul.f32 f536, f487, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f486, f529; +fma.rn.f32 f539, f487, f527, f538; +mul.f32 f540, f537, f464; +mul.f32 f541, f539, f465; +sub.f32 f542, f540, f541; +mul.f32 f543, f537, f465; +fma.rn.f32 f544, f539, f464, f543; +mul.f32 f545, f486, f537; +mul.f32 f546, f487, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f486, f539; +fma.rn.f32 f549, f487, f537, f548; +mul.f32 f550, f547, f468; +mul.f32 f551, f549, f469; +sub.f32 f552, f550, f551; +mul.f32 f553, f547, f469; +fma.rn.f32 f554, f549, f468, f553; +mul.f32 f555, f486, f547; +mul.f32 f556, f487, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f486, f549; +fma.rn.f32 f559, f487, 
f547, f558; +mul.f32 f560, f557, f472; +mul.f32 f561, f559, f473; +sub.f32 f562, f560, f561; +mul.f32 f563, f557, f473; +fma.rn.f32 f564, f559, f472, f563; +mul.f32 f565, f486, f557; +mul.f32 f566, f487, f559; +sub.f32 f567, f565, f566; +mul.f32 f568, f486, f559; +fma.rn.f32 f569, f487, f557, f568; +mul.f32 f570, f567, f476; +mul.f32 f571, f569, f477; +sub.f32 f572, f570, f571; +mul.f32 f573, f567, f477; +fma.rn.f32 f574, f569, f476, f573; +mul.f32 f575, f486, f567; +mul.f32 f576, f487, f569; +sub.f32 f577, f575, f576; +mul.f32 f578, f486, f569; +fma.rn.f32 f579, f487, f567, f578; +mul.f32 f580, f577, f480; +mul.f32 f581, f579, f481; +sub.f32 f582, f580, f581; +mul.f32 f583, f577, f481; +fma.rn.f32 f584, f579, f480, f583; +mul.f32 f585, f486, f577; +mul.f32 f586, f487, f579; +sub.f32 f587, f585, f586; +mul.f32 f588, f486, f579; +fma.rn.f32 f589, f487, f577, f588; +mul.f32 f590, f587, f484; +mul.f32 f591, f589, f485; +sub.f32 f592, f590, f591; +mul.f32 f593, f587, f485; +fma.rn.f32 f594, f589, f484, f593; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 576, r15; +st.shared.f32 [r16], f462; +st.shared.f32 [r16+48], f492; +st.shared.f32 [r16+96], f502; +st.shared.f32 [r16+144], f512; +st.shared.f32 [r16+192], f522; +st.shared.f32 [r16+240], f532; +st.shared.f32 [r16+288], f542; +st.shared.f32 [r16+336], f552; +st.shared.f32 [r16+384], f562; +st.shared.f32 [r16+432], f572; +st.shared.f32 [r16+480], f582; +st.shared.f32 [r16+528], f592; +barrier.sync 0; +ld.shared.f32 f595, [r10]; +ld.shared.f32 f596, [r10+576]; +ld.shared.f32 f597, [r10+1152]; +ld.shared.f32 f598, [r10+1728]; +ld.shared.f32 f599, [r10+2304]; +ld.shared.f32 f600, [r10+2880]; +ld.shared.f32 f601, [r10+3456]; +ld.shared.f32 f602, [r10+4032]; +ld.shared.f32 f603, [r10+4608]; +ld.shared.f32 f604, [r10+5184]; +ld.shared.f32 f605, [r10+5760]; +ld.shared.f32 f606, [r10+6336]; +barrier.sync 0; +st.shared.f32 [r16], f463; +st.shared.f32 [r16+48], f494; +st.shared.f32 [r16+96], 
f504; +st.shared.f32 [r16+144], f514; +st.shared.f32 [r16+192], f524; +st.shared.f32 [r16+240], f534; +st.shared.f32 [r16+288], f544; +st.shared.f32 [r16+336], f554; +st.shared.f32 [r16+384], f564; +st.shared.f32 [r16+432], f574; +st.shared.f32 [r16+480], f584; +st.shared.f32 [r16+528], f594; +barrier.sync 0; +ld.shared.f32 f607, [r10]; +ld.shared.f32 f608, [r10+576]; +ld.shared.f32 f609, [r10+1152]; +ld.shared.f32 f610, [r10+1728]; +ld.shared.f32 f611, [r10+2304]; +ld.shared.f32 f612, [r10+2880]; +ld.shared.f32 f613, [r10+3456]; +ld.shared.f32 f614, [r10+4032]; +ld.shared.f32 f615, [r10+4608]; +ld.shared.f32 f616, [r10+5184]; +ld.shared.f32 f617, [r10+5760]; +ld.shared.f32 f618, [r10+6336]; +add.f32 f619, f599, f603; +add.f32 f620, f595, f619; +add.f32 f621, f611, f615; +add.f32 f622, f607, f621; +mul.f32 f623, f619, 0f3F000000; +sub.f32 f624, f595, f623; +sub.f32 f625, f611, f615; +mul.f32 f626, f625, 0f3F5DB3D7; +add.f32 f627, f626, f624; +sub.f32 f628, f624, f626; +mul.f32 f629, f621, 0f3F000000; +sub.f32 f630, f607, f629; +sub.f32 f631, f599, f603; +mul.f32 f632, f631, 0f3F5DB3D7; +sub.f32 f633, f630, f632; +add.f32 f634, f632, f630; +add.f32 f635, f601, f605; +add.f32 f636, f597, f635; +add.f32 f637, f613, f617; +add.f32 f638, f609, f637; +mul.f32 f639, f635, 0f3F000000; +sub.f32 f640, f597, f639; +sub.f32 f641, f613, f617; +mul.f32 f642, f641, 0f3F5DB3D7; +add.f32 f643, f642, f640; +sub.f32 f644, f640, f642; +mul.f32 f645, f637, 0f3F000000; +sub.f32 f646, f609, f645; +sub.f32 f647, f601, f605; +mul.f32 f648, f647, 0f3F5DB3D7; +sub.f32 f649, f646, f648; +add.f32 f650, f648, f646; +mul.f32 f651, f643, 0f3F000000; +mul.f32 f652, f649, 0fBF5DB3D7; +sub.f32 f653, f651, f652; +mul.f32 f654, f649, 0f3F000000; +fma.rn.f32 f655, f643, 0fBF5DB3D7, f654; +mul.f32 f656, f644, 0fBF000000; +mul.f32 f657, f650, 0fBF5DB3D7; +sub.f32 f658, f656, f657; +mul.f32 f659, f650, 0fBF000000; +fma.rn.f32 f660, f644, 0fBF5DB3D7, f659; +add.f32 f661, f620, f636; +add.f32 f662, f622, 
f638; +sub.f32 f663, f620, f636; +sub.f32 f664, f622, f638; +add.f32 f665, f627, f653; +add.f32 f666, f633, f655; +sub.f32 f667, f627, f653; +sub.f32 f668, f633, f655; +add.f32 f669, f628, f658; +add.f32 f670, f634, f660; +sub.f32 f671, f628, f658; +sub.f32 f672, f634, f660; +add.f32 f673, f600, f604; +add.f32 f674, f596, f673; +add.f32 f675, f612, f616; +add.f32 f676, f608, f675; +mul.f32 f677, f673, 0f3F000000; +sub.f32 f678, f596, f677; +sub.f32 f679, f612, f616; +mul.f32 f680, f679, 0f3F5DB3D7; +add.f32 f681, f680, f678; +sub.f32 f682, f678, f680; +mul.f32 f683, f675, 0f3F000000; +sub.f32 f684, f608, f683; +sub.f32 f685, f600, f604; +mul.f32 f686, f685, 0f3F5DB3D7; +sub.f32 f687, f684, f686; +add.f32 f688, f686, f684; +add.f32 f689, f602, f606; +add.f32 f690, f598, f689; +add.f32 f691, f614, f618; +add.f32 f692, f610, f691; +mul.f32 f693, f689, 0f3F000000; +sub.f32 f694, f598, f693; +sub.f32 f695, f614, f618; +mul.f32 f696, f695, 0f3F5DB3D7; +add.f32 f697, f696, f694; +sub.f32 f698, f694, f696; +mul.f32 f699, f691, 0f3F000000; +sub.f32 f700, f610, f699; +sub.f32 f701, f602, f606; +mul.f32 f702, f701, 0f3F5DB3D7; +sub.f32 f703, f700, f702; +add.f32 f704, f702, f700; +mul.f32 f705, f697, 0f3F000000; +mul.f32 f706, f703, 0fBF5DB3D7; +sub.f32 f707, f705, f706; +mul.f32 f708, f703, 0f3F000000; +fma.rn.f32 f709, f697, 0fBF5DB3D7, f708; +mul.f32 f710, f698, 0fBF000000; +mul.f32 f711, f704, 0fBF5DB3D7; +sub.f32 f712, f710, f711; +mul.f32 f713, f704, 0fBF000000; +fma.rn.f32 f714, f698, 0fBF5DB3D7, f713; +add.f32 f715, f674, f690; +add.f32 f716, f676, f692; +sub.f32 f717, f674, f690; +sub.f32 f718, f676, f692; +add.f32 f719, f681, f707; +add.f32 f720, f687, f709; +sub.f32 f721, f681, f707; +sub.f32 f722, f687, f709; +add.f32 f723, f682, f712; +add.f32 f724, f688, f714; +sub.f32 f725, f682, f712; +sub.f32 f726, f688, f714; +mul.f32 f727, f719, 0f3F5DB3D7; +mul.f32 f728, f720, 0fBF000000; +sub.f32 f729, f727, f728; +mul.f32 f730, f720, 0f3F5DB3D7; +fma.rn.f32 f731, f719, 
0fBF000000, f730; +mul.f32 f732, f723, 0f3F000000; +mul.f32 f733, f724, 0fBF5DB3D7; +sub.f32 f734, f732, f733; +mul.f32 f735, f724, 0f3F000000; +fma.rn.f32 f736, f723, 0fBF5DB3D7, f735; +mul.f32 f737, f721, 0fBF000000; +mul.f32 f738, f722, 0fBF5DB3D7; +sub.f32 f739, f737, f738; +mul.f32 f740, f722, 0fBF000000; +fma.rn.f32 f741, f721, 0fBF5DB3D7, f740; +mul.f32 f742, f725, 0fBF5DB3D7; +mul.f32 f743, f726, 0fBF000000; +sub.f32 f744, f742, f743; +mul.f32 f745, f726, 0fBF5DB3D7; +fma.rn.f32 f746, f725, 0fBF000000, f745; +add.f32 %0, f661, f715; +add.f32 %1, f662, f716; +add.f32 %3, f666, f731; +add.f32 %2, f665, f729; +add.f32 %5, f670, f736; +add.f32 %4, f669, f734; +sub.f32 %7, f664, f717; +add.f32 %6, f663, f718; +add.f32 %9, f668, f741; +add.f32 %8, f667, f739; +add.f32 %11, f672, f746; +add.f32 %10, f671, f744; +sub.f32 %12, f661, f715; +sub.f32 %13, f662, f716; +sub.f32 %15, f666, f731; +sub.f32 %14, f665, f729; +sub.f32 %17, f670, f736; +sub.f32 %16, f669, f734; +add.f32 %19, f664, f717; +sub.f32 %18, f663, f718; +sub.f32 %21, f668, f741; +sub.f32 %20, f667, f739; +sub.f32 %23, f672, f746; +sub.f32 %22, f671, f744; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), 
"f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..e57efa4d0f436 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp32_inv.hpp.inc @@ -0,0 +1,1550 @@ +#ifndef CUFFTDX_FFT_1728_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_1728_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<402, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<819>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 13824, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %37, %48; +add.f32 f50, %27, f49; +add.f32 f51, %39, %50; +add.f32 f52, %28, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %27, f53; +sub.f32 f55, %39, %50; +mul.f32 f56, f55, 0fBF5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %28, f59; +sub.f32 f61, %37, %48; +mul.f32 f62, f61, 0fBF5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %43, %53; +add.f32 f66, %32, f65; +add.f32 f67, %44, %55; +add.f32 f68, %34, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %32, f69; +sub.f32 f71, %44, %55; +mul.f32 f72, f71, 0fBF5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %34, f75; +sub.f32 f77, %43, %53; +mul.f32 f78, f77, 0fBF5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0f3F5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0f3F5DB3D7, f84; 
+mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0f3F5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0f3F5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %40, %51; +add.f32 f104, %29, f103; +add.f32 f105, %42, %52; +add.f32 f106, %31, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %29, f107; +sub.f32 f109, %42, %52; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %31, f113; +sub.f32 f115, %40, %51; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, %45, %56; +add.f32 f120, %35, f119; +add.f32 f121, %47, %57; +add.f32 f122, %36, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %35, f123; +sub.f32 f125, %47, %57; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %36, f129; +sub.f32 f131, %45, %56; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0f3F5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0f3F5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0f3F5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0f3F5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; 
+mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0f3F000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0f3F000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0f3F5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0f3F5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0f3F5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0f3F5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0f3F000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0f3F000000, f175; +sub.f32 f177, f91, f145; +sub.f32 f178, f92, f146; +add.f32 f179, f95, f159; +add.f32 f180, f96, f161; +sub.f32 f181, f95, f159; +sub.f32 f182, f96, f161; +add.f32 f183, f99, f164; +add.f32 f184, f100, f166; +sub.f32 f185, f99, f164; +sub.f32 f186, f100, f166; +sub.f32 f187, f93, f148; +add.f32 f188, f94, f147; +add.f32 f189, f93, f148; +sub.f32 f190, f94, f147; +add.f32 f191, f97, f169; +add.f32 f192, f98, f171; +sub.f32 f193, f97, f169; +sub.f32 f194, f98, f171; +add.f32 f195, f101, f174; +add.f32 f196, f102, f176; +sub.f32 f197, f101, f174; +sub.f32 f198, f102, f176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 13824, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f199, f200}, [rd6]; +mul.f32 f203, f180, f200; +mul.f32 f204, f179, f200; +mul.f32 f205, f199, f180; +mul.f32 f206, f199, f199; +mul.f32 f207, f200, f200; +sub.f32 f208, f206, f207; +mul.f32 f209, f200, f199; +fma.rn.f32 f210, f200, f199, f209; +mul.f32 f211, f184, f210; +mul.f32 f212, f183, f210; +mul.f32 f213, f208, f184; +mul.f32 f214, f199, f208; +mul.f32 f215, f200, f210; +sub.f32 f216, f214, f215; +mul.f32 f217, f199, f210; +fma.rn.f32 f218, f200, f208, f217; +mul.f32 f219, f188, f218; +mul.f32 
f220, f187, f218; +mul.f32 f221, f216, f188; +mul.f32 f222, f199, f216; +mul.f32 f223, f200, f218; +sub.f32 f224, f222, f223; +mul.f32 f225, f199, f218; +fma.rn.f32 f226, f200, f216, f225; +mul.f32 f227, f192, f226; +mul.f32 f228, f191, f226; +mul.f32 f229, f224, f192; +mul.f32 f230, f199, f224; +mul.f32 f231, f200, f226; +sub.f32 f232, f230, f231; +mul.f32 f233, f199, f226; +fma.rn.f32 f234, f200, f224, f233; +mul.f32 f235, f196, f234; +mul.f32 f236, f195, f234; +mul.f32 f237, f232, f196; +mul.f32 f238, f199, f232; +mul.f32 f239, f200, f234; +sub.f32 f240, f238, f239; +mul.f32 f241, f199, f234; +fma.rn.f32 f242, f200, f232, f241; +mul.f32 f243, f178, f242; +mul.f32 f244, f177, f242; +mul.f32 f245, f240, f178; +mul.f32 f246, f199, f240; +mul.f32 f247, f200, f242; +sub.f32 f248, f246, f247; +mul.f32 f249, f199, f242; +fma.rn.f32 f250, f200, f240, f249; +mul.f32 f251, f182, f250; +mul.f32 f252, f181, f250; +mul.f32 f253, f248, f182; +mul.f32 f254, f199, f248; +mul.f32 f255, f200, f250; +sub.f32 f256, f254, f255; +mul.f32 f257, f199, f250; +fma.rn.f32 f258, f200, f248, f257; +mul.f32 f259, f186, f258; +mul.f32 f260, f185, f258; +mul.f32 f261, f256, f186; +mul.f32 f262, f199, f256; +mul.f32 f263, f200, f258; +sub.f32 f264, f262, f263; +mul.f32 f265, f199, f258; +fma.rn.f32 f266, f200, f256, f265; +mul.f32 f267, f190, f266; +mul.f32 f268, f189, f266; +mul.f32 f269, f264, f190; +mul.f32 f270, f199, f264; +mul.f32 f271, f200, f266; +sub.f32 f272, f270, f271; +mul.f32 f273, f199, f266; +fma.rn.f32 f274, f200, f264, f273; +mul.f32 f275, f194, f274; +mul.f32 f276, f193, f274; +mul.f32 f277, f272, f194; +mul.f32 f278, f199, f272; +mul.f32 f279, f200, f274; +sub.f32 f280, f278, f279; +mul.f32 f281, f199, f274; +fma.rn.f32 f282, f200, f272, f281; +mul.f32 f283, f198, f282; +mul.f32 f284, f197, f282; +mul.f32 f285, f280, f198; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f32 f286, f92, f146; +add.f32 f287, f91, f145; +fma.rn.f32 f288, f199, f179, f203; +sub.f32 f289, f205, 
f204; +st.shared.v4.f32 [r9], {f287, f286, f288, f289}; +fma.rn.f32 f290, f208, f183, f211; +sub.f32 f291, f213, f212; +sub.f32 f292, f221, f220; +fma.rn.f32 f293, f216, f187, f219; +st.shared.v4.f32 [r9+16], {f290, f291, f293, f292}; +sub.f32 f294, f229, f228; +fma.rn.f32 f295, f224, f191, f227; +fma.rn.f32 f296, f232, f195, f235; +sub.f32 f297, f237, f236; +st.shared.v4.f32 [r9+32], {f295, f294, f296, f297}; +fma.rn.f32 f298, f240, f177, f243; +sub.f32 f299, f245, f244; +fma.rn.f32 f300, f248, f181, f251; +sub.f32 f301, f253, f252; +st.shared.v4.f32 [r9+48], {f298, f299, f300, f301}; +fma.rn.f32 f302, f256, f185, f259; +sub.f32 f303, f261, f260; +fma.rn.f32 f304, f264, f189, f267; +sub.f32 f305, f269, f268; +st.shared.v4.f32 [r9+64], {f302, f303, f304, f305}; +fma.rn.f32 f306, f272, f193, f275; +sub.f32 f307, f277, f276; +fma.rn.f32 f308, f280, f197, f283; +sub.f32 f309, f285, f284; +st.shared.v4.f32 [r9+80], {f306, f307, f308, f309}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.v2.f32 {f310, f311}, [r10]; +ld.shared.v2.f32 {f314, f315}, [r10+1152]; +ld.shared.v2.f32 {f318, f319}, [r10+2304]; +ld.shared.v2.f32 {f322, f323}, [r10+3456]; +ld.shared.v2.f32 {f326, f327}, [r10+4608]; +ld.shared.v2.f32 {f330, f331}, [r10+5760]; +ld.shared.v2.f32 {f334, f335}, [r10+6912]; +ld.shared.v2.f32 {f338, f339}, [r10+8064]; +ld.shared.v2.f32 {f342, f343}, [r10+9216]; +ld.shared.v2.f32 {f346, f347}, [r10+10368]; +ld.shared.v2.f32 {f350, f351}, [r10+11520]; +ld.shared.v2.f32 {f354, f355}, [r10+12672]; +add.f32 f358, f326, f342; +add.f32 f359, f310, f358; +add.f32 f360, f327, f343; +add.f32 f361, f311, f360; +mul.f32 f362, f358, 0f3F000000; +sub.f32 f363, f310, f362; +sub.f32 f364, f327, f343; +mul.f32 f365, f364, 0fBF5DB3D7; +add.f32 f366, f365, f363; +sub.f32 f367, f363, f365; +mul.f32 f368, f360, 0f3F000000; +sub.f32 f369, f311, f368; +sub.f32 f370, f326, f342; +mul.f32 f371, f370, 0fBF5DB3D7; +sub.f32 f372, f369, f371; +add.f32 f373, f371, f369; +add.f32 f374, 
f334, f350; +add.f32 f375, f318, f374; +add.f32 f376, f335, f351; +add.f32 f377, f319, f376; +mul.f32 f378, f374, 0f3F000000; +sub.f32 f379, f318, f378; +sub.f32 f380, f335, f351; +mul.f32 f381, f380, 0fBF5DB3D7; +add.f32 f382, f381, f379; +sub.f32 f383, f379, f381; +mul.f32 f384, f376, 0f3F000000; +sub.f32 f385, f319, f384; +sub.f32 f386, f334, f350; +mul.f32 f387, f386, 0fBF5DB3D7; +sub.f32 f388, f385, f387; +add.f32 f389, f387, f385; +mul.f32 f390, f382, 0f3F000000; +mul.f32 f391, f388, 0f3F5DB3D7; +sub.f32 f392, f390, f391; +mul.f32 f393, f388, 0f3F000000; +fma.rn.f32 f394, f382, 0f3F5DB3D7, f393; +mul.f32 f395, f383, 0fBF000000; +mul.f32 f396, f389, 0f3F5DB3D7; +sub.f32 f397, f395, f396; +mul.f32 f398, f389, 0fBF000000; +fma.rn.f32 f399, f383, 0f3F5DB3D7, f398; +add.f32 f400, f359, f375; +add.f32 f401, f361, f377; +sub.f32 f402, f359, f375; +sub.f32 f403, f361, f377; +add.f32 f404, f366, f392; +add.f32 f405, f372, f394; +sub.f32 f406, f366, f392; +sub.f32 f407, f372, f394; +add.f32 f408, f367, f397; +add.f32 f409, f373, f399; +sub.f32 f410, f367, f397; +sub.f32 f411, f373, f399; +add.f32 f412, f330, f346; +add.f32 f413, f314, f412; +add.f32 f414, f331, f347; +add.f32 f415, f315, f414; +mul.f32 f416, f412, 0f3F000000; +sub.f32 f417, f314, f416; +sub.f32 f418, f331, f347; +mul.f32 f419, f418, 0fBF5DB3D7; +add.f32 f420, f419, f417; +sub.f32 f421, f417, f419; +mul.f32 f422, f414, 0f3F000000; +sub.f32 f423, f315, f422; +sub.f32 f424, f330, f346; +mul.f32 f425, f424, 0fBF5DB3D7; +sub.f32 f426, f423, f425; +add.f32 f427, f425, f423; +add.f32 f428, f338, f354; +add.f32 f429, f322, f428; +add.f32 f430, f339, f355; +add.f32 f431, f323, f430; +mul.f32 f432, f428, 0f3F000000; +sub.f32 f433, f322, f432; +sub.f32 f434, f339, f355; +mul.f32 f435, f434, 0fBF5DB3D7; +add.f32 f436, f435, f433; +sub.f32 f437, f433, f435; +mul.f32 f438, f430, 0f3F000000; +sub.f32 f439, f323, f438; +sub.f32 f440, f338, f354; +mul.f32 f441, f440, 0fBF5DB3D7; +sub.f32 f442, f439, f441; +add.f32 
f443, f441, f439; +mul.f32 f444, f436, 0f3F000000; +mul.f32 f445, f442, 0f3F5DB3D7; +sub.f32 f446, f444, f445; +mul.f32 f447, f442, 0f3F000000; +fma.rn.f32 f448, f436, 0f3F5DB3D7, f447; +mul.f32 f449, f437, 0fBF000000; +mul.f32 f450, f443, 0f3F5DB3D7; +sub.f32 f451, f449, f450; +mul.f32 f452, f443, 0fBF000000; +fma.rn.f32 f453, f437, 0f3F5DB3D7, f452; +add.f32 f454, f413, f429; +add.f32 f455, f415, f431; +sub.f32 f456, f413, f429; +sub.f32 f457, f415, f431; +add.f32 f458, f420, f446; +add.f32 f459, f426, f448; +sub.f32 f460, f420, f446; +sub.f32 f461, f426, f448; +add.f32 f462, f421, f451; +add.f32 f463, f427, f453; +sub.f32 f464, f421, f451; +sub.f32 f465, f427, f453; +mul.f32 f466, f458, 0f3F5DB3D7; +mul.f32 f467, f459, 0f3F000000; +sub.f32 f468, f466, f467; +mul.f32 f469, f459, 0f3F5DB3D7; +fma.rn.f32 f470, f458, 0f3F000000, f469; +mul.f32 f471, f462, 0f3F000000; +mul.f32 f472, f463, 0f3F5DB3D7; +sub.f32 f473, f471, f472; +mul.f32 f474, f463, 0f3F000000; +fma.rn.f32 f475, f462, 0f3F5DB3D7, f474; +mul.f32 f476, f460, 0fBF000000; +mul.f32 f477, f461, 0f3F5DB3D7; +sub.f32 f478, f476, f477; +mul.f32 f479, f461, 0fBF000000; +fma.rn.f32 f480, f460, 0f3F5DB3D7, f479; +mul.f32 f481, f464, 0fBF5DB3D7; +mul.f32 f482, f465, 0f3F000000; +sub.f32 f483, f481, f482; +mul.f32 f484, f465, 0fBF5DB3D7; +fma.rn.f32 f485, f464, 0f3F000000, f484; +sub.f32 f486, f400, f454; +sub.f32 f487, f401, f455; +add.f32 f488, f404, f468; +add.f32 f489, f405, f470; +sub.f32 f490, f404, f468; +sub.f32 f491, f405, f470; +add.f32 f492, f408, f473; +add.f32 f493, f409, f475; +sub.f32 f494, f408, f473; +sub.f32 f495, f409, f475; +sub.f32 f496, f402, f457; +add.f32 f497, f403, f456; +add.f32 f498, f402, f457; +sub.f32 f499, f403, f456; +add.f32 f500, f406, f478; +add.f32 f501, f407, f480; +sub.f32 f502, f406, f478; +sub.f32 f503, f407, f480; +add.f32 f504, f410, f483; +add.f32 f505, f411, f485; +sub.f32 f506, f410, f483; +sub.f32 f507, f411, f485; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 
35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f508, f509}, [rd11]; +mul.f32 f512, f489, f509; +mul.f32 f513, f488, f509; +mul.f32 f514, f508, f489; +mul.f32 f515, f508, f508; +mul.f32 f516, f509, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f509, f508; +fma.rn.f32 f519, f509, f508, f518; +mul.f32 f520, f493, f519; +mul.f32 f521, f492, f519; +mul.f32 f522, f517, f493; +mul.f32 f523, f508, f517; +mul.f32 f524, f509, f519; +sub.f32 f525, f523, f524; +mul.f32 f526, f508, f519; +fma.rn.f32 f527, f509, f517, f526; +mul.f32 f528, f497, f527; +mul.f32 f529, f496, f527; +mul.f32 f530, f525, f497; +mul.f32 f531, f508, f525; +mul.f32 f532, f509, f527; +sub.f32 f533, f531, f532; +mul.f32 f534, f508, f527; +fma.rn.f32 f535, f509, f525, f534; +mul.f32 f536, f501, f535; +mul.f32 f537, f500, f535; +mul.f32 f538, f533, f501; +mul.f32 f539, f508, f533; +mul.f32 f540, f509, f535; +sub.f32 f541, f539, f540; +mul.f32 f542, f508, f535; +fma.rn.f32 f543, f509, f533, f542; +mul.f32 f544, f505, f543; +mul.f32 f545, f504, f543; +mul.f32 f546, f541, f505; +mul.f32 f547, f508, f541; +mul.f32 f548, f509, f543; +sub.f32 f549, f547, f548; +mul.f32 f550, f508, f543; +fma.rn.f32 f551, f509, f541, f550; +mul.f32 f552, f487, f551; +mul.f32 f553, f486, f551; +mul.f32 f554, f549, f487; +mul.f32 f555, f508, f549; +mul.f32 f556, f509, f551; +sub.f32 f557, f555, f556; +mul.f32 f558, f508, f551; +fma.rn.f32 f559, f509, f549, f558; +mul.f32 f560, f491, f559; +mul.f32 f561, f490, f559; +mul.f32 f562, f557, f491; +mul.f32 f563, f508, f557; +mul.f32 f564, f509, f559; +sub.f32 f565, f563, f564; +mul.f32 f566, f508, f559; +fma.rn.f32 f567, f509, f557, f566; +mul.f32 f568, f495, f567; +mul.f32 f569, f494, f567; +mul.f32 f570, f565, f495; +mul.f32 f571, f508, f565; +mul.f32 f572, f509, f567; +sub.f32 f573, f571, f572; +mul.f32 f574, f508, f567; +fma.rn.f32 f575, f509, f565, f574; +mul.f32 
f576, f499, f575; +mul.f32 f577, f498, f575; +mul.f32 f578, f573, f499; +mul.f32 f579, f508, f573; +mul.f32 f580, f509, f575; +sub.f32 f581, f579, f580; +mul.f32 f582, f508, f575; +fma.rn.f32 f583, f509, f573, f582; +mul.f32 f584, f503, f583; +mul.f32 f585, f502, f583; +mul.f32 f586, f581, f503; +mul.f32 f587, f508, f581; +mul.f32 f588, f509, f583; +sub.f32 f589, f587, f588; +mul.f32 f590, f508, f583; +fma.rn.f32 f591, f509, f581, f590; +mul.f32 f592, f507, f591; +mul.f32 f593, f506, f591; +mul.f32 f594, f589, f507; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1152, r15; +add.f32 f595, f401, f455; +add.f32 f596, f400, f454; +st.shared.v2.f32 [r16], {f596, f595}; +fma.rn.f32 f597, f508, f488, f512; +sub.f32 f598, f514, f513; +st.shared.v2.f32 [r16+96], {f597, f598}; +fma.rn.f32 f599, f517, f492, f520; +sub.f32 f600, f522, f521; +st.shared.v2.f32 [r16+192], {f599, f600}; +fma.rn.f32 f601, f525, f496, f528; +sub.f32 f602, f530, f529; +st.shared.v2.f32 [r16+288], {f601, f602}; +fma.rn.f32 f603, f533, f500, f536; +sub.f32 f604, f538, f537; +st.shared.v2.f32 [r16+384], {f603, f604}; +fma.rn.f32 f605, f541, f504, f544; +sub.f32 f606, f546, f545; +st.shared.v2.f32 [r16+480], {f605, f606}; +fma.rn.f32 f607, f549, f486, f552; +sub.f32 f608, f554, f553; +st.shared.v2.f32 [r16+576], {f607, f608}; +fma.rn.f32 f609, f557, f490, f560; +sub.f32 f610, f562, f561; +st.shared.v2.f32 [r16+672], {f609, f610}; +fma.rn.f32 f611, f565, f494, f568; +sub.f32 f612, f570, f569; +st.shared.v2.f32 [r16+768], {f611, f612}; +fma.rn.f32 f613, f573, f498, f576; +sub.f32 f614, f578, f577; +st.shared.v2.f32 [r16+864], {f613, f614}; +fma.rn.f32 f615, f581, f502, f584; +sub.f32 f616, f586, f585; +st.shared.v2.f32 [r16+960], {f615, f616}; +fma.rn.f32 f617, f589, f506, f592; +sub.f32 f618, f594, f593; +st.shared.v2.f32 [r16+1056], {f617, f618}; +barrier.sync 0; +ld.shared.v2.f32 {f619, f620}, [r10]; +ld.shared.v2.f32 {f623, f624}, [r10+1152]; +ld.shared.v2.f32 
{f627, f628}, [r10+2304]; +ld.shared.v2.f32 {f631, f632}, [r10+3456]; +ld.shared.v2.f32 {f635, f636}, [r10+4608]; +ld.shared.v2.f32 {f639, f640}, [r10+5760]; +ld.shared.v2.f32 {f643, f644}, [r10+6912]; +ld.shared.v2.f32 {f647, f648}, [r10+8064]; +ld.shared.v2.f32 {f651, f652}, [r10+9216]; +ld.shared.v2.f32 {f655, f656}, [r10+10368]; +ld.shared.v2.f32 {f659, f660}, [r10+11520]; +ld.shared.v2.f32 {f663, f664}, [r10+12672]; +add.f32 f667, f635, f651; +add.f32 f668, f619, f667; +add.f32 f669, f636, f652; +add.f32 f670, f620, f669; +mul.f32 f671, f667, 0f3F000000; +sub.f32 f672, f619, f671; +sub.f32 f673, f636, f652; +mul.f32 f674, f673, 0fBF5DB3D7; +add.f32 f675, f674, f672; +sub.f32 f676, f672, f674; +mul.f32 f677, f669, 0f3F000000; +sub.f32 f678, f620, f677; +sub.f32 f679, f635, f651; +mul.f32 f680, f679, 0fBF5DB3D7; +sub.f32 f681, f678, f680; +add.f32 f682, f680, f678; +add.f32 f683, f643, f659; +add.f32 f684, f627, f683; +add.f32 f685, f644, f660; +add.f32 f686, f628, f685; +mul.f32 f687, f683, 0f3F000000; +sub.f32 f688, f627, f687; +sub.f32 f689, f644, f660; +mul.f32 f690, f689, 0fBF5DB3D7; +add.f32 f691, f690, f688; +sub.f32 f692, f688, f690; +mul.f32 f693, f685, 0f3F000000; +sub.f32 f694, f628, f693; +sub.f32 f695, f643, f659; +mul.f32 f696, f695, 0fBF5DB3D7; +sub.f32 f697, f694, f696; +add.f32 f698, f696, f694; +mul.f32 f699, f691, 0f3F000000; +mul.f32 f700, f697, 0f3F5DB3D7; +sub.f32 f701, f699, f700; +mul.f32 f702, f697, 0f3F000000; +fma.rn.f32 f703, f691, 0f3F5DB3D7, f702; +mul.f32 f704, f692, 0fBF000000; +mul.f32 f705, f698, 0f3F5DB3D7; +sub.f32 f706, f704, f705; +mul.f32 f707, f698, 0fBF000000; +fma.rn.f32 f708, f692, 0f3F5DB3D7, f707; +add.f32 f709, f668, f684; +add.f32 f710, f670, f686; +sub.f32 f711, f668, f684; +sub.f32 f712, f670, f686; +add.f32 f713, f675, f701; +add.f32 f714, f681, f703; +sub.f32 f715, f675, f701; +sub.f32 f716, f681, f703; +add.f32 f717, f676, f706; +add.f32 f718, f682, f708; +sub.f32 f719, f676, f706; +sub.f32 f720, f682, f708; 
+add.f32 f721, f639, f655; +add.f32 f722, f623, f721; +add.f32 f723, f640, f656; +add.f32 f724, f624, f723; +mul.f32 f725, f721, 0f3F000000; +sub.f32 f726, f623, f725; +sub.f32 f727, f640, f656; +mul.f32 f728, f727, 0fBF5DB3D7; +add.f32 f729, f728, f726; +sub.f32 f730, f726, f728; +mul.f32 f731, f723, 0f3F000000; +sub.f32 f732, f624, f731; +sub.f32 f733, f639, f655; +mul.f32 f734, f733, 0fBF5DB3D7; +sub.f32 f735, f732, f734; +add.f32 f736, f734, f732; +add.f32 f737, f647, f663; +add.f32 f738, f631, f737; +add.f32 f739, f648, f664; +add.f32 f740, f632, f739; +mul.f32 f741, f737, 0f3F000000; +sub.f32 f742, f631, f741; +sub.f32 f743, f648, f664; +mul.f32 f744, f743, 0fBF5DB3D7; +add.f32 f745, f744, f742; +sub.f32 f746, f742, f744; +mul.f32 f747, f739, 0f3F000000; +sub.f32 f748, f632, f747; +sub.f32 f749, f647, f663; +mul.f32 f750, f749, 0fBF5DB3D7; +sub.f32 f751, f748, f750; +add.f32 f752, f750, f748; +mul.f32 f753, f745, 0f3F000000; +mul.f32 f754, f751, 0f3F5DB3D7; +sub.f32 f755, f753, f754; +mul.f32 f756, f751, 0f3F000000; +fma.rn.f32 f757, f745, 0f3F5DB3D7, f756; +mul.f32 f758, f746, 0fBF000000; +mul.f32 f759, f752, 0f3F5DB3D7; +sub.f32 f760, f758, f759; +mul.f32 f761, f752, 0fBF000000; +fma.rn.f32 f762, f746, 0f3F5DB3D7, f761; +add.f32 f763, f722, f738; +add.f32 f764, f724, f740; +sub.f32 f765, f722, f738; +sub.f32 f766, f724, f740; +add.f32 f767, f729, f755; +add.f32 f768, f735, f757; +sub.f32 f769, f729, f755; +sub.f32 f770, f735, f757; +add.f32 f771, f730, f760; +add.f32 f772, f736, f762; +sub.f32 f773, f730, f760; +sub.f32 f774, f736, f762; +mul.f32 f775, f767, 0f3F5DB3D7; +mul.f32 f776, f768, 0f3F000000; +sub.f32 f777, f775, f776; +mul.f32 f778, f768, 0f3F5DB3D7; +fma.rn.f32 f779, f767, 0f3F000000, f778; +mul.f32 f780, f771, 0f3F000000; +mul.f32 f781, f772, 0f3F5DB3D7; +sub.f32 f782, f780, f781; +mul.f32 f783, f772, 0f3F000000; +fma.rn.f32 f784, f771, 0f3F5DB3D7, f783; +mul.f32 f785, f769, 0fBF000000; +mul.f32 f786, f770, 0f3F5DB3D7; +sub.f32 f787, f785, 
f786; +mul.f32 f788, f770, 0fBF000000; +fma.rn.f32 f789, f769, 0f3F5DB3D7, f788; +mul.f32 f790, f773, 0fBF5DB3D7; +mul.f32 f791, f774, 0f3F000000; +sub.f32 f792, f790, f791; +mul.f32 f793, f774, 0fBF5DB3D7; +fma.rn.f32 f794, f773, 0f3F000000, f793; +add.f32 %1, f710, f764; +add.f32 %0, f709, f763; +add.f32 %3, f714, f779; +add.f32 %2, f713, f777; +add.f32 %5, f718, f784; +add.f32 %4, f717, f782; +add.f32 %7, f712, f765; +sub.f32 %6, f711, f766; +add.f32 %9, f716, f789; +add.f32 %8, f715, f787; +add.f32 %11, f720, f794; +add.f32 %10, f719, f792; +sub.f32 %13, f710, f764; +sub.f32 %12, f709, f763; +sub.f32 %15, f714, f779; +sub.f32 %14, f713, f777; +sub.f32 %17, f718, f784; +sub.f32 %16, f717, f782; +sub.f32 %19, f712, f765; +add.f32 %18, f711, f766; +sub.f32 %21, f716, f789; +sub.f32 %20, f715, f787; +sub.f32 %23, f720, f794; +sub.f32 %22, f719, f792; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<403, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile 
(R"({ +.reg .f32 f<771>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 6912, r2; +mov.u32 r4, %tid.x; +add.f32 f49, %37, %48; +add.f32 f50, %27, f49; +add.f32 f51, %39, %50; +add.f32 f52, %28, f51; +mul.f32 f53, f49, 0f3F000000; +sub.f32 f54, %27, f53; +sub.f32 f55, %39, %50; +mul.f32 f56, f55, 0fBF5DB3D7; +add.f32 f57, f56, f54; +sub.f32 f58, f54, f56; +mul.f32 f59, f51, 0f3F000000; +sub.f32 f60, %28, f59; +sub.f32 f61, %37, %48; +mul.f32 f62, f61, 0fBF5DB3D7; +sub.f32 f63, f60, f62; +add.f32 f64, f62, f60; +add.f32 f65, %43, %53; +add.f32 f66, %32, f65; +add.f32 f67, %44, %55; +add.f32 f68, %34, f67; +mul.f32 f69, f65, 0f3F000000; +sub.f32 f70, %32, f69; +sub.f32 f71, %44, %55; +mul.f32 f72, f71, 0fBF5DB3D7; +add.f32 f73, f72, f70; +sub.f32 f74, f70, f72; +mul.f32 f75, f67, 0f3F000000; +sub.f32 f76, %34, f75; +sub.f32 f77, %43, %53; +mul.f32 f78, f77, 0fBF5DB3D7; +sub.f32 f79, f76, f78; +add.f32 f80, f78, f76; +mul.f32 f81, f73, 0f3F000000; +mul.f32 f82, f79, 0f3F5DB3D7; +sub.f32 f83, f81, f82; +mul.f32 f84, f79, 0f3F000000; +fma.rn.f32 f85, f73, 0f3F5DB3D7, f84; +mul.f32 f86, f74, 0fBF000000; +mul.f32 f87, f80, 0f3F5DB3D7; +sub.f32 f88, f86, f87; +mul.f32 f89, f80, 0fBF000000; +fma.rn.f32 f90, f74, 0f3F5DB3D7, f89; +add.f32 f91, f50, f66; +add.f32 f92, f52, f68; +sub.f32 f93, f50, f66; +sub.f32 f94, f52, f68; +add.f32 f95, f57, f83; +add.f32 f96, f63, f85; +sub.f32 f97, f57, f83; +sub.f32 f98, f63, f85; +add.f32 f99, f58, f88; +add.f32 f100, f64, f90; +sub.f32 f101, f58, f88; +sub.f32 f102, f64, f90; +add.f32 f103, %40, %51; +add.f32 f104, %29, f103; +add.f32 f105, %42, %52; +add.f32 f106, %31, f105; +mul.f32 f107, f103, 0f3F000000; +sub.f32 f108, %29, f107; +sub.f32 f109, %42, %52; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f105, 0f3F000000; +sub.f32 f114, %31, f113; +sub.f32 f115, %40, %51; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; 
+add.f32 f118, f116, f114; +add.f32 f119, %45, %56; +add.f32 f120, %35, f119; +add.f32 f121, %47, %57; +add.f32 f122, %36, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, %35, f123; +sub.f32 f125, %47, %57; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, %36, f129; +sub.f32 f131, %45, %56; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +mul.f32 f135, f127, 0f3F000000; +mul.f32 f136, f133, 0f3F5DB3D7; +sub.f32 f137, f135, f136; +mul.f32 f138, f133, 0f3F000000; +fma.rn.f32 f139, f127, 0f3F5DB3D7, f138; +mul.f32 f140, f128, 0fBF000000; +mul.f32 f141, f134, 0f3F5DB3D7; +sub.f32 f142, f140, f141; +mul.f32 f143, f134, 0fBF000000; +fma.rn.f32 f144, f128, 0f3F5DB3D7, f143; +add.f32 f145, f104, f120; +add.f32 f146, f106, f122; +sub.f32 f147, f104, f120; +sub.f32 f148, f106, f122; +add.f32 f149, f111, f137; +add.f32 f150, f117, f139; +sub.f32 f151, f111, f137; +sub.f32 f152, f117, f139; +add.f32 f153, f112, f142; +add.f32 f154, f118, f144; +sub.f32 f155, f112, f142; +sub.f32 f156, f118, f144; +mul.f32 f157, f149, 0f3F5DB3D7; +mul.f32 f158, f150, 0f3F000000; +sub.f32 f159, f157, f158; +mul.f32 f160, f150, 0f3F5DB3D7; +fma.rn.f32 f161, f149, 0f3F000000, f160; +mul.f32 f162, f153, 0f3F000000; +mul.f32 f163, f154, 0f3F5DB3D7; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, 0f3F000000; +fma.rn.f32 f166, f153, 0f3F5DB3D7, f165; +mul.f32 f167, f151, 0fBF000000; +mul.f32 f168, f152, 0f3F5DB3D7; +sub.f32 f169, f167, f168; +mul.f32 f170, f152, 0fBF000000; +fma.rn.f32 f171, f151, 0f3F5DB3D7, f170; +mul.f32 f172, f155, 0fBF5DB3D7; +mul.f32 f173, f156, 0f3F000000; +sub.f32 f174, f172, f173; +mul.f32 f175, f156, 0fBF5DB3D7; +fma.rn.f32 f176, f155, 0f3F000000, f175; +add.f32 f177, f91, f145; +add.f32 f178, f92, f146; +sub.f32 f179, f91, f145; +sub.f32 f180, f92, f146; +add.f32 f181, f95, f159; +add.f32 f182, f96, f161; +sub.f32 f183, f95, f159; +sub.f32 f184, 
f96, f161; +add.f32 f185, f99, f164; +add.f32 f186, f100, f166; +sub.f32 f187, f99, f164; +sub.f32 f188, f100, f166; +sub.f32 f189, f93, f148; +add.f32 f190, f94, f147; +add.f32 f191, f93, f148; +sub.f32 f192, f94, f147; +add.f32 f193, f97, f169; +add.f32 f194, f98, f171; +sub.f32 f195, f97, f169; +sub.f32 f196, f98, f171; +add.f32 f197, f101, f174; +add.f32 f198, f102, f176; +sub.f32 f199, f101, f174; +sub.f32 f200, f102, f176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f201, f202}, [rd6]; +mul.f32 f205, f182, f202; +fma.rn.f32 f206, f201, f181, f205; +mul.f32 f207, f181, f202; +mul.f32 f208, f201, f182; +sub.f32 f209, f208, f207; +mul.f32 f210, f201, f201; +mul.f32 f211, f202, f202; +sub.f32 f212, f210, f211; +mul.f32 f213, f202, f201; +fma.rn.f32 f214, f202, f201, f213; +mul.f32 f215, f186, f214; +fma.rn.f32 f216, f212, f185, f215; +mul.f32 f217, f185, f214; +mul.f32 f218, f212, f186; +sub.f32 f219, f218, f217; +mul.f32 f220, f201, f212; +mul.f32 f221, f202, f214; +sub.f32 f222, f220, f221; +mul.f32 f223, f201, f214; +fma.rn.f32 f224, f202, f212, f223; +mul.f32 f225, f190, f224; +fma.rn.f32 f226, f222, f189, f225; +mul.f32 f227, f189, f224; +mul.f32 f228, f222, f190; +sub.f32 f229, f228, f227; +mul.f32 f230, f201, f222; +mul.f32 f231, f202, f224; +sub.f32 f232, f230, f231; +mul.f32 f233, f201, f224; +fma.rn.f32 f234, f202, f222, f233; +mul.f32 f235, f194, f234; +fma.rn.f32 f236, f232, f193, f235; +mul.f32 f237, f193, f234; +mul.f32 f238, f232, f194; +sub.f32 f239, f238, f237; +mul.f32 f240, f201, f232; +mul.f32 f241, f202, f234; +sub.f32 f242, f240, f241; +mul.f32 f243, f201, f234; +fma.rn.f32 f244, f202, f232, f243; +mul.f32 f245, f198, f244; +fma.rn.f32 f246, f242, f197, f245; +mul.f32 f247, f197, f244; +mul.f32 f248, f242, f198; +sub.f32 f249, f248, f247; +mul.f32 f250, f201, f242; +mul.f32 
f251, f202, f244; +sub.f32 f252, f250, f251; +mul.f32 f253, f201, f244; +fma.rn.f32 f254, f202, f242, f253; +mul.f32 f255, f180, f254; +fma.rn.f32 f256, f252, f179, f255; +mul.f32 f257, f179, f254; +mul.f32 f258, f252, f180; +sub.f32 f259, f258, f257; +mul.f32 f260, f201, f252; +mul.f32 f261, f202, f254; +sub.f32 f262, f260, f261; +mul.f32 f263, f201, f254; +fma.rn.f32 f264, f202, f252, f263; +mul.f32 f265, f184, f264; +fma.rn.f32 f266, f262, f183, f265; +mul.f32 f267, f183, f264; +mul.f32 f268, f262, f184; +sub.f32 f269, f268, f267; +mul.f32 f270, f201, f262; +mul.f32 f271, f202, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f201, f264; +fma.rn.f32 f274, f202, f262, f273; +mul.f32 f275, f188, f274; +fma.rn.f32 f276, f272, f187, f275; +mul.f32 f277, f187, f274; +mul.f32 f278, f272, f188; +sub.f32 f279, f278, f277; +mul.f32 f280, f201, f272; +mul.f32 f281, f202, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f201, f274; +fma.rn.f32 f284, f202, f272, f283; +mul.f32 f285, f192, f284; +fma.rn.f32 f286, f282, f191, f285; +mul.f32 f287, f191, f284; +mul.f32 f288, f282, f192; +sub.f32 f289, f288, f287; +mul.f32 f290, f201, f282; +mul.f32 f291, f202, f284; +sub.f32 f292, f290, f291; +mul.f32 f293, f201, f284; +fma.rn.f32 f294, f202, f282, f293; +mul.f32 f295, f196, f294; +fma.rn.f32 f296, f292, f195, f295; +mul.f32 f297, f195, f294; +mul.f32 f298, f292, f196; +sub.f32 f299, f298, f297; +mul.f32 f300, f201, f292; +mul.f32 f301, f202, f294; +sub.f32 f302, f300, f301; +mul.f32 f303, f201, f294; +fma.rn.f32 f304, f202, f292, f303; +mul.f32 f305, f200, f304; +fma.rn.f32 f306, f302, f199, f305; +mul.f32 f307, f199, f304; +mul.f32 f308, f302, f200; +sub.f32 f309, f308, f307; +mad.lo.s32 r8, r5, 6912, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v4.f32 [r9], {f177, f206, f216, f226}; +st.shared.v4.f32 [r9+16], {f236, f246, f256, f266}; +st.shared.v4.f32 [r9+32], {f276, f286, f296, f306}; +barrier.sync 0; +mad.lo.s32 r10, r7, -44, r9; +ld.shared.f32 f310, [r10]; 
+ld.shared.f32 f311, [r10+576]; +ld.shared.f32 f312, [r10+1152]; +ld.shared.f32 f313, [r10+1728]; +ld.shared.f32 f314, [r10+2304]; +ld.shared.f32 f315, [r10+2880]; +ld.shared.f32 f316, [r10+3456]; +ld.shared.f32 f317, [r10+4032]; +ld.shared.f32 f318, [r10+4608]; +ld.shared.f32 f319, [r10+5184]; +ld.shared.f32 f320, [r10+5760]; +ld.shared.f32 f321, [r10+6336]; +barrier.sync 0; +st.shared.v4.f32 [r9], {f178, f209, f219, f229}; +st.shared.v4.f32 [r9+16], {f239, f249, f259, f269}; +st.shared.v4.f32 [r9+32], {f279, f289, f299, f309}; +barrier.sync 0; +ld.shared.f32 f322, [r10]; +ld.shared.f32 f323, [r10+576]; +ld.shared.f32 f324, [r10+1152]; +ld.shared.f32 f325, [r10+1728]; +ld.shared.f32 f326, [r10+2304]; +ld.shared.f32 f327, [r10+2880]; +ld.shared.f32 f328, [r10+3456]; +ld.shared.f32 f329, [r10+4032]; +ld.shared.f32 f330, [r10+4608]; +ld.shared.f32 f331, [r10+5184]; +ld.shared.f32 f332, [r10+5760]; +ld.shared.f32 f333, [r10+6336]; +add.f32 f334, f314, f318; +add.f32 f335, f310, f334; +add.f32 f336, f326, f330; +add.f32 f337, f322, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f310, f338; +sub.f32 f340, f326, f330; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f322, f344; +sub.f32 f346, f314, f318; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f316, f320; +add.f32 f351, f312, f350; +add.f32 f352, f328, f332; +add.f32 f353, f324, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f312, f354; +sub.f32 f356, f328, f332; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f324, f360; +sub.f32 f362, f316, f320; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.f32 f366, f358, 0f3F000000; +mul.f32 f367, f364, 0f3F5DB3D7; +sub.f32 f368, f366, f367; +mul.f32 f369, f364, 0f3F000000; +fma.rn.f32 f370, 
f358, 0f3F5DB3D7, f369; +mul.f32 f371, f359, 0fBF000000; +mul.f32 f372, f365, 0f3F5DB3D7; +sub.f32 f373, f371, f372; +mul.f32 f374, f365, 0fBF000000; +fma.rn.f32 f375, f359, 0f3F5DB3D7, f374; +add.f32 f376, f335, f351; +add.f32 f377, f337, f353; +sub.f32 f378, f335, f351; +sub.f32 f379, f337, f353; +add.f32 f380, f342, f368; +add.f32 f381, f348, f370; +sub.f32 f382, f342, f368; +sub.f32 f383, f348, f370; +add.f32 f384, f343, f373; +add.f32 f385, f349, f375; +sub.f32 f386, f343, f373; +sub.f32 f387, f349, f375; +add.f32 f388, f315, f319; +add.f32 f389, f311, f388; +add.f32 f390, f327, f331; +add.f32 f391, f323, f390; +mul.f32 f392, f388, 0f3F000000; +sub.f32 f393, f311, f392; +sub.f32 f394, f327, f331; +mul.f32 f395, f394, 0fBF5DB3D7; +add.f32 f396, f395, f393; +sub.f32 f397, f393, f395; +mul.f32 f398, f390, 0f3F000000; +sub.f32 f399, f323, f398; +sub.f32 f400, f315, f319; +mul.f32 f401, f400, 0fBF5DB3D7; +sub.f32 f402, f399, f401; +add.f32 f403, f401, f399; +add.f32 f404, f317, f321; +add.f32 f405, f313, f404; +add.f32 f406, f329, f333; +add.f32 f407, f325, f406; +mul.f32 f408, f404, 0f3F000000; +sub.f32 f409, f313, f408; +sub.f32 f410, f329, f333; +mul.f32 f411, f410, 0fBF5DB3D7; +add.f32 f412, f411, f409; +sub.f32 f413, f409, f411; +mul.f32 f414, f406, 0f3F000000; +sub.f32 f415, f325, f414; +sub.f32 f416, f317, f321; +mul.f32 f417, f416, 0fBF5DB3D7; +sub.f32 f418, f415, f417; +add.f32 f419, f417, f415; +mul.f32 f420, f412, 0f3F000000; +mul.f32 f421, f418, 0f3F5DB3D7; +sub.f32 f422, f420, f421; +mul.f32 f423, f418, 0f3F000000; +fma.rn.f32 f424, f412, 0f3F5DB3D7, f423; +mul.f32 f425, f413, 0fBF000000; +mul.f32 f426, f419, 0f3F5DB3D7; +sub.f32 f427, f425, f426; +mul.f32 f428, f419, 0fBF000000; +fma.rn.f32 f429, f413, 0f3F5DB3D7, f428; +add.f32 f430, f389, f405; +add.f32 f431, f391, f407; +sub.f32 f432, f389, f405; +sub.f32 f433, f391, f407; +add.f32 f434, f396, f422; +add.f32 f435, f402, f424; +sub.f32 f436, f396, f422; +sub.f32 f437, f402, f424; +add.f32 f438, 
f397, f427; +add.f32 f439, f403, f429; +sub.f32 f440, f397, f427; +sub.f32 f441, f403, f429; +mul.f32 f442, f434, 0f3F5DB3D7; +mul.f32 f443, f435, 0f3F000000; +sub.f32 f444, f442, f443; +mul.f32 f445, f435, 0f3F5DB3D7; +fma.rn.f32 f446, f434, 0f3F000000, f445; +mul.f32 f447, f438, 0f3F000000; +mul.f32 f448, f439, 0f3F5DB3D7; +sub.f32 f449, f447, f448; +mul.f32 f450, f439, 0f3F000000; +fma.rn.f32 f451, f438, 0f3F5DB3D7, f450; +mul.f32 f452, f436, 0fBF000000; +mul.f32 f453, f437, 0f3F5DB3D7; +sub.f32 f454, f452, f453; +mul.f32 f455, f437, 0fBF000000; +fma.rn.f32 f456, f436, 0f3F5DB3D7, f455; +mul.f32 f457, f440, 0fBF5DB3D7; +mul.f32 f458, f441, 0f3F000000; +sub.f32 f459, f457, f458; +mul.f32 f460, f441, 0fBF5DB3D7; +fma.rn.f32 f461, f440, 0f3F000000, f460; +add.f32 f462, f376, f430; +add.f32 f463, f377, f431; +sub.f32 f464, f376, f430; +sub.f32 f465, f377, f431; +add.f32 f466, f380, f444; +add.f32 f467, f381, f446; +sub.f32 f468, f380, f444; +sub.f32 f469, f381, f446; +add.f32 f470, f384, f449; +add.f32 f471, f385, f451; +sub.f32 f472, f384, f449; +sub.f32 f473, f385, f451; +sub.f32 f474, f378, f433; +add.f32 f475, f379, f432; +add.f32 f476, f378, f433; +sub.f32 f477, f379, f432; +add.f32 f478, f382, f454; +add.f32 f479, f383, f456; +sub.f32 f480, f382, f454; +sub.f32 f481, f383, f456; +add.f32 f482, f386, f459; +add.f32 f483, f387, f461; +sub.f32 f484, f386, f459; +sub.f32 f485, f387, f461; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f486, f487}, [rd11]; +mul.f32 f490, f467, f487; +fma.rn.f32 f491, f486, f466, f490; +mul.f32 f492, f466, f487; +mul.f32 f493, f486, f467; +sub.f32 f494, f493, f492; +mul.f32 f495, f486, f486; +mul.f32 f496, f487, f487; +sub.f32 f497, f495, f496; +mul.f32 f498, f487, f486; +fma.rn.f32 f499, f487, f486, f498; +mul.f32 f500, f471, f499; +fma.rn.f32 f501, f497, 
f470, f500; +mul.f32 f502, f470, f499; +mul.f32 f503, f497, f471; +sub.f32 f504, f503, f502; +mul.f32 f505, f486, f497; +mul.f32 f506, f487, f499; +sub.f32 f507, f505, f506; +mul.f32 f508, f486, f499; +fma.rn.f32 f509, f487, f497, f508; +mul.f32 f510, f475, f509; +fma.rn.f32 f511, f507, f474, f510; +mul.f32 f512, f474, f509; +mul.f32 f513, f507, f475; +sub.f32 f514, f513, f512; +mul.f32 f515, f486, f507; +mul.f32 f516, f487, f509; +sub.f32 f517, f515, f516; +mul.f32 f518, f486, f509; +fma.rn.f32 f519, f487, f507, f518; +mul.f32 f520, f479, f519; +fma.rn.f32 f521, f517, f478, f520; +mul.f32 f522, f478, f519; +mul.f32 f523, f517, f479; +sub.f32 f524, f523, f522; +mul.f32 f525, f486, f517; +mul.f32 f526, f487, f519; +sub.f32 f527, f525, f526; +mul.f32 f528, f486, f519; +fma.rn.f32 f529, f487, f517, f528; +mul.f32 f530, f483, f529; +fma.rn.f32 f531, f527, f482, f530; +mul.f32 f532, f482, f529; +mul.f32 f533, f527, f483; +sub.f32 f534, f533, f532; +mul.f32 f535, f486, f527; +mul.f32 f536, f487, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f486, f529; +fma.rn.f32 f539, f487, f527, f538; +mul.f32 f540, f465, f539; +fma.rn.f32 f541, f537, f464, f540; +mul.f32 f542, f464, f539; +mul.f32 f543, f537, f465; +sub.f32 f544, f543, f542; +mul.f32 f545, f486, f537; +mul.f32 f546, f487, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f486, f539; +fma.rn.f32 f549, f487, f537, f548; +mul.f32 f550, f469, f549; +fma.rn.f32 f551, f547, f468, f550; +mul.f32 f552, f468, f549; +mul.f32 f553, f547, f469; +sub.f32 f554, f553, f552; +mul.f32 f555, f486, f547; +mul.f32 f556, f487, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f486, f549; +fma.rn.f32 f559, f487, f547, f558; +mul.f32 f560, f473, f559; +fma.rn.f32 f561, f557, f472, f560; +mul.f32 f562, f472, f559; +mul.f32 f563, f557, f473; +sub.f32 f564, f563, f562; +mul.f32 f565, f486, f557; +mul.f32 f566, f487, f559; +sub.f32 f567, f565, f566; +mul.f32 f568, f486, f559; +fma.rn.f32 f569, f487, f557, f568; +mul.f32 f570, f477, f569; 
+fma.rn.f32 f571, f567, f476, f570; +mul.f32 f572, f476, f569; +mul.f32 f573, f567, f477; +sub.f32 f574, f573, f572; +mul.f32 f575, f486, f567; +mul.f32 f576, f487, f569; +sub.f32 f577, f575, f576; +mul.f32 f578, f486, f569; +fma.rn.f32 f579, f487, f567, f578; +mul.f32 f580, f481, f579; +fma.rn.f32 f581, f577, f480, f580; +mul.f32 f582, f480, f579; +mul.f32 f583, f577, f481; +sub.f32 f584, f583, f582; +mul.f32 f585, f486, f577; +mul.f32 f586, f487, f579; +sub.f32 f587, f585, f586; +mul.f32 f588, f486, f579; +fma.rn.f32 f589, f487, f577, f588; +mul.f32 f590, f485, f589; +fma.rn.f32 f591, f587, f484, f590; +mul.f32 f592, f484, f589; +mul.f32 f593, f587, f485; +sub.f32 f594, f593, f592; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 576, r15; +st.shared.f32 [r16], f462; +st.shared.f32 [r16+48], f491; +st.shared.f32 [r16+96], f501; +st.shared.f32 [r16+144], f511; +st.shared.f32 [r16+192], f521; +st.shared.f32 [r16+240], f531; +st.shared.f32 [r16+288], f541; +st.shared.f32 [r16+336], f551; +st.shared.f32 [r16+384], f561; +st.shared.f32 [r16+432], f571; +st.shared.f32 [r16+480], f581; +st.shared.f32 [r16+528], f591; +barrier.sync 0; +ld.shared.f32 f595, [r10]; +ld.shared.f32 f596, [r10+576]; +ld.shared.f32 f597, [r10+1152]; +ld.shared.f32 f598, [r10+1728]; +ld.shared.f32 f599, [r10+2304]; +ld.shared.f32 f600, [r10+2880]; +ld.shared.f32 f601, [r10+3456]; +ld.shared.f32 f602, [r10+4032]; +ld.shared.f32 f603, [r10+4608]; +ld.shared.f32 f604, [r10+5184]; +ld.shared.f32 f605, [r10+5760]; +ld.shared.f32 f606, [r10+6336]; +barrier.sync 0; +st.shared.f32 [r16], f463; +st.shared.f32 [r16+48], f494; +st.shared.f32 [r16+96], f504; +st.shared.f32 [r16+144], f514; +st.shared.f32 [r16+192], f524; +st.shared.f32 [r16+240], f534; +st.shared.f32 [r16+288], f544; +st.shared.f32 [r16+336], f554; +st.shared.f32 [r16+384], f564; +st.shared.f32 [r16+432], f574; +st.shared.f32 [r16+480], f584; +st.shared.f32 [r16+528], f594; +barrier.sync 0; +ld.shared.f32 
f607, [r10]; +ld.shared.f32 f608, [r10+576]; +ld.shared.f32 f609, [r10+1152]; +ld.shared.f32 f610, [r10+1728]; +ld.shared.f32 f611, [r10+2304]; +ld.shared.f32 f612, [r10+2880]; +ld.shared.f32 f613, [r10+3456]; +ld.shared.f32 f614, [r10+4032]; +ld.shared.f32 f615, [r10+4608]; +ld.shared.f32 f616, [r10+5184]; +ld.shared.f32 f617, [r10+5760]; +ld.shared.f32 f618, [r10+6336]; +add.f32 f619, f599, f603; +add.f32 f620, f595, f619; +add.f32 f621, f611, f615; +add.f32 f622, f607, f621; +mul.f32 f623, f619, 0f3F000000; +sub.f32 f624, f595, f623; +sub.f32 f625, f611, f615; +mul.f32 f626, f625, 0fBF5DB3D7; +add.f32 f627, f626, f624; +sub.f32 f628, f624, f626; +mul.f32 f629, f621, 0f3F000000; +sub.f32 f630, f607, f629; +sub.f32 f631, f599, f603; +mul.f32 f632, f631, 0fBF5DB3D7; +sub.f32 f633, f630, f632; +add.f32 f634, f632, f630; +add.f32 f635, f601, f605; +add.f32 f636, f597, f635; +add.f32 f637, f613, f617; +add.f32 f638, f609, f637; +mul.f32 f639, f635, 0f3F000000; +sub.f32 f640, f597, f639; +sub.f32 f641, f613, f617; +mul.f32 f642, f641, 0fBF5DB3D7; +add.f32 f643, f642, f640; +sub.f32 f644, f640, f642; +mul.f32 f645, f637, 0f3F000000; +sub.f32 f646, f609, f645; +sub.f32 f647, f601, f605; +mul.f32 f648, f647, 0fBF5DB3D7; +sub.f32 f649, f646, f648; +add.f32 f650, f648, f646; +mul.f32 f651, f643, 0f3F000000; +mul.f32 f652, f649, 0f3F5DB3D7; +sub.f32 f653, f651, f652; +mul.f32 f654, f649, 0f3F000000; +fma.rn.f32 f655, f643, 0f3F5DB3D7, f654; +mul.f32 f656, f644, 0fBF000000; +mul.f32 f657, f650, 0f3F5DB3D7; +sub.f32 f658, f656, f657; +mul.f32 f659, f650, 0fBF000000; +fma.rn.f32 f660, f644, 0f3F5DB3D7, f659; +add.f32 f661, f620, f636; +add.f32 f662, f622, f638; +sub.f32 f663, f620, f636; +sub.f32 f664, f622, f638; +add.f32 f665, f627, f653; +add.f32 f666, f633, f655; +sub.f32 f667, f627, f653; +sub.f32 f668, f633, f655; +add.f32 f669, f628, f658; +add.f32 f670, f634, f660; +sub.f32 f671, f628, f658; +sub.f32 f672, f634, f660; +add.f32 f673, f600, f604; +add.f32 f674, f596, 
f673; +add.f32 f675, f612, f616; +add.f32 f676, f608, f675; +mul.f32 f677, f673, 0f3F000000; +sub.f32 f678, f596, f677; +sub.f32 f679, f612, f616; +mul.f32 f680, f679, 0fBF5DB3D7; +add.f32 f681, f680, f678; +sub.f32 f682, f678, f680; +mul.f32 f683, f675, 0f3F000000; +sub.f32 f684, f608, f683; +sub.f32 f685, f600, f604; +mul.f32 f686, f685, 0fBF5DB3D7; +sub.f32 f687, f684, f686; +add.f32 f688, f686, f684; +add.f32 f689, f602, f606; +add.f32 f690, f598, f689; +add.f32 f691, f614, f618; +add.f32 f692, f610, f691; +mul.f32 f693, f689, 0f3F000000; +sub.f32 f694, f598, f693; +sub.f32 f695, f614, f618; +mul.f32 f696, f695, 0fBF5DB3D7; +add.f32 f697, f696, f694; +sub.f32 f698, f694, f696; +mul.f32 f699, f691, 0f3F000000; +sub.f32 f700, f610, f699; +sub.f32 f701, f602, f606; +mul.f32 f702, f701, 0fBF5DB3D7; +sub.f32 f703, f700, f702; +add.f32 f704, f702, f700; +mul.f32 f705, f697, 0f3F000000; +mul.f32 f706, f703, 0f3F5DB3D7; +sub.f32 f707, f705, f706; +mul.f32 f708, f703, 0f3F000000; +fma.rn.f32 f709, f697, 0f3F5DB3D7, f708; +mul.f32 f710, f698, 0fBF000000; +mul.f32 f711, f704, 0f3F5DB3D7; +sub.f32 f712, f710, f711; +mul.f32 f713, f704, 0fBF000000; +fma.rn.f32 f714, f698, 0f3F5DB3D7, f713; +add.f32 f715, f674, f690; +add.f32 f716, f676, f692; +sub.f32 f717, f674, f690; +sub.f32 f718, f676, f692; +add.f32 f719, f681, f707; +add.f32 f720, f687, f709; +sub.f32 f721, f681, f707; +sub.f32 f722, f687, f709; +add.f32 f723, f682, f712; +add.f32 f724, f688, f714; +sub.f32 f725, f682, f712; +sub.f32 f726, f688, f714; +mul.f32 f727, f719, 0f3F5DB3D7; +mul.f32 f728, f720, 0f3F000000; +sub.f32 f729, f727, f728; +mul.f32 f730, f720, 0f3F5DB3D7; +fma.rn.f32 f731, f719, 0f3F000000, f730; +mul.f32 f732, f723, 0f3F000000; +mul.f32 f733, f724, 0f3F5DB3D7; +sub.f32 f734, f732, f733; +mul.f32 f735, f724, 0f3F000000; +fma.rn.f32 f736, f723, 0f3F5DB3D7, f735; +mul.f32 f737, f721, 0fBF000000; +mul.f32 f738, f722, 0f3F5DB3D7; +sub.f32 f739, f737, f738; +mul.f32 f740, f722, 0fBF000000; +fma.rn.f32 
f741, f721, 0f3F5DB3D7, f740; +mul.f32 f742, f725, 0fBF5DB3D7; +mul.f32 f743, f726, 0f3F000000; +sub.f32 f744, f742, f743; +mul.f32 f745, f726, 0fBF5DB3D7; +fma.rn.f32 f746, f725, 0f3F000000, f745; +add.f32 %0, f661, f715; +add.f32 %1, f662, f716; +add.f32 %3, f666, f731; +add.f32 %2, f665, f729; +add.f32 %5, f670, f736; +add.f32 %4, f669, f734; +add.f32 %7, f664, f717; +sub.f32 %6, f663, f718; +add.f32 %9, f668, f741; +add.f32 %8, f667, f739; +add.f32 %11, f672, f746; +add.f32 %10, f671, f744; +sub.f32 %12, f661, f715; +sub.f32 %13, f662, f716; +sub.f32 %15, f666, f731; +sub.f32 %14, f665, f729; +sub.f32 %17, f670, f736; +sub.f32 %16, f669, f734; +sub.f32 %19, f664, f717; +add.f32 %18, f663, f718; +sub.f32 %21, f668, f741; +sub.f32 %20, f667, f739; +sub.f32 %23, f672, f746; +sub.f32 %22, f671, f744; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y): "r"(smem), "l"(lut_sp_12_1728), "l"(lut_sp_12_144), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..bc7991a47a180 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_fwd.hpp.inc @@ -0,0 +1,1546 @@ +#ifndef CUFFTDX_FFT_1728_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_1728_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<573, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<769>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 13824, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %37, %48; +add.f64 fd50, %27, fd49; +add.f64 fd51, %39, %50; +add.f64 fd52, %28, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %27, fd53; +sub.f64 fd55, %39, %50; +mul.f64 fd56, fd55, 0d3FEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %28, fd59; +sub.f64 fd61, %37, %48; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %43, %53; +add.f64 fd66, %32, fd65; +add.f64 fd67, %44, %55; +add.f64 fd68, %34, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %32, fd69; +sub.f64 fd71, %44, %55; +mul.f64 fd72, fd71, 0d3FEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %34, fd75; +sub.f64 fd77, %43, %53; +mul.f64 fd78, fd77, 0d3FEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0dBFEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0dBFEBB67AE8584CAA; +sub.f64 fd88, 
fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0dBFEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %40, %51; +add.f64 fd104, %29, fd103; +add.f64 fd105, %42, %52; +add.f64 fd106, %31, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %29, fd107; +sub.f64 fd109, %42, %52; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %31, fd113; +sub.f64 fd115, %40, %51; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %45, %56; +add.f64 fd120, %35, fd119; +add.f64 fd121, %47, %57; +add.f64 fd122, %36, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %35, fd123; +sub.f64 fd125, %47, %57; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %36, fd129; +sub.f64 fd131, %45, %56; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0dBFEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0dBFEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0dBFEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 
fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0dBFE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0dBFE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0dBFEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0dBFEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0dBFEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0dBFE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0dBFE0000000000000, fd175; +add.f64 fd177, fd91, fd145; +add.f64 fd178, fd92, fd146; +sub.f64 fd179, fd91, fd145; +sub.f64 fd180, fd92, fd146; +add.f64 fd181, fd95, fd159; +add.f64 fd182, fd96, fd161; +sub.f64 fd183, fd95, fd159; +sub.f64 fd184, fd96, fd161; +add.f64 fd185, fd99, fd164; +add.f64 fd186, fd100, fd166; +sub.f64 fd187, fd99, fd164; +sub.f64 fd188, fd100, fd166; +add.f64 fd189, fd93, fd148; +sub.f64 fd190, fd94, fd147; +sub.f64 fd191, fd93, fd148; +add.f64 fd192, fd94, fd147; +add.f64 fd193, fd97, fd169; +add.f64 fd194, fd98, fd171; +sub.f64 fd195, fd97, fd169; +sub.f64 fd196, fd98, fd171; +add.f64 fd197, fd101, fd174; +add.f64 fd198, fd102, fd176; +sub.f64 fd199, fd101, fd174; +sub.f64 fd200, fd102, fd176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 
{fd201, fd202}, [rd6]; +mul.f64 fd205, fd201, fd181; +mul.f64 fd206, fd202, fd182; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd201, fd182; +fma.rn.f64 fd209, fd202, fd181, fd208; +mul.f64 fd210, fd201, fd201; +mul.f64 fd211, fd202, fd202; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, fd201; +fma.rn.f64 fd214, fd202, fd201, fd213; +mul.f64 fd215, fd212, fd185; +mul.f64 fd216, fd214, fd186; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd212, fd186; +fma.rn.f64 fd219, fd214, fd185, fd218; +mul.f64 fd220, fd201, fd212; +mul.f64 fd221, fd202, fd214; +sub.f64 fd222, fd220, fd221; +mul.f64 fd223, fd201, fd214; +fma.rn.f64 fd224, fd202, fd212, fd223; +mul.f64 fd225, fd222, fd189; +mul.f64 fd226, fd224, fd190; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd222, fd190; +fma.rn.f64 fd229, fd224, fd189, fd228; +mul.f64 fd230, fd201, fd222; +mul.f64 fd231, fd202, fd224; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd201, fd224; +fma.rn.f64 fd234, fd202, fd222, fd233; +mul.f64 fd235, fd232, fd193; +mul.f64 fd236, fd234, fd194; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd232, fd194; +fma.rn.f64 fd239, fd234, fd193, fd238; +mul.f64 fd240, fd201, fd232; +mul.f64 fd241, fd202, fd234; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd201, fd234; +fma.rn.f64 fd244, fd202, fd232, fd243; +mul.f64 fd245, fd242, fd197; +mul.f64 fd246, fd244, fd198; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd198; +fma.rn.f64 fd249, fd244, fd197, fd248; +mul.f64 fd250, fd201, fd242; +mul.f64 fd251, fd202, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd201, fd244; +fma.rn.f64 fd254, fd202, fd242, fd253; +mul.f64 fd255, fd252, fd179; +mul.f64 fd256, fd254, fd180; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd180; +fma.rn.f64 fd259, fd254, fd179, fd258; +ld.global.v2.f64 {fd260, fd261}, [rd6+2304]; +mul.f64 fd264, fd260, fd183; +mul.f64 fd265, fd261, fd184; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd260, fd184; +fma.rn.f64 fd268, fd261, fd183, fd267; +mul.f64 
fd269, fd201, fd260; +mul.f64 fd270, fd202, fd261; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd201, fd261; +fma.rn.f64 fd273, fd202, fd260, fd272; +mul.f64 fd274, fd271, fd187; +mul.f64 fd275, fd273, fd188; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd188; +fma.rn.f64 fd278, fd273, fd187, fd277; +mul.f64 fd279, fd201, fd271; +mul.f64 fd280, fd202, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd201, fd273; +fma.rn.f64 fd283, fd202, fd271, fd282; +mul.f64 fd284, fd281, fd191; +mul.f64 fd285, fd283, fd192; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd192; +fma.rn.f64 fd288, fd283, fd191, fd287; +mul.f64 fd289, fd201, fd281; +mul.f64 fd290, fd202, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd201, fd283; +fma.rn.f64 fd293, fd202, fd281, fd292; +mul.f64 fd294, fd291, fd195; +mul.f64 fd295, fd293, fd196; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd196; +fma.rn.f64 fd298, fd293, fd195, fd297; +mul.f64 fd299, fd201, fd291; +mul.f64 fd300, fd202, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd201, fd293; +fma.rn.f64 fd303, fd202, fd291, fd302; +mul.f64 fd304, fd301, fd199; +mul.f64 fd305, fd303, fd200; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd200; +fma.rn.f64 fd308, fd303, fd199, fd307; +mad.lo.s32 r8, r5, 13824, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +st.shared.v2.f64 [r9], {fd177, fd207}; +st.shared.v2.f64 [r9+16], {fd217, fd227}; +st.shared.v2.f64 [r9+32], {fd237, fd247}; +st.shared.v2.f64 [r9+48], {fd257, fd266}; +st.shared.v2.f64 [r9+64], {fd276, fd286}; +st.shared.v2.f64 [r9+80], {fd296, fd306}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.f64 fd309, [r10]; +ld.shared.f64 fd310, [r10+1152]; +ld.shared.f64 fd311, [r10+2304]; +ld.shared.f64 fd312, [r10+3456]; +ld.shared.f64 fd313, [r10+4608]; +ld.shared.f64 fd314, [r10+5760]; +ld.shared.f64 fd315, [r10+6912]; +ld.shared.f64 fd316, [r10+8064]; +ld.shared.f64 fd317, [r10+9216]; +ld.shared.f64 fd318, [r10+10368]; +ld.shared.f64 
fd319, [r10+11520]; +ld.shared.f64 fd320, [r10+12672]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd178, fd209}; +st.shared.v2.f64 [r9+16], {fd219, fd229}; +st.shared.v2.f64 [r9+32], {fd239, fd249}; +st.shared.v2.f64 [r9+48], {fd259, fd268}; +st.shared.v2.f64 [r9+64], {fd278, fd288}; +st.shared.v2.f64 [r9+80], {fd298, fd308}; +barrier.sync 0; +ld.shared.f64 fd321, [r10]; +ld.shared.f64 fd322, [r10+1152]; +ld.shared.f64 fd323, [r10+2304]; +ld.shared.f64 fd324, [r10+3456]; +ld.shared.f64 fd325, [r10+4608]; +ld.shared.f64 fd326, [r10+5760]; +ld.shared.f64 fd327, [r10+6912]; +ld.shared.f64 fd328, [r10+8064]; +ld.shared.f64 fd329, [r10+9216]; +ld.shared.f64 fd330, [r10+10368]; +ld.shared.f64 fd331, [r10+11520]; +ld.shared.f64 fd332, [r10+12672]; +add.f64 fd333, fd313, fd317; +add.f64 fd334, fd309, fd333; +add.f64 fd335, fd325, fd329; +add.f64 fd336, fd321, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd309, fd337; +sub.f64 fd339, fd325, fd329; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd321, fd343; +sub.f64 fd345, fd313, fd317; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd315, fd319; +add.f64 fd350, fd311, fd349; +add.f64 fd351, fd327, fd331; +add.f64 fd352, fd323, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd311, fd353; +sub.f64 fd355, fd327, fd331; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd323, fd359; +sub.f64 fd361, fd315, fd319; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.f64 fd365, fd357, 0d3FE0000000000000; +mul.f64 fd366, fd363, 0dBFEBB67AE8584CAA; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd363, 0d3FE0000000000000; +fma.rn.f64 fd369, fd357, 
0dBFEBB67AE8584CAA, fd368; +mul.f64 fd370, fd358, 0dBFE0000000000000; +mul.f64 fd371, fd364, 0dBFEBB67AE8584CAA; +sub.f64 fd372, fd370, fd371; +mul.f64 fd373, fd364, 0dBFE0000000000000; +fma.rn.f64 fd374, fd358, 0dBFEBB67AE8584CAA, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd336, fd352; +sub.f64 fd377, fd334, fd350; +sub.f64 fd378, fd336, fd352; +add.f64 fd379, fd341, fd367; +add.f64 fd380, fd347, fd369; +sub.f64 fd381, fd341, fd367; +sub.f64 fd382, fd347, fd369; +add.f64 fd383, fd342, fd372; +add.f64 fd384, fd348, fd374; +sub.f64 fd385, fd342, fd372; +sub.f64 fd386, fd348, fd374; +add.f64 fd387, fd314, fd318; +add.f64 fd388, fd310, fd387; +add.f64 fd389, fd326, fd330; +add.f64 fd390, fd322, fd389; +mul.f64 fd391, fd387, 0d3FE0000000000000; +sub.f64 fd392, fd310, fd391; +sub.f64 fd393, fd326, fd330; +mul.f64 fd394, fd393, 0d3FEBB67AE8584CAA; +add.f64 fd395, fd394, fd392; +sub.f64 fd396, fd392, fd394; +mul.f64 fd397, fd389, 0d3FE0000000000000; +sub.f64 fd398, fd322, fd397; +sub.f64 fd399, fd314, fd318; +mul.f64 fd400, fd399, 0d3FEBB67AE8584CAA; +sub.f64 fd401, fd398, fd400; +add.f64 fd402, fd400, fd398; +add.f64 fd403, fd316, fd320; +add.f64 fd404, fd312, fd403; +add.f64 fd405, fd328, fd332; +add.f64 fd406, fd324, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd312, fd407; +sub.f64 fd409, fd328, fd332; +mul.f64 fd410, fd409, 0d3FEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 0d3FE0000000000000; +sub.f64 fd414, fd324, fd413; +sub.f64 fd415, fd316, fd320; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +mul.f64 fd419, fd411, 0d3FE0000000000000; +mul.f64 fd420, fd417, 0dBFEBB67AE8584CAA; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd417, 0d3FE0000000000000; +fma.rn.f64 fd423, fd411, 0dBFEBB67AE8584CAA, fd422; +mul.f64 fd424, fd412, 0dBFE0000000000000; +mul.f64 fd425, fd418, 0dBFEBB67AE8584CAA; +sub.f64 fd426, fd424, fd425; +mul.f64 
fd427, fd418, 0dBFE0000000000000; +fma.rn.f64 fd428, fd412, 0dBFEBB67AE8584CAA, fd427; +add.f64 fd429, fd388, fd404; +add.f64 fd430, fd390, fd406; +sub.f64 fd431, fd388, fd404; +sub.f64 fd432, fd390, fd406; +add.f64 fd433, fd395, fd421; +add.f64 fd434, fd401, fd423; +sub.f64 fd435, fd395, fd421; +sub.f64 fd436, fd401, fd423; +add.f64 fd437, fd396, fd426; +add.f64 fd438, fd402, fd428; +sub.f64 fd439, fd396, fd426; +sub.f64 fd440, fd402, fd428; +mul.f64 fd441, fd433, 0d3FEBB67AE8584CAA; +mul.f64 fd442, fd434, 0dBFE0000000000000; +sub.f64 fd443, fd441, fd442; +mul.f64 fd444, fd434, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd445, fd433, 0dBFE0000000000000, fd444; +mul.f64 fd446, fd437, 0d3FE0000000000000; +mul.f64 fd447, fd438, 0dBFEBB67AE8584CAA; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd438, 0d3FE0000000000000; +fma.rn.f64 fd450, fd437, 0dBFEBB67AE8584CAA, fd449; +mul.f64 fd451, fd435, 0dBFE0000000000000; +mul.f64 fd452, fd436, 0dBFEBB67AE8584CAA; +sub.f64 fd453, fd451, fd452; +mul.f64 fd454, fd436, 0dBFE0000000000000; +fma.rn.f64 fd455, fd435, 0dBFEBB67AE8584CAA, fd454; +mul.f64 fd456, fd439, 0dBFEBB67AE8584CAA; +mul.f64 fd457, fd440, 0dBFE0000000000000; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd440, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd460, fd439, 0dBFE0000000000000, fd459; +add.f64 fd461, fd375, fd429; +add.f64 fd462, fd376, fd430; +sub.f64 fd463, fd375, fd429; +sub.f64 fd464, fd376, fd430; +add.f64 fd465, fd379, fd443; +add.f64 fd466, fd380, fd445; +sub.f64 fd467, fd379, fd443; +sub.f64 fd468, fd380, fd445; +add.f64 fd469, fd383, fd448; +add.f64 fd470, fd384, fd450; +sub.f64 fd471, fd383, fd448; +sub.f64 fd472, fd384, fd450; +add.f64 fd473, fd377, fd432; +sub.f64 fd474, fd378, fd431; +sub.f64 fd475, fd377, fd432; +add.f64 fd476, fd378, fd431; +add.f64 fd477, fd381, fd453; +add.f64 fd478, fd382, fd455; +sub.f64 fd479, fd381, fd453; +sub.f64 fd480, fd382, fd455; +add.f64 fd481, fd385, fd458; +add.f64 fd482, fd386, fd460; +sub.f64 fd483, fd385, fd458; +sub.f64 
fd484, fd386, fd460; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd485, fd486}, [rd11]; +mul.f64 fd489, fd485, fd465; +mul.f64 fd490, fd486, fd466; +sub.f64 fd491, fd489, fd490; +mul.f64 fd492, fd485, fd466; +fma.rn.f64 fd493, fd486, fd465, fd492; +mul.f64 fd494, fd485, fd485; +mul.f64 fd495, fd486, fd486; +sub.f64 fd496, fd494, fd495; +mul.f64 fd497, fd486, fd485; +fma.rn.f64 fd498, fd486, fd485, fd497; +mul.f64 fd499, fd496, fd469; +mul.f64 fd500, fd498, fd470; +sub.f64 fd501, fd499, fd500; +mul.f64 fd502, fd496, fd470; +fma.rn.f64 fd503, fd498, fd469, fd502; +mul.f64 fd504, fd485, fd496; +mul.f64 fd505, fd486, fd498; +sub.f64 fd506, fd504, fd505; +mul.f64 fd507, fd485, fd498; +fma.rn.f64 fd508, fd486, fd496, fd507; +mul.f64 fd509, fd506, fd473; +mul.f64 fd510, fd508, fd474; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, fd474; +fma.rn.f64 fd513, fd508, fd473, fd512; +mul.f64 fd514, fd485, fd506; +mul.f64 fd515, fd486, fd508; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd485, fd508; +fma.rn.f64 fd518, fd486, fd506, fd517; +mul.f64 fd519, fd516, fd477; +mul.f64 fd520, fd518, fd478; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd516, fd478; +fma.rn.f64 fd523, fd518, fd477, fd522; +mul.f64 fd524, fd485, fd516; +mul.f64 fd525, fd486, fd518; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd485, fd518; +fma.rn.f64 fd528, fd486, fd516, fd527; +mul.f64 fd529, fd526, fd481; +mul.f64 fd530, fd528, fd482; +sub.f64 fd531, fd529, fd530; +mul.f64 fd532, fd526, fd482; +fma.rn.f64 fd533, fd528, fd481, fd532; +mul.f64 fd534, fd485, fd526; +mul.f64 fd535, fd486, fd528; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd485, fd528; +fma.rn.f64 fd538, fd486, fd526, fd537; +mul.f64 fd539, fd536, fd463; +mul.f64 fd540, fd538, fd464; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd536, fd464; +fma.rn.f64 
fd543, fd538, fd463, fd542; +ld.global.v2.f64 {fd544, fd545}, [rd11+192]; +mul.f64 fd548, fd544, fd467; +mul.f64 fd549, fd545, fd468; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd544, fd468; +fma.rn.f64 fd552, fd545, fd467, fd551; +mul.f64 fd553, fd485, fd544; +mul.f64 fd554, fd486, fd545; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd485, fd545; +fma.rn.f64 fd557, fd486, fd544, fd556; +mul.f64 fd558, fd555, fd471; +mul.f64 fd559, fd557, fd472; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd555, fd472; +fma.rn.f64 fd562, fd557, fd471, fd561; +mul.f64 fd563, fd485, fd555; +mul.f64 fd564, fd486, fd557; +sub.f64 fd565, fd563, fd564; +mul.f64 fd566, fd485, fd557; +fma.rn.f64 fd567, fd486, fd555, fd566; +mul.f64 fd568, fd565, fd475; +mul.f64 fd569, fd567, fd476; +sub.f64 fd570, fd568, fd569; +mul.f64 fd571, fd565, fd476; +fma.rn.f64 fd572, fd567, fd475, fd571; +mul.f64 fd573, fd485, fd565; +mul.f64 fd574, fd486, fd567; +sub.f64 fd575, fd573, fd574; +mul.f64 fd576, fd485, fd567; +fma.rn.f64 fd577, fd486, fd565, fd576; +mul.f64 fd578, fd575, fd479; +mul.f64 fd579, fd577, fd480; +sub.f64 fd580, fd578, fd579; +mul.f64 fd581, fd575, fd480; +fma.rn.f64 fd582, fd577, fd479, fd581; +mul.f64 fd583, fd485, fd575; +mul.f64 fd584, fd486, fd577; +sub.f64 fd585, fd583, fd584; +mul.f64 fd586, fd485, fd577; +fma.rn.f64 fd587, fd486, fd575, fd586; +mul.f64 fd588, fd585, fd483; +mul.f64 fd589, fd587, fd484; +sub.f64 fd590, fd588, fd589; +mul.f64 fd591, fd585, fd484; +fma.rn.f64 fd592, fd587, fd483, fd591; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1152, r15; +st.shared.f64 [r16], fd461; +st.shared.f64 [r16+96], fd491; +st.shared.f64 [r16+192], fd501; +st.shared.f64 [r16+288], fd511; +st.shared.f64 [r16+384], fd521; +st.shared.f64 [r16+480], fd531; +st.shared.f64 [r16+576], fd541; +st.shared.f64 [r16+672], fd550; +st.shared.f64 [r16+768], fd560; +st.shared.f64 [r16+864], fd570; +st.shared.f64 [r16+960], fd580; +st.shared.f64 [r16+1056], 
fd590; +barrier.sync 0; +ld.shared.f64 fd593, [r10]; +ld.shared.f64 fd594, [r10+1152]; +ld.shared.f64 fd595, [r10+2304]; +ld.shared.f64 fd596, [r10+3456]; +ld.shared.f64 fd597, [r10+4608]; +ld.shared.f64 fd598, [r10+5760]; +ld.shared.f64 fd599, [r10+6912]; +ld.shared.f64 fd600, [r10+8064]; +ld.shared.f64 fd601, [r10+9216]; +ld.shared.f64 fd602, [r10+10368]; +ld.shared.f64 fd603, [r10+11520]; +ld.shared.f64 fd604, [r10+12672]; +barrier.sync 0; +st.shared.f64 [r16], fd462; +st.shared.f64 [r16+96], fd493; +st.shared.f64 [r16+192], fd503; +st.shared.f64 [r16+288], fd513; +st.shared.f64 [r16+384], fd523; +st.shared.f64 [r16+480], fd533; +st.shared.f64 [r16+576], fd543; +st.shared.f64 [r16+672], fd552; +st.shared.f64 [r16+768], fd562; +st.shared.f64 [r16+864], fd572; +st.shared.f64 [r16+960], fd582; +st.shared.f64 [r16+1056], fd592; +barrier.sync 0; +ld.shared.f64 fd605, [r10]; +ld.shared.f64 fd606, [r10+1152]; +ld.shared.f64 fd607, [r10+2304]; +ld.shared.f64 fd608, [r10+3456]; +ld.shared.f64 fd609, [r10+4608]; +ld.shared.f64 fd610, [r10+5760]; +ld.shared.f64 fd611, [r10+6912]; +ld.shared.f64 fd612, [r10+8064]; +ld.shared.f64 fd613, [r10+9216]; +ld.shared.f64 fd614, [r10+10368]; +ld.shared.f64 fd615, [r10+11520]; +ld.shared.f64 fd616, [r10+12672]; +add.f64 fd617, fd597, fd601; +add.f64 fd618, fd593, fd617; +add.f64 fd619, fd609, fd613; +add.f64 fd620, fd605, fd619; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd593, fd621; +sub.f64 fd623, fd609, fd613; +mul.f64 fd624, fd623, 0d3FEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +mul.f64 fd627, fd619, 0d3FE0000000000000; +sub.f64 fd628, fd605, fd627; +sub.f64 fd629, fd597, fd601; +mul.f64 fd630, fd629, 0d3FEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd599, fd603; +add.f64 fd634, fd595, fd633; +add.f64 fd635, fd611, fd615; +add.f64 fd636, fd607, fd635; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd595, fd637; +sub.f64 
fd639, fd611, fd615; +mul.f64 fd640, fd639, 0d3FEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +mul.f64 fd643, fd635, 0d3FE0000000000000; +sub.f64 fd644, fd607, fd643; +sub.f64 fd645, fd599, fd603; +mul.f64 fd646, fd645, 0d3FEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +mul.f64 fd649, fd641, 0d3FE0000000000000; +mul.f64 fd650, fd647, 0dBFEBB67AE8584CAA; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd647, 0d3FE0000000000000; +fma.rn.f64 fd653, fd641, 0dBFEBB67AE8584CAA, fd652; +mul.f64 fd654, fd642, 0dBFE0000000000000; +mul.f64 fd655, fd648, 0dBFEBB67AE8584CAA; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd648, 0dBFE0000000000000; +fma.rn.f64 fd658, fd642, 0dBFEBB67AE8584CAA, fd657; +add.f64 fd659, fd618, fd634; +add.f64 fd660, fd620, fd636; +sub.f64 fd661, fd618, fd634; +sub.f64 fd662, fd620, fd636; +add.f64 fd663, fd625, fd651; +add.f64 fd664, fd631, fd653; +sub.f64 fd665, fd625, fd651; +sub.f64 fd666, fd631, fd653; +add.f64 fd667, fd626, fd656; +add.f64 fd668, fd632, fd658; +sub.f64 fd669, fd626, fd656; +sub.f64 fd670, fd632, fd658; +add.f64 fd671, fd598, fd602; +add.f64 fd672, fd594, fd671; +add.f64 fd673, fd610, fd614; +add.f64 fd674, fd606, fd673; +mul.f64 fd675, fd671, 0d3FE0000000000000; +sub.f64 fd676, fd594, fd675; +sub.f64 fd677, fd610, fd614; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +add.f64 fd679, fd678, fd676; +sub.f64 fd680, fd676, fd678; +mul.f64 fd681, fd673, 0d3FE0000000000000; +sub.f64 fd682, fd606, fd681; +sub.f64 fd683, fd598, fd602; +mul.f64 fd684, fd683, 0d3FEBB67AE8584CAA; +sub.f64 fd685, fd682, fd684; +add.f64 fd686, fd684, fd682; +add.f64 fd687, fd600, fd604; +add.f64 fd688, fd596, fd687; +add.f64 fd689, fd612, fd616; +add.f64 fd690, fd608, fd689; +mul.f64 fd691, fd687, 0d3FE0000000000000; +sub.f64 fd692, fd596, fd691; +sub.f64 fd693, fd612, fd616; +mul.f64 fd694, fd693, 0d3FEBB67AE8584CAA; +add.f64 fd695, fd694, fd692; +sub.f64 fd696, fd692, fd694; +mul.f64 fd697, fd689, 
0d3FE0000000000000; +sub.f64 fd698, fd608, fd697; +sub.f64 fd699, fd600, fd604; +mul.f64 fd700, fd699, 0d3FEBB67AE8584CAA; +sub.f64 fd701, fd698, fd700; +add.f64 fd702, fd700, fd698; +mul.f64 fd703, fd695, 0d3FE0000000000000; +mul.f64 fd704, fd701, 0dBFEBB67AE8584CAA; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd701, 0d3FE0000000000000; +fma.rn.f64 fd707, fd695, 0dBFEBB67AE8584CAA, fd706; +mul.f64 fd708, fd696, 0dBFE0000000000000; +mul.f64 fd709, fd702, 0dBFEBB67AE8584CAA; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd702, 0dBFE0000000000000; +fma.rn.f64 fd712, fd696, 0dBFEBB67AE8584CAA, fd711; +add.f64 fd713, fd672, fd688; +add.f64 fd714, fd674, fd690; +sub.f64 fd715, fd672, fd688; +sub.f64 fd716, fd674, fd690; +add.f64 fd717, fd679, fd705; +add.f64 fd718, fd685, fd707; +sub.f64 fd719, fd679, fd705; +sub.f64 fd720, fd685, fd707; +add.f64 fd721, fd680, fd710; +add.f64 fd722, fd686, fd712; +sub.f64 fd723, fd680, fd710; +sub.f64 fd724, fd686, fd712; +mul.f64 fd725, fd717, 0d3FEBB67AE8584CAA; +mul.f64 fd726, fd718, 0dBFE0000000000000; +sub.f64 fd727, fd725, fd726; +mul.f64 fd728, fd718, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd729, fd717, 0dBFE0000000000000, fd728; +mul.f64 fd730, fd721, 0d3FE0000000000000; +mul.f64 fd731, fd722, 0dBFEBB67AE8584CAA; +sub.f64 fd732, fd730, fd731; +mul.f64 fd733, fd722, 0d3FE0000000000000; +fma.rn.f64 fd734, fd721, 0dBFEBB67AE8584CAA, fd733; +mul.f64 fd735, fd719, 0dBFE0000000000000; +mul.f64 fd736, fd720, 0dBFEBB67AE8584CAA; +sub.f64 fd737, fd735, fd736; +mul.f64 fd738, fd720, 0dBFE0000000000000; +fma.rn.f64 fd739, fd719, 0dBFEBB67AE8584CAA, fd738; +mul.f64 fd740, fd723, 0dBFEBB67AE8584CAA; +mul.f64 fd741, fd724, 0dBFE0000000000000; +sub.f64 fd742, fd740, fd741; +mul.f64 fd743, fd724, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd744, fd723, 0dBFE0000000000000, fd743; +add.f64 %0, fd659, fd713; +add.f64 %1, fd660, fd714; +add.f64 %3, fd664, fd729; +add.f64 %2, fd663, fd727; +add.f64 %5, fd668, fd734; +add.f64 %4, fd667, fd732; +sub.f64 %7, fd662, 
fd715; +add.f64 %6, fd661, fd716; +add.f64 %9, fd666, fd739; +add.f64 %8, fd665, fd737; +add.f64 %11, fd670, fd744; +add.f64 %10, fd669, fd742; +sub.f64 %12, fd659, fd713; +sub.f64 %13, fd660, fd714; +sub.f64 %15, fd664, fd729; +sub.f64 %14, fd663, fd727; +sub.f64 %17, fd668, fd734; +sub.f64 %16, fd667, fd732; +add.f64 %19, fd662, fd715; +sub.f64 %18, fd661, fd716; +sub.f64 %21, fd666, fd739; +sub.f64 %20, fd665, fd737; +sub.f64 %23, fd670, fd744; +sub.f64 %22, fd669, fd742; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_1728), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<574, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<817>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 27648, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %37, %48; +add.f64 fd50, %27, fd49; +add.f64 fd51, %39, %50; +add.f64 fd52, %28, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %27, fd53; +sub.f64 fd55, %39, %50; +mul.f64 fd56, fd55, 
0d3FEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %28, fd59; +sub.f64 fd61, %37, %48; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %43, %53; +add.f64 fd66, %32, fd65; +add.f64 fd67, %44, %55; +add.f64 fd68, %34, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %32, fd69; +sub.f64 fd71, %44, %55; +mul.f64 fd72, fd71, 0d3FEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %34, fd75; +sub.f64 fd77, %43, %53; +mul.f64 fd78, fd77, 0d3FEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0dBFEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0dBFEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0dBFEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %40, %51; +add.f64 fd104, %29, fd103; +add.f64 fd105, %42, %52; +add.f64 fd106, %31, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %29, fd107; +sub.f64 fd109, %42, %52; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %31, fd113; +sub.f64 fd115, %40, %51; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %45, %56; 
+add.f64 fd120, %35, fd119; +add.f64 fd121, %47, %57; +add.f64 fd122, %36, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %35, fd123; +sub.f64 fd125, %47, %57; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %36, fd129; +sub.f64 fd131, %45, %56; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0dBFEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0dBFEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0dBFEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0dBFE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0dBFE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0dBFEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0dBFEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0dBFEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0dBFE0000000000000; +sub.f64 
fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0dBFE0000000000000, fd175; +sub.f64 fd177, fd91, fd145; +sub.f64 fd178, fd92, fd146; +add.f64 fd179, fd95, fd159; +add.f64 fd180, fd96, fd161; +sub.f64 fd181, fd95, fd159; +sub.f64 fd182, fd96, fd161; +add.f64 fd183, fd99, fd164; +add.f64 fd184, fd100, fd166; +sub.f64 fd185, fd99, fd164; +sub.f64 fd186, fd100, fd166; +add.f64 fd187, fd93, fd148; +sub.f64 fd188, fd94, fd147; +sub.f64 fd189, fd93, fd148; +add.f64 fd190, fd94, fd147; +add.f64 fd191, fd97, fd169; +add.f64 fd192, fd98, fd171; +sub.f64 fd193, fd97, fd169; +sub.f64 fd194, fd98, fd171; +add.f64 fd195, fd101, fd174; +add.f64 fd196, fd102, fd176; +sub.f64 fd197, fd101, fd174; +sub.f64 fd198, fd102, fd176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 27648, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd199, fd200}, [rd6]; +mul.f64 fd203, fd199, fd179; +mul.f64 fd204, fd200, fd180; +mul.f64 fd205, fd199, fd180; +mul.f64 fd206, fd199, fd199; +mul.f64 fd207, fd200, fd200; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd200, fd199; +fma.rn.f64 fd210, fd200, fd199, fd209; +mul.f64 fd211, fd208, fd183; +mul.f64 fd212, fd210, fd184; +mul.f64 fd213, fd208, fd184; +mul.f64 fd214, fd199, fd208; +mul.f64 fd215, fd200, fd210; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd199, fd210; +fma.rn.f64 fd218, fd200, fd208, fd217; +mul.f64 fd219, fd216, fd187; +mul.f64 fd220, fd218, fd188; +mul.f64 fd221, fd216, fd188; +mul.f64 fd222, fd199, fd216; +mul.f64 fd223, fd200, fd218; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd199, fd218; +fma.rn.f64 fd226, fd200, fd216, fd225; +mul.f64 fd227, fd224, fd191; +mul.f64 fd228, fd226, fd192; +mul.f64 fd229, fd224, fd192; +mul.f64 fd230, fd199, fd224; +mul.f64 fd231, fd200, fd226; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd199, fd226; +fma.rn.f64 
fd234, fd200, fd224, fd233; +mul.f64 fd235, fd232, fd195; +mul.f64 fd236, fd234, fd196; +mul.f64 fd237, fd232, fd196; +mul.f64 fd238, fd199, fd232; +mul.f64 fd239, fd200, fd234; +sub.f64 fd240, fd238, fd239; +mul.f64 fd241, fd199, fd234; +fma.rn.f64 fd242, fd200, fd232, fd241; +mul.f64 fd243, fd240, fd177; +mul.f64 fd244, fd242, fd178; +mul.f64 fd245, fd240, fd178; +ld.global.v2.f64 {fd246, fd247}, [rd6+2304]; +mul.f64 fd250, fd246, fd181; +mul.f64 fd251, fd247, fd182; +mul.f64 fd252, fd246, fd182; +mul.f64 fd253, fd199, fd246; +mul.f64 fd254, fd200, fd247; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd199, fd247; +fma.rn.f64 fd257, fd200, fd246, fd256; +mul.f64 fd258, fd255, fd185; +mul.f64 fd259, fd257, fd186; +mul.f64 fd260, fd255, fd186; +mul.f64 fd261, fd199, fd255; +mul.f64 fd262, fd200, fd257; +sub.f64 fd263, fd261, fd262; +mul.f64 fd264, fd199, fd257; +fma.rn.f64 fd265, fd200, fd255, fd264; +mul.f64 fd266, fd263, fd189; +mul.f64 fd267, fd265, fd190; +mul.f64 fd268, fd263, fd190; +mul.f64 fd269, fd199, fd263; +mul.f64 fd270, fd200, fd265; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd199, fd265; +fma.rn.f64 fd273, fd200, fd263, fd272; +mul.f64 fd274, fd271, fd193; +mul.f64 fd275, fd273, fd194; +mul.f64 fd276, fd271, fd194; +mul.f64 fd277, fd199, fd271; +mul.f64 fd278, fd200, fd273; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd199, fd273; +fma.rn.f64 fd281, fd200, fd271, fd280; +mul.f64 fd282, fd279, fd197; +mul.f64 fd283, fd281, fd198; +mul.f64 fd284, fd279, fd198; +barrier.sync 0; +mad.lo.s32 r9, r7, 192, r8; +add.f64 fd285, fd92, fd146; +add.f64 fd286, fd91, fd145; +st.shared.v2.f64 [r9], {fd286, fd285}; +fma.rn.f64 fd287, fd200, fd179, fd205; +sub.f64 fd288, fd203, fd204; +st.shared.v2.f64 [r9+16], {fd288, fd287}; +fma.rn.f64 fd289, fd210, fd183, fd213; +sub.f64 fd290, fd211, fd212; +st.shared.v2.f64 [r9+32], {fd290, fd289}; +sub.f64 fd291, fd219, fd220; +fma.rn.f64 fd292, fd218, fd187, fd221; +st.shared.v2.f64 [r9+48], {fd291, fd292}; +sub.f64 
fd293, fd227, fd228; +fma.rn.f64 fd294, fd226, fd191, fd229; +st.shared.v2.f64 [r9+64], {fd293, fd294}; +fma.rn.f64 fd295, fd234, fd195, fd237; +sub.f64 fd296, fd235, fd236; +st.shared.v2.f64 [r9+80], {fd296, fd295}; +fma.rn.f64 fd297, fd242, fd177, fd245; +sub.f64 fd298, fd243, fd244; +st.shared.v2.f64 [r9+96], {fd298, fd297}; +fma.rn.f64 fd299, fd247, fd181, fd252; +sub.f64 fd300, fd250, fd251; +st.shared.v2.f64 [r9+112], {fd300, fd299}; +fma.rn.f64 fd301, fd257, fd185, fd260; +sub.f64 fd302, fd258, fd259; +st.shared.v2.f64 [r9+128], {fd302, fd301}; +sub.f64 fd303, fd266, fd267; +fma.rn.f64 fd304, fd265, fd189, fd268; +st.shared.v2.f64 [r9+144], {fd303, fd304}; +sub.f64 fd305, fd274, fd275; +fma.rn.f64 fd306, fd273, fd193, fd276; +st.shared.v2.f64 [r9+160], {fd305, fd306}; +fma.rn.f64 fd307, fd281, fd197, fd284; +sub.f64 fd308, fd282, fd283; +st.shared.v2.f64 [r9+176], {fd308, fd307}; +barrier.sync 0; +mad.lo.s32 r10, r7, -176, r9; +ld.shared.v2.f64 {fd309, fd310}, [r10]; +ld.shared.v2.f64 {fd313, fd314}, [r10+2304]; +ld.shared.v2.f64 {fd317, fd318}, [r10+4608]; +ld.shared.v2.f64 {fd321, fd322}, [r10+6912]; +ld.shared.v2.f64 {fd325, fd326}, [r10+9216]; +ld.shared.v2.f64 {fd329, fd330}, [r10+11520]; +ld.shared.v2.f64 {fd333, fd334}, [r10+13824]; +ld.shared.v2.f64 {fd337, fd338}, [r10+16128]; +ld.shared.v2.f64 {fd341, fd342}, [r10+18432]; +ld.shared.v2.f64 {fd345, fd346}, [r10+20736]; +ld.shared.v2.f64 {fd349, fd350}, [r10+23040]; +ld.shared.v2.f64 {fd353, fd354}, [r10+25344]; +add.f64 fd357, fd325, fd341; +add.f64 fd358, fd309, fd357; +add.f64 fd359, fd326, fd342; +add.f64 fd360, fd310, fd359; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, fd309, fd361; +sub.f64 fd363, fd326, fd342; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +mul.f64 fd367, fd359, 0d3FE0000000000000; +sub.f64 fd368, fd310, fd367; +sub.f64 fd369, fd325, fd341; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, 
fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, fd333, fd349; +add.f64 fd374, fd317, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd318, fd375; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, fd317, fd377; +sub.f64 fd379, fd334, fd350; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +mul.f64 fd383, fd375, 0d3FE0000000000000; +sub.f64 fd384, fd318, fd383; +sub.f64 fd385, fd333, fd349; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd389, fd381, 0d3FE0000000000000; +mul.f64 fd390, fd387, 0dBFEBB67AE8584CAA; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd387, 0d3FE0000000000000; +fma.rn.f64 fd393, fd381, 0dBFEBB67AE8584CAA, fd392; +mul.f64 fd394, fd382, 0dBFE0000000000000; +mul.f64 fd395, fd388, 0dBFEBB67AE8584CAA; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd388, 0dBFE0000000000000; +fma.rn.f64 fd398, fd382, 0dBFEBB67AE8584CAA, fd397; +add.f64 fd399, fd358, fd374; +add.f64 fd400, fd360, fd376; +sub.f64 fd401, fd358, fd374; +sub.f64 fd402, fd360, fd376; +add.f64 fd403, fd365, fd391; +add.f64 fd404, fd371, fd393; +sub.f64 fd405, fd365, fd391; +sub.f64 fd406, fd371, fd393; +add.f64 fd407, fd366, fd396; +add.f64 fd408, fd372, fd398; +sub.f64 fd409, fd366, fd396; +sub.f64 fd410, fd372, fd398; +add.f64 fd411, fd329, fd345; +add.f64 fd412, fd313, fd411; +add.f64 fd413, fd330, fd346; +add.f64 fd414, fd314, fd413; +mul.f64 fd415, fd411, 0d3FE0000000000000; +sub.f64 fd416, fd313, fd415; +sub.f64 fd417, fd330, fd346; +mul.f64 fd418, fd417, 0d3FEBB67AE8584CAA; +add.f64 fd419, fd418, fd416; +sub.f64 fd420, fd416, fd418; +mul.f64 fd421, fd413, 0d3FE0000000000000; +sub.f64 fd422, fd314, fd421; +sub.f64 fd423, fd329, fd345; +mul.f64 fd424, fd423, 0d3FEBB67AE8584CAA; +sub.f64 fd425, fd422, fd424; +add.f64 fd426, fd424, fd422; +add.f64 fd427, fd337, fd353; +add.f64 fd428, fd321, fd427; +add.f64 fd429, fd338, fd354; +add.f64 fd430, fd322, 
fd429; +mul.f64 fd431, fd427, 0d3FE0000000000000; +sub.f64 fd432, fd321, fd431; +sub.f64 fd433, fd338, fd354; +mul.f64 fd434, fd433, 0d3FEBB67AE8584CAA; +add.f64 fd435, fd434, fd432; +sub.f64 fd436, fd432, fd434; +mul.f64 fd437, fd429, 0d3FE0000000000000; +sub.f64 fd438, fd322, fd437; +sub.f64 fd439, fd337, fd353; +mul.f64 fd440, fd439, 0d3FEBB67AE8584CAA; +sub.f64 fd441, fd438, fd440; +add.f64 fd442, fd440, fd438; +mul.f64 fd443, fd435, 0d3FE0000000000000; +mul.f64 fd444, fd441, 0dBFEBB67AE8584CAA; +sub.f64 fd445, fd443, fd444; +mul.f64 fd446, fd441, 0d3FE0000000000000; +fma.rn.f64 fd447, fd435, 0dBFEBB67AE8584CAA, fd446; +mul.f64 fd448, fd436, 0dBFE0000000000000; +mul.f64 fd449, fd442, 0dBFEBB67AE8584CAA; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd442, 0dBFE0000000000000; +fma.rn.f64 fd452, fd436, 0dBFEBB67AE8584CAA, fd451; +add.f64 fd453, fd412, fd428; +add.f64 fd454, fd414, fd430; +sub.f64 fd455, fd412, fd428; +sub.f64 fd456, fd414, fd430; +add.f64 fd457, fd419, fd445; +add.f64 fd458, fd425, fd447; +sub.f64 fd459, fd419, fd445; +sub.f64 fd460, fd425, fd447; +add.f64 fd461, fd420, fd450; +add.f64 fd462, fd426, fd452; +sub.f64 fd463, fd420, fd450; +sub.f64 fd464, fd426, fd452; +mul.f64 fd465, fd457, 0d3FEBB67AE8584CAA; +mul.f64 fd466, fd458, 0dBFE0000000000000; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd458, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd469, fd457, 0dBFE0000000000000, fd468; +mul.f64 fd470, fd461, 0d3FE0000000000000; +mul.f64 fd471, fd462, 0dBFEBB67AE8584CAA; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd462, 0d3FE0000000000000; +fma.rn.f64 fd474, fd461, 0dBFEBB67AE8584CAA, fd473; +mul.f64 fd475, fd459, 0dBFE0000000000000; +mul.f64 fd476, fd460, 0dBFEBB67AE8584CAA; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd460, 0dBFE0000000000000; +fma.rn.f64 fd479, fd459, 0dBFEBB67AE8584CAA, fd478; +mul.f64 fd480, fd463, 0dBFEBB67AE8584CAA; +mul.f64 fd481, fd464, 0dBFE0000000000000; +sub.f64 fd482, fd480, fd481; +mul.f64 fd483, fd464, 0dBFEBB67AE8584CAA; 
+fma.rn.f64 fd484, fd463, 0dBFE0000000000000, fd483; +sub.f64 fd485, fd399, fd453; +sub.f64 fd486, fd400, fd454; +add.f64 fd487, fd403, fd467; +add.f64 fd488, fd404, fd469; +sub.f64 fd489, fd403, fd467; +sub.f64 fd490, fd404, fd469; +add.f64 fd491, fd407, fd472; +add.f64 fd492, fd408, fd474; +sub.f64 fd493, fd407, fd472; +sub.f64 fd494, fd408, fd474; +add.f64 fd495, fd401, fd456; +sub.f64 fd496, fd402, fd455; +sub.f64 fd497, fd401, fd456; +add.f64 fd498, fd402, fd455; +add.f64 fd499, fd405, fd477; +add.f64 fd500, fd406, fd479; +sub.f64 fd501, fd405, fd477; +sub.f64 fd502, fd406, fd479; +add.f64 fd503, fd409, fd482; +add.f64 fd504, fd410, fd484; +sub.f64 fd505, fd409, fd482; +sub.f64 fd506, fd410, fd484; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd507, fd508}, [rd11]; +mul.f64 fd511, fd507, fd487; +mul.f64 fd512, fd508, fd488; +mul.f64 fd513, fd507, fd488; +mul.f64 fd514, fd507, fd507; +mul.f64 fd515, fd508, fd508; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd508, fd507; +fma.rn.f64 fd518, fd508, fd507, fd517; +mul.f64 fd519, fd516, fd491; +mul.f64 fd520, fd518, fd492; +mul.f64 fd521, fd516, fd492; +mul.f64 fd522, fd507, fd516; +mul.f64 fd523, fd508, fd518; +sub.f64 fd524, fd522, fd523; +mul.f64 fd525, fd507, fd518; +fma.rn.f64 fd526, fd508, fd516, fd525; +mul.f64 fd527, fd524, fd495; +mul.f64 fd528, fd526, fd496; +mul.f64 fd529, fd524, fd496; +mul.f64 fd530, fd507, fd524; +mul.f64 fd531, fd508, fd526; +sub.f64 fd532, fd530, fd531; +mul.f64 fd533, fd507, fd526; +fma.rn.f64 fd534, fd508, fd524, fd533; +mul.f64 fd535, fd532, fd499; +mul.f64 fd536, fd534, fd500; +mul.f64 fd537, fd532, fd500; +mul.f64 fd538, fd507, fd532; +mul.f64 fd539, fd508, fd534; +sub.f64 fd540, fd538, fd539; +mul.f64 fd541, fd507, fd534; +fma.rn.f64 fd542, fd508, fd532, fd541; +mul.f64 fd543, fd540, fd503; +mul.f64 
fd544, fd542, fd504; +mul.f64 fd545, fd540, fd504; +mul.f64 fd546, fd507, fd540; +mul.f64 fd547, fd508, fd542; +sub.f64 fd548, fd546, fd547; +mul.f64 fd549, fd507, fd542; +fma.rn.f64 fd550, fd508, fd540, fd549; +mul.f64 fd551, fd548, fd485; +mul.f64 fd552, fd550, fd486; +mul.f64 fd553, fd548, fd486; +ld.global.v2.f64 {fd554, fd555}, [rd11+192]; +mul.f64 fd558, fd554, fd489; +mul.f64 fd559, fd555, fd490; +mul.f64 fd560, fd554, fd490; +mul.f64 fd561, fd507, fd554; +mul.f64 fd562, fd508, fd555; +sub.f64 fd563, fd561, fd562; +mul.f64 fd564, fd507, fd555; +fma.rn.f64 fd565, fd508, fd554, fd564; +mul.f64 fd566, fd563, fd493; +mul.f64 fd567, fd565, fd494; +mul.f64 fd568, fd563, fd494; +mul.f64 fd569, fd507, fd563; +mul.f64 fd570, fd508, fd565; +sub.f64 fd571, fd569, fd570; +mul.f64 fd572, fd507, fd565; +fma.rn.f64 fd573, fd508, fd563, fd572; +mul.f64 fd574, fd571, fd497; +mul.f64 fd575, fd573, fd498; +mul.f64 fd576, fd571, fd498; +mul.f64 fd577, fd507, fd571; +mul.f64 fd578, fd508, fd573; +sub.f64 fd579, fd577, fd578; +mul.f64 fd580, fd507, fd573; +fma.rn.f64 fd581, fd508, fd571, fd580; +mul.f64 fd582, fd579, fd501; +mul.f64 fd583, fd581, fd502; +mul.f64 fd584, fd579, fd502; +mul.f64 fd585, fd507, fd579; +mul.f64 fd586, fd508, fd581; +sub.f64 fd587, fd585, fd586; +mul.f64 fd588, fd507, fd581; +fma.rn.f64 fd589, fd508, fd579, fd588; +mul.f64 fd590, fd587, fd505; +mul.f64 fd591, fd589, fd506; +mul.f64 fd592, fd587, fd506; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2304, r15; +add.f64 fd593, fd400, fd454; +add.f64 fd594, fd399, fd453; +st.shared.v2.f64 [r16], {fd594, fd593}; +fma.rn.f64 fd595, fd508, fd487, fd513; +sub.f64 fd596, fd511, fd512; +st.shared.v2.f64 [r16+192], {fd596, fd595}; +fma.rn.f64 fd597, fd518, fd491, fd521; +sub.f64 fd598, fd519, fd520; +st.shared.v2.f64 [r16+384], {fd598, fd597}; +fma.rn.f64 fd599, fd526, fd495, fd529; +sub.f64 fd600, fd527, fd528; +st.shared.v2.f64 [r16+576], {fd600, fd599}; +fma.rn.f64 fd601, 
fd534, fd499, fd537; +sub.f64 fd602, fd535, fd536; +st.shared.v2.f64 [r16+768], {fd602, fd601}; +fma.rn.f64 fd603, fd542, fd503, fd545; +sub.f64 fd604, fd543, fd544; +st.shared.v2.f64 [r16+960], {fd604, fd603}; +fma.rn.f64 fd605, fd550, fd485, fd553; +sub.f64 fd606, fd551, fd552; +st.shared.v2.f64 [r16+1152], {fd606, fd605}; +fma.rn.f64 fd607, fd555, fd489, fd560; +sub.f64 fd608, fd558, fd559; +st.shared.v2.f64 [r16+1344], {fd608, fd607}; +fma.rn.f64 fd609, fd565, fd493, fd568; +sub.f64 fd610, fd566, fd567; +st.shared.v2.f64 [r16+1536], {fd610, fd609}; +fma.rn.f64 fd611, fd573, fd497, fd576; +sub.f64 fd612, fd574, fd575; +st.shared.v2.f64 [r16+1728], {fd612, fd611}; +fma.rn.f64 fd613, fd581, fd501, fd584; +sub.f64 fd614, fd582, fd583; +st.shared.v2.f64 [r16+1920], {fd614, fd613}; +fma.rn.f64 fd615, fd589, fd505, fd592; +sub.f64 fd616, fd590, fd591; +st.shared.v2.f64 [r16+2112], {fd616, fd615}; +barrier.sync 0; +ld.shared.v2.f64 {fd617, fd618}, [r10]; +ld.shared.v2.f64 {fd621, fd622}, [r10+2304]; +ld.shared.v2.f64 {fd625, fd626}, [r10+4608]; +ld.shared.v2.f64 {fd629, fd630}, [r10+6912]; +ld.shared.v2.f64 {fd633, fd634}, [r10+9216]; +ld.shared.v2.f64 {fd637, fd638}, [r10+11520]; +ld.shared.v2.f64 {fd641, fd642}, [r10+13824]; +ld.shared.v2.f64 {fd645, fd646}, [r10+16128]; +ld.shared.v2.f64 {fd649, fd650}, [r10+18432]; +ld.shared.v2.f64 {fd653, fd654}, [r10+20736]; +ld.shared.v2.f64 {fd657, fd658}, [r10+23040]; +ld.shared.v2.f64 {fd661, fd662}, [r10+25344]; +add.f64 fd665, fd633, fd649; +add.f64 fd666, fd617, fd665; +add.f64 fd667, fd634, fd650; +add.f64 fd668, fd618, fd667; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd617, fd669; +sub.f64 fd671, fd634, fd650; +mul.f64 fd672, fd671, 0d3FEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +mul.f64 fd675, fd667, 0d3FE0000000000000; +sub.f64 fd676, fd618, fd675; +sub.f64 fd677, fd633, fd649; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, 
fd678, fd676; +add.f64 fd681, fd641, fd657; +add.f64 fd682, fd625, fd681; +add.f64 fd683, fd642, fd658; +add.f64 fd684, fd626, fd683; +mul.f64 fd685, fd681, 0d3FE0000000000000; +sub.f64 fd686, fd625, fd685; +sub.f64 fd687, fd642, fd658; +mul.f64 fd688, fd687, 0d3FEBB67AE8584CAA; +add.f64 fd689, fd688, fd686; +sub.f64 fd690, fd686, fd688; +mul.f64 fd691, fd683, 0d3FE0000000000000; +sub.f64 fd692, fd626, fd691; +sub.f64 fd693, fd641, fd657; +mul.f64 fd694, fd693, 0d3FEBB67AE8584CAA; +sub.f64 fd695, fd692, fd694; +add.f64 fd696, fd694, fd692; +mul.f64 fd697, fd689, 0d3FE0000000000000; +mul.f64 fd698, fd695, 0dBFEBB67AE8584CAA; +sub.f64 fd699, fd697, fd698; +mul.f64 fd700, fd695, 0d3FE0000000000000; +fma.rn.f64 fd701, fd689, 0dBFEBB67AE8584CAA, fd700; +mul.f64 fd702, fd690, 0dBFE0000000000000; +mul.f64 fd703, fd696, 0dBFEBB67AE8584CAA; +sub.f64 fd704, fd702, fd703; +mul.f64 fd705, fd696, 0dBFE0000000000000; +fma.rn.f64 fd706, fd690, 0dBFEBB67AE8584CAA, fd705; +add.f64 fd707, fd666, fd682; +add.f64 fd708, fd668, fd684; +sub.f64 fd709, fd666, fd682; +sub.f64 fd710, fd668, fd684; +add.f64 fd711, fd673, fd699; +add.f64 fd712, fd679, fd701; +sub.f64 fd713, fd673, fd699; +sub.f64 fd714, fd679, fd701; +add.f64 fd715, fd674, fd704; +add.f64 fd716, fd680, fd706; +sub.f64 fd717, fd674, fd704; +sub.f64 fd718, fd680, fd706; +add.f64 fd719, fd637, fd653; +add.f64 fd720, fd621, fd719; +add.f64 fd721, fd638, fd654; +add.f64 fd722, fd622, fd721; +mul.f64 fd723, fd719, 0d3FE0000000000000; +sub.f64 fd724, fd621, fd723; +sub.f64 fd725, fd638, fd654; +mul.f64 fd726, fd725, 0d3FEBB67AE8584CAA; +add.f64 fd727, fd726, fd724; +sub.f64 fd728, fd724, fd726; +mul.f64 fd729, fd721, 0d3FE0000000000000; +sub.f64 fd730, fd622, fd729; +sub.f64 fd731, fd637, fd653; +mul.f64 fd732, fd731, 0d3FEBB67AE8584CAA; +sub.f64 fd733, fd730, fd732; +add.f64 fd734, fd732, fd730; +add.f64 fd735, fd645, fd661; +add.f64 fd736, fd629, fd735; +add.f64 fd737, fd646, fd662; +add.f64 fd738, fd630, fd737; +mul.f64 fd739, 
fd735, 0d3FE0000000000000; +sub.f64 fd740, fd629, fd739; +sub.f64 fd741, fd646, fd662; +mul.f64 fd742, fd741, 0d3FEBB67AE8584CAA; +add.f64 fd743, fd742, fd740; +sub.f64 fd744, fd740, fd742; +mul.f64 fd745, fd737, 0d3FE0000000000000; +sub.f64 fd746, fd630, fd745; +sub.f64 fd747, fd645, fd661; +mul.f64 fd748, fd747, 0d3FEBB67AE8584CAA; +sub.f64 fd749, fd746, fd748; +add.f64 fd750, fd748, fd746; +mul.f64 fd751, fd743, 0d3FE0000000000000; +mul.f64 fd752, fd749, 0dBFEBB67AE8584CAA; +sub.f64 fd753, fd751, fd752; +mul.f64 fd754, fd749, 0d3FE0000000000000; +fma.rn.f64 fd755, fd743, 0dBFEBB67AE8584CAA, fd754; +mul.f64 fd756, fd744, 0dBFE0000000000000; +mul.f64 fd757, fd750, 0dBFEBB67AE8584CAA; +sub.f64 fd758, fd756, fd757; +mul.f64 fd759, fd750, 0dBFE0000000000000; +fma.rn.f64 fd760, fd744, 0dBFEBB67AE8584CAA, fd759; +add.f64 fd761, fd720, fd736; +add.f64 fd762, fd722, fd738; +sub.f64 fd763, fd720, fd736; +sub.f64 fd764, fd722, fd738; +add.f64 fd765, fd727, fd753; +add.f64 fd766, fd733, fd755; +sub.f64 fd767, fd727, fd753; +sub.f64 fd768, fd733, fd755; +add.f64 fd769, fd728, fd758; +add.f64 fd770, fd734, fd760; +sub.f64 fd771, fd728, fd758; +sub.f64 fd772, fd734, fd760; +mul.f64 fd773, fd765, 0d3FEBB67AE8584CAA; +mul.f64 fd774, fd766, 0dBFE0000000000000; +sub.f64 fd775, fd773, fd774; +mul.f64 fd776, fd766, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd777, fd765, 0dBFE0000000000000, fd776; +mul.f64 fd778, fd769, 0d3FE0000000000000; +mul.f64 fd779, fd770, 0dBFEBB67AE8584CAA; +sub.f64 fd780, fd778, fd779; +mul.f64 fd781, fd770, 0d3FE0000000000000; +fma.rn.f64 fd782, fd769, 0dBFEBB67AE8584CAA, fd781; +mul.f64 fd783, fd767, 0dBFE0000000000000; +mul.f64 fd784, fd768, 0dBFEBB67AE8584CAA; +sub.f64 fd785, fd783, fd784; +mul.f64 fd786, fd768, 0dBFE0000000000000; +fma.rn.f64 fd787, fd767, 0dBFEBB67AE8584CAA, fd786; +mul.f64 fd788, fd771, 0dBFEBB67AE8584CAA; +mul.f64 fd789, fd772, 0dBFE0000000000000; +sub.f64 fd790, fd788, fd789; +mul.f64 fd791, fd772, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd792, 
fd771, 0dBFE0000000000000, fd791; +add.f64 %1, fd708, fd762; +add.f64 %0, fd707, fd761; +add.f64 %3, fd712, fd777; +add.f64 %2, fd711, fd775; +add.f64 %5, fd716, fd782; +add.f64 %4, fd715, fd780; +sub.f64 %7, fd710, fd763; +add.f64 %6, fd709, fd764; +add.f64 %9, fd714, fd787; +add.f64 %8, fd713, fd785; +add.f64 %11, fd718, fd792; +add.f64 %10, fd717, fd790; +sub.f64 %13, fd708, fd762; +sub.f64 %12, fd707, fd761; +sub.f64 %15, fd712, fd777; +sub.f64 %14, fd711, fd775; +sub.f64 %17, fd716, fd782; +sub.f64 %16, fd715, fd780; +add.f64 %19, fd710, fd763; +sub.f64 %18, fd709, fd764; +sub.f64 %21, fd714, fd787; +sub.f64 %20, fd713, fd785; +sub.f64 %23, fd718, fd792; +sub.f64 %22, fd717, fd790; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_1728), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_inv.hpp.inc new file mode 100644 index 
0000000000000..ccdf45b51192f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_1728_fp64_inv.hpp.inc @@ -0,0 +1,1546 @@ +#ifndef CUFFTDX_FFT_1728_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_1728_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<744, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<769>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 13824, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %37, %48; +add.f64 fd50, %27, fd49; +add.f64 fd51, %39, %50; +add.f64 fd52, %28, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %27, fd53; +sub.f64 fd55, %39, %50; +mul.f64 fd56, fd55, 0dBFEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %28, fd59; +sub.f64 fd61, %37, %48; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %43, %53; +add.f64 fd66, %32, fd65; +add.f64 fd67, %44, %55; +add.f64 fd68, %34, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %32, fd69; +sub.f64 fd71, %44, %55; +mul.f64 fd72, fd71, 0dBFEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %34, fd75; +sub.f64 fd77, %43, %53; +mul.f64 fd78, fd77, 0dBFEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0d3FEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0d3FEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0d3FEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; 
+sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %40, %51; +add.f64 fd104, %29, fd103; +add.f64 fd105, %42, %52; +add.f64 fd106, %31, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %29, fd107; +sub.f64 fd109, %42, %52; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %31, fd113; +sub.f64 fd115, %40, %51; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %45, %56; +add.f64 fd120, %35, fd119; +add.f64 fd121, %47, %57; +add.f64 fd122, %36, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %35, fd123; +sub.f64 fd125, %47, %57; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %36, fd129; +sub.f64 fd131, %45, %56; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0d3FEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0d3FEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0d3FEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, 
fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0d3FE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0d3FE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0d3FEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0d3FEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0d3FEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0d3FE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0d3FE0000000000000, fd175; +add.f64 fd177, fd91, fd145; +add.f64 fd178, fd92, fd146; +sub.f64 fd179, fd91, fd145; +sub.f64 fd180, fd92, fd146; +add.f64 fd181, fd95, fd159; +add.f64 fd182, fd96, fd161; +sub.f64 fd183, fd95, fd159; +sub.f64 fd184, fd96, fd161; +add.f64 fd185, fd99, fd164; +add.f64 fd186, fd100, fd166; +sub.f64 fd187, fd99, fd164; +sub.f64 fd188, fd100, fd166; +sub.f64 fd189, fd93, fd148; +add.f64 fd190, fd94, fd147; +add.f64 fd191, fd93, fd148; +sub.f64 fd192, fd94, fd147; +add.f64 fd193, fd97, fd169; +add.f64 fd194, fd98, fd171; +sub.f64 fd195, fd97, fd169; +sub.f64 fd196, fd98, fd171; +add.f64 fd197, fd101, fd174; +add.f64 fd198, fd102, fd176; +sub.f64 fd199, fd101, fd174; +sub.f64 fd200, fd102, fd176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd201, fd202}, [rd6]; +mul.f64 fd205, fd182, fd202; +fma.rn.f64 fd206, fd201, fd181, fd205; +mul.f64 fd207, fd181, fd202; +mul.f64 fd208, fd201, fd182; +sub.f64 
fd209, fd208, fd207; +mul.f64 fd210, fd201, fd201; +mul.f64 fd211, fd202, fd202; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, fd201; +fma.rn.f64 fd214, fd202, fd201, fd213; +mul.f64 fd215, fd186, fd214; +fma.rn.f64 fd216, fd212, fd185, fd215; +mul.f64 fd217, fd185, fd214; +mul.f64 fd218, fd212, fd186; +sub.f64 fd219, fd218, fd217; +mul.f64 fd220, fd201, fd212; +mul.f64 fd221, fd202, fd214; +sub.f64 fd222, fd220, fd221; +mul.f64 fd223, fd201, fd214; +fma.rn.f64 fd224, fd202, fd212, fd223; +mul.f64 fd225, fd190, fd224; +fma.rn.f64 fd226, fd222, fd189, fd225; +mul.f64 fd227, fd189, fd224; +mul.f64 fd228, fd222, fd190; +sub.f64 fd229, fd228, fd227; +mul.f64 fd230, fd201, fd222; +mul.f64 fd231, fd202, fd224; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd201, fd224; +fma.rn.f64 fd234, fd202, fd222, fd233; +mul.f64 fd235, fd194, fd234; +fma.rn.f64 fd236, fd232, fd193, fd235; +mul.f64 fd237, fd193, fd234; +mul.f64 fd238, fd232, fd194; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd201, fd232; +mul.f64 fd241, fd202, fd234; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd201, fd234; +fma.rn.f64 fd244, fd202, fd232, fd243; +mul.f64 fd245, fd198, fd244; +fma.rn.f64 fd246, fd242, fd197, fd245; +mul.f64 fd247, fd197, fd244; +mul.f64 fd248, fd242, fd198; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd201, fd242; +mul.f64 fd251, fd202, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd201, fd244; +fma.rn.f64 fd254, fd202, fd242, fd253; +mul.f64 fd255, fd180, fd254; +fma.rn.f64 fd256, fd252, fd179, fd255; +mul.f64 fd257, fd179, fd254; +mul.f64 fd258, fd252, fd180; +sub.f64 fd259, fd258, fd257; +ld.global.v2.f64 {fd260, fd261}, [rd6+2304]; +mul.f64 fd264, fd184, fd261; +fma.rn.f64 fd265, fd260, fd183, fd264; +mul.f64 fd266, fd183, fd261; +mul.f64 fd267, fd260, fd184; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd201, fd260; +mul.f64 fd270, fd202, fd261; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd201, fd261; +fma.rn.f64 fd273, fd202, fd260, fd272; +mul.f64 
fd274, fd188, fd273; +fma.rn.f64 fd275, fd271, fd187, fd274; +mul.f64 fd276, fd187, fd273; +mul.f64 fd277, fd271, fd188; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd201, fd271; +mul.f64 fd280, fd202, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd201, fd273; +fma.rn.f64 fd283, fd202, fd271, fd282; +mul.f64 fd284, fd192, fd283; +fma.rn.f64 fd285, fd281, fd191, fd284; +mul.f64 fd286, fd191, fd283; +mul.f64 fd287, fd281, fd192; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd201, fd281; +mul.f64 fd290, fd202, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd201, fd283; +fma.rn.f64 fd293, fd202, fd281, fd292; +mul.f64 fd294, fd196, fd293; +fma.rn.f64 fd295, fd291, fd195, fd294; +mul.f64 fd296, fd195, fd293; +mul.f64 fd297, fd291, fd196; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd201, fd291; +mul.f64 fd300, fd202, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd201, fd293; +fma.rn.f64 fd303, fd202, fd291, fd302; +mul.f64 fd304, fd200, fd303; +fma.rn.f64 fd305, fd301, fd199, fd304; +mul.f64 fd306, fd199, fd303; +mul.f64 fd307, fd301, fd200; +sub.f64 fd308, fd307, fd306; +mad.lo.s32 r8, r5, 13824, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +st.shared.v2.f64 [r9], {fd177, fd206}; +st.shared.v2.f64 [r9+16], {fd216, fd226}; +st.shared.v2.f64 [r9+32], {fd236, fd246}; +st.shared.v2.f64 [r9+48], {fd256, fd265}; +st.shared.v2.f64 [r9+64], {fd275, fd285}; +st.shared.v2.f64 [r9+80], {fd295, fd305}; +barrier.sync 0; +mad.lo.s32 r10, r7, -88, r9; +ld.shared.f64 fd309, [r10]; +ld.shared.f64 fd310, [r10+1152]; +ld.shared.f64 fd311, [r10+2304]; +ld.shared.f64 fd312, [r10+3456]; +ld.shared.f64 fd313, [r10+4608]; +ld.shared.f64 fd314, [r10+5760]; +ld.shared.f64 fd315, [r10+6912]; +ld.shared.f64 fd316, [r10+8064]; +ld.shared.f64 fd317, [r10+9216]; +ld.shared.f64 fd318, [r10+10368]; +ld.shared.f64 fd319, [r10+11520]; +ld.shared.f64 fd320, [r10+12672]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd178, fd209}; +st.shared.v2.f64 [r9+16], {fd219, fd229}; 
+st.shared.v2.f64 [r9+32], {fd239, fd249}; +st.shared.v2.f64 [r9+48], {fd259, fd268}; +st.shared.v2.f64 [r9+64], {fd278, fd288}; +st.shared.v2.f64 [r9+80], {fd298, fd308}; +barrier.sync 0; +ld.shared.f64 fd321, [r10]; +ld.shared.f64 fd322, [r10+1152]; +ld.shared.f64 fd323, [r10+2304]; +ld.shared.f64 fd324, [r10+3456]; +ld.shared.f64 fd325, [r10+4608]; +ld.shared.f64 fd326, [r10+5760]; +ld.shared.f64 fd327, [r10+6912]; +ld.shared.f64 fd328, [r10+8064]; +ld.shared.f64 fd329, [r10+9216]; +ld.shared.f64 fd330, [r10+10368]; +ld.shared.f64 fd331, [r10+11520]; +ld.shared.f64 fd332, [r10+12672]; +add.f64 fd333, fd313, fd317; +add.f64 fd334, fd309, fd333; +add.f64 fd335, fd325, fd329; +add.f64 fd336, fd321, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd309, fd337; +sub.f64 fd339, fd325, fd329; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd321, fd343; +sub.f64 fd345, fd313, fd317; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd315, fd319; +add.f64 fd350, fd311, fd349; +add.f64 fd351, fd327, fd331; +add.f64 fd352, fd323, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd311, fd353; +sub.f64 fd355, fd327, fd331; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd323, fd359; +sub.f64 fd361, fd315, fd319; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.f64 fd365, fd357, 0d3FE0000000000000; +mul.f64 fd366, fd363, 0d3FEBB67AE8584CAA; +sub.f64 fd367, fd365, fd366; +mul.f64 fd368, fd363, 0d3FE0000000000000; +fma.rn.f64 fd369, fd357, 0d3FEBB67AE8584CAA, fd368; +mul.f64 fd370, fd358, 0dBFE0000000000000; +mul.f64 fd371, fd364, 0d3FEBB67AE8584CAA; +sub.f64 fd372, fd370, fd371; +mul.f64 
fd373, fd364, 0dBFE0000000000000; +fma.rn.f64 fd374, fd358, 0d3FEBB67AE8584CAA, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, fd336, fd352; +sub.f64 fd377, fd334, fd350; +sub.f64 fd378, fd336, fd352; +add.f64 fd379, fd341, fd367; +add.f64 fd380, fd347, fd369; +sub.f64 fd381, fd341, fd367; +sub.f64 fd382, fd347, fd369; +add.f64 fd383, fd342, fd372; +add.f64 fd384, fd348, fd374; +sub.f64 fd385, fd342, fd372; +sub.f64 fd386, fd348, fd374; +add.f64 fd387, fd314, fd318; +add.f64 fd388, fd310, fd387; +add.f64 fd389, fd326, fd330; +add.f64 fd390, fd322, fd389; +mul.f64 fd391, fd387, 0d3FE0000000000000; +sub.f64 fd392, fd310, fd391; +sub.f64 fd393, fd326, fd330; +mul.f64 fd394, fd393, 0dBFEBB67AE8584CAA; +add.f64 fd395, fd394, fd392; +sub.f64 fd396, fd392, fd394; +mul.f64 fd397, fd389, 0d3FE0000000000000; +sub.f64 fd398, fd322, fd397; +sub.f64 fd399, fd314, fd318; +mul.f64 fd400, fd399, 0dBFEBB67AE8584CAA; +sub.f64 fd401, fd398, fd400; +add.f64 fd402, fd400, fd398; +add.f64 fd403, fd316, fd320; +add.f64 fd404, fd312, fd403; +add.f64 fd405, fd328, fd332; +add.f64 fd406, fd324, fd405; +mul.f64 fd407, fd403, 0d3FE0000000000000; +sub.f64 fd408, fd312, fd407; +sub.f64 fd409, fd328, fd332; +mul.f64 fd410, fd409, 0dBFEBB67AE8584CAA; +add.f64 fd411, fd410, fd408; +sub.f64 fd412, fd408, fd410; +mul.f64 fd413, fd405, 0d3FE0000000000000; +sub.f64 fd414, fd324, fd413; +sub.f64 fd415, fd316, fd320; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +sub.f64 fd417, fd414, fd416; +add.f64 fd418, fd416, fd414; +mul.f64 fd419, fd411, 0d3FE0000000000000; +mul.f64 fd420, fd417, 0d3FEBB67AE8584CAA; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd417, 0d3FE0000000000000; +fma.rn.f64 fd423, fd411, 0d3FEBB67AE8584CAA, fd422; +mul.f64 fd424, fd412, 0dBFE0000000000000; +mul.f64 fd425, fd418, 0d3FEBB67AE8584CAA; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd418, 0dBFE0000000000000; +fma.rn.f64 fd428, fd412, 0d3FEBB67AE8584CAA, fd427; +add.f64 fd429, fd388, fd404; +add.f64 fd430, fd390, fd406; 
+sub.f64 fd431, fd388, fd404; +sub.f64 fd432, fd390, fd406; +add.f64 fd433, fd395, fd421; +add.f64 fd434, fd401, fd423; +sub.f64 fd435, fd395, fd421; +sub.f64 fd436, fd401, fd423; +add.f64 fd437, fd396, fd426; +add.f64 fd438, fd402, fd428; +sub.f64 fd439, fd396, fd426; +sub.f64 fd440, fd402, fd428; +mul.f64 fd441, fd433, 0d3FEBB67AE8584CAA; +mul.f64 fd442, fd434, 0d3FE0000000000000; +sub.f64 fd443, fd441, fd442; +mul.f64 fd444, fd434, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd445, fd433, 0d3FE0000000000000, fd444; +mul.f64 fd446, fd437, 0d3FE0000000000000; +mul.f64 fd447, fd438, 0d3FEBB67AE8584CAA; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd438, 0d3FE0000000000000; +fma.rn.f64 fd450, fd437, 0d3FEBB67AE8584CAA, fd449; +mul.f64 fd451, fd435, 0dBFE0000000000000; +mul.f64 fd452, fd436, 0d3FEBB67AE8584CAA; +sub.f64 fd453, fd451, fd452; +mul.f64 fd454, fd436, 0dBFE0000000000000; +fma.rn.f64 fd455, fd435, 0d3FEBB67AE8584CAA, fd454; +mul.f64 fd456, fd439, 0dBFEBB67AE8584CAA; +mul.f64 fd457, fd440, 0d3FE0000000000000; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd440, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd460, fd439, 0d3FE0000000000000, fd459; +add.f64 fd461, fd375, fd429; +add.f64 fd462, fd376, fd430; +sub.f64 fd463, fd375, fd429; +sub.f64 fd464, fd376, fd430; +add.f64 fd465, fd379, fd443; +add.f64 fd466, fd380, fd445; +sub.f64 fd467, fd379, fd443; +sub.f64 fd468, fd380, fd445; +add.f64 fd469, fd383, fd448; +add.f64 fd470, fd384, fd450; +sub.f64 fd471, fd383, fd448; +sub.f64 fd472, fd384, fd450; +sub.f64 fd473, fd377, fd432; +add.f64 fd474, fd378, fd431; +add.f64 fd475, fd377, fd432; +sub.f64 fd476, fd378, fd431; +add.f64 fd477, fd381, fd453; +add.f64 fd478, fd382, fd455; +sub.f64 fd479, fd381, fd453; +sub.f64 fd480, fd382, fd455; +add.f64 fd481, fd385, fd458; +add.f64 fd482, fd386, fd460; +sub.f64 fd483, fd385, fd458; +sub.f64 fd484, fd386, fd460; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; 
+mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd485, fd486}, [rd11]; +mul.f64 fd489, fd466, fd486; +fma.rn.f64 fd490, fd485, fd465, fd489; +mul.f64 fd491, fd465, fd486; +mul.f64 fd492, fd485, fd466; +sub.f64 fd493, fd492, fd491; +mul.f64 fd494, fd485, fd485; +mul.f64 fd495, fd486, fd486; +sub.f64 fd496, fd494, fd495; +mul.f64 fd497, fd486, fd485; +fma.rn.f64 fd498, fd486, fd485, fd497; +mul.f64 fd499, fd470, fd498; +fma.rn.f64 fd500, fd496, fd469, fd499; +mul.f64 fd501, fd469, fd498; +mul.f64 fd502, fd496, fd470; +sub.f64 fd503, fd502, fd501; +mul.f64 fd504, fd485, fd496; +mul.f64 fd505, fd486, fd498; +sub.f64 fd506, fd504, fd505; +mul.f64 fd507, fd485, fd498; +fma.rn.f64 fd508, fd486, fd496, fd507; +mul.f64 fd509, fd474, fd508; +fma.rn.f64 fd510, fd506, fd473, fd509; +mul.f64 fd511, fd473, fd508; +mul.f64 fd512, fd506, fd474; +sub.f64 fd513, fd512, fd511; +mul.f64 fd514, fd485, fd506; +mul.f64 fd515, fd486, fd508; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd485, fd508; +fma.rn.f64 fd518, fd486, fd506, fd517; +mul.f64 fd519, fd478, fd518; +fma.rn.f64 fd520, fd516, fd477, fd519; +mul.f64 fd521, fd477, fd518; +mul.f64 fd522, fd516, fd478; +sub.f64 fd523, fd522, fd521; +mul.f64 fd524, fd485, fd516; +mul.f64 fd525, fd486, fd518; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd485, fd518; +fma.rn.f64 fd528, fd486, fd516, fd527; +mul.f64 fd529, fd482, fd528; +fma.rn.f64 fd530, fd526, fd481, fd529; +mul.f64 fd531, fd481, fd528; +mul.f64 fd532, fd526, fd482; +sub.f64 fd533, fd532, fd531; +mul.f64 fd534, fd485, fd526; +mul.f64 fd535, fd486, fd528; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd485, fd528; +fma.rn.f64 fd538, fd486, fd526, fd537; +mul.f64 fd539, fd464, fd538; +fma.rn.f64 fd540, fd536, fd463, fd539; +mul.f64 fd541, fd463, fd538; +mul.f64 fd542, fd536, fd464; +sub.f64 fd543, fd542, fd541; +ld.global.v2.f64 {fd544, fd545}, [rd11+192]; +mul.f64 fd548, fd468, fd545; +fma.rn.f64 fd549, fd544, fd467, fd548; 
+mul.f64 fd550, fd467, fd545; +mul.f64 fd551, fd544, fd468; +sub.f64 fd552, fd551, fd550; +mul.f64 fd553, fd485, fd544; +mul.f64 fd554, fd486, fd545; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd485, fd545; +fma.rn.f64 fd557, fd486, fd544, fd556; +mul.f64 fd558, fd472, fd557; +fma.rn.f64 fd559, fd555, fd471, fd558; +mul.f64 fd560, fd471, fd557; +mul.f64 fd561, fd555, fd472; +sub.f64 fd562, fd561, fd560; +mul.f64 fd563, fd485, fd555; +mul.f64 fd564, fd486, fd557; +sub.f64 fd565, fd563, fd564; +mul.f64 fd566, fd485, fd557; +fma.rn.f64 fd567, fd486, fd555, fd566; +mul.f64 fd568, fd476, fd567; +fma.rn.f64 fd569, fd565, fd475, fd568; +mul.f64 fd570, fd475, fd567; +mul.f64 fd571, fd565, fd476; +sub.f64 fd572, fd571, fd570; +mul.f64 fd573, fd485, fd565; +mul.f64 fd574, fd486, fd567; +sub.f64 fd575, fd573, fd574; +mul.f64 fd576, fd485, fd567; +fma.rn.f64 fd577, fd486, fd565, fd576; +mul.f64 fd578, fd480, fd577; +fma.rn.f64 fd579, fd575, fd479, fd578; +mul.f64 fd580, fd479, fd577; +mul.f64 fd581, fd575, fd480; +sub.f64 fd582, fd581, fd580; +mul.f64 fd583, fd485, fd575; +mul.f64 fd584, fd486, fd577; +sub.f64 fd585, fd583, fd584; +mul.f64 fd586, fd485, fd577; +fma.rn.f64 fd587, fd486, fd575, fd586; +mul.f64 fd588, fd484, fd587; +fma.rn.f64 fd589, fd585, fd483, fd588; +mul.f64 fd590, fd483, fd587; +mul.f64 fd591, fd585, fd484; +sub.f64 fd592, fd591, fd590; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 1152, r15; +st.shared.f64 [r16], fd461; +st.shared.f64 [r16+96], fd490; +st.shared.f64 [r16+192], fd500; +st.shared.f64 [r16+288], fd510; +st.shared.f64 [r16+384], fd520; +st.shared.f64 [r16+480], fd530; +st.shared.f64 [r16+576], fd540; +st.shared.f64 [r16+672], fd549; +st.shared.f64 [r16+768], fd559; +st.shared.f64 [r16+864], fd569; +st.shared.f64 [r16+960], fd579; +st.shared.f64 [r16+1056], fd589; +barrier.sync 0; +ld.shared.f64 fd593, [r10]; +ld.shared.f64 fd594, [r10+1152]; +ld.shared.f64 fd595, [r10+2304]; +ld.shared.f64 fd596, 
[r10+3456]; +ld.shared.f64 fd597, [r10+4608]; +ld.shared.f64 fd598, [r10+5760]; +ld.shared.f64 fd599, [r10+6912]; +ld.shared.f64 fd600, [r10+8064]; +ld.shared.f64 fd601, [r10+9216]; +ld.shared.f64 fd602, [r10+10368]; +ld.shared.f64 fd603, [r10+11520]; +ld.shared.f64 fd604, [r10+12672]; +barrier.sync 0; +st.shared.f64 [r16], fd462; +st.shared.f64 [r16+96], fd493; +st.shared.f64 [r16+192], fd503; +st.shared.f64 [r16+288], fd513; +st.shared.f64 [r16+384], fd523; +st.shared.f64 [r16+480], fd533; +st.shared.f64 [r16+576], fd543; +st.shared.f64 [r16+672], fd552; +st.shared.f64 [r16+768], fd562; +st.shared.f64 [r16+864], fd572; +st.shared.f64 [r16+960], fd582; +st.shared.f64 [r16+1056], fd592; +barrier.sync 0; +ld.shared.f64 fd605, [r10]; +ld.shared.f64 fd606, [r10+1152]; +ld.shared.f64 fd607, [r10+2304]; +ld.shared.f64 fd608, [r10+3456]; +ld.shared.f64 fd609, [r10+4608]; +ld.shared.f64 fd610, [r10+5760]; +ld.shared.f64 fd611, [r10+6912]; +ld.shared.f64 fd612, [r10+8064]; +ld.shared.f64 fd613, [r10+9216]; +ld.shared.f64 fd614, [r10+10368]; +ld.shared.f64 fd615, [r10+11520]; +ld.shared.f64 fd616, [r10+12672]; +add.f64 fd617, fd597, fd601; +add.f64 fd618, fd593, fd617; +add.f64 fd619, fd609, fd613; +add.f64 fd620, fd605, fd619; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd593, fd621; +sub.f64 fd623, fd609, fd613; +mul.f64 fd624, fd623, 0dBFEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +mul.f64 fd627, fd619, 0d3FE0000000000000; +sub.f64 fd628, fd605, fd627; +sub.f64 fd629, fd597, fd601; +mul.f64 fd630, fd629, 0dBFEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd599, fd603; +add.f64 fd634, fd595, fd633; +add.f64 fd635, fd611, fd615; +add.f64 fd636, fd607, fd635; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd595, fd637; +sub.f64 fd639, fd611, fd615; +mul.f64 fd640, fd639, 0dBFEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +mul.f64 fd643, fd635, 
0d3FE0000000000000; +sub.f64 fd644, fd607, fd643; +sub.f64 fd645, fd599, fd603; +mul.f64 fd646, fd645, 0dBFEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +mul.f64 fd649, fd641, 0d3FE0000000000000; +mul.f64 fd650, fd647, 0d3FEBB67AE8584CAA; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd647, 0d3FE0000000000000; +fma.rn.f64 fd653, fd641, 0d3FEBB67AE8584CAA, fd652; +mul.f64 fd654, fd642, 0dBFE0000000000000; +mul.f64 fd655, fd648, 0d3FEBB67AE8584CAA; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd648, 0dBFE0000000000000; +fma.rn.f64 fd658, fd642, 0d3FEBB67AE8584CAA, fd657; +add.f64 fd659, fd618, fd634; +add.f64 fd660, fd620, fd636; +sub.f64 fd661, fd618, fd634; +sub.f64 fd662, fd620, fd636; +add.f64 fd663, fd625, fd651; +add.f64 fd664, fd631, fd653; +sub.f64 fd665, fd625, fd651; +sub.f64 fd666, fd631, fd653; +add.f64 fd667, fd626, fd656; +add.f64 fd668, fd632, fd658; +sub.f64 fd669, fd626, fd656; +sub.f64 fd670, fd632, fd658; +add.f64 fd671, fd598, fd602; +add.f64 fd672, fd594, fd671; +add.f64 fd673, fd610, fd614; +add.f64 fd674, fd606, fd673; +mul.f64 fd675, fd671, 0d3FE0000000000000; +sub.f64 fd676, fd594, fd675; +sub.f64 fd677, fd610, fd614; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +add.f64 fd679, fd678, fd676; +sub.f64 fd680, fd676, fd678; +mul.f64 fd681, fd673, 0d3FE0000000000000; +sub.f64 fd682, fd606, fd681; +sub.f64 fd683, fd598, fd602; +mul.f64 fd684, fd683, 0dBFEBB67AE8584CAA; +sub.f64 fd685, fd682, fd684; +add.f64 fd686, fd684, fd682; +add.f64 fd687, fd600, fd604; +add.f64 fd688, fd596, fd687; +add.f64 fd689, fd612, fd616; +add.f64 fd690, fd608, fd689; +mul.f64 fd691, fd687, 0d3FE0000000000000; +sub.f64 fd692, fd596, fd691; +sub.f64 fd693, fd612, fd616; +mul.f64 fd694, fd693, 0dBFEBB67AE8584CAA; +add.f64 fd695, fd694, fd692; +sub.f64 fd696, fd692, fd694; +mul.f64 fd697, fd689, 0d3FE0000000000000; +sub.f64 fd698, fd608, fd697; +sub.f64 fd699, fd600, fd604; +mul.f64 fd700, fd699, 0dBFEBB67AE8584CAA; +sub.f64 fd701, fd698, 
fd700; +add.f64 fd702, fd700, fd698; +mul.f64 fd703, fd695, 0d3FE0000000000000; +mul.f64 fd704, fd701, 0d3FEBB67AE8584CAA; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd701, 0d3FE0000000000000; +fma.rn.f64 fd707, fd695, 0d3FEBB67AE8584CAA, fd706; +mul.f64 fd708, fd696, 0dBFE0000000000000; +mul.f64 fd709, fd702, 0d3FEBB67AE8584CAA; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd702, 0dBFE0000000000000; +fma.rn.f64 fd712, fd696, 0d3FEBB67AE8584CAA, fd711; +add.f64 fd713, fd672, fd688; +add.f64 fd714, fd674, fd690; +sub.f64 fd715, fd672, fd688; +sub.f64 fd716, fd674, fd690; +add.f64 fd717, fd679, fd705; +add.f64 fd718, fd685, fd707; +sub.f64 fd719, fd679, fd705; +sub.f64 fd720, fd685, fd707; +add.f64 fd721, fd680, fd710; +add.f64 fd722, fd686, fd712; +sub.f64 fd723, fd680, fd710; +sub.f64 fd724, fd686, fd712; +mul.f64 fd725, fd717, 0d3FEBB67AE8584CAA; +mul.f64 fd726, fd718, 0d3FE0000000000000; +sub.f64 fd727, fd725, fd726; +mul.f64 fd728, fd718, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd729, fd717, 0d3FE0000000000000, fd728; +mul.f64 fd730, fd721, 0d3FE0000000000000; +mul.f64 fd731, fd722, 0d3FEBB67AE8584CAA; +sub.f64 fd732, fd730, fd731; +mul.f64 fd733, fd722, 0d3FE0000000000000; +fma.rn.f64 fd734, fd721, 0d3FEBB67AE8584CAA, fd733; +mul.f64 fd735, fd719, 0dBFE0000000000000; +mul.f64 fd736, fd720, 0d3FEBB67AE8584CAA; +sub.f64 fd737, fd735, fd736; +mul.f64 fd738, fd720, 0dBFE0000000000000; +fma.rn.f64 fd739, fd719, 0d3FEBB67AE8584CAA, fd738; +mul.f64 fd740, fd723, 0dBFEBB67AE8584CAA; +mul.f64 fd741, fd724, 0d3FE0000000000000; +sub.f64 fd742, fd740, fd741; +mul.f64 fd743, fd724, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd744, fd723, 0d3FE0000000000000, fd743; +add.f64 %0, fd659, fd713; +add.f64 %1, fd660, fd714; +add.f64 %3, fd664, fd729; +add.f64 %2, fd663, fd727; +add.f64 %5, fd668, fd734; +add.f64 %4, fd667, fd732; +add.f64 %7, fd662, fd715; +sub.f64 %6, fd661, fd716; +add.f64 %9, fd666, fd739; +add.f64 %8, fd665, fd737; +add.f64 %11, fd670, fd744; +add.f64 %10, fd669, fd742; 
+sub.f64 %12, fd659, fd713; +sub.f64 %13, fd660, fd714; +sub.f64 %15, fd664, fd729; +sub.f64 %14, fd663, fd727; +sub.f64 %17, fd668, fd734; +sub.f64 %16, fd667, fd732; +sub.f64 %19, fd662, fd715; +add.f64 %18, fd661, fd716; +sub.f64 %21, fd666, fd739; +sub.f64 %20, fd665, fd737; +sub.f64 %23, fd670, fd744; +sub.f64 %22, fd669, fd742; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_1728), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<745, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<817>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %24; +mad.lo.s32 r3, r1, 27648, r2; +mov.u32 r4, %tid.x; +add.f64 fd49, %37, %48; +add.f64 fd50, %27, fd49; +add.f64 fd51, %39, %50; +add.f64 fd52, %28, fd51; +mul.f64 fd53, fd49, 0d3FE0000000000000; +sub.f64 fd54, %27, fd53; +sub.f64 fd55, %39, %50; +mul.f64 fd56, fd55, 0dBFEBB67AE8584CAA; +add.f64 fd57, fd56, fd54; +sub.f64 fd58, fd54, fd56; +mul.f64 fd59, fd51, 0d3FE0000000000000; +sub.f64 fd60, %28, fd59; +sub.f64 fd61, 
%37, %48; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +sub.f64 fd63, fd60, fd62; +add.f64 fd64, fd62, fd60; +add.f64 fd65, %43, %53; +add.f64 fd66, %32, fd65; +add.f64 fd67, %44, %55; +add.f64 fd68, %34, fd67; +mul.f64 fd69, fd65, 0d3FE0000000000000; +sub.f64 fd70, %32, fd69; +sub.f64 fd71, %44, %55; +mul.f64 fd72, fd71, 0dBFEBB67AE8584CAA; +add.f64 fd73, fd72, fd70; +sub.f64 fd74, fd70, fd72; +mul.f64 fd75, fd67, 0d3FE0000000000000; +sub.f64 fd76, %34, fd75; +sub.f64 fd77, %43, %53; +mul.f64 fd78, fd77, 0dBFEBB67AE8584CAA; +sub.f64 fd79, fd76, fd78; +add.f64 fd80, fd78, fd76; +mul.f64 fd81, fd73, 0d3FE0000000000000; +mul.f64 fd82, fd79, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd81, fd82; +mul.f64 fd84, fd79, 0d3FE0000000000000; +fma.rn.f64 fd85, fd73, 0d3FEBB67AE8584CAA, fd84; +mul.f64 fd86, fd74, 0dBFE0000000000000; +mul.f64 fd87, fd80, 0d3FEBB67AE8584CAA; +sub.f64 fd88, fd86, fd87; +mul.f64 fd89, fd80, 0dBFE0000000000000; +fma.rn.f64 fd90, fd74, 0d3FEBB67AE8584CAA, fd89; +add.f64 fd91, fd50, fd66; +add.f64 fd92, fd52, fd68; +sub.f64 fd93, fd50, fd66; +sub.f64 fd94, fd52, fd68; +add.f64 fd95, fd57, fd83; +add.f64 fd96, fd63, fd85; +sub.f64 fd97, fd57, fd83; +sub.f64 fd98, fd63, fd85; +add.f64 fd99, fd58, fd88; +add.f64 fd100, fd64, fd90; +sub.f64 fd101, fd58, fd88; +sub.f64 fd102, fd64, fd90; +add.f64 fd103, %40, %51; +add.f64 fd104, %29, fd103; +add.f64 fd105, %42, %52; +add.f64 fd106, %31, fd105; +mul.f64 fd107, fd103, 0d3FE0000000000000; +sub.f64 fd108, %29, fd107; +sub.f64 fd109, %42, %52; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd105, 0d3FE0000000000000; +sub.f64 fd114, %31, fd113; +sub.f64 fd115, %40, %51; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, %45, %56; +add.f64 fd120, %35, fd119; +add.f64 fd121, %47, %57; +add.f64 fd122, %36, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, %35, fd123; 
+sub.f64 fd125, %47, %57; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, %36, fd129; +sub.f64 fd131, %45, %56; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +mul.f64 fd135, fd127, 0d3FE0000000000000; +mul.f64 fd136, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd133, 0d3FE0000000000000; +fma.rn.f64 fd139, fd127, 0d3FEBB67AE8584CAA, fd138; +mul.f64 fd140, fd128, 0dBFE0000000000000; +mul.f64 fd141, fd134, 0d3FEBB67AE8584CAA; +sub.f64 fd142, fd140, fd141; +mul.f64 fd143, fd134, 0dBFE0000000000000; +fma.rn.f64 fd144, fd128, 0d3FEBB67AE8584CAA, fd143; +add.f64 fd145, fd104, fd120; +add.f64 fd146, fd106, fd122; +sub.f64 fd147, fd104, fd120; +sub.f64 fd148, fd106, fd122; +add.f64 fd149, fd111, fd137; +add.f64 fd150, fd117, fd139; +sub.f64 fd151, fd111, fd137; +sub.f64 fd152, fd117, fd139; +add.f64 fd153, fd112, fd142; +add.f64 fd154, fd118, fd144; +sub.f64 fd155, fd112, fd142; +sub.f64 fd156, fd118, fd144; +mul.f64 fd157, fd149, 0d3FEBB67AE8584CAA; +mul.f64 fd158, fd150, 0d3FE0000000000000; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd150, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd161, fd149, 0d3FE0000000000000, fd160; +mul.f64 fd162, fd153, 0d3FE0000000000000; +mul.f64 fd163, fd154, 0d3FEBB67AE8584CAA; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, 0d3FE0000000000000; +fma.rn.f64 fd166, fd153, 0d3FEBB67AE8584CAA, fd165; +mul.f64 fd167, fd151, 0dBFE0000000000000; +mul.f64 fd168, fd152, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd152, 0dBFE0000000000000; +fma.rn.f64 fd171, fd151, 0d3FEBB67AE8584CAA, fd170; +mul.f64 fd172, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd173, fd156, 0d3FE0000000000000; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd156, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd176, fd155, 0d3FE0000000000000, fd175; +sub.f64 fd177, fd91, fd145; +sub.f64 
fd178, fd92, fd146; +add.f64 fd179, fd95, fd159; +add.f64 fd180, fd96, fd161; +sub.f64 fd181, fd95, fd159; +sub.f64 fd182, fd96, fd161; +add.f64 fd183, fd99, fd164; +add.f64 fd184, fd100, fd166; +sub.f64 fd185, fd99, fd164; +sub.f64 fd186, fd100, fd166; +sub.f64 fd187, fd93, fd148; +add.f64 fd188, fd94, fd147; +add.f64 fd189, fd93, fd148; +sub.f64 fd190, fd94, fd147; +add.f64 fd191, fd97, fd169; +add.f64 fd192, fd98, fd171; +sub.f64 fd193, fd97, fd169; +sub.f64 fd194, fd98, fd171; +add.f64 fd195, fd101, fd174; +add.f64 fd196, fd102, fd176; +sub.f64 fd197, fd101, fd174; +sub.f64 fd198, fd102, fd176; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 37; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 144; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 27648, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %25; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd199, fd200}, [rd6]; +mul.f64 fd203, fd180, fd200; +mul.f64 fd204, fd179, fd200; +mul.f64 fd205, fd199, fd180; +mul.f64 fd206, fd199, fd199; +mul.f64 fd207, fd200, fd200; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd200, fd199; +fma.rn.f64 fd210, fd200, fd199, fd209; +mul.f64 fd211, fd184, fd210; +mul.f64 fd212, fd183, fd210; +mul.f64 fd213, fd208, fd184; +mul.f64 fd214, fd199, fd208; +mul.f64 fd215, fd200, fd210; +sub.f64 fd216, fd214, fd215; +mul.f64 fd217, fd199, fd210; +fma.rn.f64 fd218, fd200, fd208, fd217; +mul.f64 fd219, fd188, fd218; +mul.f64 fd220, fd187, fd218; +mul.f64 fd221, fd216, fd188; +mul.f64 fd222, fd199, fd216; +mul.f64 fd223, fd200, fd218; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd199, fd218; +fma.rn.f64 fd226, fd200, fd216, fd225; +mul.f64 fd227, fd192, fd226; +mul.f64 fd228, fd191, fd226; +mul.f64 fd229, fd224, fd192; +mul.f64 fd230, fd199, fd224; +mul.f64 fd231, fd200, fd226; +sub.f64 fd232, fd230, fd231; +mul.f64 fd233, fd199, fd226; +fma.rn.f64 fd234, fd200, fd224, fd233; +mul.f64 fd235, fd196, fd234; +mul.f64 fd236, fd195, fd234; +mul.f64 fd237, fd232, fd196; +mul.f64 fd238, fd199, fd232; 
+mul.f64 fd239, fd200, fd234; +sub.f64 fd240, fd238, fd239; +mul.f64 fd241, fd199, fd234; +fma.rn.f64 fd242, fd200, fd232, fd241; +mul.f64 fd243, fd178, fd242; +mul.f64 fd244, fd177, fd242; +mul.f64 fd245, fd240, fd178; +ld.global.v2.f64 {fd246, fd247}, [rd6+2304]; +mul.f64 fd250, fd182, fd247; +mul.f64 fd251, fd181, fd247; +mul.f64 fd252, fd246, fd182; +mul.f64 fd253, fd199, fd246; +mul.f64 fd254, fd200, fd247; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd199, fd247; +fma.rn.f64 fd257, fd200, fd246, fd256; +mul.f64 fd258, fd186, fd257; +mul.f64 fd259, fd185, fd257; +mul.f64 fd260, fd255, fd186; +mul.f64 fd261, fd199, fd255; +mul.f64 fd262, fd200, fd257; +sub.f64 fd263, fd261, fd262; +mul.f64 fd264, fd199, fd257; +fma.rn.f64 fd265, fd200, fd255, fd264; +mul.f64 fd266, fd190, fd265; +mul.f64 fd267, fd189, fd265; +mul.f64 fd268, fd263, fd190; +mul.f64 fd269, fd199, fd263; +mul.f64 fd270, fd200, fd265; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd199, fd265; +fma.rn.f64 fd273, fd200, fd263, fd272; +mul.f64 fd274, fd194, fd273; +mul.f64 fd275, fd193, fd273; +mul.f64 fd276, fd271, fd194; +mul.f64 fd277, fd199, fd271; +mul.f64 fd278, fd200, fd273; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd199, fd273; +fma.rn.f64 fd281, fd200, fd271, fd280; +mul.f64 fd282, fd198, fd281; +mul.f64 fd283, fd197, fd281; +mul.f64 fd284, fd279, fd198; +barrier.sync 0; +mad.lo.s32 r9, r7, 192, r8; +add.f64 fd285, fd92, fd146; +add.f64 fd286, fd91, fd145; +st.shared.v2.f64 [r9], {fd286, fd285}; +fma.rn.f64 fd287, fd199, fd179, fd203; +sub.f64 fd288, fd205, fd204; +st.shared.v2.f64 [r9+16], {fd287, fd288}; +fma.rn.f64 fd289, fd208, fd183, fd211; +sub.f64 fd290, fd213, fd212; +st.shared.v2.f64 [r9+32], {fd289, fd290}; +sub.f64 fd291, fd221, fd220; +fma.rn.f64 fd292, fd216, fd187, fd219; +st.shared.v2.f64 [r9+48], {fd292, fd291}; +sub.f64 fd293, fd229, fd228; +fma.rn.f64 fd294, fd224, fd191, fd227; +st.shared.v2.f64 [r9+64], {fd294, fd293}; +fma.rn.f64 fd295, fd232, fd195, fd235; 
+sub.f64 fd296, fd237, fd236; +st.shared.v2.f64 [r9+80], {fd295, fd296}; +fma.rn.f64 fd297, fd240, fd177, fd243; +sub.f64 fd298, fd245, fd244; +st.shared.v2.f64 [r9+96], {fd297, fd298}; +fma.rn.f64 fd299, fd246, fd181, fd250; +sub.f64 fd300, fd252, fd251; +st.shared.v2.f64 [r9+112], {fd299, fd300}; +fma.rn.f64 fd301, fd255, fd185, fd258; +sub.f64 fd302, fd260, fd259; +st.shared.v2.f64 [r9+128], {fd301, fd302}; +sub.f64 fd303, fd268, fd267; +fma.rn.f64 fd304, fd263, fd189, fd266; +st.shared.v2.f64 [r9+144], {fd304, fd303}; +sub.f64 fd305, fd276, fd275; +fma.rn.f64 fd306, fd271, fd193, fd274; +st.shared.v2.f64 [r9+160], {fd306, fd305}; +fma.rn.f64 fd307, fd279, fd197, fd282; +sub.f64 fd308, fd284, fd283; +st.shared.v2.f64 [r9+176], {fd307, fd308}; +barrier.sync 0; +mad.lo.s32 r10, r7, -176, r9; +ld.shared.v2.f64 {fd309, fd310}, [r10]; +ld.shared.v2.f64 {fd313, fd314}, [r10+2304]; +ld.shared.v2.f64 {fd317, fd318}, [r10+4608]; +ld.shared.v2.f64 {fd321, fd322}, [r10+6912]; +ld.shared.v2.f64 {fd325, fd326}, [r10+9216]; +ld.shared.v2.f64 {fd329, fd330}, [r10+11520]; +ld.shared.v2.f64 {fd333, fd334}, [r10+13824]; +ld.shared.v2.f64 {fd337, fd338}, [r10+16128]; +ld.shared.v2.f64 {fd341, fd342}, [r10+18432]; +ld.shared.v2.f64 {fd345, fd346}, [r10+20736]; +ld.shared.v2.f64 {fd349, fd350}, [r10+23040]; +ld.shared.v2.f64 {fd353, fd354}, [r10+25344]; +add.f64 fd357, fd325, fd341; +add.f64 fd358, fd309, fd357; +add.f64 fd359, fd326, fd342; +add.f64 fd360, fd310, fd359; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, fd309, fd361; +sub.f64 fd363, fd326, fd342; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +mul.f64 fd367, fd359, 0d3FE0000000000000; +sub.f64 fd368, fd310, fd367; +sub.f64 fd369, fd325, fd341; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, fd333, fd349; +add.f64 fd374, fd317, fd373; +add.f64 fd375, fd334, fd350; +add.f64 fd376, 
fd318, fd375; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, fd317, fd377; +sub.f64 fd379, fd334, fd350; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +mul.f64 fd383, fd375, 0d3FE0000000000000; +sub.f64 fd384, fd318, fd383; +sub.f64 fd385, fd333, fd349; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd389, fd381, 0d3FE0000000000000; +mul.f64 fd390, fd387, 0d3FEBB67AE8584CAA; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd387, 0d3FE0000000000000; +fma.rn.f64 fd393, fd381, 0d3FEBB67AE8584CAA, fd392; +mul.f64 fd394, fd382, 0dBFE0000000000000; +mul.f64 fd395, fd388, 0d3FEBB67AE8584CAA; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd388, 0dBFE0000000000000; +fma.rn.f64 fd398, fd382, 0d3FEBB67AE8584CAA, fd397; +add.f64 fd399, fd358, fd374; +add.f64 fd400, fd360, fd376; +sub.f64 fd401, fd358, fd374; +sub.f64 fd402, fd360, fd376; +add.f64 fd403, fd365, fd391; +add.f64 fd404, fd371, fd393; +sub.f64 fd405, fd365, fd391; +sub.f64 fd406, fd371, fd393; +add.f64 fd407, fd366, fd396; +add.f64 fd408, fd372, fd398; +sub.f64 fd409, fd366, fd396; +sub.f64 fd410, fd372, fd398; +add.f64 fd411, fd329, fd345; +add.f64 fd412, fd313, fd411; +add.f64 fd413, fd330, fd346; +add.f64 fd414, fd314, fd413; +mul.f64 fd415, fd411, 0d3FE0000000000000; +sub.f64 fd416, fd313, fd415; +sub.f64 fd417, fd330, fd346; +mul.f64 fd418, fd417, 0dBFEBB67AE8584CAA; +add.f64 fd419, fd418, fd416; +sub.f64 fd420, fd416, fd418; +mul.f64 fd421, fd413, 0d3FE0000000000000; +sub.f64 fd422, fd314, fd421; +sub.f64 fd423, fd329, fd345; +mul.f64 fd424, fd423, 0dBFEBB67AE8584CAA; +sub.f64 fd425, fd422, fd424; +add.f64 fd426, fd424, fd422; +add.f64 fd427, fd337, fd353; +add.f64 fd428, fd321, fd427; +add.f64 fd429, fd338, fd354; +add.f64 fd430, fd322, fd429; +mul.f64 fd431, fd427, 0d3FE0000000000000; +sub.f64 fd432, fd321, fd431; +sub.f64 fd433, fd338, fd354; +mul.f64 fd434, fd433, 
0dBFEBB67AE8584CAA; +add.f64 fd435, fd434, fd432; +sub.f64 fd436, fd432, fd434; +mul.f64 fd437, fd429, 0d3FE0000000000000; +sub.f64 fd438, fd322, fd437; +sub.f64 fd439, fd337, fd353; +mul.f64 fd440, fd439, 0dBFEBB67AE8584CAA; +sub.f64 fd441, fd438, fd440; +add.f64 fd442, fd440, fd438; +mul.f64 fd443, fd435, 0d3FE0000000000000; +mul.f64 fd444, fd441, 0d3FEBB67AE8584CAA; +sub.f64 fd445, fd443, fd444; +mul.f64 fd446, fd441, 0d3FE0000000000000; +fma.rn.f64 fd447, fd435, 0d3FEBB67AE8584CAA, fd446; +mul.f64 fd448, fd436, 0dBFE0000000000000; +mul.f64 fd449, fd442, 0d3FEBB67AE8584CAA; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd442, 0dBFE0000000000000; +fma.rn.f64 fd452, fd436, 0d3FEBB67AE8584CAA, fd451; +add.f64 fd453, fd412, fd428; +add.f64 fd454, fd414, fd430; +sub.f64 fd455, fd412, fd428; +sub.f64 fd456, fd414, fd430; +add.f64 fd457, fd419, fd445; +add.f64 fd458, fd425, fd447; +sub.f64 fd459, fd419, fd445; +sub.f64 fd460, fd425, fd447; +add.f64 fd461, fd420, fd450; +add.f64 fd462, fd426, fd452; +sub.f64 fd463, fd420, fd450; +sub.f64 fd464, fd426, fd452; +mul.f64 fd465, fd457, 0d3FEBB67AE8584CAA; +mul.f64 fd466, fd458, 0d3FE0000000000000; +sub.f64 fd467, fd465, fd466; +mul.f64 fd468, fd458, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd469, fd457, 0d3FE0000000000000, fd468; +mul.f64 fd470, fd461, 0d3FE0000000000000; +mul.f64 fd471, fd462, 0d3FEBB67AE8584CAA; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd462, 0d3FE0000000000000; +fma.rn.f64 fd474, fd461, 0d3FEBB67AE8584CAA, fd473; +mul.f64 fd475, fd459, 0dBFE0000000000000; +mul.f64 fd476, fd460, 0d3FEBB67AE8584CAA; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd460, 0dBFE0000000000000; +fma.rn.f64 fd479, fd459, 0d3FEBB67AE8584CAA, fd478; +mul.f64 fd480, fd463, 0dBFEBB67AE8584CAA; +mul.f64 fd481, fd464, 0d3FE0000000000000; +sub.f64 fd482, fd480, fd481; +mul.f64 fd483, fd464, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd484, fd463, 0d3FE0000000000000, fd483; +sub.f64 fd485, fd399, fd453; +sub.f64 fd486, fd400, fd454; +add.f64 fd487, 
fd403, fd467; +add.f64 fd488, fd404, fd469; +sub.f64 fd489, fd403, fd467; +sub.f64 fd490, fd404, fd469; +add.f64 fd491, fd407, fd472; +add.f64 fd492, fd408, fd474; +sub.f64 fd493, fd407, fd472; +sub.f64 fd494, fd408, fd474; +sub.f64 fd495, fd401, fd456; +add.f64 fd496, fd402, fd455; +add.f64 fd497, fd401, fd456; +sub.f64 fd498, fd402, fd455; +add.f64 fd499, fd405, fd477; +add.f64 fd500, fd406, fd479; +sub.f64 fd501, fd405, fd477; +sub.f64 fd502, fd406, fd479; +add.f64 fd503, fd409, fd482; +add.f64 fd504, fd410, fd484; +sub.f64 fd505, fd409, fd482; +sub.f64 fd506, fd410, fd484; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 12; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %26; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd507, fd508}, [rd11]; +mul.f64 fd511, fd488, fd508; +mul.f64 fd512, fd487, fd508; +mul.f64 fd513, fd507, fd488; +mul.f64 fd514, fd507, fd507; +mul.f64 fd515, fd508, fd508; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd508, fd507; +fma.rn.f64 fd518, fd508, fd507, fd517; +mul.f64 fd519, fd492, fd518; +mul.f64 fd520, fd491, fd518; +mul.f64 fd521, fd516, fd492; +mul.f64 fd522, fd507, fd516; +mul.f64 fd523, fd508, fd518; +sub.f64 fd524, fd522, fd523; +mul.f64 fd525, fd507, fd518; +fma.rn.f64 fd526, fd508, fd516, fd525; +mul.f64 fd527, fd496, fd526; +mul.f64 fd528, fd495, fd526; +mul.f64 fd529, fd524, fd496; +mul.f64 fd530, fd507, fd524; +mul.f64 fd531, fd508, fd526; +sub.f64 fd532, fd530, fd531; +mul.f64 fd533, fd507, fd526; +fma.rn.f64 fd534, fd508, fd524, fd533; +mul.f64 fd535, fd500, fd534; +mul.f64 fd536, fd499, fd534; +mul.f64 fd537, fd532, fd500; +mul.f64 fd538, fd507, fd532; +mul.f64 fd539, fd508, fd534; +sub.f64 fd540, fd538, fd539; +mul.f64 fd541, fd507, fd534; +fma.rn.f64 fd542, fd508, fd532, fd541; +mul.f64 fd543, fd504, fd542; +mul.f64 fd544, fd503, fd542; +mul.f64 fd545, fd540, fd504; +mul.f64 fd546, fd507, fd540; +mul.f64 fd547, fd508, fd542; +sub.f64 fd548, 
fd546, fd547; +mul.f64 fd549, fd507, fd542; +fma.rn.f64 fd550, fd508, fd540, fd549; +mul.f64 fd551, fd486, fd550; +mul.f64 fd552, fd485, fd550; +mul.f64 fd553, fd548, fd486; +ld.global.v2.f64 {fd554, fd555}, [rd11+192]; +mul.f64 fd558, fd490, fd555; +mul.f64 fd559, fd489, fd555; +mul.f64 fd560, fd554, fd490; +mul.f64 fd561, fd507, fd554; +mul.f64 fd562, fd508, fd555; +sub.f64 fd563, fd561, fd562; +mul.f64 fd564, fd507, fd555; +fma.rn.f64 fd565, fd508, fd554, fd564; +mul.f64 fd566, fd494, fd565; +mul.f64 fd567, fd493, fd565; +mul.f64 fd568, fd563, fd494; +mul.f64 fd569, fd507, fd563; +mul.f64 fd570, fd508, fd565; +sub.f64 fd571, fd569, fd570; +mul.f64 fd572, fd507, fd565; +fma.rn.f64 fd573, fd508, fd563, fd572; +mul.f64 fd574, fd498, fd573; +mul.f64 fd575, fd497, fd573; +mul.f64 fd576, fd571, fd498; +mul.f64 fd577, fd507, fd571; +mul.f64 fd578, fd508, fd573; +sub.f64 fd579, fd577, fd578; +mul.f64 fd580, fd507, fd573; +fma.rn.f64 fd581, fd508, fd571, fd580; +mul.f64 fd582, fd502, fd581; +mul.f64 fd583, fd501, fd581; +mul.f64 fd584, fd579, fd502; +mul.f64 fd585, fd507, fd579; +mul.f64 fd586, fd508, fd581; +sub.f64 fd587, fd585, fd586; +mul.f64 fd588, fd507, fd581; +fma.rn.f64 fd589, fd508, fd579, fd588; +mul.f64 fd590, fd506, fd589; +mul.f64 fd591, fd505, fd589; +mul.f64 fd592, fd587, fd506; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2304, r15; +add.f64 fd593, fd400, fd454; +add.f64 fd594, fd399, fd453; +st.shared.v2.f64 [r16], {fd594, fd593}; +fma.rn.f64 fd595, fd507, fd487, fd511; +sub.f64 fd596, fd513, fd512; +st.shared.v2.f64 [r16+192], {fd595, fd596}; +fma.rn.f64 fd597, fd516, fd491, fd519; +sub.f64 fd598, fd521, fd520; +st.shared.v2.f64 [r16+384], {fd597, fd598}; +fma.rn.f64 fd599, fd524, fd495, fd527; +sub.f64 fd600, fd529, fd528; +st.shared.v2.f64 [r16+576], {fd599, fd600}; +fma.rn.f64 fd601, fd532, fd499, fd535; +sub.f64 fd602, fd537, fd536; +st.shared.v2.f64 [r16+768], {fd601, fd602}; +fma.rn.f64 fd603, fd540, fd503, 
fd543; +sub.f64 fd604, fd545, fd544; +st.shared.v2.f64 [r16+960], {fd603, fd604}; +fma.rn.f64 fd605, fd548, fd485, fd551; +sub.f64 fd606, fd553, fd552; +st.shared.v2.f64 [r16+1152], {fd605, fd606}; +fma.rn.f64 fd607, fd554, fd489, fd558; +sub.f64 fd608, fd560, fd559; +st.shared.v2.f64 [r16+1344], {fd607, fd608}; +fma.rn.f64 fd609, fd563, fd493, fd566; +sub.f64 fd610, fd568, fd567; +st.shared.v2.f64 [r16+1536], {fd609, fd610}; +fma.rn.f64 fd611, fd571, fd497, fd574; +sub.f64 fd612, fd576, fd575; +st.shared.v2.f64 [r16+1728], {fd611, fd612}; +fma.rn.f64 fd613, fd579, fd501, fd582; +sub.f64 fd614, fd584, fd583; +st.shared.v2.f64 [r16+1920], {fd613, fd614}; +fma.rn.f64 fd615, fd587, fd505, fd590; +sub.f64 fd616, fd592, fd591; +st.shared.v2.f64 [r16+2112], {fd615, fd616}; +barrier.sync 0; +ld.shared.v2.f64 {fd617, fd618}, [r10]; +ld.shared.v2.f64 {fd621, fd622}, [r10+2304]; +ld.shared.v2.f64 {fd625, fd626}, [r10+4608]; +ld.shared.v2.f64 {fd629, fd630}, [r10+6912]; +ld.shared.v2.f64 {fd633, fd634}, [r10+9216]; +ld.shared.v2.f64 {fd637, fd638}, [r10+11520]; +ld.shared.v2.f64 {fd641, fd642}, [r10+13824]; +ld.shared.v2.f64 {fd645, fd646}, [r10+16128]; +ld.shared.v2.f64 {fd649, fd650}, [r10+18432]; +ld.shared.v2.f64 {fd653, fd654}, [r10+20736]; +ld.shared.v2.f64 {fd657, fd658}, [r10+23040]; +ld.shared.v2.f64 {fd661, fd662}, [r10+25344]; +add.f64 fd665, fd633, fd649; +add.f64 fd666, fd617, fd665; +add.f64 fd667, fd634, fd650; +add.f64 fd668, fd618, fd667; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd617, fd669; +sub.f64 fd671, fd634, fd650; +mul.f64 fd672, fd671, 0dBFEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +mul.f64 fd675, fd667, 0d3FE0000000000000; +sub.f64 fd676, fd618, fd675; +sub.f64 fd677, fd633, fd649; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +add.f64 fd681, fd641, fd657; +add.f64 fd682, fd625, fd681; +add.f64 fd683, fd642, fd658; +add.f64 fd684, fd626, 
fd683; +mul.f64 fd685, fd681, 0d3FE0000000000000; +sub.f64 fd686, fd625, fd685; +sub.f64 fd687, fd642, fd658; +mul.f64 fd688, fd687, 0dBFEBB67AE8584CAA; +add.f64 fd689, fd688, fd686; +sub.f64 fd690, fd686, fd688; +mul.f64 fd691, fd683, 0d3FE0000000000000; +sub.f64 fd692, fd626, fd691; +sub.f64 fd693, fd641, fd657; +mul.f64 fd694, fd693, 0dBFEBB67AE8584CAA; +sub.f64 fd695, fd692, fd694; +add.f64 fd696, fd694, fd692; +mul.f64 fd697, fd689, 0d3FE0000000000000; +mul.f64 fd698, fd695, 0d3FEBB67AE8584CAA; +sub.f64 fd699, fd697, fd698; +mul.f64 fd700, fd695, 0d3FE0000000000000; +fma.rn.f64 fd701, fd689, 0d3FEBB67AE8584CAA, fd700; +mul.f64 fd702, fd690, 0dBFE0000000000000; +mul.f64 fd703, fd696, 0d3FEBB67AE8584CAA; +sub.f64 fd704, fd702, fd703; +mul.f64 fd705, fd696, 0dBFE0000000000000; +fma.rn.f64 fd706, fd690, 0d3FEBB67AE8584CAA, fd705; +add.f64 fd707, fd666, fd682; +add.f64 fd708, fd668, fd684; +sub.f64 fd709, fd666, fd682; +sub.f64 fd710, fd668, fd684; +add.f64 fd711, fd673, fd699; +add.f64 fd712, fd679, fd701; +sub.f64 fd713, fd673, fd699; +sub.f64 fd714, fd679, fd701; +add.f64 fd715, fd674, fd704; +add.f64 fd716, fd680, fd706; +sub.f64 fd717, fd674, fd704; +sub.f64 fd718, fd680, fd706; +add.f64 fd719, fd637, fd653; +add.f64 fd720, fd621, fd719; +add.f64 fd721, fd638, fd654; +add.f64 fd722, fd622, fd721; +mul.f64 fd723, fd719, 0d3FE0000000000000; +sub.f64 fd724, fd621, fd723; +sub.f64 fd725, fd638, fd654; +mul.f64 fd726, fd725, 0dBFEBB67AE8584CAA; +add.f64 fd727, fd726, fd724; +sub.f64 fd728, fd724, fd726; +mul.f64 fd729, fd721, 0d3FE0000000000000; +sub.f64 fd730, fd622, fd729; +sub.f64 fd731, fd637, fd653; +mul.f64 fd732, fd731, 0dBFEBB67AE8584CAA; +sub.f64 fd733, fd730, fd732; +add.f64 fd734, fd732, fd730; +add.f64 fd735, fd645, fd661; +add.f64 fd736, fd629, fd735; +add.f64 fd737, fd646, fd662; +add.f64 fd738, fd630, fd737; +mul.f64 fd739, fd735, 0d3FE0000000000000; +sub.f64 fd740, fd629, fd739; +sub.f64 fd741, fd646, fd662; +mul.f64 fd742, fd741, 
0dBFEBB67AE8584CAA; +add.f64 fd743, fd742, fd740; +sub.f64 fd744, fd740, fd742; +mul.f64 fd745, fd737, 0d3FE0000000000000; +sub.f64 fd746, fd630, fd745; +sub.f64 fd747, fd645, fd661; +mul.f64 fd748, fd747, 0dBFEBB67AE8584CAA; +sub.f64 fd749, fd746, fd748; +add.f64 fd750, fd748, fd746; +mul.f64 fd751, fd743, 0d3FE0000000000000; +mul.f64 fd752, fd749, 0d3FEBB67AE8584CAA; +sub.f64 fd753, fd751, fd752; +mul.f64 fd754, fd749, 0d3FE0000000000000; +fma.rn.f64 fd755, fd743, 0d3FEBB67AE8584CAA, fd754; +mul.f64 fd756, fd744, 0dBFE0000000000000; +mul.f64 fd757, fd750, 0d3FEBB67AE8584CAA; +sub.f64 fd758, fd756, fd757; +mul.f64 fd759, fd750, 0dBFE0000000000000; +fma.rn.f64 fd760, fd744, 0d3FEBB67AE8584CAA, fd759; +add.f64 fd761, fd720, fd736; +add.f64 fd762, fd722, fd738; +sub.f64 fd763, fd720, fd736; +sub.f64 fd764, fd722, fd738; +add.f64 fd765, fd727, fd753; +add.f64 fd766, fd733, fd755; +sub.f64 fd767, fd727, fd753; +sub.f64 fd768, fd733, fd755; +add.f64 fd769, fd728, fd758; +add.f64 fd770, fd734, fd760; +sub.f64 fd771, fd728, fd758; +sub.f64 fd772, fd734, fd760; +mul.f64 fd773, fd765, 0d3FEBB67AE8584CAA; +mul.f64 fd774, fd766, 0d3FE0000000000000; +sub.f64 fd775, fd773, fd774; +mul.f64 fd776, fd766, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd777, fd765, 0d3FE0000000000000, fd776; +mul.f64 fd778, fd769, 0d3FE0000000000000; +mul.f64 fd779, fd770, 0d3FEBB67AE8584CAA; +sub.f64 fd780, fd778, fd779; +mul.f64 fd781, fd770, 0d3FE0000000000000; +fma.rn.f64 fd782, fd769, 0d3FEBB67AE8584CAA, fd781; +mul.f64 fd783, fd767, 0dBFE0000000000000; +mul.f64 fd784, fd768, 0d3FEBB67AE8584CAA; +sub.f64 fd785, fd783, fd784; +mul.f64 fd786, fd768, 0dBFE0000000000000; +fma.rn.f64 fd787, fd767, 0d3FEBB67AE8584CAA, fd786; +mul.f64 fd788, fd771, 0dBFEBB67AE8584CAA; +mul.f64 fd789, fd772, 0d3FE0000000000000; +sub.f64 fd790, fd788, fd789; +mul.f64 fd791, fd772, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd792, fd771, 0d3FE0000000000000, fd791; +add.f64 %1, fd708, fd762; +add.f64 %0, fd707, fd761; +add.f64 %3, fd712, fd777; 
+add.f64 %2, fd711, fd775; +add.f64 %5, fd716, fd782; +add.f64 %4, fd715, fd780; +add.f64 %7, fd710, fd763; +sub.f64 %6, fd709, fd764; +add.f64 %9, fd714, fd787; +add.f64 %8, fd713, fd785; +add.f64 %11, fd718, fd792; +add.f64 %10, fd717, fd790; +sub.f64 %13, fd708, fd762; +sub.f64 %12, fd707, fd761; +sub.f64 %15, fd712, fd777; +sub.f64 %14, fd711, fd775; +sub.f64 %17, fd716, fd782; +sub.f64 %16, fd715, fd780; +sub.f64 %19, fd710, fd763; +add.f64 %18, fd709, fd764; +sub.f64 %21, fd714, fd787; +sub.f64 %20, fd713, fd785; +sub.f64 %23, fd718, fd792; +sub.f64 %22, fd717, fd790; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y): "r"(smem), "l"(lut_dp_12_1728), "l"(lut_dp_12_144), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..30f897dff25d7 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_fwd.hpp.inc @@ -0,0 +1,2875 @@ +#ifndef CUFFTDX_FFT_17_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_17_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<749, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<273>; +.reg .b32 r<1793>; +.reg .f64 fd<257>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %36, %66; +} +{ +add.f16x2 r4, %37, %67; +} +{ +sub.f16x2 r7, %36, %66; +} +{ +sub.f16x2 r10, %37, %67; +} +{ +add.f16x2 r13, %38, %64; +} +{ +add.f16x2 r16, %39, %65; +} +{ +sub.f16x2 r19, %38, %64; +} +{ +sub.f16x2 r22, %39, %65; +} +{ +add.f16x2 r25, %40, %62; +} +{ +add.f16x2 r28, %41, %63; +} +{ +sub.f16x2 r31, %40, %62; +} +{ +sub.f16x2 r34, %41, %63; +} +{ +add.f16x2 r37, %42, %60; +} +{ +add.f16x2 r40, %43, %61; +} +{ +sub.f16x2 r43, %42, %60; +} +{ +sub.f16x2 r46, %43, %61; +} +{ +add.f16x2 r49, %44, %58; +} +{ +add.f16x2 r52, %45, %59; +} +{ +sub.f16x2 r55, %44, %58; +} +{ +sub.f16x2 r58, %45, %59; +} +{ +add.f16x2 r61, %46, %56; +} +{ +add.f16x2 r64, %47, %57; +} +{ +sub.f16x2 r67, %46, %56; +} +{ +sub.f16x2 r70, %47, %57; +} +{ +add.f16x2 r73, %48, %54; +} +{ +add.f16x2 r76, %49, %55; +} +{ +sub.f16x2 r79, %48, %54; +} +{ +sub.f16x2 r82, %49, %55; +} +{ +add.f16x2 r85, %50, %52; +} +{ +add.f16x2 r88, %51, %53; +} +{ +sub.f16x2 r91, %50, %52; +} +{ +sub.f16x2 r94, %51, %53; +} +{ +add.f16x2 r97, %34, r1; +} +{ +add.f16x2 r100, %35, r4; +} +{ +add.f16x2 r103, r97, r13; +} +{ +add.f16x2 r106, r100, r16; +} +{ +add.f16x2 r109, r103, r25; +} +{ +add.f16x2 r112, r106, r28; +} +{ +add.f16x2 r115, r109, r37; +} +{ +add.f16x2 r118, r112, r40; +} +{ +add.f16x2 r121, r115, r49; +} +{ +add.f16x2 r124, r118, r52; +} +{ +add.f16x2 r127, r121, r61; +} +{ +add.f16x2 r130, r124, r64; +} +{ +add.f16x2 r133, r127, r73; +} +{ +add.f16x2 r136, r130, r76; +} +{ +add.f16x2 %0, r133, r85; +} 
+{ +add.f16x2 %1, r136, r88; +} +mov.u32 r1588, 0; +cvt.rn.f16.s32 rs1, r1588; +mov.b32 r157, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1588; +mov.b32 r169, {rs2, rs2}; +mov.f64 fd231, 0d3FEDD6D000370991; +{ +cvt.rn.f16.f64 rs3, fd231; +} +mov.b32 r149, {rs3, rs3}; +{ +mul.f16x2 r147, r1, r149; +} +{ +add.f16x2 r150, %34, r147; +} +mov.f64 fd212, 0dBFD71E955D8E7CDC; +{ +cvt.rn.f16.f64 rs4, fd212; +} +mov.b32 r155, {rs4, rs4}; +{ +mul.f16x2 r153, r10, r155; +} +{ +add.f16x2 r156, r157, r153; +} +{ +cvt.rn.f16.f64 rs5, fd231; +} +mov.b32 r161, {rs5, rs5}; +{ +mul.f16x2 r159, r4, r161; +} +{ +add.f16x2 r162, %35, r159; +} +{ +cvt.rn.f16.f64 rs6, fd212; +} +mov.b32 r167, {rs6, rs6}; +{ +mul.f16x2 r165, r7, r167; +} +{ +add.f16x2 r168, r169, r165; +} +mov.f64 fd239, 0d3FE7A5F6075D4884; +{ +cvt.rn.f16.f64 rs7, fd239; +} +mov.b32 r173, {rs7, rs7}; +{ +mul.f16x2 r171, r13, r173; +} +{ +add.f16x2 r174, r150, r171; +} +mov.f64 fd184, 0dBFE58EEA2A9D6DA3; +{ +cvt.rn.f16.f64 rs8, fd184; +} +mov.b32 r179, {rs8, rs8}; +{ +mul.f16x2 r177, r22, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ +cvt.rn.f16.f64 rs9, fd239; +} +mov.b32 r185, {rs9, rs9}; +{ +mul.f16x2 r183, r16, r185; +} +{ +add.f16x2 r186, r162, r183; +} +{ +cvt.rn.f16.f64 rs10, fd184; +} +mov.b32 r191, {rs10, rs10}; +{ +mul.f16x2 r189, r19, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd247, 0d3FDC86FA2B2883CD; +{ +cvt.rn.f16.f64 rs11, fd247; +} +mov.b32 r197, {rs11, rs11}; +{ +mul.f16x2 r195, r25, r197; +} +{ +add.f16x2 r198, r174, r195; +} +mov.f64 fd144, 0dBFECA52D7C9E640B; +{ +cvt.rn.f16.f64 rs12, fd144; +} +mov.b32 r203, {rs12, rs12}; +{ +mul.f16x2 r201, r34, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs13, fd247; +} +mov.b32 r209, {rs13, rs13}; +{ +mul.f16x2 r207, r28, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +cvt.rn.f16.f64 rs14, fd144; +} +mov.b32 r215, {rs14, rs14}; +{ +mul.f16x2 r213, r31, r215; +} +{ +add.f16x2 r216, r192, r213; +} +mov.f64 fd255, 0d3FB79EE63259B75E; +{ 
+cvt.rn.f16.f64 rs15, fd255; +} +mov.b32 r221, {rs15, rs15}; +{ +mul.f16x2 r219, r37, r221; +} +{ +add.f16x2 r222, r198, r219; +} +mov.f64 fd204, 0dBFEFDD0DEB564B22; +{ +cvt.rn.f16.f64 rs16, fd204; +} +mov.b32 r227, {rs16, rs16}; +{ +mul.f16x2 r225, r46, r227; +} +{ +add.f16x2 r228, r204, r225; +} +{ +cvt.rn.f16.f64 rs17, fd255; +} +mov.b32 r233, {rs17, rs17}; +{ +mul.f16x2 r231, r40, r233; +} +{ +add.f16x2 r234, r210, r231; +} +{ +cvt.rn.f16.f64 rs18, fd204; +} +mov.b32 r239, {rs18, rs18}; +{ +mul.f16x2 r237, r43, r239; +} +{ +add.f16x2 r240, r216, r237; +} +mov.f64 fd251, 0dBFD183B1C61F0D01; +{ +cvt.rn.f16.f64 rs19, fd251; +} +mov.b32 r245, {rs19, rs19}; +{ +mul.f16x2 r243, r49, r245; +} +{ +add.f16x2 r246, r222, r243; +} +mov.f64 fd252, 0dBFEEC746923C349F; +{ +cvt.rn.f16.f64 rs20, fd252; +} +mov.b32 r251, {rs20, rs20}; +{ +mul.f16x2 r249, r58, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +cvt.rn.f16.f64 rs21, fd251; +} +mov.b32 r257, {rs21, rs21}; +{ +mul.f16x2 r255, r52, r257; +} +{ +add.f16x2 r258, r234, r255; +} +{ +cvt.rn.f16.f64 rs22, fd252; +} +mov.b32 r263, {rs22, rs22}; +{ +mul.f16x2 r261, r55, r263; +} +{ +add.f16x2 r264, r240, r261; +} +mov.f64 fd243, 0dBFE348C86ED5F1BB; +{ +cvt.rn.f16.f64 rs23, fd243; +} +mov.b32 r269, {rs23, rs23}; +{ +mul.f16x2 r267, r61, r269; +} +{ +add.f16x2 r270, r246, r267; +} +mov.f64 fd244, 0dBFE9895B6C9A05F6; +{ +cvt.rn.f16.f64 rs24, fd244; +} +mov.b32 r275, {rs24, rs24}; +{ +mul.f16x2 r273, r70, r275; +} +{ +add.f16x2 r276, r252, r273; +} +{ +cvt.rn.f16.f64 rs25, fd243; +} +mov.b32 r281, {rs25, rs25}; +{ +mul.f16x2 r279, r64, r281; +} +{ +add.f16x2 r282, r258, r279; +} +{ +cvt.rn.f16.f64 rs26, fd244; +} +mov.b32 r287, {rs26, rs26}; +{ +mul.f16x2 r285, r67, r287; +} +{ +add.f16x2 r288, r264, r285; +} +mov.f64 fd235, 0dBFEB34FA910EA3B9; +{ +cvt.rn.f16.f64 rs27, fd235; +} +mov.b32 r293, {rs27, rs27}; +{ +mul.f16x2 r291, r73, r293; +} +{ +add.f16x2 r294, r270, r291; +} +mov.f64 fd236, 0dBFE0D8884363DD80; +{ +cvt.rn.f16.f64 
rs28, fd236; +} +mov.b32 r299, {rs28, rs28}; +{ +mul.f16x2 r297, r82, r299; +} +{ +add.f16x2 r300, r276, r297; +} +{ +cvt.rn.f16.f64 rs29, fd235; +} +mov.b32 r305, {rs29, rs29}; +{ +mul.f16x2 r303, r76, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs30, fd236; +} +mov.b32 r311, {rs30, rs30}; +{ +mul.f16x2 r309, r79, r311; +} +{ +add.f16x2 r312, r288, r309; +} +mov.f64 fd227, 0dBFEF7484007FAEF3; +{ +cvt.rn.f16.f64 rs31, fd227; +} +mov.b32 r317, {rs31, rs31}; +{ +mul.f16x2 r315, r85, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd228, 0dBFC7851AACD6C6B4; +{ +cvt.rn.f16.f64 rs32, fd228; +} +mov.b32 r323, {rs32, rs32}; +{ +mul.f16x2 r321, r94, r323; +} +{ +add.f16x2 r324, r300, r321; +} +{ +cvt.rn.f16.f64 rs33, fd227; +} +mov.b32 r329, {rs33, rs33}; +{ +mul.f16x2 r327, r88, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs34, fd228; +} +mov.b32 r335, {rs34, rs34}; +{ +mul.f16x2 r333, r91, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +sub.f16x2 %2, r318, r324; +} +{ +add.f16x2 %3, r330, r336; +} +{ +add.f16x2 %32, r318, r324; +} +{ +sub.f16x2 %33, r330, r336; +} +cvt.rn.f16.s32 rs35, r1588; +mov.b32 r363, {rs35, rs35}; +cvt.rn.f16.s32 rs36, r1588; +mov.b32 r375, {rs36, rs36}; +{ +cvt.rn.f16.f64 rs37, fd239; +} +mov.b32 r355, {rs37, rs37}; +{ +mul.f16x2 r353, r1, r355; +} +{ +add.f16x2 r356, %34, r353; +} +{ +cvt.rn.f16.f64 rs38, fd184; +} +mov.b32 r361, {rs38, rs38}; +{ +mul.f16x2 r359, r10, r361; +} +{ +add.f16x2 r362, r363, r359; +} +{ +cvt.rn.f16.f64 rs39, fd239; +} +mov.b32 r367, {rs39, rs39}; +{ +mul.f16x2 r365, r4, r367; +} +{ +add.f16x2 r368, %35, r365; +} +{ +cvt.rn.f16.f64 rs40, fd184; +} +mov.b32 r373, {rs40, rs40}; +{ +mul.f16x2 r371, r7, r373; +} +{ +add.f16x2 r374, r375, r371; +} +{ +cvt.rn.f16.f64 rs41, fd255; +} +mov.b32 r379, {rs41, rs41}; +{ +mul.f16x2 r377, r13, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs42, fd204; +} +mov.b32 r385, {rs42, rs42}; +{ +mul.f16x2 r383, r22, r385; +} +{ 
+add.f16x2 r386, r362, r383; +} +{ +cvt.rn.f16.f64 rs43, fd255; +} +mov.b32 r391, {rs43, rs43}; +{ +mul.f16x2 r389, r16, r391; +} +{ +add.f16x2 r392, r368, r389; +} +{ +cvt.rn.f16.f64 rs44, fd204; +} +mov.b32 r397, {rs44, rs44}; +{ +mul.f16x2 r395, r19, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs45, fd243; +} +mov.b32 r403, {rs45, rs45}; +{ +mul.f16x2 r401, r25, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs46, fd244; +} +mov.b32 r409, {rs46, rs46}; +{ +mul.f16x2 r407, r34, r409; +} +{ +add.f16x2 r410, r386, r407; +} +{ +cvt.rn.f16.f64 rs47, fd243; +} +mov.b32 r415, {rs47, rs47}; +{ +mul.f16x2 r413, r28, r415; +} +{ +add.f16x2 r416, r392, r413; +} +{ +cvt.rn.f16.f64 rs48, fd244; +} +mov.b32 r421, {rs48, rs48}; +{ +mul.f16x2 r419, r31, r421; +} +{ +add.f16x2 r422, r398, r419; +} +{ +cvt.rn.f16.f64 rs49, fd227; +} +mov.b32 r427, {rs49, rs49}; +{ +mul.f16x2 r425, r37, r427; +} +{ +add.f16x2 r428, r404, r425; +} +{ +cvt.rn.f16.f64 rs50, fd228; +} +mov.b32 r433, {rs50, rs50}; +{ +mul.f16x2 r431, r46, r433; +} +{ +add.f16x2 r434, r410, r431; +} +{ +cvt.rn.f16.f64 rs51, fd227; +} +mov.b32 r439, {rs51, rs51}; +{ +mul.f16x2 r437, r40, r439; +} +{ +add.f16x2 r440, r416, r437; +} +{ +cvt.rn.f16.f64 rs52, fd228; +} +mov.b32 r445, {rs52, rs52}; +{ +mul.f16x2 r443, r43, r445; +} +{ +add.f16x2 r446, r422, r443; +} +{ +cvt.rn.f16.f64 rs53, fd235; +} +mov.b32 r451, {rs53, rs53}; +{ +mul.f16x2 r449, r49, r451; +} +{ +add.f16x2 r452, r428, r449; +} +mov.f64 fd136, 0d3FE0D8884363DD80; +{ +cvt.rn.f16.f64 rs54, fd136; +} +mov.b32 r457, {rs54, rs54}; +{ +mul.f16x2 r455, r58, r457; +} +{ +add.f16x2 r458, r434, r455; +} +{ +cvt.rn.f16.f64 rs55, fd235; +} +mov.b32 r463, {rs55, rs55}; +{ +mul.f16x2 r461, r52, r463; +} +{ +add.f16x2 r464, r440, r461; +} +{ +cvt.rn.f16.f64 rs56, fd136; +} +mov.b32 r469, {rs56, rs56}; +{ +mul.f16x2 r467, r55, r469; +} +{ +add.f16x2 r470, r446, r467; +} +{ +cvt.rn.f16.f64 rs57, fd251; +} +mov.b32 r475, {rs57, rs57}; +{ 
+mul.f16x2 r473, r61, r475; +} +{ +add.f16x2 r476, r452, r473; +} +mov.f64 fd168, 0d3FEEC746923C349F; +{ +cvt.rn.f16.f64 rs58, fd168; +} +mov.b32 r481, {rs58, rs58}; +{ +mul.f16x2 r479, r70, r481; +} +{ +add.f16x2 r482, r458, r479; +} +{ +cvt.rn.f16.f64 rs59, fd251; +} +mov.b32 r487, {rs59, rs59}; +{ +mul.f16x2 r485, r64, r487; +} +{ +add.f16x2 r488, r464, r485; +} +{ +cvt.rn.f16.f64 rs60, fd168; +} +mov.b32 r493, {rs60, rs60}; +{ +mul.f16x2 r491, r67, r493; +} +{ +add.f16x2 r494, r470, r491; +} +{ +cvt.rn.f16.f64 rs61, fd247; +} +mov.b32 r499, {rs61, rs61}; +{ +mul.f16x2 r497, r73, r499; +} +{ +add.f16x2 r500, r476, r497; +} +mov.f64 fd248, 0d3FECA52D7C9E640B; +{ +cvt.rn.f16.f64 rs62, fd248; +} +mov.b32 r505, {rs62, rs62}; +{ +mul.f16x2 r503, r82, r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs63, fd247; +} +mov.b32 r511, {rs63, rs63}; +{ +mul.f16x2 r509, r76, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs64, fd248; +} +mov.b32 r517, {rs64, rs64}; +{ +mul.f16x2 r515, r79, r517; +} +{ +add.f16x2 r518, r494, r515; +} +{ +cvt.rn.f16.f64 rs65, fd231; +} +mov.b32 r523, {rs65, rs65}; +{ +mul.f16x2 r521, r85, r523; +} +{ +add.f16x2 r524, r500, r521; +} +mov.f64 fd232, 0d3FD71E955D8E7CDC; +{ +cvt.rn.f16.f64 rs66, fd232; +} +mov.b32 r529, {rs66, rs66}; +{ +mul.f16x2 r527, r94, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs67, fd231; +} +mov.b32 r535, {rs67, rs67}; +{ +mul.f16x2 r533, r88, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs68, fd232; +} +mov.b32 r541, {rs68, rs68}; +{ +mul.f16x2 r539, r91, r541; +} +{ +add.f16x2 r542, r518, r539; +} +{ +sub.f16x2 %4, r524, r530; +} +{ +add.f16x2 %5, r536, r542; +} +{ +add.f16x2 %30, r524, r530; +} +{ +sub.f16x2 %31, r536, r542; +} +cvt.rn.f16.s32 rs69, r1588; +mov.b32 r569, {rs69, rs69}; +cvt.rn.f16.s32 rs70, r1588; +mov.b32 r581, {rs70, rs70}; +{ +cvt.rn.f16.f64 rs71, fd247; +} +mov.b32 r561, {rs71, rs71}; +{ +mul.f16x2 r559, r1, r561; +} +{ +add.f16x2 
r562, %34, r559; +} +{ +cvt.rn.f16.f64 rs72, fd144; +} +mov.b32 r567, {rs72, rs72}; +{ +mul.f16x2 r565, r10, r567; +} +{ +add.f16x2 r568, r569, r565; +} +{ +cvt.rn.f16.f64 rs73, fd247; +} +mov.b32 r573, {rs73, rs73}; +{ +mul.f16x2 r571, r4, r573; +} +{ +add.f16x2 r574, %35, r571; +} +{ +cvt.rn.f16.f64 rs74, fd144; +} +mov.b32 r579, {rs74, rs74}; +{ +mul.f16x2 r577, r7, r579; +} +{ +add.f16x2 r580, r581, r577; +} +{ +cvt.rn.f16.f64 rs75, fd243; +} +mov.b32 r585, {rs75, rs75}; +{ +mul.f16x2 r583, r13, r585; +} +{ +add.f16x2 r586, r562, r583; +} +{ +cvt.rn.f16.f64 rs76, fd244; +} +mov.b32 r591, {rs76, rs76}; +{ +mul.f16x2 r589, r22, r591; +} +{ +add.f16x2 r592, r568, r589; +} +{ +cvt.rn.f16.f64 rs77, fd243; +} +mov.b32 r597, {rs77, rs77}; +{ +mul.f16x2 r595, r16, r597; +} +{ +add.f16x2 r598, r574, r595; +} +{ +cvt.rn.f16.f64 rs78, fd244; +} +mov.b32 r603, {rs78, rs78}; +{ +mul.f16x2 r601, r19, r603; +} +{ +add.f16x2 r604, r580, r601; +} +{ +cvt.rn.f16.f64 rs79, fd227; +} +mov.b32 r609, {rs79, rs79}; +{ +mul.f16x2 r607, r25, r609; +} +{ +add.f16x2 r610, r586, r607; +} +mov.f64 fd76, 0d3FC7851AACD6C6B4; +{ +cvt.rn.f16.f64 rs80, fd76; +} +mov.b32 r615, {rs80, rs80}; +{ +mul.f16x2 r613, r34, r615; +} +{ +add.f16x2 r616, r592, r613; +} +{ +cvt.rn.f16.f64 rs81, fd227; +} +mov.b32 r621, {rs81, rs81}; +{ +mul.f16x2 r619, r28, r621; +} +{ +add.f16x2 r622, r598, r619; +} +{ +cvt.rn.f16.f64 rs82, fd76; +} +mov.b32 r627, {rs82, rs82}; +{ +mul.f16x2 r625, r31, r627; +} +{ +add.f16x2 r628, r604, r625; +} +{ +cvt.rn.f16.f64 rs83, fd251; +} +mov.b32 r633, {rs83, rs83}; +{ +mul.f16x2 r631, r37, r633; +} +{ +add.f16x2 r634, r610, r631; +} +{ +cvt.rn.f16.f64 rs84, fd168; +} +mov.b32 r639, {rs84, rs84}; +{ +mul.f16x2 r637, r46, r639; +} +{ +add.f16x2 r640, r616, r637; +} +{ +cvt.rn.f16.f64 rs85, fd251; +} +mov.b32 r645, {rs85, rs85}; +{ +mul.f16x2 r643, r40, r645; +} +{ +add.f16x2 r646, r622, r643; +} +{ +cvt.rn.f16.f64 rs86, fd168; +} +mov.b32 r651, {rs86, rs86}; +{ +mul.f16x2 r649, 
r43, r651; +} +{ +add.f16x2 r652, r628, r649; +} +{ +cvt.rn.f16.f64 rs87, fd239; +} +mov.b32 r657, {rs87, rs87}; +{ +mul.f16x2 r655, r49, r657; +} +{ +add.f16x2 r658, r634, r655; +} +mov.f64 fd240, 0d3FE58EEA2A9D6DA3; +{ +cvt.rn.f16.f64 rs88, fd240; +} +mov.b32 r663, {rs88, rs88}; +{ +mul.f16x2 r661, r58, r663; +} +{ +add.f16x2 r664, r640, r661; +} +{ +cvt.rn.f16.f64 rs89, fd239; +} +mov.b32 r669, {rs89, rs89}; +{ +mul.f16x2 r667, r52, r669; +} +{ +add.f16x2 r670, r646, r667; +} +{ +cvt.rn.f16.f64 rs90, fd240; +} +mov.b32 r675, {rs90, rs90}; +{ +mul.f16x2 r673, r55, r675; +} +{ +add.f16x2 r676, r652, r673; +} +{ +cvt.rn.f16.f64 rs91, fd231; +} +mov.b32 r681, {rs91, rs91}; +{ +mul.f16x2 r679, r61, r681; +} +{ +add.f16x2 r682, r658, r679; +} +{ +cvt.rn.f16.f64 rs92, fd212; +} +mov.b32 r687, {rs92, rs92}; +{ +mul.f16x2 r685, r70, r687; +} +{ +add.f16x2 r688, r664, r685; +} +{ +cvt.rn.f16.f64 rs93, fd231; +} +mov.b32 r693, {rs93, rs93}; +{ +mul.f16x2 r691, r64, r693; +} +{ +add.f16x2 r694, r670, r691; +} +{ +cvt.rn.f16.f64 rs94, fd212; +} +mov.b32 r699, {rs94, rs94}; +{ +mul.f16x2 r697, r67, r699; +} +{ +add.f16x2 r700, r676, r697; +} +{ +cvt.rn.f16.f64 rs95, fd255; +} +mov.b32 r705, {rs95, rs95}; +{ +mul.f16x2 r703, r73, r705; +} +{ +add.f16x2 r706, r682, r703; +} +{ +cvt.rn.f16.f64 rs96, fd204; +} +mov.b32 r711, {rs96, rs96}; +{ +mul.f16x2 r709, r82, r711; +} +{ +add.f16x2 r712, r688, r709; +} +{ +cvt.rn.f16.f64 rs97, fd255; +} +mov.b32 r717, {rs97, rs97}; +{ +mul.f16x2 r715, r76, r717; +} +{ +add.f16x2 r718, r694, r715; +} +{ +cvt.rn.f16.f64 rs98, fd204; +} +mov.b32 r723, {rs98, rs98}; +{ +mul.f16x2 r721, r79, r723; +} +{ +add.f16x2 r724, r700, r721; +} +{ +cvt.rn.f16.f64 rs99, fd235; +} +mov.b32 r729, {rs99, rs99}; +{ +mul.f16x2 r727, r85, r729; +} +{ +add.f16x2 r730, r706, r727; +} +{ +cvt.rn.f16.f64 rs100, fd236; +} +mov.b32 r735, {rs100, rs100}; +{ +mul.f16x2 r733, r94, r735; +} +{ +add.f16x2 r736, r712, r733; +} +{ +cvt.rn.f16.f64 rs101, fd235; +} +mov.b32 
r741, {rs101, rs101}; +{ +mul.f16x2 r739, r88, r741; +} +{ +add.f16x2 r742, r718, r739; +} +{ +cvt.rn.f16.f64 rs102, fd236; +} +mov.b32 r747, {rs102, rs102}; +{ +mul.f16x2 r745, r91, r747; +} +{ +add.f16x2 r748, r724, r745; +} +{ +sub.f16x2 %6, r730, r736; +} +{ +add.f16x2 %7, r742, r748; +} +{ +add.f16x2 %28, r730, r736; +} +{ +sub.f16x2 %29, r742, r748; +} +cvt.rn.f16.s32 rs103, r1588; +mov.b32 r775, {rs103, rs103}; +cvt.rn.f16.s32 rs104, r1588; +mov.b32 r787, {rs104, rs104}; +{ +cvt.rn.f16.f64 rs105, fd255; +} +mov.b32 r767, {rs105, rs105}; +{ +mul.f16x2 r765, r1, r767; +} +{ +add.f16x2 r768, %34, r765; +} +{ +cvt.rn.f16.f64 rs106, fd204; +} +mov.b32 r773, {rs106, rs106}; +{ +mul.f16x2 r771, r10, r773; +} +{ +add.f16x2 r774, r775, r771; +} +{ +cvt.rn.f16.f64 rs107, fd255; +} +mov.b32 r779, {rs107, rs107}; +{ +mul.f16x2 r777, r4, r779; +} +{ +add.f16x2 r780, %35, r777; +} +{ +cvt.rn.f16.f64 rs108, fd204; +} +mov.b32 r785, {rs108, rs108}; +{ +mul.f16x2 r783, r7, r785; +} +{ +add.f16x2 r786, r787, r783; +} +{ +cvt.rn.f16.f64 rs109, fd227; +} +mov.b32 r791, {rs109, rs109}; +{ +mul.f16x2 r789, r13, r791; +} +{ +add.f16x2 r792, r768, r789; +} +{ +cvt.rn.f16.f64 rs110, fd228; +} +mov.b32 r797, {rs110, rs110}; +{ +mul.f16x2 r795, r22, r797; +} +{ +add.f16x2 r798, r774, r795; +} +{ +cvt.rn.f16.f64 rs111, fd227; +} +mov.b32 r803, {rs111, rs111}; +{ +mul.f16x2 r801, r16, r803; +} +{ +add.f16x2 r804, r780, r801; +} +{ +cvt.rn.f16.f64 rs112, fd228; +} +mov.b32 r809, {rs112, rs112}; +{ +mul.f16x2 r807, r19, r809; +} +{ +add.f16x2 r810, r786, r807; +} +{ +cvt.rn.f16.f64 rs113, fd251; +} +mov.b32 r815, {rs113, rs113}; +{ +mul.f16x2 r813, r25, r815; +} +{ +add.f16x2 r816, r792, r813; +} +{ +cvt.rn.f16.f64 rs114, fd168; +} +mov.b32 r821, {rs114, rs114}; +{ +mul.f16x2 r819, r34, r821; +} +{ +add.f16x2 r822, r798, r819; +} +{ +cvt.rn.f16.f64 rs115, fd251; +} +mov.b32 r827, {rs115, rs115}; +{ +mul.f16x2 r825, r28, r827; +} +{ +add.f16x2 r828, r804, r825; +} +{ +cvt.rn.f16.f64 rs116, 
fd168; +} +mov.b32 r833, {rs116, rs116}; +{ +mul.f16x2 r831, r31, r833; +} +{ +add.f16x2 r834, r810, r831; +} +{ +cvt.rn.f16.f64 rs117, fd231; +} +mov.b32 r839, {rs117, rs117}; +{ +mul.f16x2 r837, r37, r839; +} +{ +add.f16x2 r840, r816, r837; +} +{ +cvt.rn.f16.f64 rs118, fd232; +} +mov.b32 r845, {rs118, rs118}; +{ +mul.f16x2 r843, r46, r845; +} +{ +add.f16x2 r846, r822, r843; +} +{ +cvt.rn.f16.f64 rs119, fd231; +} +mov.b32 r851, {rs119, rs119}; +{ +mul.f16x2 r849, r40, r851; +} +{ +add.f16x2 r852, r828, r849; +} +{ +cvt.rn.f16.f64 rs120, fd232; +} +mov.b32 r857, {rs120, rs120}; +{ +mul.f16x2 r855, r43, r857; +} +{ +add.f16x2 r858, r834, r855; +} +{ +cvt.rn.f16.f64 rs121, fd247; +} +mov.b32 r863, {rs121, rs121}; +{ +mul.f16x2 r861, r49, r863; +} +{ +add.f16x2 r864, r840, r861; +} +{ +cvt.rn.f16.f64 rs122, fd144; +} +mov.b32 r869, {rs122, rs122}; +{ +mul.f16x2 r867, r58, r869; +} +{ +add.f16x2 r870, r846, r867; +} +{ +cvt.rn.f16.f64 rs123, fd247; +} +mov.b32 r875, {rs123, rs123}; +{ +mul.f16x2 r873, r52, r875; +} +{ +add.f16x2 r876, r852, r873; +} +{ +cvt.rn.f16.f64 rs124, fd144; +} +mov.b32 r881, {rs124, rs124}; +{ +mul.f16x2 r879, r55, r881; +} +{ +add.f16x2 r882, r858, r879; +} +{ +cvt.rn.f16.f64 rs125, fd235; +} +mov.b32 r887, {rs125, rs125}; +{ +mul.f16x2 r885, r61, r887; +} +{ +add.f16x2 r888, r864, r885; +} +{ +cvt.rn.f16.f64 rs126, fd236; +} +mov.b32 r893, {rs126, rs126}; +{ +mul.f16x2 r891, r70, r893; +} +{ +add.f16x2 r894, r870, r891; +} +{ +cvt.rn.f16.f64 rs127, fd235; +} +mov.b32 r899, {rs127, rs127}; +{ +mul.f16x2 r897, r64, r899; +} +{ +add.f16x2 r900, r876, r897; +} +{ +cvt.rn.f16.f64 rs128, fd236; +} +mov.b32 r905, {rs128, rs128}; +{ +mul.f16x2 r903, r67, r905; +} +{ +add.f16x2 r906, r882, r903; +} +{ +cvt.rn.f16.f64 rs129, fd243; +} +mov.b32 r911, {rs129, rs129}; +{ +mul.f16x2 r909, r73, r911; +} +{ +add.f16x2 r912, r888, r909; +} +mov.f64 fd208, 0d3FE9895B6C9A05F6; +{ +cvt.rn.f16.f64 rs130, fd208; +} +mov.b32 r917, {rs130, rs130}; +{ +mul.f16x2 
r915, r82, r917; +} +{ +add.f16x2 r918, r894, r915; +} +{ +cvt.rn.f16.f64 rs131, fd243; +} +mov.b32 r923, {rs131, rs131}; +{ +mul.f16x2 r921, r76, r923; +} +{ +add.f16x2 r924, r900, r921; +} +{ +cvt.rn.f16.f64 rs132, fd208; +} +mov.b32 r929, {rs132, rs132}; +{ +mul.f16x2 r927, r79, r929; +} +{ +add.f16x2 r930, r906, r927; +} +{ +cvt.rn.f16.f64 rs133, fd239; +} +mov.b32 r935, {rs133, rs133}; +{ +mul.f16x2 r933, r85, r935; +} +{ +add.f16x2 r936, r912, r933; +} +{ +cvt.rn.f16.f64 rs134, fd240; +} +mov.b32 r941, {rs134, rs134}; +{ +mul.f16x2 r939, r94, r941; +} +{ +add.f16x2 r942, r918, r939; +} +{ +cvt.rn.f16.f64 rs135, fd239; +} +mov.b32 r947, {rs135, rs135}; +{ +mul.f16x2 r945, r88, r947; +} +{ +add.f16x2 r948, r924, r945; +} +{ +cvt.rn.f16.f64 rs136, fd240; +} +mov.b32 r953, {rs136, rs136}; +{ +mul.f16x2 r951, r91, r953; +} +{ +add.f16x2 r954, r930, r951; +} +{ +sub.f16x2 %8, r936, r942; +} +{ +add.f16x2 %9, r948, r954; +} +{ +add.f16x2 %26, r936, r942; +} +{ +sub.f16x2 %27, r948, r954; +} +cvt.rn.f16.s32 rs137, r1588; +mov.b32 r981, {rs137, rs137}; +cvt.rn.f16.s32 rs138, r1588; +mov.b32 r993, {rs138, rs138}; +{ +cvt.rn.f16.f64 rs139, fd251; +} +mov.b32 r973, {rs139, rs139}; +{ +mul.f16x2 r971, r1, r973; +} +{ +add.f16x2 r974, %34, r971; +} +{ +cvt.rn.f16.f64 rs140, fd252; +} +mov.b32 r979, {rs140, rs140}; +{ +mul.f16x2 r977, r10, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +cvt.rn.f16.f64 rs141, fd251; +} +mov.b32 r985, {rs141, rs141}; +{ +mul.f16x2 r983, r4, r985; +} +{ +add.f16x2 r986, %35, r983; +} +{ +cvt.rn.f16.f64 rs142, fd252; +} +mov.b32 r991, {rs142, rs142}; +{ +mul.f16x2 r989, r7, r991; +} +{ +add.f16x2 r992, r993, r989; +} +{ +cvt.rn.f16.f64 rs143, fd235; +} +mov.b32 r997, {rs143, rs143}; +{ +mul.f16x2 r995, r13, r997; +} +{ +add.f16x2 r998, r974, r995; +} +{ +cvt.rn.f16.f64 rs144, fd136; +} +mov.b32 r1003, {rs144, rs144}; +{ +mul.f16x2 r1001, r22, r1003; +} +{ +add.f16x2 r1004, r980, r1001; +} +{ +cvt.rn.f16.f64 rs145, fd235; +} +mov.b32 r1009, 
{rs145, rs145}; +{ +mul.f16x2 r1007, r16, r1009; +} +{ +add.f16x2 r1010, r986, r1007; +} +{ +cvt.rn.f16.f64 rs146, fd136; +} +mov.b32 r1015, {rs146, rs146}; +{ +mul.f16x2 r1013, r19, r1015; +} +{ +add.f16x2 r1016, r992, r1013; +} +{ +cvt.rn.f16.f64 rs147, fd239; +} +mov.b32 r1021, {rs147, rs147}; +{ +mul.f16x2 r1019, r25, r1021; +} +{ +add.f16x2 r1022, r998, r1019; +} +{ +cvt.rn.f16.f64 rs148, fd240; +} +mov.b32 r1027, {rs148, rs148}; +{ +mul.f16x2 r1025, r34, r1027; +} +{ +add.f16x2 r1028, r1004, r1025; +} +{ +cvt.rn.f16.f64 rs149, fd239; +} +mov.b32 r1033, {rs149, rs149}; +{ +mul.f16x2 r1031, r28, r1033; +} +{ +add.f16x2 r1034, r1010, r1031; +} +{ +cvt.rn.f16.f64 rs150, fd240; +} +mov.b32 r1039, {rs150, rs150}; +{ +mul.f16x2 r1037, r31, r1039; +} +{ +add.f16x2 r1040, r1016, r1037; +} +{ +cvt.rn.f16.f64 rs151, fd247; +} +mov.b32 r1045, {rs151, rs151}; +{ +mul.f16x2 r1043, r37, r1045; +} +{ +add.f16x2 r1046, r1022, r1043; +} +{ +cvt.rn.f16.f64 rs152, fd144; +} +mov.b32 r1051, {rs152, rs152}; +{ +mul.f16x2 r1049, r46, r1051; +} +{ +add.f16x2 r1052, r1028, r1049; +} +{ +cvt.rn.f16.f64 rs153, fd247; +} +mov.b32 r1057, {rs153, rs153}; +{ +mul.f16x2 r1055, r40, r1057; +} +{ +add.f16x2 r1058, r1034, r1055; +} +{ +cvt.rn.f16.f64 rs154, fd144; +} +mov.b32 r1063, {rs154, rs154}; +{ +mul.f16x2 r1061, r43, r1063; +} +{ +add.f16x2 r1064, r1040, r1061; +} +{ +cvt.rn.f16.f64 rs155, fd227; +} +mov.b32 r1069, {rs155, rs155}; +{ +mul.f16x2 r1067, r49, r1069; +} +{ +add.f16x2 r1070, r1046, r1067; +} +{ +cvt.rn.f16.f64 rs156, fd228; +} +mov.b32 r1075, {rs156, rs156}; +{ +mul.f16x2 r1073, r58, r1075; +} +{ +add.f16x2 r1076, r1052, r1073; +} +{ +cvt.rn.f16.f64 rs157, fd227; +} +mov.b32 r1081, {rs157, rs157}; +{ +mul.f16x2 r1079, r52, r1081; +} +{ +add.f16x2 r1082, r1058, r1079; +} +{ +cvt.rn.f16.f64 rs158, fd228; +} +mov.b32 r1087, {rs158, rs158}; +{ +mul.f16x2 r1085, r55, r1087; +} +{ +add.f16x2 r1088, r1064, r1085; +} +{ +cvt.rn.f16.f64 rs159, fd255; +} +mov.b32 r1093, {rs159, 
rs159}; +{ +mul.f16x2 r1091, r61, r1093; +} +{ +add.f16x2 r1094, r1070, r1091; +} +mov.f64 fd256, 0d3FEFDD0DEB564B22; +{ +cvt.rn.f16.f64 rs160, fd256; +} +mov.b32 r1099, {rs160, rs160}; +{ +mul.f16x2 r1097, r70, r1099; +} +{ +add.f16x2 r1100, r1076, r1097; +} +{ +cvt.rn.f16.f64 rs161, fd255; +} +mov.b32 r1105, {rs161, rs161}; +{ +mul.f16x2 r1103, r64, r1105; +} +{ +add.f16x2 r1106, r1082, r1103; +} +{ +cvt.rn.f16.f64 rs162, fd256; +} +mov.b32 r1111, {rs162, rs162}; +{ +mul.f16x2 r1109, r67, r1111; +} +{ +add.f16x2 r1112, r1088, r1109; +} +{ +cvt.rn.f16.f64 rs163, fd231; +} +mov.b32 r1117, {rs163, rs163}; +{ +mul.f16x2 r1115, r73, r1117; +} +{ +add.f16x2 r1118, r1094, r1115; +} +{ +cvt.rn.f16.f64 rs164, fd212; +} +mov.b32 r1123, {rs164, rs164}; +{ +mul.f16x2 r1121, r82, r1123; +} +{ +add.f16x2 r1124, r1100, r1121; +} +{ +cvt.rn.f16.f64 rs165, fd231; +} +mov.b32 r1129, {rs165, rs165}; +{ +mul.f16x2 r1127, r76, r1129; +} +{ +add.f16x2 r1130, r1106, r1127; +} +{ +cvt.rn.f16.f64 rs166, fd212; +} +mov.b32 r1135, {rs166, rs166}; +{ +mul.f16x2 r1133, r79, r1135; +} +{ +add.f16x2 r1136, r1112, r1133; +} +{ +cvt.rn.f16.f64 rs167, fd243; +} +mov.b32 r1141, {rs167, rs167}; +{ +mul.f16x2 r1139, r85, r1141; +} +{ +add.f16x2 r1142, r1118, r1139; +} +{ +cvt.rn.f16.f64 rs168, fd244; +} +mov.b32 r1147, {rs168, rs168}; +{ +mul.f16x2 r1145, r94, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs169, fd243; +} +mov.b32 r1153, {rs169, rs169}; +{ +mul.f16x2 r1151, r88, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs170, fd244; +} +mov.b32 r1159, {rs170, rs170}; +{ +mul.f16x2 r1157, r91, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +sub.f16x2 %10, r1142, r1148; +} +{ +add.f16x2 %11, r1154, r1160; +} +{ +add.f16x2 %24, r1142, r1148; +} +{ +sub.f16x2 %25, r1154, r1160; +} +cvt.rn.f16.s32 rs171, r1588; +mov.b32 r1187, {rs171, rs171}; +cvt.rn.f16.s32 rs172, r1588; +mov.b32 r1199, {rs172, rs172}; +{ +cvt.rn.f16.f64 rs173, fd243; +} +mov.b32 r1179, 
{rs173, rs173}; +{ +mul.f16x2 r1177, r1, r1179; +} +{ +add.f16x2 r1180, %34, r1177; +} +{ +cvt.rn.f16.f64 rs174, fd244; +} +mov.b32 r1185, {rs174, rs174}; +{ +mul.f16x2 r1183, r10, r1185; +} +{ +add.f16x2 r1186, r1187, r1183; +} +{ +cvt.rn.f16.f64 rs175, fd243; +} +mov.b32 r1191, {rs175, rs175}; +{ +mul.f16x2 r1189, r4, r1191; +} +{ +add.f16x2 r1192, %35, r1189; +} +{ +cvt.rn.f16.f64 rs176, fd244; +} +mov.b32 r1197, {rs176, rs176}; +{ +mul.f16x2 r1195, r7, r1197; +} +{ +add.f16x2 r1198, r1199, r1195; +} +{ +cvt.rn.f16.f64 rs177, fd251; +} +mov.b32 r1203, {rs177, rs177}; +{ +mul.f16x2 r1201, r13, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs178, fd168; +} +mov.b32 r1209, {rs178, rs178}; +{ +mul.f16x2 r1207, r22, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs179, fd251; +} +mov.b32 r1215, {rs179, rs179}; +{ +mul.f16x2 r1213, r16, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +{ +cvt.rn.f16.f64 rs180, fd168; +} +mov.b32 r1221, {rs180, rs180}; +{ +mul.f16x2 r1219, r19, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} +{ +cvt.rn.f16.f64 rs181, fd231; +} +mov.b32 r1227, {rs181, rs181}; +{ +mul.f16x2 r1225, r25, r1227; +} +{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs182, fd212; +} +mov.b32 r1233, {rs182, rs182}; +{ +mul.f16x2 r1231, r34, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs183, fd231; +} +mov.b32 r1239, {rs183, rs183}; +{ +mul.f16x2 r1237, r28, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs184, fd212; +} +mov.b32 r1245, {rs184, rs184}; +{ +mul.f16x2 r1243, r31, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs185, fd235; +} +mov.b32 r1251, {rs185, rs185}; +{ +mul.f16x2 r1249, r37, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs186, fd236; +} +mov.b32 r1257, {rs186, rs186}; +{ +mul.f16x2 r1255, r46, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs187, fd235; +} +mov.b32 r1263, {rs187, rs187}; +{ 
+mul.f16x2 r1261, r40, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 rs188, fd236; +} +mov.b32 r1269, {rs188, rs188}; +{ +mul.f16x2 r1267, r43, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs189, fd255; +} +mov.b32 r1275, {rs189, rs189}; +{ +mul.f16x2 r1273, r49, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs190, fd256; +} +mov.b32 r1281, {rs190, rs190}; +{ +mul.f16x2 r1279, r58, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs191, fd255; +} +mov.b32 r1287, {rs191, rs191}; +{ +mul.f16x2 r1285, r52, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs192, fd256; +} +mov.b32 r1293, {rs192, rs192}; +{ +mul.f16x2 r1291, r55, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs193, fd239; +} +mov.b32 r1299, {rs193, rs193}; +{ +mul.f16x2 r1297, r61, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs194, fd184; +} +mov.b32 r1305, {rs194, rs194}; +{ +mul.f16x2 r1303, r70, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs195, fd239; +} +mov.b32 r1311, {rs195, rs195}; +{ +mul.f16x2 r1309, r64, r1311; +} +{ +add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs196, fd184; +} +mov.b32 r1317, {rs196, rs196}; +{ +mul.f16x2 r1315, r67, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs197, fd227; +} +mov.b32 r1323, {rs197, rs197}; +{ +mul.f16x2 r1321, r73, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs198, fd228; +} +mov.b32 r1329, {rs198, rs198}; +{ +mul.f16x2 r1327, r82, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs199, fd227; +} +mov.b32 r1335, {rs199, rs199}; +{ +mul.f16x2 r1333, r76, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs200, fd228; +} +mov.b32 r1341, {rs200, rs200}; +{ +mul.f16x2 r1339, r79, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs201, fd247; +} +mov.b32 r1347, {rs201, rs201}; +{ +mul.f16x2 
r1345, r85, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs202, fd248; +} +mov.b32 r1353, {rs202, rs202}; +{ +mul.f16x2 r1351, r94, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs203, fd247; +} +mov.b32 r1359, {rs203, rs203}; +{ +mul.f16x2 r1357, r88, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs204, fd248; +} +mov.b32 r1365, {rs204, rs204}; +{ +mul.f16x2 r1363, r91, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +sub.f16x2 %12, r1348, r1354; +} +{ +add.f16x2 %13, r1360, r1366; +} +{ +add.f16x2 %22, r1348, r1354; +} +{ +sub.f16x2 %23, r1360, r1366; +} +cvt.rn.f16.s32 rs205, r1588; +mov.b32 r1393, {rs205, rs205}; +cvt.rn.f16.s32 rs206, r1588; +mov.b32 r1405, {rs206, rs206}; +{ +cvt.rn.f16.f64 rs207, fd235; +} +mov.b32 r1385, {rs207, rs207}; +{ +mul.f16x2 r1383, r1, r1385; +} +{ +add.f16x2 r1386, %34, r1383; +} +{ +cvt.rn.f16.f64 rs208, fd236; +} +mov.b32 r1391, {rs208, rs208}; +{ +mul.f16x2 r1389, r10, r1391; +} +{ +add.f16x2 r1392, r1393, r1389; +} +{ +cvt.rn.f16.f64 rs209, fd235; +} +mov.b32 r1397, {rs209, rs209}; +{ +mul.f16x2 r1395, r4, r1397; +} +{ +add.f16x2 r1398, %35, r1395; +} +{ +cvt.rn.f16.f64 rs210, fd236; +} +mov.b32 r1403, {rs210, rs210}; +{ +mul.f16x2 r1401, r7, r1403; +} +{ +add.f16x2 r1404, r1405, r1401; +} +{ +cvt.rn.f16.f64 rs211, fd247; +} +mov.b32 r1409, {rs211, rs211}; +{ +mul.f16x2 r1407, r13, r1409; +} +{ +add.f16x2 r1410, r1386, r1407; +} +{ +cvt.rn.f16.f64 rs212, fd248; +} +mov.b32 r1415, {rs212, rs212}; +{ +mul.f16x2 r1413, r22, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs213, fd247; +} +mov.b32 r1421, {rs213, rs213}; +{ +mul.f16x2 r1419, r16, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs214, fd248; +} +mov.b32 r1427, {rs214, rs214}; +{ +mul.f16x2 r1425, r19, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs215, fd255; +} +mov.b32 r1433, {rs215, rs215}; +{ +mul.f16x2 r1431, r25, r1433; +} +{ 
+add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs216, fd204; +} +mov.b32 r1439, {rs216, rs216}; +{ +mul.f16x2 r1437, r34, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs217, fd255; +} +mov.b32 r1445, {rs217, rs217}; +{ +mul.f16x2 r1443, r28, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs218, fd204; +} +mov.b32 r1451, {rs218, rs218}; +{ +mul.f16x2 r1449, r31, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs219, fd243; +} +mov.b32 r1457, {rs219, rs219}; +{ +mul.f16x2 r1455, r37, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs220, fd208; +} +mov.b32 r1463, {rs220, rs220}; +{ +mul.f16x2 r1461, r46, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs221, fd243; +} +mov.b32 r1469, {rs221, rs221}; +{ +mul.f16x2 r1467, r40, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs222, fd208; +} +mov.b32 r1475, {rs222, rs222}; +{ +mul.f16x2 r1473, r43, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs223, fd231; +} +mov.b32 r1481, {rs223, rs223}; +{ +mul.f16x2 r1479, r49, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs224, fd212; +} +mov.b32 r1487, {rs224, rs224}; +{ +mul.f16x2 r1485, r58, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs225, fd231; +} +mov.b32 r1493, {rs225, rs225}; +{ +mul.f16x2 r1491, r52, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs226, fd212; +} +mov.b32 r1499, {rs226, rs226}; +{ +mul.f16x2 r1497, r55, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs227, fd227; +} +mov.b32 r1505, {rs227, rs227}; +{ +mul.f16x2 r1503, r61, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs228, fd228; +} +mov.b32 r1511, {rs228, rs228}; +{ +mul.f16x2 r1509, r70, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs229, fd227; +} +mov.b32 r1517, {rs229, rs229}; +{ +mul.f16x2 r1515, r64, r1517; +} +{ +add.f16x2 
r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs230, fd228; +} +mov.b32 r1523, {rs230, rs230}; +{ +mul.f16x2 r1521, r67, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs231, fd239; +} +mov.b32 r1529, {rs231, rs231}; +{ +mul.f16x2 r1527, r73, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs232, fd240; +} +mov.b32 r1535, {rs232, rs232}; +{ +mul.f16x2 r1533, r82, r1535; +} +{ +add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs233, fd239; +} +mov.b32 r1541, {rs233, rs233}; +{ +mul.f16x2 r1539, r76, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs234, fd240; +} +mov.b32 r1547, {rs234, rs234}; +{ +mul.f16x2 r1545, r79, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs235, fd251; +} +mov.b32 r1553, {rs235, rs235}; +{ +mul.f16x2 r1551, r85, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs236, fd252; +} +mov.b32 r1559, {rs236, rs236}; +{ +mul.f16x2 r1557, r94, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs237, fd251; +} +mov.b32 r1565, {rs237, rs237}; +{ +mul.f16x2 r1563, r88, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ +cvt.rn.f16.f64 rs238, fd252; +} +mov.b32 r1571, {rs238, rs238}; +{ +mul.f16x2 r1569, r91, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +{ +sub.f16x2 %14, r1554, r1560; +} +{ +add.f16x2 %15, r1566, r1572; +} +{ +add.f16x2 %20, r1554, r1560; +} +{ +sub.f16x2 %21, r1566, r1572; +} +cvt.rn.f16.s32 rs239, r1588; +mov.b32 r1599, {rs239, rs239}; +cvt.rn.f16.s32 rs240, r1588; +mov.b32 r1611, {rs240, rs240}; +{ +cvt.rn.f16.f64 rs241, fd227; +} +mov.b32 r1591, {rs241, rs241}; +{ +mul.f16x2 r1589, r1, r1591; +} +{ +add.f16x2 r1592, %34, r1589; +} +{ +cvt.rn.f16.f64 rs242, fd228; +} +mov.b32 r1597, {rs242, rs242}; +{ +mul.f16x2 r1595, r10, r1597; +} +{ +add.f16x2 r1598, r1599, r1595; +} +{ +cvt.rn.f16.f64 rs243, fd227; +} +mov.b32 r1603, {rs243, rs243}; +{ +mul.f16x2 r1601, r4, r1603; +} +{ +add.f16x2 r1604, %35, r1601; +} +{ 
+cvt.rn.f16.f64 rs244, fd228; +} +mov.b32 r1609, {rs244, rs244}; +{ +mul.f16x2 r1607, r7, r1609; +} +{ +add.f16x2 r1610, r1611, r1607; +} +{ +cvt.rn.f16.f64 rs245, fd231; +} +mov.b32 r1615, {rs245, rs245}; +{ +mul.f16x2 r1613, r13, r1615; +} +{ +add.f16x2 r1616, r1592, r1613; +} +{ +cvt.rn.f16.f64 rs246, fd232; +} +mov.b32 r1621, {rs246, rs246}; +{ +mul.f16x2 r1619, r22, r1621; +} +{ +add.f16x2 r1622, r1598, r1619; +} +{ +cvt.rn.f16.f64 rs247, fd231; +} +mov.b32 r1627, {rs247, rs247}; +{ +mul.f16x2 r1625, r16, r1627; +} +{ +add.f16x2 r1628, r1604, r1625; +} +{ +cvt.rn.f16.f64 rs248, fd232; +} +mov.b32 r1633, {rs248, rs248}; +{ +mul.f16x2 r1631, r19, r1633; +} +{ +add.f16x2 r1634, r1610, r1631; +} +{ +cvt.rn.f16.f64 rs249, fd235; +} +mov.b32 r1639, {rs249, rs249}; +{ +mul.f16x2 r1637, r25, r1639; +} +{ +add.f16x2 r1640, r1616, r1637; +} +{ +cvt.rn.f16.f64 rs250, fd236; +} +mov.b32 r1645, {rs250, rs250}; +{ +mul.f16x2 r1643, r34, r1645; +} +{ +add.f16x2 r1646, r1622, r1643; +} +{ +cvt.rn.f16.f64 rs251, fd235; +} +mov.b32 r1651, {rs251, rs251}; +{ +mul.f16x2 r1649, r28, r1651; +} +{ +add.f16x2 r1652, r1628, r1649; +} +{ +cvt.rn.f16.f64 rs252, fd236; +} +mov.b32 r1657, {rs252, rs252}; +{ +mul.f16x2 r1655, r31, r1657; +} +{ +add.f16x2 r1658, r1634, r1655; +} +{ +cvt.rn.f16.f64 rs253, fd239; +} +mov.b32 r1663, {rs253, rs253}; +{ +mul.f16x2 r1661, r37, r1663; +} +{ +add.f16x2 r1664, r1640, r1661; +} +{ +cvt.rn.f16.f64 rs254, fd240; +} +mov.b32 r1669, {rs254, rs254}; +{ +mul.f16x2 r1667, r46, r1669; +} +{ +add.f16x2 r1670, r1646, r1667; +} +{ +cvt.rn.f16.f64 rs255, fd239; +} +mov.b32 r1675, {rs255, rs255}; +{ +mul.f16x2 r1673, r40, r1675; +} +{ +add.f16x2 r1676, r1652, r1673; +} +{ +cvt.rn.f16.f64 rs256, fd240; +} +mov.b32 r1681, {rs256, rs256}; +{ +mul.f16x2 r1679, r43, r1681; +} +{ +add.f16x2 r1682, r1658, r1679; +} +{ +cvt.rn.f16.f64 rs257, fd243; +} +mov.b32 r1687, {rs257, rs257}; +{ +mul.f16x2 r1685, r49, r1687; +} +{ +add.f16x2 r1688, r1664, r1685; +} +{ 
+cvt.rn.f16.f64 rs258, fd244; +} +mov.b32 r1693, {rs258, rs258}; +{ +mul.f16x2 r1691, r58, r1693; +} +{ +add.f16x2 r1694, r1670, r1691; +} +{ +cvt.rn.f16.f64 rs259, fd243; +} +mov.b32 r1699, {rs259, rs259}; +{ +mul.f16x2 r1697, r52, r1699; +} +{ +add.f16x2 r1700, r1676, r1697; +} +{ +cvt.rn.f16.f64 rs260, fd244; +} +mov.b32 r1705, {rs260, rs260}; +{ +mul.f16x2 r1703, r55, r1705; +} +{ +add.f16x2 r1706, r1682, r1703; +} +{ +cvt.rn.f16.f64 rs261, fd247; +} +mov.b32 r1711, {rs261, rs261}; +{ +mul.f16x2 r1709, r61, r1711; +} +{ +add.f16x2 r1712, r1688, r1709; +} +{ +cvt.rn.f16.f64 rs262, fd248; +} +mov.b32 r1717, {rs262, rs262}; +{ +mul.f16x2 r1715, r70, r1717; +} +{ +add.f16x2 r1718, r1694, r1715; +} +{ +cvt.rn.f16.f64 rs263, fd247; +} +mov.b32 r1723, {rs263, rs263}; +{ +mul.f16x2 r1721, r64, r1723; +} +{ +add.f16x2 r1724, r1700, r1721; +} +{ +cvt.rn.f16.f64 rs264, fd248; +} +mov.b32 r1729, {rs264, rs264}; +{ +mul.f16x2 r1727, r67, r1729; +} +{ +add.f16x2 r1730, r1706, r1727; +} +{ +cvt.rn.f16.f64 rs265, fd251; +} +mov.b32 r1735, {rs265, rs265}; +{ +mul.f16x2 r1733, r73, r1735; +} +{ +add.f16x2 r1736, r1712, r1733; +} +{ +cvt.rn.f16.f64 rs266, fd252; +} +mov.b32 r1741, {rs266, rs266}; +{ +mul.f16x2 r1739, r82, r1741; +} +{ +add.f16x2 r1742, r1718, r1739; +} +{ +cvt.rn.f16.f64 rs267, fd251; +} +mov.b32 r1747, {rs267, rs267}; +{ +mul.f16x2 r1745, r76, r1747; +} +{ +add.f16x2 r1748, r1724, r1745; +} +{ +cvt.rn.f16.f64 rs268, fd252; +} +mov.b32 r1753, {rs268, rs268}; +{ +mul.f16x2 r1751, r79, r1753; +} +{ +add.f16x2 r1754, r1730, r1751; +} +{ +cvt.rn.f16.f64 rs269, fd255; +} +mov.b32 r1759, {rs269, rs269}; +{ +mul.f16x2 r1757, r85, r1759; +} +{ +add.f16x2 r1760, r1736, r1757; +} +{ +cvt.rn.f16.f64 rs270, fd256; +} +mov.b32 r1765, {rs270, rs270}; +{ +mul.f16x2 r1763, r94, r1765; +} +{ +add.f16x2 r1766, r1742, r1763; +} +{ +cvt.rn.f16.f64 rs271, fd255; +} +mov.b32 r1771, {rs271, rs271}; +{ +mul.f16x2 r1769, r88, r1771; +} +{ +add.f16x2 r1772, r1748, r1769; +} +{ 
+cvt.rn.f16.f64 rs272, fd256; +} +mov.b32 r1777, {rs272, rs272}; +{ +mul.f16x2 r1775, r91, r1777; +} +{ +add.f16x2 r1778, r1754, r1775; +} +{ +sub.f16x2 %16, r1760, r1766; +} +{ +add.f16x2 %17, r1772, r1778; +} +{ +add.f16x2 %18, r1760, r1766; +} +{ +sub.f16x2 %19, r1772, r1778; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..8b9d391d47be5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp16_inv.hpp.inc @@ -0,0 +1,2875 @@ +#ifndef CUFFTDX_FFT_17_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_17_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<951, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<273>; +.reg .b32 r<1793>; +.reg .f64 fd<257>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %36, %66; +} +{ +add.f16x2 r4, %37, %67; +} +{ +sub.f16x2 r7, %36, %66; +} +{ +sub.f16x2 r10, %37, %67; +} +{ +add.f16x2 r13, %38, %64; +} +{ +add.f16x2 r16, %39, %65; +} +{ +sub.f16x2 r19, %38, %64; +} +{ +sub.f16x2 r22, %39, %65; +} +{ +add.f16x2 r25, %40, %62; +} +{ +add.f16x2 r28, %41, %63; +} +{ +sub.f16x2 r31, %40, %62; +} +{ +sub.f16x2 r34, %41, %63; +} +{ +add.f16x2 r37, %42, %60; +} +{ +add.f16x2 r40, %43, %61; +} +{ +sub.f16x2 r43, %42, %60; +} +{ +sub.f16x2 r46, %43, %61; +} +{ +add.f16x2 r49, %44, %58; +} +{ +add.f16x2 r52, %45, %59; +} +{ +sub.f16x2 r55, %44, %58; +} +{ +sub.f16x2 r58, %45, %59; +} +{ +add.f16x2 r61, %46, %56; +} +{ +add.f16x2 r64, 
%47, %57; +} +{ +sub.f16x2 r67, %46, %56; +} +{ +sub.f16x2 r70, %47, %57; +} +{ +add.f16x2 r73, %48, %54; +} +{ +add.f16x2 r76, %49, %55; +} +{ +sub.f16x2 r79, %48, %54; +} +{ +sub.f16x2 r82, %49, %55; +} +{ +add.f16x2 r85, %50, %52; +} +{ +add.f16x2 r88, %51, %53; +} +{ +sub.f16x2 r91, %50, %52; +} +{ +sub.f16x2 r94, %51, %53; +} +{ +add.f16x2 r97, %34, r1; +} +{ +add.f16x2 r100, %35, r4; +} +{ +add.f16x2 r103, r97, r13; +} +{ +add.f16x2 r106, r100, r16; +} +{ +add.f16x2 r109, r103, r25; +} +{ +add.f16x2 r112, r106, r28; +} +{ +add.f16x2 r115, r109, r37; +} +{ +add.f16x2 r118, r112, r40; +} +{ +add.f16x2 r121, r115, r49; +} +{ +add.f16x2 r124, r118, r52; +} +{ +add.f16x2 r127, r121, r61; +} +{ +add.f16x2 r130, r124, r64; +} +{ +add.f16x2 r133, r127, r73; +} +{ +add.f16x2 r136, r130, r76; +} +{ +add.f16x2 %0, r133, r85; +} +{ +add.f16x2 %1, r136, r88; +} +mov.u32 r1588, 0; +cvt.rn.f16.s32 rs1, r1588; +mov.b32 r157, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1588; +mov.b32 r169, {rs2, rs2}; +mov.f64 fd231, 0d3FEDD6D000370991; +{ +cvt.rn.f16.f64 rs3, fd231; +} +mov.b32 r149, {rs3, rs3}; +{ +mul.f16x2 r147, r1, r149; +} +{ +add.f16x2 r150, %34, r147; +} +mov.f64 fd212, 0d3FD71E955D8E7CDC; +{ +cvt.rn.f16.f64 rs4, fd212; +} +mov.b32 r155, {rs4, rs4}; +{ +mul.f16x2 r153, r10, r155; +} +{ +add.f16x2 r156, r157, r153; +} +{ +cvt.rn.f16.f64 rs5, fd231; +} +mov.b32 r161, {rs5, rs5}; +{ +mul.f16x2 r159, r4, r161; +} +{ +add.f16x2 r162, %35, r159; +} +{ +cvt.rn.f16.f64 rs6, fd212; +} +mov.b32 r167, {rs6, rs6}; +{ +mul.f16x2 r165, r7, r167; +} +{ +add.f16x2 r168, r169, r165; +} +mov.f64 fd239, 0d3FE7A5F6075D4884; +{ +cvt.rn.f16.f64 rs7, fd239; +} +mov.b32 r173, {rs7, rs7}; +{ +mul.f16x2 r171, r13, r173; +} +{ +add.f16x2 r174, r150, r171; +} +mov.f64 fd184, 0d3FE58EEA2A9D6DA3; +{ +cvt.rn.f16.f64 rs8, fd184; +} +mov.b32 r179, {rs8, rs8}; +{ +mul.f16x2 r177, r22, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ +cvt.rn.f16.f64 rs9, fd239; +} +mov.b32 r185, {rs9, rs9}; +{ +mul.f16x2 r183, 
r16, r185; +} +{ +add.f16x2 r186, r162, r183; +} +{ +cvt.rn.f16.f64 rs10, fd184; +} +mov.b32 r191, {rs10, rs10}; +{ +mul.f16x2 r189, r19, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd247, 0d3FDC86FA2B2883CD; +{ +cvt.rn.f16.f64 rs11, fd247; +} +mov.b32 r197, {rs11, rs11}; +{ +mul.f16x2 r195, r25, r197; +} +{ +add.f16x2 r198, r174, r195; +} +mov.f64 fd144, 0d3FECA52D7C9E640B; +{ +cvt.rn.f16.f64 rs12, fd144; +} +mov.b32 r203, {rs12, rs12}; +{ +mul.f16x2 r201, r34, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs13, fd247; +} +mov.b32 r209, {rs13, rs13}; +{ +mul.f16x2 r207, r28, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +cvt.rn.f16.f64 rs14, fd144; +} +mov.b32 r215, {rs14, rs14}; +{ +mul.f16x2 r213, r31, r215; +} +{ +add.f16x2 r216, r192, r213; +} +mov.f64 fd255, 0d3FB79EE63259B75E; +{ +cvt.rn.f16.f64 rs15, fd255; +} +mov.b32 r221, {rs15, rs15}; +{ +mul.f16x2 r219, r37, r221; +} +{ +add.f16x2 r222, r198, r219; +} +mov.f64 fd204, 0d3FEFDD0DEB564B22; +{ +cvt.rn.f16.f64 rs16, fd204; +} +mov.b32 r227, {rs16, rs16}; +{ +mul.f16x2 r225, r46, r227; +} +{ +add.f16x2 r228, r204, r225; +} +{ +cvt.rn.f16.f64 rs17, fd255; +} +mov.b32 r233, {rs17, rs17}; +{ +mul.f16x2 r231, r40, r233; +} +{ +add.f16x2 r234, r210, r231; +} +{ +cvt.rn.f16.f64 rs18, fd204; +} +mov.b32 r239, {rs18, rs18}; +{ +mul.f16x2 r237, r43, r239; +} +{ +add.f16x2 r240, r216, r237; +} +mov.f64 fd251, 0dBFD183B1C61F0D01; +{ +cvt.rn.f16.f64 rs19, fd251; +} +mov.b32 r245, {rs19, rs19}; +{ +mul.f16x2 r243, r49, r245; +} +{ +add.f16x2 r246, r222, r243; +} +mov.f64 fd252, 0d3FEEC746923C349F; +{ +cvt.rn.f16.f64 rs20, fd252; +} +mov.b32 r251, {rs20, rs20}; +{ +mul.f16x2 r249, r58, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +cvt.rn.f16.f64 rs21, fd251; +} +mov.b32 r257, {rs21, rs21}; +{ +mul.f16x2 r255, r52, r257; +} +{ +add.f16x2 r258, r234, r255; +} +{ +cvt.rn.f16.f64 rs22, fd252; +} +mov.b32 r263, {rs22, rs22}; +{ +mul.f16x2 r261, r55, r263; +} +{ +add.f16x2 r264, r240, r261; +} 
+mov.f64 fd243, 0dBFE348C86ED5F1BB; +{ +cvt.rn.f16.f64 rs23, fd243; +} +mov.b32 r269, {rs23, rs23}; +{ +mul.f16x2 r267, r61, r269; +} +{ +add.f16x2 r270, r246, r267; +} +mov.f64 fd244, 0d3FE9895B6C9A05F6; +{ +cvt.rn.f16.f64 rs24, fd244; +} +mov.b32 r275, {rs24, rs24}; +{ +mul.f16x2 r273, r70, r275; +} +{ +add.f16x2 r276, r252, r273; +} +{ +cvt.rn.f16.f64 rs25, fd243; +} +mov.b32 r281, {rs25, rs25}; +{ +mul.f16x2 r279, r64, r281; +} +{ +add.f16x2 r282, r258, r279; +} +{ +cvt.rn.f16.f64 rs26, fd244; +} +mov.b32 r287, {rs26, rs26}; +{ +mul.f16x2 r285, r67, r287; +} +{ +add.f16x2 r288, r264, r285; +} +mov.f64 fd235, 0dBFEB34FA910EA3B9; +{ +cvt.rn.f16.f64 rs27, fd235; +} +mov.b32 r293, {rs27, rs27}; +{ +mul.f16x2 r291, r73, r293; +} +{ +add.f16x2 r294, r270, r291; +} +mov.f64 fd236, 0d3FE0D8884363DD80; +{ +cvt.rn.f16.f64 rs28, fd236; +} +mov.b32 r299, {rs28, rs28}; +{ +mul.f16x2 r297, r82, r299; +} +{ +add.f16x2 r300, r276, r297; +} +{ +cvt.rn.f16.f64 rs29, fd235; +} +mov.b32 r305, {rs29, rs29}; +{ +mul.f16x2 r303, r76, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs30, fd236; +} +mov.b32 r311, {rs30, rs30}; +{ +mul.f16x2 r309, r79, r311; +} +{ +add.f16x2 r312, r288, r309; +} +mov.f64 fd227, 0dBFEF7484007FAEF3; +{ +cvt.rn.f16.f64 rs31, fd227; +} +mov.b32 r317, {rs31, rs31}; +{ +mul.f16x2 r315, r85, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd228, 0d3FC7851AACD6C6B4; +{ +cvt.rn.f16.f64 rs32, fd228; +} +mov.b32 r323, {rs32, rs32}; +{ +mul.f16x2 r321, r94, r323; +} +{ +add.f16x2 r324, r300, r321; +} +{ +cvt.rn.f16.f64 rs33, fd227; +} +mov.b32 r329, {rs33, rs33}; +{ +mul.f16x2 r327, r88, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs34, fd228; +} +mov.b32 r335, {rs34, rs34}; +{ +mul.f16x2 r333, r91, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +sub.f16x2 %2, r318, r324; +} +{ +add.f16x2 %3, r330, r336; +} +{ +add.f16x2 %32, r318, r324; +} +{ +sub.f16x2 %33, r330, r336; +} +cvt.rn.f16.s32 rs35, r1588; +mov.b32 r363, {rs35, 
rs35}; +cvt.rn.f16.s32 rs36, r1588; +mov.b32 r375, {rs36, rs36}; +{ +cvt.rn.f16.f64 rs37, fd239; +} +mov.b32 r355, {rs37, rs37}; +{ +mul.f16x2 r353, r1, r355; +} +{ +add.f16x2 r356, %34, r353; +} +{ +cvt.rn.f16.f64 rs38, fd184; +} +mov.b32 r361, {rs38, rs38}; +{ +mul.f16x2 r359, r10, r361; +} +{ +add.f16x2 r362, r363, r359; +} +{ +cvt.rn.f16.f64 rs39, fd239; +} +mov.b32 r367, {rs39, rs39}; +{ +mul.f16x2 r365, r4, r367; +} +{ +add.f16x2 r368, %35, r365; +} +{ +cvt.rn.f16.f64 rs40, fd184; +} +mov.b32 r373, {rs40, rs40}; +{ +mul.f16x2 r371, r7, r373; +} +{ +add.f16x2 r374, r375, r371; +} +{ +cvt.rn.f16.f64 rs41, fd255; +} +mov.b32 r379, {rs41, rs41}; +{ +mul.f16x2 r377, r13, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs42, fd204; +} +mov.b32 r385, {rs42, rs42}; +{ +mul.f16x2 r383, r22, r385; +} +{ +add.f16x2 r386, r362, r383; +} +{ +cvt.rn.f16.f64 rs43, fd255; +} +mov.b32 r391, {rs43, rs43}; +{ +mul.f16x2 r389, r16, r391; +} +{ +add.f16x2 r392, r368, r389; +} +{ +cvt.rn.f16.f64 rs44, fd204; +} +mov.b32 r397, {rs44, rs44}; +{ +mul.f16x2 r395, r19, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs45, fd243; +} +mov.b32 r403, {rs45, rs45}; +{ +mul.f16x2 r401, r25, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs46, fd244; +} +mov.b32 r409, {rs46, rs46}; +{ +mul.f16x2 r407, r34, r409; +} +{ +add.f16x2 r410, r386, r407; +} +{ +cvt.rn.f16.f64 rs47, fd243; +} +mov.b32 r415, {rs47, rs47}; +{ +mul.f16x2 r413, r28, r415; +} +{ +add.f16x2 r416, r392, r413; +} +{ +cvt.rn.f16.f64 rs48, fd244; +} +mov.b32 r421, {rs48, rs48}; +{ +mul.f16x2 r419, r31, r421; +} +{ +add.f16x2 r422, r398, r419; +} +{ +cvt.rn.f16.f64 rs49, fd227; +} +mov.b32 r427, {rs49, rs49}; +{ +mul.f16x2 r425, r37, r427; +} +{ +add.f16x2 r428, r404, r425; +} +{ +cvt.rn.f16.f64 rs50, fd228; +} +mov.b32 r433, {rs50, rs50}; +{ +mul.f16x2 r431, r46, r433; +} +{ +add.f16x2 r434, r410, r431; +} +{ +cvt.rn.f16.f64 rs51, fd227; +} +mov.b32 r439, {rs51, rs51}; +{ +mul.f16x2 
r437, r40, r439; +} +{ +add.f16x2 r440, r416, r437; +} +{ +cvt.rn.f16.f64 rs52, fd228; +} +mov.b32 r445, {rs52, rs52}; +{ +mul.f16x2 r443, r43, r445; +} +{ +add.f16x2 r446, r422, r443; +} +{ +cvt.rn.f16.f64 rs53, fd235; +} +mov.b32 r451, {rs53, rs53}; +{ +mul.f16x2 r449, r49, r451; +} +{ +add.f16x2 r452, r428, r449; +} +mov.f64 fd136, 0dBFE0D8884363DD80; +{ +cvt.rn.f16.f64 rs54, fd136; +} +mov.b32 r457, {rs54, rs54}; +{ +mul.f16x2 r455, r58, r457; +} +{ +add.f16x2 r458, r434, r455; +} +{ +cvt.rn.f16.f64 rs55, fd235; +} +mov.b32 r463, {rs55, rs55}; +{ +mul.f16x2 r461, r52, r463; +} +{ +add.f16x2 r464, r440, r461; +} +{ +cvt.rn.f16.f64 rs56, fd136; +} +mov.b32 r469, {rs56, rs56}; +{ +mul.f16x2 r467, r55, r469; +} +{ +add.f16x2 r470, r446, r467; +} +{ +cvt.rn.f16.f64 rs57, fd251; +} +mov.b32 r475, {rs57, rs57}; +{ +mul.f16x2 r473, r61, r475; +} +{ +add.f16x2 r476, r452, r473; +} +mov.f64 fd168, 0dBFEEC746923C349F; +{ +cvt.rn.f16.f64 rs58, fd168; +} +mov.b32 r481, {rs58, rs58}; +{ +mul.f16x2 r479, r70, r481; +} +{ +add.f16x2 r482, r458, r479; +} +{ +cvt.rn.f16.f64 rs59, fd251; +} +mov.b32 r487, {rs59, rs59}; +{ +mul.f16x2 r485, r64, r487; +} +{ +add.f16x2 r488, r464, r485; +} +{ +cvt.rn.f16.f64 rs60, fd168; +} +mov.b32 r493, {rs60, rs60}; +{ +mul.f16x2 r491, r67, r493; +} +{ +add.f16x2 r494, r470, r491; +} +{ +cvt.rn.f16.f64 rs61, fd247; +} +mov.b32 r499, {rs61, rs61}; +{ +mul.f16x2 r497, r73, r499; +} +{ +add.f16x2 r500, r476, r497; +} +mov.f64 fd248, 0dBFECA52D7C9E640B; +{ +cvt.rn.f16.f64 rs62, fd248; +} +mov.b32 r505, {rs62, rs62}; +{ +mul.f16x2 r503, r82, r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs63, fd247; +} +mov.b32 r511, {rs63, rs63}; +{ +mul.f16x2 r509, r76, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs64, fd248; +} +mov.b32 r517, {rs64, rs64}; +{ +mul.f16x2 r515, r79, r517; +} +{ +add.f16x2 r518, r494, r515; +} +{ +cvt.rn.f16.f64 rs65, fd231; +} +mov.b32 r523, {rs65, rs65}; +{ +mul.f16x2 r521, r85, r523; +} +{ 
+add.f16x2 r524, r500, r521; +} +mov.f64 fd232, 0dBFD71E955D8E7CDC; +{ +cvt.rn.f16.f64 rs66, fd232; +} +mov.b32 r529, {rs66, rs66}; +{ +mul.f16x2 r527, r94, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs67, fd231; +} +mov.b32 r535, {rs67, rs67}; +{ +mul.f16x2 r533, r88, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs68, fd232; +} +mov.b32 r541, {rs68, rs68}; +{ +mul.f16x2 r539, r91, r541; +} +{ +add.f16x2 r542, r518, r539; +} +{ +sub.f16x2 %4, r524, r530; +} +{ +add.f16x2 %5, r536, r542; +} +{ +add.f16x2 %30, r524, r530; +} +{ +sub.f16x2 %31, r536, r542; +} +cvt.rn.f16.s32 rs69, r1588; +mov.b32 r569, {rs69, rs69}; +cvt.rn.f16.s32 rs70, r1588; +mov.b32 r581, {rs70, rs70}; +{ +cvt.rn.f16.f64 rs71, fd247; +} +mov.b32 r561, {rs71, rs71}; +{ +mul.f16x2 r559, r1, r561; +} +{ +add.f16x2 r562, %34, r559; +} +{ +cvt.rn.f16.f64 rs72, fd144; +} +mov.b32 r567, {rs72, rs72}; +{ +mul.f16x2 r565, r10, r567; +} +{ +add.f16x2 r568, r569, r565; +} +{ +cvt.rn.f16.f64 rs73, fd247; +} +mov.b32 r573, {rs73, rs73}; +{ +mul.f16x2 r571, r4, r573; +} +{ +add.f16x2 r574, %35, r571; +} +{ +cvt.rn.f16.f64 rs74, fd144; +} +mov.b32 r579, {rs74, rs74}; +{ +mul.f16x2 r577, r7, r579; +} +{ +add.f16x2 r580, r581, r577; +} +{ +cvt.rn.f16.f64 rs75, fd243; +} +mov.b32 r585, {rs75, rs75}; +{ +mul.f16x2 r583, r13, r585; +} +{ +add.f16x2 r586, r562, r583; +} +{ +cvt.rn.f16.f64 rs76, fd244; +} +mov.b32 r591, {rs76, rs76}; +{ +mul.f16x2 r589, r22, r591; +} +{ +add.f16x2 r592, r568, r589; +} +{ +cvt.rn.f16.f64 rs77, fd243; +} +mov.b32 r597, {rs77, rs77}; +{ +mul.f16x2 r595, r16, r597; +} +{ +add.f16x2 r598, r574, r595; +} +{ +cvt.rn.f16.f64 rs78, fd244; +} +mov.b32 r603, {rs78, rs78}; +{ +mul.f16x2 r601, r19, r603; +} +{ +add.f16x2 r604, r580, r601; +} +{ +cvt.rn.f16.f64 rs79, fd227; +} +mov.b32 r609, {rs79, rs79}; +{ +mul.f16x2 r607, r25, r609; +} +{ +add.f16x2 r610, r586, r607; +} +mov.f64 fd76, 0dBFC7851AACD6C6B4; +{ +cvt.rn.f16.f64 rs80, fd76; +} +mov.b32 r615, {rs80, 
rs80}; +{ +mul.f16x2 r613, r34, r615; +} +{ +add.f16x2 r616, r592, r613; +} +{ +cvt.rn.f16.f64 rs81, fd227; +} +mov.b32 r621, {rs81, rs81}; +{ +mul.f16x2 r619, r28, r621; +} +{ +add.f16x2 r622, r598, r619; +} +{ +cvt.rn.f16.f64 rs82, fd76; +} +mov.b32 r627, {rs82, rs82}; +{ +mul.f16x2 r625, r31, r627; +} +{ +add.f16x2 r628, r604, r625; +} +{ +cvt.rn.f16.f64 rs83, fd251; +} +mov.b32 r633, {rs83, rs83}; +{ +mul.f16x2 r631, r37, r633; +} +{ +add.f16x2 r634, r610, r631; +} +{ +cvt.rn.f16.f64 rs84, fd168; +} +mov.b32 r639, {rs84, rs84}; +{ +mul.f16x2 r637, r46, r639; +} +{ +add.f16x2 r640, r616, r637; +} +{ +cvt.rn.f16.f64 rs85, fd251; +} +mov.b32 r645, {rs85, rs85}; +{ +mul.f16x2 r643, r40, r645; +} +{ +add.f16x2 r646, r622, r643; +} +{ +cvt.rn.f16.f64 rs86, fd168; +} +mov.b32 r651, {rs86, rs86}; +{ +mul.f16x2 r649, r43, r651; +} +{ +add.f16x2 r652, r628, r649; +} +{ +cvt.rn.f16.f64 rs87, fd239; +} +mov.b32 r657, {rs87, rs87}; +{ +mul.f16x2 r655, r49, r657; +} +{ +add.f16x2 r658, r634, r655; +} +mov.f64 fd240, 0dBFE58EEA2A9D6DA3; +{ +cvt.rn.f16.f64 rs88, fd240; +} +mov.b32 r663, {rs88, rs88}; +{ +mul.f16x2 r661, r58, r663; +} +{ +add.f16x2 r664, r640, r661; +} +{ +cvt.rn.f16.f64 rs89, fd239; +} +mov.b32 r669, {rs89, rs89}; +{ +mul.f16x2 r667, r52, r669; +} +{ +add.f16x2 r670, r646, r667; +} +{ +cvt.rn.f16.f64 rs90, fd240; +} +mov.b32 r675, {rs90, rs90}; +{ +mul.f16x2 r673, r55, r675; +} +{ +add.f16x2 r676, r652, r673; +} +{ +cvt.rn.f16.f64 rs91, fd231; +} +mov.b32 r681, {rs91, rs91}; +{ +mul.f16x2 r679, r61, r681; +} +{ +add.f16x2 r682, r658, r679; +} +{ +cvt.rn.f16.f64 rs92, fd212; +} +mov.b32 r687, {rs92, rs92}; +{ +mul.f16x2 r685, r70, r687; +} +{ +add.f16x2 r688, r664, r685; +} +{ +cvt.rn.f16.f64 rs93, fd231; +} +mov.b32 r693, {rs93, rs93}; +{ +mul.f16x2 r691, r64, r693; +} +{ +add.f16x2 r694, r670, r691; +} +{ +cvt.rn.f16.f64 rs94, fd212; +} +mov.b32 r699, {rs94, rs94}; +{ +mul.f16x2 r697, r67, r699; +} +{ +add.f16x2 r700, r676, r697; +} +{ +cvt.rn.f16.f64 rs95, 
fd255; +} +mov.b32 r705, {rs95, rs95}; +{ +mul.f16x2 r703, r73, r705; +} +{ +add.f16x2 r706, r682, r703; +} +{ +cvt.rn.f16.f64 rs96, fd204; +} +mov.b32 r711, {rs96, rs96}; +{ +mul.f16x2 r709, r82, r711; +} +{ +add.f16x2 r712, r688, r709; +} +{ +cvt.rn.f16.f64 rs97, fd255; +} +mov.b32 r717, {rs97, rs97}; +{ +mul.f16x2 r715, r76, r717; +} +{ +add.f16x2 r718, r694, r715; +} +{ +cvt.rn.f16.f64 rs98, fd204; +} +mov.b32 r723, {rs98, rs98}; +{ +mul.f16x2 r721, r79, r723; +} +{ +add.f16x2 r724, r700, r721; +} +{ +cvt.rn.f16.f64 rs99, fd235; +} +mov.b32 r729, {rs99, rs99}; +{ +mul.f16x2 r727, r85, r729; +} +{ +add.f16x2 r730, r706, r727; +} +{ +cvt.rn.f16.f64 rs100, fd236; +} +mov.b32 r735, {rs100, rs100}; +{ +mul.f16x2 r733, r94, r735; +} +{ +add.f16x2 r736, r712, r733; +} +{ +cvt.rn.f16.f64 rs101, fd235; +} +mov.b32 r741, {rs101, rs101}; +{ +mul.f16x2 r739, r88, r741; +} +{ +add.f16x2 r742, r718, r739; +} +{ +cvt.rn.f16.f64 rs102, fd236; +} +mov.b32 r747, {rs102, rs102}; +{ +mul.f16x2 r745, r91, r747; +} +{ +add.f16x2 r748, r724, r745; +} +{ +sub.f16x2 %6, r730, r736; +} +{ +add.f16x2 %7, r742, r748; +} +{ +add.f16x2 %28, r730, r736; +} +{ +sub.f16x2 %29, r742, r748; +} +cvt.rn.f16.s32 rs103, r1588; +mov.b32 r775, {rs103, rs103}; +cvt.rn.f16.s32 rs104, r1588; +mov.b32 r787, {rs104, rs104}; +{ +cvt.rn.f16.f64 rs105, fd255; +} +mov.b32 r767, {rs105, rs105}; +{ +mul.f16x2 r765, r1, r767; +} +{ +add.f16x2 r768, %34, r765; +} +{ +cvt.rn.f16.f64 rs106, fd204; +} +mov.b32 r773, {rs106, rs106}; +{ +mul.f16x2 r771, r10, r773; +} +{ +add.f16x2 r774, r775, r771; +} +{ +cvt.rn.f16.f64 rs107, fd255; +} +mov.b32 r779, {rs107, rs107}; +{ +mul.f16x2 r777, r4, r779; +} +{ +add.f16x2 r780, %35, r777; +} +{ +cvt.rn.f16.f64 rs108, fd204; +} +mov.b32 r785, {rs108, rs108}; +{ +mul.f16x2 r783, r7, r785; +} +{ +add.f16x2 r786, r787, r783; +} +{ +cvt.rn.f16.f64 rs109, fd227; +} +mov.b32 r791, {rs109, rs109}; +{ +mul.f16x2 r789, r13, r791; +} +{ +add.f16x2 r792, r768, r789; +} +{ +cvt.rn.f16.f64 
rs110, fd228; +} +mov.b32 r797, {rs110, rs110}; +{ +mul.f16x2 r795, r22, r797; +} +{ +add.f16x2 r798, r774, r795; +} +{ +cvt.rn.f16.f64 rs111, fd227; +} +mov.b32 r803, {rs111, rs111}; +{ +mul.f16x2 r801, r16, r803; +} +{ +add.f16x2 r804, r780, r801; +} +{ +cvt.rn.f16.f64 rs112, fd228; +} +mov.b32 r809, {rs112, rs112}; +{ +mul.f16x2 r807, r19, r809; +} +{ +add.f16x2 r810, r786, r807; +} +{ +cvt.rn.f16.f64 rs113, fd251; +} +mov.b32 r815, {rs113, rs113}; +{ +mul.f16x2 r813, r25, r815; +} +{ +add.f16x2 r816, r792, r813; +} +{ +cvt.rn.f16.f64 rs114, fd168; +} +mov.b32 r821, {rs114, rs114}; +{ +mul.f16x2 r819, r34, r821; +} +{ +add.f16x2 r822, r798, r819; +} +{ +cvt.rn.f16.f64 rs115, fd251; +} +mov.b32 r827, {rs115, rs115}; +{ +mul.f16x2 r825, r28, r827; +} +{ +add.f16x2 r828, r804, r825; +} +{ +cvt.rn.f16.f64 rs116, fd168; +} +mov.b32 r833, {rs116, rs116}; +{ +mul.f16x2 r831, r31, r833; +} +{ +add.f16x2 r834, r810, r831; +} +{ +cvt.rn.f16.f64 rs117, fd231; +} +mov.b32 r839, {rs117, rs117}; +{ +mul.f16x2 r837, r37, r839; +} +{ +add.f16x2 r840, r816, r837; +} +{ +cvt.rn.f16.f64 rs118, fd232; +} +mov.b32 r845, {rs118, rs118}; +{ +mul.f16x2 r843, r46, r845; +} +{ +add.f16x2 r846, r822, r843; +} +{ +cvt.rn.f16.f64 rs119, fd231; +} +mov.b32 r851, {rs119, rs119}; +{ +mul.f16x2 r849, r40, r851; +} +{ +add.f16x2 r852, r828, r849; +} +{ +cvt.rn.f16.f64 rs120, fd232; +} +mov.b32 r857, {rs120, rs120}; +{ +mul.f16x2 r855, r43, r857; +} +{ +add.f16x2 r858, r834, r855; +} +{ +cvt.rn.f16.f64 rs121, fd247; +} +mov.b32 r863, {rs121, rs121}; +{ +mul.f16x2 r861, r49, r863; +} +{ +add.f16x2 r864, r840, r861; +} +{ +cvt.rn.f16.f64 rs122, fd144; +} +mov.b32 r869, {rs122, rs122}; +{ +mul.f16x2 r867, r58, r869; +} +{ +add.f16x2 r870, r846, r867; +} +{ +cvt.rn.f16.f64 rs123, fd247; +} +mov.b32 r875, {rs123, rs123}; +{ +mul.f16x2 r873, r52, r875; +} +{ +add.f16x2 r876, r852, r873; +} +{ +cvt.rn.f16.f64 rs124, fd144; +} +mov.b32 r881, {rs124, rs124}; +{ +mul.f16x2 r879, r55, r881; +} +{ +add.f16x2 
r882, r858, r879; +} +{ +cvt.rn.f16.f64 rs125, fd235; +} +mov.b32 r887, {rs125, rs125}; +{ +mul.f16x2 r885, r61, r887; +} +{ +add.f16x2 r888, r864, r885; +} +{ +cvt.rn.f16.f64 rs126, fd236; +} +mov.b32 r893, {rs126, rs126}; +{ +mul.f16x2 r891, r70, r893; +} +{ +add.f16x2 r894, r870, r891; +} +{ +cvt.rn.f16.f64 rs127, fd235; +} +mov.b32 r899, {rs127, rs127}; +{ +mul.f16x2 r897, r64, r899; +} +{ +add.f16x2 r900, r876, r897; +} +{ +cvt.rn.f16.f64 rs128, fd236; +} +mov.b32 r905, {rs128, rs128}; +{ +mul.f16x2 r903, r67, r905; +} +{ +add.f16x2 r906, r882, r903; +} +{ +cvt.rn.f16.f64 rs129, fd243; +} +mov.b32 r911, {rs129, rs129}; +{ +mul.f16x2 r909, r73, r911; +} +{ +add.f16x2 r912, r888, r909; +} +mov.f64 fd208, 0dBFE9895B6C9A05F6; +{ +cvt.rn.f16.f64 rs130, fd208; +} +mov.b32 r917, {rs130, rs130}; +{ +mul.f16x2 r915, r82, r917; +} +{ +add.f16x2 r918, r894, r915; +} +{ +cvt.rn.f16.f64 rs131, fd243; +} +mov.b32 r923, {rs131, rs131}; +{ +mul.f16x2 r921, r76, r923; +} +{ +add.f16x2 r924, r900, r921; +} +{ +cvt.rn.f16.f64 rs132, fd208; +} +mov.b32 r929, {rs132, rs132}; +{ +mul.f16x2 r927, r79, r929; +} +{ +add.f16x2 r930, r906, r927; +} +{ +cvt.rn.f16.f64 rs133, fd239; +} +mov.b32 r935, {rs133, rs133}; +{ +mul.f16x2 r933, r85, r935; +} +{ +add.f16x2 r936, r912, r933; +} +{ +cvt.rn.f16.f64 rs134, fd240; +} +mov.b32 r941, {rs134, rs134}; +{ +mul.f16x2 r939, r94, r941; +} +{ +add.f16x2 r942, r918, r939; +} +{ +cvt.rn.f16.f64 rs135, fd239; +} +mov.b32 r947, {rs135, rs135}; +{ +mul.f16x2 r945, r88, r947; +} +{ +add.f16x2 r948, r924, r945; +} +{ +cvt.rn.f16.f64 rs136, fd240; +} +mov.b32 r953, {rs136, rs136}; +{ +mul.f16x2 r951, r91, r953; +} +{ +add.f16x2 r954, r930, r951; +} +{ +sub.f16x2 %8, r936, r942; +} +{ +add.f16x2 %9, r948, r954; +} +{ +add.f16x2 %26, r936, r942; +} +{ +sub.f16x2 %27, r948, r954; +} +cvt.rn.f16.s32 rs137, r1588; +mov.b32 r981, {rs137, rs137}; +cvt.rn.f16.s32 rs138, r1588; +mov.b32 r993, {rs138, rs138}; +{ +cvt.rn.f16.f64 rs139, fd251; +} +mov.b32 r973, 
{rs139, rs139}; +{ +mul.f16x2 r971, r1, r973; +} +{ +add.f16x2 r974, %34, r971; +} +{ +cvt.rn.f16.f64 rs140, fd252; +} +mov.b32 r979, {rs140, rs140}; +{ +mul.f16x2 r977, r10, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +cvt.rn.f16.f64 rs141, fd251; +} +mov.b32 r985, {rs141, rs141}; +{ +mul.f16x2 r983, r4, r985; +} +{ +add.f16x2 r986, %35, r983; +} +{ +cvt.rn.f16.f64 rs142, fd252; +} +mov.b32 r991, {rs142, rs142}; +{ +mul.f16x2 r989, r7, r991; +} +{ +add.f16x2 r992, r993, r989; +} +{ +cvt.rn.f16.f64 rs143, fd235; +} +mov.b32 r997, {rs143, rs143}; +{ +mul.f16x2 r995, r13, r997; +} +{ +add.f16x2 r998, r974, r995; +} +{ +cvt.rn.f16.f64 rs144, fd136; +} +mov.b32 r1003, {rs144, rs144}; +{ +mul.f16x2 r1001, r22, r1003; +} +{ +add.f16x2 r1004, r980, r1001; +} +{ +cvt.rn.f16.f64 rs145, fd235; +} +mov.b32 r1009, {rs145, rs145}; +{ +mul.f16x2 r1007, r16, r1009; +} +{ +add.f16x2 r1010, r986, r1007; +} +{ +cvt.rn.f16.f64 rs146, fd136; +} +mov.b32 r1015, {rs146, rs146}; +{ +mul.f16x2 r1013, r19, r1015; +} +{ +add.f16x2 r1016, r992, r1013; +} +{ +cvt.rn.f16.f64 rs147, fd239; +} +mov.b32 r1021, {rs147, rs147}; +{ +mul.f16x2 r1019, r25, r1021; +} +{ +add.f16x2 r1022, r998, r1019; +} +{ +cvt.rn.f16.f64 rs148, fd240; +} +mov.b32 r1027, {rs148, rs148}; +{ +mul.f16x2 r1025, r34, r1027; +} +{ +add.f16x2 r1028, r1004, r1025; +} +{ +cvt.rn.f16.f64 rs149, fd239; +} +mov.b32 r1033, {rs149, rs149}; +{ +mul.f16x2 r1031, r28, r1033; +} +{ +add.f16x2 r1034, r1010, r1031; +} +{ +cvt.rn.f16.f64 rs150, fd240; +} +mov.b32 r1039, {rs150, rs150}; +{ +mul.f16x2 r1037, r31, r1039; +} +{ +add.f16x2 r1040, r1016, r1037; +} +{ +cvt.rn.f16.f64 rs151, fd247; +} +mov.b32 r1045, {rs151, rs151}; +{ +mul.f16x2 r1043, r37, r1045; +} +{ +add.f16x2 r1046, r1022, r1043; +} +{ +cvt.rn.f16.f64 rs152, fd144; +} +mov.b32 r1051, {rs152, rs152}; +{ +mul.f16x2 r1049, r46, r1051; +} +{ +add.f16x2 r1052, r1028, r1049; +} +{ +cvt.rn.f16.f64 rs153, fd247; +} +mov.b32 r1057, {rs153, rs153}; +{ +mul.f16x2 r1055, r40, r1057; 
+} +{ +add.f16x2 r1058, r1034, r1055; +} +{ +cvt.rn.f16.f64 rs154, fd144; +} +mov.b32 r1063, {rs154, rs154}; +{ +mul.f16x2 r1061, r43, r1063; +} +{ +add.f16x2 r1064, r1040, r1061; +} +{ +cvt.rn.f16.f64 rs155, fd227; +} +mov.b32 r1069, {rs155, rs155}; +{ +mul.f16x2 r1067, r49, r1069; +} +{ +add.f16x2 r1070, r1046, r1067; +} +{ +cvt.rn.f16.f64 rs156, fd228; +} +mov.b32 r1075, {rs156, rs156}; +{ +mul.f16x2 r1073, r58, r1075; +} +{ +add.f16x2 r1076, r1052, r1073; +} +{ +cvt.rn.f16.f64 rs157, fd227; +} +mov.b32 r1081, {rs157, rs157}; +{ +mul.f16x2 r1079, r52, r1081; +} +{ +add.f16x2 r1082, r1058, r1079; +} +{ +cvt.rn.f16.f64 rs158, fd228; +} +mov.b32 r1087, {rs158, rs158}; +{ +mul.f16x2 r1085, r55, r1087; +} +{ +add.f16x2 r1088, r1064, r1085; +} +{ +cvt.rn.f16.f64 rs159, fd255; +} +mov.b32 r1093, {rs159, rs159}; +{ +mul.f16x2 r1091, r61, r1093; +} +{ +add.f16x2 r1094, r1070, r1091; +} +mov.f64 fd256, 0dBFEFDD0DEB564B22; +{ +cvt.rn.f16.f64 rs160, fd256; +} +mov.b32 r1099, {rs160, rs160}; +{ +mul.f16x2 r1097, r70, r1099; +} +{ +add.f16x2 r1100, r1076, r1097; +} +{ +cvt.rn.f16.f64 rs161, fd255; +} +mov.b32 r1105, {rs161, rs161}; +{ +mul.f16x2 r1103, r64, r1105; +} +{ +add.f16x2 r1106, r1082, r1103; +} +{ +cvt.rn.f16.f64 rs162, fd256; +} +mov.b32 r1111, {rs162, rs162}; +{ +mul.f16x2 r1109, r67, r1111; +} +{ +add.f16x2 r1112, r1088, r1109; +} +{ +cvt.rn.f16.f64 rs163, fd231; +} +mov.b32 r1117, {rs163, rs163}; +{ +mul.f16x2 r1115, r73, r1117; +} +{ +add.f16x2 r1118, r1094, r1115; +} +{ +cvt.rn.f16.f64 rs164, fd212; +} +mov.b32 r1123, {rs164, rs164}; +{ +mul.f16x2 r1121, r82, r1123; +} +{ +add.f16x2 r1124, r1100, r1121; +} +{ +cvt.rn.f16.f64 rs165, fd231; +} +mov.b32 r1129, {rs165, rs165}; +{ +mul.f16x2 r1127, r76, r1129; +} +{ +add.f16x2 r1130, r1106, r1127; +} +{ +cvt.rn.f16.f64 rs166, fd212; +} +mov.b32 r1135, {rs166, rs166}; +{ +mul.f16x2 r1133, r79, r1135; +} +{ +add.f16x2 r1136, r1112, r1133; +} +{ +cvt.rn.f16.f64 rs167, fd243; +} +mov.b32 r1141, {rs167, rs167}; +{ 
+mul.f16x2 r1139, r85, r1141; +} +{ +add.f16x2 r1142, r1118, r1139; +} +{ +cvt.rn.f16.f64 rs168, fd244; +} +mov.b32 r1147, {rs168, rs168}; +{ +mul.f16x2 r1145, r94, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs169, fd243; +} +mov.b32 r1153, {rs169, rs169}; +{ +mul.f16x2 r1151, r88, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs170, fd244; +} +mov.b32 r1159, {rs170, rs170}; +{ +mul.f16x2 r1157, r91, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +sub.f16x2 %10, r1142, r1148; +} +{ +add.f16x2 %11, r1154, r1160; +} +{ +add.f16x2 %24, r1142, r1148; +} +{ +sub.f16x2 %25, r1154, r1160; +} +cvt.rn.f16.s32 rs171, r1588; +mov.b32 r1187, {rs171, rs171}; +cvt.rn.f16.s32 rs172, r1588; +mov.b32 r1199, {rs172, rs172}; +{ +cvt.rn.f16.f64 rs173, fd243; +} +mov.b32 r1179, {rs173, rs173}; +{ +mul.f16x2 r1177, r1, r1179; +} +{ +add.f16x2 r1180, %34, r1177; +} +{ +cvt.rn.f16.f64 rs174, fd244; +} +mov.b32 r1185, {rs174, rs174}; +{ +mul.f16x2 r1183, r10, r1185; +} +{ +add.f16x2 r1186, r1187, r1183; +} +{ +cvt.rn.f16.f64 rs175, fd243; +} +mov.b32 r1191, {rs175, rs175}; +{ +mul.f16x2 r1189, r4, r1191; +} +{ +add.f16x2 r1192, %35, r1189; +} +{ +cvt.rn.f16.f64 rs176, fd244; +} +mov.b32 r1197, {rs176, rs176}; +{ +mul.f16x2 r1195, r7, r1197; +} +{ +add.f16x2 r1198, r1199, r1195; +} +{ +cvt.rn.f16.f64 rs177, fd251; +} +mov.b32 r1203, {rs177, rs177}; +{ +mul.f16x2 r1201, r13, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs178, fd168; +} +mov.b32 r1209, {rs178, rs178}; +{ +mul.f16x2 r1207, r22, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs179, fd251; +} +mov.b32 r1215, {rs179, rs179}; +{ +mul.f16x2 r1213, r16, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +{ +cvt.rn.f16.f64 rs180, fd168; +} +mov.b32 r1221, {rs180, rs180}; +{ +mul.f16x2 r1219, r19, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} +{ +cvt.rn.f16.f64 rs181, fd231; +} +mov.b32 r1227, {rs181, rs181}; +{ +mul.f16x2 r1225, r25, r1227; +} 
+{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs182, fd212; +} +mov.b32 r1233, {rs182, rs182}; +{ +mul.f16x2 r1231, r34, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs183, fd231; +} +mov.b32 r1239, {rs183, rs183}; +{ +mul.f16x2 r1237, r28, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs184, fd212; +} +mov.b32 r1245, {rs184, rs184}; +{ +mul.f16x2 r1243, r31, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs185, fd235; +} +mov.b32 r1251, {rs185, rs185}; +{ +mul.f16x2 r1249, r37, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs186, fd236; +} +mov.b32 r1257, {rs186, rs186}; +{ +mul.f16x2 r1255, r46, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs187, fd235; +} +mov.b32 r1263, {rs187, rs187}; +{ +mul.f16x2 r1261, r40, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 rs188, fd236; +} +mov.b32 r1269, {rs188, rs188}; +{ +mul.f16x2 r1267, r43, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs189, fd255; +} +mov.b32 r1275, {rs189, rs189}; +{ +mul.f16x2 r1273, r49, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs190, fd256; +} +mov.b32 r1281, {rs190, rs190}; +{ +mul.f16x2 r1279, r58, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs191, fd255; +} +mov.b32 r1287, {rs191, rs191}; +{ +mul.f16x2 r1285, r52, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs192, fd256; +} +mov.b32 r1293, {rs192, rs192}; +{ +mul.f16x2 r1291, r55, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs193, fd239; +} +mov.b32 r1299, {rs193, rs193}; +{ +mul.f16x2 r1297, r61, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs194, fd184; +} +mov.b32 r1305, {rs194, rs194}; +{ +mul.f16x2 r1303, r70, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs195, fd239; +} +mov.b32 r1311, {rs195, rs195}; +{ +mul.f16x2 r1309, r64, r1311; +} +{ 
+add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs196, fd184; +} +mov.b32 r1317, {rs196, rs196}; +{ +mul.f16x2 r1315, r67, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs197, fd227; +} +mov.b32 r1323, {rs197, rs197}; +{ +mul.f16x2 r1321, r73, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs198, fd228; +} +mov.b32 r1329, {rs198, rs198}; +{ +mul.f16x2 r1327, r82, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs199, fd227; +} +mov.b32 r1335, {rs199, rs199}; +{ +mul.f16x2 r1333, r76, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs200, fd228; +} +mov.b32 r1341, {rs200, rs200}; +{ +mul.f16x2 r1339, r79, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs201, fd247; +} +mov.b32 r1347, {rs201, rs201}; +{ +mul.f16x2 r1345, r85, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs202, fd248; +} +mov.b32 r1353, {rs202, rs202}; +{ +mul.f16x2 r1351, r94, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs203, fd247; +} +mov.b32 r1359, {rs203, rs203}; +{ +mul.f16x2 r1357, r88, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs204, fd248; +} +mov.b32 r1365, {rs204, rs204}; +{ +mul.f16x2 r1363, r91, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +sub.f16x2 %12, r1348, r1354; +} +{ +add.f16x2 %13, r1360, r1366; +} +{ +add.f16x2 %22, r1348, r1354; +} +{ +sub.f16x2 %23, r1360, r1366; +} +cvt.rn.f16.s32 rs205, r1588; +mov.b32 r1393, {rs205, rs205}; +cvt.rn.f16.s32 rs206, r1588; +mov.b32 r1405, {rs206, rs206}; +{ +cvt.rn.f16.f64 rs207, fd235; +} +mov.b32 r1385, {rs207, rs207}; +{ +mul.f16x2 r1383, r1, r1385; +} +{ +add.f16x2 r1386, %34, r1383; +} +{ +cvt.rn.f16.f64 rs208, fd236; +} +mov.b32 r1391, {rs208, rs208}; +{ +mul.f16x2 r1389, r10, r1391; +} +{ +add.f16x2 r1392, r1393, r1389; +} +{ +cvt.rn.f16.f64 rs209, fd235; +} +mov.b32 r1397, {rs209, rs209}; +{ +mul.f16x2 r1395, r4, r1397; +} +{ +add.f16x2 r1398, %35, r1395; 
+} +{ +cvt.rn.f16.f64 rs210, fd236; +} +mov.b32 r1403, {rs210, rs210}; +{ +mul.f16x2 r1401, r7, r1403; +} +{ +add.f16x2 r1404, r1405, r1401; +} +{ +cvt.rn.f16.f64 rs211, fd247; +} +mov.b32 r1409, {rs211, rs211}; +{ +mul.f16x2 r1407, r13, r1409; +} +{ +add.f16x2 r1410, r1386, r1407; +} +{ +cvt.rn.f16.f64 rs212, fd248; +} +mov.b32 r1415, {rs212, rs212}; +{ +mul.f16x2 r1413, r22, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs213, fd247; +} +mov.b32 r1421, {rs213, rs213}; +{ +mul.f16x2 r1419, r16, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs214, fd248; +} +mov.b32 r1427, {rs214, rs214}; +{ +mul.f16x2 r1425, r19, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs215, fd255; +} +mov.b32 r1433, {rs215, rs215}; +{ +mul.f16x2 r1431, r25, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs216, fd204; +} +mov.b32 r1439, {rs216, rs216}; +{ +mul.f16x2 r1437, r34, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs217, fd255; +} +mov.b32 r1445, {rs217, rs217}; +{ +mul.f16x2 r1443, r28, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs218, fd204; +} +mov.b32 r1451, {rs218, rs218}; +{ +mul.f16x2 r1449, r31, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs219, fd243; +} +mov.b32 r1457, {rs219, rs219}; +{ +mul.f16x2 r1455, r37, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs220, fd208; +} +mov.b32 r1463, {rs220, rs220}; +{ +mul.f16x2 r1461, r46, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs221, fd243; +} +mov.b32 r1469, {rs221, rs221}; +{ +mul.f16x2 r1467, r40, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs222, fd208; +} +mov.b32 r1475, {rs222, rs222}; +{ +mul.f16x2 r1473, r43, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs223, fd231; +} +mov.b32 r1481, {rs223, rs223}; +{ +mul.f16x2 r1479, r49, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ 
+cvt.rn.f16.f64 rs224, fd212; +} +mov.b32 r1487, {rs224, rs224}; +{ +mul.f16x2 r1485, r58, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs225, fd231; +} +mov.b32 r1493, {rs225, rs225}; +{ +mul.f16x2 r1491, r52, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs226, fd212; +} +mov.b32 r1499, {rs226, rs226}; +{ +mul.f16x2 r1497, r55, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs227, fd227; +} +mov.b32 r1505, {rs227, rs227}; +{ +mul.f16x2 r1503, r61, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs228, fd228; +} +mov.b32 r1511, {rs228, rs228}; +{ +mul.f16x2 r1509, r70, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs229, fd227; +} +mov.b32 r1517, {rs229, rs229}; +{ +mul.f16x2 r1515, r64, r1517; +} +{ +add.f16x2 r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs230, fd228; +} +mov.b32 r1523, {rs230, rs230}; +{ +mul.f16x2 r1521, r67, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs231, fd239; +} +mov.b32 r1529, {rs231, rs231}; +{ +mul.f16x2 r1527, r73, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs232, fd240; +} +mov.b32 r1535, {rs232, rs232}; +{ +mul.f16x2 r1533, r82, r1535; +} +{ +add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs233, fd239; +} +mov.b32 r1541, {rs233, rs233}; +{ +mul.f16x2 r1539, r76, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs234, fd240; +} +mov.b32 r1547, {rs234, rs234}; +{ +mul.f16x2 r1545, r79, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs235, fd251; +} +mov.b32 r1553, {rs235, rs235}; +{ +mul.f16x2 r1551, r85, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs236, fd252; +} +mov.b32 r1559, {rs236, rs236}; +{ +mul.f16x2 r1557, r94, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs237, fd251; +} +mov.b32 r1565, {rs237, rs237}; +{ +mul.f16x2 r1563, r88, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ 
+cvt.rn.f16.f64 rs238, fd252; +} +mov.b32 r1571, {rs238, rs238}; +{ +mul.f16x2 r1569, r91, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +{ +sub.f16x2 %14, r1554, r1560; +} +{ +add.f16x2 %15, r1566, r1572; +} +{ +add.f16x2 %20, r1554, r1560; +} +{ +sub.f16x2 %21, r1566, r1572; +} +cvt.rn.f16.s32 rs239, r1588; +mov.b32 r1599, {rs239, rs239}; +cvt.rn.f16.s32 rs240, r1588; +mov.b32 r1611, {rs240, rs240}; +{ +cvt.rn.f16.f64 rs241, fd227; +} +mov.b32 r1591, {rs241, rs241}; +{ +mul.f16x2 r1589, r1, r1591; +} +{ +add.f16x2 r1592, %34, r1589; +} +{ +cvt.rn.f16.f64 rs242, fd228; +} +mov.b32 r1597, {rs242, rs242}; +{ +mul.f16x2 r1595, r10, r1597; +} +{ +add.f16x2 r1598, r1599, r1595; +} +{ +cvt.rn.f16.f64 rs243, fd227; +} +mov.b32 r1603, {rs243, rs243}; +{ +mul.f16x2 r1601, r4, r1603; +} +{ +add.f16x2 r1604, %35, r1601; +} +{ +cvt.rn.f16.f64 rs244, fd228; +} +mov.b32 r1609, {rs244, rs244}; +{ +mul.f16x2 r1607, r7, r1609; +} +{ +add.f16x2 r1610, r1611, r1607; +} +{ +cvt.rn.f16.f64 rs245, fd231; +} +mov.b32 r1615, {rs245, rs245}; +{ +mul.f16x2 r1613, r13, r1615; +} +{ +add.f16x2 r1616, r1592, r1613; +} +{ +cvt.rn.f16.f64 rs246, fd232; +} +mov.b32 r1621, {rs246, rs246}; +{ +mul.f16x2 r1619, r22, r1621; +} +{ +add.f16x2 r1622, r1598, r1619; +} +{ +cvt.rn.f16.f64 rs247, fd231; +} +mov.b32 r1627, {rs247, rs247}; +{ +mul.f16x2 r1625, r16, r1627; +} +{ +add.f16x2 r1628, r1604, r1625; +} +{ +cvt.rn.f16.f64 rs248, fd232; +} +mov.b32 r1633, {rs248, rs248}; +{ +mul.f16x2 r1631, r19, r1633; +} +{ +add.f16x2 r1634, r1610, r1631; +} +{ +cvt.rn.f16.f64 rs249, fd235; +} +mov.b32 r1639, {rs249, rs249}; +{ +mul.f16x2 r1637, r25, r1639; +} +{ +add.f16x2 r1640, r1616, r1637; +} +{ +cvt.rn.f16.f64 rs250, fd236; +} +mov.b32 r1645, {rs250, rs250}; +{ +mul.f16x2 r1643, r34, r1645; +} +{ +add.f16x2 r1646, r1622, r1643; +} +{ +cvt.rn.f16.f64 rs251, fd235; +} +mov.b32 r1651, {rs251, rs251}; +{ +mul.f16x2 r1649, r28, r1651; +} +{ +add.f16x2 r1652, r1628, r1649; +} +{ +cvt.rn.f16.f64 rs252, fd236; +} 
+mov.b32 r1657, {rs252, rs252}; +{ +mul.f16x2 r1655, r31, r1657; +} +{ +add.f16x2 r1658, r1634, r1655; +} +{ +cvt.rn.f16.f64 rs253, fd239; +} +mov.b32 r1663, {rs253, rs253}; +{ +mul.f16x2 r1661, r37, r1663; +} +{ +add.f16x2 r1664, r1640, r1661; +} +{ +cvt.rn.f16.f64 rs254, fd240; +} +mov.b32 r1669, {rs254, rs254}; +{ +mul.f16x2 r1667, r46, r1669; +} +{ +add.f16x2 r1670, r1646, r1667; +} +{ +cvt.rn.f16.f64 rs255, fd239; +} +mov.b32 r1675, {rs255, rs255}; +{ +mul.f16x2 r1673, r40, r1675; +} +{ +add.f16x2 r1676, r1652, r1673; +} +{ +cvt.rn.f16.f64 rs256, fd240; +} +mov.b32 r1681, {rs256, rs256}; +{ +mul.f16x2 r1679, r43, r1681; +} +{ +add.f16x2 r1682, r1658, r1679; +} +{ +cvt.rn.f16.f64 rs257, fd243; +} +mov.b32 r1687, {rs257, rs257}; +{ +mul.f16x2 r1685, r49, r1687; +} +{ +add.f16x2 r1688, r1664, r1685; +} +{ +cvt.rn.f16.f64 rs258, fd244; +} +mov.b32 r1693, {rs258, rs258}; +{ +mul.f16x2 r1691, r58, r1693; +} +{ +add.f16x2 r1694, r1670, r1691; +} +{ +cvt.rn.f16.f64 rs259, fd243; +} +mov.b32 r1699, {rs259, rs259}; +{ +mul.f16x2 r1697, r52, r1699; +} +{ +add.f16x2 r1700, r1676, r1697; +} +{ +cvt.rn.f16.f64 rs260, fd244; +} +mov.b32 r1705, {rs260, rs260}; +{ +mul.f16x2 r1703, r55, r1705; +} +{ +add.f16x2 r1706, r1682, r1703; +} +{ +cvt.rn.f16.f64 rs261, fd247; +} +mov.b32 r1711, {rs261, rs261}; +{ +mul.f16x2 r1709, r61, r1711; +} +{ +add.f16x2 r1712, r1688, r1709; +} +{ +cvt.rn.f16.f64 rs262, fd248; +} +mov.b32 r1717, {rs262, rs262}; +{ +mul.f16x2 r1715, r70, r1717; +} +{ +add.f16x2 r1718, r1694, r1715; +} +{ +cvt.rn.f16.f64 rs263, fd247; +} +mov.b32 r1723, {rs263, rs263}; +{ +mul.f16x2 r1721, r64, r1723; +} +{ +add.f16x2 r1724, r1700, r1721; +} +{ +cvt.rn.f16.f64 rs264, fd248; +} +mov.b32 r1729, {rs264, rs264}; +{ +mul.f16x2 r1727, r67, r1729; +} +{ +add.f16x2 r1730, r1706, r1727; +} +{ +cvt.rn.f16.f64 rs265, fd251; +} +mov.b32 r1735, {rs265, rs265}; +{ +mul.f16x2 r1733, r73, r1735; +} +{ +add.f16x2 r1736, r1712, r1733; +} +{ +cvt.rn.f16.f64 rs266, fd252; +} +mov.b32 
r1741, {rs266, rs266}; +{ +mul.f16x2 r1739, r82, r1741; +} +{ +add.f16x2 r1742, r1718, r1739; +} +{ +cvt.rn.f16.f64 rs267, fd251; +} +mov.b32 r1747, {rs267, rs267}; +{ +mul.f16x2 r1745, r76, r1747; +} +{ +add.f16x2 r1748, r1724, r1745; +} +{ +cvt.rn.f16.f64 rs268, fd252; +} +mov.b32 r1753, {rs268, rs268}; +{ +mul.f16x2 r1751, r79, r1753; +} +{ +add.f16x2 r1754, r1730, r1751; +} +{ +cvt.rn.f16.f64 rs269, fd255; +} +mov.b32 r1759, {rs269, rs269}; +{ +mul.f16x2 r1757, r85, r1759; +} +{ +add.f16x2 r1760, r1736, r1757; +} +{ +cvt.rn.f16.f64 rs270, fd256; +} +mov.b32 r1765, {rs270, rs270}; +{ +mul.f16x2 r1763, r94, r1765; +} +{ +add.f16x2 r1766, r1742, r1763; +} +{ +cvt.rn.f16.f64 rs271, fd255; +} +mov.b32 r1771, {rs271, rs271}; +{ +mul.f16x2 r1769, r88, r1771; +} +{ +add.f16x2 r1772, r1748, r1769; +} +{ +cvt.rn.f16.f64 rs272, fd256; +} +mov.b32 r1777, {rs272, rs272}; +{ +mul.f16x2 r1775, r91, r1777; +} +{ +add.f16x2 r1778, r1754, r1775; +} +{ +sub.f16x2 %16, r1760, r1766; +} +{ +add.f16x2 %17, r1772, r1778; +} +{ +add.f16x2 %18, r1760, r1766; +} +{ +sub.f16x2 %19, r1772, r1778; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), 
"=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..78d148103cf22 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_fwd.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_17_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_17_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<3, float, 1>(cufftdx::detail::complex 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<405>; +.reg .b64 rd<2>; +add.f32 f69, %36, %76; +add.f32 f70, %38, %77; +sub.f32 f71, %36, %76; +sub.f32 f72, %38, %77; +add.f32 f73, %39, %74; +add.f32 f74, %41, %75; +sub.f32 f75, %39, %74; +sub.f32 f76, %41, %75; +add.f32 f77, %42, %71; +add.f32 f78, %43, %73; +sub.f32 f79, %42, %71; +sub.f32 f80, %43, %73; +add.f32 f81, %44, %68; +add.f32 f82, %46, %70; +sub.f32 f83, %44, %68; +sub.f32 f84, %46, %70; +add.f32 f85, %47, %66; +add.f32 f86, %49, %67; +sub.f32 f87, %47, %66; +sub.f32 f88, %49, %67; +add.f32 f89, %50, %63; +add.f32 f90, %51, %65; +sub.f32 f91, %50, %63; +sub.f32 f92, %51, %65; +add.f32 f93, %52, %60; +add.f32 f94, %54, %62; +sub.f32 f95, %52, %60; +sub.f32 f96, %54, %62; +add.f32 f97, %55, %58; +add.f32 f98, %57, %59; +sub.f32 f99, %55, %58; +sub.f32 f100, %57, %59; +add.f32 f101, %34, f69; +add.f32 f102, %35, f70; +add.f32 f103, f101, f73; +add.f32 f104, f102, f74; +add.f32 f105, f103, f77; +add.f32 f106, f104, f78; +add.f32 f107, f105, f81; +add.f32 f108, f106, f82; +add.f32 f109, f107, f85; +add.f32 f110, f108, f86; +add.f32 f111, f109, f89; +add.f32 f112, f110, f90; +add.f32 f113, f111, f93; +add.f32 f114, f112, f94; +fma.rn.f32 f115, f69, 0f3F6EB680, %34; +fma.rn.f32 f116, f72, 0fBEB8F4AB, 0f00000000; +fma.rn.f32 f117, f70, 0f3F6EB680, %35; +fma.rn.f32 f118, f71, 0fBEB8F4AB, 0f00000000; +fma.rn.f32 f119, f73, 0f3F3D2FB0, f115; +fma.rn.f32 f120, f76, 0fBF2C7751, f116; +fma.rn.f32 f121, f74, 0f3F3D2FB0, f117; +fma.rn.f32 f122, f75, 0fBF2C7751, f118; +fma.rn.f32 f123, f77, 0f3EE437D1, f119; +fma.rn.f32 f124, f80, 0fBF65296C, f120; +fma.rn.f32 f125, f78, 0f3EE437D1, f121; +fma.rn.f32 f126, f79, 0fBF65296C, f122; +fma.rn.f32 f127, f81, 0f3DBCF732, f123; +fma.rn.f32 f128, f84, 0fBF7EE86F, f124; +fma.rn.f32 f129, f82, 0f3DBCF732, f125; +fma.rn.f32 f130, f83, 0fBF7EE86F, f126; +fma.rn.f32 f131, f85, 0fBE8C1D8E, f127; +fma.rn.f32 f132, f88, 0fBF763A35, f128; +fma.rn.f32 f133, f86, 0fBE8C1D8E, 
f129; +fma.rn.f32 f134, f87, 0fBF763A35, f130; +fma.rn.f32 f135, f89, 0fBF1A4643, f131; +fma.rn.f32 f136, f92, 0fBF4C4ADB, f132; +fma.rn.f32 f137, f90, 0fBF1A4643, f133; +fma.rn.f32 f138, f91, 0fBF4C4ADB, f134; +fma.rn.f32 f139, f93, 0fBF59A7D5, f135; +fma.rn.f32 f140, f96, 0fBF06C442, f136; +fma.rn.f32 f141, f94, 0fBF59A7D5, f137; +fma.rn.f32 f142, f95, 0fBF06C442, f138; +fma.rn.f32 f143, f97, 0fBF7BA420, f139; +fma.rn.f32 f144, f100, 0fBE3C28D5, f140; +fma.rn.f32 f145, f98, 0fBF7BA420, f141; +fma.rn.f32 f146, f99, 0fBE3C28D5, f142; +fma.rn.f32 f147, f69, 0f3F3D2FB0, %34; +fma.rn.f32 f148, f72, 0fBF2C7751, 0f00000000; +fma.rn.f32 f149, f70, 0f3F3D2FB0, %35; +fma.rn.f32 f150, f71, 0fBF2C7751, 0f00000000; +fma.rn.f32 f151, f73, 0f3DBCF732, f147; +fma.rn.f32 f152, f76, 0fBF7EE86F, f148; +fma.rn.f32 f153, f74, 0f3DBCF732, f149; +fma.rn.f32 f154, f75, 0fBF7EE86F, f150; +fma.rn.f32 f155, f77, 0fBF1A4643, f151; +fma.rn.f32 f156, f80, 0fBF4C4ADB, f152; +fma.rn.f32 f157, f78, 0fBF1A4643, f153; +fma.rn.f32 f158, f79, 0fBF4C4ADB, f154; +fma.rn.f32 f159, f81, 0fBF7BA420, f155; +fma.rn.f32 f160, f84, 0fBE3C28D5, f156; +fma.rn.f32 f161, f82, 0fBF7BA420, f157; +fma.rn.f32 f162, f83, 0fBE3C28D5, f158; +fma.rn.f32 f163, f85, 0fBF59A7D5, f159; +fma.rn.f32 f164, f88, 0f3F06C442, f160; +fma.rn.f32 f165, f86, 0fBF59A7D5, f161; +fma.rn.f32 f166, f87, 0f3F06C442, f162; +fma.rn.f32 f167, f89, 0fBE8C1D8E, f163; +fma.rn.f32 f168, f92, 0f3F763A35, f164; +fma.rn.f32 f169, f90, 0fBE8C1D8E, f165; +fma.rn.f32 f170, f91, 0f3F763A35, f166; +fma.rn.f32 f171, f93, 0f3EE437D1, f167; +fma.rn.f32 f172, f96, 0f3F65296C, f168; +fma.rn.f32 f173, f94, 0f3EE437D1, f169; +fma.rn.f32 f174, f95, 0f3F65296C, f170; +fma.rn.f32 f175, f97, 0f3F6EB680, f171; +fma.rn.f32 f176, f100, 0f3EB8F4AB, f172; +fma.rn.f32 f177, f98, 0f3F6EB680, f173; +fma.rn.f32 f178, f99, 0f3EB8F4AB, f174; +fma.rn.f32 f179, f69, 0f3EE437D1, %34; +fma.rn.f32 f180, f72, 0fBF65296C, 0f00000000; +fma.rn.f32 f181, f70, 0f3EE437D1, %35; 
+fma.rn.f32 f182, f71, 0fBF65296C, 0f00000000; +fma.rn.f32 f183, f73, 0fBF1A4643, f179; +fma.rn.f32 f184, f76, 0fBF4C4ADB, f180; +fma.rn.f32 f185, f74, 0fBF1A4643, f181; +fma.rn.f32 f186, f75, 0fBF4C4ADB, f182; +fma.rn.f32 f187, f77, 0fBF7BA420, f183; +fma.rn.f32 f188, f80, 0f3E3C28D5, f184; +fma.rn.f32 f189, f78, 0fBF7BA420, f185; +fma.rn.f32 f190, f79, 0f3E3C28D5, f186; +fma.rn.f32 f191, f81, 0fBE8C1D8E, f187; +fma.rn.f32 f192, f84, 0f3F763A35, f188; +fma.rn.f32 f193, f82, 0fBE8C1D8E, f189; +fma.rn.f32 f194, f83, 0f3F763A35, f190; +fma.rn.f32 f195, f85, 0f3F3D2FB0, f191; +fma.rn.f32 f196, f88, 0f3F2C7751, f192; +fma.rn.f32 f197, f86, 0f3F3D2FB0, f193; +fma.rn.f32 f198, f87, 0f3F2C7751, f194; +fma.rn.f32 f199, f89, 0f3F6EB680, f195; +fma.rn.f32 f200, f92, 0fBEB8F4AB, f196; +fma.rn.f32 f201, f90, 0f3F6EB680, f197; +fma.rn.f32 f202, f91, 0fBEB8F4AB, f198; +fma.rn.f32 f203, f93, 0f3DBCF732, f199; +fma.rn.f32 f204, f96, 0fBF7EE86F, f200; +fma.rn.f32 f205, f94, 0f3DBCF732, f201; +fma.rn.f32 f206, f95, 0fBF7EE86F, f202; +fma.rn.f32 f207, f97, 0fBF59A7D5, f203; +fma.rn.f32 f208, f100, 0fBF06C442, f204; +fma.rn.f32 f209, f98, 0fBF59A7D5, f205; +fma.rn.f32 f210, f99, 0fBF06C442, f206; +fma.rn.f32 f211, f69, 0f3DBCF732, %34; +fma.rn.f32 f212, f72, 0fBF7EE86F, 0f00000000; +fma.rn.f32 f213, f70, 0f3DBCF732, %35; +fma.rn.f32 f214, f71, 0fBF7EE86F, 0f00000000; +fma.rn.f32 f215, f73, 0fBF7BA420, f211; +fma.rn.f32 f216, f76, 0fBE3C28D5, f212; +fma.rn.f32 f217, f74, 0fBF7BA420, f213; +fma.rn.f32 f218, f75, 0fBE3C28D5, f214; +fma.rn.f32 f219, f77, 0fBE8C1D8E, f215; +fma.rn.f32 f220, f80, 0f3F763A35, f216; +fma.rn.f32 f221, f78, 0fBE8C1D8E, f217; +fma.rn.f32 f222, f79, 0f3F763A35, f218; +fma.rn.f32 f223, f81, 0f3F6EB680, f219; +fma.rn.f32 f224, f84, 0f3EB8F4AB, f220; +fma.rn.f32 f225, f82, 0f3F6EB680, f221; +fma.rn.f32 f226, f83, 0f3EB8F4AB, f222; +fma.rn.f32 f227, f85, 0f3EE437D1, f223; +fma.rn.f32 f228, f88, 0fBF65296C, f224; +fma.rn.f32 f229, f86, 0f3EE437D1, f225; +fma.rn.f32 
f230, f87, 0fBF65296C, f226; +fma.rn.f32 f231, f89, 0fBF59A7D5, f227; +fma.rn.f32 f232, f92, 0fBF06C442, f228; +fma.rn.f32 f233, f90, 0fBF59A7D5, f229; +fma.rn.f32 f234, f91, 0fBF06C442, f230; +fma.rn.f32 f235, f93, 0fBF1A4643, f231; +fma.rn.f32 f236, f96, 0f3F4C4ADB, f232; +fma.rn.f32 f237, f94, 0fBF1A4643, f233; +fma.rn.f32 f238, f95, 0f3F4C4ADB, f234; +fma.rn.f32 f239, f97, 0f3F3D2FB0, f235; +fma.rn.f32 f240, f100, 0f3F2C7751, f236; +fma.rn.f32 f241, f98, 0f3F3D2FB0, f237; +fma.rn.f32 f242, f99, 0f3F2C7751, f238; +fma.rn.f32 f243, f69, 0fBE8C1D8E, %34; +fma.rn.f32 f244, f72, 0fBF763A35, 0f00000000; +fma.rn.f32 f245, f70, 0fBE8C1D8E, %35; +fma.rn.f32 f246, f71, 0fBF763A35, 0f00000000; +fma.rn.f32 f247, f73, 0fBF59A7D5, f243; +fma.rn.f32 f248, f76, 0f3F06C442, f244; +fma.rn.f32 f249, f74, 0fBF59A7D5, f245; +fma.rn.f32 f250, f75, 0f3F06C442, f246; +fma.rn.f32 f251, f77, 0f3F3D2FB0, f247; +fma.rn.f32 f252, f80, 0f3F2C7751, f248; +fma.rn.f32 f253, f78, 0f3F3D2FB0, f249; +fma.rn.f32 f254, f79, 0f3F2C7751, f250; +fma.rn.f32 f255, f81, 0f3EE437D1, f251; +fma.rn.f32 f256, f84, 0fBF65296C, f252; +fma.rn.f32 f257, f82, 0f3EE437D1, f253; +fma.rn.f32 f258, f83, 0fBF65296C, f254; +fma.rn.f32 f259, f85, 0fBF7BA420, f255; +fma.rn.f32 f260, f88, 0fBE3C28D5, f256; +fma.rn.f32 f261, f86, 0fBF7BA420, f257; +fma.rn.f32 f262, f87, 0fBE3C28D5, f258; +fma.rn.f32 f263, f89, 0f3DBCF732, f259; +fma.rn.f32 f264, f92, 0f3F7EE86F, f260; +fma.rn.f32 f265, f90, 0f3DBCF732, f261; +fma.rn.f32 f266, f91, 0f3F7EE86F, f262; +fma.rn.f32 f267, f93, 0f3F6EB680, f263; +fma.rn.f32 f268, f96, 0fBEB8F4AB, f264; +fma.rn.f32 f269, f94, 0f3F6EB680, f265; +fma.rn.f32 f270, f95, 0fBEB8F4AB, f266; +fma.rn.f32 f271, f97, 0fBF1A4643, f267; +fma.rn.f32 f272, f100, 0fBF4C4ADB, f268; +fma.rn.f32 f273, f98, 0fBF1A4643, f269; +fma.rn.f32 f274, f99, 0fBF4C4ADB, f270; +fma.rn.f32 f275, f69, 0fBF1A4643, %34; +fma.rn.f32 f276, f72, 0fBF4C4ADB, 0f00000000; +fma.rn.f32 f277, f70, 0fBF1A4643, %35; +fma.rn.f32 f278, f71, 
0fBF4C4ADB, 0f00000000; +fma.rn.f32 f279, f73, 0fBE8C1D8E, f275; +fma.rn.f32 f280, f76, 0f3F763A35, f276; +fma.rn.f32 f281, f74, 0fBE8C1D8E, f277; +fma.rn.f32 f282, f75, 0f3F763A35, f278; +fma.rn.f32 f283, f77, 0f3F6EB680, f279; +fma.rn.f32 f284, f80, 0fBEB8F4AB, f280; +fma.rn.f32 f285, f78, 0f3F6EB680, f281; +fma.rn.f32 f286, f79, 0fBEB8F4AB, f282; +fma.rn.f32 f287, f81, 0fBF59A7D5, f283; +fma.rn.f32 f288, f84, 0fBF06C442, f284; +fma.rn.f32 f289, f82, 0fBF59A7D5, f285; +fma.rn.f32 f290, f83, 0fBF06C442, f286; +fma.rn.f32 f291, f85, 0f3DBCF732, f287; +fma.rn.f32 f292, f88, 0f3F7EE86F, f288; +fma.rn.f32 f293, f86, 0f3DBCF732, f289; +fma.rn.f32 f294, f87, 0f3F7EE86F, f290; +fma.rn.f32 f295, f89, 0f3F3D2FB0, f291; +fma.rn.f32 f296, f92, 0fBF2C7751, f292; +fma.rn.f32 f297, f90, 0f3F3D2FB0, f293; +fma.rn.f32 f298, f91, 0fBF2C7751, f294; +fma.rn.f32 f299, f93, 0fBF7BA420, f295; +fma.rn.f32 f300, f96, 0fBE3C28D5, f296; +fma.rn.f32 f301, f94, 0fBF7BA420, f297; +fma.rn.f32 f302, f95, 0fBE3C28D5, f298; +fma.rn.f32 f303, f97, 0f3EE437D1, f299; +fma.rn.f32 f304, f100, 0f3F65296C, f300; +fma.rn.f32 f305, f98, 0f3EE437D1, f301; +fma.rn.f32 f306, f99, 0f3F65296C, f302; +fma.rn.f32 f307, f69, 0fBF59A7D5, %34; +fma.rn.f32 f308, f72, 0fBF06C442, 0f00000000; +fma.rn.f32 f309, f70, 0fBF59A7D5, %35; +fma.rn.f32 f310, f71, 0fBF06C442, 0f00000000; +fma.rn.f32 f311, f73, 0f3EE437D1, f307; +fma.rn.f32 f312, f76, 0f3F65296C, f308; +fma.rn.f32 f313, f74, 0f3EE437D1, f309; +fma.rn.f32 f314, f75, 0f3F65296C, f310; +fma.rn.f32 f315, f77, 0f3DBCF732, f311; +fma.rn.f32 f316, f80, 0fBF7EE86F, f312; +fma.rn.f32 f317, f78, 0f3DBCF732, f313; +fma.rn.f32 f318, f79, 0fBF7EE86F, f314; +fma.rn.f32 f319, f81, 0fBF1A4643, f315; +fma.rn.f32 f320, f84, 0f3F4C4ADB, f316; +fma.rn.f32 f321, f82, 0fBF1A4643, f317; +fma.rn.f32 f322, f83, 0f3F4C4ADB, f318; +fma.rn.f32 f323, f85, 0f3F6EB680, f319; +fma.rn.f32 f324, f88, 0fBEB8F4AB, f320; +fma.rn.f32 f325, f86, 0f3F6EB680, f321; +fma.rn.f32 f326, f87, 0fBEB8F4AB, 
f322; +fma.rn.f32 f327, f89, 0fBF7BA420, f323; +fma.rn.f32 f328, f92, 0fBE3C28D5, f324; +fma.rn.f32 f329, f90, 0fBF7BA420, f325; +fma.rn.f32 f330, f91, 0fBE3C28D5, f326; +fma.rn.f32 f331, f93, 0f3F3D2FB0, f327; +fma.rn.f32 f332, f96, 0f3F2C7751, f328; +fma.rn.f32 f333, f94, 0f3F3D2FB0, f329; +fma.rn.f32 f334, f95, 0f3F2C7751, f330; +fma.rn.f32 f335, f97, 0fBE8C1D8E, f331; +fma.rn.f32 f336, f100, 0fBF763A35, f332; +fma.rn.f32 f337, f98, 0fBE8C1D8E, f333; +fma.rn.f32 f338, f99, 0fBF763A35, f334; +fma.rn.f32 f339, f69, 0fBF7BA420, %34; +fma.rn.f32 f340, f72, 0fBE3C28D5, 0f00000000; +fma.rn.f32 f341, f70, 0fBF7BA420, %35; +fma.rn.f32 f342, f71, 0fBE3C28D5, 0f00000000; +fma.rn.f32 f343, f73, 0f3F6EB680, f339; +fma.rn.f32 f344, f76, 0f3EB8F4AB, f340; +fma.rn.f32 f345, f74, 0f3F6EB680, f341; +fma.rn.f32 f346, f75, 0f3EB8F4AB, f342; +fma.rn.f32 f347, f77, 0fBF59A7D5, f343; +fma.rn.f32 f348, f80, 0fBF06C442, f344; +fma.rn.f32 f349, f78, 0fBF59A7D5, f345; +fma.rn.f32 f350, f79, 0fBF06C442, f346; +fma.rn.f32 f351, f81, 0f3F3D2FB0, f347; +fma.rn.f32 f352, f84, 0f3F2C7751, f348; +fma.rn.f32 f353, f82, 0f3F3D2FB0, f349; +fma.rn.f32 f354, f83, 0f3F2C7751, f350; +fma.rn.f32 f355, f85, 0fBF1A4643, f351; +fma.rn.f32 f356, f88, 0fBF4C4ADB, f352; +fma.rn.f32 f357, f86, 0fBF1A4643, f353; +fma.rn.f32 f358, f87, 0fBF4C4ADB, f354; +fma.rn.f32 f359, f89, 0f3EE437D1, f355; +fma.rn.f32 f360, f92, 0f3F65296C, f356; +fma.rn.f32 f361, f90, 0f3EE437D1, f357; +fma.rn.f32 f362, f91, 0f3F65296C, f358; +fma.rn.f32 f363, f93, 0fBE8C1D8E, f359; +fma.rn.f32 f364, f96, 0fBF763A35, f360; +fma.rn.f32 f365, f94, 0fBE8C1D8E, f361; +fma.rn.f32 f366, f95, 0fBF763A35, f362; +fma.rn.f32 f367, f97, 0f3DBCF732, f363; +fma.rn.f32 f368, f100, 0f3F7EE86F, f364; +fma.rn.f32 f369, f98, 0f3DBCF732, f365; +fma.rn.f32 f370, f99, 0f3F7EE86F, f366; +add.f32 %1, f114, f98; +add.f32 %0, f113, f97; +add.f32 %3, f145, f146; +sub.f32 %2, f143, f144; +add.f32 %5, f177, f178; +sub.f32 %4, f175, f176; +add.f32 %7, f209, f210; 
+sub.f32 %6, f207, f208; +add.f32 %9, f241, f242; +sub.f32 %8, f239, f240; +add.f32 %11, f273, f274; +sub.f32 %10, f271, f272; +add.f32 %13, f305, f306; +sub.f32 %12, f303, f304; +add.f32 %15, f337, f338; +sub.f32 %14, f335, f336; +add.f32 %17, f369, f370; +sub.f32 %16, f367, f368; +sub.f32 %19, f369, f370; +add.f32 %18, f367, f368; +sub.f32 %21, f337, f338; +add.f32 %20, f335, f336; +sub.f32 %23, f305, f306; +add.f32 %22, f303, f304; +sub.f32 %25, f273, f274; +add.f32 %24, f271, f272; +sub.f32 %27, f241, f242; +add.f32 %26, f239, f240; +sub.f32 %29, f209, f210; +add.f32 %28, f207, f208; +sub.f32 %31, f177, f178; +add.f32 %30, f175, f176; +sub.f32 %33, f145, f146; +add.f32 %32, f143, f144; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), 
"f"(rmem[16].x), "f"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..dba9f03bdef92 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp32_inv.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_17_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_17_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<205, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<405>; +.reg .b64 rd<2>; +add.f32 f69, %36, %76; +add.f32 f70, %38, %77; +sub.f32 f71, %36, %76; +sub.f32 f72, %38, %77; +add.f32 f73, %39, %74; +add.f32 f74, %41, %75; +sub.f32 f75, %39, %74; +sub.f32 f76, %41, %75; +add.f32 f77, %42, %71; +add.f32 f78, %43, %73; +sub.f32 f79, %42, %71; +sub.f32 f80, %43, %73; +add.f32 f81, %44, %68; +add.f32 f82, %46, %70; +sub.f32 f83, %44, %68; +sub.f32 f84, %46, %70; +add.f32 f85, %47, %66; +add.f32 f86, %49, %67; +sub.f32 f87, %47, %66; +sub.f32 f88, %49, %67; +add.f32 f89, %50, %63; +add.f32 f90, %51, %65; +sub.f32 f91, %50, %63; +sub.f32 f92, %51, %65; +add.f32 f93, %52, %60; +add.f32 f94, %54, %62; +sub.f32 f95, %52, %60; +sub.f32 f96, %54, %62; +add.f32 f97, %55, %58; +add.f32 f98, %57, %59; +sub.f32 f99, %55, %58; +sub.f32 f100, %57, %59; +add.f32 f101, %34, f69; +add.f32 f102, %35, f70; +add.f32 f103, f101, f73; +add.f32 f104, f102, f74; +add.f32 f105, f103, f77; +add.f32 f106, f104, f78; +add.f32 f107, f105, f81; +add.f32 f108, f106, f82; +add.f32 f109, f107, f85; +add.f32 f110, f108, f86; +add.f32 f111, f109, f89; +add.f32 f112, f110, f90; +add.f32 f113, f111, f93; +add.f32 f114, f112, f94; +fma.rn.f32 f115, f69, 0f3F6EB680, %34; +fma.rn.f32 
f116, f72, 0f3EB8F4AB, 0f00000000; +fma.rn.f32 f117, f70, 0f3F6EB680, %35; +fma.rn.f32 f118, f71, 0f3EB8F4AB, 0f00000000; +fma.rn.f32 f119, f73, 0f3F3D2FB0, f115; +fma.rn.f32 f120, f76, 0f3F2C7751, f116; +fma.rn.f32 f121, f74, 0f3F3D2FB0, f117; +fma.rn.f32 f122, f75, 0f3F2C7751, f118; +fma.rn.f32 f123, f77, 0f3EE437D1, f119; +fma.rn.f32 f124, f80, 0f3F65296C, f120; +fma.rn.f32 f125, f78, 0f3EE437D1, f121; +fma.rn.f32 f126, f79, 0f3F65296C, f122; +fma.rn.f32 f127, f81, 0f3DBCF732, f123; +fma.rn.f32 f128, f84, 0f3F7EE86F, f124; +fma.rn.f32 f129, f82, 0f3DBCF732, f125; +fma.rn.f32 f130, f83, 0f3F7EE86F, f126; +fma.rn.f32 f131, f85, 0fBE8C1D8E, f127; +fma.rn.f32 f132, f88, 0f3F763A35, f128; +fma.rn.f32 f133, f86, 0fBE8C1D8E, f129; +fma.rn.f32 f134, f87, 0f3F763A35, f130; +fma.rn.f32 f135, f89, 0fBF1A4643, f131; +fma.rn.f32 f136, f92, 0f3F4C4ADB, f132; +fma.rn.f32 f137, f90, 0fBF1A4643, f133; +fma.rn.f32 f138, f91, 0f3F4C4ADB, f134; +fma.rn.f32 f139, f93, 0fBF59A7D5, f135; +fma.rn.f32 f140, f96, 0f3F06C442, f136; +fma.rn.f32 f141, f94, 0fBF59A7D5, f137; +fma.rn.f32 f142, f95, 0f3F06C442, f138; +fma.rn.f32 f143, f97, 0fBF7BA420, f139; +fma.rn.f32 f144, f100, 0f3E3C28D5, f140; +fma.rn.f32 f145, f98, 0fBF7BA420, f141; +fma.rn.f32 f146, f99, 0f3E3C28D5, f142; +fma.rn.f32 f147, f69, 0f3F3D2FB0, %34; +fma.rn.f32 f148, f72, 0f3F2C7751, 0f00000000; +fma.rn.f32 f149, f70, 0f3F3D2FB0, %35; +fma.rn.f32 f150, f71, 0f3F2C7751, 0f00000000; +fma.rn.f32 f151, f73, 0f3DBCF732, f147; +fma.rn.f32 f152, f76, 0f3F7EE86F, f148; +fma.rn.f32 f153, f74, 0f3DBCF732, f149; +fma.rn.f32 f154, f75, 0f3F7EE86F, f150; +fma.rn.f32 f155, f77, 0fBF1A4643, f151; +fma.rn.f32 f156, f80, 0f3F4C4ADB, f152; +fma.rn.f32 f157, f78, 0fBF1A4643, f153; +fma.rn.f32 f158, f79, 0f3F4C4ADB, f154; +fma.rn.f32 f159, f81, 0fBF7BA420, f155; +fma.rn.f32 f160, f84, 0f3E3C28D5, f156; +fma.rn.f32 f161, f82, 0fBF7BA420, f157; +fma.rn.f32 f162, f83, 0f3E3C28D5, f158; +fma.rn.f32 f163, f85, 0fBF59A7D5, f159; +fma.rn.f32 f164, 
f88, 0fBF06C442, f160; +fma.rn.f32 f165, f86, 0fBF59A7D5, f161; +fma.rn.f32 f166, f87, 0fBF06C442, f162; +fma.rn.f32 f167, f89, 0fBE8C1D8E, f163; +fma.rn.f32 f168, f92, 0fBF763A35, f164; +fma.rn.f32 f169, f90, 0fBE8C1D8E, f165; +fma.rn.f32 f170, f91, 0fBF763A35, f166; +fma.rn.f32 f171, f93, 0f3EE437D1, f167; +fma.rn.f32 f172, f96, 0fBF65296C, f168; +fma.rn.f32 f173, f94, 0f3EE437D1, f169; +fma.rn.f32 f174, f95, 0fBF65296C, f170; +fma.rn.f32 f175, f97, 0f3F6EB680, f171; +fma.rn.f32 f176, f100, 0fBEB8F4AB, f172; +fma.rn.f32 f177, f98, 0f3F6EB680, f173; +fma.rn.f32 f178, f99, 0fBEB8F4AB, f174; +fma.rn.f32 f179, f69, 0f3EE437D1, %34; +fma.rn.f32 f180, f72, 0f3F65296C, 0f00000000; +fma.rn.f32 f181, f70, 0f3EE437D1, %35; +fma.rn.f32 f182, f71, 0f3F65296C, 0f00000000; +fma.rn.f32 f183, f73, 0fBF1A4643, f179; +fma.rn.f32 f184, f76, 0f3F4C4ADB, f180; +fma.rn.f32 f185, f74, 0fBF1A4643, f181; +fma.rn.f32 f186, f75, 0f3F4C4ADB, f182; +fma.rn.f32 f187, f77, 0fBF7BA420, f183; +fma.rn.f32 f188, f80, 0fBE3C28D5, f184; +fma.rn.f32 f189, f78, 0fBF7BA420, f185; +fma.rn.f32 f190, f79, 0fBE3C28D5, f186; +fma.rn.f32 f191, f81, 0fBE8C1D8E, f187; +fma.rn.f32 f192, f84, 0fBF763A35, f188; +fma.rn.f32 f193, f82, 0fBE8C1D8E, f189; +fma.rn.f32 f194, f83, 0fBF763A35, f190; +fma.rn.f32 f195, f85, 0f3F3D2FB0, f191; +fma.rn.f32 f196, f88, 0fBF2C7751, f192; +fma.rn.f32 f197, f86, 0f3F3D2FB0, f193; +fma.rn.f32 f198, f87, 0fBF2C7751, f194; +fma.rn.f32 f199, f89, 0f3F6EB680, f195; +fma.rn.f32 f200, f92, 0f3EB8F4AB, f196; +fma.rn.f32 f201, f90, 0f3F6EB680, f197; +fma.rn.f32 f202, f91, 0f3EB8F4AB, f198; +fma.rn.f32 f203, f93, 0f3DBCF732, f199; +fma.rn.f32 f204, f96, 0f3F7EE86F, f200; +fma.rn.f32 f205, f94, 0f3DBCF732, f201; +fma.rn.f32 f206, f95, 0f3F7EE86F, f202; +fma.rn.f32 f207, f97, 0fBF59A7D5, f203; +fma.rn.f32 f208, f100, 0f3F06C442, f204; +fma.rn.f32 f209, f98, 0fBF59A7D5, f205; +fma.rn.f32 f210, f99, 0f3F06C442, f206; +fma.rn.f32 f211, f69, 0f3DBCF732, %34; +fma.rn.f32 f212, f72, 0f3F7EE86F, 
0f00000000; +fma.rn.f32 f213, f70, 0f3DBCF732, %35; +fma.rn.f32 f214, f71, 0f3F7EE86F, 0f00000000; +fma.rn.f32 f215, f73, 0fBF7BA420, f211; +fma.rn.f32 f216, f76, 0f3E3C28D5, f212; +fma.rn.f32 f217, f74, 0fBF7BA420, f213; +fma.rn.f32 f218, f75, 0f3E3C28D5, f214; +fma.rn.f32 f219, f77, 0fBE8C1D8E, f215; +fma.rn.f32 f220, f80, 0fBF763A35, f216; +fma.rn.f32 f221, f78, 0fBE8C1D8E, f217; +fma.rn.f32 f222, f79, 0fBF763A35, f218; +fma.rn.f32 f223, f81, 0f3F6EB680, f219; +fma.rn.f32 f224, f84, 0fBEB8F4AB, f220; +fma.rn.f32 f225, f82, 0f3F6EB680, f221; +fma.rn.f32 f226, f83, 0fBEB8F4AB, f222; +fma.rn.f32 f227, f85, 0f3EE437D1, f223; +fma.rn.f32 f228, f88, 0f3F65296C, f224; +fma.rn.f32 f229, f86, 0f3EE437D1, f225; +fma.rn.f32 f230, f87, 0f3F65296C, f226; +fma.rn.f32 f231, f89, 0fBF59A7D5, f227; +fma.rn.f32 f232, f92, 0f3F06C442, f228; +fma.rn.f32 f233, f90, 0fBF59A7D5, f229; +fma.rn.f32 f234, f91, 0f3F06C442, f230; +fma.rn.f32 f235, f93, 0fBF1A4643, f231; +fma.rn.f32 f236, f96, 0fBF4C4ADB, f232; +fma.rn.f32 f237, f94, 0fBF1A4643, f233; +fma.rn.f32 f238, f95, 0fBF4C4ADB, f234; +fma.rn.f32 f239, f97, 0f3F3D2FB0, f235; +fma.rn.f32 f240, f100, 0fBF2C7751, f236; +fma.rn.f32 f241, f98, 0f3F3D2FB0, f237; +fma.rn.f32 f242, f99, 0fBF2C7751, f238; +fma.rn.f32 f243, f69, 0fBE8C1D8E, %34; +fma.rn.f32 f244, f72, 0f3F763A35, 0f00000000; +fma.rn.f32 f245, f70, 0fBE8C1D8E, %35; +fma.rn.f32 f246, f71, 0f3F763A35, 0f00000000; +fma.rn.f32 f247, f73, 0fBF59A7D5, f243; +fma.rn.f32 f248, f76, 0fBF06C442, f244; +fma.rn.f32 f249, f74, 0fBF59A7D5, f245; +fma.rn.f32 f250, f75, 0fBF06C442, f246; +fma.rn.f32 f251, f77, 0f3F3D2FB0, f247; +fma.rn.f32 f252, f80, 0fBF2C7751, f248; +fma.rn.f32 f253, f78, 0f3F3D2FB0, f249; +fma.rn.f32 f254, f79, 0fBF2C7751, f250; +fma.rn.f32 f255, f81, 0f3EE437D1, f251; +fma.rn.f32 f256, f84, 0f3F65296C, f252; +fma.rn.f32 f257, f82, 0f3EE437D1, f253; +fma.rn.f32 f258, f83, 0f3F65296C, f254; +fma.rn.f32 f259, f85, 0fBF7BA420, f255; +fma.rn.f32 f260, f88, 0f3E3C28D5, f256; 
+fma.rn.f32 f261, f86, 0fBF7BA420, f257; +fma.rn.f32 f262, f87, 0f3E3C28D5, f258; +fma.rn.f32 f263, f89, 0f3DBCF732, f259; +fma.rn.f32 f264, f92, 0fBF7EE86F, f260; +fma.rn.f32 f265, f90, 0f3DBCF732, f261; +fma.rn.f32 f266, f91, 0fBF7EE86F, f262; +fma.rn.f32 f267, f93, 0f3F6EB680, f263; +fma.rn.f32 f268, f96, 0f3EB8F4AB, f264; +fma.rn.f32 f269, f94, 0f3F6EB680, f265; +fma.rn.f32 f270, f95, 0f3EB8F4AB, f266; +fma.rn.f32 f271, f97, 0fBF1A4643, f267; +fma.rn.f32 f272, f100, 0f3F4C4ADB, f268; +fma.rn.f32 f273, f98, 0fBF1A4643, f269; +fma.rn.f32 f274, f99, 0f3F4C4ADB, f270; +fma.rn.f32 f275, f69, 0fBF1A4643, %34; +fma.rn.f32 f276, f72, 0f3F4C4ADB, 0f00000000; +fma.rn.f32 f277, f70, 0fBF1A4643, %35; +fma.rn.f32 f278, f71, 0f3F4C4ADB, 0f00000000; +fma.rn.f32 f279, f73, 0fBE8C1D8E, f275; +fma.rn.f32 f280, f76, 0fBF763A35, f276; +fma.rn.f32 f281, f74, 0fBE8C1D8E, f277; +fma.rn.f32 f282, f75, 0fBF763A35, f278; +fma.rn.f32 f283, f77, 0f3F6EB680, f279; +fma.rn.f32 f284, f80, 0f3EB8F4AB, f280; +fma.rn.f32 f285, f78, 0f3F6EB680, f281; +fma.rn.f32 f286, f79, 0f3EB8F4AB, f282; +fma.rn.f32 f287, f81, 0fBF59A7D5, f283; +fma.rn.f32 f288, f84, 0f3F06C442, f284; +fma.rn.f32 f289, f82, 0fBF59A7D5, f285; +fma.rn.f32 f290, f83, 0f3F06C442, f286; +fma.rn.f32 f291, f85, 0f3DBCF732, f287; +fma.rn.f32 f292, f88, 0fBF7EE86F, f288; +fma.rn.f32 f293, f86, 0f3DBCF732, f289; +fma.rn.f32 f294, f87, 0fBF7EE86F, f290; +fma.rn.f32 f295, f89, 0f3F3D2FB0, f291; +fma.rn.f32 f296, f92, 0f3F2C7751, f292; +fma.rn.f32 f297, f90, 0f3F3D2FB0, f293; +fma.rn.f32 f298, f91, 0f3F2C7751, f294; +fma.rn.f32 f299, f93, 0fBF7BA420, f295; +fma.rn.f32 f300, f96, 0f3E3C28D5, f296; +fma.rn.f32 f301, f94, 0fBF7BA420, f297; +fma.rn.f32 f302, f95, 0f3E3C28D5, f298; +fma.rn.f32 f303, f97, 0f3EE437D1, f299; +fma.rn.f32 f304, f100, 0fBF65296C, f300; +fma.rn.f32 f305, f98, 0f3EE437D1, f301; +fma.rn.f32 f306, f99, 0fBF65296C, f302; +fma.rn.f32 f307, f69, 0fBF59A7D5, %34; +fma.rn.f32 f308, f72, 0f3F06C442, 0f00000000; +fma.rn.f32 
f309, f70, 0fBF59A7D5, %35; +fma.rn.f32 f310, f71, 0f3F06C442, 0f00000000; +fma.rn.f32 f311, f73, 0f3EE437D1, f307; +fma.rn.f32 f312, f76, 0fBF65296C, f308; +fma.rn.f32 f313, f74, 0f3EE437D1, f309; +fma.rn.f32 f314, f75, 0fBF65296C, f310; +fma.rn.f32 f315, f77, 0f3DBCF732, f311; +fma.rn.f32 f316, f80, 0f3F7EE86F, f312; +fma.rn.f32 f317, f78, 0f3DBCF732, f313; +fma.rn.f32 f318, f79, 0f3F7EE86F, f314; +fma.rn.f32 f319, f81, 0fBF1A4643, f315; +fma.rn.f32 f320, f84, 0fBF4C4ADB, f316; +fma.rn.f32 f321, f82, 0fBF1A4643, f317; +fma.rn.f32 f322, f83, 0fBF4C4ADB, f318; +fma.rn.f32 f323, f85, 0f3F6EB680, f319; +fma.rn.f32 f324, f88, 0f3EB8F4AB, f320; +fma.rn.f32 f325, f86, 0f3F6EB680, f321; +fma.rn.f32 f326, f87, 0f3EB8F4AB, f322; +fma.rn.f32 f327, f89, 0fBF7BA420, f323; +fma.rn.f32 f328, f92, 0f3E3C28D5, f324; +fma.rn.f32 f329, f90, 0fBF7BA420, f325; +fma.rn.f32 f330, f91, 0f3E3C28D5, f326; +fma.rn.f32 f331, f93, 0f3F3D2FB0, f327; +fma.rn.f32 f332, f96, 0fBF2C7751, f328; +fma.rn.f32 f333, f94, 0f3F3D2FB0, f329; +fma.rn.f32 f334, f95, 0fBF2C7751, f330; +fma.rn.f32 f335, f97, 0fBE8C1D8E, f331; +fma.rn.f32 f336, f100, 0f3F763A35, f332; +fma.rn.f32 f337, f98, 0fBE8C1D8E, f333; +fma.rn.f32 f338, f99, 0f3F763A35, f334; +fma.rn.f32 f339, f69, 0fBF7BA420, %34; +fma.rn.f32 f340, f72, 0f3E3C28D5, 0f00000000; +fma.rn.f32 f341, f70, 0fBF7BA420, %35; +fma.rn.f32 f342, f71, 0f3E3C28D5, 0f00000000; +fma.rn.f32 f343, f73, 0f3F6EB680, f339; +fma.rn.f32 f344, f76, 0fBEB8F4AB, f340; +fma.rn.f32 f345, f74, 0f3F6EB680, f341; +fma.rn.f32 f346, f75, 0fBEB8F4AB, f342; +fma.rn.f32 f347, f77, 0fBF59A7D5, f343; +fma.rn.f32 f348, f80, 0f3F06C442, f344; +fma.rn.f32 f349, f78, 0fBF59A7D5, f345; +fma.rn.f32 f350, f79, 0f3F06C442, f346; +fma.rn.f32 f351, f81, 0f3F3D2FB0, f347; +fma.rn.f32 f352, f84, 0fBF2C7751, f348; +fma.rn.f32 f353, f82, 0f3F3D2FB0, f349; +fma.rn.f32 f354, f83, 0fBF2C7751, f350; +fma.rn.f32 f355, f85, 0fBF1A4643, f351; +fma.rn.f32 f356, f88, 0f3F4C4ADB, f352; +fma.rn.f32 f357, f86, 
0fBF1A4643, f353; +fma.rn.f32 f358, f87, 0f3F4C4ADB, f354; +fma.rn.f32 f359, f89, 0f3EE437D1, f355; +fma.rn.f32 f360, f92, 0fBF65296C, f356; +fma.rn.f32 f361, f90, 0f3EE437D1, f357; +fma.rn.f32 f362, f91, 0fBF65296C, f358; +fma.rn.f32 f363, f93, 0fBE8C1D8E, f359; +fma.rn.f32 f364, f96, 0f3F763A35, f360; +fma.rn.f32 f365, f94, 0fBE8C1D8E, f361; +fma.rn.f32 f366, f95, 0f3F763A35, f362; +fma.rn.f32 f367, f97, 0f3DBCF732, f363; +fma.rn.f32 f368, f100, 0fBF7EE86F, f364; +fma.rn.f32 f369, f98, 0f3DBCF732, f365; +fma.rn.f32 f370, f99, 0fBF7EE86F, f366; +add.f32 %1, f114, f98; +add.f32 %0, f113, f97; +add.f32 %3, f145, f146; +sub.f32 %2, f143, f144; +add.f32 %5, f177, f178; +sub.f32 %4, f175, f176; +add.f32 %7, f209, f210; +sub.f32 %6, f207, f208; +add.f32 %9, f241, f242; +sub.f32 %8, f239, f240; +add.f32 %11, f273, f274; +sub.f32 %10, f271, f272; +add.f32 %13, f305, f306; +sub.f32 %12, f303, f304; +add.f32 %15, f337, f338; +sub.f32 %14, f335, f336; +add.f32 %17, f369, f370; +sub.f32 %16, f367, f368; +sub.f32 %19, f369, f370; +add.f32 %18, f367, f368; +sub.f32 %21, f337, f338; +add.f32 %20, f335, f336; +sub.f32 %23, f305, f306; +add.f32 %22, f303, f304; +sub.f32 %25, f273, f274; +add.f32 %24, f271, f272; +sub.f32 %27, f241, f242; +add.f32 %26, f239, f240; +sub.f32 %29, f209, f210; +add.f32 %28, f207, f208; +sub.f32 %31, f177, f178; +add.f32 %30, f175, f176; +sub.f32 %33, f145, f146; +add.f32 %32, f143, f144; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), 
"=f"(rmem[16].x), "=f"(rmem[16].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..6f0ac904cf808 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_fwd.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_17_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_17_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<407, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<405>; +.reg .b64 rd<2>; +add.f64 fd69, %36, %76; +add.f64 fd70, %38, %77; +sub.f64 fd71, %36, %76; +sub.f64 fd72, %38, %77; +add.f64 fd73, %39, %74; +add.f64 fd74, %41, %75; +sub.f64 fd75, %39, %74; +sub.f64 fd76, %41, %75; +add.f64 fd77, %42, %71; +add.f64 fd78, %43, %73; +sub.f64 fd79, %42, %71; +sub.f64 fd80, %43, %73; +add.f64 fd81, %44, %68; +add.f64 fd82, %46, %70; +sub.f64 fd83, %44, %68; +sub.f64 fd84, %46, %70; +add.f64 fd85, %47, %66; +add.f64 fd86, %49, %67; +sub.f64 
fd87, %47, %66; +sub.f64 fd88, %49, %67; +add.f64 fd89, %50, %63; +add.f64 fd90, %51, %65; +sub.f64 fd91, %50, %63; +sub.f64 fd92, %51, %65; +add.f64 fd93, %52, %60; +add.f64 fd94, %54, %62; +sub.f64 fd95, %52, %60; +sub.f64 fd96, %54, %62; +add.f64 fd97, %55, %58; +add.f64 fd98, %57, %59; +sub.f64 fd99, %55, %58; +sub.f64 fd100, %57, %59; +add.f64 fd101, %34, fd69; +add.f64 fd102, %35, fd70; +add.f64 fd103, fd101, fd73; +add.f64 fd104, fd102, fd74; +add.f64 fd105, fd103, fd77; +add.f64 fd106, fd104, fd78; +add.f64 fd107, fd105, fd81; +add.f64 fd108, fd106, fd82; +add.f64 fd109, fd107, fd85; +add.f64 fd110, fd108, fd86; +add.f64 fd111, fd109, fd89; +add.f64 fd112, fd110, fd90; +add.f64 fd113, fd111, fd93; +add.f64 fd114, fd112, fd94; +fma.rn.f64 fd115, fd69, 0d3FEDD6D000370991, %34; +fma.rn.f64 fd116, fd72, 0dBFD71E955D8E7CDC, 0d0000000000000000; +fma.rn.f64 fd117, fd70, 0d3FEDD6D000370991, %35; +fma.rn.f64 fd118, fd71, 0dBFD71E955D8E7CDC, 0d0000000000000000; +fma.rn.f64 fd119, fd73, 0d3FE7A5F6075D4884, fd115; +fma.rn.f64 fd120, fd76, 0dBFE58EEA2A9D6DA3, fd116; +fma.rn.f64 fd121, fd74, 0d3FE7A5F6075D4884, fd117; +fma.rn.f64 fd122, fd75, 0dBFE58EEA2A9D6DA3, fd118; +fma.rn.f64 fd123, fd77, 0d3FDC86FA2B2883CD, fd119; +fma.rn.f64 fd124, fd80, 0dBFECA52D7C9E640B, fd120; +fma.rn.f64 fd125, fd78, 0d3FDC86FA2B2883CD, fd121; +fma.rn.f64 fd126, fd79, 0dBFECA52D7C9E640B, fd122; +fma.rn.f64 fd127, fd81, 0d3FB79EE63259B75E, fd123; +fma.rn.f64 fd128, fd84, 0dBFEFDD0DEB564B22, fd124; +fma.rn.f64 fd129, fd82, 0d3FB79EE63259B75E, fd125; +fma.rn.f64 fd130, fd83, 0dBFEFDD0DEB564B22, fd126; +fma.rn.f64 fd131, fd85, 0dBFD183B1C61F0D01, fd127; +fma.rn.f64 fd132, fd88, 0dBFEEC746923C349F, fd128; +fma.rn.f64 fd133, fd86, 0dBFD183B1C61F0D01, fd129; +fma.rn.f64 fd134, fd87, 0dBFEEC746923C349F, fd130; +fma.rn.f64 fd135, fd89, 0dBFE348C86ED5F1BB, fd131; +fma.rn.f64 fd136, fd92, 0dBFE9895B6C9A05F6, fd132; +fma.rn.f64 fd137, fd90, 0dBFE348C86ED5F1BB, fd133; +fma.rn.f64 fd138, fd91, 
0dBFE9895B6C9A05F6, fd134; +fma.rn.f64 fd139, fd93, 0dBFEB34FA910EA3B9, fd135; +fma.rn.f64 fd140, fd96, 0dBFE0D8884363DD80, fd136; +fma.rn.f64 fd141, fd94, 0dBFEB34FA910EA3B9, fd137; +fma.rn.f64 fd142, fd95, 0dBFE0D8884363DD80, fd138; +fma.rn.f64 fd143, fd97, 0dBFEF7484007FAEF3, fd139; +fma.rn.f64 fd144, fd100, 0dBFC7851AACD6C6B4, fd140; +fma.rn.f64 fd145, fd98, 0dBFEF7484007FAEF3, fd141; +fma.rn.f64 fd146, fd99, 0dBFC7851AACD6C6B4, fd142; +fma.rn.f64 fd147, fd69, 0d3FE7A5F6075D4884, %34; +fma.rn.f64 fd148, fd72, 0dBFE58EEA2A9D6DA3, 0d0000000000000000; +fma.rn.f64 fd149, fd70, 0d3FE7A5F6075D4884, %35; +fma.rn.f64 fd150, fd71, 0dBFE58EEA2A9D6DA3, 0d0000000000000000; +fma.rn.f64 fd151, fd73, 0d3FB79EE63259B75E, fd147; +fma.rn.f64 fd152, fd76, 0dBFEFDD0DEB564B22, fd148; +fma.rn.f64 fd153, fd74, 0d3FB79EE63259B75E, fd149; +fma.rn.f64 fd154, fd75, 0dBFEFDD0DEB564B22, fd150; +fma.rn.f64 fd155, fd77, 0dBFE348C86ED5F1BB, fd151; +fma.rn.f64 fd156, fd80, 0dBFE9895B6C9A05F6, fd152; +fma.rn.f64 fd157, fd78, 0dBFE348C86ED5F1BB, fd153; +fma.rn.f64 fd158, fd79, 0dBFE9895B6C9A05F6, fd154; +fma.rn.f64 fd159, fd81, 0dBFEF7484007FAEF3, fd155; +fma.rn.f64 fd160, fd84, 0dBFC7851AACD6C6B4, fd156; +fma.rn.f64 fd161, fd82, 0dBFEF7484007FAEF3, fd157; +fma.rn.f64 fd162, fd83, 0dBFC7851AACD6C6B4, fd158; +fma.rn.f64 fd163, fd85, 0dBFEB34FA910EA3B9, fd159; +fma.rn.f64 fd164, fd88, 0d3FE0D8884363DD80, fd160; +fma.rn.f64 fd165, fd86, 0dBFEB34FA910EA3B9, fd161; +fma.rn.f64 fd166, fd87, 0d3FE0D8884363DD80, fd162; +fma.rn.f64 fd167, fd89, 0dBFD183B1C61F0D01, fd163; +fma.rn.f64 fd168, fd92, 0d3FEEC746923C349F, fd164; +fma.rn.f64 fd169, fd90, 0dBFD183B1C61F0D01, fd165; +fma.rn.f64 fd170, fd91, 0d3FEEC746923C349F, fd166; +fma.rn.f64 fd171, fd93, 0d3FDC86FA2B2883CD, fd167; +fma.rn.f64 fd172, fd96, 0d3FECA52D7C9E640B, fd168; +fma.rn.f64 fd173, fd94, 0d3FDC86FA2B2883CD, fd169; +fma.rn.f64 fd174, fd95, 0d3FECA52D7C9E640B, fd170; +fma.rn.f64 fd175, fd97, 0d3FEDD6D000370991, fd171; +fma.rn.f64 fd176, fd100, 
0d3FD71E955D8E7CDC, fd172; +fma.rn.f64 fd177, fd98, 0d3FEDD6D000370991, fd173; +fma.rn.f64 fd178, fd99, 0d3FD71E955D8E7CDC, fd174; +fma.rn.f64 fd179, fd69, 0d3FDC86FA2B2883CD, %34; +fma.rn.f64 fd180, fd72, 0dBFECA52D7C9E640B, 0d0000000000000000; +fma.rn.f64 fd181, fd70, 0d3FDC86FA2B2883CD, %35; +fma.rn.f64 fd182, fd71, 0dBFECA52D7C9E640B, 0d0000000000000000; +fma.rn.f64 fd183, fd73, 0dBFE348C86ED5F1BB, fd179; +fma.rn.f64 fd184, fd76, 0dBFE9895B6C9A05F6, fd180; +fma.rn.f64 fd185, fd74, 0dBFE348C86ED5F1BB, fd181; +fma.rn.f64 fd186, fd75, 0dBFE9895B6C9A05F6, fd182; +fma.rn.f64 fd187, fd77, 0dBFEF7484007FAEF3, fd183; +fma.rn.f64 fd188, fd80, 0d3FC7851AACD6C6B4, fd184; +fma.rn.f64 fd189, fd78, 0dBFEF7484007FAEF3, fd185; +fma.rn.f64 fd190, fd79, 0d3FC7851AACD6C6B4, fd186; +fma.rn.f64 fd191, fd81, 0dBFD183B1C61F0D01, fd187; +fma.rn.f64 fd192, fd84, 0d3FEEC746923C349F, fd188; +fma.rn.f64 fd193, fd82, 0dBFD183B1C61F0D01, fd189; +fma.rn.f64 fd194, fd83, 0d3FEEC746923C349F, fd190; +fma.rn.f64 fd195, fd85, 0d3FE7A5F6075D4884, fd191; +fma.rn.f64 fd196, fd88, 0d3FE58EEA2A9D6DA3, fd192; +fma.rn.f64 fd197, fd86, 0d3FE7A5F6075D4884, fd193; +fma.rn.f64 fd198, fd87, 0d3FE58EEA2A9D6DA3, fd194; +fma.rn.f64 fd199, fd89, 0d3FEDD6D000370991, fd195; +fma.rn.f64 fd200, fd92, 0dBFD71E955D8E7CDC, fd196; +fma.rn.f64 fd201, fd90, 0d3FEDD6D000370991, fd197; +fma.rn.f64 fd202, fd91, 0dBFD71E955D8E7CDC, fd198; +fma.rn.f64 fd203, fd93, 0d3FB79EE63259B75E, fd199; +fma.rn.f64 fd204, fd96, 0dBFEFDD0DEB564B22, fd200; +fma.rn.f64 fd205, fd94, 0d3FB79EE63259B75E, fd201; +fma.rn.f64 fd206, fd95, 0dBFEFDD0DEB564B22, fd202; +fma.rn.f64 fd207, fd97, 0dBFEB34FA910EA3B9, fd203; +fma.rn.f64 fd208, fd100, 0dBFE0D8884363DD80, fd204; +fma.rn.f64 fd209, fd98, 0dBFEB34FA910EA3B9, fd205; +fma.rn.f64 fd210, fd99, 0dBFE0D8884363DD80, fd206; +fma.rn.f64 fd211, fd69, 0d3FB79EE63259B75E, %34; +fma.rn.f64 fd212, fd72, 0dBFEFDD0DEB564B22, 0d0000000000000000; +fma.rn.f64 fd213, fd70, 0d3FB79EE63259B75E, %35; +fma.rn.f64 
fd214, fd71, 0dBFEFDD0DEB564B22, 0d0000000000000000; +fma.rn.f64 fd215, fd73, 0dBFEF7484007FAEF3, fd211; +fma.rn.f64 fd216, fd76, 0dBFC7851AACD6C6B4, fd212; +fma.rn.f64 fd217, fd74, 0dBFEF7484007FAEF3, fd213; +fma.rn.f64 fd218, fd75, 0dBFC7851AACD6C6B4, fd214; +fma.rn.f64 fd219, fd77, 0dBFD183B1C61F0D01, fd215; +fma.rn.f64 fd220, fd80, 0d3FEEC746923C349F, fd216; +fma.rn.f64 fd221, fd78, 0dBFD183B1C61F0D01, fd217; +fma.rn.f64 fd222, fd79, 0d3FEEC746923C349F, fd218; +fma.rn.f64 fd223, fd81, 0d3FEDD6D000370991, fd219; +fma.rn.f64 fd224, fd84, 0d3FD71E955D8E7CDC, fd220; +fma.rn.f64 fd225, fd82, 0d3FEDD6D000370991, fd221; +fma.rn.f64 fd226, fd83, 0d3FD71E955D8E7CDC, fd222; +fma.rn.f64 fd227, fd85, 0d3FDC86FA2B2883CD, fd223; +fma.rn.f64 fd228, fd88, 0dBFECA52D7C9E640B, fd224; +fma.rn.f64 fd229, fd86, 0d3FDC86FA2B2883CD, fd225; +fma.rn.f64 fd230, fd87, 0dBFECA52D7C9E640B, fd226; +fma.rn.f64 fd231, fd89, 0dBFEB34FA910EA3B9, fd227; +fma.rn.f64 fd232, fd92, 0dBFE0D8884363DD80, fd228; +fma.rn.f64 fd233, fd90, 0dBFEB34FA910EA3B9, fd229; +fma.rn.f64 fd234, fd91, 0dBFE0D8884363DD80, fd230; +fma.rn.f64 fd235, fd93, 0dBFE348C86ED5F1BB, fd231; +fma.rn.f64 fd236, fd96, 0d3FE9895B6C9A05F6, fd232; +fma.rn.f64 fd237, fd94, 0dBFE348C86ED5F1BB, fd233; +fma.rn.f64 fd238, fd95, 0d3FE9895B6C9A05F6, fd234; +fma.rn.f64 fd239, fd97, 0d3FE7A5F6075D4884, fd235; +fma.rn.f64 fd240, fd100, 0d3FE58EEA2A9D6DA3, fd236; +fma.rn.f64 fd241, fd98, 0d3FE7A5F6075D4884, fd237; +fma.rn.f64 fd242, fd99, 0d3FE58EEA2A9D6DA3, fd238; +fma.rn.f64 fd243, fd69, 0dBFD183B1C61F0D01, %34; +fma.rn.f64 fd244, fd72, 0dBFEEC746923C349F, 0d0000000000000000; +fma.rn.f64 fd245, fd70, 0dBFD183B1C61F0D01, %35; +fma.rn.f64 fd246, fd71, 0dBFEEC746923C349F, 0d0000000000000000; +fma.rn.f64 fd247, fd73, 0dBFEB34FA910EA3B9, fd243; +fma.rn.f64 fd248, fd76, 0d3FE0D8884363DD80, fd244; +fma.rn.f64 fd249, fd74, 0dBFEB34FA910EA3B9, fd245; +fma.rn.f64 fd250, fd75, 0d3FE0D8884363DD80, fd246; +fma.rn.f64 fd251, fd77, 0d3FE7A5F6075D4884, fd247; 
+fma.rn.f64 fd252, fd80, 0d3FE58EEA2A9D6DA3, fd248; +fma.rn.f64 fd253, fd78, 0d3FE7A5F6075D4884, fd249; +fma.rn.f64 fd254, fd79, 0d3FE58EEA2A9D6DA3, fd250; +fma.rn.f64 fd255, fd81, 0d3FDC86FA2B2883CD, fd251; +fma.rn.f64 fd256, fd84, 0dBFECA52D7C9E640B, fd252; +fma.rn.f64 fd257, fd82, 0d3FDC86FA2B2883CD, fd253; +fma.rn.f64 fd258, fd83, 0dBFECA52D7C9E640B, fd254; +fma.rn.f64 fd259, fd85, 0dBFEF7484007FAEF3, fd255; +fma.rn.f64 fd260, fd88, 0dBFC7851AACD6C6B4, fd256; +fma.rn.f64 fd261, fd86, 0dBFEF7484007FAEF3, fd257; +fma.rn.f64 fd262, fd87, 0dBFC7851AACD6C6B4, fd258; +fma.rn.f64 fd263, fd89, 0d3FB79EE63259B75E, fd259; +fma.rn.f64 fd264, fd92, 0d3FEFDD0DEB564B22, fd260; +fma.rn.f64 fd265, fd90, 0d3FB79EE63259B75E, fd261; +fma.rn.f64 fd266, fd91, 0d3FEFDD0DEB564B22, fd262; +fma.rn.f64 fd267, fd93, 0d3FEDD6D000370991, fd263; +fma.rn.f64 fd268, fd96, 0dBFD71E955D8E7CDC, fd264; +fma.rn.f64 fd269, fd94, 0d3FEDD6D000370991, fd265; +fma.rn.f64 fd270, fd95, 0dBFD71E955D8E7CDC, fd266; +fma.rn.f64 fd271, fd97, 0dBFE348C86ED5F1BB, fd267; +fma.rn.f64 fd272, fd100, 0dBFE9895B6C9A05F6, fd268; +fma.rn.f64 fd273, fd98, 0dBFE348C86ED5F1BB, fd269; +fma.rn.f64 fd274, fd99, 0dBFE9895B6C9A05F6, fd270; +fma.rn.f64 fd275, fd69, 0dBFE348C86ED5F1BB, %34; +fma.rn.f64 fd276, fd72, 0dBFE9895B6C9A05F6, 0d0000000000000000; +fma.rn.f64 fd277, fd70, 0dBFE348C86ED5F1BB, %35; +fma.rn.f64 fd278, fd71, 0dBFE9895B6C9A05F6, 0d0000000000000000; +fma.rn.f64 fd279, fd73, 0dBFD183B1C61F0D01, fd275; +fma.rn.f64 fd280, fd76, 0d3FEEC746923C349F, fd276; +fma.rn.f64 fd281, fd74, 0dBFD183B1C61F0D01, fd277; +fma.rn.f64 fd282, fd75, 0d3FEEC746923C349F, fd278; +fma.rn.f64 fd283, fd77, 0d3FEDD6D000370991, fd279; +fma.rn.f64 fd284, fd80, 0dBFD71E955D8E7CDC, fd280; +fma.rn.f64 fd285, fd78, 0d3FEDD6D000370991, fd281; +fma.rn.f64 fd286, fd79, 0dBFD71E955D8E7CDC, fd282; +fma.rn.f64 fd287, fd81, 0dBFEB34FA910EA3B9, fd283; +fma.rn.f64 fd288, fd84, 0dBFE0D8884363DD80, fd284; +fma.rn.f64 fd289, fd82, 0dBFEB34FA910EA3B9, fd285; 
+fma.rn.f64 fd290, fd83, 0dBFE0D8884363DD80, fd286; +fma.rn.f64 fd291, fd85, 0d3FB79EE63259B75E, fd287; +fma.rn.f64 fd292, fd88, 0d3FEFDD0DEB564B22, fd288; +fma.rn.f64 fd293, fd86, 0d3FB79EE63259B75E, fd289; +fma.rn.f64 fd294, fd87, 0d3FEFDD0DEB564B22, fd290; +fma.rn.f64 fd295, fd89, 0d3FE7A5F6075D4884, fd291; +fma.rn.f64 fd296, fd92, 0dBFE58EEA2A9D6DA3, fd292; +fma.rn.f64 fd297, fd90, 0d3FE7A5F6075D4884, fd293; +fma.rn.f64 fd298, fd91, 0dBFE58EEA2A9D6DA3, fd294; +fma.rn.f64 fd299, fd93, 0dBFEF7484007FAEF3, fd295; +fma.rn.f64 fd300, fd96, 0dBFC7851AACD6C6B4, fd296; +fma.rn.f64 fd301, fd94, 0dBFEF7484007FAEF3, fd297; +fma.rn.f64 fd302, fd95, 0dBFC7851AACD6C6B4, fd298; +fma.rn.f64 fd303, fd97, 0d3FDC86FA2B2883CD, fd299; +fma.rn.f64 fd304, fd100, 0d3FECA52D7C9E640B, fd300; +fma.rn.f64 fd305, fd98, 0d3FDC86FA2B2883CD, fd301; +fma.rn.f64 fd306, fd99, 0d3FECA52D7C9E640B, fd302; +fma.rn.f64 fd307, fd69, 0dBFEB34FA910EA3B9, %34; +fma.rn.f64 fd308, fd72, 0dBFE0D8884363DD80, 0d0000000000000000; +fma.rn.f64 fd309, fd70, 0dBFEB34FA910EA3B9, %35; +fma.rn.f64 fd310, fd71, 0dBFE0D8884363DD80, 0d0000000000000000; +fma.rn.f64 fd311, fd73, 0d3FDC86FA2B2883CD, fd307; +fma.rn.f64 fd312, fd76, 0d3FECA52D7C9E640B, fd308; +fma.rn.f64 fd313, fd74, 0d3FDC86FA2B2883CD, fd309; +fma.rn.f64 fd314, fd75, 0d3FECA52D7C9E640B, fd310; +fma.rn.f64 fd315, fd77, 0d3FB79EE63259B75E, fd311; +fma.rn.f64 fd316, fd80, 0dBFEFDD0DEB564B22, fd312; +fma.rn.f64 fd317, fd78, 0d3FB79EE63259B75E, fd313; +fma.rn.f64 fd318, fd79, 0dBFEFDD0DEB564B22, fd314; +fma.rn.f64 fd319, fd81, 0dBFE348C86ED5F1BB, fd315; +fma.rn.f64 fd320, fd84, 0d3FE9895B6C9A05F6, fd316; +fma.rn.f64 fd321, fd82, 0dBFE348C86ED5F1BB, fd317; +fma.rn.f64 fd322, fd83, 0d3FE9895B6C9A05F6, fd318; +fma.rn.f64 fd323, fd85, 0d3FEDD6D000370991, fd319; +fma.rn.f64 fd324, fd88, 0dBFD71E955D8E7CDC, fd320; +fma.rn.f64 fd325, fd86, 0d3FEDD6D000370991, fd321; +fma.rn.f64 fd326, fd87, 0dBFD71E955D8E7CDC, fd322; +fma.rn.f64 fd327, fd89, 0dBFEF7484007FAEF3, fd323; 
+fma.rn.f64 fd328, fd92, 0dBFC7851AACD6C6B4, fd324; +fma.rn.f64 fd329, fd90, 0dBFEF7484007FAEF3, fd325; +fma.rn.f64 fd330, fd91, 0dBFC7851AACD6C6B4, fd326; +fma.rn.f64 fd331, fd93, 0d3FE7A5F6075D4884, fd327; +fma.rn.f64 fd332, fd96, 0d3FE58EEA2A9D6DA3, fd328; +fma.rn.f64 fd333, fd94, 0d3FE7A5F6075D4884, fd329; +fma.rn.f64 fd334, fd95, 0d3FE58EEA2A9D6DA3, fd330; +fma.rn.f64 fd335, fd97, 0dBFD183B1C61F0D01, fd331; +fma.rn.f64 fd336, fd100, 0dBFEEC746923C349F, fd332; +fma.rn.f64 fd337, fd98, 0dBFD183B1C61F0D01, fd333; +fma.rn.f64 fd338, fd99, 0dBFEEC746923C349F, fd334; +fma.rn.f64 fd339, fd69, 0dBFEF7484007FAEF3, %34; +fma.rn.f64 fd340, fd72, 0dBFC7851AACD6C6B4, 0d0000000000000000; +fma.rn.f64 fd341, fd70, 0dBFEF7484007FAEF3, %35; +fma.rn.f64 fd342, fd71, 0dBFC7851AACD6C6B4, 0d0000000000000000; +fma.rn.f64 fd343, fd73, 0d3FEDD6D000370991, fd339; +fma.rn.f64 fd344, fd76, 0d3FD71E955D8E7CDC, fd340; +fma.rn.f64 fd345, fd74, 0d3FEDD6D000370991, fd341; +fma.rn.f64 fd346, fd75, 0d3FD71E955D8E7CDC, fd342; +fma.rn.f64 fd347, fd77, 0dBFEB34FA910EA3B9, fd343; +fma.rn.f64 fd348, fd80, 0dBFE0D8884363DD80, fd344; +fma.rn.f64 fd349, fd78, 0dBFEB34FA910EA3B9, fd345; +fma.rn.f64 fd350, fd79, 0dBFE0D8884363DD80, fd346; +fma.rn.f64 fd351, fd81, 0d3FE7A5F6075D4884, fd347; +fma.rn.f64 fd352, fd84, 0d3FE58EEA2A9D6DA3, fd348; +fma.rn.f64 fd353, fd82, 0d3FE7A5F6075D4884, fd349; +fma.rn.f64 fd354, fd83, 0d3FE58EEA2A9D6DA3, fd350; +fma.rn.f64 fd355, fd85, 0dBFE348C86ED5F1BB, fd351; +fma.rn.f64 fd356, fd88, 0dBFE9895B6C9A05F6, fd352; +fma.rn.f64 fd357, fd86, 0dBFE348C86ED5F1BB, fd353; +fma.rn.f64 fd358, fd87, 0dBFE9895B6C9A05F6, fd354; +fma.rn.f64 fd359, fd89, 0d3FDC86FA2B2883CD, fd355; +fma.rn.f64 fd360, fd92, 0d3FECA52D7C9E640B, fd356; +fma.rn.f64 fd361, fd90, 0d3FDC86FA2B2883CD, fd357; +fma.rn.f64 fd362, fd91, 0d3FECA52D7C9E640B, fd358; +fma.rn.f64 fd363, fd93, 0dBFD183B1C61F0D01, fd359; +fma.rn.f64 fd364, fd96, 0dBFEEC746923C349F, fd360; +fma.rn.f64 fd365, fd94, 0dBFD183B1C61F0D01, fd361; 
+fma.rn.f64 fd366, fd95, 0dBFEEC746923C349F, fd362; +fma.rn.f64 fd367, fd97, 0d3FB79EE63259B75E, fd363; +fma.rn.f64 fd368, fd100, 0d3FEFDD0DEB564B22, fd364; +fma.rn.f64 fd369, fd98, 0d3FB79EE63259B75E, fd365; +fma.rn.f64 fd370, fd99, 0d3FEFDD0DEB564B22, fd366; +add.f64 %1, fd114, fd98; +add.f64 %0, fd113, fd97; +add.f64 %3, fd145, fd146; +sub.f64 %2, fd143, fd144; +add.f64 %5, fd177, fd178; +sub.f64 %4, fd175, fd176; +add.f64 %7, fd209, fd210; +sub.f64 %6, fd207, fd208; +add.f64 %9, fd241, fd242; +sub.f64 %8, fd239, fd240; +add.f64 %11, fd273, fd274; +sub.f64 %10, fd271, fd272; +add.f64 %13, fd305, fd306; +sub.f64 %12, fd303, fd304; +add.f64 %15, fd337, fd338; +sub.f64 %14, fd335, fd336; +add.f64 %17, fd369, fd370; +sub.f64 %16, fd367, fd368; +sub.f64 %19, fd369, fd370; +add.f64 %18, fd367, fd368; +sub.f64 %21, fd337, fd338; +add.f64 %20, fd335, fd336; +sub.f64 %23, fd305, fd306; +add.f64 %22, fd303, fd304; +sub.f64 %25, fd273, fd274; +add.f64 %24, fd271, fd272; +sub.f64 %27, fd241, fd242; +add.f64 %26, fd239, fd240; +sub.f64 %29, fd209, fd210; +add.f64 %28, fd207, fd208; +sub.f64 %31, fd177, fd178; +add.f64 %30, fd175, fd176; +sub.f64 %33, fd145, fd146; +add.f64 %32, fd143, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), 
"d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..8a29231c1cdad --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_17_fp64_inv.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_17_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_17_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<578, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<405>; +.reg .b64 rd<2>; +add.f64 fd69, %36, %76; +add.f64 fd70, %38, %77; +sub.f64 fd71, %36, %76; +sub.f64 fd72, %38, %77; +add.f64 fd73, %39, %74; +add.f64 fd74, %41, %75; +sub.f64 fd75, %39, %74; +sub.f64 fd76, %41, %75; +add.f64 fd77, %42, %71; +add.f64 fd78, %43, %73; +sub.f64 fd79, %42, %71; +sub.f64 fd80, %43, %73; +add.f64 fd81, %44, %68; +add.f64 fd82, %46, %70; +sub.f64 fd83, %44, %68; +sub.f64 fd84, %46, %70; +add.f64 fd85, %47, %66; +add.f64 fd86, %49, %67; +sub.f64 fd87, %47, %66; +sub.f64 fd88, %49, %67; +add.f64 fd89, %50, %63; +add.f64 fd90, %51, %65; +sub.f64 fd91, %50, %63; +sub.f64 fd92, %51, %65; +add.f64 fd93, %52, %60; +add.f64 fd94, %54, %62; +sub.f64 fd95, %52, %60; +sub.f64 
fd96, %54, %62; +add.f64 fd97, %55, %58; +add.f64 fd98, %57, %59; +sub.f64 fd99, %55, %58; +sub.f64 fd100, %57, %59; +add.f64 fd101, %34, fd69; +add.f64 fd102, %35, fd70; +add.f64 fd103, fd101, fd73; +add.f64 fd104, fd102, fd74; +add.f64 fd105, fd103, fd77; +add.f64 fd106, fd104, fd78; +add.f64 fd107, fd105, fd81; +add.f64 fd108, fd106, fd82; +add.f64 fd109, fd107, fd85; +add.f64 fd110, fd108, fd86; +add.f64 fd111, fd109, fd89; +add.f64 fd112, fd110, fd90; +add.f64 fd113, fd111, fd93; +add.f64 fd114, fd112, fd94; +fma.rn.f64 fd115, fd69, 0d3FEDD6D000370991, %34; +fma.rn.f64 fd116, fd72, 0d3FD71E955D8E7CDC, 0d0000000000000000; +fma.rn.f64 fd117, fd70, 0d3FEDD6D000370991, %35; +fma.rn.f64 fd118, fd71, 0d3FD71E955D8E7CDC, 0d0000000000000000; +fma.rn.f64 fd119, fd73, 0d3FE7A5F6075D4884, fd115; +fma.rn.f64 fd120, fd76, 0d3FE58EEA2A9D6DA3, fd116; +fma.rn.f64 fd121, fd74, 0d3FE7A5F6075D4884, fd117; +fma.rn.f64 fd122, fd75, 0d3FE58EEA2A9D6DA3, fd118; +fma.rn.f64 fd123, fd77, 0d3FDC86FA2B2883CD, fd119; +fma.rn.f64 fd124, fd80, 0d3FECA52D7C9E640B, fd120; +fma.rn.f64 fd125, fd78, 0d3FDC86FA2B2883CD, fd121; +fma.rn.f64 fd126, fd79, 0d3FECA52D7C9E640B, fd122; +fma.rn.f64 fd127, fd81, 0d3FB79EE63259B75E, fd123; +fma.rn.f64 fd128, fd84, 0d3FEFDD0DEB564B22, fd124; +fma.rn.f64 fd129, fd82, 0d3FB79EE63259B75E, fd125; +fma.rn.f64 fd130, fd83, 0d3FEFDD0DEB564B22, fd126; +fma.rn.f64 fd131, fd85, 0dBFD183B1C61F0D01, fd127; +fma.rn.f64 fd132, fd88, 0d3FEEC746923C349F, fd128; +fma.rn.f64 fd133, fd86, 0dBFD183B1C61F0D01, fd129; +fma.rn.f64 fd134, fd87, 0d3FEEC746923C349F, fd130; +fma.rn.f64 fd135, fd89, 0dBFE348C86ED5F1BB, fd131; +fma.rn.f64 fd136, fd92, 0d3FE9895B6C9A05F6, fd132; +fma.rn.f64 fd137, fd90, 0dBFE348C86ED5F1BB, fd133; +fma.rn.f64 fd138, fd91, 0d3FE9895B6C9A05F6, fd134; +fma.rn.f64 fd139, fd93, 0dBFEB34FA910EA3B9, fd135; +fma.rn.f64 fd140, fd96, 0d3FE0D8884363DD80, fd136; +fma.rn.f64 fd141, fd94, 0dBFEB34FA910EA3B9, fd137; +fma.rn.f64 fd142, fd95, 0d3FE0D8884363DD80, fd138; 
+fma.rn.f64 fd143, fd97, 0dBFEF7484007FAEF3, fd139; +fma.rn.f64 fd144, fd100, 0d3FC7851AACD6C6B4, fd140; +fma.rn.f64 fd145, fd98, 0dBFEF7484007FAEF3, fd141; +fma.rn.f64 fd146, fd99, 0d3FC7851AACD6C6B4, fd142; +fma.rn.f64 fd147, fd69, 0d3FE7A5F6075D4884, %34; +fma.rn.f64 fd148, fd72, 0d3FE58EEA2A9D6DA3, 0d0000000000000000; +fma.rn.f64 fd149, fd70, 0d3FE7A5F6075D4884, %35; +fma.rn.f64 fd150, fd71, 0d3FE58EEA2A9D6DA3, 0d0000000000000000; +fma.rn.f64 fd151, fd73, 0d3FB79EE63259B75E, fd147; +fma.rn.f64 fd152, fd76, 0d3FEFDD0DEB564B22, fd148; +fma.rn.f64 fd153, fd74, 0d3FB79EE63259B75E, fd149; +fma.rn.f64 fd154, fd75, 0d3FEFDD0DEB564B22, fd150; +fma.rn.f64 fd155, fd77, 0dBFE348C86ED5F1BB, fd151; +fma.rn.f64 fd156, fd80, 0d3FE9895B6C9A05F6, fd152; +fma.rn.f64 fd157, fd78, 0dBFE348C86ED5F1BB, fd153; +fma.rn.f64 fd158, fd79, 0d3FE9895B6C9A05F6, fd154; +fma.rn.f64 fd159, fd81, 0dBFEF7484007FAEF3, fd155; +fma.rn.f64 fd160, fd84, 0d3FC7851AACD6C6B4, fd156; +fma.rn.f64 fd161, fd82, 0dBFEF7484007FAEF3, fd157; +fma.rn.f64 fd162, fd83, 0d3FC7851AACD6C6B4, fd158; +fma.rn.f64 fd163, fd85, 0dBFEB34FA910EA3B9, fd159; +fma.rn.f64 fd164, fd88, 0dBFE0D8884363DD80, fd160; +fma.rn.f64 fd165, fd86, 0dBFEB34FA910EA3B9, fd161; +fma.rn.f64 fd166, fd87, 0dBFE0D8884363DD80, fd162; +fma.rn.f64 fd167, fd89, 0dBFD183B1C61F0D01, fd163; +fma.rn.f64 fd168, fd92, 0dBFEEC746923C349F, fd164; +fma.rn.f64 fd169, fd90, 0dBFD183B1C61F0D01, fd165; +fma.rn.f64 fd170, fd91, 0dBFEEC746923C349F, fd166; +fma.rn.f64 fd171, fd93, 0d3FDC86FA2B2883CD, fd167; +fma.rn.f64 fd172, fd96, 0dBFECA52D7C9E640B, fd168; +fma.rn.f64 fd173, fd94, 0d3FDC86FA2B2883CD, fd169; +fma.rn.f64 fd174, fd95, 0dBFECA52D7C9E640B, fd170; +fma.rn.f64 fd175, fd97, 0d3FEDD6D000370991, fd171; +fma.rn.f64 fd176, fd100, 0dBFD71E955D8E7CDC, fd172; +fma.rn.f64 fd177, fd98, 0d3FEDD6D000370991, fd173; +fma.rn.f64 fd178, fd99, 0dBFD71E955D8E7CDC, fd174; +fma.rn.f64 fd179, fd69, 0d3FDC86FA2B2883CD, %34; +fma.rn.f64 fd180, fd72, 0d3FECA52D7C9E640B, 
0d0000000000000000; +fma.rn.f64 fd181, fd70, 0d3FDC86FA2B2883CD, %35; +fma.rn.f64 fd182, fd71, 0d3FECA52D7C9E640B, 0d0000000000000000; +fma.rn.f64 fd183, fd73, 0dBFE348C86ED5F1BB, fd179; +fma.rn.f64 fd184, fd76, 0d3FE9895B6C9A05F6, fd180; +fma.rn.f64 fd185, fd74, 0dBFE348C86ED5F1BB, fd181; +fma.rn.f64 fd186, fd75, 0d3FE9895B6C9A05F6, fd182; +fma.rn.f64 fd187, fd77, 0dBFEF7484007FAEF3, fd183; +fma.rn.f64 fd188, fd80, 0dBFC7851AACD6C6B4, fd184; +fma.rn.f64 fd189, fd78, 0dBFEF7484007FAEF3, fd185; +fma.rn.f64 fd190, fd79, 0dBFC7851AACD6C6B4, fd186; +fma.rn.f64 fd191, fd81, 0dBFD183B1C61F0D01, fd187; +fma.rn.f64 fd192, fd84, 0dBFEEC746923C349F, fd188; +fma.rn.f64 fd193, fd82, 0dBFD183B1C61F0D01, fd189; +fma.rn.f64 fd194, fd83, 0dBFEEC746923C349F, fd190; +fma.rn.f64 fd195, fd85, 0d3FE7A5F6075D4884, fd191; +fma.rn.f64 fd196, fd88, 0dBFE58EEA2A9D6DA3, fd192; +fma.rn.f64 fd197, fd86, 0d3FE7A5F6075D4884, fd193; +fma.rn.f64 fd198, fd87, 0dBFE58EEA2A9D6DA3, fd194; +fma.rn.f64 fd199, fd89, 0d3FEDD6D000370991, fd195; +fma.rn.f64 fd200, fd92, 0d3FD71E955D8E7CDC, fd196; +fma.rn.f64 fd201, fd90, 0d3FEDD6D000370991, fd197; +fma.rn.f64 fd202, fd91, 0d3FD71E955D8E7CDC, fd198; +fma.rn.f64 fd203, fd93, 0d3FB79EE63259B75E, fd199; +fma.rn.f64 fd204, fd96, 0d3FEFDD0DEB564B22, fd200; +fma.rn.f64 fd205, fd94, 0d3FB79EE63259B75E, fd201; +fma.rn.f64 fd206, fd95, 0d3FEFDD0DEB564B22, fd202; +fma.rn.f64 fd207, fd97, 0dBFEB34FA910EA3B9, fd203; +fma.rn.f64 fd208, fd100, 0d3FE0D8884363DD80, fd204; +fma.rn.f64 fd209, fd98, 0dBFEB34FA910EA3B9, fd205; +fma.rn.f64 fd210, fd99, 0d3FE0D8884363DD80, fd206; +fma.rn.f64 fd211, fd69, 0d3FB79EE63259B75E, %34; +fma.rn.f64 fd212, fd72, 0d3FEFDD0DEB564B22, 0d0000000000000000; +fma.rn.f64 fd213, fd70, 0d3FB79EE63259B75E, %35; +fma.rn.f64 fd214, fd71, 0d3FEFDD0DEB564B22, 0d0000000000000000; +fma.rn.f64 fd215, fd73, 0dBFEF7484007FAEF3, fd211; +fma.rn.f64 fd216, fd76, 0d3FC7851AACD6C6B4, fd212; +fma.rn.f64 fd217, fd74, 0dBFEF7484007FAEF3, fd213; +fma.rn.f64 fd218, 
fd75, 0d3FC7851AACD6C6B4, fd214; +fma.rn.f64 fd219, fd77, 0dBFD183B1C61F0D01, fd215; +fma.rn.f64 fd220, fd80, 0dBFEEC746923C349F, fd216; +fma.rn.f64 fd221, fd78, 0dBFD183B1C61F0D01, fd217; +fma.rn.f64 fd222, fd79, 0dBFEEC746923C349F, fd218; +fma.rn.f64 fd223, fd81, 0d3FEDD6D000370991, fd219; +fma.rn.f64 fd224, fd84, 0dBFD71E955D8E7CDC, fd220; +fma.rn.f64 fd225, fd82, 0d3FEDD6D000370991, fd221; +fma.rn.f64 fd226, fd83, 0dBFD71E955D8E7CDC, fd222; +fma.rn.f64 fd227, fd85, 0d3FDC86FA2B2883CD, fd223; +fma.rn.f64 fd228, fd88, 0d3FECA52D7C9E640B, fd224; +fma.rn.f64 fd229, fd86, 0d3FDC86FA2B2883CD, fd225; +fma.rn.f64 fd230, fd87, 0d3FECA52D7C9E640B, fd226; +fma.rn.f64 fd231, fd89, 0dBFEB34FA910EA3B9, fd227; +fma.rn.f64 fd232, fd92, 0d3FE0D8884363DD80, fd228; +fma.rn.f64 fd233, fd90, 0dBFEB34FA910EA3B9, fd229; +fma.rn.f64 fd234, fd91, 0d3FE0D8884363DD80, fd230; +fma.rn.f64 fd235, fd93, 0dBFE348C86ED5F1BB, fd231; +fma.rn.f64 fd236, fd96, 0dBFE9895B6C9A05F6, fd232; +fma.rn.f64 fd237, fd94, 0dBFE348C86ED5F1BB, fd233; +fma.rn.f64 fd238, fd95, 0dBFE9895B6C9A05F6, fd234; +fma.rn.f64 fd239, fd97, 0d3FE7A5F6075D4884, fd235; +fma.rn.f64 fd240, fd100, 0dBFE58EEA2A9D6DA3, fd236; +fma.rn.f64 fd241, fd98, 0d3FE7A5F6075D4884, fd237; +fma.rn.f64 fd242, fd99, 0dBFE58EEA2A9D6DA3, fd238; +fma.rn.f64 fd243, fd69, 0dBFD183B1C61F0D01, %34; +fma.rn.f64 fd244, fd72, 0d3FEEC746923C349F, 0d0000000000000000; +fma.rn.f64 fd245, fd70, 0dBFD183B1C61F0D01, %35; +fma.rn.f64 fd246, fd71, 0d3FEEC746923C349F, 0d0000000000000000; +fma.rn.f64 fd247, fd73, 0dBFEB34FA910EA3B9, fd243; +fma.rn.f64 fd248, fd76, 0dBFE0D8884363DD80, fd244; +fma.rn.f64 fd249, fd74, 0dBFEB34FA910EA3B9, fd245; +fma.rn.f64 fd250, fd75, 0dBFE0D8884363DD80, fd246; +fma.rn.f64 fd251, fd77, 0d3FE7A5F6075D4884, fd247; +fma.rn.f64 fd252, fd80, 0dBFE58EEA2A9D6DA3, fd248; +fma.rn.f64 fd253, fd78, 0d3FE7A5F6075D4884, fd249; +fma.rn.f64 fd254, fd79, 0dBFE58EEA2A9D6DA3, fd250; +fma.rn.f64 fd255, fd81, 0d3FDC86FA2B2883CD, fd251; +fma.rn.f64 fd256, 
fd84, 0d3FECA52D7C9E640B, fd252; +fma.rn.f64 fd257, fd82, 0d3FDC86FA2B2883CD, fd253; +fma.rn.f64 fd258, fd83, 0d3FECA52D7C9E640B, fd254; +fma.rn.f64 fd259, fd85, 0dBFEF7484007FAEF3, fd255; +fma.rn.f64 fd260, fd88, 0d3FC7851AACD6C6B4, fd256; +fma.rn.f64 fd261, fd86, 0dBFEF7484007FAEF3, fd257; +fma.rn.f64 fd262, fd87, 0d3FC7851AACD6C6B4, fd258; +fma.rn.f64 fd263, fd89, 0d3FB79EE63259B75E, fd259; +fma.rn.f64 fd264, fd92, 0dBFEFDD0DEB564B22, fd260; +fma.rn.f64 fd265, fd90, 0d3FB79EE63259B75E, fd261; +fma.rn.f64 fd266, fd91, 0dBFEFDD0DEB564B22, fd262; +fma.rn.f64 fd267, fd93, 0d3FEDD6D000370991, fd263; +fma.rn.f64 fd268, fd96, 0d3FD71E955D8E7CDC, fd264; +fma.rn.f64 fd269, fd94, 0d3FEDD6D000370991, fd265; +fma.rn.f64 fd270, fd95, 0d3FD71E955D8E7CDC, fd266; +fma.rn.f64 fd271, fd97, 0dBFE348C86ED5F1BB, fd267; +fma.rn.f64 fd272, fd100, 0d3FE9895B6C9A05F6, fd268; +fma.rn.f64 fd273, fd98, 0dBFE348C86ED5F1BB, fd269; +fma.rn.f64 fd274, fd99, 0d3FE9895B6C9A05F6, fd270; +fma.rn.f64 fd275, fd69, 0dBFE348C86ED5F1BB, %34; +fma.rn.f64 fd276, fd72, 0d3FE9895B6C9A05F6, 0d0000000000000000; +fma.rn.f64 fd277, fd70, 0dBFE348C86ED5F1BB, %35; +fma.rn.f64 fd278, fd71, 0d3FE9895B6C9A05F6, 0d0000000000000000; +fma.rn.f64 fd279, fd73, 0dBFD183B1C61F0D01, fd275; +fma.rn.f64 fd280, fd76, 0dBFEEC746923C349F, fd276; +fma.rn.f64 fd281, fd74, 0dBFD183B1C61F0D01, fd277; +fma.rn.f64 fd282, fd75, 0dBFEEC746923C349F, fd278; +fma.rn.f64 fd283, fd77, 0d3FEDD6D000370991, fd279; +fma.rn.f64 fd284, fd80, 0d3FD71E955D8E7CDC, fd280; +fma.rn.f64 fd285, fd78, 0d3FEDD6D000370991, fd281; +fma.rn.f64 fd286, fd79, 0d3FD71E955D8E7CDC, fd282; +fma.rn.f64 fd287, fd81, 0dBFEB34FA910EA3B9, fd283; +fma.rn.f64 fd288, fd84, 0d3FE0D8884363DD80, fd284; +fma.rn.f64 fd289, fd82, 0dBFEB34FA910EA3B9, fd285; +fma.rn.f64 fd290, fd83, 0d3FE0D8884363DD80, fd286; +fma.rn.f64 fd291, fd85, 0d3FB79EE63259B75E, fd287; +fma.rn.f64 fd292, fd88, 0dBFEFDD0DEB564B22, fd288; +fma.rn.f64 fd293, fd86, 0d3FB79EE63259B75E, fd289; +fma.rn.f64 fd294, 
fd87, 0dBFEFDD0DEB564B22, fd290; +fma.rn.f64 fd295, fd89, 0d3FE7A5F6075D4884, fd291; +fma.rn.f64 fd296, fd92, 0d3FE58EEA2A9D6DA3, fd292; +fma.rn.f64 fd297, fd90, 0d3FE7A5F6075D4884, fd293; +fma.rn.f64 fd298, fd91, 0d3FE58EEA2A9D6DA3, fd294; +fma.rn.f64 fd299, fd93, 0dBFEF7484007FAEF3, fd295; +fma.rn.f64 fd300, fd96, 0d3FC7851AACD6C6B4, fd296; +fma.rn.f64 fd301, fd94, 0dBFEF7484007FAEF3, fd297; +fma.rn.f64 fd302, fd95, 0d3FC7851AACD6C6B4, fd298; +fma.rn.f64 fd303, fd97, 0d3FDC86FA2B2883CD, fd299; +fma.rn.f64 fd304, fd100, 0dBFECA52D7C9E640B, fd300; +fma.rn.f64 fd305, fd98, 0d3FDC86FA2B2883CD, fd301; +fma.rn.f64 fd306, fd99, 0dBFECA52D7C9E640B, fd302; +fma.rn.f64 fd307, fd69, 0dBFEB34FA910EA3B9, %34; +fma.rn.f64 fd308, fd72, 0d3FE0D8884363DD80, 0d0000000000000000; +fma.rn.f64 fd309, fd70, 0dBFEB34FA910EA3B9, %35; +fma.rn.f64 fd310, fd71, 0d3FE0D8884363DD80, 0d0000000000000000; +fma.rn.f64 fd311, fd73, 0d3FDC86FA2B2883CD, fd307; +fma.rn.f64 fd312, fd76, 0dBFECA52D7C9E640B, fd308; +fma.rn.f64 fd313, fd74, 0d3FDC86FA2B2883CD, fd309; +fma.rn.f64 fd314, fd75, 0dBFECA52D7C9E640B, fd310; +fma.rn.f64 fd315, fd77, 0d3FB79EE63259B75E, fd311; +fma.rn.f64 fd316, fd80, 0d3FEFDD0DEB564B22, fd312; +fma.rn.f64 fd317, fd78, 0d3FB79EE63259B75E, fd313; +fma.rn.f64 fd318, fd79, 0d3FEFDD0DEB564B22, fd314; +fma.rn.f64 fd319, fd81, 0dBFE348C86ED5F1BB, fd315; +fma.rn.f64 fd320, fd84, 0dBFE9895B6C9A05F6, fd316; +fma.rn.f64 fd321, fd82, 0dBFE348C86ED5F1BB, fd317; +fma.rn.f64 fd322, fd83, 0dBFE9895B6C9A05F6, fd318; +fma.rn.f64 fd323, fd85, 0d3FEDD6D000370991, fd319; +fma.rn.f64 fd324, fd88, 0d3FD71E955D8E7CDC, fd320; +fma.rn.f64 fd325, fd86, 0d3FEDD6D000370991, fd321; +fma.rn.f64 fd326, fd87, 0d3FD71E955D8E7CDC, fd322; +fma.rn.f64 fd327, fd89, 0dBFEF7484007FAEF3, fd323; +fma.rn.f64 fd328, fd92, 0d3FC7851AACD6C6B4, fd324; +fma.rn.f64 fd329, fd90, 0dBFEF7484007FAEF3, fd325; +fma.rn.f64 fd330, fd91, 0d3FC7851AACD6C6B4, fd326; +fma.rn.f64 fd331, fd93, 0d3FE7A5F6075D4884, fd327; +fma.rn.f64 fd332, 
fd96, 0dBFE58EEA2A9D6DA3, fd328; +fma.rn.f64 fd333, fd94, 0d3FE7A5F6075D4884, fd329; +fma.rn.f64 fd334, fd95, 0dBFE58EEA2A9D6DA3, fd330; +fma.rn.f64 fd335, fd97, 0dBFD183B1C61F0D01, fd331; +fma.rn.f64 fd336, fd100, 0d3FEEC746923C349F, fd332; +fma.rn.f64 fd337, fd98, 0dBFD183B1C61F0D01, fd333; +fma.rn.f64 fd338, fd99, 0d3FEEC746923C349F, fd334; +fma.rn.f64 fd339, fd69, 0dBFEF7484007FAEF3, %34; +fma.rn.f64 fd340, fd72, 0d3FC7851AACD6C6B4, 0d0000000000000000; +fma.rn.f64 fd341, fd70, 0dBFEF7484007FAEF3, %35; +fma.rn.f64 fd342, fd71, 0d3FC7851AACD6C6B4, 0d0000000000000000; +fma.rn.f64 fd343, fd73, 0d3FEDD6D000370991, fd339; +fma.rn.f64 fd344, fd76, 0dBFD71E955D8E7CDC, fd340; +fma.rn.f64 fd345, fd74, 0d3FEDD6D000370991, fd341; +fma.rn.f64 fd346, fd75, 0dBFD71E955D8E7CDC, fd342; +fma.rn.f64 fd347, fd77, 0dBFEB34FA910EA3B9, fd343; +fma.rn.f64 fd348, fd80, 0d3FE0D8884363DD80, fd344; +fma.rn.f64 fd349, fd78, 0dBFEB34FA910EA3B9, fd345; +fma.rn.f64 fd350, fd79, 0d3FE0D8884363DD80, fd346; +fma.rn.f64 fd351, fd81, 0d3FE7A5F6075D4884, fd347; +fma.rn.f64 fd352, fd84, 0dBFE58EEA2A9D6DA3, fd348; +fma.rn.f64 fd353, fd82, 0d3FE7A5F6075D4884, fd349; +fma.rn.f64 fd354, fd83, 0dBFE58EEA2A9D6DA3, fd350; +fma.rn.f64 fd355, fd85, 0dBFE348C86ED5F1BB, fd351; +fma.rn.f64 fd356, fd88, 0d3FE9895B6C9A05F6, fd352; +fma.rn.f64 fd357, fd86, 0dBFE348C86ED5F1BB, fd353; +fma.rn.f64 fd358, fd87, 0d3FE9895B6C9A05F6, fd354; +fma.rn.f64 fd359, fd89, 0d3FDC86FA2B2883CD, fd355; +fma.rn.f64 fd360, fd92, 0dBFECA52D7C9E640B, fd356; +fma.rn.f64 fd361, fd90, 0d3FDC86FA2B2883CD, fd357; +fma.rn.f64 fd362, fd91, 0dBFECA52D7C9E640B, fd358; +fma.rn.f64 fd363, fd93, 0dBFD183B1C61F0D01, fd359; +fma.rn.f64 fd364, fd96, 0d3FEEC746923C349F, fd360; +fma.rn.f64 fd365, fd94, 0dBFD183B1C61F0D01, fd361; +fma.rn.f64 fd366, fd95, 0d3FEEC746923C349F, fd362; +fma.rn.f64 fd367, fd97, 0d3FB79EE63259B75E, fd363; +fma.rn.f64 fd368, fd100, 0dBFEFDD0DEB564B22, fd364; +fma.rn.f64 fd369, fd98, 0d3FB79EE63259B75E, fd365; +fma.rn.f64 fd370, 
fd99, 0dBFEFDD0DEB564B22, fd366; +add.f64 %1, fd114, fd98; +add.f64 %0, fd113, fd97; +add.f64 %3, fd145, fd146; +sub.f64 %2, fd143, fd144; +add.f64 %5, fd177, fd178; +sub.f64 %4, fd175, fd176; +add.f64 %7, fd209, fd210; +sub.f64 %6, fd207, fd208; +add.f64 %9, fd241, fd242; +sub.f64 %8, fd239, fd240; +add.f64 %11, fd273, fd274; +sub.f64 %10, fd271, fd272; +add.f64 %13, fd305, fd306; +sub.f64 %12, fd303, fd304; +add.f64 %15, fd337, fd338; +sub.f64 %14, fd335, fd336; +add.f64 %17, fd369, fd370; +sub.f64 %16, fd367, fd368; +sub.f64 %19, fd369, fd370; +add.f64 %18, fd367, fd368; +sub.f64 %21, fd337, fd338; +add.f64 %20, fd335, fd336; +sub.f64 %23, fd305, fd306; +add.f64 %22, fd303, fd304; +sub.f64 %25, fd273, fd274; +add.f64 %24, fd271, fd272; +sub.f64 %27, fd241, fd242; +add.f64 %26, fd239, fd240; +sub.f64 %29, fd209, fd210; +add.f64 %28, fd207, fd208; +sub.f64 %31, fd177, fd178; +add.f64 %30, fd175, fd176; +sub.f64 %33, fd145, fd146; +add.f64 %32, fd143, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), 
"d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..f13dcda1bcdce --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_fwd.hpp.inc @@ -0,0 +1,1630 @@ +#ifndef CUFFTDX_FFT_18_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_18_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<750, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<115>; +.reg .b32 r<1373>; +.reg .f64 fd<91>; +.reg .b64 rd<2>; +mov.f64 fd67, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd67; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd68, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd68; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %48, %60; +} +{ +add.f16x2 r4, %36, r1; +} +{ +add.f16x2 r7, %49, %61; +} +{ +add.f16x2 r10, %37, r7; +} +{ +add.f16x2 r13, %48, %60; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %36, r16; +} +{ +sub.f16x2 r22, %49, %61; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %48, %60; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %36, r34; +} +{ +sub.f16x2 r40, %49, %61; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %49, %61; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %37, r52; +} +{ +sub.f16x2 r58, %48, %60; +} +{ +mul.f16x2 
r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %49, %61; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %37, r70; +} +{ +sub.f16x2 r76, %48, %60; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd67; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd68; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %52, %64; +} +{ +add.f16x2 r88, %40, r85; +} +{ +add.f16x2 r91, %53, %65; +} +{ +add.f16x2 r94, %41, r91; +} +{ +add.f16x2 r97, %52, %64; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %40, r100; +} +{ +sub.f16x2 r106, %53, %65; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %52, %64; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %40, r118; +} +{ +sub.f16x2 r124, %53, %65; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %53, %65; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %41, r136; +} +{ +sub.f16x2 r142, %52, %64; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %53, %65; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %41, r154; +} +{ +sub.f16x2 r160, %52, %64; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs9, fd67; +} +mov.b32 r240, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd68; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r249, {rs11, rs11}; +{ +add.f16x2 r169, %56, %68; +} +{ +add.f16x2 r172, %44, r169; +} +{ +add.f16x2 r175, %57, %69; +} +{ +add.f16x2 r178, %45, r175; +} +{ +add.f16x2 r181, %56, %68; +} +{ +mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %44, r184; +} +{ +sub.f16x2 r190, %57, %69; +} +{ +mul.f16x2 r193, r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %56, %68; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %44, r202; +} +{ +sub.f16x2 r208, %57, %69; +} +{ +mul.f16x2 r211, r208, r249; +} +{ 
+sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %57, %69; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %45, r220; +} +{ +sub.f16x2 r226, %56, %68; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %57, %69; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %45, r238; +} +{ +sub.f16x2 r244, %56, %68; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd59, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs13, fd59; +} +mov.f64 fd70, 0dBFE491B7523C161D; +{ +cvt.rn.f16.f64 rs14, fd70; +} +mov.f64 fd63, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs15, fd63; +} +mov.f64 fd66, 0dBFEF838B8C811C17; +{ +cvt.rn.f16.f64 rs16, fd66; +} +mov.f64 fd71, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs19, fd71; +} +mov.f64 fd72, 0dBFD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs20, fd72; +} +mov.b32 r267, {rs13, rs13}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs14, rs14}; +{ +mul.f16x2 r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs15, rs15}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs16, rs16}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ +mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r250, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, r315, r310; +} +{ +cvt.rn.f16.f64 rs29, fd67; +} +mov.b32 r388, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd68; +} +{ +neg.f16 rs31, rs30; +} +mov.b32 r397, {rs31, rs31}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 r320, r4, r317; +} +{ +add.f16x2 
r323, r94, r178; +} +{ +add.f16x2 r326, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 r344, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 r398, r389, r395; +} +{ +cvt.rn.f16.f64 rs33, fd67; +} +mov.b32 r472, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd68; +} +{ +neg.f16 rs35, rs34; +} +mov.b32 r481, {rs35, rs35}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 r404, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 r410, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 r428, r419, r425; +} +{ +add.f16x2 r431, r259, r275; +} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ +add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 r461, r458, r481; +} +{ +sub.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 r482, r473, 
r479; +} +{ +cvt.rn.f16.f64 rs37, fd67; +} +mov.b32 r556, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd68; +} +{ +neg.f16 rs39, rs38; +} +mov.b32 r565, {rs39, rs39}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 r488, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 r494, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 r512, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 r530, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 r548, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 r566, r557, r563; +} +{ +cvt.rn.f16.f64 rs41, fd67; +} +mov.b32 r640, {rs41, rs41}; +{ +cvt.rn.f16.f64 rs42, fd68; +} +{ +neg.f16 rs43, rs42; +} +mov.b32 r649, {rs43, rs43}; +{ +add.f16x2 r569, %50, %62; +} +{ +add.f16x2 r572, %38, r569; +} +{ +add.f16x2 r575, %51, %63; +} +{ +add.f16x2 r578, %39, r575; +} +{ +add.f16x2 r581, %50, %62; +} +{ +mul.f16x2 r584, r581, r640; +} +{ +add.f16x2 r587, %38, r584; +} +{ +sub.f16x2 r590, %51, %63; +} +{ +mul.f16x2 r593, r590, r649; +} +{ +add.f16x2 r596, r587, r593; +} +{ +add.f16x2 r599, %50, %62; +} +{ +mul.f16x2 r602, r599, r640; +} +{ +add.f16x2 r605, %38, r602; +} +{ +sub.f16x2 r608, %51, %63; +} +{ +mul.f16x2 r611, r608, r649; +} +{ +sub.f16x2 r614, r605, r611; +} +{ +add.f16x2 r617, %51, %63; +} +{ +mul.f16x2 r620, r617, r640; +} +{ +add.f16x2 r623, %39, r620; +} +{ +sub.f16x2 r626, %50, %62; +} +{ +mul.f16x2 r629, r626, r649; 
+} +{ +sub.f16x2 r632, r623, r629; +} +{ +add.f16x2 r635, %51, %63; +} +{ +mul.f16x2 r638, r635, r640; +} +{ +add.f16x2 r641, %39, r638; +} +{ +sub.f16x2 r644, %50, %62; +} +{ +mul.f16x2 r647, r644, r649; +} +{ +add.f16x2 r650, r641, r647; +} +{ +cvt.rn.f16.f64 rs45, fd67; +} +mov.b32 r724, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs46, fd68; +} +{ +neg.f16 rs47, rs46; +} +mov.b32 r733, {rs47, rs47}; +{ +add.f16x2 r653, %54, %66; +} +{ +add.f16x2 r656, %42, r653; +} +{ +add.f16x2 r659, %55, %67; +} +{ +add.f16x2 r662, %43, r659; +} +{ +add.f16x2 r665, %54, %66; +} +{ +mul.f16x2 r668, r665, r724; +} +{ +add.f16x2 r671, %42, r668; +} +{ +sub.f16x2 r674, %55, %67; +} +{ +mul.f16x2 r677, r674, r733; +} +{ +add.f16x2 r680, r671, r677; +} +{ +add.f16x2 r683, %54, %66; +} +{ +mul.f16x2 r686, r683, r724; +} +{ +add.f16x2 r689, %42, r686; +} +{ +sub.f16x2 r692, %55, %67; +} +{ +mul.f16x2 r695, r692, r733; +} +{ +sub.f16x2 r698, r689, r695; +} +{ +add.f16x2 r701, %55, %67; +} +{ +mul.f16x2 r704, r701, r724; +} +{ +add.f16x2 r707, %43, r704; +} +{ +sub.f16x2 r710, %54, %66; +} +{ +mul.f16x2 r713, r710, r733; +} +{ +sub.f16x2 r716, r707, r713; +} +{ +add.f16x2 r719, %55, %67; +} +{ +mul.f16x2 r722, r719, r724; +} +{ +add.f16x2 r725, %43, r722; +} +{ +sub.f16x2 r728, %54, %66; +} +{ +mul.f16x2 r731, r728, r733; +} +{ +add.f16x2 r734, r725, r731; +} +{ +cvt.rn.f16.f64 rs49, fd67; +} +mov.b32 r808, {rs49, rs49}; +{ +cvt.rn.f16.f64 rs50, fd68; +} +{ +neg.f16 rs51, rs50; +} +mov.b32 r817, {rs51, rs51}; +{ +add.f16x2 r737, %58, %70; +} +{ +add.f16x2 r740, %46, r737; +} +{ +add.f16x2 r743, %59, %71; +} +{ +add.f16x2 r746, %47, r743; +} +{ +add.f16x2 r749, %58, %70; +} +{ +mul.f16x2 r752, r749, r808; +} +{ +add.f16x2 r755, %46, r752; +} +{ +sub.f16x2 r758, %59, %71; +} +{ +mul.f16x2 r761, r758, r817; +} +{ +add.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %58, %70; +} +{ +mul.f16x2 r770, r767, r808; +} +{ +add.f16x2 r773, %46, r770; +} +{ +sub.f16x2 r776, %59, %71; +} +{ +mul.f16x2 r779, 
r776, r817; +} +{ +sub.f16x2 r782, r773, r779; +} +{ +add.f16x2 r785, %59, %71; +} +{ +mul.f16x2 r788, r785, r808; +} +{ +add.f16x2 r791, %47, r788; +} +{ +sub.f16x2 r794, %58, %70; +} +{ +mul.f16x2 r797, r794, r817; +} +{ +sub.f16x2 r800, r791, r797; +} +{ +add.f16x2 r803, %59, %71; +} +{ +mul.f16x2 r806, r803, r808; +} +{ +add.f16x2 r809, %47, r806; +} +{ +sub.f16x2 r812, %58, %70; +} +{ +mul.f16x2 r815, r812, r817; +} +{ +add.f16x2 r818, r809, r815; +} +{ +cvt.rn.f16.f64 rs53, fd59; +} +{ +cvt.rn.f16.f64 rs54, fd70; +} +{ +cvt.rn.f16.f64 rs55, fd63; +} +{ +cvt.rn.f16.f64 rs56, fd66; +} +{ +cvt.rn.f16.f64 rs59, fd71; +} +{ +cvt.rn.f16.f64 rs60, fd72; +} +mov.b32 r835, {rs53, rs53}; +{ +mul.f16x2 r821, r680, r835; +} +mov.b32 r832, {rs54, rs54}; +{ +mul.f16x2 r824, r716, r832; +} +{ +sub.f16x2 r827, r821, r824; +} +{ +mul.f16x2 r830, r680, r832; +} +{ +fma.rn.f16x2 r833, r716, r835, r830; +} +mov.b32 r867, {rs55, rs55}; +{ +mul.f16x2 r837, r764, r867; +} +mov.b32 r864, {rs56, rs56}; +{ +mul.f16x2 r840, r800, r864; +} +{ +sub.f16x2 r843, r837, r840; +} +{ +mul.f16x2 r846, r764, r864; +} +{ +fma.rn.f16x2 r849, r800, r867, r846; +} +{ +mul.f16x2 r853, r698, r867; +} +{ +mul.f16x2 r856, r734, r864; +} +{ +sub.f16x2 r859, r853, r856; +} +{ +mul.f16x2 r862, r698, r864; +} +{ +fma.rn.f16x2 r865, r734, r867, r862; +} +mov.b32 r883, {rs59, rs59}; +{ +mul.f16x2 r869, r782, r883; +} +mov.b32 r880, {rs60, rs60}; +{ +mul.f16x2 r872, r818, r880; +} +{ +sub.f16x2 r875, r869, r872; +} +{ +mul.f16x2 r878, r782, r880; +} +{ +fma.rn.f16x2 r881, r818, r883, r878; +} +{ +cvt.rn.f16.f64 rs69, fd67; +} +mov.b32 r956, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs70, fd68; +} +{ +neg.f16 rs71, rs70; +} +mov.b32 r965, {rs71, rs71}; +{ +add.f16x2 r885, r656, r740; +} +{ +add.f16x2 r888, r572, r885; +} +{ +add.f16x2 r891, r662, r746; +} +{ +add.f16x2 r894, r578, r891; +} +{ +add.f16x2 r897, r656, r740; +} +{ +mul.f16x2 r900, r897, r956; +} +{ +add.f16x2 r903, r572, r900; +} +{ +sub.f16x2 r906, r662, 
r746; +} +{ +mul.f16x2 r909, r906, r965; +} +{ +add.f16x2 r912, r903, r909; +} +{ +add.f16x2 r915, r656, r740; +} +{ +mul.f16x2 r918, r915, r956; +} +{ +add.f16x2 r921, r572, r918; +} +{ +sub.f16x2 r924, r662, r746; +} +{ +mul.f16x2 r927, r924, r965; +} +{ +sub.f16x2 r930, r921, r927; +} +{ +add.f16x2 r933, r662, r746; +} +{ +mul.f16x2 r936, r933, r956; +} +{ +add.f16x2 r939, r578, r936; +} +{ +sub.f16x2 r942, r656, r740; +} +{ +mul.f16x2 r945, r942, r965; +} +{ +sub.f16x2 r948, r939, r945; +} +{ +add.f16x2 r951, r662, r746; +} +{ +mul.f16x2 r954, r951, r956; +} +{ +add.f16x2 r957, r578, r954; +} +{ +sub.f16x2 r960, r656, r740; +} +{ +mul.f16x2 r963, r960, r965; +} +{ +add.f16x2 r966, r957, r963; +} +{ +cvt.rn.f16.f64 rs73, fd67; +} +mov.b32 r1040, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd68; +} +{ +neg.f16 rs75, rs74; +} +mov.b32 r1049, {rs75, rs75}; +{ +add.f16x2 r969, r827, r843; +} +{ +add.f16x2 r972, r596, r969; +} +{ +add.f16x2 r975, r833, r849; +} +{ +add.f16x2 r978, r632, r975; +} +{ +add.f16x2 r981, r827, r843; +} +{ +mul.f16x2 r984, r981, r1040; +} +{ +add.f16x2 r987, r596, r984; +} +{ +sub.f16x2 r990, r833, r849; +} +{ +mul.f16x2 r993, r990, r1049; +} +{ +add.f16x2 r996, r987, r993; +} +{ +add.f16x2 r999, r827, r843; +} +{ +mul.f16x2 r1002, r999, r1040; +} +{ +add.f16x2 r1005, r596, r1002; +} +{ +sub.f16x2 r1008, r833, r849; +} +{ +mul.f16x2 r1011, r1008, r1049; +} +{ +sub.f16x2 r1014, r1005, r1011; +} +{ +add.f16x2 r1017, r833, r849; +} +{ +mul.f16x2 r1020, r1017, r1040; +} +{ +add.f16x2 r1023, r632, r1020; +} +{ +sub.f16x2 r1026, r827, r843; +} +{ +mul.f16x2 r1029, r1026, r1049; +} +{ +sub.f16x2 r1032, r1023, r1029; +} +{ +add.f16x2 r1035, r833, r849; +} +{ +mul.f16x2 r1038, r1035, r1040; +} +{ +add.f16x2 r1041, r632, r1038; +} +{ +sub.f16x2 r1044, r827, r843; +} +{ +mul.f16x2 r1047, r1044, r1049; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +cvt.rn.f16.f64 rs77, fd67; +} +mov.b32 r1124, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs78, fd68; +} +{ +neg.f16 rs79, 
rs78; +} +mov.b32 r1133, {rs79, rs79}; +{ +add.f16x2 r1053, r859, r875; +} +{ +add.f16x2 r1056, r614, r1053; +} +{ +add.f16x2 r1059, r865, r881; +} +{ +add.f16x2 r1062, r650, r1059; +} +{ +add.f16x2 r1065, r859, r875; +} +{ +mul.f16x2 r1068, r1065, r1124; +} +{ +add.f16x2 r1071, r614, r1068; +} +{ +sub.f16x2 r1074, r865, r881; +} +{ +mul.f16x2 r1077, r1074, r1133; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r859, r875; +} +{ +mul.f16x2 r1086, r1083, r1124; +} +{ +add.f16x2 r1089, r614, r1086; +} +{ +sub.f16x2 r1092, r865, r881; +} +{ +mul.f16x2 r1095, r1092, r1133; +} +{ +sub.f16x2 r1098, r1089, r1095; +} +{ +add.f16x2 r1101, r865, r881; +} +{ +mul.f16x2 r1104, r1101, r1124; +} +{ +add.f16x2 r1107, r650, r1104; +} +{ +sub.f16x2 r1110, r859, r875; +} +{ +mul.f16x2 r1113, r1110, r1133; +} +{ +sub.f16x2 r1116, r1107, r1113; +} +{ +add.f16x2 r1119, r865, r881; +} +{ +mul.f16x2 r1122, r1119, r1124; +} +{ +add.f16x2 r1125, r650, r1122; +} +{ +sub.f16x2 r1128, r859, r875; +} +{ +mul.f16x2 r1131, r1128, r1133; +} +{ +add.f16x2 r1134, r1125, r1131; +} +mov.f64 fd57, 0d3FEE11F642522D1C; +{ +cvt.rn.f16.f64 rs81, fd57; +} +{ +cvt.rn.f16.f64 rs82, fd72; +} +{ +cvt.rn.f16.f64 rs83, fd59; +} +{ +cvt.rn.f16.f64 rs84, fd70; +} +mov.f64 fd61, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs85, fd61; +} +{ +cvt.rn.f16.f64 rs86, fd68; +} +{ +cvt.rn.f16.f64 rs87, fd63; +} +{ +cvt.rn.f16.f64 rs88, fd66; +} +mov.f64 fd65, 0dBFC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs89, fd65; +} +{ +cvt.rn.f16.f64 rs90, fd66; +} +{ +cvt.rn.f16.f64 rs91, fd67; +} +{ +cvt.rn.f16.f64 rs92, fd68; +} +mov.f64 fd69, 0dBFE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs93, fd69; +} +{ +cvt.rn.f16.f64 rs94, fd70; +} +{ +cvt.rn.f16.f64 rs95, fd71; +} +{ +cvt.rn.f16.f64 rs96, fd72; +} +mov.b32 r1151, {rs81, rs81}; +{ +mul.f16x2 r1137, r972, r1151; +} +mov.b32 r1148, {rs82, rs82}; +{ +mul.f16x2 r1140, r978, r1148; +} +{ +sub.f16x2 r1143, r1137, r1140; +} +{ +mul.f16x2 r1146, r972, r1148; +} +{ +fma.rn.f16x2 r1149, r978, 
r1151, r1146; +} +mov.b32 r1167, {rs83, rs83}; +{ +mul.f16x2 r1153, r1056, r1167; +} +mov.b32 r1164, {rs84, rs84}; +{ +mul.f16x2 r1156, r1062, r1164; +} +{ +sub.f16x2 r1159, r1153, r1156; +} +{ +mul.f16x2 r1162, r1056, r1164; +} +{ +fma.rn.f16x2 r1165, r1062, r1167, r1162; +} +mov.b32 r1183, {rs85, rs85}; +{ +mul.f16x2 r1169, r912, r1183; +} +mov.b32 r1180, {rs86, rs86}; +{ +mul.f16x2 r1172, r948, r1180; +} +{ +sub.f16x2 r1175, r1169, r1172; +} +{ +mul.f16x2 r1178, r912, r1180; +} +{ +fma.rn.f16x2 r1181, r948, r1183, r1178; +} +mov.b32 r1199, {rs87, rs87}; +{ +mul.f16x2 r1185, r996, r1199; +} +mov.b32 r1196, {rs88, rs88}; +{ +mul.f16x2 r1188, r1032, r1196; +} +{ +sub.f16x2 r1191, r1185, r1188; +} +{ +mul.f16x2 r1194, r996, r1196; +} +{ +fma.rn.f16x2 r1197, r1032, r1199, r1194; +} +mov.b32 r1215, {rs89, rs89}; +{ +mul.f16x2 r1201, r1080, r1215; +} +mov.b32 r1212, {rs90, rs90}; +{ +mul.f16x2 r1204, r1116, r1212; +} +{ +sub.f16x2 r1207, r1201, r1204; +} +{ +mul.f16x2 r1210, r1080, r1212; +} +{ +fma.rn.f16x2 r1213, r1116, r1215, r1210; +} +mov.b32 r1231, {rs91, rs91}; +{ +mul.f16x2 r1217, r930, r1231; +} +mov.b32 r1228, {rs92, rs92}; +{ +mul.f16x2 r1220, r966, r1228; +} +{ +sub.f16x2 r1223, r1217, r1220; +} +{ +mul.f16x2 r1226, r930, r1228; +} +{ +fma.rn.f16x2 r1229, r966, r1231, r1226; +} +mov.b32 r1247, {rs93, rs93}; +{ +mul.f16x2 r1233, r1014, r1247; +} +mov.b32 r1244, {rs94, rs94}; +{ +mul.f16x2 r1236, r1050, r1244; +} +{ +sub.f16x2 r1239, r1233, r1236; +} +{ +mul.f16x2 r1242, r1014, r1244; +} +{ +fma.rn.f16x2 r1245, r1050, r1247, r1242; +} +mov.b32 r1263, {rs95, rs95}; +{ +mul.f16x2 r1249, r1098, r1263; +} +mov.b32 r1260, {rs96, rs96}; +{ +mul.f16x2 r1252, r1134, r1260; +} +{ +sub.f16x2 r1255, r1249, r1252; +} +{ +mul.f16x2 r1258, r1098, r1260; +} +{ +fma.rn.f16x2 r1261, r1134, r1263, r1258; +} +{ +add.f16x2 %0, r320, r888; +} +{ +add.f16x2 %1, r326, r894; +} +{ +sub.f16x2 %18, r320, r888; +} +{ +sub.f16x2 %19, r326, r894; +} +{ +add.f16x2 %2, r404, r1143; +} +{ 
+add.f16x2 %3, r410, r1149; +} +{ +sub.f16x2 %20, r404, r1143; +} +{ +sub.f16x2 %21, r410, r1149; +} +{ +add.f16x2 %4, r488, r1159; +} +{ +add.f16x2 %5, r494, r1165; +} +{ +sub.f16x2 %22, r488, r1159; +} +{ +sub.f16x2 %23, r494, r1165; +} +{ +add.f16x2 %6, r344, r1175; +} +{ +add.f16x2 %7, r380, r1181; +} +{ +sub.f16x2 %24, r344, r1175; +} +{ +sub.f16x2 %25, r380, r1181; +} +{ +add.f16x2 %8, r428, r1191; +} +{ +add.f16x2 %9, r464, r1197; +} +{ +sub.f16x2 %26, r428, r1191; +} +{ +sub.f16x2 %27, r464, r1197; +} +{ +add.f16x2 %10, r512, r1207; +} +{ +add.f16x2 %11, r548, r1213; +} +{ +sub.f16x2 %28, r512, r1207; +} +{ +sub.f16x2 %29, r548, r1213; +} +{ +add.f16x2 %12, r362, r1223; +} +{ +add.f16x2 %13, r398, r1229; +} +{ +sub.f16x2 %30, r362, r1223; +} +{ +sub.f16x2 %31, r398, r1229; +} +{ +add.f16x2 %14, r446, r1239; +} +{ +add.f16x2 %15, r482, r1245; +} +{ +sub.f16x2 %32, r446, r1239; +} +{ +sub.f16x2 %33, r482, r1245; +} +{ +add.f16x2 %16, r530, r1255; +} +{ +add.f16x2 %17, r566, r1261; +} +{ +sub.f16x2 %34, r530, r1255; +} +{ +sub.f16x2 %35, r566, r1261; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), 
"=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..686539f3fc91d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp16_inv.hpp.inc @@ -0,0 +1,1595 @@ +#ifndef CUFFTDX_FFT_18_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_18_FP16_INV_PTX_HPP + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<952, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<91>; +.reg .b32 r<1373>; +.reg .f64 fd<91>; +.reg .b64 rd<2>; +mov.f64 fd67, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd67; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd56, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd56; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %48, %60; +} +{ +add.f16x2 r4, %36, r1; +} +{ +add.f16x2 r7, %49, %61; +} +{ +add.f16x2 r10, %37, r7; +} +{ +add.f16x2 r13, %48, %60; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %36, r16; +} +{ +sub.f16x2 r22, %49, %61; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %48, %60; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %36, r34; +} +{ +sub.f16x2 r40, %49, %61; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %49, %61; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %37, r52; +} +{ +sub.f16x2 r58, %48, %60; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %49, %61; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %37, r70; +} +{ +sub.f16x2 r76, %48, %60; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd67; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd56; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %52, %64; +} +{ +add.f16x2 r88, %40, r85; +} +{ +add.f16x2 r91, %53, %65; +} +{ +add.f16x2 r94, %41, r91; +} +{ +add.f16x2 r97, %52, %64; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %40, r100; +} +{ +sub.f16x2 r106, %53, %65; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %52, %64; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %40, r118; +} +{ +sub.f16x2 r124, %53, %65; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %53, %65; +} +{ 
+mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %41, r136; +} +{ +sub.f16x2 r142, %52, %64; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %53, %65; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %41, r154; +} +{ +sub.f16x2 r160, %52, %64; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs5, fd67; +} +mov.b32 r240, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd56; +} +mov.b32 r249, {rs6, rs6}; +{ +add.f16x2 r169, %56, %68; +} +{ +add.f16x2 r172, %44, r169; +} +{ +add.f16x2 r175, %57, %69; +} +{ +add.f16x2 r178, %45, r175; +} +{ +add.f16x2 r181, %56, %68; +} +{ +mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %44, r184; +} +{ +sub.f16x2 r190, %57, %69; +} +{ +mul.f16x2 r193, r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %56, %68; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %44, r202; +} +{ +sub.f16x2 r208, %57, %69; +} +{ +mul.f16x2 r211, r208, r249; +} +{ +sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %57, %69; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %45, r220; +} +{ +sub.f16x2 r226, %56, %68; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %57, %69; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %45, r238; +} +{ +sub.f16x2 r244, %56, %68; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd59, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs7, fd59; +} +mov.f64 fd70, 0d3FE491B7523C161D; +{ +cvt.rn.f16.f64 rs8, fd70; +} +mov.f64 fd63, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs9, fd63; +} +mov.f64 fd66, 0d3FEF838B8C811C17; +{ +cvt.rn.f16.f64 rs10, fd66; +} +mov.f64 fd68, 0d3FEBB67AE8584CAA; +mov.f64 fd71, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs13, fd71; +} +mov.f64 fd72, 0d3FD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs14, fd72; +} +mov.b32 r267, {rs7, rs7}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs8, rs8}; +{ +mul.f16x2 
r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs9, rs9}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs10, rs10}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ +mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs13, rs13}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs14, rs14}; +{ +mul.f16x2 r304, r250, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, r315, r310; +} +{ +cvt.rn.f16.f64 rs23, fd67; +} +mov.b32 r388, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs24, fd56; +} +mov.b32 r397, {rs24, rs24}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 r320, r4, r317; +} +{ +add.f16x2 r323, r94, r178; +} +{ +add.f16x2 r326, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 r344, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 r398, r389, r395; +} +{ +cvt.rn.f16.f64 rs25, fd67; +} +mov.b32 r472, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd56; +} +mov.b32 r481, 
{rs26, rs26}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 r404, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 r410, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 r428, r419, r425; +} +{ +add.f16x2 r431, r259, r275; +} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ +add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 r461, r458, r481; +} +{ +sub.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 r482, r473, r479; +} +{ +cvt.rn.f16.f64 rs27, fd67; +} +mov.b32 r556, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs28, fd56; +} +mov.b32 r565, {rs28, rs28}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 r488, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 r494, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 r512, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 r530, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 r548, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 
r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 r566, r557, r563; +} +{ +cvt.rn.f16.f64 rs29, fd67; +} +mov.b32 r640, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd56; +} +mov.b32 r649, {rs30, rs30}; +{ +add.f16x2 r569, %50, %62; +} +{ +add.f16x2 r572, %38, r569; +} +{ +add.f16x2 r575, %51, %63; +} +{ +add.f16x2 r578, %39, r575; +} +{ +add.f16x2 r581, %50, %62; +} +{ +mul.f16x2 r584, r581, r640; +} +{ +add.f16x2 r587, %38, r584; +} +{ +sub.f16x2 r590, %51, %63; +} +{ +mul.f16x2 r593, r590, r649; +} +{ +add.f16x2 r596, r587, r593; +} +{ +add.f16x2 r599, %50, %62; +} +{ +mul.f16x2 r602, r599, r640; +} +{ +add.f16x2 r605, %38, r602; +} +{ +sub.f16x2 r608, %51, %63; +} +{ +mul.f16x2 r611, r608, r649; +} +{ +sub.f16x2 r614, r605, r611; +} +{ +add.f16x2 r617, %51, %63; +} +{ +mul.f16x2 r620, r617, r640; +} +{ +add.f16x2 r623, %39, r620; +} +{ +sub.f16x2 r626, %50, %62; +} +{ +mul.f16x2 r629, r626, r649; +} +{ +sub.f16x2 r632, r623, r629; +} +{ +add.f16x2 r635, %51, %63; +} +{ +mul.f16x2 r638, r635, r640; +} +{ +add.f16x2 r641, %39, r638; +} +{ +sub.f16x2 r644, %50, %62; +} +{ +mul.f16x2 r647, r644, r649; +} +{ +add.f16x2 r650, r641, r647; +} +{ +cvt.rn.f16.f64 rs31, fd67; +} +mov.b32 r724, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd56; +} +mov.b32 r733, {rs32, rs32}; +{ +add.f16x2 r653, %54, %66; +} +{ +add.f16x2 r656, %42, r653; +} +{ +add.f16x2 r659, %55, %67; +} +{ +add.f16x2 r662, %43, r659; +} +{ +add.f16x2 r665, %54, %66; +} +{ +mul.f16x2 r668, r665, r724; +} +{ +add.f16x2 r671, %42, r668; +} +{ +sub.f16x2 r674, %55, %67; +} +{ +mul.f16x2 r677, r674, r733; +} +{ +add.f16x2 r680, r671, r677; +} +{ +add.f16x2 r683, %54, %66; +} +{ +mul.f16x2 r686, r683, r724; +} +{ +add.f16x2 r689, %42, r686; +} +{ +sub.f16x2 r692, %55, %67; +} +{ +mul.f16x2 r695, r692, r733; +} +{ +sub.f16x2 r698, r689, r695; +} +{ +add.f16x2 r701, %55, %67; +} +{ +mul.f16x2 r704, r701, r724; +} +{ +add.f16x2 r707, %43, r704; +} +{ +sub.f16x2 r710, %54, %66; +} +{ +mul.f16x2 r713, 
r710, r733; +} +{ +sub.f16x2 r716, r707, r713; +} +{ +add.f16x2 r719, %55, %67; +} +{ +mul.f16x2 r722, r719, r724; +} +{ +add.f16x2 r725, %43, r722; +} +{ +sub.f16x2 r728, %54, %66; +} +{ +mul.f16x2 r731, r728, r733; +} +{ +add.f16x2 r734, r725, r731; +} +{ +cvt.rn.f16.f64 rs33, fd67; +} +mov.b32 r808, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd56; +} +mov.b32 r817, {rs34, rs34}; +{ +add.f16x2 r737, %58, %70; +} +{ +add.f16x2 r740, %46, r737; +} +{ +add.f16x2 r743, %59, %71; +} +{ +add.f16x2 r746, %47, r743; +} +{ +add.f16x2 r749, %58, %70; +} +{ +mul.f16x2 r752, r749, r808; +} +{ +add.f16x2 r755, %46, r752; +} +{ +sub.f16x2 r758, %59, %71; +} +{ +mul.f16x2 r761, r758, r817; +} +{ +add.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %58, %70; +} +{ +mul.f16x2 r770, r767, r808; +} +{ +add.f16x2 r773, %46, r770; +} +{ +sub.f16x2 r776, %59, %71; +} +{ +mul.f16x2 r779, r776, r817; +} +{ +sub.f16x2 r782, r773, r779; +} +{ +add.f16x2 r785, %59, %71; +} +{ +mul.f16x2 r788, r785, r808; +} +{ +add.f16x2 r791, %47, r788; +} +{ +sub.f16x2 r794, %58, %70; +} +{ +mul.f16x2 r797, r794, r817; +} +{ +sub.f16x2 r800, r791, r797; +} +{ +add.f16x2 r803, %59, %71; +} +{ +mul.f16x2 r806, r803, r808; +} +{ +add.f16x2 r809, %47, r806; +} +{ +sub.f16x2 r812, %58, %70; +} +{ +mul.f16x2 r815, r812, r817; +} +{ +add.f16x2 r818, r809, r815; +} +{ +cvt.rn.f16.f64 rs35, fd59; +} +{ +cvt.rn.f16.f64 rs36, fd70; +} +{ +cvt.rn.f16.f64 rs37, fd63; +} +{ +cvt.rn.f16.f64 rs38, fd66; +} +{ +cvt.rn.f16.f64 rs41, fd71; +} +{ +cvt.rn.f16.f64 rs42, fd72; +} +mov.b32 r835, {rs35, rs35}; +{ +mul.f16x2 r821, r680, r835; +} +mov.b32 r832, {rs36, rs36}; +{ +mul.f16x2 r824, r716, r832; +} +{ +sub.f16x2 r827, r821, r824; +} +{ +mul.f16x2 r830, r680, r832; +} +{ +fma.rn.f16x2 r833, r716, r835, r830; +} +mov.b32 r867, {rs37, rs37}; +{ +mul.f16x2 r837, r764, r867; +} +mov.b32 r864, {rs38, rs38}; +{ +mul.f16x2 r840, r800, r864; +} +{ +sub.f16x2 r843, r837, r840; +} +{ +mul.f16x2 r846, r764, r864; +} +{ +fma.rn.f16x2 
r849, r800, r867, r846; +} +{ +mul.f16x2 r853, r698, r867; +} +{ +mul.f16x2 r856, r734, r864; +} +{ +sub.f16x2 r859, r853, r856; +} +{ +mul.f16x2 r862, r698, r864; +} +{ +fma.rn.f16x2 r865, r734, r867, r862; +} +mov.b32 r883, {rs41, rs41}; +{ +mul.f16x2 r869, r782, r883; +} +mov.b32 r880, {rs42, rs42}; +{ +mul.f16x2 r872, r818, r880; +} +{ +sub.f16x2 r875, r869, r872; +} +{ +mul.f16x2 r878, r782, r880; +} +{ +fma.rn.f16x2 r881, r818, r883, r878; +} +{ +cvt.rn.f16.f64 rs51, fd67; +} +mov.b32 r956, {rs51, rs51}; +{ +cvt.rn.f16.f64 rs52, fd56; +} +mov.b32 r965, {rs52, rs52}; +{ +add.f16x2 r885, r656, r740; +} +{ +add.f16x2 r888, r572, r885; +} +{ +add.f16x2 r891, r662, r746; +} +{ +add.f16x2 r894, r578, r891; +} +{ +add.f16x2 r897, r656, r740; +} +{ +mul.f16x2 r900, r897, r956; +} +{ +add.f16x2 r903, r572, r900; +} +{ +sub.f16x2 r906, r662, r746; +} +{ +mul.f16x2 r909, r906, r965; +} +{ +add.f16x2 r912, r903, r909; +} +{ +add.f16x2 r915, r656, r740; +} +{ +mul.f16x2 r918, r915, r956; +} +{ +add.f16x2 r921, r572, r918; +} +{ +sub.f16x2 r924, r662, r746; +} +{ +mul.f16x2 r927, r924, r965; +} +{ +sub.f16x2 r930, r921, r927; +} +{ +add.f16x2 r933, r662, r746; +} +{ +mul.f16x2 r936, r933, r956; +} +{ +add.f16x2 r939, r578, r936; +} +{ +sub.f16x2 r942, r656, r740; +} +{ +mul.f16x2 r945, r942, r965; +} +{ +sub.f16x2 r948, r939, r945; +} +{ +add.f16x2 r951, r662, r746; +} +{ +mul.f16x2 r954, r951, r956; +} +{ +add.f16x2 r957, r578, r954; +} +{ +sub.f16x2 r960, r656, r740; +} +{ +mul.f16x2 r963, r960, r965; +} +{ +add.f16x2 r966, r957, r963; +} +{ +cvt.rn.f16.f64 rs53, fd67; +} +mov.b32 r1040, {rs53, rs53}; +{ +cvt.rn.f16.f64 rs54, fd56; +} +mov.b32 r1049, {rs54, rs54}; +{ +add.f16x2 r969, r827, r843; +} +{ +add.f16x2 r972, r596, r969; +} +{ +add.f16x2 r975, r833, r849; +} +{ +add.f16x2 r978, r632, r975; +} +{ +add.f16x2 r981, r827, r843; +} +{ +mul.f16x2 r984, r981, r1040; +} +{ +add.f16x2 r987, r596, r984; +} +{ +sub.f16x2 r990, r833, r849; +} +{ +mul.f16x2 r993, r990, 
r1049; +} +{ +add.f16x2 r996, r987, r993; +} +{ +add.f16x2 r999, r827, r843; +} +{ +mul.f16x2 r1002, r999, r1040; +} +{ +add.f16x2 r1005, r596, r1002; +} +{ +sub.f16x2 r1008, r833, r849; +} +{ +mul.f16x2 r1011, r1008, r1049; +} +{ +sub.f16x2 r1014, r1005, r1011; +} +{ +add.f16x2 r1017, r833, r849; +} +{ +mul.f16x2 r1020, r1017, r1040; +} +{ +add.f16x2 r1023, r632, r1020; +} +{ +sub.f16x2 r1026, r827, r843; +} +{ +mul.f16x2 r1029, r1026, r1049; +} +{ +sub.f16x2 r1032, r1023, r1029; +} +{ +add.f16x2 r1035, r833, r849; +} +{ +mul.f16x2 r1038, r1035, r1040; +} +{ +add.f16x2 r1041, r632, r1038; +} +{ +sub.f16x2 r1044, r827, r843; +} +{ +mul.f16x2 r1047, r1044, r1049; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +cvt.rn.f16.f64 rs55, fd67; +} +mov.b32 r1124, {rs55, rs55}; +{ +cvt.rn.f16.f64 rs56, fd56; +} +mov.b32 r1133, {rs56, rs56}; +{ +add.f16x2 r1053, r859, r875; +} +{ +add.f16x2 r1056, r614, r1053; +} +{ +add.f16x2 r1059, r865, r881; +} +{ +add.f16x2 r1062, r650, r1059; +} +{ +add.f16x2 r1065, r859, r875; +} +{ +mul.f16x2 r1068, r1065, r1124; +} +{ +add.f16x2 r1071, r614, r1068; +} +{ +sub.f16x2 r1074, r865, r881; +} +{ +mul.f16x2 r1077, r1074, r1133; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r859, r875; +} +{ +mul.f16x2 r1086, r1083, r1124; +} +{ +add.f16x2 r1089, r614, r1086; +} +{ +sub.f16x2 r1092, r865, r881; +} +{ +mul.f16x2 r1095, r1092, r1133; +} +{ +sub.f16x2 r1098, r1089, r1095; +} +{ +add.f16x2 r1101, r865, r881; +} +{ +mul.f16x2 r1104, r1101, r1124; +} +{ +add.f16x2 r1107, r650, r1104; +} +{ +sub.f16x2 r1110, r859, r875; +} +{ +mul.f16x2 r1113, r1110, r1133; +} +{ +sub.f16x2 r1116, r1107, r1113; +} +{ +add.f16x2 r1119, r865, r881; +} +{ +mul.f16x2 r1122, r1119, r1124; +} +{ +add.f16x2 r1125, r650, r1122; +} +{ +sub.f16x2 r1128, r859, r875; +} +{ +mul.f16x2 r1131, r1128, r1133; +} +{ +add.f16x2 r1134, r1125, r1131; +} +mov.f64 fd57, 0d3FEE11F642522D1C; +{ +cvt.rn.f16.f64 rs57, fd57; +} +{ +cvt.rn.f16.f64 rs58, fd72; +} +{ +cvt.rn.f16.f64 
rs59, fd59; +} +{ +cvt.rn.f16.f64 rs60, fd70; +} +mov.f64 fd61, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs61, fd61; +} +{ +cvt.rn.f16.f64 rs62, fd68; +} +{ +cvt.rn.f16.f64 rs63, fd63; +} +{ +cvt.rn.f16.f64 rs64, fd66; +} +mov.f64 fd65, 0dBFC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs65, fd65; +} +{ +cvt.rn.f16.f64 rs66, fd66; +} +{ +cvt.rn.f16.f64 rs67, fd67; +} +{ +cvt.rn.f16.f64 rs68, fd68; +} +mov.f64 fd69, 0dBFE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs69, fd69; +} +{ +cvt.rn.f16.f64 rs70, fd70; +} +{ +cvt.rn.f16.f64 rs71, fd71; +} +{ +cvt.rn.f16.f64 rs72, fd72; +} +mov.b32 r1151, {rs57, rs57}; +{ +mul.f16x2 r1137, r972, r1151; +} +mov.b32 r1148, {rs58, rs58}; +{ +mul.f16x2 r1140, r978, r1148; +} +{ +sub.f16x2 r1143, r1137, r1140; +} +{ +mul.f16x2 r1146, r972, r1148; +} +{ +fma.rn.f16x2 r1149, r978, r1151, r1146; +} +mov.b32 r1167, {rs59, rs59}; +{ +mul.f16x2 r1153, r1056, r1167; +} +mov.b32 r1164, {rs60, rs60}; +{ +mul.f16x2 r1156, r1062, r1164; +} +{ +sub.f16x2 r1159, r1153, r1156; +} +{ +mul.f16x2 r1162, r1056, r1164; +} +{ +fma.rn.f16x2 r1165, r1062, r1167, r1162; +} +mov.b32 r1183, {rs61, rs61}; +{ +mul.f16x2 r1169, r912, r1183; +} +mov.b32 r1180, {rs62, rs62}; +{ +mul.f16x2 r1172, r948, r1180; +} +{ +sub.f16x2 r1175, r1169, r1172; +} +{ +mul.f16x2 r1178, r912, r1180; +} +{ +fma.rn.f16x2 r1181, r948, r1183, r1178; +} +mov.b32 r1199, {rs63, rs63}; +{ +mul.f16x2 r1185, r996, r1199; +} +mov.b32 r1196, {rs64, rs64}; +{ +mul.f16x2 r1188, r1032, r1196; +} +{ +sub.f16x2 r1191, r1185, r1188; +} +{ +mul.f16x2 r1194, r996, r1196; +} +{ +fma.rn.f16x2 r1197, r1032, r1199, r1194; +} +mov.b32 r1215, {rs65, rs65}; +{ +mul.f16x2 r1201, r1080, r1215; +} +mov.b32 r1212, {rs66, rs66}; +{ +mul.f16x2 r1204, r1116, r1212; +} +{ +sub.f16x2 r1207, r1201, r1204; +} +{ +mul.f16x2 r1210, r1080, r1212; +} +{ +fma.rn.f16x2 r1213, r1116, r1215, r1210; +} +mov.b32 r1231, {rs67, rs67}; +{ +mul.f16x2 r1217, r930, r1231; +} +mov.b32 r1228, {rs68, rs68}; +{ +mul.f16x2 r1220, r966, r1228; +} +{ +sub.f16x2 
r1223, r1217, r1220; +} +{ +mul.f16x2 r1226, r930, r1228; +} +{ +fma.rn.f16x2 r1229, r966, r1231, r1226; +} +mov.b32 r1247, {rs69, rs69}; +{ +mul.f16x2 r1233, r1014, r1247; +} +mov.b32 r1244, {rs70, rs70}; +{ +mul.f16x2 r1236, r1050, r1244; +} +{ +sub.f16x2 r1239, r1233, r1236; +} +{ +mul.f16x2 r1242, r1014, r1244; +} +{ +fma.rn.f16x2 r1245, r1050, r1247, r1242; +} +mov.b32 r1263, {rs71, rs71}; +{ +mul.f16x2 r1249, r1098, r1263; +} +mov.b32 r1260, {rs72, rs72}; +{ +mul.f16x2 r1252, r1134, r1260; +} +{ +sub.f16x2 r1255, r1249, r1252; +} +{ +mul.f16x2 r1258, r1098, r1260; +} +{ +fma.rn.f16x2 r1261, r1134, r1263, r1258; +} +{ +add.f16x2 %0, r320, r888; +} +{ +add.f16x2 %1, r326, r894; +} +{ +sub.f16x2 %18, r320, r888; +} +{ +sub.f16x2 %19, r326, r894; +} +{ +add.f16x2 %2, r404, r1143; +} +{ +add.f16x2 %3, r410, r1149; +} +{ +sub.f16x2 %20, r404, r1143; +} +{ +sub.f16x2 %21, r410, r1149; +} +{ +add.f16x2 %4, r488, r1159; +} +{ +add.f16x2 %5, r494, r1165; +} +{ +sub.f16x2 %22, r488, r1159; +} +{ +sub.f16x2 %23, r494, r1165; +} +{ +add.f16x2 %6, r344, r1175; +} +{ +add.f16x2 %7, r380, r1181; +} +{ +sub.f16x2 %24, r344, r1175; +} +{ +sub.f16x2 %25, r380, r1181; +} +{ +add.f16x2 %8, r428, r1191; +} +{ +add.f16x2 %9, r464, r1197; +} +{ +sub.f16x2 %26, r428, r1191; +} +{ +sub.f16x2 %27, r464, r1197; +} +{ +add.f16x2 %10, r512, r1207; +} +{ +add.f16x2 %11, r548, r1213; +} +{ +sub.f16x2 %28, r512, r1207; +} +{ +sub.f16x2 %29, r548, r1213; +} +{ +add.f16x2 %12, r362, r1223; +} +{ +add.f16x2 %13, r398, r1229; +} +{ +sub.f16x2 %30, r362, r1223; +} +{ +sub.f16x2 %31, r398, r1229; +} +{ +add.f16x2 %14, r446, r1239; +} +{ +add.f16x2 %15, r482, r1245; +} +{ +sub.f16x2 %32, r446, r1239; +} +{ +sub.f16x2 %33, r482, r1245; +} +{ +add.f16x2 %16, r530, r1255; +} +{ +add.f16x2 %17, r566, r1261; +} +{ +sub.f16x2 %34, r530, r1255; +} +{ +sub.f16x2 %35, r566, r1261; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), 
"r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..adb713336ecba --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_fwd.hpp.inc @@ -0,0 +1,324 @@ +#ifndef CUFFTDX_FFT_18_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_18_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<4, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<381>; +.reg .b64 rd<2>; +add.f32 f73, %52, %68; +add.f32 f74, %36, f73; +add.f32 f75, %53, %69; +add.f32 f76, %37, f75; +mul.f32 f77, f73, 0f3F000000; +sub.f32 f78, %36, f77; +sub.f32 f79, %53, %69; +mul.f32 f80, f79, 0f3F5DB3D7; +add.f32 f81, f80, f78; +sub.f32 f82, f78, f80; +mul.f32 f83, f75, 0f3F000000; +sub.f32 f84, %37, f83; +sub.f32 f85, %52, %68; +mul.f32 f86, f85, 0f3F5DB3D7; +sub.f32 f87, f84, f86; +add.f32 f88, f86, f84; +add.f32 f89, %57, %73; +add.f32 f90, %41, f89; +add.f32 f91, %59, %75; +add.f32 f92, %43, f91; +mul.f32 f93, f89, 0f3F000000; +sub.f32 f94, %41, f93; +sub.f32 f95, %59, %75; +mul.f32 f96, f95, 0f3F5DB3D7; +add.f32 f97, f96, f94; +sub.f32 f98, f94, f96; +mul.f32 f99, f91, 0f3F000000; +sub.f32 f100, %43, f99; +sub.f32 f101, %57, %73; +mul.f32 f102, f101, 0f3F5DB3D7; +sub.f32 f103, f100, f102; +add.f32 f104, f102, f100; +add.f32 f105, %62, %78; +add.f32 f106, %46, f105; +add.f32 f107, %64, %80; +add.f32 f108, %48, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, %46, f109; 
+sub.f32 f111, %64, %80; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, %48, f115; +sub.f32 f117, %62, %78; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.f32 f121, f97, 0f3F441B7D; +mul.f32 f122, f103, 0fBF248DBB; +sub.f32 f123, f121, f122; +mul.f32 f124, f103, 0f3F441B7D; +fma.rn.f32 f125, f97, 0fBF248DBB, f124; +mul.f32 f126, f113, 0f3E31D0D4; +mul.f32 f127, f119, 0fBF7C1C5C; +sub.f32 f128, f126, f127; +mul.f32 f129, f119, 0f3E31D0D4; +fma.rn.f32 f130, f113, 0fBF7C1C5C, f129; +mul.f32 f131, f98, 0f3E31D0D4; +mul.f32 f132, f104, 0fBF7C1C5C; +sub.f32 f133, f131, f132; +mul.f32 f134, f104, 0f3E31D0D4; +fma.rn.f32 f135, f98, 0fBF7C1C5C, f134; +mul.f32 f136, f114, 0fBF708FB2; +mul.f32 f137, f120, 0fBEAF1D44; +sub.f32 f138, f136, f137; +mul.f32 f139, f120, 0fBF708FB2; +fma.rn.f32 f140, f114, 0fBEAF1D44, f139; +add.f32 f141, f90, f106; +add.f32 f142, f74, f141; +add.f32 f143, f92, f108; +add.f32 f144, f76, f143; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, f74, f145; +sub.f32 f147, f92, f108; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +mul.f32 f151, f143, 0f3F000000; +sub.f32 f152, f76, f151; +sub.f32 f153, f90, f106; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +add.f32 f157, f123, f128; +add.f32 f158, f81, f157; +add.f32 f159, f125, f130; +add.f32 f160, f87, f159; +mul.f32 f161, f157, 0f3F000000; +sub.f32 f162, f81, f161; +sub.f32 f163, f125, f130; +mul.f32 f164, f163, 0f3F5DB3D7; +add.f32 f165, f164, f162; +sub.f32 f166, f162, f164; +mul.f32 f167, f159, 0f3F000000; +sub.f32 f168, f87, f167; +sub.f32 f169, f123, f128; +mul.f32 f170, f169, 0f3F5DB3D7; +sub.f32 f171, f168, f170; +add.f32 f172, f170, f168; +add.f32 f173, f133, f138; +add.f32 f174, f82, f173; +add.f32 f175, f135, f140; +add.f32 f176, f88, f175; +mul.f32 f177, f173, 0f3F000000; +sub.f32 
f178, f82, f177; +sub.f32 f179, f135, f140; +mul.f32 f180, f179, 0f3F5DB3D7; +add.f32 f181, f180, f178; +sub.f32 f182, f178, f180; +mul.f32 f183, f175, 0f3F000000; +sub.f32 f184, f88, f183; +sub.f32 f185, f133, f138; +mul.f32 f186, f185, 0f3F5DB3D7; +sub.f32 f187, f184, f186; +add.f32 f188, f186, f184; +add.f32 f189, %54, %70; +add.f32 f190, %38, f189; +add.f32 f191, %56, %72; +add.f32 f192, %40, f191; +mul.f32 f193, f189, 0f3F000000; +sub.f32 f194, %38, f193; +sub.f32 f195, %56, %72; +mul.f32 f196, f195, 0f3F5DB3D7; +add.f32 f197, f196, f194; +sub.f32 f198, f194, f196; +mul.f32 f199, f191, 0f3F000000; +sub.f32 f200, %40, f199; +sub.f32 f201, %54, %70; +mul.f32 f202, f201, 0f3F5DB3D7; +sub.f32 f203, f200, f202; +add.f32 f204, f202, f200; +add.f32 f205, %60, %76; +add.f32 f206, %44, f205; +add.f32 f207, %61, %77; +add.f32 f208, %45, f207; +mul.f32 f209, f205, 0f3F000000; +sub.f32 f210, %44, f209; +sub.f32 f211, %61, %77; +mul.f32 f212, f211, 0f3F5DB3D7; +add.f32 f213, f212, f210; +sub.f32 f214, f210, f212; +mul.f32 f215, f207, 0f3F000000; +sub.f32 f216, %45, f215; +sub.f32 f217, %60, %76; +mul.f32 f218, f217, 0f3F5DB3D7; +sub.f32 f219, f216, f218; +add.f32 f220, f218, f216; +add.f32 f221, %65, %81; +add.f32 f222, %49, f221; +add.f32 f223, %67, %82; +add.f32 f224, %51, f223; +mul.f32 f225, f221, 0f3F000000; +sub.f32 f226, %49, f225; +sub.f32 f227, %67, %82; +mul.f32 f228, f227, 0f3F5DB3D7; +add.f32 f229, f228, f226; +sub.f32 f230, f226, f228; +mul.f32 f231, f223, 0f3F000000; +sub.f32 f232, %51, f231; +sub.f32 f233, %65, %81; +mul.f32 f234, f233, 0f3F5DB3D7; +sub.f32 f235, f232, f234; +add.f32 f236, f234, f232; +mul.f32 f237, f213, 0f3F441B7D; +mul.f32 f238, f219, 0fBF248DBB; +sub.f32 f239, f237, f238; +mul.f32 f240, f219, 0f3F441B7D; +fma.rn.f32 f241, f213, 0fBF248DBB, f240; +mul.f32 f242, f229, 0f3E31D0D4; +mul.f32 f243, f235, 0fBF7C1C5C; +sub.f32 f244, f242, f243; +mul.f32 f245, f235, 0f3E31D0D4; +fma.rn.f32 f246, f229, 0fBF7C1C5C, f245; +mul.f32 f247, f214, 
0f3E31D0D4; +mul.f32 f248, f220, 0fBF7C1C5C; +sub.f32 f249, f247, f248; +mul.f32 f250, f220, 0f3E31D0D4; +fma.rn.f32 f251, f214, 0fBF7C1C5C, f250; +mul.f32 f252, f230, 0fBF708FB2; +mul.f32 f253, f236, 0fBEAF1D44; +sub.f32 f254, f252, f253; +mul.f32 f255, f236, 0fBF708FB2; +fma.rn.f32 f256, f230, 0fBEAF1D44, f255; +add.f32 f257, f206, f222; +add.f32 f258, f190, f257; +add.f32 f259, f208, f224; +add.f32 f260, f192, f259; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, f190, f261; +sub.f32 f263, f208, f224; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +mul.f32 f267, f259, 0f3F000000; +sub.f32 f268, f192, f267; +sub.f32 f269, f206, f222; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +add.f32 f273, f239, f244; +add.f32 f274, f197, f273; +add.f32 f275, f241, f246; +add.f32 f276, f203, f275; +mul.f32 f277, f273, 0f3F000000; +sub.f32 f278, f197, f277; +sub.f32 f279, f241, f246; +mul.f32 f280, f279, 0f3F5DB3D7; +add.f32 f281, f280, f278; +sub.f32 f282, f278, f280; +mul.f32 f283, f275, 0f3F000000; +sub.f32 f284, f203, f283; +sub.f32 f285, f239, f244; +mul.f32 f286, f285, 0f3F5DB3D7; +sub.f32 f287, f284, f286; +add.f32 f288, f286, f284; +add.f32 f289, f249, f254; +add.f32 f290, f198, f289; +add.f32 f291, f251, f256; +add.f32 f292, f204, f291; +mul.f32 f293, f289, 0f3F000000; +sub.f32 f294, f198, f293; +sub.f32 f295, f251, f256; +mul.f32 f296, f295, 0f3F5DB3D7; +add.f32 f297, f296, f294; +sub.f32 f298, f294, f296; +mul.f32 f299, f291, 0f3F000000; +sub.f32 f300, f204, f299; +sub.f32 f301, f249, f254; +mul.f32 f302, f301, 0f3F5DB3D7; +sub.f32 f303, f300, f302; +add.f32 f304, f302, f300; +mul.f32 f305, f274, 0f3F708FB2; +mul.f32 f306, f276, 0fBEAF1D44; +sub.f32 f307, f305, f306; +mul.f32 f308, f276, 0f3F708FB2; +fma.rn.f32 f309, f274, 0fBEAF1D44, f308; +mul.f32 f310, f290, 0f3F441B7D; +mul.f32 f311, f292, 0fBF248DBB; +sub.f32 f312, f310, f311; +mul.f32 f313, f292, 0f3F441B7D; +fma.rn.f32 f314, 
f290, 0fBF248DBB, f313; +mul.f32 f315, f265, 0f3F000000; +mul.f32 f316, f271, 0fBF5DB3D7; +sub.f32 f317, f315, f316; +mul.f32 f318, f271, 0f3F000000; +fma.rn.f32 f319, f265, 0fBF5DB3D7, f318; +mul.f32 f320, f281, 0f3E31D0D4; +mul.f32 f321, f287, 0fBF7C1C5C; +sub.f32 f322, f320, f321; +mul.f32 f323, f287, 0f3E31D0D4; +fma.rn.f32 f324, f281, 0fBF7C1C5C, f323; +mul.f32 f325, f297, 0fBE31D0D4; +mul.f32 f326, f303, 0fBF7C1C5C; +sub.f32 f327, f325, f326; +mul.f32 f328, f303, 0fBE31D0D4; +fma.rn.f32 f329, f297, 0fBF7C1C5C, f328; +mul.f32 f330, f266, 0fBF000000; +mul.f32 f331, f272, 0fBF5DB3D7; +sub.f32 f332, f330, f331; +mul.f32 f333, f272, 0fBF000000; +fma.rn.f32 f334, f266, 0fBF5DB3D7, f333; +mul.f32 f335, f282, 0fBF441B7D; +mul.f32 f336, f288, 0fBF248DBB; +sub.f32 f337, f335, f336; +mul.f32 f338, f288, 0fBF441B7D; +fma.rn.f32 f339, f282, 0fBF248DBB, f338; +mul.f32 f340, f298, 0fBF708FB2; +mul.f32 f341, f304, 0fBEAF1D44; +sub.f32 f342, f340, f341; +mul.f32 f343, f304, 0fBF708FB2; +fma.rn.f32 f344, f298, 0fBEAF1D44, f343; +add.f32 %1, f144, f260; +add.f32 %0, f142, f258; +add.f32 %3, f160, f309; +add.f32 %2, f158, f307; +add.f32 %5, f176, f314; +add.f32 %4, f174, f312; +add.f32 %7, f155, f319; +add.f32 %6, f149, f317; +add.f32 %9, f171, f324; +add.f32 %8, f165, f322; +add.f32 %11, f187, f329; +add.f32 %10, f181, f327; +add.f32 %13, f156, f334; +add.f32 %12, f150, f332; +add.f32 %15, f172, f339; +add.f32 %14, f166, f337; +add.f32 %17, f188, f344; +add.f32 %16, f182, f342; +sub.f32 %19, f144, f260; +sub.f32 %18, f142, f258; +sub.f32 %21, f160, f309; +sub.f32 %20, f158, f307; +sub.f32 %23, f176, f314; +sub.f32 %22, f174, f312; +sub.f32 %25, f155, f319; +sub.f32 %24, f149, f317; +sub.f32 %27, f171, f324; +sub.f32 %26, f165, f322; +sub.f32 %29, f187, f329; +sub.f32 %28, f181, f327; +sub.f32 %31, f156, f334; +sub.f32 %30, f150, f332; +sub.f32 %33, f172, f339; +sub.f32 %32, f166, f337; +sub.f32 %35, f188, f344; +sub.f32 %34, f182, f342; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..134677062f792 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp32_inv.hpp.inc @@ -0,0 +1,324 @@ +#ifndef CUFFTDX_FFT_18_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_18_FP32_INV_PTX_HPP + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<206, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<381>; +.reg .b64 rd<2>; +add.f32 f73, %52, %68; +add.f32 f74, %36, f73; +add.f32 f75, %53, %69; +add.f32 f76, %37, f75; +mul.f32 f77, f73, 0f3F000000; +sub.f32 f78, %36, f77; +sub.f32 f79, %53, %69; +mul.f32 f80, f79, 0fBF5DB3D7; +add.f32 f81, f80, f78; +sub.f32 f82, f78, f80; +mul.f32 f83, f75, 0f3F000000; +sub.f32 f84, %37, f83; +sub.f32 f85, %52, %68; +mul.f32 f86, f85, 0fBF5DB3D7; +sub.f32 f87, f84, f86; +add.f32 f88, f86, f84; +add.f32 f89, %57, %73; +add.f32 f90, %41, f89; +add.f32 f91, %59, %75; +add.f32 f92, %43, f91; +mul.f32 f93, f89, 0f3F000000; +sub.f32 f94, %41, f93; +sub.f32 f95, %59, %75; +mul.f32 f96, f95, 0fBF5DB3D7; +add.f32 f97, f96, f94; +sub.f32 f98, f94, f96; +mul.f32 f99, f91, 0f3F000000; +sub.f32 f100, %43, f99; +sub.f32 f101, %57, %73; +mul.f32 f102, f101, 0fBF5DB3D7; +sub.f32 f103, f100, f102; +add.f32 f104, f102, f100; +add.f32 f105, %62, %78; +add.f32 f106, %46, f105; +add.f32 f107, %64, %80; +add.f32 f108, %48, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, %46, f109; +sub.f32 f111, %64, %80; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, %48, f115; +sub.f32 f117, %62, %78; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.f32 f121, f97, 0f3F441B7D; +mul.f32 f122, f103, 0f3F248DBB; +sub.f32 f123, f121, f122; +mul.f32 f124, f103, 0f3F441B7D; +fma.rn.f32 f125, f97, 0f3F248DBB, f124; +mul.f32 f126, f113, 0f3E31D0D4; +mul.f32 f127, f119, 0f3F7C1C5C; +sub.f32 f128, f126, f127; +mul.f32 f129, f119, 0f3E31D0D4; +fma.rn.f32 f130, f113, 0f3F7C1C5C, f129; +mul.f32 f131, f98, 0f3E31D0D4; +mul.f32 f132, f104, 0f3F7C1C5C; +sub.f32 f133, f131, f132; +mul.f32 f134, f104, 0f3E31D0D4; +fma.rn.f32 f135, f98, 0f3F7C1C5C, f134; +mul.f32 f136, f114, 0fBF708FB2; +mul.f32 
f137, f120, 0f3EAF1D44; +sub.f32 f138, f136, f137; +mul.f32 f139, f120, 0fBF708FB2; +fma.rn.f32 f140, f114, 0f3EAF1D44, f139; +add.f32 f141, f90, f106; +add.f32 f142, f74, f141; +add.f32 f143, f92, f108; +add.f32 f144, f76, f143; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, f74, f145; +sub.f32 f147, f92, f108; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +mul.f32 f151, f143, 0f3F000000; +sub.f32 f152, f76, f151; +sub.f32 f153, f90, f106; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +add.f32 f157, f123, f128; +add.f32 f158, f81, f157; +add.f32 f159, f125, f130; +add.f32 f160, f87, f159; +mul.f32 f161, f157, 0f3F000000; +sub.f32 f162, f81, f161; +sub.f32 f163, f125, f130; +mul.f32 f164, f163, 0fBF5DB3D7; +add.f32 f165, f164, f162; +sub.f32 f166, f162, f164; +mul.f32 f167, f159, 0f3F000000; +sub.f32 f168, f87, f167; +sub.f32 f169, f123, f128; +mul.f32 f170, f169, 0fBF5DB3D7; +sub.f32 f171, f168, f170; +add.f32 f172, f170, f168; +add.f32 f173, f133, f138; +add.f32 f174, f82, f173; +add.f32 f175, f135, f140; +add.f32 f176, f88, f175; +mul.f32 f177, f173, 0f3F000000; +sub.f32 f178, f82, f177; +sub.f32 f179, f135, f140; +mul.f32 f180, f179, 0fBF5DB3D7; +add.f32 f181, f180, f178; +sub.f32 f182, f178, f180; +mul.f32 f183, f175, 0f3F000000; +sub.f32 f184, f88, f183; +sub.f32 f185, f133, f138; +mul.f32 f186, f185, 0fBF5DB3D7; +sub.f32 f187, f184, f186; +add.f32 f188, f186, f184; +add.f32 f189, %54, %70; +add.f32 f190, %38, f189; +add.f32 f191, %56, %72; +add.f32 f192, %40, f191; +mul.f32 f193, f189, 0f3F000000; +sub.f32 f194, %38, f193; +sub.f32 f195, %56, %72; +mul.f32 f196, f195, 0fBF5DB3D7; +add.f32 f197, f196, f194; +sub.f32 f198, f194, f196; +mul.f32 f199, f191, 0f3F000000; +sub.f32 f200, %40, f199; +sub.f32 f201, %54, %70; +mul.f32 f202, f201, 0fBF5DB3D7; +sub.f32 f203, f200, f202; +add.f32 f204, f202, f200; +add.f32 f205, %60, %76; +add.f32 f206, %44, f205; +add.f32 f207, %61, %77; 
+add.f32 f208, %45, f207; +mul.f32 f209, f205, 0f3F000000; +sub.f32 f210, %44, f209; +sub.f32 f211, %61, %77; +mul.f32 f212, f211, 0fBF5DB3D7; +add.f32 f213, f212, f210; +sub.f32 f214, f210, f212; +mul.f32 f215, f207, 0f3F000000; +sub.f32 f216, %45, f215; +sub.f32 f217, %60, %76; +mul.f32 f218, f217, 0fBF5DB3D7; +sub.f32 f219, f216, f218; +add.f32 f220, f218, f216; +add.f32 f221, %65, %81; +add.f32 f222, %49, f221; +add.f32 f223, %67, %82; +add.f32 f224, %51, f223; +mul.f32 f225, f221, 0f3F000000; +sub.f32 f226, %49, f225; +sub.f32 f227, %67, %82; +mul.f32 f228, f227, 0fBF5DB3D7; +add.f32 f229, f228, f226; +sub.f32 f230, f226, f228; +mul.f32 f231, f223, 0f3F000000; +sub.f32 f232, %51, f231; +sub.f32 f233, %65, %81; +mul.f32 f234, f233, 0fBF5DB3D7; +sub.f32 f235, f232, f234; +add.f32 f236, f234, f232; +mul.f32 f237, f213, 0f3F441B7D; +mul.f32 f238, f219, 0f3F248DBB; +sub.f32 f239, f237, f238; +mul.f32 f240, f219, 0f3F441B7D; +fma.rn.f32 f241, f213, 0f3F248DBB, f240; +mul.f32 f242, f229, 0f3E31D0D4; +mul.f32 f243, f235, 0f3F7C1C5C; +sub.f32 f244, f242, f243; +mul.f32 f245, f235, 0f3E31D0D4; +fma.rn.f32 f246, f229, 0f3F7C1C5C, f245; +mul.f32 f247, f214, 0f3E31D0D4; +mul.f32 f248, f220, 0f3F7C1C5C; +sub.f32 f249, f247, f248; +mul.f32 f250, f220, 0f3E31D0D4; +fma.rn.f32 f251, f214, 0f3F7C1C5C, f250; +mul.f32 f252, f230, 0fBF708FB2; +mul.f32 f253, f236, 0f3EAF1D44; +sub.f32 f254, f252, f253; +mul.f32 f255, f236, 0fBF708FB2; +fma.rn.f32 f256, f230, 0f3EAF1D44, f255; +add.f32 f257, f206, f222; +add.f32 f258, f190, f257; +add.f32 f259, f208, f224; +add.f32 f260, f192, f259; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, f190, f261; +sub.f32 f263, f208, f224; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +mul.f32 f267, f259, 0f3F000000; +sub.f32 f268, f192, f267; +sub.f32 f269, f206, f222; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +add.f32 f273, f239, f244; +add.f32 f274, f197, f273; 
+add.f32 f275, f241, f246; +add.f32 f276, f203, f275; +mul.f32 f277, f273, 0f3F000000; +sub.f32 f278, f197, f277; +sub.f32 f279, f241, f246; +mul.f32 f280, f279, 0fBF5DB3D7; +add.f32 f281, f280, f278; +sub.f32 f282, f278, f280; +mul.f32 f283, f275, 0f3F000000; +sub.f32 f284, f203, f283; +sub.f32 f285, f239, f244; +mul.f32 f286, f285, 0fBF5DB3D7; +sub.f32 f287, f284, f286; +add.f32 f288, f286, f284; +add.f32 f289, f249, f254; +add.f32 f290, f198, f289; +add.f32 f291, f251, f256; +add.f32 f292, f204, f291; +mul.f32 f293, f289, 0f3F000000; +sub.f32 f294, f198, f293; +sub.f32 f295, f251, f256; +mul.f32 f296, f295, 0fBF5DB3D7; +add.f32 f297, f296, f294; +sub.f32 f298, f294, f296; +mul.f32 f299, f291, 0f3F000000; +sub.f32 f300, f204, f299; +sub.f32 f301, f249, f254; +mul.f32 f302, f301, 0fBF5DB3D7; +sub.f32 f303, f300, f302; +add.f32 f304, f302, f300; +mul.f32 f305, f274, 0f3F708FB2; +mul.f32 f306, f276, 0f3EAF1D44; +sub.f32 f307, f305, f306; +mul.f32 f308, f276, 0f3F708FB2; +fma.rn.f32 f309, f274, 0f3EAF1D44, f308; +mul.f32 f310, f290, 0f3F441B7D; +mul.f32 f311, f292, 0f3F248DBB; +sub.f32 f312, f310, f311; +mul.f32 f313, f292, 0f3F441B7D; +fma.rn.f32 f314, f290, 0f3F248DBB, f313; +mul.f32 f315, f265, 0f3F000000; +mul.f32 f316, f271, 0f3F5DB3D7; +sub.f32 f317, f315, f316; +mul.f32 f318, f271, 0f3F000000; +fma.rn.f32 f319, f265, 0f3F5DB3D7, f318; +mul.f32 f320, f281, 0f3E31D0D4; +mul.f32 f321, f287, 0f3F7C1C5C; +sub.f32 f322, f320, f321; +mul.f32 f323, f287, 0f3E31D0D4; +fma.rn.f32 f324, f281, 0f3F7C1C5C, f323; +mul.f32 f325, f297, 0fBE31D0D4; +mul.f32 f326, f303, 0f3F7C1C5C; +sub.f32 f327, f325, f326; +mul.f32 f328, f303, 0fBE31D0D4; +fma.rn.f32 f329, f297, 0f3F7C1C5C, f328; +mul.f32 f330, f266, 0fBF000000; +mul.f32 f331, f272, 0f3F5DB3D7; +sub.f32 f332, f330, f331; +mul.f32 f333, f272, 0fBF000000; +fma.rn.f32 f334, f266, 0f3F5DB3D7, f333; +mul.f32 f335, f282, 0fBF441B7D; +mul.f32 f336, f288, 0f3F248DBB; +sub.f32 f337, f335, f336; +mul.f32 f338, f288, 0fBF441B7D; 
+fma.rn.f32 f339, f282, 0f3F248DBB, f338; +mul.f32 f340, f298, 0fBF708FB2; +mul.f32 f341, f304, 0f3EAF1D44; +sub.f32 f342, f340, f341; +mul.f32 f343, f304, 0fBF708FB2; +fma.rn.f32 f344, f298, 0f3EAF1D44, f343; +add.f32 %1, f144, f260; +add.f32 %0, f142, f258; +add.f32 %3, f160, f309; +add.f32 %2, f158, f307; +add.f32 %5, f176, f314; +add.f32 %4, f174, f312; +add.f32 %7, f155, f319; +add.f32 %6, f149, f317; +add.f32 %9, f171, f324; +add.f32 %8, f165, f322; +add.f32 %11, f187, f329; +add.f32 %10, f181, f327; +add.f32 %13, f156, f334; +add.f32 %12, f150, f332; +add.f32 %15, f172, f339; +add.f32 %14, f166, f337; +add.f32 %17, f188, f344; +add.f32 %16, f182, f342; +sub.f32 %19, f144, f260; +sub.f32 %18, f142, f258; +sub.f32 %21, f160, f309; +sub.f32 %20, f158, f307; +sub.f32 %23, f176, f314; +sub.f32 %22, f174, f312; +sub.f32 %25, f155, f319; +sub.f32 %24, f149, f317; +sub.f32 %27, f171, f324; +sub.f32 %26, f165, f322; +sub.f32 %29, f187, f329; +sub.f32 %28, f181, f327; +sub.f32 %31, f156, f334; +sub.f32 %30, f150, f332; +sub.f32 %33, f172, f339; +sub.f32 %32, f166, f337; +sub.f32 %35, f188, f344; +sub.f32 %34, f182, f342; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), 
"f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..25add61016c88 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_fwd.hpp.inc @@ -0,0 +1,324 @@ +#ifndef CUFFTDX_FFT_18_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_18_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<408, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<381>; +.reg .b64 rd<2>; +add.f64 fd73, %52, %68; +add.f64 fd74, %36, fd73; +add.f64 fd75, %53, %69; +add.f64 fd76, %37, fd75; +mul.f64 fd77, fd73, 0d3FE0000000000000; +sub.f64 fd78, %36, fd77; +sub.f64 fd79, %53, %69; +mul.f64 fd80, fd79, 0d3FEBB67AE8584CAA; +add.f64 fd81, fd80, fd78; +sub.f64 fd82, fd78, fd80; +mul.f64 fd83, fd75, 0d3FE0000000000000; +sub.f64 fd84, %37, fd83; +sub.f64 fd85, %52, %68; +mul.f64 fd86, fd85, 0d3FEBB67AE8584CAA; +sub.f64 fd87, fd84, fd86; +add.f64 fd88, fd86, fd84; +add.f64 fd89, %57, %73; +add.f64 fd90, %41, fd89; +add.f64 fd91, %59, %75; +add.f64 fd92, %43, fd91; +mul.f64 fd93, fd89, 0d3FE0000000000000; +sub.f64 fd94, %41, fd93; +sub.f64 fd95, %59, 
%75; +mul.f64 fd96, fd95, 0d3FEBB67AE8584CAA; +add.f64 fd97, fd96, fd94; +sub.f64 fd98, fd94, fd96; +mul.f64 fd99, fd91, 0d3FE0000000000000; +sub.f64 fd100, %43, fd99; +sub.f64 fd101, %57, %73; +mul.f64 fd102, fd101, 0d3FEBB67AE8584CAA; +sub.f64 fd103, fd100, fd102; +add.f64 fd104, fd102, fd100; +add.f64 fd105, %62, %78; +add.f64 fd106, %46, fd105; +add.f64 fd107, %64, %80; +add.f64 fd108, %48, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, %46, fd109; +sub.f64 fd111, %64, %80; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, %48, fd115; +sub.f64 fd117, %62, %78; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +mul.f64 fd121, fd97, 0d3FE8836FA2CF5039; +mul.f64 fd122, fd103, 0dBFE491B7523C161D; +sub.f64 fd123, fd121, fd122; +mul.f64 fd124, fd103, 0d3FE8836FA2CF5039; +fma.rn.f64 fd125, fd97, 0dBFE491B7523C161D, fd124; +mul.f64 fd126, fd113, 0d3FC63A1A7E0B738A; +mul.f64 fd127, fd119, 0dBFEF838B8C811C17; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd119, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd130, fd113, 0dBFEF838B8C811C17, fd129; +mul.f64 fd131, fd98, 0d3FC63A1A7E0B738A; +mul.f64 fd132, fd104, 0dBFEF838B8C811C17; +sub.f64 fd133, fd131, fd132; +mul.f64 fd134, fd104, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd135, fd98, 0dBFEF838B8C811C17, fd134; +mul.f64 fd136, fd114, 0dBFEE11F642522D1C; +mul.f64 fd137, fd120, 0dBFD5E3A8748A0BF5; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd120, 0dBFEE11F642522D1C; +fma.rn.f64 fd140, fd114, 0dBFD5E3A8748A0BF5, fd139; +add.f64 fd141, fd90, fd106; +add.f64 fd142, fd74, fd141; +add.f64 fd143, fd92, fd108; +add.f64 fd144, fd76, fd143; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, fd74, fd145; +sub.f64 fd147, fd92, fd108; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +mul.f64 fd151, fd143, 
0d3FE0000000000000; +sub.f64 fd152, fd76, fd151; +sub.f64 fd153, fd90, fd106; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +add.f64 fd157, fd123, fd128; +add.f64 fd158, fd81, fd157; +add.f64 fd159, fd125, fd130; +add.f64 fd160, fd87, fd159; +mul.f64 fd161, fd157, 0d3FE0000000000000; +sub.f64 fd162, fd81, fd161; +sub.f64 fd163, fd125, fd130; +mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA; +add.f64 fd165, fd164, fd162; +sub.f64 fd166, fd162, fd164; +mul.f64 fd167, fd159, 0d3FE0000000000000; +sub.f64 fd168, fd87, fd167; +sub.f64 fd169, fd123, fd128; +mul.f64 fd170, fd169, 0d3FEBB67AE8584CAA; +sub.f64 fd171, fd168, fd170; +add.f64 fd172, fd170, fd168; +add.f64 fd173, fd133, fd138; +add.f64 fd174, fd82, fd173; +add.f64 fd175, fd135, fd140; +add.f64 fd176, fd88, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd82, fd177; +sub.f64 fd179, fd135, fd140; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd88, fd183; +sub.f64 fd185, fd133, fd138; +mul.f64 fd186, fd185, 0d3FEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; +add.f64 fd188, fd186, fd184; +add.f64 fd189, %54, %70; +add.f64 fd190, %38, fd189; +add.f64 fd191, %56, %72; +add.f64 fd192, %40, fd191; +mul.f64 fd193, fd189, 0d3FE0000000000000; +sub.f64 fd194, %38, fd193; +sub.f64 fd195, %56, %72; +mul.f64 fd196, fd195, 0d3FEBB67AE8584CAA; +add.f64 fd197, fd196, fd194; +sub.f64 fd198, fd194, fd196; +mul.f64 fd199, fd191, 0d3FE0000000000000; +sub.f64 fd200, %40, fd199; +sub.f64 fd201, %54, %70; +mul.f64 fd202, fd201, 0d3FEBB67AE8584CAA; +sub.f64 fd203, fd200, fd202; +add.f64 fd204, fd202, fd200; +add.f64 fd205, %60, %76; +add.f64 fd206, %44, fd205; +add.f64 fd207, %61, %77; +add.f64 fd208, %45, fd207; +mul.f64 fd209, fd205, 0d3FE0000000000000; +sub.f64 fd210, %44, fd209; +sub.f64 fd211, %61, %77; +mul.f64 fd212, fd211, 0d3FEBB67AE8584CAA; 
+add.f64 fd213, fd212, fd210; +sub.f64 fd214, fd210, fd212; +mul.f64 fd215, fd207, 0d3FE0000000000000; +sub.f64 fd216, %45, fd215; +sub.f64 fd217, %60, %76; +mul.f64 fd218, fd217, 0d3FEBB67AE8584CAA; +sub.f64 fd219, fd216, fd218; +add.f64 fd220, fd218, fd216; +add.f64 fd221, %65, %81; +add.f64 fd222, %49, fd221; +add.f64 fd223, %67, %82; +add.f64 fd224, %51, fd223; +mul.f64 fd225, fd221, 0d3FE0000000000000; +sub.f64 fd226, %49, fd225; +sub.f64 fd227, %67, %82; +mul.f64 fd228, fd227, 0d3FEBB67AE8584CAA; +add.f64 fd229, fd228, fd226; +sub.f64 fd230, fd226, fd228; +mul.f64 fd231, fd223, 0d3FE0000000000000; +sub.f64 fd232, %51, fd231; +sub.f64 fd233, %65, %81; +mul.f64 fd234, fd233, 0d3FEBB67AE8584CAA; +sub.f64 fd235, fd232, fd234; +add.f64 fd236, fd234, fd232; +mul.f64 fd237, fd213, 0d3FE8836FA2CF5039; +mul.f64 fd238, fd219, 0dBFE491B7523C161D; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd219, 0d3FE8836FA2CF5039; +fma.rn.f64 fd241, fd213, 0dBFE491B7523C161D, fd240; +mul.f64 fd242, fd229, 0d3FC63A1A7E0B738A; +mul.f64 fd243, fd235, 0dBFEF838B8C811C17; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd235, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd246, fd229, 0dBFEF838B8C811C17, fd245; +mul.f64 fd247, fd214, 0d3FC63A1A7E0B738A; +mul.f64 fd248, fd220, 0dBFEF838B8C811C17; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd220, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd251, fd214, 0dBFEF838B8C811C17, fd250; +mul.f64 fd252, fd230, 0dBFEE11F642522D1C; +mul.f64 fd253, fd236, 0dBFD5E3A8748A0BF5; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd236, 0dBFEE11F642522D1C; +fma.rn.f64 fd256, fd230, 0dBFD5E3A8748A0BF5, fd255; +add.f64 fd257, fd206, fd222; +add.f64 fd258, fd190, fd257; +add.f64 fd259, fd208, fd224; +add.f64 fd260, fd192, fd259; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, fd190, fd261; +sub.f64 fd263, fd208, fd224; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +mul.f64 fd267, fd259, 0d3FE0000000000000; +sub.f64 
fd268, fd192, fd267; +sub.f64 fd269, fd206, fd222; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +add.f64 fd273, fd239, fd244; +add.f64 fd274, fd197, fd273; +add.f64 fd275, fd241, fd246; +add.f64 fd276, fd203, fd275; +mul.f64 fd277, fd273, 0d3FE0000000000000; +sub.f64 fd278, fd197, fd277; +sub.f64 fd279, fd241, fd246; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +add.f64 fd281, fd280, fd278; +sub.f64 fd282, fd278, fd280; +mul.f64 fd283, fd275, 0d3FE0000000000000; +sub.f64 fd284, fd203, fd283; +sub.f64 fd285, fd239, fd244; +mul.f64 fd286, fd285, 0d3FEBB67AE8584CAA; +sub.f64 fd287, fd284, fd286; +add.f64 fd288, fd286, fd284; +add.f64 fd289, fd249, fd254; +add.f64 fd290, fd198, fd289; +add.f64 fd291, fd251, fd256; +add.f64 fd292, fd204, fd291; +mul.f64 fd293, fd289, 0d3FE0000000000000; +sub.f64 fd294, fd198, fd293; +sub.f64 fd295, fd251, fd256; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +add.f64 fd297, fd296, fd294; +sub.f64 fd298, fd294, fd296; +mul.f64 fd299, fd291, 0d3FE0000000000000; +sub.f64 fd300, fd204, fd299; +sub.f64 fd301, fd249, fd254; +mul.f64 fd302, fd301, 0d3FEBB67AE8584CAA; +sub.f64 fd303, fd300, fd302; +add.f64 fd304, fd302, fd300; +mul.f64 fd305, fd274, 0d3FEE11F642522D1C; +mul.f64 fd306, fd276, 0dBFD5E3A8748A0BF5; +sub.f64 fd307, fd305, fd306; +mul.f64 fd308, fd276, 0d3FEE11F642522D1C; +fma.rn.f64 fd309, fd274, 0dBFD5E3A8748A0BF5, fd308; +mul.f64 fd310, fd290, 0d3FE8836FA2CF5039; +mul.f64 fd311, fd292, 0dBFE491B7523C161D; +sub.f64 fd312, fd310, fd311; +mul.f64 fd313, fd292, 0d3FE8836FA2CF5039; +fma.rn.f64 fd314, fd290, 0dBFE491B7523C161D, fd313; +mul.f64 fd315, fd265, 0d3FE0000000000000; +mul.f64 fd316, fd271, 0dBFEBB67AE8584CAA; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd271, 0d3FE0000000000000; +fma.rn.f64 fd319, fd265, 0dBFEBB67AE8584CAA, fd318; +mul.f64 fd320, fd281, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd287, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd287, 
0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd281, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd297, 0dBFC63A1A7E0B738A; +mul.f64 fd326, fd303, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd303, 0dBFC63A1A7E0B738A; +fma.rn.f64 fd329, fd297, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd266, 0dBFE0000000000000; +mul.f64 fd331, fd272, 0dBFEBB67AE8584CAA; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd272, 0dBFE0000000000000; +fma.rn.f64 fd334, fd266, 0dBFEBB67AE8584CAA, fd333; +mul.f64 fd335, fd282, 0dBFE8836FA2CF5039; +mul.f64 fd336, fd288, 0dBFE491B7523C161D; +sub.f64 fd337, fd335, fd336; +mul.f64 fd338, fd288, 0dBFE8836FA2CF5039; +fma.rn.f64 fd339, fd282, 0dBFE491B7523C161D, fd338; +mul.f64 fd340, fd298, 0dBFEE11F642522D1C; +mul.f64 fd341, fd304, 0dBFD5E3A8748A0BF5; +sub.f64 fd342, fd340, fd341; +mul.f64 fd343, fd304, 0dBFEE11F642522D1C; +fma.rn.f64 fd344, fd298, 0dBFD5E3A8748A0BF5, fd343; +add.f64 %1, fd144, fd260; +add.f64 %0, fd142, fd258; +add.f64 %3, fd160, fd309; +add.f64 %2, fd158, fd307; +add.f64 %5, fd176, fd314; +add.f64 %4, fd174, fd312; +add.f64 %7, fd155, fd319; +add.f64 %6, fd149, fd317; +add.f64 %9, fd171, fd324; +add.f64 %8, fd165, fd322; +add.f64 %11, fd187, fd329; +add.f64 %10, fd181, fd327; +add.f64 %13, fd156, fd334; +add.f64 %12, fd150, fd332; +add.f64 %15, fd172, fd339; +add.f64 %14, fd166, fd337; +add.f64 %17, fd188, fd344; +add.f64 %16, fd182, fd342; +sub.f64 %19, fd144, fd260; +sub.f64 %18, fd142, fd258; +sub.f64 %21, fd160, fd309; +sub.f64 %20, fd158, fd307; +sub.f64 %23, fd176, fd314; +sub.f64 %22, fd174, fd312; +sub.f64 %25, fd155, fd319; +sub.f64 %24, fd149, fd317; +sub.f64 %27, fd171, fd324; +sub.f64 %26, fd165, fd322; +sub.f64 %29, fd187, fd329; +sub.f64 %28, fd181, fd327; +sub.f64 %31, fd156, fd334; +sub.f64 %30, fd150, fd332; +sub.f64 %33, fd172, fd339; +sub.f64 %32, fd166, fd337; +sub.f64 %35, fd188, fd344; +sub.f64 %34, fd182, fd342; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), 
"=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..c73f08cadad00 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_18_fp64_inv.hpp.inc @@ -0,0 +1,324 @@ +#ifndef CUFFTDX_FFT_18_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_18_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<579, 
double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<381>; +.reg .b64 rd<2>; +add.f64 fd73, %52, %68; +add.f64 fd74, %36, fd73; +add.f64 fd75, %53, %69; +add.f64 fd76, %37, fd75; +mul.f64 fd77, fd73, 0d3FE0000000000000; +sub.f64 fd78, %36, fd77; +sub.f64 fd79, %53, %69; +mul.f64 fd80, fd79, 0dBFEBB67AE8584CAA; +add.f64 fd81, fd80, fd78; +sub.f64 fd82, fd78, fd80; +mul.f64 fd83, fd75, 0d3FE0000000000000; +sub.f64 fd84, %37, fd83; +sub.f64 fd85, %52, %68; +mul.f64 fd86, fd85, 0dBFEBB67AE8584CAA; +sub.f64 fd87, fd84, fd86; +add.f64 fd88, fd86, fd84; +add.f64 fd89, %57, %73; +add.f64 fd90, %41, fd89; +add.f64 fd91, %59, %75; +add.f64 fd92, %43, fd91; +mul.f64 fd93, fd89, 0d3FE0000000000000; +sub.f64 fd94, %41, fd93; +sub.f64 fd95, %59, %75; +mul.f64 fd96, fd95, 0dBFEBB67AE8584CAA; +add.f64 fd97, fd96, fd94; +sub.f64 fd98, fd94, fd96; +mul.f64 fd99, fd91, 0d3FE0000000000000; +sub.f64 fd100, %43, fd99; +sub.f64 fd101, %57, %73; +mul.f64 fd102, fd101, 0dBFEBB67AE8584CAA; +sub.f64 fd103, fd100, fd102; +add.f64 fd104, fd102, fd100; +add.f64 fd105, %62, %78; +add.f64 fd106, %46, fd105; +add.f64 fd107, %64, %80; +add.f64 fd108, %48, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, %46, fd109; +sub.f64 fd111, %64, %80; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, %48, fd115; +sub.f64 fd117, %62, %78; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +mul.f64 fd121, fd97, 0d3FE8836FA2CF5039; +mul.f64 fd122, fd103, 0d3FE491B7523C161D; +sub.f64 fd123, fd121, fd122; +mul.f64 fd124, fd103, 0d3FE8836FA2CF5039; +fma.rn.f64 fd125, fd97, 0d3FE491B7523C161D, fd124; +mul.f64 fd126, fd113, 0d3FC63A1A7E0B738A; +mul.f64 fd127, fd119, 0d3FEF838B8C811C17; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd119, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd130, fd113, 
0d3FEF838B8C811C17, fd129; +mul.f64 fd131, fd98, 0d3FC63A1A7E0B738A; +mul.f64 fd132, fd104, 0d3FEF838B8C811C17; +sub.f64 fd133, fd131, fd132; +mul.f64 fd134, fd104, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd135, fd98, 0d3FEF838B8C811C17, fd134; +mul.f64 fd136, fd114, 0dBFEE11F642522D1C; +mul.f64 fd137, fd120, 0d3FD5E3A8748A0BF5; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd120, 0dBFEE11F642522D1C; +fma.rn.f64 fd140, fd114, 0d3FD5E3A8748A0BF5, fd139; +add.f64 fd141, fd90, fd106; +add.f64 fd142, fd74, fd141; +add.f64 fd143, fd92, fd108; +add.f64 fd144, fd76, fd143; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, fd74, fd145; +sub.f64 fd147, fd92, fd108; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +mul.f64 fd151, fd143, 0d3FE0000000000000; +sub.f64 fd152, fd76, fd151; +sub.f64 fd153, fd90, fd106; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +add.f64 fd157, fd123, fd128; +add.f64 fd158, fd81, fd157; +add.f64 fd159, fd125, fd130; +add.f64 fd160, fd87, fd159; +mul.f64 fd161, fd157, 0d3FE0000000000000; +sub.f64 fd162, fd81, fd161; +sub.f64 fd163, fd125, fd130; +mul.f64 fd164, fd163, 0dBFEBB67AE8584CAA; +add.f64 fd165, fd164, fd162; +sub.f64 fd166, fd162, fd164; +mul.f64 fd167, fd159, 0d3FE0000000000000; +sub.f64 fd168, fd87, fd167; +sub.f64 fd169, fd123, fd128; +mul.f64 fd170, fd169, 0dBFEBB67AE8584CAA; +sub.f64 fd171, fd168, fd170; +add.f64 fd172, fd170, fd168; +add.f64 fd173, fd133, fd138; +add.f64 fd174, fd82, fd173; +add.f64 fd175, fd135, fd140; +add.f64 fd176, fd88, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd82, fd177; +sub.f64 fd179, fd135, fd140; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd88, fd183; +sub.f64 fd185, fd133, fd138; +mul.f64 fd186, fd185, 0dBFEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; 
+add.f64 fd188, fd186, fd184; +add.f64 fd189, %54, %70; +add.f64 fd190, %38, fd189; +add.f64 fd191, %56, %72; +add.f64 fd192, %40, fd191; +mul.f64 fd193, fd189, 0d3FE0000000000000; +sub.f64 fd194, %38, fd193; +sub.f64 fd195, %56, %72; +mul.f64 fd196, fd195, 0dBFEBB67AE8584CAA; +add.f64 fd197, fd196, fd194; +sub.f64 fd198, fd194, fd196; +mul.f64 fd199, fd191, 0d3FE0000000000000; +sub.f64 fd200, %40, fd199; +sub.f64 fd201, %54, %70; +mul.f64 fd202, fd201, 0dBFEBB67AE8584CAA; +sub.f64 fd203, fd200, fd202; +add.f64 fd204, fd202, fd200; +add.f64 fd205, %60, %76; +add.f64 fd206, %44, fd205; +add.f64 fd207, %61, %77; +add.f64 fd208, %45, fd207; +mul.f64 fd209, fd205, 0d3FE0000000000000; +sub.f64 fd210, %44, fd209; +sub.f64 fd211, %61, %77; +mul.f64 fd212, fd211, 0dBFEBB67AE8584CAA; +add.f64 fd213, fd212, fd210; +sub.f64 fd214, fd210, fd212; +mul.f64 fd215, fd207, 0d3FE0000000000000; +sub.f64 fd216, %45, fd215; +sub.f64 fd217, %60, %76; +mul.f64 fd218, fd217, 0dBFEBB67AE8584CAA; +sub.f64 fd219, fd216, fd218; +add.f64 fd220, fd218, fd216; +add.f64 fd221, %65, %81; +add.f64 fd222, %49, fd221; +add.f64 fd223, %67, %82; +add.f64 fd224, %51, fd223; +mul.f64 fd225, fd221, 0d3FE0000000000000; +sub.f64 fd226, %49, fd225; +sub.f64 fd227, %67, %82; +mul.f64 fd228, fd227, 0dBFEBB67AE8584CAA; +add.f64 fd229, fd228, fd226; +sub.f64 fd230, fd226, fd228; +mul.f64 fd231, fd223, 0d3FE0000000000000; +sub.f64 fd232, %51, fd231; +sub.f64 fd233, %65, %81; +mul.f64 fd234, fd233, 0dBFEBB67AE8584CAA; +sub.f64 fd235, fd232, fd234; +add.f64 fd236, fd234, fd232; +mul.f64 fd237, fd213, 0d3FE8836FA2CF5039; +mul.f64 fd238, fd219, 0d3FE491B7523C161D; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd219, 0d3FE8836FA2CF5039; +fma.rn.f64 fd241, fd213, 0d3FE491B7523C161D, fd240; +mul.f64 fd242, fd229, 0d3FC63A1A7E0B738A; +mul.f64 fd243, fd235, 0d3FEF838B8C811C17; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd235, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd246, fd229, 0d3FEF838B8C811C17, fd245; +mul.f64 fd247, 
fd214, 0d3FC63A1A7E0B738A; +mul.f64 fd248, fd220, 0d3FEF838B8C811C17; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd220, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd251, fd214, 0d3FEF838B8C811C17, fd250; +mul.f64 fd252, fd230, 0dBFEE11F642522D1C; +mul.f64 fd253, fd236, 0d3FD5E3A8748A0BF5; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd236, 0dBFEE11F642522D1C; +fma.rn.f64 fd256, fd230, 0d3FD5E3A8748A0BF5, fd255; +add.f64 fd257, fd206, fd222; +add.f64 fd258, fd190, fd257; +add.f64 fd259, fd208, fd224; +add.f64 fd260, fd192, fd259; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, fd190, fd261; +sub.f64 fd263, fd208, fd224; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +mul.f64 fd267, fd259, 0d3FE0000000000000; +sub.f64 fd268, fd192, fd267; +sub.f64 fd269, fd206, fd222; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +add.f64 fd273, fd239, fd244; +add.f64 fd274, fd197, fd273; +add.f64 fd275, fd241, fd246; +add.f64 fd276, fd203, fd275; +mul.f64 fd277, fd273, 0d3FE0000000000000; +sub.f64 fd278, fd197, fd277; +sub.f64 fd279, fd241, fd246; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +add.f64 fd281, fd280, fd278; +sub.f64 fd282, fd278, fd280; +mul.f64 fd283, fd275, 0d3FE0000000000000; +sub.f64 fd284, fd203, fd283; +sub.f64 fd285, fd239, fd244; +mul.f64 fd286, fd285, 0dBFEBB67AE8584CAA; +sub.f64 fd287, fd284, fd286; +add.f64 fd288, fd286, fd284; +add.f64 fd289, fd249, fd254; +add.f64 fd290, fd198, fd289; +add.f64 fd291, fd251, fd256; +add.f64 fd292, fd204, fd291; +mul.f64 fd293, fd289, 0d3FE0000000000000; +sub.f64 fd294, fd198, fd293; +sub.f64 fd295, fd251, fd256; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +add.f64 fd297, fd296, fd294; +sub.f64 fd298, fd294, fd296; +mul.f64 fd299, fd291, 0d3FE0000000000000; +sub.f64 fd300, fd204, fd299; +sub.f64 fd301, fd249, fd254; +mul.f64 fd302, fd301, 0dBFEBB67AE8584CAA; +sub.f64 fd303, fd300, fd302; +add.f64 fd304, fd302, 
fd300; +mul.f64 fd305, fd274, 0d3FEE11F642522D1C; +mul.f64 fd306, fd276, 0d3FD5E3A8748A0BF5; +sub.f64 fd307, fd305, fd306; +mul.f64 fd308, fd276, 0d3FEE11F642522D1C; +fma.rn.f64 fd309, fd274, 0d3FD5E3A8748A0BF5, fd308; +mul.f64 fd310, fd290, 0d3FE8836FA2CF5039; +mul.f64 fd311, fd292, 0d3FE491B7523C161D; +sub.f64 fd312, fd310, fd311; +mul.f64 fd313, fd292, 0d3FE8836FA2CF5039; +fma.rn.f64 fd314, fd290, 0d3FE491B7523C161D, fd313; +mul.f64 fd315, fd265, 0d3FE0000000000000; +mul.f64 fd316, fd271, 0d3FEBB67AE8584CAA; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd271, 0d3FE0000000000000; +fma.rn.f64 fd319, fd265, 0d3FEBB67AE8584CAA, fd318; +mul.f64 fd320, fd281, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd287, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd287, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd281, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd297, 0dBFC63A1A7E0B738A; +mul.f64 fd326, fd303, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd303, 0dBFC63A1A7E0B738A; +fma.rn.f64 fd329, fd297, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd266, 0dBFE0000000000000; +mul.f64 fd331, fd272, 0d3FEBB67AE8584CAA; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd272, 0dBFE0000000000000; +fma.rn.f64 fd334, fd266, 0d3FEBB67AE8584CAA, fd333; +mul.f64 fd335, fd282, 0dBFE8836FA2CF5039; +mul.f64 fd336, fd288, 0d3FE491B7523C161D; +sub.f64 fd337, fd335, fd336; +mul.f64 fd338, fd288, 0dBFE8836FA2CF5039; +fma.rn.f64 fd339, fd282, 0d3FE491B7523C161D, fd338; +mul.f64 fd340, fd298, 0dBFEE11F642522D1C; +mul.f64 fd341, fd304, 0d3FD5E3A8748A0BF5; +sub.f64 fd342, fd340, fd341; +mul.f64 fd343, fd304, 0dBFEE11F642522D1C; +fma.rn.f64 fd344, fd298, 0d3FD5E3A8748A0BF5, fd343; +add.f64 %1, fd144, fd260; +add.f64 %0, fd142, fd258; +add.f64 %3, fd160, fd309; +add.f64 %2, fd158, fd307; +add.f64 %5, fd176, fd314; +add.f64 %4, fd174, fd312; +add.f64 %7, fd155, fd319; +add.f64 %6, fd149, fd317; +add.f64 %9, fd171, fd324; +add.f64 %8, fd165, fd322; +add.f64 %11, fd187, 
fd329; +add.f64 %10, fd181, fd327; +add.f64 %13, fd156, fd334; +add.f64 %12, fd150, fd332; +add.f64 %15, fd172, fd339; +add.f64 %14, fd166, fd337; +add.f64 %17, fd188, fd344; +add.f64 %16, fd182, fd342; +sub.f64 %19, fd144, fd260; +sub.f64 %18, fd142, fd258; +sub.f64 %21, fd160, fd309; +sub.f64 %20, fd158, fd307; +sub.f64 %23, fd176, fd314; +sub.f64 %22, fd174, fd312; +sub.f64 %25, fd155, fd319; +sub.f64 %24, fd149, fd317; +sub.f64 %27, fd171, fd324; +sub.f64 %26, fd165, fd322; +sub.f64 %29, fd187, fd329; +sub.f64 %28, fd181, fd327; +sub.f64 %31, fd156, fd334; +sub.f64 %30, fd150, fd332; +sub.f64 %33, fd172, fd339; +sub.f64 %32, fd166, fd337; +sub.f64 %35, fd188, fd344; +sub.f64 %34, fd182, fd342; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), 
"d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..ea7b6914646f4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_fwd.hpp.inc @@ -0,0 +1,26265 @@ +#ifndef CUFFTDX_FFT_19683_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_19683_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1177, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1043>; +.reg .b32 r<10778>; +.reg .b64 rd<6>; +mov.u32 r10704, %54; +mov.u32 r10777, %tid.y; +mad.lo.s32 r10705, r10777, 157464, r10704; +mov.u32 r10706, %tid.x; +mov.f32 f1034, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1, {low, high}; +} +mov.f32 f1036, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, 
%85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 
r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f906, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r265, {low, high}; +} +mov.f32 f908, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r266, {low, high}; +} +mov.f32 f918, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r267, {low, high}; +} +mov.f32 f920, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r268, {low, high}; +} +mov.f32 f942, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r271, {low, high}; +} +mov.f32 f944, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, 
r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ 
+mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, 
f1034; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, 
r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r876, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, 
r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, 
r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; 
+} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %82, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %82, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %82, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %82, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %82, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; 
+} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, 
r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 
r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ 
+sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f898, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r1825, {low, high}; +} +mov.f32 f900, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r1826, {low, high}; +} +mov.f32 f902, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r1827, {low, high}; +} +mov.f32 f904, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1830, {low, high}; +} +mov.f32 f910, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r1831, {low, high}; +} +mov.f32 f912, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r1832, {low, high}; +} +mov.f32 f914, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r1833, {low, high}; +} +mov.f32 f916, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1836, {low, high}; +} +mov.f32 f922, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r1837, {low, high}; +} +mov.f32 f924, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r1838, {low, high}; +} +mov.f32 f926, 0fBE92D7E0; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r1839, {low, high}; +} +mov.f32 f928, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r1840, {low, high}; +} +mov.f32 f934, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r1843, {low, high}; +} +mov.f32 f936, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r1844, {low, high}; +} +mov.f32 f958, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1848, {low, high}; +} +mov.f32 f950, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r1851, {low, high}; +} +mov.f32 f952, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r1855, {low, high}; +} +mov.f32 f960, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, 
r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, 
r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; 
+mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 
r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 
r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, 
r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, 
r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, 
r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r10706, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r10707, rd3; +mul.lo.s32 r10708, r10707, 729; +sub.s32 r10709, r10706, r10708; +cvt.rn.f32.u32 f1037, r10709; +mul.f32 f1038, f1037, 0f39A75CD5; +cos.approx.f32 f309, f1038; +sin.approx.f32 f1039, f1038; +neg.f32 f310, f1039; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, 
r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ 
+fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ 
+neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, 
{low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ 
+mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r10710, r10707, 157464, r10705; +barrier.sync 0; +mad.lo.s32 r10711, r10709, 216, r10710; +st.shared.v2.f32 [r10711], {r2140, r2146}; +st.shared.v2.f32 [r10711+8], {r2937, r2944}; +st.shared.v2.f32 [r10711+16], {r2974, r2981}; +st.shared.v2.f32 [r10711+24], {r3011, r3018}; +st.shared.v2.f32 [r10711+32], {r3048, r3055}; +st.shared.v2.f32 [r10711+40], {r3085, r3092}; +st.shared.v2.f32 [r10711+48], {r3122, r3129}; +st.shared.v2.f32 [r10711+56], {r3159, r3166}; +st.shared.v2.f32 [r10711+64], {r3196, r3203}; +st.shared.v2.f32 [r10711+72], {r3233, r3240}; +st.shared.v2.f32 [r10711+80], {r3270, r3277}; +st.shared.v2.f32 [r10711+88], {r3307, r3314}; +st.shared.v2.f32 [r10711+96], {r3344, r3351}; +st.shared.v2.f32 [r10711+104], {r3381, r3388}; +st.shared.v2.f32 [r10711+112], {r3418, r3425}; +st.shared.v2.f32 [r10711+120], {r3455, r3462}; +st.shared.v2.f32 [r10711+128], {r3492, r3499}; +st.shared.v2.f32 [r10711+136], {r3529, r3536}; +st.shared.v2.f32 [r10711+144], {r3566, r3573}; +st.shared.v2.f32 [r10711+152], {r3603, r3610}; 
+st.shared.v2.f32 [r10711+160], {r3640, r3647}; +st.shared.v2.f32 [r10711+168], {r3677, r3684}; +st.shared.v2.f32 [r10711+176], {r3714, r3721}; +st.shared.v2.f32 [r10711+184], {r3751, r3758}; +st.shared.v2.f32 [r10711+192], {r3788, r3795}; +st.shared.v2.f32 [r10711+200], {r3825, r3832}; +st.shared.v2.f32 [r10711+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r10712, r10709, -208, r10711; +ld.shared.u32 r3898, [r10712]; +ld.shared.u32 r3904, [r10712+4]; +ld.shared.u32 r4506, [r10712+5832]; +ld.shared.u32 r4512, [r10712+5836]; +ld.shared.u32 r5114, [r10712+11664]; +ld.shared.u32 r5120, [r10712+11668]; +ld.shared.u32 r3986, [r10712+17496]; +ld.shared.u32 r3992, [r10712+17500]; +ld.shared.u32 r4594, [r10712+23328]; +ld.shared.u32 r4600, [r10712+23332]; +ld.shared.u32 r5202, [r10712+29160]; +ld.shared.u32 r5208, [r10712+29164]; +ld.shared.u32 r4074, [r10712+34992]; +ld.shared.u32 r4080, [r10712+34996]; +ld.shared.u32 r4682, [r10712+40824]; +ld.shared.u32 r4688, [r10712+40828]; +ld.shared.u32 r5290, [r10712+46656]; +ld.shared.u32 r5296, [r10712+46660]; +ld.shared.u32 r3895, [r10712+52488]; +ld.shared.u32 r3901, [r10712+52492]; +ld.shared.u32 r4503, [r10712+58320]; +ld.shared.u32 r4509, [r10712+58324]; +ld.shared.u32 r5111, [r10712+64152]; +ld.shared.u32 r5117, [r10712+64156]; +ld.shared.u32 r3983, [r10712+69984]; +ld.shared.u32 r3989, [r10712+69988]; +ld.shared.u32 r4591, [r10712+75816]; +ld.shared.u32 r4597, [r10712+75820]; +ld.shared.u32 r5199, [r10712+81648]; +ld.shared.u32 r5205, [r10712+81652]; +ld.shared.u32 r4071, [r10712+87480]; +ld.shared.u32 r4077, [r10712+87484]; +ld.shared.u32 r4679, [r10712+93312]; +ld.shared.u32 r4685, [r10712+93316]; +ld.shared.u32 r5287, [r10712+99144]; +ld.shared.u32 r5293, [r10712+99148]; +ld.shared.u32 r3896, [r10712+104976]; +ld.shared.u32 r3902, [r10712+104980]; +ld.shared.u32 r4504, [r10712+110808]; +ld.shared.u32 r4510, [r10712+110812]; +ld.shared.u32 r5112, [r10712+116640]; +ld.shared.u32 r5118, [r10712+116644]; +ld.shared.u32 
r3984, [r10712+122472]; +ld.shared.u32 r3990, [r10712+122476]; +ld.shared.u32 r4592, [r10712+128304]; +ld.shared.u32 r4598, [r10712+128308]; +ld.shared.u32 r5200, [r10712+134136]; +ld.shared.u32 r5206, [r10712+134140]; +ld.shared.u32 r4072, [r10712+139968]; +ld.shared.u32 r4078, [r10712+139972]; +ld.shared.u32 r4680, [r10712+145800]; +ld.shared.u32 r4686, [r10712+145804]; +ld.shared.u32 r5288, [r10712+151632]; +ld.shared.u32 r5294, [r10712+151636]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3979, {low, 
high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, 
r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4234, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, 
r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 
r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ 
+mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, 
r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ 
+mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, 
r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, 
r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 
r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 
r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ 
+add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 
high, f1034; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, 
r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 
r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 
r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r10709, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r10713, rd5; +sub.s32 r10714, r10709, r10713; +shr.u32 r10715, r10714, 1; +add.s32 r10716, r10715, r10713; +shr.u32 r10717, r10716, 4; +cvt.rn.f32.u32 f1040, r10717; +mul.f32 f1041, f1040, 0f3C0D3654; +cos.approx.f32 f673, f1041; +sin.approx.f32 f1042, f1041; +neg.f32 f674, f1042; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +mul.lo.s32 r10718, r10717, 27; +sub.s32 r10719, r10709, r10718; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6850; +mov.b32 r6854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; +} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ 
+mul.f16x2 r6941, r6381, r6930; +} +{ +fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7024, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ +fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ +fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, 
low}; +} +{ +fma.rn.f16x2 r7109, r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; +} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ +neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, 
r7189; +} +{ +neg.f16x2 r7194, r7191; +} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7281, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ +mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, 
r7331, r7355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7442; +mov.b32 r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} +{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ +mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ +neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, {low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; 
+mov.b32 r7618, {low, high}; +} +{ +mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ +fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r7701; +mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ +mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +shl.b32 r10720, r10719, 3; +add.s32 r10721, r10710, r10720; +barrier.sync 0; +mad.lo.s32 r10722, r10717, 5832, r10721; +st.shared.u32 [r10722], r6029; +st.shared.u32 [r10722+4], r6035; +st.shared.u32 [r10722+216], r6826; +st.shared.u32 [r10722+220], r6833; +st.shared.u32 [r10722+432], r6863; +st.shared.u32 [r10722+436], r6870; +st.shared.u32 [r10722+648], r6900; +st.shared.u32 [r10722+652], r6907; +st.shared.u32 [r10722+864], r6937; +st.shared.u32 [r10722+868], r6944; +st.shared.u32 [r10722+1080], r6974; +st.shared.u32 [r10722+1084], r6981; +st.shared.u32 [r10722+1296], r7011; +st.shared.u32 [r10722+1300], r7018; +st.shared.u32 [r10722+1512], r7048; +st.shared.u32 [r10722+1516], r7055; +st.shared.u32 [r10722+1728], r7085; +st.shared.u32 [r10722+1732], r7092; 
+st.shared.u32 [r10722+1944], r7122; +st.shared.u32 [r10722+1948], r7129; +st.shared.u32 [r10722+2160], r7159; +st.shared.u32 [r10722+2164], r7166; +st.shared.u32 [r10722+2376], r7196; +st.shared.u32 [r10722+2380], r7203; +st.shared.u32 [r10722+2592], r7233; +st.shared.u32 [r10722+2596], r7240; +st.shared.u32 [r10722+2808], r7270; +st.shared.u32 [r10722+2812], r7277; +st.shared.u32 [r10722+3024], r7307; +st.shared.u32 [r10722+3028], r7314; +st.shared.u32 [r10722+3240], r7344; +st.shared.u32 [r10722+3244], r7351; +st.shared.u32 [r10722+3456], r7381; +st.shared.u32 [r10722+3460], r7388; +st.shared.u32 [r10722+3672], r7418; +st.shared.u32 [r10722+3676], r7425; +st.shared.u32 [r10722+3888], r7455; +st.shared.u32 [r10722+3892], r7462; +st.shared.u32 [r10722+4104], r7492; +st.shared.u32 [r10722+4108], r7499; +st.shared.u32 [r10722+4320], r7529; +st.shared.u32 [r10722+4324], r7536; +st.shared.u32 [r10722+4536], r7566; +st.shared.u32 [r10722+4540], r7573; +st.shared.u32 [r10722+4752], r7603; +st.shared.u32 [r10722+4756], r7610; +st.shared.u32 [r10722+4968], r7640; +st.shared.u32 [r10722+4972], r7647; +st.shared.u32 [r10722+5184], r7677; +st.shared.u32 [r10722+5188], r7684; +st.shared.u32 [r10722+5400], r7714; +st.shared.u32 [r10722+5404], r7721; +st.shared.u32 [r10722+5616], r7751; +st.shared.u32 [r10722+5620], r7758; +barrier.sync 0; +ld.shared.u32 r7787, [r10712]; +ld.shared.u32 r7793, [r10712+4]; +ld.shared.u32 r8395, [r10712+5832]; +ld.shared.u32 r8401, [r10712+5836]; +ld.shared.u32 r9003, [r10712+11664]; +ld.shared.u32 r9009, [r10712+11668]; +ld.shared.u32 r7875, [r10712+17496]; +ld.shared.u32 r7881, [r10712+17500]; +ld.shared.u32 r8483, [r10712+23328]; +ld.shared.u32 r8489, [r10712+23332]; +ld.shared.u32 r9091, [r10712+29160]; +ld.shared.u32 r9097, [r10712+29164]; +ld.shared.u32 r7963, [r10712+34992]; +ld.shared.u32 r7969, [r10712+34996]; +ld.shared.u32 r8571, [r10712+40824]; +ld.shared.u32 r8577, [r10712+40828]; +ld.shared.u32 r9179, [r10712+46656]; +ld.shared.u32 
r9185, [r10712+46660]; +ld.shared.u32 r7784, [r10712+52488]; +ld.shared.u32 r7790, [r10712+52492]; +ld.shared.u32 r8392, [r10712+58320]; +ld.shared.u32 r8398, [r10712+58324]; +ld.shared.u32 r9000, [r10712+64152]; +ld.shared.u32 r9006, [r10712+64156]; +ld.shared.u32 r7872, [r10712+69984]; +ld.shared.u32 r7878, [r10712+69988]; +ld.shared.u32 r8480, [r10712+75816]; +ld.shared.u32 r8486, [r10712+75820]; +ld.shared.u32 r9088, [r10712+81648]; +ld.shared.u32 r9094, [r10712+81652]; +ld.shared.u32 r7960, [r10712+87480]; +ld.shared.u32 r7966, [r10712+87484]; +ld.shared.u32 r8568, [r10712+93312]; +ld.shared.u32 r8574, [r10712+93316]; +ld.shared.u32 r9176, [r10712+99144]; +ld.shared.u32 r9182, [r10712+99148]; +ld.shared.u32 r7785, [r10712+104976]; +ld.shared.u32 r7791, [r10712+104980]; +ld.shared.u32 r8393, [r10712+110808]; +ld.shared.u32 r8399, [r10712+110812]; +ld.shared.u32 r9001, [r10712+116640]; +ld.shared.u32 r9007, [r10712+116644]; +ld.shared.u32 r7873, [r10712+122472]; +ld.shared.u32 r7879, [r10712+122476]; +ld.shared.u32 r8481, [r10712+128304]; +ld.shared.u32 r8487, [r10712+128308]; +ld.shared.u32 r9089, [r10712+134136]; +ld.shared.u32 r9095, [r10712+134140]; +ld.shared.u32 r7961, [r10712+139968]; +ld.shared.u32 r7967, [r10712+139972]; +ld.shared.u32 r8569, [r10712+145800]; +ld.shared.u32 r8575, [r10712+145804]; +ld.shared.u32 r9177, [r10712+151632]; +ld.shared.u32 r9183, [r10712+151636]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 r7786, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 r7792, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, 
r7781; +} +{ +add.f16x2 r7810, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ +mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 r7828, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 r7846, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 r7864, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 r7874, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 r7880, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 r7916, r7907, r7913; +} +{ +add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 r7934, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, 
r7946, r7869; +} +{ +add.f16x2 r7952, r7943, r7949; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 r7962, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 r7968, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 r7986, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ +add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 r8004, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 r8022, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 r8040, r8031, r8037; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8044, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8045, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8046, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; 
+cvt.rn.f16.f32 high, f944; +mov.b32 r8050, {low, high}; +} +{ +mul.f16x2 r8059, r7898, r8043; +} +{ +mul.f16x2 r8062, r7934, r8044; +} +{ +sub.f16x2 r8065, r8059, r8062; +} +{ +mul.f16x2 r8068, r7898, r8044; +} +{ +fma.rn.f16x2 r8071, r7934, r8043, r8068; +} +{ +mul.f16x2 r8075, r7986, r8045; +} +{ +mul.f16x2 r8078, r8022, r8046; +} +{ +sub.f16x2 r8081, r8075, r8078; +} +{ +mul.f16x2 r8084, r7986, r8046; +} +{ +fma.rn.f16x2 r8087, r8022, r8045, r8084; +} +{ +mul.f16x2 r8091, r7916, r8045; +} +{ +mul.f16x2 r8094, r7952, r8046; +} +{ +sub.f16x2 r8097, r8091, r8094; +} +{ +mul.f16x2 r8100, r7916, r8046; +} +{ +fma.rn.f16x2 r8103, r7952, r8045, r8100; +} +{ +mul.f16x2 r8107, r8004, r8049; +} +{ +mul.f16x2 r8110, r8040, r8050; +} +{ +sub.f16x2 r8113, r8107, r8110; +} +{ +mul.f16x2 r8116, r8004, r8050; +} +{ +fma.rn.f16x2 r8119, r8040, r8049, r8116; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8124, {low, high}; +} +{ +neg.f16x2 r8125, r8124; +} +{ +add.f16x2 r8127, r7874, r7962; +} +{ +add.f16x2 r8130, r7786, r8127; +} +{ +add.f16x2 r8133, r7880, r7968; +} +{ +add.f16x2 r8136, r7792, r8133; +} +{ +add.f16x2 r8139, r7874, r7962; +} +{ +mul.f16x2 r8142, r8139, r8123; +} +{ +add.f16x2 r8145, r7786, r8142; +} +{ +sub.f16x2 r8148, r7880, r7968; +} +{ +mul.f16x2 r8151, r8148, r8125; +} +{ +add.f16x2 r8154, r8145, r8151; +} +{ +add.f16x2 r8157, r7874, r7962; +} +{ +mul.f16x2 r8160, r8157, r8123; +} +{ +add.f16x2 r8163, r7786, r8160; +} +{ +sub.f16x2 r8166, r7880, r7968; +} +{ +mul.f16x2 r8169, r8166, r8125; +} +{ +sub.f16x2 r8172, r8163, r8169; +} +{ +add.f16x2 r8175, r7880, r7968; +} +{ +mul.f16x2 r8178, r8175, r8123; +} +{ +add.f16x2 r8181, r7792, r8178; +} +{ +sub.f16x2 r8184, r7874, r7962; +} +{ +mul.f16x2 r8187, r8184, r8125; +} +{ +sub.f16x2 r8190, r8181, r8187; +} +{ +add.f16x2 r8193, r7880, r7968; +} +{ 
+mul.f16x2 r8196, r8193, r8123; +} +{ +add.f16x2 r8199, r7792, r8196; +} +{ +sub.f16x2 r8202, r7874, r7962; +} +{ +mul.f16x2 r8205, r8202, r8125; +} +{ +add.f16x2 r8208, r8199, r8205; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8212, {low, high}; +} +{ +neg.f16x2 r8213, r8212; +} +{ +add.f16x2 r8215, r8065, r8081; +} +{ +add.f16x2 r8218, r7810, r8215; +} +{ +add.f16x2 r8221, r8071, r8087; +} +{ +add.f16x2 r8224, r7846, r8221; +} +{ +add.f16x2 r8227, r8065, r8081; +} +{ +mul.f16x2 r8230, r8227, r8211; +} +{ +add.f16x2 r8233, r7810, r8230; +} +{ +sub.f16x2 r8236, r8071, r8087; +} +{ +mul.f16x2 r8239, r8236, r8213; +} +{ +add.f16x2 r8242, r8233, r8239; +} +{ +add.f16x2 r8245, r8065, r8081; +} +{ +mul.f16x2 r8248, r8245, r8211; +} +{ +add.f16x2 r8251, r7810, r8248; +} +{ +sub.f16x2 r8254, r8071, r8087; +} +{ +mul.f16x2 r8257, r8254, r8213; +} +{ +sub.f16x2 r8260, r8251, r8257; +} +{ +add.f16x2 r8263, r8071, r8087; +} +{ +mul.f16x2 r8266, r8263, r8211; +} +{ +add.f16x2 r8269, r7846, r8266; +} +{ +sub.f16x2 r8272, r8065, r8081; +} +{ +mul.f16x2 r8275, r8272, r8213; +} +{ +sub.f16x2 r8278, r8269, r8275; +} +{ +add.f16x2 r8281, r8071, r8087; +} +{ +mul.f16x2 r8284, r8281, r8211; +} +{ +add.f16x2 r8287, r7846, r8284; +} +{ +sub.f16x2 r8290, r8065, r8081; +} +{ +mul.f16x2 r8293, r8290, r8213; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8299, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8300, {low, high}; +} +{ +neg.f16x2 r8301, r8300; +} +{ +add.f16x2 r8303, r8097, r8113; +} +{ +add.f16x2 r8306, r7828, r8303; +} +{ +add.f16x2 r8309, r8103, r8119; +} +{ +add.f16x2 r8312, r7864, r8309; +} +{ +add.f16x2 r8315, r8097, r8113; +} +{ +mul.f16x2 r8318, r8315, r8299; +} +{ 
+add.f16x2 r8321, r7828, r8318; +} +{ +sub.f16x2 r8324, r8103, r8119; +} +{ +mul.f16x2 r8327, r8324, r8301; +} +{ +add.f16x2 r8330, r8321, r8327; +} +{ +add.f16x2 r8333, r8097, r8113; +} +{ +mul.f16x2 r8336, r8333, r8299; +} +{ +add.f16x2 r8339, r7828, r8336; +} +{ +sub.f16x2 r8342, r8103, r8119; +} +{ +mul.f16x2 r8345, r8342, r8301; +} +{ +sub.f16x2 r8348, r8339, r8345; +} +{ +add.f16x2 r8351, r8103, r8119; +} +{ +mul.f16x2 r8354, r8351, r8299; +} +{ +add.f16x2 r8357, r7864, r8354; +} +{ +sub.f16x2 r8360, r8097, r8113; +} +{ +mul.f16x2 r8363, r8360, r8301; +} +{ +sub.f16x2 r8366, r8357, r8363; +} +{ +add.f16x2 r8369, r8103, r8119; +} +{ +mul.f16x2 r8372, r8369, r8299; +} +{ +add.f16x2 r8375, r7864, r8372; +} +{ +sub.f16x2 r8378, r8097, r8113; +} +{ +mul.f16x2 r8381, r8378, r8301; +} +{ +add.f16x2 r8384, r8375, r8381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8387, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8388, {low, high}; +} +{ +neg.f16x2 r8389, r8388; +} +{ +add.f16x2 r8391, r8392, r8393; +} +{ +add.f16x2 r8394, r8395, r8391; +} +{ +add.f16x2 r8397, r8398, r8399; +} +{ +add.f16x2 r8400, r8401, r8397; +} +{ +add.f16x2 r8403, r8392, r8393; +} +{ +mul.f16x2 r8406, r8403, r8387; +} +{ +add.f16x2 r8409, r8395, r8406; +} +{ +sub.f16x2 r8412, r8398, r8399; +} +{ +mul.f16x2 r8415, r8412, r8389; +} +{ +add.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8392, r8393; +} +{ +mul.f16x2 r8424, r8421, r8387; +} +{ +add.f16x2 r8427, r8395, r8424; +} +{ +sub.f16x2 r8430, r8398, r8399; +} +{ +mul.f16x2 r8433, r8430, r8389; +} +{ +sub.f16x2 r8436, r8427, r8433; +} +{ +add.f16x2 r8439, r8398, r8399; +} +{ +mul.f16x2 r8442, r8439, r8387; +} +{ +add.f16x2 r8445, r8401, r8442; +} +{ +sub.f16x2 r8448, r8392, r8393; +} +{ +mul.f16x2 r8451, r8448, r8389; +} +{ +sub.f16x2 r8454, r8445, r8451; +} +{ +add.f16x2 r8457, r8398, r8399; +} +{ +mul.f16x2 r8460, r8457, 
r8387; +} +{ +add.f16x2 r8463, r8401, r8460; +} +{ +sub.f16x2 r8466, r8392, r8393; +} +{ +mul.f16x2 r8469, r8466, r8389; +} +{ +add.f16x2 r8472, r8463, r8469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8476, {low, high}; +} +{ +neg.f16x2 r8477, r8476; +} +{ +add.f16x2 r8479, r8480, r8481; +} +{ +add.f16x2 r8482, r8483, r8479; +} +{ +add.f16x2 r8485, r8486, r8487; +} +{ +add.f16x2 r8488, r8489, r8485; +} +{ +add.f16x2 r8491, r8480, r8481; +} +{ +mul.f16x2 r8494, r8491, r8475; +} +{ +add.f16x2 r8497, r8483, r8494; +} +{ +sub.f16x2 r8500, r8486, r8487; +} +{ +mul.f16x2 r8503, r8500, r8477; +} +{ +add.f16x2 r8506, r8497, r8503; +} +{ +add.f16x2 r8509, r8480, r8481; +} +{ +mul.f16x2 r8512, r8509, r8475; +} +{ +add.f16x2 r8515, r8483, r8512; +} +{ +sub.f16x2 r8518, r8486, r8487; +} +{ +mul.f16x2 r8521, r8518, r8477; +} +{ +sub.f16x2 r8524, r8515, r8521; +} +{ +add.f16x2 r8527, r8486, r8487; +} +{ +mul.f16x2 r8530, r8527, r8475; +} +{ +add.f16x2 r8533, r8489, r8530; +} +{ +sub.f16x2 r8536, r8480, r8481; +} +{ +mul.f16x2 r8539, r8536, r8477; +} +{ +sub.f16x2 r8542, r8533, r8539; +} +{ +add.f16x2 r8545, r8486, r8487; +} +{ +mul.f16x2 r8548, r8545, r8475; +} +{ +add.f16x2 r8551, r8489, r8548; +} +{ +sub.f16x2 r8554, r8480, r8481; +} +{ +mul.f16x2 r8557, r8554, r8477; +} +{ +add.f16x2 r8560, r8551, r8557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8564, {low, high}; +} +{ +neg.f16x2 r8565, r8564; +} +{ +add.f16x2 r8567, r8568, r8569; +} +{ +add.f16x2 r8570, r8571, r8567; +} +{ +add.f16x2 r8573, r8574, r8575; +} +{ +add.f16x2 r8576, r8577, r8573; +} +{ +add.f16x2 r8579, r8568, r8569; +} +{ +mul.f16x2 r8582, r8579, r8563; +} +{ +add.f16x2 r8585, r8571, 
r8582; +} +{ +sub.f16x2 r8588, r8574, r8575; +} +{ +mul.f16x2 r8591, r8588, r8565; +} +{ +add.f16x2 r8594, r8585, r8591; +} +{ +add.f16x2 r8597, r8568, r8569; +} +{ +mul.f16x2 r8600, r8597, r8563; +} +{ +add.f16x2 r8603, r8571, r8600; +} +{ +sub.f16x2 r8606, r8574, r8575; +} +{ +mul.f16x2 r8609, r8606, r8565; +} +{ +sub.f16x2 r8612, r8603, r8609; +} +{ +add.f16x2 r8615, r8574, r8575; +} +{ +mul.f16x2 r8618, r8615, r8563; +} +{ +add.f16x2 r8621, r8577, r8618; +} +{ +sub.f16x2 r8624, r8568, r8569; +} +{ +mul.f16x2 r8627, r8624, r8565; +} +{ +sub.f16x2 r8630, r8621, r8627; +} +{ +add.f16x2 r8633, r8574, r8575; +} +{ +mul.f16x2 r8636, r8633, r8563; +} +{ +add.f16x2 r8639, r8577, r8636; +} +{ +sub.f16x2 r8642, r8568, r8569; +} +{ +mul.f16x2 r8645, r8642, r8565; +} +{ +add.f16x2 r8648, r8639, r8645; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8657, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r8658, {low, high}; +} +{ +mul.f16x2 r8667, r8506, r8651; +} +{ +mul.f16x2 r8670, r8542, r8652; +} +{ +sub.f16x2 r8673, r8667, r8670; +} +{ +mul.f16x2 r8676, r8506, r8652; +} +{ +fma.rn.f16x2 r8679, r8542, r8651, r8676; +} +{ +mul.f16x2 r8683, r8594, r8653; +} +{ +mul.f16x2 r8686, r8630, r8654; +} +{ +sub.f16x2 r8689, r8683, r8686; +} +{ +mul.f16x2 r8692, r8594, r8654; +} +{ +fma.rn.f16x2 r8695, r8630, r8653, r8692; +} +{ +mul.f16x2 r8699, r8524, r8653; +} +{ +mul.f16x2 r8702, r8560, r8654; +} +{ +sub.f16x2 r8705, r8699, r8702; +} +{ 
+mul.f16x2 r8708, r8524, r8654; +} +{ +fma.rn.f16x2 r8711, r8560, r8653, r8708; +} +{ +mul.f16x2 r8715, r8612, r8657; +} +{ +mul.f16x2 r8718, r8648, r8658; +} +{ +sub.f16x2 r8721, r8715, r8718; +} +{ +mul.f16x2 r8724, r8612, r8658; +} +{ +fma.rn.f16x2 r8727, r8648, r8657, r8724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8732, {low, high}; +} +{ +neg.f16x2 r8733, r8732; +} +{ +add.f16x2 r8735, r8482, r8570; +} +{ +add.f16x2 r8738, r8394, r8735; +} +{ +add.f16x2 r8741, r8488, r8576; +} +{ +add.f16x2 r8744, r8400, r8741; +} +{ +add.f16x2 r8747, r8482, r8570; +} +{ +mul.f16x2 r8750, r8747, r8731; +} +{ +add.f16x2 r8753, r8394, r8750; +} +{ +sub.f16x2 r8756, r8488, r8576; +} +{ +mul.f16x2 r8759, r8756, r8733; +} +{ +add.f16x2 r8762, r8753, r8759; +} +{ +add.f16x2 r8765, r8482, r8570; +} +{ +mul.f16x2 r8768, r8765, r8731; +} +{ +add.f16x2 r8771, r8394, r8768; +} +{ +sub.f16x2 r8774, r8488, r8576; +} +{ +mul.f16x2 r8777, r8774, r8733; +} +{ +sub.f16x2 r8780, r8771, r8777; +} +{ +add.f16x2 r8783, r8488, r8576; +} +{ +mul.f16x2 r8786, r8783, r8731; +} +{ +add.f16x2 r8789, r8400, r8786; +} +{ +sub.f16x2 r8792, r8482, r8570; +} +{ +mul.f16x2 r8795, r8792, r8733; +} +{ +sub.f16x2 r8798, r8789, r8795; +} +{ +add.f16x2 r8801, r8488, r8576; +} +{ +mul.f16x2 r8804, r8801, r8731; +} +{ +add.f16x2 r8807, r8400, r8804; +} +{ +sub.f16x2 r8810, r8482, r8570; +} +{ +mul.f16x2 r8813, r8810, r8733; +} +{ +add.f16x2 r8816, r8807, r8813; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8819, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8820, {low, high}; +} +{ +neg.f16x2 r8821, r8820; +} +{ +add.f16x2 r8823, r8673, r8689; +} +{ +add.f16x2 r8826, r8418, r8823; +} +{ +add.f16x2 r8829, r8679, r8695; +} +{ +add.f16x2 r8832, 
r8454, r8829; +} +{ +add.f16x2 r8835, r8673, r8689; +} +{ +mul.f16x2 r8838, r8835, r8819; +} +{ +add.f16x2 r8841, r8418, r8838; +} +{ +sub.f16x2 r8844, r8679, r8695; +} +{ +mul.f16x2 r8847, r8844, r8821; +} +{ +add.f16x2 r8850, r8841, r8847; +} +{ +add.f16x2 r8853, r8673, r8689; +} +{ +mul.f16x2 r8856, r8853, r8819; +} +{ +add.f16x2 r8859, r8418, r8856; +} +{ +sub.f16x2 r8862, r8679, r8695; +} +{ +mul.f16x2 r8865, r8862, r8821; +} +{ +sub.f16x2 r8868, r8859, r8865; +} +{ +add.f16x2 r8871, r8679, r8695; +} +{ +mul.f16x2 r8874, r8871, r8819; +} +{ +add.f16x2 r8877, r8454, r8874; +} +{ +sub.f16x2 r8880, r8673, r8689; +} +{ +mul.f16x2 r8883, r8880, r8821; +} +{ +sub.f16x2 r8886, r8877, r8883; +} +{ +add.f16x2 r8889, r8679, r8695; +} +{ +mul.f16x2 r8892, r8889, r8819; +} +{ +add.f16x2 r8895, r8454, r8892; +} +{ +sub.f16x2 r8898, r8673, r8689; +} +{ +mul.f16x2 r8901, r8898, r8821; +} +{ +add.f16x2 r8904, r8895, r8901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8908, {low, high}; +} +{ +neg.f16x2 r8909, r8908; +} +{ +add.f16x2 r8911, r8705, r8721; +} +{ +add.f16x2 r8914, r8436, r8911; +} +{ +add.f16x2 r8917, r8711, r8727; +} +{ +add.f16x2 r8920, r8472, r8917; +} +{ +add.f16x2 r8923, r8705, r8721; +} +{ +mul.f16x2 r8926, r8923, r8907; +} +{ +add.f16x2 r8929, r8436, r8926; +} +{ +sub.f16x2 r8932, r8711, r8727; +} +{ +mul.f16x2 r8935, r8932, r8909; +} +{ +add.f16x2 r8938, r8929, r8935; +} +{ +add.f16x2 r8941, r8705, r8721; +} +{ +mul.f16x2 r8944, r8941, r8907; +} +{ +add.f16x2 r8947, r8436, r8944; +} +{ +sub.f16x2 r8950, r8711, r8727; +} +{ +mul.f16x2 r8953, r8950, r8909; +} +{ +sub.f16x2 r8956, r8947, r8953; +} +{ +add.f16x2 r8959, r8711, r8727; +} +{ +mul.f16x2 r8962, r8959, r8907; +} +{ +add.f16x2 r8965, r8472, r8962; +} +{ +sub.f16x2 r8968, r8705, r8721; +} +{ +mul.f16x2 r8971, r8968, r8909; +} +{ +sub.f16x2 
r8974, r8965, r8971; +} +{ +add.f16x2 r8977, r8711, r8727; +} +{ +mul.f16x2 r8980, r8977, r8907; +} +{ +add.f16x2 r8983, r8472, r8980; +} +{ +sub.f16x2 r8986, r8705, r8721; +} +{ +mul.f16x2 r8989, r8986, r8909; +} +{ +add.f16x2 r8992, r8983, r8989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8996, {low, high}; +} +{ +neg.f16x2 r8997, r8996; +} +{ +add.f16x2 r8999, r9000, r9001; +} +{ +add.f16x2 r9002, r9003, r8999; +} +{ +add.f16x2 r9005, r9006, r9007; +} +{ +add.f16x2 r9008, r9009, r9005; +} +{ +add.f16x2 r9011, r9000, r9001; +} +{ +mul.f16x2 r9014, r9011, r8995; +} +{ +add.f16x2 r9017, r9003, r9014; +} +{ +sub.f16x2 r9020, r9006, r9007; +} +{ +mul.f16x2 r9023, r9020, r8997; +} +{ +add.f16x2 r9026, r9017, r9023; +} +{ +add.f16x2 r9029, r9000, r9001; +} +{ +mul.f16x2 r9032, r9029, r8995; +} +{ +add.f16x2 r9035, r9003, r9032; +} +{ +sub.f16x2 r9038, r9006, r9007; +} +{ +mul.f16x2 r9041, r9038, r8997; +} +{ +sub.f16x2 r9044, r9035, r9041; +} +{ +add.f16x2 r9047, r9006, r9007; +} +{ +mul.f16x2 r9050, r9047, r8995; +} +{ +add.f16x2 r9053, r9009, r9050; +} +{ +sub.f16x2 r9056, r9000, r9001; +} +{ +mul.f16x2 r9059, r9056, r8997; +} +{ +sub.f16x2 r9062, r9053, r9059; +} +{ +add.f16x2 r9065, r9006, r9007; +} +{ +mul.f16x2 r9068, r9065, r8995; +} +{ +add.f16x2 r9071, r9009, r9068; +} +{ +sub.f16x2 r9074, r9000, r9001; +} +{ +mul.f16x2 r9077, r9074, r8997; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9083, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9084, {low, high}; +} +{ +neg.f16x2 r9085, r9084; +} +{ +add.f16x2 r9087, r9088, r9089; +} +{ +add.f16x2 r9090, r9091, r9087; +} +{ +add.f16x2 r9093, r9094, r9095; +} +{ +add.f16x2 r9096, r9097, r9093; +} +{ +add.f16x2 
r9099, r9088, r9089; +} +{ +mul.f16x2 r9102, r9099, r9083; +} +{ +add.f16x2 r9105, r9091, r9102; +} +{ +sub.f16x2 r9108, r9094, r9095; +} +{ +mul.f16x2 r9111, r9108, r9085; +} +{ +add.f16x2 r9114, r9105, r9111; +} +{ +add.f16x2 r9117, r9088, r9089; +} +{ +mul.f16x2 r9120, r9117, r9083; +} +{ +add.f16x2 r9123, r9091, r9120; +} +{ +sub.f16x2 r9126, r9094, r9095; +} +{ +mul.f16x2 r9129, r9126, r9085; +} +{ +sub.f16x2 r9132, r9123, r9129; +} +{ +add.f16x2 r9135, r9094, r9095; +} +{ +mul.f16x2 r9138, r9135, r9083; +} +{ +add.f16x2 r9141, r9097, r9138; +} +{ +sub.f16x2 r9144, r9088, r9089; +} +{ +mul.f16x2 r9147, r9144, r9085; +} +{ +sub.f16x2 r9150, r9141, r9147; +} +{ +add.f16x2 r9153, r9094, r9095; +} +{ +mul.f16x2 r9156, r9153, r9083; +} +{ +add.f16x2 r9159, r9097, r9156; +} +{ +sub.f16x2 r9162, r9088, r9089; +} +{ +mul.f16x2 r9165, r9162, r9085; +} +{ +add.f16x2 r9168, r9159, r9165; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9171, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9172, {low, high}; +} +{ +neg.f16x2 r9173, r9172; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 r9184, r9185, r9181; +} +{ +add.f16x2 r9187, r9176, r9177; +} +{ +mul.f16x2 r9190, r9187, r9171; +} +{ +add.f16x2 r9193, r9179, r9190; +} +{ +sub.f16x2 r9196, r9182, r9183; +} +{ +mul.f16x2 r9199, r9196, r9173; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +add.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9171; +} +{ +add.f16x2 r9211, r9179, r9208; +} +{ +sub.f16x2 r9214, r9182, r9183; +} +{ +mul.f16x2 r9217, r9214, r9173; +} +{ +sub.f16x2 r9220, r9211, r9217; +} +{ +add.f16x2 r9223, r9182, r9183; +} +{ +mul.f16x2 r9226, r9223, r9171; +} +{ +add.f16x2 r9229, r9185, r9226; +} +{ +sub.f16x2 r9232, r9176, r9177; +} +{ +mul.f16x2 r9235, r9232, r9173; +} +{ +sub.f16x2 r9238, r9229, r9235; +} +{ 
+add.f16x2 r9241, r9182, r9183; +} +{ +mul.f16x2 r9244, r9241, r9171; +} +{ +add.f16x2 r9247, r9185, r9244; +} +{ +sub.f16x2 r9250, r9176, r9177; +} +{ +mul.f16x2 r9253, r9250, r9173; +} +{ +add.f16x2 r9256, r9247, r9253; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9266, {low, high}; +} +{ +mul.f16x2 r9275, r9114, r9259; +} +{ +mul.f16x2 r9278, r9150, r9260; +} +{ +sub.f16x2 r9281, r9275, r9278; +} +{ +mul.f16x2 r9284, r9114, r9260; +} +{ +fma.rn.f16x2 r9287, r9150, r9259, r9284; +} +{ +mul.f16x2 r9291, r9202, r9261; +} +{ +mul.f16x2 r9294, r9238, r9262; +} +{ +sub.f16x2 r9297, r9291, r9294; +} +{ +mul.f16x2 r9300, r9202, r9262; +} +{ +fma.rn.f16x2 r9303, r9238, r9261, r9300; +} +{ +mul.f16x2 r9307, r9132, r9261; +} +{ +mul.f16x2 r9310, r9168, r9262; +} +{ +sub.f16x2 r9313, r9307, r9310; +} +{ +mul.f16x2 r9316, r9132, r9262; +} +{ +fma.rn.f16x2 r9319, r9168, r9261, r9316; +} +{ +mul.f16x2 r9323, r9220, r9265; +} +{ +mul.f16x2 r9326, r9256, r9266; +} +{ +sub.f16x2 r9329, r9323, r9326; +} +{ +mul.f16x2 r9332, r9220, r9266; +} +{ +fma.rn.f16x2 r9335, r9256, r9265, r9332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9340, {low, high}; +} +{ +neg.f16x2 r9341, r9340; +} +{ +add.f16x2 r9343, r9090, r9178; +} +{ 
+add.f16x2 r9346, r9002, r9343; +} +{ +add.f16x2 r9349, r9096, r9184; +} +{ +add.f16x2 r9352, r9008, r9349; +} +{ +add.f16x2 r9355, r9090, r9178; +} +{ +mul.f16x2 r9358, r9355, r9339; +} +{ +add.f16x2 r9361, r9002, r9358; +} +{ +sub.f16x2 r9364, r9096, r9184; +} +{ +mul.f16x2 r9367, r9364, r9341; +} +{ +add.f16x2 r9370, r9361, r9367; +} +{ +add.f16x2 r9373, r9090, r9178; +} +{ +mul.f16x2 r9376, r9373, r9339; +} +{ +add.f16x2 r9379, r9002, r9376; +} +{ +sub.f16x2 r9382, r9096, r9184; +} +{ +mul.f16x2 r9385, r9382, r9341; +} +{ +sub.f16x2 r9388, r9379, r9385; +} +{ +add.f16x2 r9391, r9096, r9184; +} +{ +mul.f16x2 r9394, r9391, r9339; +} +{ +add.f16x2 r9397, r9008, r9394; +} +{ +sub.f16x2 r9400, r9090, r9178; +} +{ +mul.f16x2 r9403, r9400, r9341; +} +{ +sub.f16x2 r9406, r9397, r9403; +} +{ +add.f16x2 r9409, r9096, r9184; +} +{ +mul.f16x2 r9412, r9409, r9339; +} +{ +add.f16x2 r9415, r9008, r9412; +} +{ +sub.f16x2 r9418, r9090, r9178; +} +{ +mul.f16x2 r9421, r9418, r9341; +} +{ +add.f16x2 r9424, r9415, r9421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9427, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9428, {low, high}; +} +{ +neg.f16x2 r9429, r9428; +} +{ +add.f16x2 r9431, r9281, r9297; +} +{ +add.f16x2 r9434, r9026, r9431; +} +{ +add.f16x2 r9437, r9287, r9303; +} +{ +add.f16x2 r9440, r9062, r9437; +} +{ +add.f16x2 r9443, r9281, r9297; +} +{ +mul.f16x2 r9446, r9443, r9427; +} +{ +add.f16x2 r9449, r9026, r9446; +} +{ +sub.f16x2 r9452, r9287, r9303; +} +{ +mul.f16x2 r9455, r9452, r9429; +} +{ +add.f16x2 r9458, r9449, r9455; +} +{ +add.f16x2 r9461, r9281, r9297; +} +{ +mul.f16x2 r9464, r9461, r9427; +} +{ +add.f16x2 r9467, r9026, r9464; +} +{ +sub.f16x2 r9470, r9287, r9303; +} +{ +mul.f16x2 r9473, r9470, r9429; +} +{ +sub.f16x2 r9476, r9467, r9473; +} +{ +add.f16x2 r9479, r9287, r9303; +} +{ +mul.f16x2 r9482, r9479, r9427; +} +{ +add.f16x2 r9485, r9062, 
r9482; +} +{ +sub.f16x2 r9488, r9281, r9297; +} +{ +mul.f16x2 r9491, r9488, r9429; +} +{ +sub.f16x2 r9494, r9485, r9491; +} +{ +add.f16x2 r9497, r9287, r9303; +} +{ +mul.f16x2 r9500, r9497, r9427; +} +{ +add.f16x2 r9503, r9062, r9500; +} +{ +sub.f16x2 r9506, r9281, r9297; +} +{ +mul.f16x2 r9509, r9506, r9429; +} +{ +add.f16x2 r9512, r9503, r9509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9516, {low, high}; +} +{ +neg.f16x2 r9517, r9516; +} +{ +add.f16x2 r9519, r9313, r9329; +} +{ +add.f16x2 r9522, r9044, r9519; +} +{ +add.f16x2 r9525, r9319, r9335; +} +{ +add.f16x2 r9528, r9080, r9525; +} +{ +add.f16x2 r9531, r9313, r9329; +} +{ +mul.f16x2 r9534, r9531, r9515; +} +{ +add.f16x2 r9537, r9044, r9534; +} +{ +sub.f16x2 r9540, r9319, r9335; +} +{ +mul.f16x2 r9543, r9540, r9517; +} +{ +add.f16x2 r9546, r9537, r9543; +} +{ +add.f16x2 r9549, r9313, r9329; +} +{ +mul.f16x2 r9552, r9549, r9515; +} +{ +add.f16x2 r9555, r9044, r9552; +} +{ +sub.f16x2 r9558, r9319, r9335; +} +{ +mul.f16x2 r9561, r9558, r9517; +} +{ +sub.f16x2 r9564, r9555, r9561; +} +{ +add.f16x2 r9567, r9319, r9335; +} +{ +mul.f16x2 r9570, r9567, r9515; +} +{ +add.f16x2 r9573, r9080, r9570; +} +{ +sub.f16x2 r9576, r9313, r9329; +} +{ +mul.f16x2 r9579, r9576, r9517; +} +{ +sub.f16x2 r9582, r9573, r9579; +} +{ +add.f16x2 r9585, r9319, r9335; +} +{ +mul.f16x2 r9588, r9585, r9515; +} +{ +add.f16x2 r9591, r9080, r9588; +} +{ +sub.f16x2 r9594, r9313, r9329; +} +{ +mul.f16x2 r9597, r9594, r9517; +} +{ +add.f16x2 r9600, r9591, r9597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r9603, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r9604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r9605, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r9606, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9607, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9608, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r9609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r9610, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r9611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r9612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9614, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r9615, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r9616, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r9617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r9618, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r9621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r9622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; 
+mov.b32 r9629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r9630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r9633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r9634, {low, high}; +} +{ +mul.f16x2 r9655, r8826, r9603; +} +{ +mul.f16x2 r9658, r8832, r9604; +} +{ +sub.f16x2 r9661, r9655, r9658; +} +{ +mul.f16x2 r9664, r8826, r9604; +} +{ +fma.rn.f16x2 r9667, r8832, r9603, r9664; +} +{ +mul.f16x2 r9671, r9434, r9605; +} +{ +mul.f16x2 r9674, r9440, r9606; +} +{ +sub.f16x2 r9677, r9671, r9674; +} +{ +mul.f16x2 r9680, r9434, r9606; +} +{ +fma.rn.f16x2 r9683, r9440, r9605, r9680; +} +{ +mul.f16x2 r9687, r8914, r9605; +} +{ +mul.f16x2 r9690, r8920, r9606; +} +{ +sub.f16x2 r9693, r9687, r9690; +} +{ +mul.f16x2 r9696, r8914, r9606; +} +{ +fma.rn.f16x2 r9699, r8920, r9605, r9696; +} +{ +mul.f16x2 r9703, r9522, r9609; +} +{ +mul.f16x2 r9706, r9528, r9610; +} +{ +sub.f16x2 r9709, r9703, r9706; +} +{ +mul.f16x2 r9712, r9522, r9610; +} +{ +fma.rn.f16x2 r9715, r9528, r9609, r9712; +} +{ +mul.f16x2 r9719, r8762, r9607; +} +{ +mul.f16x2 r9722, r8798, r9608; +} +{ +sub.f16x2 r9725, r9719, r9722; +} +{ +mul.f16x2 r9728, r8762, r9608; +} +{ +fma.rn.f16x2 r9731, r8798, r9607, r9728; +} +{ +mul.f16x2 r9735, r9370, r9613; +} +{ +mul.f16x2 r9738, r9406, r9614; +} +{ +sub.f16x2 r9741, r9735, r9738; +} +{ +mul.f16x2 r9744, r9370, r9614; +} +{ +fma.rn.f16x2 r9747, r9406, r9613, r9744; +} +{ +mul.f16x2 r9751, r8850, r9609; +} +{ +mul.f16x2 r9754, r8886, r9610; +} +{ +sub.f16x2 r9757, r9751, r9754; +} +{ +mul.f16x2 r9760, r8850, r9610; +} +{ +fma.rn.f16x2 r9763, r8886, r9609, r9760; +} +{ +mul.f16x2 r9767, r9458, r9617; +} +{ +mul.f16x2 r9770, r9494, r9618; +} +{ +sub.f16x2 r9773, r9767, r9770; +} +{ +mul.f16x2 r9776, r9458, r9618; +} +{ +fma.rn.f16x2 r9779, r9494, r9617, r9776; +} +{ +mul.f16x2 r9783, r8938, 
r9611; +} +{ +mul.f16x2 r9786, r8974, r9612; +} +{ +sub.f16x2 r9789, r9783, r9786; +} +{ +mul.f16x2 r9792, r8938, r9612; +} +{ +fma.rn.f16x2 r9795, r8974, r9611, r9792; +} +{ +mul.f16x2 r9799, r9546, r9621; +} +{ +mul.f16x2 r9802, r9582, r9622; +} +{ +sub.f16x2 r9805, r9799, r9802; +} +{ +mul.f16x2 r9808, r9546, r9622; +} +{ +fma.rn.f16x2 r9811, r9582, r9621, r9808; +} +{ +mul.f16x2 r9815, r8780, r9613; +} +{ +mul.f16x2 r9818, r8816, r9614; +} +{ +sub.f16x2 r9821, r9815, r9818; +} +{ +mul.f16x2 r9824, r8780, r9614; +} +{ +fma.rn.f16x2 r9827, r8816, r9613, r9824; +} +{ +mul.f16x2 r9831, r9388, r9625; +} +{ +mul.f16x2 r9834, r9424, r9626; +} +{ +sub.f16x2 r9837, r9831, r9834; +} +{ +mul.f16x2 r9840, r9388, r9626; +} +{ +fma.rn.f16x2 r9843, r9424, r9625, r9840; +} +{ +mul.f16x2 r9847, r8868, r9615; +} +{ +mul.f16x2 r9850, r8904, r9616; +} +{ +sub.f16x2 r9853, r9847, r9850; +} +{ +mul.f16x2 r9856, r8868, r9616; +} +{ +fma.rn.f16x2 r9859, r8904, r9615, r9856; +} +{ +mul.f16x2 r9863, r9476, r9629; +} +{ +mul.f16x2 r9866, r9512, r9630; +} +{ +sub.f16x2 r9869, r9863, r9866; +} +{ +mul.f16x2 r9872, r9476, r9630; +} +{ +fma.rn.f16x2 r9875, r9512, r9629, r9872; +} +{ +mul.f16x2 r9879, r8956, r9617; +} +{ +mul.f16x2 r9882, r8992, r9618; +} +{ +sub.f16x2 r9885, r9879, r9882; +} +{ +mul.f16x2 r9888, r8956, r9618; +} +{ +fma.rn.f16x2 r9891, r8992, r9617, r9888; +} +{ +mul.f16x2 r9895, r9564, r9633; +} +{ +mul.f16x2 r9898, r9600, r9634; +} +{ +sub.f16x2 r9901, r9895, r9898; +} +{ +mul.f16x2 r9904, r9564, r9634; +} +{ +fma.rn.f16x2 r9907, r9600, r9633, r9904; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9911, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9912, {low, high}; +} +{ +neg.f16x2 r9913, r9912; +} +{ +add.f16x2 r9915, r8738, r9346; +} +{ +add.f16x2 %0, r8130, r9915; +} +{ +add.f16x2 r9921, r8744, r9352; +} +{ +add.f16x2 %1, r8136, r9921; +} +{ +add.f16x2 r9927, 
r8738, r9346; +} +{ +mul.f16x2 r9930, r9927, r9911; +} +{ +add.f16x2 r9933, r8130, r9930; +} +{ +sub.f16x2 r9936, r8744, r9352; +} +{ +mul.f16x2 r9939, r9936, r9913; +} +{ +add.f16x2 %18, r9933, r9939; +} +{ +add.f16x2 r9945, r8738, r9346; +} +{ +mul.f16x2 r9948, r9945, r9911; +} +{ +add.f16x2 r9951, r8130, r9948; +} +{ +sub.f16x2 r9954, r8744, r9352; +} +{ +mul.f16x2 r9957, r9954, r9913; +} +{ +sub.f16x2 %36, r9951, r9957; +} +{ +add.f16x2 r9963, r8744, r9352; +} +{ +mul.f16x2 r9966, r9963, r9911; +} +{ +add.f16x2 r9969, r8136, r9966; +} +{ +sub.f16x2 r9972, r8738, r9346; +} +{ +mul.f16x2 r9975, r9972, r9913; +} +{ +sub.f16x2 %19, r9969, r9975; +} +{ +add.f16x2 r9981, r8744, r9352; +} +{ +mul.f16x2 r9984, r9981, r9911; +} +{ +add.f16x2 r9987, r8136, r9984; +} +{ +sub.f16x2 r9990, r8738, r9346; +} +{ +mul.f16x2 r9993, r9990, r9913; +} +{ +add.f16x2 %37, r9987, r9993; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10000, {low, high}; +} +{ +neg.f16x2 r10001, r10000; +} +{ +add.f16x2 r10003, r9661, r9677; +} +{ +add.f16x2 %2, r8218, r10003; +} +{ +add.f16x2 r10009, r9667, r9683; +} +{ +add.f16x2 %3, r8224, r10009; +} +{ +add.f16x2 r10015, r9661, r9677; +} +{ +mul.f16x2 r10018, r10015, r9999; +} +{ +add.f16x2 r10021, r8218, r10018; +} +{ +sub.f16x2 r10024, r9667, r9683; +} +{ +mul.f16x2 r10027, r10024, r10001; +} +{ +add.f16x2 %20, r10021, r10027; +} +{ +add.f16x2 r10033, r9661, r9677; +} +{ +mul.f16x2 r10036, r10033, r9999; +} +{ +add.f16x2 r10039, r8218, r10036; +} +{ +sub.f16x2 r10042, r9667, r9683; +} +{ +mul.f16x2 r10045, r10042, r10001; +} +{ +sub.f16x2 %38, r10039, r10045; +} +{ +add.f16x2 r10051, r9667, r9683; +} +{ +mul.f16x2 r10054, r10051, r9999; +} +{ +add.f16x2 r10057, r8224, r10054; +} +{ +sub.f16x2 r10060, r9661, r9677; +} +{ +mul.f16x2 r10063, r10060, r10001; +} +{ +sub.f16x2 %21, r10057, 
r10063; +} +{ +add.f16x2 r10069, r9667, r9683; +} +{ +mul.f16x2 r10072, r10069, r9999; +} +{ +add.f16x2 r10075, r8224, r10072; +} +{ +sub.f16x2 r10078, r9661, r9677; +} +{ +mul.f16x2 r10081, r10078, r10001; +} +{ +add.f16x2 %39, r10075, r10081; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10087, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10088, {low, high}; +} +{ +neg.f16x2 r10089, r10088; +} +{ +add.f16x2 r10091, r9693, r9709; +} +{ +add.f16x2 %4, r8306, r10091; +} +{ +add.f16x2 r10097, r9699, r9715; +} +{ +add.f16x2 %5, r8312, r10097; +} +{ +add.f16x2 r10103, r9693, r9709; +} +{ +mul.f16x2 r10106, r10103, r10087; +} +{ +add.f16x2 r10109, r8306, r10106; +} +{ +sub.f16x2 r10112, r9699, r9715; +} +{ +mul.f16x2 r10115, r10112, r10089; +} +{ +add.f16x2 %22, r10109, r10115; +} +{ +add.f16x2 r10121, r9693, r9709; +} +{ +mul.f16x2 r10124, r10121, r10087; +} +{ +add.f16x2 r10127, r8306, r10124; +} +{ +sub.f16x2 r10130, r9699, r9715; +} +{ +mul.f16x2 r10133, r10130, r10089; +} +{ +sub.f16x2 %40, r10127, r10133; +} +{ +add.f16x2 r10139, r9699, r9715; +} +{ +mul.f16x2 r10142, r10139, r10087; +} +{ +add.f16x2 r10145, r8312, r10142; +} +{ +sub.f16x2 r10148, r9693, r9709; +} +{ +mul.f16x2 r10151, r10148, r10089; +} +{ +sub.f16x2 %23, r10145, r10151; +} +{ +add.f16x2 r10157, r9699, r9715; +} +{ +mul.f16x2 r10160, r10157, r10087; +} +{ +add.f16x2 r10163, r8312, r10160; +} +{ +sub.f16x2 r10166, r9693, r9709; +} +{ +mul.f16x2 r10169, r10166, r10089; +} +{ +add.f16x2 %41, r10163, r10169; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10176, {low, high}; +} +{ +neg.f16x2 r10177, r10176; +} +{ +add.f16x2 r10179, r9725, r9741; +} +{ +add.f16x2 %6, r8154, r10179; +} +{ +add.f16x2 r10185, r9731, r9747; +} +{ 
+add.f16x2 %7, r8190, r10185; +} +{ +add.f16x2 r10191, r9725, r9741; +} +{ +mul.f16x2 r10194, r10191, r10175; +} +{ +add.f16x2 r10197, r8154, r10194; +} +{ +sub.f16x2 r10200, r9731, r9747; +} +{ +mul.f16x2 r10203, r10200, r10177; +} +{ +add.f16x2 %24, r10197, r10203; +} +{ +add.f16x2 r10209, r9725, r9741; +} +{ +mul.f16x2 r10212, r10209, r10175; +} +{ +add.f16x2 r10215, r8154, r10212; +} +{ +sub.f16x2 r10218, r9731, r9747; +} +{ +mul.f16x2 r10221, r10218, r10177; +} +{ +sub.f16x2 %42, r10215, r10221; +} +{ +add.f16x2 r10227, r9731, r9747; +} +{ +mul.f16x2 r10230, r10227, r10175; +} +{ +add.f16x2 r10233, r8190, r10230; +} +{ +sub.f16x2 r10236, r9725, r9741; +} +{ +mul.f16x2 r10239, r10236, r10177; +} +{ +sub.f16x2 %25, r10233, r10239; +} +{ +add.f16x2 r10245, r9731, r9747; +} +{ +mul.f16x2 r10248, r10245, r10175; +} +{ +add.f16x2 r10251, r8190, r10248; +} +{ +sub.f16x2 r10254, r9725, r9741; +} +{ +mul.f16x2 r10257, r10254, r10177; +} +{ +add.f16x2 %43, r10251, r10257; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10264, {low, high}; +} +{ +neg.f16x2 r10265, r10264; +} +{ +add.f16x2 r10267, r9757, r9773; +} +{ +add.f16x2 %8, r8242, r10267; +} +{ +add.f16x2 r10273, r9763, r9779; +} +{ +add.f16x2 %9, r8278, r10273; +} +{ +add.f16x2 r10279, r9757, r9773; +} +{ +mul.f16x2 r10282, r10279, r10263; +} +{ +add.f16x2 r10285, r8242, r10282; +} +{ +sub.f16x2 r10288, r9763, r9779; +} +{ +mul.f16x2 r10291, r10288, r10265; +} +{ +add.f16x2 %26, r10285, r10291; +} +{ +add.f16x2 r10297, r9757, r9773; +} +{ +mul.f16x2 r10300, r10297, r10263; +} +{ +add.f16x2 r10303, r8242, r10300; +} +{ +sub.f16x2 r10306, r9763, r9779; +} +{ +mul.f16x2 r10309, r10306, r10265; +} +{ +sub.f16x2 %44, r10303, r10309; +} +{ +add.f16x2 r10315, r9763, r9779; +} +{ +mul.f16x2 r10318, r10315, r10263; +} +{ +add.f16x2 r10321, r8278, r10318; +} +{ 
+sub.f16x2 r10324, r9757, r9773; +} +{ +mul.f16x2 r10327, r10324, r10265; +} +{ +sub.f16x2 %27, r10321, r10327; +} +{ +add.f16x2 r10333, r9763, r9779; +} +{ +mul.f16x2 r10336, r10333, r10263; +} +{ +add.f16x2 r10339, r8278, r10336; +} +{ +sub.f16x2 r10342, r9757, r9773; +} +{ +mul.f16x2 r10345, r10342, r10265; +} +{ +add.f16x2 %45, r10339, r10345; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10351, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10352, {low, high}; +} +{ +neg.f16x2 r10353, r10352; +} +{ +add.f16x2 r10355, r9789, r9805; +} +{ +add.f16x2 %10, r8330, r10355; +} +{ +add.f16x2 r10361, r9795, r9811; +} +{ +add.f16x2 %11, r8366, r10361; +} +{ +add.f16x2 r10367, r9789, r9805; +} +{ +mul.f16x2 r10370, r10367, r10351; +} +{ +add.f16x2 r10373, r8330, r10370; +} +{ +sub.f16x2 r10376, r9795, r9811; +} +{ +mul.f16x2 r10379, r10376, r10353; +} +{ +add.f16x2 %28, r10373, r10379; +} +{ +add.f16x2 r10385, r9789, r9805; +} +{ +mul.f16x2 r10388, r10385, r10351; +} +{ +add.f16x2 r10391, r8330, r10388; +} +{ +sub.f16x2 r10394, r9795, r9811; +} +{ +mul.f16x2 r10397, r10394, r10353; +} +{ +sub.f16x2 %46, r10391, r10397; +} +{ +add.f16x2 r10403, r9795, r9811; +} +{ +mul.f16x2 r10406, r10403, r10351; +} +{ +add.f16x2 r10409, r8366, r10406; +} +{ +sub.f16x2 r10412, r9789, r9805; +} +{ +mul.f16x2 r10415, r10412, r10353; +} +{ +sub.f16x2 %29, r10409, r10415; +} +{ +add.f16x2 r10421, r9795, r9811; +} +{ +mul.f16x2 r10424, r10421, r10351; +} +{ +add.f16x2 r10427, r8366, r10424; +} +{ +sub.f16x2 r10430, r9789, r9805; +} +{ +mul.f16x2 r10433, r10430, r10353; +} +{ +add.f16x2 %47, r10427, r10433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10440, {low, high}; +} +{ +neg.f16x2 r10441, r10440; +} +{ +add.f16x2 
r10443, r9821, r9837; +} +{ +add.f16x2 %12, r8172, r10443; +} +{ +add.f16x2 r10449, r9827, r9843; +} +{ +add.f16x2 %13, r8208, r10449; +} +{ +add.f16x2 r10455, r9821, r9837; +} +{ +mul.f16x2 r10458, r10455, r10439; +} +{ +add.f16x2 r10461, r8172, r10458; +} +{ +sub.f16x2 r10464, r9827, r9843; +} +{ +mul.f16x2 r10467, r10464, r10441; +} +{ +add.f16x2 %30, r10461, r10467; +} +{ +add.f16x2 r10473, r9821, r9837; +} +{ +mul.f16x2 r10476, r10473, r10439; +} +{ +add.f16x2 r10479, r8172, r10476; +} +{ +sub.f16x2 r10482, r9827, r9843; +} +{ +mul.f16x2 r10485, r10482, r10441; +} +{ +sub.f16x2 %48, r10479, r10485; +} +{ +add.f16x2 r10491, r9827, r9843; +} +{ +mul.f16x2 r10494, r10491, r10439; +} +{ +add.f16x2 r10497, r8208, r10494; +} +{ +sub.f16x2 r10500, r9821, r9837; +} +{ +mul.f16x2 r10503, r10500, r10441; +} +{ +sub.f16x2 %31, r10497, r10503; +} +{ +add.f16x2 r10509, r9827, r9843; +} +{ +mul.f16x2 r10512, r10509, r10439; +} +{ +add.f16x2 r10515, r8208, r10512; +} +{ +sub.f16x2 r10518, r9821, r9837; +} +{ +mul.f16x2 r10521, r10518, r10441; +} +{ +add.f16x2 %49, r10515, r10521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10528, {low, high}; +} +{ +neg.f16x2 r10529, r10528; +} +{ +add.f16x2 r10531, r9853, r9869; +} +{ +add.f16x2 %14, r8260, r10531; +} +{ +add.f16x2 r10537, r9859, r9875; +} +{ +add.f16x2 %15, r8296, r10537; +} +{ +add.f16x2 r10543, r9853, r9869; +} +{ +mul.f16x2 r10546, r10543, r10527; +} +{ +add.f16x2 r10549, r8260, r10546; +} +{ +sub.f16x2 r10552, r9859, r9875; +} +{ +mul.f16x2 r10555, r10552, r10529; +} +{ +add.f16x2 %32, r10549, r10555; +} +{ +add.f16x2 r10561, r9853, r9869; +} +{ +mul.f16x2 r10564, r10561, r10527; +} +{ +add.f16x2 r10567, r8260, r10564; +} +{ +sub.f16x2 r10570, r9859, r9875; +} +{ +mul.f16x2 r10573, r10570, r10529; +} +{ +sub.f16x2 %50, r10567, r10573; +} +{ +add.f16x2 
r10579, r9859, r9875; +} +{ +mul.f16x2 r10582, r10579, r10527; +} +{ +add.f16x2 r10585, r8296, r10582; +} +{ +sub.f16x2 r10588, r9853, r9869; +} +{ +mul.f16x2 r10591, r10588, r10529; +} +{ +sub.f16x2 %33, r10585, r10591; +} +{ +add.f16x2 r10597, r9859, r9875; +} +{ +mul.f16x2 r10600, r10597, r10527; +} +{ +add.f16x2 r10603, r8296, r10600; +} +{ +sub.f16x2 r10606, r9853, r9869; +} +{ +mul.f16x2 r10609, r10606, r10529; +} +{ +add.f16x2 %51, r10603, r10609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10615, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10616, {low, high}; +} +{ +neg.f16x2 r10617, r10616; +} +{ +add.f16x2 r10619, r9885, r9901; +} +{ +add.f16x2 %16, r8348, r10619; +} +{ +add.f16x2 r10625, r9891, r9907; +} +{ +add.f16x2 %17, r8384, r10625; +} +{ +add.f16x2 r10631, r9885, r9901; +} +{ +mul.f16x2 r10634, r10631, r10615; +} +{ +add.f16x2 r10637, r8348, r10634; +} +{ +sub.f16x2 r10640, r9891, r9907; +} +{ +mul.f16x2 r10643, r10640, r10617; +} +{ +add.f16x2 %34, r10637, r10643; +} +{ +add.f16x2 r10649, r9885, r9901; +} +{ +mul.f16x2 r10652, r10649, r10615; +} +{ +add.f16x2 r10655, r8348, r10652; +} +{ +sub.f16x2 r10658, r9891, r9907; +} +{ +mul.f16x2 r10661, r10658, r10617; +} +{ +sub.f16x2 %52, r10655, r10661; +} +{ +add.f16x2 r10667, r9891, r9907; +} +{ +mul.f16x2 r10670, r10667, r10615; +} +{ +add.f16x2 r10673, r8384, r10670; +} +{ +sub.f16x2 r10676, r9885, r9901; +} +{ +mul.f16x2 r10679, r10676, r10617; +} +{ +sub.f16x2 %35, r10673, r10679; +} +{ +add.f16x2 r10685, r9891, r9907; +} +{ +mul.f16x2 r10688, r10685, r10615; +} +{ +add.f16x2 r10691, r8384, r10688; +} +{ +sub.f16x2 r10694, r9885, r9901; +} +{ +mul.f16x2 r10697, r10694, r10617; +} +{ +add.f16x2 %53, r10691, r10697; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), 
"r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1178, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1043>; +.reg .b32 r<10778>; +.reg .b64 rd<6>; +mov.u32 r10704, %54; +mov.u32 r10777, %tid.y; +mad.lo.s32 r10705, r10777, 78732, r10704; +mov.u32 r10706, %tid.x; +mov.f32 f1034, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1, {low, high}; +} +mov.f32 f1036, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 
r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; 
+} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f906, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r265, {low, high}; +} +mov.f32 f908, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r266, {low, high}; +} +mov.f32 f918, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r267, {low, high}; +} +mov.f32 f920, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r268, {low, high}; +} +mov.f32 f942, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r271, {low, high}; +} +mov.f32 f944, 
0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, 
r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 
r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 
r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 
r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ 
+add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, 
r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} 
+{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %82, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %82, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %82, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %82, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %82, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, 
r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ 
+mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, 
r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, 
r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f898, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r1825, {low, high}; +} +mov.f32 f900, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r1826, {low, high}; +} +mov.f32 f902, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r1827, {low, high}; +} +mov.f32 f904, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1830, {low, high}; +} +mov.f32 f910, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r1831, {low, high}; +} +mov.f32 f912, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r1832, {low, high}; +} +mov.f32 f914, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r1833, {low, high}; +} +mov.f32 f916, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1836, {low, high}; +} +mov.f32 f922, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r1837, {low, high}; +} +mov.f32 f924, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r1838, {low, high}; +} +mov.f32 f926, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r1839, {low, high}; +} +mov.f32 f928, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r1840, {low, high}; +} +mov.f32 f934, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r1843, {low, high}; +} +mov.f32 f936, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r1844, {low, high}; +} +mov.f32 f958, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1848, {low, high}; +} +mov.f32 f950, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r1851, {low, high}; +} +mov.f32 f952, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r1855, {low, high}; +} +mov.f32 f960, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 
r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 
r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, 
r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} 
+{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ 
+add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ 
+add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ 
+sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ 
+add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r10706, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r10707, rd3; +mul.lo.s32 r10708, r10707, 729; +sub.s32 r10709, r10706, r10708; +mad.lo.s32 r10710, r10707, 78732, r10705; +cvt.rn.f32.u32 f1037, r10709; +mul.f32 f1038, f1037, 0f39A75CD5; +cos.approx.f32 f309, f1038; +sin.approx.f32 f1039, f1038; +neg.f32 f310, f1039; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 
0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, 
high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, 
r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ 
+fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, 
r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; 
+} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r10711, r10709, 108, r10710; +st.shared.u32 [r10711], r2140; +st.shared.u32 [r10711+4], r2937; +st.shared.u32 [r10711+8], r2974; +st.shared.u32 [r10711+12], r3011; +st.shared.u32 [r10711+16], r3048; +st.shared.u32 [r10711+20], r3085; +st.shared.u32 [r10711+24], r3122; +st.shared.u32 [r10711+28], r3159; +st.shared.u32 [r10711+32], r3196; +st.shared.u32 
[r10711+36], r3233; +st.shared.u32 [r10711+40], r3270; +st.shared.u32 [r10711+44], r3307; +st.shared.u32 [r10711+48], r3344; +st.shared.u32 [r10711+52], r3381; +st.shared.u32 [r10711+56], r3418; +st.shared.u32 [r10711+60], r3455; +st.shared.u32 [r10711+64], r3492; +st.shared.u32 [r10711+68], r3529; +st.shared.u32 [r10711+72], r3566; +st.shared.u32 [r10711+76], r3603; +st.shared.u32 [r10711+80], r3640; +st.shared.u32 [r10711+84], r3677; +st.shared.u32 [r10711+88], r3714; +st.shared.u32 [r10711+92], r3751; +st.shared.u32 [r10711+96], r3788; +st.shared.u32 [r10711+100], r3825; +st.shared.u32 [r10711+104], r3862; +barrier.sync 0; +mad.lo.s32 r10712, r10709, -104, r10711; +ld.shared.u32 r3898, [r10712]; +ld.shared.u32 r4506, [r10712+2916]; +ld.shared.u32 r5114, [r10712+5832]; +ld.shared.u32 r3986, [r10712+8748]; +ld.shared.u32 r4594, [r10712+11664]; +ld.shared.u32 r5202, [r10712+14580]; +ld.shared.u32 r4074, [r10712+17496]; +ld.shared.u32 r4682, [r10712+20412]; +ld.shared.u32 r5290, [r10712+23328]; +ld.shared.u32 r3895, [r10712+26244]; +ld.shared.u32 r4503, [r10712+29160]; +ld.shared.u32 r5111, [r10712+32076]; +ld.shared.u32 r3983, [r10712+34992]; +ld.shared.u32 r4591, [r10712+37908]; +ld.shared.u32 r5199, [r10712+40824]; +ld.shared.u32 r4071, [r10712+43740]; +ld.shared.u32 r4679, [r10712+46656]; +ld.shared.u32 r5287, [r10712+49572]; +ld.shared.u32 r3896, [r10712+52488]; +ld.shared.u32 r4504, [r10712+55404]; +ld.shared.u32 r5112, [r10712+58320]; +ld.shared.u32 r3984, [r10712+61236]; +ld.shared.u32 r4592, [r10712+64152]; +ld.shared.u32 r5200, [r10712+67068]; +ld.shared.u32 r4072, [r10712+69984]; +ld.shared.u32 r4680, [r10712+72900]; +ld.shared.u32 r5288, [r10712+75816]; +barrier.sync 0; +st.shared.u32 [r10711], r2146; +st.shared.u32 [r10711+4], r2944; +st.shared.u32 [r10711+8], r2981; +st.shared.u32 [r10711+12], r3018; +st.shared.u32 [r10711+16], r3055; +st.shared.u32 [r10711+20], r3092; +st.shared.u32 [r10711+24], r3129; +st.shared.u32 [r10711+28], r3166; +st.shared.u32 
[r10711+32], r3203; +st.shared.u32 [r10711+36], r3240; +st.shared.u32 [r10711+40], r3277; +st.shared.u32 [r10711+44], r3314; +st.shared.u32 [r10711+48], r3351; +st.shared.u32 [r10711+52], r3388; +st.shared.u32 [r10711+56], r3425; +st.shared.u32 [r10711+60], r3462; +st.shared.u32 [r10711+64], r3499; +st.shared.u32 [r10711+68], r3536; +st.shared.u32 [r10711+72], r3573; +st.shared.u32 [r10711+76], r3610; +st.shared.u32 [r10711+80], r3647; +st.shared.u32 [r10711+84], r3684; +st.shared.u32 [r10711+88], r3721; +st.shared.u32 [r10711+92], r3758; +st.shared.u32 [r10711+96], r3795; +st.shared.u32 [r10711+100], r3832; +st.shared.u32 [r10711+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r10712]; +ld.shared.u32 r4512, [r10712+2916]; +ld.shared.u32 r5120, [r10712+5832]; +ld.shared.u32 r3992, [r10712+8748]; +ld.shared.u32 r4600, [r10712+11664]; +ld.shared.u32 r5208, [r10712+14580]; +ld.shared.u32 r4080, [r10712+17496]; +ld.shared.u32 r4688, [r10712+20412]; +ld.shared.u32 r5296, [r10712+23328]; +ld.shared.u32 r3901, [r10712+26244]; +ld.shared.u32 r4509, [r10712+29160]; +ld.shared.u32 r5117, [r10712+32076]; +ld.shared.u32 r3989, [r10712+34992]; +ld.shared.u32 r4597, [r10712+37908]; +ld.shared.u32 r5205, [r10712+40824]; +ld.shared.u32 r4077, [r10712+43740]; +ld.shared.u32 r4685, [r10712+46656]; +ld.shared.u32 r5293, [r10712+49572]; +ld.shared.u32 r3902, [r10712+52488]; +ld.shared.u32 r4510, [r10712+55404]; +ld.shared.u32 r5118, [r10712+58320]; +ld.shared.u32 r3990, [r10712+61236]; +ld.shared.u32 r4598, [r10712+64152]; +ld.shared.u32 r5206, [r10712+67068]; +ld.shared.u32 r4078, [r10712+69984]; +ld.shared.u32 r4686, [r10712+72900]; +ld.shared.u32 r5294, [r10712+75816]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 
r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ 
+sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4156, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; 
+} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4411, {low, 
high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, 
r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, 
r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 
r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 
r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ 
+sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, 
r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; 
+cvt.rn.f16.f32 high, f898; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} 
+{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, 
r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ 
+sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 
high, f1036; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, 
r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r10709, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r10713, rd5; +sub.s32 r10714, r10709, r10713; +shr.u32 r10715, r10714, 1; +add.s32 r10716, r10715, r10713; +shr.u32 r10717, r10716, 4; 
+mul.lo.s32 r10718, r10717, 27; +sub.s32 r10719, r10709, r10718; +shl.b32 r10720, r10719, 2; +add.s32 r10721, r10710, r10720; +cvt.rn.f32.u32 f1040, r10717; +mul.f32 f1041, f1040, 0f3C0D3654; +cos.approx.f32 f673, f1041; +sin.approx.f32 f1042, f1041; +neg.f32 f674, f1042; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; +} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ +mul.f16x2 r6941, r6381, r6930; +} +{ +fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ +fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ +fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, low}; +} +{ +fma.rn.f16x2 r7109, r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; 
+} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ +neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, r7189; +} +{ +neg.f16x2 r7194, r7191; +} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 
r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ +mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, r7331, r7355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} 
+{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ +mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ 
+neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, {low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7618, {low, high}; +} +{ +mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ +fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ +mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +barrier.sync 0; +mad.lo.s32 r10722, r10717, 2916, r10721; +st.shared.u32 [r10722], r6029; +st.shared.u32 [r10722+108], r6826; +st.shared.u32 [r10722+216], r6863; +st.shared.u32 [r10722+324], r6900; +st.shared.u32 [r10722+432], r6937; +st.shared.u32 [r10722+540], r6974; +st.shared.u32 [r10722+648], r7011; +st.shared.u32 [r10722+756], r7048; +st.shared.u32 [r10722+864], r7085; +st.shared.u32 [r10722+972], r7122; +st.shared.u32 [r10722+1080], r7159; +st.shared.u32 [r10722+1188], r7196; +st.shared.u32 [r10722+1296], r7233; +st.shared.u32 [r10722+1404], r7270; +st.shared.u32 [r10722+1512], r7307; +st.shared.u32 [r10722+1620], r7344; +st.shared.u32 [r10722+1728], r7381; +st.shared.u32 [r10722+1836], r7418; +st.shared.u32 [r10722+1944], r7455; +st.shared.u32 [r10722+2052], r7492; +st.shared.u32 [r10722+2160], r7529; +st.shared.u32 [r10722+2268], r7566; +st.shared.u32 [r10722+2376], r7603; +st.shared.u32 [r10722+2484], r7640; +st.shared.u32 [r10722+2592], r7677; +st.shared.u32 [r10722+2700], r7714; +st.shared.u32 [r10722+2808], r7751; +barrier.sync 0; +ld.shared.u32 r7787, [r10712]; +ld.shared.u32 r8395, [r10712+2916]; +ld.shared.u32 r9003, [r10712+5832]; +ld.shared.u32 r7875, [r10712+8748]; +ld.shared.u32 r8483, [r10712+11664]; +ld.shared.u32 r9091, [r10712+14580]; +ld.shared.u32 r7963, [r10712+17496]; +ld.shared.u32 r8571, [r10712+20412]; +ld.shared.u32 r9179, [r10712+23328]; +ld.shared.u32 r7784, [r10712+26244]; +ld.shared.u32 r8392, [r10712+29160]; +ld.shared.u32 
r9000, [r10712+32076]; +ld.shared.u32 r7872, [r10712+34992]; +ld.shared.u32 r8480, [r10712+37908]; +ld.shared.u32 r9088, [r10712+40824]; +ld.shared.u32 r7960, [r10712+43740]; +ld.shared.u32 r8568, [r10712+46656]; +ld.shared.u32 r9176, [r10712+49572]; +ld.shared.u32 r7785, [r10712+52488]; +ld.shared.u32 r8393, [r10712+55404]; +ld.shared.u32 r9001, [r10712+58320]; +ld.shared.u32 r7873, [r10712+61236]; +ld.shared.u32 r8481, [r10712+64152]; +ld.shared.u32 r9089, [r10712+67068]; +ld.shared.u32 r7961, [r10712+69984]; +ld.shared.u32 r8569, [r10712+72900]; +ld.shared.u32 r9177, [r10712+75816]; +barrier.sync 0; +st.shared.u32 [r10722], r6035; +st.shared.u32 [r10722+108], r6833; +st.shared.u32 [r10722+216], r6870; +st.shared.u32 [r10722+324], r6907; +st.shared.u32 [r10722+432], r6944; +st.shared.u32 [r10722+540], r6981; +st.shared.u32 [r10722+648], r7018; +st.shared.u32 [r10722+756], r7055; +st.shared.u32 [r10722+864], r7092; +st.shared.u32 [r10722+972], r7129; +st.shared.u32 [r10722+1080], r7166; +st.shared.u32 [r10722+1188], r7203; +st.shared.u32 [r10722+1296], r7240; +st.shared.u32 [r10722+1404], r7277; +st.shared.u32 [r10722+1512], r7314; +st.shared.u32 [r10722+1620], r7351; +st.shared.u32 [r10722+1728], r7388; +st.shared.u32 [r10722+1836], r7425; +st.shared.u32 [r10722+1944], r7462; +st.shared.u32 [r10722+2052], r7499; +st.shared.u32 [r10722+2160], r7536; +st.shared.u32 [r10722+2268], r7573; +st.shared.u32 [r10722+2376], r7610; +st.shared.u32 [r10722+2484], r7647; +st.shared.u32 [r10722+2592], r7684; +st.shared.u32 [r10722+2700], r7721; +st.shared.u32 [r10722+2808], r7758; +barrier.sync 0; +ld.shared.u32 r7793, [r10712]; +ld.shared.u32 r8401, [r10712+2916]; +ld.shared.u32 r9009, [r10712+5832]; +ld.shared.u32 r7881, [r10712+8748]; +ld.shared.u32 r8489, [r10712+11664]; +ld.shared.u32 r9097, [r10712+14580]; +ld.shared.u32 r7969, [r10712+17496]; +ld.shared.u32 r8577, [r10712+20412]; +ld.shared.u32 r9185, [r10712+23328]; +ld.shared.u32 r7790, [r10712+26244]; +ld.shared.u32 
r8398, [r10712+29160]; +ld.shared.u32 r9006, [r10712+32076]; +ld.shared.u32 r7878, [r10712+34992]; +ld.shared.u32 r8486, [r10712+37908]; +ld.shared.u32 r9094, [r10712+40824]; +ld.shared.u32 r7966, [r10712+43740]; +ld.shared.u32 r8574, [r10712+46656]; +ld.shared.u32 r9182, [r10712+49572]; +ld.shared.u32 r7791, [r10712+52488]; +ld.shared.u32 r8399, [r10712+55404]; +ld.shared.u32 r9007, [r10712+58320]; +ld.shared.u32 r7879, [r10712+61236]; +ld.shared.u32 r8487, [r10712+64152]; +ld.shared.u32 r9095, [r10712+67068]; +ld.shared.u32 r7967, [r10712+69984]; +ld.shared.u32 r8575, [r10712+72900]; +ld.shared.u32 r9183, [r10712+75816]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 r7786, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 r7792, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, r7781; +} +{ +add.f16x2 r7810, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ +mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 r7828, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 r7846, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 r7864, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 r7874, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 r7880, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 r7916, r7907, r7913; +} +{ +add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 r7934, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, r7946, r7869; +} +{ +add.f16x2 r7952, r7943, r7949; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 r7962, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 r7968, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 r7986, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ 
+add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 r8004, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 r8022, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 r8040, r8031, r8037; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8044, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8045, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8046, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r8050, {low, high}; +} +{ +mul.f16x2 r8059, r7898, r8043; +} +{ +mul.f16x2 r8062, r7934, r8044; +} +{ +sub.f16x2 r8065, r8059, r8062; +} +{ +mul.f16x2 r8068, r7898, r8044; +} +{ +fma.rn.f16x2 r8071, r7934, r8043, r8068; +} +{ +mul.f16x2 r8075, r7986, r8045; +} +{ +mul.f16x2 r8078, r8022, r8046; +} +{ +sub.f16x2 r8081, r8075, r8078; +} +{ +mul.f16x2 r8084, r7986, r8046; +} +{ +fma.rn.f16x2 r8087, r8022, r8045, r8084; +} +{ +mul.f16x2 r8091, r7916, r8045; +} +{ +mul.f16x2 r8094, r7952, r8046; +} +{ +sub.f16x2 r8097, r8091, r8094; +} +{ +mul.f16x2 r8100, r7916, r8046; +} +{ +fma.rn.f16x2 r8103, r7952, r8045, r8100; +} +{ +mul.f16x2 r8107, r8004, r8049; +} +{ +mul.f16x2 r8110, r8040, r8050; +} +{ +sub.f16x2 r8113, r8107, r8110; +} +{ 
+mul.f16x2 r8116, r8004, r8050; +} +{ +fma.rn.f16x2 r8119, r8040, r8049, r8116; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8124, {low, high}; +} +{ +neg.f16x2 r8125, r8124; +} +{ +add.f16x2 r8127, r7874, r7962; +} +{ +add.f16x2 r8130, r7786, r8127; +} +{ +add.f16x2 r8133, r7880, r7968; +} +{ +add.f16x2 r8136, r7792, r8133; +} +{ +add.f16x2 r8139, r7874, r7962; +} +{ +mul.f16x2 r8142, r8139, r8123; +} +{ +add.f16x2 r8145, r7786, r8142; +} +{ +sub.f16x2 r8148, r7880, r7968; +} +{ +mul.f16x2 r8151, r8148, r8125; +} +{ +add.f16x2 r8154, r8145, r8151; +} +{ +add.f16x2 r8157, r7874, r7962; +} +{ +mul.f16x2 r8160, r8157, r8123; +} +{ +add.f16x2 r8163, r7786, r8160; +} +{ +sub.f16x2 r8166, r7880, r7968; +} +{ +mul.f16x2 r8169, r8166, r8125; +} +{ +sub.f16x2 r8172, r8163, r8169; +} +{ +add.f16x2 r8175, r7880, r7968; +} +{ +mul.f16x2 r8178, r8175, r8123; +} +{ +add.f16x2 r8181, r7792, r8178; +} +{ +sub.f16x2 r8184, r7874, r7962; +} +{ +mul.f16x2 r8187, r8184, r8125; +} +{ +sub.f16x2 r8190, r8181, r8187; +} +{ +add.f16x2 r8193, r7880, r7968; +} +{ +mul.f16x2 r8196, r8193, r8123; +} +{ +add.f16x2 r8199, r7792, r8196; +} +{ +sub.f16x2 r8202, r7874, r7962; +} +{ +mul.f16x2 r8205, r8202, r8125; +} +{ +add.f16x2 r8208, r8199, r8205; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8212, {low, high}; +} +{ +neg.f16x2 r8213, r8212; +} +{ +add.f16x2 r8215, r8065, r8081; +} +{ +add.f16x2 r8218, r7810, r8215; +} +{ +add.f16x2 r8221, r8071, r8087; +} +{ +add.f16x2 r8224, r7846, r8221; +} +{ +add.f16x2 r8227, r8065, r8081; +} +{ +mul.f16x2 r8230, r8227, r8211; +} +{ +add.f16x2 r8233, r7810, r8230; +} +{ +sub.f16x2 r8236, r8071, r8087; +} +{ +mul.f16x2 r8239, r8236, 
r8213; +} +{ +add.f16x2 r8242, r8233, r8239; +} +{ +add.f16x2 r8245, r8065, r8081; +} +{ +mul.f16x2 r8248, r8245, r8211; +} +{ +add.f16x2 r8251, r7810, r8248; +} +{ +sub.f16x2 r8254, r8071, r8087; +} +{ +mul.f16x2 r8257, r8254, r8213; +} +{ +sub.f16x2 r8260, r8251, r8257; +} +{ +add.f16x2 r8263, r8071, r8087; +} +{ +mul.f16x2 r8266, r8263, r8211; +} +{ +add.f16x2 r8269, r7846, r8266; +} +{ +sub.f16x2 r8272, r8065, r8081; +} +{ +mul.f16x2 r8275, r8272, r8213; +} +{ +sub.f16x2 r8278, r8269, r8275; +} +{ +add.f16x2 r8281, r8071, r8087; +} +{ +mul.f16x2 r8284, r8281, r8211; +} +{ +add.f16x2 r8287, r7846, r8284; +} +{ +sub.f16x2 r8290, r8065, r8081; +} +{ +mul.f16x2 r8293, r8290, r8213; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8299, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8300, {low, high}; +} +{ +neg.f16x2 r8301, r8300; +} +{ +add.f16x2 r8303, r8097, r8113; +} +{ +add.f16x2 r8306, r7828, r8303; +} +{ +add.f16x2 r8309, r8103, r8119; +} +{ +add.f16x2 r8312, r7864, r8309; +} +{ +add.f16x2 r8315, r8097, r8113; +} +{ +mul.f16x2 r8318, r8315, r8299; +} +{ +add.f16x2 r8321, r7828, r8318; +} +{ +sub.f16x2 r8324, r8103, r8119; +} +{ +mul.f16x2 r8327, r8324, r8301; +} +{ +add.f16x2 r8330, r8321, r8327; +} +{ +add.f16x2 r8333, r8097, r8113; +} +{ +mul.f16x2 r8336, r8333, r8299; +} +{ +add.f16x2 r8339, r7828, r8336; +} +{ +sub.f16x2 r8342, r8103, r8119; +} +{ +mul.f16x2 r8345, r8342, r8301; +} +{ +sub.f16x2 r8348, r8339, r8345; +} +{ +add.f16x2 r8351, r8103, r8119; +} +{ +mul.f16x2 r8354, r8351, r8299; +} +{ +add.f16x2 r8357, r7864, r8354; +} +{ +sub.f16x2 r8360, r8097, r8113; +} +{ +mul.f16x2 r8363, r8360, r8301; +} +{ +sub.f16x2 r8366, r8357, r8363; +} +{ +add.f16x2 r8369, r8103, r8119; +} +{ +mul.f16x2 r8372, r8369, r8299; +} +{ +add.f16x2 r8375, r7864, r8372; +} +{ +sub.f16x2 r8378, r8097, r8113; +} +{ +mul.f16x2 r8381, 
r8378, r8301; +} +{ +add.f16x2 r8384, r8375, r8381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8387, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8388, {low, high}; +} +{ +neg.f16x2 r8389, r8388; +} +{ +add.f16x2 r8391, r8392, r8393; +} +{ +add.f16x2 r8394, r8395, r8391; +} +{ +add.f16x2 r8397, r8398, r8399; +} +{ +add.f16x2 r8400, r8401, r8397; +} +{ +add.f16x2 r8403, r8392, r8393; +} +{ +mul.f16x2 r8406, r8403, r8387; +} +{ +add.f16x2 r8409, r8395, r8406; +} +{ +sub.f16x2 r8412, r8398, r8399; +} +{ +mul.f16x2 r8415, r8412, r8389; +} +{ +add.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8392, r8393; +} +{ +mul.f16x2 r8424, r8421, r8387; +} +{ +add.f16x2 r8427, r8395, r8424; +} +{ +sub.f16x2 r8430, r8398, r8399; +} +{ +mul.f16x2 r8433, r8430, r8389; +} +{ +sub.f16x2 r8436, r8427, r8433; +} +{ +add.f16x2 r8439, r8398, r8399; +} +{ +mul.f16x2 r8442, r8439, r8387; +} +{ +add.f16x2 r8445, r8401, r8442; +} +{ +sub.f16x2 r8448, r8392, r8393; +} +{ +mul.f16x2 r8451, r8448, r8389; +} +{ +sub.f16x2 r8454, r8445, r8451; +} +{ +add.f16x2 r8457, r8398, r8399; +} +{ +mul.f16x2 r8460, r8457, r8387; +} +{ +add.f16x2 r8463, r8401, r8460; +} +{ +sub.f16x2 r8466, r8392, r8393; +} +{ +mul.f16x2 r8469, r8466, r8389; +} +{ +add.f16x2 r8472, r8463, r8469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8476, {low, high}; +} +{ +neg.f16x2 r8477, r8476; +} +{ +add.f16x2 r8479, r8480, r8481; +} +{ +add.f16x2 r8482, r8483, r8479; +} +{ +add.f16x2 r8485, r8486, r8487; +} +{ +add.f16x2 r8488, r8489, r8485; +} +{ +add.f16x2 r8491, r8480, r8481; +} +{ +mul.f16x2 r8494, r8491, r8475; +} +{ +add.f16x2 r8497, r8483, r8494; +} +{ +sub.f16x2 r8500, r8486, r8487; +} +{ +mul.f16x2 r8503, r8500, r8477; +} +{ +add.f16x2 r8506, 
r8497, r8503; +} +{ +add.f16x2 r8509, r8480, r8481; +} +{ +mul.f16x2 r8512, r8509, r8475; +} +{ +add.f16x2 r8515, r8483, r8512; +} +{ +sub.f16x2 r8518, r8486, r8487; +} +{ +mul.f16x2 r8521, r8518, r8477; +} +{ +sub.f16x2 r8524, r8515, r8521; +} +{ +add.f16x2 r8527, r8486, r8487; +} +{ +mul.f16x2 r8530, r8527, r8475; +} +{ +add.f16x2 r8533, r8489, r8530; +} +{ +sub.f16x2 r8536, r8480, r8481; +} +{ +mul.f16x2 r8539, r8536, r8477; +} +{ +sub.f16x2 r8542, r8533, r8539; +} +{ +add.f16x2 r8545, r8486, r8487; +} +{ +mul.f16x2 r8548, r8545, r8475; +} +{ +add.f16x2 r8551, r8489, r8548; +} +{ +sub.f16x2 r8554, r8480, r8481; +} +{ +mul.f16x2 r8557, r8554, r8477; +} +{ +add.f16x2 r8560, r8551, r8557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8564, {low, high}; +} +{ +neg.f16x2 r8565, r8564; +} +{ +add.f16x2 r8567, r8568, r8569; +} +{ +add.f16x2 r8570, r8571, r8567; +} +{ +add.f16x2 r8573, r8574, r8575; +} +{ +add.f16x2 r8576, r8577, r8573; +} +{ +add.f16x2 r8579, r8568, r8569; +} +{ +mul.f16x2 r8582, r8579, r8563; +} +{ +add.f16x2 r8585, r8571, r8582; +} +{ +sub.f16x2 r8588, r8574, r8575; +} +{ +mul.f16x2 r8591, r8588, r8565; +} +{ +add.f16x2 r8594, r8585, r8591; +} +{ +add.f16x2 r8597, r8568, r8569; +} +{ +mul.f16x2 r8600, r8597, r8563; +} +{ +add.f16x2 r8603, r8571, r8600; +} +{ +sub.f16x2 r8606, r8574, r8575; +} +{ +mul.f16x2 r8609, r8606, r8565; +} +{ +sub.f16x2 r8612, r8603, r8609; +} +{ +add.f16x2 r8615, r8574, r8575; +} +{ +mul.f16x2 r8618, r8615, r8563; +} +{ +add.f16x2 r8621, r8577, r8618; +} +{ +sub.f16x2 r8624, r8568, r8569; +} +{ +mul.f16x2 r8627, r8624, r8565; +} +{ +sub.f16x2 r8630, r8621, r8627; +} +{ +add.f16x2 r8633, r8574, r8575; +} +{ +mul.f16x2 r8636, r8633, r8563; +} +{ +add.f16x2 r8639, r8577, r8636; +} +{ +sub.f16x2 r8642, r8568, r8569; +} +{ +mul.f16x2 r8645, r8642, r8565; +} +{ +add.f16x2 
r8648, r8639, r8645; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8657, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r8658, {low, high}; +} +{ +mul.f16x2 r8667, r8506, r8651; +} +{ +mul.f16x2 r8670, r8542, r8652; +} +{ +sub.f16x2 r8673, r8667, r8670; +} +{ +mul.f16x2 r8676, r8506, r8652; +} +{ +fma.rn.f16x2 r8679, r8542, r8651, r8676; +} +{ +mul.f16x2 r8683, r8594, r8653; +} +{ +mul.f16x2 r8686, r8630, r8654; +} +{ +sub.f16x2 r8689, r8683, r8686; +} +{ +mul.f16x2 r8692, r8594, r8654; +} +{ +fma.rn.f16x2 r8695, r8630, r8653, r8692; +} +{ +mul.f16x2 r8699, r8524, r8653; +} +{ +mul.f16x2 r8702, r8560, r8654; +} +{ +sub.f16x2 r8705, r8699, r8702; +} +{ +mul.f16x2 r8708, r8524, r8654; +} +{ +fma.rn.f16x2 r8711, r8560, r8653, r8708; +} +{ +mul.f16x2 r8715, r8612, r8657; +} +{ +mul.f16x2 r8718, r8648, r8658; +} +{ +sub.f16x2 r8721, r8715, r8718; +} +{ +mul.f16x2 r8724, r8612, r8658; +} +{ +fma.rn.f16x2 r8727, r8648, r8657, r8724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8732, {low, high}; +} +{ +neg.f16x2 r8733, r8732; +} +{ +add.f16x2 r8735, r8482, r8570; +} +{ +add.f16x2 r8738, r8394, r8735; +} +{ +add.f16x2 r8741, r8488, r8576; +} +{ +add.f16x2 r8744, r8400, r8741; +} +{ +add.f16x2 r8747, r8482, r8570; +} +{ +mul.f16x2 r8750, r8747, r8731; +} +{ +add.f16x2 
r8753, r8394, r8750; +} +{ +sub.f16x2 r8756, r8488, r8576; +} +{ +mul.f16x2 r8759, r8756, r8733; +} +{ +add.f16x2 r8762, r8753, r8759; +} +{ +add.f16x2 r8765, r8482, r8570; +} +{ +mul.f16x2 r8768, r8765, r8731; +} +{ +add.f16x2 r8771, r8394, r8768; +} +{ +sub.f16x2 r8774, r8488, r8576; +} +{ +mul.f16x2 r8777, r8774, r8733; +} +{ +sub.f16x2 r8780, r8771, r8777; +} +{ +add.f16x2 r8783, r8488, r8576; +} +{ +mul.f16x2 r8786, r8783, r8731; +} +{ +add.f16x2 r8789, r8400, r8786; +} +{ +sub.f16x2 r8792, r8482, r8570; +} +{ +mul.f16x2 r8795, r8792, r8733; +} +{ +sub.f16x2 r8798, r8789, r8795; +} +{ +add.f16x2 r8801, r8488, r8576; +} +{ +mul.f16x2 r8804, r8801, r8731; +} +{ +add.f16x2 r8807, r8400, r8804; +} +{ +sub.f16x2 r8810, r8482, r8570; +} +{ +mul.f16x2 r8813, r8810, r8733; +} +{ +add.f16x2 r8816, r8807, r8813; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8819, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8820, {low, high}; +} +{ +neg.f16x2 r8821, r8820; +} +{ +add.f16x2 r8823, r8673, r8689; +} +{ +add.f16x2 r8826, r8418, r8823; +} +{ +add.f16x2 r8829, r8679, r8695; +} +{ +add.f16x2 r8832, r8454, r8829; +} +{ +add.f16x2 r8835, r8673, r8689; +} +{ +mul.f16x2 r8838, r8835, r8819; +} +{ +add.f16x2 r8841, r8418, r8838; +} +{ +sub.f16x2 r8844, r8679, r8695; +} +{ +mul.f16x2 r8847, r8844, r8821; +} +{ +add.f16x2 r8850, r8841, r8847; +} +{ +add.f16x2 r8853, r8673, r8689; +} +{ +mul.f16x2 r8856, r8853, r8819; +} +{ +add.f16x2 r8859, r8418, r8856; +} +{ +sub.f16x2 r8862, r8679, r8695; +} +{ +mul.f16x2 r8865, r8862, r8821; +} +{ +sub.f16x2 r8868, r8859, r8865; +} +{ +add.f16x2 r8871, r8679, r8695; +} +{ +mul.f16x2 r8874, r8871, r8819; +} +{ +add.f16x2 r8877, r8454, r8874; +} +{ +sub.f16x2 r8880, r8673, r8689; +} +{ +mul.f16x2 r8883, r8880, r8821; +} +{ +sub.f16x2 r8886, r8877, r8883; +} +{ +add.f16x2 r8889, r8679, r8695; +} +{ +mul.f16x2 r8892, r8889, r8819; +} +{ 
+add.f16x2 r8895, r8454, r8892; +} +{ +sub.f16x2 r8898, r8673, r8689; +} +{ +mul.f16x2 r8901, r8898, r8821; +} +{ +add.f16x2 r8904, r8895, r8901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8908, {low, high}; +} +{ +neg.f16x2 r8909, r8908; +} +{ +add.f16x2 r8911, r8705, r8721; +} +{ +add.f16x2 r8914, r8436, r8911; +} +{ +add.f16x2 r8917, r8711, r8727; +} +{ +add.f16x2 r8920, r8472, r8917; +} +{ +add.f16x2 r8923, r8705, r8721; +} +{ +mul.f16x2 r8926, r8923, r8907; +} +{ +add.f16x2 r8929, r8436, r8926; +} +{ +sub.f16x2 r8932, r8711, r8727; +} +{ +mul.f16x2 r8935, r8932, r8909; +} +{ +add.f16x2 r8938, r8929, r8935; +} +{ +add.f16x2 r8941, r8705, r8721; +} +{ +mul.f16x2 r8944, r8941, r8907; +} +{ +add.f16x2 r8947, r8436, r8944; +} +{ +sub.f16x2 r8950, r8711, r8727; +} +{ +mul.f16x2 r8953, r8950, r8909; +} +{ +sub.f16x2 r8956, r8947, r8953; +} +{ +add.f16x2 r8959, r8711, r8727; +} +{ +mul.f16x2 r8962, r8959, r8907; +} +{ +add.f16x2 r8965, r8472, r8962; +} +{ +sub.f16x2 r8968, r8705, r8721; +} +{ +mul.f16x2 r8971, r8968, r8909; +} +{ +sub.f16x2 r8974, r8965, r8971; +} +{ +add.f16x2 r8977, r8711, r8727; +} +{ +mul.f16x2 r8980, r8977, r8907; +} +{ +add.f16x2 r8983, r8472, r8980; +} +{ +sub.f16x2 r8986, r8705, r8721; +} +{ +mul.f16x2 r8989, r8986, r8909; +} +{ +add.f16x2 r8992, r8983, r8989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8996, {low, high}; +} +{ +neg.f16x2 r8997, r8996; +} +{ +add.f16x2 r8999, r9000, r9001; +} +{ +add.f16x2 r9002, r9003, r8999; +} +{ +add.f16x2 r9005, r9006, r9007; +} +{ +add.f16x2 r9008, r9009, r9005; +} +{ +add.f16x2 r9011, r9000, r9001; +} +{ +mul.f16x2 r9014, r9011, r8995; +} +{ +add.f16x2 r9017, r9003, r9014; +} +{ 
+sub.f16x2 r9020, r9006, r9007; +} +{ +mul.f16x2 r9023, r9020, r8997; +} +{ +add.f16x2 r9026, r9017, r9023; +} +{ +add.f16x2 r9029, r9000, r9001; +} +{ +mul.f16x2 r9032, r9029, r8995; +} +{ +add.f16x2 r9035, r9003, r9032; +} +{ +sub.f16x2 r9038, r9006, r9007; +} +{ +mul.f16x2 r9041, r9038, r8997; +} +{ +sub.f16x2 r9044, r9035, r9041; +} +{ +add.f16x2 r9047, r9006, r9007; +} +{ +mul.f16x2 r9050, r9047, r8995; +} +{ +add.f16x2 r9053, r9009, r9050; +} +{ +sub.f16x2 r9056, r9000, r9001; +} +{ +mul.f16x2 r9059, r9056, r8997; +} +{ +sub.f16x2 r9062, r9053, r9059; +} +{ +add.f16x2 r9065, r9006, r9007; +} +{ +mul.f16x2 r9068, r9065, r8995; +} +{ +add.f16x2 r9071, r9009, r9068; +} +{ +sub.f16x2 r9074, r9000, r9001; +} +{ +mul.f16x2 r9077, r9074, r8997; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9083, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9084, {low, high}; +} +{ +neg.f16x2 r9085, r9084; +} +{ +add.f16x2 r9087, r9088, r9089; +} +{ +add.f16x2 r9090, r9091, r9087; +} +{ +add.f16x2 r9093, r9094, r9095; +} +{ +add.f16x2 r9096, r9097, r9093; +} +{ +add.f16x2 r9099, r9088, r9089; +} +{ +mul.f16x2 r9102, r9099, r9083; +} +{ +add.f16x2 r9105, r9091, r9102; +} +{ +sub.f16x2 r9108, r9094, r9095; +} +{ +mul.f16x2 r9111, r9108, r9085; +} +{ +add.f16x2 r9114, r9105, r9111; +} +{ +add.f16x2 r9117, r9088, r9089; +} +{ +mul.f16x2 r9120, r9117, r9083; +} +{ +add.f16x2 r9123, r9091, r9120; +} +{ +sub.f16x2 r9126, r9094, r9095; +} +{ +mul.f16x2 r9129, r9126, r9085; +} +{ +sub.f16x2 r9132, r9123, r9129; +} +{ +add.f16x2 r9135, r9094, r9095; +} +{ +mul.f16x2 r9138, r9135, r9083; +} +{ +add.f16x2 r9141, r9097, r9138; +} +{ +sub.f16x2 r9144, r9088, r9089; +} +{ +mul.f16x2 r9147, r9144, r9085; +} +{ +sub.f16x2 r9150, r9141, r9147; +} +{ +add.f16x2 r9153, r9094, r9095; +} +{ +mul.f16x2 r9156, r9153, r9083; +} +{ +add.f16x2 r9159, r9097, 
r9156; +} +{ +sub.f16x2 r9162, r9088, r9089; +} +{ +mul.f16x2 r9165, r9162, r9085; +} +{ +add.f16x2 r9168, r9159, r9165; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9171, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9172, {low, high}; +} +{ +neg.f16x2 r9173, r9172; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 r9184, r9185, r9181; +} +{ +add.f16x2 r9187, r9176, r9177; +} +{ +mul.f16x2 r9190, r9187, r9171; +} +{ +add.f16x2 r9193, r9179, r9190; +} +{ +sub.f16x2 r9196, r9182, r9183; +} +{ +mul.f16x2 r9199, r9196, r9173; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +add.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9171; +} +{ +add.f16x2 r9211, r9179, r9208; +} +{ +sub.f16x2 r9214, r9182, r9183; +} +{ +mul.f16x2 r9217, r9214, r9173; +} +{ +sub.f16x2 r9220, r9211, r9217; +} +{ +add.f16x2 r9223, r9182, r9183; +} +{ +mul.f16x2 r9226, r9223, r9171; +} +{ +add.f16x2 r9229, r9185, r9226; +} +{ +sub.f16x2 r9232, r9176, r9177; +} +{ +mul.f16x2 r9235, r9232, r9173; +} +{ +sub.f16x2 r9238, r9229, r9235; +} +{ +add.f16x2 r9241, r9182, r9183; +} +{ +mul.f16x2 r9244, r9241, r9171; +} +{ +add.f16x2 r9247, r9185, r9244; +} +{ +sub.f16x2 r9250, r9176, r9177; +} +{ +mul.f16x2 r9253, r9250, r9173; +} +{ +add.f16x2 r9256, r9247, r9253; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9265, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9266, {low, high}; +} +{ +mul.f16x2 r9275, r9114, r9259; +} +{ +mul.f16x2 r9278, r9150, r9260; +} +{ +sub.f16x2 r9281, r9275, r9278; +} +{ +mul.f16x2 r9284, r9114, r9260; +} +{ +fma.rn.f16x2 r9287, r9150, r9259, r9284; +} +{ +mul.f16x2 r9291, r9202, r9261; +} +{ +mul.f16x2 r9294, r9238, r9262; +} +{ +sub.f16x2 r9297, r9291, r9294; +} +{ +mul.f16x2 r9300, r9202, r9262; +} +{ +fma.rn.f16x2 r9303, r9238, r9261, r9300; +} +{ +mul.f16x2 r9307, r9132, r9261; +} +{ +mul.f16x2 r9310, r9168, r9262; +} +{ +sub.f16x2 r9313, r9307, r9310; +} +{ +mul.f16x2 r9316, r9132, r9262; +} +{ +fma.rn.f16x2 r9319, r9168, r9261, r9316; +} +{ +mul.f16x2 r9323, r9220, r9265; +} +{ +mul.f16x2 r9326, r9256, r9266; +} +{ +sub.f16x2 r9329, r9323, r9326; +} +{ +mul.f16x2 r9332, r9220, r9266; +} +{ +fma.rn.f16x2 r9335, r9256, r9265, r9332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9340, {low, high}; +} +{ +neg.f16x2 r9341, r9340; +} +{ +add.f16x2 r9343, r9090, r9178; +} +{ +add.f16x2 r9346, r9002, r9343; +} +{ +add.f16x2 r9349, r9096, r9184; +} +{ +add.f16x2 r9352, r9008, r9349; +} +{ +add.f16x2 r9355, r9090, r9178; +} +{ +mul.f16x2 r9358, r9355, r9339; +} +{ +add.f16x2 r9361, r9002, r9358; +} +{ +sub.f16x2 r9364, r9096, r9184; +} +{ +mul.f16x2 r9367, r9364, r9341; +} +{ +add.f16x2 r9370, r9361, r9367; +} +{ +add.f16x2 r9373, r9090, r9178; +} +{ +mul.f16x2 r9376, r9373, r9339; +} +{ +add.f16x2 r9379, r9002, r9376; +} +{ +sub.f16x2 r9382, r9096, r9184; +} +{ +mul.f16x2 r9385, r9382, r9341; +} +{ +sub.f16x2 r9388, r9379, r9385; +} +{ +add.f16x2 r9391, r9096, r9184; +} +{ +mul.f16x2 r9394, r9391, r9339; +} +{ +add.f16x2 r9397, r9008, r9394; +} +{ +sub.f16x2 r9400, r9090, r9178; +} +{ +mul.f16x2 r9403, r9400, r9341; +} +{ +sub.f16x2 r9406, 
r9397, r9403; +} +{ +add.f16x2 r9409, r9096, r9184; +} +{ +mul.f16x2 r9412, r9409, r9339; +} +{ +add.f16x2 r9415, r9008, r9412; +} +{ +sub.f16x2 r9418, r9090, r9178; +} +{ +mul.f16x2 r9421, r9418, r9341; +} +{ +add.f16x2 r9424, r9415, r9421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9427, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9428, {low, high}; +} +{ +neg.f16x2 r9429, r9428; +} +{ +add.f16x2 r9431, r9281, r9297; +} +{ +add.f16x2 r9434, r9026, r9431; +} +{ +add.f16x2 r9437, r9287, r9303; +} +{ +add.f16x2 r9440, r9062, r9437; +} +{ +add.f16x2 r9443, r9281, r9297; +} +{ +mul.f16x2 r9446, r9443, r9427; +} +{ +add.f16x2 r9449, r9026, r9446; +} +{ +sub.f16x2 r9452, r9287, r9303; +} +{ +mul.f16x2 r9455, r9452, r9429; +} +{ +add.f16x2 r9458, r9449, r9455; +} +{ +add.f16x2 r9461, r9281, r9297; +} +{ +mul.f16x2 r9464, r9461, r9427; +} +{ +add.f16x2 r9467, r9026, r9464; +} +{ +sub.f16x2 r9470, r9287, r9303; +} +{ +mul.f16x2 r9473, r9470, r9429; +} +{ +sub.f16x2 r9476, r9467, r9473; +} +{ +add.f16x2 r9479, r9287, r9303; +} +{ +mul.f16x2 r9482, r9479, r9427; +} +{ +add.f16x2 r9485, r9062, r9482; +} +{ +sub.f16x2 r9488, r9281, r9297; +} +{ +mul.f16x2 r9491, r9488, r9429; +} +{ +sub.f16x2 r9494, r9485, r9491; +} +{ +add.f16x2 r9497, r9287, r9303; +} +{ +mul.f16x2 r9500, r9497, r9427; +} +{ +add.f16x2 r9503, r9062, r9500; +} +{ +sub.f16x2 r9506, r9281, r9297; +} +{ +mul.f16x2 r9509, r9506, r9429; +} +{ +add.f16x2 r9512, r9503, r9509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9516, {low, high}; +} +{ +neg.f16x2 r9517, r9516; +} +{ +add.f16x2 r9519, r9313, r9329; +} +{ +add.f16x2 r9522, r9044, r9519; +} +{ +add.f16x2 r9525, r9319, r9335; +} +{ +add.f16x2 r9528, r9080, r9525; +} +{ +add.f16x2 r9531, 
r9313, r9329; +} +{ +mul.f16x2 r9534, r9531, r9515; +} +{ +add.f16x2 r9537, r9044, r9534; +} +{ +sub.f16x2 r9540, r9319, r9335; +} +{ +mul.f16x2 r9543, r9540, r9517; +} +{ +add.f16x2 r9546, r9537, r9543; +} +{ +add.f16x2 r9549, r9313, r9329; +} +{ +mul.f16x2 r9552, r9549, r9515; +} +{ +add.f16x2 r9555, r9044, r9552; +} +{ +sub.f16x2 r9558, r9319, r9335; +} +{ +mul.f16x2 r9561, r9558, r9517; +} +{ +sub.f16x2 r9564, r9555, r9561; +} +{ +add.f16x2 r9567, r9319, r9335; +} +{ +mul.f16x2 r9570, r9567, r9515; +} +{ +add.f16x2 r9573, r9080, r9570; +} +{ +sub.f16x2 r9576, r9313, r9329; +} +{ +mul.f16x2 r9579, r9576, r9517; +} +{ +sub.f16x2 r9582, r9573, r9579; +} +{ +add.f16x2 r9585, r9319, r9335; +} +{ +mul.f16x2 r9588, r9585, r9515; +} +{ +add.f16x2 r9591, r9080, r9588; +} +{ +sub.f16x2 r9594, r9313, r9329; +} +{ +mul.f16x2 r9597, r9594, r9517; +} +{ +add.f16x2 r9600, r9591, r9597; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r9603, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r9604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r9605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r9606, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9607, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9608, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r9609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r9610, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r9611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r9612, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9614, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r9615, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r9616, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r9617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r9618, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r9621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r9622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r9629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r9630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r9633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r9634, {low, high}; +} +{ +mul.f16x2 r9655, r8826, r9603; +} +{ +mul.f16x2 r9658, r8832, r9604; +} +{ +sub.f16x2 r9661, r9655, r9658; +} +{ +mul.f16x2 r9664, r8826, r9604; +} +{ +fma.rn.f16x2 r9667, r8832, r9603, r9664; +} +{ +mul.f16x2 r9671, r9434, r9605; +} +{ +mul.f16x2 r9674, r9440, r9606; +} +{ +sub.f16x2 r9677, r9671, r9674; +} +{ +mul.f16x2 r9680, r9434, r9606; +} +{ +fma.rn.f16x2 r9683, r9440, r9605, r9680; +} +{ +mul.f16x2 
r9687, r8914, r9605; +} +{ +mul.f16x2 r9690, r8920, r9606; +} +{ +sub.f16x2 r9693, r9687, r9690; +} +{ +mul.f16x2 r9696, r8914, r9606; +} +{ +fma.rn.f16x2 r9699, r8920, r9605, r9696; +} +{ +mul.f16x2 r9703, r9522, r9609; +} +{ +mul.f16x2 r9706, r9528, r9610; +} +{ +sub.f16x2 r9709, r9703, r9706; +} +{ +mul.f16x2 r9712, r9522, r9610; +} +{ +fma.rn.f16x2 r9715, r9528, r9609, r9712; +} +{ +mul.f16x2 r9719, r8762, r9607; +} +{ +mul.f16x2 r9722, r8798, r9608; +} +{ +sub.f16x2 r9725, r9719, r9722; +} +{ +mul.f16x2 r9728, r8762, r9608; +} +{ +fma.rn.f16x2 r9731, r8798, r9607, r9728; +} +{ +mul.f16x2 r9735, r9370, r9613; +} +{ +mul.f16x2 r9738, r9406, r9614; +} +{ +sub.f16x2 r9741, r9735, r9738; +} +{ +mul.f16x2 r9744, r9370, r9614; +} +{ +fma.rn.f16x2 r9747, r9406, r9613, r9744; +} +{ +mul.f16x2 r9751, r8850, r9609; +} +{ +mul.f16x2 r9754, r8886, r9610; +} +{ +sub.f16x2 r9757, r9751, r9754; +} +{ +mul.f16x2 r9760, r8850, r9610; +} +{ +fma.rn.f16x2 r9763, r8886, r9609, r9760; +} +{ +mul.f16x2 r9767, r9458, r9617; +} +{ +mul.f16x2 r9770, r9494, r9618; +} +{ +sub.f16x2 r9773, r9767, r9770; +} +{ +mul.f16x2 r9776, r9458, r9618; +} +{ +fma.rn.f16x2 r9779, r9494, r9617, r9776; +} +{ +mul.f16x2 r9783, r8938, r9611; +} +{ +mul.f16x2 r9786, r8974, r9612; +} +{ +sub.f16x2 r9789, r9783, r9786; +} +{ +mul.f16x2 r9792, r8938, r9612; +} +{ +fma.rn.f16x2 r9795, r8974, r9611, r9792; +} +{ +mul.f16x2 r9799, r9546, r9621; +} +{ +mul.f16x2 r9802, r9582, r9622; +} +{ +sub.f16x2 r9805, r9799, r9802; +} +{ +mul.f16x2 r9808, r9546, r9622; +} +{ +fma.rn.f16x2 r9811, r9582, r9621, r9808; +} +{ +mul.f16x2 r9815, r8780, r9613; +} +{ +mul.f16x2 r9818, r8816, r9614; +} +{ +sub.f16x2 r9821, r9815, r9818; +} +{ +mul.f16x2 r9824, r8780, r9614; +} +{ +fma.rn.f16x2 r9827, r8816, r9613, r9824; +} +{ +mul.f16x2 r9831, r9388, r9625; +} +{ +mul.f16x2 r9834, r9424, r9626; +} +{ +sub.f16x2 r9837, r9831, r9834; +} +{ +mul.f16x2 r9840, r9388, r9626; +} +{ +fma.rn.f16x2 r9843, r9424, r9625, r9840; +} +{ +mul.f16x2 
r9847, r8868, r9615; +} +{ +mul.f16x2 r9850, r8904, r9616; +} +{ +sub.f16x2 r9853, r9847, r9850; +} +{ +mul.f16x2 r9856, r8868, r9616; +} +{ +fma.rn.f16x2 r9859, r8904, r9615, r9856; +} +{ +mul.f16x2 r9863, r9476, r9629; +} +{ +mul.f16x2 r9866, r9512, r9630; +} +{ +sub.f16x2 r9869, r9863, r9866; +} +{ +mul.f16x2 r9872, r9476, r9630; +} +{ +fma.rn.f16x2 r9875, r9512, r9629, r9872; +} +{ +mul.f16x2 r9879, r8956, r9617; +} +{ +mul.f16x2 r9882, r8992, r9618; +} +{ +sub.f16x2 r9885, r9879, r9882; +} +{ +mul.f16x2 r9888, r8956, r9618; +} +{ +fma.rn.f16x2 r9891, r8992, r9617, r9888; +} +{ +mul.f16x2 r9895, r9564, r9633; +} +{ +mul.f16x2 r9898, r9600, r9634; +} +{ +sub.f16x2 r9901, r9895, r9898; +} +{ +mul.f16x2 r9904, r9564, r9634; +} +{ +fma.rn.f16x2 r9907, r9600, r9633, r9904; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9911, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9912, {low, high}; +} +{ +neg.f16x2 r9913, r9912; +} +{ +add.f16x2 r9915, r8738, r9346; +} +{ +add.f16x2 %0, r8130, r9915; +} +{ +add.f16x2 r9921, r8744, r9352; +} +{ +add.f16x2 %1, r8136, r9921; +} +{ +add.f16x2 r9927, r8738, r9346; +} +{ +mul.f16x2 r9930, r9927, r9911; +} +{ +add.f16x2 r9933, r8130, r9930; +} +{ +sub.f16x2 r9936, r8744, r9352; +} +{ +mul.f16x2 r9939, r9936, r9913; +} +{ +add.f16x2 %18, r9933, r9939; +} +{ +add.f16x2 r9945, r8738, r9346; +} +{ +mul.f16x2 r9948, r9945, r9911; +} +{ +add.f16x2 r9951, r8130, r9948; +} +{ +sub.f16x2 r9954, r8744, r9352; +} +{ +mul.f16x2 r9957, r9954, r9913; +} +{ +sub.f16x2 %36, r9951, r9957; +} +{ +add.f16x2 r9963, r8744, r9352; +} +{ +mul.f16x2 r9966, r9963, r9911; +} +{ +add.f16x2 r9969, r8136, r9966; +} +{ +sub.f16x2 r9972, r8738, r9346; +} +{ +mul.f16x2 r9975, r9972, r9913; +} +{ +sub.f16x2 %19, r9969, r9975; +} +{ +add.f16x2 r9981, r8744, r9352; +} +{ +mul.f16x2 r9984, r9981, r9911; +} +{ +add.f16x2 r9987, r8136, r9984; +} +{ +sub.f16x2 
r9990, r8738, r9346; +} +{ +mul.f16x2 r9993, r9990, r9913; +} +{ +add.f16x2 %37, r9987, r9993; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10000, {low, high}; +} +{ +neg.f16x2 r10001, r10000; +} +{ +add.f16x2 r10003, r9661, r9677; +} +{ +add.f16x2 %2, r8218, r10003; +} +{ +add.f16x2 r10009, r9667, r9683; +} +{ +add.f16x2 %3, r8224, r10009; +} +{ +add.f16x2 r10015, r9661, r9677; +} +{ +mul.f16x2 r10018, r10015, r9999; +} +{ +add.f16x2 r10021, r8218, r10018; +} +{ +sub.f16x2 r10024, r9667, r9683; +} +{ +mul.f16x2 r10027, r10024, r10001; +} +{ +add.f16x2 %20, r10021, r10027; +} +{ +add.f16x2 r10033, r9661, r9677; +} +{ +mul.f16x2 r10036, r10033, r9999; +} +{ +add.f16x2 r10039, r8218, r10036; +} +{ +sub.f16x2 r10042, r9667, r9683; +} +{ +mul.f16x2 r10045, r10042, r10001; +} +{ +sub.f16x2 %38, r10039, r10045; +} +{ +add.f16x2 r10051, r9667, r9683; +} +{ +mul.f16x2 r10054, r10051, r9999; +} +{ +add.f16x2 r10057, r8224, r10054; +} +{ +sub.f16x2 r10060, r9661, r9677; +} +{ +mul.f16x2 r10063, r10060, r10001; +} +{ +sub.f16x2 %21, r10057, r10063; +} +{ +add.f16x2 r10069, r9667, r9683; +} +{ +mul.f16x2 r10072, r10069, r9999; +} +{ +add.f16x2 r10075, r8224, r10072; +} +{ +sub.f16x2 r10078, r9661, r9677; +} +{ +mul.f16x2 r10081, r10078, r10001; +} +{ +add.f16x2 %39, r10075, r10081; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10087, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10088, {low, high}; +} +{ +neg.f16x2 r10089, r10088; +} +{ +add.f16x2 r10091, r9693, r9709; +} +{ +add.f16x2 %4, r8306, r10091; +} +{ +add.f16x2 r10097, r9699, r9715; +} +{ +add.f16x2 %5, r8312, r10097; +} +{ +add.f16x2 r10103, r9693, r9709; +} +{ +mul.f16x2 r10106, r10103, r10087; +} +{ +add.f16x2 r10109, r8306, r10106; +} +{ 
+sub.f16x2 r10112, r9699, r9715; +} +{ +mul.f16x2 r10115, r10112, r10089; +} +{ +add.f16x2 %22, r10109, r10115; +} +{ +add.f16x2 r10121, r9693, r9709; +} +{ +mul.f16x2 r10124, r10121, r10087; +} +{ +add.f16x2 r10127, r8306, r10124; +} +{ +sub.f16x2 r10130, r9699, r9715; +} +{ +mul.f16x2 r10133, r10130, r10089; +} +{ +sub.f16x2 %40, r10127, r10133; +} +{ +add.f16x2 r10139, r9699, r9715; +} +{ +mul.f16x2 r10142, r10139, r10087; +} +{ +add.f16x2 r10145, r8312, r10142; +} +{ +sub.f16x2 r10148, r9693, r9709; +} +{ +mul.f16x2 r10151, r10148, r10089; +} +{ +sub.f16x2 %23, r10145, r10151; +} +{ +add.f16x2 r10157, r9699, r9715; +} +{ +mul.f16x2 r10160, r10157, r10087; +} +{ +add.f16x2 r10163, r8312, r10160; +} +{ +sub.f16x2 r10166, r9693, r9709; +} +{ +mul.f16x2 r10169, r10166, r10089; +} +{ +add.f16x2 %41, r10163, r10169; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10176, {low, high}; +} +{ +neg.f16x2 r10177, r10176; +} +{ +add.f16x2 r10179, r9725, r9741; +} +{ +add.f16x2 %6, r8154, r10179; +} +{ +add.f16x2 r10185, r9731, r9747; +} +{ +add.f16x2 %7, r8190, r10185; +} +{ +add.f16x2 r10191, r9725, r9741; +} +{ +mul.f16x2 r10194, r10191, r10175; +} +{ +add.f16x2 r10197, r8154, r10194; +} +{ +sub.f16x2 r10200, r9731, r9747; +} +{ +mul.f16x2 r10203, r10200, r10177; +} +{ +add.f16x2 %24, r10197, r10203; +} +{ +add.f16x2 r10209, r9725, r9741; +} +{ +mul.f16x2 r10212, r10209, r10175; +} +{ +add.f16x2 r10215, r8154, r10212; +} +{ +sub.f16x2 r10218, r9731, r9747; +} +{ +mul.f16x2 r10221, r10218, r10177; +} +{ +sub.f16x2 %42, r10215, r10221; +} +{ +add.f16x2 r10227, r9731, r9747; +} +{ +mul.f16x2 r10230, r10227, r10175; +} +{ +add.f16x2 r10233, r8190, r10230; +} +{ +sub.f16x2 r10236, r9725, r9741; +} +{ +mul.f16x2 r10239, r10236, r10177; +} +{ +sub.f16x2 %25, r10233, r10239; +} +{ +add.f16x2 r10245, r9731, r9747; +} +{ 
+mul.f16x2 r10248, r10245, r10175; +} +{ +add.f16x2 r10251, r8190, r10248; +} +{ +sub.f16x2 r10254, r9725, r9741; +} +{ +mul.f16x2 r10257, r10254, r10177; +} +{ +add.f16x2 %43, r10251, r10257; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10264, {low, high}; +} +{ +neg.f16x2 r10265, r10264; +} +{ +add.f16x2 r10267, r9757, r9773; +} +{ +add.f16x2 %8, r8242, r10267; +} +{ +add.f16x2 r10273, r9763, r9779; +} +{ +add.f16x2 %9, r8278, r10273; +} +{ +add.f16x2 r10279, r9757, r9773; +} +{ +mul.f16x2 r10282, r10279, r10263; +} +{ +add.f16x2 r10285, r8242, r10282; +} +{ +sub.f16x2 r10288, r9763, r9779; +} +{ +mul.f16x2 r10291, r10288, r10265; +} +{ +add.f16x2 %26, r10285, r10291; +} +{ +add.f16x2 r10297, r9757, r9773; +} +{ +mul.f16x2 r10300, r10297, r10263; +} +{ +add.f16x2 r10303, r8242, r10300; +} +{ +sub.f16x2 r10306, r9763, r9779; +} +{ +mul.f16x2 r10309, r10306, r10265; +} +{ +sub.f16x2 %44, r10303, r10309; +} +{ +add.f16x2 r10315, r9763, r9779; +} +{ +mul.f16x2 r10318, r10315, r10263; +} +{ +add.f16x2 r10321, r8278, r10318; +} +{ +sub.f16x2 r10324, r9757, r9773; +} +{ +mul.f16x2 r10327, r10324, r10265; +} +{ +sub.f16x2 %27, r10321, r10327; +} +{ +add.f16x2 r10333, r9763, r9779; +} +{ +mul.f16x2 r10336, r10333, r10263; +} +{ +add.f16x2 r10339, r8278, r10336; +} +{ +sub.f16x2 r10342, r9757, r9773; +} +{ +mul.f16x2 r10345, r10342, r10265; +} +{ +add.f16x2 %45, r10339, r10345; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10351, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10352, {low, high}; +} +{ +neg.f16x2 r10353, r10352; +} +{ +add.f16x2 r10355, r9789, r9805; +} +{ +add.f16x2 %10, r8330, r10355; +} +{ +add.f16x2 r10361, r9795, r9811; +} +{ +add.f16x2 %11, r8366, r10361; +} +{ +add.f16x2 r10367, 
r9789, r9805; +} +{ +mul.f16x2 r10370, r10367, r10351; +} +{ +add.f16x2 r10373, r8330, r10370; +} +{ +sub.f16x2 r10376, r9795, r9811; +} +{ +mul.f16x2 r10379, r10376, r10353; +} +{ +add.f16x2 %28, r10373, r10379; +} +{ +add.f16x2 r10385, r9789, r9805; +} +{ +mul.f16x2 r10388, r10385, r10351; +} +{ +add.f16x2 r10391, r8330, r10388; +} +{ +sub.f16x2 r10394, r9795, r9811; +} +{ +mul.f16x2 r10397, r10394, r10353; +} +{ +sub.f16x2 %46, r10391, r10397; +} +{ +add.f16x2 r10403, r9795, r9811; +} +{ +mul.f16x2 r10406, r10403, r10351; +} +{ +add.f16x2 r10409, r8366, r10406; +} +{ +sub.f16x2 r10412, r9789, r9805; +} +{ +mul.f16x2 r10415, r10412, r10353; +} +{ +sub.f16x2 %29, r10409, r10415; +} +{ +add.f16x2 r10421, r9795, r9811; +} +{ +mul.f16x2 r10424, r10421, r10351; +} +{ +add.f16x2 r10427, r8366, r10424; +} +{ +sub.f16x2 r10430, r9789, r9805; +} +{ +mul.f16x2 r10433, r10430, r10353; +} +{ +add.f16x2 %47, r10427, r10433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10440, {low, high}; +} +{ +neg.f16x2 r10441, r10440; +} +{ +add.f16x2 r10443, r9821, r9837; +} +{ +add.f16x2 %12, r8172, r10443; +} +{ +add.f16x2 r10449, r9827, r9843; +} +{ +add.f16x2 %13, r8208, r10449; +} +{ +add.f16x2 r10455, r9821, r9837; +} +{ +mul.f16x2 r10458, r10455, r10439; +} +{ +add.f16x2 r10461, r8172, r10458; +} +{ +sub.f16x2 r10464, r9827, r9843; +} +{ +mul.f16x2 r10467, r10464, r10441; +} +{ +add.f16x2 %30, r10461, r10467; +} +{ +add.f16x2 r10473, r9821, r9837; +} +{ +mul.f16x2 r10476, r10473, r10439; +} +{ +add.f16x2 r10479, r8172, r10476; +} +{ +sub.f16x2 r10482, r9827, r9843; +} +{ +mul.f16x2 r10485, r10482, r10441; +} +{ +sub.f16x2 %48, r10479, r10485; +} +{ +add.f16x2 r10491, r9827, r9843; +} +{ +mul.f16x2 r10494, r10491, r10439; +} +{ +add.f16x2 r10497, r8208, r10494; +} +{ +sub.f16x2 r10500, r9821, r9837; +} +{ +mul.f16x2 
r10503, r10500, r10441; +} +{ +sub.f16x2 %31, r10497, r10503; +} +{ +add.f16x2 r10509, r9827, r9843; +} +{ +mul.f16x2 r10512, r10509, r10439; +} +{ +add.f16x2 r10515, r8208, r10512; +} +{ +sub.f16x2 r10518, r9821, r9837; +} +{ +mul.f16x2 r10521, r10518, r10441; +} +{ +add.f16x2 %49, r10515, r10521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10528, {low, high}; +} +{ +neg.f16x2 r10529, r10528; +} +{ +add.f16x2 r10531, r9853, r9869; +} +{ +add.f16x2 %14, r8260, r10531; +} +{ +add.f16x2 r10537, r9859, r9875; +} +{ +add.f16x2 %15, r8296, r10537; +} +{ +add.f16x2 r10543, r9853, r9869; +} +{ +mul.f16x2 r10546, r10543, r10527; +} +{ +add.f16x2 r10549, r8260, r10546; +} +{ +sub.f16x2 r10552, r9859, r9875; +} +{ +mul.f16x2 r10555, r10552, r10529; +} +{ +add.f16x2 %32, r10549, r10555; +} +{ +add.f16x2 r10561, r9853, r9869; +} +{ +mul.f16x2 r10564, r10561, r10527; +} +{ +add.f16x2 r10567, r8260, r10564; +} +{ +sub.f16x2 r10570, r9859, r9875; +} +{ +mul.f16x2 r10573, r10570, r10529; +} +{ +sub.f16x2 %50, r10567, r10573; +} +{ +add.f16x2 r10579, r9859, r9875; +} +{ +mul.f16x2 r10582, r10579, r10527; +} +{ +add.f16x2 r10585, r8296, r10582; +} +{ +sub.f16x2 r10588, r9853, r9869; +} +{ +mul.f16x2 r10591, r10588, r10529; +} +{ +sub.f16x2 %33, r10585, r10591; +} +{ +add.f16x2 r10597, r9859, r9875; +} +{ +mul.f16x2 r10600, r10597, r10527; +} +{ +add.f16x2 r10603, r8296, r10600; +} +{ +sub.f16x2 r10606, r9853, r9869; +} +{ +mul.f16x2 r10609, r10606, r10529; +} +{ +add.f16x2 %51, r10603, r10609; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10615, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10616, {low, high}; +} +{ +neg.f16x2 r10617, r10616; +} +{ +add.f16x2 r10619, r9885, r9901; +} +{ +add.f16x2 %16, r8348, 
r10619; +} +{ +add.f16x2 r10625, r9891, r9907; +} +{ +add.f16x2 %17, r8384, r10625; +} +{ +add.f16x2 r10631, r9885, r9901; +} +{ +mul.f16x2 r10634, r10631, r10615; +} +{ +add.f16x2 r10637, r8348, r10634; +} +{ +sub.f16x2 r10640, r9891, r9907; +} +{ +mul.f16x2 r10643, r10640, r10617; +} +{ +add.f16x2 %34, r10637, r10643; +} +{ +add.f16x2 r10649, r9885, r9901; +} +{ +mul.f16x2 r10652, r10649, r10615; +} +{ +add.f16x2 r10655, r8348, r10652; +} +{ +sub.f16x2 r10658, r9891, r9907; +} +{ +mul.f16x2 r10661, r10658, r10617; +} +{ +sub.f16x2 %52, r10655, r10661; +} +{ +add.f16x2 r10667, r9891, r9907; +} +{ +mul.f16x2 r10670, r10667, r10615; +} +{ +add.f16x2 r10673, r8384, r10670; +} +{ +sub.f16x2 r10676, r9885, r9901; +} +{ +mul.f16x2 r10679, r10676, r10617; +} +{ +sub.f16x2 %35, r10673, r10679; +} +{ +add.f16x2 r10685, r9891, r9907; +} +{ +mul.f16x2 r10688, r10685, r10615; +} +{ +add.f16x2 r10691, r8384, r10688; +} +{ +sub.f16x2 r10694, r9885, r9901; +} +{ +mul.f16x2 r10697, r10694, r10617; +} +{ +add.f16x2 %53, r10691, r10697; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), 
"=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), 
"r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..d3ba67c4efcd2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp16_inv.hpp.inc @@ -0,0 +1,25779 @@ +#ifndef CUFFTDX_FFT_19683_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_19683_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1183, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1043>; +.reg .b32 r<10616>; +.reg .b64 rd<6>; +mov.u32 r10542, %54; +mov.u32 r10615, %tid.y; +mad.lo.s32 r10543, r10615, 157464, r10542; +mov.u32 r10544, %tid.x; +mov.f32 f1034, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1, {low, high}; +} +mov.f32 f1036, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, 
%81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f906, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r259, {low, high}; +} +mov.f32 f908, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r260, {low, high}; +} +mov.f32 f918, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r261, {low, high}; +} +mov.f32 f920, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r262, {low, high}; +} +mov.f32 f942, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r265, {low, high}; +} +mov.f32 f944, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 
r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; 
+mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ 
+add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ 
+mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r856, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ 
+mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ 
+add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 
r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ 
+add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} 
+{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 
r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} 
+mov.f32 f898, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r1789, {low, high}; +} +mov.f32 f900, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r1790, {low, high}; +} +mov.f32 f902, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r1791, {low, high}; +} +mov.f32 f904, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1794, {low, high}; +} +mov.f32 f910, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r1795, {low, high}; +} +mov.f32 f912, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r1796, {low, high}; +} +mov.f32 f914, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r1797, {low, high}; +} +mov.f32 f916, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1800, {low, high}; +} +mov.f32 f922, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r1801, {low, high}; +} +mov.f32 f924, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r1802, {low, high}; +} +mov.f32 f926, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r1803, {low, high}; +} +mov.f32 f928, 
0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r1804, {low, high}; +} +mov.f32 f934, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r1807, {low, high}; +} +mov.f32 f936, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r1808, {low, high}; +} +mov.f32 f958, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1812, {low, high}; +} +mov.f32 f950, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r1815, {low, high}; +} +mov.f32 f952, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r1819, {low, high}; +} +mov.f32 f960, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 
r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 
r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; 
+} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, 
r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, 
r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 
r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ 
+add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r10544, 
1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r10545, rd3; +mul.lo.s32 r10546, r10545, 729; +sub.s32 r10547, r10544, r10546; +cvt.rn.f32.u32 f1037, r10547; +mul.f32 f1038, f1037, 0f39A75CD5; +cos.approx.f32 f309, f1038; +sin.approx.f32 f1039, f1038; +neg.f32 f310, f1039; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, 
r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ 
+fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, 
r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, 
r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; 
+mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ 
+mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r10548, r10545, 157464, r10543; +barrier.sync 0; +mad.lo.s32 r10549, r10547, 216, r10548; +st.shared.v2.f32 [r10549], {r2102, r2108}; +st.shared.v2.f32 [r10549+8], {r2881, r2890}; +st.shared.v2.f32 [r10549+16], {r2918, r2927}; +st.shared.v2.f32 [r10549+24], {r2955, r2964}; +st.shared.v2.f32 [r10549+32], {r2992, r3001}; +st.shared.v2.f32 [r10549+40], {r3029, r3038}; +st.shared.v2.f32 [r10549+48], {r3066, r3075}; +st.shared.v2.f32 [r10549+56], {r3103, r3112}; +st.shared.v2.f32 [r10549+64], {r3140, r3149}; +st.shared.v2.f32 [r10549+72], {r3177, r3186}; +st.shared.v2.f32 [r10549+80], {r3214, r3223}; +st.shared.v2.f32 [r10549+88], {r3251, r3260}; +st.shared.v2.f32 [r10549+96], {r3288, r3297}; +st.shared.v2.f32 [r10549+104], {r3325, r3334}; +st.shared.v2.f32 [r10549+112], {r3362, r3371}; +st.shared.v2.f32 [r10549+120], {r3399, r3408}; +st.shared.v2.f32 [r10549+128], {r3436, r3445}; +st.shared.v2.f32 [r10549+136], {r3473, r3482}; +st.shared.v2.f32 [r10549+144], {r3510, r3519}; +st.shared.v2.f32 [r10549+152], {r3547, r3556}; +st.shared.v2.f32 [r10549+160], {r3584, r3593}; +st.shared.v2.f32 [r10549+168], {r3621, r3630}; +st.shared.v2.f32 [r10549+176], {r3658, r3667}; +st.shared.v2.f32 [r10549+184], {r3695, r3704}; +st.shared.v2.f32 [r10549+192], {r3732, r3741}; +st.shared.v2.f32 [r10549+200], {r3769, r3778}; +st.shared.v2.f32 [r10549+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r10550, 
r10547, -208, r10549; +ld.shared.u32 r3842, [r10550]; +ld.shared.u32 r3848, [r10550+4]; +ld.shared.u32 r4438, [r10550+5832]; +ld.shared.u32 r4444, [r10550+5836]; +ld.shared.u32 r5034, [r10550+11664]; +ld.shared.u32 r5040, [r10550+11668]; +ld.shared.u32 r3928, [r10550+17496]; +ld.shared.u32 r3934, [r10550+17500]; +ld.shared.u32 r4524, [r10550+23328]; +ld.shared.u32 r4530, [r10550+23332]; +ld.shared.u32 r5120, [r10550+29160]; +ld.shared.u32 r5126, [r10550+29164]; +ld.shared.u32 r4014, [r10550+34992]; +ld.shared.u32 r4020, [r10550+34996]; +ld.shared.u32 r4610, [r10550+40824]; +ld.shared.u32 r4616, [r10550+40828]; +ld.shared.u32 r5206, [r10550+46656]; +ld.shared.u32 r5212, [r10550+46660]; +ld.shared.u32 r3839, [r10550+52488]; +ld.shared.u32 r3845, [r10550+52492]; +ld.shared.u32 r4435, [r10550+58320]; +ld.shared.u32 r4441, [r10550+58324]; +ld.shared.u32 r5031, [r10550+64152]; +ld.shared.u32 r5037, [r10550+64156]; +ld.shared.u32 r3925, [r10550+69984]; +ld.shared.u32 r3931, [r10550+69988]; +ld.shared.u32 r4521, [r10550+75816]; +ld.shared.u32 r4527, [r10550+75820]; +ld.shared.u32 r5117, [r10550+81648]; +ld.shared.u32 r5123, [r10550+81652]; +ld.shared.u32 r4011, [r10550+87480]; +ld.shared.u32 r4017, [r10550+87484]; +ld.shared.u32 r4607, [r10550+93312]; +ld.shared.u32 r4613, [r10550+93316]; +ld.shared.u32 r5203, [r10550+99144]; +ld.shared.u32 r5209, [r10550+99148]; +ld.shared.u32 r3840, [r10550+104976]; +ld.shared.u32 r3846, [r10550+104980]; +ld.shared.u32 r4436, [r10550+110808]; +ld.shared.u32 r4442, [r10550+110812]; +ld.shared.u32 r5032, [r10550+116640]; +ld.shared.u32 r5038, [r10550+116644]; +ld.shared.u32 r3926, [r10550+122472]; +ld.shared.u32 r3932, [r10550+122476]; +ld.shared.u32 r4522, [r10550+128304]; +ld.shared.u32 r4528, [r10550+128308]; +ld.shared.u32 r5118, [r10550+134136]; +ld.shared.u32 r5124, [r10550+134140]; +ld.shared.u32 r4012, [r10550+139968]; +ld.shared.u32 r4018, [r10550+139972]; +ld.shared.u32 r4608, [r10550+145800]; +ld.shared.u32 r4614, 
[r10550+145804]; +ld.shared.u32 r5204, [r10550+151632]; +ld.shared.u32 r5210, [r10550+151636]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ 
+add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, 
r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ 
+add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, 
r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; 
+mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ 
+mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ 
+sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ 
+add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, 
f1034; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, 
r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; 
+mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; 
+cvt.rn.f16.f32 high, f944; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 
r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, 
r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ 
+mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, 
r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ 
+mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, 
r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ 
+mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r10547, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r10551, rd5; +sub.s32 r10552, r10547, r10551; +shr.u32 r10553, r10552, 1; +add.s32 r10554, r10553, r10551; +shr.u32 r10555, r10554, 4; +cvt.rn.f32.u32 f1040, r10555; +mul.f32 f1041, f1040, 0f3C0D3654; +cos.approx.f32 f673, f1041; +sin.approx.f32 f1042, f1041; +neg.f32 f674, f1042; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +mul.lo.s32 r10556, r10555, 27; +sub.s32 r10557, r10547, r10556; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, 
r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, r6857, r6871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 
high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6955, {low, high}; +} +{ +mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ 
+mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ +mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ +mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 
r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ +fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7295, {high, low}; +} +{ +fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ 
+mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ +mul.f16x2 r7379, r6101, r7377; +} +{ +fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; 
+mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ +fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, 
r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ +fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ 
+fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +shl.b32 r10558, r10557, 3; +add.s32 r10559, r10548, r10558; +barrier.sync 0; +mad.lo.s32 r10560, r10555, 5832, r10559; +st.shared.u32 [r10560], r5937; +st.shared.u32 [r10560+4], r5943; +st.shared.u32 [r10560+216], r6716; +st.shared.u32 [r10560+220], r6725; +st.shared.u32 [r10560+432], r6753; +st.shared.u32 [r10560+436], r6762; +st.shared.u32 [r10560+648], r6790; +st.shared.u32 [r10560+652], r6799; +st.shared.u32 [r10560+864], r6827; +st.shared.u32 [r10560+868], r6836; +st.shared.u32 [r10560+1080], r6864; +st.shared.u32 [r10560+1084], r6873; +st.shared.u32 [r10560+1296], r6901; +st.shared.u32 [r10560+1300], r6910; +st.shared.u32 [r10560+1512], r6938; +st.shared.u32 [r10560+1516], r6947; +st.shared.u32 [r10560+1728], r6975; +st.shared.u32 [r10560+1732], r6984; +st.shared.u32 [r10560+1944], r7012; +st.shared.u32 [r10560+1948], r7021; +st.shared.u32 [r10560+2160], r7049; +st.shared.u32 [r10560+2164], r7058; +st.shared.u32 [r10560+2376], r7086; +st.shared.u32 [r10560+2380], r7095; +st.shared.u32 [r10560+2592], r7123; +st.shared.u32 [r10560+2596], r7132; +st.shared.u32 [r10560+2808], r7160; +st.shared.u32 [r10560+2812], r7169; +st.shared.u32 [r10560+3024], r7197; +st.shared.u32 [r10560+3028], r7206; +st.shared.u32 [r10560+3240], r7234; +st.shared.u32 [r10560+3244], r7243; +st.shared.u32 [r10560+3456], r7271; +st.shared.u32 [r10560+3460], r7280; +st.shared.u32 [r10560+3672], r7308; +st.shared.u32 [r10560+3676], r7317; +st.shared.u32 [r10560+3888], r7345; +st.shared.u32 [r10560+3892], r7354; +st.shared.u32 [r10560+4104], r7382; +st.shared.u32 [r10560+4108], r7391; +st.shared.u32 [r10560+4320], r7419; +st.shared.u32 [r10560+4324], r7428; +st.shared.u32 [r10560+4536], r7456; +st.shared.u32 [r10560+4540], r7465; +st.shared.u32 [r10560+4752], r7493; +st.shared.u32 [r10560+4756], r7502; +st.shared.u32 [r10560+4968], r7530; +st.shared.u32 [r10560+4972], r7539; +st.shared.u32 [r10560+5184], r7567; +st.shared.u32 [r10560+5188], r7576; 
+st.shared.u32 [r10560+5400], r7604; +st.shared.u32 [r10560+5404], r7613; +st.shared.u32 [r10560+5616], r7641; +st.shared.u32 [r10560+5620], r7650; +barrier.sync 0; +ld.shared.u32 r7677, [r10550]; +ld.shared.u32 r7683, [r10550+4]; +ld.shared.u32 r8273, [r10550+5832]; +ld.shared.u32 r8279, [r10550+5836]; +ld.shared.u32 r8869, [r10550+11664]; +ld.shared.u32 r8875, [r10550+11668]; +ld.shared.u32 r7763, [r10550+17496]; +ld.shared.u32 r7769, [r10550+17500]; +ld.shared.u32 r8359, [r10550+23328]; +ld.shared.u32 r8365, [r10550+23332]; +ld.shared.u32 r8955, [r10550+29160]; +ld.shared.u32 r8961, [r10550+29164]; +ld.shared.u32 r7849, [r10550+34992]; +ld.shared.u32 r7855, [r10550+34996]; +ld.shared.u32 r8445, [r10550+40824]; +ld.shared.u32 r8451, [r10550+40828]; +ld.shared.u32 r9041, [r10550+46656]; +ld.shared.u32 r9047, [r10550+46660]; +ld.shared.u32 r7674, [r10550+52488]; +ld.shared.u32 r7680, [r10550+52492]; +ld.shared.u32 r8270, [r10550+58320]; +ld.shared.u32 r8276, [r10550+58324]; +ld.shared.u32 r8866, [r10550+64152]; +ld.shared.u32 r8872, [r10550+64156]; +ld.shared.u32 r7760, [r10550+69984]; +ld.shared.u32 r7766, [r10550+69988]; +ld.shared.u32 r8356, [r10550+75816]; +ld.shared.u32 r8362, [r10550+75820]; +ld.shared.u32 r8952, [r10550+81648]; +ld.shared.u32 r8958, [r10550+81652]; +ld.shared.u32 r7846, [r10550+87480]; +ld.shared.u32 r7852, [r10550+87484]; +ld.shared.u32 r8442, [r10550+93312]; +ld.shared.u32 r8448, [r10550+93316]; +ld.shared.u32 r9038, [r10550+99144]; +ld.shared.u32 r9044, [r10550+99148]; +ld.shared.u32 r7675, [r10550+104976]; +ld.shared.u32 r7681, [r10550+104980]; +ld.shared.u32 r8271, [r10550+110808]; +ld.shared.u32 r8277, [r10550+110812]; +ld.shared.u32 r8867, [r10550+116640]; +ld.shared.u32 r8873, [r10550+116644]; +ld.shared.u32 r7761, [r10550+122472]; +ld.shared.u32 r7767, [r10550+122476]; +ld.shared.u32 r8357, [r10550+128304]; +ld.shared.u32 r8363, [r10550+128308]; +ld.shared.u32 r8953, [r10550+134136]; +ld.shared.u32 r8959, [r10550+134140]; 
+ld.shared.u32 r7847, [r10550+139968]; +ld.shared.u32 r7853, [r10550+139972]; +ld.shared.u32 r8443, [r10550+145800]; +ld.shared.u32 r8449, [r10550+145804]; +ld.shared.u32 r9039, [r10550+151632]; +ld.shared.u32 r9045, [r10550+151636]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 r7676, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 r7682, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 r7700, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 r7736, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ +mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7758, {low, high}; +} +{ +add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 r7762, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 r7768, r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 
r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 r7804, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 r7822, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 r7840, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 r7848, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 r7854, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 r7872, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; +} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 r7890, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 r7908, r7899, r7905; +} +{ +add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} 
+{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, r7844; +} +{ +add.f16x2 r7926, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r7930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r7931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r7932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r7935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r7936, {low, high}; +} +{ +mul.f16x2 r7945, r7786, r7929; +} +{ +mul.f16x2 r7948, r7822, r7930; +} +{ +sub.f16x2 r7951, r7945, r7948; +} +{ +mul.f16x2 r7954, r7786, r7930; +} +{ +fma.rn.f16x2 r7957, r7822, r7929, r7954; +} +{ +mul.f16x2 r7961, r7872, r7931; +} +{ +mul.f16x2 r7964, r7908, r7932; +} +{ +sub.f16x2 r7967, r7961, r7964; +} +{ +mul.f16x2 r7970, r7872, r7932; +} +{ +fma.rn.f16x2 r7973, r7908, r7931, r7970; +} +{ +mul.f16x2 r7977, r7804, r7931; +} +{ +mul.f16x2 r7980, r7840, r7932; +} +{ +sub.f16x2 r7983, r7977, r7980; +} +{ +mul.f16x2 r7986, r7804, r7932; +} +{ +fma.rn.f16x2 r7989, r7840, r7931, r7986; +} +{ +mul.f16x2 r7993, r7890, r7935; +} +{ +mul.f16x2 r7996, r7926, r7936; +} +{ +sub.f16x2 r7999, r7993, r7996; +} +{ +mul.f16x2 r8002, r7890, r7936; +} +{ +fma.rn.f16x2 r8005, r7926, r7935, r8002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8010, {low, high}; +} +{ +add.f16x2 r8011, r7762, r7848; +} +{ +add.f16x2 r8014, r7676, r8011; +} +{ +add.f16x2 r8017, r7768, r7854; +} +{ +add.f16x2 r8020, r7682, r8017; +} +{ +add.f16x2 r8023, r7762, 
r7848; +} +{ +mul.f16x2 r8026, r8023, r8009; +} +{ +add.f16x2 r8029, r7676, r8026; +} +{ +sub.f16x2 r8032, r7768, r7854; +} +{ +mul.f16x2 r8035, r8032, r8010; +} +{ +add.f16x2 r8038, r8029, r8035; +} +{ +add.f16x2 r8041, r7762, r7848; +} +{ +mul.f16x2 r8044, r8041, r8009; +} +{ +add.f16x2 r8047, r7676, r8044; +} +{ +sub.f16x2 r8050, r7768, r7854; +} +{ +mul.f16x2 r8053, r8050, r8010; +} +{ +sub.f16x2 r8056, r8047, r8053; +} +{ +add.f16x2 r8059, r7768, r7854; +} +{ +mul.f16x2 r8062, r8059, r8009; +} +{ +add.f16x2 r8065, r7682, r8062; +} +{ +sub.f16x2 r8068, r7762, r7848; +} +{ +mul.f16x2 r8071, r8068, r8010; +} +{ +sub.f16x2 r8074, r8065, r8071; +} +{ +add.f16x2 r8077, r7768, r7854; +} +{ +mul.f16x2 r8080, r8077, r8009; +} +{ +add.f16x2 r8083, r7682, r8080; +} +{ +sub.f16x2 r8086, r7762, r7848; +} +{ +mul.f16x2 r8089, r8086, r8010; +} +{ +add.f16x2 r8092, r8083, r8089; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8096, {low, high}; +} +{ +add.f16x2 r8097, r7951, r7967; +} +{ +add.f16x2 r8100, r7700, r8097; +} +{ +add.f16x2 r8103, r7957, r7973; +} +{ +add.f16x2 r8106, r7736, r8103; +} +{ +add.f16x2 r8109, r7951, r7967; +} +{ +mul.f16x2 r8112, r8109, r8095; +} +{ +add.f16x2 r8115, r7700, r8112; +} +{ +sub.f16x2 r8118, r7957, r7973; +} +{ +mul.f16x2 r8121, r8118, r8096; +} +{ +add.f16x2 r8124, r8115, r8121; +} +{ +add.f16x2 r8127, r7951, r7967; +} +{ +mul.f16x2 r8130, r8127, r8095; +} +{ +add.f16x2 r8133, r7700, r8130; +} +{ +sub.f16x2 r8136, r7957, r7973; +} +{ +mul.f16x2 r8139, r8136, r8096; +} +{ +sub.f16x2 r8142, r8133, r8139; +} +{ +add.f16x2 r8145, r7957, r7973; +} +{ +mul.f16x2 r8148, r8145, r8095; +} +{ +add.f16x2 r8151, r7736, r8148; +} +{ +sub.f16x2 r8154, r7951, r7967; +} +{ +mul.f16x2 r8157, r8154, r8096; +} +{ +sub.f16x2 r8160, r8151, r8157; +} +{ +add.f16x2 r8163, r7957, r7973; +} +{ +mul.f16x2 
r8166, r8163, r8095; +} +{ +add.f16x2 r8169, r7736, r8166; +} +{ +sub.f16x2 r8172, r7951, r7967; +} +{ +mul.f16x2 r8175, r8172, r8096; +} +{ +add.f16x2 r8178, r8169, r8175; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8181, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8182, {low, high}; +} +{ +add.f16x2 r8183, r7983, r7999; +} +{ +add.f16x2 r8186, r7718, r8183; +} +{ +add.f16x2 r8189, r7989, r8005; +} +{ +add.f16x2 r8192, r7754, r8189; +} +{ +add.f16x2 r8195, r7983, r7999; +} +{ +mul.f16x2 r8198, r8195, r8181; +} +{ +add.f16x2 r8201, r7718, r8198; +} +{ +sub.f16x2 r8204, r7989, r8005; +} +{ +mul.f16x2 r8207, r8204, r8182; +} +{ +add.f16x2 r8210, r8201, r8207; +} +{ +add.f16x2 r8213, r7983, r7999; +} +{ +mul.f16x2 r8216, r8213, r8181; +} +{ +add.f16x2 r8219, r7718, r8216; +} +{ +sub.f16x2 r8222, r7989, r8005; +} +{ +mul.f16x2 r8225, r8222, r8182; +} +{ +sub.f16x2 r8228, r8219, r8225; +} +{ +add.f16x2 r8231, r7989, r8005; +} +{ +mul.f16x2 r8234, r8231, r8181; +} +{ +add.f16x2 r8237, r7754, r8234; +} +{ +sub.f16x2 r8240, r7983, r7999; +} +{ +mul.f16x2 r8243, r8240, r8182; +} +{ +sub.f16x2 r8246, r8237, r8243; +} +{ +add.f16x2 r8249, r7989, r8005; +} +{ +mul.f16x2 r8252, r8249, r8181; +} +{ +add.f16x2 r8255, r7754, r8252; +} +{ +sub.f16x2 r8258, r7983, r7999; +} +{ +mul.f16x2 r8261, r8258, r8182; +} +{ +add.f16x2 r8264, r8255, r8261; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8268, {low, high}; +} +{ +add.f16x2 r8269, r8270, r8271; +} +{ +add.f16x2 r8272, r8273, r8269; +} +{ +add.f16x2 r8275, r8276, r8277; +} +{ +add.f16x2 r8278, r8279, r8275; +} +{ +add.f16x2 r8281, r8270, r8271; +} +{ +mul.f16x2 r8284, r8281, r8267; +} +{ +add.f16x2 r8287, r8273, r8284; +} +{ +sub.f16x2 r8290, r8276, r8277; +} 
+{ +mul.f16x2 r8293, r8290, r8268; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +add.f16x2 r8299, r8270, r8271; +} +{ +mul.f16x2 r8302, r8299, r8267; +} +{ +add.f16x2 r8305, r8273, r8302; +} +{ +sub.f16x2 r8308, r8276, r8277; +} +{ +mul.f16x2 r8311, r8308, r8268; +} +{ +sub.f16x2 r8314, r8305, r8311; +} +{ +add.f16x2 r8317, r8276, r8277; +} +{ +mul.f16x2 r8320, r8317, r8267; +} +{ +add.f16x2 r8323, r8279, r8320; +} +{ +sub.f16x2 r8326, r8270, r8271; +} +{ +mul.f16x2 r8329, r8326, r8268; +} +{ +sub.f16x2 r8332, r8323, r8329; +} +{ +add.f16x2 r8335, r8276, r8277; +} +{ +mul.f16x2 r8338, r8335, r8267; +} +{ +add.f16x2 r8341, r8279, r8338; +} +{ +sub.f16x2 r8344, r8270, r8271; +} +{ +mul.f16x2 r8347, r8344, r8268; +} +{ +add.f16x2 r8350, r8341, r8347; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8354, {low, high}; +} +{ +add.f16x2 r8355, r8356, r8357; +} +{ +add.f16x2 r8358, r8359, r8355; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 r8364, r8365, r8361; +} +{ +add.f16x2 r8367, r8356, r8357; +} +{ +mul.f16x2 r8370, r8367, r8353; +} +{ +add.f16x2 r8373, r8359, r8370; +} +{ +sub.f16x2 r8376, r8362, r8363; +} +{ +mul.f16x2 r8379, r8376, r8354; +} +{ +add.f16x2 r8382, r8373, r8379; +} +{ +add.f16x2 r8385, r8356, r8357; +} +{ +mul.f16x2 r8388, r8385, r8353; +} +{ +add.f16x2 r8391, r8359, r8388; +} +{ +sub.f16x2 r8394, r8362, r8363; +} +{ +mul.f16x2 r8397, r8394, r8354; +} +{ +sub.f16x2 r8400, r8391, r8397; +} +{ +add.f16x2 r8403, r8362, r8363; +} +{ +mul.f16x2 r8406, r8403, r8353; +} +{ +add.f16x2 r8409, r8365, r8406; +} +{ +sub.f16x2 r8412, r8356, r8357; +} +{ +mul.f16x2 r8415, r8412, r8354; +} +{ +sub.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8362, r8363; +} +{ +mul.f16x2 r8424, r8421, r8353; +} +{ +add.f16x2 r8427, r8365, r8424; +} +{ +sub.f16x2 r8430, r8356, r8357; +} +{ +mul.f16x2 r8433, 
r8430, r8354; +} +{ +add.f16x2 r8436, r8427, r8433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8440, {low, high}; +} +{ +add.f16x2 r8441, r8442, r8443; +} +{ +add.f16x2 r8444, r8445, r8441; +} +{ +add.f16x2 r8447, r8448, r8449; +} +{ +add.f16x2 r8450, r8451, r8447; +} +{ +add.f16x2 r8453, r8442, r8443; +} +{ +mul.f16x2 r8456, r8453, r8439; +} +{ +add.f16x2 r8459, r8445, r8456; +} +{ +sub.f16x2 r8462, r8448, r8449; +} +{ +mul.f16x2 r8465, r8462, r8440; +} +{ +add.f16x2 r8468, r8459, r8465; +} +{ +add.f16x2 r8471, r8442, r8443; +} +{ +mul.f16x2 r8474, r8471, r8439; +} +{ +add.f16x2 r8477, r8445, r8474; +} +{ +sub.f16x2 r8480, r8448, r8449; +} +{ +mul.f16x2 r8483, r8480, r8440; +} +{ +sub.f16x2 r8486, r8477, r8483; +} +{ +add.f16x2 r8489, r8448, r8449; +} +{ +mul.f16x2 r8492, r8489, r8439; +} +{ +add.f16x2 r8495, r8451, r8492; +} +{ +sub.f16x2 r8498, r8442, r8443; +} +{ +mul.f16x2 r8501, r8498, r8440; +} +{ +sub.f16x2 r8504, r8495, r8501; +} +{ +add.f16x2 r8507, r8448, r8449; +} +{ +mul.f16x2 r8510, r8507, r8439; +} +{ +add.f16x2 r8513, r8451, r8510; +} +{ +sub.f16x2 r8516, r8442, r8443; +} +{ +mul.f16x2 r8519, r8516, r8440; +} +{ +add.f16x2 r8522, r8513, r8519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8526, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8528, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 
r8532, {low, high}; +} +{ +mul.f16x2 r8541, r8382, r8525; +} +{ +mul.f16x2 r8544, r8418, r8526; +} +{ +sub.f16x2 r8547, r8541, r8544; +} +{ +mul.f16x2 r8550, r8382, r8526; +} +{ +fma.rn.f16x2 r8553, r8418, r8525, r8550; +} +{ +mul.f16x2 r8557, r8468, r8527; +} +{ +mul.f16x2 r8560, r8504, r8528; +} +{ +sub.f16x2 r8563, r8557, r8560; +} +{ +mul.f16x2 r8566, r8468, r8528; +} +{ +fma.rn.f16x2 r8569, r8504, r8527, r8566; +} +{ +mul.f16x2 r8573, r8400, r8527; +} +{ +mul.f16x2 r8576, r8436, r8528; +} +{ +sub.f16x2 r8579, r8573, r8576; +} +{ +mul.f16x2 r8582, r8400, r8528; +} +{ +fma.rn.f16x2 r8585, r8436, r8527, r8582; +} +{ +mul.f16x2 r8589, r8486, r8531; +} +{ +mul.f16x2 r8592, r8522, r8532; +} +{ +sub.f16x2 r8595, r8589, r8592; +} +{ +mul.f16x2 r8598, r8486, r8532; +} +{ +fma.rn.f16x2 r8601, r8522, r8531, r8598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8606, {low, high}; +} +{ +add.f16x2 r8607, r8358, r8444; +} +{ +add.f16x2 r8610, r8272, r8607; +} +{ +add.f16x2 r8613, r8364, r8450; +} +{ +add.f16x2 r8616, r8278, r8613; +} +{ +add.f16x2 r8619, r8358, r8444; +} +{ +mul.f16x2 r8622, r8619, r8605; +} +{ +add.f16x2 r8625, r8272, r8622; +} +{ +sub.f16x2 r8628, r8364, r8450; +} +{ +mul.f16x2 r8631, r8628, r8606; +} +{ +add.f16x2 r8634, r8625, r8631; +} +{ +add.f16x2 r8637, r8358, r8444; +} +{ +mul.f16x2 r8640, r8637, r8605; +} +{ +add.f16x2 r8643, r8272, r8640; +} +{ +sub.f16x2 r8646, r8364, r8450; +} +{ +mul.f16x2 r8649, r8646, r8606; +} +{ +sub.f16x2 r8652, r8643, r8649; +} +{ +add.f16x2 r8655, r8364, r8450; +} +{ +mul.f16x2 r8658, r8655, r8605; +} +{ +add.f16x2 r8661, r8278, r8658; +} +{ +sub.f16x2 r8664, r8358, r8444; +} +{ +mul.f16x2 r8667, r8664, r8606; +} +{ +sub.f16x2 r8670, r8661, r8667; +} +{ +add.f16x2 r8673, r8364, r8450; +} +{ +mul.f16x2 r8676, r8673, r8605; +} +{ +add.f16x2 r8679, r8278, r8676; 
+} +{ +sub.f16x2 r8682, r8358, r8444; +} +{ +mul.f16x2 r8685, r8682, r8606; +} +{ +add.f16x2 r8688, r8679, r8685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8692, {low, high}; +} +{ +add.f16x2 r8693, r8547, r8563; +} +{ +add.f16x2 r8696, r8296, r8693; +} +{ +add.f16x2 r8699, r8553, r8569; +} +{ +add.f16x2 r8702, r8332, r8699; +} +{ +add.f16x2 r8705, r8547, r8563; +} +{ +mul.f16x2 r8708, r8705, r8691; +} +{ +add.f16x2 r8711, r8296, r8708; +} +{ +sub.f16x2 r8714, r8553, r8569; +} +{ +mul.f16x2 r8717, r8714, r8692; +} +{ +add.f16x2 r8720, r8711, r8717; +} +{ +add.f16x2 r8723, r8547, r8563; +} +{ +mul.f16x2 r8726, r8723, r8691; +} +{ +add.f16x2 r8729, r8296, r8726; +} +{ +sub.f16x2 r8732, r8553, r8569; +} +{ +mul.f16x2 r8735, r8732, r8692; +} +{ +sub.f16x2 r8738, r8729, r8735; +} +{ +add.f16x2 r8741, r8553, r8569; +} +{ +mul.f16x2 r8744, r8741, r8691; +} +{ +add.f16x2 r8747, r8332, r8744; +} +{ +sub.f16x2 r8750, r8547, r8563; +} +{ +mul.f16x2 r8753, r8750, r8692; +} +{ +sub.f16x2 r8756, r8747, r8753; +} +{ +add.f16x2 r8759, r8553, r8569; +} +{ +mul.f16x2 r8762, r8759, r8691; +} +{ +add.f16x2 r8765, r8332, r8762; +} +{ +sub.f16x2 r8768, r8547, r8563; +} +{ +mul.f16x2 r8771, r8768, r8692; +} +{ +add.f16x2 r8774, r8765, r8771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8777, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8778, {low, high}; +} +{ +add.f16x2 r8779, r8579, r8595; +} +{ +add.f16x2 r8782, r8314, r8779; +} +{ +add.f16x2 r8785, r8585, r8601; +} +{ +add.f16x2 r8788, r8350, r8785; +} +{ +add.f16x2 r8791, r8579, r8595; +} +{ +mul.f16x2 r8794, r8791, r8777; +} +{ +add.f16x2 r8797, r8314, r8794; +} +{ +sub.f16x2 r8800, r8585, r8601; +} +{ +mul.f16x2 r8803, r8800, r8778; +} +{ +add.f16x2 r8806, 
r8797, r8803; +} +{ +add.f16x2 r8809, r8579, r8595; +} +{ +mul.f16x2 r8812, r8809, r8777; +} +{ +add.f16x2 r8815, r8314, r8812; +} +{ +sub.f16x2 r8818, r8585, r8601; +} +{ +mul.f16x2 r8821, r8818, r8778; +} +{ +sub.f16x2 r8824, r8815, r8821; +} +{ +add.f16x2 r8827, r8585, r8601; +} +{ +mul.f16x2 r8830, r8827, r8777; +} +{ +add.f16x2 r8833, r8350, r8830; +} +{ +sub.f16x2 r8836, r8579, r8595; +} +{ +mul.f16x2 r8839, r8836, r8778; +} +{ +sub.f16x2 r8842, r8833, r8839; +} +{ +add.f16x2 r8845, r8585, r8601; +} +{ +mul.f16x2 r8848, r8845, r8777; +} +{ +add.f16x2 r8851, r8350, r8848; +} +{ +sub.f16x2 r8854, r8579, r8595; +} +{ +mul.f16x2 r8857, r8854, r8778; +} +{ +add.f16x2 r8860, r8851, r8857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8864, {low, high}; +} +{ +add.f16x2 r8865, r8866, r8867; +} +{ +add.f16x2 r8868, r8869, r8865; +} +{ +add.f16x2 r8871, r8872, r8873; +} +{ +add.f16x2 r8874, r8875, r8871; +} +{ +add.f16x2 r8877, r8866, r8867; +} +{ +mul.f16x2 r8880, r8877, r8863; +} +{ +add.f16x2 r8883, r8869, r8880; +} +{ +sub.f16x2 r8886, r8872, r8873; +} +{ +mul.f16x2 r8889, r8886, r8864; +} +{ +add.f16x2 r8892, r8883, r8889; +} +{ +add.f16x2 r8895, r8866, r8867; +} +{ +mul.f16x2 r8898, r8895, r8863; +} +{ +add.f16x2 r8901, r8869, r8898; +} +{ +sub.f16x2 r8904, r8872, r8873; +} +{ +mul.f16x2 r8907, r8904, r8864; +} +{ +sub.f16x2 r8910, r8901, r8907; +} +{ +add.f16x2 r8913, r8872, r8873; +} +{ +mul.f16x2 r8916, r8913, r8863; +} +{ +add.f16x2 r8919, r8875, r8916; +} +{ +sub.f16x2 r8922, r8866, r8867; +} +{ +mul.f16x2 r8925, r8922, r8864; +} +{ +sub.f16x2 r8928, r8919, r8925; +} +{ +add.f16x2 r8931, r8872, r8873; +} +{ +mul.f16x2 r8934, r8931, r8863; +} +{ +add.f16x2 r8937, r8875, r8934; +} +{ +sub.f16x2 r8940, r8866, r8867; +} +{ +mul.f16x2 r8943, r8940, r8864; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8949, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8950, {low, high}; +} +{ +add.f16x2 r8951, r8952, r8953; +} +{ +add.f16x2 r8954, r8955, r8951; +} +{ +add.f16x2 r8957, r8958, r8959; +} +{ +add.f16x2 r8960, r8961, r8957; +} +{ +add.f16x2 r8963, r8952, r8953; +} +{ +mul.f16x2 r8966, r8963, r8949; +} +{ +add.f16x2 r8969, r8955, r8966; +} +{ +sub.f16x2 r8972, r8958, r8959; +} +{ +mul.f16x2 r8975, r8972, r8950; +} +{ +add.f16x2 r8978, r8969, r8975; +} +{ +add.f16x2 r8981, r8952, r8953; +} +{ +mul.f16x2 r8984, r8981, r8949; +} +{ +add.f16x2 r8987, r8955, r8984; +} +{ +sub.f16x2 r8990, r8958, r8959; +} +{ +mul.f16x2 r8993, r8990, r8950; +} +{ +sub.f16x2 r8996, r8987, r8993; +} +{ +add.f16x2 r8999, r8958, r8959; +} +{ +mul.f16x2 r9002, r8999, r8949; +} +{ +add.f16x2 r9005, r8961, r9002; +} +{ +sub.f16x2 r9008, r8952, r8953; +} +{ +mul.f16x2 r9011, r9008, r8950; +} +{ +sub.f16x2 r9014, r9005, r9011; +} +{ +add.f16x2 r9017, r8958, r8959; +} +{ +mul.f16x2 r9020, r9017, r8949; +} +{ +add.f16x2 r9023, r8961, r9020; +} +{ +sub.f16x2 r9026, r8952, r8953; +} +{ +mul.f16x2 r9029, r9026, r8950; +} +{ +add.f16x2 r9032, r9023, r9029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9035, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9036, {low, high}; +} +{ +add.f16x2 r9037, r9038, r9039; +} +{ +add.f16x2 r9040, r9041, r9037; +} +{ +add.f16x2 r9043, r9044, r9045; +} +{ +add.f16x2 r9046, r9047, r9043; +} +{ +add.f16x2 r9049, r9038, r9039; +} +{ +mul.f16x2 r9052, r9049, r9035; +} +{ +add.f16x2 r9055, r9041, r9052; +} +{ +sub.f16x2 r9058, r9044, r9045; +} +{ +mul.f16x2 r9061, r9058, r9036; +} +{ +add.f16x2 r9064, r9055, r9061; +} +{ +add.f16x2 r9067, r9038, r9039; +} +{ +mul.f16x2 r9070, r9067, r9035; +} +{ +add.f16x2 r9073, 
r9041, r9070; +} +{ +sub.f16x2 r9076, r9044, r9045; +} +{ +mul.f16x2 r9079, r9076, r9036; +} +{ +sub.f16x2 r9082, r9073, r9079; +} +{ +add.f16x2 r9085, r9044, r9045; +} +{ +mul.f16x2 r9088, r9085, r9035; +} +{ +add.f16x2 r9091, r9047, r9088; +} +{ +sub.f16x2 r9094, r9038, r9039; +} +{ +mul.f16x2 r9097, r9094, r9036; +} +{ +sub.f16x2 r9100, r9091, r9097; +} +{ +add.f16x2 r9103, r9044, r9045; +} +{ +mul.f16x2 r9106, r9103, r9035; +} +{ +add.f16x2 r9109, r9047, r9106; +} +{ +sub.f16x2 r9112, r9038, r9039; +} +{ +mul.f16x2 r9115, r9112, r9036; +} +{ +add.f16x2 r9118, r9109, r9115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9124, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9128, {low, high}; +} +{ +mul.f16x2 r9137, r8978, r9121; +} +{ +mul.f16x2 r9140, r9014, r9122; +} +{ +sub.f16x2 r9143, r9137, r9140; +} +{ +mul.f16x2 r9146, r8978, r9122; +} +{ +fma.rn.f16x2 r9149, r9014, r9121, r9146; +} +{ +mul.f16x2 r9153, r9064, r9123; +} +{ +mul.f16x2 r9156, r9100, r9124; +} +{ +sub.f16x2 r9159, r9153, r9156; +} +{ +mul.f16x2 r9162, r9064, r9124; +} +{ +fma.rn.f16x2 r9165, r9100, r9123, r9162; +} +{ +mul.f16x2 r9169, r8996, r9123; +} +{ +mul.f16x2 r9172, r9032, r9124; +} +{ +sub.f16x2 r9175, r9169, r9172; +} +{ +mul.f16x2 r9178, r8996, r9124; +} +{ +fma.rn.f16x2 r9181, r9032, r9123, r9178; +} +{ +mul.f16x2 r9185, r9082, r9127; +} +{ +mul.f16x2 r9188, r9118, r9128; +} +{ +sub.f16x2 r9191, r9185, r9188; +} +{ +mul.f16x2 r9194, 
r9082, r9128; +} +{ +fma.rn.f16x2 r9197, r9118, r9127, r9194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9201, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9202, {low, high}; +} +{ +add.f16x2 r9203, r8954, r9040; +} +{ +add.f16x2 r9206, r8868, r9203; +} +{ +add.f16x2 r9209, r8960, r9046; +} +{ +add.f16x2 r9212, r8874, r9209; +} +{ +add.f16x2 r9215, r8954, r9040; +} +{ +mul.f16x2 r9218, r9215, r9201; +} +{ +add.f16x2 r9221, r8868, r9218; +} +{ +sub.f16x2 r9224, r8960, r9046; +} +{ +mul.f16x2 r9227, r9224, r9202; +} +{ +add.f16x2 r9230, r9221, r9227; +} +{ +add.f16x2 r9233, r8954, r9040; +} +{ +mul.f16x2 r9236, r9233, r9201; +} +{ +add.f16x2 r9239, r8868, r9236; +} +{ +sub.f16x2 r9242, r8960, r9046; +} +{ +mul.f16x2 r9245, r9242, r9202; +} +{ +sub.f16x2 r9248, r9239, r9245; +} +{ +add.f16x2 r9251, r8960, r9046; +} +{ +mul.f16x2 r9254, r9251, r9201; +} +{ +add.f16x2 r9257, r8874, r9254; +} +{ +sub.f16x2 r9260, r8954, r9040; +} +{ +mul.f16x2 r9263, r9260, r9202; +} +{ +sub.f16x2 r9266, r9257, r9263; +} +{ +add.f16x2 r9269, r8960, r9046; +} +{ +mul.f16x2 r9272, r9269, r9201; +} +{ +add.f16x2 r9275, r8874, r9272; +} +{ +sub.f16x2 r9278, r8954, r9040; +} +{ +mul.f16x2 r9281, r9278, r9202; +} +{ +add.f16x2 r9284, r9275, r9281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9288, {low, high}; +} +{ +add.f16x2 r9289, r9143, r9159; +} +{ +add.f16x2 r9292, r8892, r9289; +} +{ +add.f16x2 r9295, r9149, r9165; +} +{ +add.f16x2 r9298, r8928, r9295; +} +{ +add.f16x2 r9301, r9143, r9159; +} +{ +mul.f16x2 r9304, r9301, r9287; +} +{ +add.f16x2 r9307, r8892, r9304; +} +{ +sub.f16x2 r9310, r9149, r9165; +} +{ +mul.f16x2 r9313, r9310, r9288; +} +{ +add.f16x2 r9316, r9307, r9313; +} +{ +add.f16x2 r9319, r9143, r9159; 
+} +{ +mul.f16x2 r9322, r9319, r9287; +} +{ +add.f16x2 r9325, r8892, r9322; +} +{ +sub.f16x2 r9328, r9149, r9165; +} +{ +mul.f16x2 r9331, r9328, r9288; +} +{ +sub.f16x2 r9334, r9325, r9331; +} +{ +add.f16x2 r9337, r9149, r9165; +} +{ +mul.f16x2 r9340, r9337, r9287; +} +{ +add.f16x2 r9343, r8928, r9340; +} +{ +sub.f16x2 r9346, r9143, r9159; +} +{ +mul.f16x2 r9349, r9346, r9288; +} +{ +sub.f16x2 r9352, r9343, r9349; +} +{ +add.f16x2 r9355, r9149, r9165; +} +{ +mul.f16x2 r9358, r9355, r9287; +} +{ +add.f16x2 r9361, r8928, r9358; +} +{ +sub.f16x2 r9364, r9143, r9159; +} +{ +mul.f16x2 r9367, r9364, r9288; +} +{ +add.f16x2 r9370, r9361, r9367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9374, {low, high}; +} +{ +add.f16x2 r9375, r9175, r9191; +} +{ +add.f16x2 r9378, r8910, r9375; +} +{ +add.f16x2 r9381, r9181, r9197; +} +{ +add.f16x2 r9384, r8946, r9381; +} +{ +add.f16x2 r9387, r9175, r9191; +} +{ +mul.f16x2 r9390, r9387, r9373; +} +{ +add.f16x2 r9393, r8910, r9390; +} +{ +sub.f16x2 r9396, r9181, r9197; +} +{ +mul.f16x2 r9399, r9396, r9374; +} +{ +add.f16x2 r9402, r9393, r9399; +} +{ +add.f16x2 r9405, r9175, r9191; +} +{ +mul.f16x2 r9408, r9405, r9373; +} +{ +add.f16x2 r9411, r8910, r9408; +} +{ +sub.f16x2 r9414, r9181, r9197; +} +{ +mul.f16x2 r9417, r9414, r9374; +} +{ +sub.f16x2 r9420, r9411, r9417; +} +{ +add.f16x2 r9423, r9181, r9197; +} +{ +mul.f16x2 r9426, r9423, r9373; +} +{ +add.f16x2 r9429, r8946, r9426; +} +{ +sub.f16x2 r9432, r9175, r9191; +} +{ +mul.f16x2 r9435, r9432, r9374; +} +{ +sub.f16x2 r9438, r9429, r9435; +} +{ +add.f16x2 r9441, r9181, r9197; +} +{ +mul.f16x2 r9444, r9441, r9373; +} +{ +add.f16x2 r9447, r8946, r9444; +} +{ +sub.f16x2 r9450, r9175, r9191; +} +{ +mul.f16x2 r9453, r9450, r9374; +} +{ +add.f16x2 r9456, r9447, r9453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; 
+cvt.rn.f16.f32 high, f898; +mov.b32 r9459, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r9460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r9461, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r9462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9463, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9464, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r9465, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r9466, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r9467, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r9468, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9469, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9470, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r9471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r9472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r9473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r9474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r9477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r9478, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r9485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r9486, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r9489, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r9490, {low, high}; +} +{ +mul.f16x2 r9511, r8696, r9459; +} +{ +mul.f16x2 r9514, r8702, r9460; +} +{ +sub.f16x2 r9517, r9511, r9514; +} +{ +mul.f16x2 r9520, r8696, r9460; +} +{ +fma.rn.f16x2 r9523, r8702, r9459, r9520; +} +{ +mul.f16x2 r9527, r9292, r9461; +} +{ +mul.f16x2 r9530, r9298, r9462; +} +{ +sub.f16x2 r9533, r9527, r9530; +} +{ +mul.f16x2 r9536, r9292, r9462; +} +{ +fma.rn.f16x2 r9539, r9298, r9461, r9536; +} +{ +mul.f16x2 r9543, r8782, r9461; +} +{ +mul.f16x2 r9546, r8788, r9462; +} +{ +sub.f16x2 r9549, r9543, r9546; +} +{ +mul.f16x2 r9552, r8782, r9462; +} +{ +fma.rn.f16x2 r9555, r8788, r9461, r9552; +} +{ +mul.f16x2 r9559, r9378, r9465; +} +{ +mul.f16x2 r9562, r9384, r9466; +} +{ +sub.f16x2 r9565, r9559, r9562; +} +{ +mul.f16x2 r9568, r9378, r9466; +} +{ +fma.rn.f16x2 r9571, r9384, r9465, r9568; +} +{ +mul.f16x2 r9575, r8634, r9463; +} +{ +mul.f16x2 r9578, r8670, r9464; +} +{ +sub.f16x2 r9581, r9575, r9578; +} +{ +mul.f16x2 r9584, r8634, r9464; +} +{ +fma.rn.f16x2 r9587, r8670, r9463, r9584; +} +{ +mul.f16x2 r9591, r9230, r9469; +} +{ +mul.f16x2 r9594, r9266, r9470; +} +{ +sub.f16x2 r9597, r9591, r9594; +} +{ +mul.f16x2 r9600, r9230, r9470; +} +{ +fma.rn.f16x2 r9603, r9266, r9469, r9600; +} +{ +mul.f16x2 r9607, r8720, r9465; +} +{ +mul.f16x2 r9610, r8756, r9466; +} +{ +sub.f16x2 r9613, r9607, r9610; +} +{ +mul.f16x2 r9616, r8720, r9466; +} 
+{ +fma.rn.f16x2 r9619, r8756, r9465, r9616; +} +{ +mul.f16x2 r9623, r9316, r9473; +} +{ +mul.f16x2 r9626, r9352, r9474; +} +{ +sub.f16x2 r9629, r9623, r9626; +} +{ +mul.f16x2 r9632, r9316, r9474; +} +{ +fma.rn.f16x2 r9635, r9352, r9473, r9632; +} +{ +mul.f16x2 r9639, r8806, r9467; +} +{ +mul.f16x2 r9642, r8842, r9468; +} +{ +sub.f16x2 r9645, r9639, r9642; +} +{ +mul.f16x2 r9648, r8806, r9468; +} +{ +fma.rn.f16x2 r9651, r8842, r9467, r9648; +} +{ +mul.f16x2 r9655, r9402, r9477; +} +{ +mul.f16x2 r9658, r9438, r9478; +} +{ +sub.f16x2 r9661, r9655, r9658; +} +{ +mul.f16x2 r9664, r9402, r9478; +} +{ +fma.rn.f16x2 r9667, r9438, r9477, r9664; +} +{ +mul.f16x2 r9671, r8652, r9469; +} +{ +mul.f16x2 r9674, r8688, r9470; +} +{ +sub.f16x2 r9677, r9671, r9674; +} +{ +mul.f16x2 r9680, r8652, r9470; +} +{ +fma.rn.f16x2 r9683, r8688, r9469, r9680; +} +{ +mul.f16x2 r9687, r9248, r9481; +} +{ +mul.f16x2 r9690, r9284, r9482; +} +{ +sub.f16x2 r9693, r9687, r9690; +} +{ +mul.f16x2 r9696, r9248, r9482; +} +{ +fma.rn.f16x2 r9699, r9284, r9481, r9696; +} +{ +mul.f16x2 r9703, r8738, r9471; +} +{ +mul.f16x2 r9706, r8774, r9472; +} +{ +sub.f16x2 r9709, r9703, r9706; +} +{ +mul.f16x2 r9712, r8738, r9472; +} +{ +fma.rn.f16x2 r9715, r8774, r9471, r9712; +} +{ +mul.f16x2 r9719, r9334, r9485; +} +{ +mul.f16x2 r9722, r9370, r9486; +} +{ +sub.f16x2 r9725, r9719, r9722; +} +{ +mul.f16x2 r9728, r9334, r9486; +} +{ +fma.rn.f16x2 r9731, r9370, r9485, r9728; +} +{ +mul.f16x2 r9735, r8824, r9473; +} +{ +mul.f16x2 r9738, r8860, r9474; +} +{ +sub.f16x2 r9741, r9735, r9738; +} +{ +mul.f16x2 r9744, r8824, r9474; +} +{ +fma.rn.f16x2 r9747, r8860, r9473, r9744; +} +{ +mul.f16x2 r9751, r9420, r9489; +} +{ +mul.f16x2 r9754, r9456, r9490; +} +{ +sub.f16x2 r9757, r9751, r9754; +} +{ +mul.f16x2 r9760, r9420, r9490; +} +{ +fma.rn.f16x2 r9763, r9456, r9489, r9760; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9767, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9768, {low, high}; +} +{ +add.f16x2 r9769, r8610, r9206; +} +{ +add.f16x2 %0, r8014, r9769; +} +{ +add.f16x2 r9775, r8616, r9212; +} +{ +add.f16x2 %1, r8020, r9775; +} +{ +add.f16x2 r9781, r8610, r9206; +} +{ +mul.f16x2 r9784, r9781, r9767; +} +{ +add.f16x2 r9787, r8014, r9784; +} +{ +sub.f16x2 r9790, r8616, r9212; +} +{ +mul.f16x2 r9793, r9790, r9768; +} +{ +add.f16x2 %18, r9787, r9793; +} +{ +add.f16x2 r9799, r8610, r9206; +} +{ +mul.f16x2 r9802, r9799, r9767; +} +{ +add.f16x2 r9805, r8014, r9802; +} +{ +sub.f16x2 r9808, r8616, r9212; +} +{ +mul.f16x2 r9811, r9808, r9768; +} +{ +sub.f16x2 %36, r9805, r9811; +} +{ +add.f16x2 r9817, r8616, r9212; +} +{ +mul.f16x2 r9820, r9817, r9767; +} +{ +add.f16x2 r9823, r8020, r9820; +} +{ +sub.f16x2 r9826, r8610, r9206; +} +{ +mul.f16x2 r9829, r9826, r9768; +} +{ +sub.f16x2 %19, r9823, r9829; +} +{ +add.f16x2 r9835, r8616, r9212; +} +{ +mul.f16x2 r9838, r9835, r9767; +} +{ +add.f16x2 r9841, r8020, r9838; +} +{ +sub.f16x2 r9844, r8610, r9206; +} +{ +mul.f16x2 r9847, r9844, r9768; +} +{ +add.f16x2 %37, r9841, r9847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9854, {low, high}; +} +{ +add.f16x2 r9855, r9517, r9533; +} +{ +add.f16x2 %2, r8100, r9855; +} +{ +add.f16x2 r9861, r9523, r9539; +} +{ +add.f16x2 %3, r8106, r9861; +} +{ +add.f16x2 r9867, r9517, r9533; +} +{ +mul.f16x2 r9870, r9867, r9853; +} +{ +add.f16x2 r9873, r8100, r9870; +} +{ +sub.f16x2 r9876, r9523, r9539; +} +{ +mul.f16x2 r9879, r9876, r9854; +} +{ +add.f16x2 %20, r9873, r9879; +} +{ +add.f16x2 r9885, r9517, r9533; +} +{ +mul.f16x2 r9888, r9885, r9853; +} +{ +add.f16x2 r9891, r8100, r9888; +} +{ +sub.f16x2 r9894, r9523, r9539; +} +{ +mul.f16x2 r9897, r9894, r9854; +} +{ +sub.f16x2 %38, r9891, r9897; +} +{ +add.f16x2 r9903, r9523, r9539; 
+} +{ +mul.f16x2 r9906, r9903, r9853; +} +{ +add.f16x2 r9909, r8106, r9906; +} +{ +sub.f16x2 r9912, r9517, r9533; +} +{ +mul.f16x2 r9915, r9912, r9854; +} +{ +sub.f16x2 %21, r9909, r9915; +} +{ +add.f16x2 r9921, r9523, r9539; +} +{ +mul.f16x2 r9924, r9921, r9853; +} +{ +add.f16x2 r9927, r8106, r9924; +} +{ +sub.f16x2 r9930, r9517, r9533; +} +{ +mul.f16x2 r9933, r9930, r9854; +} +{ +add.f16x2 %39, r9927, r9933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9940, {low, high}; +} +{ +add.f16x2 r9941, r9549, r9565; +} +{ +add.f16x2 %4, r8186, r9941; +} +{ +add.f16x2 r9947, r9555, r9571; +} +{ +add.f16x2 %5, r8192, r9947; +} +{ +add.f16x2 r9953, r9549, r9565; +} +{ +mul.f16x2 r9956, r9953, r9939; +} +{ +add.f16x2 r9959, r8186, r9956; +} +{ +sub.f16x2 r9962, r9555, r9571; +} +{ +mul.f16x2 r9965, r9962, r9940; +} +{ +add.f16x2 %22, r9959, r9965; +} +{ +add.f16x2 r9971, r9549, r9565; +} +{ +mul.f16x2 r9974, r9971, r9939; +} +{ +add.f16x2 r9977, r8186, r9974; +} +{ +sub.f16x2 r9980, r9555, r9571; +} +{ +mul.f16x2 r9983, r9980, r9940; +} +{ +sub.f16x2 %40, r9977, r9983; +} +{ +add.f16x2 r9989, r9555, r9571; +} +{ +mul.f16x2 r9992, r9989, r9939; +} +{ +add.f16x2 r9995, r8192, r9992; +} +{ +sub.f16x2 r9998, r9549, r9565; +} +{ +mul.f16x2 r10001, r9998, r9940; +} +{ +sub.f16x2 %23, r9995, r10001; +} +{ +add.f16x2 r10007, r9555, r9571; +} +{ +mul.f16x2 r10010, r10007, r9939; +} +{ +add.f16x2 r10013, r8192, r10010; +} +{ +sub.f16x2 r10016, r9549, r9565; +} +{ +mul.f16x2 r10019, r10016, r9940; +} +{ +add.f16x2 %41, r10013, r10019; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10026, {low, high}; +} +{ +add.f16x2 r10027, r9581, r9597; +} +{ +add.f16x2 %6, 
r8038, r10027; +} +{ +add.f16x2 r10033, r9587, r9603; +} +{ +add.f16x2 %7, r8074, r10033; +} +{ +add.f16x2 r10039, r9581, r9597; +} +{ +mul.f16x2 r10042, r10039, r10025; +} +{ +add.f16x2 r10045, r8038, r10042; +} +{ +sub.f16x2 r10048, r9587, r9603; +} +{ +mul.f16x2 r10051, r10048, r10026; +} +{ +add.f16x2 %24, r10045, r10051; +} +{ +add.f16x2 r10057, r9581, r9597; +} +{ +mul.f16x2 r10060, r10057, r10025; +} +{ +add.f16x2 r10063, r8038, r10060; +} +{ +sub.f16x2 r10066, r9587, r9603; +} +{ +mul.f16x2 r10069, r10066, r10026; +} +{ +sub.f16x2 %42, r10063, r10069; +} +{ +add.f16x2 r10075, r9587, r9603; +} +{ +mul.f16x2 r10078, r10075, r10025; +} +{ +add.f16x2 r10081, r8074, r10078; +} +{ +sub.f16x2 r10084, r9581, r9597; +} +{ +mul.f16x2 r10087, r10084, r10026; +} +{ +sub.f16x2 %25, r10081, r10087; +} +{ +add.f16x2 r10093, r9587, r9603; +} +{ +mul.f16x2 r10096, r10093, r10025; +} +{ +add.f16x2 r10099, r8074, r10096; +} +{ +sub.f16x2 r10102, r9581, r9597; +} +{ +mul.f16x2 r10105, r10102, r10026; +} +{ +add.f16x2 %43, r10099, r10105; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10112, {low, high}; +} +{ +add.f16x2 r10113, r9613, r9629; +} +{ +add.f16x2 %8, r8124, r10113; +} +{ +add.f16x2 r10119, r9619, r9635; +} +{ +add.f16x2 %9, r8160, r10119; +} +{ +add.f16x2 r10125, r9613, r9629; +} +{ +mul.f16x2 r10128, r10125, r10111; +} +{ +add.f16x2 r10131, r8124, r10128; +} +{ +sub.f16x2 r10134, r9619, r9635; +} +{ +mul.f16x2 r10137, r10134, r10112; +} +{ +add.f16x2 %26, r10131, r10137; +} +{ +add.f16x2 r10143, r9613, r9629; +} +{ +mul.f16x2 r10146, r10143, r10111; +} +{ +add.f16x2 r10149, r8124, r10146; +} +{ +sub.f16x2 r10152, r9619, r9635; +} +{ +mul.f16x2 r10155, r10152, r10112; +} +{ +sub.f16x2 %44, r10149, r10155; +} +{ +add.f16x2 r10161, r9619, r9635; +} +{ +mul.f16x2 r10164, r10161, r10111; +} +{ +add.f16x2 
r10167, r8160, r10164; +} +{ +sub.f16x2 r10170, r9613, r9629; +} +{ +mul.f16x2 r10173, r10170, r10112; +} +{ +sub.f16x2 %27, r10167, r10173; +} +{ +add.f16x2 r10179, r9619, r9635; +} +{ +mul.f16x2 r10182, r10179, r10111; +} +{ +add.f16x2 r10185, r8160, r10182; +} +{ +sub.f16x2 r10188, r9613, r9629; +} +{ +mul.f16x2 r10191, r10188, r10112; +} +{ +add.f16x2 %45, r10185, r10191; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10197, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10198, {low, high}; +} +{ +add.f16x2 r10199, r9645, r9661; +} +{ +add.f16x2 %10, r8210, r10199; +} +{ +add.f16x2 r10205, r9651, r9667; +} +{ +add.f16x2 %11, r8246, r10205; +} +{ +add.f16x2 r10211, r9645, r9661; +} +{ +mul.f16x2 r10214, r10211, r10197; +} +{ +add.f16x2 r10217, r8210, r10214; +} +{ +sub.f16x2 r10220, r9651, r9667; +} +{ +mul.f16x2 r10223, r10220, r10198; +} +{ +add.f16x2 %28, r10217, r10223; +} +{ +add.f16x2 r10229, r9645, r9661; +} +{ +mul.f16x2 r10232, r10229, r10197; +} +{ +add.f16x2 r10235, r8210, r10232; +} +{ +sub.f16x2 r10238, r9651, r9667; +} +{ +mul.f16x2 r10241, r10238, r10198; +} +{ +sub.f16x2 %46, r10235, r10241; +} +{ +add.f16x2 r10247, r9651, r9667; +} +{ +mul.f16x2 r10250, r10247, r10197; +} +{ +add.f16x2 r10253, r8246, r10250; +} +{ +sub.f16x2 r10256, r9645, r9661; +} +{ +mul.f16x2 r10259, r10256, r10198; +} +{ +sub.f16x2 %29, r10253, r10259; +} +{ +add.f16x2 r10265, r9651, r9667; +} +{ +mul.f16x2 r10268, r10265, r10197; +} +{ +add.f16x2 r10271, r8246, r10268; +} +{ +sub.f16x2 r10274, r9645, r9661; +} +{ +mul.f16x2 r10277, r10274, r10198; +} +{ +add.f16x2 %47, r10271, r10277; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10284, {low, high}; +} +{ +add.f16x2 r10285, r9677, r9693; +} +{ +add.f16x2 
%12, r8056, r10285; +} +{ +add.f16x2 r10291, r9683, r9699; +} +{ +add.f16x2 %13, r8092, r10291; +} +{ +add.f16x2 r10297, r9677, r9693; +} +{ +mul.f16x2 r10300, r10297, r10283; +} +{ +add.f16x2 r10303, r8056, r10300; +} +{ +sub.f16x2 r10306, r9683, r9699; +} +{ +mul.f16x2 r10309, r10306, r10284; +} +{ +add.f16x2 %30, r10303, r10309; +} +{ +add.f16x2 r10315, r9677, r9693; +} +{ +mul.f16x2 r10318, r10315, r10283; +} +{ +add.f16x2 r10321, r8056, r10318; +} +{ +sub.f16x2 r10324, r9683, r9699; +} +{ +mul.f16x2 r10327, r10324, r10284; +} +{ +sub.f16x2 %48, r10321, r10327; +} +{ +add.f16x2 r10333, r9683, r9699; +} +{ +mul.f16x2 r10336, r10333, r10283; +} +{ +add.f16x2 r10339, r8092, r10336; +} +{ +sub.f16x2 r10342, r9677, r9693; +} +{ +mul.f16x2 r10345, r10342, r10284; +} +{ +sub.f16x2 %31, r10339, r10345; +} +{ +add.f16x2 r10351, r9683, r9699; +} +{ +mul.f16x2 r10354, r10351, r10283; +} +{ +add.f16x2 r10357, r8092, r10354; +} +{ +sub.f16x2 r10360, r9677, r9693; +} +{ +mul.f16x2 r10363, r10360, r10284; +} +{ +add.f16x2 %49, r10357, r10363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10369, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10370, {low, high}; +} +{ +add.f16x2 r10371, r9709, r9725; +} +{ +add.f16x2 %14, r8142, r10371; +} +{ +add.f16x2 r10377, r9715, r9731; +} +{ +add.f16x2 %15, r8178, r10377; +} +{ +add.f16x2 r10383, r9709, r9725; +} +{ +mul.f16x2 r10386, r10383, r10369; +} +{ +add.f16x2 r10389, r8142, r10386; +} +{ +sub.f16x2 r10392, r9715, r9731; +} +{ +mul.f16x2 r10395, r10392, r10370; +} +{ +add.f16x2 %32, r10389, r10395; +} +{ +add.f16x2 r10401, r9709, r9725; +} +{ +mul.f16x2 r10404, r10401, r10369; +} +{ +add.f16x2 r10407, r8142, r10404; +} +{ +sub.f16x2 r10410, r9715, r9731; +} +{ +mul.f16x2 r10413, r10410, r10370; +} +{ +sub.f16x2 %50, r10407, r10413; +} +{ +add.f16x2 r10419, r9715, r9731; +} +{ +mul.f16x2 r10422, r10419, r10369; +} +{ 
+add.f16x2 r10425, r8178, r10422; +} +{ +sub.f16x2 r10428, r9709, r9725; +} +{ +mul.f16x2 r10431, r10428, r10370; +} +{ +sub.f16x2 %33, r10425, r10431; +} +{ +add.f16x2 r10437, r9715, r9731; +} +{ +mul.f16x2 r10440, r10437, r10369; +} +{ +add.f16x2 r10443, r8178, r10440; +} +{ +sub.f16x2 r10446, r9709, r9725; +} +{ +mul.f16x2 r10449, r10446, r10370; +} +{ +add.f16x2 %51, r10443, r10449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10455, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10456, {low, high}; +} +{ +add.f16x2 r10457, r9741, r9757; +} +{ +add.f16x2 %16, r8228, r10457; +} +{ +add.f16x2 r10463, r9747, r9763; +} +{ +add.f16x2 %17, r8264, r10463; +} +{ +add.f16x2 r10469, r9741, r9757; +} +{ +mul.f16x2 r10472, r10469, r10455; +} +{ +add.f16x2 r10475, r8228, r10472; +} +{ +sub.f16x2 r10478, r9747, r9763; +} +{ +mul.f16x2 r10481, r10478, r10456; +} +{ +add.f16x2 %34, r10475, r10481; +} +{ +add.f16x2 r10487, r9741, r9757; +} +{ +mul.f16x2 r10490, r10487, r10455; +} +{ +add.f16x2 r10493, r8228, r10490; +} +{ +sub.f16x2 r10496, r9747, r9763; +} +{ +mul.f16x2 r10499, r10496, r10456; +} +{ +sub.f16x2 %52, r10493, r10499; +} +{ +add.f16x2 r10505, r9747, r9763; +} +{ +mul.f16x2 r10508, r10505, r10455; +} +{ +add.f16x2 r10511, r8264, r10508; +} +{ +sub.f16x2 r10514, r9741, r9757; +} +{ +mul.f16x2 r10517, r10514, r10456; +} +{ +sub.f16x2 %35, r10511, r10517; +} +{ +add.f16x2 r10523, r9747, r9763; +} +{ +mul.f16x2 r10526, r10523, r10455; +} +{ +add.f16x2 r10529, r8264, r10526; +} +{ +sub.f16x2 r10532, r9741, r9757; +} +{ +mul.f16x2 r10535, r10532, r10456; +} +{ +add.f16x2 %53, r10529, r10535; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), 
"=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1184, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1043>; +.reg .b32 r<10616>; +.reg .b64 rd<6>; +mov.u32 r10542, %54; +mov.u32 r10615, %tid.y; +mad.lo.s32 r10543, r10615, 78732, r10542; +mov.u32 r10544, %tid.x; +mov.f32 f1034, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1, {low, high}; +} +mov.f32 f1036, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ 
+add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; 
+cvt.rn.f16.f32 high, f1034; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f906, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r259, {low, high}; +} +mov.f32 f908, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r260, {low, high}; +} +mov.f32 f918, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r261, {low, high}; +} +mov.f32 f920, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r262, {low, high}; +} +mov.f32 f942, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r265, {low, high}; +} +mov.f32 f944, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} 
+{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r425, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, 
r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, 
r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 
r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 
r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, 
r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 
r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, 
r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, 
r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 
r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f898, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r1789, {low, high}; +} +mov.f32 f900, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r1790, {low, high}; +} +mov.f32 f902, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r1791, {low, high}; +} +mov.f32 f904, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r1794, {low, high}; +} +mov.f32 f910, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r1795, {low, high}; +} +mov.f32 f912, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r1796, {low, high}; +} +mov.f32 f914, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r1797, {low, high}; +} +mov.f32 f916, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r1800, {low, high}; +} +mov.f32 f922, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r1801, {low, high}; +} +mov.f32 f924, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r1802, {low, high}; +} +mov.f32 f926, 0fBE92D7E0; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r1803, {low, high}; +} +mov.f32 f928, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r1804, {low, high}; +} +mov.f32 f934, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r1807, {low, high}; +} +mov.f32 f936, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r1808, {low, high}; +} +mov.f32 f958, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r1812, {low, high}; +} +mov.f32 f950, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r1815, {low, high}; +} +mov.f32 f952, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r1819, {low, high}; +} +mov.f32 f960, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} 
+{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ 
+mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2184, {low, high}; +} +{ 
+add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} 
+{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, 
r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, 
r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 
r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ 
+mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r10544, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r10545, rd3; +mul.lo.s32 r10546, r10545, 729; +sub.s32 r10547, r10544, r10546; +mad.lo.s32 r10548, r10545, 78732, r10543; +cvt.rn.f32.u32 f1037, r10547; +mul.f32 f1038, f1037, 0f39A75CD5; +cos.approx.f32 f309, f1038; +sin.approx.f32 f1039, f1038; +neg.f32 f310, f1039; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, 
{high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, 
r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 
r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, 
r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; 
+mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, 
r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r10549, r10547, 108, r10548; +st.shared.u32 [r10549], r2102; +st.shared.u32 [r10549+4], r2881; +st.shared.u32 [r10549+8], r2918; +st.shared.u32 [r10549+12], r2955; +st.shared.u32 [r10549+16], r2992; +st.shared.u32 [r10549+20], r3029; +st.shared.u32 [r10549+24], r3066; +st.shared.u32 [r10549+28], r3103; +st.shared.u32 [r10549+32], r3140; +st.shared.u32 [r10549+36], r3177; +st.shared.u32 [r10549+40], r3214; +st.shared.u32 [r10549+44], r3251; +st.shared.u32 [r10549+48], r3288; +st.shared.u32 [r10549+52], r3325; +st.shared.u32 [r10549+56], r3362; +st.shared.u32 [r10549+60], r3399; +st.shared.u32 [r10549+64], r3436; +st.shared.u32 [r10549+68], r3473; +st.shared.u32 [r10549+72], r3510; +st.shared.u32 [r10549+76], r3547; +st.shared.u32 [r10549+80], r3584; +st.shared.u32 [r10549+84], r3621; +st.shared.u32 [r10549+88], r3658; +st.shared.u32 [r10549+92], r3695; +st.shared.u32 [r10549+96], r3732; +st.shared.u32 [r10549+100], r3769; +st.shared.u32 [r10549+104], r3806; +barrier.sync 0; +mad.lo.s32 r10550, r10547, -104, r10549; +ld.shared.u32 r3842, [r10550]; +ld.shared.u32 r4438, [r10550+2916]; +ld.shared.u32 r5034, [r10550+5832]; +ld.shared.u32 r3928, [r10550+8748]; +ld.shared.u32 r4524, [r10550+11664]; +ld.shared.u32 r5120, [r10550+14580]; +ld.shared.u32 
r4014, [r10550+17496]; +ld.shared.u32 r4610, [r10550+20412]; +ld.shared.u32 r5206, [r10550+23328]; +ld.shared.u32 r3839, [r10550+26244]; +ld.shared.u32 r4435, [r10550+29160]; +ld.shared.u32 r5031, [r10550+32076]; +ld.shared.u32 r3925, [r10550+34992]; +ld.shared.u32 r4521, [r10550+37908]; +ld.shared.u32 r5117, [r10550+40824]; +ld.shared.u32 r4011, [r10550+43740]; +ld.shared.u32 r4607, [r10550+46656]; +ld.shared.u32 r5203, [r10550+49572]; +ld.shared.u32 r3840, [r10550+52488]; +ld.shared.u32 r4436, [r10550+55404]; +ld.shared.u32 r5032, [r10550+58320]; +ld.shared.u32 r3926, [r10550+61236]; +ld.shared.u32 r4522, [r10550+64152]; +ld.shared.u32 r5118, [r10550+67068]; +ld.shared.u32 r4012, [r10550+69984]; +ld.shared.u32 r4608, [r10550+72900]; +ld.shared.u32 r5204, [r10550+75816]; +barrier.sync 0; +st.shared.u32 [r10549], r2108; +st.shared.u32 [r10549+4], r2890; +st.shared.u32 [r10549+8], r2927; +st.shared.u32 [r10549+12], r2964; +st.shared.u32 [r10549+16], r3001; +st.shared.u32 [r10549+20], r3038; +st.shared.u32 [r10549+24], r3075; +st.shared.u32 [r10549+28], r3112; +st.shared.u32 [r10549+32], r3149; +st.shared.u32 [r10549+36], r3186; +st.shared.u32 [r10549+40], r3223; +st.shared.u32 [r10549+44], r3260; +st.shared.u32 [r10549+48], r3297; +st.shared.u32 [r10549+52], r3334; +st.shared.u32 [r10549+56], r3371; +st.shared.u32 [r10549+60], r3408; +st.shared.u32 [r10549+64], r3445; +st.shared.u32 [r10549+68], r3482; +st.shared.u32 [r10549+72], r3519; +st.shared.u32 [r10549+76], r3556; +st.shared.u32 [r10549+80], r3593; +st.shared.u32 [r10549+84], r3630; +st.shared.u32 [r10549+88], r3667; +st.shared.u32 [r10549+92], r3704; +st.shared.u32 [r10549+96], r3741; +st.shared.u32 [r10549+100], r3778; +st.shared.u32 [r10549+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r10550]; +ld.shared.u32 r4444, [r10550+2916]; +ld.shared.u32 r5040, [r10550+5832]; +ld.shared.u32 r3934, [r10550+8748]; +ld.shared.u32 r4530, [r10550+11664]; +ld.shared.u32 r5126, [r10550+14580]; +ld.shared.u32 r4020, 
[r10550+17496]; +ld.shared.u32 r4616, [r10550+20412]; +ld.shared.u32 r5212, [r10550+23328]; +ld.shared.u32 r3845, [r10550+26244]; +ld.shared.u32 r4441, [r10550+29160]; +ld.shared.u32 r5037, [r10550+32076]; +ld.shared.u32 r3931, [r10550+34992]; +ld.shared.u32 r4527, [r10550+37908]; +ld.shared.u32 r5123, [r10550+40824]; +ld.shared.u32 r4017, [r10550+43740]; +ld.shared.u32 r4613, [r10550+46656]; +ld.shared.u32 r5209, [r10550+49572]; +ld.shared.u32 r3846, [r10550+52488]; +ld.shared.u32 r4442, [r10550+55404]; +ld.shared.u32 r5038, [r10550+58320]; +ld.shared.u32 r3932, [r10550+61236]; +ld.shared.u32 r4528, [r10550+64152]; +ld.shared.u32 r5124, [r10550+67068]; +ld.shared.u32 r4018, [r10550+69984]; +ld.shared.u32 r4614, [r10550+72900]; +ld.shared.u32 r5210, [r10550+75816]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ 
+mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, 
r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, 
r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; 
+} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, 
r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ 
+add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r4690, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, 
r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, 
r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, 
r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5452, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ 
+sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; 
+cvt.rn.f16.f32 high, f924; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 
r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 
r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 
r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} 
+{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, 
r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ 
+add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, 
r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ 
+add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r10547, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r10551, rd5; +sub.s32 r10552, r10547, r10551; +shr.u32 r10553, r10552, 1; +add.s32 r10554, r10553, r10551; +shr.u32 r10555, r10554, 4; +mul.lo.s32 r10556, r10555, 27; +sub.s32 r10557, r10547, r10556; +shl.b32 r10558, r10557, 2; +add.s32 r10559, r10548, r10558; +cvt.rn.f32.u32 f1040, r10555; +mul.f32 f1041, f1040, 0f3C0D3654; +cos.approx.f32 f673, f1041; +sin.approx.f32 f1042, f1041; +neg.f32 f674, f1042; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6706; +mov.b32 r6768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, r6857, r6871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 
r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6955, {low, high}; +} +{ +mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r6706; +mov.b32 r7025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ +mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ 
+mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ +mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; 
+mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ +fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 
r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7295, {high, low}; +} +{ +fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ +mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ +mul.f16x2 r7379, r6101, r7377; +} +{ +fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ 
+fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ 
+fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ +fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +barrier.sync 0; +mad.lo.s32 r10560, r10555, 2916, r10559; +st.shared.u32 [r10560], r5937; +st.shared.u32 [r10560+108], r6716; +st.shared.u32 [r10560+216], r6753; +st.shared.u32 [r10560+324], r6790; +st.shared.u32 [r10560+432], r6827; +st.shared.u32 [r10560+540], r6864; +st.shared.u32 [r10560+648], r6901; +st.shared.u32 [r10560+756], r6938; +st.shared.u32 [r10560+864], r6975; +st.shared.u32 [r10560+972], r7012; +st.shared.u32 [r10560+1080], r7049; +st.shared.u32 [r10560+1188], r7086; +st.shared.u32 [r10560+1296], r7123; +st.shared.u32 [r10560+1404], r7160; +st.shared.u32 [r10560+1512], r7197; +st.shared.u32 [r10560+1620], r7234; +st.shared.u32 [r10560+1728], r7271; +st.shared.u32 [r10560+1836], r7308; +st.shared.u32 [r10560+1944], r7345; +st.shared.u32 [r10560+2052], r7382; +st.shared.u32 [r10560+2160], r7419; +st.shared.u32 [r10560+2268], r7456; +st.shared.u32 [r10560+2376], r7493; +st.shared.u32 [r10560+2484], r7530; +st.shared.u32 [r10560+2592], r7567; +st.shared.u32 [r10560+2700], r7604; +st.shared.u32 [r10560+2808], r7641; +barrier.sync 0; +ld.shared.u32 r7677, [r10550]; +ld.shared.u32 r8273, [r10550+2916]; +ld.shared.u32 r8869, [r10550+5832]; +ld.shared.u32 r7763, [r10550+8748]; +ld.shared.u32 r8359, 
[r10550+11664]; +ld.shared.u32 r8955, [r10550+14580]; +ld.shared.u32 r7849, [r10550+17496]; +ld.shared.u32 r8445, [r10550+20412]; +ld.shared.u32 r9041, [r10550+23328]; +ld.shared.u32 r7674, [r10550+26244]; +ld.shared.u32 r8270, [r10550+29160]; +ld.shared.u32 r8866, [r10550+32076]; +ld.shared.u32 r7760, [r10550+34992]; +ld.shared.u32 r8356, [r10550+37908]; +ld.shared.u32 r8952, [r10550+40824]; +ld.shared.u32 r7846, [r10550+43740]; +ld.shared.u32 r8442, [r10550+46656]; +ld.shared.u32 r9038, [r10550+49572]; +ld.shared.u32 r7675, [r10550+52488]; +ld.shared.u32 r8271, [r10550+55404]; +ld.shared.u32 r8867, [r10550+58320]; +ld.shared.u32 r7761, [r10550+61236]; +ld.shared.u32 r8357, [r10550+64152]; +ld.shared.u32 r8953, [r10550+67068]; +ld.shared.u32 r7847, [r10550+69984]; +ld.shared.u32 r8443, [r10550+72900]; +ld.shared.u32 r9039, [r10550+75816]; +barrier.sync 0; +st.shared.u32 [r10560], r5943; +st.shared.u32 [r10560+108], r6725; +st.shared.u32 [r10560+216], r6762; +st.shared.u32 [r10560+324], r6799; +st.shared.u32 [r10560+432], r6836; +st.shared.u32 [r10560+540], r6873; +st.shared.u32 [r10560+648], r6910; +st.shared.u32 [r10560+756], r6947; +st.shared.u32 [r10560+864], r6984; +st.shared.u32 [r10560+972], r7021; +st.shared.u32 [r10560+1080], r7058; +st.shared.u32 [r10560+1188], r7095; +st.shared.u32 [r10560+1296], r7132; +st.shared.u32 [r10560+1404], r7169; +st.shared.u32 [r10560+1512], r7206; +st.shared.u32 [r10560+1620], r7243; +st.shared.u32 [r10560+1728], r7280; +st.shared.u32 [r10560+1836], r7317; +st.shared.u32 [r10560+1944], r7354; +st.shared.u32 [r10560+2052], r7391; +st.shared.u32 [r10560+2160], r7428; +st.shared.u32 [r10560+2268], r7465; +st.shared.u32 [r10560+2376], r7502; +st.shared.u32 [r10560+2484], r7539; +st.shared.u32 [r10560+2592], r7576; +st.shared.u32 [r10560+2700], r7613; +st.shared.u32 [r10560+2808], r7650; +barrier.sync 0; +ld.shared.u32 r7683, [r10550]; +ld.shared.u32 r8279, [r10550+2916]; +ld.shared.u32 r8875, [r10550+5832]; +ld.shared.u32 r7769, 
[r10550+8748]; +ld.shared.u32 r8365, [r10550+11664]; +ld.shared.u32 r8961, [r10550+14580]; +ld.shared.u32 r7855, [r10550+17496]; +ld.shared.u32 r8451, [r10550+20412]; +ld.shared.u32 r9047, [r10550+23328]; +ld.shared.u32 r7680, [r10550+26244]; +ld.shared.u32 r8276, [r10550+29160]; +ld.shared.u32 r8872, [r10550+32076]; +ld.shared.u32 r7766, [r10550+34992]; +ld.shared.u32 r8362, [r10550+37908]; +ld.shared.u32 r8958, [r10550+40824]; +ld.shared.u32 r7852, [r10550+43740]; +ld.shared.u32 r8448, [r10550+46656]; +ld.shared.u32 r9044, [r10550+49572]; +ld.shared.u32 r7681, [r10550+52488]; +ld.shared.u32 r8277, [r10550+55404]; +ld.shared.u32 r8873, [r10550+58320]; +ld.shared.u32 r7767, [r10550+61236]; +ld.shared.u32 r8363, [r10550+64152]; +ld.shared.u32 r8959, [r10550+67068]; +ld.shared.u32 r7853, [r10550+69984]; +ld.shared.u32 r8449, [r10550+72900]; +ld.shared.u32 r9045, [r10550+75816]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 r7676, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 r7682, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 r7700, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 r7736, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ 
+mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7758, {low, high}; +} +{ +add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 r7762, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 r7768, r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 r7804, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 r7822, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 r7840, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 r7848, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 r7854, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, 
r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 r7872, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; +} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 r7890, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 r7908, r7899, r7905; +} +{ +add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} +{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, r7844; +} +{ +add.f16x2 r7926, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r7930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r7931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r7932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r7935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r7936, {low, high}; +} +{ +mul.f16x2 r7945, r7786, r7929; +} +{ +mul.f16x2 r7948, r7822, r7930; +} +{ +sub.f16x2 r7951, r7945, r7948; +} +{ +mul.f16x2 r7954, r7786, r7930; +} +{ +fma.rn.f16x2 r7957, r7822, r7929, r7954; +} +{ +mul.f16x2 r7961, r7872, r7931; +} +{ +mul.f16x2 r7964, r7908, r7932; +} +{ +sub.f16x2 r7967, r7961, r7964; +} +{ +mul.f16x2 r7970, r7872, r7932; +} +{ +fma.rn.f16x2 r7973, r7908, r7931, r7970; +} +{ +mul.f16x2 r7977, r7804, r7931; +} +{ +mul.f16x2 r7980, r7840, r7932; +} +{ +sub.f16x2 r7983, r7977, r7980; +} +{ +mul.f16x2 r7986, r7804, r7932; 
+} +{ +fma.rn.f16x2 r7989, r7840, r7931, r7986; +} +{ +mul.f16x2 r7993, r7890, r7935; +} +{ +mul.f16x2 r7996, r7926, r7936; +} +{ +sub.f16x2 r7999, r7993, r7996; +} +{ +mul.f16x2 r8002, r7890, r7936; +} +{ +fma.rn.f16x2 r8005, r7926, r7935, r8002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8010, {low, high}; +} +{ +add.f16x2 r8011, r7762, r7848; +} +{ +add.f16x2 r8014, r7676, r8011; +} +{ +add.f16x2 r8017, r7768, r7854; +} +{ +add.f16x2 r8020, r7682, r8017; +} +{ +add.f16x2 r8023, r7762, r7848; +} +{ +mul.f16x2 r8026, r8023, r8009; +} +{ +add.f16x2 r8029, r7676, r8026; +} +{ +sub.f16x2 r8032, r7768, r7854; +} +{ +mul.f16x2 r8035, r8032, r8010; +} +{ +add.f16x2 r8038, r8029, r8035; +} +{ +add.f16x2 r8041, r7762, r7848; +} +{ +mul.f16x2 r8044, r8041, r8009; +} +{ +add.f16x2 r8047, r7676, r8044; +} +{ +sub.f16x2 r8050, r7768, r7854; +} +{ +mul.f16x2 r8053, r8050, r8010; +} +{ +sub.f16x2 r8056, r8047, r8053; +} +{ +add.f16x2 r8059, r7768, r7854; +} +{ +mul.f16x2 r8062, r8059, r8009; +} +{ +add.f16x2 r8065, r7682, r8062; +} +{ +sub.f16x2 r8068, r7762, r7848; +} +{ +mul.f16x2 r8071, r8068, r8010; +} +{ +sub.f16x2 r8074, r8065, r8071; +} +{ +add.f16x2 r8077, r7768, r7854; +} +{ +mul.f16x2 r8080, r8077, r8009; +} +{ +add.f16x2 r8083, r7682, r8080; +} +{ +sub.f16x2 r8086, r7762, r7848; +} +{ +mul.f16x2 r8089, r8086, r8010; +} +{ +add.f16x2 r8092, r8083, r8089; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8096, {low, high}; +} +{ +add.f16x2 r8097, r7951, r7967; +} +{ +add.f16x2 r8100, r7700, r8097; +} +{ +add.f16x2 r8103, r7957, r7973; +} +{ +add.f16x2 r8106, r7736, r8103; +} +{ +add.f16x2 r8109, r7951, r7967; +} +{ +mul.f16x2 r8112, r8109, r8095; +} 
+{ +add.f16x2 r8115, r7700, r8112; +} +{ +sub.f16x2 r8118, r7957, r7973; +} +{ +mul.f16x2 r8121, r8118, r8096; +} +{ +add.f16x2 r8124, r8115, r8121; +} +{ +add.f16x2 r8127, r7951, r7967; +} +{ +mul.f16x2 r8130, r8127, r8095; +} +{ +add.f16x2 r8133, r7700, r8130; +} +{ +sub.f16x2 r8136, r7957, r7973; +} +{ +mul.f16x2 r8139, r8136, r8096; +} +{ +sub.f16x2 r8142, r8133, r8139; +} +{ +add.f16x2 r8145, r7957, r7973; +} +{ +mul.f16x2 r8148, r8145, r8095; +} +{ +add.f16x2 r8151, r7736, r8148; +} +{ +sub.f16x2 r8154, r7951, r7967; +} +{ +mul.f16x2 r8157, r8154, r8096; +} +{ +sub.f16x2 r8160, r8151, r8157; +} +{ +add.f16x2 r8163, r7957, r7973; +} +{ +mul.f16x2 r8166, r8163, r8095; +} +{ +add.f16x2 r8169, r7736, r8166; +} +{ +sub.f16x2 r8172, r7951, r7967; +} +{ +mul.f16x2 r8175, r8172, r8096; +} +{ +add.f16x2 r8178, r8169, r8175; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8181, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8182, {low, high}; +} +{ +add.f16x2 r8183, r7983, r7999; +} +{ +add.f16x2 r8186, r7718, r8183; +} +{ +add.f16x2 r8189, r7989, r8005; +} +{ +add.f16x2 r8192, r7754, r8189; +} +{ +add.f16x2 r8195, r7983, r7999; +} +{ +mul.f16x2 r8198, r8195, r8181; +} +{ +add.f16x2 r8201, r7718, r8198; +} +{ +sub.f16x2 r8204, r7989, r8005; +} +{ +mul.f16x2 r8207, r8204, r8182; +} +{ +add.f16x2 r8210, r8201, r8207; +} +{ +add.f16x2 r8213, r7983, r7999; +} +{ +mul.f16x2 r8216, r8213, r8181; +} +{ +add.f16x2 r8219, r7718, r8216; +} +{ +sub.f16x2 r8222, r7989, r8005; +} +{ +mul.f16x2 r8225, r8222, r8182; +} +{ +sub.f16x2 r8228, r8219, r8225; +} +{ +add.f16x2 r8231, r7989, r8005; +} +{ +mul.f16x2 r8234, r8231, r8181; +} +{ +add.f16x2 r8237, r7754, r8234; +} +{ +sub.f16x2 r8240, r7983, r7999; +} +{ +mul.f16x2 r8243, r8240, r8182; +} +{ +sub.f16x2 r8246, r8237, r8243; +} +{ +add.f16x2 r8249, r7989, r8005; +} +{ +mul.f16x2 r8252, r8249, r8181; +} +{ +add.f16x2 r8255, 
r7754, r8252; +} +{ +sub.f16x2 r8258, r7983, r7999; +} +{ +mul.f16x2 r8261, r8258, r8182; +} +{ +add.f16x2 r8264, r8255, r8261; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8268, {low, high}; +} +{ +add.f16x2 r8269, r8270, r8271; +} +{ +add.f16x2 r8272, r8273, r8269; +} +{ +add.f16x2 r8275, r8276, r8277; +} +{ +add.f16x2 r8278, r8279, r8275; +} +{ +add.f16x2 r8281, r8270, r8271; +} +{ +mul.f16x2 r8284, r8281, r8267; +} +{ +add.f16x2 r8287, r8273, r8284; +} +{ +sub.f16x2 r8290, r8276, r8277; +} +{ +mul.f16x2 r8293, r8290, r8268; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +add.f16x2 r8299, r8270, r8271; +} +{ +mul.f16x2 r8302, r8299, r8267; +} +{ +add.f16x2 r8305, r8273, r8302; +} +{ +sub.f16x2 r8308, r8276, r8277; +} +{ +mul.f16x2 r8311, r8308, r8268; +} +{ +sub.f16x2 r8314, r8305, r8311; +} +{ +add.f16x2 r8317, r8276, r8277; +} +{ +mul.f16x2 r8320, r8317, r8267; +} +{ +add.f16x2 r8323, r8279, r8320; +} +{ +sub.f16x2 r8326, r8270, r8271; +} +{ +mul.f16x2 r8329, r8326, r8268; +} +{ +sub.f16x2 r8332, r8323, r8329; +} +{ +add.f16x2 r8335, r8276, r8277; +} +{ +mul.f16x2 r8338, r8335, r8267; +} +{ +add.f16x2 r8341, r8279, r8338; +} +{ +sub.f16x2 r8344, r8270, r8271; +} +{ +mul.f16x2 r8347, r8344, r8268; +} +{ +add.f16x2 r8350, r8341, r8347; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8354, {low, high}; +} +{ +add.f16x2 r8355, r8356, r8357; +} +{ +add.f16x2 r8358, r8359, r8355; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 r8364, r8365, r8361; +} +{ +add.f16x2 r8367, r8356, r8357; +} +{ +mul.f16x2 r8370, r8367, r8353; +} +{ +add.f16x2 r8373, r8359, r8370; +} +{ +sub.f16x2 r8376, r8362, r8363; +} +{ +mul.f16x2 r8379, r8376, r8354; +} +{ 
+add.f16x2 r8382, r8373, r8379; +} +{ +add.f16x2 r8385, r8356, r8357; +} +{ +mul.f16x2 r8388, r8385, r8353; +} +{ +add.f16x2 r8391, r8359, r8388; +} +{ +sub.f16x2 r8394, r8362, r8363; +} +{ +mul.f16x2 r8397, r8394, r8354; +} +{ +sub.f16x2 r8400, r8391, r8397; +} +{ +add.f16x2 r8403, r8362, r8363; +} +{ +mul.f16x2 r8406, r8403, r8353; +} +{ +add.f16x2 r8409, r8365, r8406; +} +{ +sub.f16x2 r8412, r8356, r8357; +} +{ +mul.f16x2 r8415, r8412, r8354; +} +{ +sub.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8362, r8363; +} +{ +mul.f16x2 r8424, r8421, r8353; +} +{ +add.f16x2 r8427, r8365, r8424; +} +{ +sub.f16x2 r8430, r8356, r8357; +} +{ +mul.f16x2 r8433, r8430, r8354; +} +{ +add.f16x2 r8436, r8427, r8433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8440, {low, high}; +} +{ +add.f16x2 r8441, r8442, r8443; +} +{ +add.f16x2 r8444, r8445, r8441; +} +{ +add.f16x2 r8447, r8448, r8449; +} +{ +add.f16x2 r8450, r8451, r8447; +} +{ +add.f16x2 r8453, r8442, r8443; +} +{ +mul.f16x2 r8456, r8453, r8439; +} +{ +add.f16x2 r8459, r8445, r8456; +} +{ +sub.f16x2 r8462, r8448, r8449; +} +{ +mul.f16x2 r8465, r8462, r8440; +} +{ +add.f16x2 r8468, r8459, r8465; +} +{ +add.f16x2 r8471, r8442, r8443; +} +{ +mul.f16x2 r8474, r8471, r8439; +} +{ +add.f16x2 r8477, r8445, r8474; +} +{ +sub.f16x2 r8480, r8448, r8449; +} +{ +mul.f16x2 r8483, r8480, r8440; +} +{ +sub.f16x2 r8486, r8477, r8483; +} +{ +add.f16x2 r8489, r8448, r8449; +} +{ +mul.f16x2 r8492, r8489, r8439; +} +{ +add.f16x2 r8495, r8451, r8492; +} +{ +sub.f16x2 r8498, r8442, r8443; +} +{ +mul.f16x2 r8501, r8498, r8440; +} +{ +sub.f16x2 r8504, r8495, r8501; +} +{ +add.f16x2 r8507, r8448, r8449; +} +{ +mul.f16x2 r8510, r8507, r8439; +} +{ +add.f16x2 r8513, r8451, r8510; +} +{ +sub.f16x2 r8516, r8442, r8443; +} +{ +mul.f16x2 r8519, r8516, r8440; +} +{ +add.f16x2 r8522, 
r8513, r8519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r8525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r8526, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r8527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r8528, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r8531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r8532, {low, high}; +} +{ +mul.f16x2 r8541, r8382, r8525; +} +{ +mul.f16x2 r8544, r8418, r8526; +} +{ +sub.f16x2 r8547, r8541, r8544; +} +{ +mul.f16x2 r8550, r8382, r8526; +} +{ +fma.rn.f16x2 r8553, r8418, r8525, r8550; +} +{ +mul.f16x2 r8557, r8468, r8527; +} +{ +mul.f16x2 r8560, r8504, r8528; +} +{ +sub.f16x2 r8563, r8557, r8560; +} +{ +mul.f16x2 r8566, r8468, r8528; +} +{ +fma.rn.f16x2 r8569, r8504, r8527, r8566; +} +{ +mul.f16x2 r8573, r8400, r8527; +} +{ +mul.f16x2 r8576, r8436, r8528; +} +{ +sub.f16x2 r8579, r8573, r8576; +} +{ +mul.f16x2 r8582, r8400, r8528; +} +{ +fma.rn.f16x2 r8585, r8436, r8527, r8582; +} +{ +mul.f16x2 r8589, r8486, r8531; +} +{ +mul.f16x2 r8592, r8522, r8532; +} +{ +sub.f16x2 r8595, r8589, r8592; +} +{ +mul.f16x2 r8598, r8486, r8532; +} +{ +fma.rn.f16x2 r8601, r8522, r8531, r8598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8606, {low, high}; +} +{ +add.f16x2 r8607, r8358, r8444; +} +{ +add.f16x2 r8610, r8272, r8607; +} +{ +add.f16x2 r8613, r8364, r8450; +} +{ +add.f16x2 r8616, r8278, r8613; +} +{ +add.f16x2 r8619, r8358, r8444; +} +{ +mul.f16x2 r8622, r8619, r8605; +} +{ +add.f16x2 r8625, r8272, r8622; +} +{ +sub.f16x2 
r8628, r8364, r8450; +} +{ +mul.f16x2 r8631, r8628, r8606; +} +{ +add.f16x2 r8634, r8625, r8631; +} +{ +add.f16x2 r8637, r8358, r8444; +} +{ +mul.f16x2 r8640, r8637, r8605; +} +{ +add.f16x2 r8643, r8272, r8640; +} +{ +sub.f16x2 r8646, r8364, r8450; +} +{ +mul.f16x2 r8649, r8646, r8606; +} +{ +sub.f16x2 r8652, r8643, r8649; +} +{ +add.f16x2 r8655, r8364, r8450; +} +{ +mul.f16x2 r8658, r8655, r8605; +} +{ +add.f16x2 r8661, r8278, r8658; +} +{ +sub.f16x2 r8664, r8358, r8444; +} +{ +mul.f16x2 r8667, r8664, r8606; +} +{ +sub.f16x2 r8670, r8661, r8667; +} +{ +add.f16x2 r8673, r8364, r8450; +} +{ +mul.f16x2 r8676, r8673, r8605; +} +{ +add.f16x2 r8679, r8278, r8676; +} +{ +sub.f16x2 r8682, r8358, r8444; +} +{ +mul.f16x2 r8685, r8682, r8606; +} +{ +add.f16x2 r8688, r8679, r8685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8692, {low, high}; +} +{ +add.f16x2 r8693, r8547, r8563; +} +{ +add.f16x2 r8696, r8296, r8693; +} +{ +add.f16x2 r8699, r8553, r8569; +} +{ +add.f16x2 r8702, r8332, r8699; +} +{ +add.f16x2 r8705, r8547, r8563; +} +{ +mul.f16x2 r8708, r8705, r8691; +} +{ +add.f16x2 r8711, r8296, r8708; +} +{ +sub.f16x2 r8714, r8553, r8569; +} +{ +mul.f16x2 r8717, r8714, r8692; +} +{ +add.f16x2 r8720, r8711, r8717; +} +{ +add.f16x2 r8723, r8547, r8563; +} +{ +mul.f16x2 r8726, r8723, r8691; +} +{ +add.f16x2 r8729, r8296, r8726; +} +{ +sub.f16x2 r8732, r8553, r8569; +} +{ +mul.f16x2 r8735, r8732, r8692; +} +{ +sub.f16x2 r8738, r8729, r8735; +} +{ +add.f16x2 r8741, r8553, r8569; +} +{ +mul.f16x2 r8744, r8741, r8691; +} +{ +add.f16x2 r8747, r8332, r8744; +} +{ +sub.f16x2 r8750, r8547, r8563; +} +{ +mul.f16x2 r8753, r8750, r8692; +} +{ +sub.f16x2 r8756, r8747, r8753; +} +{ +add.f16x2 r8759, r8553, r8569; +} +{ +mul.f16x2 r8762, r8759, r8691; +} +{ +add.f16x2 r8765, r8332, r8762; +} +{ +sub.f16x2 r8768, r8547, r8563; +} 
+{ +mul.f16x2 r8771, r8768, r8692; +} +{ +add.f16x2 r8774, r8765, r8771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8777, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8778, {low, high}; +} +{ +add.f16x2 r8779, r8579, r8595; +} +{ +add.f16x2 r8782, r8314, r8779; +} +{ +add.f16x2 r8785, r8585, r8601; +} +{ +add.f16x2 r8788, r8350, r8785; +} +{ +add.f16x2 r8791, r8579, r8595; +} +{ +mul.f16x2 r8794, r8791, r8777; +} +{ +add.f16x2 r8797, r8314, r8794; +} +{ +sub.f16x2 r8800, r8585, r8601; +} +{ +mul.f16x2 r8803, r8800, r8778; +} +{ +add.f16x2 r8806, r8797, r8803; +} +{ +add.f16x2 r8809, r8579, r8595; +} +{ +mul.f16x2 r8812, r8809, r8777; +} +{ +add.f16x2 r8815, r8314, r8812; +} +{ +sub.f16x2 r8818, r8585, r8601; +} +{ +mul.f16x2 r8821, r8818, r8778; +} +{ +sub.f16x2 r8824, r8815, r8821; +} +{ +add.f16x2 r8827, r8585, r8601; +} +{ +mul.f16x2 r8830, r8827, r8777; +} +{ +add.f16x2 r8833, r8350, r8830; +} +{ +sub.f16x2 r8836, r8579, r8595; +} +{ +mul.f16x2 r8839, r8836, r8778; +} +{ +sub.f16x2 r8842, r8833, r8839; +} +{ +add.f16x2 r8845, r8585, r8601; +} +{ +mul.f16x2 r8848, r8845, r8777; +} +{ +add.f16x2 r8851, r8350, r8848; +} +{ +sub.f16x2 r8854, r8579, r8595; +} +{ +mul.f16x2 r8857, r8854, r8778; +} +{ +add.f16x2 r8860, r8851, r8857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8864, {low, high}; +} +{ +add.f16x2 r8865, r8866, r8867; +} +{ +add.f16x2 r8868, r8869, r8865; +} +{ +add.f16x2 r8871, r8872, r8873; +} +{ +add.f16x2 r8874, r8875, r8871; +} +{ +add.f16x2 r8877, r8866, r8867; +} +{ +mul.f16x2 r8880, r8877, r8863; +} +{ +add.f16x2 r8883, r8869, r8880; +} +{ +sub.f16x2 r8886, r8872, r8873; +} +{ +mul.f16x2 r8889, r8886, r8864; +} +{ +add.f16x2 r8892, r8883, r8889; +} +{ +add.f16x2 r8895, 
r8866, r8867; +} +{ +mul.f16x2 r8898, r8895, r8863; +} +{ +add.f16x2 r8901, r8869, r8898; +} +{ +sub.f16x2 r8904, r8872, r8873; +} +{ +mul.f16x2 r8907, r8904, r8864; +} +{ +sub.f16x2 r8910, r8901, r8907; +} +{ +add.f16x2 r8913, r8872, r8873; +} +{ +mul.f16x2 r8916, r8913, r8863; +} +{ +add.f16x2 r8919, r8875, r8916; +} +{ +sub.f16x2 r8922, r8866, r8867; +} +{ +mul.f16x2 r8925, r8922, r8864; +} +{ +sub.f16x2 r8928, r8919, r8925; +} +{ +add.f16x2 r8931, r8872, r8873; +} +{ +mul.f16x2 r8934, r8931, r8863; +} +{ +add.f16x2 r8937, r8875, r8934; +} +{ +sub.f16x2 r8940, r8866, r8867; +} +{ +mul.f16x2 r8943, r8940, r8864; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r8949, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r8950, {low, high}; +} +{ +add.f16x2 r8951, r8952, r8953; +} +{ +add.f16x2 r8954, r8955, r8951; +} +{ +add.f16x2 r8957, r8958, r8959; +} +{ +add.f16x2 r8960, r8961, r8957; +} +{ +add.f16x2 r8963, r8952, r8953; +} +{ +mul.f16x2 r8966, r8963, r8949; +} +{ +add.f16x2 r8969, r8955, r8966; +} +{ +sub.f16x2 r8972, r8958, r8959; +} +{ +mul.f16x2 r8975, r8972, r8950; +} +{ +add.f16x2 r8978, r8969, r8975; +} +{ +add.f16x2 r8981, r8952, r8953; +} +{ +mul.f16x2 r8984, r8981, r8949; +} +{ +add.f16x2 r8987, r8955, r8984; +} +{ +sub.f16x2 r8990, r8958, r8959; +} +{ +mul.f16x2 r8993, r8990, r8950; +} +{ +sub.f16x2 r8996, r8987, r8993; +} +{ +add.f16x2 r8999, r8958, r8959; +} +{ +mul.f16x2 r9002, r8999, r8949; +} +{ +add.f16x2 r9005, r8961, r9002; +} +{ +sub.f16x2 r9008, r8952, r8953; +} +{ +mul.f16x2 r9011, r9008, r8950; +} +{ +sub.f16x2 r9014, r9005, r9011; +} +{ +add.f16x2 r9017, r8958, r8959; +} +{ +mul.f16x2 r9020, r9017, r8949; +} +{ +add.f16x2 r9023, r8961, r9020; +} +{ +sub.f16x2 r9026, r8952, r8953; +} +{ +mul.f16x2 r9029, r9026, r8950; +} +{ +add.f16x2 r9032, r9023, r9029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9035, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9036, {low, high}; +} +{ +add.f16x2 r9037, r9038, r9039; +} +{ +add.f16x2 r9040, r9041, r9037; +} +{ +add.f16x2 r9043, r9044, r9045; +} +{ +add.f16x2 r9046, r9047, r9043; +} +{ +add.f16x2 r9049, r9038, r9039; +} +{ +mul.f16x2 r9052, r9049, r9035; +} +{ +add.f16x2 r9055, r9041, r9052; +} +{ +sub.f16x2 r9058, r9044, r9045; +} +{ +mul.f16x2 r9061, r9058, r9036; +} +{ +add.f16x2 r9064, r9055, r9061; +} +{ +add.f16x2 r9067, r9038, r9039; +} +{ +mul.f16x2 r9070, r9067, r9035; +} +{ +add.f16x2 r9073, r9041, r9070; +} +{ +sub.f16x2 r9076, r9044, r9045; +} +{ +mul.f16x2 r9079, r9076, r9036; +} +{ +sub.f16x2 r9082, r9073, r9079; +} +{ +add.f16x2 r9085, r9044, r9045; +} +{ +mul.f16x2 r9088, r9085, r9035; +} +{ +add.f16x2 r9091, r9047, r9088; +} +{ +sub.f16x2 r9094, r9038, r9039; +} +{ +mul.f16x2 r9097, r9094, r9036; +} +{ +sub.f16x2 r9100, r9091, r9097; +} +{ +add.f16x2 r9103, r9044, r9045; +} +{ +mul.f16x2 r9106, r9103, r9035; +} +{ +add.f16x2 r9109, r9047, r9106; +} +{ +sub.f16x2 r9112, r9038, r9039; +} +{ +mul.f16x2 r9115, r9112, r9036; +} +{ +add.f16x2 r9118, r9109, r9115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9124, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9128, {low, high}; +} +{ +mul.f16x2 r9137, r8978, r9121; +} +{ +mul.f16x2 r9140, r9014, r9122; +} 
+{ +sub.f16x2 r9143, r9137, r9140; +} +{ +mul.f16x2 r9146, r8978, r9122; +} +{ +fma.rn.f16x2 r9149, r9014, r9121, r9146; +} +{ +mul.f16x2 r9153, r9064, r9123; +} +{ +mul.f16x2 r9156, r9100, r9124; +} +{ +sub.f16x2 r9159, r9153, r9156; +} +{ +mul.f16x2 r9162, r9064, r9124; +} +{ +fma.rn.f16x2 r9165, r9100, r9123, r9162; +} +{ +mul.f16x2 r9169, r8996, r9123; +} +{ +mul.f16x2 r9172, r9032, r9124; +} +{ +sub.f16x2 r9175, r9169, r9172; +} +{ +mul.f16x2 r9178, r8996, r9124; +} +{ +fma.rn.f16x2 r9181, r9032, r9123, r9178; +} +{ +mul.f16x2 r9185, r9082, r9127; +} +{ +mul.f16x2 r9188, r9118, r9128; +} +{ +sub.f16x2 r9191, r9185, r9188; +} +{ +mul.f16x2 r9194, r9082, r9128; +} +{ +fma.rn.f16x2 r9197, r9118, r9127, r9194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9201, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9202, {low, high}; +} +{ +add.f16x2 r9203, r8954, r9040; +} +{ +add.f16x2 r9206, r8868, r9203; +} +{ +add.f16x2 r9209, r8960, r9046; +} +{ +add.f16x2 r9212, r8874, r9209; +} +{ +add.f16x2 r9215, r8954, r9040; +} +{ +mul.f16x2 r9218, r9215, r9201; +} +{ +add.f16x2 r9221, r8868, r9218; +} +{ +sub.f16x2 r9224, r8960, r9046; +} +{ +mul.f16x2 r9227, r9224, r9202; +} +{ +add.f16x2 r9230, r9221, r9227; +} +{ +add.f16x2 r9233, r8954, r9040; +} +{ +mul.f16x2 r9236, r9233, r9201; +} +{ +add.f16x2 r9239, r8868, r9236; +} +{ +sub.f16x2 r9242, r8960, r9046; +} +{ +mul.f16x2 r9245, r9242, r9202; +} +{ +sub.f16x2 r9248, r9239, r9245; +} +{ +add.f16x2 r9251, r8960, r9046; +} +{ +mul.f16x2 r9254, r9251, r9201; +} +{ +add.f16x2 r9257, r8874, r9254; +} +{ +sub.f16x2 r9260, r8954, r9040; +} +{ +mul.f16x2 r9263, r9260, r9202; +} +{ +sub.f16x2 r9266, r9257, r9263; +} +{ +add.f16x2 r9269, r8960, r9046; +} +{ +mul.f16x2 r9272, r9269, r9201; +} +{ +add.f16x2 r9275, r8874, r9272; +} +{ +sub.f16x2 r9278, r8954, r9040; +} +{ +mul.f16x2 r9281, r9278, r9202; +} +{ +add.f16x2 r9284, 
r9275, r9281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9288, {low, high}; +} +{ +add.f16x2 r9289, r9143, r9159; +} +{ +add.f16x2 r9292, r8892, r9289; +} +{ +add.f16x2 r9295, r9149, r9165; +} +{ +add.f16x2 r9298, r8928, r9295; +} +{ +add.f16x2 r9301, r9143, r9159; +} +{ +mul.f16x2 r9304, r9301, r9287; +} +{ +add.f16x2 r9307, r8892, r9304; +} +{ +sub.f16x2 r9310, r9149, r9165; +} +{ +mul.f16x2 r9313, r9310, r9288; +} +{ +add.f16x2 r9316, r9307, r9313; +} +{ +add.f16x2 r9319, r9143, r9159; +} +{ +mul.f16x2 r9322, r9319, r9287; +} +{ +add.f16x2 r9325, r8892, r9322; +} +{ +sub.f16x2 r9328, r9149, r9165; +} +{ +mul.f16x2 r9331, r9328, r9288; +} +{ +sub.f16x2 r9334, r9325, r9331; +} +{ +add.f16x2 r9337, r9149, r9165; +} +{ +mul.f16x2 r9340, r9337, r9287; +} +{ +add.f16x2 r9343, r8928, r9340; +} +{ +sub.f16x2 r9346, r9143, r9159; +} +{ +mul.f16x2 r9349, r9346, r9288; +} +{ +sub.f16x2 r9352, r9343, r9349; +} +{ +add.f16x2 r9355, r9149, r9165; +} +{ +mul.f16x2 r9358, r9355, r9287; +} +{ +add.f16x2 r9361, r8928, r9358; +} +{ +sub.f16x2 r9364, r9143, r9159; +} +{ +mul.f16x2 r9367, r9364, r9288; +} +{ +add.f16x2 r9370, r9361, r9367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9374, {low, high}; +} +{ +add.f16x2 r9375, r9175, r9191; +} +{ +add.f16x2 r9378, r8910, r9375; +} +{ +add.f16x2 r9381, r9181, r9197; +} +{ +add.f16x2 r9384, r8946, r9381; +} +{ +add.f16x2 r9387, r9175, r9191; +} +{ +mul.f16x2 r9390, r9387, r9373; +} +{ +add.f16x2 r9393, r8910, r9390; +} +{ +sub.f16x2 r9396, r9181, r9197; +} +{ +mul.f16x2 r9399, r9396, r9374; +} +{ +add.f16x2 r9402, r9393, r9399; +} +{ +add.f16x2 r9405, r9175, r9191; +} +{ +mul.f16x2 r9408, r9405, r9373; +} +{ 
+add.f16x2 r9411, r8910, r9408; +} +{ +sub.f16x2 r9414, r9181, r9197; +} +{ +mul.f16x2 r9417, r9414, r9374; +} +{ +sub.f16x2 r9420, r9411, r9417; +} +{ +add.f16x2 r9423, r9181, r9197; +} +{ +mul.f16x2 r9426, r9423, r9373; +} +{ +add.f16x2 r9429, r8946, r9426; +} +{ +sub.f16x2 r9432, r9175, r9191; +} +{ +mul.f16x2 r9435, r9432, r9374; +} +{ +sub.f16x2 r9438, r9429, r9435; +} +{ +add.f16x2 r9441, r9181, r9197; +} +{ +mul.f16x2 r9444, r9441, r9373; +} +{ +add.f16x2 r9447, r8946, r9444; +} +{ +sub.f16x2 r9450, r9175, r9191; +} +{ +mul.f16x2 r9453, r9450, r9374; +} +{ +add.f16x2 r9456, r9447, r9453; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f898; +cvt.rn.f16.f32 high, f898; +mov.b32 r9459, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f900; +cvt.rn.f16.f32 high, f900; +mov.b32 r9460, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f902; +cvt.rn.f16.f32 high, f902; +mov.b32 r9461, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f904; +cvt.rn.f16.f32 high, f904; +mov.b32 r9462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f906; +cvt.rn.f16.f32 high, f906; +mov.b32 r9463, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f908; +cvt.rn.f16.f32 high, f908; +mov.b32 r9464, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f910; +cvt.rn.f16.f32 high, f910; +mov.b32 r9465, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f912; +cvt.rn.f16.f32 high, f912; +mov.b32 r9466, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f914; +cvt.rn.f16.f32 high, f914; +mov.b32 r9467, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f916; +cvt.rn.f16.f32 high, f916; +mov.b32 r9468, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f918; +cvt.rn.f16.f32 high, f918; +mov.b32 r9469, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f920; +cvt.rn.f16.f32 high, f920; +mov.b32 r9470, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f922; +cvt.rn.f16.f32 high, f922; +mov.b32 r9471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f924; +cvt.rn.f16.f32 high, f924; +mov.b32 r9472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f926; +cvt.rn.f16.f32 high, f926; +mov.b32 r9473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f928; +cvt.rn.f16.f32 high, f928; +mov.b32 r9474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f934; +cvt.rn.f16.f32 high, f934; +mov.b32 r9477, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f936; +cvt.rn.f16.f32 high, f936; +mov.b32 r9478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f942; +cvt.rn.f16.f32 high, f942; +mov.b32 r9481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f944; +cvt.rn.f16.f32 high, f944; +mov.b32 r9482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f950; +cvt.rn.f16.f32 high, f950; +mov.b32 r9485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f952; +cvt.rn.f16.f32 high, f952; +mov.b32 r9486, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f958; +cvt.rn.f16.f32 high, f958; +mov.b32 r9489, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f960; +cvt.rn.f16.f32 high, f960; +mov.b32 r9490, {low, high}; +} +{ +mul.f16x2 r9511, r8696, r9459; +} +{ +mul.f16x2 r9514, r8702, r9460; +} +{ +sub.f16x2 r9517, r9511, r9514; +} +{ +mul.f16x2 r9520, r8696, r9460; +} +{ +fma.rn.f16x2 r9523, r8702, r9459, r9520; +} +{ +mul.f16x2 r9527, r9292, r9461; +} +{ +mul.f16x2 r9530, r9298, r9462; +} +{ +sub.f16x2 r9533, r9527, r9530; +} +{ +mul.f16x2 r9536, r9292, r9462; +} +{ +fma.rn.f16x2 r9539, r9298, r9461, r9536; +} +{ +mul.f16x2 r9543, r8782, r9461; +} +{ +mul.f16x2 r9546, r8788, r9462; +} +{ +sub.f16x2 r9549, r9543, r9546; +} +{ +mul.f16x2 r9552, r8782, r9462; +} +{ +fma.rn.f16x2 r9555, r8788, r9461, r9552; +} +{ +mul.f16x2 r9559, r9378, r9465; +} +{ +mul.f16x2 r9562, r9384, r9466; +} +{ +sub.f16x2 r9565, 
r9559, r9562; +} +{ +mul.f16x2 r9568, r9378, r9466; +} +{ +fma.rn.f16x2 r9571, r9384, r9465, r9568; +} +{ +mul.f16x2 r9575, r8634, r9463; +} +{ +mul.f16x2 r9578, r8670, r9464; +} +{ +sub.f16x2 r9581, r9575, r9578; +} +{ +mul.f16x2 r9584, r8634, r9464; +} +{ +fma.rn.f16x2 r9587, r8670, r9463, r9584; +} +{ +mul.f16x2 r9591, r9230, r9469; +} +{ +mul.f16x2 r9594, r9266, r9470; +} +{ +sub.f16x2 r9597, r9591, r9594; +} +{ +mul.f16x2 r9600, r9230, r9470; +} +{ +fma.rn.f16x2 r9603, r9266, r9469, r9600; +} +{ +mul.f16x2 r9607, r8720, r9465; +} +{ +mul.f16x2 r9610, r8756, r9466; +} +{ +sub.f16x2 r9613, r9607, r9610; +} +{ +mul.f16x2 r9616, r8720, r9466; +} +{ +fma.rn.f16x2 r9619, r8756, r9465, r9616; +} +{ +mul.f16x2 r9623, r9316, r9473; +} +{ +mul.f16x2 r9626, r9352, r9474; +} +{ +sub.f16x2 r9629, r9623, r9626; +} +{ +mul.f16x2 r9632, r9316, r9474; +} +{ +fma.rn.f16x2 r9635, r9352, r9473, r9632; +} +{ +mul.f16x2 r9639, r8806, r9467; +} +{ +mul.f16x2 r9642, r8842, r9468; +} +{ +sub.f16x2 r9645, r9639, r9642; +} +{ +mul.f16x2 r9648, r8806, r9468; +} +{ +fma.rn.f16x2 r9651, r8842, r9467, r9648; +} +{ +mul.f16x2 r9655, r9402, r9477; +} +{ +mul.f16x2 r9658, r9438, r9478; +} +{ +sub.f16x2 r9661, r9655, r9658; +} +{ +mul.f16x2 r9664, r9402, r9478; +} +{ +fma.rn.f16x2 r9667, r9438, r9477, r9664; +} +{ +mul.f16x2 r9671, r8652, r9469; +} +{ +mul.f16x2 r9674, r8688, r9470; +} +{ +sub.f16x2 r9677, r9671, r9674; +} +{ +mul.f16x2 r9680, r8652, r9470; +} +{ +fma.rn.f16x2 r9683, r8688, r9469, r9680; +} +{ +mul.f16x2 r9687, r9248, r9481; +} +{ +mul.f16x2 r9690, r9284, r9482; +} +{ +sub.f16x2 r9693, r9687, r9690; +} +{ +mul.f16x2 r9696, r9248, r9482; +} +{ +fma.rn.f16x2 r9699, r9284, r9481, r9696; +} +{ +mul.f16x2 r9703, r8738, r9471; +} +{ +mul.f16x2 r9706, r8774, r9472; +} +{ +sub.f16x2 r9709, r9703, r9706; +} +{ +mul.f16x2 r9712, r8738, r9472; +} +{ +fma.rn.f16x2 r9715, r8774, r9471, r9712; +} +{ +mul.f16x2 r9719, r9334, r9485; +} +{ +mul.f16x2 r9722, r9370, r9486; +} +{ +sub.f16x2 r9725, 
r9719, r9722; +} +{ +mul.f16x2 r9728, r9334, r9486; +} +{ +fma.rn.f16x2 r9731, r9370, r9485, r9728; +} +{ +mul.f16x2 r9735, r8824, r9473; +} +{ +mul.f16x2 r9738, r8860, r9474; +} +{ +sub.f16x2 r9741, r9735, r9738; +} +{ +mul.f16x2 r9744, r8824, r9474; +} +{ +fma.rn.f16x2 r9747, r8860, r9473, r9744; +} +{ +mul.f16x2 r9751, r9420, r9489; +} +{ +mul.f16x2 r9754, r9456, r9490; +} +{ +sub.f16x2 r9757, r9751, r9754; +} +{ +mul.f16x2 r9760, r9420, r9490; +} +{ +fma.rn.f16x2 r9763, r9456, r9489, r9760; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9767, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9768, {low, high}; +} +{ +add.f16x2 r9769, r8610, r9206; +} +{ +add.f16x2 %0, r8014, r9769; +} +{ +add.f16x2 r9775, r8616, r9212; +} +{ +add.f16x2 %1, r8020, r9775; +} +{ +add.f16x2 r9781, r8610, r9206; +} +{ +mul.f16x2 r9784, r9781, r9767; +} +{ +add.f16x2 r9787, r8014, r9784; +} +{ +sub.f16x2 r9790, r8616, r9212; +} +{ +mul.f16x2 r9793, r9790, r9768; +} +{ +add.f16x2 %18, r9787, r9793; +} +{ +add.f16x2 r9799, r8610, r9206; +} +{ +mul.f16x2 r9802, r9799, r9767; +} +{ +add.f16x2 r9805, r8014, r9802; +} +{ +sub.f16x2 r9808, r8616, r9212; +} +{ +mul.f16x2 r9811, r9808, r9768; +} +{ +sub.f16x2 %36, r9805, r9811; +} +{ +add.f16x2 r9817, r8616, r9212; +} +{ +mul.f16x2 r9820, r9817, r9767; +} +{ +add.f16x2 r9823, r8020, r9820; +} +{ +sub.f16x2 r9826, r8610, r9206; +} +{ +mul.f16x2 r9829, r9826, r9768; +} +{ +sub.f16x2 %19, r9823, r9829; +} +{ +add.f16x2 r9835, r8616, r9212; +} +{ +mul.f16x2 r9838, r9835, r9767; +} +{ +add.f16x2 r9841, r8020, r9838; +} +{ +sub.f16x2 r9844, r8610, r9206; +} +{ +mul.f16x2 r9847, r9844, r9768; +} +{ +add.f16x2 %37, r9841, r9847; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9854, {low, 
high}; +} +{ +add.f16x2 r9855, r9517, r9533; +} +{ +add.f16x2 %2, r8100, r9855; +} +{ +add.f16x2 r9861, r9523, r9539; +} +{ +add.f16x2 %3, r8106, r9861; +} +{ +add.f16x2 r9867, r9517, r9533; +} +{ +mul.f16x2 r9870, r9867, r9853; +} +{ +add.f16x2 r9873, r8100, r9870; +} +{ +sub.f16x2 r9876, r9523, r9539; +} +{ +mul.f16x2 r9879, r9876, r9854; +} +{ +add.f16x2 %20, r9873, r9879; +} +{ +add.f16x2 r9885, r9517, r9533; +} +{ +mul.f16x2 r9888, r9885, r9853; +} +{ +add.f16x2 r9891, r8100, r9888; +} +{ +sub.f16x2 r9894, r9523, r9539; +} +{ +mul.f16x2 r9897, r9894, r9854; +} +{ +sub.f16x2 %38, r9891, r9897; +} +{ +add.f16x2 r9903, r9523, r9539; +} +{ +mul.f16x2 r9906, r9903, r9853; +} +{ +add.f16x2 r9909, r8106, r9906; +} +{ +sub.f16x2 r9912, r9517, r9533; +} +{ +mul.f16x2 r9915, r9912, r9854; +} +{ +sub.f16x2 %21, r9909, r9915; +} +{ +add.f16x2 r9921, r9523, r9539; +} +{ +mul.f16x2 r9924, r9921, r9853; +} +{ +add.f16x2 r9927, r8106, r9924; +} +{ +sub.f16x2 r9930, r9517, r9533; +} +{ +mul.f16x2 r9933, r9930, r9854; +} +{ +add.f16x2 %39, r9927, r9933; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r9939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r9940, {low, high}; +} +{ +add.f16x2 r9941, r9549, r9565; +} +{ +add.f16x2 %4, r8186, r9941; +} +{ +add.f16x2 r9947, r9555, r9571; +} +{ +add.f16x2 %5, r8192, r9947; +} +{ +add.f16x2 r9953, r9549, r9565; +} +{ +mul.f16x2 r9956, r9953, r9939; +} +{ +add.f16x2 r9959, r8186, r9956; +} +{ +sub.f16x2 r9962, r9555, r9571; +} +{ +mul.f16x2 r9965, r9962, r9940; +} +{ +add.f16x2 %22, r9959, r9965; +} +{ +add.f16x2 r9971, r9549, r9565; +} +{ +mul.f16x2 r9974, r9971, r9939; +} +{ +add.f16x2 r9977, r8186, r9974; +} +{ +sub.f16x2 r9980, r9555, r9571; +} +{ +mul.f16x2 r9983, r9980, r9940; +} +{ +sub.f16x2 %40, r9977, r9983; +} +{ +add.f16x2 r9989, r9555, r9571; +} +{ +mul.f16x2 r9992, r9989, r9939; +} +{ +add.f16x2 r9995, r8192, r9992; +} 
+{ +sub.f16x2 r9998, r9549, r9565; +} +{ +mul.f16x2 r10001, r9998, r9940; +} +{ +sub.f16x2 %23, r9995, r10001; +} +{ +add.f16x2 r10007, r9555, r9571; +} +{ +mul.f16x2 r10010, r10007, r9939; +} +{ +add.f16x2 r10013, r8192, r10010; +} +{ +sub.f16x2 r10016, r9549, r9565; +} +{ +mul.f16x2 r10019, r10016, r9940; +} +{ +add.f16x2 %41, r10013, r10019; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10026, {low, high}; +} +{ +add.f16x2 r10027, r9581, r9597; +} +{ +add.f16x2 %6, r8038, r10027; +} +{ +add.f16x2 r10033, r9587, r9603; +} +{ +add.f16x2 %7, r8074, r10033; +} +{ +add.f16x2 r10039, r9581, r9597; +} +{ +mul.f16x2 r10042, r10039, r10025; +} +{ +add.f16x2 r10045, r8038, r10042; +} +{ +sub.f16x2 r10048, r9587, r9603; +} +{ +mul.f16x2 r10051, r10048, r10026; +} +{ +add.f16x2 %24, r10045, r10051; +} +{ +add.f16x2 r10057, r9581, r9597; +} +{ +mul.f16x2 r10060, r10057, r10025; +} +{ +add.f16x2 r10063, r8038, r10060; +} +{ +sub.f16x2 r10066, r9587, r9603; +} +{ +mul.f16x2 r10069, r10066, r10026; +} +{ +sub.f16x2 %42, r10063, r10069; +} +{ +add.f16x2 r10075, r9587, r9603; +} +{ +mul.f16x2 r10078, r10075, r10025; +} +{ +add.f16x2 r10081, r8074, r10078; +} +{ +sub.f16x2 r10084, r9581, r9597; +} +{ +mul.f16x2 r10087, r10084, r10026; +} +{ +sub.f16x2 %25, r10081, r10087; +} +{ +add.f16x2 r10093, r9587, r9603; +} +{ +mul.f16x2 r10096, r10093, r10025; +} +{ +add.f16x2 r10099, r8074, r10096; +} +{ +sub.f16x2 r10102, r9581, r9597; +} +{ +mul.f16x2 r10105, r10102, r10026; +} +{ +add.f16x2 %43, r10099, r10105; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10112, {low, high}; +} +{ +add.f16x2 r10113, r9613, r9629; +} +{ +add.f16x2 %8, r8124, r10113; +} +{ 
+add.f16x2 r10119, r9619, r9635; +} +{ +add.f16x2 %9, r8160, r10119; +} +{ +add.f16x2 r10125, r9613, r9629; +} +{ +mul.f16x2 r10128, r10125, r10111; +} +{ +add.f16x2 r10131, r8124, r10128; +} +{ +sub.f16x2 r10134, r9619, r9635; +} +{ +mul.f16x2 r10137, r10134, r10112; +} +{ +add.f16x2 %26, r10131, r10137; +} +{ +add.f16x2 r10143, r9613, r9629; +} +{ +mul.f16x2 r10146, r10143, r10111; +} +{ +add.f16x2 r10149, r8124, r10146; +} +{ +sub.f16x2 r10152, r9619, r9635; +} +{ +mul.f16x2 r10155, r10152, r10112; +} +{ +sub.f16x2 %44, r10149, r10155; +} +{ +add.f16x2 r10161, r9619, r9635; +} +{ +mul.f16x2 r10164, r10161, r10111; +} +{ +add.f16x2 r10167, r8160, r10164; +} +{ +sub.f16x2 r10170, r9613, r9629; +} +{ +mul.f16x2 r10173, r10170, r10112; +} +{ +sub.f16x2 %27, r10167, r10173; +} +{ +add.f16x2 r10179, r9619, r9635; +} +{ +mul.f16x2 r10182, r10179, r10111; +} +{ +add.f16x2 r10185, r8160, r10182; +} +{ +sub.f16x2 r10188, r9613, r9629; +} +{ +mul.f16x2 r10191, r10188, r10112; +} +{ +add.f16x2 %45, r10185, r10191; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10197, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10198, {low, high}; +} +{ +add.f16x2 r10199, r9645, r9661; +} +{ +add.f16x2 %10, r8210, r10199; +} +{ +add.f16x2 r10205, r9651, r9667; +} +{ +add.f16x2 %11, r8246, r10205; +} +{ +add.f16x2 r10211, r9645, r9661; +} +{ +mul.f16x2 r10214, r10211, r10197; +} +{ +add.f16x2 r10217, r8210, r10214; +} +{ +sub.f16x2 r10220, r9651, r9667; +} +{ +mul.f16x2 r10223, r10220, r10198; +} +{ +add.f16x2 %28, r10217, r10223; +} +{ +add.f16x2 r10229, r9645, r9661; +} +{ +mul.f16x2 r10232, r10229, r10197; +} +{ +add.f16x2 r10235, r8210, r10232; +} +{ +sub.f16x2 r10238, r9651, r9667; +} +{ +mul.f16x2 r10241, r10238, r10198; +} +{ +sub.f16x2 %46, r10235, r10241; +} +{ +add.f16x2 r10247, r9651, r9667; +} +{ +mul.f16x2 r10250, r10247, r10197; +} +{ +add.f16x2 r10253, r8246, r10250; 
+} +{ +sub.f16x2 r10256, r9645, r9661; +} +{ +mul.f16x2 r10259, r10256, r10198; +} +{ +sub.f16x2 %29, r10253, r10259; +} +{ +add.f16x2 r10265, r9651, r9667; +} +{ +mul.f16x2 r10268, r10265, r10197; +} +{ +add.f16x2 r10271, r8246, r10268; +} +{ +sub.f16x2 r10274, r9645, r9661; +} +{ +mul.f16x2 r10277, r10274, r10198; +} +{ +add.f16x2 %47, r10271, r10277; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10284, {low, high}; +} +{ +add.f16x2 r10285, r9677, r9693; +} +{ +add.f16x2 %12, r8056, r10285; +} +{ +add.f16x2 r10291, r9683, r9699; +} +{ +add.f16x2 %13, r8092, r10291; +} +{ +add.f16x2 r10297, r9677, r9693; +} +{ +mul.f16x2 r10300, r10297, r10283; +} +{ +add.f16x2 r10303, r8056, r10300; +} +{ +sub.f16x2 r10306, r9683, r9699; +} +{ +mul.f16x2 r10309, r10306, r10284; +} +{ +add.f16x2 %30, r10303, r10309; +} +{ +add.f16x2 r10315, r9677, r9693; +} +{ +mul.f16x2 r10318, r10315, r10283; +} +{ +add.f16x2 r10321, r8056, r10318; +} +{ +sub.f16x2 r10324, r9683, r9699; +} +{ +mul.f16x2 r10327, r10324, r10284; +} +{ +sub.f16x2 %48, r10321, r10327; +} +{ +add.f16x2 r10333, r9683, r9699; +} +{ +mul.f16x2 r10336, r10333, r10283; +} +{ +add.f16x2 r10339, r8092, r10336; +} +{ +sub.f16x2 r10342, r9677, r9693; +} +{ +mul.f16x2 r10345, r10342, r10284; +} +{ +sub.f16x2 %31, r10339, r10345; +} +{ +add.f16x2 r10351, r9683, r9699; +} +{ +mul.f16x2 r10354, r10351, r10283; +} +{ +add.f16x2 r10357, r8092, r10354; +} +{ +sub.f16x2 r10360, r9677, r9693; +} +{ +mul.f16x2 r10363, r10360, r10284; +} +{ +add.f16x2 %49, r10357, r10363; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10369, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10370, {low, high}; +} +{ +add.f16x2 r10371, r9709, r9725; +} +{ +add.f16x2 %14, r8142, r10371; +} 
+{ +add.f16x2 r10377, r9715, r9731; +} +{ +add.f16x2 %15, r8178, r10377; +} +{ +add.f16x2 r10383, r9709, r9725; +} +{ +mul.f16x2 r10386, r10383, r10369; +} +{ +add.f16x2 r10389, r8142, r10386; +} +{ +sub.f16x2 r10392, r9715, r9731; +} +{ +mul.f16x2 r10395, r10392, r10370; +} +{ +add.f16x2 %32, r10389, r10395; +} +{ +add.f16x2 r10401, r9709, r9725; +} +{ +mul.f16x2 r10404, r10401, r10369; +} +{ +add.f16x2 r10407, r8142, r10404; +} +{ +sub.f16x2 r10410, r9715, r9731; +} +{ +mul.f16x2 r10413, r10410, r10370; +} +{ +sub.f16x2 %50, r10407, r10413; +} +{ +add.f16x2 r10419, r9715, r9731; +} +{ +mul.f16x2 r10422, r10419, r10369; +} +{ +add.f16x2 r10425, r8178, r10422; +} +{ +sub.f16x2 r10428, r9709, r9725; +} +{ +mul.f16x2 r10431, r10428, r10370; +} +{ +sub.f16x2 %33, r10425, r10431; +} +{ +add.f16x2 r10437, r9715, r9731; +} +{ +mul.f16x2 r10440, r10437, r10369; +} +{ +add.f16x2 r10443, r8178, r10440; +} +{ +sub.f16x2 r10446, r9709, r9725; +} +{ +mul.f16x2 r10449, r10446, r10370; +} +{ +add.f16x2 %51, r10443, r10449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1034; +cvt.rn.f16.f32 high, f1034; +mov.b32 r10455, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1036; +cvt.rn.f16.f32 high, f1036; +mov.b32 r10456, {low, high}; +} +{ +add.f16x2 r10457, r9741, r9757; +} +{ +add.f16x2 %16, r8228, r10457; +} +{ +add.f16x2 r10463, r9747, r9763; +} +{ +add.f16x2 %17, r8264, r10463; +} +{ +add.f16x2 r10469, r9741, r9757; +} +{ +mul.f16x2 r10472, r10469, r10455; +} +{ +add.f16x2 r10475, r8228, r10472; +} +{ +sub.f16x2 r10478, r9747, r9763; +} +{ +mul.f16x2 r10481, r10478, r10456; +} +{ +add.f16x2 %34, r10475, r10481; +} +{ +add.f16x2 r10487, r9741, r9757; +} +{ +mul.f16x2 r10490, r10487, r10455; +} +{ +add.f16x2 r10493, r8228, r10490; +} +{ +sub.f16x2 r10496, r9747, r9763; +} +{ +mul.f16x2 r10499, r10496, r10456; +} +{ +sub.f16x2 %52, r10493, r10499; +} +{ +add.f16x2 r10505, r9747, r9763; +} +{ +mul.f16x2 r10508, r10505, r10455; +} +{ +add.f16x2 r10511, r8264, 
r10508; +} +{ +sub.f16x2 r10514, r9741, r9757; +} +{ +mul.f16x2 r10517, r10514, r10456; +} +{ +sub.f16x2 %35, r10511, r10517; +} +{ +add.f16x2 r10523, r9747, r9763; +} +{ +mul.f16x2 r10526, r10523, r10455; +} +{ +add.f16x2 r10529, r8264, r10526; +} +{ +sub.f16x2 r10532, r9741, r9757; +} +{ +mul.f16x2 r10535, r10532, r10456; +} +{ +add.f16x2 %53, r10529, r10535; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), 
"=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..1965e4d63483a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_fwd.hpp.inc @@ -0,0 +1,4890 @@ +#ifndef CUFFTDX_FFT_19683_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_19683_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1153, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2982>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 157464, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2981, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2980, %58, f2981; +mul.f32 f119, f2981, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2979, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2978, %64, f2979; +mul.f32 f135, f2979, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2977, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 
f150, f146, f148; +add.f32 f2976, %70, f2977; +mul.f32 f151, f2977, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2975, f133, 0f3F441B7D; +sub.f32 f159, f2975, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2973, f149, 0f3E31D0D4; +mul.f32 f2974, f155, 0fBF7C1C5C; +sub.f32 f164, f2973, f2974; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2971, f134, 0f3E31D0D4; +mul.f32 f2972, f140, 0fBF7C1C5C; +sub.f32 f169, f2971, f2972; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2969, f150, 0fBF708FB2; +mul.f32 f2970, f156, 0fBEAF1D44; +sub.f32 f174, f2969, f2970; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2968, f2978, f2976; +sub.f32 f183, f2978, f2976; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2967, f2980, f2968; +mul.f32 f187, f2968, 0f3F000000; +sub.f32 f188, f2980, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2966, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2965, f123, f2966; +mul.f32 f203, f2966, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2964, f171, f176; +sub.f32 f215, f171, f176; 
+mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2963, f124, f2964; +mul.f32 f219, f2964, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2960, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2958, %113, f2960; +mul.f32 f235, f2960, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2955, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2953, %116, f2955; +mul.f32 f251, f2955, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2950, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2948, %119, f2950; +mul.f32 f267, f2950, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2947, f249, 0f3F441B7D; +sub.f32 f275, f2947, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2946, f265, 0f3E31D0D4; +sub.f32 f280, f2946, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; 
+mul.f32 f2944, f250, 0f3E31D0D4; +mul.f32 f2945, f256, 0fBF7C1C5C; +sub.f32 f285, f2944, f2945; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2942, f266, 0fBF708FB2; +mul.f32 f2943, f272, 0fBEAF1D44; +sub.f32 f290, f2942, f2943; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2941, f2953, f2948; +sub.f32 f299, f2953, f2948; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2940, f2958, f2941; +mul.f32 f303, f2941, 0f3F000000; +sub.f32 f304, f2958, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2939, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2938, f239, f2939; +mul.f32 f319, f2939, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2937, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2936, f240, f2937; +mul.f32 f335, f2937, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2933, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2931, 
%122, f2933; +mul.f32 f351, f2933, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2928, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2926, %125, f2928; +mul.f32 f367, f2928, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2924, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2922, %127, f2924; +mul.f32 f383, f2924, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2921, f365, 0f3F441B7D; +sub.f32 f391, f2921, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2920, f381, 0f3E31D0D4; +sub.f32 f396, f2920, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2918, f366, 0f3E31D0D4; +mul.f32 f2919, f372, 0fBF7C1C5C; +sub.f32 f401, f2918, f2919; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2916, f382, 0fBF708FB2; +mul.f32 f2917, f388, 0fBEAF1D44; +sub.f32 f406, f2916, f2917; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2915, f2926, f2922; +sub.f32 f415, f2926, f2922; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 
f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2914, f2931, f2915; +mul.f32 f419, f2915, 0f3F000000; +sub.f32 f420, f2931, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2913, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2912, f355, f2913; +mul.f32 f435, f2913, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2911, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2910, f356, f2911; +mul.f32 f451, f2911, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2938, 0fBE6C2691; +mul.f32 f2909, f310, 0f3F791978; +sub.f32 f459, f2909, f458; +mul.f32 f460, f2938, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2907, f426, 0f3F64C51C; +mul.f32 f2908, f2912, 0fBEE5C902; +sub.f32 f464, f2907, f2908; +mul.f32 f465, f2912, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2905, f326, 0f3F64C51C; +mul.f32 f2906, f2936, 0fBEE5C902; +sub.f32 f469, f2905, f2906; +mul.f32 f470, f2936, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2903, f442, 0f3F18DF63; +mul.f32 f2904, f2910, 0fBF4D57F2; +sub.f32 f474, f2903, f2904; +mul.f32 f475, f2910, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2901, f301, 0f3F441B7D; +mul.f32 f2902, f307, 0fBF248DBB; +sub.f32 f479, f2901, f2902; +mul.f32 f480, f307, 0f3F441B7D; 
+fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2900, f417, 0f3E31D0D4; +sub.f32 f484, f2900, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2899, f317, 0f3F18DF63; +sub.f32 f489, f2899, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2898, f433, 0fBE92D7E0; +sub.f32 f494, f2898, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2897, f333, 0f3ECACAF8; +sub.f32 f499, f2897, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2896, f449, 0fBF2FAD88; +sub.f32 f504, f2896, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2895, f302, 0f3E31D0D4; +sub.f32 f509, f2895, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2893, f418, 0fBF708FB2; +mul.f32 f2894, f424, 0fBEAF1D44; +sub.f32 f514, f2893, f2894; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2891, f318, 0fBD6E2946; +mul.f32 f2892, f324, 0fBF7F9120; +sub.f32 f519, f2891, f2892; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2889, f434, 0fBF7E44DE; +mul.f32 f2890, f440, 0f3DEDC21F; +sub.f32 f524, f2889, f2890; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2888, f334, 0fBE92D7E0; +sub.f32 f529, f2888, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2887, f450, 0fBF55E287; +sub.f32 f534, f2887, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, 
f539; +add.f32 f2886, f2940, f2914; +sub.f32 f541, f2940, f2914; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f2886, 0f3F000000; +sub.f32 f546, f2967, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2885, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2884, f2965, f2885; +mul.f32 f561, f2885, 0f3F000000; +sub.f32 f562, f2965, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2883, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2882, f2963, f2883; +mul.f32 f577, f2883, 0f3F000000; +sub.f32 f578, f2963, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2881, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2880, f191, f2881; +mul.f32 f593, f2881, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2879, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2878, f207, f2879; 
+mul.f32 f609, f2879, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2877, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2876, f223, f2877; +mul.f32 f625, f2877, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2875, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2874, f192, f2875; +mul.f32 f641, f2875, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2873, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2872, f208, f2873; +mul.f32 f657, f2873, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2871, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2870, f224, f2871; +mul.f32 f673, f2871, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 
f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 157464, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f2884; +mul.f32 f685, f679, f2884; +mul.f32 f2868, f679, f679; +mul.f32 f2869, f680, f680; +sub.f32 f688, f2868, f2869; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f2882; +mul.f32 f693, f688, f2882; +mul.f32 f695, f680, f690; +mul.f32 f2867, f679, f688; +sub.f32 f696, f2867, f695; +mul.f32 f2866, f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f2880; +mul.f32 f701, f696, f2880; +mul.f32 f2864, f679, f696; +mul.f32 f2865, f680, f698; +sub.f32 f704, f2864, f2865; +mul.f32 f2863, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f2878; +mul.f32 f709, f704, f2878; +mul.f32 f711, f680, f706; +mul.f32 f2862, f679, f704; +sub.f32 f712, f2862, f711; +mul.f32 f2861, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f2876; +mul.f32 f717, f712, f2876; +mul.f32 f719, f680, f714; +mul.f32 f2860, f679, f712; +sub.f32 f720, f2860, f719; +mul.f32 f2859, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f2874; +mul.f32 f725, f720, f2874; +mul.f32 f2857, f679, f720; +mul.f32 f2858, f680, f722; +sub.f32 f728, f2857, f2858; +mul.f32 f2856, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f2872; +mul.f32 f733, f728, f2872; +mul.f32 f735, f680, f730; +mul.f32 f2855, f679, f728; +sub.f32 f736, f2855, f735; +mul.f32 f2854, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f2870; +mul.f32 f741, f736, f2870; +mul.f32 f743, f680, f738; +mul.f32 f2853, 
f679, f736; +sub.f32 f744, f2853, f743; +mul.f32 f2852, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f2850, f679, f744; +mul.f32 f2851, f680, f746; +sub.f32 f752, f2850, f2851; +mul.f32 f2849, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2848, f679, f752; +sub.f32 f760, f2848, f759; +mul.f32 f2847, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f2845, f679, f760; +mul.f32 f2846, f680, f762; +sub.f32 f768, f2845, f2846; +mul.f32 f2844, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2843, f679, f768; +sub.f32 f776, f2843, f775; +mul.f32 f2842, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2841, f679, f776; +sub.f32 f784, f2841, f783; +mul.f32 f2840, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f2838, f679, f784; +mul.f32 f2839, f680, f786; +sub.f32 f792, f2838, f2839; +mul.f32 f2837, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2836, f679, f792; +sub.f32 f800, f2836, f799; +mul.f32 f2835, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2834, f679, f800; +sub.f32 f808, f2834, f807; +mul.f32 f2833, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 
f813, f808, f677; +mul.f32 f2831, f679, f808; +mul.f32 f2832, f680, f810; +sub.f32 f816, f2831, f2832; +mul.f32 f2830, f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2829, f679, f816; +sub.f32 f824, f2829, f823; +mul.f32 f2828, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f2826, f679, f824; +mul.f32 f2827, f680, f826; +sub.f32 f832, f2826, f2827; +mul.f32 f2825, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, f582; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2824, f679, f832; +sub.f32 f840, f2824, f839; +mul.f32 f2823, f832, f576; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2822, f679, f840; +sub.f32 f848, f2822, f847; +mul.f32 f2821, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f2819, f679, f848; +mul.f32 f2820, f680, f850; +sub.f32 f856, f2819, f2820; +mul.f32 f2818, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2817, f679, f856; +sub.f32 f864, f2817, f863; +mul.f32 f2816, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2815, f679, f864; +sub.f32 f872, f2815, f871; +mul.f32 f2814, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f2812, f679, f872; +mul.f32 f2813, f680, f874; +sub.f32 f880, f2812, f2813; +mul.f32 f2811, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 
f2810, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2967, f2886; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f2811, f684; +st.shared.v2.f32 [r21+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f2866, f692; +st.shared.v2.f32 [r21+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f2863, f700; +st.shared.v2.f32 [r21+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f2861, f708; +st.shared.v2.f32 [r21+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f2859, f716; +st.shared.v2.f32 [r21+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; +sub.f32 f899, f2856, f724; +st.shared.v2.f32 [r21+48], {f899, f898}; +sub.f32 f900, f2854, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r21+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f2852, f740; +st.shared.v2.f32 [r21+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f2849, f748; +st.shared.v2.f32 [r21+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f2847, f756; +st.shared.v2.f32 [r21+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f2844, f764; +st.shared.v2.f32 [r21+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f2842, f772; +st.shared.v2.f32 [r21+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f2840, f780; +st.shared.v2.f32 [r21+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f2837, f788; +st.shared.v2.f32 [r21+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f2835, f796; +st.shared.v2.f32 [r21+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f2833, f804; +st.shared.v2.f32 [r21+128], {f919, f918}; +fma.rn.f32 f920, 
f810, f671, f813; +sub.f32 f921, f2830, f812; +st.shared.v2.f32 [r21+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, f821; +sub.f32 f923, f2828, f820; +st.shared.v2.f32 [r21+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f2825, f828; +st.shared.v2.f32 [r21+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f2823, f836; +st.shared.v2.f32 [r21+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, f2821, f844; +st.shared.v2.f32 [r21+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f2818, f852; +st.shared.v2.f32 [r21+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f2816, f860; +st.shared.v2.f32 [r21+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f2814, f868; +st.shared.v2.f32 [r21+192], {f935, f934}; +fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f2810, f876; +st.shared.v2.f32 [r21+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r21+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+5832]; +ld.shared.v2.f32 {f948, f949}, [r10+11664]; +ld.shared.v2.f32 {f952, f953}, [r10+17496]; +ld.shared.v2.f32 {f956, f957}, [r10+23328]; +ld.shared.v2.f32 {f960, f961}, [r10+29160]; +ld.shared.v2.f32 {f964, f965}, [r10+34992]; +ld.shared.v2.f32 {f968, f969}, [r10+40824]; +ld.shared.v2.f32 {f972, f973}, [r10+46656]; +ld.shared.v2.f32 {f976, f977}, [r10+52488]; +ld.shared.v2.f32 {f980, f981}, [r10+58320]; +ld.shared.v2.f32 {f984, f985}, [r10+64152]; +ld.shared.v2.f32 {f988, f989}, [r10+69984]; +ld.shared.v2.f32 {f992, f993}, [r10+75816]; +ld.shared.v2.f32 {f996, f997}, [r10+81648]; +ld.shared.v2.f32 {f1000, f1001}, [r10+87480]; +ld.shared.v2.f32 {f1004, f1005}, [r10+93312]; +ld.shared.v2.f32 {f1008, f1009}, [r10+99144]; +ld.shared.v2.f32 {f1012, f1013}, [r10+104976]; +ld.shared.v2.f32 {f1016, f1017}, 
[r10+110808]; +ld.shared.v2.f32 {f1020, f1021}, [r10+116640]; +ld.shared.v2.f32 {f1024, f1025}, [r10+122472]; +ld.shared.v2.f32 {f1028, f1029}, [r10+128304]; +ld.shared.v2.f32 {f1032, f1033}, [r10+134136]; +ld.shared.v2.f32 {f1036, f1037}, [r10+139968]; +ld.shared.v2.f32 {f1040, f1041}, [r10+145800]; +ld.shared.v2.f32 {f1044, f1045}, [r10+151632]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2809, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0f3F5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f2808, f941, f2809; +mul.f32 f1058, f2809, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0f3F5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2807, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2806, f953, f2807; +mul.f32 f1074, f2807, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0f3F5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2805, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2804, f965, f2805; +mul.f32 f1090, f2805, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0f3F5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2802, f1072, 0f3F441B7D; +mul.f32 f2803, f1078, 0fBF248DBB; +sub.f32 f1098, f2802, f2803; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 
0fBF248DBB, f1099; +mul.f32 f2800, f1088, 0f3E31D0D4; +mul.f32 f2801, f1094, 0fBF7C1C5C; +sub.f32 f1103, f2800, f2801; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0fBF7C1C5C, f1104; +mul.f32 f2798, f1073, 0f3E31D0D4; +mul.f32 f2799, f1079, 0fBF7C1C5C; +sub.f32 f1108, f2798, f2799; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0fBF7C1C5C, f1109; +mul.f32 f1112, f1095, 0fBEAF1D44; +mul.f32 f2797, f1089, 0fBF708FB2; +sub.f32 f1113, f2797, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0fBEAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2796, f2806, f2804; +sub.f32 f1122, f2806, f2804; +mul.f32 f1123, f1122, 0f3F5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2795, f2808, f2796; +mul.f32 f1126, f2796, 0f3F000000; +sub.f32 f1127, f2808, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0f3F5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2794, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0f3F5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2793, f1062, f2794; +mul.f32 f1142, f2794, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0f3F5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2792, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0f3F5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2791, f1063, f2792; +mul.f32 f1158, f2792, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0f3F5DB3D7; 
+sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2790, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0f3F5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2789, f945, f2790; +mul.f32 f1174, f2790, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0f3F5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2788, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0f3F5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2787, f957, f2788; +mul.f32 f1190, f2788, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0f3F5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2786, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0f3F5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2785, f969, f2786; +mul.f32 f1206, f2786, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0f3F5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2783, f1188, 0f3F441B7D; +mul.f32 f2784, f1194, 0fBF248DBB; +sub.f32 f1214, f2783, f2784; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0fBF248DBB, f1215; +mul.f32 f2781, f1204, 0f3E31D0D4; +mul.f32 f2782, f1210, 0fBF7C1C5C; +sub.f32 f1219, f2781, f2782; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0fBF7C1C5C, f1220; +mul.f32 f2779, f1189, 0f3E31D0D4; +mul.f32 f2780, f1195, 0fBF7C1C5C; +sub.f32 f1224, f2779, f2780; 
+mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0fBF7C1C5C, f1225; +mul.f32 f2777, f1205, 0fBF708FB2; +mul.f32 f2778, f1211, 0fBEAF1D44; +sub.f32 f1229, f2777, f2778; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0fBEAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2776, f2787, f2785; +sub.f32 f1238, f2787, f2785; +mul.f32 f1239, f1238, 0f3F5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2775, f2789, f2776; +mul.f32 f1242, f2776, 0f3F000000; +sub.f32 f1243, f2789, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0f3F5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f2774, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0f3F5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f2773, f1178, f2774; +mul.f32 f1258, f2774, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0f3F5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2772, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0f3F5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2771, f1179, f2772; +mul.f32 f1274, f2772, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0f3F5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2770, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0f3F5DB3D7; +add.f32 f1288, f1287, 
f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2769, f949, f2770; +mul.f32 f1290, f2770, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0f3F5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2768, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0f3F5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2767, f961, f2768; +mul.f32 f1306, f2768, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0f3F5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f2766, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0f3F5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f2765, f973, f2766; +mul.f32 f1322, f2766, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0f3F5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0fBF248DBB; +mul.f32 f2764, f1304, 0f3F441B7D; +sub.f32 f1330, f2764, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0fBF248DBB, f1331; +mul.f32 f2762, f1320, 0f3E31D0D4; +mul.f32 f2763, f1326, 0fBF7C1C5C; +sub.f32 f1335, f2762, f2763; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0fBF7C1C5C, f1336; +mul.f32 f2760, f1305, 0f3E31D0D4; +mul.f32 f2761, f1311, 0fBF7C1C5C; +sub.f32 f1340, f2760, f2761; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0fBF7C1C5C, f1341; +mul.f32 f2758, f1321, 0fBF708FB2; +mul.f32 f2759, f1327, 0fBEAF1D44; +sub.f32 f1345, f2758, f2759; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0fBEAF1D44, f1346; +add.f32 f1348, f1297, f1313; 
+add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2757, f2767, f2765; +sub.f32 f1354, f2767, f2765; +mul.f32 f1355, f1354, 0f3F5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2756, f2769, f2757; +mul.f32 f1358, f2757, 0f3F000000; +sub.f32 f1359, f2769, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0f3F5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2755, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0f3F5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2754, f1294, f2755; +mul.f32 f1374, f2755, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0f3F5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2753, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0f3F5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2752, f1295, f2753; +mul.f32 f1390, f2753, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0f3F5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2773, 0fBE6C2691; +mul.f32 f2751, f1249, 0f3F791978; +sub.f32 f1398, f2751, f1397; +mul.f32 f1399, f2773, 0f3F791978; +fma.rn.f32 f1400, f1249, 0fBE6C2691, f1399; +mul.f32 f1402, f2754, 0fBEE5C902; +mul.f32 f2750, f1365, 0f3F64C51C; +sub.f32 f1403, f2750, f1402; +mul.f32 f1404, f2754, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0fBEE5C902, f1404; +mul.f32 f1407, f2771, 0fBEE5C902; +mul.f32 f2749, f1265, 0f3F64C51C; +sub.f32 f1408, f2749, f1407; +mul.f32 f1409, f2771, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 
0fBEE5C902, f1409; +mul.f32 f2747, f1381, 0f3F18DF63; +mul.f32 f2748, f2752, 0fBF4D57F2; +sub.f32 f1413, f2747, f2748; +mul.f32 f1414, f2752, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0fBF4D57F2, f1414; +mul.f32 f2745, f1240, 0f3F441B7D; +mul.f32 f2746, f1246, 0fBF248DBB; +sub.f32 f1418, f2745, f2746; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0fBF248DBB, f1419; +mul.f32 f2743, f1356, 0f3E31D0D4; +mul.f32 f2744, f1362, 0fBF7C1C5C; +sub.f32 f1423, f2743, f2744; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0fBF7C1C5C, f1424; +mul.f32 f2741, f1256, 0f3F18DF63; +mul.f32 f2742, f1262, 0fBF4D57F2; +sub.f32 f1428, f2741, f2742; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0fBF4D57F2, f1429; +mul.f32 f1432, f1378, 0fBF753ECD; +mul.f32 f2740, f1372, 0fBE92D7E0; +sub.f32 f1433, f2740, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0fBF753ECD, f1434; +mul.f32 f1437, f1278, 0fBF6B1036; +mul.f32 f2739, f1272, 0f3ECACAF8; +sub.f32 f1438, f2739, f1437; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0fBF6B1036, f1439; +mul.f32 f1442, f1394, 0fBF3A3529; +mul.f32 f2738, f1388, 0fBF2FAD88; +sub.f32 f1443, f2738, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0fBF3A3529, f1444; +mul.f32 f1447, f1247, 0fBF7C1C5C; +mul.f32 f2737, f1241, 0f3E31D0D4; +sub.f32 f1448, f2737, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0fBF7C1C5C, f1449; +mul.f32 f1452, f1363, 0fBEAF1D44; +mul.f32 f2736, f1357, 0fBF708FB2; +sub.f32 f1453, f2736, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0fBEAF1D44, f1454; +mul.f32 f1457, f1263, 0fBF7F9120; +mul.f32 f2735, f1257, 0fBD6E2946; +sub.f32 f1458, f2735, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0fBF7F9120, f1459; +mul.f32 f2733, f1373, 0fBF7E44DE; +mul.f32 f2734, f1379, 0f3DEDC21F; +sub.f32 f1463, f2733, f2734; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0f3DEDC21F, f1464; 
+mul.f32 f2731, f1273, 0fBE92D7E0; +mul.f32 f2732, f1279, 0fBF753ECD; +sub.f32 f1468, f2731, f2732; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0fBF753ECD, f1469; +mul.f32 f2729, f1389, 0fBF55E287; +mul.f32 f2730, f1395, 0f3F0CAC9F; +sub.f32 f1473, f2729, f2730; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0f3F0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2728, f2775, f2756; +sub.f32 f1480, f2775, f2756; +mul.f32 f1481, f1480, 0f3F5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, f2728, 0f3F000000; +sub.f32 f1485, f2795, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0f3F5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2727, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0f3F5DB3D7; +add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2726, f2793, f2727; +mul.f32 f1500, f2727, 0f3F000000; +sub.f32 f1501, f2793, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0f3F5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2725, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0f3F5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, f1511, f1513; +add.f32 f2724, f2791, f2725; +mul.f32 f1516, f2725, 0f3F000000; +sub.f32 f1517, f2791, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0f3F5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2723, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 
f1529, f1528, 0f3F5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2722, f1130, f2723; +mul.f32 f1532, f2723, 0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0f3F5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2721, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0f3F5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2720, f1146, f2721; +mul.f32 f1548, f2721, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0f3F5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2719, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0f3F5DB3D7; +add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2718, f1162, f2719; +mul.f32 f1564, f2719, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0f3F5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2717, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0f3F5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, f1575, f1577; +add.f32 f2716, f1131, f2717; +mul.f32 f1580, f2717, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0f3F5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2715, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 
f1593, f1592, 0f3F5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2714, f1147, f2715; +mul.f32 f1596, f2715, 0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0f3F5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2713, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0f3F5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2712, f1163, f2713; +mul.f32 f1612, f2713, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0f3F5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1623, f1619, f2726; +mul.f32 f1624, f1618, f2726; +mul.f32 f2710, f1618, f1618; +mul.f32 f2711, f1619, f1619; +sub.f32 f1627, f2710, f2711; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1631, f1629, f2724; +mul.f32 f1632, f1627, f2724; +mul.f32 f1634, f1619, f1629; +mul.f32 f2709, f1618, f1627; +sub.f32 f1635, f2709, f1634; +mul.f32 f2708, f1627, f1507; +mul.f32 f1636, f1618, f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1639, f1637, f2722; +mul.f32 f1640, f1635, f2722; +mul.f32 f2706, f1618, f1635; +mul.f32 f2707, f1619, f1637; +sub.f32 f1643, f2706, f2707; +mul.f32 f2705, f1635, f1523; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1647, f1645, f2720; +mul.f32 f1648, f1643, f2720; +mul.f32 f1650, f1619, f1645; +mul.f32 f2704, f1618, f1643; +sub.f32 f1651, 
f2704, f1650; +mul.f32 f2703, f1643, f1539; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1655, f1653, f2718; +mul.f32 f1656, f1651, f2718; +mul.f32 f1658, f1619, f1653; +mul.f32 f2702, f1618, f1651; +sub.f32 f1659, f2702, f1658; +mul.f32 f2701, f1651, f1555; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1663, f1661, f2716; +mul.f32 f1664, f1659, f2716; +mul.f32 f2699, f1618, f1659; +mul.f32 f2700, f1619, f1661; +sub.f32 f1667, f2699, f2700; +mul.f32 f2698, f1659, f1571; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1671, f1669, f2714; +mul.f32 f1672, f1667, f2714; +mul.f32 f1674, f1619, f1669; +mul.f32 f2697, f1618, f1667; +sub.f32 f1675, f2697, f1674; +mul.f32 f2696, f1667, f1587; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1679, f1677, f2712; +mul.f32 f1680, f1675, f2712; +mul.f32 f1682, f1619, f1677; +mul.f32 f2695, f1618, f1675; +sub.f32 f1683, f2695, f1682; +mul.f32 f2694, f1675, f1603; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1687, f1685, f1488; +mul.f32 f1688, f1683, f1488; +mul.f32 f2692, f1618, f1683; +mul.f32 f2693, f1619, f1685; +sub.f32 f1691, f2692, f2693; +mul.f32 f2691, f1683, f1482; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1695, f1693, f1504; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2690, f1618, f1691; +sub.f32 f1699, f2690, f1698; +mul.f32 f2689, f1691, f1498; +mul.f32 f1700, f1618, f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1703, f1701, f1520; +mul.f32 f1704, f1699, f1520; +mul.f32 f2687, f1618, f1699; +mul.f32 f2688, f1619, f1701; +sub.f32 f1707, f2687, f2688; +mul.f32 f2686, f1699, f1514; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1711, f1709, f1536; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2685, f1618, f1707; +sub.f32 f1715, 
f2685, f1714; +mul.f32 f2684, f1707, f1530; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1719, f1717, f1552; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2683, f1618, f1715; +sub.f32 f1723, f2683, f1722; +mul.f32 f2682, f1715, f1546; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1727, f1725, f1568; +mul.f32 f1728, f1723, f1568; +mul.f32 f2680, f1618, f1723; +mul.f32 f2681, f1619, f1725; +sub.f32 f1731, f2680, f2681; +mul.f32 f2679, f1723, f1562; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1735, f1733, f1584; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2678, f1618, f1731; +sub.f32 f1739, f2678, f1738; +mul.f32 f2677, f1731, f1578; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1743, f1741, f1600; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2676, f1618, f1739; +sub.f32 f1747, f2676, f1746; +mul.f32 f2675, f1739, f1594; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1751, f1749, f1616; +mul.f32 f1752, f1747, f1616; +mul.f32 f2673, f1618, f1747; +mul.f32 f2674, f1619, f1749; +sub.f32 f1755, f2673, f2674; +mul.f32 f2672, f1747, f1610; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1759, f1757, f1489; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2671, f1618, f1755; +sub.f32 f1763, f2671, f1762; +mul.f32 f2670, f1755, f1483; +mul.f32 f1764, f1618, f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1767, f1765, f1505; +mul.f32 f1768, f1763, f1505; +mul.f32 f2668, f1618, f1763; +mul.f32 f2669, f1619, f1765; +sub.f32 f1771, f2668, f2669; +mul.f32 f2667, f1763, f1499; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1775, f1773, f1521; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2666, f1618, f1771; +sub.f32 f1779, 
f2666, f1778; +mul.f32 f2665, f1771, f1515; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1783, f1781, f1537; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2664, f1618, f1779; +sub.f32 f1787, f2664, f1786; +mul.f32 f2663, f1779, f1531; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1791, f1789, f1553; +mul.f32 f1792, f1787, f1553; +mul.f32 f2661, f1618, f1787; +mul.f32 f2662, f1619, f1789; +sub.f32 f1795, f2661, f2662; +mul.f32 f2660, f1787, f1547; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1799, f1797, f1569; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2659, f1618, f1795; +sub.f32 f1803, f2659, f1802; +mul.f32 f2658, f1795, f1563; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1807, f1805, f1585; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2657, f1618, f1803; +sub.f32 f1811, f2657, f1810; +mul.f32 f2656, f1803, f1579; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1815, f1813, f1601; +mul.f32 f1816, f1811, f1601; +mul.f32 f2654, f1618, f1811; +mul.f32 f2655, f1619, f1813; +sub.f32 f1819, f2654, f2655; +mul.f32 f2653, f1618, f1491; +mul.f32 f1820, f1618, f1813; +mul.f32 f2652, f1811, f1595; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1819, f1611; +mul.f32 f1823, f1821, f1617; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 5832, r19; +add.f32 f1825, f2795, f2728; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1619, f1491, f1624; +sub.f32 f1828, f2653, f1623; +st.shared.v2.f32 [r20+216], {f1828, f1827}; +fma.rn.f32 f1829, f1629, f1507, f1632; +sub.f32 f1830, f2708, f1631; +st.shared.v2.f32 [r20+432], {f1830, f1829}; +fma.rn.f32 f1831, f1637, f1523, f1640; +sub.f32 f1832, f2705, f1639; +st.shared.v2.f32 
[r20+648], {f1832, f1831}; +fma.rn.f32 f1833, f1645, f1539, f1648; +sub.f32 f1834, f2703, f1647; +st.shared.v2.f32 [r20+864], {f1834, f1833}; +fma.rn.f32 f1835, f1653, f1555, f1656; +sub.f32 f1836, f2701, f1655; +st.shared.v2.f32 [r20+1080], {f1836, f1835}; +sub.f32 f1837, f2698, f1663; +fma.rn.f32 f1838, f1661, f1571, f1664; +st.shared.v2.f32 [r20+1296], {f1837, f1838}; +fma.rn.f32 f1839, f1669, f1587, f1672; +sub.f32 f1840, f2696, f1671; +st.shared.v2.f32 [r20+1512], {f1840, f1839}; +sub.f32 f1841, f2694, f1679; +fma.rn.f32 f1842, f1677, f1603, f1680; +st.shared.v2.f32 [r20+1728], {f1841, f1842}; +fma.rn.f32 f1843, f1685, f1482, f1688; +sub.f32 f1844, f2691, f1687; +st.shared.v2.f32 [r20+1944], {f1844, f1843}; +fma.rn.f32 f1845, f1693, f1498, f1696; +sub.f32 f1846, f2689, f1695; +st.shared.v2.f32 [r20+2160], {f1846, f1845}; +fma.rn.f32 f1847, f1701, f1514, f1704; +sub.f32 f1848, f2686, f1703; +st.shared.v2.f32 [r20+2376], {f1848, f1847}; +fma.rn.f32 f1849, f1709, f1530, f1712; +sub.f32 f1850, f2684, f1711; +st.shared.v2.f32 [r20+2592], {f1850, f1849}; +fma.rn.f32 f1851, f1717, f1546, f1720; +sub.f32 f1852, f2682, f1719; +st.shared.v2.f32 [r20+2808], {f1852, f1851}; +fma.rn.f32 f1853, f1725, f1562, f1728; +sub.f32 f1854, f2679, f1727; +st.shared.v2.f32 [r20+3024], {f1854, f1853}; +fma.rn.f32 f1855, f1733, f1578, f1736; +sub.f32 f1856, f2677, f1735; +st.shared.v2.f32 [r20+3240], {f1856, f1855}; +fma.rn.f32 f1857, f1741, f1594, f1744; +sub.f32 f1858, f2675, f1743; +st.shared.v2.f32 [r20+3456], {f1858, f1857}; +fma.rn.f32 f1859, f1749, f1610, f1752; +sub.f32 f1860, f2672, f1751; +st.shared.v2.f32 [r20+3672], {f1860, f1859}; +fma.rn.f32 f1861, f1757, f1483, f1760; +sub.f32 f1862, f2670, f1759; +st.shared.v2.f32 [r20+3888], {f1862, f1861}; +fma.rn.f32 f1863, f1765, f1499, f1768; +sub.f32 f1864, f2667, f1767; +st.shared.v2.f32 [r20+4104], {f1864, f1863}; +fma.rn.f32 f1865, f1773, f1515, f1776; +sub.f32 f1866, f2665, f1775; +st.shared.v2.f32 [r20+4320], {f1866, f1865}; 
+fma.rn.f32 f1867, f1781, f1531, f1784; +sub.f32 f1868, f2663, f1783; +st.shared.v2.f32 [r20+4536], {f1868, f1867}; +fma.rn.f32 f1869, f1789, f1547, f1792; +sub.f32 f1870, f2660, f1791; +st.shared.v2.f32 [r20+4752], {f1870, f1869}; +fma.rn.f32 f1871, f1797, f1563, f1800; +sub.f32 f1872, f2658, f1799; +st.shared.v2.f32 [r20+4968], {f1872, f1871}; +fma.rn.f32 f1873, f1805, f1579, f1808; +sub.f32 f1874, f2656, f1807; +st.shared.v2.f32 [r20+5184], {f1874, f1873}; +fma.rn.f32 f1875, f1813, f1595, f1816; +sub.f32 f1876, f2652, f1815; +st.shared.v2.f32 [r20+5400], {f1876, f1875}; +fma.rn.f32 f1877, f1821, f1611, f1824; +sub.f32 f1878, f1822, f1823; +st.shared.v2.f32 [r20+5616], {f1878, f1877}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+5832]; +ld.shared.v2.f32 {f1887, f1888}, [r10+11664]; +ld.shared.v2.f32 {f1891, f1892}, [r10+17496]; +ld.shared.v2.f32 {f1895, f1896}, [r10+23328]; +ld.shared.v2.f32 {f1899, f1900}, [r10+29160]; +ld.shared.v2.f32 {f1903, f1904}, [r10+34992]; +ld.shared.v2.f32 {f1907, f1908}, [r10+40824]; +ld.shared.v2.f32 {f1911, f1912}, [r10+46656]; +ld.shared.v2.f32 {f1915, f1916}, [r10+52488]; +ld.shared.v2.f32 {f1919, f1920}, [r10+58320]; +ld.shared.v2.f32 {f1923, f1924}, [r10+64152]; +ld.shared.v2.f32 {f1927, f1928}, [r10+69984]; +ld.shared.v2.f32 {f1931, f1932}, [r10+75816]; +ld.shared.v2.f32 {f1935, f1936}, [r10+81648]; +ld.shared.v2.f32 {f1939, f1940}, [r10+87480]; +ld.shared.v2.f32 {f1943, f1944}, [r10+93312]; +ld.shared.v2.f32 {f1947, f1948}, [r10+99144]; +ld.shared.v2.f32 {f1951, f1952}, [r10+104976]; +ld.shared.v2.f32 {f1955, f1956}, [r10+110808]; +ld.shared.v2.f32 {f1959, f1960}, [r10+116640]; +ld.shared.v2.f32 {f1963, f1964}, [r10+122472]; +ld.shared.v2.f32 {f1967, f1968}, [r10+128304]; +ld.shared.v2.f32 {f1971, f1972}, [r10+134136]; +ld.shared.v2.f32 {f1975, f1976}, [r10+139968]; +ld.shared.v2.f32 {f1979, f1980}, [r10+145800]; +ld.shared.v2.f32 {f1983, f1984}, [r10+151632]; +add.f32 f1987, 
f1915, f1951; +add.f32 f1988, f1879, f1987; +mul.f32 f1991, f1987, 0f3F000000; +sub.f32 f1992, f1879, f1991; +add.f32 f2651, f1916, f1952; +sub.f32 f1993, f1916, f1952; +mul.f32 f1994, f1993, 0f3F5DB3D7; +add.f32 f1995, f1994, f1992; +sub.f32 f1996, f1992, f1994; +add.f32 f2650, f1880, f2651; +mul.f32 f1997, f2651, 0f3F000000; +sub.f32 f1998, f1880, f1997; +sub.f32 f1999, f1915, f1951; +mul.f32 f2000, f1999, 0f3F5DB3D7; +sub.f32 f2001, f1998, f2000; +add.f32 f2002, f2000, f1998; +add.f32 f2003, f1927, f1963; +add.f32 f2004, f1891, f2003; +mul.f32 f2007, f2003, 0f3F000000; +sub.f32 f2008, f1891, f2007; +add.f32 f2649, f1928, f1964; +sub.f32 f2009, f1928, f1964; +mul.f32 f2010, f2009, 0f3F5DB3D7; +add.f32 f2011, f2010, f2008; +sub.f32 f2012, f2008, f2010; +add.f32 f2648, f1892, f2649; +mul.f32 f2013, f2649, 0f3F000000; +sub.f32 f2014, f1892, f2013; +sub.f32 f2015, f1927, f1963; +mul.f32 f2016, f2015, 0f3F5DB3D7; +sub.f32 f2017, f2014, f2016; +add.f32 f2018, f2016, f2014; +add.f32 f2019, f1939, f1975; +add.f32 f2020, f1903, f2019; +mul.f32 f2023, f2019, 0f3F000000; +sub.f32 f2024, f1903, f2023; +add.f32 f2647, f1940, f1976; +sub.f32 f2025, f1940, f1976; +mul.f32 f2026, f2025, 0f3F5DB3D7; +add.f32 f2027, f2026, f2024; +sub.f32 f2028, f2024, f2026; +add.f32 f2646, f1904, f2647; +mul.f32 f2029, f2647, 0f3F000000; +sub.f32 f2030, f1904, f2029; +sub.f32 f2031, f1939, f1975; +mul.f32 f2032, f2031, 0f3F5DB3D7; +sub.f32 f2033, f2030, f2032; +add.f32 f2034, f2032, f2030; +mul.f32 f2036, f2017, 0fBF248DBB; +mul.f32 f2645, f2011, 0f3F441B7D; +sub.f32 f2037, f2645, f2036; +mul.f32 f2038, f2017, 0f3F441B7D; +fma.rn.f32 f2039, f2011, 0fBF248DBB, f2038; +mul.f32 f2041, f2033, 0fBF7C1C5C; +mul.f32 f2644, f2027, 0f3E31D0D4; +sub.f32 f2042, f2644, f2041; +mul.f32 f2043, f2033, 0f3E31D0D4; +fma.rn.f32 f2044, f2027, 0fBF7C1C5C, f2043; +mul.f32 f2046, f2018, 0fBF7C1C5C; +mul.f32 f2643, f2012, 0f3E31D0D4; +sub.f32 f2047, f2643, f2046; +mul.f32 f2048, f2018, 0f3E31D0D4; +fma.rn.f32 f2049, 
f2012, 0fBF7C1C5C, f2048; +mul.f32 f2051, f2034, 0fBEAF1D44; +mul.f32 f2642, f2028, 0fBF708FB2; +sub.f32 f2052, f2642, f2051; +mul.f32 f2053, f2034, 0fBF708FB2; +fma.rn.f32 f2054, f2028, 0fBEAF1D44, f2053; +add.f32 f2055, f2004, f2020; +add.f32 f2056, f1988, f2055; +mul.f32 f2059, f2055, 0f3F000000; +sub.f32 f2060, f1988, f2059; +add.f32 f2641, f2648, f2646; +sub.f32 f2061, f2648, f2646; +mul.f32 f2062, f2061, 0f3F5DB3D7; +add.f32 f2063, f2062, f2060; +sub.f32 f2064, f2060, f2062; +add.f32 f2640, f2650, f2641; +mul.f32 f2065, f2641, 0f3F000000; +sub.f32 f2066, f2650, f2065; +sub.f32 f2067, f2004, f2020; +mul.f32 f2068, f2067, 0f3F5DB3D7; +sub.f32 f2069, f2066, f2068; +add.f32 f2070, f2068, f2066; +add.f32 f2071, f2037, f2042; +add.f32 f2072, f1995, f2071; +mul.f32 f2075, f2071, 0f3F000000; +sub.f32 f2076, f1995, f2075; +add.f32 f2639, f2039, f2044; +sub.f32 f2077, f2039, f2044; +mul.f32 f2078, f2077, 0f3F5DB3D7; +add.f32 f2079, f2078, f2076; +sub.f32 f2080, f2076, f2078; +add.f32 f2638, f2001, f2639; +mul.f32 f2081, f2639, 0f3F000000; +sub.f32 f2082, f2001, f2081; +sub.f32 f2083, f2037, f2042; +mul.f32 f2084, f2083, 0f3F5DB3D7; +sub.f32 f2085, f2082, f2084; +add.f32 f2086, f2084, f2082; +add.f32 f2087, f2047, f2052; +add.f32 f2088, f1996, f2087; +mul.f32 f2091, f2087, 0f3F000000; +sub.f32 f2092, f1996, f2091; +add.f32 f2637, f2049, f2054; +sub.f32 f2093, f2049, f2054; +mul.f32 f2094, f2093, 0f3F5DB3D7; +add.f32 f2095, f2094, f2092; +sub.f32 f2096, f2092, f2094; +add.f32 f2636, f2002, f2637; +mul.f32 f2097, f2637, 0f3F000000; +sub.f32 f2098, f2002, f2097; +sub.f32 f2099, f2047, f2052; +mul.f32 f2100, f2099, 0f3F5DB3D7; +sub.f32 f2101, f2098, f2100; +add.f32 f2102, f2100, f2098; +add.f32 f2103, f1919, f1955; +add.f32 f2104, f1883, f2103; +mul.f32 f2107, f2103, 0f3F000000; +sub.f32 f2108, f1883, f2107; +add.f32 f2635, f1920, f1956; +sub.f32 f2109, f1920, f1956; +mul.f32 f2110, f2109, 0f3F5DB3D7; +add.f32 f2111, f2110, f2108; +sub.f32 f2112, f2108, f2110; +add.f32 
f2634, f1884, f2635; +mul.f32 f2113, f2635, 0f3F000000; +sub.f32 f2114, f1884, f2113; +sub.f32 f2115, f1919, f1955; +mul.f32 f2116, f2115, 0f3F5DB3D7; +sub.f32 f2117, f2114, f2116; +add.f32 f2118, f2116, f2114; +add.f32 f2119, f1931, f1967; +add.f32 f2120, f1895, f2119; +mul.f32 f2123, f2119, 0f3F000000; +sub.f32 f2124, f1895, f2123; +add.f32 f2633, f1932, f1968; +sub.f32 f2125, f1932, f1968; +mul.f32 f2126, f2125, 0f3F5DB3D7; +add.f32 f2127, f2126, f2124; +sub.f32 f2128, f2124, f2126; +add.f32 f2632, f1896, f2633; +mul.f32 f2129, f2633, 0f3F000000; +sub.f32 f2130, f1896, f2129; +sub.f32 f2131, f1931, f1967; +mul.f32 f2132, f2131, 0f3F5DB3D7; +sub.f32 f2133, f2130, f2132; +add.f32 f2134, f2132, f2130; +add.f32 f2135, f1943, f1979; +add.f32 f2136, f1907, f2135; +mul.f32 f2139, f2135, 0f3F000000; +sub.f32 f2140, f1907, f2139; +add.f32 f2631, f1944, f1980; +sub.f32 f2141, f1944, f1980; +mul.f32 f2142, f2141, 0f3F5DB3D7; +add.f32 f2143, f2142, f2140; +sub.f32 f2144, f2140, f2142; +add.f32 f2630, f1908, f2631; +mul.f32 f2145, f2631, 0f3F000000; +sub.f32 f2146, f1908, f2145; +sub.f32 f2147, f1943, f1979; +mul.f32 f2148, f2147, 0f3F5DB3D7; +sub.f32 f2149, f2146, f2148; +add.f32 f2150, f2148, f2146; +mul.f32 f2152, f2133, 0fBF248DBB; +mul.f32 f2629, f2127, 0f3F441B7D; +sub.f32 f2153, f2629, f2152; +mul.f32 f2154, f2133, 0f3F441B7D; +fma.rn.f32 f2155, f2127, 0fBF248DBB, f2154; +mul.f32 f2157, f2149, 0fBF7C1C5C; +mul.f32 f2628, f2143, 0f3E31D0D4; +sub.f32 f2158, f2628, f2157; +mul.f32 f2159, f2149, 0f3E31D0D4; +fma.rn.f32 f2160, f2143, 0fBF7C1C5C, f2159; +mul.f32 f2162, f2134, 0fBF7C1C5C; +mul.f32 f2627, f2128, 0f3E31D0D4; +sub.f32 f2163, f2627, f2162; +mul.f32 f2164, f2134, 0f3E31D0D4; +fma.rn.f32 f2165, f2128, 0fBF7C1C5C, f2164; +mul.f32 f2167, f2150, 0fBEAF1D44; +mul.f32 f2626, f2144, 0fBF708FB2; +sub.f32 f2168, f2626, f2167; +mul.f32 f2169, f2150, 0fBF708FB2; +fma.rn.f32 f2170, f2144, 0fBEAF1D44, f2169; +add.f32 f2171, f2120, f2136; +add.f32 f2172, f2104, f2171; +mul.f32 
f2175, f2171, 0f3F000000; +sub.f32 f2176, f2104, f2175; +add.f32 f2625, f2632, f2630; +sub.f32 f2177, f2632, f2630; +mul.f32 f2178, f2177, 0f3F5DB3D7; +add.f32 f2179, f2178, f2176; +sub.f32 f2180, f2176, f2178; +add.f32 f2624, f2634, f2625; +mul.f32 f2181, f2625, 0f3F000000; +sub.f32 f2182, f2634, f2181; +sub.f32 f2183, f2120, f2136; +mul.f32 f2184, f2183, 0f3F5DB3D7; +sub.f32 f2185, f2182, f2184; +add.f32 f2186, f2184, f2182; +add.f32 f2187, f2153, f2158; +add.f32 f2188, f2111, f2187; +mul.f32 f2191, f2187, 0f3F000000; +sub.f32 f2192, f2111, f2191; +add.f32 f2623, f2155, f2160; +sub.f32 f2193, f2155, f2160; +mul.f32 f2194, f2193, 0f3F5DB3D7; +add.f32 f2195, f2194, f2192; +sub.f32 f2196, f2192, f2194; +add.f32 f2622, f2117, f2623; +mul.f32 f2197, f2623, 0f3F000000; +sub.f32 f2198, f2117, f2197; +sub.f32 f2199, f2153, f2158; +mul.f32 f2200, f2199, 0f3F5DB3D7; +sub.f32 f2201, f2198, f2200; +add.f32 f2202, f2200, f2198; +add.f32 f2203, f2163, f2168; +add.f32 f2204, f2112, f2203; +mul.f32 f2207, f2203, 0f3F000000; +sub.f32 f2208, f2112, f2207; +add.f32 f2621, f2165, f2170; +sub.f32 f2209, f2165, f2170; +mul.f32 f2210, f2209, 0f3F5DB3D7; +add.f32 f2211, f2210, f2208; +sub.f32 f2212, f2208, f2210; +add.f32 f2620, f2118, f2621; +mul.f32 f2213, f2621, 0f3F000000; +sub.f32 f2214, f2118, f2213; +sub.f32 f2215, f2163, f2168; +mul.f32 f2216, f2215, 0f3F5DB3D7; +sub.f32 f2217, f2214, f2216; +add.f32 f2218, f2216, f2214; +add.f32 f2219, f1923, f1959; +add.f32 f2220, f1887, f2219; +mul.f32 f2223, f2219, 0f3F000000; +sub.f32 f2224, f1887, f2223; +add.f32 f2619, f1924, f1960; +sub.f32 f2225, f1924, f1960; +mul.f32 f2226, f2225, 0f3F5DB3D7; +add.f32 f2227, f2226, f2224; +sub.f32 f2228, f2224, f2226; +add.f32 f2618, f1888, f2619; +mul.f32 f2229, f2619, 0f3F000000; +sub.f32 f2230, f1888, f2229; +sub.f32 f2231, f1923, f1959; +mul.f32 f2232, f2231, 0f3F5DB3D7; +sub.f32 f2233, f2230, f2232; +add.f32 f2234, f2232, f2230; +add.f32 f2235, f1935, f1971; +add.f32 f2236, f1899, f2235; +mul.f32 
f2239, f2235, 0f3F000000; +sub.f32 f2240, f1899, f2239; +add.f32 f2617, f1936, f1972; +sub.f32 f2241, f1936, f1972; +mul.f32 f2242, f2241, 0f3F5DB3D7; +add.f32 f2243, f2242, f2240; +sub.f32 f2244, f2240, f2242; +add.f32 f2616, f1900, f2617; +mul.f32 f2245, f2617, 0f3F000000; +sub.f32 f2246, f1900, f2245; +sub.f32 f2247, f1935, f1971; +mul.f32 f2248, f2247, 0f3F5DB3D7; +sub.f32 f2249, f2246, f2248; +add.f32 f2250, f2248, f2246; +add.f32 f2251, f1947, f1983; +add.f32 f2252, f1911, f2251; +mul.f32 f2255, f2251, 0f3F000000; +sub.f32 f2256, f1911, f2255; +add.f32 f2615, f1948, f1984; +sub.f32 f2257, f1948, f1984; +mul.f32 f2258, f2257, 0f3F5DB3D7; +add.f32 f2259, f2258, f2256; +sub.f32 f2260, f2256, f2258; +add.f32 f2614, f1912, f2615; +mul.f32 f2261, f2615, 0f3F000000; +sub.f32 f2262, f1912, f2261; +sub.f32 f2263, f1947, f1983; +mul.f32 f2264, f2263, 0f3F5DB3D7; +sub.f32 f2265, f2262, f2264; +add.f32 f2266, f2264, f2262; +mul.f32 f2268, f2249, 0fBF248DBB; +mul.f32 f2613, f2243, 0f3F441B7D; +sub.f32 f2269, f2613, f2268; +mul.f32 f2270, f2249, 0f3F441B7D; +fma.rn.f32 f2271, f2243, 0fBF248DBB, f2270; +mul.f32 f2273, f2265, 0fBF7C1C5C; +mul.f32 f2612, f2259, 0f3E31D0D4; +sub.f32 f2274, f2612, f2273; +mul.f32 f2275, f2265, 0f3E31D0D4; +fma.rn.f32 f2276, f2259, 0fBF7C1C5C, f2275; +mul.f32 f2278, f2250, 0fBF7C1C5C; +mul.f32 f2611, f2244, 0f3E31D0D4; +sub.f32 f2279, f2611, f2278; +mul.f32 f2280, f2250, 0f3E31D0D4; +fma.rn.f32 f2281, f2244, 0fBF7C1C5C, f2280; +mul.f32 f2283, f2266, 0fBEAF1D44; +mul.f32 f2610, f2260, 0fBF708FB2; +sub.f32 f2284, f2610, f2283; +mul.f32 f2285, f2266, 0fBF708FB2; +fma.rn.f32 f2286, f2260, 0fBEAF1D44, f2285; +add.f32 f2287, f2236, f2252; +add.f32 f2288, f2220, f2287; +mul.f32 f2291, f2287, 0f3F000000; +sub.f32 f2292, f2220, f2291; +add.f32 f2609, f2616, f2614; +sub.f32 f2293, f2616, f2614; +mul.f32 f2294, f2293, 0f3F5DB3D7; +add.f32 f2295, f2294, f2292; +sub.f32 f2296, f2292, f2294; +add.f32 f2608, f2618, f2609; +mul.f32 f2297, f2609, 0f3F000000; 
+sub.f32 f2298, f2618, f2297; +sub.f32 f2299, f2236, f2252; +mul.f32 f2300, f2299, 0f3F5DB3D7; +sub.f32 f2301, f2298, f2300; +add.f32 f2302, f2300, f2298; +add.f32 f2303, f2269, f2274; +add.f32 f2304, f2227, f2303; +mul.f32 f2307, f2303, 0f3F000000; +sub.f32 f2308, f2227, f2307; +add.f32 f2607, f2271, f2276; +sub.f32 f2309, f2271, f2276; +mul.f32 f2310, f2309, 0f3F5DB3D7; +add.f32 f2311, f2310, f2308; +sub.f32 f2312, f2308, f2310; +add.f32 f2606, f2233, f2607; +mul.f32 f2313, f2607, 0f3F000000; +sub.f32 f2314, f2233, f2313; +sub.f32 f2315, f2269, f2274; +mul.f32 f2316, f2315, 0f3F5DB3D7; +sub.f32 f2317, f2314, f2316; +add.f32 f2318, f2316, f2314; +add.f32 f2319, f2279, f2284; +add.f32 f2320, f2228, f2319; +mul.f32 f2323, f2319, 0f3F000000; +sub.f32 f2324, f2228, f2323; +add.f32 f2605, f2281, f2286; +sub.f32 f2325, f2281, f2286; +mul.f32 f2326, f2325, 0f3F5DB3D7; +add.f32 f2327, f2326, f2324; +sub.f32 f2328, f2324, f2326; +add.f32 f2604, f2234, f2605; +mul.f32 f2329, f2605, 0f3F000000; +sub.f32 f2330, f2234, f2329; +sub.f32 f2331, f2279, f2284; +mul.f32 f2332, f2331, 0f3F5DB3D7; +sub.f32 f2333, f2330, f2332; +add.f32 f2334, f2332, f2330; +mul.f32 f2602, f2188, 0f3F791978; +mul.f32 f2603, f2622, 0fBE6C2691; +sub.f32 f2337, f2602, f2603; +mul.f32 f2338, f2622, 0f3F791978; +fma.rn.f32 f2339, f2188, 0fBE6C2691, f2338; +mul.f32 f2600, f2304, 0f3F64C51C; +mul.f32 f2601, f2606, 0fBEE5C902; +sub.f32 f2342, f2600, f2601; +mul.f32 f2343, f2606, 0f3F64C51C; +fma.rn.f32 f2344, f2304, 0fBEE5C902, f2343; +mul.f32 f2598, f2204, 0f3F64C51C; +mul.f32 f2599, f2620, 0fBEE5C902; +sub.f32 f2347, f2598, f2599; +mul.f32 f2348, f2620, 0f3F64C51C; +fma.rn.f32 f2349, f2204, 0fBEE5C902, f2348; +mul.f32 f2351, f2604, 0fBF4D57F2; +mul.f32 f2597, f2320, 0f3F18DF63; +sub.f32 f2352, f2597, f2351; +mul.f32 f2353, f2604, 0f3F18DF63; +fma.rn.f32 f2354, f2320, 0fBF4D57F2, f2353; +mul.f32 f2356, f2185, 0fBF248DBB; +mul.f32 f2596, f2179, 0f3F441B7D; +sub.f32 f2357, f2596, f2356; +mul.f32 f2358, f2185, 
0f3F441B7D; +fma.rn.f32 f2359, f2179, 0fBF248DBB, f2358; +mul.f32 f2361, f2301, 0fBF7C1C5C; +mul.f32 f2595, f2295, 0f3E31D0D4; +sub.f32 f2362, f2595, f2361; +mul.f32 f2363, f2301, 0f3E31D0D4; +fma.rn.f32 f2364, f2295, 0fBF7C1C5C, f2363; +mul.f32 f2366, f2201, 0fBF4D57F2; +mul.f32 f2594, f2195, 0f3F18DF63; +sub.f32 f2367, f2594, f2366; +mul.f32 f2368, f2201, 0f3F18DF63; +fma.rn.f32 f2369, f2195, 0fBF4D57F2, f2368; +mul.f32 f2371, f2317, 0fBF753ECD; +mul.f32 f2593, f2311, 0fBE92D7E0; +sub.f32 f2372, f2593, f2371; +mul.f32 f2373, f2317, 0fBE92D7E0; +fma.rn.f32 f2374, f2311, 0fBF753ECD, f2373; +mul.f32 f2591, f2211, 0f3ECACAF8; +mul.f32 f2592, f2217, 0fBF6B1036; +sub.f32 f2377, f2591, f2592; +mul.f32 f2378, f2217, 0f3ECACAF8; +fma.rn.f32 f2379, f2211, 0fBF6B1036, f2378; +mul.f32 f2589, f2327, 0fBF2FAD88; +mul.f32 f2590, f2333, 0fBF3A3529; +sub.f32 f2382, f2589, f2590; +mul.f32 f2383, f2333, 0fBF2FAD88; +fma.rn.f32 f2384, f2327, 0fBF3A3529, f2383; +mul.f32 f2587, f2180, 0f3E31D0D4; +mul.f32 f2588, f2186, 0fBF7C1C5C; +sub.f32 f2387, f2587, f2588; +mul.f32 f2388, f2186, 0f3E31D0D4; +fma.rn.f32 f2389, f2180, 0fBF7C1C5C, f2388; +mul.f32 f2585, f2296, 0fBF708FB2; +mul.f32 f2586, f2302, 0fBEAF1D44; +sub.f32 f2392, f2585, f2586; +mul.f32 f2393, f2302, 0fBF708FB2; +fma.rn.f32 f2394, f2296, 0fBEAF1D44, f2393; +mul.f32 f2396, f2202, 0fBF7F9120; +mul.f32 f2584, f2196, 0fBD6E2946; +sub.f32 f2397, f2584, f2396; +mul.f32 f2398, f2202, 0fBD6E2946; +fma.rn.f32 f2399, f2196, 0fBF7F9120, f2398; +mul.f32 f2401, f2318, 0f3DEDC21F; +mul.f32 f2583, f2312, 0fBF7E44DE; +sub.f32 f2402, f2583, f2401; +mul.f32 f2403, f2318, 0fBF7E44DE; +fma.rn.f32 f2404, f2312, 0f3DEDC21F, f2403; +mul.f32 f2406, f2218, 0fBF753ECD; +mul.f32 f2582, f2212, 0fBE92D7E0; +sub.f32 f2407, f2582, f2406; +mul.f32 f2408, f2218, 0fBE92D7E0; +fma.rn.f32 f2409, f2212, 0fBF753ECD, f2408; +mul.f32 f2411, f2334, 0f3F0CAC9F; +mul.f32 f2581, f2328, 0fBF55E287; +sub.f32 f2412, f2581, f2411; +mul.f32 f2413, f2334, 0fBF55E287; 
+fma.rn.f32 f2414, f2328, 0f3F0CAC9F, f2413; +add.f32 f2415, f2172, f2288; +mul.f32 f2417, f2415, 0f3F000000; +sub.f32 f2418, f2056, f2417; +add.f32 f2580, f2624, f2608; +sub.f32 f2419, f2624, f2608; +mul.f32 f2420, f2419, 0f3F5DB3D7; +mul.f32 f2421, f2580, 0f3F000000; +sub.f32 f2422, f2640, f2421; +sub.f32 f2423, f2172, f2288; +mul.f32 f2424, f2423, 0f3F5DB3D7; +add.f32 f2425, f2337, f2342; +mul.f32 f2427, f2425, 0f3F000000; +sub.f32 f2428, f2072, f2427; +add.f32 f2579, f2339, f2344; +sub.f32 f2429, f2339, f2344; +mul.f32 f2430, f2429, 0f3F5DB3D7; +mul.f32 f2431, f2579, 0f3F000000; +sub.f32 f2432, f2638, f2431; +sub.f32 f2433, f2337, f2342; +mul.f32 f2434, f2433, 0f3F5DB3D7; +add.f32 f2435, f2347, f2352; +mul.f32 f2437, f2435, 0f3F000000; +sub.f32 f2438, f2088, f2437; +add.f32 f2578, f2349, f2354; +sub.f32 f2439, f2349, f2354; +mul.f32 f2440, f2439, 0f3F5DB3D7; +mul.f32 f2441, f2578, 0f3F000000; +sub.f32 f2442, f2636, f2441; +sub.f32 f2443, f2347, f2352; +mul.f32 f2444, f2443, 0f3F5DB3D7; +add.f32 f2445, f2357, f2362; +mul.f32 f2447, f2445, 0f3F000000; +sub.f32 f2448, f2063, f2447; +add.f32 f2577, f2359, f2364; +sub.f32 f2449, f2359, f2364; +mul.f32 f2450, f2449, 0f3F5DB3D7; +mul.f32 f2451, f2577, 0f3F000000; +sub.f32 f2452, f2069, f2451; +sub.f32 f2453, f2357, f2362; +mul.f32 f2454, f2453, 0f3F5DB3D7; +add.f32 f2455, f2367, f2372; +mul.f32 f2457, f2455, 0f3F000000; +sub.f32 f2458, f2079, f2457; +add.f32 f2576, f2369, f2374; +sub.f32 f2459, f2369, f2374; +mul.f32 f2460, f2459, 0f3F5DB3D7; +mul.f32 f2461, f2576, 0f3F000000; +sub.f32 f2462, f2085, f2461; +sub.f32 f2463, f2367, f2372; +mul.f32 f2464, f2463, 0f3F5DB3D7; +add.f32 f2465, f2377, f2382; +mul.f32 f2467, f2465, 0f3F000000; +sub.f32 f2468, f2095, f2467; +add.f32 f2575, f2379, f2384; +sub.f32 f2469, f2379, f2384; +mul.f32 f2470, f2469, 0f3F5DB3D7; +mul.f32 f2471, f2575, 0f3F000000; +sub.f32 f2472, f2101, f2471; +sub.f32 f2473, f2377, f2382; +mul.f32 f2474, f2473, 0f3F5DB3D7; +add.f32 f2475, f2387, f2392; 
+mul.f32 f2477, f2475, 0f3F000000; +sub.f32 f2478, f2064, f2477; +add.f32 f2574, f2389, f2394; +sub.f32 f2479, f2389, f2394; +mul.f32 f2480, f2479, 0f3F5DB3D7; +mul.f32 f2481, f2574, 0f3F000000; +sub.f32 f2482, f2070, f2481; +sub.f32 f2483, f2387, f2392; +mul.f32 f2484, f2483, 0f3F5DB3D7; +add.f32 f2485, f2397, f2402; +mul.f32 f2487, f2485, 0f3F000000; +sub.f32 f2488, f2080, f2487; +add.f32 f2573, f2399, f2404; +sub.f32 f2489, f2399, f2404; +mul.f32 f2490, f2489, 0f3F5DB3D7; +mul.f32 f2491, f2573, 0f3F000000; +sub.f32 f2492, f2086, f2491; +sub.f32 f2493, f2397, f2402; +mul.f32 f2494, f2493, 0f3F5DB3D7; +add.f32 f2495, f2407, f2412; +mul.f32 f2497, f2495, 0f3F000000; +sub.f32 f2498, f2096, f2497; +add.f32 f2572, f2409, f2414; +sub.f32 f2499, f2409, f2414; +mul.f32 f2500, f2499, 0f3F5DB3D7; +mul.f32 f2501, f2572, 0f3F000000; +sub.f32 f2502, f2102, f2501; +sub.f32 f2503, f2407, f2412; +mul.f32 f2504, f2503, 0f3F5DB3D7; +add.f32 %1, f2640, f2580; +add.f32 %0, f2056, f2415; +add.f32 %3, f2638, f2579; +add.f32 %2, f2072, f2425; +add.f32 %5, f2636, f2578; +add.f32 %4, f2088, f2435; +add.f32 %7, f2069, f2577; +add.f32 %6, f2063, f2445; +add.f32 %9, f2085, f2576; +add.f32 %8, f2079, f2455; +add.f32 %11, f2101, f2575; +add.f32 %10, f2095, f2465; +add.f32 %13, f2070, f2574; +add.f32 %12, f2064, f2475; +add.f32 %15, f2086, f2573; +add.f32 %14, f2080, f2485; +add.f32 %17, f2102, f2572; +add.f32 %16, f2096, f2495; +add.f32 %18, f2420, f2418; +sub.f32 %19, f2422, f2424; +sub.f32 %21, f2432, f2434; +add.f32 %20, f2430, f2428; +sub.f32 %23, f2442, f2444; +add.f32 %22, f2440, f2438; +add.f32 %24, f2450, f2448; +sub.f32 %25, f2452, f2454; +add.f32 %26, f2460, f2458; +sub.f32 %27, f2462, f2464; +add.f32 %28, f2470, f2468; +sub.f32 %29, f2472, f2474; +add.f32 %30, f2480, f2478; +sub.f32 %31, f2482, f2484; +sub.f32 %33, f2492, f2494; +add.f32 %32, f2490, f2488; +sub.f32 %35, f2502, f2504; +add.f32 %34, f2500, f2498; +add.f32 %37, f2424, f2422; +sub.f32 %36, f2418, f2420; +add.f32 %39, 
f2434, f2432; +sub.f32 %38, f2428, f2430; +add.f32 %41, f2444, f2442; +sub.f32 %40, f2438, f2440; +add.f32 %43, f2454, f2452; +sub.f32 %42, f2448, f2450; +add.f32 %45, f2464, f2462; +sub.f32 %44, f2458, f2460; +add.f32 %47, f2474, f2472; +sub.f32 %46, f2468, f2470; +add.f32 %49, f2484, f2482; +sub.f32 %48, f2478, f2480; +add.f32 %51, f2494, f2492; +sub.f32 %50, f2488, f2490; +add.f32 %53, f2504, f2502; +sub.f32 %52, f2498, f2500; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_19683), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), 
"f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1154, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2937>; +.reg .b32 r<24>; +.reg .b64 rd<16>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 78732, r23; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2928, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2927, %58, f2928; +mul.f32 f119, f2928, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2926, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2925, %64, f2926; +mul.f32 f135, f2926, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2924, %88, %106; +sub.f32 f147, %88, 
%106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2923, %70, f2924; +mul.f32 f151, f2924, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2922, f133, 0f3F441B7D; +sub.f32 f159, f2922, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2920, f149, 0f3E31D0D4; +mul.f32 f2921, f155, 0fBF7C1C5C; +sub.f32 f164, f2920, f2921; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2918, f134, 0f3E31D0D4; +mul.f32 f2919, f140, 0fBF7C1C5C; +sub.f32 f169, f2918, f2919; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2916, f150, 0fBF708FB2; +mul.f32 f2917, f156, 0fBEAF1D44; +sub.f32 f174, f2916, f2917; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2915, f2925, f2923; +sub.f32 f183, f2925, f2923; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2914, f2927, f2915; +mul.f32 f187, f2915, 0f3F000000; +sub.f32 f188, f2927, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2913, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2912, f123, f2913; +mul.f32 f203, f2913, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; 
+sub.f32 f214, f118, f213; +add.f32 f2911, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2910, f124, f2911; +mul.f32 f219, f2911, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2907, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2905, %113, f2907; +mul.f32 f235, f2907, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2902, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2900, %116, f2902; +mul.f32 f251, f2902, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2897, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2895, %119, f2897; +mul.f32 f267, f2897, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2894, f249, 0f3F441B7D; +sub.f32 f275, f2894, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2893, f265, 0f3E31D0D4; +sub.f32 f280, f2893, 
f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f2891, f250, 0f3E31D0D4; +mul.f32 f2892, f256, 0fBF7C1C5C; +sub.f32 f285, f2891, f2892; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2889, f266, 0fBF708FB2; +mul.f32 f2890, f272, 0fBEAF1D44; +sub.f32 f290, f2889, f2890; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2888, f2900, f2895; +sub.f32 f299, f2900, f2895; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2887, f2905, f2888; +mul.f32 f303, f2888, 0f3F000000; +sub.f32 f304, f2905, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2886, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2885, f239, f2886; +mul.f32 f319, f2886, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2884, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2883, f240, f2884; +mul.f32 f335, f2884, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2880, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 
0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2878, %122, f2880; +mul.f32 f351, f2880, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2875, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2873, %125, f2875; +mul.f32 f367, f2875, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2871, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2869, %127, f2871; +mul.f32 f383, f2871, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2868, f365, 0f3F441B7D; +sub.f32 f391, f2868, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2867, f381, 0f3E31D0D4; +sub.f32 f396, f2867, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2865, f366, 0f3E31D0D4; +mul.f32 f2866, f372, 0fBF7C1C5C; +sub.f32 f401, f2865, f2866; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2863, f382, 0fBF708FB2; +mul.f32 f2864, f388, 0fBEAF1D44; +sub.f32 f406, f2863, f2864; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2862, 
f2873, f2869; +sub.f32 f415, f2873, f2869; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2861, f2878, f2862; +mul.f32 f419, f2862, 0f3F000000; +sub.f32 f420, f2878, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2860, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2859, f355, f2860; +mul.f32 f435, f2860, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2858, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2857, f356, f2858; +mul.f32 f451, f2858, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2885, 0fBE6C2691; +mul.f32 f2856, f310, 0f3F791978; +sub.f32 f459, f2856, f458; +mul.f32 f460, f2885, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2854, f426, 0f3F64C51C; +mul.f32 f2855, f2859, 0fBEE5C902; +sub.f32 f464, f2854, f2855; +mul.f32 f465, f2859, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2852, f326, 0f3F64C51C; +mul.f32 f2853, f2883, 0fBEE5C902; +sub.f32 f469, f2852, f2853; +mul.f32 f470, f2883, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2850, f442, 0f3F18DF63; +mul.f32 f2851, f2857, 0fBF4D57F2; +sub.f32 f474, f2850, f2851; +mul.f32 f475, f2857, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2848, f301, 0f3F441B7D; +mul.f32 f2849, 
f307, 0fBF248DBB; +sub.f32 f479, f2848, f2849; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2847, f417, 0f3E31D0D4; +sub.f32 f484, f2847, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2846, f317, 0f3F18DF63; +sub.f32 f489, f2846, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2845, f433, 0fBE92D7E0; +sub.f32 f494, f2845, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2844, f333, 0f3ECACAF8; +sub.f32 f499, f2844, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2843, f449, 0fBF2FAD88; +sub.f32 f504, f2843, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2842, f302, 0f3E31D0D4; +sub.f32 f509, f2842, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2840, f418, 0fBF708FB2; +mul.f32 f2841, f424, 0fBEAF1D44; +sub.f32 f514, f2840, f2841; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2838, f318, 0fBD6E2946; +mul.f32 f2839, f324, 0fBF7F9120; +sub.f32 f519, f2838, f2839; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2836, f434, 0fBF7E44DE; +mul.f32 f2837, f440, 0f3DEDC21F; +sub.f32 f524, f2836, f2837; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2835, f334, 0fBE92D7E0; +sub.f32 f529, f2835, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2834, f450, 0fBF55E287; +sub.f32 f534, f2834, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; 
+add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2833, f2887, f2861; +sub.f32 f543, f2887, f2861; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2832, f2914, f2833; +mul.f32 f547, f2833, 0f3F000000; +sub.f32 f548, f2914, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2831, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2830, f2912, f2831; +mul.f32 f563, f2831, 0f3F000000; +sub.f32 f564, f2912, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f2829, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2828, f2910, f2829; +mul.f32 f579, f2829, 0f3F000000; +sub.f32 f580, f2910, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2827, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2826, f191, f2827; +mul.f32 f595, f2827, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2825, f491, f496; 
+sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2824, f207, f2825; +mul.f32 f611, f2825, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2823, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2822, f223, f2823; +mul.f32 f627, f2823, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2821, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f2820, f192, f2821; +mul.f32 f643, f2821, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2819, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2818, f208, f2819; +mul.f32 f659, f2819, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2817, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2816, f224, f2817; +mul.f32 f675, 
f2817, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f686, f682, f2830; +mul.f32 f2815, f681, f554; +sub.f32 f687, f2815, f686; +mul.f32 f688, f681, f2830; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f691, f682, f682; +mul.f32 f2814, f681, f681; +sub.f32 f692, f2814, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f696, f694, f2828; +mul.f32 f2813, f692, f570; +sub.f32 f697, f2813, f696; +mul.f32 f698, f692, f2828; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f2812, f681, f692; +sub.f32 f702, f2812, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f2826; +mul.f32 f2811, f702, f586; +sub.f32 f707, f2811, f706; +mul.f32 f708, f702, f2826; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f2809, f681, f702; +mul.f32 f2810, f682, f704; +sub.f32 f712, f2809, f2810; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f2807, f712, f602; +mul.f32 f2808, f714, f2824; +sub.f32 f717, f2807, f2808; +mul.f32 f718, f712, f2824; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f2805, f681, f712; +mul.f32 f2806, f682, f714; +sub.f32 f722, f2805, f2806; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f2803, f722, f618; +mul.f32 f2804, f724, f2822; +sub.f32 f727, f2803, f2804; +mul.f32 f728, f722, f2822; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f731, f682, f724; +mul.f32 f2802, f681, f722; +sub.f32 f732, f2802, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f736, f734, f2820; +mul.f32 f2801, f732, f634; +sub.f32 f737, f2801, f736; +mul.f32 f738, 
f732, f2820; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f741, f682, f734; +mul.f32 f2800, f681, f732; +sub.f32 f742, f2800, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f2818; +mul.f32 f2799, f742, f650; +sub.f32 f747, f2799, f746; +mul.f32 f748, f742, f2818; +fma.rn.f32 f749, f744, f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f2798, f681, f742; +sub.f32 f752, f2798, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f2816; +mul.f32 f2797, f752, f666; +sub.f32 f757, f2797, f756; +mul.f32 f758, f752, f2816; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f2795, f681, f752; +mul.f32 f2796, f682, f754; +sub.f32 f762, f2795, f2796; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f2793, f762, f545; +mul.f32 f2794, f764, f551; +sub.f32 f767, f2793, f2794; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f2791, f681, f762; +mul.f32 f2792, f682, f764; +sub.f32 f772, f2791, f2792; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f776, f774, f567; +mul.f32 f2790, f772, f561; +sub.f32 f777, f2790, f776; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f781, f682, f774; +mul.f32 f2789, f681, f772; +sub.f32 f782, f2789, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f786, f784, f583; +mul.f32 f2788, f782, f577; +sub.f32 f787, f2788, f786; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 f2787, f681, f782; +sub.f32 f792, f2787, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f2786, f792, f593; +sub.f32 f797, f2786, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f2785, f681, f792; +sub.f32 f802, f2785, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f2783, f802, f609; +mul.f32 
f2784, f804, f615; +sub.f32 f807, f2783, f2784; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f2781, f681, f802; +mul.f32 f2782, f682, f804; +sub.f32 f812, f2781, f2782; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f2779, f812, f625; +mul.f32 f2780, f814, f631; +sub.f32 f817, f2779, f2780; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f2777, f681, f812; +mul.f32 f2778, f682, f814; +sub.f32 f822, f2777, f2778; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f826, f824, f647; +mul.f32 f2776, f822, f641; +sub.f32 f827, f2776, f826; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f831, f682, f824; +mul.f32 f2775, f681, f822; +sub.f32 f832, f2775, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f2774, f832, f657; +sub.f32 f837, f2774, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f2773, f681, f832; +sub.f32 f842, f2773, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f2772, f842, f673; +sub.f32 f847, f2772, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f2770, f681, f842; +mul.f32 f2771, f682, f844; +sub.f32 f852, f2770, f2771; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f2768, f852, f546; +mul.f32 f2769, f854, f552; +sub.f32 f857, f2768, f2769; +mul.f32 f858, f852, f552; +fma.rn.f32 f859, f854, f546, f858; +mul.f32 f2766, f681, f852; +mul.f32 f2767, f682, f854; +sub.f32 f862, f2766, f2767; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f2764, f862, f562; +mul.f32 f2765, f864, f568; +sub.f32 f867, f2764, f2765; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, f562, f868; +mul.f32 f871, f682, f864; +mul.f32 f2763, f681, f862; +sub.f32 f872, f2763, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 
f874, f682, f862, f873; +mul.f32 f876, f874, f584; +mul.f32 f2762, f872, f578; +sub.f32 f877, f2762, f876; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f881, f682, f874; +mul.f32 f2761, f681, f872; +sub.f32 f882, f2761, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, f600; +mul.f32 f2760, f882, f594; +sub.f32 f887, f2760, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f2759, f681, f882; +sub.f32 f892, f2759, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f2757, f892, f610; +mul.f32 f2758, f894, f616; +sub.f32 f897, f2757, f2758; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f2755, f681, f892; +mul.f32 f2756, f682, f894; +sub.f32 f902, f2755, f2756; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f2753, f902, f626; +mul.f32 f2754, f904, f632; +sub.f32 f907, f2753, f2754; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, f908; +mul.f32 f2751, f681, f902; +mul.f32 f2752, f682, f904; +sub.f32 f912, f2751, f2752; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f916, f914, f648; +mul.f32 f2750, f912, f642; +sub.f32 f917, f2750, f916; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f921, f682, f914; +mul.f32 f2749, f681, f912; +sub.f32 f922, f2749, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f926, f924, f664; +mul.f32 f2748, f922, f658; +sub.f32 f927, f2748, f926; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f2747, f681, f922; +sub.f32 f932, f2747, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f2746, f932, f674; +sub.f32 f937, f2746, f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 r8, r5, 78732, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 
108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f687; +st.shared.f32 [r9+8], f697; +st.shared.f32 [r9+12], f707; +st.shared.f32 [r9+16], f717; +st.shared.f32 [r9+20], f727; +st.shared.f32 [r9+24], f737; +st.shared.f32 [r9+28], f747; +st.shared.f32 [r9+32], f757; +st.shared.f32 [r9+36], f767; +st.shared.f32 [r9+40], f777; +st.shared.f32 [r9+44], f787; +st.shared.f32 [r9+48], f797; +st.shared.f32 [r9+52], f807; +st.shared.f32 [r9+56], f817; +st.shared.f32 [r9+60], f827; +st.shared.f32 [r9+64], f837; +st.shared.f32 [r9+68], f847; +st.shared.f32 [r9+72], f857; +st.shared.f32 [r9+76], f867; +st.shared.f32 [r9+80], f877; +st.shared.f32 [r9+84], f887; +st.shared.f32 [r9+88], f897; +st.shared.f32 [r9+92], f907; +st.shared.f32 [r9+96], f917; +st.shared.f32 [r9+100], f927; +st.shared.f32 [r9+104], f937; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+2916]; +ld.shared.f32 f942, [r10+5832]; +ld.shared.f32 f943, [r10+8748]; +ld.shared.f32 f944, [r10+11664]; +ld.shared.f32 f945, [r10+14580]; +ld.shared.f32 f946, [r10+17496]; +ld.shared.f32 f947, [r10+20412]; +ld.shared.f32 f948, [r10+23328]; +ld.shared.f32 f949, [r10+26244]; +ld.shared.f32 f950, [r10+29160]; +ld.shared.f32 f951, [r10+32076]; +ld.shared.f32 f952, [r10+34992]; +ld.shared.f32 f953, [r10+37908]; +ld.shared.f32 f954, [r10+40824]; +ld.shared.f32 f955, [r10+43740]; +ld.shared.f32 f956, [r10+46656]; +ld.shared.f32 f957, [r10+49572]; +ld.shared.f32 f958, [r10+52488]; +ld.shared.f32 f959, [r10+55404]; +ld.shared.f32 f960, [r10+58320]; +ld.shared.f32 f961, [r10+61236]; +ld.shared.f32 f962, [r10+64152]; +ld.shared.f32 f963, [r10+67068]; +ld.shared.f32 f964, [r10+69984]; +ld.shared.f32 f965, [r10+72900]; +ld.shared.f32 f966, [r10+75816]; +barrier.sync 0; +st.shared.f32 [r9], f2832; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; 
+st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +ld.shared.f32 f2745, [r10+26244]; +ld.shared.f32 f2744, [r10+52488]; +add.f32 f2743, f2745, f2744; +sub.f32 f1000, f2745, f2744; +mul.f32 f1001, f1000, 0f3F5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2742, [r10]; +add.f32 f2741, f2742, f2743; +mul.f32 f1004, f2743, 0f3F000000; +sub.f32 f1005, f2742, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0f3F5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2740, [r10+34992]; +ld.shared.f32 f2739, [r10+61236]; +add.f32 f2738, f2740, f2739; +sub.f32 f1016, f2740, f2739; +mul.f32 f1017, f1016, 0f3F5DB3D7; +ld.shared.f32 f2737, [r10+8748]; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f2736, f2737, f2738; +mul.f32 f1020, f2738, 0f3F000000; +sub.f32 f1021, f2737, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0f3F5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2735, [r10+43740]; +sub.f32 f1031, f946, f1030; +ld.shared.f32 f2734, [r10+69984]; 
+add.f32 f2733, f2735, f2734; +sub.f32 f1032, f2735, f2734; +mul.f32 f1033, f1032, 0f3F5DB3D7; +ld.shared.f32 f2732, [r10+17496]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2731, f2732, f2733; +mul.f32 f1036, f2733, 0f3F000000; +sub.f32 f1037, f2732, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0f3F5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2729, f1018, 0f3F441B7D; +mul.f32 f2730, f1024, 0fBF248DBB; +sub.f32 f1044, f2729, f2730; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0fBF248DBB, f1045; +mul.f32 f1048, f1040, 0fBF7C1C5C; +mul.f32 f2728, f1034, 0f3E31D0D4; +sub.f32 f1049, f2728, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0fBF7C1C5C, f1050; +mul.f32 f1053, f1025, 0fBF7C1C5C; +mul.f32 f2727, f1019, 0f3E31D0D4; +sub.f32 f1054, f2727, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0fBF7C1C5C, f1055; +mul.f32 f1058, f1041, 0fBEAF1D44; +mul.f32 f2726, f1035, 0fBF708FB2; +sub.f32 f1059, f2726, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0fBEAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2725, f2736, f2731; +sub.f32 f1068, f2736, f2731; +mul.f32 f1069, f1068, 0f3F5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2724, f2741, f2725; +mul.f32 f1072, f2725, 0f3F000000; +sub.f32 f1073, f2741, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0f3F5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2723, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0f3F5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2722, f1008, f2723; +mul.f32 f1088, f2723, 0f3F000000; +sub.f32 f1089, f1008, 
f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0f3F5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2721, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0f3F5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2720, f1009, f2721; +mul.f32 f1104, f2721, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0f3F5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +ld.shared.f32 f2719, [r10+55404]; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2718, [r10+29160]; +add.f32 f2717, f2718, f2719; +sub.f32 f1116, f2718, f2719; +mul.f32 f1117, f1116, 0f3F5DB3D7; +ld.shared.f32 f2716, [r10+2916]; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +add.f32 f2715, f2716, f2717; +mul.f32 f1120, f2717, 0f3F000000; +sub.f32 f1121, f2716, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0f3F5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +ld.shared.f32 f2714, [r10+64152]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2713, [r10+37908]; +add.f32 f2712, f2713, f2714; +sub.f32 f1132, f2713, f2714; +mul.f32 f1133, f1132, 0f3F5DB3D7; +ld.shared.f32 f2711, [r10+11664]; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +add.f32 f2710, f2711, f2712; +mul.f32 f1136, f2712, 0f3F000000; +sub.f32 f1137, f2711, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0f3F5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2709, [r10+46656]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 
f2708, [r10+72900]; +add.f32 f2707, f2709, f2708; +sub.f32 f1148, f2709, f2708; +mul.f32 f1149, f1148, 0f3F5DB3D7; +ld.shared.f32 f2706, [r10+20412]; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +add.f32 f2705, f2706, f2707; +mul.f32 f1152, f2707, 0f3F000000; +sub.f32 f1153, f2706, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0f3F5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2703, f1134, 0f3F441B7D; +mul.f32 f2704, f1140, 0fBF248DBB; +sub.f32 f1160, f2703, f2704; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0fBF248DBB, f1161; +mul.f32 f2701, f1150, 0f3E31D0D4; +mul.f32 f2702, f1156, 0fBF7C1C5C; +sub.f32 f1165, f2701, f2702; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0fBF7C1C5C, f1166; +mul.f32 f1169, f1141, 0fBF7C1C5C; +mul.f32 f2700, f1135, 0f3E31D0D4; +sub.f32 f1170, f2700, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0fBF7C1C5C, f1171; +mul.f32 f1174, f1157, 0fBEAF1D44; +mul.f32 f2699, f1151, 0fBF708FB2; +sub.f32 f1175, f2699, f1174; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0fBEAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2698, f2710, f2705; +sub.f32 f1184, f2710, f2705; +mul.f32 f1185, f1184, 0f3F5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2697, f2715, f2698; +mul.f32 f1188, f2698, 0f3F000000; +sub.f32 f1189, f2715, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0f3F5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2696, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0f3F5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2695, f1124, f2696; +mul.f32 f1204, f2696, 0f3F000000; 
+sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0f3F5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2694, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0f3F5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2693, f1125, f2694; +mul.f32 f1220, f2694, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0f3F5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2692, [r10+32076]; +ld.shared.f32 f2691, [r10+58320]; +sub.f32 f1231, f942, f1230; +add.f32 f2690, f2692, f2691; +sub.f32 f1232, f2692, f2691; +mul.f32 f1233, f1232, 0f3F5DB3D7; +ld.shared.f32 f2689, [r10+5832]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f2688, f2689, f2690; +mul.f32 f1236, f2690, 0f3F000000; +sub.f32 f1237, f2689, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0f3F5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +ld.shared.f32 f2687, [r10+40824]; +ld.shared.f32 f2686, [r10+67068]; +sub.f32 f1247, f945, f1246; +add.f32 f2685, f2687, f2686; +sub.f32 f1248, f2687, f2686; +mul.f32 f1249, f1248, 0f3F5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2684, [r10+14580]; +add.f32 f2683, f2684, f2685; +mul.f32 f1252, f2685, 0f3F000000; +sub.f32 f1253, f2684, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0f3F5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2682, 
[r10+75816]; +ld.shared.f32 f2681, [r10+49572]; +add.f32 f2680, f2681, f2682; +sub.f32 f1264, f2681, f2682; +mul.f32 f1265, f1264, 0f3F5DB3D7; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +ld.shared.f32 f2679, [r10+23328]; +add.f32 f2678, f2679, f2680; +mul.f32 f1268, f2680, 0f3F000000; +sub.f32 f1269, f2679, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0f3F5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2676, f1250, 0f3F441B7D; +mul.f32 f2677, f1256, 0fBF248DBB; +sub.f32 f1276, f2676, f2677; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0fBF248DBB, f1277; +mul.f32 f2674, f1266, 0f3E31D0D4; +mul.f32 f2675, f1272, 0fBF7C1C5C; +sub.f32 f1281, f2674, f2675; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0fBF7C1C5C, f1282; +mul.f32 f1285, f1257, 0fBF7C1C5C; +mul.f32 f2673, f1251, 0f3E31D0D4; +sub.f32 f1286, f2673, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0fBF7C1C5C, f1287; +mul.f32 f1290, f1273, 0fBEAF1D44; +mul.f32 f2672, f1267, 0fBF708FB2; +sub.f32 f1291, f2672, f1290; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0fBEAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2671, f2683, f2678; +sub.f32 f1300, f2683, f2678; +mul.f32 f1301, f1300, 0f3F5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2670, f2688, f2671; +mul.f32 f1304, f2671, 0f3F000000; +sub.f32 f1305, f2688, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0f3F5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2669, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0f3F5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2668, f1240, f2669; +mul.f32 
f1320, f2669, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0f3F5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2667, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0f3F5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2666, f1241, f2667; +mul.f32 f1336, f2667, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0f3F5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2695, 0fBE6C2691; +mul.f32 f2665, f1195, 0f3F791978; +sub.f32 f1344, f2665, f1343; +mul.f32 f1345, f2695, 0f3F791978; +fma.rn.f32 f1346, f1195, 0fBE6C2691, f1345; +mul.f32 f2663, f1311, 0f3F64C51C; +mul.f32 f2664, f2668, 0fBEE5C902; +sub.f32 f1349, f2663, f2664; +mul.f32 f1350, f2668, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0fBEE5C902, f1350; +mul.f32 f2661, f1211, 0f3F64C51C; +mul.f32 f2662, f2693, 0fBEE5C902; +sub.f32 f1354, f2661, f2662; +mul.f32 f1355, f2693, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0fBEE5C902, f1355; +mul.f32 f2659, f1327, 0f3F18DF63; +mul.f32 f2660, f2666, 0fBF4D57F2; +sub.f32 f1359, f2659, f2660; +mul.f32 f1360, f2666, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0fBF4D57F2, f1360; +mul.f32 f2657, f1186, 0f3F441B7D; +mul.f32 f2658, f1192, 0fBF248DBB; +sub.f32 f1364, f2657, f2658; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0fBF248DBB, f1365; +mul.f32 f1368, f1308, 0fBF7C1C5C; +mul.f32 f2656, f1302, 0f3E31D0D4; +sub.f32 f1369, f2656, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0fBF7C1C5C, f1370; +mul.f32 f1373, f1208, 0fBF4D57F2; +mul.f32 f2655, f1202, 0f3F18DF63; +sub.f32 f1374, f2655, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0fBF4D57F2, f1375; +mul.f32 f1378, f1324, 0fBF753ECD; +mul.f32 f2654, f1318, 
0fBE92D7E0; +sub.f32 f1379, f2654, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0fBF753ECD, f1380; +mul.f32 f1383, f1224, 0fBF6B1036; +mul.f32 f2653, f1218, 0f3ECACAF8; +sub.f32 f1384, f2653, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0fBF6B1036, f1385; +mul.f32 f1388, f1340, 0fBF3A3529; +mul.f32 f2652, f1334, 0fBF2FAD88; +sub.f32 f1389, f2652, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0fBF3A3529, f1390; +mul.f32 f1393, f1193, 0fBF7C1C5C; +mul.f32 f2651, f1187, 0f3E31D0D4; +sub.f32 f1394, f2651, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0fBF7C1C5C, f1395; +mul.f32 f2649, f1303, 0fBF708FB2; +mul.f32 f2650, f1309, 0fBEAF1D44; +sub.f32 f1399, f2649, f2650; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0fBEAF1D44, f1400; +mul.f32 f2647, f1203, 0fBD6E2946; +mul.f32 f2648, f1209, 0fBF7F9120; +sub.f32 f1404, f2647, f2648; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0fBF7F9120, f1405; +mul.f32 f2645, f1319, 0fBF7E44DE; +mul.f32 f2646, f1325, 0f3DEDC21F; +sub.f32 f1409, f2645, f2646; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0f3DEDC21F, f1410; +mul.f32 f1413, f1225, 0fBF753ECD; +mul.f32 f2644, f1219, 0fBE92D7E0; +sub.f32 f1414, f2644, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0fBF753ECD, f1415; +mul.f32 f1418, f1341, 0f3F0CAC9F; +mul.f32 f2643, f1335, 0fBF55E287; +sub.f32 f1419, f2643, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0f3F0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2642, f2697, f2670; +sub.f32 f1428, f2697, f2670; +mul.f32 f1429, f1428, 0f3F5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2641, f2724, f2642; +mul.f32 f1432, f2642, 0f3F000000; +sub.f32 f1433, f2724, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 
0f3F5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2640, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0f3F5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2639, f2722, f2640; +mul.f32 f1448, f2640, 0f3F000000; +sub.f32 f1449, f2722, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0f3F5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2638, f1356, f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0f3F5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2637, f2720, f2638; +mul.f32 f1464, f2638, 0f3F000000; +sub.f32 f1465, f2720, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0f3F5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; +add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2636, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0f3F5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, f1477; +add.f32 f2635, f1076, f2636; +mul.f32 f1480, f2636, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0f3F5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; +add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2634, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0f3F5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2633, f1092, f2634; +mul.f32 f1496, f2634, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 
0f3F5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2632, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0f3F5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2631, f1108, f2632; +mul.f32 f1512, f2632, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0f3F5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2630, f1396, f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0f3F5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2629, f1077, f2630; +mul.f32 f1528, f2630, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0f3F5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; +add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2628, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0f3F5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, f1541; +add.f32 f2627, f1093, f2628; +mul.f32 f1544, f2628, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0f3F5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; +add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2626, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0f3F5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2625, f1109, f2626; +mul.f32 f1560, f2626, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 
0f3F5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f2623, f1566, f1439; +mul.f32 f2624, f1567, f2639; +sub.f32 f1572, f2623, f2624; +mul.f32 f1573, f1566, f2639; +fma.rn.f32 f1574, f1567, f1439, f1573; +mul.f32 f2621, f1566, f1566; +mul.f32 f2622, f1567, f1567; +sub.f32 f1577, f2621, f2622; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, f1566, f1578; +mul.f32 f2619, f1577, f1455; +mul.f32 f2620, f1579, f2637; +sub.f32 f1582, f2619, f2620; +mul.f32 f1583, f1577, f2637; +fma.rn.f32 f1584, f1579, f1455, f1583; +mul.f32 f1586, f1567, f1579; +mul.f32 f2618, f1566, f1577; +sub.f32 f1587, f2618, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; +mul.f32 f1591, f1589, f2635; +mul.f32 f2617, f1587, f1471; +sub.f32 f1592, f2617, f1591; +mul.f32 f1593, f1587, f2635; +fma.rn.f32 f1594, f1589, f1471, f1593; +mul.f32 f1596, f1567, f1589; +mul.f32 f2616, f1566, f1587; +sub.f32 f1597, f2616, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 f1599, f1567, f1587, f1598; +mul.f32 f1601, f1599, f2633; +mul.f32 f2615, f1597, f1487; +sub.f32 f1602, f2615, f1601; +mul.f32 f1603, f1597, f2633; +fma.rn.f32 f1604, f1599, f1487, f1603; +mul.f32 f1606, f1567, f1599; +mul.f32 f2614, f1566, f1597; +sub.f32 f1607, f2614, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f2612, f1607, f1503; +mul.f32 f2613, f1609, f2631; +sub.f32 f1612, f2612, f2613; +mul.f32 f1613, f1607, f2631; +fma.rn.f32 f1614, f1609, f1503, f1613; +mul.f32 f2610, f1566, f1607; +mul.f32 f2611, f1567, f1609; +sub.f32 f1617, f2610, f2611; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, 
f1607, f1618; +mul.f32 f2608, f1617, f1519; +mul.f32 f2609, f1619, f2629; +sub.f32 f1622, f2608, f2609; +mul.f32 f1623, f1617, f2629; +fma.rn.f32 f1624, f1619, f1519, f1623; +mul.f32 f2606, f1566, f1617; +mul.f32 f2607, f1567, f1619; +sub.f32 f1627, f2606, f2607; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1631, f1629, f2627; +mul.f32 f2605, f1627, f1535; +sub.f32 f1632, f2605, f1631; +mul.f32 f1633, f1627, f2627; +fma.rn.f32 f1634, f1629, f1535, f1633; +mul.f32 f1636, f1567, f1629; +mul.f32 f2604, f1566, f1627; +sub.f32 f1637, f2604, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1641, f1639, f2625; +mul.f32 f2603, f1637, f1551; +sub.f32 f1642, f2603, f1641; +mul.f32 f1643, f1637, f2625; +fma.rn.f32 f1644, f1639, f1551, f1643; +mul.f32 f1646, f1567, f1639; +mul.f32 f2602, f1566, f1637; +sub.f32 f1647, f2602, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1651, f1649, f1436; +mul.f32 f2601, f1647, f1430; +sub.f32 f1652, f2601, f1651; +mul.f32 f1653, f1647, f1436; +fma.rn.f32 f1654, f1649, f1430, f1653; +mul.f32 f2599, f1566, f1647; +mul.f32 f2600, f1567, f1649; +sub.f32 f1657, f2599, f2600; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f2597, f1657, f1446; +mul.f32 f2598, f1659, f1452; +sub.f32 f1662, f2597, f2598; +mul.f32 f1663, f1657, f1452; +fma.rn.f32 f1664, f1659, f1446, f1663; +mul.f32 f2595, f1566, f1657; +mul.f32 f2596, f1567, f1659; +sub.f32 f1667, f2595, f2596; +mul.f32 f1668, f1566, f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f2593, f1667, f1462; +mul.f32 f2594, f1669, f1468; +sub.f32 f1672, f2593, f2594; +mul.f32 f1673, f1667, f1468; +fma.rn.f32 f1674, f1669, f1462, f1673; +mul.f32 f1676, f1567, f1669; +mul.f32 f2592, f1566, f1667; +sub.f32 f1677, f2592, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1681, f1679, f1484; +mul.f32 f2591, f1677, f1478; 
+sub.f32 f1682, f2591, f1681; +mul.f32 f1683, f1677, f1484; +fma.rn.f32 f1684, f1679, f1478, f1683; +mul.f32 f1686, f1567, f1679; +mul.f32 f2590, f1566, f1677; +sub.f32 f1687, f2590, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1691, f1689, f1500; +mul.f32 f2589, f1687, f1494; +sub.f32 f1692, f2589, f1691; +mul.f32 f1693, f1687, f1500; +fma.rn.f32 f1694, f1689, f1494, f1693; +mul.f32 f1696, f1567, f1689; +mul.f32 f2588, f1566, f1687; +sub.f32 f1697, f2588, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1701, f1699, f1516; +mul.f32 f2587, f1697, f1510; +sub.f32 f1702, f2587, f1701; +mul.f32 f1703, f1697, f1516; +fma.rn.f32 f1704, f1699, f1510, f1703; +mul.f32 f2585, f1566, f1697; +mul.f32 f2586, f1567, f1699; +sub.f32 f1707, f2585, f2586; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f2583, f1707, f1526; +mul.f32 f2584, f1709, f1532; +sub.f32 f1712, f2583, f2584; +mul.f32 f1713, f1707, f1532; +fma.rn.f32 f1714, f1709, f1526, f1713; +mul.f32 f2581, f1566, f1707; +mul.f32 f2582, f1567, f1709; +sub.f32 f1717, f2581, f2582; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1721, f1719, f1548; +mul.f32 f2580, f1717, f1542; +sub.f32 f1722, f2580, f1721; +mul.f32 f1723, f1717, f1548; +fma.rn.f32 f1724, f1719, f1542, f1723; +mul.f32 f1726, f1567, f1719; +mul.f32 f2579, f1566, f1717; +sub.f32 f1727, f2579, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1731, f1729, f1564; +mul.f32 f2578, f1727, f1558; +sub.f32 f1732, f2578, f1731; +mul.f32 f1733, f1727, f1564; +fma.rn.f32 f1734, f1729, f1558, f1733; +mul.f32 f1736, f1567, f1729; +mul.f32 f2577, f1566, f1727; +sub.f32 f1737, f2577, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1741, f1739, f1437; +mul.f32 f2576, f1737, f1431; +sub.f32 f1742, f2576, f1741; +mul.f32 f1743, f1737, f1437; +fma.rn.f32 f1744, 
f1739, f1431, f1743; +mul.f32 f1746, f1567, f1739; +mul.f32 f2575, f1566, f1737; +sub.f32 f1747, f2575, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f2573, f1747, f1447; +mul.f32 f2574, f1749, f1453; +sub.f32 f1752, f2573, f2574; +mul.f32 f1753, f1747, f1453; +fma.rn.f32 f1754, f1749, f1447, f1753; +mul.f32 f2571, f1566, f1747; +mul.f32 f2572, f1567, f1749; +sub.f32 f1757, f2571, f2572; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f2569, f1757, f1463; +mul.f32 f2570, f1759, f1469; +sub.f32 f1762, f2569, f2570; +mul.f32 f1763, f1757, f1469; +fma.rn.f32 f1764, f1759, f1463, f1763; +mul.f32 f2567, f1566, f1757; +mul.f32 f2568, f1567, f1759; +sub.f32 f1767, f2567, f2568; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1771, f1769, f1485; +mul.f32 f2566, f1767, f1479; +sub.f32 f1772, f2566, f1771; +mul.f32 f1773, f1767, f1485; +fma.rn.f32 f1774, f1769, f1479, f1773; +mul.f32 f1776, f1567, f1769; +mul.f32 f2565, f1566, f1767; +sub.f32 f1777, f2565, f1776; +mul.f32 f1778, f1566, f1769; +fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1781, f1779, f1501; +mul.f32 f2564, f1777, f1495; +sub.f32 f1782, f2564, f1781; +mul.f32 f1783, f1777, f1501; +fma.rn.f32 f1784, f1779, f1495, f1783; +mul.f32 f1786, f1567, f1779; +mul.f32 f2563, f1566, f1777; +sub.f32 f1787, f2563, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1791, f1789, f1517; +mul.f32 f2562, f1787, f1511; +sub.f32 f1792, f2562, f1791; +mul.f32 f1793, f1787, f1517; +fma.rn.f32 f1794, f1789, f1511, f1793; +mul.f32 f2560, f1566, f1787; +mul.f32 f2561, f1567, f1789; +sub.f32 f1797, f2560, f2561; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f2558, f1797, f1527; +mul.f32 f2559, f1799, f1533; +sub.f32 f1802, f2558, f2559; +mul.f32 f1803, f1797, f1533; +fma.rn.f32 f1804, f1799, f1527, f1803; +mul.f32 f2556, f1566, f1797; +mul.f32 f2557, f1567, 
f1799; +sub.f32 f1807, f2556, f2557; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f2554, f1807, f1543; +mul.f32 f2555, f1809, f1549; +sub.f32 f1812, f2554, f2555; +mul.f32 f1813, f1807, f1549; +fma.rn.f32 f1814, f1809, f1543, f1813; +mul.f32 f1816, f1567, f1809; +mul.f32 f2553, f1566, f1807; +sub.f32 f1817, f2553, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1821, f1819, f1565; +mul.f32 f2552, f1817, f1559; +sub.f32 f1822, f2552, f1821; +mul.f32 f1823, f1817, f1565; +fma.rn.f32 f1824, f1819, f1559, f1823; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; +st.shared.f32 [r20+108], f1572; +st.shared.f32 [r20+216], f1582; +st.shared.f32 [r20+324], f1592; +st.shared.f32 [r20+432], f1602; +st.shared.f32 [r20+540], f1612; +st.shared.f32 [r20+648], f1622; +st.shared.f32 [r20+756], f1632; +st.shared.f32 [r20+864], f1642; +st.shared.f32 [r20+972], f1652; +st.shared.f32 [r20+1080], f1662; +st.shared.f32 [r20+1188], f1672; +st.shared.f32 [r20+1296], f1682; +st.shared.f32 [r20+1404], f1692; +st.shared.f32 [r20+1512], f1702; +st.shared.f32 [r20+1620], f1712; +st.shared.f32 [r20+1728], f1722; +st.shared.f32 [r20+1836], f1732; +st.shared.f32 [r20+1944], f1742; +st.shared.f32 [r20+2052], f1752; +st.shared.f32 [r20+2160], f1762; +st.shared.f32 [r20+2268], f1772; +st.shared.f32 [r20+2376], f1782; +st.shared.f32 [r20+2484], f1792; +st.shared.f32 [r20+2592], f1802; +st.shared.f32 [r20+2700], f1812; +st.shared.f32 [r20+2808], f1822; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+2916]; +ld.shared.f32 f1827, [r10+5832]; +ld.shared.f32 f1828, [r10+8748]; +ld.shared.f32 f1829, [r10+11664]; +ld.shared.f32 f1830, [r10+14580]; +ld.shared.f32 f1831, [r10+17496]; +ld.shared.f32 f1832, [r10+20412]; +ld.shared.f32 f1833, [r10+23328]; +ld.shared.f32 f1834, [r10+26244]; +ld.shared.f32 f1835, [r10+29160]; +ld.shared.f32 
f1836, [r10+32076]; +ld.shared.f32 f1837, [r10+34992]; +ld.shared.f32 f1838, [r10+37908]; +ld.shared.f32 f1839, [r10+40824]; +ld.shared.f32 f1840, [r10+43740]; +ld.shared.f32 f1841, [r10+46656]; +ld.shared.f32 f1842, [r10+49572]; +ld.shared.f32 f1843, [r10+52488]; +ld.shared.f32 f1844, [r10+55404]; +ld.shared.f32 f1845, [r10+58320]; +ld.shared.f32 f1846, [r10+61236]; +ld.shared.f32 f1847, [r10+64152]; +ld.shared.f32 f1848, [r10+67068]; +ld.shared.f32 f1849, [r10+69984]; +ld.shared.f32 f1850, [r10+72900]; +ld.shared.f32 f1851, [r10+75816]; +barrier.sync 0; +st.shared.f32 [r20], f2641; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; +st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; +st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+2916]; +ld.shared.f32 f1854, [r10+5832]; +ld.shared.f32 f1855, [r10+8748]; +ld.shared.f32 f1856, [r10+11664]; +ld.shared.f32 f1857, [r10+14580]; +ld.shared.f32 f1858, [r10+17496]; +ld.shared.f32 f1859, [r10+20412]; +ld.shared.f32 f1860, [r10+23328]; +ld.shared.f32 f1861, [r10+26244]; +ld.shared.f32 f1862, [r10+29160]; +ld.shared.f32 f1863, [r10+32076]; +ld.shared.f32 f1864, [r10+34992]; +ld.shared.f32 f1865, [r10+37908]; +ld.shared.f32 f1866, [r10+40824]; 
+ld.shared.f32 f1867, [r10+43740]; +ld.shared.f32 f1868, [r10+46656]; +ld.shared.f32 f1869, [r10+49572]; +ld.shared.f32 f1870, [r10+52488]; +ld.shared.f32 f1871, [r10+55404]; +ld.shared.f32 f1872, [r10+58320]; +ld.shared.f32 f1873, [r10+61236]; +ld.shared.f32 f1874, [r10+64152]; +ld.shared.f32 f1875, [r10+67068]; +ld.shared.f32 f1876, [r10+69984]; +ld.shared.f32 f1877, [r10+72900]; +ld.shared.f32 f1878, [r10+75816]; +add.f32 f1879, f1834, f1843; +add.f32 f1880, f1825, f1879; +mul.f32 f1883, f1879, 0f3F000000; +sub.f32 f1884, f1825, f1883; +add.f32 f2551, f1861, f1870; +sub.f32 f1885, f1861, f1870; +mul.f32 f1886, f1885, 0f3F5DB3D7; +add.f32 f1887, f1886, f1884; +sub.f32 f1888, f1884, f1886; +add.f32 f2550, f1852, f2551; +mul.f32 f1889, f2551, 0f3F000000; +sub.f32 f1890, f1852, f1889; +sub.f32 f1891, f1834, f1843; +mul.f32 f1892, f1891, 0f3F5DB3D7; +sub.f32 f1893, f1890, f1892; +add.f32 f1894, f1892, f1890; +add.f32 f1895, f1837, f1846; +add.f32 f1896, f1828, f1895; +mul.f32 f1899, f1895, 0f3F000000; +sub.f32 f1900, f1828, f1899; +add.f32 f2549, f1864, f1873; +sub.f32 f1901, f1864, f1873; +mul.f32 f1902, f1901, 0f3F5DB3D7; +add.f32 f1903, f1902, f1900; +sub.f32 f1904, f1900, f1902; +add.f32 f2548, f1855, f2549; +mul.f32 f1905, f2549, 0f3F000000; +sub.f32 f1906, f1855, f1905; +sub.f32 f1907, f1837, f1846; +mul.f32 f1908, f1907, 0f3F5DB3D7; +sub.f32 f1909, f1906, f1908; +add.f32 f1910, f1908, f1906; +add.f32 f1911, f1840, f1849; +add.f32 f1912, f1831, f1911; +mul.f32 f1915, f1911, 0f3F000000; +sub.f32 f1916, f1831, f1915; +add.f32 f2547, f1867, f1876; +sub.f32 f1917, f1867, f1876; +mul.f32 f1918, f1917, 0f3F5DB3D7; +add.f32 f1919, f1918, f1916; +sub.f32 f1920, f1916, f1918; +add.f32 f2546, f1858, f2547; +mul.f32 f1921, f2547, 0f3F000000; +sub.f32 f1922, f1858, f1921; +sub.f32 f1923, f1840, f1849; +mul.f32 f1924, f1923, 0f3F5DB3D7; +sub.f32 f1925, f1922, f1924; +add.f32 f1926, f1924, f1922; +mul.f32 f2544, f1903, 0f3F441B7D; +mul.f32 f2545, f1909, 0fBF248DBB; +sub.f32 
f1929, f2544, f2545; +mul.f32 f1930, f1909, 0f3F441B7D; +fma.rn.f32 f1931, f1903, 0fBF248DBB, f1930; +mul.f32 f2542, f1919, 0f3E31D0D4; +mul.f32 f2543, f1925, 0fBF7C1C5C; +sub.f32 f1934, f2542, f2543; +mul.f32 f1935, f1925, 0f3E31D0D4; +fma.rn.f32 f1936, f1919, 0fBF7C1C5C, f1935; +mul.f32 f2540, f1904, 0f3E31D0D4; +mul.f32 f2541, f1910, 0fBF7C1C5C; +sub.f32 f1939, f2540, f2541; +mul.f32 f1940, f1910, 0f3E31D0D4; +fma.rn.f32 f1941, f1904, 0fBF7C1C5C, f1940; +mul.f32 f2538, f1920, 0fBF708FB2; +mul.f32 f2539, f1926, 0fBEAF1D44; +sub.f32 f1944, f2538, f2539; +mul.f32 f1945, f1926, 0fBF708FB2; +fma.rn.f32 f1946, f1920, 0fBEAF1D44, f1945; +add.f32 f1947, f1896, f1912; +add.f32 f1948, f1880, f1947; +mul.f32 f1951, f1947, 0f3F000000; +sub.f32 f1952, f1880, f1951; +add.f32 f2537, f2548, f2546; +sub.f32 f1953, f2548, f2546; +mul.f32 f1954, f1953, 0f3F5DB3D7; +add.f32 f1955, f1954, f1952; +sub.f32 f1956, f1952, f1954; +add.f32 f2536, f2550, f2537; +mul.f32 f1957, f2537, 0f3F000000; +sub.f32 f1958, f2550, f1957; +sub.f32 f1959, f1896, f1912; +mul.f32 f1960, f1959, 0f3F5DB3D7; +sub.f32 f1961, f1958, f1960; +add.f32 f1962, f1960, f1958; +add.f32 f1963, f1929, f1934; +add.f32 f1964, f1887, f1963; +mul.f32 f1967, f1963, 0f3F000000; +sub.f32 f1968, f1887, f1967; +add.f32 f2535, f1931, f1936; +sub.f32 f1969, f1931, f1936; +mul.f32 f1970, f1969, 0f3F5DB3D7; +add.f32 f1971, f1970, f1968; +sub.f32 f1972, f1968, f1970; +add.f32 f2534, f1893, f2535; +mul.f32 f1973, f2535, 0f3F000000; +sub.f32 f1974, f1893, f1973; +sub.f32 f1975, f1929, f1934; +mul.f32 f1976, f1975, 0f3F5DB3D7; +sub.f32 f1977, f1974, f1976; +add.f32 f1978, f1976, f1974; +add.f32 f1979, f1939, f1944; +add.f32 f1980, f1888, f1979; +mul.f32 f1983, f1979, 0f3F000000; +sub.f32 f1984, f1888, f1983; +add.f32 f2533, f1941, f1946; +sub.f32 f1985, f1941, f1946; +mul.f32 f1986, f1985, 0f3F5DB3D7; +add.f32 f1987, f1986, f1984; +sub.f32 f1988, f1984, f1986; +add.f32 f2532, f1894, f2533; +mul.f32 f1989, f2533, 0f3F000000; +sub.f32 
f1990, f1894, f1989; +sub.f32 f1991, f1939, f1944; +mul.f32 f1992, f1991, 0f3F5DB3D7; +sub.f32 f1993, f1990, f1992; +add.f32 f1994, f1992, f1990; +add.f32 f1995, f1835, f1844; +add.f32 f1996, f1826, f1995; +mul.f32 f1999, f1995, 0f3F000000; +sub.f32 f2000, f1826, f1999; +add.f32 f2531, f1862, f1871; +sub.f32 f2001, f1862, f1871; +mul.f32 f2002, f2001, 0f3F5DB3D7; +add.f32 f2003, f2002, f2000; +sub.f32 f2004, f2000, f2002; +add.f32 f2530, f1853, f2531; +mul.f32 f2005, f2531, 0f3F000000; +sub.f32 f2006, f1853, f2005; +sub.f32 f2007, f1835, f1844; +mul.f32 f2008, f2007, 0f3F5DB3D7; +sub.f32 f2009, f2006, f2008; +add.f32 f2010, f2008, f2006; +add.f32 f2011, f1838, f1847; +add.f32 f2012, f1829, f2011; +mul.f32 f2015, f2011, 0f3F000000; +sub.f32 f2016, f1829, f2015; +add.f32 f2529, f1865, f1874; +sub.f32 f2017, f1865, f1874; +mul.f32 f2018, f2017, 0f3F5DB3D7; +add.f32 f2019, f2018, f2016; +sub.f32 f2020, f2016, f2018; +add.f32 f2528, f1856, f2529; +mul.f32 f2021, f2529, 0f3F000000; +sub.f32 f2022, f1856, f2021; +sub.f32 f2023, f1838, f1847; +mul.f32 f2024, f2023, 0f3F5DB3D7; +sub.f32 f2025, f2022, f2024; +add.f32 f2026, f2024, f2022; +add.f32 f2027, f1841, f1850; +add.f32 f2028, f1832, f2027; +mul.f32 f2031, f2027, 0f3F000000; +sub.f32 f2032, f1832, f2031; +add.f32 f2527, f1868, f1877; +sub.f32 f2033, f1868, f1877; +mul.f32 f2034, f2033, 0f3F5DB3D7; +add.f32 f2035, f2034, f2032; +sub.f32 f2036, f2032, f2034; +add.f32 f2526, f1859, f2527; +mul.f32 f2037, f2527, 0f3F000000; +sub.f32 f2038, f1859, f2037; +sub.f32 f2039, f1841, f1850; +mul.f32 f2040, f2039, 0f3F5DB3D7; +sub.f32 f2041, f2038, f2040; +add.f32 f2042, f2040, f2038; +mul.f32 f2044, f2025, 0fBF248DBB; +mul.f32 f2525, f2019, 0f3F441B7D; +sub.f32 f2045, f2525, f2044; +mul.f32 f2046, f2025, 0f3F441B7D; +fma.rn.f32 f2047, f2019, 0fBF248DBB, f2046; +mul.f32 f2523, f2035, 0f3E31D0D4; +mul.f32 f2524, f2041, 0fBF7C1C5C; +sub.f32 f2050, f2523, f2524; +mul.f32 f2051, f2041, 0f3E31D0D4; +fma.rn.f32 f2052, f2035, 0fBF7C1C5C, 
f2051; +mul.f32 f2521, f2020, 0f3E31D0D4; +mul.f32 f2522, f2026, 0fBF7C1C5C; +sub.f32 f2055, f2521, f2522; +mul.f32 f2056, f2026, 0f3E31D0D4; +fma.rn.f32 f2057, f2020, 0fBF7C1C5C, f2056; +mul.f32 f2519, f2036, 0fBF708FB2; +mul.f32 f2520, f2042, 0fBEAF1D44; +sub.f32 f2060, f2519, f2520; +mul.f32 f2061, f2042, 0fBF708FB2; +fma.rn.f32 f2062, f2036, 0fBEAF1D44, f2061; +add.f32 f2063, f2012, f2028; +add.f32 f2064, f1996, f2063; +mul.f32 f2067, f2063, 0f3F000000; +sub.f32 f2068, f1996, f2067; +add.f32 f2518, f2528, f2526; +sub.f32 f2069, f2528, f2526; +mul.f32 f2070, f2069, 0f3F5DB3D7; +add.f32 f2071, f2070, f2068; +sub.f32 f2072, f2068, f2070; +add.f32 f2517, f2530, f2518; +mul.f32 f2073, f2518, 0f3F000000; +sub.f32 f2074, f2530, f2073; +sub.f32 f2075, f2012, f2028; +mul.f32 f2076, f2075, 0f3F5DB3D7; +sub.f32 f2077, f2074, f2076; +add.f32 f2078, f2076, f2074; +add.f32 f2079, f2045, f2050; +add.f32 f2080, f2003, f2079; +mul.f32 f2083, f2079, 0f3F000000; +sub.f32 f2084, f2003, f2083; +add.f32 f2516, f2047, f2052; +sub.f32 f2085, f2047, f2052; +mul.f32 f2086, f2085, 0f3F5DB3D7; +add.f32 f2087, f2086, f2084; +sub.f32 f2088, f2084, f2086; +add.f32 f2515, f2009, f2516; +mul.f32 f2089, f2516, 0f3F000000; +sub.f32 f2090, f2009, f2089; +sub.f32 f2091, f2045, f2050; +mul.f32 f2092, f2091, 0f3F5DB3D7; +sub.f32 f2093, f2090, f2092; +add.f32 f2094, f2092, f2090; +add.f32 f2095, f2055, f2060; +add.f32 f2096, f2004, f2095; +mul.f32 f2099, f2095, 0f3F000000; +sub.f32 f2100, f2004, f2099; +add.f32 f2514, f2057, f2062; +sub.f32 f2101, f2057, f2062; +mul.f32 f2102, f2101, 0f3F5DB3D7; +add.f32 f2103, f2102, f2100; +sub.f32 f2104, f2100, f2102; +add.f32 f2513, f2010, f2514; +mul.f32 f2105, f2514, 0f3F000000; +sub.f32 f2106, f2010, f2105; +sub.f32 f2107, f2055, f2060; +mul.f32 f2108, f2107, 0f3F5DB3D7; +sub.f32 f2109, f2106, f2108; +add.f32 f2110, f2108, f2106; +add.f32 f2111, f1836, f1845; +add.f32 f2112, f1827, f2111; +mul.f32 f2115, f2111, 0f3F000000; +sub.f32 f2116, f1827, f2115; 
+add.f32 f2512, f1863, f1872; +sub.f32 f2117, f1863, f1872; +mul.f32 f2118, f2117, 0f3F5DB3D7; +add.f32 f2119, f2118, f2116; +sub.f32 f2120, f2116, f2118; +add.f32 f2511, f1854, f2512; +mul.f32 f2121, f2512, 0f3F000000; +sub.f32 f2122, f1854, f2121; +sub.f32 f2123, f1836, f1845; +mul.f32 f2124, f2123, 0f3F5DB3D7; +sub.f32 f2125, f2122, f2124; +add.f32 f2126, f2124, f2122; +add.f32 f2127, f1839, f1848; +add.f32 f2128, f1830, f2127; +mul.f32 f2131, f2127, 0f3F000000; +sub.f32 f2132, f1830, f2131; +add.f32 f2510, f1866, f1875; +sub.f32 f2133, f1866, f1875; +mul.f32 f2134, f2133, 0f3F5DB3D7; +add.f32 f2135, f2134, f2132; +sub.f32 f2136, f2132, f2134; +add.f32 f2509, f1857, f2510; +mul.f32 f2137, f2510, 0f3F000000; +sub.f32 f2138, f1857, f2137; +sub.f32 f2139, f1839, f1848; +mul.f32 f2140, f2139, 0f3F5DB3D7; +sub.f32 f2141, f2138, f2140; +add.f32 f2142, f2140, f2138; +add.f32 f2143, f1842, f1851; +add.f32 f2144, f1833, f2143; +mul.f32 f2147, f2143, 0f3F000000; +sub.f32 f2148, f1833, f2147; +add.f32 f2508, f1869, f1878; +sub.f32 f2149, f1869, f1878; +mul.f32 f2150, f2149, 0f3F5DB3D7; +add.f32 f2151, f2150, f2148; +sub.f32 f2152, f2148, f2150; +add.f32 f2507, f1860, f2508; +mul.f32 f2153, f2508, 0f3F000000; +sub.f32 f2154, f1860, f2153; +sub.f32 f2155, f1842, f1851; +mul.f32 f2156, f2155, 0f3F5DB3D7; +sub.f32 f2157, f2154, f2156; +add.f32 f2158, f2156, f2154; +mul.f32 f2160, f2141, 0fBF248DBB; +mul.f32 f2506, f2135, 0f3F441B7D; +sub.f32 f2161, f2506, f2160; +mul.f32 f2162, f2141, 0f3F441B7D; +fma.rn.f32 f2163, f2135, 0fBF248DBB, f2162; +mul.f32 f2504, f2151, 0f3E31D0D4; +mul.f32 f2505, f2157, 0fBF7C1C5C; +sub.f32 f2166, f2504, f2505; +mul.f32 f2167, f2157, 0f3E31D0D4; +fma.rn.f32 f2168, f2151, 0fBF7C1C5C, f2167; +mul.f32 f2502, f2136, 0f3E31D0D4; +mul.f32 f2503, f2142, 0fBF7C1C5C; +sub.f32 f2171, f2502, f2503; +mul.f32 f2172, f2142, 0f3E31D0D4; +fma.rn.f32 f2173, f2136, 0fBF7C1C5C, f2172; +mul.f32 f2500, f2152, 0fBF708FB2; +mul.f32 f2501, f2158, 0fBEAF1D44; +sub.f32 
f2176, f2500, f2501; +mul.f32 f2177, f2158, 0fBF708FB2; +fma.rn.f32 f2178, f2152, 0fBEAF1D44, f2177; +add.f32 f2179, f2128, f2144; +add.f32 f2180, f2112, f2179; +mul.f32 f2183, f2179, 0f3F000000; +sub.f32 f2184, f2112, f2183; +add.f32 f2499, f2509, f2507; +sub.f32 f2185, f2509, f2507; +mul.f32 f2186, f2185, 0f3F5DB3D7; +add.f32 f2187, f2186, f2184; +sub.f32 f2188, f2184, f2186; +add.f32 f2498, f2511, f2499; +mul.f32 f2189, f2499, 0f3F000000; +sub.f32 f2190, f2511, f2189; +sub.f32 f2191, f2128, f2144; +mul.f32 f2192, f2191, 0f3F5DB3D7; +sub.f32 f2193, f2190, f2192; +add.f32 f2194, f2192, f2190; +add.f32 f2195, f2161, f2166; +add.f32 f2196, f2119, f2195; +mul.f32 f2199, f2195, 0f3F000000; +sub.f32 f2200, f2119, f2199; +add.f32 f2497, f2163, f2168; +sub.f32 f2201, f2163, f2168; +mul.f32 f2202, f2201, 0f3F5DB3D7; +add.f32 f2203, f2202, f2200; +sub.f32 f2204, f2200, f2202; +add.f32 f2496, f2125, f2497; +mul.f32 f2205, f2497, 0f3F000000; +sub.f32 f2206, f2125, f2205; +sub.f32 f2207, f2161, f2166; +mul.f32 f2208, f2207, 0f3F5DB3D7; +sub.f32 f2209, f2206, f2208; +add.f32 f2210, f2208, f2206; +add.f32 f2211, f2171, f2176; +add.f32 f2212, f2120, f2211; +mul.f32 f2215, f2211, 0f3F000000; +sub.f32 f2216, f2120, f2215; +add.f32 f2495, f2173, f2178; +sub.f32 f2217, f2173, f2178; +mul.f32 f2218, f2217, 0f3F5DB3D7; +add.f32 f2219, f2218, f2216; +sub.f32 f2220, f2216, f2218; +add.f32 f2494, f2126, f2495; +mul.f32 f2221, f2495, 0f3F000000; +sub.f32 f2222, f2126, f2221; +sub.f32 f2223, f2171, f2176; +mul.f32 f2224, f2223, 0f3F5DB3D7; +sub.f32 f2225, f2222, f2224; +add.f32 f2226, f2224, f2222; +mul.f32 f2228, f2515, 0fBE6C2691; +mul.f32 f2493, f2080, 0f3F791978; +sub.f32 f2229, f2493, f2228; +mul.f32 f2230, f2515, 0f3F791978; +fma.rn.f32 f2231, f2080, 0fBE6C2691, f2230; +mul.f32 f2233, f2496, 0fBEE5C902; +mul.f32 f2492, f2196, 0f3F64C51C; +sub.f32 f2234, f2492, f2233; +mul.f32 f2235, f2496, 0f3F64C51C; +fma.rn.f32 f2236, f2196, 0fBEE5C902, f2235; +mul.f32 f2238, f2513, 0fBEE5C902; 
+mul.f32 f2491, f2096, 0f3F64C51C; +sub.f32 f2239, f2491, f2238; +mul.f32 f2240, f2513, 0f3F64C51C; +fma.rn.f32 f2241, f2096, 0fBEE5C902, f2240; +mul.f32 f2243, f2494, 0fBF4D57F2; +mul.f32 f2490, f2212, 0f3F18DF63; +sub.f32 f2244, f2490, f2243; +mul.f32 f2245, f2494, 0f3F18DF63; +fma.rn.f32 f2246, f2212, 0fBF4D57F2, f2245; +mul.f32 f2488, f2071, 0f3F441B7D; +mul.f32 f2489, f2077, 0fBF248DBB; +sub.f32 f2249, f2488, f2489; +mul.f32 f2250, f2077, 0f3F441B7D; +fma.rn.f32 f2251, f2071, 0fBF248DBB, f2250; +mul.f32 f2486, f2187, 0f3E31D0D4; +mul.f32 f2487, f2193, 0fBF7C1C5C; +sub.f32 f2254, f2486, f2487; +mul.f32 f2255, f2193, 0f3E31D0D4; +fma.rn.f32 f2256, f2187, 0fBF7C1C5C, f2255; +mul.f32 f2484, f2087, 0f3F18DF63; +mul.f32 f2485, f2093, 0fBF4D57F2; +sub.f32 f2259, f2484, f2485; +mul.f32 f2260, f2093, 0f3F18DF63; +fma.rn.f32 f2261, f2087, 0fBF4D57F2, f2260; +mul.f32 f2482, f2203, 0fBE92D7E0; +mul.f32 f2483, f2209, 0fBF753ECD; +sub.f32 f2264, f2482, f2483; +mul.f32 f2265, f2209, 0fBE92D7E0; +fma.rn.f32 f2266, f2203, 0fBF753ECD, f2265; +mul.f32 f2268, f2109, 0fBF6B1036; +mul.f32 f2481, f2103, 0f3ECACAF8; +sub.f32 f2269, f2481, f2268; +mul.f32 f2270, f2109, 0f3ECACAF8; +fma.rn.f32 f2271, f2103, 0fBF6B1036, f2270; +mul.f32 f2273, f2225, 0fBF3A3529; +mul.f32 f2480, f2219, 0fBF2FAD88; +sub.f32 f2274, f2480, f2273; +mul.f32 f2275, f2225, 0fBF2FAD88; +fma.rn.f32 f2276, f2219, 0fBF3A3529, f2275; +mul.f32 f2278, f2078, 0fBF7C1C5C; +mul.f32 f2479, f2072, 0f3E31D0D4; +sub.f32 f2279, f2479, f2278; +mul.f32 f2280, f2078, 0f3E31D0D4; +fma.rn.f32 f2281, f2072, 0fBF7C1C5C, f2280; +mul.f32 f2283, f2194, 0fBEAF1D44; +mul.f32 f2478, f2188, 0fBF708FB2; +sub.f32 f2284, f2478, f2283; +mul.f32 f2285, f2194, 0fBF708FB2; +fma.rn.f32 f2286, f2188, 0fBEAF1D44, f2285; +mul.f32 f2288, f2094, 0fBF7F9120; +mul.f32 f2477, f2088, 0fBD6E2946; +sub.f32 f2289, f2477, f2288; +mul.f32 f2290, f2094, 0fBD6E2946; +fma.rn.f32 f2291, f2088, 0fBF7F9120, f2290; +mul.f32 f2475, f2204, 0fBF7E44DE; +mul.f32 f2476, 
f2210, 0f3DEDC21F; +sub.f32 f2294, f2475, f2476; +mul.f32 f2295, f2210, 0fBF7E44DE; +fma.rn.f32 f2296, f2204, 0f3DEDC21F, f2295; +mul.f32 f2473, f2104, 0fBE92D7E0; +mul.f32 f2474, f2110, 0fBF753ECD; +sub.f32 f2299, f2473, f2474; +mul.f32 f2300, f2110, 0fBE92D7E0; +fma.rn.f32 f2301, f2104, 0fBF753ECD, f2300; +mul.f32 f2471, f2220, 0fBF55E287; +mul.f32 f2472, f2226, 0f3F0CAC9F; +sub.f32 f2304, f2471, f2472; +mul.f32 f2305, f2226, 0fBF55E287; +fma.rn.f32 f2306, f2220, 0f3F0CAC9F, f2305; +add.f32 f2307, f2064, f2180; +mul.f32 f2309, f2307, 0f3F000000; +sub.f32 f2310, f1948, f2309; +add.f32 f2470, f2517, f2498; +sub.f32 f2311, f2517, f2498; +mul.f32 f2312, f2311, 0f3F5DB3D7; +mul.f32 f2313, f2470, 0f3F000000; +sub.f32 f2314, f2536, f2313; +sub.f32 f2315, f2064, f2180; +mul.f32 f2316, f2315, 0f3F5DB3D7; +add.f32 f2317, f2229, f2234; +mul.f32 f2319, f2317, 0f3F000000; +sub.f32 f2320, f1964, f2319; +add.f32 f2469, f2231, f2236; +sub.f32 f2321, f2231, f2236; +mul.f32 f2322, f2321, 0f3F5DB3D7; +mul.f32 f2323, f2469, 0f3F000000; +sub.f32 f2324, f2534, f2323; +sub.f32 f2325, f2229, f2234; +mul.f32 f2326, f2325, 0f3F5DB3D7; +add.f32 f2327, f2239, f2244; +mul.f32 f2329, f2327, 0f3F000000; +sub.f32 f2330, f1980, f2329; +add.f32 f2468, f2241, f2246; +sub.f32 f2331, f2241, f2246; +mul.f32 f2332, f2331, 0f3F5DB3D7; +mul.f32 f2333, f2468, 0f3F000000; +sub.f32 f2334, f2532, f2333; +sub.f32 f2335, f2239, f2244; +mul.f32 f2336, f2335, 0f3F5DB3D7; +add.f32 f2337, f2249, f2254; +mul.f32 f2339, f2337, 0f3F000000; +sub.f32 f2340, f1955, f2339; +add.f32 f2467, f2251, f2256; +sub.f32 f2341, f2251, f2256; +mul.f32 f2342, f2341, 0f3F5DB3D7; +mul.f32 f2343, f2467, 0f3F000000; +sub.f32 f2344, f1961, f2343; +sub.f32 f2345, f2249, f2254; +mul.f32 f2346, f2345, 0f3F5DB3D7; +add.f32 f2347, f2259, f2264; +mul.f32 f2349, f2347, 0f3F000000; +sub.f32 f2350, f1971, f2349; +add.f32 f2466, f2261, f2266; +sub.f32 f2351, f2261, f2266; +mul.f32 f2352, f2351, 0f3F5DB3D7; +mul.f32 f2353, f2466, 0f3F000000; 
+sub.f32 f2354, f1977, f2353; +sub.f32 f2355, f2259, f2264; +mul.f32 f2356, f2355, 0f3F5DB3D7; +add.f32 f2357, f2269, f2274; +mul.f32 f2359, f2357, 0f3F000000; +sub.f32 f2360, f1987, f2359; +add.f32 f2465, f2271, f2276; +sub.f32 f2361, f2271, f2276; +mul.f32 f2362, f2361, 0f3F5DB3D7; +mul.f32 f2363, f2465, 0f3F000000; +sub.f32 f2364, f1993, f2363; +sub.f32 f2365, f2269, f2274; +mul.f32 f2366, f2365, 0f3F5DB3D7; +add.f32 f2367, f2279, f2284; +mul.f32 f2369, f2367, 0f3F000000; +sub.f32 f2370, f1956, f2369; +add.f32 f2464, f2281, f2286; +sub.f32 f2371, f2281, f2286; +mul.f32 f2372, f2371, 0f3F5DB3D7; +mul.f32 f2373, f2464, 0f3F000000; +sub.f32 f2374, f1962, f2373; +sub.f32 f2375, f2279, f2284; +mul.f32 f2376, f2375, 0f3F5DB3D7; +add.f32 f2377, f2289, f2294; +mul.f32 f2379, f2377, 0f3F000000; +sub.f32 f2380, f1972, f2379; +add.f32 f2463, f2291, f2296; +sub.f32 f2381, f2291, f2296; +mul.f32 f2382, f2381, 0f3F5DB3D7; +mul.f32 f2383, f2463, 0f3F000000; +sub.f32 f2384, f1978, f2383; +sub.f32 f2385, f2289, f2294; +mul.f32 f2386, f2385, 0f3F5DB3D7; +add.f32 f2387, f2299, f2304; +mul.f32 f2389, f2387, 0f3F000000; +sub.f32 f2390, f1988, f2389; +add.f32 f2462, f2301, f2306; +sub.f32 f2391, f2301, f2306; +mul.f32 f2392, f2391, 0f3F5DB3D7; +mul.f32 f2393, f2462, 0f3F000000; +sub.f32 f2394, f1994, f2393; +sub.f32 f2395, f2299, f2304; +mul.f32 f2930, f2465, 0f3F000000; +sub.f32 f2929, f1993, f2930; +mul.f32 f2396, f2395, 0f3F5DB3D7; +add.f32 %0, f1948, f2307; +mul.f32 f2932, f2465, 0f3F000000; +sub.f32 f2931, f1993, f2932; +add.f32 %1, f2536, f2470; +mul.f32 f2934, f2337, 0f3F000000; +sub.f32 f2933, f1955, f2934; +mul.f32 f2936, f2466, 0f3F000000; +sub.f32 f2935, f1977, f2936; +add.f32 %3, f2534, f2469; +add.f32 %2, f1964, f2317; +add.f32 %5, f2532, f2468; +add.f32 %4, f1980, f2327; +add.f32 %7, f1961, f2467; +add.f32 %6, f1955, f2337; +add.f32 %9, f1977, f2466; +add.f32 %8, f1971, f2347; +add.f32 %11, f1993, f2465; +add.f32 %10, f1987, f2357; +add.f32 %13, f1962, f2464; +add.f32 
%12, f1956, f2367; +add.f32 %15, f1978, f2463; +add.f32 %14, f1972, f2377; +add.f32 %17, f1994, f2462; +add.f32 %16, f1988, f2387; +sub.f32 %19, f2314, f2316; +add.f32 %18, f2312, f2310; +add.f32 %20, f2322, f2320; +sub.f32 %21, f2324, f2326; +add.f32 %22, f2332, f2330; +sub.f32 %23, f2334, f2336; +add.f32 %24, f2342, f2933; +sub.f32 %25, f2344, f2346; +sub.f32 %27, f2935, f2356; +add.f32 %26, f2352, f2350; +sub.f32 %29, f2931, f2366; +add.f32 %28, f2362, f2360; +add.f32 %30, f2372, f2370; +sub.f32 %31, f2374, f2376; +add.f32 %32, f2382, f2380; +sub.f32 %33, f2384, f2386; +add.f32 %34, f2392, f2390; +sub.f32 %35, f2394, f2396; +sub.f32 %36, f2310, f2312; +add.f32 %37, f2316, f2314; +add.f32 %39, f2326, f2324; +sub.f32 %38, f2320, f2322; +add.f32 %41, f2336, f2334; +sub.f32 %40, f2330, f2332; +add.f32 %43, f2346, f2344; +sub.f32 %42, f2933, f2342; +add.f32 %45, f2356, f2935; +sub.f32 %44, f2350, f2352; +add.f32 %47, f2366, f2931; +sub.f32 %46, f2360, f2362; +add.f32 %49, f2376, f2374; +sub.f32 %48, f2370, f2372; +add.f32 %51, f2386, f2384; +sub.f32 %50, f2380, f2382; +add.f32 %53, f2396, f2394; +sub.f32 %52, f2390, f2392; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), 
"=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_19683), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..26a4d1844d916 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19683_fp32_inv.hpp.inc @@ -0,0 +1,4890 @@ +#ifndef CUFFTDX_FFT_19683_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_19683_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<1159, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2982>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 157464, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2981, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2980, %58, f2981; +mul.f32 f119, f2981, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2979, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2978, %64, f2979; +mul.f32 f135, f2979, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2977, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2976, %70, f2977; +mul.f32 f151, f2977, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2975, f133, 0f3F441B7D; +sub.f32 f159, f2975, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2973, f149, 0f3E31D0D4; +mul.f32 f2974, f155, 0f3F7C1C5C; +sub.f32 f164, f2973, f2974; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f2971, f134, 0f3E31D0D4; +mul.f32 f2972, f140, 
0f3F7C1C5C; +sub.f32 f169, f2971, f2972; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2969, f150, 0fBF708FB2; +mul.f32 f2970, f156, 0f3EAF1D44; +sub.f32 f174, f2969, f2970; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2968, f2978, f2976; +sub.f32 f183, f2978, f2976; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2967, f2980, f2968; +mul.f32 f187, f2968, 0f3F000000; +sub.f32 f188, f2980, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2966, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2965, f123, f2966; +mul.f32 f203, f2966, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2964, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2963, f124, f2964; +mul.f32 f219, f2964, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2960, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2958, %113, f2960; +mul.f32 f235, f2960, 0f3F000000; +sub.f32 
f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2955, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2953, %116, f2955; +mul.f32 f251, f2955, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2950, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2948, %119, f2950; +mul.f32 f267, f2950, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2947, f249, 0f3F441B7D; +sub.f32 f275, f2947, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2946, f265, 0f3E31D0D4; +sub.f32 f280, f2946, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2944, f250, 0f3E31D0D4; +mul.f32 f2945, f256, 0f3F7C1C5C; +sub.f32 f285, f2944, f2945; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2942, f266, 0fBF708FB2; +mul.f32 f2943, f272, 0f3EAF1D44; +sub.f32 f290, f2942, f2943; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2941, f2953, f2948; +sub.f32 f299, f2953, f2948; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 
f2940, f2958, f2941; +mul.f32 f303, f2941, 0f3F000000; +sub.f32 f304, f2958, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2939, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2938, f239, f2939; +mul.f32 f319, f2939, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2937, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2936, f240, f2937; +mul.f32 f335, f2937, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2933, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2931, %122, f2933; +mul.f32 f351, f2933, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2928, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2926, %125, f2928; +mul.f32 f367, f2928, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; 
+add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2924, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2922, %127, f2924; +mul.f32 f383, f2924, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2921, f365, 0f3F441B7D; +sub.f32 f391, f2921, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2920, f381, 0f3E31D0D4; +sub.f32 f396, f2920, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2918, f366, 0f3E31D0D4; +mul.f32 f2919, f372, 0f3F7C1C5C; +sub.f32 f401, f2918, f2919; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2916, f382, 0fBF708FB2; +mul.f32 f2917, f388, 0f3EAF1D44; +sub.f32 f406, f2916, f2917; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2915, f2926, f2922; +sub.f32 f415, f2926, f2922; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2914, f2931, f2915; +mul.f32 f419, f2915, 0f3F000000; +sub.f32 f420, f2931, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2913, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2912, f355, f2913; +mul.f32 f435, f2913, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, 
f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2911, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2910, f356, f2911; +mul.f32 f451, f2911, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2938, 0f3E6C2691; +mul.f32 f2909, f310, 0f3F791978; +sub.f32 f459, f2909, f458; +mul.f32 f460, f2938, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2907, f426, 0f3F64C51C; +mul.f32 f2908, f2912, 0f3EE5C902; +sub.f32 f464, f2907, f2908; +mul.f32 f465, f2912, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2905, f326, 0f3F64C51C; +mul.f32 f2906, f2936, 0f3EE5C902; +sub.f32 f469, f2905, f2906; +mul.f32 f470, f2936, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2903, f442, 0f3F18DF63; +mul.f32 f2904, f2910, 0f3F4D57F2; +sub.f32 f474, f2903, f2904; +mul.f32 f475, f2910, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2901, f301, 0f3F441B7D; +mul.f32 f2902, f307, 0f3F248DBB; +sub.f32 f479, f2901, f2902; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2900, f417, 0f3E31D0D4; +sub.f32 f484, f2900, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2899, f317, 0f3F18DF63; +sub.f32 f489, f2899, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2898, f433, 0fBE92D7E0; +sub.f32 f494, f2898, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f2897, f333, 0f3ECACAF8; 
+sub.f32 f499, f2897, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2896, f449, 0fBF2FAD88; +sub.f32 f504, f2896, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2895, f302, 0f3E31D0D4; +sub.f32 f509, f2895, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2893, f418, 0fBF708FB2; +mul.f32 f2894, f424, 0f3EAF1D44; +sub.f32 f514, f2893, f2894; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2891, f318, 0fBD6E2946; +mul.f32 f2892, f324, 0f3F7F9120; +sub.f32 f519, f2891, f2892; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2889, f434, 0fBF7E44DE; +mul.f32 f2890, f440, 0fBDEDC21F; +sub.f32 f524, f2889, f2890; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2888, f334, 0fBE92D7E0; +sub.f32 f529, f2888, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2887, f450, 0fBF55E287; +sub.f32 f534, f2887, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f2886, f2940, f2914; +sub.f32 f541, f2940, f2914; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f2886, 0f3F000000; +sub.f32 f546, f2967, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2885, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2884, f2965, f2885; 
+mul.f32 f561, f2885, 0f3F000000; +sub.f32 f562, f2965, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2883, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2882, f2963, f2883; +mul.f32 f577, f2883, 0f3F000000; +sub.f32 f578, f2963, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2881, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2880, f191, f2881; +mul.f32 f593, f2881, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2879, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2878, f207, f2879; +mul.f32 f609, f2879, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2877, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2876, f223, f2877; +mul.f32 f625, f2877, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 
f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2875, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2874, f192, f2875; +mul.f32 f641, f2875, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2873, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2872, f208, f2873; +mul.f32 f657, f2873, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2871, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2870, f224, f2871; +mul.f32 f673, f2871, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 157464, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f2884, f680; +mul.f32 f685, f679, f2884; +mul.f32 f2868, f679, f679; +mul.f32 f2869, f680, f680; +sub.f32 f688, f2868, f2869; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f2882, f690; +mul.f32 f693, f688, f2882; +mul.f32 f695, f680, f690; +mul.f32 f2867, 
f679, f688; +sub.f32 f696, f2867, f695; +mul.f32 f2866, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f699, f2880, f698; +mul.f32 f701, f696, f2880; +mul.f32 f2864, f679, f696; +mul.f32 f2865, f680, f698; +sub.f32 f704, f2864, f2865; +mul.f32 f2863, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f707, f2878, f706; +mul.f32 f709, f704, f2878; +mul.f32 f711, f680, f706; +mul.f32 f2862, f679, f704; +sub.f32 f712, f2862, f711; +mul.f32 f2861, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f2876, f714; +mul.f32 f717, f712, f2876; +mul.f32 f719, f680, f714; +mul.f32 f2860, f679, f712; +sub.f32 f720, f2860, f719; +mul.f32 f2859, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f2874, f722; +mul.f32 f725, f720, f2874; +mul.f32 f2857, f679, f720; +mul.f32 f2858, f680, f722; +sub.f32 f728, f2857, f2858; +mul.f32 f2856, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f2872, f730; +mul.f32 f733, f728, f2872; +mul.f32 f735, f680, f730; +mul.f32 f2855, f679, f728; +sub.f32 f736, f2855, f735; +mul.f32 f2854, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f2870, f738; +mul.f32 f741, f736, f2870; +mul.f32 f743, f680, f738; +mul.f32 f2853, f679, f736; +sub.f32 f744, f2853, f743; +mul.f32 f2852, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f2850, f679, f744; +mul.f32 f2851, f680, f746; +sub.f32 f752, f2850, f2851; +mul.f32 f2849, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2848, f679, f752; +sub.f32 f760, f2848, f759; +mul.f32 f2847, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; 
+mul.f32 f765, f760, f581; +mul.f32 f2845, f679, f760; +mul.f32 f2846, f680, f762; +sub.f32 f768, f2845, f2846; +mul.f32 f2844, f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2843, f679, f768; +sub.f32 f776, f2843, f775; +mul.f32 f2842, f591, f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2841, f679, f776; +sub.f32 f784, f2841, f783; +mul.f32 f2840, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f2838, f679, f784; +mul.f32 f2839, f680, f786; +sub.f32 f792, f2838, f2839; +mul.f32 f2837, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2836, f679, f792; +sub.f32 f800, f2836, f799; +mul.f32 f2835, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2834, f679, f800; +sub.f32 f808, f2834, f807; +mul.f32 f2833, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f2831, f679, f808; +mul.f32 f2832, f680, f810; +sub.f32 f816, f2831, f2832; +mul.f32 f2830, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2829, f679, f816; +sub.f32 f824, f2829, f823; +mul.f32 f2828, f544, f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f2826, f679, f824; +mul.f32 f2827, f680, f826; +sub.f32 f832, f2826, f2827; +mul.f32 f2825, f560, f826; +mul.f32 f833, f679, f826; 
+fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2824, f679, f832; +sub.f32 f840, f2824, f839; +mul.f32 f2823, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2822, f679, f840; +sub.f32 f848, f2822, f847; +mul.f32 f2821, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f2819, f679, f848; +mul.f32 f2820, f680, f850; +sub.f32 f856, f2819, f2820; +mul.f32 f2818, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2817, f679, f856; +sub.f32 f864, f2817, f863; +mul.f32 f2816, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2815, f679, f864; +sub.f32 f872, f2815, f871; +mul.f32 f2814, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, f874; +mul.f32 f877, f872, f662; +mul.f32 f2812, f679, f872; +mul.f32 f2813, f680, f874; +sub.f32 f880, f2812, f2813; +mul.f32 f2811, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f2810, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2967, f2886; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f2810; +st.shared.v2.f32 [r21+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f2866; +st.shared.v2.f32 [r21+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f2863; +st.shared.v2.f32 [r21+24], {f892, f893}; +fma.rn.f32 f894, f704, 
f600, f707; +sub.f32 f895, f709, f2861; +st.shared.v2.f32 [r21+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 f897, f717, f2859; +st.shared.v2.f32 [r21+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f2856; +st.shared.v2.f32 [r21+48], {f898, f899}; +sub.f32 f900, f733, f2854; +fma.rn.f32 f901, f728, f648, f731; +st.shared.v2.f32 [r21+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f2852; +st.shared.v2.f32 [r21+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f2849; +st.shared.v2.f32 [r21+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f2847; +st.shared.v2.f32 [r21+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f2844; +st.shared.v2.f32 [r21+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f2842; +st.shared.v2.f32 [r21+96], {f910, f911}; +fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f2840; +st.shared.v2.f32 [r21+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f2837; +st.shared.v2.f32 [r21+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, f797, f2835; +st.shared.v2.f32 [r21+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f2833; +st.shared.v2.f32 [r21+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f2830; +st.shared.v2.f32 [r21+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f2828; +st.shared.v2.f32 [r21+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f2825; +st.shared.v2.f32 [r21+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, f2823; +st.shared.v2.f32 [r21+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f2821; +st.shared.v2.f32 [r21+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f2818; +st.shared.v2.f32 [r21+176], {f930, f931}; 
+fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f2816; +st.shared.v2.f32 [r21+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; +sub.f32 f935, f869, f2814; +st.shared.v2.f32 [r21+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f2811; +st.shared.v2.f32 [r21+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; +sub.f32 f939, f885, f884; +st.shared.v2.f32 [r21+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+5832]; +ld.shared.v2.f32 {f948, f949}, [r10+11664]; +ld.shared.v2.f32 {f952, f953}, [r10+17496]; +ld.shared.v2.f32 {f956, f957}, [r10+23328]; +ld.shared.v2.f32 {f960, f961}, [r10+29160]; +ld.shared.v2.f32 {f964, f965}, [r10+34992]; +ld.shared.v2.f32 {f968, f969}, [r10+40824]; +ld.shared.v2.f32 {f972, f973}, [r10+46656]; +ld.shared.v2.f32 {f976, f977}, [r10+52488]; +ld.shared.v2.f32 {f980, f981}, [r10+58320]; +ld.shared.v2.f32 {f984, f985}, [r10+64152]; +ld.shared.v2.f32 {f988, f989}, [r10+69984]; +ld.shared.v2.f32 {f992, f993}, [r10+75816]; +ld.shared.v2.f32 {f996, f997}, [r10+81648]; +ld.shared.v2.f32 {f1000, f1001}, [r10+87480]; +ld.shared.v2.f32 {f1004, f1005}, [r10+93312]; +ld.shared.v2.f32 {f1008, f1009}, [r10+99144]; +ld.shared.v2.f32 {f1012, f1013}, [r10+104976]; +ld.shared.v2.f32 {f1016, f1017}, [r10+110808]; +ld.shared.v2.f32 {f1020, f1021}, [r10+116640]; +ld.shared.v2.f32 {f1024, f1025}, [r10+122472]; +ld.shared.v2.f32 {f1028, f1029}, [r10+128304]; +ld.shared.v2.f32 {f1032, f1033}, [r10+134136]; +ld.shared.v2.f32 {f1036, f1037}, [r10+139968]; +ld.shared.v2.f32 {f1040, f1041}, [r10+145800]; +ld.shared.v2.f32 {f1044, f1045}, [r10+151632]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2809, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0fBF5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; 
+add.f32 f2808, f941, f2809; +mul.f32 f1058, f2809, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0fBF5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2807, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2806, f953, f2807; +mul.f32 f1074, f2807, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0fBF5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2805, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2804, f965, f2805; +mul.f32 f1090, f2805, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0fBF5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2802, f1072, 0f3F441B7D; +mul.f32 f2803, f1078, 0f3F248DBB; +sub.f32 f1098, f2802, f2803; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099; +mul.f32 f2800, f1088, 0f3E31D0D4; +mul.f32 f2801, f1094, 0f3F7C1C5C; +sub.f32 f1103, f2800, f2801; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104; +mul.f32 f2798, f1073, 0f3E31D0D4; +mul.f32 f2799, f1079, 0f3F7C1C5C; +sub.f32 f1108, f2798, f2799; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109; +mul.f32 f1112, f1095, 0f3EAF1D44; +mul.f32 f2797, f1089, 0fBF708FB2; +sub.f32 f1113, f2797, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 
f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2796, f2806, f2804; +sub.f32 f1122, f2806, f2804; +mul.f32 f1123, f1122, 0fBF5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2795, f2808, f2796; +mul.f32 f1126, f2796, 0f3F000000; +sub.f32 f1127, f2808, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0fBF5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2794, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0fBF5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2793, f1062, f2794; +mul.f32 f1142, f2794, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0fBF5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2792, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0fBF5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2791, f1063, f2792; +mul.f32 f1158, f2792, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0fBF5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2790, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0fBF5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2789, f945, f2790; +mul.f32 f1174, f2790, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0fBF5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, 
f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2788, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0fBF5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2787, f957, f2788; +mul.f32 f1190, f2788, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0fBF5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2786, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0fBF5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2785, f969, f2786; +mul.f32 f1206, f2786, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0fBF5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2783, f1188, 0f3F441B7D; +mul.f32 f2784, f1194, 0f3F248DBB; +sub.f32 f1214, f2783, f2784; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0f3F248DBB, f1215; +mul.f32 f2781, f1204, 0f3E31D0D4; +mul.f32 f2782, f1210, 0f3F7C1C5C; +sub.f32 f1219, f2781, f2782; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0f3F7C1C5C, f1220; +mul.f32 f2779, f1189, 0f3E31D0D4; +mul.f32 f2780, f1195, 0f3F7C1C5C; +sub.f32 f1224, f2779, f2780; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0f3F7C1C5C, f1225; +mul.f32 f2777, f1205, 0fBF708FB2; +mul.f32 f2778, f1211, 0f3EAF1D44; +sub.f32 f1229, f2777, f2778; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0f3EAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2776, f2787, f2785; +sub.f32 f1238, f2787, f2785; +mul.f32 f1239, f1238, 0fBF5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2775, f2789, f2776; +mul.f32 f1242, f2776, 0f3F000000; +sub.f32 f1243, 
f2789, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0fBF5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f2774, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0fBF5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f2773, f1178, f2774; +mul.f32 f1258, f2774, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0fBF5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2772, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0fBF5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2771, f1179, f2772; +mul.f32 f1274, f2772, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0fBF5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2770, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0fBF5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2769, f949, f2770; +mul.f32 f1290, f2770, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0fBF5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2768, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0fBF5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2767, f961, f2768; +mul.f32 f1306, f2768, 0f3F000000; +sub.f32 f1307, f961, f1306; 
+sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0fBF5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f2766, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0fBF5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f2765, f973, f2766; +mul.f32 f1322, f2766, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0fBF5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0f3F248DBB; +mul.f32 f2764, f1304, 0f3F441B7D; +sub.f32 f1330, f2764, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0f3F248DBB, f1331; +mul.f32 f2762, f1320, 0f3E31D0D4; +mul.f32 f2763, f1326, 0f3F7C1C5C; +sub.f32 f1335, f2762, f2763; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0f3F7C1C5C, f1336; +mul.f32 f2760, f1305, 0f3E31D0D4; +mul.f32 f2761, f1311, 0f3F7C1C5C; +sub.f32 f1340, f2760, f2761; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0f3F7C1C5C, f1341; +mul.f32 f2758, f1321, 0fBF708FB2; +mul.f32 f2759, f1327, 0f3EAF1D44; +sub.f32 f1345, f2758, f2759; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0f3EAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2757, f2767, f2765; +sub.f32 f1354, f2767, f2765; +mul.f32 f1355, f1354, 0fBF5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2756, f2769, f2757; +mul.f32 f1358, f2757, 0f3F000000; +sub.f32 f1359, f2769, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0fBF5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2755, f1332, f1337; 
+sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0fBF5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2754, f1294, f2755; +mul.f32 f1374, f2755, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0fBF5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2753, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0fBF5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2752, f1295, f2753; +mul.f32 f1390, f2753, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0fBF5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2773, 0f3E6C2691; +mul.f32 f2751, f1249, 0f3F791978; +sub.f32 f1398, f2751, f1397; +mul.f32 f1399, f2773, 0f3F791978; +fma.rn.f32 f1400, f1249, 0f3E6C2691, f1399; +mul.f32 f1402, f2754, 0f3EE5C902; +mul.f32 f2750, f1365, 0f3F64C51C; +sub.f32 f1403, f2750, f1402; +mul.f32 f1404, f2754, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0f3EE5C902, f1404; +mul.f32 f1407, f2771, 0f3EE5C902; +mul.f32 f2749, f1265, 0f3F64C51C; +sub.f32 f1408, f2749, f1407; +mul.f32 f1409, f2771, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0f3EE5C902, f1409; +mul.f32 f2747, f1381, 0f3F18DF63; +mul.f32 f2748, f2752, 0f3F4D57F2; +sub.f32 f1413, f2747, f2748; +mul.f32 f1414, f2752, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0f3F4D57F2, f1414; +mul.f32 f2745, f1240, 0f3F441B7D; +mul.f32 f2746, f1246, 0f3F248DBB; +sub.f32 f1418, f2745, f2746; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0f3F248DBB, f1419; +mul.f32 f2743, f1356, 0f3E31D0D4; +mul.f32 f2744, f1362, 0f3F7C1C5C; +sub.f32 f1423, f2743, f2744; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0f3F7C1C5C, f1424; +mul.f32 f2741, f1256, 0f3F18DF63; +mul.f32 f2742, f1262, 0f3F4D57F2; 
+sub.f32 f1428, f2741, f2742; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0f3F4D57F2, f1429; +mul.f32 f1432, f1378, 0f3F753ECD; +mul.f32 f2740, f1372, 0fBE92D7E0; +sub.f32 f1433, f2740, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0f3F753ECD, f1434; +mul.f32 f1437, f1278, 0f3F6B1036; +mul.f32 f2739, f1272, 0f3ECACAF8; +sub.f32 f1438, f2739, f1437; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0f3F6B1036, f1439; +mul.f32 f1442, f1394, 0f3F3A3529; +mul.f32 f2738, f1388, 0fBF2FAD88; +sub.f32 f1443, f2738, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0f3F3A3529, f1444; +mul.f32 f1447, f1247, 0f3F7C1C5C; +mul.f32 f2737, f1241, 0f3E31D0D4; +sub.f32 f1448, f2737, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0f3F7C1C5C, f1449; +mul.f32 f1452, f1363, 0f3EAF1D44; +mul.f32 f2736, f1357, 0fBF708FB2; +sub.f32 f1453, f2736, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0f3EAF1D44, f1454; +mul.f32 f1457, f1263, 0f3F7F9120; +mul.f32 f2735, f1257, 0fBD6E2946; +sub.f32 f1458, f2735, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0f3F7F9120, f1459; +mul.f32 f2733, f1373, 0fBF7E44DE; +mul.f32 f2734, f1379, 0fBDEDC21F; +sub.f32 f1463, f2733, f2734; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0fBDEDC21F, f1464; +mul.f32 f2731, f1273, 0fBE92D7E0; +mul.f32 f2732, f1279, 0f3F753ECD; +sub.f32 f1468, f2731, f2732; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0f3F753ECD, f1469; +mul.f32 f2729, f1389, 0fBF55E287; +mul.f32 f2730, f1395, 0fBF0CAC9F; +sub.f32 f1473, f2729, f2730; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0fBF0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2728, f2775, f2756; +sub.f32 f1480, f2775, f2756; +mul.f32 f1481, f1480, 0fBF5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, 
f2728, 0f3F000000; +sub.f32 f1485, f2795, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0fBF5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2727, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0fBF5DB3D7; +add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2726, f2793, f2727; +mul.f32 f1500, f2727, 0f3F000000; +sub.f32 f1501, f2793, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0fBF5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2725, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0fBF5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, f1511, f1513; +add.f32 f2724, f2791, f2725; +mul.f32 f1516, f2725, 0f3F000000; +sub.f32 f1517, f2791, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0fBF5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2723, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 f1529, f1528, 0fBF5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2722, f1130, f2723; +mul.f32 f1532, f2723, 0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0fBF5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2721, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0fBF5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2720, f1146, f2721; +mul.f32 f1548, 
f2721, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0fBF5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2719, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0fBF5DB3D7; +add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2718, f1162, f2719; +mul.f32 f1564, f2719, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0fBF5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2717, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0fBF5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, f1575, f1577; +add.f32 f2716, f1131, f2717; +mul.f32 f1580, f2717, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0fBF5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2715, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 f1593, f1592, 0fBF5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2714, f1147, f2715; +mul.f32 f1596, f2715, 0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0fBF5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2713, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0fBF5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2712, f1163, f2713; +mul.f32 f1612, 
f2713, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0fBF5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1622, f2726, f1619; +mul.f32 f1624, f1618, f2726; +mul.f32 f2710, f1618, f1618; +mul.f32 f2711, f1619, f1619; +sub.f32 f1627, f2710, f2711; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1630, f2724, f1629; +mul.f32 f1632, f1627, f2724; +mul.f32 f1634, f1619, f1629; +mul.f32 f2709, f1618, f1627; +sub.f32 f1635, f2709, f1634; +mul.f32 f2708, f1507, f1629; +mul.f32 f1636, f1618, f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1638, f2722, f1637; +mul.f32 f1640, f1635, f2722; +mul.f32 f2706, f1618, f1635; +mul.f32 f2707, f1619, f1637; +sub.f32 f1643, f2706, f2707; +mul.f32 f2705, f1523, f1637; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1646, f2720, f1645; +mul.f32 f1648, f1643, f2720; +mul.f32 f1650, f1619, f1645; +mul.f32 f2704, f1618, f1643; +sub.f32 f1651, f2704, f1650; +mul.f32 f2703, f1539, f1645; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1654, f2718, f1653; +mul.f32 f1656, f1651, f2718; +mul.f32 f1658, f1619, f1653; +mul.f32 f2702, f1618, f1651; +sub.f32 f1659, f2702, f1658; +mul.f32 f2701, f1555, f1653; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1662, f2716, f1661; +mul.f32 f1664, f1659, f2716; +mul.f32 f2699, f1618, f1659; +mul.f32 f2700, f1619, f1661; +sub.f32 f1667, f2699, f2700; +mul.f32 f2698, f1571, f1661; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1670, 
f2714, f1669; +mul.f32 f1672, f1667, f2714; +mul.f32 f1674, f1619, f1669; +mul.f32 f2697, f1618, f1667; +sub.f32 f1675, f2697, f1674; +mul.f32 f2696, f1587, f1669; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1678, f2712, f1677; +mul.f32 f1680, f1675, f2712; +mul.f32 f1682, f1619, f1677; +mul.f32 f2695, f1618, f1675; +sub.f32 f1683, f2695, f1682; +mul.f32 f2694, f1603, f1677; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1686, f1488, f1685; +mul.f32 f1688, f1683, f1488; +mul.f32 f2692, f1618, f1683; +mul.f32 f2693, f1619, f1685; +sub.f32 f1691, f2692, f2693; +mul.f32 f2691, f1482, f1685; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1694, f1504, f1693; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2690, f1618, f1691; +sub.f32 f1699, f2690, f1698; +mul.f32 f2689, f1498, f1693; +mul.f32 f1700, f1618, f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1702, f1520, f1701; +mul.f32 f1704, f1699, f1520; +mul.f32 f2687, f1618, f1699; +mul.f32 f2688, f1619, f1701; +sub.f32 f1707, f2687, f2688; +mul.f32 f2686, f1514, f1701; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1710, f1536, f1709; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2685, f1618, f1707; +sub.f32 f1715, f2685, f1714; +mul.f32 f2684, f1530, f1709; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1718, f1552, f1717; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2683, f1618, f1715; +sub.f32 f1723, f2683, f1722; +mul.f32 f2682, f1546, f1717; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1726, f1568, f1725; +mul.f32 f1728, f1723, f1568; +mul.f32 f2680, f1618, f1723; +mul.f32 f2681, f1619, f1725; +sub.f32 f1731, f2680, f2681; +mul.f32 f2679, f1562, f1725; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1734, 
f1584, f1733; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2678, f1618, f1731; +sub.f32 f1739, f2678, f1738; +mul.f32 f2677, f1578, f1733; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1742, f1600, f1741; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2676, f1618, f1739; +sub.f32 f1747, f2676, f1746; +mul.f32 f2675, f1594, f1741; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1750, f1616, f1749; +mul.f32 f1752, f1747, f1616; +mul.f32 f2673, f1618, f1747; +mul.f32 f2674, f1619, f1749; +sub.f32 f1755, f2673, f2674; +mul.f32 f2672, f1610, f1749; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1758, f1489, f1757; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2671, f1618, f1755; +sub.f32 f1763, f2671, f1762; +mul.f32 f2670, f1483, f1757; +mul.f32 f1764, f1618, f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1766, f1505, f1765; +mul.f32 f1768, f1763, f1505; +mul.f32 f2668, f1618, f1763; +mul.f32 f2669, f1619, f1765; +sub.f32 f1771, f2668, f2669; +mul.f32 f2667, f1499, f1765; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1774, f1521, f1773; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2666, f1618, f1771; +sub.f32 f1779, f2666, f1778; +mul.f32 f2665, f1515, f1773; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1782, f1537, f1781; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2664, f1618, f1779; +sub.f32 f1787, f2664, f1786; +mul.f32 f2663, f1531, f1781; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1790, f1553, f1789; +mul.f32 f1792, f1787, f1553; +mul.f32 f2661, f1618, f1787; +mul.f32 f2662, f1619, f1789; +sub.f32 f1795, f2661, f2662; +mul.f32 f2660, f1547, f1789; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1798, 
f1569, f1797; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2659, f1618, f1795; +sub.f32 f1803, f2659, f1802; +mul.f32 f2658, f1563, f1797; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1806, f1585, f1805; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2657, f1618, f1803; +sub.f32 f1811, f2657, f1810; +mul.f32 f2656, f1579, f1805; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1814, f1601, f1813; +mul.f32 f1816, f1811, f1601; +mul.f32 f2654, f1618, f1811; +mul.f32 f2655, f1619, f1813; +sub.f32 f1819, f2654, f2655; +mul.f32 f2653, f1595, f1813; +mul.f32 f1820, f1618, f1813; +mul.f32 f2652, f1491, f1619; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1617, f1821; +mul.f32 f1823, f1611, f1821; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 5832, r19; +add.f32 f1825, f2795, f2728; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1618, f1491, f1622; +sub.f32 f1828, f1624, f2652; +st.shared.v2.f32 [r20+216], {f1827, f1828}; +fma.rn.f32 f1829, f1627, f1507, f1630; +sub.f32 f1830, f1632, f2708; +st.shared.v2.f32 [r20+432], {f1829, f1830}; +fma.rn.f32 f1831, f1635, f1523, f1638; +sub.f32 f1832, f1640, f2705; +st.shared.v2.f32 [r20+648], {f1831, f1832}; +fma.rn.f32 f1833, f1643, f1539, f1646; +sub.f32 f1834, f1648, f2703; +st.shared.v2.f32 [r20+864], {f1833, f1834}; +fma.rn.f32 f1835, f1651, f1555, f1654; +sub.f32 f1836, f1656, f2701; +st.shared.v2.f32 [r20+1080], {f1835, f1836}; +sub.f32 f1837, f1664, f2698; +fma.rn.f32 f1838, f1659, f1571, f1662; +st.shared.v2.f32 [r20+1296], {f1838, f1837}; +fma.rn.f32 f1839, f1667, f1587, f1670; +sub.f32 f1840, f1672, f2696; +st.shared.v2.f32 [r20+1512], {f1839, f1840}; +sub.f32 f1841, f1680, f2694; +fma.rn.f32 f1842, f1675, f1603, f1678; +st.shared.v2.f32 [r20+1728], {f1842, f1841}; +fma.rn.f32 f1843, f1683, 
f1482, f1686; +sub.f32 f1844, f1688, f2691; +st.shared.v2.f32 [r20+1944], {f1843, f1844}; +fma.rn.f32 f1845, f1691, f1498, f1694; +sub.f32 f1846, f1696, f2689; +st.shared.v2.f32 [r20+2160], {f1845, f1846}; +fma.rn.f32 f1847, f1699, f1514, f1702; +sub.f32 f1848, f1704, f2686; +st.shared.v2.f32 [r20+2376], {f1847, f1848}; +fma.rn.f32 f1849, f1707, f1530, f1710; +sub.f32 f1850, f1712, f2684; +st.shared.v2.f32 [r20+2592], {f1849, f1850}; +fma.rn.f32 f1851, f1715, f1546, f1718; +sub.f32 f1852, f1720, f2682; +st.shared.v2.f32 [r20+2808], {f1851, f1852}; +fma.rn.f32 f1853, f1723, f1562, f1726; +sub.f32 f1854, f1728, f2679; +st.shared.v2.f32 [r20+3024], {f1853, f1854}; +fma.rn.f32 f1855, f1731, f1578, f1734; +sub.f32 f1856, f1736, f2677; +st.shared.v2.f32 [r20+3240], {f1855, f1856}; +fma.rn.f32 f1857, f1739, f1594, f1742; +sub.f32 f1858, f1744, f2675; +st.shared.v2.f32 [r20+3456], {f1857, f1858}; +fma.rn.f32 f1859, f1747, f1610, f1750; +sub.f32 f1860, f1752, f2672; +st.shared.v2.f32 [r20+3672], {f1859, f1860}; +fma.rn.f32 f1861, f1755, f1483, f1758; +sub.f32 f1862, f1760, f2670; +st.shared.v2.f32 [r20+3888], {f1861, f1862}; +fma.rn.f32 f1863, f1763, f1499, f1766; +sub.f32 f1864, f1768, f2667; +st.shared.v2.f32 [r20+4104], {f1863, f1864}; +fma.rn.f32 f1865, f1771, f1515, f1774; +sub.f32 f1866, f1776, f2665; +st.shared.v2.f32 [r20+4320], {f1865, f1866}; +fma.rn.f32 f1867, f1779, f1531, f1782; +sub.f32 f1868, f1784, f2663; +st.shared.v2.f32 [r20+4536], {f1867, f1868}; +fma.rn.f32 f1869, f1787, f1547, f1790; +sub.f32 f1870, f1792, f2660; +st.shared.v2.f32 [r20+4752], {f1869, f1870}; +fma.rn.f32 f1871, f1795, f1563, f1798; +sub.f32 f1872, f1800, f2658; +st.shared.v2.f32 [r20+4968], {f1871, f1872}; +fma.rn.f32 f1873, f1803, f1579, f1806; +sub.f32 f1874, f1808, f2656; +st.shared.v2.f32 [r20+5184], {f1873, f1874}; +fma.rn.f32 f1875, f1811, f1595, f1814; +sub.f32 f1876, f1816, f2653; +st.shared.v2.f32 [r20+5400], {f1875, f1876}; +fma.rn.f32 f1877, f1819, f1611, f1822; +sub.f32 
f1878, f1824, f1823; +st.shared.v2.f32 [r20+5616], {f1877, f1878}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+5832]; +ld.shared.v2.f32 {f1887, f1888}, [r10+11664]; +ld.shared.v2.f32 {f1891, f1892}, [r10+17496]; +ld.shared.v2.f32 {f1895, f1896}, [r10+23328]; +ld.shared.v2.f32 {f1899, f1900}, [r10+29160]; +ld.shared.v2.f32 {f1903, f1904}, [r10+34992]; +ld.shared.v2.f32 {f1907, f1908}, [r10+40824]; +ld.shared.v2.f32 {f1911, f1912}, [r10+46656]; +ld.shared.v2.f32 {f1915, f1916}, [r10+52488]; +ld.shared.v2.f32 {f1919, f1920}, [r10+58320]; +ld.shared.v2.f32 {f1923, f1924}, [r10+64152]; +ld.shared.v2.f32 {f1927, f1928}, [r10+69984]; +ld.shared.v2.f32 {f1931, f1932}, [r10+75816]; +ld.shared.v2.f32 {f1935, f1936}, [r10+81648]; +ld.shared.v2.f32 {f1939, f1940}, [r10+87480]; +ld.shared.v2.f32 {f1943, f1944}, [r10+93312]; +ld.shared.v2.f32 {f1947, f1948}, [r10+99144]; +ld.shared.v2.f32 {f1951, f1952}, [r10+104976]; +ld.shared.v2.f32 {f1955, f1956}, [r10+110808]; +ld.shared.v2.f32 {f1959, f1960}, [r10+116640]; +ld.shared.v2.f32 {f1963, f1964}, [r10+122472]; +ld.shared.v2.f32 {f1967, f1968}, [r10+128304]; +ld.shared.v2.f32 {f1971, f1972}, [r10+134136]; +ld.shared.v2.f32 {f1975, f1976}, [r10+139968]; +ld.shared.v2.f32 {f1979, f1980}, [r10+145800]; +ld.shared.v2.f32 {f1983, f1984}, [r10+151632]; +add.f32 f1987, f1915, f1951; +add.f32 f1988, f1879, f1987; +mul.f32 f1991, f1987, 0f3F000000; +sub.f32 f1992, f1879, f1991; +add.f32 f2651, f1916, f1952; +sub.f32 f1993, f1916, f1952; +mul.f32 f1994, f1993, 0fBF5DB3D7; +add.f32 f1995, f1994, f1992; +sub.f32 f1996, f1992, f1994; +add.f32 f2650, f1880, f2651; +mul.f32 f1997, f2651, 0f3F000000; +sub.f32 f1998, f1880, f1997; +sub.f32 f1999, f1915, f1951; +mul.f32 f2000, f1999, 0fBF5DB3D7; +sub.f32 f2001, f1998, f2000; +add.f32 f2002, f2000, f1998; +add.f32 f2003, f1927, f1963; +add.f32 f2004, f1891, f2003; +mul.f32 f2007, f2003, 0f3F000000; +sub.f32 f2008, f1891, f2007; +add.f32 f2649, 
f1928, f1964; +sub.f32 f2009, f1928, f1964; +mul.f32 f2010, f2009, 0fBF5DB3D7; +add.f32 f2011, f2010, f2008; +sub.f32 f2012, f2008, f2010; +add.f32 f2648, f1892, f2649; +mul.f32 f2013, f2649, 0f3F000000; +sub.f32 f2014, f1892, f2013; +sub.f32 f2015, f1927, f1963; +mul.f32 f2016, f2015, 0fBF5DB3D7; +sub.f32 f2017, f2014, f2016; +add.f32 f2018, f2016, f2014; +add.f32 f2019, f1939, f1975; +add.f32 f2020, f1903, f2019; +mul.f32 f2023, f2019, 0f3F000000; +sub.f32 f2024, f1903, f2023; +add.f32 f2647, f1940, f1976; +sub.f32 f2025, f1940, f1976; +mul.f32 f2026, f2025, 0fBF5DB3D7; +add.f32 f2027, f2026, f2024; +sub.f32 f2028, f2024, f2026; +add.f32 f2646, f1904, f2647; +mul.f32 f2029, f2647, 0f3F000000; +sub.f32 f2030, f1904, f2029; +sub.f32 f2031, f1939, f1975; +mul.f32 f2032, f2031, 0fBF5DB3D7; +sub.f32 f2033, f2030, f2032; +add.f32 f2034, f2032, f2030; +mul.f32 f2036, f2017, 0f3F248DBB; +mul.f32 f2645, f2011, 0f3F441B7D; +sub.f32 f2037, f2645, f2036; +mul.f32 f2038, f2017, 0f3F441B7D; +fma.rn.f32 f2039, f2011, 0f3F248DBB, f2038; +mul.f32 f2041, f2033, 0f3F7C1C5C; +mul.f32 f2644, f2027, 0f3E31D0D4; +sub.f32 f2042, f2644, f2041; +mul.f32 f2043, f2033, 0f3E31D0D4; +fma.rn.f32 f2044, f2027, 0f3F7C1C5C, f2043; +mul.f32 f2046, f2018, 0f3F7C1C5C; +mul.f32 f2643, f2012, 0f3E31D0D4; +sub.f32 f2047, f2643, f2046; +mul.f32 f2048, f2018, 0f3E31D0D4; +fma.rn.f32 f2049, f2012, 0f3F7C1C5C, f2048; +mul.f32 f2051, f2034, 0f3EAF1D44; +mul.f32 f2642, f2028, 0fBF708FB2; +sub.f32 f2052, f2642, f2051; +mul.f32 f2053, f2034, 0fBF708FB2; +fma.rn.f32 f2054, f2028, 0f3EAF1D44, f2053; +add.f32 f2055, f2004, f2020; +add.f32 f2056, f1988, f2055; +mul.f32 f2059, f2055, 0f3F000000; +sub.f32 f2060, f1988, f2059; +add.f32 f2641, f2648, f2646; +sub.f32 f2061, f2648, f2646; +mul.f32 f2062, f2061, 0fBF5DB3D7; +add.f32 f2063, f2062, f2060; +sub.f32 f2064, f2060, f2062; +add.f32 f2640, f2650, f2641; +mul.f32 f2065, f2641, 0f3F000000; +sub.f32 f2066, f2650, f2065; +sub.f32 f2067, f2004, f2020; +mul.f32 f2068, 
f2067, 0fBF5DB3D7; +sub.f32 f2069, f2066, f2068; +add.f32 f2070, f2068, f2066; +add.f32 f2071, f2037, f2042; +add.f32 f2072, f1995, f2071; +mul.f32 f2075, f2071, 0f3F000000; +sub.f32 f2076, f1995, f2075; +add.f32 f2639, f2039, f2044; +sub.f32 f2077, f2039, f2044; +mul.f32 f2078, f2077, 0fBF5DB3D7; +add.f32 f2079, f2078, f2076; +sub.f32 f2080, f2076, f2078; +add.f32 f2638, f2001, f2639; +mul.f32 f2081, f2639, 0f3F000000; +sub.f32 f2082, f2001, f2081; +sub.f32 f2083, f2037, f2042; +mul.f32 f2084, f2083, 0fBF5DB3D7; +sub.f32 f2085, f2082, f2084; +add.f32 f2086, f2084, f2082; +add.f32 f2087, f2047, f2052; +add.f32 f2088, f1996, f2087; +mul.f32 f2091, f2087, 0f3F000000; +sub.f32 f2092, f1996, f2091; +add.f32 f2637, f2049, f2054; +sub.f32 f2093, f2049, f2054; +mul.f32 f2094, f2093, 0fBF5DB3D7; +add.f32 f2095, f2094, f2092; +sub.f32 f2096, f2092, f2094; +add.f32 f2636, f2002, f2637; +mul.f32 f2097, f2637, 0f3F000000; +sub.f32 f2098, f2002, f2097; +sub.f32 f2099, f2047, f2052; +mul.f32 f2100, f2099, 0fBF5DB3D7; +sub.f32 f2101, f2098, f2100; +add.f32 f2102, f2100, f2098; +add.f32 f2103, f1919, f1955; +add.f32 f2104, f1883, f2103; +mul.f32 f2107, f2103, 0f3F000000; +sub.f32 f2108, f1883, f2107; +add.f32 f2635, f1920, f1956; +sub.f32 f2109, f1920, f1956; +mul.f32 f2110, f2109, 0fBF5DB3D7; +add.f32 f2111, f2110, f2108; +sub.f32 f2112, f2108, f2110; +add.f32 f2634, f1884, f2635; +mul.f32 f2113, f2635, 0f3F000000; +sub.f32 f2114, f1884, f2113; +sub.f32 f2115, f1919, f1955; +mul.f32 f2116, f2115, 0fBF5DB3D7; +sub.f32 f2117, f2114, f2116; +add.f32 f2118, f2116, f2114; +add.f32 f2119, f1931, f1967; +add.f32 f2120, f1895, f2119; +mul.f32 f2123, f2119, 0f3F000000; +sub.f32 f2124, f1895, f2123; +add.f32 f2633, f1932, f1968; +sub.f32 f2125, f1932, f1968; +mul.f32 f2126, f2125, 0fBF5DB3D7; +add.f32 f2127, f2126, f2124; +sub.f32 f2128, f2124, f2126; +add.f32 f2632, f1896, f2633; +mul.f32 f2129, f2633, 0f3F000000; +sub.f32 f2130, f1896, f2129; +sub.f32 f2131, f1931, f1967; +mul.f32 f2132, 
f2131, 0fBF5DB3D7; +sub.f32 f2133, f2130, f2132; +add.f32 f2134, f2132, f2130; +add.f32 f2135, f1943, f1979; +add.f32 f2136, f1907, f2135; +mul.f32 f2139, f2135, 0f3F000000; +sub.f32 f2140, f1907, f2139; +add.f32 f2631, f1944, f1980; +sub.f32 f2141, f1944, f1980; +mul.f32 f2142, f2141, 0fBF5DB3D7; +add.f32 f2143, f2142, f2140; +sub.f32 f2144, f2140, f2142; +add.f32 f2630, f1908, f2631; +mul.f32 f2145, f2631, 0f3F000000; +sub.f32 f2146, f1908, f2145; +sub.f32 f2147, f1943, f1979; +mul.f32 f2148, f2147, 0fBF5DB3D7; +sub.f32 f2149, f2146, f2148; +add.f32 f2150, f2148, f2146; +mul.f32 f2152, f2133, 0f3F248DBB; +mul.f32 f2629, f2127, 0f3F441B7D; +sub.f32 f2153, f2629, f2152; +mul.f32 f2154, f2133, 0f3F441B7D; +fma.rn.f32 f2155, f2127, 0f3F248DBB, f2154; +mul.f32 f2157, f2149, 0f3F7C1C5C; +mul.f32 f2628, f2143, 0f3E31D0D4; +sub.f32 f2158, f2628, f2157; +mul.f32 f2159, f2149, 0f3E31D0D4; +fma.rn.f32 f2160, f2143, 0f3F7C1C5C, f2159; +mul.f32 f2162, f2134, 0f3F7C1C5C; +mul.f32 f2627, f2128, 0f3E31D0D4; +sub.f32 f2163, f2627, f2162; +mul.f32 f2164, f2134, 0f3E31D0D4; +fma.rn.f32 f2165, f2128, 0f3F7C1C5C, f2164; +mul.f32 f2167, f2150, 0f3EAF1D44; +mul.f32 f2626, f2144, 0fBF708FB2; +sub.f32 f2168, f2626, f2167; +mul.f32 f2169, f2150, 0fBF708FB2; +fma.rn.f32 f2170, f2144, 0f3EAF1D44, f2169; +add.f32 f2171, f2120, f2136; +add.f32 f2172, f2104, f2171; +mul.f32 f2175, f2171, 0f3F000000; +sub.f32 f2176, f2104, f2175; +add.f32 f2625, f2632, f2630; +sub.f32 f2177, f2632, f2630; +mul.f32 f2178, f2177, 0fBF5DB3D7; +add.f32 f2179, f2178, f2176; +sub.f32 f2180, f2176, f2178; +add.f32 f2624, f2634, f2625; +mul.f32 f2181, f2625, 0f3F000000; +sub.f32 f2182, f2634, f2181; +sub.f32 f2183, f2120, f2136; +mul.f32 f2184, f2183, 0fBF5DB3D7; +sub.f32 f2185, f2182, f2184; +add.f32 f2186, f2184, f2182; +add.f32 f2187, f2153, f2158; +add.f32 f2188, f2111, f2187; +mul.f32 f2191, f2187, 0f3F000000; +sub.f32 f2192, f2111, f2191; +add.f32 f2623, f2155, f2160; +sub.f32 f2193, f2155, f2160; +mul.f32 f2194, 
f2193, 0fBF5DB3D7; +add.f32 f2195, f2194, f2192; +sub.f32 f2196, f2192, f2194; +add.f32 f2622, f2117, f2623; +mul.f32 f2197, f2623, 0f3F000000; +sub.f32 f2198, f2117, f2197; +sub.f32 f2199, f2153, f2158; +mul.f32 f2200, f2199, 0fBF5DB3D7; +sub.f32 f2201, f2198, f2200; +add.f32 f2202, f2200, f2198; +add.f32 f2203, f2163, f2168; +add.f32 f2204, f2112, f2203; +mul.f32 f2207, f2203, 0f3F000000; +sub.f32 f2208, f2112, f2207; +add.f32 f2621, f2165, f2170; +sub.f32 f2209, f2165, f2170; +mul.f32 f2210, f2209, 0fBF5DB3D7; +add.f32 f2211, f2210, f2208; +sub.f32 f2212, f2208, f2210; +add.f32 f2620, f2118, f2621; +mul.f32 f2213, f2621, 0f3F000000; +sub.f32 f2214, f2118, f2213; +sub.f32 f2215, f2163, f2168; +mul.f32 f2216, f2215, 0fBF5DB3D7; +sub.f32 f2217, f2214, f2216; +add.f32 f2218, f2216, f2214; +add.f32 f2219, f1923, f1959; +add.f32 f2220, f1887, f2219; +mul.f32 f2223, f2219, 0f3F000000; +sub.f32 f2224, f1887, f2223; +add.f32 f2619, f1924, f1960; +sub.f32 f2225, f1924, f1960; +mul.f32 f2226, f2225, 0fBF5DB3D7; +add.f32 f2227, f2226, f2224; +sub.f32 f2228, f2224, f2226; +add.f32 f2618, f1888, f2619; +mul.f32 f2229, f2619, 0f3F000000; +sub.f32 f2230, f1888, f2229; +sub.f32 f2231, f1923, f1959; +mul.f32 f2232, f2231, 0fBF5DB3D7; +sub.f32 f2233, f2230, f2232; +add.f32 f2234, f2232, f2230; +add.f32 f2235, f1935, f1971; +add.f32 f2236, f1899, f2235; +mul.f32 f2239, f2235, 0f3F000000; +sub.f32 f2240, f1899, f2239; +add.f32 f2617, f1936, f1972; +sub.f32 f2241, f1936, f1972; +mul.f32 f2242, f2241, 0fBF5DB3D7; +add.f32 f2243, f2242, f2240; +sub.f32 f2244, f2240, f2242; +add.f32 f2616, f1900, f2617; +mul.f32 f2245, f2617, 0f3F000000; +sub.f32 f2246, f1900, f2245; +sub.f32 f2247, f1935, f1971; +mul.f32 f2248, f2247, 0fBF5DB3D7; +sub.f32 f2249, f2246, f2248; +add.f32 f2250, f2248, f2246; +add.f32 f2251, f1947, f1983; +add.f32 f2252, f1911, f2251; +mul.f32 f2255, f2251, 0f3F000000; +sub.f32 f2256, f1911, f2255; +add.f32 f2615, f1948, f1984; +sub.f32 f2257, f1948, f1984; +mul.f32 f2258, 
f2257, 0fBF5DB3D7; +add.f32 f2259, f2258, f2256; +sub.f32 f2260, f2256, f2258; +add.f32 f2614, f1912, f2615; +mul.f32 f2261, f2615, 0f3F000000; +sub.f32 f2262, f1912, f2261; +sub.f32 f2263, f1947, f1983; +mul.f32 f2264, f2263, 0fBF5DB3D7; +sub.f32 f2265, f2262, f2264; +add.f32 f2266, f2264, f2262; +mul.f32 f2268, f2249, 0f3F248DBB; +mul.f32 f2613, f2243, 0f3F441B7D; +sub.f32 f2269, f2613, f2268; +mul.f32 f2270, f2249, 0f3F441B7D; +fma.rn.f32 f2271, f2243, 0f3F248DBB, f2270; +mul.f32 f2273, f2265, 0f3F7C1C5C; +mul.f32 f2612, f2259, 0f3E31D0D4; +sub.f32 f2274, f2612, f2273; +mul.f32 f2275, f2265, 0f3E31D0D4; +fma.rn.f32 f2276, f2259, 0f3F7C1C5C, f2275; +mul.f32 f2278, f2250, 0f3F7C1C5C; +mul.f32 f2611, f2244, 0f3E31D0D4; +sub.f32 f2279, f2611, f2278; +mul.f32 f2280, f2250, 0f3E31D0D4; +fma.rn.f32 f2281, f2244, 0f3F7C1C5C, f2280; +mul.f32 f2283, f2266, 0f3EAF1D44; +mul.f32 f2610, f2260, 0fBF708FB2; +sub.f32 f2284, f2610, f2283; +mul.f32 f2285, f2266, 0fBF708FB2; +fma.rn.f32 f2286, f2260, 0f3EAF1D44, f2285; +add.f32 f2287, f2236, f2252; +add.f32 f2288, f2220, f2287; +mul.f32 f2291, f2287, 0f3F000000; +sub.f32 f2292, f2220, f2291; +add.f32 f2609, f2616, f2614; +sub.f32 f2293, f2616, f2614; +mul.f32 f2294, f2293, 0fBF5DB3D7; +add.f32 f2295, f2294, f2292; +sub.f32 f2296, f2292, f2294; +add.f32 f2608, f2618, f2609; +mul.f32 f2297, f2609, 0f3F000000; +sub.f32 f2298, f2618, f2297; +sub.f32 f2299, f2236, f2252; +mul.f32 f2300, f2299, 0fBF5DB3D7; +sub.f32 f2301, f2298, f2300; +add.f32 f2302, f2300, f2298; +add.f32 f2303, f2269, f2274; +add.f32 f2304, f2227, f2303; +mul.f32 f2307, f2303, 0f3F000000; +sub.f32 f2308, f2227, f2307; +add.f32 f2607, f2271, f2276; +sub.f32 f2309, f2271, f2276; +mul.f32 f2310, f2309, 0fBF5DB3D7; +add.f32 f2311, f2310, f2308; +sub.f32 f2312, f2308, f2310; +add.f32 f2606, f2233, f2607; +mul.f32 f2313, f2607, 0f3F000000; +sub.f32 f2314, f2233, f2313; +sub.f32 f2315, f2269, f2274; +mul.f32 f2316, f2315, 0fBF5DB3D7; +sub.f32 f2317, f2314, f2316; +add.f32 
f2318, f2316, f2314; +add.f32 f2319, f2279, f2284; +add.f32 f2320, f2228, f2319; +mul.f32 f2323, f2319, 0f3F000000; +sub.f32 f2324, f2228, f2323; +add.f32 f2605, f2281, f2286; +sub.f32 f2325, f2281, f2286; +mul.f32 f2326, f2325, 0fBF5DB3D7; +add.f32 f2327, f2326, f2324; +sub.f32 f2328, f2324, f2326; +add.f32 f2604, f2234, f2605; +mul.f32 f2329, f2605, 0f3F000000; +sub.f32 f2330, f2234, f2329; +sub.f32 f2331, f2279, f2284; +mul.f32 f2332, f2331, 0fBF5DB3D7; +sub.f32 f2333, f2330, f2332; +add.f32 f2334, f2332, f2330; +mul.f32 f2602, f2188, 0f3F791978; +mul.f32 f2603, f2622, 0f3E6C2691; +sub.f32 f2337, f2602, f2603; +mul.f32 f2338, f2622, 0f3F791978; +fma.rn.f32 f2339, f2188, 0f3E6C2691, f2338; +mul.f32 f2600, f2304, 0f3F64C51C; +mul.f32 f2601, f2606, 0f3EE5C902; +sub.f32 f2342, f2600, f2601; +mul.f32 f2343, f2606, 0f3F64C51C; +fma.rn.f32 f2344, f2304, 0f3EE5C902, f2343; +mul.f32 f2598, f2204, 0f3F64C51C; +mul.f32 f2599, f2620, 0f3EE5C902; +sub.f32 f2347, f2598, f2599; +mul.f32 f2348, f2620, 0f3F64C51C; +fma.rn.f32 f2349, f2204, 0f3EE5C902, f2348; +mul.f32 f2351, f2604, 0f3F4D57F2; +mul.f32 f2597, f2320, 0f3F18DF63; +sub.f32 f2352, f2597, f2351; +mul.f32 f2353, f2604, 0f3F18DF63; +fma.rn.f32 f2354, f2320, 0f3F4D57F2, f2353; +mul.f32 f2356, f2185, 0f3F248DBB; +mul.f32 f2596, f2179, 0f3F441B7D; +sub.f32 f2357, f2596, f2356; +mul.f32 f2358, f2185, 0f3F441B7D; +fma.rn.f32 f2359, f2179, 0f3F248DBB, f2358; +mul.f32 f2361, f2301, 0f3F7C1C5C; +mul.f32 f2595, f2295, 0f3E31D0D4; +sub.f32 f2362, f2595, f2361; +mul.f32 f2363, f2301, 0f3E31D0D4; +fma.rn.f32 f2364, f2295, 0f3F7C1C5C, f2363; +mul.f32 f2366, f2201, 0f3F4D57F2; +mul.f32 f2594, f2195, 0f3F18DF63; +sub.f32 f2367, f2594, f2366; +mul.f32 f2368, f2201, 0f3F18DF63; +fma.rn.f32 f2369, f2195, 0f3F4D57F2, f2368; +mul.f32 f2371, f2317, 0f3F753ECD; +mul.f32 f2593, f2311, 0fBE92D7E0; +sub.f32 f2372, f2593, f2371; +mul.f32 f2373, f2317, 0fBE92D7E0; +fma.rn.f32 f2374, f2311, 0f3F753ECD, f2373; +mul.f32 f2591, f2211, 0f3ECACAF8; 
+mul.f32 f2592, f2217, 0f3F6B1036; +sub.f32 f2377, f2591, f2592; +mul.f32 f2378, f2217, 0f3ECACAF8; +fma.rn.f32 f2379, f2211, 0f3F6B1036, f2378; +mul.f32 f2589, f2327, 0fBF2FAD88; +mul.f32 f2590, f2333, 0f3F3A3529; +sub.f32 f2382, f2589, f2590; +mul.f32 f2383, f2333, 0fBF2FAD88; +fma.rn.f32 f2384, f2327, 0f3F3A3529, f2383; +mul.f32 f2587, f2180, 0f3E31D0D4; +mul.f32 f2588, f2186, 0f3F7C1C5C; +sub.f32 f2387, f2587, f2588; +mul.f32 f2388, f2186, 0f3E31D0D4; +fma.rn.f32 f2389, f2180, 0f3F7C1C5C, f2388; +mul.f32 f2585, f2296, 0fBF708FB2; +mul.f32 f2586, f2302, 0f3EAF1D44; +sub.f32 f2392, f2585, f2586; +mul.f32 f2393, f2302, 0fBF708FB2; +fma.rn.f32 f2394, f2296, 0f3EAF1D44, f2393; +mul.f32 f2396, f2202, 0f3F7F9120; +mul.f32 f2584, f2196, 0fBD6E2946; +sub.f32 f2397, f2584, f2396; +mul.f32 f2398, f2202, 0fBD6E2946; +fma.rn.f32 f2399, f2196, 0f3F7F9120, f2398; +mul.f32 f2401, f2318, 0fBDEDC21F; +mul.f32 f2583, f2312, 0fBF7E44DE; +sub.f32 f2402, f2583, f2401; +mul.f32 f2403, f2318, 0fBF7E44DE; +fma.rn.f32 f2404, f2312, 0fBDEDC21F, f2403; +mul.f32 f2406, f2218, 0f3F753ECD; +mul.f32 f2582, f2212, 0fBE92D7E0; +sub.f32 f2407, f2582, f2406; +mul.f32 f2408, f2218, 0fBE92D7E0; +fma.rn.f32 f2409, f2212, 0f3F753ECD, f2408; +mul.f32 f2411, f2334, 0fBF0CAC9F; +mul.f32 f2581, f2328, 0fBF55E287; +sub.f32 f2412, f2581, f2411; +mul.f32 f2413, f2334, 0fBF55E287; +fma.rn.f32 f2414, f2328, 0fBF0CAC9F, f2413; +add.f32 f2415, f2172, f2288; +mul.f32 f2417, f2415, 0f3F000000; +sub.f32 f2418, f2056, f2417; +add.f32 f2580, f2624, f2608; +sub.f32 f2419, f2624, f2608; +mul.f32 f2420, f2419, 0fBF5DB3D7; +mul.f32 f2421, f2580, 0f3F000000; +sub.f32 f2422, f2640, f2421; +sub.f32 f2423, f2172, f2288; +mul.f32 f2424, f2423, 0fBF5DB3D7; +add.f32 f2425, f2337, f2342; +mul.f32 f2427, f2425, 0f3F000000; +sub.f32 f2428, f2072, f2427; +add.f32 f2579, f2339, f2344; +sub.f32 f2429, f2339, f2344; +mul.f32 f2430, f2429, 0fBF5DB3D7; +mul.f32 f2431, f2579, 0f3F000000; +sub.f32 f2432, f2638, f2431; +sub.f32 f2433, 
f2337, f2342; +mul.f32 f2434, f2433, 0fBF5DB3D7; +add.f32 f2435, f2347, f2352; +mul.f32 f2437, f2435, 0f3F000000; +sub.f32 f2438, f2088, f2437; +add.f32 f2578, f2349, f2354; +sub.f32 f2439, f2349, f2354; +mul.f32 f2440, f2439, 0fBF5DB3D7; +mul.f32 f2441, f2578, 0f3F000000; +sub.f32 f2442, f2636, f2441; +sub.f32 f2443, f2347, f2352; +mul.f32 f2444, f2443, 0fBF5DB3D7; +add.f32 f2445, f2357, f2362; +mul.f32 f2447, f2445, 0f3F000000; +sub.f32 f2448, f2063, f2447; +add.f32 f2577, f2359, f2364; +sub.f32 f2449, f2359, f2364; +mul.f32 f2450, f2449, 0fBF5DB3D7; +mul.f32 f2451, f2577, 0f3F000000; +sub.f32 f2452, f2069, f2451; +sub.f32 f2453, f2357, f2362; +mul.f32 f2454, f2453, 0fBF5DB3D7; +add.f32 f2455, f2367, f2372; +mul.f32 f2457, f2455, 0f3F000000; +sub.f32 f2458, f2079, f2457; +add.f32 f2576, f2369, f2374; +sub.f32 f2459, f2369, f2374; +mul.f32 f2460, f2459, 0fBF5DB3D7; +mul.f32 f2461, f2576, 0f3F000000; +sub.f32 f2462, f2085, f2461; +sub.f32 f2463, f2367, f2372; +mul.f32 f2464, f2463, 0fBF5DB3D7; +add.f32 f2465, f2377, f2382; +mul.f32 f2467, f2465, 0f3F000000; +sub.f32 f2468, f2095, f2467; +add.f32 f2575, f2379, f2384; +sub.f32 f2469, f2379, f2384; +mul.f32 f2470, f2469, 0fBF5DB3D7; +mul.f32 f2471, f2575, 0f3F000000; +sub.f32 f2472, f2101, f2471; +sub.f32 f2473, f2377, f2382; +mul.f32 f2474, f2473, 0fBF5DB3D7; +add.f32 f2475, f2387, f2392; +mul.f32 f2477, f2475, 0f3F000000; +sub.f32 f2478, f2064, f2477; +add.f32 f2574, f2389, f2394; +sub.f32 f2479, f2389, f2394; +mul.f32 f2480, f2479, 0fBF5DB3D7; +mul.f32 f2481, f2574, 0f3F000000; +sub.f32 f2482, f2070, f2481; +sub.f32 f2483, f2387, f2392; +mul.f32 f2484, f2483, 0fBF5DB3D7; +add.f32 f2485, f2397, f2402; +mul.f32 f2487, f2485, 0f3F000000; +sub.f32 f2488, f2080, f2487; +add.f32 f2573, f2399, f2404; +sub.f32 f2489, f2399, f2404; +mul.f32 f2490, f2489, 0fBF5DB3D7; +mul.f32 f2491, f2573, 0f3F000000; +sub.f32 f2492, f2086, f2491; +sub.f32 f2493, f2397, f2402; +mul.f32 f2494, f2493, 0fBF5DB3D7; +add.f32 f2495, f2407, f2412; 
+mul.f32 f2497, f2495, 0f3F000000; +sub.f32 f2498, f2096, f2497; +add.f32 f2572, f2409, f2414; +sub.f32 f2499, f2409, f2414; +mul.f32 f2500, f2499, 0fBF5DB3D7; +mul.f32 f2501, f2572, 0f3F000000; +sub.f32 f2502, f2102, f2501; +sub.f32 f2503, f2407, f2412; +mul.f32 f2504, f2503, 0fBF5DB3D7; +add.f32 %1, f2640, f2580; +add.f32 %0, f2056, f2415; +add.f32 %3, f2638, f2579; +add.f32 %2, f2072, f2425; +add.f32 %5, f2636, f2578; +add.f32 %4, f2088, f2435; +add.f32 %7, f2069, f2577; +add.f32 %6, f2063, f2445; +add.f32 %9, f2085, f2576; +add.f32 %8, f2079, f2455; +add.f32 %11, f2101, f2575; +add.f32 %10, f2095, f2465; +add.f32 %13, f2070, f2574; +add.f32 %12, f2064, f2475; +add.f32 %15, f2086, f2573; +add.f32 %14, f2080, f2485; +add.f32 %17, f2102, f2572; +add.f32 %16, f2096, f2495; +add.f32 %18, f2420, f2418; +sub.f32 %19, f2422, f2424; +sub.f32 %21, f2432, f2434; +add.f32 %20, f2430, f2428; +sub.f32 %23, f2442, f2444; +add.f32 %22, f2440, f2438; +add.f32 %24, f2450, f2448; +sub.f32 %25, f2452, f2454; +add.f32 %26, f2460, f2458; +sub.f32 %27, f2462, f2464; +add.f32 %28, f2470, f2468; +sub.f32 %29, f2472, f2474; +add.f32 %30, f2480, f2478; +sub.f32 %31, f2482, f2484; +sub.f32 %33, f2492, f2494; +add.f32 %32, f2490, f2488; +sub.f32 %35, f2502, f2504; +add.f32 %34, f2500, f2498; +add.f32 %37, f2424, f2422; +sub.f32 %36, f2418, f2420; +add.f32 %39, f2434, f2432; +sub.f32 %38, f2428, f2430; +add.f32 %41, f2444, f2442; +sub.f32 %40, f2438, f2440; +add.f32 %43, f2454, f2452; +sub.f32 %42, f2448, f2450; +add.f32 %45, f2464, f2462; +sub.f32 %44, f2458, f2460; +add.f32 %47, f2474, f2472; +sub.f32 %46, f2468, f2470; +add.f32 %49, f2484, f2482; +sub.f32 %48, f2478, f2480; +add.f32 %51, f2494, f2492; +sub.f32 %50, f2488, f2490; +add.f32 %53, f2504, f2502; +sub.f32 %52, f2498, f2500; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_19683), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); 
+}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1160, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2865>; +.reg .b32 r<24>; +.reg .b64 rd<15>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 78732, r23; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2856, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2855, %58, f2856; +mul.f32 f119, f2856, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2854, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2853, %64, f2854; +mul.f32 f135, f2854, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2852, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2851, %70, f2852; +mul.f32 f151, f2852, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2850, f133, 0f3F441B7D; +sub.f32 f159, f2850, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2848, f149, 0f3E31D0D4; +mul.f32 f2849, f155, 0f3F7C1C5C; +sub.f32 f164, f2848, f2849; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 
f2846, f134, 0f3E31D0D4; +mul.f32 f2847, f140, 0f3F7C1C5C; +sub.f32 f169, f2846, f2847; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2844, f150, 0fBF708FB2; +mul.f32 f2845, f156, 0f3EAF1D44; +sub.f32 f174, f2844, f2845; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2843, f2853, f2851; +sub.f32 f183, f2853, f2851; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2842, f2855, f2843; +mul.f32 f187, f2843, 0f3F000000; +sub.f32 f188, f2855, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2841, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2840, f123, f2841; +mul.f32 f203, f2841, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2839, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2838, f124, f2839; +mul.f32 f219, f2839, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2835, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2833, %113, 
f2835; +mul.f32 f235, f2835, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2830, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2828, %116, f2830; +mul.f32 f251, f2830, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2825, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2823, %119, f2825; +mul.f32 f267, f2825, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2822, f249, 0f3F441B7D; +sub.f32 f275, f2822, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2821, f265, 0f3E31D0D4; +sub.f32 f280, f2821, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2819, f250, 0f3E31D0D4; +mul.f32 f2820, f256, 0f3F7C1C5C; +sub.f32 f285, f2819, f2820; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2817, f266, 0fBF708FB2; +mul.f32 f2818, f272, 0f3EAF1D44; +sub.f32 f290, f2817, f2818; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2816, f2828, f2823; +sub.f32 f299, f2828, f2823; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, 
f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2815, f2833, f2816; +mul.f32 f303, f2816, 0f3F000000; +sub.f32 f304, f2833, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2814, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2813, f239, f2814; +mul.f32 f319, f2814, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2812, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2811, f240, f2812; +mul.f32 f335, f2812, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2808, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2806, %122, f2808; +mul.f32 f351, f2808, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2803, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2801, %125, f2803; +mul.f32 f367, f2803, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 
f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2799, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2797, %127, f2799; +mul.f32 f383, f2799, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2796, f365, 0f3F441B7D; +sub.f32 f391, f2796, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2795, f381, 0f3E31D0D4; +sub.f32 f396, f2795, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2793, f366, 0f3E31D0D4; +mul.f32 f2794, f372, 0f3F7C1C5C; +sub.f32 f401, f2793, f2794; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2791, f382, 0fBF708FB2; +mul.f32 f2792, f388, 0f3EAF1D44; +sub.f32 f406, f2791, f2792; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2790, f2801, f2797; +sub.f32 f415, f2801, f2797; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2789, f2806, f2790; +mul.f32 f419, f2790, 0f3F000000; +sub.f32 f420, f2806, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2788, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2787, f355, f2788; +mul.f32 f435, f2788, 
0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2786, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2785, f356, f2786; +mul.f32 f451, f2786, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2813, 0f3E6C2691; +mul.f32 f2784, f310, 0f3F791978; +sub.f32 f459, f2784, f458; +mul.f32 f460, f2813, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2782, f426, 0f3F64C51C; +mul.f32 f2783, f2787, 0f3EE5C902; +sub.f32 f464, f2782, f2783; +mul.f32 f465, f2787, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2780, f326, 0f3F64C51C; +mul.f32 f2781, f2811, 0f3EE5C902; +sub.f32 f469, f2780, f2781; +mul.f32 f470, f2811, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2778, f442, 0f3F18DF63; +mul.f32 f2779, f2785, 0f3F4D57F2; +sub.f32 f474, f2778, f2779; +mul.f32 f475, f2785, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2776, f301, 0f3F441B7D; +mul.f32 f2777, f307, 0f3F248DBB; +sub.f32 f479, f2776, f2777; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2775, f417, 0f3E31D0D4; +sub.f32 f484, f2775, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2774, f317, 0f3F18DF63; +sub.f32 f489, f2774, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2773, f433, 0fBE92D7E0; +sub.f32 f494, f2773, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 
f498, f339, 0f3F6B1036; +mul.f32 f2772, f333, 0f3ECACAF8; +sub.f32 f499, f2772, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2771, f449, 0fBF2FAD88; +sub.f32 f504, f2771, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2770, f302, 0f3E31D0D4; +sub.f32 f509, f2770, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2768, f418, 0fBF708FB2; +mul.f32 f2769, f424, 0f3EAF1D44; +sub.f32 f514, f2768, f2769; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2766, f318, 0fBD6E2946; +mul.f32 f2767, f324, 0f3F7F9120; +sub.f32 f519, f2766, f2767; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2764, f434, 0fBF7E44DE; +mul.f32 f2765, f440, 0fBDEDC21F; +sub.f32 f524, f2764, f2765; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2763, f334, 0fBE92D7E0; +sub.f32 f529, f2763, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2762, f450, 0fBF55E287; +sub.f32 f534, f2762, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2761, f2815, f2789; +sub.f32 f543, f2815, f2789; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2760, f2842, f2761; +mul.f32 f547, f2761, 0f3F000000; +sub.f32 f548, f2842, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2759, f461, f466; +sub.f32 f559, f461, f466; 
+mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2758, f2840, f2759; +mul.f32 f563, f2759, 0f3F000000; +sub.f32 f564, f2840, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f2757, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2756, f2838, f2757; +mul.f32 f579, f2757, 0f3F000000; +sub.f32 f580, f2838, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2755, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2754, f191, f2755; +mul.f32 f595, f2755, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2753, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2752, f207, f2753; +mul.f32 f611, f2753, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2751, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2750, f223, f2751; +mul.f32 f627, f2751, 0f3F000000; 
+sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2749, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f2748, f192, f2749; +mul.f32 f643, f2749, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2747, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2746, f208, f2747; +mul.f32 f659, f2747, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2745, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2744, f224, f2745; +mul.f32 f675, f2745, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f2758, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f2758; +sub.f32 f689, f688, f687; +mul.f32 f691, f682, f682; +mul.f32 f2743, f681, f681; +sub.f32 f692, 
f2743, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f2756, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f2756; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f2742, f681, f692; +sub.f32 f702, f2742, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f2754, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f2754; +sub.f32 f709, f708, f707; +mul.f32 f2740, f681, f702; +mul.f32 f2741, f682, f704; +sub.f32 f712, f2740, f2741; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f2752, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f2752; +sub.f32 f719, f718, f717; +mul.f32 f2738, f681, f712; +mul.f32 f2739, f682, f714; +sub.f32 f722, f2738, f2739; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f2750, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f2750; +sub.f32 f729, f728, f727; +mul.f32 f731, f682, f724; +mul.f32 f2737, f681, f722; +sub.f32 f732, f2737, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f2748, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f2748; +sub.f32 f739, f738, f737; +mul.f32 f741, f682, f734; +mul.f32 f2736, f681, f732; +sub.f32 f742, f2736, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f2746, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f2746; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f2735, f681, f742; +sub.f32 f752, f2735, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f2744, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f2744; +sub.f32 f759, f758, f757; +mul.f32 f2733, f681, 
f752; +mul.f32 f2734, f682, f754; +sub.f32 f762, f2733, f2734; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f2731, f681, f762; +mul.f32 f2732, f682, f764; +sub.f32 f772, f2731, f2732; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f781, f682, f774; +mul.f32 f2730, f681, f772; +sub.f32 f782, f2730, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; +mul.f32 f791, f682, f784; +mul.f32 f2729, f681, f782; +sub.f32 f792, f2729, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f2728, f681, f792; +sub.f32 f802, f2728, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f2726, f681, f802; +mul.f32 f2727, f682, f804; +sub.f32 f812, f2726, f2727; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f2724, f681, f812; +mul.f32 f2725, f682, f814; +sub.f32 f822, f2724, f2725; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 
f829, f828, f827; +mul.f32 f831, f682, f824; +mul.f32 f2723, f681, f822; +sub.f32 f832, f2723, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, f682, f834; +mul.f32 f2722, f681, f832; +sub.f32 f842, f2722, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, f848, f847; +mul.f32 f2720, f681, f842; +mul.f32 f2721, f682, f844; +sub.f32 f852, f2720, f2721; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f2718, f681, f852; +mul.f32 f2719, f682, f854; +sub.f32 f862, f2718, f2719; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f871, f682, f864; +mul.f32 f2717, f681, f862; +sub.f32 f872, f2717, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f881, f682, f874; +mul.f32 f2716, f681, f872; +sub.f32 f882, f2716, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f2715, f681, f882; +sub.f32 f892, f2715, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; 
+mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f2713, f681, f892; +mul.f32 f2714, f682, f894; +sub.f32 f902, f2713, f2714; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; +sub.f32 f909, f908, f907; +mul.f32 f2711, f681, f902; +mul.f32 f2712, f682, f904; +sub.f32 f912, f2711, f2712; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; +mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f921, f682, f914; +mul.f32 f2710, f681, f912; +sub.f32 f922, f2710, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; +mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f2709, f681, f922; +sub.f32 f932, f2709, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r8, r5, 78732, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f686; +st.shared.f32 [r9+8], f696; +st.shared.f32 [r9+12], f706; +st.shared.f32 [r9+16], f716; +st.shared.f32 [r9+20], f726; +st.shared.f32 [r9+24], f736; +st.shared.f32 [r9+28], f746; +st.shared.f32 [r9+32], f756; +st.shared.f32 [r9+36], f766; +st.shared.f32 [r9+40], f776; +st.shared.f32 [r9+44], f786; +st.shared.f32 [r9+48], f796; +st.shared.f32 [r9+52], f806; +st.shared.f32 [r9+56], f816; +st.shared.f32 [r9+60], f826; +st.shared.f32 [r9+64], f836; +st.shared.f32 [r9+68], f846; +st.shared.f32 [r9+72], f856; +st.shared.f32 [r9+76], f866; +st.shared.f32 [r9+80], f876; +st.shared.f32 [r9+84], f886; +st.shared.f32 [r9+88], f896; +st.shared.f32 
[r9+92], f906; +st.shared.f32 [r9+96], f916; +st.shared.f32 [r9+100], f926; +st.shared.f32 [r9+104], f936; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+2916]; +ld.shared.f32 f942, [r10+5832]; +ld.shared.f32 f943, [r10+8748]; +ld.shared.f32 f944, [r10+11664]; +ld.shared.f32 f945, [r10+14580]; +ld.shared.f32 f946, [r10+17496]; +ld.shared.f32 f947, [r10+20412]; +ld.shared.f32 f948, [r10+23328]; +ld.shared.f32 f949, [r10+26244]; +ld.shared.f32 f950, [r10+29160]; +ld.shared.f32 f951, [r10+32076]; +ld.shared.f32 f952, [r10+34992]; +ld.shared.f32 f953, [r10+37908]; +ld.shared.f32 f954, [r10+40824]; +ld.shared.f32 f955, [r10+43740]; +ld.shared.f32 f956, [r10+46656]; +ld.shared.f32 f957, [r10+49572]; +ld.shared.f32 f958, [r10+52488]; +ld.shared.f32 f959, [r10+55404]; +ld.shared.f32 f960, [r10+58320]; +ld.shared.f32 f961, [r10+61236]; +ld.shared.f32 f962, [r10+64152]; +ld.shared.f32 f963, [r10+67068]; +ld.shared.f32 f964, [r10+69984]; +ld.shared.f32 f965, [r10+72900]; +ld.shared.f32 f966, [r10+75816]; +barrier.sync 0; +st.shared.f32 [r9], f2760; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; 
+sub.f32 f999, f940, f998; +ld.shared.f32 f2708, [r10+52488]; +ld.shared.f32 f2707, [r10+26244]; +add.f32 f2706, f2707, f2708; +sub.f32 f1000, f2707, f2708; +mul.f32 f1001, f1000, 0fBF5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2705, [r10]; +add.f32 f2704, f2705, f2706; +mul.f32 f1004, f2706, 0f3F000000; +sub.f32 f1005, f2705, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0fBF5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +ld.shared.f32 f2703, [r10+61236]; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2702, [r10+34992]; +add.f32 f2701, f2702, f2703; +sub.f32 f1016, f2702, f2703; +mul.f32 f1017, f1016, 0fBF5DB3D7; +ld.shared.f32 f2700, [r10+8748]; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f2699, f2700, f2701; +mul.f32 f1020, f2701, 0f3F000000; +sub.f32 f1021, f2700, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0fBF5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2698, [r10+69984]; +ld.shared.f32 f2697, [r10+43740]; +sub.f32 f1031, f946, f1030; +add.f32 f2696, f2697, f2698; +sub.f32 f1032, f2697, f2698; +mul.f32 f1033, f1032, 0fBF5DB3D7; +ld.shared.f32 f2695, [r10+17496]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2694, f2695, f2696; +mul.f32 f1036, f2696, 0f3F000000; +sub.f32 f1037, f2695, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0fBF5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2692, f1018, 0f3F441B7D; +mul.f32 f2693, f1024, 0f3F248DBB; +sub.f32 f1044, f2692, f2693; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045; +mul.f32 f1048, f1040, 0f3F7C1C5C; +mul.f32 f2691, f1034, 0f3E31D0D4; +sub.f32 f1049, f2691, f1048; +mul.f32 f1050, f1040, 
0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050; +mul.f32 f1053, f1025, 0f3F7C1C5C; +mul.f32 f2690, f1019, 0f3E31D0D4; +sub.f32 f1054, f2690, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055; +mul.f32 f1058, f1041, 0f3EAF1D44; +mul.f32 f2689, f1035, 0fBF708FB2; +sub.f32 f1059, f2689, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2688, f2699, f2694; +sub.f32 f1068, f2699, f2694; +mul.f32 f1069, f1068, 0fBF5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2687, f2704, f2688; +mul.f32 f1072, f2688, 0f3F000000; +sub.f32 f1073, f2704, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0fBF5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2686, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0fBF5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2685, f1008, f2686; +mul.f32 f1088, f2686, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0fBF5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2684, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0fBF5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2683, f1009, f2684; +mul.f32 f1104, f2684, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0fBF5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 
0f3F000000; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2682, [r10+55404]; +ld.shared.f32 f2681, [r10+29160]; +add.f32 f2680, f2681, f2682; +sub.f32 f1116, f2681, f2682; +mul.f32 f1117, f1116, 0fBF5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +ld.shared.f32 f2679, [r10+2916]; +add.f32 f2678, f2679, f2680; +mul.f32 f1120, f2680, 0f3F000000; +sub.f32 f1121, f2679, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0fBF5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +ld.shared.f32 f2677, [r10+64152]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2676, [r10+37908]; +add.f32 f2675, f2676, f2677; +sub.f32 f1132, f2676, f2677; +mul.f32 f1133, f1132, 0fBF5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +ld.shared.f32 f2674, [r10+11664]; +add.f32 f2673, f2674, f2675; +mul.f32 f1136, f2675, 0f3F000000; +sub.f32 f1137, f2674, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0fBF5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2672, [r10+46656]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 f2671, [r10+72900]; +add.f32 f2670, f2672, f2671; +sub.f32 f1148, f2672, f2671; +mul.f32 f1149, f1148, 0fBF5DB3D7; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +ld.shared.f32 f2669, [r10+20412]; +add.f32 f2668, f2669, f2670; +mul.f32 f1152, f2670, 0f3F000000; +sub.f32 f1153, f2669, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0fBF5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2666, f1134, 0f3F441B7D; +mul.f32 f2667, f1140, 0f3F248DBB; +sub.f32 f1160, f2666, f2667; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0f3F248DBB, f1161; +mul.f32 f2664, f1150, 0f3E31D0D4; +mul.f32 f2665, f1156, 0f3F7C1C5C; +sub.f32 f1165, f2664, f2665; 
+mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0f3F7C1C5C, f1166; +mul.f32 f1169, f1141, 0f3F7C1C5C; +mul.f32 f2663, f1135, 0f3E31D0D4; +sub.f32 f1170, f2663, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0f3F7C1C5C, f1171; +mul.f32 f1174, f1157, 0f3EAF1D44; +mul.f32 f2662, f1151, 0fBF708FB2; +sub.f32 f1175, f2662, f1174; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0f3EAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2661, f2673, f2668; +sub.f32 f1184, f2673, f2668; +mul.f32 f1185, f1184, 0fBF5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2660, f2678, f2661; +mul.f32 f1188, f2661, 0f3F000000; +sub.f32 f1189, f2678, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0fBF5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2659, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0fBF5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2658, f1124, f2659; +mul.f32 f1204, f2659, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0fBF5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2657, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0fBF5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2656, f1125, f2657; +mul.f32 f1220, f2657, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0fBF5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; 
+mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2655, [r10+32076]; +sub.f32 f1231, f942, f1230; +ld.shared.f32 f2654, [r10+58320]; +add.f32 f2653, f2655, f2654; +sub.f32 f1232, f2655, f2654; +mul.f32 f1233, f1232, 0fBF5DB3D7; +ld.shared.f32 f2652, [r10+5832]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f2651, f2652, f2653; +mul.f32 f1236, f2653, 0f3F000000; +sub.f32 f1237, f2652, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0fBF5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +ld.shared.f32 f2650, [r10+67068]; +sub.f32 f1247, f945, f1246; +ld.shared.f32 f2649, [r10+40824]; +add.f32 f2648, f2649, f2650; +sub.f32 f1248, f2649, f2650; +mul.f32 f1249, f1248, 0fBF5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2647, [r10+14580]; +add.f32 f2646, f2647, f2648; +mul.f32 f1252, f2648, 0f3F000000; +sub.f32 f1253, f2647, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0fBF5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2645, [r10+75816]; +ld.shared.f32 f2644, [r10+49572]; +add.f32 f2643, f2644, f2645; +sub.f32 f1264, f2644, f2645; +mul.f32 f1265, f1264, 0fBF5DB3D7; +ld.shared.f32 f2642, [r10+23328]; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f2641, f2642, f2643; +mul.f32 f1268, f2643, 0f3F000000; +sub.f32 f1269, f2642, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0fBF5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2639, f1250, 0f3F441B7D; +mul.f32 f2640, f1256, 0f3F248DBB; +sub.f32 f1276, f2639, f2640; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0f3F248DBB, f1277; +mul.f32 f2637, f1266, 0f3E31D0D4; +mul.f32 f2638, f1272, 0f3F7C1C5C; +sub.f32 
f1281, f2637, f2638; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0f3F7C1C5C, f1282; +mul.f32 f1285, f1257, 0f3F7C1C5C; +mul.f32 f2636, f1251, 0f3E31D0D4; +sub.f32 f1286, f2636, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0f3F7C1C5C, f1287; +mul.f32 f1290, f1273, 0f3EAF1D44; +mul.f32 f2635, f1267, 0fBF708FB2; +sub.f32 f1291, f2635, f1290; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0f3EAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2634, f2646, f2641; +sub.f32 f1300, f2646, f2641; +mul.f32 f1301, f1300, 0fBF5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2633, f2651, f2634; +mul.f32 f1304, f2634, 0f3F000000; +sub.f32 f1305, f2651, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0fBF5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2632, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0fBF5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2631, f1240, f2632; +mul.f32 f1320, f2632, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0fBF5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2630, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0fBF5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2629, f1241, f2630; +mul.f32 f1336, f2630, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0fBF5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2658, 0f3E6C2691; 
+mul.f32 f2628, f1195, 0f3F791978; +sub.f32 f1344, f2628, f1343; +mul.f32 f1345, f2658, 0f3F791978; +fma.rn.f32 f1346, f1195, 0f3E6C2691, f1345; +mul.f32 f2626, f1311, 0f3F64C51C; +mul.f32 f2627, f2631, 0f3EE5C902; +sub.f32 f1349, f2626, f2627; +mul.f32 f1350, f2631, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0f3EE5C902, f1350; +mul.f32 f2624, f1211, 0f3F64C51C; +mul.f32 f2625, f2656, 0f3EE5C902; +sub.f32 f1354, f2624, f2625; +mul.f32 f1355, f2656, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0f3EE5C902, f1355; +mul.f32 f2622, f1327, 0f3F18DF63; +mul.f32 f2623, f2629, 0f3F4D57F2; +sub.f32 f1359, f2622, f2623; +mul.f32 f1360, f2629, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0f3F4D57F2, f1360; +mul.f32 f2620, f1186, 0f3F441B7D; +mul.f32 f2621, f1192, 0f3F248DBB; +sub.f32 f1364, f2620, f2621; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0f3F248DBB, f1365; +mul.f32 f1368, f1308, 0f3F7C1C5C; +mul.f32 f2619, f1302, 0f3E31D0D4; +sub.f32 f1369, f2619, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0f3F7C1C5C, f1370; +mul.f32 f1373, f1208, 0f3F4D57F2; +mul.f32 f2618, f1202, 0f3F18DF63; +sub.f32 f1374, f2618, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0f3F4D57F2, f1375; +mul.f32 f1378, f1324, 0f3F753ECD; +mul.f32 f2617, f1318, 0fBE92D7E0; +sub.f32 f1379, f2617, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0f3F753ECD, f1380; +mul.f32 f1383, f1224, 0f3F6B1036; +mul.f32 f2616, f1218, 0f3ECACAF8; +sub.f32 f1384, f2616, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0f3F6B1036, f1385; +mul.f32 f1388, f1340, 0f3F3A3529; +mul.f32 f2615, f1334, 0fBF2FAD88; +sub.f32 f1389, f2615, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0f3F3A3529, f1390; +mul.f32 f1393, f1193, 0f3F7C1C5C; +mul.f32 f2614, f1187, 0f3E31D0D4; +sub.f32 f1394, f2614, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0f3F7C1C5C, f1395; +mul.f32 f2612, f1303, 0fBF708FB2; +mul.f32 f2613, 
f1309, 0f3EAF1D44; +sub.f32 f1399, f2612, f2613; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0f3EAF1D44, f1400; +mul.f32 f2610, f1203, 0fBD6E2946; +mul.f32 f2611, f1209, 0f3F7F9120; +sub.f32 f1404, f2610, f2611; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0f3F7F9120, f1405; +mul.f32 f2608, f1319, 0fBF7E44DE; +mul.f32 f2609, f1325, 0fBDEDC21F; +sub.f32 f1409, f2608, f2609; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0fBDEDC21F, f1410; +mul.f32 f1413, f1225, 0f3F753ECD; +mul.f32 f2607, f1219, 0fBE92D7E0; +sub.f32 f1414, f2607, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0f3F753ECD, f1415; +mul.f32 f1418, f1341, 0fBF0CAC9F; +mul.f32 f2606, f1335, 0fBF55E287; +sub.f32 f1419, f2606, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0fBF0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2605, f2660, f2633; +sub.f32 f1428, f2660, f2633; +mul.f32 f1429, f1428, 0fBF5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2604, f2687, f2605; +mul.f32 f1432, f2605, 0f3F000000; +sub.f32 f1433, f2687, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 0fBF5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2603, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0fBF5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2602, f2685, f2603; +mul.f32 f1448, f2603, 0f3F000000; +sub.f32 f1449, f2685, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0fBF5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2601, f1356, 
f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0fBF5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2600, f2683, f2601; +mul.f32 f1464, f2601, 0f3F000000; +sub.f32 f1465, f2683, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0fBF5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; +add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2599, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0fBF5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, f1477; +add.f32 f2598, f1076, f2599; +mul.f32 f1480, f2599, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0fBF5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; +add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2597, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0fBF5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2596, f1092, f2597; +mul.f32 f1496, f2597, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 0fBF5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2595, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0fBF5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2594, f1108, f2595; +mul.f32 f1512, f2595, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0fBF5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2593, f1396, 
f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0fBF5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2592, f1077, f2593; +mul.f32 f1528, f2593, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0fBF5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; +add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2591, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0fBF5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, f1541; +add.f32 f2590, f1093, f2591; +mul.f32 f1544, f2591, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0fBF5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; +add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2589, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0fBF5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2588, f1109, f2589; +mul.f32 f1560, f2589, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 0fBF5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f1570, f2602, f1567; +fma.rn.f32 f1571, f1566, f1439, f1570; +mul.f32 f1572, f1439, f1567; +mul.f32 f1573, f1566, f2602; +sub.f32 f1574, f1573, f1572; +mul.f32 f2586, f1566, f1566; +mul.f32 f2587, f1567, f1567; +sub.f32 f1577, f2586, f2587; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, 
f1566, f1578; +mul.f32 f1580, f2600, f1579; +fma.rn.f32 f1581, f1577, f1455, f1580; +mul.f32 f1582, f1455, f1579; +mul.f32 f1583, f1577, f2600; +sub.f32 f1584, f1583, f1582; +mul.f32 f1586, f1567, f1579; +mul.f32 f2585, f1566, f1577; +sub.f32 f1587, f2585, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; +mul.f32 f1590, f2598, f1589; +fma.rn.f32 f1591, f1587, f1471, f1590; +mul.f32 f1592, f1471, f1589; +mul.f32 f1593, f1587, f2598; +sub.f32 f1594, f1593, f1592; +mul.f32 f1596, f1567, f1589; +mul.f32 f2584, f1566, f1587; +sub.f32 f1597, f2584, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 f1599, f1567, f1587, f1598; +mul.f32 f1600, f2596, f1599; +fma.rn.f32 f1601, f1597, f1487, f1600; +mul.f32 f1602, f1487, f1599; +mul.f32 f1603, f1597, f2596; +sub.f32 f1604, f1603, f1602; +mul.f32 f1606, f1567, f1599; +mul.f32 f2583, f1566, f1597; +sub.f32 f1607, f2583, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f1610, f2594, f1609; +fma.rn.f32 f1611, f1607, f1503, f1610; +mul.f32 f1612, f1503, f1609; +mul.f32 f1613, f1607, f2594; +sub.f32 f1614, f1613, f1612; +mul.f32 f2581, f1566, f1607; +mul.f32 f2582, f1567, f1609; +sub.f32 f1617, f2581, f2582; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, f1607, f1618; +mul.f32 f1620, f2592, f1619; +fma.rn.f32 f1621, f1617, f1519, f1620; +mul.f32 f1622, f1519, f1619; +mul.f32 f1623, f1617, f2592; +sub.f32 f1624, f1623, f1622; +mul.f32 f2579, f1566, f1617; +mul.f32 f2580, f1567, f1619; +sub.f32 f1627, f2579, f2580; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1630, f2590, f1629; +fma.rn.f32 f1631, f1627, f1535, f1630; +mul.f32 f1632, f1535, f1629; +mul.f32 f1633, f1627, f2590; +sub.f32 f1634, f1633, f1632; +mul.f32 f1636, f1567, f1629; +mul.f32 f2578, f1566, f1627; +sub.f32 f1637, f2578, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1640, f2588, f1639; +fma.rn.f32 f1641, f1637, f1551, 
f1640; +mul.f32 f1642, f1551, f1639; +mul.f32 f1643, f1637, f2588; +sub.f32 f1644, f1643, f1642; +mul.f32 f1646, f1567, f1639; +mul.f32 f2577, f1566, f1637; +sub.f32 f1647, f2577, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1650, f1436, f1649; +fma.rn.f32 f1651, f1647, f1430, f1650; +mul.f32 f1652, f1430, f1649; +mul.f32 f1653, f1647, f1436; +sub.f32 f1654, f1653, f1652; +mul.f32 f2575, f1566, f1647; +mul.f32 f2576, f1567, f1649; +sub.f32 f1657, f2575, f2576; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f1660, f1452, f1659; +fma.rn.f32 f1661, f1657, f1446, f1660; +mul.f32 f1662, f1446, f1659; +mul.f32 f1663, f1657, f1452; +sub.f32 f1664, f1663, f1662; +mul.f32 f2573, f1566, f1657; +mul.f32 f2574, f1567, f1659; +sub.f32 f1667, f2573, f2574; +mul.f32 f1668, f1566, f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f1670, f1468, f1669; +fma.rn.f32 f1671, f1667, f1462, f1670; +mul.f32 f1672, f1462, f1669; +mul.f32 f1673, f1667, f1468; +sub.f32 f1674, f1673, f1672; +mul.f32 f1676, f1567, f1669; +mul.f32 f2572, f1566, f1667; +sub.f32 f1677, f2572, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1680, f1484, f1679; +fma.rn.f32 f1681, f1677, f1478, f1680; +mul.f32 f1682, f1478, f1679; +mul.f32 f1683, f1677, f1484; +sub.f32 f1684, f1683, f1682; +mul.f32 f1686, f1567, f1679; +mul.f32 f2571, f1566, f1677; +sub.f32 f1687, f2571, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1690, f1500, f1689; +fma.rn.f32 f1691, f1687, f1494, f1690; +mul.f32 f1692, f1494, f1689; +mul.f32 f1693, f1687, f1500; +sub.f32 f1694, f1693, f1692; +mul.f32 f1696, f1567, f1689; +mul.f32 f2570, f1566, f1687; +sub.f32 f1697, f2570, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1700, f1516, f1699; +fma.rn.f32 f1701, f1697, f1510, f1700; +mul.f32 f1702, f1510, f1699; +mul.f32 f1703, f1697, f1516; +sub.f32 
f1704, f1703, f1702; +mul.f32 f2568, f1566, f1697; +mul.f32 f2569, f1567, f1699; +sub.f32 f1707, f2568, f2569; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f1710, f1532, f1709; +fma.rn.f32 f1711, f1707, f1526, f1710; +mul.f32 f1712, f1526, f1709; +mul.f32 f1713, f1707, f1532; +sub.f32 f1714, f1713, f1712; +mul.f32 f2566, f1566, f1707; +mul.f32 f2567, f1567, f1709; +sub.f32 f1717, f2566, f2567; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1720, f1548, f1719; +fma.rn.f32 f1721, f1717, f1542, f1720; +mul.f32 f1722, f1542, f1719; +mul.f32 f1723, f1717, f1548; +sub.f32 f1724, f1723, f1722; +mul.f32 f1726, f1567, f1719; +mul.f32 f2565, f1566, f1717; +sub.f32 f1727, f2565, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1730, f1564, f1729; +fma.rn.f32 f1731, f1727, f1558, f1730; +mul.f32 f1732, f1558, f1729; +mul.f32 f1733, f1727, f1564; +sub.f32 f1734, f1733, f1732; +mul.f32 f1736, f1567, f1729; +mul.f32 f2564, f1566, f1727; +sub.f32 f1737, f2564, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1740, f1437, f1739; +fma.rn.f32 f1741, f1737, f1431, f1740; +mul.f32 f1742, f1431, f1739; +mul.f32 f1743, f1737, f1437; +sub.f32 f1744, f1743, f1742; +mul.f32 f1746, f1567, f1739; +mul.f32 f2563, f1566, f1737; +sub.f32 f1747, f2563, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f1750, f1453, f1749; +fma.rn.f32 f1751, f1747, f1447, f1750; +mul.f32 f1752, f1447, f1749; +mul.f32 f1753, f1747, f1453; +sub.f32 f1754, f1753, f1752; +mul.f32 f2561, f1566, f1747; +mul.f32 f2562, f1567, f1749; +sub.f32 f1757, f2561, f2562; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f1760, f1469, f1759; +fma.rn.f32 f1761, f1757, f1463, f1760; +mul.f32 f1762, f1463, f1759; +mul.f32 f1763, f1757, f1469; +sub.f32 f1764, f1763, f1762; +mul.f32 f2559, f1566, f1757; +mul.f32 f2560, f1567, 
f1759; +sub.f32 f1767, f2559, f2560; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1770, f1485, f1769; +fma.rn.f32 f1771, f1767, f1479, f1770; +mul.f32 f1772, f1479, f1769; +mul.f32 f1773, f1767, f1485; +sub.f32 f1774, f1773, f1772; +mul.f32 f1776, f1567, f1769; +mul.f32 f2558, f1566, f1767; +sub.f32 f1777, f2558, f1776; +mul.f32 f1778, f1566, f1769; +fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1780, f1501, f1779; +fma.rn.f32 f1781, f1777, f1495, f1780; +mul.f32 f1782, f1495, f1779; +mul.f32 f1783, f1777, f1501; +sub.f32 f1784, f1783, f1782; +mul.f32 f1786, f1567, f1779; +mul.f32 f2557, f1566, f1777; +sub.f32 f1787, f2557, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1790, f1517, f1789; +fma.rn.f32 f1791, f1787, f1511, f1790; +mul.f32 f1792, f1511, f1789; +mul.f32 f1793, f1787, f1517; +sub.f32 f1794, f1793, f1792; +mul.f32 f2555, f1566, f1787; +mul.f32 f2556, f1567, f1789; +sub.f32 f1797, f2555, f2556; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f1800, f1533, f1799; +fma.rn.f32 f1801, f1797, f1527, f1800; +mul.f32 f1802, f1527, f1799; +mul.f32 f1803, f1797, f1533; +sub.f32 f1804, f1803, f1802; +mul.f32 f2553, f1566, f1797; +mul.f32 f2554, f1567, f1799; +sub.f32 f1807, f2553, f2554; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f1810, f1549, f1809; +fma.rn.f32 f1811, f1807, f1543, f1810; +mul.f32 f1812, f1543, f1809; +mul.f32 f1813, f1807, f1549; +sub.f32 f1814, f1813, f1812; +mul.f32 f1816, f1567, f1809; +mul.f32 f2552, f1566, f1807; +sub.f32 f1817, f2552, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1820, f1565, f1819; +fma.rn.f32 f1821, f1817, f1559, f1820; +mul.f32 f1822, f1559, f1819; +mul.f32 f1823, f1817, f1565; +sub.f32 f1824, f1823, f1822; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; 
+st.shared.f32 [r20+108], f1571; +st.shared.f32 [r20+216], f1581; +st.shared.f32 [r20+324], f1591; +st.shared.f32 [r20+432], f1601; +st.shared.f32 [r20+540], f1611; +st.shared.f32 [r20+648], f1621; +st.shared.f32 [r20+756], f1631; +st.shared.f32 [r20+864], f1641; +st.shared.f32 [r20+972], f1651; +st.shared.f32 [r20+1080], f1661; +st.shared.f32 [r20+1188], f1671; +st.shared.f32 [r20+1296], f1681; +st.shared.f32 [r20+1404], f1691; +st.shared.f32 [r20+1512], f1701; +st.shared.f32 [r20+1620], f1711; +st.shared.f32 [r20+1728], f1721; +st.shared.f32 [r20+1836], f1731; +st.shared.f32 [r20+1944], f1741; +st.shared.f32 [r20+2052], f1751; +st.shared.f32 [r20+2160], f1761; +st.shared.f32 [r20+2268], f1771; +st.shared.f32 [r20+2376], f1781; +st.shared.f32 [r20+2484], f1791; +st.shared.f32 [r20+2592], f1801; +st.shared.f32 [r20+2700], f1811; +st.shared.f32 [r20+2808], f1821; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+2916]; +ld.shared.f32 f1827, [r10+5832]; +ld.shared.f32 f1828, [r10+8748]; +ld.shared.f32 f1829, [r10+11664]; +ld.shared.f32 f1830, [r10+14580]; +ld.shared.f32 f1831, [r10+17496]; +ld.shared.f32 f1832, [r10+20412]; +ld.shared.f32 f1833, [r10+23328]; +ld.shared.f32 f1834, [r10+26244]; +ld.shared.f32 f1835, [r10+29160]; +ld.shared.f32 f1836, [r10+32076]; +ld.shared.f32 f1837, [r10+34992]; +ld.shared.f32 f1838, [r10+37908]; +ld.shared.f32 f1839, [r10+40824]; +ld.shared.f32 f1840, [r10+43740]; +ld.shared.f32 f1841, [r10+46656]; +ld.shared.f32 f1842, [r10+49572]; +ld.shared.f32 f1843, [r10+52488]; +ld.shared.f32 f1844, [r10+55404]; +ld.shared.f32 f1845, [r10+58320]; +ld.shared.f32 f1846, [r10+61236]; +ld.shared.f32 f1847, [r10+64152]; +ld.shared.f32 f1848, [r10+67068]; +ld.shared.f32 f1849, [r10+69984]; +ld.shared.f32 f1850, [r10+72900]; +ld.shared.f32 f1851, [r10+75816]; +barrier.sync 0; +st.shared.f32 [r20], f2604; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], 
f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; +st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; +st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+2916]; +ld.shared.f32 f1854, [r10+5832]; +ld.shared.f32 f1855, [r10+8748]; +ld.shared.f32 f1856, [r10+11664]; +ld.shared.f32 f1857, [r10+14580]; +ld.shared.f32 f1858, [r10+17496]; +ld.shared.f32 f1859, [r10+20412]; +ld.shared.f32 f1860, [r10+23328]; +ld.shared.f32 f1861, [r10+26244]; +ld.shared.f32 f1862, [r10+29160]; +ld.shared.f32 f1863, [r10+32076]; +ld.shared.f32 f1864, [r10+34992]; +ld.shared.f32 f1865, [r10+37908]; +ld.shared.f32 f1866, [r10+40824]; +ld.shared.f32 f1867, [r10+43740]; +ld.shared.f32 f1868, [r10+46656]; +ld.shared.f32 f1869, [r10+49572]; +ld.shared.f32 f1870, [r10+52488]; +ld.shared.f32 f1871, [r10+55404]; +ld.shared.f32 f1872, [r10+58320]; +ld.shared.f32 f1873, [r10+61236]; +ld.shared.f32 f1874, [r10+64152]; +ld.shared.f32 f1875, [r10+67068]; +ld.shared.f32 f1876, [r10+69984]; +ld.shared.f32 f1877, [r10+72900]; +ld.shared.f32 f1878, [r10+75816]; +add.f32 f1879, f1834, f1843; +add.f32 f1880, f1825, f1879; +mul.f32 f1883, f1879, 0f3F000000; +sub.f32 f1884, f1825, f1883; +add.f32 f2551, f1861, f1870; +sub.f32 f1885, f1861, f1870; +mul.f32 f1886, f1885, 0fBF5DB3D7; +add.f32 f1887, f1886, f1884; +sub.f32 f1888, f1884, f1886; +add.f32 f2550, 
f1852, f2551; +mul.f32 f1889, f2551, 0f3F000000; +sub.f32 f1890, f1852, f1889; +sub.f32 f1891, f1834, f1843; +mul.f32 f1892, f1891, 0fBF5DB3D7; +sub.f32 f1893, f1890, f1892; +add.f32 f1894, f1892, f1890; +add.f32 f1895, f1837, f1846; +add.f32 f1896, f1828, f1895; +mul.f32 f1899, f1895, 0f3F000000; +sub.f32 f1900, f1828, f1899; +add.f32 f2549, f1864, f1873; +sub.f32 f1901, f1864, f1873; +mul.f32 f1902, f1901, 0fBF5DB3D7; +add.f32 f1903, f1902, f1900; +sub.f32 f1904, f1900, f1902; +add.f32 f2548, f1855, f2549; +mul.f32 f1905, f2549, 0f3F000000; +sub.f32 f1906, f1855, f1905; +sub.f32 f1907, f1837, f1846; +mul.f32 f1908, f1907, 0fBF5DB3D7; +sub.f32 f1909, f1906, f1908; +add.f32 f1910, f1908, f1906; +add.f32 f1911, f1840, f1849; +add.f32 f1912, f1831, f1911; +mul.f32 f1915, f1911, 0f3F000000; +sub.f32 f1916, f1831, f1915; +add.f32 f2547, f1867, f1876; +sub.f32 f1917, f1867, f1876; +mul.f32 f1918, f1917, 0fBF5DB3D7; +add.f32 f1919, f1918, f1916; +sub.f32 f1920, f1916, f1918; +add.f32 f2546, f1858, f2547; +mul.f32 f1921, f2547, 0f3F000000; +sub.f32 f1922, f1858, f1921; +sub.f32 f1923, f1840, f1849; +mul.f32 f1924, f1923, 0fBF5DB3D7; +sub.f32 f1925, f1922, f1924; +add.f32 f1926, f1924, f1922; +mul.f32 f2544, f1903, 0f3F441B7D; +mul.f32 f2545, f1909, 0f3F248DBB; +sub.f32 f1929, f2544, f2545; +mul.f32 f1930, f1909, 0f3F441B7D; +fma.rn.f32 f1931, f1903, 0f3F248DBB, f1930; +mul.f32 f2542, f1919, 0f3E31D0D4; +mul.f32 f2543, f1925, 0f3F7C1C5C; +sub.f32 f1934, f2542, f2543; +mul.f32 f1935, f1925, 0f3E31D0D4; +fma.rn.f32 f1936, f1919, 0f3F7C1C5C, f1935; +mul.f32 f2540, f1904, 0f3E31D0D4; +mul.f32 f2541, f1910, 0f3F7C1C5C; +sub.f32 f1939, f2540, f2541; +mul.f32 f1940, f1910, 0f3E31D0D4; +fma.rn.f32 f1941, f1904, 0f3F7C1C5C, f1940; +mul.f32 f2538, f1920, 0fBF708FB2; +mul.f32 f2539, f1926, 0f3EAF1D44; +sub.f32 f1944, f2538, f2539; +mul.f32 f1945, f1926, 0fBF708FB2; +fma.rn.f32 f1946, f1920, 0f3EAF1D44, f1945; +add.f32 f1947, f1896, f1912; +add.f32 f1948, f1880, f1947; +mul.f32 f1951, 
f1947, 0f3F000000; +sub.f32 f1952, f1880, f1951; +add.f32 f2537, f2548, f2546; +sub.f32 f1953, f2548, f2546; +mul.f32 f1954, f1953, 0fBF5DB3D7; +add.f32 f1955, f1954, f1952; +sub.f32 f1956, f1952, f1954; +add.f32 f2536, f2550, f2537; +mul.f32 f1957, f2537, 0f3F000000; +sub.f32 f1958, f2550, f1957; +sub.f32 f1959, f1896, f1912; +mul.f32 f1960, f1959, 0fBF5DB3D7; +sub.f32 f1961, f1958, f1960; +add.f32 f1962, f1960, f1958; +add.f32 f1963, f1929, f1934; +add.f32 f1964, f1887, f1963; +mul.f32 f1967, f1963, 0f3F000000; +sub.f32 f1968, f1887, f1967; +add.f32 f2535, f1931, f1936; +sub.f32 f1969, f1931, f1936; +mul.f32 f1970, f1969, 0fBF5DB3D7; +add.f32 f1971, f1970, f1968; +sub.f32 f1972, f1968, f1970; +add.f32 f2534, f1893, f2535; +mul.f32 f1973, f2535, 0f3F000000; +sub.f32 f1974, f1893, f1973; +sub.f32 f1975, f1929, f1934; +mul.f32 f1976, f1975, 0fBF5DB3D7; +sub.f32 f1977, f1974, f1976; +add.f32 f1978, f1976, f1974; +add.f32 f1979, f1939, f1944; +add.f32 f1980, f1888, f1979; +mul.f32 f1983, f1979, 0f3F000000; +sub.f32 f1984, f1888, f1983; +add.f32 f2533, f1941, f1946; +sub.f32 f1985, f1941, f1946; +mul.f32 f1986, f1985, 0fBF5DB3D7; +add.f32 f1987, f1986, f1984; +sub.f32 f1988, f1984, f1986; +add.f32 f2532, f1894, f2533; +mul.f32 f1989, f2533, 0f3F000000; +sub.f32 f1990, f1894, f1989; +sub.f32 f1991, f1939, f1944; +mul.f32 f1992, f1991, 0fBF5DB3D7; +sub.f32 f1993, f1990, f1992; +add.f32 f1994, f1992, f1990; +add.f32 f1995, f1835, f1844; +add.f32 f1996, f1826, f1995; +mul.f32 f1999, f1995, 0f3F000000; +sub.f32 f2000, f1826, f1999; +add.f32 f2531, f1862, f1871; +sub.f32 f2001, f1862, f1871; +mul.f32 f2002, f2001, 0fBF5DB3D7; +add.f32 f2003, f2002, f2000; +sub.f32 f2004, f2000, f2002; +add.f32 f2530, f1853, f2531; +mul.f32 f2005, f2531, 0f3F000000; +sub.f32 f2006, f1853, f2005; +sub.f32 f2007, f1835, f1844; +mul.f32 f2008, f2007, 0fBF5DB3D7; +sub.f32 f2009, f2006, f2008; +add.f32 f2010, f2008, f2006; +add.f32 f2011, f1838, f1847; +add.f32 f2012, f1829, f2011; +mul.f32 f2015, 
f2011, 0f3F000000; +sub.f32 f2016, f1829, f2015; +add.f32 f2529, f1865, f1874; +sub.f32 f2017, f1865, f1874; +mul.f32 f2018, f2017, 0fBF5DB3D7; +add.f32 f2019, f2018, f2016; +sub.f32 f2020, f2016, f2018; +add.f32 f2528, f1856, f2529; +mul.f32 f2021, f2529, 0f3F000000; +sub.f32 f2022, f1856, f2021; +sub.f32 f2023, f1838, f1847; +mul.f32 f2024, f2023, 0fBF5DB3D7; +sub.f32 f2025, f2022, f2024; +add.f32 f2026, f2024, f2022; +add.f32 f2027, f1841, f1850; +add.f32 f2028, f1832, f2027; +mul.f32 f2031, f2027, 0f3F000000; +sub.f32 f2032, f1832, f2031; +add.f32 f2527, f1868, f1877; +sub.f32 f2033, f1868, f1877; +mul.f32 f2034, f2033, 0fBF5DB3D7; +add.f32 f2035, f2034, f2032; +sub.f32 f2036, f2032, f2034; +add.f32 f2526, f1859, f2527; +mul.f32 f2037, f2527, 0f3F000000; +sub.f32 f2038, f1859, f2037; +sub.f32 f2039, f1841, f1850; +mul.f32 f2040, f2039, 0fBF5DB3D7; +sub.f32 f2041, f2038, f2040; +add.f32 f2042, f2040, f2038; +mul.f32 f2044, f2025, 0f3F248DBB; +mul.f32 f2525, f2019, 0f3F441B7D; +sub.f32 f2045, f2525, f2044; +mul.f32 f2046, f2025, 0f3F441B7D; +fma.rn.f32 f2047, f2019, 0f3F248DBB, f2046; +mul.f32 f2523, f2035, 0f3E31D0D4; +mul.f32 f2524, f2041, 0f3F7C1C5C; +sub.f32 f2050, f2523, f2524; +mul.f32 f2051, f2041, 0f3E31D0D4; +fma.rn.f32 f2052, f2035, 0f3F7C1C5C, f2051; +mul.f32 f2521, f2020, 0f3E31D0D4; +mul.f32 f2522, f2026, 0f3F7C1C5C; +sub.f32 f2055, f2521, f2522; +mul.f32 f2056, f2026, 0f3E31D0D4; +fma.rn.f32 f2057, f2020, 0f3F7C1C5C, f2056; +mul.f32 f2519, f2036, 0fBF708FB2; +mul.f32 f2520, f2042, 0f3EAF1D44; +sub.f32 f2060, f2519, f2520; +mul.f32 f2061, f2042, 0fBF708FB2; +fma.rn.f32 f2062, f2036, 0f3EAF1D44, f2061; +add.f32 f2063, f2012, f2028; +add.f32 f2064, f1996, f2063; +mul.f32 f2067, f2063, 0f3F000000; +sub.f32 f2068, f1996, f2067; +add.f32 f2518, f2528, f2526; +sub.f32 f2069, f2528, f2526; +mul.f32 f2070, f2069, 0fBF5DB3D7; +add.f32 f2071, f2070, f2068; +sub.f32 f2072, f2068, f2070; +add.f32 f2517, f2530, f2518; +mul.f32 f2073, f2518, 0f3F000000; +sub.f32 
f2074, f2530, f2073; +sub.f32 f2075, f2012, f2028; +mul.f32 f2076, f2075, 0fBF5DB3D7; +sub.f32 f2077, f2074, f2076; +add.f32 f2078, f2076, f2074; +add.f32 f2079, f2045, f2050; +add.f32 f2080, f2003, f2079; +mul.f32 f2083, f2079, 0f3F000000; +sub.f32 f2084, f2003, f2083; +add.f32 f2516, f2047, f2052; +sub.f32 f2085, f2047, f2052; +mul.f32 f2086, f2085, 0fBF5DB3D7; +add.f32 f2087, f2086, f2084; +sub.f32 f2088, f2084, f2086; +add.f32 f2515, f2009, f2516; +mul.f32 f2089, f2516, 0f3F000000; +sub.f32 f2090, f2009, f2089; +sub.f32 f2091, f2045, f2050; +mul.f32 f2092, f2091, 0fBF5DB3D7; +sub.f32 f2093, f2090, f2092; +add.f32 f2094, f2092, f2090; +add.f32 f2095, f2055, f2060; +add.f32 f2096, f2004, f2095; +mul.f32 f2099, f2095, 0f3F000000; +sub.f32 f2100, f2004, f2099; +add.f32 f2514, f2057, f2062; +sub.f32 f2101, f2057, f2062; +mul.f32 f2102, f2101, 0fBF5DB3D7; +add.f32 f2103, f2102, f2100; +sub.f32 f2104, f2100, f2102; +add.f32 f2513, f2010, f2514; +mul.f32 f2105, f2514, 0f3F000000; +sub.f32 f2106, f2010, f2105; +sub.f32 f2107, f2055, f2060; +mul.f32 f2108, f2107, 0fBF5DB3D7; +sub.f32 f2109, f2106, f2108; +add.f32 f2110, f2108, f2106; +add.f32 f2111, f1836, f1845; +add.f32 f2112, f1827, f2111; +mul.f32 f2115, f2111, 0f3F000000; +sub.f32 f2116, f1827, f2115; +add.f32 f2512, f1863, f1872; +sub.f32 f2117, f1863, f1872; +mul.f32 f2118, f2117, 0fBF5DB3D7; +add.f32 f2119, f2118, f2116; +sub.f32 f2120, f2116, f2118; +add.f32 f2511, f1854, f2512; +mul.f32 f2121, f2512, 0f3F000000; +sub.f32 f2122, f1854, f2121; +sub.f32 f2123, f1836, f1845; +mul.f32 f2124, f2123, 0fBF5DB3D7; +sub.f32 f2125, f2122, f2124; +add.f32 f2126, f2124, f2122; +add.f32 f2127, f1839, f1848; +add.f32 f2128, f1830, f2127; +mul.f32 f2131, f2127, 0f3F000000; +sub.f32 f2132, f1830, f2131; +add.f32 f2510, f1866, f1875; +sub.f32 f2133, f1866, f1875; +mul.f32 f2134, f2133, 0fBF5DB3D7; +add.f32 f2135, f2134, f2132; +sub.f32 f2136, f2132, f2134; +add.f32 f2509, f1857, f2510; +mul.f32 f2137, f2510, 0f3F000000; +sub.f32 
f2138, f1857, f2137; +sub.f32 f2139, f1839, f1848; +mul.f32 f2140, f2139, 0fBF5DB3D7; +sub.f32 f2141, f2138, f2140; +add.f32 f2142, f2140, f2138; +add.f32 f2143, f1842, f1851; +add.f32 f2144, f1833, f2143; +mul.f32 f2147, f2143, 0f3F000000; +sub.f32 f2148, f1833, f2147; +add.f32 f2508, f1869, f1878; +sub.f32 f2149, f1869, f1878; +mul.f32 f2150, f2149, 0fBF5DB3D7; +add.f32 f2151, f2150, f2148; +sub.f32 f2152, f2148, f2150; +add.f32 f2507, f1860, f2508; +mul.f32 f2153, f2508, 0f3F000000; +sub.f32 f2154, f1860, f2153; +sub.f32 f2155, f1842, f1851; +mul.f32 f2156, f2155, 0fBF5DB3D7; +sub.f32 f2157, f2154, f2156; +add.f32 f2158, f2156, f2154; +mul.f32 f2160, f2141, 0f3F248DBB; +mul.f32 f2506, f2135, 0f3F441B7D; +sub.f32 f2161, f2506, f2160; +mul.f32 f2162, f2141, 0f3F441B7D; +fma.rn.f32 f2163, f2135, 0f3F248DBB, f2162; +mul.f32 f2504, f2151, 0f3E31D0D4; +mul.f32 f2505, f2157, 0f3F7C1C5C; +sub.f32 f2166, f2504, f2505; +mul.f32 f2167, f2157, 0f3E31D0D4; +fma.rn.f32 f2168, f2151, 0f3F7C1C5C, f2167; +mul.f32 f2502, f2136, 0f3E31D0D4; +mul.f32 f2503, f2142, 0f3F7C1C5C; +sub.f32 f2171, f2502, f2503; +mul.f32 f2172, f2142, 0f3E31D0D4; +fma.rn.f32 f2173, f2136, 0f3F7C1C5C, f2172; +mul.f32 f2500, f2152, 0fBF708FB2; +mul.f32 f2501, f2158, 0f3EAF1D44; +sub.f32 f2176, f2500, f2501; +mul.f32 f2177, f2158, 0fBF708FB2; +fma.rn.f32 f2178, f2152, 0f3EAF1D44, f2177; +add.f32 f2179, f2128, f2144; +add.f32 f2180, f2112, f2179; +mul.f32 f2183, f2179, 0f3F000000; +sub.f32 f2184, f2112, f2183; +add.f32 f2499, f2509, f2507; +sub.f32 f2185, f2509, f2507; +mul.f32 f2186, f2185, 0fBF5DB3D7; +add.f32 f2187, f2186, f2184; +sub.f32 f2188, f2184, f2186; +add.f32 f2498, f2511, f2499; +mul.f32 f2189, f2499, 0f3F000000; +sub.f32 f2190, f2511, f2189; +sub.f32 f2191, f2128, f2144; +mul.f32 f2192, f2191, 0fBF5DB3D7; +sub.f32 f2193, f2190, f2192; +add.f32 f2194, f2192, f2190; +add.f32 f2195, f2161, f2166; +add.f32 f2196, f2119, f2195; +mul.f32 f2199, f2195, 0f3F000000; +sub.f32 f2200, f2119, f2199; +add.f32 
f2497, f2163, f2168; +sub.f32 f2201, f2163, f2168; +mul.f32 f2202, f2201, 0fBF5DB3D7; +add.f32 f2203, f2202, f2200; +sub.f32 f2204, f2200, f2202; +add.f32 f2496, f2125, f2497; +mul.f32 f2205, f2497, 0f3F000000; +sub.f32 f2206, f2125, f2205; +sub.f32 f2207, f2161, f2166; +mul.f32 f2208, f2207, 0fBF5DB3D7; +sub.f32 f2209, f2206, f2208; +add.f32 f2210, f2208, f2206; +add.f32 f2211, f2171, f2176; +add.f32 f2212, f2120, f2211; +mul.f32 f2215, f2211, 0f3F000000; +sub.f32 f2216, f2120, f2215; +add.f32 f2495, f2173, f2178; +sub.f32 f2217, f2173, f2178; +mul.f32 f2218, f2217, 0fBF5DB3D7; +add.f32 f2219, f2218, f2216; +sub.f32 f2220, f2216, f2218; +add.f32 f2494, f2126, f2495; +mul.f32 f2221, f2495, 0f3F000000; +sub.f32 f2222, f2126, f2221; +sub.f32 f2223, f2171, f2176; +mul.f32 f2224, f2223, 0fBF5DB3D7; +sub.f32 f2225, f2222, f2224; +add.f32 f2226, f2224, f2222; +mul.f32 f2228, f2515, 0f3E6C2691; +mul.f32 f2493, f2080, 0f3F791978; +sub.f32 f2229, f2493, f2228; +mul.f32 f2230, f2515, 0f3F791978; +fma.rn.f32 f2231, f2080, 0f3E6C2691, f2230; +mul.f32 f2233, f2496, 0f3EE5C902; +mul.f32 f2492, f2196, 0f3F64C51C; +sub.f32 f2234, f2492, f2233; +mul.f32 f2235, f2496, 0f3F64C51C; +fma.rn.f32 f2236, f2196, 0f3EE5C902, f2235; +mul.f32 f2238, f2513, 0f3EE5C902; +mul.f32 f2491, f2096, 0f3F64C51C; +sub.f32 f2239, f2491, f2238; +mul.f32 f2240, f2513, 0f3F64C51C; +fma.rn.f32 f2241, f2096, 0f3EE5C902, f2240; +mul.f32 f2243, f2494, 0f3F4D57F2; +mul.f32 f2490, f2212, 0f3F18DF63; +sub.f32 f2244, f2490, f2243; +mul.f32 f2245, f2494, 0f3F18DF63; +fma.rn.f32 f2246, f2212, 0f3F4D57F2, f2245; +mul.f32 f2488, f2071, 0f3F441B7D; +mul.f32 f2489, f2077, 0f3F248DBB; +sub.f32 f2249, f2488, f2489; +mul.f32 f2250, f2077, 0f3F441B7D; +fma.rn.f32 f2251, f2071, 0f3F248DBB, f2250; +mul.f32 f2486, f2187, 0f3E31D0D4; +mul.f32 f2487, f2193, 0f3F7C1C5C; +sub.f32 f2254, f2486, f2487; +mul.f32 f2255, f2193, 0f3E31D0D4; +fma.rn.f32 f2256, f2187, 0f3F7C1C5C, f2255; +mul.f32 f2484, f2087, 0f3F18DF63; +mul.f32 f2485, 
f2093, 0f3F4D57F2; +sub.f32 f2259, f2484, f2485; +mul.f32 f2260, f2093, 0f3F18DF63; +fma.rn.f32 f2261, f2087, 0f3F4D57F2, f2260; +mul.f32 f2482, f2203, 0fBE92D7E0; +mul.f32 f2483, f2209, 0f3F753ECD; +sub.f32 f2264, f2482, f2483; +mul.f32 f2265, f2209, 0fBE92D7E0; +fma.rn.f32 f2266, f2203, 0f3F753ECD, f2265; +mul.f32 f2268, f2109, 0f3F6B1036; +mul.f32 f2481, f2103, 0f3ECACAF8; +sub.f32 f2269, f2481, f2268; +mul.f32 f2270, f2109, 0f3ECACAF8; +fma.rn.f32 f2271, f2103, 0f3F6B1036, f2270; +mul.f32 f2273, f2225, 0f3F3A3529; +mul.f32 f2480, f2219, 0fBF2FAD88; +sub.f32 f2274, f2480, f2273; +mul.f32 f2275, f2225, 0fBF2FAD88; +fma.rn.f32 f2276, f2219, 0f3F3A3529, f2275; +mul.f32 f2278, f2078, 0f3F7C1C5C; +mul.f32 f2479, f2072, 0f3E31D0D4; +sub.f32 f2279, f2479, f2278; +mul.f32 f2280, f2078, 0f3E31D0D4; +fma.rn.f32 f2281, f2072, 0f3F7C1C5C, f2280; +mul.f32 f2283, f2194, 0f3EAF1D44; +mul.f32 f2478, f2188, 0fBF708FB2; +sub.f32 f2284, f2478, f2283; +mul.f32 f2285, f2194, 0fBF708FB2; +fma.rn.f32 f2286, f2188, 0f3EAF1D44, f2285; +mul.f32 f2288, f2094, 0f3F7F9120; +mul.f32 f2477, f2088, 0fBD6E2946; +sub.f32 f2289, f2477, f2288; +mul.f32 f2290, f2094, 0fBD6E2946; +fma.rn.f32 f2291, f2088, 0f3F7F9120, f2290; +mul.f32 f2475, f2204, 0fBF7E44DE; +mul.f32 f2476, f2210, 0fBDEDC21F; +sub.f32 f2294, f2475, f2476; +mul.f32 f2295, f2210, 0fBF7E44DE; +fma.rn.f32 f2296, f2204, 0fBDEDC21F, f2295; +mul.f32 f2473, f2104, 0fBE92D7E0; +mul.f32 f2474, f2110, 0f3F753ECD; +sub.f32 f2299, f2473, f2474; +mul.f32 f2300, f2110, 0fBE92D7E0; +fma.rn.f32 f2301, f2104, 0f3F753ECD, f2300; +mul.f32 f2471, f2220, 0fBF55E287; +mul.f32 f2472, f2226, 0fBF0CAC9F; +sub.f32 f2304, f2471, f2472; +mul.f32 f2305, f2226, 0fBF55E287; +fma.rn.f32 f2306, f2220, 0fBF0CAC9F, f2305; +add.f32 f2307, f2064, f2180; +mul.f32 f2309, f2307, 0f3F000000; +sub.f32 f2310, f1948, f2309; +add.f32 f2470, f2517, f2498; +sub.f32 f2311, f2517, f2498; +mul.f32 f2312, f2311, 0fBF5DB3D7; +mul.f32 f2313, f2470, 0f3F000000; +sub.f32 f2314, f2536, 
f2313; +sub.f32 f2315, f2064, f2180; +mul.f32 f2316, f2315, 0fBF5DB3D7; +add.f32 f2317, f2229, f2234; +mul.f32 f2319, f2317, 0f3F000000; +sub.f32 f2320, f1964, f2319; +add.f32 f2469, f2231, f2236; +sub.f32 f2321, f2231, f2236; +mul.f32 f2322, f2321, 0fBF5DB3D7; +mul.f32 f2323, f2469, 0f3F000000; +sub.f32 f2324, f2534, f2323; +sub.f32 f2325, f2229, f2234; +mul.f32 f2326, f2325, 0fBF5DB3D7; +add.f32 f2327, f2239, f2244; +mul.f32 f2329, f2327, 0f3F000000; +sub.f32 f2330, f1980, f2329; +add.f32 f2468, f2241, f2246; +sub.f32 f2331, f2241, f2246; +mul.f32 f2332, f2331, 0fBF5DB3D7; +mul.f32 f2333, f2468, 0f3F000000; +sub.f32 f2334, f2532, f2333; +sub.f32 f2335, f2239, f2244; +mul.f32 f2336, f2335, 0fBF5DB3D7; +add.f32 f2337, f2249, f2254; +mul.f32 f2339, f2337, 0f3F000000; +sub.f32 f2340, f1955, f2339; +add.f32 f2467, f2251, f2256; +sub.f32 f2341, f2251, f2256; +mul.f32 f2342, f2341, 0fBF5DB3D7; +mul.f32 f2343, f2467, 0f3F000000; +sub.f32 f2344, f1961, f2343; +sub.f32 f2345, f2249, f2254; +mul.f32 f2346, f2345, 0fBF5DB3D7; +add.f32 f2347, f2259, f2264; +mul.f32 f2349, f2347, 0f3F000000; +sub.f32 f2350, f1971, f2349; +add.f32 f2466, f2261, f2266; +sub.f32 f2351, f2261, f2266; +mul.f32 f2352, f2351, 0fBF5DB3D7; +mul.f32 f2353, f2466, 0f3F000000; +sub.f32 f2354, f1977, f2353; +sub.f32 f2355, f2259, f2264; +mul.f32 f2356, f2355, 0fBF5DB3D7; +add.f32 f2357, f2269, f2274; +mul.f32 f2359, f2357, 0f3F000000; +sub.f32 f2360, f1987, f2359; +add.f32 f2465, f2271, f2276; +sub.f32 f2361, f2271, f2276; +mul.f32 f2362, f2361, 0fBF5DB3D7; +mul.f32 f2363, f2465, 0f3F000000; +sub.f32 f2364, f1993, f2363; +sub.f32 f2365, f2269, f2274; +mul.f32 f2366, f2365, 0fBF5DB3D7; +add.f32 f2367, f2279, f2284; +mul.f32 f2369, f2367, 0f3F000000; +sub.f32 f2370, f1956, f2369; +add.f32 f2464, f2281, f2286; +sub.f32 f2371, f2281, f2286; +mul.f32 f2372, f2371, 0fBF5DB3D7; +mul.f32 f2373, f2464, 0f3F000000; +sub.f32 f2374, f1962, f2373; +sub.f32 f2375, f2279, f2284; +mul.f32 f2376, f2375, 0fBF5DB3D7; 
+add.f32 f2377, f2289, f2294; +mul.f32 f2379, f2377, 0f3F000000; +sub.f32 f2380, f1972, f2379; +add.f32 f2463, f2291, f2296; +sub.f32 f2381, f2291, f2296; +mul.f32 f2382, f2381, 0fBF5DB3D7; +mul.f32 f2383, f2463, 0f3F000000; +sub.f32 f2384, f1978, f2383; +sub.f32 f2385, f2289, f2294; +mul.f32 f2386, f2385, 0fBF5DB3D7; +add.f32 f2387, f2299, f2304; +mul.f32 f2389, f2387, 0f3F000000; +sub.f32 f2390, f1988, f2389; +add.f32 f2462, f2301, f2306; +sub.f32 f2391, f2301, f2306; +mul.f32 f2392, f2391, 0fBF5DB3D7; +mul.f32 f2393, f2462, 0f3F000000; +sub.f32 f2394, f1994, f2393; +sub.f32 f2395, f2299, f2304; +mul.f32 f2858, f2465, 0f3F000000; +sub.f32 f2857, f1993, f2858; +mul.f32 f2396, f2395, 0fBF5DB3D7; +add.f32 %0, f1948, f2307; +mul.f32 f2860, f2337, 0f3F000000; +sub.f32 f2859, f1955, f2860; +add.f32 %1, f2536, f2470; +mul.f32 f2862, f2466, 0f3F000000; +sub.f32 f2861, f1977, f2862; +mul.f32 f2864, f2327, 0f3F000000; +sub.f32 f2863, f1980, f2864; +add.f32 %3, f2534, f2469; +add.f32 %2, f1964, f2317; +add.f32 %5, f2532, f2468; +add.f32 %4, f1980, f2327; +add.f32 %7, f1961, f2467; +add.f32 %6, f1955, f2337; +add.f32 %9, f1977, f2466; +add.f32 %8, f1971, f2347; +add.f32 %11, f1993, f2465; +add.f32 %10, f1987, f2357; +add.f32 %13, f1962, f2464; +add.f32 %12, f1956, f2367; +add.f32 %15, f1978, f2463; +add.f32 %14, f1972, f2377; +add.f32 %17, f1994, f2462; +add.f32 %16, f1988, f2387; +sub.f32 %19, f2314, f2316; +add.f32 %18, f2312, f2310; +add.f32 %20, f2322, f2320; +sub.f32 %21, f2324, f2326; +add.f32 %22, f2332, f2863; +sub.f32 %23, f2334, f2336; +add.f32 %24, f2342, f2859; +sub.f32 %25, f2344, f2346; +sub.f32 %27, f2861, f2356; +add.f32 %26, f2352, f2350; +sub.f32 %29, f2857, f2366; +add.f32 %28, f2362, f2360; +add.f32 %30, f2372, f2370; +sub.f32 %31, f2374, f2376; +add.f32 %32, f2382, f2380; +sub.f32 %33, f2384, f2386; +add.f32 %34, f2392, f2390; +sub.f32 %35, f2394, f2396; +sub.f32 %36, f2310, f2312; +add.f32 %37, f2316, f2314; +add.f32 %39, f2326, f2324; +sub.f32 %38, 
f2320, f2322; +add.f32 %41, f2336, f2334; +sub.f32 %40, f2863, f2332; +add.f32 %43, f2346, f2344; +sub.f32 %42, f2859, f2342; +add.f32 %45, f2356, f2861; +sub.f32 %44, f2350, f2352; +add.f32 %47, f2366, f2857; +sub.f32 %46, f2360, f2362; +add.f32 %49, f2376, f2374; +sub.f32 %48, f2370, f2372; +add.f32 %51, f2386, f2384; +sub.f32 %50, f2380, f2382; +add.f32 %53, f2396, f2394; +sub.f32 %52, f2390, f2392; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_19683), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), 
"f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..1865e843a390b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_fwd.hpp.inc @@ -0,0 +1,3592 @@ +#ifndef CUFFTDX_FFT_19_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_19_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<751, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<343>; +.reg .b32 r<2233>; +.reg .f64 fd<325>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %40, %74; +} +{ +add.f16x2 r4, %41, %75; +} +{ +sub.f16x2 r7, %40, %74; +} +{ +sub.f16x2 r10, %41, %75; +} +{ +add.f16x2 r13, %42, %72; +} +{ +add.f16x2 r16, %43, %73; +} +{ +sub.f16x2 r19, %42, %72; +} +{ +sub.f16x2 r22, %43, %73; +} +{ +add.f16x2 r25, %44, %70; +} +{ +add.f16x2 r28, %45, %71; +} +{ +sub.f16x2 r31, %44, %70; +} +{ +sub.f16x2 r34, %45, %71; +} +{ +add.f16x2 r37, %46, %68; +} +{ +add.f16x2 r40, %47, %69; +} +{ +sub.f16x2 r43, %46, %68; +} +{ +sub.f16x2 r46, %47, %69; +} +{ +add.f16x2 r49, %48, %66; +} +{ +add.f16x2 r52, 
%49, %67; +} +{ +sub.f16x2 r55, %48, %66; +} +{ +sub.f16x2 r58, %49, %67; +} +{ +add.f16x2 r61, %50, %64; +} +{ +add.f16x2 r64, %51, %65; +} +{ +sub.f16x2 r67, %50, %64; +} +{ +sub.f16x2 r70, %51, %65; +} +{ +add.f16x2 r73, %52, %62; +} +{ +add.f16x2 r76, %53, %63; +} +{ +sub.f16x2 r79, %52, %62; +} +{ +sub.f16x2 r82, %53, %63; +} +{ +add.f16x2 r85, %54, %60; +} +{ +add.f16x2 r88, %55, %61; +} +{ +sub.f16x2 r91, %54, %60; +} +{ +sub.f16x2 r94, %55, %61; +} +{ +add.f16x2 r97, %56, %58; +} +{ +add.f16x2 r100, %57, %59; +} +{ +sub.f16x2 r103, %56, %58; +} +{ +sub.f16x2 r106, %57, %59; +} +{ +add.f16x2 r109, %38, r1; +} +{ +add.f16x2 r112, %39, r4; +} +{ +add.f16x2 r115, r109, r13; +} +{ +add.f16x2 r118, r112, r16; +} +{ +add.f16x2 r121, r115, r25; +} +{ +add.f16x2 r124, r118, r28; +} +{ +add.f16x2 r127, r121, r37; +} +{ +add.f16x2 r130, r124, r40; +} +{ +add.f16x2 r133, r127, r49; +} +{ +add.f16x2 r136, r130, r52; +} +{ +add.f16x2 r139, r133, r61; +} +{ +add.f16x2 r142, r136, r64; +} +{ +add.f16x2 r145, r139, r73; +} +{ +add.f16x2 r148, r142, r76; +} +{ +add.f16x2 r151, r145, r85; +} +{ +add.f16x2 r154, r148, r88; +} +{ +add.f16x2 %0, r151, r97; +} +{ +add.f16x2 %1, r154, r100; +} +mov.u32 r2004, 0; +cvt.rn.f16.s32 rs1, r2004; +mov.b32 r175, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r2004; +mov.b32 r187, {rs2, rs2}; +mov.f64 fd295, 0d3FEE442285231BE1; +{ +cvt.rn.f16.f64 rs3, fd295; +} +mov.b32 r167, {rs3, rs3}; +{ +mul.f16x2 r165, r1, r167; +} +{ +add.f16x2 r168, %38, r165; +} +mov.f64 fd160, 0dBFD4C7E04850CFAA; +{ +cvt.rn.f16.f64 rs4, fd160; +} +mov.b32 r173, {rs4, rs4}; +{ +mul.f16x2 r171, r10, r173; +} +{ +add.f16x2 r174, r175, r171; +} +{ +cvt.rn.f16.f64 rs5, fd295; +} +mov.b32 r179, {rs5, rs5}; +{ +mul.f16x2 r177, r4, r179; +} +{ +add.f16x2 r180, %39, r177; +} +{ +cvt.rn.f16.f64 rs6, fd160; +} +mov.b32 r185, {rs6, rs6}; +{ +mul.f16x2 r183, r7, r185; +} +{ +add.f16x2 r186, r187, r183; +} +mov.f64 fd303, 0d3FE940A398F9CD23; +{ +cvt.rn.f16.f64 rs7, fd303; +} +mov.b32 r191, 
{rs7, rs7}; +{ +mul.f16x2 r189, r13, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd272, 0dBFE3A7A16B394423; +{ +cvt.rn.f16.f64 rs8, fd272; +} +mov.b32 r197, {rs8, rs8}; +{ +mul.f16x2 r195, r22, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs9, fd303; +} +mov.b32 r203, {rs9, rs9}; +{ +mul.f16x2 r201, r16, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs10, fd272; +} +mov.b32 r209, {rs10, rs10}; +{ +mul.f16x2 r207, r19, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd311, 0d3FE180996C77C8CA; +{ +cvt.rn.f16.f64 rs11, fd311; +} +mov.b32 r215, {rs11, rs11}; +{ +mul.f16x2 r213, r25, r215; +} +{ +add.f16x2 r216, r192, r213; +} +mov.f64 fd76, 0dBFEACA115AAE3DE4; +{ +cvt.rn.f16.f64 rs12, fd76; +} +mov.b32 r221, {rs12, rs12}; +{ +mul.f16x2 r219, r34, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs13, fd311; +} +mov.b32 r227, {rs13, rs13}; +{ +mul.f16x2 r225, r28, r227; +} +{ +add.f16x2 r228, r204, r225; +} +{ +cvt.rn.f16.f64 rs14, fd76; +} +mov.b32 r233, {rs14, rs14}; +{ +mul.f16x2 r231, r31, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd319, 0d3FCF6C118574C83E; +{ +cvt.rn.f16.f64 rs15, fd319; +} +mov.b32 r239, {rs15, rs15}; +{ +mul.f16x2 r237, r37, r239; +} +{ +add.f16x2 r240, r216, r237; +} +mov.f64 fd240, 0dBFEF0553B4DE2E18; +{ +cvt.rn.f16.f64 rs16, fd240; +} +mov.b32 r245, {rs16, rs16}; +{ +mul.f16x2 r243, r46, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs17, fd319; +} +mov.b32 r251, {rs17, rs17}; +{ +mul.f16x2 r249, r40, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +cvt.rn.f16.f64 rs18, fd240; +} +mov.b32 r257, {rs18, rs18}; +{ +mul.f16x2 r255, r43, r257; +} +{ +add.f16x2 r258, r234, r255; +} +mov.f64 fd323, 0dBFB523EB8420F5F5; +{ +cvt.rn.f16.f64 rs19, fd323; +} +mov.b32 r263, {rs19, rs19}; +{ +mul.f16x2 r261, r49, r263; +} +{ +add.f16x2 r264, r240, r261; +} +mov.f64 fd324, 0dBFEFE40529A542AA; +{ +cvt.rn.f16.f64 rs20, fd324; +} +mov.b32 r269, {rs20, rs20}; +{ +mul.f16x2 
r267, r58, r269; +} +{ +add.f16x2 r270, r246, r267; +} +{ +cvt.rn.f16.f64 rs21, fd323; +} +mov.b32 r275, {rs21, rs21}; +{ +mul.f16x2 r273, r52, r275; +} +{ +add.f16x2 r276, r252, r273; +} +{ +cvt.rn.f16.f64 rs22, fd324; +} +mov.b32 r281, {rs22, rs22}; +{ +mul.f16x2 r279, r55, r281; +} +{ +add.f16x2 r282, r258, r279; +} +mov.f64 fd315, 0dBFD9B560B9F596EA; +{ +cvt.rn.f16.f64 rs23, fd315; +} +mov.b32 r287, {rs23, rs23}; +{ +mul.f16x2 r285, r61, r287; +} +{ +add.f16x2 r288, r264, r285; +} +mov.f64 fd316, 0dBFED4E03DD110B08; +{ +cvt.rn.f16.f64 rs24, fd316; +} +mov.b32 r293, {rs24, rs24}; +{ +mul.f16x2 r291, r70, r293; +} +{ +add.f16x2 r294, r270, r291; +} +{ +cvt.rn.f16.f64 rs25, fd315; +} +mov.b32 r299, {rs25, rs25}; +{ +mul.f16x2 r297, r64, r299; +} +{ +add.f16x2 r300, r276, r297; +} +{ +cvt.rn.f16.f64 rs26, fd316; +} +mov.b32 r305, {rs26, rs26}; +{ +mul.f16x2 r303, r67, r305; +} +{ +add.f16x2 r306, r282, r303; +} +mov.f64 fd307, 0dBFE5AC4A670A1CFF; +{ +cvt.rn.f16.f64 rs27, fd307; +} +mov.b32 r311, {rs27, rs27}; +{ +mul.f16x2 r309, r73, r311; +} +{ +add.f16x2 r312, r288, r309; +} +mov.f64 fd308, 0dBFE78B0CDEE73E0F; +{ +cvt.rn.f16.f64 rs28, fd308; +} +mov.b32 r317, {rs28, rs28}; +{ +mul.f16x2 r315, r82, r317; +} +{ +add.f16x2 r318, r294, r315; +} +{ +cvt.rn.f16.f64 rs29, fd307; +} +mov.b32 r323, {rs29, rs29}; +{ +mul.f16x2 r321, r76, r323; +} +{ +add.f16x2 r324, r300, r321; +} +{ +cvt.rn.f16.f64 rs30, fd308; +} +mov.b32 r329, {rs30, rs30}; +{ +mul.f16x2 r327, r79, r329; +} +{ +add.f16x2 r330, r306, r327; +} +mov.f64 fd299, 0dBFEC24A622E3E9F9; +{ +cvt.rn.f16.f64 rs31, fd299; +} +mov.b32 r335, {rs31, rs31}; +{ +mul.f16x2 r333, r85, r335; +} +{ +add.f16x2 r336, r312, r333; +} +mov.f64 fd300, 0dBFDE75EC0DED7BEE; +{ +cvt.rn.f16.f64 rs32, fd300; +} +mov.b32 r341, {rs32, rs32}; +{ +mul.f16x2 r339, r94, r341; +} +{ +add.f16x2 r342, r318, r339; +} +{ +cvt.rn.f16.f64 rs33, fd299; +} +mov.b32 r347, {rs33, rs33}; +{ +mul.f16x2 r345, r88, r347; +} +{ +add.f16x2 r348, r324, r345; +} 
+{ +cvt.rn.f16.f64 rs34, fd300; +} +mov.b32 r353, {rs34, rs34}; +{ +mul.f16x2 r351, r91, r353; +} +{ +add.f16x2 r354, r330, r351; +} +mov.f64 fd291, 0dBFEF90459484F2B2; +{ +cvt.rn.f16.f64 rs35, fd291; +} +mov.b32 r359, {rs35, rs35}; +{ +mul.f16x2 r357, r97, r359; +} +{ +add.f16x2 r360, r336, r357; +} +mov.f64 fd292, 0dBFC5116F7F2D58C5; +{ +cvt.rn.f16.f64 rs36, fd292; +} +mov.b32 r365, {rs36, rs36}; +{ +mul.f16x2 r363, r106, r365; +} +{ +add.f16x2 r366, r342, r363; +} +{ +cvt.rn.f16.f64 rs37, fd291; +} +mov.b32 r371, {rs37, rs37}; +{ +mul.f16x2 r369, r100, r371; +} +{ +add.f16x2 r372, r348, r369; +} +{ +cvt.rn.f16.f64 rs38, fd292; +} +mov.b32 r377, {rs38, rs38}; +{ +mul.f16x2 r375, r103, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +sub.f16x2 %2, r360, r366; +} +{ +add.f16x2 %3, r372, r378; +} +{ +add.f16x2 %36, r360, r366; +} +{ +sub.f16x2 %37, r372, r378; +} +cvt.rn.f16.s32 rs39, r2004; +mov.b32 r405, {rs39, rs39}; +cvt.rn.f16.s32 rs40, r2004; +mov.b32 r417, {rs40, rs40}; +{ +cvt.rn.f16.f64 rs41, fd303; +} +mov.b32 r397, {rs41, rs41}; +{ +mul.f16x2 r395, r1, r397; +} +{ +add.f16x2 r398, %38, r395; +} +{ +cvt.rn.f16.f64 rs42, fd272; +} +mov.b32 r403, {rs42, rs42}; +{ +mul.f16x2 r401, r10, r403; +} +{ +add.f16x2 r404, r405, r401; +} +{ +cvt.rn.f16.f64 rs43, fd303; +} +mov.b32 r409, {rs43, rs43}; +{ +mul.f16x2 r407, r4, r409; +} +{ +add.f16x2 r410, %39, r407; +} +{ +cvt.rn.f16.f64 rs44, fd272; +} +mov.b32 r415, {rs44, rs44}; +{ +mul.f16x2 r413, r7, r415; +} +{ +add.f16x2 r416, r417, r413; +} +{ +cvt.rn.f16.f64 rs45, fd319; +} +mov.b32 r421, {rs45, rs45}; +{ +mul.f16x2 r419, r13, r421; +} +{ +add.f16x2 r422, r398, r419; +} +{ +cvt.rn.f16.f64 rs46, fd240; +} +mov.b32 r427, {rs46, rs46}; +{ +mul.f16x2 r425, r22, r427; +} +{ +add.f16x2 r428, r404, r425; +} +{ +cvt.rn.f16.f64 rs47, fd319; +} +mov.b32 r433, {rs47, rs47}; +{ +mul.f16x2 r431, r16, r433; +} +{ +add.f16x2 r434, r410, r431; +} +{ +cvt.rn.f16.f64 rs48, fd240; +} +mov.b32 r439, {rs48, rs48}; +{ +mul.f16x2 r437, 
r19, r439; +} +{ +add.f16x2 r440, r416, r437; +} +{ +cvt.rn.f16.f64 rs49, fd315; +} +mov.b32 r445, {rs49, rs49}; +{ +mul.f16x2 r443, r25, r445; +} +{ +add.f16x2 r446, r422, r443; +} +{ +cvt.rn.f16.f64 rs50, fd316; +} +mov.b32 r451, {rs50, rs50}; +{ +mul.f16x2 r449, r34, r451; +} +{ +add.f16x2 r452, r428, r449; +} +{ +cvt.rn.f16.f64 rs51, fd315; +} +mov.b32 r457, {rs51, rs51}; +{ +mul.f16x2 r455, r28, r457; +} +{ +add.f16x2 r458, r434, r455; +} +{ +cvt.rn.f16.f64 rs52, fd316; +} +mov.b32 r463, {rs52, rs52}; +{ +mul.f16x2 r461, r31, r463; +} +{ +add.f16x2 r464, r440, r461; +} +{ +cvt.rn.f16.f64 rs53, fd299; +} +mov.b32 r469, {rs53, rs53}; +{ +mul.f16x2 r467, r37, r469; +} +{ +add.f16x2 r470, r446, r467; +} +{ +cvt.rn.f16.f64 rs54, fd300; +} +mov.b32 r475, {rs54, rs54}; +{ +mul.f16x2 r473, r46, r475; +} +{ +add.f16x2 r476, r452, r473; +} +{ +cvt.rn.f16.f64 rs55, fd299; +} +mov.b32 r481, {rs55, rs55}; +{ +mul.f16x2 r479, r40, r481; +} +{ +add.f16x2 r482, r458, r479; +} +{ +cvt.rn.f16.f64 rs56, fd300; +} +mov.b32 r487, {rs56, rs56}; +{ +mul.f16x2 r485, r43, r487; +} +{ +add.f16x2 r488, r464, r485; +} +{ +cvt.rn.f16.f64 rs57, fd291; +} +mov.b32 r493, {rs57, rs57}; +{ +mul.f16x2 r491, r49, r493; +} +{ +add.f16x2 r494, r470, r491; +} +mov.f64 fd276, 0d3FC5116F7F2D58C5; +{ +cvt.rn.f16.f64 rs58, fd276; +} +mov.b32 r499, {rs58, rs58}; +{ +mul.f16x2 r497, r58, r499; +} +{ +add.f16x2 r500, r476, r497; +} +{ +cvt.rn.f16.f64 rs59, fd291; +} +mov.b32 r505, {rs59, rs59}; +{ +mul.f16x2 r503, r52, r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs60, fd276; +} +mov.b32 r511, {rs60, rs60}; +{ +mul.f16x2 r509, r55, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs61, fd307; +} +mov.b32 r517, {rs61, rs61}; +{ +mul.f16x2 r515, r61, r517; +} +{ +add.f16x2 r518, r494, r515; +} +mov.f64 fd188, 0d3FE78B0CDEE73E0F; +{ +cvt.rn.f16.f64 rs62, fd188; +} +mov.b32 r523, {rs62, rs62}; +{ +mul.f16x2 r521, r70, r523; +} +{ +add.f16x2 r524, r500, r521; +} +{ 
+cvt.rn.f16.f64 rs63, fd307; +} +mov.b32 r529, {rs63, rs63}; +{ +mul.f16x2 r527, r64, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs64, fd188; +} +mov.b32 r535, {rs64, rs64}; +{ +mul.f16x2 r533, r67, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs65, fd323; +} +mov.b32 r541, {rs65, rs65}; +{ +mul.f16x2 r539, r73, r541; +} +{ +add.f16x2 r542, r518, r539; +} +mov.f64 fd224, 0d3FEFE40529A542AA; +{ +cvt.rn.f16.f64 rs66, fd224; +} +mov.b32 r547, {rs66, rs66}; +{ +mul.f16x2 r545, r82, r547; +} +{ +add.f16x2 r548, r524, r545; +} +{ +cvt.rn.f16.f64 rs67, fd323; +} +mov.b32 r553, {rs67, rs67}; +{ +mul.f16x2 r551, r76, r553; +} +{ +add.f16x2 r554, r530, r551; +} +{ +cvt.rn.f16.f64 rs68, fd224; +} +mov.b32 r559, {rs68, rs68}; +{ +mul.f16x2 r557, r79, r559; +} +{ +add.f16x2 r560, r536, r557; +} +{ +cvt.rn.f16.f64 rs69, fd311; +} +mov.b32 r565, {rs69, rs69}; +{ +mul.f16x2 r563, r85, r565; +} +{ +add.f16x2 r566, r542, r563; +} +mov.f64 fd312, 0d3FEACA115AAE3DE4; +{ +cvt.rn.f16.f64 rs70, fd312; +} +mov.b32 r571, {rs70, rs70}; +{ +mul.f16x2 r569, r94, r571; +} +{ +add.f16x2 r572, r548, r569; +} +{ +cvt.rn.f16.f64 rs71, fd311; +} +mov.b32 r577, {rs71, rs71}; +{ +mul.f16x2 r575, r88, r577; +} +{ +add.f16x2 r578, r554, r575; +} +{ +cvt.rn.f16.f64 rs72, fd312; +} +mov.b32 r583, {rs72, rs72}; +{ +mul.f16x2 r581, r91, r583; +} +{ +add.f16x2 r584, r560, r581; +} +{ +cvt.rn.f16.f64 rs73, fd295; +} +mov.b32 r589, {rs73, rs73}; +{ +mul.f16x2 r587, r97, r589; +} +{ +add.f16x2 r590, r566, r587; +} +mov.f64 fd296, 0d3FD4C7E04850CFAA; +{ +cvt.rn.f16.f64 rs74, fd296; +} +mov.b32 r595, {rs74, rs74}; +{ +mul.f16x2 r593, r106, r595; +} +{ +add.f16x2 r596, r572, r593; +} +{ +cvt.rn.f16.f64 rs75, fd295; +} +mov.b32 r601, {rs75, rs75}; +{ +mul.f16x2 r599, r100, r601; +} +{ +add.f16x2 r602, r578, r599; +} +{ +cvt.rn.f16.f64 rs76, fd296; +} +mov.b32 r607, {rs76, rs76}; +{ +mul.f16x2 r605, r103, r607; +} +{ +add.f16x2 r608, r584, r605; +} +{ +sub.f16x2 %4, r590, r596; 
+} +{ +add.f16x2 %5, r602, r608; +} +{ +add.f16x2 %34, r590, r596; +} +{ +sub.f16x2 %35, r602, r608; +} +cvt.rn.f16.s32 rs77, r2004; +mov.b32 r635, {rs77, rs77}; +cvt.rn.f16.s32 rs78, r2004; +mov.b32 r647, {rs78, rs78}; +{ +cvt.rn.f16.f64 rs79, fd311; +} +mov.b32 r627, {rs79, rs79}; +{ +mul.f16x2 r625, r1, r627; +} +{ +add.f16x2 r628, %38, r625; +} +{ +cvt.rn.f16.f64 rs80, fd76; +} +mov.b32 r633, {rs80, rs80}; +{ +mul.f16x2 r631, r10, r633; +} +{ +add.f16x2 r634, r635, r631; +} +{ +cvt.rn.f16.f64 rs81, fd311; +} +mov.b32 r639, {rs81, rs81}; +{ +mul.f16x2 r637, r4, r639; +} +{ +add.f16x2 r640, %39, r637; +} +{ +cvt.rn.f16.f64 rs82, fd76; +} +mov.b32 r645, {rs82, rs82}; +{ +mul.f16x2 r643, r7, r645; +} +{ +add.f16x2 r646, r647, r643; +} +{ +cvt.rn.f16.f64 rs83, fd315; +} +mov.b32 r651, {rs83, rs83}; +{ +mul.f16x2 r649, r13, r651; +} +{ +add.f16x2 r652, r628, r649; +} +{ +cvt.rn.f16.f64 rs84, fd316; +} +mov.b32 r657, {rs84, rs84}; +{ +mul.f16x2 r655, r22, r657; +} +{ +add.f16x2 r658, r634, r655; +} +{ +cvt.rn.f16.f64 rs85, fd315; +} +mov.b32 r663, {rs85, rs85}; +{ +mul.f16x2 r661, r16, r663; +} +{ +add.f16x2 r664, r640, r661; +} +{ +cvt.rn.f16.f64 rs86, fd316; +} +mov.b32 r669, {rs86, rs86}; +{ +mul.f16x2 r667, r19, r669; +} +{ +add.f16x2 r670, r646, r667; +} +{ +cvt.rn.f16.f64 rs87, fd291; +} +mov.b32 r675, {rs87, rs87}; +{ +mul.f16x2 r673, r25, r675; +} +{ +add.f16x2 r676, r652, r673; +} +{ +cvt.rn.f16.f64 rs88, fd292; +} +mov.b32 r681, {rs88, rs88}; +{ +mul.f16x2 r679, r34, r681; +} +{ +add.f16x2 r682, r658, r679; +} +{ +cvt.rn.f16.f64 rs89, fd291; +} +mov.b32 r687, {rs89, rs89}; +{ +mul.f16x2 r685, r28, r687; +} +{ +add.f16x2 r688, r664, r685; +} +{ +cvt.rn.f16.f64 rs90, fd292; +} +mov.b32 r693, {rs90, rs90}; +{ +mul.f16x2 r691, r31, r693; +} +{ +add.f16x2 r694, r670, r691; +} +{ +cvt.rn.f16.f64 rs91, fd307; +} +mov.b32 r699, {rs91, rs91}; +{ +mul.f16x2 r697, r37, r699; +} +{ +add.f16x2 r700, r676, r697; +} +{ +cvt.rn.f16.f64 rs92, fd188; +} +mov.b32 r705, {rs92, 
rs92}; +{ +mul.f16x2 r703, r46, r705; +} +{ +add.f16x2 r706, r682, r703; +} +{ +cvt.rn.f16.f64 rs93, fd307; +} +mov.b32 r711, {rs93, rs93}; +{ +mul.f16x2 r709, r40, r711; +} +{ +add.f16x2 r712, r688, r709; +} +{ +cvt.rn.f16.f64 rs94, fd188; +} +mov.b32 r717, {rs94, rs94}; +{ +mul.f16x2 r715, r43, r717; +} +{ +add.f16x2 r718, r694, r715; +} +{ +cvt.rn.f16.f64 rs95, fd319; +} +mov.b32 r723, {rs95, rs95}; +{ +mul.f16x2 r721, r49, r723; +} +{ +add.f16x2 r724, r700, r721; +} +mov.f64 fd320, 0d3FEF0553B4DE2E18; +{ +cvt.rn.f16.f64 rs96, fd320; +} +mov.b32 r729, {rs96, rs96}; +{ +mul.f16x2 r727, r58, r729; +} +{ +add.f16x2 r730, r706, r727; +} +{ +cvt.rn.f16.f64 rs97, fd319; +} +mov.b32 r735, {rs97, rs97}; +{ +mul.f16x2 r733, r52, r735; +} +{ +add.f16x2 r736, r712, r733; +} +{ +cvt.rn.f16.f64 rs98, fd320; +} +mov.b32 r741, {rs98, rs98}; +{ +mul.f16x2 r739, r55, r741; +} +{ +add.f16x2 r742, r718, r739; +} +{ +cvt.rn.f16.f64 rs99, fd295; +} +mov.b32 r747, {rs99, rs99}; +{ +mul.f16x2 r745, r61, r747; +} +{ +add.f16x2 r748, r724, r745; +} +{ +cvt.rn.f16.f64 rs100, fd296; +} +mov.b32 r753, {rs100, rs100}; +{ +mul.f16x2 r751, r70, r753; +} +{ +add.f16x2 r754, r730, r751; +} +{ +cvt.rn.f16.f64 rs101, fd295; +} +mov.b32 r759, {rs101, rs101}; +{ +mul.f16x2 r757, r64, r759; +} +{ +add.f16x2 r760, r736, r757; +} +{ +cvt.rn.f16.f64 rs102, fd296; +} +mov.b32 r765, {rs102, rs102}; +{ +mul.f16x2 r763, r67, r765; +} +{ +add.f16x2 r766, r742, r763; +} +{ +cvt.rn.f16.f64 rs103, fd303; +} +mov.b32 r771, {rs103, rs103}; +{ +mul.f16x2 r769, r73, r771; +} +{ +add.f16x2 r772, r748, r769; +} +{ +cvt.rn.f16.f64 rs104, fd272; +} +mov.b32 r777, {rs104, rs104}; +{ +mul.f16x2 r775, r82, r777; +} +{ +add.f16x2 r778, r754, r775; +} +{ +cvt.rn.f16.f64 rs105, fd303; +} +mov.b32 r783, {rs105, rs105}; +{ +mul.f16x2 r781, r76, r783; +} +{ +add.f16x2 r784, r760, r781; +} +{ +cvt.rn.f16.f64 rs106, fd272; +} +mov.b32 r789, {rs106, rs106}; +{ +mul.f16x2 r787, r79, r789; +} +{ +add.f16x2 r790, r766, r787; +} +{ 
+cvt.rn.f16.f64 rs107, fd323; +} +mov.b32 r795, {rs107, rs107}; +{ +mul.f16x2 r793, r85, r795; +} +{ +add.f16x2 r796, r772, r793; +} +{ +cvt.rn.f16.f64 rs108, fd324; +} +mov.b32 r801, {rs108, rs108}; +{ +mul.f16x2 r799, r94, r801; +} +{ +add.f16x2 r802, r778, r799; +} +{ +cvt.rn.f16.f64 rs109, fd323; +} +mov.b32 r807, {rs109, rs109}; +{ +mul.f16x2 r805, r88, r807; +} +{ +add.f16x2 r808, r784, r805; +} +{ +cvt.rn.f16.f64 rs110, fd324; +} +mov.b32 r813, {rs110, rs110}; +{ +mul.f16x2 r811, r91, r813; +} +{ +add.f16x2 r814, r790, r811; +} +{ +cvt.rn.f16.f64 rs111, fd299; +} +mov.b32 r819, {rs111, rs111}; +{ +mul.f16x2 r817, r97, r819; +} +{ +add.f16x2 r820, r796, r817; +} +{ +cvt.rn.f16.f64 rs112, fd300; +} +mov.b32 r825, {rs112, rs112}; +{ +mul.f16x2 r823, r106, r825; +} +{ +add.f16x2 r826, r802, r823; +} +{ +cvt.rn.f16.f64 rs113, fd299; +} +mov.b32 r831, {rs113, rs113}; +{ +mul.f16x2 r829, r100, r831; +} +{ +add.f16x2 r832, r808, r829; +} +{ +cvt.rn.f16.f64 rs114, fd300; +} +mov.b32 r837, {rs114, rs114}; +{ +mul.f16x2 r835, r103, r837; +} +{ +add.f16x2 r838, r814, r835; +} +{ +sub.f16x2 %6, r820, r826; +} +{ +add.f16x2 %7, r832, r838; +} +{ +add.f16x2 %32, r820, r826; +} +{ +sub.f16x2 %33, r832, r838; +} +cvt.rn.f16.s32 rs115, r2004; +mov.b32 r865, {rs115, rs115}; +cvt.rn.f16.s32 rs116, r2004; +mov.b32 r877, {rs116, rs116}; +{ +cvt.rn.f16.f64 rs117, fd319; +} +mov.b32 r857, {rs117, rs117}; +{ +mul.f16x2 r855, r1, r857; +} +{ +add.f16x2 r858, %38, r855; +} +{ +cvt.rn.f16.f64 rs118, fd240; +} +mov.b32 r863, {rs118, rs118}; +{ +mul.f16x2 r861, r10, r863; +} +{ +add.f16x2 r864, r865, r861; +} +{ +cvt.rn.f16.f64 rs119, fd319; +} +mov.b32 r869, {rs119, rs119}; +{ +mul.f16x2 r867, r4, r869; +} +{ +add.f16x2 r870, %39, r867; +} +{ +cvt.rn.f16.f64 rs120, fd240; +} +mov.b32 r875, {rs120, rs120}; +{ +mul.f16x2 r873, r7, r875; +} +{ +add.f16x2 r876, r877, r873; +} +{ +cvt.rn.f16.f64 rs121, fd299; +} +mov.b32 r881, {rs121, rs121}; +{ +mul.f16x2 r879, r13, r881; +} +{ +add.f16x2 
r882, r858, r879; +} +{ +cvt.rn.f16.f64 rs122, fd300; +} +mov.b32 r887, {rs122, rs122}; +{ +mul.f16x2 r885, r22, r887; +} +{ +add.f16x2 r888, r864, r885; +} +{ +cvt.rn.f16.f64 rs123, fd299; +} +mov.b32 r893, {rs123, rs123}; +{ +mul.f16x2 r891, r16, r893; +} +{ +add.f16x2 r894, r870, r891; +} +{ +cvt.rn.f16.f64 rs124, fd300; +} +mov.b32 r899, {rs124, rs124}; +{ +mul.f16x2 r897, r19, r899; +} +{ +add.f16x2 r900, r876, r897; +} +{ +cvt.rn.f16.f64 rs125, fd307; +} +mov.b32 r905, {rs125, rs125}; +{ +mul.f16x2 r903, r25, r905; +} +{ +add.f16x2 r906, r882, r903; +} +{ +cvt.rn.f16.f64 rs126, fd188; +} +mov.b32 r911, {rs126, rs126}; +{ +mul.f16x2 r909, r34, r911; +} +{ +add.f16x2 r912, r888, r909; +} +{ +cvt.rn.f16.f64 rs127, fd307; +} +mov.b32 r917, {rs127, rs127}; +{ +mul.f16x2 r915, r28, r917; +} +{ +add.f16x2 r918, r894, r915; +} +{ +cvt.rn.f16.f64 rs128, fd188; +} +mov.b32 r923, {rs128, rs128}; +{ +mul.f16x2 r921, r31, r923; +} +{ +add.f16x2 r924, r900, r921; +} +{ +cvt.rn.f16.f64 rs129, fd311; +} +mov.b32 r929, {rs129, rs129}; +{ +mul.f16x2 r927, r37, r929; +} +{ +add.f16x2 r930, r906, r927; +} +{ +cvt.rn.f16.f64 rs130, fd312; +} +mov.b32 r935, {rs130, rs130}; +{ +mul.f16x2 r933, r46, r935; +} +{ +add.f16x2 r936, r912, r933; +} +{ +cvt.rn.f16.f64 rs131, fd311; +} +mov.b32 r941, {rs131, rs131}; +{ +mul.f16x2 r939, r40, r941; +} +{ +add.f16x2 r942, r918, r939; +} +{ +cvt.rn.f16.f64 rs132, fd312; +} +mov.b32 r947, {rs132, rs132}; +{ +mul.f16x2 r945, r43, r947; +} +{ +add.f16x2 r948, r924, r945; +} +{ +cvt.rn.f16.f64 rs133, fd295; +} +mov.b32 r953, {rs133, rs133}; +{ +mul.f16x2 r951, r49, r953; +} +{ +add.f16x2 r954, r930, r951; +} +{ +cvt.rn.f16.f64 rs134, fd160; +} +mov.b32 r959, {rs134, rs134}; +{ +mul.f16x2 r957, r58, r959; +} +{ +add.f16x2 r960, r936, r957; +} +{ +cvt.rn.f16.f64 rs135, fd295; +} +mov.b32 r965, {rs135, rs135}; +{ +mul.f16x2 r963, r52, r965; +} +{ +add.f16x2 r966, r942, r963; +} +{ +cvt.rn.f16.f64 rs136, fd160; +} +mov.b32 r971, {rs136, rs136}; +{ 
+mul.f16x2 r969, r55, r971; +} +{ +add.f16x2 r972, r948, r969; +} +{ +cvt.rn.f16.f64 rs137, fd323; +} +mov.b32 r977, {rs137, rs137}; +{ +mul.f16x2 r975, r61, r977; +} +{ +add.f16x2 r978, r954, r975; +} +{ +cvt.rn.f16.f64 rs138, fd324; +} +mov.b32 r983, {rs138, rs138}; +{ +mul.f16x2 r981, r70, r983; +} +{ +add.f16x2 r984, r960, r981; +} +{ +cvt.rn.f16.f64 rs139, fd323; +} +mov.b32 r989, {rs139, rs139}; +{ +mul.f16x2 r987, r64, r989; +} +{ +add.f16x2 r990, r966, r987; +} +{ +cvt.rn.f16.f64 rs140, fd324; +} +mov.b32 r995, {rs140, rs140}; +{ +mul.f16x2 r993, r67, r995; +} +{ +add.f16x2 r996, r972, r993; +} +{ +cvt.rn.f16.f64 rs141, fd291; +} +mov.b32 r1001, {rs141, rs141}; +{ +mul.f16x2 r999, r73, r1001; +} +{ +add.f16x2 r1002, r978, r999; +} +{ +cvt.rn.f16.f64 rs142, fd292; +} +mov.b32 r1007, {rs142, rs142}; +{ +mul.f16x2 r1005, r82, r1007; +} +{ +add.f16x2 r1008, r984, r1005; +} +{ +cvt.rn.f16.f64 rs143, fd291; +} +mov.b32 r1013, {rs143, rs143}; +{ +mul.f16x2 r1011, r76, r1013; +} +{ +add.f16x2 r1014, r990, r1011; +} +{ +cvt.rn.f16.f64 rs144, fd292; +} +mov.b32 r1019, {rs144, rs144}; +{ +mul.f16x2 r1017, r79, r1019; +} +{ +add.f16x2 r1020, r996, r1017; +} +{ +cvt.rn.f16.f64 rs145, fd315; +} +mov.b32 r1025, {rs145, rs145}; +{ +mul.f16x2 r1023, r85, r1025; +} +{ +add.f16x2 r1026, r1002, r1023; +} +mov.f64 fd268, 0d3FED4E03DD110B08; +{ +cvt.rn.f16.f64 rs146, fd268; +} +mov.b32 r1031, {rs146, rs146}; +{ +mul.f16x2 r1029, r94, r1031; +} +{ +add.f16x2 r1032, r1008, r1029; +} +{ +cvt.rn.f16.f64 rs147, fd315; +} +mov.b32 r1037, {rs147, rs147}; +{ +mul.f16x2 r1035, r88, r1037; +} +{ +add.f16x2 r1038, r1014, r1035; +} +{ +cvt.rn.f16.f64 rs148, fd268; +} +mov.b32 r1043, {rs148, rs148}; +{ +mul.f16x2 r1041, r91, r1043; +} +{ +add.f16x2 r1044, r1020, r1041; +} +{ +cvt.rn.f16.f64 rs149, fd303; +} +mov.b32 r1049, {rs149, rs149}; +{ +mul.f16x2 r1047, r97, r1049; +} +{ +add.f16x2 r1050, r1026, r1047; +} +mov.f64 fd304, 0d3FE3A7A16B394423; +{ +cvt.rn.f16.f64 rs150, fd304; +} +mov.b32 
r1055, {rs150, rs150}; +{ +mul.f16x2 r1053, r106, r1055; +} +{ +add.f16x2 r1056, r1032, r1053; +} +{ +cvt.rn.f16.f64 rs151, fd303; +} +mov.b32 r1061, {rs151, rs151}; +{ +mul.f16x2 r1059, r100, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs152, fd304; +} +mov.b32 r1067, {rs152, rs152}; +{ +mul.f16x2 r1065, r103, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +sub.f16x2 %8, r1050, r1056; +} +{ +add.f16x2 %9, r1062, r1068; +} +{ +add.f16x2 %30, r1050, r1056; +} +{ +sub.f16x2 %31, r1062, r1068; +} +cvt.rn.f16.s32 rs153, r2004; +mov.b32 r1095, {rs153, rs153}; +cvt.rn.f16.s32 rs154, r2004; +mov.b32 r1107, {rs154, rs154}; +{ +cvt.rn.f16.f64 rs155, fd323; +} +mov.b32 r1087, {rs155, rs155}; +{ +mul.f16x2 r1085, r1, r1087; +} +{ +add.f16x2 r1088, %38, r1085; +} +{ +cvt.rn.f16.f64 rs156, fd324; +} +mov.b32 r1093, {rs156, rs156}; +{ +mul.f16x2 r1091, r10, r1093; +} +{ +add.f16x2 r1094, r1095, r1091; +} +{ +cvt.rn.f16.f64 rs157, fd323; +} +mov.b32 r1099, {rs157, rs157}; +{ +mul.f16x2 r1097, r4, r1099; +} +{ +add.f16x2 r1100, %39, r1097; +} +{ +cvt.rn.f16.f64 rs158, fd324; +} +mov.b32 r1105, {rs158, rs158}; +{ +mul.f16x2 r1103, r7, r1105; +} +{ +add.f16x2 r1106, r1107, r1103; +} +{ +cvt.rn.f16.f64 rs159, fd291; +} +mov.b32 r1111, {rs159, rs159}; +{ +mul.f16x2 r1109, r13, r1111; +} +{ +add.f16x2 r1112, r1088, r1109; +} +{ +cvt.rn.f16.f64 rs160, fd276; +} +mov.b32 r1117, {rs160, rs160}; +{ +mul.f16x2 r1115, r22, r1117; +} +{ +add.f16x2 r1118, r1094, r1115; +} +{ +cvt.rn.f16.f64 rs161, fd291; +} +mov.b32 r1123, {rs161, rs161}; +{ +mul.f16x2 r1121, r16, r1123; +} +{ +add.f16x2 r1124, r1100, r1121; +} +{ +cvt.rn.f16.f64 rs162, fd276; +} +mov.b32 r1129, {rs162, rs162}; +{ +mul.f16x2 r1127, r19, r1129; +} +{ +add.f16x2 r1130, r1106, r1127; +} +{ +cvt.rn.f16.f64 rs163, fd319; +} +mov.b32 r1135, {rs163, rs163}; +{ +mul.f16x2 r1133, r25, r1135; +} +{ +add.f16x2 r1136, r1112, r1133; +} +{ +cvt.rn.f16.f64 rs164, fd320; +} +mov.b32 r1141, {rs164, rs164}; +{ 
+mul.f16x2 r1139, r34, r1141; +} +{ +add.f16x2 r1142, r1118, r1139; +} +{ +cvt.rn.f16.f64 rs165, fd319; +} +mov.b32 r1147, {rs165, rs165}; +{ +mul.f16x2 r1145, r28, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs166, fd320; +} +mov.b32 r1153, {rs166, rs166}; +{ +mul.f16x2 r1151, r31, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs167, fd295; +} +mov.b32 r1159, {rs167, rs167}; +{ +mul.f16x2 r1157, r37, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +cvt.rn.f16.f64 rs168, fd160; +} +mov.b32 r1165, {rs168, rs168}; +{ +mul.f16x2 r1163, r46, r1165; +} +{ +add.f16x2 r1166, r1142, r1163; +} +{ +cvt.rn.f16.f64 rs169, fd295; +} +mov.b32 r1171, {rs169, rs169}; +{ +mul.f16x2 r1169, r40, r1171; +} +{ +add.f16x2 r1172, r1148, r1169; +} +{ +cvt.rn.f16.f64 rs170, fd160; +} +mov.b32 r1177, {rs170, rs170}; +{ +mul.f16x2 r1175, r43, r1177; +} +{ +add.f16x2 r1178, r1154, r1175; +} +{ +cvt.rn.f16.f64 rs171, fd315; +} +mov.b32 r1183, {rs171, rs171}; +{ +mul.f16x2 r1181, r49, r1183; +} +{ +add.f16x2 r1184, r1160, r1181; +} +{ +cvt.rn.f16.f64 rs172, fd316; +} +mov.b32 r1189, {rs172, rs172}; +{ +mul.f16x2 r1187, r58, r1189; +} +{ +add.f16x2 r1190, r1166, r1187; +} +{ +cvt.rn.f16.f64 rs173, fd315; +} +mov.b32 r1195, {rs173, rs173}; +{ +mul.f16x2 r1193, r52, r1195; +} +{ +add.f16x2 r1196, r1172, r1193; +} +{ +cvt.rn.f16.f64 rs174, fd316; +} +mov.b32 r1201, {rs174, rs174}; +{ +mul.f16x2 r1199, r55, r1201; +} +{ +add.f16x2 r1202, r1178, r1199; +} +{ +cvt.rn.f16.f64 rs175, fd299; +} +mov.b32 r1207, {rs175, rs175}; +{ +mul.f16x2 r1205, r61, r1207; +} +{ +add.f16x2 r1208, r1184, r1205; +} +mov.f64 fd244, 0d3FDE75EC0DED7BEE; +{ +cvt.rn.f16.f64 rs176, fd244; +} +mov.b32 r1213, {rs176, rs176}; +{ +mul.f16x2 r1211, r70, r1213; +} +{ +add.f16x2 r1214, r1190, r1211; +} +{ +cvt.rn.f16.f64 rs177, fd299; +} +mov.b32 r1219, {rs177, rs177}; +{ +mul.f16x2 r1217, r64, r1219; +} +{ +add.f16x2 r1220, r1196, r1217; +} +{ +cvt.rn.f16.f64 rs178, fd244; +} +mov.b32 
r1225, {rs178, rs178}; +{ +mul.f16x2 r1223, r67, r1225; +} +{ +add.f16x2 r1226, r1202, r1223; +} +{ +cvt.rn.f16.f64 rs179, fd311; +} +mov.b32 r1231, {rs179, rs179}; +{ +mul.f16x2 r1229, r73, r1231; +} +{ +add.f16x2 r1232, r1208, r1229; +} +{ +cvt.rn.f16.f64 rs180, fd312; +} +mov.b32 r1237, {rs180, rs180}; +{ +mul.f16x2 r1235, r82, r1237; +} +{ +add.f16x2 r1238, r1214, r1235; +} +{ +cvt.rn.f16.f64 rs181, fd311; +} +mov.b32 r1243, {rs181, rs181}; +{ +mul.f16x2 r1241, r76, r1243; +} +{ +add.f16x2 r1244, r1220, r1241; +} +{ +cvt.rn.f16.f64 rs182, fd312; +} +mov.b32 r1249, {rs182, rs182}; +{ +mul.f16x2 r1247, r79, r1249; +} +{ +add.f16x2 r1250, r1226, r1247; +} +{ +cvt.rn.f16.f64 rs183, fd303; +} +mov.b32 r1255, {rs183, rs183}; +{ +mul.f16x2 r1253, r85, r1255; +} +{ +add.f16x2 r1256, r1232, r1253; +} +{ +cvt.rn.f16.f64 rs184, fd272; +} +mov.b32 r1261, {rs184, rs184}; +{ +mul.f16x2 r1259, r94, r1261; +} +{ +add.f16x2 r1262, r1238, r1259; +} +{ +cvt.rn.f16.f64 rs185, fd303; +} +mov.b32 r1267, {rs185, rs185}; +{ +mul.f16x2 r1265, r88, r1267; +} +{ +add.f16x2 r1268, r1244, r1265; +} +{ +cvt.rn.f16.f64 rs186, fd272; +} +mov.b32 r1273, {rs186, rs186}; +{ +mul.f16x2 r1271, r91, r1273; +} +{ +add.f16x2 r1274, r1250, r1271; +} +{ +cvt.rn.f16.f64 rs187, fd307; +} +mov.b32 r1279, {rs187, rs187}; +{ +mul.f16x2 r1277, r97, r1279; +} +{ +add.f16x2 r1280, r1256, r1277; +} +{ +cvt.rn.f16.f64 rs188, fd308; +} +mov.b32 r1285, {rs188, rs188}; +{ +mul.f16x2 r1283, r106, r1285; +} +{ +add.f16x2 r1286, r1262, r1283; +} +{ +cvt.rn.f16.f64 rs189, fd307; +} +mov.b32 r1291, {rs189, rs189}; +{ +mul.f16x2 r1289, r100, r1291; +} +{ +add.f16x2 r1292, r1268, r1289; +} +{ +cvt.rn.f16.f64 rs190, fd308; +} +mov.b32 r1297, {rs190, rs190}; +{ +mul.f16x2 r1295, r103, r1297; +} +{ +add.f16x2 r1298, r1274, r1295; +} +{ +sub.f16x2 %10, r1280, r1286; +} +{ +add.f16x2 %11, r1292, r1298; +} +{ +add.f16x2 %28, r1280, r1286; +} +{ +sub.f16x2 %29, r1292, r1298; +} +cvt.rn.f16.s32 rs191, r2004; +mov.b32 r1325, 
{rs191, rs191}; +cvt.rn.f16.s32 rs192, r2004; +mov.b32 r1337, {rs192, rs192}; +{ +cvt.rn.f16.f64 rs193, fd315; +} +mov.b32 r1317, {rs193, rs193}; +{ +mul.f16x2 r1315, r1, r1317; +} +{ +add.f16x2 r1318, %38, r1315; +} +{ +cvt.rn.f16.f64 rs194, fd316; +} +mov.b32 r1323, {rs194, rs194}; +{ +mul.f16x2 r1321, r10, r1323; +} +{ +add.f16x2 r1324, r1325, r1321; +} +{ +cvt.rn.f16.f64 rs195, fd315; +} +mov.b32 r1329, {rs195, rs195}; +{ +mul.f16x2 r1327, r4, r1329; +} +{ +add.f16x2 r1330, %39, r1327; +} +{ +cvt.rn.f16.f64 rs196, fd316; +} +mov.b32 r1335, {rs196, rs196}; +{ +mul.f16x2 r1333, r7, r1335; +} +{ +add.f16x2 r1336, r1337, r1333; +} +{ +cvt.rn.f16.f64 rs197, fd307; +} +mov.b32 r1341, {rs197, rs197}; +{ +mul.f16x2 r1339, r13, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs198, fd188; +} +mov.b32 r1347, {rs198, rs198}; +{ +mul.f16x2 r1345, r22, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs199, fd307; +} +mov.b32 r1353, {rs199, rs199}; +{ +mul.f16x2 r1351, r16, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs200, fd188; +} +mov.b32 r1359, {rs200, rs200}; +{ +mul.f16x2 r1357, r19, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs201, fd295; +} +mov.b32 r1365, {rs201, rs201}; +{ +mul.f16x2 r1363, r25, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 rs202, fd296; +} +mov.b32 r1371, {rs202, rs202}; +{ +mul.f16x2 r1369, r34, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +cvt.rn.f16.f64 rs203, fd295; +} +mov.b32 r1377, {rs203, rs203}; +{ +mul.f16x2 r1375, r28, r1377; +} +{ +add.f16x2 r1378, r1354, r1375; +} +{ +cvt.rn.f16.f64 rs204, fd296; +} +mov.b32 r1383, {rs204, rs204}; +{ +mul.f16x2 r1381, r31, r1383; +} +{ +add.f16x2 r1384, r1360, r1381; +} +{ +cvt.rn.f16.f64 rs205, fd323; +} +mov.b32 r1389, {rs205, rs205}; +{ +mul.f16x2 r1387, r37, r1389; +} +{ +add.f16x2 r1390, r1366, r1387; +} +{ +cvt.rn.f16.f64 rs206, fd324; +} +mov.b32 r1395, {rs206, rs206}; +{ +mul.f16x2 
r1393, r46, r1395; +} +{ +add.f16x2 r1396, r1372, r1393; +} +{ +cvt.rn.f16.f64 rs207, fd323; +} +mov.b32 r1401, {rs207, rs207}; +{ +mul.f16x2 r1399, r40, r1401; +} +{ +add.f16x2 r1402, r1378, r1399; +} +{ +cvt.rn.f16.f64 rs208, fd324; +} +mov.b32 r1407, {rs208, rs208}; +{ +mul.f16x2 r1405, r43, r1407; +} +{ +add.f16x2 r1408, r1384, r1405; +} +{ +cvt.rn.f16.f64 rs209, fd299; +} +mov.b32 r1413, {rs209, rs209}; +{ +mul.f16x2 r1411, r49, r1413; +} +{ +add.f16x2 r1414, r1390, r1411; +} +{ +cvt.rn.f16.f64 rs210, fd244; +} +mov.b32 r1419, {rs210, rs210}; +{ +mul.f16x2 r1417, r58, r1419; +} +{ +add.f16x2 r1420, r1396, r1417; +} +{ +cvt.rn.f16.f64 rs211, fd299; +} +mov.b32 r1425, {rs211, rs211}; +{ +mul.f16x2 r1423, r52, r1425; +} +{ +add.f16x2 r1426, r1402, r1423; +} +{ +cvt.rn.f16.f64 rs212, fd244; +} +mov.b32 r1431, {rs212, rs212}; +{ +mul.f16x2 r1429, r55, r1431; +} +{ +add.f16x2 r1432, r1408, r1429; +} +{ +cvt.rn.f16.f64 rs213, fd303; +} +mov.b32 r1437, {rs213, rs213}; +{ +mul.f16x2 r1435, r61, r1437; +} +{ +add.f16x2 r1438, r1414, r1435; +} +{ +cvt.rn.f16.f64 rs214, fd304; +} +mov.b32 r1443, {rs214, rs214}; +{ +mul.f16x2 r1441, r70, r1443; +} +{ +add.f16x2 r1444, r1420, r1441; +} +{ +cvt.rn.f16.f64 rs215, fd303; +} +mov.b32 r1449, {rs215, rs215}; +{ +mul.f16x2 r1447, r64, r1449; +} +{ +add.f16x2 r1450, r1426, r1447; +} +{ +cvt.rn.f16.f64 rs216, fd304; +} +mov.b32 r1455, {rs216, rs216}; +{ +mul.f16x2 r1453, r67, r1455; +} +{ +add.f16x2 r1456, r1432, r1453; +} +{ +cvt.rn.f16.f64 rs217, fd319; +} +mov.b32 r1461, {rs217, rs217}; +{ +mul.f16x2 r1459, r73, r1461; +} +{ +add.f16x2 r1462, r1438, r1459; +} +{ +cvt.rn.f16.f64 rs218, fd240; +} +mov.b32 r1467, {rs218, rs218}; +{ +mul.f16x2 r1465, r82, r1467; +} +{ +add.f16x2 r1468, r1444, r1465; +} +{ +cvt.rn.f16.f64 rs219, fd319; +} +mov.b32 r1473, {rs219, rs219}; +{ +mul.f16x2 r1471, r76, r1473; +} +{ +add.f16x2 r1474, r1450, r1471; +} +{ +cvt.rn.f16.f64 rs220, fd240; +} +mov.b32 r1479, {rs220, rs220}; +{ +mul.f16x2 r1477, r79, 
r1479; +} +{ +add.f16x2 r1480, r1456, r1477; +} +{ +cvt.rn.f16.f64 rs221, fd291; +} +mov.b32 r1485, {rs221, rs221}; +{ +mul.f16x2 r1483, r85, r1485; +} +{ +add.f16x2 r1486, r1462, r1483; +} +{ +cvt.rn.f16.f64 rs222, fd276; +} +mov.b32 r1491, {rs222, rs222}; +{ +mul.f16x2 r1489, r94, r1491; +} +{ +add.f16x2 r1492, r1468, r1489; +} +{ +cvt.rn.f16.f64 rs223, fd291; +} +mov.b32 r1497, {rs223, rs223}; +{ +mul.f16x2 r1495, r88, r1497; +} +{ +add.f16x2 r1498, r1474, r1495; +} +{ +cvt.rn.f16.f64 rs224, fd276; +} +mov.b32 r1503, {rs224, rs224}; +{ +mul.f16x2 r1501, r91, r1503; +} +{ +add.f16x2 r1504, r1480, r1501; +} +{ +cvt.rn.f16.f64 rs225, fd311; +} +mov.b32 r1509, {rs225, rs225}; +{ +mul.f16x2 r1507, r97, r1509; +} +{ +add.f16x2 r1510, r1486, r1507; +} +{ +cvt.rn.f16.f64 rs226, fd312; +} +mov.b32 r1515, {rs226, rs226}; +{ +mul.f16x2 r1513, r106, r1515; +} +{ +add.f16x2 r1516, r1492, r1513; +} +{ +cvt.rn.f16.f64 rs227, fd311; +} +mov.b32 r1521, {rs227, rs227}; +{ +mul.f16x2 r1519, r100, r1521; +} +{ +add.f16x2 r1522, r1498, r1519; +} +{ +cvt.rn.f16.f64 rs228, fd312; +} +mov.b32 r1527, {rs228, rs228}; +{ +mul.f16x2 r1525, r103, r1527; +} +{ +add.f16x2 r1528, r1504, r1525; +} +{ +sub.f16x2 %12, r1510, r1516; +} +{ +add.f16x2 %13, r1522, r1528; +} +{ +add.f16x2 %26, r1510, r1516; +} +{ +sub.f16x2 %27, r1522, r1528; +} +cvt.rn.f16.s32 rs229, r2004; +mov.b32 r1555, {rs229, rs229}; +cvt.rn.f16.s32 rs230, r2004; +mov.b32 r1567, {rs230, rs230}; +{ +cvt.rn.f16.f64 rs231, fd307; +} +mov.b32 r1547, {rs231, rs231}; +{ +mul.f16x2 r1545, r1, r1547; +} +{ +add.f16x2 r1548, %38, r1545; +} +{ +cvt.rn.f16.f64 rs232, fd308; +} +mov.b32 r1553, {rs232, rs232}; +{ +mul.f16x2 r1551, r10, r1553; +} +{ +add.f16x2 r1554, r1555, r1551; +} +{ +cvt.rn.f16.f64 rs233, fd307; +} +mov.b32 r1559, {rs233, rs233}; +{ +mul.f16x2 r1557, r4, r1559; +} +{ +add.f16x2 r1560, %39, r1557; +} +{ +cvt.rn.f16.f64 rs234, fd308; +} +mov.b32 r1565, {rs234, rs234}; +{ +mul.f16x2 r1563, r7, r1565; +} +{ +add.f16x2 r1566, 
r1567, r1563; +} +{ +cvt.rn.f16.f64 rs235, fd323; +} +mov.b32 r1571, {rs235, rs235}; +{ +mul.f16x2 r1569, r13, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +{ +cvt.rn.f16.f64 rs236, fd224; +} +mov.b32 r1577, {rs236, rs236}; +{ +mul.f16x2 r1575, r22, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs237, fd323; +} +mov.b32 r1583, {rs237, rs237}; +{ +mul.f16x2 r1581, r16, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs238, fd224; +} +mov.b32 r1589, {rs238, rs238}; +{ +mul.f16x2 r1587, r19, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs239, fd303; +} +mov.b32 r1595, {rs239, rs239}; +{ +mul.f16x2 r1593, r25, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +{ +cvt.rn.f16.f64 rs240, fd272; +} +mov.b32 r1601, {rs240, rs240}; +{ +mul.f16x2 r1599, r34, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs241, fd303; +} +mov.b32 r1607, {rs241, rs241}; +{ +mul.f16x2 r1605, r28, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs242, fd272; +} +mov.b32 r1613, {rs242, rs242}; +{ +mul.f16x2 r1611, r31, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs243, fd291; +} +mov.b32 r1619, {rs243, rs243}; +{ +mul.f16x2 r1617, r37, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs244, fd292; +} +mov.b32 r1625, {rs244, rs244}; +{ +mul.f16x2 r1623, r46, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs245, fd291; +} +mov.b32 r1631, {rs245, rs245}; +{ +mul.f16x2 r1629, r40, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs246, fd292; +} +mov.b32 r1637, {rs246, rs246}; +{ +mul.f16x2 r1635, r43, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +cvt.rn.f16.f64 rs247, fd311; +} +mov.b32 r1643, {rs247, rs247}; +{ +mul.f16x2 r1641, r49, r1643; +} +{ +add.f16x2 r1644, r1620, r1641; +} +{ +cvt.rn.f16.f64 rs248, fd312; +} +mov.b32 r1649, {rs248, rs248}; +{ +mul.f16x2 r1647, r58, r1649; +} +{ +add.f16x2 r1650, r1626, 
r1647; +} +{ +cvt.rn.f16.f64 rs249, fd311; +} +mov.b32 r1655, {rs249, rs249}; +{ +mul.f16x2 r1653, r52, r1655; +} +{ +add.f16x2 r1656, r1632, r1653; +} +{ +cvt.rn.f16.f64 rs250, fd312; +} +mov.b32 r1661, {rs250, rs250}; +{ +mul.f16x2 r1659, r55, r1661; +} +{ +add.f16x2 r1662, r1638, r1659; +} +{ +cvt.rn.f16.f64 rs251, fd319; +} +mov.b32 r1667, {rs251, rs251}; +{ +mul.f16x2 r1665, r61, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs252, fd240; +} +mov.b32 r1673, {rs252, rs252}; +{ +mul.f16x2 r1671, r70, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs253, fd319; +} +mov.b32 r1679, {rs253, rs253}; +{ +mul.f16x2 r1677, r64, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs254, fd240; +} +mov.b32 r1685, {rs254, rs254}; +{ +mul.f16x2 r1683, r67, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ +cvt.rn.f16.f64 rs255, fd299; +} +mov.b32 r1691, {rs255, rs255}; +{ +mul.f16x2 r1689, r73, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs256, fd244; +} +mov.b32 r1697, {rs256, rs256}; +{ +mul.f16x2 r1695, r82, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs257, fd299; +} +mov.b32 r1703, {rs257, rs257}; +{ +mul.f16x2 r1701, r76, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs258, fd244; +} +mov.b32 r1709, {rs258, rs258}; +{ +mul.f16x2 r1707, r79, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +{ +cvt.rn.f16.f64 rs259, fd295; +} +mov.b32 r1715, {rs259, rs259}; +{ +mul.f16x2 r1713, r85, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs260, fd296; +} +mov.b32 r1721, {rs260, rs260}; +{ +mul.f16x2 r1719, r94, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs261, fd295; +} +mov.b32 r1727, {rs261, rs261}; +{ +mul.f16x2 r1725, r88, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs262, fd296; +} +mov.b32 r1733, {rs262, rs262}; +{ +mul.f16x2 r1731, r91, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} 
+{ +cvt.rn.f16.f64 rs263, fd315; +} +mov.b32 r1739, {rs263, rs263}; +{ +mul.f16x2 r1737, r97, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs264, fd316; +} +mov.b32 r1745, {rs264, rs264}; +{ +mul.f16x2 r1743, r106, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs265, fd315; +} +mov.b32 r1751, {rs265, rs265}; +{ +mul.f16x2 r1749, r100, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +cvt.rn.f16.f64 rs266, fd316; +} +mov.b32 r1757, {rs266, rs266}; +{ +mul.f16x2 r1755, r103, r1757; +} +{ +add.f16x2 r1758, r1734, r1755; +} +{ +sub.f16x2 %14, r1740, r1746; +} +{ +add.f16x2 %15, r1752, r1758; +} +{ +add.f16x2 %24, r1740, r1746; +} +{ +sub.f16x2 %25, r1752, r1758; +} +cvt.rn.f16.s32 rs267, r2004; +mov.b32 r1785, {rs267, rs267}; +cvt.rn.f16.s32 rs268, r2004; +mov.b32 r1797, {rs268, rs268}; +{ +cvt.rn.f16.f64 rs269, fd299; +} +mov.b32 r1777, {rs269, rs269}; +{ +mul.f16x2 r1775, r1, r1777; +} +{ +add.f16x2 r1778, %38, r1775; +} +{ +cvt.rn.f16.f64 rs270, fd300; +} +mov.b32 r1783, {rs270, rs270}; +{ +mul.f16x2 r1781, r10, r1783; +} +{ +add.f16x2 r1784, r1785, r1781; +} +{ +cvt.rn.f16.f64 rs271, fd299; +} +mov.b32 r1789, {rs271, rs271}; +{ +mul.f16x2 r1787, r4, r1789; +} +{ +add.f16x2 r1790, %39, r1787; +} +{ +cvt.rn.f16.f64 rs272, fd300; +} +mov.b32 r1795, {rs272, rs272}; +{ +mul.f16x2 r1793, r7, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +cvt.rn.f16.f64 rs273, fd311; +} +mov.b32 r1801, {rs273, rs273}; +{ +mul.f16x2 r1799, r13, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs274, fd312; +} +mov.b32 r1807, {rs274, rs274}; +{ +mul.f16x2 r1805, r22, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs275, fd311; +} +mov.b32 r1813, {rs275, rs275}; +{ +mul.f16x2 r1811, r16, r1813; +} +{ +add.f16x2 r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs276, fd312; +} +mov.b32 r1819, {rs276, rs276}; +{ +mul.f16x2 r1817, r19, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ +cvt.rn.f16.f64 rs277, 
fd323; +} +mov.b32 r1825, {rs277, rs277}; +{ +mul.f16x2 r1823, r25, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs278, fd324; +} +mov.b32 r1831, {rs278, rs278}; +{ +mul.f16x2 r1829, r34, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs279, fd323; +} +mov.b32 r1837, {rs279, rs279}; +{ +mul.f16x2 r1835, r28, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs280, fd324; +} +mov.b32 r1843, {rs280, rs280}; +{ +mul.f16x2 r1841, r31, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs281, fd315; +} +mov.b32 r1849, {rs281, rs281}; +{ +mul.f16x2 r1847, r37, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs282, fd268; +} +mov.b32 r1855, {rs282, rs282}; +{ +mul.f16x2 r1853, r46, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs283, fd315; +} +mov.b32 r1861, {rs283, rs283}; +{ +mul.f16x2 r1859, r40, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs284, fd268; +} +mov.b32 r1867, {rs284, rs284}; +{ +mul.f16x2 r1865, r43, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs285, fd303; +} +mov.b32 r1873, {rs285, rs285}; +{ +mul.f16x2 r1871, r49, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs286, fd272; +} +mov.b32 r1879, {rs286, rs286}; +{ +mul.f16x2 r1877, r58, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs287, fd303; +} +mov.b32 r1885, {rs287, rs287}; +{ +mul.f16x2 r1883, r52, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs288, fd272; +} +mov.b32 r1891, {rs288, rs288}; +{ +mul.f16x2 r1889, r55, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs289, fd291; +} +mov.b32 r1897, {rs289, rs289}; +{ +mul.f16x2 r1895, r61, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs290, fd276; +} +mov.b32 r1903, {rs290, rs290}; +{ +mul.f16x2 r1901, r70, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs291, fd291; +} 
+mov.b32 r1909, {rs291, rs291}; +{ +mul.f16x2 r1907, r64, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs292, fd276; +} +mov.b32 r1915, {rs292, rs292}; +{ +mul.f16x2 r1913, r67, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs293, fd295; +} +mov.b32 r1921, {rs293, rs293}; +{ +mul.f16x2 r1919, r73, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs294, fd296; +} +mov.b32 r1927, {rs294, rs294}; +{ +mul.f16x2 r1925, r82, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs295, fd295; +} +mov.b32 r1933, {rs295, rs295}; +{ +mul.f16x2 r1931, r76, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs296, fd296; +} +mov.b32 r1939, {rs296, rs296}; +{ +mul.f16x2 r1937, r79, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs297, fd307; +} +mov.b32 r1945, {rs297, rs297}; +{ +mul.f16x2 r1943, r85, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs298, fd308; +} +mov.b32 r1951, {rs298, rs298}; +{ +mul.f16x2 r1949, r94, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs299, fd307; +} +mov.b32 r1957, {rs299, rs299}; +{ +mul.f16x2 r1955, r88, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs300, fd308; +} +mov.b32 r1963, {rs300, rs300}; +{ +mul.f16x2 r1961, r91, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs301, fd319; +} +mov.b32 r1969, {rs301, rs301}; +{ +mul.f16x2 r1967, r97, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs302, fd320; +} +mov.b32 r1975, {rs302, rs302}; +{ +mul.f16x2 r1973, r106, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs303, fd319; +} +mov.b32 r1981, {rs303, rs303}; +{ +mul.f16x2 r1979, r100, r1981; +} +{ +add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs304, fd320; +} +mov.b32 r1987, {rs304, rs304}; +{ +mul.f16x2 r1985, r103, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +{ +sub.f16x2 %16, r1970, r1976; +} +{ 
+add.f16x2 %17, r1982, r1988; +} +{ +add.f16x2 %22, r1970, r1976; +} +{ +sub.f16x2 %23, r1982, r1988; +} +cvt.rn.f16.s32 rs305, r2004; +mov.b32 r2015, {rs305, rs305}; +cvt.rn.f16.s32 rs306, r2004; +mov.b32 r2027, {rs306, rs306}; +{ +cvt.rn.f16.f64 rs307, fd291; +} +mov.b32 r2007, {rs307, rs307}; +{ +mul.f16x2 r2005, r1, r2007; +} +{ +add.f16x2 r2008, %38, r2005; +} +{ +cvt.rn.f16.f64 rs308, fd292; +} +mov.b32 r2013, {rs308, rs308}; +{ +mul.f16x2 r2011, r10, r2013; +} +{ +add.f16x2 r2014, r2015, r2011; +} +{ +cvt.rn.f16.f64 rs309, fd291; +} +mov.b32 r2019, {rs309, rs309}; +{ +mul.f16x2 r2017, r4, r2019; +} +{ +add.f16x2 r2020, %39, r2017; +} +{ +cvt.rn.f16.f64 rs310, fd292; +} +mov.b32 r2025, {rs310, rs310}; +{ +mul.f16x2 r2023, r7, r2025; +} +{ +add.f16x2 r2026, r2027, r2023; +} +{ +cvt.rn.f16.f64 rs311, fd295; +} +mov.b32 r2031, {rs311, rs311}; +{ +mul.f16x2 r2029, r13, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs312, fd296; +} +mov.b32 r2037, {rs312, rs312}; +{ +mul.f16x2 r2035, r22, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs313, fd295; +} +mov.b32 r2043, {rs313, rs313}; +{ +mul.f16x2 r2041, r16, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs314, fd296; +} +mov.b32 r2049, {rs314, rs314}; +{ +mul.f16x2 r2047, r19, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 rs315, fd299; +} +mov.b32 r2055, {rs315, rs315}; +{ +mul.f16x2 r2053, r25, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs316, fd300; +} +mov.b32 r2061, {rs316, rs316}; +{ +mul.f16x2 r2059, r34, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs317, fd299; +} +mov.b32 r2067, {rs317, rs317}; +{ +mul.f16x2 r2065, r28, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; +} +{ +cvt.rn.f16.f64 rs318, fd300; +} +mov.b32 r2073, {rs318, rs318}; +{ +mul.f16x2 r2071, r31, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs319, fd303; +} +mov.b32 r2079, {rs319, rs319}; +{ 
+mul.f16x2 r2077, r37, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs320, fd304; +} +mov.b32 r2085, {rs320, rs320}; +{ +mul.f16x2 r2083, r46, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs321, fd303; +} +mov.b32 r2091, {rs321, rs321}; +{ +mul.f16x2 r2089, r40, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs322, fd304; +} +mov.b32 r2097, {rs322, rs322}; +{ +mul.f16x2 r2095, r43, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +cvt.rn.f16.f64 rs323, fd307; +} +mov.b32 r2103, {rs323, rs323}; +{ +mul.f16x2 r2101, r49, r2103; +} +{ +add.f16x2 r2104, r2080, r2101; +} +{ +cvt.rn.f16.f64 rs324, fd308; +} +mov.b32 r2109, {rs324, rs324}; +{ +mul.f16x2 r2107, r58, r2109; +} +{ +add.f16x2 r2110, r2086, r2107; +} +{ +cvt.rn.f16.f64 rs325, fd307; +} +mov.b32 r2115, {rs325, rs325}; +{ +mul.f16x2 r2113, r52, r2115; +} +{ +add.f16x2 r2116, r2092, r2113; +} +{ +cvt.rn.f16.f64 rs326, fd308; +} +mov.b32 r2121, {rs326, rs326}; +{ +mul.f16x2 r2119, r55, r2121; +} +{ +add.f16x2 r2122, r2098, r2119; +} +{ +cvt.rn.f16.f64 rs327, fd311; +} +mov.b32 r2127, {rs327, rs327}; +{ +mul.f16x2 r2125, r61, r2127; +} +{ +add.f16x2 r2128, r2104, r2125; +} +{ +cvt.rn.f16.f64 rs328, fd312; +} +mov.b32 r2133, {rs328, rs328}; +{ +mul.f16x2 r2131, r70, r2133; +} +{ +add.f16x2 r2134, r2110, r2131; +} +{ +cvt.rn.f16.f64 rs329, fd311; +} +mov.b32 r2139, {rs329, rs329}; +{ +mul.f16x2 r2137, r64, r2139; +} +{ +add.f16x2 r2140, r2116, r2137; +} +{ +cvt.rn.f16.f64 rs330, fd312; +} +mov.b32 r2145, {rs330, rs330}; +{ +mul.f16x2 r2143, r67, r2145; +} +{ +add.f16x2 r2146, r2122, r2143; +} +{ +cvt.rn.f16.f64 rs331, fd315; +} +mov.b32 r2151, {rs331, rs331}; +{ +mul.f16x2 r2149, r73, r2151; +} +{ +add.f16x2 r2152, r2128, r2149; +} +{ +cvt.rn.f16.f64 rs332, fd316; +} +mov.b32 r2157, {rs332, rs332}; +{ +mul.f16x2 r2155, r82, r2157; +} +{ +add.f16x2 r2158, r2134, r2155; +} +{ +cvt.rn.f16.f64 rs333, fd315; +} +mov.b32 r2163, {rs333, rs333}; +{ +mul.f16x2 
r2161, r76, r2163; +} +{ +add.f16x2 r2164, r2140, r2161; +} +{ +cvt.rn.f16.f64 rs334, fd316; +} +mov.b32 r2169, {rs334, rs334}; +{ +mul.f16x2 r2167, r79, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs335, fd319; +} +mov.b32 r2175, {rs335, rs335}; +{ +mul.f16x2 r2173, r85, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs336, fd320; +} +mov.b32 r2181, {rs336, rs336}; +{ +mul.f16x2 r2179, r94, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs337, fd319; +} +mov.b32 r2187, {rs337, rs337}; +{ +mul.f16x2 r2185, r88, r2187; +} +{ +add.f16x2 r2188, r2164, r2185; +} +{ +cvt.rn.f16.f64 rs338, fd320; +} +mov.b32 r2193, {rs338, rs338}; +{ +mul.f16x2 r2191, r91, r2193; +} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs339, fd323; +} +mov.b32 r2199, {rs339, rs339}; +{ +mul.f16x2 r2197, r97, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs340, fd324; +} +mov.b32 r2205, {rs340, rs340}; +{ +mul.f16x2 r2203, r106, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs341, fd323; +} +mov.b32 r2211, {rs341, rs341}; +{ +mul.f16x2 r2209, r100, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs342, fd324; +} +mov.b32 r2217, {rs342, rs342}; +{ +mul.f16x2 r2215, r103, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +sub.f16x2 %18, r2200, r2206; +} +{ +add.f16x2 %19, r2212, r2218; +} +{ +add.f16x2 %20, r2200, r2206; +} +{ +sub.f16x2 %21, r2212, r2218; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), 
"=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..049802c75b0f9 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp16_inv.hpp.inc @@ -0,0 +1,3592 @@ +#ifndef CUFFTDX_FFT_19_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_19_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<953, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<343>; +.reg .b32 r<2233>; +.reg .f64 fd<325>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %40, %74; +} +{ +add.f16x2 r4, %41, %75; +} +{ +sub.f16x2 r7, %40, %74; +} +{ +sub.f16x2 r10, %41, %75; +} +{ +add.f16x2 r13, %42, %72; +} +{ +add.f16x2 r16, %43, %73; +} +{ +sub.f16x2 r19, %42, %72; +} +{ +sub.f16x2 r22, %43, %73; +} +{ +add.f16x2 r25, %44, %70; +} +{ +add.f16x2 r28, %45, %71; +} +{ +sub.f16x2 r31, %44, %70; +} +{ +sub.f16x2 r34, %45, %71; +} +{ +add.f16x2 r37, %46, %68; +} +{ +add.f16x2 r40, %47, %69; +} +{ +sub.f16x2 r43, %46, %68; +} +{ +sub.f16x2 r46, %47, %69; +} +{ +add.f16x2 r49, %48, %66; +} +{ +add.f16x2 r52, %49, %67; +} +{ +sub.f16x2 r55, %48, %66; +} +{ +sub.f16x2 r58, %49, %67; +} +{ +add.f16x2 r61, %50, %64; +} +{ +add.f16x2 r64, %51, %65; +} +{ +sub.f16x2 r67, %50, %64; +} +{ +sub.f16x2 r70, %51, %65; +} +{ +add.f16x2 r73, %52, %62; +} +{ +add.f16x2 r76, %53, %63; +} +{ +sub.f16x2 r79, %52, %62; +} +{ +sub.f16x2 r82, %53, %63; +} +{ +add.f16x2 r85, %54, %60; +} +{ +add.f16x2 r88, %55, %61; +} +{ +sub.f16x2 r91, %54, %60; +} +{ +sub.f16x2 r94, %55, %61; +} +{ +add.f16x2 r97, %56, %58; +} +{ +add.f16x2 r100, %57, %59; +} +{ +sub.f16x2 r103, %56, %58; +} +{ +sub.f16x2 r106, %57, %59; +} +{ +add.f16x2 r109, %38, r1; +} +{ 
+add.f16x2 r112, %39, r4; +} +{ +add.f16x2 r115, r109, r13; +} +{ +add.f16x2 r118, r112, r16; +} +{ +add.f16x2 r121, r115, r25; +} +{ +add.f16x2 r124, r118, r28; +} +{ +add.f16x2 r127, r121, r37; +} +{ +add.f16x2 r130, r124, r40; +} +{ +add.f16x2 r133, r127, r49; +} +{ +add.f16x2 r136, r130, r52; +} +{ +add.f16x2 r139, r133, r61; +} +{ +add.f16x2 r142, r136, r64; +} +{ +add.f16x2 r145, r139, r73; +} +{ +add.f16x2 r148, r142, r76; +} +{ +add.f16x2 r151, r145, r85; +} +{ +add.f16x2 r154, r148, r88; +} +{ +add.f16x2 %0, r151, r97; +} +{ +add.f16x2 %1, r154, r100; +} +mov.u32 r2004, 0; +cvt.rn.f16.s32 rs1, r2004; +mov.b32 r175, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r2004; +mov.b32 r187, {rs2, rs2}; +mov.f64 fd295, 0d3FEE442285231BE1; +{ +cvt.rn.f16.f64 rs3, fd295; +} +mov.b32 r167, {rs3, rs3}; +{ +mul.f16x2 r165, r1, r167; +} +{ +add.f16x2 r168, %38, r165; +} +mov.f64 fd160, 0d3FD4C7E04850CFAA; +{ +cvt.rn.f16.f64 rs4, fd160; +} +mov.b32 r173, {rs4, rs4}; +{ +mul.f16x2 r171, r10, r173; +} +{ +add.f16x2 r174, r175, r171; +} +{ +cvt.rn.f16.f64 rs5, fd295; +} +mov.b32 r179, {rs5, rs5}; +{ +mul.f16x2 r177, r4, r179; +} +{ +add.f16x2 r180, %39, r177; +} +{ +cvt.rn.f16.f64 rs6, fd160; +} +mov.b32 r185, {rs6, rs6}; +{ +mul.f16x2 r183, r7, r185; +} +{ +add.f16x2 r186, r187, r183; +} +mov.f64 fd303, 0d3FE940A398F9CD23; +{ +cvt.rn.f16.f64 rs7, fd303; +} +mov.b32 r191, {rs7, rs7}; +{ +mul.f16x2 r189, r13, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd272, 0d3FE3A7A16B394423; +{ +cvt.rn.f16.f64 rs8, fd272; +} +mov.b32 r197, {rs8, rs8}; +{ +mul.f16x2 r195, r22, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs9, fd303; +} +mov.b32 r203, {rs9, rs9}; +{ +mul.f16x2 r201, r16, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs10, fd272; +} +mov.b32 r209, {rs10, rs10}; +{ +mul.f16x2 r207, r19, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd311, 0d3FE180996C77C8CA; +{ +cvt.rn.f16.f64 rs11, fd311; +} +mov.b32 r215, {rs11, rs11}; +{ +mul.f16x2 
r213, r25, r215; +} +{ +add.f16x2 r216, r192, r213; +} +mov.f64 fd76, 0d3FEACA115AAE3DE4; +{ +cvt.rn.f16.f64 rs12, fd76; +} +mov.b32 r221, {rs12, rs12}; +{ +mul.f16x2 r219, r34, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs13, fd311; +} +mov.b32 r227, {rs13, rs13}; +{ +mul.f16x2 r225, r28, r227; +} +{ +add.f16x2 r228, r204, r225; +} +{ +cvt.rn.f16.f64 rs14, fd76; +} +mov.b32 r233, {rs14, rs14}; +{ +mul.f16x2 r231, r31, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd319, 0d3FCF6C118574C83E; +{ +cvt.rn.f16.f64 rs15, fd319; +} +mov.b32 r239, {rs15, rs15}; +{ +mul.f16x2 r237, r37, r239; +} +{ +add.f16x2 r240, r216, r237; +} +mov.f64 fd240, 0d3FEF0553B4DE2E18; +{ +cvt.rn.f16.f64 rs16, fd240; +} +mov.b32 r245, {rs16, rs16}; +{ +mul.f16x2 r243, r46, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs17, fd319; +} +mov.b32 r251, {rs17, rs17}; +{ +mul.f16x2 r249, r40, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +cvt.rn.f16.f64 rs18, fd240; +} +mov.b32 r257, {rs18, rs18}; +{ +mul.f16x2 r255, r43, r257; +} +{ +add.f16x2 r258, r234, r255; +} +mov.f64 fd323, 0dBFB523EB8420F5F5; +{ +cvt.rn.f16.f64 rs19, fd323; +} +mov.b32 r263, {rs19, rs19}; +{ +mul.f16x2 r261, r49, r263; +} +{ +add.f16x2 r264, r240, r261; +} +mov.f64 fd324, 0d3FEFE40529A542AA; +{ +cvt.rn.f16.f64 rs20, fd324; +} +mov.b32 r269, {rs20, rs20}; +{ +mul.f16x2 r267, r58, r269; +} +{ +add.f16x2 r270, r246, r267; +} +{ +cvt.rn.f16.f64 rs21, fd323; +} +mov.b32 r275, {rs21, rs21}; +{ +mul.f16x2 r273, r52, r275; +} +{ +add.f16x2 r276, r252, r273; +} +{ +cvt.rn.f16.f64 rs22, fd324; +} +mov.b32 r281, {rs22, rs22}; +{ +mul.f16x2 r279, r55, r281; +} +{ +add.f16x2 r282, r258, r279; +} +mov.f64 fd315, 0dBFD9B560B9F596EA; +{ +cvt.rn.f16.f64 rs23, fd315; +} +mov.b32 r287, {rs23, rs23}; +{ +mul.f16x2 r285, r61, r287; +} +{ +add.f16x2 r288, r264, r285; +} +mov.f64 fd316, 0d3FED4E03DD110B08; +{ +cvt.rn.f16.f64 rs24, fd316; +} +mov.b32 r293, {rs24, rs24}; +{ +mul.f16x2 r291, r70, r293; +} 
+{ +add.f16x2 r294, r270, r291; +} +{ +cvt.rn.f16.f64 rs25, fd315; +} +mov.b32 r299, {rs25, rs25}; +{ +mul.f16x2 r297, r64, r299; +} +{ +add.f16x2 r300, r276, r297; +} +{ +cvt.rn.f16.f64 rs26, fd316; +} +mov.b32 r305, {rs26, rs26}; +{ +mul.f16x2 r303, r67, r305; +} +{ +add.f16x2 r306, r282, r303; +} +mov.f64 fd307, 0dBFE5AC4A670A1CFF; +{ +cvt.rn.f16.f64 rs27, fd307; +} +mov.b32 r311, {rs27, rs27}; +{ +mul.f16x2 r309, r73, r311; +} +{ +add.f16x2 r312, r288, r309; +} +mov.f64 fd308, 0d3FE78B0CDEE73E0F; +{ +cvt.rn.f16.f64 rs28, fd308; +} +mov.b32 r317, {rs28, rs28}; +{ +mul.f16x2 r315, r82, r317; +} +{ +add.f16x2 r318, r294, r315; +} +{ +cvt.rn.f16.f64 rs29, fd307; +} +mov.b32 r323, {rs29, rs29}; +{ +mul.f16x2 r321, r76, r323; +} +{ +add.f16x2 r324, r300, r321; +} +{ +cvt.rn.f16.f64 rs30, fd308; +} +mov.b32 r329, {rs30, rs30}; +{ +mul.f16x2 r327, r79, r329; +} +{ +add.f16x2 r330, r306, r327; +} +mov.f64 fd299, 0dBFEC24A622E3E9F9; +{ +cvt.rn.f16.f64 rs31, fd299; +} +mov.b32 r335, {rs31, rs31}; +{ +mul.f16x2 r333, r85, r335; +} +{ +add.f16x2 r336, r312, r333; +} +mov.f64 fd300, 0d3FDE75EC0DED7BEE; +{ +cvt.rn.f16.f64 rs32, fd300; +} +mov.b32 r341, {rs32, rs32}; +{ +mul.f16x2 r339, r94, r341; +} +{ +add.f16x2 r342, r318, r339; +} +{ +cvt.rn.f16.f64 rs33, fd299; +} +mov.b32 r347, {rs33, rs33}; +{ +mul.f16x2 r345, r88, r347; +} +{ +add.f16x2 r348, r324, r345; +} +{ +cvt.rn.f16.f64 rs34, fd300; +} +mov.b32 r353, {rs34, rs34}; +{ +mul.f16x2 r351, r91, r353; +} +{ +add.f16x2 r354, r330, r351; +} +mov.f64 fd291, 0dBFEF90459484F2B2; +{ +cvt.rn.f16.f64 rs35, fd291; +} +mov.b32 r359, {rs35, rs35}; +{ +mul.f16x2 r357, r97, r359; +} +{ +add.f16x2 r360, r336, r357; +} +mov.f64 fd292, 0d3FC5116F7F2D58C5; +{ +cvt.rn.f16.f64 rs36, fd292; +} +mov.b32 r365, {rs36, rs36}; +{ +mul.f16x2 r363, r106, r365; +} +{ +add.f16x2 r366, r342, r363; +} +{ +cvt.rn.f16.f64 rs37, fd291; +} +mov.b32 r371, {rs37, rs37}; +{ +mul.f16x2 r369, r100, r371; +} +{ +add.f16x2 r372, r348, r369; +} +{ 
+cvt.rn.f16.f64 rs38, fd292; +} +mov.b32 r377, {rs38, rs38}; +{ +mul.f16x2 r375, r103, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +sub.f16x2 %2, r360, r366; +} +{ +add.f16x2 %3, r372, r378; +} +{ +add.f16x2 %36, r360, r366; +} +{ +sub.f16x2 %37, r372, r378; +} +cvt.rn.f16.s32 rs39, r2004; +mov.b32 r405, {rs39, rs39}; +cvt.rn.f16.s32 rs40, r2004; +mov.b32 r417, {rs40, rs40}; +{ +cvt.rn.f16.f64 rs41, fd303; +} +mov.b32 r397, {rs41, rs41}; +{ +mul.f16x2 r395, r1, r397; +} +{ +add.f16x2 r398, %38, r395; +} +{ +cvt.rn.f16.f64 rs42, fd272; +} +mov.b32 r403, {rs42, rs42}; +{ +mul.f16x2 r401, r10, r403; +} +{ +add.f16x2 r404, r405, r401; +} +{ +cvt.rn.f16.f64 rs43, fd303; +} +mov.b32 r409, {rs43, rs43}; +{ +mul.f16x2 r407, r4, r409; +} +{ +add.f16x2 r410, %39, r407; +} +{ +cvt.rn.f16.f64 rs44, fd272; +} +mov.b32 r415, {rs44, rs44}; +{ +mul.f16x2 r413, r7, r415; +} +{ +add.f16x2 r416, r417, r413; +} +{ +cvt.rn.f16.f64 rs45, fd319; +} +mov.b32 r421, {rs45, rs45}; +{ +mul.f16x2 r419, r13, r421; +} +{ +add.f16x2 r422, r398, r419; +} +{ +cvt.rn.f16.f64 rs46, fd240; +} +mov.b32 r427, {rs46, rs46}; +{ +mul.f16x2 r425, r22, r427; +} +{ +add.f16x2 r428, r404, r425; +} +{ +cvt.rn.f16.f64 rs47, fd319; +} +mov.b32 r433, {rs47, rs47}; +{ +mul.f16x2 r431, r16, r433; +} +{ +add.f16x2 r434, r410, r431; +} +{ +cvt.rn.f16.f64 rs48, fd240; +} +mov.b32 r439, {rs48, rs48}; +{ +mul.f16x2 r437, r19, r439; +} +{ +add.f16x2 r440, r416, r437; +} +{ +cvt.rn.f16.f64 rs49, fd315; +} +mov.b32 r445, {rs49, rs49}; +{ +mul.f16x2 r443, r25, r445; +} +{ +add.f16x2 r446, r422, r443; +} +{ +cvt.rn.f16.f64 rs50, fd316; +} +mov.b32 r451, {rs50, rs50}; +{ +mul.f16x2 r449, r34, r451; +} +{ +add.f16x2 r452, r428, r449; +} +{ +cvt.rn.f16.f64 rs51, fd315; +} +mov.b32 r457, {rs51, rs51}; +{ +mul.f16x2 r455, r28, r457; +} +{ +add.f16x2 r458, r434, r455; +} +{ +cvt.rn.f16.f64 rs52, fd316; +} +mov.b32 r463, {rs52, rs52}; +{ +mul.f16x2 r461, r31, r463; +} +{ +add.f16x2 r464, r440, r461; +} +{ +cvt.rn.f16.f64 rs53, 
fd299; +} +mov.b32 r469, {rs53, rs53}; +{ +mul.f16x2 r467, r37, r469; +} +{ +add.f16x2 r470, r446, r467; +} +{ +cvt.rn.f16.f64 rs54, fd300; +} +mov.b32 r475, {rs54, rs54}; +{ +mul.f16x2 r473, r46, r475; +} +{ +add.f16x2 r476, r452, r473; +} +{ +cvt.rn.f16.f64 rs55, fd299; +} +mov.b32 r481, {rs55, rs55}; +{ +mul.f16x2 r479, r40, r481; +} +{ +add.f16x2 r482, r458, r479; +} +{ +cvt.rn.f16.f64 rs56, fd300; +} +mov.b32 r487, {rs56, rs56}; +{ +mul.f16x2 r485, r43, r487; +} +{ +add.f16x2 r488, r464, r485; +} +{ +cvt.rn.f16.f64 rs57, fd291; +} +mov.b32 r493, {rs57, rs57}; +{ +mul.f16x2 r491, r49, r493; +} +{ +add.f16x2 r494, r470, r491; +} +mov.f64 fd276, 0dBFC5116F7F2D58C5; +{ +cvt.rn.f16.f64 rs58, fd276; +} +mov.b32 r499, {rs58, rs58}; +{ +mul.f16x2 r497, r58, r499; +} +{ +add.f16x2 r500, r476, r497; +} +{ +cvt.rn.f16.f64 rs59, fd291; +} +mov.b32 r505, {rs59, rs59}; +{ +mul.f16x2 r503, r52, r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs60, fd276; +} +mov.b32 r511, {rs60, rs60}; +{ +mul.f16x2 r509, r55, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs61, fd307; +} +mov.b32 r517, {rs61, rs61}; +{ +mul.f16x2 r515, r61, r517; +} +{ +add.f16x2 r518, r494, r515; +} +mov.f64 fd188, 0dBFE78B0CDEE73E0F; +{ +cvt.rn.f16.f64 rs62, fd188; +} +mov.b32 r523, {rs62, rs62}; +{ +mul.f16x2 r521, r70, r523; +} +{ +add.f16x2 r524, r500, r521; +} +{ +cvt.rn.f16.f64 rs63, fd307; +} +mov.b32 r529, {rs63, rs63}; +{ +mul.f16x2 r527, r64, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs64, fd188; +} +mov.b32 r535, {rs64, rs64}; +{ +mul.f16x2 r533, r67, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs65, fd323; +} +mov.b32 r541, {rs65, rs65}; +{ +mul.f16x2 r539, r73, r541; +} +{ +add.f16x2 r542, r518, r539; +} +mov.f64 fd224, 0dBFEFE40529A542AA; +{ +cvt.rn.f16.f64 rs66, fd224; +} +mov.b32 r547, {rs66, rs66}; +{ +mul.f16x2 r545, r82, r547; +} +{ +add.f16x2 r548, r524, r545; +} +{ +cvt.rn.f16.f64 rs67, fd323; +} +mov.b32 r553, 
{rs67, rs67}; +{ +mul.f16x2 r551, r76, r553; +} +{ +add.f16x2 r554, r530, r551; +} +{ +cvt.rn.f16.f64 rs68, fd224; +} +mov.b32 r559, {rs68, rs68}; +{ +mul.f16x2 r557, r79, r559; +} +{ +add.f16x2 r560, r536, r557; +} +{ +cvt.rn.f16.f64 rs69, fd311; +} +mov.b32 r565, {rs69, rs69}; +{ +mul.f16x2 r563, r85, r565; +} +{ +add.f16x2 r566, r542, r563; +} +mov.f64 fd312, 0dBFEACA115AAE3DE4; +{ +cvt.rn.f16.f64 rs70, fd312; +} +mov.b32 r571, {rs70, rs70}; +{ +mul.f16x2 r569, r94, r571; +} +{ +add.f16x2 r572, r548, r569; +} +{ +cvt.rn.f16.f64 rs71, fd311; +} +mov.b32 r577, {rs71, rs71}; +{ +mul.f16x2 r575, r88, r577; +} +{ +add.f16x2 r578, r554, r575; +} +{ +cvt.rn.f16.f64 rs72, fd312; +} +mov.b32 r583, {rs72, rs72}; +{ +mul.f16x2 r581, r91, r583; +} +{ +add.f16x2 r584, r560, r581; +} +{ +cvt.rn.f16.f64 rs73, fd295; +} +mov.b32 r589, {rs73, rs73}; +{ +mul.f16x2 r587, r97, r589; +} +{ +add.f16x2 r590, r566, r587; +} +mov.f64 fd296, 0dBFD4C7E04850CFAA; +{ +cvt.rn.f16.f64 rs74, fd296; +} +mov.b32 r595, {rs74, rs74}; +{ +mul.f16x2 r593, r106, r595; +} +{ +add.f16x2 r596, r572, r593; +} +{ +cvt.rn.f16.f64 rs75, fd295; +} +mov.b32 r601, {rs75, rs75}; +{ +mul.f16x2 r599, r100, r601; +} +{ +add.f16x2 r602, r578, r599; +} +{ +cvt.rn.f16.f64 rs76, fd296; +} +mov.b32 r607, {rs76, rs76}; +{ +mul.f16x2 r605, r103, r607; +} +{ +add.f16x2 r608, r584, r605; +} +{ +sub.f16x2 %4, r590, r596; +} +{ +add.f16x2 %5, r602, r608; +} +{ +add.f16x2 %34, r590, r596; +} +{ +sub.f16x2 %35, r602, r608; +} +cvt.rn.f16.s32 rs77, r2004; +mov.b32 r635, {rs77, rs77}; +cvt.rn.f16.s32 rs78, r2004; +mov.b32 r647, {rs78, rs78}; +{ +cvt.rn.f16.f64 rs79, fd311; +} +mov.b32 r627, {rs79, rs79}; +{ +mul.f16x2 r625, r1, r627; +} +{ +add.f16x2 r628, %38, r625; +} +{ +cvt.rn.f16.f64 rs80, fd76; +} +mov.b32 r633, {rs80, rs80}; +{ +mul.f16x2 r631, r10, r633; +} +{ +add.f16x2 r634, r635, r631; +} +{ +cvt.rn.f16.f64 rs81, fd311; +} +mov.b32 r639, {rs81, rs81}; +{ +mul.f16x2 r637, r4, r639; +} +{ +add.f16x2 r640, %39, r637; +} 
+{ +cvt.rn.f16.f64 rs82, fd76; +} +mov.b32 r645, {rs82, rs82}; +{ +mul.f16x2 r643, r7, r645; +} +{ +add.f16x2 r646, r647, r643; +} +{ +cvt.rn.f16.f64 rs83, fd315; +} +mov.b32 r651, {rs83, rs83}; +{ +mul.f16x2 r649, r13, r651; +} +{ +add.f16x2 r652, r628, r649; +} +{ +cvt.rn.f16.f64 rs84, fd316; +} +mov.b32 r657, {rs84, rs84}; +{ +mul.f16x2 r655, r22, r657; +} +{ +add.f16x2 r658, r634, r655; +} +{ +cvt.rn.f16.f64 rs85, fd315; +} +mov.b32 r663, {rs85, rs85}; +{ +mul.f16x2 r661, r16, r663; +} +{ +add.f16x2 r664, r640, r661; +} +{ +cvt.rn.f16.f64 rs86, fd316; +} +mov.b32 r669, {rs86, rs86}; +{ +mul.f16x2 r667, r19, r669; +} +{ +add.f16x2 r670, r646, r667; +} +{ +cvt.rn.f16.f64 rs87, fd291; +} +mov.b32 r675, {rs87, rs87}; +{ +mul.f16x2 r673, r25, r675; +} +{ +add.f16x2 r676, r652, r673; +} +{ +cvt.rn.f16.f64 rs88, fd292; +} +mov.b32 r681, {rs88, rs88}; +{ +mul.f16x2 r679, r34, r681; +} +{ +add.f16x2 r682, r658, r679; +} +{ +cvt.rn.f16.f64 rs89, fd291; +} +mov.b32 r687, {rs89, rs89}; +{ +mul.f16x2 r685, r28, r687; +} +{ +add.f16x2 r688, r664, r685; +} +{ +cvt.rn.f16.f64 rs90, fd292; +} +mov.b32 r693, {rs90, rs90}; +{ +mul.f16x2 r691, r31, r693; +} +{ +add.f16x2 r694, r670, r691; +} +{ +cvt.rn.f16.f64 rs91, fd307; +} +mov.b32 r699, {rs91, rs91}; +{ +mul.f16x2 r697, r37, r699; +} +{ +add.f16x2 r700, r676, r697; +} +{ +cvt.rn.f16.f64 rs92, fd188; +} +mov.b32 r705, {rs92, rs92}; +{ +mul.f16x2 r703, r46, r705; +} +{ +add.f16x2 r706, r682, r703; +} +{ +cvt.rn.f16.f64 rs93, fd307; +} +mov.b32 r711, {rs93, rs93}; +{ +mul.f16x2 r709, r40, r711; +} +{ +add.f16x2 r712, r688, r709; +} +{ +cvt.rn.f16.f64 rs94, fd188; +} +mov.b32 r717, {rs94, rs94}; +{ +mul.f16x2 r715, r43, r717; +} +{ +add.f16x2 r718, r694, r715; +} +{ +cvt.rn.f16.f64 rs95, fd319; +} +mov.b32 r723, {rs95, rs95}; +{ +mul.f16x2 r721, r49, r723; +} +{ +add.f16x2 r724, r700, r721; +} +mov.f64 fd320, 0dBFEF0553B4DE2E18; +{ +cvt.rn.f16.f64 rs96, fd320; +} +mov.b32 r729, {rs96, rs96}; +{ +mul.f16x2 r727, r58, r729; +} +{ 
+add.f16x2 r730, r706, r727; +} +{ +cvt.rn.f16.f64 rs97, fd319; +} +mov.b32 r735, {rs97, rs97}; +{ +mul.f16x2 r733, r52, r735; +} +{ +add.f16x2 r736, r712, r733; +} +{ +cvt.rn.f16.f64 rs98, fd320; +} +mov.b32 r741, {rs98, rs98}; +{ +mul.f16x2 r739, r55, r741; +} +{ +add.f16x2 r742, r718, r739; +} +{ +cvt.rn.f16.f64 rs99, fd295; +} +mov.b32 r747, {rs99, rs99}; +{ +mul.f16x2 r745, r61, r747; +} +{ +add.f16x2 r748, r724, r745; +} +{ +cvt.rn.f16.f64 rs100, fd296; +} +mov.b32 r753, {rs100, rs100}; +{ +mul.f16x2 r751, r70, r753; +} +{ +add.f16x2 r754, r730, r751; +} +{ +cvt.rn.f16.f64 rs101, fd295; +} +mov.b32 r759, {rs101, rs101}; +{ +mul.f16x2 r757, r64, r759; +} +{ +add.f16x2 r760, r736, r757; +} +{ +cvt.rn.f16.f64 rs102, fd296; +} +mov.b32 r765, {rs102, rs102}; +{ +mul.f16x2 r763, r67, r765; +} +{ +add.f16x2 r766, r742, r763; +} +{ +cvt.rn.f16.f64 rs103, fd303; +} +mov.b32 r771, {rs103, rs103}; +{ +mul.f16x2 r769, r73, r771; +} +{ +add.f16x2 r772, r748, r769; +} +{ +cvt.rn.f16.f64 rs104, fd272; +} +mov.b32 r777, {rs104, rs104}; +{ +mul.f16x2 r775, r82, r777; +} +{ +add.f16x2 r778, r754, r775; +} +{ +cvt.rn.f16.f64 rs105, fd303; +} +mov.b32 r783, {rs105, rs105}; +{ +mul.f16x2 r781, r76, r783; +} +{ +add.f16x2 r784, r760, r781; +} +{ +cvt.rn.f16.f64 rs106, fd272; +} +mov.b32 r789, {rs106, rs106}; +{ +mul.f16x2 r787, r79, r789; +} +{ +add.f16x2 r790, r766, r787; +} +{ +cvt.rn.f16.f64 rs107, fd323; +} +mov.b32 r795, {rs107, rs107}; +{ +mul.f16x2 r793, r85, r795; +} +{ +add.f16x2 r796, r772, r793; +} +{ +cvt.rn.f16.f64 rs108, fd324; +} +mov.b32 r801, {rs108, rs108}; +{ +mul.f16x2 r799, r94, r801; +} +{ +add.f16x2 r802, r778, r799; +} +{ +cvt.rn.f16.f64 rs109, fd323; +} +mov.b32 r807, {rs109, rs109}; +{ +mul.f16x2 r805, r88, r807; +} +{ +add.f16x2 r808, r784, r805; +} +{ +cvt.rn.f16.f64 rs110, fd324; +} +mov.b32 r813, {rs110, rs110}; +{ +mul.f16x2 r811, r91, r813; +} +{ +add.f16x2 r814, r790, r811; +} +{ +cvt.rn.f16.f64 rs111, fd299; +} +mov.b32 r819, {rs111, rs111}; +{ 
+mul.f16x2 r817, r97, r819; +} +{ +add.f16x2 r820, r796, r817; +} +{ +cvt.rn.f16.f64 rs112, fd300; +} +mov.b32 r825, {rs112, rs112}; +{ +mul.f16x2 r823, r106, r825; +} +{ +add.f16x2 r826, r802, r823; +} +{ +cvt.rn.f16.f64 rs113, fd299; +} +mov.b32 r831, {rs113, rs113}; +{ +mul.f16x2 r829, r100, r831; +} +{ +add.f16x2 r832, r808, r829; +} +{ +cvt.rn.f16.f64 rs114, fd300; +} +mov.b32 r837, {rs114, rs114}; +{ +mul.f16x2 r835, r103, r837; +} +{ +add.f16x2 r838, r814, r835; +} +{ +sub.f16x2 %6, r820, r826; +} +{ +add.f16x2 %7, r832, r838; +} +{ +add.f16x2 %32, r820, r826; +} +{ +sub.f16x2 %33, r832, r838; +} +cvt.rn.f16.s32 rs115, r2004; +mov.b32 r865, {rs115, rs115}; +cvt.rn.f16.s32 rs116, r2004; +mov.b32 r877, {rs116, rs116}; +{ +cvt.rn.f16.f64 rs117, fd319; +} +mov.b32 r857, {rs117, rs117}; +{ +mul.f16x2 r855, r1, r857; +} +{ +add.f16x2 r858, %38, r855; +} +{ +cvt.rn.f16.f64 rs118, fd240; +} +mov.b32 r863, {rs118, rs118}; +{ +mul.f16x2 r861, r10, r863; +} +{ +add.f16x2 r864, r865, r861; +} +{ +cvt.rn.f16.f64 rs119, fd319; +} +mov.b32 r869, {rs119, rs119}; +{ +mul.f16x2 r867, r4, r869; +} +{ +add.f16x2 r870, %39, r867; +} +{ +cvt.rn.f16.f64 rs120, fd240; +} +mov.b32 r875, {rs120, rs120}; +{ +mul.f16x2 r873, r7, r875; +} +{ +add.f16x2 r876, r877, r873; +} +{ +cvt.rn.f16.f64 rs121, fd299; +} +mov.b32 r881, {rs121, rs121}; +{ +mul.f16x2 r879, r13, r881; +} +{ +add.f16x2 r882, r858, r879; +} +{ +cvt.rn.f16.f64 rs122, fd300; +} +mov.b32 r887, {rs122, rs122}; +{ +mul.f16x2 r885, r22, r887; +} +{ +add.f16x2 r888, r864, r885; +} +{ +cvt.rn.f16.f64 rs123, fd299; +} +mov.b32 r893, {rs123, rs123}; +{ +mul.f16x2 r891, r16, r893; +} +{ +add.f16x2 r894, r870, r891; +} +{ +cvt.rn.f16.f64 rs124, fd300; +} +mov.b32 r899, {rs124, rs124}; +{ +mul.f16x2 r897, r19, r899; +} +{ +add.f16x2 r900, r876, r897; +} +{ +cvt.rn.f16.f64 rs125, fd307; +} +mov.b32 r905, {rs125, rs125}; +{ +mul.f16x2 r903, r25, r905; +} +{ +add.f16x2 r906, r882, r903; +} +{ +cvt.rn.f16.f64 rs126, fd188; +} +mov.b32 
r911, {rs126, rs126}; +{ +mul.f16x2 r909, r34, r911; +} +{ +add.f16x2 r912, r888, r909; +} +{ +cvt.rn.f16.f64 rs127, fd307; +} +mov.b32 r917, {rs127, rs127}; +{ +mul.f16x2 r915, r28, r917; +} +{ +add.f16x2 r918, r894, r915; +} +{ +cvt.rn.f16.f64 rs128, fd188; +} +mov.b32 r923, {rs128, rs128}; +{ +mul.f16x2 r921, r31, r923; +} +{ +add.f16x2 r924, r900, r921; +} +{ +cvt.rn.f16.f64 rs129, fd311; +} +mov.b32 r929, {rs129, rs129}; +{ +mul.f16x2 r927, r37, r929; +} +{ +add.f16x2 r930, r906, r927; +} +{ +cvt.rn.f16.f64 rs130, fd312; +} +mov.b32 r935, {rs130, rs130}; +{ +mul.f16x2 r933, r46, r935; +} +{ +add.f16x2 r936, r912, r933; +} +{ +cvt.rn.f16.f64 rs131, fd311; +} +mov.b32 r941, {rs131, rs131}; +{ +mul.f16x2 r939, r40, r941; +} +{ +add.f16x2 r942, r918, r939; +} +{ +cvt.rn.f16.f64 rs132, fd312; +} +mov.b32 r947, {rs132, rs132}; +{ +mul.f16x2 r945, r43, r947; +} +{ +add.f16x2 r948, r924, r945; +} +{ +cvt.rn.f16.f64 rs133, fd295; +} +mov.b32 r953, {rs133, rs133}; +{ +mul.f16x2 r951, r49, r953; +} +{ +add.f16x2 r954, r930, r951; +} +{ +cvt.rn.f16.f64 rs134, fd160; +} +mov.b32 r959, {rs134, rs134}; +{ +mul.f16x2 r957, r58, r959; +} +{ +add.f16x2 r960, r936, r957; +} +{ +cvt.rn.f16.f64 rs135, fd295; +} +mov.b32 r965, {rs135, rs135}; +{ +mul.f16x2 r963, r52, r965; +} +{ +add.f16x2 r966, r942, r963; +} +{ +cvt.rn.f16.f64 rs136, fd160; +} +mov.b32 r971, {rs136, rs136}; +{ +mul.f16x2 r969, r55, r971; +} +{ +add.f16x2 r972, r948, r969; +} +{ +cvt.rn.f16.f64 rs137, fd323; +} +mov.b32 r977, {rs137, rs137}; +{ +mul.f16x2 r975, r61, r977; +} +{ +add.f16x2 r978, r954, r975; +} +{ +cvt.rn.f16.f64 rs138, fd324; +} +mov.b32 r983, {rs138, rs138}; +{ +mul.f16x2 r981, r70, r983; +} +{ +add.f16x2 r984, r960, r981; +} +{ +cvt.rn.f16.f64 rs139, fd323; +} +mov.b32 r989, {rs139, rs139}; +{ +mul.f16x2 r987, r64, r989; +} +{ +add.f16x2 r990, r966, r987; +} +{ +cvt.rn.f16.f64 rs140, fd324; +} +mov.b32 r995, {rs140, rs140}; +{ +mul.f16x2 r993, r67, r995; +} +{ +add.f16x2 r996, r972, r993; +} +{ 
+cvt.rn.f16.f64 rs141, fd291; +} +mov.b32 r1001, {rs141, rs141}; +{ +mul.f16x2 r999, r73, r1001; +} +{ +add.f16x2 r1002, r978, r999; +} +{ +cvt.rn.f16.f64 rs142, fd292; +} +mov.b32 r1007, {rs142, rs142}; +{ +mul.f16x2 r1005, r82, r1007; +} +{ +add.f16x2 r1008, r984, r1005; +} +{ +cvt.rn.f16.f64 rs143, fd291; +} +mov.b32 r1013, {rs143, rs143}; +{ +mul.f16x2 r1011, r76, r1013; +} +{ +add.f16x2 r1014, r990, r1011; +} +{ +cvt.rn.f16.f64 rs144, fd292; +} +mov.b32 r1019, {rs144, rs144}; +{ +mul.f16x2 r1017, r79, r1019; +} +{ +add.f16x2 r1020, r996, r1017; +} +{ +cvt.rn.f16.f64 rs145, fd315; +} +mov.b32 r1025, {rs145, rs145}; +{ +mul.f16x2 r1023, r85, r1025; +} +{ +add.f16x2 r1026, r1002, r1023; +} +mov.f64 fd268, 0dBFED4E03DD110B08; +{ +cvt.rn.f16.f64 rs146, fd268; +} +mov.b32 r1031, {rs146, rs146}; +{ +mul.f16x2 r1029, r94, r1031; +} +{ +add.f16x2 r1032, r1008, r1029; +} +{ +cvt.rn.f16.f64 rs147, fd315; +} +mov.b32 r1037, {rs147, rs147}; +{ +mul.f16x2 r1035, r88, r1037; +} +{ +add.f16x2 r1038, r1014, r1035; +} +{ +cvt.rn.f16.f64 rs148, fd268; +} +mov.b32 r1043, {rs148, rs148}; +{ +mul.f16x2 r1041, r91, r1043; +} +{ +add.f16x2 r1044, r1020, r1041; +} +{ +cvt.rn.f16.f64 rs149, fd303; +} +mov.b32 r1049, {rs149, rs149}; +{ +mul.f16x2 r1047, r97, r1049; +} +{ +add.f16x2 r1050, r1026, r1047; +} +mov.f64 fd304, 0dBFE3A7A16B394423; +{ +cvt.rn.f16.f64 rs150, fd304; +} +mov.b32 r1055, {rs150, rs150}; +{ +mul.f16x2 r1053, r106, r1055; +} +{ +add.f16x2 r1056, r1032, r1053; +} +{ +cvt.rn.f16.f64 rs151, fd303; +} +mov.b32 r1061, {rs151, rs151}; +{ +mul.f16x2 r1059, r100, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs152, fd304; +} +mov.b32 r1067, {rs152, rs152}; +{ +mul.f16x2 r1065, r103, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +sub.f16x2 %8, r1050, r1056; +} +{ +add.f16x2 %9, r1062, r1068; +} +{ +add.f16x2 %30, r1050, r1056; +} +{ +sub.f16x2 %31, r1062, r1068; +} +cvt.rn.f16.s32 rs153, r2004; +mov.b32 r1095, {rs153, rs153}; +cvt.rn.f16.s32 rs154, 
r2004; +mov.b32 r1107, {rs154, rs154}; +{ +cvt.rn.f16.f64 rs155, fd323; +} +mov.b32 r1087, {rs155, rs155}; +{ +mul.f16x2 r1085, r1, r1087; +} +{ +add.f16x2 r1088, %38, r1085; +} +{ +cvt.rn.f16.f64 rs156, fd324; +} +mov.b32 r1093, {rs156, rs156}; +{ +mul.f16x2 r1091, r10, r1093; +} +{ +add.f16x2 r1094, r1095, r1091; +} +{ +cvt.rn.f16.f64 rs157, fd323; +} +mov.b32 r1099, {rs157, rs157}; +{ +mul.f16x2 r1097, r4, r1099; +} +{ +add.f16x2 r1100, %39, r1097; +} +{ +cvt.rn.f16.f64 rs158, fd324; +} +mov.b32 r1105, {rs158, rs158}; +{ +mul.f16x2 r1103, r7, r1105; +} +{ +add.f16x2 r1106, r1107, r1103; +} +{ +cvt.rn.f16.f64 rs159, fd291; +} +mov.b32 r1111, {rs159, rs159}; +{ +mul.f16x2 r1109, r13, r1111; +} +{ +add.f16x2 r1112, r1088, r1109; +} +{ +cvt.rn.f16.f64 rs160, fd276; +} +mov.b32 r1117, {rs160, rs160}; +{ +mul.f16x2 r1115, r22, r1117; +} +{ +add.f16x2 r1118, r1094, r1115; +} +{ +cvt.rn.f16.f64 rs161, fd291; +} +mov.b32 r1123, {rs161, rs161}; +{ +mul.f16x2 r1121, r16, r1123; +} +{ +add.f16x2 r1124, r1100, r1121; +} +{ +cvt.rn.f16.f64 rs162, fd276; +} +mov.b32 r1129, {rs162, rs162}; +{ +mul.f16x2 r1127, r19, r1129; +} +{ +add.f16x2 r1130, r1106, r1127; +} +{ +cvt.rn.f16.f64 rs163, fd319; +} +mov.b32 r1135, {rs163, rs163}; +{ +mul.f16x2 r1133, r25, r1135; +} +{ +add.f16x2 r1136, r1112, r1133; +} +{ +cvt.rn.f16.f64 rs164, fd320; +} +mov.b32 r1141, {rs164, rs164}; +{ +mul.f16x2 r1139, r34, r1141; +} +{ +add.f16x2 r1142, r1118, r1139; +} +{ +cvt.rn.f16.f64 rs165, fd319; +} +mov.b32 r1147, {rs165, rs165}; +{ +mul.f16x2 r1145, r28, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs166, fd320; +} +mov.b32 r1153, {rs166, rs166}; +{ +mul.f16x2 r1151, r31, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs167, fd295; +} +mov.b32 r1159, {rs167, rs167}; +{ +mul.f16x2 r1157, r37, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +cvt.rn.f16.f64 rs168, fd160; +} +mov.b32 r1165, {rs168, rs168}; +{ +mul.f16x2 r1163, r46, r1165; +} +{ +add.f16x2 
r1166, r1142, r1163; +} +{ +cvt.rn.f16.f64 rs169, fd295; +} +mov.b32 r1171, {rs169, rs169}; +{ +mul.f16x2 r1169, r40, r1171; +} +{ +add.f16x2 r1172, r1148, r1169; +} +{ +cvt.rn.f16.f64 rs170, fd160; +} +mov.b32 r1177, {rs170, rs170}; +{ +mul.f16x2 r1175, r43, r1177; +} +{ +add.f16x2 r1178, r1154, r1175; +} +{ +cvt.rn.f16.f64 rs171, fd315; +} +mov.b32 r1183, {rs171, rs171}; +{ +mul.f16x2 r1181, r49, r1183; +} +{ +add.f16x2 r1184, r1160, r1181; +} +{ +cvt.rn.f16.f64 rs172, fd316; +} +mov.b32 r1189, {rs172, rs172}; +{ +mul.f16x2 r1187, r58, r1189; +} +{ +add.f16x2 r1190, r1166, r1187; +} +{ +cvt.rn.f16.f64 rs173, fd315; +} +mov.b32 r1195, {rs173, rs173}; +{ +mul.f16x2 r1193, r52, r1195; +} +{ +add.f16x2 r1196, r1172, r1193; +} +{ +cvt.rn.f16.f64 rs174, fd316; +} +mov.b32 r1201, {rs174, rs174}; +{ +mul.f16x2 r1199, r55, r1201; +} +{ +add.f16x2 r1202, r1178, r1199; +} +{ +cvt.rn.f16.f64 rs175, fd299; +} +mov.b32 r1207, {rs175, rs175}; +{ +mul.f16x2 r1205, r61, r1207; +} +{ +add.f16x2 r1208, r1184, r1205; +} +mov.f64 fd244, 0dBFDE75EC0DED7BEE; +{ +cvt.rn.f16.f64 rs176, fd244; +} +mov.b32 r1213, {rs176, rs176}; +{ +mul.f16x2 r1211, r70, r1213; +} +{ +add.f16x2 r1214, r1190, r1211; +} +{ +cvt.rn.f16.f64 rs177, fd299; +} +mov.b32 r1219, {rs177, rs177}; +{ +mul.f16x2 r1217, r64, r1219; +} +{ +add.f16x2 r1220, r1196, r1217; +} +{ +cvt.rn.f16.f64 rs178, fd244; +} +mov.b32 r1225, {rs178, rs178}; +{ +mul.f16x2 r1223, r67, r1225; +} +{ +add.f16x2 r1226, r1202, r1223; +} +{ +cvt.rn.f16.f64 rs179, fd311; +} +mov.b32 r1231, {rs179, rs179}; +{ +mul.f16x2 r1229, r73, r1231; +} +{ +add.f16x2 r1232, r1208, r1229; +} +{ +cvt.rn.f16.f64 rs180, fd312; +} +mov.b32 r1237, {rs180, rs180}; +{ +mul.f16x2 r1235, r82, r1237; +} +{ +add.f16x2 r1238, r1214, r1235; +} +{ +cvt.rn.f16.f64 rs181, fd311; +} +mov.b32 r1243, {rs181, rs181}; +{ +mul.f16x2 r1241, r76, r1243; +} +{ +add.f16x2 r1244, r1220, r1241; +} +{ +cvt.rn.f16.f64 rs182, fd312; +} +mov.b32 r1249, {rs182, rs182}; +{ +mul.f16x2 r1247, r79, 
r1249; +} +{ +add.f16x2 r1250, r1226, r1247; +} +{ +cvt.rn.f16.f64 rs183, fd303; +} +mov.b32 r1255, {rs183, rs183}; +{ +mul.f16x2 r1253, r85, r1255; +} +{ +add.f16x2 r1256, r1232, r1253; +} +{ +cvt.rn.f16.f64 rs184, fd272; +} +mov.b32 r1261, {rs184, rs184}; +{ +mul.f16x2 r1259, r94, r1261; +} +{ +add.f16x2 r1262, r1238, r1259; +} +{ +cvt.rn.f16.f64 rs185, fd303; +} +mov.b32 r1267, {rs185, rs185}; +{ +mul.f16x2 r1265, r88, r1267; +} +{ +add.f16x2 r1268, r1244, r1265; +} +{ +cvt.rn.f16.f64 rs186, fd272; +} +mov.b32 r1273, {rs186, rs186}; +{ +mul.f16x2 r1271, r91, r1273; +} +{ +add.f16x2 r1274, r1250, r1271; +} +{ +cvt.rn.f16.f64 rs187, fd307; +} +mov.b32 r1279, {rs187, rs187}; +{ +mul.f16x2 r1277, r97, r1279; +} +{ +add.f16x2 r1280, r1256, r1277; +} +{ +cvt.rn.f16.f64 rs188, fd308; +} +mov.b32 r1285, {rs188, rs188}; +{ +mul.f16x2 r1283, r106, r1285; +} +{ +add.f16x2 r1286, r1262, r1283; +} +{ +cvt.rn.f16.f64 rs189, fd307; +} +mov.b32 r1291, {rs189, rs189}; +{ +mul.f16x2 r1289, r100, r1291; +} +{ +add.f16x2 r1292, r1268, r1289; +} +{ +cvt.rn.f16.f64 rs190, fd308; +} +mov.b32 r1297, {rs190, rs190}; +{ +mul.f16x2 r1295, r103, r1297; +} +{ +add.f16x2 r1298, r1274, r1295; +} +{ +sub.f16x2 %10, r1280, r1286; +} +{ +add.f16x2 %11, r1292, r1298; +} +{ +add.f16x2 %28, r1280, r1286; +} +{ +sub.f16x2 %29, r1292, r1298; +} +cvt.rn.f16.s32 rs191, r2004; +mov.b32 r1325, {rs191, rs191}; +cvt.rn.f16.s32 rs192, r2004; +mov.b32 r1337, {rs192, rs192}; +{ +cvt.rn.f16.f64 rs193, fd315; +} +mov.b32 r1317, {rs193, rs193}; +{ +mul.f16x2 r1315, r1, r1317; +} +{ +add.f16x2 r1318, %38, r1315; +} +{ +cvt.rn.f16.f64 rs194, fd316; +} +mov.b32 r1323, {rs194, rs194}; +{ +mul.f16x2 r1321, r10, r1323; +} +{ +add.f16x2 r1324, r1325, r1321; +} +{ +cvt.rn.f16.f64 rs195, fd315; +} +mov.b32 r1329, {rs195, rs195}; +{ +mul.f16x2 r1327, r4, r1329; +} +{ +add.f16x2 r1330, %39, r1327; +} +{ +cvt.rn.f16.f64 rs196, fd316; +} +mov.b32 r1335, {rs196, rs196}; +{ +mul.f16x2 r1333, r7, r1335; +} +{ +add.f16x2 r1336, 
r1337, r1333; +} +{ +cvt.rn.f16.f64 rs197, fd307; +} +mov.b32 r1341, {rs197, rs197}; +{ +mul.f16x2 r1339, r13, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs198, fd188; +} +mov.b32 r1347, {rs198, rs198}; +{ +mul.f16x2 r1345, r22, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs199, fd307; +} +mov.b32 r1353, {rs199, rs199}; +{ +mul.f16x2 r1351, r16, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs200, fd188; +} +mov.b32 r1359, {rs200, rs200}; +{ +mul.f16x2 r1357, r19, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs201, fd295; +} +mov.b32 r1365, {rs201, rs201}; +{ +mul.f16x2 r1363, r25, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 rs202, fd296; +} +mov.b32 r1371, {rs202, rs202}; +{ +mul.f16x2 r1369, r34, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +cvt.rn.f16.f64 rs203, fd295; +} +mov.b32 r1377, {rs203, rs203}; +{ +mul.f16x2 r1375, r28, r1377; +} +{ +add.f16x2 r1378, r1354, r1375; +} +{ +cvt.rn.f16.f64 rs204, fd296; +} +mov.b32 r1383, {rs204, rs204}; +{ +mul.f16x2 r1381, r31, r1383; +} +{ +add.f16x2 r1384, r1360, r1381; +} +{ +cvt.rn.f16.f64 rs205, fd323; +} +mov.b32 r1389, {rs205, rs205}; +{ +mul.f16x2 r1387, r37, r1389; +} +{ +add.f16x2 r1390, r1366, r1387; +} +{ +cvt.rn.f16.f64 rs206, fd324; +} +mov.b32 r1395, {rs206, rs206}; +{ +mul.f16x2 r1393, r46, r1395; +} +{ +add.f16x2 r1396, r1372, r1393; +} +{ +cvt.rn.f16.f64 rs207, fd323; +} +mov.b32 r1401, {rs207, rs207}; +{ +mul.f16x2 r1399, r40, r1401; +} +{ +add.f16x2 r1402, r1378, r1399; +} +{ +cvt.rn.f16.f64 rs208, fd324; +} +mov.b32 r1407, {rs208, rs208}; +{ +mul.f16x2 r1405, r43, r1407; +} +{ +add.f16x2 r1408, r1384, r1405; +} +{ +cvt.rn.f16.f64 rs209, fd299; +} +mov.b32 r1413, {rs209, rs209}; +{ +mul.f16x2 r1411, r49, r1413; +} +{ +add.f16x2 r1414, r1390, r1411; +} +{ +cvt.rn.f16.f64 rs210, fd244; +} +mov.b32 r1419, {rs210, rs210}; +{ +mul.f16x2 r1417, r58, r1419; +} +{ +add.f16x2 r1420, r1396, 
r1417; +} +{ +cvt.rn.f16.f64 rs211, fd299; +} +mov.b32 r1425, {rs211, rs211}; +{ +mul.f16x2 r1423, r52, r1425; +} +{ +add.f16x2 r1426, r1402, r1423; +} +{ +cvt.rn.f16.f64 rs212, fd244; +} +mov.b32 r1431, {rs212, rs212}; +{ +mul.f16x2 r1429, r55, r1431; +} +{ +add.f16x2 r1432, r1408, r1429; +} +{ +cvt.rn.f16.f64 rs213, fd303; +} +mov.b32 r1437, {rs213, rs213}; +{ +mul.f16x2 r1435, r61, r1437; +} +{ +add.f16x2 r1438, r1414, r1435; +} +{ +cvt.rn.f16.f64 rs214, fd304; +} +mov.b32 r1443, {rs214, rs214}; +{ +mul.f16x2 r1441, r70, r1443; +} +{ +add.f16x2 r1444, r1420, r1441; +} +{ +cvt.rn.f16.f64 rs215, fd303; +} +mov.b32 r1449, {rs215, rs215}; +{ +mul.f16x2 r1447, r64, r1449; +} +{ +add.f16x2 r1450, r1426, r1447; +} +{ +cvt.rn.f16.f64 rs216, fd304; +} +mov.b32 r1455, {rs216, rs216}; +{ +mul.f16x2 r1453, r67, r1455; +} +{ +add.f16x2 r1456, r1432, r1453; +} +{ +cvt.rn.f16.f64 rs217, fd319; +} +mov.b32 r1461, {rs217, rs217}; +{ +mul.f16x2 r1459, r73, r1461; +} +{ +add.f16x2 r1462, r1438, r1459; +} +{ +cvt.rn.f16.f64 rs218, fd240; +} +mov.b32 r1467, {rs218, rs218}; +{ +mul.f16x2 r1465, r82, r1467; +} +{ +add.f16x2 r1468, r1444, r1465; +} +{ +cvt.rn.f16.f64 rs219, fd319; +} +mov.b32 r1473, {rs219, rs219}; +{ +mul.f16x2 r1471, r76, r1473; +} +{ +add.f16x2 r1474, r1450, r1471; +} +{ +cvt.rn.f16.f64 rs220, fd240; +} +mov.b32 r1479, {rs220, rs220}; +{ +mul.f16x2 r1477, r79, r1479; +} +{ +add.f16x2 r1480, r1456, r1477; +} +{ +cvt.rn.f16.f64 rs221, fd291; +} +mov.b32 r1485, {rs221, rs221}; +{ +mul.f16x2 r1483, r85, r1485; +} +{ +add.f16x2 r1486, r1462, r1483; +} +{ +cvt.rn.f16.f64 rs222, fd276; +} +mov.b32 r1491, {rs222, rs222}; +{ +mul.f16x2 r1489, r94, r1491; +} +{ +add.f16x2 r1492, r1468, r1489; +} +{ +cvt.rn.f16.f64 rs223, fd291; +} +mov.b32 r1497, {rs223, rs223}; +{ +mul.f16x2 r1495, r88, r1497; +} +{ +add.f16x2 r1498, r1474, r1495; +} +{ +cvt.rn.f16.f64 rs224, fd276; +} +mov.b32 r1503, {rs224, rs224}; +{ +mul.f16x2 r1501, r91, r1503; +} +{ +add.f16x2 r1504, r1480, r1501; +} 
+{ +cvt.rn.f16.f64 rs225, fd311; +} +mov.b32 r1509, {rs225, rs225}; +{ +mul.f16x2 r1507, r97, r1509; +} +{ +add.f16x2 r1510, r1486, r1507; +} +{ +cvt.rn.f16.f64 rs226, fd312; +} +mov.b32 r1515, {rs226, rs226}; +{ +mul.f16x2 r1513, r106, r1515; +} +{ +add.f16x2 r1516, r1492, r1513; +} +{ +cvt.rn.f16.f64 rs227, fd311; +} +mov.b32 r1521, {rs227, rs227}; +{ +mul.f16x2 r1519, r100, r1521; +} +{ +add.f16x2 r1522, r1498, r1519; +} +{ +cvt.rn.f16.f64 rs228, fd312; +} +mov.b32 r1527, {rs228, rs228}; +{ +mul.f16x2 r1525, r103, r1527; +} +{ +add.f16x2 r1528, r1504, r1525; +} +{ +sub.f16x2 %12, r1510, r1516; +} +{ +add.f16x2 %13, r1522, r1528; +} +{ +add.f16x2 %26, r1510, r1516; +} +{ +sub.f16x2 %27, r1522, r1528; +} +cvt.rn.f16.s32 rs229, r2004; +mov.b32 r1555, {rs229, rs229}; +cvt.rn.f16.s32 rs230, r2004; +mov.b32 r1567, {rs230, rs230}; +{ +cvt.rn.f16.f64 rs231, fd307; +} +mov.b32 r1547, {rs231, rs231}; +{ +mul.f16x2 r1545, r1, r1547; +} +{ +add.f16x2 r1548, %38, r1545; +} +{ +cvt.rn.f16.f64 rs232, fd308; +} +mov.b32 r1553, {rs232, rs232}; +{ +mul.f16x2 r1551, r10, r1553; +} +{ +add.f16x2 r1554, r1555, r1551; +} +{ +cvt.rn.f16.f64 rs233, fd307; +} +mov.b32 r1559, {rs233, rs233}; +{ +mul.f16x2 r1557, r4, r1559; +} +{ +add.f16x2 r1560, %39, r1557; +} +{ +cvt.rn.f16.f64 rs234, fd308; +} +mov.b32 r1565, {rs234, rs234}; +{ +mul.f16x2 r1563, r7, r1565; +} +{ +add.f16x2 r1566, r1567, r1563; +} +{ +cvt.rn.f16.f64 rs235, fd323; +} +mov.b32 r1571, {rs235, rs235}; +{ +mul.f16x2 r1569, r13, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +{ +cvt.rn.f16.f64 rs236, fd224; +} +mov.b32 r1577, {rs236, rs236}; +{ +mul.f16x2 r1575, r22, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs237, fd323; +} +mov.b32 r1583, {rs237, rs237}; +{ +mul.f16x2 r1581, r16, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs238, fd224; +} +mov.b32 r1589, {rs238, rs238}; +{ +mul.f16x2 r1587, r19, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs239, 
fd303; +} +mov.b32 r1595, {rs239, rs239}; +{ +mul.f16x2 r1593, r25, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +{ +cvt.rn.f16.f64 rs240, fd272; +} +mov.b32 r1601, {rs240, rs240}; +{ +mul.f16x2 r1599, r34, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs241, fd303; +} +mov.b32 r1607, {rs241, rs241}; +{ +mul.f16x2 r1605, r28, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs242, fd272; +} +mov.b32 r1613, {rs242, rs242}; +{ +mul.f16x2 r1611, r31, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs243, fd291; +} +mov.b32 r1619, {rs243, rs243}; +{ +mul.f16x2 r1617, r37, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs244, fd292; +} +mov.b32 r1625, {rs244, rs244}; +{ +mul.f16x2 r1623, r46, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs245, fd291; +} +mov.b32 r1631, {rs245, rs245}; +{ +mul.f16x2 r1629, r40, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs246, fd292; +} +mov.b32 r1637, {rs246, rs246}; +{ +mul.f16x2 r1635, r43, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +cvt.rn.f16.f64 rs247, fd311; +} +mov.b32 r1643, {rs247, rs247}; +{ +mul.f16x2 r1641, r49, r1643; +} +{ +add.f16x2 r1644, r1620, r1641; +} +{ +cvt.rn.f16.f64 rs248, fd312; +} +mov.b32 r1649, {rs248, rs248}; +{ +mul.f16x2 r1647, r58, r1649; +} +{ +add.f16x2 r1650, r1626, r1647; +} +{ +cvt.rn.f16.f64 rs249, fd311; +} +mov.b32 r1655, {rs249, rs249}; +{ +mul.f16x2 r1653, r52, r1655; +} +{ +add.f16x2 r1656, r1632, r1653; +} +{ +cvt.rn.f16.f64 rs250, fd312; +} +mov.b32 r1661, {rs250, rs250}; +{ +mul.f16x2 r1659, r55, r1661; +} +{ +add.f16x2 r1662, r1638, r1659; +} +{ +cvt.rn.f16.f64 rs251, fd319; +} +mov.b32 r1667, {rs251, rs251}; +{ +mul.f16x2 r1665, r61, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs252, fd240; +} +mov.b32 r1673, {rs252, rs252}; +{ +mul.f16x2 r1671, r70, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs253, fd319; +} 
+mov.b32 r1679, {rs253, rs253}; +{ +mul.f16x2 r1677, r64, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs254, fd240; +} +mov.b32 r1685, {rs254, rs254}; +{ +mul.f16x2 r1683, r67, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ +cvt.rn.f16.f64 rs255, fd299; +} +mov.b32 r1691, {rs255, rs255}; +{ +mul.f16x2 r1689, r73, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs256, fd244; +} +mov.b32 r1697, {rs256, rs256}; +{ +mul.f16x2 r1695, r82, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs257, fd299; +} +mov.b32 r1703, {rs257, rs257}; +{ +mul.f16x2 r1701, r76, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs258, fd244; +} +mov.b32 r1709, {rs258, rs258}; +{ +mul.f16x2 r1707, r79, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +{ +cvt.rn.f16.f64 rs259, fd295; +} +mov.b32 r1715, {rs259, rs259}; +{ +mul.f16x2 r1713, r85, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs260, fd296; +} +mov.b32 r1721, {rs260, rs260}; +{ +mul.f16x2 r1719, r94, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs261, fd295; +} +mov.b32 r1727, {rs261, rs261}; +{ +mul.f16x2 r1725, r88, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs262, fd296; +} +mov.b32 r1733, {rs262, rs262}; +{ +mul.f16x2 r1731, r91, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} +{ +cvt.rn.f16.f64 rs263, fd315; +} +mov.b32 r1739, {rs263, rs263}; +{ +mul.f16x2 r1737, r97, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs264, fd316; +} +mov.b32 r1745, {rs264, rs264}; +{ +mul.f16x2 r1743, r106, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs265, fd315; +} +mov.b32 r1751, {rs265, rs265}; +{ +mul.f16x2 r1749, r100, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +cvt.rn.f16.f64 rs266, fd316; +} +mov.b32 r1757, {rs266, rs266}; +{ +mul.f16x2 r1755, r103, r1757; +} +{ +add.f16x2 r1758, r1734, r1755; +} +{ +sub.f16x2 %14, r1740, r1746; +} +{ 
+add.f16x2 %15, r1752, r1758; +} +{ +add.f16x2 %24, r1740, r1746; +} +{ +sub.f16x2 %25, r1752, r1758; +} +cvt.rn.f16.s32 rs267, r2004; +mov.b32 r1785, {rs267, rs267}; +cvt.rn.f16.s32 rs268, r2004; +mov.b32 r1797, {rs268, rs268}; +{ +cvt.rn.f16.f64 rs269, fd299; +} +mov.b32 r1777, {rs269, rs269}; +{ +mul.f16x2 r1775, r1, r1777; +} +{ +add.f16x2 r1778, %38, r1775; +} +{ +cvt.rn.f16.f64 rs270, fd300; +} +mov.b32 r1783, {rs270, rs270}; +{ +mul.f16x2 r1781, r10, r1783; +} +{ +add.f16x2 r1784, r1785, r1781; +} +{ +cvt.rn.f16.f64 rs271, fd299; +} +mov.b32 r1789, {rs271, rs271}; +{ +mul.f16x2 r1787, r4, r1789; +} +{ +add.f16x2 r1790, %39, r1787; +} +{ +cvt.rn.f16.f64 rs272, fd300; +} +mov.b32 r1795, {rs272, rs272}; +{ +mul.f16x2 r1793, r7, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +cvt.rn.f16.f64 rs273, fd311; +} +mov.b32 r1801, {rs273, rs273}; +{ +mul.f16x2 r1799, r13, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs274, fd312; +} +mov.b32 r1807, {rs274, rs274}; +{ +mul.f16x2 r1805, r22, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs275, fd311; +} +mov.b32 r1813, {rs275, rs275}; +{ +mul.f16x2 r1811, r16, r1813; +} +{ +add.f16x2 r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs276, fd312; +} +mov.b32 r1819, {rs276, rs276}; +{ +mul.f16x2 r1817, r19, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ +cvt.rn.f16.f64 rs277, fd323; +} +mov.b32 r1825, {rs277, rs277}; +{ +mul.f16x2 r1823, r25, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs278, fd324; +} +mov.b32 r1831, {rs278, rs278}; +{ +mul.f16x2 r1829, r34, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs279, fd323; +} +mov.b32 r1837, {rs279, rs279}; +{ +mul.f16x2 r1835, r28, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs280, fd324; +} +mov.b32 r1843, {rs280, rs280}; +{ +mul.f16x2 r1841, r31, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs281, fd315; +} +mov.b32 r1849, {rs281, rs281}; +{ 
+mul.f16x2 r1847, r37, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs282, fd268; +} +mov.b32 r1855, {rs282, rs282}; +{ +mul.f16x2 r1853, r46, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs283, fd315; +} +mov.b32 r1861, {rs283, rs283}; +{ +mul.f16x2 r1859, r40, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs284, fd268; +} +mov.b32 r1867, {rs284, rs284}; +{ +mul.f16x2 r1865, r43, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs285, fd303; +} +mov.b32 r1873, {rs285, rs285}; +{ +mul.f16x2 r1871, r49, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs286, fd272; +} +mov.b32 r1879, {rs286, rs286}; +{ +mul.f16x2 r1877, r58, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs287, fd303; +} +mov.b32 r1885, {rs287, rs287}; +{ +mul.f16x2 r1883, r52, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs288, fd272; +} +mov.b32 r1891, {rs288, rs288}; +{ +mul.f16x2 r1889, r55, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs289, fd291; +} +mov.b32 r1897, {rs289, rs289}; +{ +mul.f16x2 r1895, r61, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs290, fd276; +} +mov.b32 r1903, {rs290, rs290}; +{ +mul.f16x2 r1901, r70, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs291, fd291; +} +mov.b32 r1909, {rs291, rs291}; +{ +mul.f16x2 r1907, r64, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs292, fd276; +} +mov.b32 r1915, {rs292, rs292}; +{ +mul.f16x2 r1913, r67, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs293, fd295; +} +mov.b32 r1921, {rs293, rs293}; +{ +mul.f16x2 r1919, r73, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs294, fd296; +} +mov.b32 r1927, {rs294, rs294}; +{ +mul.f16x2 r1925, r82, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs295, fd295; +} +mov.b32 r1933, {rs295, rs295}; +{ +mul.f16x2 
r1931, r76, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs296, fd296; +} +mov.b32 r1939, {rs296, rs296}; +{ +mul.f16x2 r1937, r79, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs297, fd307; +} +mov.b32 r1945, {rs297, rs297}; +{ +mul.f16x2 r1943, r85, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs298, fd308; +} +mov.b32 r1951, {rs298, rs298}; +{ +mul.f16x2 r1949, r94, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs299, fd307; +} +mov.b32 r1957, {rs299, rs299}; +{ +mul.f16x2 r1955, r88, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs300, fd308; +} +mov.b32 r1963, {rs300, rs300}; +{ +mul.f16x2 r1961, r91, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs301, fd319; +} +mov.b32 r1969, {rs301, rs301}; +{ +mul.f16x2 r1967, r97, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs302, fd320; +} +mov.b32 r1975, {rs302, rs302}; +{ +mul.f16x2 r1973, r106, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs303, fd319; +} +mov.b32 r1981, {rs303, rs303}; +{ +mul.f16x2 r1979, r100, r1981; +} +{ +add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs304, fd320; +} +mov.b32 r1987, {rs304, rs304}; +{ +mul.f16x2 r1985, r103, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +{ +sub.f16x2 %16, r1970, r1976; +} +{ +add.f16x2 %17, r1982, r1988; +} +{ +add.f16x2 %22, r1970, r1976; +} +{ +sub.f16x2 %23, r1982, r1988; +} +cvt.rn.f16.s32 rs305, r2004; +mov.b32 r2015, {rs305, rs305}; +cvt.rn.f16.s32 rs306, r2004; +mov.b32 r2027, {rs306, rs306}; +{ +cvt.rn.f16.f64 rs307, fd291; +} +mov.b32 r2007, {rs307, rs307}; +{ +mul.f16x2 r2005, r1, r2007; +} +{ +add.f16x2 r2008, %38, r2005; +} +{ +cvt.rn.f16.f64 rs308, fd292; +} +mov.b32 r2013, {rs308, rs308}; +{ +mul.f16x2 r2011, r10, r2013; +} +{ +add.f16x2 r2014, r2015, r2011; +} +{ +cvt.rn.f16.f64 rs309, fd291; +} +mov.b32 r2019, {rs309, rs309}; +{ +mul.f16x2 r2017, r4, r2019; +} +{ 
+add.f16x2 r2020, %39, r2017; +} +{ +cvt.rn.f16.f64 rs310, fd292; +} +mov.b32 r2025, {rs310, rs310}; +{ +mul.f16x2 r2023, r7, r2025; +} +{ +add.f16x2 r2026, r2027, r2023; +} +{ +cvt.rn.f16.f64 rs311, fd295; +} +mov.b32 r2031, {rs311, rs311}; +{ +mul.f16x2 r2029, r13, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs312, fd296; +} +mov.b32 r2037, {rs312, rs312}; +{ +mul.f16x2 r2035, r22, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs313, fd295; +} +mov.b32 r2043, {rs313, rs313}; +{ +mul.f16x2 r2041, r16, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs314, fd296; +} +mov.b32 r2049, {rs314, rs314}; +{ +mul.f16x2 r2047, r19, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 rs315, fd299; +} +mov.b32 r2055, {rs315, rs315}; +{ +mul.f16x2 r2053, r25, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs316, fd300; +} +mov.b32 r2061, {rs316, rs316}; +{ +mul.f16x2 r2059, r34, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs317, fd299; +} +mov.b32 r2067, {rs317, rs317}; +{ +mul.f16x2 r2065, r28, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; +} +{ +cvt.rn.f16.f64 rs318, fd300; +} +mov.b32 r2073, {rs318, rs318}; +{ +mul.f16x2 r2071, r31, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs319, fd303; +} +mov.b32 r2079, {rs319, rs319}; +{ +mul.f16x2 r2077, r37, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs320, fd304; +} +mov.b32 r2085, {rs320, rs320}; +{ +mul.f16x2 r2083, r46, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs321, fd303; +} +mov.b32 r2091, {rs321, rs321}; +{ +mul.f16x2 r2089, r40, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs322, fd304; +} +mov.b32 r2097, {rs322, rs322}; +{ +mul.f16x2 r2095, r43, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +cvt.rn.f16.f64 rs323, fd307; +} +mov.b32 r2103, {rs323, rs323}; +{ +mul.f16x2 r2101, r49, r2103; +} +{ +add.f16x2 
r2104, r2080, r2101; +} +{ +cvt.rn.f16.f64 rs324, fd308; +} +mov.b32 r2109, {rs324, rs324}; +{ +mul.f16x2 r2107, r58, r2109; +} +{ +add.f16x2 r2110, r2086, r2107; +} +{ +cvt.rn.f16.f64 rs325, fd307; +} +mov.b32 r2115, {rs325, rs325}; +{ +mul.f16x2 r2113, r52, r2115; +} +{ +add.f16x2 r2116, r2092, r2113; +} +{ +cvt.rn.f16.f64 rs326, fd308; +} +mov.b32 r2121, {rs326, rs326}; +{ +mul.f16x2 r2119, r55, r2121; +} +{ +add.f16x2 r2122, r2098, r2119; +} +{ +cvt.rn.f16.f64 rs327, fd311; +} +mov.b32 r2127, {rs327, rs327}; +{ +mul.f16x2 r2125, r61, r2127; +} +{ +add.f16x2 r2128, r2104, r2125; +} +{ +cvt.rn.f16.f64 rs328, fd312; +} +mov.b32 r2133, {rs328, rs328}; +{ +mul.f16x2 r2131, r70, r2133; +} +{ +add.f16x2 r2134, r2110, r2131; +} +{ +cvt.rn.f16.f64 rs329, fd311; +} +mov.b32 r2139, {rs329, rs329}; +{ +mul.f16x2 r2137, r64, r2139; +} +{ +add.f16x2 r2140, r2116, r2137; +} +{ +cvt.rn.f16.f64 rs330, fd312; +} +mov.b32 r2145, {rs330, rs330}; +{ +mul.f16x2 r2143, r67, r2145; +} +{ +add.f16x2 r2146, r2122, r2143; +} +{ +cvt.rn.f16.f64 rs331, fd315; +} +mov.b32 r2151, {rs331, rs331}; +{ +mul.f16x2 r2149, r73, r2151; +} +{ +add.f16x2 r2152, r2128, r2149; +} +{ +cvt.rn.f16.f64 rs332, fd316; +} +mov.b32 r2157, {rs332, rs332}; +{ +mul.f16x2 r2155, r82, r2157; +} +{ +add.f16x2 r2158, r2134, r2155; +} +{ +cvt.rn.f16.f64 rs333, fd315; +} +mov.b32 r2163, {rs333, rs333}; +{ +mul.f16x2 r2161, r76, r2163; +} +{ +add.f16x2 r2164, r2140, r2161; +} +{ +cvt.rn.f16.f64 rs334, fd316; +} +mov.b32 r2169, {rs334, rs334}; +{ +mul.f16x2 r2167, r79, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs335, fd319; +} +mov.b32 r2175, {rs335, rs335}; +{ +mul.f16x2 r2173, r85, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs336, fd320; +} +mov.b32 r2181, {rs336, rs336}; +{ +mul.f16x2 r2179, r94, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs337, fd319; +} +mov.b32 r2187, {rs337, rs337}; +{ +mul.f16x2 r2185, r88, r2187; +} +{ +add.f16x2 r2188, 
r2164, r2185; +} +{ +cvt.rn.f16.f64 rs338, fd320; +} +mov.b32 r2193, {rs338, rs338}; +{ +mul.f16x2 r2191, r91, r2193; +} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs339, fd323; +} +mov.b32 r2199, {rs339, rs339}; +{ +mul.f16x2 r2197, r97, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs340, fd324; +} +mov.b32 r2205, {rs340, rs340}; +{ +mul.f16x2 r2203, r106, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs341, fd323; +} +mov.b32 r2211, {rs341, rs341}; +{ +mul.f16x2 r2209, r100, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs342, fd324; +} +mov.b32 r2217, {rs342, rs342}; +{ +mul.f16x2 r2215, r103, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +sub.f16x2 %18, r2200, r2206; +} +{ +add.f16x2 %19, r2212, r2218; +} +{ +add.f16x2 %20, r2200, r2206; +} +{ +sub.f16x2 %21, r2212, r2218; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), 
"=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..641f56a69c640 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_fwd.hpp.inc @@ -0,0 +1,430 @@ +#ifndef CUFFTDX_FFT_19_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_19_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<5, float, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<550>; +.reg .b64 rd<4>; +add.f32 f77, %40, %75; +sub.f32 f79, %40, %75; +add.f32 f549, %42, %76; +sub.f32 f80, %42, %76; +add.f32 f81, %43, %73; +sub.f32 f83, %43, %73; +add.f32 f546, %77, %78; +sub.f32 f84, %77, %78; +add.f32 f85, %45, %71; +sub.f32 f87, %45, %71; +add.f32 f544, %46, %79; +sub.f32 f88, %46, %79; +add.f32 f89, %47, %69; +sub.f32 f91, %47, %69; +add.f32 f542, %80, %70; +sub.f32 f92, %80, %70; +add.f32 f93, %49, %67; +sub.f32 f95, %49, %67; +add.f32 f539, %81, %82; +sub.f32 f96, %81, %82; +add.f32 f97, %51, %65; +sub.f32 f99, %51, %65; +add.f32 f537, %52, %83; +sub.f32 f100, %52, %83; +add.f32 f101, %53, %63; +sub.f32 f103, %53, %63; +add.f32 f535, %84, %64; +sub.f32 f104, %84, %64; +add.f32 f105, %55, %61; +sub.f32 f107, %55, %61; +add.f32 f532, %85, %86; +sub.f32 f108, %85, %86; +add.f32 f109, %57, %59; +sub.f32 f111, %57, %59; +add.f32 f530, %58, %87; +sub.f32 f112, %58, %87; +add.f32 f113, %38, f77; +add.f32 f115, f113, f81; +add.f32 f529, %39, f549; +add.f32 f116, f529, f546; +add.f32 f117, f115, f85; +add.f32 f118, f116, f544; +add.f32 f119, f117, f89; +add.f32 f120, f118, f542; +add.f32 f121, f119, f93; +add.f32 f122, f120, f539; +add.f32 f123, f121, f97; +add.f32 f124, f122, f537; +add.f32 f125, f123, f101; +add.f32 f126, f124, f535; +add.f32 f127, f125, f105; +add.f32 f128, f126, f532; +fma.rn.f32 f129, f77, 0f3F722114, %38; +fma.rn.f32 f133, f81, 0f3F4A051D, f129; +fma.rn.f32 f528, f80, 0fBEA63F02, 0f00000000; +fma.rn.f32 f134, f84, 0fBF1D3D0B, f528; +fma.rn.f32 f527, f549, 0f3F722114, %39; +fma.rn.f32 f135, f546, 0f3F4A051D, f527; +fma.rn.f32 f526, f79, 0fBEA63F02, 0f00000000; +fma.rn.f32 f136, f83, 0fBF1D3D0B, f526; +fma.rn.f32 f137, f85, 0f3F0C04CB, f133; +fma.rn.f32 f138, f88, 0fBF56508B, f134; +fma.rn.f32 f139, f544, 0f3F0C04CB, f135; +fma.rn.f32 f140, f87, 0fBF56508B, f136; +fma.rn.f32 f141, f89, 0f3E7B608C, f137; +fma.rn.f32 f142, f92, 0fBF782A9E, 
f138; +fma.rn.f32 f143, f542, 0f3E7B608C, f139; +fma.rn.f32 f144, f91, 0fBF782A9E, f140; +fma.rn.f32 f145, f93, 0fBDA91F5C, f141; +fma.rn.f32 f146, f96, 0fBF7F2029, f142; +fma.rn.f32 f147, f539, 0fBDA91F5C, f143; +fma.rn.f32 f148, f95, 0fBF7F2029, f144; +fma.rn.f32 f149, f97, 0fBECDAB06, f145; +fma.rn.f32 f150, f100, 0fBF6A701F, f146; +fma.rn.f32 f151, f537, 0fBECDAB06, f147; +fma.rn.f32 f152, f99, 0fBF6A701F, f148; +fma.rn.f32 f153, f101, 0fBF2D6253, f149; +fma.rn.f32 f154, f104, 0fBF3C5867, f150; +fma.rn.f32 f155, f535, 0fBF2D6253, f151; +fma.rn.f32 f156, f103, 0fBF3C5867, f152; +fma.rn.f32 f157, f105, 0fBF612531, f153; +fma.rn.f32 f158, f108, 0fBEF3AF60, f154; +fma.rn.f32 f159, f532, 0fBF612531, f155; +fma.rn.f32 f160, f107, 0fBEF3AF60, f156; +fma.rn.f32 f161, f109, 0fBF7C822D, f157; +fma.rn.f32 f162, f112, 0fBE288B7C, f158; +fma.rn.f32 f163, f530, 0fBF7C822D, f159; +fma.rn.f32 f164, f111, 0fBE288B7C, f160; +fma.rn.f32 f165, f77, 0f3F4A051D, %38; +fma.rn.f32 f169, f81, 0f3E7B608C, f165; +fma.rn.f32 f525, f80, 0fBF1D3D0B, 0f00000000; +fma.rn.f32 f170, f84, 0fBF782A9E, f525; +fma.rn.f32 f524, f549, 0f3F4A051D, %39; +fma.rn.f32 f171, f546, 0f3E7B608C, f524; +fma.rn.f32 f523, f79, 0fBF1D3D0B, 0f00000000; +fma.rn.f32 f172, f83, 0fBF782A9E, f523; +fma.rn.f32 f173, f85, 0fBECDAB06, f169; +fma.rn.f32 f174, f88, 0fBF6A701F, f170; +fma.rn.f32 f175, f544, 0fBECDAB06, f171; +fma.rn.f32 f176, f87, 0fBF6A701F, f172; +fma.rn.f32 f177, f89, 0fBF612531, f173; +fma.rn.f32 f178, f92, 0fBEF3AF60, f174; +fma.rn.f32 f179, f542, 0fBF612531, f175; +fma.rn.f32 f180, f91, 0fBEF3AF60, f176; +fma.rn.f32 f181, f93, 0fBF7C822D, f177; +fma.rn.f32 f182, f96, 0f3E288B7C, f178; +fma.rn.f32 f183, f539, 0fBF7C822D, f179; +fma.rn.f32 f184, f95, 0f3E288B7C, f180; +fma.rn.f32 f185, f97, 0fBF2D6253, f181; +fma.rn.f32 f186, f100, 0f3F3C5867, f182; +fma.rn.f32 f187, f537, 0fBF2D6253, f183; +fma.rn.f32 f188, f99, 0f3F3C5867, f184; +fma.rn.f32 f189, f101, 0fBDA91F5C, f185; +fma.rn.f32 f190, f104, 
0f3F7F2029, f186; +fma.rn.f32 f191, f535, 0fBDA91F5C, f187; +fma.rn.f32 f192, f103, 0f3F7F2029, f188; +fma.rn.f32 f193, f105, 0f3F0C04CB, f189; +fma.rn.f32 f194, f108, 0f3F56508B, f190; +fma.rn.f32 f195, f532, 0f3F0C04CB, f191; +fma.rn.f32 f196, f107, 0f3F56508B, f192; +fma.rn.f32 f197, f109, 0f3F722114, f193; +fma.rn.f32 f198, f112, 0f3EA63F02, f194; +fma.rn.f32 f199, f530, 0f3F722114, f195; +fma.rn.f32 f200, f111, 0f3EA63F02, f196; +fma.rn.f32 f201, f77, 0f3F0C04CB, %38; +fma.rn.f32 f205, f81, 0fBECDAB06, f201; +fma.rn.f32 f522, f80, 0fBF56508B, 0f00000000; +fma.rn.f32 f206, f84, 0fBF6A701F, f522; +fma.rn.f32 f521, f549, 0f3F0C04CB, %39; +fma.rn.f32 f207, f546, 0fBECDAB06, f521; +fma.rn.f32 f520, f79, 0fBF56508B, 0f00000000; +fma.rn.f32 f208, f83, 0fBF6A701F, f520; +fma.rn.f32 f209, f85, 0fBF7C822D, f205; +fma.rn.f32 f210, f88, 0fBE288B7C, f206; +fma.rn.f32 f211, f544, 0fBF7C822D, f207; +fma.rn.f32 f212, f87, 0fBE288B7C, f208; +fma.rn.f32 f213, f89, 0fBF2D6253, f209; +fma.rn.f32 f214, f92, 0f3F3C5867, f210; +fma.rn.f32 f215, f542, 0fBF2D6253, f211; +fma.rn.f32 f216, f91, 0f3F3C5867, f212; +fma.rn.f32 f217, f93, 0f3E7B608C, f213; +fma.rn.f32 f218, f96, 0f3F782A9E, f214; +fma.rn.f32 f219, f539, 0f3E7B608C, f215; +fma.rn.f32 f220, f95, 0f3F782A9E, f216; +fma.rn.f32 f221, f97, 0f3F722114, f217; +fma.rn.f32 f222, f100, 0f3EA63F02, f218; +fma.rn.f32 f223, f537, 0f3F722114, f219; +fma.rn.f32 f224, f99, 0f3EA63F02, f220; +fma.rn.f32 f225, f101, 0f3F4A051D, f221; +fma.rn.f32 f226, f104, 0fBF1D3D0B, f222; +fma.rn.f32 f227, f535, 0f3F4A051D, f223; +fma.rn.f32 f228, f103, 0fBF1D3D0B, f224; +fma.rn.f32 f229, f105, 0fBDA91F5C, f225; +fma.rn.f32 f230, f108, 0fBF7F2029, f226; +fma.rn.f32 f231, f532, 0fBDA91F5C, f227; +fma.rn.f32 f232, f107, 0fBF7F2029, f228; +fma.rn.f32 f233, f109, 0fBF612531, f229; +fma.rn.f32 f234, f112, 0fBEF3AF60, f230; +fma.rn.f32 f235, f530, 0fBF612531, f231; +fma.rn.f32 f236, f111, 0fBEF3AF60, f232; +fma.rn.f32 f237, f77, 0f3E7B608C, %38; +fma.rn.f32 
f241, f81, 0fBF612531, f237; +fma.rn.f32 f519, f80, 0fBF782A9E, 0f00000000; +fma.rn.f32 f242, f84, 0fBEF3AF60, f519; +fma.rn.f32 f518, f549, 0f3E7B608C, %39; +fma.rn.f32 f243, f546, 0fBF612531, f518; +fma.rn.f32 f517, f79, 0fBF782A9E, 0f00000000; +fma.rn.f32 f244, f83, 0fBEF3AF60, f517; +fma.rn.f32 f245, f85, 0fBF2D6253, f241; +fma.rn.f32 f246, f88, 0f3F3C5867, f242; +fma.rn.f32 f247, f544, 0fBF2D6253, f243; +fma.rn.f32 f248, f87, 0f3F3C5867, f244; +fma.rn.f32 f249, f89, 0f3F0C04CB, f245; +fma.rn.f32 f250, f92, 0f3F56508B, f246; +fma.rn.f32 f251, f542, 0f3F0C04CB, f247; +fma.rn.f32 f252, f91, 0f3F56508B, f248; +fma.rn.f32 f253, f93, 0f3F722114, f249; +fma.rn.f32 f254, f96, 0fBEA63F02, f250; +fma.rn.f32 f255, f539, 0f3F722114, f251; +fma.rn.f32 f256, f95, 0fBEA63F02, f252; +fma.rn.f32 f257, f97, 0fBDA91F5C, f253; +fma.rn.f32 f258, f100, 0fBF7F2029, f254; +fma.rn.f32 f259, f537, 0fBDA91F5C, f255; +fma.rn.f32 f260, f99, 0fBF7F2029, f256; +fma.rn.f32 f261, f101, 0fBF7C822D, f257; +fma.rn.f32 f262, f104, 0fBE288B7C, f258; +fma.rn.f32 f263, f535, 0fBF7C822D, f259; +fma.rn.f32 f264, f103, 0fBE288B7C, f260; +fma.rn.f32 f265, f105, 0fBECDAB06, f261; +fma.rn.f32 f266, f108, 0f3F6A701F, f262; +fma.rn.f32 f267, f532, 0fBECDAB06, f263; +fma.rn.f32 f268, f107, 0f3F6A701F, f264; +fma.rn.f32 f269, f109, 0f3F4A051D, f265; +fma.rn.f32 f270, f112, 0f3F1D3D0B, f266; +fma.rn.f32 f271, f530, 0f3F4A051D, f267; +fma.rn.f32 f272, f111, 0f3F1D3D0B, f268; +fma.rn.f32 f273, f77, 0fBDA91F5C, %38; +fma.rn.f32 f277, f81, 0fBF7C822D, f273; +fma.rn.f32 f516, f80, 0fBF7F2029, 0f00000000; +fma.rn.f32 f278, f84, 0f3E288B7C, f516; +fma.rn.f32 f515, f549, 0fBDA91F5C, %39; +fma.rn.f32 f279, f546, 0fBF7C822D, f515; +fma.rn.f32 f514, f79, 0fBF7F2029, 0f00000000; +fma.rn.f32 f280, f83, 0f3E288B7C, f514; +fma.rn.f32 f281, f85, 0f3E7B608C, f277; +fma.rn.f32 f282, f88, 0f3F782A9E, f278; +fma.rn.f32 f283, f544, 0f3E7B608C, f279; +fma.rn.f32 f284, f87, 0f3F782A9E, f280; +fma.rn.f32 f285, f89, 0f3F722114, f281; 
+fma.rn.f32 f286, f92, 0fBEA63F02, f282; +fma.rn.f32 f287, f542, 0f3F722114, f283; +fma.rn.f32 f288, f91, 0fBEA63F02, f284; +fma.rn.f32 f289, f93, 0fBECDAB06, f285; +fma.rn.f32 f290, f96, 0fBF6A701F, f286; +fma.rn.f32 f291, f539, 0fBECDAB06, f287; +fma.rn.f32 f292, f95, 0fBF6A701F, f288; +fma.rn.f32 f293, f97, 0fBF612531, f289; +fma.rn.f32 f294, f100, 0f3EF3AF60, f290; +fma.rn.f32 f295, f537, 0fBF612531, f291; +fma.rn.f32 f296, f99, 0f3EF3AF60, f292; +fma.rn.f32 f297, f101, 0f3F0C04CB, f293; +fma.rn.f32 f298, f104, 0f3F56508B, f294; +fma.rn.f32 f299, f535, 0f3F0C04CB, f295; +fma.rn.f32 f300, f103, 0f3F56508B, f296; +fma.rn.f32 f301, f105, 0f3F4A051D, f297; +fma.rn.f32 f302, f108, 0fBF1D3D0B, f298; +fma.rn.f32 f303, f532, 0f3F4A051D, f299; +fma.rn.f32 f304, f107, 0fBF1D3D0B, f300; +fma.rn.f32 f305, f109, 0fBF2D6253, f301; +fma.rn.f32 f306, f112, 0fBF3C5867, f302; +fma.rn.f32 f307, f530, 0fBF2D6253, f303; +fma.rn.f32 f308, f111, 0fBF3C5867, f304; +fma.rn.f32 f309, f77, 0fBECDAB06, %38; +fma.rn.f32 f313, f81, 0fBF2D6253, f309; +fma.rn.f32 f513, f80, 0fBF6A701F, 0f00000000; +fma.rn.f32 f314, f84, 0f3F3C5867, f513; +fma.rn.f32 f512, f549, 0fBECDAB06, %39; +fma.rn.f32 f315, f546, 0fBF2D6253, f512; +fma.rn.f32 f511, f79, 0fBF6A701F, 0f00000000; +fma.rn.f32 f316, f83, 0f3F3C5867, f511; +fma.rn.f32 f317, f85, 0f3F722114, f313; +fma.rn.f32 f318, f88, 0f3EA63F02, f314; +fma.rn.f32 f319, f544, 0f3F722114, f315; +fma.rn.f32 f320, f87, 0f3EA63F02, f316; +fma.rn.f32 f321, f89, 0fBDA91F5C, f317; +fma.rn.f32 f322, f92, 0fBF7F2029, f318; +fma.rn.f32 f323, f542, 0fBDA91F5C, f319; +fma.rn.f32 f324, f91, 0fBF7F2029, f320; +fma.rn.f32 f325, f93, 0fBF612531, f321; +fma.rn.f32 f326, f96, 0f3EF3AF60, f322; +fma.rn.f32 f327, f539, 0fBF612531, f323; +fma.rn.f32 f328, f95, 0f3EF3AF60, f324; +fma.rn.f32 f329, f97, 0f3F4A051D, f325; +fma.rn.f32 f330, f100, 0f3F1D3D0B, f326; +fma.rn.f32 f331, f537, 0f3F4A051D, f327; +fma.rn.f32 f332, f99, 0f3F1D3D0B, f328; +fma.rn.f32 f333, f101, 0f3E7B608C, 
f329; +fma.rn.f32 f334, f104, 0fBF782A9E, f330; +fma.rn.f32 f335, f535, 0f3E7B608C, f331; +fma.rn.f32 f336, f103, 0fBF782A9E, f332; +fma.rn.f32 f337, f105, 0fBF7C822D, f333; +fma.rn.f32 f338, f108, 0f3E288B7C, f334; +fma.rn.f32 f339, f532, 0fBF7C822D, f335; +fma.rn.f32 f340, f107, 0f3E288B7C, f336; +fma.rn.f32 f341, f109, 0f3F0C04CB, f337; +fma.rn.f32 f342, f112, 0f3F56508B, f338; +fma.rn.f32 f343, f530, 0f3F0C04CB, f339; +fma.rn.f32 f344, f111, 0f3F56508B, f340; +fma.rn.f32 f345, f77, 0fBF2D6253, %38; +fma.rn.f32 f349, f81, 0fBDA91F5C, f345; +fma.rn.f32 f510, f80, 0fBF3C5867, 0f00000000; +fma.rn.f32 f350, f84, 0f3F7F2029, f510; +fma.rn.f32 f509, f549, 0fBF2D6253, %39; +fma.rn.f32 f351, f546, 0fBDA91F5C, f509; +fma.rn.f32 f508, f79, 0fBF3C5867, 0f00000000; +fma.rn.f32 f352, f83, 0f3F7F2029, f508; +fma.rn.f32 f353, f85, 0f3F4A051D, f349; +fma.rn.f32 f354, f88, 0fBF1D3D0B, f350; +fma.rn.f32 f355, f544, 0f3F4A051D, f351; +fma.rn.f32 f356, f87, 0fBF1D3D0B, f352; +fma.rn.f32 f357, f89, 0fBF7C822D, f353; +fma.rn.f32 f358, f92, 0fBE288B7C, f354; +fma.rn.f32 f359, f542, 0fBF7C822D, f355; +fma.rn.f32 f360, f91, 0fBE288B7C, f356; +fma.rn.f32 f361, f93, 0f3F0C04CB, f357; +fma.rn.f32 f362, f96, 0f3F56508B, f358; +fma.rn.f32 f363, f539, 0f3F0C04CB, f359; +fma.rn.f32 f364, f95, 0f3F56508B, f360; +fma.rn.f32 f365, f97, 0f3E7B608C, f361; +fma.rn.f32 f366, f100, 0fBF782A9E, f362; +fma.rn.f32 f367, f537, 0f3E7B608C, f363; +fma.rn.f32 f368, f99, 0fBF782A9E, f364; +fma.rn.f32 f369, f101, 0fBF612531, f365; +fma.rn.f32 f370, f104, 0f3EF3AF60, f366; +fma.rn.f32 f371, f535, 0fBF612531, f367; +fma.rn.f32 f372, f103, 0f3EF3AF60, f368; +fma.rn.f32 f373, f105, 0f3F722114, f369; +fma.rn.f32 f374, f108, 0f3EA63F02, f370; +fma.rn.f32 f375, f532, 0f3F722114, f371; +fma.rn.f32 f376, f107, 0f3EA63F02, f372; +fma.rn.f32 f377, f109, 0fBECDAB06, f373; +fma.rn.f32 f378, f112, 0fBF6A701F, f374; +fma.rn.f32 f379, f530, 0fBECDAB06, f375; +fma.rn.f32 f380, f111, 0fBF6A701F, f376; +fma.rn.f32 f381, f77, 
0fBF612531, %38; +fma.rn.f32 f385, f81, 0f3F0C04CB, f381; +fma.rn.f32 f507, f80, 0fBEF3AF60, 0f00000000; +fma.rn.f32 f386, f84, 0f3F56508B, f507; +fma.rn.f32 f506, f549, 0fBF612531, %39; +fma.rn.f32 f387, f546, 0f3F0C04CB, f506; +fma.rn.f32 f505, f79, 0fBEF3AF60, 0f00000000; +fma.rn.f32 f388, f83, 0f3F56508B, f505; +fma.rn.f32 f389, f85, 0fBDA91F5C, f385; +fma.rn.f32 f390, f88, 0fBF7F2029, f386; +fma.rn.f32 f391, f544, 0fBDA91F5C, f387; +fma.rn.f32 f392, f87, 0fBF7F2029, f388; +fma.rn.f32 f393, f89, 0fBECDAB06, f389; +fma.rn.f32 f394, f92, 0f3F6A701F, f390; +fma.rn.f32 f395, f542, 0fBECDAB06, f391; +fma.rn.f32 f396, f91, 0f3F6A701F, f392; +fma.rn.f32 f397, f93, 0f3F4A051D, f393; +fma.rn.f32 f398, f96, 0fBF1D3D0B, f394; +fma.rn.f32 f399, f539, 0f3F4A051D, f395; +fma.rn.f32 f400, f95, 0fBF1D3D0B, f396; +fma.rn.f32 f401, f97, 0fBF7C822D, f397; +fma.rn.f32 f402, f100, 0f3E288B7C, f398; +fma.rn.f32 f403, f537, 0fBF7C822D, f399; +fma.rn.f32 f404, f99, 0f3E288B7C, f400; +fma.rn.f32 f405, f101, 0f3F722114, f401; +fma.rn.f32 f406, f104, 0f3EA63F02, f402; +fma.rn.f32 f407, f535, 0f3F722114, f403; +fma.rn.f32 f408, f103, 0f3EA63F02, f404; +fma.rn.f32 f409, f105, 0fBF2D6253, f405; +fma.rn.f32 f410, f108, 0fBF3C5867, f406; +fma.rn.f32 f411, f532, 0fBF2D6253, f407; +fma.rn.f32 f412, f107, 0fBF3C5867, f408; +fma.rn.f32 f413, f109, 0f3E7B608C, f409; +fma.rn.f32 f414, f112, 0f3F782A9E, f410; +fma.rn.f32 f415, f530, 0f3E7B608C, f411; +fma.rn.f32 f416, f111, 0f3F782A9E, f412; +fma.rn.f32 f417, f77, 0fBF7C822D, %38; +fma.rn.f32 f418, f80, 0fBE288B7C, 0f00000000; +fma.rn.f32 f419, f549, 0fBF7C822D, %39; +fma.rn.f32 f420, f79, 0fBE288B7C, 0f00000000; +fma.rn.f32 f421, f81, 0f3F722114, f417; +fma.rn.f32 f422, f84, 0f3EA63F02, f418; +fma.rn.f32 f423, f546, 0f3F722114, f419; +fma.rn.f32 f424, f83, 0f3EA63F02, f420; +fma.rn.f32 f425, f85, 0fBF612531, f421; +fma.rn.f32 f426, f88, 0fBEF3AF60, f422; +fma.rn.f32 f427, f544, 0fBF612531, f423; +fma.rn.f32 f428, f87, 0fBEF3AF60, f424; +fma.rn.f32 
f429, f89, 0f3F4A051D, f425; +fma.rn.f32 f430, f92, 0f3F1D3D0B, f426; +fma.rn.f32 f431, f542, 0f3F4A051D, f427; +fma.rn.f32 f432, f91, 0f3F1D3D0B, f428; +fma.rn.f32 f433, f93, 0fBF2D6253, f429; +fma.rn.f32 f434, f96, 0fBF3C5867, f430; +fma.rn.f32 f435, f539, 0fBF2D6253, f431; +fma.rn.f32 f436, f95, 0fBF3C5867, f432; +fma.rn.f32 f437, f97, 0f3F0C04CB, f433; +fma.rn.f32 f438, f100, 0f3F56508B, f434; +fma.rn.f32 f439, f537, 0f3F0C04CB, f435; +fma.rn.f32 f440, f99, 0f3F56508B, f436; +fma.rn.f32 f441, f101, 0fBECDAB06, f437; +fma.rn.f32 f442, f104, 0fBF6A701F, f438; +fma.rn.f32 f443, f535, 0fBECDAB06, f439; +fma.rn.f32 f444, f103, 0fBF6A701F, f440; +fma.rn.f32 f445, f105, 0f3E7B608C, f441; +fma.rn.f32 f446, f108, 0f3F782A9E, f442; +fma.rn.f32 f447, f532, 0f3E7B608C, f443; +fma.rn.f32 f448, f107, 0f3F782A9E, f444; +fma.rn.f32 f449, f109, 0fBDA91F5C, f445; +fma.rn.f32 f450, f112, 0fBF7F2029, f446; +fma.rn.f32 f451, f530, 0fBDA91F5C, f447; +fma.rn.f32 f452, f111, 0fBF7F2029, f448; +add.f32 %1, f128, f530; +add.f32 %0, f127, f109; +add.f32 %3, f163, f164; +sub.f32 %2, f161, f162; +add.f32 %5, f199, f200; +sub.f32 %4, f197, f198; +add.f32 %7, f235, f236; +sub.f32 %6, f233, f234; +sub.f32 %8, f269, f270; +add.f32 %9, f271, f272; +sub.f32 %10, f305, f306; +add.f32 %11, f307, f308; +sub.f32 %12, f341, f342; +add.f32 %13, f343, f344; +sub.f32 %14, f377, f378; +add.f32 %15, f379, f380; +add.f32 %17, f415, f416; +sub.f32 %16, f413, f414; +add.f32 %19, f451, f452; +sub.f32 %18, f449, f450; +sub.f32 %21, f451, f452; +add.f32 %20, f449, f450; +sub.f32 %23, f415, f416; +add.f32 %22, f413, f414; +sub.f32 %25, f379, f380; +add.f32 %24, f377, f378; +sub.f32 %27, f343, f344; +add.f32 %26, f341, f342; +sub.f32 %29, f307, f308; +add.f32 %28, f305, f306; +sub.f32 %31, f271, f272; +add.f32 %30, f269, f270; +sub.f32 %33, f235, f236; +add.f32 %32, f233, f234; +sub.f32 %35, f199, f200; +add.f32 %34, f197, f198; +sub.f32 %37, f163, f164; +add.f32 %36, f161, f162; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[2].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[14].y), "f"(rmem[13].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[11].y), "f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..487d9c6df06cd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp32_inv.hpp.inc @@ -0,0 +1,430 @@ +#ifndef 
CUFFTDX_FFT_19_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_19_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<207, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<550>; +.reg .b64 rd<4>; +add.f32 f77, %40, %75; +sub.f32 f79, %40, %75; +add.f32 f549, %42, %76; +sub.f32 f80, %42, %76; +add.f32 f81, %43, %73; +sub.f32 f83, %43, %73; +add.f32 f546, %77, %78; +sub.f32 f84, %77, %78; +add.f32 f85, %45, %71; +sub.f32 f87, %45, %71; +add.f32 f544, %46, %79; +sub.f32 f88, %46, %79; +add.f32 f89, %47, %69; +sub.f32 f91, %47, %69; +add.f32 f542, %80, %70; +sub.f32 f92, %80, %70; +add.f32 f93, %49, %67; +sub.f32 f95, %49, %67; +add.f32 f539, %81, %82; +sub.f32 f96, %81, %82; +add.f32 f97, %51, %65; +sub.f32 f99, %51, %65; +add.f32 f537, %52, %83; +sub.f32 f100, %52, %83; +add.f32 f101, %53, %63; +sub.f32 f103, %53, %63; +add.f32 f535, %84, %64; +sub.f32 f104, %84, %64; +add.f32 f105, %55, %61; +sub.f32 f107, %55, %61; +add.f32 f532, %85, %86; +sub.f32 f108, %85, %86; +add.f32 f109, %57, %59; +sub.f32 f111, %57, %59; +add.f32 f530, %58, %87; +sub.f32 f112, %58, %87; +add.f32 f113, %38, f77; +add.f32 f115, f113, f81; +add.f32 f529, %39, f549; +add.f32 f116, f529, f546; +add.f32 f117, f115, f85; +add.f32 f118, f116, f544; +add.f32 f119, f117, f89; +add.f32 f120, f118, f542; +add.f32 f121, f119, f93; +add.f32 f122, f120, f539; +add.f32 f123, f121, f97; +add.f32 f124, f122, f537; +add.f32 f125, f123, f101; +add.f32 f126, f124, f535; +add.f32 f127, f125, f105; +add.f32 f128, f126, f532; +fma.rn.f32 f129, f77, 0f3F722114, %38; +fma.rn.f32 f133, f81, 0f3F4A051D, f129; +fma.rn.f32 f528, f80, 0f3EA63F02, 0f00000000; +fma.rn.f32 f134, f84, 0f3F1D3D0B, f528; +fma.rn.f32 f527, f549, 0f3F722114, %39; +fma.rn.f32 f135, f546, 0f3F4A051D, f527; +fma.rn.f32 f526, f79, 0f3EA63F02, 0f00000000; +fma.rn.f32 f136, f83, 0f3F1D3D0B, f526; +fma.rn.f32 f137, f85, 0f3F0C04CB, f133; +fma.rn.f32 f138, f88, 0f3F56508B, f134; 
+fma.rn.f32 f139, f544, 0f3F0C04CB, f135; +fma.rn.f32 f140, f87, 0f3F56508B, f136; +fma.rn.f32 f141, f89, 0f3E7B608C, f137; +fma.rn.f32 f142, f92, 0f3F782A9E, f138; +fma.rn.f32 f143, f542, 0f3E7B608C, f139; +fma.rn.f32 f144, f91, 0f3F782A9E, f140; +fma.rn.f32 f145, f93, 0fBDA91F5C, f141; +fma.rn.f32 f146, f96, 0f3F7F2029, f142; +fma.rn.f32 f147, f539, 0fBDA91F5C, f143; +fma.rn.f32 f148, f95, 0f3F7F2029, f144; +fma.rn.f32 f149, f97, 0fBECDAB06, f145; +fma.rn.f32 f150, f100, 0f3F6A701F, f146; +fma.rn.f32 f151, f537, 0fBECDAB06, f147; +fma.rn.f32 f152, f99, 0f3F6A701F, f148; +fma.rn.f32 f153, f101, 0fBF2D6253, f149; +fma.rn.f32 f154, f104, 0f3F3C5867, f150; +fma.rn.f32 f155, f535, 0fBF2D6253, f151; +fma.rn.f32 f156, f103, 0f3F3C5867, f152; +fma.rn.f32 f157, f105, 0fBF612531, f153; +fma.rn.f32 f158, f108, 0f3EF3AF60, f154; +fma.rn.f32 f159, f532, 0fBF612531, f155; +fma.rn.f32 f160, f107, 0f3EF3AF60, f156; +fma.rn.f32 f161, f109, 0fBF7C822D, f157; +fma.rn.f32 f162, f112, 0f3E288B7C, f158; +fma.rn.f32 f163, f530, 0fBF7C822D, f159; +fma.rn.f32 f164, f111, 0f3E288B7C, f160; +fma.rn.f32 f165, f77, 0f3F4A051D, %38; +fma.rn.f32 f169, f81, 0f3E7B608C, f165; +fma.rn.f32 f525, f80, 0f3F1D3D0B, 0f00000000; +fma.rn.f32 f170, f84, 0f3F782A9E, f525; +fma.rn.f32 f524, f549, 0f3F4A051D, %39; +fma.rn.f32 f171, f546, 0f3E7B608C, f524; +fma.rn.f32 f523, f79, 0f3F1D3D0B, 0f00000000; +fma.rn.f32 f172, f83, 0f3F782A9E, f523; +fma.rn.f32 f173, f85, 0fBECDAB06, f169; +fma.rn.f32 f174, f88, 0f3F6A701F, f170; +fma.rn.f32 f175, f544, 0fBECDAB06, f171; +fma.rn.f32 f176, f87, 0f3F6A701F, f172; +fma.rn.f32 f177, f89, 0fBF612531, f173; +fma.rn.f32 f178, f92, 0f3EF3AF60, f174; +fma.rn.f32 f179, f542, 0fBF612531, f175; +fma.rn.f32 f180, f91, 0f3EF3AF60, f176; +fma.rn.f32 f181, f93, 0fBF7C822D, f177; +fma.rn.f32 f182, f96, 0fBE288B7C, f178; +fma.rn.f32 f183, f539, 0fBF7C822D, f179; +fma.rn.f32 f184, f95, 0fBE288B7C, f180; +fma.rn.f32 f185, f97, 0fBF2D6253, f181; +fma.rn.f32 f186, f100, 0fBF3C5867, 
f182; +fma.rn.f32 f187, f537, 0fBF2D6253, f183; +fma.rn.f32 f188, f99, 0fBF3C5867, f184; +fma.rn.f32 f189, f101, 0fBDA91F5C, f185; +fma.rn.f32 f190, f104, 0fBF7F2029, f186; +fma.rn.f32 f191, f535, 0fBDA91F5C, f187; +fma.rn.f32 f192, f103, 0fBF7F2029, f188; +fma.rn.f32 f193, f105, 0f3F0C04CB, f189; +fma.rn.f32 f194, f108, 0fBF56508B, f190; +fma.rn.f32 f195, f532, 0f3F0C04CB, f191; +fma.rn.f32 f196, f107, 0fBF56508B, f192; +fma.rn.f32 f197, f109, 0f3F722114, f193; +fma.rn.f32 f198, f112, 0fBEA63F02, f194; +fma.rn.f32 f199, f530, 0f3F722114, f195; +fma.rn.f32 f200, f111, 0fBEA63F02, f196; +fma.rn.f32 f201, f77, 0f3F0C04CB, %38; +fma.rn.f32 f205, f81, 0fBECDAB06, f201; +fma.rn.f32 f522, f80, 0f3F56508B, 0f00000000; +fma.rn.f32 f206, f84, 0f3F6A701F, f522; +fma.rn.f32 f521, f549, 0f3F0C04CB, %39; +fma.rn.f32 f207, f546, 0fBECDAB06, f521; +fma.rn.f32 f520, f79, 0f3F56508B, 0f00000000; +fma.rn.f32 f208, f83, 0f3F6A701F, f520; +fma.rn.f32 f209, f85, 0fBF7C822D, f205; +fma.rn.f32 f210, f88, 0f3E288B7C, f206; +fma.rn.f32 f211, f544, 0fBF7C822D, f207; +fma.rn.f32 f212, f87, 0f3E288B7C, f208; +fma.rn.f32 f213, f89, 0fBF2D6253, f209; +fma.rn.f32 f214, f92, 0fBF3C5867, f210; +fma.rn.f32 f215, f542, 0fBF2D6253, f211; +fma.rn.f32 f216, f91, 0fBF3C5867, f212; +fma.rn.f32 f217, f93, 0f3E7B608C, f213; +fma.rn.f32 f218, f96, 0fBF782A9E, f214; +fma.rn.f32 f219, f539, 0f3E7B608C, f215; +fma.rn.f32 f220, f95, 0fBF782A9E, f216; +fma.rn.f32 f221, f97, 0f3F722114, f217; +fma.rn.f32 f222, f100, 0fBEA63F02, f218; +fma.rn.f32 f223, f537, 0f3F722114, f219; +fma.rn.f32 f224, f99, 0fBEA63F02, f220; +fma.rn.f32 f225, f101, 0f3F4A051D, f221; +fma.rn.f32 f226, f104, 0f3F1D3D0B, f222; +fma.rn.f32 f227, f535, 0f3F4A051D, f223; +fma.rn.f32 f228, f103, 0f3F1D3D0B, f224; +fma.rn.f32 f229, f105, 0fBDA91F5C, f225; +fma.rn.f32 f230, f108, 0f3F7F2029, f226; +fma.rn.f32 f231, f532, 0fBDA91F5C, f227; +fma.rn.f32 f232, f107, 0f3F7F2029, f228; +fma.rn.f32 f233, f109, 0fBF612531, f229; +fma.rn.f32 f234, f112, 
0f3EF3AF60, f230; +fma.rn.f32 f235, f530, 0fBF612531, f231; +fma.rn.f32 f236, f111, 0f3EF3AF60, f232; +fma.rn.f32 f237, f77, 0f3E7B608C, %38; +fma.rn.f32 f241, f81, 0fBF612531, f237; +fma.rn.f32 f519, f80, 0f3F782A9E, 0f00000000; +fma.rn.f32 f242, f84, 0f3EF3AF60, f519; +fma.rn.f32 f518, f549, 0f3E7B608C, %39; +fma.rn.f32 f243, f546, 0fBF612531, f518; +fma.rn.f32 f517, f79, 0f3F782A9E, 0f00000000; +fma.rn.f32 f244, f83, 0f3EF3AF60, f517; +fma.rn.f32 f245, f85, 0fBF2D6253, f241; +fma.rn.f32 f246, f88, 0fBF3C5867, f242; +fma.rn.f32 f247, f544, 0fBF2D6253, f243; +fma.rn.f32 f248, f87, 0fBF3C5867, f244; +fma.rn.f32 f249, f89, 0f3F0C04CB, f245; +fma.rn.f32 f250, f92, 0fBF56508B, f246; +fma.rn.f32 f251, f542, 0f3F0C04CB, f247; +fma.rn.f32 f252, f91, 0fBF56508B, f248; +fma.rn.f32 f253, f93, 0f3F722114, f249; +fma.rn.f32 f254, f96, 0f3EA63F02, f250; +fma.rn.f32 f255, f539, 0f3F722114, f251; +fma.rn.f32 f256, f95, 0f3EA63F02, f252; +fma.rn.f32 f257, f97, 0fBDA91F5C, f253; +fma.rn.f32 f258, f100, 0f3F7F2029, f254; +fma.rn.f32 f259, f537, 0fBDA91F5C, f255; +fma.rn.f32 f260, f99, 0f3F7F2029, f256; +fma.rn.f32 f261, f101, 0fBF7C822D, f257; +fma.rn.f32 f262, f104, 0f3E288B7C, f258; +fma.rn.f32 f263, f535, 0fBF7C822D, f259; +fma.rn.f32 f264, f103, 0f3E288B7C, f260; +fma.rn.f32 f265, f105, 0fBECDAB06, f261; +fma.rn.f32 f266, f108, 0fBF6A701F, f262; +fma.rn.f32 f267, f532, 0fBECDAB06, f263; +fma.rn.f32 f268, f107, 0fBF6A701F, f264; +fma.rn.f32 f269, f109, 0f3F4A051D, f265; +fma.rn.f32 f270, f112, 0fBF1D3D0B, f266; +fma.rn.f32 f271, f530, 0f3F4A051D, f267; +fma.rn.f32 f272, f111, 0fBF1D3D0B, f268; +fma.rn.f32 f273, f77, 0fBDA91F5C, %38; +fma.rn.f32 f277, f81, 0fBF7C822D, f273; +fma.rn.f32 f516, f80, 0f3F7F2029, 0f00000000; +fma.rn.f32 f278, f84, 0fBE288B7C, f516; +fma.rn.f32 f515, f549, 0fBDA91F5C, %39; +fma.rn.f32 f279, f546, 0fBF7C822D, f515; +fma.rn.f32 f514, f79, 0f3F7F2029, 0f00000000; +fma.rn.f32 f280, f83, 0fBE288B7C, f514; +fma.rn.f32 f281, f85, 0f3E7B608C, f277; +fma.rn.f32 
f282, f88, 0fBF782A9E, f278; +fma.rn.f32 f283, f544, 0f3E7B608C, f279; +fma.rn.f32 f284, f87, 0fBF782A9E, f280; +fma.rn.f32 f285, f89, 0f3F722114, f281; +fma.rn.f32 f286, f92, 0f3EA63F02, f282; +fma.rn.f32 f287, f542, 0f3F722114, f283; +fma.rn.f32 f288, f91, 0f3EA63F02, f284; +fma.rn.f32 f289, f93, 0fBECDAB06, f285; +fma.rn.f32 f290, f96, 0f3F6A701F, f286; +fma.rn.f32 f291, f539, 0fBECDAB06, f287; +fma.rn.f32 f292, f95, 0f3F6A701F, f288; +fma.rn.f32 f293, f97, 0fBF612531, f289; +fma.rn.f32 f294, f100, 0fBEF3AF60, f290; +fma.rn.f32 f295, f537, 0fBF612531, f291; +fma.rn.f32 f296, f99, 0fBEF3AF60, f292; +fma.rn.f32 f297, f101, 0f3F0C04CB, f293; +fma.rn.f32 f298, f104, 0fBF56508B, f294; +fma.rn.f32 f299, f535, 0f3F0C04CB, f295; +fma.rn.f32 f300, f103, 0fBF56508B, f296; +fma.rn.f32 f301, f105, 0f3F4A051D, f297; +fma.rn.f32 f302, f108, 0f3F1D3D0B, f298; +fma.rn.f32 f303, f532, 0f3F4A051D, f299; +fma.rn.f32 f304, f107, 0f3F1D3D0B, f300; +fma.rn.f32 f305, f109, 0fBF2D6253, f301; +fma.rn.f32 f306, f112, 0f3F3C5867, f302; +fma.rn.f32 f307, f530, 0fBF2D6253, f303; +fma.rn.f32 f308, f111, 0f3F3C5867, f304; +fma.rn.f32 f309, f77, 0fBECDAB06, %38; +fma.rn.f32 f313, f81, 0fBF2D6253, f309; +fma.rn.f32 f513, f80, 0f3F6A701F, 0f00000000; +fma.rn.f32 f314, f84, 0fBF3C5867, f513; +fma.rn.f32 f512, f549, 0fBECDAB06, %39; +fma.rn.f32 f315, f546, 0fBF2D6253, f512; +fma.rn.f32 f511, f79, 0f3F6A701F, 0f00000000; +fma.rn.f32 f316, f83, 0fBF3C5867, f511; +fma.rn.f32 f317, f85, 0f3F722114, f313; +fma.rn.f32 f318, f88, 0fBEA63F02, f314; +fma.rn.f32 f319, f544, 0f3F722114, f315; +fma.rn.f32 f320, f87, 0fBEA63F02, f316; +fma.rn.f32 f321, f89, 0fBDA91F5C, f317; +fma.rn.f32 f322, f92, 0f3F7F2029, f318; +fma.rn.f32 f323, f542, 0fBDA91F5C, f319; +fma.rn.f32 f324, f91, 0f3F7F2029, f320; +fma.rn.f32 f325, f93, 0fBF612531, f321; +fma.rn.f32 f326, f96, 0fBEF3AF60, f322; +fma.rn.f32 f327, f539, 0fBF612531, f323; +fma.rn.f32 f328, f95, 0fBEF3AF60, f324; +fma.rn.f32 f329, f97, 0f3F4A051D, f325; +fma.rn.f32 
f330, f100, 0fBF1D3D0B, f326; +fma.rn.f32 f331, f537, 0f3F4A051D, f327; +fma.rn.f32 f332, f99, 0fBF1D3D0B, f328; +fma.rn.f32 f333, f101, 0f3E7B608C, f329; +fma.rn.f32 f334, f104, 0f3F782A9E, f330; +fma.rn.f32 f335, f535, 0f3E7B608C, f331; +fma.rn.f32 f336, f103, 0f3F782A9E, f332; +fma.rn.f32 f337, f105, 0fBF7C822D, f333; +fma.rn.f32 f338, f108, 0fBE288B7C, f334; +fma.rn.f32 f339, f532, 0fBF7C822D, f335; +fma.rn.f32 f340, f107, 0fBE288B7C, f336; +fma.rn.f32 f341, f109, 0f3F0C04CB, f337; +fma.rn.f32 f342, f112, 0fBF56508B, f338; +fma.rn.f32 f343, f530, 0f3F0C04CB, f339; +fma.rn.f32 f344, f111, 0fBF56508B, f340; +fma.rn.f32 f345, f77, 0fBF2D6253, %38; +fma.rn.f32 f349, f81, 0fBDA91F5C, f345; +fma.rn.f32 f510, f80, 0f3F3C5867, 0f00000000; +fma.rn.f32 f350, f84, 0fBF7F2029, f510; +fma.rn.f32 f509, f549, 0fBF2D6253, %39; +fma.rn.f32 f351, f546, 0fBDA91F5C, f509; +fma.rn.f32 f508, f79, 0f3F3C5867, 0f00000000; +fma.rn.f32 f352, f83, 0fBF7F2029, f508; +fma.rn.f32 f353, f85, 0f3F4A051D, f349; +fma.rn.f32 f354, f88, 0f3F1D3D0B, f350; +fma.rn.f32 f355, f544, 0f3F4A051D, f351; +fma.rn.f32 f356, f87, 0f3F1D3D0B, f352; +fma.rn.f32 f357, f89, 0fBF7C822D, f353; +fma.rn.f32 f358, f92, 0f3E288B7C, f354; +fma.rn.f32 f359, f542, 0fBF7C822D, f355; +fma.rn.f32 f360, f91, 0f3E288B7C, f356; +fma.rn.f32 f361, f93, 0f3F0C04CB, f357; +fma.rn.f32 f362, f96, 0fBF56508B, f358; +fma.rn.f32 f363, f539, 0f3F0C04CB, f359; +fma.rn.f32 f364, f95, 0fBF56508B, f360; +fma.rn.f32 f365, f97, 0f3E7B608C, f361; +fma.rn.f32 f366, f100, 0f3F782A9E, f362; +fma.rn.f32 f367, f537, 0f3E7B608C, f363; +fma.rn.f32 f368, f99, 0f3F782A9E, f364; +fma.rn.f32 f369, f101, 0fBF612531, f365; +fma.rn.f32 f370, f104, 0fBEF3AF60, f366; +fma.rn.f32 f371, f535, 0fBF612531, f367; +fma.rn.f32 f372, f103, 0fBEF3AF60, f368; +fma.rn.f32 f373, f105, 0f3F722114, f369; +fma.rn.f32 f374, f108, 0fBEA63F02, f370; +fma.rn.f32 f375, f532, 0f3F722114, f371; +fma.rn.f32 f376, f107, 0fBEA63F02, f372; +fma.rn.f32 f377, f109, 0fBECDAB06, f373; 
+fma.rn.f32 f378, f112, 0f3F6A701F, f374; +fma.rn.f32 f379, f530, 0fBECDAB06, f375; +fma.rn.f32 f380, f111, 0f3F6A701F, f376; +fma.rn.f32 f381, f77, 0fBF612531, %38; +fma.rn.f32 f385, f81, 0f3F0C04CB, f381; +fma.rn.f32 f507, f80, 0f3EF3AF60, 0f00000000; +fma.rn.f32 f386, f84, 0fBF56508B, f507; +fma.rn.f32 f506, f549, 0fBF612531, %39; +fma.rn.f32 f387, f546, 0f3F0C04CB, f506; +fma.rn.f32 f505, f79, 0f3EF3AF60, 0f00000000; +fma.rn.f32 f388, f83, 0fBF56508B, f505; +fma.rn.f32 f389, f85, 0fBDA91F5C, f385; +fma.rn.f32 f390, f88, 0f3F7F2029, f386; +fma.rn.f32 f391, f544, 0fBDA91F5C, f387; +fma.rn.f32 f392, f87, 0f3F7F2029, f388; +fma.rn.f32 f393, f89, 0fBECDAB06, f389; +fma.rn.f32 f394, f92, 0fBF6A701F, f390; +fma.rn.f32 f395, f542, 0fBECDAB06, f391; +fma.rn.f32 f396, f91, 0fBF6A701F, f392; +fma.rn.f32 f397, f93, 0f3F4A051D, f393; +fma.rn.f32 f398, f96, 0f3F1D3D0B, f394; +fma.rn.f32 f399, f539, 0f3F4A051D, f395; +fma.rn.f32 f400, f95, 0f3F1D3D0B, f396; +fma.rn.f32 f401, f97, 0fBF7C822D, f397; +fma.rn.f32 f402, f100, 0fBE288B7C, f398; +fma.rn.f32 f403, f537, 0fBF7C822D, f399; +fma.rn.f32 f404, f99, 0fBE288B7C, f400; +fma.rn.f32 f405, f101, 0f3F722114, f401; +fma.rn.f32 f406, f104, 0fBEA63F02, f402; +fma.rn.f32 f407, f535, 0f3F722114, f403; +fma.rn.f32 f408, f103, 0fBEA63F02, f404; +fma.rn.f32 f409, f105, 0fBF2D6253, f405; +fma.rn.f32 f410, f108, 0f3F3C5867, f406; +fma.rn.f32 f411, f532, 0fBF2D6253, f407; +fma.rn.f32 f412, f107, 0f3F3C5867, f408; +fma.rn.f32 f413, f109, 0f3E7B608C, f409; +fma.rn.f32 f414, f112, 0fBF782A9E, f410; +fma.rn.f32 f415, f530, 0f3E7B608C, f411; +fma.rn.f32 f416, f111, 0fBF782A9E, f412; +fma.rn.f32 f417, f77, 0fBF7C822D, %38; +fma.rn.f32 f418, f80, 0f3E288B7C, 0f00000000; +fma.rn.f32 f419, f549, 0fBF7C822D, %39; +fma.rn.f32 f420, f79, 0f3E288B7C, 0f00000000; +fma.rn.f32 f421, f81, 0f3F722114, f417; +fma.rn.f32 f422, f84, 0fBEA63F02, f418; +fma.rn.f32 f423, f546, 0f3F722114, f419; +fma.rn.f32 f424, f83, 0fBEA63F02, f420; +fma.rn.f32 f425, f85, 
0fBF612531, f421; +fma.rn.f32 f426, f88, 0f3EF3AF60, f422; +fma.rn.f32 f427, f544, 0fBF612531, f423; +fma.rn.f32 f428, f87, 0f3EF3AF60, f424; +fma.rn.f32 f429, f89, 0f3F4A051D, f425; +fma.rn.f32 f430, f92, 0fBF1D3D0B, f426; +fma.rn.f32 f431, f542, 0f3F4A051D, f427; +fma.rn.f32 f432, f91, 0fBF1D3D0B, f428; +fma.rn.f32 f433, f93, 0fBF2D6253, f429; +fma.rn.f32 f434, f96, 0f3F3C5867, f430; +fma.rn.f32 f435, f539, 0fBF2D6253, f431; +fma.rn.f32 f436, f95, 0f3F3C5867, f432; +fma.rn.f32 f437, f97, 0f3F0C04CB, f433; +fma.rn.f32 f438, f100, 0fBF56508B, f434; +fma.rn.f32 f439, f537, 0f3F0C04CB, f435; +fma.rn.f32 f440, f99, 0fBF56508B, f436; +fma.rn.f32 f441, f101, 0fBECDAB06, f437; +fma.rn.f32 f442, f104, 0f3F6A701F, f438; +fma.rn.f32 f443, f535, 0fBECDAB06, f439; +fma.rn.f32 f444, f103, 0f3F6A701F, f440; +fma.rn.f32 f445, f105, 0f3E7B608C, f441; +fma.rn.f32 f446, f108, 0fBF782A9E, f442; +fma.rn.f32 f447, f532, 0f3E7B608C, f443; +fma.rn.f32 f448, f107, 0fBF782A9E, f444; +fma.rn.f32 f449, f109, 0fBDA91F5C, f445; +fma.rn.f32 f450, f112, 0f3F7F2029, f446; +fma.rn.f32 f451, f530, 0fBDA91F5C, f447; +fma.rn.f32 f452, f111, 0f3F7F2029, f448; +add.f32 %1, f128, f530; +add.f32 %0, f127, f109; +add.f32 %3, f163, f164; +sub.f32 %2, f161, f162; +add.f32 %5, f199, f200; +sub.f32 %4, f197, f198; +add.f32 %7, f235, f236; +sub.f32 %6, f233, f234; +sub.f32 %8, f269, f270; +add.f32 %9, f271, f272; +sub.f32 %10, f305, f306; +add.f32 %11, f307, f308; +sub.f32 %12, f341, f342; +add.f32 %13, f343, f344; +sub.f32 %14, f377, f378; +add.f32 %15, f379, f380; +add.f32 %17, f415, f416; +sub.f32 %16, f413, f414; +add.f32 %19, f451, f452; +sub.f32 %18, f449, f450; +sub.f32 %21, f451, f452; +add.f32 %20, f449, f450; +sub.f32 %23, f415, f416; +add.f32 %22, f413, f414; +sub.f32 %25, f379, f380; +add.f32 %24, f377, f378; +sub.f32 %27, f343, f344; +add.f32 %26, f341, f342; +sub.f32 %29, f307, f308; +add.f32 %28, f305, f306; +sub.f32 %31, f271, f272; +add.f32 %30, f269, f270; +sub.f32 %33, f235, f236; +add.f32 
%32, f233, f234; +sub.f32 %35, f199, f200; +add.f32 %34, f197, f198; +sub.f32 %37, f163, f164; +add.f32 %36, f161, f162; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[2].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[14].y), "f"(rmem[13].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[11].y), "f"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..6d90faa107050 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_fwd.hpp.inc @@ -0,0 +1,430 @@ +#ifndef CUFFTDX_FFT_19_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_19_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<409, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<550>; +.reg .b64 rd<4>; +add.f64 fd77, %40, %75; +sub.f64 fd79, %40, %75; +add.f64 fd549, %42, %76; +sub.f64 fd80, %42, %76; +add.f64 fd81, %43, %73; +sub.f64 fd83, %43, %73; +add.f64 fd546, %77, %78; +sub.f64 fd84, %77, %78; +add.f64 fd85, %45, %71; +sub.f64 fd87, %45, %71; +add.f64 fd544, %46, %79; +sub.f64 fd88, %46, %79; +add.f64 fd89, %47, %69; +sub.f64 fd91, %47, %69; +add.f64 fd542, %80, %70; +sub.f64 fd92, %80, %70; +add.f64 fd93, %49, %67; +sub.f64 fd95, %49, %67; +add.f64 fd539, %81, %82; +sub.f64 fd96, %81, %82; +add.f64 fd97, %51, %65; +sub.f64 fd99, %51, %65; +add.f64 fd537, %52, %83; +sub.f64 fd100, %52, %83; +add.f64 fd101, %53, %63; +sub.f64 fd103, %53, %63; +add.f64 fd535, %84, %64; +sub.f64 fd104, %84, %64; +add.f64 fd105, %55, %61; +sub.f64 fd107, %55, %61; +add.f64 fd532, %85, %86; +sub.f64 fd108, %85, %86; +add.f64 fd109, %57, %59; +sub.f64 fd111, %57, %59; +add.f64 fd530, %58, %87; +sub.f64 fd112, %58, %87; +add.f64 fd113, %38, fd77; +add.f64 fd115, fd113, fd81; +add.f64 fd529, %39, fd549; +add.f64 fd116, fd529, fd546; +add.f64 fd117, fd115, fd85; +add.f64 fd118, fd116, fd544; +add.f64 fd119, fd117, fd89; +add.f64 fd120, fd118, fd542; +add.f64 fd121, fd119, fd93; +add.f64 fd122, fd120, fd539; +add.f64 fd123, fd121, fd97; +add.f64 fd124, fd122, fd537; +add.f64 fd125, fd123, fd101; +add.f64 fd126, fd124, fd535; +add.f64 fd127, fd125, fd105; +add.f64 fd128, fd126, fd532; +fma.rn.f64 fd129, fd77, 0d3FEE442285231BE1, %38; +fma.rn.f64 fd133, fd81, 0d3FE940A398F9CD23, fd129; +fma.rn.f64 fd528, fd80, 0dBFD4C7E04850CFAA, 0d0000000000000000; +fma.rn.f64 
fd134, fd84, 0dBFE3A7A16B394423, fd528; +fma.rn.f64 fd527, fd549, 0d3FEE442285231BE1, %39; +fma.rn.f64 fd135, fd546, 0d3FE940A398F9CD23, fd527; +fma.rn.f64 fd526, fd79, 0dBFD4C7E04850CFAA, 0d0000000000000000; +fma.rn.f64 fd136, fd83, 0dBFE3A7A16B394423, fd526; +fma.rn.f64 fd137, fd85, 0d3FE180996C77C8CA, fd133; +fma.rn.f64 fd138, fd88, 0dBFEACA115AAE3DE4, fd134; +fma.rn.f64 fd139, fd544, 0d3FE180996C77C8CA, fd135; +fma.rn.f64 fd140, fd87, 0dBFEACA115AAE3DE4, fd136; +fma.rn.f64 fd141, fd89, 0d3FCF6C118574C83E, fd137; +fma.rn.f64 fd142, fd92, 0dBFEF0553B4DE2E18, fd138; +fma.rn.f64 fd143, fd542, 0d3FCF6C118574C83E, fd139; +fma.rn.f64 fd144, fd91, 0dBFEF0553B4DE2E18, fd140; +fma.rn.f64 fd145, fd93, 0dBFB523EB8420F5F5, fd141; +fma.rn.f64 fd146, fd96, 0dBFEFE40529A542AA, fd142; +fma.rn.f64 fd147, fd539, 0dBFB523EB8420F5F5, fd143; +fma.rn.f64 fd148, fd95, 0dBFEFE40529A542AA, fd144; +fma.rn.f64 fd149, fd97, 0dBFD9B560B9F596EA, fd145; +fma.rn.f64 fd150, fd100, 0dBFED4E03DD110B08, fd146; +fma.rn.f64 fd151, fd537, 0dBFD9B560B9F596EA, fd147; +fma.rn.f64 fd152, fd99, 0dBFED4E03DD110B08, fd148; +fma.rn.f64 fd153, fd101, 0dBFE5AC4A670A1CFF, fd149; +fma.rn.f64 fd154, fd104, 0dBFE78B0CDEE73E0F, fd150; +fma.rn.f64 fd155, fd535, 0dBFE5AC4A670A1CFF, fd151; +fma.rn.f64 fd156, fd103, 0dBFE78B0CDEE73E0F, fd152; +fma.rn.f64 fd157, fd105, 0dBFEC24A622E3E9F9, fd153; +fma.rn.f64 fd158, fd108, 0dBFDE75EC0DED7BEE, fd154; +fma.rn.f64 fd159, fd532, 0dBFEC24A622E3E9F9, fd155; +fma.rn.f64 fd160, fd107, 0dBFDE75EC0DED7BEE, fd156; +fma.rn.f64 fd161, fd109, 0dBFEF90459484F2B2, fd157; +fma.rn.f64 fd162, fd112, 0dBFC5116F7F2D58C5, fd158; +fma.rn.f64 fd163, fd530, 0dBFEF90459484F2B2, fd159; +fma.rn.f64 fd164, fd111, 0dBFC5116F7F2D58C5, fd160; +fma.rn.f64 fd165, fd77, 0d3FE940A398F9CD23, %38; +fma.rn.f64 fd169, fd81, 0d3FCF6C118574C83E, fd165; +fma.rn.f64 fd525, fd80, 0dBFE3A7A16B394423, 0d0000000000000000; +fma.rn.f64 fd170, fd84, 0dBFEF0553B4DE2E18, fd525; +fma.rn.f64 fd524, fd549, 0d3FE940A398F9CD23, 
%39; +fma.rn.f64 fd171, fd546, 0d3FCF6C118574C83E, fd524; +fma.rn.f64 fd523, fd79, 0dBFE3A7A16B394423, 0d0000000000000000; +fma.rn.f64 fd172, fd83, 0dBFEF0553B4DE2E18, fd523; +fma.rn.f64 fd173, fd85, 0dBFD9B560B9F596EA, fd169; +fma.rn.f64 fd174, fd88, 0dBFED4E03DD110B08, fd170; +fma.rn.f64 fd175, fd544, 0dBFD9B560B9F596EA, fd171; +fma.rn.f64 fd176, fd87, 0dBFED4E03DD110B08, fd172; +fma.rn.f64 fd177, fd89, 0dBFEC24A622E3E9F9, fd173; +fma.rn.f64 fd178, fd92, 0dBFDE75EC0DED7BEE, fd174; +fma.rn.f64 fd179, fd542, 0dBFEC24A622E3E9F9, fd175; +fma.rn.f64 fd180, fd91, 0dBFDE75EC0DED7BEE, fd176; +fma.rn.f64 fd181, fd93, 0dBFEF90459484F2B2, fd177; +fma.rn.f64 fd182, fd96, 0d3FC5116F7F2D58C5, fd178; +fma.rn.f64 fd183, fd539, 0dBFEF90459484F2B2, fd179; +fma.rn.f64 fd184, fd95, 0d3FC5116F7F2D58C5, fd180; +fma.rn.f64 fd185, fd97, 0dBFE5AC4A670A1CFF, fd181; +fma.rn.f64 fd186, fd100, 0d3FE78B0CDEE73E0F, fd182; +fma.rn.f64 fd187, fd537, 0dBFE5AC4A670A1CFF, fd183; +fma.rn.f64 fd188, fd99, 0d3FE78B0CDEE73E0F, fd184; +fma.rn.f64 fd189, fd101, 0dBFB523EB8420F5F5, fd185; +fma.rn.f64 fd190, fd104, 0d3FEFE40529A542AA, fd186; +fma.rn.f64 fd191, fd535, 0dBFB523EB8420F5F5, fd187; +fma.rn.f64 fd192, fd103, 0d3FEFE40529A542AA, fd188; +fma.rn.f64 fd193, fd105, 0d3FE180996C77C8CA, fd189; +fma.rn.f64 fd194, fd108, 0d3FEACA115AAE3DE4, fd190; +fma.rn.f64 fd195, fd532, 0d3FE180996C77C8CA, fd191; +fma.rn.f64 fd196, fd107, 0d3FEACA115AAE3DE4, fd192; +fma.rn.f64 fd197, fd109, 0d3FEE442285231BE1, fd193; +fma.rn.f64 fd198, fd112, 0d3FD4C7E04850CFAA, fd194; +fma.rn.f64 fd199, fd530, 0d3FEE442285231BE1, fd195; +fma.rn.f64 fd200, fd111, 0d3FD4C7E04850CFAA, fd196; +fma.rn.f64 fd201, fd77, 0d3FE180996C77C8CA, %38; +fma.rn.f64 fd205, fd81, 0dBFD9B560B9F596EA, fd201; +fma.rn.f64 fd522, fd80, 0dBFEACA115AAE3DE4, 0d0000000000000000; +fma.rn.f64 fd206, fd84, 0dBFED4E03DD110B08, fd522; +fma.rn.f64 fd521, fd549, 0d3FE180996C77C8CA, %39; +fma.rn.f64 fd207, fd546, 0dBFD9B560B9F596EA, fd521; +fma.rn.f64 fd520, fd79, 
0dBFEACA115AAE3DE4, 0d0000000000000000; +fma.rn.f64 fd208, fd83, 0dBFED4E03DD110B08, fd520; +fma.rn.f64 fd209, fd85, 0dBFEF90459484F2B2, fd205; +fma.rn.f64 fd210, fd88, 0dBFC5116F7F2D58C5, fd206; +fma.rn.f64 fd211, fd544, 0dBFEF90459484F2B2, fd207; +fma.rn.f64 fd212, fd87, 0dBFC5116F7F2D58C5, fd208; +fma.rn.f64 fd213, fd89, 0dBFE5AC4A670A1CFF, fd209; +fma.rn.f64 fd214, fd92, 0d3FE78B0CDEE73E0F, fd210; +fma.rn.f64 fd215, fd542, 0dBFE5AC4A670A1CFF, fd211; +fma.rn.f64 fd216, fd91, 0d3FE78B0CDEE73E0F, fd212; +fma.rn.f64 fd217, fd93, 0d3FCF6C118574C83E, fd213; +fma.rn.f64 fd218, fd96, 0d3FEF0553B4DE2E18, fd214; +fma.rn.f64 fd219, fd539, 0d3FCF6C118574C83E, fd215; +fma.rn.f64 fd220, fd95, 0d3FEF0553B4DE2E18, fd216; +fma.rn.f64 fd221, fd97, 0d3FEE442285231BE1, fd217; +fma.rn.f64 fd222, fd100, 0d3FD4C7E04850CFAA, fd218; +fma.rn.f64 fd223, fd537, 0d3FEE442285231BE1, fd219; +fma.rn.f64 fd224, fd99, 0d3FD4C7E04850CFAA, fd220; +fma.rn.f64 fd225, fd101, 0d3FE940A398F9CD23, fd221; +fma.rn.f64 fd226, fd104, 0dBFE3A7A16B394423, fd222; +fma.rn.f64 fd227, fd535, 0d3FE940A398F9CD23, fd223; +fma.rn.f64 fd228, fd103, 0dBFE3A7A16B394423, fd224; +fma.rn.f64 fd229, fd105, 0dBFB523EB8420F5F5, fd225; +fma.rn.f64 fd230, fd108, 0dBFEFE40529A542AA, fd226; +fma.rn.f64 fd231, fd532, 0dBFB523EB8420F5F5, fd227; +fma.rn.f64 fd232, fd107, 0dBFEFE40529A542AA, fd228; +fma.rn.f64 fd233, fd109, 0dBFEC24A622E3E9F9, fd229; +fma.rn.f64 fd234, fd112, 0dBFDE75EC0DED7BEE, fd230; +fma.rn.f64 fd235, fd530, 0dBFEC24A622E3E9F9, fd231; +fma.rn.f64 fd236, fd111, 0dBFDE75EC0DED7BEE, fd232; +fma.rn.f64 fd237, fd77, 0d3FCF6C118574C83E, %38; +fma.rn.f64 fd241, fd81, 0dBFEC24A622E3E9F9, fd237; +fma.rn.f64 fd519, fd80, 0dBFEF0553B4DE2E18, 0d0000000000000000; +fma.rn.f64 fd242, fd84, 0dBFDE75EC0DED7BEE, fd519; +fma.rn.f64 fd518, fd549, 0d3FCF6C118574C83E, %39; +fma.rn.f64 fd243, fd546, 0dBFEC24A622E3E9F9, fd518; +fma.rn.f64 fd517, fd79, 0dBFEF0553B4DE2E18, 0d0000000000000000; +fma.rn.f64 fd244, fd83, 0dBFDE75EC0DED7BEE, 
fd517; +fma.rn.f64 fd245, fd85, 0dBFE5AC4A670A1CFF, fd241; +fma.rn.f64 fd246, fd88, 0d3FE78B0CDEE73E0F, fd242; +fma.rn.f64 fd247, fd544, 0dBFE5AC4A670A1CFF, fd243; +fma.rn.f64 fd248, fd87, 0d3FE78B0CDEE73E0F, fd244; +fma.rn.f64 fd249, fd89, 0d3FE180996C77C8CA, fd245; +fma.rn.f64 fd250, fd92, 0d3FEACA115AAE3DE4, fd246; +fma.rn.f64 fd251, fd542, 0d3FE180996C77C8CA, fd247; +fma.rn.f64 fd252, fd91, 0d3FEACA115AAE3DE4, fd248; +fma.rn.f64 fd253, fd93, 0d3FEE442285231BE1, fd249; +fma.rn.f64 fd254, fd96, 0dBFD4C7E04850CFAA, fd250; +fma.rn.f64 fd255, fd539, 0d3FEE442285231BE1, fd251; +fma.rn.f64 fd256, fd95, 0dBFD4C7E04850CFAA, fd252; +fma.rn.f64 fd257, fd97, 0dBFB523EB8420F5F5, fd253; +fma.rn.f64 fd258, fd100, 0dBFEFE40529A542AA, fd254; +fma.rn.f64 fd259, fd537, 0dBFB523EB8420F5F5, fd255; +fma.rn.f64 fd260, fd99, 0dBFEFE40529A542AA, fd256; +fma.rn.f64 fd261, fd101, 0dBFEF90459484F2B2, fd257; +fma.rn.f64 fd262, fd104, 0dBFC5116F7F2D58C5, fd258; +fma.rn.f64 fd263, fd535, 0dBFEF90459484F2B2, fd259; +fma.rn.f64 fd264, fd103, 0dBFC5116F7F2D58C5, fd260; +fma.rn.f64 fd265, fd105, 0dBFD9B560B9F596EA, fd261; +fma.rn.f64 fd266, fd108, 0d3FED4E03DD110B08, fd262; +fma.rn.f64 fd267, fd532, 0dBFD9B560B9F596EA, fd263; +fma.rn.f64 fd268, fd107, 0d3FED4E03DD110B08, fd264; +fma.rn.f64 fd269, fd109, 0d3FE940A398F9CD23, fd265; +fma.rn.f64 fd270, fd112, 0d3FE3A7A16B394423, fd266; +fma.rn.f64 fd271, fd530, 0d3FE940A398F9CD23, fd267; +fma.rn.f64 fd272, fd111, 0d3FE3A7A16B394423, fd268; +fma.rn.f64 fd273, fd77, 0dBFB523EB8420F5F5, %38; +fma.rn.f64 fd277, fd81, 0dBFEF90459484F2B2, fd273; +fma.rn.f64 fd516, fd80, 0dBFEFE40529A542AA, 0d0000000000000000; +fma.rn.f64 fd278, fd84, 0d3FC5116F7F2D58C5, fd516; +fma.rn.f64 fd515, fd549, 0dBFB523EB8420F5F5, %39; +fma.rn.f64 fd279, fd546, 0dBFEF90459484F2B2, fd515; +fma.rn.f64 fd514, fd79, 0dBFEFE40529A542AA, 0d0000000000000000; +fma.rn.f64 fd280, fd83, 0d3FC5116F7F2D58C5, fd514; +fma.rn.f64 fd281, fd85, 0d3FCF6C118574C83E, fd277; +fma.rn.f64 fd282, fd88, 
0d3FEF0553B4DE2E18, fd278; +fma.rn.f64 fd283, fd544, 0d3FCF6C118574C83E, fd279; +fma.rn.f64 fd284, fd87, 0d3FEF0553B4DE2E18, fd280; +fma.rn.f64 fd285, fd89, 0d3FEE442285231BE1, fd281; +fma.rn.f64 fd286, fd92, 0dBFD4C7E04850CFAA, fd282; +fma.rn.f64 fd287, fd542, 0d3FEE442285231BE1, fd283; +fma.rn.f64 fd288, fd91, 0dBFD4C7E04850CFAA, fd284; +fma.rn.f64 fd289, fd93, 0dBFD9B560B9F596EA, fd285; +fma.rn.f64 fd290, fd96, 0dBFED4E03DD110B08, fd286; +fma.rn.f64 fd291, fd539, 0dBFD9B560B9F596EA, fd287; +fma.rn.f64 fd292, fd95, 0dBFED4E03DD110B08, fd288; +fma.rn.f64 fd293, fd97, 0dBFEC24A622E3E9F9, fd289; +fma.rn.f64 fd294, fd100, 0d3FDE75EC0DED7BEE, fd290; +fma.rn.f64 fd295, fd537, 0dBFEC24A622E3E9F9, fd291; +fma.rn.f64 fd296, fd99, 0d3FDE75EC0DED7BEE, fd292; +fma.rn.f64 fd297, fd101, 0d3FE180996C77C8CA, fd293; +fma.rn.f64 fd298, fd104, 0d3FEACA115AAE3DE4, fd294; +fma.rn.f64 fd299, fd535, 0d3FE180996C77C8CA, fd295; +fma.rn.f64 fd300, fd103, 0d3FEACA115AAE3DE4, fd296; +fma.rn.f64 fd301, fd105, 0d3FE940A398F9CD23, fd297; +fma.rn.f64 fd302, fd108, 0dBFE3A7A16B394423, fd298; +fma.rn.f64 fd303, fd532, 0d3FE940A398F9CD23, fd299; +fma.rn.f64 fd304, fd107, 0dBFE3A7A16B394423, fd300; +fma.rn.f64 fd305, fd109, 0dBFE5AC4A670A1CFF, fd301; +fma.rn.f64 fd306, fd112, 0dBFE78B0CDEE73E0F, fd302; +fma.rn.f64 fd307, fd530, 0dBFE5AC4A670A1CFF, fd303; +fma.rn.f64 fd308, fd111, 0dBFE78B0CDEE73E0F, fd304; +fma.rn.f64 fd309, fd77, 0dBFD9B560B9F596EA, %38; +fma.rn.f64 fd313, fd81, 0dBFE5AC4A670A1CFF, fd309; +fma.rn.f64 fd513, fd80, 0dBFED4E03DD110B08, 0d0000000000000000; +fma.rn.f64 fd314, fd84, 0d3FE78B0CDEE73E0F, fd513; +fma.rn.f64 fd512, fd549, 0dBFD9B560B9F596EA, %39; +fma.rn.f64 fd315, fd546, 0dBFE5AC4A670A1CFF, fd512; +fma.rn.f64 fd511, fd79, 0dBFED4E03DD110B08, 0d0000000000000000; +fma.rn.f64 fd316, fd83, 0d3FE78B0CDEE73E0F, fd511; +fma.rn.f64 fd317, fd85, 0d3FEE442285231BE1, fd313; +fma.rn.f64 fd318, fd88, 0d3FD4C7E04850CFAA, fd314; +fma.rn.f64 fd319, fd544, 0d3FEE442285231BE1, fd315; 
+fma.rn.f64 fd320, fd87, 0d3FD4C7E04850CFAA, fd316; +fma.rn.f64 fd321, fd89, 0dBFB523EB8420F5F5, fd317; +fma.rn.f64 fd322, fd92, 0dBFEFE40529A542AA, fd318; +fma.rn.f64 fd323, fd542, 0dBFB523EB8420F5F5, fd319; +fma.rn.f64 fd324, fd91, 0dBFEFE40529A542AA, fd320; +fma.rn.f64 fd325, fd93, 0dBFEC24A622E3E9F9, fd321; +fma.rn.f64 fd326, fd96, 0d3FDE75EC0DED7BEE, fd322; +fma.rn.f64 fd327, fd539, 0dBFEC24A622E3E9F9, fd323; +fma.rn.f64 fd328, fd95, 0d3FDE75EC0DED7BEE, fd324; +fma.rn.f64 fd329, fd97, 0d3FE940A398F9CD23, fd325; +fma.rn.f64 fd330, fd100, 0d3FE3A7A16B394423, fd326; +fma.rn.f64 fd331, fd537, 0d3FE940A398F9CD23, fd327; +fma.rn.f64 fd332, fd99, 0d3FE3A7A16B394423, fd328; +fma.rn.f64 fd333, fd101, 0d3FCF6C118574C83E, fd329; +fma.rn.f64 fd334, fd104, 0dBFEF0553B4DE2E18, fd330; +fma.rn.f64 fd335, fd535, 0d3FCF6C118574C83E, fd331; +fma.rn.f64 fd336, fd103, 0dBFEF0553B4DE2E18, fd332; +fma.rn.f64 fd337, fd105, 0dBFEF90459484F2B2, fd333; +fma.rn.f64 fd338, fd108, 0d3FC5116F7F2D58C5, fd334; +fma.rn.f64 fd339, fd532, 0dBFEF90459484F2B2, fd335; +fma.rn.f64 fd340, fd107, 0d3FC5116F7F2D58C5, fd336; +fma.rn.f64 fd341, fd109, 0d3FE180996C77C8CA, fd337; +fma.rn.f64 fd342, fd112, 0d3FEACA115AAE3DE4, fd338; +fma.rn.f64 fd343, fd530, 0d3FE180996C77C8CA, fd339; +fma.rn.f64 fd344, fd111, 0d3FEACA115AAE3DE4, fd340; +fma.rn.f64 fd345, fd77, 0dBFE5AC4A670A1CFF, %38; +fma.rn.f64 fd349, fd81, 0dBFB523EB8420F5F5, fd345; +fma.rn.f64 fd510, fd80, 0dBFE78B0CDEE73E0F, 0d0000000000000000; +fma.rn.f64 fd350, fd84, 0d3FEFE40529A542AA, fd510; +fma.rn.f64 fd509, fd549, 0dBFE5AC4A670A1CFF, %39; +fma.rn.f64 fd351, fd546, 0dBFB523EB8420F5F5, fd509; +fma.rn.f64 fd508, fd79, 0dBFE78B0CDEE73E0F, 0d0000000000000000; +fma.rn.f64 fd352, fd83, 0d3FEFE40529A542AA, fd508; +fma.rn.f64 fd353, fd85, 0d3FE940A398F9CD23, fd349; +fma.rn.f64 fd354, fd88, 0dBFE3A7A16B394423, fd350; +fma.rn.f64 fd355, fd544, 0d3FE940A398F9CD23, fd351; +fma.rn.f64 fd356, fd87, 0dBFE3A7A16B394423, fd352; +fma.rn.f64 fd357, fd89, 
0dBFEF90459484F2B2, fd353; +fma.rn.f64 fd358, fd92, 0dBFC5116F7F2D58C5, fd354; +fma.rn.f64 fd359, fd542, 0dBFEF90459484F2B2, fd355; +fma.rn.f64 fd360, fd91, 0dBFC5116F7F2D58C5, fd356; +fma.rn.f64 fd361, fd93, 0d3FE180996C77C8CA, fd357; +fma.rn.f64 fd362, fd96, 0d3FEACA115AAE3DE4, fd358; +fma.rn.f64 fd363, fd539, 0d3FE180996C77C8CA, fd359; +fma.rn.f64 fd364, fd95, 0d3FEACA115AAE3DE4, fd360; +fma.rn.f64 fd365, fd97, 0d3FCF6C118574C83E, fd361; +fma.rn.f64 fd366, fd100, 0dBFEF0553B4DE2E18, fd362; +fma.rn.f64 fd367, fd537, 0d3FCF6C118574C83E, fd363; +fma.rn.f64 fd368, fd99, 0dBFEF0553B4DE2E18, fd364; +fma.rn.f64 fd369, fd101, 0dBFEC24A622E3E9F9, fd365; +fma.rn.f64 fd370, fd104, 0d3FDE75EC0DED7BEE, fd366; +fma.rn.f64 fd371, fd535, 0dBFEC24A622E3E9F9, fd367; +fma.rn.f64 fd372, fd103, 0d3FDE75EC0DED7BEE, fd368; +fma.rn.f64 fd373, fd105, 0d3FEE442285231BE1, fd369; +fma.rn.f64 fd374, fd108, 0d3FD4C7E04850CFAA, fd370; +fma.rn.f64 fd375, fd532, 0d3FEE442285231BE1, fd371; +fma.rn.f64 fd376, fd107, 0d3FD4C7E04850CFAA, fd372; +fma.rn.f64 fd377, fd109, 0dBFD9B560B9F596EA, fd373; +fma.rn.f64 fd378, fd112, 0dBFED4E03DD110B08, fd374; +fma.rn.f64 fd379, fd530, 0dBFD9B560B9F596EA, fd375; +fma.rn.f64 fd380, fd111, 0dBFED4E03DD110B08, fd376; +fma.rn.f64 fd381, fd77, 0dBFEC24A622E3E9F9, %38; +fma.rn.f64 fd385, fd81, 0d3FE180996C77C8CA, fd381; +fma.rn.f64 fd507, fd80, 0dBFDE75EC0DED7BEE, 0d0000000000000000; +fma.rn.f64 fd386, fd84, 0d3FEACA115AAE3DE4, fd507; +fma.rn.f64 fd506, fd549, 0dBFEC24A622E3E9F9, %39; +fma.rn.f64 fd387, fd546, 0d3FE180996C77C8CA, fd506; +fma.rn.f64 fd505, fd79, 0dBFDE75EC0DED7BEE, 0d0000000000000000; +fma.rn.f64 fd388, fd83, 0d3FEACA115AAE3DE4, fd505; +fma.rn.f64 fd389, fd85, 0dBFB523EB8420F5F5, fd385; +fma.rn.f64 fd390, fd88, 0dBFEFE40529A542AA, fd386; +fma.rn.f64 fd391, fd544, 0dBFB523EB8420F5F5, fd387; +fma.rn.f64 fd392, fd87, 0dBFEFE40529A542AA, fd388; +fma.rn.f64 fd393, fd89, 0dBFD9B560B9F596EA, fd389; +fma.rn.f64 fd394, fd92, 0d3FED4E03DD110B08, fd390; 
+fma.rn.f64 fd395, fd542, 0dBFD9B560B9F596EA, fd391; +fma.rn.f64 fd396, fd91, 0d3FED4E03DD110B08, fd392; +fma.rn.f64 fd397, fd93, 0d3FE940A398F9CD23, fd393; +fma.rn.f64 fd398, fd96, 0dBFE3A7A16B394423, fd394; +fma.rn.f64 fd399, fd539, 0d3FE940A398F9CD23, fd395; +fma.rn.f64 fd400, fd95, 0dBFE3A7A16B394423, fd396; +fma.rn.f64 fd401, fd97, 0dBFEF90459484F2B2, fd397; +fma.rn.f64 fd402, fd100, 0d3FC5116F7F2D58C5, fd398; +fma.rn.f64 fd403, fd537, 0dBFEF90459484F2B2, fd399; +fma.rn.f64 fd404, fd99, 0d3FC5116F7F2D58C5, fd400; +fma.rn.f64 fd405, fd101, 0d3FEE442285231BE1, fd401; +fma.rn.f64 fd406, fd104, 0d3FD4C7E04850CFAA, fd402; +fma.rn.f64 fd407, fd535, 0d3FEE442285231BE1, fd403; +fma.rn.f64 fd408, fd103, 0d3FD4C7E04850CFAA, fd404; +fma.rn.f64 fd409, fd105, 0dBFE5AC4A670A1CFF, fd405; +fma.rn.f64 fd410, fd108, 0dBFE78B0CDEE73E0F, fd406; +fma.rn.f64 fd411, fd532, 0dBFE5AC4A670A1CFF, fd407; +fma.rn.f64 fd412, fd107, 0dBFE78B0CDEE73E0F, fd408; +fma.rn.f64 fd413, fd109, 0d3FCF6C118574C83E, fd409; +fma.rn.f64 fd414, fd112, 0d3FEF0553B4DE2E18, fd410; +fma.rn.f64 fd415, fd530, 0d3FCF6C118574C83E, fd411; +fma.rn.f64 fd416, fd111, 0d3FEF0553B4DE2E18, fd412; +fma.rn.f64 fd417, fd77, 0dBFEF90459484F2B2, %38; +fma.rn.f64 fd418, fd80, 0dBFC5116F7F2D58C5, 0d0000000000000000; +fma.rn.f64 fd419, fd549, 0dBFEF90459484F2B2, %39; +fma.rn.f64 fd420, fd79, 0dBFC5116F7F2D58C5, 0d0000000000000000; +fma.rn.f64 fd421, fd81, 0d3FEE442285231BE1, fd417; +fma.rn.f64 fd422, fd84, 0d3FD4C7E04850CFAA, fd418; +fma.rn.f64 fd423, fd546, 0d3FEE442285231BE1, fd419; +fma.rn.f64 fd424, fd83, 0d3FD4C7E04850CFAA, fd420; +fma.rn.f64 fd425, fd85, 0dBFEC24A622E3E9F9, fd421; +fma.rn.f64 fd426, fd88, 0dBFDE75EC0DED7BEE, fd422; +fma.rn.f64 fd427, fd544, 0dBFEC24A622E3E9F9, fd423; +fma.rn.f64 fd428, fd87, 0dBFDE75EC0DED7BEE, fd424; +fma.rn.f64 fd429, fd89, 0d3FE940A398F9CD23, fd425; +fma.rn.f64 fd430, fd92, 0d3FE3A7A16B394423, fd426; +fma.rn.f64 fd431, fd542, 0d3FE940A398F9CD23, fd427; +fma.rn.f64 fd432, fd91, 
0d3FE3A7A16B394423, fd428; +fma.rn.f64 fd433, fd93, 0dBFE5AC4A670A1CFF, fd429; +fma.rn.f64 fd434, fd96, 0dBFE78B0CDEE73E0F, fd430; +fma.rn.f64 fd435, fd539, 0dBFE5AC4A670A1CFF, fd431; +fma.rn.f64 fd436, fd95, 0dBFE78B0CDEE73E0F, fd432; +fma.rn.f64 fd437, fd97, 0d3FE180996C77C8CA, fd433; +fma.rn.f64 fd438, fd100, 0d3FEACA115AAE3DE4, fd434; +fma.rn.f64 fd439, fd537, 0d3FE180996C77C8CA, fd435; +fma.rn.f64 fd440, fd99, 0d3FEACA115AAE3DE4, fd436; +fma.rn.f64 fd441, fd101, 0dBFD9B560B9F596EA, fd437; +fma.rn.f64 fd442, fd104, 0dBFED4E03DD110B08, fd438; +fma.rn.f64 fd443, fd535, 0dBFD9B560B9F596EA, fd439; +fma.rn.f64 fd444, fd103, 0dBFED4E03DD110B08, fd440; +fma.rn.f64 fd445, fd105, 0d3FCF6C118574C83E, fd441; +fma.rn.f64 fd446, fd108, 0d3FEF0553B4DE2E18, fd442; +fma.rn.f64 fd447, fd532, 0d3FCF6C118574C83E, fd443; +fma.rn.f64 fd448, fd107, 0d3FEF0553B4DE2E18, fd444; +fma.rn.f64 fd449, fd109, 0dBFB523EB8420F5F5, fd445; +fma.rn.f64 fd450, fd112, 0dBFEFE40529A542AA, fd446; +fma.rn.f64 fd451, fd530, 0dBFB523EB8420F5F5, fd447; +fma.rn.f64 fd452, fd111, 0dBFEFE40529A542AA, fd448; +add.f64 %1, fd128, fd530; +add.f64 %0, fd127, fd109; +add.f64 %3, fd163, fd164; +sub.f64 %2, fd161, fd162; +add.f64 %5, fd199, fd200; +sub.f64 %4, fd197, fd198; +add.f64 %7, fd235, fd236; +sub.f64 %6, fd233, fd234; +sub.f64 %8, fd269, fd270; +add.f64 %9, fd271, fd272; +sub.f64 %10, fd305, fd306; +add.f64 %11, fd307, fd308; +sub.f64 %12, fd341, fd342; +add.f64 %13, fd343, fd344; +sub.f64 %14, fd377, fd378; +add.f64 %15, fd379, fd380; +add.f64 %17, fd415, fd416; +sub.f64 %16, fd413, fd414; +add.f64 %19, fd451, fd452; +sub.f64 %18, fd449, fd450; +sub.f64 %21, fd451, fd452; +add.f64 %20, fd449, fd450; +sub.f64 %23, fd415, fd416; +add.f64 %22, fd413, fd414; +sub.f64 %25, fd379, fd380; +add.f64 %24, fd377, fd378; +sub.f64 %27, fd343, fd344; +add.f64 %26, fd341, fd342; +sub.f64 %29, fd307, fd308; +add.f64 %28, fd305, fd306; +sub.f64 %31, fd271, fd272; +add.f64 %30, fd269, fd270; +sub.f64 %33, fd235, fd236; 
+add.f64 %32, fd233, fd234; +sub.f64 %35, fd199, fd200; +add.f64 %34, fd197, fd198; +sub.f64 %37, fd163, fd164; +add.f64 %36, fd161, fd162; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[2].y), "d"(rmem[17].y), "d"(rmem[16].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[14].y), "d"(rmem[13].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[11].y), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9479643f2b93a --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_19_fp64_inv.hpp.inc @@ -0,0 +1,430 @@ +#ifndef CUFFTDX_FFT_19_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_19_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<580, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<550>; +.reg .b64 rd<4>; +add.f64 fd77, %40, %75; +sub.f64 fd79, %40, %75; +add.f64 fd549, %42, %76; +sub.f64 fd80, %42, %76; +add.f64 fd81, %43, %73; +sub.f64 fd83, %43, %73; +add.f64 fd546, %77, %78; +sub.f64 fd84, %77, %78; +add.f64 fd85, %45, %71; +sub.f64 fd87, %45, %71; +add.f64 fd544, %46, %79; +sub.f64 fd88, %46, %79; +add.f64 fd89, %47, %69; +sub.f64 fd91, %47, %69; +add.f64 fd542, %80, %70; +sub.f64 fd92, %80, %70; +add.f64 fd93, %49, %67; +sub.f64 fd95, %49, %67; +add.f64 fd539, %81, %82; +sub.f64 fd96, %81, %82; +add.f64 fd97, %51, %65; +sub.f64 fd99, %51, %65; +add.f64 fd537, %52, %83; +sub.f64 fd100, %52, %83; +add.f64 fd101, %53, %63; +sub.f64 fd103, %53, %63; +add.f64 fd535, %84, %64; +sub.f64 fd104, %84, %64; +add.f64 fd105, %55, %61; +sub.f64 fd107, %55, %61; +add.f64 fd532, %85, %86; +sub.f64 fd108, %85, %86; +add.f64 fd109, %57, %59; +sub.f64 fd111, %57, %59; +add.f64 fd530, %58, %87; +sub.f64 fd112, %58, %87; +add.f64 fd113, %38, fd77; +add.f64 fd115, fd113, fd81; +add.f64 fd529, %39, fd549; +add.f64 fd116, fd529, fd546; +add.f64 fd117, fd115, fd85; +add.f64 fd118, fd116, fd544; +add.f64 fd119, fd117, fd89; +add.f64 fd120, fd118, fd542; +add.f64 fd121, fd119, fd93; +add.f64 fd122, fd120, fd539; +add.f64 fd123, fd121, fd97; +add.f64 fd124, fd122, fd537; +add.f64 fd125, fd123, fd101; +add.f64 fd126, fd124, fd535; +add.f64 fd127, fd125, fd105; +add.f64 fd128, fd126, fd532; +fma.rn.f64 fd129, fd77, 0d3FEE442285231BE1, %38; +fma.rn.f64 fd133, fd81, 0d3FE940A398F9CD23, fd129; +fma.rn.f64 fd528, fd80, 0d3FD4C7E04850CFAA, 0d0000000000000000; +fma.rn.f64 
fd134, fd84, 0d3FE3A7A16B394423, fd528; +fma.rn.f64 fd527, fd549, 0d3FEE442285231BE1, %39; +fma.rn.f64 fd135, fd546, 0d3FE940A398F9CD23, fd527; +fma.rn.f64 fd526, fd79, 0d3FD4C7E04850CFAA, 0d0000000000000000; +fma.rn.f64 fd136, fd83, 0d3FE3A7A16B394423, fd526; +fma.rn.f64 fd137, fd85, 0d3FE180996C77C8CA, fd133; +fma.rn.f64 fd138, fd88, 0d3FEACA115AAE3DE4, fd134; +fma.rn.f64 fd139, fd544, 0d3FE180996C77C8CA, fd135; +fma.rn.f64 fd140, fd87, 0d3FEACA115AAE3DE4, fd136; +fma.rn.f64 fd141, fd89, 0d3FCF6C118574C83E, fd137; +fma.rn.f64 fd142, fd92, 0d3FEF0553B4DE2E18, fd138; +fma.rn.f64 fd143, fd542, 0d3FCF6C118574C83E, fd139; +fma.rn.f64 fd144, fd91, 0d3FEF0553B4DE2E18, fd140; +fma.rn.f64 fd145, fd93, 0dBFB523EB8420F5F5, fd141; +fma.rn.f64 fd146, fd96, 0d3FEFE40529A542AA, fd142; +fma.rn.f64 fd147, fd539, 0dBFB523EB8420F5F5, fd143; +fma.rn.f64 fd148, fd95, 0d3FEFE40529A542AA, fd144; +fma.rn.f64 fd149, fd97, 0dBFD9B560B9F596EA, fd145; +fma.rn.f64 fd150, fd100, 0d3FED4E03DD110B08, fd146; +fma.rn.f64 fd151, fd537, 0dBFD9B560B9F596EA, fd147; +fma.rn.f64 fd152, fd99, 0d3FED4E03DD110B08, fd148; +fma.rn.f64 fd153, fd101, 0dBFE5AC4A670A1CFF, fd149; +fma.rn.f64 fd154, fd104, 0d3FE78B0CDEE73E0F, fd150; +fma.rn.f64 fd155, fd535, 0dBFE5AC4A670A1CFF, fd151; +fma.rn.f64 fd156, fd103, 0d3FE78B0CDEE73E0F, fd152; +fma.rn.f64 fd157, fd105, 0dBFEC24A622E3E9F9, fd153; +fma.rn.f64 fd158, fd108, 0d3FDE75EC0DED7BEE, fd154; +fma.rn.f64 fd159, fd532, 0dBFEC24A622E3E9F9, fd155; +fma.rn.f64 fd160, fd107, 0d3FDE75EC0DED7BEE, fd156; +fma.rn.f64 fd161, fd109, 0dBFEF90459484F2B2, fd157; +fma.rn.f64 fd162, fd112, 0d3FC5116F7F2D58C5, fd158; +fma.rn.f64 fd163, fd530, 0dBFEF90459484F2B2, fd159; +fma.rn.f64 fd164, fd111, 0d3FC5116F7F2D58C5, fd160; +fma.rn.f64 fd165, fd77, 0d3FE940A398F9CD23, %38; +fma.rn.f64 fd169, fd81, 0d3FCF6C118574C83E, fd165; +fma.rn.f64 fd525, fd80, 0d3FE3A7A16B394423, 0d0000000000000000; +fma.rn.f64 fd170, fd84, 0d3FEF0553B4DE2E18, fd525; +fma.rn.f64 fd524, fd549, 0d3FE940A398F9CD23, 
%39; +fma.rn.f64 fd171, fd546, 0d3FCF6C118574C83E, fd524; +fma.rn.f64 fd523, fd79, 0d3FE3A7A16B394423, 0d0000000000000000; +fma.rn.f64 fd172, fd83, 0d3FEF0553B4DE2E18, fd523; +fma.rn.f64 fd173, fd85, 0dBFD9B560B9F596EA, fd169; +fma.rn.f64 fd174, fd88, 0d3FED4E03DD110B08, fd170; +fma.rn.f64 fd175, fd544, 0dBFD9B560B9F596EA, fd171; +fma.rn.f64 fd176, fd87, 0d3FED4E03DD110B08, fd172; +fma.rn.f64 fd177, fd89, 0dBFEC24A622E3E9F9, fd173; +fma.rn.f64 fd178, fd92, 0d3FDE75EC0DED7BEE, fd174; +fma.rn.f64 fd179, fd542, 0dBFEC24A622E3E9F9, fd175; +fma.rn.f64 fd180, fd91, 0d3FDE75EC0DED7BEE, fd176; +fma.rn.f64 fd181, fd93, 0dBFEF90459484F2B2, fd177; +fma.rn.f64 fd182, fd96, 0dBFC5116F7F2D58C5, fd178; +fma.rn.f64 fd183, fd539, 0dBFEF90459484F2B2, fd179; +fma.rn.f64 fd184, fd95, 0dBFC5116F7F2D58C5, fd180; +fma.rn.f64 fd185, fd97, 0dBFE5AC4A670A1CFF, fd181; +fma.rn.f64 fd186, fd100, 0dBFE78B0CDEE73E0F, fd182; +fma.rn.f64 fd187, fd537, 0dBFE5AC4A670A1CFF, fd183; +fma.rn.f64 fd188, fd99, 0dBFE78B0CDEE73E0F, fd184; +fma.rn.f64 fd189, fd101, 0dBFB523EB8420F5F5, fd185; +fma.rn.f64 fd190, fd104, 0dBFEFE40529A542AA, fd186; +fma.rn.f64 fd191, fd535, 0dBFB523EB8420F5F5, fd187; +fma.rn.f64 fd192, fd103, 0dBFEFE40529A542AA, fd188; +fma.rn.f64 fd193, fd105, 0d3FE180996C77C8CA, fd189; +fma.rn.f64 fd194, fd108, 0dBFEACA115AAE3DE4, fd190; +fma.rn.f64 fd195, fd532, 0d3FE180996C77C8CA, fd191; +fma.rn.f64 fd196, fd107, 0dBFEACA115AAE3DE4, fd192; +fma.rn.f64 fd197, fd109, 0d3FEE442285231BE1, fd193; +fma.rn.f64 fd198, fd112, 0dBFD4C7E04850CFAA, fd194; +fma.rn.f64 fd199, fd530, 0d3FEE442285231BE1, fd195; +fma.rn.f64 fd200, fd111, 0dBFD4C7E04850CFAA, fd196; +fma.rn.f64 fd201, fd77, 0d3FE180996C77C8CA, %38; +fma.rn.f64 fd205, fd81, 0dBFD9B560B9F596EA, fd201; +fma.rn.f64 fd522, fd80, 0d3FEACA115AAE3DE4, 0d0000000000000000; +fma.rn.f64 fd206, fd84, 0d3FED4E03DD110B08, fd522; +fma.rn.f64 fd521, fd549, 0d3FE180996C77C8CA, %39; +fma.rn.f64 fd207, fd546, 0dBFD9B560B9F596EA, fd521; +fma.rn.f64 fd520, fd79, 
0d3FEACA115AAE3DE4, 0d0000000000000000; +fma.rn.f64 fd208, fd83, 0d3FED4E03DD110B08, fd520; +fma.rn.f64 fd209, fd85, 0dBFEF90459484F2B2, fd205; +fma.rn.f64 fd210, fd88, 0d3FC5116F7F2D58C5, fd206; +fma.rn.f64 fd211, fd544, 0dBFEF90459484F2B2, fd207; +fma.rn.f64 fd212, fd87, 0d3FC5116F7F2D58C5, fd208; +fma.rn.f64 fd213, fd89, 0dBFE5AC4A670A1CFF, fd209; +fma.rn.f64 fd214, fd92, 0dBFE78B0CDEE73E0F, fd210; +fma.rn.f64 fd215, fd542, 0dBFE5AC4A670A1CFF, fd211; +fma.rn.f64 fd216, fd91, 0dBFE78B0CDEE73E0F, fd212; +fma.rn.f64 fd217, fd93, 0d3FCF6C118574C83E, fd213; +fma.rn.f64 fd218, fd96, 0dBFEF0553B4DE2E18, fd214; +fma.rn.f64 fd219, fd539, 0d3FCF6C118574C83E, fd215; +fma.rn.f64 fd220, fd95, 0dBFEF0553B4DE2E18, fd216; +fma.rn.f64 fd221, fd97, 0d3FEE442285231BE1, fd217; +fma.rn.f64 fd222, fd100, 0dBFD4C7E04850CFAA, fd218; +fma.rn.f64 fd223, fd537, 0d3FEE442285231BE1, fd219; +fma.rn.f64 fd224, fd99, 0dBFD4C7E04850CFAA, fd220; +fma.rn.f64 fd225, fd101, 0d3FE940A398F9CD23, fd221; +fma.rn.f64 fd226, fd104, 0d3FE3A7A16B394423, fd222; +fma.rn.f64 fd227, fd535, 0d3FE940A398F9CD23, fd223; +fma.rn.f64 fd228, fd103, 0d3FE3A7A16B394423, fd224; +fma.rn.f64 fd229, fd105, 0dBFB523EB8420F5F5, fd225; +fma.rn.f64 fd230, fd108, 0d3FEFE40529A542AA, fd226; +fma.rn.f64 fd231, fd532, 0dBFB523EB8420F5F5, fd227; +fma.rn.f64 fd232, fd107, 0d3FEFE40529A542AA, fd228; +fma.rn.f64 fd233, fd109, 0dBFEC24A622E3E9F9, fd229; +fma.rn.f64 fd234, fd112, 0d3FDE75EC0DED7BEE, fd230; +fma.rn.f64 fd235, fd530, 0dBFEC24A622E3E9F9, fd231; +fma.rn.f64 fd236, fd111, 0d3FDE75EC0DED7BEE, fd232; +fma.rn.f64 fd237, fd77, 0d3FCF6C118574C83E, %38; +fma.rn.f64 fd241, fd81, 0dBFEC24A622E3E9F9, fd237; +fma.rn.f64 fd519, fd80, 0d3FEF0553B4DE2E18, 0d0000000000000000; +fma.rn.f64 fd242, fd84, 0d3FDE75EC0DED7BEE, fd519; +fma.rn.f64 fd518, fd549, 0d3FCF6C118574C83E, %39; +fma.rn.f64 fd243, fd546, 0dBFEC24A622E3E9F9, fd518; +fma.rn.f64 fd517, fd79, 0d3FEF0553B4DE2E18, 0d0000000000000000; +fma.rn.f64 fd244, fd83, 0d3FDE75EC0DED7BEE, 
fd517; +fma.rn.f64 fd245, fd85, 0dBFE5AC4A670A1CFF, fd241; +fma.rn.f64 fd246, fd88, 0dBFE78B0CDEE73E0F, fd242; +fma.rn.f64 fd247, fd544, 0dBFE5AC4A670A1CFF, fd243; +fma.rn.f64 fd248, fd87, 0dBFE78B0CDEE73E0F, fd244; +fma.rn.f64 fd249, fd89, 0d3FE180996C77C8CA, fd245; +fma.rn.f64 fd250, fd92, 0dBFEACA115AAE3DE4, fd246; +fma.rn.f64 fd251, fd542, 0d3FE180996C77C8CA, fd247; +fma.rn.f64 fd252, fd91, 0dBFEACA115AAE3DE4, fd248; +fma.rn.f64 fd253, fd93, 0d3FEE442285231BE1, fd249; +fma.rn.f64 fd254, fd96, 0d3FD4C7E04850CFAA, fd250; +fma.rn.f64 fd255, fd539, 0d3FEE442285231BE1, fd251; +fma.rn.f64 fd256, fd95, 0d3FD4C7E04850CFAA, fd252; +fma.rn.f64 fd257, fd97, 0dBFB523EB8420F5F5, fd253; +fma.rn.f64 fd258, fd100, 0d3FEFE40529A542AA, fd254; +fma.rn.f64 fd259, fd537, 0dBFB523EB8420F5F5, fd255; +fma.rn.f64 fd260, fd99, 0d3FEFE40529A542AA, fd256; +fma.rn.f64 fd261, fd101, 0dBFEF90459484F2B2, fd257; +fma.rn.f64 fd262, fd104, 0d3FC5116F7F2D58C5, fd258; +fma.rn.f64 fd263, fd535, 0dBFEF90459484F2B2, fd259; +fma.rn.f64 fd264, fd103, 0d3FC5116F7F2D58C5, fd260; +fma.rn.f64 fd265, fd105, 0dBFD9B560B9F596EA, fd261; +fma.rn.f64 fd266, fd108, 0dBFED4E03DD110B08, fd262; +fma.rn.f64 fd267, fd532, 0dBFD9B560B9F596EA, fd263; +fma.rn.f64 fd268, fd107, 0dBFED4E03DD110B08, fd264; +fma.rn.f64 fd269, fd109, 0d3FE940A398F9CD23, fd265; +fma.rn.f64 fd270, fd112, 0dBFE3A7A16B394423, fd266; +fma.rn.f64 fd271, fd530, 0d3FE940A398F9CD23, fd267; +fma.rn.f64 fd272, fd111, 0dBFE3A7A16B394423, fd268; +fma.rn.f64 fd273, fd77, 0dBFB523EB8420F5F5, %38; +fma.rn.f64 fd277, fd81, 0dBFEF90459484F2B2, fd273; +fma.rn.f64 fd516, fd80, 0d3FEFE40529A542AA, 0d0000000000000000; +fma.rn.f64 fd278, fd84, 0dBFC5116F7F2D58C5, fd516; +fma.rn.f64 fd515, fd549, 0dBFB523EB8420F5F5, %39; +fma.rn.f64 fd279, fd546, 0dBFEF90459484F2B2, fd515; +fma.rn.f64 fd514, fd79, 0d3FEFE40529A542AA, 0d0000000000000000; +fma.rn.f64 fd280, fd83, 0dBFC5116F7F2D58C5, fd514; +fma.rn.f64 fd281, fd85, 0d3FCF6C118574C83E, fd277; +fma.rn.f64 fd282, fd88, 
0dBFEF0553B4DE2E18, fd278; +fma.rn.f64 fd283, fd544, 0d3FCF6C118574C83E, fd279; +fma.rn.f64 fd284, fd87, 0dBFEF0553B4DE2E18, fd280; +fma.rn.f64 fd285, fd89, 0d3FEE442285231BE1, fd281; +fma.rn.f64 fd286, fd92, 0d3FD4C7E04850CFAA, fd282; +fma.rn.f64 fd287, fd542, 0d3FEE442285231BE1, fd283; +fma.rn.f64 fd288, fd91, 0d3FD4C7E04850CFAA, fd284; +fma.rn.f64 fd289, fd93, 0dBFD9B560B9F596EA, fd285; +fma.rn.f64 fd290, fd96, 0d3FED4E03DD110B08, fd286; +fma.rn.f64 fd291, fd539, 0dBFD9B560B9F596EA, fd287; +fma.rn.f64 fd292, fd95, 0d3FED4E03DD110B08, fd288; +fma.rn.f64 fd293, fd97, 0dBFEC24A622E3E9F9, fd289; +fma.rn.f64 fd294, fd100, 0dBFDE75EC0DED7BEE, fd290; +fma.rn.f64 fd295, fd537, 0dBFEC24A622E3E9F9, fd291; +fma.rn.f64 fd296, fd99, 0dBFDE75EC0DED7BEE, fd292; +fma.rn.f64 fd297, fd101, 0d3FE180996C77C8CA, fd293; +fma.rn.f64 fd298, fd104, 0dBFEACA115AAE3DE4, fd294; +fma.rn.f64 fd299, fd535, 0d3FE180996C77C8CA, fd295; +fma.rn.f64 fd300, fd103, 0dBFEACA115AAE3DE4, fd296; +fma.rn.f64 fd301, fd105, 0d3FE940A398F9CD23, fd297; +fma.rn.f64 fd302, fd108, 0d3FE3A7A16B394423, fd298; +fma.rn.f64 fd303, fd532, 0d3FE940A398F9CD23, fd299; +fma.rn.f64 fd304, fd107, 0d3FE3A7A16B394423, fd300; +fma.rn.f64 fd305, fd109, 0dBFE5AC4A670A1CFF, fd301; +fma.rn.f64 fd306, fd112, 0d3FE78B0CDEE73E0F, fd302; +fma.rn.f64 fd307, fd530, 0dBFE5AC4A670A1CFF, fd303; +fma.rn.f64 fd308, fd111, 0d3FE78B0CDEE73E0F, fd304; +fma.rn.f64 fd309, fd77, 0dBFD9B560B9F596EA, %38; +fma.rn.f64 fd313, fd81, 0dBFE5AC4A670A1CFF, fd309; +fma.rn.f64 fd513, fd80, 0d3FED4E03DD110B08, 0d0000000000000000; +fma.rn.f64 fd314, fd84, 0dBFE78B0CDEE73E0F, fd513; +fma.rn.f64 fd512, fd549, 0dBFD9B560B9F596EA, %39; +fma.rn.f64 fd315, fd546, 0dBFE5AC4A670A1CFF, fd512; +fma.rn.f64 fd511, fd79, 0d3FED4E03DD110B08, 0d0000000000000000; +fma.rn.f64 fd316, fd83, 0dBFE78B0CDEE73E0F, fd511; +fma.rn.f64 fd317, fd85, 0d3FEE442285231BE1, fd313; +fma.rn.f64 fd318, fd88, 0dBFD4C7E04850CFAA, fd314; +fma.rn.f64 fd319, fd544, 0d3FEE442285231BE1, fd315; 
+fma.rn.f64 fd320, fd87, 0dBFD4C7E04850CFAA, fd316; +fma.rn.f64 fd321, fd89, 0dBFB523EB8420F5F5, fd317; +fma.rn.f64 fd322, fd92, 0d3FEFE40529A542AA, fd318; +fma.rn.f64 fd323, fd542, 0dBFB523EB8420F5F5, fd319; +fma.rn.f64 fd324, fd91, 0d3FEFE40529A542AA, fd320; +fma.rn.f64 fd325, fd93, 0dBFEC24A622E3E9F9, fd321; +fma.rn.f64 fd326, fd96, 0dBFDE75EC0DED7BEE, fd322; +fma.rn.f64 fd327, fd539, 0dBFEC24A622E3E9F9, fd323; +fma.rn.f64 fd328, fd95, 0dBFDE75EC0DED7BEE, fd324; +fma.rn.f64 fd329, fd97, 0d3FE940A398F9CD23, fd325; +fma.rn.f64 fd330, fd100, 0dBFE3A7A16B394423, fd326; +fma.rn.f64 fd331, fd537, 0d3FE940A398F9CD23, fd327; +fma.rn.f64 fd332, fd99, 0dBFE3A7A16B394423, fd328; +fma.rn.f64 fd333, fd101, 0d3FCF6C118574C83E, fd329; +fma.rn.f64 fd334, fd104, 0d3FEF0553B4DE2E18, fd330; +fma.rn.f64 fd335, fd535, 0d3FCF6C118574C83E, fd331; +fma.rn.f64 fd336, fd103, 0d3FEF0553B4DE2E18, fd332; +fma.rn.f64 fd337, fd105, 0dBFEF90459484F2B2, fd333; +fma.rn.f64 fd338, fd108, 0dBFC5116F7F2D58C5, fd334; +fma.rn.f64 fd339, fd532, 0dBFEF90459484F2B2, fd335; +fma.rn.f64 fd340, fd107, 0dBFC5116F7F2D58C5, fd336; +fma.rn.f64 fd341, fd109, 0d3FE180996C77C8CA, fd337; +fma.rn.f64 fd342, fd112, 0dBFEACA115AAE3DE4, fd338; +fma.rn.f64 fd343, fd530, 0d3FE180996C77C8CA, fd339; +fma.rn.f64 fd344, fd111, 0dBFEACA115AAE3DE4, fd340; +fma.rn.f64 fd345, fd77, 0dBFE5AC4A670A1CFF, %38; +fma.rn.f64 fd349, fd81, 0dBFB523EB8420F5F5, fd345; +fma.rn.f64 fd510, fd80, 0d3FE78B0CDEE73E0F, 0d0000000000000000; +fma.rn.f64 fd350, fd84, 0dBFEFE40529A542AA, fd510; +fma.rn.f64 fd509, fd549, 0dBFE5AC4A670A1CFF, %39; +fma.rn.f64 fd351, fd546, 0dBFB523EB8420F5F5, fd509; +fma.rn.f64 fd508, fd79, 0d3FE78B0CDEE73E0F, 0d0000000000000000; +fma.rn.f64 fd352, fd83, 0dBFEFE40529A542AA, fd508; +fma.rn.f64 fd353, fd85, 0d3FE940A398F9CD23, fd349; +fma.rn.f64 fd354, fd88, 0d3FE3A7A16B394423, fd350; +fma.rn.f64 fd355, fd544, 0d3FE940A398F9CD23, fd351; +fma.rn.f64 fd356, fd87, 0d3FE3A7A16B394423, fd352; +fma.rn.f64 fd357, fd89, 
0dBFEF90459484F2B2, fd353; +fma.rn.f64 fd358, fd92, 0d3FC5116F7F2D58C5, fd354; +fma.rn.f64 fd359, fd542, 0dBFEF90459484F2B2, fd355; +fma.rn.f64 fd360, fd91, 0d3FC5116F7F2D58C5, fd356; +fma.rn.f64 fd361, fd93, 0d3FE180996C77C8CA, fd357; +fma.rn.f64 fd362, fd96, 0dBFEACA115AAE3DE4, fd358; +fma.rn.f64 fd363, fd539, 0d3FE180996C77C8CA, fd359; +fma.rn.f64 fd364, fd95, 0dBFEACA115AAE3DE4, fd360; +fma.rn.f64 fd365, fd97, 0d3FCF6C118574C83E, fd361; +fma.rn.f64 fd366, fd100, 0d3FEF0553B4DE2E18, fd362; +fma.rn.f64 fd367, fd537, 0d3FCF6C118574C83E, fd363; +fma.rn.f64 fd368, fd99, 0d3FEF0553B4DE2E18, fd364; +fma.rn.f64 fd369, fd101, 0dBFEC24A622E3E9F9, fd365; +fma.rn.f64 fd370, fd104, 0dBFDE75EC0DED7BEE, fd366; +fma.rn.f64 fd371, fd535, 0dBFEC24A622E3E9F9, fd367; +fma.rn.f64 fd372, fd103, 0dBFDE75EC0DED7BEE, fd368; +fma.rn.f64 fd373, fd105, 0d3FEE442285231BE1, fd369; +fma.rn.f64 fd374, fd108, 0dBFD4C7E04850CFAA, fd370; +fma.rn.f64 fd375, fd532, 0d3FEE442285231BE1, fd371; +fma.rn.f64 fd376, fd107, 0dBFD4C7E04850CFAA, fd372; +fma.rn.f64 fd377, fd109, 0dBFD9B560B9F596EA, fd373; +fma.rn.f64 fd378, fd112, 0d3FED4E03DD110B08, fd374; +fma.rn.f64 fd379, fd530, 0dBFD9B560B9F596EA, fd375; +fma.rn.f64 fd380, fd111, 0d3FED4E03DD110B08, fd376; +fma.rn.f64 fd381, fd77, 0dBFEC24A622E3E9F9, %38; +fma.rn.f64 fd385, fd81, 0d3FE180996C77C8CA, fd381; +fma.rn.f64 fd507, fd80, 0d3FDE75EC0DED7BEE, 0d0000000000000000; +fma.rn.f64 fd386, fd84, 0dBFEACA115AAE3DE4, fd507; +fma.rn.f64 fd506, fd549, 0dBFEC24A622E3E9F9, %39; +fma.rn.f64 fd387, fd546, 0d3FE180996C77C8CA, fd506; +fma.rn.f64 fd505, fd79, 0d3FDE75EC0DED7BEE, 0d0000000000000000; +fma.rn.f64 fd388, fd83, 0dBFEACA115AAE3DE4, fd505; +fma.rn.f64 fd389, fd85, 0dBFB523EB8420F5F5, fd385; +fma.rn.f64 fd390, fd88, 0d3FEFE40529A542AA, fd386; +fma.rn.f64 fd391, fd544, 0dBFB523EB8420F5F5, fd387; +fma.rn.f64 fd392, fd87, 0d3FEFE40529A542AA, fd388; +fma.rn.f64 fd393, fd89, 0dBFD9B560B9F596EA, fd389; +fma.rn.f64 fd394, fd92, 0dBFED4E03DD110B08, fd390; 
+fma.rn.f64 fd395, fd542, 0dBFD9B560B9F596EA, fd391; +fma.rn.f64 fd396, fd91, 0dBFED4E03DD110B08, fd392; +fma.rn.f64 fd397, fd93, 0d3FE940A398F9CD23, fd393; +fma.rn.f64 fd398, fd96, 0d3FE3A7A16B394423, fd394; +fma.rn.f64 fd399, fd539, 0d3FE940A398F9CD23, fd395; +fma.rn.f64 fd400, fd95, 0d3FE3A7A16B394423, fd396; +fma.rn.f64 fd401, fd97, 0dBFEF90459484F2B2, fd397; +fma.rn.f64 fd402, fd100, 0dBFC5116F7F2D58C5, fd398; +fma.rn.f64 fd403, fd537, 0dBFEF90459484F2B2, fd399; +fma.rn.f64 fd404, fd99, 0dBFC5116F7F2D58C5, fd400; +fma.rn.f64 fd405, fd101, 0d3FEE442285231BE1, fd401; +fma.rn.f64 fd406, fd104, 0dBFD4C7E04850CFAA, fd402; +fma.rn.f64 fd407, fd535, 0d3FEE442285231BE1, fd403; +fma.rn.f64 fd408, fd103, 0dBFD4C7E04850CFAA, fd404; +fma.rn.f64 fd409, fd105, 0dBFE5AC4A670A1CFF, fd405; +fma.rn.f64 fd410, fd108, 0d3FE78B0CDEE73E0F, fd406; +fma.rn.f64 fd411, fd532, 0dBFE5AC4A670A1CFF, fd407; +fma.rn.f64 fd412, fd107, 0d3FE78B0CDEE73E0F, fd408; +fma.rn.f64 fd413, fd109, 0d3FCF6C118574C83E, fd409; +fma.rn.f64 fd414, fd112, 0dBFEF0553B4DE2E18, fd410; +fma.rn.f64 fd415, fd530, 0d3FCF6C118574C83E, fd411; +fma.rn.f64 fd416, fd111, 0dBFEF0553B4DE2E18, fd412; +fma.rn.f64 fd417, fd77, 0dBFEF90459484F2B2, %38; +fma.rn.f64 fd418, fd80, 0d3FC5116F7F2D58C5, 0d0000000000000000; +fma.rn.f64 fd419, fd549, 0dBFEF90459484F2B2, %39; +fma.rn.f64 fd420, fd79, 0d3FC5116F7F2D58C5, 0d0000000000000000; +fma.rn.f64 fd421, fd81, 0d3FEE442285231BE1, fd417; +fma.rn.f64 fd422, fd84, 0dBFD4C7E04850CFAA, fd418; +fma.rn.f64 fd423, fd546, 0d3FEE442285231BE1, fd419; +fma.rn.f64 fd424, fd83, 0dBFD4C7E04850CFAA, fd420; +fma.rn.f64 fd425, fd85, 0dBFEC24A622E3E9F9, fd421; +fma.rn.f64 fd426, fd88, 0d3FDE75EC0DED7BEE, fd422; +fma.rn.f64 fd427, fd544, 0dBFEC24A622E3E9F9, fd423; +fma.rn.f64 fd428, fd87, 0d3FDE75EC0DED7BEE, fd424; +fma.rn.f64 fd429, fd89, 0d3FE940A398F9CD23, fd425; +fma.rn.f64 fd430, fd92, 0dBFE3A7A16B394423, fd426; +fma.rn.f64 fd431, fd542, 0d3FE940A398F9CD23, fd427; +fma.rn.f64 fd432, fd91, 
0dBFE3A7A16B394423, fd428; +fma.rn.f64 fd433, fd93, 0dBFE5AC4A670A1CFF, fd429; +fma.rn.f64 fd434, fd96, 0d3FE78B0CDEE73E0F, fd430; +fma.rn.f64 fd435, fd539, 0dBFE5AC4A670A1CFF, fd431; +fma.rn.f64 fd436, fd95, 0d3FE78B0CDEE73E0F, fd432; +fma.rn.f64 fd437, fd97, 0d3FE180996C77C8CA, fd433; +fma.rn.f64 fd438, fd100, 0dBFEACA115AAE3DE4, fd434; +fma.rn.f64 fd439, fd537, 0d3FE180996C77C8CA, fd435; +fma.rn.f64 fd440, fd99, 0dBFEACA115AAE3DE4, fd436; +fma.rn.f64 fd441, fd101, 0dBFD9B560B9F596EA, fd437; +fma.rn.f64 fd442, fd104, 0d3FED4E03DD110B08, fd438; +fma.rn.f64 fd443, fd535, 0dBFD9B560B9F596EA, fd439; +fma.rn.f64 fd444, fd103, 0d3FED4E03DD110B08, fd440; +fma.rn.f64 fd445, fd105, 0d3FCF6C118574C83E, fd441; +fma.rn.f64 fd446, fd108, 0dBFEF0553B4DE2E18, fd442; +fma.rn.f64 fd447, fd532, 0d3FCF6C118574C83E, fd443; +fma.rn.f64 fd448, fd107, 0dBFEF0553B4DE2E18, fd444; +fma.rn.f64 fd449, fd109, 0dBFB523EB8420F5F5, fd445; +fma.rn.f64 fd450, fd112, 0d3FEFE40529A542AA, fd446; +fma.rn.f64 fd451, fd530, 0dBFB523EB8420F5F5, fd447; +fma.rn.f64 fd452, fd111, 0d3FEFE40529A542AA, fd448; +add.f64 %1, fd128, fd530; +add.f64 %0, fd127, fd109; +add.f64 %3, fd163, fd164; +sub.f64 %2, fd161, fd162; +add.f64 %5, fd199, fd200; +sub.f64 %4, fd197, fd198; +add.f64 %7, fd235, fd236; +sub.f64 %6, fd233, fd234; +sub.f64 %8, fd269, fd270; +add.f64 %9, fd271, fd272; +sub.f64 %10, fd305, fd306; +add.f64 %11, fd307, fd308; +sub.f64 %12, fd341, fd342; +add.f64 %13, fd343, fd344; +sub.f64 %14, fd377, fd378; +add.f64 %15, fd379, fd380; +add.f64 %17, fd415, fd416; +sub.f64 %16, fd413, fd414; +add.f64 %19, fd451, fd452; +sub.f64 %18, fd449, fd450; +sub.f64 %21, fd451, fd452; +add.f64 %20, fd449, fd450; +sub.f64 %23, fd415, fd416; +add.f64 %22, fd413, fd414; +sub.f64 %25, fd379, fd380; +add.f64 %24, fd377, fd378; +sub.f64 %27, fd343, fd344; +add.f64 %26, fd341, fd342; +sub.f64 %29, fd307, fd308; +add.f64 %28, fd305, fd306; +sub.f64 %31, fd271, fd272; +add.f64 %30, fd269, fd270; +sub.f64 %33, fd235, fd236; 
+add.f64 %32, fd233, fd234; +sub.f64 %35, fd199, fd200; +add.f64 %34, fd197, fd198; +sub.f64 %37, fd163, fd164; +add.f64 %36, fd161, fd162; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[2].y), "d"(rmem[17].y), "d"(rmem[16].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[14].y), "d"(rmem[13].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[11].y), "d"(rmem[10].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..dc21f75269f5b --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_fwd.hpp.inc @@ -0,0 +1,30078 @@ +#ifndef CUFFTDX_FFT_2048_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_2048_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<837, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<363>; +.reg .b32 r<2761>; +.reg .b64 rd<2>; +mov.u32 r2741, %tid.y; +shl.b32 r2742, r2741, 13; +mov.u32 r2743, %32; +add.s32 r2744, r2743, r2742; +mov.u32 r2745, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f330, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r101, {low, high}; +} +mov.f32 f340, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ 
+neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 
r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2746, r2745, 127; +shl.b32 r2747, r2745, 6; +and.b32 r2748, r2747, -8192; +add.s32 r2749, r2744, r2748; +cvt.rn.f32.u32 f357, r2746; +mul.f32 f358, f357, 0f3B490FDB; +cos.approx.f32 f117, f358; +sin.approx.f32 f359, f358; +neg.f32 f118, f359; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} 
+{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, 
r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2750, r2747, 8128; +add.s32 r2751, r2749, r2750; +st.shared.v4.f32 
[r2751], {r521, r629, r666, r703}; +st.shared.v4.f32 [r2751+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r2751+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r2751+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r2752, r2746, -60, r2751; +ld.shared.u32 r1176, [r2752]; +ld.shared.u32 r1372, [r2752+512]; +ld.shared.u32 r1226, [r2752+1024]; +ld.shared.u32 r1422, [r2752+1536]; +ld.shared.u32 r1188, [r2752+2048]; +ld.shared.u32 r1384, [r2752+2560]; +ld.shared.u32 r1238, [r2752+3072]; +ld.shared.u32 r1434, [r2752+3584]; +ld.shared.u32 r1177, [r2752+4096]; +ld.shared.u32 r1373, [r2752+4608]; +ld.shared.u32 r1227, [r2752+5120]; +ld.shared.u32 r1423, [r2752+5632]; +ld.shared.u32 r1189, [r2752+6144]; +ld.shared.u32 r1385, [r2752+6656]; +ld.shared.u32 r1239, [r2752+7168]; +ld.shared.u32 r1435, [r2752+7680]; +barrier.sync 0; +st.shared.v4.f32 [r2751], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2751+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2751+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2751+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2752]; +ld.shared.u32 r1375, [r2752+512]; +ld.shared.u32 r1229, [r2752+1024]; +ld.shared.u32 r1425, [r2752+1536]; +ld.shared.u32 r1191, [r2752+2048]; +ld.shared.u32 r1387, [r2752+2560]; +ld.shared.u32 r1241, [r2752+3072]; +ld.shared.u32 r1437, [r2752+3584]; +ld.shared.u32 r1180, [r2752+4096]; +ld.shared.u32 r1376, [r2752+4608]; +ld.shared.u32 r1230, [r2752+5120]; +ld.shared.u32 r1426, [r2752+5632]; +ld.shared.u32 r1192, [r2752+6144]; +ld.shared.u32 r1388, [r2752+6656]; +ld.shared.u32 r1242, [r2752+7168]; +ld.shared.u32 r1438, [r2752+7680]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ 
+add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, 
r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 
high, f340; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} 
+{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2753, r2745, 112; +bfe.u32 r2754, r2745, 4, 3; +shl.b32 r2755, r2745, 2; +and.b32 r2756, r2755, 60; +add.s32 r2757, r2749, r2756; +cvt.rn.f32.u32 f360, r2754; +mul.f32 f361, f360, 0f3D490FDB; +cos.approx.f32 f267, f361; +sin.approx.f32 f362, f361; +neg.f32 f268, f362; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, 
f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, 
r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; 
+mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ 
+mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r2758, r2747, 7168; +add.s32 r2759, 
r2757, r2758; +st.shared.u32 [r2759], r1695; +st.shared.u32 [r2759+64], r1803; +st.shared.u32 [r2759+128], r1840; +st.shared.u32 [r2759+192], r1877; +st.shared.u32 [r2759+256], r1914; +st.shared.u32 [r2759+320], r1951; +st.shared.u32 [r2759+384], r1988; +st.shared.u32 [r2759+448], r2025; +st.shared.u32 [r2759+512], r2062; +st.shared.u32 [r2759+576], r2099; +st.shared.u32 [r2759+640], r2136; +st.shared.u32 [r2759+704], r2173; +st.shared.u32 [r2759+768], r2210; +st.shared.u32 [r2759+832], r2247; +st.shared.u32 [r2759+896], r2284; +st.shared.u32 [r2759+960], r2321; +barrier.sync 0; +mad.lo.s32 r2760, r2753, -60, r2759; +ld.shared.u32 r2350, [r2760]; +ld.shared.u32 r2546, [r2760+512]; +ld.shared.u32 r2400, [r2760+1024]; +ld.shared.u32 r2596, [r2760+1536]; +ld.shared.u32 r2362, [r2760+2048]; +ld.shared.u32 r2558, [r2760+2560]; +ld.shared.u32 r2412, [r2760+3072]; +ld.shared.u32 r2608, [r2760+3584]; +ld.shared.u32 r2351, [r2760+4096]; +ld.shared.u32 r2547, [r2760+4608]; +ld.shared.u32 r2401, [r2760+5120]; +ld.shared.u32 r2597, [r2760+5632]; +ld.shared.u32 r2363, [r2760+6144]; +ld.shared.u32 r2559, [r2760+6656]; +ld.shared.u32 r2413, [r2760+7168]; +ld.shared.u32 r2609, [r2760+7680]; +barrier.sync 0; +st.shared.u32 [r2759], r1698; +st.shared.u32 [r2759+64], r1810; +st.shared.u32 [r2759+128], r1847; +st.shared.u32 [r2759+192], r1884; +st.shared.u32 [r2759+256], r1921; +st.shared.u32 [r2759+320], r1958; +st.shared.u32 [r2759+384], r1995; +st.shared.u32 [r2759+448], r2032; +st.shared.u32 [r2759+512], r2069; +st.shared.u32 [r2759+576], r2106; +st.shared.u32 [r2759+640], r2143; +st.shared.u32 [r2759+704], r2180; +st.shared.u32 [r2759+768], r2217; +st.shared.u32 [r2759+832], r2254; +st.shared.u32 [r2759+896], r2291; +st.shared.u32 [r2759+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2760]; +ld.shared.u32 r2549, [r2760+512]; +ld.shared.u32 r2403, [r2760+1024]; +ld.shared.u32 r2599, [r2760+1536]; +ld.shared.u32 r2365, [r2760+2048]; +ld.shared.u32 r2561, [r2760+2560]; 
+ld.shared.u32 r2415, [r2760+3072]; +ld.shared.u32 r2611, [r2760+3584]; +ld.shared.u32 r2354, [r2760+4096]; +ld.shared.u32 r2550, [r2760+4608]; +ld.shared.u32 r2404, [r2760+5120]; +ld.shared.u32 r2600, [r2760+5632]; +ld.shared.u32 r2366, [r2760+6144]; +ld.shared.u32 r2562, [r2760+6656]; +ld.shared.u32 r2416, [r2760+7168]; +ld.shared.u32 r2612, [r2760+7680]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 %0, r2375, r2425; +} +{ +add.f16x2 %1, r2378, r2428; +} +{ +sub.f16x2 %16, r2375, r2425; +} +{ +sub.f16x2 %17, r2378, r2428; +} +{ +add.f16x2 %4, r2387, r2469; +} +{ +add.f16x2 %5, r2390, r2475; +} +{ +sub.f16x2 %20, r2387, r2469; +} +{ +sub.f16x2 %21, r2390, r2475; +} +{ +add.f16x2 %8, r2381, r2434; +} +{ +add.f16x2 %9, r2384, r2479; +} +{ +sub.f16x2 %24, r2381, r2434; +} +{ +sub.f16x2 %25, r2384, r2479; +} +{ +add.f16x2 %12, r2393, r2487; +} +{ +add.f16x2 %13, r2396, r2493; +} +{ +sub.f16x2 %28, r2393, r2487; +} +{ +sub.f16x2 %29, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ 
+sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 %2, r2571, r2621; +} +{ +add.f16x2 %3, r2574, r2624; +} +{ +sub.f16x2 %18, r2571, r2621; +} +{ +sub.f16x2 %19, r2574, r2624; +} +{ +add.f16x2 %6, r2583, r2665; +} +{ +add.f16x2 %7, r2586, r2671; +} +{ +sub.f16x2 %22, r2583, r2665; +} +{ +sub.f16x2 %23, r2586, r2671; +} +{ +add.f16x2 %10, r2577, r2630; +} +{ +add.f16x2 %11, r2580, r2675; +} +{ +sub.f16x2 %26, r2577, r2630; +} +{ +sub.f16x2 %27, r2580, r2675; +} +{ +add.f16x2 %14, r2589, r2683; +} +{ +add.f16x2 %15, r2592, r2689; +} +{ +sub.f16x2 %30, r2589, r2683; +} +{ +sub.f16x2 %31, r2592, r2689; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<839, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1502>; +.reg .b64 rd<2>; +mov.u32 r1475, %tid.y; +shl.b32 r1476, r1475, 13; +mov.u32 r1477, %16; +add.s32 r1478, r1477, r1476; +mov.u32 r1479, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 
r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1480, r1479, 255; +shl.b32 r1481, r1479, 5; +and.b32 r1482, r1481, -8192; +add.s32 r1483, r1478, r1482; +cvt.rn.f32.u32 f139, r1480; +mul.f32 f140, f139, 0f3B490FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 
r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, 
r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1484, r1481, 8160; +add.s32 r1485, r1483, r1484; +st.shared.v4.f32 [r1485], {r149, r209, r246, r283}; +st.shared.v4.f32 [r1485+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1486, r1480, -28, r1485; +ld.shared.u32 r460, [r1486]; +ld.shared.u32 r510, [r1486+1024]; +ld.shared.u32 r472, [r1486+2048]; +ld.shared.u32 r522, [r1486+3072]; +ld.shared.u32 r461, [r1486+4096]; +ld.shared.u32 r511, [r1486+5120]; +ld.shared.u32 r473, [r1486+6144]; +ld.shared.u32 r523, [r1486+7168]; +barrier.sync 0; +st.shared.v4.f32 [r1485], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1485+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1486]; +ld.shared.u32 r513, [r1486+1024]; +ld.shared.u32 r475, [r1486+2048]; +ld.shared.u32 r525, [r1486+3072]; +ld.shared.u32 r464, [r1486+4096]; +ld.shared.u32 r514, [r1486+5120]; +ld.shared.u32 r476, [r1486+6144]; +ld.shared.u32 r526, [r1486+7168]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, 
r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; 
+} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1487, r1479, 248; +bfe.u32 r1488, r1479, 3, 5; +shl.b32 r1489, r1479, 2; +and.b32 r1490, r1489, 28; +add.s32 r1491, r1483, r1490; +cvt.rn.f32.u32 f142, r1488; +mul.f32 f143, f142, 0f3CC90FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ 
+fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1492, r1481, 7936; +add.s32 r1493, r1491, r1492; +st.shared.u32 [r1493], r607; +st.shared.u32 [r1493+32], r667; +st.shared.u32 [r1493+64], r704; +st.shared.u32 [r1493+96], r741; 
+st.shared.u32 [r1493+128], r778; +st.shared.u32 [r1493+160], r815; +st.shared.u32 [r1493+192], r852; +st.shared.u32 [r1493+224], r889; +barrier.sync 0; +mad.lo.s32 r1494, r1487, -28, r1493; +ld.shared.u32 r918, [r1494]; +ld.shared.u32 r968, [r1494+1024]; +ld.shared.u32 r930, [r1494+2048]; +ld.shared.u32 r980, [r1494+3072]; +ld.shared.u32 r919, [r1494+4096]; +ld.shared.u32 r969, [r1494+5120]; +ld.shared.u32 r931, [r1494+6144]; +ld.shared.u32 r981, [r1494+7168]; +barrier.sync 0; +st.shared.u32 [r1493], r610; +st.shared.u32 [r1493+32], r674; +st.shared.u32 [r1493+64], r711; +st.shared.u32 [r1493+96], r748; +st.shared.u32 [r1493+128], r785; +st.shared.u32 [r1493+160], r822; +st.shared.u32 [r1493+192], r859; +st.shared.u32 [r1493+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1494]; +ld.shared.u32 r971, [r1494+1024]; +ld.shared.u32 r933, [r1494+2048]; +ld.shared.u32 r983, [r1494+3072]; +ld.shared.u32 r922, [r1494+4096]; +ld.shared.u32 r972, [r1494+5120]; +ld.shared.u32 r934, [r1494+6144]; +ld.shared.u32 r984, [r1494+7168]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, 
r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1495, r1479, 192; +bfe.u32 r1496, r1479, 6, 2; +and.b32 r1497, r1489, 252; +add.s32 r1498, r1483, r1497; +cvt.rn.f32.u32 f145, r1496; +mul.f32 f146, f145, 0f3E490FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; 
+cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, 
r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +barrier.sync 0; +and.b32 r1499, r1481, 6144; +add.s32 r1500, r1498, r1499; +st.shared.u32 [r1500], r1065; +st.shared.u32 [r1500+256], r1125; +st.shared.u32 [r1500+512], r1162; +st.shared.u32 [r1500+768], r1199; +st.shared.u32 [r1500+1024], r1236; +st.shared.u32 [r1500+1280], r1273; +st.shared.u32 [r1500+1536], r1310; +st.shared.u32 
[r1500+1792], r1347; +barrier.sync 0; +mad.lo.s32 r1501, r1495, -28, r1500; +ld.shared.u32 r1376, [r1501]; +ld.shared.u32 r1426, [r1501+1024]; +ld.shared.u32 r1388, [r1501+2048]; +ld.shared.u32 r1438, [r1501+3072]; +ld.shared.u32 r1377, [r1501+4096]; +ld.shared.u32 r1427, [r1501+5120]; +ld.shared.u32 r1389, [r1501+6144]; +ld.shared.u32 r1439, [r1501+7168]; +barrier.sync 0; +st.shared.u32 [r1500], r1068; +st.shared.u32 [r1500+256], r1132; +st.shared.u32 [r1500+512], r1169; +st.shared.u32 [r1500+768], r1206; +st.shared.u32 [r1500+1024], r1243; +st.shared.u32 [r1500+1280], r1280; +st.shared.u32 [r1500+1536], r1317; +st.shared.u32 [r1500+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1501]; +ld.shared.u32 r1429, [r1501+1024]; +ld.shared.u32 r1391, [r1501+2048]; +ld.shared.u32 r1441, [r1501+3072]; +ld.shared.u32 r1380, [r1501+4096]; +ld.shared.u32 r1430, [r1501+5120]; +ld.shared.u32 r1392, [r1501+6144]; +ld.shared.u32 r1442, [r1501+7168]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1393; +} +{ +add.f16x2 %0, r1375, r1387; +} +{ +add.f16x2 %1, r1378, r1390; +} +{ +sub.f16x2 %8, r1375, r1387; +} +{ +sub.f16x2 %9, r1378, r1390; +} +{ +add.f16x2 %4, r1381, r1396; +} +{ +add.f16x2 %5, r1384, r1399; +} +{ +sub.f16x2 %12, r1381, r1396; +} +{ +sub.f16x2 %13, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 %2, r1425, r1437; +} +{ +add.f16x2 %3, r1428, r1440; +} +{ +sub.f16x2 %10, 
r1425, r1437; +} +{ +sub.f16x2 %11, r1428, r1440; +} +{ +add.f16x2 %6, r1431, r1446; +} +{ +add.f16x2 %7, r1434, r1449; +} +{ +sub.f16x2 %14, r1431, r1446; +} +{ +sub.f16x2 %15, r1434, r1449; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<838, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1502>; +.reg .b64 rd<2>; +mov.u32 r1475, %tid.y; +shl.b32 r1476, r1475, 14; +mov.u32 r1477, %16; +add.s32 r1478, r1477, r1476; +mov.u32 r1479, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ 
+add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; 
+} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1480, r1479, 255; +shl.b32 r1481, r1479, 6; +and.b32 r1482, r1481, -16384; +add.s32 r1483, r1478, r1482; +cvt.rn.f32.u32 f139, r1480; +mul.f32 f140, f139, 0f3B490FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; 
+} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} 
+{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1484, r1481, 16320; +add.s32 r1485, r1483, r1484; +st.shared.v4.f32 [r1485], {r149, r152, r209, r216}; +st.shared.v4.f32 
[r1485+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1485+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1485+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1486, r1480, -56, r1485; +ld.shared.u32 r460, [r1486]; +ld.shared.u32 r463, [r1486+4]; +ld.shared.u32 r510, [r1486+2048]; +ld.shared.u32 r513, [r1486+2052]; +ld.shared.u32 r472, [r1486+4096]; +ld.shared.u32 r475, [r1486+4100]; +ld.shared.u32 r522, [r1486+6144]; +ld.shared.u32 r525, [r1486+6148]; +ld.shared.u32 r461, [r1486+8192]; +ld.shared.u32 r464, [r1486+8196]; +ld.shared.u32 r511, [r1486+10240]; +ld.shared.u32 r514, [r1486+10244]; +ld.shared.u32 r473, [r1486+12288]; +ld.shared.u32 r476, [r1486+12292]; +ld.shared.u32 r523, [r1486+14336]; +ld.shared.u32 r526, [r1486+14340]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; 
+cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1487, r1479, 248; +bfe.u32 r1488, r1479, 3, 5; +cvt.rn.f32.u32 f142, r1488; +mul.f32 f143, f142, 0f3CC90FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, 
r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ 
+mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ 
+mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1489, r1479, 3; +and.b32 r1490, r1489, 56; +add.s32 r1491, r1483, r1490; +barrier.sync 0; +and.b32 r1492, r1481, 15872; +add.s32 r1493, r1491, r1492; +st.shared.u32 [r1493], r607; +st.shared.u32 [r1493+4], r610; +st.shared.u32 [r1493+64], r667; +st.shared.u32 [r1493+68], r674; +st.shared.u32 [r1493+128], r704; +st.shared.u32 [r1493+132], r711; +st.shared.u32 [r1493+192], r741; +st.shared.u32 [r1493+196], r748; +st.shared.u32 [r1493+256], r778; +st.shared.u32 [r1493+260], r785; +st.shared.u32 [r1493+320], r815; +st.shared.u32 [r1493+324], r822; +st.shared.u32 [r1493+384], r852; +st.shared.u32 [r1493+388], r859; +st.shared.u32 [r1493+448], r889; +st.shared.u32 [r1493+452], r896; +barrier.sync 0; +mad.lo.s32 r1494, r1487, -56, r1493; +ld.shared.u32 r918, [r1494]; +ld.shared.u32 r921, [r1494+4]; +ld.shared.u32 r968, [r1494+2048]; +ld.shared.u32 r971, [r1494+2052]; +ld.shared.u32 r930, [r1494+4096]; +ld.shared.u32 r933, [r1494+4100]; +ld.shared.u32 r980, 
[r1494+6144]; +ld.shared.u32 r983, [r1494+6148]; +ld.shared.u32 r919, [r1494+8192]; +ld.shared.u32 r922, [r1494+8196]; +ld.shared.u32 r969, [r1494+10240]; +ld.shared.u32 r972, [r1494+10244]; +ld.shared.u32 r931, [r1494+12288]; +ld.shared.u32 r934, [r1494+12292]; +ld.shared.u32 r981, [r1494+14336]; +ld.shared.u32 r984, [r1494+14340]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; 
+} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1495, r1479, 192; +bfe.u32 r1496, r1479, 6, 2; +cvt.rn.f32.u32 f145, r1496; +mul.f32 f146, f145, 0f3E490FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ 
+mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 
r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +and.b32 r1497, r1489, 504; +add.s32 r1498, r1483, r1497; +barrier.sync 0; +and.b32 r1499, r1481, 12288; +add.s32 r1500, r1498, r1499; +st.shared.u32 [r1500], r1065; +st.shared.u32 [r1500+4], r1068; +st.shared.u32 [r1500+512], r1125; +st.shared.u32 [r1500+516], r1132; +st.shared.u32 [r1500+1024], r1162; +st.shared.u32 [r1500+1028], r1169; +st.shared.u32 [r1500+1536], r1199; +st.shared.u32 [r1500+1540], r1206; +st.shared.u32 [r1500+2048], r1236; +st.shared.u32 [r1500+2052], r1243; +st.shared.u32 [r1500+2560], r1273; +st.shared.u32 [r1500+2564], r1280; +st.shared.u32 [r1500+3072], r1310; +st.shared.u32 [r1500+3076], r1317; +st.shared.u32 [r1500+3584], r1347; +st.shared.u32 [r1500+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1501, r1495, -56, r1500; +ld.shared.u32 r1376, [r1501]; +ld.shared.u32 r1379, [r1501+4]; +ld.shared.u32 r1426, [r1501+2048]; +ld.shared.u32 r1429, [r1501+2052]; +ld.shared.u32 r1388, [r1501+4096]; +ld.shared.u32 r1391, [r1501+4100]; +ld.shared.u32 r1438, [r1501+6144]; +ld.shared.u32 r1441, [r1501+6148]; +ld.shared.u32 r1377, [r1501+8192]; +ld.shared.u32 r1380, 
[r1501+8196]; +ld.shared.u32 r1427, [r1501+10240]; +ld.shared.u32 r1430, [r1501+10244]; +ld.shared.u32 r1389, [r1501+12288]; +ld.shared.u32 r1392, [r1501+12292]; +ld.shared.u32 r1439, [r1501+14336]; +ld.shared.u32 r1442, [r1501+14340]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1393; +} +{ +add.f16x2 %0, r1375, r1387; +} +{ +add.f16x2 %1, r1378, r1390; +} +{ +sub.f16x2 %8, r1375, r1387; +} +{ +sub.f16x2 %9, r1378, r1390; +} +{ +add.f16x2 %4, r1381, r1396; +} +{ +add.f16x2 %5, r1384, r1399; +} +{ +sub.f16x2 %12, r1381, r1396; +} +{ +sub.f16x2 %13, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 %2, r1425, r1437; +} +{ +add.f16x2 %3, r1428, r1440; +} +{ +sub.f16x2 %10, r1425, r1437; +} +{ +sub.f16x2 %11, r1428, r1440; +} +{ +add.f16x2 %6, r1431, r1446; +} +{ +add.f16x2 %7, r1434, r1449; +} +{ +sub.f16x2 %14, r1431, r1446; +} +{ +sub.f16x2 %15, r1434, r1449; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): 
"r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<840, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<363>; +.reg .b32 r<2761>; +.reg .b64 rd<2>; +mov.u32 r2741, %tid.y; +shl.b32 r2742, r2741, 14; +mov.u32 r2743, %32; +add.s32 r2744, r2743, r2742; +mov.u32 r2745, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f330, 0f3F3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r101, {low, high}; +} +mov.f32 f340, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} 
+{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 
0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, 
r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2746, r2745, 127; +shl.b32 r2747, r2745, 7; +and.b32 r2748, r2747, -16384; +add.s32 r2749, r2744, r2748; +cvt.rn.f32.u32 f357, r2746; +mul.f32 f358, f357, 0f3B490FDB; +cos.approx.f32 f117, f358; +sin.approx.f32 f359, f358; +neg.f32 f118, f359; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; 
+cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} 
+{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 
r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, 
high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2750, r2747, 16256; +add.s32 r2751, r2749, r2750; +st.shared.v4.f32 [r2751], {r521, r524, r629, r636}; +st.shared.v4.f32 [r2751+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r2751+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r2751+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r2751+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r2751+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r2751+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r2751+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r2752, r2746, -120, r2751; +ld.shared.u32 r1176, [r2752]; +ld.shared.u32 r1179, [r2752+4]; +ld.shared.u32 r1372, [r2752+1024]; +ld.shared.u32 r1375, [r2752+1028]; +ld.shared.u32 r1226, [r2752+2048]; +ld.shared.u32 r1229, [r2752+2052]; +ld.shared.u32 r1422, [r2752+3072]; +ld.shared.u32 r1425, [r2752+3076]; +ld.shared.u32 r1188, [r2752+4096]; +ld.shared.u32 r1191, [r2752+4100]; +ld.shared.u32 r1384, [r2752+5120]; +ld.shared.u32 r1387, [r2752+5124]; +ld.shared.u32 r1238, [r2752+6144]; +ld.shared.u32 r1241, [r2752+6148]; +ld.shared.u32 r1434, [r2752+7168]; +ld.shared.u32 r1437, [r2752+7172]; +ld.shared.u32 r1177, [r2752+8192]; +ld.shared.u32 r1180, [r2752+8196]; +ld.shared.u32 r1373, [r2752+9216]; +ld.shared.u32 r1376, [r2752+9220]; +ld.shared.u32 r1227, [r2752+10240]; +ld.shared.u32 r1230, [r2752+10244]; +ld.shared.u32 r1423, [r2752+11264]; +ld.shared.u32 r1426, [r2752+11268]; +ld.shared.u32 r1189, [r2752+12288]; +ld.shared.u32 r1192, [r2752+12292]; +ld.shared.u32 r1385, [r2752+13312]; +ld.shared.u32 r1388, [r2752+13316]; +ld.shared.u32 r1239, [r2752+14336]; +ld.shared.u32 r1242, [r2752+14340]; +ld.shared.u32 r1435, [r2752+15360]; +ld.shared.u32 r1438, [r2752+15364]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ 
+add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ 
+sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, 
r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; 
+cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ 
+mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2753, r2745, 112; +bfe.u32 r2754, r2745, 4, 3; +cvt.rn.f32.u32 f360, r2754; +mul.f32 f361, f360, 0f3D490FDB; +cos.approx.f32 f267, f361; +sin.approx.f32 f362, f361; +neg.f32 f268, f362; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, 
high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ 
+mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, 
low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, 
r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +shl.b32 r2755, r2745, 3; +and.b32 r2756, r2755, 120; +add.s32 r2757, r2749, r2756; +barrier.sync 0; +and.b32 r2758, r2747, 14336; +add.s32 r2759, r2757, r2758; +st.shared.u32 [r2759], r1695; +st.shared.u32 [r2759+4], r1698; +st.shared.u32 [r2759+128], r1803; +st.shared.u32 [r2759+132], r1810; +st.shared.u32 [r2759+256], r1840; +st.shared.u32 [r2759+260], r1847; +st.shared.u32 [r2759+384], r1877; +st.shared.u32 [r2759+388], r1884; +st.shared.u32 [r2759+512], r1914; +st.shared.u32 [r2759+516], r1921; +st.shared.u32 [r2759+640], r1951; +st.shared.u32 [r2759+644], r1958; +st.shared.u32 [r2759+768], r1988; +st.shared.u32 [r2759+772], r1995; +st.shared.u32 [r2759+896], r2025; +st.shared.u32 [r2759+900], r2032; +st.shared.u32 [r2759+1024], r2062; +st.shared.u32 [r2759+1028], r2069; +st.shared.u32 [r2759+1152], r2099; +st.shared.u32 [r2759+1156], r2106; +st.shared.u32 [r2759+1280], r2136; +st.shared.u32 [r2759+1284], r2143; +st.shared.u32 [r2759+1408], r2173; +st.shared.u32 [r2759+1412], r2180; +st.shared.u32 [r2759+1536], r2210; +st.shared.u32 [r2759+1540], r2217; +st.shared.u32 [r2759+1664], r2247; +st.shared.u32 [r2759+1668], r2254; +st.shared.u32 [r2759+1792], r2284; +st.shared.u32 [r2759+1796], r2291; +st.shared.u32 [r2759+1920], r2321; +st.shared.u32 [r2759+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2760, r2753, -120, r2759; +ld.shared.u32 r2350, [r2760]; +ld.shared.u32 r2353, [r2760+4]; +ld.shared.u32 r2546, [r2760+1024]; +ld.shared.u32 r2549, [r2760+1028]; +ld.shared.u32 r2400, [r2760+2048]; +ld.shared.u32 r2403, [r2760+2052]; +ld.shared.u32 r2596, [r2760+3072]; +ld.shared.u32 r2599, [r2760+3076]; +ld.shared.u32 r2362, [r2760+4096]; +ld.shared.u32 r2365, [r2760+4100]; +ld.shared.u32 r2558, [r2760+5120]; +ld.shared.u32 r2561, [r2760+5124]; +ld.shared.u32 r2412, [r2760+6144]; +ld.shared.u32 
r2415, [r2760+6148]; +ld.shared.u32 r2608, [r2760+7168]; +ld.shared.u32 r2611, [r2760+7172]; +ld.shared.u32 r2351, [r2760+8192]; +ld.shared.u32 r2354, [r2760+8196]; +ld.shared.u32 r2547, [r2760+9216]; +ld.shared.u32 r2550, [r2760+9220]; +ld.shared.u32 r2401, [r2760+10240]; +ld.shared.u32 r2404, [r2760+10244]; +ld.shared.u32 r2597, [r2760+11264]; +ld.shared.u32 r2600, [r2760+11268]; +ld.shared.u32 r2363, [r2760+12288]; +ld.shared.u32 r2366, [r2760+12292]; +ld.shared.u32 r2559, [r2760+13312]; +ld.shared.u32 r2562, [r2760+13316]; +ld.shared.u32 r2413, [r2760+14336]; +ld.shared.u32 r2416, [r2760+14340]; +ld.shared.u32 r2609, [r2760+15360]; +ld.shared.u32 r2612, [r2760+15364]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 %0, r2375, r2425; +} +{ +add.f16x2 %1, r2378, r2428; +} +{ +sub.f16x2 %16, r2375, r2425; +} +{ +sub.f16x2 %17, r2378, r2428; +} +{ +add.f16x2 %4, r2387, r2469; +} +{ +add.f16x2 %5, r2390, r2475; +} +{ +sub.f16x2 %20, r2387, r2469; +} +{ +sub.f16x2 %21, r2390, r2475; +} +{ +add.f16x2 %8, r2381, r2434; +} +{ +add.f16x2 %9, r2384, r2479; +} +{ +sub.f16x2 %24, r2381, r2434; +} +{ +sub.f16x2 %25, r2384, r2479; +} +{ +add.f16x2 %12, r2393, r2487; +} +{ +add.f16x2 %13, r2396, r2493; +} +{ +sub.f16x2 %28, r2393, r2487; +} +{ +sub.f16x2 %29, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, 
r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 %2, r2571, r2621; +} +{ +add.f16x2 %3, r2574, r2624; +} +{ +sub.f16x2 %18, r2571, r2621; +} +{ +sub.f16x2 %19, r2574, r2624; +} +{ +add.f16x2 %6, r2583, r2665; +} +{ +add.f16x2 %7, r2586, r2671; +} +{ +sub.f16x2 %22, r2583, r2665; +} +{ +sub.f16x2 %23, r2586, r2671; +} +{ +add.f16x2 %10, r2577, r2630; +} +{ +add.f16x2 %11, r2580, r2675; +} +{ +sub.f16x2 %26, r2577, r2630; +} +{ +sub.f16x2 %27, r2580, r2675; +} +{ 
+add.f16x2 %14, r2589, r2683; +} +{ +add.f16x2 %15, r2592, r2689; +} +{ +sub.f16x2 %30, r2589, r2683; +} +{ +sub.f16x2 %31, r2592, r2689; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), 
"r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<841, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6003>; +.reg .b64 rd<3>; +mov.u32 r5917, %tid.y; +shl.b32 r5918, r5917, 14; +mov.u32 r5919, %64; +add.s32 r5920, r5919, r5918; +mov.u32 r5921, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, {low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; 
+cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, 
r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 
high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; 
+} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} 
+{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 
r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ 
+sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 
r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; 
+} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, 
r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, 
r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; 
+} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r5923, r5921, 8; +and.b32 r5924, r5923, -16384; +add.s32 r5925, r5920, r5924; +and.b32 r5938, r5921, 63; +cvt.rn.f32.u32 f845, r5938; +mul.f32 f846, f845, 0f3B490FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 
r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 
r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ 
+mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 
r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; 
+cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, 
r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ 
+fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 
r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r5926, r5923, 16128; +add.s32 r5927, r5925, r5926; +st.shared.v4.f32 [r5927], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r5927+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r5927+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r5927+48], {r1910, 
r1917, r1947, r1954}; +st.shared.v4.f32 [r5927+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r5927+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r5927+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r5927+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r5927+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r5927+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r5927+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r5927+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r5927+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r5927+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r5927+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r5927+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r5928, r5938, -248, r5927; +ld.shared.u32 r2864, [r5928]; +ld.shared.u32 r2867, [r5928+4]; +ld.shared.u32 r3480, [r5928+512]; +ld.shared.u32 r3483, [r5928+516]; +ld.shared.u32 r3060, [r5928+1024]; +ld.shared.u32 r3063, [r5928+1028]; +ld.shared.u32 r3676, [r5928+1536]; +ld.shared.u32 r3679, [r5928+1540]; +ld.shared.u32 r2914, [r5928+2048]; +ld.shared.u32 r2917, [r5928+2052]; +ld.shared.u32 r3530, [r5928+2560]; +ld.shared.u32 r3533, [r5928+2564]; +ld.shared.u32 r3110, [r5928+3072]; +ld.shared.u32 r3113, [r5928+3076]; +ld.shared.u32 r3726, [r5928+3584]; +ld.shared.u32 r3729, [r5928+3588]; +ld.shared.u32 r2876, [r5928+4096]; +ld.shared.u32 r2879, [r5928+4100]; +ld.shared.u32 r3492, [r5928+4608]; +ld.shared.u32 r3495, [r5928+4612]; +ld.shared.u32 r3072, [r5928+5120]; +ld.shared.u32 r3075, [r5928+5124]; +ld.shared.u32 r3688, [r5928+5632]; +ld.shared.u32 r3691, [r5928+5636]; +ld.shared.u32 r2926, [r5928+6144]; +ld.shared.u32 r2929, [r5928+6148]; +ld.shared.u32 r3542, [r5928+6656]; +ld.shared.u32 r3545, [r5928+6660]; +ld.shared.u32 r3122, [r5928+7168]; +ld.shared.u32 r3125, [r5928+7172]; +ld.shared.u32 r3738, [r5928+7680]; +ld.shared.u32 r3741, [r5928+7684]; +ld.shared.u32 r2865, [r5928+8192]; +ld.shared.u32 
r2868, [r5928+8196]; +ld.shared.u32 r3481, [r5928+8704]; +ld.shared.u32 r3484, [r5928+8708]; +ld.shared.u32 r3061, [r5928+9216]; +ld.shared.u32 r3064, [r5928+9220]; +ld.shared.u32 r3677, [r5928+9728]; +ld.shared.u32 r3680, [r5928+9732]; +ld.shared.u32 r2915, [r5928+10240]; +ld.shared.u32 r2918, [r5928+10244]; +ld.shared.u32 r3531, [r5928+10752]; +ld.shared.u32 r3534, [r5928+10756]; +ld.shared.u32 r3111, [r5928+11264]; +ld.shared.u32 r3114, [r5928+11268]; +ld.shared.u32 r3727, [r5928+11776]; +ld.shared.u32 r3730, [r5928+11780]; +ld.shared.u32 r2877, [r5928+12288]; +ld.shared.u32 r2880, [r5928+12292]; +ld.shared.u32 r3493, [r5928+12800]; +ld.shared.u32 r3496, [r5928+12804]; +ld.shared.u32 r3073, [r5928+13312]; +ld.shared.u32 r3076, [r5928+13316]; +ld.shared.u32 r3689, [r5928+13824]; +ld.shared.u32 r3692, [r5928+13828]; +ld.shared.u32 r2927, [r5928+14336]; +ld.shared.u32 r2930, [r5928+14340]; +ld.shared.u32 r3543, [r5928+14848]; +ld.shared.u32 r3546, [r5928+14852]; +ld.shared.u32 r3123, [r5928+15360]; +ld.shared.u32 r3126, [r5928+15364]; +ld.shared.u32 r3739, [r5928+15872]; +ld.shared.u32 r3742, [r5928+15876]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ 
+sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, 
r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, 
r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, 
high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; 
+} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ 
+sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, 
r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 
r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; 
+mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; 
+cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ 
+mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 
r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} 
+{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r5930, r5921, 5, 1; +cvt.rn.f32.u32 f848, r5930; +mul.f32 f849, f848, 0f3DC90FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +and.b32 r5937, r5921, 32; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, 
r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ 
+fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 
r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ 
+mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; 
+mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, 
r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ 
+fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ 
+neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +shl.b32 r5931, r5921, 3; +and.b32 r5932, r5931, 248; +add.s32 r5933, r5925, r5932; +barrier.sync 0; +and.b32 r5934, r5923, 8192; +add.s32 r5935, r5933, r5934; +st.shared.u32 [r5935], r4383; +st.shared.u32 [r5935+4], r4386; +st.shared.u32 [r5935+256], r4587; +st.shared.u32 [r5935+260], r4594; +st.shared.u32 [r5935+512], r4624; +st.shared.u32 [r5935+516], r4631; +st.shared.u32 [r5935+768], r4661; +st.shared.u32 [r5935+772], r4668; +st.shared.u32 [r5935+1024], r4698; +st.shared.u32 [r5935+1028], r4705; +st.shared.u32 [r5935+1280], r4735; +st.shared.u32 [r5935+1284], r4742; +st.shared.u32 [r5935+1536], r4772; +st.shared.u32 [r5935+1540], r4779; +st.shared.u32 [r5935+1792], r4809; +st.shared.u32 [r5935+1796], r4816; +st.shared.u32 [r5935+2048], r4846; +st.shared.u32 [r5935+2052], r4853; +st.shared.u32 [r5935+2304], r4883; +st.shared.u32 [r5935+2308], r4890; +st.shared.u32 [r5935+2560], r4920; +st.shared.u32 [r5935+2564], r4927; +st.shared.u32 [r5935+2816], 
r4957; +st.shared.u32 [r5935+2820], r4964; +st.shared.u32 [r5935+3072], r4994; +st.shared.u32 [r5935+3076], r5001; +st.shared.u32 [r5935+3328], r5031; +st.shared.u32 [r5935+3332], r5038; +st.shared.u32 [r5935+3584], r5068; +st.shared.u32 [r5935+3588], r5075; +st.shared.u32 [r5935+3840], r5105; +st.shared.u32 [r5935+3844], r5112; +st.shared.u32 [r5935+4096], r5142; +st.shared.u32 [r5935+4100], r5149; +st.shared.u32 [r5935+4352], r5179; +st.shared.u32 [r5935+4356], r5186; +st.shared.u32 [r5935+4608], r5216; +st.shared.u32 [r5935+4612], r5223; +st.shared.u32 [r5935+4864], r5253; +st.shared.u32 [r5935+4868], r5260; +st.shared.u32 [r5935+5120], r5290; +st.shared.u32 [r5935+5124], r5297; +st.shared.u32 [r5935+5376], r5327; +st.shared.u32 [r5935+5380], r5334; +st.shared.u32 [r5935+5632], r5364; +st.shared.u32 [r5935+5636], r5371; +st.shared.u32 [r5935+5888], r5401; +st.shared.u32 [r5935+5892], r5408; +st.shared.u32 [r5935+6144], r5438; +st.shared.u32 [r5935+6148], r5445; +st.shared.u32 [r5935+6400], r5475; +st.shared.u32 [r5935+6404], r5482; +st.shared.u32 [r5935+6656], r5512; +st.shared.u32 [r5935+6660], r5519; +st.shared.u32 [r5935+6912], r5549; +st.shared.u32 [r5935+6916], r5556; +st.shared.u32 [r5935+7168], r5586; +st.shared.u32 [r5935+7172], r5593; +st.shared.u32 [r5935+7424], r5623; +st.shared.u32 [r5935+7428], r5630; +st.shared.u32 [r5935+7680], r5660; +st.shared.u32 [r5935+7684], r5667; +st.shared.u32 [r5935+7936], r5697; +st.shared.u32 [r5935+7940], r5704; +barrier.sync 0; +mad.lo.s32 r5936, r5937, -248, r5935; +ld.shared.u32 r5726, [r5936]; +ld.shared.u32 r5729, [r5936+4]; +ld.shared.u32 r5738, [r5936+512]; +ld.shared.u32 r5741, [r5936+516]; +ld.shared.u32 r5750, [r5936+1024]; +ld.shared.u32 r5753, [r5936+1028]; +ld.shared.u32 r5762, [r5936+1536]; +ld.shared.u32 r5765, [r5936+1540]; +ld.shared.u32 r5774, [r5936+2048]; +ld.shared.u32 r5777, [r5936+2052]; +ld.shared.u32 r5786, [r5936+2560]; +ld.shared.u32 r5789, [r5936+2564]; +ld.shared.u32 r5798, [r5936+3072]; 
+ld.shared.u32 r5801, [r5936+3076]; +ld.shared.u32 r5810, [r5936+3584]; +ld.shared.u32 r5813, [r5936+3588]; +ld.shared.u32 r5822, [r5936+4096]; +ld.shared.u32 r5825, [r5936+4100]; +ld.shared.u32 r5834, [r5936+4608]; +ld.shared.u32 r5837, [r5936+4612]; +ld.shared.u32 r5846, [r5936+5120]; +ld.shared.u32 r5849, [r5936+5124]; +ld.shared.u32 r5858, [r5936+5632]; +ld.shared.u32 r5861, [r5936+5636]; +ld.shared.u32 r5870, [r5936+6144]; +ld.shared.u32 r5873, [r5936+6148]; +ld.shared.u32 r5882, [r5936+6656]; +ld.shared.u32 r5885, [r5936+6660]; +ld.shared.u32 r5894, [r5936+7168]; +ld.shared.u32 r5897, [r5936+7172]; +ld.shared.u32 r5906, [r5936+7680]; +ld.shared.u32 r5909, [r5936+7684]; +ld.shared.u32 r5727, [r5936+8192]; +ld.shared.u32 r5730, [r5936+8196]; +ld.shared.u32 r5739, [r5936+8704]; +ld.shared.u32 r5742, [r5936+8708]; +ld.shared.u32 r5751, [r5936+9216]; +ld.shared.u32 r5754, [r5936+9220]; +ld.shared.u32 r5763, [r5936+9728]; +ld.shared.u32 r5766, [r5936+9732]; +ld.shared.u32 r5775, [r5936+10240]; +ld.shared.u32 r5778, [r5936+10244]; +ld.shared.u32 r5787, [r5936+10752]; +ld.shared.u32 r5790, [r5936+10756]; +ld.shared.u32 r5799, [r5936+11264]; +ld.shared.u32 r5802, [r5936+11268]; +ld.shared.u32 r5811, [r5936+11776]; +ld.shared.u32 r5814, [r5936+11780]; +ld.shared.u32 r5823, [r5936+12288]; +ld.shared.u32 r5826, [r5936+12292]; +ld.shared.u32 r5835, [r5936+12800]; +ld.shared.u32 r5838, [r5936+12804]; +ld.shared.u32 r5847, [r5936+13312]; +ld.shared.u32 r5850, [r5936+13316]; +ld.shared.u32 r5859, [r5936+13824]; +ld.shared.u32 r5862, [r5936+13828]; +ld.shared.u32 r5871, [r5936+14336]; +ld.shared.u32 r5874, [r5936+14340]; +ld.shared.u32 r5883, [r5936+14848]; +ld.shared.u32 r5886, [r5936+14852]; +ld.shared.u32 r5895, [r5936+15360]; +ld.shared.u32 r5898, [r5936+15364]; +ld.shared.u32 r5907, [r5936+15872]; +ld.shared.u32 r5910, [r5936+15876]; +{ +add.f16x2 %0, r5726, r5727; +} +{ +add.f16x2 %1, r5729, r5730; +} +{ +sub.f16x2 %32, r5726, r5727; +} +{ +sub.f16x2 %33, r5729, r5730; 
+} +{ +add.f16x2 %2, r5738, r5739; +} +{ +add.f16x2 %3, r5741, r5742; +} +{ +sub.f16x2 %34, r5738, r5739; +} +{ +sub.f16x2 %35, r5741, r5742; +} +{ +add.f16x2 %4, r5750, r5751; +} +{ +add.f16x2 %5, r5753, r5754; +} +{ +sub.f16x2 %36, r5750, r5751; +} +{ +sub.f16x2 %37, r5753, r5754; +} +{ +add.f16x2 %6, r5762, r5763; +} +{ +add.f16x2 %7, r5765, r5766; +} +{ +sub.f16x2 %38, r5762, r5763; +} +{ +sub.f16x2 %39, r5765, r5766; +} +{ +add.f16x2 %8, r5774, r5775; +} +{ +add.f16x2 %9, r5777, r5778; +} +{ +sub.f16x2 %40, r5774, r5775; +} +{ +sub.f16x2 %41, r5777, r5778; +} +{ +add.f16x2 %10, r5786, r5787; +} +{ +add.f16x2 %11, r5789, r5790; +} +{ +sub.f16x2 %42, r5786, r5787; +} +{ +sub.f16x2 %43, r5789, r5790; +} +{ +add.f16x2 %12, r5798, r5799; +} +{ +add.f16x2 %13, r5801, r5802; +} +{ +sub.f16x2 %44, r5798, r5799; +} +{ +sub.f16x2 %45, r5801, r5802; +} +{ +add.f16x2 %14, r5810, r5811; +} +{ +add.f16x2 %15, r5813, r5814; +} +{ +sub.f16x2 %46, r5810, r5811; +} +{ +sub.f16x2 %47, r5813, r5814; +} +{ +add.f16x2 %16, r5822, r5823; +} +{ +add.f16x2 %17, r5825, r5826; +} +{ +sub.f16x2 %48, r5822, r5823; +} +{ +sub.f16x2 %49, r5825, r5826; +} +{ +add.f16x2 %18, r5834, r5835; +} +{ +add.f16x2 %19, r5837, r5838; +} +{ +sub.f16x2 %50, r5834, r5835; +} +{ +sub.f16x2 %51, r5837, r5838; +} +{ +add.f16x2 %20, r5846, r5847; +} +{ +add.f16x2 %21, r5849, r5850; +} +{ +sub.f16x2 %52, r5846, r5847; +} +{ +sub.f16x2 %53, r5849, r5850; +} +{ +add.f16x2 %22, r5858, r5859; +} +{ +add.f16x2 %23, r5861, r5862; +} +{ +sub.f16x2 %54, r5858, r5859; +} +{ +sub.f16x2 %55, r5861, r5862; +} +{ +add.f16x2 %24, r5870, r5871; +} +{ +add.f16x2 %25, r5873, r5874; +} +{ +sub.f16x2 %56, r5870, r5871; +} +{ +sub.f16x2 %57, r5873, r5874; +} +{ +add.f16x2 %26, r5882, r5883; +} +{ +add.f16x2 %27, r5885, r5886; +} +{ +sub.f16x2 %58, r5882, r5883; +} +{ +sub.f16x2 %59, r5885, r5886; +} +{ +add.f16x2 %28, r5894, r5895; +} +{ +add.f16x2 %29, r5897, r5898; +} +{ +sub.f16x2 %60, r5894, r5895; +} +{ +sub.f16x2 %61, 
r5897, r5898; +} +{ +add.f16x2 %30, r5906, r5907; +} +{ +add.f16x2 %31, r5909, r5910; +} +{ +sub.f16x2 %62, r5906, r5907; +} +{ +sub.f16x2 %63, r5909, r5910; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), 
"=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<842, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6002>; +.reg .b64 rd<3>; +mov.u32 r5917, %tid.y; +shl.b32 r5918, r5917, 13; +mov.u32 r5919, %64; +add.s32 r5920, r5919, r5918; +mov.u32 r5921, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, {low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 
high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, 
%89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; 
+mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ 
+mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ 
+sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, 
r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, 
r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; 
+} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 
r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 
r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, 
r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 
r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r5923, r5921, 7; +and.b32 r5924, r5923, -8192; +add.s32 r5925, r5920, r5924; +and.b32 r5937, r5921, 63; +cvt.rn.f32.u32 f845, r5937; +mul.f32 f846, f845, 0f3B490FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ 
+mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 
r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; 
+cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, 
r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ 
+fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 
r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ 
+mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; 
+mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, 
r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r5926, r5923, 8064; +add.s32 r5927, r5925, r5926; +st.shared.v4.f32 [r5927], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r5927+16], {r1836, r1873, r1910, r1947}; 
+st.shared.v4.f32 [r5927+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r5927+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r5927+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r5927+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r5927+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r5927+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r5928, r5937, -124, r5927; +ld.shared.u32 r2864, [r5928]; +ld.shared.u32 r3480, [r5928+256]; +ld.shared.u32 r3060, [r5928+512]; +ld.shared.u32 r3676, [r5928+768]; +ld.shared.u32 r2914, [r5928+1024]; +ld.shared.u32 r3530, [r5928+1280]; +ld.shared.u32 r3110, [r5928+1536]; +ld.shared.u32 r3726, [r5928+1792]; +ld.shared.u32 r2876, [r5928+2048]; +ld.shared.u32 r3492, [r5928+2304]; +ld.shared.u32 r3072, [r5928+2560]; +ld.shared.u32 r3688, [r5928+2816]; +ld.shared.u32 r2926, [r5928+3072]; +ld.shared.u32 r3542, [r5928+3328]; +ld.shared.u32 r3122, [r5928+3584]; +ld.shared.u32 r3738, [r5928+3840]; +ld.shared.u32 r2865, [r5928+4096]; +ld.shared.u32 r3481, [r5928+4352]; +ld.shared.u32 r3061, [r5928+4608]; +ld.shared.u32 r3677, [r5928+4864]; +ld.shared.u32 r2915, [r5928+5120]; +ld.shared.u32 r3531, [r5928+5376]; +ld.shared.u32 r3111, [r5928+5632]; +ld.shared.u32 r3727, [r5928+5888]; +ld.shared.u32 r2877, [r5928+6144]; +ld.shared.u32 r3493, [r5928+6400]; +ld.shared.u32 r3073, [r5928+6656]; +ld.shared.u32 r3689, [r5928+6912]; +ld.shared.u32 r2927, [r5928+7168]; +ld.shared.u32 r3543, [r5928+7424]; +ld.shared.u32 r3123, [r5928+7680]; +ld.shared.u32 r3739, [r5928+7936]; +barrier.sync 0; +st.shared.v4.f32 [r5927], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r5927+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r5927+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r5927+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r5927+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r5927+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r5927+96], {r2583, r2620, r2657, r2694}; 
+st.shared.v4.f32 [r5927+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r5928]; +ld.shared.u32 r3483, [r5928+256]; +ld.shared.u32 r3063, [r5928+512]; +ld.shared.u32 r3679, [r5928+768]; +ld.shared.u32 r2917, [r5928+1024]; +ld.shared.u32 r3533, [r5928+1280]; +ld.shared.u32 r3113, [r5928+1536]; +ld.shared.u32 r3729, [r5928+1792]; +ld.shared.u32 r2879, [r5928+2048]; +ld.shared.u32 r3495, [r5928+2304]; +ld.shared.u32 r3075, [r5928+2560]; +ld.shared.u32 r3691, [r5928+2816]; +ld.shared.u32 r2929, [r5928+3072]; +ld.shared.u32 r3545, [r5928+3328]; +ld.shared.u32 r3125, [r5928+3584]; +ld.shared.u32 r3741, [r5928+3840]; +ld.shared.u32 r2868, [r5928+4096]; +ld.shared.u32 r3484, [r5928+4352]; +ld.shared.u32 r3064, [r5928+4608]; +ld.shared.u32 r3680, [r5928+4864]; +ld.shared.u32 r2918, [r5928+5120]; +ld.shared.u32 r3534, [r5928+5376]; +ld.shared.u32 r3114, [r5928+5632]; +ld.shared.u32 r3730, [r5928+5888]; +ld.shared.u32 r2880, [r5928+6144]; +ld.shared.u32 r3496, [r5928+6400]; +ld.shared.u32 r3076, [r5928+6656]; +ld.shared.u32 r3692, [r5928+6912]; +ld.shared.u32 r2930, [r5928+7168]; +ld.shared.u32 r3546, [r5928+7424]; +ld.shared.u32 r3126, [r5928+7680]; +ld.shared.u32 r3742, [r5928+7936]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, 
r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ 
+add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ 
+mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ 
+add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, 
r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ 
+add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, 
r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 
r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 
high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, 
r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; 
+} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, 
r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r5929, r5921, 32; +bfe.u32 r5930, r5921, 5, 1; +shl.b32 r5931, r5921, 2; +and.b32 r5932, r5931, 124; +add.s32 r5933, r5925, r5932; +cvt.rn.f32.u32 f848, r5930; +mul.f32 f849, f848, 0f3DC90FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 
r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 
r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; 
+} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ 
+fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ 
+neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, 
{low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +barrier.sync 0; +and.b32 r5934, r5923, 4096; +add.s32 r5935, r5933, r5934; +st.shared.u32 [r5935], r4383; +st.shared.u32 [r5935+128], r4587; +st.shared.u32 [r5935+256], r4624; +st.shared.u32 [r5935+384], r4661; +st.shared.u32 [r5935+512], r4698; +st.shared.u32 [r5935+640], r4735; +st.shared.u32 [r5935+768], r4772; +st.shared.u32 [r5935+896], r4809; +st.shared.u32 [r5935+1024], r4846; +st.shared.u32 [r5935+1152], r4883; +st.shared.u32 [r5935+1280], r4920; +st.shared.u32 [r5935+1408], r4957; +st.shared.u32 [r5935+1536], r4994; +st.shared.u32 [r5935+1664], r5031; +st.shared.u32 [r5935+1792], r5068; +st.shared.u32 [r5935+1920], r5105; +st.shared.u32 [r5935+2048], r5142; +st.shared.u32 [r5935+2176], r5179; +st.shared.u32 [r5935+2304], r5216; +st.shared.u32 [r5935+2432], r5253; 
+st.shared.u32 [r5935+2560], r5290; +st.shared.u32 [r5935+2688], r5327; +st.shared.u32 [r5935+2816], r5364; +st.shared.u32 [r5935+2944], r5401; +st.shared.u32 [r5935+3072], r5438; +st.shared.u32 [r5935+3200], r5475; +st.shared.u32 [r5935+3328], r5512; +st.shared.u32 [r5935+3456], r5549; +st.shared.u32 [r5935+3584], r5586; +st.shared.u32 [r5935+3712], r5623; +st.shared.u32 [r5935+3840], r5660; +st.shared.u32 [r5935+3968], r5697; +barrier.sync 0; +mad.lo.s32 r5936, r5929, -124, r5935; +ld.shared.u32 r5726, [r5936]; +ld.shared.u32 r5738, [r5936+256]; +ld.shared.u32 r5750, [r5936+512]; +ld.shared.u32 r5762, [r5936+768]; +ld.shared.u32 r5774, [r5936+1024]; +ld.shared.u32 r5786, [r5936+1280]; +ld.shared.u32 r5798, [r5936+1536]; +ld.shared.u32 r5810, [r5936+1792]; +ld.shared.u32 r5822, [r5936+2048]; +ld.shared.u32 r5834, [r5936+2304]; +ld.shared.u32 r5846, [r5936+2560]; +ld.shared.u32 r5858, [r5936+2816]; +ld.shared.u32 r5870, [r5936+3072]; +ld.shared.u32 r5882, [r5936+3328]; +ld.shared.u32 r5894, [r5936+3584]; +ld.shared.u32 r5906, [r5936+3840]; +ld.shared.u32 r5727, [r5936+4096]; +ld.shared.u32 r5739, [r5936+4352]; +ld.shared.u32 r5751, [r5936+4608]; +ld.shared.u32 r5763, [r5936+4864]; +ld.shared.u32 r5775, [r5936+5120]; +ld.shared.u32 r5787, [r5936+5376]; +ld.shared.u32 r5799, [r5936+5632]; +ld.shared.u32 r5811, [r5936+5888]; +ld.shared.u32 r5823, [r5936+6144]; +ld.shared.u32 r5835, [r5936+6400]; +ld.shared.u32 r5847, [r5936+6656]; +ld.shared.u32 r5859, [r5936+6912]; +ld.shared.u32 r5871, [r5936+7168]; +ld.shared.u32 r5883, [r5936+7424]; +ld.shared.u32 r5895, [r5936+7680]; +ld.shared.u32 r5907, [r5936+7936]; +barrier.sync 0; +st.shared.u32 [r5935], r4386; +st.shared.u32 [r5935+128], r4594; +st.shared.u32 [r5935+256], r4631; +st.shared.u32 [r5935+384], r4668; +st.shared.u32 [r5935+512], r4705; +st.shared.u32 [r5935+640], r4742; +st.shared.u32 [r5935+768], r4779; +st.shared.u32 [r5935+896], r4816; +st.shared.u32 [r5935+1024], r4853; +st.shared.u32 [r5935+1152], r4890; 
+st.shared.u32 [r5935+1280], r4927; +st.shared.u32 [r5935+1408], r4964; +st.shared.u32 [r5935+1536], r5001; +st.shared.u32 [r5935+1664], r5038; +st.shared.u32 [r5935+1792], r5075; +st.shared.u32 [r5935+1920], r5112; +st.shared.u32 [r5935+2048], r5149; +st.shared.u32 [r5935+2176], r5186; +st.shared.u32 [r5935+2304], r5223; +st.shared.u32 [r5935+2432], r5260; +st.shared.u32 [r5935+2560], r5297; +st.shared.u32 [r5935+2688], r5334; +st.shared.u32 [r5935+2816], r5371; +st.shared.u32 [r5935+2944], r5408; +st.shared.u32 [r5935+3072], r5445; +st.shared.u32 [r5935+3200], r5482; +st.shared.u32 [r5935+3328], r5519; +st.shared.u32 [r5935+3456], r5556; +st.shared.u32 [r5935+3584], r5593; +st.shared.u32 [r5935+3712], r5630; +st.shared.u32 [r5935+3840], r5667; +st.shared.u32 [r5935+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r5936]; +ld.shared.u32 r5741, [r5936+256]; +ld.shared.u32 r5753, [r5936+512]; +ld.shared.u32 r5765, [r5936+768]; +ld.shared.u32 r5777, [r5936+1024]; +ld.shared.u32 r5789, [r5936+1280]; +ld.shared.u32 r5801, [r5936+1536]; +ld.shared.u32 r5813, [r5936+1792]; +ld.shared.u32 r5825, [r5936+2048]; +ld.shared.u32 r5837, [r5936+2304]; +ld.shared.u32 r5849, [r5936+2560]; +ld.shared.u32 r5861, [r5936+2816]; +ld.shared.u32 r5873, [r5936+3072]; +ld.shared.u32 r5885, [r5936+3328]; +ld.shared.u32 r5897, [r5936+3584]; +ld.shared.u32 r5909, [r5936+3840]; +ld.shared.u32 r5730, [r5936+4096]; +ld.shared.u32 r5742, [r5936+4352]; +ld.shared.u32 r5754, [r5936+4608]; +ld.shared.u32 r5766, [r5936+4864]; +ld.shared.u32 r5778, [r5936+5120]; +ld.shared.u32 r5790, [r5936+5376]; +ld.shared.u32 r5802, [r5936+5632]; +ld.shared.u32 r5814, [r5936+5888]; +ld.shared.u32 r5826, [r5936+6144]; +ld.shared.u32 r5838, [r5936+6400]; +ld.shared.u32 r5850, [r5936+6656]; +ld.shared.u32 r5862, [r5936+6912]; +ld.shared.u32 r5874, [r5936+7168]; +ld.shared.u32 r5886, [r5936+7424]; +ld.shared.u32 r5898, [r5936+7680]; +ld.shared.u32 r5910, [r5936+7936]; +{ +add.f16x2 %0, r5726, r5727; +} +{ 
+add.f16x2 %1, r5729, r5730; +} +{ +sub.f16x2 %32, r5726, r5727; +} +{ +sub.f16x2 %33, r5729, r5730; +} +{ +add.f16x2 %2, r5738, r5739; +} +{ +add.f16x2 %3, r5741, r5742; +} +{ +sub.f16x2 %34, r5738, r5739; +} +{ +sub.f16x2 %35, r5741, r5742; +} +{ +add.f16x2 %4, r5750, r5751; +} +{ +add.f16x2 %5, r5753, r5754; +} +{ +sub.f16x2 %36, r5750, r5751; +} +{ +sub.f16x2 %37, r5753, r5754; +} +{ +add.f16x2 %6, r5762, r5763; +} +{ +add.f16x2 %7, r5765, r5766; +} +{ +sub.f16x2 %38, r5762, r5763; +} +{ +sub.f16x2 %39, r5765, r5766; +} +{ +add.f16x2 %8, r5774, r5775; +} +{ +add.f16x2 %9, r5777, r5778; +} +{ +sub.f16x2 %40, r5774, r5775; +} +{ +sub.f16x2 %41, r5777, r5778; +} +{ +add.f16x2 %10, r5786, r5787; +} +{ +add.f16x2 %11, r5789, r5790; +} +{ +sub.f16x2 %42, r5786, r5787; +} +{ +sub.f16x2 %43, r5789, r5790; +} +{ +add.f16x2 %12, r5798, r5799; +} +{ +add.f16x2 %13, r5801, r5802; +} +{ +sub.f16x2 %44, r5798, r5799; +} +{ +sub.f16x2 %45, r5801, r5802; +} +{ +add.f16x2 %14, r5810, r5811; +} +{ +add.f16x2 %15, r5813, r5814; +} +{ +sub.f16x2 %46, r5810, r5811; +} +{ +sub.f16x2 %47, r5813, r5814; +} +{ +add.f16x2 %16, r5822, r5823; +} +{ +add.f16x2 %17, r5825, r5826; +} +{ +sub.f16x2 %48, r5822, r5823; +} +{ +sub.f16x2 %49, r5825, r5826; +} +{ +add.f16x2 %18, r5834, r5835; +} +{ +add.f16x2 %19, r5837, r5838; +} +{ +sub.f16x2 %50, r5834, r5835; +} +{ +sub.f16x2 %51, r5837, r5838; +} +{ +add.f16x2 %20, r5846, r5847; +} +{ +add.f16x2 %21, r5849, r5850; +} +{ +sub.f16x2 %52, r5846, r5847; +} +{ +sub.f16x2 %53, r5849, r5850; +} +{ +add.f16x2 %22, r5858, r5859; +} +{ +add.f16x2 %23, r5861, r5862; +} +{ +sub.f16x2 %54, r5858, r5859; +} +{ +sub.f16x2 %55, r5861, r5862; +} +{ +add.f16x2 %24, r5870, r5871; +} +{ +add.f16x2 %25, r5873, r5874; +} +{ +sub.f16x2 %56, r5870, r5871; +} +{ +sub.f16x2 %57, r5873, r5874; +} +{ +add.f16x2 %26, r5882, r5883; +} +{ +add.f16x2 %27, r5885, r5886; +} +{ +sub.f16x2 %58, r5882, r5883; +} +{ +sub.f16x2 %59, r5885, r5886; +} +{ +add.f16x2 %28, r5894, 
r5895; +} +{ +add.f16x2 %29, r5897, r5898; +} +{ +sub.f16x2 %60, r5894, r5895; +} +{ +sub.f16x2 %61, r5897, r5898; +} +{ +add.f16x2 %30, r5906, r5907; +} +{ +add.f16x2 %31, r5909, r5910; +} +{ +sub.f16x2 %62, r5906, r5907; +} +{ +sub.f16x2 %63, r5909, r5910; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), 
"=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), 
"r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<843, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .b16 rs<3>; +.reg .f32 f<71>; +.reg .b32 r<894>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r29; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r32; +} +{ +add.f16x2 r52, r20, r35; +} +{ +sub.f16x2 r55, r17, r32; +} +{ +sub.f16x2 r58, r20, r35; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 511; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3B490FDB; +cvt.u16.u32 rs2, r9; +and.b16 rs1, rs2, 511; +mov.f32 f70, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB6_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB6_3; +mov.f32 f70, 0f3B490FC6; +bra.uni LBB6_4; +LBB6_3: +cos.approx.f32 f70, f1; +LBB6_4: +sin.approx.f32 f57, f1; +neg.f32 f8, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f8; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +neg.f16x2 r71, r68; +} +{ +fma.rn.f16x2 r73, r49, r64, r71; +} +{ +mul.f16x2 r77, r49, 
r66; +} +{ +fma.rn.f16x2 r80, r52, r64, r77; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f53, 0fBF800000; +mov.f32 f54, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +neg.f16x2 r108, r105; +} +{ +fma.rn.f16x2 r110, r43, r101, r108; +} +{ +mul.f16x2 r114, r43, r103; +} +{ +fma.rn.f16x2 r117, r46, r101, r114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +neg.f16x2 r145, r142; +} +{ +fma.rn.f16x2 r147, r55, r138, r145; +} +{ +mul.f16x2 r151, r55, r140; +} +{ +fma.rn.f16x2 r154, r58, r138, r151; +} +barrier.sync 0; +mov.u32 r855, %tid.y; +shl.b32 r856, r855, 14; +mov.u32 r857, %8; +add.s32 r858, r857, r856; +shl.b32 r859, r9, 5; +and.b32 r860, r859, -16384; +add.s32 r861, r858, r860; +shl.b32 r862, r10, 5; +add.s32 r863, r861, r862; +st.shared.v4.f32 [r863], {r37, 
r40, r73, r80}; +st.shared.v4.f32 [r863+16], {r110, r117, r147, r154}; +barrier.sync 0; +mad.lo.s32 r864, r10, -24, r863; +ld.shared.u32 r176, [r864]; +ld.shared.u32 r179, [r864+4]; +ld.shared.u32 r188, [r864+4096]; +ld.shared.u32 r191, [r864+4100]; +ld.shared.u32 r177, [r864+8192]; +ld.shared.u32 r180, [r864+8196]; +ld.shared.u32 r189, [r864+12288]; +ld.shared.u32 r192, [r864+12292]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r193; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r196; +} +{ +add.f16x2 r216, r184, r199; +} +{ +sub.f16x2 r219, r181, r196; +} +{ +sub.f16x2 r222, r184, r199; +} +and.b32 r865, r9, 508; +bfe.u32 r866, r9, 2, 7; +cvt.rn.f32.u32 f58, r866; +mul.f32 f59, f58, 0f3C490FDB; +cos.approx.f32 f17, f59; +sin.approx.f32 f60, f59; +neg.f32 f18, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +neg.f16x2 r235, r232; +} +{ +fma.rn.f16x2 r237, r213, r228, r235; +} +{ +mul.f16x2 r241, r213, r230; +} +{ +fma.rn.f16x2 r244, r216, r228, r241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +neg.f16x2 r272, r269; +} +{ +fma.rn.f16x2 r274, r207, r265, r272; +} +{ +mul.f16x2 r278, r207, r267; +} +{ +fma.rn.f16x2 r281, r210, r265, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +neg.f16x2 r309, r306; +} +{ +fma.rn.f16x2 r311, r219, r302, r309; +} +{ +mul.f16x2 r315, r219, r304; +} +{ +fma.rn.f16x2 r318, r222, r302, r315; +} +shl.b32 r867, r9, 3; +and.b32 r868, r867, 24; +add.s32 r869, r861, r868; +barrier.sync 0; +and.b32 r870, r859, 16256; +add.s32 r871, r869, r870; +st.shared.u32 [r871], r201; +st.shared.u32 [r871+4], r204; +st.shared.u32 [r871+32], r237; +st.shared.u32 [r871+36], r244; +st.shared.u32 [r871+64], r274; +st.shared.u32 [r871+68], r281; +st.shared.u32 [r871+96], r311; +st.shared.u32 [r871+100], r318; +barrier.sync 0; +mad.lo.s32 r872, r865, -24, r871; +ld.shared.u32 r340, [r872]; +ld.shared.u32 r343, [r872+4]; +ld.shared.u32 r352, [r872+4096]; +ld.shared.u32 r355, [r872+4100]; +ld.shared.u32 r341, [r872+8192]; +ld.shared.u32 r344, [r872+8196]; +ld.shared.u32 r353, [r872+12288]; +ld.shared.u32 r356, [r872+12292]; +{ 
+add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r357; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r360; +} +{ +add.f16x2 r380, r348, r363; +} +{ +sub.f16x2 r383, r345, r360; +} +{ +sub.f16x2 r386, r348, r363; +} +and.b32 r873, r9, 496; +bfe.u32 r874, r9, 4, 5; +cvt.rn.f32.u32 f61, r874; +mul.f32 f62, f61, 0f3D490FDB; +cos.approx.f32 f27, f62; +sin.approx.f32 f63, f62; +neg.f32 f28, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r377, r392, r399; +} +{ +mul.f16x2 r405, r377, r394; +} +{ +fma.rn.f16x2 r408, r380, r392, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r371, r429, r436; +} +{ +mul.f16x2 r442, 
r371, r431; +} +{ +fma.rn.f16x2 r445, r374, r429, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r383, r466, r473; +} +{ +mul.f16x2 r479, r383, r468; +} +{ +fma.rn.f16x2 r482, r386, r466, r479; +} +and.b32 r875, r867, 120; +add.s32 r876, r861, r875; +barrier.sync 0; +and.b32 r877, r859, 15872; +add.s32 r878, r876, r877; +st.shared.u32 [r878], r365; +st.shared.u32 [r878+4], r368; +st.shared.u32 [r878+128], r401; +st.shared.u32 [r878+132], r408; +st.shared.u32 [r878+256], r438; +st.shared.u32 [r878+260], r445; +st.shared.u32 [r878+384], r475; +st.shared.u32 [r878+388], r482; +barrier.sync 0; +mad.lo.s32 r879, r873, -24, r878; +ld.shared.u32 r504, [r879]; +ld.shared.u32 r507, [r879+4]; +ld.shared.u32 r516, [r879+4096]; +ld.shared.u32 r519, [r879+4100]; +ld.shared.u32 r505, [r879+8192]; +ld.shared.u32 r508, [r879+8196]; +ld.shared.u32 r517, [r879+12288]; +ld.shared.u32 r520, [r879+12292]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r521; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, 
r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r524; +} +{ +add.f16x2 r544, r512, r527; +} +{ +sub.f16x2 r547, r509, r524; +} +{ +sub.f16x2 r550, r512, r527; +} +and.b32 r880, r9, 448; +bfe.u32 r881, r9, 6, 3; +cvt.rn.f32.u32 f64, r881; +mul.f32 f65, f64, 0f3E490FDB; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +neg.f16x2 r563, r560; +} +{ +fma.rn.f16x2 r565, r541, r556, r563; +} +{ +mul.f16x2 r569, r541, r558; +} +{ +fma.rn.f16x2 r572, r544, r556, r569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +neg.f16x2 r600, r597; +} +{ +fma.rn.f16x2 r602, r535, r593, r600; +} +{ +mul.f16x2 r606, r535, r595; +} +{ +fma.rn.f16x2 r609, r538, r593, r606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +neg.f16x2 r637, r634; +} +{ +fma.rn.f16x2 r639, r547, r630, r637; +} +{ +mul.f16x2 r643, r547, r632; +} +{ +fma.rn.f16x2 r646, r550, r630, r643; +} +and.b32 r882, r867, 504; +add.s32 r883, r861, r882; +barrier.sync 0; +and.b32 r884, r859, 14336; +add.s32 r885, r883, r884; +st.shared.u32 [r885], r529; +st.shared.u32 [r885+4], r532; +st.shared.u32 [r885+512], r565; +st.shared.u32 [r885+516], r572; +st.shared.u32 [r885+1024], r602; +st.shared.u32 [r885+1028], r609; +st.shared.u32 [r885+1536], r639; +st.shared.u32 [r885+1540], r646; +barrier.sync 0; +mad.lo.s32 r886, r880, -24, r885; +ld.shared.u32 r668, [r886]; +ld.shared.u32 r671, [r886+4]; +ld.shared.u32 r680, [r886+4096]; +ld.shared.u32 r683, [r886+4100]; +ld.shared.u32 r669, [r886+8192]; +ld.shared.u32 r672, [r886+8196]; +ld.shared.u32 r681, [r886+12288]; +ld.shared.u32 r684, [r886+12292]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +and.b32 r887, r9, 256; +bfe.u32 r888, r9, 8, 1; +cvt.rn.f32.u32 f67, r888; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f47, f68; +sin.approx.f32 f69, f68; +neg.f32 f48, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f47; +cvt.rn.f16.f32 high, f48; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r722, {high, high}; +} +{ +mul.f16x2 r724, r708, r722; +} +{ +neg.f16x2 r727, r724; +} +{ +fma.rn.f16x2 r729, r705, r720, r727; +} +{ +mul.f16x2 r733, r705, r722; +} +{ +fma.rn.f16x2 r736, r708, r720, r733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r742, r744; +} +{ +mul.f16x2 r748, r717, r740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r751, {high, low}; +} +{ +fma.rn.f16x2 r753, r745, r751, r748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r759, {high, high}; +} +{ +mul.f16x2 r761, r702, r759; +} +{ +neg.f16x2 r764, r761; +} +{ +fma.rn.f16x2 r766, r699, r757, r764; +} +{ +mul.f16x2 r770, r699, r759; +} +{ +fma.rn.f16x2 r773, r702, r757, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r781, {low, high}; +} +{ +mul.f16x2 r782, r779, r781; +} +{ +mul.f16x2 r785, r753, r777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r788, {high, low}; +} +{ +fma.rn.f16x2 r790, r782, r788, r785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r796, {high, high}; +} +{ +mul.f16x2 r798, r714, r796; +} +{ +neg.f16x2 r801, r798; +} +{ +fma.rn.f16x2 r803, r711, r794, 
r801; +} +{ +mul.f16x2 r807, r711, r796; +} +{ +fma.rn.f16x2 r810, r714, r794, r807; +} +and.b32 r889, r867, 2040; +add.s32 r890, r861, r889; +barrier.sync 0; +and.b32 r891, r859, 8192; +add.s32 r892, r890, r891; +st.shared.u32 [r892], r693; +st.shared.u32 [r892+4], r696; +st.shared.u32 [r892+2048], r729; +st.shared.u32 [r892+2052], r736; +st.shared.u32 [r892+4096], r766; +st.shared.u32 [r892+4100], r773; +st.shared.u32 [r892+6144], r803; +st.shared.u32 [r892+6148], r810; +barrier.sync 0; +mad.lo.s32 r893, r887, -24, r892; +ld.shared.u32 r832, [r893]; +ld.shared.u32 r835, [r893+4]; +ld.shared.u32 r844, [r893+4096]; +ld.shared.u32 r847, [r893+4100]; +ld.shared.u32 r833, [r893+8192]; +ld.shared.u32 r836, [r893+8196]; +ld.shared.u32 r845, [r893+12288]; +ld.shared.u32 r848, [r893+12292]; +{ +add.f16x2 %0, r832, r833; +} +{ +add.f16x2 %1, r835, r836; +} +{ +sub.f16x2 %4, r832, r833; +} +{ +sub.f16x2 %5, r835, r836; +} +{ +add.f16x2 %2, r844, r845; +} +{ +add.f16x2 %3, r847, r848; +} +{ +sub.f16x2 %6, r844, r845; +} +{ +sub.f16x2 %7, r847, r848; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<844, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .b16 rs<3>; +.reg .f32 f<71>; +.reg .b32 r<894>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, 
%11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r29; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r32; +} +{ +add.f16x2 r52, r20, r35; +} +{ +sub.f16x2 r55, r17, r32; +} +{ +sub.f16x2 r58, r20, r35; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 511; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3B490FDB; +cvt.u16.u32 rs2, r9; +and.b16 rs1, rs2, 511; +mov.f32 f70, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB7_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB7_3; +mov.f32 f70, 0f3B490FC6; +bra.uni LBB7_4; +LBB7_3: +cos.approx.f32 f70, f1; +LBB7_4: +mov.u32 r855, %tid.y; +shl.b32 r856, r855, 13; +mov.u32 r857, %8; +add.s32 r858, r857, r856; +shl.b32 r859, r9, 4; +and.b32 r860, r859, -8192; +add.s32 r861, r858, r860; +sin.approx.f32 f57, f1; +neg.f32 f8, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f8; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +neg.f16x2 r71, r68; +} +{ +fma.rn.f16x2 r73, r49, r64, r71; +} +{ +mul.f16x2 r77, r49, r66; +} +{ +fma.rn.f16x2 r80, r52, r64, r77; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f53, 0fBF800000; +mov.f32 f54, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +neg.f16x2 r108, r105; +} +{ +fma.rn.f16x2 r110, r43, r101, r108; +} +{ +mul.f16x2 r114, r43, r103; +} +{ +fma.rn.f16x2 r117, r46, r101, r114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +neg.f16x2 r145, r142; +} +{ +fma.rn.f16x2 r147, r55, r138, r145; +} +{ +mul.f16x2 r151, r55, r140; +} +{ +fma.rn.f16x2 r154, r58, r138, r151; +} +barrier.sync 0; +shl.b32 r862, r10, 4; +add.s32 r863, r861, r862; +st.shared.v4.f32 [r863], {r37, r73, r110, r147}; +barrier.sync 0; +mad.lo.s32 r864, r10, -12, r863; +ld.shared.u32 r176, [r864]; +ld.shared.u32 r188, [r864+2048]; +ld.shared.u32 r177, [r864+4096]; +ld.shared.u32 r189, [r864+6144]; +barrier.sync 0; +st.shared.v4.f32 [r863], {r40, r80, r117, r154}; +barrier.sync 0; +ld.shared.u32 r179, [r864]; +ld.shared.u32 r191, [r864+2048]; +ld.shared.u32 r180, [r864+4096]; +ld.shared.u32 r192, [r864+6144]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r193; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; 
+} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r196; +} +{ +add.f16x2 r216, r184, r199; +} +{ +sub.f16x2 r219, r181, r196; +} +{ +sub.f16x2 r222, r184, r199; +} +and.b32 r865, r9, 508; +bfe.u32 r866, r9, 2, 7; +shl.b32 r867, r9, 2; +and.b32 r868, r867, 12; +add.s32 r869, r861, r868; +cvt.rn.f32.u32 f58, r866; +mul.f32 f59, f58, 0f3C490FDB; +cos.approx.f32 f17, f59; +sin.approx.f32 f60, f59; +neg.f32 f18, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +neg.f16x2 r235, r232; +} +{ +fma.rn.f16x2 r237, r213, r228, r235; +} +{ +mul.f16x2 r241, r213, r230; +} +{ +fma.rn.f16x2 r244, r216, r228, r241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +neg.f16x2 r272, r269; +} +{ +fma.rn.f16x2 r274, r207, r265, r272; +} +{ +mul.f16x2 r278, r207, r267; +} +{ +fma.rn.f16x2 r281, r210, r265, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r289, {low, high}; 
+} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +neg.f16x2 r309, r306; +} +{ +fma.rn.f16x2 r311, r219, r302, r309; +} +{ +mul.f16x2 r315, r219, r304; +} +{ +fma.rn.f16x2 r318, r222, r302, r315; +} +barrier.sync 0; +and.b32 r870, r859, 8128; +add.s32 r871, r869, r870; +st.shared.u32 [r871], r201; +st.shared.u32 [r871+16], r237; +st.shared.u32 [r871+32], r274; +st.shared.u32 [r871+48], r311; +barrier.sync 0; +mad.lo.s32 r872, r865, -12, r871; +ld.shared.u32 r340, [r872]; +ld.shared.u32 r352, [r872+2048]; +ld.shared.u32 r341, [r872+4096]; +ld.shared.u32 r353, [r872+6144]; +barrier.sync 0; +st.shared.u32 [r871], r204; +st.shared.u32 [r871+16], r244; +st.shared.u32 [r871+32], r281; +st.shared.u32 [r871+48], r318; +barrier.sync 0; +ld.shared.u32 r343, [r872]; +ld.shared.u32 r355, [r872+2048]; +ld.shared.u32 r344, [r872+4096]; +ld.shared.u32 r356, [r872+6144]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r357; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r360; +} +{ +add.f16x2 r380, r348, r363; +} +{ +sub.f16x2 r383, r345, r360; +} +{ +sub.f16x2 r386, r348, r363; +} +and.b32 r873, r9, 496; +bfe.u32 r874, r9, 4, 5; +and.b32 r875, r867, 60; +add.s32 r876, r861, r875; +cvt.rn.f32.u32 f61, r874; +mul.f32 f62, f61, 0f3D490FDB; +cos.approx.f32 f27, f62; 
+sin.approx.f32 f63, f62; +neg.f32 f28, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r377, r392, r399; +} +{ +mul.f16x2 r405, r377, r394; +} +{ +fma.rn.f16x2 r408, r380, r392, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r371, r429, r436; +} +{ +mul.f16x2 r442, r371, r431; +} +{ +fma.rn.f16x2 r445, r374, r429, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 
r470, r386, r468; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r383, r466, r473; +} +{ +mul.f16x2 r479, r383, r468; +} +{ +fma.rn.f16x2 r482, r386, r466, r479; +} +barrier.sync 0; +and.b32 r877, r859, 7936; +add.s32 r878, r876, r877; +st.shared.u32 [r878], r365; +st.shared.u32 [r878+64], r401; +st.shared.u32 [r878+128], r438; +st.shared.u32 [r878+192], r475; +barrier.sync 0; +mad.lo.s32 r879, r873, -12, r878; +ld.shared.u32 r504, [r879]; +ld.shared.u32 r516, [r879+2048]; +ld.shared.u32 r505, [r879+4096]; +ld.shared.u32 r517, [r879+6144]; +barrier.sync 0; +st.shared.u32 [r878], r368; +st.shared.u32 [r878+64], r408; +st.shared.u32 [r878+128], r445; +st.shared.u32 [r878+192], r482; +barrier.sync 0; +ld.shared.u32 r507, [r879]; +ld.shared.u32 r519, [r879+2048]; +ld.shared.u32 r508, [r879+4096]; +ld.shared.u32 r520, [r879+6144]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r521; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r524; +} +{ +add.f16x2 r544, r512, r527; +} +{ +sub.f16x2 r547, r509, r524; +} +{ +sub.f16x2 r550, r512, r527; +} +and.b32 r880, r9, 448; +bfe.u32 r881, r9, 6, 3; +and.b32 r882, r867, 252; +add.s32 r883, r861, r882; +cvt.rn.f32.u32 f64, r881; +mul.f32 f65, f64, 0f3E490FDB; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +neg.f16x2 r563, 
r560; +} +{ +fma.rn.f16x2 r565, r541, r556, r563; +} +{ +mul.f16x2 r569, r541, r558; +} +{ +fma.rn.f16x2 r572, r544, r556, r569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +neg.f16x2 r600, r597; +} +{ +fma.rn.f16x2 r602, r535, r593, r600; +} +{ +mul.f16x2 r606, r535, r595; +} +{ +fma.rn.f16x2 r609, r538, r593, r606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +neg.f16x2 r637, r634; +} +{ +fma.rn.f16x2 r639, r547, r630, r637; +} +{ +mul.f16x2 r643, r547, r632; +} +{ +fma.rn.f16x2 r646, r550, r630, r643; +} +barrier.sync 0; +and.b32 r884, r859, 7168; +add.s32 r885, r883, r884; +st.shared.u32 [r885], r529; +st.shared.u32 [r885+256], r565; +st.shared.u32 [r885+512], r602; +st.shared.u32 [r885+768], r639; 
+barrier.sync 0; +mad.lo.s32 r886, r880, -12, r885; +ld.shared.u32 r668, [r886]; +ld.shared.u32 r680, [r886+2048]; +ld.shared.u32 r669, [r886+4096]; +ld.shared.u32 r681, [r886+6144]; +barrier.sync 0; +st.shared.u32 [r885], r532; +st.shared.u32 [r885+256], r572; +st.shared.u32 [r885+512], r609; +st.shared.u32 [r885+768], r646; +barrier.sync 0; +ld.shared.u32 r671, [r886]; +ld.shared.u32 r683, [r886+2048]; +ld.shared.u32 r672, [r886+4096]; +ld.shared.u32 r684, [r886+6144]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +and.b32 r887, r9, 256; +bfe.u32 r888, r9, 8, 1; +and.b32 r889, r867, 1020; +add.s32 r890, r861, r889; +cvt.rn.f32.u32 f67, r888; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f47, f68; +sin.approx.f32 f69, f68; +neg.f32 f48, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f47; +cvt.rn.f16.f32 high, f48; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r722, {high, high}; +} +{ +mul.f16x2 r724, r708, r722; +} +{ +neg.f16x2 r727, r724; +} +{ +fma.rn.f16x2 r729, r705, r720, r727; +} +{ +mul.f16x2 r733, r705, r722; +} +{ +fma.rn.f16x2 r736, r708, r720, r733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 
high, f54; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r742, r744; +} +{ +mul.f16x2 r748, r717, r740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r751, {high, low}; +} +{ +fma.rn.f16x2 r753, r745, r751, r748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r759, {high, high}; +} +{ +mul.f16x2 r761, r702, r759; +} +{ +neg.f16x2 r764, r761; +} +{ +fma.rn.f16x2 r766, r699, r757, r764; +} +{ +mul.f16x2 r770, r699, r759; +} +{ +fma.rn.f16x2 r773, r702, r757, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r781, {low, high}; +} +{ +mul.f16x2 r782, r779, r781; +} +{ +mul.f16x2 r785, r753, r777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r788, {high, low}; +} +{ +fma.rn.f16x2 r790, r782, r788, r785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r796, {high, high}; +} +{ +mul.f16x2 r798, r714, r796; +} +{ +neg.f16x2 r801, r798; +} +{ +fma.rn.f16x2 r803, r711, r794, r801; +} +{ +mul.f16x2 r807, r711, r796; +} +{ +fma.rn.f16x2 r810, r714, r794, r807; +} +barrier.sync 0; +and.b32 r891, r859, 4096; +add.s32 r892, r890, r891; +st.shared.u32 [r892], r693; +st.shared.u32 [r892+1024], r729; +st.shared.u32 [r892+2048], r766; +st.shared.u32 [r892+3072], r803; +barrier.sync 0; +mad.lo.s32 r893, r887, -12, r892; +ld.shared.u32 r832, [r893]; +ld.shared.u32 r844, [r893+2048]; +ld.shared.u32 r833, [r893+4096]; +ld.shared.u32 r845, [r893+6144]; +barrier.sync 0; +st.shared.u32 [r892], r696; +st.shared.u32 [r892+1024], r736; +st.shared.u32 [r892+2048], r773; +st.shared.u32 [r892+3072], r810; +barrier.sync 0; +ld.shared.u32 r835, 
[r893]; +ld.shared.u32 r847, [r893+2048]; +ld.shared.u32 r836, [r893+4096]; +ld.shared.u32 r848, [r893+6144]; +{ +add.f16x2 %0, r832, r833; +} +{ +add.f16x2 %1, r835, r836; +} +{ +sub.f16x2 %4, r832, r833; +} +{ +sub.f16x2 %5, r835, r836; +} +{ +add.f16x2 %2, r844, r845; +} +{ +add.f16x2 %3, r847, r848; +} +{ +sub.f16x2 %6, r844, r845; +} +{ +sub.f16x2 %7, r847, r848; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<845, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<5>; +.reg .b16 rs<3>; +.reg .f32 f<103>; +.reg .b32 r<631>; +.reg .b64 rd<2>; +{ +add.f16x2 r9, %5, %7; +} +{ +add.f16x2 r12, %6, %8; +} +{ +sub.f16x2 r15, %5, %7; +} +{ +sub.f16x2 r18, %6, %8; +} +mov.u32 r21, %tid.x; +and.b32 r22, r21, 1023; +cvt.rn.f32.u32 f10, r22; +mul.f32 f1, f10, 0f3B490FDB; +cvt.u16.u32 rs2, r21; +and.b16 rs1, rs2, 1023; +mov.f32 f100, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB8_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB8_3; +mov.f32 f100, 0f3B490FC6; +bra.uni LBB8_4; +LBB8_3: +cos.approx.f32 f100, f1; +LBB8_4: +setp.eq.s32 p3, r22, 1023; +mov.f32 f101, 0f3B490FC6; +@p3 bra LBB8_6; +sin.approx.f32 f101, f1; +LBB8_6: +mov.u32 r77, %tid.y; +shl.b32 r78, r77, 14; +mov.u32 r79, %4; +add.s32 r80, r79, r78; +shl.b32 r82, r21, 4; +and.b32 r83, r82, -16384; +add.s32 r84, r80, r83; +neg.f32 f14, f101; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 
high, f14; +mov.b32 r25, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r28, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r30, {high, high}; +} +{ +mul.f16x2 r32, r18, r30; +} +{ +neg.f16x2 r35, r32; +} +{ +fma.rn.f16x2 r37, r15, r28, r35; +} +{ +mul.f16x2 r41, r15, r30; +} +{ +fma.rn.f16x2 r44, r18, r28, r41; +} +barrier.sync 0; +and.b32 r85, r82, 16368; +add.s32 r86, r84, r85; +st.shared.v2.f32 [r86], {r9, r12}; +st.shared.v2.f32 [r86+8], {r37, r44}; +barrier.sync 0; +shl.b32 r87, r21, 3; +and.b32 r88, r87, 8184; +sub.s32 r89, r86, r88; +ld.shared.u32 r66, [r89]; +ld.shared.u32 r69, [r89+4]; +ld.shared.u32 r67, [r89+8192]; +ld.shared.u32 r70, [r89+8196]; +{ +add.f16x2 r65, r66, r67; +} +{ +add.f16x2 r68, r69, r70; +} +{ +sub.f16x2 r71, r66, r67; +} +{ +sub.f16x2 r74, r69, r70; +} +and.b32 r90, r21, 1022; +bfe.u32 r91, r21, 1, 9; +cvt.rn.f32.u32 f20, r91; +mul.f32 f6, f20, 0f3BC90FDB; +setp.eq.s32 p4, r90, 510; +mov.f32 f102, 0f3BC90F88; +@p4 bra LBB8_8; +cos.approx.f32 f102, f6; +LBB8_8: +sin.approx.f32 f75, f6; +neg.f32 f22, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f22; +mov.b32 r92, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r97, {high, high}; +} +{ +mul.f16x2 r99, r74, r97; +} +{ +neg.f16x2 r102, r99; +} +{ +fma.rn.f16x2 r104, r71, r95, r102; +} +{ +mul.f16x2 r108, r71, r97; +} +{ +fma.rn.f16x2 r111, r74, r95, r108; +} +barrier.sync 0; +and.b32 r569, r87, 8; +add.s32 r570, r84, r569; +and.b32 r571, r82, 16352; +add.s32 r572, r570, r571; +st.shared.u32 [r572], r65; +st.shared.u32 [r572+4], r68; +st.shared.u32 [r572+16], r104; +st.shared.u32 [r572+20], r111; +barrier.sync 0; +and.b32 r573, r87, 8176; +sub.s32 r574, r572, r573; +ld.shared.u32 r133, [r574]; +ld.shared.u32 r136, [r574+4]; +ld.shared.u32 r134, [r574+8192]; +ld.shared.u32 r137, [r574+8196]; 
+{ +add.f16x2 r132, r133, r134; +} +{ +add.f16x2 r135, r136, r137; +} +{ +sub.f16x2 r138, r133, r134; +} +{ +sub.f16x2 r141, r136, r137; +} +bfe.u32 r575, r21, 2, 8; +cvt.rn.f32.u32 f76, r575; +mul.f32 f77, f76, 0f3C490FDB; +cos.approx.f32 f27, f77; +sin.approx.f32 f78, f77; +neg.f32 f28, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r144, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r149, {high, high}; +} +{ +mul.f16x2 r151, r141, r149; +} +{ +neg.f16x2 r154, r151; +} +{ +fma.rn.f16x2 r156, r138, r147, r154; +} +{ +mul.f16x2 r160, r138, r149; +} +{ +fma.rn.f16x2 r163, r141, r147, r160; +} +and.b32 r576, r87, 24; +add.s32 r577, r84, r576; +barrier.sync 0; +and.b32 r578, r82, 16320; +add.s32 r579, r577, r578; +st.shared.u32 [r579], r132; +st.shared.u32 [r579+4], r135; +st.shared.u32 [r579+32], r156; +st.shared.u32 [r579+36], r163; +barrier.sync 0; +and.b32 r580, r87, 8160; +sub.s32 r581, r579, r580; +ld.shared.u32 r185, [r581]; +ld.shared.u32 r188, [r581+4]; +ld.shared.u32 r186, [r581+8192]; +ld.shared.u32 r189, [r581+8196]; +{ +add.f16x2 r184, r185, r186; +} +{ +add.f16x2 r187, r188, r189; +} +{ +sub.f16x2 r190, r185, r186; +} +{ +sub.f16x2 r193, r188, r189; +} +bfe.u32 r582, r21, 3, 7; +cvt.rn.f32.u32 f79, r582; +mul.f32 f80, f79, 0f3CC90FDB; +cos.approx.f32 f33, f80; +sin.approx.f32 f81, f80; +neg.f32 f34, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r196, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r201, {high, high}; +} +{ +mul.f16x2 r203, r193, r201; +} +{ +neg.f16x2 r206, r203; +} +{ +fma.rn.f16x2 r208, r190, r199, r206; +} +{ +mul.f16x2 r212, r190, r201; +} +{ +fma.rn.f16x2 r215, r193, r199, r212; +} +and.b32 r583, r87, 56; +add.s32 r584, 
r84, r583; +barrier.sync 0; +and.b32 r585, r82, 16256; +add.s32 r586, r584, r585; +st.shared.u32 [r586], r184; +st.shared.u32 [r586+4], r187; +st.shared.u32 [r586+64], r208; +st.shared.u32 [r586+68], r215; +barrier.sync 0; +and.b32 r587, r87, 8128; +sub.s32 r588, r586, r587; +ld.shared.u32 r237, [r588]; +ld.shared.u32 r240, [r588+4]; +ld.shared.u32 r238, [r588+8192]; +ld.shared.u32 r241, [r588+8196]; +{ +add.f16x2 r236, r237, r238; +} +{ +add.f16x2 r239, r240, r241; +} +{ +sub.f16x2 r242, r237, r238; +} +{ +sub.f16x2 r245, r240, r241; +} +bfe.u32 r589, r21, 4, 6; +cvt.rn.f32.u32 f82, r589; +mul.f32 f83, f82, 0f3D490FDB; +cos.approx.f32 f39, f83; +sin.approx.f32 f84, f83; +neg.f32 f40, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r248, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r253, {high, high}; +} +{ +mul.f16x2 r255, r245, r253; +} +{ +neg.f16x2 r258, r255; +} +{ +fma.rn.f16x2 r260, r242, r251, r258; +} +{ +mul.f16x2 r264, r242, r253; +} +{ +fma.rn.f16x2 r267, r245, r251, r264; +} +and.b32 r590, r87, 120; +add.s32 r591, r84, r590; +barrier.sync 0; +and.b32 r592, r82, 16128; +add.s32 r593, r591, r592; +st.shared.u32 [r593], r236; +st.shared.u32 [r593+4], r239; +st.shared.u32 [r593+128], r260; +st.shared.u32 [r593+132], r267; +barrier.sync 0; +and.b32 r594, r87, 8064; +sub.s32 r595, r593, r594; +ld.shared.u32 r289, [r595]; +ld.shared.u32 r292, [r595+4]; +ld.shared.u32 r290, [r595+8192]; +ld.shared.u32 r293, [r595+8196]; +{ +add.f16x2 r288, r289, r290; +} +{ +add.f16x2 r291, r292, r293; +} +{ +sub.f16x2 r294, r289, r290; +} +{ +sub.f16x2 r297, r292, r293; +} +bfe.u32 r596, r21, 5, 5; +cvt.rn.f32.u32 f85, r596; +mul.f32 f86, f85, 0f3DC90FDB; +cos.approx.f32 f45, f86; +sin.approx.f32 f87, f86; +neg.f32 f46, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r300, 
{low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r305, {high, high}; +} +{ +mul.f16x2 r307, r297, r305; +} +{ +neg.f16x2 r310, r307; +} +{ +fma.rn.f16x2 r312, r294, r303, r310; +} +{ +mul.f16x2 r316, r294, r305; +} +{ +fma.rn.f16x2 r319, r297, r303, r316; +} +and.b32 r597, r87, 248; +add.s32 r598, r84, r597; +barrier.sync 0; +and.b32 r599, r82, 15872; +add.s32 r600, r598, r599; +st.shared.u32 [r600], r288; +st.shared.u32 [r600+4], r291; +st.shared.u32 [r600+256], r312; +st.shared.u32 [r600+260], r319; +barrier.sync 0; +and.b32 r601, r87, 7936; +sub.s32 r602, r600, r601; +ld.shared.u32 r341, [r602]; +ld.shared.u32 r344, [r602+4]; +ld.shared.u32 r342, [r602+8192]; +ld.shared.u32 r345, [r602+8196]; +{ +add.f16x2 r340, r341, r342; +} +{ +add.f16x2 r343, r344, r345; +} +{ +sub.f16x2 r346, r341, r342; +} +{ +sub.f16x2 r349, r344, r345; +} +bfe.u32 r603, r21, 6, 4; +cvt.rn.f32.u32 f88, r603; +mul.f32 f89, f88, 0f3E490FDB; +cos.approx.f32 f51, f89; +sin.approx.f32 f90, f89; +neg.f32 f52, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f51; +cvt.rn.f16.f32 high, f52; +mov.b32 r352, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r357, {high, high}; +} +{ +mul.f16x2 r359, r349, r357; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r346, r355, r362; +} +{ +mul.f16x2 r368, r346, r357; +} +{ +fma.rn.f16x2 r371, r349, r355, r368; +} +and.b32 r604, r87, 504; +add.s32 r605, r84, r604; +barrier.sync 0; +and.b32 r606, r82, 15360; +add.s32 r607, r605, r606; +st.shared.u32 [r607], r340; +st.shared.u32 [r607+4], r343; +st.shared.u32 [r607+512], r364; +st.shared.u32 [r607+516], r371; +barrier.sync 0; +and.b32 r608, r87, 7680; +sub.s32 r609, r607, r608; +ld.shared.u32 r393, [r609]; +ld.shared.u32 r396, [r609+4]; +ld.shared.u32 r394, [r609+8192]; 
+ld.shared.u32 r397, [r609+8196]; +{ +add.f16x2 r392, r393, r394; +} +{ +add.f16x2 r395, r396, r397; +} +{ +sub.f16x2 r398, r393, r394; +} +{ +sub.f16x2 r401, r396, r397; +} +bfe.u32 r610, r21, 7, 3; +cvt.rn.f32.u32 f91, r610; +mul.f32 f92, f91, 0f3EC90FDB; +cos.approx.f32 f57, f92; +sin.approx.f32 f93, f92; +neg.f32 f58, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r409, {high, high}; +} +{ +mul.f16x2 r411, r401, r409; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r398, r407, r414; +} +{ +mul.f16x2 r420, r398, r409; +} +{ +fma.rn.f16x2 r423, r401, r407, r420; +} +and.b32 r611, r87, 1016; +add.s32 r612, r84, r611; +barrier.sync 0; +and.b32 r613, r82, 14336; +add.s32 r614, r612, r613; +st.shared.u32 [r614], r392; +st.shared.u32 [r614+4], r395; +st.shared.u32 [r614+1024], r416; +st.shared.u32 [r614+1028], r423; +barrier.sync 0; +and.b32 r615, r87, 7168; +sub.s32 r616, r614, r615; +ld.shared.u32 r445, [r616]; +ld.shared.u32 r448, [r616+4]; +ld.shared.u32 r446, [r616+8192]; +ld.shared.u32 r449, [r616+8196]; +{ +add.f16x2 r444, r445, r446; +} +{ +add.f16x2 r447, r448, r449; +} +{ +sub.f16x2 r450, r445, r446; +} +{ +sub.f16x2 r453, r448, r449; +} +bfe.u32 r617, r21, 8, 2; +cvt.rn.f32.u32 f94, r617; +mul.f32 f95, f94, 0f3F490FDB; +cos.approx.f32 f63, f95; +sin.approx.f32 f96, f95; +neg.f32 f64, f96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f63; +cvt.rn.f16.f32 high, f64; +mov.b32 r456, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r453, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r450, r459, r466; +} +{ +mul.f16x2 r472, r450, r461; +} +{ +fma.rn.f16x2 r475, r453, r459, r472; 
+} +and.b32 r618, r87, 2040; +add.s32 r619, r84, r618; +barrier.sync 0; +and.b32 r620, r82, 12288; +add.s32 r621, r619, r620; +st.shared.u32 [r621], r444; +st.shared.u32 [r621+4], r447; +st.shared.u32 [r621+2048], r468; +st.shared.u32 [r621+2052], r475; +barrier.sync 0; +and.b32 r622, r87, 6144; +sub.s32 r623, r621, r622; +ld.shared.u32 r497, [r623]; +ld.shared.u32 r500, [r623+4]; +ld.shared.u32 r498, [r623+8192]; +ld.shared.u32 r501, [r623+8196]; +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r501; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +sub.f16x2 r505, r500, r501; +} +bfe.u32 r624, r21, 9, 1; +cvt.rn.f32.u32 f97, r624; +mul.f32 f98, f97, 0f3FC90FDB; +cos.approx.f32 f69, f98; +sin.approx.f32 f99, f98; +neg.f32 f70, f99; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r511, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r513, {high, high}; +} +{ +mul.f16x2 r515, r505, r513; +} +{ +neg.f16x2 r518, r515; +} +{ +fma.rn.f16x2 r520, r502, r511, r518; +} +{ +mul.f16x2 r524, r502, r513; +} +{ +fma.rn.f16x2 r527, r505, r511, r524; +} +and.b32 r625, r87, 4088; +add.s32 r626, r84, r625; +barrier.sync 0; +and.b32 r627, r82, 8192; +add.s32 r628, r626, r627; +st.shared.u32 [r628], r496; +st.shared.u32 [r628+4], r499; +st.shared.u32 [r628+4096], r520; +st.shared.u32 [r628+4100], r527; +barrier.sync 0; +and.b32 r629, r87, 4096; +sub.s32 r630, r628, r629; +ld.shared.u32 r549, [r630]; +ld.shared.u32 r552, [r630+4]; +ld.shared.u32 r550, [r630+8192]; +ld.shared.u32 r553, [r630+8196]; +{ +add.f16x2 %0, r549, r550; +} +{ +add.f16x2 %1, r552, r553; +} +{ +sub.f16x2 %2, r549, r550; +} +{ +sub.f16x2 %3, r552, r553; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<846, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<5>; +.reg .b16 rs<3>; +.reg .f32 f<103>; +.reg .b32 r<631>; +.reg .b64 rd<2>; +{ +add.f16x2 r9, %5, %7; +} +{ +add.f16x2 r12, %6, %8; +} +{ +sub.f16x2 r15, %5, %7; +} +{ +sub.f16x2 r18, %6, %8; +} +mov.u32 r21, %tid.x; +and.b32 r22, r21, 1023; +cvt.rn.f32.u32 f10, r22; +mul.f32 f1, f10, 0f3B490FDB; +cvt.u16.u32 rs2, r21; +and.b16 rs1, rs2, 1023; +mov.f32 f100, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB9_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB9_3; +mov.f32 f100, 0f3B490FC6; +bra.uni LBB9_4; +LBB9_3: +cos.approx.f32 f100, f1; +LBB9_4: +setp.eq.s32 p3, r22, 1023; +mov.f32 f101, 0f3B490FC6; +@p3 bra LBB9_6; +sin.approx.f32 f101, f1; +LBB9_6: +mov.u32 r77, %tid.y; +shl.b32 r78, r77, 13; +mov.u32 r79, %4; +add.s32 r80, r79, r78; +shl.b32 r82, r21, 3; +and.b32 r83, r82, -8192; +add.s32 r84, r80, r83; +neg.f32 f14, f101; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f14; +mov.b32 r25, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r28, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r30, {high, high}; +} +{ +mul.f16x2 r32, r18, r30; +} +{ +neg.f16x2 r35, r32; +} +{ +fma.rn.f16x2 r37, r15, r28, r35; +} +{ +mul.f16x2 r41, r15, r30; +} +{ +fma.rn.f16x2 r44, r18, r28, r41; +} +barrier.sync 0; +and.b32 r85, r82, 8184; +add.s32 r86, r84, r85; +st.shared.v2.f32 [r86], {r9, r37}; +barrier.sync 0; +shl.b32 r87, r21, 2; +and.b32 r88, r87, 4092; +sub.s32 r89, r86, r88; +ld.shared.u32 r66, [r89]; +ld.shared.u32 r67, [r89+4096]; +barrier.sync 0; +st.shared.v2.f32 [r86], {r12, r44}; +barrier.sync 0; +ld.shared.u32 r69, [r89]; +ld.shared.u32 r70, [r89+4096]; +{ +add.f16x2 r65, r66, r67; +} +{ +add.f16x2 r68, 
r69, r70; +} +{ +sub.f16x2 r71, r66, r67; +} +{ +sub.f16x2 r74, r69, r70; +} +and.b32 r90, r21, 1022; +bfe.u32 r91, r21, 1, 9; +cvt.rn.f32.u32 f20, r91; +mul.f32 f6, f20, 0f3BC90FDB; +setp.eq.s32 p4, r90, 510; +mov.f32 f102, 0f3BC90F88; +@p4 bra LBB9_8; +cos.approx.f32 f102, f6; +LBB9_8: +and.b32 r569, r87, 4; +add.s32 r570, r84, r569; +sin.approx.f32 f75, f6; +neg.f32 f22, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f22; +mov.b32 r92, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r97, {high, high}; +} +{ +mul.f16x2 r99, r74, r97; +} +{ +neg.f16x2 r102, r99; +} +{ +fma.rn.f16x2 r104, r71, r95, r102; +} +{ +mul.f16x2 r108, r71, r97; +} +{ +fma.rn.f16x2 r111, r74, r95, r108; +} +barrier.sync 0; +and.b32 r571, r82, 8176; +add.s32 r572, r570, r571; +st.shared.u32 [r572], r65; +st.shared.u32 [r572+8], r104; +barrier.sync 0; +and.b32 r573, r87, 4088; +sub.s32 r574, r572, r573; +ld.shared.u32 r133, [r574]; +ld.shared.u32 r134, [r574+4096]; +barrier.sync 0; +st.shared.u32 [r572], r68; +st.shared.u32 [r572+8], r111; +barrier.sync 0; +ld.shared.u32 r136, [r574]; +ld.shared.u32 r137, [r574+4096]; +{ +add.f16x2 r132, r133, r134; +} +{ +add.f16x2 r135, r136, r137; +} +{ +sub.f16x2 r138, r133, r134; +} +{ +sub.f16x2 r141, r136, r137; +} +bfe.u32 r575, r21, 2, 8; +and.b32 r576, r87, 12; +add.s32 r577, r84, r576; +cvt.rn.f32.u32 f76, r575; +mul.f32 f77, f76, 0f3C490FDB; +cos.approx.f32 f27, f77; +sin.approx.f32 f78, f77; +neg.f32 f28, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r144, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r149, {high, high}; +} +{ +mul.f16x2 r151, r141, r149; +} +{ +neg.f16x2 r154, r151; +} +{ +fma.rn.f16x2 r156, r138, r147, r154; +} +{ +mul.f16x2 r160, 
r138, r149; +} +{ +fma.rn.f16x2 r163, r141, r147, r160; +} +barrier.sync 0; +and.b32 r578, r82, 8160; +add.s32 r579, r577, r578; +st.shared.u32 [r579], r132; +st.shared.u32 [r579+16], r156; +barrier.sync 0; +and.b32 r580, r87, 4080; +sub.s32 r581, r579, r580; +ld.shared.u32 r185, [r581]; +ld.shared.u32 r186, [r581+4096]; +barrier.sync 0; +st.shared.u32 [r579], r135; +st.shared.u32 [r579+16], r163; +barrier.sync 0; +ld.shared.u32 r188, [r581]; +ld.shared.u32 r189, [r581+4096]; +{ +add.f16x2 r184, r185, r186; +} +{ +add.f16x2 r187, r188, r189; +} +{ +sub.f16x2 r190, r185, r186; +} +{ +sub.f16x2 r193, r188, r189; +} +bfe.u32 r582, r21, 3, 7; +and.b32 r583, r87, 28; +add.s32 r584, r84, r583; +cvt.rn.f32.u32 f79, r582; +mul.f32 f80, f79, 0f3CC90FDB; +cos.approx.f32 f33, f80; +sin.approx.f32 f81, f80; +neg.f32 f34, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r196, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r201, {high, high}; +} +{ +mul.f16x2 r203, r193, r201; +} +{ +neg.f16x2 r206, r203; +} +{ +fma.rn.f16x2 r208, r190, r199, r206; +} +{ +mul.f16x2 r212, r190, r201; +} +{ +fma.rn.f16x2 r215, r193, r199, r212; +} +barrier.sync 0; +and.b32 r585, r82, 8128; +add.s32 r586, r584, r585; +st.shared.u32 [r586], r184; +st.shared.u32 [r586+32], r208; +barrier.sync 0; +and.b32 r587, r87, 4064; +sub.s32 r588, r586, r587; +ld.shared.u32 r237, [r588]; +ld.shared.u32 r238, [r588+4096]; +barrier.sync 0; +st.shared.u32 [r586], r187; +st.shared.u32 [r586+32], r215; +barrier.sync 0; +ld.shared.u32 r240, [r588]; +ld.shared.u32 r241, [r588+4096]; +{ +add.f16x2 r236, r237, r238; +} +{ +add.f16x2 r239, r240, r241; +} +{ +sub.f16x2 r242, r237, r238; +} +{ +sub.f16x2 r245, r240, r241; +} +bfe.u32 r589, r21, 4, 6; +and.b32 r590, r87, 60; +add.s32 r591, r84, r590; +cvt.rn.f32.u32 f82, r589; +mul.f32 f83, f82, 0f3D490FDB; 
+cos.approx.f32 f39, f83; +sin.approx.f32 f84, f83; +neg.f32 f40, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r248, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r253, {high, high}; +} +{ +mul.f16x2 r255, r245, r253; +} +{ +neg.f16x2 r258, r255; +} +{ +fma.rn.f16x2 r260, r242, r251, r258; +} +{ +mul.f16x2 r264, r242, r253; +} +{ +fma.rn.f16x2 r267, r245, r251, r264; +} +barrier.sync 0; +and.b32 r592, r82, 8064; +add.s32 r593, r591, r592; +st.shared.u32 [r593], r236; +st.shared.u32 [r593+64], r260; +barrier.sync 0; +and.b32 r594, r87, 4032; +sub.s32 r595, r593, r594; +ld.shared.u32 r289, [r595]; +ld.shared.u32 r290, [r595+4096]; +barrier.sync 0; +st.shared.u32 [r593], r239; +st.shared.u32 [r593+64], r267; +barrier.sync 0; +ld.shared.u32 r292, [r595]; +ld.shared.u32 r293, [r595+4096]; +{ +add.f16x2 r288, r289, r290; +} +{ +add.f16x2 r291, r292, r293; +} +{ +sub.f16x2 r294, r289, r290; +} +{ +sub.f16x2 r297, r292, r293; +} +bfe.u32 r596, r21, 5, 5; +and.b32 r597, r87, 124; +add.s32 r598, r84, r597; +cvt.rn.f32.u32 f85, r596; +mul.f32 f86, f85, 0f3DC90FDB; +cos.approx.f32 f45, f86; +sin.approx.f32 f87, f86; +neg.f32 f46, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r300, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r305, {high, high}; +} +{ +mul.f16x2 r307, r297, r305; +} +{ +neg.f16x2 r310, r307; +} +{ +fma.rn.f16x2 r312, r294, r303, r310; +} +{ +mul.f16x2 r316, r294, r305; +} +{ +fma.rn.f16x2 r319, r297, r303, r316; +} +barrier.sync 0; +and.b32 r599, r82, 7936; +add.s32 r600, r598, r599; +st.shared.u32 [r600], r288; +st.shared.u32 [r600+128], r312; +barrier.sync 0; +and.b32 r601, r87, 3968; +sub.s32 r602, r600, r601; +ld.shared.u32 r341, [r602]; 
+ld.shared.u32 r342, [r602+4096]; +barrier.sync 0; +st.shared.u32 [r600], r291; +st.shared.u32 [r600+128], r319; +barrier.sync 0; +ld.shared.u32 r344, [r602]; +ld.shared.u32 r345, [r602+4096]; +{ +add.f16x2 r340, r341, r342; +} +{ +add.f16x2 r343, r344, r345; +} +{ +sub.f16x2 r346, r341, r342; +} +{ +sub.f16x2 r349, r344, r345; +} +bfe.u32 r603, r21, 6, 4; +and.b32 r604, r87, 252; +add.s32 r605, r84, r604; +cvt.rn.f32.u32 f88, r603; +mul.f32 f89, f88, 0f3E490FDB; +cos.approx.f32 f51, f89; +sin.approx.f32 f90, f89; +neg.f32 f52, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f51; +cvt.rn.f16.f32 high, f52; +mov.b32 r352, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r357, {high, high}; +} +{ +mul.f16x2 r359, r349, r357; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r346, r355, r362; +} +{ +mul.f16x2 r368, r346, r357; +} +{ +fma.rn.f16x2 r371, r349, r355, r368; +} +barrier.sync 0; +and.b32 r606, r82, 7680; +add.s32 r607, r605, r606; +st.shared.u32 [r607], r340; +st.shared.u32 [r607+256], r364; +barrier.sync 0; +and.b32 r608, r87, 3840; +sub.s32 r609, r607, r608; +ld.shared.u32 r393, [r609]; +ld.shared.u32 r394, [r609+4096]; +barrier.sync 0; +st.shared.u32 [r607], r343; +st.shared.u32 [r607+256], r371; +barrier.sync 0; +ld.shared.u32 r396, [r609]; +ld.shared.u32 r397, [r609+4096]; +{ +add.f16x2 r392, r393, r394; +} +{ +add.f16x2 r395, r396, r397; +} +{ +sub.f16x2 r398, r393, r394; +} +{ +sub.f16x2 r401, r396, r397; +} +bfe.u32 r610, r21, 7, 3; +and.b32 r611, r87, 508; +add.s32 r612, r84, r611; +cvt.rn.f32.u32 f91, r610; +mul.f32 f92, f91, 0f3EC90FDB; +cos.approx.f32 f57, f92; +sin.approx.f32 f93, f92; +neg.f32 f58, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r404; +mov.b32 r409, {high, high}; +} +{ +mul.f16x2 r411, r401, r409; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r398, r407, r414; +} +{ +mul.f16x2 r420, r398, r409; +} +{ +fma.rn.f16x2 r423, r401, r407, r420; +} +barrier.sync 0; +and.b32 r613, r82, 7168; +add.s32 r614, r612, r613; +st.shared.u32 [r614], r392; +st.shared.u32 [r614+512], r416; +barrier.sync 0; +and.b32 r615, r87, 3584; +sub.s32 r616, r614, r615; +ld.shared.u32 r445, [r616]; +ld.shared.u32 r446, [r616+4096]; +barrier.sync 0; +st.shared.u32 [r614], r395; +st.shared.u32 [r614+512], r423; +barrier.sync 0; +ld.shared.u32 r448, [r616]; +ld.shared.u32 r449, [r616+4096]; +{ +add.f16x2 r444, r445, r446; +} +{ +add.f16x2 r447, r448, r449; +} +{ +sub.f16x2 r450, r445, r446; +} +{ +sub.f16x2 r453, r448, r449; +} +bfe.u32 r617, r21, 8, 2; +and.b32 r618, r87, 1020; +add.s32 r619, r84, r618; +cvt.rn.f32.u32 f94, r617; +mul.f32 f95, f94, 0f3F490FDB; +cos.approx.f32 f63, f95; +sin.approx.f32 f96, f95; +neg.f32 f64, f96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f63; +cvt.rn.f16.f32 high, f64; +mov.b32 r456, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r453, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r450, r459, r466; +} +{ +mul.f16x2 r472, r450, r461; +} +{ +fma.rn.f16x2 r475, r453, r459, r472; +} +barrier.sync 0; +and.b32 r620, r82, 6144; +add.s32 r621, r619, r620; +st.shared.u32 [r621], r444; +st.shared.u32 [r621+1024], r468; +barrier.sync 0; +and.b32 r622, r87, 3072; +sub.s32 r623, r621, r622; +ld.shared.u32 r497, [r623]; +ld.shared.u32 r498, [r623+4096]; +barrier.sync 0; +st.shared.u32 [r621], r447; +st.shared.u32 [r621+1024], r475; +barrier.sync 0; +ld.shared.u32 r500, [r623]; +ld.shared.u32 r501, [r623+4096]; +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r501; +} +{ +sub.f16x2 r502, r497, 
r498; +} +{ +sub.f16x2 r505, r500, r501; +} +bfe.u32 r624, r21, 9, 1; +and.b32 r625, r87, 2044; +add.s32 r626, r84, r625; +cvt.rn.f32.u32 f97, r624; +mul.f32 f98, f97, 0f3FC90FDB; +cos.approx.f32 f69, f98; +sin.approx.f32 f99, f98; +neg.f32 f70, f99; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r511, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r513, {high, high}; +} +{ +mul.f16x2 r515, r505, r513; +} +{ +neg.f16x2 r518, r515; +} +{ +fma.rn.f16x2 r520, r502, r511, r518; +} +{ +mul.f16x2 r524, r502, r513; +} +{ +fma.rn.f16x2 r527, r505, r511, r524; +} +barrier.sync 0; +and.b32 r627, r82, 4096; +add.s32 r628, r626, r627; +st.shared.u32 [r628], r496; +st.shared.u32 [r628+2048], r520; +barrier.sync 0; +and.b32 r629, r87, 2048; +sub.s32 r630, r628, r629; +ld.shared.u32 r549, [r630]; +ld.shared.u32 r550, [r630+4096]; +barrier.sync 0; +st.shared.u32 [r628], r499; +st.shared.u32 [r628+2048], r527; +barrier.sync 0; +ld.shared.u32 r552, [r630]; +ld.shared.u32 r553, [r630+4096]; +{ +add.f16x2 %0, r549, r550; +} +{ +add.f16x2 %1, r552, r553; +} +{ +sub.f16x2 %2, r549, r550; +} +{ +sub.f16x2 %3, r552, r553; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..1d0402eb3483b --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp16_inv.hpp.inc @@ -0,0 +1,30078 @@ +#ifndef CUFFTDX_FFT_2048_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_2048_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1039, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<363>; +.reg .b32 r<2761>; +.reg .b64 rd<2>; +mov.u32 r2741, %tid.y; +shl.b32 r2742, r2741, 13; +mov.u32 r2743, %32; +add.s32 r2744, r2743, r2742; +mov.u32 r2745, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f340, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f338, 0fBF3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ 
+neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 
r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2746, r2745, 127; +shl.b32 r2747, r2745, 6; +and.b32 r2748, r2747, -8192; +add.s32 r2749, r2744, r2748; +cvt.rn.f32.u32 f357, r2746; +mul.f32 f358, f357, 0f3B490FDB; +cos.approx.f32 f117, f358; +sin.approx.f32 f359, f358; +neg.f32 f118, f359; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, 
r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, 
r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2750, r2747, 8128; +add.s32 r2751, r2749, r2750; +st.shared.v4.f32 
[r2751], {r521, r627, r664, r701}; +st.shared.v4.f32 [r2751+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r2751+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r2751+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r2752, r2746, -60, r2751; +ld.shared.u32 r1176, [r2752]; +ld.shared.u32 r1372, [r2752+512]; +ld.shared.u32 r1226, [r2752+1024]; +ld.shared.u32 r1422, [r2752+1536]; +ld.shared.u32 r1188, [r2752+2048]; +ld.shared.u32 r1384, [r2752+2560]; +ld.shared.u32 r1238, [r2752+3072]; +ld.shared.u32 r1434, [r2752+3584]; +ld.shared.u32 r1177, [r2752+4096]; +ld.shared.u32 r1373, [r2752+4608]; +ld.shared.u32 r1227, [r2752+5120]; +ld.shared.u32 r1423, [r2752+5632]; +ld.shared.u32 r1189, [r2752+6144]; +ld.shared.u32 r1385, [r2752+6656]; +ld.shared.u32 r1239, [r2752+7168]; +ld.shared.u32 r1435, [r2752+7680]; +barrier.sync 0; +st.shared.v4.f32 [r2751], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2751+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2751+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2751+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2752]; +ld.shared.u32 r1375, [r2752+512]; +ld.shared.u32 r1229, [r2752+1024]; +ld.shared.u32 r1425, [r2752+1536]; +ld.shared.u32 r1191, [r2752+2048]; +ld.shared.u32 r1387, [r2752+2560]; +ld.shared.u32 r1241, [r2752+3072]; +ld.shared.u32 r1437, [r2752+3584]; +ld.shared.u32 r1180, [r2752+4096]; +ld.shared.u32 r1376, [r2752+4608]; +ld.shared.u32 r1230, [r2752+5120]; +ld.shared.u32 r1426, [r2752+5632]; +ld.shared.u32 r1192, [r2752+6144]; +ld.shared.u32 r1388, [r2752+6656]; +ld.shared.u32 r1242, [r2752+7168]; +ld.shared.u32 r1438, [r2752+7680]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ 
+add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, 
r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 
high, f338; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} 
+{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2753, r2745, 112; +bfe.u32 r2754, r2745, 4, 3; +shl.b32 r2755, r2745, 2; +and.b32 r2756, r2755, 60; +add.s32 r2757, r2749, r2756; +cvt.rn.f32.u32 f360, r2754; +mul.f32 f361, f360, 0f3D490FDB; +cos.approx.f32 f267, f361; +sin.approx.f32 f362, f361; +neg.f32 f268, f362; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, 
f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 
r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; 
+mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ 
+mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +barrier.sync 0; +and.b32 r2758, r2747, 7168; +add.s32 r2759, 
r2757, r2758; +st.shared.u32 [r2759], r1695; +st.shared.u32 [r2759+64], r1801; +st.shared.u32 [r2759+128], r1838; +st.shared.u32 [r2759+192], r1875; +st.shared.u32 [r2759+256], r1912; +st.shared.u32 [r2759+320], r1949; +st.shared.u32 [r2759+384], r1986; +st.shared.u32 [r2759+448], r2023; +st.shared.u32 [r2759+512], r2060; +st.shared.u32 [r2759+576], r2097; +st.shared.u32 [r2759+640], r2134; +st.shared.u32 [r2759+704], r2171; +st.shared.u32 [r2759+768], r2208; +st.shared.u32 [r2759+832], r2245; +st.shared.u32 [r2759+896], r2282; +st.shared.u32 [r2759+960], r2319; +barrier.sync 0; +mad.lo.s32 r2760, r2753, -60, r2759; +ld.shared.u32 r2350, [r2760]; +ld.shared.u32 r2546, [r2760+512]; +ld.shared.u32 r2400, [r2760+1024]; +ld.shared.u32 r2596, [r2760+1536]; +ld.shared.u32 r2362, [r2760+2048]; +ld.shared.u32 r2558, [r2760+2560]; +ld.shared.u32 r2412, [r2760+3072]; +ld.shared.u32 r2608, [r2760+3584]; +ld.shared.u32 r2351, [r2760+4096]; +ld.shared.u32 r2547, [r2760+4608]; +ld.shared.u32 r2401, [r2760+5120]; +ld.shared.u32 r2597, [r2760+5632]; +ld.shared.u32 r2363, [r2760+6144]; +ld.shared.u32 r2559, [r2760+6656]; +ld.shared.u32 r2413, [r2760+7168]; +ld.shared.u32 r2609, [r2760+7680]; +barrier.sync 0; +st.shared.u32 [r2759], r1698; +st.shared.u32 [r2759+64], r1810; +st.shared.u32 [r2759+128], r1847; +st.shared.u32 [r2759+192], r1884; +st.shared.u32 [r2759+256], r1921; +st.shared.u32 [r2759+320], r1958; +st.shared.u32 [r2759+384], r1995; +st.shared.u32 [r2759+448], r2032; +st.shared.u32 [r2759+512], r2069; +st.shared.u32 [r2759+576], r2106; +st.shared.u32 [r2759+640], r2143; +st.shared.u32 [r2759+704], r2180; +st.shared.u32 [r2759+768], r2217; +st.shared.u32 [r2759+832], r2254; +st.shared.u32 [r2759+896], r2291; +st.shared.u32 [r2759+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2760]; +ld.shared.u32 r2549, [r2760+512]; +ld.shared.u32 r2403, [r2760+1024]; +ld.shared.u32 r2599, [r2760+1536]; +ld.shared.u32 r2365, [r2760+2048]; +ld.shared.u32 r2561, [r2760+2560]; 
+ld.shared.u32 r2415, [r2760+3072]; +ld.shared.u32 r2611, [r2760+3584]; +ld.shared.u32 r2354, [r2760+4096]; +ld.shared.u32 r2550, [r2760+4608]; +ld.shared.u32 r2404, [r2760+5120]; +ld.shared.u32 r2600, [r2760+5632]; +ld.shared.u32 r2366, [r2760+6144]; +ld.shared.u32 r2562, [r2760+6656]; +ld.shared.u32 r2416, [r2760+7168]; +ld.shared.u32 r2612, [r2760+7680]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 %0, r2375, r2425; +} +{ +add.f16x2 %1, r2378, r2428; +} +{ +sub.f16x2 %16, r2375, r2425; +} +{ +sub.f16x2 %17, r2378, r2428; +} +{ +add.f16x2 %4, r2387, r2469; +} +{ +add.f16x2 %5, r2390, r2475; +} +{ +sub.f16x2 %20, r2387, r2469; +} +{ +sub.f16x2 %21, r2390, r2475; +} +{ +add.f16x2 %8, r2381, r2479; +} +{ +add.f16x2 %9, r2384, r2431; +} +{ +sub.f16x2 %24, r2381, r2479; +} +{ +sub.f16x2 %25, r2384, r2431; +} +{ +add.f16x2 %12, r2393, r2487; +} +{ +add.f16x2 %13, r2396, r2493; +} +{ +sub.f16x2 %28, r2393, r2487; +} +{ +sub.f16x2 %29, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ 
+sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 %2, r2571, r2621; +} +{ +add.f16x2 %3, r2574, r2624; +} +{ +sub.f16x2 %18, r2571, r2621; +} +{ +sub.f16x2 %19, r2574, r2624; +} +{ +add.f16x2 %6, r2583, r2665; +} +{ +add.f16x2 %7, r2586, r2671; +} +{ +sub.f16x2 %22, r2583, r2665; +} +{ +sub.f16x2 %23, r2586, r2671; +} +{ +add.f16x2 %10, r2577, r2675; +} +{ +add.f16x2 %11, r2580, r2627; +} +{ +sub.f16x2 %26, r2577, r2675; +} +{ +sub.f16x2 %27, r2580, r2627; +} +{ +add.f16x2 %14, r2589, r2683; +} +{ +add.f16x2 %15, r2592, r2689; +} +{ +sub.f16x2 %30, r2589, r2683; +} +{ +sub.f16x2 %31, r2592, r2689; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1041, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1502>; +.reg .b64 rd<2>; +mov.u32 r1475, %tid.y; +shl.b32 r1476, r1475, 13; +mov.u32 r1477, %16; +add.s32 r1478, r1477, r1476; +mov.u32 r1479, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ 
+mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1480, r1479, 255; +shl.b32 r1481, r1479, 5; +and.b32 r1482, r1481, -8192; +add.s32 r1483, r1478, r1482; +cvt.rn.f32.u32 f139, r1480; +mul.f32 f140, f139, 0f3B490FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, 
r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ 
+fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 
r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1484, r1481, 8160; +add.s32 r1485, r1483, r1484; +st.shared.v4.f32 [r1485], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1485+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r1486, r1480, -28, r1485; +ld.shared.u32 r460, [r1486]; +ld.shared.u32 r510, [r1486+1024]; +ld.shared.u32 r472, [r1486+2048]; +ld.shared.u32 r522, [r1486+3072]; +ld.shared.u32 r461, [r1486+4096]; +ld.shared.u32 r511, [r1486+5120]; +ld.shared.u32 r473, [r1486+6144]; +ld.shared.u32 r523, [r1486+7168]; +barrier.sync 0; +st.shared.v4.f32 [r1485], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1485+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1486]; +ld.shared.u32 r513, [r1486+1024]; +ld.shared.u32 r475, [r1486+2048]; +ld.shared.u32 r525, [r1486+3072]; +ld.shared.u32 r464, [r1486+4096]; +ld.shared.u32 r514, [r1486+5120]; +ld.shared.u32 r476, [r1486+6144]; +ld.shared.u32 r526, [r1486+7168]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ 
+sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ 
+sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1487, r1479, 248; +bfe.u32 r1488, r1479, 3, 5; +shl.b32 r1489, r1479, 2; +and.b32 r1490, r1489, 28; +add.s32 r1491, r1483, r1490; +cvt.rn.f32.u32 f142, r1488; +mul.f32 f143, f142, 0f3CC90FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ 
+fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1492, r1481, 7936; +add.s32 r1493, r1491, r1492; +st.shared.u32 [r1493], r607; +st.shared.u32 [r1493+32], r665; +st.shared.u32 [r1493+64], r702; +st.shared.u32 
[r1493+96], r739; +st.shared.u32 [r1493+128], r776; +st.shared.u32 [r1493+160], r813; +st.shared.u32 [r1493+192], r850; +st.shared.u32 [r1493+224], r887; +barrier.sync 0; +mad.lo.s32 r1494, r1487, -28, r1493; +ld.shared.u32 r918, [r1494]; +ld.shared.u32 r968, [r1494+1024]; +ld.shared.u32 r930, [r1494+2048]; +ld.shared.u32 r980, [r1494+3072]; +ld.shared.u32 r919, [r1494+4096]; +ld.shared.u32 r969, [r1494+5120]; +ld.shared.u32 r931, [r1494+6144]; +ld.shared.u32 r981, [r1494+7168]; +barrier.sync 0; +st.shared.u32 [r1493], r610; +st.shared.u32 [r1493+32], r674; +st.shared.u32 [r1493+64], r711; +st.shared.u32 [r1493+96], r748; +st.shared.u32 [r1493+128], r785; +st.shared.u32 [r1493+160], r822; +st.shared.u32 [r1493+192], r859; +st.shared.u32 [r1493+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1494]; +ld.shared.u32 r971, [r1494+1024]; +ld.shared.u32 r933, [r1494+2048]; +ld.shared.u32 r983, [r1494+3072]; +ld.shared.u32 r922, [r1494+4096]; +ld.shared.u32 r972, [r1494+5120]; +ld.shared.u32 r934, [r1494+6144]; +ld.shared.u32 r984, [r1494+7168]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ 
+add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1495, r1479, 192; +bfe.u32 r1496, r1479, 6, 2; +and.b32 r1497, r1489, 252; +add.s32 r1498, r1483, r1497; +cvt.rn.f32.u32 f145, r1496; +mul.f32 f146, f145, 0f3E490FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ 
+mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; 
+mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +barrier.sync 0; +and.b32 r1499, r1481, 6144; +add.s32 r1500, r1498, r1499; +st.shared.u32 [r1500], r1065; +st.shared.u32 [r1500+256], r1123; +st.shared.u32 [r1500+512], r1160; +st.shared.u32 [r1500+768], r1197; +st.shared.u32 [r1500+1024], r1234; +st.shared.u32 [r1500+1280], r1271; +st.shared.u32 [r1500+1536], 
r1308; +st.shared.u32 [r1500+1792], r1345; +barrier.sync 0; +mad.lo.s32 r1501, r1495, -28, r1500; +ld.shared.u32 r1376, [r1501]; +ld.shared.u32 r1426, [r1501+1024]; +ld.shared.u32 r1388, [r1501+2048]; +ld.shared.u32 r1438, [r1501+3072]; +ld.shared.u32 r1377, [r1501+4096]; +ld.shared.u32 r1427, [r1501+5120]; +ld.shared.u32 r1389, [r1501+6144]; +ld.shared.u32 r1439, [r1501+7168]; +barrier.sync 0; +st.shared.u32 [r1500], r1068; +st.shared.u32 [r1500+256], r1132; +st.shared.u32 [r1500+512], r1169; +st.shared.u32 [r1500+768], r1206; +st.shared.u32 [r1500+1024], r1243; +st.shared.u32 [r1500+1280], r1280; +st.shared.u32 [r1500+1536], r1317; +st.shared.u32 [r1500+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1501]; +ld.shared.u32 r1429, [r1501+1024]; +ld.shared.u32 r1391, [r1501+2048]; +ld.shared.u32 r1441, [r1501+3072]; +ld.shared.u32 r1380, [r1501+4096]; +ld.shared.u32 r1430, [r1501+5120]; +ld.shared.u32 r1392, [r1501+6144]; +ld.shared.u32 r1442, [r1501+7168]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 %0, r1375, r1387; +} +{ +add.f16x2 %1, r1378, r1390; +} +{ +sub.f16x2 %8, r1375, r1387; +} +{ +sub.f16x2 %9, r1378, r1390; +} +{ +add.f16x2 %4, r1381, r1399; +} +{ +add.f16x2 %5, r1384, r1393; +} +{ +sub.f16x2 %12, r1381, r1399; +} +{ +sub.f16x2 %13, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 %2, r1425, r1437; +} +{ +add.f16x2 %3, r1428, r1440; +} 
+{ +sub.f16x2 %10, r1425, r1437; +} +{ +sub.f16x2 %11, r1428, r1440; +} +{ +add.f16x2 %6, r1431, r1449; +} +{ +add.f16x2 %7, r1434, r1443; +} +{ +sub.f16x2 %14, r1431, r1449; +} +{ +sub.f16x2 %15, r1434, r1443; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1040, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<148>; +.reg .b32 r<1502>; +.reg .b64 rd<2>; +mov.u32 r1475, %tid.y; +shl.b32 r1476, r1475, 14; +mov.u32 r1477, %16; +add.s32 r1478, r1477, r1476; +mov.u32 r1479, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, 
r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 
r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1480, r1479, 255; +shl.b32 r1481, r1479, 6; +and.b32 r1482, r1481, -16384; +add.s32 r1483, r1478, r1482; +cvt.rn.f32.u32 f139, r1480; +mul.f32 f140, f139, 0f3B490FDB; +cos.approx.f32 f29, f140; +sin.approx.f32 f141, f140; +neg.f32 f30, f141; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 
r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, 
{high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1484, r1481, 16320; +add.s32 r1485, r1483, r1484; +st.shared.v4.f32 [r1485], {r149, r152, r207, r216}; 
+st.shared.v4.f32 [r1485+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1485+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1485+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1486, r1480, -56, r1485; +ld.shared.u32 r460, [r1486]; +ld.shared.u32 r463, [r1486+4]; +ld.shared.u32 r510, [r1486+2048]; +ld.shared.u32 r513, [r1486+2052]; +ld.shared.u32 r472, [r1486+4096]; +ld.shared.u32 r475, [r1486+4100]; +ld.shared.u32 r522, [r1486+6144]; +ld.shared.u32 r525, [r1486+6148]; +ld.shared.u32 r461, [r1486+8192]; +ld.shared.u32 r464, [r1486+8196]; +ld.shared.u32 r511, [r1486+10240]; +ld.shared.u32 r514, [r1486+10244]; +ld.shared.u32 r473, [r1486+12288]; +ld.shared.u32 r476, [r1486+12292]; +ld.shared.u32 r523, [r1486+14336]; +ld.shared.u32 r526, [r1486+14340]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1487, r1479, 248; +bfe.u32 r1488, r1479, 3, 5; +cvt.rn.f32.u32 f142, r1488; +mul.f32 f143, f142, 0f3CC90FDB; +cos.approx.f32 f75, f143; +sin.approx.f32 f144, f143; +neg.f32 f76, f144; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, 
r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} 
+{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ 
+mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1489, r1479, 3; +and.b32 r1490, r1489, 56; +add.s32 r1491, r1483, r1490; +barrier.sync 0; +and.b32 r1492, r1481, 15872; +add.s32 r1493, r1491, r1492; +st.shared.u32 [r1493], r607; +st.shared.u32 [r1493+4], r610; +st.shared.u32 [r1493+64], r665; +st.shared.u32 [r1493+68], r674; +st.shared.u32 [r1493+128], r702; +st.shared.u32 [r1493+132], r711; +st.shared.u32 [r1493+192], r739; +st.shared.u32 [r1493+196], r748; +st.shared.u32 [r1493+256], r776; +st.shared.u32 [r1493+260], r785; +st.shared.u32 [r1493+320], r813; +st.shared.u32 [r1493+324], r822; +st.shared.u32 [r1493+384], r850; +st.shared.u32 [r1493+388], r859; +st.shared.u32 [r1493+448], r887; +st.shared.u32 [r1493+452], r896; +barrier.sync 0; +mad.lo.s32 r1494, r1487, -56, r1493; +ld.shared.u32 r918, [r1494]; +ld.shared.u32 r921, [r1494+4]; +ld.shared.u32 r968, [r1494+2048]; +ld.shared.u32 r971, [r1494+2052]; +ld.shared.u32 r930, [r1494+4096]; +ld.shared.u32 r933, [r1494+4100]; +ld.shared.u32 r980, 
[r1494+6144]; +ld.shared.u32 r983, [r1494+6148]; +ld.shared.u32 r919, [r1494+8192]; +ld.shared.u32 r922, [r1494+8196]; +ld.shared.u32 r969, [r1494+10240]; +ld.shared.u32 r972, [r1494+10244]; +ld.shared.u32 r931, [r1494+12288]; +ld.shared.u32 r934, [r1494+12292]; +ld.shared.u32 r981, [r1494+14336]; +ld.shared.u32 r984, [r1494+14340]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, 
r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1495, r1479, 192; +bfe.u32 r1496, r1479, 6, 2; +cvt.rn.f32.u32 f145, r1496; +mul.f32 f146, f145, 0f3E490FDB; +cos.approx.f32 f121, f146; +sin.approx.f32 f147, f146; +neg.f32 f122, f147; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ 
+mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 
r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +and.b32 r1497, r1489, 504; +add.s32 r1498, r1483, r1497; +barrier.sync 0; +and.b32 r1499, r1481, 12288; +add.s32 r1500, r1498, r1499; +st.shared.u32 [r1500], r1065; +st.shared.u32 [r1500+4], r1068; +st.shared.u32 [r1500+512], r1123; +st.shared.u32 [r1500+516], r1132; +st.shared.u32 [r1500+1024], r1160; +st.shared.u32 [r1500+1028], r1169; +st.shared.u32 [r1500+1536], r1197; +st.shared.u32 [r1500+1540], r1206; +st.shared.u32 [r1500+2048], r1234; +st.shared.u32 [r1500+2052], r1243; +st.shared.u32 [r1500+2560], r1271; +st.shared.u32 [r1500+2564], r1280; +st.shared.u32 [r1500+3072], r1308; +st.shared.u32 [r1500+3076], r1317; +st.shared.u32 [r1500+3584], r1345; +st.shared.u32 [r1500+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1501, r1495, -56, r1500; +ld.shared.u32 r1376, [r1501]; +ld.shared.u32 r1379, [r1501+4]; +ld.shared.u32 r1426, [r1501+2048]; +ld.shared.u32 r1429, [r1501+2052]; +ld.shared.u32 r1388, [r1501+4096]; +ld.shared.u32 r1391, [r1501+4100]; +ld.shared.u32 r1438, [r1501+6144]; +ld.shared.u32 r1441, [r1501+6148]; +ld.shared.u32 r1377, [r1501+8192]; +ld.shared.u32 r1380, 
[r1501+8196]; +ld.shared.u32 r1427, [r1501+10240]; +ld.shared.u32 r1430, [r1501+10244]; +ld.shared.u32 r1389, [r1501+12288]; +ld.shared.u32 r1392, [r1501+12292]; +ld.shared.u32 r1439, [r1501+14336]; +ld.shared.u32 r1442, [r1501+14340]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 %0, r1375, r1387; +} +{ +add.f16x2 %1, r1378, r1390; +} +{ +sub.f16x2 %8, r1375, r1387; +} +{ +sub.f16x2 %9, r1378, r1390; +} +{ +add.f16x2 %4, r1381, r1399; +} +{ +add.f16x2 %5, r1384, r1393; +} +{ +sub.f16x2 %12, r1381, r1399; +} +{ +sub.f16x2 %13, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 %2, r1425, r1437; +} +{ +add.f16x2 %3, r1428, r1440; +} +{ +sub.f16x2 %10, r1425, r1437; +} +{ +sub.f16x2 %11, r1428, r1440; +} +{ +add.f16x2 %6, r1431, r1449; +} +{ +add.f16x2 %7, r1434, r1443; +} +{ +sub.f16x2 %14, r1431, r1449; +} +{ +sub.f16x2 %15, r1434, r1443; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): 
"r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1042, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<363>; +.reg .b32 r<2761>; +.reg .b64 rd<2>; +mov.u32 r2741, %tid.y; +shl.b32 r2742, r2741, 14; +mov.u32 r2743, %32; +add.s32 r2744, r2743, r2742; +mov.u32 r2745, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f340, 0f3F3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f338, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} 
+{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 
0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, 
r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2746, r2745, 127; +shl.b32 r2747, r2745, 7; +and.b32 r2748, r2747, -16384; +add.s32 r2749, r2744, r2748; +cvt.rn.f32.u32 f357, r2746; +mul.f32 f358, f357, 0f3B490FDB; +cos.approx.f32 f117, f358; +sin.approx.f32 f359, f358; +neg.f32 f118, f359; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; 
+cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, 
r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; 
+mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, 
high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2750, r2747, 16256; +add.s32 r2751, r2749, r2750; +st.shared.v4.f32 [r2751], {r521, r524, r627, r636}; +st.shared.v4.f32 [r2751+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r2751+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r2751+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r2751+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r2751+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r2751+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r2751+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r2752, r2746, -120, r2751; +ld.shared.u32 r1176, [r2752]; +ld.shared.u32 r1179, [r2752+4]; +ld.shared.u32 r1372, [r2752+1024]; +ld.shared.u32 r1375, [r2752+1028]; +ld.shared.u32 r1226, [r2752+2048]; +ld.shared.u32 r1229, [r2752+2052]; +ld.shared.u32 r1422, [r2752+3072]; +ld.shared.u32 r1425, [r2752+3076]; +ld.shared.u32 r1188, [r2752+4096]; +ld.shared.u32 r1191, [r2752+4100]; +ld.shared.u32 r1384, [r2752+5120]; +ld.shared.u32 r1387, [r2752+5124]; +ld.shared.u32 r1238, [r2752+6144]; +ld.shared.u32 r1241, [r2752+6148]; +ld.shared.u32 r1434, [r2752+7168]; +ld.shared.u32 r1437, [r2752+7172]; +ld.shared.u32 r1177, [r2752+8192]; +ld.shared.u32 r1180, [r2752+8196]; +ld.shared.u32 r1373, [r2752+9216]; +ld.shared.u32 r1376, [r2752+9220]; +ld.shared.u32 r1227, [r2752+10240]; +ld.shared.u32 r1230, [r2752+10244]; +ld.shared.u32 r1423, [r2752+11264]; +ld.shared.u32 r1426, [r2752+11268]; +ld.shared.u32 r1189, [r2752+12288]; +ld.shared.u32 r1192, [r2752+12292]; +ld.shared.u32 r1385, [r2752+13312]; +ld.shared.u32 r1388, [r2752+13316]; +ld.shared.u32 r1239, [r2752+14336]; +ld.shared.u32 r1242, [r2752+14340]; +ld.shared.u32 r1435, [r2752+15360]; +ld.shared.u32 r1438, [r2752+15364]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ 
+add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ 
+sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, 
r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; 
+cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ 
+mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2753, r2745, 112; +bfe.u32 r2754, r2745, 4, 3; +cvt.rn.f32.u32 f360, r2754; +mul.f32 f361, f360, 0f3D490FDB; +cos.approx.f32 f267, f361; +sin.approx.f32 f362, f361; +neg.f32 f268, f362; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, 
high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ 
+neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} 
+{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ 
+fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r2755, r2745, 3; +and.b32 r2756, r2755, 120; +add.s32 r2757, r2749, r2756; +barrier.sync 0; +and.b32 r2758, r2747, 14336; +add.s32 r2759, r2757, r2758; +st.shared.u32 [r2759], r1695; +st.shared.u32 [r2759+4], r1698; +st.shared.u32 [r2759+128], r1801; +st.shared.u32 [r2759+132], r1810; +st.shared.u32 [r2759+256], r1838; +st.shared.u32 [r2759+260], r1847; +st.shared.u32 [r2759+384], r1875; +st.shared.u32 [r2759+388], r1884; +st.shared.u32 [r2759+512], r1912; +st.shared.u32 [r2759+516], r1921; +st.shared.u32 [r2759+640], r1949; +st.shared.u32 [r2759+644], r1958; +st.shared.u32 [r2759+768], r1986; +st.shared.u32 [r2759+772], r1995; +st.shared.u32 [r2759+896], r2023; +st.shared.u32 [r2759+900], r2032; +st.shared.u32 [r2759+1024], r2060; +st.shared.u32 [r2759+1028], r2069; +st.shared.u32 [r2759+1152], r2097; +st.shared.u32 [r2759+1156], r2106; +st.shared.u32 [r2759+1280], r2134; +st.shared.u32 [r2759+1284], r2143; +st.shared.u32 [r2759+1408], r2171; +st.shared.u32 [r2759+1412], r2180; +st.shared.u32 [r2759+1536], r2208; +st.shared.u32 [r2759+1540], r2217; +st.shared.u32 [r2759+1664], r2245; +st.shared.u32 [r2759+1668], r2254; +st.shared.u32 [r2759+1792], r2282; +st.shared.u32 [r2759+1796], r2291; +st.shared.u32 [r2759+1920], r2319; +st.shared.u32 [r2759+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2760, r2753, -120, r2759; +ld.shared.u32 r2350, [r2760]; +ld.shared.u32 r2353, [r2760+4]; +ld.shared.u32 r2546, [r2760+1024]; +ld.shared.u32 r2549, [r2760+1028]; +ld.shared.u32 r2400, [r2760+2048]; +ld.shared.u32 r2403, [r2760+2052]; +ld.shared.u32 r2596, [r2760+3072]; +ld.shared.u32 r2599, [r2760+3076]; +ld.shared.u32 r2362, [r2760+4096]; +ld.shared.u32 r2365, [r2760+4100]; +ld.shared.u32 r2558, [r2760+5120]; +ld.shared.u32 r2561, [r2760+5124]; +ld.shared.u32 r2412, [r2760+6144]; +ld.shared.u32 r2415, 
[r2760+6148]; +ld.shared.u32 r2608, [r2760+7168]; +ld.shared.u32 r2611, [r2760+7172]; +ld.shared.u32 r2351, [r2760+8192]; +ld.shared.u32 r2354, [r2760+8196]; +ld.shared.u32 r2547, [r2760+9216]; +ld.shared.u32 r2550, [r2760+9220]; +ld.shared.u32 r2401, [r2760+10240]; +ld.shared.u32 r2404, [r2760+10244]; +ld.shared.u32 r2597, [r2760+11264]; +ld.shared.u32 r2600, [r2760+11268]; +ld.shared.u32 r2363, [r2760+12288]; +ld.shared.u32 r2366, [r2760+12292]; +ld.shared.u32 r2559, [r2760+13312]; +ld.shared.u32 r2562, [r2760+13316]; +ld.shared.u32 r2413, [r2760+14336]; +ld.shared.u32 r2416, [r2760+14340]; +ld.shared.u32 r2609, [r2760+15360]; +ld.shared.u32 r2612, [r2760+15364]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 %0, r2375, r2425; +} +{ +add.f16x2 %1, r2378, r2428; +} +{ +sub.f16x2 %16, r2375, r2425; +} +{ +sub.f16x2 %17, r2378, r2428; +} +{ +add.f16x2 %4, r2387, r2469; +} +{ +add.f16x2 %5, r2390, r2475; +} +{ +sub.f16x2 %20, r2387, r2469; +} +{ +sub.f16x2 %21, r2390, r2475; +} +{ +add.f16x2 %8, r2381, r2479; +} +{ +add.f16x2 %9, r2384, r2431; +} +{ +sub.f16x2 %24, r2381, r2479; +} +{ +sub.f16x2 %25, r2384, r2431; +} +{ +add.f16x2 %12, r2393, r2487; +} +{ +add.f16x2 %13, r2396, r2493; +} +{ +sub.f16x2 %28, r2393, r2487; +} +{ +sub.f16x2 %29, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, 
r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 %2, r2571, r2621; +} +{ +add.f16x2 %3, r2574, r2624; +} +{ +sub.f16x2 %18, r2571, r2621; +} +{ +sub.f16x2 %19, r2574, r2624; +} +{ +add.f16x2 %6, r2583, r2665; +} +{ +add.f16x2 %7, r2586, r2671; +} +{ +sub.f16x2 %22, r2583, r2665; +} +{ +sub.f16x2 %23, r2586, r2671; +} +{ +add.f16x2 %10, r2577, r2675; +} +{ +add.f16x2 %11, r2580, r2627; +} +{ +sub.f16x2 %26, r2577, r2675; +} +{ +sub.f16x2 %27, r2580, r2627; +} +{ 
+add.f16x2 %14, r2589, r2683; +} +{ +add.f16x2 %15, r2592, r2689; +} +{ +sub.f16x2 %30, r2589, r2683; +} +{ +sub.f16x2 %31, r2592, r2689; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), 
"r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1043, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6003>; +.reg .b64 rd<3>; +mov.u32 r5917, %tid.y; +shl.b32 r5918, r5917, 14; +mov.u32 r5919, %64; +add.s32 r5920, r5919, r5918; +mov.u32 r5921, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; 
+cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, 
r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 
high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; 
+} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} 
+{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 
r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ 
+sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 
r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; 
+} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 
high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 
r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, 
r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, 
r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r5923, r5921, 8; +and.b32 r5924, r5923, -16384; +add.s32 r5925, r5920, r5924; +and.b32 r5938, r5921, 63; +cvt.rn.f32.u32 f845, r5938; +mul.f32 f846, f845, 0f3B490FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ 
+mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, 
r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ 
+fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, 
r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, 
r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; 
+mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r5926, r5923, 16128; +add.s32 r5927, r5925, r5926; +st.shared.v4.f32 [r5927], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r5927+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r5927+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r5927+48], 
{r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r5927+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r5927+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r5927+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r5927+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r5927+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r5927+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r5927+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r5927+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r5927+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r5927+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r5927+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r5927+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r5928, r5938, -248, r5927; +ld.shared.u32 r2864, [r5928]; +ld.shared.u32 r2867, [r5928+4]; +ld.shared.u32 r3480, [r5928+512]; +ld.shared.u32 r3483, [r5928+516]; +ld.shared.u32 r3060, [r5928+1024]; +ld.shared.u32 r3063, [r5928+1028]; +ld.shared.u32 r3676, [r5928+1536]; +ld.shared.u32 r3679, [r5928+1540]; +ld.shared.u32 r2914, [r5928+2048]; +ld.shared.u32 r2917, [r5928+2052]; +ld.shared.u32 r3530, [r5928+2560]; +ld.shared.u32 r3533, [r5928+2564]; +ld.shared.u32 r3110, [r5928+3072]; +ld.shared.u32 r3113, [r5928+3076]; +ld.shared.u32 r3726, [r5928+3584]; +ld.shared.u32 r3729, [r5928+3588]; +ld.shared.u32 r2876, [r5928+4096]; +ld.shared.u32 r2879, [r5928+4100]; +ld.shared.u32 r3492, [r5928+4608]; +ld.shared.u32 r3495, [r5928+4612]; +ld.shared.u32 r3072, [r5928+5120]; +ld.shared.u32 r3075, [r5928+5124]; +ld.shared.u32 r3688, [r5928+5632]; +ld.shared.u32 r3691, [r5928+5636]; +ld.shared.u32 r2926, [r5928+6144]; +ld.shared.u32 r2929, [r5928+6148]; +ld.shared.u32 r3542, [r5928+6656]; +ld.shared.u32 r3545, [r5928+6660]; +ld.shared.u32 r3122, [r5928+7168]; +ld.shared.u32 r3125, [r5928+7172]; +ld.shared.u32 r3738, [r5928+7680]; +ld.shared.u32 r3741, [r5928+7684]; +ld.shared.u32 r2865, [r5928+8192]; 
+ld.shared.u32 r2868, [r5928+8196]; +ld.shared.u32 r3481, [r5928+8704]; +ld.shared.u32 r3484, [r5928+8708]; +ld.shared.u32 r3061, [r5928+9216]; +ld.shared.u32 r3064, [r5928+9220]; +ld.shared.u32 r3677, [r5928+9728]; +ld.shared.u32 r3680, [r5928+9732]; +ld.shared.u32 r2915, [r5928+10240]; +ld.shared.u32 r2918, [r5928+10244]; +ld.shared.u32 r3531, [r5928+10752]; +ld.shared.u32 r3534, [r5928+10756]; +ld.shared.u32 r3111, [r5928+11264]; +ld.shared.u32 r3114, [r5928+11268]; +ld.shared.u32 r3727, [r5928+11776]; +ld.shared.u32 r3730, [r5928+11780]; +ld.shared.u32 r2877, [r5928+12288]; +ld.shared.u32 r2880, [r5928+12292]; +ld.shared.u32 r3493, [r5928+12800]; +ld.shared.u32 r3496, [r5928+12804]; +ld.shared.u32 r3073, [r5928+13312]; +ld.shared.u32 r3076, [r5928+13316]; +ld.shared.u32 r3689, [r5928+13824]; +ld.shared.u32 r3692, [r5928+13828]; +ld.shared.u32 r2927, [r5928+14336]; +ld.shared.u32 r2930, [r5928+14340]; +ld.shared.u32 r3543, [r5928+14848]; +ld.shared.u32 r3546, [r5928+14852]; +ld.shared.u32 r3123, [r5928+15360]; +ld.shared.u32 r3126, [r5928+15364]; +ld.shared.u32 r3739, [r5928+15872]; +ld.shared.u32 r3742, [r5928+15876]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, 
r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 
r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 
r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; 
+mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 
r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; 
+} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, 
r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ 
+mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, 
f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; 
+cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ 
+mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 
r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} 
+{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r5930, r5921, 5, 1; +cvt.rn.f32.u32 f848, r5930; +mul.f32 f849, f848, 0f3DC90FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +and.b32 r5937, r5921, 32; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, 
r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, 
r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 
r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ 
+mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; 
+mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 
r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ 
+fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ 
+fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +shl.b32 r5931, r5921, 3; +and.b32 r5932, r5931, 248; +add.s32 r5933, r5925, r5932; +barrier.sync 0; +and.b32 r5934, r5923, 8192; +add.s32 r5935, r5933, r5934; +st.shared.u32 [r5935], r4383; +st.shared.u32 [r5935+4], r4386; +st.shared.u32 [r5935+256], r4585; +st.shared.u32 [r5935+260], r4594; +st.shared.u32 [r5935+512], r4622; +st.shared.u32 [r5935+516], r4631; +st.shared.u32 [r5935+768], r4659; +st.shared.u32 [r5935+772], r4668; +st.shared.u32 [r5935+1024], r4696; +st.shared.u32 [r5935+1028], r4705; +st.shared.u32 [r5935+1280], r4733; +st.shared.u32 [r5935+1284], r4742; +st.shared.u32 [r5935+1536], r4770; +st.shared.u32 [r5935+1540], r4779; +st.shared.u32 [r5935+1792], r4807; +st.shared.u32 [r5935+1796], r4816; +st.shared.u32 [r5935+2048], r4844; +st.shared.u32 [r5935+2052], r4853; +st.shared.u32 [r5935+2304], r4881; +st.shared.u32 [r5935+2308], r4890; +st.shared.u32 [r5935+2560], r4918; +st.shared.u32 [r5935+2564], r4927; +st.shared.u32 [r5935+2816], 
r4955; +st.shared.u32 [r5935+2820], r4964; +st.shared.u32 [r5935+3072], r4992; +st.shared.u32 [r5935+3076], r5001; +st.shared.u32 [r5935+3328], r5029; +st.shared.u32 [r5935+3332], r5038; +st.shared.u32 [r5935+3584], r5066; +st.shared.u32 [r5935+3588], r5075; +st.shared.u32 [r5935+3840], r5103; +st.shared.u32 [r5935+3844], r5112; +st.shared.u32 [r5935+4096], r5140; +st.shared.u32 [r5935+4100], r5149; +st.shared.u32 [r5935+4352], r5177; +st.shared.u32 [r5935+4356], r5186; +st.shared.u32 [r5935+4608], r5214; +st.shared.u32 [r5935+4612], r5223; +st.shared.u32 [r5935+4864], r5251; +st.shared.u32 [r5935+4868], r5260; +st.shared.u32 [r5935+5120], r5288; +st.shared.u32 [r5935+5124], r5297; +st.shared.u32 [r5935+5376], r5325; +st.shared.u32 [r5935+5380], r5334; +st.shared.u32 [r5935+5632], r5362; +st.shared.u32 [r5935+5636], r5371; +st.shared.u32 [r5935+5888], r5399; +st.shared.u32 [r5935+5892], r5408; +st.shared.u32 [r5935+6144], r5436; +st.shared.u32 [r5935+6148], r5445; +st.shared.u32 [r5935+6400], r5473; +st.shared.u32 [r5935+6404], r5482; +st.shared.u32 [r5935+6656], r5510; +st.shared.u32 [r5935+6660], r5519; +st.shared.u32 [r5935+6912], r5547; +st.shared.u32 [r5935+6916], r5556; +st.shared.u32 [r5935+7168], r5584; +st.shared.u32 [r5935+7172], r5593; +st.shared.u32 [r5935+7424], r5621; +st.shared.u32 [r5935+7428], r5630; +st.shared.u32 [r5935+7680], r5658; +st.shared.u32 [r5935+7684], r5667; +st.shared.u32 [r5935+7936], r5695; +st.shared.u32 [r5935+7940], r5704; +barrier.sync 0; +mad.lo.s32 r5936, r5937, -248, r5935; +ld.shared.u32 r5726, [r5936]; +ld.shared.u32 r5729, [r5936+4]; +ld.shared.u32 r5738, [r5936+512]; +ld.shared.u32 r5741, [r5936+516]; +ld.shared.u32 r5750, [r5936+1024]; +ld.shared.u32 r5753, [r5936+1028]; +ld.shared.u32 r5762, [r5936+1536]; +ld.shared.u32 r5765, [r5936+1540]; +ld.shared.u32 r5774, [r5936+2048]; +ld.shared.u32 r5777, [r5936+2052]; +ld.shared.u32 r5786, [r5936+2560]; +ld.shared.u32 r5789, [r5936+2564]; +ld.shared.u32 r5798, [r5936+3072]; 
+ld.shared.u32 r5801, [r5936+3076]; +ld.shared.u32 r5810, [r5936+3584]; +ld.shared.u32 r5813, [r5936+3588]; +ld.shared.u32 r5822, [r5936+4096]; +ld.shared.u32 r5825, [r5936+4100]; +ld.shared.u32 r5834, [r5936+4608]; +ld.shared.u32 r5837, [r5936+4612]; +ld.shared.u32 r5846, [r5936+5120]; +ld.shared.u32 r5849, [r5936+5124]; +ld.shared.u32 r5858, [r5936+5632]; +ld.shared.u32 r5861, [r5936+5636]; +ld.shared.u32 r5870, [r5936+6144]; +ld.shared.u32 r5873, [r5936+6148]; +ld.shared.u32 r5882, [r5936+6656]; +ld.shared.u32 r5885, [r5936+6660]; +ld.shared.u32 r5894, [r5936+7168]; +ld.shared.u32 r5897, [r5936+7172]; +ld.shared.u32 r5906, [r5936+7680]; +ld.shared.u32 r5909, [r5936+7684]; +ld.shared.u32 r5727, [r5936+8192]; +ld.shared.u32 r5730, [r5936+8196]; +ld.shared.u32 r5739, [r5936+8704]; +ld.shared.u32 r5742, [r5936+8708]; +ld.shared.u32 r5751, [r5936+9216]; +ld.shared.u32 r5754, [r5936+9220]; +ld.shared.u32 r5763, [r5936+9728]; +ld.shared.u32 r5766, [r5936+9732]; +ld.shared.u32 r5775, [r5936+10240]; +ld.shared.u32 r5778, [r5936+10244]; +ld.shared.u32 r5787, [r5936+10752]; +ld.shared.u32 r5790, [r5936+10756]; +ld.shared.u32 r5799, [r5936+11264]; +ld.shared.u32 r5802, [r5936+11268]; +ld.shared.u32 r5811, [r5936+11776]; +ld.shared.u32 r5814, [r5936+11780]; +ld.shared.u32 r5823, [r5936+12288]; +ld.shared.u32 r5826, [r5936+12292]; +ld.shared.u32 r5835, [r5936+12800]; +ld.shared.u32 r5838, [r5936+12804]; +ld.shared.u32 r5847, [r5936+13312]; +ld.shared.u32 r5850, [r5936+13316]; +ld.shared.u32 r5859, [r5936+13824]; +ld.shared.u32 r5862, [r5936+13828]; +ld.shared.u32 r5871, [r5936+14336]; +ld.shared.u32 r5874, [r5936+14340]; +ld.shared.u32 r5883, [r5936+14848]; +ld.shared.u32 r5886, [r5936+14852]; +ld.shared.u32 r5895, [r5936+15360]; +ld.shared.u32 r5898, [r5936+15364]; +ld.shared.u32 r5907, [r5936+15872]; +ld.shared.u32 r5910, [r5936+15876]; +{ +add.f16x2 %0, r5726, r5727; +} +{ +add.f16x2 %1, r5729, r5730; +} +{ +sub.f16x2 %32, r5726, r5727; +} +{ +sub.f16x2 %33, r5729, r5730; 
+} +{ +add.f16x2 %2, r5738, r5739; +} +{ +add.f16x2 %3, r5741, r5742; +} +{ +sub.f16x2 %34, r5738, r5739; +} +{ +sub.f16x2 %35, r5741, r5742; +} +{ +add.f16x2 %4, r5750, r5751; +} +{ +add.f16x2 %5, r5753, r5754; +} +{ +sub.f16x2 %36, r5750, r5751; +} +{ +sub.f16x2 %37, r5753, r5754; +} +{ +add.f16x2 %6, r5762, r5763; +} +{ +add.f16x2 %7, r5765, r5766; +} +{ +sub.f16x2 %38, r5762, r5763; +} +{ +sub.f16x2 %39, r5765, r5766; +} +{ +add.f16x2 %8, r5774, r5775; +} +{ +add.f16x2 %9, r5777, r5778; +} +{ +sub.f16x2 %40, r5774, r5775; +} +{ +sub.f16x2 %41, r5777, r5778; +} +{ +add.f16x2 %10, r5786, r5787; +} +{ +add.f16x2 %11, r5789, r5790; +} +{ +sub.f16x2 %42, r5786, r5787; +} +{ +sub.f16x2 %43, r5789, r5790; +} +{ +add.f16x2 %12, r5798, r5799; +} +{ +add.f16x2 %13, r5801, r5802; +} +{ +sub.f16x2 %44, r5798, r5799; +} +{ +sub.f16x2 %45, r5801, r5802; +} +{ +add.f16x2 %14, r5810, r5811; +} +{ +add.f16x2 %15, r5813, r5814; +} +{ +sub.f16x2 %46, r5810, r5811; +} +{ +sub.f16x2 %47, r5813, r5814; +} +{ +add.f16x2 %16, r5822, r5823; +} +{ +add.f16x2 %17, r5825, r5826; +} +{ +sub.f16x2 %48, r5822, r5823; +} +{ +sub.f16x2 %49, r5825, r5826; +} +{ +add.f16x2 %18, r5834, r5835; +} +{ +add.f16x2 %19, r5837, r5838; +} +{ +sub.f16x2 %50, r5834, r5835; +} +{ +sub.f16x2 %51, r5837, r5838; +} +{ +add.f16x2 %20, r5846, r5847; +} +{ +add.f16x2 %21, r5849, r5850; +} +{ +sub.f16x2 %52, r5846, r5847; +} +{ +sub.f16x2 %53, r5849, r5850; +} +{ +add.f16x2 %22, r5858, r5859; +} +{ +add.f16x2 %23, r5861, r5862; +} +{ +sub.f16x2 %54, r5858, r5859; +} +{ +sub.f16x2 %55, r5861, r5862; +} +{ +add.f16x2 %24, r5870, r5871; +} +{ +add.f16x2 %25, r5873, r5874; +} +{ +sub.f16x2 %56, r5870, r5871; +} +{ +sub.f16x2 %57, r5873, r5874; +} +{ +add.f16x2 %26, r5882, r5883; +} +{ +add.f16x2 %27, r5885, r5886; +} +{ +sub.f16x2 %58, r5882, r5883; +} +{ +sub.f16x2 %59, r5885, r5886; +} +{ +add.f16x2 %28, r5894, r5895; +} +{ +add.f16x2 %29, r5897, r5898; +} +{ +sub.f16x2 %60, r5894, r5895; +} +{ +sub.f16x2 %61, 
r5897, r5898; +} +{ +add.f16x2 %30, r5906, r5907; +} +{ +add.f16x2 %31, r5909, r5910; +} +{ +sub.f16x2 %62, r5906, r5907; +} +{ +sub.f16x2 %63, r5909, r5910; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), 
"=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1044, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6002>; +.reg .b64 rd<3>; +mov.u32 r5917, %tid.y; +shl.b32 r5918, r5917, 13; +mov.u32 r5919, %64; +add.s32 r5920, r5919, r5918; +mov.u32 r5921, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, 
{low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, 
%89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; 
+mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ 
+mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ 
+sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, 
r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, 
r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; 
+} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 
r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} 
+mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ 
+mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ 
+sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; 
+} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r5923, r5921, 7; +and.b32 r5924, r5923, -8192; +add.s32 r5925, r5920, r5924; +and.b32 r5937, r5921, 63; +cvt.rn.f32.u32 f845, r5937; +mul.f32 f846, f845, 0f3B490FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, 
r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ 
+fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, 
r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, 
r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; 
+mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ 
+mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; 
+mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 
r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r5926, r5923, 8064; +add.s32 r5927, r5925, r5926; +st.shared.v4.f32 [r5927], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r5927+16], {r1834, r1871, r1908, r1945}; 
+st.shared.v4.f32 [r5927+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r5927+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r5927+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r5927+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r5927+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r5927+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r5928, r5937, -124, r5927; +ld.shared.u32 r2864, [r5928]; +ld.shared.u32 r3480, [r5928+256]; +ld.shared.u32 r3060, [r5928+512]; +ld.shared.u32 r3676, [r5928+768]; +ld.shared.u32 r2914, [r5928+1024]; +ld.shared.u32 r3530, [r5928+1280]; +ld.shared.u32 r3110, [r5928+1536]; +ld.shared.u32 r3726, [r5928+1792]; +ld.shared.u32 r2876, [r5928+2048]; +ld.shared.u32 r3492, [r5928+2304]; +ld.shared.u32 r3072, [r5928+2560]; +ld.shared.u32 r3688, [r5928+2816]; +ld.shared.u32 r2926, [r5928+3072]; +ld.shared.u32 r3542, [r5928+3328]; +ld.shared.u32 r3122, [r5928+3584]; +ld.shared.u32 r3738, [r5928+3840]; +ld.shared.u32 r2865, [r5928+4096]; +ld.shared.u32 r3481, [r5928+4352]; +ld.shared.u32 r3061, [r5928+4608]; +ld.shared.u32 r3677, [r5928+4864]; +ld.shared.u32 r2915, [r5928+5120]; +ld.shared.u32 r3531, [r5928+5376]; +ld.shared.u32 r3111, [r5928+5632]; +ld.shared.u32 r3727, [r5928+5888]; +ld.shared.u32 r2877, [r5928+6144]; +ld.shared.u32 r3493, [r5928+6400]; +ld.shared.u32 r3073, [r5928+6656]; +ld.shared.u32 r3689, [r5928+6912]; +ld.shared.u32 r2927, [r5928+7168]; +ld.shared.u32 r3543, [r5928+7424]; +ld.shared.u32 r3123, [r5928+7680]; +ld.shared.u32 r3739, [r5928+7936]; +barrier.sync 0; +st.shared.v4.f32 [r5927], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r5927+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r5927+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r5927+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r5927+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r5927+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r5927+96], {r2583, r2620, r2657, r2694}; 
+st.shared.v4.f32 [r5927+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r5928]; +ld.shared.u32 r3483, [r5928+256]; +ld.shared.u32 r3063, [r5928+512]; +ld.shared.u32 r3679, [r5928+768]; +ld.shared.u32 r2917, [r5928+1024]; +ld.shared.u32 r3533, [r5928+1280]; +ld.shared.u32 r3113, [r5928+1536]; +ld.shared.u32 r3729, [r5928+1792]; +ld.shared.u32 r2879, [r5928+2048]; +ld.shared.u32 r3495, [r5928+2304]; +ld.shared.u32 r3075, [r5928+2560]; +ld.shared.u32 r3691, [r5928+2816]; +ld.shared.u32 r2929, [r5928+3072]; +ld.shared.u32 r3545, [r5928+3328]; +ld.shared.u32 r3125, [r5928+3584]; +ld.shared.u32 r3741, [r5928+3840]; +ld.shared.u32 r2868, [r5928+4096]; +ld.shared.u32 r3484, [r5928+4352]; +ld.shared.u32 r3064, [r5928+4608]; +ld.shared.u32 r3680, [r5928+4864]; +ld.shared.u32 r2918, [r5928+5120]; +ld.shared.u32 r3534, [r5928+5376]; +ld.shared.u32 r3114, [r5928+5632]; +ld.shared.u32 r3730, [r5928+5888]; +ld.shared.u32 r2880, [r5928+6144]; +ld.shared.u32 r3496, [r5928+6400]; +ld.shared.u32 r3076, [r5928+6656]; +ld.shared.u32 r3692, [r5928+6912]; +ld.shared.u32 r2930, [r5928+7168]; +ld.shared.u32 r3546, [r5928+7424]; +ld.shared.u32 r3126, [r5928+7680]; +ld.shared.u32 r3742, [r5928+7936]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, 
r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ 
+add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ 
+mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ 
+add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, 
r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ 
+add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, 
r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 
r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 
high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, 
r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; 
+} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, 
r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r5929, r5921, 32; +bfe.u32 r5930, r5921, 5, 1; +shl.b32 r5931, r5921, 2; +and.b32 r5932, r5931, 124; +add.s32 r5933, r5925, r5932; +cvt.rn.f32.u32 f848, r5930; +mul.f32 f849, f848, 0f3DC90FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 
r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 
r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; 
+} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ 
+fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ 
+fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, 
{low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +barrier.sync 0; +and.b32 r5934, r5923, 4096; +add.s32 r5935, r5933, r5934; +st.shared.u32 [r5935], r4383; +st.shared.u32 [r5935+128], r4585; +st.shared.u32 [r5935+256], r4622; +st.shared.u32 [r5935+384], r4659; +st.shared.u32 [r5935+512], r4696; +st.shared.u32 [r5935+640], r4733; +st.shared.u32 [r5935+768], r4770; +st.shared.u32 [r5935+896], r4807; +st.shared.u32 [r5935+1024], r4844; +st.shared.u32 [r5935+1152], r4881; +st.shared.u32 [r5935+1280], r4918; +st.shared.u32 [r5935+1408], r4955; +st.shared.u32 [r5935+1536], r4992; +st.shared.u32 [r5935+1664], r5029; +st.shared.u32 [r5935+1792], r5066; +st.shared.u32 [r5935+1920], r5103; +st.shared.u32 [r5935+2048], r5140; +st.shared.u32 [r5935+2176], r5177; +st.shared.u32 [r5935+2304], r5214; +st.shared.u32 [r5935+2432], r5251; 
+st.shared.u32 [r5935+2560], r5288; +st.shared.u32 [r5935+2688], r5325; +st.shared.u32 [r5935+2816], r5362; +st.shared.u32 [r5935+2944], r5399; +st.shared.u32 [r5935+3072], r5436; +st.shared.u32 [r5935+3200], r5473; +st.shared.u32 [r5935+3328], r5510; +st.shared.u32 [r5935+3456], r5547; +st.shared.u32 [r5935+3584], r5584; +st.shared.u32 [r5935+3712], r5621; +st.shared.u32 [r5935+3840], r5658; +st.shared.u32 [r5935+3968], r5695; +barrier.sync 0; +mad.lo.s32 r5936, r5929, -124, r5935; +ld.shared.u32 r5726, [r5936]; +ld.shared.u32 r5738, [r5936+256]; +ld.shared.u32 r5750, [r5936+512]; +ld.shared.u32 r5762, [r5936+768]; +ld.shared.u32 r5774, [r5936+1024]; +ld.shared.u32 r5786, [r5936+1280]; +ld.shared.u32 r5798, [r5936+1536]; +ld.shared.u32 r5810, [r5936+1792]; +ld.shared.u32 r5822, [r5936+2048]; +ld.shared.u32 r5834, [r5936+2304]; +ld.shared.u32 r5846, [r5936+2560]; +ld.shared.u32 r5858, [r5936+2816]; +ld.shared.u32 r5870, [r5936+3072]; +ld.shared.u32 r5882, [r5936+3328]; +ld.shared.u32 r5894, [r5936+3584]; +ld.shared.u32 r5906, [r5936+3840]; +ld.shared.u32 r5727, [r5936+4096]; +ld.shared.u32 r5739, [r5936+4352]; +ld.shared.u32 r5751, [r5936+4608]; +ld.shared.u32 r5763, [r5936+4864]; +ld.shared.u32 r5775, [r5936+5120]; +ld.shared.u32 r5787, [r5936+5376]; +ld.shared.u32 r5799, [r5936+5632]; +ld.shared.u32 r5811, [r5936+5888]; +ld.shared.u32 r5823, [r5936+6144]; +ld.shared.u32 r5835, [r5936+6400]; +ld.shared.u32 r5847, [r5936+6656]; +ld.shared.u32 r5859, [r5936+6912]; +ld.shared.u32 r5871, [r5936+7168]; +ld.shared.u32 r5883, [r5936+7424]; +ld.shared.u32 r5895, [r5936+7680]; +ld.shared.u32 r5907, [r5936+7936]; +barrier.sync 0; +st.shared.u32 [r5935], r4386; +st.shared.u32 [r5935+128], r4594; +st.shared.u32 [r5935+256], r4631; +st.shared.u32 [r5935+384], r4668; +st.shared.u32 [r5935+512], r4705; +st.shared.u32 [r5935+640], r4742; +st.shared.u32 [r5935+768], r4779; +st.shared.u32 [r5935+896], r4816; +st.shared.u32 [r5935+1024], r4853; +st.shared.u32 [r5935+1152], r4890; 
+st.shared.u32 [r5935+1280], r4927; +st.shared.u32 [r5935+1408], r4964; +st.shared.u32 [r5935+1536], r5001; +st.shared.u32 [r5935+1664], r5038; +st.shared.u32 [r5935+1792], r5075; +st.shared.u32 [r5935+1920], r5112; +st.shared.u32 [r5935+2048], r5149; +st.shared.u32 [r5935+2176], r5186; +st.shared.u32 [r5935+2304], r5223; +st.shared.u32 [r5935+2432], r5260; +st.shared.u32 [r5935+2560], r5297; +st.shared.u32 [r5935+2688], r5334; +st.shared.u32 [r5935+2816], r5371; +st.shared.u32 [r5935+2944], r5408; +st.shared.u32 [r5935+3072], r5445; +st.shared.u32 [r5935+3200], r5482; +st.shared.u32 [r5935+3328], r5519; +st.shared.u32 [r5935+3456], r5556; +st.shared.u32 [r5935+3584], r5593; +st.shared.u32 [r5935+3712], r5630; +st.shared.u32 [r5935+3840], r5667; +st.shared.u32 [r5935+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r5936]; +ld.shared.u32 r5741, [r5936+256]; +ld.shared.u32 r5753, [r5936+512]; +ld.shared.u32 r5765, [r5936+768]; +ld.shared.u32 r5777, [r5936+1024]; +ld.shared.u32 r5789, [r5936+1280]; +ld.shared.u32 r5801, [r5936+1536]; +ld.shared.u32 r5813, [r5936+1792]; +ld.shared.u32 r5825, [r5936+2048]; +ld.shared.u32 r5837, [r5936+2304]; +ld.shared.u32 r5849, [r5936+2560]; +ld.shared.u32 r5861, [r5936+2816]; +ld.shared.u32 r5873, [r5936+3072]; +ld.shared.u32 r5885, [r5936+3328]; +ld.shared.u32 r5897, [r5936+3584]; +ld.shared.u32 r5909, [r5936+3840]; +ld.shared.u32 r5730, [r5936+4096]; +ld.shared.u32 r5742, [r5936+4352]; +ld.shared.u32 r5754, [r5936+4608]; +ld.shared.u32 r5766, [r5936+4864]; +ld.shared.u32 r5778, [r5936+5120]; +ld.shared.u32 r5790, [r5936+5376]; +ld.shared.u32 r5802, [r5936+5632]; +ld.shared.u32 r5814, [r5936+5888]; +ld.shared.u32 r5826, [r5936+6144]; +ld.shared.u32 r5838, [r5936+6400]; +ld.shared.u32 r5850, [r5936+6656]; +ld.shared.u32 r5862, [r5936+6912]; +ld.shared.u32 r5874, [r5936+7168]; +ld.shared.u32 r5886, [r5936+7424]; +ld.shared.u32 r5898, [r5936+7680]; +ld.shared.u32 r5910, [r5936+7936]; +{ +add.f16x2 %0, r5726, r5727; +} +{ 
+add.f16x2 %1, r5729, r5730; +} +{ +sub.f16x2 %32, r5726, r5727; +} +{ +sub.f16x2 %33, r5729, r5730; +} +{ +add.f16x2 %2, r5738, r5739; +} +{ +add.f16x2 %3, r5741, r5742; +} +{ +sub.f16x2 %34, r5738, r5739; +} +{ +sub.f16x2 %35, r5741, r5742; +} +{ +add.f16x2 %4, r5750, r5751; +} +{ +add.f16x2 %5, r5753, r5754; +} +{ +sub.f16x2 %36, r5750, r5751; +} +{ +sub.f16x2 %37, r5753, r5754; +} +{ +add.f16x2 %6, r5762, r5763; +} +{ +add.f16x2 %7, r5765, r5766; +} +{ +sub.f16x2 %38, r5762, r5763; +} +{ +sub.f16x2 %39, r5765, r5766; +} +{ +add.f16x2 %8, r5774, r5775; +} +{ +add.f16x2 %9, r5777, r5778; +} +{ +sub.f16x2 %40, r5774, r5775; +} +{ +sub.f16x2 %41, r5777, r5778; +} +{ +add.f16x2 %10, r5786, r5787; +} +{ +add.f16x2 %11, r5789, r5790; +} +{ +sub.f16x2 %42, r5786, r5787; +} +{ +sub.f16x2 %43, r5789, r5790; +} +{ +add.f16x2 %12, r5798, r5799; +} +{ +add.f16x2 %13, r5801, r5802; +} +{ +sub.f16x2 %44, r5798, r5799; +} +{ +sub.f16x2 %45, r5801, r5802; +} +{ +add.f16x2 %14, r5810, r5811; +} +{ +add.f16x2 %15, r5813, r5814; +} +{ +sub.f16x2 %46, r5810, r5811; +} +{ +sub.f16x2 %47, r5813, r5814; +} +{ +add.f16x2 %16, r5822, r5823; +} +{ +add.f16x2 %17, r5825, r5826; +} +{ +sub.f16x2 %48, r5822, r5823; +} +{ +sub.f16x2 %49, r5825, r5826; +} +{ +add.f16x2 %18, r5834, r5835; +} +{ +add.f16x2 %19, r5837, r5838; +} +{ +sub.f16x2 %50, r5834, r5835; +} +{ +sub.f16x2 %51, r5837, r5838; +} +{ +add.f16x2 %20, r5846, r5847; +} +{ +add.f16x2 %21, r5849, r5850; +} +{ +sub.f16x2 %52, r5846, r5847; +} +{ +sub.f16x2 %53, r5849, r5850; +} +{ +add.f16x2 %22, r5858, r5859; +} +{ +add.f16x2 %23, r5861, r5862; +} +{ +sub.f16x2 %54, r5858, r5859; +} +{ +sub.f16x2 %55, r5861, r5862; +} +{ +add.f16x2 %24, r5870, r5871; +} +{ +add.f16x2 %25, r5873, r5874; +} +{ +sub.f16x2 %56, r5870, r5871; +} +{ +sub.f16x2 %57, r5873, r5874; +} +{ +add.f16x2 %26, r5882, r5883; +} +{ +add.f16x2 %27, r5885, r5886; +} +{ +sub.f16x2 %58, r5882, r5883; +} +{ +sub.f16x2 %59, r5885, r5886; +} +{ +add.f16x2 %28, r5894, 
r5895; +} +{ +add.f16x2 %29, r5897, r5898; +} +{ +sub.f16x2 %60, r5894, r5895; +} +{ +sub.f16x2 %61, r5897, r5898; +} +{ +add.f16x2 %30, r5906, r5907; +} +{ +add.f16x2 %31, r5909, r5910; +} +{ +sub.f16x2 %62, r5906, r5907; +} +{ +sub.f16x2 %63, r5909, r5910; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), 
"=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), 
"r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1045, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .b16 rs<3>; +.reg .f32 f<71>; +.reg .b32 r<894>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, %11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r32; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r35; +} +{ +add.f16x2 r52, r20, r29; +} +{ +sub.f16x2 r55, r17, r35; +} +{ +sub.f16x2 r58, r20, r29; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 511; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3B490FDB; +cvt.u16.u32 rs2, r9; +and.b16 rs1, rs2, 511; +mov.f32 f70, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB6_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB6_3; +mov.f32 f70, 0f3B490FC6; +bra.uni LBB6_4; +LBB6_3: +cos.approx.f32 f70, f1; +LBB6_4: +sin.approx.f32 f57, f1; +neg.f32 f8, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f8; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +fma.rn.f16x2 r71, r49, r64, r68; +} +{ +mul.f16x2 r75, r49, r66; +} +{ +neg.f16x2 r78, 
r75; +} +{ +fma.rn.f16x2 r80, r52, r64, r78; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f53, 0fBF800000; +mov.f32 f54, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +fma.rn.f16x2 r108, r43, r101, r105; +} +{ +mul.f16x2 r112, r43, r103; +} +{ +neg.f16x2 r115, r112; +} +{ +fma.rn.f16x2 r117, r46, r101, r115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +fma.rn.f16x2 r145, r55, r138, r142; +} +{ +mul.f16x2 r149, r55, r140; +} +{ +neg.f16x2 r152, r149; +} +{ +fma.rn.f16x2 r154, r58, r138, r152; +} +barrier.sync 0; +mov.u32 r855, %tid.y; +shl.b32 r856, r855, 14; +mov.u32 r857, %8; +add.s32 r858, r857, r856; +shl.b32 r859, r9, 5; +and.b32 r860, r859, -16384; +add.s32 r861, r858, r860; +shl.b32 r862, r10, 5; +add.s32 r863, r861, r862; +st.shared.v4.f32 [r863], {r37, 
r40, r71, r80}; +st.shared.v4.f32 [r863+16], {r108, r117, r145, r154}; +barrier.sync 0; +mad.lo.s32 r864, r10, -24, r863; +ld.shared.u32 r176, [r864]; +ld.shared.u32 r179, [r864+4]; +ld.shared.u32 r188, [r864+4096]; +ld.shared.u32 r191, [r864+4100]; +ld.shared.u32 r177, [r864+8192]; +ld.shared.u32 r180, [r864+8196]; +ld.shared.u32 r189, [r864+12288]; +ld.shared.u32 r192, [r864+12292]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r196; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; +} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r199; +} +{ +add.f16x2 r216, r184, r193; +} +{ +sub.f16x2 r219, r181, r199; +} +{ +sub.f16x2 r222, r184, r193; +} +and.b32 r865, r9, 508; +bfe.u32 r866, r9, 2, 7; +cvt.rn.f32.u32 f58, r866; +mul.f32 f59, f58, 0f3C490FDB; +cos.approx.f32 f17, f59; +sin.approx.f32 f60, f59; +neg.f32 f18, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +fma.rn.f16x2 r235, r213, r228, r232; +} +{ +mul.f16x2 r239, r213, r230; +} +{ +neg.f16x2 r242, r239; +} +{ +fma.rn.f16x2 r244, r216, r228, r242; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +fma.rn.f16x2 r272, r207, r265, r269; +} +{ +mul.f16x2 r276, r207, r267; +} +{ +neg.f16x2 r279, r276; +} +{ +fma.rn.f16x2 r281, r210, r265, r279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r289, {low, high}; +} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +fma.rn.f16x2 r309, r219, r302, r306; +} +{ +mul.f16x2 r313, r219, r304; +} +{ +neg.f16x2 r316, r313; +} +{ +fma.rn.f16x2 r318, r222, r302, r316; +} +shl.b32 r867, r9, 3; +and.b32 r868, r867, 24; +add.s32 r869, r861, r868; +barrier.sync 0; +and.b32 r870, r859, 16256; +add.s32 r871, r869, r870; +st.shared.u32 [r871], r201; +st.shared.u32 [r871+4], r204; +st.shared.u32 [r871+32], r235; +st.shared.u32 [r871+36], r244; +st.shared.u32 [r871+64], r272; +st.shared.u32 [r871+68], r281; +st.shared.u32 [r871+96], r309; +st.shared.u32 [r871+100], r318; +barrier.sync 0; +mad.lo.s32 r872, r865, -24, r871; +ld.shared.u32 r340, [r872]; +ld.shared.u32 r343, [r872+4]; +ld.shared.u32 r352, [r872+4096]; +ld.shared.u32 r355, [r872+4100]; +ld.shared.u32 r341, [r872+8192]; +ld.shared.u32 r344, [r872+8196]; +ld.shared.u32 r353, [r872+12288]; +ld.shared.u32 r356, [r872+12292]; +{ 
+add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r360; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r363; +} +{ +add.f16x2 r380, r348, r357; +} +{ +sub.f16x2 r383, r345, r363; +} +{ +sub.f16x2 r386, r348, r357; +} +and.b32 r873, r9, 496; +bfe.u32 r874, r9, 4, 5; +cvt.rn.f32.u32 f61, r874; +mul.f32 f62, f61, 0f3D490FDB; +cos.approx.f32 f27, f62; +sin.approx.f32 f63, f62; +neg.f32 f28, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +fma.rn.f16x2 r399, r377, r392, r396; +} +{ +mul.f16x2 r403, r377, r394; +} +{ +neg.f16x2 r406, r403; +} +{ +fma.rn.f16x2 r408, r380, r392, r406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +fma.rn.f16x2 r436, r371, r429, r433; +} +{ +mul.f16x2 r440, r371, r431; +} +{ +neg.f16x2 
r443, r440; +} +{ +fma.rn.f16x2 r445, r374, r429, r443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 r470, r386, r468; +} +{ +fma.rn.f16x2 r473, r383, r466, r470; +} +{ +mul.f16x2 r477, r383, r468; +} +{ +neg.f16x2 r480, r477; +} +{ +fma.rn.f16x2 r482, r386, r466, r480; +} +and.b32 r875, r867, 120; +add.s32 r876, r861, r875; +barrier.sync 0; +and.b32 r877, r859, 15872; +add.s32 r878, r876, r877; +st.shared.u32 [r878], r365; +st.shared.u32 [r878+4], r368; +st.shared.u32 [r878+128], r399; +st.shared.u32 [r878+132], r408; +st.shared.u32 [r878+256], r436; +st.shared.u32 [r878+260], r445; +st.shared.u32 [r878+384], r473; +st.shared.u32 [r878+388], r482; +barrier.sync 0; +mad.lo.s32 r879, r873, -24, r878; +ld.shared.u32 r504, [r879]; +ld.shared.u32 r507, [r879+4]; +ld.shared.u32 r516, [r879+4096]; +ld.shared.u32 r519, [r879+4100]; +ld.shared.u32 r505, [r879+8192]; +ld.shared.u32 r508, [r879+8196]; +ld.shared.u32 r517, [r879+12288]; +ld.shared.u32 r520, [r879+12292]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r524; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, 
r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r527; +} +{ +add.f16x2 r544, r512, r521; +} +{ +sub.f16x2 r547, r509, r527; +} +{ +sub.f16x2 r550, r512, r521; +} +and.b32 r880, r9, 448; +bfe.u32 r881, r9, 6, 3; +cvt.rn.f32.u32 f64, r881; +mul.f32 f65, f64, 0f3E490FDB; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +fma.rn.f16x2 r563, r541, r556, r560; +} +{ +mul.f16x2 r567, r541, r558; +} +{ +neg.f16x2 r570, r567; +} +{ +fma.rn.f16x2 r572, r544, r556, r570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +fma.rn.f16x2 r600, r535, r593, r597; +} +{ +mul.f16x2 r604, r535, r595; +} +{ +neg.f16x2 r607, r604; +} +{ +fma.rn.f16x2 r609, r538, r593, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +fma.rn.f16x2 r637, r547, r630, r634; +} +{ +mul.f16x2 r641, r547, r632; +} +{ +neg.f16x2 r644, r641; +} +{ +fma.rn.f16x2 r646, r550, r630, r644; +} +and.b32 r882, r867, 504; +add.s32 r883, r861, r882; +barrier.sync 0; +and.b32 r884, r859, 14336; +add.s32 r885, r883, r884; +st.shared.u32 [r885], r529; +st.shared.u32 [r885+4], r532; +st.shared.u32 [r885+512], r563; +st.shared.u32 [r885+516], r572; +st.shared.u32 [r885+1024], r600; +st.shared.u32 [r885+1028], r609; +st.shared.u32 [r885+1536], r637; +st.shared.u32 [r885+1540], r646; +barrier.sync 0; +mad.lo.s32 r886, r880, -24, r885; +ld.shared.u32 r668, [r886]; +ld.shared.u32 r671, [r886+4]; +ld.shared.u32 r680, [r886+4096]; +ld.shared.u32 r683, [r886+4100]; +ld.shared.u32 r669, [r886+8192]; +ld.shared.u32 r672, [r886+8196]; +ld.shared.u32 r681, [r886+12288]; +ld.shared.u32 r684, [r886+12292]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +and.b32 r887, r9, 256; +bfe.u32 r888, r9, 8, 1; +cvt.rn.f32.u32 f67, r888; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f47, f68; +sin.approx.f32 f69, f68; +neg.f32 f48, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f47; +cvt.rn.f16.f32 high, f48; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r722, {high, high}; +} +{ +mul.f16x2 r724, r708, r722; +} +{ +fma.rn.f16x2 r727, r705, r720, r724; +} +{ +mul.f16x2 r731, r705, r722; +} +{ +neg.f16x2 r734, r731; +} +{ +fma.rn.f16x2 r736, r708, r720, r734; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r742, r744; +} +{ +mul.f16x2 r748, r717, r740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r751, {high, low}; +} +{ +fma.rn.f16x2 r753, r745, r751, r748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r759, {high, high}; +} +{ +mul.f16x2 r761, r702, r759; +} +{ +fma.rn.f16x2 r764, r699, r757, r761; +} +{ +mul.f16x2 r768, r699, r759; +} +{ +neg.f16x2 r771, r768; +} +{ +fma.rn.f16x2 r773, r702, r757, r771; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r781, {low, high}; +} +{ +mul.f16x2 r782, r779, r781; +} +{ +mul.f16x2 r785, r753, r777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r788, {high, low}; +} +{ +fma.rn.f16x2 r790, r782, r788, r785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r796, {high, high}; +} +{ +mul.f16x2 r798, r714, r796; +} +{ +fma.rn.f16x2 r801, r711, r794, r798; +} +{ +mul.f16x2 r805, 
r711, r796; +} +{ +neg.f16x2 r808, r805; +} +{ +fma.rn.f16x2 r810, r714, r794, r808; +} +and.b32 r889, r867, 2040; +add.s32 r890, r861, r889; +barrier.sync 0; +and.b32 r891, r859, 8192; +add.s32 r892, r890, r891; +st.shared.u32 [r892], r693; +st.shared.u32 [r892+4], r696; +st.shared.u32 [r892+2048], r727; +st.shared.u32 [r892+2052], r736; +st.shared.u32 [r892+4096], r764; +st.shared.u32 [r892+4100], r773; +st.shared.u32 [r892+6144], r801; +st.shared.u32 [r892+6148], r810; +barrier.sync 0; +mad.lo.s32 r893, r887, -24, r892; +ld.shared.u32 r832, [r893]; +ld.shared.u32 r835, [r893+4]; +ld.shared.u32 r844, [r893+4096]; +ld.shared.u32 r847, [r893+4100]; +ld.shared.u32 r833, [r893+8192]; +ld.shared.u32 r836, [r893+8196]; +ld.shared.u32 r845, [r893+12288]; +ld.shared.u32 r848, [r893+12292]; +{ +add.f16x2 %0, r832, r833; +} +{ +add.f16x2 %1, r835, r836; +} +{ +sub.f16x2 %4, r832, r833; +} +{ +sub.f16x2 %5, r835, r836; +} +{ +add.f16x2 %2, r844, r845; +} +{ +add.f16x2 %3, r847, r848; +} +{ +sub.f16x2 %6, r844, r845; +} +{ +sub.f16x2 %7, r847, r848; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1046, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .b16 rs<3>; +.reg .f32 f<71>; +.reg .b32 r<894>; +.reg .b64 rd<2>; +{ +add.f16x2 r11, %9, %13; +} +{ +add.f16x2 r14, %10, %14; +} +{ +sub.f16x2 r17, %9, %13; +} +{ +sub.f16x2 r20, %10, %14; +} +{ +add.f16x2 r23, 
%11, %15; +} +{ +add.f16x2 r26, %12, %16; +} +{ +sub.f16x2 r29, %11, %15; +} +{ +sub.f16x2 r32, %12, %16; +} +{ +neg.f16x2 r35, r32; +} +{ +add.f16x2 r37, r11, r23; +} +{ +add.f16x2 r40, r14, r26; +} +{ +sub.f16x2 r43, r11, r23; +} +{ +sub.f16x2 r46, r14, r26; +} +{ +add.f16x2 r49, r17, r35; +} +{ +add.f16x2 r52, r20, r29; +} +{ +sub.f16x2 r55, r17, r35; +} +{ +sub.f16x2 r58, r20, r29; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 511; +cvt.rn.f32.u32 f5, r10; +mul.f32 f1, f5, 0f3B490FDB; +cvt.u16.u32 rs2, r9; +and.b16 rs1, rs2, 511; +mov.f32 f70, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB7_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB7_3; +mov.f32 f70, 0f3B490FC6; +bra.uni LBB7_4; +LBB7_3: +cos.approx.f32 f70, f1; +LBB7_4: +mov.u32 r855, %tid.y; +shl.b32 r856, r855, 13; +mov.u32 r857, %8; +add.s32 r858, r857, r856; +shl.b32 r859, r9, 4; +and.b32 r860, r859, -8192; +add.s32 r861, r858, r860; +sin.approx.f32 f57, f1; +neg.f32 f8, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f70; +cvt.rn.f16.f32 high, f8; +mov.b32 r61, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r64, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r66, {high, high}; +} +{ +mul.f16x2 r68, r52, r66; +} +{ +fma.rn.f16x2 r71, r49, r64, r68; +} +{ +mul.f16x2 r75, r49, r66; +} +{ +neg.f16x2 r78, r75; +} +{ +fma.rn.f16x2 r80, r52, r64, r78; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r84, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r86, {high, high}; +} +mov.f32 f53, 0fBF800000; +mov.f32 f54, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r88, {low, high}; +} +{ +mul.f16x2 r89, r86, r88; +} +{ +mul.f16x2 r92, r61, r84; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r95, {high, low}; +} +{ +fma.rn.f16x2 r97, r89, r95, r92; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r101, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r97; +mov.b32 r103, {high, high}; +} +{ +mul.f16x2 r105, r46, r103; +} +{ +fma.rn.f16x2 r108, r43, r101, r105; +} +{ +mul.f16x2 r112, r43, r103; +} +{ +neg.f16x2 r115, r112; +} +{ +fma.rn.f16x2 r117, r46, r101, r115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r61; +mov.b32 r123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r125, {low, high}; +} +{ +mul.f16x2 r126, r123, r125; +} +{ +mul.f16x2 r129, r97, r121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r97; +mov.b32 r132, {high, low}; +} +{ +fma.rn.f16x2 r134, r126, r132, r129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r134; +mov.b32 r140, {high, high}; +} +{ +mul.f16x2 r142, r58, r140; +} +{ +fma.rn.f16x2 r145, r55, r138, r142; +} +{ +mul.f16x2 r149, r55, r140; +} +{ +neg.f16x2 r152, r149; +} +{ +fma.rn.f16x2 r154, r58, r138, r152; +} +barrier.sync 0; +shl.b32 r862, r10, 4; +add.s32 r863, r861, r862; +st.shared.v4.f32 [r863], {r37, r71, r108, r145}; +barrier.sync 0; +mad.lo.s32 r864, r10, -12, r863; +ld.shared.u32 r176, [r864]; +ld.shared.u32 r188, [r864+2048]; +ld.shared.u32 r177, [r864+4096]; +ld.shared.u32 r189, [r864+6144]; +barrier.sync 0; +st.shared.v4.f32 [r863], {r40, r80, r117, r154}; +barrier.sync 0; +ld.shared.u32 r179, [r864]; +ld.shared.u32 r191, [r864+2048]; +ld.shared.u32 r180, [r864+4096]; +ld.shared.u32 r192, [r864+6144]; +{ +add.f16x2 r175, r176, r177; +} +{ +add.f16x2 r178, r179, r180; +} +{ +sub.f16x2 r181, r176, r177; +} +{ +sub.f16x2 r184, r179, r180; +} +{ +add.f16x2 r187, r188, r189; +} +{ +add.f16x2 r190, r191, r192; +} +{ +sub.f16x2 r193, r188, r189; +} +{ +sub.f16x2 r196, r191, r192; +} +{ +neg.f16x2 r199, r196; +} +{ +add.f16x2 r201, r175, r187; +} +{ +add.f16x2 r204, r178, r190; +} +{ +sub.f16x2 r207, r175, r187; 
+} +{ +sub.f16x2 r210, r178, r190; +} +{ +add.f16x2 r213, r181, r199; +} +{ +add.f16x2 r216, r184, r193; +} +{ +sub.f16x2 r219, r181, r199; +} +{ +sub.f16x2 r222, r184, r193; +} +and.b32 r865, r9, 508; +bfe.u32 r866, r9, 2, 7; +shl.b32 r867, r9, 2; +and.b32 r868, r867, 12; +add.s32 r869, r861, r868; +cvt.rn.f32.u32 f58, r866; +mul.f32 f59, f58, 0f3C490FDB; +cos.approx.f32 f17, f59; +sin.approx.f32 f60, f59; +neg.f32 f18, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r225, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r228, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r230, {high, high}; +} +{ +mul.f16x2 r232, r216, r230; +} +{ +fma.rn.f16x2 r235, r213, r228, r232; +} +{ +mul.f16x2 r239, r213, r230; +} +{ +neg.f16x2 r242, r239; +} +{ +fma.rn.f16x2 r244, r216, r228, r242; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r250, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r252, {low, high}; +} +{ +mul.f16x2 r253, r250, r252; +} +{ +mul.f16x2 r256, r225, r248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r259, {high, low}; +} +{ +fma.rn.f16x2 r261, r253, r259, r256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r265, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r267, {high, high}; +} +{ +mul.f16x2 r269, r210, r267; +} +{ +fma.rn.f16x2 r272, r207, r265, r269; +} +{ +mul.f16x2 r276, r207, r267; +} +{ +neg.f16x2 r279, r276; +} +{ +fma.rn.f16x2 r281, r210, r265, r279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r225; +mov.b32 r287, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r289, {low, high}; 
+} +{ +mul.f16x2 r290, r287, r289; +} +{ +mul.f16x2 r293, r261, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r261; +mov.b32 r296, {high, low}; +} +{ +fma.rn.f16x2 r298, r290, r296, r293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r302, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r298; +mov.b32 r304, {high, high}; +} +{ +mul.f16x2 r306, r222, r304; +} +{ +fma.rn.f16x2 r309, r219, r302, r306; +} +{ +mul.f16x2 r313, r219, r304; +} +{ +neg.f16x2 r316, r313; +} +{ +fma.rn.f16x2 r318, r222, r302, r316; +} +barrier.sync 0; +and.b32 r870, r859, 8128; +add.s32 r871, r869, r870; +st.shared.u32 [r871], r201; +st.shared.u32 [r871+16], r235; +st.shared.u32 [r871+32], r272; +st.shared.u32 [r871+48], r309; +barrier.sync 0; +mad.lo.s32 r872, r865, -12, r871; +ld.shared.u32 r340, [r872]; +ld.shared.u32 r352, [r872+2048]; +ld.shared.u32 r341, [r872+4096]; +ld.shared.u32 r353, [r872+6144]; +barrier.sync 0; +st.shared.u32 [r871], r204; +st.shared.u32 [r871+16], r244; +st.shared.u32 [r871+32], r281; +st.shared.u32 [r871+48], r318; +barrier.sync 0; +ld.shared.u32 r343, [r872]; +ld.shared.u32 r355, [r872+2048]; +ld.shared.u32 r344, [r872+4096]; +ld.shared.u32 r356, [r872+6144]; +{ +add.f16x2 r339, r340, r341; +} +{ +add.f16x2 r342, r343, r344; +} +{ +sub.f16x2 r345, r340, r341; +} +{ +sub.f16x2 r348, r343, r344; +} +{ +add.f16x2 r351, r352, r353; +} +{ +add.f16x2 r354, r355, r356; +} +{ +sub.f16x2 r357, r352, r353; +} +{ +sub.f16x2 r360, r355, r356; +} +{ +neg.f16x2 r363, r360; +} +{ +add.f16x2 r365, r339, r351; +} +{ +add.f16x2 r368, r342, r354; +} +{ +sub.f16x2 r371, r339, r351; +} +{ +sub.f16x2 r374, r342, r354; +} +{ +add.f16x2 r377, r345, r363; +} +{ +add.f16x2 r380, r348, r357; +} +{ +sub.f16x2 r383, r345, r363; +} +{ +sub.f16x2 r386, r348, r357; +} +and.b32 r873, r9, 496; +bfe.u32 r874, r9, 4, 5; +and.b32 r875, r867, 60; +add.s32 r876, r861, r875; +cvt.rn.f32.u32 f61, r874; +mul.f32 f62, f61, 0f3D490FDB; +cos.approx.f32 f27, f62; 
+sin.approx.f32 f63, f62; +neg.f32 f28, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r389, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r394, {high, high}; +} +{ +mul.f16x2 r396, r380, r394; +} +{ +fma.rn.f16x2 r399, r377, r392, r396; +} +{ +mul.f16x2 r403, r377, r394; +} +{ +neg.f16x2 r406, r403; +} +{ +fma.rn.f16x2 r408, r380, r392, r406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r416, {low, high}; +} +{ +mul.f16x2 r417, r414, r416; +} +{ +mul.f16x2 r420, r389, r412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r423, {high, low}; +} +{ +fma.rn.f16x2 r425, r417, r423, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r431, {high, high}; +} +{ +mul.f16x2 r433, r374, r431; +} +{ +fma.rn.f16x2 r436, r371, r429, r433; +} +{ +mul.f16x2 r440, r371, r431; +} +{ +neg.f16x2 r443, r440; +} +{ +fma.rn.f16x2 r445, r374, r429, r443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r389; +mov.b32 r451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r453, {low, high}; +} +{ +mul.f16x2 r454, r451, r453; +} +{ +mul.f16x2 r457, r425, r449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r425; +mov.b32 r460, {high, low}; +} +{ +fma.rn.f16x2 r462, r454, r460, r457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r462; +mov.b32 r468, {high, high}; +} +{ +mul.f16x2 
r470, r386, r468; +} +{ +fma.rn.f16x2 r473, r383, r466, r470; +} +{ +mul.f16x2 r477, r383, r468; +} +{ +neg.f16x2 r480, r477; +} +{ +fma.rn.f16x2 r482, r386, r466, r480; +} +barrier.sync 0; +and.b32 r877, r859, 7936; +add.s32 r878, r876, r877; +st.shared.u32 [r878], r365; +st.shared.u32 [r878+64], r399; +st.shared.u32 [r878+128], r436; +st.shared.u32 [r878+192], r473; +barrier.sync 0; +mad.lo.s32 r879, r873, -12, r878; +ld.shared.u32 r504, [r879]; +ld.shared.u32 r516, [r879+2048]; +ld.shared.u32 r505, [r879+4096]; +ld.shared.u32 r517, [r879+6144]; +barrier.sync 0; +st.shared.u32 [r878], r368; +st.shared.u32 [r878+64], r408; +st.shared.u32 [r878+128], r445; +st.shared.u32 [r878+192], r482; +barrier.sync 0; +ld.shared.u32 r507, [r879]; +ld.shared.u32 r519, [r879+2048]; +ld.shared.u32 r508, [r879+4096]; +ld.shared.u32 r520, [r879+6144]; +{ +add.f16x2 r503, r504, r505; +} +{ +add.f16x2 r506, r507, r508; +} +{ +sub.f16x2 r509, r504, r505; +} +{ +sub.f16x2 r512, r507, r508; +} +{ +add.f16x2 r515, r516, r517; +} +{ +add.f16x2 r518, r519, r520; +} +{ +sub.f16x2 r521, r516, r517; +} +{ +sub.f16x2 r524, r519, r520; +} +{ +neg.f16x2 r527, r524; +} +{ +add.f16x2 r529, r503, r515; +} +{ +add.f16x2 r532, r506, r518; +} +{ +sub.f16x2 r535, r503, r515; +} +{ +sub.f16x2 r538, r506, r518; +} +{ +add.f16x2 r541, r509, r527; +} +{ +add.f16x2 r544, r512, r521; +} +{ +sub.f16x2 r547, r509, r527; +} +{ +sub.f16x2 r550, r512, r521; +} +and.b32 r880, r9, 448; +bfe.u32 r881, r9, 6, 3; +and.b32 r882, r867, 252; +add.s32 r883, r861, r882; +cvt.rn.f32.u32 f64, r881; +mul.f32 f65, f64, 0f3E490FDB; +cos.approx.f32 f37, f65; +sin.approx.f32 f66, f65; +neg.f32 f38, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r553, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r558, {high, high}; +} +{ +mul.f16x2 r560, r544, r558; +} +{ +fma.rn.f16x2 
r563, r541, r556, r560; +} +{ +mul.f16x2 r567, r541, r558; +} +{ +neg.f16x2 r570, r567; +} +{ +fma.rn.f16x2 r572, r544, r556, r570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r580, {low, high}; +} +{ +mul.f16x2 r581, r578, r580; +} +{ +mul.f16x2 r584, r553, r576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r587, {high, low}; +} +{ +fma.rn.f16x2 r589, r581, r587, r584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r595, {high, high}; +} +{ +mul.f16x2 r597, r538, r595; +} +{ +fma.rn.f16x2 r600, r535, r593, r597; +} +{ +mul.f16x2 r604, r535, r595; +} +{ +neg.f16x2 r607, r604; +} +{ +fma.rn.f16x2 r609, r538, r593, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r553; +mov.b32 r615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r617, {low, high}; +} +{ +mul.f16x2 r618, r615, r617; +} +{ +mul.f16x2 r621, r589, r613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r589; +mov.b32 r624, {high, low}; +} +{ +fma.rn.f16x2 r626, r618, r624, r621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r626; +mov.b32 r632, {high, high}; +} +{ +mul.f16x2 r634, r550, r632; +} +{ +fma.rn.f16x2 r637, r547, r630, r634; +} +{ +mul.f16x2 r641, r547, r632; +} +{ +neg.f16x2 r644, r641; +} +{ +fma.rn.f16x2 r646, r550, r630, r644; +} +barrier.sync 0; +and.b32 r884, r859, 7168; +add.s32 r885, r883, r884; +st.shared.u32 [r885], r529; +st.shared.u32 [r885+256], r563; +st.shared.u32 [r885+512], r600; +st.shared.u32 [r885+768], r637; 
+barrier.sync 0; +mad.lo.s32 r886, r880, -12, r885; +ld.shared.u32 r668, [r886]; +ld.shared.u32 r680, [r886+2048]; +ld.shared.u32 r669, [r886+4096]; +ld.shared.u32 r681, [r886+6144]; +barrier.sync 0; +st.shared.u32 [r885], r532; +st.shared.u32 [r885+256], r572; +st.shared.u32 [r885+512], r609; +st.shared.u32 [r885+768], r646; +barrier.sync 0; +ld.shared.u32 r671, [r886]; +ld.shared.u32 r683, [r886+2048]; +ld.shared.u32 r672, [r886+4096]; +ld.shared.u32 r684, [r886+6144]; +{ +add.f16x2 r667, r668, r669; +} +{ +add.f16x2 r670, r671, r672; +} +{ +sub.f16x2 r673, r668, r669; +} +{ +sub.f16x2 r676, r671, r672; +} +{ +add.f16x2 r679, r680, r681; +} +{ +add.f16x2 r682, r683, r684; +} +{ +sub.f16x2 r685, r680, r681; +} +{ +sub.f16x2 r688, r683, r684; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +and.b32 r887, r9, 256; +bfe.u32 r888, r9, 8, 1; +and.b32 r889, r867, 1020; +add.s32 r890, r861, r889; +cvt.rn.f32.u32 f67, r888; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f47, f68; +sin.approx.f32 f69, f68; +neg.f32 f48, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f47; +cvt.rn.f16.f32 high, f48; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r722, {high, high}; +} +{ +mul.f16x2 r724, r708, r722; +} +{ +fma.rn.f16x2 r727, r705, r720, r724; +} +{ +mul.f16x2 r731, r705, r722; +} +{ +neg.f16x2 r734, r731; +} +{ +fma.rn.f16x2 r736, r708, r720, r734; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 
high, f54; +mov.b32 r744, {low, high}; +} +{ +mul.f16x2 r745, r742, r744; +} +{ +mul.f16x2 r748, r717, r740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r751, {high, low}; +} +{ +fma.rn.f16x2 r753, r745, r751, r748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r759, {high, high}; +} +{ +mul.f16x2 r761, r702, r759; +} +{ +fma.rn.f16x2 r764, r699, r757, r761; +} +{ +mul.f16x2 r768, r699, r759; +} +{ +neg.f16x2 r771, r768; +} +{ +fma.rn.f16x2 r773, r702, r757, r771; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r717; +mov.b32 r779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r781, {low, high}; +} +{ +mul.f16x2 r782, r779, r781; +} +{ +mul.f16x2 r785, r753, r777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r753; +mov.b32 r788, {high, low}; +} +{ +fma.rn.f16x2 r790, r782, r788, r785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r790; +mov.b32 r796, {high, high}; +} +{ +mul.f16x2 r798, r714, r796; +} +{ +fma.rn.f16x2 r801, r711, r794, r798; +} +{ +mul.f16x2 r805, r711, r796; +} +{ +neg.f16x2 r808, r805; +} +{ +fma.rn.f16x2 r810, r714, r794, r808; +} +barrier.sync 0; +and.b32 r891, r859, 4096; +add.s32 r892, r890, r891; +st.shared.u32 [r892], r693; +st.shared.u32 [r892+1024], r727; +st.shared.u32 [r892+2048], r764; +st.shared.u32 [r892+3072], r801; +barrier.sync 0; +mad.lo.s32 r893, r887, -12, r892; +ld.shared.u32 r832, [r893]; +ld.shared.u32 r844, [r893+2048]; +ld.shared.u32 r833, [r893+4096]; +ld.shared.u32 r845, [r893+6144]; +barrier.sync 0; +st.shared.u32 [r892], r696; +st.shared.u32 [r892+1024], r736; +st.shared.u32 [r892+2048], r773; +st.shared.u32 [r892+3072], r810; +barrier.sync 0; +ld.shared.u32 r835, 
[r893]; +ld.shared.u32 r847, [r893+2048]; +ld.shared.u32 r836, [r893+4096]; +ld.shared.u32 r848, [r893+6144]; +{ +add.f16x2 %0, r832, r833; +} +{ +add.f16x2 %1, r835, r836; +} +{ +sub.f16x2 %4, r832, r833; +} +{ +sub.f16x2 %5, r835, r836; +} +{ +add.f16x2 %2, r844, r845; +} +{ +add.f16x2 %3, r847, r848; +} +{ +sub.f16x2 %6, r844, r845; +} +{ +sub.f16x2 %7, r847, r848; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1047, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<5>; +.reg .b16 rs<3>; +.reg .f32 f<103>; +.reg .b32 r<631>; +.reg .b64 rd<2>; +{ +add.f16x2 r9, %5, %7; +} +{ +add.f16x2 r12, %6, %8; +} +{ +sub.f16x2 r15, %5, %7; +} +{ +sub.f16x2 r18, %6, %8; +} +mov.u32 r21, %tid.x; +and.b32 r22, r21, 1023; +cvt.rn.f32.u32 f10, r22; +mul.f32 f1, f10, 0f3B490FDB; +cvt.u16.u32 rs2, r21; +and.b16 rs1, rs2, 1023; +mov.f32 f100, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB8_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB8_3; +mov.f32 f100, 0f3B490FC6; +bra.uni LBB8_4; +LBB8_3: +cos.approx.f32 f100, f1; +LBB8_4: +setp.eq.s32 p3, r22, 1023; +mov.f32 f101, 0f3B490FC6; +@p3 bra LBB8_6; +sin.approx.f32 f101, f1; +LBB8_6: +mov.u32 r77, %tid.y; +shl.b32 r78, r77, 14; +mov.u32 r79, %4; +add.s32 r80, r79, r78; +shl.b32 r82, r21, 4; +and.b32 r83, r82, -16384; +add.s32 r84, r80, r83; +neg.f32 f14, f101; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 
high, f14; +mov.b32 r25, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r28, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r30, {high, high}; +} +{ +mul.f16x2 r32, r18, r30; +} +{ +fma.rn.f16x2 r35, r15, r28, r32; +} +{ +mul.f16x2 r39, r15, r30; +} +{ +neg.f16x2 r42, r39; +} +{ +fma.rn.f16x2 r44, r18, r28, r42; +} +barrier.sync 0; +and.b32 r85, r82, 16368; +add.s32 r86, r84, r85; +st.shared.v2.f32 [r86], {r9, r12}; +st.shared.v2.f32 [r86+8], {r35, r44}; +barrier.sync 0; +shl.b32 r87, r21, 3; +and.b32 r88, r87, 8184; +sub.s32 r89, r86, r88; +ld.shared.u32 r66, [r89]; +ld.shared.u32 r69, [r89+4]; +ld.shared.u32 r67, [r89+8192]; +ld.shared.u32 r70, [r89+8196]; +{ +add.f16x2 r65, r66, r67; +} +{ +add.f16x2 r68, r69, r70; +} +{ +sub.f16x2 r71, r66, r67; +} +{ +sub.f16x2 r74, r69, r70; +} +and.b32 r90, r21, 1022; +bfe.u32 r91, r21, 1, 9; +cvt.rn.f32.u32 f20, r91; +mul.f32 f6, f20, 0f3BC90FDB; +setp.eq.s32 p4, r90, 510; +mov.f32 f102, 0f3BC90F88; +@p4 bra LBB8_8; +cos.approx.f32 f102, f6; +LBB8_8: +sin.approx.f32 f75, f6; +neg.f32 f22, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f22; +mov.b32 r92, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r97, {high, high}; +} +{ +mul.f16x2 r99, r74, r97; +} +{ +fma.rn.f16x2 r102, r71, r95, r99; +} +{ +mul.f16x2 r106, r71, r97; +} +{ +neg.f16x2 r109, r106; +} +{ +fma.rn.f16x2 r111, r74, r95, r109; +} +barrier.sync 0; +and.b32 r569, r87, 8; +add.s32 r570, r84, r569; +and.b32 r571, r82, 16352; +add.s32 r572, r570, r571; +st.shared.u32 [r572], r65; +st.shared.u32 [r572+4], r68; +st.shared.u32 [r572+16], r102; +st.shared.u32 [r572+20], r111; +barrier.sync 0; +and.b32 r573, r87, 8176; +sub.s32 r574, r572, r573; +ld.shared.u32 r133, [r574]; +ld.shared.u32 r136, [r574+4]; +ld.shared.u32 r134, [r574+8192]; +ld.shared.u32 r137, [r574+8196]; 
+{ +add.f16x2 r132, r133, r134; +} +{ +add.f16x2 r135, r136, r137; +} +{ +sub.f16x2 r138, r133, r134; +} +{ +sub.f16x2 r141, r136, r137; +} +bfe.u32 r575, r21, 2, 8; +cvt.rn.f32.u32 f76, r575; +mul.f32 f77, f76, 0f3C490FDB; +cos.approx.f32 f27, f77; +sin.approx.f32 f78, f77; +neg.f32 f28, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r144, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r149, {high, high}; +} +{ +mul.f16x2 r151, r141, r149; +} +{ +fma.rn.f16x2 r154, r138, r147, r151; +} +{ +mul.f16x2 r158, r138, r149; +} +{ +neg.f16x2 r161, r158; +} +{ +fma.rn.f16x2 r163, r141, r147, r161; +} +and.b32 r576, r87, 24; +add.s32 r577, r84, r576; +barrier.sync 0; +and.b32 r578, r82, 16320; +add.s32 r579, r577, r578; +st.shared.u32 [r579], r132; +st.shared.u32 [r579+4], r135; +st.shared.u32 [r579+32], r154; +st.shared.u32 [r579+36], r163; +barrier.sync 0; +and.b32 r580, r87, 8160; +sub.s32 r581, r579, r580; +ld.shared.u32 r185, [r581]; +ld.shared.u32 r188, [r581+4]; +ld.shared.u32 r186, [r581+8192]; +ld.shared.u32 r189, [r581+8196]; +{ +add.f16x2 r184, r185, r186; +} +{ +add.f16x2 r187, r188, r189; +} +{ +sub.f16x2 r190, r185, r186; +} +{ +sub.f16x2 r193, r188, r189; +} +bfe.u32 r582, r21, 3, 7; +cvt.rn.f32.u32 f79, r582; +mul.f32 f80, f79, 0f3CC90FDB; +cos.approx.f32 f33, f80; +sin.approx.f32 f81, f80; +neg.f32 f34, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r196, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r201, {high, high}; +} +{ +mul.f16x2 r203, r193, r201; +} +{ +fma.rn.f16x2 r206, r190, r199, r203; +} +{ +mul.f16x2 r210, r190, r201; +} +{ +neg.f16x2 r213, r210; +} +{ +fma.rn.f16x2 r215, r193, r199, r213; +} +and.b32 r583, r87, 56; +add.s32 r584, 
r84, r583; +barrier.sync 0; +and.b32 r585, r82, 16256; +add.s32 r586, r584, r585; +st.shared.u32 [r586], r184; +st.shared.u32 [r586+4], r187; +st.shared.u32 [r586+64], r206; +st.shared.u32 [r586+68], r215; +barrier.sync 0; +and.b32 r587, r87, 8128; +sub.s32 r588, r586, r587; +ld.shared.u32 r237, [r588]; +ld.shared.u32 r240, [r588+4]; +ld.shared.u32 r238, [r588+8192]; +ld.shared.u32 r241, [r588+8196]; +{ +add.f16x2 r236, r237, r238; +} +{ +add.f16x2 r239, r240, r241; +} +{ +sub.f16x2 r242, r237, r238; +} +{ +sub.f16x2 r245, r240, r241; +} +bfe.u32 r589, r21, 4, 6; +cvt.rn.f32.u32 f82, r589; +mul.f32 f83, f82, 0f3D490FDB; +cos.approx.f32 f39, f83; +sin.approx.f32 f84, f83; +neg.f32 f40, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r248, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r253, {high, high}; +} +{ +mul.f16x2 r255, r245, r253; +} +{ +fma.rn.f16x2 r258, r242, r251, r255; +} +{ +mul.f16x2 r262, r242, r253; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r245, r251, r265; +} +and.b32 r590, r87, 120; +add.s32 r591, r84, r590; +barrier.sync 0; +and.b32 r592, r82, 16128; +add.s32 r593, r591, r592; +st.shared.u32 [r593], r236; +st.shared.u32 [r593+4], r239; +st.shared.u32 [r593+128], r258; +st.shared.u32 [r593+132], r267; +barrier.sync 0; +and.b32 r594, r87, 8064; +sub.s32 r595, r593, r594; +ld.shared.u32 r289, [r595]; +ld.shared.u32 r292, [r595+4]; +ld.shared.u32 r290, [r595+8192]; +ld.shared.u32 r293, [r595+8196]; +{ +add.f16x2 r288, r289, r290; +} +{ +add.f16x2 r291, r292, r293; +} +{ +sub.f16x2 r294, r289, r290; +} +{ +sub.f16x2 r297, r292, r293; +} +bfe.u32 r596, r21, 5, 5; +cvt.rn.f32.u32 f85, r596; +mul.f32 f86, f85, 0f3DC90FDB; +cos.approx.f32 f45, f86; +sin.approx.f32 f87, f86; +neg.f32 f46, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r300, 
{low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r305, {high, high}; +} +{ +mul.f16x2 r307, r297, r305; +} +{ +fma.rn.f16x2 r310, r294, r303, r307; +} +{ +mul.f16x2 r314, r294, r305; +} +{ +neg.f16x2 r317, r314; +} +{ +fma.rn.f16x2 r319, r297, r303, r317; +} +and.b32 r597, r87, 248; +add.s32 r598, r84, r597; +barrier.sync 0; +and.b32 r599, r82, 15872; +add.s32 r600, r598, r599; +st.shared.u32 [r600], r288; +st.shared.u32 [r600+4], r291; +st.shared.u32 [r600+256], r310; +st.shared.u32 [r600+260], r319; +barrier.sync 0; +and.b32 r601, r87, 7936; +sub.s32 r602, r600, r601; +ld.shared.u32 r341, [r602]; +ld.shared.u32 r344, [r602+4]; +ld.shared.u32 r342, [r602+8192]; +ld.shared.u32 r345, [r602+8196]; +{ +add.f16x2 r340, r341, r342; +} +{ +add.f16x2 r343, r344, r345; +} +{ +sub.f16x2 r346, r341, r342; +} +{ +sub.f16x2 r349, r344, r345; +} +bfe.u32 r603, r21, 6, 4; +cvt.rn.f32.u32 f88, r603; +mul.f32 f89, f88, 0f3E490FDB; +cos.approx.f32 f51, f89; +sin.approx.f32 f90, f89; +neg.f32 f52, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f51; +cvt.rn.f16.f32 high, f52; +mov.b32 r352, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r357, {high, high}; +} +{ +mul.f16x2 r359, r349, r357; +} +{ +fma.rn.f16x2 r362, r346, r355, r359; +} +{ +mul.f16x2 r366, r346, r357; +} +{ +neg.f16x2 r369, r366; +} +{ +fma.rn.f16x2 r371, r349, r355, r369; +} +and.b32 r604, r87, 504; +add.s32 r605, r84, r604; +barrier.sync 0; +and.b32 r606, r82, 15360; +add.s32 r607, r605, r606; +st.shared.u32 [r607], r340; +st.shared.u32 [r607+4], r343; +st.shared.u32 [r607+512], r362; +st.shared.u32 [r607+516], r371; +barrier.sync 0; +and.b32 r608, r87, 7680; +sub.s32 r609, r607, r608; +ld.shared.u32 r393, [r609]; +ld.shared.u32 r396, [r609+4]; +ld.shared.u32 r394, [r609+8192]; 
+ld.shared.u32 r397, [r609+8196]; +{ +add.f16x2 r392, r393, r394; +} +{ +add.f16x2 r395, r396, r397; +} +{ +sub.f16x2 r398, r393, r394; +} +{ +sub.f16x2 r401, r396, r397; +} +bfe.u32 r610, r21, 7, 3; +cvt.rn.f32.u32 f91, r610; +mul.f32 f92, f91, 0f3EC90FDB; +cos.approx.f32 f57, f92; +sin.approx.f32 f93, f92; +neg.f32 f58, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r409, {high, high}; +} +{ +mul.f16x2 r411, r401, r409; +} +{ +fma.rn.f16x2 r414, r398, r407, r411; +} +{ +mul.f16x2 r418, r398, r409; +} +{ +neg.f16x2 r421, r418; +} +{ +fma.rn.f16x2 r423, r401, r407, r421; +} +and.b32 r611, r87, 1016; +add.s32 r612, r84, r611; +barrier.sync 0; +and.b32 r613, r82, 14336; +add.s32 r614, r612, r613; +st.shared.u32 [r614], r392; +st.shared.u32 [r614+4], r395; +st.shared.u32 [r614+1024], r414; +st.shared.u32 [r614+1028], r423; +barrier.sync 0; +and.b32 r615, r87, 7168; +sub.s32 r616, r614, r615; +ld.shared.u32 r445, [r616]; +ld.shared.u32 r448, [r616+4]; +ld.shared.u32 r446, [r616+8192]; +ld.shared.u32 r449, [r616+8196]; +{ +add.f16x2 r444, r445, r446; +} +{ +add.f16x2 r447, r448, r449; +} +{ +sub.f16x2 r450, r445, r446; +} +{ +sub.f16x2 r453, r448, r449; +} +bfe.u32 r617, r21, 8, 2; +cvt.rn.f32.u32 f94, r617; +mul.f32 f95, f94, 0f3F490FDB; +cos.approx.f32 f63, f95; +sin.approx.f32 f96, f95; +neg.f32 f64, f96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f63; +cvt.rn.f16.f32 high, f64; +mov.b32 r456, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r453, r461; +} +{ +fma.rn.f16x2 r466, r450, r459, r463; +} +{ +mul.f16x2 r470, r450, r461; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r453, r459, r473; 
+} +and.b32 r618, r87, 2040; +add.s32 r619, r84, r618; +barrier.sync 0; +and.b32 r620, r82, 12288; +add.s32 r621, r619, r620; +st.shared.u32 [r621], r444; +st.shared.u32 [r621+4], r447; +st.shared.u32 [r621+2048], r466; +st.shared.u32 [r621+2052], r475; +barrier.sync 0; +and.b32 r622, r87, 6144; +sub.s32 r623, r621, r622; +ld.shared.u32 r497, [r623]; +ld.shared.u32 r500, [r623+4]; +ld.shared.u32 r498, [r623+8192]; +ld.shared.u32 r501, [r623+8196]; +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r501; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +sub.f16x2 r505, r500, r501; +} +bfe.u32 r624, r21, 9, 1; +cvt.rn.f32.u32 f97, r624; +mul.f32 f98, f97, 0f3FC90FDB; +cos.approx.f32 f69, f98; +sin.approx.f32 f99, f98; +neg.f32 f70, f99; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r511, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r513, {high, high}; +} +{ +mul.f16x2 r515, r505, r513; +} +{ +fma.rn.f16x2 r518, r502, r511, r515; +} +{ +mul.f16x2 r522, r502, r513; +} +{ +neg.f16x2 r525, r522; +} +{ +fma.rn.f16x2 r527, r505, r511, r525; +} +and.b32 r625, r87, 4088; +add.s32 r626, r84, r625; +barrier.sync 0; +and.b32 r627, r82, 8192; +add.s32 r628, r626, r627; +st.shared.u32 [r628], r496; +st.shared.u32 [r628+4], r499; +st.shared.u32 [r628+4096], r518; +st.shared.u32 [r628+4100], r527; +barrier.sync 0; +and.b32 r629, r87, 4096; +sub.s32 r630, r628, r629; +ld.shared.u32 r549, [r630]; +ld.shared.u32 r552, [r630+4]; +ld.shared.u32 r550, [r630+8192]; +ld.shared.u32 r553, [r630+8196]; +{ +add.f16x2 %0, r549, r550; +} +{ +add.f16x2 %1, r552, r553; +} +{ +sub.f16x2 %2, r549, r550; +} +{ +sub.f16x2 %3, r552, r553; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1048, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<5>; +.reg .b16 rs<3>; +.reg .f32 f<103>; +.reg .b32 r<631>; +.reg .b64 rd<2>; +{ +add.f16x2 r9, %5, %7; +} +{ +add.f16x2 r12, %6, %8; +} +{ +sub.f16x2 r15, %5, %7; +} +{ +sub.f16x2 r18, %6, %8; +} +mov.u32 r21, %tid.x; +and.b32 r22, r21, 1023; +cvt.rn.f32.u32 f10, r22; +mul.f32 f1, f10, 0f3B490FDB; +cvt.u16.u32 rs2, r21; +and.b16 rs1, rs2, 1023; +mov.f32 f100, 0f3BC90F88; +setp.eq.s16 p1, rs1, 510; +@p1 bra LBB9_4; +setp.ne.s16 p2, rs1, 511; +@p2 bra LBB9_3; +mov.f32 f100, 0f3B490FC6; +bra.uni LBB9_4; +LBB9_3: +cos.approx.f32 f100, f1; +LBB9_4: +setp.eq.s32 p3, r22, 1023; +mov.f32 f101, 0f3B490FC6; +@p3 bra LBB9_6; +sin.approx.f32 f101, f1; +LBB9_6: +mov.u32 r77, %tid.y; +shl.b32 r78, r77, 13; +mov.u32 r79, %4; +add.s32 r80, r79, r78; +shl.b32 r82, r21, 3; +and.b32 r83, r82, -8192; +add.s32 r84, r80, r83; +neg.f32 f14, f101; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f14; +mov.b32 r25, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r28, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r25; +mov.b32 r30, {high, high}; +} +{ +mul.f16x2 r32, r18, r30; +} +{ +fma.rn.f16x2 r35, r15, r28, r32; +} +{ +mul.f16x2 r39, r15, r30; +} +{ +neg.f16x2 r42, r39; +} +{ +fma.rn.f16x2 r44, r18, r28, r42; +} +barrier.sync 0; +and.b32 r85, r82, 8184; +add.s32 r86, r84, r85; +st.shared.v2.f32 [r86], {r9, r35}; +barrier.sync 0; +shl.b32 r87, r21, 2; +and.b32 r88, r87, 4092; +sub.s32 r89, r86, r88; +ld.shared.u32 r66, [r89]; +ld.shared.u32 r67, [r89+4096]; +barrier.sync 0; +st.shared.v2.f32 [r86], {r12, r44}; +barrier.sync 0; +ld.shared.u32 r69, [r89]; +ld.shared.u32 r70, [r89+4096]; +{ +add.f16x2 r65, r66, r67; +} +{ +add.f16x2 r68, 
r69, r70; +} +{ +sub.f16x2 r71, r66, r67; +} +{ +sub.f16x2 r74, r69, r70; +} +and.b32 r90, r21, 1022; +bfe.u32 r91, r21, 1, 9; +cvt.rn.f32.u32 f20, r91; +mul.f32 f6, f20, 0f3BC90FDB; +setp.eq.s32 p4, r90, 510; +mov.f32 f102, 0f3BC90F88; +@p4 bra LBB9_8; +cos.approx.f32 f102, f6; +LBB9_8: +and.b32 r569, r87, 4; +add.s32 r570, r84, r569; +sin.approx.f32 f75, f6; +neg.f32 f22, f75; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f22; +mov.b32 r92, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r92; +mov.b32 r97, {high, high}; +} +{ +mul.f16x2 r99, r74, r97; +} +{ +fma.rn.f16x2 r102, r71, r95, r99; +} +{ +mul.f16x2 r106, r71, r97; +} +{ +neg.f16x2 r109, r106; +} +{ +fma.rn.f16x2 r111, r74, r95, r109; +} +barrier.sync 0; +and.b32 r571, r82, 8176; +add.s32 r572, r570, r571; +st.shared.u32 [r572], r65; +st.shared.u32 [r572+8], r102; +barrier.sync 0; +and.b32 r573, r87, 4088; +sub.s32 r574, r572, r573; +ld.shared.u32 r133, [r574]; +ld.shared.u32 r134, [r574+4096]; +barrier.sync 0; +st.shared.u32 [r572], r68; +st.shared.u32 [r572+8], r111; +barrier.sync 0; +ld.shared.u32 r136, [r574]; +ld.shared.u32 r137, [r574+4096]; +{ +add.f16x2 r132, r133, r134; +} +{ +add.f16x2 r135, r136, r137; +} +{ +sub.f16x2 r138, r133, r134; +} +{ +sub.f16x2 r141, r136, r137; +} +bfe.u32 r575, r21, 2, 8; +and.b32 r576, r87, 12; +add.s32 r577, r84, r576; +cvt.rn.f32.u32 f76, r575; +mul.f32 f77, f76, 0f3C490FDB; +cos.approx.f32 f27, f77; +sin.approx.f32 f78, f77; +neg.f32 f28, f78; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r144, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r149, {high, high}; +} +{ +mul.f16x2 r151, r141, r149; +} +{ +fma.rn.f16x2 r154, r138, r147, r151; +} +{ +mul.f16x2 r158, r138, r149; +} +{ +neg.f16x2 
r161, r158; +} +{ +fma.rn.f16x2 r163, r141, r147, r161; +} +barrier.sync 0; +and.b32 r578, r82, 8160; +add.s32 r579, r577, r578; +st.shared.u32 [r579], r132; +st.shared.u32 [r579+16], r154; +barrier.sync 0; +and.b32 r580, r87, 4080; +sub.s32 r581, r579, r580; +ld.shared.u32 r185, [r581]; +ld.shared.u32 r186, [r581+4096]; +barrier.sync 0; +st.shared.u32 [r579], r135; +st.shared.u32 [r579+16], r163; +barrier.sync 0; +ld.shared.u32 r188, [r581]; +ld.shared.u32 r189, [r581+4096]; +{ +add.f16x2 r184, r185, r186; +} +{ +add.f16x2 r187, r188, r189; +} +{ +sub.f16x2 r190, r185, r186; +} +{ +sub.f16x2 r193, r188, r189; +} +bfe.u32 r582, r21, 3, 7; +and.b32 r583, r87, 28; +add.s32 r584, r84, r583; +cvt.rn.f32.u32 f79, r582; +mul.f32 f80, f79, 0f3CC90FDB; +cos.approx.f32 f33, f80; +sin.approx.f32 f81, f80; +neg.f32 f34, f81; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r196, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r196; +mov.b32 r201, {high, high}; +} +{ +mul.f16x2 r203, r193, r201; +} +{ +fma.rn.f16x2 r206, r190, r199, r203; +} +{ +mul.f16x2 r210, r190, r201; +} +{ +neg.f16x2 r213, r210; +} +{ +fma.rn.f16x2 r215, r193, r199, r213; +} +barrier.sync 0; +and.b32 r585, r82, 8128; +add.s32 r586, r584, r585; +st.shared.u32 [r586], r184; +st.shared.u32 [r586+32], r206; +barrier.sync 0; +and.b32 r587, r87, 4064; +sub.s32 r588, r586, r587; +ld.shared.u32 r237, [r588]; +ld.shared.u32 r238, [r588+4096]; +barrier.sync 0; +st.shared.u32 [r586], r187; +st.shared.u32 [r586+32], r215; +barrier.sync 0; +ld.shared.u32 r240, [r588]; +ld.shared.u32 r241, [r588+4096]; +{ +add.f16x2 r236, r237, r238; +} +{ +add.f16x2 r239, r240, r241; +} +{ +sub.f16x2 r242, r237, r238; +} +{ +sub.f16x2 r245, r240, r241; +} +bfe.u32 r589, r21, 4, 6; +and.b32 r590, r87, 60; +add.s32 r591, r84, r590; +cvt.rn.f32.u32 f82, r589; +mul.f32 f83, f82, 0f3D490FDB; 
+cos.approx.f32 f39, f83; +sin.approx.f32 f84, f83; +neg.f32 f40, f84; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r248, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r248; +mov.b32 r253, {high, high}; +} +{ +mul.f16x2 r255, r245, r253; +} +{ +fma.rn.f16x2 r258, r242, r251, r255; +} +{ +mul.f16x2 r262, r242, r253; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r245, r251, r265; +} +barrier.sync 0; +and.b32 r592, r82, 8064; +add.s32 r593, r591, r592; +st.shared.u32 [r593], r236; +st.shared.u32 [r593+64], r258; +barrier.sync 0; +and.b32 r594, r87, 4032; +sub.s32 r595, r593, r594; +ld.shared.u32 r289, [r595]; +ld.shared.u32 r290, [r595+4096]; +barrier.sync 0; +st.shared.u32 [r593], r239; +st.shared.u32 [r593+64], r267; +barrier.sync 0; +ld.shared.u32 r292, [r595]; +ld.shared.u32 r293, [r595+4096]; +{ +add.f16x2 r288, r289, r290; +} +{ +add.f16x2 r291, r292, r293; +} +{ +sub.f16x2 r294, r289, r290; +} +{ +sub.f16x2 r297, r292, r293; +} +bfe.u32 r596, r21, 5, 5; +and.b32 r597, r87, 124; +add.s32 r598, r84, r597; +cvt.rn.f32.u32 f85, r596; +mul.f32 f86, f85, 0f3DC90FDB; +cos.approx.f32 f45, f86; +sin.approx.f32 f87, f86; +neg.f32 f46, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r300, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r300; +mov.b32 r305, {high, high}; +} +{ +mul.f16x2 r307, r297, r305; +} +{ +fma.rn.f16x2 r310, r294, r303, r307; +} +{ +mul.f16x2 r314, r294, r305; +} +{ +neg.f16x2 r317, r314; +} +{ +fma.rn.f16x2 r319, r297, r303, r317; +} +barrier.sync 0; +and.b32 r599, r82, 7936; +add.s32 r600, r598, r599; +st.shared.u32 [r600], r288; +st.shared.u32 [r600+128], r310; +barrier.sync 0; +and.b32 r601, r87, 3968; +sub.s32 r602, r600, r601; +ld.shared.u32 r341, [r602]; 
+ld.shared.u32 r342, [r602+4096]; +barrier.sync 0; +st.shared.u32 [r600], r291; +st.shared.u32 [r600+128], r319; +barrier.sync 0; +ld.shared.u32 r344, [r602]; +ld.shared.u32 r345, [r602+4096]; +{ +add.f16x2 r340, r341, r342; +} +{ +add.f16x2 r343, r344, r345; +} +{ +sub.f16x2 r346, r341, r342; +} +{ +sub.f16x2 r349, r344, r345; +} +bfe.u32 r603, r21, 6, 4; +and.b32 r604, r87, 252; +add.s32 r605, r84, r604; +cvt.rn.f32.u32 f88, r603; +mul.f32 f89, f88, 0f3E490FDB; +cos.approx.f32 f51, f89; +sin.approx.f32 f90, f89; +neg.f32 f52, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f51; +cvt.rn.f16.f32 high, f52; +mov.b32 r352, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r352; +mov.b32 r357, {high, high}; +} +{ +mul.f16x2 r359, r349, r357; +} +{ +fma.rn.f16x2 r362, r346, r355, r359; +} +{ +mul.f16x2 r366, r346, r357; +} +{ +neg.f16x2 r369, r366; +} +{ +fma.rn.f16x2 r371, r349, r355, r369; +} +barrier.sync 0; +and.b32 r606, r82, 7680; +add.s32 r607, r605, r606; +st.shared.u32 [r607], r340; +st.shared.u32 [r607+256], r362; +barrier.sync 0; +and.b32 r608, r87, 3840; +sub.s32 r609, r607, r608; +ld.shared.u32 r393, [r609]; +ld.shared.u32 r394, [r609+4096]; +barrier.sync 0; +st.shared.u32 [r607], r343; +st.shared.u32 [r607+256], r371; +barrier.sync 0; +ld.shared.u32 r396, [r609]; +ld.shared.u32 r397, [r609+4096]; +{ +add.f16x2 r392, r393, r394; +} +{ +add.f16x2 r395, r396, r397; +} +{ +sub.f16x2 r398, r393, r394; +} +{ +sub.f16x2 r401, r396, r397; +} +bfe.u32 r610, r21, 7, 3; +and.b32 r611, r87, 508; +add.s32 r612, r84, r611; +cvt.rn.f32.u32 f91, r610; +mul.f32 f92, f91, 0f3EC90FDB; +cos.approx.f32 f57, f92; +sin.approx.f32 f93, f92; +neg.f32 f58, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r404; +mov.b32 r407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r404; +mov.b32 r409, {high, high}; +} +{ +mul.f16x2 r411, r401, r409; +} +{ +fma.rn.f16x2 r414, r398, r407, r411; +} +{ +mul.f16x2 r418, r398, r409; +} +{ +neg.f16x2 r421, r418; +} +{ +fma.rn.f16x2 r423, r401, r407, r421; +} +barrier.sync 0; +and.b32 r613, r82, 7168; +add.s32 r614, r612, r613; +st.shared.u32 [r614], r392; +st.shared.u32 [r614+512], r414; +barrier.sync 0; +and.b32 r615, r87, 3584; +sub.s32 r616, r614, r615; +ld.shared.u32 r445, [r616]; +ld.shared.u32 r446, [r616+4096]; +barrier.sync 0; +st.shared.u32 [r614], r395; +st.shared.u32 [r614+512], r423; +barrier.sync 0; +ld.shared.u32 r448, [r616]; +ld.shared.u32 r449, [r616+4096]; +{ +add.f16x2 r444, r445, r446; +} +{ +add.f16x2 r447, r448, r449; +} +{ +sub.f16x2 r450, r445, r446; +} +{ +sub.f16x2 r453, r448, r449; +} +bfe.u32 r617, r21, 8, 2; +and.b32 r618, r87, 1020; +add.s32 r619, r84, r618; +cvt.rn.f32.u32 f94, r617; +mul.f32 f95, f94, 0f3F490FDB; +cos.approx.f32 f63, f95; +sin.approx.f32 f96, f95; +neg.f32 f64, f96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f63; +cvt.rn.f16.f32 high, f64; +mov.b32 r456, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r456; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r453, r461; +} +{ +fma.rn.f16x2 r466, r450, r459, r463; +} +{ +mul.f16x2 r470, r450, r461; +} +{ +neg.f16x2 r473, r470; +} +{ +fma.rn.f16x2 r475, r453, r459, r473; +} +barrier.sync 0; +and.b32 r620, r82, 6144; +add.s32 r621, r619, r620; +st.shared.u32 [r621], r444; +st.shared.u32 [r621+1024], r466; +barrier.sync 0; +and.b32 r622, r87, 3072; +sub.s32 r623, r621, r622; +ld.shared.u32 r497, [r623]; +ld.shared.u32 r498, [r623+4096]; +barrier.sync 0; +st.shared.u32 [r621], r447; +st.shared.u32 [r621+1024], r475; +barrier.sync 0; +ld.shared.u32 r500, [r623]; +ld.shared.u32 r501, [r623+4096]; +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r501; +} +{ +sub.f16x2 r502, r497, 
r498; +} +{ +sub.f16x2 r505, r500, r501; +} +bfe.u32 r624, r21, 9, 1; +and.b32 r625, r87, 2044; +add.s32 r626, r84, r625; +cvt.rn.f32.u32 f97, r624; +mul.f32 f98, f97, 0f3FC90FDB; +cos.approx.f32 f69, f98; +sin.approx.f32 f99, f98; +neg.f32 f70, f99; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r508, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r511, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r508; +mov.b32 r513, {high, high}; +} +{ +mul.f16x2 r515, r505, r513; +} +{ +fma.rn.f16x2 r518, r502, r511, r515; +} +{ +mul.f16x2 r522, r502, r513; +} +{ +neg.f16x2 r525, r522; +} +{ +fma.rn.f16x2 r527, r505, r511, r525; +} +barrier.sync 0; +and.b32 r627, r82, 4096; +add.s32 r628, r626, r627; +st.shared.u32 [r628], r496; +st.shared.u32 [r628+2048], r518; +barrier.sync 0; +and.b32 r629, r87, 2048; +sub.s32 r630, r628, r629; +ld.shared.u32 r549, [r630]; +ld.shared.u32 r550, [r630+4096]; +barrier.sync 0; +st.shared.u32 [r628], r499; +st.shared.u32 [r628+2048], r527; +barrier.sync 0; +ld.shared.u32 r552, [r630]; +ld.shared.u32 r553, [r630+4096]; +{ +add.f16x2 %0, r549, r550; +} +{ +add.f16x2 %1, r552, r553; +} +{ +sub.f16x2 %2, r549, r550; +} +{ +sub.f16x2 %3, r552, r553; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..bd205ef141035 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_fwd.hpp.inc @@ -0,0 +1,7891 @@ +#ifndef CUFFTDX_FFT_2048_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_2048_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<91, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<891>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, 
f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 
f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 
f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, 
f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8128; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 
[r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+512]; +ld.shared.f32 f391, [r13+1024]; +ld.shared.f32 f392, [r13+1536]; +ld.shared.f32 f393, [r13+2048]; +ld.shared.f32 f394, [r13+2560]; +ld.shared.f32 f395, [r13+3072]; +ld.shared.f32 f396, [r13+3584]; +ld.shared.f32 f397, [r13+4096]; +ld.shared.f32 f398, [r13+4608]; +ld.shared.f32 f399, [r13+5120]; +ld.shared.f32 f400, [r13+5632]; +ld.shared.f32 f401, [r13+6144]; +ld.shared.f32 f402, [r13+6656]; +ld.shared.f32 f403, [r13+7168]; +ld.shared.f32 f404, [r13+7680]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+512]; +ld.shared.f32 f407, [r13+1024]; +ld.shared.f32 f408, [r13+1536]; +ld.shared.f32 f409, [r13+2048]; +ld.shared.f32 f410, [r13+2560]; +ld.shared.f32 f411, [r13+3072]; +ld.shared.f32 f412, [r13+3584]; +ld.shared.f32 f413, [r13+4096]; +ld.shared.f32 f414, [r13+4608]; +ld.shared.f32 f415, [r13+5120]; +ld.shared.f32 f416, [r13+5632]; +ld.shared.f32 f417, [r13+6144]; +ld.shared.f32 f418, [r13+6656]; +ld.shared.f32 f419, [r13+7168]; +ld.shared.f32 f420, [r13+7680]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; 
+add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 
0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; 
+add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; +add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 112; +bfe.u32 r15, r5, 4, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; 
+fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; +mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 
f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; +mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 7168; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+512]; +ld.shared.f32 f747, [r21+1024]; +ld.shared.f32 f748, [r21+1536]; +ld.shared.f32 f749, [r21+2048]; +ld.shared.f32 f750, [r21+2560]; +ld.shared.f32 f751, [r21+3072]; +ld.shared.f32 f752, [r21+3584]; +ld.shared.f32 f753, [r21+4096]; +ld.shared.f32 f754, [r21+4608]; +ld.shared.f32 f755, [r21+5120]; +ld.shared.f32 f756, [r21+5632]; +ld.shared.f32 f757, [r21+6144]; +ld.shared.f32 f758, [r21+6656]; +ld.shared.f32 f759, [r21+7168]; +ld.shared.f32 f760, [r21+7680]; +barrier.sync 
0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+512]; +ld.shared.f32 f763, [r21+1024]; +ld.shared.f32 f764, [r21+1536]; +ld.shared.f32 f765, [r21+2048]; +ld.shared.f32 f766, [r21+2560]; +ld.shared.f32 f767, [r21+3072]; +ld.shared.f32 f768, [r21+3584]; +ld.shared.f32 f769, [r21+4096]; +ld.shared.f32 f770, [r21+4608]; +ld.shared.f32 f771, [r21+5120]; +ld.shared.f32 f772, [r21+5632]; +ld.shared.f32 f773, [r21+6144]; +ld.shared.f32 f774, [r21+6656]; +ld.shared.f32 f775, [r21+7168]; +ld.shared.f32 f776, [r21+7680]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +add.f32 f789, f779, f784; +sub.f32 f790, f780, f783; +sub.f32 f791, f779, f784; +add.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +add.f32 f805, f795, f800; +sub.f32 f806, f796, f799; +sub.f32 f807, f795, f800; +add.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0fBF3504F3; +sub.f32 f811, f809, 
f810; +mul.f32 f812, f806, 0f3F3504F3; +fma.rn.f32 f813, f805, 0fBF3504F3, f812; +mul.f32 f814, f807, 0fBF3504F3; +mul.f32 f815, f808, 0fBF3504F3; +sub.f32 f816, f814, f815; +add.f32 f817, f814, f815; +add.f32 f818, f746, f754; +add.f32 f819, f762, f770; +sub.f32 f820, f746, f754; +sub.f32 f821, f762, f770; +add.f32 f822, f750, f758; +add.f32 f823, f766, f774; +sub.f32 f824, f750, f758; +sub.f32 f825, f766, f774; +add.f32 f826, f818, f822; +add.f32 f827, f819, f823; +sub.f32 f828, f818, f822; +sub.f32 f829, f819, f823; +add.f32 f830, f820, f825; +sub.f32 f831, f821, f824; +sub.f32 f832, f820, f825; +add.f32 f833, f821, f824; +add.f32 f834, f748, f756; +add.f32 f835, f764, f772; +sub.f32 f836, f748, f756; +sub.f32 f837, f764, f772; +add.f32 f838, f752, f760; +add.f32 f839, f768, f776; +sub.f32 f840, f752, f760; +sub.f32 f841, f768, f776; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +add.f32 f846, f836, f841; +sub.f32 f847, f837, f840; +sub.f32 f848, f836, f841; +add.f32 f849, f837, f840; +mul.f32 f850, f846, 0f3F3504F3; +mul.f32 f851, f847, 0fBF3504F3; +sub.f32 f852, f850, f851; +mul.f32 f853, f847, 0f3F3504F3; +fma.rn.f32 f854, f846, 0fBF3504F3, f853; +mul.f32 f855, f848, 0fBF3504F3; +mul.f32 f856, f849, 0fBF3504F3; +sub.f32 f857, f855, f856; +add.f32 f858, f855, f856; +add.f32 %0, f785, f801; +add.f32 %1, f786, f802; +add.f32 %2, f826, f842; +add.f32 %3, f827, f843; +add.f32 %5, f790, f813; +add.f32 %4, f789, f811; +add.f32 %7, f831, f854; +add.f32 %6, f830, f852; +sub.f32 %9, f788, f803; +add.f32 %8, f787, f804; +sub.f32 %11, f829, f844; +add.f32 %10, f828, f845; +add.f32 %13, f792, f817; +add.f32 %12, f791, f816; +add.f32 %15, f833, f858; +add.f32 %14, f832, f857; +sub.f32 %16, f785, f801; +sub.f32 %17, f786, f802; +sub.f32 %18, f826, f842; +sub.f32 %19, f827, f843; +sub.f32 %21, f790, f813; +sub.f32 %20, f789, f811; +sub.f32 %23, f831, f854; +sub.f32 %22, f830, f852; +add.f32 %25, f788, f803; 
+sub.f32 %24, f787, f804; +add.f32 %27, f829, f844; +sub.f32 %26, f828, f845; +sub.f32 %29, f792, f817; +sub.f32 %28, f791, f816; +sub.f32 %31, f833, f858; +sub.f32 %30, f832, f857; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_2048), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +/* cufftdx_private_function<93, float, 1>: a single inline-PTX block that looks machine-generated; it transforms the 8 complex values rmem[0..7] in place. Operands: %0-%15 are the outputs rmem[0..7] (.x then .y); %16 is smem, used as the base of every st.shared and ld.shared address after adding tid.y<<13 and (tid.x<<5)&-8192; %17, %18, %19 are lut_sp_8_2048, lut_sp_8_256 and lut_sp_8_32, each read once with ld.global.v2.f32 at an offset derived from bits of tid.x; %20-%39 are the inputs rmem[0..7], where the .y of elements 1, 2, 4 and 5 is listed twice and the asm references only the second copy (%24, %27, %32, %35), so do not renumber or dedupe operands without rewriting the asm. Flow: an 8-input butterfly on the register values, then three rounds of (complex-multiply seven of the eight values by successive powers of the loaded LUT pair, exchange first the real parts and then the imaginary parts through shared memory, butterfly again); the butterfly of the last round is add/sub only. The constants 0f3F3504F3 and 0fBF3504F3 are +0.70710677f and -0.70710677f. Each exchange is fenced by barrier.sync 0, so presumably every thread of the block must reach this call together - TODO confirm against callers. NOTE(review): judging only by the LUT names this looks like a 2048-point FFT step with 8 elements per thread - confirm against the cuFFTDx database entry. */template<> __forceinline__ __device__ void cufftdx_private_function<93, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<491>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, 
%37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; 
+sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8160; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+1024]; +ld.shared.f32 f161, [r13+2048]; +ld.shared.f32 f162, [r13+3072]; +ld.shared.f32 f163, [r13+4096]; +ld.shared.f32 f164, [r13+5120]; +ld.shared.f32 f165, [r13+6144]; +ld.shared.f32 f166, [r13+7168]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+1024]; +ld.shared.f32 f169, [r13+2048]; +ld.shared.f32 
f170, [r13+3072]; +ld.shared.f32 f171, [r13+4096]; +ld.shared.f32 f172, [r13+5120]; +ld.shared.f32 f173, [r13+6144]; +ld.shared.f32 f174, [r13+7168]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 248; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, 
f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 7936; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; 
+st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+1024]; +ld.shared.f32 f303, [r20+2048]; +ld.shared.f32 f304, [r20+3072]; +ld.shared.f32 f305, [r20+4096]; +ld.shared.f32 f306, [r20+5120]; +ld.shared.f32 f307, [r20+6144]; +ld.shared.f32 f308, [r20+7168]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+1024]; +ld.shared.f32 f311, [r20+2048]; +ld.shared.f32 f312, [r20+3072]; +ld.shared.f32 f313, [r20+4096]; +ld.shared.f32 f314, [r20+5120]; +ld.shared.f32 f315, [r20+6144]; +ld.shared.f32 f316, [r20+7168]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +add.f32 f329, f319, f324; +sub.f32 f330, f320, f323; +sub.f32 f331, f319, f324; +add.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +add.f32 f345, f335, f340; +sub.f32 f346, f336, f339; +sub.f32 f347, f335, f340; +add.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0fBF3504F3; +sub.f32 f351, f349, f350; +mul.f32 f352, f346, 0f3F3504F3; +fma.rn.f32 f353, f345, 
0fBF3504F3, f352; +mul.f32 f354, f347, 0fBF3504F3; +mul.f32 f355, f348, 0fBF3504F3; +sub.f32 f356, f354, f355; +add.f32 f357, f354, f355; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f353; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f353; +add.f32 f366, f327, f344; +sub.f32 f367, f328, f343; +sub.f32 f368, f327, f344; +add.f32 f369, f328, f343; +add.f32 f370, f331, f356; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f356; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 192; +bfe.u32 r22, r5, 6, 2; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f374, f362; +mul.f32 f379, f375, f363; +sub.f32 f380, f378, f379; +mul.f32 f381, f374, f363; +fma.rn.f32 f382, f375, f362, f381; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f385, f366; +mul.f32 f389, f387, f367; +sub.f32 f390, f388, f389; +mul.f32 f391, f385, f367; +fma.rn.f32 f392, f387, f366, f391; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f395, f370; +mul.f32 f399, f397, f371; +sub.f32 f400, f398, f399; +mul.f32 f401, f395, f371; +fma.rn.f32 f402, f397, f370, f401; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f405, f360; +mul.f32 f409, f407, f361; +sub.f32 f410, f408, f409; +mul.f32 f411, f405, f361; +fma.rn.f32 f412, f407, f360, f411; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f415, f364; +mul.f32 f419, f417, f365; +sub.f32 f420, f418, f419; +mul.f32 f421, f415, f365; 
+fma.rn.f32 f422, f417, f364, f421; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f425, f368; +mul.f32 f429, f427, f369; +sub.f32 f430, f428, f429; +mul.f32 f431, f425, f369; +fma.rn.f32 f432, f427, f368, f431; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f435, f372; +mul.f32 f439, f437, f373; +sub.f32 f440, f438, f439; +mul.f32 f441, f435, f373; +fma.rn.f32 f442, f437, f372, f441; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 6144; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f380; +st.shared.f32 [r26+512], f390; +st.shared.f32 [r26+768], f400; +st.shared.f32 [r26+1024], f410; +st.shared.f32 [r26+1280], f420; +st.shared.f32 [r26+1536], f430; +st.shared.f32 [r26+1792], f440; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+1024]; +ld.shared.f32 f445, [r27+2048]; +ld.shared.f32 f446, [r27+3072]; +ld.shared.f32 f447, [r27+4096]; +ld.shared.f32 f448, [r27+5120]; +ld.shared.f32 f449, [r27+6144]; +ld.shared.f32 f450, [r27+7168]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+1024]; +ld.shared.f32 f453, [r27+2048]; +ld.shared.f32 f454, [r27+3072]; +ld.shared.f32 f455, [r27+4096]; +ld.shared.f32 f456, [r27+5120]; +ld.shared.f32 f457, [r27+6144]; +ld.shared.f32 f458, [r27+7168]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; 
+sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f444, f448; +add.f32 f468, f452, f456; +sub.f32 f469, f444, f448; +sub.f32 f470, f452, f456; +add.f32 f471, f446, f450; +add.f32 f472, f454, f458; +sub.f32 f473, f446, f450; +sub.f32 f474, f454, f458; +add.f32 %0, f459, f463; +add.f32 %1, f460, f464; +add.f32 %2, f467, f471; +add.f32 %3, f468, f472; +sub.f32 %5, f462, f465; +add.f32 %4, f461, f466; +sub.f32 %7, f470, f473; +add.f32 %6, f469, f474; +sub.f32 %8, f459, f463; +sub.f32 %9, f460, f464; +sub.f32 %10, f467, f471; +sub.f32 %11, f468, f472; +add.f32 %13, f462, f465; +sub.f32 %12, f461, f466; +add.f32 %15, f470, f473; +sub.f32 %14, f469, f474; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_2048), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<92, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<539>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, 
f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, 
f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 16320; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+2048]; +ld.shared.v2.f32 {f167, f168}, [r13+4096]; +ld.shared.v2.f32 {f171, f172}, [r13+6144]; +ld.shared.v2.f32 {f175, f176}, [r13+8192]; +ld.shared.v2.f32 {f179, f180}, [r13+10240]; +ld.shared.v2.f32 {f183, f184}, [r13+12288]; +ld.shared.v2.f32 {f187, f188}, [r13+14336]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, 
f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 248; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, 
f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 15872; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+2048]; +ld.shared.v2.f32 {f325, f326}, 
[r19+4096]; +ld.shared.v2.f32 {f329, f330}, [r19+6144]; +ld.shared.v2.f32 {f333, f334}, [r19+8192]; +ld.shared.v2.f32 {f337, f338}, [r19+10240]; +ld.shared.v2.f32 {f341, f342}, [r19+12288]; +ld.shared.v2.f32 {f345, f346}, [r19+14336]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +add.f32 f361, f351, f356; +sub.f32 f362, f352, f355; +sub.f32 f363, f351, f356; +add.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +add.f32 f377, f367, f372; +sub.f32 f378, f368, f371; +sub.f32 f379, f367, f372; +add.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0fBF3504F3; +sub.f32 f383, f381, f382; +mul.f32 f384, f378, 0f3F3504F3; +fma.rn.f32 f385, f377, 0fBF3504F3, f384; +mul.f32 f386, f379, 0fBF3504F3; +mul.f32 f387, f380, 0fBF3504F3; +sub.f32 f388, f386, f387; +add.f32 f389, f386, f387; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f385; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f385; +add.f32 f396, f359, f376; +sub.f32 f397, f360, f375; +sub.f32 f398, f359, f376; +add.f32 f399, f360, f375; +add.f32 f400, f363, f388; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f388; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 192; +bfe.u32 r21, r5, 6, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f404, f392; +mul.f32 f409, f405, f393; +mul.f32 f410, f404, 
f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f413, f396; +mul.f32 f417, f415, f397; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f421, f400; +mul.f32 f425, f423, f401; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f429, f390; +mul.f32 f433, f431, f391; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f437, f394; +mul.f32 f441, f439, f395; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f445, f398; +mul.f32 f449, f447, f399; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f453, f402; +mul.f32 f457, f455, f403; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 12288; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f405, f392, f410; +sub.f32 f462, f408, f409; +st.shared.v2.f32 [r25+512], {f462, f461}; +fma.rn.f32 f463, f415, f396, f418; +sub.f32 f464, f416, f417; +st.shared.v2.f32 [r25+1024], {f464, f463}; +fma.rn.f32 f465, f423, f400, f426; +sub.f32 f466, f424, f425; +st.shared.v2.f32 [r25+1536], {f466, f465}; +sub.f32 f467, f432, f433; +fma.rn.f32 f468, f431, f390, f434; +st.shared.v2.f32 [r25+2048], {f467, f468}; 
+fma.rn.f32 f469, f439, f394, f442; +sub.f32 f470, f440, f441; +st.shared.v2.f32 [r25+2560], {f470, f469}; +fma.rn.f32 f471, f447, f398, f450; +sub.f32 f472, f448, f449; +st.shared.v2.f32 [r25+3072], {f472, f471}; +fma.rn.f32 f473, f455, f402, f458; +sub.f32 f474, f456, f457; +st.shared.v2.f32 [r25+3584], {f474, f473}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+2048]; +ld.shared.v2.f32 {f483, f484}, [r26+4096]; +ld.shared.v2.f32 {f487, f488}, [r26+6144]; +ld.shared.v2.f32 {f491, f492}, [r26+8192]; +ld.shared.v2.f32 {f495, f496}, [r26+10240]; +ld.shared.v2.f32 {f499, f500}, [r26+12288]; +ld.shared.v2.f32 {f503, f504}, [r26+14336]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f479, f495; +add.f32 f516, f480, f496; +sub.f32 f517, f479, f495; +sub.f32 f518, f480, f496; +add.f32 f519, f487, f503; +add.f32 f520, f488, f504; +sub.f32 f521, f487, f503; +sub.f32 f522, f488, f504; +add.f32 %1, f508, f512; +add.f32 %0, f507, f511; +add.f32 %3, f516, f520; +add.f32 %2, f515, f519; +sub.f32 %5, f510, f513; +add.f32 %4, f509, f514; +sub.f32 %7, f518, f521; +add.f32 %6, f517, f522; +sub.f32 %9, f508, f512; +sub.f32 %8, f507, f511; +sub.f32 %11, f516, f520; +sub.f32 %10, f515, f519; +add.f32 %13, f510, f513; +sub.f32 %12, f509, f514; +add.f32 %15, f518, f521; +sub.f32 %14, f517, f522; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_2048), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), 
"f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<94, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1162>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1151, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; +add.f32 f1149, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1148, f1151, f1149; +sub.f32 f76, f1151, f1149; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1147, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1144, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1142, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1141, f1144, f1142; +sub.f32 f92, f1144, f1142; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1140, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1140, 0fBF3504F3; +mul.f32 f1139, f93, 0f3F3504F3; +sub.f32 f99, f1139, f98; +mul.f32 f100, f1140, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1138, f1148, f1141; +sub.f32 f109, f1148, f1141; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1137, f1147, f101; +sub.f32 f113, f1147, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f1136, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; 
+add.f32 f1135, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1133, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1130, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1129, f1133, f1130; +sub.f32 f133, f1133, f1130; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1128, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1126, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1124, %76, %66; +sub.f32 f145, %76, %66; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1123, f1126, f1124; +sub.f32 f149, f1126, f1124; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f1122, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1122, 0fBF3504F3; +mul.f32 f1121, f150, 0f3F3504F3; +sub.f32 f156, f1121, f155; +mul.f32 f157, f1122, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1120, f1129, f1123; +sub.f32 f166, f1129, f1123; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1119, f1128, f158; +sub.f32 f170, f1128, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1118, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1117, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1115, f167, 0f3F6C835E; +mul.f32 f1116, f1119, 0fBEC3EF15; +sub.f32 f181, f1115, f1116; +mul.f32 f182, f1119, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1113, f171, 0f3F3504F3; +mul.f32 f1114, f1118, 0fBF3504F3; +sub.f32 f186, f1113, f1114; +mul.f32 f187, f1118, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1111, 
f175, 0f3EC3EF15; +mul.f32 f1112, f1117, 0fBF6C835E; +sub.f32 f191, f1111, f1112; +mul.f32 f192, f1117, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1109, f169, 0fBEC3EF15; +mul.f32 f1110, f170, 0fBF6C835E; +sub.f32 f196, f1109, f1110; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1107, f177, 0fBF6C835E; +mul.f32 f1108, f178, 0fBEC3EF15; +sub.f32 f205, f1107, f1108; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1106, f1137, f183; +sub.f32 f213, f1137, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1105, f1136, f188; +sub.f32 f217, f1136, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f1104, f1135, f193; +sub.f32 f221, f1135, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f1103, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f1102, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f1101, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1100, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1106; +mul.f32 f244, f238, f1106; +mul.f32 f246, f239, f239; +mul.f32 f1099, f238, f238; +sub.f32 f247, f1099, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1105; +mul.f32 f252, f247, f1105; +mul.f32 f1097, f238, f247; +mul.f32 f1098, f239, f249; +sub.f32 f255, f1097, f1098; +mul.f32 f1096, f247, 
f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f1104; +mul.f32 f260, f255, f1104; +mul.f32 f262, f239, f257; +mul.f32 f1095, f238, f255; +sub.f32 f263, f1095, f262; +mul.f32 f1094, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f1103; +mul.f32 f268, f263, f1103; +mul.f32 f270, f239, f265; +mul.f32 f1093, f238, f263; +sub.f32 f271, f1093, f270; +mul.f32 f1092, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f1102; +mul.f32 f276, f271, f1102; +mul.f32 f1090, f238, f271; +mul.f32 f1091, f239, f273; +sub.f32 f279, f1090, f1091; +mul.f32 f1089, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f1101; +mul.f32 f284, f279, f1101; +mul.f32 f286, f239, f281; +mul.f32 f1088, f238, f279; +sub.f32 f287, f1088, f286; +mul.f32 f1087, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f1100; +mul.f32 f292, f287, f1100; +mul.f32 f294, f239, f289; +mul.f32 f1086, f238, f287; +sub.f32 f295, f1086, f294; +mul.f32 f1085, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1084, f1138, f1120; +mul.f32 f299, f297, f1084; +mul.f32 f300, f295, f1084; +mul.f32 f1082, f238, f295; +mul.f32 f1083, f239, f297; +sub.f32 f303, f1082, f1083; +sub.f32 f1081, f106, f163; +mul.f32 f1080, f295, f1081; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1079, f238, f303; +sub.f32 f311, f1079, f310; +mul.f32 f1078, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f1076, f238, f311; +mul.f32 f1077, f239, f313; +sub.f32 f319, f1076, f1077; +mul.f32 f1075, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; 
+mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1074, f238, f319; +sub.f32 f327, f1074, f326; +mul.f32 f1073, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1072, f238, f327; +sub.f32 f335, f1072, f334; +mul.f32 f1071, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f1069, f238, f335; +mul.f32 f1070, f239, f337; +sub.f32 f343, f1069, f1070; +mul.f32 f1068, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1067, f238, f343; +sub.f32 f351, f1067, f350; +mul.f32 f1066, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f1065, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1159, f1138, f1120; +mul.f32 f1158, f297, f1159; +barrier.sync 0; +and.b32 r11, r7, 16256; +add.s32 r12, r9, r11; +add.f32 f357, f1138, f1120; +sub.f32 f1157, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r35, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f1066, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f1096, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f1094, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f1092, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f1089, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f1087, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f1085, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1157, f300; +sub.f32 f374, f1080, f1158; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 
f376, f1078, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f1075, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f1073, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f1071, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f1068, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f1065, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r21, r35, 127; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+1024]; +ld.shared.v2.f32 {f397, f398}, [r13+2048]; +ld.shared.v2.f32 {f401, f402}, [r13+3072]; +ld.shared.v2.f32 {f405, f406}, [r13+4096]; +ld.shared.v2.f32 {f409, f410}, [r13+5120]; +ld.shared.v2.f32 {f413, f414}, [r13+6144]; +ld.shared.v2.f32 {f417, f418}, [r13+7168]; +ld.shared.v2.f32 {f421, f422}, [r13+8192]; +ld.shared.v2.f32 {f425, f426}, [r13+9216]; +ld.shared.v2.f32 {f429, f430}, [r13+10240]; +ld.shared.v2.f32 {f433, f434}, [r13+11264]; +ld.shared.v2.f32 {f437, f438}, [r13+12288]; +ld.shared.v2.f32 {f441, f442}, [r13+13312]; +ld.shared.v2.f32 {f445, f446}, [r13+14336]; +ld.shared.v2.f32 {f449, f450}, [r13+15360]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1064, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1063, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1062, f1064, f1063; +sub.f32 f464, f1064, f1063; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f1061, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1060, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, 
f445; +add.f32 f1059, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1058, f1060, f1059; +sub.f32 f480, f1060, f1059; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f1057, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f1055, f481, 0f3F3504F3; +mul.f32 f1056, f1057, 0fBF3504F3; +sub.f32 f487, f1055, f1056; +mul.f32 f488, f1057, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1054, f1062, f1058; +sub.f32 f497, f1062, f1058; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1053, f1061, f489; +sub.f32 f501, f1061, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f1052, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f1051, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1050, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1049, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1048, f1050, f1049; +sub.f32 f521, f1050, f1049; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f1047, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1046, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1045, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1044, f1046, f1045; +sub.f32 f537, f1046, f1045; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f1043, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f1041, f538, 0f3F3504F3; +mul.f32 f1042, f1043, 0fBF3504F3; +sub.f32 f544, f1041, f1042; 
+mul.f32 f545, f1043, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1040, f1048, f1044; +sub.f32 f554, f1048, f1044; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1039, f1047, f546; +sub.f32 f558, f1047, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f1038, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f1037, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1039, 0fBEC3EF15; +mul.f32 f1036, f555, 0f3F6C835E; +sub.f32 f569, f1036, f568; +mul.f32 f570, f1039, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f1038, 0fBF3504F3; +mul.f32 f1035, f559, 0f3F3504F3; +sub.f32 f574, f1035, f573; +mul.f32 f575, f1038, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f1033, f563, 0f3EC3EF15; +mul.f32 f1034, f1037, 0fBF6C835E; +sub.f32 f579, f1033, f1034; +mul.f32 f580, f1037, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f1031, f557, 0fBEC3EF15; +mul.f32 f1032, f558, 0fBF6C835E; +sub.f32 f584, f1031, f1032; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f1030, f565, 0fBF6C835E; +sub.f32 f593, f1030, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1029, f1053, f571; +sub.f32 f601, f1053, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1028, f1052, f576; +sub.f32 f605, f1052, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f1027, f1051, f581; +sub.f32 f609, f1051, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, 
f554; +sub.f32 f1026, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 f616, f500, f584; +add.f32 f1025, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f1024, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1023, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r35, 112; +bfe.u32 r15, r35, 4, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f1029; +mul.f32 f632, f626, f1029; +mul.f32 f634, f627, f627; +mul.f32 f1022, f626, f626; +sub.f32 f635, f1022, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f639, f637, f1028; +mul.f32 f640, f635, f1028; +mul.f32 f1020, f626, f635; +mul.f32 f1021, f627, f637; +sub.f32 f643, f1020, f1021; +mul.f32 f1019, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f1027; +mul.f32 f648, f643, f1027; +mul.f32 f650, f627, f645; +mul.f32 f1018, f626, f643; +sub.f32 f651, f1018, f650; +mul.f32 f1017, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f1026; +mul.f32 f656, f651, f1026; +mul.f32 f658, f627, f653; +mul.f32 f1016, f626, f651; +sub.f32 f659, f1016, f658; +mul.f32 f1015, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f1025; +mul.f32 f664, f659, f1025; +mul.f32 f1013, f626, f659; +mul.f32 f1014, f627, f661; +sub.f32 f667, f1013, f1014; +mul.f32 f1012, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f1024; +mul.f32 f672, f667, f1024; +mul.f32 f674, f627, f669; +mul.f32 f1011, f626, f667; +sub.f32 f675, f1011, f674; +mul.f32 f1010, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f1023; +mul.f32 f680, f675, f1023; +mul.f32 f682, f627, f677; +mul.f32 
f1009, f626, f675; +sub.f32 f683, f1009, f682; +mul.f32 f1008, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1007, f1054, f1040; +mul.f32 f687, f685, f1007; +mul.f32 f688, f683, f1007; +mul.f32 f1005, f626, f683; +mul.f32 f1006, f627, f685; +sub.f32 f691, f1005, f1006; +sub.f32 f1004, f494, f551; +mul.f32 f1003, f683, f1004; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1002, f626, f691; +sub.f32 f699, f1002, f698; +mul.f32 f1001, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f999, f626, f699; +mul.f32 f1000, f627, f701; +sub.f32 f707, f999, f1000; +mul.f32 f998, f699, f604; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f997, f626, f707; +sub.f32 f715, f997, f714; +mul.f32 f996, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f995, f626, f715; +sub.f32 f723, f995, f722; +mul.f32 f994, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f992, f626, f723; +mul.f32 f993, f627, f725; +sub.f32 f731, f992, f993; +mul.f32 f991, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f990, f626, f731; +sub.f32 f739, f990, f738; +mul.f32 f989, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f988, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +mov.u32 r25, %tid.x; +shl.b32 r24, r25, 3; +and.b32 r16, r24, 120; +add.s32 r17, r9, r16; +mov.u32 r27, 
%tid.x; +shl.b32 r26, r27, 7; +barrier.sync 0; +and.b32 r18, r26, 14336; +add.s32 r19, r17, r18; +mov.u32 r29, %tid.x; +and.b32 r28, r29, 112; +add.f32 f745, f1054, f1040; +sub.f32 f1156, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r32, %tid.x; +and.b32 r31, r32, 112; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f989, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f1019, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f1017, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f1015, f655; +st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f1012, f663; +fma.rn.f32 f756, f661, f614, f664; +st.shared.v2.f32 [r19+640], {f755, f756}; +fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f1010, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f1008, f679; +st.shared.v2.f32 [r19+896], {f760, f759}; +fma.rn.f32 f761, f685, f1156, f688; +sub.f32 f762, f1003, f687; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f1001, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f998, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f996, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f994, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f991, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f988, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +mad.lo.s32 r20, r31, -120, r19; +ld.shared.v2.f32 {f777, 
f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+1024]; +ld.shared.v2.f32 {f785, f786}, [r20+2048]; +ld.shared.v2.f32 {f789, f790}, [r20+3072]; +ld.shared.v2.f32 {f793, f794}, [r20+4096]; +ld.shared.v2.f32 {f797, f798}, [r20+5120]; +ld.shared.v2.f32 {f801, f802}, [r20+6144]; +ld.shared.v2.f32 {f805, f806}, [r20+7168]; +ld.shared.v2.f32 {f809, f810}, [r20+8192]; +ld.shared.v2.f32 {f813, f814}, [r20+9216]; +ld.shared.v2.f32 {f817, f818}, [r20+10240]; +ld.shared.v2.f32 {f821, f822}, [r20+11264]; +ld.shared.v2.f32 {f825, f826}, [r20+12288]; +ld.shared.v2.f32 {f829, f830}, [r20+13312]; +ld.shared.v2.f32 {f833, f834}, [r20+14336]; +ld.shared.v2.f32 {f837, f838}, [r20+15360]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f987, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f986, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f985, f987, f986; +sub.f32 f852, f987, f986; +add.f32 f853, f843, f848; +sub.f32 f855, f843, f848; +sub.f32 f984, f844, f847; +add.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f983, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f982, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f981, f983, f982; +sub.f32 f868, f983, f982; +add.f32 f869, f859, f864; +sub.f32 f871, f859, f864; +sub.f32 f980, f860, f863; +add.f32 f872, f860, f863; +mul.f32 f978, f869, 0f3F3504F3; +mul.f32 f979, f980, 0fBF3504F3; +sub.f32 f875, f978, f979; +mul.f32 f876, f980, 0f3F3504F3; +fma.rn.f32 f877, f869, 0fBF3504F3, f876; +mul.f32 f878, f871, 0fBF3504F3; +mul.f32 f879, f872, 0fBF3504F3; +sub.f32 f880, f878, f879; +add.f32 f881, f878, f879; +add.f32 f882, f781, f813; +sub.f32 f884, f781, f813; +add.f32 f977, f782, f814; +sub.f32 f885, f782, f814; +add.f32 f886, f797, f829; +sub.f32 f888, f797, f829; 
+add.f32 f976, f798, f830; +sub.f32 f889, f798, f830; +add.f32 f890, f882, f886; +sub.f32 f892, f882, f886; +add.f32 f975, f977, f976; +sub.f32 f893, f977, f976; +add.f32 f894, f884, f889; +sub.f32 f896, f884, f889; +sub.f32 f974, f885, f888; +add.f32 f897, f885, f888; +add.f32 f898, f789, f821; +sub.f32 f900, f789, f821; +add.f32 f973, f790, f822; +sub.f32 f901, f790, f822; +add.f32 f902, f805, f837; +sub.f32 f904, f805, f837; +add.f32 f972, f806, f838; +sub.f32 f905, f806, f838; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f971, f973, f972; +sub.f32 f909, f973, f972; +add.f32 f910, f900, f905; +sub.f32 f912, f900, f905; +sub.f32 f970, f901, f904; +add.f32 f913, f901, f904; +mul.f32 f968, f910, 0f3F3504F3; +mul.f32 f969, f970, 0fBF3504F3; +sub.f32 f916, f968, f969; +mul.f32 f917, f970, 0f3F3504F3; +fma.rn.f32 f918, f910, 0fBF3504F3, f917; +mul.f32 f919, f912, 0fBF3504F3; +mul.f32 f920, f913, 0fBF3504F3; +sub.f32 f921, f919, f920; +add.f32 f922, f919, f920; +add.f32 %1, f985, f981; +add.f32 %0, f849, f865; +add.f32 %3, f975, f971; +add.f32 %2, f890, f906; +add.f32 %4, f853, f875; +add.f32 %5, f984, f877; +add.f32 %6, f894, f916; +add.f32 %7, f974, f918; +add.f32 %8, f851, f868; +sub.f32 %9, f852, f867; +sub.f32 %11, f893, f908; +add.f32 %10, f892, f909; +add.f32 %13, f856, f881; +add.f32 %12, f855, f880; +add.f32 %15, f897, f922; +add.f32 %14, f896, f921; +sub.f32 %17, f985, f981; +sub.f32 %16, f849, f865; +sub.f32 %19, f975, f971; +sub.f32 %18, f890, f906; +sub.f32 %21, f984, f877; +sub.f32 %20, f853, f875; +sub.f32 %23, f974, f918; +sub.f32 %22, f894, f916; +add.f32 %25, f852, f867; +sub.f32 %24, f851, f868; +add.f32 %27, f893, f908; +sub.f32 %26, f892, f909; +sub.f32 %29, f856, f881; +sub.f32 %28, f855, f880; +sub.f32 %31, f897, f922; +sub.f32 %30, f896, f921; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), 
"=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_2048), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<95, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2469>; +.reg .b32 r<41>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2459, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2457, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2456, f2459, f2457; +sub.f32 f140, f2459, f2457; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2455, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2452, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; 
+sub.f32 f151, %91, %123; +add.f32 f2450, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2449, f2452, f2450; +sub.f32 f156, f2452, f2450; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2448, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2448, 0fBF3504F3; +mul.f32 f2447, f157, 0f3F3504F3; +sub.f32 f163, f2447, f162; +mul.f32 f164, f2448, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2446, f2456, f2449; +sub.f32 f173, f2456, f2449; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2445, f2455, f165; +sub.f32 f177, f2455, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2444, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2443, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2441, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2438, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2437, f2441, f2438; +sub.f32 f197, f2441, f2438; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2436, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2434, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2432, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2431, f2434, f2432; +sub.f32 f213, f2434, f2432; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2430, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2430, 0fBF3504F3; +mul.f32 f2429, f214, 0f3F3504F3; +sub.f32 f220, f2429, 
f219; +mul.f32 f221, f2430, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2428, f2437, f2431; +sub.f32 f230, f2437, f2431; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2427, f2436, f222; +sub.f32 f234, f2436, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2426, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2425, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2423, f231, 0f3F6C835E; +mul.f32 f2424, f2427, 0fBEC3EF15; +sub.f32 f245, f2423, f2424; +mul.f32 f246, f2427, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2421, f235, 0f3F3504F3; +mul.f32 f2422, f2426, 0fBF3504F3; +sub.f32 f250, f2421, f2422; +mul.f32 f251, f2426, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2419, f239, 0f3EC3EF15; +mul.f32 f2420, f2425, 0fBF6C835E; +sub.f32 f255, f2419, f2420; +mul.f32 f256, f2425, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2417, f233, 0fBEC3EF15; +mul.f32 f2418, f234, 0fBF6C835E; +sub.f32 f260, f2417, f2418; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2415, f241, 0fBF6C835E; +mul.f32 f2416, f242, 0fBEC3EF15; +sub.f32 f269, f2415, f2416; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2414, f2446, f2428; +sub.f32 f275, f2446, f2428; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2413, f2445, f247; +sub.f32 f279, f2445, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2412, f2444, f252; +sub.f32 f283, f2444, f252; +add.f32 f284, f182, f255; 
+sub.f32 f286, f182, f255; +add.f32 f2411, f2443, f257; +sub.f32 f287, f2443, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2410, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2409, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2408, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2407, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2404, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2402, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2401, f2404, f2402; +sub.f32 f315, f2404, f2402; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2400, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2398, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2395, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2394, f2398, f2395; +sub.f32 f331, f2398, f2395; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2393, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2391, f332, 0f3F3504F3; +mul.f32 f2392, f2393, 0fBF3504F3; +sub.f32 f338, f2391, f2392; +mul.f32 f339, f2393, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2390, f2401, f2394; +sub.f32 f348, f2401, f2394; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2389, f2400, f340; +sub.f32 f352, f2400, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2388, f315, f330; +add.f32 
f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2387, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2385, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2383, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2382, f2385, f2383; +sub.f32 f372, f2385, f2383; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2381, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2378, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2377, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2376, f2378, f2377; +sub.f32 f388, f2378, f2377; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2375, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2373, f389, 0f3F3504F3; +mul.f32 f2374, f2375, 0fBF3504F3; +sub.f32 f395, f2373, f2374; +mul.f32 f396, f2375, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2372, f2382, f2376; +sub.f32 f405, f2382, f2376; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2371, f2381, f397; +sub.f32 f409, f2381, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2370, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2369, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2371, 0fBEC3EF15; +mul.f32 f2368, f406, 0f3F6C835E; +sub.f32 f420, f2368, f419; +mul.f32 f421, f2371, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2370, 0fBF3504F3; +mul.f32 f2367, f410, 0f3F3504F3; +sub.f32 f425, f2367, f424; 
+mul.f32 f426, f2370, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2365, f414, 0f3EC3EF15; +mul.f32 f2366, f2369, 0fBF6C835E; +sub.f32 f430, f2365, f2366; +mul.f32 f431, f2369, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2363, f408, 0fBEC3EF15; +mul.f32 f2364, f409, 0fBF6C835E; +sub.f32 f435, f2363, f2364; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2362, f416, 0fBF6C835E; +sub.f32 f444, f2362, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2361, f2390, f2372; +sub.f32 f450, f2390, f2372; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2360, f2389, f422; +sub.f32 f454, f2389, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2359, f2388, f427; +sub.f32 f458, f2388, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2358, f2387, f432; +sub.f32 f462, f2387, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2357, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2356, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2355, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2354, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2360, 0fBE47C5C2; +mul.f32 f2353, f451, 0f3F7B14BE; +sub.f32 f481, f2353, f480; +mul.f32 f482, f2360, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2359, 0fBEC3EF15; +mul.f32 f2352, f455, 0f3F6C835E; +sub.f32 f486, f2352, f485; +mul.f32 f487, f2359, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2358, 0fBF0E39DA; +mul.f32 f2351, f459, 
0f3F54DB31; +sub.f32 f491, f2351, f490; +mul.f32 f492, f2358, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2357, 0fBF3504F3; +mul.f32 f2350, f463, 0f3F3504F3; +sub.f32 f496, f2350, f495; +mul.f32 f497, f2357, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2348, f467, 0f3F0E39DA; +mul.f32 f2349, f2356, 0fBF54DB31; +sub.f32 f501, f2348, f2349; +mul.f32 f502, f2356, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2346, f471, 0f3EC3EF15; +mul.f32 f2347, f2355, 0fBF6C835E; +sub.f32 f506, f2346, f2347; +mul.f32 f507, f2355, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2344, f475, 0f3E47C5C2; +mul.f32 f2345, f2354, 0fBF7B14BE; +sub.f32 f511, f2344, f2345; +mul.f32 f512, f2354, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2342, f453, 0fBE47C5C2; +mul.f32 f2343, f454, 0fBF7B14BE; +sub.f32 f516, f2342, f2343; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2341, f457, 0fBEC3EF15; +sub.f32 f521, f2341, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2340, f461, 0fBF0E39DA; +sub.f32 f526, f2340, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2338, f469, 0fBF54DB31; +mul.f32 f2339, f470, 0fBF0E39DA; +sub.f32 f535, f2338, f2339; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2337, f473, 0fBF6C835E; +sub.f32 f540, f2337, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2336, f477, 0fBF7B14BE; +sub.f32 f545, f2336, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, 
f276, f481; +add.f32 f2335, f2413, f483; +sub.f32 f553, f2413, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2334, f2412, f488; +sub.f32 f557, f2412, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2333, f2411, f493; +sub.f32 f561, f2411, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2332, f2410, f498; +sub.f32 f565, f2410, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f2331, f2409, f503; +sub.f32 f569, f2409, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f2330, f2408, f508; +sub.f32 f573, f2408, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f2329, f2407, f513; +sub.f32 f577, f2407, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f2328, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f2327, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f2326, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f2325, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f2324, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2323, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2322, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2321, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f2335; +mul.f32 f616, f610, f2335; +mul.f32 f618, f611, f611; +mul.f32 f2320, f610, f610; +sub.f32 f619, f2320, f618; +mul.f32 
f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f2334; +mul.f32 f624, f619, f2334; +mul.f32 f626, f611, f621; +mul.f32 f2319, f610, f619; +sub.f32 f627, f2319, f626; +mul.f32 f2318, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f2333; +mul.f32 f632, f627, f2333; +mul.f32 f2316, f610, f627; +mul.f32 f2317, f611, f629; +sub.f32 f635, f2316, f2317; +mul.f32 f2315, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f2332; +mul.f32 f640, f635, f2332; +mul.f32 f642, f611, f637; +mul.f32 f2314, f610, f635; +sub.f32 f643, f2314, f642; +mul.f32 f2313, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f2331; +mul.f32 f648, f643, f2331; +mul.f32 f2311, f610, f643; +mul.f32 f2312, f611, f645; +sub.f32 f651, f2311, f2312; +mul.f32 f2310, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f2330; +mul.f32 f656, f651, f2330; +mul.f32 f658, f611, f653; +mul.f32 f2309, f610, f651; +sub.f32 f659, f2309, f658; +mul.f32 f2308, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f2329; +mul.f32 f664, f659, f2329; +mul.f32 f666, f611, f661; +mul.f32 f2307, f610, f659; +sub.f32 f667, f2307, f666; +mul.f32 f2306, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f2328; +mul.f32 f672, f667, f2328; +mul.f32 f2304, f610, f667; +mul.f32 f2305, f611, f669; +sub.f32 f675, f2304, f2305; +mul.f32 f2303, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f2327; +mul.f32 f680, f675, f2327; +mul.f32 f682, f611, f677; +mul.f32 f2302, f610, f675; +sub.f32 f683, f2302, f682; +mul.f32 f2301, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f2326; +mul.f32 f688, f683, f2326; +mul.f32 f690, f611, f685; +mul.f32 f2300, 
f610, f683; +sub.f32 f691, f2300, f690; +mul.f32 f2299, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f2325; +mul.f32 f696, f691, f2325; +mul.f32 f2297, f610, f691; +mul.f32 f2298, f611, f693; +sub.f32 f699, f2297, f2298; +mul.f32 f2296, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f2324; +mul.f32 f704, f699, f2324; +mul.f32 f706, f611, f701; +mul.f32 f2295, f610, f699; +sub.f32 f707, f2295, f706; +mul.f32 f2294, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f2323; +mul.f32 f712, f707, f2323; +mul.f32 f2292, f610, f707; +mul.f32 f2293, f611, f709; +sub.f32 f715, f2292, f2293; +mul.f32 f2291, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f2322; +mul.f32 f720, f715, f2322; +mul.f32 f722, f611, f717; +mul.f32 f2290, f610, f715; +sub.f32 f723, f2290, f722; +mul.f32 f2289, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f2321; +mul.f32 f728, f723, f2321; +mul.f32 f730, f611, f725; +mul.f32 f2288, f610, f723; +sub.f32 f731, f2288, f730; +mul.f32 f2287, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2286, f2414, f2361; +mul.f32 f735, f733, f2286; +mul.f32 f736, f731, f2286; +mul.f32 f2284, f610, f731; +mul.f32 f2285, f611, f733; +sub.f32 f739, f2284, f2285; +sub.f32 f2283, f272, f447; +mul.f32 f2282, f731, f2283; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2281, f610, f739; +sub.f32 f747, f2281, f746; +mul.f32 f2280, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2279, f610, f747; +sub.f32 f755, f2279, f754; +mul.f32 f2278, f747, f556; +mul.f32 f756, f610, f749; 
+fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f2276, f610, f755; +mul.f32 f2277, f611, f757; +sub.f32 f763, f2276, f2277; +mul.f32 f2275, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2274, f610, f763; +sub.f32 f771, f2274, f770; +mul.f32 f2273, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f2271, f610, f771; +mul.f32 f2272, f611, f773; +sub.f32 f779, f2271, f2272; +mul.f32 f2270, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2269, f610, f779; +sub.f32 f787, f2269, f786; +mul.f32 f2268, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2267, f610, f787; +sub.f32 f795, f2267, f794; +mul.f32 f2266, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f2264, f610, f795; +mul.f32 f2265, f611, f797; +sub.f32 f803, f2264, f2265; +mul.f32 f2263, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2262, f610, f803; +sub.f32 f811, f2262, f810; +mul.f32 f2261, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2260, f610, f811; +sub.f32 f819, f2260, f818; +mul.f32 f2259, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f2257, f610, f819; +mul.f32 f2258, f611, f821; +sub.f32 f827, f2257, 
f2258; +mul.f32 f2256, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2255, f610, f827; +sub.f32 f835, f2255, f834; +mul.f32 f2254, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f2252, f610, f835; +mul.f32 f2253, f611, f837; +sub.f32 f843, f2252, f2253; +mul.f32 f2251, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2250, f610, f843; +sub.f32 f851, f2250, f850; +mul.f32 f2249, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f2248, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 8; +barrier.sync 0; +and.b32 r11, r32, 16128; +add.s32 r12, r9, r11; +add.f32 f857, f2414, f2361; +sub.f32 f2462, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r40, %tid.x; +shl.b32 r28, r40, 3; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f2249, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f2318, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f2315, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f2313, f639; +sub.f32 f867, f2310, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f2308, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f2306, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f2303, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f2301, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, 
f586, f688; +sub.f32 f878, f2299, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f2296, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f2294, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f2291, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f2289, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f2287, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f2462, f736; +sub.f32 f890, f2282, f735; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f2280, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f2278, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f2275, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f2273, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f2270, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f2268, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f2266, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f2263, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f2261, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f2259, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f2256, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f2254, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f2251, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f2248, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; 
+barrier.sync 0; +and.b32 r21, r40, 63; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+512]; +ld.shared.v2.f32 {f929, f930}, [r13+1024]; +ld.shared.v2.f32 {f933, f934}, [r13+1536]; +ld.shared.v2.f32 {f937, f938}, [r13+2048]; +ld.shared.v2.f32 {f941, f942}, [r13+2560]; +ld.shared.v2.f32 {f945, f946}, [r13+3072]; +ld.shared.v2.f32 {f949, f950}, [r13+3584]; +ld.shared.v2.f32 {f953, f954}, [r13+4096]; +ld.shared.v2.f32 {f957, f958}, [r13+4608]; +ld.shared.v2.f32 {f961, f962}, [r13+5120]; +ld.shared.v2.f32 {f965, f966}, [r13+5632]; +ld.shared.v2.f32 {f969, f970}, [r13+6144]; +ld.shared.v2.f32 {f973, f974}, [r13+6656]; +ld.shared.v2.f32 {f977, f978}, [r13+7168]; +ld.shared.v2.f32 {f981, f982}, [r13+7680]; +ld.shared.v2.f32 {f985, f986}, [r13+8192]; +ld.shared.v2.f32 {f989, f990}, [r13+8704]; +ld.shared.v2.f32 {f993, f994}, [r13+9216]; +ld.shared.v2.f32 {f997, f998}, [r13+9728]; +ld.shared.v2.f32 {f1001, f1002}, [r13+10240]; +ld.shared.v2.f32 {f1005, f1006}, [r13+10752]; +ld.shared.v2.f32 {f1009, f1010}, [r13+11264]; +ld.shared.v2.f32 {f1013, f1014}, [r13+11776]; +ld.shared.v2.f32 {f1017, f1018}, [r13+12288]; +ld.shared.v2.f32 {f1021, f1022}, [r13+12800]; +ld.shared.v2.f32 {f1025, f1026}, [r13+13312]; +ld.shared.v2.f32 {f1029, f1030}, [r13+13824]; +ld.shared.v2.f32 {f1033, f1034}, [r13+14336]; +ld.shared.v2.f32 {f1037, f1038}, [r13+14848]; +ld.shared.v2.f32 {f1041, f1042}, [r13+15360]; +ld.shared.v2.f32 {f1045, f1046}, [r13+15872]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2247, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2246, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2245, f2247, f2246; +sub.f32 f1060, f2247, f2246; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f2244, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, 
f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2243, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2242, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2241, f2243, f2242; +sub.f32 f1076, f2243, f2242; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f2240, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f2240, 0fBF3504F3; +mul.f32 f2239, f1077, 0f3F3504F3; +sub.f32 f1083, f2239, f1082; +mul.f32 f1084, f2240, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2238, f2245, f2241; +sub.f32 f1093, f2245, f2241; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2237, f2244, f1085; +sub.f32 f1097, f2244, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f2236, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f2235, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2234, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2233, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2232, f2234, f2233; +sub.f32 f1117, f2234, f2233; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f2231, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2230, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2229, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, 
f1126; +add.f32 f2228, f2230, f2229; +sub.f32 f1133, f2230, f2229; +add.f32 f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f2227, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f2227, 0fBF3504F3; +mul.f32 f2226, f1134, 0f3F3504F3; +sub.f32 f1140, f2226, f1139; +mul.f32 f1141, f2227, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2225, f2232, f2228; +sub.f32 f1150, f2232, f2228; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2224, f2231, f1142; +sub.f32 f1154, f2231, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f2223, f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f2222, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2220, f1151, 0f3F6C835E; +mul.f32 f2221, f2224, 0fBEC3EF15; +sub.f32 f1165, f2220, f2221; +mul.f32 f1166, f2224, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f2218, f1155, 0f3F3504F3; +mul.f32 f2219, f2223, 0fBF3504F3; +sub.f32 f1170, f2218, f2219; +mul.f32 f1171, f2223, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f2222, 0fBF6C835E; +mul.f32 f2217, f1159, 0f3EC3EF15; +sub.f32 f1175, f2217, f1174; +mul.f32 f1176, f2222, 0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f2216, f1153, 0fBEC3EF15; +sub.f32 f1180, f2216, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f2214, f1161, 0fBF6C835E; +mul.f32 f2215, f1162, 0fBEC3EF15; +sub.f32 f1189, f2214, f2215; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, 
f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2213, f2238, f2225; +sub.f32 f1195, f2238, f2225; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2212, f2237, f1167; +sub.f32 f1199, f2237, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2211, f2236, f1172; +sub.f32 f1203, f2236, f1172; +add.f32 f1204, f1102, f1175; +sub.f32 f1206, f1102, f1175; +add.f32 f2210, f2235, f1177; +sub.f32 f1207, f2235, f1177; +add.f32 f1208, f1092, f1150; +sub.f32 f1210, f1092, f1150; +sub.f32 f2209, f1093, f1149; +add.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1180; +sub.f32 f1214, f1096, f1180; +add.f32 f2208, f1097, f1182; +sub.f32 f1215, f1097, f1182; +add.f32 f1216, f1100, f1185; +sub.f32 f1218, f1100, f1185; +add.f32 f2207, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2206, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2205, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2204, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2203, f2205, f2204; +sub.f32 f1235, f2205, f2204; +add.f32 f1236, f1226, f1231; +sub.f32 f1238, f1226, f1231; +sub.f32 f2202, f1227, f1230; +add.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2201, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2200, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2199, f2201, f2200; +sub.f32 f1251, f2201, f2200; +add.f32 f1252, f1242, f1247; +sub.f32 f1254, f1242, f1247; +sub.f32 f2198, f1243, f1246; +add.f32 f1255, f1243, f1246; +mul.f32 f1257, f2198, 0fBF3504F3; +mul.f32 f2197, f1252, 0f3F3504F3; +sub.f32 f1258, f2197, 
f1257; +mul.f32 f1259, f2198, 0f3F3504F3; +fma.rn.f32 f1260, f1252, 0fBF3504F3, f1259; +mul.f32 f1261, f1254, 0fBF3504F3; +mul.f32 f1262, f1255, 0fBF3504F3; +sub.f32 f1263, f1261, f1262; +add.f32 f1264, f1261, f1262; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2196, f2203, f2199; +sub.f32 f1268, f2203, f2199; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2195, f2202, f1260; +sub.f32 f1272, f2202, f1260; +add.f32 f1273, f1234, f1251; +sub.f32 f1275, f1234, f1251; +sub.f32 f2194, f1235, f1250; +add.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1263; +sub.f32 f1279, f1238, f1263; +add.f32 f2193, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2192, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2191, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2190, f2192, f2191; +sub.f32 f1292, f2192, f2191; +add.f32 f1293, f1283, f1288; +sub.f32 f1295, f1283, f1288; +sub.f32 f2189, f1284, f1287; +add.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2188, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2187, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2186, f2188, f2187; +sub.f32 f1308, f2188, f2187; +add.f32 f1309, f1299, f1304; +sub.f32 f1311, f1299, f1304; +sub.f32 f2185, f1300, f1303; +add.f32 f1312, f1300, f1303; +mul.f32 f1314, f2185, 0fBF3504F3; +mul.f32 f2184, f1309, 0f3F3504F3; +sub.f32 f1315, f2184, f1314; +mul.f32 f1316, f2185, 0f3F3504F3; +fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316; +mul.f32 f1318, f1311, 0fBF3504F3; +mul.f32 f1319, f1312, 0fBF3504F3; +sub.f32 f1320, f1318, f1319; +add.f32 f1321, f1318, f1319; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, 
f1305; +add.f32 f2183, f2190, f2186; +sub.f32 f1325, f2190, f2186; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2182, f2189, f1317; +sub.f32 f1329, f2189, f1317; +add.f32 f1330, f1291, f1308; +sub.f32 f1332, f1291, f1308; +sub.f32 f2181, f1292, f1307; +add.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1320; +sub.f32 f1336, f1295, f1320; +add.f32 f2180, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2178, f1326, 0f3F6C835E; +mul.f32 f2179, f2182, 0fBEC3EF15; +sub.f32 f1340, f2178, f2179; +mul.f32 f1341, f2182, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341; +mul.f32 f2176, f1330, 0f3F3504F3; +mul.f32 f2177, f2181, 0fBF3504F3; +sub.f32 f1345, f2176, f2177; +mul.f32 f1346, f2181, 0f3F3504F3; +fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346; +mul.f32 f2174, f1334, 0f3EC3EF15; +mul.f32 f2175, f2180, 0fBF6C835E; +sub.f32 f1350, f2174, f2175; +mul.f32 f1351, f2180, 0f3EC3EF15; +fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351; +mul.f32 f2172, f1328, 0fBEC3EF15; +mul.f32 f2173, f1329, 0fBF6C835E; +sub.f32 f1355, f2172, f2173; +mul.f32 f1356, f1329, 0fBEC3EF15; +fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356; +mul.f32 f1358, f1332, 0fBF3504F3; +mul.f32 f1359, f1333, 0fBF3504F3; +sub.f32 f1360, f1358, f1359; +add.f32 f1361, f1358, f1359; +mul.f32 f2170, f1336, 0fBF6C835E; +mul.f32 f2171, f1337, 0fBEC3EF15; +sub.f32 f1364, f2170, f2171; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2169, f2196, f2183; +sub.f32 f1370, f2196, f2183; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2168, f2195, f1342; +sub.f32 f1374, f2195, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2167, f2194, f1347; +sub.f32 f1378, f2194, f1347; +add.f32 f1379, f1277, f1350; +sub.f32 f1381, f1277, f1350; +add.f32 f2166, f2193, f1352; +sub.f32 f1382, f2193, f1352; +add.f32 f1383, f1267, f1325; +sub.f32 f1385, f1267, f1325; 
+sub.f32 f2165, f1268, f1324; +add.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1355; +sub.f32 f1389, f1271, f1355; +add.f32 f2164, f1272, f1357; +sub.f32 f1390, f1272, f1357; +add.f32 f1391, f1275, f1360; +sub.f32 f1393, f1275, f1360; +add.f32 f2163, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2162, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2168, 0fBE47C5C2; +mul.f32 f2161, f1371, 0f3F7B14BE; +sub.f32 f1401, f2161, f1400; +mul.f32 f1402, f2168, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402; +mul.f32 f1405, f2167, 0fBEC3EF15; +mul.f32 f2160, f1375, 0f3F6C835E; +sub.f32 f1406, f2160, f1405; +mul.f32 f1407, f2167, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407; +mul.f32 f2158, f1379, 0f3F54DB31; +mul.f32 f2159, f2166, 0fBF0E39DA; +sub.f32 f1411, f2158, f2159; +mul.f32 f1412, f2166, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412; +mul.f32 f2156, f1383, 0f3F3504F3; +mul.f32 f2157, f2165, 0fBF3504F3; +sub.f32 f1416, f2156, f2157; +mul.f32 f1417, f2165, 0f3F3504F3; +fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417; +mul.f32 f2154, f1387, 0f3F0E39DA; +mul.f32 f2155, f2164, 0fBF54DB31; +sub.f32 f1421, f2154, f2155; +mul.f32 f1422, f2164, 0f3F0E39DA; +fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422; +mul.f32 f2152, f1391, 0f3EC3EF15; +mul.f32 f2153, f2163, 0fBF6C835E; +sub.f32 f1426, f2152, f2153; +mul.f32 f1427, f2163, 0f3EC3EF15; +fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427; +mul.f32 f1430, f2162, 0fBF7B14BE; +mul.f32 f2151, f1395, 0f3E47C5C2; +sub.f32 f1431, f2151, f1430; +mul.f32 f1432, f2162, 0f3E47C5C2; +fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432; +mul.f32 f1435, f1374, 0fBF7B14BE; +mul.f32 f2150, f1373, 0fBE47C5C2; +sub.f32 f1436, f2150, f1435; +mul.f32 f1437, f1374, 0fBE47C5C2; +fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437; +mul.f32 f1440, f1378, 0fBF6C835E; +mul.f32 f2149, f1377, 0fBEC3EF15; +sub.f32 f1441, f2149, f1440; +mul.f32 f1442, f1378, 0fBEC3EF15; 
+fma.rn.f32 f1443, f1377, 0fBF6C835E, f1442; +mul.f32 f1445, f1382, 0fBF54DB31; +mul.f32 f2148, f1381, 0fBF0E39DA; +sub.f32 f1446, f2148, f1445; +mul.f32 f1447, f1382, 0fBF0E39DA; +fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447; +mul.f32 f1449, f1385, 0fBF3504F3; +mul.f32 f1450, f1386, 0fBF3504F3; +sub.f32 f1451, f1449, f1450; +add.f32 f1452, f1449, f1450; +mul.f32 f1454, f1390, 0fBF0E39DA; +mul.f32 f2147, f1389, 0fBF54DB31; +sub.f32 f1455, f2147, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456; +mul.f32 f1459, f1394, 0fBEC3EF15; +mul.f32 f2146, f1393, 0fBF6C835E; +sub.f32 f1460, f2146, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461; +mul.f32 f1464, f1398, 0fBE47C5C2; +mul.f32 f2145, f1397, 0fBF7B14BE; +sub.f32 f1465, f2145, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2144, f2212, f1403; +sub.f32 f1473, f2212, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2143, f2211, f1408; +sub.f32 f1477, f2211, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2142, f2210, f1413; +sub.f32 f1481, f2210, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2141, f2209, f1418; +sub.f32 f1485, f2209, f1418; +add.f32 f1486, f1212, f1421; +sub.f32 f1488, f1212, f1421; +add.f32 f2140, f2208, f1423; +sub.f32 f1489, f2208, f1423; +add.f32 f1490, f1216, f1426; +sub.f32 f1492, f1216, f1426; +add.f32 f2139, f2207, f1428; +sub.f32 f1493, f2207, f1428; +add.f32 f1494, f1220, f1431; +sub.f32 f1496, f1220, f1431; +add.f32 f2138, f2206, f1433; +sub.f32 f1497, f2206, f1433; +add.f32 f1498, f1194, f1370; +sub.f32 f1500, f1194, f1370; +sub.f32 f2137, f1195, f1369; +add.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1436; +sub.f32 f1504, f1198, f1436; +add.f32 f2136, f1199, f1438; +sub.f32 f1505, f1199, f1438; +add.f32 f1506, f1202, 
f1441; +sub.f32 f1508, f1202, f1441; +add.f32 f2135, f1203, f1443; +sub.f32 f1509, f1203, f1443; +add.f32 f1510, f1206, f1446; +sub.f32 f1512, f1206, f1446; +add.f32 f2134, f1207, f1448; +sub.f32 f1513, f1207, f1448; +add.f32 f1514, f1210, f1451; +sub.f32 f1516, f1210, f1451; +add.f32 f2133, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2132, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2131, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2130, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r40, 32; +bfe.u32 r15, r40, 5, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1535, f1531, f2144; +mul.f32 f1536, f1530, f2144; +mul.f32 f2128, f1530, f1530; +mul.f32 f2129, f1531, f1531; +sub.f32 f1539, f2128, f2129; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1543, f1541, f2143; +mul.f32 f1544, f1539, f2143; +mul.f32 f1546, f1531, f1541; +mul.f32 f2127, f1530, f1539; +sub.f32 f1547, f2127, f1546; +mul.f32 f2126, f1539, f1474; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1551, f1549, f2142; +mul.f32 f1552, f1547, f2142; +mul.f32 f1554, f1531, f1549; +mul.f32 f2125, f1530, f1547; +sub.f32 f1555, f2125, f1554; +mul.f32 f2124, f1547, f1478; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1559, f1557, f2141; +mul.f32 f1560, f1555, f2141; +mul.f32 f2122, f1530, f1555; +mul.f32 f2123, f1531, f1557; +sub.f32 f1563, f2122, f2123; +mul.f32 f2121, f1555, f1482; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1567, f1565, f2140; +mul.f32 f1568, f1563, f2140; +mul.f32 f1570, f1531, f1565; +mul.f32 f2120, f1530, f1563; +sub.f32 f1571, f2120, f1570; +mul.f32 f2119, f1563, 
f1486; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1575, f1573, f2139; +mul.f32 f1576, f1571, f2139; +mul.f32 f1578, f1531, f1573; +mul.f32 f2118, f1530, f1571; +sub.f32 f1579, f2118, f1578; +mul.f32 f2117, f1571, f1490; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1583, f1581, f2138; +mul.f32 f1584, f1579, f2138; +mul.f32 f2115, f1530, f1579; +mul.f32 f2116, f1531, f1581; +sub.f32 f1587, f2115, f2116; +mul.f32 f2114, f1579, f1494; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1591, f1589, f2137; +mul.f32 f1592, f1587, f2137; +mul.f32 f1594, f1531, f1589; +mul.f32 f2113, f1530, f1587; +sub.f32 f1595, f2113, f1594; +mul.f32 f2112, f1587, f1498; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1599, f1597, f2136; +mul.f32 f1600, f1595, f2136; +mul.f32 f2110, f1530, f1595; +mul.f32 f2111, f1531, f1597; +sub.f32 f1603, f2110, f2111; +mul.f32 f2109, f1595, f1502; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1607, f1605, f2135; +mul.f32 f1608, f1603, f2135; +mul.f32 f1610, f1531, f1605; +mul.f32 f2108, f1530, f1603; +sub.f32 f1611, f2108, f1610; +mul.f32 f2107, f1603, f1506; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1615, f1613, f2134; +mul.f32 f1616, f1611, f2134; +mul.f32 f1618, f1531, f1613; +mul.f32 f2106, f1530, f1611; +sub.f32 f1619, f2106, f1618; +mul.f32 f2105, f1611, f1510; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1623, f1621, f2133; +mul.f32 f1624, f1619, f2133; +mul.f32 f2103, f1530, f1619; +mul.f32 f2104, f1531, f1621; +sub.f32 f1627, f2103, f2104; +mul.f32 f2102, f1619, f1514; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1631, f1629, f2132; +mul.f32 f1632, f1627, f2132; +mul.f32 f1634, f1531, f1629; +mul.f32 f2101, f1530, f1627; +sub.f32 f1635, f2101, f1634; +mul.f32 f2100, f1627, 
f1518; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1639, f1637, f2131; +mul.f32 f1640, f1635, f2131; +mul.f32 f1642, f1531, f1637; +mul.f32 f2099, f1530, f1635; +sub.f32 f1643, f2099, f1642; +mul.f32 f2098, f1635, f1522; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1647, f1645, f2130; +mul.f32 f1648, f1643, f2130; +mul.f32 f2096, f1530, f1643; +mul.f32 f2097, f1531, f1645; +sub.f32 f1651, f2096, f2097; +mul.f32 f2095, f1643, f1526; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2094, f2213, f2169; +mul.f32 f1655, f1653, f2094; +mul.f32 f1656, f1651, f2094; +mul.f32 f1658, f1531, f1653; +mul.f32 f2093, f1530, f1651; +sub.f32 f1659, f2093, f1658; +sub.f32 f2092, f1192, f1367; +mul.f32 f2091, f1651, f2092; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1663, f1661, f1473; +mul.f32 f1664, f1659, f1473; +mul.f32 f2089, f1530, f1659; +mul.f32 f2090, f1531, f1661; +sub.f32 f1667, f2089, f2090; +mul.f32 f2088, f1659, f1472; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1671, f1669, f1477; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2087, f1530, f1667; +sub.f32 f1675, f2087, f1674; +mul.f32 f2086, f1667, f1476; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1679, f1677, f1481; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2085, f1530, f1675; +sub.f32 f1683, f2085, f1682; +mul.f32 f2084, f1675, f1480; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1687, f1685, f1485; +mul.f32 f1688, f1683, f1485; +mul.f32 f2082, f1530, f1683; +mul.f32 f2083, f1531, f1685; +sub.f32 f1691, f2082, f2083; +mul.f32 f2081, f1683, f1484; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1695, f1693, f1489; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2080, f1530, 
f1691; +sub.f32 f1699, f2080, f1698; +mul.f32 f2079, f1691, f1488; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1703, f1701, f1493; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2078, f1530, f1699; +sub.f32 f1707, f2078, f1706; +mul.f32 f2077, f1699, f1492; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1711, f1709, f1497; +mul.f32 f1712, f1707, f1497; +mul.f32 f2075, f1530, f1707; +mul.f32 f2076, f1531, f1709; +sub.f32 f1715, f2075, f2076; +mul.f32 f2074, f1707, f1496; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1719, f1717, f1501; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2073, f1530, f1715; +sub.f32 f1723, f2073, f1722; +mul.f32 f2072, f1715, f1500; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1727, f1725, f1505; +mul.f32 f1728, f1723, f1505; +mul.f32 f2070, f1530, f1723; +mul.f32 f2071, f1531, f1725; +sub.f32 f1731, f2070, f2071; +mul.f32 f2069, f1723, f1504; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1735, f1733, f1509; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2068, f1530, f1731; +sub.f32 f1739, f2068, f1738; +mul.f32 f2067, f1731, f1508; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1743, f1741, f1513; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2066, f1530, f1739; +sub.f32 f1747, f2066, f1746; +mul.f32 f2065, f1739, f1512; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1751, f1749, f1517; +mul.f32 f1752, f1747, f1517; +mul.f32 f2063, f1530, f1747; +mul.f32 f2064, f1531, f1749; +sub.f32 f1755, f2063, f2064; +mul.f32 f2062, f1747, f1516; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1759, f1757, f1521; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2061, f1530, 
f1755; +sub.f32 f1763, f2061, f1762; +mul.f32 f2060, f1755, f1520; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1767, f1765, f1525; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2059, f1530, f1763; +sub.f32 f1771, f2059, f1770; +mul.f32 f2058, f1530, f1470; +mul.f32 f1772, f1530, f1765; +mul.f32 f2057, f1763, f1524; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1771, f1528; +mul.f32 f1775, f1773, f1529; +mul.f32 f1776, f1771, f1529; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 8; +and.b32 r16, r28, 248; +add.s32 r17, r9, r16; +sub.f32 f2465, f2213, f2169; +mul.f32 f2464, f1653, f2465; +barrier.sync 0; +and.b32 r18, r23, 8192; +add.s32 r19, r17, r18; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 32; +add.f32 f1777, f2213, f2169; +sub.f32 f2466, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 32; +fma.rn.f32 f1779, f1531, f1470, f1536; +sub.f32 f1780, f2058, f1535; +st.shared.v2.f32 [r19+256], {f1780, f1779}; +fma.rn.f32 f1781, f1541, f1474, f1544; +sub.f32 f1782, f2126, f1543; +st.shared.v2.f32 [r19+512], {f1782, f1781}; +fma.rn.f32 f1783, f1549, f1478, f1552; +sub.f32 f1784, f2124, f1551; +st.shared.v2.f32 [r19+768], {f1784, f1783}; +fma.rn.f32 f1785, f1557, f1482, f1560; +sub.f32 f1786, f2121, f1559; +st.shared.v2.f32 [r19+1024], {f1786, f1785}; +fma.rn.f32 f1787, f1565, f1486, f1568; +sub.f32 f1788, f2119, f1567; +st.shared.v2.f32 [r19+1280], {f1788, f1787}; +sub.f32 f1789, f2117, f1575; +fma.rn.f32 f1790, f1573, f1490, f1576; +st.shared.v2.f32 [r19+1536], {f1789, f1790}; +fma.rn.f32 f1791, f1581, f1494, f1584; +sub.f32 f1792, f2114, f1583; +st.shared.v2.f32 [r19+1792], {f1792, f1791}; +fma.rn.f32 f1793, f1589, f1498, f1592; +sub.f32 f1794, f2112, f1591; +st.shared.v2.f32 [r19+2048], {f1794, f1793}; +fma.rn.f32 f1795, f1597, f1502, f1600; +sub.f32 f1796, f2109, f1599; +st.shared.v2.f32 [r19+2304], {f1796, f1795}; +fma.rn.f32 
f1797, f1605, f1506, f1608; +sub.f32 f1798, f2107, f1607; +st.shared.v2.f32 [r19+2560], {f1798, f1797}; +fma.rn.f32 f1799, f1613, f1510, f1616; +sub.f32 f1800, f2105, f1615; +st.shared.v2.f32 [r19+2816], {f1800, f1799}; +fma.rn.f32 f1801, f1621, f1514, f1624; +sub.f32 f1802, f2102, f1623; +st.shared.v2.f32 [r19+3072], {f1802, f1801}; +fma.rn.f32 f1803, f1629, f1518, f1632; +sub.f32 f1804, f2100, f1631; +st.shared.v2.f32 [r19+3328], {f1804, f1803}; +fma.rn.f32 f1805, f1637, f1522, f1640; +sub.f32 f1806, f2098, f1639; +st.shared.v2.f32 [r19+3584], {f1806, f1805}; +fma.rn.f32 f1807, f1645, f1526, f1648; +sub.f32 f1808, f2095, f1647; +st.shared.v2.f32 [r19+3840], {f1808, f1807}; +fma.rn.f32 f1809, f1653, f2466, f1656; +sub.f32 f1810, f2091, f2464; +st.shared.v2.f32 [r19+4096], {f1810, f1809}; +fma.rn.f32 f1811, f1661, f1472, f1664; +sub.f32 f1812, f2088, f1663; +st.shared.v2.f32 [r19+4352], {f1812, f1811}; +fma.rn.f32 f1813, f1669, f1476, f1672; +sub.f32 f1814, f2086, f1671; +st.shared.v2.f32 [r19+4608], {f1814, f1813}; +fma.rn.f32 f1815, f1677, f1480, f1680; +sub.f32 f1816, f2084, f1679; +st.shared.v2.f32 [r19+4864], {f1816, f1815}; +fma.rn.f32 f1817, f1685, f1484, f1688; +sub.f32 f1818, f2081, f1687; +st.shared.v2.f32 [r19+5120], {f1818, f1817}; +fma.rn.f32 f1819, f1693, f1488, f1696; +sub.f32 f1820, f2079, f1695; +st.shared.v2.f32 [r19+5376], {f1820, f1819}; +fma.rn.f32 f1821, f1701, f1492, f1704; +sub.f32 f1822, f2077, f1703; +st.shared.v2.f32 [r19+5632], {f1822, f1821}; +fma.rn.f32 f1823, f1709, f1496, f1712; +sub.f32 f1824, f2074, f1711; +st.shared.v2.f32 [r19+5888], {f1824, f1823}; +fma.rn.f32 f1825, f1717, f1500, f1720; +sub.f32 f1826, f2072, f1719; +st.shared.v2.f32 [r19+6144], {f1826, f1825}; +fma.rn.f32 f1827, f1725, f1504, f1728; +sub.f32 f1828, f2069, f1727; +st.shared.v2.f32 [r19+6400], {f1828, f1827}; +fma.rn.f32 f1829, f1733, f1508, f1736; +sub.f32 f1830, f2067, f1735; +st.shared.v2.f32 [r19+6656], {f1830, f1829}; +fma.rn.f32 f1831, f1741, f1512, f1744; 
+sub.f32 f1832, f2065, f1743; +st.shared.v2.f32 [r19+6912], {f1832, f1831}; +fma.rn.f32 f1833, f1749, f1516, f1752; +sub.f32 f1834, f2062, f1751; +st.shared.v2.f32 [r19+7168], {f1834, f1833}; +fma.rn.f32 f1835, f1757, f1520, f1760; +sub.f32 f1836, f2060, f1759; +st.shared.v2.f32 [r19+7424], {f1836, f1835}; +fma.rn.f32 f1837, f1765, f1524, f1768; +sub.f32 f1838, f2057, f1767; +st.shared.v2.f32 [r19+7680], {f1838, f1837}; +fma.rn.f32 f1839, f1773, f1528, f1776; +sub.f32 f1840, f1774, f1775; +st.shared.v2.f32 [r19+7936], {f1840, f1839}; +barrier.sync 0; +mad.lo.s32 r20, r30, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+512]; +ld.shared.v2.f32 {f1849, f1850}, [r20+1024]; +ld.shared.v2.f32 {f1853, f1854}, [r20+1536]; +ld.shared.v2.f32 {f1857, f1858}, [r20+2048]; +ld.shared.v2.f32 {f1861, f1862}, [r20+2560]; +ld.shared.v2.f32 {f1865, f1866}, [r20+3072]; +ld.shared.v2.f32 {f1869, f1870}, [r20+3584]; +ld.shared.v2.f32 {f1873, f1874}, [r20+4096]; +ld.shared.v2.f32 {f1877, f1878}, [r20+4608]; +ld.shared.v2.f32 {f1881, f1882}, [r20+5120]; +ld.shared.v2.f32 {f1885, f1886}, [r20+5632]; +ld.shared.v2.f32 {f1889, f1890}, [r20+6144]; +ld.shared.v2.f32 {f1893, f1894}, [r20+6656]; +ld.shared.v2.f32 {f1897, f1898}, [r20+7168]; +ld.shared.v2.f32 {f1901, f1902}, [r20+7680]; +ld.shared.v2.f32 {f1905, f1906}, [r20+8192]; +ld.shared.v2.f32 {f1909, f1910}, [r20+8704]; +ld.shared.v2.f32 {f1913, f1914}, [r20+9216]; +ld.shared.v2.f32 {f1917, f1918}, [r20+9728]; +ld.shared.v2.f32 {f1921, f1922}, [r20+10240]; +ld.shared.v2.f32 {f1925, f1926}, [r20+10752]; +ld.shared.v2.f32 {f1929, f1930}, [r20+11264]; +ld.shared.v2.f32 {f1933, f1934}, [r20+11776]; +ld.shared.v2.f32 {f1937, f1938}, [r20+12288]; +ld.shared.v2.f32 {f1941, f1942}, [r20+12800]; +ld.shared.v2.f32 {f1945, f1946}, [r20+13312]; +ld.shared.v2.f32 {f1949, f1950}, [r20+13824]; +ld.shared.v2.f32 {f1953, f1954}, [r20+14336]; +ld.shared.v2.f32 {f1957, f1958}, [r20+14848]; +ld.shared.v2.f32 {f1961, 
f1962}, [r20+15360]; +ld.shared.v2.f32 {f1965, f1966}, [r20+15872]; +add.f32 %0, f1841, f1905; +add.f32 %1, f1842, f1906; +add.f32 %3, f1846, f1910; +add.f32 %2, f1845, f1909; +add.f32 %5, f1850, f1914; +add.f32 %4, f1849, f1913; +add.f32 %7, f1854, f1918; +add.f32 %6, f1853, f1917; +add.f32 %8, f1857, f1921; +add.f32 %9, f1858, f1922; +add.f32 %10, f1861, f1925; +add.f32 %11, f1862, f1926; +add.f32 %12, f1865, f1929; +add.f32 %13, f1866, f1930; +add.f32 %15, f1870, f1934; +add.f32 %14, f1869, f1933; +add.f32 %17, f1874, f1938; +add.f32 %16, f1873, f1937; +add.f32 %19, f1878, f1942; +add.f32 %18, f1877, f1941; +add.f32 %20, f1881, f1945; +add.f32 %21, f1882, f1946; +add.f32 %22, f1885, f1949; +add.f32 %23, f1886, f1950; +add.f32 %24, f1889, f1953; +add.f32 %25, f1890, f1954; +add.f32 %26, f1893, f1957; +add.f32 %27, f1894, f1958; +add.f32 %29, f1898, f1962; +add.f32 %28, f1897, f1961; +add.f32 %31, f1902, f1966; +add.f32 %30, f1901, f1965; +sub.f32 %33, f1842, f1906; +sub.f32 %32, f1841, f1905; +sub.f32 %35, f1846, f1910; +sub.f32 %34, f1845, f1909; +sub.f32 %37, f1850, f1914; +sub.f32 %36, f1849, f1913; +sub.f32 %39, f1854, f1918; +sub.f32 %38, f1853, f1917; +sub.f32 %41, f1858, f1922; +sub.f32 %40, f1857, f1921; +sub.f32 %43, f1862, f1926; +sub.f32 %42, f1861, f1925; +sub.f32 %45, f1866, f1930; +sub.f32 %44, f1865, f1929; +sub.f32 %47, f1870, f1934; +sub.f32 %46, f1869, f1933; +sub.f32 %49, f1874, f1938; +sub.f32 %48, f1873, f1937; +sub.f32 %51, f1878, f1942; +sub.f32 %50, f1877, f1941; +sub.f32 %53, f1882, f1946; +sub.f32 %52, f1881, f1945; +sub.f32 %55, f1886, f1950; +sub.f32 %54, f1885, f1949; +sub.f32 %57, f1890, f1954; +sub.f32 %56, f1889, f1953; +sub.f32 %59, f1894, f1958; +sub.f32 %58, f1893, f1957; +sub.f32 %61, f1898, f1962; +sub.f32 %60, f1897, f1961; +sub.f32 %63, f1902, f1966; +sub.f32 %62, f1901, f1965; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), 
"=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_2048), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), 
"f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<96, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2361>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2359, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2357, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2356, f2359, f2357; +sub.f32 f140, f2359, f2357; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2355, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2352, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2350, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2349, f2352, f2350; +sub.f32 f156, f2352, f2350; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2348, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2348, 0fBF3504F3; +mul.f32 f2347, f157, 0f3F3504F3; +sub.f32 f163, f2347, f162; +mul.f32 f164, f2348, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2346, f2356, f2349; 
+sub.f32 f173, f2356, f2349; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2345, f2355, f165; +sub.f32 f177, f2355, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2344, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2343, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2341, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2338, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2337, f2341, f2338; +sub.f32 f197, f2341, f2338; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2336, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2334, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2332, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2331, f2334, f2332; +sub.f32 f213, f2334, f2332; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2330, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2330, 0fBF3504F3; +mul.f32 f2329, f214, 0f3F3504F3; +sub.f32 f220, f2329, f219; +mul.f32 f221, f2330, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2328, f2337, f2331; +sub.f32 f230, f2337, f2331; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2327, f2336, f222; +sub.f32 f234, f2336, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2326, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2325, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2323, f231, 0f3F6C835E; 
+mul.f32 f2324, f2327, 0fBEC3EF15; +sub.f32 f245, f2323, f2324; +mul.f32 f246, f2327, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2321, f235, 0f3F3504F3; +mul.f32 f2322, f2326, 0fBF3504F3; +sub.f32 f250, f2321, f2322; +mul.f32 f251, f2326, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2319, f239, 0f3EC3EF15; +mul.f32 f2320, f2325, 0fBF6C835E; +sub.f32 f255, f2319, f2320; +mul.f32 f256, f2325, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2317, f233, 0fBEC3EF15; +mul.f32 f2318, f234, 0fBF6C835E; +sub.f32 f260, f2317, f2318; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2315, f241, 0fBF6C835E; +mul.f32 f2316, f242, 0fBEC3EF15; +sub.f32 f269, f2315, f2316; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2314, f2346, f2328; +sub.f32 f275, f2346, f2328; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2313, f2345, f247; +sub.f32 f279, f2345, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2312, f2344, f252; +sub.f32 f283, f2344, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2311, f2343, f257; +sub.f32 f287, f2343, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2310, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2309, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2308, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2307, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2304, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; 
+add.f32 f2302, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2301, f2304, f2302; +sub.f32 f315, f2304, f2302; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2300, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2298, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2295, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2294, f2298, f2295; +sub.f32 f331, f2298, f2295; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2293, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2291, f332, 0f3F3504F3; +mul.f32 f2292, f2293, 0fBF3504F3; +sub.f32 f338, f2291, f2292; +mul.f32 f339, f2293, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2290, f2301, f2294; +sub.f32 f348, f2301, f2294; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2289, f2300, f340; +sub.f32 f352, f2300, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2288, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2287, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2285, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2283, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2282, f2285, f2283; +sub.f32 f372, f2285, f2283; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2281, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2278, %149, %150; +sub.f32 f380, %149, %150; +add.f32 
f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2277, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2276, f2278, f2277; +sub.f32 f388, f2278, f2277; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2275, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2273, f389, 0f3F3504F3; +mul.f32 f2274, f2275, 0fBF3504F3; +sub.f32 f395, f2273, f2274; +mul.f32 f396, f2275, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2272, f2282, f2276; +sub.f32 f405, f2282, f2276; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2271, f2281, f397; +sub.f32 f409, f2281, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2270, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2269, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2271, 0fBEC3EF15; +mul.f32 f2268, f406, 0f3F6C835E; +sub.f32 f420, f2268, f419; +mul.f32 f421, f2271, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2270, 0fBF3504F3; +mul.f32 f2267, f410, 0f3F3504F3; +sub.f32 f425, f2267, f424; +mul.f32 f426, f2270, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2265, f414, 0f3EC3EF15; +mul.f32 f2266, f2269, 0fBF6C835E; +sub.f32 f430, f2265, f2266; +mul.f32 f431, f2269, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2263, f408, 0fBEC3EF15; +mul.f32 f2264, f409, 0fBF6C835E; +sub.f32 f435, f2263, f2264; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2262, f416, 0fBF6C835E; +sub.f32 f444, f2262, f443; +mul.f32 f445, f417, 0fBF6C835E; 
+fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2261, f2290, f2272; +sub.f32 f450, f2290, f2272; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2260, f2289, f422; +sub.f32 f454, f2289, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2259, f2288, f427; +sub.f32 f458, f2288, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2258, f2287, f432; +sub.f32 f462, f2287, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2257, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2256, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2255, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2254, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2260, 0fBE47C5C2; +mul.f32 f2253, f451, 0f3F7B14BE; +sub.f32 f481, f2253, f480; +mul.f32 f482, f2260, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2259, 0fBEC3EF15; +mul.f32 f2252, f455, 0f3F6C835E; +sub.f32 f486, f2252, f485; +mul.f32 f487, f2259, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2258, 0fBF0E39DA; +mul.f32 f2251, f459, 0f3F54DB31; +sub.f32 f491, f2251, f490; +mul.f32 f492, f2258, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2257, 0fBF3504F3; +mul.f32 f2250, f463, 0f3F3504F3; +sub.f32 f496, f2250, f495; +mul.f32 f497, f2257, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2248, f467, 0f3F0E39DA; +mul.f32 f2249, f2256, 0fBF54DB31; +sub.f32 f501, f2248, f2249; +mul.f32 f502, f2256, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2246, f471, 0f3EC3EF15; +mul.f32 f2247, f2255, 0fBF6C835E; +sub.f32 f506, f2246, f2247; +mul.f32 f507, f2255, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2244, f475, 0f3E47C5C2; 
+mul.f32 f2245, f2254, 0fBF7B14BE; +sub.f32 f511, f2244, f2245; +mul.f32 f512, f2254, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2242, f453, 0fBE47C5C2; +mul.f32 f2243, f454, 0fBF7B14BE; +sub.f32 f516, f2242, f2243; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2241, f457, 0fBEC3EF15; +sub.f32 f521, f2241, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2240, f461, 0fBF0E39DA; +sub.f32 f526, f2240, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2238, f469, 0fBF54DB31; +mul.f32 f2239, f470, 0fBF0E39DA; +sub.f32 f535, f2238, f2239; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2237, f473, 0fBF6C835E; +sub.f32 f540, f2237, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2236, f477, 0fBF7B14BE; +sub.f32 f545, f2236, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2235, f2314, f2261; +sub.f32 f551, f2314, f2261; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2234, f2313, f483; +sub.f32 f555, f2313, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2233, f2312, f488; +sub.f32 f559, f2312, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2232, f2311, f493; +sub.f32 f563, f2311, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2231, f2310, f498; +sub.f32 f567, f2310, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f2230, f2309, f503; +sub.f32 f571, f2309, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, 
f506; +add.f32 f2229, f2308, f508; +sub.f32 f575, f2308, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f2228, f2307, f513; +sub.f32 f579, f2307, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f2227, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f2226, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f2225, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f2224, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f2223, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2222, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2221, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2220, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f2234; +mul.f32 f2219, f612, f552; +sub.f32 f618, f2219, f617; +mul.f32 f619, f612, f2234; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f2217, f612, f612; +mul.f32 f2218, f613, f613; +sub.f32 f623, f2217, f2218; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f2215, f623, f556; +mul.f32 f2216, f625, f2233; +sub.f32 f628, f2215, f2216; +mul.f32 f629, f623, f2233; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f2213, f612, f623; +mul.f32 f2214, f613, f625; +sub.f32 f633, f2213, f2214; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f2211, f633, f560; +mul.f32 f2212, f635, f2232; +sub.f32 f638, f2211, f2212; +mul.f32 f639, f633, f2232; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 
f2210, f612, f633; +sub.f32 f643, f2210, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f2231; +mul.f32 f2209, f643, f564; +sub.f32 f648, f2209, f647; +mul.f32 f649, f643, f2231; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f2208, f612, f643; +sub.f32 f653, f2208, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f2230; +mul.f32 f2207, f653, f568; +sub.f32 f658, f2207, f657; +mul.f32 f659, f653, f2230; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f2206, f612, f653; +sub.f32 f663, f2206, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f2204, f663, f572; +mul.f32 f2205, f665, f2229; +sub.f32 f668, f2204, f2205; +mul.f32 f669, f663, f2229; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f2202, f612, f663; +mul.f32 f2203, f613, f665; +sub.f32 f673, f2202, f2203; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f2200, f673, f576; +mul.f32 f2201, f675, f2228; +sub.f32 f678, f2200, f2201; +mul.f32 f679, f673, f2228; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f2198, f612, f673; +mul.f32 f2199, f613, f675; +sub.f32 f683, f2198, f2199; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f2227; +mul.f32 f2197, f683, f580; +sub.f32 f688, f2197, f687; +mul.f32 f689, f683, f2227; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f2196, f612, f683; +sub.f32 f693, f2196, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f2226; +mul.f32 f2195, f693, f584; +sub.f32 f698, f2195, f697; +mul.f32 f699, f693, f2226; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f2194, f612, f693; +sub.f32 f703, f2194, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f2225; +mul.f32 f2193, f703, f588; +sub.f32 f708, f2193, f707; +mul.f32 f709, f703, f2225; 
+fma.rn.f32 f710, f705, f588, f709; +mul.f32 f2191, f612, f703; +mul.f32 f2192, f613, f705; +sub.f32 f713, f2191, f2192; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f2189, f713, f592; +mul.f32 f2190, f715, f2224; +sub.f32 f718, f2189, f2190; +mul.f32 f719, f713, f2224; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f2187, f612, f713; +mul.f32 f2188, f613, f715; +sub.f32 f723, f2187, f2188; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f2185, f723, f596; +mul.f32 f2186, f725, f2223; +sub.f32 f728, f2185, f2186; +mul.f32 f729, f723, f2223; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f2184, f612, f723; +sub.f32 f733, f2184, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f2222; +mul.f32 f2183, f733, f600; +sub.f32 f738, f2183, f737; +mul.f32 f739, f733, f2222; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f2182, f612, f733; +sub.f32 f743, f2182, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f2221; +mul.f32 f2181, f743, f604; +sub.f32 f748, f2181, f747; +mul.f32 f749, f743, f2221; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f2180, f612, f743; +sub.f32 f753, f2180, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f2220; +mul.f32 f2179, f753, f608; +sub.f32 f758, f2179, f757; +mul.f32 f759, f753, f2220; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f2177, f612, f753; +mul.f32 f2178, f613, f755; +sub.f32 f763, f2177, f2178; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f2175, f763, f550; +mul.f32 f2176, f765, f551; +sub.f32 f768, f2175, f2176; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f2173, f612, f763; +mul.f32 f2174, f613, f765; +sub.f32 f773, f2173, f2174; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 
f2172, f773, f554; +sub.f32 f778, f2172, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f2171, f612, f773; +sub.f32 f783, f2171, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f2170, f783, f558; +sub.f32 f788, f2170, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f2169, f612, f783; +sub.f32 f793, f2169, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f2168, f793, f562; +sub.f32 f798, f2168, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f2167, f612, f793; +sub.f32 f803, f2167, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f2165, f803, f566; +mul.f32 f2166, f805, f567; +sub.f32 f808, f2165, f2166; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f2163, f612, f803; +mul.f32 f2164, f613, f805; +sub.f32 f813, f2163, f2164; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f2161, f813, f570; +mul.f32 f2162, f815, f571; +sub.f32 f818, f2161, f2162; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f2159, f612, f813; +mul.f32 f2160, f613, f815; +sub.f32 f823, f2159, f2160; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f2158, f823, f574; +sub.f32 f828, f2158, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f2157, f612, f823; +sub.f32 f833, f2157, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f2156, f833, f578; +sub.f32 f838, f2156, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f2155, f612, f833; +sub.f32 f843, f2155, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, 
f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f2154, f843, f582; +sub.f32 f848, f2154, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f2152, f612, f843; +mul.f32 f2153, f613, f845; +sub.f32 f853, f2152, f2153; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f2150, f853, f586; +mul.f32 f2151, f855, f587; +sub.f32 f858, f2150, f2151; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f2148, f612, f853; +mul.f32 f2149, f613, f855; +sub.f32 f863, f2148, f2149; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f2146, f863, f590; +mul.f32 f2147, f865, f591; +sub.f32 f868, f2146, f2147; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f2145, f612, f863; +sub.f32 f873, f2145, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f2144, f873, f594; +sub.f32 f878, f2144, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f2143, f612, f873; +sub.f32 f883, f2143, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f2142, f883, f598; +sub.f32 f888, f2142, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f2141, f612, f883; +sub.f32 f893, f2141, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f2139, f893, f602; +mul.f32 f2140, f895, f603; +sub.f32 f898, f2139, f2140; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f2137, f612, f893; +mul.f32 f2138, f613, f895; +sub.f32 f903, f2137, f2138; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f2135, f903, f606; +mul.f32 f2136, f905, f607; +sub.f32 f908, f2135, f2136; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f2133, f612, f903; +mul.f32 f2134, f613, f905; +sub.f32 
f913, f2133, f2134; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f2132, f913, f610; +sub.f32 f918, f2132, f917; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8064; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +and.b32 r23, r32, 63; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+256]; +ld.shared.f32 f923, [r13+512]; +ld.shared.f32 f924, [r13+768]; +ld.shared.f32 f925, [r13+1024]; +ld.shared.f32 f926, [r13+1280]; +ld.shared.f32 f927, [r13+1536]; +ld.shared.f32 f928, [r13+1792]; +ld.shared.f32 f929, [r13+2048]; +ld.shared.f32 f930, [r13+2304]; +ld.shared.f32 f931, [r13+2560]; +ld.shared.f32 f932, [r13+2816]; +ld.shared.f32 f933, [r13+3072]; +ld.shared.f32 f934, [r13+3328]; +ld.shared.f32 f935, [r13+3584]; +ld.shared.f32 f936, [r13+3840]; +ld.shared.f32 f937, [r13+4096]; +ld.shared.f32 f938, [r13+4352]; +ld.shared.f32 f939, [r13+4608]; +ld.shared.f32 f940, [r13+4864]; +ld.shared.f32 f941, [r13+5120]; +ld.shared.f32 f942, [r13+5376]; +ld.shared.f32 f943, [r13+5632]; +ld.shared.f32 f944, [r13+5888]; +ld.shared.f32 f945, [r13+6144]; +ld.shared.f32 f946, [r13+6400]; +ld.shared.f32 f947, [r13+6656]; +ld.shared.f32 f948, [r13+6912]; +ld.shared.f32 f949, [r13+7168]; +ld.shared.f32 f950, [r13+7424]; +ld.shared.f32 f951, [r13+7680]; +ld.shared.f32 f952, [r13+7936]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2235, f620, f630, 
f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+256]; +ld.shared.f32 f955, [r13+512]; +ld.shared.f32 f956, [r13+768]; +ld.shared.f32 f957, [r13+1024]; +ld.shared.f32 f958, [r13+1280]; +ld.shared.f32 f959, [r13+1536]; +ld.shared.f32 f960, [r13+1792]; +ld.shared.f32 f961, [r13+2048]; +ld.shared.f32 f962, [r13+2304]; +ld.shared.f32 f963, [r13+2560]; +ld.shared.f32 f964, [r13+2816]; +ld.shared.f32 f965, [r13+3072]; +ld.shared.f32 f966, [r13+3328]; +ld.shared.f32 f967, [r13+3584]; +ld.shared.f32 f968, [r13+3840]; +ld.shared.f32 f969, [r13+4096]; +ld.shared.f32 f970, [r13+4352]; +ld.shared.f32 f971, [r13+4608]; +ld.shared.f32 f972, [r13+4864]; +ld.shared.f32 f973, [r13+5120]; +ld.shared.f32 f974, [r13+5376]; +ld.shared.f32 f975, [r13+5632]; +ld.shared.f32 f976, [r13+5888]; +ld.shared.f32 f977, [r13+6144]; +ld.shared.f32 f978, [r13+6400]; +ld.shared.f32 f979, [r13+6656]; +ld.shared.f32 f980, [r13+6912]; +ld.shared.f32 f981, [r13+7168]; +ld.shared.f32 f982, [r13+7424]; +ld.shared.f32 f983, [r13+7680]; +ld.shared.f32 f984, [r13+7936]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2131, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2130, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2129, f2131, f2130; +sub.f32 f996, f2131, f2130; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f2128, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2127, f957, f973; +sub.f32 f1004, f957, 
f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2126, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2125, f2127, f2126; +sub.f32 f1012, f2127, f2126; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f2124, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f2124, 0fBF3504F3; +mul.f32 f2123, f1013, 0f3F3504F3; +sub.f32 f1019, f2123, f1018; +mul.f32 f1020, f2124, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2122, f2129, f2125; +sub.f32 f1029, f2129, f2125; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2121, f2128, f1021; +sub.f32 f1033, f2128, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f2120, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f2119, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2118, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2117, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2116, f2118, f2117; +sub.f32 f1053, f2118, f2117; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f2115, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2114, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2113, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2112, f2114, f2113; +sub.f32 f1069, f2114, f2113; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, 
f1065; +sub.f32 f2111, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f2111, 0fBF3504F3; +mul.f32 f2110, f1070, 0f3F3504F3; +sub.f32 f1076, f2110, f1075; +mul.f32 f1077, f2111, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2109, f2116, f2112; +sub.f32 f1086, f2116, f2112; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2108, f2115, f1078; +sub.f32 f1090, f2115, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f2107, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f2106, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2104, f1087, 0f3F6C835E; +mul.f32 f2105, f2108, 0fBEC3EF15; +sub.f32 f1101, f2104, f2105; +mul.f32 f1102, f2108, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f2102, f1091, 0f3F3504F3; +mul.f32 f2103, f2107, 0fBF3504F3; +sub.f32 f1106, f2102, f2103; +mul.f32 f1107, f2107, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f2106, 0fBF6C835E; +mul.f32 f2101, f1095, 0f3EC3EF15; +sub.f32 f1111, f2101, f1110; +mul.f32 f1112, f2106, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f2100, f1089, 0fBEC3EF15; +sub.f32 f1116, f2100, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f2098, f1097, 0fBF6C835E; +mul.f32 f2099, f1098, 0fBEC3EF15; +sub.f32 f1125, f2098, f2099; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2097, f2122, f2109; +sub.f32 f1131, f2122, 
f2109; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2096, f2121, f1103; +sub.f32 f1135, f2121, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2095, f2120, f1108; +sub.f32 f1139, f2120, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f2094, f2119, f1113; +sub.f32 f1143, f2119, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f2093, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f2092, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f2091, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2090, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2089, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2088, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2087, f2089, f2088; +sub.f32 f1171, f2089, f2088; +add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f2086, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2085, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2084, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2083, f2085, f2084; +sub.f32 f1187, f2085, f2084; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f2082, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f2082, 0fBF3504F3; +mul.f32 f2081, f1188, 0f3F3504F3; +sub.f32 f1194, f2081, f1193; +mul.f32 f1195, f2082, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 
f1198, f1191, 0fBF3504F3; +sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2080, f2087, f2083; +sub.f32 f1204, f2087, f2083; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2079, f2086, f1196; +sub.f32 f1208, f2086, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f2078, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f2077, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2076, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2075, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2074, f2076, f2075; +sub.f32 f1228, f2076, f2075; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f2073, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2072, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2071, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2070, f2072, f2071; +sub.f32 f1244, f2072, f2071; +add.f32 f1245, f1235, f1240; +sub.f32 f1247, f1235, f1240; +sub.f32 f2069, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f2069, 0fBF3504F3; +mul.f32 f2068, f1245, 0f3F3504F3; +sub.f32 f1251, f2068, f1250; +mul.f32 f1252, f2069, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2067, f2074, f2070; +sub.f32 f1261, f2074, f2070; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2066, 
f2073, f1253; +sub.f32 f1265, f2073, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f2065, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f2064, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2062, f1262, 0f3F6C835E; +mul.f32 f2063, f2066, 0fBEC3EF15; +sub.f32 f1276, f2062, f2063; +mul.f32 f1277, f2066, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f2060, f1266, 0f3F3504F3; +mul.f32 f2061, f2065, 0fBF3504F3; +sub.f32 f1281, f2060, f2061; +mul.f32 f1282, f2065, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f2058, f1270, 0f3EC3EF15; +mul.f32 f2059, f2064, 0fBF6C835E; +sub.f32 f1286, f2058, f2059; +mul.f32 f1287, f2064, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f2056, f1264, 0fBEC3EF15; +mul.f32 f2057, f1265, 0fBF6C835E; +sub.f32 f1291, f2056, f2057; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f2054, f1272, 0fBF6C835E; +mul.f32 f2055, f1273, 0fBEC3EF15; +sub.f32 f1300, f2054, f2055; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2053, f2080, f2067; +sub.f32 f1306, f2080, f2067; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2052, f2079, f1278; +sub.f32 f1310, f2079, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2051, f2078, f1283; +sub.f32 f1314, f2078, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f2050, f2077, f1288; +sub.f32 f1318, f2077, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f2049, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f2048, f1208, 
f1293; +sub.f32 f1326, f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f2047, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2046, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2052, 0fBE47C5C2; +mul.f32 f2045, f1307, 0f3F7B14BE; +sub.f32 f1337, f2045, f1336; +mul.f32 f1338, f2052, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f2051, 0fBEC3EF15; +mul.f32 f2044, f1311, 0f3F6C835E; +sub.f32 f1342, f2044, f1341; +mul.f32 f1343, f2051, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f2042, f1315, 0f3F54DB31; +mul.f32 f2043, f2050, 0fBF0E39DA; +sub.f32 f1347, f2042, f2043; +mul.f32 f1348, f2050, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f2040, f1319, 0f3F3504F3; +mul.f32 f2041, f2049, 0fBF3504F3; +sub.f32 f1352, f2040, f2041; +mul.f32 f1353, f2049, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f2038, f1323, 0f3F0E39DA; +mul.f32 f2039, f2048, 0fBF54DB31; +sub.f32 f1357, f2038, f2039; +mul.f32 f1358, f2048, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f2036, f1327, 0f3EC3EF15; +mul.f32 f2037, f2047, 0fBF6C835E; +sub.f32 f1362, f2036, f2037; +mul.f32 f1363, f2047, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f2046, 0fBF7B14BE; +mul.f32 f2035, f1331, 0f3E47C5C2; +sub.f32 f1367, f2035, f1366; +mul.f32 f1368, f2046, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f2034, f1309, 0fBE47C5C2; +sub.f32 f1372, f2034, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f2033, f1313, 0fBEC3EF15; +sub.f32 f1377, f2033, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f2032, f1317, 0fBF0E39DA; +sub.f32 f1382, f2032, f1381; 
+mul.f32 f1383, f1318, 0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f2031, f1325, 0fBF54DB31; +sub.f32 f1391, f2031, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f2030, f1329, 0fBF6C835E; +sub.f32 f1396, f2030, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f2029, f1333, 0fBF7B14BE; +sub.f32 f1401, f2029, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2028, f2097, f2053; +sub.f32 f1407, f2097, f2053; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2027, f2096, f1339; +sub.f32 f1411, f2096, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2026, f2095, f1344; +sub.f32 f1415, f2095, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2025, f2094, f1349; +sub.f32 f1419, f2094, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2024, f2093, f1354; +sub.f32 f1423, f2093, f1354; +add.f32 f1424, f1148, f1357; +sub.f32 f1426, f1148, f1357; +add.f32 f2023, f2092, f1359; +sub.f32 f1427, f2092, f1359; +add.f32 f1428, f1152, f1362; +sub.f32 f1430, f1152, f1362; +add.f32 f2022, f2091, f1364; +sub.f32 f1431, f2091, f1364; +add.f32 f1432, f1156, f1367; +sub.f32 f1434, f1156, f1367; +add.f32 f2021, f2090, f1369; +sub.f32 f1435, f2090, f1369; +add.f32 f1436, f1130, f1306; +sub.f32 f1438, f1130, f1306; +sub.f32 f2020, f1131, f1305; +add.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1372; +sub.f32 f1442, f1134, f1372; +add.f32 f2019, f1135, f1374; +sub.f32 f1443, f1135, f1374; +add.f32 f1444, f1138, f1377; +sub.f32 f1446, 
f1138, f1377; +add.f32 f2018, f1139, f1379; +sub.f32 f1447, f1139, f1379; +add.f32 f1448, f1142, f1382; +sub.f32 f1450, f1142, f1382; +add.f32 f2017, f1143, f1384; +sub.f32 f1451, f1143, f1384; +add.f32 f1452, f1146, f1387; +sub.f32 f1454, f1146, f1387; +add.f32 f2016, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2015, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2014, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2013, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1473, f1469, f2027; +mul.f32 f2012, f1468, f1408; +sub.f32 f1474, f2012, f1473; +mul.f32 f1475, f1468, f2027; +fma.rn.f32 f1476, f1469, f1408, f1475; +mul.f32 f1478, f1469, f1469; +mul.f32 f2011, f1468, f1468; +sub.f32 f1479, f2011, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1483, f1481, f2026; +mul.f32 f2010, f1479, f1412; +sub.f32 f1484, f2010, f1483; +mul.f32 f1485, f1479, f2026; +fma.rn.f32 f1486, f1481, f1412, f1485; +mul.f32 f2008, f1468, f1479; +mul.f32 f2009, f1469, f1481; +sub.f32 f1489, f2008, f2009; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f2006, f1489, f1416; +mul.f32 f2007, f1491, f2025; +sub.f32 f1494, f2006, f2007; +mul.f32 f1495, f1489, f2025; +fma.rn.f32 f1496, f1491, f1416, f1495; +mul.f32 f2004, f1468, f1489; +mul.f32 f2005, f1469, f1491; +sub.f32 f1499, f2004, f2005; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f2002, f1499, f1420; +mul.f32 f2003, f1501, f2024; +sub.f32 f1504, f2002, f2003; +mul.f32 f1505, f1499, f2024; +fma.rn.f32 f1506, f1501, f1420, f1505; +mul.f32 f1508, f1469, f1501; +mul.f32 f2001, f1468, f1499; +sub.f32 
f1509, f2001, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1513, f1511, f2023; +mul.f32 f2000, f1509, f1424; +sub.f32 f1514, f2000, f1513; +mul.f32 f1515, f1509, f2023; +fma.rn.f32 f1516, f1511, f1424, f1515; +mul.f32 f1518, f1469, f1511; +mul.f32 f1999, f1468, f1509; +sub.f32 f1519, f1999, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1523, f1521, f2022; +mul.f32 f1998, f1519, f1428; +sub.f32 f1524, f1998, f1523; +mul.f32 f1525, f1519, f2022; +fma.rn.f32 f1526, f1521, f1428, f1525; +mul.f32 f1528, f1469, f1521; +mul.f32 f1997, f1468, f1519; +sub.f32 f1529, f1997, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1995, f1529, f1432; +mul.f32 f1996, f1531, f2021; +sub.f32 f1534, f1995, f1996; +mul.f32 f1535, f1529, f2021; +fma.rn.f32 f1536, f1531, f1432, f1535; +mul.f32 f1993, f1468, f1529; +mul.f32 f1994, f1469, f1531; +sub.f32 f1539, f1993, f1994; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1991, f1539, f1436; +mul.f32 f1992, f1541, f2020; +sub.f32 f1544, f1991, f1992; +mul.f32 f1545, f1539, f2020; +fma.rn.f32 f1546, f1541, f1436, f1545; +mul.f32 f1989, f1468, f1539; +mul.f32 f1990, f1469, f1541; +sub.f32 f1549, f1989, f1990; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1553, f1551, f2019; +mul.f32 f1988, f1549, f1440; +sub.f32 f1554, f1988, f1553; +mul.f32 f1555, f1549, f2019; +fma.rn.f32 f1556, f1551, f1440, f1555; +mul.f32 f1558, f1469, f1551; +mul.f32 f1987, f1468, f1549; +sub.f32 f1559, f1987, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1563, f1561, f2018; +mul.f32 f1986, f1559, f1444; +sub.f32 f1564, f1986, f1563; +mul.f32 f1565, f1559, f2018; +fma.rn.f32 f1566, f1561, f1444, f1565; +mul.f32 f1568, f1469, f1561; +mul.f32 f1985, f1468, f1559; +sub.f32 f1569, f1985, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, 
f1559, f1570; +mul.f32 f1573, f1571, f2017; +mul.f32 f1984, f1569, f1448; +sub.f32 f1574, f1984, f1573; +mul.f32 f1575, f1569, f2017; +fma.rn.f32 f1576, f1571, f1448, f1575; +mul.f32 f1578, f1469, f1571; +mul.f32 f1983, f1468, f1569; +sub.f32 f1579, f1983, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1981, f1579, f1452; +mul.f32 f1982, f1581, f2016; +sub.f32 f1584, f1981, f1982; +mul.f32 f1585, f1579, f2016; +fma.rn.f32 f1586, f1581, f1452, f1585; +mul.f32 f1979, f1468, f1579; +mul.f32 f1980, f1469, f1581; +sub.f32 f1589, f1979, f1980; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f1977, f1589, f1456; +mul.f32 f1978, f1591, f2015; +sub.f32 f1594, f1977, f1978; +mul.f32 f1595, f1589, f2015; +fma.rn.f32 f1596, f1591, f1456, f1595; +mul.f32 f1598, f1469, f1591; +mul.f32 f1976, f1468, f1589; +sub.f32 f1599, f1976, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1603, f1601, f2014; +mul.f32 f1975, f1599, f1460; +sub.f32 f1604, f1975, f1603; +mul.f32 f1605, f1599, f2014; +fma.rn.f32 f1606, f1601, f1460, f1605; +mul.f32 f1608, f1469, f1601; +mul.f32 f1974, f1468, f1599; +sub.f32 f1609, f1974, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1613, f1611, f2013; +mul.f32 f1973, f1609, f1464; +sub.f32 f1614, f1973, f1613; +mul.f32 f1615, f1609, f2013; +fma.rn.f32 f1616, f1611, f1464, f1615; +mul.f32 f1618, f1469, f1611; +mul.f32 f1972, f1468, f1609; +sub.f32 f1619, f1972, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1623, f1621, f1407; +mul.f32 f1971, f1619, f1406; +sub.f32 f1624, f1971, f1623; +mul.f32 f1625, f1619, f1407; +fma.rn.f32 f1626, f1621, f1406, f1625; +mul.f32 f1969, f1468, f1619; +mul.f32 f1970, f1469, f1621; +sub.f32 f1629, f1969, f1970; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1967, f1629, f1410; +mul.f32 f1968, f1631, f1411; 
+sub.f32 f1634, f1967, f1968; +mul.f32 f1635, f1629, f1411; +fma.rn.f32 f1636, f1631, f1410, f1635; +mul.f32 f1965, f1468, f1629; +mul.f32 f1966, f1469, f1631; +sub.f32 f1639, f1965, f1966; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f1963, f1639, f1414; +mul.f32 f1964, f1641, f1415; +sub.f32 f1644, f1963, f1964; +mul.f32 f1645, f1639, f1415; +fma.rn.f32 f1646, f1641, f1414, f1645; +mul.f32 f1648, f1469, f1641; +mul.f32 f1962, f1468, f1639; +sub.f32 f1649, f1962, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1653, f1651, f1419; +mul.f32 f1961, f1649, f1418; +sub.f32 f1654, f1961, f1653; +mul.f32 f1655, f1649, f1419; +fma.rn.f32 f1656, f1651, f1418, f1655; +mul.f32 f1658, f1469, f1651; +mul.f32 f1960, f1468, f1649; +sub.f32 f1659, f1960, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1663, f1661, f1423; +mul.f32 f1959, f1659, f1422; +sub.f32 f1664, f1959, f1663; +mul.f32 f1665, f1659, f1423; +fma.rn.f32 f1666, f1661, f1422, f1665; +mul.f32 f1668, f1469, f1661; +mul.f32 f1958, f1468, f1659; +sub.f32 f1669, f1958, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1956, f1669, f1426; +mul.f32 f1957, f1671, f1427; +sub.f32 f1674, f1956, f1957; +mul.f32 f1675, f1669, f1427; +fma.rn.f32 f1676, f1671, f1426, f1675; +mul.f32 f1954, f1468, f1669; +mul.f32 f1955, f1469, f1671; +sub.f32 f1679, f1954, f1955; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1952, f1679, f1430; +mul.f32 f1953, f1681, f1431; +sub.f32 f1684, f1952, f1953; +mul.f32 f1685, f1679, f1431; +fma.rn.f32 f1686, f1681, f1430, f1685; +mul.f32 f1950, f1468, f1679; +mul.f32 f1951, f1469, f1681; +sub.f32 f1689, f1950, f1951; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1693, f1691, f1435; +mul.f32 f1949, f1689, f1434; +sub.f32 f1694, f1949, f1693; +mul.f32 f1695, f1689, f1435; +fma.rn.f32 f1696, 
f1691, f1434, f1695; +mul.f32 f1698, f1469, f1691; +mul.f32 f1948, f1468, f1689; +sub.f32 f1699, f1948, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1703, f1701, f1439; +mul.f32 f1947, f1699, f1438; +sub.f32 f1704, f1947, f1703; +mul.f32 f1705, f1699, f1439; +fma.rn.f32 f1706, f1701, f1438, f1705; +mul.f32 f1708, f1469, f1701; +mul.f32 f1946, f1468, f1699; +sub.f32 f1709, f1946, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1713, f1711, f1443; +mul.f32 f1945, f1709, f1442; +sub.f32 f1714, f1945, f1713; +mul.f32 f1715, f1709, f1443; +fma.rn.f32 f1716, f1711, f1442, f1715; +mul.f32 f1943, f1468, f1709; +mul.f32 f1944, f1469, f1711; +sub.f32 f1719, f1943, f1944; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f1941, f1719, f1446; +mul.f32 f1942, f1721, f1447; +sub.f32 f1724, f1941, f1942; +mul.f32 f1725, f1719, f1447; +fma.rn.f32 f1726, f1721, f1446, f1725; +mul.f32 f1939, f1468, f1719; +mul.f32 f1940, f1469, f1721; +sub.f32 f1729, f1939, f1940; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1937, f1729, f1450; +mul.f32 f1938, f1731, f1451; +sub.f32 f1734, f1937, f1938; +mul.f32 f1735, f1729, f1451; +fma.rn.f32 f1736, f1731, f1450, f1735; +mul.f32 f1738, f1469, f1731; +mul.f32 f1936, f1468, f1729; +sub.f32 f1739, f1936, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1743, f1741, f1455; +mul.f32 f1935, f1739, f1454; +sub.f32 f1744, f1935, f1743; +mul.f32 f1745, f1739, f1455; +fma.rn.f32 f1746, f1741, f1454, f1745; +mul.f32 f1748, f1469, f1741; +mul.f32 f1934, f1468, f1739; +sub.f32 f1749, f1934, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1753, f1751, f1459; +mul.f32 f1933, f1749, f1458; +sub.f32 f1754, f1933, f1753; +mul.f32 f1755, f1749, f1459; +fma.rn.f32 f1756, f1751, f1458, f1755; +mul.f32 f1758, f1469, f1751; +mul.f32 f1932, f1468, 
f1749; +sub.f32 f1759, f1932, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1930, f1759, f1462; +mul.f32 f1931, f1761, f1463; +sub.f32 f1764, f1930, f1931; +mul.f32 f1765, f1759, f1463; +fma.rn.f32 f1766, f1761, f1462, f1765; +mul.f32 f1928, f1468, f1759; +mul.f32 f1929, f1469, f1761; +sub.f32 f1769, f1928, f1929; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f1926, f1769, f1466; +mul.f32 f1927, f1771, f1467; +sub.f32 f1774, f1926, f1927; +mul.f32 f1775, f1769, f1467; +mov.u32 r33, %tid.x; +fma.rn.f32 f1776, f1771, f1466, f1775; +and.b32 r22, r33, 32; +shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 4096; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1474; +st.shared.f32 [r20+256], f1484; +st.shared.f32 [r20+384], f1494; +st.shared.f32 [r20+512], f1504; +st.shared.f32 [r20+640], f1514; +st.shared.f32 [r20+768], f1524; +st.shared.f32 [r20+896], f1534; +st.shared.f32 [r20+1024], f1544; +st.shared.f32 [r20+1152], f1554; +st.shared.f32 [r20+1280], f1564; +st.shared.f32 [r20+1408], f1574; +st.shared.f32 [r20+1536], f1584; +st.shared.f32 [r20+1664], f1594; +st.shared.f32 [r20+1792], f1604; +st.shared.f32 [r20+1920], f1614; +st.shared.f32 [r20+2048], f1624; +st.shared.f32 [r20+2176], f1634; +st.shared.f32 [r20+2304], f1644; +st.shared.f32 [r20+2432], f1654; +st.shared.f32 [r20+2560], f1664; +st.shared.f32 [r20+2688], f1674; +st.shared.f32 [r20+2816], f1684; +st.shared.f32 [r20+2944], f1694; +st.shared.f32 [r20+3072], f1704; +st.shared.f32 [r20+3200], f1714; +st.shared.f32 [r20+3328], f1724; +st.shared.f32 [r20+3456], f1734; +st.shared.f32 [r20+3584], f1744; +st.shared.f32 [r20+3712], f1754; +st.shared.f32 [r20+3840], f1764; +st.shared.f32 [r20+3968], f1774; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 
f1777, [r21]; +ld.shared.f32 f1778, [r21+256]; +ld.shared.f32 f1779, [r21+512]; +ld.shared.f32 f1780, [r21+768]; +ld.shared.f32 f1781, [r21+1024]; +ld.shared.f32 f1782, [r21+1280]; +ld.shared.f32 f1783, [r21+1536]; +ld.shared.f32 f1784, [r21+1792]; +ld.shared.f32 f1785, [r21+2048]; +ld.shared.f32 f1786, [r21+2304]; +ld.shared.f32 f1787, [r21+2560]; +ld.shared.f32 f1788, [r21+2816]; +ld.shared.f32 f1789, [r21+3072]; +ld.shared.f32 f1790, [r21+3328]; +ld.shared.f32 f1791, [r21+3584]; +ld.shared.f32 f1792, [r21+3840]; +ld.shared.f32 f1793, [r21+4096]; +ld.shared.f32 f1794, [r21+4352]; +ld.shared.f32 f1795, [r21+4608]; +ld.shared.f32 f1796, [r21+4864]; +ld.shared.f32 f1797, [r21+5120]; +ld.shared.f32 f1798, [r21+5376]; +ld.shared.f32 f1799, [r21+5632]; +ld.shared.f32 f1800, [r21+5888]; +ld.shared.f32 f1801, [r21+6144]; +ld.shared.f32 f1802, [r21+6400]; +ld.shared.f32 f1803, [r21+6656]; +ld.shared.f32 f1804, [r21+6912]; +ld.shared.f32 f1805, [r21+7168]; +ld.shared.f32 f1806, [r21+7424]; +ld.shared.f32 f1807, [r21+7680]; +ld.shared.f32 f1808, [r21+7936]; +barrier.sync 0; +st.shared.f32 [r20], f2028; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; 
+st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+256]; +ld.shared.f32 f1811, [r21+512]; +ld.shared.f32 f1812, [r21+768]; +ld.shared.f32 f1813, [r21+1024]; +ld.shared.f32 f1814, [r21+1280]; +ld.shared.f32 f1815, [r21+1536]; +ld.shared.f32 f1816, [r21+1792]; +ld.shared.f32 f1817, [r21+2048]; +ld.shared.f32 f1818, [r21+2304]; +ld.shared.f32 f1819, [r21+2560]; +ld.shared.f32 f1820, [r21+2816]; +ld.shared.f32 f1821, [r21+3072]; +ld.shared.f32 f1822, [r21+3328]; +ld.shared.f32 f1823, [r21+3584]; +ld.shared.f32 f1824, [r21+3840]; +ld.shared.f32 f1825, [r21+4096]; +ld.shared.f32 f1826, [r21+4352]; +ld.shared.f32 f1827, [r21+4608]; +ld.shared.f32 f1828, [r21+4864]; +ld.shared.f32 f1829, [r21+5120]; +ld.shared.f32 f1830, [r21+5376]; +ld.shared.f32 f1831, [r21+5632]; +ld.shared.f32 f1832, [r21+5888]; +ld.shared.f32 f1833, [r21+6144]; +ld.shared.f32 f1834, [r21+6400]; +ld.shared.f32 f1835, [r21+6656]; +ld.shared.f32 f1836, [r21+6912]; +ld.shared.f32 f1837, [r21+7168]; +ld.shared.f32 f1838, [r21+7424]; +ld.shared.f32 f1839, [r21+7680]; +ld.shared.f32 f1840, [r21+7936]; +add.f32 %1, f1809, f1825; +add.f32 %0, f1777, f1793; +add.f32 %3, f1810, f1826; +add.f32 %2, f1778, f1794; +add.f32 %5, f1811, f1827; +add.f32 %4, f1779, f1795; +add.f32 %7, f1812, f1828; +add.f32 %6, f1780, f1796; +add.f32 %8, f1781, f1797; +add.f32 %9, f1813, f1829; +add.f32 %10, f1782, f1798; +add.f32 %11, f1814, f1830; +add.f32 %13, f1815, f1831; +add.f32 %12, f1783, f1799; +add.f32 %15, f1816, f1832; +add.f32 %14, f1784, f1800; +add.f32 %17, f1817, f1833; +add.f32 %16, f1785, f1801; +add.f32 %19, f1818, f1834; +add.f32 %18, f1786, f1802; +add.f32 %20, f1787, f1803; +add.f32 %21, f1819, f1835; +add.f32 %22, f1788, f1804; +add.f32 %23, f1820, f1836; +add.f32 %24, f1789, f1805; +add.f32 %25, f1821, f1837; +add.f32 %27, 
f1822, f1838; +add.f32 %26, f1790, f1806; +add.f32 %29, f1823, f1839; +add.f32 %28, f1791, f1807; +add.f32 %31, f1824, f1840; +add.f32 %30, f1792, f1808; +sub.f32 %32, f1777, f1793; +sub.f32 %33, f1809, f1825; +sub.f32 %34, f1778, f1794; +sub.f32 %35, f1810, f1826; +sub.f32 %36, f1779, f1795; +sub.f32 %37, f1811, f1827; +sub.f32 %38, f1780, f1796; +sub.f32 %39, f1812, f1828; +sub.f32 %40, f1781, f1797; +sub.f32 %41, f1813, f1829; +sub.f32 %42, f1782, f1798; +sub.f32 %43, f1814, f1830; +sub.f32 %44, f1783, f1799; +sub.f32 %45, f1815, f1831; +sub.f32 %46, f1784, f1800; +sub.f32 %47, f1816, f1832; +sub.f32 %48, f1785, f1801; +sub.f32 %49, f1817, f1833; +sub.f32 %50, f1786, f1802; +sub.f32 %51, f1818, f1834; +sub.f32 %52, f1787, f1803; +sub.f32 %53, f1819, f1835; +sub.f32 %54, f1788, f1804; +sub.f32 %55, f1820, f1836; +sub.f32 %56, f1789, f1805; +sub.f32 %57, f1821, f1837; +sub.f32 %58, f1790, f1806; +sub.f32 %59, f1822, f1838; +sub.f32 %60, f1791, f1807; +sub.f32 %61, f1823, f1839; +sub.f32 %62, f1792, f1808; +sub.f32 %63, f1824, f1840; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), 
"=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_2048), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<97, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<330>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; 
+add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 16352; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+4096]; +ld.shared.v2.f32 {f70, f71}, [r13+8192]; +ld.shared.v2.f32 {f74, f75}, [r13+12288]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; 
+mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 16256; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+4096]; +ld.shared.v2.f32 {f131, f132}, [r20+8192]; +ld.shared.v2.f32 {f135, f136}, [r20+12288]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 496; +bfe.u32 r22, r5, 4, 5; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, 
f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 15872; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; +st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+4096]; +ld.shared.v2.f32 {f192, f193}, [r27+8192]; +ld.shared.v2.f32 {f196, f197}, [r27+12288]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +add.f32 f210, f202, f207; +sub.f32 f211, f203, f206; +sub.f32 f212, f202, f207; +add.f32 f213, f203, f206; +and.b32 r28, r5, 448; +bfe.u32 r29, r5, 6, 3; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f214, f210; +mul.f32 f219, f215, f211; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f223, f208; +mul.f32 f227, f225, f209; +mul.f32 f228, f223, f209; 
+mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f231, f212; +mul.f32 f235, f233, f213; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 14336; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f215, f210, f220; +sub.f32 f240, f218, f219; +st.shared.v2.f32 [r33+512], {f240, f239}; +fma.rn.f32 f241, f225, f208, f228; +sub.f32 f242, f226, f227; +st.shared.v2.f32 [r33+1024], {f242, f241}; +sub.f32 f243, f234, f235; +fma.rn.f32 f244, f233, f212, f236; +st.shared.v2.f32 [r33+1536], {f243, f244}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+4096]; +ld.shared.v2.f32 {f253, f254}, [r34+8192]; +ld.shared.v2.f32 {f257, f258}, [r34+12288]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; +sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +sub.f32 f269, f261, f265; +sub.f32 f270, f262, f266; +add.f32 f271, f263, f268; +sub.f32 f272, f264, f267; +sub.f32 f273, f263, f268; +add.f32 f274, f264, f267; +and.b32 r35, r5, 256; +bfe.u32 r36, r5, 8, 1; +mul.wide.u32 rd15, r36, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f275, f276}, [rd17]; +mul.f32 f279, f275, f271; +mul.f32 f280, f276, f272; +mul.f32 f281, f275, f272; +mul.f32 f282, f275, f275; +mul.f32 f283, f276, f276; +sub.f32 f284, f282, f283; +mul.f32 f285, f276, f275; +fma.rn.f32 f286, f276, f275, f285; +mul.f32 f287, f284, f269; +mul.f32 f288, f286, f270; +mul.f32 f289, f284, f270; +mul.f32 f290, f275, f284; +mul.f32 f291, f276, f286; +sub.f32 f292, f290, f291; +mul.f32 f293, f275, f286; +fma.rn.f32 f294, f276, f284, f293; +mul.f32 f295, f292, f273; +mul.f32 
f296, f294, f274; +mul.f32 f297, f292, f274; +and.b32 r37, r10, 2040; +add.s32 r38, r9, r37; +barrier.sync 0; +and.b32 r39, r7, 8192; +add.s32 r40, r38, r39; +add.f32 f298, f262, f266; +add.f32 f299, f261, f265; +st.shared.v2.f32 [r40], {f299, f298}; +fma.rn.f32 f300, f276, f271, f281; +sub.f32 f301, f279, f280; +st.shared.v2.f32 [r40+2048], {f301, f300}; +fma.rn.f32 f302, f286, f269, f289; +sub.f32 f303, f287, f288; +st.shared.v2.f32 [r40+4096], {f303, f302}; +sub.f32 f304, f295, f296; +fma.rn.f32 f305, f294, f273, f297; +st.shared.v2.f32 [r40+6144], {f304, f305}; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.v2.f32 {f306, f307}, [r41]; +ld.shared.v2.f32 {f310, f311}, [r41+4096]; +ld.shared.v2.f32 {f314, f315}, [r41+8192]; +ld.shared.v2.f32 {f318, f319}, [r41+12288]; +add.f32 %1, f307, f315; +add.f32 %0, f306, f314; +add.f32 %3, f311, f319; +add.f32 %2, f310, f318; +sub.f32 %5, f307, f315; +sub.f32 %4, f306, f314; +sub.f32 %7, f311, f319; +sub.f32 %6, f310, f318; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_2048), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<98, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<290>; +.reg .b32 r<43>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 
f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8176; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+2048]; +ld.shared.f32 f64, [r13+4096]; +ld.shared.f32 f65, [r13+6144]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+2048]; +ld.shared.f32 f68, [r13+4096]; +ld.shared.f32 f69, [r13+6144]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, 
f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 8128; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+2048]; +ld.shared.f32 f117, [r21+4096]; +ld.shared.f32 f118, [r21+6144]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+2048]; +ld.shared.f32 f121, [r21+4096]; +ld.shared.f32 f122, [r21+6144]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 496; +bfe.u32 r23, r5, 4, 5; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, 
f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 7936; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+2048]; +ld.shared.f32 f170, [r28+4096]; +ld.shared.f32 f171, [r28+6144]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+2048]; +ld.shared.f32 f174, [r28+4096]; +ld.shared.f32 f175, [r28+6144]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +add.f32 f188, f178, f183; +sub.f32 f189, f179, f182; +sub.f32 f190, f178, f183; +add.f32 f191, f179, f182; +and.b32 r29, r5, 448; +bfe.u32 r30, r5, 6, 3; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; 
+mul.f32 f196, f192, f188; +mul.f32 f197, f193, f189; +sub.f32 f198, f196, f197; +mul.f32 f199, f192, f189; +fma.rn.f32 f200, f193, f188, f199; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f203, f186; +mul.f32 f207, f205, f187; +sub.f32 f208, f206, f207; +mul.f32 f209, f203, f187; +fma.rn.f32 f210, f205, f186, f209; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f213, f190; +mul.f32 f217, f215, f191; +sub.f32 f218, f216, f217; +mul.f32 f219, f213, f191; +fma.rn.f32 f220, f215, f190, f219; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 7168; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f198; +st.shared.f32 [r34+512], f208; +st.shared.f32 [r34+768], f218; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+2048]; +ld.shared.f32 f223, [r35+4096]; +ld.shared.f32 f224, [r35+6144]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+2048]; +ld.shared.f32 f227, [r35+4096]; +ld.shared.f32 f228, [r35+6144]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 f237, f229, f233; +add.f32 f238, f230, f234; +sub.f32 f239, f229, f233; +sub.f32 f240, f230, f234; +add.f32 f241, f231, f236; +sub.f32 f242, f232, f235; +sub.f32 f243, f231, f236; +add.f32 f244, f232, f235; +and.b32 r36, r5, 256; +bfe.u32 r37, r5, 8, 1; +mul.wide.u32 rd15, r37, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f245, 
f246}, [rd17]; +mul.f32 f249, f245, f241; +mul.f32 f250, f246, f242; +sub.f32 f251, f249, f250; +mul.f32 f252, f245, f242; +fma.rn.f32 f253, f246, f241, f252; +mul.f32 f254, f245, f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f256, f239; +mul.f32 f260, f258, f240; +sub.f32 f261, f259, f260; +mul.f32 f262, f256, f240; +fma.rn.f32 f263, f258, f239, f262; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f266, f243; +mul.f32 f270, f268, f244; +sub.f32 f271, f269, f270; +mul.f32 f272, f266, f244; +fma.rn.f32 f273, f268, f243, f272; +and.b32 r38, r16, 1020; +add.s32 r39, r10, r38; +barrier.sync 0; +and.b32 r40, r8, 4096; +add.s32 r41, r39, r40; +st.shared.f32 [r41], f237; +st.shared.f32 [r41+1024], f251; +st.shared.f32 [r41+2048], f261; +st.shared.f32 [r41+3072], f271; +barrier.sync 0; +mad.lo.s32 r42, r36, -12, r41; +ld.shared.f32 f274, [r42]; +ld.shared.f32 f275, [r42+2048]; +ld.shared.f32 f276, [r42+4096]; +ld.shared.f32 f277, [r42+6144]; +barrier.sync 0; +st.shared.f32 [r41], f238; +st.shared.f32 [r41+1024], f253; +st.shared.f32 [r41+2048], f263; +st.shared.f32 [r41+3072], f273; +barrier.sync 0; +ld.shared.f32 f278, [r42]; +ld.shared.f32 f279, [r42+2048]; +ld.shared.f32 f280, [r42+4096]; +ld.shared.f32 f281, [r42+6144]; +add.f32 %0, f274, f276; +add.f32 %1, f278, f280; +add.f32 %2, f275, f277; +add.f32 %3, f279, f281; +sub.f32 %4, f274, f276; +sub.f32 %5, f278, f280; +sub.f32 %6, f275, f277; +sub.f32 %7, f279, f281; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_2048), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), 
"f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<99, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<223>; +.reg .b32 r<77>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %15, %17; +sub.f32 f10, %16, %18; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 16368; +add.s32 r11, r8, r10; +add.f32 f18, %16, %18; +add.f32 f19, %15, %17; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 8184; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+8192]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16352; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 8176; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+8192]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; 
+mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16320; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 8160; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+8192]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 1016; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16256; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 8128; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+8192]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 6; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 16128; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 8064; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+8192]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 8; +mov.u64 
rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f116, f114; +mul.f32 f121, f117, f115; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15872; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f117, f114, f122; +sub.f32 f126, f120, f121; +st.shared.v2.f32 [r46+256], {f126, f125}; +barrier.sync 0; +and.b32 r47, r9, 7936; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+8192]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f137, f135; +mul.f32 f142, f138, f136; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 15360; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f138, f135, f143; +sub.f32 f147, f141, f142; +st.shared.v2.f32 [r53+512], {f147, f146}; +barrier.sync 0; +and.b32 r54, r9, 7680; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+8192]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f158, f156; +mul.f32 f163, f159, f157; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 14336; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f159, f156, f164; +sub.f32 f168, f162, f163; +st.shared.v2.f32 [r60+1024], {f168, f167}; +barrier.sync 0; +and.b32 r61, r9, 7168; +sub.s32 r62, r60, 
r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, [r62+8192]; +sub.f32 f177, f169, f173; +sub.f32 f178, f170, f174; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f179, f180}, [rd29]; +mul.f32 f183, f179, f177; +mul.f32 f184, f180, f178; +mul.f32 f185, f179, f178; +and.b32 r64, r9, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 12288; +add.s32 r67, r65, r66; +add.f32 f186, f170, f174; +add.f32 f187, f169, f173; +st.shared.v2.f32 [r67], {f187, f186}; +fma.rn.f32 f188, f180, f177, f185; +sub.f32 f189, f183, f184; +st.shared.v2.f32 [r67+2048], {f189, f188}; +barrier.sync 0; +and.b32 r68, r9, 6144; +sub.s32 r69, r67, r68; +ld.shared.v2.f32 {f190, f191}, [r69]; +ld.shared.v2.f32 {f194, f195}, [r69+8192]; +sub.f32 f198, f190, f194; +sub.f32 f199, f191, f195; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 8; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f32 {f200, f201}, [rd32]; +mul.f32 f204, f200, f198; +mul.f32 f205, f201, f199; +mul.f32 f206, f200, f199; +and.b32 r71, r9, 4088; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 8192; +add.s32 r74, r72, r73; +add.f32 f207, f191, f195; +add.f32 f208, f190, f194; +st.shared.v2.f32 [r74], {f208, f207}; +fma.rn.f32 f209, f201, f198, f206; +sub.f32 f210, f204, f205; +st.shared.v2.f32 [r74+4096], {f210, f209}; +barrier.sync 0; +and.b32 r75, r9, 4096; +sub.s32 r76, r74, r75; +ld.shared.v2.f32 {f211, f212}, [r76]; +ld.shared.v2.f32 {f215, f216}, [r76+8192]; +add.f32 %1, f212, f216; +add.f32 %0, f211, f215; +sub.f32 %3, f212, f216; +sub.f32 %2, f211, f215; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_2048), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; 
+ + + + +template<> __forceinline__ __device__ void cufftdx_private_function<100, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<183>; +.reg .b32 r<77>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %15, %17; +add.f32 f10, %16, %18; +sub.f32 f11, %15, %17; +sub.f32 f12, %16, %18; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8184; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 4092; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+4096]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+4096]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8176; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 4088; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+4096]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+4096]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; 
+bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8160; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 4080; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+4096]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+4096]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 1016; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8128; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 4064; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+4096]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+4096]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 6; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 8064; +add.s32 r39, r37, r38; +st.shared.f32 [r39], 
f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 4032; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+4096]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+4096]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f98, f96; +mul.f32 f103, f99, f97; +sub.f32 f104, f102, f103; +mul.f32 f105, f98, f97; +fma.rn.f32 f106, f99, f96, f105; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7936; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f104; +barrier.sync 0; +and.b32 r47, r11, 3968; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+4096]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+4096]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f115, f113; +mul.f32 f120, f116, f114; +sub.f32 f121, f119, f120; +mul.f32 f122, f115, f114; +fma.rn.f32 f123, f116, f113, f122; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7680; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f121; +barrier.sync 0; +and.b32 r54, r11, 3840; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+4096]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+4096]; +add.f32 f128, f124, 
f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f132, f133}, [rd26]; +mul.f32 f136, f132, f130; +mul.f32 f137, f133, f131; +sub.f32 f138, f136, f137; +mul.f32 f139, f132, f131; +fma.rn.f32 f140, f133, f130, f139; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 7168; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 [r60+512], f138; +barrier.sync 0; +and.b32 r61, r11, 3584; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+4096]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, [r62+4096]; +add.f32 f145, f141, f142; +add.f32 f146, f143, f144; +sub.f32 f147, f141, f142; +sub.f32 f148, f143, f144; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f149, f150}, [rd29]; +mul.f32 f153, f149, f147; +mul.f32 f154, f150, f148; +sub.f32 f155, f153, f154; +mul.f32 f156, f149, f148; +fma.rn.f32 f157, f150, f147, f156; +and.b32 r64, r11, 1020; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 6144; +add.s32 r67, r65, r66; +st.shared.f32 [r67], f145; +st.shared.f32 [r67+1024], f155; +barrier.sync 0; +and.b32 r68, r11, 3072; +sub.s32 r69, r67, r68; +ld.shared.f32 f158, [r69]; +ld.shared.f32 f159, [r69+4096]; +barrier.sync 0; +st.shared.f32 [r67], f146; +st.shared.f32 [r67+1024], f157; +barrier.sync 0; +ld.shared.f32 f160, [r69]; +ld.shared.f32 f161, [r69+4096]; +add.f32 f162, f158, f159; +add.f32 f163, f160, f161; +sub.f32 f164, f158, f159; +sub.f32 f165, f160, f161; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 8; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f32 {f166, f167}, [rd32]; +mul.f32 f170, f166, f164; +mul.f32 f171, f167, f165; +sub.f32 f172, f170, f171; 
+mul.f32 f173, f166, f165; +fma.rn.f32 f174, f167, f164, f173; +and.b32 r71, r11, 2044; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 4096; +add.s32 r74, r72, r73; +st.shared.f32 [r74], f162; +st.shared.f32 [r74+2048], f172; +barrier.sync 0; +and.b32 r75, r11, 2048; +sub.s32 r76, r74, r75; +ld.shared.f32 f175, [r76]; +ld.shared.f32 f176, [r76+4096]; +barrier.sync 0; +st.shared.f32 [r74], f163; +st.shared.f32 [r74+2048], f174; +barrier.sync 0; +ld.shared.f32 f177, [r76]; +ld.shared.f32 f178, [r76+4096]; +add.f32 %0, f175, f176; +add.f32 %1, f177, f178; +sub.f32 %2, f175, f176; +sub.f32 %3, f177, f178; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_2048), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..a66a5c79743e0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp32_inv.hpp.inc @@ -0,0 +1,7890 @@ +#ifndef CUFFTDX_FFT_2048_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_2048_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<293, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<891>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 
f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 
f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; 
+sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; 
+fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 
f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8128; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+512]; +ld.shared.f32 f391, [r13+1024]; +ld.shared.f32 f392, [r13+1536]; +ld.shared.f32 f393, [r13+2048]; +ld.shared.f32 f394, [r13+2560]; +ld.shared.f32 f395, [r13+3072]; +ld.shared.f32 f396, [r13+3584]; +ld.shared.f32 f397, [r13+4096]; +ld.shared.f32 f398, [r13+4608]; +ld.shared.f32 f399, [r13+5120]; +ld.shared.f32 f400, [r13+5632]; +ld.shared.f32 f401, [r13+6144]; +ld.shared.f32 f402, [r13+6656]; +ld.shared.f32 f403, [r13+7168]; +ld.shared.f32 f404, [r13+7680]; +barrier.sync 0; 
+st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+512]; +ld.shared.f32 f407, [r13+1024]; +ld.shared.f32 f408, [r13+1536]; +ld.shared.f32 f409, [r13+2048]; +ld.shared.f32 f410, [r13+2560]; +ld.shared.f32 f411, [r13+3072]; +ld.shared.f32 f412, [r13+3584]; +ld.shared.f32 f413, [r13+4096]; +ld.shared.f32 f414, [r13+4608]; +ld.shared.f32 f415, [r13+5120]; +ld.shared.f32 f416, [r13+5632]; +ld.shared.f32 f417, [r13+6144]; +ld.shared.f32 f418, [r13+6656]; +ld.shared.f32 f419, [r13+7168]; +ld.shared.f32 f420, [r13+7680]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, 
f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 
f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; +sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; +sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 112; +bfe.u32 r15, r5, 4, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; 
+mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f593, f659; +fma.rn.f32 f661, f657, f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 
f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; +fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f591, f729; +fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, 
f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 7168; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+512]; +ld.shared.f32 f747, [r21+1024]; +ld.shared.f32 f748, [r21+1536]; +ld.shared.f32 f749, [r21+2048]; +ld.shared.f32 f750, [r21+2560]; +ld.shared.f32 f751, [r21+3072]; +ld.shared.f32 f752, [r21+3584]; +ld.shared.f32 f753, [r21+4096]; +ld.shared.f32 f754, [r21+4608]; +ld.shared.f32 f755, [r21+5120]; +ld.shared.f32 f756, [r21+5632]; +ld.shared.f32 f757, [r21+6144]; +ld.shared.f32 f758, [r21+6656]; +ld.shared.f32 f759, [r21+7168]; +ld.shared.f32 f760, [r21+7680]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+512]; +ld.shared.f32 f763, [r21+1024]; +ld.shared.f32 f764, [r21+1536]; +ld.shared.f32 f765, 
[r21+2048]; +ld.shared.f32 f766, [r21+2560]; +ld.shared.f32 f767, [r21+3072]; +ld.shared.f32 f768, [r21+3584]; +ld.shared.f32 f769, [r21+4096]; +ld.shared.f32 f770, [r21+4608]; +ld.shared.f32 f771, [r21+5120]; +ld.shared.f32 f772, [r21+5632]; +ld.shared.f32 f773, [r21+6144]; +ld.shared.f32 f774, [r21+6656]; +ld.shared.f32 f775, [r21+7168]; +ld.shared.f32 f776, [r21+7680]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +sub.f32 f789, f779, f784; +add.f32 f790, f780, f783; +add.f32 f791, f779, f784; +sub.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +sub.f32 f805, f795, f800; +add.f32 f806, f796, f799; +add.f32 f807, f795, f800; +sub.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0f3F3504F3; +sub.f32 f811, f809, f810; +add.f32 f812, f809, f810; +mul.f32 f813, f807, 0fBF3504F3; +mul.f32 f814, f808, 0f3F3504F3; +sub.f32 f815, f813, f814; +mul.f32 f816, f808, 0fBF3504F3; +fma.rn.f32 f817, f807, 0f3F3504F3, f816; +add.f32 f818, f746, f754; +add.f32 f819, f762, f770; +sub.f32 f820, f746, f754; +sub.f32 f821, f762, f770; +add.f32 f822, f750, f758; +add.f32 f823, f766, f774; +sub.f32 f824, f750, f758; +sub.f32 f825, f766, f774; +add.f32 f826, f818, f822; +add.f32 f827, f819, f823; +sub.f32 f828, f818, f822; +sub.f32 f829, f819, f823; +sub.f32 f830, f820, f825; +add.f32 f831, f821, f824; +add.f32 f832, f820, f825; +sub.f32 f833, f821, f824; +add.f32 f834, f748, f756; +add.f32 
f835, f764, f772; +sub.f32 f836, f748, f756; +sub.f32 f837, f764, f772; +add.f32 f838, f752, f760; +add.f32 f839, f768, f776; +sub.f32 f840, f752, f760; +sub.f32 f841, f768, f776; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +sub.f32 f846, f836, f841; +add.f32 f847, f837, f840; +add.f32 f848, f836, f841; +sub.f32 f849, f837, f840; +mul.f32 f850, f846, 0f3F3504F3; +mul.f32 f851, f847, 0f3F3504F3; +sub.f32 f852, f850, f851; +add.f32 f853, f850, f851; +mul.f32 f854, f848, 0fBF3504F3; +mul.f32 f855, f849, 0f3F3504F3; +sub.f32 f856, f854, f855; +mul.f32 f857, f849, 0fBF3504F3; +fma.rn.f32 f858, f848, 0f3F3504F3, f857; +add.f32 %0, f785, f801; +add.f32 %1, f786, f802; +add.f32 %2, f826, f842; +add.f32 %3, f827, f843; +add.f32 %5, f790, f812; +add.f32 %4, f789, f811; +add.f32 %7, f831, f853; +add.f32 %6, f830, f852; +add.f32 %9, f788, f803; +sub.f32 %8, f787, f804; +add.f32 %11, f829, f844; +sub.f32 %10, f828, f845; +add.f32 %13, f792, f817; +add.f32 %12, f791, f815; +add.f32 %15, f833, f858; +add.f32 %14, f832, f856; +sub.f32 %16, f785, f801; +sub.f32 %17, f786, f802; +sub.f32 %18, f826, f842; +sub.f32 %19, f827, f843; +sub.f32 %21, f790, f812; +sub.f32 %20, f789, f811; +sub.f32 %23, f831, f853; +sub.f32 %22, f830, f852; +sub.f32 %25, f788, f803; +add.f32 %24, f787, f804; +sub.f32 %27, f829, f844; +add.f32 %26, f828, f845; +sub.f32 %29, f792, f817; +sub.f32 %28, f791, f815; +sub.f32 %31, f833, f858; +sub.f32 %30, f832, f856; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), 
"=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_2048), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<295, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<491>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 
f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; 
+mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8160; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+1024]; +ld.shared.f32 f161, [r13+2048]; +ld.shared.f32 f162, [r13+3072]; +ld.shared.f32 f163, [r13+4096]; +ld.shared.f32 f164, [r13+5120]; +ld.shared.f32 f165, [r13+6144]; +ld.shared.f32 f166, [r13+7168]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+1024]; +ld.shared.f32 f169, [r13+2048]; +ld.shared.f32 f170, [r13+3072]; +ld.shared.f32 f171, [r13+4096]; +ld.shared.f32 f172, [r13+5120]; +ld.shared.f32 f173, [r13+6144]; +ld.shared.f32 f174, [r13+7168]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, 
f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 248; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, 
f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 7936; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+1024]; +ld.shared.f32 f303, [r20+2048]; +ld.shared.f32 f304, [r20+3072]; +ld.shared.f32 f305, [r20+4096]; +ld.shared.f32 f306, [r20+5120]; +ld.shared.f32 f307, [r20+6144]; +ld.shared.f32 f308, [r20+7168]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 
[r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+1024]; +ld.shared.f32 f311, [r20+2048]; +ld.shared.f32 f312, [r20+3072]; +ld.shared.f32 f313, [r20+4096]; +ld.shared.f32 f314, [r20+5120]; +ld.shared.f32 f315, [r20+6144]; +ld.shared.f32 f316, [r20+7168]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +sub.f32 f329, f319, f324; +add.f32 f330, f320, f323; +add.f32 f331, f319, f324; +sub.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +sub.f32 f345, f335, f340; +add.f32 f346, f336, f339; +add.f32 f347, f335, f340; +sub.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0f3F3504F3; +sub.f32 f351, f349, f350; +add.f32 f352, f349, f350; +mul.f32 f353, f347, 0fBF3504F3; +mul.f32 f354, f348, 0f3F3504F3; +sub.f32 f355, f353, f354; +mul.f32 f356, f348, 0fBF3504F3; +fma.rn.f32 f357, f347, 0f3F3504F3, f356; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f352; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f352; +sub.f32 f366, f327, f344; +add.f32 f367, f328, f343; +add.f32 f368, f327, f344; +sub.f32 f369, f328, f343; +add.f32 f370, f331, f355; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f355; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 192; +bfe.u32 r22, r5, 6, 2; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; 
+ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f363, f375; +fma.rn.f32 f379, f374, f362, f378; +mul.f32 f380, f362, f375; +mul.f32 f381, f374, f363; +sub.f32 f382, f381, f380; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f367, f387; +fma.rn.f32 f389, f385, f366, f388; +mul.f32 f390, f366, f387; +mul.f32 f391, f385, f367; +sub.f32 f392, f391, f390; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f371, f397; +fma.rn.f32 f399, f395, f370, f398; +mul.f32 f400, f370, f397; +mul.f32 f401, f395, f371; +sub.f32 f402, f401, f400; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f361, f407; +fma.rn.f32 f409, f405, f360, f408; +mul.f32 f410, f360, f407; +mul.f32 f411, f405, f361; +sub.f32 f412, f411, f410; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f365, f417; +fma.rn.f32 f419, f415, f364, f418; +mul.f32 f420, f364, f417; +mul.f32 f421, f415, f365; +sub.f32 f422, f421, f420; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f369, f427; +fma.rn.f32 f429, f425, f368, f428; +mul.f32 f430, f368, f427; +mul.f32 f431, f425, f369; +sub.f32 f432, f431, f430; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f373, f437; +fma.rn.f32 f439, f435, f372, f438; +mul.f32 f440, f372, f437; +mul.f32 f441, f435, f373; +sub.f32 f442, f441, f440; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 
6144; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f379; +st.shared.f32 [r26+512], f389; +st.shared.f32 [r26+768], f399; +st.shared.f32 [r26+1024], f409; +st.shared.f32 [r26+1280], f419; +st.shared.f32 [r26+1536], f429; +st.shared.f32 [r26+1792], f439; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+1024]; +ld.shared.f32 f445, [r27+2048]; +ld.shared.f32 f446, [r27+3072]; +ld.shared.f32 f447, [r27+4096]; +ld.shared.f32 f448, [r27+5120]; +ld.shared.f32 f449, [r27+6144]; +ld.shared.f32 f450, [r27+7168]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+1024]; +ld.shared.f32 f453, [r27+2048]; +ld.shared.f32 f454, [r27+3072]; +ld.shared.f32 f455, [r27+4096]; +ld.shared.f32 f456, [r27+5120]; +ld.shared.f32 f457, [r27+6144]; +ld.shared.f32 f458, [r27+7168]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; +sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f444, f448; +add.f32 f468, f452, f456; +sub.f32 f469, f444, f448; +sub.f32 f470, f452, f456; +add.f32 f471, f446, f450; +add.f32 f472, f454, f458; +sub.f32 f473, f446, f450; +sub.f32 f474, f454, f458; +add.f32 %0, f459, f463; +add.f32 %1, f460, f464; +add.f32 %2, f467, f471; +add.f32 %3, f468, f472; +add.f32 %5, f462, f465; +sub.f32 %4, f461, f466; +add.f32 %7, f470, f473; +sub.f32 %6, f469, f474; +sub.f32 %8, f459, f463; +sub.f32 %9, f460, f464; +sub.f32 %10, f467, f471; +sub.f32 %11, f468, f472; +sub.f32 %13, f462, f465; +add.f32 %12, f461, f466; +sub.f32 %15, f470, f473; +add.f32 %14, f469, f474; +})" + : 
"=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_2048), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<294, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<539>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, 
f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 16320; 
+add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+2048]; +ld.shared.v2.f32 {f167, f168}, [r13+4096]; +ld.shared.v2.f32 {f171, f172}, [r13+6144]; +ld.shared.v2.f32 {f175, f176}, [r13+8192]; +ld.shared.v2.f32 {f179, f180}, [r13+10240]; +ld.shared.v2.f32 {f183, f184}, [r13+12288]; +ld.shared.v2.f32 {f187, f188}, [r13+14336]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 
0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 248; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, 
f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 15872; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+2048]; +ld.shared.v2.f32 {f325, f326}, [r19+4096]; +ld.shared.v2.f32 {f329, f330}, [r19+6144]; +ld.shared.v2.f32 {f333, f334}, [r19+8192]; +ld.shared.v2.f32 {f337, f338}, [r19+10240]; +ld.shared.v2.f32 {f341, f342}, [r19+12288]; +ld.shared.v2.f32 {f345, f346}, [r19+14336]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +sub.f32 f361, f351, f356; +add.f32 f362, f352, f355; +add.f32 f363, f351, f356; +sub.f32 f364, f352, f355; +add.f32 f365, 
f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +sub.f32 f377, f367, f372; +add.f32 f378, f368, f371; +add.f32 f379, f367, f372; +sub.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0f3F3504F3; +sub.f32 f383, f381, f382; +add.f32 f384, f381, f382; +mul.f32 f385, f379, 0fBF3504F3; +mul.f32 f386, f380, 0f3F3504F3; +sub.f32 f387, f385, f386; +mul.f32 f388, f380, 0fBF3504F3; +fma.rn.f32 f389, f379, 0f3F3504F3, f388; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f384; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f384; +sub.f32 f396, f359, f376; +add.f32 f397, f360, f375; +add.f32 f398, f359, f376; +sub.f32 f399, f360, f375; +add.f32 f400, f363, f387; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f387; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 192; +bfe.u32 r21, r5, 6, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f393, f405; +mul.f32 f409, f392, f405; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f397, f415; +mul.f32 f417, f396, f415; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f401, f423; +mul.f32 f425, f400, f423; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f391, f431; +mul.f32 f433, f390, f431; +mul.f32 f434, f429, f391; 
+mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f395, f439; +mul.f32 f441, f394, f439; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f399, f447; +mul.f32 f449, f398, f447; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f403, f455; +mul.f32 f457, f402, f455; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 12288; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f404, f392, f408; +sub.f32 f462, f410, f409; +st.shared.v2.f32 [r25+512], {f461, f462}; +fma.rn.f32 f463, f413, f396, f416; +sub.f32 f464, f418, f417; +st.shared.v2.f32 [r25+1024], {f463, f464}; +fma.rn.f32 f465, f421, f400, f424; +sub.f32 f466, f426, f425; +st.shared.v2.f32 [r25+1536], {f465, f466}; +sub.f32 f467, f434, f433; +fma.rn.f32 f468, f429, f390, f432; +st.shared.v2.f32 [r25+2048], {f468, f467}; +fma.rn.f32 f469, f437, f394, f440; +sub.f32 f470, f442, f441; +st.shared.v2.f32 [r25+2560], {f469, f470}; +fma.rn.f32 f471, f445, f398, f448; +sub.f32 f472, f450, f449; +st.shared.v2.f32 [r25+3072], {f471, f472}; +fma.rn.f32 f473, f453, f402, f456; +sub.f32 f474, f458, f457; +st.shared.v2.f32 [r25+3584], {f473, f474}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+2048]; +ld.shared.v2.f32 {f483, f484}, [r26+4096]; +ld.shared.v2.f32 {f487, f488}, [r26+6144]; +ld.shared.v2.f32 {f491, f492}, [r26+8192]; +ld.shared.v2.f32 {f495, f496}, [r26+10240]; +ld.shared.v2.f32 {f499, f500}, [r26+12288]; 
+ld.shared.v2.f32 {f503, f504}, [r26+14336]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f479, f495; +add.f32 f516, f480, f496; +sub.f32 f517, f479, f495; +sub.f32 f518, f480, f496; +add.f32 f519, f487, f503; +add.f32 f520, f488, f504; +sub.f32 f521, f487, f503; +sub.f32 f522, f488, f504; +add.f32 %1, f508, f512; +add.f32 %0, f507, f511; +add.f32 %3, f516, f520; +add.f32 %2, f515, f519; +add.f32 %5, f510, f513; +sub.f32 %4, f509, f514; +add.f32 %7, f518, f521; +sub.f32 %6, f517, f522; +sub.f32 %9, f508, f512; +sub.f32 %8, f507, f511; +sub.f32 %11, f516, f520; +sub.f32 %10, f515, f519; +sub.f32 %13, f510, f513; +add.f32 %12, f509, f514; +sub.f32 %15, f518, f521; +add.f32 %14, f517, f522; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_2048), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<296, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1154>; +.reg .b32 r<35>; +.reg .b64 rd<11>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1146, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, 
%59; +add.f32 f1144, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1143, f1146, f1144; +sub.f32 f76, f1146, f1144; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1142, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1139, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1137, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1136, f1139, f1137; +sub.f32 f92, f1139, f1137; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1135, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1135, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1133, f95, 0fBF3504F3; +mul.f32 f1134, f96, 0f3F3504F3; +sub.f32 f103, f1133, f1134; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1132, f1143, f1136; +sub.f32 f109, f1143, f1136; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1131, f1142, f100; +sub.f32 f113, f1142, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1130, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1129, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1127, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1124, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1123, f1127, f1124; +sub.f32 f133, f1127, f1124; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1122, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1120, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1118, %76, %66; +sub.f32 f145, %76, %66; +add.f32 
f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1117, f1120, f1118; +sub.f32 f149, f1120, f1118; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1116, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f1116, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1114, f152, 0fBF3504F3; +mul.f32 f1115, f153, 0f3F3504F3; +sub.f32 f160, f1114, f1115; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1113, f1123, f1117; +sub.f32 f166, f1123, f1117; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1112, f1122, f157; +sub.f32 f170, f1122, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1111, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1110, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1108, f167, 0f3F6C835E; +mul.f32 f1109, f1112, 0f3EC3EF15; +sub.f32 f181, f1108, f1109; +mul.f32 f182, f1112, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1111, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1110, 0f3F6C835E; +mul.f32 f1107, f175, 0f3EC3EF15; +sub.f32 f190, f1107, f189; +mul.f32 f191, f1110, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f1106, f169, 0fBEC3EF15; +sub.f32 f195, f1106, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1104, f173, 0fBF3504F3; +mul.f32 f1105, f174, 0f3F3504F3; +sub.f32 f200, f1104, f1105; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f1102, f177, 0fBF6C835E; +mul.f32 f1103, f178, 0f3EC3EF15; +sub.f32 f205, f1102, f1103; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1101, 
f1131, f183; +sub.f32 f213, f1131, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1100, f1130, f187; +sub.f32 f217, f1130, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f1099, f1129, f192; +sub.f32 f221, f1129, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f1098, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f1097, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f1096, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1095, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f1101, f239; +mul.f32 f244, f238, f1101; +mul.f32 f246, f239, f239; +mul.f32 f1094, f238, f238; +sub.f32 f247, f1094, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f1100, f249; +mul.f32 f252, f247, f1100; +mul.f32 f1092, f238, f247; +mul.f32 f1093, f239, f249; +sub.f32 f255, f1092, f1093; +mul.f32 f1091, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f1099, f257; +mul.f32 f260, f255, f1099; +mul.f32 f262, f239, f257; +mul.f32 f1090, f238, f255; +sub.f32 f263, f1090, f262; +mul.f32 f1089, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f1098, f265; +mul.f32 f268, f263, f1098; +mul.f32 f270, f239, f265; +mul.f32 f1088, f238, f263; +sub.f32 f271, f1088, f270; +mul.f32 f1087, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f1097, f273; +mul.f32 f276, f271, f1097; +mul.f32 f1085, f238, f271; +mul.f32 f1086, f239, f273; +sub.f32 f279, f1085, f1086; +mul.f32 f1084, f226, 
f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f1096, f281; +mul.f32 f284, f279, f1096; +mul.f32 f286, f239, f281; +mul.f32 f1083, f238, f279; +sub.f32 f287, f1083, f286; +mul.f32 f1082, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f1095, f289; +mul.f32 f292, f287, f1095; +mul.f32 f294, f239, f289; +mul.f32 f1081, f238, f287; +sub.f32 f295, f1081, f294; +mul.f32 f1080, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1079, f1132, f1113; +mul.f32 f298, f1079, f297; +mul.f32 f300, f295, f1079; +mul.f32 f1077, f238, f295; +mul.f32 f1078, f239, f297; +sub.f32 f303, f1077, f1078; +sub.f32 f1076, f106, f163; +mul.f32 f1075, f1076, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1074, f238, f303; +sub.f32 f311, f1074, f310; +mul.f32 f1073, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f1071, f238, f311; +mul.f32 f1072, f239, f313; +sub.f32 f319, f1071, f1072; +mul.f32 f1070, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1069, f238, f319; +sub.f32 f327, f1069, f326; +mul.f32 f1068, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1067, f238, f327; +sub.f32 f335, f1067, f334; +mul.f32 f1066, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f1064, f238, f335; +mul.f32 f1065, f239, f337; +sub.f32 f343, f1064, f1065; +mul.f32 f1063, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 
f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1062, f238, f343; +sub.f32 f351, f1062, f350; +mul.f32 f1061, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f1060, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 16256; +add.s32 r12, r9, r11; +add.f32 f357, f1132, f1113; +sub.f32 f1148, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +shl.b32 r28, r34, 7; +shl.b32 r24, r34, 3; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f1060; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f1091; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f1089; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f1087; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f1084; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f1082; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f1080; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1148, f298; +sub.f32 f374, f300, f1075; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f1073; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f1070; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f1068; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f1066; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f1063; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f1061; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r21, r34, 127; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 
{f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+1024]; +ld.shared.v2.f32 {f397, f398}, [r13+2048]; +ld.shared.v2.f32 {f401, f402}, [r13+3072]; +ld.shared.v2.f32 {f405, f406}, [r13+4096]; +ld.shared.v2.f32 {f409, f410}, [r13+5120]; +ld.shared.v2.f32 {f413, f414}, [r13+6144]; +ld.shared.v2.f32 {f417, f418}, [r13+7168]; +ld.shared.v2.f32 {f421, f422}, [r13+8192]; +ld.shared.v2.f32 {f425, f426}, [r13+9216]; +ld.shared.v2.f32 {f429, f430}, [r13+10240]; +ld.shared.v2.f32 {f433, f434}, [r13+11264]; +ld.shared.v2.f32 {f437, f438}, [r13+12288]; +ld.shared.v2.f32 {f441, f442}, [r13+13312]; +ld.shared.v2.f32 {f445, f446}, [r13+14336]; +ld.shared.v2.f32 {f449, f450}, [r13+15360]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1059, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1058, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1057, f1059, f1058; +sub.f32 f464, f1059, f1058; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f1056, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1055, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1054, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1053, f1055, f1054; +sub.f32 f480, f1055, f1054; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f1052, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f1052, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f1051, f483, 0fBF3504F3; +sub.f32 f491, f1051, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1050, f1057, f1053; +sub.f32 f497, f1057, f1053; +add.f32 f498, f465, f487; 
+sub.f32 f500, f465, f487; +add.f32 f1049, f1056, f488; +sub.f32 f501, f1056, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f1048, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f1047, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1046, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1045, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1044, f1046, f1045; +sub.f32 f521, f1046, f1045; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f1043, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1042, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1041, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1040, f1042, f1041; +sub.f32 f537, f1042, f1041; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f1039, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f1039, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f1038, f540, 0fBF3504F3; +sub.f32 f548, f1038, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1037, f1044, f1040; +sub.f32 f554, f1044, f1040; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1036, f1043, f545; +sub.f32 f558, f1043, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f1035, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f1034, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1036, 0f3EC3EF15; +mul.f32 f1033, f555, 0f3F6C835E; +sub.f32 f569, 
f1033, f568; +mul.f32 f570, f1036, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f1035, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f1034, 0f3F6C835E; +mul.f32 f1032, f563, 0f3EC3EF15; +sub.f32 f578, f1032, f577; +mul.f32 f579, f1034, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f1031, f557, 0fBEC3EF15; +sub.f32 f583, f1031, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f1030, f561, 0fBF3504F3; +sub.f32 f588, f1030, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f1029, f565, 0fBF6C835E; +sub.f32 f593, f1029, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1028, f1049, f571; +sub.f32 f601, f1049, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1027, f1048, f575; +sub.f32 f605, f1048, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f1026, f1047, f580; +sub.f32 f609, f1047, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f1025, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f1024, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 f1023, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1022, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 112; +bfe.u32 r15, r34, 4, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f1028, f627; +mul.f32 f632, f626, f1028; +mul.f32 f634, f627, f627; +mul.f32 f1021, f626, f626; +sub.f32 f635, f1021, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 
f637, f627, f626, f636; +mul.f32 f638, f1027, f637; +mul.f32 f640, f635, f1027; +mul.f32 f1019, f626, f635; +mul.f32 f1020, f627, f637; +sub.f32 f643, f1019, f1020; +mul.f32 f1018, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f1026, f645; +mul.f32 f648, f643, f1026; +mul.f32 f650, f627, f645; +mul.f32 f1017, f626, f643; +sub.f32 f651, f1017, f650; +mul.f32 f1016, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f1025, f653; +mul.f32 f656, f651, f1025; +mul.f32 f658, f627, f653; +mul.f32 f1015, f626, f651; +sub.f32 f659, f1015, f658; +mul.f32 f1014, f610, f653; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f1024, f661; +mul.f32 f664, f659, f1024; +mul.f32 f1012, f626, f659; +mul.f32 f1013, f627, f661; +sub.f32 f667, f1012, f1013; +mul.f32 f1011, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f1023, f669; +mul.f32 f672, f667, f1023; +mul.f32 f674, f627, f669; +mul.f32 f1010, f626, f667; +sub.f32 f675, f1010, f674; +mul.f32 f1009, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f1022, f677; +mul.f32 f680, f675, f1022; +mul.f32 f682, f627, f677; +mul.f32 f1008, f626, f675; +sub.f32 f683, f1008, f682; +mul.f32 f1007, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1006, f1050, f1037; +mul.f32 f686, f1006, f685; +mul.f32 f688, f683, f1006; +mul.f32 f1004, f626, f683; +mul.f32 f1005, f627, f685; +sub.f32 f691, f1004, f1005; +sub.f32 f1003, f494, f551; +mul.f32 f1002, f1003, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1001, f626, f691; +sub.f32 f699, f1001, f698; +mul.f32 f1000, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, f605; +mul.f32 f998, 
f626, f699; +mul.f32 f999, f627, f701; +sub.f32 f707, f998, f999; +mul.f32 f997, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f996, f626, f707; +sub.f32 f715, f996, f714; +mul.f32 f995, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f994, f626, f715; +sub.f32 f723, f994, f722; +mul.f32 f993, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f991, f626, f723; +mul.f32 f992, f627, f725; +sub.f32 f731, f991, f992; +mul.f32 f990, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f989, f626, f731; +sub.f32 f739, f989, f738; +mul.f32 f988, f620, f733; +mul.f32 f740, f626, f733; +mul.f32 f987, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r24, 120; +add.s32 r17, r9, r16; +sub.f32 f1150, f1050, f1037; +mul.f32 f1149, f683, f1150; +barrier.sync 0; +and.b32 r18, r28, 14336; +add.s32 r19, r17, r18; +sub.f32 f1152, f1050, f1037; +mul.f32 f1151, f683, f1152; +add.f32 f745, f1050, f1037; +sub.f32 f1153, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 112; +fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f987; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f1018; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f1016; +st.shared.v2.f32 [r19+384], {f751, f752}; +fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f1014; +st.shared.v2.f32 [r19+512], {f753, f754}; 
+sub.f32 f755, f664, f1011; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f1009; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f1007; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1153, f686; +sub.f32 f762, f1151, f1002; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f1000; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f997; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f995; +st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f993; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f990; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, f988; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +mad.lo.s32 r20, r26, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+1024]; +ld.shared.v2.f32 {f785, f786}, [r20+2048]; +ld.shared.v2.f32 {f789, f790}, [r20+3072]; +ld.shared.v2.f32 {f793, f794}, [r20+4096]; +ld.shared.v2.f32 {f797, f798}, [r20+5120]; +ld.shared.v2.f32 {f801, f802}, [r20+6144]; +ld.shared.v2.f32 {f805, f806}, [r20+7168]; +ld.shared.v2.f32 {f809, f810}, [r20+8192]; +ld.shared.v2.f32 {f813, f814}, [r20+9216]; +ld.shared.v2.f32 {f817, f818}, [r20+10240]; +ld.shared.v2.f32 {f821, f822}, [r20+11264]; +ld.shared.v2.f32 {f825, f826}, [r20+12288]; +ld.shared.v2.f32 {f829, f830}, [r20+13312]; +ld.shared.v2.f32 {f833, f834}, [r20+14336]; +ld.shared.v2.f32 {f837, f838}, [r20+15360]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 
f986, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f985, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f984, f986, f985; +sub.f32 f852, f986, f985; +sub.f32 f853, f843, f848; +add.f32 f855, f843, f848; +add.f32 f983, f844, f847; +sub.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f982, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f981, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f980, f982, f981; +sub.f32 f868, f982, f981; +sub.f32 f869, f859, f864; +add.f32 f871, f859, f864; +add.f32 f979, f860, f863; +sub.f32 f872, f860, f863; +mul.f32 f873, f869, 0f3F3504F3; +mul.f32 f874, f979, 0f3F3504F3; +sub.f32 f875, f873, f874; +add.f32 f876, f873, f874; +mul.f32 f878, f872, 0f3F3504F3; +mul.f32 f978, f871, 0fBF3504F3; +sub.f32 f879, f978, f878; +mul.f32 f880, f872, 0fBF3504F3; +fma.rn.f32 f881, f871, 0f3F3504F3, f880; +add.f32 f882, f781, f813; +sub.f32 f884, f781, f813; +add.f32 f977, f782, f814; +sub.f32 f885, f782, f814; +add.f32 f886, f797, f829; +sub.f32 f888, f797, f829; +add.f32 f976, f798, f830; +sub.f32 f889, f798, f830; +add.f32 f890, f882, f886; +sub.f32 f892, f882, f886; +add.f32 f975, f977, f976; +sub.f32 f893, f977, f976; +sub.f32 f894, f884, f889; +add.f32 f896, f884, f889; +add.f32 f974, f885, f888; +sub.f32 f897, f885, f888; +add.f32 f898, f789, f821; +sub.f32 f900, f789, f821; +add.f32 f973, f790, f822; +sub.f32 f901, f790, f822; +add.f32 f902, f805, f837; +sub.f32 f904, f805, f837; +add.f32 f972, f806, f838; +sub.f32 f905, f806, f838; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f971, f973, f972; +sub.f32 f909, f973, f972; +sub.f32 f910, f900, f905; +add.f32 f912, f900, f905; +add.f32 f970, f901, f904; +sub.f32 f913, f901, f904; +mul.f32 f914, f910, 0f3F3504F3; +mul.f32 
f915, f970, 0f3F3504F3; +sub.f32 f916, f914, f915; +add.f32 f917, f914, f915; +mul.f32 f968, f912, 0fBF3504F3; +mul.f32 f969, f913, 0f3F3504F3; +sub.f32 f920, f968, f969; +mul.f32 f921, f913, 0fBF3504F3; +fma.rn.f32 f922, f912, 0f3F3504F3, f921; +add.f32 %1, f984, f980; +add.f32 %0, f849, f865; +add.f32 %3, f975, f971; +add.f32 %2, f890, f906; +add.f32 %4, f853, f875; +add.f32 %5, f983, f876; +add.f32 %6, f894, f916; +add.f32 %7, f974, f917; +sub.f32 %8, f851, f868; +add.f32 %9, f852, f867; +add.f32 %11, f893, f908; +sub.f32 %10, f892, f909; +add.f32 %13, f856, f881; +add.f32 %12, f855, f879; +add.f32 %15, f897, f922; +add.f32 %14, f896, f920; +sub.f32 %17, f984, f980; +sub.f32 %16, f849, f865; +sub.f32 %19, f975, f971; +sub.f32 %18, f890, f906; +sub.f32 %21, f983, f876; +sub.f32 %20, f853, f875; +sub.f32 %23, f974, f917; +sub.f32 %22, f894, f916; +sub.f32 %25, f852, f867; +add.f32 %24, f851, f868; +sub.f32 %27, f893, f908; +add.f32 %26, f892, f909; +sub.f32 %29, f856, f881; +sub.f32 %28, f855, f879; +sub.f32 %31, f897, f922; +sub.f32 %30, f896, f920; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_2048), "l"(lut_sp_16_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), 
"f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<297, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2467>; +.reg .b32 r<37>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2460, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2458, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2457, f2460, f2458; +sub.f32 f140, f2460, f2458; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2456, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2453, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2451, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2450, f2453, f2451; +sub.f32 f156, f2453, f2451; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2449, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2449, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2447, f159, 0fBF3504F3; +mul.f32 f2448, f160, 0f3F3504F3; +sub.f32 f167, f2447, f2448; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2446, f2457, f2450; +sub.f32 f173, f2457, f2450; +add.f32 f174, f141, f163; +sub.f32 
f176, f141, f163; +add.f32 f2445, f2456, f164; +sub.f32 f177, f2456, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2444, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2443, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2441, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2438, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2437, f2441, f2438; +sub.f32 f197, f2441, f2438; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2436, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2434, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2432, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2431, f2434, f2432; +sub.f32 f213, f2434, f2432; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2430, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2430, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2428, f216, 0fBF3504F3; +mul.f32 f2429, f217, 0f3F3504F3; +sub.f32 f224, f2428, f2429; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2427, f2437, f2431; +sub.f32 f230, f2437, f2431; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2426, f2436, f221; +sub.f32 f234, f2436, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2425, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2424, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2422, f231, 0f3F6C835E; +mul.f32 f2423, f2426, 0f3EC3EF15; +sub.f32 f245, f2422, f2423; 
+mul.f32 f246, f2426, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2425, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2424, 0f3F6C835E; +mul.f32 f2421, f239, 0f3EC3EF15; +sub.f32 f254, f2421, f253; +mul.f32 f255, f2424, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2420, f233, 0fBEC3EF15; +sub.f32 f259, f2420, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2418, f237, 0fBF3504F3; +mul.f32 f2419, f238, 0f3F3504F3; +sub.f32 f264, f2418, f2419; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2416, f241, 0fBF6C835E; +mul.f32 f2417, f242, 0f3EC3EF15; +sub.f32 f269, f2416, f2417; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2415, f2446, f2427; +sub.f32 f275, f2446, f2427; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2414, f2445, f247; +sub.f32 f279, f2445, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2413, f2444, f251; +sub.f32 f283, f2444, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2412, f2443, f256; +sub.f32 f287, f2443, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2411, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2410, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2409, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2408, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2405, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2403, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, 
f308; +sub.f32 f314, f304, f308; +add.f32 f2402, f2405, f2403; +sub.f32 f315, f2405, f2403; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2401, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2399, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2396, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2395, f2399, f2396; +sub.f32 f331, f2399, f2396; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2394, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2394, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2393, f334, 0fBF3504F3; +sub.f32 f342, f2393, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2392, f2402, f2395; +sub.f32 f348, f2402, f2395; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2391, f2401, f339; +sub.f32 f352, f2401, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2390, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2389, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2387, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2385, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2384, f2387, f2385; +sub.f32 f372, f2387, f2385; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2383, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2380, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2379, %98, %130; +sub.f32 
f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2378, f2380, f2379; +sub.f32 f388, f2380, f2379; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2377, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2377, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2375, f391, 0fBF3504F3; +mul.f32 f2376, f392, 0f3F3504F3; +sub.f32 f399, f2375, f2376; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2374, f2384, f2378; +sub.f32 f405, f2384, f2378; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2373, f2383, f396; +sub.f32 f409, f2383, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2372, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2371, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2373, 0f3EC3EF15; +mul.f32 f2370, f406, 0f3F6C835E; +sub.f32 f420, f2370, f419; +mul.f32 f421, f2373, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2372, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2371, 0f3F6C835E; +mul.f32 f2369, f414, 0f3EC3EF15; +sub.f32 f429, f2369, f428; +mul.f32 f430, f2371, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2368, f408, 0fBEC3EF15; +sub.f32 f434, f2368, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2367, f412, 0fBF3504F3; +sub.f32 f439, f2367, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2366, f416, 0fBF6C835E; +sub.f32 f444, f2366, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, 
f402; +add.f32 f2365, f2392, f2374; +sub.f32 f450, f2392, f2374; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2364, f2391, f422; +sub.f32 f454, f2391, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2363, f2390, f426; +sub.f32 f458, f2390, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2362, f2389, f431; +sub.f32 f462, f2389, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2361, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2360, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2359, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2358, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2364, 0f3E47C5C2; +mul.f32 f2357, f451, 0f3F7B14BE; +sub.f32 f481, f2357, f480; +mul.f32 f482, f2364, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2363, 0f3EC3EF15; +mul.f32 f2356, f455, 0f3F6C835E; +sub.f32 f486, f2356, f485; +mul.f32 f487, f2363, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2362, 0f3F0E39DA; +mul.f32 f2355, f459, 0f3F54DB31; +sub.f32 f491, f2355, f490; +mul.f32 f492, f2362, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2361, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2360, 0f3F54DB31; +mul.f32 f2354, f467, 0f3F0E39DA; +sub.f32 f500, f2354, f499; +mul.f32 f501, f2360, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2359, 0f3F6C835E; +mul.f32 f2353, f471, 0f3EC3EF15; +sub.f32 f505, f2353, f504; +mul.f32 f506, f2359, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2358, 0f3F7B14BE; +mul.f32 f2352, f475, 0f3E47C5C2; +sub.f32 f510, f2352, f509; +mul.f32 f511, f2358, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 
f514, f454, 0f3F7B14BE; +mul.f32 f2351, f453, 0fBE47C5C2; +sub.f32 f515, f2351, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2349, f457, 0fBEC3EF15; +mul.f32 f2350, f458, 0f3F6C835E; +sub.f32 f520, f2349, f2350; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2347, f461, 0fBF0E39DA; +mul.f32 f2348, f462, 0f3F54DB31; +sub.f32 f525, f2347, f2348; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2345, f465, 0fBF3504F3; +mul.f32 f2346, f466, 0f3F3504F3; +sub.f32 f530, f2345, f2346; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2343, f469, 0fBF54DB31; +mul.f32 f2344, f470, 0f3F0E39DA; +sub.f32 f535, f2343, f2344; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2342, f473, 0fBF6C835E; +sub.f32 f540, f2342, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2341, f477, 0fBF7B14BE; +sub.f32 f545, f2341, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2340, f2414, f483; +sub.f32 f553, f2414, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2339, f2413, f488; +sub.f32 f557, f2413, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2338, f2412, f493; +sub.f32 f561, f2412, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2337, f2411, f497; +sub.f32 f565, f2411, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f2336, f2410, f502; +sub.f32 f569, f2410, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f2335, f2409, f507; +sub.f32 f573, f2409, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f2334, f2408, f512; +sub.f32 f577, f2408, f512; +sub.f32 f578, f274, f450; 
+add.f32 f580, f274, f450; +add.f32 f2333, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f2332, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f2331, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f2330, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f2329, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2328, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2327, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2326, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f2340, f611; +mul.f32 f616, f610, f2340; +mul.f32 f618, f611, f611; +mul.f32 f2325, f610, f610; +sub.f32 f619, f2325, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f2339, f621; +mul.f32 f624, f619, f2339; +mul.f32 f626, f611, f621; +mul.f32 f2324, f610, f619; +sub.f32 f627, f2324, f626; +mul.f32 f2323, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f2338, f629; +mul.f32 f632, f627, f2338; +mul.f32 f2321, f610, f627; +mul.f32 f2322, f611, f629; +sub.f32 f635, f2321, f2322; +mul.f32 f2320, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f2337, f637; +mul.f32 f640, f635, f2337; +mul.f32 f642, f611, f637; +mul.f32 f2319, f610, f635; +sub.f32 f643, f2319, f642; +mul.f32 f2318, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, 
f2336, f645; +mul.f32 f648, f643, f2336; +mul.f32 f2316, f610, f643; +mul.f32 f2317, f611, f645; +sub.f32 f651, f2316, f2317; +mul.f32 f2315, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f2335, f653; +mul.f32 f656, f651, f2335; +mul.f32 f658, f611, f653; +mul.f32 f2314, f610, f651; +sub.f32 f659, f2314, f658; +mul.f32 f2313, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f2334, f661; +mul.f32 f664, f659, f2334; +mul.f32 f666, f611, f661; +mul.f32 f2312, f610, f659; +sub.f32 f667, f2312, f666; +mul.f32 f2311, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f2333, f669; +mul.f32 f672, f667, f2333; +mul.f32 f2309, f610, f667; +mul.f32 f2310, f611, f669; +sub.f32 f675, f2309, f2310; +mul.f32 f2308, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f2332, f677; +mul.f32 f680, f675, f2332; +mul.f32 f682, f611, f677; +mul.f32 f2307, f610, f675; +sub.f32 f683, f2307, f682; +mul.f32 f2306, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f2331, f685; +mul.f32 f688, f683, f2331; +mul.f32 f690, f611, f685; +mul.f32 f2305, f610, f683; +sub.f32 f691, f2305, f690; +mul.f32 f2304, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f2330, f693; +mul.f32 f696, f691, f2330; +mul.f32 f2302, f610, f691; +mul.f32 f2303, f611, f693; +sub.f32 f699, f2302, f2303; +mul.f32 f2301, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f2329, f701; +mul.f32 f704, f699, f2329; +mul.f32 f706, f611, f701; +mul.f32 f2300, f610, f699; +sub.f32 f707, f2300, f706; +mul.f32 f2299, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f2328, f709; +mul.f32 f712, f707, f2328; +mul.f32 f2297, f610, f707; +mul.f32 f2298, f611, f709; +sub.f32 f715, f2297, f2298; +mul.f32 f2296, f598, f709; 
+mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f2327, f717; +mul.f32 f720, f715, f2327; +mul.f32 f722, f611, f717; +mul.f32 f2295, f610, f715; +sub.f32 f723, f2295, f722; +mul.f32 f2294, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f2326, f725; +mul.f32 f728, f723, f2326; +mul.f32 f730, f611, f725; +mul.f32 f2293, f610, f723; +sub.f32 f731, f2293, f730; +mul.f32 f2292, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2291, f2415, f2365; +mul.f32 f734, f2291, f733; +mul.f32 f736, f731, f2291; +mul.f32 f2289, f610, f731; +mul.f32 f2290, f611, f733; +sub.f32 f739, f2289, f2290; +sub.f32 f2288, f272, f447; +mul.f32 f2287, f2288, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2286, f610, f739; +sub.f32 f747, f2286, f746; +mul.f32 f2285, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2284, f610, f747; +sub.f32 f755, f2284, f754; +mul.f32 f2283, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f2281, f610, f755; +mul.f32 f2282, f611, f757; +sub.f32 f763, f2281, f2282; +mul.f32 f2280, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2279, f610, f763; +sub.f32 f771, f2279, f770; +mul.f32 f2278, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f2276, f610, f771; +mul.f32 f2277, f611, f773; +sub.f32 f779, f2276, f2277; +mul.f32 f2275, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, 
f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2274, f610, f779; +sub.f32 f787, f2274, f786; +mul.f32 f2273, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2272, f610, f787; +sub.f32 f795, f2272, f794; +mul.f32 f2271, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f2269, f610, f795; +mul.f32 f2270, f611, f797; +sub.f32 f803, f2269, f2270; +mul.f32 f2268, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2267, f610, f803; +sub.f32 f811, f2267, f810; +mul.f32 f2266, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2265, f610, f811; +sub.f32 f819, f2265, f818; +mul.f32 f2264, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f2262, f610, f819; +mul.f32 f2263, f611, f821; +sub.f32 f827, f2262, f2263; +mul.f32 f2261, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2260, f610, f827; +sub.f32 f835, f2260, f834; +mul.f32 f2259, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f2257, f610, f835; +mul.f32 f2258, f611, f837; +sub.f32 f843, f2257, f2258; +mul.f32 f2256, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2255, f610, f843; +sub.f32 f851, f2255, f850; +mul.f32 f2254, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f2253, 
f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 16128; +add.s32 r12, r9, r11; +add.f32 f857, f2415, f2365; +mov.u32 r26, %tid.x; +shl.b32 r23, r26, 8; +sub.f32 f2466, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r30, %tid.x; +shl.b32 r29, r30, 3; +mov.u32 r34, %tid.x; +shl.b32 r33, r34, 3; +mov.u32 r28, %tid.x; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f2253; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f2323; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f2320; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f2318; +sub.f32 f867, f648, f2315; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f2313; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f2311; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f2308; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f2306; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f2304; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f2301; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f2299; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f2296; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f2294; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f2292; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f2466, f734; +sub.f32 f890, f736, f2287; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f2285; +st.shared.v4.f32 [r12+128], {f889, f890, f891, 
f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f2283; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f2280; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f2278; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f2275; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f2273; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f2271; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f2268; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f2266; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f2264; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f2261; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f2259; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f2256; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f2254; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +and.b32 r21, r28, 63; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+512]; +ld.shared.v2.f32 {f929, f930}, [r13+1024]; +ld.shared.v2.f32 {f933, f934}, [r13+1536]; +ld.shared.v2.f32 {f937, f938}, [r13+2048]; +ld.shared.v2.f32 {f941, f942}, [r13+2560]; +ld.shared.v2.f32 {f945, f946}, [r13+3072]; +ld.shared.v2.f32 {f949, f950}, [r13+3584]; +ld.shared.v2.f32 {f953, f954}, [r13+4096]; +ld.shared.v2.f32 {f957, f958}, [r13+4608]; +ld.shared.v2.f32 {f961, f962}, [r13+5120]; +ld.shared.v2.f32 {f965, f966}, [r13+5632]; +ld.shared.v2.f32 {f969, f970}, [r13+6144]; +ld.shared.v2.f32 {f973, f974}, [r13+6656]; +ld.shared.v2.f32 {f977, 
f978}, [r13+7168]; +ld.shared.v2.f32 {f981, f982}, [r13+7680]; +ld.shared.v2.f32 {f985, f986}, [r13+8192]; +ld.shared.v2.f32 {f989, f990}, [r13+8704]; +ld.shared.v2.f32 {f993, f994}, [r13+9216]; +ld.shared.v2.f32 {f997, f998}, [r13+9728]; +ld.shared.v2.f32 {f1001, f1002}, [r13+10240]; +ld.shared.v2.f32 {f1005, f1006}, [r13+10752]; +ld.shared.v2.f32 {f1009, f1010}, [r13+11264]; +ld.shared.v2.f32 {f1013, f1014}, [r13+11776]; +ld.shared.v2.f32 {f1017, f1018}, [r13+12288]; +ld.shared.v2.f32 {f1021, f1022}, [r13+12800]; +ld.shared.v2.f32 {f1025, f1026}, [r13+13312]; +ld.shared.v2.f32 {f1029, f1030}, [r13+13824]; +ld.shared.v2.f32 {f1033, f1034}, [r13+14336]; +ld.shared.v2.f32 {f1037, f1038}, [r13+14848]; +ld.shared.v2.f32 {f1041, f1042}, [r13+15360]; +ld.shared.v2.f32 {f1045, f1046}, [r13+15872]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2252, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2251, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2250, f2252, f2251; +sub.f32 f1060, f2252, f2251; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f2249, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2248, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2247, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2246, f2248, f2247; +sub.f32 f1076, f2248, f2247; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f2245, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f2245, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f2243, f1079, 0fBF3504F3; +mul.f32 f2244, f1080, 0f3F3504F3; +sub.f32 f1087, f2243, f2244; +mul.f32 f1088, f1080, 
0fBF3504F3; +fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2242, f2250, f2246; +sub.f32 f1093, f2250, f2246; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2241, f2249, f1084; +sub.f32 f1097, f2249, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f2240, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, f1087; +add.f32 f2239, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2238, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2237, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2236, f2238, f2237; +sub.f32 f1117, f2238, f2237; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f2235, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2234, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2233, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2232, f2234, f2233; +sub.f32 f1133, f2234, f2233; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f2231, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f2231, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f2229, f1136, 0fBF3504F3; +mul.f32 f2230, f1137, 0f3F3504F3; +sub.f32 f1144, f2229, f2230; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2228, f2236, f2232; +sub.f32 f1150, f2236, f2232; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2227, f2235, f1141; 
+sub.f32 f1154, f2235, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f2226, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f2225, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2223, f1151, 0f3F6C835E; +mul.f32 f2224, f2227, 0f3EC3EF15; +sub.f32 f1165, f2223, f2224; +mul.f32 f1166, f2227, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 0f3F3504F3; +mul.f32 f1169, f2226, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f2221, f1159, 0f3EC3EF15; +mul.f32 f2222, f2225, 0f3F6C835E; +sub.f32 f1174, f2221, f2222; +mul.f32 f1175, f2225, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f2219, f1153, 0fBEC3EF15; +mul.f32 f2220, f1154, 0f3F6C835E; +sub.f32 f1179, f2219, f2220; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f2217, f1157, 0fBF3504F3; +mul.f32 f2218, f1158, 0f3F3504F3; +sub.f32 f1184, f2217, f2218; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f2215, f1161, 0fBF6C835E; +mul.f32 f2216, f1162, 0f3EC3EF15; +sub.f32 f1189, f2215, f2216; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2214, f2242, f2228; +sub.f32 f1195, f2242, f2228; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2213, f2241, f1167; +sub.f32 f1199, f2241, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2212, f2240, f1171; +sub.f32 f1203, f2240, f1171; +add.f32 f1204, f1102, f1174; +sub.f32 f1206, f1102, f1174; +add.f32 f2211, f2239, f1176; +sub.f32 f1207, f2239, f1176; +sub.f32 f1208, f1092, f1150; +add.f32 f1210, f1092, f1150; +add.f32 f2210, f1093, f1149; +sub.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1179; +sub.f32 f1214, f1096, f1179; +add.f32 f2209, f1097, f1181; +sub.f32 
f1215, f1097, f1181; +add.f32 f1216, f1100, f1184; +sub.f32 f1218, f1100, f1184; +add.f32 f2208, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2207, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2206, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2205, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2204, f2206, f2205; +sub.f32 f1235, f2206, f2205; +sub.f32 f1236, f1226, f1231; +add.f32 f1238, f1226, f1231; +add.f32 f2203, f1227, f1230; +sub.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2202, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2201, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2200, f2202, f2201; +sub.f32 f1251, f2202, f2201; +sub.f32 f1252, f1242, f1247; +add.f32 f1254, f1242, f1247; +add.f32 f2199, f1243, f1246; +sub.f32 f1255, f1243, f1246; +mul.f32 f1256, f1252, 0f3F3504F3; +mul.f32 f1257, f2199, 0f3F3504F3; +sub.f32 f1258, f1256, f1257; +add.f32 f1259, f1256, f1257; +mul.f32 f2197, f1254, 0fBF3504F3; +mul.f32 f2198, f1255, 0f3F3504F3; +sub.f32 f1262, f2197, f2198; +mul.f32 f1263, f1255, 0fBF3504F3; +fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2196, f2204, f2200; +sub.f32 f1268, f2204, f2200; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2195, f2203, f1259; +sub.f32 f1272, f2203, f1259; +sub.f32 f1273, f1234, f1251; +add.f32 f1275, f1234, f1251; +add.f32 f2194, f1235, f1250; +sub.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1262; +sub.f32 f1279, f1238, f1262; +add.f32 f2193, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 
f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2192, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2191, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2190, f2192, f2191; +sub.f32 f1292, f2192, f2191; +sub.f32 f1293, f1283, f1288; +add.f32 f1295, f1283, f1288; +add.f32 f2189, f1284, f1287; +sub.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2188, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2187, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2186, f2188, f2187; +sub.f32 f1308, f2188, f2187; +sub.f32 f1309, f1299, f1304; +add.f32 f1311, f1299, f1304; +add.f32 f2185, f1300, f1303; +sub.f32 f1312, f1300, f1303; +mul.f32 f1313, f1309, 0f3F3504F3; +mul.f32 f1314, f2185, 0f3F3504F3; +sub.f32 f1315, f1313, f1314; +add.f32 f1316, f1313, f1314; +mul.f32 f2183, f1311, 0fBF3504F3; +mul.f32 f2184, f1312, 0f3F3504F3; +sub.f32 f1319, f2183, f2184; +mul.f32 f1320, f1312, 0fBF3504F3; +fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2182, f2190, f2186; +sub.f32 f1325, f2190, f2186; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2181, f2189, f1316; +sub.f32 f1329, f2189, f1316; +sub.f32 f1330, f1291, f1308; +add.f32 f1332, f1291, f1308; +add.f32 f2180, f1292, f1307; +sub.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1319; +sub.f32 f1336, f1295, f1319; +add.f32 f2179, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2177, f1326, 0f3F6C835E; +mul.f32 f2178, f2181, 0f3EC3EF15; +sub.f32 f1340, f2177, f2178; +mul.f32 f1341, f2181, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341; +mul.f32 f1343, f1330, 0f3F3504F3; +mul.f32 f1344, f2180, 0f3F3504F3; +sub.f32 f1345, f1343, 
f1344; +add.f32 f1346, f1343, f1344; +mul.f32 f1348, f2179, 0f3F6C835E; +mul.f32 f2176, f1334, 0f3EC3EF15; +sub.f32 f1349, f2176, f1348; +mul.f32 f1350, f2179, 0f3EC3EF15; +fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350; +mul.f32 f1353, f1329, 0f3F6C835E; +mul.f32 f2175, f1328, 0fBEC3EF15; +sub.f32 f1354, f2175, f1353; +mul.f32 f1355, f1329, 0fBEC3EF15; +fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355; +mul.f32 f2173, f1332, 0fBF3504F3; +mul.f32 f2174, f1333, 0f3F3504F3; +sub.f32 f1359, f2173, f2174; +mul.f32 f1360, f1333, 0fBF3504F3; +fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360; +mul.f32 f2171, f1336, 0fBF6C835E; +mul.f32 f2172, f1337, 0f3EC3EF15; +sub.f32 f1364, f2171, f2172; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2170, f2196, f2182; +sub.f32 f1370, f2196, f2182; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2169, f2195, f1342; +sub.f32 f1374, f2195, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2168, f2194, f1346; +sub.f32 f1378, f2194, f1346; +add.f32 f1379, f1277, f1349; +sub.f32 f1381, f1277, f1349; +add.f32 f2167, f2193, f1351; +sub.f32 f1382, f2193, f1351; +sub.f32 f1383, f1267, f1325; +add.f32 f1385, f1267, f1325; +add.f32 f2166, f1268, f1324; +sub.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1354; +sub.f32 f1389, f1271, f1354; +add.f32 f2165, f1272, f1356; +sub.f32 f1390, f1272, f1356; +add.f32 f1391, f1275, f1359; +sub.f32 f1393, f1275, f1359; +add.f32 f2164, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2163, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2169, 0f3E47C5C2; +mul.f32 f2162, f1371, 0f3F7B14BE; +sub.f32 f1401, f2162, f1400; +mul.f32 f1402, f2169, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402; +mul.f32 f1405, f2168, 0f3EC3EF15; +mul.f32 f2161, f1375, 0f3F6C835E; +sub.f32 f1406, f2161, f1405; 
+mul.f32 f1407, f2168, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407; +mul.f32 f2159, f1379, 0f3F54DB31; +mul.f32 f2160, f2167, 0f3F0E39DA; +sub.f32 f1411, f2159, f2160; +mul.f32 f1412, f2167, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412; +mul.f32 f1414, f1383, 0f3F3504F3; +mul.f32 f1415, f2166, 0f3F3504F3; +sub.f32 f1416, f1414, f1415; +add.f32 f1417, f1414, f1415; +mul.f32 f1419, f2165, 0f3F54DB31; +mul.f32 f2158, f1387, 0f3F0E39DA; +sub.f32 f1420, f2158, f1419; +mul.f32 f1421, f2165, 0f3F0E39DA; +fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421; +mul.f32 f1424, f2164, 0f3F6C835E; +mul.f32 f2157, f1391, 0f3EC3EF15; +sub.f32 f1425, f2157, f1424; +mul.f32 f1426, f2164, 0f3EC3EF15; +fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426; +mul.f32 f2155, f1395, 0f3E47C5C2; +mul.f32 f2156, f2163, 0f3F7B14BE; +sub.f32 f1430, f2155, f2156; +mul.f32 f1431, f2163, 0f3E47C5C2; +fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431; +mul.f32 f2153, f1373, 0fBE47C5C2; +mul.f32 f2154, f1374, 0f3F7B14BE; +sub.f32 f1435, f2153, f2154; +mul.f32 f1436, f1374, 0fBE47C5C2; +fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436; +mul.f32 f2151, f1377, 0fBEC3EF15; +mul.f32 f2152, f1378, 0f3F6C835E; +sub.f32 f1440, f2151, f2152; +mul.f32 f1441, f1378, 0fBEC3EF15; +fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441; +mul.f32 f2149, f1381, 0fBF0E39DA; +mul.f32 f2150, f1382, 0f3F54DB31; +sub.f32 f1445, f2149, f2150; +mul.f32 f1446, f1382, 0fBF0E39DA; +fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446; +mul.f32 f1449, f1386, 0f3F3504F3; +mul.f32 f2148, f1385, 0fBF3504F3; +sub.f32 f1450, f2148, f1449; +mul.f32 f1451, f1386, 0fBF3504F3; +fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451; +mul.f32 f1454, f1390, 0f3F0E39DA; +mul.f32 f2147, f1389, 0fBF54DB31; +sub.f32 f1455, f2147, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456; +mul.f32 f1459, f1394, 0f3EC3EF15; +mul.f32 f2146, f1393, 0fBF6C835E; +sub.f32 f1460, f2146, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 
0f3EC3EF15, f1461; +mul.f32 f1464, f1398, 0f3E47C5C2; +mul.f32 f2145, f1397, 0fBF7B14BE; +sub.f32 f1465, f2145, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2144, f2213, f1403; +sub.f32 f1473, f2213, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2143, f2212, f1408; +sub.f32 f1477, f2212, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2142, f2211, f1413; +sub.f32 f1481, f2211, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2141, f2210, f1417; +sub.f32 f1485, f2210, f1417; +add.f32 f1486, f1212, f1420; +sub.f32 f1488, f1212, f1420; +add.f32 f2140, f2209, f1422; +sub.f32 f1489, f2209, f1422; +add.f32 f1490, f1216, f1425; +sub.f32 f1492, f1216, f1425; +add.f32 f2139, f2208, f1427; +sub.f32 f1493, f2208, f1427; +add.f32 f1494, f1220, f1430; +sub.f32 f1496, f1220, f1430; +add.f32 f2138, f2207, f1432; +sub.f32 f1497, f2207, f1432; +sub.f32 f1498, f1194, f1370; +add.f32 f1500, f1194, f1370; +add.f32 f2137, f1195, f1369; +sub.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1435; +sub.f32 f1504, f1198, f1435; +add.f32 f2136, f1199, f1437; +sub.f32 f1505, f1199, f1437; +add.f32 f1506, f1202, f1440; +sub.f32 f1508, f1202, f1440; +add.f32 f2135, f1203, f1442; +sub.f32 f1509, f1203, f1442; +add.f32 f1510, f1206, f1445; +sub.f32 f1512, f1206, f1445; +add.f32 f2134, f1207, f1447; +sub.f32 f1513, f1207, f1447; +add.f32 f1514, f1210, f1450; +sub.f32 f1516, f1210, f1450; +add.f32 f2133, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2132, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2131, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2130, f1223, f1467; +sub.f32 f1529, f1223, f1467; 
+and.b32 r14, r28, 32; +bfe.u32 r15, r28, 5, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1534, f2144, f1531; +mul.f32 f1536, f1530, f2144; +mul.f32 f2128, f1530, f1530; +mul.f32 f2129, f1531, f1531; +sub.f32 f1539, f2128, f2129; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1542, f2143, f1541; +mul.f32 f1544, f1539, f2143; +mul.f32 f1546, f1531, f1541; +mul.f32 f2127, f1530, f1539; +sub.f32 f1547, f2127, f1546; +mul.f32 f2126, f1474, f1541; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1550, f2142, f1549; +mul.f32 f1552, f1547, f2142; +mul.f32 f1554, f1531, f1549; +mul.f32 f2125, f1530, f1547; +sub.f32 f1555, f2125, f1554; +mul.f32 f2124, f1478, f1549; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1558, f2141, f1557; +mul.f32 f1560, f1555, f2141; +mul.f32 f2122, f1530, f1555; +mul.f32 f2123, f1531, f1557; +sub.f32 f1563, f2122, f2123; +mul.f32 f2121, f1482, f1557; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1566, f2140, f1565; +mul.f32 f1568, f1563, f2140; +mul.f32 f1570, f1531, f1565; +mul.f32 f2120, f1530, f1563; +sub.f32 f1571, f2120, f1570; +mul.f32 f2119, f1486, f1565; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1574, f2139, f1573; +mul.f32 f1576, f1571, f2139; +mul.f32 f1578, f1531, f1573; +mul.f32 f2118, f1530, f1571; +sub.f32 f1579, f2118, f1578; +mul.f32 f2117, f1490, f1573; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1582, f2138, f1581; +mul.f32 f1584, f1579, f2138; +mul.f32 f2115, f1530, f1579; +mul.f32 f2116, f1531, f1581; +sub.f32 f1587, f2115, f2116; +mul.f32 f2114, f1494, f1581; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1590, f2137, f1589; +mul.f32 f1592, f1587, f2137; +mul.f32 f1594, f1531, f1589; +mul.f32 f2113, f1530, f1587; 
+sub.f32 f1595, f2113, f1594; +mul.f32 f2112, f1498, f1589; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1598, f2136, f1597; +mul.f32 f1600, f1595, f2136; +mul.f32 f2110, f1530, f1595; +mul.f32 f2111, f1531, f1597; +sub.f32 f1603, f2110, f2111; +mul.f32 f2109, f1502, f1597; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1606, f2135, f1605; +mul.f32 f1608, f1603, f2135; +mul.f32 f1610, f1531, f1605; +mul.f32 f2108, f1530, f1603; +sub.f32 f1611, f2108, f1610; +mul.f32 f2107, f1506, f1605; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1614, f2134, f1613; +mul.f32 f1616, f1611, f2134; +mul.f32 f1618, f1531, f1613; +mul.f32 f2106, f1530, f1611; +sub.f32 f1619, f2106, f1618; +mul.f32 f2105, f1510, f1613; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1622, f2133, f1621; +mul.f32 f1624, f1619, f2133; +mul.f32 f2103, f1530, f1619; +mul.f32 f2104, f1531, f1621; +sub.f32 f1627, f2103, f2104; +mul.f32 f2102, f1514, f1621; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1630, f2132, f1629; +mul.f32 f1632, f1627, f2132; +mul.f32 f1634, f1531, f1629; +mul.f32 f2101, f1530, f1627; +sub.f32 f1635, f2101, f1634; +mul.f32 f2100, f1518, f1629; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1638, f2131, f1637; +mul.f32 f1640, f1635, f2131; +mul.f32 f1642, f1531, f1637; +mul.f32 f2099, f1530, f1635; +sub.f32 f1643, f2099, f1642; +mul.f32 f2098, f1522, f1637; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1646, f2130, f1645; +mul.f32 f1648, f1643, f2130; +mul.f32 f2096, f1530, f1643; +mul.f32 f2097, f1531, f1645; +sub.f32 f1651, f2096, f2097; +mul.f32 f2095, f1526, f1645; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2094, f2214, f2170; +mul.f32 f1654, f2094, f1653; +mul.f32 f1656, f1651, f2094; +mul.f32 f1658, f1531, f1653; 
+mul.f32 f2093, f1530, f1651; +sub.f32 f1659, f2093, f1658; +sub.f32 f2092, f1192, f1367; +mul.f32 f2091, f2092, f1653; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1662, f1473, f1661; +mul.f32 f1664, f1659, f1473; +mul.f32 f2089, f1530, f1659; +mul.f32 f2090, f1531, f1661; +sub.f32 f1667, f2089, f2090; +mul.f32 f2088, f1472, f1661; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1670, f1477, f1669; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2087, f1530, f1667; +sub.f32 f1675, f2087, f1674; +mul.f32 f2086, f1476, f1669; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1678, f1481, f1677; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2085, f1530, f1675; +sub.f32 f1683, f2085, f1682; +mul.f32 f2084, f1480, f1677; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1686, f1485, f1685; +mul.f32 f1688, f1683, f1485; +mul.f32 f2082, f1530, f1683; +mul.f32 f2083, f1531, f1685; +sub.f32 f1691, f2082, f2083; +mul.f32 f2081, f1484, f1685; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1694, f1489, f1693; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2080, f1530, f1691; +sub.f32 f1699, f2080, f1698; +mul.f32 f2079, f1488, f1693; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1702, f1493, f1701; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2078, f1530, f1699; +sub.f32 f1707, f2078, f1706; +mul.f32 f2077, f1492, f1701; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1710, f1497, f1709; +mul.f32 f1712, f1707, f1497; +mul.f32 f2075, f1530, f1707; +mul.f32 f2076, f1531, f1709; +sub.f32 f1715, f2075, f2076; +mul.f32 f2074, f1496, f1709; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1718, f1501, f1717; +mul.f32 f1720, f1715, f1501; 
+mul.f32 f1722, f1531, f1717; +mul.f32 f2073, f1530, f1715; +sub.f32 f1723, f2073, f1722; +mul.f32 f2072, f1500, f1717; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1726, f1505, f1725; +mul.f32 f1728, f1723, f1505; +mul.f32 f2070, f1530, f1723; +mul.f32 f2071, f1531, f1725; +sub.f32 f1731, f2070, f2071; +mul.f32 f2069, f1504, f1725; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1734, f1509, f1733; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2068, f1530, f1731; +sub.f32 f1739, f2068, f1738; +mul.f32 f2067, f1508, f1733; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1742, f1513, f1741; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2066, f1530, f1739; +sub.f32 f1747, f2066, f1746; +mul.f32 f2065, f1512, f1741; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1750, f1517, f1749; +mul.f32 f1752, f1747, f1517; +mul.f32 f2063, f1530, f1747; +mul.f32 f2064, f1531, f1749; +sub.f32 f1755, f2063, f2064; +mul.f32 f2062, f1516, f1749; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1758, f1521, f1757; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2061, f1530, f1755; +sub.f32 f1763, f2061, f1762; +mul.f32 f2060, f1520, f1757; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1766, f1525, f1765; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2059, f1530, f1763; +sub.f32 f1771, f2059, f1770; +mul.f32 f2058, f1524, f1765; +mul.f32 f1772, f1530, f1765; +mul.f32 f2057, f1470, f1531; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1529, f1773; +mul.f32 f1775, f1528, f1773; +mul.f32 f1776, f1771, f1529; +and.b32 r16, r33, 248; +add.s32 r17, r9, r16; +mov.u32 r25, %tid.x; +shl.b32 r24, r25, 8; +barrier.sync 0; +and.b32 r18, r24, 8192; +add.s32 r19, r17, r18; +sub.f32 f2465, f2214, f2170; 
+mul.f32 f2464, f1651, f2465; +add.f32 f1777, f2214, f2170; +sub.f32 f2463, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r32, %tid.x; +and.b32 r31, r32, 32; +fma.rn.f32 f1779, f1530, f1470, f1534; +sub.f32 f1780, f1536, f2057; +st.shared.v2.f32 [r19+256], {f1779, f1780}; +fma.rn.f32 f1781, f1539, f1474, f1542; +sub.f32 f1782, f1544, f2126; +st.shared.v2.f32 [r19+512], {f1781, f1782}; +fma.rn.f32 f1783, f1547, f1478, f1550; +sub.f32 f1784, f1552, f2124; +st.shared.v2.f32 [r19+768], {f1783, f1784}; +fma.rn.f32 f1785, f1555, f1482, f1558; +sub.f32 f1786, f1560, f2121; +st.shared.v2.f32 [r19+1024], {f1785, f1786}; +fma.rn.f32 f1787, f1563, f1486, f1566; +sub.f32 f1788, f1568, f2119; +st.shared.v2.f32 [r19+1280], {f1787, f1788}; +sub.f32 f1789, f1576, f2117; +fma.rn.f32 f1790, f1571, f1490, f1574; +st.shared.v2.f32 [r19+1536], {f1790, f1789}; +fma.rn.f32 f1791, f1579, f1494, f1582; +sub.f32 f1792, f1584, f2114; +st.shared.v2.f32 [r19+1792], {f1791, f1792}; +fma.rn.f32 f1793, f1587, f1498, f1590; +sub.f32 f1794, f1592, f2112; +st.shared.v2.f32 [r19+2048], {f1793, f1794}; +fma.rn.f32 f1795, f1595, f1502, f1598; +sub.f32 f1796, f1600, f2109; +st.shared.v2.f32 [r19+2304], {f1795, f1796}; +fma.rn.f32 f1797, f1603, f1506, f1606; +sub.f32 f1798, f1608, f2107; +st.shared.v2.f32 [r19+2560], {f1797, f1798}; +fma.rn.f32 f1799, f1611, f1510, f1614; +sub.f32 f1800, f1616, f2105; +st.shared.v2.f32 [r19+2816], {f1799, f1800}; +fma.rn.f32 f1801, f1619, f1514, f1622; +sub.f32 f1802, f1624, f2102; +st.shared.v2.f32 [r19+3072], {f1801, f1802}; +fma.rn.f32 f1803, f1627, f1518, f1630; +sub.f32 f1804, f1632, f2100; +st.shared.v2.f32 [r19+3328], {f1803, f1804}; +fma.rn.f32 f1805, f1635, f1522, f1638; +sub.f32 f1806, f1640, f2098; +st.shared.v2.f32 [r19+3584], {f1805, f1806}; +fma.rn.f32 f1807, f1643, f1526, f1646; +sub.f32 f1808, f1648, f2095; +st.shared.v2.f32 [r19+3840], {f1807, f1808}; +fma.rn.f32 f1809, f1651, f2463, f1654; +sub.f32 f1810, 
f2464, f2091; +st.shared.v2.f32 [r19+4096], {f1809, f1810}; +fma.rn.f32 f1811, f1659, f1472, f1662; +sub.f32 f1812, f1664, f2088; +st.shared.v2.f32 [r19+4352], {f1811, f1812}; +fma.rn.f32 f1813, f1667, f1476, f1670; +sub.f32 f1814, f1672, f2086; +st.shared.v2.f32 [r19+4608], {f1813, f1814}; +fma.rn.f32 f1815, f1675, f1480, f1678; +sub.f32 f1816, f1680, f2084; +st.shared.v2.f32 [r19+4864], {f1815, f1816}; +fma.rn.f32 f1817, f1683, f1484, f1686; +sub.f32 f1818, f1688, f2081; +st.shared.v2.f32 [r19+5120], {f1817, f1818}; +fma.rn.f32 f1819, f1691, f1488, f1694; +sub.f32 f1820, f1696, f2079; +st.shared.v2.f32 [r19+5376], {f1819, f1820}; +fma.rn.f32 f1821, f1699, f1492, f1702; +sub.f32 f1822, f1704, f2077; +st.shared.v2.f32 [r19+5632], {f1821, f1822}; +fma.rn.f32 f1823, f1707, f1496, f1710; +sub.f32 f1824, f1712, f2074; +st.shared.v2.f32 [r19+5888], {f1823, f1824}; +fma.rn.f32 f1825, f1715, f1500, f1718; +sub.f32 f1826, f1720, f2072; +st.shared.v2.f32 [r19+6144], {f1825, f1826}; +fma.rn.f32 f1827, f1723, f1504, f1726; +sub.f32 f1828, f1728, f2069; +st.shared.v2.f32 [r19+6400], {f1827, f1828}; +fma.rn.f32 f1829, f1731, f1508, f1734; +sub.f32 f1830, f1736, f2067; +st.shared.v2.f32 [r19+6656], {f1829, f1830}; +fma.rn.f32 f1831, f1739, f1512, f1742; +sub.f32 f1832, f1744, f2065; +st.shared.v2.f32 [r19+6912], {f1831, f1832}; +fma.rn.f32 f1833, f1747, f1516, f1750; +sub.f32 f1834, f1752, f2062; +st.shared.v2.f32 [r19+7168], {f1833, f1834}; +fma.rn.f32 f1835, f1755, f1520, f1758; +sub.f32 f1836, f1760, f2060; +st.shared.v2.f32 [r19+7424], {f1835, f1836}; +fma.rn.f32 f1837, f1763, f1524, f1766; +sub.f32 f1838, f1768, f2058; +st.shared.v2.f32 [r19+7680], {f1837, f1838}; +fma.rn.f32 f1839, f1771, f1528, f1774; +sub.f32 f1840, f1776, f1775; +st.shared.v2.f32 [r19+7936], {f1839, f1840}; +barrier.sync 0; +mad.lo.s32 r20, r31, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+512]; +ld.shared.v2.f32 {f1849, f1850}, [r20+1024]; +ld.shared.v2.f32 
{f1853, f1854}, [r20+1536]; +ld.shared.v2.f32 {f1857, f1858}, [r20+2048]; +ld.shared.v2.f32 {f1861, f1862}, [r20+2560]; +ld.shared.v2.f32 {f1865, f1866}, [r20+3072]; +ld.shared.v2.f32 {f1869, f1870}, [r20+3584]; +ld.shared.v2.f32 {f1873, f1874}, [r20+4096]; +ld.shared.v2.f32 {f1877, f1878}, [r20+4608]; +ld.shared.v2.f32 {f1881, f1882}, [r20+5120]; +ld.shared.v2.f32 {f1885, f1886}, [r20+5632]; +ld.shared.v2.f32 {f1889, f1890}, [r20+6144]; +ld.shared.v2.f32 {f1893, f1894}, [r20+6656]; +ld.shared.v2.f32 {f1897, f1898}, [r20+7168]; +ld.shared.v2.f32 {f1901, f1902}, [r20+7680]; +ld.shared.v2.f32 {f1905, f1906}, [r20+8192]; +ld.shared.v2.f32 {f1909, f1910}, [r20+8704]; +ld.shared.v2.f32 {f1913, f1914}, [r20+9216]; +ld.shared.v2.f32 {f1917, f1918}, [r20+9728]; +ld.shared.v2.f32 {f1921, f1922}, [r20+10240]; +ld.shared.v2.f32 {f1925, f1926}, [r20+10752]; +ld.shared.v2.f32 {f1929, f1930}, [r20+11264]; +ld.shared.v2.f32 {f1933, f1934}, [r20+11776]; +ld.shared.v2.f32 {f1937, f1938}, [r20+12288]; +ld.shared.v2.f32 {f1941, f1942}, [r20+12800]; +ld.shared.v2.f32 {f1945, f1946}, [r20+13312]; +ld.shared.v2.f32 {f1949, f1950}, [r20+13824]; +ld.shared.v2.f32 {f1953, f1954}, [r20+14336]; +ld.shared.v2.f32 {f1957, f1958}, [r20+14848]; +ld.shared.v2.f32 {f1961, f1962}, [r20+15360]; +ld.shared.v2.f32 {f1965, f1966}, [r20+15872]; +add.f32 %0, f1841, f1905; +add.f32 %1, f1842, f1906; +add.f32 %3, f1846, f1910; +add.f32 %2, f1845, f1909; +add.f32 %5, f1850, f1914; +add.f32 %4, f1849, f1913; +add.f32 %7, f1854, f1918; +add.f32 %6, f1853, f1917; +add.f32 %8, f1857, f1921; +add.f32 %9, f1858, f1922; +add.f32 %10, f1861, f1925; +add.f32 %11, f1862, f1926; +add.f32 %12, f1865, f1929; +add.f32 %13, f1866, f1930; +add.f32 %15, f1870, f1934; +add.f32 %14, f1869, f1933; +add.f32 %17, f1874, f1938; +add.f32 %16, f1873, f1937; +add.f32 %19, f1878, f1942; +add.f32 %18, f1877, f1941; +add.f32 %20, f1881, f1945; +add.f32 %21, f1882, f1946; +add.f32 %22, f1885, f1949; +add.f32 %23, f1886, f1950; +add.f32 
%24, f1889, f1953; +add.f32 %25, f1890, f1954; +add.f32 %26, f1893, f1957; +add.f32 %27, f1894, f1958; +add.f32 %29, f1898, f1962; +add.f32 %28, f1897, f1961; +add.f32 %31, f1902, f1966; +add.f32 %30, f1901, f1965; +sub.f32 %33, f1842, f1906; +sub.f32 %32, f1841, f1905; +sub.f32 %35, f1846, f1910; +sub.f32 %34, f1845, f1909; +sub.f32 %37, f1850, f1914; +sub.f32 %36, f1849, f1913; +sub.f32 %39, f1854, f1918; +sub.f32 %38, f1853, f1917; +sub.f32 %41, f1858, f1922; +sub.f32 %40, f1857, f1921; +sub.f32 %43, f1862, f1926; +sub.f32 %42, f1861, f1925; +sub.f32 %45, f1866, f1930; +sub.f32 %44, f1865, f1929; +sub.f32 %47, f1870, f1934; +sub.f32 %46, f1869, f1933; +sub.f32 %49, f1874, f1938; +sub.f32 %48, f1873, f1937; +sub.f32 %51, f1878, f1942; +sub.f32 %50, f1877, f1941; +sub.f32 %53, f1882, f1946; +sub.f32 %52, f1881, f1945; +sub.f32 %55, f1886, f1950; +sub.f32 %54, f1885, f1949; +sub.f32 %57, f1890, f1954; +sub.f32 %56, f1889, f1953; +sub.f32 %59, f1894, f1958; +sub.f32 %58, f1893, f1957; +sub.f32 %61, f1898, f1962; +sub.f32 %60, f1897, f1961; +sub.f32 %63, f1902, f1966; +sub.f32 %62, f1901, f1965; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), 
"=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_2048), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<298, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2273>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %64; +add.s32 
r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2271, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2269, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2268, f2271, f2269; +sub.f32 f140, f2271, f2269; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2267, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2264, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2262, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2261, f2264, f2262; +sub.f32 f156, f2264, f2262; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2260, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2260, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2258, f159, 0fBF3504F3; +mul.f32 f2259, f160, 0f3F3504F3; +sub.f32 f167, f2258, f2259; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2257, f2268, f2261; +sub.f32 f173, f2268, f2261; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2256, f2267, f164; +sub.f32 f177, f2267, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2255, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2254, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2252, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2249, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2248, f2252, f2249; +sub.f32 f197, f2252, f2249; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; 
+add.f32 f2247, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2245, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2243, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2242, f2245, f2243; +sub.f32 f213, f2245, f2243; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2241, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2241, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2239, f216, 0fBF3504F3; +mul.f32 f2240, f217, 0f3F3504F3; +sub.f32 f224, f2239, f2240; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2238, f2248, f2242; +sub.f32 f230, f2248, f2242; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2237, f2247, f221; +sub.f32 f234, f2247, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2236, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2235, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2233, f231, 0f3F6C835E; +mul.f32 f2234, f2237, 0f3EC3EF15; +sub.f32 f245, f2233, f2234; +mul.f32 f246, f2237, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2236, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2235, 0f3F6C835E; +mul.f32 f2232, f239, 0f3EC3EF15; +sub.f32 f254, f2232, f253; +mul.f32 f255, f2235, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2231, f233, 0fBEC3EF15; +sub.f32 f259, f2231, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2229, f237, 0fBF3504F3; +mul.f32 f2230, f238, 0f3F3504F3; +sub.f32 f264, f2229, f2230; +mul.f32 f265, f238, 0fBF3504F3; 
+fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2227, f241, 0fBF6C835E; +mul.f32 f2228, f242, 0f3EC3EF15; +sub.f32 f269, f2227, f2228; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2226, f2257, f2238; +sub.f32 f275, f2257, f2238; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2225, f2256, f247; +sub.f32 f279, f2256, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2224, f2255, f251; +sub.f32 f283, f2255, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2223, f2254, f256; +sub.f32 f287, f2254, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2222, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2221, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2220, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2219, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2216, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2214, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2213, f2216, f2214; +sub.f32 f315, f2216, f2214; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2212, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2210, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2207, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2206, f2210, f2207; +sub.f32 f331, f2210, f2207; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2205, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; 
+mul.f32 f337, f2205, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2204, f334, 0fBF3504F3; +sub.f32 f342, f2204, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2203, f2213, f2206; +sub.f32 f348, f2213, f2206; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2202, f2212, f339; +sub.f32 f352, f2212, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2201, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2200, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2198, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2196, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2195, f2198, f2196; +sub.f32 f372, f2198, f2196; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2194, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2191, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2190, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2189, f2191, f2190; +sub.f32 f388, f2191, f2190; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2188, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2188, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2186, f391, 0fBF3504F3; +mul.f32 f2187, f392, 0f3F3504F3; +sub.f32 f399, f2186, f2187; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2185, f2195, f2189; +sub.f32 f405, f2195, f2189; +add.f32 f406, f373, 
f395; +sub.f32 f408, f373, f395; +add.f32 f2184, f2194, f396; +sub.f32 f409, f2194, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2183, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2182, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2184, 0f3EC3EF15; +mul.f32 f2181, f406, 0f3F6C835E; +sub.f32 f420, f2181, f419; +mul.f32 f421, f2184, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2183, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2182, 0f3F6C835E; +mul.f32 f2180, f414, 0f3EC3EF15; +sub.f32 f429, f2180, f428; +mul.f32 f430, f2182, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2179, f408, 0fBEC3EF15; +sub.f32 f434, f2179, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2178, f412, 0fBF3504F3; +sub.f32 f439, f2178, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2177, f416, 0fBF6C835E; +sub.f32 f444, f2177, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2176, f2203, f2185; +sub.f32 f450, f2203, f2185; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2175, f2202, f422; +sub.f32 f454, f2202, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2174, f2201, f426; +sub.f32 f458, f2201, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2173, f2200, f431; +sub.f32 f462, f2200, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2172, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2171, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, 
f439; +add.f32 f2170, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2169, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2175, 0f3E47C5C2; +mul.f32 f2168, f451, 0f3F7B14BE; +sub.f32 f481, f2168, f480; +mul.f32 f482, f2175, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2174, 0f3EC3EF15; +mul.f32 f2167, f455, 0f3F6C835E; +sub.f32 f486, f2167, f485; +mul.f32 f487, f2174, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2173, 0f3F0E39DA; +mul.f32 f2166, f459, 0f3F54DB31; +sub.f32 f491, f2166, f490; +mul.f32 f492, f2173, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2172, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2171, 0f3F54DB31; +mul.f32 f2165, f467, 0f3F0E39DA; +sub.f32 f500, f2165, f499; +mul.f32 f501, f2171, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2170, 0f3F6C835E; +mul.f32 f2164, f471, 0f3EC3EF15; +sub.f32 f505, f2164, f504; +mul.f32 f506, f2170, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2169, 0f3F7B14BE; +mul.f32 f2163, f475, 0f3E47C5C2; +sub.f32 f510, f2163, f509; +mul.f32 f511, f2169, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2162, f453, 0fBE47C5C2; +sub.f32 f515, f2162, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2160, f457, 0fBEC3EF15; +mul.f32 f2161, f458, 0f3F6C835E; +sub.f32 f520, f2160, f2161; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2158, f461, 0fBF0E39DA; +mul.f32 f2159, f462, 0f3F54DB31; +sub.f32 f525, f2158, f2159; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2156, f465, 0fBF3504F3; +mul.f32 f2157, f466, 0f3F3504F3; +sub.f32 f530, f2156, f2157; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 
0f3F3504F3, f531; +mul.f32 f2154, f469, 0fBF54DB31; +mul.f32 f2155, f470, 0f3F0E39DA; +sub.f32 f535, f2154, f2155; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2153, f473, 0fBF6C835E; +sub.f32 f540, f2153, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2152, f477, 0fBF7B14BE; +sub.f32 f545, f2152, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2151, f2226, f2176; +sub.f32 f551, f2226, f2176; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2150, f2225, f483; +sub.f32 f555, f2225, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2149, f2224, f488; +sub.f32 f559, f2224, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2148, f2223, f493; +sub.f32 f563, f2223, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2147, f2222, f497; +sub.f32 f567, f2222, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f2146, f2221, f502; +sub.f32 f571, f2221, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f2145, f2220, f507; +sub.f32 f575, f2220, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f2144, f2219, f512; +sub.f32 f579, f2219, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f2143, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f2142, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f2141, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f2140, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f2139, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; 
+sub.f32 f602, f294, f535; +add.f32 f2138, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2137, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2136, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f2150, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f2150; +sub.f32 f620, f619, f618; +mul.f32 f2134, f612, f612; +mul.f32 f2135, f613, f613; +sub.f32 f623, f2134, f2135; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f2149, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f2149; +sub.f32 f630, f629, f628; +mul.f32 f2132, f612, f623; +mul.f32 f2133, f613, f625; +sub.f32 f633, f2132, f2133; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f2148, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f2148; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f2131, f612, f633; +sub.f32 f643, f2131, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f2147, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f2147; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f2130, f612, f643; +sub.f32 f653, f2130, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f2146, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f2146; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f2129, f612, f653; +sub.f32 f663, f2129, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f2145, f665; +fma.rn.f32 f667, f663, f572, f666; 
+mul.f32 f668, f572, f665; +mul.f32 f669, f663, f2145; +sub.f32 f670, f669, f668; +mul.f32 f2127, f612, f663; +mul.f32 f2128, f613, f665; +sub.f32 f673, f2127, f2128; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f2144, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f2144; +sub.f32 f680, f679, f678; +mul.f32 f2125, f612, f673; +mul.f32 f2126, f613, f675; +sub.f32 f683, f2125, f2126; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f2143, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f2143; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f2124, f612, f683; +sub.f32 f693, f2124, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f2142, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f2142; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f2123, f612, f693; +sub.f32 f703, f2123, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f2141, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f2141; +sub.f32 f710, f709, f708; +mul.f32 f2121, f612, f703; +mul.f32 f2122, f613, f705; +sub.f32 f713, f2121, f2122; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f2140, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f2140; +sub.f32 f720, f719, f718; +mul.f32 f2119, f612, f713; +mul.f32 f2120, f613, f715; +sub.f32 f723, f2119, f2120; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f2139, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f2139; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f2118, f612, f723; +sub.f32 f733, f2118, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 
f736, f2138, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f2138; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f2117, f612, f733; +sub.f32 f743, f2117, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f2137, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f2137; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f2116, f612, f743; +sub.f32 f753, f2116, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f2136, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f2136; +sub.f32 f760, f759, f758; +mul.f32 f2114, f612, f753; +mul.f32 f2115, f613, f755; +sub.f32 f763, f2114, f2115; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f2112, f612, f763; +mul.f32 f2113, f613, f765; +sub.f32 f773, f2112, f2113; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f2111, f612, f773; +sub.f32 f783, f2111, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f2110, f612, f783; +sub.f32 f793, f2110, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f2109, f612, f793; +sub.f32 f803, f2109, f802; +mul.f32 f804, f612, f795; 
+fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f2107, f612, f803; +mul.f32 f2108, f613, f805; +sub.f32 f813, f2107, f2108; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f2105, f612, f813; +mul.f32 f2106, f613, f815; +sub.f32 f823, f2105, f2106; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f2104, f612, f823; +sub.f32 f833, f2104, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f2103, f612, f833; +sub.f32 f843, f2103, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f2101, f612, f843; +mul.f32 f2102, f613, f845; +sub.f32 f853, f2101, f2102; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f2099, f612, f853; +mul.f32 f2100, f613, f855; +sub.f32 f863, f2099, f2100; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f2098, f612, f863; +sub.f32 f873, 
f2098, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f2097, f612, f873; +sub.f32 f883, f2097, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f2096, f612, f883; +sub.f32 f893, f2096, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f2094, f612, f893; +mul.f32 f2095, f613, f895; +sub.f32 f903, f2094, f2095; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f2092, f612, f903; +mul.f32 f2093, f613, f905; +sub.f32 f913, f2092, f2093; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8064; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +and.b32 r23, 
r32, 63; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+256]; +ld.shared.f32 f923, [r13+512]; +ld.shared.f32 f924, [r13+768]; +ld.shared.f32 f925, [r13+1024]; +ld.shared.f32 f926, [r13+1280]; +ld.shared.f32 f927, [r13+1536]; +ld.shared.f32 f928, [r13+1792]; +ld.shared.f32 f929, [r13+2048]; +ld.shared.f32 f930, [r13+2304]; +ld.shared.f32 f931, [r13+2560]; +ld.shared.f32 f932, [r13+2816]; +ld.shared.f32 f933, [r13+3072]; +ld.shared.f32 f934, [r13+3328]; +ld.shared.f32 f935, [r13+3584]; +ld.shared.f32 f936, [r13+3840]; +ld.shared.f32 f937, [r13+4096]; +ld.shared.f32 f938, [r13+4352]; +ld.shared.f32 f939, [r13+4608]; +ld.shared.f32 f940, [r13+4864]; +ld.shared.f32 f941, [r13+5120]; +ld.shared.f32 f942, [r13+5376]; +ld.shared.f32 f943, [r13+5632]; +ld.shared.f32 f944, [r13+5888]; +ld.shared.f32 f945, [r13+6144]; +ld.shared.f32 f946, [r13+6400]; +ld.shared.f32 f947, [r13+6656]; +ld.shared.f32 f948, [r13+6912]; +ld.shared.f32 f949, [r13+7168]; +ld.shared.f32 f950, [r13+7424]; +ld.shared.f32 f951, [r13+7680]; +ld.shared.f32 f952, [r13+7936]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2151, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+256]; +ld.shared.f32 f955, [r13+512]; +ld.shared.f32 f956, [r13+768]; +ld.shared.f32 f957, [r13+1024]; +ld.shared.f32 f958, [r13+1280]; +ld.shared.f32 f959, [r13+1536]; +ld.shared.f32 f960, [r13+1792]; +ld.shared.f32 f961, [r13+2048]; +ld.shared.f32 f962, [r13+2304]; +ld.shared.f32 f963, [r13+2560]; +ld.shared.f32 f964, [r13+2816]; +ld.shared.f32 f965, [r13+3072]; +ld.shared.f32 f966, 
[r13+3328]; +ld.shared.f32 f967, [r13+3584]; +ld.shared.f32 f968, [r13+3840]; +ld.shared.f32 f969, [r13+4096]; +ld.shared.f32 f970, [r13+4352]; +ld.shared.f32 f971, [r13+4608]; +ld.shared.f32 f972, [r13+4864]; +ld.shared.f32 f973, [r13+5120]; +ld.shared.f32 f974, [r13+5376]; +ld.shared.f32 f975, [r13+5632]; +ld.shared.f32 f976, [r13+5888]; +ld.shared.f32 f977, [r13+6144]; +ld.shared.f32 f978, [r13+6400]; +ld.shared.f32 f979, [r13+6656]; +ld.shared.f32 f980, [r13+6912]; +ld.shared.f32 f981, [r13+7168]; +ld.shared.f32 f982, [r13+7424]; +ld.shared.f32 f983, [r13+7680]; +ld.shared.f32 f984, [r13+7936]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2091, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2090, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2089, f2091, f2090; +sub.f32 f996, f2091, f2090; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f2088, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2087, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2086, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2085, f2087, f2086; +sub.f32 f1012, f2087, f2086; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f2084, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f2084, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f2082, f1015, 0fBF3504F3; +mul.f32 f2083, f1016, 0f3F3504F3; +sub.f32 f1023, f2082, f2083; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2081, f2089, f2085; +sub.f32 f1029, f2089, f2085; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; 
+add.f32 f2080, f2088, f1020; +sub.f32 f1033, f2088, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f2079, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f2078, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2077, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2076, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2075, f2077, f2076; +sub.f32 f1053, f2077, f2076; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f2074, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2073, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2072, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2071, f2073, f2072; +sub.f32 f1069, f2073, f2072; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f2070, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f2070, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f2068, f1072, 0fBF3504F3; +mul.f32 f2069, f1073, 0f3F3504F3; +sub.f32 f1080, f2068, f2069; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2067, f2075, f2071; +sub.f32 f1086, f2075, f2071; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2066, f2074, f1077; +sub.f32 f1090, f2074, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f2065, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f2064, f1057, f1082; +sub.f32 f1098, 
f1057, f1082; +mul.f32 f2062, f1087, 0f3F6C835E; +mul.f32 f2063, f2066, 0f3EC3EF15; +sub.f32 f1101, f2062, f2063; +mul.f32 f1102, f2066, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f2065, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f2060, f1095, 0f3EC3EF15; +mul.f32 f2061, f2064, 0f3F6C835E; +sub.f32 f1110, f2060, f2061; +mul.f32 f1111, f2064, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f2058, f1089, 0fBEC3EF15; +mul.f32 f2059, f1090, 0f3F6C835E; +sub.f32 f1115, f2058, f2059; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f2056, f1093, 0fBF3504F3; +mul.f32 f2057, f1094, 0f3F3504F3; +sub.f32 f1120, f2056, f2057; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f2054, f1097, 0fBF6C835E; +mul.f32 f2055, f1098, 0f3EC3EF15; +sub.f32 f1125, f2054, f2055; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2053, f2081, f2067; +sub.f32 f1131, f2081, f2067; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2052, f2080, f1103; +sub.f32 f1135, f2080, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2051, f2079, f1107; +sub.f32 f1139, f2079, f1107; +add.f32 f1140, f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f2050, f2078, f1112; +sub.f32 f1143, f2078, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f2049, f1029, f1085; +sub.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f2048, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 f1154, f1036, f1120; +add.f32 f2047, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2046, f1041, f1127; +sub.f32 f1159, f1041, 
f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2045, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2044, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2043, f2045, f2044; +sub.f32 f1171, f2045, f2044; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f2042, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2041, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2040, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2039, f2041, f2040; +sub.f32 f1187, f2041, f2040; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f2038, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f2038, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f2036, f1190, 0fBF3504F3; +mul.f32 f2037, f1191, 0f3F3504F3; +sub.f32 f1198, f2036, f2037; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2035, f2043, f2039; +sub.f32 f1204, f2043, f2039; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2034, f2042, f1195; +sub.f32 f1208, f2042, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f2033, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; +sub.f32 f1215, f1174, f1198; +add.f32 f2032, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2031, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2030, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, 
f1217, f1221; +add.f32 f2029, f2031, f2030; +sub.f32 f1228, f2031, f2030; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f2028, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2027, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2026, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2025, f2027, f2026; +sub.f32 f1244, f2027, f2026; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f2024, f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f2024, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f2022, f1247, 0fBF3504F3; +mul.f32 f2023, f1248, 0f3F3504F3; +sub.f32 f1255, f2022, f2023; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2021, f2029, f2025; +sub.f32 f1261, f2029, f2025; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2020, f2028, f1252; +sub.f32 f1265, f2028, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f2019, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f2018, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2016, f1262, 0f3F6C835E; +mul.f32 f2017, f2020, 0f3EC3EF15; +sub.f32 f1276, f2016, f2017; +mul.f32 f1277, f2020, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f2019, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f2018, 0f3F6C835E; +mul.f32 f2015, f1270, 0f3EC3EF15; +sub.f32 f1285, f2015, f1284; +mul.f32 f1286, f2018, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f2014, f1264, 
0fBEC3EF15; +sub.f32 f1290, f2014, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f2012, f1268, 0fBF3504F3; +mul.f32 f2013, f1269, 0f3F3504F3; +sub.f32 f1295, f2012, f2013; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296; +mul.f32 f2010, f1272, 0fBF6C835E; +mul.f32 f2011, f1273, 0f3EC3EF15; +sub.f32 f1300, f2010, f2011; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2009, f2035, f2021; +sub.f32 f1306, f2035, f2021; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2008, f2034, f1278; +sub.f32 f1310, f2034, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2007, f2033, f1282; +sub.f32 f1314, f2033, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f2006, f2032, f1287; +sub.f32 f1318, f2032, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f2005, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f2004, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f2003, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2002, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2008, 0f3E47C5C2; +mul.f32 f2001, f1307, 0f3F7B14BE; +sub.f32 f1337, f2001, f1336; +mul.f32 f1338, f2008, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, f2007, 0f3EC3EF15; +mul.f32 f2000, f1311, 0f3F6C835E; +sub.f32 f1342, f2000, f1341; +mul.f32 f1343, f2007, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; +mul.f32 f1998, f1315, 0f3F54DB31; +mul.f32 f1999, f2006, 0f3F0E39DA; +sub.f32 f1347, f1998, f1999; +mul.f32 f1348, f2006, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, 
f1319, 0f3F3504F3; +mul.f32 f1351, f2005, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f2004, 0f3F54DB31; +mul.f32 f1997, f1323, 0f3F0E39DA; +sub.f32 f1356, f1997, f1355; +mul.f32 f1357, f2004, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357; +mul.f32 f1360, f2003, 0f3F6C835E; +mul.f32 f1996, f1327, 0f3EC3EF15; +sub.f32 f1361, f1996, f1360; +mul.f32 f1362, f2003, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f1994, f1331, 0f3E47C5C2; +mul.f32 f1995, f2002, 0f3F7B14BE; +sub.f32 f1366, f1994, f1995; +mul.f32 f1367, f2002, 0f3E47C5C2; +fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f1992, f1309, 0fBE47C5C2; +mul.f32 f1993, f1310, 0f3F7B14BE; +sub.f32 f1371, f1992, f1993; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f1990, f1313, 0fBEC3EF15; +mul.f32 f1991, f1314, 0f3F6C835E; +sub.f32 f1376, f1990, f1991; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f1988, f1317, 0fBF0E39DA; +mul.f32 f1989, f1318, 0f3F54DB31; +sub.f32 f1381, f1988, f1989; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f1987, f1321, 0fBF3504F3; +sub.f32 f1386, f1987, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 f1986, f1325, 0fBF54DB31; +sub.f32 f1391, f1986, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f1985, f1329, 0fBF6C835E; +sub.f32 f1396, f1985, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f1984, f1333, 0fBF7B14BE; +sub.f32 f1401, f1984, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f1983, f2053, 
f2009; +sub.f32 f1407, f2053, f2009; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f1982, f2052, f1339; +sub.f32 f1411, f2052, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f1981, f2051, f1344; +sub.f32 f1415, f2051, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f1980, f2050, f1349; +sub.f32 f1419, f2050, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f1979, f2049, f1353; +sub.f32 f1423, f2049, f1353; +add.f32 f1424, f1148, f1356; +sub.f32 f1426, f1148, f1356; +add.f32 f1978, f2048, f1358; +sub.f32 f1427, f2048, f1358; +add.f32 f1428, f1152, f1361; +sub.f32 f1430, f1152, f1361; +add.f32 f1977, f2047, f1363; +sub.f32 f1431, f2047, f1363; +add.f32 f1432, f1156, f1366; +sub.f32 f1434, f1156, f1366; +add.f32 f1976, f2046, f1368; +sub.f32 f1435, f2046, f1368; +sub.f32 f1436, f1130, f1306; +add.f32 f1438, f1130, f1306; +add.f32 f1975, f1131, f1305; +sub.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1371; +sub.f32 f1442, f1134, f1371; +add.f32 f1974, f1135, f1373; +sub.f32 f1443, f1135, f1373; +add.f32 f1444, f1138, f1376; +sub.f32 f1446, f1138, f1376; +add.f32 f1973, f1139, f1378; +sub.f32 f1447, f1139, f1378; +add.f32 f1448, f1142, f1381; +sub.f32 f1450, f1142, f1381; +add.f32 f1972, f1143, f1383; +sub.f32 f1451, f1143, f1383; +add.f32 f1452, f1146, f1386; +sub.f32 f1454, f1146, f1386; +add.f32 f1971, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f1970, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f1969, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f1968, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1472, f1982, 
f1469; +fma.rn.f32 f1473, f1468, f1408, f1472; +mul.f32 f1474, f1408, f1469; +mul.f32 f1475, f1468, f1982; +sub.f32 f1476, f1475, f1474; +mul.f32 f1478, f1469, f1469; +mul.f32 f1967, f1468, f1468; +sub.f32 f1479, f1967, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1482, f1981, f1481; +fma.rn.f32 f1483, f1479, f1412, f1482; +mul.f32 f1484, f1412, f1481; +mul.f32 f1485, f1479, f1981; +sub.f32 f1486, f1485, f1484; +mul.f32 f1965, f1468, f1479; +mul.f32 f1966, f1469, f1481; +sub.f32 f1489, f1965, f1966; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f1492, f1980, f1491; +fma.rn.f32 f1493, f1489, f1416, f1492; +mul.f32 f1494, f1416, f1491; +mul.f32 f1495, f1489, f1980; +sub.f32 f1496, f1495, f1494; +mul.f32 f1963, f1468, f1489; +mul.f32 f1964, f1469, f1491; +sub.f32 f1499, f1963, f1964; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f1502, f1979, f1501; +fma.rn.f32 f1503, f1499, f1420, f1502; +mul.f32 f1504, f1420, f1501; +mul.f32 f1505, f1499, f1979; +sub.f32 f1506, f1505, f1504; +mul.f32 f1508, f1469, f1501; +mul.f32 f1962, f1468, f1499; +sub.f32 f1509, f1962, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1512, f1978, f1511; +fma.rn.f32 f1513, f1509, f1424, f1512; +mul.f32 f1514, f1424, f1511; +mul.f32 f1515, f1509, f1978; +sub.f32 f1516, f1515, f1514; +mul.f32 f1518, f1469, f1511; +mul.f32 f1961, f1468, f1509; +sub.f32 f1519, f1961, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1522, f1977, f1521; +fma.rn.f32 f1523, f1519, f1428, f1522; +mul.f32 f1524, f1428, f1521; +mul.f32 f1525, f1519, f1977; +sub.f32 f1526, f1525, f1524; +mul.f32 f1528, f1469, f1521; +mul.f32 f1960, f1468, f1519; +sub.f32 f1529, f1960, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1532, f1976, f1531; +fma.rn.f32 f1533, f1529, f1432, f1532; +mul.f32 f1534, f1432, f1531; 
+mul.f32 f1535, f1529, f1976; +sub.f32 f1536, f1535, f1534; +mul.f32 f1958, f1468, f1529; +mul.f32 f1959, f1469, f1531; +sub.f32 f1539, f1958, f1959; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1542, f1975, f1541; +fma.rn.f32 f1543, f1539, f1436, f1542; +mul.f32 f1544, f1436, f1541; +mul.f32 f1545, f1539, f1975; +sub.f32 f1546, f1545, f1544; +mul.f32 f1956, f1468, f1539; +mul.f32 f1957, f1469, f1541; +sub.f32 f1549, f1956, f1957; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1552, f1974, f1551; +fma.rn.f32 f1553, f1549, f1440, f1552; +mul.f32 f1554, f1440, f1551; +mul.f32 f1555, f1549, f1974; +sub.f32 f1556, f1555, f1554; +mul.f32 f1558, f1469, f1551; +mul.f32 f1955, f1468, f1549; +sub.f32 f1559, f1955, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1562, f1973, f1561; +fma.rn.f32 f1563, f1559, f1444, f1562; +mul.f32 f1564, f1444, f1561; +mul.f32 f1565, f1559, f1973; +sub.f32 f1566, f1565, f1564; +mul.f32 f1568, f1469, f1561; +mul.f32 f1954, f1468, f1559; +sub.f32 f1569, f1954, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1572, f1972, f1571; +fma.rn.f32 f1573, f1569, f1448, f1572; +mul.f32 f1574, f1448, f1571; +mul.f32 f1575, f1569, f1972; +sub.f32 f1576, f1575, f1574; +mul.f32 f1578, f1469, f1571; +mul.f32 f1953, f1468, f1569; +sub.f32 f1579, f1953, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1582, f1971, f1581; +fma.rn.f32 f1583, f1579, f1452, f1582; +mul.f32 f1584, f1452, f1581; +mul.f32 f1585, f1579, f1971; +sub.f32 f1586, f1585, f1584; +mul.f32 f1951, f1468, f1579; +mul.f32 f1952, f1469, f1581; +sub.f32 f1589, f1951, f1952; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f1592, f1970, f1591; +fma.rn.f32 f1593, f1589, f1456, f1592; +mul.f32 f1594, f1456, f1591; +mul.f32 f1595, f1589, f1970; +sub.f32 f1596, f1595, f1594; +mul.f32 f1598, 
f1469, f1591; +mul.f32 f1950, f1468, f1589; +sub.f32 f1599, f1950, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1602, f1969, f1601; +fma.rn.f32 f1603, f1599, f1460, f1602; +mul.f32 f1604, f1460, f1601; +mul.f32 f1605, f1599, f1969; +sub.f32 f1606, f1605, f1604; +mul.f32 f1608, f1469, f1601; +mul.f32 f1949, f1468, f1599; +sub.f32 f1609, f1949, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1612, f1968, f1611; +fma.rn.f32 f1613, f1609, f1464, f1612; +mul.f32 f1614, f1464, f1611; +mul.f32 f1615, f1609, f1968; +sub.f32 f1616, f1615, f1614; +mul.f32 f1618, f1469, f1611; +mul.f32 f1948, f1468, f1609; +sub.f32 f1619, f1948, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1622, f1407, f1621; +fma.rn.f32 f1623, f1619, f1406, f1622; +mul.f32 f1624, f1406, f1621; +mul.f32 f1625, f1619, f1407; +sub.f32 f1626, f1625, f1624; +mul.f32 f1946, f1468, f1619; +mul.f32 f1947, f1469, f1621; +sub.f32 f1629, f1946, f1947; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1632, f1411, f1631; +fma.rn.f32 f1633, f1629, f1410, f1632; +mul.f32 f1634, f1410, f1631; +mul.f32 f1635, f1629, f1411; +sub.f32 f1636, f1635, f1634; +mul.f32 f1944, f1468, f1629; +mul.f32 f1945, f1469, f1631; +sub.f32 f1639, f1944, f1945; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f1642, f1415, f1641; +fma.rn.f32 f1643, f1639, f1414, f1642; +mul.f32 f1644, f1414, f1641; +mul.f32 f1645, f1639, f1415; +sub.f32 f1646, f1645, f1644; +mul.f32 f1648, f1469, f1641; +mul.f32 f1943, f1468, f1639; +sub.f32 f1649, f1943, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1652, f1419, f1651; +fma.rn.f32 f1653, f1649, f1418, f1652; +mul.f32 f1654, f1418, f1651; +mul.f32 f1655, f1649, f1419; +sub.f32 f1656, f1655, f1654; +mul.f32 f1658, f1469, f1651; +mul.f32 f1942, f1468, f1649; +sub.f32 f1659, f1942, f1658; 
+mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1662, f1423, f1661; +fma.rn.f32 f1663, f1659, f1422, f1662; +mul.f32 f1664, f1422, f1661; +mul.f32 f1665, f1659, f1423; +sub.f32 f1666, f1665, f1664; +mul.f32 f1668, f1469, f1661; +mul.f32 f1941, f1468, f1659; +sub.f32 f1669, f1941, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1672, f1427, f1671; +fma.rn.f32 f1673, f1669, f1426, f1672; +mul.f32 f1674, f1426, f1671; +mul.f32 f1675, f1669, f1427; +sub.f32 f1676, f1675, f1674; +mul.f32 f1939, f1468, f1669; +mul.f32 f1940, f1469, f1671; +sub.f32 f1679, f1939, f1940; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1682, f1431, f1681; +fma.rn.f32 f1683, f1679, f1430, f1682; +mul.f32 f1684, f1430, f1681; +mul.f32 f1685, f1679, f1431; +sub.f32 f1686, f1685, f1684; +mul.f32 f1937, f1468, f1679; +mul.f32 f1938, f1469, f1681; +sub.f32 f1689, f1937, f1938; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1692, f1435, f1691; +fma.rn.f32 f1693, f1689, f1434, f1692; +mul.f32 f1694, f1434, f1691; +mul.f32 f1695, f1689, f1435; +sub.f32 f1696, f1695, f1694; +mul.f32 f1698, f1469, f1691; +mul.f32 f1936, f1468, f1689; +sub.f32 f1699, f1936, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1702, f1439, f1701; +fma.rn.f32 f1703, f1699, f1438, f1702; +mul.f32 f1704, f1438, f1701; +mul.f32 f1705, f1699, f1439; +sub.f32 f1706, f1705, f1704; +mul.f32 f1708, f1469, f1701; +mul.f32 f1935, f1468, f1699; +sub.f32 f1709, f1935, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1712, f1443, f1711; +fma.rn.f32 f1713, f1709, f1442, f1712; +mul.f32 f1714, f1442, f1711; +mul.f32 f1715, f1709, f1443; +sub.f32 f1716, f1715, f1714; +mul.f32 f1933, f1468, f1709; +mul.f32 f1934, f1469, f1711; +sub.f32 f1719, f1933, f1934; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 
f1722, f1447, f1721; +fma.rn.f32 f1723, f1719, f1446, f1722; +mul.f32 f1724, f1446, f1721; +mul.f32 f1725, f1719, f1447; +sub.f32 f1726, f1725, f1724; +mul.f32 f1931, f1468, f1719; +mul.f32 f1932, f1469, f1721; +sub.f32 f1729, f1931, f1932; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1732, f1451, f1731; +fma.rn.f32 f1733, f1729, f1450, f1732; +mul.f32 f1734, f1450, f1731; +mul.f32 f1735, f1729, f1451; +sub.f32 f1736, f1735, f1734; +mul.f32 f1738, f1469, f1731; +mul.f32 f1930, f1468, f1729; +sub.f32 f1739, f1930, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1742, f1455, f1741; +fma.rn.f32 f1743, f1739, f1454, f1742; +mul.f32 f1744, f1454, f1741; +mul.f32 f1745, f1739, f1455; +sub.f32 f1746, f1745, f1744; +mul.f32 f1748, f1469, f1741; +mul.f32 f1929, f1468, f1739; +sub.f32 f1749, f1929, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1752, f1459, f1751; +fma.rn.f32 f1753, f1749, f1458, f1752; +mul.f32 f1754, f1458, f1751; +mul.f32 f1755, f1749, f1459; +sub.f32 f1756, f1755, f1754; +mul.f32 f1758, f1469, f1751; +mul.f32 f1928, f1468, f1749; +sub.f32 f1759, f1928, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1762, f1463, f1761; +fma.rn.f32 f1763, f1759, f1462, f1762; +mul.f32 f1764, f1462, f1761; +mul.f32 f1765, f1759, f1463; +sub.f32 f1766, f1765, f1764; +mul.f32 f1926, f1468, f1759; +mul.f32 f1927, f1469, f1761; +sub.f32 f1769, f1926, f1927; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f1772, f1467, f1771; +fma.rn.f32 f1773, f1769, f1466, f1772; +mul.f32 f1774, f1466, f1771; +mov.u32 r33, %tid.x; +mul.f32 f1775, f1769, f1467; +sub.f32 f1776, f1775, f1774; +and.b32 r22, r33, 32; +shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 4096; 
+add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1473; +st.shared.f32 [r20+256], f1483; +st.shared.f32 [r20+384], f1493; +st.shared.f32 [r20+512], f1503; +st.shared.f32 [r20+640], f1513; +st.shared.f32 [r20+768], f1523; +st.shared.f32 [r20+896], f1533; +st.shared.f32 [r20+1024], f1543; +st.shared.f32 [r20+1152], f1553; +st.shared.f32 [r20+1280], f1563; +st.shared.f32 [r20+1408], f1573; +st.shared.f32 [r20+1536], f1583; +st.shared.f32 [r20+1664], f1593; +st.shared.f32 [r20+1792], f1603; +st.shared.f32 [r20+1920], f1613; +st.shared.f32 [r20+2048], f1623; +st.shared.f32 [r20+2176], f1633; +st.shared.f32 [r20+2304], f1643; +st.shared.f32 [r20+2432], f1653; +st.shared.f32 [r20+2560], f1663; +st.shared.f32 [r20+2688], f1673; +st.shared.f32 [r20+2816], f1683; +st.shared.f32 [r20+2944], f1693; +st.shared.f32 [r20+3072], f1703; +st.shared.f32 [r20+3200], f1713; +st.shared.f32 [r20+3328], f1723; +st.shared.f32 [r20+3456], f1733; +st.shared.f32 [r20+3584], f1743; +st.shared.f32 [r20+3712], f1753; +st.shared.f32 [r20+3840], f1763; +st.shared.f32 [r20+3968], f1773; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+256]; +ld.shared.f32 f1779, [r21+512]; +ld.shared.f32 f1780, [r21+768]; +ld.shared.f32 f1781, [r21+1024]; +ld.shared.f32 f1782, [r21+1280]; +ld.shared.f32 f1783, [r21+1536]; +ld.shared.f32 f1784, [r21+1792]; +ld.shared.f32 f1785, [r21+2048]; +ld.shared.f32 f1786, [r21+2304]; +ld.shared.f32 f1787, [r21+2560]; +ld.shared.f32 f1788, [r21+2816]; +ld.shared.f32 f1789, [r21+3072]; +ld.shared.f32 f1790, [r21+3328]; +ld.shared.f32 f1791, [r21+3584]; +ld.shared.f32 f1792, [r21+3840]; +ld.shared.f32 f1793, [r21+4096]; +ld.shared.f32 f1794, [r21+4352]; +ld.shared.f32 f1795, [r21+4608]; +ld.shared.f32 f1796, [r21+4864]; +ld.shared.f32 f1797, [r21+5120]; +ld.shared.f32 f1798, [r21+5376]; +ld.shared.f32 f1799, [r21+5632]; +ld.shared.f32 f1800, [r21+5888]; +ld.shared.f32 f1801, [r21+6144]; 
+ld.shared.f32 f1802, [r21+6400]; +ld.shared.f32 f1803, [r21+6656]; +ld.shared.f32 f1804, [r21+6912]; +ld.shared.f32 f1805, [r21+7168]; +ld.shared.f32 f1806, [r21+7424]; +ld.shared.f32 f1807, [r21+7680]; +ld.shared.f32 f1808, [r21+7936]; +barrier.sync 0; +st.shared.f32 [r20], f1983; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+256]; +ld.shared.f32 f1811, [r21+512]; +ld.shared.f32 f1812, [r21+768]; +ld.shared.f32 f1813, [r21+1024]; +ld.shared.f32 f1814, [r21+1280]; +ld.shared.f32 f1815, [r21+1536]; +ld.shared.f32 f1816, [r21+1792]; +ld.shared.f32 f1817, [r21+2048]; +ld.shared.f32 f1818, [r21+2304]; +ld.shared.f32 f1819, [r21+2560]; +ld.shared.f32 f1820, [r21+2816]; +ld.shared.f32 f1821, [r21+3072]; +ld.shared.f32 f1822, [r21+3328]; +ld.shared.f32 f1823, [r21+3584]; +ld.shared.f32 f1824, [r21+3840]; +ld.shared.f32 f1825, [r21+4096]; +ld.shared.f32 f1826, [r21+4352]; +ld.shared.f32 f1827, [r21+4608]; 
+ld.shared.f32 f1828, [r21+4864]; +ld.shared.f32 f1829, [r21+5120]; +ld.shared.f32 f1830, [r21+5376]; +ld.shared.f32 f1831, [r21+5632]; +ld.shared.f32 f1832, [r21+5888]; +ld.shared.f32 f1833, [r21+6144]; +ld.shared.f32 f1834, [r21+6400]; +ld.shared.f32 f1835, [r21+6656]; +ld.shared.f32 f1836, [r21+6912]; +ld.shared.f32 f1837, [r21+7168]; +ld.shared.f32 f1838, [r21+7424]; +ld.shared.f32 f1839, [r21+7680]; +ld.shared.f32 f1840, [r21+7936]; +add.f32 %1, f1809, f1825; +add.f32 %0, f1777, f1793; +add.f32 %3, f1810, f1826; +add.f32 %2, f1778, f1794; +add.f32 %5, f1811, f1827; +add.f32 %4, f1779, f1795; +add.f32 %7, f1812, f1828; +add.f32 %6, f1780, f1796; +add.f32 %8, f1781, f1797; +add.f32 %9, f1813, f1829; +add.f32 %10, f1782, f1798; +add.f32 %11, f1814, f1830; +add.f32 %13, f1815, f1831; +add.f32 %12, f1783, f1799; +add.f32 %15, f1816, f1832; +add.f32 %14, f1784, f1800; +add.f32 %17, f1817, f1833; +add.f32 %16, f1785, f1801; +add.f32 %19, f1818, f1834; +add.f32 %18, f1786, f1802; +add.f32 %20, f1787, f1803; +add.f32 %21, f1819, f1835; +add.f32 %22, f1788, f1804; +add.f32 %23, f1820, f1836; +add.f32 %24, f1789, f1805; +add.f32 %25, f1821, f1837; +add.f32 %27, f1822, f1838; +add.f32 %26, f1790, f1806; +add.f32 %29, f1823, f1839; +add.f32 %28, f1791, f1807; +add.f32 %31, f1824, f1840; +add.f32 %30, f1792, f1808; +sub.f32 %32, f1777, f1793; +sub.f32 %33, f1809, f1825; +sub.f32 %34, f1778, f1794; +sub.f32 %35, f1810, f1826; +sub.f32 %36, f1779, f1795; +sub.f32 %37, f1811, f1827; +sub.f32 %38, f1780, f1796; +sub.f32 %39, f1812, f1828; +sub.f32 %40, f1781, f1797; +sub.f32 %41, f1813, f1829; +sub.f32 %42, f1782, f1798; +sub.f32 %43, f1814, f1830; +sub.f32 %44, f1783, f1799; +sub.f32 %45, f1815, f1831; +sub.f32 %46, f1784, f1800; +sub.f32 %47, f1816, f1832; +sub.f32 %48, f1785, f1801; +sub.f32 %49, f1817, f1833; +sub.f32 %50, f1786, f1802; +sub.f32 %51, f1818, f1834; +sub.f32 %52, f1787, f1803; +sub.f32 %53, f1819, f1835; +sub.f32 %54, f1788, f1804; +sub.f32 %55, f1820, f1836; 
+sub.f32 %56, f1789, f1805; +sub.f32 %57, f1821, f1837; +sub.f32 %58, f1790, f1806; +sub.f32 %59, f1822, f1838; +sub.f32 %60, f1791, f1807; +sub.f32 %61, f1823, f1839; +sub.f32 %62, f1792, f1808; +sub.f32 %63, f1824, f1840; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_2048), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), 
"f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + +
/*
 * cufftdx_private_function<299, float, 1>
 *
 * Explicit specialization whose entire body is a single inline-PTX `asm volatile` statement.
 * The instruction order, the f/r/rd register numbering and the positional %N operands are all
 * load-bearing, so this text should be regenerated rather than hand-edited.
 *
 * Operand map (taken from the constraint lists that close the asm statement):
 *   %0  .. %7   outputs: rmem[0..3].x and .y, i.e. the four complex values are overwritten.
 *   %8          smem, a 32-bit value used as the base of every st.shared / ld.shared address;
 *               the body first adds (%tid.y << 14) to it.
 *   %9  .. %13  64-bit pointers lut_sp_4_2048, lut_sp_4_512, lut_sp_4_128, lut_sp_4_32 and
 *               lut_sp_4_8, each read exactly once with ld.global.v2.f32 (one float pair).
 *   %14 .. %23  inputs: rmem[0..3].x and .y. rmem[1].y and rmem[2].y are each bound twice
 *               (%17/%18 and %20/%21); the body reads only %18 and %21, so %17 and %20 are
 *               placeholders that keep the numbering of the remaining operands in place.
 *
 * Flow of the PTX body:
 *   1. A 4-input add/sub butterfly on the register values.
 *   2. Three of the four butterfly outputs are combined with the float pair loaded from the
 *      current lookup table and with that pair's square and cube, which are computed inline.
 *   3. The results are exchanged between threads through shared memory: st.shared, then
 *      barrier.sync 0, then ld.shared at 4096-byte strides.
 *   Steps 1-3 run once per lookup table (five times). The table index is taken from %tid.x:
 *   bits 0-8 for the first table, then the bit fields (2,7), (4,5), (6,3) and (8,1) via bfe.u32.
 *   After the last exchange a plain add/sub of the loaded values is written to %0 .. %7.
 *
 * NOTE(review): the shape (five 4-input passes, one final 2-input pass, %tid.x masked with 511,
 *   a 16384-byte shared window) looks like a 2048-point complex FFT spread over 512 threads with
 *   4 values each -- confirm against the generator. The transform direction cannot be determined
 *   from this view because the lookup-table contents are not visible.
 * NOTE(review): every shared-memory exchange is fenced with `barrier.sync 0`, so presumably all
 *   threads that participate in barrier 0 have to execute this function together -- confirm
 *   against the callers.
 * NOTE(review): `cufftdx::detail::complex` appears here without a template argument list; this
 *   looks like an extraction artifact -- verify against the upstream file.
 */
+template<> __forceinline__ __device__ void cufftdx_private_function<299, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<330>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -16384; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26;
+mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 16352; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+4096]; +ld.shared.v2.f32 {f70, f71}, [r13+8192]; +ld.shared.v2.f32 {f74, f75}, [r13+12288]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 16256; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32
f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+4096]; +ld.shared.v2.f32 {f131, f132}, [r20+8192]; +ld.shared.v2.f32 {f135, f136}, [r20+12288]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 496; +bfe.u32 r22, r5, 4, 5; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 15872; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180,
f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+4096]; +ld.shared.v2.f32 {f192, f193}, [r27+8192]; +ld.shared.v2.f32 {f196, f197}, [r27+12288]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +sub.f32 f210, f202, f207; +add.f32 f211, f203, f206; +add.f32 f212, f202, f207; +sub.f32 f213, f203, f206; +and.b32 r28, r5, 448; +bfe.u32 r29, r5, 6, 3; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f211, f215; +mul.f32 f219, f210, f215; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f209, f225; +mul.f32 f227, f208, f225; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f213, f233; +mul.f32 f235, f212, f233; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 14336; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f214, f210, f218; +sub.f32 f240, f220, f219; +st.shared.v2.f32 [r33+512], {f239, f240}; +fma.rn.f32 f241, f223, f208, f226; +sub.f32 f242, f228, f227; +st.shared.v2.f32 [r33+1024], {f241, f242}; +sub.f32 f243, f236, f235; +fma.rn.f32 f244, f231, f212, f234; +st.shared.v2.f32 [r33+1536], {f244, f243}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32
{f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+4096]; +ld.shared.v2.f32 {f253, f254}, [r34+8192]; +ld.shared.v2.f32 {f257, f258}, [r34+12288]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; +sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +sub.f32 f269, f261, f265; +sub.f32 f270, f262, f266; +sub.f32 f271, f263, f268; +add.f32 f272, f264, f267; +add.f32 f273, f263, f268; +sub.f32 f274, f264, f267; +and.b32 r35, r5, 256; +bfe.u32 r36, r5, 8, 1; +mul.wide.u32 rd15, r36, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f275, f276}, [rd17]; +mul.f32 f279, f272, f276; +mul.f32 f280, f271, f276; +mul.f32 f281, f275, f272; +mul.f32 f282, f275, f275; +mul.f32 f283, f276, f276; +sub.f32 f284, f282, f283; +mul.f32 f285, f276, f275; +fma.rn.f32 f286, f276, f275, f285; +mul.f32 f287, f270, f286; +mul.f32 f288, f269, f286; +mul.f32 f289, f284, f270; +mul.f32 f290, f275, f284; +mul.f32 f291, f276, f286; +sub.f32 f292, f290, f291; +mul.f32 f293, f275, f286; +fma.rn.f32 f294, f276, f284, f293; +mul.f32 f295, f274, f294; +mul.f32 f296, f273, f294; +mul.f32 f297, f292, f274; +and.b32 r37, r10, 2040; +add.s32 r38, r9, r37; +barrier.sync 0; +and.b32 r39, r7, 8192; +add.s32 r40, r38, r39; +add.f32 f298, f262, f266; +add.f32 f299, f261, f265; +st.shared.v2.f32 [r40], {f299, f298}; +fma.rn.f32 f300, f275, f271, f279; +sub.f32 f301, f281, f280; +st.shared.v2.f32 [r40+2048], {f300, f301}; +fma.rn.f32 f302, f284, f269, f287; +sub.f32 f303, f289, f288; +st.shared.v2.f32 [r40+4096], {f302, f303}; +sub.f32 f304, f297, f296; +fma.rn.f32 f305, f292, f273, f295; +st.shared.v2.f32 [r40+6144], {f305, f304}; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.v2.f32 {f306, f307}, [r41]; +ld.shared.v2.f32 {f310, f311}, [r41+4096]; +ld.shared.v2.f32 {f314, f315}, [r41+8192]; +ld.shared.v2.f32 {f318, f319}, [r41+12288]; +add.f32 %1, f307, f315;
+add.f32 %0, f306, f314; +add.f32 %3, f311, f319; +add.f32 %2, f310, f318; +sub.f32 %5, f307, f315; +sub.f32 %4, f306, f314; +sub.f32 %7, f311, f319; +sub.f32 %6, f310, f318; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_2048), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<300, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<290>; +.reg .b32 r<43>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56;
+fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -8192; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 8176; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+2048]; +ld.shared.f32 f64, [r13+4096]; +ld.shared.f32 f65, [r13+6144]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+2048]; +ld.shared.f32 f68, [r13+4096]; +ld.shared.f32 f69, [r13+6144]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 8128; +add.s32 r20, r18, r19; 
+st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+2048]; +ld.shared.f32 f117, [r21+4096]; +ld.shared.f32 f118, [r21+6144]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+2048]; +ld.shared.f32 f121, [r21+4096]; +ld.shared.f32 f122, [r21+6144]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126, f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 496; +bfe.u32 r23, r5, 4, 5; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; +fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 7936; +add.s32 r27, r25, r26; 
+st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], f154; +st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+2048]; +ld.shared.f32 f170, [r28+4096]; +ld.shared.f32 f171, [r28+6144]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+2048]; +ld.shared.f32 f174, [r28+4096]; +ld.shared.f32 f175, [r28+6144]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +sub.f32 f188, f178, f183; +add.f32 f189, f179, f182; +add.f32 f190, f178, f183; +sub.f32 f191, f179, f182; +and.b32 r29, r5, 448; +bfe.u32 r30, r5, 6, 3; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f189, f193; +fma.rn.f32 f197, f192, f188, f196; +mul.f32 f198, f188, f193; +mul.f32 f199, f192, f189; +sub.f32 f200, f199, f198; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f187, f205; +fma.rn.f32 f207, f203, f186, f206; +mul.f32 f208, f186, f205; +mul.f32 f209, f203, f187; +sub.f32 f210, f209, f208; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f191, f215; +fma.rn.f32 f217, f213, f190, f216; +mul.f32 f218, f190, f215; +mul.f32 f219, f213, f191; +sub.f32 f220, f219, f218; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 7168; +add.s32 r34, 
r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f197; +st.shared.f32 [r34+512], f207; +st.shared.f32 [r34+768], f217; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+2048]; +ld.shared.f32 f223, [r35+4096]; +ld.shared.f32 f224, [r35+6144]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+2048]; +ld.shared.f32 f227, [r35+4096]; +ld.shared.f32 f228, [r35+6144]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 f237, f229, f233; +add.f32 f238, f230, f234; +sub.f32 f239, f229, f233; +sub.f32 f240, f230, f234; +sub.f32 f241, f231, f236; +add.f32 f242, f232, f235; +add.f32 f243, f231, f236; +sub.f32 f244, f232, f235; +and.b32 r36, r5, 256; +bfe.u32 r37, r5, 8, 1; +mul.wide.u32 rd15, r37, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f245, f246}, [rd17]; +mul.f32 f249, f242, f246; +fma.rn.f32 f250, f245, f241, f249; +mul.f32 f251, f241, f246; +mul.f32 f252, f245, f242; +sub.f32 f253, f252, f251; +mul.f32 f254, f245, f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f240, f258; +fma.rn.f32 f260, f256, f239, f259; +mul.f32 f261, f239, f258; +mul.f32 f262, f256, f240; +sub.f32 f263, f262, f261; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f244, f268; +fma.rn.f32 f270, f266, f243, f269; +mul.f32 f271, f243, f268; +mul.f32 f272, f266, f244; +sub.f32 f273, f272, f271; +and.b32 r38, r16, 1020; +add.s32 r39, r10, r38; +barrier.sync 0; +and.b32 r40, r8, 4096; 
+add.s32 r41, r39, r40; +st.shared.f32 [r41], f237; +st.shared.f32 [r41+1024], f250; +st.shared.f32 [r41+2048], f260; +st.shared.f32 [r41+3072], f270; +barrier.sync 0; +mad.lo.s32 r42, r36, -12, r41; +ld.shared.f32 f274, [r42]; +ld.shared.f32 f275, [r42+2048]; +ld.shared.f32 f276, [r42+4096]; +ld.shared.f32 f277, [r42+6144]; +barrier.sync 0; +st.shared.f32 [r41], f238; +st.shared.f32 [r41+1024], f253; +st.shared.f32 [r41+2048], f263; +st.shared.f32 [r41+3072], f273; +barrier.sync 0; +ld.shared.f32 f278, [r42]; +ld.shared.f32 f279, [r42+2048]; +ld.shared.f32 f280, [r42+4096]; +ld.shared.f32 f281, [r42+6144]; +add.f32 %0, f274, f276; +add.f32 %1, f278, f280; +add.f32 %2, f275, f277; +add.f32 %3, f279, f281; +sub.f32 %4, f274, f276; +sub.f32 %5, f278, f280; +sub.f32 %6, f275, f277; +sub.f32 %7, f279, f281; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_2048), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<301, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<223>; +.reg .b32 r<77>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %15, %17; +sub.f32 f10, %16, %18; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 16368; +add.s32 r11, r8, r10; +add.f32 f18, %16, %18; +add.f32 f19, %15, %17; 
+st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 8184; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+8192]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16352; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 8176; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+8192]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16320; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 8160; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+8192]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 1016; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16256; +add.s32 
r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 8128; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+8192]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 6; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 16128; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 8064; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+8192]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f115, f117; +mul.f32 f121, f114, f117; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15872; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f116, f114, f120; +sub.f32 f126, f122, f121; +st.shared.v2.f32 [r46+256], {f125, f126}; +barrier.sync 0; +and.b32 r47, r9, 7936; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+8192]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f136, 
f138; +mul.f32 f142, f135, f138; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 15360; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f137, f135, f141; +sub.f32 f147, f143, f142; +st.shared.v2.f32 [r53+512], {f146, f147}; +barrier.sync 0; +and.b32 r54, r9, 7680; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+8192]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f157, f159; +mul.f32 f163, f156, f159; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 14336; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f158, f156, f162; +sub.f32 f168, f164, f163; +st.shared.v2.f32 [r60+1024], {f167, f168}; +barrier.sync 0; +and.b32 r61, r9, 7168; +sub.s32 r62, r60, r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, [r62+8192]; +sub.f32 f177, f169, f173; +sub.f32 f178, f170, f174; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f179, f180}, [rd29]; +mul.f32 f183, f178, f180; +mul.f32 f184, f177, f180; +mul.f32 f185, f179, f178; +and.b32 r64, r9, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 12288; +add.s32 r67, r65, r66; +add.f32 f186, f170, f174; +add.f32 f187, f169, f173; +st.shared.v2.f32 [r67], {f187, f186}; +fma.rn.f32 f188, f179, f177, f183; +sub.f32 f189, f185, f184; +st.shared.v2.f32 [r67+2048], {f188, f189}; +barrier.sync 0; +and.b32 r68, r9, 6144; +sub.s32 r69, r67, r68; +ld.shared.v2.f32 {f190, f191}, [r69]; +ld.shared.v2.f32 {f194, f195}, [r69+8192]; +sub.f32 
f198, f190, f194; +sub.f32 f199, f191, f195; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 8; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f32 {f200, f201}, [rd32]; +mul.f32 f204, f199, f201; +mul.f32 f205, f198, f201; +mul.f32 f206, f200, f199; +and.b32 r71, r9, 4088; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 8192; +add.s32 r74, r72, r73; +add.f32 f207, f191, f195; +add.f32 f208, f190, f194; +st.shared.v2.f32 [r74], {f208, f207}; +fma.rn.f32 f209, f200, f198, f204; +sub.f32 f210, f206, f205; +st.shared.v2.f32 [r74+4096], {f209, f210}; +barrier.sync 0; +and.b32 r75, r9, 4096; +sub.s32 r76, r74, r75; +ld.shared.v2.f32 {f211, f212}, [r76]; +ld.shared.v2.f32 {f215, f216}, [r76+8192]; +add.f32 %1, f212, f216; +add.f32 %0, f211, f215; +sub.f32 %3, f212, f216; +sub.f32 %2, f211, f215; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_2048), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<302, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<183>; +.reg .b32 r<77>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %15, %17; +add.f32 f10, %16, %18; +sub.f32 f11, %15, %17; +sub.f32 f12, %16, %18; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8184; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; 
+barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 4092; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+4096]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+4096]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8176; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 4088; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+4096]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+4096]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8160; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 4080; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+4096]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+4096]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 1016; 
+cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 8128; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 4064; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+4096]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+4096]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 6; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, f79, f85; +mul.f32 f87, f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 8064; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 4032; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+4096]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+4096]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f97, f99; +fma.rn.f32 f103, f98, f96, f102; +mul.f32 f104, f96, f99; +mul.f32 f105, f98, f97; +sub.f32 f106, f105, f104; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7936; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; 
+st.shared.f32 [r46+128], f103; +barrier.sync 0; +and.b32 r47, r11, 3968; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+4096]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+4096]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f114, f116; +fma.rn.f32 f120, f115, f113, f119; +mul.f32 f121, f113, f116; +mul.f32 f122, f115, f114; +sub.f32 f123, f122, f121; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 7680; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f120; +barrier.sync 0; +and.b32 r54, r11, 3840; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+4096]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+4096]; +add.f32 f128, f124, f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f132, f133}, [rd26]; +mul.f32 f136, f131, f133; +fma.rn.f32 f137, f132, f130, f136; +mul.f32 f138, f130, f133; +mul.f32 f139, f132, f131; +sub.f32 f140, f139, f138; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 7168; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 [r60+512], f137; +barrier.sync 0; +and.b32 r61, r11, 3584; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+4096]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, 
[r62+4096]; +add.f32 f145, f141, f142; +add.f32 f146, f143, f144; +sub.f32 f147, f141, f142; +sub.f32 f148, f143, f144; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 8; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f32 {f149, f150}, [rd29]; +mul.f32 f153, f148, f150; +fma.rn.f32 f154, f149, f147, f153; +mul.f32 f155, f147, f150; +mul.f32 f156, f149, f148; +sub.f32 f157, f156, f155; +and.b32 r64, r11, 1020; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 6144; +add.s32 r67, r65, r66; +st.shared.f32 [r67], f145; +st.shared.f32 [r67+1024], f154; +barrier.sync 0; +and.b32 r68, r11, 3072; +sub.s32 r69, r67, r68; +ld.shared.f32 f158, [r69]; +ld.shared.f32 f159, [r69+4096]; +barrier.sync 0; +st.shared.f32 [r67], f146; +st.shared.f32 [r67+1024], f157; +barrier.sync 0; +ld.shared.f32 f160, [r69]; +ld.shared.f32 f161, [r69+4096]; +add.f32 f162, f158, f159; +add.f32 f163, f160, f161; +sub.f32 f164, f158, f159; +sub.f32 f165, f160, f161; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 8; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f32 {f166, f167}, [rd32]; +mul.f32 f170, f165, f167; +fma.rn.f32 f171, f166, f164, f170; +mul.f32 f172, f164, f167; +mul.f32 f173, f166, f165; +sub.f32 f174, f173, f172; +and.b32 r71, r11, 2044; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 4096; +add.s32 r74, r72, r73; +st.shared.f32 [r74], f162; +st.shared.f32 [r74+2048], f171; +barrier.sync 0; +and.b32 r75, r11, 2048; +sub.s32 r76, r74, r75; +ld.shared.f32 f175, [r76]; +ld.shared.f32 f176, [r76+4096]; +barrier.sync 0; +st.shared.f32 [r74], f163; +st.shared.f32 [r74+2048], f174; +barrier.sync 0; +ld.shared.f32 f177, [r76]; +ld.shared.f32 f178, [r76+4096]; +add.f32 %0, f175, f176; +add.f32 %1, f177, f178; +sub.f32 %2, f175, f176; +sub.f32 %3, f177, f178; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_2048), "l"(lut_sp_2_1024), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), 
"l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..ab16ad49fffb4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_fwd.hpp.inc @@ -0,0 +1,4141 @@ +#ifndef CUFFTDX_FFT_2048_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_2048_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<484, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<488>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, 
fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+4096]; +mul.f64 fd133, fd129, fd80; +mul.f64 
fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16320; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+2048]; +ld.shared.f64 fd160, [r13+4096]; +ld.shared.f64 fd161, [r13+6144]; +ld.shared.f64 fd162, [r13+8192]; +ld.shared.f64 fd163, [r13+10240]; +ld.shared.f64 fd164, [r13+12288]; +ld.shared.f64 fd165, [r13+14336]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+2048]; +ld.shared.f64 fd168, [r13+4096]; +ld.shared.f64 fd169, [r13+6144]; +ld.shared.f64 fd170, [r13+8192]; +ld.shared.f64 fd171, [r13+10240]; +ld.shared.f64 fd172, [r13+12288]; +ld.shared.f64 fd173, [r13+14336]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 
fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 248; +bfe.u32 r15, r5, 3, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 
fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+512]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 15872; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; 
+st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+2048]; +ld.shared.f64 fd301, [r21+4096]; +ld.shared.f64 fd302, [r21+6144]; +ld.shared.f64 fd303, [r21+8192]; +ld.shared.f64 fd304, [r21+10240]; +ld.shared.f64 fd305, [r21+12288]; +ld.shared.f64 fd306, [r21+14336]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+2048]; +ld.shared.f64 fd309, [r21+4096]; +ld.shared.f64 fd310, [r21+6144]; +ld.shared.f64 fd311, [r21+8192]; +ld.shared.f64 fd312, [r21+10240]; +ld.shared.f64 fd313, [r21+12288]; +ld.shared.f64 fd314, [r21+14336]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +add.f64 fd327, fd317, fd322; +sub.f64 fd328, fd318, fd321; +sub.f64 fd329, fd317, fd322; +add.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +add.f64 fd343, fd333, fd338; +sub.f64 fd344, fd334, fd337; +sub.f64 fd345, fd333, fd338; +add.f64 fd346, fd334, 
fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0dBFE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd344, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd351, fd343, 0dBFE6A09E667F3BCD, fd350; +mul.f64 fd352, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd353, fd346, 0dBFE6A09E667F3BCD; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd352, fd353; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd351; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd351; +add.f64 fd364, fd325, fd342; +sub.f64 fd365, fd326, fd341; +sub.f64 fd366, fd325, fd342; +add.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd354; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd354; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 192; +bfe.u32 r23, r5, 6, 2; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd372, fd360; +mul.f64 fd377, fd373, fd361; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd372, fd361; +fma.rn.f64 fd380, fd373, fd360, fd379; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd383, fd364; +mul.f64 fd387, fd385, fd365; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd383, fd365; +fma.rn.f64 fd390, fd385, fd364, fd389; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd393, fd368; +mul.f64 fd397, fd395, fd369; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd393, fd369; +fma.rn.f64 fd400, fd395, fd368, fd399; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd403, fd358; 
+mul.f64 fd407, fd405, fd359; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd403, fd359; +fma.rn.f64 fd410, fd405, fd358, fd409; +ld.global.v2.f64 {fd411, fd412}, [rd11+64]; +mul.f64 fd415, fd411, fd362; +mul.f64 fd416, fd412, fd363; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd411, fd363; +fma.rn.f64 fd419, fd412, fd362, fd418; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd422, fd366; +mul.f64 fd426, fd424, fd367; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd422, fd367; +fma.rn.f64 fd429, fd424, fd366, fd428; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd432, fd370; +mul.f64 fd436, fd434, fd371; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd432, fd371; +fma.rn.f64 fd439, fd434, fd370, fd438; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 12288; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd378; +st.shared.f64 [r27+1024], fd388; +st.shared.f64 [r27+1536], fd398; +st.shared.f64 [r27+2048], fd408; +st.shared.f64 [r27+2560], fd417; +st.shared.f64 [r27+3072], fd427; +st.shared.f64 [r27+3584], fd437; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+2048]; +ld.shared.f64 fd442, [r28+4096]; +ld.shared.f64 fd443, [r28+6144]; +ld.shared.f64 fd444, [r28+8192]; +ld.shared.f64 fd445, [r28+10240]; +ld.shared.f64 fd446, [r28+12288]; +ld.shared.f64 fd447, [r28+14336]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, 
[r28]; +ld.shared.f64 fd449, [r28+2048]; +ld.shared.f64 fd450, [r28+4096]; +ld.shared.f64 fd451, [r28+6144]; +ld.shared.f64 fd452, [r28+8192]; +ld.shared.f64 fd453, [r28+10240]; +ld.shared.f64 fd454, [r28+12288]; +ld.shared.f64 fd455, [r28+14336]; +add.f64 fd456, fd440, fd444; +add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd441, fd445; +add.f64 fd465, fd449, fd453; +sub.f64 fd466, fd441, fd445; +sub.f64 fd467, fd449, fd453; +add.f64 fd468, fd443, fd447; +add.f64 fd469, fd451, fd455; +sub.f64 fd470, fd443, fd447; +sub.f64 fd471, fd451, fd455; +add.f64 %0, fd456, fd460; +add.f64 %1, fd457, fd461; +add.f64 %2, fd464, fd468; +add.f64 %3, fd465, fd469; +sub.f64 %5, fd459, fd462; +add.f64 %4, fd458, fd463; +sub.f64 %7, fd467, fd470; +add.f64 %6, fd466, fd471; +sub.f64 %8, fd456, fd460; +sub.f64 %9, fd457, fd461; +sub.f64 %10, fd464, fd468; +sub.f64 %11, fd465, fd469; +add.f64 %13, fd459, fd462; +sub.f64 %12, fd458, fd463; +add.f64 %15, fd467, fd470; +sub.f64 %14, fd466, fd471; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_2048), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<485, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile 
(R"({ +.reg .b32 r<21>; +.reg .f64 fd<889>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, 
%71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; 
+mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; 
+fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 
fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+2048]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 
fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16256; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+1024]; +ld.shared.f64 fd390, [r13+2048]; +ld.shared.f64 fd391, [r13+3072]; +ld.shared.f64 fd392, [r13+4096]; +ld.shared.f64 fd393, [r13+5120]; +ld.shared.f64 fd394, [r13+6144]; +ld.shared.f64 fd395, [r13+7168]; +ld.shared.f64 fd396, [r13+8192]; +ld.shared.f64 fd397, [r13+9216]; +ld.shared.f64 fd398, [r13+10240]; +ld.shared.f64 fd399, [r13+11264]; +ld.shared.f64 fd400, [r13+12288]; +ld.shared.f64 fd401, [r13+13312]; +ld.shared.f64 fd402, [r13+14336]; +ld.shared.f64 fd403, [r13+15360]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+1024]; +ld.shared.f64 fd406, [r13+2048]; +ld.shared.f64 fd407, 
[r13+3072]; +ld.shared.f64 fd408, [r13+4096]; +ld.shared.f64 fd409, [r13+5120]; +ld.shared.f64 fd410, [r13+6144]; +ld.shared.f64 fd411, [r13+7168]; +ld.shared.f64 fd412, [r13+8192]; +ld.shared.f64 fd413, [r13+9216]; +ld.shared.f64 fd414, [r13+10240]; +ld.shared.f64 fd415, [r13+11264]; +ld.shared.f64 fd416, [r13+12288]; +ld.shared.f64 fd417, [r13+13312]; +ld.shared.f64 fd418, [r13+14336]; +ld.shared.f64 fd419, [r13+15360]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, 
fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; 
+sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 
fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 112; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 
fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+128]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 
fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 14336; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+1024]; +ld.shared.f64 fd745, [r20+2048]; +ld.shared.f64 fd746, [r20+3072]; +ld.shared.f64 fd747, [r20+4096]; +ld.shared.f64 fd748, [r20+5120]; +ld.shared.f64 fd749, [r20+6144]; +ld.shared.f64 fd750, [r20+7168]; +ld.shared.f64 fd751, [r20+8192]; +ld.shared.f64 fd752, [r20+9216]; +ld.shared.f64 fd753, [r20+10240]; +ld.shared.f64 fd754, [r20+11264]; +ld.shared.f64 
fd755, [r20+12288]; +ld.shared.f64 fd756, [r20+13312]; +ld.shared.f64 fd757, [r20+14336]; +ld.shared.f64 fd758, [r20+15360]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+1024]; +ld.shared.f64 fd761, [r20+2048]; +ld.shared.f64 fd762, [r20+3072]; +ld.shared.f64 fd763, [r20+4096]; +ld.shared.f64 fd764, [r20+5120]; +ld.shared.f64 fd765, [r20+6144]; +ld.shared.f64 fd766, [r20+7168]; +ld.shared.f64 fd767, [r20+8192]; +ld.shared.f64 fd768, [r20+9216]; +ld.shared.f64 fd769, [r20+10240]; +ld.shared.f64 fd770, [r20+11264]; +ld.shared.f64 fd771, [r20+12288]; +ld.shared.f64 fd772, [r20+13312]; +ld.shared.f64 fd773, [r20+14336]; +ld.shared.f64 fd774, [r20+15360]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +add.f64 fd787, fd777, fd782; +sub.f64 fd788, fd778, fd781; +sub.f64 fd789, fd777, fd782; +add.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 
fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +add.f64 fd803, fd793, fd798; +sub.f64 fd804, fd794, fd797; +sub.f64 fd805, fd793, fd798; +add.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0dBFE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +mul.f64 fd810, fd804, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd811, fd803, 0dBFE6A09E667F3BCD, fd810; +mul.f64 fd812, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd813, fd806, 0dBFE6A09E667F3BCD; +sub.f64 fd814, fd812, fd813; +add.f64 fd815, fd812, fd813; +add.f64 fd816, fd744, fd752; +add.f64 fd817, fd760, fd768; +sub.f64 fd818, fd744, fd752; +sub.f64 fd819, fd760, fd768; +add.f64 fd820, fd748, fd756; +add.f64 fd821, fd764, fd772; +sub.f64 fd822, fd748, fd756; +sub.f64 fd823, fd764, fd772; +add.f64 fd824, fd816, fd820; +add.f64 fd825, fd817, fd821; +sub.f64 fd826, fd816, fd820; +sub.f64 fd827, fd817, fd821; +add.f64 fd828, fd818, fd823; +sub.f64 fd829, fd819, fd822; +sub.f64 fd830, fd818, fd823; +add.f64 fd831, fd819, fd822; +add.f64 fd832, fd746, fd754; +add.f64 fd833, fd762, fd770; +sub.f64 fd834, fd746, fd754; +sub.f64 fd835, fd762, fd770; +add.f64 fd836, fd750, fd758; +add.f64 fd837, fd766, fd774; +sub.f64 fd838, fd750, fd758; +sub.f64 fd839, fd766, fd774; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +add.f64 fd844, fd834, fd839; +sub.f64 fd845, fd835, fd838; +sub.f64 fd846, fd834, fd839; +add.f64 fd847, fd835, fd838; +mul.f64 fd848, fd844, 0d3FE6A09E667F3BCD; +mul.f64 fd849, fd845, 0dBFE6A09E667F3BCD; +sub.f64 fd850, fd848, fd849; +mul.f64 fd851, fd845, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd852, fd844, 0dBFE6A09E667F3BCD, fd851; +mul.f64 fd853, fd846, 0dBFE6A09E667F3BCD; +mul.f64 fd854, fd847, 0dBFE6A09E667F3BCD; +sub.f64 fd855, fd853, fd854; +add.f64 fd856, fd853, fd854; +add.f64 %0, fd783, fd799; +add.f64 %1, fd784, fd800; +add.f64 %2, fd824, fd840; +add.f64 %3, fd825, fd841; +add.f64 %5, 
fd788, fd811; +add.f64 %4, fd787, fd809; +add.f64 %7, fd829, fd852; +add.f64 %6, fd828, fd850; +sub.f64 %9, fd786, fd801; +add.f64 %8, fd785, fd802; +sub.f64 %11, fd827, fd842; +add.f64 %10, fd826, fd843; +add.f64 %13, fd790, fd815; +add.f64 %12, fd789, fd814; +add.f64 %15, fd831, fd856; +add.f64 %14, fd830, fd855; +sub.f64 %16, fd783, fd799; +sub.f64 %17, fd784, fd800; +sub.f64 %18, fd824, fd840; +sub.f64 %19, fd825, fd841; +sub.f64 %21, fd788, fd811; +sub.f64 %20, fd787, fd809; +sub.f64 %23, fd829, fd852; +sub.f64 %22, fd828, fd850; +add.f64 %25, fd786, fd801; +sub.f64 %24, fd785, fd802; +add.f64 %27, fd827, fd842; +sub.f64 %26, fd826, fd843; +sub.f64 %29, fd790, fd815; +sub.f64 %28, fd789, fd814; +sub.f64 %31, fd831, fd856; +sub.f64 %30, fd830, fd855; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_2048), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), 
"d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<486, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<50>; +.reg .f64 fd<1152>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1140, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1138, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1137, fd1140, fd1138; +sub.f64 fd76, fd1140, fd1138; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd1136, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1133, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1131, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1130, fd1133, fd1131; +sub.f64 fd92, fd1133, fd1131; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd1129, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd1129, 0dBFE6A09E667F3BCD; +mul.f64 fd1128, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd1128, fd98; +mul.f64 fd100, fd1129, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1127, fd1137, fd1130; +sub.f64 fd109, fd1137, fd1130; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1126, fd1136, fd101; +sub.f64 fd113, fd1136, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd1125, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; 
+sub.f64 fd120, fd79, fd104; +add.f64 fd1124, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1122, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1119, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1118, fd1122, fd1119; +sub.f64 fd133, fd1122, fd1119; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd1117, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1115, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1113, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1112, fd1115, fd1113; +sub.f64 fd149, fd1115, fd1113; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd1111, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd1111, 0dBFE6A09E667F3BCD; +mul.f64 fd1110, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd1110, fd155; +mul.f64 fd157, fd1111, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1109, fd1118, fd1112; +sub.f64 fd166, fd1118, fd1112; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1108, fd1117, fd158; +sub.f64 fd170, fd1117, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd1107, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd1106, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1104, fd167, 0d3FED906BCF328D46; +mul.f64 fd1105, fd1108, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd1104, fd1105; +mul.f64 fd182, fd1108, 
0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd1102, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd1103, fd1107, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd1102, fd1103; +mul.f64 fd187, fd1107, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd1100, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd1101, fd1106, 0dBFED906BCF328D46; +sub.f64 fd191, fd1100, fd1101; +mul.f64 fd192, fd1106, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd1098, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd1099, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd1098, fd1099; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd1096, fd177, 0dBFED906BCF328D46; +mul.f64 fd1097, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd1096, fd1097; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1095, fd1126, fd183; +sub.f64 fd213, fd1126, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1094, fd1125, fd188; +sub.f64 fd217, fd1125, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd1093, fd1124, fd193; +sub.f64 fd221, fd1124, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd1092, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd1091, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd1090, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1089, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, 
-32768; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd1095; +mul.f64 fd244, fd238, fd1095; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1088, fd238, fd238; +sub.f64 fd247, fd1088, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd1094; +mul.f64 fd252, fd247, fd1094; +mul.f64 fd1086, fd238, fd247; +mul.f64 fd1087, fd239, fd249; +sub.f64 fd255, fd1086, fd1087; +mul.f64 fd1085, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd1093; +mul.f64 fd260, fd255, fd1093; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1084, fd238, fd255; +sub.f64 fd263, fd1084, fd262; +mul.f64 fd1083, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd1092; +mul.f64 fd268, fd263, fd1092; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1082, fd238, fd263; +sub.f64 fd271, fd1082, fd270; +mul.f64 fd1081, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd1091; +mul.f64 fd276, fd271, fd1091; +mul.f64 fd1079, fd238, fd271; +mul.f64 fd1080, fd239, fd273; +sub.f64 fd279, fd1079, fd1080; +mul.f64 fd1078, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd1090; +mul.f64 fd284, fd279, fd1090; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1077, fd238, fd279; +sub.f64 fd287, fd1077, fd286; +mul.f64 fd1076, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd1089; +mul.f64 fd292, fd287, fd1089; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1075, fd238, fd287; +sub.f64 fd295, fd1075, fd294; +mul.f64 fd1074, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1073, fd1127, fd1109; +sub.f64 fd1072, fd106, 
fd163; +mul.f64 fd298, fd295, fd1072; +mul.f64 fd299, fd297, fd1073; +mul.f64 fd300, fd295, fd1073; +ld.global.v2.f64 {fd301, fd302}, [rd5+2048]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1070, fd238, fd301; +mul.f64 fd1071, fd239, fd302; +sub.f64 fd310, fd1070, fd1071; +mul.f64 fd1069, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1068, fd238, fd310; +sub.f64 fd318, fd1068, fd317; +mul.f64 fd1067, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1065, fd238, fd318; +mul.f64 fd1066, fd239, fd320; +sub.f64 fd326, fd1065, fd1066; +mul.f64 fd1064, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1062, fd238, fd326; +mul.f64 fd1063, fd239, fd328; +sub.f64 fd334, fd1062, fd1063; +mul.f64 fd1061, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1060, fd238, fd334; +sub.f64 fd342, fd1060, fd341; +mul.f64 fd1059, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1057, fd238, fd342; +mul.f64 fd1058, fd239, fd344; +sub.f64 fd350, fd1057, fd1058; +mul.f64 fd1056, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1055, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r41, %tid.x; +shl.b32 r33, r41, 8; +barrier.sync 0; +and.b32 r11, r33, 32512; +add.s32 r12, r9, r11; +sub.f64 fd1147, fd1127, fd1109; +mul.f64 fd1146, fd297, fd1147; +add.f64 fd356, fd1127, fd1109; 
+mov.u32 r40, %tid.x; +shl.b32 r32, r40, 8; +and.b32 r23, r32, 32512; +add.s32 r22, r9, r23; +sub.f64 fd1151, fd106, fd163; +add.f64 fd357, fd106, fd163; +mov.u32 r49, %tid.x; +shl.b32 r38, r49, 8; +and.b32 r30, r38, 32512; +add.s32 r29, r9, r30; +st.shared.v2.f64 [r29], {fd357, fd356}; +mov.u32 r48, %tid.x; +shl.b32 r28, r48, 4; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd1056, fd243; +st.shared.v2.f64 [r29+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd1085, fd251; +st.shared.v2.f64 [r29+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd1083, fd259; +st.shared.v2.f64 [r29+48], {fd363, fd362}; +sub.f64 fd364, fd1081, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r29+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd1078, fd275; +st.shared.v2.f64 [r29+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd1076, fd283; +st.shared.v2.f64 [r29+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd1074, fd291; +st.shared.v2.f64 [r29+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd1151, fd300; +sub.f64 fd373, fd298, fd1146; +st.shared.v2.f64 [r29+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd1069, fd306; +st.shared.v2.f64 [r29+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd1067, fd314; +st.shared.v2.f64 [r29+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd1064, fd322; +st.shared.v2.f64 [r29+176], {fd379, fd378}; +sub.f64 fd380, fd1061, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r29+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd1059, fd338; +st.shared.v2.f64 [r29+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd1055, fd346; +st.shared.v2.f64 [r29+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 
fd387, fd353, fd354; +st.shared.v2.f64 [r29+240], {fd387, fd386}; +barrier.sync 0; +and.b32 r20, r48, 127; +mad.lo.s32 r13, r20, -240, r29; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+2048]; +ld.shared.v2.f64 {fd396, fd397}, [r13+4096]; +ld.shared.v2.f64 {fd400, fd401}, [r13+6144]; +ld.shared.v2.f64 {fd404, fd405}, [r13+8192]; +ld.shared.v2.f64 {fd408, fd409}, [r13+10240]; +ld.shared.v2.f64 {fd412, fd413}, [r13+12288]; +ld.shared.v2.f64 {fd416, fd417}, [r13+14336]; +ld.shared.v2.f64 {fd420, fd421}, [r13+16384]; +ld.shared.v2.f64 {fd424, fd425}, [r13+18432]; +ld.shared.v2.f64 {fd428, fd429}, [r13+20480]; +ld.shared.v2.f64 {fd432, fd433}, [r13+22528]; +ld.shared.v2.f64 {fd436, fd437}, [r13+24576]; +ld.shared.v2.f64 {fd440, fd441}, [r13+26624]; +ld.shared.v2.f64 {fd444, fd445}, [r13+28672]; +ld.shared.v2.f64 {fd448, fd449}, [r13+30720]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1054, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1053, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd1052, fd1054, fd1053; +sub.f64 fd463, fd1054, fd1053; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd1051, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1050, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1049, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1048, fd1050, fd1049; +sub.f64 fd479, fd1050, fd1049; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd1047, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd1047, 0dBFE6A09E667F3BCD; +mul.f64 fd1046, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd1046, fd485; +mul.f64 fd487, 
fd1047, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1045, fd1052, fd1048; +sub.f64 fd496, fd1052, fd1048; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1044, fd1051, fd488; +sub.f64 fd500, fd1051, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd1043, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd1042, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1041, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1040, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1039, fd1041, fd1040; +sub.f64 fd520, fd1041, fd1040; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd1038, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1037, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1036, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1035, fd1037, fd1036; +sub.f64 fd536, fd1037, fd1036; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd1034, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd1034, 0dBFE6A09E667F3BCD; +mul.f64 fd1033, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd1033, fd542; +mul.f64 fd544, fd1034, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; 
+sub.f64 fd548, fd546, fd547; +add.f64 fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1032, fd1039, fd1035; +sub.f64 fd553, fd1039, fd1035; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1031, fd1038, fd545; +sub.f64 fd557, fd1038, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, fd519, fd536; +sub.f64 fd1030, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd1029, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1027, fd554, 0d3FED906BCF328D46; +mul.f64 fd1028, fd1031, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd1027, fd1028; +mul.f64 fd569, fd1031, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd1030, 0dBFE6A09E667F3BCD; +mul.f64 fd1026, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd1026, fd572; +mul.f64 fd574, fd1030, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd1029, 0dBFED906BCF328D46; +mul.f64 fd1025, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd1025, fd577; +mul.f64 fd579, fd1029, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd1024, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd1024, fd582; +mul.f64 fd584, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; +mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd1023, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1023, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1022, fd1044, fd570; +sub.f64 fd600, fd1044, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1021, fd1043, fd575; +sub.f64 
fd604, fd1043, fd575; +add.f64 fd605, fd505, fd578; +sub.f64 fd607, fd505, fd578; +add.f64 fd1020, fd1042, fd580; +sub.f64 fd608, fd1042, fd580; +add.f64 fd609, fd495, fd553; +sub.f64 fd611, fd495, fd553; +sub.f64 fd1019, fd496, fd552; +add.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd583; +sub.f64 fd615, fd499, fd583; +add.f64 fd1018, fd500, fd585; +sub.f64 fd616, fd500, fd585; +add.f64 fd617, fd503, fd588; +sub.f64 fd619, fd503, fd588; +add.f64 fd1017, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1016, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r48, 112; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd630, fd626, fd1022; +mul.f64 fd631, fd625, fd1022; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1015, fd625, fd625; +sub.f64 fd634, fd1015, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd638, fd636, fd1021; +mul.f64 fd639, fd634, fd1021; +mul.f64 fd1013, fd625, fd634; +mul.f64 fd1014, fd626, fd636; +sub.f64 fd642, fd1013, fd1014; +mul.f64 fd1012, fd634, fd601; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd646, fd644, fd1020; +mul.f64 fd647, fd642, fd1020; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1011, fd625, fd642; +sub.f64 fd650, fd1011, fd649; +mul.f64 fd1010, fd642, fd605; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd654, fd652, fd1019; +mul.f64 fd655, fd650, fd1019; +mul.f64 fd1008, fd625, fd650; +mul.f64 fd1009, fd626, fd652; +sub.f64 fd658, fd1008, fd1009; +mul.f64 fd1007, fd650, fd609; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd662, fd660, fd1018; +mul.f64 fd663, fd658, fd1018; +mul.f64 fd1005, fd625, fd658; +mul.f64 fd1006, fd626, fd660; +sub.f64 fd666, fd1005, fd1006; +mul.f64 fd1004, fd658, fd613; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, 
fd658, fd667; +mul.f64 fd670, fd668, fd1017; +mul.f64 fd671, fd666, fd1017; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1003, fd625, fd666; +sub.f64 fd674, fd1003, fd673; +mul.f64 fd1002, fd666, fd617; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd678, fd676, fd1016; +mul.f64 fd679, fd674, fd1016; +mul.f64 fd1000, fd625, fd674; +mul.f64 fd1001, fd626, fd676; +sub.f64 fd682, fd1000, fd1001; +mul.f64 fd999, fd674, fd621; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd998, fd1045, fd1032; +sub.f64 fd997, fd493, fd550; +mul.f64 fd685, fd682, fd997; +mul.f64 fd686, fd684, fd998; +mul.f64 fd687, fd682, fd998; +ld.global.v2.f64 {fd688, fd689}, [rd8+128]; +mul.f64 fd693, fd689, fd600; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd996, fd625, fd688; +sub.f64 fd697, fd996, fd696; +mul.f64 fd995, fd688, fd599; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd701, fd699, fd604; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd994, fd625, fd697; +sub.f64 fd705, fd994, fd704; +mul.f64 fd993, fd697, fd603; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd709, fd707, fd608; +mul.f64 fd710, fd705, fd608; +mul.f64 fd991, fd625, fd705; +mul.f64 fd992, fd626, fd707; +sub.f64 fd713, fd991, fd992; +mul.f64 fd990, fd705, fd607; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd717, fd715, fd612; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd989, fd625, fd713; +sub.f64 fd721, fd989, fd720; +mul.f64 fd988, fd713, fd611; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd725, fd723, fd616; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd987, fd625, fd721; +sub.f64 fd729, fd987, fd728; +mul.f64 fd986, fd721, fd615; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd733, fd731, 
fd620; +mul.f64 fd734, fd729, fd620; +mul.f64 fd984, fd625, fd729; +mul.f64 fd985, fd626, fd731; +sub.f64 fd737, fd984, fd985; +mul.f64 fd983, fd625, fd597; +mul.f64 fd738, fd625, fd731; +mul.f64 fd982, fd729, fd619; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd737, fd623; +mul.f64 fd741, fd739, fd624; +mul.f64 fd742, fd737, fd624; +sub.f64 fd1143, fd1045, fd1032; +mul.f64 fd1142, fd684, fd1143; +mov.u32 r37, %tid.x; +shl.b32 r36, r37, 8; +mov.u32 r47, %tid.x; +shl.b32 r46, r47, 4; +and.b32 r15, r46, 240; +add.s32 r16, r9, r15; +sub.f64 fd1145, fd1045, fd1032; +mul.f64 fd1144, fd684, fd1145; +mov.u32 r45, %tid.x; +shl.b32 r44, r45, 8; +barrier.sync 0; +and.b32 r17, r44, 28672; +add.s32 r18, r16, r17; +mov.u32 r26, %tid.x; +and.b32 r25, r26, 112; +sub.f64 fd1150, fd1045, fd1032; +mul.f64 fd1149, fd684, fd1150; +add.f64 fd743, fd1045, fd1032; +sub.f64 fd1148, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r35, %tid.x; +and.b32 r34, r35, 112; +mov.u32 r43, %tid.x; +and.b32 r42, r43, 112; +fma.rn.f64 fd745, fd626, fd597, fd631; +sub.f64 fd746, fd983, fd630; +st.shared.v2.f64 [r18+256], {fd746, fd745}; +fma.rn.f64 fd747, fd636, fd601, fd639; +sub.f64 fd748, fd1012, fd638; +st.shared.v2.f64 [r18+512], {fd748, fd747}; +fma.rn.f64 fd749, fd644, fd605, fd647; +sub.f64 fd750, fd1010, fd646; +st.shared.v2.f64 [r18+768], {fd750, fd749}; +fma.rn.f64 fd751, fd652, fd609, fd655; +sub.f64 fd752, fd1007, fd654; +st.shared.v2.f64 [r18+1024], {fd752, fd751}; +sub.f64 fd753, fd1004, fd662; +fma.rn.f64 fd754, fd660, fd613, fd663; +st.shared.v2.f64 [r18+1280], {fd753, fd754}; +fma.rn.f64 fd755, fd668, fd617, fd671; +sub.f64 fd756, fd1002, fd670; +st.shared.v2.f64 [r18+1536], {fd756, fd755}; +fma.rn.f64 fd757, fd676, fd621, fd679; +sub.f64 fd758, fd999, fd678; +st.shared.v2.f64 [r18+1792], {fd758, fd757}; +fma.rn.f64 fd759, fd684, fd1148, fd687; +sub.f64 fd760, fd685, fd1149; +st.shared.v2.f64 [r18+2048], {fd760, fd759}; 
+fma.rn.f64 fd761, fd689, fd599, fd694; +sub.f64 fd762, fd995, fd693; +st.shared.v2.f64 [r18+2304], {fd762, fd761}; +fma.rn.f64 fd763, fd699, fd603, fd702; +sub.f64 fd764, fd993, fd701; +st.shared.v2.f64 [r18+2560], {fd764, fd763}; +fma.rn.f64 fd765, fd707, fd607, fd710; +sub.f64 fd766, fd990, fd709; +st.shared.v2.f64 [r18+2816], {fd766, fd765}; +fma.rn.f64 fd767, fd715, fd611, fd718; +sub.f64 fd768, fd988, fd717; +st.shared.v2.f64 [r18+3072], {fd768, fd767}; +sub.f64 fd769, fd986, fd725; +fma.rn.f64 fd770, fd723, fd615, fd726; +st.shared.v2.f64 [r18+3328], {fd769, fd770}; +fma.rn.f64 fd771, fd731, fd619, fd734; +sub.f64 fd772, fd982, fd733; +st.shared.v2.f64 [r18+3584], {fd772, fd771}; +fma.rn.f64 fd773, fd739, fd623, fd742; +sub.f64 fd774, fd740, fd741; +st.shared.v2.f64 [r18+3840], {fd774, fd773}; +barrier.sync 0; +mad.lo.s32 r19, r42, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+2048]; +ld.shared.v2.f64 {fd783, fd784}, [r19+4096]; +ld.shared.v2.f64 {fd787, fd788}, [r19+6144]; +ld.shared.v2.f64 {fd791, fd792}, [r19+8192]; +ld.shared.v2.f64 {fd795, fd796}, [r19+10240]; +ld.shared.v2.f64 {fd799, fd800}, [r19+12288]; +ld.shared.v2.f64 {fd803, fd804}, [r19+14336]; +ld.shared.v2.f64 {fd807, fd808}, [r19+16384]; +ld.shared.v2.f64 {fd811, fd812}, [r19+18432]; +ld.shared.v2.f64 {fd815, fd816}, [r19+20480]; +ld.shared.v2.f64 {fd819, fd820}, [r19+22528]; +ld.shared.v2.f64 {fd823, fd824}, [r19+24576]; +ld.shared.v2.f64 {fd827, fd828}, [r19+26624]; +ld.shared.v2.f64 {fd831, fd832}, [r19+28672]; +ld.shared.v2.f64 {fd835, fd836}, [r19+30720]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd981, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd980, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd979, fd981, fd980; +sub.f64 fd850, fd981, fd980; +add.f64 fd851, fd841, fd846; 
+sub.f64 fd853, fd841, fd846; +sub.f64 fd978, fd842, fd845; +add.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd977, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd976, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd975, fd977, fd976; +sub.f64 fd866, fd977, fd976; +add.f64 fd867, fd857, fd862; +sub.f64 fd869, fd857, fd862; +sub.f64 fd974, fd858, fd861; +add.f64 fd870, fd858, fd861; +mul.f64 fd872, fd974, 0dBFE6A09E667F3BCD; +mul.f64 fd973, fd867, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd973, fd872; +mul.f64 fd874, fd974, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd875, fd867, 0dBFE6A09E667F3BCD, fd874; +mul.f64 fd876, fd869, 0dBFE6A09E667F3BCD; +mul.f64 fd877, fd870, 0dBFE6A09E667F3BCD; +sub.f64 fd878, fd876, fd877; +add.f64 fd879, fd876, fd877; +add.f64 fd880, fd779, fd811; +sub.f64 fd882, fd779, fd811; +add.f64 fd972, fd780, fd812; +sub.f64 fd883, fd780, fd812; +add.f64 fd884, fd795, fd827; +sub.f64 fd886, fd795, fd827; +add.f64 fd971, fd796, fd828; +sub.f64 fd887, fd796, fd828; +add.f64 fd888, fd880, fd884; +sub.f64 fd890, fd880, fd884; +add.f64 fd970, fd972, fd971; +sub.f64 fd891, fd972, fd971; +add.f64 fd892, fd882, fd887; +sub.f64 fd894, fd882, fd887; +sub.f64 fd969, fd883, fd886; +add.f64 fd895, fd883, fd886; +add.f64 fd896, fd787, fd819; +sub.f64 fd898, fd787, fd819; +add.f64 fd968, fd788, fd820; +sub.f64 fd899, fd788, fd820; +add.f64 fd900, fd803, fd835; +sub.f64 fd902, fd803, fd835; +add.f64 fd967, fd804, fd836; +sub.f64 fd903, fd804, fd836; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd966, fd968, fd967; +sub.f64 fd907, fd968, fd967; +add.f64 fd908, fd898, fd903; +sub.f64 fd910, fd898, fd903; +sub.f64 fd965, fd899, fd902; +add.f64 fd911, fd899, fd902; +mul.f64 fd963, fd908, 0d3FE6A09E667F3BCD; +mul.f64 fd964, fd965, 0dBFE6A09E667F3BCD; +sub.f64 fd914, fd963, 
fd964; +mul.f64 fd915, fd965, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd916, fd908, 0dBFE6A09E667F3BCD, fd915; +mul.f64 fd917, fd910, 0dBFE6A09E667F3BCD; +mul.f64 fd918, fd911, 0dBFE6A09E667F3BCD; +sub.f64 fd919, fd917, fd918; +add.f64 fd920, fd917, fd918; +add.f64 %0, fd847, fd863; +add.f64 %1, fd979, fd975; +add.f64 %2, fd888, fd904; +add.f64 %3, fd970, fd966; +add.f64 %4, fd851, fd873; +add.f64 %5, fd978, fd875; +add.f64 %6, fd892, fd914; +add.f64 %7, fd969, fd916; +sub.f64 %9, fd850, fd865; +add.f64 %8, fd849, fd866; +sub.f64 %11, fd891, fd906; +add.f64 %10, fd890, fd907; +add.f64 %12, fd853, fd878; +add.f64 %13, fd854, fd879; +add.f64 %14, fd894, fd919; +add.f64 %15, fd895, fd920; +sub.f64 %17, fd979, fd975; +sub.f64 %16, fd847, fd863; +sub.f64 %19, fd970, fd966; +sub.f64 %18, fd888, fd904; +sub.f64 %21, fd978, fd875; +sub.f64 %20, fd851, fd873; +sub.f64 %23, fd969, fd916; +sub.f64 %22, fd892, fd914; +add.f64 %25, fd850, fd865; +sub.f64 %24, fd849, fd866; +add.f64 %27, fd891, fd906; +sub.f64 %26, fd890, fd907; +sub.f64 %29, fd854, fd879; +sub.f64 %28, fd853, fd878; +sub.f64 %31, fd895, fd920; +sub.f64 %30, fd894, fd919; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_2048), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), 
"d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<487, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<536>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; 
+add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+4096]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 
r11, r7, 32640; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+4096]; +ld.shared.v2.f64 {fd166, fd167}, [r13+8192]; +ld.shared.v2.f64 {fd170, fd171}, [r13+12288]; +ld.shared.v2.f64 {fd174, fd175}, [r13+16384]; +ld.shared.v2.f64 {fd178, fd179}, [r13+20480]; +ld.shared.v2.f64 {fd182, fd183}, [r13+24576]; +ld.shared.v2.f64 {fd186, fd187}, [r13+28672]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; 
+sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 248; +bfe.u32 r15, r5, 3, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 
fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+512]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 31744; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+4096]; +ld.shared.v2.f64 {fd323, fd324}, [r20+8192]; +ld.shared.v2.f64 {fd327, fd328}, [r20+12288]; +ld.shared.v2.f64 {fd331, fd332}, 
[r20+16384]; +ld.shared.v2.f64 {fd335, fd336}, [r20+20480]; +ld.shared.v2.f64 {fd339, fd340}, [r20+24576]; +ld.shared.v2.f64 {fd343, fd344}, [r20+28672]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +add.f64 fd359, fd349, fd354; +sub.f64 fd360, fd350, fd353; +sub.f64 fd361, fd349, fd354; +add.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +add.f64 fd375, fd365, fd370; +sub.f64 fd376, fd366, fd369; +sub.f64 fd377, fd365, fd370; +add.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0dBFE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd383, fd375, 0dBFE6A09E667F3BCD, fd382; +mul.f64 fd384, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd385, fd378, 0dBFE6A09E667F3BCD; +sub.f64 fd386, fd384, fd385; +add.f64 fd387, fd384, fd385; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd383; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd383; +add.f64 fd394, fd357, fd374; +sub.f64 fd395, fd358, fd373; +sub.f64 fd396, fd357, fd374; +add.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd386; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd386; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 192; +bfe.u32 r22, r5, 6, 2; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 
rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd402, fd390; +mul.f64 fd407, fd403, fd391; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd411, fd394; +mul.f64 fd415, fd413, fd395; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd419, fd398; +mul.f64 fd423, fd421, fd399; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd427, fd388; +mul.f64 fd431, fd429, fd389; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+64]; +mul.f64 fd437, fd433, fd392; +mul.f64 fd438, fd434, fd393; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd442, fd396; +mul.f64 fd446, fd444, fd397; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd450, fd400; +mul.f64 fd454, fd452, fd401; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 24576; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd403, fd390, fd408; +sub.f64 fd459, fd406, fd407; +st.shared.v2.f64 [r26+1024], {fd459, fd458}; +fma.rn.f64 fd460, fd413, fd394, fd416; +sub.f64 fd461, fd414, fd415; +st.shared.v2.f64 [r26+2048], {fd461, fd460}; +fma.rn.f64 fd462, 
fd421, fd398, fd424; +sub.f64 fd463, fd422, fd423; +st.shared.v2.f64 [r26+3072], {fd463, fd462}; +sub.f64 fd464, fd430, fd431; +fma.rn.f64 fd465, fd429, fd388, fd432; +st.shared.v2.f64 [r26+4096], {fd464, fd465}; +fma.rn.f64 fd466, fd434, fd392, fd439; +sub.f64 fd467, fd437, fd438; +st.shared.v2.f64 [r26+5120], {fd467, fd466}; +fma.rn.f64 fd468, fd444, fd396, fd447; +sub.f64 fd469, fd445, fd446; +st.shared.v2.f64 [r26+6144], {fd469, fd468}; +fma.rn.f64 fd470, fd452, fd400, fd455; +sub.f64 fd471, fd453, fd454; +st.shared.v2.f64 [r26+7168], {fd471, fd470}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+4096]; +ld.shared.v2.f64 {fd480, fd481}, [r27+8192]; +ld.shared.v2.f64 {fd484, fd485}, [r27+12288]; +ld.shared.v2.f64 {fd488, fd489}, [r27+16384]; +ld.shared.v2.f64 {fd492, fd493}, [r27+20480]; +ld.shared.v2.f64 {fd496, fd497}, [r27+24576]; +ld.shared.v2.f64 {fd500, fd501}, [r27+28672]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; +add.f64 fd512, fd476, fd492; +add.f64 fd513, fd477, fd493; +sub.f64 fd514, fd476, fd492; +sub.f64 fd515, fd477, fd493; +add.f64 fd516, fd484, fd500; +add.f64 fd517, fd485, fd501; +sub.f64 fd518, fd484, fd500; +sub.f64 fd519, fd485, fd501; +add.f64 %1, fd505, fd509; +add.f64 %0, fd504, fd508; +add.f64 %3, fd513, fd517; +add.f64 %2, fd512, fd516; +sub.f64 %5, fd507, fd510; +add.f64 %4, fd506, fd511; +sub.f64 %7, fd515, fd518; +add.f64 %6, fd514, fd519; +sub.f64 %9, fd505, fd509; +sub.f64 %8, fd504, fd508; +sub.f64 %11, fd513, fd517; +sub.f64 %10, fd512, fd516; +add.f64 %13, fd507, fd510; +sub.f64 %12, fd506, fd511; +add.f64 %15, fd515, fd518; +sub.f64 %14, fd514, fd519; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), 
"=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_2048), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<488, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<285>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+8192]; +mul.f64 fd56, fd52, fd31; +mul.f64 
fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16352; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+4096]; +ld.shared.f64 fd63, [r13+8192]; +ld.shared.f64 fd64, [r13+12288]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+4096]; +ld.shared.f64 fd67, [r13+8192]; +ld.shared.f64 fd68, [r13+12288]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+2048]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 
r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 16256; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+4096]; +ld.shared.f64 fd115, [r21+8192]; +ld.shared.f64 fd116, [r21+12288]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+4096]; +ld.shared.f64 fd119, [r21+8192]; +ld.shared.f64 fd120, [r21+12288]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 496; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+512]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; 
+and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 15872; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 [r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+4096]; +ld.shared.f64 fd167, [r27+8192]; +ld.shared.f64 fd168, [r27+12288]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+4096]; +ld.shared.f64 fd171, [r27+8192]; +ld.shared.f64 fd172, [r27+12288]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +add.f64 fd185, fd175, fd180; +sub.f64 fd186, fd176, fd179; +sub.f64 fd187, fd175, fd180; +add.f64 fd188, fd176, fd179; +and.b32 r28, r5, 448; +bfe.u32 r29, r5, 6, 3; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd189, fd185; +mul.f64 fd194, fd190, fd186; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd186; +fma.rn.f64 fd197, fd190, fd185, fd196; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd200, fd183; +mul.f64 fd204, fd202, fd184; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd184; +fma.rn.f64 fd207, fd202, fd183, fd206; +ld.global.v2.f64 {fd208, fd209}, [rd14+128]; +mul.f64 fd212, fd208, fd187; +mul.f64 fd213, fd209, fd188; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, 
fd208, fd188; +fma.rn.f64 fd216, fd209, fd187, fd215; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 14336; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd195; +st.shared.f64 [r33+1024], fd205; +st.shared.f64 [r33+1536], fd214; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+4096]; +ld.shared.f64 fd219, [r34+8192]; +ld.shared.f64 fd220, [r34+12288]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+4096]; +ld.shared.f64 fd223, [r34+8192]; +ld.shared.f64 fd224, [r34+12288]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; +add.f64 fd233, fd225, fd229; +add.f64 fd234, fd226, fd230; +sub.f64 fd235, fd225, fd229; +sub.f64 fd236, fd226, fd230; +add.f64 fd237, fd227, fd232; +sub.f64 fd238, fd228, fd231; +sub.f64 fd239, fd227, fd232; +add.f64 fd240, fd228, fd231; +and.b32 r35, r5, 256; +bfe.u32 r36, r5, 8, 1; +mul.wide.u32 rd15, r36, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd241, fd242}, [rd17]; +mul.f64 fd245, fd241, fd237; +mul.f64 fd246, fd242, fd238; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd241, fd238; +fma.rn.f64 fd249, fd242, fd237, fd248; +mul.f64 fd250, fd241, fd241; +mul.f64 fd251, fd242, fd242; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd242, fd241; +fma.rn.f64 fd254, fd242, fd241, fd253; +mul.f64 fd255, fd252, fd235; +mul.f64 fd256, fd254, fd236; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd236; +fma.rn.f64 fd259, fd254, fd235, fd258; +ld.global.v2.f64 {fd260, fd261}, [rd17+32]; +mul.f64 fd264, fd260, fd239; +mul.f64 fd265, fd261, 
fd240; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd260, fd240; +fma.rn.f64 fd268, fd261, fd239, fd267; +and.b32 r37, r16, 2040; +add.s32 r38, r10, r37; +barrier.sync 0; +and.b32 r39, r8, 8192; +add.s32 r40, r38, r39; +st.shared.f64 [r40], fd233; +st.shared.f64 [r40+2048], fd247; +st.shared.f64 [r40+4096], fd257; +st.shared.f64 [r40+6144], fd266; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.f64 fd269, [r41]; +ld.shared.f64 fd270, [r41+4096]; +ld.shared.f64 fd271, [r41+8192]; +ld.shared.f64 fd272, [r41+12288]; +barrier.sync 0; +st.shared.f64 [r40], fd234; +st.shared.f64 [r40+2048], fd249; +st.shared.f64 [r40+4096], fd259; +st.shared.f64 [r40+6144], fd268; +barrier.sync 0; +ld.shared.f64 fd273, [r41]; +ld.shared.f64 fd274, [r41+4096]; +ld.shared.f64 fd275, [r41+8192]; +ld.shared.f64 fd276, [r41+12288]; +add.f64 %0, fd269, fd271; +add.f64 %1, fd273, fd275; +add.f64 %2, fd270, fd272; +add.f64 %3, fd274, fd276; +sub.f64 %4, fd269, fd271; +sub.f64 %5, fd273, fd275; +sub.f64 %6, fd270, fd272; +sub.f64 %7, fd274, fd276; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_2048), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<489, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<41>; +.reg .f64 fd<325>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +sub.f64 fd25, 
fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+8192]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 32704; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+8192]; +ld.shared.v2.f64 {fd69, fd70}, [r13+16384]; +ld.shared.v2.f64 {fd73, fd74}, [r13+24576]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, 
rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+2048]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 32512; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+8192]; +ld.shared.v2.f64 {fd129, fd130}, [r20+16384]; +ld.shared.v2.f64 {fd133, fd134}, [r20+24576]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 496; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, 
fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+512]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 31744; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+8192]; +ld.shared.v2.f64 {fd189, fd190}, [r26+16384]; +ld.shared.v2.f64 {fd193, fd194}, [r26+24576]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +add.f64 fd207, fd199, fd204; +sub.f64 fd208, fd200, fd203; +sub.f64 fd209, fd199, fd204; +add.f64 fd210, fd200, fd203; +and.b32 r27, r5, 448; +bfe.u32 r28, r5, 6, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd211, fd207; +mul.f64 fd216, fd212, fd208; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd220, fd205; +mul.f64 fd224, fd222, fd206; 
+mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+128]; +mul.f64 fd230, fd226, fd209; +mul.f64 fd231, fd227, fd210; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 28672; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd212, fd207, fd217; +sub.f64 fd236, fd215, fd216; +st.shared.v2.f64 [r32+1024], {fd236, fd235}; +fma.rn.f64 fd237, fd222, fd205, fd225; +sub.f64 fd238, fd223, fd224; +st.shared.v2.f64 [r32+2048], {fd238, fd237}; +fma.rn.f64 fd239, fd227, fd209, fd232; +sub.f64 fd240, fd230, fd231; +st.shared.v2.f64 [r32+3072], {fd240, fd239}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+8192]; +ld.shared.v2.f64 {fd249, fd250}, [r33+16384]; +ld.shared.v2.f64 {fd253, fd254}, [r33+24576]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +sub.f64 fd265, fd257, fd261; +sub.f64 fd266, fd258, fd262; +add.f64 fd267, fd259, fd264; +sub.f64 fd268, fd260, fd263; +sub.f64 fd269, fd259, fd264; +add.f64 fd270, fd260, fd263; +and.b32 r34, r5, 256; +bfe.u32 r35, r5, 8, 1; +mul.wide.u32 rd15, r35, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd271, fd272}, [rd17]; +mul.f64 fd275, fd271, fd267; +mul.f64 fd276, fd272, fd268; +mul.f64 fd277, fd271, fd268; +mul.f64 fd278, fd271, fd271; +mul.f64 fd279, fd272, fd272; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd272, fd271; +fma.rn.f64 fd282, fd272, fd271, fd281; +mul.f64 fd283, fd280, fd265; +mul.f64 fd284, fd282, fd266; +mul.f64 fd285, fd280, fd266; +ld.global.v2.f64 {fd286, fd287}, [rd17+32]; +mul.f64 fd290, fd286, fd269; +mul.f64 fd291, fd287, fd270; +mul.f64 
fd292, fd286, fd270; +and.b32 r36, r10, 4080; +add.s32 r37, r9, r36; +barrier.sync 0; +and.b32 r38, r7, 16384; +add.s32 r39, r37, r38; +add.f64 fd293, fd258, fd262; +add.f64 fd294, fd257, fd261; +st.shared.v2.f64 [r39], {fd294, fd293}; +fma.rn.f64 fd295, fd272, fd267, fd277; +sub.f64 fd296, fd275, fd276; +st.shared.v2.f64 [r39+4096], {fd296, fd295}; +fma.rn.f64 fd297, fd282, fd265, fd285; +sub.f64 fd298, fd283, fd284; +st.shared.v2.f64 [r39+8192], {fd298, fd297}; +fma.rn.f64 fd299, fd287, fd269, fd292; +sub.f64 fd300, fd290, fd291; +st.shared.v2.f64 [r39+12288], {fd300, fd299}; +barrier.sync 0; +mad.lo.s32 r40, r34, -48, r39; +ld.shared.v2.f64 {fd301, fd302}, [r40]; +ld.shared.v2.f64 {fd305, fd306}, [r40+8192]; +ld.shared.v2.f64 {fd309, fd310}, [r40+16384]; +ld.shared.v2.f64 {fd313, fd314}, [r40+24576]; +add.f64 %1, fd302, fd310; +add.f64 %0, fd301, fd309; +add.f64 %3, fd306, fd314; +add.f64 %2, fd305, fd313; +sub.f64 %5, fd302, fd310; +sub.f64 %4, fd301, fd309; +sub.f64 %7, fd306, fd314; +sub.f64 %6, fd305, fd313; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_2048), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<490, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<77>; +.reg .f64 fd<183>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %15, %17; +add.f64 fd10, %16, %18; +sub.f64 fd11, %15, %17; +sub.f64 fd12, %16, %18; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, 
fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 16368; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 8184; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+8192]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+8192]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16352; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 8176; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+8192]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+8192]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16320; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; 
+and.b32 r26, r11, 8160; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+8192]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+8192]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 7; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16256; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 8128; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+8192]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+8192]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 1008; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 16128; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 8064; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+8192]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+8192]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, 
fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd98, fd96; +mul.f64 fd103, fd99, fd97; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd98, fd97; +fma.rn.f64 fd106, fd99, fd96, fd105; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15872; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd104; +barrier.sync 0; +and.b32 r47, r11, 7936; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+8192]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+8192]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd115, fd113; +mul.f64 fd120, fd116, fd114; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd115, fd114; +fma.rn.f64 fd123, fd116, fd113, fd122; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 15360; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd121; +barrier.sync 0; +and.b32 r54, r11, 7680; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+8192]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+8192]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd132, fd130; +mul.f64 
fd137, fd133, fd131; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd132, fd131; +fma.rn.f64 fd140, fd133, fd130, fd139; +and.b32 r57, r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 14336; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd138; +barrier.sync 0; +and.b32 r61, r11, 7168; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+8192]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+8192]; +add.f64 fd145, fd141, fd142; +add.f64 fd146, fd143, fd144; +sub.f64 fd147, fd141, fd142; +sub.f64 fd148, fd143, fd144; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd149, fd150}, [rd29]; +mul.f64 fd153, fd149, fd147; +mul.f64 fd154, fd150, fd148; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd149, fd148; +fma.rn.f64 fd157, fd150, fd147, fd156; +and.b32 r64, r11, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 12288; +add.s32 r67, r65, r66; +st.shared.f64 [r67], fd145; +st.shared.f64 [r67+2048], fd155; +barrier.sync 0; +and.b32 r68, r11, 6144; +sub.s32 r69, r67, r68; +ld.shared.f64 fd158, [r69]; +ld.shared.f64 fd159, [r69+8192]; +barrier.sync 0; +st.shared.f64 [r67], fd146; +st.shared.f64 [r67+2048], fd157; +barrier.sync 0; +ld.shared.f64 fd160, [r69]; +ld.shared.f64 fd161, [r69+8192]; +add.f64 fd162, fd158, fd159; +add.f64 fd163, fd160, fd161; +sub.f64 fd164, fd158, fd159; +sub.f64 fd165, fd160, fd161; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 16; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f64 {fd166, fd167}, [rd32]; +mul.f64 fd170, fd166, fd164; +mul.f64 fd171, fd167, fd165; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd166, fd165; +fma.rn.f64 fd174, fd167, fd164, fd173; +and.b32 r71, r11, 4088; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 8192; +add.s32 r74, 
r72, r73; +st.shared.f64 [r74], fd162; +st.shared.f64 [r74+4096], fd172; +barrier.sync 0; +and.b32 r75, r11, 4096; +sub.s32 r76, r74, r75; +ld.shared.f64 fd175, [r76]; +ld.shared.f64 fd176, [r76+8192]; +barrier.sync 0; +st.shared.f64 [r74], fd163; +st.shared.f64 [r74+4096], fd174; +barrier.sync 0; +ld.shared.f64 fd177, [r76]; +ld.shared.f64 fd178, [r76+8192]; +add.f64 %0, fd175, fd176; +add.f64 %1, fd177, fd178; +sub.f64 %2, fd175, fd176; +sub.f64 %3, fd177, fd178; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_2048), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<491, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<77>; +.reg .f64 fd<223>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %15, %17; +sub.f64 fd10, %16, %18; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -32768; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 32736; +add.s32 r11, r8, r10; +add.f64 fd18, %16, %18; +add.f64 fd19, %15, %17; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 16368; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+16384]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 16; +mov.u64 
rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32704; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 16352; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+16384]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 32640; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 16320; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+16384]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 7; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 32512; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 16256; +sub.s32 r34, r32, r33; 
+ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+16384]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 1008; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 32256; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 16128; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+16384]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd116, fd114; +mul.f64 fd121, fd117, fd115; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 31744; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd117, fd114, fd122; +sub.f64 fd126, fd120, fd121; +st.shared.v2.f64 [r46+512], {fd126, fd125}; +barrier.sync 0; +and.b32 r47, r9, 15872; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+16384]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd137, fd135; +mul.f64 fd142, fd138, fd136; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 30720; +add.s32 r53, 
r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd138, fd135, fd143; +sub.f64 fd147, fd141, fd142; +st.shared.v2.f64 [r53+1024], {fd147, fd146}; +barrier.sync 0; +and.b32 r54, r9, 15360; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+16384]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd158, fd156; +mul.f64 fd163, fd159, fd157; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 28672; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd159, fd156, fd164; +sub.f64 fd168, fd162, fd163; +st.shared.v2.f64 [r60+2048], {fd168, fd167}; +barrier.sync 0; +and.b32 r61, r9, 14336; +sub.s32 r62, r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, fd174}, [r62+16384]; +sub.f64 fd177, fd169, fd173; +sub.f64 fd178, fd170, fd174; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd179, fd180}, [rd29]; +mul.f64 fd183, fd179, fd177; +mul.f64 fd184, fd180, fd178; +mul.f64 fd185, fd179, fd178; +and.b32 r64, r9, 4080; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 24576; +add.s32 r67, r65, r66; +add.f64 fd186, fd170, fd174; +add.f64 fd187, fd169, fd173; +st.shared.v2.f64 [r67], {fd187, fd186}; +fma.rn.f64 fd188, fd180, fd177, fd185; +sub.f64 fd189, fd183, fd184; +st.shared.v2.f64 [r67+4096], {fd189, fd188}; +barrier.sync 0; +and.b32 r68, r9, 12288; +sub.s32 r69, r67, r68; +ld.shared.v2.f64 {fd190, fd191}, [r69]; +ld.shared.v2.f64 {fd194, fd195}, [r69+16384]; +sub.f64 fd198, fd190, fd194; +sub.f64 fd199, fd191, fd195; 
+bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 16; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f64 {fd200, fd201}, [rd32]; +mul.f64 fd204, fd200, fd198; +mul.f64 fd205, fd201, fd199; +mul.f64 fd206, fd200, fd199; +and.b32 r71, r9, 8176; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 16384; +add.s32 r74, r72, r73; +add.f64 fd207, fd191, fd195; +add.f64 fd208, fd190, fd194; +st.shared.v2.f64 [r74], {fd208, fd207}; +fma.rn.f64 fd209, fd201, fd198, fd206; +sub.f64 fd210, fd204, fd205; +st.shared.v2.f64 [r74+8192], {fd210, fd209}; +barrier.sync 0; +and.b32 r75, r9, 8192; +sub.s32 r76, r74, r75; +ld.shared.v2.f64 {fd211, fd212}, [r76]; +ld.shared.v2.f64 {fd215, fd216}, [r76+16384]; +add.f64 %1, fd212, fd216; +add.f64 %0, fd211, fd215; +sub.f64 %3, fd212, fd216; +sub.f64 %2, fd211, fd215; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_2048), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..6df1225b69b31 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2048_fp64_inv.hpp.inc @@ -0,0 +1,4121 @@ +#ifndef CUFFTDX_FFT_2048_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_2048_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<655, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<488>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; 
+shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; 
+mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+4096]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16320; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], 
{fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+2048]; +ld.shared.f64 fd160, [r13+4096]; +ld.shared.f64 fd161, [r13+6144]; +ld.shared.f64 fd162, [r13+8192]; +ld.shared.f64 fd163, [r13+10240]; +ld.shared.f64 fd164, [r13+12288]; +ld.shared.f64 fd165, [r13+14336]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+2048]; +ld.shared.f64 fd168, [r13+4096]; +ld.shared.f64 fd169, [r13+6144]; +ld.shared.f64 fd170, [r13+8192]; +ld.shared.f64 fd171, [r13+10240]; +ld.shared.f64 fd172, [r13+12288]; +ld.shared.f64 fd173, [r13+14336]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 
0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 248; +bfe.u32 r15, r5, 3, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+512]; 
+mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 15872; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+2048]; +ld.shared.f64 fd301, [r21+4096]; +ld.shared.f64 fd302, [r21+6144]; +ld.shared.f64 fd303, [r21+8192]; +ld.shared.f64 fd304, [r21+10240]; +ld.shared.f64 fd305, [r21+12288]; +ld.shared.f64 fd306, [r21+14336]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+2048]; +ld.shared.f64 fd309, [r21+4096]; +ld.shared.f64 fd310, [r21+6144]; +ld.shared.f64 fd311, [r21+8192]; +ld.shared.f64 fd312, 
[r21+10240]; +ld.shared.f64 fd313, [r21+12288]; +ld.shared.f64 fd314, [r21+14336]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +sub.f64 fd327, fd317, fd322; +add.f64 fd328, fd318, fd321; +add.f64 fd329, fd317, fd322; +sub.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +sub.f64 fd343, fd333, fd338; +add.f64 fd344, fd334, fd337; +add.f64 fd345, fd333, fd338; +sub.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0d3FE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +add.f64 fd350, fd347, fd348; +mul.f64 fd351, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd352, fd346, 0d3FE6A09E667F3BCD; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd346, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd355, fd345, 0d3FE6A09E667F3BCD, fd354; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd350; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd350; +sub.f64 fd364, fd325, fd342; +add.f64 fd365, fd326, fd341; +add.f64 fd366, fd325, fd342; +sub.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd353; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd353; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 192; +bfe.u32 r23, r5, 6, 2; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, 
rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd361, fd373; +fma.rn.f64 fd377, fd372, fd360, fd376; +mul.f64 fd378, fd360, fd373; +mul.f64 fd379, fd372, fd361; +sub.f64 fd380, fd379, fd378; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd365, fd385; +fma.rn.f64 fd387, fd383, fd364, fd386; +mul.f64 fd388, fd364, fd385; +mul.f64 fd389, fd383, fd365; +sub.f64 fd390, fd389, fd388; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd369, fd395; +fma.rn.f64 fd397, fd393, fd368, fd396; +mul.f64 fd398, fd368, fd395; +mul.f64 fd399, fd393, fd369; +sub.f64 fd400, fd399, fd398; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd359, fd405; +fma.rn.f64 fd407, fd403, fd358, fd406; +mul.f64 fd408, fd358, fd405; +mul.f64 fd409, fd403, fd359; +sub.f64 fd410, fd409, fd408; +ld.global.v2.f64 {fd411, fd412}, [rd11+64]; +mul.f64 fd415, fd363, fd412; +fma.rn.f64 fd416, fd411, fd362, fd415; +mul.f64 fd417, fd362, fd412; +mul.f64 fd418, fd411, fd363; +sub.f64 fd419, fd418, fd417; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd367, fd424; +fma.rn.f64 fd426, fd422, fd366, fd425; +mul.f64 fd427, fd366, fd424; +mul.f64 fd428, fd422, fd367; +sub.f64 fd429, fd428, fd427; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd371, fd434; +fma.rn.f64 fd436, fd432, fd370, fd435; +mul.f64 fd437, fd370, fd434; +mul.f64 fd438, fd432, fd371; +sub.f64 fd439, 
fd438, fd437; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 12288; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd377; +st.shared.f64 [r27+1024], fd387; +st.shared.f64 [r27+1536], fd397; +st.shared.f64 [r27+2048], fd407; +st.shared.f64 [r27+2560], fd416; +st.shared.f64 [r27+3072], fd426; +st.shared.f64 [r27+3584], fd436; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+2048]; +ld.shared.f64 fd442, [r28+4096]; +ld.shared.f64 fd443, [r28+6144]; +ld.shared.f64 fd444, [r28+8192]; +ld.shared.f64 fd445, [r28+10240]; +ld.shared.f64 fd446, [r28+12288]; +ld.shared.f64 fd447, [r28+14336]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+2048]; +ld.shared.f64 fd450, [r28+4096]; +ld.shared.f64 fd451, [r28+6144]; +ld.shared.f64 fd452, [r28+8192]; +ld.shared.f64 fd453, [r28+10240]; +ld.shared.f64 fd454, [r28+12288]; +ld.shared.f64 fd455, [r28+14336]; +add.f64 fd456, fd440, fd444; +add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd441, fd445; +add.f64 fd465, fd449, fd453; +sub.f64 fd466, fd441, fd445; +sub.f64 fd467, fd449, fd453; +add.f64 fd468, fd443, fd447; +add.f64 fd469, fd451, fd455; +sub.f64 fd470, fd443, fd447; +sub.f64 fd471, fd451, fd455; +add.f64 %0, fd456, fd460; +add.f64 %1, fd457, fd461; +add.f64 %2, fd464, fd468; +add.f64 %3, fd465, fd469; +add.f64 %5, fd459, fd462; +sub.f64 %4, fd458, fd463; +add.f64 %7, fd467, fd470; +sub.f64 %6, fd466, fd471; +sub.f64 %8, fd456, 
fd460; +sub.f64 %9, fd457, fd461; +sub.f64 %10, fd464, fd468; +sub.f64 %11, fd465, fd469; +sub.f64 %13, fd459, fd462; +add.f64 %12, fd458, fd463; +sub.f64 %15, fd467, fd470; +add.f64 %14, fd466, fd471; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_2048), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<656, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<889>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, 
fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 
fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, 
fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; 
+sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+2048]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; 
+mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16256; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, 
[r13]; +ld.shared.f64 fd389, [r13+1024]; +ld.shared.f64 fd390, [r13+2048]; +ld.shared.f64 fd391, [r13+3072]; +ld.shared.f64 fd392, [r13+4096]; +ld.shared.f64 fd393, [r13+5120]; +ld.shared.f64 fd394, [r13+6144]; +ld.shared.f64 fd395, [r13+7168]; +ld.shared.f64 fd396, [r13+8192]; +ld.shared.f64 fd397, [r13+9216]; +ld.shared.f64 fd398, [r13+10240]; +ld.shared.f64 fd399, [r13+11264]; +ld.shared.f64 fd400, [r13+12288]; +ld.shared.f64 fd401, [r13+13312]; +ld.shared.f64 fd402, [r13+14336]; +ld.shared.f64 fd403, [r13+15360]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+1024]; +ld.shared.f64 fd406, [r13+2048]; +ld.shared.f64 fd407, [r13+3072]; +ld.shared.f64 fd408, [r13+4096]; +ld.shared.f64 fd409, [r13+5120]; +ld.shared.f64 fd410, [r13+6144]; +ld.shared.f64 fd411, [r13+7168]; +ld.shared.f64 fd412, [r13+8192]; +ld.shared.f64 fd413, [r13+9216]; +ld.shared.f64 fd414, [r13+10240]; +ld.shared.f64 fd415, [r13+11264]; +ld.shared.f64 fd416, [r13+12288]; +ld.shared.f64 fd417, [r13+13312]; +ld.shared.f64 fd418, [r13+14336]; +ld.shared.f64 fd419, [r13+15360]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; 
+sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, 
fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 
0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 112; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, 
fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd566, fd668; +fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+128]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 
fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; 
+and.b32 r18, r8, 14336; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; +st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+1024]; +ld.shared.f64 fd745, [r20+2048]; +ld.shared.f64 fd746, [r20+3072]; +ld.shared.f64 fd747, [r20+4096]; +ld.shared.f64 fd748, [r20+5120]; +ld.shared.f64 fd749, [r20+6144]; +ld.shared.f64 fd750, [r20+7168]; +ld.shared.f64 fd751, [r20+8192]; +ld.shared.f64 fd752, [r20+9216]; +ld.shared.f64 fd753, [r20+10240]; +ld.shared.f64 fd754, [r20+11264]; +ld.shared.f64 fd755, [r20+12288]; +ld.shared.f64 fd756, [r20+13312]; +ld.shared.f64 fd757, [r20+14336]; +ld.shared.f64 fd758, [r20+15360]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+1024]; +ld.shared.f64 fd761, [r20+2048]; +ld.shared.f64 fd762, [r20+3072]; +ld.shared.f64 fd763, [r20+4096]; +ld.shared.f64 fd764, [r20+5120]; +ld.shared.f64 fd765, [r20+6144]; +ld.shared.f64 fd766, 
[r20+7168]; +ld.shared.f64 fd767, [r20+8192]; +ld.shared.f64 fd768, [r20+9216]; +ld.shared.f64 fd769, [r20+10240]; +ld.shared.f64 fd770, [r20+11264]; +ld.shared.f64 fd771, [r20+12288]; +ld.shared.f64 fd772, [r20+13312]; +ld.shared.f64 fd773, [r20+14336]; +ld.shared.f64 fd774, [r20+15360]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +sub.f64 fd787, fd777, fd782; +add.f64 fd788, fd778, fd781; +add.f64 fd789, fd777, fd782; +sub.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +sub.f64 fd803, fd793, fd798; +add.f64 fd804, fd794, fd797; +add.f64 fd805, fd793, fd798; +sub.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0d3FE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +add.f64 fd810, fd807, fd808; +mul.f64 fd811, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd812, fd806, 0d3FE6A09E667F3BCD; +sub.f64 fd813, fd811, fd812; +mul.f64 fd814, fd806, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd815, fd805, 0d3FE6A09E667F3BCD, fd814; +add.f64 fd816, fd744, fd752; +add.f64 fd817, fd760, fd768; +sub.f64 fd818, fd744, fd752; +sub.f64 fd819, fd760, fd768; +add.f64 fd820, fd748, fd756; +add.f64 fd821, fd764, fd772; +sub.f64 fd822, fd748, fd756; +sub.f64 fd823, fd764, fd772; +add.f64 fd824, fd816, fd820; +add.f64 fd825, fd817, fd821; +sub.f64 fd826, fd816, fd820; +sub.f64 fd827, fd817, fd821; +sub.f64 fd828, fd818, fd823; 
+add.f64 fd829, fd819, fd822; +add.f64 fd830, fd818, fd823; +sub.f64 fd831, fd819, fd822; +add.f64 fd832, fd746, fd754; +add.f64 fd833, fd762, fd770; +sub.f64 fd834, fd746, fd754; +sub.f64 fd835, fd762, fd770; +add.f64 fd836, fd750, fd758; +add.f64 fd837, fd766, fd774; +sub.f64 fd838, fd750, fd758; +sub.f64 fd839, fd766, fd774; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +sub.f64 fd844, fd834, fd839; +add.f64 fd845, fd835, fd838; +add.f64 fd846, fd834, fd839; +sub.f64 fd847, fd835, fd838; +mul.f64 fd848, fd844, 0d3FE6A09E667F3BCD; +mul.f64 fd849, fd845, 0d3FE6A09E667F3BCD; +sub.f64 fd850, fd848, fd849; +add.f64 fd851, fd848, fd849; +mul.f64 fd852, fd846, 0dBFE6A09E667F3BCD; +mul.f64 fd853, fd847, 0d3FE6A09E667F3BCD; +sub.f64 fd854, fd852, fd853; +mul.f64 fd855, fd847, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd856, fd846, 0d3FE6A09E667F3BCD, fd855; +add.f64 %0, fd783, fd799; +add.f64 %1, fd784, fd800; +add.f64 %2, fd824, fd840; +add.f64 %3, fd825, fd841; +add.f64 %5, fd788, fd810; +add.f64 %4, fd787, fd809; +add.f64 %7, fd829, fd851; +add.f64 %6, fd828, fd850; +add.f64 %9, fd786, fd801; +sub.f64 %8, fd785, fd802; +add.f64 %11, fd827, fd842; +sub.f64 %10, fd826, fd843; +add.f64 %13, fd790, fd815; +add.f64 %12, fd789, fd813; +add.f64 %15, fd831, fd856; +add.f64 %14, fd830, fd854; +sub.f64 %16, fd783, fd799; +sub.f64 %17, fd784, fd800; +sub.f64 %18, fd824, fd840; +sub.f64 %19, fd825, fd841; +sub.f64 %21, fd788, fd810; +sub.f64 %20, fd787, fd809; +sub.f64 %23, fd829, fd851; +sub.f64 %22, fd828, fd850; +sub.f64 %25, fd786, fd801; +add.f64 %24, fd785, fd802; +sub.f64 %27, fd827, fd842; +add.f64 %26, fd826, fd843; +sub.f64 %29, fd790, fd815; +sub.f64 %28, fd789, fd813; +sub.f64 %31, fd831, fd856; +sub.f64 %30, fd830, fd854; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), 
"=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_2048), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<657, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<1151>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1142, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1140, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1139, fd1142, fd1140; +sub.f64 fd76, fd1142, fd1140; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd1138, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1135, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; 
+sub.f64 fd87, %47, %63; +add.f64 fd1133, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1132, fd1135, fd1133; +sub.f64 fd92, fd1135, fd1133; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd1131, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd1131, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd1129, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd1130, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd1129, fd1130; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1128, fd1139, fd1132; +sub.f64 fd109, fd1139, fd1132; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1127, fd1138, fd100; +sub.f64 fd113, fd1138, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd1126, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd1125, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1123, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1120, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1119, fd1123, fd1120; +sub.f64 fd133, fd1123, fd1120; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd1118, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1116, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1114, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1113, fd1116, fd1114; +sub.f64 fd149, fd1116, fd1114; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd1112, 
fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd1112, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd1110, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd1111, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd1110, fd1111; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1109, fd1119, fd1113; +sub.f64 fd166, fd1119, fd1113; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1108, fd1118, fd157; +sub.f64 fd170, fd1118, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd1107, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd1106, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1104, fd167, 0d3FED906BCF328D46; +mul.f64 fd1105, fd1108, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd1104, fd1105; +mul.f64 fd182, fd1108, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd1107, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd1106, 0d3FED906BCF328D46; +mul.f64 fd1103, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd1103, fd189; +mul.f64 fd191, fd1106, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd1102, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd1102, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd1100, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd1101, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd1100, fd1101; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd1098, fd177, 0dBFED906BCF328D46; +mul.f64 fd1099, fd178, 0d3FD87DE2A6AEA963; 
+sub.f64 fd205, fd1098, fd1099; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1097, fd1127, fd183; +sub.f64 fd213, fd1127, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1096, fd1126, fd187; +sub.f64 fd217, fd1126, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd1095, fd1125, fd192; +sub.f64 fd221, fd1125, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd1094, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd1093, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd1092, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1091, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd1097, fd239; +mul.f64 fd244, fd238, fd1097; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1090, fd238, fd238; +sub.f64 fd247, fd1090, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd1096, fd249; +mul.f64 fd252, fd247, fd1096; +mul.f64 fd1088, fd238, fd247; +mul.f64 fd1089, fd239, fd249; +sub.f64 fd255, fd1088, fd1089; +mul.f64 fd1087, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd1095, fd257; +mul.f64 fd260, fd255, fd1095; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1086, fd238, fd255; +sub.f64 fd263, fd1086, fd262; +mul.f64 fd1085, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd1094, fd265; +mul.f64 
fd268, fd263, fd1094; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1084, fd238, fd263; +sub.f64 fd271, fd1084, fd270; +mul.f64 fd1083, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd1093, fd273; +mul.f64 fd276, fd271, fd1093; +mul.f64 fd1081, fd238, fd271; +mul.f64 fd1082, fd239, fd273; +sub.f64 fd279, fd1081, fd1082; +mul.f64 fd1080, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd1092, fd281; +mul.f64 fd284, fd279, fd1092; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1079, fd238, fd279; +sub.f64 fd287, fd1079, fd286; +mul.f64 fd1078, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd1091, fd289; +mul.f64 fd292, fd287, fd1091; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1077, fd238, fd287; +sub.f64 fd295, fd1077, fd294; +mul.f64 fd1076, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1075, fd1128, fd1109; +mul.f64 fd298, fd1075, fd297; +sub.f64 fd1074, fd106, fd163; +mul.f64 fd299, fd1074, fd297; +mul.f64 fd300, fd295, fd1075; +ld.global.v2.f64 {fd301, fd302}, [rd5+2048]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1072, fd238, fd301; +mul.f64 fd1073, fd239, fd302; +sub.f64 fd310, fd1072, fd1073; +mul.f64 fd1071, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1070, fd238, fd310; +sub.f64 fd318, fd1070, fd317; +mul.f64 fd1069, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1067, fd238, fd318; +mul.f64 fd1068, fd239, fd320; +sub.f64 fd326, fd1067, fd1068; +mul.f64 fd1066, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, 
fd326, fd225; +mul.f64 fd1064, fd238, fd326; +mul.f64 fd1065, fd239, fd328; +sub.f64 fd334, fd1064, fd1065; +mul.f64 fd1063, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1062, fd238, fd334; +sub.f64 fd342, fd1062, fd341; +mul.f64 fd1061, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1059, fd238, fd342; +mul.f64 fd1060, fd239, fd344; +sub.f64 fd350, fd1059, fd1060; +mul.f64 fd1058, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1057, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 32512; +add.s32 r12, r9, r11; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 8; +add.f64 fd356, fd1128, fd1109; +sub.f64 fd1148, fd106, fd163; +and.b32 r23, r34, 32512; +add.s32 r22, r9, r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 4; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd1057; +st.shared.v2.f64 [r22+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd1087; +st.shared.v2.f64 [r22+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd1085; +st.shared.v2.f64 [r22+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd1083; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r22+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd1080; +st.shared.v2.f64 [r22+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd1078; +st.shared.v2.f64 [r22+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd1076; +st.shared.v2.f64 [r22+112], {fd370, fd371}; +fma.rn.f64 fd372, 
fd295, fd1148, fd298; +sub.f64 fd373, fd300, fd299; +st.shared.v2.f64 [r22+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd1071; +st.shared.v2.f64 [r22+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd1069; +st.shared.v2.f64 [r22+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd1066; +st.shared.v2.f64 [r22+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd1063; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r22+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd1061; +st.shared.v2.f64 [r22+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd1058; +st.shared.v2.f64 [r22+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r22+240], {fd386, fd387}; +barrier.sync 0; +and.b32 r20, r33, 127; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+2048]; +ld.shared.v2.f64 {fd396, fd397}, [r13+4096]; +ld.shared.v2.f64 {fd400, fd401}, [r13+6144]; +ld.shared.v2.f64 {fd404, fd405}, [r13+8192]; +ld.shared.v2.f64 {fd408, fd409}, [r13+10240]; +ld.shared.v2.f64 {fd412, fd413}, [r13+12288]; +ld.shared.v2.f64 {fd416, fd417}, [r13+14336]; +ld.shared.v2.f64 {fd420, fd421}, [r13+16384]; +ld.shared.v2.f64 {fd424, fd425}, [r13+18432]; +ld.shared.v2.f64 {fd428, fd429}, [r13+20480]; +ld.shared.v2.f64 {fd432, fd433}, [r13+22528]; +ld.shared.v2.f64 {fd436, fd437}, [r13+24576]; +ld.shared.v2.f64 {fd440, fd441}, [r13+26624]; +ld.shared.v2.f64 {fd444, fd445}, [r13+28672]; +ld.shared.v2.f64 {fd448, fd449}, [r13+30720]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1056, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1055, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; 
+sub.f64 fd462, fd452, fd456; +add.f64 fd1054, fd1056, fd1055; +sub.f64 fd463, fd1056, fd1055; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd1053, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1052, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1051, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1050, fd1052, fd1051; +sub.f64 fd479, fd1052, fd1051; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd1049, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd1049, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd1048, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd1048, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1047, fd1054, fd1050; +sub.f64 fd496, fd1054, fd1050; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1046, fd1053, fd487; +sub.f64 fd500, fd1053, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd1045, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd1044, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1043, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1042, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1041, fd1043, fd1042; +sub.f64 fd520, fd1043, fd1042; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd1040, 
fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1039, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1038, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1037, fd1039, fd1038; +sub.f64 fd536, fd1039, fd1038; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd1036, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd1036, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd1035, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd1035, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1034, fd1041, fd1037; +sub.f64 fd553, fd1041, fd1037; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1033, fd1040, fd544; +sub.f64 fd557, fd1040, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd1032, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd1031, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1029, fd554, 0d3FED906BCF328D46; +mul.f64 fd1030, fd1033, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd1029, fd1030; +mul.f64 fd569, fd1033, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; +mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd1032, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd1027, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd1028, fd1031, 0d3FED906BCF328D46; +sub.f64 fd577, fd1027, fd1028; +mul.f64 fd578, fd1031, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd1025, fd556, 
0dBFD87DE2A6AEA963; +mul.f64 fd1026, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd1025, fd1026; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd1023, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd1024, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd1023, fd1024; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd1022, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1022, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1021, fd1046, fd570; +sub.f64 fd600, fd1046, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1020, fd1045, fd574; +sub.f64 fd604, fd1045, fd574; +add.f64 fd605, fd505, fd577; +sub.f64 fd607, fd505, fd577; +add.f64 fd1019, fd1044, fd579; +sub.f64 fd608, fd1044, fd579; +sub.f64 fd609, fd495, fd553; +add.f64 fd611, fd495, fd553; +add.f64 fd1018, fd496, fd552; +sub.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd582; +sub.f64 fd615, fd499, fd582; +add.f64 fd1017, fd500, fd584; +sub.f64 fd616, fd500, fd584; +add.f64 fd617, fd503, fd587; +sub.f64 fd619, fd503, fd587; +add.f64 fd1016, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1015, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r33, 112; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd629, fd1021, fd626; +mul.f64 fd631, fd625, fd1021; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1014, fd625, fd625; +sub.f64 fd634, fd1014, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd637, fd1020, fd636; +mul.f64 fd639, fd634, fd1020; +mul.f64 fd1012, fd625, fd634; +mul.f64 fd1013, fd626, fd636; +sub.f64 fd642, fd1012, fd1013; +mul.f64 
fd1011, fd601, fd636; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd645, fd1019, fd644; +mul.f64 fd647, fd642, fd1019; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1010, fd625, fd642; +sub.f64 fd650, fd1010, fd649; +mul.f64 fd1009, fd605, fd644; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd653, fd1018, fd652; +mul.f64 fd655, fd650, fd1018; +mul.f64 fd1007, fd625, fd650; +mul.f64 fd1008, fd626, fd652; +sub.f64 fd658, fd1007, fd1008; +mul.f64 fd1006, fd609, fd652; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd661, fd1017, fd660; +mul.f64 fd663, fd658, fd1017; +mul.f64 fd1004, fd625, fd658; +mul.f64 fd1005, fd626, fd660; +sub.f64 fd666, fd1004, fd1005; +mul.f64 fd1003, fd613, fd660; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd669, fd1016, fd668; +mul.f64 fd671, fd666, fd1016; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1002, fd625, fd666; +sub.f64 fd674, fd1002, fd673; +mul.f64 fd1001, fd617, fd668; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd677, fd1015, fd676; +mul.f64 fd679, fd674, fd1015; +mul.f64 fd999, fd625, fd674; +mul.f64 fd1000, fd626, fd676; +sub.f64 fd682, fd999, fd1000; +mul.f64 fd998, fd621, fd676; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd997, fd1047, fd1034; +mul.f64 fd685, fd997, fd684; +sub.f64 fd996, fd493, fd550; +mul.f64 fd686, fd996, fd684; +mul.f64 fd687, fd682, fd997; +ld.global.v2.f64 {fd688, fd689}, [rd8+128]; +mul.f64 fd692, fd600, fd689; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd995, fd625, fd688; +sub.f64 fd697, fd995, fd696; +mul.f64 fd994, fd599, fd689; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd700, fd604, fd699; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd993, fd625, fd697; +sub.f64 fd705, fd993, fd704; +mul.f64 fd992, fd603, fd699; 
+mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd708, fd608, fd707; +mul.f64 fd710, fd705, fd608; +mul.f64 fd990, fd625, fd705; +mul.f64 fd991, fd626, fd707; +sub.f64 fd713, fd990, fd991; +mul.f64 fd989, fd607, fd707; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd716, fd612, fd715; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd988, fd625, fd713; +sub.f64 fd721, fd988, fd720; +mul.f64 fd987, fd611, fd715; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd724, fd616, fd723; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd986, fd625, fd721; +sub.f64 fd729, fd986, fd728; +mul.f64 fd985, fd615, fd723; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd732, fd620, fd731; +mul.f64 fd734, fd729, fd620; +mul.f64 fd983, fd625, fd729; +mul.f64 fd984, fd626, fd731; +sub.f64 fd737, fd983, fd984; +mul.f64 fd982, fd619, fd731; +mul.f64 fd738, fd625, fd731; +mul.f64 fd981, fd597, fd626; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd624, fd739; +mul.f64 fd741, fd623, fd739; +mul.f64 fd742, fd737, fd624; +and.b32 r15, r32, 240; +add.s32 r16, r9, r15; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 8; +barrier.sync 0; +and.b32 r17, r27, 28672; +add.s32 r18, r16, r17; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 112; +add.f64 fd743, fd1047, fd1034; +sub.f64 fd1147, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r37, %tid.x; +and.b32 r36, r37, 112; +fma.rn.f64 fd745, fd625, fd597, fd629; +sub.f64 fd746, fd631, fd981; +st.shared.v2.f64 [r18+256], {fd745, fd746}; +fma.rn.f64 fd747, fd634, fd601, fd637; +sub.f64 fd748, fd639, fd1011; +st.shared.v2.f64 [r18+512], {fd747, fd748}; +fma.rn.f64 fd749, fd642, fd605, fd645; +sub.f64 fd750, fd647, fd1009; +st.shared.v2.f64 [r18+768], {fd749, fd750}; +fma.rn.f64 fd751, fd650, fd609, fd653; +sub.f64 fd752, fd655, fd1006; 
+st.shared.v2.f64 [r18+1024], {fd751, fd752}; +sub.f64 fd753, fd663, fd1003; +fma.rn.f64 fd754, fd658, fd613, fd661; +st.shared.v2.f64 [r18+1280], {fd754, fd753}; +fma.rn.f64 fd755, fd666, fd617, fd669; +sub.f64 fd756, fd671, fd1001; +st.shared.v2.f64 [r18+1536], {fd755, fd756}; +fma.rn.f64 fd757, fd674, fd621, fd677; +sub.f64 fd758, fd679, fd998; +st.shared.v2.f64 [r18+1792], {fd757, fd758}; +fma.rn.f64 fd759, fd682, fd1147, fd685; +sub.f64 fd760, fd687, fd686; +st.shared.v2.f64 [r18+2048], {fd759, fd760}; +fma.rn.f64 fd761, fd688, fd599, fd692; +sub.f64 fd762, fd694, fd994; +st.shared.v2.f64 [r18+2304], {fd761, fd762}; +fma.rn.f64 fd763, fd697, fd603, fd700; +sub.f64 fd764, fd702, fd992; +st.shared.v2.f64 [r18+2560], {fd763, fd764}; +fma.rn.f64 fd765, fd705, fd607, fd708; +sub.f64 fd766, fd710, fd989; +st.shared.v2.f64 [r18+2816], {fd765, fd766}; +fma.rn.f64 fd767, fd713, fd611, fd716; +sub.f64 fd768, fd718, fd987; +st.shared.v2.f64 [r18+3072], {fd767, fd768}; +sub.f64 fd769, fd726, fd985; +fma.rn.f64 fd770, fd721, fd615, fd724; +st.shared.v2.f64 [r18+3328], {fd770, fd769}; +fma.rn.f64 fd771, fd729, fd619, fd732; +sub.f64 fd772, fd734, fd982; +st.shared.v2.f64 [r18+3584], {fd771, fd772}; +fma.rn.f64 fd773, fd737, fd623, fd740; +sub.f64 fd774, fd742, fd741; +st.shared.v2.f64 [r18+3840], {fd773, fd774}; +barrier.sync 0; +mad.lo.s32 r19, r36, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+2048]; +ld.shared.v2.f64 {fd783, fd784}, [r19+4096]; +ld.shared.v2.f64 {fd787, fd788}, [r19+6144]; +ld.shared.v2.f64 {fd791, fd792}, [r19+8192]; +ld.shared.v2.f64 {fd795, fd796}, [r19+10240]; +ld.shared.v2.f64 {fd799, fd800}, [r19+12288]; +ld.shared.v2.f64 {fd803, fd804}, [r19+14336]; +ld.shared.v2.f64 {fd807, fd808}, [r19+16384]; +ld.shared.v2.f64 {fd811, fd812}, [r19+18432]; +ld.shared.v2.f64 {fd815, fd816}, [r19+20480]; +ld.shared.v2.f64 {fd819, fd820}, [r19+22528]; +ld.shared.v2.f64 {fd823, fd824}, [r19+24576]; +ld.shared.v2.f64 
{fd827, fd828}, [r19+26624]; +ld.shared.v2.f64 {fd831, fd832}, [r19+28672]; +ld.shared.v2.f64 {fd835, fd836}, [r19+30720]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd980, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd979, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd978, fd980, fd979; +sub.f64 fd850, fd980, fd979; +sub.f64 fd851, fd841, fd846; +add.f64 fd853, fd841, fd846; +add.f64 fd977, fd842, fd845; +sub.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd976, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd975, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd974, fd976, fd975; +sub.f64 fd866, fd976, fd975; +sub.f64 fd867, fd857, fd862; +add.f64 fd869, fd857, fd862; +add.f64 fd973, fd858, fd861; +sub.f64 fd870, fd858, fd861; +mul.f64 fd871, fd867, 0d3FE6A09E667F3BCD; +mul.f64 fd872, fd973, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd871, fd872; +add.f64 fd874, fd871, fd872; +mul.f64 fd876, fd870, 0d3FE6A09E667F3BCD; +mul.f64 fd972, fd869, 0dBFE6A09E667F3BCD; +sub.f64 fd877, fd972, fd876; +mul.f64 fd878, fd870, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd879, fd869, 0d3FE6A09E667F3BCD, fd878; +add.f64 fd880, fd779, fd811; +sub.f64 fd882, fd779, fd811; +add.f64 fd971, fd780, fd812; +sub.f64 fd883, fd780, fd812; +add.f64 fd884, fd795, fd827; +sub.f64 fd886, fd795, fd827; +add.f64 fd970, fd796, fd828; +sub.f64 fd887, fd796, fd828; +add.f64 fd888, fd880, fd884; +sub.f64 fd890, fd880, fd884; +add.f64 fd969, fd971, fd970; +sub.f64 fd891, fd971, fd970; +sub.f64 fd892, fd882, fd887; +add.f64 fd894, fd882, fd887; +add.f64 fd968, fd883, fd886; +sub.f64 fd895, fd883, fd886; +add.f64 fd896, fd787, fd819; +sub.f64 fd898, fd787, fd819; +add.f64 fd967, 
fd788, fd820; +sub.f64 fd899, fd788, fd820; +add.f64 fd900, fd803, fd835; +sub.f64 fd902, fd803, fd835; +add.f64 fd966, fd804, fd836; +sub.f64 fd903, fd804, fd836; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd965, fd967, fd966; +sub.f64 fd907, fd967, fd966; +sub.f64 fd908, fd898, fd903; +add.f64 fd910, fd898, fd903; +add.f64 fd964, fd899, fd902; +sub.f64 fd911, fd899, fd902; +mul.f64 fd912, fd908, 0d3FE6A09E667F3BCD; +mul.f64 fd913, fd964, 0d3FE6A09E667F3BCD; +sub.f64 fd914, fd912, fd913; +add.f64 fd915, fd912, fd913; +mul.f64 fd917, fd911, 0d3FE6A09E667F3BCD; +mul.f64 fd963, fd910, 0dBFE6A09E667F3BCD; +sub.f64 fd918, fd963, fd917; +mul.f64 fd919, fd911, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd920, fd910, 0d3FE6A09E667F3BCD, fd919; +add.f64 %0, fd847, fd863; +add.f64 %1, fd978, fd974; +add.f64 %2, fd888, fd904; +add.f64 %3, fd969, fd965; +add.f64 %4, fd851, fd873; +add.f64 %5, fd977, fd874; +add.f64 %6, fd892, fd914; +add.f64 %7, fd968, fd915; +add.f64 %9, fd850, fd865; +sub.f64 %8, fd849, fd866; +add.f64 %11, fd891, fd906; +sub.f64 %10, fd890, fd907; +add.f64 %12, fd853, fd877; +add.f64 %13, fd854, fd879; +add.f64 %14, fd894, fd918; +add.f64 %15, fd895, fd920; +sub.f64 %17, fd978, fd974; +sub.f64 %16, fd847, fd863; +sub.f64 %19, fd969, fd965; +sub.f64 %18, fd888, fd904; +sub.f64 %21, fd977, fd874; +sub.f64 %20, fd851, fd873; +sub.f64 %23, fd968, fd915; +sub.f64 %22, fd892, fd914; +sub.f64 %25, fd850, fd865; +add.f64 %24, fd849, fd866; +sub.f64 %27, fd891, fd906; +add.f64 %26, fd890, fd907; +sub.f64 %29, fd854, fd879; +sub.f64 %28, fd853, fd877; +sub.f64 %31, fd895, fd920; +sub.f64 %30, fd894, fd918; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), 
"=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_2048), "l"(lut_dp_16_128), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<658, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<536>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, 
fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+4096]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; 
+mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 32640; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+4096]; +ld.shared.v2.f64 {fd166, fd167}, [r13+8192]; +ld.shared.v2.f64 {fd170, fd171}, [r13+12288]; +ld.shared.v2.f64 {fd174, fd175}, [r13+16384]; +ld.shared.v2.f64 {fd178, fd179}, [r13+20480]; +ld.shared.v2.f64 {fd182, fd183}, [r13+24576]; +ld.shared.v2.f64 {fd186, fd187}, [r13+28672]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, 
fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 248; +bfe.u32 r15, r5, 3, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, 
fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+512]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 31744; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 
[r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+4096]; +ld.shared.v2.f64 {fd323, fd324}, [r20+8192]; +ld.shared.v2.f64 {fd327, fd328}, [r20+12288]; +ld.shared.v2.f64 {fd331, fd332}, [r20+16384]; +ld.shared.v2.f64 {fd335, fd336}, [r20+20480]; +ld.shared.v2.f64 {fd339, fd340}, [r20+24576]; +ld.shared.v2.f64 {fd343, fd344}, [r20+28672]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +sub.f64 fd359, fd349, fd354; +add.f64 fd360, fd350, fd353; +add.f64 fd361, fd349, fd354; +sub.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +sub.f64 fd375, fd365, fd370; +add.f64 fd376, fd366, fd369; +add.f64 fd377, fd365, fd370; +sub.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0d3FE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +add.f64 fd382, fd379, fd380; +mul.f64 fd383, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd384, fd378, 0d3FE6A09E667F3BCD; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd378, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd387, fd377, 0d3FE6A09E667F3BCD, fd386; 
+sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd382; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd382; +sub.f64 fd394, fd357, fd374; +add.f64 fd395, fd358, fd373; +add.f64 fd396, fd357, fd374; +sub.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd385; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd385; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 192; +bfe.u32 r22, r5, 6, 2; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd391, fd403; +mul.f64 fd407, fd390, fd403; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd395, fd413; +mul.f64 fd415, fd394, fd413; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd399, fd421; +mul.f64 fd423, fd398, fd421; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd389, fd429; +mul.f64 fd431, fd388, fd429; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+64]; +mul.f64 fd437, fd393, fd434; +mul.f64 fd438, fd392, fd434; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd397, fd444; +mul.f64 fd446, fd396, fd444; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd401, fd452; 
+mul.f64 fd454, fd400, fd452; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 24576; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd402, fd390, fd406; +sub.f64 fd459, fd408, fd407; +st.shared.v2.f64 [r26+1024], {fd458, fd459}; +fma.rn.f64 fd460, fd411, fd394, fd414; +sub.f64 fd461, fd416, fd415; +st.shared.v2.f64 [r26+2048], {fd460, fd461}; +fma.rn.f64 fd462, fd419, fd398, fd422; +sub.f64 fd463, fd424, fd423; +st.shared.v2.f64 [r26+3072], {fd462, fd463}; +sub.f64 fd464, fd432, fd431; +fma.rn.f64 fd465, fd427, fd388, fd430; +st.shared.v2.f64 [r26+4096], {fd465, fd464}; +fma.rn.f64 fd466, fd433, fd392, fd437; +sub.f64 fd467, fd439, fd438; +st.shared.v2.f64 [r26+5120], {fd466, fd467}; +fma.rn.f64 fd468, fd442, fd396, fd445; +sub.f64 fd469, fd447, fd446; +st.shared.v2.f64 [r26+6144], {fd468, fd469}; +fma.rn.f64 fd470, fd450, fd400, fd453; +sub.f64 fd471, fd455, fd454; +st.shared.v2.f64 [r26+7168], {fd470, fd471}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+4096]; +ld.shared.v2.f64 {fd480, fd481}, [r27+8192]; +ld.shared.v2.f64 {fd484, fd485}, [r27+12288]; +ld.shared.v2.f64 {fd488, fd489}, [r27+16384]; +ld.shared.v2.f64 {fd492, fd493}, [r27+20480]; +ld.shared.v2.f64 {fd496, fd497}, [r27+24576]; +ld.shared.v2.f64 {fd500, fd501}, [r27+28672]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; +add.f64 fd512, fd476, fd492; +add.f64 fd513, fd477, fd493; +sub.f64 fd514, fd476, fd492; +sub.f64 fd515, fd477, fd493; +add.f64 fd516, fd484, fd500; +add.f64 fd517, fd485, fd501; +sub.f64 fd518, fd484, fd500; +sub.f64 fd519, fd485, fd501; +add.f64 %1, 
fd505, fd509; +add.f64 %0, fd504, fd508; +add.f64 %3, fd513, fd517; +add.f64 %2, fd512, fd516; +add.f64 %5, fd507, fd510; +sub.f64 %4, fd506, fd511; +add.f64 %7, fd515, fd518; +sub.f64 %6, fd514, fd519; +sub.f64 %9, fd505, fd509; +sub.f64 %8, fd504, fd508; +sub.f64 %11, fd513, fd517; +sub.f64 %10, fd512, fd516; +sub.f64 %13, fd507, fd510; +add.f64 %12, fd506, fd511; +sub.f64 %15, fd515, fd518; +add.f64 %14, fd514, fd519; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_2048), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<659, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<285>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, 
[rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+8192]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16352; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+4096]; +ld.shared.f64 fd63, [r13+8192]; +ld.shared.f64 fd64, [r13+12288]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+4096]; +ld.shared.f64 fd67, [r13+8192]; +ld.shared.f64 fd68, [r13+12288]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; 
+mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+2048]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 16256; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+4096]; +ld.shared.f64 fd115, [r21+8192]; +ld.shared.f64 fd116, [r21+12288]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+4096]; +ld.shared.f64 fd119, [r21+8192]; +ld.shared.f64 fd120, [r21+12288]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 496; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; 
+mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+512]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 15872; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+4096]; +ld.shared.f64 fd167, [r27+8192]; +ld.shared.f64 fd168, [r27+12288]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+4096]; +ld.shared.f64 fd171, [r27+8192]; +ld.shared.f64 fd172, [r27+12288]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd180; +add.f64 fd186, fd176, fd179; +add.f64 fd187, fd175, fd180; +sub.f64 fd188, fd176, fd179; +and.b32 r28, r5, 448; +bfe.u32 r29, r5, 6, 3; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd186, fd190; +fma.rn.f64 fd194, fd189, fd185, fd193; +mul.f64 fd195, fd185, fd190; +mul.f64 
fd196, fd189, fd186; +sub.f64 fd197, fd196, fd195; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd184, fd202; +fma.rn.f64 fd204, fd200, fd183, fd203; +mul.f64 fd205, fd183, fd202; +mul.f64 fd206, fd200, fd184; +sub.f64 fd207, fd206, fd205; +ld.global.v2.f64 {fd208, fd209}, [rd14+128]; +mul.f64 fd212, fd188, fd209; +fma.rn.f64 fd213, fd208, fd187, fd212; +mul.f64 fd214, fd187, fd209; +mul.f64 fd215, fd208, fd188; +sub.f64 fd216, fd215, fd214; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 14336; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd194; +st.shared.f64 [r33+1024], fd204; +st.shared.f64 [r33+1536], fd213; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+4096]; +ld.shared.f64 fd219, [r34+8192]; +ld.shared.f64 fd220, [r34+12288]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+4096]; +ld.shared.f64 fd223, [r34+8192]; +ld.shared.f64 fd224, [r34+12288]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; +add.f64 fd233, fd225, fd229; +add.f64 fd234, fd226, fd230; +sub.f64 fd235, fd225, fd229; +sub.f64 fd236, fd226, fd230; +sub.f64 fd237, fd227, fd232; +add.f64 fd238, fd228, fd231; +add.f64 fd239, fd227, fd232; +sub.f64 fd240, fd228, fd231; +and.b32 r35, r5, 256; +bfe.u32 r36, r5, 8, 1; +mul.wide.u32 rd15, r36, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd241, fd242}, [rd17]; +mul.f64 fd245, fd238, fd242; +fma.rn.f64 fd246, fd241, 
fd237, fd245; +mul.f64 fd247, fd237, fd242; +mul.f64 fd248, fd241, fd238; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd241, fd241; +mul.f64 fd251, fd242, fd242; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd242, fd241; +fma.rn.f64 fd254, fd242, fd241, fd253; +mul.f64 fd255, fd236, fd254; +fma.rn.f64 fd256, fd252, fd235, fd255; +mul.f64 fd257, fd235, fd254; +mul.f64 fd258, fd252, fd236; +sub.f64 fd259, fd258, fd257; +ld.global.v2.f64 {fd260, fd261}, [rd17+32]; +mul.f64 fd264, fd240, fd261; +fma.rn.f64 fd265, fd260, fd239, fd264; +mul.f64 fd266, fd239, fd261; +mul.f64 fd267, fd260, fd240; +sub.f64 fd268, fd267, fd266; +and.b32 r37, r16, 2040; +add.s32 r38, r10, r37; +barrier.sync 0; +and.b32 r39, r8, 8192; +add.s32 r40, r38, r39; +st.shared.f64 [r40], fd233; +st.shared.f64 [r40+2048], fd246; +st.shared.f64 [r40+4096], fd256; +st.shared.f64 [r40+6144], fd265; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.f64 fd269, [r41]; +ld.shared.f64 fd270, [r41+4096]; +ld.shared.f64 fd271, [r41+8192]; +ld.shared.f64 fd272, [r41+12288]; +barrier.sync 0; +st.shared.f64 [r40], fd234; +st.shared.f64 [r40+2048], fd249; +st.shared.f64 [r40+4096], fd259; +st.shared.f64 [r40+6144], fd268; +barrier.sync 0; +ld.shared.f64 fd273, [r41]; +ld.shared.f64 fd274, [r41+4096]; +ld.shared.f64 fd275, [r41+8192]; +ld.shared.f64 fd276, [r41+12288]; +add.f64 %0, fd269, fd271; +add.f64 %1, fd273, fd275; +add.f64 %2, fd270, fd272; +add.f64 %3, fd274, fd276; +sub.f64 %4, fd269, fd271; +sub.f64 %5, fd273, fd275; +sub.f64 %6, fd270, fd272; +sub.f64 %7, fd274, fd276; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_2048), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<660, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<41>; +.reg .f64 fd<325>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+8192]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 32704; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+8192]; +ld.shared.v2.f64 {fd69, fd70}, [r13+16384]; 
+ld.shared.v2.f64 {fd73, fd74}, [r13+24576]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 508; +bfe.u32 r15, r5, 2, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+2048]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 32512; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+8192]; +ld.shared.v2.f64 {fd129, fd130}, [r20+16384]; +ld.shared.v2.f64 {fd133, fd134}, [r20+24576]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 
fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 496; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+512]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 31744; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; +fma.rn.f64 fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+8192]; +ld.shared.v2.f64 {fd189, fd190}, [r26+16384]; +ld.shared.v2.f64 {fd193, fd194}, [r26+24576]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +sub.f64 fd207, fd199, fd204; +add.f64 fd208, fd200, fd203; +add.f64 
fd209, fd199, fd204; +sub.f64 fd210, fd200, fd203; +and.b32 r27, r5, 448; +bfe.u32 r28, r5, 6, 3; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd208, fd212; +mul.f64 fd216, fd207, fd212; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd206, fd222; +mul.f64 fd224, fd205, fd222; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+128]; +mul.f64 fd230, fd210, fd227; +mul.f64 fd231, fd209, fd227; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 28672; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd211, fd207, fd215; +sub.f64 fd236, fd217, fd216; +st.shared.v2.f64 [r32+1024], {fd235, fd236}; +fma.rn.f64 fd237, fd220, fd205, fd223; +sub.f64 fd238, fd225, fd224; +st.shared.v2.f64 [r32+2048], {fd237, fd238}; +fma.rn.f64 fd239, fd226, fd209, fd230; +sub.f64 fd240, fd232, fd231; +st.shared.v2.f64 [r32+3072], {fd239, fd240}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+8192]; +ld.shared.v2.f64 {fd249, fd250}, [r33+16384]; +ld.shared.v2.f64 {fd253, fd254}, [r33+24576]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +sub.f64 fd265, fd257, fd261; +sub.f64 fd266, fd258, fd262; +sub.f64 fd267, fd259, fd264; +add.f64 fd268, fd260, fd263; +add.f64 fd269, fd259, fd264; +sub.f64 fd270, fd260, fd263; +and.b32 r34, r5, 256; +bfe.u32 r35, r5, 8, 1; +mul.wide.u32 rd15, r35, 16; +mov.u64 rd16, 
%13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd271, fd272}, [rd17]; +mul.f64 fd275, fd268, fd272; +mul.f64 fd276, fd267, fd272; +mul.f64 fd277, fd271, fd268; +mul.f64 fd278, fd271, fd271; +mul.f64 fd279, fd272, fd272; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd272, fd271; +fma.rn.f64 fd282, fd272, fd271, fd281; +mul.f64 fd283, fd266, fd282; +mul.f64 fd284, fd265, fd282; +mul.f64 fd285, fd280, fd266; +ld.global.v2.f64 {fd286, fd287}, [rd17+32]; +mul.f64 fd290, fd270, fd287; +mul.f64 fd291, fd269, fd287; +mul.f64 fd292, fd286, fd270; +and.b32 r36, r10, 4080; +add.s32 r37, r9, r36; +barrier.sync 0; +and.b32 r38, r7, 16384; +add.s32 r39, r37, r38; +add.f64 fd293, fd258, fd262; +add.f64 fd294, fd257, fd261; +st.shared.v2.f64 [r39], {fd294, fd293}; +fma.rn.f64 fd295, fd271, fd267, fd275; +sub.f64 fd296, fd277, fd276; +st.shared.v2.f64 [r39+4096], {fd295, fd296}; +fma.rn.f64 fd297, fd280, fd265, fd283; +sub.f64 fd298, fd285, fd284; +st.shared.v2.f64 [r39+8192], {fd297, fd298}; +fma.rn.f64 fd299, fd286, fd269, fd290; +sub.f64 fd300, fd292, fd291; +st.shared.v2.f64 [r39+12288], {fd299, fd300}; +barrier.sync 0; +mad.lo.s32 r40, r34, -48, r39; +ld.shared.v2.f64 {fd301, fd302}, [r40]; +ld.shared.v2.f64 {fd305, fd306}, [r40+8192]; +ld.shared.v2.f64 {fd309, fd310}, [r40+16384]; +ld.shared.v2.f64 {fd313, fd314}, [r40+24576]; +add.f64 %1, fd302, fd310; +add.f64 %0, fd301, fd309; +add.f64 %3, fd306, fd314; +add.f64 %2, fd305, fd313; +sub.f64 %5, fd302, fd310; +sub.f64 %4, fd301, fd309; +sub.f64 %7, fd306, fd314; +sub.f64 %6, fd305, fd313; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_2048), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<661, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<77>; +.reg .f64 fd<183>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %15, %17; +add.f64 fd10, %16, %18; +sub.f64 fd11, %15, %17; +sub.f64 fd12, %16, %18; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -16384; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 16368; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 8184; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+8192]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+8192]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16352; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 8176; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+8192]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+8192]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, 
fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 16320; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 8160; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+8192]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+8192]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 7; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 16256; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 8128; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+8192]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+8192]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 1008; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, 
fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 16128; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 8064; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+8192]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+8192]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd97, fd99; +fma.rn.f64 fd103, fd98, fd96, fd102; +mul.f64 fd104, fd96, fd99; +mul.f64 fd105, fd98, fd97; +sub.f64 fd106, fd105, fd104; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 15872; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd103; +barrier.sync 0; +and.b32 r47, r11, 7936; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+8192]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+8192]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd114, fd116; +fma.rn.f64 fd120, fd115, fd113, fd119; +mul.f64 fd121, fd113, fd116; +mul.f64 fd122, fd115, fd114; +sub.f64 fd123, fd122, fd121; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 15360; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd120; +barrier.sync 0; +and.b32 r54, r11, 7680; +sub.s32 
r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+8192]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+8192]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd131, fd133; +fma.rn.f64 fd137, fd132, fd130, fd136; +mul.f64 fd138, fd130, fd133; +mul.f64 fd139, fd132, fd131; +sub.f64 fd140, fd139, fd138; +and.b32 r57, r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 14336; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd137; +barrier.sync 0; +and.b32 r61, r11, 7168; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+8192]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+8192]; +add.f64 fd145, fd141, fd142; +add.f64 fd146, fd143, fd144; +sub.f64 fd147, fd141, fd142; +sub.f64 fd148, fd143, fd144; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd149, fd150}, [rd29]; +mul.f64 fd153, fd148, fd150; +fma.rn.f64 fd154, fd149, fd147, fd153; +mul.f64 fd155, fd147, fd150; +mul.f64 fd156, fd149, fd148; +sub.f64 fd157, fd156, fd155; +and.b32 r64, r11, 2040; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 12288; +add.s32 r67, r65, r66; +st.shared.f64 [r67], fd145; +st.shared.f64 [r67+2048], fd154; +barrier.sync 0; +and.b32 r68, r11, 6144; +sub.s32 r69, r67, r68; +ld.shared.f64 fd158, [r69]; +ld.shared.f64 fd159, [r69+8192]; +barrier.sync 0; +st.shared.f64 [r67], fd146; +st.shared.f64 [r67+2048], fd157; +barrier.sync 0; +ld.shared.f64 fd160, [r69]; +ld.shared.f64 
fd161, [r69+8192]; +add.f64 fd162, fd158, fd159; +add.f64 fd163, fd160, fd161; +sub.f64 fd164, fd158, fd159; +sub.f64 fd165, fd160, fd161; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 16; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f64 {fd166, fd167}, [rd32]; +mul.f64 fd170, fd165, fd167; +fma.rn.f64 fd171, fd166, fd164, fd170; +mul.f64 fd172, fd164, fd167; +mul.f64 fd173, fd166, fd165; +sub.f64 fd174, fd173, fd172; +and.b32 r71, r11, 4088; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 8192; +add.s32 r74, r72, r73; +st.shared.f64 [r74], fd162; +st.shared.f64 [r74+4096], fd171; +barrier.sync 0; +and.b32 r75, r11, 4096; +sub.s32 r76, r74, r75; +ld.shared.f64 fd175, [r76]; +ld.shared.f64 fd176, [r76+8192]; +barrier.sync 0; +st.shared.f64 [r74], fd163; +st.shared.f64 [r74+4096], fd174; +barrier.sync 0; +ld.shared.f64 fd177, [r76]; +ld.shared.f64 fd178, [r76+8192]; +add.f64 %0, fd175, fd176; +add.f64 %1, fd177, fd178; +sub.f64 %2, fd175, fd176; +sub.f64 %3, fd177, fd178; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_2048), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<662, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<77>; +.reg .f64 fd<223>; +.reg .b64 rd<33>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %15, %17; +sub.f64 fd10, %16, %18; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -32768; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; 
+mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 32736; +add.s32 r11, r8, r10; +add.f64 fd18, %16, %18; +add.f64 fd19, %15, %17; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 16368; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+16384]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 9; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32704; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 16352; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+16384]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 8; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 32640; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 16320; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+16384]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 7; +mul.wide.u32 rd12, r28, 16; 
+mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 32512; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 16256; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+16384]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 1008; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 32256; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; +sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 16128; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+16384]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 5; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd115, fd117; +mul.f64 fd121, fd114, fd117; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 31744; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd116, fd114, fd120; +sub.f64 fd126, fd122, fd121; +st.shared.v2.f64 [r46+512], {fd125, fd126}; 
+barrier.sync 0; +and.b32 r47, r9, 15872; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+16384]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 4; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd136, fd138; +mul.f64 fd142, fd135, fd138; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 30720; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd137, fd135, fd141; +sub.f64 fd147, fd143, fd142; +st.shared.v2.f64 [r53+1024], {fd146, fd147}; +barrier.sync 0; +and.b32 r54, r9, 15360; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+16384]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 3; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd157, fd159; +mul.f64 fd163, fd156, fd159; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 28672; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd158, fd156, fd162; +sub.f64 fd168, fd164, fd163; +st.shared.v2.f64 [r60+2048], {fd167, fd168}; +barrier.sync 0; +and.b32 r61, r9, 14336; +sub.s32 r62, r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, fd174}, [r62+16384]; +sub.f64 fd177, fd169, fd173; +sub.f64 fd178, fd170, fd174; +bfe.u32 r63, r5, 8, 2; +mul.wide.u32 rd27, r63, 16; +mov.u64 rd28, %13; +add.s64 rd29, rd28, rd27; +ld.global.v2.f64 {fd179, fd180}, [rd29]; +mul.f64 fd183, fd178, fd180; +mul.f64 fd184, fd177, fd180; +mul.f64 fd185, fd179, fd178; 
+and.b32 r64, r9, 4080; +add.s32 r65, r8, r64; +barrier.sync 0; +and.b32 r66, r6, 24576; +add.s32 r67, r65, r66; +add.f64 fd186, fd170, fd174; +add.f64 fd187, fd169, fd173; +st.shared.v2.f64 [r67], {fd187, fd186}; +fma.rn.f64 fd188, fd179, fd177, fd183; +sub.f64 fd189, fd185, fd184; +st.shared.v2.f64 [r67+4096], {fd188, fd189}; +barrier.sync 0; +and.b32 r68, r9, 12288; +sub.s32 r69, r67, r68; +ld.shared.v2.f64 {fd190, fd191}, [r69]; +ld.shared.v2.f64 {fd194, fd195}, [r69+16384]; +sub.f64 fd198, fd190, fd194; +sub.f64 fd199, fd191, fd195; +bfe.u32 r70, r5, 9, 1; +mul.wide.u32 rd30, r70, 16; +mov.u64 rd31, %14; +add.s64 rd32, rd31, rd30; +ld.global.v2.f64 {fd200, fd201}, [rd32]; +mul.f64 fd204, fd199, fd201; +mul.f64 fd205, fd198, fd201; +mul.f64 fd206, fd200, fd199; +and.b32 r71, r9, 8176; +add.s32 r72, r8, r71; +barrier.sync 0; +and.b32 r73, r6, 16384; +add.s32 r74, r72, r73; +add.f64 fd207, fd191, fd195; +add.f64 fd208, fd190, fd194; +st.shared.v2.f64 [r74], {fd208, fd207}; +fma.rn.f64 fd209, fd200, fd198, fd204; +sub.f64 fd210, fd206, fd205; +st.shared.v2.f64 [r74+8192], {fd209, fd210}; +barrier.sync 0; +and.b32 r75, r9, 8192; +sub.s32 r76, r74, r75; +ld.shared.v2.f64 {fd211, fd212}, [r76]; +ld.shared.v2.f64 {fd215, fd216}, [r76+16384]; +add.f64 %1, fd212, fd216; +add.f64 %0, fd211, fd215; +sub.f64 %3, fd212, fd216; +sub.f64 %2, fd211, fd215; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_2048), "l"(lut_dp_2_1024), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_fwd.hpp.inc new file mode 
100644 index 0000000000000..34692db2ca02b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_fwd.hpp.inc @@ -0,0 +1,1993 @@ +#ifndef CUFFTDX_FFT_20_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_20_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<752, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<107>; +.reg .b32 r<1747>; +.reg .f64 fd<99>; +.reg .b64 rd<2>; +mov.f64 fd67, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd67; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd77, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd77; +} +mov.b32 r228, {rs2, rs2}; +mov.f64 fd75, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd75; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd76, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd76; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd67; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd77; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %48, %72; +} +{ +add.f16x2 r4, %40, r1; +} +{ +add.f16x2 r7, %56, %64; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %49, %73; +} +{ +add.f16x2 r16, %41, r13; +} +{ +add.f16x2 r19, %57, %65; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %48, %72; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %40, r28; +} +{ +add.f16x2 r34, %56, %64; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %49, %73; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %57, %65; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %48, %72; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %40, r64; +} +{ +add.f16x2 r70, %56, %64; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %49, %73; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %57, %65; +} +{ 
+mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %48, %72; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %40, r100; +} +{ +add.f16x2 r106, %56, %64; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %49, %73; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %57, %65; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %48, %72; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %40, r136; +} +{ +add.f16x2 r142, %56, %64; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %49, %73; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %57, %65; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %49, %73; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %41, r172; +} +{ +add.f16x2 r178, %57, %65; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %48, %72; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %56, %64; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %49, %73; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %41, r208; +} +{ +add.f16x2 r214, %57, %65; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %48, %72; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %56, %64; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %49, %73; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %41, r244; +} +{ +add.f16x2 r250, %57, %65; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %48, %72; +} +{ +mul.f16x2 r262, r259, 
r300; +} +{ +sub.f16x2 r265, %56, %64; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %49, %73; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %41, r280; +} +{ +add.f16x2 r286, %57, %65; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %48, %72; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %56, %64; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs9, fd67; +} +mov.b32 r522, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd77; +} +mov.b32 r540, {rs10, rs10}; +{ +cvt.rn.f16.f64 rs11, fd75; +} +mov.b32 r594, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd76; +} +mov.b32 r612, {rs12, rs12}; +{ +cvt.rn.f16.f64 rs13, fd67; +} +mov.b32 r603, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd77; +} +{ +neg.f16 rs15, rs14; +} +mov.b32 r618, {rs15, rs15}; +{ +add.f16x2 r313, %52, %76; +} +{ +add.f16x2 r316, %44, r313; +} +{ +add.f16x2 r319, %60, %68; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %53, %77; +} +{ +add.f16x2 r328, %45, r325; +} +{ +add.f16x2 r331, %61, %69; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %52, %76; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %44, r340; +} +{ +add.f16x2 r346, %60, %68; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %53, %77; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %61, %69; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %52, %76; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %44, r376; +} +{ +add.f16x2 r382, %60, %68; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %53, %77; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %61, %69; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 
r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %52, %76; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %44, r412; +} +{ +add.f16x2 r418, %60, %68; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %53, %77; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %61, %69; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %52, %76; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %44, r448; +} +{ +add.f16x2 r454, %60, %68; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %53, %77; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %61, %69; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %53, %77; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %45, r484; +} +{ +add.f16x2 r490, %61, %69; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %52, %76; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %60, %68; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %53, %77; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %45, r520; +} +{ +add.f16x2 r526, %61, %69; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %52, %76; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %60, %68; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %53, %77; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %45, r556; +} +{ +add.f16x2 r562, %61, %69; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %52, %76; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %60, 
%68; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %53, %77; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %45, r592; +} +{ +add.f16x2 r598, %61, %69; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %52, %76; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %60, %68; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +mov.f64 fd63, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs17, fd63; +} +{ +cvt.rn.f16.f64 rs18, fd76; +} +{ +cvt.rn.f16.f64 rs19, fd67; +} +{ +cvt.rn.f16.f64 rs20, fd77; +} +mov.f64 fd78, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs21, fd78; +} +{ +cvt.rn.f16.f64 rs22, fd77; +} +{ +cvt.rn.f16.f64 rs23, fd75; +} +{ +cvt.rn.f16.f64 rs24, fd76; +} +mov.f64 fd65, 0d3FE2CF2304755A5E; +mov.f64 fd61, 0d3FEE6F0E134454FF; +mov.b32 r639, {rs17, rs17}; +{ +mul.f16x2 r625, r370, r639; +} +mov.b32 r636, {rs18, rs18}; +{ +mul.f16x2 r628, r514, r636; +} +{ +sub.f16x2 r631, r625, r628; +} +{ +mul.f16x2 r634, r370, r636; +} +{ +fma.rn.f16x2 r637, r514, r639, r634; +} +mov.b32 r655, {rs19, rs19}; +{ +mul.f16x2 r641, r442, r655; +} +mov.b32 r652, {rs20, rs20}; +{ +mul.f16x2 r644, r586, r652; +} +{ +sub.f16x2 r647, r641, r644; +} +{ +mul.f16x2 r650, r442, r652; +} +{ +fma.rn.f16x2 r653, r586, r655, r650; +} +mov.b32 r671, {rs21, rs21}; +{ +mul.f16x2 r657, r478, r671; +} +mov.b32 r668, {rs22, rs22}; +{ +mul.f16x2 r660, r622, r668; +} +{ +sub.f16x2 r663, r657, r660; +} +{ +mul.f16x2 r666, r478, r668; +} +{ +fma.rn.f16x2 r669, r622, r671, r666; +} +mov.b32 r687, {rs23, rs23}; +{ +mul.f16x2 r673, r406, r687; +} +mov.b32 r684, {rs24, rs24}; +{ +mul.f16x2 r676, r550, r684; +} +{ +sub.f16x2 r679, r673, r676; +} +{ +mul.f16x2 r682, r406, r684; +} +{ +fma.rn.f16x2 r685, r550, r687, r682; +} +{ +add.f16x2 r689, r10, r322; +} +{ +add.f16x2 r692, r22, r334; +} +{ +sub.f16x2 r695, r10, 
r322; +} +{ +sub.f16x2 r698, r22, r334; +} +{ +add.f16x2 r701, r58, r631; +} +{ +add.f16x2 r704, r202, r637; +} +{ +sub.f16x2 r707, r58, r631; +} +{ +sub.f16x2 r710, r202, r637; +} +{ +add.f16x2 r713, r130, r647; +} +{ +add.f16x2 r716, r274, r653; +} +{ +sub.f16x2 r719, r130, r647; +} +{ +sub.f16x2 r722, r274, r653; +} +{ +add.f16x2 r725, r166, r663; +} +{ +add.f16x2 r728, r310, r669; +} +{ +sub.f16x2 r731, r166, r663; +} +{ +sub.f16x2 r734, r310, r669; +} +{ +add.f16x2 r737, r94, r679; +} +{ +add.f16x2 r740, r238, r685; +} +{ +sub.f16x2 r743, r94, r679; +} +{ +sub.f16x2 r746, r238, r685; +} +{ +cvt.rn.f16.f64 rs35, fd67; +} +mov.b32 r958, {rs35, rs35}; +{ +cvt.rn.f16.f64 rs36, fd77; +} +mov.b32 r976, {rs36, rs36}; +{ +cvt.rn.f16.f64 rs37, fd75; +} +mov.b32 r1030, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd76; +} +mov.b32 r1048, {rs38, rs38}; +{ +cvt.rn.f16.f64 rs39, fd67; +} +mov.b32 r1039, {rs39, rs39}; +{ +cvt.rn.f16.f64 rs40, fd77; +} +{ +neg.f16 rs41, rs40; +} +mov.b32 r1054, {rs41, rs41}; +{ +add.f16x2 r749, %50, %74; +} +{ +add.f16x2 r752, %42, r749; +} +{ +add.f16x2 r755, %58, %66; +} +{ +add.f16x2 r758, r752, r755; +} +{ +add.f16x2 r761, %51, %75; +} +{ +add.f16x2 r764, %43, r761; +} +{ +add.f16x2 r767, %59, %67; +} +{ +add.f16x2 r770, r764, r767; +} +{ +add.f16x2 r773, %50, %74; +} +{ +mul.f16x2 r776, r773, r958; +} +{ +add.f16x2 r779, %42, r776; +} +{ +add.f16x2 r782, %58, %66; +} +{ +mul.f16x2 r785, r782, r1030; +} +{ +add.f16x2 r788, r779, r785; +} +{ +sub.f16x2 r791, %51, %75; +} +{ +mul.f16x2 r794, r791, r976; +} +{ +sub.f16x2 r797, %59, %67; +} +{ +mul.f16x2 r800, r797, r1048; +} +{ +add.f16x2 r803, r794, r800; +} +{ +sub.f16x2 r806, r788, r803; +} +{ +add.f16x2 r809, %50, %74; +} +{ +mul.f16x2 r812, r809, r958; +} +{ +add.f16x2 r815, %42, r812; +} +{ +add.f16x2 r818, %58, %66; +} +{ +mul.f16x2 r821, r818, r1030; +} +{ +add.f16x2 r824, r815, r821; +} +{ +sub.f16x2 r827, %51, %75; +} +{ +mul.f16x2 r830, r827, r976; +} +{ +sub.f16x2 r833, %59, %67; +} 
+{ +mul.f16x2 r836, r833, r1048; +} +{ +add.f16x2 r839, r830, r836; +} +{ +add.f16x2 r842, r824, r839; +} +{ +add.f16x2 r845, %50, %74; +} +{ +mul.f16x2 r848, r845, r1030; +} +{ +add.f16x2 r851, %42, r848; +} +{ +add.f16x2 r854, %58, %66; +} +{ +mul.f16x2 r857, r854, r1039; +} +{ +add.f16x2 r860, r851, r857; +} +{ +sub.f16x2 r863, %51, %75; +} +{ +mul.f16x2 r866, r863, r1048; +} +{ +sub.f16x2 r869, %59, %67; +} +{ +mul.f16x2 r872, r869, r1054; +} +{ +add.f16x2 r875, r866, r872; +} +{ +sub.f16x2 r878, r860, r875; +} +{ +add.f16x2 r881, %50, %74; +} +{ +mul.f16x2 r884, r881, r1030; +} +{ +add.f16x2 r887, %42, r884; +} +{ +add.f16x2 r890, %58, %66; +} +{ +mul.f16x2 r893, r890, r1039; +} +{ +add.f16x2 r896, r887, r893; +} +{ +sub.f16x2 r899, %51, %75; +} +{ +mul.f16x2 r902, r899, r1048; +} +{ +sub.f16x2 r905, %59, %67; +} +{ +mul.f16x2 r908, r905, r1054; +} +{ +add.f16x2 r911, r902, r908; +} +{ +add.f16x2 r914, r896, r911; +} +{ +add.f16x2 r917, %51, %75; +} +{ +mul.f16x2 r920, r917, r958; +} +{ +add.f16x2 r923, %43, r920; +} +{ +add.f16x2 r926, %59, %67; +} +{ +mul.f16x2 r929, r926, r1030; +} +{ +add.f16x2 r932, r923, r929; +} +{ +sub.f16x2 r935, %50, %74; +} +{ +mul.f16x2 r938, r935, r976; +} +{ +sub.f16x2 r941, %58, %66; +} +{ +mul.f16x2 r944, r941, r1048; +} +{ +add.f16x2 r947, r938, r944; +} +{ +add.f16x2 r950, r932, r947; +} +{ +add.f16x2 r953, %51, %75; +} +{ +mul.f16x2 r956, r953, r958; +} +{ +add.f16x2 r959, %43, r956; +} +{ +add.f16x2 r962, %59, %67; +} +{ +mul.f16x2 r965, r962, r1030; +} +{ +add.f16x2 r968, r959, r965; +} +{ +sub.f16x2 r971, %50, %74; +} +{ +mul.f16x2 r974, r971, r976; +} +{ +sub.f16x2 r977, %58, %66; +} +{ +mul.f16x2 r980, r977, r1048; +} +{ +add.f16x2 r983, r974, r980; +} +{ +sub.f16x2 r986, r968, r983; +} +{ +add.f16x2 r989, %51, %75; +} +{ +mul.f16x2 r992, r989, r1030; +} +{ +add.f16x2 r995, %43, r992; +} +{ +add.f16x2 r998, %59, %67; +} +{ +mul.f16x2 r1001, r998, r1039; +} +{ +add.f16x2 r1004, r995, r1001; +} +{ +sub.f16x2 r1007, %50, 
%74; +} +{ +mul.f16x2 r1010, r1007, r1048; +} +{ +sub.f16x2 r1013, %58, %66; +} +{ +mul.f16x2 r1016, r1013, r1054; +} +{ +add.f16x2 r1019, r1010, r1016; +} +{ +add.f16x2 r1022, r1004, r1019; +} +{ +add.f16x2 r1025, %51, %75; +} +{ +mul.f16x2 r1028, r1025, r1030; +} +{ +add.f16x2 r1031, %43, r1028; +} +{ +add.f16x2 r1034, %59, %67; +} +{ +mul.f16x2 r1037, r1034, r1039; +} +{ +add.f16x2 r1040, r1031, r1037; +} +{ +sub.f16x2 r1043, %50, %74; +} +{ +mul.f16x2 r1046, r1043, r1048; +} +{ +sub.f16x2 r1049, %58, %66; +} +{ +mul.f16x2 r1052, r1049, r1054; +} +{ +add.f16x2 r1055, r1046, r1052; +} +{ +sub.f16x2 r1058, r1040, r1055; +} +{ +cvt.rn.f16.f64 rs43, fd67; +} +mov.b32 r1270, {rs43, rs43}; +{ +cvt.rn.f16.f64 rs44, fd77; +} +mov.b32 r1288, {rs44, rs44}; +{ +cvt.rn.f16.f64 rs45, fd75; +} +mov.b32 r1342, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs46, fd76; +} +mov.b32 r1360, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd67; +} +mov.b32 r1351, {rs47, rs47}; +{ +cvt.rn.f16.f64 rs48, fd77; +} +{ +neg.f16 rs49, rs48; +} +mov.b32 r1366, {rs49, rs49}; +{ +add.f16x2 r1061, %54, %78; +} +{ +add.f16x2 r1064, %46, r1061; +} +{ +add.f16x2 r1067, %62, %70; +} +{ +add.f16x2 r1070, r1064, r1067; +} +{ +add.f16x2 r1073, %55, %79; +} +{ +add.f16x2 r1076, %47, r1073; +} +{ +add.f16x2 r1079, %63, %71; +} +{ +add.f16x2 r1082, r1076, r1079; +} +{ +add.f16x2 r1085, %54, %78; +} +{ +mul.f16x2 r1088, r1085, r1270; +} +{ +add.f16x2 r1091, %46, r1088; +} +{ +add.f16x2 r1094, %62, %70; +} +{ +mul.f16x2 r1097, r1094, r1342; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +sub.f16x2 r1103, %55, %79; +} +{ +mul.f16x2 r1106, r1103, r1288; +} +{ +sub.f16x2 r1109, %63, %71; +} +{ +mul.f16x2 r1112, r1109, r1360; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +sub.f16x2 r1118, r1100, r1115; +} +{ +add.f16x2 r1121, %54, %78; +} +{ +mul.f16x2 r1124, r1121, r1270; +} +{ +add.f16x2 r1127, %46, r1124; +} +{ +add.f16x2 r1130, %62, %70; +} +{ +mul.f16x2 r1133, r1130, r1342; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +sub.f16x2 
r1139, %55, %79; +} +{ +mul.f16x2 r1142, r1139, r1288; +} +{ +sub.f16x2 r1145, %63, %71; +} +{ +mul.f16x2 r1148, r1145, r1360; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1136, r1151; +} +{ +add.f16x2 r1157, %54, %78; +} +{ +mul.f16x2 r1160, r1157, r1342; +} +{ +add.f16x2 r1163, %46, r1160; +} +{ +add.f16x2 r1166, %62, %70; +} +{ +mul.f16x2 r1169, r1166, r1351; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, %55, %79; +} +{ +mul.f16x2 r1178, r1175, r1360; +} +{ +sub.f16x2 r1181, %63, %71; +} +{ +mul.f16x2 r1184, r1181, r1366; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, %54, %78; +} +{ +mul.f16x2 r1196, r1193, r1342; +} +{ +add.f16x2 r1199, %46, r1196; +} +{ +add.f16x2 r1202, %62, %70; +} +{ +mul.f16x2 r1205, r1202, r1351; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, %55, %79; +} +{ +mul.f16x2 r1214, r1211, r1360; +} +{ +sub.f16x2 r1217, %63, %71; +} +{ +mul.f16x2 r1220, r1217, r1366; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, %55, %79; +} +{ +mul.f16x2 r1232, r1229, r1270; +} +{ +add.f16x2 r1235, %47, r1232; +} +{ +add.f16x2 r1238, %63, %71; +} +{ +mul.f16x2 r1241, r1238, r1342; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, %54, %78; +} +{ +mul.f16x2 r1250, r1247, r1288; +} +{ +sub.f16x2 r1253, %62, %70; +} +{ +mul.f16x2 r1256, r1253, r1360; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, %55, %79; +} +{ +mul.f16x2 r1268, r1265, r1270; +} +{ +add.f16x2 r1271, %47, r1268; +} +{ +add.f16x2 r1274, %63, %71; +} +{ +mul.f16x2 r1277, r1274, r1342; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, %54, %78; +} +{ +mul.f16x2 r1286, r1283, r1288; +} +{ +sub.f16x2 r1289, %62, %70; +} +{ +mul.f16x2 r1292, r1289, r1360; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +sub.f16x2 r1298, r1280, r1295; +} +{ +add.f16x2 r1301, %55, %79; +} +{ 
+mul.f16x2 r1304, r1301, r1342; +} +{ +add.f16x2 r1307, %47, r1304; +} +{ +add.f16x2 r1310, %63, %71; +} +{ +mul.f16x2 r1313, r1310, r1351; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, %54, %78; +} +{ +mul.f16x2 r1322, r1319, r1360; +} +{ +sub.f16x2 r1325, %62, %70; +} +{ +mul.f16x2 r1328, r1325, r1366; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, %55, %79; +} +{ +mul.f16x2 r1340, r1337, r1342; +} +{ +add.f16x2 r1343, %47, r1340; +} +{ +add.f16x2 r1346, %63, %71; +} +{ +mul.f16x2 r1349, r1346, r1351; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, %54, %78; +} +{ +mul.f16x2 r1358, r1355, r1360; +} +{ +sub.f16x2 r1361, %62, %70; +} +{ +mul.f16x2 r1364, r1361, r1366; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +cvt.rn.f16.f64 rs51, fd63; +} +{ +cvt.rn.f16.f64 rs52, fd76; +} +{ +cvt.rn.f16.f64 rs53, fd67; +} +{ +cvt.rn.f16.f64 rs54, fd77; +} +{ +cvt.rn.f16.f64 rs55, fd78; +} +{ +cvt.rn.f16.f64 rs56, fd77; +} +{ +cvt.rn.f16.f64 rs57, fd75; +} +{ +cvt.rn.f16.f64 rs58, fd76; +} +mov.b32 r1387, {rs51, rs51}; +{ +mul.f16x2 r1373, r1118, r1387; +} +mov.b32 r1384, {rs52, rs52}; +{ +mul.f16x2 r1376, r1262, r1384; +} +{ +sub.f16x2 r1379, r1373, r1376; +} +{ +mul.f16x2 r1382, r1118, r1384; +} +{ +fma.rn.f16x2 r1385, r1262, r1387, r1382; +} +mov.b32 r1403, {rs53, rs53}; +{ +mul.f16x2 r1389, r1190, r1403; +} +mov.b32 r1400, {rs54, rs54}; +{ +mul.f16x2 r1392, r1334, r1400; +} +{ +sub.f16x2 r1395, r1389, r1392; +} +{ +mul.f16x2 r1398, r1190, r1400; +} +{ +fma.rn.f16x2 r1401, r1334, r1403, r1398; +} +mov.b32 r1419, {rs55, rs55}; +{ +mul.f16x2 r1405, r1226, r1419; +} +mov.b32 r1416, {rs56, rs56}; +{ +mul.f16x2 r1408, r1370, r1416; +} +{ +sub.f16x2 r1411, r1405, r1408; +} +{ +mul.f16x2 r1414, r1226, r1416; +} +{ +fma.rn.f16x2 r1417, r1370, r1419, r1414; +} +mov.b32 r1435, {rs57, rs57}; +{ +mul.f16x2 r1421, r1154, r1435; +} +mov.b32 r1432, {rs58, rs58}; +{ 
+mul.f16x2 r1424, r1298, r1432; +} +{ +sub.f16x2 r1427, r1421, r1424; +} +{ +mul.f16x2 r1430, r1154, r1432; +} +{ +fma.rn.f16x2 r1433, r1298, r1435, r1430; +} +{ +add.f16x2 r1437, r758, r1070; +} +{ +add.f16x2 r1440, r770, r1082; +} +{ +sub.f16x2 r1443, r758, r1070; +} +{ +sub.f16x2 r1446, r770, r1082; +} +{ +add.f16x2 r1449, r806, r1379; +} +{ +add.f16x2 r1452, r950, r1385; +} +{ +sub.f16x2 r1455, r806, r1379; +} +{ +sub.f16x2 r1458, r950, r1385; +} +{ +add.f16x2 r1461, r878, r1395; +} +{ +add.f16x2 r1464, r1022, r1401; +} +{ +sub.f16x2 r1467, r878, r1395; +} +{ +sub.f16x2 r1470, r1022, r1401; +} +{ +add.f16x2 r1473, r914, r1411; +} +{ +add.f16x2 r1476, r1058, r1417; +} +{ +sub.f16x2 r1479, r914, r1411; +} +{ +sub.f16x2 r1482, r1058, r1417; +} +{ +add.f16x2 r1485, r842, r1427; +} +{ +add.f16x2 r1488, r986, r1433; +} +{ +sub.f16x2 r1491, r842, r1427; +} +{ +sub.f16x2 r1494, r986, r1433; +} +{ +cvt.rn.f16.f64 rs69, fd61; +} +{ +cvt.rn.f16.f64 rs70, fd78; +} +{ +cvt.rn.f16.f64 rs71, fd63; +} +{ +cvt.rn.f16.f64 rs72, fd76; +} +{ +cvt.rn.f16.f64 rs73, fd65; +} +{ +cvt.rn.f16.f64 rs74, fd75; +} +{ +cvt.rn.f16.f64 rs75, fd67; +} +{ +cvt.rn.f16.f64 rs76, fd77; +} +{ +cvt.rn.f16.f64 rs79, fd78; +} +{ +cvt.rn.f16.f64 rs80, fd77; +} +{ +cvt.rn.f16.f64 rs81, fd76; +} +{ +cvt.rn.f16.f64 rs82, fd75; +} +{ +cvt.rn.f16.f64 rs83, fd75; +} +{ +cvt.rn.f16.f64 rs84, fd76; +} +{ +cvt.rn.f16.f64 rs85, fd77; +} +{ +cvt.rn.f16.f64 rs86, fd78; +} +mov.b32 r1511, {rs69, rs69}; +{ +mul.f16x2 r1497, r1449, r1511; +} +mov.b32 r1508, {rs70, rs70}; +{ +mul.f16x2 r1500, r1452, r1508; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1449, r1508; +} +{ +fma.rn.f16x2 r1509, r1452, r1511, r1506; +} +mov.b32 r1527, {rs71, rs71}; +{ +mul.f16x2 r1513, r1461, r1527; +} +mov.b32 r1524, {rs72, rs72}; +{ +mul.f16x2 r1516, r1464, r1524; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1461, r1524; +} +{ +fma.rn.f16x2 r1525, r1464, r1527, r1522; +} +mov.b32 r1543, {rs73, rs73}; +{ 
+mul.f16x2 r1529, r1473, r1543; +} +mov.b32 r1540, {rs74, rs74}; +{ +mul.f16x2 r1532, r1476, r1540; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1473, r1540; +} +{ +fma.rn.f16x2 r1541, r1476, r1543, r1538; +} +mov.b32 r1559, {rs75, rs75}; +{ +mul.f16x2 r1545, r1485, r1559; +} +mov.b32 r1556, {rs76, rs76}; +{ +mul.f16x2 r1548, r1488, r1556; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1485, r1556; +} +{ +fma.rn.f16x2 r1557, r1488, r1559, r1554; +} +{ +neg.f16x2 r1561, r1443; +} +mov.b32 r1577, {rs79, rs79}; +{ +mul.f16x2 r1563, r1455, r1577; +} +mov.b32 r1574, {rs80, rs80}; +{ +mul.f16x2 r1566, r1458, r1574; +} +{ +sub.f16x2 r1569, r1563, r1566; +} +{ +mul.f16x2 r1572, r1455, r1574; +} +{ +fma.rn.f16x2 r1575, r1458, r1577, r1572; +} +mov.b32 r1593, {rs81, rs81}; +{ +mul.f16x2 r1579, r1467, r1593; +} +mov.b32 r1590, {rs82, rs82}; +{ +mul.f16x2 r1582, r1470, r1590; +} +{ +sub.f16x2 r1585, r1579, r1582; +} +{ +mul.f16x2 r1588, r1467, r1590; +} +{ +fma.rn.f16x2 r1591, r1470, r1593, r1588; +} +mov.b32 r1609, {rs83, rs83}; +{ +mul.f16x2 r1595, r1479, r1609; +} +mov.b32 r1606, {rs84, rs84}; +{ +mul.f16x2 r1598, r1482, r1606; +} +{ +sub.f16x2 r1601, r1595, r1598; +} +{ +mul.f16x2 r1604, r1479, r1606; +} +{ +fma.rn.f16x2 r1607, r1482, r1609, r1604; +} +mov.b32 r1625, {rs85, rs85}; +{ +mul.f16x2 r1611, r1491, r1625; +} +mov.b32 r1622, {rs86, rs86}; +{ +mul.f16x2 r1614, r1494, r1622; +} +{ +sub.f16x2 r1617, r1611, r1614; +} +{ +mul.f16x2 r1620, r1491, r1622; +} +{ +fma.rn.f16x2 r1623, r1494, r1625, r1620; +} +{ +add.f16x2 %0, r689, r1437; +} +{ +add.f16x2 %1, r692, r1440; +} +{ +sub.f16x2 %20, r689, r1437; +} +{ +sub.f16x2 %21, r692, r1440; +} +{ +add.f16x2 %2, r701, r1503; +} +{ +add.f16x2 %3, r704, r1509; +} +{ +sub.f16x2 %22, r701, r1503; +} +{ +sub.f16x2 %23, r704, r1509; +} +{ +add.f16x2 %4, r713, r1519; +} +{ +add.f16x2 %5, r716, r1525; +} +{ +sub.f16x2 %24, r713, r1519; +} +{ +sub.f16x2 %25, r716, r1525; +} +{ +add.f16x2 %6, r725, 
r1535; +} +{ +add.f16x2 %7, r728, r1541; +} +{ +sub.f16x2 %26, r725, r1535; +} +{ +sub.f16x2 %27, r728, r1541; +} +{ +add.f16x2 %8, r737, r1551; +} +{ +add.f16x2 %9, r740, r1557; +} +{ +sub.f16x2 %28, r737, r1551; +} +{ +sub.f16x2 %29, r740, r1557; +} +{ +add.f16x2 %10, r695, r1446; +} +{ +add.f16x2 %11, r698, r1561; +} +{ +sub.f16x2 %30, r695, r1446; +} +{ +sub.f16x2 %31, r698, r1561; +} +{ +add.f16x2 %12, r707, r1569; +} +{ +add.f16x2 %13, r710, r1575; +} +{ +sub.f16x2 %32, r707, r1569; +} +{ +sub.f16x2 %33, r710, r1575; +} +{ +add.f16x2 %14, r719, r1585; +} +{ +add.f16x2 %15, r722, r1591; +} +{ +sub.f16x2 %34, r719, r1585; +} +{ +sub.f16x2 %35, r722, r1591; +} +{ +add.f16x2 %16, r731, r1601; +} +{ +add.f16x2 %17, r734, r1607; +} +{ +sub.f16x2 %36, r731, r1601; +} +{ +sub.f16x2 %37, r734, r1607; +} +{ +add.f16x2 %18, r743, r1617; +} +{ +add.f16x2 %19, r746, r1623; +} +{ +sub.f16x2 %38, r743, r1617; +} +{ +sub.f16x2 %39, r746, r1623; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), 
"=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..25935813a20a8 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp16_inv.hpp.inc @@ -0,0 +1,2005 @@ +#ifndef CUFFTDX_FFT_20_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_20_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<954, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<115>; +.reg .b32 r<1747>; +.reg .f64 fd<99>; +.reg .b64 rd<2>; +mov.f64 fd78, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd78; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd77, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd77; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd75, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd75; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd73, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd73; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd78; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd77; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %48, %72; +} +{ +add.f16x2 r4, %40, r1; +} +{ +add.f16x2 r7, %56, %64; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %49, %73; +} +{ +add.f16x2 r16, %41, r13; +} +{ +add.f16x2 r19, %57, %65; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %48, %72; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %40, r28; +} +{ +add.f16x2 r34, %56, %64; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %49, %73; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %57, %65; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %48, %72; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %40, r64; +} +{ +add.f16x2 r70, %56, %64; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %49, %73; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %57, %65; +} +{ +mul.f16x2 r88, r85, r300; +} +{ 
+add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %48, %72; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %40, r100; +} +{ +add.f16x2 r106, %56, %64; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %49, %73; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %57, %65; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %48, %72; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %40, r136; +} +{ +add.f16x2 r142, %56, %64; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %49, %73; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %57, %65; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %49, %73; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %41, r172; +} +{ +add.f16x2 r178, %57, %65; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %48, %72; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %56, %64; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %49, %73; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %41, r208; +} +{ +add.f16x2 r214, %57, %65; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %48, %72; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %56, %64; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %49, %73; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %41, r244; +} +{ +add.f16x2 r250, %57, %65; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %48, %72; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, 
%56, %64; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %49, %73; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %41, r280; +} +{ +add.f16x2 r286, %57, %65; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %48, %72; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %56, %64; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs11, fd78; +} +mov.b32 r522, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd77; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r540, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs15, fd75; +} +mov.b32 r594, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd73; +} +{ +neg.f16 rs17, rs16; +} +mov.b32 r612, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs19, fd78; +} +mov.b32 r603, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd77; +} +mov.b32 r618, {rs20, rs20}; +{ +add.f16x2 r313, %52, %76; +} +{ +add.f16x2 r316, %44, r313; +} +{ +add.f16x2 r319, %60, %68; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %53, %77; +} +{ +add.f16x2 r328, %45, r325; +} +{ +add.f16x2 r331, %61, %69; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %52, %76; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %44, r340; +} +{ +add.f16x2 r346, %60, %68; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %53, %77; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %61, %69; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %52, %76; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %44, r376; +} +{ +add.f16x2 r382, %60, %68; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %53, %77; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %61, %69; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 
r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %52, %76; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %44, r412; +} +{ +add.f16x2 r418, %60, %68; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %53, %77; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %61, %69; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %52, %76; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %44, r448; +} +{ +add.f16x2 r454, %60, %68; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %53, %77; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %61, %69; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %53, %77; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %45, r484; +} +{ +add.f16x2 r490, %61, %69; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %52, %76; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %60, %68; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %53, %77; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %45, r520; +} +{ +add.f16x2 r526, %61, %69; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %52, %76; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %60, %68; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %53, %77; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %45, r556; +} +{ +add.f16x2 r562, %61, %69; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %52, %76; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %60, 
%68; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %53, %77; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %45, r592; +} +{ +add.f16x2 r598, %61, %69; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %52, %76; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %60, %68; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +mov.f64 fd74, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs21, fd74; +} +mov.f64 fd76, 0d3FE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs22, fd76; +} +{ +cvt.rn.f16.f64 rs23, fd78; +} +mov.f64 fd72, 0d3FEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs24, fd72; +} +mov.f64 fd71, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs25, fd71; +} +{ +cvt.rn.f16.f64 rs26, fd72; +} +{ +cvt.rn.f16.f64 rs27, fd75; +} +{ +cvt.rn.f16.f64 rs28, fd76; +} +mov.b32 r639, {rs21, rs21}; +{ +mul.f16x2 r625, r370, r639; +} +mov.b32 r636, {rs22, rs22}; +{ +mul.f16x2 r628, r514, r636; +} +{ +sub.f16x2 r631, r625, r628; +} +{ +mul.f16x2 r634, r370, r636; +} +{ +fma.rn.f16x2 r637, r514, r639, r634; +} +mov.b32 r655, {rs23, rs23}; +{ +mul.f16x2 r641, r442, r655; +} +mov.b32 r652, {rs24, rs24}; +{ +mul.f16x2 r644, r586, r652; +} +{ +sub.f16x2 r647, r641, r644; +} +{ +mul.f16x2 r650, r442, r652; +} +{ +fma.rn.f16x2 r653, r586, r655, r650; +} +mov.b32 r671, {rs25, rs25}; +{ +mul.f16x2 r657, r478, r671; +} +mov.b32 r668, {rs26, rs26}; +{ +mul.f16x2 r660, r622, r668; +} +{ +sub.f16x2 r663, r657, r660; +} +{ +mul.f16x2 r666, r478, r668; +} +{ +fma.rn.f16x2 r669, r622, r671, r666; +} +mov.b32 r687, {rs27, rs27}; +{ +mul.f16x2 r673, r406, r687; +} +mov.b32 r684, {rs28, rs28}; +{ +mul.f16x2 r676, r550, r684; +} +{ +sub.f16x2 r679, r673, r676; +} +{ +mul.f16x2 r682, r406, r684; +} +{ +fma.rn.f16x2 r685, r550, r687, r682; +} +{ +add.f16x2 r689, r10, r322; +} +{ +add.f16x2 r692, r22, r334; +} +{ +sub.f16x2 r695, r10, 
r322; +} +{ +sub.f16x2 r698, r22, r334; +} +{ +add.f16x2 r701, r58, r631; +} +{ +add.f16x2 r704, r202, r637; +} +{ +sub.f16x2 r707, r58, r631; +} +{ +sub.f16x2 r710, r202, r637; +} +{ +add.f16x2 r713, r130, r647; +} +{ +add.f16x2 r716, r274, r653; +} +{ +sub.f16x2 r719, r130, r647; +} +{ +sub.f16x2 r722, r274, r653; +} +{ +add.f16x2 r725, r166, r663; +} +{ +add.f16x2 r728, r310, r669; +} +{ +sub.f16x2 r731, r166, r663; +} +{ +sub.f16x2 r734, r310, r669; +} +{ +add.f16x2 r737, r94, r679; +} +{ +add.f16x2 r740, r238, r685; +} +{ +sub.f16x2 r743, r94, r679; +} +{ +sub.f16x2 r746, r238, r685; +} +{ +cvt.rn.f16.f64 rs39, fd78; +} +mov.b32 r958, {rs39, rs39}; +{ +cvt.rn.f16.f64 rs40, fd77; +} +{ +neg.f16 rs41, rs40; +} +mov.b32 r976, {rs41, rs41}; +{ +cvt.rn.f16.f64 rs43, fd75; +} +mov.b32 r1030, {rs43, rs43}; +{ +cvt.rn.f16.f64 rs44, fd73; +} +{ +neg.f16 rs45, rs44; +} +mov.b32 r1048, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs47, fd78; +} +mov.b32 r1039, {rs47, rs47}; +{ +cvt.rn.f16.f64 rs48, fd77; +} +mov.b32 r1054, {rs48, rs48}; +{ +add.f16x2 r749, %50, %74; +} +{ +add.f16x2 r752, %42, r749; +} +{ +add.f16x2 r755, %58, %66; +} +{ +add.f16x2 r758, r752, r755; +} +{ +add.f16x2 r761, %51, %75; +} +{ +add.f16x2 r764, %43, r761; +} +{ +add.f16x2 r767, %59, %67; +} +{ +add.f16x2 r770, r764, r767; +} +{ +add.f16x2 r773, %50, %74; +} +{ +mul.f16x2 r776, r773, r958; +} +{ +add.f16x2 r779, %42, r776; +} +{ +add.f16x2 r782, %58, %66; +} +{ +mul.f16x2 r785, r782, r1030; +} +{ +add.f16x2 r788, r779, r785; +} +{ +sub.f16x2 r791, %51, %75; +} +{ +mul.f16x2 r794, r791, r976; +} +{ +sub.f16x2 r797, %59, %67; +} +{ +mul.f16x2 r800, r797, r1048; +} +{ +add.f16x2 r803, r794, r800; +} +{ +sub.f16x2 r806, r788, r803; +} +{ +add.f16x2 r809, %50, %74; +} +{ +mul.f16x2 r812, r809, r958; +} +{ +add.f16x2 r815, %42, r812; +} +{ +add.f16x2 r818, %58, %66; +} +{ +mul.f16x2 r821, r818, r1030; +} +{ +add.f16x2 r824, r815, r821; +} +{ +sub.f16x2 r827, %51, %75; +} +{ +mul.f16x2 r830, r827, r976; +} +{ 
+sub.f16x2 r833, %59, %67; +} +{ +mul.f16x2 r836, r833, r1048; +} +{ +add.f16x2 r839, r830, r836; +} +{ +add.f16x2 r842, r824, r839; +} +{ +add.f16x2 r845, %50, %74; +} +{ +mul.f16x2 r848, r845, r1030; +} +{ +add.f16x2 r851, %42, r848; +} +{ +add.f16x2 r854, %58, %66; +} +{ +mul.f16x2 r857, r854, r1039; +} +{ +add.f16x2 r860, r851, r857; +} +{ +sub.f16x2 r863, %51, %75; +} +{ +mul.f16x2 r866, r863, r1048; +} +{ +sub.f16x2 r869, %59, %67; +} +{ +mul.f16x2 r872, r869, r1054; +} +{ +add.f16x2 r875, r866, r872; +} +{ +sub.f16x2 r878, r860, r875; +} +{ +add.f16x2 r881, %50, %74; +} +{ +mul.f16x2 r884, r881, r1030; +} +{ +add.f16x2 r887, %42, r884; +} +{ +add.f16x2 r890, %58, %66; +} +{ +mul.f16x2 r893, r890, r1039; +} +{ +add.f16x2 r896, r887, r893; +} +{ +sub.f16x2 r899, %51, %75; +} +{ +mul.f16x2 r902, r899, r1048; +} +{ +sub.f16x2 r905, %59, %67; +} +{ +mul.f16x2 r908, r905, r1054; +} +{ +add.f16x2 r911, r902, r908; +} +{ +add.f16x2 r914, r896, r911; +} +{ +add.f16x2 r917, %51, %75; +} +{ +mul.f16x2 r920, r917, r958; +} +{ +add.f16x2 r923, %43, r920; +} +{ +add.f16x2 r926, %59, %67; +} +{ +mul.f16x2 r929, r926, r1030; +} +{ +add.f16x2 r932, r923, r929; +} +{ +sub.f16x2 r935, %50, %74; +} +{ +mul.f16x2 r938, r935, r976; +} +{ +sub.f16x2 r941, %58, %66; +} +{ +mul.f16x2 r944, r941, r1048; +} +{ +add.f16x2 r947, r938, r944; +} +{ +add.f16x2 r950, r932, r947; +} +{ +add.f16x2 r953, %51, %75; +} +{ +mul.f16x2 r956, r953, r958; +} +{ +add.f16x2 r959, %43, r956; +} +{ +add.f16x2 r962, %59, %67; +} +{ +mul.f16x2 r965, r962, r1030; +} +{ +add.f16x2 r968, r959, r965; +} +{ +sub.f16x2 r971, %50, %74; +} +{ +mul.f16x2 r974, r971, r976; +} +{ +sub.f16x2 r977, %58, %66; +} +{ +mul.f16x2 r980, r977, r1048; +} +{ +add.f16x2 r983, r974, r980; +} +{ +sub.f16x2 r986, r968, r983; +} +{ +add.f16x2 r989, %51, %75; +} +{ +mul.f16x2 r992, r989, r1030; +} +{ +add.f16x2 r995, %43, r992; +} +{ +add.f16x2 r998, %59, %67; +} +{ +mul.f16x2 r1001, r998, r1039; +} +{ +add.f16x2 r1004, r995, r1001; 
+} +{ +sub.f16x2 r1007, %50, %74; +} +{ +mul.f16x2 r1010, r1007, r1048; +} +{ +sub.f16x2 r1013, %58, %66; +} +{ +mul.f16x2 r1016, r1013, r1054; +} +{ +add.f16x2 r1019, r1010, r1016; +} +{ +add.f16x2 r1022, r1004, r1019; +} +{ +add.f16x2 r1025, %51, %75; +} +{ +mul.f16x2 r1028, r1025, r1030; +} +{ +add.f16x2 r1031, %43, r1028; +} +{ +add.f16x2 r1034, %59, %67; +} +{ +mul.f16x2 r1037, r1034, r1039; +} +{ +add.f16x2 r1040, r1031, r1037; +} +{ +sub.f16x2 r1043, %50, %74; +} +{ +mul.f16x2 r1046, r1043, r1048; +} +{ +sub.f16x2 r1049, %58, %66; +} +{ +mul.f16x2 r1052, r1049, r1054; +} +{ +add.f16x2 r1055, r1046, r1052; +} +{ +sub.f16x2 r1058, r1040, r1055; +} +{ +cvt.rn.f16.f64 rs49, fd78; +} +mov.b32 r1270, {rs49, rs49}; +{ +cvt.rn.f16.f64 rs50, fd77; +} +{ +neg.f16 rs51, rs50; +} +mov.b32 r1288, {rs51, rs51}; +{ +cvt.rn.f16.f64 rs53, fd75; +} +mov.b32 r1342, {rs53, rs53}; +{ +cvt.rn.f16.f64 rs54, fd73; +} +{ +neg.f16 rs55, rs54; +} +mov.b32 r1360, {rs55, rs55}; +{ +cvt.rn.f16.f64 rs57, fd78; +} +mov.b32 r1351, {rs57, rs57}; +{ +cvt.rn.f16.f64 rs58, fd77; +} +mov.b32 r1366, {rs58, rs58}; +{ +add.f16x2 r1061, %54, %78; +} +{ +add.f16x2 r1064, %46, r1061; +} +{ +add.f16x2 r1067, %62, %70; +} +{ +add.f16x2 r1070, r1064, r1067; +} +{ +add.f16x2 r1073, %55, %79; +} +{ +add.f16x2 r1076, %47, r1073; +} +{ +add.f16x2 r1079, %63, %71; +} +{ +add.f16x2 r1082, r1076, r1079; +} +{ +add.f16x2 r1085, %54, %78; +} +{ +mul.f16x2 r1088, r1085, r1270; +} +{ +add.f16x2 r1091, %46, r1088; +} +{ +add.f16x2 r1094, %62, %70; +} +{ +mul.f16x2 r1097, r1094, r1342; +} +{ +add.f16x2 r1100, r1091, r1097; +} +{ +sub.f16x2 r1103, %55, %79; +} +{ +mul.f16x2 r1106, r1103, r1288; +} +{ +sub.f16x2 r1109, %63, %71; +} +{ +mul.f16x2 r1112, r1109, r1360; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +sub.f16x2 r1118, r1100, r1115; +} +{ +add.f16x2 r1121, %54, %78; +} +{ +mul.f16x2 r1124, r1121, r1270; +} +{ +add.f16x2 r1127, %46, r1124; +} +{ +add.f16x2 r1130, %62, %70; +} +{ +mul.f16x2 r1133, r1130, r1342; 
+} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +sub.f16x2 r1139, %55, %79; +} +{ +mul.f16x2 r1142, r1139, r1288; +} +{ +sub.f16x2 r1145, %63, %71; +} +{ +mul.f16x2 r1148, r1145, r1360; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1136, r1151; +} +{ +add.f16x2 r1157, %54, %78; +} +{ +mul.f16x2 r1160, r1157, r1342; +} +{ +add.f16x2 r1163, %46, r1160; +} +{ +add.f16x2 r1166, %62, %70; +} +{ +mul.f16x2 r1169, r1166, r1351; +} +{ +add.f16x2 r1172, r1163, r1169; +} +{ +sub.f16x2 r1175, %55, %79; +} +{ +mul.f16x2 r1178, r1175, r1360; +} +{ +sub.f16x2 r1181, %63, %71; +} +{ +mul.f16x2 r1184, r1181, r1366; +} +{ +add.f16x2 r1187, r1178, r1184; +} +{ +sub.f16x2 r1190, r1172, r1187; +} +{ +add.f16x2 r1193, %54, %78; +} +{ +mul.f16x2 r1196, r1193, r1342; +} +{ +add.f16x2 r1199, %46, r1196; +} +{ +add.f16x2 r1202, %62, %70; +} +{ +mul.f16x2 r1205, r1202, r1351; +} +{ +add.f16x2 r1208, r1199, r1205; +} +{ +sub.f16x2 r1211, %55, %79; +} +{ +mul.f16x2 r1214, r1211, r1360; +} +{ +sub.f16x2 r1217, %63, %71; +} +{ +mul.f16x2 r1220, r1217, r1366; +} +{ +add.f16x2 r1223, r1214, r1220; +} +{ +add.f16x2 r1226, r1208, r1223; +} +{ +add.f16x2 r1229, %55, %79; +} +{ +mul.f16x2 r1232, r1229, r1270; +} +{ +add.f16x2 r1235, %47, r1232; +} +{ +add.f16x2 r1238, %63, %71; +} +{ +mul.f16x2 r1241, r1238, r1342; +} +{ +add.f16x2 r1244, r1235, r1241; +} +{ +sub.f16x2 r1247, %54, %78; +} +{ +mul.f16x2 r1250, r1247, r1288; +} +{ +sub.f16x2 r1253, %62, %70; +} +{ +mul.f16x2 r1256, r1253, r1360; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r1244, r1259; +} +{ +add.f16x2 r1265, %55, %79; +} +{ +mul.f16x2 r1268, r1265, r1270; +} +{ +add.f16x2 r1271, %47, r1268; +} +{ +add.f16x2 r1274, %63, %71; +} +{ +mul.f16x2 r1277, r1274, r1342; +} +{ +add.f16x2 r1280, r1271, r1277; +} +{ +sub.f16x2 r1283, %54, %78; +} +{ +mul.f16x2 r1286, r1283, r1288; +} +{ +sub.f16x2 r1289, %62, %70; +} +{ +mul.f16x2 r1292, r1289, r1360; +} +{ +add.f16x2 r1295, r1286, r1292; +} +{ +sub.f16x2 r1298, 
r1280, r1295; +} +{ +add.f16x2 r1301, %55, %79; +} +{ +mul.f16x2 r1304, r1301, r1342; +} +{ +add.f16x2 r1307, %47, r1304; +} +{ +add.f16x2 r1310, %63, %71; +} +{ +mul.f16x2 r1313, r1310, r1351; +} +{ +add.f16x2 r1316, r1307, r1313; +} +{ +sub.f16x2 r1319, %54, %78; +} +{ +mul.f16x2 r1322, r1319, r1360; +} +{ +sub.f16x2 r1325, %62, %70; +} +{ +mul.f16x2 r1328, r1325, r1366; +} +{ +add.f16x2 r1331, r1322, r1328; +} +{ +add.f16x2 r1334, r1316, r1331; +} +{ +add.f16x2 r1337, %55, %79; +} +{ +mul.f16x2 r1340, r1337, r1342; +} +{ +add.f16x2 r1343, %47, r1340; +} +{ +add.f16x2 r1346, %63, %71; +} +{ +mul.f16x2 r1349, r1346, r1351; +} +{ +add.f16x2 r1352, r1343, r1349; +} +{ +sub.f16x2 r1355, %54, %78; +} +{ +mul.f16x2 r1358, r1355, r1360; +} +{ +sub.f16x2 r1361, %62, %70; +} +{ +mul.f16x2 r1364, r1361, r1366; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +sub.f16x2 r1370, r1352, r1367; +} +{ +cvt.rn.f16.f64 rs59, fd74; +} +{ +cvt.rn.f16.f64 rs60, fd76; +} +{ +cvt.rn.f16.f64 rs61, fd78; +} +{ +cvt.rn.f16.f64 rs62, fd72; +} +{ +cvt.rn.f16.f64 rs63, fd71; +} +{ +cvt.rn.f16.f64 rs64, fd72; +} +{ +cvt.rn.f16.f64 rs65, fd75; +} +{ +cvt.rn.f16.f64 rs66, fd76; +} +mov.b32 r1387, {rs59, rs59}; +{ +mul.f16x2 r1373, r1118, r1387; +} +mov.b32 r1384, {rs60, rs60}; +{ +mul.f16x2 r1376, r1262, r1384; +} +{ +sub.f16x2 r1379, r1373, r1376; +} +{ +mul.f16x2 r1382, r1118, r1384; +} +{ +fma.rn.f16x2 r1385, r1262, r1387, r1382; +} +mov.b32 r1403, {rs61, rs61}; +{ +mul.f16x2 r1389, r1190, r1403; +} +mov.b32 r1400, {rs62, rs62}; +{ +mul.f16x2 r1392, r1334, r1400; +} +{ +sub.f16x2 r1395, r1389, r1392; +} +{ +mul.f16x2 r1398, r1190, r1400; +} +{ +fma.rn.f16x2 r1401, r1334, r1403, r1398; +} +mov.b32 r1419, {rs63, rs63}; +{ +mul.f16x2 r1405, r1226, r1419; +} +mov.b32 r1416, {rs64, rs64}; +{ +mul.f16x2 r1408, r1370, r1416; +} +{ +sub.f16x2 r1411, r1405, r1408; +} +{ +mul.f16x2 r1414, r1226, r1416; +} +{ +fma.rn.f16x2 r1417, r1370, r1419, r1414; +} +mov.b32 r1435, {rs65, rs65}; +{ +mul.f16x2 r1421, 
r1154, r1435; +} +mov.b32 r1432, {rs66, rs66}; +{ +mul.f16x2 r1424, r1298, r1432; +} +{ +sub.f16x2 r1427, r1421, r1424; +} +{ +mul.f16x2 r1430, r1154, r1432; +} +{ +fma.rn.f16x2 r1433, r1298, r1435, r1430; +} +{ +add.f16x2 r1437, r758, r1070; +} +{ +add.f16x2 r1440, r770, r1082; +} +{ +sub.f16x2 r1443, r758, r1070; +} +{ +sub.f16x2 r1446, r770, r1082; +} +{ +add.f16x2 r1449, r806, r1379; +} +{ +add.f16x2 r1452, r950, r1385; +} +{ +sub.f16x2 r1455, r806, r1379; +} +{ +sub.f16x2 r1458, r950, r1385; +} +{ +add.f16x2 r1461, r878, r1395; +} +{ +add.f16x2 r1464, r1022, r1401; +} +{ +sub.f16x2 r1467, r878, r1395; +} +{ +sub.f16x2 r1470, r1022, r1401; +} +{ +add.f16x2 r1473, r914, r1411; +} +{ +add.f16x2 r1476, r1058, r1417; +} +{ +sub.f16x2 r1479, r914, r1411; +} +{ +sub.f16x2 r1482, r1058, r1417; +} +{ +add.f16x2 r1485, r842, r1427; +} +{ +add.f16x2 r1488, r986, r1433; +} +{ +sub.f16x2 r1491, r842, r1427; +} +{ +sub.f16x2 r1494, r986, r1433; +} +{ +cvt.rn.f16.f64 rs77, fd72; +} +{ +cvt.rn.f16.f64 rs78, fd78; +} +{ +cvt.rn.f16.f64 rs79, fd74; +} +{ +cvt.rn.f16.f64 rs80, fd76; +} +{ +cvt.rn.f16.f64 rs81, fd76; +} +{ +cvt.rn.f16.f64 rs82, fd74; +} +{ +cvt.rn.f16.f64 rs83, fd78; +} +{ +cvt.rn.f16.f64 rs84, fd72; +} +{ +cvt.rn.f16.f64 rs87, fd71; +} +{ +cvt.rn.f16.f64 rs88, fd72; +} +{ +cvt.rn.f16.f64 rs89, fd73; +} +{ +cvt.rn.f16.f64 rs90, fd74; +} +{ +cvt.rn.f16.f64 rs91, fd75; +} +{ +cvt.rn.f16.f64 rs92, fd76; +} +{ +cvt.rn.f16.f64 rs93, fd77; +} +{ +cvt.rn.f16.f64 rs94, fd78; +} +mov.b32 r1511, {rs77, rs77}; +{ +mul.f16x2 r1497, r1449, r1511; +} +mov.b32 r1508, {rs78, rs78}; +{ +mul.f16x2 r1500, r1452, r1508; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1449, r1508; +} +{ +fma.rn.f16x2 r1509, r1452, r1511, r1506; +} +mov.b32 r1527, {rs79, rs79}; +{ +mul.f16x2 r1513, r1461, r1527; +} +mov.b32 r1524, {rs80, rs80}; +{ +mul.f16x2 r1516, r1464, r1524; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1461, r1524; +} +{ +fma.rn.f16x2 r1525, r1464, 
r1527, r1522; +} +mov.b32 r1543, {rs81, rs81}; +{ +mul.f16x2 r1529, r1473, r1543; +} +mov.b32 r1540, {rs82, rs82}; +{ +mul.f16x2 r1532, r1476, r1540; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1473, r1540; +} +{ +fma.rn.f16x2 r1541, r1476, r1543, r1538; +} +mov.b32 r1559, {rs83, rs83}; +{ +mul.f16x2 r1545, r1485, r1559; +} +mov.b32 r1556, {rs84, rs84}; +{ +mul.f16x2 r1548, r1488, r1556; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1485, r1556; +} +{ +fma.rn.f16x2 r1557, r1488, r1559, r1554; +} +{ +neg.f16x2 r1561, r1446; +} +mov.b32 r1577, {rs87, rs87}; +{ +mul.f16x2 r1563, r1455, r1577; +} +mov.b32 r1574, {rs88, rs88}; +{ +mul.f16x2 r1566, r1458, r1574; +} +{ +sub.f16x2 r1569, r1563, r1566; +} +{ +mul.f16x2 r1572, r1455, r1574; +} +{ +fma.rn.f16x2 r1575, r1458, r1577, r1572; +} +mov.b32 r1593, {rs89, rs89}; +{ +mul.f16x2 r1579, r1467, r1593; +} +mov.b32 r1590, {rs90, rs90}; +{ +mul.f16x2 r1582, r1470, r1590; +} +{ +sub.f16x2 r1585, r1579, r1582; +} +{ +mul.f16x2 r1588, r1467, r1590; +} +{ +fma.rn.f16x2 r1591, r1470, r1593, r1588; +} +mov.b32 r1609, {rs91, rs91}; +{ +mul.f16x2 r1595, r1479, r1609; +} +mov.b32 r1606, {rs92, rs92}; +{ +mul.f16x2 r1598, r1482, r1606; +} +{ +sub.f16x2 r1601, r1595, r1598; +} +{ +mul.f16x2 r1604, r1479, r1606; +} +{ +fma.rn.f16x2 r1607, r1482, r1609, r1604; +} +mov.b32 r1625, {rs93, rs93}; +{ +mul.f16x2 r1611, r1491, r1625; +} +mov.b32 r1622, {rs94, rs94}; +{ +mul.f16x2 r1614, r1494, r1622; +} +{ +sub.f16x2 r1617, r1611, r1614; +} +{ +mul.f16x2 r1620, r1491, r1622; +} +{ +fma.rn.f16x2 r1623, r1494, r1625, r1620; +} +{ +add.f16x2 %0, r689, r1437; +} +{ +add.f16x2 %1, r692, r1440; +} +{ +sub.f16x2 %20, r689, r1437; +} +{ +sub.f16x2 %21, r692, r1440; +} +{ +add.f16x2 %2, r701, r1503; +} +{ +add.f16x2 %3, r704, r1509; +} +{ +sub.f16x2 %22, r701, r1503; +} +{ +sub.f16x2 %23, r704, r1509; +} +{ +add.f16x2 %4, r713, r1519; +} +{ +add.f16x2 %5, r716, r1525; +} +{ +sub.f16x2 %24, r713, r1519; +} +{ 
+sub.f16x2 %25, r716, r1525; +} +{ +add.f16x2 %6, r725, r1535; +} +{ +add.f16x2 %7, r728, r1541; +} +{ +sub.f16x2 %26, r725, r1535; +} +{ +sub.f16x2 %27, r728, r1541; +} +{ +add.f16x2 %8, r737, r1551; +} +{ +add.f16x2 %9, r740, r1557; +} +{ +sub.f16x2 %28, r737, r1551; +} +{ +sub.f16x2 %29, r740, r1557; +} +{ +add.f16x2 %10, r695, r1561; +} +{ +add.f16x2 %11, r698, r1443; +} +{ +sub.f16x2 %30, r695, r1561; +} +{ +sub.f16x2 %31, r698, r1443; +} +{ +add.f16x2 %12, r707, r1569; +} +{ +add.f16x2 %13, r710, r1575; +} +{ +sub.f16x2 %32, r707, r1569; +} +{ +sub.f16x2 %33, r710, r1575; +} +{ +add.f16x2 %14, r719, r1585; +} +{ +add.f16x2 %15, r722, r1591; +} +{ +sub.f16x2 %34, r719, r1585; +} +{ +sub.f16x2 %35, r722, r1591; +} +{ +add.f16x2 %16, r731, r1601; +} +{ +add.f16x2 %17, r734, r1607; +} +{ +sub.f16x2 %36, r731, r1601; +} +{ +sub.f16x2 %37, r734, r1607; +} +{ +add.f16x2 %18, r743, r1617; +} +{ +add.f16x2 %19, r746, r1623; +} +{ +sub.f16x2 %38, r743, r1617; +} +{ +sub.f16x2 %39, r746, r1623; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), 
"=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..dc99804b760f9 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_fwd.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_20_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_20_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<6, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<417>; +.reg .b64 rd<2>; +add.f32 f81, %50, %82; +add.f32 f82, %40, f81; +add.f32 f83, %61, %72; +add.f32 f84, f83, f82; +add.f32 f85, %52, %84; +add.f32 f86, %41, f85; +add.f32 f87, %63, %73; +add.f32 f88, f87, f86; +fma.rn.f32 f89, f81, 0f3E9E377A, %40; +mul.f32 f90, f83, 0f3F4F1BBD; +sub.f32 f91, f89, f90; +sub.f32 f92, %52, %84; +mul.f32 f93, f92, 0f3F737871; +sub.f32 f94, %63, %73; +mul.f32 f95, f94, 0fBF167918; +sub.f32 f96, f95, f93; +sub.f32 f97, f91, f96; +add.f32 f98, f96, f91; +mul.f32 f99, f81, 0f3F4F1BBD; +sub.f32 f100, %40, f99; +fma.rn.f32 f101, f83, 0f3E9E377A, f100; +mul.f32 f102, f92, 0f3F167918; +mul.f32 f103, f94, 0f3F737871; +sub.f32 f104, f103, f102; +sub.f32 f105, f101, f104; +add.f32 f106, f104, f101; +fma.rn.f32 f107, f85, 0f3E9E377A, %41; +mul.f32 f108, f87, 0f3F4F1BBD; +sub.f32 f109, f107, f108; +sub.f32 f110, %50, %82; +mul.f32 f111, f110, 0f3F737871; +sub.f32 f112, %61, %72; +mul.f32 f113, f112, 0fBF167918; +sub.f32 f114, f113, f111; +add.f32 f115, f114, f109; +sub.f32 f116, f109, f114; +mul.f32 f117, f85, 0f3F4F1BBD; +sub.f32 f118, %41, f117; +fma.rn.f32 f119, f87, 0f3E9E377A, f118; +mul.f32 f120, f110, 0f3F167918; +mul.f32 f121, f112, 0f3F737871; +sub.f32 f122, f121, f120; +add.f32 f123, f122, f119; +sub.f32 f124, f119, f122; +add.f32 f125, %56, %88; +add.f32 f126, %45, f125; +add.f32 f127, %66, %77; +add.f32 f128, f127, f126; +add.f32 f129, %57, %89; +add.f32 f130, %47, f129; +add.f32 f131, %68, %79; +add.f32 f132, f131, f130; +fma.rn.f32 f133, f125, 0f3E9E377A, %45; +mul.f32 f134, f127, 0f3F4F1BBD; +sub.f32 f135, f133, f134; +sub.f32 f136, 
%57, %89; +mul.f32 f137, f136, 0f3F737871; +sub.f32 f138, %68, %79; +mul.f32 f139, f138, 0fBF167918; +sub.f32 f140, f139, f137; +sub.f32 f141, f135, f140; +add.f32 f142, f140, f135; +mul.f32 f143, f125, 0f3F4F1BBD; +sub.f32 f144, %45, f143; +fma.rn.f32 f145, f127, 0f3E9E377A, f144; +mul.f32 f146, f136, 0f3F167918; +mul.f32 f147, f138, 0f3F737871; +sub.f32 f148, f147, f146; +sub.f32 f149, f145, f148; +add.f32 f150, f148, f145; +fma.rn.f32 f151, f129, 0f3E9E377A, %47; +mul.f32 f152, f131, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %56, %88; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %66, %77; +mul.f32 f157, f156, 0fBF167918; +sub.f32 f158, f157, f155; +add.f32 f159, f158, f153; +sub.f32 f160, f153, f158; +mul.f32 f161, f129, 0f3F4F1BBD; +sub.f32 f162, %47, f161; +fma.rn.f32 f163, f131, 0f3E9E377A, f162; +mul.f32 f164, f154, 0f3F167918; +mul.f32 f165, f156, 0f3F737871; +sub.f32 f166, f165, f164; +add.f32 f167, f166, f163; +sub.f32 f168, f163, f166; +mul.f32 f169, f141, 0f3F4F1BBD; +mul.f32 f170, f159, 0fBF167918; +sub.f32 f171, f169, f170; +mul.f32 f172, f159, 0f3F4F1BBD; +fma.rn.f32 f173, f141, 0fBF167918, f172; +mul.f32 f174, f149, 0f3E9E377A; +mul.f32 f175, f167, 0fBF737871; +sub.f32 f176, f174, f175; +mul.f32 f177, f167, 0f3E9E377A; +fma.rn.f32 f178, f149, 0fBF737871, f177; +mul.f32 f179, f150, 0fBE9E377A; +mul.f32 f180, f168, 0fBF737871; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0fBE9E377A; +fma.rn.f32 f183, f150, 0fBF737871, f182; +mul.f32 f184, f142, 0fBF4F1BBD; +mul.f32 f185, f160, 0fBF167918; +sub.f32 f186, f184, f185; +mul.f32 f187, f160, 0fBF4F1BBD; +fma.rn.f32 f188, f142, 0fBF167918, f187; +add.f32 f189, f84, f128; +add.f32 f190, f88, f132; +sub.f32 f191, f84, f128; +sub.f32 f192, f88, f132; +add.f32 f193, f97, f171; +add.f32 f194, f115, f173; +sub.f32 f195, f97, f171; +sub.f32 f196, f115, f173; +add.f32 f197, f105, f176; +add.f32 f198, f123, f178; +sub.f32 f199, f105, f176; +sub.f32 f200, f123, f178; +add.f32 f201, f106, f181; 
+add.f32 f202, f124, f183; +sub.f32 f203, f106, f181; +sub.f32 f204, f124, f183; +add.f32 f205, f98, f186; +add.f32 f206, f116, f188; +sub.f32 f207, f98, f186; +sub.f32 f208, f116, f188; +add.f32 f209, %53, %85; +add.f32 f210, %42, f209; +add.f32 f211, %64, %74; +add.f32 f212, f211, f210; +add.f32 f213, %55, %87; +add.f32 f214, %44, f213; +add.f32 f215, %65, %76; +add.f32 f216, f215, f214; +fma.rn.f32 f217, f209, 0f3E9E377A, %42; +mul.f32 f218, f211, 0f3F4F1BBD; +sub.f32 f219, f217, f218; +sub.f32 f220, %55, %87; +mul.f32 f221, f220, 0f3F737871; +sub.f32 f222, %65, %76; +mul.f32 f223, f222, 0fBF167918; +sub.f32 f224, f223, f221; +sub.f32 f225, f219, f224; +add.f32 f226, f224, f219; +mul.f32 f227, f209, 0f3F4F1BBD; +sub.f32 f228, %42, f227; +fma.rn.f32 f229, f211, 0f3E9E377A, f228; +mul.f32 f230, f220, 0f3F167918; +mul.f32 f231, f222, 0f3F737871; +sub.f32 f232, f231, f230; +sub.f32 f233, f229, f232; +add.f32 f234, f232, f229; +fma.rn.f32 f235, f213, 0f3E9E377A, %44; +mul.f32 f236, f215, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %53, %85; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %64, %74; +mul.f32 f241, f240, 0fBF167918; +sub.f32 f242, f241, f239; +add.f32 f243, f242, f237; +sub.f32 f244, f237, f242; +mul.f32 f245, f213, 0f3F4F1BBD; +sub.f32 f246, %44, f245; +fma.rn.f32 f247, f215, 0f3E9E377A, f246; +mul.f32 f248, f238, 0f3F167918; +mul.f32 f249, f240, 0f3F737871; +sub.f32 f250, f249, f248; +add.f32 f251, f250, f247; +sub.f32 f252, f247, f250; +add.f32 f253, %58, %90; +add.f32 f254, %48, f253; +add.f32 f255, %69, %80; +add.f32 f256, f255, f254; +add.f32 f257, %60, %91; +add.f32 f258, %49, f257; +add.f32 f259, %71, %81; +add.f32 f260, f259, f258; +fma.rn.f32 f261, f253, 0f3E9E377A, %48; +mul.f32 f262, f255, 0f3F4F1BBD; +sub.f32 f263, f261, f262; +sub.f32 f264, %60, %91; +mul.f32 f265, f264, 0f3F737871; +sub.f32 f266, %71, %81; +mul.f32 f267, f266, 0fBF167918; +sub.f32 f268, f267, f265; +sub.f32 f269, f263, f268; +add.f32 f270, f268, f263; +mul.f32 
f271, f253, 0f3F4F1BBD; +sub.f32 f272, %48, f271; +fma.rn.f32 f273, f255, 0f3E9E377A, f272; +mul.f32 f274, f264, 0f3F167918; +mul.f32 f275, f266, 0f3F737871; +sub.f32 f276, f275, f274; +sub.f32 f277, f273, f276; +add.f32 f278, f276, f273; +fma.rn.f32 f279, f257, 0f3E9E377A, %49; +mul.f32 f280, f259, 0f3F4F1BBD; +sub.f32 f281, f279, f280; +sub.f32 f282, %58, %90; +mul.f32 f283, f282, 0f3F737871; +sub.f32 f284, %69, %80; +mul.f32 f285, f284, 0fBF167918; +sub.f32 f286, f285, f283; +add.f32 f287, f286, f281; +sub.f32 f288, f281, f286; +mul.f32 f289, f257, 0f3F4F1BBD; +sub.f32 f290, %49, f289; +fma.rn.f32 f291, f259, 0f3E9E377A, f290; +mul.f32 f292, f282, 0f3F167918; +mul.f32 f293, f284, 0f3F737871; +sub.f32 f294, f293, f292; +add.f32 f295, f294, f291; +sub.f32 f296, f291, f294; +mul.f32 f297, f269, 0f3F4F1BBD; +mul.f32 f298, f287, 0fBF167918; +sub.f32 f299, f297, f298; +mul.f32 f300, f287, 0f3F4F1BBD; +fma.rn.f32 f301, f269, 0fBF167918, f300; +mul.f32 f302, f277, 0f3E9E377A; +mul.f32 f303, f295, 0fBF737871; +sub.f32 f304, f302, f303; +mul.f32 f305, f295, 0f3E9E377A; +fma.rn.f32 f306, f277, 0fBF737871, f305; +mul.f32 f307, f278, 0fBE9E377A; +mul.f32 f308, f296, 0fBF737871; +sub.f32 f309, f307, f308; +mul.f32 f310, f296, 0fBE9E377A; +fma.rn.f32 f311, f278, 0fBF737871, f310; +mul.f32 f312, f270, 0fBF4F1BBD; +mul.f32 f313, f288, 0fBF167918; +sub.f32 f314, f312, f313; +mul.f32 f315, f288, 0fBF4F1BBD; +fma.rn.f32 f316, f270, 0fBF167918, f315; +add.f32 f317, f212, f256; +add.f32 f318, f216, f260; +sub.f32 f319, f212, f256; +sub.f32 f320, f216, f260; +add.f32 f321, f225, f299; +add.f32 f322, f243, f301; +sub.f32 f323, f225, f299; +sub.f32 f324, f243, f301; +add.f32 f325, f233, f304; +add.f32 f326, f251, f306; +sub.f32 f327, f233, f304; +sub.f32 f328, f251, f306; +add.f32 f329, f234, f309; +add.f32 f330, f252, f311; +sub.f32 f331, f234, f309; +sub.f32 f332, f252, f311; +add.f32 f333, f226, f314; +add.f32 f334, f244, f316; +sub.f32 f335, f226, f314; +sub.f32 f336, f244, f316; 
+mul.f32 f337, f321, 0f3F737871; +mul.f32 f338, f322, 0fBE9E377A; +sub.f32 f339, f337, f338; +mul.f32 f340, f322, 0f3F737871; +fma.rn.f32 f341, f321, 0fBE9E377A, f340; +mul.f32 f342, f325, 0f3F4F1BBD; +mul.f32 f343, f326, 0fBF167918; +sub.f32 f344, f342, f343; +mul.f32 f345, f326, 0f3F4F1BBD; +fma.rn.f32 f346, f325, 0fBF167918, f345; +mul.f32 f347, f329, 0f3F167918; +mul.f32 f348, f330, 0fBF4F1BBD; +sub.f32 f349, f347, f348; +mul.f32 f350, f330, 0f3F167918; +fma.rn.f32 f351, f329, 0fBF4F1BBD, f350; +mul.f32 f352, f333, 0f3E9E377A; +mul.f32 f353, f334, 0fBF737871; +sub.f32 f354, f352, f353; +mul.f32 f355, f334, 0f3E9E377A; +fma.rn.f32 f356, f333, 0fBF737871, f355; +mul.f32 f357, f323, 0fBE9E377A; +mul.f32 f358, f324, 0fBF737871; +sub.f32 f359, f357, f358; +mul.f32 f360, f324, 0fBE9E377A; +fma.rn.f32 f361, f323, 0fBF737871, f360; +mul.f32 f362, f327, 0fBF167918; +mul.f32 f363, f328, 0fBF4F1BBD; +sub.f32 f364, f362, f363; +mul.f32 f365, f328, 0fBF167918; +fma.rn.f32 f366, f327, 0fBF4F1BBD, f365; +mul.f32 f367, f331, 0fBF4F1BBD; +mul.f32 f368, f332, 0fBF167918; +sub.f32 f369, f367, f368; +mul.f32 f370, f332, 0fBF4F1BBD; +fma.rn.f32 f371, f331, 0fBF167918, f370; +mul.f32 f372, f335, 0fBF737871; +mul.f32 f373, f336, 0fBE9E377A; +sub.f32 f374, f372, f373; +mul.f32 f375, f336, 0fBF737871; +fma.rn.f32 f376, f335, 0fBE9E377A, f375; +add.f32 %1, f190, f318; +add.f32 %0, f189, f317; +add.f32 %3, f194, f341; +add.f32 %2, f193, f339; +add.f32 %5, f198, f346; +add.f32 %4, f197, f344; +add.f32 %7, f202, f351; +add.f32 %6, f201, f349; +add.f32 %9, f206, f356; +add.f32 %8, f205, f354; +sub.f32 %11, f192, f319; +add.f32 %10, f191, f320; +add.f32 %13, f196, f361; +add.f32 %12, f195, f359; +add.f32 %15, f200, f366; +add.f32 %14, f199, f364; +add.f32 %17, f204, f371; +add.f32 %16, f203, f369; +add.f32 %19, f208, f376; +add.f32 %18, f207, f374; +sub.f32 %21, f190, f318; +sub.f32 %20, f189, f317; +sub.f32 %23, f194, f341; +sub.f32 %22, f193, f339; +sub.f32 %25, f198, f346; +sub.f32 %24, 
f197, f344; +sub.f32 %27, f202, f351; +sub.f32 %26, f201, f349; +sub.f32 %29, f206, f356; +sub.f32 %28, f205, f354; +add.f32 %31, f192, f319; +sub.f32 %30, f191, f320; +sub.f32 %33, f196, f361; +sub.f32 %32, f195, f359; +sub.f32 %35, f200, f366; +sub.f32 %34, f199, f364; +sub.f32 %37, f204, f371; +sub.f32 %36, f203, f369; +sub.f32 %39, f208, f376; +sub.f32 %38, f207, f374; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..d09ffb8fa9aaa --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp32_inv.hpp.inc @@ -0,0 +1,344 @@ +#ifndef CUFFTDX_FFT_20_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_20_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<208, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<409>; +.reg .b64 rd<2>; +add.f32 f81, %50, %82; +add.f32 f82, %40, f81; +add.f32 f83, %61, %72; +add.f32 f84, f83, f82; +add.f32 f85, %52, %84; +add.f32 f86, %41, f85; +add.f32 f87, %63, %73; +add.f32 f88, f87, f86; +fma.rn.f32 f89, f81, 0f3E9E377A, %40; +mul.f32 f90, f83, 0f3F4F1BBD; +sub.f32 f91, f89, f90; +sub.f32 f92, %52, %84; +mul.f32 f93, f92, 0f3F737871; +sub.f32 f94, %63, %73; +fma.rn.f32 f95, f94, 0f3F167918, f93; +sub.f32 f96, f91, f95; +add.f32 f97, f95, f91; +mul.f32 f98, f81, 0f3F4F1BBD; +sub.f32 f99, %40, f98; +fma.rn.f32 f100, f83, 0f3E9E377A, f99; +mul.f32 f101, f92, 0f3F167918; +mul.f32 f102, f94, 0f3F737871; +sub.f32 f103, f101, f102; +sub.f32 f104, f100, f103; +add.f32 f105, f103, f100; +fma.rn.f32 f106, f85, 0f3E9E377A, %41; +mul.f32 f107, f87, 0f3F4F1BBD; +sub.f32 f108, f106, f107; +sub.f32 f109, %50, %82; +mul.f32 f110, f109, 0f3F737871; +sub.f32 f111, %61, %72; +fma.rn.f32 f112, f111, 0f3F167918, f110; +add.f32 f113, f112, f108; +sub.f32 f114, f108, f112; +mul.f32 f115, f85, 0f3F4F1BBD; +sub.f32 f116, %41, f115; +fma.rn.f32 f117, f87, 0f3E9E377A, f116; +mul.f32 f118, f109, 0f3F167918; +mul.f32 f119, f111, 0f3F737871; +sub.f32 f120, f118, f119; +add.f32 f121, f120, f117; +sub.f32 f122, f117, f120; +add.f32 f123, %56, %88; 
+add.f32 f124, %45, f123; +add.f32 f125, %66, %77; +add.f32 f126, f125, f124; +add.f32 f127, %57, %89; +add.f32 f128, %47, f127; +add.f32 f129, %68, %79; +add.f32 f130, f129, f128; +fma.rn.f32 f131, f123, 0f3E9E377A, %45; +mul.f32 f132, f125, 0f3F4F1BBD; +sub.f32 f133, f131, f132; +sub.f32 f134, %57, %89; +mul.f32 f135, f134, 0f3F737871; +sub.f32 f136, %68, %79; +fma.rn.f32 f137, f136, 0f3F167918, f135; +sub.f32 f138, f133, f137; +add.f32 f139, f137, f133; +mul.f32 f140, f123, 0f3F4F1BBD; +sub.f32 f141, %45, f140; +fma.rn.f32 f142, f125, 0f3E9E377A, f141; +mul.f32 f143, f134, 0f3F167918; +mul.f32 f144, f136, 0f3F737871; +sub.f32 f145, f143, f144; +sub.f32 f146, f142, f145; +add.f32 f147, f145, f142; +fma.rn.f32 f148, f127, 0f3E9E377A, %47; +mul.f32 f149, f129, 0f3F4F1BBD; +sub.f32 f150, f148, f149; +sub.f32 f151, %56, %88; +mul.f32 f152, f151, 0f3F737871; +sub.f32 f153, %66, %77; +fma.rn.f32 f154, f153, 0f3F167918, f152; +add.f32 f155, f154, f150; +sub.f32 f156, f150, f154; +mul.f32 f157, f127, 0f3F4F1BBD; +sub.f32 f158, %47, f157; +fma.rn.f32 f159, f129, 0f3E9E377A, f158; +mul.f32 f160, f151, 0f3F167918; +mul.f32 f161, f153, 0f3F737871; +sub.f32 f162, f160, f161; +add.f32 f163, f162, f159; +sub.f32 f164, f159, f162; +mul.f32 f165, f138, 0f3F4F1BBD; +mul.f32 f166, f155, 0f3F167918; +sub.f32 f167, f165, f166; +mul.f32 f168, f155, 0f3F4F1BBD; +fma.rn.f32 f169, f138, 0f3F167918, f168; +mul.f32 f170, f146, 0f3E9E377A; +mul.f32 f171, f163, 0f3F737871; +sub.f32 f172, f170, f171; +mul.f32 f173, f163, 0f3E9E377A; +fma.rn.f32 f174, f146, 0f3F737871, f173; +mul.f32 f175, f147, 0fBE9E377A; +mul.f32 f176, f164, 0f3F737871; +sub.f32 f177, f175, f176; +mul.f32 f178, f164, 0fBE9E377A; +fma.rn.f32 f179, f147, 0f3F737871, f178; +mul.f32 f180, f139, 0fBF4F1BBD; +mul.f32 f181, f156, 0f3F167918; +sub.f32 f182, f180, f181; +mul.f32 f183, f156, 0fBF4F1BBD; +fma.rn.f32 f184, f139, 0f3F167918, f183; +add.f32 f185, f84, f126; +add.f32 f186, f88, f130; +sub.f32 f187, f84, f126; +sub.f32 
f188, f88, f130; +add.f32 f189, f96, f167; +add.f32 f190, f113, f169; +sub.f32 f191, f96, f167; +sub.f32 f192, f113, f169; +add.f32 f193, f104, f172; +add.f32 f194, f121, f174; +sub.f32 f195, f104, f172; +sub.f32 f196, f121, f174; +add.f32 f197, f105, f177; +add.f32 f198, f122, f179; +sub.f32 f199, f105, f177; +sub.f32 f200, f122, f179; +add.f32 f201, f97, f182; +add.f32 f202, f114, f184; +sub.f32 f203, f97, f182; +sub.f32 f204, f114, f184; +add.f32 f205, %53, %85; +add.f32 f206, %42, f205; +add.f32 f207, %64, %74; +add.f32 f208, f207, f206; +add.f32 f209, %55, %87; +add.f32 f210, %44, f209; +add.f32 f211, %65, %76; +add.f32 f212, f211, f210; +fma.rn.f32 f213, f205, 0f3E9E377A, %42; +mul.f32 f214, f207, 0f3F4F1BBD; +sub.f32 f215, f213, f214; +sub.f32 f216, %55, %87; +mul.f32 f217, f216, 0f3F737871; +sub.f32 f218, %65, %76; +fma.rn.f32 f219, f218, 0f3F167918, f217; +sub.f32 f220, f215, f219; +add.f32 f221, f219, f215; +mul.f32 f222, f205, 0f3F4F1BBD; +sub.f32 f223, %42, f222; +fma.rn.f32 f224, f207, 0f3E9E377A, f223; +mul.f32 f225, f216, 0f3F167918; +mul.f32 f226, f218, 0f3F737871; +sub.f32 f227, f225, f226; +sub.f32 f228, f224, f227; +add.f32 f229, f227, f224; +fma.rn.f32 f230, f209, 0f3E9E377A, %44; +mul.f32 f231, f211, 0f3F4F1BBD; +sub.f32 f232, f230, f231; +sub.f32 f233, %53, %85; +mul.f32 f234, f233, 0f3F737871; +sub.f32 f235, %64, %74; +fma.rn.f32 f236, f235, 0f3F167918, f234; +add.f32 f237, f236, f232; +sub.f32 f238, f232, f236; +mul.f32 f239, f209, 0f3F4F1BBD; +sub.f32 f240, %44, f239; +fma.rn.f32 f241, f211, 0f3E9E377A, f240; +mul.f32 f242, f233, 0f3F167918; +mul.f32 f243, f235, 0f3F737871; +sub.f32 f244, f242, f243; +add.f32 f245, f244, f241; +sub.f32 f246, f241, f244; +add.f32 f247, %58, %90; +add.f32 f248, %48, f247; +add.f32 f249, %69, %80; +add.f32 f250, f249, f248; +add.f32 f251, %60, %91; +add.f32 f252, %49, f251; +add.f32 f253, %71, %81; +add.f32 f254, f253, f252; +fma.rn.f32 f255, f247, 0f3E9E377A, %48; +mul.f32 f256, f249, 0f3F4F1BBD; +sub.f32 
f257, f255, f256; +sub.f32 f258, %60, %91; +mul.f32 f259, f258, 0f3F737871; +sub.f32 f260, %71, %81; +fma.rn.f32 f261, f260, 0f3F167918, f259; +sub.f32 f262, f257, f261; +add.f32 f263, f261, f257; +mul.f32 f264, f247, 0f3F4F1BBD; +sub.f32 f265, %48, f264; +fma.rn.f32 f266, f249, 0f3E9E377A, f265; +mul.f32 f267, f258, 0f3F167918; +mul.f32 f268, f260, 0f3F737871; +sub.f32 f269, f267, f268; +sub.f32 f270, f266, f269; +add.f32 f271, f269, f266; +fma.rn.f32 f272, f251, 0f3E9E377A, %49; +mul.f32 f273, f253, 0f3F4F1BBD; +sub.f32 f274, f272, f273; +sub.f32 f275, %58, %90; +mul.f32 f276, f275, 0f3F737871; +sub.f32 f277, %69, %80; +fma.rn.f32 f278, f277, 0f3F167918, f276; +add.f32 f279, f278, f274; +sub.f32 f280, f274, f278; +mul.f32 f281, f251, 0f3F4F1BBD; +sub.f32 f282, %49, f281; +fma.rn.f32 f283, f253, 0f3E9E377A, f282; +mul.f32 f284, f275, 0f3F167918; +mul.f32 f285, f277, 0f3F737871; +sub.f32 f286, f284, f285; +add.f32 f287, f286, f283; +sub.f32 f288, f283, f286; +mul.f32 f289, f262, 0f3F4F1BBD; +mul.f32 f290, f279, 0f3F167918; +sub.f32 f291, f289, f290; +mul.f32 f292, f279, 0f3F4F1BBD; +fma.rn.f32 f293, f262, 0f3F167918, f292; +mul.f32 f294, f270, 0f3E9E377A; +mul.f32 f295, f287, 0f3F737871; +sub.f32 f296, f294, f295; +mul.f32 f297, f287, 0f3E9E377A; +fma.rn.f32 f298, f270, 0f3F737871, f297; +mul.f32 f299, f271, 0fBE9E377A; +mul.f32 f300, f288, 0f3F737871; +sub.f32 f301, f299, f300; +mul.f32 f302, f288, 0fBE9E377A; +fma.rn.f32 f303, f271, 0f3F737871, f302; +mul.f32 f304, f263, 0fBF4F1BBD; +mul.f32 f305, f280, 0f3F167918; +sub.f32 f306, f304, f305; +mul.f32 f307, f280, 0fBF4F1BBD; +fma.rn.f32 f308, f263, 0f3F167918, f307; +add.f32 f309, f208, f250; +add.f32 f310, f212, f254; +sub.f32 f311, f208, f250; +sub.f32 f312, f212, f254; +add.f32 f313, f220, f291; +add.f32 f314, f237, f293; +sub.f32 f315, f220, f291; +sub.f32 f316, f237, f293; +add.f32 f317, f228, f296; +add.f32 f318, f245, f298; +sub.f32 f319, f228, f296; +sub.f32 f320, f245, f298; +add.f32 f321, f229, f301; 
+add.f32 f322, f246, f303; +sub.f32 f323, f229, f301; +sub.f32 f324, f246, f303; +add.f32 f325, f221, f306; +add.f32 f326, f238, f308; +sub.f32 f327, f221, f306; +sub.f32 f328, f238, f308; +mul.f32 f329, f313, 0f3F737871; +mul.f32 f330, f314, 0f3E9E377A; +sub.f32 f331, f329, f330; +mul.f32 f332, f314, 0f3F737871; +fma.rn.f32 f333, f313, 0f3E9E377A, f332; +mul.f32 f334, f317, 0f3F4F1BBD; +mul.f32 f335, f318, 0f3F167918; +sub.f32 f336, f334, f335; +mul.f32 f337, f318, 0f3F4F1BBD; +fma.rn.f32 f338, f317, 0f3F167918, f337; +mul.f32 f339, f321, 0f3F167918; +mul.f32 f340, f322, 0f3F4F1BBD; +sub.f32 f341, f339, f340; +mul.f32 f342, f322, 0f3F167918; +fma.rn.f32 f343, f321, 0f3F4F1BBD, f342; +mul.f32 f344, f325, 0f3E9E377A; +mul.f32 f345, f326, 0f3F737871; +sub.f32 f346, f344, f345; +mul.f32 f347, f326, 0f3E9E377A; +fma.rn.f32 f348, f325, 0f3F737871, f347; +mul.f32 f349, f315, 0fBE9E377A; +mul.f32 f350, f316, 0f3F737871; +sub.f32 f351, f349, f350; +mul.f32 f352, f316, 0fBE9E377A; +fma.rn.f32 f353, f315, 0f3F737871, f352; +mul.f32 f354, f319, 0fBF167918; +mul.f32 f355, f320, 0f3F4F1BBD; +sub.f32 f356, f354, f355; +mul.f32 f357, f320, 0fBF167918; +fma.rn.f32 f358, f319, 0f3F4F1BBD, f357; +mul.f32 f359, f323, 0fBF4F1BBD; +mul.f32 f360, f324, 0f3F167918; +sub.f32 f361, f359, f360; +mul.f32 f362, f324, 0fBF4F1BBD; +fma.rn.f32 f363, f323, 0f3F167918, f362; +mul.f32 f364, f327, 0fBF737871; +mul.f32 f365, f328, 0f3E9E377A; +sub.f32 f366, f364, f365; +mul.f32 f367, f328, 0fBF737871; +fma.rn.f32 f368, f327, 0f3E9E377A, f367; +add.f32 %1, f186, f310; +add.f32 %0, f185, f309; +add.f32 %3, f190, f333; +add.f32 %2, f189, f331; +add.f32 %5, f194, f338; +add.f32 %4, f193, f336; +add.f32 %7, f198, f343; +add.f32 %6, f197, f341; +add.f32 %9, f202, f348; +add.f32 %8, f201, f346; +add.f32 %11, f188, f311; +sub.f32 %10, f187, f312; +add.f32 %13, f192, f353; +add.f32 %12, f191, f351; +add.f32 %15, f196, f358; +add.f32 %14, f195, f356; +add.f32 %17, f200, f363; +add.f32 %16, f199, f361; +add.f32 
%19, f204, f368; +add.f32 %18, f203, f366; +sub.f32 %21, f186, f310; +sub.f32 %20, f185, f309; +sub.f32 %23, f190, f333; +sub.f32 %22, f189, f331; +sub.f32 %25, f194, f338; +sub.f32 %24, f193, f336; +sub.f32 %27, f198, f343; +sub.f32 %26, f197, f341; +sub.f32 %29, f202, f348; +sub.f32 %28, f201, f346; +sub.f32 %31, f188, f311; +add.f32 %30, f187, f312; +sub.f32 %33, f192, f353; +sub.f32 %32, f191, f351; +sub.f32 %35, f196, f358; +sub.f32 %34, f195, f356; +sub.f32 %37, f200, f363; +sub.f32 %36, f199, f361; +sub.f32 %39, f204, f368; +sub.f32 %38, f203, f366; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), 
"f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..47cfd845e998b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_fwd.hpp.inc @@ -0,0 +1,352 @@ +#ifndef CUFFTDX_FFT_20_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_20_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<410, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<417>; +.reg .b64 rd<2>; +add.f64 fd81, %50, %82; +add.f64 fd82, %40, fd81; +add.f64 fd83, %61, %72; +add.f64 fd84, fd83, fd82; +add.f64 fd85, %52, %84; +add.f64 fd86, %41, fd85; +add.f64 fd87, %63, %73; +add.f64 fd88, fd87, fd86; +fma.rn.f64 fd89, fd81, 0d3FD3C6EF372FE950, %40; +mul.f64 fd90, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd91, fd89, fd90; +sub.f64 fd92, %52, %84; +mul.f64 fd93, fd92, 0d3FEE6F0E134454FF; +sub.f64 fd94, %63, %73; +mul.f64 fd95, fd94, 0dBFE2CF2304755A5E; +sub.f64 fd96, fd95, fd93; +sub.f64 fd97, fd91, fd96; +add.f64 fd98, fd96, fd91; +mul.f64 fd99, fd81, 0d3FE9E3779B97F4A8; +sub.f64 fd100, %40, fd99; +fma.rn.f64 fd101, fd83, 0d3FD3C6EF372FE950, fd100; +mul.f64 fd102, fd92, 0d3FE2CF2304755A5E; +mul.f64 fd103, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd104, fd103, fd102; +sub.f64 fd105, fd101, fd104; +add.f64 fd106, fd104, fd101; +fma.rn.f64 fd107, fd85, 0d3FD3C6EF372FE950, %41; +mul.f64 fd108, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd109, fd107, fd108; +sub.f64 fd110, %50, %82; +mul.f64 fd111, fd110, 0d3FEE6F0E134454FF; +sub.f64 fd112, %61, %72; +mul.f64 fd113, fd112, 
0dBFE2CF2304755A5E; +sub.f64 fd114, fd113, fd111; +add.f64 fd115, fd114, fd109; +sub.f64 fd116, fd109, fd114; +mul.f64 fd117, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd118, %41, fd117; +fma.rn.f64 fd119, fd87, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd120, fd110, 0d3FE2CF2304755A5E; +mul.f64 fd121, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd121, fd120; +add.f64 fd123, fd122, fd119; +sub.f64 fd124, fd119, fd122; +add.f64 fd125, %56, %88; +add.f64 fd126, %45, fd125; +add.f64 fd127, %66, %77; +add.f64 fd128, fd127, fd126; +add.f64 fd129, %57, %89; +add.f64 fd130, %47, fd129; +add.f64 fd131, %68, %79; +add.f64 fd132, fd131, fd130; +fma.rn.f64 fd133, fd125, 0d3FD3C6EF372FE950, %45; +mul.f64 fd134, fd127, 0d3FE9E3779B97F4A8; +sub.f64 fd135, fd133, fd134; +sub.f64 fd136, %57, %89; +mul.f64 fd137, fd136, 0d3FEE6F0E134454FF; +sub.f64 fd138, %68, %79; +mul.f64 fd139, fd138, 0dBFE2CF2304755A5E; +sub.f64 fd140, fd139, fd137; +sub.f64 fd141, fd135, fd140; +add.f64 fd142, fd140, fd135; +mul.f64 fd143, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd144, %45, fd143; +fma.rn.f64 fd145, fd127, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd146, fd136, 0d3FE2CF2304755A5E; +mul.f64 fd147, fd138, 0d3FEE6F0E134454FF; +sub.f64 fd148, fd147, fd146; +sub.f64 fd149, fd145, fd148; +add.f64 fd150, fd148, fd145; +fma.rn.f64 fd151, fd129, 0d3FD3C6EF372FE950, %47; +mul.f64 fd152, fd131, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd151, fd152; +sub.f64 fd154, %56, %88; +mul.f64 fd155, fd154, 0d3FEE6F0E134454FF; +sub.f64 fd156, %66, %77; +mul.f64 fd157, fd156, 0dBFE2CF2304755A5E; +sub.f64 fd158, fd157, fd155; +add.f64 fd159, fd158, fd153; +sub.f64 fd160, fd153, fd158; +mul.f64 fd161, fd129, 0d3FE9E3779B97F4A8; +sub.f64 fd162, %47, fd161; +fma.rn.f64 fd163, fd131, 0d3FD3C6EF372FE950, fd162; +mul.f64 fd164, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd165, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd166, fd165, fd164; +add.f64 fd167, fd166, fd163; +sub.f64 fd168, fd163, fd166; +mul.f64 fd169, fd141, 0d3FE9E3779B97F4A8; +mul.f64 fd170, fd159, 
0dBFE2CF2304755A5E; +sub.f64 fd171, fd169, fd170; +mul.f64 fd172, fd159, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd173, fd141, 0dBFE2CF2304755A5E, fd172; +mul.f64 fd174, fd149, 0d3FD3C6EF372FE950; +mul.f64 fd175, fd167, 0dBFEE6F0E134454FF; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd167, 0d3FD3C6EF372FE950; +fma.rn.f64 fd178, fd149, 0dBFEE6F0E134454FF, fd177; +mul.f64 fd179, fd150, 0dBFD3C6EF372FE950; +mul.f64 fd180, fd168, 0dBFEE6F0E134454FF; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0dBFD3C6EF372FE950; +fma.rn.f64 fd183, fd150, 0dBFEE6F0E134454FF, fd182; +mul.f64 fd184, fd142, 0dBFE9E3779B97F4A8; +mul.f64 fd185, fd160, 0dBFE2CF2304755A5E; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd160, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd188, fd142, 0dBFE2CF2304755A5E, fd187; +add.f64 fd189, fd84, fd128; +add.f64 fd190, fd88, fd132; +sub.f64 fd191, fd84, fd128; +sub.f64 fd192, fd88, fd132; +add.f64 fd193, fd97, fd171; +add.f64 fd194, fd115, fd173; +sub.f64 fd195, fd97, fd171; +sub.f64 fd196, fd115, fd173; +add.f64 fd197, fd105, fd176; +add.f64 fd198, fd123, fd178; +sub.f64 fd199, fd105, fd176; +sub.f64 fd200, fd123, fd178; +add.f64 fd201, fd106, fd181; +add.f64 fd202, fd124, fd183; +sub.f64 fd203, fd106, fd181; +sub.f64 fd204, fd124, fd183; +add.f64 fd205, fd98, fd186; +add.f64 fd206, fd116, fd188; +sub.f64 fd207, fd98, fd186; +sub.f64 fd208, fd116, fd188; +add.f64 fd209, %53, %85; +add.f64 fd210, %42, fd209; +add.f64 fd211, %64, %74; +add.f64 fd212, fd211, fd210; +add.f64 fd213, %55, %87; +add.f64 fd214, %44, fd213; +add.f64 fd215, %65, %76; +add.f64 fd216, fd215, fd214; +fma.rn.f64 fd217, fd209, 0d3FD3C6EF372FE950, %42; +mul.f64 fd218, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd219, fd217, fd218; +sub.f64 fd220, %55, %87; +mul.f64 fd221, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd222, %65, %76; +mul.f64 fd223, fd222, 0dBFE2CF2304755A5E; +sub.f64 fd224, fd223, fd221; +sub.f64 fd225, fd219, fd224; +add.f64 fd226, fd224, fd219; +mul.f64 fd227, fd209, 0d3FE9E3779B97F4A8; +sub.f64 
fd228, %42, fd227; +fma.rn.f64 fd229, fd211, 0d3FD3C6EF372FE950, fd228; +mul.f64 fd230, fd220, 0d3FE2CF2304755A5E; +mul.f64 fd231, fd222, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd231, fd230; +sub.f64 fd233, fd229, fd232; +add.f64 fd234, fd232, fd229; +fma.rn.f64 fd235, fd213, 0d3FD3C6EF372FE950, %44; +mul.f64 fd236, fd215, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, %53, %85; +mul.f64 fd239, fd238, 0d3FEE6F0E134454FF; +sub.f64 fd240, %64, %74; +mul.f64 fd241, fd240, 0dBFE2CF2304755A5E; +sub.f64 fd242, fd241, fd239; +add.f64 fd243, fd242, fd237; +sub.f64 fd244, fd237, fd242; +mul.f64 fd245, fd213, 0d3FE9E3779B97F4A8; +sub.f64 fd246, %44, fd245; +fma.rn.f64 fd247, fd215, 0d3FD3C6EF372FE950, fd246; +mul.f64 fd248, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd249, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd250, fd249, fd248; +add.f64 fd251, fd250, fd247; +sub.f64 fd252, fd247, fd250; +add.f64 fd253, %58, %90; +add.f64 fd254, %48, fd253; +add.f64 fd255, %69, %80; +add.f64 fd256, fd255, fd254; +add.f64 fd257, %60, %91; +add.f64 fd258, %49, fd257; +add.f64 fd259, %71, %81; +add.f64 fd260, fd259, fd258; +fma.rn.f64 fd261, fd253, 0d3FD3C6EF372FE950, %48; +mul.f64 fd262, fd255, 0d3FE9E3779B97F4A8; +sub.f64 fd263, fd261, fd262; +sub.f64 fd264, %60, %91; +mul.f64 fd265, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd266, %71, %81; +mul.f64 fd267, fd266, 0dBFE2CF2304755A5E; +sub.f64 fd268, fd267, fd265; +sub.f64 fd269, fd263, fd268; +add.f64 fd270, fd268, fd263; +mul.f64 fd271, fd253, 0d3FE9E3779B97F4A8; +sub.f64 fd272, %48, fd271; +fma.rn.f64 fd273, fd255, 0d3FD3C6EF372FE950, fd272; +mul.f64 fd274, fd264, 0d3FE2CF2304755A5E; +mul.f64 fd275, fd266, 0d3FEE6F0E134454FF; +sub.f64 fd276, fd275, fd274; +sub.f64 fd277, fd273, fd276; +add.f64 fd278, fd276, fd273; +fma.rn.f64 fd279, fd257, 0d3FD3C6EF372FE950, %49; +mul.f64 fd280, fd259, 0d3FE9E3779B97F4A8; +sub.f64 fd281, fd279, fd280; +sub.f64 fd282, %58, %90; +mul.f64 fd283, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd284, %69, %80; +mul.f64 
fd285, fd284, 0dBFE2CF2304755A5E; +sub.f64 fd286, fd285, fd283; +add.f64 fd287, fd286, fd281; +sub.f64 fd288, fd281, fd286; +mul.f64 fd289, fd257, 0d3FE9E3779B97F4A8; +sub.f64 fd290, %49, fd289; +fma.rn.f64 fd291, fd259, 0d3FD3C6EF372FE950, fd290; +mul.f64 fd292, fd282, 0d3FE2CF2304755A5E; +mul.f64 fd293, fd284, 0d3FEE6F0E134454FF; +sub.f64 fd294, fd293, fd292; +add.f64 fd295, fd294, fd291; +sub.f64 fd296, fd291, fd294; +mul.f64 fd297, fd269, 0d3FE9E3779B97F4A8; +mul.f64 fd298, fd287, 0dBFE2CF2304755A5E; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd287, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd301, fd269, 0dBFE2CF2304755A5E, fd300; +mul.f64 fd302, fd277, 0d3FD3C6EF372FE950; +mul.f64 fd303, fd295, 0dBFEE6F0E134454FF; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FD3C6EF372FE950; +fma.rn.f64 fd306, fd277, 0dBFEE6F0E134454FF, fd305; +mul.f64 fd307, fd278, 0dBFD3C6EF372FE950; +mul.f64 fd308, fd296, 0dBFEE6F0E134454FF; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd296, 0dBFD3C6EF372FE950; +fma.rn.f64 fd311, fd278, 0dBFEE6F0E134454FF, fd310; +mul.f64 fd312, fd270, 0dBFE9E3779B97F4A8; +mul.f64 fd313, fd288, 0dBFE2CF2304755A5E; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd288, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd316, fd270, 0dBFE2CF2304755A5E, fd315; +add.f64 fd317, fd212, fd256; +add.f64 fd318, fd216, fd260; +sub.f64 fd319, fd212, fd256; +sub.f64 fd320, fd216, fd260; +add.f64 fd321, fd225, fd299; +add.f64 fd322, fd243, fd301; +sub.f64 fd323, fd225, fd299; +sub.f64 fd324, fd243, fd301; +add.f64 fd325, fd233, fd304; +add.f64 fd326, fd251, fd306; +sub.f64 fd327, fd233, fd304; +sub.f64 fd328, fd251, fd306; +add.f64 fd329, fd234, fd309; +add.f64 fd330, fd252, fd311; +sub.f64 fd331, fd234, fd309; +sub.f64 fd332, fd252, fd311; +add.f64 fd333, fd226, fd314; +add.f64 fd334, fd244, fd316; +sub.f64 fd335, fd226, fd314; +sub.f64 fd336, fd244, fd316; +mul.f64 fd337, fd321, 0d3FEE6F0E134454FF; +mul.f64 fd338, fd322, 0dBFD3C6EF372FE950; +sub.f64 fd339, fd337, fd338; +mul.f64 
fd340, fd322, 0d3FEE6F0E134454FF; +fma.rn.f64 fd341, fd321, 0dBFD3C6EF372FE950, fd340; +mul.f64 fd342, fd325, 0d3FE9E3779B97F4A8; +mul.f64 fd343, fd326, 0dBFE2CF2304755A5E; +sub.f64 fd344, fd342, fd343; +mul.f64 fd345, fd326, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd346, fd325, 0dBFE2CF2304755A5E, fd345; +mul.f64 fd347, fd329, 0d3FE2CF2304755A5E; +mul.f64 fd348, fd330, 0dBFE9E3779B97F4A8; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd330, 0d3FE2CF2304755A5E; +fma.rn.f64 fd351, fd329, 0dBFE9E3779B97F4A8, fd350; +mul.f64 fd352, fd333, 0d3FD3C6EF372FE950; +mul.f64 fd353, fd334, 0dBFEE6F0E134454FF; +sub.f64 fd354, fd352, fd353; +mul.f64 fd355, fd334, 0d3FD3C6EF372FE950; +fma.rn.f64 fd356, fd333, 0dBFEE6F0E134454FF, fd355; +mul.f64 fd357, fd323, 0dBFD3C6EF372FE950; +mul.f64 fd358, fd324, 0dBFEE6F0E134454FF; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd324, 0dBFD3C6EF372FE950; +fma.rn.f64 fd361, fd323, 0dBFEE6F0E134454FF, fd360; +mul.f64 fd362, fd327, 0dBFE2CF2304755A5E; +mul.f64 fd363, fd328, 0dBFE9E3779B97F4A8; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd328, 0dBFE2CF2304755A5E; +fma.rn.f64 fd366, fd327, 0dBFE9E3779B97F4A8, fd365; +mul.f64 fd367, fd331, 0dBFE9E3779B97F4A8; +mul.f64 fd368, fd332, 0dBFE2CF2304755A5E; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd332, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd371, fd331, 0dBFE2CF2304755A5E, fd370; +mul.f64 fd372, fd335, 0dBFEE6F0E134454FF; +mul.f64 fd373, fd336, 0dBFD3C6EF372FE950; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd336, 0dBFEE6F0E134454FF; +fma.rn.f64 fd376, fd335, 0dBFD3C6EF372FE950, fd375; +add.f64 %1, fd190, fd318; +add.f64 %0, fd189, fd317; +add.f64 %3, fd194, fd341; +add.f64 %2, fd193, fd339; +add.f64 %5, fd198, fd346; +add.f64 %4, fd197, fd344; +add.f64 %7, fd202, fd351; +add.f64 %6, fd201, fd349; +add.f64 %9, fd206, fd356; +add.f64 %8, fd205, fd354; +sub.f64 %11, fd192, fd319; +add.f64 %10, fd191, fd320; +add.f64 %13, fd196, fd361; +add.f64 %12, fd195, fd359; +add.f64 %15, fd200, fd366; +add.f64 %14, 
fd199, fd364; +add.f64 %17, fd204, fd371; +add.f64 %16, fd203, fd369; +add.f64 %19, fd208, fd376; +add.f64 %18, fd207, fd374; +sub.f64 %21, fd190, fd318; +sub.f64 %20, fd189, fd317; +sub.f64 %23, fd194, fd341; +sub.f64 %22, fd193, fd339; +sub.f64 %25, fd198, fd346; +sub.f64 %24, fd197, fd344; +sub.f64 %27, fd202, fd351; +sub.f64 %26, fd201, fd349; +sub.f64 %29, fd206, fd356; +sub.f64 %28, fd205, fd354; +add.f64 %31, fd192, fd319; +sub.f64 %30, fd191, fd320; +sub.f64 %33, fd196, fd361; +sub.f64 %32, fd195, fd359; +sub.f64 %35, fd200, fd366; +sub.f64 %34, fd199, fd364; +sub.f64 %37, fd204, fd371; +sub.f64 %36, fd203, fd369; +sub.f64 %39, fd208, fd376; +sub.f64 %38, fd207, fd374; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), 
"d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e1465c69290d4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_20_fp64_inv.hpp.inc @@ -0,0 +1,344 @@ +#ifndef CUFFTDX_FFT_20_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_20_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<581, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<409>; +.reg .b64 rd<2>; +add.f64 fd81, %50, %82; +add.f64 fd82, %40, fd81; +add.f64 fd83, %61, %72; +add.f64 fd84, fd83, fd82; +add.f64 fd85, %52, %84; +add.f64 fd86, %41, fd85; +add.f64 fd87, %63, %73; +add.f64 fd88, fd87, fd86; +fma.rn.f64 fd89, fd81, 0d3FD3C6EF372FE950, %40; +mul.f64 fd90, fd83, 0d3FE9E3779B97F4A8; +sub.f64 fd91, fd89, fd90; +sub.f64 fd92, %52, %84; +mul.f64 fd93, fd92, 0d3FEE6F0E134454FF; +sub.f64 fd94, %63, %73; +fma.rn.f64 fd95, fd94, 0d3FE2CF2304755A5E, fd93; +sub.f64 fd96, fd91, fd95; +add.f64 fd97, fd95, fd91; +mul.f64 fd98, fd81, 0d3FE9E3779B97F4A8; +sub.f64 fd99, %40, fd98; +fma.rn.f64 fd100, fd83, 0d3FD3C6EF372FE950, fd99; +mul.f64 fd101, fd92, 0d3FE2CF2304755A5E; +mul.f64 fd102, fd94, 0d3FEE6F0E134454FF; +sub.f64 fd103, fd101, fd102; +sub.f64 fd104, fd100, fd103; +add.f64 fd105, fd103, fd100; +fma.rn.f64 fd106, fd85, 0d3FD3C6EF372FE950, %41; +mul.f64 fd107, fd87, 0d3FE9E3779B97F4A8; +sub.f64 fd108, fd106, fd107; +sub.f64 fd109, %50, %82; 
+mul.f64 fd110, fd109, 0d3FEE6F0E134454FF; +sub.f64 fd111, %61, %72; +fma.rn.f64 fd112, fd111, 0d3FE2CF2304755A5E, fd110; +add.f64 fd113, fd112, fd108; +sub.f64 fd114, fd108, fd112; +mul.f64 fd115, fd85, 0d3FE9E3779B97F4A8; +sub.f64 fd116, %41, fd115; +fma.rn.f64 fd117, fd87, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd118, fd109, 0d3FE2CF2304755A5E; +mul.f64 fd119, fd111, 0d3FEE6F0E134454FF; +sub.f64 fd120, fd118, fd119; +add.f64 fd121, fd120, fd117; +sub.f64 fd122, fd117, fd120; +add.f64 fd123, %56, %88; +add.f64 fd124, %45, fd123; +add.f64 fd125, %66, %77; +add.f64 fd126, fd125, fd124; +add.f64 fd127, %57, %89; +add.f64 fd128, %47, fd127; +add.f64 fd129, %68, %79; +add.f64 fd130, fd129, fd128; +fma.rn.f64 fd131, fd123, 0d3FD3C6EF372FE950, %45; +mul.f64 fd132, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd133, fd131, fd132; +sub.f64 fd134, %57, %89; +mul.f64 fd135, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd136, %68, %79; +fma.rn.f64 fd137, fd136, 0d3FE2CF2304755A5E, fd135; +sub.f64 fd138, fd133, fd137; +add.f64 fd139, fd137, fd133; +mul.f64 fd140, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd141, %45, fd140; +fma.rn.f64 fd142, fd125, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd143, fd134, 0d3FE2CF2304755A5E; +mul.f64 fd144, fd136, 0d3FEE6F0E134454FF; +sub.f64 fd145, fd143, fd144; +sub.f64 fd146, fd142, fd145; +add.f64 fd147, fd145, fd142; +fma.rn.f64 fd148, fd127, 0d3FD3C6EF372FE950, %47; +mul.f64 fd149, fd129, 0d3FE9E3779B97F4A8; +sub.f64 fd150, fd148, fd149; +sub.f64 fd151, %56, %88; +mul.f64 fd152, fd151, 0d3FEE6F0E134454FF; +sub.f64 fd153, %66, %77; +fma.rn.f64 fd154, fd153, 0d3FE2CF2304755A5E, fd152; +add.f64 fd155, fd154, fd150; +sub.f64 fd156, fd150, fd154; +mul.f64 fd157, fd127, 0d3FE9E3779B97F4A8; +sub.f64 fd158, %47, fd157; +fma.rn.f64 fd159, fd129, 0d3FD3C6EF372FE950, fd158; +mul.f64 fd160, fd151, 0d3FE2CF2304755A5E; +mul.f64 fd161, fd153, 0d3FEE6F0E134454FF; +sub.f64 fd162, fd160, fd161; +add.f64 fd163, fd162, fd159; +sub.f64 fd164, fd159, fd162; +mul.f64 fd165, fd138, 
0d3FE9E3779B97F4A8; +mul.f64 fd166, fd155, 0d3FE2CF2304755A5E; +sub.f64 fd167, fd165, fd166; +mul.f64 fd168, fd155, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd169, fd138, 0d3FE2CF2304755A5E, fd168; +mul.f64 fd170, fd146, 0d3FD3C6EF372FE950; +mul.f64 fd171, fd163, 0d3FEE6F0E134454FF; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd163, 0d3FD3C6EF372FE950; +fma.rn.f64 fd174, fd146, 0d3FEE6F0E134454FF, fd173; +mul.f64 fd175, fd147, 0dBFD3C6EF372FE950; +mul.f64 fd176, fd164, 0d3FEE6F0E134454FF; +sub.f64 fd177, fd175, fd176; +mul.f64 fd178, fd164, 0dBFD3C6EF372FE950; +fma.rn.f64 fd179, fd147, 0d3FEE6F0E134454FF, fd178; +mul.f64 fd180, fd139, 0dBFE9E3779B97F4A8; +mul.f64 fd181, fd156, 0d3FE2CF2304755A5E; +sub.f64 fd182, fd180, fd181; +mul.f64 fd183, fd156, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd184, fd139, 0d3FE2CF2304755A5E, fd183; +add.f64 fd185, fd84, fd126; +add.f64 fd186, fd88, fd130; +sub.f64 fd187, fd84, fd126; +sub.f64 fd188, fd88, fd130; +add.f64 fd189, fd96, fd167; +add.f64 fd190, fd113, fd169; +sub.f64 fd191, fd96, fd167; +sub.f64 fd192, fd113, fd169; +add.f64 fd193, fd104, fd172; +add.f64 fd194, fd121, fd174; +sub.f64 fd195, fd104, fd172; +sub.f64 fd196, fd121, fd174; +add.f64 fd197, fd105, fd177; +add.f64 fd198, fd122, fd179; +sub.f64 fd199, fd105, fd177; +sub.f64 fd200, fd122, fd179; +add.f64 fd201, fd97, fd182; +add.f64 fd202, fd114, fd184; +sub.f64 fd203, fd97, fd182; +sub.f64 fd204, fd114, fd184; +add.f64 fd205, %53, %85; +add.f64 fd206, %42, fd205; +add.f64 fd207, %64, %74; +add.f64 fd208, fd207, fd206; +add.f64 fd209, %55, %87; +add.f64 fd210, %44, fd209; +add.f64 fd211, %65, %76; +add.f64 fd212, fd211, fd210; +fma.rn.f64 fd213, fd205, 0d3FD3C6EF372FE950, %42; +mul.f64 fd214, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd215, fd213, fd214; +sub.f64 fd216, %55, %87; +mul.f64 fd217, fd216, 0d3FEE6F0E134454FF; +sub.f64 fd218, %65, %76; +fma.rn.f64 fd219, fd218, 0d3FE2CF2304755A5E, fd217; +sub.f64 fd220, fd215, fd219; +add.f64 fd221, fd219, fd215; +mul.f64 fd222, fd205, 
0d3FE9E3779B97F4A8; +sub.f64 fd223, %42, fd222; +fma.rn.f64 fd224, fd207, 0d3FD3C6EF372FE950, fd223; +mul.f64 fd225, fd216, 0d3FE2CF2304755A5E; +mul.f64 fd226, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd227, fd225, fd226; +sub.f64 fd228, fd224, fd227; +add.f64 fd229, fd227, fd224; +fma.rn.f64 fd230, fd209, 0d3FD3C6EF372FE950, %44; +mul.f64 fd231, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd232, fd230, fd231; +sub.f64 fd233, %53, %85; +mul.f64 fd234, fd233, 0d3FEE6F0E134454FF; +sub.f64 fd235, %64, %74; +fma.rn.f64 fd236, fd235, 0d3FE2CF2304755A5E, fd234; +add.f64 fd237, fd236, fd232; +sub.f64 fd238, fd232, fd236; +mul.f64 fd239, fd209, 0d3FE9E3779B97F4A8; +sub.f64 fd240, %44, fd239; +fma.rn.f64 fd241, fd211, 0d3FD3C6EF372FE950, fd240; +mul.f64 fd242, fd233, 0d3FE2CF2304755A5E; +mul.f64 fd243, fd235, 0d3FEE6F0E134454FF; +sub.f64 fd244, fd242, fd243; +add.f64 fd245, fd244, fd241; +sub.f64 fd246, fd241, fd244; +add.f64 fd247, %58, %90; +add.f64 fd248, %48, fd247; +add.f64 fd249, %69, %80; +add.f64 fd250, fd249, fd248; +add.f64 fd251, %60, %91; +add.f64 fd252, %49, fd251; +add.f64 fd253, %71, %81; +add.f64 fd254, fd253, fd252; +fma.rn.f64 fd255, fd247, 0d3FD3C6EF372FE950, %48; +mul.f64 fd256, fd249, 0d3FE9E3779B97F4A8; +sub.f64 fd257, fd255, fd256; +sub.f64 fd258, %60, %91; +mul.f64 fd259, fd258, 0d3FEE6F0E134454FF; +sub.f64 fd260, %71, %81; +fma.rn.f64 fd261, fd260, 0d3FE2CF2304755A5E, fd259; +sub.f64 fd262, fd257, fd261; +add.f64 fd263, fd261, fd257; +mul.f64 fd264, fd247, 0d3FE9E3779B97F4A8; +sub.f64 fd265, %48, fd264; +fma.rn.f64 fd266, fd249, 0d3FD3C6EF372FE950, fd265; +mul.f64 fd267, fd258, 0d3FE2CF2304755A5E; +mul.f64 fd268, fd260, 0d3FEE6F0E134454FF; +sub.f64 fd269, fd267, fd268; +sub.f64 fd270, fd266, fd269; +add.f64 fd271, fd269, fd266; +fma.rn.f64 fd272, fd251, 0d3FD3C6EF372FE950, %49; +mul.f64 fd273, fd253, 0d3FE9E3779B97F4A8; +sub.f64 fd274, fd272, fd273; +sub.f64 fd275, %58, %90; +mul.f64 fd276, fd275, 0d3FEE6F0E134454FF; +sub.f64 fd277, %69, %80; +fma.rn.f64 fd278, 
fd277, 0d3FE2CF2304755A5E, fd276; +add.f64 fd279, fd278, fd274; +sub.f64 fd280, fd274, fd278; +mul.f64 fd281, fd251, 0d3FE9E3779B97F4A8; +sub.f64 fd282, %49, fd281; +fma.rn.f64 fd283, fd253, 0d3FD3C6EF372FE950, fd282; +mul.f64 fd284, fd275, 0d3FE2CF2304755A5E; +mul.f64 fd285, fd277, 0d3FEE6F0E134454FF; +sub.f64 fd286, fd284, fd285; +add.f64 fd287, fd286, fd283; +sub.f64 fd288, fd283, fd286; +mul.f64 fd289, fd262, 0d3FE9E3779B97F4A8; +mul.f64 fd290, fd279, 0d3FE2CF2304755A5E; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd279, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd293, fd262, 0d3FE2CF2304755A5E, fd292; +mul.f64 fd294, fd270, 0d3FD3C6EF372FE950; +mul.f64 fd295, fd287, 0d3FEE6F0E134454FF; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd287, 0d3FD3C6EF372FE950; +fma.rn.f64 fd298, fd270, 0d3FEE6F0E134454FF, fd297; +mul.f64 fd299, fd271, 0dBFD3C6EF372FE950; +mul.f64 fd300, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd288, 0dBFD3C6EF372FE950; +fma.rn.f64 fd303, fd271, 0d3FEE6F0E134454FF, fd302; +mul.f64 fd304, fd263, 0dBFE9E3779B97F4A8; +mul.f64 fd305, fd280, 0d3FE2CF2304755A5E; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd280, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd308, fd263, 0d3FE2CF2304755A5E, fd307; +add.f64 fd309, fd208, fd250; +add.f64 fd310, fd212, fd254; +sub.f64 fd311, fd208, fd250; +sub.f64 fd312, fd212, fd254; +add.f64 fd313, fd220, fd291; +add.f64 fd314, fd237, fd293; +sub.f64 fd315, fd220, fd291; +sub.f64 fd316, fd237, fd293; +add.f64 fd317, fd228, fd296; +add.f64 fd318, fd245, fd298; +sub.f64 fd319, fd228, fd296; +sub.f64 fd320, fd245, fd298; +add.f64 fd321, fd229, fd301; +add.f64 fd322, fd246, fd303; +sub.f64 fd323, fd229, fd301; +sub.f64 fd324, fd246, fd303; +add.f64 fd325, fd221, fd306; +add.f64 fd326, fd238, fd308; +sub.f64 fd327, fd221, fd306; +sub.f64 fd328, fd238, fd308; +mul.f64 fd329, fd313, 0d3FEE6F0E134454FF; +mul.f64 fd330, fd314, 0d3FD3C6EF372FE950; +sub.f64 fd331, fd329, fd330; +mul.f64 fd332, fd314, 
0d3FEE6F0E134454FF; +fma.rn.f64 fd333, fd313, 0d3FD3C6EF372FE950, fd332; +mul.f64 fd334, fd317, 0d3FE9E3779B97F4A8; +mul.f64 fd335, fd318, 0d3FE2CF2304755A5E; +sub.f64 fd336, fd334, fd335; +mul.f64 fd337, fd318, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd338, fd317, 0d3FE2CF2304755A5E, fd337; +mul.f64 fd339, fd321, 0d3FE2CF2304755A5E; +mul.f64 fd340, fd322, 0d3FE9E3779B97F4A8; +sub.f64 fd341, fd339, fd340; +mul.f64 fd342, fd322, 0d3FE2CF2304755A5E; +fma.rn.f64 fd343, fd321, 0d3FE9E3779B97F4A8, fd342; +mul.f64 fd344, fd325, 0d3FD3C6EF372FE950; +mul.f64 fd345, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd346, fd344, fd345; +mul.f64 fd347, fd326, 0d3FD3C6EF372FE950; +fma.rn.f64 fd348, fd325, 0d3FEE6F0E134454FF, fd347; +mul.f64 fd349, fd315, 0dBFD3C6EF372FE950; +mul.f64 fd350, fd316, 0d3FEE6F0E134454FF; +sub.f64 fd351, fd349, fd350; +mul.f64 fd352, fd316, 0dBFD3C6EF372FE950; +fma.rn.f64 fd353, fd315, 0d3FEE6F0E134454FF, fd352; +mul.f64 fd354, fd319, 0dBFE2CF2304755A5E; +mul.f64 fd355, fd320, 0d3FE9E3779B97F4A8; +sub.f64 fd356, fd354, fd355; +mul.f64 fd357, fd320, 0dBFE2CF2304755A5E; +fma.rn.f64 fd358, fd319, 0d3FE9E3779B97F4A8, fd357; +mul.f64 fd359, fd323, 0dBFE9E3779B97F4A8; +mul.f64 fd360, fd324, 0d3FE2CF2304755A5E; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd324, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd363, fd323, 0d3FE2CF2304755A5E, fd362; +mul.f64 fd364, fd327, 0dBFEE6F0E134454FF; +mul.f64 fd365, fd328, 0d3FD3C6EF372FE950; +sub.f64 fd366, fd364, fd365; +mul.f64 fd367, fd328, 0dBFEE6F0E134454FF; +fma.rn.f64 fd368, fd327, 0d3FD3C6EF372FE950, fd367; +add.f64 %1, fd186, fd310; +add.f64 %0, fd185, fd309; +add.f64 %3, fd190, fd333; +add.f64 %2, fd189, fd331; +add.f64 %5, fd194, fd338; +add.f64 %4, fd193, fd336; +add.f64 %7, fd198, fd343; +add.f64 %6, fd197, fd341; +add.f64 %9, fd202, fd348; +add.f64 %8, fd201, fd346; +add.f64 %11, fd188, fd311; +sub.f64 %10, fd187, fd312; +add.f64 %13, fd192, fd353; +add.f64 %12, fd191, fd351; +add.f64 %15, fd196, fd358; +add.f64 %14, fd195, fd356; 
+add.f64 %17, fd200, fd363; +add.f64 %16, fd199, fd361; +add.f64 %19, fd204, fd368; +add.f64 %18, fd203, fd366; +sub.f64 %21, fd186, fd310; +sub.f64 %20, fd185, fd309; +sub.f64 %23, fd190, fd333; +sub.f64 %22, fd189, fd331; +sub.f64 %25, fd194, fd338; +sub.f64 %24, fd193, fd336; +sub.f64 %27, fd198, fd343; +sub.f64 %26, fd197, fd341; +sub.f64 %29, fd202, fd348; +sub.f64 %28, fd201, fd346; +sub.f64 %31, fd188, fd311; +add.f64 %30, fd187, fd312; +sub.f64 %33, fd192, fd353; +sub.f64 %32, fd191, fd351; +sub.f64 %35, fd196, fd358; +sub.f64 %34, fd195, fd356; +sub.f64 %37, fd200, fd363; +sub.f64 %36, fd199, fd361; +sub.f64 %39, fd204, fd368; +sub.f64 %38, fd203, fd366; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), 
"d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..48b68939fbb25 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_fwd.hpp.inc @@ -0,0 +1,2932 @@ +#ifndef CUFFTDX_FFT_216_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_216_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<932, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<119>; +.reg .b32 r<1158>; +.reg .b64 rd<7>; +mov.u32 r1139, %tid.y; +shl.b32 r1140, r1139, 1; +mov.u32 r1141, %12; +mad.lo.s32 r1142, r1140, 864, r1141; +mov.u32 r1143, %tid.x; +mov.f32 f98, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1, {low, high}; +} +mov.f32 f100, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; +} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, 
r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f94, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 
high, f100; +mov.b32 r180, {low, high}; +} +mov.f32 f81, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +mul.wide.u32 rd2, r1143, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1144, rd3; +mul.lo.s32 r1145, r1144, 36; +sub.s32 r1146, r1143, r1145; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1147, rd4; +and.b32 r1148, r1147, 1073741822; +mad.lo.s32 r1149, r1148, 864, r1142; +cvt.rn.f32.u32 f113, r1146; +mul.f32 f114, f113, 0f3CEE4BAE; +cos.approx.f32 f29, f114; +sin.approx.f32 f115, f114; +neg.f32 f30, f115; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ +mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; +} +mov.f32 f82, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r282, 
{low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; +} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r1150, r1146, 48, r1149; +st.shared.v2.f32 [r1150], {r219, r222}; +st.shared.v2.f32 [r1150+8], {r267, r274}; +st.shared.v2.f32 [r1150+16], {r304, r311}; +st.shared.v2.f32 [r1150+24], {r341, r348}; +st.shared.v2.f32 [r1150+32], {r378, r385}; +st.shared.v2.f32 [r1150+40], {r415, r422}; +barrier.sync 0; +mad.lo.s32 r1151, r1146, -40, r1150; +ld.shared.u32 r451, [r1151]; +ld.shared.u32 r457, [r1151+4]; +ld.shared.u32 r539, [r1151+288]; +ld.shared.u32 r545, [r1151+292]; +ld.shared.u32 r448, [r1151+576]; +ld.shared.u32 r454, [r1151+580]; +ld.shared.u32 r536, [r1151+864]; +ld.shared.u32 r542, [r1151+868]; +ld.shared.u32 r449, [r1151+1152]; +ld.shared.u32 r455, [r1151+1156]; +ld.shared.u32 r537, [r1151+1440]; +ld.shared.u32 r543, [r1151+1444]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; 
+cvt.rn.f16.f32 high, f100; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; +} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ +mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ +sub.f16x2 r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 
r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ +sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 r661, r450, r538; +} +{ +add.f16x2 r664, r456, r544; +} +{ +sub.f16x2 r667, r450, r538; +} +{ +sub.f16x2 r670, r456, r544; +} +{ +add.f16x2 r673, r474, r635; +} +{ +add.f16x2 r676, r510, r641; +} +{ +sub.f16x2 r679, r474, r635; +} +{ +sub.f16x2 r682, r510, r641; +} +{ +add.f16x2 r685, r492, r651; +} +{ +add.f16x2 r688, r528, r657; +} +{ +sub.f16x2 r691, r492, r651; +} +{ +sub.f16x2 r694, r528, r657; +} +mul.wide.u32 rd5, r1146, -1431655765; +shr.u64 rd6, rd5, 34; +cvt.u32.u64 r1152, rd6; +cvt.rn.f32.u32 f116, r1152; +mul.f32 f117, f116, 0f3E32B8C2; +cos.approx.f32 f71, f117; +sin.approx.f32 f118, f117; +neg.f32 f72, f118; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r697, {low, high}; +} +mul.lo.s32 r1153, r1152, 6; +sub.s32 r1154, r1146, r1153; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r700, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r702, {high, high}; +} +{ +mul.f16x2 r704, r676, r702; +} +{ +neg.f16x2 r707, r704; +} +{ +fma.rn.f16x2 r709, r673, r700, r707; +} +{ +mul.f16x2 r713, r673, r702; +} +{ +fma.rn.f16x2 r716, r676, r700, r713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r722, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r724, {low, high}; +} +{ +mul.f16x2 r725, r722, r724; +} +{ +mul.f16x2 r728, r697, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r731, {high, low}; +} +{ +fma.rn.f16x2 r733, r725, r731, r728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r739, {high, high}; +} +{ +mul.f16x2 r741, r688, r739; +} +{ +neg.f16x2 r744, r741; +} +{ +fma.rn.f16x2 r746, r685, r737, r744; +} +{ +mul.f16x2 r750, r685, r739; +} +{ +fma.rn.f16x2 r753, r688, r737, r750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r759, r761; +} +{ +mul.f16x2 r765, r733, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r768, {high, low}; +} +{ +fma.rn.f16x2 r770, r762, r768, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r776, {high, high}; +} +{ +mul.f16x2 r778, r670, r776; +} +{ +neg.f16x2 r781, r778; +} +{ +fma.rn.f16x2 r783, r667, r774, r781; +} +{ +mul.f16x2 r787, r667, r776; +} +{ +fma.rn.f16x2 r790, r670, r774, r787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; 
+mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r798, {low, high}; +} +{ +mul.f16x2 r799, r796, r798; +} +{ +mul.f16x2 r802, r770, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r805, {high, low}; +} +{ +fma.rn.f16x2 r807, r799, r805, r802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r813, {high, high}; +} +{ +mul.f16x2 r815, r682, r813; +} +{ +neg.f16x2 r818, r815; +} +{ +fma.rn.f16x2 r820, r679, r811, r818; +} +{ +mul.f16x2 r824, r679, r813; +} +{ +fma.rn.f16x2 r827, r682, r811, r824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r833, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r833, r835; +} +{ +mul.f16x2 r839, r807, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r842, {high, low}; +} +{ +fma.rn.f16x2 r844, r836, r842, r839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r850, {high, high}; +} +{ +mul.f16x2 r852, r694, r850; +} +{ +neg.f16x2 r855, r852; +} +{ +fma.rn.f16x2 r857, r691, r848, r855; +} +{ +mul.f16x2 r861, r691, r850; +} +{ +fma.rn.f16x2 r864, r694, r848, r861; +} +shl.b32 r1155, r1154, 3; +add.s32 r1156, r1149, r1155; +barrier.sync 0; +mad.lo.s32 r1157, r1152, 288, r1156; +st.shared.u32 [r1157], r661; +st.shared.u32 [r1157+4], r664; +st.shared.u32 [r1157+48], r709; +st.shared.u32 [r1157+52], r716; +st.shared.u32 [r1157+96], r746; +st.shared.u32 [r1157+100], r753; +st.shared.u32 [r1157+144], r783; +st.shared.u32 [r1157+148], r790; +st.shared.u32 
[r1157+192], r820; +st.shared.u32 [r1157+196], r827; +st.shared.u32 [r1157+240], r857; +st.shared.u32 [r1157+244], r864; +barrier.sync 0; +ld.shared.u32 r893, [r1151]; +ld.shared.u32 r899, [r1151+4]; +ld.shared.u32 r981, [r1151+288]; +ld.shared.u32 r987, [r1151+292]; +ld.shared.u32 r890, [r1151+576]; +ld.shared.u32 r896, [r1151+580]; +ld.shared.u32 r978, [r1151+864]; +ld.shared.u32 r984, [r1151+868]; +ld.shared.u32 r891, [r1151+1152]; +ld.shared.u32 r897, [r1151+1156]; +ld.shared.u32 r979, [r1151+1440]; +ld.shared.u32 r985, [r1151+1444]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r886, {low, high}; +} +{ +neg.f16x2 r887, r886; +} +{ +add.f16x2 r889, r890, r891; +} +{ +add.f16x2 r892, r893, r889; +} +{ +add.f16x2 r895, r896, r897; +} +{ +add.f16x2 r898, r899, r895; +} +{ +add.f16x2 r901, r890, r891; +} +{ +mul.f16x2 r904, r901, r885; +} +{ +add.f16x2 r907, r893, r904; +} +{ +sub.f16x2 r910, r896, r897; +} +{ +mul.f16x2 r913, r910, r887; +} +{ +add.f16x2 r916, r907, r913; +} +{ +add.f16x2 r919, r890, r891; +} +{ +mul.f16x2 r922, r919, r885; +} +{ +add.f16x2 r925, r893, r922; +} +{ +sub.f16x2 r928, r896, r897; +} +{ +mul.f16x2 r931, r928, r887; +} +{ +sub.f16x2 r934, r925, r931; +} +{ +add.f16x2 r937, r896, r897; +} +{ +mul.f16x2 r940, r937, r885; +} +{ +add.f16x2 r943, r899, r940; +} +{ +sub.f16x2 r946, r890, r891; +} +{ +mul.f16x2 r949, r946, r887; +} +{ +sub.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, r896, r897; +} +{ +mul.f16x2 r958, r955, r885; +} +{ +add.f16x2 r961, r899, r958; +} +{ +sub.f16x2 r964, r890, r891; +} +{ +mul.f16x2 r967, r964, r887; +} +{ +add.f16x2 r970, r961, r967; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r974, {low, high}; +} +{ 
+neg.f16x2 r975, r974; +} +{ +add.f16x2 r977, r978, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +add.f16x2 r983, r984, r985; +} +{ +add.f16x2 r986, r987, r983; +} +{ +add.f16x2 r989, r978, r979; +} +{ +mul.f16x2 r992, r989, r973; +} +{ +add.f16x2 r995, r981, r992; +} +{ +sub.f16x2 r998, r984, r985; +} +{ +mul.f16x2 r1001, r998, r975; +} +{ +add.f16x2 r1004, r995, r1001; +} +{ +add.f16x2 r1007, r978, r979; +} +{ +mul.f16x2 r1010, r1007, r973; +} +{ +add.f16x2 r1013, r981, r1010; +} +{ +sub.f16x2 r1016, r984, r985; +} +{ +mul.f16x2 r1019, r1016, r975; +} +{ +sub.f16x2 r1022, r1013, r1019; +} +{ +add.f16x2 r1025, r984, r985; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r987, r1028; +} +{ +sub.f16x2 r1034, r978, r979; +} +{ +mul.f16x2 r1037, r1034, r975; +} +{ +sub.f16x2 r1040, r1031, r1037; +} +{ +add.f16x2 r1043, r984, r985; +} +{ +mul.f16x2 r1046, r1043, r973; +} +{ +add.f16x2 r1049, r987, r1046; +} +{ +sub.f16x2 r1052, r978, r979; +} +{ +mul.f16x2 r1055, r1052, r975; +} +{ +add.f16x2 r1058, r1049, r1055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1061, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1062, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1063, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1064, {low, high}; +} +{ +mul.f16x2 r1071, r1004, r1061; +} +{ +mul.f16x2 r1074, r1040, r1062; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r1004, r1062; +} +{ +fma.rn.f16x2 r1083, r1040, r1061, r1080; +} +{ +mul.f16x2 r1087, r1022, r1063; +} +{ +mul.f16x2 r1090, r1058, r1064; +} +{ +sub.f16x2 r1093, r1087, r1090; +} +{ +mul.f16x2 r1096, r1022, r1064; +} +{ +fma.rn.f16x2 r1099, r1058, r1063, r1096; +} +{ +add.f16x2 %0, r892, r980; +} +{ +add.f16x2 %1, r898, r986; +} +{ +sub.f16x2 %6, r892, r980; +} +{ +sub.f16x2 
%7, r898, r986; +} +{ +add.f16x2 %2, r916, r1077; +} +{ +add.f16x2 %3, r952, r1083; +} +{ +sub.f16x2 %8, r916, r1077; +} +{ +sub.f16x2 %9, r952, r1083; +} +{ +add.f16x2 %4, r934, r1093; +} +{ +add.f16x2 %5, r970, r1099; +} +{ +sub.f16x2 %10, r934, r1093; +} +{ +sub.f16x2 %11, r970, r1099; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<933, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<119>; +.reg .b32 r<1155>; +.reg .b64 rd<6>; +mov.u32 r1139, %tid.y; +mov.u32 r1140, %12; +mad.lo.s32 r1141, r1139, 864, r1140; +mov.u32 r1142, %tid.x; +mov.f32 f98, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1, {low, high}; +} +mov.f32 f100, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; +} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ 
+add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f94, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; 
+mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r180, {low, high}; +} +mov.f32 f81, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +mul.wide.u32 rd2, r1142, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1143, rd3; +mul.lo.s32 r1144, r1143, 36; +sub.s32 r1145, r1142, r1144; +mad.lo.s32 r1146, r1143, 864, r1141; +cvt.rn.f32.u32 f113, r1145; +mul.f32 f114, f113, 0f3CEE4BAE; +cos.approx.f32 f29, f114; +sin.approx.f32 f115, f114; +neg.f32 f30, f115; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ +mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; 
+} +mov.f32 f82, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; +} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r1147, r1145, 24, r1146; +st.shared.v2.f32 [r1147], {r219, r267}; +st.shared.v2.f32 [r1147+8], {r304, r341}; +st.shared.v2.f32 [r1147+16], {r378, r415}; +barrier.sync 0; +mad.lo.s32 r1148, r1145, -20, r1147; +ld.shared.u32 r451, [r1148]; +ld.shared.u32 r539, [r1148+144]; +ld.shared.u32 r448, [r1148+288]; +ld.shared.u32 r536, [r1148+432]; +ld.shared.u32 r449, [r1148+576]; +ld.shared.u32 r537, [r1148+720]; +barrier.sync 0; +st.shared.v2.f32 [r1147], {r222, r274}; +st.shared.v2.f32 [r1147+8], {r311, r348}; +st.shared.v2.f32 [r1147+16], {r385, r422}; +barrier.sync 0; +ld.shared.u32 r457, [r1148]; +ld.shared.u32 r545, [r1148+144]; +ld.shared.u32 r454, [r1148+288]; +ld.shared.u32 r542, [r1148+432]; +ld.shared.u32 r455, [r1148+576]; +ld.shared.u32 r543, [r1148+720]; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; +} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ +mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ +sub.f16x2 r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 
r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ +sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 r661, r450, r538; +} +{ +add.f16x2 r664, r456, r544; +} +{ +sub.f16x2 r667, r450, r538; +} +{ +sub.f16x2 r670, r456, r544; +} +{ +add.f16x2 r673, r474, r635; +} +{ +add.f16x2 r676, r510, r641; +} +{ +sub.f16x2 r679, r474, r635; +} +{ +sub.f16x2 r682, r510, r641; +} +{ +add.f16x2 r685, r492, r651; +} +{ +add.f16x2 r688, r528, r657; +} +{ +sub.f16x2 r691, r492, r651; +} +{ +sub.f16x2 r694, r528, r657; +} +mul.wide.u32 rd4, r1145, -1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1149, rd5; +mul.lo.s32 r1150, r1149, 6; +sub.s32 r1151, r1145, r1150; +shl.b32 r1152, r1151, 2; +add.s32 r1153, r1146, r1152; +cvt.rn.f32.u32 f116, r1149; +mul.f32 f117, f116, 0f3E32B8C2; +cos.approx.f32 f71, f117; +sin.approx.f32 f118, f117; +neg.f32 
f72, f118; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r702, {high, high}; +} +{ +mul.f16x2 r704, r676, r702; +} +{ +neg.f16x2 r707, r704; +} +{ +fma.rn.f16x2 r709, r673, r700, r707; +} +{ +mul.f16x2 r713, r673, r702; +} +{ +fma.rn.f16x2 r716, r676, r700, r713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r722, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r724, {low, high}; +} +{ +mul.f16x2 r725, r722, r724; +} +{ +mul.f16x2 r728, r697, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r731, {high, low}; +} +{ +fma.rn.f16x2 r733, r725, r731, r728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r739, {high, high}; +} +{ +mul.f16x2 r741, r688, r739; +} +{ +neg.f16x2 r744, r741; +} +{ +fma.rn.f16x2 r746, r685, r737, r744; +} +{ +mul.f16x2 r750, r685, r739; +} +{ +fma.rn.f16x2 r753, r688, r737, r750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r761, {low, high}; +} +{ +mul.f16x2 r762, r759, r761; +} +{ +mul.f16x2 r765, r733, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r733; +mov.b32 r768, {high, low}; +} +{ +fma.rn.f16x2 r770, r762, r768, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r776, {high, high}; +} +{ +mul.f16x2 r778, r670, r776; +} +{ +neg.f16x2 
r781, r778; +} +{ +fma.rn.f16x2 r783, r667, r774, r781; +} +{ +mul.f16x2 r787, r667, r776; +} +{ +fma.rn.f16x2 r790, r670, r774, r787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r798, {low, high}; +} +{ +mul.f16x2 r799, r796, r798; +} +{ +mul.f16x2 r802, r770, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r770; +mov.b32 r805, {high, low}; +} +{ +fma.rn.f16x2 r807, r799, r805, r802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r813, {high, high}; +} +{ +mul.f16x2 r815, r682, r813; +} +{ +neg.f16x2 r818, r815; +} +{ +fma.rn.f16x2 r820, r679, r811, r818; +} +{ +mul.f16x2 r824, r679, r813; +} +{ +fma.rn.f16x2 r827, r682, r811, r824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r697; +mov.b32 r833, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r835, {low, high}; +} +{ +mul.f16x2 r836, r833, r835; +} +{ +mul.f16x2 r839, r807, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r807; +mov.b32 r842, {high, low}; +} +{ +fma.rn.f16x2 r844, r836, r842, r839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r844; +mov.b32 r850, {high, high}; +} +{ +mul.f16x2 r852, r694, r850; +} +{ +neg.f16x2 r855, r852; +} +{ +fma.rn.f16x2 r857, r691, r848, r855; +} +{ +mul.f16x2 r861, r691, r850; +} +{ +fma.rn.f16x2 r864, r694, r848, r861; +} +barrier.sync 0; +mad.lo.s32 r1154, r1149, 144, r1153; +st.shared.u32 [r1154], r661; +st.shared.u32 [r1154+24], r709; +st.shared.u32 [r1154+48], r746; +st.shared.u32 [r1154+72], r783; +st.shared.u32 
[r1154+96], r820; +st.shared.u32 [r1154+120], r857; +barrier.sync 0; +ld.shared.u32 r893, [r1148]; +ld.shared.u32 r981, [r1148+144]; +ld.shared.u32 r890, [r1148+288]; +ld.shared.u32 r978, [r1148+432]; +ld.shared.u32 r891, [r1148+576]; +ld.shared.u32 r979, [r1148+720]; +barrier.sync 0; +st.shared.u32 [r1154], r664; +st.shared.u32 [r1154+24], r716; +st.shared.u32 [r1154+48], r753; +st.shared.u32 [r1154+72], r790; +st.shared.u32 [r1154+96], r827; +st.shared.u32 [r1154+120], r864; +barrier.sync 0; +ld.shared.u32 r899, [r1148]; +ld.shared.u32 r987, [r1148+144]; +ld.shared.u32 r896, [r1148+288]; +ld.shared.u32 r984, [r1148+432]; +ld.shared.u32 r897, [r1148+576]; +ld.shared.u32 r985, [r1148+720]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r886, {low, high}; +} +{ +neg.f16x2 r887, r886; +} +{ +add.f16x2 r889, r890, r891; +} +{ +add.f16x2 r892, r893, r889; +} +{ +add.f16x2 r895, r896, r897; +} +{ +add.f16x2 r898, r899, r895; +} +{ +add.f16x2 r901, r890, r891; +} +{ +mul.f16x2 r904, r901, r885; +} +{ +add.f16x2 r907, r893, r904; +} +{ +sub.f16x2 r910, r896, r897; +} +{ +mul.f16x2 r913, r910, r887; +} +{ +add.f16x2 r916, r907, r913; +} +{ +add.f16x2 r919, r890, r891; +} +{ +mul.f16x2 r922, r919, r885; +} +{ +add.f16x2 r925, r893, r922; +} +{ +sub.f16x2 r928, r896, r897; +} +{ +mul.f16x2 r931, r928, r887; +} +{ +sub.f16x2 r934, r925, r931; +} +{ +add.f16x2 r937, r896, r897; +} +{ +mul.f16x2 r940, r937, r885; +} +{ +add.f16x2 r943, r899, r940; +} +{ +sub.f16x2 r946, r890, r891; +} +{ +mul.f16x2 r949, r946, r887; +} +{ +sub.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, r896, r897; +} +{ +mul.f16x2 r958, r955, r885; +} +{ +add.f16x2 r961, r899, r958; +} +{ +sub.f16x2 r964, r890, r891; +} +{ +mul.f16x2 r967, r964, r887; +} +{ +add.f16x2 r970, r961, r967; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 
high, f98; +mov.b32 r973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r974, {low, high}; +} +{ +neg.f16x2 r975, r974; +} +{ +add.f16x2 r977, r978, r979; +} +{ +add.f16x2 r980, r981, r977; +} +{ +add.f16x2 r983, r984, r985; +} +{ +add.f16x2 r986, r987, r983; +} +{ +add.f16x2 r989, r978, r979; +} +{ +mul.f16x2 r992, r989, r973; +} +{ +add.f16x2 r995, r981, r992; +} +{ +sub.f16x2 r998, r984, r985; +} +{ +mul.f16x2 r1001, r998, r975; +} +{ +add.f16x2 r1004, r995, r1001; +} +{ +add.f16x2 r1007, r978, r979; +} +{ +mul.f16x2 r1010, r1007, r973; +} +{ +add.f16x2 r1013, r981, r1010; +} +{ +sub.f16x2 r1016, r984, r985; +} +{ +mul.f16x2 r1019, r1016, r975; +} +{ +sub.f16x2 r1022, r1013, r1019; +} +{ +add.f16x2 r1025, r984, r985; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r987, r1028; +} +{ +sub.f16x2 r1034, r978, r979; +} +{ +mul.f16x2 r1037, r1034, r975; +} +{ +sub.f16x2 r1040, r1031, r1037; +} +{ +add.f16x2 r1043, r984, r985; +} +{ +mul.f16x2 r1046, r1043, r973; +} +{ +add.f16x2 r1049, r987, r1046; +} +{ +sub.f16x2 r1052, r978, r979; +} +{ +mul.f16x2 r1055, r1052, r975; +} +{ +add.f16x2 r1058, r1049, r1055; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1061, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1062, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1063, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1064, {low, high}; +} +{ +mul.f16x2 r1071, r1004, r1061; +} +{ +mul.f16x2 r1074, r1040, r1062; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r1004, r1062; +} +{ +fma.rn.f16x2 r1083, r1040, r1061, r1080; +} +{ +mul.f16x2 r1087, r1022, r1063; +} +{ +mul.f16x2 r1090, r1058, r1064; +} +{ +sub.f16x2 r1093, r1087, r1090; +} +{ +mul.f16x2 r1096, r1022, r1064; +} +{ 
+fma.rn.f16x2 r1099, r1058, r1063, r1096; +} +{ +add.f16x2 %0, r892, r980; +} +{ +add.f16x2 %1, r898, r986; +} +{ +sub.f16x2 %6, r892, r980; +} +{ +sub.f16x2 %7, r898, r986; +} +{ +add.f16x2 %2, r916, r1077; +} +{ +add.f16x2 %3, r952, r1083; +} +{ +sub.f16x2 %8, r916, r1077; +} +{ +sub.f16x2 %9, r952, r1083; +} +{ +add.f16x2 %4, r934, r1093; +} +{ +add.f16x2 %5, r970, r1099; +} +{ +sub.f16x2 %10, r934, r1093; +} +{ +sub.f16x2 %11, r970, r1099; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..812e95fb8d30d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp16_inv.hpp.inc @@ -0,0 +1,2898 @@ +#ifndef CUFFTDX_FFT_216_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_216_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1134, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 
f<119>; +.reg .b32 r<1146>; +.reg .b64 rd<7>; +mov.u32 r1127, %tid.y; +shl.b32 r1128, r1127, 1; +mov.u32 r1129, %12; +mad.lo.s32 r1130, r1128, 864, r1129; +mov.u32 r1131, %tid.x; +mov.f32 f98, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %14, r72; +} +{ +sub.f16x2 r78, %17, %21; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 r128, 
%20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f94, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r173, {low, high}; +} +mov.f32 f100, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r176, {low, high}; +} +mov.f32 f81, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +mul.wide.u32 rd2, r1131, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1132, rd3; +mul.lo.s32 r1133, r1132, 36; +sub.s32 r1134, r1131, r1133; +shr.u64 rd4, rd2, 34; +cvt.u32.u64 r1135, rd4; +and.b32 r1136, r1135, 
1073741822; +mad.lo.s32 r1137, r1136, 864, r1130; +cvt.rn.f32.u32 f113, r1134; +mul.f32 f114, f113, 0f3CEE4BAE; +cos.approx.f32 f29, f114; +sin.approx.f32 f115, f114; +neg.f32 f30, f115; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ +mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f82, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; 
+} +barrier.sync 0; +mad.lo.s32 r1138, r1134, 48, r1137; +st.shared.v2.f32 [r1138], {r215, r218}; +st.shared.v2.f32 [r1138+8], {r261, r270}; +st.shared.v2.f32 [r1138+16], {r298, r307}; +st.shared.v2.f32 [r1138+24], {r335, r344}; +st.shared.v2.f32 [r1138+32], {r372, r381}; +st.shared.v2.f32 [r1138+40], {r409, r418}; +barrier.sync 0; +mad.lo.s32 r1139, r1134, -40, r1138; +ld.shared.u32 r445, [r1139]; +ld.shared.u32 r451, [r1139+4]; +ld.shared.u32 r531, [r1139+288]; +ld.shared.u32 r537, [r1139+292]; +ld.shared.u32 r442, [r1139+576]; +ld.shared.u32 r448, [r1139+580]; +ld.shared.u32 r528, [r1139+864]; +ld.shared.u32 r534, [r1139+868]; +ld.shared.u32 r443, [r1139+1152]; +ld.shared.u32 r449, [r1139+1156]; +ld.shared.u32 r529, [r1139+1440]; +ld.shared.u32 r535, [r1139+1444]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{ +add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} +{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; +} +{ 
+add.f16x2 r653, r444, r530; +} +{ +add.f16x2 r656, r450, r536; +} +{ +sub.f16x2 r659, r444, r530; +} +{ +sub.f16x2 r662, r450, r536; +} +{ +add.f16x2 r665, r468, r627; +} +{ +add.f16x2 r668, r504, r633; +} +{ +sub.f16x2 r671, r468, r627; +} +{ +sub.f16x2 r674, r504, r633; +} +{ +add.f16x2 r677, r486, r643; +} +{ +add.f16x2 r680, r522, r649; +} +{ +sub.f16x2 r683, r486, r643; +} +{ +sub.f16x2 r686, r522, r649; +} +mul.wide.u32 rd5, r1134, -1431655765; +shr.u64 rd6, rd5, 34; +cvt.u32.u64 r1140, rd6; +cvt.rn.f32.u32 f116, r1140; +mul.f32 f117, f116, 0f3E32B8C2; +cos.approx.f32 f71, f117; +sin.approx.f32 f118, f117; +neg.f32 f72, f118; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r689, {low, high}; +} +mul.lo.s32 r1141, r1140, 6; +sub.s32 r1142, r1134, r1141; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r694, {high, high}; +} +{ +mul.f16x2 r696, r668, r694; +} +{ +fma.rn.f16x2 r699, r665, r692, r696; +} +{ +mul.f16x2 r703, r665, r694; +} +{ +neg.f16x2 r706, r703; +} +{ +fma.rn.f16x2 r708, r668, r692, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r712, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r714, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r714, r716; +} +{ +mul.f16x2 r720, r689, r712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r723, {high, low}; +} +{ +fma.rn.f16x2 r725, r717, r723, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r731, {high, high}; +} +{ +mul.f16x2 r733, r680, r731; +} +{ +fma.rn.f16x2 r736, r677, r729, r733; +} +{ +mul.f16x2 r740, r677, r731; +} +{ +neg.f16x2 r743, r740; +} +{ +fma.rn.f16x2 r745, r680, r729, r743; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r749, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r751, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r753, {low, high}; +} +{ +mul.f16x2 r754, r751, r753; +} +{ +mul.f16x2 r757, r725, r749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r760, {high, low}; +} +{ +fma.rn.f16x2 r762, r754, r760, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r768, {high, high}; +} +{ +mul.f16x2 r770, r662, r768; +} +{ +fma.rn.f16x2 r773, r659, r766, r770; +} +{ +mul.f16x2 r777, r659, r768; +} +{ +neg.f16x2 r780, r777; +} +{ +fma.rn.f16x2 r782, r662, r766, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r788, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r790, {low, high}; +} +{ +mul.f16x2 r791, r788, r790; +} +{ +mul.f16x2 r794, r762, r786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r797, {high, low}; +} +{ +fma.rn.f16x2 r799, r791, r797, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r805, {high, high}; +} +{ +mul.f16x2 r807, r674, r805; +} +{ +fma.rn.f16x2 r810, r671, r803, r807; +} +{ +mul.f16x2 r814, r671, r805; +} +{ +neg.f16x2 r817, r814; +} +{ +fma.rn.f16x2 r819, r674, r803, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r825, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r827, {low, high}; +} +{ +mul.f16x2 r828, r825, r827; +} +{ +mul.f16x2 r831, r799, 
r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r834, {high, low}; +} +{ +fma.rn.f16x2 r836, r828, r834, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r842, {high, high}; +} +{ +mul.f16x2 r844, r686, r842; +} +{ +fma.rn.f16x2 r847, r683, r840, r844; +} +{ +mul.f16x2 r851, r683, r842; +} +{ +neg.f16x2 r854, r851; +} +{ +fma.rn.f16x2 r856, r686, r840, r854; +} +shl.b32 r1143, r1142, 3; +add.s32 r1144, r1137, r1143; +barrier.sync 0; +mad.lo.s32 r1145, r1140, 288, r1144; +st.shared.u32 [r1145], r653; +st.shared.u32 [r1145+4], r656; +st.shared.u32 [r1145+48], r699; +st.shared.u32 [r1145+52], r708; +st.shared.u32 [r1145+96], r736; +st.shared.u32 [r1145+100], r745; +st.shared.u32 [r1145+144], r773; +st.shared.u32 [r1145+148], r782; +st.shared.u32 [r1145+192], r810; +st.shared.u32 [r1145+196], r819; +st.shared.u32 [r1145+240], r847; +st.shared.u32 [r1145+244], r856; +barrier.sync 0; +ld.shared.u32 r883, [r1139]; +ld.shared.u32 r889, [r1139+4]; +ld.shared.u32 r969, [r1139+288]; +ld.shared.u32 r975, [r1139+292]; +ld.shared.u32 r880, [r1139+576]; +ld.shared.u32 r886, [r1139+580]; +ld.shared.u32 r966, [r1139+864]; +ld.shared.u32 r972, [r1139+868]; +ld.shared.u32 r881, [r1139+1152]; +ld.shared.u32 r887, [r1139+1156]; +ld.shared.u32 r967, [r1139+1440]; +ld.shared.u32 r973, [r1139+1444]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r878, {low, high}; +} +{ +add.f16x2 r879, r880, r881; +} +{ +add.f16x2 r882, r883, r879; +} +{ +add.f16x2 r885, r886, r887; +} +{ +add.f16x2 r888, r889, r885; +} +{ +add.f16x2 r891, r880, r881; +} +{ +mul.f16x2 r894, r891, r877; +} +{ +add.f16x2 r897, r883, r894; +} +{ +sub.f16x2 r900, r886, r887; +} +{ +mul.f16x2 r903, r900, r878; +} +{ +add.f16x2 r906, r897, r903; +} +{ 
+add.f16x2 r909, r880, r881; +} +{ +mul.f16x2 r912, r909, r877; +} +{ +add.f16x2 r915, r883, r912; +} +{ +sub.f16x2 r918, r886, r887; +} +{ +mul.f16x2 r921, r918, r878; +} +{ +sub.f16x2 r924, r915, r921; +} +{ +add.f16x2 r927, r886, r887; +} +{ +mul.f16x2 r930, r927, r877; +} +{ +add.f16x2 r933, r889, r930; +} +{ +sub.f16x2 r936, r880, r881; +} +{ +mul.f16x2 r939, r936, r878; +} +{ +sub.f16x2 r942, r933, r939; +} +{ +add.f16x2 r945, r886, r887; +} +{ +mul.f16x2 r948, r945, r877; +} +{ +add.f16x2 r951, r889, r948; +} +{ +sub.f16x2 r954, r880, r881; +} +{ +mul.f16x2 r957, r954, r878; +} +{ +add.f16x2 r960, r951, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r964, {low, high}; +} +{ +add.f16x2 r965, r966, r967; +} +{ +add.f16x2 r968, r969, r965; +} +{ +add.f16x2 r971, r972, r973; +} +{ +add.f16x2 r974, r975, r971; +} +{ +add.f16x2 r977, r966, r967; +} +{ +mul.f16x2 r980, r977, r963; +} +{ +add.f16x2 r983, r969, r980; +} +{ +sub.f16x2 r986, r972, r973; +} +{ +mul.f16x2 r989, r986, r964; +} +{ +add.f16x2 r992, r983, r989; +} +{ +add.f16x2 r995, r966, r967; +} +{ +mul.f16x2 r998, r995, r963; +} +{ +add.f16x2 r1001, r969, r998; +} +{ +sub.f16x2 r1004, r972, r973; +} +{ +mul.f16x2 r1007, r1004, r964; +} +{ +sub.f16x2 r1010, r1001, r1007; +} +{ +add.f16x2 r1013, r972, r973; +} +{ +mul.f16x2 r1016, r1013, r963; +} +{ +add.f16x2 r1019, r975, r1016; +} +{ +sub.f16x2 r1022, r966, r967; +} +{ +mul.f16x2 r1025, r1022, r964; +} +{ +sub.f16x2 r1028, r1019, r1025; +} +{ +add.f16x2 r1031, r972, r973; +} +{ +mul.f16x2 r1034, r1031, r963; +} +{ +add.f16x2 r1037, r975, r1034; +} +{ +sub.f16x2 r1040, r966, r967; +} +{ +mul.f16x2 r1043, r1040, r964; +} +{ +add.f16x2 r1046, r1037, r1043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1049, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1052, {low, high}; +} +{ +mul.f16x2 r1059, r992, r1049; +} +{ +mul.f16x2 r1062, r1028, r1050; +} +{ +sub.f16x2 r1065, r1059, r1062; +} +{ +mul.f16x2 r1068, r992, r1050; +} +{ +fma.rn.f16x2 r1071, r1028, r1049, r1068; +} +{ +mul.f16x2 r1075, r1010, r1051; +} +{ +mul.f16x2 r1078, r1046, r1052; +} +{ +sub.f16x2 r1081, r1075, r1078; +} +{ +mul.f16x2 r1084, r1010, r1052; +} +{ +fma.rn.f16x2 r1087, r1046, r1051, r1084; +} +{ +add.f16x2 %0, r882, r968; +} +{ +add.f16x2 %1, r888, r974; +} +{ +sub.f16x2 %6, r882, r968; +} +{ +sub.f16x2 %7, r888, r974; +} +{ +add.f16x2 %2, r906, r1065; +} +{ +add.f16x2 %3, r942, r1071; +} +{ +sub.f16x2 %8, r906, r1065; +} +{ +sub.f16x2 %9, r942, r1071; +} +{ +add.f16x2 %4, r924, r1081; +} +{ +add.f16x2 %5, r960, r1087; +} +{ +sub.f16x2 %10, r924, r1081; +} +{ +sub.f16x2 %11, r960, r1087; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1135, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<119>; +.reg .b32 r<1143>; +.reg .b64 rd<6>; +mov.u32 r1127, %tid.y; +mov.u32 r1128, %12; +mad.lo.s32 r1129, r1127, 864, r1128; +mov.u32 r1130, %tid.x; +mov.f32 f98, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %14, r72; +} +{ +sub.f16x2 r78, %17, %21; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, 
r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 r128, %20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f94, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r173, {low, high}; +} +mov.f32 f100, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r176, {low, high}; +} +mov.f32 f81, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +mul.wide.u32 rd2, r1130, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1131, rd3; +mul.lo.s32 r1132, r1131, 36; +sub.s32 r1133, r1130, r1132; 
+mad.lo.s32 r1134, r1131, 864, r1129; +cvt.rn.f32.u32 f113, r1133; +mul.f32 f114, f113, 0f3CEE4BAE; +cos.approx.f32 f29, f114; +sin.approx.f32 f115, f114; +neg.f32 f30, f115; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ +mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f82, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; +} 
+barrier.sync 0; +mad.lo.s32 r1135, r1133, 24, r1134; +st.shared.v2.f32 [r1135], {r215, r261}; +st.shared.v2.f32 [r1135+8], {r298, r335}; +st.shared.v2.f32 [r1135+16], {r372, r409}; +barrier.sync 0; +mad.lo.s32 r1136, r1133, -20, r1135; +ld.shared.u32 r445, [r1136]; +ld.shared.u32 r531, [r1136+144]; +ld.shared.u32 r442, [r1136+288]; +ld.shared.u32 r528, [r1136+432]; +ld.shared.u32 r443, [r1136+576]; +ld.shared.u32 r529, [r1136+720]; +barrier.sync 0; +st.shared.v2.f32 [r1135], {r218, r270}; +st.shared.v2.f32 [r1135+8], {r307, r344}; +st.shared.v2.f32 [r1135+16], {r381, r418}; +barrier.sync 0; +ld.shared.u32 r451, [r1136]; +ld.shared.u32 r537, [r1136+144]; +ld.shared.u32 r448, [r1136+288]; +ld.shared.u32 r534, [r1136+432]; +ld.shared.u32 r449, [r1136+576]; +ld.shared.u32 r535, [r1136+720]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{ +add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} +{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; 
+} +{ +add.f16x2 r653, r444, r530; +} +{ +add.f16x2 r656, r450, r536; +} +{ +sub.f16x2 r659, r444, r530; +} +{ +sub.f16x2 r662, r450, r536; +} +{ +add.f16x2 r665, r468, r627; +} +{ +add.f16x2 r668, r504, r633; +} +{ +sub.f16x2 r671, r468, r627; +} +{ +sub.f16x2 r674, r504, r633; +} +{ +add.f16x2 r677, r486, r643; +} +{ +add.f16x2 r680, r522, r649; +} +{ +sub.f16x2 r683, r486, r643; +} +{ +sub.f16x2 r686, r522, r649; +} +mul.wide.u32 rd4, r1133, -1431655765; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1137, rd5; +mul.lo.s32 r1138, r1137, 6; +sub.s32 r1139, r1133, r1138; +shl.b32 r1140, r1139, 2; +add.s32 r1141, r1134, r1140; +cvt.rn.f32.u32 f116, r1137; +mul.f32 f117, f116, 0f3E32B8C2; +cos.approx.f32 f71, f117; +sin.approx.f32 f118, f117; +neg.f32 f72, f118; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f71; +cvt.rn.f16.f32 high, f72; +mov.b32 r689, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r692, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r694, {high, high}; +} +{ +mul.f16x2 r696, r668, r694; +} +{ +fma.rn.f16x2 r699, r665, r692, r696; +} +{ +mul.f16x2 r703, r665, r694; +} +{ +neg.f16x2 r706, r703; +} +{ +fma.rn.f16x2 r708, r668, r692, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r712, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r714, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r716, {low, high}; +} +{ +mul.f16x2 r717, r714, r716; +} +{ +mul.f16x2 r720, r689, r712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r723, {high, low}; +} +{ +fma.rn.f16x2 r725, r717, r723, r720; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r731, {high, high}; +} +{ +mul.f16x2 r733, r680, r731; +} +{ +fma.rn.f16x2 r736, r677, r729, r733; +} +{ +mul.f16x2 r740, r677, r731; +} +{ +neg.f16x2 
r743, r740; +} +{ +fma.rn.f16x2 r745, r680, r729, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r749, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r751, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r753, {low, high}; +} +{ +mul.f16x2 r754, r751, r753; +} +{ +mul.f16x2 r757, r725, r749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r725; +mov.b32 r760, {high, low}; +} +{ +fma.rn.f16x2 r762, r754, r760, r757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r768, {high, high}; +} +{ +mul.f16x2 r770, r662, r768; +} +{ +fma.rn.f16x2 r773, r659, r766, r770; +} +{ +mul.f16x2 r777, r659, r768; +} +{ +neg.f16x2 r780, r777; +} +{ +fma.rn.f16x2 r782, r662, r766, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r786, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r788, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r790, {low, high}; +} +{ +mul.f16x2 r791, r788, r790; +} +{ +mul.f16x2 r794, r762, r786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r762; +mov.b32 r797, {high, low}; +} +{ +fma.rn.f16x2 r799, r791, r797, r794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r805, {high, high}; +} +{ +mul.f16x2 r807, r674, r805; +} +{ +fma.rn.f16x2 r810, r671, r803, r807; +} +{ +mul.f16x2 r814, r671, r805; +} +{ +neg.f16x2 r817, r814; +} +{ +fma.rn.f16x2 r819, r674, r803, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r823, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r689; +mov.b32 r825, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f81; +cvt.rn.f16.f32 high, f82; +mov.b32 r827, {low, high}; +} +{ 
+mul.f16x2 r828, r825, r827; +} +{ +mul.f16x2 r831, r799, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r799; +mov.b32 r834, {high, low}; +} +{ +fma.rn.f16x2 r836, r828, r834, r831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r836; +mov.b32 r842, {high, high}; +} +{ +mul.f16x2 r844, r686, r842; +} +{ +fma.rn.f16x2 r847, r683, r840, r844; +} +{ +mul.f16x2 r851, r683, r842; +} +{ +neg.f16x2 r854, r851; +} +{ +fma.rn.f16x2 r856, r686, r840, r854; +} +barrier.sync 0; +mad.lo.s32 r1142, r1137, 144, r1141; +st.shared.u32 [r1142], r653; +st.shared.u32 [r1142+24], r699; +st.shared.u32 [r1142+48], r736; +st.shared.u32 [r1142+72], r773; +st.shared.u32 [r1142+96], r810; +st.shared.u32 [r1142+120], r847; +barrier.sync 0; +ld.shared.u32 r883, [r1136]; +ld.shared.u32 r969, [r1136+144]; +ld.shared.u32 r880, [r1136+288]; +ld.shared.u32 r966, [r1136+432]; +ld.shared.u32 r881, [r1136+576]; +ld.shared.u32 r967, [r1136+720]; +barrier.sync 0; +st.shared.u32 [r1142], r656; +st.shared.u32 [r1142+24], r708; +st.shared.u32 [r1142+48], r745; +st.shared.u32 [r1142+72], r782; +st.shared.u32 [r1142+96], r819; +st.shared.u32 [r1142+120], r856; +barrier.sync 0; +ld.shared.u32 r889, [r1136]; +ld.shared.u32 r975, [r1136+144]; +ld.shared.u32 r886, [r1136+288]; +ld.shared.u32 r972, [r1136+432]; +ld.shared.u32 r887, [r1136+576]; +ld.shared.u32 r973, [r1136+720]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r878, {low, high}; +} +{ +add.f16x2 r879, r880, r881; +} +{ +add.f16x2 r882, r883, r879; +} +{ +add.f16x2 r885, r886, r887; +} +{ +add.f16x2 r888, r889, r885; +} +{ +add.f16x2 r891, r880, r881; +} +{ +mul.f16x2 r894, r891, r877; +} +{ +add.f16x2 r897, r883, r894; +} +{ +sub.f16x2 r900, r886, r887; +} +{ +mul.f16x2 r903, r900, r878; +} +{ +add.f16x2 
r906, r897, r903; +} +{ +add.f16x2 r909, r880, r881; +} +{ +mul.f16x2 r912, r909, r877; +} +{ +add.f16x2 r915, r883, r912; +} +{ +sub.f16x2 r918, r886, r887; +} +{ +mul.f16x2 r921, r918, r878; +} +{ +sub.f16x2 r924, r915, r921; +} +{ +add.f16x2 r927, r886, r887; +} +{ +mul.f16x2 r930, r927, r877; +} +{ +add.f16x2 r933, r889, r930; +} +{ +sub.f16x2 r936, r880, r881; +} +{ +mul.f16x2 r939, r936, r878; +} +{ +sub.f16x2 r942, r933, r939; +} +{ +add.f16x2 r945, r886, r887; +} +{ +mul.f16x2 r948, r945, r877; +} +{ +add.f16x2 r951, r889, r948; +} +{ +sub.f16x2 r954, r880, r881; +} +{ +mul.f16x2 r957, r954, r878; +} +{ +add.f16x2 r960, r951, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r964, {low, high}; +} +{ +add.f16x2 r965, r966, r967; +} +{ +add.f16x2 r968, r969, r965; +} +{ +add.f16x2 r971, r972, r973; +} +{ +add.f16x2 r974, r975, r971; +} +{ +add.f16x2 r977, r966, r967; +} +{ +mul.f16x2 r980, r977, r963; +} +{ +add.f16x2 r983, r969, r980; +} +{ +sub.f16x2 r986, r972, r973; +} +{ +mul.f16x2 r989, r986, r964; +} +{ +add.f16x2 r992, r983, r989; +} +{ +add.f16x2 r995, r966, r967; +} +{ +mul.f16x2 r998, r995, r963; +} +{ +add.f16x2 r1001, r969, r998; +} +{ +sub.f16x2 r1004, r972, r973; +} +{ +mul.f16x2 r1007, r1004, r964; +} +{ +sub.f16x2 r1010, r1001, r1007; +} +{ +add.f16x2 r1013, r972, r973; +} +{ +mul.f16x2 r1016, r1013, r963; +} +{ +add.f16x2 r1019, r975, r1016; +} +{ +sub.f16x2 r1022, r966, r967; +} +{ +mul.f16x2 r1025, r1022, r964; +} +{ +sub.f16x2 r1028, r1019, r1025; +} +{ +add.f16x2 r1031, r972, r973; +} +{ +mul.f16x2 r1034, r1031, r963; +} +{ +add.f16x2 r1037, r975, r1034; +} +{ +sub.f16x2 r1040, r966, r967; +} +{ +mul.f16x2 r1043, r1040, r964; +} +{ +add.f16x2 r1046, r1037, r1043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1049, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f98; +cvt.rn.f16.f32 high, f98; +mov.b32 r1051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f100; +cvt.rn.f16.f32 high, f100; +mov.b32 r1052, {low, high}; +} +{ +mul.f16x2 r1059, r992, r1049; +} +{ +mul.f16x2 r1062, r1028, r1050; +} +{ +sub.f16x2 r1065, r1059, r1062; +} +{ +mul.f16x2 r1068, r992, r1050; +} +{ +fma.rn.f16x2 r1071, r1028, r1049, r1068; +} +{ +mul.f16x2 r1075, r1010, r1051; +} +{ +mul.f16x2 r1078, r1046, r1052; +} +{ +sub.f16x2 r1081, r1075, r1078; +} +{ +mul.f16x2 r1084, r1010, r1052; +} +{ +fma.rn.f16x2 r1087, r1046, r1051, r1084; +} +{ +add.f16x2 %0, r882, r968; +} +{ +add.f16x2 %1, r888, r974; +} +{ +sub.f16x2 %6, r882, r968; +} +{ +sub.f16x2 %7, r888, r974; +} +{ +add.f16x2 %2, r906, r1065; +} +{ +add.f16x2 %3, r942, r1071; +} +{ +sub.f16x2 %8, r906, r1065; +} +{ +sub.f16x2 %9, r942, r1071; +} +{ +add.f16x2 %4, r924, r1081; +} +{ +add.f16x2 %5, r960, r1087; +} +{ +sub.f16x2 %10, r924, r1081; +} +{ +sub.f16x2 %11, r960, r1087; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..84f0d3ba98102 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_fwd.hpp.inc @@ -0,0 +1,668 @@ +#ifndef CUFFTDX_FFT_216_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_216_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<186, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<333>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 1728, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %20, %25; +add.f32 f26, %15, f25; +add.f32 f27, %22, %27; +add.f32 f28, %16, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %15, f29; +sub.f32 f31, %22, %27; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %16, f35; +sub.f32 f37, %20, %25; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %23, %28; +add.f32 f42, %17, f41; +add.f32 f43, %24, %29; +add.f32 f44, %19, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %17, f45; +sub.f32 f47, %24, %29; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %19, f51; +sub.f32 f53, %23, %28; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, 
f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; +sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1728, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f77, f69; +mul.f32 f82, f78, f70; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f86, f73; +mul.f32 f90, f88, f74; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 f97, f94, f67; +mul.f32 f98, f96, f68; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f102, f71; +mul.f32 f106, f104, f72; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f110, f75; +mul.f32 f114, f112, f76; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r9], {f117, f116}; +fma.rn.f32 f118, f78, f69, f83; +sub.f32 f119, f81, f82; +st.shared.v2.f32 [r9+8], {f119, f118}; +fma.rn.f32 f120, f88, f73, f91; +sub.f32 f121, f89, f90; +st.shared.v2.f32 [r9+16], {f121, f120}; +fma.rn.f32 f122, f96, f67, f99; +sub.f32 f123, f97, f98; +st.shared.v2.f32 [r9+24], {f123, f122}; +fma.rn.f32 f124, f104, f71, f107; +sub.f32 f125, f105, f106; +st.shared.v2.f32 [r9+32], {f125, f124}; +fma.rn.f32 f126, f112, f75, f115; +sub.f32 f127, f113, 
f114; +st.shared.v2.f32 [r9+40], {f127, f126}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.v2.f32 {f128, f129}, [r10]; +ld.shared.v2.f32 {f132, f133}, [r10+288]; +ld.shared.v2.f32 {f136, f137}, [r10+576]; +ld.shared.v2.f32 {f140, f141}, [r10+864]; +ld.shared.v2.f32 {f144, f145}, [r10+1152]; +ld.shared.v2.f32 {f148, f149}, [r10+1440]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, f137, f145; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; +mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0f3F5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0f3F5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0fBF5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0fBF5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0fBF5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0fBF5DB3D7, f192; +sub.f32 f194, f153, f169; +sub.f32 f195, f155, f171; +add.f32 f196, f160, f186; +add.f32 f197, f166, f188; +sub.f32 f198, f160, f186; +sub.f32 f199, f166, f188; +add.f32 f200, f161, f191; +add.f32 f201, f167, f193; +sub.f32 f202, f161, f191; +sub.f32 f203, f167, f193; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; 
+mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f204, f205}, [rd11]; +mul.f32 f208, f204, f196; +mul.f32 f209, f205, f197; +mul.f32 f210, f204, f197; +mul.f32 f211, f204, f204; +mul.f32 f212, f205, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f205, f204; +fma.rn.f32 f215, f205, f204, f214; +mul.f32 f216, f213, f200; +mul.f32 f217, f215, f201; +mul.f32 f218, f213, f201; +mul.f32 f219, f204, f213; +mul.f32 f220, f205, f215; +sub.f32 f221, f219, f220; +mul.f32 f222, f204, f215; +fma.rn.f32 f223, f205, f213, f222; +mul.f32 f224, f221, f194; +mul.f32 f225, f223, f195; +mul.f32 f226, f221, f195; +mul.f32 f227, f204, f221; +mul.f32 f228, f205, f223; +sub.f32 f229, f227, f228; +mul.f32 f230, f204, f223; +fma.rn.f32 f231, f205, f221, f230; +mul.f32 f232, f229, f198; +mul.f32 f233, f231, f199; +mul.f32 f234, f229, f199; +mul.f32 f235, f204, f229; +mul.f32 f236, f205, f231; +sub.f32 f237, f235, f236; +mul.f32 f238, f204, f231; +fma.rn.f32 f239, f205, f229, f238; +mul.f32 f240, f237, f202; +mul.f32 f241, f239, f203; +mul.f32 f242, f237, f203; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 288, r15; +add.f32 f243, f155, f171; +add.f32 f244, f153, f169; +st.shared.v2.f32 [r16], {f244, f243}; +fma.rn.f32 f245, f205, f196, f210; +sub.f32 f246, f208, f209; +st.shared.v2.f32 [r16+48], {f246, f245}; +fma.rn.f32 f247, f215, f200, f218; +sub.f32 f248, f216, f217; +st.shared.v2.f32 [r16+96], {f248, f247}; +fma.rn.f32 f249, f223, f194, f226; +sub.f32 f250, f224, f225; +st.shared.v2.f32 [r16+144], {f250, f249}; +fma.rn.f32 f251, f231, f198, f234; +sub.f32 f252, f232, f233; +st.shared.v2.f32 [r16+192], {f252, f251}; +sub.f32 f253, f240, f241; +fma.rn.f32 f254, f239, f202, f242; +st.shared.v2.f32 [r16+240], {f253, f254}; +barrier.sync 0; +ld.shared.v2.f32 {f255, f256}, [r10]; +ld.shared.v2.f32 {f259, f260}, [r10+288]; +ld.shared.v2.f32 {f263, f264}, [r10+576]; +ld.shared.v2.f32 {f267, f268}, [r10+864]; 
+ld.shared.v2.f32 {f271, f272}, [r10+1152]; +ld.shared.v2.f32 {f275, f276}, [r10+1440]; +add.f32 f279, f263, f271; +add.f32 f280, f255, f279; +add.f32 f281, f264, f272; +add.f32 f282, f256, f281; +mul.f32 f283, f279, 0f3F000000; +sub.f32 f284, f255, f283; +sub.f32 f285, f264, f272; +mul.f32 f286, f285, 0f3F5DB3D7; +add.f32 f287, f286, f284; +sub.f32 f288, f284, f286; +mul.f32 f289, f281, 0f3F000000; +sub.f32 f290, f256, f289; +sub.f32 f291, f263, f271; +mul.f32 f292, f291, 0f3F5DB3D7; +sub.f32 f293, f290, f292; +add.f32 f294, f292, f290; +add.f32 f295, f267, f275; +add.f32 f296, f259, f295; +add.f32 f297, f268, f276; +add.f32 f298, f260, f297; +mul.f32 f299, f295, 0f3F000000; +sub.f32 f300, f259, f299; +sub.f32 f301, f268, f276; +mul.f32 f302, f301, 0f3F5DB3D7; +add.f32 f303, f302, f300; +sub.f32 f304, f300, f302; +mul.f32 f305, f297, 0f3F000000; +sub.f32 f306, f260, f305; +sub.f32 f307, f267, f275; +mul.f32 f308, f307, 0f3F5DB3D7; +sub.f32 f309, f306, f308; +add.f32 f310, f308, f306; +mul.f32 f311, f303, 0f3F000000; +mul.f32 f312, f309, 0fBF5DB3D7; +sub.f32 f313, f311, f312; +mul.f32 f314, f309, 0f3F000000; +fma.rn.f32 f315, f303, 0fBF5DB3D7, f314; +mul.f32 f316, f304, 0fBF000000; +mul.f32 f317, f310, 0fBF5DB3D7; +sub.f32 f318, f316, f317; +mul.f32 f319, f310, 0fBF000000; +fma.rn.f32 f320, f304, 0fBF5DB3D7, f319; +add.f32 %1, f282, f298; +add.f32 %0, f280, f296; +add.f32 %3, f293, f315; +add.f32 %2, f287, f313; +add.f32 %5, f294, f320; +add.f32 %4, f288, f318; +sub.f32 %7, f282, f298; +sub.f32 %6, f280, f296; +sub.f32 %9, f293, f315; +sub.f32 %8, f287, f313; +sub.f32 %11, f294, f320; +sub.f32 %10, f288, f318; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), 
"f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<187, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<309>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 864, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %20, %25; +add.f32 f26, %15, f25; +add.f32 f27, %22, %27; +add.f32 f28, %16, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %15, f29; +sub.f32 f31, %22, %27; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %16, f35; +sub.f32 f37, %20, %25; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %23, %28; +add.f32 f42, %17, f41; +add.f32 f43, %24, %29; +add.f32 f44, %19, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %17, f45; +sub.f32 f47, %24, %29; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %19, f51; +sub.f32 f53, %23, %28; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, 
r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f79, f71; +mul.f32 f84, f80, f72; +sub.f32 f85, f83, f84; +mul.f32 f86, f79, f72; +fma.rn.f32 f87, f80, f71, f86; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; +sub.f32 f90, f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f90, f75; +mul.f32 f94, f92, f76; +sub.f32 f95, f93, f94; +mul.f32 f96, f90, f76; +fma.rn.f32 f97, f92, f75, f96; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f100, f69; +mul.f32 f104, f102, f70; +sub.f32 f105, f103, f104; +mul.f32 f106, f100, f70; +fma.rn.f32 f107, f102, f69, f106; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f110, f73; +mul.f32 f114, f112, f74; +sub.f32 f115, f113, f114; +mul.f32 f116, f110, f74; +fma.rn.f32 f117, f112, f73, f116; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f120, f77; +mul.f32 f124, f122, f78; +sub.f32 f125, f123, f124; +mul.f32 f126, f120, f78; +fma.rn.f32 f127, f122, f77, f126; +mad.lo.s32 r8, r5, 864, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.v2.f32 [r9], {f67, f85}; +st.shared.v2.f32 [r9+8], {f95, f105}; +st.shared.v2.f32 [r9+16], {f115, f125}; +barrier.sync 0; +mad.lo.s32 r10, r7, -20, r9; +ld.shared.f32 f128, [r10]; +ld.shared.f32 f129, [r10+144]; +ld.shared.f32 f130, [r10+288]; +ld.shared.f32 f131, [r10+432]; +ld.shared.f32 f132, [r10+576]; +ld.shared.f32 f133, [r10+720]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f68, f87}; +st.shared.v2.f32 [r9+8], {f97, f107}; +st.shared.v2.f32 [r9+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r10]; +ld.shared.f32 f135, [r10+144]; +ld.shared.f32 f136, [r10+288]; 
+ld.shared.f32 f137, [r10+432]; +ld.shared.f32 f138, [r10+576]; +ld.shared.f32 f139, [r10+720]; +add.f32 f140, f130, f132; +add.f32 f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, f138; +mul.f32 f147, f146, 0f3F5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0f3F5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0f3F5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0f3F5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0fBF5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0fBF5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0fBF5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0fBF5DB3D7, f180; +add.f32 f182, f141, f157; +add.f32 f183, f143, f159; +sub.f32 f184, f141, f157; +sub.f32 f185, f143, f159; +add.f32 f186, f148, f174; +add.f32 f187, f154, f176; +sub.f32 f188, f148, f174; +sub.f32 f189, f154, f176; +add.f32 f190, f149, f179; +add.f32 f191, f155, f181; +sub.f32 f192, f149, f179; +sub.f32 f193, f155, f181; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f194, f195}, [rd11]; +mul.f32 f198, f194, f186; +mul.f32 f199, f195, f187; +sub.f32 f200, f198, f199; 
+mul.f32 f201, f194, f187; +fma.rn.f32 f202, f195, f186, f201; +mul.f32 f203, f194, f194; +mul.f32 f204, f195, f195; +sub.f32 f205, f203, f204; +mul.f32 f206, f195, f194; +fma.rn.f32 f207, f195, f194, f206; +mul.f32 f208, f205, f190; +mul.f32 f209, f207, f191; +sub.f32 f210, f208, f209; +mul.f32 f211, f205, f191; +fma.rn.f32 f212, f207, f190, f211; +mul.f32 f213, f194, f205; +mul.f32 f214, f195, f207; +sub.f32 f215, f213, f214; +mul.f32 f216, f194, f207; +fma.rn.f32 f217, f195, f205, f216; +mul.f32 f218, f215, f184; +mul.f32 f219, f217, f185; +sub.f32 f220, f218, f219; +mul.f32 f221, f215, f185; +fma.rn.f32 f222, f217, f184, f221; +mul.f32 f223, f194, f215; +mul.f32 f224, f195, f217; +sub.f32 f225, f223, f224; +mul.f32 f226, f194, f217; +fma.rn.f32 f227, f195, f215, f226; +mul.f32 f228, f225, f188; +mul.f32 f229, f227, f189; +sub.f32 f230, f228, f229; +mul.f32 f231, f225, f189; +fma.rn.f32 f232, f227, f188, f231; +mul.f32 f233, f194, f225; +mul.f32 f234, f195, f227; +sub.f32 f235, f233, f234; +mul.f32 f236, f194, f227; +fma.rn.f32 f237, f195, f225, f236; +mul.f32 f238, f235, f192; +mul.f32 f239, f237, f193; +sub.f32 f240, f238, f239; +mul.f32 f241, f235, f193; +fma.rn.f32 f242, f237, f192, f241; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 144, r15; +st.shared.f32 [r16], f182; +st.shared.f32 [r16+24], f200; +st.shared.f32 [r16+48], f210; +st.shared.f32 [r16+72], f220; +st.shared.f32 [r16+96], f230; +st.shared.f32 [r16+120], f240; +barrier.sync 0; +ld.shared.f32 f243, [r10]; +ld.shared.f32 f244, [r10+144]; +ld.shared.f32 f245, [r10+288]; +ld.shared.f32 f246, [r10+432]; +ld.shared.f32 f247, [r10+576]; +ld.shared.f32 f248, [r10+720]; +barrier.sync 0; +st.shared.f32 [r16], f183; +st.shared.f32 [r16+24], f202; +st.shared.f32 [r16+48], f212; +st.shared.f32 [r16+72], f222; +st.shared.f32 [r16+96], f232; +st.shared.f32 [r16+120], f242; +barrier.sync 0; +ld.shared.f32 f249, [r10]; +ld.shared.f32 f250, [r10+144]; +ld.shared.f32 f251, 
[r10+288]; +ld.shared.f32 f252, [r10+432]; +ld.shared.f32 f253, [r10+576]; +ld.shared.f32 f254, [r10+720]; +add.f32 f255, f245, f247; +add.f32 f256, f243, f255; +add.f32 f257, f251, f253; +add.f32 f258, f249, f257; +mul.f32 f259, f255, 0f3F000000; +sub.f32 f260, f243, f259; +sub.f32 f261, f251, f253; +mul.f32 f262, f261, 0f3F5DB3D7; +add.f32 f263, f262, f260; +sub.f32 f264, f260, f262; +mul.f32 f265, f257, 0f3F000000; +sub.f32 f266, f249, f265; +sub.f32 f267, f245, f247; +mul.f32 f268, f267, 0f3F5DB3D7; +sub.f32 f269, f266, f268; +add.f32 f270, f268, f266; +add.f32 f271, f246, f248; +add.f32 f272, f244, f271; +add.f32 f273, f252, f254; +add.f32 f274, f250, f273; +mul.f32 f275, f271, 0f3F000000; +sub.f32 f276, f244, f275; +sub.f32 f277, f252, f254; +mul.f32 f278, f277, 0f3F5DB3D7; +add.f32 f279, f278, f276; +sub.f32 f280, f276, f278; +mul.f32 f281, f273, 0f3F000000; +sub.f32 f282, f250, f281; +sub.f32 f283, f246, f248; +mul.f32 f284, f283, 0f3F5DB3D7; +sub.f32 f285, f282, f284; +add.f32 f286, f284, f282; +mul.f32 f287, f279, 0f3F000000; +mul.f32 f288, f285, 0fBF5DB3D7; +sub.f32 f289, f287, f288; +mul.f32 f290, f285, 0f3F000000; +fma.rn.f32 f291, f279, 0fBF5DB3D7, f290; +mul.f32 f292, f280, 0fBF000000; +mul.f32 f293, f286, 0fBF5DB3D7; +sub.f32 f294, f292, f293; +mul.f32 f295, f286, 0fBF000000; +fma.rn.f32 f296, f280, 0fBF5DB3D7, f295; +add.f32 %0, f256, f272; +add.f32 %1, f258, f274; +add.f32 %3, f269, f291; +add.f32 %2, f263, f289; +add.f32 %5, f270, f296; +add.f32 %4, f264, f294; +sub.f32 %6, f256, f272; +sub.f32 %7, f258, f274; +sub.f32 %9, f269, f291; +sub.f32 %8, f263, f289; +sub.f32 %11, f270, f296; +sub.f32 %10, f264, f294; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), 
"f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..752b1dd2f0d4e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp32_inv.hpp.inc @@ -0,0 +1,668 @@ +#ifndef CUFFTDX_FFT_216_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_216_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<388, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<333>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 1728, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %20, %25; +add.f32 f26, %15, f25; +add.f32 f27, %22, %27; +add.f32 f28, %16, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %15, f29; +sub.f32 f31, %22, %27; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %16, f35; +sub.f32 f37, %20, %25; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %23, %28; +add.f32 f42, %17, f41; +add.f32 f43, %24, %29; +add.f32 f44, %19, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %17, f45; +sub.f32 f47, %24, %29; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %19, f51; +sub.f32 f53, %23, %28; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; 
+fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; +sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1728, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f70, f78; +mul.f32 f82, f69, f78; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f74, f88; +mul.f32 f90, f73, f88; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 f97, f68, f96; +mul.f32 f98, f67, f96; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f72, f104; +mul.f32 f106, f71, f104; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f76, f112; +mul.f32 f114, f75, f112; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r9], {f117, f116}; +fma.rn.f32 f118, f77, f69, f81; +sub.f32 f119, f83, f82; +st.shared.v2.f32 [r9+8], {f118, f119}; +fma.rn.f32 f120, f86, f73, f89; +sub.f32 f121, f91, f90; +st.shared.v2.f32 [r9+16], {f120, f121}; +fma.rn.f32 f122, f94, f67, f97; +sub.f32 f123, f99, f98; 
+st.shared.v2.f32 [r9+24], {f122, f123}; +fma.rn.f32 f124, f102, f71, f105; +sub.f32 f125, f107, f106; +st.shared.v2.f32 [r9+32], {f124, f125}; +fma.rn.f32 f126, f110, f75, f113; +sub.f32 f127, f115, f114; +st.shared.v2.f32 [r9+40], {f126, f127}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.v2.f32 {f128, f129}, [r10]; +ld.shared.v2.f32 {f132, f133}, [r10+288]; +ld.shared.v2.f32 {f136, f137}, [r10+576]; +ld.shared.v2.f32 {f140, f141}, [r10+864]; +ld.shared.v2.f32 {f144, f145}, [r10+1152]; +ld.shared.v2.f32 {f148, f149}, [r10+1440]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, f137, f145; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; +mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0fBF5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0fBF5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0f3F5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0f3F5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0f3F5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0f3F5DB3D7, f192; +sub.f32 f194, f153, f169; +sub.f32 f195, f155, f171; +add.f32 f196, f160, f186; +add.f32 f197, f166, f188; +sub.f32 f198, f160, f186; +sub.f32 f199, f166, f188; +add.f32 f200, f161, f191; +add.f32 
f201, f167, f193; +sub.f32 f202, f161, f191; +sub.f32 f203, f167, f193; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f204, f205}, [rd11]; +mul.f32 f208, f197, f205; +mul.f32 f209, f196, f205; +mul.f32 f210, f204, f197; +mul.f32 f211, f204, f204; +mul.f32 f212, f205, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f205, f204; +fma.rn.f32 f215, f205, f204, f214; +mul.f32 f216, f201, f215; +mul.f32 f217, f200, f215; +mul.f32 f218, f213, f201; +mul.f32 f219, f204, f213; +mul.f32 f220, f205, f215; +sub.f32 f221, f219, f220; +mul.f32 f222, f204, f215; +fma.rn.f32 f223, f205, f213, f222; +mul.f32 f224, f195, f223; +mul.f32 f225, f194, f223; +mul.f32 f226, f221, f195; +mul.f32 f227, f204, f221; +mul.f32 f228, f205, f223; +sub.f32 f229, f227, f228; +mul.f32 f230, f204, f223; +fma.rn.f32 f231, f205, f221, f230; +mul.f32 f232, f199, f231; +mul.f32 f233, f198, f231; +mul.f32 f234, f229, f199; +mul.f32 f235, f204, f229; +mul.f32 f236, f205, f231; +sub.f32 f237, f235, f236; +mul.f32 f238, f204, f231; +fma.rn.f32 f239, f205, f229, f238; +mul.f32 f240, f203, f239; +mul.f32 f241, f202, f239; +mul.f32 f242, f237, f203; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 288, r15; +add.f32 f243, f155, f171; +add.f32 f244, f153, f169; +st.shared.v2.f32 [r16], {f244, f243}; +fma.rn.f32 f245, f204, f196, f208; +sub.f32 f246, f210, f209; +st.shared.v2.f32 [r16+48], {f245, f246}; +fma.rn.f32 f247, f213, f200, f216; +sub.f32 f248, f218, f217; +st.shared.v2.f32 [r16+96], {f247, f248}; +fma.rn.f32 f249, f221, f194, f224; +sub.f32 f250, f226, f225; +st.shared.v2.f32 [r16+144], {f249, f250}; +fma.rn.f32 f251, f229, f198, f232; +sub.f32 f252, f234, f233; +st.shared.v2.f32 [r16+192], {f251, f252}; +sub.f32 f253, f242, f241; +fma.rn.f32 f254, f237, f202, f240; +st.shared.v2.f32 [r16+240], 
{f254, f253}; +barrier.sync 0; +ld.shared.v2.f32 {f255, f256}, [r10]; +ld.shared.v2.f32 {f259, f260}, [r10+288]; +ld.shared.v2.f32 {f263, f264}, [r10+576]; +ld.shared.v2.f32 {f267, f268}, [r10+864]; +ld.shared.v2.f32 {f271, f272}, [r10+1152]; +ld.shared.v2.f32 {f275, f276}, [r10+1440]; +add.f32 f279, f263, f271; +add.f32 f280, f255, f279; +add.f32 f281, f264, f272; +add.f32 f282, f256, f281; +mul.f32 f283, f279, 0f3F000000; +sub.f32 f284, f255, f283; +sub.f32 f285, f264, f272; +mul.f32 f286, f285, 0fBF5DB3D7; +add.f32 f287, f286, f284; +sub.f32 f288, f284, f286; +mul.f32 f289, f281, 0f3F000000; +sub.f32 f290, f256, f289; +sub.f32 f291, f263, f271; +mul.f32 f292, f291, 0fBF5DB3D7; +sub.f32 f293, f290, f292; +add.f32 f294, f292, f290; +add.f32 f295, f267, f275; +add.f32 f296, f259, f295; +add.f32 f297, f268, f276; +add.f32 f298, f260, f297; +mul.f32 f299, f295, 0f3F000000; +sub.f32 f300, f259, f299; +sub.f32 f301, f268, f276; +mul.f32 f302, f301, 0fBF5DB3D7; +add.f32 f303, f302, f300; +sub.f32 f304, f300, f302; +mul.f32 f305, f297, 0f3F000000; +sub.f32 f306, f260, f305; +sub.f32 f307, f267, f275; +mul.f32 f308, f307, 0fBF5DB3D7; +sub.f32 f309, f306, f308; +add.f32 f310, f308, f306; +mul.f32 f311, f303, 0f3F000000; +mul.f32 f312, f309, 0f3F5DB3D7; +sub.f32 f313, f311, f312; +mul.f32 f314, f309, 0f3F000000; +fma.rn.f32 f315, f303, 0f3F5DB3D7, f314; +mul.f32 f316, f304, 0fBF000000; +mul.f32 f317, f310, 0f3F5DB3D7; +sub.f32 f318, f316, f317; +mul.f32 f319, f310, 0fBF000000; +fma.rn.f32 f320, f304, 0f3F5DB3D7, f319; +add.f32 %1, f282, f298; +add.f32 %0, f280, f296; +add.f32 %3, f293, f315; +add.f32 %2, f287, f313; +add.f32 %5, f294, f320; +add.f32 %4, f288, f318; +sub.f32 %7, f282, f298; +sub.f32 %6, f280, f296; +sub.f32 %9, f293, f315; +sub.f32 %8, f287, f313; +sub.f32 %11, f294, f320; +sub.f32 %10, f288, f318; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<389, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<309>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 864, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %20, %25; +add.f32 f26, %15, f25; +add.f32 f27, %22, %27; +add.f32 f28, %16, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %15, f29; +sub.f32 f31, %22, %27; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %16, f35; +sub.f32 f37, %20, %25; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %23, %28; +add.f32 f42, %17, f41; +add.f32 f43, %24, %29; +add.f32 f44, %19, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %17, f45; +sub.f32 f47, %24, %29; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %19, f51; +sub.f32 f53, %23, %28; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, 
f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f72, f80; +fma.rn.f32 f84, f79, f71, f83; +mul.f32 f85, f71, f80; +mul.f32 f86, f79, f72; +sub.f32 f87, f86, f85; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; +sub.f32 f90, f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f76, f92; +fma.rn.f32 f94, f90, f75, f93; +mul.f32 f95, f75, f92; +mul.f32 f96, f90, f76; +sub.f32 f97, f96, f95; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f70, f102; +fma.rn.f32 f104, f100, f69, f103; +mul.f32 f105, f69, f102; +mul.f32 f106, f100, f70; +sub.f32 f107, f106, f105; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f74, f112; +fma.rn.f32 f114, f110, f73, f113; +mul.f32 f115, f73, f112; +mul.f32 f116, f110, f74; +sub.f32 f117, f116, f115; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f78, f122; +fma.rn.f32 f124, f120, f77, f123; +mul.f32 f125, f77, f122; +mul.f32 f126, f120, f78; +sub.f32 f127, f126, f125; +mad.lo.s32 r8, r5, 864, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.v2.f32 [r9], {f67, f84}; +st.shared.v2.f32 [r9+8], {f94, f104}; +st.shared.v2.f32 [r9+16], {f114, f124}; +barrier.sync 0; +mad.lo.s32 r10, r7, -20, r9; +ld.shared.f32 f128, [r10]; +ld.shared.f32 f129, [r10+144]; +ld.shared.f32 f130, [r10+288]; +ld.shared.f32 f131, [r10+432]; +ld.shared.f32 f132, [r10+576]; +ld.shared.f32 f133, [r10+720]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f68, 
f87}; +st.shared.v2.f32 [r9+8], {f97, f107}; +st.shared.v2.f32 [r9+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r10]; +ld.shared.f32 f135, [r10+144]; +ld.shared.f32 f136, [r10+288]; +ld.shared.f32 f137, [r10+432]; +ld.shared.f32 f138, [r10+576]; +ld.shared.f32 f139, [r10+720]; +add.f32 f140, f130, f132; +add.f32 f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, f138; +mul.f32 f147, f146, 0fBF5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0fBF5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0fBF5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0fBF5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0f3F5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0f3F5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0f3F5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0f3F5DB3D7, f180; +add.f32 f182, f141, f157; +add.f32 f183, f143, f159; +sub.f32 f184, f141, f157; +sub.f32 f185, f143, f159; +add.f32 f186, f148, f174; +add.f32 f187, f154, f176; +sub.f32 f188, f148, f174; +sub.f32 f189, f154, f176; +add.f32 f190, f149, f179; +add.f32 f191, f155, f181; +sub.f32 f192, f149, f179; +sub.f32 f193, f155, f181; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; 
+mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f194, f195}, [rd11]; +mul.f32 f198, f187, f195; +fma.rn.f32 f199, f194, f186, f198; +mul.f32 f200, f186, f195; +mul.f32 f201, f194, f187; +sub.f32 f202, f201, f200; +mul.f32 f203, f194, f194; +mul.f32 f204, f195, f195; +sub.f32 f205, f203, f204; +mul.f32 f206, f195, f194; +fma.rn.f32 f207, f195, f194, f206; +mul.f32 f208, f191, f207; +fma.rn.f32 f209, f205, f190, f208; +mul.f32 f210, f190, f207; +mul.f32 f211, f205, f191; +sub.f32 f212, f211, f210; +mul.f32 f213, f194, f205; +mul.f32 f214, f195, f207; +sub.f32 f215, f213, f214; +mul.f32 f216, f194, f207; +fma.rn.f32 f217, f195, f205, f216; +mul.f32 f218, f185, f217; +fma.rn.f32 f219, f215, f184, f218; +mul.f32 f220, f184, f217; +mul.f32 f221, f215, f185; +sub.f32 f222, f221, f220; +mul.f32 f223, f194, f215; +mul.f32 f224, f195, f217; +sub.f32 f225, f223, f224; +mul.f32 f226, f194, f217; +fma.rn.f32 f227, f195, f215, f226; +mul.f32 f228, f189, f227; +fma.rn.f32 f229, f225, f188, f228; +mul.f32 f230, f188, f227; +mul.f32 f231, f225, f189; +sub.f32 f232, f231, f230; +mul.f32 f233, f194, f225; +mul.f32 f234, f195, f227; +sub.f32 f235, f233, f234; +mul.f32 f236, f194, f227; +fma.rn.f32 f237, f195, f225, f236; +mul.f32 f238, f193, f237; +fma.rn.f32 f239, f235, f192, f238; +mul.f32 f240, f192, f237; +mul.f32 f241, f235, f193; +sub.f32 f242, f241, f240; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 144, r15; +st.shared.f32 [r16], f182; +st.shared.f32 [r16+24], f199; +st.shared.f32 [r16+48], f209; +st.shared.f32 [r16+72], f219; +st.shared.f32 [r16+96], f229; +st.shared.f32 [r16+120], f239; +barrier.sync 0; +ld.shared.f32 f243, [r10]; +ld.shared.f32 f244, [r10+144]; +ld.shared.f32 f245, [r10+288]; +ld.shared.f32 f246, [r10+432]; +ld.shared.f32 f247, [r10+576]; +ld.shared.f32 f248, [r10+720]; +barrier.sync 0; +st.shared.f32 [r16], f183; +st.shared.f32 [r16+24], f202; +st.shared.f32 [r16+48], f212; 
+st.shared.f32 [r16+72], f222; +st.shared.f32 [r16+96], f232; +st.shared.f32 [r16+120], f242; +barrier.sync 0; +ld.shared.f32 f249, [r10]; +ld.shared.f32 f250, [r10+144]; +ld.shared.f32 f251, [r10+288]; +ld.shared.f32 f252, [r10+432]; +ld.shared.f32 f253, [r10+576]; +ld.shared.f32 f254, [r10+720]; +add.f32 f255, f245, f247; +add.f32 f256, f243, f255; +add.f32 f257, f251, f253; +add.f32 f258, f249, f257; +mul.f32 f259, f255, 0f3F000000; +sub.f32 f260, f243, f259; +sub.f32 f261, f251, f253; +mul.f32 f262, f261, 0fBF5DB3D7; +add.f32 f263, f262, f260; +sub.f32 f264, f260, f262; +mul.f32 f265, f257, 0f3F000000; +sub.f32 f266, f249, f265; +sub.f32 f267, f245, f247; +mul.f32 f268, f267, 0fBF5DB3D7; +sub.f32 f269, f266, f268; +add.f32 f270, f268, f266; +add.f32 f271, f246, f248; +add.f32 f272, f244, f271; +add.f32 f273, f252, f254; +add.f32 f274, f250, f273; +mul.f32 f275, f271, 0f3F000000; +sub.f32 f276, f244, f275; +sub.f32 f277, f252, f254; +mul.f32 f278, f277, 0fBF5DB3D7; +add.f32 f279, f278, f276; +sub.f32 f280, f276, f278; +mul.f32 f281, f273, 0f3F000000; +sub.f32 f282, f250, f281; +sub.f32 f283, f246, f248; +mul.f32 f284, f283, 0fBF5DB3D7; +sub.f32 f285, f282, f284; +add.f32 f286, f284, f282; +mul.f32 f287, f279, 0f3F000000; +mul.f32 f288, f285, 0f3F5DB3D7; +sub.f32 f289, f287, f288; +mul.f32 f290, f285, 0f3F000000; +fma.rn.f32 f291, f279, 0f3F5DB3D7, f290; +mul.f32 f292, f280, 0fBF000000; +mul.f32 f293, f286, 0f3F5DB3D7; +sub.f32 f294, f292, f293; +mul.f32 f295, f286, 0fBF000000; +fma.rn.f32 f296, f280, 0f3F5DB3D7, f295; +add.f32 %0, f256, f272; +add.f32 %1, f258, f274; +add.f32 %3, f269, f291; +add.f32 %2, f263, f289; +add.f32 %5, f270, f296; +add.f32 %4, f264, f294; +sub.f32 %6, f256, f272; +sub.f32 %7, f258, f274; +sub.f32 %9, f269, f291; +sub.f32 %8, f263, f289; +sub.f32 %11, f270, f296; +sub.f32 %10, f264, f294; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), 
"=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_216), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..cffab59167cc0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_fwd.hpp.inc @@ -0,0 +1,652 @@ +#ifndef CUFFTDX_FFT_216_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_216_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<561, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<307>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 1728, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %20, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %22, %27; +add.f64 fd28, %16, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %15, fd29; +sub.f64 fd31, %22, %27; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %16, fd35; +sub.f64 fd37, %20, %25; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %23, %28; +add.f64 fd42, %17, fd41; +add.f64 fd43, %24, %29; +add.f64 fd44, %19, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %17, fd45; +sub.f64 fd47, %24, %29; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 
fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %19, fd51; +sub.f64 fd53, %23, %28; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd79, fd71; +mul.f64 fd84, fd80, fd72; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd79, fd72; +fma.rn.f64 fd87, fd80, fd71, fd86; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd90, fd75; +mul.f64 fd94, fd92, fd76; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd90, fd76; +fma.rn.f64 fd97, fd92, fd75, fd96; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd100, fd69; +mul.f64 fd104, fd102, fd70; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd100, fd70; +fma.rn.f64 fd107, fd102, fd69, fd106; +ld.global.v2.f64 {fd108, fd109}, [rd6+576]; +mul.f64 fd112, fd108, fd73; +mul.f64 fd113, fd109, 
fd74; +sub.f64 fd114, fd112, fd113; +mul.f64 fd115, fd108, fd74; +fma.rn.f64 fd116, fd109, fd73, fd115; +mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd119, fd77; +mul.f64 fd123, fd121, fd78; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd119, fd78; +fma.rn.f64 fd126, fd121, fd77, fd125; +mad.lo.s32 r8, r5, 1728, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v2.f64 [r9], {fd67, fd85}; +st.shared.v2.f64 [r9+16], {fd95, fd105}; +st.shared.v2.f64 [r9+32], {fd114, fd124}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f64 fd127, [r10]; +ld.shared.f64 fd128, [r10+288]; +ld.shared.f64 fd129, [r10+576]; +ld.shared.f64 fd130, [r10+864]; +ld.shared.f64 fd131, [r10+1152]; +ld.shared.f64 fd132, [r10+1440]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd68, fd87}; +st.shared.v2.f64 [r9+16], {fd97, fd107}; +st.shared.v2.f64 [r9+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r10]; +ld.shared.f64 fd134, [r10+288]; +ld.shared.f64 fd135, [r10+576]; +ld.shared.f64 fd136, [r10+864]; +ld.shared.f64 fd137, [r10+1152]; +ld.shared.f64 fd138, [r10+1440]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0d3FEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; +sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 
0d3FEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; +sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0dBFEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0dBFEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; +mul.f64 fd177, fd170, 0dBFEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0dBFEBB67AE8584CAA, fd179; +add.f64 fd181, fd140, fd156; +add.f64 fd182, fd142, fd158; +sub.f64 fd183, fd140, fd156; +sub.f64 fd184, fd142, fd158; +add.f64 fd185, fd147, fd173; +add.f64 fd186, fd153, fd175; +sub.f64 fd187, fd147, fd173; +sub.f64 fd188, fd153, fd175; +add.f64 fd189, fd148, fd178; +add.f64 fd190, fd154, fd180; +sub.f64 fd191, fd148, fd178; +sub.f64 fd192, fd154, fd180; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd193, fd194}, [rd11]; +mul.f64 fd197, fd193, fd185; +mul.f64 fd198, fd194, fd186; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd193, fd186; +fma.rn.f64 fd201, fd194, fd185, fd200; +mul.f64 fd202, fd193, fd193; +mul.f64 fd203, fd194, fd194; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd194, fd193; +fma.rn.f64 fd206, fd194, fd193, fd205; +mul.f64 fd207, fd204, fd189; +mul.f64 fd208, fd206, fd190; +sub.f64 fd209, fd207, fd208; +mul.f64 fd210, fd204, fd190; +fma.rn.f64 fd211, fd206, fd189, fd210; +mul.f64 fd212, fd193, fd204; +mul.f64 fd213, fd194, fd206; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd193, fd206; +fma.rn.f64 fd216, fd194, fd204, fd215; +mul.f64 fd217, fd214, fd183; +mul.f64 
fd218, fd216, fd184; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd214, fd184; +fma.rn.f64 fd221, fd216, fd183, fd220; +ld.global.v2.f64 {fd222, fd223}, [rd11+96]; +mul.f64 fd226, fd222, fd187; +mul.f64 fd227, fd223, fd188; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd222, fd188; +fma.rn.f64 fd230, fd223, fd187, fd229; +mul.f64 fd231, fd193, fd222; +mul.f64 fd232, fd194, fd223; +sub.f64 fd233, fd231, fd232; +mul.f64 fd234, fd193, fd223; +fma.rn.f64 fd235, fd194, fd222, fd234; +mul.f64 fd236, fd233, fd191; +mul.f64 fd237, fd235, fd192; +sub.f64 fd238, fd236, fd237; +mul.f64 fd239, fd233, fd192; +fma.rn.f64 fd240, fd235, fd191, fd239; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 288, r15; +st.shared.f64 [r16], fd181; +st.shared.f64 [r16+48], fd199; +st.shared.f64 [r16+96], fd209; +st.shared.f64 [r16+144], fd219; +st.shared.f64 [r16+192], fd228; +st.shared.f64 [r16+240], fd238; +barrier.sync 0; +ld.shared.f64 fd241, [r10]; +ld.shared.f64 fd242, [r10+288]; +ld.shared.f64 fd243, [r10+576]; +ld.shared.f64 fd244, [r10+864]; +ld.shared.f64 fd245, [r10+1152]; +ld.shared.f64 fd246, [r10+1440]; +barrier.sync 0; +st.shared.f64 [r16], fd182; +st.shared.f64 [r16+48], fd201; +st.shared.f64 [r16+96], fd211; +st.shared.f64 [r16+144], fd221; +st.shared.f64 [r16+192], fd230; +st.shared.f64 [r16+240], fd240; +barrier.sync 0; +ld.shared.f64 fd247, [r10]; +ld.shared.f64 fd248, [r10+288]; +ld.shared.f64 fd249, [r10+576]; +ld.shared.f64 fd250, [r10+864]; +ld.shared.f64 fd251, [r10+1152]; +ld.shared.f64 fd252, [r10+1440]; +add.f64 fd253, fd243, fd245; +add.f64 fd254, fd241, fd253; +add.f64 fd255, fd249, fd251; +add.f64 fd256, fd247, fd255; +mul.f64 fd257, fd253, 0d3FE0000000000000; +sub.f64 fd258, fd241, fd257; +sub.f64 fd259, fd249, fd251; +mul.f64 fd260, fd259, 0d3FEBB67AE8584CAA; +add.f64 fd261, fd260, fd258; +sub.f64 fd262, fd258, fd260; +mul.f64 fd263, fd255, 0d3FE0000000000000; +sub.f64 fd264, fd247, fd263; +sub.f64 fd265, fd243, fd245; 
+mul.f64 fd266, fd265, 0d3FEBB67AE8584CAA; +sub.f64 fd267, fd264, fd266; +add.f64 fd268, fd266, fd264; +add.f64 fd269, fd244, fd246; +add.f64 fd270, fd242, fd269; +add.f64 fd271, fd250, fd252; +add.f64 fd272, fd248, fd271; +mul.f64 fd273, fd269, 0d3FE0000000000000; +sub.f64 fd274, fd242, fd273; +sub.f64 fd275, fd250, fd252; +mul.f64 fd276, fd275, 0d3FEBB67AE8584CAA; +add.f64 fd277, fd276, fd274; +sub.f64 fd278, fd274, fd276; +mul.f64 fd279, fd271, 0d3FE0000000000000; +sub.f64 fd280, fd248, fd279; +sub.f64 fd281, fd244, fd246; +mul.f64 fd282, fd281, 0d3FEBB67AE8584CAA; +sub.f64 fd283, fd280, fd282; +add.f64 fd284, fd282, fd280; +mul.f64 fd285, fd277, 0d3FE0000000000000; +mul.f64 fd286, fd283, 0dBFEBB67AE8584CAA; +sub.f64 fd287, fd285, fd286; +mul.f64 fd288, fd283, 0d3FE0000000000000; +fma.rn.f64 fd289, fd277, 0dBFEBB67AE8584CAA, fd288; +mul.f64 fd290, fd278, 0dBFE0000000000000; +mul.f64 fd291, fd284, 0dBFEBB67AE8584CAA; +sub.f64 fd292, fd290, fd291; +mul.f64 fd293, fd284, 0dBFE0000000000000; +fma.rn.f64 fd294, fd278, 0dBFEBB67AE8584CAA, fd293; +add.f64 %0, fd254, fd270; +add.f64 %1, fd256, fd272; +add.f64 %3, fd267, fd289; +add.f64 %2, fd261, fd287; +add.f64 %5, fd268, fd294; +add.f64 %4, fd262, fd292; +sub.f64 %6, fd254, fd270; +sub.f64 %7, fd256, fd272; +sub.f64 %9, fd267, fd289; +sub.f64 %8, fd261, fd287; +sub.f64 %11, fd268, fd294; +sub.f64 %10, fd262, fd292; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<562, double, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<331>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 3456, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %20, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %22, %27; +add.f64 fd28, %16, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %15, fd29; +sub.f64 fd31, %22, %27; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %16, fd35; +sub.f64 fd37, %20, %25; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %23, %28; +add.f64 fd42, %17, fd41; +add.f64 fd43, %24, %29; +add.f64 fd44, %19, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %17, fd45; +sub.f64 fd47, %24, %29; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %19, fd51; +sub.f64 fd53, %23, %28; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 3456, r3; 
+mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd77, fd69; +mul.f64 fd82, fd78, fd70; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd86, fd73; +mul.f64 fd90, fd88, fd74; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, fd94, fd67; +mul.f64 fd98, fd96, fd68; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+576]; +mul.f64 fd104, fd100, fd71; +mul.f64 fd105, fd101, fd72; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd109, fd75; +mul.f64 fd113, fd111, fd76; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r9], {fd116, fd115}; +fma.rn.f64 fd117, fd78, fd69, fd83; +sub.f64 fd118, fd81, fd82; +st.shared.v2.f64 [r9+16], {fd118, fd117}; +fma.rn.f64 fd119, fd88, fd73, fd91; +sub.f64 fd120, fd89, fd90; +st.shared.v2.f64 [r9+32], {fd120, fd119}; +fma.rn.f64 fd121, fd96, fd67, fd99; +sub.f64 fd122, fd97, fd98; +st.shared.v2.f64 [r9+48], {fd122, fd121}; +fma.rn.f64 fd123, fd101, fd71, fd106; +sub.f64 fd124, fd104, fd105; +st.shared.v2.f64 [r9+64], {fd124, fd123}; +fma.rn.f64 fd125, fd111, fd75, fd114; +sub.f64 fd126, fd112, fd113; +st.shared.v2.f64 [r9+80], {fd126, fd125}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f64 {fd127, fd128}, [r10]; +ld.shared.v2.f64 {fd131, fd132}, [r10+576]; +ld.shared.v2.f64 {fd135, fd136}, [r10+1152]; +ld.shared.v2.f64 {fd139, fd140}, [r10+1728]; +ld.shared.v2.f64 {fd143, fd144}, [r10+2304]; +ld.shared.v2.f64 
{fd147, fd148}, [r10+2880]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0d3FEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0d3FEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0dBFEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0dBFEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0dBFEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0dBFEBB67AE8584CAA, fd191; +sub.f64 fd193, fd152, fd168; +sub.f64 fd194, fd154, fd170; +add.f64 fd195, fd159, fd185; +add.f64 fd196, fd165, fd187; +sub.f64 fd197, fd159, fd185; +sub.f64 fd198, fd165, fd187; +add.f64 fd199, fd160, fd190; +add.f64 fd200, fd166, fd192; +sub.f64 fd201, fd160, fd190; +sub.f64 fd202, fd166, fd192; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %14; 
+add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd203, fd204}, [rd11]; +mul.f64 fd207, fd203, fd195; +mul.f64 fd208, fd204, fd196; +mul.f64 fd209, fd203, fd196; +mul.f64 fd210, fd203, fd203; +mul.f64 fd211, fd204, fd204; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd204, fd203; +fma.rn.f64 fd214, fd204, fd203, fd213; +mul.f64 fd215, fd212, fd199; +mul.f64 fd216, fd214, fd200; +mul.f64 fd217, fd212, fd200; +mul.f64 fd218, fd203, fd212; +mul.f64 fd219, fd204, fd214; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd203, fd214; +fma.rn.f64 fd222, fd204, fd212, fd221; +mul.f64 fd223, fd220, fd193; +mul.f64 fd224, fd222, fd194; +mul.f64 fd225, fd220, fd194; +ld.global.v2.f64 {fd226, fd227}, [rd11+96]; +mul.f64 fd230, fd226, fd197; +mul.f64 fd231, fd227, fd198; +mul.f64 fd232, fd226, fd198; +mul.f64 fd233, fd203, fd226; +mul.f64 fd234, fd204, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd203, fd227; +fma.rn.f64 fd237, fd204, fd226, fd236; +mul.f64 fd238, fd235, fd201; +mul.f64 fd239, fd237, fd202; +mul.f64 fd240, fd235, fd202; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 576, r15; +add.f64 fd241, fd154, fd170; +add.f64 fd242, fd152, fd168; +st.shared.v2.f64 [r16], {fd242, fd241}; +fma.rn.f64 fd243, fd204, fd195, fd209; +sub.f64 fd244, fd207, fd208; +st.shared.v2.f64 [r16+96], {fd244, fd243}; +fma.rn.f64 fd245, fd214, fd199, fd217; +sub.f64 fd246, fd215, fd216; +st.shared.v2.f64 [r16+192], {fd246, fd245}; +fma.rn.f64 fd247, fd222, fd193, fd225; +sub.f64 fd248, fd223, fd224; +st.shared.v2.f64 [r16+288], {fd248, fd247}; +fma.rn.f64 fd249, fd227, fd197, fd232; +sub.f64 fd250, fd230, fd231; +st.shared.v2.f64 [r16+384], {fd250, fd249}; +fma.rn.f64 fd251, fd237, fd201, fd240; +sub.f64 fd252, fd238, fd239; +st.shared.v2.f64 [r16+480], {fd252, fd251}; +barrier.sync 0; +ld.shared.v2.f64 {fd253, fd254}, [r10]; +ld.shared.v2.f64 {fd257, fd258}, [r10+576]; +ld.shared.v2.f64 {fd261, fd262}, [r10+1152]; +ld.shared.v2.f64 {fd265, fd266}, 
[r10+1728]; +ld.shared.v2.f64 {fd269, fd270}, [r10+2304]; +ld.shared.v2.f64 {fd273, fd274}, [r10+2880]; +add.f64 fd277, fd261, fd269; +add.f64 fd278, fd253, fd277; +add.f64 fd279, fd262, fd270; +add.f64 fd280, fd254, fd279; +mul.f64 fd281, fd277, 0d3FE0000000000000; +sub.f64 fd282, fd253, fd281; +sub.f64 fd283, fd262, fd270; +mul.f64 fd284, fd283, 0d3FEBB67AE8584CAA; +add.f64 fd285, fd284, fd282; +sub.f64 fd286, fd282, fd284; +mul.f64 fd287, fd279, 0d3FE0000000000000; +sub.f64 fd288, fd254, fd287; +sub.f64 fd289, fd261, fd269; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +sub.f64 fd291, fd288, fd290; +add.f64 fd292, fd290, fd288; +add.f64 fd293, fd265, fd273; +add.f64 fd294, fd257, fd293; +add.f64 fd295, fd266, fd274; +add.f64 fd296, fd258, fd295; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd257, fd297; +sub.f64 fd299, fd266, fd274; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +mul.f64 fd303, fd295, 0d3FE0000000000000; +sub.f64 fd304, fd258, fd303; +sub.f64 fd305, fd265, fd273; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +mul.f64 fd309, fd301, 0d3FE0000000000000; +mul.f64 fd310, fd307, 0dBFEBB67AE8584CAA; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd307, 0d3FE0000000000000; +fma.rn.f64 fd313, fd301, 0dBFEBB67AE8584CAA, fd312; +mul.f64 fd314, fd302, 0dBFE0000000000000; +mul.f64 fd315, fd308, 0dBFEBB67AE8584CAA; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd308, 0dBFE0000000000000; +fma.rn.f64 fd318, fd302, 0dBFEBB67AE8584CAA, fd317; +add.f64 %1, fd280, fd296; +add.f64 %0, fd278, fd294; +add.f64 %3, fd291, fd313; +add.f64 %2, fd285, fd311; +add.f64 %5, fd292, fd318; +add.f64 %4, fd286, fd316; +sub.f64 %7, fd280, fd296; +sub.f64 %6, fd278, fd294; +sub.f64 %9, fd291, fd313; +sub.f64 %8, fd285, fd311; +sub.f64 %11, fd292, fd318; +sub.f64 %10, fd286, fd316; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), 
"=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9f562772a0f5f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_216_fp64_inv.hpp.inc @@ -0,0 +1,652 @@ +#ifndef CUFFTDX_FFT_216_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_216_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<732, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<307>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 1728, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %20, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %22, %27; +add.f64 fd28, %16, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %15, fd29; +sub.f64 fd31, %22, %27; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %16, fd35; +sub.f64 fd37, %20, %25; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %23, %28; +add.f64 fd42, %17, fd41; +add.f64 fd43, %24, %29; +add.f64 fd44, %19, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %17, fd45; +sub.f64 fd47, %24, %29; +mul.f64 
fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %19, fd51; +sub.f64 fd53, %23, %28; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd72, fd80; +fma.rn.f64 fd84, fd79, fd71, fd83; +mul.f64 fd85, fd71, fd80; +mul.f64 fd86, fd79, fd72; +sub.f64 fd87, fd86, fd85; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd76, fd92; +fma.rn.f64 fd94, fd90, fd75, fd93; +mul.f64 fd95, fd75, fd92; +mul.f64 fd96, fd90, fd76; +sub.f64 fd97, fd96, fd95; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd70, fd102; +fma.rn.f64 fd104, fd100, fd69, fd103; +mul.f64 fd105, fd69, fd102; +mul.f64 fd106, fd100, fd70; +sub.f64 fd107, fd106, fd105; +ld.global.v2.f64 {fd108, fd109}, 
[rd6+576]; +mul.f64 fd112, fd74, fd109; +fma.rn.f64 fd113, fd108, fd73, fd112; +mul.f64 fd114, fd73, fd109; +mul.f64 fd115, fd108, fd74; +sub.f64 fd116, fd115, fd114; +mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd78, fd121; +fma.rn.f64 fd123, fd119, fd77, fd122; +mul.f64 fd124, fd77, fd121; +mul.f64 fd125, fd119, fd78; +sub.f64 fd126, fd125, fd124; +mad.lo.s32 r8, r5, 1728, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v2.f64 [r9], {fd67, fd84}; +st.shared.v2.f64 [r9+16], {fd94, fd104}; +st.shared.v2.f64 [r9+32], {fd113, fd123}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f64 fd127, [r10]; +ld.shared.f64 fd128, [r10+288]; +ld.shared.f64 fd129, [r10+576]; +ld.shared.f64 fd130, [r10+864]; +ld.shared.f64 fd131, [r10+1152]; +ld.shared.f64 fd132, [r10+1440]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd68, fd87}; +st.shared.v2.f64 [r9+16], {fd97, fd107}; +st.shared.v2.f64 [r9+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r10]; +ld.shared.f64 fd134, [r10+288]; +ld.shared.f64 fd135, [r10+576]; +ld.shared.f64 fd136, [r10+864]; +ld.shared.f64 fd137, [r10+1152]; +ld.shared.f64 fd138, [r10+1440]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0dBFEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; 
+sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; +sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0d3FEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0d3FEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; +mul.f64 fd177, fd170, 0d3FEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0d3FEBB67AE8584CAA, fd179; +add.f64 fd181, fd140, fd156; +add.f64 fd182, fd142, fd158; +sub.f64 fd183, fd140, fd156; +sub.f64 fd184, fd142, fd158; +add.f64 fd185, fd147, fd173; +add.f64 fd186, fd153, fd175; +sub.f64 fd187, fd147, fd173; +sub.f64 fd188, fd153, fd175; +add.f64 fd189, fd148, fd178; +add.f64 fd190, fd154, fd180; +sub.f64 fd191, fd148, fd178; +sub.f64 fd192, fd154, fd180; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd193, fd194}, [rd11]; +mul.f64 fd197, fd186, fd194; +fma.rn.f64 fd198, fd193, fd185, fd197; +mul.f64 fd199, fd185, fd194; +mul.f64 fd200, fd193, fd186; +sub.f64 fd201, fd200, fd199; +mul.f64 fd202, fd193, fd193; +mul.f64 fd203, fd194, fd194; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd194, fd193; +fma.rn.f64 fd206, fd194, fd193, fd205; +mul.f64 fd207, fd190, fd206; +fma.rn.f64 fd208, fd204, fd189, fd207; +mul.f64 fd209, fd189, fd206; +mul.f64 fd210, fd204, fd190; +sub.f64 fd211, fd210, fd209; +mul.f64 fd212, fd193, fd204; +mul.f64 fd213, fd194, fd206; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd193, fd206; +fma.rn.f64 fd216, fd194, 
fd204, fd215; +mul.f64 fd217, fd184, fd216; +fma.rn.f64 fd218, fd214, fd183, fd217; +mul.f64 fd219, fd183, fd216; +mul.f64 fd220, fd214, fd184; +sub.f64 fd221, fd220, fd219; +ld.global.v2.f64 {fd222, fd223}, [rd11+96]; +mul.f64 fd226, fd188, fd223; +fma.rn.f64 fd227, fd222, fd187, fd226; +mul.f64 fd228, fd187, fd223; +mul.f64 fd229, fd222, fd188; +sub.f64 fd230, fd229, fd228; +mul.f64 fd231, fd193, fd222; +mul.f64 fd232, fd194, fd223; +sub.f64 fd233, fd231, fd232; +mul.f64 fd234, fd193, fd223; +fma.rn.f64 fd235, fd194, fd222, fd234; +mul.f64 fd236, fd192, fd235; +fma.rn.f64 fd237, fd233, fd191, fd236; +mul.f64 fd238, fd191, fd235; +mul.f64 fd239, fd233, fd192; +sub.f64 fd240, fd239, fd238; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 288, r15; +st.shared.f64 [r16], fd181; +st.shared.f64 [r16+48], fd198; +st.shared.f64 [r16+96], fd208; +st.shared.f64 [r16+144], fd218; +st.shared.f64 [r16+192], fd227; +st.shared.f64 [r16+240], fd237; +barrier.sync 0; +ld.shared.f64 fd241, [r10]; +ld.shared.f64 fd242, [r10+288]; +ld.shared.f64 fd243, [r10+576]; +ld.shared.f64 fd244, [r10+864]; +ld.shared.f64 fd245, [r10+1152]; +ld.shared.f64 fd246, [r10+1440]; +barrier.sync 0; +st.shared.f64 [r16], fd182; +st.shared.f64 [r16+48], fd201; +st.shared.f64 [r16+96], fd211; +st.shared.f64 [r16+144], fd221; +st.shared.f64 [r16+192], fd230; +st.shared.f64 [r16+240], fd240; +barrier.sync 0; +ld.shared.f64 fd247, [r10]; +ld.shared.f64 fd248, [r10+288]; +ld.shared.f64 fd249, [r10+576]; +ld.shared.f64 fd250, [r10+864]; +ld.shared.f64 fd251, [r10+1152]; +ld.shared.f64 fd252, [r10+1440]; +add.f64 fd253, fd243, fd245; +add.f64 fd254, fd241, fd253; +add.f64 fd255, fd249, fd251; +add.f64 fd256, fd247, fd255; +mul.f64 fd257, fd253, 0d3FE0000000000000; +sub.f64 fd258, fd241, fd257; +sub.f64 fd259, fd249, fd251; +mul.f64 fd260, fd259, 0dBFEBB67AE8584CAA; +add.f64 fd261, fd260, fd258; +sub.f64 fd262, fd258, fd260; +mul.f64 fd263, fd255, 0d3FE0000000000000; +sub.f64 
fd264, fd247, fd263; +sub.f64 fd265, fd243, fd245; +mul.f64 fd266, fd265, 0dBFEBB67AE8584CAA; +sub.f64 fd267, fd264, fd266; +add.f64 fd268, fd266, fd264; +add.f64 fd269, fd244, fd246; +add.f64 fd270, fd242, fd269; +add.f64 fd271, fd250, fd252; +add.f64 fd272, fd248, fd271; +mul.f64 fd273, fd269, 0d3FE0000000000000; +sub.f64 fd274, fd242, fd273; +sub.f64 fd275, fd250, fd252; +mul.f64 fd276, fd275, 0dBFEBB67AE8584CAA; +add.f64 fd277, fd276, fd274; +sub.f64 fd278, fd274, fd276; +mul.f64 fd279, fd271, 0d3FE0000000000000; +sub.f64 fd280, fd248, fd279; +sub.f64 fd281, fd244, fd246; +mul.f64 fd282, fd281, 0dBFEBB67AE8584CAA; +sub.f64 fd283, fd280, fd282; +add.f64 fd284, fd282, fd280; +mul.f64 fd285, fd277, 0d3FE0000000000000; +mul.f64 fd286, fd283, 0d3FEBB67AE8584CAA; +sub.f64 fd287, fd285, fd286; +mul.f64 fd288, fd283, 0d3FE0000000000000; +fma.rn.f64 fd289, fd277, 0d3FEBB67AE8584CAA, fd288; +mul.f64 fd290, fd278, 0dBFE0000000000000; +mul.f64 fd291, fd284, 0d3FEBB67AE8584CAA; +sub.f64 fd292, fd290, fd291; +mul.f64 fd293, fd284, 0dBFE0000000000000; +fma.rn.f64 fd294, fd278, 0d3FEBB67AE8584CAA, fd293; +add.f64 %0, fd254, fd270; +add.f64 %1, fd256, fd272; +add.f64 %3, fd267, fd289; +add.f64 %2, fd261, fd287; +add.f64 %5, fd268, fd294; +add.f64 %4, fd262, fd292; +sub.f64 %6, fd254, fd270; +sub.f64 %7, fd256, fd272; +sub.f64 %9, fd267, fd289; +sub.f64 %8, fd261, fd287; +sub.f64 %11, fd268, fd294; +sub.f64 %10, fd262, fd292; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<733, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<17>; +.reg .f64 fd<331>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 3456, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %20, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %22, %27; +add.f64 fd28, %16, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %15, fd29; +sub.f64 fd31, %22, %27; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %16, fd35; +sub.f64 fd37, %20, %25; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %23, %28; +add.f64 fd42, %17, fd41; +add.f64 fd43, %24, %29; +add.f64 fd44, %19, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %17, fd45; +sub.f64 fd47, %24, %29; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %19, fd51; +sub.f64 fd53, %23, %28; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 36; +sub.s32 r7, r4, 
r6; +mad.lo.s32 r8, r5, 3456, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd70, fd78; +mul.f64 fd82, fd69, fd78; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd74, fd88; +mul.f64 fd90, fd73, fd88; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, fd68, fd96; +mul.f64 fd98, fd67, fd96; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+576]; +mul.f64 fd104, fd72, fd101; +mul.f64 fd105, fd71, fd101; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd76, fd111; +mul.f64 fd113, fd75, fd111; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r9], {fd116, fd115}; +fma.rn.f64 fd117, fd77, fd69, fd81; +sub.f64 fd118, fd83, fd82; +st.shared.v2.f64 [r9+16], {fd117, fd118}; +fma.rn.f64 fd119, fd86, fd73, fd89; +sub.f64 fd120, fd91, fd90; +st.shared.v2.f64 [r9+32], {fd119, fd120}; +fma.rn.f64 fd121, fd94, fd67, fd97; +sub.f64 fd122, fd99, fd98; +st.shared.v2.f64 [r9+48], {fd121, fd122}; +fma.rn.f64 fd123, fd100, fd71, fd104; +sub.f64 fd124, fd106, fd105; +st.shared.v2.f64 [r9+64], {fd123, fd124}; +fma.rn.f64 fd125, fd109, fd75, fd112; +sub.f64 fd126, fd114, fd113; +st.shared.v2.f64 [r9+80], {fd125, fd126}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f64 {fd127, fd128}, [r10]; +ld.shared.v2.f64 {fd131, fd132}, [r10+576]; +ld.shared.v2.f64 {fd135, fd136}, [r10+1152]; +ld.shared.v2.f64 {fd139, fd140}, [r10+1728]; +ld.shared.v2.f64 {fd143, fd144}, 
[r10+2304]; +ld.shared.v2.f64 {fd147, fd148}, [r10+2880]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0dBFEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0dBFEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0dBFEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0d3FEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0d3FEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0d3FEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0d3FEBB67AE8584CAA, fd191; +sub.f64 fd193, fd152, fd168; +sub.f64 fd194, fd154, fd170; +add.f64 fd195, fd159, fd185; +add.f64 fd196, fd165, fd187; +sub.f64 fd197, fd159, fd185; +sub.f64 fd198, fd165, fd187; +add.f64 fd199, fd160, fd190; +add.f64 fd200, fd166, fd192; +sub.f64 fd201, fd160, fd190; +sub.f64 fd202, fd166, fd192; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 6; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 
16; +mov.u64 rd10, %14; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd203, fd204}, [rd11]; +mul.f64 fd207, fd196, fd204; +mul.f64 fd208, fd195, fd204; +mul.f64 fd209, fd203, fd196; +mul.f64 fd210, fd203, fd203; +mul.f64 fd211, fd204, fd204; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd204, fd203; +fma.rn.f64 fd214, fd204, fd203, fd213; +mul.f64 fd215, fd200, fd214; +mul.f64 fd216, fd199, fd214; +mul.f64 fd217, fd212, fd200; +mul.f64 fd218, fd203, fd212; +mul.f64 fd219, fd204, fd214; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd203, fd214; +fma.rn.f64 fd222, fd204, fd212, fd221; +mul.f64 fd223, fd194, fd222; +mul.f64 fd224, fd193, fd222; +mul.f64 fd225, fd220, fd194; +ld.global.v2.f64 {fd226, fd227}, [rd11+96]; +mul.f64 fd230, fd198, fd227; +mul.f64 fd231, fd197, fd227; +mul.f64 fd232, fd226, fd198; +mul.f64 fd233, fd203, fd226; +mul.f64 fd234, fd204, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd203, fd227; +fma.rn.f64 fd237, fd204, fd226, fd236; +mul.f64 fd238, fd202, fd237; +mul.f64 fd239, fd201, fd237; +mul.f64 fd240, fd235, fd202; +shl.b32 r14, r13, 4; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 576, r15; +add.f64 fd241, fd154, fd170; +add.f64 fd242, fd152, fd168; +st.shared.v2.f64 [r16], {fd242, fd241}; +fma.rn.f64 fd243, fd203, fd195, fd207; +sub.f64 fd244, fd209, fd208; +st.shared.v2.f64 [r16+96], {fd243, fd244}; +fma.rn.f64 fd245, fd212, fd199, fd215; +sub.f64 fd246, fd217, fd216; +st.shared.v2.f64 [r16+192], {fd245, fd246}; +fma.rn.f64 fd247, fd220, fd193, fd223; +sub.f64 fd248, fd225, fd224; +st.shared.v2.f64 [r16+288], {fd247, fd248}; +fma.rn.f64 fd249, fd226, fd197, fd230; +sub.f64 fd250, fd232, fd231; +st.shared.v2.f64 [r16+384], {fd249, fd250}; +fma.rn.f64 fd251, fd235, fd201, fd238; +sub.f64 fd252, fd240, fd239; +st.shared.v2.f64 [r16+480], {fd251, fd252}; +barrier.sync 0; +ld.shared.v2.f64 {fd253, fd254}, [r10]; +ld.shared.v2.f64 {fd257, fd258}, [r10+576]; +ld.shared.v2.f64 {fd261, fd262}, [r10+1152]; 
+ld.shared.v2.f64 {fd265, fd266}, [r10+1728]; +ld.shared.v2.f64 {fd269, fd270}, [r10+2304]; +ld.shared.v2.f64 {fd273, fd274}, [r10+2880]; +add.f64 fd277, fd261, fd269; +add.f64 fd278, fd253, fd277; +add.f64 fd279, fd262, fd270; +add.f64 fd280, fd254, fd279; +mul.f64 fd281, fd277, 0d3FE0000000000000; +sub.f64 fd282, fd253, fd281; +sub.f64 fd283, fd262, fd270; +mul.f64 fd284, fd283, 0dBFEBB67AE8584CAA; +add.f64 fd285, fd284, fd282; +sub.f64 fd286, fd282, fd284; +mul.f64 fd287, fd279, 0d3FE0000000000000; +sub.f64 fd288, fd254, fd287; +sub.f64 fd289, fd261, fd269; +mul.f64 fd290, fd289, 0dBFEBB67AE8584CAA; +sub.f64 fd291, fd288, fd290; +add.f64 fd292, fd290, fd288; +add.f64 fd293, fd265, fd273; +add.f64 fd294, fd257, fd293; +add.f64 fd295, fd266, fd274; +add.f64 fd296, fd258, fd295; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd257, fd297; +sub.f64 fd299, fd266, fd274; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +mul.f64 fd303, fd295, 0d3FE0000000000000; +sub.f64 fd304, fd258, fd303; +sub.f64 fd305, fd265, fd273; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +mul.f64 fd309, fd301, 0d3FE0000000000000; +mul.f64 fd310, fd307, 0d3FEBB67AE8584CAA; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd307, 0d3FE0000000000000; +fma.rn.f64 fd313, fd301, 0d3FEBB67AE8584CAA, fd312; +mul.f64 fd314, fd302, 0dBFE0000000000000; +mul.f64 fd315, fd308, 0d3FEBB67AE8584CAA; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd308, 0dBFE0000000000000; +fma.rn.f64 fd318, fd302, 0d3FEBB67AE8584CAA, fd317; +add.f64 %1, fd280, fd296; +add.f64 %0, fd278, fd294; +add.f64 %3, fd291, fd313; +add.f64 %2, fd285, fd311; +add.f64 %5, fd292, fd318; +add.f64 %4, fd286, fd316; +sub.f64 %7, fd280, fd296; +sub.f64 %6, fd278, fd294; +sub.f64 %9, fd291, fd313; +sub.f64 %8, fd285, fd311; +sub.f64 %11, fd292, fd318; +sub.f64 %10, fd286, fd316; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), 
"=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_216), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..fee482b94f95e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_fwd.hpp.inc @@ -0,0 +1,31749 @@ +#ifndef CUFFTDX_FFT_2187_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_2187_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<891, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<771>; +.reg .b32 r<8646>; +.reg .b64 rd<6>; +mov.u32 r8572, %54; +mov.u32 r8645, %tid.y; +mad.lo.s32 r8573, r8645, 17496, r8572; +mov.u32 r8574, %tid.x; +mov.f32 f762, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1, {low, high}; +} +mov.f32 f764, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, 
r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r178, {low, high}; +} +{ 
+neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r265, {low, high}; +} +mov.f32 f544, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r266, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r267, {low, high}; +} +mov.f32 f556, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r268, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r271, {low, high}; +} +mov.f32 f580, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ 
+mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r434, {low, high}; +} +{ 
+neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 
r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ 
+add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, 
r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 
high, f764; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %82, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 
r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %82, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %82, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %83, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %83, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %83, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %83, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %83, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 
r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; 
+} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, 
r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 
r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, 
r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, 
r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, 
r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ 
+add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, 
r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; 
+cvt.rn.f16.f32 high, f762; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, 
r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r8574, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r8575, rd3; +mul.lo.s32 r8576, r8575, 81; +sub.s32 r8577, r8574, r8576; +cvt.rn.f32.u32 f765, r8577; +mul.f32 f766, f765, 0f3B3C4870; +cos.approx.f32 f309, f766; +sin.approx.f32 f767, f766; +neg.f32 f310, f767; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, 
r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 
high, f726; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, 
r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 
r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ 
+mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; 
+mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, 
r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r8578, r8575, 17496, r8573; +barrier.sync 0; +mad.lo.s32 r8579, r8577, 216, r8578; +st.shared.v2.f32 [r8579], {r2140, r2146}; +st.shared.v2.f32 [r8579+8], {r2937, r2944}; +st.shared.v2.f32 [r8579+16], {r2974, r2981}; +st.shared.v2.f32 [r8579+24], {r3011, r3018}; +st.shared.v2.f32 [r8579+32], {r3048, r3055}; +st.shared.v2.f32 [r8579+40], {r3085, r3092}; +st.shared.v2.f32 [r8579+48], {r3122, r3129}; +st.shared.v2.f32 [r8579+56], {r3159, r3166}; +st.shared.v2.f32 [r8579+64], {r3196, r3203}; +st.shared.v2.f32 [r8579+72], {r3233, r3240}; +st.shared.v2.f32 [r8579+80], {r3270, r3277}; +st.shared.v2.f32 [r8579+88], {r3307, r3314}; +st.shared.v2.f32 [r8579+96], {r3344, r3351}; +st.shared.v2.f32 [r8579+104], {r3381, r3388}; +st.shared.v2.f32 [r8579+112], {r3418, r3425}; +st.shared.v2.f32 [r8579+120], {r3455, r3462}; 
+st.shared.v2.f32 [r8579+128], {r3492, r3499}; +st.shared.v2.f32 [r8579+136], {r3529, r3536}; +st.shared.v2.f32 [r8579+144], {r3566, r3573}; +st.shared.v2.f32 [r8579+152], {r3603, r3610}; +st.shared.v2.f32 [r8579+160], {r3640, r3647}; +st.shared.v2.f32 [r8579+168], {r3677, r3684}; +st.shared.v2.f32 [r8579+176], {r3714, r3721}; +st.shared.v2.f32 [r8579+184], {r3751, r3758}; +st.shared.v2.f32 [r8579+192], {r3788, r3795}; +st.shared.v2.f32 [r8579+200], {r3825, r3832}; +st.shared.v2.f32 [r8579+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r8580, r8577, -208, r8579; +ld.shared.u32 r3898, [r8580]; +ld.shared.u32 r3904, [r8580+4]; +ld.shared.u32 r4506, [r8580+648]; +ld.shared.u32 r4512, [r8580+652]; +ld.shared.u32 r5114, [r8580+1296]; +ld.shared.u32 r5120, [r8580+1300]; +ld.shared.u32 r3986, [r8580+1944]; +ld.shared.u32 r3992, [r8580+1948]; +ld.shared.u32 r4594, [r8580+2592]; +ld.shared.u32 r4600, [r8580+2596]; +ld.shared.u32 r5202, [r8580+3240]; +ld.shared.u32 r5208, [r8580+3244]; +ld.shared.u32 r4074, [r8580+3888]; +ld.shared.u32 r4080, [r8580+3892]; +ld.shared.u32 r4682, [r8580+4536]; +ld.shared.u32 r4688, [r8580+4540]; +ld.shared.u32 r5290, [r8580+5184]; +ld.shared.u32 r5296, [r8580+5188]; +ld.shared.u32 r3895, [r8580+5832]; +ld.shared.u32 r3901, [r8580+5836]; +ld.shared.u32 r4503, [r8580+6480]; +ld.shared.u32 r4509, [r8580+6484]; +ld.shared.u32 r5111, [r8580+7128]; +ld.shared.u32 r5117, [r8580+7132]; +ld.shared.u32 r3983, [r8580+7776]; +ld.shared.u32 r3989, [r8580+7780]; +ld.shared.u32 r4591, [r8580+8424]; +ld.shared.u32 r4597, [r8580+8428]; +ld.shared.u32 r5199, [r8580+9072]; +ld.shared.u32 r5205, [r8580+9076]; +ld.shared.u32 r4071, [r8580+9720]; +ld.shared.u32 r4077, [r8580+9724]; +ld.shared.u32 r4679, [r8580+10368]; +ld.shared.u32 r4685, [r8580+10372]; +ld.shared.u32 r5287, [r8580+11016]; +ld.shared.u32 r5293, [r8580+11020]; +ld.shared.u32 r3896, [r8580+11664]; +ld.shared.u32 r3902, [r8580+11668]; +ld.shared.u32 r4504, [r8580+12312]; +ld.shared.u32 r4510, 
[r8580+12316]; +ld.shared.u32 r5112, [r8580+12960]; +ld.shared.u32 r5118, [r8580+12964]; +ld.shared.u32 r3984, [r8580+13608]; +ld.shared.u32 r3990, [r8580+13612]; +ld.shared.u32 r4592, [r8580+14256]; +ld.shared.u32 r4598, [r8580+14260]; +ld.shared.u32 r5200, [r8580+14904]; +ld.shared.u32 r5206, [r8580+14908]; +ld.shared.u32 r4072, [r8580+15552]; +ld.shared.u32 r4078, [r8580+15556]; +ld.shared.u32 r4680, [r8580+16200]; +ld.shared.u32 r4686, [r8580+16204]; +ld.shared.u32 r5288, [r8580+16848]; +ld.shared.u32 r5294, [r8580+16852]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, 
r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, 
r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; 
+cvt.rn.f16.f32 high, f762; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, 
r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 
r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 
r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ 
+add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ 
+add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5377, {low, high}; +} +{ 
+mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; 
+} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ 
+mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5724, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} 
+{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} 
+{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, 
r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; 
+cvt.rn.f16.f32 high, f762; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, 
r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 
r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, 
r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r8577, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r8581, rd5; +sub.s32 r8582, r8577, r8581; +shr.u32 r8583, r8582, 1; +add.s32 r8584, r8583, r8581; +shr.u32 r8585, r8584, 4; +cvt.rn.f32.u32 f768, r8585; +mul.f32 f769, f768, 0f3D9EDD1F; +cos.approx.f32 f673, f769; +sin.approx.f32 f770, f769; +neg.f32 f674, f770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +mul.lo.s32 r8586, r8585, 27; +sub.s32 r8587, r8577, r8586; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6854, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; +} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ +mul.f16x2 r6941, r6381, r6930; +} +{ 
+fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ +fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ +fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, low}; +} +{ +fma.rn.f16x2 r7109, 
r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; +} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ +neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, r7189; +} +{ +neg.f16x2 r7194, r7191; 
+} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ +mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, r7331, r7355; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 
r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} +{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ +mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ +neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, {low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7618, {low, high}; +} +{ 
+mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ +fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; 
+mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ +mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +shl.b32 r8588, r8587, 3; +add.s32 r8589, r8578, r8588; +barrier.sync 0; +mad.lo.s32 r8590, r8585, 5832, r8589; +st.shared.u32 [r8590], r6029; +st.shared.u32 [r8590+4], r6035; +st.shared.u32 [r8590+216], r6826; +st.shared.u32 [r8590+220], r6833; +st.shared.u32 [r8590+432], r6863; +st.shared.u32 [r8590+436], r6870; +st.shared.u32 [r8590+648], r6900; +st.shared.u32 [r8590+652], r6907; +st.shared.u32 [r8590+864], r6937; +st.shared.u32 [r8590+868], r6944; +st.shared.u32 [r8590+1080], r6974; +st.shared.u32 [r8590+1084], r6981; +st.shared.u32 [r8590+1296], r7011; +st.shared.u32 [r8590+1300], r7018; +st.shared.u32 [r8590+1512], r7048; +st.shared.u32 [r8590+1516], r7055; +st.shared.u32 [r8590+1728], r7085; +st.shared.u32 [r8590+1732], r7092; +st.shared.u32 [r8590+1944], r7122; +st.shared.u32 [r8590+1948], 
r7129; +st.shared.u32 [r8590+2160], r7159; +st.shared.u32 [r8590+2164], r7166; +st.shared.u32 [r8590+2376], r7196; +st.shared.u32 [r8590+2380], r7203; +st.shared.u32 [r8590+2592], r7233; +st.shared.u32 [r8590+2596], r7240; +st.shared.u32 [r8590+2808], r7270; +st.shared.u32 [r8590+2812], r7277; +st.shared.u32 [r8590+3024], r7307; +st.shared.u32 [r8590+3028], r7314; +st.shared.u32 [r8590+3240], r7344; +st.shared.u32 [r8590+3244], r7351; +st.shared.u32 [r8590+3456], r7381; +st.shared.u32 [r8590+3460], r7388; +st.shared.u32 [r8590+3672], r7418; +st.shared.u32 [r8590+3676], r7425; +st.shared.u32 [r8590+3888], r7455; +st.shared.u32 [r8590+3892], r7462; +st.shared.u32 [r8590+4104], r7492; +st.shared.u32 [r8590+4108], r7499; +st.shared.u32 [r8590+4320], r7529; +st.shared.u32 [r8590+4324], r7536; +st.shared.u32 [r8590+4536], r7566; +st.shared.u32 [r8590+4540], r7573; +st.shared.u32 [r8590+4752], r7603; +st.shared.u32 [r8590+4756], r7610; +st.shared.u32 [r8590+4968], r7640; +st.shared.u32 [r8590+4972], r7647; +st.shared.u32 [r8590+5184], r7677; +st.shared.u32 [r8590+5188], r7684; +st.shared.u32 [r8590+5400], r7714; +st.shared.u32 [r8590+5404], r7721; +st.shared.u32 [r8590+5616], r7751; +st.shared.u32 [r8590+5620], r7758; +barrier.sync 0; +ld.shared.u32 r7787, [r8580]; +ld.shared.u32 r7793, [r8580+4]; +ld.shared.u32 r7875, [r8580+648]; +ld.shared.u32 r7881, [r8580+652]; +ld.shared.u32 r7963, [r8580+1296]; +ld.shared.u32 r7969, [r8580+1300]; +ld.shared.u32 r8051, [r8580+1944]; +ld.shared.u32 r8057, [r8580+1948]; +ld.shared.u32 r8139, [r8580+2592]; +ld.shared.u32 r8145, [r8580+2596]; +ld.shared.u32 r8227, [r8580+3240]; +ld.shared.u32 r8233, [r8580+3244]; +ld.shared.u32 r8315, [r8580+3888]; +ld.shared.u32 r8321, [r8580+3892]; +ld.shared.u32 r8403, [r8580+4536]; +ld.shared.u32 r8409, [r8580+4540]; +ld.shared.u32 r8491, [r8580+5184]; +ld.shared.u32 r8497, [r8580+5188]; +ld.shared.u32 r7784, [r8580+5832]; +ld.shared.u32 r7790, [r8580+5836]; +ld.shared.u32 r7872, [r8580+6480]; 
+ld.shared.u32 r7878, [r8580+6484]; +ld.shared.u32 r7960, [r8580+7128]; +ld.shared.u32 r7966, [r8580+7132]; +ld.shared.u32 r8048, [r8580+7776]; +ld.shared.u32 r8054, [r8580+7780]; +ld.shared.u32 r8136, [r8580+8424]; +ld.shared.u32 r8142, [r8580+8428]; +ld.shared.u32 r8224, [r8580+9072]; +ld.shared.u32 r8230, [r8580+9076]; +ld.shared.u32 r8312, [r8580+9720]; +ld.shared.u32 r8318, [r8580+9724]; +ld.shared.u32 r8400, [r8580+10368]; +ld.shared.u32 r8406, [r8580+10372]; +ld.shared.u32 r8488, [r8580+11016]; +ld.shared.u32 r8494, [r8580+11020]; +ld.shared.u32 r7785, [r8580+11664]; +ld.shared.u32 r7791, [r8580+11668]; +ld.shared.u32 r7873, [r8580+12312]; +ld.shared.u32 r7879, [r8580+12316]; +ld.shared.u32 r7961, [r8580+12960]; +ld.shared.u32 r7967, [r8580+12964]; +ld.shared.u32 r8049, [r8580+13608]; +ld.shared.u32 r8055, [r8580+13612]; +ld.shared.u32 r8137, [r8580+14256]; +ld.shared.u32 r8143, [r8580+14260]; +ld.shared.u32 r8225, [r8580+14904]; +ld.shared.u32 r8231, [r8580+14908]; +ld.shared.u32 r8313, [r8580+15552]; +ld.shared.u32 r8319, [r8580+15556]; +ld.shared.u32 r8401, [r8580+16200]; +ld.shared.u32 r8407, [r8580+16204]; +ld.shared.u32 r8489, [r8580+16848]; +ld.shared.u32 r8495, [r8580+16852]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 %0, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 %1, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, r7781; +} +{ +add.f16x2 %18, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ 
+mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 %36, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 %19, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 %37, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 %2, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 %3, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 %20, r7889, r7895; +} +{ +add.f16x2 r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 %38, r7907, r7913; +} +{ +add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 %21, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, r7946, r7869; +} +{ +add.f16x2 %39, r7943, r7949; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; 
+cvt.rn.f16.f32 high, f764; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 %4, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 %5, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 %22, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ +add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 %40, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 %23, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 %41, r8031, r8037; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8044, {low, high}; +} +{ +neg.f16x2 r8045, r8044; +} +{ +add.f16x2 r8047, r8048, r8049; +} +{ +add.f16x2 %6, r8051, r8047; +} +{ +add.f16x2 r8053, r8054, r8055; +} +{ +add.f16x2 %7, r8057, r8053; +} +{ +add.f16x2 r8059, r8048, r8049; +} +{ +mul.f16x2 r8062, r8059, r8043; +} +{ +add.f16x2 r8065, r8051, r8062; +} +{ +sub.f16x2 r8068, r8054, r8055; +} +{ +mul.f16x2 r8071, r8068, r8045; +} +{ +add.f16x2 %24, r8065, r8071; +} +{ +add.f16x2 r8077, r8048, r8049; +} +{ +mul.f16x2 r8080, r8077, r8043; +} +{ +add.f16x2 r8083, r8051, r8080; +} +{ +sub.f16x2 r8086, r8054, r8055; +} +{ +mul.f16x2 r8089, r8086, r8045; +} +{ +sub.f16x2 %42, r8083, r8089; +} +{ 
+add.f16x2 r8095, r8054, r8055; +} +{ +mul.f16x2 r8098, r8095, r8043; +} +{ +add.f16x2 r8101, r8057, r8098; +} +{ +sub.f16x2 r8104, r8048, r8049; +} +{ +mul.f16x2 r8107, r8104, r8045; +} +{ +sub.f16x2 %25, r8101, r8107; +} +{ +add.f16x2 r8113, r8054, r8055; +} +{ +mul.f16x2 r8116, r8113, r8043; +} +{ +add.f16x2 r8119, r8057, r8116; +} +{ +sub.f16x2 r8122, r8048, r8049; +} +{ +mul.f16x2 r8125, r8122, r8045; +} +{ +add.f16x2 %43, r8119, r8125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8132, {low, high}; +} +{ +neg.f16x2 r8133, r8132; +} +{ +add.f16x2 r8135, r8136, r8137; +} +{ +add.f16x2 %8, r8139, r8135; +} +{ +add.f16x2 r8141, r8142, r8143; +} +{ +add.f16x2 %9, r8145, r8141; +} +{ +add.f16x2 r8147, r8136, r8137; +} +{ +mul.f16x2 r8150, r8147, r8131; +} +{ +add.f16x2 r8153, r8139, r8150; +} +{ +sub.f16x2 r8156, r8142, r8143; +} +{ +mul.f16x2 r8159, r8156, r8133; +} +{ +add.f16x2 %26, r8153, r8159; +} +{ +add.f16x2 r8165, r8136, r8137; +} +{ +mul.f16x2 r8168, r8165, r8131; +} +{ +add.f16x2 r8171, r8139, r8168; +} +{ +sub.f16x2 r8174, r8142, r8143; +} +{ +mul.f16x2 r8177, r8174, r8133; +} +{ +sub.f16x2 %44, r8171, r8177; +} +{ +add.f16x2 r8183, r8142, r8143; +} +{ +mul.f16x2 r8186, r8183, r8131; +} +{ +add.f16x2 r8189, r8145, r8186; +} +{ +sub.f16x2 r8192, r8136, r8137; +} +{ +mul.f16x2 r8195, r8192, r8133; +} +{ +sub.f16x2 %27, r8189, r8195; +} +{ +add.f16x2 r8201, r8142, r8143; +} +{ +mul.f16x2 r8204, r8201, r8131; +} +{ +add.f16x2 r8207, r8145, r8204; +} +{ +sub.f16x2 r8210, r8136, r8137; +} +{ +mul.f16x2 r8213, r8210, r8133; +} +{ +add.f16x2 %45, r8207, r8213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8219, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8220, {low, high}; +} +{ +neg.f16x2 r8221, 
r8220; +} +{ +add.f16x2 r8223, r8224, r8225; +} +{ +add.f16x2 %10, r8227, r8223; +} +{ +add.f16x2 r8229, r8230, r8231; +} +{ +add.f16x2 %11, r8233, r8229; +} +{ +add.f16x2 r8235, r8224, r8225; +} +{ +mul.f16x2 r8238, r8235, r8219; +} +{ +add.f16x2 r8241, r8227, r8238; +} +{ +sub.f16x2 r8244, r8230, r8231; +} +{ +mul.f16x2 r8247, r8244, r8221; +} +{ +add.f16x2 %28, r8241, r8247; +} +{ +add.f16x2 r8253, r8224, r8225; +} +{ +mul.f16x2 r8256, r8253, r8219; +} +{ +add.f16x2 r8259, r8227, r8256; +} +{ +sub.f16x2 r8262, r8230, r8231; +} +{ +mul.f16x2 r8265, r8262, r8221; +} +{ +sub.f16x2 %46, r8259, r8265; +} +{ +add.f16x2 r8271, r8230, r8231; +} +{ +mul.f16x2 r8274, r8271, r8219; +} +{ +add.f16x2 r8277, r8233, r8274; +} +{ +sub.f16x2 r8280, r8224, r8225; +} +{ +mul.f16x2 r8283, r8280, r8221; +} +{ +sub.f16x2 %29, r8277, r8283; +} +{ +add.f16x2 r8289, r8230, r8231; +} +{ +mul.f16x2 r8292, r8289, r8219; +} +{ +add.f16x2 r8295, r8233, r8292; +} +{ +sub.f16x2 r8298, r8224, r8225; +} +{ +mul.f16x2 r8301, r8298, r8221; +} +{ +add.f16x2 %47, r8295, r8301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8307, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8308, {low, high}; +} +{ +neg.f16x2 r8309, r8308; +} +{ +add.f16x2 r8311, r8312, r8313; +} +{ +add.f16x2 %12, r8315, r8311; +} +{ +add.f16x2 r8317, r8318, r8319; +} +{ +add.f16x2 %13, r8321, r8317; +} +{ +add.f16x2 r8323, r8312, r8313; +} +{ +mul.f16x2 r8326, r8323, r8307; +} +{ +add.f16x2 r8329, r8315, r8326; +} +{ +sub.f16x2 r8332, r8318, r8319; +} +{ +mul.f16x2 r8335, r8332, r8309; +} +{ +add.f16x2 %30, r8329, r8335; +} +{ +add.f16x2 r8341, r8312, r8313; +} +{ +mul.f16x2 r8344, r8341, r8307; +} +{ +add.f16x2 r8347, r8315, r8344; +} +{ +sub.f16x2 r8350, r8318, r8319; +} +{ +mul.f16x2 r8353, r8350, r8309; +} +{ +sub.f16x2 %48, r8347, r8353; +} +{ +add.f16x2 r8359, r8318, r8319; +} +{ +mul.f16x2 r8362, r8359, r8307; +} +{ 
+add.f16x2 r8365, r8321, r8362; +} +{ +sub.f16x2 r8368, r8312, r8313; +} +{ +mul.f16x2 r8371, r8368, r8309; +} +{ +sub.f16x2 %31, r8365, r8371; +} +{ +add.f16x2 r8377, r8318, r8319; +} +{ +mul.f16x2 r8380, r8377, r8307; +} +{ +add.f16x2 r8383, r8321, r8380; +} +{ +sub.f16x2 r8386, r8312, r8313; +} +{ +mul.f16x2 r8389, r8386, r8309; +} +{ +add.f16x2 %49, r8383, r8389; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8396, {low, high}; +} +{ +neg.f16x2 r8397, r8396; +} +{ +add.f16x2 r8399, r8400, r8401; +} +{ +add.f16x2 %14, r8403, r8399; +} +{ +add.f16x2 r8405, r8406, r8407; +} +{ +add.f16x2 %15, r8409, r8405; +} +{ +add.f16x2 r8411, r8400, r8401; +} +{ +mul.f16x2 r8414, r8411, r8395; +} +{ +add.f16x2 r8417, r8403, r8414; +} +{ +sub.f16x2 r8420, r8406, r8407; +} +{ +mul.f16x2 r8423, r8420, r8397; +} +{ +add.f16x2 %32, r8417, r8423; +} +{ +add.f16x2 r8429, r8400, r8401; +} +{ +mul.f16x2 r8432, r8429, r8395; +} +{ +add.f16x2 r8435, r8403, r8432; +} +{ +sub.f16x2 r8438, r8406, r8407; +} +{ +mul.f16x2 r8441, r8438, r8397; +} +{ +sub.f16x2 %50, r8435, r8441; +} +{ +add.f16x2 r8447, r8406, r8407; +} +{ +mul.f16x2 r8450, r8447, r8395; +} +{ +add.f16x2 r8453, r8409, r8450; +} +{ +sub.f16x2 r8456, r8400, r8401; +} +{ +mul.f16x2 r8459, r8456, r8397; +} +{ +sub.f16x2 %33, r8453, r8459; +} +{ +add.f16x2 r8465, r8406, r8407; +} +{ +mul.f16x2 r8468, r8465, r8395; +} +{ +add.f16x2 r8471, r8409, r8468; +} +{ +sub.f16x2 r8474, r8400, r8401; +} +{ +mul.f16x2 r8477, r8474, r8397; +} +{ +add.f16x2 %51, r8471, r8477; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8484, {low, high}; +} +{ +neg.f16x2 r8485, r8484; +} +{ +add.f16x2 r8487, r8488, r8489; +} +{ +add.f16x2 %16, r8491, 
r8487; +} +{ +add.f16x2 r8493, r8494, r8495; +} +{ +add.f16x2 %17, r8497, r8493; +} +{ +add.f16x2 r8499, r8488, r8489; +} +{ +mul.f16x2 r8502, r8499, r8483; +} +{ +add.f16x2 r8505, r8491, r8502; +} +{ +sub.f16x2 r8508, r8494, r8495; +} +{ +mul.f16x2 r8511, r8508, r8485; +} +{ +add.f16x2 %34, r8505, r8511; +} +{ +add.f16x2 r8517, r8488, r8489; +} +{ +mul.f16x2 r8520, r8517, r8483; +} +{ +add.f16x2 r8523, r8491, r8520; +} +{ +sub.f16x2 r8526, r8494, r8495; +} +{ +mul.f16x2 r8529, r8526, r8485; +} +{ +sub.f16x2 %52, r8523, r8529; +} +{ +add.f16x2 r8535, r8494, r8495; +} +{ +mul.f16x2 r8538, r8535, r8483; +} +{ +add.f16x2 r8541, r8497, r8538; +} +{ +sub.f16x2 r8544, r8488, r8489; +} +{ +mul.f16x2 r8547, r8544, r8485; +} +{ +sub.f16x2 %35, r8541, r8547; +} +{ +add.f16x2 r8553, r8494, r8495; +} +{ +mul.f16x2 r8556, r8553, r8483; +} +{ +add.f16x2 r8559, r8497, r8556; +} +{ +sub.f16x2 r8562, r8488, r8489; +} +{ +mul.f16x2 r8565, r8562, r8485; +} +{ +add.f16x2 %53, r8559, r8565; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), 
"=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<890, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<250>; +.reg .b32 r<3009>; +.reg .b64 rd<8>; +mov.u32 r2986, %tid.y; +mov.u32 r2987, %18; +mad.lo.s32 r2988, r2986, 17496, r2987; +mov.u32 r2989, %tid.x; +mov.f32 f238, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1, {low, high}; +} +mov.f32 f240, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} 
+{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, 
r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r265, {low, high}; +} +mov.f32 f168, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r266, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r267, {low, high}; +} +mov.f32 f172, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r268, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r271, {low, high}; +} +mov.f32 f180, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, 
r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2989, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r2990, rd3; +mul.lo.s32 r2991, r2990, 243; +sub.s32 r2992, r2989, r2991; +cvt.rn.f32.u32 f241, r2992; +mul.f32 f242, f241, 0f3B3C4870; +cos.approx.f32 f57, f242; +sin.approx.f32 f243, f242; +neg.f32 f58, f243; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; 
+cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, 
r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r2993, r2990, 17496, r2988; +barrier.sync 0; +mad.lo.s32 r2994, r2992, 72, r2993; +st.shared.v2.f32 [r2994], {r352, r358}; +st.shared.v2.f32 [r2994+8], {r621, r628}; +st.shared.v2.f32 [r2994+16], {r658, r665}; +st.shared.v2.f32 [r2994+24], {r695, r702}; +st.shared.v2.f32 [r2994+32], {r732, r739}; +st.shared.v2.f32 [r2994+40], {r769, r776}; +st.shared.v2.f32 [r2994+48], {r806, r813}; +st.shared.v2.f32 [r2994+56], {r843, r850}; +st.shared.v2.f32 [r2994+64], {r880, r887}; +barrier.sync 0; +shl.b32 r2995, r2992, 6; +sub.s32 r2996, r2994, r2995; +ld.shared.u32 r916, [r2996]; +ld.shared.u32 r922, [r2996+4]; +ld.shared.u32 r1004, [r2996+1944]; +ld.shared.u32 r1010, [r2996+1948]; +ld.shared.u32 r1092, [r2996+3888]; +ld.shared.u32 r1098, [r2996+3892]; +ld.shared.u32 r913, [r2996+5832]; +ld.shared.u32 r919, [r2996+5836]; +ld.shared.u32 r1001, [r2996+7776]; +ld.shared.u32 r1007, [r2996+7780]; +ld.shared.u32 r1089, [r2996+9720]; +ld.shared.u32 r1095, [r2996+9724]; +ld.shared.u32 r914, [r2996+11664]; +ld.shared.u32 r920, [r2996+11668]; +ld.shared.u32 r1002, [r2996+13608]; +ld.shared.u32 r1008, [r2996+13612]; +ld.shared.u32 r1090, [r2996+15552]; +ld.shared.u32 r1096, [r2996+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; 
+} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, 
r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, 
f180; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} 
+{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 
r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2992, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2997, rd5; +cvt.rn.f32.u32 f244, r2997; +mul.f32 f245, f244, 0f3CD3D17E; +cos.approx.f32 f133, f245; +sin.approx.f32 f246, f245; +neg.f32 f134, f246; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +mul.lo.s32 r2998, r2997, 9; +sub.s32 r2999, r2992, r2998; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ 
+fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ 
+neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +shl.b32 r3000, r2999, 3; +add.s32 r3001, r2993, r3000; +barrier.sync 0; +mad.lo.s32 r3002, r2997, 648, r3001; +st.shared.u32 [r3002], r1259; +st.shared.u32 [r3002+4], r1265; +st.shared.u32 [r3002+72], r1528; +st.shared.u32 [r3002+76], r1535; +st.shared.u32 [r3002+144], r1565; +st.shared.u32 [r3002+148], r1572; +st.shared.u32 [r3002+216], r1602; 
+st.shared.u32 [r3002+220], r1609; +st.shared.u32 [r3002+288], r1639; +st.shared.u32 [r3002+292], r1646; +st.shared.u32 [r3002+360], r1676; +st.shared.u32 [r3002+364], r1683; +st.shared.u32 [r3002+432], r1713; +st.shared.u32 [r3002+436], r1720; +st.shared.u32 [r3002+504], r1750; +st.shared.u32 [r3002+508], r1757; +st.shared.u32 [r3002+576], r1787; +st.shared.u32 [r3002+580], r1794; +barrier.sync 0; +ld.shared.u32 r1823, [r2996]; +ld.shared.u32 r1829, [r2996+4]; +ld.shared.u32 r1911, [r2996+1944]; +ld.shared.u32 r1917, [r2996+1948]; +ld.shared.u32 r1999, [r2996+3888]; +ld.shared.u32 r2005, [r2996+3892]; +ld.shared.u32 r1820, [r2996+5832]; +ld.shared.u32 r1826, [r2996+5836]; +ld.shared.u32 r1908, [r2996+7776]; +ld.shared.u32 r1914, [r2996+7780]; +ld.shared.u32 r1996, [r2996+9720]; +ld.shared.u32 r2002, [r2996+9724]; +ld.shared.u32 r1821, [r2996+11664]; +ld.shared.u32 r1827, [r2996+11668]; +ld.shared.u32 r1909, [r2996+13608]; +ld.shared.u32 r1915, [r2996+13612]; +ld.shared.u32 r1997, [r2996+15552]; +ld.shared.u32 r2003, [r2996+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ 
+add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ 
+add.f16x2 r1998, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, 
r2114; +} +{ +mul.f16x2 r2120, r2022, r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ +mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 r2166, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 r2172, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ +sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, r2184, r2161; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 r2208, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 r2226, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 
high, f240; +mov.b32 r2248, {low, high}; +} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 r2254, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 r2260, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 r2278, r2269, r2275; +} +{ +add.f16x2 r2281, r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 r2314, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, r2101, r2117; +} +{ +mul.f16x2 r2329, r2326, r2249; +} +{ +add.f16x2 r2332, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 r2342, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 r2348, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 r2384, r2375, r2381; 
+} +{ +add.f16x2 r2387, r2139, r2155; +} +{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} +{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 r2420, r2411, r2417; +} +mul.wide.u32 rd6, r2992, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3003, rd7; +cvt.rn.f32.u32 f247, r3003; +mul.f32 f248, f247, 0f3E6E4BAE; +cos.approx.f32 f209, f248; +sin.approx.f32 f249, f248; +neg.f32 f210, f249; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2423, {low, high}; +} +mul.lo.s32 r3004, r3003, 81; +sub.s32 r3005, r2992, r3004; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2426, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2428, {high, high}; +} +{ +mul.f16x2 r2430, r2260, r2428; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r2254, r2426, r2433; +} +{ +mul.f16x2 r2439, r2254, r2428; +} +{ +fma.rn.f16x2 r2442, r2260, r2426, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2448, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2450, {low, high}; +} +{ +mul.f16x2 r2451, r2448, r2450; +} +{ +mul.f16x2 r2454, r2423, r2446; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2457, {high, low}; +} +{ +fma.rn.f16x2 r2459, r2451, r2457, r2454; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2463, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2465, {high, high}; +} +{ +mul.f16x2 r2467, r2348, r2465; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r2342, 
r2463, r2470; +} +{ +mul.f16x2 r2476, r2342, r2465; +} +{ +fma.rn.f16x2 r2479, r2348, r2463, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2485, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2487, {low, high}; +} +{ +mul.f16x2 r2488, r2485, r2487; +} +{ +mul.f16x2 r2491, r2459, r2483; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2494, {high, low}; +} +{ +fma.rn.f16x2 r2496, r2488, r2494, r2491; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2500, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2502, {high, high}; +} +{ +mul.f16x2 r2504, r2226, r2502; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r2190, r2500, r2507; +} +{ +mul.f16x2 r2513, r2190, r2502; +} +{ +fma.rn.f16x2 r2516, r2226, r2500, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2522, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2524, {low, high}; +} +{ +mul.f16x2 r2525, r2522, r2524; +} +{ +mul.f16x2 r2528, r2496, r2520; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2531, {high, low}; +} +{ +fma.rn.f16x2 r2533, r2525, r2531, r2528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2537, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2539, {high, high}; +} +{ +mul.f16x2 r2541, r2314, r2539; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r2278, r2537, r2544; +} +{ +mul.f16x2 r2550, r2278, r2539; +} +{ +fma.rn.f16x2 r2553, r2314, r2537, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2559, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2561, {low, high}; +} +{ +mul.f16x2 r2562, r2559, r2561; +} +{ +mul.f16x2 r2565, r2533, r2557; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2568, {high, low}; +} +{ +fma.rn.f16x2 r2570, r2562, r2568, r2565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2576, {high, high}; +} +{ +mul.f16x2 r2578, r2402, r2576; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r2366, r2574, r2581; +} +{ +mul.f16x2 r2587, r2366, r2576; +} +{ +fma.rn.f16x2 r2590, r2402, r2574, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2598, {low, high}; +} +{ +mul.f16x2 r2599, r2596, r2598; +} +{ +mul.f16x2 r2602, r2570, r2594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2605, {high, low}; +} +{ +fma.rn.f16x2 r2607, r2599, r2605, r2602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2613, {high, high}; +} +{ +mul.f16x2 r2615, r2244, r2613; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r2208, r2611, r2618; +} +{ +mul.f16x2 r2624, r2208, r2613; +} +{ +fma.rn.f16x2 r2627, r2244, r2611, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2635, {low, high}; +} +{ +mul.f16x2 r2636, r2633, r2635; +} +{ +mul.f16x2 r2639, r2607, r2631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; 
+mov.b32 r2642, {high, low}; +} +{ +fma.rn.f16x2 r2644, r2636, r2642, r2639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2650, {high, high}; +} +{ +mul.f16x2 r2652, r2332, r2650; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r2296, r2648, r2655; +} +{ +mul.f16x2 r2661, r2296, r2650; +} +{ +fma.rn.f16x2 r2664, r2332, r2648, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2672, {low, high}; +} +{ +mul.f16x2 r2673, r2670, r2672; +} +{ +mul.f16x2 r2676, r2644, r2668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2679, {high, low}; +} +{ +fma.rn.f16x2 r2681, r2673, r2679, r2676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2687, {high, high}; +} +{ +mul.f16x2 r2689, r2420, r2687; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r2384, r2685, r2692; +} +{ +mul.f16x2 r2698, r2384, r2687; +} +{ +fma.rn.f16x2 r2701, r2420, r2685, r2698; +} +shl.b32 r3006, r3005, 3; +add.s32 r3007, r2993, r3006; +barrier.sync 0; +mad.lo.s32 r3008, r3003, 5832, r3007; +st.shared.u32 [r3008], r2166; +st.shared.u32 [r3008+4], r2172; +st.shared.u32 [r3008+648], r2435; +st.shared.u32 [r3008+652], r2442; +st.shared.u32 [r3008+1296], r2472; +st.shared.u32 [r3008+1300], r2479; +st.shared.u32 [r3008+1944], r2509; +st.shared.u32 [r3008+1948], r2516; +st.shared.u32 [r3008+2592], r2546; +st.shared.u32 [r3008+2596], r2553; +st.shared.u32 [r3008+3240], r2583; +st.shared.u32 [r3008+3244], r2590; +st.shared.u32 [r3008+3888], r2620; +st.shared.u32 [r3008+3892], r2627; +st.shared.u32 [r3008+4536], r2657; +st.shared.u32 [r3008+4540], 
r2664; +st.shared.u32 [r3008+5184], r2694; +st.shared.u32 [r3008+5188], r2701; +barrier.sync 0; +ld.shared.u32 r2730, [r2996]; +ld.shared.u32 r2736, [r2996+4]; +ld.shared.u32 r2818, [r2996+1944]; +ld.shared.u32 r2824, [r2996+1948]; +ld.shared.u32 r2906, [r2996+3888]; +ld.shared.u32 r2912, [r2996+3892]; +ld.shared.u32 r2727, [r2996+5832]; +ld.shared.u32 r2733, [r2996+5836]; +ld.shared.u32 r2815, [r2996+7776]; +ld.shared.u32 r2821, [r2996+7780]; +ld.shared.u32 r2903, [r2996+9720]; +ld.shared.u32 r2909, [r2996+9724]; +ld.shared.u32 r2728, [r2996+11664]; +ld.shared.u32 r2734, [r2996+11668]; +ld.shared.u32 r2816, [r2996+13608]; +ld.shared.u32 r2822, [r2996+13612]; +ld.shared.u32 r2904, [r2996+15552]; +ld.shared.u32 r2910, [r2996+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2723, {low, high}; +} +{ +neg.f16x2 r2724, r2723; +} +{ +add.f16x2 r2726, r2727, r2728; +} +{ +add.f16x2 %0, r2730, r2726; +} +{ +add.f16x2 r2732, r2733, r2734; +} +{ +add.f16x2 %1, r2736, r2732; +} +{ +add.f16x2 r2738, r2727, r2728; +} +{ +mul.f16x2 r2741, r2738, r2722; +} +{ +add.f16x2 r2744, r2730, r2741; +} +{ +sub.f16x2 r2747, r2733, r2734; +} +{ +mul.f16x2 r2750, r2747, r2724; +} +{ +add.f16x2 %6, r2744, r2750; +} +{ +add.f16x2 r2756, r2727, r2728; +} +{ +mul.f16x2 r2759, r2756, r2722; +} +{ +add.f16x2 r2762, r2730, r2759; +} +{ +sub.f16x2 r2765, r2733, r2734; +} +{ +mul.f16x2 r2768, r2765, r2724; +} +{ +sub.f16x2 %12, r2762, r2768; +} +{ +add.f16x2 r2774, r2733, r2734; +} +{ +mul.f16x2 r2777, r2774, r2722; +} +{ +add.f16x2 r2780, r2736, r2777; +} +{ +sub.f16x2 r2783, r2727, r2728; +} +{ +mul.f16x2 r2786, r2783, r2724; +} +{ +sub.f16x2 %7, r2780, r2786; +} +{ +add.f16x2 r2792, r2733, r2734; +} +{ +mul.f16x2 r2795, r2792, r2722; +} +{ +add.f16x2 r2798, r2736, r2795; +} +{ +sub.f16x2 r2801, r2727, r2728; +} +{ +mul.f16x2 r2804, r2801, 
r2724; +} +{ +add.f16x2 %13, r2798, r2804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2810, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2811, {low, high}; +} +{ +neg.f16x2 r2812, r2811; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 %2, r2818, r2814; +} +{ +add.f16x2 r2820, r2821, r2822; +} +{ +add.f16x2 %3, r2824, r2820; +} +{ +add.f16x2 r2826, r2815, r2816; +} +{ +mul.f16x2 r2829, r2826, r2810; +} +{ +add.f16x2 r2832, r2818, r2829; +} +{ +sub.f16x2 r2835, r2821, r2822; +} +{ +mul.f16x2 r2838, r2835, r2812; +} +{ +add.f16x2 %8, r2832, r2838; +} +{ +add.f16x2 r2844, r2815, r2816; +} +{ +mul.f16x2 r2847, r2844, r2810; +} +{ +add.f16x2 r2850, r2818, r2847; +} +{ +sub.f16x2 r2853, r2821, r2822; +} +{ +mul.f16x2 r2856, r2853, r2812; +} +{ +sub.f16x2 %14, r2850, r2856; +} +{ +add.f16x2 r2862, r2821, r2822; +} +{ +mul.f16x2 r2865, r2862, r2810; +} +{ +add.f16x2 r2868, r2824, r2865; +} +{ +sub.f16x2 r2871, r2815, r2816; +} +{ +mul.f16x2 r2874, r2871, r2812; +} +{ +sub.f16x2 %9, r2868, r2874; +} +{ +add.f16x2 r2880, r2821, r2822; +} +{ +mul.f16x2 r2883, r2880, r2810; +} +{ +add.f16x2 r2886, r2824, r2883; +} +{ +sub.f16x2 r2889, r2815, r2816; +} +{ +mul.f16x2 r2892, r2889, r2812; +} +{ +add.f16x2 %15, r2886, r2892; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2898, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2899, {low, high}; +} +{ +neg.f16x2 r2900, r2899; +} +{ +add.f16x2 r2902, r2903, r2904; +} +{ +add.f16x2 %4, r2906, r2902; +} +{ +add.f16x2 r2908, r2909, r2910; +} +{ +add.f16x2 %5, r2912, r2908; +} +{ +add.f16x2 r2914, r2903, r2904; +} +{ +mul.f16x2 r2917, r2914, r2898; +} +{ +add.f16x2 r2920, r2906, r2917; +} +{ +sub.f16x2 r2923, r2909, r2910; +} +{ +mul.f16x2 r2926, r2923, r2900; +} +{ +add.f16x2 %10, r2920, r2926; +} +{ +add.f16x2 r2932, 
r2903, r2904; +} +{ +mul.f16x2 r2935, r2932, r2898; +} +{ +add.f16x2 r2938, r2906, r2935; +} +{ +sub.f16x2 r2941, r2909, r2910; +} +{ +mul.f16x2 r2944, r2941, r2900; +} +{ +sub.f16x2 %16, r2938, r2944; +} +{ +add.f16x2 r2950, r2909, r2910; +} +{ +mul.f16x2 r2953, r2950, r2898; +} +{ +add.f16x2 r2956, r2912, r2953; +} +{ +sub.f16x2 r2959, r2903, r2904; +} +{ +mul.f16x2 r2962, r2959, r2900; +} +{ +sub.f16x2 %11, r2956, r2962; +} +{ +add.f16x2 r2968, r2909, r2910; +} +{ +mul.f16x2 r2971, r2968, r2898; +} +{ +add.f16x2 r2974, r2912, r2971; +} +{ +sub.f16x2 r2977, r2903, r2904; +} +{ +mul.f16x2 r2980, r2977, r2900; +} +{ +add.f16x2 %17, r2974, r2980; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<892, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 
f<250>; +.reg .b32 r<3009>; +.reg .b64 rd<8>; +mov.u32 r2986, %tid.y; +mov.u32 r2987, %18; +mad.lo.s32 r2988, r2986, 8748, r2987; +mov.u32 r2989, %tid.x; +mov.f32 f238, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1, {low, high}; +} +mov.f32 f240, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 
r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r265, {low, high}; +} +mov.f32 f168, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r266, {low, 
high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r267, {low, high}; +} +mov.f32 f172, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r268, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r271, {low, high}; +} +mov.f32 f180, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ 
+sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ 
+add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2989, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r2990, rd3; +mul.lo.s32 r2991, r2990, 243; +sub.s32 r2992, r2989, r2991; +mad.lo.s32 r2993, r2990, 8748, r2988; +cvt.rn.f32.u32 f241, r2992; +mul.f32 f242, f241, 0f3B3C4870; +cos.approx.f32 f57, f242; +sin.approx.f32 f243, f242; +neg.f32 f58, f243; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r2994, r2992, 36, r2993; +st.shared.u32 [r2994], r352; +st.shared.u32 [r2994+4], r621; +st.shared.u32 [r2994+8], r658; +st.shared.u32 [r2994+12], r695; +st.shared.u32 [r2994+16], r732; +st.shared.u32 [r2994+20], r769; +st.shared.u32 [r2994+24], r806; +st.shared.u32 [r2994+28], r843; +st.shared.u32 [r2994+32], r880; 
+barrier.sync 0; +shl.b32 r2995, r2992, 5; +sub.s32 r2996, r2994, r2995; +ld.shared.u32 r916, [r2996]; +ld.shared.u32 r1004, [r2996+972]; +ld.shared.u32 r1092, [r2996+1944]; +ld.shared.u32 r913, [r2996+2916]; +ld.shared.u32 r1001, [r2996+3888]; +ld.shared.u32 r1089, [r2996+4860]; +ld.shared.u32 r914, [r2996+5832]; +ld.shared.u32 r1002, [r2996+6804]; +ld.shared.u32 r1090, [r2996+7776]; +barrier.sync 0; +st.shared.u32 [r2994], r358; +st.shared.u32 [r2994+4], r628; +st.shared.u32 [r2994+8], r665; +st.shared.u32 [r2994+12], r702; +st.shared.u32 [r2994+16], r739; +st.shared.u32 [r2994+20], r776; +st.shared.u32 [r2994+24], r813; +st.shared.u32 [r2994+28], r850; +st.shared.u32 [r2994+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r2996]; +ld.shared.u32 r1010, [r2996+972]; +ld.shared.u32 r1098, [r2996+1944]; +ld.shared.u32 r919, [r2996+2916]; +ld.shared.u32 r1007, [r2996+3888]; +ld.shared.u32 r1095, [r2996+4860]; +ld.shared.u32 r920, [r2996+5832]; +ld.shared.u32 r1008, [r2996+6804]; +ld.shared.u32 r1096, [r2996+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, 
r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, 
r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; 
+} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, 
r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; 
+} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2992, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2997, rd5; +mul.lo.s32 r2998, r2997, 9; +sub.s32 r2999, r2992, r2998; +shl.b32 r3000, r2999, 2; +add.s32 r3001, r2993, r3000; +cvt.rn.f32.u32 f244, r2997; +mul.f32 f245, f244, 0f3CD3D17E; +cos.approx.f32 f133, f245; +sin.approx.f32 f246, f245; +neg.f32 f134, f246; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; 
+mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +barrier.sync 0; +mad.lo.s32 r3002, r2997, 324, r3001; +st.shared.u32 [r3002], r1259; +st.shared.u32 [r3002+36], r1528; +st.shared.u32 [r3002+72], r1565; +st.shared.u32 [r3002+108], r1602; +st.shared.u32 [r3002+144], r1639; +st.shared.u32 [r3002+180], r1676; +st.shared.u32 [r3002+216], r1713; +st.shared.u32 [r3002+252], r1750; +st.shared.u32 [r3002+288], r1787; +barrier.sync 0; +ld.shared.u32 r1823, [r2996]; +ld.shared.u32 r1911, [r2996+972]; +ld.shared.u32 r1999, [r2996+1944]; +ld.shared.u32 r1820, [r2996+2916]; +ld.shared.u32 r1908, [r2996+3888]; +ld.shared.u32 r1996, [r2996+4860]; +ld.shared.u32 r1821, [r2996+5832]; +ld.shared.u32 r1909, [r2996+6804]; +ld.shared.u32 r1997, [r2996+7776]; +barrier.sync 0; +st.shared.u32 [r3002], r1265; +st.shared.u32 [r3002+36], 
r1535; +st.shared.u32 [r3002+72], r1572; +st.shared.u32 [r3002+108], r1609; +st.shared.u32 [r3002+144], r1646; +st.shared.u32 [r3002+180], r1683; +st.shared.u32 [r3002+216], r1720; +st.shared.u32 [r3002+252], r1757; +st.shared.u32 [r3002+288], r1794; +barrier.sync 0; +ld.shared.u32 r1829, [r2996]; +ld.shared.u32 r1917, [r2996+972]; +ld.shared.u32 r2005, [r2996+1944]; +ld.shared.u32 r1826, [r2996+2916]; +ld.shared.u32 r1914, [r2996+3888]; +ld.shared.u32 r2002, [r2996+4860]; +ld.shared.u32 r1827, [r2996+5832]; +ld.shared.u32 r1915, [r2996+6804]; +ld.shared.u32 r2003, [r2996+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 
r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 r1998, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, 
r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, r2114; +} +{ +mul.f16x2 r2120, r2022, r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ +mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, 
r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 r2166, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 r2172, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ +sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, r2184, r2161; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 r2208, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 r2226, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2248, {low, high}; +} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 r2254, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 r2260, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 r2278, r2269, r2275; +} +{ +add.f16x2 r2281, 
r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 r2314, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, r2101, r2117; +} +{ +mul.f16x2 r2329, r2326, r2249; +} +{ +add.f16x2 r2332, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 r2342, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 r2348, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r2139, r2155; +} +{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} +{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 r2420, r2411, r2417; +} +mul.wide.u32 
rd6, r2992, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3003, rd7; +mul.lo.s32 r3004, r3003, 81; +sub.s32 r3005, r2992, r3004; +shl.b32 r3006, r3005, 2; +add.s32 r3007, r2993, r3006; +cvt.rn.f32.u32 f247, r3003; +mul.f32 f248, f247, 0f3E6E4BAE; +cos.approx.f32 f209, f248; +sin.approx.f32 f249, f248; +neg.f32 f210, f249; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2423, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2426, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2428, {high, high}; +} +{ +mul.f16x2 r2430, r2260, r2428; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r2254, r2426, r2433; +} +{ +mul.f16x2 r2439, r2254, r2428; +} +{ +fma.rn.f16x2 r2442, r2260, r2426, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2448, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2450, {low, high}; +} +{ +mul.f16x2 r2451, r2448, r2450; +} +{ +mul.f16x2 r2454, r2423, r2446; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2457, {high, low}; +} +{ +fma.rn.f16x2 r2459, r2451, r2457, r2454; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2463, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2465, {high, high}; +} +{ +mul.f16x2 r2467, r2348, r2465; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r2342, r2463, r2470; +} +{ +mul.f16x2 r2476, r2342, r2465; +} +{ +fma.rn.f16x2 r2479, r2348, r2463, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2485, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2487, {low, high}; +} +{ +mul.f16x2 r2488, r2485, 
r2487; +} +{ +mul.f16x2 r2491, r2459, r2483; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2494, {high, low}; +} +{ +fma.rn.f16x2 r2496, r2488, r2494, r2491; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2500, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2502, {high, high}; +} +{ +mul.f16x2 r2504, r2226, r2502; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r2190, r2500, r2507; +} +{ +mul.f16x2 r2513, r2190, r2502; +} +{ +fma.rn.f16x2 r2516, r2226, r2500, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2522, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2524, {low, high}; +} +{ +mul.f16x2 r2525, r2522, r2524; +} +{ +mul.f16x2 r2528, r2496, r2520; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2531, {high, low}; +} +{ +fma.rn.f16x2 r2533, r2525, r2531, r2528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2537, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2539, {high, high}; +} +{ +mul.f16x2 r2541, r2314, r2539; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r2278, r2537, r2544; +} +{ +mul.f16x2 r2550, r2278, r2539; +} +{ +fma.rn.f16x2 r2553, r2314, r2537, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2559, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2561, {low, high}; +} +{ +mul.f16x2 r2562, r2559, r2561; +} +{ +mul.f16x2 r2565, r2533, r2557; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2568, {high, low}; +} +{ +fma.rn.f16x2 r2570, r2562, r2568, r2565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2574, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2576, {high, high}; +} +{ +mul.f16x2 r2578, r2402, r2576; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r2366, r2574, r2581; +} +{ +mul.f16x2 r2587, r2366, r2576; +} +{ +fma.rn.f16x2 r2590, r2402, r2574, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2598, {low, high}; +} +{ +mul.f16x2 r2599, r2596, r2598; +} +{ +mul.f16x2 r2602, r2570, r2594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2605, {high, low}; +} +{ +fma.rn.f16x2 r2607, r2599, r2605, r2602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2613, {high, high}; +} +{ +mul.f16x2 r2615, r2244, r2613; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r2208, r2611, r2618; +} +{ +mul.f16x2 r2624, r2208, r2613; +} +{ +fma.rn.f16x2 r2627, r2244, r2611, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2635, {low, high}; +} +{ +mul.f16x2 r2636, r2633, r2635; +} +{ +mul.f16x2 r2639, r2607, r2631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2642, {high, low}; +} +{ +fma.rn.f16x2 r2644, r2636, r2642, r2639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2650, {high, high}; +} +{ +mul.f16x2 r2652, r2332, r2650; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r2296, r2648, r2655; +} +{ +mul.f16x2 r2661, r2296, r2650; +} +{ 
+fma.rn.f16x2 r2664, r2332, r2648, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2672, {low, high}; +} +{ +mul.f16x2 r2673, r2670, r2672; +} +{ +mul.f16x2 r2676, r2644, r2668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2679, {high, low}; +} +{ +fma.rn.f16x2 r2681, r2673, r2679, r2676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2687, {high, high}; +} +{ +mul.f16x2 r2689, r2420, r2687; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r2384, r2685, r2692; +} +{ +mul.f16x2 r2698, r2384, r2687; +} +{ +fma.rn.f16x2 r2701, r2420, r2685, r2698; +} +barrier.sync 0; +mad.lo.s32 r3008, r3003, 2916, r3007; +st.shared.u32 [r3008], r2166; +st.shared.u32 [r3008+324], r2435; +st.shared.u32 [r3008+648], r2472; +st.shared.u32 [r3008+972], r2509; +st.shared.u32 [r3008+1296], r2546; +st.shared.u32 [r3008+1620], r2583; +st.shared.u32 [r3008+1944], r2620; +st.shared.u32 [r3008+2268], r2657; +st.shared.u32 [r3008+2592], r2694; +barrier.sync 0; +ld.shared.u32 r2730, [r2996]; +ld.shared.u32 r2818, [r2996+972]; +ld.shared.u32 r2906, [r2996+1944]; +ld.shared.u32 r2727, [r2996+2916]; +ld.shared.u32 r2815, [r2996+3888]; +ld.shared.u32 r2903, [r2996+4860]; +ld.shared.u32 r2728, [r2996+5832]; +ld.shared.u32 r2816, [r2996+6804]; +ld.shared.u32 r2904, [r2996+7776]; +barrier.sync 0; +st.shared.u32 [r3008], r2172; +st.shared.u32 [r3008+324], r2442; +st.shared.u32 [r3008+648], r2479; +st.shared.u32 [r3008+972], r2516; +st.shared.u32 [r3008+1296], r2553; +st.shared.u32 [r3008+1620], r2590; +st.shared.u32 [r3008+1944], r2627; +st.shared.u32 [r3008+2268], r2664; +st.shared.u32 [r3008+2592], r2701; +barrier.sync 0; +ld.shared.u32 r2736, 
[r2996]; +ld.shared.u32 r2824, [r2996+972]; +ld.shared.u32 r2912, [r2996+1944]; +ld.shared.u32 r2733, [r2996+2916]; +ld.shared.u32 r2821, [r2996+3888]; +ld.shared.u32 r2909, [r2996+4860]; +ld.shared.u32 r2734, [r2996+5832]; +ld.shared.u32 r2822, [r2996+6804]; +ld.shared.u32 r2910, [r2996+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2723, {low, high}; +} +{ +neg.f16x2 r2724, r2723; +} +{ +add.f16x2 r2726, r2727, r2728; +} +{ +add.f16x2 %0, r2730, r2726; +} +{ +add.f16x2 r2732, r2733, r2734; +} +{ +add.f16x2 %1, r2736, r2732; +} +{ +add.f16x2 r2738, r2727, r2728; +} +{ +mul.f16x2 r2741, r2738, r2722; +} +{ +add.f16x2 r2744, r2730, r2741; +} +{ +sub.f16x2 r2747, r2733, r2734; +} +{ +mul.f16x2 r2750, r2747, r2724; +} +{ +add.f16x2 %6, r2744, r2750; +} +{ +add.f16x2 r2756, r2727, r2728; +} +{ +mul.f16x2 r2759, r2756, r2722; +} +{ +add.f16x2 r2762, r2730, r2759; +} +{ +sub.f16x2 r2765, r2733, r2734; +} +{ +mul.f16x2 r2768, r2765, r2724; +} +{ +sub.f16x2 %12, r2762, r2768; +} +{ +add.f16x2 r2774, r2733, r2734; +} +{ +mul.f16x2 r2777, r2774, r2722; +} +{ +add.f16x2 r2780, r2736, r2777; +} +{ +sub.f16x2 r2783, r2727, r2728; +} +{ +mul.f16x2 r2786, r2783, r2724; +} +{ +sub.f16x2 %7, r2780, r2786; +} +{ +add.f16x2 r2792, r2733, r2734; +} +{ +mul.f16x2 r2795, r2792, r2722; +} +{ +add.f16x2 r2798, r2736, r2795; +} +{ +sub.f16x2 r2801, r2727, r2728; +} +{ +mul.f16x2 r2804, r2801, r2724; +} +{ +add.f16x2 %13, r2798, r2804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2810, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2811, {low, high}; +} +{ +neg.f16x2 r2812, r2811; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 %2, r2818, r2814; +} +{ +add.f16x2 r2820, r2821, r2822; +} +{ +add.f16x2 %3, r2824, r2820; +} 
+{ +add.f16x2 r2826, r2815, r2816; +} +{ +mul.f16x2 r2829, r2826, r2810; +} +{ +add.f16x2 r2832, r2818, r2829; +} +{ +sub.f16x2 r2835, r2821, r2822; +} +{ +mul.f16x2 r2838, r2835, r2812; +} +{ +add.f16x2 %8, r2832, r2838; +} +{ +add.f16x2 r2844, r2815, r2816; +} +{ +mul.f16x2 r2847, r2844, r2810; +} +{ +add.f16x2 r2850, r2818, r2847; +} +{ +sub.f16x2 r2853, r2821, r2822; +} +{ +mul.f16x2 r2856, r2853, r2812; +} +{ +sub.f16x2 %14, r2850, r2856; +} +{ +add.f16x2 r2862, r2821, r2822; +} +{ +mul.f16x2 r2865, r2862, r2810; +} +{ +add.f16x2 r2868, r2824, r2865; +} +{ +sub.f16x2 r2871, r2815, r2816; +} +{ +mul.f16x2 r2874, r2871, r2812; +} +{ +sub.f16x2 %9, r2868, r2874; +} +{ +add.f16x2 r2880, r2821, r2822; +} +{ +mul.f16x2 r2883, r2880, r2810; +} +{ +add.f16x2 r2886, r2824, r2883; +} +{ +sub.f16x2 r2889, r2815, r2816; +} +{ +mul.f16x2 r2892, r2889, r2812; +} +{ +add.f16x2 %15, r2886, r2892; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2898, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2899, {low, high}; +} +{ +neg.f16x2 r2900, r2899; +} +{ +add.f16x2 r2902, r2903, r2904; +} +{ +add.f16x2 %4, r2906, r2902; +} +{ +add.f16x2 r2908, r2909, r2910; +} +{ +add.f16x2 %5, r2912, r2908; +} +{ +add.f16x2 r2914, r2903, r2904; +} +{ +mul.f16x2 r2917, r2914, r2898; +} +{ +add.f16x2 r2920, r2906, r2917; +} +{ +sub.f16x2 r2923, r2909, r2910; +} +{ +mul.f16x2 r2926, r2923, r2900; +} +{ +add.f16x2 %10, r2920, r2926; +} +{ +add.f16x2 r2932, r2903, r2904; +} +{ +mul.f16x2 r2935, r2932, r2898; +} +{ +add.f16x2 r2938, r2906, r2935; +} +{ +sub.f16x2 r2941, r2909, r2910; +} +{ +mul.f16x2 r2944, r2941, r2900; +} +{ +sub.f16x2 %16, r2938, r2944; +} +{ +add.f16x2 r2950, r2909, r2910; +} +{ +mul.f16x2 r2953, r2950, r2898; +} +{ +add.f16x2 r2956, r2912, r2953; +} +{ +sub.f16x2 r2959, r2903, r2904; +} +{ +mul.f16x2 r2962, r2959, r2900; +} +{ +sub.f16x2 %11, r2956, r2962; +} +{ +add.f16x2 
r2968, r2909, r2910; +} +{ +mul.f16x2 r2971, r2968, r2898; +} +{ +add.f16x2 r2974, r2912, r2971; +} +{ +sub.f16x2 r2977, r2903, r2904; +} +{ +mul.f16x2 r2980, r2977, r2900; +} +{ +add.f16x2 %17, r2974, r2980; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<893, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<771>; +.reg .b32 r<8646>; +.reg .b64 rd<6>; +mov.u32 r8572, %54; +mov.u32 r8645, %tid.y; +mad.lo.s32 r8573, r8645, 8748, r8572; +mov.u32 r8574, %tid.x; +mov.f32 f762, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1, {low, high}; +} +mov.f32 f764, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, 
r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 
r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r265, {low, high}; +} +mov.f32 f544, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r266, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r267, {low, high}; +} +mov.f32 f556, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r268, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r271, {low, high}; +} +mov.f32 
f580, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, 
r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, 
r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} 
+{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ 
+mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} 
+{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, 
r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %82, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ 
+add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %82, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %82, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %83, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %83, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %83, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %83, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %83, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, 
r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 
r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, 
r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ 
+fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ 
+fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, 
r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ 
+add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ 
+mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 
r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, 
r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} 
+{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r8574, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r8575, rd3; +mul.lo.s32 r8576, r8575, 81; +sub.s32 r8577, r8574, r8576; +mad.lo.s32 r8578, r8575, 8748, r8573; +cvt.rn.f32.u32 f765, r8577; +mul.f32 f766, f765, 0f3B3C4870; +cos.approx.f32 f309, f766; +sin.approx.f32 f767, f766; +neg.f32 f310, f767; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 
r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; 
+} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, 
{high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, 
r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 
r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, 
r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; 
+mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r8579, r8577, 108, r8578; +st.shared.u32 [r8579], r2140; +st.shared.u32 [r8579+4], r2937; +st.shared.u32 [r8579+8], r2974; +st.shared.u32 [r8579+12], r3011; +st.shared.u32 [r8579+16], r3048; +st.shared.u32 [r8579+20], r3085; +st.shared.u32 [r8579+24], r3122; +st.shared.u32 [r8579+28], r3159; +st.shared.u32 [r8579+32], r3196; +st.shared.u32 [r8579+36], r3233; +st.shared.u32 [r8579+40], r3270; +st.shared.u32 [r8579+44], r3307; +st.shared.u32 [r8579+48], r3344; +st.shared.u32 
[r8579+52], r3381; +st.shared.u32 [r8579+56], r3418; +st.shared.u32 [r8579+60], r3455; +st.shared.u32 [r8579+64], r3492; +st.shared.u32 [r8579+68], r3529; +st.shared.u32 [r8579+72], r3566; +st.shared.u32 [r8579+76], r3603; +st.shared.u32 [r8579+80], r3640; +st.shared.u32 [r8579+84], r3677; +st.shared.u32 [r8579+88], r3714; +st.shared.u32 [r8579+92], r3751; +st.shared.u32 [r8579+96], r3788; +st.shared.u32 [r8579+100], r3825; +st.shared.u32 [r8579+104], r3862; +barrier.sync 0; +mad.lo.s32 r8580, r8577, -104, r8579; +ld.shared.u32 r3898, [r8580]; +ld.shared.u32 r4506, [r8580+324]; +ld.shared.u32 r5114, [r8580+648]; +ld.shared.u32 r3986, [r8580+972]; +ld.shared.u32 r4594, [r8580+1296]; +ld.shared.u32 r5202, [r8580+1620]; +ld.shared.u32 r4074, [r8580+1944]; +ld.shared.u32 r4682, [r8580+2268]; +ld.shared.u32 r5290, [r8580+2592]; +ld.shared.u32 r3895, [r8580+2916]; +ld.shared.u32 r4503, [r8580+3240]; +ld.shared.u32 r5111, [r8580+3564]; +ld.shared.u32 r3983, [r8580+3888]; +ld.shared.u32 r4591, [r8580+4212]; +ld.shared.u32 r5199, [r8580+4536]; +ld.shared.u32 r4071, [r8580+4860]; +ld.shared.u32 r4679, [r8580+5184]; +ld.shared.u32 r5287, [r8580+5508]; +ld.shared.u32 r3896, [r8580+5832]; +ld.shared.u32 r4504, [r8580+6156]; +ld.shared.u32 r5112, [r8580+6480]; +ld.shared.u32 r3984, [r8580+6804]; +ld.shared.u32 r4592, [r8580+7128]; +ld.shared.u32 r5200, [r8580+7452]; +ld.shared.u32 r4072, [r8580+7776]; +ld.shared.u32 r4680, [r8580+8100]; +ld.shared.u32 r5288, [r8580+8424]; +barrier.sync 0; +st.shared.u32 [r8579], r2146; +st.shared.u32 [r8579+4], r2944; +st.shared.u32 [r8579+8], r2981; +st.shared.u32 [r8579+12], r3018; +st.shared.u32 [r8579+16], r3055; +st.shared.u32 [r8579+20], r3092; +st.shared.u32 [r8579+24], r3129; +st.shared.u32 [r8579+28], r3166; +st.shared.u32 [r8579+32], r3203; +st.shared.u32 [r8579+36], r3240; +st.shared.u32 [r8579+40], r3277; +st.shared.u32 [r8579+44], r3314; +st.shared.u32 [r8579+48], r3351; +st.shared.u32 [r8579+52], r3388; +st.shared.u32 [r8579+56], 
r3425; +st.shared.u32 [r8579+60], r3462; +st.shared.u32 [r8579+64], r3499; +st.shared.u32 [r8579+68], r3536; +st.shared.u32 [r8579+72], r3573; +st.shared.u32 [r8579+76], r3610; +st.shared.u32 [r8579+80], r3647; +st.shared.u32 [r8579+84], r3684; +st.shared.u32 [r8579+88], r3721; +st.shared.u32 [r8579+92], r3758; +st.shared.u32 [r8579+96], r3795; +st.shared.u32 [r8579+100], r3832; +st.shared.u32 [r8579+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r8580]; +ld.shared.u32 r4512, [r8580+324]; +ld.shared.u32 r5120, [r8580+648]; +ld.shared.u32 r3992, [r8580+972]; +ld.shared.u32 r4600, [r8580+1296]; +ld.shared.u32 r5208, [r8580+1620]; +ld.shared.u32 r4080, [r8580+1944]; +ld.shared.u32 r4688, [r8580+2268]; +ld.shared.u32 r5296, [r8580+2592]; +ld.shared.u32 r3901, [r8580+2916]; +ld.shared.u32 r4509, [r8580+3240]; +ld.shared.u32 r5117, [r8580+3564]; +ld.shared.u32 r3989, [r8580+3888]; +ld.shared.u32 r4597, [r8580+4212]; +ld.shared.u32 r5205, [r8580+4536]; +ld.shared.u32 r4077, [r8580+4860]; +ld.shared.u32 r4685, [r8580+5184]; +ld.shared.u32 r5293, [r8580+5508]; +ld.shared.u32 r3902, [r8580+5832]; +ld.shared.u32 r4510, [r8580+6156]; +ld.shared.u32 r5118, [r8580+6480]; +ld.shared.u32 r3990, [r8580+6804]; +ld.shared.u32 r4598, [r8580+7128]; +ld.shared.u32 r5206, [r8580+7452]; +ld.shared.u32 r4078, [r8580+7776]; +ld.shared.u32 r4686, [r8580+8100]; +ld.shared.u32 r5294, [r8580+8424]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ 
+add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} 
+{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; 
+mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} 
+{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ 
+sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} 
+{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ 
+mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 
r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ 
+mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} 
+{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ 
+add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} 
+{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ 
+add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} 
+{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, 
f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; 
+cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} 
+{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, 
r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, 
r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, 
r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, 
r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, 
r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, 
r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r8577, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r8581, rd5; +sub.s32 r8582, r8577, r8581; +shr.u32 r8583, r8582, 1; +add.s32 r8584, r8583, r8581; +shr.u32 r8585, r8584, 4; +mul.lo.s32 r8586, r8585, 27; +sub.s32 r8587, r8577, r8586; +shl.b32 r8588, r8587, 2; +add.s32 r8589, r8578, r8588; +cvt.rn.f32.u32 f768, r8585; +mul.f32 f769, f768, 0f3D9EDD1F; +cos.approx.f32 f673, f769; +sin.approx.f32 f770, f769; +neg.f32 f674, f770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; 
+mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, 
r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; +} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ +mul.f16x2 r6941, r6381, r6930; +} +{ +fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ 
+fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ +fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, low}; +} +{ +fma.rn.f16x2 r7109, r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; +} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ 
+neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, r7189; +} +{ +neg.f16x2 r7194, r7191; +} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ +mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, r7331, r7355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} +{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ +mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ +neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, 
{low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7618, {low, high}; +} +{ +mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ +fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ 
+mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +barrier.sync 0; +mad.lo.s32 r8590, r8585, 2916, r8589; +st.shared.u32 [r8590], r6029; +st.shared.u32 [r8590+108], r6826; +st.shared.u32 [r8590+216], r6863; +st.shared.u32 [r8590+324], r6900; +st.shared.u32 [r8590+432], r6937; +st.shared.u32 [r8590+540], r6974; +st.shared.u32 [r8590+648], r7011; +st.shared.u32 [r8590+756], r7048; +st.shared.u32 [r8590+864], r7085; +st.shared.u32 [r8590+972], r7122; +st.shared.u32 [r8590+1080], r7159; +st.shared.u32 [r8590+1188], r7196; +st.shared.u32 [r8590+1296], r7233; +st.shared.u32 [r8590+1404], r7270; +st.shared.u32 [r8590+1512], r7307; +st.shared.u32 [r8590+1620], r7344; +st.shared.u32 [r8590+1728], r7381; +st.shared.u32 [r8590+1836], r7418; +st.shared.u32 [r8590+1944], r7455; +st.shared.u32 [r8590+2052], r7492; +st.shared.u32 [r8590+2160], r7529; +st.shared.u32 [r8590+2268], r7566; +st.shared.u32 [r8590+2376], r7603; +st.shared.u32 [r8590+2484], r7640; +st.shared.u32 [r8590+2592], r7677; +st.shared.u32 [r8590+2700], r7714; +st.shared.u32 [r8590+2808], r7751; +barrier.sync 0; +ld.shared.u32 r7787, [r8580]; +ld.shared.u32 r7875, [r8580+324]; +ld.shared.u32 r7963, [r8580+648]; +ld.shared.u32 r8051, [r8580+972]; +ld.shared.u32 r8139, [r8580+1296]; +ld.shared.u32 r8227, [r8580+1620]; +ld.shared.u32 r8315, [r8580+1944]; +ld.shared.u32 r8403, [r8580+2268]; +ld.shared.u32 r8491, [r8580+2592]; +ld.shared.u32 r7784, [r8580+2916]; +ld.shared.u32 r7872, [r8580+3240]; +ld.shared.u32 r7960, [r8580+3564]; +ld.shared.u32 r8048, [r8580+3888]; +ld.shared.u32 r8136, [r8580+4212]; +ld.shared.u32 r8224, [r8580+4536]; +ld.shared.u32 r8312, [r8580+4860]; +ld.shared.u32 r8400, [r8580+5184]; +ld.shared.u32 r8488, [r8580+5508]; +ld.shared.u32 r7785, [r8580+5832]; +ld.shared.u32 r7873, [r8580+6156]; +ld.shared.u32 r7961, [r8580+6480]; +ld.shared.u32 r8049, [r8580+6804]; +ld.shared.u32 r8137, [r8580+7128]; +ld.shared.u32 r8225, [r8580+7452]; +ld.shared.u32 r8313, 
[r8580+7776]; +ld.shared.u32 r8401, [r8580+8100]; +ld.shared.u32 r8489, [r8580+8424]; +barrier.sync 0; +st.shared.u32 [r8590], r6035; +st.shared.u32 [r8590+108], r6833; +st.shared.u32 [r8590+216], r6870; +st.shared.u32 [r8590+324], r6907; +st.shared.u32 [r8590+432], r6944; +st.shared.u32 [r8590+540], r6981; +st.shared.u32 [r8590+648], r7018; +st.shared.u32 [r8590+756], r7055; +st.shared.u32 [r8590+864], r7092; +st.shared.u32 [r8590+972], r7129; +st.shared.u32 [r8590+1080], r7166; +st.shared.u32 [r8590+1188], r7203; +st.shared.u32 [r8590+1296], r7240; +st.shared.u32 [r8590+1404], r7277; +st.shared.u32 [r8590+1512], r7314; +st.shared.u32 [r8590+1620], r7351; +st.shared.u32 [r8590+1728], r7388; +st.shared.u32 [r8590+1836], r7425; +st.shared.u32 [r8590+1944], r7462; +st.shared.u32 [r8590+2052], r7499; +st.shared.u32 [r8590+2160], r7536; +st.shared.u32 [r8590+2268], r7573; +st.shared.u32 [r8590+2376], r7610; +st.shared.u32 [r8590+2484], r7647; +st.shared.u32 [r8590+2592], r7684; +st.shared.u32 [r8590+2700], r7721; +st.shared.u32 [r8590+2808], r7758; +barrier.sync 0; +ld.shared.u32 r7793, [r8580]; +ld.shared.u32 r7881, [r8580+324]; +ld.shared.u32 r7969, [r8580+648]; +ld.shared.u32 r8057, [r8580+972]; +ld.shared.u32 r8145, [r8580+1296]; +ld.shared.u32 r8233, [r8580+1620]; +ld.shared.u32 r8321, [r8580+1944]; +ld.shared.u32 r8409, [r8580+2268]; +ld.shared.u32 r8497, [r8580+2592]; +ld.shared.u32 r7790, [r8580+2916]; +ld.shared.u32 r7878, [r8580+3240]; +ld.shared.u32 r7966, [r8580+3564]; +ld.shared.u32 r8054, [r8580+3888]; +ld.shared.u32 r8142, [r8580+4212]; +ld.shared.u32 r8230, [r8580+4536]; +ld.shared.u32 r8318, [r8580+4860]; +ld.shared.u32 r8406, [r8580+5184]; +ld.shared.u32 r8494, [r8580+5508]; +ld.shared.u32 r7791, [r8580+5832]; +ld.shared.u32 r7879, [r8580+6156]; +ld.shared.u32 r7967, [r8580+6480]; +ld.shared.u32 r8055, [r8580+6804]; +ld.shared.u32 r8143, [r8580+7128]; +ld.shared.u32 r8231, [r8580+7452]; +ld.shared.u32 r8319, [r8580+7776]; +ld.shared.u32 r8407, 
[r8580+8100]; +ld.shared.u32 r8495, [r8580+8424]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 %0, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 %1, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, r7781; +} +{ +add.f16x2 %18, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ +mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 %36, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 %19, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 %37, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 %2, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 %3, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 %20, r7889, r7895; +} +{ +add.f16x2 
r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 %38, r7907, r7913; +} +{ +add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 %21, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, r7946, r7869; +} +{ +add.f16x2 %39, r7943, r7949; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 %4, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 %5, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 %22, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ +add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 %40, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 %23, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 %41, r8031, r8037; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8044, {low, high}; +} +{ +neg.f16x2 r8045, r8044; +} +{ +add.f16x2 r8047, r8048, r8049; +} +{ +add.f16x2 %6, r8051, r8047; +} +{ +add.f16x2 r8053, r8054, r8055; +} +{ +add.f16x2 %7, r8057, r8053; +} +{ +add.f16x2 r8059, r8048, r8049; +} +{ +mul.f16x2 r8062, r8059, r8043; +} +{ +add.f16x2 r8065, r8051, r8062; +} +{ +sub.f16x2 r8068, r8054, r8055; +} +{ +mul.f16x2 r8071, r8068, r8045; +} +{ +add.f16x2 %24, r8065, r8071; +} +{ +add.f16x2 r8077, r8048, r8049; +} +{ +mul.f16x2 r8080, r8077, r8043; +} +{ +add.f16x2 r8083, r8051, r8080; +} +{ +sub.f16x2 r8086, r8054, r8055; +} +{ +mul.f16x2 r8089, r8086, r8045; +} +{ +sub.f16x2 %42, r8083, r8089; +} +{ +add.f16x2 r8095, r8054, r8055; +} +{ +mul.f16x2 r8098, r8095, r8043; +} +{ +add.f16x2 r8101, r8057, r8098; +} +{ +sub.f16x2 r8104, r8048, r8049; +} +{ +mul.f16x2 r8107, r8104, r8045; +} +{ +sub.f16x2 %25, r8101, r8107; +} +{ +add.f16x2 r8113, r8054, r8055; +} +{ +mul.f16x2 r8116, r8113, r8043; +} +{ +add.f16x2 r8119, r8057, r8116; +} +{ +sub.f16x2 r8122, r8048, r8049; +} +{ +mul.f16x2 r8125, r8122, r8045; +} +{ +add.f16x2 %43, r8119, r8125; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8131, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8132, {low, high}; +} +{ +neg.f16x2 r8133, r8132; +} +{ +add.f16x2 r8135, r8136, r8137; +} +{ +add.f16x2 %8, r8139, r8135; +} +{ +add.f16x2 r8141, r8142, r8143; +} +{ +add.f16x2 %9, r8145, r8141; +} +{ +add.f16x2 r8147, r8136, r8137; +} +{ +mul.f16x2 r8150, r8147, r8131; +} +{ +add.f16x2 r8153, r8139, r8150; +} +{ +sub.f16x2 r8156, r8142, r8143; +} +{ +mul.f16x2 r8159, r8156, r8133; +} +{ +add.f16x2 %26, r8153, r8159; +} +{ +add.f16x2 r8165, r8136, r8137; +} +{ +mul.f16x2 r8168, r8165, r8131; +} +{ +add.f16x2 
r8171, r8139, r8168; +} +{ +sub.f16x2 r8174, r8142, r8143; +} +{ +mul.f16x2 r8177, r8174, r8133; +} +{ +sub.f16x2 %44, r8171, r8177; +} +{ +add.f16x2 r8183, r8142, r8143; +} +{ +mul.f16x2 r8186, r8183, r8131; +} +{ +add.f16x2 r8189, r8145, r8186; +} +{ +sub.f16x2 r8192, r8136, r8137; +} +{ +mul.f16x2 r8195, r8192, r8133; +} +{ +sub.f16x2 %27, r8189, r8195; +} +{ +add.f16x2 r8201, r8142, r8143; +} +{ +mul.f16x2 r8204, r8201, r8131; +} +{ +add.f16x2 r8207, r8145, r8204; +} +{ +sub.f16x2 r8210, r8136, r8137; +} +{ +mul.f16x2 r8213, r8210, r8133; +} +{ +add.f16x2 %45, r8207, r8213; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8219, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8220, {low, high}; +} +{ +neg.f16x2 r8221, r8220; +} +{ +add.f16x2 r8223, r8224, r8225; +} +{ +add.f16x2 %10, r8227, r8223; +} +{ +add.f16x2 r8229, r8230, r8231; +} +{ +add.f16x2 %11, r8233, r8229; +} +{ +add.f16x2 r8235, r8224, r8225; +} +{ +mul.f16x2 r8238, r8235, r8219; +} +{ +add.f16x2 r8241, r8227, r8238; +} +{ +sub.f16x2 r8244, r8230, r8231; +} +{ +mul.f16x2 r8247, r8244, r8221; +} +{ +add.f16x2 %28, r8241, r8247; +} +{ +add.f16x2 r8253, r8224, r8225; +} +{ +mul.f16x2 r8256, r8253, r8219; +} +{ +add.f16x2 r8259, r8227, r8256; +} +{ +sub.f16x2 r8262, r8230, r8231; +} +{ +mul.f16x2 r8265, r8262, r8221; +} +{ +sub.f16x2 %46, r8259, r8265; +} +{ +add.f16x2 r8271, r8230, r8231; +} +{ +mul.f16x2 r8274, r8271, r8219; +} +{ +add.f16x2 r8277, r8233, r8274; +} +{ +sub.f16x2 r8280, r8224, r8225; +} +{ +mul.f16x2 r8283, r8280, r8221; +} +{ +sub.f16x2 %29, r8277, r8283; +} +{ +add.f16x2 r8289, r8230, r8231; +} +{ +mul.f16x2 r8292, r8289, r8219; +} +{ +add.f16x2 r8295, r8233, r8292; +} +{ +sub.f16x2 r8298, r8224, r8225; +} +{ +mul.f16x2 r8301, r8298, r8221; +} +{ +add.f16x2 %47, r8295, r8301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8307, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8308, {low, high}; +} +{ +neg.f16x2 r8309, r8308; +} +{ +add.f16x2 r8311, r8312, r8313; +} +{ +add.f16x2 %12, r8315, r8311; +} +{ +add.f16x2 r8317, r8318, r8319; +} +{ +add.f16x2 %13, r8321, r8317; +} +{ +add.f16x2 r8323, r8312, r8313; +} +{ +mul.f16x2 r8326, r8323, r8307; +} +{ +add.f16x2 r8329, r8315, r8326; +} +{ +sub.f16x2 r8332, r8318, r8319; +} +{ +mul.f16x2 r8335, r8332, r8309; +} +{ +add.f16x2 %30, r8329, r8335; +} +{ +add.f16x2 r8341, r8312, r8313; +} +{ +mul.f16x2 r8344, r8341, r8307; +} +{ +add.f16x2 r8347, r8315, r8344; +} +{ +sub.f16x2 r8350, r8318, r8319; +} +{ +mul.f16x2 r8353, r8350, r8309; +} +{ +sub.f16x2 %48, r8347, r8353; +} +{ +add.f16x2 r8359, r8318, r8319; +} +{ +mul.f16x2 r8362, r8359, r8307; +} +{ +add.f16x2 r8365, r8321, r8362; +} +{ +sub.f16x2 r8368, r8312, r8313; +} +{ +mul.f16x2 r8371, r8368, r8309; +} +{ +sub.f16x2 %31, r8365, r8371; +} +{ +add.f16x2 r8377, r8318, r8319; +} +{ +mul.f16x2 r8380, r8377, r8307; +} +{ +add.f16x2 r8383, r8321, r8380; +} +{ +sub.f16x2 r8386, r8312, r8313; +} +{ +mul.f16x2 r8389, r8386, r8309; +} +{ +add.f16x2 %49, r8383, r8389; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8396, {low, high}; +} +{ +neg.f16x2 r8397, r8396; +} +{ +add.f16x2 r8399, r8400, r8401; +} +{ +add.f16x2 %14, r8403, r8399; +} +{ +add.f16x2 r8405, r8406, r8407; +} +{ +add.f16x2 %15, r8409, r8405; +} +{ +add.f16x2 r8411, r8400, r8401; +} +{ +mul.f16x2 r8414, r8411, r8395; +} +{ +add.f16x2 r8417, r8403, r8414; +} +{ +sub.f16x2 r8420, r8406, r8407; +} +{ +mul.f16x2 r8423, r8420, r8397; +} +{ +add.f16x2 %32, r8417, r8423; +} +{ +add.f16x2 r8429, r8400, r8401; +} +{ +mul.f16x2 r8432, r8429, r8395; +} +{ +add.f16x2 r8435, r8403, r8432; +} +{ +sub.f16x2 r8438, r8406, r8407; +} +{ +mul.f16x2 
r8441, r8438, r8397; +} +{ +sub.f16x2 %50, r8435, r8441; +} +{ +add.f16x2 r8447, r8406, r8407; +} +{ +mul.f16x2 r8450, r8447, r8395; +} +{ +add.f16x2 r8453, r8409, r8450; +} +{ +sub.f16x2 r8456, r8400, r8401; +} +{ +mul.f16x2 r8459, r8456, r8397; +} +{ +sub.f16x2 %33, r8453, r8459; +} +{ +add.f16x2 r8465, r8406, r8407; +} +{ +mul.f16x2 r8468, r8465, r8395; +} +{ +add.f16x2 r8471, r8409, r8468; +} +{ +sub.f16x2 r8474, r8400, r8401; +} +{ +mul.f16x2 r8477, r8474, r8397; +} +{ +add.f16x2 %51, r8471, r8477; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8484, {low, high}; +} +{ +neg.f16x2 r8485, r8484; +} +{ +add.f16x2 r8487, r8488, r8489; +} +{ +add.f16x2 %16, r8491, r8487; +} +{ +add.f16x2 r8493, r8494, r8495; +} +{ +add.f16x2 %17, r8497, r8493; +} +{ +add.f16x2 r8499, r8488, r8489; +} +{ +mul.f16x2 r8502, r8499, r8483; +} +{ +add.f16x2 r8505, r8491, r8502; +} +{ +sub.f16x2 r8508, r8494, r8495; +} +{ +mul.f16x2 r8511, r8508, r8485; +} +{ +add.f16x2 %34, r8505, r8511; +} +{ +add.f16x2 r8517, r8488, r8489; +} +{ +mul.f16x2 r8520, r8517, r8483; +} +{ +add.f16x2 r8523, r8491, r8520; +} +{ +sub.f16x2 r8526, r8494, r8495; +} +{ +mul.f16x2 r8529, r8526, r8485; +} +{ +sub.f16x2 %52, r8523, r8529; +} +{ +add.f16x2 r8535, r8494, r8495; +} +{ +mul.f16x2 r8538, r8535, r8483; +} +{ +add.f16x2 r8541, r8497, r8538; +} +{ +sub.f16x2 r8544, r8488, r8489; +} +{ +mul.f16x2 r8547, r8544, r8485; +} +{ +sub.f16x2 %35, r8541, r8547; +} +{ +add.f16x2 r8553, r8494, r8495; +} +{ +mul.f16x2 r8556, r8553, r8483; +} +{ +add.f16x2 r8559, r8497, r8556; +} +{ +sub.f16x2 r8562, r8488, r8489; +} +{ +mul.f16x2 r8565, r8562, r8485; +} +{ +add.f16x2 %53, r8559, r8565; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), 
"r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<894, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<95>; +.reg .b32 r<1124>; +.reg .b64 rd<14>; +mov.u32 r1079, %tid.y; +mov.u32 r1080, %6; +mad.lo.s32 r1081, r1079, 17496, r1080; +mov.u32 r1082, %tid.x; +mov.f32 f74, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1, {low, high}; +} +mov.f32 f76, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ 
+add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r1082, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r1083, rd3; +mul.lo.s32 r1084, r1083, 729; +sub.s32 r1085, r1082, r1084; +mad.lo.s32 r1086, r1083, 17496, r1081; +cvt.rn.f32.u32 f77, r1085; +mul.f32 f78, f77, 0f3B3C4870; +cos.approx.f32 f5, f78; +sin.approx.f32 f79, f78; +neg.f32 f6, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r1087, r1085, 24, r1086; +st.shared.v2.f32 [r1087], {r8, r14}; +st.shared.v2.f32 [r1087+8], {r101, r108}; +st.shared.v2.f32 [r1087+16], {r138, r145}; +barrier.sync 0; +shl.b32 r1088, r1085, 4; +sub.s32 r1089, r1087, r1088; +ld.shared.u32 r174, [r1089]; +ld.shared.u32 r180, [r1089+4]; +ld.shared.u32 r171, [r1089+5832]; +ld.shared.u32 r177, [r1089+5836]; +ld.shared.u32 r172, [r1089+11664]; +ld.shared.u32 r178, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 
r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r1085, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r1090, rd5; +mul.lo.s32 r1091, r1090, 3; +sub.s32 r1092, r1085, r1091; +shl.b32 r1093, r1092, 3; +add.s32 r1094, r1086, r1093; +cvt.rn.f32.u32 f80, r1090; +mul.f32 f81, f80, 0f3C0D3654; +cos.approx.f32 f17, f81; +sin.approx.f32 f82, f81; +neg.f32 f18, f82; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r1095, r1090, 72, r1094; +st.shared.u32 [r1095], r173; +st.shared.u32 [r1095+4], r179; +st.shared.u32 [r1095+24], r266; +st.shared.u32 [r1095+28], r273; +st.shared.u32 [r1095+48], 
r303; +st.shared.u32 [r1095+52], r310; +barrier.sync 0; +ld.shared.u32 r339, [r1089]; +ld.shared.u32 r345, [r1089+4]; +ld.shared.u32 r336, [r1089+5832]; +ld.shared.u32 r342, [r1089+5836]; +ld.shared.u32 r337, [r1089+11664]; +ld.shared.u32 r343, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r1085, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r1096, rd7; +mul.lo.s32 r1097, r1096, 9; +sub.s32 r1098, r1085, r1097; +shl.b32 r1099, r1098, 3; +add.s32 r1100, r1086, r1099; +cvt.rn.f32.u32 f83, r1096; +mul.f32 f84, f83, 0f3CD3D17E; +cos.approx.f32 f29, f84; +sin.approx.f32 f85, f84; +neg.f32 f30, f85; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r1101, r1096, 216, r1100; +st.shared.u32 [r1101], r338; +st.shared.u32 [r1101+4], r344; +st.shared.u32 [r1101+72], r431; +st.shared.u32 [r1101+76], r438; +st.shared.u32 [r1101+144], r468; +st.shared.u32 [r1101+148], r475; +barrier.sync 0; +ld.shared.u32 r504, [r1089]; +ld.shared.u32 r510, [r1089+4]; +ld.shared.u32 r501, [r1089+5832]; +ld.shared.u32 r507, [r1089+5836]; +ld.shared.u32 r502, [r1089+11664]; +ld.shared.u32 r508, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, 
r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r1085, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r1102, rd9; +sub.s32 r1103, r1085, r1102; +shr.u32 r1104, r1103, 1; +add.s32 r1105, r1104, r1102; +shr.u32 r1106, r1105, 4; +mul.lo.s32 r1107, r1106, 27; +sub.s32 r1108, r1085, r1107; +shl.b32 r1109, r1108, 3; +add.s32 r1110, r1086, r1109; +cvt.rn.f32.u32 f86, r1106; +mul.f32 f87, f86, 0f3D9EDD1F; +cos.approx.f32 f41, f87; +sin.approx.f32 f88, f87; +neg.f32 f42, f88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r611, {low, high}; +} 
+{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r1111, r1106, 648, r1110; +st.shared.u32 [r1111], r503; +st.shared.u32 [r1111+4], r509; +st.shared.u32 [r1111+216], r596; +st.shared.u32 [r1111+220], r603; +st.shared.u32 [r1111+432], r633; +st.shared.u32 [r1111+436], r640; +barrier.sync 0; +ld.shared.u32 r669, [r1089]; +ld.shared.u32 r675, [r1089+4]; +ld.shared.u32 r666, [r1089+5832]; +ld.shared.u32 r672, [r1089+5836]; +ld.shared.u32 r667, [r1089+11664]; +ld.shared.u32 r673, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 r674, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 r692, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ +mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 r710, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 r725, r722, 
r663; +} +{ +sub.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 r746, r737, r743; +} +mul.wide.u32 rd10, r1085, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r1112, rd11; +mul.lo.s32 r1113, r1112, 81; +sub.s32 r1114, r1085, r1113; +shl.b32 r1115, r1114, 3; +add.s32 r1116, r1086, r1115; +cvt.rn.f32.u32 f89, r1112; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f53, f90; +sin.approx.f32 f91, f90; +neg.f32 f54, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r749, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r754, {high, high}; +} +{ +mul.f16x2 r756, r728, r754; +} +{ +neg.f16x2 r759, r756; +} +{ +fma.rn.f16x2 r761, r692, r752, r759; +} +{ +mul.f16x2 r765, r692, r754; +} +{ +fma.rn.f16x2 r768, r728, r752, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r776, {low, high}; +} +{ +mul.f16x2 r777, r774, r776; +} +{ +mul.f16x2 r780, r749, r772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r783, {high, low}; +} +{ +fma.rn.f16x2 r785, r777, r783, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r791, {high, high}; +} +{ +mul.f16x2 r793, r746, r791; +} +{ +neg.f16x2 r796, r793; +} +{ +fma.rn.f16x2 r798, r710, r789, r796; +} +{ +mul.f16x2 r802, r710, r791; +} +{ +fma.rn.f16x2 r805, r746, r789, r802; +} +barrier.sync 0; +mad.lo.s32 r1117, r1112, 1944, r1116; +st.shared.u32 [r1117], r668; +st.shared.u32 [r1117+4], 
r674; +st.shared.u32 [r1117+648], r761; +st.shared.u32 [r1117+652], r768; +st.shared.u32 [r1117+1296], r798; +st.shared.u32 [r1117+1300], r805; +barrier.sync 0; +ld.shared.u32 r834, [r1089]; +ld.shared.u32 r840, [r1089+4]; +ld.shared.u32 r831, [r1089+5832]; +ld.shared.u32 r837, [r1089+5836]; +ld.shared.u32 r832, [r1089+11664]; +ld.shared.u32 r838, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r827, {low, high}; +} +{ +neg.f16x2 r828, r827; +} +{ +add.f16x2 r830, r831, r832; +} +{ +add.f16x2 r833, r834, r830; +} +{ +add.f16x2 r836, r837, r838; +} +{ +add.f16x2 r839, r840, r836; +} +{ +add.f16x2 r842, r831, r832; +} +{ +mul.f16x2 r845, r842, r826; +} +{ +add.f16x2 r848, r834, r845; +} +{ +sub.f16x2 r851, r837, r838; +} +{ +mul.f16x2 r854, r851, r828; +} +{ +add.f16x2 r857, r848, r854; +} +{ +add.f16x2 r860, r831, r832; +} +{ +mul.f16x2 r863, r860, r826; +} +{ +add.f16x2 r866, r834, r863; +} +{ +sub.f16x2 r869, r837, r838; +} +{ +mul.f16x2 r872, r869, r828; +} +{ +sub.f16x2 r875, r866, r872; +} +{ +add.f16x2 r878, r837, r838; +} +{ +mul.f16x2 r881, r878, r826; +} +{ +add.f16x2 r884, r840, r881; +} +{ +sub.f16x2 r887, r831, r832; +} +{ +mul.f16x2 r890, r887, r828; +} +{ +sub.f16x2 r893, r884, r890; +} +{ +add.f16x2 r896, r837, r838; +} +{ +mul.f16x2 r899, r896, r826; +} +{ +add.f16x2 r902, r840, r899; +} +{ +sub.f16x2 r905, r831, r832; +} +{ +mul.f16x2 r908, r905, r828; +} +{ +add.f16x2 r911, r902, r908; +} +mul.wide.u32 rd12, r1085, -2032597691; +shr.u64 rd13, rd12, 39; +cvt.u32.u64 r1118, rd13; +mul.lo.s32 r1119, r1118, 243; +sub.s32 r1120, r1085, r1119; +shl.b32 r1121, r1120, 3; +add.s32 r1122, r1086, r1121; +cvt.rn.f32.u32 f92, r1118; +mul.f32 f93, f92, 0f3F32B8C2; +cos.approx.f32 f65, f93; +sin.approx.f32 f94, f93; +neg.f32 f66, f94; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f65; +cvt.rn.f16.f32 
high, f66; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r919, {high, high}; +} +{ +mul.f16x2 r921, r893, r919; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r857, r917, r924; +} +{ +mul.f16x2 r930, r857, r919; +} +{ +fma.rn.f16x2 r933, r893, r917, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r937, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r939, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r941, {low, high}; +} +{ +mul.f16x2 r942, r939, r941; +} +{ +mul.f16x2 r945, r914, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r948, {high, low}; +} +{ +fma.rn.f16x2 r950, r942, r948, r945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r950; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r950; +mov.b32 r956, {high, high}; +} +{ +mul.f16x2 r958, r911, r956; +} +{ +neg.f16x2 r961, r958; +} +{ +fma.rn.f16x2 r963, r875, r954, r961; +} +{ +mul.f16x2 r967, r875, r956; +} +{ +fma.rn.f16x2 r970, r911, r954, r967; +} +barrier.sync 0; +mad.lo.s32 r1123, r1118, 5832, r1122; +st.shared.u32 [r1123], r833; +st.shared.u32 [r1123+4], r839; +st.shared.u32 [r1123+1944], r926; +st.shared.u32 [r1123+1948], r933; +st.shared.u32 [r1123+3888], r963; +st.shared.u32 [r1123+3892], r970; +barrier.sync 0; +ld.shared.u32 r999, [r1089]; +ld.shared.u32 r1005, [r1089+4]; +ld.shared.u32 r996, [r1089+5832]; +ld.shared.u32 r1002, [r1089+5836]; +ld.shared.u32 r997, [r1089+11664]; +ld.shared.u32 r1003, [r1089+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r992, {low, high}; +} +{ +neg.f16x2 r993, r992; +} +{ +add.f16x2 r995, r996, r997; +} +{ 
+add.f16x2 %0, r999, r995; +} +{ +add.f16x2 r1001, r1002, r1003; +} +{ +add.f16x2 %1, r1005, r1001; +} +{ +add.f16x2 r1007, r996, r997; +} +{ +mul.f16x2 r1010, r1007, r991; +} +{ +add.f16x2 r1013, r999, r1010; +} +{ +sub.f16x2 r1016, r1002, r1003; +} +{ +mul.f16x2 r1019, r1016, r993; +} +{ +add.f16x2 %2, r1013, r1019; +} +{ +add.f16x2 r1025, r996, r997; +} +{ +mul.f16x2 r1028, r1025, r991; +} +{ +add.f16x2 r1031, r999, r1028; +} +{ +sub.f16x2 r1034, r1002, r1003; +} +{ +mul.f16x2 r1037, r1034, r993; +} +{ +sub.f16x2 %4, r1031, r1037; +} +{ +add.f16x2 r1043, r1002, r1003; +} +{ +mul.f16x2 r1046, r1043, r991; +} +{ +add.f16x2 r1049, r1005, r1046; +} +{ +sub.f16x2 r1052, r996, r997; +} +{ +mul.f16x2 r1055, r1052, r993; +} +{ +sub.f16x2 %3, r1049, r1055; +} +{ +add.f16x2 r1061, r1002, r1003; +} +{ +mul.f16x2 r1064, r1061, r991; +} +{ +add.f16x2 r1067, r1005, r1064; +} +{ +sub.f16x2 r1070, r996, r997; +} +{ +mul.f16x2 r1073, r1070, r993; +} +{ +add.f16x2 %5, r1067, r1073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<895, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<95>; +.reg .b32 r<1124>; +.reg .b64 rd<14>; +mov.u32 r1079, %tid.y; +mov.u32 r1080, %6; +mad.lo.s32 r1081, r1079, 8748, r1080; +mov.u32 r1082, %tid.x; +mov.f32 f74, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1, {low, high}; +} +mov.f32 f76, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r2, {low, high}; +} +{ 
+neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r1082, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r1083, rd3; +mul.lo.s32 r1084, r1083, 729; +sub.s32 r1085, r1082, r1084; +mad.lo.s32 r1086, r1083, 8748, r1081; +cvt.rn.f32.u32 f77, r1085; +mul.f32 f78, f77, 0f3B3C4870; +cos.approx.f32 f5, f78; +sin.approx.f32 f79, f78; +neg.f32 f6, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 
r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r1087, r1085, 12, r1086; +st.shared.u32 [r1087], r8; +st.shared.u32 [r1087+4], r101; +st.shared.u32 [r1087+8], r138; +barrier.sync 0; +shl.b32 r1088, r1085, 3; +sub.s32 r1089, r1087, r1088; +ld.shared.u32 r174, [r1089]; +ld.shared.u32 r171, [r1089+2916]; +ld.shared.u32 r172, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1087], r14; +st.shared.u32 [r1087+4], r108; +st.shared.u32 [r1087+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r1089]; +ld.shared.u32 r177, [r1089+2916]; +ld.shared.u32 r178, [r1089+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ 
+sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r1085, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r1090, rd5; +mul.lo.s32 r1091, r1090, 3; +sub.s32 r1092, r1085, r1091; +shl.b32 r1093, r1092, 2; +add.s32 r1094, r1086, r1093; +cvt.rn.f32.u32 f80, r1090; +mul.f32 f81, f80, 0f3C0D3654; +cos.approx.f32 f17, f81; +sin.approx.f32 f82, f81; +neg.f32 f18, f82; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r1095, r1090, 36, r1094; 
+st.shared.u32 [r1095], r173; +st.shared.u32 [r1095+12], r266; +st.shared.u32 [r1095+24], r303; +barrier.sync 0; +ld.shared.u32 r339, [r1089]; +ld.shared.u32 r336, [r1089+2916]; +ld.shared.u32 r337, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1095], r179; +st.shared.u32 [r1095+12], r273; +st.shared.u32 [r1095+24], r310; +barrier.sync 0; +ld.shared.u32 r345, [r1089]; +ld.shared.u32 r342, [r1089+2916]; +ld.shared.u32 r343, [r1089+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r1085, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r1096, rd7; +mul.lo.s32 r1097, r1096, 9; +sub.s32 r1098, r1085, r1097; +shl.b32 r1099, r1098, 2; +add.s32 r1100, r1086, r1099; +cvt.rn.f32.u32 f83, r1096; +mul.f32 f84, f83, 0f3CD3D17E; +cos.approx.f32 f29, f84; +sin.approx.f32 f85, f84; +neg.f32 f30, f85; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r1101, r1096, 108, r1100; +st.shared.u32 [r1101], r338; +st.shared.u32 [r1101+36], r431; +st.shared.u32 [r1101+72], r468; +barrier.sync 0; +ld.shared.u32 r504, [r1089]; +ld.shared.u32 r501, [r1089+2916]; +ld.shared.u32 r502, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1101], r344; +st.shared.u32 [r1101+36], r438; +st.shared.u32 [r1101+72], r475; +barrier.sync 0; +ld.shared.u32 r510, [r1089]; +ld.shared.u32 r507, [r1089+2916]; +ld.shared.u32 r508, [r1089+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r497, {low, 
high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r1085, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r1102, rd9; +sub.s32 r1103, r1085, r1102; +shr.u32 r1104, r1103, 1; +add.s32 r1105, r1104, r1102; +shr.u32 r1106, r1105, 4; +mul.lo.s32 r1107, r1106, 27; +sub.s32 r1108, r1085, r1107; +shl.b32 r1109, r1108, 2; +add.s32 r1110, r1086, r1109; +cvt.rn.f32.u32 f86, r1106; +mul.f32 f87, f86, 0f3D9EDD1F; +cos.approx.f32 f41, f87; +sin.approx.f32 f88, f87; +neg.f32 f42, f88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r1111, r1106, 324, r1110; +st.shared.u32 [r1111], r503; +st.shared.u32 [r1111+108], r596; +st.shared.u32 [r1111+216], r633; +barrier.sync 0; +ld.shared.u32 r669, [r1089]; +ld.shared.u32 r666, [r1089+2916]; +ld.shared.u32 r667, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1111], r509; +st.shared.u32 [r1111+108], r603; +st.shared.u32 [r1111+216], r640; +barrier.sync 0; +ld.shared.u32 r675, [r1089]; +ld.shared.u32 r672, [r1089+2916]; +ld.shared.u32 r673, [r1089+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 r674, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 r692, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ 
+mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 r710, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 r725, r722, r663; +} +{ +sub.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 r746, r737, r743; +} +mul.wide.u32 rd10, r1085, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r1112, rd11; +mul.lo.s32 r1113, r1112, 81; +sub.s32 r1114, r1085, r1113; +shl.b32 r1115, r1114, 2; +add.s32 r1116, r1086, r1115; +cvt.rn.f32.u32 f89, r1112; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f53, f90; +sin.approx.f32 f91, f90; +neg.f32 f54, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r749, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r754, {high, high}; +} +{ +mul.f16x2 r756, r728, r754; +} +{ +neg.f16x2 r759, r756; +} +{ +fma.rn.f16x2 r761, r692, r752, r759; +} +{ +mul.f16x2 r765, r692, r754; +} +{ +fma.rn.f16x2 r768, r728, r752, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r776, {low, high}; +} +{ +mul.f16x2 r777, r774, r776; +} +{ +mul.f16x2 r780, r749, r772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r783, {high, low}; +} +{ +fma.rn.f16x2 r785, r777, r783, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r791, {high, high}; +} +{ +mul.f16x2 r793, r746, r791; +} +{ +neg.f16x2 r796, r793; +} +{ 
+fma.rn.f16x2 r798, r710, r789, r796; +} +{ +mul.f16x2 r802, r710, r791; +} +{ +fma.rn.f16x2 r805, r746, r789, r802; +} +barrier.sync 0; +mad.lo.s32 r1117, r1112, 972, r1116; +st.shared.u32 [r1117], r668; +st.shared.u32 [r1117+324], r761; +st.shared.u32 [r1117+648], r798; +barrier.sync 0; +ld.shared.u32 r834, [r1089]; +ld.shared.u32 r831, [r1089+2916]; +ld.shared.u32 r832, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1117], r674; +st.shared.u32 [r1117+324], r768; +st.shared.u32 [r1117+648], r805; +barrier.sync 0; +ld.shared.u32 r840, [r1089]; +ld.shared.u32 r837, [r1089+2916]; +ld.shared.u32 r838, [r1089+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r827, {low, high}; +} +{ +neg.f16x2 r828, r827; +} +{ +add.f16x2 r830, r831, r832; +} +{ +add.f16x2 r833, r834, r830; +} +{ +add.f16x2 r836, r837, r838; +} +{ +add.f16x2 r839, r840, r836; +} +{ +add.f16x2 r842, r831, r832; +} +{ +mul.f16x2 r845, r842, r826; +} +{ +add.f16x2 r848, r834, r845; +} +{ +sub.f16x2 r851, r837, r838; +} +{ +mul.f16x2 r854, r851, r828; +} +{ +add.f16x2 r857, r848, r854; +} +{ +add.f16x2 r860, r831, r832; +} +{ +mul.f16x2 r863, r860, r826; +} +{ +add.f16x2 r866, r834, r863; +} +{ +sub.f16x2 r869, r837, r838; +} +{ +mul.f16x2 r872, r869, r828; +} +{ +sub.f16x2 r875, r866, r872; +} +{ +add.f16x2 r878, r837, r838; +} +{ +mul.f16x2 r881, r878, r826; +} +{ +add.f16x2 r884, r840, r881; +} +{ +sub.f16x2 r887, r831, r832; +} +{ +mul.f16x2 r890, r887, r828; +} +{ +sub.f16x2 r893, r884, r890; +} +{ +add.f16x2 r896, r837, r838; +} +{ +mul.f16x2 r899, r896, r826; +} +{ +add.f16x2 r902, r840, r899; +} +{ +sub.f16x2 r905, r831, r832; +} +{ +mul.f16x2 r908, r905, r828; +} +{ +add.f16x2 r911, r902, r908; +} +mul.wide.u32 rd12, r1085, -2032597691; +shr.u64 rd13, rd12, 39; +cvt.u32.u64 r1118, rd13; +mul.lo.s32 r1119, r1118, 243; +sub.s32 r1120, r1085, 
r1119; +shl.b32 r1121, r1120, 2; +add.s32 r1122, r1086, r1121; +cvt.rn.f32.u32 f92, r1118; +mul.f32 f93, f92, 0f3F32B8C2; +cos.approx.f32 f65, f93; +sin.approx.f32 f94, f93; +neg.f32 f66, f94; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f65; +cvt.rn.f16.f32 high, f66; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r917, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r919, {high, high}; +} +{ +mul.f16x2 r921, r893, r919; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r857, r917, r924; +} +{ +mul.f16x2 r930, r857, r919; +} +{ +fma.rn.f16x2 r933, r893, r917, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r937, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r939, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r941, {low, high}; +} +{ +mul.f16x2 r942, r939, r941; +} +{ +mul.f16x2 r945, r914, r937; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r914; +mov.b32 r948, {high, low}; +} +{ +fma.rn.f16x2 r950, r942, r948, r945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r950; +mov.b32 r954, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r950; +mov.b32 r956, {high, high}; +} +{ +mul.f16x2 r958, r911, r956; +} +{ +neg.f16x2 r961, r958; +} +{ +fma.rn.f16x2 r963, r875, r954, r961; +} +{ +mul.f16x2 r967, r875, r956; +} +{ +fma.rn.f16x2 r970, r911, r954, r967; +} +barrier.sync 0; +mad.lo.s32 r1123, r1118, 2916, r1122; +st.shared.u32 [r1123], r833; +st.shared.u32 [r1123+972], r926; +st.shared.u32 [r1123+1944], r963; +barrier.sync 0; +ld.shared.u32 r999, [r1089]; +ld.shared.u32 r996, [r1089+2916]; +ld.shared.u32 r997, [r1089+5832]; +barrier.sync 0; +st.shared.u32 [r1123], r839; +st.shared.u32 [r1123+972], r933; +st.shared.u32 [r1123+1944], r970; +barrier.sync 0; +ld.shared.u32 r1005, [r1089]; +ld.shared.u32 r1002, [r1089+2916]; +ld.shared.u32 r1003, [r1089+5832]; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r992, {low, high}; +} +{ +neg.f16x2 r993, r992; +} +{ +add.f16x2 r995, r996, r997; +} +{ +add.f16x2 %0, r999, r995; +} +{ +add.f16x2 r1001, r1002, r1003; +} +{ +add.f16x2 %1, r1005, r1001; +} +{ +add.f16x2 r1007, r996, r997; +} +{ +mul.f16x2 r1010, r1007, r991; +} +{ +add.f16x2 r1013, r999, r1010; +} +{ +sub.f16x2 r1016, r1002, r1003; +} +{ +mul.f16x2 r1019, r1016, r993; +} +{ +add.f16x2 %2, r1013, r1019; +} +{ +add.f16x2 r1025, r996, r997; +} +{ +mul.f16x2 r1028, r1025, r991; +} +{ +add.f16x2 r1031, r999, r1028; +} +{ +sub.f16x2 r1034, r1002, r1003; +} +{ +mul.f16x2 r1037, r1034, r993; +} +{ +sub.f16x2 %4, r1031, r1037; +} +{ +add.f16x2 r1043, r1002, r1003; +} +{ +mul.f16x2 r1046, r1043, r991; +} +{ +add.f16x2 r1049, r1005, r1046; +} +{ +sub.f16x2 r1052, r996, r997; +} +{ +mul.f16x2 r1055, r1052, r993; +} +{ +sub.f16x2 %3, r1049, r1055; +} +{ +add.f16x2 r1061, r1002, r1003; +} +{ +mul.f16x2 r1064, r1061, r991; +} +{ +add.f16x2 r1067, r1005, r1064; +} +{ +sub.f16x2 r1070, r996, r997; +} +{ +mul.f16x2 r1073, r1070, r993; +} +{ +add.f16x2 %5, r1067, r1073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_inv.hpp.inc new file mode 100644 index 
0000000000000..15c6ac566068a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp16_inv.hpp.inc @@ -0,0 +1,31203 @@ +#ifndef CUFFTDX_FFT_2187_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_2187_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1093, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<771>; +.reg .b32 r<8520>; +.reg .b64 rd<6>; +mov.u32 r8446, %54; +mov.u32 r8519, %tid.y; +mad.lo.s32 r8447, r8519, 17496, r8446; +mov.u32 r8448, %tid.x; +mov.f32 f762, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1, {low, high}; +} +mov.f32 f764, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r88, {low, 
high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, 
r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r259, {low, high}; +} +mov.f32 f544, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r260, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r261, {low, high}; +} +mov.f32 f556, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r262, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r265, {low, high}; +} +mov.f32 f580, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} 
+{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 
r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 
r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, 
r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, 
r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, 
r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, 
r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; 
+mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} 
+{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, 
r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1794, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} +mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, 
r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 
r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, 
r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; 
+cvt.rn.f16.f32 high, f764; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 
r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 
r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r8448, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r8449, rd3; +mul.lo.s32 r8450, r8449, 81; +sub.s32 r8451, r8448, r8450; +cvt.rn.f32.u32 f765, r8451; +mul.f32 f766, f765, 0f3B3C4870; +cos.approx.f32 f309, f766; +sin.approx.f32 f767, f766; +neg.f32 f310, f767; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ 
+fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ 
+fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, 
{low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} 
+{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r8452, r8449, 17496, r8447; +barrier.sync 0; +mad.lo.s32 r8453, r8451, 216, r8452; +st.shared.v2.f32 [r8453], {r2102, r2108}; +st.shared.v2.f32 [r8453+8], {r2881, r2890}; +st.shared.v2.f32 [r8453+16], {r2918, r2927}; +st.shared.v2.f32 [r8453+24], {r2955, r2964}; +st.shared.v2.f32 [r8453+32], {r2992, r3001}; +st.shared.v2.f32 [r8453+40], {r3029, r3038}; +st.shared.v2.f32 [r8453+48], {r3066, r3075}; +st.shared.v2.f32 [r8453+56], 
{r3103, r3112}; +st.shared.v2.f32 [r8453+64], {r3140, r3149}; +st.shared.v2.f32 [r8453+72], {r3177, r3186}; +st.shared.v2.f32 [r8453+80], {r3214, r3223}; +st.shared.v2.f32 [r8453+88], {r3251, r3260}; +st.shared.v2.f32 [r8453+96], {r3288, r3297}; +st.shared.v2.f32 [r8453+104], {r3325, r3334}; +st.shared.v2.f32 [r8453+112], {r3362, r3371}; +st.shared.v2.f32 [r8453+120], {r3399, r3408}; +st.shared.v2.f32 [r8453+128], {r3436, r3445}; +st.shared.v2.f32 [r8453+136], {r3473, r3482}; +st.shared.v2.f32 [r8453+144], {r3510, r3519}; +st.shared.v2.f32 [r8453+152], {r3547, r3556}; +st.shared.v2.f32 [r8453+160], {r3584, r3593}; +st.shared.v2.f32 [r8453+168], {r3621, r3630}; +st.shared.v2.f32 [r8453+176], {r3658, r3667}; +st.shared.v2.f32 [r8453+184], {r3695, r3704}; +st.shared.v2.f32 [r8453+192], {r3732, r3741}; +st.shared.v2.f32 [r8453+200], {r3769, r3778}; +st.shared.v2.f32 [r8453+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r8454, r8451, -208, r8453; +ld.shared.u32 r3842, [r8454]; +ld.shared.u32 r3848, [r8454+4]; +ld.shared.u32 r4438, [r8454+648]; +ld.shared.u32 r4444, [r8454+652]; +ld.shared.u32 r5034, [r8454+1296]; +ld.shared.u32 r5040, [r8454+1300]; +ld.shared.u32 r3928, [r8454+1944]; +ld.shared.u32 r3934, [r8454+1948]; +ld.shared.u32 r4524, [r8454+2592]; +ld.shared.u32 r4530, [r8454+2596]; +ld.shared.u32 r5120, [r8454+3240]; +ld.shared.u32 r5126, [r8454+3244]; +ld.shared.u32 r4014, [r8454+3888]; +ld.shared.u32 r4020, [r8454+3892]; +ld.shared.u32 r4610, [r8454+4536]; +ld.shared.u32 r4616, [r8454+4540]; +ld.shared.u32 r5206, [r8454+5184]; +ld.shared.u32 r5212, [r8454+5188]; +ld.shared.u32 r3839, [r8454+5832]; +ld.shared.u32 r3845, [r8454+5836]; +ld.shared.u32 r4435, [r8454+6480]; +ld.shared.u32 r4441, [r8454+6484]; +ld.shared.u32 r5031, [r8454+7128]; +ld.shared.u32 r5037, [r8454+7132]; +ld.shared.u32 r3925, [r8454+7776]; +ld.shared.u32 r3931, [r8454+7780]; +ld.shared.u32 r4521, [r8454+8424]; +ld.shared.u32 r4527, [r8454+8428]; +ld.shared.u32 r5117, [r8454+9072]; 
+ld.shared.u32 r5123, [r8454+9076]; +ld.shared.u32 r4011, [r8454+9720]; +ld.shared.u32 r4017, [r8454+9724]; +ld.shared.u32 r4607, [r8454+10368]; +ld.shared.u32 r4613, [r8454+10372]; +ld.shared.u32 r5203, [r8454+11016]; +ld.shared.u32 r5209, [r8454+11020]; +ld.shared.u32 r3840, [r8454+11664]; +ld.shared.u32 r3846, [r8454+11668]; +ld.shared.u32 r4436, [r8454+12312]; +ld.shared.u32 r4442, [r8454+12316]; +ld.shared.u32 r5032, [r8454+12960]; +ld.shared.u32 r5038, [r8454+12964]; +ld.shared.u32 r3926, [r8454+13608]; +ld.shared.u32 r3932, [r8454+13612]; +ld.shared.u32 r4522, [r8454+14256]; +ld.shared.u32 r4528, [r8454+14260]; +ld.shared.u32 r5118, [r8454+14904]; +ld.shared.u32 r5124, [r8454+14908]; +ld.shared.u32 r4012, [r8454+15552]; +ld.shared.u32 r4018, [r8454+15556]; +ld.shared.u32 r4608, [r8454+16200]; +ld.shared.u32 r4614, [r8454+16204]; +ld.shared.u32 r5204, [r8454+16848]; +ld.shared.u32 r5210, [r8454+16852]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 
r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ 
+sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 
r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, 
r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ 
+add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; 
+} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, 
r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, 
r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ 
+mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; 
+} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; 
+cvt.rn.f16.f32 high, f762; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 
r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, 
r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, 
f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 
r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 
r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 
r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ 
+sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; 
+} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, 
r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 
r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ 
+mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r8451, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r8455, rd5; +sub.s32 r8456, r8451, r8455; +shr.u32 r8457, r8456, 1; +add.s32 r8458, r8457, r8455; +shr.u32 r8459, r8458, 4; +cvt.rn.f32.u32 f768, r8459; +mul.f32 f769, f768, 0f3D9EDD1F; +cos.approx.f32 f673, f769; +sin.approx.f32 f770, f769; +neg.f32 f674, f770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +mul.lo.s32 r8460, r8459, 27; +sub.s32 r8461, r8451, r8460; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6768, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, 
low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, r6857, r6871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, 
r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6955, {low, high}; +} +{ +mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7025, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ +mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ +mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, 
r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ +mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ +fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7295, {high, low}; +} +{ +fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ +mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; 
+mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ +mul.f16x2 r7379, r6101, r7377; +} +{ +fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ +fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ +fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, 
r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ +fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +shl.b32 r8462, r8461, 3; +add.s32 r8463, r8452, r8462; +barrier.sync 0; +mad.lo.s32 r8464, r8459, 5832, r8463; +st.shared.u32 [r8464], r5937; +st.shared.u32 [r8464+4], r5943; +st.shared.u32 [r8464+216], r6716; +st.shared.u32 [r8464+220], r6725; +st.shared.u32 [r8464+432], r6753; +st.shared.u32 [r8464+436], r6762; +st.shared.u32 [r8464+648], r6790; +st.shared.u32 [r8464+652], r6799; +st.shared.u32 [r8464+864], r6827; +st.shared.u32 [r8464+868], r6836; +st.shared.u32 [r8464+1080], r6864; +st.shared.u32 [r8464+1084], r6873; +st.shared.u32 [r8464+1296], r6901; +st.shared.u32 [r8464+1300], r6910; +st.shared.u32 [r8464+1512], r6938; +st.shared.u32 [r8464+1516], r6947; +st.shared.u32 [r8464+1728], r6975; +st.shared.u32 [r8464+1732], r6984; +st.shared.u32 [r8464+1944], r7012; +st.shared.u32 [r8464+1948], r7021; +st.shared.u32 [r8464+2160], r7049; +st.shared.u32 [r8464+2164], r7058; +st.shared.u32 [r8464+2376], r7086; +st.shared.u32 [r8464+2380], r7095; +st.shared.u32 [r8464+2592], r7123; +st.shared.u32 [r8464+2596], r7132; +st.shared.u32 [r8464+2808], r7160; +st.shared.u32 [r8464+2812], r7169; +st.shared.u32 [r8464+3024], r7197; +st.shared.u32 [r8464+3028], r7206; +st.shared.u32 [r8464+3240], r7234; +st.shared.u32 [r8464+3244], r7243; +st.shared.u32 [r8464+3456], r7271; 
+st.shared.u32 [r8464+3460], r7280; +st.shared.u32 [r8464+3672], r7308; +st.shared.u32 [r8464+3676], r7317; +st.shared.u32 [r8464+3888], r7345; +st.shared.u32 [r8464+3892], r7354; +st.shared.u32 [r8464+4104], r7382; +st.shared.u32 [r8464+4108], r7391; +st.shared.u32 [r8464+4320], r7419; +st.shared.u32 [r8464+4324], r7428; +st.shared.u32 [r8464+4536], r7456; +st.shared.u32 [r8464+4540], r7465; +st.shared.u32 [r8464+4752], r7493; +st.shared.u32 [r8464+4756], r7502; +st.shared.u32 [r8464+4968], r7530; +st.shared.u32 [r8464+4972], r7539; +st.shared.u32 [r8464+5184], r7567; +st.shared.u32 [r8464+5188], r7576; +st.shared.u32 [r8464+5400], r7604; +st.shared.u32 [r8464+5404], r7613; +st.shared.u32 [r8464+5616], r7641; +st.shared.u32 [r8464+5620], r7650; +barrier.sync 0; +ld.shared.u32 r7677, [r8454]; +ld.shared.u32 r7683, [r8454+4]; +ld.shared.u32 r7763, [r8454+648]; +ld.shared.u32 r7769, [r8454+652]; +ld.shared.u32 r7849, [r8454+1296]; +ld.shared.u32 r7855, [r8454+1300]; +ld.shared.u32 r7935, [r8454+1944]; +ld.shared.u32 r7941, [r8454+1948]; +ld.shared.u32 r8021, [r8454+2592]; +ld.shared.u32 r8027, [r8454+2596]; +ld.shared.u32 r8107, [r8454+3240]; +ld.shared.u32 r8113, [r8454+3244]; +ld.shared.u32 r8193, [r8454+3888]; +ld.shared.u32 r8199, [r8454+3892]; +ld.shared.u32 r8279, [r8454+4536]; +ld.shared.u32 r8285, [r8454+4540]; +ld.shared.u32 r8365, [r8454+5184]; +ld.shared.u32 r8371, [r8454+5188]; +ld.shared.u32 r7674, [r8454+5832]; +ld.shared.u32 r7680, [r8454+5836]; +ld.shared.u32 r7760, [r8454+6480]; +ld.shared.u32 r7766, [r8454+6484]; +ld.shared.u32 r7846, [r8454+7128]; +ld.shared.u32 r7852, [r8454+7132]; +ld.shared.u32 r7932, [r8454+7776]; +ld.shared.u32 r7938, [r8454+7780]; +ld.shared.u32 r8018, [r8454+8424]; +ld.shared.u32 r8024, [r8454+8428]; +ld.shared.u32 r8104, [r8454+9072]; +ld.shared.u32 r8110, [r8454+9076]; +ld.shared.u32 r8190, [r8454+9720]; +ld.shared.u32 r8196, [r8454+9724]; +ld.shared.u32 r8276, [r8454+10368]; +ld.shared.u32 r8282, [r8454+10372]; 
+ld.shared.u32 r8362, [r8454+11016]; +ld.shared.u32 r8368, [r8454+11020]; +ld.shared.u32 r7675, [r8454+11664]; +ld.shared.u32 r7681, [r8454+11668]; +ld.shared.u32 r7761, [r8454+12312]; +ld.shared.u32 r7767, [r8454+12316]; +ld.shared.u32 r7847, [r8454+12960]; +ld.shared.u32 r7853, [r8454+12964]; +ld.shared.u32 r7933, [r8454+13608]; +ld.shared.u32 r7939, [r8454+13612]; +ld.shared.u32 r8019, [r8454+14256]; +ld.shared.u32 r8025, [r8454+14260]; +ld.shared.u32 r8105, [r8454+14904]; +ld.shared.u32 r8111, [r8454+14908]; +ld.shared.u32 r8191, [r8454+15552]; +ld.shared.u32 r8197, [r8454+15556]; +ld.shared.u32 r8277, [r8454+16200]; +ld.shared.u32 r8283, [r8454+16204]; +ld.shared.u32 r8363, [r8454+16848]; +ld.shared.u32 r8369, [r8454+16852]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 %0, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 %1, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 %18, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 %36, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 %19, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ +mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 %37, 
r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7758, {low, high}; +} +{ +add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 %2, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 %3, r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 %20, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 %38, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 %21, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 %39, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 %4, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 %5, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 %22, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; 
+} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 %40, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 %23, r7899, r7905; +} +{ +add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} +{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, r7844; +} +{ +add.f16x2 %41, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7930, {low, high}; +} +{ +add.f16x2 r7931, r7932, r7933; +} +{ +add.f16x2 %6, r7935, r7931; +} +{ +add.f16x2 r7937, r7938, r7939; +} +{ +add.f16x2 %7, r7941, r7937; +} +{ +add.f16x2 r7943, r7932, r7933; +} +{ +mul.f16x2 r7946, r7943, r7929; +} +{ +add.f16x2 r7949, r7935, r7946; +} +{ +sub.f16x2 r7952, r7938, r7939; +} +{ +mul.f16x2 r7955, r7952, r7930; +} +{ +add.f16x2 %24, r7949, r7955; +} +{ +add.f16x2 r7961, r7932, r7933; +} +{ +mul.f16x2 r7964, r7961, r7929; +} +{ +add.f16x2 r7967, r7935, r7964; +} +{ +sub.f16x2 r7970, r7938, r7939; +} +{ +mul.f16x2 r7973, r7970, r7930; +} +{ +sub.f16x2 %42, r7967, r7973; +} +{ +add.f16x2 r7979, r7938, r7939; +} +{ +mul.f16x2 r7982, r7979, r7929; +} +{ +add.f16x2 r7985, r7941, r7982; +} +{ +sub.f16x2 r7988, r7932, r7933; +} +{ +mul.f16x2 r7991, r7988, r7930; +} +{ +sub.f16x2 %25, r7985, r7991; +} +{ +add.f16x2 r7997, r7938, r7939; +} +{ +mul.f16x2 r8000, r7997, r7929; +} +{ +add.f16x2 r8003, r7941, r8000; +} +{ +sub.f16x2 r8006, r7932, r7933; +} +{ +mul.f16x2 r8009, r8006, r7930; +} +{ +add.f16x2 %43, r8003, r8009; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8016, {low, high}; +} +{ +add.f16x2 r8017, r8018, r8019; +} +{ +add.f16x2 %8, r8021, r8017; +} +{ +add.f16x2 r8023, r8024, r8025; +} +{ +add.f16x2 %9, r8027, r8023; +} +{ +add.f16x2 r8029, r8018, r8019; +} +{ +mul.f16x2 r8032, r8029, r8015; +} +{ +add.f16x2 r8035, r8021, r8032; +} +{ +sub.f16x2 r8038, r8024, r8025; +} +{ +mul.f16x2 r8041, r8038, r8016; +} +{ +add.f16x2 %26, r8035, r8041; +} +{ +add.f16x2 r8047, r8018, r8019; +} +{ +mul.f16x2 r8050, r8047, r8015; +} +{ +add.f16x2 r8053, r8021, r8050; +} +{ +sub.f16x2 r8056, r8024, r8025; +} +{ +mul.f16x2 r8059, r8056, r8016; +} +{ +sub.f16x2 %44, r8053, r8059; +} +{ +add.f16x2 r8065, r8024, r8025; +} +{ +mul.f16x2 r8068, r8065, r8015; +} +{ +add.f16x2 r8071, r8027, r8068; +} +{ +sub.f16x2 r8074, r8018, r8019; +} +{ +mul.f16x2 r8077, r8074, r8016; +} +{ +sub.f16x2 %27, r8071, r8077; +} +{ +add.f16x2 r8083, r8024, r8025; +} +{ +mul.f16x2 r8086, r8083, r8015; +} +{ +add.f16x2 r8089, r8027, r8086; +} +{ +sub.f16x2 r8092, r8018, r8019; +} +{ +mul.f16x2 r8095, r8092, r8016; +} +{ +add.f16x2 %45, r8089, r8095; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8102, {low, high}; +} +{ +add.f16x2 r8103, r8104, r8105; +} +{ +add.f16x2 %10, r8107, r8103; +} +{ +add.f16x2 r8109, r8110, r8111; +} +{ +add.f16x2 %11, r8113, r8109; +} +{ +add.f16x2 r8115, r8104, r8105; +} +{ +mul.f16x2 r8118, r8115, r8101; +} +{ +add.f16x2 r8121, r8107, r8118; +} +{ +sub.f16x2 r8124, r8110, r8111; +} +{ +mul.f16x2 r8127, r8124, r8102; +} +{ +add.f16x2 %28, r8121, r8127; +} +{ +add.f16x2 r8133, r8104, r8105; +} +{ +mul.f16x2 r8136, r8133, r8101; +} +{ +add.f16x2 r8139, r8107, r8136; +} +{ +sub.f16x2 r8142, r8110, r8111; +} +{ +mul.f16x2 r8145, r8142, r8102; +} +{ +sub.f16x2 %46, r8139, r8145; +} +{ +add.f16x2 r8151, r8110, r8111; +} +{ +mul.f16x2 r8154, 
r8151, r8101; +} +{ +add.f16x2 r8157, r8113, r8154; +} +{ +sub.f16x2 r8160, r8104, r8105; +} +{ +mul.f16x2 r8163, r8160, r8102; +} +{ +sub.f16x2 %29, r8157, r8163; +} +{ +add.f16x2 r8169, r8110, r8111; +} +{ +mul.f16x2 r8172, r8169, r8101; +} +{ +add.f16x2 r8175, r8113, r8172; +} +{ +sub.f16x2 r8178, r8104, r8105; +} +{ +mul.f16x2 r8181, r8178, r8102; +} +{ +add.f16x2 %47, r8175, r8181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8188, {low, high}; +} +{ +add.f16x2 r8189, r8190, r8191; +} +{ +add.f16x2 %12, r8193, r8189; +} +{ +add.f16x2 r8195, r8196, r8197; +} +{ +add.f16x2 %13, r8199, r8195; +} +{ +add.f16x2 r8201, r8190, r8191; +} +{ +mul.f16x2 r8204, r8201, r8187; +} +{ +add.f16x2 r8207, r8193, r8204; +} +{ +sub.f16x2 r8210, r8196, r8197; +} +{ +mul.f16x2 r8213, r8210, r8188; +} +{ +add.f16x2 %30, r8207, r8213; +} +{ +add.f16x2 r8219, r8190, r8191; +} +{ +mul.f16x2 r8222, r8219, r8187; +} +{ +add.f16x2 r8225, r8193, r8222; +} +{ +sub.f16x2 r8228, r8196, r8197; +} +{ +mul.f16x2 r8231, r8228, r8188; +} +{ +sub.f16x2 %48, r8225, r8231; +} +{ +add.f16x2 r8237, r8196, r8197; +} +{ +mul.f16x2 r8240, r8237, r8187; +} +{ +add.f16x2 r8243, r8199, r8240; +} +{ +sub.f16x2 r8246, r8190, r8191; +} +{ +mul.f16x2 r8249, r8246, r8188; +} +{ +sub.f16x2 %31, r8243, r8249; +} +{ +add.f16x2 r8255, r8196, r8197; +} +{ +mul.f16x2 r8258, r8255, r8187; +} +{ +add.f16x2 r8261, r8199, r8258; +} +{ +sub.f16x2 r8264, r8190, r8191; +} +{ +mul.f16x2 r8267, r8264, r8188; +} +{ +add.f16x2 %49, r8261, r8267; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8273, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8274, {low, high}; +} +{ +add.f16x2 r8275, r8276, r8277; +} +{ +add.f16x2 %14, r8279, r8275; +} +{ +add.f16x2 r8281, r8282, 
r8283; +} +{ +add.f16x2 %15, r8285, r8281; +} +{ +add.f16x2 r8287, r8276, r8277; +} +{ +mul.f16x2 r8290, r8287, r8273; +} +{ +add.f16x2 r8293, r8279, r8290; +} +{ +sub.f16x2 r8296, r8282, r8283; +} +{ +mul.f16x2 r8299, r8296, r8274; +} +{ +add.f16x2 %32, r8293, r8299; +} +{ +add.f16x2 r8305, r8276, r8277; +} +{ +mul.f16x2 r8308, r8305, r8273; +} +{ +add.f16x2 r8311, r8279, r8308; +} +{ +sub.f16x2 r8314, r8282, r8283; +} +{ +mul.f16x2 r8317, r8314, r8274; +} +{ +sub.f16x2 %50, r8311, r8317; +} +{ +add.f16x2 r8323, r8282, r8283; +} +{ +mul.f16x2 r8326, r8323, r8273; +} +{ +add.f16x2 r8329, r8285, r8326; +} +{ +sub.f16x2 r8332, r8276, r8277; +} +{ +mul.f16x2 r8335, r8332, r8274; +} +{ +sub.f16x2 %33, r8329, r8335; +} +{ +add.f16x2 r8341, r8282, r8283; +} +{ +mul.f16x2 r8344, r8341, r8273; +} +{ +add.f16x2 r8347, r8285, r8344; +} +{ +sub.f16x2 r8350, r8276, r8277; +} +{ +mul.f16x2 r8353, r8350, r8274; +} +{ +add.f16x2 %51, r8347, r8353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8360, {low, high}; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 %16, r8365, r8361; +} +{ +add.f16x2 r8367, r8368, r8369; +} +{ +add.f16x2 %17, r8371, r8367; +} +{ +add.f16x2 r8373, r8362, r8363; +} +{ +mul.f16x2 r8376, r8373, r8359; +} +{ +add.f16x2 r8379, r8365, r8376; +} +{ +sub.f16x2 r8382, r8368, r8369; +} +{ +mul.f16x2 r8385, r8382, r8360; +} +{ +add.f16x2 %34, r8379, r8385; +} +{ +add.f16x2 r8391, r8362, r8363; +} +{ +mul.f16x2 r8394, r8391, r8359; +} +{ +add.f16x2 r8397, r8365, r8394; +} +{ +sub.f16x2 r8400, r8368, r8369; +} +{ +mul.f16x2 r8403, r8400, r8360; +} +{ +sub.f16x2 %52, r8397, r8403; +} +{ +add.f16x2 r8409, r8368, r8369; +} +{ +mul.f16x2 r8412, r8409, r8359; +} +{ +add.f16x2 r8415, r8371, r8412; +} +{ +sub.f16x2 r8418, r8362, r8363; +} +{ +mul.f16x2 r8421, r8418, r8360; +} +{ +sub.f16x2 %35, r8415, r8421; +} 
+{ +add.f16x2 r8427, r8368, r8369; +} +{ +mul.f16x2 r8430, r8427, r8359; +} +{ +add.f16x2 r8433, r8371, r8430; +} +{ +sub.f16x2 r8436, r8362, r8363; +} +{ +mul.f16x2 r8439, r8436, r8360; +} +{ +add.f16x2 %53, r8433, r8439; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): 
"r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1092, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<250>; +.reg .b32 r<2967>; +.reg .b64 rd<8>; +mov.u32 r2944, %tid.y; +mov.u32 r2945, %18; 
+mad.lo.s32 r2946, r2944, 17496, r2945; +mov.u32 r2947, %tid.x; +mov.f32 f238, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1, {low, high}; +} +mov.f32 f240, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, 
%34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r259, {low, high}; +} +mov.f32 f168, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r260, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r261, {low, high}; +} +mov.f32 f172, 0f3F7C1C5C; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r262, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ 
+sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; 
+} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2947, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r2948, rd3; +mul.lo.s32 r2949, r2948, 243; +sub.s32 r2950, r2947, r2949; +cvt.rn.f32.u32 f241, r2950; +mul.f32 f242, f241, 0f3B3C4870; +cos.approx.f32 f57, f242; +sin.approx.f32 f243, f242; +neg.f32 f58, f243; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, 
r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r2951, r2948, 17496, r2946; +barrier.sync 0; +mad.lo.s32 r2952, r2950, 72, r2951; +st.shared.v2.f32 [r2952], {r344, r350}; +st.shared.v2.f32 [r2952+8], {r607, r616}; +st.shared.v2.f32 [r2952+16], {r644, r653}; +st.shared.v2.f32 [r2952+24], {r681, r690}; +st.shared.v2.f32 [r2952+32], {r718, r727}; +st.shared.v2.f32 [r2952+40], {r755, r764}; +st.shared.v2.f32 [r2952+48], {r792, r801}; +st.shared.v2.f32 [r2952+56], {r829, r838}; +st.shared.v2.f32 [r2952+64], {r866, r875}; +barrier.sync 0; +shl.b32 r2953, r2950, 6; +sub.s32 r2954, r2952, r2953; +ld.shared.u32 r902, [r2954]; +ld.shared.u32 r908, [r2954+4]; +ld.shared.u32 r988, [r2954+1944]; 
+ld.shared.u32 r994, [r2954+1948]; +ld.shared.u32 r1074, [r2954+3888]; +ld.shared.u32 r1080, [r2954+3892]; +ld.shared.u32 r899, [r2954+5832]; +ld.shared.u32 r905, [r2954+5836]; +ld.shared.u32 r985, [r2954+7776]; +ld.shared.u32 r991, [r2954+7780]; +ld.shared.u32 r1071, [r2954+9720]; +ld.shared.u32 r1077, [r2954+9724]; +ld.shared.u32 r900, [r2954+11664]; +ld.shared.u32 r906, [r2954+11668]; +ld.shared.u32 r986, [r2954+13608]; +ld.shared.u32 r992, [r2954+13612]; +ld.shared.u32 r1072, [r2954+15552]; +ld.shared.u32 r1078, [r2954+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ 
+add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, 
r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, 
r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} 
+{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2950, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2955, rd5; +cvt.rn.f32.u32 f244, r2955; +mul.f32 f245, f244, 0f3CD3D17E; +cos.approx.f32 f133, f245; +sin.approx.f32 f246, f245; +neg.f32 f134, f246; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +mul.lo.s32 r2956, r2955, 9; +sub.s32 r2957, r2950, r2956; +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, 
r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 
r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +shl.b32 r2958, r2957, 3; +add.s32 r2959, r2951, r2958; +barrier.sync 0; +mad.lo.s32 r2960, r2955, 648, r2959; +st.shared.u32 [r2960], r1239; +st.shared.u32 [r2960+4], r1245; +st.shared.u32 [r2960+72], r1502; +st.shared.u32 [r2960+76], r1511; +st.shared.u32 [r2960+144], r1539; +st.shared.u32 [r2960+148], r1548; +st.shared.u32 [r2960+216], r1576; +st.shared.u32 [r2960+220], r1585; +st.shared.u32 [r2960+288], r1613; +st.shared.u32 [r2960+292], r1622; +st.shared.u32 [r2960+360], r1650; +st.shared.u32 [r2960+364], r1659; +st.shared.u32 [r2960+432], r1687; +st.shared.u32 [r2960+436], r1696; +st.shared.u32 [r2960+504], r1724; +st.shared.u32 [r2960+508], r1733; +st.shared.u32 [r2960+576], r1761; +st.shared.u32 [r2960+580], r1770; +barrier.sync 0; +ld.shared.u32 r1797, [r2954]; +ld.shared.u32 r1803, [r2954+4]; +ld.shared.u32 r1883, [r2954+1944]; +ld.shared.u32 r1889, [r2954+1948]; +ld.shared.u32 r1969, [r2954+3888]; +ld.shared.u32 r1975, [r2954+3892]; +ld.shared.u32 r1794, [r2954+5832]; +ld.shared.u32 r1800, [r2954+5836]; +ld.shared.u32 r1880, [r2954+7776]; +ld.shared.u32 r1886, [r2954+7780]; +ld.shared.u32 r1966, [r2954+9720]; +ld.shared.u32 r1972, [r2954+9724]; +ld.shared.u32 r1795, [r2954+11664]; +ld.shared.u32 r1801, [r2954+11668]; +ld.shared.u32 r1881, [r2954+13608]; +ld.shared.u32 r1887, [r2954+13612]; +ld.shared.u32 r1967, [r2954+15552]; +ld.shared.u32 r1973, [r2954+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} 
+{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 
high, f170; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ +fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2130, {low, high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 r2134, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 r2140, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ +sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 r2176, 
r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 r2185, r1802, r2182; +} +{ +sub.f16x2 r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 r2220, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 r2226, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 r2262, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ +sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 r2280, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 
r2303, r2103, r2119; +} +{ +add.f16x2 r2306, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 r2312, r1874, r2309; +} +{ +add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 r2348, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 r2384, r2375, r2381; +} +mul.wide.u32 rd6, r2950, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r2961, rd7; +cvt.rn.f32.u32 f247, r2961; +mul.f32 f248, f247, 0f3E6E4BAE; +cos.approx.f32 f209, f248; +sin.approx.f32 f249, f248; +neg.f32 f210, f249; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2387, {low, high}; +} +mul.lo.s32 r2962, r2961, 81; +sub.s32 r2963, r2950, r2962; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2390, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2392, {high, high}; +} +{ +mul.f16x2 r2394, r2226, r2392; +} +{ +fma.rn.f16x2 r2397, r2220, r2390, r2394; +} +{ +mul.f16x2 r2401, r2220, r2392; +} +{ +neg.f16x2 r2404, r2401; +} +{ +fma.rn.f16x2 r2406, r2226, r2390, r2404; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2410, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2412, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2414, {low, high}; +} +{ +mul.f16x2 r2415, r2412, r2414; +} +{ +mul.f16x2 r2418, r2387, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2421, {high, low}; +} +{ +fma.rn.f16x2 r2423, r2415, r2421, r2418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2427, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2429, {high, high}; +} +{ +mul.f16x2 r2431, r2312, r2429; +} +{ +fma.rn.f16x2 r2434, r2306, r2427, r2431; +} +{ +mul.f16x2 r2438, r2306, r2429; +} +{ +neg.f16x2 r2441, r2438; +} +{ +fma.rn.f16x2 r2443, r2312, r2427, r2441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2447, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2449, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2451, {low, high}; +} +{ +mul.f16x2 r2452, r2449, r2451; +} +{ +mul.f16x2 r2455, r2423, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2458, {high, low}; +} +{ +fma.rn.f16x2 r2460, r2452, r2458, r2455; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2464, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2466, {high, high}; +} +{ +mul.f16x2 r2468, r2194, r2466; +} +{ +fma.rn.f16x2 r2471, r2158, r2464, r2468; +} +{ +mul.f16x2 r2475, r2158, r2466; +} +{ +neg.f16x2 r2478, r2475; +} +{ +fma.rn.f16x2 r2480, r2194, r2464, r2478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2484, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2486, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2488, {low, high}; +} +{ +mul.f16x2 r2489, r2486, r2488; +} +{ +mul.f16x2 r2492, r2460, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2495, {high, 
low}; +} +{ +fma.rn.f16x2 r2497, r2489, r2495, r2492; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2501, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2503, {high, high}; +} +{ +mul.f16x2 r2505, r2280, r2503; +} +{ +fma.rn.f16x2 r2508, r2244, r2501, r2505; +} +{ +mul.f16x2 r2512, r2244, r2503; +} +{ +neg.f16x2 r2515, r2512; +} +{ +fma.rn.f16x2 r2517, r2280, r2501, r2515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2521, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2523, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2525, {low, high}; +} +{ +mul.f16x2 r2526, r2523, r2525; +} +{ +mul.f16x2 r2529, r2497, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2532, {high, low}; +} +{ +fma.rn.f16x2 r2534, r2526, r2532, r2529; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2538, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2540, {high, high}; +} +{ +mul.f16x2 r2542, r2366, r2540; +} +{ +fma.rn.f16x2 r2545, r2330, r2538, r2542; +} +{ +mul.f16x2 r2549, r2330, r2540; +} +{ +neg.f16x2 r2552, r2549; +} +{ +fma.rn.f16x2 r2554, r2366, r2538, r2552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2558, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2560, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2562, {low, high}; +} +{ +mul.f16x2 r2563, r2560, r2562; +} +{ +mul.f16x2 r2566, r2534, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2569, {high, low}; +} +{ +fma.rn.f16x2 r2571, r2563, r2569, r2566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2575, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2577, {high, high}; +} +{ +mul.f16x2 r2579, r2212, 
r2577; +} +{ +fma.rn.f16x2 r2582, r2176, r2575, r2579; +} +{ +mul.f16x2 r2586, r2176, r2577; +} +{ +neg.f16x2 r2589, r2586; +} +{ +fma.rn.f16x2 r2591, r2212, r2575, r2589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2595, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2597, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2599, {low, high}; +} +{ +mul.f16x2 r2600, r2597, r2599; +} +{ +mul.f16x2 r2603, r2571, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2606, {high, low}; +} +{ +fma.rn.f16x2 r2608, r2600, r2606, r2603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2614, {high, high}; +} +{ +mul.f16x2 r2616, r2298, r2614; +} +{ +fma.rn.f16x2 r2619, r2262, r2612, r2616; +} +{ +mul.f16x2 r2623, r2262, r2614; +} +{ +neg.f16x2 r2626, r2623; +} +{ +fma.rn.f16x2 r2628, r2298, r2612, r2626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2634, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2636, {low, high}; +} +{ +mul.f16x2 r2637, r2634, r2636; +} +{ +mul.f16x2 r2640, r2608, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2643, {high, low}; +} +{ +fma.rn.f16x2 r2645, r2637, r2643, r2640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2651, {high, high}; +} +{ +mul.f16x2 r2653, r2384, r2651; +} +{ +fma.rn.f16x2 r2656, r2348, r2649, r2653; +} +{ +mul.f16x2 r2660, r2348, r2651; +} +{ +neg.f16x2 r2663, r2660; +} +{ +fma.rn.f16x2 r2665, r2384, r2649, r2663; +} +shl.b32 r2964, r2963, 3; +add.s32 r2965, r2951, r2964; +barrier.sync 0; +mad.lo.s32 
r2966, r2961, 5832, r2965; +st.shared.u32 [r2966], r2134; +st.shared.u32 [r2966+4], r2140; +st.shared.u32 [r2966+648], r2397; +st.shared.u32 [r2966+652], r2406; +st.shared.u32 [r2966+1296], r2434; +st.shared.u32 [r2966+1300], r2443; +st.shared.u32 [r2966+1944], r2471; +st.shared.u32 [r2966+1948], r2480; +st.shared.u32 [r2966+2592], r2508; +st.shared.u32 [r2966+2596], r2517; +st.shared.u32 [r2966+3240], r2545; +st.shared.u32 [r2966+3244], r2554; +st.shared.u32 [r2966+3888], r2582; +st.shared.u32 [r2966+3892], r2591; +st.shared.u32 [r2966+4536], r2619; +st.shared.u32 [r2966+4540], r2628; +st.shared.u32 [r2966+5184], r2656; +st.shared.u32 [r2966+5188], r2665; +barrier.sync 0; +ld.shared.u32 r2692, [r2954]; +ld.shared.u32 r2698, [r2954+4]; +ld.shared.u32 r2778, [r2954+1944]; +ld.shared.u32 r2784, [r2954+1948]; +ld.shared.u32 r2864, [r2954+3888]; +ld.shared.u32 r2870, [r2954+3892]; +ld.shared.u32 r2689, [r2954+5832]; +ld.shared.u32 r2695, [r2954+5836]; +ld.shared.u32 r2775, [r2954+7776]; +ld.shared.u32 r2781, [r2954+7780]; +ld.shared.u32 r2861, [r2954+9720]; +ld.shared.u32 r2867, [r2954+9724]; +ld.shared.u32 r2690, [r2954+11664]; +ld.shared.u32 r2696, [r2954+11668]; +ld.shared.u32 r2776, [r2954+13608]; +ld.shared.u32 r2782, [r2954+13612]; +ld.shared.u32 r2862, [r2954+15552]; +ld.shared.u32 r2868, [r2954+15556]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2686, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2687, {low, high}; +} +{ +add.f16x2 r2688, r2689, r2690; +} +{ +add.f16x2 %0, r2692, r2688; +} +{ +add.f16x2 r2694, r2695, r2696; +} +{ +add.f16x2 %1, r2698, r2694; +} +{ +add.f16x2 r2700, r2689, r2690; +} +{ +mul.f16x2 r2703, r2700, r2686; +} +{ +add.f16x2 r2706, r2692, r2703; +} +{ +sub.f16x2 r2709, r2695, r2696; +} +{ +mul.f16x2 r2712, r2709, r2687; +} +{ +add.f16x2 %6, r2706, r2712; +} +{ +add.f16x2 r2718, r2689, r2690; +} +{ +mul.f16x2 r2721, r2718, r2686; +} 
+{ +add.f16x2 r2724, r2692, r2721; +} +{ +sub.f16x2 r2727, r2695, r2696; +} +{ +mul.f16x2 r2730, r2727, r2687; +} +{ +sub.f16x2 %12, r2724, r2730; +} +{ +add.f16x2 r2736, r2695, r2696; +} +{ +mul.f16x2 r2739, r2736, r2686; +} +{ +add.f16x2 r2742, r2698, r2739; +} +{ +sub.f16x2 r2745, r2689, r2690; +} +{ +mul.f16x2 r2748, r2745, r2687; +} +{ +sub.f16x2 %7, r2742, r2748; +} +{ +add.f16x2 r2754, r2695, r2696; +} +{ +mul.f16x2 r2757, r2754, r2686; +} +{ +add.f16x2 r2760, r2698, r2757; +} +{ +sub.f16x2 r2763, r2689, r2690; +} +{ +mul.f16x2 r2766, r2763, r2687; +} +{ +add.f16x2 %13, r2760, r2766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2773, {low, high}; +} +{ +add.f16x2 r2774, r2775, r2776; +} +{ +add.f16x2 %2, r2778, r2774; +} +{ +add.f16x2 r2780, r2781, r2782; +} +{ +add.f16x2 %3, r2784, r2780; +} +{ +add.f16x2 r2786, r2775, r2776; +} +{ +mul.f16x2 r2789, r2786, r2772; +} +{ +add.f16x2 r2792, r2778, r2789; +} +{ +sub.f16x2 r2795, r2781, r2782; +} +{ +mul.f16x2 r2798, r2795, r2773; +} +{ +add.f16x2 %8, r2792, r2798; +} +{ +add.f16x2 r2804, r2775, r2776; +} +{ +mul.f16x2 r2807, r2804, r2772; +} +{ +add.f16x2 r2810, r2778, r2807; +} +{ +sub.f16x2 r2813, r2781, r2782; +} +{ +mul.f16x2 r2816, r2813, r2773; +} +{ +sub.f16x2 %14, r2810, r2816; +} +{ +add.f16x2 r2822, r2781, r2782; +} +{ +mul.f16x2 r2825, r2822, r2772; +} +{ +add.f16x2 r2828, r2784, r2825; +} +{ +sub.f16x2 r2831, r2775, r2776; +} +{ +mul.f16x2 r2834, r2831, r2773; +} +{ +sub.f16x2 %9, r2828, r2834; +} +{ +add.f16x2 r2840, r2781, r2782; +} +{ +mul.f16x2 r2843, r2840, r2772; +} +{ +add.f16x2 r2846, r2784, r2843; +} +{ +sub.f16x2 r2849, r2775, r2776; +} +{ +mul.f16x2 r2852, r2849, r2773; +} +{ +add.f16x2 %15, r2846, r2852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2858, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2859, {low, high}; +} +{ +add.f16x2 r2860, r2861, r2862; +} +{ +add.f16x2 %4, r2864, r2860; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +add.f16x2 %5, r2870, r2866; +} +{ +add.f16x2 r2872, r2861, r2862; +} +{ +mul.f16x2 r2875, r2872, r2858; +} +{ +add.f16x2 r2878, r2864, r2875; +} +{ +sub.f16x2 r2881, r2867, r2868; +} +{ +mul.f16x2 r2884, r2881, r2859; +} +{ +add.f16x2 %10, r2878, r2884; +} +{ +add.f16x2 r2890, r2861, r2862; +} +{ +mul.f16x2 r2893, r2890, r2858; +} +{ +add.f16x2 r2896, r2864, r2893; +} +{ +sub.f16x2 r2899, r2867, r2868; +} +{ +mul.f16x2 r2902, r2899, r2859; +} +{ +sub.f16x2 %16, r2896, r2902; +} +{ +add.f16x2 r2908, r2867, r2868; +} +{ +mul.f16x2 r2911, r2908, r2858; +} +{ +add.f16x2 r2914, r2870, r2911; +} +{ +sub.f16x2 r2917, r2861, r2862; +} +{ +mul.f16x2 r2920, r2917, r2859; +} +{ +sub.f16x2 %11, r2914, r2920; +} +{ +add.f16x2 r2926, r2867, r2868; +} +{ +mul.f16x2 r2929, r2926, r2858; +} +{ +add.f16x2 r2932, r2870, r2929; +} +{ +sub.f16x2 r2935, r2861, r2862; +} +{ +mul.f16x2 r2938, r2935, r2859; +} +{ +add.f16x2 %17, r2932, r2938; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), 
"r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1094, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<250>; +.reg .b32 r<2967>; +.reg .b64 rd<8>; +mov.u32 r2944, %tid.y; +mov.u32 r2945, %18; +mad.lo.s32 r2946, r2944, 8748, r2945; +mov.u32 r2947, %tid.x; +mov.f32 f238, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1, {low, high}; +} +mov.f32 f240, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; 
+mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, 
r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r259, {low, high}; +} +mov.f32 f168, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r260, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r261, {low, high}; +} +mov.f32 f172, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r262, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} 
+{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 
r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2947, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r2948, rd3; +mul.lo.s32 r2949, r2948, 243; +sub.s32 r2950, r2947, r2949; +mad.lo.s32 r2951, r2948, 8748, r2946; +cvt.rn.f32.u32 f241, r2950; +mul.f32 f242, f241, 0f3B3C4870; +cos.approx.f32 f57, f242; +sin.approx.f32 f243, f242; +neg.f32 f58, f243; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; 
+} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; 
+mad.lo.s32 r2952, r2950, 36, r2951; +st.shared.u32 [r2952], r344; +st.shared.u32 [r2952+4], r607; +st.shared.u32 [r2952+8], r644; +st.shared.u32 [r2952+12], r681; +st.shared.u32 [r2952+16], r718; +st.shared.u32 [r2952+20], r755; +st.shared.u32 [r2952+24], r792; +st.shared.u32 [r2952+28], r829; +st.shared.u32 [r2952+32], r866; +barrier.sync 0; +shl.b32 r2953, r2950, 5; +sub.s32 r2954, r2952, r2953; +ld.shared.u32 r902, [r2954]; +ld.shared.u32 r988, [r2954+972]; +ld.shared.u32 r1074, [r2954+1944]; +ld.shared.u32 r899, [r2954+2916]; +ld.shared.u32 r985, [r2954+3888]; +ld.shared.u32 r1071, [r2954+4860]; +ld.shared.u32 r900, [r2954+5832]; +ld.shared.u32 r986, [r2954+6804]; +ld.shared.u32 r1072, [r2954+7776]; +barrier.sync 0; +st.shared.u32 [r2952], r350; +st.shared.u32 [r2952+4], r616; +st.shared.u32 [r2952+8], r653; +st.shared.u32 [r2952+12], r690; +st.shared.u32 [r2952+16], r727; +st.shared.u32 [r2952+20], r764; +st.shared.u32 [r2952+24], r801; +st.shared.u32 [r2952+28], r838; +st.shared.u32 [r2952+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r2954]; +ld.shared.u32 r994, [r2954+972]; +ld.shared.u32 r1080, [r2954+1944]; +ld.shared.u32 r905, [r2954+2916]; +ld.shared.u32 r991, [r2954+3888]; +ld.shared.u32 r1077, [r2954+4860]; +ld.shared.u32 r906, [r2954+5832]; +ld.shared.u32 r992, [r2954+6804]; +ld.shared.u32 r1078, [r2954+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ 
+add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 
r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ 
+mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; 
+cvt.rn.f16.f32 high, f240; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 
r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2950, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2955, rd5; +mul.lo.s32 r2956, r2955, 9; +sub.s32 r2957, r2950, r2956; +shl.b32 r2958, r2957, 2; +add.s32 r2959, r2951, r2958; +cvt.rn.f32.u32 f244, r2955; +mul.f32 f245, f244, 0f3CD3D17E; +cos.approx.f32 f133, f245; +sin.approx.f32 f246, f245; +neg.f32 f134, f246; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 
r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; 
+mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +barrier.sync 0; +mad.lo.s32 r2960, r2955, 324, r2959; +st.shared.u32 [r2960], r1239; +st.shared.u32 [r2960+36], r1502; +st.shared.u32 [r2960+72], r1539; +st.shared.u32 [r2960+108], r1576; +st.shared.u32 [r2960+144], r1613; +st.shared.u32 [r2960+180], r1650; +st.shared.u32 [r2960+216], r1687; +st.shared.u32 [r2960+252], r1724; +st.shared.u32 [r2960+288], r1761; +barrier.sync 0; +ld.shared.u32 r1797, [r2954]; +ld.shared.u32 r1883, [r2954+972]; +ld.shared.u32 r1969, [r2954+1944]; +ld.shared.u32 r1794, [r2954+2916]; +ld.shared.u32 r1880, [r2954+3888]; +ld.shared.u32 r1966, [r2954+4860]; +ld.shared.u32 r1795, [r2954+5832]; +ld.shared.u32 r1881, [r2954+6804]; 
+ld.shared.u32 r1967, [r2954+7776]; +barrier.sync 0; +st.shared.u32 [r2960], r1245; +st.shared.u32 [r2960+36], r1511; +st.shared.u32 [r2960+72], r1548; +st.shared.u32 [r2960+108], r1585; +st.shared.u32 [r2960+144], r1622; +st.shared.u32 [r2960+180], r1659; +st.shared.u32 [r2960+216], r1696; +st.shared.u32 [r2960+252], r1733; +st.shared.u32 [r2960+288], r1770; +barrier.sync 0; +ld.shared.u32 r1803, [r2954]; +ld.shared.u32 r1889, [r2954+972]; +ld.shared.u32 r1975, [r2954+1944]; +ld.shared.u32 r1800, [r2954+2916]; +ld.shared.u32 r1886, [r2954+3888]; +ld.shared.u32 r1972, [r2954+4860]; +ld.shared.u32 r1801, [r2954+5832]; +ld.shared.u32 r1887, [r2954+6804]; +ld.shared.u32 r1973, [r2954+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ 
+sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ 
+fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2130, {low, high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 r2134, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 r2140, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ +sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 r2176, r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 r2185, r1802, r2182; +} +{ +sub.f16x2 r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 r2220, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 r2226, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, 
r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 r2262, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ +sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 r2280, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 r2303, r2103, r2119; +} +{ +add.f16x2 r2306, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 r2312, r1874, r2309; +} +{ +add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 r2348, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 r2384, r2375, r2381; +} +mul.wide.u32 rd6, r2950, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r2961, rd7; 
+mul.lo.s32 r2962, r2961, 81; +sub.s32 r2963, r2950, r2962; +shl.b32 r2964, r2963, 2; +add.s32 r2965, r2951, r2964; +cvt.rn.f32.u32 f247, r2961; +mul.f32 f248, f247, 0f3E6E4BAE; +cos.approx.f32 f209, f248; +sin.approx.f32 f249, f248; +neg.f32 f210, f249; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2387, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2390, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2392, {high, high}; +} +{ +mul.f16x2 r2394, r2226, r2392; +} +{ +fma.rn.f16x2 r2397, r2220, r2390, r2394; +} +{ +mul.f16x2 r2401, r2220, r2392; +} +{ +neg.f16x2 r2404, r2401; +} +{ +fma.rn.f16x2 r2406, r2226, r2390, r2404; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2410, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2412, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2414, {low, high}; +} +{ +mul.f16x2 r2415, r2412, r2414; +} +{ +mul.f16x2 r2418, r2387, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2421, {high, low}; +} +{ +fma.rn.f16x2 r2423, r2415, r2421, r2418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2427, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2429, {high, high}; +} +{ +mul.f16x2 r2431, r2312, r2429; +} +{ +fma.rn.f16x2 r2434, r2306, r2427, r2431; +} +{ +mul.f16x2 r2438, r2306, r2429; +} +{ +neg.f16x2 r2441, r2438; +} +{ +fma.rn.f16x2 r2443, r2312, r2427, r2441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2447, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2449, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2451, {low, high}; +} +{ +mul.f16x2 r2452, r2449, r2451; +} +{ +mul.f16x2 r2455, r2423, r2447; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2423; +mov.b32 r2458, {high, low}; +} +{ +fma.rn.f16x2 r2460, r2452, r2458, r2455; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2464, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2466, {high, high}; +} +{ +mul.f16x2 r2468, r2194, r2466; +} +{ +fma.rn.f16x2 r2471, r2158, r2464, r2468; +} +{ +mul.f16x2 r2475, r2158, r2466; +} +{ +neg.f16x2 r2478, r2475; +} +{ +fma.rn.f16x2 r2480, r2194, r2464, r2478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2484, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2486, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2488, {low, high}; +} +{ +mul.f16x2 r2489, r2486, r2488; +} +{ +mul.f16x2 r2492, r2460, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2495, {high, low}; +} +{ +fma.rn.f16x2 r2497, r2489, r2495, r2492; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2501, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2503, {high, high}; +} +{ +mul.f16x2 r2505, r2280, r2503; +} +{ +fma.rn.f16x2 r2508, r2244, r2501, r2505; +} +{ +mul.f16x2 r2512, r2244, r2503; +} +{ +neg.f16x2 r2515, r2512; +} +{ +fma.rn.f16x2 r2517, r2280, r2501, r2515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2521, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2523, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2525, {low, high}; +} +{ +mul.f16x2 r2526, r2523, r2525; +} +{ +mul.f16x2 r2529, r2497, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2532, {high, low}; +} +{ +fma.rn.f16x2 r2534, r2526, r2532, r2529; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2538, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 
r2540, {high, high}; +} +{ +mul.f16x2 r2542, r2366, r2540; +} +{ +fma.rn.f16x2 r2545, r2330, r2538, r2542; +} +{ +mul.f16x2 r2549, r2330, r2540; +} +{ +neg.f16x2 r2552, r2549; +} +{ +fma.rn.f16x2 r2554, r2366, r2538, r2552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2558, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2560, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2562, {low, high}; +} +{ +mul.f16x2 r2563, r2560, r2562; +} +{ +mul.f16x2 r2566, r2534, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2569, {high, low}; +} +{ +fma.rn.f16x2 r2571, r2563, r2569, r2566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2575, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2577, {high, high}; +} +{ +mul.f16x2 r2579, r2212, r2577; +} +{ +fma.rn.f16x2 r2582, r2176, r2575, r2579; +} +{ +mul.f16x2 r2586, r2176, r2577; +} +{ +neg.f16x2 r2589, r2586; +} +{ +fma.rn.f16x2 r2591, r2212, r2575, r2589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2595, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2597, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2599, {low, high}; +} +{ +mul.f16x2 r2600, r2597, r2599; +} +{ +mul.f16x2 r2603, r2571, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2606, {high, low}; +} +{ +fma.rn.f16x2 r2608, r2600, r2606, r2603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2614, {high, high}; +} +{ +mul.f16x2 r2616, r2298, r2614; +} +{ +fma.rn.f16x2 r2619, r2262, r2612, r2616; +} +{ +mul.f16x2 r2623, r2262, r2614; +} +{ +neg.f16x2 r2626, r2623; +} +{ +fma.rn.f16x2 r2628, r2298, r2612, r2626; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2387; +mov.b32 r2632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2634, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2636, {low, high}; +} +{ +mul.f16x2 r2637, r2634, r2636; +} +{ +mul.f16x2 r2640, r2608, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2643, {high, low}; +} +{ +fma.rn.f16x2 r2645, r2637, r2643, r2640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2651, {high, high}; +} +{ +mul.f16x2 r2653, r2384, r2651; +} +{ +fma.rn.f16x2 r2656, r2348, r2649, r2653; +} +{ +mul.f16x2 r2660, r2348, r2651; +} +{ +neg.f16x2 r2663, r2660; +} +{ +fma.rn.f16x2 r2665, r2384, r2649, r2663; +} +barrier.sync 0; +mad.lo.s32 r2966, r2961, 2916, r2965; +st.shared.u32 [r2966], r2134; +st.shared.u32 [r2966+324], r2397; +st.shared.u32 [r2966+648], r2434; +st.shared.u32 [r2966+972], r2471; +st.shared.u32 [r2966+1296], r2508; +st.shared.u32 [r2966+1620], r2545; +st.shared.u32 [r2966+1944], r2582; +st.shared.u32 [r2966+2268], r2619; +st.shared.u32 [r2966+2592], r2656; +barrier.sync 0; +ld.shared.u32 r2692, [r2954]; +ld.shared.u32 r2778, [r2954+972]; +ld.shared.u32 r2864, [r2954+1944]; +ld.shared.u32 r2689, [r2954+2916]; +ld.shared.u32 r2775, [r2954+3888]; +ld.shared.u32 r2861, [r2954+4860]; +ld.shared.u32 r2690, [r2954+5832]; +ld.shared.u32 r2776, [r2954+6804]; +ld.shared.u32 r2862, [r2954+7776]; +barrier.sync 0; +st.shared.u32 [r2966], r2140; +st.shared.u32 [r2966+324], r2406; +st.shared.u32 [r2966+648], r2443; +st.shared.u32 [r2966+972], r2480; +st.shared.u32 [r2966+1296], r2517; +st.shared.u32 [r2966+1620], r2554; +st.shared.u32 [r2966+1944], r2591; +st.shared.u32 [r2966+2268], r2628; +st.shared.u32 [r2966+2592], r2665; +barrier.sync 0; +ld.shared.u32 r2698, [r2954]; +ld.shared.u32 r2784, [r2954+972]; +ld.shared.u32 r2870, [r2954+1944]; 
+ld.shared.u32 r2695, [r2954+2916]; +ld.shared.u32 r2781, [r2954+3888]; +ld.shared.u32 r2867, [r2954+4860]; +ld.shared.u32 r2696, [r2954+5832]; +ld.shared.u32 r2782, [r2954+6804]; +ld.shared.u32 r2868, [r2954+7776]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2686, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2687, {low, high}; +} +{ +add.f16x2 r2688, r2689, r2690; +} +{ +add.f16x2 %0, r2692, r2688; +} +{ +add.f16x2 r2694, r2695, r2696; +} +{ +add.f16x2 %1, r2698, r2694; +} +{ +add.f16x2 r2700, r2689, r2690; +} +{ +mul.f16x2 r2703, r2700, r2686; +} +{ +add.f16x2 r2706, r2692, r2703; +} +{ +sub.f16x2 r2709, r2695, r2696; +} +{ +mul.f16x2 r2712, r2709, r2687; +} +{ +add.f16x2 %6, r2706, r2712; +} +{ +add.f16x2 r2718, r2689, r2690; +} +{ +mul.f16x2 r2721, r2718, r2686; +} +{ +add.f16x2 r2724, r2692, r2721; +} +{ +sub.f16x2 r2727, r2695, r2696; +} +{ +mul.f16x2 r2730, r2727, r2687; +} +{ +sub.f16x2 %12, r2724, r2730; +} +{ +add.f16x2 r2736, r2695, r2696; +} +{ +mul.f16x2 r2739, r2736, r2686; +} +{ +add.f16x2 r2742, r2698, r2739; +} +{ +sub.f16x2 r2745, r2689, r2690; +} +{ +mul.f16x2 r2748, r2745, r2687; +} +{ +sub.f16x2 %7, r2742, r2748; +} +{ +add.f16x2 r2754, r2695, r2696; +} +{ +mul.f16x2 r2757, r2754, r2686; +} +{ +add.f16x2 r2760, r2698, r2757; +} +{ +sub.f16x2 r2763, r2689, r2690; +} +{ +mul.f16x2 r2766, r2763, r2687; +} +{ +add.f16x2 %13, r2760, r2766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2773, {low, high}; +} +{ +add.f16x2 r2774, r2775, r2776; +} +{ +add.f16x2 %2, r2778, r2774; +} +{ +add.f16x2 r2780, r2781, r2782; +} +{ +add.f16x2 %3, r2784, r2780; +} +{ +add.f16x2 r2786, r2775, r2776; +} +{ +mul.f16x2 r2789, r2786, r2772; +} +{ +add.f16x2 r2792, r2778, r2789; +} +{ +sub.f16x2 r2795, r2781, 
r2782; +} +{ +mul.f16x2 r2798, r2795, r2773; +} +{ +add.f16x2 %8, r2792, r2798; +} +{ +add.f16x2 r2804, r2775, r2776; +} +{ +mul.f16x2 r2807, r2804, r2772; +} +{ +add.f16x2 r2810, r2778, r2807; +} +{ +sub.f16x2 r2813, r2781, r2782; +} +{ +mul.f16x2 r2816, r2813, r2773; +} +{ +sub.f16x2 %14, r2810, r2816; +} +{ +add.f16x2 r2822, r2781, r2782; +} +{ +mul.f16x2 r2825, r2822, r2772; +} +{ +add.f16x2 r2828, r2784, r2825; +} +{ +sub.f16x2 r2831, r2775, r2776; +} +{ +mul.f16x2 r2834, r2831, r2773; +} +{ +sub.f16x2 %9, r2828, r2834; +} +{ +add.f16x2 r2840, r2781, r2782; +} +{ +mul.f16x2 r2843, r2840, r2772; +} +{ +add.f16x2 r2846, r2784, r2843; +} +{ +sub.f16x2 r2849, r2775, r2776; +} +{ +mul.f16x2 r2852, r2849, r2773; +} +{ +add.f16x2 %15, r2846, r2852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r2858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f240; +cvt.rn.f16.f32 high, f240; +mov.b32 r2859, {low, high}; +} +{ +add.f16x2 r2860, r2861, r2862; +} +{ +add.f16x2 %4, r2864, r2860; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +add.f16x2 %5, r2870, r2866; +} +{ +add.f16x2 r2872, r2861, r2862; +} +{ +mul.f16x2 r2875, r2872, r2858; +} +{ +add.f16x2 r2878, r2864, r2875; +} +{ +sub.f16x2 r2881, r2867, r2868; +} +{ +mul.f16x2 r2884, r2881, r2859; +} +{ +add.f16x2 %10, r2878, r2884; +} +{ +add.f16x2 r2890, r2861, r2862; +} +{ +mul.f16x2 r2893, r2890, r2858; +} +{ +add.f16x2 r2896, r2864, r2893; +} +{ +sub.f16x2 r2899, r2867, r2868; +} +{ +mul.f16x2 r2902, r2899, r2859; +} +{ +sub.f16x2 %16, r2896, r2902; +} +{ +add.f16x2 r2908, r2867, r2868; +} +{ +mul.f16x2 r2911, r2908, r2858; +} +{ +add.f16x2 r2914, r2870, r2911; +} +{ +sub.f16x2 r2917, r2861, r2862; +} +{ +mul.f16x2 r2920, r2917, r2859; +} +{ +sub.f16x2 %11, r2914, r2920; +} +{ +add.f16x2 r2926, r2867, r2868; +} +{ +mul.f16x2 r2929, r2926, r2858; +} +{ +add.f16x2 r2932, r2870, r2929; +} +{ +sub.f16x2 r2935, r2861, r2862; +} +{ +mul.f16x2 r2938, r2935, r2859; +} 
+{ +add.f16x2 %17, r2932, r2938; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1095, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<771>; +.reg .b32 r<8520>; +.reg .b64 rd<6>; +mov.u32 r8446, %54; +mov.u32 r8519, %tid.y; +mad.lo.s32 r8447, r8519, 8748, r8446; +mov.u32 r8448, %tid.x; +mov.f32 f762, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1, {low, high}; +} +mov.f32 f764, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ 
+add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r259, {low, high}; +} +mov.f32 f544, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r260, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r261, {low, high}; +} +mov.f32 f556, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r262, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r265, {low, high}; +} +mov.f32 f580, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ 
+mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, 
f764; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} 
+{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 
r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, 
r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; 
+} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 
r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 
r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ 
+mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, 
r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1794, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} +mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, 
r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, 
r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 
r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ 
+mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ 
+mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} 
+{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ 
+add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r8448, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r8449, rd3; +mul.lo.s32 r8450, r8449, 81; 
+sub.s32 r8451, r8448, r8450; +mad.lo.s32 r8452, r8449, 8748, r8447; +cvt.rn.f32.u32 f765, r8451; +mul.f32 f766, f765, 0f3B3C4870; +cos.approx.f32 f309, f766; +sin.approx.f32 f767, f766; +neg.f32 f310, f767; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; 
+mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ 
+mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; 
+mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 
r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ 
+fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ 
+fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r8453, r8451, 108, r8452; +st.shared.u32 [r8453], r2102; +st.shared.u32 [r8453+4], r2881; +st.shared.u32 [r8453+8], r2918; +st.shared.u32 [r8453+12], r2955; +st.shared.u32 [r8453+16], r2992; +st.shared.u32 [r8453+20], r3029; +st.shared.u32 [r8453+24], r3066; +st.shared.u32 [r8453+28], r3103; +st.shared.u32 [r8453+32], r3140; +st.shared.u32 [r8453+36], r3177; +st.shared.u32 [r8453+40], r3214; +st.shared.u32 [r8453+44], r3251; +st.shared.u32 [r8453+48], r3288; +st.shared.u32 [r8453+52], r3325; +st.shared.u32 [r8453+56], r3362; +st.shared.u32 [r8453+60], r3399; +st.shared.u32 [r8453+64], r3436; +st.shared.u32 [r8453+68], r3473; +st.shared.u32 [r8453+72], r3510; +st.shared.u32 [r8453+76], r3547; +st.shared.u32 [r8453+80], r3584; +st.shared.u32 [r8453+84], r3621; +st.shared.u32 [r8453+88], r3658; +st.shared.u32 [r8453+92], r3695; +st.shared.u32 [r8453+96], r3732; +st.shared.u32 [r8453+100], r3769; +st.shared.u32 [r8453+104], r3806; +barrier.sync 0; +mad.lo.s32 r8454, r8451, -104, r8453; +ld.shared.u32 r3842, [r8454]; +ld.shared.u32 r4438, [r8454+324]; +ld.shared.u32 r5034, [r8454+648]; +ld.shared.u32 r3928, [r8454+972]; +ld.shared.u32 r4524, [r8454+1296]; +ld.shared.u32 r5120, [r8454+1620]; +ld.shared.u32 r4014, [r8454+1944]; +ld.shared.u32 r4610, [r8454+2268]; +ld.shared.u32 r5206, [r8454+2592]; +ld.shared.u32 r3839, [r8454+2916]; +ld.shared.u32 r4435, [r8454+3240]; +ld.shared.u32 r5031, [r8454+3564]; +ld.shared.u32 r3925, [r8454+3888]; 
+ld.shared.u32 r4521, [r8454+4212]; +ld.shared.u32 r5117, [r8454+4536]; +ld.shared.u32 r4011, [r8454+4860]; +ld.shared.u32 r4607, [r8454+5184]; +ld.shared.u32 r5203, [r8454+5508]; +ld.shared.u32 r3840, [r8454+5832]; +ld.shared.u32 r4436, [r8454+6156]; +ld.shared.u32 r5032, [r8454+6480]; +ld.shared.u32 r3926, [r8454+6804]; +ld.shared.u32 r4522, [r8454+7128]; +ld.shared.u32 r5118, [r8454+7452]; +ld.shared.u32 r4012, [r8454+7776]; +ld.shared.u32 r4608, [r8454+8100]; +ld.shared.u32 r5204, [r8454+8424]; +barrier.sync 0; +st.shared.u32 [r8453], r2108; +st.shared.u32 [r8453+4], r2890; +st.shared.u32 [r8453+8], r2927; +st.shared.u32 [r8453+12], r2964; +st.shared.u32 [r8453+16], r3001; +st.shared.u32 [r8453+20], r3038; +st.shared.u32 [r8453+24], r3075; +st.shared.u32 [r8453+28], r3112; +st.shared.u32 [r8453+32], r3149; +st.shared.u32 [r8453+36], r3186; +st.shared.u32 [r8453+40], r3223; +st.shared.u32 [r8453+44], r3260; +st.shared.u32 [r8453+48], r3297; +st.shared.u32 [r8453+52], r3334; +st.shared.u32 [r8453+56], r3371; +st.shared.u32 [r8453+60], r3408; +st.shared.u32 [r8453+64], r3445; +st.shared.u32 [r8453+68], r3482; +st.shared.u32 [r8453+72], r3519; +st.shared.u32 [r8453+76], r3556; +st.shared.u32 [r8453+80], r3593; +st.shared.u32 [r8453+84], r3630; +st.shared.u32 [r8453+88], r3667; +st.shared.u32 [r8453+92], r3704; +st.shared.u32 [r8453+96], r3741; +st.shared.u32 [r8453+100], r3778; +st.shared.u32 [r8453+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r8454]; +ld.shared.u32 r4444, [r8454+324]; +ld.shared.u32 r5040, [r8454+648]; +ld.shared.u32 r3934, [r8454+972]; +ld.shared.u32 r4530, [r8454+1296]; +ld.shared.u32 r5126, [r8454+1620]; +ld.shared.u32 r4020, [r8454+1944]; +ld.shared.u32 r4616, [r8454+2268]; +ld.shared.u32 r5212, [r8454+2592]; +ld.shared.u32 r3845, [r8454+2916]; +ld.shared.u32 r4441, [r8454+3240]; +ld.shared.u32 r5037, [r8454+3564]; +ld.shared.u32 r3931, [r8454+3888]; +ld.shared.u32 r4527, [r8454+4212]; +ld.shared.u32 r5123, [r8454+4536]; +ld.shared.u32 
r4017, [r8454+4860]; +ld.shared.u32 r4613, [r8454+5184]; +ld.shared.u32 r5209, [r8454+5508]; +ld.shared.u32 r3846, [r8454+5832]; +ld.shared.u32 r4442, [r8454+6156]; +ld.shared.u32 r5038, [r8454+6480]; +ld.shared.u32 r3932, [r8454+6804]; +ld.shared.u32 r4528, [r8454+7128]; +ld.shared.u32 r5124, [r8454+7452]; +ld.shared.u32 r4018, [r8454+7776]; +ld.shared.u32 r4614, [r8454+8100]; +ld.shared.u32 r5210, [r8454+8424]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, 
r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 
r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 
r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ 
+sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; 
+} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, 
r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; 
+cvt.rn.f16.f32 high, f578; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ 
+sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; 
+} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, 
r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 
r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 
r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; 
+} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, 
r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 
r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; 
+cvt.rn.f16.f32 high, f762; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ 
+mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 
high, f764; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, 
r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 
r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ 
+sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r8451, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r8455, rd5; +sub.s32 r8456, r8451, r8455; +shr.u32 r8457, r8456, 1; +add.s32 r8458, r8457, r8455; +shr.u32 r8459, r8458, 4; +mul.lo.s32 r8460, r8459, 27; +sub.s32 r8461, r8451, r8460; +shl.b32 r8462, r8461, 2; +add.s32 r8463, r8452, r8462; +cvt.rn.f32.u32 f768, r8459; +mul.f32 f769, f768, 0f3D9EDD1F; +cos.approx.f32 f673, f769; +sin.approx.f32 f770, f769; +neg.f32 f674, 
f770; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; 
+mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, r6857, r6871; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6955, {low, high}; +} +{ 
+mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; 
+mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ +mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ +mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ +mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 
r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ +fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7212, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7295, {high, low}; +} +{ 
+fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ +mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ +mul.f16x2 r7379, r6101, r7377; +} +{ 
+fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ +fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7469, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ +fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ +fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +barrier.sync 0; +mad.lo.s32 r8464, r8459, 2916, r8463; +st.shared.u32 [r8464], r5937; +st.shared.u32 [r8464+108], r6716; +st.shared.u32 [r8464+216], r6753; +st.shared.u32 [r8464+324], r6790; +st.shared.u32 [r8464+432], r6827; +st.shared.u32 [r8464+540], r6864; +st.shared.u32 [r8464+648], r6901; +st.shared.u32 [r8464+756], r6938; +st.shared.u32 [r8464+864], r6975; +st.shared.u32 [r8464+972], r7012; +st.shared.u32 [r8464+1080], r7049; +st.shared.u32 [r8464+1188], r7086; +st.shared.u32 [r8464+1296], r7123; +st.shared.u32 [r8464+1404], r7160; +st.shared.u32 [r8464+1512], r7197; +st.shared.u32 [r8464+1620], r7234; +st.shared.u32 [r8464+1728], r7271; +st.shared.u32 [r8464+1836], r7308; +st.shared.u32 [r8464+1944], r7345; +st.shared.u32 [r8464+2052], r7382; +st.shared.u32 [r8464+2160], r7419; +st.shared.u32 [r8464+2268], r7456; +st.shared.u32 [r8464+2376], r7493; +st.shared.u32 [r8464+2484], r7530; +st.shared.u32 [r8464+2592], r7567; +st.shared.u32 [r8464+2700], r7604; +st.shared.u32 [r8464+2808], r7641; +barrier.sync 0; +ld.shared.u32 r7677, [r8454]; +ld.shared.u32 r7763, [r8454+324]; +ld.shared.u32 r7849, [r8454+648]; +ld.shared.u32 r7935, [r8454+972]; +ld.shared.u32 r8021, [r8454+1296]; +ld.shared.u32 r8107, [r8454+1620]; +ld.shared.u32 r8193, [r8454+1944]; +ld.shared.u32 r8279, [r8454+2268]; +ld.shared.u32 r8365, [r8454+2592]; +ld.shared.u32 r7674, [r8454+2916]; +ld.shared.u32 r7760, [r8454+3240]; +ld.shared.u32 r7846, [r8454+3564]; +ld.shared.u32 r7932, [r8454+3888]; +ld.shared.u32 r8018, [r8454+4212]; +ld.shared.u32 r8104, [r8454+4536]; +ld.shared.u32 r8190, [r8454+4860]; +ld.shared.u32 r8276, [r8454+5184]; +ld.shared.u32 r8362, [r8454+5508]; +ld.shared.u32 r7675, [r8454+5832]; +ld.shared.u32 r7761, [r8454+6156]; 
+ld.shared.u32 r7847, [r8454+6480]; +ld.shared.u32 r7933, [r8454+6804]; +ld.shared.u32 r8019, [r8454+7128]; +ld.shared.u32 r8105, [r8454+7452]; +ld.shared.u32 r8191, [r8454+7776]; +ld.shared.u32 r8277, [r8454+8100]; +ld.shared.u32 r8363, [r8454+8424]; +barrier.sync 0; +st.shared.u32 [r8464], r5943; +st.shared.u32 [r8464+108], r6725; +st.shared.u32 [r8464+216], r6762; +st.shared.u32 [r8464+324], r6799; +st.shared.u32 [r8464+432], r6836; +st.shared.u32 [r8464+540], r6873; +st.shared.u32 [r8464+648], r6910; +st.shared.u32 [r8464+756], r6947; +st.shared.u32 [r8464+864], r6984; +st.shared.u32 [r8464+972], r7021; +st.shared.u32 [r8464+1080], r7058; +st.shared.u32 [r8464+1188], r7095; +st.shared.u32 [r8464+1296], r7132; +st.shared.u32 [r8464+1404], r7169; +st.shared.u32 [r8464+1512], r7206; +st.shared.u32 [r8464+1620], r7243; +st.shared.u32 [r8464+1728], r7280; +st.shared.u32 [r8464+1836], r7317; +st.shared.u32 [r8464+1944], r7354; +st.shared.u32 [r8464+2052], r7391; +st.shared.u32 [r8464+2160], r7428; +st.shared.u32 [r8464+2268], r7465; +st.shared.u32 [r8464+2376], r7502; +st.shared.u32 [r8464+2484], r7539; +st.shared.u32 [r8464+2592], r7576; +st.shared.u32 [r8464+2700], r7613; +st.shared.u32 [r8464+2808], r7650; +barrier.sync 0; +ld.shared.u32 r7683, [r8454]; +ld.shared.u32 r7769, [r8454+324]; +ld.shared.u32 r7855, [r8454+648]; +ld.shared.u32 r7941, [r8454+972]; +ld.shared.u32 r8027, [r8454+1296]; +ld.shared.u32 r8113, [r8454+1620]; +ld.shared.u32 r8199, [r8454+1944]; +ld.shared.u32 r8285, [r8454+2268]; +ld.shared.u32 r8371, [r8454+2592]; +ld.shared.u32 r7680, [r8454+2916]; +ld.shared.u32 r7766, [r8454+3240]; +ld.shared.u32 r7852, [r8454+3564]; +ld.shared.u32 r7938, [r8454+3888]; +ld.shared.u32 r8024, [r8454+4212]; +ld.shared.u32 r8110, [r8454+4536]; +ld.shared.u32 r8196, [r8454+4860]; +ld.shared.u32 r8282, [r8454+5184]; +ld.shared.u32 r8368, [r8454+5508]; +ld.shared.u32 r7681, [r8454+5832]; +ld.shared.u32 r7767, [r8454+6156]; +ld.shared.u32 r7853, [r8454+6480]; 
+ld.shared.u32 r7939, [r8454+6804]; +ld.shared.u32 r8025, [r8454+7128]; +ld.shared.u32 r8111, [r8454+7452]; +ld.shared.u32 r8197, [r8454+7776]; +ld.shared.u32 r8283, [r8454+8100]; +ld.shared.u32 r8369, [r8454+8424]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 %0, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 %1, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 %18, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 %36, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 %19, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ +mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 %37, r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7758, {low, high}; +} +{ +add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 %2, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 %3, r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, 
r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 %20, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 %38, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 %21, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 %39, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 %4, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 %5, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 %22, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; +} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 %40, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 %23, r7899, r7905; +} +{ +add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} +{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, 
r7844; +} +{ +add.f16x2 %41, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r7930, {low, high}; +} +{ +add.f16x2 r7931, r7932, r7933; +} +{ +add.f16x2 %6, r7935, r7931; +} +{ +add.f16x2 r7937, r7938, r7939; +} +{ +add.f16x2 %7, r7941, r7937; +} +{ +add.f16x2 r7943, r7932, r7933; +} +{ +mul.f16x2 r7946, r7943, r7929; +} +{ +add.f16x2 r7949, r7935, r7946; +} +{ +sub.f16x2 r7952, r7938, r7939; +} +{ +mul.f16x2 r7955, r7952, r7930; +} +{ +add.f16x2 %24, r7949, r7955; +} +{ +add.f16x2 r7961, r7932, r7933; +} +{ +mul.f16x2 r7964, r7961, r7929; +} +{ +add.f16x2 r7967, r7935, r7964; +} +{ +sub.f16x2 r7970, r7938, r7939; +} +{ +mul.f16x2 r7973, r7970, r7930; +} +{ +sub.f16x2 %42, r7967, r7973; +} +{ +add.f16x2 r7979, r7938, r7939; +} +{ +mul.f16x2 r7982, r7979, r7929; +} +{ +add.f16x2 r7985, r7941, r7982; +} +{ +sub.f16x2 r7988, r7932, r7933; +} +{ +mul.f16x2 r7991, r7988, r7930; +} +{ +sub.f16x2 %25, r7985, r7991; +} +{ +add.f16x2 r7997, r7938, r7939; +} +{ +mul.f16x2 r8000, r7997, r7929; +} +{ +add.f16x2 r8003, r7941, r8000; +} +{ +sub.f16x2 r8006, r7932, r7933; +} +{ +mul.f16x2 r8009, r8006, r7930; +} +{ +add.f16x2 %43, r8003, r8009; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8015, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8016, {low, high}; +} +{ +add.f16x2 r8017, r8018, r8019; +} +{ +add.f16x2 %8, r8021, r8017; +} +{ +add.f16x2 r8023, r8024, r8025; +} +{ +add.f16x2 %9, r8027, r8023; +} +{ +add.f16x2 r8029, r8018, r8019; +} +{ +mul.f16x2 r8032, r8029, r8015; +} +{ +add.f16x2 r8035, r8021, r8032; +} +{ +sub.f16x2 r8038, r8024, r8025; +} +{ +mul.f16x2 r8041, r8038, r8016; +} +{ +add.f16x2 %26, r8035, r8041; +} +{ +add.f16x2 r8047, r8018, r8019; +} +{ +mul.f16x2 r8050, r8047, r8015; +} +{ 
+add.f16x2 r8053, r8021, r8050; +} +{ +sub.f16x2 r8056, r8024, r8025; +} +{ +mul.f16x2 r8059, r8056, r8016; +} +{ +sub.f16x2 %44, r8053, r8059; +} +{ +add.f16x2 r8065, r8024, r8025; +} +{ +mul.f16x2 r8068, r8065, r8015; +} +{ +add.f16x2 r8071, r8027, r8068; +} +{ +sub.f16x2 r8074, r8018, r8019; +} +{ +mul.f16x2 r8077, r8074, r8016; +} +{ +sub.f16x2 %27, r8071, r8077; +} +{ +add.f16x2 r8083, r8024, r8025; +} +{ +mul.f16x2 r8086, r8083, r8015; +} +{ +add.f16x2 r8089, r8027, r8086; +} +{ +sub.f16x2 r8092, r8018, r8019; +} +{ +mul.f16x2 r8095, r8092, r8016; +} +{ +add.f16x2 %45, r8089, r8095; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8102, {low, high}; +} +{ +add.f16x2 r8103, r8104, r8105; +} +{ +add.f16x2 %10, r8107, r8103; +} +{ +add.f16x2 r8109, r8110, r8111; +} +{ +add.f16x2 %11, r8113, r8109; +} +{ +add.f16x2 r8115, r8104, r8105; +} +{ +mul.f16x2 r8118, r8115, r8101; +} +{ +add.f16x2 r8121, r8107, r8118; +} +{ +sub.f16x2 r8124, r8110, r8111; +} +{ +mul.f16x2 r8127, r8124, r8102; +} +{ +add.f16x2 %28, r8121, r8127; +} +{ +add.f16x2 r8133, r8104, r8105; +} +{ +mul.f16x2 r8136, r8133, r8101; +} +{ +add.f16x2 r8139, r8107, r8136; +} +{ +sub.f16x2 r8142, r8110, r8111; +} +{ +mul.f16x2 r8145, r8142, r8102; +} +{ +sub.f16x2 %46, r8139, r8145; +} +{ +add.f16x2 r8151, r8110, r8111; +} +{ +mul.f16x2 r8154, r8151, r8101; +} +{ +add.f16x2 r8157, r8113, r8154; +} +{ +sub.f16x2 r8160, r8104, r8105; +} +{ +mul.f16x2 r8163, r8160, r8102; +} +{ +sub.f16x2 %29, r8157, r8163; +} +{ +add.f16x2 r8169, r8110, r8111; +} +{ +mul.f16x2 r8172, r8169, r8101; +} +{ +add.f16x2 r8175, r8113, r8172; +} +{ +sub.f16x2 r8178, r8104, r8105; +} +{ +mul.f16x2 r8181, r8178, r8102; +} +{ +add.f16x2 %47, r8175, r8181; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8187, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8188, {low, high}; +} +{ +add.f16x2 r8189, r8190, r8191; +} +{ +add.f16x2 %12, r8193, r8189; +} +{ +add.f16x2 r8195, r8196, r8197; +} +{ +add.f16x2 %13, r8199, r8195; +} +{ +add.f16x2 r8201, r8190, r8191; +} +{ +mul.f16x2 r8204, r8201, r8187; +} +{ +add.f16x2 r8207, r8193, r8204; +} +{ +sub.f16x2 r8210, r8196, r8197; +} +{ +mul.f16x2 r8213, r8210, r8188; +} +{ +add.f16x2 %30, r8207, r8213; +} +{ +add.f16x2 r8219, r8190, r8191; +} +{ +mul.f16x2 r8222, r8219, r8187; +} +{ +add.f16x2 r8225, r8193, r8222; +} +{ +sub.f16x2 r8228, r8196, r8197; +} +{ +mul.f16x2 r8231, r8228, r8188; +} +{ +sub.f16x2 %48, r8225, r8231; +} +{ +add.f16x2 r8237, r8196, r8197; +} +{ +mul.f16x2 r8240, r8237, r8187; +} +{ +add.f16x2 r8243, r8199, r8240; +} +{ +sub.f16x2 r8246, r8190, r8191; +} +{ +mul.f16x2 r8249, r8246, r8188; +} +{ +sub.f16x2 %31, r8243, r8249; +} +{ +add.f16x2 r8255, r8196, r8197; +} +{ +mul.f16x2 r8258, r8255, r8187; +} +{ +add.f16x2 r8261, r8199, r8258; +} +{ +sub.f16x2 r8264, r8190, r8191; +} +{ +mul.f16x2 r8267, r8264, r8188; +} +{ +add.f16x2 %49, r8261, r8267; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8273, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8274, {low, high}; +} +{ +add.f16x2 r8275, r8276, r8277; +} +{ +add.f16x2 %14, r8279, r8275; +} +{ +add.f16x2 r8281, r8282, r8283; +} +{ +add.f16x2 %15, r8285, r8281; +} +{ +add.f16x2 r8287, r8276, r8277; +} +{ +mul.f16x2 r8290, r8287, r8273; +} +{ +add.f16x2 r8293, r8279, r8290; +} +{ +sub.f16x2 r8296, r8282, r8283; +} +{ +mul.f16x2 r8299, r8296, r8274; +} +{ +add.f16x2 %32, r8293, r8299; +} +{ +add.f16x2 r8305, r8276, r8277; +} +{ +mul.f16x2 r8308, r8305, r8273; +} +{ +add.f16x2 r8311, r8279, r8308; +} +{ +sub.f16x2 r8314, r8282, r8283; +} +{ +mul.f16x2 r8317, r8314, r8274; +} +{ +sub.f16x2 %50, r8311, r8317; +} +{ +add.f16x2 r8323, 
r8282, r8283; +} +{ +mul.f16x2 r8326, r8323, r8273; +} +{ +add.f16x2 r8329, r8285, r8326; +} +{ +sub.f16x2 r8332, r8276, r8277; +} +{ +mul.f16x2 r8335, r8332, r8274; +} +{ +sub.f16x2 %33, r8329, r8335; +} +{ +add.f16x2 r8341, r8282, r8283; +} +{ +mul.f16x2 r8344, r8341, r8273; +} +{ +add.f16x2 r8347, r8285, r8344; +} +{ +sub.f16x2 r8350, r8276, r8277; +} +{ +mul.f16x2 r8353, r8350, r8274; +} +{ +add.f16x2 %51, r8347, r8353; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f762; +cvt.rn.f16.f32 high, f762; +mov.b32 r8359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f764; +cvt.rn.f16.f32 high, f764; +mov.b32 r8360, {low, high}; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 %16, r8365, r8361; +} +{ +add.f16x2 r8367, r8368, r8369; +} +{ +add.f16x2 %17, r8371, r8367; +} +{ +add.f16x2 r8373, r8362, r8363; +} +{ +mul.f16x2 r8376, r8373, r8359; +} +{ +add.f16x2 r8379, r8365, r8376; +} +{ +sub.f16x2 r8382, r8368, r8369; +} +{ +mul.f16x2 r8385, r8382, r8360; +} +{ +add.f16x2 %34, r8379, r8385; +} +{ +add.f16x2 r8391, r8362, r8363; +} +{ +mul.f16x2 r8394, r8391, r8359; +} +{ +add.f16x2 r8397, r8365, r8394; +} +{ +sub.f16x2 r8400, r8368, r8369; +} +{ +mul.f16x2 r8403, r8400, r8360; +} +{ +sub.f16x2 %52, r8397, r8403; +} +{ +add.f16x2 r8409, r8368, r8369; +} +{ +mul.f16x2 r8412, r8409, r8359; +} +{ +add.f16x2 r8415, r8371, r8412; +} +{ +sub.f16x2 r8418, r8362, r8363; +} +{ +mul.f16x2 r8421, r8418, r8360; +} +{ +sub.f16x2 %35, r8415, r8421; +} +{ +add.f16x2 r8427, r8368, r8369; +} +{ +mul.f16x2 r8430, r8427, r8359; +} +{ +add.f16x2 r8433, r8371, r8430; +} +{ +sub.f16x2 r8436, r8362, r8363; +} +{ +mul.f16x2 r8439, r8436, r8360; +} +{ +add.f16x2 %53, r8433, r8439; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), 
"=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1096, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<95>; +.reg .b32 r<1110>; +.reg .b64 rd<14>; +mov.u32 r1065, %tid.y; +mov.u32 r1066, %6; +mad.lo.s32 r1067, r1065, 17496, r1066; +mov.u32 r1068, %tid.x; +mov.f32 f74, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1, {low, high}; +} +mov.f32 f76, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} 
+{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r1068, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r1069, rd3; +mul.lo.s32 r1070, r1069, 729; +sub.s32 r1071, r1068, r1070; +mad.lo.s32 r1072, r1069, 17496, r1067; +cvt.rn.f32.u32 f77, r1071; +mul.f32 f78, f77, 0f3B3C4870; +cos.approx.f32 f5, f78; +sin.approx.f32 f79, f78; +neg.f32 f6, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, 
r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r1073, r1071, 24, r1072; +st.shared.v2.f32 [r1073], {r6, r12}; +st.shared.v2.f32 [r1073+8], {r97, r106}; +st.shared.v2.f32 [r1073+16], {r134, r143}; +barrier.sync 0; +shl.b32 r1074, r1071, 4; +sub.s32 r1075, r1073, r1074; +ld.shared.u32 r170, [r1075]; +ld.shared.u32 r176, [r1075+4]; +ld.shared.u32 r167, [r1075+5832]; +ld.shared.u32 r173, [r1075+5836]; +ld.shared.u32 r168, [r1075+11664]; +ld.shared.u32 r174, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, 
r238, r244; +} +mul.wide.u32 rd4, r1071, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r1076, rd5; +mul.lo.s32 r1077, r1076, 3; +sub.s32 r1078, r1071, r1077; +shl.b32 r1079, r1078, 3; +add.s32 r1080, r1072, r1079; +cvt.rn.f32.u32 f80, r1076; +mul.f32 f81, f80, 0f3C0D3654; +cos.approx.f32 f17, f81; +sin.approx.f32 f82, f81; +neg.f32 f18, f82; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r1081, r1076, 72, r1080; +st.shared.u32 [r1081], r169; +st.shared.u32 [r1081+4], r175; +st.shared.u32 [r1081+24], r260; +st.shared.u32 [r1081+28], r269; +st.shared.u32 [r1081+48], r297; +st.shared.u32 [r1081+52], r306; +barrier.sync 0; +ld.shared.u32 r333, [r1075]; +ld.shared.u32 r339, [r1075+4]; +ld.shared.u32 r330, 
[r1075+5832]; +ld.shared.u32 r336, [r1075+5836]; +ld.shared.u32 r331, [r1075+11664]; +ld.shared.u32 r337, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r1071, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r1082, rd7; +mul.lo.s32 r1083, r1082, 9; +sub.s32 r1084, r1071, r1083; +shl.b32 r1085, r1084, 3; +add.s32 r1086, r1072, r1085; +cvt.rn.f32.u32 f83, r1082; +mul.f32 f84, f83, 0f3CD3D17E; +cos.approx.f32 f29, f84; +sin.approx.f32 f85, f84; +neg.f32 f30, f85; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, 
r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r1087, r1082, 216, r1086; +st.shared.u32 [r1087], r332; +st.shared.u32 [r1087+4], r338; +st.shared.u32 [r1087+72], r423; +st.shared.u32 [r1087+76], r432; +st.shared.u32 [r1087+144], r460; +st.shared.u32 [r1087+148], r469; +barrier.sync 0; +ld.shared.u32 r496, [r1075]; +ld.shared.u32 r502, [r1075+4]; +ld.shared.u32 r493, [r1075+5832]; +ld.shared.u32 r499, [r1075+5836]; +ld.shared.u32 r494, [r1075+11664]; +ld.shared.u32 r500, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ 
+add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r1071, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r1088, rd9; +sub.s32 r1089, r1071, r1088; +shr.u32 r1090, r1089, 1; +add.s32 r1091, r1090, r1088; +shr.u32 r1092, r1091, 4; +mul.lo.s32 r1093, r1092, 27; +sub.s32 r1094, r1071, r1093; +shl.b32 r1095, r1094, 3; +add.s32 r1096, r1072, r1095; +cvt.rn.f32.u32 f86, r1092; +mul.f32 f87, f86, 0f3D9EDD1F; +cos.approx.f32 f41, f87; +sin.approx.f32 f88, f87; +neg.f32 f42, f88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, 
r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ +neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r1097, r1092, 648, r1096; +st.shared.u32 [r1097], r495; +st.shared.u32 [r1097+4], r501; +st.shared.u32 [r1097+216], r586; +st.shared.u32 [r1097+220], r595; +st.shared.u32 [r1097+432], r623; +st.shared.u32 [r1097+436], r632; +barrier.sync 0; +ld.shared.u32 r659, [r1075]; +ld.shared.u32 r665, [r1075+4]; +ld.shared.u32 r656, [r1075+5832]; +ld.shared.u32 r662, [r1075+5836]; +ld.shared.u32 r657, [r1075+11664]; +ld.shared.u32 r663, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 r658, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 r664, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 r700, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 r718, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; 
+} +{ +add.f16x2 r736, r727, r733; +} +mul.wide.u32 rd10, r1071, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r1098, rd11; +mul.lo.s32 r1099, r1098, 81; +sub.s32 r1100, r1071, r1099; +shl.b32 r1101, r1100, 3; +add.s32 r1102, r1072, r1101; +cvt.rn.f32.u32 f89, r1098; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f53, f90; +sin.approx.f32 f91, f90; +neg.f32 f54, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r739, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r744, {high, high}; +} +{ +mul.f16x2 r746, r718, r744; +} +{ +fma.rn.f16x2 r749, r682, r742, r746; +} +{ +mul.f16x2 r753, r682, r744; +} +{ +neg.f16x2 r756, r753; +} +{ +fma.rn.f16x2 r758, r718, r742, r756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r766, {low, high}; +} +{ +mul.f16x2 r767, r764, r766; +} +{ +mul.f16x2 r770, r739, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r773, {high, low}; +} +{ +fma.rn.f16x2 r775, r767, r773, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r781, {high, high}; +} +{ +mul.f16x2 r783, r736, r781; +} +{ +fma.rn.f16x2 r786, r700, r779, r783; +} +{ +mul.f16x2 r790, r700, r781; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r736, r779, r793; +} +barrier.sync 0; +mad.lo.s32 r1103, r1098, 1944, r1102; +st.shared.u32 [r1103], r658; +st.shared.u32 [r1103+4], r664; +st.shared.u32 [r1103+648], r749; +st.shared.u32 [r1103+652], r758; +st.shared.u32 [r1103+1296], r786; +st.shared.u32 [r1103+1300], r795; +barrier.sync 0; +ld.shared.u32 r822, [r1075]; +ld.shared.u32 r828, 
[r1075+4]; +ld.shared.u32 r819, [r1075+5832]; +ld.shared.u32 r825, [r1075+5836]; +ld.shared.u32 r820, [r1075+11664]; +ld.shared.u32 r826, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r817, {low, high}; +} +{ +add.f16x2 r818, r819, r820; +} +{ +add.f16x2 r821, r822, r818; +} +{ +add.f16x2 r824, r825, r826; +} +{ +add.f16x2 r827, r828, r824; +} +{ +add.f16x2 r830, r819, r820; +} +{ +mul.f16x2 r833, r830, r816; +} +{ +add.f16x2 r836, r822, r833; +} +{ +sub.f16x2 r839, r825, r826; +} +{ +mul.f16x2 r842, r839, r817; +} +{ +add.f16x2 r845, r836, r842; +} +{ +add.f16x2 r848, r819, r820; +} +{ +mul.f16x2 r851, r848, r816; +} +{ +add.f16x2 r854, r822, r851; +} +{ +sub.f16x2 r857, r825, r826; +} +{ +mul.f16x2 r860, r857, r817; +} +{ +sub.f16x2 r863, r854, r860; +} +{ +add.f16x2 r866, r825, r826; +} +{ +mul.f16x2 r869, r866, r816; +} +{ +add.f16x2 r872, r828, r869; +} +{ +sub.f16x2 r875, r819, r820; +} +{ +mul.f16x2 r878, r875, r817; +} +{ +sub.f16x2 r881, r872, r878; +} +{ +add.f16x2 r884, r825, r826; +} +{ +mul.f16x2 r887, r884, r816; +} +{ +add.f16x2 r890, r828, r887; +} +{ +sub.f16x2 r893, r819, r820; +} +{ +mul.f16x2 r896, r893, r817; +} +{ +add.f16x2 r899, r890, r896; +} +mul.wide.u32 rd12, r1071, -2032597691; +shr.u64 rd13, rd12, 39; +cvt.u32.u64 r1104, rd13; +mul.lo.s32 r1105, r1104, 243; +sub.s32 r1106, r1071, r1105; +shl.b32 r1107, r1106, 3; +add.s32 r1108, r1072, r1107; +cvt.rn.f32.u32 f92, r1104; +mul.f32 f93, f92, 0f3F32B8C2; +cos.approx.f32 f65, f93; +sin.approx.f32 f94, f93; +neg.f32 f66, f94; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f65; +cvt.rn.f16.f32 high, f66; +mov.b32 r902, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r907, {high, high}; +} +{ +mul.f16x2 r909, r881, r907; 
+} +{ +fma.rn.f16x2 r912, r845, r905, r909; +} +{ +mul.f16x2 r916, r845, r907; +} +{ +neg.f16x2 r919, r916; +} +{ +fma.rn.f16x2 r921, r881, r905, r919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r929, {low, high}; +} +{ +mul.f16x2 r930, r927, r929; +} +{ +mul.f16x2 r933, r902, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r936, {high, low}; +} +{ +fma.rn.f16x2 r938, r930, r936, r933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r938; +mov.b32 r942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r938; +mov.b32 r944, {high, high}; +} +{ +mul.f16x2 r946, r899, r944; +} +{ +fma.rn.f16x2 r949, r863, r942, r946; +} +{ +mul.f16x2 r953, r863, r944; +} +{ +neg.f16x2 r956, r953; +} +{ +fma.rn.f16x2 r958, r899, r942, r956; +} +barrier.sync 0; +mad.lo.s32 r1109, r1104, 5832, r1108; +st.shared.u32 [r1109], r821; +st.shared.u32 [r1109+4], r827; +st.shared.u32 [r1109+1944], r912; +st.shared.u32 [r1109+1948], r921; +st.shared.u32 [r1109+3888], r949; +st.shared.u32 [r1109+3892], r958; +barrier.sync 0; +ld.shared.u32 r985, [r1075]; +ld.shared.u32 r991, [r1075+4]; +ld.shared.u32 r982, [r1075+5832]; +ld.shared.u32 r988, [r1075+5836]; +ld.shared.u32 r983, [r1075+11664]; +ld.shared.u32 r989, [r1075+11668]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r979, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r980, {low, high}; +} +{ +add.f16x2 r981, r982, r983; +} +{ +add.f16x2 %0, r985, r981; +} +{ +add.f16x2 r987, r988, r989; +} +{ +add.f16x2 %1, r991, r987; +} +{ +add.f16x2 r993, r982, r983; +} +{ +mul.f16x2 r996, r993, r979; +} +{ +add.f16x2 r999, r985, r996; +} +{ +sub.f16x2 r1002, r988, r989; +} +{ +mul.f16x2 r1005, r1002, r980; +} +{ 
+add.f16x2 %2, r999, r1005; +} +{ +add.f16x2 r1011, r982, r983; +} +{ +mul.f16x2 r1014, r1011, r979; +} +{ +add.f16x2 r1017, r985, r1014; +} +{ +sub.f16x2 r1020, r988, r989; +} +{ +mul.f16x2 r1023, r1020, r980; +} +{ +sub.f16x2 %4, r1017, r1023; +} +{ +add.f16x2 r1029, r988, r989; +} +{ +mul.f16x2 r1032, r1029, r979; +} +{ +add.f16x2 r1035, r991, r1032; +} +{ +sub.f16x2 r1038, r982, r983; +} +{ +mul.f16x2 r1041, r1038, r980; +} +{ +sub.f16x2 %3, r1035, r1041; +} +{ +add.f16x2 r1047, r988, r989; +} +{ +mul.f16x2 r1050, r1047, r979; +} +{ +add.f16x2 r1053, r991, r1050; +} +{ +sub.f16x2 r1056, r982, r983; +} +{ +mul.f16x2 r1059, r1056, r980; +} +{ +add.f16x2 %5, r1053, r1059; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1097, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<95>; +.reg .b32 r<1110>; +.reg .b64 rd<14>; +mov.u32 r1065, %tid.y; +mov.u32 r1066, %6; +mad.lo.s32 r1067, r1065, 8748, r1066; +mov.u32 r1068, %tid.x; +mov.f32 f74, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r1, {low, high}; +} +mov.f32 f76, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} 
+{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r1068, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r1069, rd3; +mul.lo.s32 r1070, r1069, 729; +sub.s32 r1071, r1068, r1070; +mad.lo.s32 r1072, r1069, 8748, r1067; +cvt.rn.f32.u32 f77, r1071; +mul.f32 f78, f77, 0f3B3C4870; +cos.approx.f32 f5, f78; +sin.approx.f32 f79, f78; +neg.f32 f6, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, 
{high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r1073, r1071, 12, r1072; +st.shared.u32 [r1073], r6; +st.shared.u32 [r1073+4], r97; +st.shared.u32 [r1073+8], r134; +barrier.sync 0; +shl.b32 r1074, r1071, 3; +sub.s32 r1075, r1073, r1074; +ld.shared.u32 r170, [r1075]; +ld.shared.u32 r167, [r1075+2916]; +ld.shared.u32 r168, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1073], r12; +st.shared.u32 [r1073+4], r106; +st.shared.u32 [r1073+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r1075]; +ld.shared.u32 r173, [r1075+2916]; +ld.shared.u32 r174, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r1071, -1431655765; +shr.u64 rd5, 
rd4, 33; +cvt.u32.u64 r1076, rd5; +mul.lo.s32 r1077, r1076, 3; +sub.s32 r1078, r1071, r1077; +shl.b32 r1079, r1078, 2; +add.s32 r1080, r1072, r1079; +cvt.rn.f32.u32 f80, r1076; +mul.f32 f81, f80, 0f3C0D3654; +cos.approx.f32 f17, f81; +sin.approx.f32 f82, f81; +neg.f32 f18, f82; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r1081, r1076, 36, r1080; +st.shared.u32 [r1081], r169; +st.shared.u32 [r1081+12], r260; +st.shared.u32 [r1081+24], r297; +barrier.sync 0; +ld.shared.u32 r333, [r1075]; +ld.shared.u32 r330, [r1075+2916]; +ld.shared.u32 r331, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1081], r175; +st.shared.u32 [r1081+12], r269; +st.shared.u32 [r1081+24], r306; +barrier.sync 0; +ld.shared.u32 r339, [r1075]; 
+ld.shared.u32 r336, [r1075+2916]; +ld.shared.u32 r337, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r1071, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r1082, rd7; +mul.lo.s32 r1083, r1082, 9; +sub.s32 r1084, r1071, r1083; +shl.b32 r1085, r1084, 2; +add.s32 r1086, r1072, r1085; +cvt.rn.f32.u32 f83, r1082; +mul.f32 f84, f83, 0f3CD3D17E; +cos.approx.f32 f29, f84; +sin.approx.f32 f85, f84; +neg.f32 f30, f85; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ 
+neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r1087, r1082, 108, r1086; +st.shared.u32 [r1087], r332; +st.shared.u32 [r1087+36], r423; +st.shared.u32 [r1087+72], r460; +barrier.sync 0; +ld.shared.u32 r496, [r1075]; +ld.shared.u32 r493, [r1075+2916]; +ld.shared.u32 r494, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1087], r338; +st.shared.u32 [r1087+36], r432; +st.shared.u32 [r1087+72], r469; +barrier.sync 0; +ld.shared.u32 r502, [r1075]; +ld.shared.u32 r499, [r1075+2916]; +ld.shared.u32 r500, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ +add.f16x2 r522, r493, r494; 
+} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r1071, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r1088, rd9; +sub.s32 r1089, r1071, r1088; +shr.u32 r1090, r1089, 1; +add.s32 r1091, r1090, r1088; +shr.u32 r1092, r1091, 4; +mul.lo.s32 r1093, r1092, 27; +sub.s32 r1094, r1071, r1093; +shl.b32 r1095, r1094, 2; +add.s32 r1096, r1072, r1095; +cvt.rn.f32.u32 f86, r1092; +mul.f32 f87, f86, 0f3D9EDD1F; +cos.approx.f32 f41, f87; +sin.approx.f32 f88, f87; +neg.f32 f42, f88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, r607; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ +neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r1097, r1092, 324, r1096; +st.shared.u32 [r1097], r495; +st.shared.u32 [r1097+108], r586; +st.shared.u32 [r1097+216], r623; +barrier.sync 0; +ld.shared.u32 r659, [r1075]; +ld.shared.u32 r656, [r1075+2916]; +ld.shared.u32 r657, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1097], r501; +st.shared.u32 [r1097+108], r595; +st.shared.u32 [r1097+216], r632; +barrier.sync 0; +ld.shared.u32 r665, [r1075]; +ld.shared.u32 r662, [r1075+2916]; +ld.shared.u32 r663, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 r658, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 r664, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 r700, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 r718, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; 
+} +{ +add.f16x2 r736, r727, r733; +} +mul.wide.u32 rd10, r1071, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r1098, rd11; +mul.lo.s32 r1099, r1098, 81; +sub.s32 r1100, r1071, r1099; +shl.b32 r1101, r1100, 2; +add.s32 r1102, r1072, r1101; +cvt.rn.f32.u32 f89, r1098; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f53, f90; +sin.approx.f32 f91, f90; +neg.f32 f54, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r739, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r744, {high, high}; +} +{ +mul.f16x2 r746, r718, r744; +} +{ +fma.rn.f16x2 r749, r682, r742, r746; +} +{ +mul.f16x2 r753, r682, r744; +} +{ +neg.f16x2 r756, r753; +} +{ +fma.rn.f16x2 r758, r718, r742, r756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r766, {low, high}; +} +{ +mul.f16x2 r767, r764, r766; +} +{ +mul.f16x2 r770, r739, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r773, {high, low}; +} +{ +fma.rn.f16x2 r775, r767, r773, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r781, {high, high}; +} +{ +mul.f16x2 r783, r736, r781; +} +{ +fma.rn.f16x2 r786, r700, r779, r783; +} +{ +mul.f16x2 r790, r700, r781; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r736, r779, r793; +} +barrier.sync 0; +mad.lo.s32 r1103, r1098, 972, r1102; +st.shared.u32 [r1103], r658; +st.shared.u32 [r1103+324], r749; +st.shared.u32 [r1103+648], r786; +barrier.sync 0; +ld.shared.u32 r822, [r1075]; +ld.shared.u32 r819, [r1075+2916]; +ld.shared.u32 r820, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1103], r664; +st.shared.u32 
[r1103+324], r758; +st.shared.u32 [r1103+648], r795; +barrier.sync 0; +ld.shared.u32 r828, [r1075]; +ld.shared.u32 r825, [r1075+2916]; +ld.shared.u32 r826, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r817, {low, high}; +} +{ +add.f16x2 r818, r819, r820; +} +{ +add.f16x2 r821, r822, r818; +} +{ +add.f16x2 r824, r825, r826; +} +{ +add.f16x2 r827, r828, r824; +} +{ +add.f16x2 r830, r819, r820; +} +{ +mul.f16x2 r833, r830, r816; +} +{ +add.f16x2 r836, r822, r833; +} +{ +sub.f16x2 r839, r825, r826; +} +{ +mul.f16x2 r842, r839, r817; +} +{ +add.f16x2 r845, r836, r842; +} +{ +add.f16x2 r848, r819, r820; +} +{ +mul.f16x2 r851, r848, r816; +} +{ +add.f16x2 r854, r822, r851; +} +{ +sub.f16x2 r857, r825, r826; +} +{ +mul.f16x2 r860, r857, r817; +} +{ +sub.f16x2 r863, r854, r860; +} +{ +add.f16x2 r866, r825, r826; +} +{ +mul.f16x2 r869, r866, r816; +} +{ +add.f16x2 r872, r828, r869; +} +{ +sub.f16x2 r875, r819, r820; +} +{ +mul.f16x2 r878, r875, r817; +} +{ +sub.f16x2 r881, r872, r878; +} +{ +add.f16x2 r884, r825, r826; +} +{ +mul.f16x2 r887, r884, r816; +} +{ +add.f16x2 r890, r828, r887; +} +{ +sub.f16x2 r893, r819, r820; +} +{ +mul.f16x2 r896, r893, r817; +} +{ +add.f16x2 r899, r890, r896; +} +mul.wide.u32 rd12, r1071, -2032597691; +shr.u64 rd13, rd12, 39; +cvt.u32.u64 r1104, rd13; +mul.lo.s32 r1105, r1104, 243; +sub.s32 r1106, r1071, r1105; +shl.b32 r1107, r1106, 2; +add.s32 r1108, r1072, r1107; +cvt.rn.f32.u32 f92, r1104; +mul.f32 f93, f92, 0f3F32B8C2; +cos.approx.f32 f65, f93; +sin.approx.f32 f94, f93; +neg.f32 f66, f94; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f65; +cvt.rn.f16.f32 high, f66; +mov.b32 r902, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r907, {high, high}; +} +{ +mul.f16x2 
r909, r881, r907; +} +{ +fma.rn.f16x2 r912, r845, r905, r909; +} +{ +mul.f16x2 r916, r845, r907; +} +{ +neg.f16x2 r919, r916; +} +{ +fma.rn.f16x2 r921, r881, r905, r919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r929, {low, high}; +} +{ +mul.f16x2 r930, r927, r929; +} +{ +mul.f16x2 r933, r902, r925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r936, {high, low}; +} +{ +fma.rn.f16x2 r938, r930, r936, r933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r938; +mov.b32 r942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r938; +mov.b32 r944, {high, high}; +} +{ +mul.f16x2 r946, r899, r944; +} +{ +fma.rn.f16x2 r949, r863, r942, r946; +} +{ +mul.f16x2 r953, r863, r944; +} +{ +neg.f16x2 r956, r953; +} +{ +fma.rn.f16x2 r958, r899, r942, r956; +} +barrier.sync 0; +mad.lo.s32 r1109, r1104, 2916, r1108; +st.shared.u32 [r1109], r821; +st.shared.u32 [r1109+972], r912; +st.shared.u32 [r1109+1944], r949; +barrier.sync 0; +ld.shared.u32 r985, [r1075]; +ld.shared.u32 r982, [r1075+2916]; +ld.shared.u32 r983, [r1075+5832]; +barrier.sync 0; +st.shared.u32 [r1109], r827; +st.shared.u32 [r1109+972], r921; +st.shared.u32 [r1109+1944], r958; +barrier.sync 0; +ld.shared.u32 r991, [r1075]; +ld.shared.u32 r988, [r1075+2916]; +ld.shared.u32 r989, [r1075+5832]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r979, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r980, {low, high}; +} +{ +add.f16x2 r981, r982, r983; +} +{ +add.f16x2 %0, r985, r981; +} +{ +add.f16x2 r987, r988, r989; +} +{ +add.f16x2 %1, r991, r987; +} +{ +add.f16x2 r993, r982, r983; +} +{ +mul.f16x2 r996, r993, r979; +} +{ +add.f16x2 r999, r985, r996; +} +{ +sub.f16x2 r1002, r988, r989; 
+} +{ +mul.f16x2 r1005, r1002, r980; +} +{ +add.f16x2 %2, r999, r1005; +} +{ +add.f16x2 r1011, r982, r983; +} +{ +mul.f16x2 r1014, r1011, r979; +} +{ +add.f16x2 r1017, r985, r1014; +} +{ +sub.f16x2 r1020, r988, r989; +} +{ +mul.f16x2 r1023, r1020, r980; +} +{ +sub.f16x2 %4, r1017, r1023; +} +{ +add.f16x2 r1029, r988, r989; +} +{ +mul.f16x2 r1032, r1029, r979; +} +{ +add.f16x2 r1035, r991, r1032; +} +{ +sub.f16x2 r1038, r982, r983; +} +{ +mul.f16x2 r1041, r1038, r980; +} +{ +sub.f16x2 %3, r1035, r1041; +} +{ +add.f16x2 r1047, r988, r989; +} +{ +mul.f16x2 r1050, r1047, r979; +} +{ +add.f16x2 r1053, r991, r1050; +} +{ +sub.f16x2 r1056, r982, r983; +} +{ +mul.f16x2 r1059, r1056, r980; +} +{ +add.f16x2 %5, r1053, r1059; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..d603310a2c4bc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_fwd.hpp.inc @@ -0,0 +1,6300 @@ +#ifndef CUFFTDX_FFT_2187_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_2187_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<145, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2481>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 
17496, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2480, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2479, %58, f2480; +mul.f32 f119, f2480, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2478, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2477, %64, f2478; +mul.f32 f135, f2478, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2476, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2475, %70, f2476; +mul.f32 f151, f2476, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2474, f133, 0f3F441B7D; +sub.f32 f159, f2474, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2472, f149, 0f3E31D0D4; +mul.f32 f2473, f155, 0fBF7C1C5C; +sub.f32 f164, f2472, f2473; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2470, f134, 0f3E31D0D4; +mul.f32 f2471, f140, 0fBF7C1C5C; +sub.f32 f169, f2470, f2471; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2468, f150, 0fBF708FB2; +mul.f32 f2469, f156, 0fBEAF1D44; +sub.f32 f174, f2468, f2469; +mul.f32 f175, f156, 
0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2467, f2477, f2475; +sub.f32 f183, f2477, f2475; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2466, f2479, f2467; +mul.f32 f187, f2467, 0f3F000000; +sub.f32 f188, f2479, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2465, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2464, f123, f2465; +mul.f32 f203, f2465, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2463, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2462, f124, f2463; +mul.f32 f219, f2463, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2459, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2457, %113, f2459; +mul.f32 f235, f2459, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, 
f245; +add.f32 f2454, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2452, %116, f2454; +mul.f32 f251, f2454, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2449, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2447, %119, f2449; +mul.f32 f267, f2449, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2446, f249, 0f3F441B7D; +sub.f32 f275, f2446, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2445, f265, 0f3E31D0D4; +sub.f32 f280, f2445, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f2443, f250, 0f3E31D0D4; +mul.f32 f2444, f256, 0fBF7C1C5C; +sub.f32 f285, f2443, f2444; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2441, f266, 0fBF708FB2; +mul.f32 f2442, f272, 0fBEAF1D44; +sub.f32 f290, f2441, f2442; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2440, f2452, f2447; +sub.f32 f299, f2452, f2447; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2439, f2457, f2440; +mul.f32 f303, f2440, 0f3F000000; +sub.f32 f304, f2457, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, 
f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2438, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2437, f239, f2438; +mul.f32 f319, f2438, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2436, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2435, f240, f2436; +mul.f32 f335, f2436, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2432, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2430, %122, f2432; +mul.f32 f351, f2432, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2427, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2425, %125, f2427; +mul.f32 f367, f2427, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2423, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 
f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2421, %127, f2423; +mul.f32 f383, f2423, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2420, f365, 0f3F441B7D; +sub.f32 f391, f2420, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2419, f381, 0f3E31D0D4; +sub.f32 f396, f2419, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2417, f366, 0f3E31D0D4; +mul.f32 f2418, f372, 0fBF7C1C5C; +sub.f32 f401, f2417, f2418; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2415, f382, 0fBF708FB2; +mul.f32 f2416, f388, 0fBEAF1D44; +sub.f32 f406, f2415, f2416; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2414, f2425, f2421; +sub.f32 f415, f2425, f2421; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2413, f2430, f2414; +mul.f32 f419, f2414, 0f3F000000; +sub.f32 f420, f2430, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2412, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2411, f355, f2412; +mul.f32 f435, f2412, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2410, f403, f408; 
+sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2409, f356, f2410; +mul.f32 f451, f2410, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2437, 0fBE6C2691; +mul.f32 f2408, f310, 0f3F791978; +sub.f32 f459, f2408, f458; +mul.f32 f460, f2437, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2406, f426, 0f3F64C51C; +mul.f32 f2407, f2411, 0fBEE5C902; +sub.f32 f464, f2406, f2407; +mul.f32 f465, f2411, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2404, f326, 0f3F64C51C; +mul.f32 f2405, f2435, 0fBEE5C902; +sub.f32 f469, f2404, f2405; +mul.f32 f470, f2435, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2402, f442, 0f3F18DF63; +mul.f32 f2403, f2409, 0fBF4D57F2; +sub.f32 f474, f2402, f2403; +mul.f32 f475, f2409, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2400, f301, 0f3F441B7D; +mul.f32 f2401, f307, 0fBF248DBB; +sub.f32 f479, f2400, f2401; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2399, f417, 0f3E31D0D4; +sub.f32 f484, f2399, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2398, f317, 0f3F18DF63; +sub.f32 f489, f2398, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2397, f433, 0fBE92D7E0; +sub.f32 f494, f2397, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2396, f333, 0f3ECACAF8; +sub.f32 f499, f2396, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2395, f449, 0fBF2FAD88; +sub.f32 f504, f2395, f503; +mul.f32 f505, f455, 0fBF2FAD88; 
+fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2394, f302, 0f3E31D0D4; +sub.f32 f509, f2394, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2392, f418, 0fBF708FB2; +mul.f32 f2393, f424, 0fBEAF1D44; +sub.f32 f514, f2392, f2393; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2390, f318, 0fBD6E2946; +mul.f32 f2391, f324, 0fBF7F9120; +sub.f32 f519, f2390, f2391; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2388, f434, 0fBF7E44DE; +mul.f32 f2389, f440, 0f3DEDC21F; +sub.f32 f524, f2388, f2389; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2387, f334, 0fBE92D7E0; +sub.f32 f529, f2387, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2386, f450, 0fBF55E287; +sub.f32 f534, f2386, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f2385, f2439, f2413; +sub.f32 f541, f2439, f2413; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f2385, 0f3F000000; +sub.f32 f546, f2466, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2384, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2383, f2464, f2384; +mul.f32 f561, f2384, 0f3F000000; +sub.f32 f562, f2464, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; 
+mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2382, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2381, f2462, f2382; +mul.f32 f577, f2382, 0f3F000000; +sub.f32 f578, f2462, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2380, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2379, f191, f2380; +mul.f32 f593, f2380, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2378, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2377, f207, f2378; +mul.f32 f609, f2378, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2376, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2375, f223, f2376; +mul.f32 f625, f2376, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2374, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 
f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2373, f192, f2374; +mul.f32 f641, f2374, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2372, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2371, f208, f2372; +mul.f32 f657, f2372, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2370, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2369, f224, f2370; +mul.f32 f673, f2370, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f2383; +mul.f32 f685, f679, f2383; +mul.f32 f2367, f679, f679; +mul.f32 f2368, f680, f680; +sub.f32 f688, f2367, f2368; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f2381; +mul.f32 f693, f688, f2381; +mul.f32 f695, f680, f690; +mul.f32 f2366, f679, f688; +sub.f32 f696, f2366, f695; +mul.f32 f2365, f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f2379; +mul.f32 f701, f696, f2379; +mul.f32 f2363, f679, f696; +mul.f32 f2364, 
f680, f698; +sub.f32 f704, f2363, f2364; +mul.f32 f2362, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f2377; +mul.f32 f709, f704, f2377; +mul.f32 f711, f680, f706; +mul.f32 f2361, f679, f704; +sub.f32 f712, f2361, f711; +mul.f32 f2360, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f2375; +mul.f32 f717, f712, f2375; +mul.f32 f719, f680, f714; +mul.f32 f2359, f679, f712; +sub.f32 f720, f2359, f719; +mul.f32 f2358, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f2373; +mul.f32 f725, f720, f2373; +mul.f32 f2356, f679, f720; +mul.f32 f2357, f680, f722; +sub.f32 f728, f2356, f2357; +mul.f32 f2355, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f2371; +mul.f32 f733, f728, f2371; +mul.f32 f735, f680, f730; +mul.f32 f2354, f679, f728; +sub.f32 f736, f2354, f735; +mul.f32 f2353, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f2369; +mul.f32 f741, f736, f2369; +mul.f32 f743, f680, f738; +mul.f32 f2352, f679, f736; +sub.f32 f744, f2352, f743; +mul.f32 f2351, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f2349, f679, f744; +mul.f32 f2350, f680, f746; +sub.f32 f752, f2349, f2350; +mul.f32 f2348, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2347, f679, f752; +sub.f32 f760, f2347, f759; +mul.f32 f2346, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f2344, f679, f760; +mul.f32 f2345, f680, f762; +sub.f32 f768, f2344, f2345; +mul.f32 f2343, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; 
+mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2342, f679, f768; +sub.f32 f776, f2342, f775; +mul.f32 f2341, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2340, f679, f776; +sub.f32 f784, f2340, f783; +mul.f32 f2339, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f2337, f679, f784; +mul.f32 f2338, f680, f786; +sub.f32 f792, f2337, f2338; +mul.f32 f2336, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2335, f679, f792; +sub.f32 f800, f2335, f799; +mul.f32 f2334, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2333, f679, f800; +sub.f32 f808, f2333, f807; +mul.f32 f2332, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 f813, f808, f677; +mul.f32 f2330, f679, f808; +mul.f32 f2331, f680, f810; +sub.f32 f816, f2330, f2331; +mul.f32 f2329, f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2328, f679, f816; +sub.f32 f824, f2328, f823; +mul.f32 f2327, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f2325, f679, f824; +mul.f32 f2326, f680, f826; +sub.f32 f832, f2325, f2326; +mul.f32 f2324, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, f582; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2323, f679, f832; +sub.f32 f840, f2323, f839; +mul.f32 f2322, f832, f576; +mul.f32 f841, f679, f834; 
+fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2321, f679, f840; +sub.f32 f848, f2321, f847; +mul.f32 f2320, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f2318, f679, f848; +mul.f32 f2319, f680, f850; +sub.f32 f856, f2318, f2319; +mul.f32 f2317, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2316, f679, f856; +sub.f32 f864, f2316, f863; +mul.f32 f2315, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2314, f679, f864; +sub.f32 f872, f2314, f871; +mul.f32 f2313, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f2311, f679, f872; +mul.f32 f2312, f680, f874; +sub.f32 f880, f2311, f2312; +mul.f32 f2310, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 f2309, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2466, f2385; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f2310, f684; +st.shared.v2.f32 [r21+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f2365, f692; +st.shared.v2.f32 [r21+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f2362, f700; +st.shared.v2.f32 [r21+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f2360, f708; +st.shared.v2.f32 [r21+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f2358, f716; +st.shared.v2.f32 [r21+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; 
+sub.f32 f899, f2355, f724; +st.shared.v2.f32 [r21+48], {f899, f898}; +sub.f32 f900, f2353, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r21+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f2351, f740; +st.shared.v2.f32 [r21+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f2348, f748; +st.shared.v2.f32 [r21+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f2346, f756; +st.shared.v2.f32 [r21+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f2343, f764; +st.shared.v2.f32 [r21+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f2341, f772; +st.shared.v2.f32 [r21+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f2339, f780; +st.shared.v2.f32 [r21+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f2336, f788; +st.shared.v2.f32 [r21+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f2334, f796; +st.shared.v2.f32 [r21+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f2332, f804; +st.shared.v2.f32 [r21+128], {f919, f918}; +fma.rn.f32 f920, f810, f671, f813; +sub.f32 f921, f2329, f812; +st.shared.v2.f32 [r21+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, f821; +sub.f32 f923, f2327, f820; +st.shared.v2.f32 [r21+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f2324, f828; +st.shared.v2.f32 [r21+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f2322, f836; +st.shared.v2.f32 [r21+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, f2320, f844; +st.shared.v2.f32 [r21+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f2317, f852; +st.shared.v2.f32 [r21+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f2315, f860; +st.shared.v2.f32 [r21+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f2313, f868; +st.shared.v2.f32 [r21+192], {f935, f934}; 
+fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f2309, f876; +st.shared.v2.f32 [r21+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r21+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+648]; +ld.shared.v2.f32 {f948, f949}, [r10+1296]; +ld.shared.v2.f32 {f952, f953}, [r10+1944]; +ld.shared.v2.f32 {f956, f957}, [r10+2592]; +ld.shared.v2.f32 {f960, f961}, [r10+3240]; +ld.shared.v2.f32 {f964, f965}, [r10+3888]; +ld.shared.v2.f32 {f968, f969}, [r10+4536]; +ld.shared.v2.f32 {f972, f973}, [r10+5184]; +ld.shared.v2.f32 {f976, f977}, [r10+5832]; +ld.shared.v2.f32 {f980, f981}, [r10+6480]; +ld.shared.v2.f32 {f984, f985}, [r10+7128]; +ld.shared.v2.f32 {f988, f989}, [r10+7776]; +ld.shared.v2.f32 {f992, f993}, [r10+8424]; +ld.shared.v2.f32 {f996, f997}, [r10+9072]; +ld.shared.v2.f32 {f1000, f1001}, [r10+9720]; +ld.shared.v2.f32 {f1004, f1005}, [r10+10368]; +ld.shared.v2.f32 {f1008, f1009}, [r10+11016]; +ld.shared.v2.f32 {f1012, f1013}, [r10+11664]; +ld.shared.v2.f32 {f1016, f1017}, [r10+12312]; +ld.shared.v2.f32 {f1020, f1021}, [r10+12960]; +ld.shared.v2.f32 {f1024, f1025}, [r10+13608]; +ld.shared.v2.f32 {f1028, f1029}, [r10+14256]; +ld.shared.v2.f32 {f1032, f1033}, [r10+14904]; +ld.shared.v2.f32 {f1036, f1037}, [r10+15552]; +ld.shared.v2.f32 {f1040, f1041}, [r10+16200]; +ld.shared.v2.f32 {f1044, f1045}, [r10+16848]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2308, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0f3F5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f2307, f941, f2308; +mul.f32 f1058, f2308, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0f3F5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; 
+add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2306, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2305, f953, f2306; +mul.f32 f1074, f2306, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0f3F5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2304, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2303, f965, f2304; +mul.f32 f1090, f2304, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0f3F5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2301, f1072, 0f3F441B7D; +mul.f32 f2302, f1078, 0fBF248DBB; +sub.f32 f1098, f2301, f2302; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0fBF248DBB, f1099; +mul.f32 f2299, f1088, 0f3E31D0D4; +mul.f32 f2300, f1094, 0fBF7C1C5C; +sub.f32 f1103, f2299, f2300; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0fBF7C1C5C, f1104; +mul.f32 f2297, f1073, 0f3E31D0D4; +mul.f32 f2298, f1079, 0fBF7C1C5C; +sub.f32 f1108, f2297, f2298; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0fBF7C1C5C, f1109; +mul.f32 f1112, f1095, 0fBEAF1D44; +mul.f32 f2296, f1089, 0fBF708FB2; +sub.f32 f1113, f2296, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0fBEAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2295, f2305, f2303; +sub.f32 f1122, f2305, f2303; +mul.f32 f1123, f1122, 0f3F5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2294, f2307, f2295; +mul.f32 
f1126, f2295, 0f3F000000; +sub.f32 f1127, f2307, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0f3F5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2293, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0f3F5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2292, f1062, f2293; +mul.f32 f1142, f2293, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0f3F5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2291, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0f3F5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2290, f1063, f2291; +mul.f32 f1158, f2291, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0f3F5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2289, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0f3F5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2288, f945, f2289; +mul.f32 f1174, f2289, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0f3F5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2287, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0f3F5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2286, f957, f2287; +mul.f32 f1190, f2287, 
0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0f3F5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2285, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0f3F5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2284, f969, f2285; +mul.f32 f1206, f2285, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0f3F5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2282, f1188, 0f3F441B7D; +mul.f32 f2283, f1194, 0fBF248DBB; +sub.f32 f1214, f2282, f2283; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0fBF248DBB, f1215; +mul.f32 f2280, f1204, 0f3E31D0D4; +mul.f32 f2281, f1210, 0fBF7C1C5C; +sub.f32 f1219, f2280, f2281; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0fBF7C1C5C, f1220; +mul.f32 f2278, f1189, 0f3E31D0D4; +mul.f32 f2279, f1195, 0fBF7C1C5C; +sub.f32 f1224, f2278, f2279; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0fBF7C1C5C, f1225; +mul.f32 f2276, f1205, 0fBF708FB2; +mul.f32 f2277, f1211, 0fBEAF1D44; +sub.f32 f1229, f2276, f2277; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0fBEAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2275, f2286, f2284; +sub.f32 f1238, f2286, f2284; +mul.f32 f1239, f1238, 0f3F5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2274, f2288, f2275; +mul.f32 f1242, f2275, 0f3F000000; +sub.f32 f1243, f2288, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0f3F5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, 
f1252; +add.f32 f2273, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0f3F5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f2272, f1178, f2273; +mul.f32 f1258, f2273, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0f3F5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2271, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0f3F5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2270, f1179, f2271; +mul.f32 f1274, f2271, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0f3F5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2269, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0f3F5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2268, f949, f2269; +mul.f32 f1290, f2269, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0f3F5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2267, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0f3F5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2266, f961, f2267; +mul.f32 f1306, f2267, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0f3F5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 
f2265, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0f3F5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f2264, f973, f2265; +mul.f32 f1322, f2265, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0f3F5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0fBF248DBB; +mul.f32 f2263, f1304, 0f3F441B7D; +sub.f32 f1330, f2263, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0fBF248DBB, f1331; +mul.f32 f2261, f1320, 0f3E31D0D4; +mul.f32 f2262, f1326, 0fBF7C1C5C; +sub.f32 f1335, f2261, f2262; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0fBF7C1C5C, f1336; +mul.f32 f2259, f1305, 0f3E31D0D4; +mul.f32 f2260, f1311, 0fBF7C1C5C; +sub.f32 f1340, f2259, f2260; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0fBF7C1C5C, f1341; +mul.f32 f2257, f1321, 0fBF708FB2; +mul.f32 f2258, f1327, 0fBEAF1D44; +sub.f32 f1345, f2257, f2258; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0fBEAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2256, f2266, f2264; +sub.f32 f1354, f2266, f2264; +mul.f32 f1355, f1354, 0f3F5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2255, f2268, f2256; +mul.f32 f1358, f2256, 0f3F000000; +sub.f32 f1359, f2268, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0f3F5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2254, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0f3F5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2253, f1294, f2254; +mul.f32 f1374, f2254, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 
f1377, f1376, 0f3F5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2252, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0f3F5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2251, f1295, f2252; +mul.f32 f1390, f2252, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0f3F5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2272, 0fBE6C2691; +mul.f32 f2250, f1249, 0f3F791978; +sub.f32 f1398, f2250, f1397; +mul.f32 f1399, f2272, 0f3F791978; +fma.rn.f32 f1400, f1249, 0fBE6C2691, f1399; +mul.f32 f1402, f2253, 0fBEE5C902; +mul.f32 f2249, f1365, 0f3F64C51C; +sub.f32 f1403, f2249, f1402; +mul.f32 f1404, f2253, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0fBEE5C902, f1404; +mul.f32 f1407, f2270, 0fBEE5C902; +mul.f32 f2248, f1265, 0f3F64C51C; +sub.f32 f1408, f2248, f1407; +mul.f32 f1409, f2270, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0fBEE5C902, f1409; +mul.f32 f2246, f1381, 0f3F18DF63; +mul.f32 f2247, f2251, 0fBF4D57F2; +sub.f32 f1413, f2246, f2247; +mul.f32 f1414, f2251, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0fBF4D57F2, f1414; +mul.f32 f2244, f1240, 0f3F441B7D; +mul.f32 f2245, f1246, 0fBF248DBB; +sub.f32 f1418, f2244, f2245; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0fBF248DBB, f1419; +mul.f32 f2242, f1356, 0f3E31D0D4; +mul.f32 f2243, f1362, 0fBF7C1C5C; +sub.f32 f1423, f2242, f2243; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0fBF7C1C5C, f1424; +mul.f32 f2240, f1256, 0f3F18DF63; +mul.f32 f2241, f1262, 0fBF4D57F2; +sub.f32 f1428, f2240, f2241; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0fBF4D57F2, f1429; +mul.f32 f1432, f1378, 0fBF753ECD; +mul.f32 f2239, f1372, 0fBE92D7E0; +sub.f32 f1433, f2239, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, 
f1372, 0fBF753ECD, f1434; +mul.f32 f1437, f1278, 0fBF6B1036; +mul.f32 f2238, f1272, 0f3ECACAF8; +sub.f32 f1438, f2238, f1437; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0fBF6B1036, f1439; +mul.f32 f1442, f1394, 0fBF3A3529; +mul.f32 f2237, f1388, 0fBF2FAD88; +sub.f32 f1443, f2237, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0fBF3A3529, f1444; +mul.f32 f1447, f1247, 0fBF7C1C5C; +mul.f32 f2236, f1241, 0f3E31D0D4; +sub.f32 f1448, f2236, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0fBF7C1C5C, f1449; +mul.f32 f1452, f1363, 0fBEAF1D44; +mul.f32 f2235, f1357, 0fBF708FB2; +sub.f32 f1453, f2235, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0fBEAF1D44, f1454; +mul.f32 f1457, f1263, 0fBF7F9120; +mul.f32 f2234, f1257, 0fBD6E2946; +sub.f32 f1458, f2234, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0fBF7F9120, f1459; +mul.f32 f2232, f1373, 0fBF7E44DE; +mul.f32 f2233, f1379, 0f3DEDC21F; +sub.f32 f1463, f2232, f2233; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0f3DEDC21F, f1464; +mul.f32 f2230, f1273, 0fBE92D7E0; +mul.f32 f2231, f1279, 0fBF753ECD; +sub.f32 f1468, f2230, f2231; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0fBF753ECD, f1469; +mul.f32 f2228, f1389, 0fBF55E287; +mul.f32 f2229, f1395, 0f3F0CAC9F; +sub.f32 f1473, f2228, f2229; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0f3F0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2227, f2274, f2255; +sub.f32 f1480, f2274, f2255; +mul.f32 f1481, f1480, 0f3F5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, f2227, 0f3F000000; +sub.f32 f1485, f2294, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0f3F5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 
0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2226, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0f3F5DB3D7; +add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2225, f2292, f2226; +mul.f32 f1500, f2226, 0f3F000000; +sub.f32 f1501, f2292, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0f3F5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2224, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0f3F5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, f1511, f1513; +add.f32 f2223, f2290, f2224; +mul.f32 f1516, f2224, 0f3F000000; +sub.f32 f1517, f2290, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0f3F5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2222, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 f1529, f1528, 0f3F5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2221, f1130, f2222; +mul.f32 f1532, f2222, 0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0f3F5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2220, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0f3F5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2219, f1146, f2220; +mul.f32 f1548, f2220, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0f3F5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 
0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2218, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0f3F5DB3D7; +add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2217, f1162, f2218; +mul.f32 f1564, f2218, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0f3F5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2216, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0f3F5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, f1575, f1577; +add.f32 f2215, f1131, f2216; +mul.f32 f1580, f2216, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0f3F5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2214, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 f1593, f1592, 0f3F5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2213, f1147, f2214; +mul.f32 f1596, f2214, 0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0f3F5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2212, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0f3F5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2211, f1163, f2212; +mul.f32 f1612, f2212, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0f3F5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; 
+sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1623, f1619, f2225; +mul.f32 f1624, f1618, f2225; +mul.f32 f2209, f1618, f1618; +mul.f32 f2210, f1619, f1619; +sub.f32 f1627, f2209, f2210; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1631, f1629, f2223; +mul.f32 f1632, f1627, f2223; +mul.f32 f1634, f1619, f1629; +mul.f32 f2208, f1618, f1627; +sub.f32 f1635, f2208, f1634; +mul.f32 f2207, f1627, f1507; +mul.f32 f1636, f1618, f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1639, f1637, f2221; +mul.f32 f1640, f1635, f2221; +mul.f32 f2205, f1618, f1635; +mul.f32 f2206, f1619, f1637; +sub.f32 f1643, f2205, f2206; +mul.f32 f2204, f1635, f1523; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1647, f1645, f2219; +mul.f32 f1648, f1643, f2219; +mul.f32 f1650, f1619, f1645; +mul.f32 f2203, f1618, f1643; +sub.f32 f1651, f2203, f1650; +mul.f32 f2202, f1643, f1539; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1655, f1653, f2217; +mul.f32 f1656, f1651, f2217; +mul.f32 f1658, f1619, f1653; +mul.f32 f2201, f1618, f1651; +sub.f32 f1659, f2201, f1658; +mul.f32 f2200, f1651, f1555; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1663, f1661, f2215; +mul.f32 f1664, f1659, f2215; +mul.f32 f2198, f1618, f1659; +mul.f32 f2199, f1619, f1661; +sub.f32 f1667, f2198, f2199; +mul.f32 f2197, f1659, f1571; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1671, f1669, f2213; +mul.f32 f1672, f1667, f2213; +mul.f32 f1674, f1619, f1669; +mul.f32 f2196, f1618, f1667; +sub.f32 f1675, f2196, f1674; +mul.f32 f2195, f1667, f1587; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1679, f1677, 
f2211; +mul.f32 f1680, f1675, f2211; +mul.f32 f1682, f1619, f1677; +mul.f32 f2194, f1618, f1675; +sub.f32 f1683, f2194, f1682; +mul.f32 f2193, f1675, f1603; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1687, f1685, f1488; +mul.f32 f1688, f1683, f1488; +mul.f32 f2191, f1618, f1683; +mul.f32 f2192, f1619, f1685; +sub.f32 f1691, f2191, f2192; +mul.f32 f2190, f1683, f1482; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1695, f1693, f1504; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2189, f1618, f1691; +sub.f32 f1699, f2189, f1698; +mul.f32 f2188, f1691, f1498; +mul.f32 f1700, f1618, f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1703, f1701, f1520; +mul.f32 f1704, f1699, f1520; +mul.f32 f2186, f1618, f1699; +mul.f32 f2187, f1619, f1701; +sub.f32 f1707, f2186, f2187; +mul.f32 f2185, f1699, f1514; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1711, f1709, f1536; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2184, f1618, f1707; +sub.f32 f1715, f2184, f1714; +mul.f32 f2183, f1707, f1530; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1719, f1717, f1552; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2182, f1618, f1715; +sub.f32 f1723, f2182, f1722; +mul.f32 f2181, f1715, f1546; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1727, f1725, f1568; +mul.f32 f1728, f1723, f1568; +mul.f32 f2179, f1618, f1723; +mul.f32 f2180, f1619, f1725; +sub.f32 f1731, f2179, f2180; +mul.f32 f2178, f1723, f1562; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1735, f1733, f1584; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2177, f1618, f1731; +sub.f32 f1739, f2177, f1738; +mul.f32 f2176, f1731, f1578; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1743, f1741, 
f1600; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2175, f1618, f1739; +sub.f32 f1747, f2175, f1746; +mul.f32 f2174, f1739, f1594; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1751, f1749, f1616; +mul.f32 f1752, f1747, f1616; +mul.f32 f2172, f1618, f1747; +mul.f32 f2173, f1619, f1749; +sub.f32 f1755, f2172, f2173; +mul.f32 f2171, f1747, f1610; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1759, f1757, f1489; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2170, f1618, f1755; +sub.f32 f1763, f2170, f1762; +mul.f32 f2169, f1755, f1483; +mul.f32 f1764, f1618, f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1767, f1765, f1505; +mul.f32 f1768, f1763, f1505; +mul.f32 f2167, f1618, f1763; +mul.f32 f2168, f1619, f1765; +sub.f32 f1771, f2167, f2168; +mul.f32 f2166, f1763, f1499; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1775, f1773, f1521; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2165, f1618, f1771; +sub.f32 f1779, f2165, f1778; +mul.f32 f2164, f1771, f1515; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1783, f1781, f1537; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2163, f1618, f1779; +sub.f32 f1787, f2163, f1786; +mul.f32 f2162, f1779, f1531; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1791, f1789, f1553; +mul.f32 f1792, f1787, f1553; +mul.f32 f2160, f1618, f1787; +mul.f32 f2161, f1619, f1789; +sub.f32 f1795, f2160, f2161; +mul.f32 f2159, f1787, f1547; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1799, f1797, f1569; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2158, f1618, f1795; +sub.f32 f1803, f2158, f1802; +mul.f32 f2157, f1795, f1563; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1807, f1805, 
f1585; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2156, f1618, f1803; +sub.f32 f1811, f2156, f1810; +mul.f32 f2155, f1803, f1579; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1815, f1813, f1601; +mul.f32 f1816, f1811, f1601; +mul.f32 f2153, f1618, f1811; +mul.f32 f2154, f1619, f1813; +sub.f32 f1819, f2153, f2154; +mul.f32 f2152, f1618, f1491; +mul.f32 f1820, f1618, f1813; +mul.f32 f2151, f1811, f1595; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1819, f1611; +mul.f32 f1823, f1821, f1617; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 5832, r19; +add.f32 f1825, f2294, f2227; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1619, f1491, f1624; +sub.f32 f1828, f2152, f1623; +st.shared.v2.f32 [r20+216], {f1828, f1827}; +fma.rn.f32 f1829, f1629, f1507, f1632; +sub.f32 f1830, f2207, f1631; +st.shared.v2.f32 [r20+432], {f1830, f1829}; +fma.rn.f32 f1831, f1637, f1523, f1640; +sub.f32 f1832, f2204, f1639; +st.shared.v2.f32 [r20+648], {f1832, f1831}; +fma.rn.f32 f1833, f1645, f1539, f1648; +sub.f32 f1834, f2202, f1647; +st.shared.v2.f32 [r20+864], {f1834, f1833}; +fma.rn.f32 f1835, f1653, f1555, f1656; +sub.f32 f1836, f2200, f1655; +st.shared.v2.f32 [r20+1080], {f1836, f1835}; +sub.f32 f1837, f2197, f1663; +fma.rn.f32 f1838, f1661, f1571, f1664; +st.shared.v2.f32 [r20+1296], {f1837, f1838}; +fma.rn.f32 f1839, f1669, f1587, f1672; +sub.f32 f1840, f2195, f1671; +st.shared.v2.f32 [r20+1512], {f1840, f1839}; +sub.f32 f1841, f2193, f1679; +fma.rn.f32 f1842, f1677, f1603, f1680; +st.shared.v2.f32 [r20+1728], {f1841, f1842}; +fma.rn.f32 f1843, f1685, f1482, f1688; +sub.f32 f1844, f2190, f1687; +st.shared.v2.f32 [r20+1944], {f1844, f1843}; +fma.rn.f32 f1845, f1693, f1498, f1696; +sub.f32 f1846, f2188, f1695; +st.shared.v2.f32 [r20+2160], {f1846, f1845}; +fma.rn.f32 f1847, f1701, f1514, f1704; +sub.f32 
f1848, f2185, f1703; +st.shared.v2.f32 [r20+2376], {f1848, f1847}; +fma.rn.f32 f1849, f1709, f1530, f1712; +sub.f32 f1850, f2183, f1711; +st.shared.v2.f32 [r20+2592], {f1850, f1849}; +fma.rn.f32 f1851, f1717, f1546, f1720; +sub.f32 f1852, f2181, f1719; +st.shared.v2.f32 [r20+2808], {f1852, f1851}; +fma.rn.f32 f1853, f1725, f1562, f1728; +sub.f32 f1854, f2178, f1727; +st.shared.v2.f32 [r20+3024], {f1854, f1853}; +fma.rn.f32 f1855, f1733, f1578, f1736; +sub.f32 f1856, f2176, f1735; +st.shared.v2.f32 [r20+3240], {f1856, f1855}; +fma.rn.f32 f1857, f1741, f1594, f1744; +sub.f32 f1858, f2174, f1743; +st.shared.v2.f32 [r20+3456], {f1858, f1857}; +fma.rn.f32 f1859, f1749, f1610, f1752; +sub.f32 f1860, f2171, f1751; +st.shared.v2.f32 [r20+3672], {f1860, f1859}; +fma.rn.f32 f1861, f1757, f1483, f1760; +sub.f32 f1862, f2169, f1759; +st.shared.v2.f32 [r20+3888], {f1862, f1861}; +fma.rn.f32 f1863, f1765, f1499, f1768; +sub.f32 f1864, f2166, f1767; +st.shared.v2.f32 [r20+4104], {f1864, f1863}; +fma.rn.f32 f1865, f1773, f1515, f1776; +sub.f32 f1866, f2164, f1775; +st.shared.v2.f32 [r20+4320], {f1866, f1865}; +fma.rn.f32 f1867, f1781, f1531, f1784; +sub.f32 f1868, f2162, f1783; +st.shared.v2.f32 [r20+4536], {f1868, f1867}; +fma.rn.f32 f1869, f1789, f1547, f1792; +sub.f32 f1870, f2159, f1791; +st.shared.v2.f32 [r20+4752], {f1870, f1869}; +fma.rn.f32 f1871, f1797, f1563, f1800; +sub.f32 f1872, f2157, f1799; +st.shared.v2.f32 [r20+4968], {f1872, f1871}; +fma.rn.f32 f1873, f1805, f1579, f1808; +sub.f32 f1874, f2155, f1807; +st.shared.v2.f32 [r20+5184], {f1874, f1873}; +fma.rn.f32 f1875, f1813, f1595, f1816; +sub.f32 f1876, f2151, f1815; +st.shared.v2.f32 [r20+5400], {f1876, f1875}; +fma.rn.f32 f1877, f1821, f1611, f1824; +sub.f32 f1878, f1822, f1823; +st.shared.v2.f32 [r20+5616], {f1878, f1877}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+648]; +ld.shared.v2.f32 {f1887, f1888}, [r10+1296]; +ld.shared.v2.f32 {f1891, f1892}, 
[r10+1944]; +ld.shared.v2.f32 {f1895, f1896}, [r10+2592]; +ld.shared.v2.f32 {f1899, f1900}, [r10+3240]; +ld.shared.v2.f32 {f1903, f1904}, [r10+3888]; +ld.shared.v2.f32 {f1907, f1908}, [r10+4536]; +ld.shared.v2.f32 {f1911, f1912}, [r10+5184]; +ld.shared.v2.f32 {f1915, f1916}, [r10+5832]; +ld.shared.v2.f32 {f1919, f1920}, [r10+6480]; +ld.shared.v2.f32 {f1923, f1924}, [r10+7128]; +ld.shared.v2.f32 {f1927, f1928}, [r10+7776]; +ld.shared.v2.f32 {f1931, f1932}, [r10+8424]; +ld.shared.v2.f32 {f1935, f1936}, [r10+9072]; +ld.shared.v2.f32 {f1939, f1940}, [r10+9720]; +ld.shared.v2.f32 {f1943, f1944}, [r10+10368]; +ld.shared.v2.f32 {f1947, f1948}, [r10+11016]; +ld.shared.v2.f32 {f1951, f1952}, [r10+11664]; +ld.shared.v2.f32 {f1955, f1956}, [r10+12312]; +ld.shared.v2.f32 {f1959, f1960}, [r10+12960]; +ld.shared.v2.f32 {f1963, f1964}, [r10+13608]; +ld.shared.v2.f32 {f1967, f1968}, [r10+14256]; +ld.shared.v2.f32 {f1971, f1972}, [r10+14904]; +ld.shared.v2.f32 {f1975, f1976}, [r10+15552]; +ld.shared.v2.f32 {f1979, f1980}, [r10+16200]; +ld.shared.v2.f32 {f1983, f1984}, [r10+16848]; +add.f32 f1987, f1915, f1951; +mul.f32 f1989, f1987, 0f3F000000; +sub.f32 f1990, f1879, f1989; +add.f32 f2150, f1916, f1952; +sub.f32 f1991, f1916, f1952; +mul.f32 f1992, f1991, 0f3F5DB3D7; +mul.f32 f1993, f2150, 0f3F000000; +sub.f32 f1994, f1880, f1993; +sub.f32 f1995, f1915, f1951; +mul.f32 f1996, f1995, 0f3F5DB3D7; +add.f32 f1997, f1919, f1955; +mul.f32 f1999, f1997, 0f3F000000; +sub.f32 f2000, f1883, f1999; +add.f32 f2149, f1920, f1956; +sub.f32 f2001, f1920, f1956; +mul.f32 f2002, f2001, 0f3F5DB3D7; +mul.f32 f2003, f2149, 0f3F000000; +sub.f32 f2004, f1884, f2003; +sub.f32 f2005, f1919, f1955; +mul.f32 f2006, f2005, 0f3F5DB3D7; +add.f32 f2007, f1923, f1959; +mul.f32 f2009, f2007, 0f3F000000; +sub.f32 f2010, f1887, f2009; +add.f32 f2148, f1924, f1960; +sub.f32 f2011, f1924, f1960; +mul.f32 f2012, f2011, 0f3F5DB3D7; +mul.f32 f2013, f2148, 0f3F000000; +sub.f32 f2014, f1888, f2013; +sub.f32 f2015, f1923, 
f1959; +mul.f32 f2016, f2015, 0f3F5DB3D7; +add.f32 f2017, f1927, f1963; +mul.f32 f2019, f2017, 0f3F000000; +sub.f32 f2020, f1891, f2019; +add.f32 f2147, f1928, f1964; +sub.f32 f2021, f1928, f1964; +mul.f32 f2022, f2021, 0f3F5DB3D7; +mul.f32 f2023, f2147, 0f3F000000; +sub.f32 f2024, f1892, f2023; +sub.f32 f2025, f1927, f1963; +mul.f32 f2026, f2025, 0f3F5DB3D7; +add.f32 f2027, f1931, f1967; +mul.f32 f2029, f2027, 0f3F000000; +sub.f32 f2030, f1895, f2029; +add.f32 f2146, f1932, f1968; +sub.f32 f2031, f1932, f1968; +mul.f32 f2032, f2031, 0f3F5DB3D7; +mul.f32 f2033, f2146, 0f3F000000; +sub.f32 f2034, f1896, f2033; +sub.f32 f2035, f1931, f1967; +mul.f32 f2036, f2035, 0f3F5DB3D7; +add.f32 f2037, f1935, f1971; +mul.f32 f2039, f2037, 0f3F000000; +sub.f32 f2040, f1899, f2039; +add.f32 f2145, f1936, f1972; +sub.f32 f2041, f1936, f1972; +mul.f32 f2042, f2041, 0f3F5DB3D7; +mul.f32 f2043, f2145, 0f3F000000; +sub.f32 f2044, f1900, f2043; +sub.f32 f2045, f1935, f1971; +mul.f32 f2046, f2045, 0f3F5DB3D7; +add.f32 f2047, f1939, f1975; +mul.f32 f2049, f2047, 0f3F000000; +sub.f32 f2050, f1903, f2049; +add.f32 f2144, f1940, f1976; +sub.f32 f2051, f1940, f1976; +mul.f32 f2052, f2051, 0f3F5DB3D7; +mul.f32 f2053, f2144, 0f3F000000; +sub.f32 f2054, f1904, f2053; +sub.f32 f2055, f1939, f1975; +mul.f32 f2056, f2055, 0f3F5DB3D7; +add.f32 f2057, f1943, f1979; +mul.f32 f2059, f2057, 0f3F000000; +sub.f32 f2060, f1907, f2059; +add.f32 f2143, f1944, f1980; +sub.f32 f2061, f1944, f1980; +mul.f32 f2062, f2061, 0f3F5DB3D7; +mul.f32 f2063, f2143, 0f3F000000; +sub.f32 f2064, f1908, f2063; +sub.f32 f2065, f1943, f1979; +mul.f32 f2066, f2065, 0f3F5DB3D7; +add.f32 f2067, f1947, f1983; +mul.f32 f2069, f2067, 0f3F000000; +sub.f32 f2070, f1911, f2069; +add.f32 f2142, f1948, f1984; +sub.f32 f2071, f1948, f1984; +mul.f32 f2072, f2071, 0f3F5DB3D7; +mul.f32 f2073, f2142, 0f3F000000; +sub.f32 f2074, f1912, f2073; +sub.f32 f2075, f1947, f1983; +mul.f32 f2076, f2075, 0f3F5DB3D7; +add.f32 %1, f1880, f2150; +add.f32 
%0, f1879, f1987; +add.f32 %3, f1884, f2149; +add.f32 %2, f1883, f1997; +add.f32 %5, f1888, f2148; +add.f32 %4, f1887, f2007; +add.f32 %7, f1892, f2147; +add.f32 %6, f1891, f2017; +add.f32 %9, f1896, f2146; +add.f32 %8, f1895, f2027; +add.f32 %11, f1900, f2145; +add.f32 %10, f1899, f2037; +add.f32 %13, f1904, f2144; +add.f32 %12, f1903, f2047; +add.f32 %15, f1908, f2143; +add.f32 %14, f1907, f2057; +add.f32 %17, f1912, f2142; +add.f32 %16, f1911, f2067; +add.f32 %18, f1992, f1990; +sub.f32 %19, f1994, f1996; +add.f32 %20, f2002, f2000; +sub.f32 %21, f2004, f2006; +add.f32 %22, f2012, f2010; +sub.f32 %23, f2014, f2016; +add.f32 %24, f2022, f2020; +sub.f32 %25, f2024, f2026; +sub.f32 %27, f2034, f2036; +add.f32 %26, f2032, f2030; +sub.f32 %29, f2044, f2046; +add.f32 %28, f2042, f2040; +add.f32 %30, f2052, f2050; +sub.f32 %31, f2054, f2056; +add.f32 %32, f2062, f2060; +sub.f32 %33, f2064, f2066; +add.f32 %34, f2072, f2070; +sub.f32 %35, f2074, f2076; +add.f32 %37, f1996, f1994; +sub.f32 %36, f1990, f1992; +add.f32 %39, f2006, f2004; +sub.f32 %38, f2000, f2002; +add.f32 %41, f2016, f2014; +sub.f32 %40, f2010, f2012; +add.f32 %43, f2026, f2024; +sub.f32 %42, f2020, f2022; +add.f32 %45, f2036, f2034; +sub.f32 %44, f2030, f2032; +add.f32 %47, f2046, f2044; +sub.f32 %46, f2040, f2042; +add.f32 %49, f2056, f2054; +sub.f32 %48, f2050, f2052; +add.f32 %51, f2066, f2064; +sub.f32 %50, f2060, f2062; +add.f32 %53, f2076, f2074; +sub.f32 %52, f2070, f2072; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), 
"=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_2187), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<144, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<778>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 17496, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, 
f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, 
f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 
f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r9+8], {f217, f216}; +fma.rn.f32 f218, f162, f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r9+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r9+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r9+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r9+40], {f224, f225}; +fma.rn.f32 f226, f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r9+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r9+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; 
+st.shared.v2.f32 [r9+64], {f231, f230}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+1944]; +ld.shared.v2.f32 {f240, f241}, [r11+3888]; +ld.shared.v2.f32 {f244, f245}, [r11+5832]; +ld.shared.v2.f32 {f248, f249}, [r11+7776]; +ld.shared.v2.f32 {f252, f253}, [r11+9720]; +ld.shared.v2.f32 {f256, f257}, [r11+11664]; +ld.shared.v2.f32 {f260, f261}, [r11+13608]; +ld.shared.v2.f32 {f264, f265}, [r11+15552]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0f3F5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0f3F5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0f3F5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0f3F5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0f3F5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0f3F5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0fBF248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; 
+fma.rn.f32 f320, f292, 0fBF248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0fBF7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0fBF7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0fBF7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0fBF7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0fBEAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0fBEAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0f3F5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0f3F5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; 
+mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f382, f351; +mul.f32 f387, f383, f353; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f391, f367; +mul.f32 f395, f393, f369; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f399, f342; +mul.f32 f403, f401, f348; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f415, f374; +mul.f32 f419, f417, f380; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f423, f343; +mul.f32 f427, f425, f349; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f431, f359; +mul.f32 f435, f433, f365; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f439, f375; +mul.f32 f443, f441, f381; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f383, f351, f388; 
+sub.f32 f448, f386, f387; +st.shared.v2.f32 [r17+72], {f448, f447}; +fma.rn.f32 f449, f393, f367, f396; +sub.f32 f450, f394, f395; +st.shared.v2.f32 [r17+144], {f450, f449}; +fma.rn.f32 f451, f401, f342, f404; +sub.f32 f452, f402, f403; +st.shared.v2.f32 [r17+216], {f452, f451}; +fma.rn.f32 f453, f409, f358, f412; +sub.f32 f454, f410, f411; +st.shared.v2.f32 [r17+288], {f454, f453}; +fma.rn.f32 f455, f417, f374, f420; +sub.f32 f456, f418, f419; +st.shared.v2.f32 [r17+360], {f456, f455}; +fma.rn.f32 f457, f425, f343, f428; +sub.f32 f458, f426, f427; +st.shared.v2.f32 [r17+432], {f458, f457}; +sub.f32 f459, f434, f435; +fma.rn.f32 f460, f433, f359, f436; +st.shared.v2.f32 [r17+504], {f459, f460}; +fma.rn.f32 f461, f441, f375, f444; +sub.f32 f462, f442, f443; +st.shared.v2.f32 [r17+576], {f462, f461}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+1944]; +ld.shared.v2.f32 {f471, f472}, [r11+3888]; +ld.shared.v2.f32 {f475, f476}, [r11+5832]; +ld.shared.v2.f32 {f479, f480}, [r11+7776]; +ld.shared.v2.f32 {f483, f484}, [r11+9720]; +ld.shared.v2.f32 {f487, f488}, [r11+11664]; +ld.shared.v2.f32 {f491, f492}, [r11+13608]; +ld.shared.v2.f32 {f495, f496}, [r11+15552]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; +mul.f32 f503, f499, 0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0f3F5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0f3F5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0f3F5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; 
+sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0f3F5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0f3F5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0f3F5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0fBF248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0fBF248DBB, f550; +mul.f32 f552, f539, 0f3E31D0D4; +mul.f32 f553, f545, 0fBF7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0fBF7C1C5C, f555; +mul.f32 f557, f524, 0f3E31D0D4; +mul.f32 f558, f530, 0fBF7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0fBF7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0fBEAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0fBEAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0f3F5DB3D7; +add.f32 f573, f572, f570; +sub.f32 f574, f570, f572; +mul.f32 f575, f568, 0f3F000000; +sub.f32 f576, f502, f575; +sub.f32 f577, f516, f532; +mul.f32 f578, f577, 0f3F5DB3D7; +sub.f32 f579, f576, f578; +add.f32 f580, f578, f576; +add.f32 f581, f549, f554; +add.f32 f582, f507, f581; +add.f32 f583, f551, f556; +add.f32 f584, f513, f583; +mul.f32 f585, f581, 0f3F000000; +sub.f32 f586, f507, f585; +sub.f32 f587, f551, f556; +mul.f32 f588, f587, 0f3F5DB3D7; +add.f32 f589, f588, f586; +sub.f32 f590, f586, f588; +mul.f32 f591, f583, 0f3F000000; +sub.f32 
f592, f513, f591; +sub.f32 f593, f549, f554; +mul.f32 f594, f593, 0f3F5DB3D7; +sub.f32 f595, f592, f594; +add.f32 f596, f594, f592; +add.f32 f597, f559, f564; +add.f32 f598, f508, f597; +add.f32 f599, f561, f566; +add.f32 f600, f514, f599; +mul.f32 f601, f597, 0f3F000000; +sub.f32 f602, f508, f601; +sub.f32 f603, f561, f566; +mul.f32 f604, f603, 0f3F5DB3D7; +add.f32 f605, f604, f602; +sub.f32 f606, f602, f604; +mul.f32 f607, f599, 0f3F000000; +sub.f32 f608, f514, f607; +sub.f32 f609, f559, f564; +mul.f32 f610, f609, 0f3F5DB3D7; +sub.f32 f611, f608, f610; +add.f32 f612, f610, f608; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f613, f614}, [rd16]; +mul.f32 f617, f613, f582; +mul.f32 f618, f614, f584; +mul.f32 f619, f613, f584; +mul.f32 f620, f613, f613; +mul.f32 f621, f614, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f614, f613; +fma.rn.f32 f624, f614, f613, f623; +mul.f32 f625, f622, f598; +mul.f32 f626, f624, f600; +mul.f32 f627, f622, f600; +mul.f32 f628, f613, f622; +mul.f32 f629, f614, f624; +sub.f32 f630, f628, f629; +mul.f32 f631, f613, f624; +fma.rn.f32 f632, f614, f622, f631; +mul.f32 f633, f630, f573; +mul.f32 f634, f632, f579; +mul.f32 f635, f630, f579; +mul.f32 f636, f613, f630; +mul.f32 f637, f614, f632; +sub.f32 f638, f636, f637; +mul.f32 f639, f613, f632; +fma.rn.f32 f640, f614, f630, f639; +mul.f32 f641, f638, f589; +mul.f32 f642, f640, f595; +mul.f32 f643, f638, f595; +mul.f32 f644, f613, f638; +mul.f32 f645, f614, f640; +sub.f32 f646, f644, f645; +mul.f32 f647, f613, f640; +fma.rn.f32 f648, f614, f638, f647; +mul.f32 f649, f646, f605; +mul.f32 f650, f648, f611; +mul.f32 f651, f646, f611; +mul.f32 f652, f613, f646; +mul.f32 f653, f614, f648; +sub.f32 f654, f652, f653; +mul.f32 f655, f613, f648; +fma.rn.f32 f656, f614, f646, f655; +mul.f32 f657, f654, f574; +mul.f32 
f658, f656, f580; +mul.f32 f659, f654, f580; +mul.f32 f660, f613, f654; +mul.f32 f661, f614, f656; +sub.f32 f662, f660, f661; +mul.f32 f663, f613, f656; +fma.rn.f32 f664, f614, f654, f663; +mul.f32 f665, f662, f590; +mul.f32 f666, f664, f596; +mul.f32 f667, f662, f596; +mul.f32 f668, f613, f662; +mul.f32 f669, f614, f664; +sub.f32 f670, f668, f669; +mul.f32 f671, f613, f664; +fma.rn.f32 f672, f614, f662, f671; +mul.f32 f673, f670, f606; +mul.f32 f674, f672, f612; +mul.f32 f675, f670, f612; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +add.f32 f676, f502, f568; +add.f32 f677, f500, f567; +st.shared.v2.f32 [r23], {f677, f676}; +fma.rn.f32 f678, f614, f582, f619; +sub.f32 f679, f617, f618; +st.shared.v2.f32 [r23+648], {f679, f678}; +fma.rn.f32 f680, f624, f598, f627; +sub.f32 f681, f625, f626; +st.shared.v2.f32 [r23+1296], {f681, f680}; +fma.rn.f32 f682, f632, f573, f635; +sub.f32 f683, f633, f634; +st.shared.v2.f32 [r23+1944], {f683, f682}; +fma.rn.f32 f684, f640, f589, f643; +sub.f32 f685, f641, f642; +st.shared.v2.f32 [r23+2592], {f685, f684}; +fma.rn.f32 f686, f648, f605, f651; +sub.f32 f687, f649, f650; +st.shared.v2.f32 [r23+3240], {f687, f686}; +fma.rn.f32 f688, f656, f574, f659; +sub.f32 f689, f657, f658; +st.shared.v2.f32 [r23+3888], {f689, f688}; +sub.f32 f690, f665, f666; +fma.rn.f32 f691, f664, f590, f667; +st.shared.v2.f32 [r23+4536], {f690, f691}; +fma.rn.f32 f692, f672, f606, f675; +sub.f32 f693, f673, f674; +st.shared.v2.f32 [r23+5184], {f693, f692}; +barrier.sync 0; +ld.shared.v2.f32 {f694, f695}, [r11]; +ld.shared.v2.f32 {f698, f699}, [r11+1944]; +ld.shared.v2.f32 {f702, f703}, [r11+3888]; +ld.shared.v2.f32 {f706, f707}, [r11+5832]; +ld.shared.v2.f32 {f710, f711}, [r11+7776]; +ld.shared.v2.f32 {f714, f715}, [r11+9720]; +ld.shared.v2.f32 {f718, f719}, [r11+11664]; +ld.shared.v2.f32 {f722, f723}, [r11+13608]; +ld.shared.v2.f32 {f726, f727}, [r11+15552]; +add.f32 f730, f706, f718; +add.f32 f731, f707, 
f719; +mul.f32 f732, f730, 0f3F000000; +sub.f32 f733, f694, f732; +sub.f32 f734, f707, f719; +mul.f32 f735, f734, 0f3F5DB3D7; +mul.f32 f736, f731, 0f3F000000; +sub.f32 f737, f695, f736; +sub.f32 f738, f706, f718; +mul.f32 f739, f738, 0f3F5DB3D7; +add.f32 f740, f710, f722; +add.f32 f741, f711, f723; +mul.f32 f742, f740, 0f3F000000; +sub.f32 f743, f698, f742; +sub.f32 f744, f711, f723; +mul.f32 f745, f744, 0f3F5DB3D7; +mul.f32 f746, f741, 0f3F000000; +sub.f32 f747, f699, f746; +sub.f32 f748, f710, f722; +mul.f32 f749, f748, 0f3F5DB3D7; +add.f32 f750, f714, f726; +add.f32 f751, f715, f727; +mul.f32 f752, f750, 0f3F000000; +sub.f32 f753, f702, f752; +sub.f32 f754, f715, f727; +mul.f32 f755, f754, 0f3F5DB3D7; +mul.f32 f756, f751, 0f3F000000; +sub.f32 f757, f703, f756; +sub.f32 f758, f714, f726; +mul.f32 f759, f758, 0f3F5DB3D7; +add.f32 %1, f695, f731; +add.f32 %0, f694, f730; +add.f32 %3, f699, f741; +add.f32 %2, f698, f740; +add.f32 %5, f703, f751; +add.f32 %4, f702, f750; +sub.f32 %7, f737, f739; +add.f32 %6, f735, f733; +sub.f32 %9, f747, f749; +add.f32 %8, f745, f743; +sub.f32 %11, f757, f759; +add.f32 %10, f755, f753; +add.f32 %13, f739, f737; +sub.f32 %12, f733, f735; +add.f32 %15, f749, f747; +sub.f32 %14, f743, f745; +add.f32 %17, f759, f757; +sub.f32 %16, f753, f755; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_2187), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), 
"f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<146, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<724>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 8748, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; 
+mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, 
f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; +mul.f32 f208, f206, f120; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r8, r5, 8748, r3; 
+barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f159; +st.shared.f32 [r9+8], f169; +st.shared.f32 [r9+12], f179; +st.shared.f32 [r9+16], f189; +st.shared.f32 [r9+20], f199; +st.shared.f32 [r9+24], f209; +st.shared.f32 [r9+28], f219; +st.shared.f32 [r9+32], f229; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+972]; +ld.shared.f32 f234, [r11+1944]; +ld.shared.f32 f235, [r11+2916]; +ld.shared.f32 f236, [r11+3888]; +ld.shared.f32 f237, [r11+4860]; +ld.shared.f32 f238, [r11+5832]; +ld.shared.f32 f239, [r11+6804]; +ld.shared.f32 f240, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+972]; +ld.shared.f32 f243, [r11+1944]; +ld.shared.f32 f244, [r11+2916]; +ld.shared.f32 f245, [r11+3888]; +ld.shared.f32 f246, [r11+4860]; +ld.shared.f32 f247, [r11+5832]; +ld.shared.f32 f248, [r11+6804]; +ld.shared.f32 f249, [r11+7776]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0f3F5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0f3F5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 
f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0f3F5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0f3F5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0f3F5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0fBF248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0fBF248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0fBF7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0fBF7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0fBF7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0fBF7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0fBEAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0fBEAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0f3F5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0f3F5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, 
f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f366, f335; +mul.f32 f371, f367, f337; +sub.f32 f372, f370, f371; +mul.f32 f373, f366, f337; +fma.rn.f32 f374, f367, f335, f373; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f377, f351; +mul.f32 f381, f379, f353; +sub.f32 f382, f380, f381; +mul.f32 f383, f377, f353; +fma.rn.f32 f384, f379, f351, f383; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f387, f326; +mul.f32 f391, f389, f332; +sub.f32 f392, f390, f391; +mul.f32 f393, f387, f332; +fma.rn.f32 f394, f389, f326, f393; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f397, f342; +mul.f32 f401, f399, f348; +sub.f32 f402, f400, f401; +mul.f32 f403, f397, f348; +fma.rn.f32 f404, f399, f342, f403; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; 
+mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +sub.f32 f412, f410, f411; +mul.f32 f413, f407, f364; +fma.rn.f32 f414, f409, f358, f413; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f417, f327; +mul.f32 f421, f419, f333; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f333; +fma.rn.f32 f424, f419, f327, f423; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f427, f343; +mul.f32 f431, f429, f349; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f349; +fma.rn.f32 f434, f429, f343, f433; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f437, f359; +mul.f32 f441, f439, f365; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f365; +fma.rn.f32 f444, f439, f359, f443; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f372; +st.shared.f32 [r17+72], f382; +st.shared.f32 [r17+108], f392; +st.shared.f32 [r17+144], f402; +st.shared.f32 [r17+180], f412; +st.shared.f32 [r17+216], f422; +st.shared.f32 [r17+252], f432; +st.shared.f32 [r17+288], f442; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+972]; +ld.shared.f32 f447, [r11+1944]; +ld.shared.f32 f448, [r11+2916]; +ld.shared.f32 f449, [r11+3888]; +ld.shared.f32 f450, [r11+4860]; +ld.shared.f32 f451, [r11+5832]; +ld.shared.f32 f452, [r11+6804]; +ld.shared.f32 f453, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; 
+st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+972]; +ld.shared.f32 f456, [r11+1944]; +ld.shared.f32 f457, [r11+2916]; +ld.shared.f32 f458, [r11+3888]; +ld.shared.f32 f459, [r11+4860]; +ld.shared.f32 f460, [r11+5832]; +ld.shared.f32 f461, [r11+6804]; +ld.shared.f32 f462, [r11+7776]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0f3F5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; +sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0f3F5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0f3F5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0f3F5DB3D7; +sub.f32 f493, f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0f3F5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, f507, 0f3F5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0fBF248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0fBF248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0fBF7C1C5C; +sub.f32 f518, f516, 
f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 0fBF7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0fBF7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0fBF7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0fBEAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0fBEAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f464, f531; +add.f32 f533, f482, f498; +add.f32 f534, f466, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f464, f535; +sub.f32 f537, f482, f498; +mul.f32 f538, f537, 0f3F5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f466, f541; +sub.f32 f543, f480, f496; +mul.f32 f544, f543, 0f3F5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +add.f32 f547, f513, f518; +add.f32 f548, f471, f547; +add.f32 f549, f515, f520; +add.f32 f550, f477, f549; +mul.f32 f551, f547, 0f3F000000; +sub.f32 f552, f471, f551; +sub.f32 f553, f515, f520; +mul.f32 f554, f553, 0f3F5DB3D7; +add.f32 f555, f554, f552; +sub.f32 f556, f552, f554; +mul.f32 f557, f549, 0f3F000000; +sub.f32 f558, f477, f557; +sub.f32 f559, f513, f518; +mul.f32 f560, f559, 0f3F5DB3D7; +sub.f32 f561, f558, f560; +add.f32 f562, f560, f558; +add.f32 f563, f523, f528; +add.f32 f564, f472, f563; +add.f32 f565, f525, f530; +add.f32 f566, f478, f565; +mul.f32 f567, f563, 0f3F000000; +sub.f32 f568, f472, f567; +sub.f32 f569, f525, f530; +mul.f32 f570, f569, 0f3F5DB3D7; +add.f32 f571, f570, f568; +sub.f32 f572, f568, f570; +mul.f32 f573, f565, 0f3F000000; +sub.f32 f574, f478, f573; +sub.f32 f575, f523, f528; +mul.f32 f576, f575, 0f3F5DB3D7; +sub.f32 f577, f574, f576; +add.f32 f578, f576, f574; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; 
+ld.global.v2.f32 {f579, f580}, [rd16]; +mul.f32 f583, f579, f548; +mul.f32 f584, f580, f550; +sub.f32 f585, f583, f584; +mul.f32 f586, f579, f550; +fma.rn.f32 f587, f580, f548, f586; +mul.f32 f588, f579, f579; +mul.f32 f589, f580, f580; +sub.f32 f590, f588, f589; +mul.f32 f591, f580, f579; +fma.rn.f32 f592, f580, f579, f591; +mul.f32 f593, f590, f564; +mul.f32 f594, f592, f566; +sub.f32 f595, f593, f594; +mul.f32 f596, f590, f566; +fma.rn.f32 f597, f592, f564, f596; +mul.f32 f598, f579, f590; +mul.f32 f599, f580, f592; +sub.f32 f600, f598, f599; +mul.f32 f601, f579, f592; +fma.rn.f32 f602, f580, f590, f601; +mul.f32 f603, f600, f539; +mul.f32 f604, f602, f545; +sub.f32 f605, f603, f604; +mul.f32 f606, f600, f545; +fma.rn.f32 f607, f602, f539, f606; +mul.f32 f608, f579, f600; +mul.f32 f609, f580, f602; +sub.f32 f610, f608, f609; +mul.f32 f611, f579, f602; +fma.rn.f32 f612, f580, f600, f611; +mul.f32 f613, f610, f555; +mul.f32 f614, f612, f561; +sub.f32 f615, f613, f614; +mul.f32 f616, f610, f561; +fma.rn.f32 f617, f612, f555, f616; +mul.f32 f618, f579, f610; +mul.f32 f619, f580, f612; +sub.f32 f620, f618, f619; +mul.f32 f621, f579, f612; +fma.rn.f32 f622, f580, f610, f621; +mul.f32 f623, f620, f571; +mul.f32 f624, f622, f577; +sub.f32 f625, f623, f624; +mul.f32 f626, f620, f577; +fma.rn.f32 f627, f622, f571, f626; +mul.f32 f628, f579, f620; +mul.f32 f629, f580, f622; +sub.f32 f630, f628, f629; +mul.f32 f631, f579, f622; +fma.rn.f32 f632, f580, f620, f631; +mul.f32 f633, f630, f540; +mul.f32 f634, f632, f546; +sub.f32 f635, f633, f634; +mul.f32 f636, f630, f546; +fma.rn.f32 f637, f632, f540, f636; +mul.f32 f638, f579, f630; +mul.f32 f639, f580, f632; +sub.f32 f640, f638, f639; +mul.f32 f641, f579, f632; +fma.rn.f32 f642, f580, f630, f641; +mul.f32 f643, f640, f556; +mul.f32 f644, f642, f562; +sub.f32 f645, f643, f644; +mul.f32 f646, f640, f562; +fma.rn.f32 f647, f642, f556, f646; +mul.f32 f648, f579, f640; +mul.f32 f649, f580, f642; +sub.f32 f650, f648, f649; 
+mul.f32 f651, f579, f642; +fma.rn.f32 f652, f580, f640, f651; +mul.f32 f653, f650, f572; +mul.f32 f654, f652, f578; +sub.f32 f655, f653, f654; +mul.f32 f656, f650, f578; +fma.rn.f32 f657, f652, f572, f656; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2916, r22; +st.shared.f32 [r23], f532; +st.shared.f32 [r23+324], f585; +st.shared.f32 [r23+648], f595; +st.shared.f32 [r23+972], f605; +st.shared.f32 [r23+1296], f615; +st.shared.f32 [r23+1620], f625; +st.shared.f32 [r23+1944], f635; +st.shared.f32 [r23+2268], f645; +st.shared.f32 [r23+2592], f655; +barrier.sync 0; +ld.shared.f32 f658, [r11]; +ld.shared.f32 f659, [r11+972]; +ld.shared.f32 f660, [r11+1944]; +ld.shared.f32 f661, [r11+2916]; +ld.shared.f32 f662, [r11+3888]; +ld.shared.f32 f663, [r11+4860]; +ld.shared.f32 f664, [r11+5832]; +ld.shared.f32 f665, [r11+6804]; +ld.shared.f32 f666, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r23], f534; +st.shared.f32 [r23+324], f587; +st.shared.f32 [r23+648], f597; +st.shared.f32 [r23+972], f607; +st.shared.f32 [r23+1296], f617; +st.shared.f32 [r23+1620], f627; +st.shared.f32 [r23+1944], f637; +st.shared.f32 [r23+2268], f647; +st.shared.f32 [r23+2592], f657; +barrier.sync 0; +ld.shared.f32 f667, [r11]; +ld.shared.f32 f668, [r11+972]; +ld.shared.f32 f669, [r11+1944]; +ld.shared.f32 f670, [r11+2916]; +ld.shared.f32 f671, [r11+3888]; +ld.shared.f32 f672, [r11+4860]; +ld.shared.f32 f673, [r11+5832]; +ld.shared.f32 f674, [r11+6804]; +ld.shared.f32 f675, [r11+7776]; +add.f32 f676, f661, f664; +add.f32 f677, f670, f673; +mul.f32 f678, f676, 0f3F000000; +sub.f32 f679, f658, f678; +sub.f32 f680, f670, f673; +mul.f32 f681, f680, 0f3F5DB3D7; +mul.f32 f682, f677, 0f3F000000; +sub.f32 f683, f667, f682; +sub.f32 f684, f661, f664; +mul.f32 f685, f684, 0f3F5DB3D7; +add.f32 f686, f662, f665; +add.f32 f687, f671, f674; +mul.f32 f688, f686, 0f3F000000; +sub.f32 f689, f659, f688; +sub.f32 f690, f671, f674; +mul.f32 f691, f690, 0f3F5DB3D7; +mul.f32 f692, 
f687, 0f3F000000; +sub.f32 f693, f668, f692; +sub.f32 f694, f662, f665; +mul.f32 f695, f694, 0f3F5DB3D7; +add.f32 f696, f663, f666; +add.f32 f697, f672, f675; +mul.f32 f698, f696, 0f3F000000; +sub.f32 f699, f660, f698; +sub.f32 f700, f672, f675; +mul.f32 f701, f700, 0f3F5DB3D7; +mul.f32 f702, f697, 0f3F000000; +sub.f32 f703, f669, f702; +sub.f32 f704, f663, f666; +mul.f32 f705, f704, 0f3F5DB3D7; +add.f32 %0, f658, f676; +add.f32 %1, f667, f677; +add.f32 %2, f659, f686; +add.f32 %3, f668, f687; +add.f32 %4, f660, f696; +add.f32 %5, f669, f697; +add.f32 %6, f681, f679; +sub.f32 %7, f683, f685; +add.f32 %8, f691, f689; +sub.f32 %9, f693, f695; +add.f32 %10, f701, f699; +sub.f32 %11, f703, f705; +sub.f32 %12, f679, f681; +add.f32 %13, f685, f683; +sub.f32 %14, f689, f691; +add.f32 %15, f695, f693; +sub.f32 %16, f699, f701; +add.f32 %17, f705, f703; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_2187), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<147, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2432>; +.reg .b32 r<24>; +.reg .b64 rd<16>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 8748, r23; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 
f114, %57, f113; +add.f32 f2423, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2422, %58, f2423; +mul.f32 f119, f2423, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2421, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2420, %64, f2421; +mul.f32 f135, f2421, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2419, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2418, %70, f2419; +mul.f32 f151, f2419, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2417, f133, 0f3F441B7D; +sub.f32 f159, f2417, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2415, f149, 0f3E31D0D4; +mul.f32 f2416, f155, 0fBF7C1C5C; +sub.f32 f164, f2415, f2416; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2413, f134, 0f3E31D0D4; +mul.f32 f2414, f140, 0fBF7C1C5C; +sub.f32 f169, f2413, f2414; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2411, f150, 0fBF708FB2; +mul.f32 f2412, f156, 0fBEAF1D44; +sub.f32 f174, f2411, f2412; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; 
+mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2410, f2420, f2418; +sub.f32 f183, f2420, f2418; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2409, f2422, f2410; +mul.f32 f187, f2410, 0f3F000000; +sub.f32 f188, f2422, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2408, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2407, f123, f2408; +mul.f32 f203, f2408, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2406, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2405, f124, f2406; +mul.f32 f219, f2406, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2402, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2400, %113, f2402; +mul.f32 f235, f2402, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2397, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, 
f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2395, %116, f2397; +mul.f32 f251, f2397, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2392, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2390, %119, f2392; +mul.f32 f267, f2392, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2389, f249, 0f3F441B7D; +sub.f32 f275, f2389, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2388, f265, 0f3E31D0D4; +sub.f32 f280, f2388, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f2386, f250, 0f3E31D0D4; +mul.f32 f2387, f256, 0fBF7C1C5C; +sub.f32 f285, f2386, f2387; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2384, f266, 0fBF708FB2; +mul.f32 f2385, f272, 0fBEAF1D44; +sub.f32 f290, f2384, f2385; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2383, f2395, f2390; +sub.f32 f299, f2395, f2390; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2382, f2400, f2383; +mul.f32 f303, f2383, 0f3F000000; +sub.f32 f304, f2400, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2381, f277, f282; +sub.f32 
f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2380, f239, f2381; +mul.f32 f319, f2381, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2379, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2378, f240, f2379; +mul.f32 f335, f2379, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2375, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2373, %122, f2375; +mul.f32 f351, f2375, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2370, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2368, %125, f2370; +mul.f32 f367, f2370, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2366, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2364, %127, f2366; +mul.f32 f383, f2366, 0f3F000000; 
+sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2363, f365, 0f3F441B7D; +sub.f32 f391, f2363, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2362, f381, 0f3E31D0D4; +sub.f32 f396, f2362, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2360, f366, 0f3E31D0D4; +mul.f32 f2361, f372, 0fBF7C1C5C; +sub.f32 f401, f2360, f2361; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2358, f382, 0fBF708FB2; +mul.f32 f2359, f388, 0fBEAF1D44; +sub.f32 f406, f2358, f2359; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2357, f2368, f2364; +sub.f32 f415, f2368, f2364; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2356, f2373, f2357; +mul.f32 f419, f2357, 0f3F000000; +sub.f32 f420, f2373, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2355, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2354, f355, f2355; +mul.f32 f435, f2355, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2353, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, 
f448; +add.f32 f2352, f356, f2353; +mul.f32 f451, f2353, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2380, 0fBE6C2691; +mul.f32 f2351, f310, 0f3F791978; +sub.f32 f459, f2351, f458; +mul.f32 f460, f2380, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2349, f426, 0f3F64C51C; +mul.f32 f2350, f2354, 0fBEE5C902; +sub.f32 f464, f2349, f2350; +mul.f32 f465, f2354, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2347, f326, 0f3F64C51C; +mul.f32 f2348, f2378, 0fBEE5C902; +sub.f32 f469, f2347, f2348; +mul.f32 f470, f2378, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2345, f442, 0f3F18DF63; +mul.f32 f2346, f2352, 0fBF4D57F2; +sub.f32 f474, f2345, f2346; +mul.f32 f475, f2352, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2343, f301, 0f3F441B7D; +mul.f32 f2344, f307, 0fBF248DBB; +sub.f32 f479, f2343, f2344; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2342, f417, 0f3E31D0D4; +sub.f32 f484, f2342, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2341, f317, 0f3F18DF63; +sub.f32 f489, f2341, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2340, f433, 0fBE92D7E0; +sub.f32 f494, f2340, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2339, f333, 0f3ECACAF8; +sub.f32 f499, f2339, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2338, f449, 0fBF2FAD88; +sub.f32 f504, f2338, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2337, f302, 0f3E31D0D4; 
+sub.f32 f509, f2337, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2335, f418, 0fBF708FB2; +mul.f32 f2336, f424, 0fBEAF1D44; +sub.f32 f514, f2335, f2336; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2333, f318, 0fBD6E2946; +mul.f32 f2334, f324, 0fBF7F9120; +sub.f32 f519, f2333, f2334; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2331, f434, 0fBF7E44DE; +mul.f32 f2332, f440, 0f3DEDC21F; +sub.f32 f524, f2331, f2332; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2330, f334, 0fBE92D7E0; +sub.f32 f529, f2330, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2329, f450, 0fBF55E287; +sub.f32 f534, f2329, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2328, f2382, f2356; +sub.f32 f543, f2382, f2356; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2327, f2409, f2328; +mul.f32 f547, f2328, 0f3F000000; +sub.f32 f548, f2409, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2326, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2325, f2407, f2326; +mul.f32 f563, f2326, 0f3F000000; +sub.f32 f564, f2407, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, 
f573; +add.f32 f2324, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2323, f2405, f2324; +mul.f32 f579, f2324, 0f3F000000; +sub.f32 f580, f2405, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2322, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2321, f191, f2322; +mul.f32 f595, f2322, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2320, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2319, f207, f2320; +mul.f32 f611, f2320, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2318, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2317, f223, f2318; +mul.f32 f627, f2318, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2316, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 
f2315, f192, f2316; +mul.f32 f643, f2316, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2314, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2313, f208, f2314; +mul.f32 f659, f2314, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2312, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2311, f224, f2312; +mul.f32 f675, f2312, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f686, f682, f2325; +mul.f32 f2310, f681, f554; +sub.f32 f687, f2310, f686; +mul.f32 f688, f681, f2325; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f691, f682, f682; +mul.f32 f2309, f681, f681; +sub.f32 f692, f2309, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f696, f694, f2323; +mul.f32 f2308, f692, f570; +sub.f32 f697, f2308, f696; +mul.f32 f698, f692, f2323; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f2307, f681, f692; +sub.f32 f702, f2307, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f2321; +mul.f32 
f2306, f702, f586; +sub.f32 f707, f2306, f706; +mul.f32 f708, f702, f2321; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f2304, f681, f702; +mul.f32 f2305, f682, f704; +sub.f32 f712, f2304, f2305; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f2302, f712, f602; +mul.f32 f2303, f714, f2319; +sub.f32 f717, f2302, f2303; +mul.f32 f718, f712, f2319; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f2300, f681, f712; +mul.f32 f2301, f682, f714; +sub.f32 f722, f2300, f2301; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f2298, f722, f618; +mul.f32 f2299, f724, f2317; +sub.f32 f727, f2298, f2299; +mul.f32 f728, f722, f2317; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f731, f682, f724; +mul.f32 f2297, f681, f722; +sub.f32 f732, f2297, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f736, f734, f2315; +mul.f32 f2296, f732, f634; +sub.f32 f737, f2296, f736; +mul.f32 f738, f732, f2315; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f741, f682, f734; +mul.f32 f2295, f681, f732; +sub.f32 f742, f2295, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f2313; +mul.f32 f2294, f742, f650; +sub.f32 f747, f2294, f746; +mul.f32 f748, f742, f2313; +fma.rn.f32 f749, f744, f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f2293, f681, f742; +sub.f32 f752, f2293, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f2311; +mul.f32 f2292, f752, f666; +sub.f32 f757, f2292, f756; +mul.f32 f758, f752, f2311; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f2290, f681, f752; +mul.f32 f2291, f682, f754; +sub.f32 f762, f2290, f2291; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f2288, f762, f545; +mul.f32 f2289, f764, f551; +sub.f32 f767, f2288, f2289; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f2286, f681, f762; +mul.f32 f2287, f682, f764; +sub.f32 f772, f2286, f2287; +mul.f32 f773, f681, f764; 
+fma.rn.f32 f774, f682, f762, f773; +mul.f32 f776, f774, f567; +mul.f32 f2285, f772, f561; +sub.f32 f777, f2285, f776; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f781, f682, f774; +mul.f32 f2284, f681, f772; +sub.f32 f782, f2284, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f786, f784, f583; +mul.f32 f2283, f782, f577; +sub.f32 f787, f2283, f786; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 f2282, f681, f782; +sub.f32 f792, f2282, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f2281, f792, f593; +sub.f32 f797, f2281, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f2280, f681, f792; +sub.f32 f802, f2280, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f2278, f802, f609; +mul.f32 f2279, f804, f615; +sub.f32 f807, f2278, f2279; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f2276, f681, f802; +mul.f32 f2277, f682, f804; +sub.f32 f812, f2276, f2277; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f2274, f812, f625; +mul.f32 f2275, f814, f631; +sub.f32 f817, f2274, f2275; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f2272, f681, f812; +mul.f32 f2273, f682, f814; +sub.f32 f822, f2272, f2273; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f826, f824, f647; +mul.f32 f2271, f822, f641; +sub.f32 f827, f2271, f826; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f831, f682, f824; +mul.f32 f2270, f681, f822; +sub.f32 f832, f2270, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f2269, f832, f657; +sub.f32 f837, f2269, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f2268, f681, f832; 
+sub.f32 f842, f2268, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f2267, f842, f673; +sub.f32 f847, f2267, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f2265, f681, f842; +mul.f32 f2266, f682, f844; +sub.f32 f852, f2265, f2266; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f2263, f852, f546; +mul.f32 f2264, f854, f552; +sub.f32 f857, f2263, f2264; +mul.f32 f858, f852, f552; +fma.rn.f32 f859, f854, f546, f858; +mul.f32 f2261, f681, f852; +mul.f32 f2262, f682, f854; +sub.f32 f862, f2261, f2262; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f2259, f862, f562; +mul.f32 f2260, f864, f568; +sub.f32 f867, f2259, f2260; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, f562, f868; +mul.f32 f871, f682, f864; +mul.f32 f2258, f681, f862; +sub.f32 f872, f2258, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f876, f874, f584; +mul.f32 f2257, f872, f578; +sub.f32 f877, f2257, f876; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f881, f682, f874; +mul.f32 f2256, f681, f872; +sub.f32 f882, f2256, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, f600; +mul.f32 f2255, f882, f594; +sub.f32 f887, f2255, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f2254, f681, f882; +sub.f32 f892, f2254, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f2252, f892, f610; +mul.f32 f2253, f894, f616; +sub.f32 f897, f2252, f2253; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f2250, f681, f892; +mul.f32 f2251, f682, f894; +sub.f32 f902, f2250, f2251; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f2248, f902, f626; +mul.f32 f2249, f904, f632; +sub.f32 f907, f2248, f2249; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, 
f908; +mul.f32 f2246, f681, f902; +mul.f32 f2247, f682, f904; +sub.f32 f912, f2246, f2247; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f916, f914, f648; +mul.f32 f2245, f912, f642; +sub.f32 f917, f2245, f916; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f921, f682, f914; +mul.f32 f2244, f681, f912; +sub.f32 f922, f2244, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f926, f924, f664; +mul.f32 f2243, f922, f658; +sub.f32 f927, f2243, f926; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f2242, f681, f922; +sub.f32 f932, f2242, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f2241, f932, f674; +sub.f32 f937, f2241, f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 r8, r5, 8748, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f687; +st.shared.f32 [r9+8], f697; +st.shared.f32 [r9+12], f707; +st.shared.f32 [r9+16], f717; +st.shared.f32 [r9+20], f727; +st.shared.f32 [r9+24], f737; +st.shared.f32 [r9+28], f747; +st.shared.f32 [r9+32], f757; +st.shared.f32 [r9+36], f767; +st.shared.f32 [r9+40], f777; +st.shared.f32 [r9+44], f787; +st.shared.f32 [r9+48], f797; +st.shared.f32 [r9+52], f807; +st.shared.f32 [r9+56], f817; +st.shared.f32 [r9+60], f827; +st.shared.f32 [r9+64], f837; +st.shared.f32 [r9+68], f847; +st.shared.f32 [r9+72], f857; +st.shared.f32 [r9+76], f867; +st.shared.f32 [r9+80], f877; +st.shared.f32 [r9+84], f887; +st.shared.f32 [r9+88], f897; +st.shared.f32 [r9+92], f907; +st.shared.f32 [r9+96], f917; +st.shared.f32 [r9+100], f927; +st.shared.f32 [r9+104], f937; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+324]; +ld.shared.f32 f942, [r10+648]; +ld.shared.f32 f943, [r10+972]; +ld.shared.f32 f944, [r10+1296]; +ld.shared.f32 f945, 
[r10+1620]; +ld.shared.f32 f946, [r10+1944]; +ld.shared.f32 f947, [r10+2268]; +ld.shared.f32 f948, [r10+2592]; +ld.shared.f32 f949, [r10+2916]; +ld.shared.f32 f950, [r10+3240]; +ld.shared.f32 f951, [r10+3564]; +ld.shared.f32 f952, [r10+3888]; +ld.shared.f32 f953, [r10+4212]; +ld.shared.f32 f954, [r10+4536]; +ld.shared.f32 f955, [r10+4860]; +ld.shared.f32 f956, [r10+5184]; +ld.shared.f32 f957, [r10+5508]; +ld.shared.f32 f958, [r10+5832]; +ld.shared.f32 f959, [r10+6156]; +ld.shared.f32 f960, [r10+6480]; +ld.shared.f32 f961, [r10+6804]; +ld.shared.f32 f962, [r10+7128]; +ld.shared.f32 f963, [r10+7452]; +ld.shared.f32 f964, [r10+7776]; +ld.shared.f32 f965, [r10+8100]; +ld.shared.f32 f966, [r10+8424]; +barrier.sync 0; +st.shared.f32 [r9], f2327; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +ld.shared.f32 f2240, [r10+5832]; +ld.shared.f32 f2239, [r10+2916]; +add.f32 f2238, f2239, f2240; +sub.f32 f1000, f2239, f2240; +mul.f32 f1001, f1000, 0f3F5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2237, [r10]; +add.f32 f2236, f2237, f2238; +mul.f32 f1004, f2238, 0f3F000000; +sub.f32 f1005, f2237, 
f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0f3F5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +ld.shared.f32 f2235, [r10+6804]; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2234, [r10+3888]; +add.f32 f2233, f2234, f2235; +sub.f32 f1016, f2234, f2235; +mul.f32 f1017, f1016, 0f3F5DB3D7; +ld.shared.f32 f2232, [r10+972]; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f2231, f2232, f2233; +mul.f32 f1020, f2233, 0f3F000000; +sub.f32 f1021, f2232, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0f3F5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2230, [r10+7776]; +ld.shared.f32 f2229, [r10+4860]; +sub.f32 f1031, f946, f1030; +add.f32 f2228, f2229, f2230; +sub.f32 f1032, f2229, f2230; +mul.f32 f1033, f1032, 0f3F5DB3D7; +ld.shared.f32 f2227, [r10+1944]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2226, f2227, f2228; +mul.f32 f1036, f2228, 0f3F000000; +sub.f32 f1037, f2227, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0f3F5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2224, f1018, 0f3F441B7D; +mul.f32 f2225, f1024, 0fBF248DBB; +sub.f32 f1044, f2224, f2225; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0fBF248DBB, f1045; +mul.f32 f1048, f1040, 0fBF7C1C5C; +mul.f32 f2223, f1034, 0f3E31D0D4; +sub.f32 f1049, f2223, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0fBF7C1C5C, f1050; +mul.f32 f1053, f1025, 0fBF7C1C5C; +mul.f32 f2222, f1019, 0f3E31D0D4; +sub.f32 f1054, f2222, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0fBF7C1C5C, f1055; +mul.f32 f1058, f1041, 0fBEAF1D44; +mul.f32 f2221, f1035, 0fBF708FB2; +sub.f32 f1059, f2221, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; 
+fma.rn.f32 f1061, f1035, 0fBEAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2220, f2231, f2226; +sub.f32 f1068, f2231, f2226; +mul.f32 f1069, f1068, 0f3F5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2219, f2236, f2220; +mul.f32 f1072, f2220, 0f3F000000; +sub.f32 f1073, f2236, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0f3F5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2218, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0f3F5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2217, f1008, f2218; +mul.f32 f1088, f2218, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0f3F5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2216, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0f3F5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2215, f1009, f2216; +mul.f32 f1104, f2216, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0f3F5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2214, [r10+6156]; +ld.shared.f32 f2213, [r10+3240]; +add.f32 f2212, f2213, f2214; +sub.f32 f1116, f2213, f2214; +mul.f32 f1117, f1116, 0f3F5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +ld.shared.f32 f2211, [r10+324]; +add.f32 f2210, f2211, f2212; +mul.f32 f1120, f2212, 0f3F000000; +sub.f32 
f1121, f2211, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0f3F5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +ld.shared.f32 f2209, [r10+7128]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2208, [r10+4212]; +add.f32 f2207, f2208, f2209; +sub.f32 f1132, f2208, f2209; +mul.f32 f1133, f1132, 0f3F5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +ld.shared.f32 f2206, [r10+1296]; +add.f32 f2205, f2206, f2207; +mul.f32 f1136, f2207, 0f3F000000; +sub.f32 f1137, f2206, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0f3F5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2204, [r10+5184]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 f2203, [r10+8100]; +add.f32 f2202, f2204, f2203; +sub.f32 f1148, f2204, f2203; +mul.f32 f1149, f1148, 0f3F5DB3D7; +ld.shared.f32 f2201, [r10+2268]; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +add.f32 f2200, f2201, f2202; +mul.f32 f1152, f2202, 0f3F000000; +sub.f32 f1153, f2201, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0f3F5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2198, f1134, 0f3F441B7D; +mul.f32 f2199, f1140, 0fBF248DBB; +sub.f32 f1160, f2198, f2199; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0fBF248DBB, f1161; +mul.f32 f2196, f1150, 0f3E31D0D4; +mul.f32 f2197, f1156, 0fBF7C1C5C; +sub.f32 f1165, f2196, f2197; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0fBF7C1C5C, f1166; +mul.f32 f1169, f1141, 0fBF7C1C5C; +mul.f32 f2195, f1135, 0f3E31D0D4; +sub.f32 f1170, f2195, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0fBF7C1C5C, f1171; +mul.f32 f1174, f1157, 0fBEAF1D44; +mul.f32 f2194, f1151, 0fBF708FB2; +sub.f32 f1175, f2194, f1174; +mul.f32 f1176, f1157, 
0fBF708FB2; +fma.rn.f32 f1177, f1151, 0fBEAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2193, f2205, f2200; +sub.f32 f1184, f2205, f2200; +mul.f32 f1185, f1184, 0f3F5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2192, f2210, f2193; +mul.f32 f1188, f2193, 0f3F000000; +sub.f32 f1189, f2210, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0f3F5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2191, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0f3F5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2190, f1124, f2191; +mul.f32 f1204, f2191, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0f3F5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2189, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0f3F5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2188, f1125, f2189; +mul.f32 f1220, f2189, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0f3F5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2187, [r10+3564]; +sub.f32 f1231, f942, f1230; +ld.shared.f32 f2186, [r10+6480]; +add.f32 f2185, f2187, f2186; +sub.f32 f1232, f2187, f2186; +mul.f32 f1233, f1232, 0f3F5DB3D7; +ld.shared.f32 f2184, [r10+648]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f2183, f2184, f2185; +mul.f32 f1236, f2185, 0f3F000000; 
+sub.f32 f1237, f2184, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0f3F5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +sub.f32 f1247, f945, f1246; +ld.shared.f32 f2182, [r10+4536]; +ld.shared.f32 f2181, [r10+7452]; +add.f32 f2180, f2182, f2181; +sub.f32 f1248, f2182, f2181; +mul.f32 f1249, f1248, 0f3F5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2179, [r10+1620]; +add.f32 f2178, f2179, f2180; +mul.f32 f1252, f2180, 0f3F000000; +sub.f32 f1253, f2179, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0f3F5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2177, [r10+8424]; +ld.shared.f32 f2176, [r10+5508]; +add.f32 f2175, f2176, f2177; +sub.f32 f1264, f2176, f2177; +mul.f32 f1265, f1264, 0f3F5DB3D7; +ld.shared.f32 f2174, [r10+2592]; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f2173, f2174, f2175; +mul.f32 f1268, f2175, 0f3F000000; +sub.f32 f1269, f2174, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0f3F5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2171, f1250, 0f3F441B7D; +mul.f32 f2172, f1256, 0fBF248DBB; +sub.f32 f1276, f2171, f2172; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0fBF248DBB, f1277; +mul.f32 f2169, f1266, 0f3E31D0D4; +mul.f32 f2170, f1272, 0fBF7C1C5C; +sub.f32 f1281, f2169, f2170; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0fBF7C1C5C, f1282; +mul.f32 f1285, f1257, 0fBF7C1C5C; +mul.f32 f2168, f1251, 0f3E31D0D4; +sub.f32 f1286, f2168, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0fBF7C1C5C, f1287; +mul.f32 f1290, f1273, 0fBEAF1D44; +mul.f32 f2167, f1267, 0fBF708FB2; +sub.f32 f1291, f2167, f1290; +mul.f32 
f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0fBEAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2166, f2178, f2173; +sub.f32 f1300, f2178, f2173; +mul.f32 f1301, f1300, 0f3F5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2165, f2183, f2166; +mul.f32 f1304, f2166, 0f3F000000; +sub.f32 f1305, f2183, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0f3F5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2164, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0f3F5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2163, f1240, f2164; +mul.f32 f1320, f2164, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0f3F5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2162, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0f3F5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2161, f1241, f2162; +mul.f32 f1336, f2162, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0f3F5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2190, 0fBE6C2691; +mul.f32 f2160, f1195, 0f3F791978; +sub.f32 f1344, f2160, f1343; +mul.f32 f1345, f2190, 0f3F791978; +fma.rn.f32 f1346, f1195, 0fBE6C2691, f1345; +mul.f32 f2158, f1311, 0f3F64C51C; +mul.f32 f2159, f2163, 0fBEE5C902; +sub.f32 f1349, f2158, f2159; +mul.f32 f1350, f2163, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0fBEE5C902, f1350; +mul.f32 f2156, f1211, 0f3F64C51C; +mul.f32 f2157, f2188, 
0fBEE5C902; +sub.f32 f1354, f2156, f2157; +mul.f32 f1355, f2188, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0fBEE5C902, f1355; +mul.f32 f2154, f1327, 0f3F18DF63; +mul.f32 f2155, f2161, 0fBF4D57F2; +sub.f32 f1359, f2154, f2155; +mul.f32 f1360, f2161, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0fBF4D57F2, f1360; +mul.f32 f2152, f1186, 0f3F441B7D; +mul.f32 f2153, f1192, 0fBF248DBB; +sub.f32 f1364, f2152, f2153; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0fBF248DBB, f1365; +mul.f32 f1368, f1308, 0fBF7C1C5C; +mul.f32 f2151, f1302, 0f3E31D0D4; +sub.f32 f1369, f2151, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0fBF7C1C5C, f1370; +mul.f32 f1373, f1208, 0fBF4D57F2; +mul.f32 f2150, f1202, 0f3F18DF63; +sub.f32 f1374, f2150, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0fBF4D57F2, f1375; +mul.f32 f1378, f1324, 0fBF753ECD; +mul.f32 f2149, f1318, 0fBE92D7E0; +sub.f32 f1379, f2149, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0fBF753ECD, f1380; +mul.f32 f1383, f1224, 0fBF6B1036; +mul.f32 f2148, f1218, 0f3ECACAF8; +sub.f32 f1384, f2148, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0fBF6B1036, f1385; +mul.f32 f1388, f1340, 0fBF3A3529; +mul.f32 f2147, f1334, 0fBF2FAD88; +sub.f32 f1389, f2147, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0fBF3A3529, f1390; +mul.f32 f1393, f1193, 0fBF7C1C5C; +mul.f32 f2146, f1187, 0f3E31D0D4; +sub.f32 f1394, f2146, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0fBF7C1C5C, f1395; +mul.f32 f2144, f1303, 0fBF708FB2; +mul.f32 f2145, f1309, 0fBEAF1D44; +sub.f32 f1399, f2144, f2145; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0fBEAF1D44, f1400; +mul.f32 f2142, f1203, 0fBD6E2946; +mul.f32 f2143, f1209, 0fBF7F9120; +sub.f32 f1404, f2142, f2143; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0fBF7F9120, f1405; +mul.f32 f2140, f1319, 0fBF7E44DE; +mul.f32 f2141, f1325, 0f3DEDC21F; 
+sub.f32 f1409, f2140, f2141; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0f3DEDC21F, f1410; +mul.f32 f1413, f1225, 0fBF753ECD; +mul.f32 f2139, f1219, 0fBE92D7E0; +sub.f32 f1414, f2139, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0fBF753ECD, f1415; +mul.f32 f1418, f1341, 0f3F0CAC9F; +mul.f32 f2138, f1335, 0fBF55E287; +sub.f32 f1419, f2138, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0f3F0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2137, f2192, f2165; +sub.f32 f1428, f2192, f2165; +mul.f32 f1429, f1428, 0f3F5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2136, f2219, f2137; +mul.f32 f1432, f2137, 0f3F000000; +sub.f32 f1433, f2219, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 0f3F5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2135, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0f3F5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2134, f2217, f2135; +mul.f32 f1448, f2135, 0f3F000000; +sub.f32 f1449, f2217, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0f3F5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2133, f1356, f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0f3F5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2132, f2215, f2133; +mul.f32 f1464, f2133, 0f3F000000; +sub.f32 f1465, f2215, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0f3F5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; 
+add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2131, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0f3F5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, f1477; +add.f32 f2130, f1076, f2131; +mul.f32 f1480, f2131, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0f3F5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; +add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2129, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0f3F5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2128, f1092, f2129; +mul.f32 f1496, f2129, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 0f3F5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2127, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0f3F5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2126, f1108, f2127; +mul.f32 f1512, f2127, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0f3F5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2125, f1396, f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0f3F5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2124, f1077, f2125; +mul.f32 f1528, f2125, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0f3F5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; 
+add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2123, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0f3F5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, f1541; +add.f32 f2122, f1093, f2123; +mul.f32 f1544, f2123, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0f3F5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; +add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2121, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0f3F5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2120, f1109, f2121; +mul.f32 f1560, f2121, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 0f3F5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f2118, f1566, f1439; +mul.f32 f2119, f1567, f2134; +sub.f32 f1572, f2118, f2119; +mul.f32 f1573, f1566, f2134; +fma.rn.f32 f1574, f1567, f1439, f1573; +mul.f32 f2116, f1566, f1566; +mul.f32 f2117, f1567, f1567; +sub.f32 f1577, f2116, f2117; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, f1566, f1578; +mul.f32 f2114, f1577, f1455; +mul.f32 f2115, f1579, f2132; +sub.f32 f1582, f2114, f2115; +mul.f32 f1583, f1577, f2132; +fma.rn.f32 f1584, f1579, f1455, f1583; +mul.f32 f1586, f1567, f1579; +mul.f32 f2113, f1566, f1577; +sub.f32 f1587, f2113, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; +mul.f32 f1591, f1589, f2130; +mul.f32 f2112, f1587, 
f1471; +sub.f32 f1592, f2112, f1591; +mul.f32 f1593, f1587, f2130; +fma.rn.f32 f1594, f1589, f1471, f1593; +mul.f32 f1596, f1567, f1589; +mul.f32 f2111, f1566, f1587; +sub.f32 f1597, f2111, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 f1599, f1567, f1587, f1598; +mul.f32 f1601, f1599, f2128; +mul.f32 f2110, f1597, f1487; +sub.f32 f1602, f2110, f1601; +mul.f32 f1603, f1597, f2128; +fma.rn.f32 f1604, f1599, f1487, f1603; +mul.f32 f1606, f1567, f1599; +mul.f32 f2109, f1566, f1597; +sub.f32 f1607, f2109, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f2107, f1607, f1503; +mul.f32 f2108, f1609, f2126; +sub.f32 f1612, f2107, f2108; +mul.f32 f1613, f1607, f2126; +fma.rn.f32 f1614, f1609, f1503, f1613; +mul.f32 f2105, f1566, f1607; +mul.f32 f2106, f1567, f1609; +sub.f32 f1617, f2105, f2106; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, f1607, f1618; +mul.f32 f2103, f1617, f1519; +mul.f32 f2104, f1619, f2124; +sub.f32 f1622, f2103, f2104; +mul.f32 f1623, f1617, f2124; +fma.rn.f32 f1624, f1619, f1519, f1623; +mul.f32 f2101, f1566, f1617; +mul.f32 f2102, f1567, f1619; +sub.f32 f1627, f2101, f2102; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1631, f1629, f2122; +mul.f32 f2100, f1627, f1535; +sub.f32 f1632, f2100, f1631; +mul.f32 f1633, f1627, f2122; +fma.rn.f32 f1634, f1629, f1535, f1633; +mul.f32 f1636, f1567, f1629; +mul.f32 f2099, f1566, f1627; +sub.f32 f1637, f2099, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1641, f1639, f2120; +mul.f32 f2098, f1637, f1551; +sub.f32 f1642, f2098, f1641; +mul.f32 f1643, f1637, f2120; +fma.rn.f32 f1644, f1639, f1551, f1643; +mul.f32 f1646, f1567, f1639; +mul.f32 f2097, f1566, f1637; +sub.f32 f1647, f2097, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1651, f1649, f1436; +mul.f32 f2096, f1647, f1430; +sub.f32 f1652, f2096, f1651; +mul.f32 f1653, f1647, f1436; +fma.rn.f32 
f1654, f1649, f1430, f1653; +mul.f32 f2094, f1566, f1647; +mul.f32 f2095, f1567, f1649; +sub.f32 f1657, f2094, f2095; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f2092, f1657, f1446; +mul.f32 f2093, f1659, f1452; +sub.f32 f1662, f2092, f2093; +mul.f32 f1663, f1657, f1452; +fma.rn.f32 f1664, f1659, f1446, f1663; +mul.f32 f2090, f1566, f1657; +mul.f32 f2091, f1567, f1659; +sub.f32 f1667, f2090, f2091; +mul.f32 f1668, f1566, f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f2088, f1667, f1462; +mul.f32 f2089, f1669, f1468; +sub.f32 f1672, f2088, f2089; +mul.f32 f1673, f1667, f1468; +fma.rn.f32 f1674, f1669, f1462, f1673; +mul.f32 f1676, f1567, f1669; +mul.f32 f2087, f1566, f1667; +sub.f32 f1677, f2087, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1681, f1679, f1484; +mul.f32 f2086, f1677, f1478; +sub.f32 f1682, f2086, f1681; +mul.f32 f1683, f1677, f1484; +fma.rn.f32 f1684, f1679, f1478, f1683; +mul.f32 f1686, f1567, f1679; +mul.f32 f2085, f1566, f1677; +sub.f32 f1687, f2085, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1691, f1689, f1500; +mul.f32 f2084, f1687, f1494; +sub.f32 f1692, f2084, f1691; +mul.f32 f1693, f1687, f1500; +fma.rn.f32 f1694, f1689, f1494, f1693; +mul.f32 f1696, f1567, f1689; +mul.f32 f2083, f1566, f1687; +sub.f32 f1697, f2083, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1701, f1699, f1516; +mul.f32 f2082, f1697, f1510; +sub.f32 f1702, f2082, f1701; +mul.f32 f1703, f1697, f1516; +fma.rn.f32 f1704, f1699, f1510, f1703; +mul.f32 f2080, f1566, f1697; +mul.f32 f2081, f1567, f1699; +sub.f32 f1707, f2080, f2081; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f2078, f1707, f1526; +mul.f32 f2079, f1709, f1532; +sub.f32 f1712, f2078, f2079; +mul.f32 f1713, f1707, f1532; +fma.rn.f32 f1714, f1709, f1526, f1713; +mul.f32 f2076, f1566, f1707; +mul.f32 f2077, 
f1567, f1709; +sub.f32 f1717, f2076, f2077; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1721, f1719, f1548; +mul.f32 f2075, f1717, f1542; +sub.f32 f1722, f2075, f1721; +mul.f32 f1723, f1717, f1548; +fma.rn.f32 f1724, f1719, f1542, f1723; +mul.f32 f1726, f1567, f1719; +mul.f32 f2074, f1566, f1717; +sub.f32 f1727, f2074, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1731, f1729, f1564; +mul.f32 f2073, f1727, f1558; +sub.f32 f1732, f2073, f1731; +mul.f32 f1733, f1727, f1564; +fma.rn.f32 f1734, f1729, f1558, f1733; +mul.f32 f1736, f1567, f1729; +mul.f32 f2072, f1566, f1727; +sub.f32 f1737, f2072, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1741, f1739, f1437; +mul.f32 f2071, f1737, f1431; +sub.f32 f1742, f2071, f1741; +mul.f32 f1743, f1737, f1437; +fma.rn.f32 f1744, f1739, f1431, f1743; +mul.f32 f1746, f1567, f1739; +mul.f32 f2070, f1566, f1737; +sub.f32 f1747, f2070, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f2068, f1747, f1447; +mul.f32 f2069, f1749, f1453; +sub.f32 f1752, f2068, f2069; +mul.f32 f1753, f1747, f1453; +fma.rn.f32 f1754, f1749, f1447, f1753; +mul.f32 f2066, f1566, f1747; +mul.f32 f2067, f1567, f1749; +sub.f32 f1757, f2066, f2067; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f2064, f1757, f1463; +mul.f32 f2065, f1759, f1469; +sub.f32 f1762, f2064, f2065; +mul.f32 f1763, f1757, f1469; +fma.rn.f32 f1764, f1759, f1463, f1763; +mul.f32 f2062, f1566, f1757; +mul.f32 f2063, f1567, f1759; +sub.f32 f1767, f2062, f2063; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1771, f1769, f1485; +mul.f32 f2061, f1767, f1479; +sub.f32 f1772, f2061, f1771; +mul.f32 f1773, f1767, f1485; +fma.rn.f32 f1774, f1769, f1479, f1773; +mul.f32 f1776, f1567, f1769; +mul.f32 f2060, f1566, f1767; +sub.f32 f1777, f2060, f1776; +mul.f32 f1778, f1566, f1769; 
+fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1781, f1779, f1501; +mul.f32 f2059, f1777, f1495; +sub.f32 f1782, f2059, f1781; +mul.f32 f1783, f1777, f1501; +fma.rn.f32 f1784, f1779, f1495, f1783; +mul.f32 f1786, f1567, f1779; +mul.f32 f2058, f1566, f1777; +sub.f32 f1787, f2058, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1791, f1789, f1517; +mul.f32 f2057, f1787, f1511; +sub.f32 f1792, f2057, f1791; +mul.f32 f1793, f1787, f1517; +fma.rn.f32 f1794, f1789, f1511, f1793; +mul.f32 f2055, f1566, f1787; +mul.f32 f2056, f1567, f1789; +sub.f32 f1797, f2055, f2056; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f2053, f1797, f1527; +mul.f32 f2054, f1799, f1533; +sub.f32 f1802, f2053, f2054; +mul.f32 f1803, f1797, f1533; +fma.rn.f32 f1804, f1799, f1527, f1803; +mul.f32 f2051, f1566, f1797; +mul.f32 f2052, f1567, f1799; +sub.f32 f1807, f2051, f2052; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f2049, f1807, f1543; +mul.f32 f2050, f1809, f1549; +sub.f32 f1812, f2049, f2050; +mul.f32 f1813, f1807, f1549; +fma.rn.f32 f1814, f1809, f1543, f1813; +mul.f32 f1816, f1567, f1809; +mul.f32 f2048, f1566, f1807; +sub.f32 f1817, f2048, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1821, f1819, f1565; +mul.f32 f2047, f1817, f1559; +sub.f32 f1822, f2047, f1821; +mul.f32 f1823, f1817, f1565; +fma.rn.f32 f1824, f1819, f1559, f1823; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; +st.shared.f32 [r20+108], f1572; +st.shared.f32 [r20+216], f1582; +st.shared.f32 [r20+324], f1592; +st.shared.f32 [r20+432], f1602; +st.shared.f32 [r20+540], f1612; +st.shared.f32 [r20+648], f1622; +st.shared.f32 [r20+756], f1632; +st.shared.f32 [r20+864], f1642; +st.shared.f32 [r20+972], f1652; +st.shared.f32 [r20+1080], f1662; +st.shared.f32 [r20+1188], f1672; +st.shared.f32 [r20+1296], 
f1682; +st.shared.f32 [r20+1404], f1692; +st.shared.f32 [r20+1512], f1702; +st.shared.f32 [r20+1620], f1712; +st.shared.f32 [r20+1728], f1722; +st.shared.f32 [r20+1836], f1732; +st.shared.f32 [r20+1944], f1742; +st.shared.f32 [r20+2052], f1752; +st.shared.f32 [r20+2160], f1762; +st.shared.f32 [r20+2268], f1772; +st.shared.f32 [r20+2376], f1782; +st.shared.f32 [r20+2484], f1792; +st.shared.f32 [r20+2592], f1802; +st.shared.f32 [r20+2700], f1812; +st.shared.f32 [r20+2808], f1822; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+324]; +ld.shared.f32 f1827, [r10+648]; +ld.shared.f32 f1828, [r10+972]; +ld.shared.f32 f1829, [r10+1296]; +ld.shared.f32 f1830, [r10+1620]; +ld.shared.f32 f1831, [r10+1944]; +ld.shared.f32 f1832, [r10+2268]; +ld.shared.f32 f1833, [r10+2592]; +ld.shared.f32 f1834, [r10+2916]; +ld.shared.f32 f1835, [r10+3240]; +ld.shared.f32 f1836, [r10+3564]; +ld.shared.f32 f1837, [r10+3888]; +ld.shared.f32 f1838, [r10+4212]; +ld.shared.f32 f1839, [r10+4536]; +ld.shared.f32 f1840, [r10+4860]; +ld.shared.f32 f1841, [r10+5184]; +ld.shared.f32 f1842, [r10+5508]; +ld.shared.f32 f1843, [r10+5832]; +ld.shared.f32 f1844, [r10+6156]; +ld.shared.f32 f1845, [r10+6480]; +ld.shared.f32 f1846, [r10+6804]; +ld.shared.f32 f1847, [r10+7128]; +ld.shared.f32 f1848, [r10+7452]; +ld.shared.f32 f1849, [r10+7776]; +ld.shared.f32 f1850, [r10+8100]; +ld.shared.f32 f1851, [r10+8424]; +barrier.sync 0; +st.shared.f32 [r20], f2136; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; 
+st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; +st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+324]; +ld.shared.f32 f1854, [r10+648]; +ld.shared.f32 f1855, [r10+972]; +ld.shared.f32 f1856, [r10+1296]; +ld.shared.f32 f1857, [r10+1620]; +ld.shared.f32 f1858, [r10+1944]; +ld.shared.f32 f1859, [r10+2268]; +ld.shared.f32 f1860, [r10+2592]; +ld.shared.f32 f1861, [r10+2916]; +ld.shared.f32 f1862, [r10+3240]; +ld.shared.f32 f1863, [r10+3564]; +ld.shared.f32 f1864, [r10+3888]; +ld.shared.f32 f1865, [r10+4212]; +ld.shared.f32 f1866, [r10+4536]; +ld.shared.f32 f1867, [r10+4860]; +ld.shared.f32 f1868, [r10+5184]; +ld.shared.f32 f1869, [r10+5508]; +ld.shared.f32 f1870, [r10+5832]; +ld.shared.f32 f1871, [r10+6156]; +ld.shared.f32 f1872, [r10+6480]; +ld.shared.f32 f1873, [r10+6804]; +ld.shared.f32 f1874, [r10+7128]; +ld.shared.f32 f1875, [r10+7452]; +ld.shared.f32 f1876, [r10+7776]; +ld.shared.f32 f1877, [r10+8100]; +ld.shared.f32 f1878, [r10+8424]; +add.f32 f1879, f1834, f1843; +mul.f32 f1881, f1879, 0f3F000000; +sub.f32 f1882, f1825, f1881; +add.f32 f2046, f1861, f1870; +sub.f32 f1883, f1861, f1870; +mul.f32 f1884, f1883, 0f3F5DB3D7; +mul.f32 f1885, f2046, 0f3F000000; +sub.f32 f1886, f1852, f1885; +sub.f32 f1887, f1834, f1843; +mul.f32 f1888, f1887, 0f3F5DB3D7; +add.f32 f1889, f1835, f1844; +mul.f32 f1891, f1889, 0f3F000000; +sub.f32 f1892, f1826, f1891; +add.f32 f2045, f1862, f1871; +sub.f32 f1893, f1862, f1871; +mul.f32 f1894, f1893, 0f3F5DB3D7; +mul.f32 f1895, f2045, 0f3F000000; +sub.f32 f1896, f1853, f1895; +sub.f32 f1897, f1835, f1844; +mul.f32 f1898, f1897, 0f3F5DB3D7; +add.f32 f1899, f1836, f1845; +mul.f32 f1901, f1899, 0f3F000000; +sub.f32 f1902, f1827, 
f1901; +add.f32 f2044, f1863, f1872; +sub.f32 f1903, f1863, f1872; +mul.f32 f1904, f1903, 0f3F5DB3D7; +mul.f32 f1905, f2044, 0f3F000000; +sub.f32 f1906, f1854, f1905; +sub.f32 f1907, f1836, f1845; +mul.f32 f1908, f1907, 0f3F5DB3D7; +add.f32 f1909, f1837, f1846; +mul.f32 f1911, f1909, 0f3F000000; +sub.f32 f1912, f1828, f1911; +add.f32 f2043, f1864, f1873; +sub.f32 f1913, f1864, f1873; +mul.f32 f1914, f1913, 0f3F5DB3D7; +mul.f32 f1915, f2043, 0f3F000000; +sub.f32 f1916, f1855, f1915; +sub.f32 f1917, f1837, f1846; +mul.f32 f1918, f1917, 0f3F5DB3D7; +add.f32 f1919, f1838, f1847; +mul.f32 f1921, f1919, 0f3F000000; +sub.f32 f1922, f1829, f1921; +add.f32 f2042, f1865, f1874; +sub.f32 f1923, f1865, f1874; +mul.f32 f1924, f1923, 0f3F5DB3D7; +mul.f32 f1925, f2042, 0f3F000000; +sub.f32 f1926, f1856, f1925; +sub.f32 f1927, f1838, f1847; +mul.f32 f1928, f1927, 0f3F5DB3D7; +add.f32 f1929, f1839, f1848; +mul.f32 f1931, f1929, 0f3F000000; +sub.f32 f1932, f1830, f1931; +add.f32 f2041, f1866, f1875; +sub.f32 f1933, f1866, f1875; +mul.f32 f1934, f1933, 0f3F5DB3D7; +mul.f32 f1935, f2041, 0f3F000000; +sub.f32 f1936, f1857, f1935; +sub.f32 f1937, f1839, f1848; +mul.f32 f1938, f1937, 0f3F5DB3D7; +add.f32 f1939, f1840, f1849; +mul.f32 f1941, f1939, 0f3F000000; +sub.f32 f1942, f1831, f1941; +add.f32 f2040, f1867, f1876; +sub.f32 f1943, f1867, f1876; +mul.f32 f1944, f1943, 0f3F5DB3D7; +mul.f32 f1945, f2040, 0f3F000000; +sub.f32 f1946, f1858, f1945; +sub.f32 f1947, f1840, f1849; +mul.f32 f1948, f1947, 0f3F5DB3D7; +add.f32 f1949, f1841, f1850; +mul.f32 f1951, f1949, 0f3F000000; +sub.f32 f1952, f1832, f1951; +add.f32 f2039, f1868, f1877; +sub.f32 f1953, f1868, f1877; +mul.f32 f1954, f1953, 0f3F5DB3D7; +mul.f32 f1955, f2039, 0f3F000000; +sub.f32 f1956, f1859, f1955; +sub.f32 f1957, f1841, f1850; +mul.f32 f1958, f1957, 0f3F5DB3D7; +add.f32 f1959, f1842, f1851; +mul.f32 f1961, f1959, 0f3F000000; +sub.f32 f1962, f1833, f1961; +add.f32 f2038, f1869, f1878; +sub.f32 f1963, f1869, f1878; +mul.f32 
f1964, f1963, 0f3F5DB3D7; +mul.f32 f1965, f2038, 0f3F000000; +sub.f32 f1966, f1860, f1965; +sub.f32 f1967, f1842, f1851; +mul.f32 f2425, f1939, 0f3F000000; +sub.f32 f2424, f1831, f2425; +mul.f32 f1968, f1967, 0f3F5DB3D7; +add.f32 %0, f1825, f1879; +mul.f32 f2427, f2039, 0f3F000000; +sub.f32 f2426, f1859, f2427; +add.f32 %1, f1852, f2046; +mul.f32 f2429, f1939, 0f3F000000; +sub.f32 f2428, f1831, f2429; +mul.f32 f2431, f1929, 0f3F000000; +sub.f32 f2430, f1830, f2431; +add.f32 %2, f1826, f1889; +add.f32 %3, f1853, f2045; +add.f32 %4, f1827, f1899; +add.f32 %5, f1854, f2044; +add.f32 %6, f1828, f1909; +add.f32 %7, f1855, f2043; +add.f32 %8, f1829, f1919; +add.f32 %9, f1856, f2042; +add.f32 %10, f1830, f1929; +add.f32 %11, f1857, f2041; +add.f32 %12, f1831, f1939; +add.f32 %13, f1858, f2040; +add.f32 %14, f1832, f1949; +add.f32 %15, f1859, f2039; +add.f32 %16, f1833, f1959; +add.f32 %17, f1860, f2038; +add.f32 %18, f1884, f1882; +sub.f32 %19, f1886, f1888; +add.f32 %20, f1894, f1892; +sub.f32 %21, f1896, f1898; +add.f32 %22, f1904, f1902; +sub.f32 %23, f1906, f1908; +sub.f32 %25, f1916, f1918; +add.f32 %24, f1914, f1912; +sub.f32 %27, f1926, f1928; +add.f32 %26, f1924, f1922; +sub.f32 %29, f1936, f1938; +add.f32 %28, f1934, f2430; +add.f32 %30, f1944, f2428; +sub.f32 %31, f1946, f1948; +add.f32 %32, f1954, f1952; +sub.f32 %33, f2426, f1958; +add.f32 %34, f1964, f1962; +sub.f32 %35, f1966, f1968; +sub.f32 %36, f1882, f1884; +add.f32 %37, f1888, f1886; +sub.f32 %38, f1892, f1894; +add.f32 %39, f1898, f1896; +sub.f32 %40, f1902, f1904; +add.f32 %41, f1908, f1906; +sub.f32 %42, f1912, f1914; +add.f32 %43, f1918, f1916; +sub.f32 %44, f1922, f1924; +add.f32 %45, f1928, f1926; +sub.f32 %46, f2430, f1934; +add.f32 %47, f1938, f1936; +sub.f32 %48, f2428, f1944; +add.f32 %49, f1948, f1946; +sub.f32 %50, f1952, f1954; +add.f32 %51, f1958, f2426; +sub.f32 %52, f1962, f1964; +add.f32 %53, f1968, f1966; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), 
"=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_2187), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), 
"f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<148, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<311>; +.reg .b32 r<46>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 17496, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %15, %18; +add.f32 f14, %17, %19; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %13, f15; +sub.f32 f17, %17, %19; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %14, f21; +sub.f32 f23, %15, %18; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %14, f14; +add.f32 f43, %13, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r9+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r9+16], {f46, f47}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+5832]; +ld.shared.v2.f32 {f56, f57}, [r11+11664]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +add.f32 f66, 
f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0f3F5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f74, f66; +mul.f32 f79, f75, f72; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f83, f67; +mul.f32 f87, f85, f73; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f75, f66, f80; +sub.f32 f92, f78, f79; +st.shared.v2.f32 [r17+24], {f92, f91}; +fma.rn.f32 f93, f85, f67, f88; +sub.f32 f94, f86, f87; +st.shared.v2.f32 [r17+48], {f94, f93}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+5832]; +ld.shared.v2.f32 {f103, f104}, [r11+11664]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f121, f113; +mul.f32 f126, f122, f119; +mul.f32 f127, f121, f119; +mul.f32 
f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f130, f114; +mul.f32 f134, f132, f120; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f122, f113, f127; +sub.f32 f139, f125, f126; +st.shared.v2.f32 [r23+72], {f139, f138}; +fma.rn.f32 f140, f132, f114, f135; +sub.f32 f141, f133, f134; +st.shared.v2.f32 [r23+144], {f141, f140}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+5832]; +ld.shared.v2.f32 {f150, f151}, [r11+11664]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; +sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f168, f160; +mul.f32 f173, f169, f166; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f177, f161; +mul.f32 f181, f179, f167; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f169, f160, f174; +sub.f32 f186, 
f172, f173; +st.shared.v2.f32 [r33+216], {f186, f185}; +fma.rn.f32 f187, f179, f161, f182; +sub.f32 f188, f180, f181; +st.shared.v2.f32 [r33+432], {f188, f187}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 {f193, f194}, [r11+5832]; +ld.shared.v2.f32 {f197, f198}, [r11+11664]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0f3F5DB3D7; +add.f32 f207, f206, f204; +sub.f32 f208, f204, f206; +mul.f32 f209, f202, 0f3F000000; +sub.f32 f210, f190, f209; +sub.f32 f211, f193, f197; +mul.f32 f212, f211, 0f3F5DB3D7; +sub.f32 f213, f210, f212; +add.f32 f214, f212, f210; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f215, f216}, [rd26]; +mul.f32 f219, f215, f207; +mul.f32 f220, f216, f213; +mul.f32 f221, f215, f213; +mul.f32 f222, f215, f215; +mul.f32 f223, f216, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f216, f215; +fma.rn.f32 f226, f216, f215, f225; +mul.f32 f227, f224, f208; +mul.f32 f228, f226, f214; +mul.f32 f229, f224, f214; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +add.f32 f230, f190, f202; +add.f32 f231, f189, f201; +st.shared.v2.f32 [r39], {f231, f230}; +fma.rn.f32 f232, f216, f207, f221; +sub.f32 f233, f219, f220; +st.shared.v2.f32 [r39+648], {f233, f232}; +fma.rn.f32 f234, f226, f208, f229; +sub.f32 f235, f227, f228; +st.shared.v2.f32 [r39+1296], {f235, f234}; +barrier.sync 0; +ld.shared.v2.f32 {f236, f237}, [r11]; +ld.shared.v2.f32 {f240, f241}, [r11+5832]; +ld.shared.v2.f32 {f244, f245}, [r11+11664]; +add.f32 f248, f240, f244; +add.f32 f249, f241, f245; +mul.f32 f250, f248, 0f3F000000; +sub.f32 f251, f236, f250; +sub.f32 f252, f241, f245; +mul.f32 f253, f252, 0f3F5DB3D7; +add.f32 f254, 
f253, f251; +sub.f32 f255, f251, f253; +mul.f32 f256, f249, 0f3F000000; +sub.f32 f257, f237, f256; +sub.f32 f258, f240, f244; +mul.f32 f259, f258, 0f3F5DB3D7; +sub.f32 f260, f257, f259; +add.f32 f261, f259, f257; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 3; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 8; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f32 {f262, f263}, [rd31]; +mul.f32 f266, f262, f254; +mul.f32 f267, f263, f260; +mul.f32 f268, f262, f260; +mul.f32 f269, f262, f262; +mul.f32 f270, f263, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f263, f262; +fma.rn.f32 f273, f263, f262, f272; +mul.f32 f274, f271, f255; +mul.f32 f275, f273, f261; +mul.f32 f276, f271, f261; +barrier.sync 0; +mad.lo.s32 r45, r40, 5832, r44; +add.f32 f277, f237, f249; +add.f32 f278, f236, f248; +st.shared.v2.f32 [r45], {f278, f277}; +fma.rn.f32 f279, f263, f254, f268; +sub.f32 f280, f266, f267; +st.shared.v2.f32 [r45+1944], {f280, f279}; +fma.rn.f32 f281, f273, f255, f276; +sub.f32 f282, f274, f275; +st.shared.v2.f32 [r45+3888], {f282, f281}; +barrier.sync 0; +ld.shared.v2.f32 {f283, f284}, [r11]; +ld.shared.v2.f32 {f287, f288}, [r11+5832]; +ld.shared.v2.f32 {f291, f292}, [r11+11664]; +add.f32 f295, f287, f291; +add.f32 f296, f288, f292; +mul.f32 f297, f295, 0f3F000000; +sub.f32 f298, f283, f297; +sub.f32 f299, f288, f292; +mul.f32 f300, f299, 0f3F5DB3D7; +mul.f32 f301, f296, 0f3F000000; +sub.f32 f302, f284, f301; +sub.f32 f303, f287, f291; +mul.f32 f304, f303, 0f3F5DB3D7; +add.f32 %1, f284, f296; +add.f32 %0, f283, f295; +sub.f32 %3, f302, f304; +add.f32 %2, f300, f298; +add.f32 %5, f304, f302; +sub.f32 %4, f298, f300; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_2187), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), 
"f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<149, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<275>; +.reg .b32 r<46>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 8748, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %15, %18; +add.f32 f14, %13, f13; +add.f32 f15, %17, %19; +add.f32 f16, %14, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %13, f17; +sub.f32 f19, %17, %19; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %14, f23; +sub.f32 f25, %15, %18; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 8748, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f35; +st.shared.f32 [r9+8], f45; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+2916]; +ld.shared.f32 f50, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+2916]; +ld.shared.f32 f53, [r11+5832]; +add.f32 f54, f49, f50; +add.f32 
f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0f3F5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0f3F5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f70, f62; +mul.f32 f75, f71, f68; +sub.f32 f76, f74, f75; +mul.f32 f77, f70, f68; +fma.rn.f32 f78, f71, f62, f77; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f81, f63; +mul.f32 f85, f83, f69; +sub.f32 f86, f84, f85; +mul.f32 f87, f81, f69; +fma.rn.f32 f88, f83, f63, f87; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f76; +st.shared.f32 [r17+24], f86; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+2916]; +ld.shared.f32 f91, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+2916]; +ld.shared.f32 f94, [r11+5832]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0f3F5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0f3F5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; 
+cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f111, f103; +mul.f32 f116, f112, f109; +sub.f32 f117, f115, f116; +mul.f32 f118, f111, f109; +fma.rn.f32 f119, f112, f103, f118; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f122, f104; +mul.f32 f126, f124, f110; +sub.f32 f127, f125, f126; +mul.f32 f128, f122, f110; +fma.rn.f32 f129, f124, f104, f128; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f117; +st.shared.f32 [r23+72], f127; +barrier.sync 0; +ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+2916]; +ld.shared.f32 f132, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+2916]; +ld.shared.f32 f135, [r11+5832]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0f3F5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0f3F5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f152, f144; 
+mul.f32 f157, f153, f150; +sub.f32 f158, f156, f157; +mul.f32 f159, f152, f150; +fma.rn.f32 f160, f153, f144, f159; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, f153, f152, f164; +mul.f32 f166, f163, f145; +mul.f32 f167, f165, f151; +sub.f32 f168, f166, f167; +mul.f32 f169, f163, f151; +fma.rn.f32 f170, f165, f145, f169; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f158; +st.shared.f32 [r33+216], f168; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+2916]; +ld.shared.f32 f173, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; +ld.shared.f32 f175, [r11+2916]; +ld.shared.f32 f176, [r11+5832]; +add.f32 f177, f172, f173; +add.f32 f178, f171, f177; +add.f32 f179, f175, f176; +add.f32 f180, f174, f179; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f171, f181; +sub.f32 f183, f175, f176; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +mul.f32 f187, f179, 0f3F000000; +sub.f32 f188, f174, f187; +sub.f32 f189, f172, f173; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 2; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f193, f194}, [rd26]; +mul.f32 f197, f193, f185; +mul.f32 f198, f194, f191; +sub.f32 f199, f197, f198; +mul.f32 f200, f193, f191; +fma.rn.f32 f201, f194, f185, f200; +mul.f32 f202, f193, f193; +mul.f32 f203, f194, f194; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, f193; +fma.rn.f32 f206, f194, f193, f205; +mul.f32 f207, f204, f186; +mul.f32 f208, f206, f192; +sub.f32 f209, f207, f208; 
+mul.f32 f210, f204, f192; +fma.rn.f32 f211, f206, f186, f210; +barrier.sync 0; +mad.lo.s32 r39, r34, 972, r38; +st.shared.f32 [r39], f178; +st.shared.f32 [r39+324], f199; +st.shared.f32 [r39+648], f209; +barrier.sync 0; +ld.shared.f32 f212, [r11]; +ld.shared.f32 f213, [r11+2916]; +ld.shared.f32 f214, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r39], f180; +st.shared.f32 [r39+324], f201; +st.shared.f32 [r39+648], f211; +barrier.sync 0; +ld.shared.f32 f215, [r11]; +ld.shared.f32 f216, [r11+2916]; +ld.shared.f32 f217, [r11+5832]; +add.f32 f218, f213, f214; +add.f32 f219, f212, f218; +add.f32 f220, f216, f217; +add.f32 f221, f215, f220; +mul.f32 f222, f218, 0f3F000000; +sub.f32 f223, f212, f222; +sub.f32 f224, f216, f217; +mul.f32 f225, f224, 0f3F5DB3D7; +add.f32 f226, f225, f223; +sub.f32 f227, f223, f225; +mul.f32 f228, f220, 0f3F000000; +sub.f32 f229, f215, f228; +sub.f32 f230, f213, f214; +mul.f32 f231, f230, 0f3F5DB3D7; +sub.f32 f232, f229, f231; +add.f32 f233, f231, f229; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 2; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 8; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f32 {f234, f235}, [rd31]; +mul.f32 f238, f234, f226; +mul.f32 f239, f235, f232; +sub.f32 f240, f238, f239; +mul.f32 f241, f234, f232; +fma.rn.f32 f242, f235, f226, f241; +mul.f32 f243, f234, f234; +mul.f32 f244, f235, f235; +sub.f32 f245, f243, f244; +mul.f32 f246, f235, f234; +fma.rn.f32 f247, f235, f234, f246; +mul.f32 f248, f245, f227; +mul.f32 f249, f247, f233; +sub.f32 f250, f248, f249; +mul.f32 f251, f245, f233; +fma.rn.f32 f252, f247, f227, f251; +barrier.sync 0; +mad.lo.s32 r45, r40, 2916, r44; +st.shared.f32 [r45], f219; +st.shared.f32 [r45+972], f240; +st.shared.f32 [r45+1944], f250; +barrier.sync 0; +ld.shared.f32 f253, [r11]; +ld.shared.f32 f254, [r11+2916]; +ld.shared.f32 f255, [r11+5832]; +barrier.sync 0; 
+st.shared.f32 [r45], f221; +st.shared.f32 [r45+972], f242; +st.shared.f32 [r45+1944], f252; +barrier.sync 0; +ld.shared.f32 f256, [r11]; +ld.shared.f32 f257, [r11+2916]; +ld.shared.f32 f258, [r11+5832]; +add.f32 f259, f254, f255; +add.f32 f260, f257, f258; +mul.f32 f261, f259, 0f3F000000; +sub.f32 f262, f253, f261; +sub.f32 f263, f257, f258; +mul.f32 f264, f263, 0f3F5DB3D7; +mul.f32 f265, f260, 0f3F000000; +sub.f32 f266, f256, f265; +sub.f32 f267, f254, f255; +mul.f32 f268, f267, 0f3F5DB3D7; +add.f32 %0, f253, f259; +add.f32 %1, f256, f260; +add.f32 %2, f264, f262; +sub.f32 %3, f266, f268; +sub.f32 %4, f262, f264; +add.f32 %5, f268, f266; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_2187), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..09d9880a6360f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp32_inv.hpp.inc @@ -0,0 +1,6300 @@ +#ifndef CUFFTDX_FFT_2187_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_2187_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<347, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2481>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 17496, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 
f2480, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2479, %58, f2480; +mul.f32 f119, f2480, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2478, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2477, %64, f2478; +mul.f32 f135, f2478, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2476, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2475, %70, f2476; +mul.f32 f151, f2476, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2474, f133, 0f3F441B7D; +sub.f32 f159, f2474, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2472, f149, 0f3E31D0D4; +mul.f32 f2473, f155, 0f3F7C1C5C; +sub.f32 f164, f2472, f2473; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f2470, f134, 0f3E31D0D4; +mul.f32 f2471, f140, 0f3F7C1C5C; +sub.f32 f169, f2470, f2471; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2468, f150, 0fBF708FB2; +mul.f32 f2469, f156, 0f3EAF1D44; +sub.f32 f174, f2468, f2469; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 
0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2467, f2477, f2475; +sub.f32 f183, f2477, f2475; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2466, f2479, f2467; +mul.f32 f187, f2467, 0f3F000000; +sub.f32 f188, f2479, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2465, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2464, f123, f2465; +mul.f32 f203, f2465, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2463, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2462, f124, f2463; +mul.f32 f219, f2463, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2459, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2457, %113, f2459; +mul.f32 f235, f2459, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2454, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 
f250, f246, f248; +add.f32 f2452, %116, f2454; +mul.f32 f251, f2454, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2449, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2447, %119, f2449; +mul.f32 f267, f2449, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2446, f249, 0f3F441B7D; +sub.f32 f275, f2446, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2445, f265, 0f3E31D0D4; +sub.f32 f280, f2445, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2443, f250, 0f3E31D0D4; +mul.f32 f2444, f256, 0f3F7C1C5C; +sub.f32 f285, f2443, f2444; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2441, f266, 0fBF708FB2; +mul.f32 f2442, f272, 0f3EAF1D44; +sub.f32 f290, f2441, f2442; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2440, f2452, f2447; +sub.f32 f299, f2452, f2447; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2439, f2457, f2440; +mul.f32 f303, f2440, 0f3F000000; +sub.f32 f304, f2457, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2438, f277, f282; +sub.f32 f315, f277, f282; 
+mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2437, f239, f2438; +mul.f32 f319, f2438, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2436, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2435, f240, f2436; +mul.f32 f335, f2436, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2432, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2430, %122, f2432; +mul.f32 f351, f2432, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2427, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2425, %125, f2427; +mul.f32 f367, f2427, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2423, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2421, %127, f2423; +mul.f32 f383, f2423, 0f3F000000; +sub.f32 f384, %127, 
f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2420, f365, 0f3F441B7D; +sub.f32 f391, f2420, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2419, f381, 0f3E31D0D4; +sub.f32 f396, f2419, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2417, f366, 0f3E31D0D4; +mul.f32 f2418, f372, 0f3F7C1C5C; +sub.f32 f401, f2417, f2418; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2415, f382, 0fBF708FB2; +mul.f32 f2416, f388, 0f3EAF1D44; +sub.f32 f406, f2415, f2416; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2414, f2425, f2421; +sub.f32 f415, f2425, f2421; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2413, f2430, f2414; +mul.f32 f419, f2414, 0f3F000000; +sub.f32 f420, f2430, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2412, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2411, f355, f2412; +mul.f32 f435, f2412, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2410, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 
f2409, f356, f2410; +mul.f32 f451, f2410, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2437, 0f3E6C2691; +mul.f32 f2408, f310, 0f3F791978; +sub.f32 f459, f2408, f458; +mul.f32 f460, f2437, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2406, f426, 0f3F64C51C; +mul.f32 f2407, f2411, 0f3EE5C902; +sub.f32 f464, f2406, f2407; +mul.f32 f465, f2411, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2404, f326, 0f3F64C51C; +mul.f32 f2405, f2435, 0f3EE5C902; +sub.f32 f469, f2404, f2405; +mul.f32 f470, f2435, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2402, f442, 0f3F18DF63; +mul.f32 f2403, f2409, 0f3F4D57F2; +sub.f32 f474, f2402, f2403; +mul.f32 f475, f2409, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2400, f301, 0f3F441B7D; +mul.f32 f2401, f307, 0f3F248DBB; +sub.f32 f479, f2400, f2401; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2399, f417, 0f3E31D0D4; +sub.f32 f484, f2399, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2398, f317, 0f3F18DF63; +sub.f32 f489, f2398, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2397, f433, 0fBE92D7E0; +sub.f32 f494, f2397, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f2396, f333, 0f3ECACAF8; +sub.f32 f499, f2396, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2395, f449, 0fBF2FAD88; +sub.f32 f504, f2395, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2394, f302, 0f3E31D0D4; +sub.f32 f509, f2394, 
f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2392, f418, 0fBF708FB2; +mul.f32 f2393, f424, 0f3EAF1D44; +sub.f32 f514, f2392, f2393; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2390, f318, 0fBD6E2946; +mul.f32 f2391, f324, 0f3F7F9120; +sub.f32 f519, f2390, f2391; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2388, f434, 0fBF7E44DE; +mul.f32 f2389, f440, 0fBDEDC21F; +sub.f32 f524, f2388, f2389; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2387, f334, 0fBE92D7E0; +sub.f32 f529, f2387, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2386, f450, 0fBF55E287; +sub.f32 f534, f2386, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f2385, f2439, f2413; +sub.f32 f541, f2439, f2413; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f2385, 0f3F000000; +sub.f32 f546, f2466, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2384, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2383, f2464, f2384; +mul.f32 f561, f2384, 0f3F000000; +sub.f32 f562, f2464, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2382, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 
0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2381, f2462, f2382; +mul.f32 f577, f2382, 0f3F000000; +sub.f32 f578, f2462, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2380, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2379, f191, f2380; +mul.f32 f593, f2380, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2378, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2377, f207, f2378; +mul.f32 f609, f2378, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2376, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2375, f223, f2376; +mul.f32 f625, f2376, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2374, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2373, f192, f2374; +mul.f32 f641, f2374, 0f3F000000; +sub.f32 f642, f192, f641; 
+sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2372, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2371, f208, f2372; +mul.f32 f657, f2372, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2370, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2369, f224, f2370; +mul.f32 f673, f2370, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f2383, f680; +mul.f32 f685, f679, f2383; +mul.f32 f2367, f679, f679; +mul.f32 f2368, f680, f680; +sub.f32 f688, f2367, f2368; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f2381, f690; +mul.f32 f693, f688, f2381; +mul.f32 f695, f680, f690; +mul.f32 f2366, f679, f688; +sub.f32 f696, f2366, f695; +mul.f32 f2365, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f699, f2379, f698; +mul.f32 f701, f696, f2379; +mul.f32 f2363, f679, f696; +mul.f32 f2364, f680, f698; +sub.f32 f704, f2363, f2364; +mul.f32 f2362, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; 
+mul.f32 f707, f2377, f706; +mul.f32 f709, f704, f2377; +mul.f32 f711, f680, f706; +mul.f32 f2361, f679, f704; +sub.f32 f712, f2361, f711; +mul.f32 f2360, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f2375, f714; +mul.f32 f717, f712, f2375; +mul.f32 f719, f680, f714; +mul.f32 f2359, f679, f712; +sub.f32 f720, f2359, f719; +mul.f32 f2358, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f2373, f722; +mul.f32 f725, f720, f2373; +mul.f32 f2356, f679, f720; +mul.f32 f2357, f680, f722; +sub.f32 f728, f2356, f2357; +mul.f32 f2355, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f2371, f730; +mul.f32 f733, f728, f2371; +mul.f32 f735, f680, f730; +mul.f32 f2354, f679, f728; +sub.f32 f736, f2354, f735; +mul.f32 f2353, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f2369, f738; +mul.f32 f741, f736, f2369; +mul.f32 f743, f680, f738; +mul.f32 f2352, f679, f736; +sub.f32 f744, f2352, f743; +mul.f32 f2351, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f2349, f679, f744; +mul.f32 f2350, f680, f746; +sub.f32 f752, f2349, f2350; +mul.f32 f2348, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2347, f679, f752; +sub.f32 f760, f2347, f759; +mul.f32 f2346, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; +mul.f32 f765, f760, f581; +mul.f32 f2344, f679, f760; +mul.f32 f2345, f680, f762; +sub.f32 f768, f2344, f2345; +mul.f32 f2343, f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2342, f679, f768; +sub.f32 f776, f2342, f775; +mul.f32 f2341, f591, 
f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2340, f679, f776; +sub.f32 f784, f2340, f783; +mul.f32 f2339, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f2337, f679, f784; +mul.f32 f2338, f680, f786; +sub.f32 f792, f2337, f2338; +mul.f32 f2336, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2335, f679, f792; +sub.f32 f800, f2335, f799; +mul.f32 f2334, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2333, f679, f800; +sub.f32 f808, f2333, f807; +mul.f32 f2332, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f2330, f679, f808; +mul.f32 f2331, f680, f810; +sub.f32 f816, f2330, f2331; +mul.f32 f2329, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2328, f679, f816; +sub.f32 f824, f2328, f823; +mul.f32 f2327, f544, f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f2325, f679, f824; +mul.f32 f2326, f680, f826; +sub.f32 f832, f2325, f2326; +mul.f32 f2324, f560, f826; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2323, f679, f832; +sub.f32 f840, f2323, f839; +mul.f32 f2322, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2321, 
f679, f840; +sub.f32 f848, f2321, f847; +mul.f32 f2320, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f2318, f679, f848; +mul.f32 f2319, f680, f850; +sub.f32 f856, f2318, f2319; +mul.f32 f2317, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2316, f679, f856; +sub.f32 f864, f2316, f863; +mul.f32 f2315, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2314, f679, f864; +sub.f32 f872, f2314, f871; +mul.f32 f2313, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, f874; +mul.f32 f877, f872, f662; +mul.f32 f2311, f679, f872; +mul.f32 f2312, f680, f874; +sub.f32 f880, f2311, f2312; +mul.f32 f2310, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f2309, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2466, f2385; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f2309; +st.shared.v2.f32 [r21+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f2365; +st.shared.v2.f32 [r21+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f2362; +st.shared.v2.f32 [r21+24], {f892, f893}; +fma.rn.f32 f894, f704, f600, f707; +sub.f32 f895, f709, f2360; +st.shared.v2.f32 [r21+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 f897, f717, f2358; +st.shared.v2.f32 [r21+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f2355; +st.shared.v2.f32 [r21+48], {f898, f899}; +sub.f32 f900, f733, f2353; +fma.rn.f32 f901, f728, f648, f731; 
+st.shared.v2.f32 [r21+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f2351; +st.shared.v2.f32 [r21+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f2348; +st.shared.v2.f32 [r21+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f2346; +st.shared.v2.f32 [r21+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f2343; +st.shared.v2.f32 [r21+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f2341; +st.shared.v2.f32 [r21+96], {f910, f911}; +fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f2339; +st.shared.v2.f32 [r21+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f2336; +st.shared.v2.f32 [r21+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, f797, f2334; +st.shared.v2.f32 [r21+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f2332; +st.shared.v2.f32 [r21+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f2329; +st.shared.v2.f32 [r21+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f2327; +st.shared.v2.f32 [r21+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f2324; +st.shared.v2.f32 [r21+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, f2322; +st.shared.v2.f32 [r21+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f2320; +st.shared.v2.f32 [r21+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f2317; +st.shared.v2.f32 [r21+176], {f930, f931}; +fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f2315; +st.shared.v2.f32 [r21+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; +sub.f32 f935, f869, f2313; +st.shared.v2.f32 [r21+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f2310; +st.shared.v2.f32 [r21+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; 
+sub.f32 f939, f885, f884; +st.shared.v2.f32 [r21+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+648]; +ld.shared.v2.f32 {f948, f949}, [r10+1296]; +ld.shared.v2.f32 {f952, f953}, [r10+1944]; +ld.shared.v2.f32 {f956, f957}, [r10+2592]; +ld.shared.v2.f32 {f960, f961}, [r10+3240]; +ld.shared.v2.f32 {f964, f965}, [r10+3888]; +ld.shared.v2.f32 {f968, f969}, [r10+4536]; +ld.shared.v2.f32 {f972, f973}, [r10+5184]; +ld.shared.v2.f32 {f976, f977}, [r10+5832]; +ld.shared.v2.f32 {f980, f981}, [r10+6480]; +ld.shared.v2.f32 {f984, f985}, [r10+7128]; +ld.shared.v2.f32 {f988, f989}, [r10+7776]; +ld.shared.v2.f32 {f992, f993}, [r10+8424]; +ld.shared.v2.f32 {f996, f997}, [r10+9072]; +ld.shared.v2.f32 {f1000, f1001}, [r10+9720]; +ld.shared.v2.f32 {f1004, f1005}, [r10+10368]; +ld.shared.v2.f32 {f1008, f1009}, [r10+11016]; +ld.shared.v2.f32 {f1012, f1013}, [r10+11664]; +ld.shared.v2.f32 {f1016, f1017}, [r10+12312]; +ld.shared.v2.f32 {f1020, f1021}, [r10+12960]; +ld.shared.v2.f32 {f1024, f1025}, [r10+13608]; +ld.shared.v2.f32 {f1028, f1029}, [r10+14256]; +ld.shared.v2.f32 {f1032, f1033}, [r10+14904]; +ld.shared.v2.f32 {f1036, f1037}, [r10+15552]; +ld.shared.v2.f32 {f1040, f1041}, [r10+16200]; +ld.shared.v2.f32 {f1044, f1045}, [r10+16848]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2308, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0fBF5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f2307, f941, f2308; +mul.f32 f1058, f2308, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0fBF5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2306, f989, f1025; +sub.f32 f1070, 
f989, f1025; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2305, f953, f2306; +mul.f32 f1074, f2306, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0fBF5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2304, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2303, f965, f2304; +mul.f32 f1090, f2304, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0fBF5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2301, f1072, 0f3F441B7D; +mul.f32 f2302, f1078, 0f3F248DBB; +sub.f32 f1098, f2301, f2302; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099; +mul.f32 f2299, f1088, 0f3E31D0D4; +mul.f32 f2300, f1094, 0f3F7C1C5C; +sub.f32 f1103, f2299, f2300; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104; +mul.f32 f2297, f1073, 0f3E31D0D4; +mul.f32 f2298, f1079, 0f3F7C1C5C; +sub.f32 f1108, f2297, f2298; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109; +mul.f32 f1112, f1095, 0f3EAF1D44; +mul.f32 f2296, f1089, 0fBF708FB2; +sub.f32 f1113, f2296, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2295, f2305, f2303; +sub.f32 f1122, f2305, f2303; +mul.f32 f1123, f1122, 0fBF5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2294, f2307, f2295; +mul.f32 f1126, f2295, 0f3F000000; +sub.f32 f1127, f2307, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0fBF5DB3D7; +sub.f32 f1130, 
f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2293, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0fBF5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2292, f1062, f2293; +mul.f32 f1142, f2293, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0fBF5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2291, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0fBF5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2290, f1063, f2291; +mul.f32 f1158, f2291, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0fBF5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2289, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0fBF5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2288, f945, f2289; +mul.f32 f1174, f2289, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0fBF5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2287, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0fBF5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2286, f957, f2287; +mul.f32 f1190, f2287, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0fBF5DB3D7; +sub.f32 f1194, f1191, f1193; 
+add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2285, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0fBF5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2284, f969, f2285; +mul.f32 f1206, f2285, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0fBF5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2282, f1188, 0f3F441B7D; +mul.f32 f2283, f1194, 0f3F248DBB; +sub.f32 f1214, f2282, f2283; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0f3F248DBB, f1215; +mul.f32 f2280, f1204, 0f3E31D0D4; +mul.f32 f2281, f1210, 0f3F7C1C5C; +sub.f32 f1219, f2280, f2281; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0f3F7C1C5C, f1220; +mul.f32 f2278, f1189, 0f3E31D0D4; +mul.f32 f2279, f1195, 0f3F7C1C5C; +sub.f32 f1224, f2278, f2279; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0f3F7C1C5C, f1225; +mul.f32 f2276, f1205, 0fBF708FB2; +mul.f32 f2277, f1211, 0f3EAF1D44; +sub.f32 f1229, f2276, f2277; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0f3EAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2275, f2286, f2284; +sub.f32 f1238, f2286, f2284; +mul.f32 f1239, f1238, 0fBF5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2274, f2288, f2275; +mul.f32 f1242, f2275, 0f3F000000; +sub.f32 f1243, f2288, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0fBF5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f2273, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0fBF5DB3D7; +add.f32 f1256, f1255, f1253; 
+sub.f32 f1257, f1253, f1255; +add.f32 f2272, f1178, f2273; +mul.f32 f1258, f2273, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0fBF5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2271, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0fBF5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2270, f1179, f2271; +mul.f32 f1274, f2271, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0fBF5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2269, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0fBF5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2268, f949, f2269; +mul.f32 f1290, f2269, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0fBF5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2267, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0fBF5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2266, f961, f2267; +mul.f32 f1306, f2267, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0fBF5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f2265, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0fBF5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, 
f1317, f1319; +add.f32 f2264, f973, f2265; +mul.f32 f1322, f2265, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0fBF5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0f3F248DBB; +mul.f32 f2263, f1304, 0f3F441B7D; +sub.f32 f1330, f2263, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0f3F248DBB, f1331; +mul.f32 f2261, f1320, 0f3E31D0D4; +mul.f32 f2262, f1326, 0f3F7C1C5C; +sub.f32 f1335, f2261, f2262; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0f3F7C1C5C, f1336; +mul.f32 f2259, f1305, 0f3E31D0D4; +mul.f32 f2260, f1311, 0f3F7C1C5C; +sub.f32 f1340, f2259, f2260; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0f3F7C1C5C, f1341; +mul.f32 f2257, f1321, 0fBF708FB2; +mul.f32 f2258, f1327, 0f3EAF1D44; +sub.f32 f1345, f2257, f2258; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0f3EAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2256, f2266, f2264; +sub.f32 f1354, f2266, f2264; +mul.f32 f1355, f1354, 0fBF5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2255, f2268, f2256; +mul.f32 f1358, f2256, 0f3F000000; +sub.f32 f1359, f2268, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0fBF5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2254, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0fBF5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2253, f1294, f2254; +mul.f32 f1374, f2254, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0fBF5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, 
f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2252, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0fBF5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2251, f1295, f2252; +mul.f32 f1390, f2252, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0fBF5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2272, 0f3E6C2691; +mul.f32 f2250, f1249, 0f3F791978; +sub.f32 f1398, f2250, f1397; +mul.f32 f1399, f2272, 0f3F791978; +fma.rn.f32 f1400, f1249, 0f3E6C2691, f1399; +mul.f32 f1402, f2253, 0f3EE5C902; +mul.f32 f2249, f1365, 0f3F64C51C; +sub.f32 f1403, f2249, f1402; +mul.f32 f1404, f2253, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0f3EE5C902, f1404; +mul.f32 f1407, f2270, 0f3EE5C902; +mul.f32 f2248, f1265, 0f3F64C51C; +sub.f32 f1408, f2248, f1407; +mul.f32 f1409, f2270, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0f3EE5C902, f1409; +mul.f32 f2246, f1381, 0f3F18DF63; +mul.f32 f2247, f2251, 0f3F4D57F2; +sub.f32 f1413, f2246, f2247; +mul.f32 f1414, f2251, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0f3F4D57F2, f1414; +mul.f32 f2244, f1240, 0f3F441B7D; +mul.f32 f2245, f1246, 0f3F248DBB; +sub.f32 f1418, f2244, f2245; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0f3F248DBB, f1419; +mul.f32 f2242, f1356, 0f3E31D0D4; +mul.f32 f2243, f1362, 0f3F7C1C5C; +sub.f32 f1423, f2242, f2243; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0f3F7C1C5C, f1424; +mul.f32 f2240, f1256, 0f3F18DF63; +mul.f32 f2241, f1262, 0f3F4D57F2; +sub.f32 f1428, f2240, f2241; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0f3F4D57F2, f1429; +mul.f32 f1432, f1378, 0f3F753ECD; +mul.f32 f2239, f1372, 0fBE92D7E0; +sub.f32 f1433, f2239, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0f3F753ECD, f1434; +mul.f32 f1437, f1278, 0f3F6B1036; +mul.f32 f2238, f1272, 0f3ECACAF8; +sub.f32 f1438, f2238, f1437; 
+mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0f3F6B1036, f1439; +mul.f32 f1442, f1394, 0f3F3A3529; +mul.f32 f2237, f1388, 0fBF2FAD88; +sub.f32 f1443, f2237, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0f3F3A3529, f1444; +mul.f32 f1447, f1247, 0f3F7C1C5C; +mul.f32 f2236, f1241, 0f3E31D0D4; +sub.f32 f1448, f2236, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0f3F7C1C5C, f1449; +mul.f32 f1452, f1363, 0f3EAF1D44; +mul.f32 f2235, f1357, 0fBF708FB2; +sub.f32 f1453, f2235, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0f3EAF1D44, f1454; +mul.f32 f1457, f1263, 0f3F7F9120; +mul.f32 f2234, f1257, 0fBD6E2946; +sub.f32 f1458, f2234, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0f3F7F9120, f1459; +mul.f32 f2232, f1373, 0fBF7E44DE; +mul.f32 f2233, f1379, 0fBDEDC21F; +sub.f32 f1463, f2232, f2233; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0fBDEDC21F, f1464; +mul.f32 f2230, f1273, 0fBE92D7E0; +mul.f32 f2231, f1279, 0f3F753ECD; +sub.f32 f1468, f2230, f2231; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0f3F753ECD, f1469; +mul.f32 f2228, f1389, 0fBF55E287; +mul.f32 f2229, f1395, 0fBF0CAC9F; +sub.f32 f1473, f2228, f2229; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0fBF0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2227, f2274, f2255; +sub.f32 f1480, f2274, f2255; +mul.f32 f1481, f1480, 0fBF5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, f2227, 0f3F000000; +sub.f32 f1485, f2294, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0fBF5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2226, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0fBF5DB3D7; 
+add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2225, f2292, f2226; +mul.f32 f1500, f2226, 0f3F000000; +sub.f32 f1501, f2292, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0fBF5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2224, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0fBF5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, f1511, f1513; +add.f32 f2223, f2290, f2224; +mul.f32 f1516, f2224, 0f3F000000; +sub.f32 f1517, f2290, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0fBF5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2222, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 f1529, f1528, 0fBF5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2221, f1130, f2222; +mul.f32 f1532, f2222, 0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0fBF5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2220, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0fBF5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2219, f1146, f2220; +mul.f32 f1548, f2220, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0fBF5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2218, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0fBF5DB3D7; 
+add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2217, f1162, f2218; +mul.f32 f1564, f2218, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0fBF5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2216, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0fBF5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, f1575, f1577; +add.f32 f2215, f1131, f2216; +mul.f32 f1580, f2216, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0fBF5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2214, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 f1593, f1592, 0fBF5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2213, f1147, f2214; +mul.f32 f1596, f2214, 0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0fBF5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2212, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0fBF5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2211, f1163, f2212; +mul.f32 f1612, f2212, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0fBF5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; 
+mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1622, f2225, f1619; +mul.f32 f1624, f1618, f2225; +mul.f32 f2209, f1618, f1618; +mul.f32 f2210, f1619, f1619; +sub.f32 f1627, f2209, f2210; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1630, f2223, f1629; +mul.f32 f1632, f1627, f2223; +mul.f32 f1634, f1619, f1629; +mul.f32 f2208, f1618, f1627; +sub.f32 f1635, f2208, f1634; +mul.f32 f2207, f1507, f1629; +mul.f32 f1636, f1618, f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1638, f2221, f1637; +mul.f32 f1640, f1635, f2221; +mul.f32 f2205, f1618, f1635; +mul.f32 f2206, f1619, f1637; +sub.f32 f1643, f2205, f2206; +mul.f32 f2204, f1523, f1637; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1646, f2219, f1645; +mul.f32 f1648, f1643, f2219; +mul.f32 f1650, f1619, f1645; +mul.f32 f2203, f1618, f1643; +sub.f32 f1651, f2203, f1650; +mul.f32 f2202, f1539, f1645; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1654, f2217, f1653; +mul.f32 f1656, f1651, f2217; +mul.f32 f1658, f1619, f1653; +mul.f32 f2201, f1618, f1651; +sub.f32 f1659, f2201, f1658; +mul.f32 f2200, f1555, f1653; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1662, f2215, f1661; +mul.f32 f1664, f1659, f2215; +mul.f32 f2198, f1618, f1659; +mul.f32 f2199, f1619, f1661; +sub.f32 f1667, f2198, f2199; +mul.f32 f2197, f1571, f1661; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1670, f2213, f1669; +mul.f32 f1672, f1667, f2213; +mul.f32 f1674, f1619, f1669; +mul.f32 f2196, f1618, f1667; +sub.f32 f1675, f2196, f1674; +mul.f32 f2195, f1587, f1669; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1678, f2211, f1677; +mul.f32 f1680, f1675, f2211; +mul.f32 f1682, f1619, f1677; +mul.f32 f2194, f1618, f1675; +sub.f32 f1683, f2194, f1682; +mul.f32 
f2193, f1603, f1677; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1686, f1488, f1685; +mul.f32 f1688, f1683, f1488; +mul.f32 f2191, f1618, f1683; +mul.f32 f2192, f1619, f1685; +sub.f32 f1691, f2191, f2192; +mul.f32 f2190, f1482, f1685; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1694, f1504, f1693; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2189, f1618, f1691; +sub.f32 f1699, f2189, f1698; +mul.f32 f2188, f1498, f1693; +mul.f32 f1700, f1618, f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1702, f1520, f1701; +mul.f32 f1704, f1699, f1520; +mul.f32 f2186, f1618, f1699; +mul.f32 f2187, f1619, f1701; +sub.f32 f1707, f2186, f2187; +mul.f32 f2185, f1514, f1701; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1710, f1536, f1709; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2184, f1618, f1707; +sub.f32 f1715, f2184, f1714; +mul.f32 f2183, f1530, f1709; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1718, f1552, f1717; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2182, f1618, f1715; +sub.f32 f1723, f2182, f1722; +mul.f32 f2181, f1546, f1717; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1726, f1568, f1725; +mul.f32 f1728, f1723, f1568; +mul.f32 f2179, f1618, f1723; +mul.f32 f2180, f1619, f1725; +sub.f32 f1731, f2179, f2180; +mul.f32 f2178, f1562, f1725; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1734, f1584, f1733; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2177, f1618, f1731; +sub.f32 f1739, f2177, f1738; +mul.f32 f2176, f1578, f1733; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1742, f1600, f1741; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2175, f1618, f1739; +sub.f32 f1747, f2175, f1746; +mul.f32 
f2174, f1594, f1741; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1750, f1616, f1749; +mul.f32 f1752, f1747, f1616; +mul.f32 f2172, f1618, f1747; +mul.f32 f2173, f1619, f1749; +sub.f32 f1755, f2172, f2173; +mul.f32 f2171, f1610, f1749; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1758, f1489, f1757; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2170, f1618, f1755; +sub.f32 f1763, f2170, f1762; +mul.f32 f2169, f1483, f1757; +mul.f32 f1764, f1618, f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1766, f1505, f1765; +mul.f32 f1768, f1763, f1505; +mul.f32 f2167, f1618, f1763; +mul.f32 f2168, f1619, f1765; +sub.f32 f1771, f2167, f2168; +mul.f32 f2166, f1499, f1765; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1774, f1521, f1773; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2165, f1618, f1771; +sub.f32 f1779, f2165, f1778; +mul.f32 f2164, f1515, f1773; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1782, f1537, f1781; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2163, f1618, f1779; +sub.f32 f1787, f2163, f1786; +mul.f32 f2162, f1531, f1781; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1790, f1553, f1789; +mul.f32 f1792, f1787, f1553; +mul.f32 f2160, f1618, f1787; +mul.f32 f2161, f1619, f1789; +sub.f32 f1795, f2160, f2161; +mul.f32 f2159, f1547, f1789; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1798, f1569, f1797; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2158, f1618, f1795; +sub.f32 f1803, f2158, f1802; +mul.f32 f2157, f1563, f1797; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1806, f1585, f1805; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2156, f1618, f1803; +sub.f32 f1811, f2156, f1810; +mul.f32 
f2155, f1579, f1805; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1814, f1601, f1813; +mul.f32 f1816, f1811, f1601; +mul.f32 f2153, f1618, f1811; +mul.f32 f2154, f1619, f1813; +sub.f32 f1819, f2153, f2154; +mul.f32 f2152, f1595, f1813; +mul.f32 f1820, f1618, f1813; +mul.f32 f2151, f1491, f1619; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1617, f1821; +mul.f32 f1823, f1611, f1821; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 5832, r19; +add.f32 f1825, f2294, f2227; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1618, f1491, f1622; +sub.f32 f1828, f1624, f2151; +st.shared.v2.f32 [r20+216], {f1827, f1828}; +fma.rn.f32 f1829, f1627, f1507, f1630; +sub.f32 f1830, f1632, f2207; +st.shared.v2.f32 [r20+432], {f1829, f1830}; +fma.rn.f32 f1831, f1635, f1523, f1638; +sub.f32 f1832, f1640, f2204; +st.shared.v2.f32 [r20+648], {f1831, f1832}; +fma.rn.f32 f1833, f1643, f1539, f1646; +sub.f32 f1834, f1648, f2202; +st.shared.v2.f32 [r20+864], {f1833, f1834}; +fma.rn.f32 f1835, f1651, f1555, f1654; +sub.f32 f1836, f1656, f2200; +st.shared.v2.f32 [r20+1080], {f1835, f1836}; +sub.f32 f1837, f1664, f2197; +fma.rn.f32 f1838, f1659, f1571, f1662; +st.shared.v2.f32 [r20+1296], {f1838, f1837}; +fma.rn.f32 f1839, f1667, f1587, f1670; +sub.f32 f1840, f1672, f2195; +st.shared.v2.f32 [r20+1512], {f1839, f1840}; +sub.f32 f1841, f1680, f2193; +fma.rn.f32 f1842, f1675, f1603, f1678; +st.shared.v2.f32 [r20+1728], {f1842, f1841}; +fma.rn.f32 f1843, f1683, f1482, f1686; +sub.f32 f1844, f1688, f2190; +st.shared.v2.f32 [r20+1944], {f1843, f1844}; +fma.rn.f32 f1845, f1691, f1498, f1694; +sub.f32 f1846, f1696, f2188; +st.shared.v2.f32 [r20+2160], {f1845, f1846}; +fma.rn.f32 f1847, f1699, f1514, f1702; +sub.f32 f1848, f1704, f2185; +st.shared.v2.f32 [r20+2376], {f1847, f1848}; +fma.rn.f32 f1849, f1707, f1530, f1710; +sub.f32 f1850, f1712, f2183; 
+st.shared.v2.f32 [r20+2592], {f1849, f1850}; +fma.rn.f32 f1851, f1715, f1546, f1718; +sub.f32 f1852, f1720, f2181; +st.shared.v2.f32 [r20+2808], {f1851, f1852}; +fma.rn.f32 f1853, f1723, f1562, f1726; +sub.f32 f1854, f1728, f2178; +st.shared.v2.f32 [r20+3024], {f1853, f1854}; +fma.rn.f32 f1855, f1731, f1578, f1734; +sub.f32 f1856, f1736, f2176; +st.shared.v2.f32 [r20+3240], {f1855, f1856}; +fma.rn.f32 f1857, f1739, f1594, f1742; +sub.f32 f1858, f1744, f2174; +st.shared.v2.f32 [r20+3456], {f1857, f1858}; +fma.rn.f32 f1859, f1747, f1610, f1750; +sub.f32 f1860, f1752, f2171; +st.shared.v2.f32 [r20+3672], {f1859, f1860}; +fma.rn.f32 f1861, f1755, f1483, f1758; +sub.f32 f1862, f1760, f2169; +st.shared.v2.f32 [r20+3888], {f1861, f1862}; +fma.rn.f32 f1863, f1763, f1499, f1766; +sub.f32 f1864, f1768, f2166; +st.shared.v2.f32 [r20+4104], {f1863, f1864}; +fma.rn.f32 f1865, f1771, f1515, f1774; +sub.f32 f1866, f1776, f2164; +st.shared.v2.f32 [r20+4320], {f1865, f1866}; +fma.rn.f32 f1867, f1779, f1531, f1782; +sub.f32 f1868, f1784, f2162; +st.shared.v2.f32 [r20+4536], {f1867, f1868}; +fma.rn.f32 f1869, f1787, f1547, f1790; +sub.f32 f1870, f1792, f2159; +st.shared.v2.f32 [r20+4752], {f1869, f1870}; +fma.rn.f32 f1871, f1795, f1563, f1798; +sub.f32 f1872, f1800, f2157; +st.shared.v2.f32 [r20+4968], {f1871, f1872}; +fma.rn.f32 f1873, f1803, f1579, f1806; +sub.f32 f1874, f1808, f2155; +st.shared.v2.f32 [r20+5184], {f1873, f1874}; +fma.rn.f32 f1875, f1811, f1595, f1814; +sub.f32 f1876, f1816, f2152; +st.shared.v2.f32 [r20+5400], {f1875, f1876}; +fma.rn.f32 f1877, f1819, f1611, f1822; +sub.f32 f1878, f1824, f1823; +st.shared.v2.f32 [r20+5616], {f1877, f1878}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+648]; +ld.shared.v2.f32 {f1887, f1888}, [r10+1296]; +ld.shared.v2.f32 {f1891, f1892}, [r10+1944]; +ld.shared.v2.f32 {f1895, f1896}, [r10+2592]; +ld.shared.v2.f32 {f1899, f1900}, [r10+3240]; +ld.shared.v2.f32 {f1903, f1904}, 
[r10+3888]; +ld.shared.v2.f32 {f1907, f1908}, [r10+4536]; +ld.shared.v2.f32 {f1911, f1912}, [r10+5184]; +ld.shared.v2.f32 {f1915, f1916}, [r10+5832]; +ld.shared.v2.f32 {f1919, f1920}, [r10+6480]; +ld.shared.v2.f32 {f1923, f1924}, [r10+7128]; +ld.shared.v2.f32 {f1927, f1928}, [r10+7776]; +ld.shared.v2.f32 {f1931, f1932}, [r10+8424]; +ld.shared.v2.f32 {f1935, f1936}, [r10+9072]; +ld.shared.v2.f32 {f1939, f1940}, [r10+9720]; +ld.shared.v2.f32 {f1943, f1944}, [r10+10368]; +ld.shared.v2.f32 {f1947, f1948}, [r10+11016]; +ld.shared.v2.f32 {f1951, f1952}, [r10+11664]; +ld.shared.v2.f32 {f1955, f1956}, [r10+12312]; +ld.shared.v2.f32 {f1959, f1960}, [r10+12960]; +ld.shared.v2.f32 {f1963, f1964}, [r10+13608]; +ld.shared.v2.f32 {f1967, f1968}, [r10+14256]; +ld.shared.v2.f32 {f1971, f1972}, [r10+14904]; +ld.shared.v2.f32 {f1975, f1976}, [r10+15552]; +ld.shared.v2.f32 {f1979, f1980}, [r10+16200]; +ld.shared.v2.f32 {f1983, f1984}, [r10+16848]; +add.f32 f1987, f1915, f1951; +mul.f32 f1989, f1987, 0f3F000000; +sub.f32 f1990, f1879, f1989; +add.f32 f2150, f1916, f1952; +sub.f32 f1991, f1916, f1952; +mul.f32 f1992, f1991, 0fBF5DB3D7; +mul.f32 f1993, f2150, 0f3F000000; +sub.f32 f1994, f1880, f1993; +sub.f32 f1995, f1915, f1951; +mul.f32 f1996, f1995, 0fBF5DB3D7; +add.f32 f1997, f1919, f1955; +mul.f32 f1999, f1997, 0f3F000000; +sub.f32 f2000, f1883, f1999; +add.f32 f2149, f1920, f1956; +sub.f32 f2001, f1920, f1956; +mul.f32 f2002, f2001, 0fBF5DB3D7; +mul.f32 f2003, f2149, 0f3F000000; +sub.f32 f2004, f1884, f2003; +sub.f32 f2005, f1919, f1955; +mul.f32 f2006, f2005, 0fBF5DB3D7; +add.f32 f2007, f1923, f1959; +mul.f32 f2009, f2007, 0f3F000000; +sub.f32 f2010, f1887, f2009; +add.f32 f2148, f1924, f1960; +sub.f32 f2011, f1924, f1960; +mul.f32 f2012, f2011, 0fBF5DB3D7; +mul.f32 f2013, f2148, 0f3F000000; +sub.f32 f2014, f1888, f2013; +sub.f32 f2015, f1923, f1959; +mul.f32 f2016, f2015, 0fBF5DB3D7; +add.f32 f2017, f1927, f1963; +mul.f32 f2019, f2017, 0f3F000000; +sub.f32 f2020, f1891, f2019; 
+add.f32 f2147, f1928, f1964; +sub.f32 f2021, f1928, f1964; +mul.f32 f2022, f2021, 0fBF5DB3D7; +mul.f32 f2023, f2147, 0f3F000000; +sub.f32 f2024, f1892, f2023; +sub.f32 f2025, f1927, f1963; +mul.f32 f2026, f2025, 0fBF5DB3D7; +add.f32 f2027, f1931, f1967; +mul.f32 f2029, f2027, 0f3F000000; +sub.f32 f2030, f1895, f2029; +add.f32 f2146, f1932, f1968; +sub.f32 f2031, f1932, f1968; +mul.f32 f2032, f2031, 0fBF5DB3D7; +mul.f32 f2033, f2146, 0f3F000000; +sub.f32 f2034, f1896, f2033; +sub.f32 f2035, f1931, f1967; +mul.f32 f2036, f2035, 0fBF5DB3D7; +add.f32 f2037, f1935, f1971; +mul.f32 f2039, f2037, 0f3F000000; +sub.f32 f2040, f1899, f2039; +add.f32 f2145, f1936, f1972; +sub.f32 f2041, f1936, f1972; +mul.f32 f2042, f2041, 0fBF5DB3D7; +mul.f32 f2043, f2145, 0f3F000000; +sub.f32 f2044, f1900, f2043; +sub.f32 f2045, f1935, f1971; +mul.f32 f2046, f2045, 0fBF5DB3D7; +add.f32 f2047, f1939, f1975; +mul.f32 f2049, f2047, 0f3F000000; +sub.f32 f2050, f1903, f2049; +add.f32 f2144, f1940, f1976; +sub.f32 f2051, f1940, f1976; +mul.f32 f2052, f2051, 0fBF5DB3D7; +mul.f32 f2053, f2144, 0f3F000000; +sub.f32 f2054, f1904, f2053; +sub.f32 f2055, f1939, f1975; +mul.f32 f2056, f2055, 0fBF5DB3D7; +add.f32 f2057, f1943, f1979; +mul.f32 f2059, f2057, 0f3F000000; +sub.f32 f2060, f1907, f2059; +add.f32 f2143, f1944, f1980; +sub.f32 f2061, f1944, f1980; +mul.f32 f2062, f2061, 0fBF5DB3D7; +mul.f32 f2063, f2143, 0f3F000000; +sub.f32 f2064, f1908, f2063; +sub.f32 f2065, f1943, f1979; +mul.f32 f2066, f2065, 0fBF5DB3D7; +add.f32 f2067, f1947, f1983; +mul.f32 f2069, f2067, 0f3F000000; +sub.f32 f2070, f1911, f2069; +add.f32 f2142, f1948, f1984; +sub.f32 f2071, f1948, f1984; +mul.f32 f2072, f2071, 0fBF5DB3D7; +mul.f32 f2073, f2142, 0f3F000000; +sub.f32 f2074, f1912, f2073; +sub.f32 f2075, f1947, f1983; +mul.f32 f2076, f2075, 0fBF5DB3D7; +add.f32 %1, f1880, f2150; +add.f32 %0, f1879, f1987; +add.f32 %3, f1884, f2149; +add.f32 %2, f1883, f1997; +add.f32 %5, f1888, f2148; +add.f32 %4, f1887, f2007; +add.f32 %7, 
f1892, f2147; +add.f32 %6, f1891, f2017; +add.f32 %9, f1896, f2146; +add.f32 %8, f1895, f2027; +add.f32 %11, f1900, f2145; +add.f32 %10, f1899, f2037; +add.f32 %13, f1904, f2144; +add.f32 %12, f1903, f2047; +add.f32 %15, f1908, f2143; +add.f32 %14, f1907, f2057; +add.f32 %17, f1912, f2142; +add.f32 %16, f1911, f2067; +add.f32 %18, f1992, f1990; +sub.f32 %19, f1994, f1996; +add.f32 %20, f2002, f2000; +sub.f32 %21, f2004, f2006; +add.f32 %22, f2012, f2010; +sub.f32 %23, f2014, f2016; +add.f32 %24, f2022, f2020; +sub.f32 %25, f2024, f2026; +sub.f32 %27, f2034, f2036; +add.f32 %26, f2032, f2030; +sub.f32 %29, f2044, f2046; +add.f32 %28, f2042, f2040; +add.f32 %30, f2052, f2050; +sub.f32 %31, f2054, f2056; +add.f32 %32, f2062, f2060; +sub.f32 %33, f2064, f2066; +add.f32 %34, f2072, f2070; +sub.f32 %35, f2074, f2076; +add.f32 %37, f1996, f1994; +sub.f32 %36, f1990, f1992; +add.f32 %39, f2006, f2004; +sub.f32 %38, f2000, f2002; +add.f32 %41, f2016, f2014; +sub.f32 %40, f2010, f2012; +add.f32 %43, f2026, f2024; +sub.f32 %42, f2020, f2022; +add.f32 %45, f2036, f2034; +sub.f32 %44, f2030, f2032; +add.f32 %47, f2046, f2044; +sub.f32 %46, f2040, f2042; +add.f32 %49, f2056, f2054; +sub.f32 %48, f2050, f2052; +add.f32 %51, f2066, f2064; +sub.f32 %50, f2060, f2062; +add.f32 %53, f2076, f2074; +sub.f32 %52, f2070, f2072; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), 
"=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_2187), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<346, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<778>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 17496, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 
f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, 
f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, 
f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r9+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r9+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r9+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r9+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r9+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r9+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r9+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r9+64], {f230, f231}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 
{f236, f237}, [r11+1944]; +ld.shared.v2.f32 {f240, f241}, [r11+3888]; +ld.shared.v2.f32 {f244, f245}, [r11+5832]; +ld.shared.v2.f32 {f248, f249}, [r11+7776]; +ld.shared.v2.f32 {f252, f253}, [r11+9720]; +ld.shared.v2.f32 {f256, f257}, [r11+11664]; +ld.shared.v2.f32 {f260, f261}, [r11+13608]; +ld.shared.v2.f32 {f264, f265}, [r11+15552]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0fBF5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0fBF5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0fBF5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0fBF5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0fBF5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0fBF5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0f3F248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0f3F248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0f3F7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; 
+fma.rn.f32 f325, f308, 0f3F7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0f3F7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0f3F7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0f3EAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0f3EAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0fBF5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0fBF5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f353, f383; +mul.f32 f387, f351, f383; +mul.f32 
f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f369, f393; +mul.f32 f395, f367, f393; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f348, f401; +mul.f32 f403, f342, f401; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f364, f409; +mul.f32 f411, f358, f409; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f380, f417; +mul.f32 f419, f374, f417; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f349, f425; +mul.f32 f427, f343, f425; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f365, f433; +mul.f32 f435, f359, f433; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f381, f441; +mul.f32 f443, f375, f441; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f382, f351, f386; +sub.f32 f448, f388, f387; +st.shared.v2.f32 [r17+72], {f447, f448}; +fma.rn.f32 f449, f391, f367, f394; +sub.f32 f450, f396, f395; +st.shared.v2.f32 [r17+144], {f449, f450}; 
+fma.rn.f32 f451, f399, f342, f402; +sub.f32 f452, f404, f403; +st.shared.v2.f32 [r17+216], {f451, f452}; +fma.rn.f32 f453, f407, f358, f410; +sub.f32 f454, f412, f411; +st.shared.v2.f32 [r17+288], {f453, f454}; +fma.rn.f32 f455, f415, f374, f418; +sub.f32 f456, f420, f419; +st.shared.v2.f32 [r17+360], {f455, f456}; +fma.rn.f32 f457, f423, f343, f426; +sub.f32 f458, f428, f427; +st.shared.v2.f32 [r17+432], {f457, f458}; +sub.f32 f459, f436, f435; +fma.rn.f32 f460, f431, f359, f434; +st.shared.v2.f32 [r17+504], {f460, f459}; +fma.rn.f32 f461, f439, f375, f442; +sub.f32 f462, f444, f443; +st.shared.v2.f32 [r17+576], {f461, f462}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+1944]; +ld.shared.v2.f32 {f471, f472}, [r11+3888]; +ld.shared.v2.f32 {f475, f476}, [r11+5832]; +ld.shared.v2.f32 {f479, f480}, [r11+7776]; +ld.shared.v2.f32 {f483, f484}, [r11+9720]; +ld.shared.v2.f32 {f487, f488}, [r11+11664]; +ld.shared.v2.f32 {f491, f492}, [r11+13608]; +ld.shared.v2.f32 {f495, f496}, [r11+15552]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; +mul.f32 f503, f499, 0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0fBF5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0fBF5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0fBF5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; +sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0fBF5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 
f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0fBF5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0fBF5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0f3F248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0f3F248DBB, f550; +mul.f32 f552, f539, 0f3E31D0D4; +mul.f32 f553, f545, 0f3F7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0f3F7C1C5C, f555; +mul.f32 f557, f524, 0f3E31D0D4; +mul.f32 f558, f530, 0f3F7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0f3F7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0f3EAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0f3EAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0fBF5DB3D7; +add.f32 f573, f572, f570; +sub.f32 f574, f570, f572; +mul.f32 f575, f568, 0f3F000000; +sub.f32 f576, f502, f575; +sub.f32 f577, f516, f532; +mul.f32 f578, f577, 0fBF5DB3D7; +sub.f32 f579, f576, f578; +add.f32 f580, f578, f576; +add.f32 f581, f549, f554; +add.f32 f582, f507, f581; +add.f32 f583, f551, f556; +add.f32 f584, f513, f583; +mul.f32 f585, f581, 0f3F000000; +sub.f32 f586, f507, f585; +sub.f32 f587, f551, f556; +mul.f32 f588, f587, 0fBF5DB3D7; +add.f32 f589, f588, f586; +sub.f32 f590, f586, f588; +mul.f32 f591, f583, 0f3F000000; +sub.f32 f592, f513, f591; +sub.f32 f593, f549, f554; +mul.f32 f594, f593, 0fBF5DB3D7; +sub.f32 f595, f592, f594; +add.f32 f596, f594, f592; +add.f32 f597, f559, f564; +add.f32 f598, f508, 
f597; +add.f32 f599, f561, f566; +add.f32 f600, f514, f599; +mul.f32 f601, f597, 0f3F000000; +sub.f32 f602, f508, f601; +sub.f32 f603, f561, f566; +mul.f32 f604, f603, 0fBF5DB3D7; +add.f32 f605, f604, f602; +sub.f32 f606, f602, f604; +mul.f32 f607, f599, 0f3F000000; +sub.f32 f608, f514, f607; +sub.f32 f609, f559, f564; +mul.f32 f610, f609, 0fBF5DB3D7; +sub.f32 f611, f608, f610; +add.f32 f612, f610, f608; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f613, f614}, [rd16]; +mul.f32 f617, f584, f614; +mul.f32 f618, f582, f614; +mul.f32 f619, f613, f584; +mul.f32 f620, f613, f613; +mul.f32 f621, f614, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f614, f613; +fma.rn.f32 f624, f614, f613, f623; +mul.f32 f625, f600, f624; +mul.f32 f626, f598, f624; +mul.f32 f627, f622, f600; +mul.f32 f628, f613, f622; +mul.f32 f629, f614, f624; +sub.f32 f630, f628, f629; +mul.f32 f631, f613, f624; +fma.rn.f32 f632, f614, f622, f631; +mul.f32 f633, f579, f632; +mul.f32 f634, f573, f632; +mul.f32 f635, f630, f579; +mul.f32 f636, f613, f630; +mul.f32 f637, f614, f632; +sub.f32 f638, f636, f637; +mul.f32 f639, f613, f632; +fma.rn.f32 f640, f614, f630, f639; +mul.f32 f641, f595, f640; +mul.f32 f642, f589, f640; +mul.f32 f643, f638, f595; +mul.f32 f644, f613, f638; +mul.f32 f645, f614, f640; +sub.f32 f646, f644, f645; +mul.f32 f647, f613, f640; +fma.rn.f32 f648, f614, f638, f647; +mul.f32 f649, f611, f648; +mul.f32 f650, f605, f648; +mul.f32 f651, f646, f611; +mul.f32 f652, f613, f646; +mul.f32 f653, f614, f648; +sub.f32 f654, f652, f653; +mul.f32 f655, f613, f648; +fma.rn.f32 f656, f614, f646, f655; +mul.f32 f657, f580, f656; +mul.f32 f658, f574, f656; +mul.f32 f659, f654, f580; +mul.f32 f660, f613, f654; +mul.f32 f661, f614, f656; +sub.f32 f662, f660, f661; +mul.f32 f663, f613, f656; +fma.rn.f32 f664, f614, 
f654, f663; +mul.f32 f665, f596, f664; +mul.f32 f666, f590, f664; +mul.f32 f667, f662, f596; +mul.f32 f668, f613, f662; +mul.f32 f669, f614, f664; +sub.f32 f670, f668, f669; +mul.f32 f671, f613, f664; +fma.rn.f32 f672, f614, f662, f671; +mul.f32 f673, f612, f672; +mul.f32 f674, f606, f672; +mul.f32 f675, f670, f612; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +add.f32 f676, f502, f568; +add.f32 f677, f500, f567; +st.shared.v2.f32 [r23], {f677, f676}; +fma.rn.f32 f678, f613, f582, f617; +sub.f32 f679, f619, f618; +st.shared.v2.f32 [r23+648], {f678, f679}; +fma.rn.f32 f680, f622, f598, f625; +sub.f32 f681, f627, f626; +st.shared.v2.f32 [r23+1296], {f680, f681}; +fma.rn.f32 f682, f630, f573, f633; +sub.f32 f683, f635, f634; +st.shared.v2.f32 [r23+1944], {f682, f683}; +fma.rn.f32 f684, f638, f589, f641; +sub.f32 f685, f643, f642; +st.shared.v2.f32 [r23+2592], {f684, f685}; +fma.rn.f32 f686, f646, f605, f649; +sub.f32 f687, f651, f650; +st.shared.v2.f32 [r23+3240], {f686, f687}; +fma.rn.f32 f688, f654, f574, f657; +sub.f32 f689, f659, f658; +st.shared.v2.f32 [r23+3888], {f688, f689}; +sub.f32 f690, f667, f666; +fma.rn.f32 f691, f662, f590, f665; +st.shared.v2.f32 [r23+4536], {f691, f690}; +fma.rn.f32 f692, f670, f606, f673; +sub.f32 f693, f675, f674; +st.shared.v2.f32 [r23+5184], {f692, f693}; +barrier.sync 0; +ld.shared.v2.f32 {f694, f695}, [r11]; +ld.shared.v2.f32 {f698, f699}, [r11+1944]; +ld.shared.v2.f32 {f702, f703}, [r11+3888]; +ld.shared.v2.f32 {f706, f707}, [r11+5832]; +ld.shared.v2.f32 {f710, f711}, [r11+7776]; +ld.shared.v2.f32 {f714, f715}, [r11+9720]; +ld.shared.v2.f32 {f718, f719}, [r11+11664]; +ld.shared.v2.f32 {f722, f723}, [r11+13608]; +ld.shared.v2.f32 {f726, f727}, [r11+15552]; +add.f32 f730, f706, f718; +add.f32 f731, f707, f719; +mul.f32 f732, f730, 0f3F000000; +sub.f32 f733, f694, f732; +sub.f32 f734, f707, f719; +mul.f32 f735, f734, 0fBF5DB3D7; +mul.f32 f736, f731, 0f3F000000; +sub.f32 f737, f695, 
f736; +sub.f32 f738, f706, f718; +mul.f32 f739, f738, 0fBF5DB3D7; +add.f32 f740, f710, f722; +add.f32 f741, f711, f723; +mul.f32 f742, f740, 0f3F000000; +sub.f32 f743, f698, f742; +sub.f32 f744, f711, f723; +mul.f32 f745, f744, 0fBF5DB3D7; +mul.f32 f746, f741, 0f3F000000; +sub.f32 f747, f699, f746; +sub.f32 f748, f710, f722; +mul.f32 f749, f748, 0fBF5DB3D7; +add.f32 f750, f714, f726; +add.f32 f751, f715, f727; +mul.f32 f752, f750, 0f3F000000; +sub.f32 f753, f702, f752; +sub.f32 f754, f715, f727; +mul.f32 f755, f754, 0fBF5DB3D7; +mul.f32 f756, f751, 0f3F000000; +sub.f32 f757, f703, f756; +sub.f32 f758, f714, f726; +mul.f32 f759, f758, 0fBF5DB3D7; +add.f32 %1, f695, f731; +add.f32 %0, f694, f730; +add.f32 %3, f699, f741; +add.f32 %2, f698, f740; +add.f32 %5, f703, f751; +add.f32 %4, f702, f750; +sub.f32 %7, f737, f739; +add.f32 %6, f735, f733; +sub.f32 %9, f747, f749; +add.f32 %8, f745, f743; +sub.f32 %11, f757, f759; +add.f32 %10, f755, f753; +add.f32 %13, f739, f737; +sub.f32 %12, f733, f735; +add.f32 %15, f749, f747; +sub.f32 %14, f743, f745; +add.f32 %17, f759, f757; +sub.f32 %16, f753, f755; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_2187), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<348, float, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<724>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 8748, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 
0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, 
f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; +mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r8, r5, 8748, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f158; +st.shared.f32 [r9+8], f168; +st.shared.f32 [r9+12], f178; +st.shared.f32 
[r9+16], f188; +st.shared.f32 [r9+20], f198; +st.shared.f32 [r9+24], f208; +st.shared.f32 [r9+28], f218; +st.shared.f32 [r9+32], f228; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+972]; +ld.shared.f32 f234, [r11+1944]; +ld.shared.f32 f235, [r11+2916]; +ld.shared.f32 f236, [r11+3888]; +ld.shared.f32 f237, [r11+4860]; +ld.shared.f32 f238, [r11+5832]; +ld.shared.f32 f239, [r11+6804]; +ld.shared.f32 f240, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+972]; +ld.shared.f32 f243, [r11+1944]; +ld.shared.f32 f244, [r11+2916]; +ld.shared.f32 f245, [r11+3888]; +ld.shared.f32 f246, [r11+4860]; +ld.shared.f32 f247, [r11+5832]; +ld.shared.f32 f248, [r11+6804]; +ld.shared.f32 f249, [r11+7776]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0fBF5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0fBF5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0fBF5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 
f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0fBF5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0fBF5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0f3F248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0f3F248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0f3F7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0f3F7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0f3F7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0f3F7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0f3EAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0f3EAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0fBF5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0fBF5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, 
f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f337, f367; +fma.rn.f32 f371, f366, f335, f370; +mul.f32 f372, f335, f367; +mul.f32 f373, f366, f337; +sub.f32 f374, f373, f372; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f353, f379; +fma.rn.f32 f381, f377, f351, f380; +mul.f32 f382, f351, f379; +mul.f32 f383, f377, f353; +sub.f32 f384, f383, f382; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f332, f389; +fma.rn.f32 f391, f387, f326, f390; +mul.f32 f392, f326, f389; +mul.f32 f393, f387, f332; +sub.f32 f394, f393, f392; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f348, f399; +fma.rn.f32 f401, f397, f342, f400; +mul.f32 f402, f342, f399; +mul.f32 f403, f397, f348; +sub.f32 f404, f403, f402; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f364, f409; +fma.rn.f32 f411, f407, f358, f410; +mul.f32 f412, f358, f409; +mul.f32 f413, f407, f364; 
+sub.f32 f414, f413, f412; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f333, f419; +fma.rn.f32 f421, f417, f327, f420; +mul.f32 f422, f327, f419; +mul.f32 f423, f417, f333; +sub.f32 f424, f423, f422; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f349, f429; +fma.rn.f32 f431, f427, f343, f430; +mul.f32 f432, f343, f429; +mul.f32 f433, f427, f349; +sub.f32 f434, f433, f432; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f365, f439; +fma.rn.f32 f441, f437, f359, f440; +mul.f32 f442, f359, f439; +mul.f32 f443, f437, f365; +sub.f32 f444, f443, f442; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f371; +st.shared.f32 [r17+72], f381; +st.shared.f32 [r17+108], f391; +st.shared.f32 [r17+144], f401; +st.shared.f32 [r17+180], f411; +st.shared.f32 [r17+216], f421; +st.shared.f32 [r17+252], f431; +st.shared.f32 [r17+288], f441; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+972]; +ld.shared.f32 f447, [r11+1944]; +ld.shared.f32 f448, [r11+2916]; +ld.shared.f32 f449, [r11+3888]; +ld.shared.f32 f450, [r11+4860]; +ld.shared.f32 f451, [r11+5832]; +ld.shared.f32 f452, [r11+6804]; +ld.shared.f32 f453, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; +st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+972]; +ld.shared.f32 f456, [r11+1944]; +ld.shared.f32 
f457, [r11+2916]; +ld.shared.f32 f458, [r11+3888]; +ld.shared.f32 f459, [r11+4860]; +ld.shared.f32 f460, [r11+5832]; +ld.shared.f32 f461, [r11+6804]; +ld.shared.f32 f462, [r11+7776]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0fBF5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; +sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0fBF5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0fBF5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0fBF5DB3D7; +sub.f32 f493, f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0fBF5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, f507, 0fBF5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0f3F248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0f3F248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0f3F7C1C5C; +sub.f32 f518, f516, f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 0f3F7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0f3F7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, 
f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0f3F7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0f3EAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0f3EAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f464, f531; +add.f32 f533, f482, f498; +add.f32 f534, f466, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f464, f535; +sub.f32 f537, f482, f498; +mul.f32 f538, f537, 0fBF5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f466, f541; +sub.f32 f543, f480, f496; +mul.f32 f544, f543, 0fBF5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +add.f32 f547, f513, f518; +add.f32 f548, f471, f547; +add.f32 f549, f515, f520; +add.f32 f550, f477, f549; +mul.f32 f551, f547, 0f3F000000; +sub.f32 f552, f471, f551; +sub.f32 f553, f515, f520; +mul.f32 f554, f553, 0fBF5DB3D7; +add.f32 f555, f554, f552; +sub.f32 f556, f552, f554; +mul.f32 f557, f549, 0f3F000000; +sub.f32 f558, f477, f557; +sub.f32 f559, f513, f518; +mul.f32 f560, f559, 0fBF5DB3D7; +sub.f32 f561, f558, f560; +add.f32 f562, f560, f558; +add.f32 f563, f523, f528; +add.f32 f564, f472, f563; +add.f32 f565, f525, f530; +add.f32 f566, f478, f565; +mul.f32 f567, f563, 0f3F000000; +sub.f32 f568, f472, f567; +sub.f32 f569, f525, f530; +mul.f32 f570, f569, 0fBF5DB3D7; +add.f32 f571, f570, f568; +sub.f32 f572, f568, f570; +mul.f32 f573, f565, 0f3F000000; +sub.f32 f574, f478, f573; +sub.f32 f575, f523, f528; +mul.f32 f576, f575, 0fBF5DB3D7; +sub.f32 f577, f574, f576; +add.f32 f578, f576, f574; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f579, f580}, [rd16]; +mul.f32 f583, f550, f580; +fma.rn.f32 f584, f579, f548, f583; +mul.f32 f585, f548, f580; +mul.f32 f586, f579, f550; +sub.f32 f587, f586, f585; +mul.f32 
f588, f579, f579; +mul.f32 f589, f580, f580; +sub.f32 f590, f588, f589; +mul.f32 f591, f580, f579; +fma.rn.f32 f592, f580, f579, f591; +mul.f32 f593, f566, f592; +fma.rn.f32 f594, f590, f564, f593; +mul.f32 f595, f564, f592; +mul.f32 f596, f590, f566; +sub.f32 f597, f596, f595; +mul.f32 f598, f579, f590; +mul.f32 f599, f580, f592; +sub.f32 f600, f598, f599; +mul.f32 f601, f579, f592; +fma.rn.f32 f602, f580, f590, f601; +mul.f32 f603, f545, f602; +fma.rn.f32 f604, f600, f539, f603; +mul.f32 f605, f539, f602; +mul.f32 f606, f600, f545; +sub.f32 f607, f606, f605; +mul.f32 f608, f579, f600; +mul.f32 f609, f580, f602; +sub.f32 f610, f608, f609; +mul.f32 f611, f579, f602; +fma.rn.f32 f612, f580, f600, f611; +mul.f32 f613, f561, f612; +fma.rn.f32 f614, f610, f555, f613; +mul.f32 f615, f555, f612; +mul.f32 f616, f610, f561; +sub.f32 f617, f616, f615; +mul.f32 f618, f579, f610; +mul.f32 f619, f580, f612; +sub.f32 f620, f618, f619; +mul.f32 f621, f579, f612; +fma.rn.f32 f622, f580, f610, f621; +mul.f32 f623, f577, f622; +fma.rn.f32 f624, f620, f571, f623; +mul.f32 f625, f571, f622; +mul.f32 f626, f620, f577; +sub.f32 f627, f626, f625; +mul.f32 f628, f579, f620; +mul.f32 f629, f580, f622; +sub.f32 f630, f628, f629; +mul.f32 f631, f579, f622; +fma.rn.f32 f632, f580, f620, f631; +mul.f32 f633, f546, f632; +fma.rn.f32 f634, f630, f540, f633; +mul.f32 f635, f540, f632; +mul.f32 f636, f630, f546; +sub.f32 f637, f636, f635; +mul.f32 f638, f579, f630; +mul.f32 f639, f580, f632; +sub.f32 f640, f638, f639; +mul.f32 f641, f579, f632; +fma.rn.f32 f642, f580, f630, f641; +mul.f32 f643, f562, f642; +fma.rn.f32 f644, f640, f556, f643; +mul.f32 f645, f556, f642; +mul.f32 f646, f640, f562; +sub.f32 f647, f646, f645; +mul.f32 f648, f579, f640; +mul.f32 f649, f580, f642; +sub.f32 f650, f648, f649; +mul.f32 f651, f579, f642; +fma.rn.f32 f652, f580, f640, f651; +mul.f32 f653, f578, f652; +fma.rn.f32 f654, f650, f572, f653; +mul.f32 f655, f572, f652; +mul.f32 f656, f650, f578; +sub.f32 f657, 
f656, f655; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2916, r22; +st.shared.f32 [r23], f532; +st.shared.f32 [r23+324], f584; +st.shared.f32 [r23+648], f594; +st.shared.f32 [r23+972], f604; +st.shared.f32 [r23+1296], f614; +st.shared.f32 [r23+1620], f624; +st.shared.f32 [r23+1944], f634; +st.shared.f32 [r23+2268], f644; +st.shared.f32 [r23+2592], f654; +barrier.sync 0; +ld.shared.f32 f658, [r11]; +ld.shared.f32 f659, [r11+972]; +ld.shared.f32 f660, [r11+1944]; +ld.shared.f32 f661, [r11+2916]; +ld.shared.f32 f662, [r11+3888]; +ld.shared.f32 f663, [r11+4860]; +ld.shared.f32 f664, [r11+5832]; +ld.shared.f32 f665, [r11+6804]; +ld.shared.f32 f666, [r11+7776]; +barrier.sync 0; +st.shared.f32 [r23], f534; +st.shared.f32 [r23+324], f587; +st.shared.f32 [r23+648], f597; +st.shared.f32 [r23+972], f607; +st.shared.f32 [r23+1296], f617; +st.shared.f32 [r23+1620], f627; +st.shared.f32 [r23+1944], f637; +st.shared.f32 [r23+2268], f647; +st.shared.f32 [r23+2592], f657; +barrier.sync 0; +ld.shared.f32 f667, [r11]; +ld.shared.f32 f668, [r11+972]; +ld.shared.f32 f669, [r11+1944]; +ld.shared.f32 f670, [r11+2916]; +ld.shared.f32 f671, [r11+3888]; +ld.shared.f32 f672, [r11+4860]; +ld.shared.f32 f673, [r11+5832]; +ld.shared.f32 f674, [r11+6804]; +ld.shared.f32 f675, [r11+7776]; +add.f32 f676, f661, f664; +add.f32 f677, f670, f673; +mul.f32 f678, f676, 0f3F000000; +sub.f32 f679, f658, f678; +sub.f32 f680, f670, f673; +mul.f32 f681, f680, 0fBF5DB3D7; +mul.f32 f682, f677, 0f3F000000; +sub.f32 f683, f667, f682; +sub.f32 f684, f661, f664; +mul.f32 f685, f684, 0fBF5DB3D7; +add.f32 f686, f662, f665; +add.f32 f687, f671, f674; +mul.f32 f688, f686, 0f3F000000; +sub.f32 f689, f659, f688; +sub.f32 f690, f671, f674; +mul.f32 f691, f690, 0fBF5DB3D7; +mul.f32 f692, f687, 0f3F000000; +sub.f32 f693, f668, f692; +sub.f32 f694, f662, f665; +mul.f32 f695, f694, 0fBF5DB3D7; +add.f32 f696, f663, f666; +add.f32 f697, f672, f675; +mul.f32 f698, f696, 0f3F000000; 
+sub.f32 f699, f660, f698; +sub.f32 f700, f672, f675; +mul.f32 f701, f700, 0fBF5DB3D7; +mul.f32 f702, f697, 0f3F000000; +sub.f32 f703, f669, f702; +sub.f32 f704, f663, f666; +mul.f32 f705, f704, 0fBF5DB3D7; +add.f32 %0, f658, f676; +add.f32 %1, f667, f677; +add.f32 %2, f659, f686; +add.f32 %3, f668, f687; +add.f32 %4, f660, f696; +add.f32 %5, f669, f697; +add.f32 %6, f681, f679; +sub.f32 %7, f683, f685; +add.f32 %8, f691, f689; +sub.f32 %9, f693, f695; +add.f32 %10, f701, f699; +sub.f32 %11, f703, f705; +sub.f32 %12, f679, f681; +add.f32 %13, f685, f683; +sub.f32 %14, f689, f691; +add.f32 %15, f695, f693; +sub.f32 %16, f699, f701; +add.f32 %17, f705, f703; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_2187), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<349, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2360>; +.reg .b32 r<24>; +.reg .b64 rd<15>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 8748, r23; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2351, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2350, %58, f2351; +mul.f32 
f119, f2351, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2349, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2348, %64, f2349; +mul.f32 f135, f2349, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2347, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2346, %70, f2347; +mul.f32 f151, f2347, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2345, f133, 0f3F441B7D; +sub.f32 f159, f2345, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2343, f149, 0f3E31D0D4; +mul.f32 f2344, f155, 0f3F7C1C5C; +sub.f32 f164, f2343, f2344; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f2341, f134, 0f3E31D0D4; +mul.f32 f2342, f140, 0f3F7C1C5C; +sub.f32 f169, f2341, f2342; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2339, f150, 0fBF708FB2; +mul.f32 f2340, f156, 0f3EAF1D44; +sub.f32 f174, f2339, f2340; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2338, f2348, f2346; +sub.f32 f183, f2348, f2346; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, 
f182, f184; +add.f32 f2337, f2350, f2338; +mul.f32 f187, f2338, 0f3F000000; +sub.f32 f188, f2350, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2336, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2335, f123, f2336; +mul.f32 f203, f2336, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2334, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2333, f124, f2334; +mul.f32 f219, f2334, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2330, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2328, %113, f2330; +mul.f32 f235, f2330, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2325, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2323, %116, f2325; +mul.f32 f251, f2325, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; 
+sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2320, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2318, %119, f2320; +mul.f32 f267, f2320, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2317, f249, 0f3F441B7D; +sub.f32 f275, f2317, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2316, f265, 0f3E31D0D4; +sub.f32 f280, f2316, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2314, f250, 0f3E31D0D4; +mul.f32 f2315, f256, 0f3F7C1C5C; +sub.f32 f285, f2314, f2315; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2312, f266, 0fBF708FB2; +mul.f32 f2313, f272, 0f3EAF1D44; +sub.f32 f290, f2312, f2313; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2311, f2323, f2318; +sub.f32 f299, f2323, f2318; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2310, f2328, f2311; +mul.f32 f303, f2311, 0f3F000000; +sub.f32 f304, f2328, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2309, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2308, f239, f2309; +mul.f32 f319, f2309, 0f3F000000; +sub.f32 f320, f239, 
f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2307, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2306, f240, f2307; +mul.f32 f335, f2307, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2303, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2301, %122, f2303; +mul.f32 f351, f2303, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2298, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2296, %125, f2298; +mul.f32 f367, f2298, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2294, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2292, %127, f2294; +mul.f32 f383, f2294, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2291, 
f365, 0f3F441B7D; +sub.f32 f391, f2291, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2290, f381, 0f3E31D0D4; +sub.f32 f396, f2290, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2288, f366, 0f3E31D0D4; +mul.f32 f2289, f372, 0f3F7C1C5C; +sub.f32 f401, f2288, f2289; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2286, f382, 0fBF708FB2; +mul.f32 f2287, f388, 0f3EAF1D44; +sub.f32 f406, f2286, f2287; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2285, f2296, f2292; +sub.f32 f415, f2296, f2292; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2284, f2301, f2285; +mul.f32 f419, f2285, 0f3F000000; +sub.f32 f420, f2301, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2283, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2282, f355, f2283; +mul.f32 f435, f2283, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2281, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2280, f356, f2281; +mul.f32 f451, f2281, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; 
+add.f32 f456, f454, f452; +mul.f32 f458, f2308, 0f3E6C2691; +mul.f32 f2279, f310, 0f3F791978; +sub.f32 f459, f2279, f458; +mul.f32 f460, f2308, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2277, f426, 0f3F64C51C; +mul.f32 f2278, f2282, 0f3EE5C902; +sub.f32 f464, f2277, f2278; +mul.f32 f465, f2282, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2275, f326, 0f3F64C51C; +mul.f32 f2276, f2306, 0f3EE5C902; +sub.f32 f469, f2275, f2276; +mul.f32 f470, f2306, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2273, f442, 0f3F18DF63; +mul.f32 f2274, f2280, 0f3F4D57F2; +sub.f32 f474, f2273, f2274; +mul.f32 f475, f2280, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2271, f301, 0f3F441B7D; +mul.f32 f2272, f307, 0f3F248DBB; +sub.f32 f479, f2271, f2272; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2270, f417, 0f3E31D0D4; +sub.f32 f484, f2270, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2269, f317, 0f3F18DF63; +sub.f32 f489, f2269, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2268, f433, 0fBE92D7E0; +sub.f32 f494, f2268, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f2267, f333, 0f3ECACAF8; +sub.f32 f499, f2267, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2266, f449, 0fBF2FAD88; +sub.f32 f504, f2266, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2265, f302, 0f3E31D0D4; +sub.f32 f509, f2265, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2263, f418, 0fBF708FB2; +mul.f32 f2264, f424, 0f3EAF1D44; +sub.f32 f514, 
f2263, f2264; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2261, f318, 0fBD6E2946; +mul.f32 f2262, f324, 0f3F7F9120; +sub.f32 f519, f2261, f2262; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2259, f434, 0fBF7E44DE; +mul.f32 f2260, f440, 0fBDEDC21F; +sub.f32 f524, f2259, f2260; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2258, f334, 0fBE92D7E0; +sub.f32 f529, f2258, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2257, f450, 0fBF55E287; +sub.f32 f534, f2257, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2256, f2310, f2284; +sub.f32 f543, f2310, f2284; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2255, f2337, f2256; +mul.f32 f547, f2256, 0f3F000000; +sub.f32 f548, f2337, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2254, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2253, f2335, f2254; +mul.f32 f563, f2254, 0f3F000000; +sub.f32 f564, f2335, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f2252, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2251, f2333, f2252; +mul.f32 
f579, f2252, 0f3F000000; +sub.f32 f580, f2333, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2250, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2249, f191, f2250; +mul.f32 f595, f2250, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2248, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2247, f207, f2248; +mul.f32 f611, f2248, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2246, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2245, f223, f2246; +mul.f32 f627, f2246, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2244, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f2243, f192, f2244; +mul.f32 f643, f2244, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, 
f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2242, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2241, f208, f2242; +mul.f32 f659, f2242, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2240, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2239, f224, f2240; +mul.f32 f675, f2240, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f2253, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f2253; +sub.f32 f689, f688, f687; +mul.f32 f691, f682, f682; +mul.f32 f2238, f681, f681; +sub.f32 f692, f2238, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f2251, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f2251; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f2237, f681, f692; +sub.f32 f702, f2237, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f2249, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f2249; +sub.f32 f709, f708, f707; +mul.f32 f2235, f681, f702; +mul.f32 f2236, f682, f704; +sub.f32 f712, f2235, 
f2236; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f2247, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f2247; +sub.f32 f719, f718, f717; +mul.f32 f2233, f681, f712; +mul.f32 f2234, f682, f714; +sub.f32 f722, f2233, f2234; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f2245, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f2245; +sub.f32 f729, f728, f727; +mul.f32 f731, f682, f724; +mul.f32 f2232, f681, f722; +sub.f32 f732, f2232, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f2243, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f2243; +sub.f32 f739, f738, f737; +mul.f32 f741, f682, f734; +mul.f32 f2231, f681, f732; +sub.f32 f742, f2231, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f2241, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f2241; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f2230, f681, f742; +sub.f32 f752, f2230, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f2239, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f2239; +sub.f32 f759, f758, f757; +mul.f32 f2228, f681, f752; +mul.f32 f2229, f682, f754; +sub.f32 f762, f2228, f2229; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f2226, f681, f762; +mul.f32 f2227, f682, f764; +sub.f32 f772, f2226, f2227; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f781, f682, f774; 
+mul.f32 f2225, f681, f772; +sub.f32 f782, f2225, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; +mul.f32 f791, f682, f784; +mul.f32 f2224, f681, f782; +sub.f32 f792, f2224, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f2223, f681, f792; +sub.f32 f802, f2223, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f2221, f681, f802; +mul.f32 f2222, f682, f804; +sub.f32 f812, f2221, f2222; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f2219, f681, f812; +mul.f32 f2220, f682, f814; +sub.f32 f822, f2219, f2220; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 f829, f828, f827; +mul.f32 f831, f682, f824; +mul.f32 f2218, f681, f822; +sub.f32 f832, f2218, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, f682, f834; +mul.f32 f2217, f681, f832; +sub.f32 f842, f2217, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, 
f848, f847; +mul.f32 f2215, f681, f842; +mul.f32 f2216, f682, f844; +sub.f32 f852, f2215, f2216; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f2213, f681, f852; +mul.f32 f2214, f682, f854; +sub.f32 f862, f2213, f2214; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f871, f682, f864; +mul.f32 f2212, f681, f862; +sub.f32 f872, f2212, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f881, f682, f874; +mul.f32 f2211, f681, f872; +sub.f32 f882, f2211, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f2210, f681, f882; +sub.f32 f892, f2210, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; +mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f2208, f681, f892; +mul.f32 f2209, f682, f894; +sub.f32 f902, f2208, f2209; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; +sub.f32 f909, f908, f907; +mul.f32 f2206, f681, f902; +mul.f32 f2207, f682, f904; +sub.f32 f912, f2206, f2207; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; 
+mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f921, f682, f914; +mul.f32 f2205, f681, f912; +sub.f32 f922, f2205, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; +mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f2204, f681, f922; +sub.f32 f932, f2204, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r8, r5, 8748, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f686; +st.shared.f32 [r9+8], f696; +st.shared.f32 [r9+12], f706; +st.shared.f32 [r9+16], f716; +st.shared.f32 [r9+20], f726; +st.shared.f32 [r9+24], f736; +st.shared.f32 [r9+28], f746; +st.shared.f32 [r9+32], f756; +st.shared.f32 [r9+36], f766; +st.shared.f32 [r9+40], f776; +st.shared.f32 [r9+44], f786; +st.shared.f32 [r9+48], f796; +st.shared.f32 [r9+52], f806; +st.shared.f32 [r9+56], f816; +st.shared.f32 [r9+60], f826; +st.shared.f32 [r9+64], f836; +st.shared.f32 [r9+68], f846; +st.shared.f32 [r9+72], f856; +st.shared.f32 [r9+76], f866; +st.shared.f32 [r9+80], f876; +st.shared.f32 [r9+84], f886; +st.shared.f32 [r9+88], f896; +st.shared.f32 [r9+92], f906; +st.shared.f32 [r9+96], f916; +st.shared.f32 [r9+100], f926; +st.shared.f32 [r9+104], f936; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+324]; +ld.shared.f32 f942, [r10+648]; +ld.shared.f32 f943, [r10+972]; +ld.shared.f32 f944, [r10+1296]; +ld.shared.f32 f945, [r10+1620]; +ld.shared.f32 f946, [r10+1944]; +ld.shared.f32 f947, [r10+2268]; +ld.shared.f32 f948, [r10+2592]; +ld.shared.f32 f949, [r10+2916]; +ld.shared.f32 f950, [r10+3240]; +ld.shared.f32 f951, [r10+3564]; +ld.shared.f32 f952, [r10+3888]; 
+ld.shared.f32 f953, [r10+4212]; +ld.shared.f32 f954, [r10+4536]; +ld.shared.f32 f955, [r10+4860]; +ld.shared.f32 f956, [r10+5184]; +ld.shared.f32 f957, [r10+5508]; +ld.shared.f32 f958, [r10+5832]; +ld.shared.f32 f959, [r10+6156]; +ld.shared.f32 f960, [r10+6480]; +ld.shared.f32 f961, [r10+6804]; +ld.shared.f32 f962, [r10+7128]; +ld.shared.f32 f963, [r10+7452]; +ld.shared.f32 f964, [r10+7776]; +ld.shared.f32 f965, [r10+8100]; +ld.shared.f32 f966, [r10+8424]; +barrier.sync 0; +st.shared.f32 [r9], f2255; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +ld.shared.f32 f2203, [r10+5832]; +ld.shared.f32 f2202, [r10+2916]; +add.f32 f2201, f2202, f2203; +sub.f32 f1000, f2202, f2203; +mul.f32 f1001, f1000, 0fBF5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2200, [r10]; +add.f32 f2199, f2200, f2201; +mul.f32 f1004, f2201, 0f3F000000; +sub.f32 f1005, f2200, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0fBF5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +ld.shared.f32 f2198, 
[r10+6804]; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2197, [r10+3888]; +add.f32 f2196, f2197, f2198; +sub.f32 f1016, f2197, f2198; +mul.f32 f1017, f1016, 0fBF5DB3D7; +ld.shared.f32 f2195, [r10+972]; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f2194, f2195, f2196; +mul.f32 f1020, f2196, 0f3F000000; +sub.f32 f1021, f2195, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0fBF5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2193, [r10+7776]; +ld.shared.f32 f2192, [r10+4860]; +sub.f32 f1031, f946, f1030; +add.f32 f2191, f2192, f2193; +sub.f32 f1032, f2192, f2193; +mul.f32 f1033, f1032, 0fBF5DB3D7; +ld.shared.f32 f2190, [r10+1944]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2189, f2190, f2191; +mul.f32 f1036, f2191, 0f3F000000; +sub.f32 f1037, f2190, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0fBF5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2187, f1018, 0f3F441B7D; +mul.f32 f2188, f1024, 0f3F248DBB; +sub.f32 f1044, f2187, f2188; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045; +mul.f32 f1048, f1040, 0f3F7C1C5C; +mul.f32 f2186, f1034, 0f3E31D0D4; +sub.f32 f1049, f2186, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050; +mul.f32 f1053, f1025, 0f3F7C1C5C; +mul.f32 f2185, f1019, 0f3E31D0D4; +sub.f32 f1054, f2185, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055; +mul.f32 f1058, f1041, 0f3EAF1D44; +mul.f32 f2184, f1035, 0fBF708FB2; +sub.f32 f1059, f2184, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2183, f2194, f2189; +sub.f32 f1068, f2194, f2189; +mul.f32 f1069, 
f1068, 0fBF5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2182, f2199, f2183; +mul.f32 f1072, f2183, 0f3F000000; +sub.f32 f1073, f2199, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0fBF5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2181, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0fBF5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2180, f1008, f2181; +mul.f32 f1088, f2181, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0fBF5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2179, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0fBF5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2178, f1009, f2179; +mul.f32 f1104, f2179, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0fBF5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2177, [r10+6156]; +ld.shared.f32 f2176, [r10+3240]; +add.f32 f2175, f2176, f2177; +sub.f32 f1116, f2176, f2177; +mul.f32 f1117, f1116, 0fBF5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +ld.shared.f32 f2174, [r10+324]; +add.f32 f2173, f2174, f2175; +mul.f32 f1120, f2175, 0f3F000000; +sub.f32 f1121, f2174, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0fBF5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; 
+ld.shared.f32 f2172, [r10+7128]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2171, [r10+4212]; +add.f32 f2170, f2171, f2172; +sub.f32 f1132, f2171, f2172; +mul.f32 f1133, f1132, 0fBF5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +ld.shared.f32 f2169, [r10+1296]; +add.f32 f2168, f2169, f2170; +mul.f32 f1136, f2170, 0f3F000000; +sub.f32 f1137, f2169, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0fBF5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2167, [r10+5184]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 f2166, [r10+8100]; +add.f32 f2165, f2167, f2166; +sub.f32 f1148, f2167, f2166; +mul.f32 f1149, f1148, 0fBF5DB3D7; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +ld.shared.f32 f2164, [r10+2268]; +add.f32 f2163, f2164, f2165; +mul.f32 f1152, f2165, 0f3F000000; +sub.f32 f1153, f2164, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0fBF5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2161, f1134, 0f3F441B7D; +mul.f32 f2162, f1140, 0f3F248DBB; +sub.f32 f1160, f2161, f2162; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0f3F248DBB, f1161; +mul.f32 f2159, f1150, 0f3E31D0D4; +mul.f32 f2160, f1156, 0f3F7C1C5C; +sub.f32 f1165, f2159, f2160; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0f3F7C1C5C, f1166; +mul.f32 f1169, f1141, 0f3F7C1C5C; +mul.f32 f2158, f1135, 0f3E31D0D4; +sub.f32 f1170, f2158, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0f3F7C1C5C, f1171; +mul.f32 f1174, f1157, 0f3EAF1D44; +mul.f32 f2157, f1151, 0fBF708FB2; +sub.f32 f1175, f2157, f1174; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0f3EAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2156, f2168, f2163; +sub.f32 f1184, f2168, 
f2163; +mul.f32 f1185, f1184, 0fBF5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2155, f2173, f2156; +mul.f32 f1188, f2156, 0f3F000000; +sub.f32 f1189, f2173, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0fBF5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2154, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0fBF5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2153, f1124, f2154; +mul.f32 f1204, f2154, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0fBF5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2152, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0fBF5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2151, f1125, f2152; +mul.f32 f1220, f2152, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0fBF5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2150, [r10+3564]; +sub.f32 f1231, f942, f1230; +ld.shared.f32 f2149, [r10+6480]; +add.f32 f2148, f2150, f2149; +sub.f32 f1232, f2150, f2149; +mul.f32 f1233, f1232, 0fBF5DB3D7; +ld.shared.f32 f2147, [r10+648]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f2146, f2147, f2148; +mul.f32 f1236, f2148, 0f3F000000; +sub.f32 f1237, f2147, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0fBF5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 
0f3F000000; +ld.shared.f32 f2145, [r10+4536]; +ld.shared.f32 f2144, [r10+7452]; +sub.f32 f1247, f945, f1246; +add.f32 f2143, f2145, f2144; +sub.f32 f1248, f2145, f2144; +mul.f32 f1249, f1248, 0fBF5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2142, [r10+1620]; +add.f32 f2141, f2142, f2143; +mul.f32 f1252, f2143, 0f3F000000; +sub.f32 f1253, f2142, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0fBF5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2140, [r10+8424]; +ld.shared.f32 f2139, [r10+5508]; +add.f32 f2138, f2139, f2140; +sub.f32 f1264, f2139, f2140; +mul.f32 f1265, f1264, 0fBF5DB3D7; +ld.shared.f32 f2137, [r10+2592]; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f2136, f2137, f2138; +mul.f32 f1268, f2138, 0f3F000000; +sub.f32 f1269, f2137, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0fBF5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2134, f1250, 0f3F441B7D; +mul.f32 f2135, f1256, 0f3F248DBB; +sub.f32 f1276, f2134, f2135; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0f3F248DBB, f1277; +mul.f32 f2132, f1266, 0f3E31D0D4; +mul.f32 f2133, f1272, 0f3F7C1C5C; +sub.f32 f1281, f2132, f2133; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0f3F7C1C5C, f1282; +mul.f32 f1285, f1257, 0f3F7C1C5C; +mul.f32 f2131, f1251, 0f3E31D0D4; +sub.f32 f1286, f2131, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0f3F7C1C5C, f1287; +mul.f32 f1290, f1273, 0f3EAF1D44; +mul.f32 f2130, f1267, 0fBF708FB2; +sub.f32 f1291, f2130, f1290; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0f3EAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2129, f2141, f2136; +sub.f32 
f1300, f2141, f2136; +mul.f32 f1301, f1300, 0fBF5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2128, f2146, f2129; +mul.f32 f1304, f2129, 0f3F000000; +sub.f32 f1305, f2146, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0fBF5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2127, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0fBF5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2126, f1240, f2127; +mul.f32 f1320, f2127, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0fBF5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2125, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0fBF5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2124, f1241, f2125; +mul.f32 f1336, f2125, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0fBF5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2153, 0f3E6C2691; +mul.f32 f2123, f1195, 0f3F791978; +sub.f32 f1344, f2123, f1343; +mul.f32 f1345, f2153, 0f3F791978; +fma.rn.f32 f1346, f1195, 0f3E6C2691, f1345; +mul.f32 f2121, f1311, 0f3F64C51C; +mul.f32 f2122, f2126, 0f3EE5C902; +sub.f32 f1349, f2121, f2122; +mul.f32 f1350, f2126, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0f3EE5C902, f1350; +mul.f32 f2119, f1211, 0f3F64C51C; +mul.f32 f2120, f2151, 0f3EE5C902; +sub.f32 f1354, f2119, f2120; +mul.f32 f1355, f2151, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0f3EE5C902, f1355; +mul.f32 f2117, f1327, 0f3F18DF63; +mul.f32 f2118, f2124, 0f3F4D57F2; +sub.f32 f1359, f2117, f2118; +mul.f32 f1360, f2124, 
0f3F18DF63; +fma.rn.f32 f1361, f1327, 0f3F4D57F2, f1360; +mul.f32 f2115, f1186, 0f3F441B7D; +mul.f32 f2116, f1192, 0f3F248DBB; +sub.f32 f1364, f2115, f2116; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0f3F248DBB, f1365; +mul.f32 f1368, f1308, 0f3F7C1C5C; +mul.f32 f2114, f1302, 0f3E31D0D4; +sub.f32 f1369, f2114, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0f3F7C1C5C, f1370; +mul.f32 f1373, f1208, 0f3F4D57F2; +mul.f32 f2113, f1202, 0f3F18DF63; +sub.f32 f1374, f2113, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0f3F4D57F2, f1375; +mul.f32 f1378, f1324, 0f3F753ECD; +mul.f32 f2112, f1318, 0fBE92D7E0; +sub.f32 f1379, f2112, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0f3F753ECD, f1380; +mul.f32 f1383, f1224, 0f3F6B1036; +mul.f32 f2111, f1218, 0f3ECACAF8; +sub.f32 f1384, f2111, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0f3F6B1036, f1385; +mul.f32 f1388, f1340, 0f3F3A3529; +mul.f32 f2110, f1334, 0fBF2FAD88; +sub.f32 f1389, f2110, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0f3F3A3529, f1390; +mul.f32 f1393, f1193, 0f3F7C1C5C; +mul.f32 f2109, f1187, 0f3E31D0D4; +sub.f32 f1394, f2109, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0f3F7C1C5C, f1395; +mul.f32 f2107, f1303, 0fBF708FB2; +mul.f32 f2108, f1309, 0f3EAF1D44; +sub.f32 f1399, f2107, f2108; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0f3EAF1D44, f1400; +mul.f32 f2105, f1203, 0fBD6E2946; +mul.f32 f2106, f1209, 0f3F7F9120; +sub.f32 f1404, f2105, f2106; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0f3F7F9120, f1405; +mul.f32 f2103, f1319, 0fBF7E44DE; +mul.f32 f2104, f1325, 0fBDEDC21F; +sub.f32 f1409, f2103, f2104; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0fBDEDC21F, f1410; +mul.f32 f1413, f1225, 0f3F753ECD; +mul.f32 f2102, f1219, 0fBE92D7E0; +sub.f32 f1414, f2102, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; 
+fma.rn.f32 f1416, f1219, 0f3F753ECD, f1415; +mul.f32 f1418, f1341, 0fBF0CAC9F; +mul.f32 f2101, f1335, 0fBF55E287; +sub.f32 f1419, f2101, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0fBF0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2100, f2155, f2128; +sub.f32 f1428, f2155, f2128; +mul.f32 f1429, f1428, 0fBF5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2099, f2182, f2100; +mul.f32 f1432, f2100, 0f3F000000; +sub.f32 f1433, f2182, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 0fBF5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2098, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0fBF5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2097, f2180, f2098; +mul.f32 f1448, f2098, 0f3F000000; +sub.f32 f1449, f2180, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0fBF5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2096, f1356, f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0fBF5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2095, f2178, f2096; +mul.f32 f1464, f2096, 0f3F000000; +sub.f32 f1465, f2178, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0fBF5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; +add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2094, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0fBF5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, 
f1477; +add.f32 f2093, f1076, f2094; +mul.f32 f1480, f2094, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0fBF5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; +add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2092, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0fBF5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2091, f1092, f2092; +mul.f32 f1496, f2092, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 0fBF5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2090, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0fBF5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2089, f1108, f2090; +mul.f32 f1512, f2090, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0fBF5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2088, f1396, f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0fBF5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2087, f1077, f2088; +mul.f32 f1528, f2088, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0fBF5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; +add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2086, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0fBF5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, 
f1541; +add.f32 f2085, f1093, f2086; +mul.f32 f1544, f2086, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0fBF5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; +add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2084, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0fBF5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2083, f1109, f2084; +mul.f32 f1560, f2084, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 0fBF5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f1570, f2097, f1567; +fma.rn.f32 f1571, f1566, f1439, f1570; +mul.f32 f1572, f1439, f1567; +mul.f32 f1573, f1566, f2097; +sub.f32 f1574, f1573, f1572; +mul.f32 f2081, f1566, f1566; +mul.f32 f2082, f1567, f1567; +sub.f32 f1577, f2081, f2082; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, f1566, f1578; +mul.f32 f1580, f2095, f1579; +fma.rn.f32 f1581, f1577, f1455, f1580; +mul.f32 f1582, f1455, f1579; +mul.f32 f1583, f1577, f2095; +sub.f32 f1584, f1583, f1582; +mul.f32 f1586, f1567, f1579; +mul.f32 f2080, f1566, f1577; +sub.f32 f1587, f2080, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; +mul.f32 f1590, f2093, f1589; +fma.rn.f32 f1591, f1587, f1471, f1590; +mul.f32 f1592, f1471, f1589; +mul.f32 f1593, f1587, f2093; +sub.f32 f1594, f1593, f1592; +mul.f32 f1596, f1567, f1589; +mul.f32 f2079, f1566, f1587; +sub.f32 f1597, f2079, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 
f1599, f1567, f1587, f1598; +mul.f32 f1600, f2091, f1599; +fma.rn.f32 f1601, f1597, f1487, f1600; +mul.f32 f1602, f1487, f1599; +mul.f32 f1603, f1597, f2091; +sub.f32 f1604, f1603, f1602; +mul.f32 f1606, f1567, f1599; +mul.f32 f2078, f1566, f1597; +sub.f32 f1607, f2078, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f1610, f2089, f1609; +fma.rn.f32 f1611, f1607, f1503, f1610; +mul.f32 f1612, f1503, f1609; +mul.f32 f1613, f1607, f2089; +sub.f32 f1614, f1613, f1612; +mul.f32 f2076, f1566, f1607; +mul.f32 f2077, f1567, f1609; +sub.f32 f1617, f2076, f2077; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, f1607, f1618; +mul.f32 f1620, f2087, f1619; +fma.rn.f32 f1621, f1617, f1519, f1620; +mul.f32 f1622, f1519, f1619; +mul.f32 f1623, f1617, f2087; +sub.f32 f1624, f1623, f1622; +mul.f32 f2074, f1566, f1617; +mul.f32 f2075, f1567, f1619; +sub.f32 f1627, f2074, f2075; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1630, f2085, f1629; +fma.rn.f32 f1631, f1627, f1535, f1630; +mul.f32 f1632, f1535, f1629; +mul.f32 f1633, f1627, f2085; +sub.f32 f1634, f1633, f1632; +mul.f32 f1636, f1567, f1629; +mul.f32 f2073, f1566, f1627; +sub.f32 f1637, f2073, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1640, f2083, f1639; +fma.rn.f32 f1641, f1637, f1551, f1640; +mul.f32 f1642, f1551, f1639; +mul.f32 f1643, f1637, f2083; +sub.f32 f1644, f1643, f1642; +mul.f32 f1646, f1567, f1639; +mul.f32 f2072, f1566, f1637; +sub.f32 f1647, f2072, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1650, f1436, f1649; +fma.rn.f32 f1651, f1647, f1430, f1650; +mul.f32 f1652, f1430, f1649; +mul.f32 f1653, f1647, f1436; +sub.f32 f1654, f1653, f1652; +mul.f32 f2070, f1566, f1647; +mul.f32 f2071, f1567, f1649; +sub.f32 f1657, f2070, f2071; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f1660, f1452, f1659; +fma.rn.f32 f1661, 
f1657, f1446, f1660; +mul.f32 f1662, f1446, f1659; +mul.f32 f1663, f1657, f1452; +sub.f32 f1664, f1663, f1662; +mul.f32 f2068, f1566, f1657; +mul.f32 f2069, f1567, f1659; +sub.f32 f1667, f2068, f2069; +mul.f32 f1668, f1566, f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f1670, f1468, f1669; +fma.rn.f32 f1671, f1667, f1462, f1670; +mul.f32 f1672, f1462, f1669; +mul.f32 f1673, f1667, f1468; +sub.f32 f1674, f1673, f1672; +mul.f32 f1676, f1567, f1669; +mul.f32 f2067, f1566, f1667; +sub.f32 f1677, f2067, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1680, f1484, f1679; +fma.rn.f32 f1681, f1677, f1478, f1680; +mul.f32 f1682, f1478, f1679; +mul.f32 f1683, f1677, f1484; +sub.f32 f1684, f1683, f1682; +mul.f32 f1686, f1567, f1679; +mul.f32 f2066, f1566, f1677; +sub.f32 f1687, f2066, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1690, f1500, f1689; +fma.rn.f32 f1691, f1687, f1494, f1690; +mul.f32 f1692, f1494, f1689; +mul.f32 f1693, f1687, f1500; +sub.f32 f1694, f1693, f1692; +mul.f32 f1696, f1567, f1689; +mul.f32 f2065, f1566, f1687; +sub.f32 f1697, f2065, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1700, f1516, f1699; +fma.rn.f32 f1701, f1697, f1510, f1700; +mul.f32 f1702, f1510, f1699; +mul.f32 f1703, f1697, f1516; +sub.f32 f1704, f1703, f1702; +mul.f32 f2063, f1566, f1697; +mul.f32 f2064, f1567, f1699; +sub.f32 f1707, f2063, f2064; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f1710, f1532, f1709; +fma.rn.f32 f1711, f1707, f1526, f1710; +mul.f32 f1712, f1526, f1709; +mul.f32 f1713, f1707, f1532; +sub.f32 f1714, f1713, f1712; +mul.f32 f2061, f1566, f1707; +mul.f32 f2062, f1567, f1709; +sub.f32 f1717, f2061, f2062; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1720, f1548, f1719; +fma.rn.f32 f1721, f1717, f1542, f1720; +mul.f32 f1722, f1542, f1719; +mul.f32 f1723, f1717, 
f1548; +sub.f32 f1724, f1723, f1722; +mul.f32 f1726, f1567, f1719; +mul.f32 f2060, f1566, f1717; +sub.f32 f1727, f2060, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1730, f1564, f1729; +fma.rn.f32 f1731, f1727, f1558, f1730; +mul.f32 f1732, f1558, f1729; +mul.f32 f1733, f1727, f1564; +sub.f32 f1734, f1733, f1732; +mul.f32 f1736, f1567, f1729; +mul.f32 f2059, f1566, f1727; +sub.f32 f1737, f2059, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1740, f1437, f1739; +fma.rn.f32 f1741, f1737, f1431, f1740; +mul.f32 f1742, f1431, f1739; +mul.f32 f1743, f1737, f1437; +sub.f32 f1744, f1743, f1742; +mul.f32 f1746, f1567, f1739; +mul.f32 f2058, f1566, f1737; +sub.f32 f1747, f2058, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f1750, f1453, f1749; +fma.rn.f32 f1751, f1747, f1447, f1750; +mul.f32 f1752, f1447, f1749; +mul.f32 f1753, f1747, f1453; +sub.f32 f1754, f1753, f1752; +mul.f32 f2056, f1566, f1747; +mul.f32 f2057, f1567, f1749; +sub.f32 f1757, f2056, f2057; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f1760, f1469, f1759; +fma.rn.f32 f1761, f1757, f1463, f1760; +mul.f32 f1762, f1463, f1759; +mul.f32 f1763, f1757, f1469; +sub.f32 f1764, f1763, f1762; +mul.f32 f2054, f1566, f1757; +mul.f32 f2055, f1567, f1759; +sub.f32 f1767, f2054, f2055; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1770, f1485, f1769; +fma.rn.f32 f1771, f1767, f1479, f1770; +mul.f32 f1772, f1479, f1769; +mul.f32 f1773, f1767, f1485; +sub.f32 f1774, f1773, f1772; +mul.f32 f1776, f1567, f1769; +mul.f32 f2053, f1566, f1767; +sub.f32 f1777, f2053, f1776; +mul.f32 f1778, f1566, f1769; +fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1780, f1501, f1779; +fma.rn.f32 f1781, f1777, f1495, f1780; +mul.f32 f1782, f1495, f1779; +mul.f32 f1783, f1777, f1501; +sub.f32 f1784, f1783, f1782; +mul.f32 f1786, f1567, f1779; +mul.f32 
f2052, f1566, f1777; +sub.f32 f1787, f2052, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1790, f1517, f1789; +fma.rn.f32 f1791, f1787, f1511, f1790; +mul.f32 f1792, f1511, f1789; +mul.f32 f1793, f1787, f1517; +sub.f32 f1794, f1793, f1792; +mul.f32 f2050, f1566, f1787; +mul.f32 f2051, f1567, f1789; +sub.f32 f1797, f2050, f2051; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f1800, f1533, f1799; +fma.rn.f32 f1801, f1797, f1527, f1800; +mul.f32 f1802, f1527, f1799; +mul.f32 f1803, f1797, f1533; +sub.f32 f1804, f1803, f1802; +mul.f32 f2048, f1566, f1797; +mul.f32 f2049, f1567, f1799; +sub.f32 f1807, f2048, f2049; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f1810, f1549, f1809; +fma.rn.f32 f1811, f1807, f1543, f1810; +mul.f32 f1812, f1543, f1809; +mul.f32 f1813, f1807, f1549; +sub.f32 f1814, f1813, f1812; +mul.f32 f1816, f1567, f1809; +mul.f32 f2047, f1566, f1807; +sub.f32 f1817, f2047, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1820, f1565, f1819; +fma.rn.f32 f1821, f1817, f1559, f1820; +mul.f32 f1822, f1559, f1819; +mul.f32 f1823, f1817, f1565; +sub.f32 f1824, f1823, f1822; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; +st.shared.f32 [r20+108], f1571; +st.shared.f32 [r20+216], f1581; +st.shared.f32 [r20+324], f1591; +st.shared.f32 [r20+432], f1601; +st.shared.f32 [r20+540], f1611; +st.shared.f32 [r20+648], f1621; +st.shared.f32 [r20+756], f1631; +st.shared.f32 [r20+864], f1641; +st.shared.f32 [r20+972], f1651; +st.shared.f32 [r20+1080], f1661; +st.shared.f32 [r20+1188], f1671; +st.shared.f32 [r20+1296], f1681; +st.shared.f32 [r20+1404], f1691; +st.shared.f32 [r20+1512], f1701; +st.shared.f32 [r20+1620], f1711; +st.shared.f32 [r20+1728], f1721; +st.shared.f32 [r20+1836], f1731; +st.shared.f32 [r20+1944], f1741; +st.shared.f32 [r20+2052], 
f1751; +st.shared.f32 [r20+2160], f1761; +st.shared.f32 [r20+2268], f1771; +st.shared.f32 [r20+2376], f1781; +st.shared.f32 [r20+2484], f1791; +st.shared.f32 [r20+2592], f1801; +st.shared.f32 [r20+2700], f1811; +st.shared.f32 [r20+2808], f1821; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+324]; +ld.shared.f32 f1827, [r10+648]; +ld.shared.f32 f1828, [r10+972]; +ld.shared.f32 f1829, [r10+1296]; +ld.shared.f32 f1830, [r10+1620]; +ld.shared.f32 f1831, [r10+1944]; +ld.shared.f32 f1832, [r10+2268]; +ld.shared.f32 f1833, [r10+2592]; +ld.shared.f32 f1834, [r10+2916]; +ld.shared.f32 f1835, [r10+3240]; +ld.shared.f32 f1836, [r10+3564]; +ld.shared.f32 f1837, [r10+3888]; +ld.shared.f32 f1838, [r10+4212]; +ld.shared.f32 f1839, [r10+4536]; +ld.shared.f32 f1840, [r10+4860]; +ld.shared.f32 f1841, [r10+5184]; +ld.shared.f32 f1842, [r10+5508]; +ld.shared.f32 f1843, [r10+5832]; +ld.shared.f32 f1844, [r10+6156]; +ld.shared.f32 f1845, [r10+6480]; +ld.shared.f32 f1846, [r10+6804]; +ld.shared.f32 f1847, [r10+7128]; +ld.shared.f32 f1848, [r10+7452]; +ld.shared.f32 f1849, [r10+7776]; +ld.shared.f32 f1850, [r10+8100]; +ld.shared.f32 f1851, [r10+8424]; +barrier.sync 0; +st.shared.f32 [r20], f2099; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; +st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; 
+st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+324]; +ld.shared.f32 f1854, [r10+648]; +ld.shared.f32 f1855, [r10+972]; +ld.shared.f32 f1856, [r10+1296]; +ld.shared.f32 f1857, [r10+1620]; +ld.shared.f32 f1858, [r10+1944]; +ld.shared.f32 f1859, [r10+2268]; +ld.shared.f32 f1860, [r10+2592]; +ld.shared.f32 f1861, [r10+2916]; +ld.shared.f32 f1862, [r10+3240]; +ld.shared.f32 f1863, [r10+3564]; +ld.shared.f32 f1864, [r10+3888]; +ld.shared.f32 f1865, [r10+4212]; +ld.shared.f32 f1866, [r10+4536]; +ld.shared.f32 f1867, [r10+4860]; +ld.shared.f32 f1868, [r10+5184]; +ld.shared.f32 f1869, [r10+5508]; +ld.shared.f32 f1870, [r10+5832]; +ld.shared.f32 f1871, [r10+6156]; +ld.shared.f32 f1872, [r10+6480]; +ld.shared.f32 f1873, [r10+6804]; +ld.shared.f32 f1874, [r10+7128]; +ld.shared.f32 f1875, [r10+7452]; +ld.shared.f32 f1876, [r10+7776]; +ld.shared.f32 f1877, [r10+8100]; +ld.shared.f32 f1878, [r10+8424]; +add.f32 f1879, f1834, f1843; +mul.f32 f1881, f1879, 0f3F000000; +sub.f32 f1882, f1825, f1881; +add.f32 f2046, f1861, f1870; +sub.f32 f1883, f1861, f1870; +mul.f32 f1884, f1883, 0fBF5DB3D7; +mul.f32 f1885, f2046, 0f3F000000; +sub.f32 f1886, f1852, f1885; +sub.f32 f1887, f1834, f1843; +mul.f32 f1888, f1887, 0fBF5DB3D7; +add.f32 f1889, f1835, f1844; +mul.f32 f1891, f1889, 0f3F000000; +sub.f32 f1892, f1826, f1891; +add.f32 f2045, f1862, f1871; +sub.f32 f1893, f1862, f1871; +mul.f32 f1894, f1893, 0fBF5DB3D7; +mul.f32 f1895, f2045, 0f3F000000; +sub.f32 f1896, f1853, f1895; +sub.f32 f1897, f1835, f1844; +mul.f32 f1898, f1897, 0fBF5DB3D7; +add.f32 f1899, f1836, f1845; +mul.f32 f1901, f1899, 0f3F000000; +sub.f32 f1902, f1827, f1901; +add.f32 f2044, f1863, f1872; +sub.f32 f1903, f1863, f1872; +mul.f32 f1904, f1903, 0fBF5DB3D7; +mul.f32 f1905, f2044, 0f3F000000; +sub.f32 f1906, f1854, f1905; +sub.f32 f1907, f1836, f1845; +mul.f32 f1908, f1907, 0fBF5DB3D7; +add.f32 
f1909, f1837, f1846; +mul.f32 f1911, f1909, 0f3F000000; +sub.f32 f1912, f1828, f1911; +add.f32 f2043, f1864, f1873; +sub.f32 f1913, f1864, f1873; +mul.f32 f1914, f1913, 0fBF5DB3D7; +mul.f32 f1915, f2043, 0f3F000000; +sub.f32 f1916, f1855, f1915; +sub.f32 f1917, f1837, f1846; +mul.f32 f1918, f1917, 0fBF5DB3D7; +add.f32 f1919, f1838, f1847; +mul.f32 f1921, f1919, 0f3F000000; +sub.f32 f1922, f1829, f1921; +add.f32 f2042, f1865, f1874; +sub.f32 f1923, f1865, f1874; +mul.f32 f1924, f1923, 0fBF5DB3D7; +mul.f32 f1925, f2042, 0f3F000000; +sub.f32 f1926, f1856, f1925; +sub.f32 f1927, f1838, f1847; +mul.f32 f1928, f1927, 0fBF5DB3D7; +add.f32 f1929, f1839, f1848; +mul.f32 f1931, f1929, 0f3F000000; +sub.f32 f1932, f1830, f1931; +add.f32 f2041, f1866, f1875; +sub.f32 f1933, f1866, f1875; +mul.f32 f1934, f1933, 0fBF5DB3D7; +mul.f32 f1935, f2041, 0f3F000000; +sub.f32 f1936, f1857, f1935; +sub.f32 f1937, f1839, f1848; +mul.f32 f1938, f1937, 0fBF5DB3D7; +add.f32 f1939, f1840, f1849; +mul.f32 f1941, f1939, 0f3F000000; +sub.f32 f1942, f1831, f1941; +add.f32 f2040, f1867, f1876; +sub.f32 f1943, f1867, f1876; +mul.f32 f1944, f1943, 0fBF5DB3D7; +mul.f32 f1945, f2040, 0f3F000000; +sub.f32 f1946, f1858, f1945; +sub.f32 f1947, f1840, f1849; +mul.f32 f1948, f1947, 0fBF5DB3D7; +add.f32 f1949, f1841, f1850; +mul.f32 f1951, f1949, 0f3F000000; +sub.f32 f1952, f1832, f1951; +add.f32 f2039, f1868, f1877; +sub.f32 f1953, f1868, f1877; +mul.f32 f1954, f1953, 0fBF5DB3D7; +mul.f32 f1955, f2039, 0f3F000000; +sub.f32 f1956, f1859, f1955; +sub.f32 f1957, f1841, f1850; +mul.f32 f1958, f1957, 0fBF5DB3D7; +add.f32 f1959, f1842, f1851; +mul.f32 f1961, f1959, 0f3F000000; +sub.f32 f1962, f1833, f1961; +add.f32 f2038, f1869, f1878; +sub.f32 f1963, f1869, f1878; +mul.f32 f1964, f1963, 0fBF5DB3D7; +mul.f32 f1965, f2038, 0f3F000000; +sub.f32 f1966, f1860, f1965; +sub.f32 f1967, f1842, f1851; +mul.f32 f2353, f1939, 0f3F000000; +sub.f32 f2352, f1831, f2353; +mul.f32 f1968, f1967, 0fBF5DB3D7; +add.f32 %0, f1825, 
f1879; +mul.f32 f2355, f2039, 0f3F000000; +sub.f32 f2354, f1859, f2355; +add.f32 %1, f1852, f2046; +mul.f32 f2357, f1929, 0f3F000000; +sub.f32 f2356, f1830, f2357; +mul.f32 f2359, f2040, 0f3F000000; +sub.f32 f2358, f1858, f2359; +add.f32 %2, f1826, f1889; +add.f32 %3, f1853, f2045; +add.f32 %4, f1827, f1899; +add.f32 %5, f1854, f2044; +add.f32 %6, f1828, f1909; +add.f32 %7, f1855, f2043; +add.f32 %8, f1829, f1919; +add.f32 %9, f1856, f2042; +add.f32 %10, f1830, f1929; +add.f32 %11, f1857, f2041; +add.f32 %12, f1831, f1939; +add.f32 %13, f1858, f2040; +add.f32 %14, f1832, f1949; +add.f32 %15, f1859, f2039; +add.f32 %16, f1833, f1959; +add.f32 %17, f1860, f2038; +add.f32 %18, f1884, f1882; +sub.f32 %19, f1886, f1888; +add.f32 %20, f1894, f1892; +sub.f32 %21, f1896, f1898; +add.f32 %22, f1904, f1902; +sub.f32 %23, f1906, f1908; +sub.f32 %25, f1916, f1918; +add.f32 %24, f1914, f1912; +sub.f32 %27, f1926, f1928; +add.f32 %26, f1924, f1922; +sub.f32 %29, f1936, f1938; +add.f32 %28, f1934, f2356; +add.f32 %30, f1944, f2352; +sub.f32 %31, f2358, f1948; +add.f32 %32, f1954, f1952; +sub.f32 %33, f2354, f1958; +add.f32 %34, f1964, f1962; +sub.f32 %35, f1966, f1968; +sub.f32 %36, f1882, f1884; +add.f32 %37, f1888, f1886; +sub.f32 %38, f1892, f1894; +add.f32 %39, f1898, f1896; +sub.f32 %40, f1902, f1904; +add.f32 %41, f1908, f1906; +sub.f32 %42, f1912, f1914; +add.f32 %43, f1918, f1916; +sub.f32 %44, f1922, f1924; +add.f32 %45, f1928, f1926; +sub.f32 %46, f2356, f1934; +add.f32 %47, f1938, f1936; +sub.f32 %48, f2352, f1944; +add.f32 %49, f1948, f2358; +sub.f32 %50, f1952, f1954; +add.f32 %51, f1958, f2354; +sub.f32 %52, f1962, f1964; +add.f32 %53, f1968, f1966; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), 
"=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_2187), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<350, float, 1>(cufftdx::detail::complex 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<311>; +.reg .b32 r<46>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 17496, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %15, %18; +add.f32 f14, %17, %19; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %13, f15; +sub.f32 f17, %17, %19; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %14, f21; +sub.f32 f23, %15, %18; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %14, f14; +add.f32 f43, %13, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r9+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r9+16], {f47, f46}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+5832]; +ld.shared.v2.f32 {f56, f57}, [r11+11664]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0fBF5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 
rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f72, f75; +mul.f32 f79, f66, f75; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f73, f85; +mul.f32 f87, f67, f85; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f74, f66, f78; +sub.f32 f92, f80, f79; +st.shared.v2.f32 [r17+24], {f91, f92}; +fma.rn.f32 f93, f83, f67, f86; +sub.f32 f94, f88, f87; +st.shared.v2.f32 [r17+48], {f93, f94}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+5832]; +ld.shared.v2.f32 {f103, f104}, [r11+11664]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f119, f122; +mul.f32 f126, f113, f122; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f120, f132; +mul.f32 f134, f114, f132; +mul.f32 f135, f130, f120; +barrier.sync 0; 
+mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f121, f113, f125; +sub.f32 f139, f127, f126; +st.shared.v2.f32 [r23+72], {f138, f139}; +fma.rn.f32 f140, f130, f114, f133; +sub.f32 f141, f135, f134; +st.shared.v2.f32 [r23+144], {f140, f141}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+5832]; +ld.shared.v2.f32 {f150, f151}, [r11+11664]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; +sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f166, f169; +mul.f32 f173, f160, f169; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f167, f179; +mul.f32 f181, f161, f179; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f168, f160, f172; +sub.f32 f186, f174, f173; +st.shared.v2.f32 [r33+216], {f185, f186}; +fma.rn.f32 f187, f177, f161, f180; +sub.f32 f188, f182, f181; +st.shared.v2.f32 [r33+432], {f187, f188}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 
{f193, f194}, [r11+5832]; +ld.shared.v2.f32 {f197, f198}, [r11+11664]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0fBF5DB3D7; +add.f32 f207, f206, f204; +sub.f32 f208, f204, f206; +mul.f32 f209, f202, 0f3F000000; +sub.f32 f210, f190, f209; +sub.f32 f211, f193, f197; +mul.f32 f212, f211, 0fBF5DB3D7; +sub.f32 f213, f210, f212; +add.f32 f214, f212, f210; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f215, f216}, [rd26]; +mul.f32 f219, f213, f216; +mul.f32 f220, f207, f216; +mul.f32 f221, f215, f213; +mul.f32 f222, f215, f215; +mul.f32 f223, f216, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f216, f215; +fma.rn.f32 f226, f216, f215, f225; +mul.f32 f227, f214, f226; +mul.f32 f228, f208, f226; +mul.f32 f229, f224, f214; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +add.f32 f230, f190, f202; +add.f32 f231, f189, f201; +st.shared.v2.f32 [r39], {f231, f230}; +fma.rn.f32 f232, f215, f207, f219; +sub.f32 f233, f221, f220; +st.shared.v2.f32 [r39+648], {f232, f233}; +fma.rn.f32 f234, f224, f208, f227; +sub.f32 f235, f229, f228; +st.shared.v2.f32 [r39+1296], {f234, f235}; +barrier.sync 0; +ld.shared.v2.f32 {f236, f237}, [r11]; +ld.shared.v2.f32 {f240, f241}, [r11+5832]; +ld.shared.v2.f32 {f244, f245}, [r11+11664]; +add.f32 f248, f240, f244; +add.f32 f249, f241, f245; +mul.f32 f250, f248, 0f3F000000; +sub.f32 f251, f236, f250; +sub.f32 f252, f241, f245; +mul.f32 f253, f252, 0fBF5DB3D7; +add.f32 f254, f253, f251; +sub.f32 f255, f251, f253; +mul.f32 f256, f249, 0f3F000000; +sub.f32 f257, f237, f256; +sub.f32 f258, f240, f244; +mul.f32 f259, f258, 0fBF5DB3D7; +sub.f32 f260, f257, f259; +add.f32 f261, f259, f257; +mul.wide.u32 rd27, r7, 
-2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 3; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 8; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f32 {f262, f263}, [rd31]; +mul.f32 f266, f260, f263; +mul.f32 f267, f254, f263; +mul.f32 f268, f262, f260; +mul.f32 f269, f262, f262; +mul.f32 f270, f263, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f263, f262; +fma.rn.f32 f273, f263, f262, f272; +mul.f32 f274, f261, f273; +mul.f32 f275, f255, f273; +mul.f32 f276, f271, f261; +barrier.sync 0; +mad.lo.s32 r45, r40, 5832, r44; +add.f32 f277, f237, f249; +add.f32 f278, f236, f248; +st.shared.v2.f32 [r45], {f278, f277}; +fma.rn.f32 f279, f262, f254, f266; +sub.f32 f280, f268, f267; +st.shared.v2.f32 [r45+1944], {f279, f280}; +fma.rn.f32 f281, f271, f255, f274; +sub.f32 f282, f276, f275; +st.shared.v2.f32 [r45+3888], {f281, f282}; +barrier.sync 0; +ld.shared.v2.f32 {f283, f284}, [r11]; +ld.shared.v2.f32 {f287, f288}, [r11+5832]; +ld.shared.v2.f32 {f291, f292}, [r11+11664]; +add.f32 f295, f287, f291; +add.f32 f296, f288, f292; +mul.f32 f297, f295, 0f3F000000; +sub.f32 f298, f283, f297; +sub.f32 f299, f288, f292; +mul.f32 f300, f299, 0fBF5DB3D7; +mul.f32 f301, f296, 0f3F000000; +sub.f32 f302, f284, f301; +sub.f32 f303, f287, f291; +mul.f32 f304, f303, 0fBF5DB3D7; +add.f32 %1, f284, f296; +add.f32 %0, f283, f295; +sub.f32 %3, f302, f304; +add.f32 %2, f300, f298; +add.f32 %5, f304, f302; +sub.f32 %4, f298, f300; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_2187), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<351, float, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<275>; +.reg .b32 r<46>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 8748, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %15, %18; +add.f32 f14, %13, f13; +add.f32 f15, %17, %19; +add.f32 f16, %14, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %13, f17; +sub.f32 f19, %17, %19; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %14, f23; +sub.f32 f25, %15, %18; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 8748, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; +sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f34; +st.shared.f32 [r9+8], f44; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+2916]; +ld.shared.f32 f50, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+2916]; +ld.shared.f32 f53, [r11+5832]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0fBF5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 
0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0fBF5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f68, f71; +fma.rn.f32 f75, f70, f62, f74; +mul.f32 f76, f62, f71; +mul.f32 f77, f70, f68; +sub.f32 f78, f77, f76; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f69, f83; +fma.rn.f32 f85, f81, f63, f84; +mul.f32 f86, f63, f83; +mul.f32 f87, f81, f69; +sub.f32 f88, f87, f86; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f75; +st.shared.f32 [r17+24], f85; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+2916]; +ld.shared.f32 f91, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+2916]; +ld.shared.f32 f94, [r11+5832]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0fBF5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0fBF5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 
f115, f109, f112; +fma.rn.f32 f116, f111, f103, f115; +mul.f32 f117, f103, f112; +mul.f32 f118, f111, f109; +sub.f32 f119, f118, f117; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f110, f124; +fma.rn.f32 f126, f122, f104, f125; +mul.f32 f127, f104, f124; +mul.f32 f128, f122, f110; +sub.f32 f129, f128, f127; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f116; +st.shared.f32 [r23+72], f126; +barrier.sync 0; +ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+2916]; +ld.shared.f32 f132, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+2916]; +ld.shared.f32 f135, [r11+5832]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0fBF5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0fBF5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f150, f153; +fma.rn.f32 f157, f152, f144, f156; +mul.f32 f158, f144, f153; +mul.f32 f159, f152, f150; +sub.f32 f160, f159, f158; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, 
f153, f152, f164; +mul.f32 f166, f151, f165; +fma.rn.f32 f167, f163, f145, f166; +mul.f32 f168, f145, f165; +mul.f32 f169, f163, f151; +sub.f32 f170, f169, f168; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f157; +st.shared.f32 [r33+216], f167; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+2916]; +ld.shared.f32 f173, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; +ld.shared.f32 f175, [r11+2916]; +ld.shared.f32 f176, [r11+5832]; +add.f32 f177, f172, f173; +add.f32 f178, f171, f177; +add.f32 f179, f175, f176; +add.f32 f180, f174, f179; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f171, f181; +sub.f32 f183, f175, f176; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +mul.f32 f187, f179, 0f3F000000; +sub.f32 f188, f174, f187; +sub.f32 f189, f172, f173; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 2; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f193, f194}, [rd26]; +mul.f32 f197, f191, f194; +fma.rn.f32 f198, f193, f185, f197; +mul.f32 f199, f185, f194; +mul.f32 f200, f193, f191; +sub.f32 f201, f200, f199; +mul.f32 f202, f193, f193; +mul.f32 f203, f194, f194; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, f193; +fma.rn.f32 f206, f194, f193, f205; +mul.f32 f207, f192, f206; +fma.rn.f32 f208, f204, f186, f207; +mul.f32 f209, f186, f206; +mul.f32 f210, f204, f192; +sub.f32 f211, f210, f209; +barrier.sync 0; +mad.lo.s32 r39, r34, 972, r38; +st.shared.f32 [r39], f178; +st.shared.f32 [r39+324], f198; +st.shared.f32 [r39+648], f208; +barrier.sync 0; +ld.shared.f32 f212, 
[r11]; +ld.shared.f32 f213, [r11+2916]; +ld.shared.f32 f214, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r39], f180; +st.shared.f32 [r39+324], f201; +st.shared.f32 [r39+648], f211; +barrier.sync 0; +ld.shared.f32 f215, [r11]; +ld.shared.f32 f216, [r11+2916]; +ld.shared.f32 f217, [r11+5832]; +add.f32 f218, f213, f214; +add.f32 f219, f212, f218; +add.f32 f220, f216, f217; +add.f32 f221, f215, f220; +mul.f32 f222, f218, 0f3F000000; +sub.f32 f223, f212, f222; +sub.f32 f224, f216, f217; +mul.f32 f225, f224, 0fBF5DB3D7; +add.f32 f226, f225, f223; +sub.f32 f227, f223, f225; +mul.f32 f228, f220, 0f3F000000; +sub.f32 f229, f215, f228; +sub.f32 f230, f213, f214; +mul.f32 f231, f230, 0fBF5DB3D7; +sub.f32 f232, f229, f231; +add.f32 f233, f231, f229; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 2; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 8; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f32 {f234, f235}, [rd31]; +mul.f32 f238, f232, f235; +fma.rn.f32 f239, f234, f226, f238; +mul.f32 f240, f226, f235; +mul.f32 f241, f234, f232; +sub.f32 f242, f241, f240; +mul.f32 f243, f234, f234; +mul.f32 f244, f235, f235; +sub.f32 f245, f243, f244; +mul.f32 f246, f235, f234; +fma.rn.f32 f247, f235, f234, f246; +mul.f32 f248, f233, f247; +fma.rn.f32 f249, f245, f227, f248; +mul.f32 f250, f227, f247; +mul.f32 f251, f245, f233; +sub.f32 f252, f251, f250; +barrier.sync 0; +mad.lo.s32 r45, r40, 2916, r44; +st.shared.f32 [r45], f219; +st.shared.f32 [r45+972], f239; +st.shared.f32 [r45+1944], f249; +barrier.sync 0; +ld.shared.f32 f253, [r11]; +ld.shared.f32 f254, [r11+2916]; +ld.shared.f32 f255, [r11+5832]; +barrier.sync 0; +st.shared.f32 [r45], f221; +st.shared.f32 [r45+972], f242; +st.shared.f32 [r45+1944], f252; +barrier.sync 0; +ld.shared.f32 f256, [r11]; +ld.shared.f32 f257, [r11+2916]; +ld.shared.f32 f258, [r11+5832]; +add.f32 f259, f254, f255; +add.f32 f260, f257, 
f258; +mul.f32 f261, f259, 0f3F000000; +sub.f32 f262, f253, f261; +sub.f32 f263, f257, f258; +mul.f32 f264, f263, 0fBF5DB3D7; +mul.f32 f265, f260, 0f3F000000; +sub.f32 f266, f256, f265; +sub.f32 f267, f254, f255; +mul.f32 f268, f267, 0fBF5DB3D7; +add.f32 %0, f253, f259; +add.f32 %1, f256, f260; +add.f32 %2, f264, f262; +sub.f32 %3, f266, f268; +sub.f32 %4, f262, f264; +add.f32 %5, f268, f266; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_2187), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..c2ee7bbe20a8a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_fwd.hpp.inc @@ -0,0 +1,2198 @@ +#ifndef CUFFTDX_FFT_2187_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_2187_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<528, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<775>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 34992, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, 
%38; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, 
fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 34992, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 fd165, fd160, fd138; 
+mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+3888]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r9+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r9+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; +fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r9+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r9+64], {fd222, fd221}; +fma.rn.f64 
fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 [r9+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r9+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r9+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r9+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+3888]; +ld.shared.v2.f64 {fd239, fd240}, [r11+7776]; +ld.shared.v2.f64 {fd243, fd244}, [r11+11664]; +ld.shared.v2.f64 {fd247, fd248}, [r11+15552]; +ld.shared.v2.f64 {fd251, fd252}, [r11+19440]; +ld.shared.v2.f64 {fd255, fd256}, [r11+23328]; +ld.shared.v2.f64 {fd259, fd260}, [r11+27216]; +ld.shared.v2.f64 {fd263, fd264}, [r11+31104]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, 
fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0d3FEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0dBFE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0dBFE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0dBFD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0dBFD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; 
+sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0d3FEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0d3FEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd381, fd350; +mul.f64 fd386, fd382, fd352; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd390, fd366; +mul.f64 fd394, fd392, fd368; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd398, fd341; +mul.f64 fd402, fd400, fd347; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd406, fd357; +mul.f64 fd410, fd408, fd363; +mul.f64 fd411, fd406, fd363; 
+ld.global.v2.f64 {fd412, fd413}, [rd11+432]; +mul.f64 fd416, fd412, fd373; +mul.f64 fd417, fd413, fd379; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd421, fd342; +mul.f64 fd425, fd423, fd348; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd429, fd358; +mul.f64 fd433, fd431, fd364; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd437, fd374; +mul.f64 fd441, fd439, fd380; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; +fma.rn.f64 fd445, fd382, fd350, fd387; +sub.f64 fd446, fd385, fd386; +st.shared.v2.f64 [r17+144], {fd446, fd445}; +fma.rn.f64 fd447, fd392, fd366, fd395; +sub.f64 fd448, fd393, fd394; +st.shared.v2.f64 [r17+288], {fd448, fd447}; +fma.rn.f64 fd449, fd400, fd341, fd403; +sub.f64 fd450, fd401, fd402; +st.shared.v2.f64 [r17+432], {fd450, fd449}; +fma.rn.f64 fd451, fd408, fd357, fd411; +sub.f64 fd452, fd409, fd410; +st.shared.v2.f64 [r17+576], {fd452, fd451}; +fma.rn.f64 fd453, fd413, fd373, fd418; +sub.f64 fd454, fd416, fd417; +st.shared.v2.f64 [r17+720], {fd454, fd453}; +fma.rn.f64 fd455, fd423, fd342, fd426; +sub.f64 fd456, fd424, fd425; +st.shared.v2.f64 [r17+864], {fd456, fd455}; +fma.rn.f64 fd457, fd431, fd358, fd434; +sub.f64 fd458, fd432, fd433; +st.shared.v2.f64 [r17+1008], {fd458, fd457}; +fma.rn.f64 fd459, fd439, fd374, fd442; +sub.f64 fd460, fd440, fd441; +st.shared.v2.f64 [r17+1152], {fd460, 
fd459}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+3888]; +ld.shared.v2.f64 {fd469, fd470}, [r11+7776]; +ld.shared.v2.f64 {fd473, fd474}, [r11+11664]; +ld.shared.v2.f64 {fd477, fd478}, [r11+15552]; +ld.shared.v2.f64 {fd481, fd482}, [r11+19440]; +ld.shared.v2.f64 {fd485, fd486}, [r11+23328]; +ld.shared.v2.f64 {fd489, fd490}, [r11+27216]; +ld.shared.v2.f64 {fd493, fd494}, [r11+31104]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0d3FEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0d3FEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0d3FEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0d3FEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, fd482, fd494; +mul.f64 fd536, fd535, 0d3FEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +sub.f64 fd543, fd540, 
fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0dBFE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0dBFE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0dBFEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0dBFEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0dBFEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0dBFEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0dBFD5E3A8748A0BF5; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0dBFD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 0d3FEBB67AE8584CAA; +add.f64 fd571, fd570, fd568; +sub.f64 fd572, fd568, fd570; +mul.f64 fd573, fd566, 0d3FE0000000000000; +sub.f64 fd574, fd500, fd573; +sub.f64 fd575, fd514, fd530; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +sub.f64 fd577, fd574, fd576; +add.f64 fd578, fd576, fd574; +add.f64 fd579, fd547, fd552; +add.f64 fd580, fd505, fd579; +add.f64 fd581, fd549, fd554; +add.f64 fd582, fd511, fd581; +mul.f64 fd583, fd579, 0d3FE0000000000000; +sub.f64 fd584, fd505, fd583; +sub.f64 fd585, fd549, fd554; +mul.f64 fd586, fd585, 0d3FEBB67AE8584CAA; +add.f64 fd587, fd586, fd584; +sub.f64 fd588, fd584, fd586; +mul.f64 fd589, fd581, 0d3FE0000000000000; +sub.f64 fd590, fd511, fd589; +sub.f64 fd591, fd547, fd552; +mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +sub.f64 fd593, fd590, fd592; +add.f64 fd594, fd592, fd590; +add.f64 fd595, fd557, fd562; +add.f64 fd596, fd506, fd595; +add.f64 fd597, fd559, fd564; +add.f64 fd598, 
fd512, fd597; +mul.f64 fd599, fd595, 0d3FE0000000000000; +sub.f64 fd600, fd506, fd599; +sub.f64 fd601, fd559, fd564; +mul.f64 fd602, fd601, 0d3FEBB67AE8584CAA; +add.f64 fd603, fd602, fd600; +sub.f64 fd604, fd600, fd602; +mul.f64 fd605, fd597, 0d3FE0000000000000; +sub.f64 fd606, fd512, fd605; +sub.f64 fd607, fd557, fd562; +mul.f64 fd608, fd607, 0d3FEBB67AE8584CAA; +sub.f64 fd609, fd606, fd608; +add.f64 fd610, fd608, fd606; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd611, fd612}, [rd16]; +mul.f64 fd615, fd611, fd580; +mul.f64 fd616, fd612, fd582; +mul.f64 fd617, fd611, fd582; +mul.f64 fd618, fd611, fd611; +mul.f64 fd619, fd612, fd612; +sub.f64 fd620, fd618, fd619; +mul.f64 fd621, fd612, fd611; +fma.rn.f64 fd622, fd612, fd611, fd621; +mul.f64 fd623, fd620, fd596; +mul.f64 fd624, fd622, fd598; +mul.f64 fd625, fd620, fd598; +mul.f64 fd626, fd611, fd620; +mul.f64 fd627, fd612, fd622; +sub.f64 fd628, fd626, fd627; +mul.f64 fd629, fd611, fd622; +fma.rn.f64 fd630, fd612, fd620, fd629; +mul.f64 fd631, fd628, fd571; +mul.f64 fd632, fd630, fd577; +mul.f64 fd633, fd628, fd577; +mul.f64 fd634, fd611, fd628; +mul.f64 fd635, fd612, fd630; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd611, fd630; +fma.rn.f64 fd638, fd612, fd628, fd637; +mul.f64 fd639, fd636, fd587; +mul.f64 fd640, fd638, fd593; +mul.f64 fd641, fd636, fd593; +ld.global.v2.f64 {fd642, fd643}, [rd16+48]; +mul.f64 fd646, fd642, fd603; +mul.f64 fd647, fd643, fd609; +mul.f64 fd648, fd642, fd609; +mul.f64 fd649, fd611, fd642; +mul.f64 fd650, fd612, fd643; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd611, fd643; +fma.rn.f64 fd653, fd612, fd642, fd652; +mul.f64 fd654, fd651, fd572; +mul.f64 fd655, fd653, fd578; +mul.f64 fd656, fd651, fd578; +mul.f64 fd657, fd611, fd651; +mul.f64 fd658, fd612, fd653; +sub.f64 fd659, fd657, fd658; 
+mul.f64 fd660, fd611, fd653; +fma.rn.f64 fd661, fd612, fd651, fd660; +mul.f64 fd662, fd659, fd588; +mul.f64 fd663, fd661, fd594; +mul.f64 fd664, fd659, fd594; +mul.f64 fd665, fd611, fd659; +mul.f64 fd666, fd612, fd661; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd611, fd661; +fma.rn.f64 fd669, fd612, fd659, fd668; +mul.f64 fd670, fd667, fd604; +mul.f64 fd671, fd669, fd610; +mul.f64 fd672, fd667, fd610; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 11664, r22; +add.f64 fd673, fd500, fd566; +add.f64 fd674, fd498, fd565; +st.shared.v2.f64 [r23], {fd674, fd673}; +fma.rn.f64 fd675, fd612, fd580, fd617; +sub.f64 fd676, fd615, fd616; +st.shared.v2.f64 [r23+1296], {fd676, fd675}; +fma.rn.f64 fd677, fd622, fd596, fd625; +sub.f64 fd678, fd623, fd624; +st.shared.v2.f64 [r23+2592], {fd678, fd677}; +fma.rn.f64 fd679, fd630, fd571, fd633; +sub.f64 fd680, fd631, fd632; +st.shared.v2.f64 [r23+3888], {fd680, fd679}; +fma.rn.f64 fd681, fd638, fd587, fd641; +sub.f64 fd682, fd639, fd640; +st.shared.v2.f64 [r23+5184], {fd682, fd681}; +fma.rn.f64 fd683, fd643, fd603, fd648; +sub.f64 fd684, fd646, fd647; +st.shared.v2.f64 [r23+6480], {fd684, fd683}; +fma.rn.f64 fd685, fd653, fd572, fd656; +sub.f64 fd686, fd654, fd655; +st.shared.v2.f64 [r23+7776], {fd686, fd685}; +fma.rn.f64 fd687, fd661, fd588, fd664; +sub.f64 fd688, fd662, fd663; +st.shared.v2.f64 [r23+9072], {fd688, fd687}; +fma.rn.f64 fd689, fd669, fd604, fd672; +sub.f64 fd690, fd670, fd671; +st.shared.v2.f64 [r23+10368], {fd690, fd689}; +barrier.sync 0; +ld.shared.v2.f64 {fd691, fd692}, [r11]; +ld.shared.v2.f64 {fd695, fd696}, [r11+3888]; +ld.shared.v2.f64 {fd699, fd700}, [r11+7776]; +ld.shared.v2.f64 {fd703, fd704}, [r11+11664]; +ld.shared.v2.f64 {fd707, fd708}, [r11+15552]; +ld.shared.v2.f64 {fd711, fd712}, [r11+19440]; +ld.shared.v2.f64 {fd715, fd716}, [r11+23328]; +ld.shared.v2.f64 {fd719, fd720}, [r11+27216]; +ld.shared.v2.f64 {fd723, fd724}, [r11+31104]; +add.f64 fd727, fd703, fd715; 
+add.f64 fd728, fd704, fd716; +mul.f64 fd729, fd727, 0d3FE0000000000000; +sub.f64 fd730, fd691, fd729; +sub.f64 fd731, fd704, fd716; +mul.f64 fd732, fd731, 0d3FEBB67AE8584CAA; +mul.f64 fd733, fd728, 0d3FE0000000000000; +sub.f64 fd734, fd692, fd733; +sub.f64 fd735, fd703, fd715; +mul.f64 fd736, fd735, 0d3FEBB67AE8584CAA; +add.f64 fd737, fd707, fd719; +add.f64 fd738, fd708, fd720; +mul.f64 fd739, fd737, 0d3FE0000000000000; +sub.f64 fd740, fd695, fd739; +sub.f64 fd741, fd708, fd720; +mul.f64 fd742, fd741, 0d3FEBB67AE8584CAA; +mul.f64 fd743, fd738, 0d3FE0000000000000; +sub.f64 fd744, fd696, fd743; +sub.f64 fd745, fd707, fd719; +mul.f64 fd746, fd745, 0d3FEBB67AE8584CAA; +add.f64 fd747, fd711, fd723; +add.f64 fd748, fd712, fd724; +mul.f64 fd749, fd747, 0d3FE0000000000000; +sub.f64 fd750, fd699, fd749; +sub.f64 fd751, fd712, fd724; +mul.f64 fd752, fd751, 0d3FEBB67AE8584CAA; +mul.f64 fd753, fd748, 0d3FE0000000000000; +sub.f64 fd754, fd700, fd753; +sub.f64 fd755, fd711, fd723; +mul.f64 fd756, fd755, 0d3FEBB67AE8584CAA; +add.f64 %1, fd692, fd728; +add.f64 %0, fd691, fd727; +add.f64 %3, fd696, fd738; +add.f64 %2, fd695, fd737; +add.f64 %5, fd700, fd748; +add.f64 %4, fd699, fd747; +sub.f64 %7, fd734, fd736; +add.f64 %6, fd732, fd730; +sub.f64 %9, fd744, fd746; +add.f64 %8, fd742, fd740; +sub.f64 %11, fd754, fd756; +add.f64 %10, fd752, fd750; +add.f64 %13, fd736, fd734; +sub.f64 %12, fd730, fd732; +add.f64 %15, fd746, fd744; +sub.f64 %14, fd740, fd742; +add.f64 %17, fd756, fd754; +sub.f64 %16, fd750, fd752; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_2187), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), 
"d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<527, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<721>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 17496, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; 
+mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 
fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; +sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+3888]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; 
+mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r8, r5, 17496, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd159; +st.shared.f64 [r9+16], fd169; +st.shared.f64 [r9+24], fd179; +st.shared.f64 [r9+32], fd189; +st.shared.f64 [r9+40], fd198; +st.shared.f64 [r9+48], fd208; +st.shared.f64 [r9+56], fd218; +st.shared.f64 [r9+64], fd228; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+1944]; +ld.shared.f64 fd233, [r11+3888]; +ld.shared.f64 fd234, [r11+5832]; +ld.shared.f64 fd235, [r11+7776]; +ld.shared.f64 fd236, [r11+9720]; +ld.shared.f64 fd237, [r11+11664]; +ld.shared.f64 fd238, [r11+13608]; +ld.shared.f64 fd239, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 
fd241, [r11+1944]; +ld.shared.f64 fd242, [r11+3888]; +ld.shared.f64 fd243, [r11+5832]; +ld.shared.f64 fd244, [r11+7776]; +ld.shared.f64 fd245, [r11+9720]; +ld.shared.f64 fd246, [r11+11664]; +ld.shared.f64 fd247, [r11+13608]; +ld.shared.f64 fd248, [r11+15552]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0d3FEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0dBFE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 
0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0dBFE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0dBFEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0dBFEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0dBFEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0dBFEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0dBFD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0dBFD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0d3FEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0d3FEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; 
+mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd365, fd334; +mul.f64 fd370, fd366, fd336; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd365, fd336; +fma.rn.f64 fd373, fd366, fd334, fd372; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd376, fd350; +mul.f64 fd380, fd378, fd352; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, fd352; +fma.rn.f64 fd383, fd378, fd350, fd382; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd386, fd325; +mul.f64 fd390, fd388, fd331; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd386, fd331; +fma.rn.f64 fd393, fd388, fd325, fd392; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd396, fd341; +mul.f64 fd400, fd398, fd347; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd396, fd347; +fma.rn.f64 fd403, fd398, fd341, fd402; +ld.global.v2.f64 {fd404, fd405}, [rd11+432]; +mul.f64 fd408, fd404, fd357; +mul.f64 fd409, fd405, fd363; +sub.f64 fd410, fd408, fd409; +mul.f64 fd411, fd404, fd363; +fma.rn.f64 fd412, fd405, fd357, fd411; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; 
+fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd415, fd326; +mul.f64 fd419, fd417, fd332; +sub.f64 fd420, fd418, fd419; +mul.f64 fd421, fd415, fd332; +fma.rn.f64 fd422, fd417, fd326, fd421; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd425, fd342; +mul.f64 fd429, fd427, fd348; +sub.f64 fd430, fd428, fd429; +mul.f64 fd431, fd425, fd348; +fma.rn.f64 fd432, fd427, fd342, fd431; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd435, fd358; +mul.f64 fd439, fd437, fd364; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd435, fd364; +fma.rn.f64 fd442, fd437, fd358, fd441; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd371; +st.shared.f64 [r17+144], fd381; +st.shared.f64 [r17+216], fd391; +st.shared.f64 [r17+288], fd401; +st.shared.f64 [r17+360], fd410; +st.shared.f64 [r17+432], fd420; +st.shared.f64 [r17+504], fd430; +st.shared.f64 [r17+576], fd440; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+1944]; +ld.shared.f64 fd445, [r11+3888]; +ld.shared.f64 fd446, [r11+5832]; +ld.shared.f64 fd447, [r11+7776]; +ld.shared.f64 fd448, [r11+9720]; +ld.shared.f64 fd449, [r11+11664]; +ld.shared.f64 fd450, [r11+13608]; +ld.shared.f64 fd451, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+1944]; +ld.shared.f64 fd454, [r11+3888]; +ld.shared.f64 fd455, 
[r11+5832]; +ld.shared.f64 fd456, [r11+7776]; +ld.shared.f64 fd457, [r11+9720]; +ld.shared.f64 fd458, [r11+11664]; +ld.shared.f64 fd459, [r11+13608]; +ld.shared.f64 fd460, [r11+15552]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0d3FEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, fd449; +mul.f64 fd474, fd473, 0d3FEBB67AE8584CAA; +sub.f64 fd475, fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; +mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0d3FEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0d3FEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0d3FEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0d3FEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0dBFE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0dBFE491B7523C161D, fd512; 
+mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0dBFEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0dBFEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0dBFEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0dBFEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0dBFD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 0dBFD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 fd530, fd462, fd529; +add.f64 fd531, fd480, fd496; +add.f64 fd532, fd464, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd462, fd533; +sub.f64 fd535, fd480, fd496; +mul.f64 fd536, fd535, 0d3FEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd464, fd539; +sub.f64 fd541, fd478, fd494; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +add.f64 fd545, fd511, fd516; +add.f64 fd546, fd469, fd545; +add.f64 fd547, fd513, fd518; +add.f64 fd548, fd475, fd547; +mul.f64 fd549, fd545, 0d3FE0000000000000; +sub.f64 fd550, fd469, fd549; +sub.f64 fd551, fd513, fd518; +mul.f64 fd552, fd551, 0d3FEBB67AE8584CAA; +add.f64 fd553, fd552, fd550; +sub.f64 fd554, fd550, fd552; +mul.f64 fd555, fd547, 0d3FE0000000000000; +sub.f64 fd556, fd475, fd555; +sub.f64 fd557, fd511, fd516; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +sub.f64 fd559, fd556, fd558; +add.f64 fd560, fd558, fd556; +add.f64 fd561, fd521, fd526; +add.f64 fd562, fd470, fd561; +add.f64 fd563, fd523, fd528; +add.f64 fd564, fd476, fd563; +mul.f64 fd565, fd561, 0d3FE0000000000000; +sub.f64 fd566, fd470, fd565; +sub.f64 fd567, fd523, fd528; +mul.f64 fd568, fd567, 0d3FEBB67AE8584CAA; +add.f64 fd569, fd568, fd566; 
+sub.f64 fd570, fd566, fd568; +mul.f64 fd571, fd563, 0d3FE0000000000000; +sub.f64 fd572, fd476, fd571; +sub.f64 fd573, fd521, fd526; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +sub.f64 fd575, fd572, fd574; +add.f64 fd576, fd574, fd572; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd577, fd578}, [rd16]; +mul.f64 fd581, fd577, fd546; +mul.f64 fd582, fd578, fd548; +sub.f64 fd583, fd581, fd582; +mul.f64 fd584, fd577, fd548; +fma.rn.f64 fd585, fd578, fd546, fd584; +mul.f64 fd586, fd577, fd577; +mul.f64 fd587, fd578, fd578; +sub.f64 fd588, fd586, fd587; +mul.f64 fd589, fd578, fd577; +fma.rn.f64 fd590, fd578, fd577, fd589; +mul.f64 fd591, fd588, fd562; +mul.f64 fd592, fd590, fd564; +sub.f64 fd593, fd591, fd592; +mul.f64 fd594, fd588, fd564; +fma.rn.f64 fd595, fd590, fd562, fd594; +mul.f64 fd596, fd577, fd588; +mul.f64 fd597, fd578, fd590; +sub.f64 fd598, fd596, fd597; +mul.f64 fd599, fd577, fd590; +fma.rn.f64 fd600, fd578, fd588, fd599; +mul.f64 fd601, fd598, fd537; +mul.f64 fd602, fd600, fd543; +sub.f64 fd603, fd601, fd602; +mul.f64 fd604, fd598, fd543; +fma.rn.f64 fd605, fd600, fd537, fd604; +mul.f64 fd606, fd577, fd598; +mul.f64 fd607, fd578, fd600; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd577, fd600; +fma.rn.f64 fd610, fd578, fd598, fd609; +mul.f64 fd611, fd608, fd553; +mul.f64 fd612, fd610, fd559; +sub.f64 fd613, fd611, fd612; +mul.f64 fd614, fd608, fd559; +fma.rn.f64 fd615, fd610, fd553, fd614; +ld.global.v2.f64 {fd616, fd617}, [rd16+48]; +mul.f64 fd620, fd616, fd569; +mul.f64 fd621, fd617, fd575; +sub.f64 fd622, fd620, fd621; +mul.f64 fd623, fd616, fd575; +fma.rn.f64 fd624, fd617, fd569, fd623; +mul.f64 fd625, fd577, fd616; +mul.f64 fd626, fd578, fd617; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd577, fd617; +fma.rn.f64 fd629, fd578, fd616, fd628; +mul.f64 fd630, fd627, 
fd538; +mul.f64 fd631, fd629, fd544; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd627, fd544; +fma.rn.f64 fd634, fd629, fd538, fd633; +mul.f64 fd635, fd577, fd627; +mul.f64 fd636, fd578, fd629; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd577, fd629; +fma.rn.f64 fd639, fd578, fd627, fd638; +mul.f64 fd640, fd637, fd554; +mul.f64 fd641, fd639, fd560; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd637, fd560; +fma.rn.f64 fd644, fd639, fd554, fd643; +mul.f64 fd645, fd577, fd637; +mul.f64 fd646, fd578, fd639; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd577, fd639; +fma.rn.f64 fd649, fd578, fd637, fd648; +mul.f64 fd650, fd647, fd570; +mul.f64 fd651, fd649, fd576; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd647, fd576; +fma.rn.f64 fd654, fd649, fd570, fd653; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +st.shared.f64 [r23], fd530; +st.shared.f64 [r23+648], fd583; +st.shared.f64 [r23+1296], fd593; +st.shared.f64 [r23+1944], fd603; +st.shared.f64 [r23+2592], fd613; +st.shared.f64 [r23+3240], fd622; +st.shared.f64 [r23+3888], fd632; +st.shared.f64 [r23+4536], fd642; +st.shared.f64 [r23+5184], fd652; +barrier.sync 0; +ld.shared.f64 fd655, [r11]; +ld.shared.f64 fd656, [r11+1944]; +ld.shared.f64 fd657, [r11+3888]; +ld.shared.f64 fd658, [r11+5832]; +ld.shared.f64 fd659, [r11+7776]; +ld.shared.f64 fd660, [r11+9720]; +ld.shared.f64 fd661, [r11+11664]; +ld.shared.f64 fd662, [r11+13608]; +ld.shared.f64 fd663, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r23], fd532; +st.shared.f64 [r23+648], fd585; +st.shared.f64 [r23+1296], fd595; +st.shared.f64 [r23+1944], fd605; +st.shared.f64 [r23+2592], fd615; +st.shared.f64 [r23+3240], fd624; +st.shared.f64 [r23+3888], fd634; +st.shared.f64 [r23+4536], fd644; +st.shared.f64 [r23+5184], fd654; +barrier.sync 0; +ld.shared.f64 fd664, [r11]; +ld.shared.f64 fd665, [r11+1944]; +ld.shared.f64 fd666, [r11+3888]; +ld.shared.f64 fd667, [r11+5832]; +ld.shared.f64 fd668, [r11+7776]; 
+ld.shared.f64 fd669, [r11+9720]; +ld.shared.f64 fd670, [r11+11664]; +ld.shared.f64 fd671, [r11+13608]; +ld.shared.f64 fd672, [r11+15552]; +add.f64 fd673, fd658, fd661; +add.f64 fd674, fd667, fd670; +mul.f64 fd675, fd673, 0d3FE0000000000000; +sub.f64 fd676, fd655, fd675; +sub.f64 fd677, fd667, fd670; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +mul.f64 fd679, fd674, 0d3FE0000000000000; +sub.f64 fd680, fd664, fd679; +sub.f64 fd681, fd658, fd661; +mul.f64 fd682, fd681, 0d3FEBB67AE8584CAA; +add.f64 fd683, fd659, fd662; +add.f64 fd684, fd668, fd671; +mul.f64 fd685, fd683, 0d3FE0000000000000; +sub.f64 fd686, fd656, fd685; +sub.f64 fd687, fd668, fd671; +mul.f64 fd688, fd687, 0d3FEBB67AE8584CAA; +mul.f64 fd689, fd684, 0d3FE0000000000000; +sub.f64 fd690, fd665, fd689; +sub.f64 fd691, fd659, fd662; +mul.f64 fd692, fd691, 0d3FEBB67AE8584CAA; +add.f64 fd693, fd660, fd663; +add.f64 fd694, fd669, fd672; +mul.f64 fd695, fd693, 0d3FE0000000000000; +sub.f64 fd696, fd657, fd695; +sub.f64 fd697, fd669, fd672; +mul.f64 fd698, fd697, 0d3FEBB67AE8584CAA; +mul.f64 fd699, fd694, 0d3FE0000000000000; +sub.f64 fd700, fd666, fd699; +sub.f64 fd701, fd660, fd663; +mul.f64 fd702, fd701, 0d3FEBB67AE8584CAA; +add.f64 %0, fd655, fd673; +add.f64 %1, fd664, fd674; +add.f64 %2, fd656, fd683; +add.f64 %3, fd665, fd684; +add.f64 %4, fd657, fd693; +add.f64 %5, fd666, fd694; +add.f64 %6, fd678, fd676; +sub.f64 %7, fd680, fd682; +add.f64 %8, fd688, fd686; +sub.f64 %9, fd690, fd692; +add.f64 %10, fd698, fd696; +sub.f64 %11, fd700, fd702; +sub.f64 %12, fd676, fd678; +add.f64 %13, fd682, fd680; +sub.f64 %14, fd686, fd688; +add.f64 %15, fd692, fd690; +sub.f64 %16, fd696, fd698; +add.f64 %17, fd702, fd700; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), 
"=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_2187), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<529, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<46>; +.reg .f64 fd<269>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 17496, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %15, %18; +add.f64 fd14, %13, fd13; +add.f64 fd15, %17, %19; +add.f64 fd16, %14, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %13, fd17; +sub.f64 fd19, %17, %19; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %14, fd23; +sub.f64 fd25, %15, %18; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+11664]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd35; +st.shared.f64 [r9+16], fd44; +barrier.sync 0; +shl.b32 
r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+5832]; +ld.shared.f64 fd49, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+5832]; +ld.shared.f64 fd52, [r11+11664]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd69, fd61; +mul.f64 fd74, fd70, fd67; +sub.f64 fd75, fd73, fd74; +mul.f64 fd76, fd69, fd67; +fma.rn.f64 fd77, fd70, fd61, fd76; +ld.global.v2.f64 {fd78, fd79}, [rd11+3888]; +mul.f64 fd82, fd78, fd62; +mul.f64 fd83, fd79, fd68; +sub.f64 fd84, fd82, fd83; +mul.f64 fd85, fd78, fd68; +fma.rn.f64 fd86, fd79, fd62, fd85; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd75; +st.shared.f64 [r17+48], fd84; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+5832]; +ld.shared.f64 fd89, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+5832]; +ld.shared.f64 fd92, [r11+11664]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, 
fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0d3FEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0d3FEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd109, fd101; +mul.f64 fd114, fd110, fd107; +sub.f64 fd115, fd113, fd114; +mul.f64 fd116, fd109, fd107; +fma.rn.f64 fd117, fd110, fd101, fd116; +ld.global.v2.f64 {fd118, fd119}, [rd16+1296]; +mul.f64 fd122, fd118, fd102; +mul.f64 fd123, fd119, fd108; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd118, fd108; +fma.rn.f64 fd126, fd119, fd102, fd125; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd115; +st.shared.f64 [r23+144], fd124; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+5832]; +ld.shared.f64 fd129, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; +st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+5832]; +ld.shared.f64 fd132, [r11+11664]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0d3FEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, 
fd129; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd149, fd141; +mul.f64 fd154, fd150, fd147; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd149, fd147; +fma.rn.f64 fd157, fd150, fd141, fd156; +ld.global.v2.f64 {fd158, fd159}, [rd21+432]; +mul.f64 fd162, fd158, fd142; +mul.f64 fd163, fd159, fd148; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd158, fd148; +fma.rn.f64 fd166, fd159, fd142, fd165; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd155; +st.shared.f64 [r33+432], fd164; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+5832]; +ld.shared.f64 fd169, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+5832]; +ld.shared.f64 fd172, [r11+11664]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd167, fd173; +add.f64 fd175, fd171, fd172; +add.f64 fd176, fd170, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd167, fd177; +sub.f64 fd179, fd171, fd172; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd170, fd183; +sub.f64 fd185, fd168, fd169; +mul.f64 fd186, fd185, 0d3FEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; +add.f64 fd188, fd186, fd184; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; 
+sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd189, fd190}, [rd26]; +mul.f64 fd193, fd189, fd181; +mul.f64 fd194, fd190, fd187; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd187; +fma.rn.f64 fd197, fd190, fd181, fd196; +ld.global.v2.f64 {fd198, fd199}, [rd26+144]; +mul.f64 fd202, fd198, fd182; +mul.f64 fd203, fd199, fd188; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd198, fd188; +fma.rn.f64 fd206, fd199, fd182, fd205; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +st.shared.f64 [r39], fd174; +st.shared.f64 [r39+648], fd195; +st.shared.f64 [r39+1296], fd204; +barrier.sync 0; +ld.shared.f64 fd207, [r11]; +ld.shared.f64 fd208, [r11+5832]; +ld.shared.f64 fd209, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r39], fd176; +st.shared.f64 [r39+648], fd197; +st.shared.f64 [r39+1296], fd206; +barrier.sync 0; +ld.shared.f64 fd210, [r11]; +ld.shared.f64 fd211, [r11+5832]; +ld.shared.f64 fd212, [r11+11664]; +add.f64 fd213, fd208, fd209; +add.f64 fd214, fd207, fd213; +add.f64 fd215, fd211, fd212; +add.f64 fd216, fd210, fd215; +mul.f64 fd217, fd213, 0d3FE0000000000000; +sub.f64 fd218, fd207, fd217; +sub.f64 fd219, fd211, fd212; +mul.f64 fd220, fd219, 0d3FEBB67AE8584CAA; +add.f64 fd221, fd220, fd218; +sub.f64 fd222, fd218, fd220; +mul.f64 fd223, fd215, 0d3FE0000000000000; +sub.f64 fd224, fd210, fd223; +sub.f64 fd225, fd208, fd209; +mul.f64 fd226, fd225, 0d3FEBB67AE8584CAA; +sub.f64 fd227, fd224, fd226; +add.f64 fd228, fd226, fd224; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 3; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 16; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f64 {fd229, fd230}, [rd31]; +mul.f64 fd233, fd229, fd221; +mul.f64 fd234, fd230, fd227; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd229, fd227; 
+fma.rn.f64 fd237, fd230, fd221, fd236; +ld.global.v2.f64 {fd238, fd239}, [rd31+48]; +mul.f64 fd242, fd238, fd222; +mul.f64 fd243, fd239, fd228; +sub.f64 fd244, fd242, fd243; +mul.f64 fd245, fd238, fd228; +fma.rn.f64 fd246, fd239, fd222, fd245; +barrier.sync 0; +mad.lo.s32 r45, r40, 5832, r44; +st.shared.f64 [r45], fd214; +st.shared.f64 [r45+1944], fd235; +st.shared.f64 [r45+3888], fd244; +barrier.sync 0; +ld.shared.f64 fd247, [r11]; +ld.shared.f64 fd248, [r11+5832]; +ld.shared.f64 fd249, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r45], fd216; +st.shared.f64 [r45+1944], fd237; +st.shared.f64 [r45+3888], fd246; +barrier.sync 0; +ld.shared.f64 fd250, [r11]; +ld.shared.f64 fd251, [r11+5832]; +ld.shared.f64 fd252, [r11+11664]; +add.f64 fd253, fd248, fd249; +add.f64 fd254, fd251, fd252; +mul.f64 fd255, fd253, 0d3FE0000000000000; +sub.f64 fd256, fd247, fd255; +sub.f64 fd257, fd251, fd252; +mul.f64 fd258, fd257, 0d3FEBB67AE8584CAA; +mul.f64 fd259, fd254, 0d3FE0000000000000; +sub.f64 fd260, fd250, fd259; +sub.f64 fd261, fd248, fd249; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +add.f64 %0, fd247, fd253; +add.f64 %1, fd250, fd254; +add.f64 %2, fd258, fd256; +sub.f64 %3, fd260, fd262; +sub.f64 %4, fd256, fd258; +add.f64 %5, fd262, fd260; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_2187), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<530, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<46>; +.reg .f64 fd<305>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 34992, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %15, %18; +add.f64 fd14, %17, %19; +mul.f64 fd15, fd13, 
0d3FE0000000000000; +sub.f64 fd16, %13, fd15; +sub.f64 fd17, %17, %19; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %14, fd21; +sub.f64 fd23, %15, %18; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 34992, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+11664]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %14, fd14; +add.f64 fd42, %13, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r9+16], {fd44, fd43}; +fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r9+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+11664]; +ld.shared.v2.f64 {fd55, fd56}, [r11+23328]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0d3FEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; 
+mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd73, fd65; +mul.f64 fd78, fd74, fd71; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+3888]; +mul.f64 fd84, fd80, fd66; +mul.f64 fd85, fd81, fd72; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd74, fd65, fd79; +sub.f64 fd90, fd77, fd78; +st.shared.v2.f64 [r17+48], {fd90, fd89}; +fma.rn.f64 fd91, fd81, fd66, fd86; +sub.f64 fd92, fd84, fd85; +st.shared.v2.f64 [r17+96], {fd92, fd91}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+11664]; +ld.shared.v2.f64 {fd101, fd102}, [r11+23328]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd119, fd111; +mul.f64 fd124, fd120, fd117; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+1296]; +mul.f64 fd130, fd126, fd112; +mul.f64 fd131, fd127, fd118; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, fd120, fd111, fd125; +sub.f64 fd136, fd123, fd124; 
+st.shared.v2.f64 [r23+144], {fd136, fd135}; +fma.rn.f64 fd137, fd127, fd112, fd132; +sub.f64 fd138, fd130, fd131; +st.shared.v2.f64 [r23+288], {fd138, fd137}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+11664]; +ld.shared.v2.f64 {fd147, fd148}, [r11+23328]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0d3FEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd165, fd157; +mul.f64 fd170, fd166, fd163; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+432]; +mul.f64 fd176, fd172, fd158; +mul.f64 fd177, fd173, fd164; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd166, fd157, fd171; +sub.f64 fd182, fd169, fd170; +st.shared.v2.f64 [r33+432], {fd182, fd181}; +fma.rn.f64 fd183, fd173, fd158, fd178; +sub.f64 fd184, fd176, fd177; +st.shared.v2.f64 [r33+864], {fd184, fd183}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+11664]; +ld.shared.v2.f64 {fd193, fd194}, [r11+23328]; +add.f64 fd197, fd189, fd193; +add.f64 fd198, fd190, fd194; +mul.f64 
fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 0d3FEBB67AE8584CAA; +add.f64 fd203, fd202, fd200; +sub.f64 fd204, fd200, fd202; +mul.f64 fd205, fd198, 0d3FE0000000000000; +sub.f64 fd206, fd186, fd205; +sub.f64 fd207, fd189, fd193; +mul.f64 fd208, fd207, 0d3FEBB67AE8584CAA; +sub.f64 fd209, fd206, fd208; +add.f64 fd210, fd208, fd206; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 4; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd211, fd212}, [rd26]; +mul.f64 fd215, fd211, fd203; +mul.f64 fd216, fd212, fd209; +mul.f64 fd217, fd211, fd209; +ld.global.v2.f64 {fd218, fd219}, [rd26+144]; +mul.f64 fd222, fd218, fd204; +mul.f64 fd223, fd219, fd210; +mul.f64 fd224, fd218, fd210; +barrier.sync 0; +mad.lo.s32 r39, r34, 3888, r38; +add.f64 fd225, fd186, fd198; +add.f64 fd226, fd185, fd197; +st.shared.v2.f64 [r39], {fd226, fd225}; +fma.rn.f64 fd227, fd212, fd203, fd217; +sub.f64 fd228, fd215, fd216; +st.shared.v2.f64 [r39+1296], {fd228, fd227}; +fma.rn.f64 fd229, fd219, fd204, fd224; +sub.f64 fd230, fd222, fd223; +st.shared.v2.f64 [r39+2592], {fd230, fd229}; +barrier.sync 0; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+11664]; +ld.shared.v2.f64 {fd239, fd240}, [r11+23328]; +add.f64 fd243, fd235, fd239; +add.f64 fd244, fd236, fd240; +mul.f64 fd245, fd243, 0d3FE0000000000000; +sub.f64 fd246, fd231, fd245; +sub.f64 fd247, fd236, fd240; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +mul.f64 fd251, fd244, 0d3FE0000000000000; +sub.f64 fd252, fd232, fd251; +sub.f64 fd253, fd235, fd239; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; 
+cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 4; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 16; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f64 {fd257, fd258}, [rd31]; +mul.f64 fd261, fd257, fd249; +mul.f64 fd262, fd258, fd255; +mul.f64 fd263, fd257, fd255; +ld.global.v2.f64 {fd264, fd265}, [rd31+48]; +mul.f64 fd268, fd264, fd250; +mul.f64 fd269, fd265, fd256; +mul.f64 fd270, fd264, fd256; +barrier.sync 0; +mad.lo.s32 r45, r40, 11664, r44; +add.f64 fd271, fd232, fd244; +add.f64 fd272, fd231, fd243; +st.shared.v2.f64 [r45], {fd272, fd271}; +fma.rn.f64 fd273, fd258, fd249, fd263; +sub.f64 fd274, fd261, fd262; +st.shared.v2.f64 [r45+3888], {fd274, fd273}; +fma.rn.f64 fd275, fd265, fd250, fd270; +sub.f64 fd276, fd268, fd269; +st.shared.v2.f64 [r45+7776], {fd276, fd275}; +barrier.sync 0; +ld.shared.v2.f64 {fd277, fd278}, [r11]; +ld.shared.v2.f64 {fd281, fd282}, [r11+11664]; +ld.shared.v2.f64 {fd285, fd286}, [r11+23328]; +add.f64 fd289, fd281, fd285; +add.f64 fd290, fd282, fd286; +mul.f64 fd291, fd289, 0d3FE0000000000000; +sub.f64 fd292, fd277, fd291; +sub.f64 fd293, fd282, fd286; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +mul.f64 fd295, fd290, 0d3FE0000000000000; +sub.f64 fd296, fd278, fd295; +sub.f64 fd297, fd281, fd285; +mul.f64 fd298, fd297, 0d3FEBB67AE8584CAA; +add.f64 %1, fd278, fd290; +add.f64 %0, fd277, fd289; +sub.f64 %3, fd296, fd298; +add.f64 %2, fd294, fd292; +add.f64 %5, fd298, fd296; +sub.f64 %4, fd292, fd294; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_2187), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f0f34d9112964 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2187_fp64_inv.hpp.inc @@ -0,0 +1,2198 @@ +#ifndef CUFFTDX_FFT_2187_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_2187_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<699, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<775>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 34992, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; 
+mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 
fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 34992, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+3888]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 
fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; +st.shared.v2.f64 [r9+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, fd165, fd164; +st.shared.v2.f64 [r9+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r9+48], {fd220, fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; +sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r9+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r9+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r9+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r9+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r9+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+3888]; +ld.shared.v2.f64 {fd239, fd240}, [r11+7776]; +ld.shared.v2.f64 {fd243, fd244}, [r11+11664]; +ld.shared.v2.f64 {fd247, fd248}, [r11+15552]; +ld.shared.v2.f64 {fd251, fd252}, 
[r11+19440]; +ld.shared.v2.f64 {fd255, fd256}, [r11+23328]; +ld.shared.v2.f64 {fd259, fd260}, [r11+27216]; +ld.shared.v2.f64 {fd263, fd264}, [r11+31104]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0dBFEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0dBFEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0d3FE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0d3FE491B7523C161D, fd318; +mul.f64 fd320, fd307, 
0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0d3FD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0d3FD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0dBFEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 
fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0dBFEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd352, fd382; +mul.f64 fd386, fd350, fd382; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd368, fd392; +mul.f64 fd394, fd366, fd392; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd347, fd400; +mul.f64 fd402, fd341, fd400; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd363, fd408; +mul.f64 fd410, fd357, fd408; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+432]; +mul.f64 fd416, fd379, fd413; +mul.f64 fd417, fd373, fd413; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd348, fd423; +mul.f64 fd425, fd342, fd423; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd364, fd431; +mul.f64 fd433, fd358, fd431; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; 
+fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd380, fd439; +mul.f64 fd441, fd374, fd439; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; +fma.rn.f64 fd445, fd381, fd350, fd385; +sub.f64 fd446, fd387, fd386; +st.shared.v2.f64 [r17+144], {fd445, fd446}; +fma.rn.f64 fd447, fd390, fd366, fd393; +sub.f64 fd448, fd395, fd394; +st.shared.v2.f64 [r17+288], {fd447, fd448}; +fma.rn.f64 fd449, fd398, fd341, fd401; +sub.f64 fd450, fd403, fd402; +st.shared.v2.f64 [r17+432], {fd449, fd450}; +fma.rn.f64 fd451, fd406, fd357, fd409; +sub.f64 fd452, fd411, fd410; +st.shared.v2.f64 [r17+576], {fd451, fd452}; +fma.rn.f64 fd453, fd412, fd373, fd416; +sub.f64 fd454, fd418, fd417; +st.shared.v2.f64 [r17+720], {fd453, fd454}; +fma.rn.f64 fd455, fd421, fd342, fd424; +sub.f64 fd456, fd426, fd425; +st.shared.v2.f64 [r17+864], {fd455, fd456}; +fma.rn.f64 fd457, fd429, fd358, fd432; +sub.f64 fd458, fd434, fd433; +st.shared.v2.f64 [r17+1008], {fd457, fd458}; +fma.rn.f64 fd459, fd437, fd374, fd440; +sub.f64 fd460, fd442, fd441; +st.shared.v2.f64 [r17+1152], {fd459, fd460}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+3888]; +ld.shared.v2.f64 {fd469, fd470}, [r11+7776]; +ld.shared.v2.f64 {fd473, fd474}, [r11+11664]; +ld.shared.v2.f64 {fd477, fd478}, [r11+15552]; +ld.shared.v2.f64 {fd481, fd482}, [r11+19440]; +ld.shared.v2.f64 {fd485, fd486}, [r11+23328]; +ld.shared.v2.f64 {fd489, fd490}, [r11+27216]; +ld.shared.v2.f64 {fd493, fd494}, [r11+31104]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0dBFEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, 
fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0dBFEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0dBFEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0dBFEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, fd482, fd494; +mul.f64 fd536, fd535, 0dBFEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0d3FE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0d3FE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0d3FEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0d3FEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0d3FEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0d3FEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0d3FD5E3A8748A0BF5; 
+sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0d3FD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 0dBFEBB67AE8584CAA; +add.f64 fd571, fd570, fd568; +sub.f64 fd572, fd568, fd570; +mul.f64 fd573, fd566, 0d3FE0000000000000; +sub.f64 fd574, fd500, fd573; +sub.f64 fd575, fd514, fd530; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +sub.f64 fd577, fd574, fd576; +add.f64 fd578, fd576, fd574; +add.f64 fd579, fd547, fd552; +add.f64 fd580, fd505, fd579; +add.f64 fd581, fd549, fd554; +add.f64 fd582, fd511, fd581; +mul.f64 fd583, fd579, 0d3FE0000000000000; +sub.f64 fd584, fd505, fd583; +sub.f64 fd585, fd549, fd554; +mul.f64 fd586, fd585, 0dBFEBB67AE8584CAA; +add.f64 fd587, fd586, fd584; +sub.f64 fd588, fd584, fd586; +mul.f64 fd589, fd581, 0d3FE0000000000000; +sub.f64 fd590, fd511, fd589; +sub.f64 fd591, fd547, fd552; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +sub.f64 fd593, fd590, fd592; +add.f64 fd594, fd592, fd590; +add.f64 fd595, fd557, fd562; +add.f64 fd596, fd506, fd595; +add.f64 fd597, fd559, fd564; +add.f64 fd598, fd512, fd597; +mul.f64 fd599, fd595, 0d3FE0000000000000; +sub.f64 fd600, fd506, fd599; +sub.f64 fd601, fd559, fd564; +mul.f64 fd602, fd601, 0dBFEBB67AE8584CAA; +add.f64 fd603, fd602, fd600; +sub.f64 fd604, fd600, fd602; +mul.f64 fd605, fd597, 0d3FE0000000000000; +sub.f64 fd606, fd512, fd605; +sub.f64 fd607, fd557, fd562; +mul.f64 fd608, fd607, 0dBFEBB67AE8584CAA; +sub.f64 fd609, fd606, fd608; +add.f64 fd610, fd608, fd606; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd611, fd612}, [rd16]; +mul.f64 fd615, fd582, fd612; +mul.f64 fd616, fd580, fd612; +mul.f64 fd617, fd611, 
fd582; +mul.f64 fd618, fd611, fd611; +mul.f64 fd619, fd612, fd612; +sub.f64 fd620, fd618, fd619; +mul.f64 fd621, fd612, fd611; +fma.rn.f64 fd622, fd612, fd611, fd621; +mul.f64 fd623, fd598, fd622; +mul.f64 fd624, fd596, fd622; +mul.f64 fd625, fd620, fd598; +mul.f64 fd626, fd611, fd620; +mul.f64 fd627, fd612, fd622; +sub.f64 fd628, fd626, fd627; +mul.f64 fd629, fd611, fd622; +fma.rn.f64 fd630, fd612, fd620, fd629; +mul.f64 fd631, fd577, fd630; +mul.f64 fd632, fd571, fd630; +mul.f64 fd633, fd628, fd577; +mul.f64 fd634, fd611, fd628; +mul.f64 fd635, fd612, fd630; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd611, fd630; +fma.rn.f64 fd638, fd612, fd628, fd637; +mul.f64 fd639, fd593, fd638; +mul.f64 fd640, fd587, fd638; +mul.f64 fd641, fd636, fd593; +ld.global.v2.f64 {fd642, fd643}, [rd16+48]; +mul.f64 fd646, fd609, fd643; +mul.f64 fd647, fd603, fd643; +mul.f64 fd648, fd642, fd609; +mul.f64 fd649, fd611, fd642; +mul.f64 fd650, fd612, fd643; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd611, fd643; +fma.rn.f64 fd653, fd612, fd642, fd652; +mul.f64 fd654, fd578, fd653; +mul.f64 fd655, fd572, fd653; +mul.f64 fd656, fd651, fd578; +mul.f64 fd657, fd611, fd651; +mul.f64 fd658, fd612, fd653; +sub.f64 fd659, fd657, fd658; +mul.f64 fd660, fd611, fd653; +fma.rn.f64 fd661, fd612, fd651, fd660; +mul.f64 fd662, fd594, fd661; +mul.f64 fd663, fd588, fd661; +mul.f64 fd664, fd659, fd594; +mul.f64 fd665, fd611, fd659; +mul.f64 fd666, fd612, fd661; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd611, fd661; +fma.rn.f64 fd669, fd612, fd659, fd668; +mul.f64 fd670, fd610, fd669; +mul.f64 fd671, fd604, fd669; +mul.f64 fd672, fd667, fd610; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 11664, r22; +add.f64 fd673, fd500, fd566; +add.f64 fd674, fd498, fd565; +st.shared.v2.f64 [r23], {fd674, fd673}; +fma.rn.f64 fd675, fd611, fd580, fd615; +sub.f64 fd676, fd617, fd616; +st.shared.v2.f64 [r23+1296], {fd675, fd676}; +fma.rn.f64 fd677, fd620, fd596, fd623; 
+sub.f64 fd678, fd625, fd624; +st.shared.v2.f64 [r23+2592], {fd677, fd678}; +fma.rn.f64 fd679, fd628, fd571, fd631; +sub.f64 fd680, fd633, fd632; +st.shared.v2.f64 [r23+3888], {fd679, fd680}; +fma.rn.f64 fd681, fd636, fd587, fd639; +sub.f64 fd682, fd641, fd640; +st.shared.v2.f64 [r23+5184], {fd681, fd682}; +fma.rn.f64 fd683, fd642, fd603, fd646; +sub.f64 fd684, fd648, fd647; +st.shared.v2.f64 [r23+6480], {fd683, fd684}; +fma.rn.f64 fd685, fd651, fd572, fd654; +sub.f64 fd686, fd656, fd655; +st.shared.v2.f64 [r23+7776], {fd685, fd686}; +fma.rn.f64 fd687, fd659, fd588, fd662; +sub.f64 fd688, fd664, fd663; +st.shared.v2.f64 [r23+9072], {fd687, fd688}; +fma.rn.f64 fd689, fd667, fd604, fd670; +sub.f64 fd690, fd672, fd671; +st.shared.v2.f64 [r23+10368], {fd689, fd690}; +barrier.sync 0; +ld.shared.v2.f64 {fd691, fd692}, [r11]; +ld.shared.v2.f64 {fd695, fd696}, [r11+3888]; +ld.shared.v2.f64 {fd699, fd700}, [r11+7776]; +ld.shared.v2.f64 {fd703, fd704}, [r11+11664]; +ld.shared.v2.f64 {fd707, fd708}, [r11+15552]; +ld.shared.v2.f64 {fd711, fd712}, [r11+19440]; +ld.shared.v2.f64 {fd715, fd716}, [r11+23328]; +ld.shared.v2.f64 {fd719, fd720}, [r11+27216]; +ld.shared.v2.f64 {fd723, fd724}, [r11+31104]; +add.f64 fd727, fd703, fd715; +add.f64 fd728, fd704, fd716; +mul.f64 fd729, fd727, 0d3FE0000000000000; +sub.f64 fd730, fd691, fd729; +sub.f64 fd731, fd704, fd716; +mul.f64 fd732, fd731, 0dBFEBB67AE8584CAA; +mul.f64 fd733, fd728, 0d3FE0000000000000; +sub.f64 fd734, fd692, fd733; +sub.f64 fd735, fd703, fd715; +mul.f64 fd736, fd735, 0dBFEBB67AE8584CAA; +add.f64 fd737, fd707, fd719; +add.f64 fd738, fd708, fd720; +mul.f64 fd739, fd737, 0d3FE0000000000000; +sub.f64 fd740, fd695, fd739; +sub.f64 fd741, fd708, fd720; +mul.f64 fd742, fd741, 0dBFEBB67AE8584CAA; +mul.f64 fd743, fd738, 0d3FE0000000000000; +sub.f64 fd744, fd696, fd743; +sub.f64 fd745, fd707, fd719; +mul.f64 fd746, fd745, 0dBFEBB67AE8584CAA; +add.f64 fd747, fd711, fd723; +add.f64 fd748, fd712, fd724; +mul.f64 fd749, fd747, 
0d3FE0000000000000; +sub.f64 fd750, fd699, fd749; +sub.f64 fd751, fd712, fd724; +mul.f64 fd752, fd751, 0dBFEBB67AE8584CAA; +mul.f64 fd753, fd748, 0d3FE0000000000000; +sub.f64 fd754, fd700, fd753; +sub.f64 fd755, fd711, fd723; +mul.f64 fd756, fd755, 0dBFEBB67AE8584CAA; +add.f64 %1, fd692, fd728; +add.f64 %0, fd691, fd727; +add.f64 %3, fd696, fd738; +add.f64 %2, fd695, fd737; +add.f64 %5, fd700, fd748; +add.f64 %4, fd699, fd747; +sub.f64 %7, fd734, fd736; +add.f64 %6, fd732, fd730; +sub.f64 %9, fd744, fd746; +add.f64 %8, fd742, fd740; +sub.f64 %11, fd754, fd756; +add.f64 %10, fd752, fd750; +add.f64 %13, fd736, fd734; +sub.f64 %12, fd730, fd732; +add.f64 %15, fd746, fd744; +sub.f64 %14, fd740, fd742; +add.f64 %17, fd756, fd754; +sub.f64 %16, fd750, fd752; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_2187), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<698, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<721>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 17496, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, 
%39; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 
0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 fd159, fd122, fd154; +mul.f64 fd160, fd153, 
fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+3888]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 fd220, fd219, fd218; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, 
fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r8, r5, 17496, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd158; +st.shared.f64 [r9+16], fd168; +st.shared.f64 [r9+24], fd178; +st.shared.f64 [r9+32], fd188; +st.shared.f64 [r9+40], fd197; +st.shared.f64 [r9+48], fd207; +st.shared.f64 [r9+56], fd217; +st.shared.f64 [r9+64], fd227; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+1944]; +ld.shared.f64 fd233, [r11+3888]; +ld.shared.f64 fd234, [r11+5832]; +ld.shared.f64 fd235, [r11+7776]; +ld.shared.f64 fd236, [r11+9720]; +ld.shared.f64 fd237, [r11+11664]; +ld.shared.f64 fd238, [r11+13608]; +ld.shared.f64 fd239, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+1944]; +ld.shared.f64 fd242, [r11+3888]; +ld.shared.f64 fd243, [r11+5832]; +ld.shared.f64 fd244, [r11+7776]; +ld.shared.f64 fd245, [r11+9720]; +ld.shared.f64 fd246, [r11+11664]; +ld.shared.f64 fd247, [r11+13608]; +ld.shared.f64 fd248, [r11+15552]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; 
+add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0dBFEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0d3FE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0d3FE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0d3FEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0d3FEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0d3FEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0d3FEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0d3FD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0d3FD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, 
fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0dBFEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0dBFEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd336, fd366; +fma.rn.f64 fd370, fd365, fd334, fd369; +mul.f64 fd371, fd334, fd366; +mul.f64 fd372, fd365, fd336; +sub.f64 fd373, fd372, fd371; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, 
fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd352, fd378; +fma.rn.f64 fd380, fd376, fd350, fd379; +mul.f64 fd381, fd350, fd378; +mul.f64 fd382, fd376, fd352; +sub.f64 fd383, fd382, fd381; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd331, fd388; +fma.rn.f64 fd390, fd386, fd325, fd389; +mul.f64 fd391, fd325, fd388; +mul.f64 fd392, fd386, fd331; +sub.f64 fd393, fd392, fd391; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd347, fd398; +fma.rn.f64 fd400, fd396, fd341, fd399; +mul.f64 fd401, fd341, fd398; +mul.f64 fd402, fd396, fd347; +sub.f64 fd403, fd402, fd401; +ld.global.v2.f64 {fd404, fd405}, [rd11+432]; +mul.f64 fd408, fd363, fd405; +fma.rn.f64 fd409, fd404, fd357, fd408; +mul.f64 fd410, fd357, fd405; +mul.f64 fd411, fd404, fd363; +sub.f64 fd412, fd411, fd410; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd332, fd417; +fma.rn.f64 fd419, fd415, fd326, fd418; +mul.f64 fd420, fd326, fd417; +mul.f64 fd421, fd415, fd332; +sub.f64 fd422, fd421, fd420; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd348, fd427; +fma.rn.f64 fd429, fd425, fd342, fd428; +mul.f64 fd430, fd342, fd427; +mul.f64 fd431, fd425, fd348; +sub.f64 fd432, fd431, fd430; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd364, fd437; +fma.rn.f64 fd439, fd435, fd358, fd438; +mul.f64 fd440, 
fd358, fd437; +mul.f64 fd441, fd435, fd364; +sub.f64 fd442, fd441, fd440; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd370; +st.shared.f64 [r17+144], fd380; +st.shared.f64 [r17+216], fd390; +st.shared.f64 [r17+288], fd400; +st.shared.f64 [r17+360], fd409; +st.shared.f64 [r17+432], fd419; +st.shared.f64 [r17+504], fd429; +st.shared.f64 [r17+576], fd439; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+1944]; +ld.shared.f64 fd445, [r11+3888]; +ld.shared.f64 fd446, [r11+5832]; +ld.shared.f64 fd447, [r11+7776]; +ld.shared.f64 fd448, [r11+9720]; +ld.shared.f64 fd449, [r11+11664]; +ld.shared.f64 fd450, [r11+13608]; +ld.shared.f64 fd451, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+1944]; +ld.shared.f64 fd454, [r11+3888]; +ld.shared.f64 fd455, [r11+5832]; +ld.shared.f64 fd456, [r11+7776]; +ld.shared.f64 fd457, [r11+9720]; +ld.shared.f64 fd458, [r11+11664]; +ld.shared.f64 fd459, [r11+13608]; +ld.shared.f64 fd460, [r11+15552]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0dBFEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, fd449; +mul.f64 fd474, fd473, 0dBFEBB67AE8584CAA; +sub.f64 fd475, fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, 
fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; +mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0dBFEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0dBFEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0dBFEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0dBFEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0d3FE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0d3FE491B7523C161D, fd512; +mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0d3FEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0d3FEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0d3FEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0d3FEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0d3FD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 0d3FD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 fd530, fd462, fd529; +add.f64 fd531, fd480, fd496; +add.f64 fd532, fd464, fd531; +mul.f64 fd533, 
fd529, 0d3FE0000000000000; +sub.f64 fd534, fd462, fd533; +sub.f64 fd535, fd480, fd496; +mul.f64 fd536, fd535, 0dBFEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd464, fd539; +sub.f64 fd541, fd478, fd494; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +add.f64 fd545, fd511, fd516; +add.f64 fd546, fd469, fd545; +add.f64 fd547, fd513, fd518; +add.f64 fd548, fd475, fd547; +mul.f64 fd549, fd545, 0d3FE0000000000000; +sub.f64 fd550, fd469, fd549; +sub.f64 fd551, fd513, fd518; +mul.f64 fd552, fd551, 0dBFEBB67AE8584CAA; +add.f64 fd553, fd552, fd550; +sub.f64 fd554, fd550, fd552; +mul.f64 fd555, fd547, 0d3FE0000000000000; +sub.f64 fd556, fd475, fd555; +sub.f64 fd557, fd511, fd516; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +sub.f64 fd559, fd556, fd558; +add.f64 fd560, fd558, fd556; +add.f64 fd561, fd521, fd526; +add.f64 fd562, fd470, fd561; +add.f64 fd563, fd523, fd528; +add.f64 fd564, fd476, fd563; +mul.f64 fd565, fd561, 0d3FE0000000000000; +sub.f64 fd566, fd470, fd565; +sub.f64 fd567, fd523, fd528; +mul.f64 fd568, fd567, 0dBFEBB67AE8584CAA; +add.f64 fd569, fd568, fd566; +sub.f64 fd570, fd566, fd568; +mul.f64 fd571, fd563, 0d3FE0000000000000; +sub.f64 fd572, fd476, fd571; +sub.f64 fd573, fd521, fd526; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +sub.f64 fd575, fd572, fd574; +add.f64 fd576, fd574, fd572; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd577, fd578}, [rd16]; +mul.f64 fd581, fd548, fd578; +fma.rn.f64 fd582, fd577, fd546, fd581; +mul.f64 fd583, fd546, fd578; +mul.f64 fd584, fd577, fd548; +sub.f64 fd585, fd584, fd583; +mul.f64 fd586, fd577, fd577; +mul.f64 fd587, fd578, fd578; +sub.f64 fd588, fd586, fd587; +mul.f64 fd589, fd578, fd577; 
+fma.rn.f64 fd590, fd578, fd577, fd589; +mul.f64 fd591, fd564, fd590; +fma.rn.f64 fd592, fd588, fd562, fd591; +mul.f64 fd593, fd562, fd590; +mul.f64 fd594, fd588, fd564; +sub.f64 fd595, fd594, fd593; +mul.f64 fd596, fd577, fd588; +mul.f64 fd597, fd578, fd590; +sub.f64 fd598, fd596, fd597; +mul.f64 fd599, fd577, fd590; +fma.rn.f64 fd600, fd578, fd588, fd599; +mul.f64 fd601, fd543, fd600; +fma.rn.f64 fd602, fd598, fd537, fd601; +mul.f64 fd603, fd537, fd600; +mul.f64 fd604, fd598, fd543; +sub.f64 fd605, fd604, fd603; +mul.f64 fd606, fd577, fd598; +mul.f64 fd607, fd578, fd600; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd577, fd600; +fma.rn.f64 fd610, fd578, fd598, fd609; +mul.f64 fd611, fd559, fd610; +fma.rn.f64 fd612, fd608, fd553, fd611; +mul.f64 fd613, fd553, fd610; +mul.f64 fd614, fd608, fd559; +sub.f64 fd615, fd614, fd613; +ld.global.v2.f64 {fd616, fd617}, [rd16+48]; +mul.f64 fd620, fd575, fd617; +fma.rn.f64 fd621, fd616, fd569, fd620; +mul.f64 fd622, fd569, fd617; +mul.f64 fd623, fd616, fd575; +sub.f64 fd624, fd623, fd622; +mul.f64 fd625, fd577, fd616; +mul.f64 fd626, fd578, fd617; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd577, fd617; +fma.rn.f64 fd629, fd578, fd616, fd628; +mul.f64 fd630, fd544, fd629; +fma.rn.f64 fd631, fd627, fd538, fd630; +mul.f64 fd632, fd538, fd629; +mul.f64 fd633, fd627, fd544; +sub.f64 fd634, fd633, fd632; +mul.f64 fd635, fd577, fd627; +mul.f64 fd636, fd578, fd629; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd577, fd629; +fma.rn.f64 fd639, fd578, fd627, fd638; +mul.f64 fd640, fd560, fd639; +fma.rn.f64 fd641, fd637, fd554, fd640; +mul.f64 fd642, fd554, fd639; +mul.f64 fd643, fd637, fd560; +sub.f64 fd644, fd643, fd642; +mul.f64 fd645, fd577, fd637; +mul.f64 fd646, fd578, fd639; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd577, fd639; +fma.rn.f64 fd649, fd578, fd637, fd648; +mul.f64 fd650, fd576, fd649; +fma.rn.f64 fd651, fd647, fd570, fd650; +mul.f64 fd652, fd570, fd649; +mul.f64 fd653, fd647, fd576; +sub.f64 fd654, fd653, 
fd652; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +st.shared.f64 [r23], fd530; +st.shared.f64 [r23+648], fd582; +st.shared.f64 [r23+1296], fd592; +st.shared.f64 [r23+1944], fd602; +st.shared.f64 [r23+2592], fd612; +st.shared.f64 [r23+3240], fd621; +st.shared.f64 [r23+3888], fd631; +st.shared.f64 [r23+4536], fd641; +st.shared.f64 [r23+5184], fd651; +barrier.sync 0; +ld.shared.f64 fd655, [r11]; +ld.shared.f64 fd656, [r11+1944]; +ld.shared.f64 fd657, [r11+3888]; +ld.shared.f64 fd658, [r11+5832]; +ld.shared.f64 fd659, [r11+7776]; +ld.shared.f64 fd660, [r11+9720]; +ld.shared.f64 fd661, [r11+11664]; +ld.shared.f64 fd662, [r11+13608]; +ld.shared.f64 fd663, [r11+15552]; +barrier.sync 0; +st.shared.f64 [r23], fd532; +st.shared.f64 [r23+648], fd585; +st.shared.f64 [r23+1296], fd595; +st.shared.f64 [r23+1944], fd605; +st.shared.f64 [r23+2592], fd615; +st.shared.f64 [r23+3240], fd624; +st.shared.f64 [r23+3888], fd634; +st.shared.f64 [r23+4536], fd644; +st.shared.f64 [r23+5184], fd654; +barrier.sync 0; +ld.shared.f64 fd664, [r11]; +ld.shared.f64 fd665, [r11+1944]; +ld.shared.f64 fd666, [r11+3888]; +ld.shared.f64 fd667, [r11+5832]; +ld.shared.f64 fd668, [r11+7776]; +ld.shared.f64 fd669, [r11+9720]; +ld.shared.f64 fd670, [r11+11664]; +ld.shared.f64 fd671, [r11+13608]; +ld.shared.f64 fd672, [r11+15552]; +add.f64 fd673, fd658, fd661; +add.f64 fd674, fd667, fd670; +mul.f64 fd675, fd673, 0d3FE0000000000000; +sub.f64 fd676, fd655, fd675; +sub.f64 fd677, fd667, fd670; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +mul.f64 fd679, fd674, 0d3FE0000000000000; +sub.f64 fd680, fd664, fd679; +sub.f64 fd681, fd658, fd661; +mul.f64 fd682, fd681, 0dBFEBB67AE8584CAA; +add.f64 fd683, fd659, fd662; +add.f64 fd684, fd668, fd671; +mul.f64 fd685, fd683, 0d3FE0000000000000; +sub.f64 fd686, fd656, fd685; +sub.f64 fd687, fd668, fd671; +mul.f64 fd688, fd687, 0dBFEBB67AE8584CAA; +mul.f64 fd689, fd684, 0d3FE0000000000000; +sub.f64 fd690, fd665, fd689; 
+sub.f64 fd691, fd659, fd662; +mul.f64 fd692, fd691, 0dBFEBB67AE8584CAA; +add.f64 fd693, fd660, fd663; +add.f64 fd694, fd669, fd672; +mul.f64 fd695, fd693, 0d3FE0000000000000; +sub.f64 fd696, fd657, fd695; +sub.f64 fd697, fd669, fd672; +mul.f64 fd698, fd697, 0dBFEBB67AE8584CAA; +mul.f64 fd699, fd694, 0d3FE0000000000000; +sub.f64 fd700, fd666, fd699; +sub.f64 fd701, fd660, fd663; +mul.f64 fd702, fd701, 0dBFEBB67AE8584CAA; +add.f64 %0, fd655, fd673; +add.f64 %1, fd664, fd674; +add.f64 %2, fd656, fd683; +add.f64 %3, fd665, fd684; +add.f64 %4, fd657, fd693; +add.f64 %5, fd666, fd694; +add.f64 %6, fd678, fd676; +sub.f64 %7, fd680, fd682; +add.f64 %8, fd688, fd686; +sub.f64 %9, fd690, fd692; +add.f64 %10, fd698, fd696; +sub.f64 %11, fd700, fd702; +sub.f64 %12, fd676, fd678; +add.f64 %13, fd682, fd680; +sub.f64 %14, fd686, fd688; +add.f64 %15, fd692, fd690; +sub.f64 %16, fd696, fd698; +add.f64 %17, fd702, fd700; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_2187), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<700, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<46>; +.reg .f64 fd<269>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 17496, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, 
%15, %18; +add.f64 fd14, %13, fd13; +add.f64 fd15, %17, %19; +add.f64 fd16, %14, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %13, fd17; +sub.f64 fd19, %17, %19; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %14, fd23; +sub.f64 fd25, %15, %18; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 17496, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+11664]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd34; +st.shared.f64 [r9+16], fd43; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+5832]; +ld.shared.f64 fd49, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+5832]; +ld.shared.f64 fd52, [r11+11664]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; 
+sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd67, fd70; +fma.rn.f64 fd74, fd69, fd61, fd73; +mul.f64 fd75, fd61, fd70; +mul.f64 fd76, fd69, fd67; +sub.f64 fd77, fd76, fd75; +ld.global.v2.f64 {fd78, fd79}, [rd11+3888]; +mul.f64 fd82, fd68, fd79; +fma.rn.f64 fd83, fd78, fd62, fd82; +mul.f64 fd84, fd62, fd79; +mul.f64 fd85, fd78, fd68; +sub.f64 fd86, fd85, fd84; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd74; +st.shared.f64 [r17+48], fd83; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+5832]; +ld.shared.f64 fd89, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+5832]; +ld.shared.f64 fd92, [r11+11664]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0dBFEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0dBFEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd107, fd110; +fma.rn.f64 fd114, fd109, 
fd101, fd113; +mul.f64 fd115, fd101, fd110; +mul.f64 fd116, fd109, fd107; +sub.f64 fd117, fd116, fd115; +ld.global.v2.f64 {fd118, fd119}, [rd16+1296]; +mul.f64 fd122, fd108, fd119; +fma.rn.f64 fd123, fd118, fd102, fd122; +mul.f64 fd124, fd102, fd119; +mul.f64 fd125, fd118, fd108; +sub.f64 fd126, fd125, fd124; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd114; +st.shared.f64 [r23+144], fd123; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+5832]; +ld.shared.f64 fd129, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; +st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+5832]; +ld.shared.f64 fd132, [r11+11664]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0dBFEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, fd129; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd147, fd150; +fma.rn.f64 fd154, fd149, fd141, fd153; +mul.f64 fd155, fd141, fd150; +mul.f64 fd156, fd149, fd147; +sub.f64 fd157, fd156, fd155; +ld.global.v2.f64 {fd158, fd159}, [rd21+432]; +mul.f64 fd162, fd148, fd159; +fma.rn.f64 fd163, fd158, fd142, fd162; +mul.f64 
fd164, fd142, fd159; +mul.f64 fd165, fd158, fd148; +sub.f64 fd166, fd165, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd154; +st.shared.f64 [r33+432], fd163; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+5832]; +ld.shared.f64 fd169, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+5832]; +ld.shared.f64 fd172, [r11+11664]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd167, fd173; +add.f64 fd175, fd171, fd172; +add.f64 fd176, fd170, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd167, fd177; +sub.f64 fd179, fd171, fd172; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd170, fd183; +sub.f64 fd185, fd168, fd169; +mul.f64 fd186, fd185, 0dBFEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; +add.f64 fd188, fd186, fd184; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd189, fd190}, [rd26]; +mul.f64 fd193, fd187, fd190; +fma.rn.f64 fd194, fd189, fd181, fd193; +mul.f64 fd195, fd181, fd190; +mul.f64 fd196, fd189, fd187; +sub.f64 fd197, fd196, fd195; +ld.global.v2.f64 {fd198, fd199}, [rd26+144]; +mul.f64 fd202, fd188, fd199; +fma.rn.f64 fd203, fd198, fd182, fd202; +mul.f64 fd204, fd182, fd199; +mul.f64 fd205, fd198, fd188; +sub.f64 fd206, fd205, fd204; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +st.shared.f64 [r39], fd174; +st.shared.f64 [r39+648], fd194; +st.shared.f64 [r39+1296], fd203; +barrier.sync 0; +ld.shared.f64 fd207, [r11]; +ld.shared.f64 fd208, [r11+5832]; 
+ld.shared.f64 fd209, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r39], fd176; +st.shared.f64 [r39+648], fd197; +st.shared.f64 [r39+1296], fd206; +barrier.sync 0; +ld.shared.f64 fd210, [r11]; +ld.shared.f64 fd211, [r11+5832]; +ld.shared.f64 fd212, [r11+11664]; +add.f64 fd213, fd208, fd209; +add.f64 fd214, fd207, fd213; +add.f64 fd215, fd211, fd212; +add.f64 fd216, fd210, fd215; +mul.f64 fd217, fd213, 0d3FE0000000000000; +sub.f64 fd218, fd207, fd217; +sub.f64 fd219, fd211, fd212; +mul.f64 fd220, fd219, 0dBFEBB67AE8584CAA; +add.f64 fd221, fd220, fd218; +sub.f64 fd222, fd218, fd220; +mul.f64 fd223, fd215, 0d3FE0000000000000; +sub.f64 fd224, fd210, fd223; +sub.f64 fd225, fd208, fd209; +mul.f64 fd226, fd225, 0dBFEBB67AE8584CAA; +sub.f64 fd227, fd224, fd226; +add.f64 fd228, fd226, fd224; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 3; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 16; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f64 {fd229, fd230}, [rd31]; +mul.f64 fd233, fd227, fd230; +fma.rn.f64 fd234, fd229, fd221, fd233; +mul.f64 fd235, fd221, fd230; +mul.f64 fd236, fd229, fd227; +sub.f64 fd237, fd236, fd235; +ld.global.v2.f64 {fd238, fd239}, [rd31+48]; +mul.f64 fd242, fd228, fd239; +fma.rn.f64 fd243, fd238, fd222, fd242; +mul.f64 fd244, fd222, fd239; +mul.f64 fd245, fd238, fd228; +sub.f64 fd246, fd245, fd244; +barrier.sync 0; +mad.lo.s32 r45, r40, 5832, r44; +st.shared.f64 [r45], fd214; +st.shared.f64 [r45+1944], fd234; +st.shared.f64 [r45+3888], fd243; +barrier.sync 0; +ld.shared.f64 fd247, [r11]; +ld.shared.f64 fd248, [r11+5832]; +ld.shared.f64 fd249, [r11+11664]; +barrier.sync 0; +st.shared.f64 [r45], fd216; +st.shared.f64 [r45+1944], fd237; +st.shared.f64 [r45+3888], fd246; +barrier.sync 0; +ld.shared.f64 fd250, [r11]; +ld.shared.f64 fd251, [r11+5832]; +ld.shared.f64 fd252, [r11+11664]; +add.f64 fd253, fd248, fd249; +add.f64 fd254, 
fd251, fd252; +mul.f64 fd255, fd253, 0d3FE0000000000000; +sub.f64 fd256, fd247, fd255; +sub.f64 fd257, fd251, fd252; +mul.f64 fd258, fd257, 0dBFEBB67AE8584CAA; +mul.f64 fd259, fd254, 0d3FE0000000000000; +sub.f64 fd260, fd250, fd259; +sub.f64 fd261, fd248, fd249; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +add.f64 %0, fd247, fd253; +add.f64 %1, fd250, fd254; +add.f64 %2, fd258, fd256; +sub.f64 %3, fd260, fd262; +sub.f64 %4, fd256, fd258; +add.f64 %5, fd262, fd260; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_2187), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<701, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<46>; +.reg .f64 fd<305>; +.reg .b64 rd<32>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 34992, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %15, %18; +add.f64 fd14, %17, %19; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %13, fd15; +sub.f64 fd17, %17, %19; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %14, fd21; +sub.f64 fd23, %15, %18; +mul.f64 fd24, fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 34992, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+11664]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, fd20, fd35; 
+mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %14, fd14; +add.f64 fd42, %13, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r9+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r9+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+11664]; +ld.shared.v2.f64 {fd55, fd56}, [r11+23328]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0dBFEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd71, fd74; +mul.f64 fd78, fd65, fd74; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+3888]; +mul.f64 fd84, fd72, fd81; +mul.f64 fd85, fd66, fd81; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd73, fd65, fd77; +sub.f64 fd90, fd79, fd78; +st.shared.v2.f64 [r17+48], {fd89, fd90}; +fma.rn.f64 fd91, fd80, fd66, fd84; +sub.f64 fd92, fd86, fd85; +st.shared.v2.f64 [r17+96], {fd91, fd92}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+11664]; +ld.shared.v2.f64 {fd101, fd102}, [r11+23328]; +add.f64 
fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd117, fd120; +mul.f64 fd124, fd111, fd120; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+1296]; +mul.f64 fd130, fd118, fd127; +mul.f64 fd131, fd112, fd127; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, fd119, fd111, fd123; +sub.f64 fd136, fd125, fd124; +st.shared.v2.f64 [r23+144], {fd135, fd136}; +fma.rn.f64 fd137, fd126, fd112, fd130; +sub.f64 fd138, fd132, fd131; +st.shared.v2.f64 [r23+288], {fd137, fd138}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+11664]; +ld.shared.v2.f64 {fd147, fd148}, [r11+23328]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0dBFEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 rd17, 
r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd163, fd166; +mul.f64 fd170, fd157, fd166; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+432]; +mul.f64 fd176, fd164, fd173; +mul.f64 fd177, fd158, fd173; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd165, fd157, fd169; +sub.f64 fd182, fd171, fd170; +st.shared.v2.f64 [r33+432], {fd181, fd182}; +fma.rn.f64 fd183, fd172, fd158, fd176; +sub.f64 fd184, fd178, fd177; +st.shared.v2.f64 [r33+864], {fd183, fd184}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+11664]; +ld.shared.v2.f64 {fd193, fd194}, [r11+23328]; +add.f64 fd197, fd189, fd193; +add.f64 fd198, fd190, fd194; +mul.f64 fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 0dBFEBB67AE8584CAA; +add.f64 fd203, fd202, fd200; +sub.f64 fd204, fd200, fd202; +mul.f64 fd205, fd198, 0d3FE0000000000000; +sub.f64 fd206, fd186, fd205; +sub.f64 fd207, fd189, fd193; +mul.f64 fd208, fd207, 0dBFEBB67AE8584CAA; +sub.f64 fd209, fd206, fd208; +add.f64 fd210, fd208, fd206; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 4; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd211, fd212}, [rd26]; +mul.f64 fd215, fd209, fd212; +mul.f64 fd216, fd203, fd212; +mul.f64 fd217, fd211, fd209; 
+ld.global.v2.f64 {fd218, fd219}, [rd26+144]; +mul.f64 fd222, fd210, fd219; +mul.f64 fd223, fd204, fd219; +mul.f64 fd224, fd218, fd210; +barrier.sync 0; +mad.lo.s32 r39, r34, 3888, r38; +add.f64 fd225, fd186, fd198; +add.f64 fd226, fd185, fd197; +st.shared.v2.f64 [r39], {fd226, fd225}; +fma.rn.f64 fd227, fd211, fd203, fd215; +sub.f64 fd228, fd217, fd216; +st.shared.v2.f64 [r39+1296], {fd227, fd228}; +fma.rn.f64 fd229, fd218, fd204, fd222; +sub.f64 fd230, fd224, fd223; +st.shared.v2.f64 [r39+2592], {fd229, fd230}; +barrier.sync 0; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+11664]; +ld.shared.v2.f64 {fd239, fd240}, [r11+23328]; +add.f64 fd243, fd235, fd239; +add.f64 fd244, fd236, fd240; +mul.f64 fd245, fd243, 0d3FE0000000000000; +sub.f64 fd246, fd231, fd245; +sub.f64 fd247, fd236, fd240; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +mul.f64 fd251, fd244, 0d3FE0000000000000; +sub.f64 fd252, fd232, fd251; +sub.f64 fd253, fd235, fd239; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +mul.wide.u32 rd27, r7, -2032597691; +shr.u64 rd28, rd27, 39; +cvt.u32.u64 r40, rd28; +mul.lo.s32 r41, r40, 243; +sub.s32 r42, r7, r41; +shl.b32 r43, r42, 4; +add.s32 r44, r8, r43; +mul.wide.u32 rd29, r40, 16; +mov.u64 rd30, %12; +add.s64 rd31, rd30, rd29; +ld.global.v2.f64 {fd257, fd258}, [rd31]; +mul.f64 fd261, fd255, fd258; +mul.f64 fd262, fd249, fd258; +mul.f64 fd263, fd257, fd255; +ld.global.v2.f64 {fd264, fd265}, [rd31+48]; +mul.f64 fd268, fd256, fd265; +mul.f64 fd269, fd250, fd265; +mul.f64 fd270, fd264, fd256; +barrier.sync 0; +mad.lo.s32 r45, r40, 11664, r44; +add.f64 fd271, fd232, fd244; +add.f64 fd272, fd231, fd243; +st.shared.v2.f64 [r45], {fd272, fd271}; +fma.rn.f64 fd273, fd257, fd249, fd261; +sub.f64 fd274, fd263, fd262; +st.shared.v2.f64 [r45+3888], {fd273, fd274}; +fma.rn.f64 fd275, fd264, fd250, fd268; +sub.f64 fd276, fd270, 
fd269; +st.shared.v2.f64 [r45+7776], {fd275, fd276}; +barrier.sync 0; +ld.shared.v2.f64 {fd277, fd278}, [r11]; +ld.shared.v2.f64 {fd281, fd282}, [r11+11664]; +ld.shared.v2.f64 {fd285, fd286}, [r11+23328]; +add.f64 fd289, fd281, fd285; +add.f64 fd290, fd282, fd286; +mul.f64 fd291, fd289, 0d3FE0000000000000; +sub.f64 fd292, fd277, fd291; +sub.f64 fd293, fd282, fd286; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +mul.f64 fd295, fd290, 0d3FE0000000000000; +sub.f64 fd296, fd278, fd295; +sub.f64 fd297, fd281, fd285; +mul.f64 fd298, fd297, 0dBFEBB67AE8584CAA; +add.f64 %1, fd278, fd290; +add.f64 %0, fd277, fd289; +sub.f64 %3, fd296, fd298; +add.f64 %2, fd294, fd292; +add.f64 %5, fd298, fd296; +sub.f64 %4, fd292, fd294; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_2187), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..cd229484f2465 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_fwd.hpp.inc @@ -0,0 +1,3146 @@ +#ifndef CUFFTDX_FFT_21_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_21_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<753, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<111>; +.reg .b32 r<2833>; +.reg .f64 fd<85>; +.reg .b64 rd<2>; +mov.f64 fd35, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd35; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 
fd36, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd36; +} +mov.b32 r447, {rs2, rs2}; +mov.f64 fd41, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs3, fd41; +} +mov.b32 r654, {rs3, rs3}; +mov.f64 fd42, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs4, fd42; +} +mov.b32 r678, {rs4, rs4}; +mov.f64 fd53, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs5, fd53; +} +mov.b32 r636, {rs5, rs5}; +mov.f64 fd28, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs6, fd28; +} +mov.b32 r663, {rs6, rs6}; +{ +cvt.rn.f16.f64 rs7, fd53; +} +mov.b32 r537, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs8, fd28; +} +{ +neg.f16 rs9, rs8; +} +mov.b32 r561, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs11, fd35; +} +mov.b32 r645, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd36; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r669, {rs13, rs13}; +{ +add.f16x2 r1, %48, %78; +} +{ +add.f16x2 r4, %42, r1; +} +{ +add.f16x2 r7, %54, %72; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %60, %66; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %49, %79; +} +{ +add.f16x2 r22, %43, r19; +} +{ +add.f16x2 r25, %55, %73; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %61, %67; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %48, %78; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %42, r40; +} +{ +add.f16x2 r46, %54, %72; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %60, %66; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %49, %79; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %55, %73; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %61, %67; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %48, %78; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %42, r94; +} +{ +add.f16x2 r100, %54, %72; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %60, %66; +} +{ +mul.f16x2 r112, r109, r636; +} +{ 
+add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %49, %79; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %55, %73; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %61, %67; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %48, %78; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %42, r148; +} +{ +add.f16x2 r154, %54, %72; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %60, %66; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %49, %79; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %55, %73; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %61, %67; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %48, %78; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %42, r202; +} +{ +add.f16x2 r208, %54, %72; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %60, %66; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %49, %79; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %55, %73; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %61, %67; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %48, %78; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %42, r256; +} +{ +add.f16x2 r262, %54, %72; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %60, %66; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %49, %79; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %55, %73; +} +{ +mul.f16x2 
r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %61, %67; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %48, %78; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %42, r310; +} +{ +add.f16x2 r316, %54, %72; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %60, %66; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %49, %79; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %55, %73; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %61, %67; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %49, %79; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %43, r364; +} +{ +add.f16x2 r370, %55, %73; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %61, %67; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %48, %78; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %54, %72; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %60, %66; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %49, %79; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %43, r418; +} +{ +add.f16x2 r424, %55, %73; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %61, %67; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %48, %78; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %54, %72; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %60, %66; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, 
r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ +add.f16x2 r469, %49, %79; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %43, r472; +} +{ +add.f16x2 r478, %55, %73; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %61, %67; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %48, %78; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %54, %72; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %60, %66; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %49, %79; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %43, r526; +} +{ +add.f16x2 r532, %55, %73; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %61, %67; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %48, %78; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %54, %72; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %60, %66; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %49, %79; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %43, r580; +} +{ +add.f16x2 r586, %55, %73; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %61, %67; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %48, %78; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %54, %72; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %60, %66; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %49, %79; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %43, r634; 
+} +{ +add.f16x2 r640, %55, %73; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %61, %67; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %48, %78; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %54, %72; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %60, %66; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs15, fd35; +} +mov.b32 r1104, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd36; +} +mov.b32 r1131, {rs16, rs16}; +{ +cvt.rn.f16.f64 rs17, fd41; +} +mov.b32 r1338, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd42; +} +mov.b32 r1362, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd53; +} +mov.b32 r1320, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd28; +} +mov.b32 r1347, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd53; +} +mov.b32 r1221, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd28; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1245, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd35; +} +mov.b32 r1329, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd36; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1353, {rs27, rs27}; +{ +add.f16x2 r685, %50, %80; +} +{ +add.f16x2 r688, %44, r685; +} +{ +add.f16x2 r691, %56, %74; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 r697, %62, %68; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %51, %81; +} +{ +add.f16x2 r706, %45, r703; +} +{ +add.f16x2 r709, %57, %75; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %63, %69; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %50, %80; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %44, r724; +} +{ +add.f16x2 r730, %56, %74; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %62, %68; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %51, %81; +} +{ +mul.f16x2 r751, r748, r1131; +} 
+{ +sub.f16x2 r754, %57, %75; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %63, %69; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %50, %80; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %44, r778; +} +{ +add.f16x2 r784, %56, %74; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %62, %68; +} +{ +mul.f16x2 r796, r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %51, %81; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %57, %75; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %63, %69; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %50, %80; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %44, r832; +} +{ +add.f16x2 r838, %56, %74; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, r835, r841; +} +{ +add.f16x2 r847, %62, %68; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %51, %81; +} +{ +mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %57, %75; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %63, %69; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %50, %80; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %44, r886; +} +{ +add.f16x2 r892, %56, %74; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %62, %68; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %51, %81; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %57, %75; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %63, 
%69; +} +{ +mul.f16x2 r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %50, %80; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %44, r940; +} +{ +add.f16x2 r946, %56, %74; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %62, %68; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %51, %81; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ +sub.f16x2 r970, %57, %75; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %63, %69; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %50, %80; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %44, r994; +} +{ +add.f16x2 r1000, %56, %74; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %62, %68; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} +{ +sub.f16x2 r1018, %51, %81; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %57, %75; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %63, %69; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ +add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %51, %81; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %45, r1048; +} +{ +add.f16x2 r1054, %57, %75; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %63, %69; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %50, %80; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %56, %74; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %62, %68; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, 
r1090; +} +{ +add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %51, %81; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %45, r1102; +} +{ +add.f16x2 r1108, %57, %75; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %63, %69; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %50, %80; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %56, %74; +} +{ +mul.f16x2 r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %62, %68; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %51, %81; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %45, r1156; +} +{ +add.f16x2 r1162, %57, %75; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %63, %69; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %50, %80; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %56, %74; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %62, %68; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, r1201; +} +{ +add.f16x2 r1207, %51, %81; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %45, r1210; +} +{ +add.f16x2 r1216, %57, %75; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %63, %69; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %50, %80; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %56, %74; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %62, %68; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 
r1258, r1231, r1255; +} +{ +add.f16x2 r1261, %51, %81; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %45, r1264; +} +{ +add.f16x2 r1270, %57, %75; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %63, %69; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %50, %80; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %56, %74; +} +{ +mul.f16x2 r1297, r1294, r1353; +} +{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %62, %68; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %51, %81; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %45, r1318; +} +{ +add.f16x2 r1324, %57, %75; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %63, %69; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %50, %80; +} +{ +mul.f16x2 r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %56, %74; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 r1357, %62, %68; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +{ +cvt.rn.f16.f64 rs29, fd35; +} +mov.b32 r1788, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd36; +} +mov.b32 r1815, {rs30, rs30}; +{ +cvt.rn.f16.f64 rs31, fd41; +} +mov.b32 r2022, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd42; +} +mov.b32 r2046, {rs32, rs32}; +{ +cvt.rn.f16.f64 rs33, fd53; +} +mov.b32 r2004, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd28; +} +mov.b32 r2031, {rs34, rs34}; +{ +cvt.rn.f16.f64 rs35, fd53; +} +mov.b32 r1905, {rs35, rs35}; +{ +cvt.rn.f16.f64 rs36, fd28; +} +{ +neg.f16 rs37, rs36; +} +mov.b32 r1929, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs39, fd35; +} +mov.b32 r2013, {rs39, rs39}; +{ +cvt.rn.f16.f64 rs40, fd36; +} +{ +neg.f16 rs41, 
rs40; +} +mov.b32 r2037, {rs41, rs41}; +{ +add.f16x2 r1369, %52, %82; +} +{ +add.f16x2 r1372, %46, r1369; +} +{ +add.f16x2 r1375, %58, %76; +} +{ +add.f16x2 r1378, r1372, r1375; +} +{ +add.f16x2 r1381, %64, %70; +} +{ +add.f16x2 r1384, r1378, r1381; +} +{ +add.f16x2 r1387, %53, %83; +} +{ +add.f16x2 r1390, %47, r1387; +} +{ +add.f16x2 r1393, %59, %77; +} +{ +add.f16x2 r1396, r1390, r1393; +} +{ +add.f16x2 r1399, %65, %71; +} +{ +add.f16x2 r1402, r1396, r1399; +} +{ +add.f16x2 r1405, %52, %82; +} +{ +mul.f16x2 r1408, r1405, r1788; +} +{ +add.f16x2 r1411, %46, r1408; +} +{ +add.f16x2 r1414, %58, %76; +} +{ +mul.f16x2 r1417, r1414, r2022; +} +{ +add.f16x2 r1420, r1411, r1417; +} +{ +add.f16x2 r1423, %64, %70; +} +{ +mul.f16x2 r1426, r1423, r2004; +} +{ +add.f16x2 r1429, r1420, r1426; +} +{ +sub.f16x2 r1432, %53, %83; +} +{ +mul.f16x2 r1435, r1432, r1815; +} +{ +sub.f16x2 r1438, %59, %77; +} +{ +mul.f16x2 r1441, r1438, r2046; +} +{ +add.f16x2 r1444, r1435, r1441; +} +{ +sub.f16x2 r1447, %65, %71; +} +{ +mul.f16x2 r1450, r1447, r2031; +} +{ +add.f16x2 r1453, r1444, r1450; +} +{ +sub.f16x2 r1456, r1429, r1453; +} +{ +add.f16x2 r1459, %52, %82; +} +{ +mul.f16x2 r1462, r1459, r1788; +} +{ +add.f16x2 r1465, %46, r1462; +} +{ +add.f16x2 r1468, %58, %76; +} +{ +mul.f16x2 r1471, r1468, r2022; +} +{ +add.f16x2 r1474, r1465, r1471; +} +{ +add.f16x2 r1477, %64, %70; +} +{ +mul.f16x2 r1480, r1477, r2004; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +sub.f16x2 r1486, %53, %83; +} +{ +mul.f16x2 r1489, r1486, r1815; +} +{ +sub.f16x2 r1492, %59, %77; +} +{ +mul.f16x2 r1495, r1492, r2046; +} +{ +add.f16x2 r1498, r1489, r1495; +} +{ +sub.f16x2 r1501, %65, %71; +} +{ +mul.f16x2 r1504, r1501, r2031; +} +{ +add.f16x2 r1507, r1498, r1504; +} +{ +add.f16x2 r1510, r1483, r1507; +} +{ +add.f16x2 r1513, %52, %82; +} +{ +mul.f16x2 r1516, r1513, r2022; +} +{ +add.f16x2 r1519, %46, r1516; +} +{ +add.f16x2 r1522, %58, %76; +} +{ +mul.f16x2 r1525, r1522, r1905; +} +{ +add.f16x2 r1528, r1519, r1525; 
+} +{ +add.f16x2 r1531, %64, %70; +} +{ +mul.f16x2 r1534, r1531, r2013; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +sub.f16x2 r1540, %53, %83; +} +{ +mul.f16x2 r1543, r1540, r2046; +} +{ +sub.f16x2 r1546, %59, %77; +} +{ +mul.f16x2 r1549, r1546, r1929; +} +{ +add.f16x2 r1552, r1543, r1549; +} +{ +sub.f16x2 r1555, %65, %71; +} +{ +mul.f16x2 r1558, r1555, r2037; +} +{ +add.f16x2 r1561, r1552, r1558; +} +{ +sub.f16x2 r1564, r1537, r1561; +} +{ +add.f16x2 r1567, %52, %82; +} +{ +mul.f16x2 r1570, r1567, r2022; +} +{ +add.f16x2 r1573, %46, r1570; +} +{ +add.f16x2 r1576, %58, %76; +} +{ +mul.f16x2 r1579, r1576, r1905; +} +{ +add.f16x2 r1582, r1573, r1579; +} +{ +add.f16x2 r1585, %64, %70; +} +{ +mul.f16x2 r1588, r1585, r2013; +} +{ +add.f16x2 r1591, r1582, r1588; +} +{ +sub.f16x2 r1594, %53, %83; +} +{ +mul.f16x2 r1597, r1594, r2046; +} +{ +sub.f16x2 r1600, %59, %77; +} +{ +mul.f16x2 r1603, r1600, r1929; +} +{ +add.f16x2 r1606, r1597, r1603; +} +{ +sub.f16x2 r1609, %65, %71; +} +{ +mul.f16x2 r1612, r1609, r2037; +} +{ +add.f16x2 r1615, r1606, r1612; +} +{ +add.f16x2 r1618, r1591, r1615; +} +{ +add.f16x2 r1621, %52, %82; +} +{ +mul.f16x2 r1624, r1621, r2004; +} +{ +add.f16x2 r1627, %46, r1624; +} +{ +add.f16x2 r1630, %58, %76; +} +{ +mul.f16x2 r1633, r1630, r2013; +} +{ +add.f16x2 r1636, r1627, r1633; +} +{ +add.f16x2 r1639, %64, %70; +} +{ +mul.f16x2 r1642, r1639, r2022; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +sub.f16x2 r1648, %53, %83; +} +{ +mul.f16x2 r1651, r1648, r2031; +} +{ +sub.f16x2 r1654, %59, %77; +} +{ +mul.f16x2 r1657, r1654, r2037; +} +{ +add.f16x2 r1660, r1651, r1657; +} +{ +sub.f16x2 r1663, %65, %71; +} +{ +mul.f16x2 r1666, r1663, r2046; +} +{ +add.f16x2 r1669, r1660, r1666; +} +{ +sub.f16x2 r1672, r1645, r1669; +} +{ +add.f16x2 r1675, %52, %82; +} +{ +mul.f16x2 r1678, r1675, r2004; +} +{ +add.f16x2 r1681, %46, r1678; +} +{ +add.f16x2 r1684, %58, %76; +} +{ +mul.f16x2 r1687, r1684, r2013; +} +{ +add.f16x2 r1690, r1681, r1687; +} +{ +add.f16x2 r1693, 
%64, %70; +} +{ +mul.f16x2 r1696, r1693, r2022; +} +{ +add.f16x2 r1699, r1690, r1696; +} +{ +sub.f16x2 r1702, %53, %83; +} +{ +mul.f16x2 r1705, r1702, r2031; +} +{ +sub.f16x2 r1708, %59, %77; +} +{ +mul.f16x2 r1711, r1708, r2037; +} +{ +add.f16x2 r1714, r1705, r1711; +} +{ +sub.f16x2 r1717, %65, %71; +} +{ +mul.f16x2 r1720, r1717, r2046; +} +{ +add.f16x2 r1723, r1714, r1720; +} +{ +add.f16x2 r1726, r1699, r1723; +} +{ +add.f16x2 r1729, %53, %83; +} +{ +mul.f16x2 r1732, r1729, r1788; +} +{ +add.f16x2 r1735, %47, r1732; +} +{ +add.f16x2 r1738, %59, %77; +} +{ +mul.f16x2 r1741, r1738, r2022; +} +{ +add.f16x2 r1744, r1735, r1741; +} +{ +add.f16x2 r1747, %65, %71; +} +{ +mul.f16x2 r1750, r1747, r2004; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, %52, %82; +} +{ +mul.f16x2 r1759, r1756, r1815; +} +{ +sub.f16x2 r1762, %58, %76; +} +{ +mul.f16x2 r1765, r1762, r2046; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +sub.f16x2 r1771, %64, %70; +} +{ +mul.f16x2 r1774, r1771, r2031; +} +{ +add.f16x2 r1777, r1768, r1774; +} +{ +add.f16x2 r1780, r1753, r1777; +} +{ +add.f16x2 r1783, %53, %83; +} +{ +mul.f16x2 r1786, r1783, r1788; +} +{ +add.f16x2 r1789, %47, r1786; +} +{ +add.f16x2 r1792, %59, %77; +} +{ +mul.f16x2 r1795, r1792, r2022; +} +{ +add.f16x2 r1798, r1789, r1795; +} +{ +add.f16x2 r1801, %65, %71; +} +{ +mul.f16x2 r1804, r1801, r2004; +} +{ +add.f16x2 r1807, r1798, r1804; +} +{ +sub.f16x2 r1810, %52, %82; +} +{ +mul.f16x2 r1813, r1810, r1815; +} +{ +sub.f16x2 r1816, %58, %76; +} +{ +mul.f16x2 r1819, r1816, r2046; +} +{ +add.f16x2 r1822, r1813, r1819; +} +{ +sub.f16x2 r1825, %64, %70; +} +{ +mul.f16x2 r1828, r1825, r2031; +} +{ +add.f16x2 r1831, r1822, r1828; +} +{ +sub.f16x2 r1834, r1807, r1831; +} +{ +add.f16x2 r1837, %53, %83; +} +{ +mul.f16x2 r1840, r1837, r2022; +} +{ +add.f16x2 r1843, %47, r1840; +} +{ +add.f16x2 r1846, %59, %77; +} +{ +mul.f16x2 r1849, r1846, r1905; +} +{ +add.f16x2 r1852, r1843, r1849; +} +{ +add.f16x2 r1855, %65, %71; +} +{ 
+mul.f16x2 r1858, r1855, r2013; +} +{ +add.f16x2 r1861, r1852, r1858; +} +{ +sub.f16x2 r1864, %52, %82; +} +{ +mul.f16x2 r1867, r1864, r2046; +} +{ +sub.f16x2 r1870, %58, %76; +} +{ +mul.f16x2 r1873, r1870, r1929; +} +{ +add.f16x2 r1876, r1867, r1873; +} +{ +sub.f16x2 r1879, %64, %70; +} +{ +mul.f16x2 r1882, r1879, r2037; +} +{ +add.f16x2 r1885, r1876, r1882; +} +{ +add.f16x2 r1888, r1861, r1885; +} +{ +add.f16x2 r1891, %53, %83; +} +{ +mul.f16x2 r1894, r1891, r2022; +} +{ +add.f16x2 r1897, %47, r1894; +} +{ +add.f16x2 r1900, %59, %77; +} +{ +mul.f16x2 r1903, r1900, r1905; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, %65, %71; +} +{ +mul.f16x2 r1912, r1909, r2013; +} +{ +add.f16x2 r1915, r1906, r1912; +} +{ +sub.f16x2 r1918, %52, %82; +} +{ +mul.f16x2 r1921, r1918, r2046; +} +{ +sub.f16x2 r1924, %58, %76; +} +{ +mul.f16x2 r1927, r1924, r1929; +} +{ +add.f16x2 r1930, r1921, r1927; +} +{ +sub.f16x2 r1933, %64, %70; +} +{ +mul.f16x2 r1936, r1933, r2037; +} +{ +add.f16x2 r1939, r1930, r1936; +} +{ +sub.f16x2 r1942, r1915, r1939; +} +{ +add.f16x2 r1945, %53, %83; +} +{ +mul.f16x2 r1948, r1945, r2004; +} +{ +add.f16x2 r1951, %47, r1948; +} +{ +add.f16x2 r1954, %59, %77; +} +{ +mul.f16x2 r1957, r1954, r2013; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +add.f16x2 r1963, %65, %71; +} +{ +mul.f16x2 r1966, r1963, r2022; +} +{ +add.f16x2 r1969, r1960, r1966; +} +{ +sub.f16x2 r1972, %52, %82; +} +{ +mul.f16x2 r1975, r1972, r2031; +} +{ +sub.f16x2 r1978, %58, %76; +} +{ +mul.f16x2 r1981, r1978, r2037; +} +{ +add.f16x2 r1984, r1975, r1981; +} +{ +sub.f16x2 r1987, %64, %70; +} +{ +mul.f16x2 r1990, r1987, r2046; +} +{ +add.f16x2 r1993, r1984, r1990; +} +{ +add.f16x2 r1996, r1969, r1993; +} +{ +add.f16x2 r1999, %53, %83; +} +{ +mul.f16x2 r2002, r1999, r2004; +} +{ +add.f16x2 r2005, %47, r2002; +} +{ +add.f16x2 r2008, %59, %77; +} +{ +mul.f16x2 r2011, r2008, r2013; +} +{ +add.f16x2 r2014, r2005, r2011; +} +{ +add.f16x2 r2017, %65, %71; +} +{ +mul.f16x2 r2020, r2017, 
r2022; +} +{ +add.f16x2 r2023, r2014, r2020; +} +{ +sub.f16x2 r2026, %52, %82; +} +{ +mul.f16x2 r2029, r2026, r2031; +} +{ +sub.f16x2 r2032, %58, %76; +} +{ +mul.f16x2 r2035, r2032, r2037; +} +{ +add.f16x2 r2038, r2029, r2035; +} +{ +sub.f16x2 r2041, %64, %70; +} +{ +mul.f16x2 r2044, r2041, r2046; +} +{ +add.f16x2 r2047, r2038, r2044; +} +{ +sub.f16x2 r2050, r2023, r2047; +} +mov.f64 fd31, 0d3FEE940D6BB98CC5; +{ +cvt.rn.f16.f64 rs43, fd31; +} +mov.f64 fd32, 0dBFD2DD44CE9AFBA7; +{ +cvt.rn.f16.f64 rs44, fd32; +} +mov.f64 fd33, 0d3FEA708C4C4BFA74; +{ +cvt.rn.f16.f64 rs45, fd33; +} +mov.f64 fd34, 0dBFE206B7C9520CED; +{ +cvt.rn.f16.f64 rs46, fd34; +} +{ +cvt.rn.f16.f64 rs47, fd35; +} +{ +cvt.rn.f16.f64 rs48, fd36; +} +mov.f64 fd37, 0d3FD761BF51E29C90; +{ +cvt.rn.f16.f64 rs49, fd37; +} +mov.f64 fd38, 0dBFEDC9B7BE64378E; +{ +cvt.rn.f16.f64 rs50, fd38; +} +mov.f64 fd39, 0d3FB32182EBFB0FE9; +{ +cvt.rn.f16.f64 rs51, fd39; +} +mov.f64 fd40, 0dBFEFE917F00AE2CD; +{ +cvt.rn.f16.f64 rs52, fd40; +} +{ +cvt.rn.f16.f64 rs53, fd41; +} +{ +cvt.rn.f16.f64 rs54, fd42; +} +mov.f64 fd83, 0dBFE0000000000000; +mov.f64 fd84, 0dBFEBB67AE8584CAA; +mov.f64 fd45, 0dBFE7752932F8FB65; +{ +cvt.rn.f16.f64 rs57, fd45; +} +mov.f64 fd46, 0dBFE5C3F99E0B6B95; +{ +cvt.rn.f16.f64 rs58, fd46; +} +mov.f64 fd49, 0dBFEFA4808B7D3C19; +{ +cvt.rn.f16.f64 rs61, fd49; +} +mov.f64 fd50, 0dBFC313D12579650C; +{ +cvt.rn.f16.f64 rs62, fd50; +} +{ +cvt.rn.f16.f64 rs65, fd53; +} +mov.f64 fd54, 0d3FDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs66, fd54; +} +mov.b32 r2067, {rs43, rs43}; +{ +mul.f16x2 r2053, r772, r2067; +} +mov.b32 r2064, {rs44, rs44}; +{ +mul.f16x2 r2056, r1096, r2064; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r772, r2064; +} +{ +fma.rn.f16x2 r2065, r1096, r2067, r2062; +} +mov.b32 r2099, {rs45, rs45}; +{ +mul.f16x2 r2069, r1456, r2099; +} +mov.b32 r2096, {rs46, rs46}; +{ +mul.f16x2 r2072, r1780, r2096; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1456, r2096; +} +{ 
+fma.rn.f16x2 r2081, r1780, r2099, r2078; +} +{ +mul.f16x2 r2085, r880, r2099; +} +{ +mul.f16x2 r2088, r1204, r2096; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r880, r2096; +} +{ +fma.rn.f16x2 r2097, r1204, r2099, r2094; +} +mov.b32 r2163, {rs49, rs49}; +{ +mul.f16x2 r2101, r1564, r2163; +} +mov.b32 r2160, {rs50, rs50}; +{ +mul.f16x2 r2104, r1888, r2160; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1564, r2160; +} +{ +fma.rn.f16x2 r2113, r1888, r2163, r2110; +} +mov.b32 r2131, {rs47, rs47}; +{ +mul.f16x2 r2117, r988, r2131; +} +mov.b32 r2128, {rs48, rs48}; +{ +mul.f16x2 r2120, r1312, r2128; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r988, r2128; +} +{ +fma.rn.f16x2 r2129, r1312, r2131, r2126; +} +mov.b32 r2227, {rs53, rs53}; +{ +mul.f16x2 r2133, r1672, r2227; +} +mov.b32 r2224, {rs54, rs54}; +{ +mul.f16x2 r2136, r1996, r2224; +} +{ +sub.f16x2 r2139, r2133, r2136; +} +{ +mul.f16x2 r2142, r1672, r2224; +} +{ +fma.rn.f16x2 r2145, r1996, r2227, r2142; +} +{ +mul.f16x2 r2149, r1042, r2163; +} +{ +mul.f16x2 r2152, r1366, r2160; +} +{ +sub.f16x2 r2155, r2149, r2152; +} +{ +mul.f16x2 r2158, r1042, r2160; +} +{ +fma.rn.f16x2 r2161, r1366, r2163, r2158; +} +mov.b32 r2179, {rs57, rs57}; +{ +mul.f16x2 r2165, r1726, r2179; +} +mov.b32 r2176, {rs58, rs58}; +{ +mul.f16x2 r2168, r2050, r2176; +} +{ +sub.f16x2 r2171, r2165, r2168; +} +{ +mul.f16x2 r2174, r1726, r2176; +} +{ +fma.rn.f16x2 r2177, r2050, r2179, r2174; +} +mov.b32 r2195, {rs51, rs51}; +{ +mul.f16x2 r2181, r934, r2195; +} +mov.b32 r2192, {rs52, rs52}; +{ +mul.f16x2 r2184, r1258, r2192; +} +{ +sub.f16x2 r2187, r2181, r2184; +} +{ +mul.f16x2 r2190, r934, r2192; +} +{ +fma.rn.f16x2 r2193, r1258, r2195, r2190; +} +mov.b32 r2211, {rs61, rs61}; +{ +mul.f16x2 r2197, r1618, r2211; +} +mov.b32 r2208, {rs62, rs62}; +{ +mul.f16x2 r2200, r1942, r2208; +} +{ +sub.f16x2 r2203, r2197, r2200; +} +{ +mul.f16x2 r2206, r1618, r2208; +} +{ +fma.rn.f16x2 r2209, r1942, r2211, r2206; +} 
+{ +mul.f16x2 r2213, r826, r2227; +} +{ +mul.f16x2 r2216, r1150, r2224; +} +{ +sub.f16x2 r2219, r2213, r2216; +} +{ +mul.f16x2 r2222, r826, r2224; +} +{ +fma.rn.f16x2 r2225, r1150, r2227, r2222; +} +mov.b32 r2243, {rs65, rs65}; +{ +mul.f16x2 r2229, r1510, r2243; +} +mov.b32 r2240, {rs66, rs66}; +{ +mul.f16x2 r2232, r1834, r2240; +} +{ +sub.f16x2 r2235, r2229, r2232; +} +{ +mul.f16x2 r2238, r1510, r2240; +} +{ +fma.rn.f16x2 r2241, r1834, r2243, r2238; +} +{ +cvt.rn.f16.f64 rs83, fd83; +} +mov.b32 r2316, {rs83, rs83}; +{ +cvt.rn.f16.f64 rs84, fd84; +} +{ +neg.f16 rs85, rs84; +} +mov.b32 r2325, {rs85, rs85}; +{ +add.f16x2 r2245, r700, r1384; +} +{ +add.f16x2 %0, r16, r2245; +} +{ +add.f16x2 r2251, r718, r1402; +} +{ +add.f16x2 %1, r34, r2251; +} +{ +add.f16x2 r2257, r700, r1384; +} +{ +mul.f16x2 r2260, r2257, r2316; +} +{ +add.f16x2 r2263, r16, r2260; +} +{ +sub.f16x2 r2266, r718, r1402; +} +{ +mul.f16x2 r2269, r2266, r2325; +} +{ +add.f16x2 %14, r2263, r2269; +} +{ +add.f16x2 r2275, r700, r1384; +} +{ +mul.f16x2 r2278, r2275, r2316; +} +{ +add.f16x2 r2281, r16, r2278; +} +{ +sub.f16x2 r2284, r718, r1402; +} +{ +mul.f16x2 r2287, r2284, r2325; +} +{ +sub.f16x2 %28, r2281, r2287; +} +{ +add.f16x2 r2293, r718, r1402; +} +{ +mul.f16x2 r2296, r2293, r2316; +} +{ +add.f16x2 r2299, r34, r2296; +} +{ +sub.f16x2 r2302, r700, r1384; +} +{ +mul.f16x2 r2305, r2302, r2325; +} +{ +sub.f16x2 %15, r2299, r2305; +} +{ +add.f16x2 r2311, r718, r1402; +} +{ +mul.f16x2 r2314, r2311, r2316; +} +{ +add.f16x2 r2317, r34, r2314; +} +{ +sub.f16x2 r2320, r700, r1384; +} +{ +mul.f16x2 r2323, r2320, r2325; +} +{ +add.f16x2 %29, r2317, r2323; +} +{ +cvt.rn.f16.f64 rs87, fd83; +} +mov.b32 r2400, {rs87, rs87}; +{ +cvt.rn.f16.f64 rs88, fd84; +} +{ +neg.f16 rs89, rs88; +} +mov.b32 r2409, {rs89, rs89}; +{ +add.f16x2 r2329, r2059, r2075; +} +{ +add.f16x2 %2, r88, r2329; +} +{ +add.f16x2 r2335, r2065, r2081; +} +{ +add.f16x2 %3, r412, r2335; +} +{ +add.f16x2 r2341, r2059, r2075; +} +{ +mul.f16x2 r2344, 
r2341, r2400; +} +{ +add.f16x2 r2347, r88, r2344; +} +{ +sub.f16x2 r2350, r2065, r2081; +} +{ +mul.f16x2 r2353, r2350, r2409; +} +{ +add.f16x2 %16, r2347, r2353; +} +{ +add.f16x2 r2359, r2059, r2075; +} +{ +mul.f16x2 r2362, r2359, r2400; +} +{ +add.f16x2 r2365, r88, r2362; +} +{ +sub.f16x2 r2368, r2065, r2081; +} +{ +mul.f16x2 r2371, r2368, r2409; +} +{ +sub.f16x2 %30, r2365, r2371; +} +{ +add.f16x2 r2377, r2065, r2081; +} +{ +mul.f16x2 r2380, r2377, r2400; +} +{ +add.f16x2 r2383, r412, r2380; +} +{ +sub.f16x2 r2386, r2059, r2075; +} +{ +mul.f16x2 r2389, r2386, r2409; +} +{ +sub.f16x2 %17, r2383, r2389; +} +{ +add.f16x2 r2395, r2065, r2081; +} +{ +mul.f16x2 r2398, r2395, r2400; +} +{ +add.f16x2 r2401, r412, r2398; +} +{ +sub.f16x2 r2404, r2059, r2075; +} +{ +mul.f16x2 r2407, r2404, r2409; +} +{ +add.f16x2 %31, r2401, r2407; +} +{ +cvt.rn.f16.f64 rs91, fd83; +} +mov.b32 r2484, {rs91, rs91}; +{ +cvt.rn.f16.f64 rs92, fd84; +} +{ +neg.f16 rs93, rs92; +} +mov.b32 r2493, {rs93, rs93}; +{ +add.f16x2 r2413, r2091, r2107; +} +{ +add.f16x2 %4, r196, r2413; +} +{ +add.f16x2 r2419, r2097, r2113; +} +{ +add.f16x2 %5, r520, r2419; +} +{ +add.f16x2 r2425, r2091, r2107; +} +{ +mul.f16x2 r2428, r2425, r2484; +} +{ +add.f16x2 r2431, r196, r2428; +} +{ +sub.f16x2 r2434, r2097, r2113; +} +{ +mul.f16x2 r2437, r2434, r2493; +} +{ +add.f16x2 %18, r2431, r2437; +} +{ +add.f16x2 r2443, r2091, r2107; +} +{ +mul.f16x2 r2446, r2443, r2484; +} +{ +add.f16x2 r2449, r196, r2446; +} +{ +sub.f16x2 r2452, r2097, r2113; +} +{ +mul.f16x2 r2455, r2452, r2493; +} +{ +sub.f16x2 %32, r2449, r2455; +} +{ +add.f16x2 r2461, r2097, r2113; +} +{ +mul.f16x2 r2464, r2461, r2484; +} +{ +add.f16x2 r2467, r520, r2464; +} +{ +sub.f16x2 r2470, r2091, r2107; +} +{ +mul.f16x2 r2473, r2470, r2493; +} +{ +sub.f16x2 %19, r2467, r2473; +} +{ +add.f16x2 r2479, r2097, r2113; +} +{ +mul.f16x2 r2482, r2479, r2484; +} +{ +add.f16x2 r2485, r520, r2482; +} +{ +sub.f16x2 r2488, r2091, r2107; +} +{ +mul.f16x2 r2491, r2488, r2493; 
+} +{ +add.f16x2 %33, r2485, r2491; +} +{ +cvt.rn.f16.f64 rs95, fd83; +} +mov.b32 r2568, {rs95, rs95}; +{ +cvt.rn.f16.f64 rs96, fd84; +} +{ +neg.f16 rs97, rs96; +} +mov.b32 r2577, {rs97, rs97}; +{ +add.f16x2 r2497, r2123, r2139; +} +{ +add.f16x2 %6, r304, r2497; +} +{ +add.f16x2 r2503, r2129, r2145; +} +{ +add.f16x2 %7, r628, r2503; +} +{ +add.f16x2 r2509, r2123, r2139; +} +{ +mul.f16x2 r2512, r2509, r2568; +} +{ +add.f16x2 r2515, r304, r2512; +} +{ +sub.f16x2 r2518, r2129, r2145; +} +{ +mul.f16x2 r2521, r2518, r2577; +} +{ +add.f16x2 %20, r2515, r2521; +} +{ +add.f16x2 r2527, r2123, r2139; +} +{ +mul.f16x2 r2530, r2527, r2568; +} +{ +add.f16x2 r2533, r304, r2530; +} +{ +sub.f16x2 r2536, r2129, r2145; +} +{ +mul.f16x2 r2539, r2536, r2577; +} +{ +sub.f16x2 %34, r2533, r2539; +} +{ +add.f16x2 r2545, r2129, r2145; +} +{ +mul.f16x2 r2548, r2545, r2568; +} +{ +add.f16x2 r2551, r628, r2548; +} +{ +sub.f16x2 r2554, r2123, r2139; +} +{ +mul.f16x2 r2557, r2554, r2577; +} +{ +sub.f16x2 %21, r2551, r2557; +} +{ +add.f16x2 r2563, r2129, r2145; +} +{ +mul.f16x2 r2566, r2563, r2568; +} +{ +add.f16x2 r2569, r628, r2566; +} +{ +sub.f16x2 r2572, r2123, r2139; +} +{ +mul.f16x2 r2575, r2572, r2577; +} +{ +add.f16x2 %35, r2569, r2575; +} +{ +cvt.rn.f16.f64 rs99, fd83; +} +mov.b32 r2652, {rs99, rs99}; +{ +cvt.rn.f16.f64 rs100, fd84; +} +{ +neg.f16 rs101, rs100; +} +mov.b32 r2661, {rs101, rs101}; +{ +add.f16x2 r2581, r2155, r2171; +} +{ +add.f16x2 %8, r358, r2581; +} +{ +add.f16x2 r2587, r2161, r2177; +} +{ +add.f16x2 %9, r682, r2587; +} +{ +add.f16x2 r2593, r2155, r2171; +} +{ +mul.f16x2 r2596, r2593, r2652; +} +{ +add.f16x2 r2599, r358, r2596; +} +{ +sub.f16x2 r2602, r2161, r2177; +} +{ +mul.f16x2 r2605, r2602, r2661; +} +{ +add.f16x2 %22, r2599, r2605; +} +{ +add.f16x2 r2611, r2155, r2171; +} +{ +mul.f16x2 r2614, r2611, r2652; +} +{ +add.f16x2 r2617, r358, r2614; +} +{ +sub.f16x2 r2620, r2161, r2177; +} +{ +mul.f16x2 r2623, r2620, r2661; +} +{ +sub.f16x2 %36, r2617, r2623; +} +{ 
+add.f16x2 r2629, r2161, r2177; +} +{ +mul.f16x2 r2632, r2629, r2652; +} +{ +add.f16x2 r2635, r682, r2632; +} +{ +sub.f16x2 r2638, r2155, r2171; +} +{ +mul.f16x2 r2641, r2638, r2661; +} +{ +sub.f16x2 %23, r2635, r2641; +} +{ +add.f16x2 r2647, r2161, r2177; +} +{ +mul.f16x2 r2650, r2647, r2652; +} +{ +add.f16x2 r2653, r682, r2650; +} +{ +sub.f16x2 r2656, r2155, r2171; +} +{ +mul.f16x2 r2659, r2656, r2661; +} +{ +add.f16x2 %37, r2653, r2659; +} +{ +cvt.rn.f16.f64 rs103, fd83; +} +mov.b32 r2736, {rs103, rs103}; +{ +cvt.rn.f16.f64 rs104, fd84; +} +{ +neg.f16 rs105, rs104; +} +mov.b32 r2745, {rs105, rs105}; +{ +add.f16x2 r2665, r2187, r2203; +} +{ +add.f16x2 %10, r250, r2665; +} +{ +add.f16x2 r2671, r2193, r2209; +} +{ +add.f16x2 %11, r574, r2671; +} +{ +add.f16x2 r2677, r2187, r2203; +} +{ +mul.f16x2 r2680, r2677, r2736; +} +{ +add.f16x2 r2683, r250, r2680; +} +{ +sub.f16x2 r2686, r2193, r2209; +} +{ +mul.f16x2 r2689, r2686, r2745; +} +{ +add.f16x2 %24, r2683, r2689; +} +{ +add.f16x2 r2695, r2187, r2203; +} +{ +mul.f16x2 r2698, r2695, r2736; +} +{ +add.f16x2 r2701, r250, r2698; +} +{ +sub.f16x2 r2704, r2193, r2209; +} +{ +mul.f16x2 r2707, r2704, r2745; +} +{ +sub.f16x2 %38, r2701, r2707; +} +{ +add.f16x2 r2713, r2193, r2209; +} +{ +mul.f16x2 r2716, r2713, r2736; +} +{ +add.f16x2 r2719, r574, r2716; +} +{ +sub.f16x2 r2722, r2187, r2203; +} +{ +mul.f16x2 r2725, r2722, r2745; +} +{ +sub.f16x2 %25, r2719, r2725; +} +{ +add.f16x2 r2731, r2193, r2209; +} +{ +mul.f16x2 r2734, r2731, r2736; +} +{ +add.f16x2 r2737, r574, r2734; +} +{ +sub.f16x2 r2740, r2187, r2203; +} +{ +mul.f16x2 r2743, r2740, r2745; +} +{ +add.f16x2 %39, r2737, r2743; +} +{ +cvt.rn.f16.f64 rs107, fd83; +} +mov.b32 r2820, {rs107, rs107}; +{ +cvt.rn.f16.f64 rs108, fd84; +} +{ +neg.f16 rs109, rs108; +} +mov.b32 r2829, {rs109, rs109}; +{ +add.f16x2 r2749, r2219, r2235; +} +{ +add.f16x2 %12, r142, r2749; +} +{ +add.f16x2 r2755, r2225, r2241; +} +{ +add.f16x2 %13, r466, r2755; +} +{ +add.f16x2 r2761, r2219, r2235; 
+} +{ +mul.f16x2 r2764, r2761, r2820; +} +{ +add.f16x2 r2767, r142, r2764; +} +{ +sub.f16x2 r2770, r2225, r2241; +} +{ +mul.f16x2 r2773, r2770, r2829; +} +{ +add.f16x2 %26, r2767, r2773; +} +{ +add.f16x2 r2779, r2219, r2235; +} +{ +mul.f16x2 r2782, r2779, r2820; +} +{ +add.f16x2 r2785, r142, r2782; +} +{ +sub.f16x2 r2788, r2225, r2241; +} +{ +mul.f16x2 r2791, r2788, r2829; +} +{ +sub.f16x2 %40, r2785, r2791; +} +{ +add.f16x2 r2797, r2225, r2241; +} +{ +mul.f16x2 r2800, r2797, r2820; +} +{ +add.f16x2 r2803, r466, r2800; +} +{ +sub.f16x2 r2806, r2219, r2235; +} +{ +mul.f16x2 r2809, r2806, r2829; +} +{ +sub.f16x2 %27, r2803, r2809; +} +{ +add.f16x2 r2815, r2225, r2241; +} +{ +mul.f16x2 r2818, r2815, r2820; +} +{ +add.f16x2 r2821, r466, r2818; +} +{ +sub.f16x2 r2824, r2219, r2235; +} +{ +mul.f16x2 r2827, r2824, r2829; +} +{ +add.f16x2 %41, r2821, r2827; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), 
"=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..dfc8b2eb2261d --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp16_inv.hpp.inc @@ -0,0 +1,3135 @@ +#ifndef CUFFTDX_FFT_21_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_21_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<955, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<103>; +.reg .b32 r<2833>; +.reg .f64 fd<85>; +.reg .b64 rd<2>; +mov.f64 fd35, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd35; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd30, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd30; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r447, {rs3, rs3}; +mov.f64 fd41, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs5, fd41; +} +mov.b32 r654, {rs5, rs5}; +mov.f64 fd24, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs6, fd24; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r678, {rs7, rs7}; +mov.f64 fd53, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs9, fd53; +} +mov.b32 r636, {rs9, rs9}; +mov.f64 fd54, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs10, fd54; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r663, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs13, fd53; +} +mov.b32 r537, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd54; +} +mov.b32 r561, {rs14, rs14}; +{ +cvt.rn.f16.f64 rs15, fd35; +} +mov.b32 r645, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd30; +} +mov.b32 r669, {rs16, rs16}; +{ +add.f16x2 r1, %48, %78; +} +{ +add.f16x2 r4, %42, r1; +} +{ +add.f16x2 r7, %54, %72; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %60, %66; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %49, %79; +} +{ +add.f16x2 r22, %43, r19; +} +{ +add.f16x2 r25, %55, %73; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %61, %67; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %48, %78; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %42, r40; +} +{ +add.f16x2 r46, %54, %72; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %60, %66; +} +{ +mul.f16x2 
r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %49, %79; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %55, %73; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %61, %67; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %48, %78; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %42, r94; +} +{ +add.f16x2 r100, %54, %72; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %60, %66; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %49, %79; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %55, %73; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %61, %67; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %48, %78; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %42, r148; +} +{ +add.f16x2 r154, %54, %72; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %60, %66; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %49, %79; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %55, %73; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %61, %67; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %48, %78; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %42, r202; +} +{ +add.f16x2 r208, %54, %72; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %60, %66; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %49, %79; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %55, %73; +} +{ +mul.f16x2 r235, 
r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %61, %67; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %48, %78; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %42, r256; +} +{ +add.f16x2 r262, %54, %72; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %60, %66; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %49, %79; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %55, %73; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %61, %67; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %48, %78; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %42, r310; +} +{ +add.f16x2 r316, %54, %72; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %60, %66; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %49, %79; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %55, %73; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %61, %67; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %49, %79; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %43, r364; +} +{ +add.f16x2 r370, %55, %73; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %61, %67; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %48, %78; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %54, %72; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %60, %66; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, 
r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %49, %79; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %43, r418; +} +{ +add.f16x2 r424, %55, %73; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %61, %67; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %48, %78; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %54, %72; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %60, %66; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ +add.f16x2 r469, %49, %79; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %43, r472; +} +{ +add.f16x2 r478, %55, %73; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %61, %67; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %48, %78; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %54, %72; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %60, %66; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %49, %79; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %43, r526; +} +{ +add.f16x2 r532, %55, %73; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %61, %67; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %48, %78; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %54, %72; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %60, %66; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %49, %79; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %43, r580; +} +{ 
+add.f16x2 r586, %55, %73; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %61, %67; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %48, %78; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %54, %72; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %60, %66; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %49, %79; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %43, r634; +} +{ +add.f16x2 r640, %55, %73; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %61, %67; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %48, %78; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %54, %72; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %60, %66; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs17, fd35; +} +mov.b32 r1104, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd30; +} +{ +neg.f16 rs19, rs18; +} +mov.b32 r1131, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs21, fd41; +} +mov.b32 r1338, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd24; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1362, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd53; +} +mov.b32 r1320, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd54; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1347, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd53; +} +mov.b32 r1221, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd54; +} +mov.b32 r1245, {rs30, rs30}; +{ +cvt.rn.f16.f64 rs31, fd35; +} +mov.b32 r1329, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd30; +} +mov.b32 r1353, {rs32, rs32}; +{ +add.f16x2 r685, %50, %80; +} +{ +add.f16x2 r688, %44, r685; +} +{ +add.f16x2 r691, %56, %74; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 
r697, %62, %68; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %51, %81; +} +{ +add.f16x2 r706, %45, r703; +} +{ +add.f16x2 r709, %57, %75; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %63, %69; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %50, %80; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %44, r724; +} +{ +add.f16x2 r730, %56, %74; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %62, %68; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %51, %81; +} +{ +mul.f16x2 r751, r748, r1131; +} +{ +sub.f16x2 r754, %57, %75; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %63, %69; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %50, %80; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %44, r778; +} +{ +add.f16x2 r784, %56, %74; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %62, %68; +} +{ +mul.f16x2 r796, r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %51, %81; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %57, %75; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %63, %69; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %50, %80; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %44, r832; +} +{ +add.f16x2 r838, %56, %74; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, r835, r841; +} +{ +add.f16x2 r847, %62, %68; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %51, %81; +} +{ +mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %57, %75; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ 
+sub.f16x2 r871, %63, %69; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %50, %80; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %44, r886; +} +{ +add.f16x2 r892, %56, %74; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %62, %68; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %51, %81; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %57, %75; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %63, %69; +} +{ +mul.f16x2 r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %50, %80; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %44, r940; +} +{ +add.f16x2 r946, %56, %74; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %62, %68; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %51, %81; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ +sub.f16x2 r970, %57, %75; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %63, %69; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %50, %80; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %44, r994; +} +{ +add.f16x2 r1000, %56, %74; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %62, %68; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} +{ +sub.f16x2 r1018, %51, %81; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %57, %75; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %63, %69; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ 
+add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %51, %81; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %45, r1048; +} +{ +add.f16x2 r1054, %57, %75; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %63, %69; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %50, %80; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %56, %74; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %62, %68; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, r1090; +} +{ +add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %51, %81; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %45, r1102; +} +{ +add.f16x2 r1108, %57, %75; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %63, %69; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %50, %80; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %56, %74; +} +{ +mul.f16x2 r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %62, %68; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %51, %81; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %45, r1156; +} +{ +add.f16x2 r1162, %57, %75; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %63, %69; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %50, %80; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %56, %74; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %62, %68; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, 
r1201; +} +{ +add.f16x2 r1207, %51, %81; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %45, r1210; +} +{ +add.f16x2 r1216, %57, %75; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %63, %69; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %50, %80; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %56, %74; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %62, %68; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 r1258, r1231, r1255; +} +{ +add.f16x2 r1261, %51, %81; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %45, r1264; +} +{ +add.f16x2 r1270, %57, %75; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %63, %69; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %50, %80; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %56, %74; +} +{ +mul.f16x2 r1297, r1294, r1353; +} +{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %62, %68; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %51, %81; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %45, r1318; +} +{ +add.f16x2 r1324, %57, %75; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %63, %69; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %50, %80; +} +{ +mul.f16x2 r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %56, %74; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 r1357, %62, %68; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +{ 
+cvt.rn.f16.f64 rs33, fd35; +} +mov.b32 r1788, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd30; +} +{ +neg.f16 rs35, rs34; +} +mov.b32 r1815, {rs35, rs35}; +{ +cvt.rn.f16.f64 rs37, fd41; +} +mov.b32 r2022, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd24; +} +{ +neg.f16 rs39, rs38; +} +mov.b32 r2046, {rs39, rs39}; +{ +cvt.rn.f16.f64 rs41, fd53; +} +mov.b32 r2004, {rs41, rs41}; +{ +cvt.rn.f16.f64 rs42, fd54; +} +{ +neg.f16 rs43, rs42; +} +mov.b32 r2031, {rs43, rs43}; +{ +cvt.rn.f16.f64 rs45, fd53; +} +mov.b32 r1905, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs46, fd54; +} +mov.b32 r1929, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd35; +} +mov.b32 r2013, {rs47, rs47}; +{ +cvt.rn.f16.f64 rs48, fd30; +} +mov.b32 r2037, {rs48, rs48}; +{ +add.f16x2 r1369, %52, %82; +} +{ +add.f16x2 r1372, %46, r1369; +} +{ +add.f16x2 r1375, %58, %76; +} +{ +add.f16x2 r1378, r1372, r1375; +} +{ +add.f16x2 r1381, %64, %70; +} +{ +add.f16x2 r1384, r1378, r1381; +} +{ +add.f16x2 r1387, %53, %83; +} +{ +add.f16x2 r1390, %47, r1387; +} +{ +add.f16x2 r1393, %59, %77; +} +{ +add.f16x2 r1396, r1390, r1393; +} +{ +add.f16x2 r1399, %65, %71; +} +{ +add.f16x2 r1402, r1396, r1399; +} +{ +add.f16x2 r1405, %52, %82; +} +{ +mul.f16x2 r1408, r1405, r1788; +} +{ +add.f16x2 r1411, %46, r1408; +} +{ +add.f16x2 r1414, %58, %76; +} +{ +mul.f16x2 r1417, r1414, r2022; +} +{ +add.f16x2 r1420, r1411, r1417; +} +{ +add.f16x2 r1423, %64, %70; +} +{ +mul.f16x2 r1426, r1423, r2004; +} +{ +add.f16x2 r1429, r1420, r1426; +} +{ +sub.f16x2 r1432, %53, %83; +} +{ +mul.f16x2 r1435, r1432, r1815; +} +{ +sub.f16x2 r1438, %59, %77; +} +{ +mul.f16x2 r1441, r1438, r2046; +} +{ +add.f16x2 r1444, r1435, r1441; +} +{ +sub.f16x2 r1447, %65, %71; +} +{ +mul.f16x2 r1450, r1447, r2031; +} +{ +add.f16x2 r1453, r1444, r1450; +} +{ +sub.f16x2 r1456, r1429, r1453; +} +{ +add.f16x2 r1459, %52, %82; +} +{ +mul.f16x2 r1462, r1459, r1788; +} +{ +add.f16x2 r1465, %46, r1462; +} +{ +add.f16x2 r1468, %58, %76; +} +{ +mul.f16x2 r1471, r1468, r2022; +} +{ +add.f16x2 
r1474, r1465, r1471; +} +{ +add.f16x2 r1477, %64, %70; +} +{ +mul.f16x2 r1480, r1477, r2004; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +sub.f16x2 r1486, %53, %83; +} +{ +mul.f16x2 r1489, r1486, r1815; +} +{ +sub.f16x2 r1492, %59, %77; +} +{ +mul.f16x2 r1495, r1492, r2046; +} +{ +add.f16x2 r1498, r1489, r1495; +} +{ +sub.f16x2 r1501, %65, %71; +} +{ +mul.f16x2 r1504, r1501, r2031; +} +{ +add.f16x2 r1507, r1498, r1504; +} +{ +add.f16x2 r1510, r1483, r1507; +} +{ +add.f16x2 r1513, %52, %82; +} +{ +mul.f16x2 r1516, r1513, r2022; +} +{ +add.f16x2 r1519, %46, r1516; +} +{ +add.f16x2 r1522, %58, %76; +} +{ +mul.f16x2 r1525, r1522, r1905; +} +{ +add.f16x2 r1528, r1519, r1525; +} +{ +add.f16x2 r1531, %64, %70; +} +{ +mul.f16x2 r1534, r1531, r2013; +} +{ +add.f16x2 r1537, r1528, r1534; +} +{ +sub.f16x2 r1540, %53, %83; +} +{ +mul.f16x2 r1543, r1540, r2046; +} +{ +sub.f16x2 r1546, %59, %77; +} +{ +mul.f16x2 r1549, r1546, r1929; +} +{ +add.f16x2 r1552, r1543, r1549; +} +{ +sub.f16x2 r1555, %65, %71; +} +{ +mul.f16x2 r1558, r1555, r2037; +} +{ +add.f16x2 r1561, r1552, r1558; +} +{ +sub.f16x2 r1564, r1537, r1561; +} +{ +add.f16x2 r1567, %52, %82; +} +{ +mul.f16x2 r1570, r1567, r2022; +} +{ +add.f16x2 r1573, %46, r1570; +} +{ +add.f16x2 r1576, %58, %76; +} +{ +mul.f16x2 r1579, r1576, r1905; +} +{ +add.f16x2 r1582, r1573, r1579; +} +{ +add.f16x2 r1585, %64, %70; +} +{ +mul.f16x2 r1588, r1585, r2013; +} +{ +add.f16x2 r1591, r1582, r1588; +} +{ +sub.f16x2 r1594, %53, %83; +} +{ +mul.f16x2 r1597, r1594, r2046; +} +{ +sub.f16x2 r1600, %59, %77; +} +{ +mul.f16x2 r1603, r1600, r1929; +} +{ +add.f16x2 r1606, r1597, r1603; +} +{ +sub.f16x2 r1609, %65, %71; +} +{ +mul.f16x2 r1612, r1609, r2037; +} +{ +add.f16x2 r1615, r1606, r1612; +} +{ +add.f16x2 r1618, r1591, r1615; +} +{ +add.f16x2 r1621, %52, %82; +} +{ +mul.f16x2 r1624, r1621, r2004; +} +{ +add.f16x2 r1627, %46, r1624; +} +{ +add.f16x2 r1630, %58, %76; +} +{ +mul.f16x2 r1633, r1630, r2013; +} +{ +add.f16x2 r1636, r1627, r1633; +} 
+{ +add.f16x2 r1639, %64, %70; +} +{ +mul.f16x2 r1642, r1639, r2022; +} +{ +add.f16x2 r1645, r1636, r1642; +} +{ +sub.f16x2 r1648, %53, %83; +} +{ +mul.f16x2 r1651, r1648, r2031; +} +{ +sub.f16x2 r1654, %59, %77; +} +{ +mul.f16x2 r1657, r1654, r2037; +} +{ +add.f16x2 r1660, r1651, r1657; +} +{ +sub.f16x2 r1663, %65, %71; +} +{ +mul.f16x2 r1666, r1663, r2046; +} +{ +add.f16x2 r1669, r1660, r1666; +} +{ +sub.f16x2 r1672, r1645, r1669; +} +{ +add.f16x2 r1675, %52, %82; +} +{ +mul.f16x2 r1678, r1675, r2004; +} +{ +add.f16x2 r1681, %46, r1678; +} +{ +add.f16x2 r1684, %58, %76; +} +{ +mul.f16x2 r1687, r1684, r2013; +} +{ +add.f16x2 r1690, r1681, r1687; +} +{ +add.f16x2 r1693, %64, %70; +} +{ +mul.f16x2 r1696, r1693, r2022; +} +{ +add.f16x2 r1699, r1690, r1696; +} +{ +sub.f16x2 r1702, %53, %83; +} +{ +mul.f16x2 r1705, r1702, r2031; +} +{ +sub.f16x2 r1708, %59, %77; +} +{ +mul.f16x2 r1711, r1708, r2037; +} +{ +add.f16x2 r1714, r1705, r1711; +} +{ +sub.f16x2 r1717, %65, %71; +} +{ +mul.f16x2 r1720, r1717, r2046; +} +{ +add.f16x2 r1723, r1714, r1720; +} +{ +add.f16x2 r1726, r1699, r1723; +} +{ +add.f16x2 r1729, %53, %83; +} +{ +mul.f16x2 r1732, r1729, r1788; +} +{ +add.f16x2 r1735, %47, r1732; +} +{ +add.f16x2 r1738, %59, %77; +} +{ +mul.f16x2 r1741, r1738, r2022; +} +{ +add.f16x2 r1744, r1735, r1741; +} +{ +add.f16x2 r1747, %65, %71; +} +{ +mul.f16x2 r1750, r1747, r2004; +} +{ +add.f16x2 r1753, r1744, r1750; +} +{ +sub.f16x2 r1756, %52, %82; +} +{ +mul.f16x2 r1759, r1756, r1815; +} +{ +sub.f16x2 r1762, %58, %76; +} +{ +mul.f16x2 r1765, r1762, r2046; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +sub.f16x2 r1771, %64, %70; +} +{ +mul.f16x2 r1774, r1771, r2031; +} +{ +add.f16x2 r1777, r1768, r1774; +} +{ +add.f16x2 r1780, r1753, r1777; +} +{ +add.f16x2 r1783, %53, %83; +} +{ +mul.f16x2 r1786, r1783, r1788; +} +{ +add.f16x2 r1789, %47, r1786; +} +{ +add.f16x2 r1792, %59, %77; +} +{ +mul.f16x2 r1795, r1792, r2022; +} +{ +add.f16x2 r1798, r1789, r1795; +} +{ +add.f16x2 r1801, %65, 
%71; +} +{ +mul.f16x2 r1804, r1801, r2004; +} +{ +add.f16x2 r1807, r1798, r1804; +} +{ +sub.f16x2 r1810, %52, %82; +} +{ +mul.f16x2 r1813, r1810, r1815; +} +{ +sub.f16x2 r1816, %58, %76; +} +{ +mul.f16x2 r1819, r1816, r2046; +} +{ +add.f16x2 r1822, r1813, r1819; +} +{ +sub.f16x2 r1825, %64, %70; +} +{ +mul.f16x2 r1828, r1825, r2031; +} +{ +add.f16x2 r1831, r1822, r1828; +} +{ +sub.f16x2 r1834, r1807, r1831; +} +{ +add.f16x2 r1837, %53, %83; +} +{ +mul.f16x2 r1840, r1837, r2022; +} +{ +add.f16x2 r1843, %47, r1840; +} +{ +add.f16x2 r1846, %59, %77; +} +{ +mul.f16x2 r1849, r1846, r1905; +} +{ +add.f16x2 r1852, r1843, r1849; +} +{ +add.f16x2 r1855, %65, %71; +} +{ +mul.f16x2 r1858, r1855, r2013; +} +{ +add.f16x2 r1861, r1852, r1858; +} +{ +sub.f16x2 r1864, %52, %82; +} +{ +mul.f16x2 r1867, r1864, r2046; +} +{ +sub.f16x2 r1870, %58, %76; +} +{ +mul.f16x2 r1873, r1870, r1929; +} +{ +add.f16x2 r1876, r1867, r1873; +} +{ +sub.f16x2 r1879, %64, %70; +} +{ +mul.f16x2 r1882, r1879, r2037; +} +{ +add.f16x2 r1885, r1876, r1882; +} +{ +add.f16x2 r1888, r1861, r1885; +} +{ +add.f16x2 r1891, %53, %83; +} +{ +mul.f16x2 r1894, r1891, r2022; +} +{ +add.f16x2 r1897, %47, r1894; +} +{ +add.f16x2 r1900, %59, %77; +} +{ +mul.f16x2 r1903, r1900, r1905; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, %65, %71; +} +{ +mul.f16x2 r1912, r1909, r2013; +} +{ +add.f16x2 r1915, r1906, r1912; +} +{ +sub.f16x2 r1918, %52, %82; +} +{ +mul.f16x2 r1921, r1918, r2046; +} +{ +sub.f16x2 r1924, %58, %76; +} +{ +mul.f16x2 r1927, r1924, r1929; +} +{ +add.f16x2 r1930, r1921, r1927; +} +{ +sub.f16x2 r1933, %64, %70; +} +{ +mul.f16x2 r1936, r1933, r2037; +} +{ +add.f16x2 r1939, r1930, r1936; +} +{ +sub.f16x2 r1942, r1915, r1939; +} +{ +add.f16x2 r1945, %53, %83; +} +{ +mul.f16x2 r1948, r1945, r2004; +} +{ +add.f16x2 r1951, %47, r1948; +} +{ +add.f16x2 r1954, %59, %77; +} +{ +mul.f16x2 r1957, r1954, r2013; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +add.f16x2 r1963, %65, %71; +} +{ +mul.f16x2 
r1966, r1963, r2022; +} +{ +add.f16x2 r1969, r1960, r1966; +} +{ +sub.f16x2 r1972, %52, %82; +} +{ +mul.f16x2 r1975, r1972, r2031; +} +{ +sub.f16x2 r1978, %58, %76; +} +{ +mul.f16x2 r1981, r1978, r2037; +} +{ +add.f16x2 r1984, r1975, r1981; +} +{ +sub.f16x2 r1987, %64, %70; +} +{ +mul.f16x2 r1990, r1987, r2046; +} +{ +add.f16x2 r1993, r1984, r1990; +} +{ +add.f16x2 r1996, r1969, r1993; +} +{ +add.f16x2 r1999, %53, %83; +} +{ +mul.f16x2 r2002, r1999, r2004; +} +{ +add.f16x2 r2005, %47, r2002; +} +{ +add.f16x2 r2008, %59, %77; +} +{ +mul.f16x2 r2011, r2008, r2013; +} +{ +add.f16x2 r2014, r2005, r2011; +} +{ +add.f16x2 r2017, %65, %71; +} +{ +mul.f16x2 r2020, r2017, r2022; +} +{ +add.f16x2 r2023, r2014, r2020; +} +{ +sub.f16x2 r2026, %52, %82; +} +{ +mul.f16x2 r2029, r2026, r2031; +} +{ +sub.f16x2 r2032, %58, %76; +} +{ +mul.f16x2 r2035, r2032, r2037; +} +{ +add.f16x2 r2038, r2029, r2035; +} +{ +sub.f16x2 r2041, %64, %70; +} +{ +mul.f16x2 r2044, r2041, r2046; +} +{ +add.f16x2 r2047, r2038, r2044; +} +{ +sub.f16x2 r2050, r2023, r2047; +} +mov.f64 fd31, 0d3FEE940D6BB98CC5; +{ +cvt.rn.f16.f64 rs49, fd31; +} +mov.f64 fd32, 0d3FD2DD44CE9AFBA7; +{ +cvt.rn.f16.f64 rs50, fd32; +} +mov.f64 fd33, 0d3FEA708C4C4BFA74; +{ +cvt.rn.f16.f64 rs51, fd33; +} +mov.f64 fd34, 0d3FE206B7C9520CED; +{ +cvt.rn.f16.f64 rs52, fd34; +} +{ +cvt.rn.f16.f64 rs53, fd35; +} +mov.f64 fd36, 0d3FE904C37505DE4B; +{ +cvt.rn.f16.f64 rs54, fd36; +} +mov.f64 fd37, 0d3FD761BF51E29C90; +{ +cvt.rn.f16.f64 rs55, fd37; +} +mov.f64 fd38, 0d3FEDC9B7BE64378E; +{ +cvt.rn.f16.f64 rs56, fd38; +} +mov.f64 fd39, 0d3FB32182EBFB0FE9; +{ +cvt.rn.f16.f64 rs57, fd39; +} +mov.f64 fd40, 0d3FEFE917F00AE2CD; +{ +cvt.rn.f16.f64 rs58, fd40; +} +{ +cvt.rn.f16.f64 rs59, fd41; +} +mov.f64 fd42, 0d3FEF329C0558E969; +{ +cvt.rn.f16.f64 rs60, fd42; +} +mov.f64 fd83, 0dBFE0000000000000; +mov.f64 fd45, 0dBFE7752932F8FB65; +{ +cvt.rn.f16.f64 rs63, fd45; +} +mov.f64 fd46, 0d3FE5C3F99E0B6B95; +{ +cvt.rn.f16.f64 rs64, fd46; +} +mov.f64 fd49, 
0dBFEFA4808B7D3C19; +{ +cvt.rn.f16.f64 rs67, fd49; +} +mov.f64 fd50, 0d3FC313D12579650C; +{ +cvt.rn.f16.f64 rs68, fd50; +} +{ +cvt.rn.f16.f64 rs71, fd53; +} +{ +cvt.rn.f16.f64 rs72, fd54; +} +mov.f64 fd84, 0dBFEBB67AE8584CAA; +mov.b32 r2067, {rs49, rs49}; +{ +mul.f16x2 r2053, r772, r2067; +} +mov.b32 r2064, {rs50, rs50}; +{ +mul.f16x2 r2056, r1096, r2064; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r772, r2064; +} +{ +fma.rn.f16x2 r2065, r1096, r2067, r2062; +} +mov.b32 r2099, {rs51, rs51}; +{ +mul.f16x2 r2069, r1456, r2099; +} +mov.b32 r2096, {rs52, rs52}; +{ +mul.f16x2 r2072, r1780, r2096; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1456, r2096; +} +{ +fma.rn.f16x2 r2081, r1780, r2099, r2078; +} +{ +mul.f16x2 r2085, r880, r2099; +} +{ +mul.f16x2 r2088, r1204, r2096; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r880, r2096; +} +{ +fma.rn.f16x2 r2097, r1204, r2099, r2094; +} +mov.b32 r2163, {rs55, rs55}; +{ +mul.f16x2 r2101, r1564, r2163; +} +mov.b32 r2160, {rs56, rs56}; +{ +mul.f16x2 r2104, r1888, r2160; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1564, r2160; +} +{ +fma.rn.f16x2 r2113, r1888, r2163, r2110; +} +mov.b32 r2131, {rs53, rs53}; +{ +mul.f16x2 r2117, r988, r2131; +} +mov.b32 r2128, {rs54, rs54}; +{ +mul.f16x2 r2120, r1312, r2128; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r988, r2128; +} +{ +fma.rn.f16x2 r2129, r1312, r2131, r2126; +} +mov.b32 r2227, {rs59, rs59}; +{ +mul.f16x2 r2133, r1672, r2227; +} +mov.b32 r2224, {rs60, rs60}; +{ +mul.f16x2 r2136, r1996, r2224; +} +{ +sub.f16x2 r2139, r2133, r2136; +} +{ +mul.f16x2 r2142, r1672, r2224; +} +{ +fma.rn.f16x2 r2145, r1996, r2227, r2142; +} +{ +mul.f16x2 r2149, r1042, r2163; +} +{ +mul.f16x2 r2152, r1366, r2160; +} +{ +sub.f16x2 r2155, r2149, r2152; +} +{ +mul.f16x2 r2158, r1042, r2160; +} +{ +fma.rn.f16x2 r2161, r1366, r2163, r2158; +} +mov.b32 r2179, {rs63, rs63}; +{ +mul.f16x2 r2165, r1726, r2179; +} +mov.b32 
r2176, {rs64, rs64}; +{ +mul.f16x2 r2168, r2050, r2176; +} +{ +sub.f16x2 r2171, r2165, r2168; +} +{ +mul.f16x2 r2174, r1726, r2176; +} +{ +fma.rn.f16x2 r2177, r2050, r2179, r2174; +} +mov.b32 r2195, {rs57, rs57}; +{ +mul.f16x2 r2181, r934, r2195; +} +mov.b32 r2192, {rs58, rs58}; +{ +mul.f16x2 r2184, r1258, r2192; +} +{ +sub.f16x2 r2187, r2181, r2184; +} +{ +mul.f16x2 r2190, r934, r2192; +} +{ +fma.rn.f16x2 r2193, r1258, r2195, r2190; +} +mov.b32 r2211, {rs67, rs67}; +{ +mul.f16x2 r2197, r1618, r2211; +} +mov.b32 r2208, {rs68, rs68}; +{ +mul.f16x2 r2200, r1942, r2208; +} +{ +sub.f16x2 r2203, r2197, r2200; +} +{ +mul.f16x2 r2206, r1618, r2208; +} +{ +fma.rn.f16x2 r2209, r1942, r2211, r2206; +} +{ +mul.f16x2 r2213, r826, r2227; +} +{ +mul.f16x2 r2216, r1150, r2224; +} +{ +sub.f16x2 r2219, r2213, r2216; +} +{ +mul.f16x2 r2222, r826, r2224; +} +{ +fma.rn.f16x2 r2225, r1150, r2227, r2222; +} +mov.b32 r2243, {rs71, rs71}; +{ +mul.f16x2 r2229, r1510, r2243; +} +mov.b32 r2240, {rs72, rs72}; +{ +mul.f16x2 r2232, r1834, r2240; +} +{ +sub.f16x2 r2235, r2229, r2232; +} +{ +mul.f16x2 r2238, r1510, r2240; +} +{ +fma.rn.f16x2 r2241, r1834, r2243, r2238; +} +{ +cvt.rn.f16.f64 rs89, fd83; +} +mov.b32 r2316, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd84; +} +mov.b32 r2325, {rs90, rs90}; +{ +add.f16x2 r2245, r700, r1384; +} +{ +add.f16x2 %0, r16, r2245; +} +{ +add.f16x2 r2251, r718, r1402; +} +{ +add.f16x2 %1, r34, r2251; +} +{ +add.f16x2 r2257, r700, r1384; +} +{ +mul.f16x2 r2260, r2257, r2316; +} +{ +add.f16x2 r2263, r16, r2260; +} +{ +sub.f16x2 r2266, r718, r1402; +} +{ +mul.f16x2 r2269, r2266, r2325; +} +{ +add.f16x2 %14, r2263, r2269; +} +{ +add.f16x2 r2275, r700, r1384; +} +{ +mul.f16x2 r2278, r2275, r2316; +} +{ +add.f16x2 r2281, r16, r2278; +} +{ +sub.f16x2 r2284, r718, r1402; +} +{ +mul.f16x2 r2287, r2284, r2325; +} +{ +sub.f16x2 %28, r2281, r2287; +} +{ +add.f16x2 r2293, r718, r1402; +} +{ +mul.f16x2 r2296, r2293, r2316; +} +{ +add.f16x2 r2299, r34, r2296; +} +{ +sub.f16x2 
r2302, r700, r1384; +} +{ +mul.f16x2 r2305, r2302, r2325; +} +{ +sub.f16x2 %15, r2299, r2305; +} +{ +add.f16x2 r2311, r718, r1402; +} +{ +mul.f16x2 r2314, r2311, r2316; +} +{ +add.f16x2 r2317, r34, r2314; +} +{ +sub.f16x2 r2320, r700, r1384; +} +{ +mul.f16x2 r2323, r2320, r2325; +} +{ +add.f16x2 %29, r2317, r2323; +} +{ +cvt.rn.f16.f64 rs91, fd83; +} +mov.b32 r2400, {rs91, rs91}; +{ +cvt.rn.f16.f64 rs92, fd84; +} +mov.b32 r2409, {rs92, rs92}; +{ +add.f16x2 r2329, r2059, r2075; +} +{ +add.f16x2 %2, r88, r2329; +} +{ +add.f16x2 r2335, r2065, r2081; +} +{ +add.f16x2 %3, r412, r2335; +} +{ +add.f16x2 r2341, r2059, r2075; +} +{ +mul.f16x2 r2344, r2341, r2400; +} +{ +add.f16x2 r2347, r88, r2344; +} +{ +sub.f16x2 r2350, r2065, r2081; +} +{ +mul.f16x2 r2353, r2350, r2409; +} +{ +add.f16x2 %16, r2347, r2353; +} +{ +add.f16x2 r2359, r2059, r2075; +} +{ +mul.f16x2 r2362, r2359, r2400; +} +{ +add.f16x2 r2365, r88, r2362; +} +{ +sub.f16x2 r2368, r2065, r2081; +} +{ +mul.f16x2 r2371, r2368, r2409; +} +{ +sub.f16x2 %30, r2365, r2371; +} +{ +add.f16x2 r2377, r2065, r2081; +} +{ +mul.f16x2 r2380, r2377, r2400; +} +{ +add.f16x2 r2383, r412, r2380; +} +{ +sub.f16x2 r2386, r2059, r2075; +} +{ +mul.f16x2 r2389, r2386, r2409; +} +{ +sub.f16x2 %17, r2383, r2389; +} +{ +add.f16x2 r2395, r2065, r2081; +} +{ +mul.f16x2 r2398, r2395, r2400; +} +{ +add.f16x2 r2401, r412, r2398; +} +{ +sub.f16x2 r2404, r2059, r2075; +} +{ +mul.f16x2 r2407, r2404, r2409; +} +{ +add.f16x2 %31, r2401, r2407; +} +{ +cvt.rn.f16.f64 rs93, fd83; +} +mov.b32 r2484, {rs93, rs93}; +{ +cvt.rn.f16.f64 rs94, fd84; +} +mov.b32 r2493, {rs94, rs94}; +{ +add.f16x2 r2413, r2091, r2107; +} +{ +add.f16x2 %4, r196, r2413; +} +{ +add.f16x2 r2419, r2097, r2113; +} +{ +add.f16x2 %5, r520, r2419; +} +{ +add.f16x2 r2425, r2091, r2107; +} +{ +mul.f16x2 r2428, r2425, r2484; +} +{ +add.f16x2 r2431, r196, r2428; +} +{ +sub.f16x2 r2434, r2097, r2113; +} +{ +mul.f16x2 r2437, r2434, r2493; +} +{ +add.f16x2 %18, r2431, r2437; +} +{ +add.f16x2 
r2443, r2091, r2107; +} +{ +mul.f16x2 r2446, r2443, r2484; +} +{ +add.f16x2 r2449, r196, r2446; +} +{ +sub.f16x2 r2452, r2097, r2113; +} +{ +mul.f16x2 r2455, r2452, r2493; +} +{ +sub.f16x2 %32, r2449, r2455; +} +{ +add.f16x2 r2461, r2097, r2113; +} +{ +mul.f16x2 r2464, r2461, r2484; +} +{ +add.f16x2 r2467, r520, r2464; +} +{ +sub.f16x2 r2470, r2091, r2107; +} +{ +mul.f16x2 r2473, r2470, r2493; +} +{ +sub.f16x2 %19, r2467, r2473; +} +{ +add.f16x2 r2479, r2097, r2113; +} +{ +mul.f16x2 r2482, r2479, r2484; +} +{ +add.f16x2 r2485, r520, r2482; +} +{ +sub.f16x2 r2488, r2091, r2107; +} +{ +mul.f16x2 r2491, r2488, r2493; +} +{ +add.f16x2 %33, r2485, r2491; +} +{ +cvt.rn.f16.f64 rs95, fd83; +} +mov.b32 r2568, {rs95, rs95}; +{ +cvt.rn.f16.f64 rs96, fd84; +} +mov.b32 r2577, {rs96, rs96}; +{ +add.f16x2 r2497, r2123, r2139; +} +{ +add.f16x2 %6, r304, r2497; +} +{ +add.f16x2 r2503, r2129, r2145; +} +{ +add.f16x2 %7, r628, r2503; +} +{ +add.f16x2 r2509, r2123, r2139; +} +{ +mul.f16x2 r2512, r2509, r2568; +} +{ +add.f16x2 r2515, r304, r2512; +} +{ +sub.f16x2 r2518, r2129, r2145; +} +{ +mul.f16x2 r2521, r2518, r2577; +} +{ +add.f16x2 %20, r2515, r2521; +} +{ +add.f16x2 r2527, r2123, r2139; +} +{ +mul.f16x2 r2530, r2527, r2568; +} +{ +add.f16x2 r2533, r304, r2530; +} +{ +sub.f16x2 r2536, r2129, r2145; +} +{ +mul.f16x2 r2539, r2536, r2577; +} +{ +sub.f16x2 %34, r2533, r2539; +} +{ +add.f16x2 r2545, r2129, r2145; +} +{ +mul.f16x2 r2548, r2545, r2568; +} +{ +add.f16x2 r2551, r628, r2548; +} +{ +sub.f16x2 r2554, r2123, r2139; +} +{ +mul.f16x2 r2557, r2554, r2577; +} +{ +sub.f16x2 %21, r2551, r2557; +} +{ +add.f16x2 r2563, r2129, r2145; +} +{ +mul.f16x2 r2566, r2563, r2568; +} +{ +add.f16x2 r2569, r628, r2566; +} +{ +sub.f16x2 r2572, r2123, r2139; +} +{ +mul.f16x2 r2575, r2572, r2577; +} +{ +add.f16x2 %35, r2569, r2575; +} +{ +cvt.rn.f16.f64 rs97, fd83; +} +mov.b32 r2652, {rs97, rs97}; +{ +cvt.rn.f16.f64 rs98, fd84; +} +mov.b32 r2661, {rs98, rs98}; +{ +add.f16x2 r2581, r2155, r2171; +} 
+{ +add.f16x2 %8, r358, r2581; +} +{ +add.f16x2 r2587, r2161, r2177; +} +{ +add.f16x2 %9, r682, r2587; +} +{ +add.f16x2 r2593, r2155, r2171; +} +{ +mul.f16x2 r2596, r2593, r2652; +} +{ +add.f16x2 r2599, r358, r2596; +} +{ +sub.f16x2 r2602, r2161, r2177; +} +{ +mul.f16x2 r2605, r2602, r2661; +} +{ +add.f16x2 %22, r2599, r2605; +} +{ +add.f16x2 r2611, r2155, r2171; +} +{ +mul.f16x2 r2614, r2611, r2652; +} +{ +add.f16x2 r2617, r358, r2614; +} +{ +sub.f16x2 r2620, r2161, r2177; +} +{ +mul.f16x2 r2623, r2620, r2661; +} +{ +sub.f16x2 %36, r2617, r2623; +} +{ +add.f16x2 r2629, r2161, r2177; +} +{ +mul.f16x2 r2632, r2629, r2652; +} +{ +add.f16x2 r2635, r682, r2632; +} +{ +sub.f16x2 r2638, r2155, r2171; +} +{ +mul.f16x2 r2641, r2638, r2661; +} +{ +sub.f16x2 %23, r2635, r2641; +} +{ +add.f16x2 r2647, r2161, r2177; +} +{ +mul.f16x2 r2650, r2647, r2652; +} +{ +add.f16x2 r2653, r682, r2650; +} +{ +sub.f16x2 r2656, r2155, r2171; +} +{ +mul.f16x2 r2659, r2656, r2661; +} +{ +add.f16x2 %37, r2653, r2659; +} +{ +cvt.rn.f16.f64 rs99, fd83; +} +mov.b32 r2736, {rs99, rs99}; +{ +cvt.rn.f16.f64 rs100, fd84; +} +mov.b32 r2745, {rs100, rs100}; +{ +add.f16x2 r2665, r2187, r2203; +} +{ +add.f16x2 %10, r250, r2665; +} +{ +add.f16x2 r2671, r2193, r2209; +} +{ +add.f16x2 %11, r574, r2671; +} +{ +add.f16x2 r2677, r2187, r2203; +} +{ +mul.f16x2 r2680, r2677, r2736; +} +{ +add.f16x2 r2683, r250, r2680; +} +{ +sub.f16x2 r2686, r2193, r2209; +} +{ +mul.f16x2 r2689, r2686, r2745; +} +{ +add.f16x2 %24, r2683, r2689; +} +{ +add.f16x2 r2695, r2187, r2203; +} +{ +mul.f16x2 r2698, r2695, r2736; +} +{ +add.f16x2 r2701, r250, r2698; +} +{ +sub.f16x2 r2704, r2193, r2209; +} +{ +mul.f16x2 r2707, r2704, r2745; +} +{ +sub.f16x2 %38, r2701, r2707; +} +{ +add.f16x2 r2713, r2193, r2209; +} +{ +mul.f16x2 r2716, r2713, r2736; +} +{ +add.f16x2 r2719, r574, r2716; +} +{ +sub.f16x2 r2722, r2187, r2203; +} +{ +mul.f16x2 r2725, r2722, r2745; +} +{ +sub.f16x2 %25, r2719, r2725; +} +{ +add.f16x2 r2731, r2193, r2209; +} +{ 
+mul.f16x2 r2734, r2731, r2736; +} +{ +add.f16x2 r2737, r574, r2734; +} +{ +sub.f16x2 r2740, r2187, r2203; +} +{ +mul.f16x2 r2743, r2740, r2745; +} +{ +add.f16x2 %39, r2737, r2743; +} +{ +cvt.rn.f16.f64 rs101, fd83; +} +mov.b32 r2820, {rs101, rs101}; +{ +cvt.rn.f16.f64 rs102, fd84; +} +mov.b32 r2829, {rs102, rs102}; +{ +add.f16x2 r2749, r2219, r2235; +} +{ +add.f16x2 %12, r142, r2749; +} +{ +add.f16x2 r2755, r2225, r2241; +} +{ +add.f16x2 %13, r466, r2755; +} +{ +add.f16x2 r2761, r2219, r2235; +} +{ +mul.f16x2 r2764, r2761, r2820; +} +{ +add.f16x2 r2767, r142, r2764; +} +{ +sub.f16x2 r2770, r2225, r2241; +} +{ +mul.f16x2 r2773, r2770, r2829; +} +{ +add.f16x2 %26, r2767, r2773; +} +{ +add.f16x2 r2779, r2219, r2235; +} +{ +mul.f16x2 r2782, r2779, r2820; +} +{ +add.f16x2 r2785, r142, r2782; +} +{ +sub.f16x2 r2788, r2225, r2241; +} +{ +mul.f16x2 r2791, r2788, r2829; +} +{ +sub.f16x2 %40, r2785, r2791; +} +{ +add.f16x2 r2797, r2225, r2241; +} +{ +mul.f16x2 r2800, r2797, r2820; +} +{ +add.f16x2 r2803, r466, r2800; +} +{ +sub.f16x2 r2806, r2219, r2235; +} +{ +mul.f16x2 r2809, r2806, r2829; +} +{ +sub.f16x2 %27, r2803, r2809; +} +{ +add.f16x2 r2815, r2225, r2241; +} +{ +mul.f16x2 r2818, r2815, r2820; +} +{ +add.f16x2 r2821, r466, r2818; +} +{ +sub.f16x2 r2824, r2219, r2235; +} +{ +mul.f16x2 r2827, r2824, r2829; +} +{ +add.f16x2 %41, r2821, r2827; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), 
"=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), 
"r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..094323ddf5d46 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_fwd.hpp.inc @@ -0,0 +1,452 @@ +#ifndef CUFFTDX_FFT_21_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_21_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<7, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<521>; +.reg .b64 rd<2>; +add.f32 f85, %50, %90; +add.f32 f86, %42, f85; +add.f32 f87, %58, %82; +add.f32 f88, f87, f86; +add.f32 f89, %66, %74; +add.f32 f90, f89, f88; +add.f32 f91, %51, %91; +add.f32 f92, %43, f91; +add.f32 f93, %59, %83; +add.f32 f94, f93, f92; +add.f32 f95, %67, %75; +add.f32 f96, f95, f94; +fma.rn.f32 f97, f85, 0f3F1F9D07, %42; +mul.f32 f98, f87, 0f3E63DC87; +sub.f32 f99, f97, f98; +mul.f32 f100, f89, 0f3F66A5E5; +sub.f32 f101, f99, f100; +sub.f32 f102, %51, %91; +mul.f32 f103, f102, 0f3F48261C; +sub.f32 f104, %59, %83; +mul.f32 f105, f104, 0fBF7994E0; +sub.f32 f106, f105, f103; +sub.f32 f107, %67, %75; +mul.f32 f108, f107, 0f3EDE2602; +sub.f32 f109, f106, f108; +sub.f32 f110, f101, f109; +add.f32 f111, f109, f101; +mul.f32 f112, f85, 0f3E63DC87; +sub.f32 f113, %42, f112; +mul.f32 f114, f87, 0f3F66A5E5; +sub.f32 f115, f113, f114; +fma.rn.f32 f116, f89, 0f3F1F9D07, f115; +mul.f32 f117, f102, 0f3F7994E0; +mul.f32 f118, f104, 0f3EDE2602; +sub.f32 f119, f118, f117; +fma.rn.f32 f120, f107, 0f3F48261C, f119; +sub.f32 f121, f116, f120; +add.f32 f122, f120, f116; +mul.f32 f123, f85, 0f3F66A5E5; +sub.f32 f124, %42, 
f123; +fma.rn.f32 f125, f87, 0f3F1F9D07, f124; +mul.f32 f126, f89, 0f3E63DC87; +sub.f32 f127, f125, f126; +mul.f32 f128, f102, 0f3EDE2602; +mul.f32 f129, f104, 0f3F48261C; +sub.f32 f130, f129, f128; +mul.f32 f131, f107, 0f3F7994E0; +sub.f32 f132, f130, f131; +sub.f32 f133, f127, f132; +add.f32 f134, f132, f127; +fma.rn.f32 f135, f91, 0f3F1F9D07, %43; +mul.f32 f136, f93, 0f3E63DC87; +sub.f32 f137, f135, f136; +mul.f32 f138, f95, 0f3F66A5E5; +sub.f32 f139, f137, f138; +sub.f32 f140, %50, %90; +mul.f32 f141, f140, 0f3F48261C; +sub.f32 f142, %58, %82; +mul.f32 f143, f142, 0fBF7994E0; +sub.f32 f144, f143, f141; +sub.f32 f145, %66, %74; +mul.f32 f146, f145, 0f3EDE2602; +sub.f32 f147, f144, f146; +add.f32 f148, f147, f139; +sub.f32 f149, f139, f147; +mul.f32 f150, f91, 0f3E63DC87; +sub.f32 f151, %43, f150; +mul.f32 f152, f93, 0f3F66A5E5; +sub.f32 f153, f151, f152; +fma.rn.f32 f154, f95, 0f3F1F9D07, f153; +mul.f32 f155, f140, 0f3F7994E0; +mul.f32 f156, f142, 0f3EDE2602; +sub.f32 f157, f156, f155; +fma.rn.f32 f158, f145, 0f3F48261C, f157; +add.f32 f159, f158, f154; +sub.f32 f160, f154, f158; +mul.f32 f161, f91, 0f3F66A5E5; +sub.f32 f162, %43, f161; +fma.rn.f32 f163, f93, 0f3F1F9D07, f162; +mul.f32 f164, f95, 0f3E63DC87; +sub.f32 f165, f163, f164; +mul.f32 f166, f140, 0f3EDE2602; +mul.f32 f167, f142, 0f3F48261C; +sub.f32 f168, f167, f166; +mul.f32 f169, f145, 0f3F7994E0; +sub.f32 f170, f168, f169; +add.f32 f171, f170, f165; +sub.f32 f172, f165, f170; +add.f32 f173, %52, %92; +add.f32 f174, %44, f173; +add.f32 f175, %60, %84; +add.f32 f176, f175, f174; +add.f32 f177, %68, %76; +add.f32 f178, f177, f176; +add.f32 f179, %54, %94; +add.f32 f180, %46, f179; +add.f32 f181, %62, %86; +add.f32 f182, f181, f180; +add.f32 f183, %70, %78; +add.f32 f184, f183, f182; +fma.rn.f32 f185, f173, 0f3F1F9D07, %44; +mul.f32 f186, f175, 0f3E63DC87; +sub.f32 f187, f185, f186; +mul.f32 f188, f177, 0f3F66A5E5; +sub.f32 f189, f187, f188; +sub.f32 f190, %54, %94; +mul.f32 f191, f190, 0f3F48261C; 
+sub.f32 f192, %62, %86; +mul.f32 f193, f192, 0fBF7994E0; +sub.f32 f194, f193, f191; +sub.f32 f195, %70, %78; +mul.f32 f196, f195, 0f3EDE2602; +sub.f32 f197, f194, f196; +sub.f32 f198, f189, f197; +add.f32 f199, f197, f189; +mul.f32 f200, f173, 0f3E63DC87; +sub.f32 f201, %44, f200; +mul.f32 f202, f175, 0f3F66A5E5; +sub.f32 f203, f201, f202; +fma.rn.f32 f204, f177, 0f3F1F9D07, f203; +mul.f32 f205, f190, 0f3F7994E0; +mul.f32 f206, f192, 0f3EDE2602; +sub.f32 f207, f206, f205; +fma.rn.f32 f208, f195, 0f3F48261C, f207; +sub.f32 f209, f204, f208; +add.f32 f210, f208, f204; +mul.f32 f211, f173, 0f3F66A5E5; +sub.f32 f212, %44, f211; +fma.rn.f32 f213, f175, 0f3F1F9D07, f212; +mul.f32 f214, f177, 0f3E63DC87; +sub.f32 f215, f213, f214; +mul.f32 f216, f190, 0f3EDE2602; +mul.f32 f217, f192, 0f3F48261C; +sub.f32 f218, f217, f216; +mul.f32 f219, f195, 0f3F7994E0; +sub.f32 f220, f218, f219; +sub.f32 f221, f215, f220; +add.f32 f222, f220, f215; +fma.rn.f32 f223, f179, 0f3F1F9D07, %46; +mul.f32 f224, f181, 0f3E63DC87; +sub.f32 f225, f223, f224; +mul.f32 f226, f183, 0f3F66A5E5; +sub.f32 f227, f225, f226; +sub.f32 f228, %52, %92; +mul.f32 f229, f228, 0f3F48261C; +sub.f32 f230, %60, %84; +mul.f32 f231, f230, 0fBF7994E0; +sub.f32 f232, f231, f229; +sub.f32 f233, %68, %76; +mul.f32 f234, f233, 0f3EDE2602; +sub.f32 f235, f232, f234; +add.f32 f236, f235, f227; +sub.f32 f237, f227, f235; +mul.f32 f238, f179, 0f3E63DC87; +sub.f32 f239, %46, f238; +mul.f32 f240, f181, 0f3F66A5E5; +sub.f32 f241, f239, f240; +fma.rn.f32 f242, f183, 0f3F1F9D07, f241; +mul.f32 f243, f228, 0f3F7994E0; +mul.f32 f244, f230, 0f3EDE2602; +sub.f32 f245, f244, f243; +fma.rn.f32 f246, f233, 0f3F48261C, f245; +add.f32 f247, f246, f242; +sub.f32 f248, f242, f246; +mul.f32 f249, f179, 0f3F66A5E5; +sub.f32 f250, %46, f249; +fma.rn.f32 f251, f181, 0f3F1F9D07, f250; +mul.f32 f252, f183, 0f3E63DC87; +sub.f32 f253, f251, f252; +mul.f32 f254, f228, 0f3EDE2602; +mul.f32 f255, f230, 0f3F48261C; +sub.f32 f256, f255, f254; +mul.f32 
f257, f233, 0f3F7994E0; +sub.f32 f258, f256, f257; +add.f32 f259, f258, f253; +sub.f32 f260, f253, f258; +add.f32 f261, %55, %95; +add.f32 f262, %47, f261; +add.f32 f263, %63, %87; +add.f32 f264, f263, f262; +add.f32 f265, %71, %79; +add.f32 f266, f265, f264; +add.f32 f267, %57, %96; +add.f32 f268, %49, f267; +add.f32 f269, %65, %89; +add.f32 f270, f269, f268; +add.f32 f271, %73, %81; +add.f32 f272, f271, f270; +fma.rn.f32 f273, f261, 0f3F1F9D07, %47; +mul.f32 f274, f263, 0f3E63DC87; +sub.f32 f275, f273, f274; +mul.f32 f276, f265, 0f3F66A5E5; +sub.f32 f277, f275, f276; +sub.f32 f278, %57, %96; +mul.f32 f279, f278, 0f3F48261C; +sub.f32 f280, %65, %89; +mul.f32 f281, f280, 0fBF7994E0; +sub.f32 f282, f281, f279; +sub.f32 f283, %73, %81; +mul.f32 f284, f283, 0f3EDE2602; +sub.f32 f285, f282, f284; +sub.f32 f286, f277, f285; +add.f32 f287, f285, f277; +mul.f32 f288, f261, 0f3E63DC87; +sub.f32 f289, %47, f288; +mul.f32 f290, f263, 0f3F66A5E5; +sub.f32 f291, f289, f290; +fma.rn.f32 f292, f265, 0f3F1F9D07, f291; +mul.f32 f293, f278, 0f3F7994E0; +mul.f32 f294, f280, 0f3EDE2602; +sub.f32 f295, f294, f293; +fma.rn.f32 f296, f283, 0f3F48261C, f295; +sub.f32 f297, f292, f296; +add.f32 f298, f296, f292; +mul.f32 f299, f261, 0f3F66A5E5; +sub.f32 f300, %47, f299; +fma.rn.f32 f301, f263, 0f3F1F9D07, f300; +mul.f32 f302, f265, 0f3E63DC87; +sub.f32 f303, f301, f302; +mul.f32 f304, f278, 0f3EDE2602; +mul.f32 f305, f280, 0f3F48261C; +sub.f32 f306, f305, f304; +mul.f32 f307, f283, 0f3F7994E0; +sub.f32 f308, f306, f307; +sub.f32 f309, f303, f308; +add.f32 f310, f308, f303; +fma.rn.f32 f311, f267, 0f3F1F9D07, %49; +mul.f32 f312, f269, 0f3E63DC87; +sub.f32 f313, f311, f312; +mul.f32 f314, f271, 0f3F66A5E5; +sub.f32 f315, f313, f314; +sub.f32 f316, %55, %95; +mul.f32 f317, f316, 0f3F48261C; +sub.f32 f318, %63, %87; +mul.f32 f319, f318, 0fBF7994E0; +sub.f32 f320, f319, f317; +sub.f32 f321, %71, %79; +mul.f32 f322, f321, 0f3EDE2602; +sub.f32 f323, f320, f322; +add.f32 f324, f323, f315; 
+sub.f32 f325, f315, f323; +mul.f32 f326, f267, 0f3E63DC87; +sub.f32 f327, %49, f326; +mul.f32 f328, f269, 0f3F66A5E5; +sub.f32 f329, f327, f328; +fma.rn.f32 f330, f271, 0f3F1F9D07, f329; +mul.f32 f331, f316, 0f3F7994E0; +mul.f32 f332, f318, 0f3EDE2602; +sub.f32 f333, f332, f331; +fma.rn.f32 f334, f321, 0f3F48261C, f333; +add.f32 f335, f334, f330; +sub.f32 f336, f330, f334; +mul.f32 f337, f267, 0f3F66A5E5; +sub.f32 f338, %49, f337; +fma.rn.f32 f339, f269, 0f3F1F9D07, f338; +mul.f32 f340, f271, 0f3E63DC87; +sub.f32 f341, f339, f340; +mul.f32 f342, f316, 0f3EDE2602; +mul.f32 f343, f318, 0f3F48261C; +sub.f32 f344, f343, f342; +mul.f32 f345, f321, 0f3F7994E0; +sub.f32 f346, f344, f345; +add.f32 f347, f346, f341; +sub.f32 f348, f341, f346; +mul.f32 f349, f198, 0f3F74A06B; +mul.f32 f350, f236, 0fBE96EA26; +sub.f32 f351, f349, f350; +mul.f32 f352, f236, 0f3F74A06B; +fma.rn.f32 f353, f198, 0fBE96EA26, f352; +mul.f32 f354, f286, 0f3F538462; +mul.f32 f355, f324, 0fBF1035BE; +sub.f32 f356, f354, f355; +mul.f32 f357, f324, 0f3F538462; +fma.rn.f32 f358, f286, 0fBF1035BE, f357; +mul.f32 f359, f209, 0f3F538462; +mul.f32 f360, f247, 0fBF1035BE; +sub.f32 f361, f359, f360; +mul.f32 f362, f247, 0f3F538462; +fma.rn.f32 f363, f209, 0fBF1035BE, f362; +mul.f32 f364, f297, 0f3EBB0DFB; +mul.f32 f365, f335, 0fBF6E4DBE; +sub.f32 f366, f364, f365; +mul.f32 f367, f335, 0f3EBB0DFB; +fma.rn.f32 f368, f297, 0fBF6E4DBE, f367; +mul.f32 f369, f221, 0f3F1F9D07; +mul.f32 f370, f259, 0fBF48261C; +sub.f32 f371, f369, f370; +mul.f32 f372, f259, 0f3F1F9D07; +fma.rn.f32 f373, f221, 0fBF48261C, f372; +mul.f32 f374, f309, 0fBE63DC87; +mul.f32 f375, f347, 0fBF7994E0; +sub.f32 f376, f374, f375; +mul.f32 f377, f347, 0fBE63DC87; +fma.rn.f32 f378, f309, 0fBF7994E0, f377; +mul.f32 f379, f222, 0f3EBB0DFB; +mul.f32 f380, f260, 0fBF6E4DBE; +sub.f32 f381, f379, f380; +mul.f32 f382, f260, 0f3EBB0DFB; +fma.rn.f32 f383, f222, 0fBF6E4DBE, f382; +mul.f32 f384, f310, 0fBF3BA94A; +mul.f32 f385, f348, 0fBF2E1FCD; +sub.f32 
f386, f384, f385; +mul.f32 f387, f348, 0fBF3BA94A; +fma.rn.f32 f388, f310, 0fBF2E1FCD, f387; +mul.f32 f389, f210, 0f3D990C17; +mul.f32 f390, f248, 0fBF7F48C0; +sub.f32 f391, f389, f390; +mul.f32 f392, f248, 0f3D990C17; +fma.rn.f32 f393, f210, 0fBF7F48C0, f392; +mul.f32 f394, f298, 0fBF7D2404; +mul.f32 f395, f336, 0fBE189E89; +sub.f32 f396, f394, f395; +mul.f32 f397, f336, 0fBF7D2404; +fma.rn.f32 f398, f298, 0fBE189E89, f397; +mul.f32 f399, f199, 0fBE63DC87; +mul.f32 f400, f237, 0fBF7994E0; +sub.f32 f401, f399, f400; +mul.f32 f402, f237, 0fBE63DC87; +fma.rn.f32 f403, f199, 0fBF7994E0, f402; +mul.f32 f404, f287, 0fBF66A5E5; +mul.f32 f405, f325, 0f3EDE2602; +sub.f32 f406, f404, f405; +mul.f32 f407, f325, 0fBF66A5E5; +fma.rn.f32 f408, f287, 0f3EDE2602, f407; +add.f32 f409, f178, f266; +add.f32 f410, f184, f272; +mul.f32 f411, f409, 0f3F000000; +sub.f32 f412, f90, f411; +sub.f32 f413, f184, f272; +mul.f32 f414, f413, 0f3F5DB3D7; +mul.f32 f415, f410, 0f3F000000; +sub.f32 f416, f96, f415; +sub.f32 f417, f178, f266; +mul.f32 f418, f417, 0f3F5DB3D7; +add.f32 f419, f351, f356; +add.f32 f420, f353, f358; +mul.f32 f421, f419, 0f3F000000; +sub.f32 f422, f110, f421; +sub.f32 f423, f353, f358; +mul.f32 f424, f423, 0f3F5DB3D7; +mul.f32 f425, f420, 0f3F000000; +sub.f32 f426, f148, f425; +sub.f32 f427, f351, f356; +mul.f32 f428, f427, 0f3F5DB3D7; +add.f32 f429, f361, f366; +add.f32 f430, f363, f368; +mul.f32 f431, f429, 0f3F000000; +sub.f32 f432, f121, f431; +sub.f32 f433, f363, f368; +mul.f32 f434, f433, 0f3F5DB3D7; +mul.f32 f435, f430, 0f3F000000; +sub.f32 f436, f159, f435; +sub.f32 f437, f361, f366; +mul.f32 f438, f437, 0f3F5DB3D7; +add.f32 f439, f371, f376; +add.f32 f440, f373, f378; +mul.f32 f441, f439, 0f3F000000; +sub.f32 f442, f133, f441; +sub.f32 f443, f373, f378; +mul.f32 f444, f443, 0f3F5DB3D7; +mul.f32 f445, f440, 0f3F000000; +sub.f32 f446, f171, f445; +sub.f32 f447, f371, f376; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f381, f386; +add.f32 f450, f383, f388; 
+mul.f32 f451, f449, 0f3F000000; +sub.f32 f452, f134, f451; +sub.f32 f453, f383, f388; +mul.f32 f454, f453, 0f3F5DB3D7; +mul.f32 f455, f450, 0f3F000000; +sub.f32 f456, f172, f455; +sub.f32 f457, f381, f386; +mul.f32 f458, f457, 0f3F5DB3D7; +add.f32 f459, f391, f396; +add.f32 f460, f393, f398; +mul.f32 f461, f459, 0f3F000000; +sub.f32 f462, f122, f461; +sub.f32 f463, f393, f398; +mul.f32 f464, f463, 0f3F5DB3D7; +mul.f32 f465, f460, 0f3F000000; +sub.f32 f466, f160, f465; +sub.f32 f467, f391, f396; +mul.f32 f468, f467, 0f3F5DB3D7; +add.f32 f469, f401, f406; +add.f32 f470, f403, f408; +mul.f32 f471, f469, 0f3F000000; +sub.f32 f472, f111, f471; +sub.f32 f473, f403, f408; +mul.f32 f474, f473, 0f3F5DB3D7; +mul.f32 f475, f470, 0f3F000000; +sub.f32 f476, f149, f475; +sub.f32 f477, f401, f406; +mul.f32 f478, f477, 0f3F5DB3D7; +add.f32 %1, f96, f410; +add.f32 %0, f90, f409; +add.f32 %3, f148, f420; +add.f32 %2, f110, f419; +add.f32 %5, f159, f430; +add.f32 %4, f121, f429; +add.f32 %7, f171, f440; +add.f32 %6, f133, f439; +add.f32 %9, f172, f450; +add.f32 %8, f134, f449; +add.f32 %11, f160, f460; +add.f32 %10, f122, f459; +add.f32 %13, f149, f470; +add.f32 %12, f111, f469; +sub.f32 %15, f416, f418; +add.f32 %14, f414, f412; +sub.f32 %17, f426, f428; +add.f32 %16, f424, f422; +sub.f32 %19, f436, f438; +add.f32 %18, f434, f432; +sub.f32 %21, f446, f448; +add.f32 %20, f444, f442; +sub.f32 %23, f456, f458; +add.f32 %22, f454, f452; +sub.f32 %25, f466, f468; +add.f32 %24, f464, f462; +sub.f32 %27, f476, f478; +add.f32 %26, f474, f472; +add.f32 %29, f418, f416; +sub.f32 %28, f412, f414; +add.f32 %31, f428, f426; +sub.f32 %30, f422, f424; +add.f32 %33, f438, f436; +sub.f32 %32, f432, f434; +add.f32 %35, f448, f446; +sub.f32 %34, f442, f444; +add.f32 %37, f458, f456; +sub.f32 %36, f452, f454; +add.f32 %39, f468, f466; +sub.f32 %38, f462, f464; +add.f32 %41, f478, f476; +sub.f32 %40, f472, f474; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), 
"=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..2073ef35c032d --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp32_inv.hpp.inc @@ -0,0 +1,440 @@ +#ifndef CUFFTDX_FFT_21_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_21_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<209, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<509>; +.reg .b64 rd<2>; +add.f32 f85, %50, %90; +add.f32 f86, %42, f85; +add.f32 f87, %58, %82; +add.f32 f88, f87, f86; +add.f32 f89, %66, %74; +add.f32 f90, f89, f88; +add.f32 f91, %51, %91; +add.f32 f92, %43, f91; +add.f32 f93, %59, %83; +add.f32 f94, f93, f92; +add.f32 f95, %67, %75; +add.f32 f96, f95, f94; +fma.rn.f32 f97, f85, 0f3F1F9D07, %42; +mul.f32 f98, f87, 0f3E63DC87; +sub.f32 f99, f97, f98; +mul.f32 f100, f89, 0f3F66A5E5; +sub.f32 f101, f99, f100; +sub.f32 f102, %51, %91; +mul.f32 f103, f102, 0f3F48261C; +sub.f32 f104, %59, %83; +fma.rn.f32 f105, f104, 0f3F7994E0, f103; +sub.f32 f106, %67, %75; +fma.rn.f32 f107, f106, 0f3EDE2602, f105; +sub.f32 f108, f101, f107; +add.f32 f109, f107, f101; +mul.f32 f110, f85, 0f3E63DC87; +sub.f32 f111, %42, f110; +mul.f32 f112, f87, 0f3F66A5E5; +sub.f32 f113, f111, f112; +fma.rn.f32 f114, f89, 0f3F1F9D07, f113; +mul.f32 f115, f102, 0f3F7994E0; +mul.f32 f116, f104, 0f3EDE2602; +sub.f32 f117, f115, f116; +mul.f32 f118, f106, 0f3F48261C; +sub.f32 f119, f117, f118; +sub.f32 f120, f114, f119; +add.f32 f121, f119, f114; +mul.f32 f122, f85, 0f3F66A5E5; +sub.f32 f123, %42, f122; +fma.rn.f32 f124, f87, 0f3F1F9D07, f123; +mul.f32 f125, f89, 0f3E63DC87; +sub.f32 f126, f124, f125; +mul.f32 f127, f102, 0f3EDE2602; +mul.f32 f128, f104, 0f3F48261C; +sub.f32 f129, f127, f128; +fma.rn.f32 f130, f106, 0f3F7994E0, f129; +sub.f32 f131, f126, f130; +add.f32 f132, f130, f126; +fma.rn.f32 f133, f91, 0f3F1F9D07, %43; +mul.f32 f134, f93, 0f3E63DC87; +sub.f32 f135, f133, f134; +mul.f32 f136, f95, 0f3F66A5E5; +sub.f32 f137, f135, f136; +sub.f32 f138, %50, 
%90; +mul.f32 f139, f138, 0f3F48261C; +sub.f32 f140, %58, %82; +fma.rn.f32 f141, f140, 0f3F7994E0, f139; +sub.f32 f142, %66, %74; +fma.rn.f32 f143, f142, 0f3EDE2602, f141; +add.f32 f144, f143, f137; +sub.f32 f145, f137, f143; +mul.f32 f146, f91, 0f3E63DC87; +sub.f32 f147, %43, f146; +mul.f32 f148, f93, 0f3F66A5E5; +sub.f32 f149, f147, f148; +fma.rn.f32 f150, f95, 0f3F1F9D07, f149; +mul.f32 f151, f138, 0f3F7994E0; +mul.f32 f152, f140, 0f3EDE2602; +sub.f32 f153, f151, f152; +mul.f32 f154, f142, 0f3F48261C; +sub.f32 f155, f153, f154; +add.f32 f156, f155, f150; +sub.f32 f157, f150, f155; +mul.f32 f158, f91, 0f3F66A5E5; +sub.f32 f159, %43, f158; +fma.rn.f32 f160, f93, 0f3F1F9D07, f159; +mul.f32 f161, f95, 0f3E63DC87; +sub.f32 f162, f160, f161; +mul.f32 f163, f138, 0f3EDE2602; +mul.f32 f164, f140, 0f3F48261C; +sub.f32 f165, f163, f164; +fma.rn.f32 f166, f142, 0f3F7994E0, f165; +add.f32 f167, f166, f162; +sub.f32 f168, f162, f166; +add.f32 f169, %52, %92; +add.f32 f170, %44, f169; +add.f32 f171, %60, %84; +add.f32 f172, f171, f170; +add.f32 f173, %68, %76; +add.f32 f174, f173, f172; +add.f32 f175, %54, %94; +add.f32 f176, %46, f175; +add.f32 f177, %62, %86; +add.f32 f178, f177, f176; +add.f32 f179, %70, %78; +add.f32 f180, f179, f178; +fma.rn.f32 f181, f169, 0f3F1F9D07, %44; +mul.f32 f182, f171, 0f3E63DC87; +sub.f32 f183, f181, f182; +mul.f32 f184, f173, 0f3F66A5E5; +sub.f32 f185, f183, f184; +sub.f32 f186, %54, %94; +mul.f32 f187, f186, 0f3F48261C; +sub.f32 f188, %62, %86; +fma.rn.f32 f189, f188, 0f3F7994E0, f187; +sub.f32 f190, %70, %78; +fma.rn.f32 f191, f190, 0f3EDE2602, f189; +sub.f32 f192, f185, f191; +add.f32 f193, f191, f185; +mul.f32 f194, f169, 0f3E63DC87; +sub.f32 f195, %44, f194; +mul.f32 f196, f171, 0f3F66A5E5; +sub.f32 f197, f195, f196; +fma.rn.f32 f198, f173, 0f3F1F9D07, f197; +mul.f32 f199, f186, 0f3F7994E0; +mul.f32 f200, f188, 0f3EDE2602; +sub.f32 f201, f199, f200; +mul.f32 f202, f190, 0f3F48261C; +sub.f32 f203, f201, f202; +sub.f32 f204, f198, f203; 
+add.f32 f205, f203, f198; +mul.f32 f206, f169, 0f3F66A5E5; +sub.f32 f207, %44, f206; +fma.rn.f32 f208, f171, 0f3F1F9D07, f207; +mul.f32 f209, f173, 0f3E63DC87; +sub.f32 f210, f208, f209; +mul.f32 f211, f186, 0f3EDE2602; +mul.f32 f212, f188, 0f3F48261C; +sub.f32 f213, f211, f212; +fma.rn.f32 f214, f190, 0f3F7994E0, f213; +sub.f32 f215, f210, f214; +add.f32 f216, f214, f210; +fma.rn.f32 f217, f175, 0f3F1F9D07, %46; +mul.f32 f218, f177, 0f3E63DC87; +sub.f32 f219, f217, f218; +mul.f32 f220, f179, 0f3F66A5E5; +sub.f32 f221, f219, f220; +sub.f32 f222, %52, %92; +mul.f32 f223, f222, 0f3F48261C; +sub.f32 f224, %60, %84; +fma.rn.f32 f225, f224, 0f3F7994E0, f223; +sub.f32 f226, %68, %76; +fma.rn.f32 f227, f226, 0f3EDE2602, f225; +add.f32 f228, f227, f221; +sub.f32 f229, f221, f227; +mul.f32 f230, f175, 0f3E63DC87; +sub.f32 f231, %46, f230; +mul.f32 f232, f177, 0f3F66A5E5; +sub.f32 f233, f231, f232; +fma.rn.f32 f234, f179, 0f3F1F9D07, f233; +mul.f32 f235, f222, 0f3F7994E0; +mul.f32 f236, f224, 0f3EDE2602; +sub.f32 f237, f235, f236; +mul.f32 f238, f226, 0f3F48261C; +sub.f32 f239, f237, f238; +add.f32 f240, f239, f234; +sub.f32 f241, f234, f239; +mul.f32 f242, f175, 0f3F66A5E5; +sub.f32 f243, %46, f242; +fma.rn.f32 f244, f177, 0f3F1F9D07, f243; +mul.f32 f245, f179, 0f3E63DC87; +sub.f32 f246, f244, f245; +mul.f32 f247, f222, 0f3EDE2602; +mul.f32 f248, f224, 0f3F48261C; +sub.f32 f249, f247, f248; +fma.rn.f32 f250, f226, 0f3F7994E0, f249; +add.f32 f251, f250, f246; +sub.f32 f252, f246, f250; +add.f32 f253, %55, %95; +add.f32 f254, %47, f253; +add.f32 f255, %63, %87; +add.f32 f256, f255, f254; +add.f32 f257, %71, %79; +add.f32 f258, f257, f256; +add.f32 f259, %57, %96; +add.f32 f260, %49, f259; +add.f32 f261, %65, %89; +add.f32 f262, f261, f260; +add.f32 f263, %73, %81; +add.f32 f264, f263, f262; +fma.rn.f32 f265, f253, 0f3F1F9D07, %47; +mul.f32 f266, f255, 0f3E63DC87; +sub.f32 f267, f265, f266; +mul.f32 f268, f257, 0f3F66A5E5; +sub.f32 f269, f267, f268; +sub.f32 f270, %57, %96; 
+mul.f32 f271, f270, 0f3F48261C; +sub.f32 f272, %65, %89; +fma.rn.f32 f273, f272, 0f3F7994E0, f271; +sub.f32 f274, %73, %81; +fma.rn.f32 f275, f274, 0f3EDE2602, f273; +sub.f32 f276, f269, f275; +add.f32 f277, f275, f269; +mul.f32 f278, f253, 0f3E63DC87; +sub.f32 f279, %47, f278; +mul.f32 f280, f255, 0f3F66A5E5; +sub.f32 f281, f279, f280; +fma.rn.f32 f282, f257, 0f3F1F9D07, f281; +mul.f32 f283, f270, 0f3F7994E0; +mul.f32 f284, f272, 0f3EDE2602; +sub.f32 f285, f283, f284; +mul.f32 f286, f274, 0f3F48261C; +sub.f32 f287, f285, f286; +sub.f32 f288, f282, f287; +add.f32 f289, f287, f282; +mul.f32 f290, f253, 0f3F66A5E5; +sub.f32 f291, %47, f290; +fma.rn.f32 f292, f255, 0f3F1F9D07, f291; +mul.f32 f293, f257, 0f3E63DC87; +sub.f32 f294, f292, f293; +mul.f32 f295, f270, 0f3EDE2602; +mul.f32 f296, f272, 0f3F48261C; +sub.f32 f297, f295, f296; +fma.rn.f32 f298, f274, 0f3F7994E0, f297; +sub.f32 f299, f294, f298; +add.f32 f300, f298, f294; +fma.rn.f32 f301, f259, 0f3F1F9D07, %49; +mul.f32 f302, f261, 0f3E63DC87; +sub.f32 f303, f301, f302; +mul.f32 f304, f263, 0f3F66A5E5; +sub.f32 f305, f303, f304; +sub.f32 f306, %55, %95; +mul.f32 f307, f306, 0f3F48261C; +sub.f32 f308, %63, %87; +fma.rn.f32 f309, f308, 0f3F7994E0, f307; +sub.f32 f310, %71, %79; +fma.rn.f32 f311, f310, 0f3EDE2602, f309; +add.f32 f312, f311, f305; +sub.f32 f313, f305, f311; +mul.f32 f314, f259, 0f3E63DC87; +sub.f32 f315, %49, f314; +mul.f32 f316, f261, 0f3F66A5E5; +sub.f32 f317, f315, f316; +fma.rn.f32 f318, f263, 0f3F1F9D07, f317; +mul.f32 f319, f306, 0f3F7994E0; +mul.f32 f320, f308, 0f3EDE2602; +sub.f32 f321, f319, f320; +mul.f32 f322, f310, 0f3F48261C; +sub.f32 f323, f321, f322; +add.f32 f324, f323, f318; +sub.f32 f325, f318, f323; +mul.f32 f326, f259, 0f3F66A5E5; +sub.f32 f327, %49, f326; +fma.rn.f32 f328, f261, 0f3F1F9D07, f327; +mul.f32 f329, f263, 0f3E63DC87; +sub.f32 f330, f328, f329; +mul.f32 f331, f306, 0f3EDE2602; +mul.f32 f332, f308, 0f3F48261C; +sub.f32 f333, f331, f332; +fma.rn.f32 f334, f310, 
0f3F7994E0, f333; +add.f32 f335, f334, f330; +sub.f32 f336, f330, f334; +mul.f32 f337, f192, 0f3F74A06B; +mul.f32 f338, f228, 0f3E96EA26; +sub.f32 f339, f337, f338; +mul.f32 f340, f228, 0f3F74A06B; +fma.rn.f32 f341, f192, 0f3E96EA26, f340; +mul.f32 f342, f276, 0f3F538462; +mul.f32 f343, f312, 0f3F1035BE; +sub.f32 f344, f342, f343; +mul.f32 f345, f312, 0f3F538462; +fma.rn.f32 f346, f276, 0f3F1035BE, f345; +mul.f32 f347, f204, 0f3F538462; +mul.f32 f348, f240, 0f3F1035BE; +sub.f32 f349, f347, f348; +mul.f32 f350, f240, 0f3F538462; +fma.rn.f32 f351, f204, 0f3F1035BE, f350; +mul.f32 f352, f288, 0f3EBB0DFB; +mul.f32 f353, f324, 0f3F6E4DBE; +sub.f32 f354, f352, f353; +mul.f32 f355, f324, 0f3EBB0DFB; +fma.rn.f32 f356, f288, 0f3F6E4DBE, f355; +mul.f32 f357, f215, 0f3F1F9D07; +mul.f32 f358, f251, 0f3F48261C; +sub.f32 f359, f357, f358; +mul.f32 f360, f251, 0f3F1F9D07; +fma.rn.f32 f361, f215, 0f3F48261C, f360; +mul.f32 f362, f299, 0fBE63DC87; +mul.f32 f363, f335, 0f3F7994E0; +sub.f32 f364, f362, f363; +mul.f32 f365, f335, 0fBE63DC87; +fma.rn.f32 f366, f299, 0f3F7994E0, f365; +mul.f32 f367, f216, 0f3EBB0DFB; +mul.f32 f368, f252, 0f3F6E4DBE; +sub.f32 f369, f367, f368; +mul.f32 f370, f252, 0f3EBB0DFB; +fma.rn.f32 f371, f216, 0f3F6E4DBE, f370; +mul.f32 f372, f300, 0fBF3BA94A; +mul.f32 f373, f336, 0f3F2E1FCD; +sub.f32 f374, f372, f373; +mul.f32 f375, f336, 0fBF3BA94A; +fma.rn.f32 f376, f300, 0f3F2E1FCD, f375; +mul.f32 f377, f205, 0f3D990C17; +mul.f32 f378, f241, 0f3F7F48C0; +sub.f32 f379, f377, f378; +mul.f32 f380, f241, 0f3D990C17; +fma.rn.f32 f381, f205, 0f3F7F48C0, f380; +mul.f32 f382, f289, 0fBF7D2404; +mul.f32 f383, f325, 0f3E189E89; +sub.f32 f384, f382, f383; +mul.f32 f385, f325, 0fBF7D2404; +fma.rn.f32 f386, f289, 0f3E189E89, f385; +mul.f32 f387, f193, 0fBE63DC87; +mul.f32 f388, f229, 0f3F7994E0; +sub.f32 f389, f387, f388; +mul.f32 f390, f229, 0fBE63DC87; +fma.rn.f32 f391, f193, 0f3F7994E0, f390; +mul.f32 f392, f277, 0fBF66A5E5; +mul.f32 f393, f313, 0fBEDE2602; +sub.f32 
f394, f392, f393; +mul.f32 f395, f313, 0fBF66A5E5; +fma.rn.f32 f396, f277, 0fBEDE2602, f395; +add.f32 f397, f174, f258; +add.f32 f398, f180, f264; +mul.f32 f399, f397, 0f3F000000; +sub.f32 f400, f90, f399; +sub.f32 f401, f180, f264; +mul.f32 f402, f401, 0fBF5DB3D7; +mul.f32 f403, f398, 0f3F000000; +sub.f32 f404, f96, f403; +sub.f32 f405, f174, f258; +mul.f32 f406, f405, 0fBF5DB3D7; +add.f32 f407, f339, f344; +add.f32 f408, f341, f346; +mul.f32 f409, f407, 0f3F000000; +sub.f32 f410, f108, f409; +sub.f32 f411, f341, f346; +mul.f32 f412, f411, 0fBF5DB3D7; +mul.f32 f413, f408, 0f3F000000; +sub.f32 f414, f144, f413; +sub.f32 f415, f339, f344; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f349, f354; +add.f32 f418, f351, f356; +mul.f32 f419, f417, 0f3F000000; +sub.f32 f420, f120, f419; +sub.f32 f421, f351, f356; +mul.f32 f422, f421, 0fBF5DB3D7; +mul.f32 f423, f418, 0f3F000000; +sub.f32 f424, f156, f423; +sub.f32 f425, f349, f354; +mul.f32 f426, f425, 0fBF5DB3D7; +add.f32 f427, f359, f364; +add.f32 f428, f361, f366; +mul.f32 f429, f427, 0f3F000000; +sub.f32 f430, f131, f429; +sub.f32 f431, f361, f366; +mul.f32 f432, f431, 0fBF5DB3D7; +mul.f32 f433, f428, 0f3F000000; +sub.f32 f434, f167, f433; +sub.f32 f435, f359, f364; +mul.f32 f436, f435, 0fBF5DB3D7; +add.f32 f437, f369, f374; +add.f32 f438, f371, f376; +mul.f32 f439, f437, 0f3F000000; +sub.f32 f440, f132, f439; +sub.f32 f441, f371, f376; +mul.f32 f442, f441, 0fBF5DB3D7; +mul.f32 f443, f438, 0f3F000000; +sub.f32 f444, f168, f443; +sub.f32 f445, f369, f374; +mul.f32 f446, f445, 0fBF5DB3D7; +add.f32 f447, f379, f384; +add.f32 f448, f381, f386; +mul.f32 f449, f447, 0f3F000000; +sub.f32 f450, f121, f449; +sub.f32 f451, f381, f386; +mul.f32 f452, f451, 0fBF5DB3D7; +mul.f32 f453, f448, 0f3F000000; +sub.f32 f454, f157, f453; +sub.f32 f455, f379, f384; +mul.f32 f456, f455, 0fBF5DB3D7; +add.f32 f457, f389, f394; +add.f32 f458, f391, f396; +mul.f32 f459, f457, 0f3F000000; +sub.f32 f460, f109, f459; +sub.f32 f461, f391, f396; 
+mul.f32 f462, f461, 0fBF5DB3D7; +mul.f32 f463, f458, 0f3F000000; +sub.f32 f464, f145, f463; +sub.f32 f465, f389, f394; +mul.f32 f466, f465, 0fBF5DB3D7; +add.f32 %1, f96, f398; +add.f32 %0, f90, f397; +add.f32 %3, f144, f408; +add.f32 %2, f108, f407; +add.f32 %5, f156, f418; +add.f32 %4, f120, f417; +add.f32 %7, f167, f428; +add.f32 %6, f131, f427; +add.f32 %9, f168, f438; +add.f32 %8, f132, f437; +add.f32 %11, f157, f448; +add.f32 %10, f121, f447; +add.f32 %13, f145, f458; +add.f32 %12, f109, f457; +sub.f32 %15, f404, f406; +add.f32 %14, f402, f400; +sub.f32 %17, f414, f416; +add.f32 %16, f412, f410; +sub.f32 %19, f424, f426; +add.f32 %18, f422, f420; +sub.f32 %21, f434, f436; +add.f32 %20, f432, f430; +sub.f32 %23, f444, f446; +add.f32 %22, f442, f440; +sub.f32 %25, f454, f456; +add.f32 %24, f452, f450; +sub.f32 %27, f464, f466; +add.f32 %26, f462, f460; +add.f32 %29, f406, f404; +sub.f32 %28, f400, f402; +add.f32 %31, f416, f414; +sub.f32 %30, f410, f412; +add.f32 %33, f426, f424; +sub.f32 %32, f420, f422; +add.f32 %35, f436, f434; +sub.f32 %34, f430, f432; +add.f32 %37, f446, f444; +sub.f32 %36, f440, f442; +add.f32 %39, f456, f454; +sub.f32 %38, f450, f452; +add.f32 %41, f466, f464; +sub.f32 %40, f460, f462; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y): "f"(rmem[0].x), 
"f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..de6ea978ffb1b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_fwd.hpp.inc @@ -0,0 +1,452 @@ +#ifndef CUFFTDX_FFT_21_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_21_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<411, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<521>; +.reg .b64 rd<2>; +add.f64 fd85, %50, %90; +add.f64 fd86, %42, fd85; +add.f64 fd87, %58, %82; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %66, %74; +add.f64 fd90, fd89, fd88; +add.f64 fd91, %51, %91; +add.f64 fd92, %43, fd91; +add.f64 fd93, %59, %83; +add.f64 fd94, fd93, fd92; +add.f64 fd95, %67, %75; +add.f64 fd96, fd95, fd94; +fma.rn.f64 
fd97, fd85, 0d3FE3F3A0E28BEDD1, %42; +mul.f64 fd98, fd87, 0d3FCC7B90E3024582; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd89, 0d3FECD4BCA9CB5C71; +sub.f64 fd101, fd99, fd100; +sub.f64 fd102, %51, %91; +mul.f64 fd103, fd102, 0d3FE904C37505DE4B; +sub.f64 fd104, %59, %83; +mul.f64 fd105, fd104, 0dBFEF329C0558E969; +sub.f64 fd106, fd105, fd103; +sub.f64 fd107, %67, %75; +mul.f64 fd108, fd107, 0d3FDBC4C04D71ABC1; +sub.f64 fd109, fd106, fd108; +sub.f64 fd110, fd101, fd109; +add.f64 fd111, fd109, fd101; +mul.f64 fd112, fd85, 0d3FCC7B90E3024582; +sub.f64 fd113, %42, fd112; +mul.f64 fd114, fd87, 0d3FECD4BCA9CB5C71; +sub.f64 fd115, fd113, fd114; +fma.rn.f64 fd116, fd89, 0d3FE3F3A0E28BEDD1, fd115; +mul.f64 fd117, fd102, 0d3FEF329C0558E969; +mul.f64 fd118, fd104, 0d3FDBC4C04D71ABC1; +sub.f64 fd119, fd118, fd117; +fma.rn.f64 fd120, fd107, 0d3FE904C37505DE4B, fd119; +sub.f64 fd121, fd116, fd120; +add.f64 fd122, fd120, fd116; +mul.f64 fd123, fd85, 0d3FECD4BCA9CB5C71; +sub.f64 fd124, %42, fd123; +fma.rn.f64 fd125, fd87, 0d3FE3F3A0E28BEDD1, fd124; +mul.f64 fd126, fd89, 0d3FCC7B90E3024582; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd102, 0d3FDBC4C04D71ABC1; +mul.f64 fd129, fd104, 0d3FE904C37505DE4B; +sub.f64 fd130, fd129, fd128; +mul.f64 fd131, fd107, 0d3FEF329C0558E969; +sub.f64 fd132, fd130, fd131; +sub.f64 fd133, fd127, fd132; +add.f64 fd134, fd132, fd127; +fma.rn.f64 fd135, fd91, 0d3FE3F3A0E28BEDD1, %43; +mul.f64 fd136, fd93, 0d3FCC7B90E3024582; +sub.f64 fd137, fd135, fd136; +mul.f64 fd138, fd95, 0d3FECD4BCA9CB5C71; +sub.f64 fd139, fd137, fd138; +sub.f64 fd140, %50, %90; +mul.f64 fd141, fd140, 0d3FE904C37505DE4B; +sub.f64 fd142, %58, %82; +mul.f64 fd143, fd142, 0dBFEF329C0558E969; +sub.f64 fd144, fd143, fd141; +sub.f64 fd145, %66, %74; +mul.f64 fd146, fd145, 0d3FDBC4C04D71ABC1; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd147, fd139; +sub.f64 fd149, fd139, fd147; +mul.f64 fd150, fd91, 0d3FCC7B90E3024582; +sub.f64 fd151, %43, fd150; +mul.f64 fd152, fd93, 
0d3FECD4BCA9CB5C71; +sub.f64 fd153, fd151, fd152; +fma.rn.f64 fd154, fd95, 0d3FE3F3A0E28BEDD1, fd153; +mul.f64 fd155, fd140, 0d3FEF329C0558E969; +mul.f64 fd156, fd142, 0d3FDBC4C04D71ABC1; +sub.f64 fd157, fd156, fd155; +fma.rn.f64 fd158, fd145, 0d3FE904C37505DE4B, fd157; +add.f64 fd159, fd158, fd154; +sub.f64 fd160, fd154, fd158; +mul.f64 fd161, fd91, 0d3FECD4BCA9CB5C71; +sub.f64 fd162, %43, fd161; +fma.rn.f64 fd163, fd93, 0d3FE3F3A0E28BEDD1, fd162; +mul.f64 fd164, fd95, 0d3FCC7B90E3024582; +sub.f64 fd165, fd163, fd164; +mul.f64 fd166, fd140, 0d3FDBC4C04D71ABC1; +mul.f64 fd167, fd142, 0d3FE904C37505DE4B; +sub.f64 fd168, fd167, fd166; +mul.f64 fd169, fd145, 0d3FEF329C0558E969; +sub.f64 fd170, fd168, fd169; +add.f64 fd171, fd170, fd165; +sub.f64 fd172, fd165, fd170; +add.f64 fd173, %52, %92; +add.f64 fd174, %44, fd173; +add.f64 fd175, %60, %84; +add.f64 fd176, fd175, fd174; +add.f64 fd177, %68, %76; +add.f64 fd178, fd177, fd176; +add.f64 fd179, %54, %94; +add.f64 fd180, %46, fd179; +add.f64 fd181, %62, %86; +add.f64 fd182, fd181, fd180; +add.f64 fd183, %70, %78; +add.f64 fd184, fd183, fd182; +fma.rn.f64 fd185, fd173, 0d3FE3F3A0E28BEDD1, %44; +mul.f64 fd186, fd175, 0d3FCC7B90E3024582; +sub.f64 fd187, fd185, fd186; +mul.f64 fd188, fd177, 0d3FECD4BCA9CB5C71; +sub.f64 fd189, fd187, fd188; +sub.f64 fd190, %54, %94; +mul.f64 fd191, fd190, 0d3FE904C37505DE4B; +sub.f64 fd192, %62, %86; +mul.f64 fd193, fd192, 0dBFEF329C0558E969; +sub.f64 fd194, fd193, fd191; +sub.f64 fd195, %70, %78; +mul.f64 fd196, fd195, 0d3FDBC4C04D71ABC1; +sub.f64 fd197, fd194, fd196; +sub.f64 fd198, fd189, fd197; +add.f64 fd199, fd197, fd189; +mul.f64 fd200, fd173, 0d3FCC7B90E3024582; +sub.f64 fd201, %44, fd200; +mul.f64 fd202, fd175, 0d3FECD4BCA9CB5C71; +sub.f64 fd203, fd201, fd202; +fma.rn.f64 fd204, fd177, 0d3FE3F3A0E28BEDD1, fd203; +mul.f64 fd205, fd190, 0d3FEF329C0558E969; +mul.f64 fd206, fd192, 0d3FDBC4C04D71ABC1; +sub.f64 fd207, fd206, fd205; +fma.rn.f64 fd208, fd195, 0d3FE904C37505DE4B, fd207; 
+sub.f64 fd209, fd204, fd208; +add.f64 fd210, fd208, fd204; +mul.f64 fd211, fd173, 0d3FECD4BCA9CB5C71; +sub.f64 fd212, %44, fd211; +fma.rn.f64 fd213, fd175, 0d3FE3F3A0E28BEDD1, fd212; +mul.f64 fd214, fd177, 0d3FCC7B90E3024582; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd190, 0d3FDBC4C04D71ABC1; +mul.f64 fd217, fd192, 0d3FE904C37505DE4B; +sub.f64 fd218, fd217, fd216; +mul.f64 fd219, fd195, 0d3FEF329C0558E969; +sub.f64 fd220, fd218, fd219; +sub.f64 fd221, fd215, fd220; +add.f64 fd222, fd220, fd215; +fma.rn.f64 fd223, fd179, 0d3FE3F3A0E28BEDD1, %46; +mul.f64 fd224, fd181, 0d3FCC7B90E3024582; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd183, 0d3FECD4BCA9CB5C71; +sub.f64 fd227, fd225, fd226; +sub.f64 fd228, %52, %92; +mul.f64 fd229, fd228, 0d3FE904C37505DE4B; +sub.f64 fd230, %60, %84; +mul.f64 fd231, fd230, 0dBFEF329C0558E969; +sub.f64 fd232, fd231, fd229; +sub.f64 fd233, %68, %76; +mul.f64 fd234, fd233, 0d3FDBC4C04D71ABC1; +sub.f64 fd235, fd232, fd234; +add.f64 fd236, fd235, fd227; +sub.f64 fd237, fd227, fd235; +mul.f64 fd238, fd179, 0d3FCC7B90E3024582; +sub.f64 fd239, %46, fd238; +mul.f64 fd240, fd181, 0d3FECD4BCA9CB5C71; +sub.f64 fd241, fd239, fd240; +fma.rn.f64 fd242, fd183, 0d3FE3F3A0E28BEDD1, fd241; +mul.f64 fd243, fd228, 0d3FEF329C0558E969; +mul.f64 fd244, fd230, 0d3FDBC4C04D71ABC1; +sub.f64 fd245, fd244, fd243; +fma.rn.f64 fd246, fd233, 0d3FE904C37505DE4B, fd245; +add.f64 fd247, fd246, fd242; +sub.f64 fd248, fd242, fd246; +mul.f64 fd249, fd179, 0d3FECD4BCA9CB5C71; +sub.f64 fd250, %46, fd249; +fma.rn.f64 fd251, fd181, 0d3FE3F3A0E28BEDD1, fd250; +mul.f64 fd252, fd183, 0d3FCC7B90E3024582; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd228, 0d3FDBC4C04D71ABC1; +mul.f64 fd255, fd230, 0d3FE904C37505DE4B; +sub.f64 fd256, fd255, fd254; +mul.f64 fd257, fd233, 0d3FEF329C0558E969; +sub.f64 fd258, fd256, fd257; +add.f64 fd259, fd258, fd253; +sub.f64 fd260, fd253, fd258; +add.f64 fd261, %55, %95; +add.f64 fd262, %47, fd261; +add.f64 fd263, %63, %87; +add.f64 fd264, 
fd263, fd262; +add.f64 fd265, %71, %79; +add.f64 fd266, fd265, fd264; +add.f64 fd267, %57, %96; +add.f64 fd268, %49, fd267; +add.f64 fd269, %65, %89; +add.f64 fd270, fd269, fd268; +add.f64 fd271, %73, %81; +add.f64 fd272, fd271, fd270; +fma.rn.f64 fd273, fd261, 0d3FE3F3A0E28BEDD1, %47; +mul.f64 fd274, fd263, 0d3FCC7B90E3024582; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd265, 0d3FECD4BCA9CB5C71; +sub.f64 fd277, fd275, fd276; +sub.f64 fd278, %57, %96; +mul.f64 fd279, fd278, 0d3FE904C37505DE4B; +sub.f64 fd280, %65, %89; +mul.f64 fd281, fd280, 0dBFEF329C0558E969; +sub.f64 fd282, fd281, fd279; +sub.f64 fd283, %73, %81; +mul.f64 fd284, fd283, 0d3FDBC4C04D71ABC1; +sub.f64 fd285, fd282, fd284; +sub.f64 fd286, fd277, fd285; +add.f64 fd287, fd285, fd277; +mul.f64 fd288, fd261, 0d3FCC7B90E3024582; +sub.f64 fd289, %47, fd288; +mul.f64 fd290, fd263, 0d3FECD4BCA9CB5C71; +sub.f64 fd291, fd289, fd290; +fma.rn.f64 fd292, fd265, 0d3FE3F3A0E28BEDD1, fd291; +mul.f64 fd293, fd278, 0d3FEF329C0558E969; +mul.f64 fd294, fd280, 0d3FDBC4C04D71ABC1; +sub.f64 fd295, fd294, fd293; +fma.rn.f64 fd296, fd283, 0d3FE904C37505DE4B, fd295; +sub.f64 fd297, fd292, fd296; +add.f64 fd298, fd296, fd292; +mul.f64 fd299, fd261, 0d3FECD4BCA9CB5C71; +sub.f64 fd300, %47, fd299; +fma.rn.f64 fd301, fd263, 0d3FE3F3A0E28BEDD1, fd300; +mul.f64 fd302, fd265, 0d3FCC7B90E3024582; +sub.f64 fd303, fd301, fd302; +mul.f64 fd304, fd278, 0d3FDBC4C04D71ABC1; +mul.f64 fd305, fd280, 0d3FE904C37505DE4B; +sub.f64 fd306, fd305, fd304; +mul.f64 fd307, fd283, 0d3FEF329C0558E969; +sub.f64 fd308, fd306, fd307; +sub.f64 fd309, fd303, fd308; +add.f64 fd310, fd308, fd303; +fma.rn.f64 fd311, fd267, 0d3FE3F3A0E28BEDD1, %49; +mul.f64 fd312, fd269, 0d3FCC7B90E3024582; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd271, 0d3FECD4BCA9CB5C71; +sub.f64 fd315, fd313, fd314; +sub.f64 fd316, %55, %95; +mul.f64 fd317, fd316, 0d3FE904C37505DE4B; +sub.f64 fd318, %63, %87; +mul.f64 fd319, fd318, 0dBFEF329C0558E969; +sub.f64 fd320, fd319, fd317; 
+sub.f64 fd321, %71, %79; +mul.f64 fd322, fd321, 0d3FDBC4C04D71ABC1; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd323, fd315; +sub.f64 fd325, fd315, fd323; +mul.f64 fd326, fd267, 0d3FCC7B90E3024582; +sub.f64 fd327, %49, fd326; +mul.f64 fd328, fd269, 0d3FECD4BCA9CB5C71; +sub.f64 fd329, fd327, fd328; +fma.rn.f64 fd330, fd271, 0d3FE3F3A0E28BEDD1, fd329; +mul.f64 fd331, fd316, 0d3FEF329C0558E969; +mul.f64 fd332, fd318, 0d3FDBC4C04D71ABC1; +sub.f64 fd333, fd332, fd331; +fma.rn.f64 fd334, fd321, 0d3FE904C37505DE4B, fd333; +add.f64 fd335, fd334, fd330; +sub.f64 fd336, fd330, fd334; +mul.f64 fd337, fd267, 0d3FECD4BCA9CB5C71; +sub.f64 fd338, %49, fd337; +fma.rn.f64 fd339, fd269, 0d3FE3F3A0E28BEDD1, fd338; +mul.f64 fd340, fd271, 0d3FCC7B90E3024582; +sub.f64 fd341, fd339, fd340; +mul.f64 fd342, fd316, 0d3FDBC4C04D71ABC1; +mul.f64 fd343, fd318, 0d3FE904C37505DE4B; +sub.f64 fd344, fd343, fd342; +mul.f64 fd345, fd321, 0d3FEF329C0558E969; +sub.f64 fd346, fd344, fd345; +add.f64 fd347, fd346, fd341; +sub.f64 fd348, fd341, fd346; +mul.f64 fd349, fd198, 0d3FEE940D6BB98CC5; +mul.f64 fd350, fd236, 0dBFD2DD44CE9AFBA7; +sub.f64 fd351, fd349, fd350; +mul.f64 fd352, fd236, 0d3FEE940D6BB98CC5; +fma.rn.f64 fd353, fd198, 0dBFD2DD44CE9AFBA7, fd352; +mul.f64 fd354, fd286, 0d3FEA708C4C4BFA74; +mul.f64 fd355, fd324, 0dBFE206B7C9520CED; +sub.f64 fd356, fd354, fd355; +mul.f64 fd357, fd324, 0d3FEA708C4C4BFA74; +fma.rn.f64 fd358, fd286, 0dBFE206B7C9520CED, fd357; +mul.f64 fd359, fd209, 0d3FEA708C4C4BFA74; +mul.f64 fd360, fd247, 0dBFE206B7C9520CED; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd247, 0d3FEA708C4C4BFA74; +fma.rn.f64 fd363, fd209, 0dBFE206B7C9520CED, fd362; +mul.f64 fd364, fd297, 0d3FD761BF51E29C90; +mul.f64 fd365, fd335, 0dBFEDC9B7BE64378E; +sub.f64 fd366, fd364, fd365; +mul.f64 fd367, fd335, 0d3FD761BF51E29C90; +fma.rn.f64 fd368, fd297, 0dBFEDC9B7BE64378E, fd367; +mul.f64 fd369, fd221, 0d3FE3F3A0E28BEDD1; +mul.f64 fd370, fd259, 0dBFE904C37505DE4B; +sub.f64 fd371, fd369, fd370; 
+mul.f64 fd372, fd259, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd373, fd221, 0dBFE904C37505DE4B, fd372; +mul.f64 fd374, fd309, 0dBFCC7B90E3024582; +mul.f64 fd375, fd347, 0dBFEF329C0558E969; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd347, 0dBFCC7B90E3024582; +fma.rn.f64 fd378, fd309, 0dBFEF329C0558E969, fd377; +mul.f64 fd379, fd222, 0d3FD761BF51E29C90; +mul.f64 fd380, fd260, 0dBFEDC9B7BE64378E; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd260, 0d3FD761BF51E29C90; +fma.rn.f64 fd383, fd222, 0dBFEDC9B7BE64378E, fd382; +mul.f64 fd384, fd310, 0dBFE7752932F8FB65; +mul.f64 fd385, fd348, 0dBFE5C3F99E0B6B95; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd348, 0dBFE7752932F8FB65; +fma.rn.f64 fd388, fd310, 0dBFE5C3F99E0B6B95, fd387; +mul.f64 fd389, fd210, 0d3FB32182EBFB0FE9; +mul.f64 fd390, fd248, 0dBFEFE917F00AE2CD; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd248, 0d3FB32182EBFB0FE9; +fma.rn.f64 fd393, fd210, 0dBFEFE917F00AE2CD, fd392; +mul.f64 fd394, fd298, 0dBFEFA4808B7D3C19; +mul.f64 fd395, fd336, 0dBFC313D12579650C; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd336, 0dBFEFA4808B7D3C19; +fma.rn.f64 fd398, fd298, 0dBFC313D12579650C, fd397; +mul.f64 fd399, fd199, 0dBFCC7B90E3024582; +mul.f64 fd400, fd237, 0dBFEF329C0558E969; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd237, 0dBFCC7B90E3024582; +fma.rn.f64 fd403, fd199, 0dBFEF329C0558E969, fd402; +mul.f64 fd404, fd287, 0dBFECD4BCA9CB5C71; +mul.f64 fd405, fd325, 0d3FDBC4C04D71ABC1; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd325, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd408, fd287, 0d3FDBC4C04D71ABC1, fd407; +add.f64 fd409, fd178, fd266; +add.f64 fd410, fd184, fd272; +mul.f64 fd411, fd409, 0d3FE0000000000000; +sub.f64 fd412, fd90, fd411; +sub.f64 fd413, fd184, fd272; +mul.f64 fd414, fd413, 0d3FEBB67AE8584CAA; +mul.f64 fd415, fd410, 0d3FE0000000000000; +sub.f64 fd416, fd96, fd415; +sub.f64 fd417, fd178, fd266; +mul.f64 fd418, fd417, 0d3FEBB67AE8584CAA; +add.f64 fd419, fd351, fd356; +add.f64 fd420, fd353, fd358; +mul.f64 
fd421, fd419, 0d3FE0000000000000; +sub.f64 fd422, fd110, fd421; +sub.f64 fd423, fd353, fd358; +mul.f64 fd424, fd423, 0d3FEBB67AE8584CAA; +mul.f64 fd425, fd420, 0d3FE0000000000000; +sub.f64 fd426, fd148, fd425; +sub.f64 fd427, fd351, fd356; +mul.f64 fd428, fd427, 0d3FEBB67AE8584CAA; +add.f64 fd429, fd361, fd366; +add.f64 fd430, fd363, fd368; +mul.f64 fd431, fd429, 0d3FE0000000000000; +sub.f64 fd432, fd121, fd431; +sub.f64 fd433, fd363, fd368; +mul.f64 fd434, fd433, 0d3FEBB67AE8584CAA; +mul.f64 fd435, fd430, 0d3FE0000000000000; +sub.f64 fd436, fd159, fd435; +sub.f64 fd437, fd361, fd366; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +add.f64 fd439, fd371, fd376; +add.f64 fd440, fd373, fd378; +mul.f64 fd441, fd439, 0d3FE0000000000000; +sub.f64 fd442, fd133, fd441; +sub.f64 fd443, fd373, fd378; +mul.f64 fd444, fd443, 0d3FEBB67AE8584CAA; +mul.f64 fd445, fd440, 0d3FE0000000000000; +sub.f64 fd446, fd171, fd445; +sub.f64 fd447, fd371, fd376; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd381, fd386; +add.f64 fd450, fd383, fd388; +mul.f64 fd451, fd449, 0d3FE0000000000000; +sub.f64 fd452, fd134, fd451; +sub.f64 fd453, fd383, fd388; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +mul.f64 fd455, fd450, 0d3FE0000000000000; +sub.f64 fd456, fd172, fd455; +sub.f64 fd457, fd381, fd386; +mul.f64 fd458, fd457, 0d3FEBB67AE8584CAA; +add.f64 fd459, fd391, fd396; +add.f64 fd460, fd393, fd398; +mul.f64 fd461, fd459, 0d3FE0000000000000; +sub.f64 fd462, fd122, fd461; +sub.f64 fd463, fd393, fd398; +mul.f64 fd464, fd463, 0d3FEBB67AE8584CAA; +mul.f64 fd465, fd460, 0d3FE0000000000000; +sub.f64 fd466, fd160, fd465; +sub.f64 fd467, fd391, fd396; +mul.f64 fd468, fd467, 0d3FEBB67AE8584CAA; +add.f64 fd469, fd401, fd406; +add.f64 fd470, fd403, fd408; +mul.f64 fd471, fd469, 0d3FE0000000000000; +sub.f64 fd472, fd111, fd471; +sub.f64 fd473, fd403, fd408; +mul.f64 fd474, fd473, 0d3FEBB67AE8584CAA; +mul.f64 fd475, fd470, 0d3FE0000000000000; +sub.f64 fd476, fd149, fd475; +sub.f64 fd477, fd401, fd406; 
+mul.f64 fd478, fd477, 0d3FEBB67AE8584CAA; +add.f64 %1, fd96, fd410; +add.f64 %0, fd90, fd409; +add.f64 %3, fd148, fd420; +add.f64 %2, fd110, fd419; +add.f64 %5, fd159, fd430; +add.f64 %4, fd121, fd429; +add.f64 %7, fd171, fd440; +add.f64 %6, fd133, fd439; +add.f64 %9, fd172, fd450; +add.f64 %8, fd134, fd449; +add.f64 %11, fd160, fd460; +add.f64 %10, fd122, fd459; +add.f64 %13, fd149, fd470; +add.f64 %12, fd111, fd469; +sub.f64 %15, fd416, fd418; +add.f64 %14, fd414, fd412; +sub.f64 %17, fd426, fd428; +add.f64 %16, fd424, fd422; +sub.f64 %19, fd436, fd438; +add.f64 %18, fd434, fd432; +sub.f64 %21, fd446, fd448; +add.f64 %20, fd444, fd442; +sub.f64 %23, fd456, fd458; +add.f64 %22, fd454, fd452; +sub.f64 %25, fd466, fd468; +add.f64 %24, fd464, fd462; +sub.f64 %27, fd476, fd478; +add.f64 %26, fd474, fd472; +add.f64 %29, fd418, fd416; +sub.f64 %28, fd412, fd414; +add.f64 %31, fd428, fd426; +sub.f64 %30, fd422, fd424; +add.f64 %33, fd438, fd436; +sub.f64 %32, fd432, fd434; +add.f64 %35, fd448, fd446; +sub.f64 %34, fd442, fd444; +add.f64 %37, fd458, fd456; +sub.f64 %36, fd452, fd454; +add.f64 %39, fd468, fd466; +sub.f64 %38, fd462, fd464; +add.f64 %41, fd478, fd476; +sub.f64 %40, fd472, fd474; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), 
"d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..774220e1c8a33 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_21_fp64_inv.hpp.inc @@ -0,0 +1,440 @@ +#ifndef CUFFTDX_FFT_21_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_21_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<582, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<509>; +.reg .b64 rd<2>; +add.f64 fd85, %50, %90; +add.f64 fd86, %42, fd85; +add.f64 fd87, %58, %82; +add.f64 fd88, fd87, fd86; +add.f64 fd89, %66, %74; +add.f64 fd90, fd89, fd88; +add.f64 fd91, %51, %91; +add.f64 fd92, %43, fd91; +add.f64 fd93, %59, %83; +add.f64 fd94, fd93, fd92; +add.f64 fd95, %67, %75; +add.f64 fd96, fd95, fd94; +fma.rn.f64 fd97, fd85, 0d3FE3F3A0E28BEDD1, 
%42; +mul.f64 fd98, fd87, 0d3FCC7B90E3024582; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd89, 0d3FECD4BCA9CB5C71; +sub.f64 fd101, fd99, fd100; +sub.f64 fd102, %51, %91; +mul.f64 fd103, fd102, 0d3FE904C37505DE4B; +sub.f64 fd104, %59, %83; +fma.rn.f64 fd105, fd104, 0d3FEF329C0558E969, fd103; +sub.f64 fd106, %67, %75; +fma.rn.f64 fd107, fd106, 0d3FDBC4C04D71ABC1, fd105; +sub.f64 fd108, fd101, fd107; +add.f64 fd109, fd107, fd101; +mul.f64 fd110, fd85, 0d3FCC7B90E3024582; +sub.f64 fd111, %42, fd110; +mul.f64 fd112, fd87, 0d3FECD4BCA9CB5C71; +sub.f64 fd113, fd111, fd112; +fma.rn.f64 fd114, fd89, 0d3FE3F3A0E28BEDD1, fd113; +mul.f64 fd115, fd102, 0d3FEF329C0558E969; +mul.f64 fd116, fd104, 0d3FDBC4C04D71ABC1; +sub.f64 fd117, fd115, fd116; +mul.f64 fd118, fd106, 0d3FE904C37505DE4B; +sub.f64 fd119, fd117, fd118; +sub.f64 fd120, fd114, fd119; +add.f64 fd121, fd119, fd114; +mul.f64 fd122, fd85, 0d3FECD4BCA9CB5C71; +sub.f64 fd123, %42, fd122; +fma.rn.f64 fd124, fd87, 0d3FE3F3A0E28BEDD1, fd123; +mul.f64 fd125, fd89, 0d3FCC7B90E3024582; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd102, 0d3FDBC4C04D71ABC1; +mul.f64 fd128, fd104, 0d3FE904C37505DE4B; +sub.f64 fd129, fd127, fd128; +fma.rn.f64 fd130, fd106, 0d3FEF329C0558E969, fd129; +sub.f64 fd131, fd126, fd130; +add.f64 fd132, fd130, fd126; +fma.rn.f64 fd133, fd91, 0d3FE3F3A0E28BEDD1, %43; +mul.f64 fd134, fd93, 0d3FCC7B90E3024582; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd95, 0d3FECD4BCA9CB5C71; +sub.f64 fd137, fd135, fd136; +sub.f64 fd138, %50, %90; +mul.f64 fd139, fd138, 0d3FE904C37505DE4B; +sub.f64 fd140, %58, %82; +fma.rn.f64 fd141, fd140, 0d3FEF329C0558E969, fd139; +sub.f64 fd142, %66, %74; +fma.rn.f64 fd143, fd142, 0d3FDBC4C04D71ABC1, fd141; +add.f64 fd144, fd143, fd137; +sub.f64 fd145, fd137, fd143; +mul.f64 fd146, fd91, 0d3FCC7B90E3024582; +sub.f64 fd147, %43, fd146; +mul.f64 fd148, fd93, 0d3FECD4BCA9CB5C71; +sub.f64 fd149, fd147, fd148; +fma.rn.f64 fd150, fd95, 0d3FE3F3A0E28BEDD1, fd149; +mul.f64 fd151, fd138, 
0d3FEF329C0558E969; +mul.f64 fd152, fd140, 0d3FDBC4C04D71ABC1; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd142, 0d3FE904C37505DE4B; +sub.f64 fd155, fd153, fd154; +add.f64 fd156, fd155, fd150; +sub.f64 fd157, fd150, fd155; +mul.f64 fd158, fd91, 0d3FECD4BCA9CB5C71; +sub.f64 fd159, %43, fd158; +fma.rn.f64 fd160, fd93, 0d3FE3F3A0E28BEDD1, fd159; +mul.f64 fd161, fd95, 0d3FCC7B90E3024582; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd138, 0d3FDBC4C04D71ABC1; +mul.f64 fd164, fd140, 0d3FE904C37505DE4B; +sub.f64 fd165, fd163, fd164; +fma.rn.f64 fd166, fd142, 0d3FEF329C0558E969, fd165; +add.f64 fd167, fd166, fd162; +sub.f64 fd168, fd162, fd166; +add.f64 fd169, %52, %92; +add.f64 fd170, %44, fd169; +add.f64 fd171, %60, %84; +add.f64 fd172, fd171, fd170; +add.f64 fd173, %68, %76; +add.f64 fd174, fd173, fd172; +add.f64 fd175, %54, %94; +add.f64 fd176, %46, fd175; +add.f64 fd177, %62, %86; +add.f64 fd178, fd177, fd176; +add.f64 fd179, %70, %78; +add.f64 fd180, fd179, fd178; +fma.rn.f64 fd181, fd169, 0d3FE3F3A0E28BEDD1, %44; +mul.f64 fd182, fd171, 0d3FCC7B90E3024582; +sub.f64 fd183, fd181, fd182; +mul.f64 fd184, fd173, 0d3FECD4BCA9CB5C71; +sub.f64 fd185, fd183, fd184; +sub.f64 fd186, %54, %94; +mul.f64 fd187, fd186, 0d3FE904C37505DE4B; +sub.f64 fd188, %62, %86; +fma.rn.f64 fd189, fd188, 0d3FEF329C0558E969, fd187; +sub.f64 fd190, %70, %78; +fma.rn.f64 fd191, fd190, 0d3FDBC4C04D71ABC1, fd189; +sub.f64 fd192, fd185, fd191; +add.f64 fd193, fd191, fd185; +mul.f64 fd194, fd169, 0d3FCC7B90E3024582; +sub.f64 fd195, %44, fd194; +mul.f64 fd196, fd171, 0d3FECD4BCA9CB5C71; +sub.f64 fd197, fd195, fd196; +fma.rn.f64 fd198, fd173, 0d3FE3F3A0E28BEDD1, fd197; +mul.f64 fd199, fd186, 0d3FEF329C0558E969; +mul.f64 fd200, fd188, 0d3FDBC4C04D71ABC1; +sub.f64 fd201, fd199, fd200; +mul.f64 fd202, fd190, 0d3FE904C37505DE4B; +sub.f64 fd203, fd201, fd202; +sub.f64 fd204, fd198, fd203; +add.f64 fd205, fd203, fd198; +mul.f64 fd206, fd169, 0d3FECD4BCA9CB5C71; +sub.f64 fd207, %44, fd206; +fma.rn.f64 
fd208, fd171, 0d3FE3F3A0E28BEDD1, fd207; +mul.f64 fd209, fd173, 0d3FCC7B90E3024582; +sub.f64 fd210, fd208, fd209; +mul.f64 fd211, fd186, 0d3FDBC4C04D71ABC1; +mul.f64 fd212, fd188, 0d3FE904C37505DE4B; +sub.f64 fd213, fd211, fd212; +fma.rn.f64 fd214, fd190, 0d3FEF329C0558E969, fd213; +sub.f64 fd215, fd210, fd214; +add.f64 fd216, fd214, fd210; +fma.rn.f64 fd217, fd175, 0d3FE3F3A0E28BEDD1, %46; +mul.f64 fd218, fd177, 0d3FCC7B90E3024582; +sub.f64 fd219, fd217, fd218; +mul.f64 fd220, fd179, 0d3FECD4BCA9CB5C71; +sub.f64 fd221, fd219, fd220; +sub.f64 fd222, %52, %92; +mul.f64 fd223, fd222, 0d3FE904C37505DE4B; +sub.f64 fd224, %60, %84; +fma.rn.f64 fd225, fd224, 0d3FEF329C0558E969, fd223; +sub.f64 fd226, %68, %76; +fma.rn.f64 fd227, fd226, 0d3FDBC4C04D71ABC1, fd225; +add.f64 fd228, fd227, fd221; +sub.f64 fd229, fd221, fd227; +mul.f64 fd230, fd175, 0d3FCC7B90E3024582; +sub.f64 fd231, %46, fd230; +mul.f64 fd232, fd177, 0d3FECD4BCA9CB5C71; +sub.f64 fd233, fd231, fd232; +fma.rn.f64 fd234, fd179, 0d3FE3F3A0E28BEDD1, fd233; +mul.f64 fd235, fd222, 0d3FEF329C0558E969; +mul.f64 fd236, fd224, 0d3FDBC4C04D71ABC1; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd226, 0d3FE904C37505DE4B; +sub.f64 fd239, fd237, fd238; +add.f64 fd240, fd239, fd234; +sub.f64 fd241, fd234, fd239; +mul.f64 fd242, fd175, 0d3FECD4BCA9CB5C71; +sub.f64 fd243, %46, fd242; +fma.rn.f64 fd244, fd177, 0d3FE3F3A0E28BEDD1, fd243; +mul.f64 fd245, fd179, 0d3FCC7B90E3024582; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd222, 0d3FDBC4C04D71ABC1; +mul.f64 fd248, fd224, 0d3FE904C37505DE4B; +sub.f64 fd249, fd247, fd248; +fma.rn.f64 fd250, fd226, 0d3FEF329C0558E969, fd249; +add.f64 fd251, fd250, fd246; +sub.f64 fd252, fd246, fd250; +add.f64 fd253, %55, %95; +add.f64 fd254, %47, fd253; +add.f64 fd255, %63, %87; +add.f64 fd256, fd255, fd254; +add.f64 fd257, %71, %79; +add.f64 fd258, fd257, fd256; +add.f64 fd259, %57, %96; +add.f64 fd260, %49, fd259; +add.f64 fd261, %65, %89; +add.f64 fd262, fd261, fd260; +add.f64 fd263, %73, 
%81; +add.f64 fd264, fd263, fd262; +fma.rn.f64 fd265, fd253, 0d3FE3F3A0E28BEDD1, %47; +mul.f64 fd266, fd255, 0d3FCC7B90E3024582; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd257, 0d3FECD4BCA9CB5C71; +sub.f64 fd269, fd267, fd268; +sub.f64 fd270, %57, %96; +mul.f64 fd271, fd270, 0d3FE904C37505DE4B; +sub.f64 fd272, %65, %89; +fma.rn.f64 fd273, fd272, 0d3FEF329C0558E969, fd271; +sub.f64 fd274, %73, %81; +fma.rn.f64 fd275, fd274, 0d3FDBC4C04D71ABC1, fd273; +sub.f64 fd276, fd269, fd275; +add.f64 fd277, fd275, fd269; +mul.f64 fd278, fd253, 0d3FCC7B90E3024582; +sub.f64 fd279, %47, fd278; +mul.f64 fd280, fd255, 0d3FECD4BCA9CB5C71; +sub.f64 fd281, fd279, fd280; +fma.rn.f64 fd282, fd257, 0d3FE3F3A0E28BEDD1, fd281; +mul.f64 fd283, fd270, 0d3FEF329C0558E969; +mul.f64 fd284, fd272, 0d3FDBC4C04D71ABC1; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd274, 0d3FE904C37505DE4B; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, fd282, fd287; +add.f64 fd289, fd287, fd282; +mul.f64 fd290, fd253, 0d3FECD4BCA9CB5C71; +sub.f64 fd291, %47, fd290; +fma.rn.f64 fd292, fd255, 0d3FE3F3A0E28BEDD1, fd291; +mul.f64 fd293, fd257, 0d3FCC7B90E3024582; +sub.f64 fd294, fd292, fd293; +mul.f64 fd295, fd270, 0d3FDBC4C04D71ABC1; +mul.f64 fd296, fd272, 0d3FE904C37505DE4B; +sub.f64 fd297, fd295, fd296; +fma.rn.f64 fd298, fd274, 0d3FEF329C0558E969, fd297; +sub.f64 fd299, fd294, fd298; +add.f64 fd300, fd298, fd294; +fma.rn.f64 fd301, fd259, 0d3FE3F3A0E28BEDD1, %49; +mul.f64 fd302, fd261, 0d3FCC7B90E3024582; +sub.f64 fd303, fd301, fd302; +mul.f64 fd304, fd263, 0d3FECD4BCA9CB5C71; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, %55, %95; +mul.f64 fd307, fd306, 0d3FE904C37505DE4B; +sub.f64 fd308, %63, %87; +fma.rn.f64 fd309, fd308, 0d3FEF329C0558E969, fd307; +sub.f64 fd310, %71, %79; +fma.rn.f64 fd311, fd310, 0d3FDBC4C04D71ABC1, fd309; +add.f64 fd312, fd311, fd305; +sub.f64 fd313, fd305, fd311; +mul.f64 fd314, fd259, 0d3FCC7B90E3024582; +sub.f64 fd315, %49, fd314; +mul.f64 fd316, fd261, 0d3FECD4BCA9CB5C71; +sub.f64 
fd317, fd315, fd316; +fma.rn.f64 fd318, fd263, 0d3FE3F3A0E28BEDD1, fd317; +mul.f64 fd319, fd306, 0d3FEF329C0558E969; +mul.f64 fd320, fd308, 0d3FDBC4C04D71ABC1; +sub.f64 fd321, fd319, fd320; +mul.f64 fd322, fd310, 0d3FE904C37505DE4B; +sub.f64 fd323, fd321, fd322; +add.f64 fd324, fd323, fd318; +sub.f64 fd325, fd318, fd323; +mul.f64 fd326, fd259, 0d3FECD4BCA9CB5C71; +sub.f64 fd327, %49, fd326; +fma.rn.f64 fd328, fd261, 0d3FE3F3A0E28BEDD1, fd327; +mul.f64 fd329, fd263, 0d3FCC7B90E3024582; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd306, 0d3FDBC4C04D71ABC1; +mul.f64 fd332, fd308, 0d3FE904C37505DE4B; +sub.f64 fd333, fd331, fd332; +fma.rn.f64 fd334, fd310, 0d3FEF329C0558E969, fd333; +add.f64 fd335, fd334, fd330; +sub.f64 fd336, fd330, fd334; +mul.f64 fd337, fd192, 0d3FEE940D6BB98CC5; +mul.f64 fd338, fd228, 0d3FD2DD44CE9AFBA7; +sub.f64 fd339, fd337, fd338; +mul.f64 fd340, fd228, 0d3FEE940D6BB98CC5; +fma.rn.f64 fd341, fd192, 0d3FD2DD44CE9AFBA7, fd340; +mul.f64 fd342, fd276, 0d3FEA708C4C4BFA74; +mul.f64 fd343, fd312, 0d3FE206B7C9520CED; +sub.f64 fd344, fd342, fd343; +mul.f64 fd345, fd312, 0d3FEA708C4C4BFA74; +fma.rn.f64 fd346, fd276, 0d3FE206B7C9520CED, fd345; +mul.f64 fd347, fd204, 0d3FEA708C4C4BFA74; +mul.f64 fd348, fd240, 0d3FE206B7C9520CED; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd240, 0d3FEA708C4C4BFA74; +fma.rn.f64 fd351, fd204, 0d3FE206B7C9520CED, fd350; +mul.f64 fd352, fd288, 0d3FD761BF51E29C90; +mul.f64 fd353, fd324, 0d3FEDC9B7BE64378E; +sub.f64 fd354, fd352, fd353; +mul.f64 fd355, fd324, 0d3FD761BF51E29C90; +fma.rn.f64 fd356, fd288, 0d3FEDC9B7BE64378E, fd355; +mul.f64 fd357, fd215, 0d3FE3F3A0E28BEDD1; +mul.f64 fd358, fd251, 0d3FE904C37505DE4B; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd251, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd361, fd215, 0d3FE904C37505DE4B, fd360; +mul.f64 fd362, fd299, 0dBFCC7B90E3024582; +mul.f64 fd363, fd335, 0d3FEF329C0558E969; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd335, 0dBFCC7B90E3024582; +fma.rn.f64 fd366, fd299, 
0d3FEF329C0558E969, fd365; +mul.f64 fd367, fd216, 0d3FD761BF51E29C90; +mul.f64 fd368, fd252, 0d3FEDC9B7BE64378E; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd252, 0d3FD761BF51E29C90; +fma.rn.f64 fd371, fd216, 0d3FEDC9B7BE64378E, fd370; +mul.f64 fd372, fd300, 0dBFE7752932F8FB65; +mul.f64 fd373, fd336, 0d3FE5C3F99E0B6B95; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd336, 0dBFE7752932F8FB65; +fma.rn.f64 fd376, fd300, 0d3FE5C3F99E0B6B95, fd375; +mul.f64 fd377, fd205, 0d3FB32182EBFB0FE9; +mul.f64 fd378, fd241, 0d3FEFE917F00AE2CD; +sub.f64 fd379, fd377, fd378; +mul.f64 fd380, fd241, 0d3FB32182EBFB0FE9; +fma.rn.f64 fd381, fd205, 0d3FEFE917F00AE2CD, fd380; +mul.f64 fd382, fd289, 0dBFEFA4808B7D3C19; +mul.f64 fd383, fd325, 0d3FC313D12579650C; +sub.f64 fd384, fd382, fd383; +mul.f64 fd385, fd325, 0dBFEFA4808B7D3C19; +fma.rn.f64 fd386, fd289, 0d3FC313D12579650C, fd385; +mul.f64 fd387, fd193, 0dBFCC7B90E3024582; +mul.f64 fd388, fd229, 0d3FEF329C0558E969; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd229, 0dBFCC7B90E3024582; +fma.rn.f64 fd391, fd193, 0d3FEF329C0558E969, fd390; +mul.f64 fd392, fd277, 0dBFECD4BCA9CB5C71; +mul.f64 fd393, fd313, 0dBFDBC4C04D71ABC1; +sub.f64 fd394, fd392, fd393; +mul.f64 fd395, fd313, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd396, fd277, 0dBFDBC4C04D71ABC1, fd395; +add.f64 fd397, fd174, fd258; +add.f64 fd398, fd180, fd264; +mul.f64 fd399, fd397, 0d3FE0000000000000; +sub.f64 fd400, fd90, fd399; +sub.f64 fd401, fd180, fd264; +mul.f64 fd402, fd401, 0dBFEBB67AE8584CAA; +mul.f64 fd403, fd398, 0d3FE0000000000000; +sub.f64 fd404, fd96, fd403; +sub.f64 fd405, fd174, fd258; +mul.f64 fd406, fd405, 0dBFEBB67AE8584CAA; +add.f64 fd407, fd339, fd344; +add.f64 fd408, fd341, fd346; +mul.f64 fd409, fd407, 0d3FE0000000000000; +sub.f64 fd410, fd108, fd409; +sub.f64 fd411, fd341, fd346; +mul.f64 fd412, fd411, 0dBFEBB67AE8584CAA; +mul.f64 fd413, fd408, 0d3FE0000000000000; +sub.f64 fd414, fd144, fd413; +sub.f64 fd415, fd339, fd344; +mul.f64 fd416, fd415, 
0dBFEBB67AE8584CAA; +add.f64 fd417, fd349, fd354; +add.f64 fd418, fd351, fd356; +mul.f64 fd419, fd417, 0d3FE0000000000000; +sub.f64 fd420, fd120, fd419; +sub.f64 fd421, fd351, fd356; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +mul.f64 fd423, fd418, 0d3FE0000000000000; +sub.f64 fd424, fd156, fd423; +sub.f64 fd425, fd349, fd354; +mul.f64 fd426, fd425, 0dBFEBB67AE8584CAA; +add.f64 fd427, fd359, fd364; +add.f64 fd428, fd361, fd366; +mul.f64 fd429, fd427, 0d3FE0000000000000; +sub.f64 fd430, fd131, fd429; +sub.f64 fd431, fd361, fd366; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +mul.f64 fd433, fd428, 0d3FE0000000000000; +sub.f64 fd434, fd167, fd433; +sub.f64 fd435, fd359, fd364; +mul.f64 fd436, fd435, 0dBFEBB67AE8584CAA; +add.f64 fd437, fd369, fd374; +add.f64 fd438, fd371, fd376; +mul.f64 fd439, fd437, 0d3FE0000000000000; +sub.f64 fd440, fd132, fd439; +sub.f64 fd441, fd371, fd376; +mul.f64 fd442, fd441, 0dBFEBB67AE8584CAA; +mul.f64 fd443, fd438, 0d3FE0000000000000; +sub.f64 fd444, fd168, fd443; +sub.f64 fd445, fd369, fd374; +mul.f64 fd446, fd445, 0dBFEBB67AE8584CAA; +add.f64 fd447, fd379, fd384; +add.f64 fd448, fd381, fd386; +mul.f64 fd449, fd447, 0d3FE0000000000000; +sub.f64 fd450, fd121, fd449; +sub.f64 fd451, fd381, fd386; +mul.f64 fd452, fd451, 0dBFEBB67AE8584CAA; +mul.f64 fd453, fd448, 0d3FE0000000000000; +sub.f64 fd454, fd157, fd453; +sub.f64 fd455, fd379, fd384; +mul.f64 fd456, fd455, 0dBFEBB67AE8584CAA; +add.f64 fd457, fd389, fd394; +add.f64 fd458, fd391, fd396; +mul.f64 fd459, fd457, 0d3FE0000000000000; +sub.f64 fd460, fd109, fd459; +sub.f64 fd461, fd391, fd396; +mul.f64 fd462, fd461, 0dBFEBB67AE8584CAA; +mul.f64 fd463, fd458, 0d3FE0000000000000; +sub.f64 fd464, fd145, fd463; +sub.f64 fd465, fd389, fd394; +mul.f64 fd466, fd465, 0dBFEBB67AE8584CAA; +add.f64 %1, fd96, fd398; +add.f64 %0, fd90, fd397; +add.f64 %3, fd144, fd408; +add.f64 %2, fd108, fd407; +add.f64 %5, fd156, fd418; +add.f64 %4, fd120, fd417; +add.f64 %7, fd167, fd428; +add.f64 %6, fd131, fd427; 
+add.f64 %9, fd168, fd438; +add.f64 %8, fd132, fd437; +add.f64 %11, fd157, fd448; +add.f64 %10, fd121, fd447; +add.f64 %13, fd145, fd458; +add.f64 %12, fd109, fd457; +sub.f64 %15, fd404, fd406; +add.f64 %14, fd402, fd400; +sub.f64 %17, fd414, fd416; +add.f64 %16, fd412, fd410; +sub.f64 %19, fd424, fd426; +add.f64 %18, fd422, fd420; +sub.f64 %21, fd434, fd436; +add.f64 %20, fd432, fd430; +sub.f64 %23, fd444, fd446; +add.f64 %22, fd442, fd440; +sub.f64 %25, fd454, fd456; +add.f64 %24, fd452, fd450; +sub.f64 %27, fd464, fd466; +add.f64 %26, fd462, fd460; +add.f64 %29, fd406, fd404; +sub.f64 %28, fd400, fd402; +add.f64 %31, fd416, fd414; +sub.f64 %30, fd410, fd412; +add.f64 %33, fd426, fd424; +sub.f64 %32, fd420, fd422; +add.f64 %35, fd436, fd434; +sub.f64 %34, fd430, fd432; +add.f64 %37, fd446, fd444; +sub.f64 %36, fd440, fd442; +add.f64 %39, fd456, fd454; +sub.f64 %38, fd450, fd452; +add.f64 %41, fd466, fd464; +sub.f64 %40, fd460, fd462; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), 
"d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..c18462a9e20dd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_fwd.hpp.inc @@ -0,0 +1,2740 @@ +#ifndef CUFFTDX_FFT_22_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_22_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<754, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<263>; +.reg .b32 r<1835>; +.reg .f64 fd<243>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %46, %64; +} +{ +add.f16x2 r4, %47, %65; +} +{ +sub.f16x2 r7, %46, %64; +} +{ +sub.f16x2 r10, %47, %65; +} +{ +add.f16x2 r13, %48, %62; +} +{ +add.f16x2 r16, %49, %63; +} +{ +sub.f16x2 r19, %48, %62; +} +{ +sub.f16x2 r22, %49, %63; +} +{ +add.f16x2 r25, %50, %60; +} +{ +add.f16x2 r28, %51, %61; +} +{ +sub.f16x2 r31, %50, %60; +} +{ +sub.f16x2 r34, %51, %61; +} +{ +add.f16x2 r37, %52, %58; +} +{ +add.f16x2 r40, %53, %59; +} +{ +sub.f16x2 r43, %52, %58; +} +{ +sub.f16x2 r46, %53, %59; +} +{ +add.f16x2 r49, %54, %56; +} +{ +add.f16x2 r52, 
%55, %57; +} +{ +sub.f16x2 r55, %54, %56; +} +{ +sub.f16x2 r58, %55, %57; +} +{ +add.f16x2 r61, %44, r1; +} +{ +add.f16x2 r64, %45, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.u32 r1388, 0; +cvt.rn.f16.s32 rs1, r1388; +mov.b32 r103, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1388; +mov.b32 r115, {rs2, rs2}; +mov.f64 fd203, 0d3FEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs3, fd203; +} +mov.b32 r95, {rs3, rs3}; +{ +mul.f16x2 r93, r1, r95; +} +{ +add.f16x2 r96, %44, r93; +} +mov.f64 fd218, 0dBFE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs4, fd218; +} +mov.b32 r101, {rs4, rs4}; +{ +mul.f16x2 r99, r10, r101; +} +{ +add.f16x2 r102, r103, r99; +} +{ +cvt.rn.f16.f64 rs5, fd203; +} +mov.b32 r107, {rs5, rs5}; +{ +mul.f16x2 r105, r4, r107; +} +{ +add.f16x2 r108, %45, r105; +} +{ +cvt.rn.f16.f64 rs6, fd218; +} +mov.b32 r113, {rs6, rs6}; +{ +mul.f16x2 r111, r7, r113; +} +{ +add.f16x2 r114, r115, r111; +} +mov.f64 fd207, 0d3FDA9628D9C712B6; +{ +cvt.rn.f16.f64 rs7, fd207; +} +mov.b32 r119, {rs7, rs7}; +{ +mul.f16x2 r117, r13, r119; +} +{ +add.f16x2 r120, r96, r117; +} +mov.f64 fd214, 0dBFED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs8, fd214; +} +mov.b32 r125, {rs8, rs8}; +{ +mul.f16x2 r123, r22, r125; +} +{ +add.f16x2 r126, r102, r123; +} +{ +cvt.rn.f16.f64 rs9, fd207; +} +mov.b32 r131, {rs9, rs9}; +{ +mul.f16x2 r129, r16, r131; +} +{ +add.f16x2 r132, r108, r129; +} +{ +cvt.rn.f16.f64 rs10, fd214; +} +mov.b32 r137, {rs10, rs10}; +{ +mul.f16x2 r135, r19, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd211, 0dBFC2375F640F44DB; +{ +cvt.rn.f16.f64 rs11, fd211; +} +mov.b32 r143, {rs11, rs11}; +{ +mul.f16x2 r141, r25, r143; +} +{ +add.f16x2 r144, r120, r141; +} +mov.f64 fd212, 0dBFEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs12, fd212; +} +mov.b32 r149, {rs12, rs12}; +{ +mul.f16x2 r147, r34, r149; 
+} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs13, fd211; +} +mov.b32 r155, {rs13, rs13}; +{ +mul.f16x2 r153, r28, r155; +} +{ +add.f16x2 r156, r132, r153; +} +{ +cvt.rn.f16.f64 rs14, fd212; +} +mov.b32 r161, {rs14, rs14}; +{ +mul.f16x2 r159, r31, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd215, 0dBFE4F49E7F775887; +{ +cvt.rn.f16.f64 rs15, fd215; +} +mov.b32 r167, {rs15, rs15}; +{ +mul.f16x2 r165, r37, r167; +} +{ +add.f16x2 r168, r144, r165; +} +mov.f64 fd216, 0dBFE82F19BB3A28A1; +{ +cvt.rn.f16.f64 rs16, fd216; +} +mov.b32 r173, {rs16, rs16}; +{ +mul.f16x2 r171, r46, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs17, fd215; +} +mov.b32 r179, {rs17, rs17}; +{ +mul.f16x2 r177, r40, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ +cvt.rn.f16.f64 rs18, fd216; +} +mov.b32 r185, {rs18, rs18}; +{ +mul.f16x2 r183, r43, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd219, 0dBFEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs19, fd219; +} +mov.b32 r191, {rs19, rs19}; +{ +mul.f16x2 r189, r49, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd220, 0dBFD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs20, fd220; +} +mov.b32 r197, {rs20, rs20}; +{ +mul.f16x2 r195, r58, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs21, fd219; +} +mov.b32 r203, {rs21, rs21}; +{ +mul.f16x2 r201, r52, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs22, fd220; +} +mov.b32 r209, {rs22, rs22}; +{ +mul.f16x2 r207, r55, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +sub.f16x2 r213, r192, r198; +} +{ +add.f16x2 r216, r204, r210; +} +{ +add.f16x2 r219, r192, r198; +} +{ +sub.f16x2 r222, r204, r210; +} +cvt.rn.f16.s32 rs23, r1388; +mov.b32 r237, {rs23, rs23}; +cvt.rn.f16.s32 rs24, r1388; +mov.b32 r249, {rs24, rs24}; +{ +cvt.rn.f16.f64 rs25, fd207; +} +mov.b32 r229, {rs25, rs25}; +{ +mul.f16x2 r227, r1, r229; +} +{ +add.f16x2 r230, %44, r227; +} +{ +cvt.rn.f16.f64 rs26, fd214; +} +mov.b32 r235, {rs26, rs26}; +{ +mul.f16x2 r233, r10, r235; +} +{ 
+add.f16x2 r236, r237, r233; +} +{ +cvt.rn.f16.f64 rs27, fd207; +} +mov.b32 r241, {rs27, rs27}; +{ +mul.f16x2 r239, r4, r241; +} +{ +add.f16x2 r242, %45, r239; +} +{ +cvt.rn.f16.f64 rs28, fd214; +} +mov.b32 r247, {rs28, rs28}; +{ +mul.f16x2 r245, r7, r247; +} +{ +add.f16x2 r248, r249, r245; +} +{ +cvt.rn.f16.f64 rs29, fd215; +} +mov.b32 r253, {rs29, rs29}; +{ +mul.f16x2 r251, r13, r253; +} +{ +add.f16x2 r254, r230, r251; +} +{ +cvt.rn.f16.f64 rs30, fd216; +} +mov.b32 r259, {rs30, rs30}; +{ +mul.f16x2 r257, r22, r259; +} +{ +add.f16x2 r260, r236, r257; +} +{ +cvt.rn.f16.f64 rs31, fd215; +} +mov.b32 r265, {rs31, rs31}; +{ +mul.f16x2 r263, r16, r265; +} +{ +add.f16x2 r266, r242, r263; +} +{ +cvt.rn.f16.f64 rs32, fd216; +} +mov.b32 r271, {rs32, rs32}; +{ +mul.f16x2 r269, r19, r271; +} +{ +add.f16x2 r272, r248, r269; +} +{ +cvt.rn.f16.f64 rs33, fd219; +} +mov.b32 r277, {rs33, rs33}; +{ +mul.f16x2 r275, r25, r277; +} +{ +add.f16x2 r278, r254, r275; +} +mov.f64 fd148, 0d3FD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs34, fd148; +} +mov.b32 r283, {rs34, rs34}; +{ +mul.f16x2 r281, r34, r283; +} +{ +add.f16x2 r284, r260, r281; +} +{ +cvt.rn.f16.f64 rs35, fd219; +} +mov.b32 r289, {rs35, rs35}; +{ +mul.f16x2 r287, r28, r289; +} +{ +add.f16x2 r290, r266, r287; +} +{ +cvt.rn.f16.f64 rs36, fd148; +} +mov.b32 r295, {rs36, rs36}; +{ +mul.f16x2 r293, r31, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs37, fd211; +} +mov.b32 r301, {rs37, rs37}; +{ +mul.f16x2 r299, r37, r301; +} +{ +add.f16x2 r302, r278, r299; +} +mov.f64 fd168, 0d3FEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs38, fd168; +} +mov.b32 r307, {rs38, rs38}; +{ +mul.f16x2 r305, r46, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs39, fd211; +} +mov.b32 r313, {rs39, rs39}; +{ +mul.f16x2 r311, r40, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs40, fd168; +} +mov.b32 r319, {rs40, rs40}; +{ +mul.f16x2 r317, r43, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs41, fd203; +} 
+mov.b32 r325, {rs41, rs41}; +{ +mul.f16x2 r323, r49, r325; +} +{ +add.f16x2 r326, r302, r323; +} +mov.f64 fd188, 0d3FE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs42, fd188; +} +mov.b32 r331, {rs42, rs42}; +{ +mul.f16x2 r329, r58, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs43, fd203; +} +mov.b32 r337, {rs43, rs43}; +{ +mul.f16x2 r335, r52, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs44, fd188; +} +mov.b32 r343, {rs44, rs44}; +{ +mul.f16x2 r341, r55, r343; +} +{ +add.f16x2 r344, r320, r341; +} +{ +sub.f16x2 r347, r326, r332; +} +{ +add.f16x2 r350, r338, r344; +} +{ +add.f16x2 r353, r326, r332; +} +{ +sub.f16x2 r356, r338, r344; +} +cvt.rn.f16.s32 rs45, r1388; +mov.b32 r371, {rs45, rs45}; +cvt.rn.f16.s32 rs46, r1388; +mov.b32 r383, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd211; +} +mov.b32 r363, {rs47, rs47}; +{ +mul.f16x2 r361, r1, r363; +} +{ +add.f16x2 r364, %44, r361; +} +{ +cvt.rn.f16.f64 rs48, fd212; +} +mov.b32 r369, {rs48, rs48}; +{ +mul.f16x2 r367, r10, r369; +} +{ +add.f16x2 r370, r371, r367; +} +{ +cvt.rn.f16.f64 rs49, fd211; +} +mov.b32 r375, {rs49, rs49}; +{ +mul.f16x2 r373, r4, r375; +} +{ +add.f16x2 r376, %45, r373; +} +{ +cvt.rn.f16.f64 rs50, fd212; +} +mov.b32 r381, {rs50, rs50}; +{ +mul.f16x2 r379, r7, r381; +} +{ +add.f16x2 r382, r383, r379; +} +{ +cvt.rn.f16.f64 rs51, fd219; +} +mov.b32 r387, {rs51, rs51}; +{ +mul.f16x2 r385, r13, r387; +} +{ +add.f16x2 r388, r364, r385; +} +{ +cvt.rn.f16.f64 rs52, fd148; +} +mov.b32 r393, {rs52, rs52}; +{ +mul.f16x2 r391, r22, r393; +} +{ +add.f16x2 r394, r370, r391; +} +{ +cvt.rn.f16.f64 rs53, fd219; +} +mov.b32 r399, {rs53, rs53}; +{ +mul.f16x2 r397, r16, r399; +} +{ +add.f16x2 r400, r376, r397; +} +{ +cvt.rn.f16.f64 rs54, fd148; +} +mov.b32 r405, {rs54, rs54}; +{ +mul.f16x2 r403, r19, r405; +} +{ +add.f16x2 r406, r382, r403; +} +{ +cvt.rn.f16.f64 rs55, fd207; +} +mov.b32 r411, {rs55, rs55}; +{ +mul.f16x2 r409, r25, r411; +} +{ +add.f16x2 r412, r388, r409; +} +mov.f64 fd196, 
0d3FED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs56, fd196; +} +mov.b32 r417, {rs56, rs56}; +{ +mul.f16x2 r415, r34, r417; +} +{ +add.f16x2 r418, r394, r415; +} +{ +cvt.rn.f16.f64 rs57, fd207; +} +mov.b32 r423, {rs57, rs57}; +{ +mul.f16x2 r421, r28, r423; +} +{ +add.f16x2 r424, r400, r421; +} +{ +cvt.rn.f16.f64 rs58, fd196; +} +mov.b32 r429, {rs58, rs58}; +{ +mul.f16x2 r427, r31, r429; +} +{ +add.f16x2 r430, r406, r427; +} +{ +cvt.rn.f16.f64 rs59, fd203; +} +mov.b32 r435, {rs59, rs59}; +{ +mul.f16x2 r433, r37, r435; +} +{ +add.f16x2 r436, r412, r433; +} +{ +cvt.rn.f16.f64 rs60, fd218; +} +mov.b32 r441, {rs60, rs60}; +{ +mul.f16x2 r439, r46, r441; +} +{ +add.f16x2 r442, r418, r439; +} +{ +cvt.rn.f16.f64 rs61, fd203; +} +mov.b32 r447, {rs61, rs61}; +{ +mul.f16x2 r445, r40, r447; +} +{ +add.f16x2 r448, r424, r445; +} +{ +cvt.rn.f16.f64 rs62, fd218; +} +mov.b32 r453, {rs62, rs62}; +{ +mul.f16x2 r451, r43, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs63, fd215; +} +mov.b32 r459, {rs63, rs63}; +{ +mul.f16x2 r457, r49, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs64, fd216; +} +mov.b32 r465, {rs64, rs64}; +{ +mul.f16x2 r463, r58, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs65, fd215; +} +mov.b32 r471, {rs65, rs65}; +{ +mul.f16x2 r469, r52, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs66, fd216; +} +mov.b32 r477, {rs66, rs66}; +{ +mul.f16x2 r475, r55, r477; +} +{ +add.f16x2 r478, r454, r475; +} +{ +sub.f16x2 r481, r460, r466; +} +{ +add.f16x2 r484, r472, r478; +} +{ +add.f16x2 r487, r460, r466; +} +{ +sub.f16x2 r490, r472, r478; +} +cvt.rn.f16.s32 rs67, r1388; +mov.b32 r505, {rs67, rs67}; +cvt.rn.f16.s32 rs68, r1388; +mov.b32 r517, {rs68, rs68}; +{ +cvt.rn.f16.f64 rs69, fd215; +} +mov.b32 r497, {rs69, rs69}; +{ +mul.f16x2 r495, r1, r497; +} +{ +add.f16x2 r498, %44, r495; +} +{ +cvt.rn.f16.f64 rs70, fd216; +} +mov.b32 r503, {rs70, rs70}; +{ +mul.f16x2 r501, r10, r503; +} +{ +add.f16x2 r504, r505, r501; 
+} +{ +cvt.rn.f16.f64 rs71, fd215; +} +mov.b32 r509, {rs71, rs71}; +{ +mul.f16x2 r507, r4, r509; +} +{ +add.f16x2 r510, %45, r507; +} +{ +cvt.rn.f16.f64 rs72, fd216; +} +mov.b32 r515, {rs72, rs72}; +{ +mul.f16x2 r513, r7, r515; +} +{ +add.f16x2 r516, r517, r513; +} +{ +cvt.rn.f16.f64 rs73, fd211; +} +mov.b32 r521, {rs73, rs73}; +{ +mul.f16x2 r519, r13, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs74, fd168; +} +mov.b32 r527, {rs74, rs74}; +{ +mul.f16x2 r525, r22, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs75, fd211; +} +mov.b32 r533, {rs75, rs75}; +{ +mul.f16x2 r531, r16, r533; +} +{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs76, fd168; +} +mov.b32 r539, {rs76, rs76}; +{ +mul.f16x2 r537, r19, r539; +} +{ +add.f16x2 r540, r516, r537; +} +{ +cvt.rn.f16.f64 rs77, fd203; +} +mov.b32 r545, {rs77, rs77}; +{ +mul.f16x2 r543, r25, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs78, fd218; +} +mov.b32 r551, {rs78, rs78}; +{ +mul.f16x2 r549, r34, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs79, fd203; +} +mov.b32 r557, {rs79, rs79}; +{ +mul.f16x2 r555, r28, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs80, fd218; +} +mov.b32 r563, {rs80, rs80}; +{ +mul.f16x2 r561, r31, r563; +} +{ +add.f16x2 r564, r540, r561; +} +{ +cvt.rn.f16.f64 rs81, fd219; +} +mov.b32 r569, {rs81, rs81}; +{ +mul.f16x2 r567, r37, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs82, fd220; +} +mov.b32 r575, {rs82, rs82}; +{ +mul.f16x2 r573, r46, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs83, fd219; +} +mov.b32 r581, {rs83, rs83}; +{ +mul.f16x2 r579, r40, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs84, fd220; +} +mov.b32 r587, {rs84, rs84}; +{ +mul.f16x2 r585, r43, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +cvt.rn.f16.f64 rs85, fd207; +} +mov.b32 r593, {rs85, rs85}; +{ +mul.f16x2 r591, r49, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ 
+cvt.rn.f16.f64 rs86, fd196; +} +mov.b32 r599, {rs86, rs86}; +{ +mul.f16x2 r597, r58, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs87, fd207; +} +mov.b32 r605, {rs87, rs87}; +{ +mul.f16x2 r603, r52, r605; +} +{ +add.f16x2 r606, r582, r603; +} +{ +cvt.rn.f16.f64 rs88, fd196; +} +mov.b32 r611, {rs88, rs88}; +{ +mul.f16x2 r609, r55, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +sub.f16x2 r615, r594, r600; +} +{ +add.f16x2 r618, r606, r612; +} +{ +add.f16x2 r621, r594, r600; +} +{ +sub.f16x2 r624, r606, r612; +} +cvt.rn.f16.s32 rs89, r1388; +mov.b32 r639, {rs89, rs89}; +cvt.rn.f16.s32 rs90, r1388; +mov.b32 r651, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd219; +} +mov.b32 r631, {rs91, rs91}; +{ +mul.f16x2 r629, r1, r631; +} +{ +add.f16x2 r632, %44, r629; +} +{ +cvt.rn.f16.f64 rs92, fd220; +} +mov.b32 r637, {rs92, rs92}; +{ +mul.f16x2 r635, r10, r637; +} +{ +add.f16x2 r638, r639, r635; +} +{ +cvt.rn.f16.f64 rs93, fd219; +} +mov.b32 r643, {rs93, rs93}; +{ +mul.f16x2 r641, r4, r643; +} +{ +add.f16x2 r644, %45, r641; +} +{ +cvt.rn.f16.f64 rs94, fd220; +} +mov.b32 r649, {rs94, rs94}; +{ +mul.f16x2 r647, r7, r649; +} +{ +add.f16x2 r650, r651, r647; +} +{ +cvt.rn.f16.f64 rs95, fd203; +} +mov.b32 r655, {rs95, rs95}; +{ +mul.f16x2 r653, r13, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs96, fd188; +} +mov.b32 r661, {rs96, rs96}; +{ +mul.f16x2 r659, r22, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs97, fd203; +} +mov.b32 r667, {rs97, rs97}; +{ +mul.f16x2 r665, r16, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs98, fd188; +} +mov.b32 r673, {rs98, rs98}; +{ +mul.f16x2 r671, r19, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs99, fd215; +} +mov.b32 r679, {rs99, rs99}; +{ +mul.f16x2 r677, r25, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs100, fd216; +} +mov.b32 r685, {rs100, rs100}; +{ +mul.f16x2 r683, r34, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 
rs101, fd215; +} +mov.b32 r691, {rs101, rs101}; +{ +mul.f16x2 r689, r28, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs102, fd216; +} +mov.b32 r697, {rs102, rs102}; +{ +mul.f16x2 r695, r31, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs103, fd207; +} +mov.b32 r703, {rs103, rs103}; +{ +mul.f16x2 r701, r37, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs104, fd196; +} +mov.b32 r709, {rs104, rs104}; +{ +mul.f16x2 r707, r46, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs105, fd207; +} +mov.b32 r715, {rs105, rs105}; +{ +mul.f16x2 r713, r40, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs106, fd196; +} +mov.b32 r721, {rs106, rs106}; +{ +mul.f16x2 r719, r43, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs107, fd211; +} +mov.b32 r727, {rs107, rs107}; +{ +mul.f16x2 r725, r49, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs108, fd212; +} +mov.b32 r733, {rs108, rs108}; +{ +mul.f16x2 r731, r58, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs109, fd211; +} +mov.b32 r739, {rs109, rs109}; +{ +mul.f16x2 r737, r52, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs110, fd212; +} +mov.b32 r745, {rs110, rs110}; +{ +mul.f16x2 r743, r55, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +sub.f16x2 r749, r728, r734; +} +{ +add.f16x2 r752, r740, r746; +} +{ +add.f16x2 r755, r728, r734; +} +{ +sub.f16x2 r758, r740, r746; +} +{ +add.f16x2 r761, %74, %66; +} +{ +add.f16x2 r764, %77, %71; +} +{ +sub.f16x2 r767, %74, %66; +} +{ +sub.f16x2 r770, %77, %71; +} +{ +add.f16x2 r773, %84, %80; +} +{ +add.f16x2 r776, %87, %82; +} +{ +sub.f16x2 r779, %84, %80; +} +{ +sub.f16x2 r782, %87, %82; +} +{ +add.f16x2 r785, %75, %68; +} +{ +add.f16x2 r788, %78, %72; +} +{ +sub.f16x2 r791, %75, %68; +} +{ +sub.f16x2 r794, %78, %72; +} +{ +add.f16x2 r797, %86, %81; +} +{ +add.f16x2 r800, %67, %83; +} +{ +sub.f16x2 r803, %86, %81; +} +{ +sub.f16x2 r806, 
%67, %83; +} +{ +add.f16x2 r809, %76, %70; +} +{ +add.f16x2 r812, %79, %73; +} +{ +sub.f16x2 r815, %76, %70; +} +{ +sub.f16x2 r818, %79, %73; +} +{ +add.f16x2 r821, %85, r761; +} +{ +add.f16x2 r824, %69, r764; +} +{ +add.f16x2 r827, r821, r773; +} +{ +add.f16x2 r830, r824, r776; +} +{ +add.f16x2 r833, r827, r785; +} +{ +add.f16x2 r836, r830, r788; +} +{ +add.f16x2 r839, r833, r797; +} +{ +add.f16x2 r842, r836, r800; +} +{ +add.f16x2 r845, r839, r809; +} +{ +add.f16x2 r848, r842, r812; +} +cvt.rn.f16.s32 rs111, r1388; +mov.b32 r863, {rs111, rs111}; +cvt.rn.f16.s32 rs112, r1388; +mov.b32 r875, {rs112, rs112}; +{ +cvt.rn.f16.f64 rs113, fd203; +} +mov.b32 r855, {rs113, rs113}; +{ +mul.f16x2 r853, r761, r855; +} +{ +add.f16x2 r856, %85, r853; +} +{ +cvt.rn.f16.f64 rs114, fd218; +} +mov.b32 r861, {rs114, rs114}; +{ +mul.f16x2 r859, r770, r861; +} +{ +add.f16x2 r862, r863, r859; +} +{ +cvt.rn.f16.f64 rs115, fd203; +} +mov.b32 r867, {rs115, rs115}; +{ +mul.f16x2 r865, r764, r867; +} +{ +add.f16x2 r868, %69, r865; +} +{ +cvt.rn.f16.f64 rs116, fd218; +} +mov.b32 r873, {rs116, rs116}; +{ +mul.f16x2 r871, r767, r873; +} +{ +add.f16x2 r874, r875, r871; +} +{ +cvt.rn.f16.f64 rs117, fd207; +} +mov.b32 r879, {rs117, rs117}; +{ +mul.f16x2 r877, r773, r879; +} +{ +add.f16x2 r880, r856, r877; +} +{ +cvt.rn.f16.f64 rs118, fd214; +} +mov.b32 r885, {rs118, rs118}; +{ +mul.f16x2 r883, r782, r885; +} +{ +add.f16x2 r886, r862, r883; +} +{ +cvt.rn.f16.f64 rs119, fd207; +} +mov.b32 r891, {rs119, rs119}; +{ +mul.f16x2 r889, r776, r891; +} +{ +add.f16x2 r892, r868, r889; +} +{ +cvt.rn.f16.f64 rs120, fd214; +} +mov.b32 r897, {rs120, rs120}; +{ +mul.f16x2 r895, r779, r897; +} +{ +add.f16x2 r898, r874, r895; +} +{ +cvt.rn.f16.f64 rs121, fd211; +} +mov.b32 r903, {rs121, rs121}; +{ +mul.f16x2 r901, r785, r903; +} +{ +add.f16x2 r904, r880, r901; +} +{ +cvt.rn.f16.f64 rs122, fd212; +} +mov.b32 r909, {rs122, rs122}; +{ +mul.f16x2 r907, r794, r909; +} +{ +add.f16x2 r910, r886, r907; +} +{ 
+cvt.rn.f16.f64 rs123, fd211; +} +mov.b32 r915, {rs123, rs123}; +{ +mul.f16x2 r913, r788, r915; +} +{ +add.f16x2 r916, r892, r913; +} +{ +cvt.rn.f16.f64 rs124, fd212; +} +mov.b32 r921, {rs124, rs124}; +{ +mul.f16x2 r919, r791, r921; +} +{ +add.f16x2 r922, r898, r919; +} +{ +cvt.rn.f16.f64 rs125, fd215; +} +mov.b32 r927, {rs125, rs125}; +{ +mul.f16x2 r925, r797, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs126, fd216; +} +mov.b32 r933, {rs126, rs126}; +{ +mul.f16x2 r931, r806, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs127, fd215; +} +mov.b32 r939, {rs127, rs127}; +{ +mul.f16x2 r937, r800, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs128, fd216; +} +mov.b32 r945, {rs128, rs128}; +{ +mul.f16x2 r943, r803, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs129, fd219; +} +mov.b32 r951, {rs129, rs129}; +{ +mul.f16x2 r949, r809, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs130, fd220; +} +mov.b32 r957, {rs130, rs130}; +{ +mul.f16x2 r955, r818, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs131, fd219; +} +mov.b32 r963, {rs131, rs131}; +{ +mul.f16x2 r961, r812, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs132, fd220; +} +mov.b32 r969, {rs132, rs132}; +{ +mul.f16x2 r967, r815, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +sub.f16x2 r973, r952, r958; +} +{ +add.f16x2 r976, r964, r970; +} +{ +add.f16x2 r979, r952, r958; +} +{ +sub.f16x2 r982, r964, r970; +} +cvt.rn.f16.s32 rs133, r1388; +mov.b32 r997, {rs133, rs133}; +cvt.rn.f16.s32 rs134, r1388; +mov.b32 r1009, {rs134, rs134}; +{ +cvt.rn.f16.f64 rs135, fd207; +} +mov.b32 r989, {rs135, rs135}; +{ +mul.f16x2 r987, r761, r989; +} +{ +add.f16x2 r990, %85, r987; +} +{ +cvt.rn.f16.f64 rs136, fd214; +} +mov.b32 r995, {rs136, rs136}; +{ +mul.f16x2 r993, r770, r995; +} +{ +add.f16x2 r996, r997, r993; +} +{ +cvt.rn.f16.f64 rs137, fd207; +} +mov.b32 r1001, {rs137, rs137}; +{ +mul.f16x2 r999, r764, 
r1001; +} +{ +add.f16x2 r1002, %69, r999; +} +{ +cvt.rn.f16.f64 rs138, fd214; +} +mov.b32 r1007, {rs138, rs138}; +{ +mul.f16x2 r1005, r767, r1007; +} +{ +add.f16x2 r1008, r1009, r1005; +} +{ +cvt.rn.f16.f64 rs139, fd215; +} +mov.b32 r1013, {rs139, rs139}; +{ +mul.f16x2 r1011, r773, r1013; +} +{ +add.f16x2 r1014, r990, r1011; +} +{ +cvt.rn.f16.f64 rs140, fd216; +} +mov.b32 r1019, {rs140, rs140}; +{ +mul.f16x2 r1017, r782, r1019; +} +{ +add.f16x2 r1020, r996, r1017; +} +{ +cvt.rn.f16.f64 rs141, fd215; +} +mov.b32 r1025, {rs141, rs141}; +{ +mul.f16x2 r1023, r776, r1025; +} +{ +add.f16x2 r1026, r1002, r1023; +} +{ +cvt.rn.f16.f64 rs142, fd216; +} +mov.b32 r1031, {rs142, rs142}; +{ +mul.f16x2 r1029, r779, r1031; +} +{ +add.f16x2 r1032, r1008, r1029; +} +{ +cvt.rn.f16.f64 rs143, fd219; +} +mov.b32 r1037, {rs143, rs143}; +{ +mul.f16x2 r1035, r785, r1037; +} +{ +add.f16x2 r1038, r1014, r1035; +} +{ +cvt.rn.f16.f64 rs144, fd148; +} +mov.b32 r1043, {rs144, rs144}; +{ +mul.f16x2 r1041, r794, r1043; +} +{ +add.f16x2 r1044, r1020, r1041; +} +{ +cvt.rn.f16.f64 rs145, fd219; +} +mov.b32 r1049, {rs145, rs145}; +{ +mul.f16x2 r1047, r788, r1049; +} +{ +add.f16x2 r1050, r1026, r1047; +} +{ +cvt.rn.f16.f64 rs146, fd148; +} +mov.b32 r1055, {rs146, rs146}; +{ +mul.f16x2 r1053, r791, r1055; +} +{ +add.f16x2 r1056, r1032, r1053; +} +{ +cvt.rn.f16.f64 rs147, fd211; +} +mov.b32 r1061, {rs147, rs147}; +{ +mul.f16x2 r1059, r797, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs148, fd168; +} +mov.b32 r1067, {rs148, rs148}; +{ +mul.f16x2 r1065, r806, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +cvt.rn.f16.f64 rs149, fd211; +} +mov.b32 r1073, {rs149, rs149}; +{ +mul.f16x2 r1071, r800, r1073; +} +{ +add.f16x2 r1074, r1050, r1071; +} +{ +cvt.rn.f16.f64 rs150, fd168; +} +mov.b32 r1079, {rs150, rs150}; +{ +mul.f16x2 r1077, r803, r1079; +} +{ +add.f16x2 r1080, r1056, r1077; +} +{ +cvt.rn.f16.f64 rs151, fd203; +} +mov.b32 r1085, {rs151, rs151}; +{ +mul.f16x2 r1083, r809, 
r1085; +} +{ +add.f16x2 r1086, r1062, r1083; +} +{ +cvt.rn.f16.f64 rs152, fd188; +} +mov.b32 r1091, {rs152, rs152}; +{ +mul.f16x2 r1089, r818, r1091; +} +{ +add.f16x2 r1092, r1068, r1089; +} +{ +cvt.rn.f16.f64 rs153, fd203; +} +mov.b32 r1097, {rs153, rs153}; +{ +mul.f16x2 r1095, r812, r1097; +} +{ +add.f16x2 r1098, r1074, r1095; +} +{ +cvt.rn.f16.f64 rs154, fd188; +} +mov.b32 r1103, {rs154, rs154}; +{ +mul.f16x2 r1101, r815, r1103; +} +{ +add.f16x2 r1104, r1080, r1101; +} +{ +sub.f16x2 r1107, r1086, r1092; +} +{ +add.f16x2 r1110, r1098, r1104; +} +{ +add.f16x2 r1113, r1086, r1092; +} +{ +sub.f16x2 r1116, r1098, r1104; +} +cvt.rn.f16.s32 rs155, r1388; +mov.b32 r1131, {rs155, rs155}; +cvt.rn.f16.s32 rs156, r1388; +mov.b32 r1143, {rs156, rs156}; +{ +cvt.rn.f16.f64 rs157, fd211; +} +mov.b32 r1123, {rs157, rs157}; +{ +mul.f16x2 r1121, r761, r1123; +} +{ +add.f16x2 r1124, %85, r1121; +} +{ +cvt.rn.f16.f64 rs158, fd212; +} +mov.b32 r1129, {rs158, rs158}; +{ +mul.f16x2 r1127, r770, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +cvt.rn.f16.f64 rs159, fd211; +} +mov.b32 r1135, {rs159, rs159}; +{ +mul.f16x2 r1133, r764, r1135; +} +{ +add.f16x2 r1136, %69, r1133; +} +{ +cvt.rn.f16.f64 rs160, fd212; +} +mov.b32 r1141, {rs160, rs160}; +{ +mul.f16x2 r1139, r767, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +cvt.rn.f16.f64 rs161, fd219; +} +mov.b32 r1147, {rs161, rs161}; +{ +mul.f16x2 r1145, r773, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs162, fd148; +} +mov.b32 r1153, {rs162, rs162}; +{ +mul.f16x2 r1151, r782, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs163, fd219; +} +mov.b32 r1159, {rs163, rs163}; +{ +mul.f16x2 r1157, r776, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +cvt.rn.f16.f64 rs164, fd148; +} +mov.b32 r1165, {rs164, rs164}; +{ +mul.f16x2 r1163, r779, r1165; +} +{ +add.f16x2 r1166, r1142, r1163; +} +{ +cvt.rn.f16.f64 rs165, fd207; +} +mov.b32 r1171, {rs165, rs165}; +{ +mul.f16x2 r1169, r785, r1171; +} 
+{ +add.f16x2 r1172, r1148, r1169; +} +{ +cvt.rn.f16.f64 rs166, fd196; +} +mov.b32 r1177, {rs166, rs166}; +{ +mul.f16x2 r1175, r794, r1177; +} +{ +add.f16x2 r1178, r1154, r1175; +} +{ +cvt.rn.f16.f64 rs167, fd207; +} +mov.b32 r1183, {rs167, rs167}; +{ +mul.f16x2 r1181, r788, r1183; +} +{ +add.f16x2 r1184, r1160, r1181; +} +{ +cvt.rn.f16.f64 rs168, fd196; +} +mov.b32 r1189, {rs168, rs168}; +{ +mul.f16x2 r1187, r791, r1189; +} +{ +add.f16x2 r1190, r1166, r1187; +} +{ +cvt.rn.f16.f64 rs169, fd203; +} +mov.b32 r1195, {rs169, rs169}; +{ +mul.f16x2 r1193, r797, r1195; +} +{ +add.f16x2 r1196, r1172, r1193; +} +{ +cvt.rn.f16.f64 rs170, fd218; +} +mov.b32 r1201, {rs170, rs170}; +{ +mul.f16x2 r1199, r806, r1201; +} +{ +add.f16x2 r1202, r1178, r1199; +} +{ +cvt.rn.f16.f64 rs171, fd203; +} +mov.b32 r1207, {rs171, rs171}; +{ +mul.f16x2 r1205, r800, r1207; +} +{ +add.f16x2 r1208, r1184, r1205; +} +{ +cvt.rn.f16.f64 rs172, fd218; +} +mov.b32 r1213, {rs172, rs172}; +{ +mul.f16x2 r1211, r803, r1213; +} +{ +add.f16x2 r1214, r1190, r1211; +} +{ +cvt.rn.f16.f64 rs173, fd215; +} +mov.b32 r1219, {rs173, rs173}; +{ +mul.f16x2 r1217, r809, r1219; +} +{ +add.f16x2 r1220, r1196, r1217; +} +{ +cvt.rn.f16.f64 rs174, fd216; +} +mov.b32 r1225, {rs174, rs174}; +{ +mul.f16x2 r1223, r818, r1225; +} +{ +add.f16x2 r1226, r1202, r1223; +} +{ +cvt.rn.f16.f64 rs175, fd215; +} +mov.b32 r1231, {rs175, rs175}; +{ +mul.f16x2 r1229, r812, r1231; +} +{ +add.f16x2 r1232, r1208, r1229; +} +{ +cvt.rn.f16.f64 rs176, fd216; +} +mov.b32 r1237, {rs176, rs176}; +{ +mul.f16x2 r1235, r815, r1237; +} +{ +add.f16x2 r1238, r1214, r1235; +} +{ +sub.f16x2 r1241, r1220, r1226; +} +{ +add.f16x2 r1244, r1232, r1238; +} +{ +add.f16x2 r1247, r1220, r1226; +} +{ +sub.f16x2 r1250, r1232, r1238; +} +cvt.rn.f16.s32 rs177, r1388; +mov.b32 r1265, {rs177, rs177}; +cvt.rn.f16.s32 rs178, r1388; +mov.b32 r1277, {rs178, rs178}; +{ +cvt.rn.f16.f64 rs179, fd215; +} +mov.b32 r1257, {rs179, rs179}; +{ +mul.f16x2 r1255, r761, r1257; +} +{ 
+add.f16x2 r1258, %85, r1255; +} +{ +cvt.rn.f16.f64 rs180, fd216; +} +mov.b32 r1263, {rs180, rs180}; +{ +mul.f16x2 r1261, r770, r1263; +} +{ +add.f16x2 r1264, r1265, r1261; +} +{ +cvt.rn.f16.f64 rs181, fd215; +} +mov.b32 r1269, {rs181, rs181}; +{ +mul.f16x2 r1267, r764, r1269; +} +{ +add.f16x2 r1270, %69, r1267; +} +{ +cvt.rn.f16.f64 rs182, fd216; +} +mov.b32 r1275, {rs182, rs182}; +{ +mul.f16x2 r1273, r767, r1275; +} +{ +add.f16x2 r1276, r1277, r1273; +} +{ +cvt.rn.f16.f64 rs183, fd211; +} +mov.b32 r1281, {rs183, rs183}; +{ +mul.f16x2 r1279, r773, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs184, fd168; +} +mov.b32 r1287, {rs184, rs184}; +{ +mul.f16x2 r1285, r782, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs185, fd211; +} +mov.b32 r1293, {rs185, rs185}; +{ +mul.f16x2 r1291, r776, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs186, fd168; +} +mov.b32 r1299, {rs186, rs186}; +{ +mul.f16x2 r1297, r779, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs187, fd203; +} +mov.b32 r1305, {rs187, rs187}; +{ +mul.f16x2 r1303, r785, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs188, fd218; +} +mov.b32 r1311, {rs188, rs188}; +{ +mul.f16x2 r1309, r794, r1311; +} +{ +add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs189, fd203; +} +mov.b32 r1317, {rs189, rs189}; +{ +mul.f16x2 r1315, r788, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs190, fd218; +} +mov.b32 r1323, {rs190, rs190}; +{ +mul.f16x2 r1321, r791, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs191, fd219; +} +mov.b32 r1329, {rs191, rs191}; +{ +mul.f16x2 r1327, r797, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs192, fd220; +} +mov.b32 r1335, {rs192, rs192}; +{ +mul.f16x2 r1333, r806, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs193, fd219; +} +mov.b32 r1341, {rs193, rs193}; +{ +mul.f16x2 r1339, r800, r1341; +} +{ 
+add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs194, fd220; +} +mov.b32 r1347, {rs194, rs194}; +{ +mul.f16x2 r1345, r803, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs195, fd207; +} +mov.b32 r1353, {rs195, rs195}; +{ +mul.f16x2 r1351, r809, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs196, fd196; +} +mov.b32 r1359, {rs196, rs196}; +{ +mul.f16x2 r1357, r818, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs197, fd207; +} +mov.b32 r1365, {rs197, rs197}; +{ +mul.f16x2 r1363, r812, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 rs198, fd196; +} +mov.b32 r1371, {rs198, rs198}; +{ +mul.f16x2 r1369, r815, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +sub.f16x2 r1375, r1354, r1360; +} +{ +add.f16x2 r1378, r1366, r1372; +} +{ +add.f16x2 r1381, r1354, r1360; +} +{ +sub.f16x2 r1384, r1366, r1372; +} +cvt.rn.f16.s32 rs199, r1388; +mov.b32 r1399, {rs199, rs199}; +cvt.rn.f16.s32 rs200, r1388; +mov.b32 r1411, {rs200, rs200}; +{ +cvt.rn.f16.f64 rs201, fd219; +} +mov.b32 r1391, {rs201, rs201}; +{ +mul.f16x2 r1389, r761, r1391; +} +{ +add.f16x2 r1392, %85, r1389; +} +{ +cvt.rn.f16.f64 rs202, fd220; +} +mov.b32 r1397, {rs202, rs202}; +{ +mul.f16x2 r1395, r770, r1397; +} +{ +add.f16x2 r1398, r1399, r1395; +} +{ +cvt.rn.f16.f64 rs203, fd219; +} +mov.b32 r1403, {rs203, rs203}; +{ +mul.f16x2 r1401, r764, r1403; +} +{ +add.f16x2 r1404, %69, r1401; +} +{ +cvt.rn.f16.f64 rs204, fd220; +} +mov.b32 r1409, {rs204, rs204}; +{ +mul.f16x2 r1407, r767, r1409; +} +{ +add.f16x2 r1410, r1411, r1407; +} +{ +cvt.rn.f16.f64 rs205, fd203; +} +mov.b32 r1415, {rs205, rs205}; +{ +mul.f16x2 r1413, r773, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs206, fd188; +} +mov.b32 r1421, {rs206, rs206}; +{ +mul.f16x2 r1419, r782, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs207, fd203; +} +mov.b32 r1427, {rs207, rs207}; +{ +mul.f16x2 r1425, r776, r1427; +} +{ +add.f16x2 
r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs208, fd188; +} +mov.b32 r1433, {rs208, rs208}; +{ +mul.f16x2 r1431, r779, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs209, fd215; +} +mov.b32 r1439, {rs209, rs209}; +{ +mul.f16x2 r1437, r785, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs210, fd216; +} +mov.b32 r1445, {rs210, rs210}; +{ +mul.f16x2 r1443, r794, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs211, fd215; +} +mov.b32 r1451, {rs211, rs211}; +{ +mul.f16x2 r1449, r788, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs212, fd216; +} +mov.b32 r1457, {rs212, rs212}; +{ +mul.f16x2 r1455, r791, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs213, fd207; +} +mov.b32 r1463, {rs213, rs213}; +{ +mul.f16x2 r1461, r797, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs214, fd196; +} +mov.b32 r1469, {rs214, rs214}; +{ +mul.f16x2 r1467, r806, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs215, fd207; +} +mov.b32 r1475, {rs215, rs215}; +{ +mul.f16x2 r1473, r800, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs216, fd196; +} +mov.b32 r1481, {rs216, rs216}; +{ +mul.f16x2 r1479, r803, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs217, fd211; +} +mov.b32 r1487, {rs217, rs217}; +{ +mul.f16x2 r1485, r809, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs218, fd212; +} +mov.b32 r1493, {rs218, rs218}; +{ +mul.f16x2 r1491, r818, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs219, fd211; +} +mov.b32 r1499, {rs219, rs219}; +{ +mul.f16x2 r1497, r812, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs220, fd212; +} +mov.b32 r1505, {rs220, rs220}; +{ +mul.f16x2 r1503, r815, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +sub.f16x2 r1509, r1488, r1494; +} +{ +add.f16x2 r1512, r1500, r1506; +} +{ +add.f16x2 r1515, r1488, r1494; +} +{ 
+sub.f16x2 r1518, r1500, r1506; +} +mov.f64 fd201, 0d3FEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs221, fd201; +} +{ +cvt.rn.f16.f64 rs222, fd220; +} +{ +cvt.rn.f16.f64 rs223, fd203; +} +{ +cvt.rn.f16.f64 rs224, fd218; +} +mov.f64 fd205, 0d3FE4F49E7F775887; +{ +cvt.rn.f16.f64 rs225, fd205; +} +{ +cvt.rn.f16.f64 rs226, fd216; +} +{ +cvt.rn.f16.f64 rs227, fd207; +} +{ +cvt.rn.f16.f64 rs228, fd214; +} +mov.f64 fd209, 0d3FC2375F640F44DB; +{ +cvt.rn.f16.f64 rs229, fd209; +} +{ +cvt.rn.f16.f64 rs230, fd212; +} +{ +cvt.rn.f16.f64 rs231, fd211; +} +{ +cvt.rn.f16.f64 rs232, fd212; +} +mov.f64 fd213, 0dBFDA9628D9C712B6; +{ +cvt.rn.f16.f64 rs233, fd213; +} +{ +cvt.rn.f16.f64 rs234, fd214; +} +{ +cvt.rn.f16.f64 rs235, fd215; +} +{ +cvt.rn.f16.f64 rs236, fd216; +} +mov.f64 fd217, 0dBFEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs237, fd217; +} +{ +cvt.rn.f16.f64 rs238, fd218; +} +{ +cvt.rn.f16.f64 rs239, fd219; +} +{ +cvt.rn.f16.f64 rs240, fd220; +} +mov.b32 r1535, {rs221, rs221}; +{ +mul.f16x2 r1521, r973, r1535; +} +mov.b32 r1532, {rs222, rs222}; +{ +mul.f16x2 r1524, r976, r1532; +} +{ +sub.f16x2 r1527, r1521, r1524; +} +{ +mul.f16x2 r1530, r973, r1532; +} +{ +fma.rn.f16x2 r1533, r976, r1535, r1530; +} +mov.b32 r1551, {rs223, rs223}; +{ +mul.f16x2 r1537, r1107, r1551; +} +mov.b32 r1548, {rs224, rs224}; +{ +mul.f16x2 r1540, r1110, r1548; +} +{ +sub.f16x2 r1543, r1537, r1540; +} +{ +mul.f16x2 r1546, r1107, r1548; +} +{ +fma.rn.f16x2 r1549, r1110, r1551, r1546; +} +mov.b32 r1567, {rs225, rs225}; +{ +mul.f16x2 r1553, r1241, r1567; +} +mov.b32 r1564, {rs226, rs226}; +{ +mul.f16x2 r1556, r1244, r1564; +} +{ +sub.f16x2 r1559, r1553, r1556; +} +{ +mul.f16x2 r1562, r1241, r1564; +} +{ +fma.rn.f16x2 r1565, r1244, r1567, r1562; +} +mov.b32 r1583, {rs227, rs227}; +{ +mul.f16x2 r1569, r1375, r1583; +} +mov.b32 r1580, {rs228, rs228}; +{ +mul.f16x2 r1572, r1378, r1580; +} +{ +sub.f16x2 r1575, r1569, r1572; +} +{ +mul.f16x2 r1578, r1375, r1580; +} +{ +fma.rn.f16x2 r1581, r1378, r1583, r1578; +} +mov.b32 
r1599, {rs229, rs229}; +{ +mul.f16x2 r1585, r1509, r1599; +} +mov.b32 r1596, {rs230, rs230}; +{ +mul.f16x2 r1588, r1512, r1596; +} +{ +sub.f16x2 r1591, r1585, r1588; +} +{ +mul.f16x2 r1594, r1509, r1596; +} +{ +fma.rn.f16x2 r1597, r1512, r1599, r1594; +} +mov.b32 r1615, {rs231, rs231}; +{ +mul.f16x2 r1601, r1515, r1615; +} +mov.b32 r1612, {rs232, rs232}; +{ +mul.f16x2 r1604, r1518, r1612; +} +{ +sub.f16x2 r1607, r1601, r1604; +} +{ +mul.f16x2 r1610, r1515, r1612; +} +{ +fma.rn.f16x2 r1613, r1518, r1615, r1610; +} +mov.b32 r1631, {rs233, rs233}; +{ +mul.f16x2 r1617, r1381, r1631; +} +mov.b32 r1628, {rs234, rs234}; +{ +mul.f16x2 r1620, r1384, r1628; +} +{ +sub.f16x2 r1623, r1617, r1620; +} +{ +mul.f16x2 r1626, r1381, r1628; +} +{ +fma.rn.f16x2 r1629, r1384, r1631, r1626; +} +mov.b32 r1647, {rs235, rs235}; +{ +mul.f16x2 r1633, r1247, r1647; +} +mov.b32 r1644, {rs236, rs236}; +{ +mul.f16x2 r1636, r1250, r1644; +} +{ +sub.f16x2 r1639, r1633, r1636; +} +{ +mul.f16x2 r1642, r1247, r1644; +} +{ +fma.rn.f16x2 r1645, r1250, r1647, r1642; +} +mov.b32 r1663, {rs237, rs237}; +{ +mul.f16x2 r1649, r1113, r1663; +} +mov.b32 r1660, {rs238, rs238}; +{ +mul.f16x2 r1652, r1116, r1660; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r1113, r1660; +} +{ +fma.rn.f16x2 r1661, r1116, r1663, r1658; +} +mov.b32 r1679, {rs239, rs239}; +{ +mul.f16x2 r1665, r979, r1679; +} +mov.b32 r1676, {rs240, rs240}; +{ +mul.f16x2 r1668, r982, r1676; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r979, r1676; +} +{ +fma.rn.f16x2 r1677, r982, r1679, r1674; +} +{ +add.f16x2 %0, r85, r845; +} +{ +add.f16x2 %1, r88, r848; +} +{ +sub.f16x2 %22, r85, r845; +} +{ +sub.f16x2 %23, r88, r848; +} +{ +add.f16x2 %2, r213, r1527; +} +{ +add.f16x2 %3, r216, r1533; +} +{ +sub.f16x2 %24, r213, r1527; +} +{ +sub.f16x2 %25, r216, r1533; +} +{ +add.f16x2 %4, r347, r1543; +} +{ +add.f16x2 %5, r350, r1549; +} +{ +sub.f16x2 %26, r347, r1543; +} +{ +sub.f16x2 %27, r350, r1549; +} +{ +add.f16x2 %6, 
r481, r1559; +} +{ +add.f16x2 %7, r484, r1565; +} +{ +sub.f16x2 %28, r481, r1559; +} +{ +sub.f16x2 %29, r484, r1565; +} +{ +add.f16x2 %8, r615, r1575; +} +{ +add.f16x2 %9, r618, r1581; +} +{ +sub.f16x2 %30, r615, r1575; +} +{ +sub.f16x2 %31, r618, r1581; +} +{ +add.f16x2 %10, r749, r1591; +} +{ +add.f16x2 %11, r752, r1597; +} +{ +sub.f16x2 %32, r749, r1591; +} +{ +sub.f16x2 %33, r752, r1597; +} +{ +add.f16x2 %12, r755, r1607; +} +{ +add.f16x2 %13, r758, r1613; +} +{ +sub.f16x2 %34, r755, r1607; +} +{ +sub.f16x2 %35, r758, r1613; +} +{ +add.f16x2 %14, r621, r1623; +} +{ +add.f16x2 %15, r624, r1629; +} +{ +sub.f16x2 %36, r621, r1623; +} +{ +sub.f16x2 %37, r624, r1629; +} +{ +add.f16x2 %16, r487, r1639; +} +{ +add.f16x2 %17, r490, r1645; +} +{ +sub.f16x2 %38, r487, r1639; +} +{ +sub.f16x2 %39, r490, r1645; +} +{ +add.f16x2 %18, r353, r1655; +} +{ +add.f16x2 %19, r356, r1661; +} +{ +sub.f16x2 %40, r353, r1655; +} +{ +sub.f16x2 %41, r356, r1661; +} +{ +add.f16x2 %20, r219, r1671; +} +{ +add.f16x2 %21, r222, r1677; +} +{ +sub.f16x2 %42, r219, r1671; +} +{ +sub.f16x2 %43, r222, r1677; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), 
"=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ae51fe8ba263d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp16_inv.hpp.inc @@ -0,0 +1,2740 @@ +#ifndef CUFFTDX_FFT_22_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_22_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<956, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<263>; +.reg .b32 r<1835>; +.reg .f64 fd<243>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %46, %64; +} +{ +add.f16x2 r4, %47, %65; +} +{ +sub.f16x2 r7, %46, %64; +} +{ +sub.f16x2 r10, %47, %65; +} +{ +add.f16x2 r13, %48, %62; +} +{ +add.f16x2 r16, %49, %63; +} +{ +sub.f16x2 r19, %48, %62; +} +{ +sub.f16x2 r22, %49, %63; +} +{ +add.f16x2 r25, %50, %60; +} +{ +add.f16x2 r28, %51, %61; +} +{ +sub.f16x2 r31, %50, %60; +} +{ +sub.f16x2 r34, %51, %61; +} +{ +add.f16x2 r37, %52, %58; +} +{ +add.f16x2 r40, %53, %59; +} +{ +sub.f16x2 r43, %52, %58; +} +{ +sub.f16x2 r46, %53, %59; +} +{ +add.f16x2 r49, %54, %56; +} +{ +add.f16x2 r52, %55, %57; +} +{ +sub.f16x2 r55, %54, %56; +} +{ +sub.f16x2 r58, %55, %57; +} +{ +add.f16x2 r61, %44, r1; +} +{ +add.f16x2 r64, %45, r4; +} +{ +add.f16x2 r67, r61, r13; +} +{ +add.f16x2 r70, r64, r16; +} +{ +add.f16x2 r73, r67, r25; +} +{ +add.f16x2 r76, r70, r28; +} +{ +add.f16x2 r79, r73, r37; +} +{ +add.f16x2 r82, r76, r40; +} +{ +add.f16x2 r85, r79, r49; +} +{ +add.f16x2 r88, r82, r52; +} +mov.u32 r1388, 0; +cvt.rn.f16.s32 rs1, r1388; +mov.b32 r103, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1388; +mov.b32 r115, {rs2, rs2}; +mov.f64 fd203, 0d3FEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs3, fd203; +} +mov.b32 r95, {rs3, rs3}; +{ +mul.f16x2 
r93, r1, r95; +} +{ +add.f16x2 r96, %44, r93; +} +mov.f64 fd218, 0d3FE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs4, fd218; +} +mov.b32 r101, {rs4, rs4}; +{ +mul.f16x2 r99, r10, r101; +} +{ +add.f16x2 r102, r103, r99; +} +{ +cvt.rn.f16.f64 rs5, fd203; +} +mov.b32 r107, {rs5, rs5}; +{ +mul.f16x2 r105, r4, r107; +} +{ +add.f16x2 r108, %45, r105; +} +{ +cvt.rn.f16.f64 rs6, fd218; +} +mov.b32 r113, {rs6, rs6}; +{ +mul.f16x2 r111, r7, r113; +} +{ +add.f16x2 r114, r115, r111; +} +mov.f64 fd207, 0d3FDA9628D9C712B6; +{ +cvt.rn.f16.f64 rs7, fd207; +} +mov.b32 r119, {rs7, rs7}; +{ +mul.f16x2 r117, r13, r119; +} +{ +add.f16x2 r120, r96, r117; +} +mov.f64 fd214, 0d3FED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs8, fd214; +} +mov.b32 r125, {rs8, rs8}; +{ +mul.f16x2 r123, r22, r125; +} +{ +add.f16x2 r126, r102, r123; +} +{ +cvt.rn.f16.f64 rs9, fd207; +} +mov.b32 r131, {rs9, rs9}; +{ +mul.f16x2 r129, r16, r131; +} +{ +add.f16x2 r132, r108, r129; +} +{ +cvt.rn.f16.f64 rs10, fd214; +} +mov.b32 r137, {rs10, rs10}; +{ +mul.f16x2 r135, r19, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd211, 0dBFC2375F640F44DB; +{ +cvt.rn.f16.f64 rs11, fd211; +} +mov.b32 r143, {rs11, rs11}; +{ +mul.f16x2 r141, r25, r143; +} +{ +add.f16x2 r144, r120, r141; +} +mov.f64 fd212, 0d3FEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs12, fd212; +} +mov.b32 r149, {rs12, rs12}; +{ +mul.f16x2 r147, r34, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs13, fd211; +} +mov.b32 r155, {rs13, rs13}; +{ +mul.f16x2 r153, r28, r155; +} +{ +add.f16x2 r156, r132, r153; +} +{ +cvt.rn.f16.f64 rs14, fd212; +} +mov.b32 r161, {rs14, rs14}; +{ +mul.f16x2 r159, r31, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd215, 0dBFE4F49E7F775887; +{ +cvt.rn.f16.f64 rs15, fd215; +} +mov.b32 r167, {rs15, rs15}; +{ +mul.f16x2 r165, r37, r167; +} +{ +add.f16x2 r168, r144, r165; +} +mov.f64 fd216, 0d3FE82F19BB3A28A1; +{ +cvt.rn.f16.f64 rs16, fd216; +} +mov.b32 r173, {rs16, rs16}; +{ +mul.f16x2 r171, r46, r173; +} +{ +add.f16x2 r174, r150, 
r171; +} +{ +cvt.rn.f16.f64 rs17, fd215; +} +mov.b32 r179, {rs17, rs17}; +{ +mul.f16x2 r177, r40, r179; +} +{ +add.f16x2 r180, r156, r177; +} +{ +cvt.rn.f16.f64 rs18, fd216; +} +mov.b32 r185, {rs18, rs18}; +{ +mul.f16x2 r183, r43, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd219, 0dBFEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs19, fd219; +} +mov.b32 r191, {rs19, rs19}; +{ +mul.f16x2 r189, r49, r191; +} +{ +add.f16x2 r192, r168, r189; +} +mov.f64 fd220, 0d3FD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs20, fd220; +} +mov.b32 r197, {rs20, rs20}; +{ +mul.f16x2 r195, r58, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs21, fd219; +} +mov.b32 r203, {rs21, rs21}; +{ +mul.f16x2 r201, r52, r203; +} +{ +add.f16x2 r204, r180, r201; +} +{ +cvt.rn.f16.f64 rs22, fd220; +} +mov.b32 r209, {rs22, rs22}; +{ +mul.f16x2 r207, r55, r209; +} +{ +add.f16x2 r210, r186, r207; +} +{ +sub.f16x2 r213, r192, r198; +} +{ +add.f16x2 r216, r204, r210; +} +{ +add.f16x2 r219, r192, r198; +} +{ +sub.f16x2 r222, r204, r210; +} +cvt.rn.f16.s32 rs23, r1388; +mov.b32 r237, {rs23, rs23}; +cvt.rn.f16.s32 rs24, r1388; +mov.b32 r249, {rs24, rs24}; +{ +cvt.rn.f16.f64 rs25, fd207; +} +mov.b32 r229, {rs25, rs25}; +{ +mul.f16x2 r227, r1, r229; +} +{ +add.f16x2 r230, %44, r227; +} +{ +cvt.rn.f16.f64 rs26, fd214; +} +mov.b32 r235, {rs26, rs26}; +{ +mul.f16x2 r233, r10, r235; +} +{ +add.f16x2 r236, r237, r233; +} +{ +cvt.rn.f16.f64 rs27, fd207; +} +mov.b32 r241, {rs27, rs27}; +{ +mul.f16x2 r239, r4, r241; +} +{ +add.f16x2 r242, %45, r239; +} +{ +cvt.rn.f16.f64 rs28, fd214; +} +mov.b32 r247, {rs28, rs28}; +{ +mul.f16x2 r245, r7, r247; +} +{ +add.f16x2 r248, r249, r245; +} +{ +cvt.rn.f16.f64 rs29, fd215; +} +mov.b32 r253, {rs29, rs29}; +{ +mul.f16x2 r251, r13, r253; +} +{ +add.f16x2 r254, r230, r251; +} +{ +cvt.rn.f16.f64 rs30, fd216; +} +mov.b32 r259, {rs30, rs30}; +{ +mul.f16x2 r257, r22, r259; +} +{ +add.f16x2 r260, r236, r257; +} +{ +cvt.rn.f16.f64 rs31, fd215; +} +mov.b32 r265, {rs31, rs31}; +{ 
+mul.f16x2 r263, r16, r265; +} +{ +add.f16x2 r266, r242, r263; +} +{ +cvt.rn.f16.f64 rs32, fd216; +} +mov.b32 r271, {rs32, rs32}; +{ +mul.f16x2 r269, r19, r271; +} +{ +add.f16x2 r272, r248, r269; +} +{ +cvt.rn.f16.f64 rs33, fd219; +} +mov.b32 r277, {rs33, rs33}; +{ +mul.f16x2 r275, r25, r277; +} +{ +add.f16x2 r278, r254, r275; +} +mov.f64 fd148, 0dBFD207E7FD768DBF; +{ +cvt.rn.f16.f64 rs34, fd148; +} +mov.b32 r283, {rs34, rs34}; +{ +mul.f16x2 r281, r34, r283; +} +{ +add.f16x2 r284, r260, r281; +} +{ +cvt.rn.f16.f64 rs35, fd219; +} +mov.b32 r289, {rs35, rs35}; +{ +mul.f16x2 r287, r28, r289; +} +{ +add.f16x2 r290, r266, r287; +} +{ +cvt.rn.f16.f64 rs36, fd148; +} +mov.b32 r295, {rs36, rs36}; +{ +mul.f16x2 r293, r31, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs37, fd211; +} +mov.b32 r301, {rs37, rs37}; +{ +mul.f16x2 r299, r37, r301; +} +{ +add.f16x2 r302, r278, r299; +} +mov.f64 fd168, 0dBFEFAC9E043842EF; +{ +cvt.rn.f16.f64 rs38, fd168; +} +mov.b32 r307, {rs38, rs38}; +{ +mul.f16x2 r305, r46, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs39, fd211; +} +mov.b32 r313, {rs39, rs39}; +{ +mul.f16x2 r311, r40, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs40, fd168; +} +mov.b32 r319, {rs40, rs40}; +{ +mul.f16x2 r317, r43, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs41, fd203; +} +mov.b32 r325, {rs41, rs41}; +{ +mul.f16x2 r323, r49, r325; +} +{ +add.f16x2 r326, r302, r323; +} +mov.f64 fd188, 0dBFE14CEDF8BB580B; +{ +cvt.rn.f16.f64 rs42, fd188; +} +mov.b32 r331, {rs42, rs42}; +{ +mul.f16x2 r329, r58, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs43, fd203; +} +mov.b32 r337, {rs43, rs43}; +{ +mul.f16x2 r335, r52, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs44, fd188; +} +mov.b32 r343, {rs44, rs44}; +{ +mul.f16x2 r341, r55, r343; +} +{ +add.f16x2 r344, r320, r341; +} +{ +sub.f16x2 r347, r326, r332; +} +{ +add.f16x2 r350, r338, r344; +} +{ +add.f16x2 r353, r326, 
r332; +} +{ +sub.f16x2 r356, r338, r344; +} +cvt.rn.f16.s32 rs45, r1388; +mov.b32 r371, {rs45, rs45}; +cvt.rn.f16.s32 rs46, r1388; +mov.b32 r383, {rs46, rs46}; +{ +cvt.rn.f16.f64 rs47, fd211; +} +mov.b32 r363, {rs47, rs47}; +{ +mul.f16x2 r361, r1, r363; +} +{ +add.f16x2 r364, %44, r361; +} +{ +cvt.rn.f16.f64 rs48, fd212; +} +mov.b32 r369, {rs48, rs48}; +{ +mul.f16x2 r367, r10, r369; +} +{ +add.f16x2 r370, r371, r367; +} +{ +cvt.rn.f16.f64 rs49, fd211; +} +mov.b32 r375, {rs49, rs49}; +{ +mul.f16x2 r373, r4, r375; +} +{ +add.f16x2 r376, %45, r373; +} +{ +cvt.rn.f16.f64 rs50, fd212; +} +mov.b32 r381, {rs50, rs50}; +{ +mul.f16x2 r379, r7, r381; +} +{ +add.f16x2 r382, r383, r379; +} +{ +cvt.rn.f16.f64 rs51, fd219; +} +mov.b32 r387, {rs51, rs51}; +{ +mul.f16x2 r385, r13, r387; +} +{ +add.f16x2 r388, r364, r385; +} +{ +cvt.rn.f16.f64 rs52, fd148; +} +mov.b32 r393, {rs52, rs52}; +{ +mul.f16x2 r391, r22, r393; +} +{ +add.f16x2 r394, r370, r391; +} +{ +cvt.rn.f16.f64 rs53, fd219; +} +mov.b32 r399, {rs53, rs53}; +{ +mul.f16x2 r397, r16, r399; +} +{ +add.f16x2 r400, r376, r397; +} +{ +cvt.rn.f16.f64 rs54, fd148; +} +mov.b32 r405, {rs54, rs54}; +{ +mul.f16x2 r403, r19, r405; +} +{ +add.f16x2 r406, r382, r403; +} +{ +cvt.rn.f16.f64 rs55, fd207; +} +mov.b32 r411, {rs55, rs55}; +{ +mul.f16x2 r409, r25, r411; +} +{ +add.f16x2 r412, r388, r409; +} +mov.f64 fd196, 0dBFED1BB48EEE2C13; +{ +cvt.rn.f16.f64 rs56, fd196; +} +mov.b32 r417, {rs56, rs56}; +{ +mul.f16x2 r415, r34, r417; +} +{ +add.f16x2 r418, r394, r415; +} +{ +cvt.rn.f16.f64 rs57, fd207; +} +mov.b32 r423, {rs57, rs57}; +{ +mul.f16x2 r421, r28, r423; +} +{ +add.f16x2 r424, r400, r421; +} +{ +cvt.rn.f16.f64 rs58, fd196; +} +mov.b32 r429, {rs58, rs58}; +{ +mul.f16x2 r427, r31, r429; +} +{ +add.f16x2 r430, r406, r427; +} +{ +cvt.rn.f16.f64 rs59, fd203; +} +mov.b32 r435, {rs59, rs59}; +{ +mul.f16x2 r433, r37, r435; +} +{ +add.f16x2 r436, r412, r433; +} +{ +cvt.rn.f16.f64 rs60, fd218; +} +mov.b32 r441, {rs60, rs60}; +{ +mul.f16x2 
r439, r46, r441; +} +{ +add.f16x2 r442, r418, r439; +} +{ +cvt.rn.f16.f64 rs61, fd203; +} +mov.b32 r447, {rs61, rs61}; +{ +mul.f16x2 r445, r40, r447; +} +{ +add.f16x2 r448, r424, r445; +} +{ +cvt.rn.f16.f64 rs62, fd218; +} +mov.b32 r453, {rs62, rs62}; +{ +mul.f16x2 r451, r43, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs63, fd215; +} +mov.b32 r459, {rs63, rs63}; +{ +mul.f16x2 r457, r49, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs64, fd216; +} +mov.b32 r465, {rs64, rs64}; +{ +mul.f16x2 r463, r58, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs65, fd215; +} +mov.b32 r471, {rs65, rs65}; +{ +mul.f16x2 r469, r52, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs66, fd216; +} +mov.b32 r477, {rs66, rs66}; +{ +mul.f16x2 r475, r55, r477; +} +{ +add.f16x2 r478, r454, r475; +} +{ +sub.f16x2 r481, r460, r466; +} +{ +add.f16x2 r484, r472, r478; +} +{ +add.f16x2 r487, r460, r466; +} +{ +sub.f16x2 r490, r472, r478; +} +cvt.rn.f16.s32 rs67, r1388; +mov.b32 r505, {rs67, rs67}; +cvt.rn.f16.s32 rs68, r1388; +mov.b32 r517, {rs68, rs68}; +{ +cvt.rn.f16.f64 rs69, fd215; +} +mov.b32 r497, {rs69, rs69}; +{ +mul.f16x2 r495, r1, r497; +} +{ +add.f16x2 r498, %44, r495; +} +{ +cvt.rn.f16.f64 rs70, fd216; +} +mov.b32 r503, {rs70, rs70}; +{ +mul.f16x2 r501, r10, r503; +} +{ +add.f16x2 r504, r505, r501; +} +{ +cvt.rn.f16.f64 rs71, fd215; +} +mov.b32 r509, {rs71, rs71}; +{ +mul.f16x2 r507, r4, r509; +} +{ +add.f16x2 r510, %45, r507; +} +{ +cvt.rn.f16.f64 rs72, fd216; +} +mov.b32 r515, {rs72, rs72}; +{ +mul.f16x2 r513, r7, r515; +} +{ +add.f16x2 r516, r517, r513; +} +{ +cvt.rn.f16.f64 rs73, fd211; +} +mov.b32 r521, {rs73, rs73}; +{ +mul.f16x2 r519, r13, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs74, fd168; +} +mov.b32 r527, {rs74, rs74}; +{ +mul.f16x2 r525, r22, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs75, fd211; +} +mov.b32 r533, {rs75, rs75}; +{ +mul.f16x2 r531, r16, r533; +} 
+{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs76, fd168; +} +mov.b32 r539, {rs76, rs76}; +{ +mul.f16x2 r537, r19, r539; +} +{ +add.f16x2 r540, r516, r537; +} +{ +cvt.rn.f16.f64 rs77, fd203; +} +mov.b32 r545, {rs77, rs77}; +{ +mul.f16x2 r543, r25, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs78, fd218; +} +mov.b32 r551, {rs78, rs78}; +{ +mul.f16x2 r549, r34, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs79, fd203; +} +mov.b32 r557, {rs79, rs79}; +{ +mul.f16x2 r555, r28, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs80, fd218; +} +mov.b32 r563, {rs80, rs80}; +{ +mul.f16x2 r561, r31, r563; +} +{ +add.f16x2 r564, r540, r561; +} +{ +cvt.rn.f16.f64 rs81, fd219; +} +mov.b32 r569, {rs81, rs81}; +{ +mul.f16x2 r567, r37, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs82, fd220; +} +mov.b32 r575, {rs82, rs82}; +{ +mul.f16x2 r573, r46, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs83, fd219; +} +mov.b32 r581, {rs83, rs83}; +{ +mul.f16x2 r579, r40, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs84, fd220; +} +mov.b32 r587, {rs84, rs84}; +{ +mul.f16x2 r585, r43, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +cvt.rn.f16.f64 rs85, fd207; +} +mov.b32 r593, {rs85, rs85}; +{ +mul.f16x2 r591, r49, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ +cvt.rn.f16.f64 rs86, fd196; +} +mov.b32 r599, {rs86, rs86}; +{ +mul.f16x2 r597, r58, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs87, fd207; +} +mov.b32 r605, {rs87, rs87}; +{ +mul.f16x2 r603, r52, r605; +} +{ +add.f16x2 r606, r582, r603; +} +{ +cvt.rn.f16.f64 rs88, fd196; +} +mov.b32 r611, {rs88, rs88}; +{ +mul.f16x2 r609, r55, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +sub.f16x2 r615, r594, r600; +} +{ +add.f16x2 r618, r606, r612; +} +{ +add.f16x2 r621, r594, r600; +} +{ +sub.f16x2 r624, r606, r612; +} +cvt.rn.f16.s32 rs89, r1388; +mov.b32 r639, {rs89, rs89}; +cvt.rn.f16.s32 rs90, r1388; +mov.b32 
r651, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd219; +} +mov.b32 r631, {rs91, rs91}; +{ +mul.f16x2 r629, r1, r631; +} +{ +add.f16x2 r632, %44, r629; +} +{ +cvt.rn.f16.f64 rs92, fd220; +} +mov.b32 r637, {rs92, rs92}; +{ +mul.f16x2 r635, r10, r637; +} +{ +add.f16x2 r638, r639, r635; +} +{ +cvt.rn.f16.f64 rs93, fd219; +} +mov.b32 r643, {rs93, rs93}; +{ +mul.f16x2 r641, r4, r643; +} +{ +add.f16x2 r644, %45, r641; +} +{ +cvt.rn.f16.f64 rs94, fd220; +} +mov.b32 r649, {rs94, rs94}; +{ +mul.f16x2 r647, r7, r649; +} +{ +add.f16x2 r650, r651, r647; +} +{ +cvt.rn.f16.f64 rs95, fd203; +} +mov.b32 r655, {rs95, rs95}; +{ +mul.f16x2 r653, r13, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs96, fd188; +} +mov.b32 r661, {rs96, rs96}; +{ +mul.f16x2 r659, r22, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs97, fd203; +} +mov.b32 r667, {rs97, rs97}; +{ +mul.f16x2 r665, r16, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs98, fd188; +} +mov.b32 r673, {rs98, rs98}; +{ +mul.f16x2 r671, r19, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs99, fd215; +} +mov.b32 r679, {rs99, rs99}; +{ +mul.f16x2 r677, r25, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs100, fd216; +} +mov.b32 r685, {rs100, rs100}; +{ +mul.f16x2 r683, r34, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs101, fd215; +} +mov.b32 r691, {rs101, rs101}; +{ +mul.f16x2 r689, r28, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs102, fd216; +} +mov.b32 r697, {rs102, rs102}; +{ +mul.f16x2 r695, r31, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs103, fd207; +} +mov.b32 r703, {rs103, rs103}; +{ +mul.f16x2 r701, r37, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs104, fd196; +} +mov.b32 r709, {rs104, rs104}; +{ +mul.f16x2 r707, r46, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs105, fd207; +} +mov.b32 r715, {rs105, rs105}; +{ +mul.f16x2 r713, r40, r715; +} +{ 
+add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs106, fd196; +} +mov.b32 r721, {rs106, rs106}; +{ +mul.f16x2 r719, r43, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs107, fd211; +} +mov.b32 r727, {rs107, rs107}; +{ +mul.f16x2 r725, r49, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs108, fd212; +} +mov.b32 r733, {rs108, rs108}; +{ +mul.f16x2 r731, r58, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs109, fd211; +} +mov.b32 r739, {rs109, rs109}; +{ +mul.f16x2 r737, r52, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs110, fd212; +} +mov.b32 r745, {rs110, rs110}; +{ +mul.f16x2 r743, r55, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +sub.f16x2 r749, r728, r734; +} +{ +add.f16x2 r752, r740, r746; +} +{ +add.f16x2 r755, r728, r734; +} +{ +sub.f16x2 r758, r740, r746; +} +{ +add.f16x2 r761, %74, %66; +} +{ +add.f16x2 r764, %77, %71; +} +{ +sub.f16x2 r767, %74, %66; +} +{ +sub.f16x2 r770, %77, %71; +} +{ +add.f16x2 r773, %84, %80; +} +{ +add.f16x2 r776, %87, %82; +} +{ +sub.f16x2 r779, %84, %80; +} +{ +sub.f16x2 r782, %87, %82; +} +{ +add.f16x2 r785, %75, %68; +} +{ +add.f16x2 r788, %78, %72; +} +{ +sub.f16x2 r791, %75, %68; +} +{ +sub.f16x2 r794, %78, %72; +} +{ +add.f16x2 r797, %86, %81; +} +{ +add.f16x2 r800, %67, %83; +} +{ +sub.f16x2 r803, %86, %81; +} +{ +sub.f16x2 r806, %67, %83; +} +{ +add.f16x2 r809, %76, %70; +} +{ +add.f16x2 r812, %79, %73; +} +{ +sub.f16x2 r815, %76, %70; +} +{ +sub.f16x2 r818, %79, %73; +} +{ +add.f16x2 r821, %85, r761; +} +{ +add.f16x2 r824, %69, r764; +} +{ +add.f16x2 r827, r821, r773; +} +{ +add.f16x2 r830, r824, r776; +} +{ +add.f16x2 r833, r827, r785; +} +{ +add.f16x2 r836, r830, r788; +} +{ +add.f16x2 r839, r833, r797; +} +{ +add.f16x2 r842, r836, r800; +} +{ +add.f16x2 r845, r839, r809; +} +{ +add.f16x2 r848, r842, r812; +} +cvt.rn.f16.s32 rs111, r1388; +mov.b32 r863, {rs111, rs111}; +cvt.rn.f16.s32 rs112, r1388; +mov.b32 r875, {rs112, rs112}; +{ 
+cvt.rn.f16.f64 rs113, fd203; +} +mov.b32 r855, {rs113, rs113}; +{ +mul.f16x2 r853, r761, r855; +} +{ +add.f16x2 r856, %85, r853; +} +{ +cvt.rn.f16.f64 rs114, fd218; +} +mov.b32 r861, {rs114, rs114}; +{ +mul.f16x2 r859, r770, r861; +} +{ +add.f16x2 r862, r863, r859; +} +{ +cvt.rn.f16.f64 rs115, fd203; +} +mov.b32 r867, {rs115, rs115}; +{ +mul.f16x2 r865, r764, r867; +} +{ +add.f16x2 r868, %69, r865; +} +{ +cvt.rn.f16.f64 rs116, fd218; +} +mov.b32 r873, {rs116, rs116}; +{ +mul.f16x2 r871, r767, r873; +} +{ +add.f16x2 r874, r875, r871; +} +{ +cvt.rn.f16.f64 rs117, fd207; +} +mov.b32 r879, {rs117, rs117}; +{ +mul.f16x2 r877, r773, r879; +} +{ +add.f16x2 r880, r856, r877; +} +{ +cvt.rn.f16.f64 rs118, fd214; +} +mov.b32 r885, {rs118, rs118}; +{ +mul.f16x2 r883, r782, r885; +} +{ +add.f16x2 r886, r862, r883; +} +{ +cvt.rn.f16.f64 rs119, fd207; +} +mov.b32 r891, {rs119, rs119}; +{ +mul.f16x2 r889, r776, r891; +} +{ +add.f16x2 r892, r868, r889; +} +{ +cvt.rn.f16.f64 rs120, fd214; +} +mov.b32 r897, {rs120, rs120}; +{ +mul.f16x2 r895, r779, r897; +} +{ +add.f16x2 r898, r874, r895; +} +{ +cvt.rn.f16.f64 rs121, fd211; +} +mov.b32 r903, {rs121, rs121}; +{ +mul.f16x2 r901, r785, r903; +} +{ +add.f16x2 r904, r880, r901; +} +{ +cvt.rn.f16.f64 rs122, fd212; +} +mov.b32 r909, {rs122, rs122}; +{ +mul.f16x2 r907, r794, r909; +} +{ +add.f16x2 r910, r886, r907; +} +{ +cvt.rn.f16.f64 rs123, fd211; +} +mov.b32 r915, {rs123, rs123}; +{ +mul.f16x2 r913, r788, r915; +} +{ +add.f16x2 r916, r892, r913; +} +{ +cvt.rn.f16.f64 rs124, fd212; +} +mov.b32 r921, {rs124, rs124}; +{ +mul.f16x2 r919, r791, r921; +} +{ +add.f16x2 r922, r898, r919; +} +{ +cvt.rn.f16.f64 rs125, fd215; +} +mov.b32 r927, {rs125, rs125}; +{ +mul.f16x2 r925, r797, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs126, fd216; +} +mov.b32 r933, {rs126, rs126}; +{ +mul.f16x2 r931, r806, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs127, fd215; +} +mov.b32 r939, {rs127, rs127}; +{ +mul.f16x2 r937, 
r800, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs128, fd216; +} +mov.b32 r945, {rs128, rs128}; +{ +mul.f16x2 r943, r803, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs129, fd219; +} +mov.b32 r951, {rs129, rs129}; +{ +mul.f16x2 r949, r809, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs130, fd220; +} +mov.b32 r957, {rs130, rs130}; +{ +mul.f16x2 r955, r818, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs131, fd219; +} +mov.b32 r963, {rs131, rs131}; +{ +mul.f16x2 r961, r812, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs132, fd220; +} +mov.b32 r969, {rs132, rs132}; +{ +mul.f16x2 r967, r815, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +sub.f16x2 r973, r952, r958; +} +{ +add.f16x2 r976, r964, r970; +} +{ +add.f16x2 r979, r952, r958; +} +{ +sub.f16x2 r982, r964, r970; +} +cvt.rn.f16.s32 rs133, r1388; +mov.b32 r997, {rs133, rs133}; +cvt.rn.f16.s32 rs134, r1388; +mov.b32 r1009, {rs134, rs134}; +{ +cvt.rn.f16.f64 rs135, fd207; +} +mov.b32 r989, {rs135, rs135}; +{ +mul.f16x2 r987, r761, r989; +} +{ +add.f16x2 r990, %85, r987; +} +{ +cvt.rn.f16.f64 rs136, fd214; +} +mov.b32 r995, {rs136, rs136}; +{ +mul.f16x2 r993, r770, r995; +} +{ +add.f16x2 r996, r997, r993; +} +{ +cvt.rn.f16.f64 rs137, fd207; +} +mov.b32 r1001, {rs137, rs137}; +{ +mul.f16x2 r999, r764, r1001; +} +{ +add.f16x2 r1002, %69, r999; +} +{ +cvt.rn.f16.f64 rs138, fd214; +} +mov.b32 r1007, {rs138, rs138}; +{ +mul.f16x2 r1005, r767, r1007; +} +{ +add.f16x2 r1008, r1009, r1005; +} +{ +cvt.rn.f16.f64 rs139, fd215; +} +mov.b32 r1013, {rs139, rs139}; +{ +mul.f16x2 r1011, r773, r1013; +} +{ +add.f16x2 r1014, r990, r1011; +} +{ +cvt.rn.f16.f64 rs140, fd216; +} +mov.b32 r1019, {rs140, rs140}; +{ +mul.f16x2 r1017, r782, r1019; +} +{ +add.f16x2 r1020, r996, r1017; +} +{ +cvt.rn.f16.f64 rs141, fd215; +} +mov.b32 r1025, {rs141, rs141}; +{ +mul.f16x2 r1023, r776, r1025; +} +{ +add.f16x2 r1026, r1002, r1023; +} +{ +cvt.rn.f16.f64 
rs142, fd216; +} +mov.b32 r1031, {rs142, rs142}; +{ +mul.f16x2 r1029, r779, r1031; +} +{ +add.f16x2 r1032, r1008, r1029; +} +{ +cvt.rn.f16.f64 rs143, fd219; +} +mov.b32 r1037, {rs143, rs143}; +{ +mul.f16x2 r1035, r785, r1037; +} +{ +add.f16x2 r1038, r1014, r1035; +} +{ +cvt.rn.f16.f64 rs144, fd148; +} +mov.b32 r1043, {rs144, rs144}; +{ +mul.f16x2 r1041, r794, r1043; +} +{ +add.f16x2 r1044, r1020, r1041; +} +{ +cvt.rn.f16.f64 rs145, fd219; +} +mov.b32 r1049, {rs145, rs145}; +{ +mul.f16x2 r1047, r788, r1049; +} +{ +add.f16x2 r1050, r1026, r1047; +} +{ +cvt.rn.f16.f64 rs146, fd148; +} +mov.b32 r1055, {rs146, rs146}; +{ +mul.f16x2 r1053, r791, r1055; +} +{ +add.f16x2 r1056, r1032, r1053; +} +{ +cvt.rn.f16.f64 rs147, fd211; +} +mov.b32 r1061, {rs147, rs147}; +{ +mul.f16x2 r1059, r797, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs148, fd168; +} +mov.b32 r1067, {rs148, rs148}; +{ +mul.f16x2 r1065, r806, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +cvt.rn.f16.f64 rs149, fd211; +} +mov.b32 r1073, {rs149, rs149}; +{ +mul.f16x2 r1071, r800, r1073; +} +{ +add.f16x2 r1074, r1050, r1071; +} +{ +cvt.rn.f16.f64 rs150, fd168; +} +mov.b32 r1079, {rs150, rs150}; +{ +mul.f16x2 r1077, r803, r1079; +} +{ +add.f16x2 r1080, r1056, r1077; +} +{ +cvt.rn.f16.f64 rs151, fd203; +} +mov.b32 r1085, {rs151, rs151}; +{ +mul.f16x2 r1083, r809, r1085; +} +{ +add.f16x2 r1086, r1062, r1083; +} +{ +cvt.rn.f16.f64 rs152, fd188; +} +mov.b32 r1091, {rs152, rs152}; +{ +mul.f16x2 r1089, r818, r1091; +} +{ +add.f16x2 r1092, r1068, r1089; +} +{ +cvt.rn.f16.f64 rs153, fd203; +} +mov.b32 r1097, {rs153, rs153}; +{ +mul.f16x2 r1095, r812, r1097; +} +{ +add.f16x2 r1098, r1074, r1095; +} +{ +cvt.rn.f16.f64 rs154, fd188; +} +mov.b32 r1103, {rs154, rs154}; +{ +mul.f16x2 r1101, r815, r1103; +} +{ +add.f16x2 r1104, r1080, r1101; +} +{ +sub.f16x2 r1107, r1086, r1092; +} +{ +add.f16x2 r1110, r1098, r1104; +} +{ +add.f16x2 r1113, r1086, r1092; +} +{ +sub.f16x2 r1116, r1098, r1104; +} 
+cvt.rn.f16.s32 rs155, r1388; +mov.b32 r1131, {rs155, rs155}; +cvt.rn.f16.s32 rs156, r1388; +mov.b32 r1143, {rs156, rs156}; +{ +cvt.rn.f16.f64 rs157, fd211; +} +mov.b32 r1123, {rs157, rs157}; +{ +mul.f16x2 r1121, r761, r1123; +} +{ +add.f16x2 r1124, %85, r1121; +} +{ +cvt.rn.f16.f64 rs158, fd212; +} +mov.b32 r1129, {rs158, rs158}; +{ +mul.f16x2 r1127, r770, r1129; +} +{ +add.f16x2 r1130, r1131, r1127; +} +{ +cvt.rn.f16.f64 rs159, fd211; +} +mov.b32 r1135, {rs159, rs159}; +{ +mul.f16x2 r1133, r764, r1135; +} +{ +add.f16x2 r1136, %69, r1133; +} +{ +cvt.rn.f16.f64 rs160, fd212; +} +mov.b32 r1141, {rs160, rs160}; +{ +mul.f16x2 r1139, r767, r1141; +} +{ +add.f16x2 r1142, r1143, r1139; +} +{ +cvt.rn.f16.f64 rs161, fd219; +} +mov.b32 r1147, {rs161, rs161}; +{ +mul.f16x2 r1145, r773, r1147; +} +{ +add.f16x2 r1148, r1124, r1145; +} +{ +cvt.rn.f16.f64 rs162, fd148; +} +mov.b32 r1153, {rs162, rs162}; +{ +mul.f16x2 r1151, r782, r1153; +} +{ +add.f16x2 r1154, r1130, r1151; +} +{ +cvt.rn.f16.f64 rs163, fd219; +} +mov.b32 r1159, {rs163, rs163}; +{ +mul.f16x2 r1157, r776, r1159; +} +{ +add.f16x2 r1160, r1136, r1157; +} +{ +cvt.rn.f16.f64 rs164, fd148; +} +mov.b32 r1165, {rs164, rs164}; +{ +mul.f16x2 r1163, r779, r1165; +} +{ +add.f16x2 r1166, r1142, r1163; +} +{ +cvt.rn.f16.f64 rs165, fd207; +} +mov.b32 r1171, {rs165, rs165}; +{ +mul.f16x2 r1169, r785, r1171; +} +{ +add.f16x2 r1172, r1148, r1169; +} +{ +cvt.rn.f16.f64 rs166, fd196; +} +mov.b32 r1177, {rs166, rs166}; +{ +mul.f16x2 r1175, r794, r1177; +} +{ +add.f16x2 r1178, r1154, r1175; +} +{ +cvt.rn.f16.f64 rs167, fd207; +} +mov.b32 r1183, {rs167, rs167}; +{ +mul.f16x2 r1181, r788, r1183; +} +{ +add.f16x2 r1184, r1160, r1181; +} +{ +cvt.rn.f16.f64 rs168, fd196; +} +mov.b32 r1189, {rs168, rs168}; +{ +mul.f16x2 r1187, r791, r1189; +} +{ +add.f16x2 r1190, r1166, r1187; +} +{ +cvt.rn.f16.f64 rs169, fd203; +} +mov.b32 r1195, {rs169, rs169}; +{ +mul.f16x2 r1193, r797, r1195; +} +{ +add.f16x2 r1196, r1172, r1193; +} +{ +cvt.rn.f16.f64 
rs170, fd218; +} +mov.b32 r1201, {rs170, rs170}; +{ +mul.f16x2 r1199, r806, r1201; +} +{ +add.f16x2 r1202, r1178, r1199; +} +{ +cvt.rn.f16.f64 rs171, fd203; +} +mov.b32 r1207, {rs171, rs171}; +{ +mul.f16x2 r1205, r800, r1207; +} +{ +add.f16x2 r1208, r1184, r1205; +} +{ +cvt.rn.f16.f64 rs172, fd218; +} +mov.b32 r1213, {rs172, rs172}; +{ +mul.f16x2 r1211, r803, r1213; +} +{ +add.f16x2 r1214, r1190, r1211; +} +{ +cvt.rn.f16.f64 rs173, fd215; +} +mov.b32 r1219, {rs173, rs173}; +{ +mul.f16x2 r1217, r809, r1219; +} +{ +add.f16x2 r1220, r1196, r1217; +} +{ +cvt.rn.f16.f64 rs174, fd216; +} +mov.b32 r1225, {rs174, rs174}; +{ +mul.f16x2 r1223, r818, r1225; +} +{ +add.f16x2 r1226, r1202, r1223; +} +{ +cvt.rn.f16.f64 rs175, fd215; +} +mov.b32 r1231, {rs175, rs175}; +{ +mul.f16x2 r1229, r812, r1231; +} +{ +add.f16x2 r1232, r1208, r1229; +} +{ +cvt.rn.f16.f64 rs176, fd216; +} +mov.b32 r1237, {rs176, rs176}; +{ +mul.f16x2 r1235, r815, r1237; +} +{ +add.f16x2 r1238, r1214, r1235; +} +{ +sub.f16x2 r1241, r1220, r1226; +} +{ +add.f16x2 r1244, r1232, r1238; +} +{ +add.f16x2 r1247, r1220, r1226; +} +{ +sub.f16x2 r1250, r1232, r1238; +} +cvt.rn.f16.s32 rs177, r1388; +mov.b32 r1265, {rs177, rs177}; +cvt.rn.f16.s32 rs178, r1388; +mov.b32 r1277, {rs178, rs178}; +{ +cvt.rn.f16.f64 rs179, fd215; +} +mov.b32 r1257, {rs179, rs179}; +{ +mul.f16x2 r1255, r761, r1257; +} +{ +add.f16x2 r1258, %85, r1255; +} +{ +cvt.rn.f16.f64 rs180, fd216; +} +mov.b32 r1263, {rs180, rs180}; +{ +mul.f16x2 r1261, r770, r1263; +} +{ +add.f16x2 r1264, r1265, r1261; +} +{ +cvt.rn.f16.f64 rs181, fd215; +} +mov.b32 r1269, {rs181, rs181}; +{ +mul.f16x2 r1267, r764, r1269; +} +{ +add.f16x2 r1270, %69, r1267; +} +{ +cvt.rn.f16.f64 rs182, fd216; +} +mov.b32 r1275, {rs182, rs182}; +{ +mul.f16x2 r1273, r767, r1275; +} +{ +add.f16x2 r1276, r1277, r1273; +} +{ +cvt.rn.f16.f64 rs183, fd211; +} +mov.b32 r1281, {rs183, rs183}; +{ +mul.f16x2 r1279, r773, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs184, 
fd168; +} +mov.b32 r1287, {rs184, rs184}; +{ +mul.f16x2 r1285, r782, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs185, fd211; +} +mov.b32 r1293, {rs185, rs185}; +{ +mul.f16x2 r1291, r776, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs186, fd168; +} +mov.b32 r1299, {rs186, rs186}; +{ +mul.f16x2 r1297, r779, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs187, fd203; +} +mov.b32 r1305, {rs187, rs187}; +{ +mul.f16x2 r1303, r785, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs188, fd218; +} +mov.b32 r1311, {rs188, rs188}; +{ +mul.f16x2 r1309, r794, r1311; +} +{ +add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs189, fd203; +} +mov.b32 r1317, {rs189, rs189}; +{ +mul.f16x2 r1315, r788, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs190, fd218; +} +mov.b32 r1323, {rs190, rs190}; +{ +mul.f16x2 r1321, r791, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs191, fd219; +} +mov.b32 r1329, {rs191, rs191}; +{ +mul.f16x2 r1327, r797, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs192, fd220; +} +mov.b32 r1335, {rs192, rs192}; +{ +mul.f16x2 r1333, r806, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs193, fd219; +} +mov.b32 r1341, {rs193, rs193}; +{ +mul.f16x2 r1339, r800, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs194, fd220; +} +mov.b32 r1347, {rs194, rs194}; +{ +mul.f16x2 r1345, r803, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs195, fd207; +} +mov.b32 r1353, {rs195, rs195}; +{ +mul.f16x2 r1351, r809, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs196, fd196; +} +mov.b32 r1359, {rs196, rs196}; +{ +mul.f16x2 r1357, r818, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs197, fd207; +} +mov.b32 r1365, {rs197, rs197}; +{ +mul.f16x2 r1363, r812, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 
rs198, fd196; +} +mov.b32 r1371, {rs198, rs198}; +{ +mul.f16x2 r1369, r815, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +sub.f16x2 r1375, r1354, r1360; +} +{ +add.f16x2 r1378, r1366, r1372; +} +{ +add.f16x2 r1381, r1354, r1360; +} +{ +sub.f16x2 r1384, r1366, r1372; +} +cvt.rn.f16.s32 rs199, r1388; +mov.b32 r1399, {rs199, rs199}; +cvt.rn.f16.s32 rs200, r1388; +mov.b32 r1411, {rs200, rs200}; +{ +cvt.rn.f16.f64 rs201, fd219; +} +mov.b32 r1391, {rs201, rs201}; +{ +mul.f16x2 r1389, r761, r1391; +} +{ +add.f16x2 r1392, %85, r1389; +} +{ +cvt.rn.f16.f64 rs202, fd220; +} +mov.b32 r1397, {rs202, rs202}; +{ +mul.f16x2 r1395, r770, r1397; +} +{ +add.f16x2 r1398, r1399, r1395; +} +{ +cvt.rn.f16.f64 rs203, fd219; +} +mov.b32 r1403, {rs203, rs203}; +{ +mul.f16x2 r1401, r764, r1403; +} +{ +add.f16x2 r1404, %69, r1401; +} +{ +cvt.rn.f16.f64 rs204, fd220; +} +mov.b32 r1409, {rs204, rs204}; +{ +mul.f16x2 r1407, r767, r1409; +} +{ +add.f16x2 r1410, r1411, r1407; +} +{ +cvt.rn.f16.f64 rs205, fd203; +} +mov.b32 r1415, {rs205, rs205}; +{ +mul.f16x2 r1413, r773, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs206, fd188; +} +mov.b32 r1421, {rs206, rs206}; +{ +mul.f16x2 r1419, r782, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs207, fd203; +} +mov.b32 r1427, {rs207, rs207}; +{ +mul.f16x2 r1425, r776, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs208, fd188; +} +mov.b32 r1433, {rs208, rs208}; +{ +mul.f16x2 r1431, r779, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs209, fd215; +} +mov.b32 r1439, {rs209, rs209}; +{ +mul.f16x2 r1437, r785, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs210, fd216; +} +mov.b32 r1445, {rs210, rs210}; +{ +mul.f16x2 r1443, r794, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs211, fd215; +} +mov.b32 r1451, {rs211, rs211}; +{ +mul.f16x2 r1449, r788, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs212, 
fd216; +} +mov.b32 r1457, {rs212, rs212}; +{ +mul.f16x2 r1455, r791, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs213, fd207; +} +mov.b32 r1463, {rs213, rs213}; +{ +mul.f16x2 r1461, r797, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs214, fd196; +} +mov.b32 r1469, {rs214, rs214}; +{ +mul.f16x2 r1467, r806, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs215, fd207; +} +mov.b32 r1475, {rs215, rs215}; +{ +mul.f16x2 r1473, r800, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs216, fd196; +} +mov.b32 r1481, {rs216, rs216}; +{ +mul.f16x2 r1479, r803, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs217, fd211; +} +mov.b32 r1487, {rs217, rs217}; +{ +mul.f16x2 r1485, r809, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs218, fd212; +} +mov.b32 r1493, {rs218, rs218}; +{ +mul.f16x2 r1491, r818, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs219, fd211; +} +mov.b32 r1499, {rs219, rs219}; +{ +mul.f16x2 r1497, r812, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs220, fd212; +} +mov.b32 r1505, {rs220, rs220}; +{ +mul.f16x2 r1503, r815, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +sub.f16x2 r1509, r1488, r1494; +} +{ +add.f16x2 r1512, r1500, r1506; +} +{ +add.f16x2 r1515, r1488, r1494; +} +{ +sub.f16x2 r1518, r1500, r1506; +} +mov.f64 fd201, 0d3FEEB42A9BCD5057; +{ +cvt.rn.f16.f64 rs221, fd201; +} +{ +cvt.rn.f16.f64 rs222, fd220; +} +{ +cvt.rn.f16.f64 rs223, fd203; +} +{ +cvt.rn.f16.f64 rs224, fd218; +} +mov.f64 fd205, 0d3FE4F49E7F775887; +{ +cvt.rn.f16.f64 rs225, fd205; +} +{ +cvt.rn.f16.f64 rs226, fd216; +} +{ +cvt.rn.f16.f64 rs227, fd207; +} +{ +cvt.rn.f16.f64 rs228, fd214; +} +mov.f64 fd209, 0d3FC2375F640F44DB; +{ +cvt.rn.f16.f64 rs229, fd209; +} +{ +cvt.rn.f16.f64 rs230, fd212; +} +{ +cvt.rn.f16.f64 rs231, fd211; +} +{ +cvt.rn.f16.f64 rs232, fd212; +} +mov.f64 fd213, 0dBFDA9628D9C712B6; +{ 
+cvt.rn.f16.f64 rs233, fd213; +} +{ +cvt.rn.f16.f64 rs234, fd214; +} +{ +cvt.rn.f16.f64 rs235, fd215; +} +{ +cvt.rn.f16.f64 rs236, fd216; +} +mov.f64 fd217, 0dBFEAEB8C8764F0BA; +{ +cvt.rn.f16.f64 rs237, fd217; +} +{ +cvt.rn.f16.f64 rs238, fd218; +} +{ +cvt.rn.f16.f64 rs239, fd219; +} +{ +cvt.rn.f16.f64 rs240, fd220; +} +mov.b32 r1535, {rs221, rs221}; +{ +mul.f16x2 r1521, r973, r1535; +} +mov.b32 r1532, {rs222, rs222}; +{ +mul.f16x2 r1524, r976, r1532; +} +{ +sub.f16x2 r1527, r1521, r1524; +} +{ +mul.f16x2 r1530, r973, r1532; +} +{ +fma.rn.f16x2 r1533, r976, r1535, r1530; +} +mov.b32 r1551, {rs223, rs223}; +{ +mul.f16x2 r1537, r1107, r1551; +} +mov.b32 r1548, {rs224, rs224}; +{ +mul.f16x2 r1540, r1110, r1548; +} +{ +sub.f16x2 r1543, r1537, r1540; +} +{ +mul.f16x2 r1546, r1107, r1548; +} +{ +fma.rn.f16x2 r1549, r1110, r1551, r1546; +} +mov.b32 r1567, {rs225, rs225}; +{ +mul.f16x2 r1553, r1241, r1567; +} +mov.b32 r1564, {rs226, rs226}; +{ +mul.f16x2 r1556, r1244, r1564; +} +{ +sub.f16x2 r1559, r1553, r1556; +} +{ +mul.f16x2 r1562, r1241, r1564; +} +{ +fma.rn.f16x2 r1565, r1244, r1567, r1562; +} +mov.b32 r1583, {rs227, rs227}; +{ +mul.f16x2 r1569, r1375, r1583; +} +mov.b32 r1580, {rs228, rs228}; +{ +mul.f16x2 r1572, r1378, r1580; +} +{ +sub.f16x2 r1575, r1569, r1572; +} +{ +mul.f16x2 r1578, r1375, r1580; +} +{ +fma.rn.f16x2 r1581, r1378, r1583, r1578; +} +mov.b32 r1599, {rs229, rs229}; +{ +mul.f16x2 r1585, r1509, r1599; +} +mov.b32 r1596, {rs230, rs230}; +{ +mul.f16x2 r1588, r1512, r1596; +} +{ +sub.f16x2 r1591, r1585, r1588; +} +{ +mul.f16x2 r1594, r1509, r1596; +} +{ +fma.rn.f16x2 r1597, r1512, r1599, r1594; +} +mov.b32 r1615, {rs231, rs231}; +{ +mul.f16x2 r1601, r1515, r1615; +} +mov.b32 r1612, {rs232, rs232}; +{ +mul.f16x2 r1604, r1518, r1612; +} +{ +sub.f16x2 r1607, r1601, r1604; +} +{ +mul.f16x2 r1610, r1515, r1612; +} +{ +fma.rn.f16x2 r1613, r1518, r1615, r1610; +} +mov.b32 r1631, {rs233, rs233}; +{ +mul.f16x2 r1617, r1381, r1631; +} +mov.b32 r1628, {rs234, 
rs234}; +{ +mul.f16x2 r1620, r1384, r1628; +} +{ +sub.f16x2 r1623, r1617, r1620; +} +{ +mul.f16x2 r1626, r1381, r1628; +} +{ +fma.rn.f16x2 r1629, r1384, r1631, r1626; +} +mov.b32 r1647, {rs235, rs235}; +{ +mul.f16x2 r1633, r1247, r1647; +} +mov.b32 r1644, {rs236, rs236}; +{ +mul.f16x2 r1636, r1250, r1644; +} +{ +sub.f16x2 r1639, r1633, r1636; +} +{ +mul.f16x2 r1642, r1247, r1644; +} +{ +fma.rn.f16x2 r1645, r1250, r1647, r1642; +} +mov.b32 r1663, {rs237, rs237}; +{ +mul.f16x2 r1649, r1113, r1663; +} +mov.b32 r1660, {rs238, rs238}; +{ +mul.f16x2 r1652, r1116, r1660; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r1113, r1660; +} +{ +fma.rn.f16x2 r1661, r1116, r1663, r1658; +} +mov.b32 r1679, {rs239, rs239}; +{ +mul.f16x2 r1665, r979, r1679; +} +mov.b32 r1676, {rs240, rs240}; +{ +mul.f16x2 r1668, r982, r1676; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r979, r1676; +} +{ +fma.rn.f16x2 r1677, r982, r1679, r1674; +} +{ +add.f16x2 %0, r85, r845; +} +{ +add.f16x2 %1, r88, r848; +} +{ +sub.f16x2 %22, r85, r845; +} +{ +sub.f16x2 %23, r88, r848; +} +{ +add.f16x2 %2, r213, r1527; +} +{ +add.f16x2 %3, r216, r1533; +} +{ +sub.f16x2 %24, r213, r1527; +} +{ +sub.f16x2 %25, r216, r1533; +} +{ +add.f16x2 %4, r347, r1543; +} +{ +add.f16x2 %5, r350, r1549; +} +{ +sub.f16x2 %26, r347, r1543; +} +{ +sub.f16x2 %27, r350, r1549; +} +{ +add.f16x2 %6, r481, r1559; +} +{ +add.f16x2 %7, r484, r1565; +} +{ +sub.f16x2 %28, r481, r1559; +} +{ +sub.f16x2 %29, r484, r1565; +} +{ +add.f16x2 %8, r615, r1575; +} +{ +add.f16x2 %9, r618, r1581; +} +{ +sub.f16x2 %30, r615, r1575; +} +{ +sub.f16x2 %31, r618, r1581; +} +{ +add.f16x2 %10, r749, r1591; +} +{ +add.f16x2 %11, r752, r1597; +} +{ +sub.f16x2 %32, r749, r1591; +} +{ +sub.f16x2 %33, r752, r1597; +} +{ +add.f16x2 %12, r755, r1607; +} +{ +add.f16x2 %13, r758, r1613; +} +{ +sub.f16x2 %34, r755, r1607; +} +{ +sub.f16x2 %35, r758, r1613; +} +{ +add.f16x2 %14, r621, r1623; +} +{ +add.f16x2 %15, r624, r1629; +} +{ 
+sub.f16x2 %36, r621, r1623; +} +{ +sub.f16x2 %37, r624, r1629; +} +{ +add.f16x2 %16, r487, r1639; +} +{ +add.f16x2 %17, r490, r1645; +} +{ +sub.f16x2 %38, r487, r1639; +} +{ +sub.f16x2 %39, r490, r1645; +} +{ +add.f16x2 %18, r353, r1655; +} +{ +add.f16x2 %19, r356, r1661; +} +{ +sub.f16x2 %40, r353, r1655; +} +{ +sub.f16x2 %41, r356, r1661; +} +{ +add.f16x2 %20, r219, r1671; +} +{ +add.f16x2 %21, r222, r1677; +} +{ +sub.f16x2 %42, r219, r1671; +} +{ +sub.f16x2 %43, r222, r1677; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..175d87eba515a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_fwd.hpp.inc @@ -0,0 +1,410 @@ +#ifndef CUFFTDX_FFT_22_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_22_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<8, float, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<483>; +.reg .b64 rd<2>; +add.f32 f89, %49, %97; +add.f32 f90, %51, %99; +sub.f32 f91, %49, %97; +sub.f32 f92, %51, %99; +add.f32 f93, %54, %92; +add.f32 f94, %56, %93; +sub.f32 f95, %54, %92; +sub.f32 f96, %56, %93; +add.f32 f97, %60, %86; +add.f32 f98, %61, %88; +sub.f32 f99, %60, %86; +sub.f32 f100, %61, %88; +add.f32 f101, %65, %81; +add.f32 f102, %67, %83; +sub.f32 f103, %65, %81; +sub.f32 f104, %67, %83; +add.f32 f105, %70, %76; +add.f32 f106, %72, %77; +sub.f32 f107, %70, %76; +sub.f32 f108, %72, %77; +add.f32 f109, %44, f89; +add.f32 f110, %45, f90; +add.f32 f111, f109, f93; +add.f32 f112, f110, f94; +add.f32 f113, f111, f97; +add.f32 f114, f112, f98; +add.f32 f115, f113, f101; +add.f32 f116, f114, f102; +add.f32 f117, f115, f105; +add.f32 f118, f116, f106; +fma.rn.f32 f119, f89, 0f3F575C64, %44; +fma.rn.f32 f120, f92, 0fBF0A6770, 0f00000000; +fma.rn.f32 f121, f90, 0f3F575C64, %45; +fma.rn.f32 f122, f91, 0fBF0A6770, 0f00000000; +fma.rn.f32 f123, f93, 0f3ED4B147, f119; +fma.rn.f32 f124, f96, 0fBF68DDA4, f120; +fma.rn.f32 f125, f94, 0f3ED4B147, f121; +fma.rn.f32 f126, f95, 0fBF68DDA4, f122; +fma.rn.f32 f127, f97, 0fBE11BAFB, f123; +fma.rn.f32 f128, f100, 0fBF7D64F0, f124; +fma.rn.f32 f129, f98, 0fBE11BAFB, f125; +fma.rn.f32 f130, f99, 0fBF7D64F0, f126; +fma.rn.f32 f131, f101, 0fBF27A4F4, f127; +fma.rn.f32 f132, f104, 0fBF4178CE, f128; +fma.rn.f32 f133, f102, 0fBF27A4F4, f129; +fma.rn.f32 f134, f103, 0fBF4178CE, f130; +fma.rn.f32 f135, f105, 0fBF75A155, f131; +fma.rn.f32 f136, f108, 0fBE903F40, f132; +fma.rn.f32 f137, f106, 0fBF75A155, f133; +fma.rn.f32 f138, f107, 0fBE903F40, f134; +sub.f32 f139, f135, f136; +add.f32 f140, f138, f137; +add.f32 f141, f136, f135; +sub.f32 f142, f137, f138; +fma.rn.f32 f143, f89, 0f3ED4B147, %44; +fma.rn.f32 f144, f92, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f145, f90, 0f3ED4B147, %45; +fma.rn.f32 f146, f91, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f147, 
f93, 0fBF27A4F4, f143; +fma.rn.f32 f148, f96, 0fBF4178CE, f144; +fma.rn.f32 f149, f94, 0fBF27A4F4, f145; +fma.rn.f32 f150, f95, 0fBF4178CE, f146; +fma.rn.f32 f151, f97, 0fBF75A155, f147; +fma.rn.f32 f152, f100, 0f3E903F40, f148; +fma.rn.f32 f153, f98, 0fBF75A155, f149; +fma.rn.f32 f154, f99, 0f3E903F40, f150; +fma.rn.f32 f155, f101, 0fBE11BAFB, f151; +fma.rn.f32 f156, f104, 0f3F7D64F0, f152; +fma.rn.f32 f157, f102, 0fBE11BAFB, f153; +fma.rn.f32 f158, f103, 0f3F7D64F0, f154; +fma.rn.f32 f159, f105, 0f3F575C64, f155; +fma.rn.f32 f160, f108, 0f3F0A6770, f156; +fma.rn.f32 f161, f106, 0f3F575C64, f157; +fma.rn.f32 f162, f107, 0f3F0A6770, f158; +sub.f32 f163, f159, f160; +add.f32 f164, f162, f161; +add.f32 f165, f160, f159; +sub.f32 f166, f161, f162; +fma.rn.f32 f167, f89, 0fBE11BAFB, %44; +fma.rn.f32 f168, f92, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f169, f90, 0fBE11BAFB, %45; +fma.rn.f32 f170, f91, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f171, f93, 0fBF75A155, f167; +fma.rn.f32 f172, f96, 0f3E903F40, f168; +fma.rn.f32 f173, f94, 0fBF75A155, f169; +fma.rn.f32 f174, f95, 0f3E903F40, f170; +fma.rn.f32 f175, f97, 0f3ED4B147, f171; +fma.rn.f32 f176, f100, 0f3F68DDA4, f172; +fma.rn.f32 f177, f98, 0f3ED4B147, f173; +fma.rn.f32 f178, f99, 0f3F68DDA4, f174; +fma.rn.f32 f179, f101, 0f3F575C64, f175; +fma.rn.f32 f180, f104, 0fBF0A6770, f176; +fma.rn.f32 f181, f102, 0f3F575C64, f177; +fma.rn.f32 f182, f103, 0fBF0A6770, f178; +fma.rn.f32 f183, f105, 0fBF27A4F4, f179; +fma.rn.f32 f184, f108, 0fBF4178CE, f180; +fma.rn.f32 f185, f106, 0fBF27A4F4, f181; +fma.rn.f32 f186, f107, 0fBF4178CE, f182; +sub.f32 f187, f183, f184; +add.f32 f188, f186, f185; +add.f32 f189, f184, f183; +sub.f32 f190, f185, f186; +fma.rn.f32 f191, f89, 0fBF27A4F4, %44; +fma.rn.f32 f192, f92, 0fBF4178CE, 0f00000000; +fma.rn.f32 f193, f90, 0fBF27A4F4, %45; +fma.rn.f32 f194, f91, 0fBF4178CE, 0f00000000; +fma.rn.f32 f195, f93, 0fBE11BAFB, f191; +fma.rn.f32 f196, f96, 0f3F7D64F0, f192; +fma.rn.f32 f197, f94, 0fBE11BAFB, f193; 
+fma.rn.f32 f198, f95, 0f3F7D64F0, f194; +fma.rn.f32 f199, f97, 0f3F575C64, f195; +fma.rn.f32 f200, f100, 0fBF0A6770, f196; +fma.rn.f32 f201, f98, 0f3F575C64, f197; +fma.rn.f32 f202, f99, 0fBF0A6770, f198; +fma.rn.f32 f203, f101, 0fBF75A155, f199; +fma.rn.f32 f204, f104, 0fBE903F40, f200; +fma.rn.f32 f205, f102, 0fBF75A155, f201; +fma.rn.f32 f206, f103, 0fBE903F40, f202; +fma.rn.f32 f207, f105, 0f3ED4B147, f203; +fma.rn.f32 f208, f108, 0f3F68DDA4, f204; +fma.rn.f32 f209, f106, 0f3ED4B147, f205; +fma.rn.f32 f210, f107, 0f3F68DDA4, f206; +sub.f32 f211, f207, f208; +add.f32 f212, f210, f209; +add.f32 f213, f208, f207; +sub.f32 f214, f209, f210; +fma.rn.f32 f215, f89, 0fBF75A155, %44; +fma.rn.f32 f216, f92, 0fBE903F40, 0f00000000; +fma.rn.f32 f217, f90, 0fBF75A155, %45; +fma.rn.f32 f218, f91, 0fBE903F40, 0f00000000; +fma.rn.f32 f219, f93, 0f3F575C64, f215; +fma.rn.f32 f220, f96, 0f3F0A6770, f216; +fma.rn.f32 f221, f94, 0f3F575C64, f217; +fma.rn.f32 f222, f95, 0f3F0A6770, f218; +fma.rn.f32 f223, f97, 0fBF27A4F4, f219; +fma.rn.f32 f224, f100, 0fBF4178CE, f220; +fma.rn.f32 f225, f98, 0fBF27A4F4, f221; +fma.rn.f32 f226, f99, 0fBF4178CE, f222; +fma.rn.f32 f227, f101, 0f3ED4B147, f223; +fma.rn.f32 f228, f104, 0f3F68DDA4, f224; +fma.rn.f32 f229, f102, 0f3ED4B147, f225; +fma.rn.f32 f230, f103, 0f3F68DDA4, f226; +fma.rn.f32 f231, f105, 0fBE11BAFB, f227; +fma.rn.f32 f232, f108, 0fBF7D64F0, f228; +fma.rn.f32 f233, f106, 0fBE11BAFB, f229; +fma.rn.f32 f234, f107, 0fBF7D64F0, f230; +sub.f32 f235, f231, f232; +add.f32 f236, f234, f233; +add.f32 f237, f232, f231; +sub.f32 f238, f233, f234; +add.f32 f239, %52, %100; +add.f32 f240, %53, %101; +sub.f32 f241, %52, %100; +sub.f32 f242, %53, %101; +add.f32 f243, %57, %94; +add.f32 f244, %59, %96; +sub.f32 f245, %57, %94; +sub.f32 f246, %59, %96; +add.f32 f247, %62, %89; +add.f32 f248, %64, %91; +sub.f32 f249, %62, %89; +sub.f32 f250, %64, %91; +add.f32 f251, %68, %84; +add.f32 f252, %69, %85; +sub.f32 f253, %68, %84; +sub.f32 f254, %69, 
%85; +add.f32 f255, %73, %78; +add.f32 f256, %75, %80; +sub.f32 f257, %73, %78; +sub.f32 f258, %75, %80; +add.f32 f259, %46, f239; +add.f32 f260, %48, f240; +add.f32 f261, f259, f243; +add.f32 f262, f260, f244; +add.f32 f263, f261, f247; +add.f32 f264, f262, f248; +add.f32 f265, f263, f251; +add.f32 f266, f264, f252; +add.f32 f267, f265, f255; +add.f32 f268, f266, f256; +fma.rn.f32 f269, f239, 0f3F575C64, %46; +fma.rn.f32 f270, f242, 0fBF0A6770, 0f00000000; +fma.rn.f32 f271, f240, 0f3F575C64, %48; +fma.rn.f32 f272, f241, 0fBF0A6770, 0f00000000; +fma.rn.f32 f273, f243, 0f3ED4B147, f269; +fma.rn.f32 f274, f246, 0fBF68DDA4, f270; +fma.rn.f32 f275, f244, 0f3ED4B147, f271; +fma.rn.f32 f276, f245, 0fBF68DDA4, f272; +fma.rn.f32 f277, f247, 0fBE11BAFB, f273; +fma.rn.f32 f278, f250, 0fBF7D64F0, f274; +fma.rn.f32 f279, f248, 0fBE11BAFB, f275; +fma.rn.f32 f280, f249, 0fBF7D64F0, f276; +fma.rn.f32 f281, f251, 0fBF27A4F4, f277; +fma.rn.f32 f282, f254, 0fBF4178CE, f278; +fma.rn.f32 f283, f252, 0fBF27A4F4, f279; +fma.rn.f32 f284, f253, 0fBF4178CE, f280; +fma.rn.f32 f285, f255, 0fBF75A155, f281; +fma.rn.f32 f286, f258, 0fBE903F40, f282; +fma.rn.f32 f287, f256, 0fBF75A155, f283; +fma.rn.f32 f288, f257, 0fBE903F40, f284; +sub.f32 f289, f285, f286; +add.f32 f290, f288, f287; +add.f32 f291, f286, f285; +sub.f32 f292, f287, f288; +fma.rn.f32 f293, f239, 0f3ED4B147, %46; +fma.rn.f32 f294, f242, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f295, f240, 0f3ED4B147, %48; +fma.rn.f32 f296, f241, 0fBF68DDA4, 0f00000000; +fma.rn.f32 f297, f243, 0fBF27A4F4, f293; +fma.rn.f32 f298, f246, 0fBF4178CE, f294; +fma.rn.f32 f299, f244, 0fBF27A4F4, f295; +fma.rn.f32 f300, f245, 0fBF4178CE, f296; +fma.rn.f32 f301, f247, 0fBF75A155, f297; +fma.rn.f32 f302, f250, 0f3E903F40, f298; +fma.rn.f32 f303, f248, 0fBF75A155, f299; +fma.rn.f32 f304, f249, 0f3E903F40, f300; +fma.rn.f32 f305, f251, 0fBE11BAFB, f301; +fma.rn.f32 f306, f254, 0f3F7D64F0, f302; +fma.rn.f32 f307, f252, 0fBE11BAFB, f303; +fma.rn.f32 f308, f253, 
0f3F7D64F0, f304; +fma.rn.f32 f309, f255, 0f3F575C64, f305; +fma.rn.f32 f310, f258, 0f3F0A6770, f306; +fma.rn.f32 f311, f256, 0f3F575C64, f307; +fma.rn.f32 f312, f257, 0f3F0A6770, f308; +sub.f32 f313, f309, f310; +add.f32 f314, f312, f311; +add.f32 f315, f310, f309; +sub.f32 f316, f311, f312; +fma.rn.f32 f317, f239, 0fBE11BAFB, %46; +fma.rn.f32 f318, f242, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f319, f240, 0fBE11BAFB, %48; +fma.rn.f32 f320, f241, 0fBF7D64F0, 0f00000000; +fma.rn.f32 f321, f243, 0fBF75A155, f317; +fma.rn.f32 f322, f246, 0f3E903F40, f318; +fma.rn.f32 f323, f244, 0fBF75A155, f319; +fma.rn.f32 f324, f245, 0f3E903F40, f320; +fma.rn.f32 f325, f247, 0f3ED4B147, f321; +fma.rn.f32 f326, f250, 0f3F68DDA4, f322; +fma.rn.f32 f327, f248, 0f3ED4B147, f323; +fma.rn.f32 f328, f249, 0f3F68DDA4, f324; +fma.rn.f32 f329, f251, 0f3F575C64, f325; +fma.rn.f32 f330, f254, 0fBF0A6770, f326; +fma.rn.f32 f331, f252, 0f3F575C64, f327; +fma.rn.f32 f332, f253, 0fBF0A6770, f328; +fma.rn.f32 f333, f255, 0fBF27A4F4, f329; +fma.rn.f32 f334, f258, 0fBF4178CE, f330; +fma.rn.f32 f335, f256, 0fBF27A4F4, f331; +fma.rn.f32 f336, f257, 0fBF4178CE, f332; +sub.f32 f337, f333, f334; +add.f32 f338, f336, f335; +add.f32 f339, f334, f333; +sub.f32 f340, f335, f336; +fma.rn.f32 f341, f239, 0fBF27A4F4, %46; +fma.rn.f32 f342, f242, 0fBF4178CE, 0f00000000; +fma.rn.f32 f343, f240, 0fBF27A4F4, %48; +fma.rn.f32 f344, f241, 0fBF4178CE, 0f00000000; +fma.rn.f32 f345, f243, 0fBE11BAFB, f341; +fma.rn.f32 f346, f246, 0f3F7D64F0, f342; +fma.rn.f32 f347, f244, 0fBE11BAFB, f343; +fma.rn.f32 f348, f245, 0f3F7D64F0, f344; +fma.rn.f32 f349, f247, 0f3F575C64, f345; +fma.rn.f32 f350, f250, 0fBF0A6770, f346; +fma.rn.f32 f351, f248, 0f3F575C64, f347; +fma.rn.f32 f352, f249, 0fBF0A6770, f348; +fma.rn.f32 f353, f251, 0fBF75A155, f349; +fma.rn.f32 f354, f254, 0fBE903F40, f350; +fma.rn.f32 f355, f252, 0fBF75A155, f351; +fma.rn.f32 f356, f253, 0fBE903F40, f352; +fma.rn.f32 f357, f255, 0f3ED4B147, f353; +fma.rn.f32 f358, f258, 
0f3F68DDA4, f354; +fma.rn.f32 f359, f256, 0f3ED4B147, f355; +fma.rn.f32 f360, f257, 0f3F68DDA4, f356; +sub.f32 f361, f357, f358; +add.f32 f362, f360, f359; +add.f32 f363, f358, f357; +sub.f32 f364, f359, f360; +fma.rn.f32 f365, f239, 0fBF75A155, %46; +fma.rn.f32 f366, f242, 0fBE903F40, 0f00000000; +fma.rn.f32 f367, f240, 0fBF75A155, %48; +fma.rn.f32 f368, f241, 0fBE903F40, 0f00000000; +fma.rn.f32 f369, f243, 0f3F575C64, f365; +fma.rn.f32 f370, f246, 0f3F0A6770, f366; +fma.rn.f32 f371, f244, 0f3F575C64, f367; +fma.rn.f32 f372, f245, 0f3F0A6770, f368; +fma.rn.f32 f373, f247, 0fBF27A4F4, f369; +fma.rn.f32 f374, f250, 0fBF4178CE, f370; +fma.rn.f32 f375, f248, 0fBF27A4F4, f371; +fma.rn.f32 f376, f249, 0fBF4178CE, f372; +fma.rn.f32 f377, f251, 0f3ED4B147, f373; +fma.rn.f32 f378, f254, 0f3F68DDA4, f374; +fma.rn.f32 f379, f252, 0f3ED4B147, f375; +fma.rn.f32 f380, f253, 0f3F68DDA4, f376; +fma.rn.f32 f381, f255, 0fBE11BAFB, f377; +fma.rn.f32 f382, f258, 0fBF7D64F0, f378; +fma.rn.f32 f383, f256, 0fBE11BAFB, f379; +fma.rn.f32 f384, f257, 0fBF7D64F0, f380; +sub.f32 f385, f381, f382; +add.f32 f386, f384, f383; +add.f32 f387, f382, f381; +sub.f32 f388, f383, f384; +mul.f32 f389, f289, 0f3F75A155; +mul.f32 f390, f290, 0fBE903F40; +sub.f32 f391, f389, f390; +mul.f32 f392, f290, 0f3F75A155; +fma.rn.f32 f393, f289, 0fBE903F40, f392; +mul.f32 f394, f313, 0f3F575C64; +mul.f32 f395, f314, 0fBF0A6770; +sub.f32 f396, f394, f395; +mul.f32 f397, f314, 0f3F575C64; +fma.rn.f32 f398, f313, 0fBF0A6770, f397; +mul.f32 f399, f337, 0f3F27A4F4; +mul.f32 f400, f338, 0fBF4178CE; +sub.f32 f401, f399, f400; +mul.f32 f402, f338, 0f3F27A4F4; +fma.rn.f32 f403, f337, 0fBF4178CE, f402; +mul.f32 f404, f361, 0f3ED4B147; +mul.f32 f405, f362, 0fBF68DDA4; +sub.f32 f406, f404, f405; +mul.f32 f407, f362, 0f3ED4B147; +fma.rn.f32 f408, f361, 0fBF68DDA4, f407; +mul.f32 f409, f385, 0f3E11BAFB; +mul.f32 f410, f386, 0fBF7D64F0; +sub.f32 f411, f409, f410; +mul.f32 f412, f386, 0f3E11BAFB; +fma.rn.f32 f413, f385, 
0fBF7D64F0, f412; +mul.f32 f414, f387, 0fBE11BAFB; +mul.f32 f415, f388, 0fBF7D64F0; +sub.f32 f416, f414, f415; +mul.f32 f417, f388, 0fBE11BAFB; +fma.rn.f32 f418, f387, 0fBF7D64F0, f417; +mul.f32 f419, f363, 0fBED4B147; +mul.f32 f420, f364, 0fBF68DDA4; +sub.f32 f421, f419, f420; +mul.f32 f422, f364, 0fBED4B147; +fma.rn.f32 f423, f363, 0fBF68DDA4, f422; +mul.f32 f424, f339, 0fBF27A4F4; +mul.f32 f425, f340, 0fBF4178CE; +sub.f32 f426, f424, f425; +mul.f32 f427, f340, 0fBF27A4F4; +fma.rn.f32 f428, f339, 0fBF4178CE, f427; +mul.f32 f429, f315, 0fBF575C64; +mul.f32 f430, f316, 0fBF0A6770; +sub.f32 f431, f429, f430; +mul.f32 f432, f316, 0fBF575C64; +fma.rn.f32 f433, f315, 0fBF0A6770, f432; +mul.f32 f434, f291, 0fBF75A155; +mul.f32 f435, f292, 0fBE903F40; +sub.f32 f436, f434, f435; +mul.f32 f437, f292, 0fBF75A155; +fma.rn.f32 f438, f291, 0fBE903F40, f437; +add.f32 %1, f118, f268; +add.f32 %0, f117, f267; +add.f32 %3, f140, f393; +add.f32 %2, f139, f391; +add.f32 %5, f164, f398; +add.f32 %4, f163, f396; +add.f32 %7, f188, f403; +add.f32 %6, f187, f401; +add.f32 %9, f212, f408; +add.f32 %8, f211, f406; +add.f32 %11, f236, f413; +add.f32 %10, f235, f411; +add.f32 %13, f238, f418; +add.f32 %12, f237, f416; +add.f32 %15, f214, f423; +add.f32 %14, f213, f421; +add.f32 %17, f190, f428; +add.f32 %16, f189, f426; +add.f32 %19, f166, f433; +add.f32 %18, f165, f431; +add.f32 %21, f142, f438; +add.f32 %20, f141, f436; +sub.f32 %23, f118, f268; +sub.f32 %22, f117, f267; +sub.f32 %25, f140, f393; +sub.f32 %24, f139, f391; +sub.f32 %27, f164, f398; +sub.f32 %26, f163, f396; +sub.f32 %29, f188, f403; +sub.f32 %28, f187, f401; +sub.f32 %31, f212, f408; +sub.f32 %30, f211, f406; +sub.f32 %33, f236, f413; +sub.f32 %32, f235, f411; +sub.f32 %35, f238, f418; +sub.f32 %34, f237, f416; +sub.f32 %37, f214, f423; +sub.f32 %36, f213, f421; +sub.f32 %39, f190, f428; +sub.f32 %38, f189, f426; +sub.f32 %41, f166, f433; +sub.f32 %40, f165, f431; +sub.f32 %43, f142, f438; +sub.f32 %42, f141, f436; +})" + 
: "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..183d875c66a67 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp32_inv.hpp.inc @@ -0,0 +1,410 @@ +#ifndef CUFFTDX_FFT_22_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_22_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<210, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<483>; +.reg .b64 rd<2>; +add.f32 f89, %49, %97; +add.f32 f90, %51, %99; +sub.f32 f91, %49, %97; +sub.f32 f92, %51, %99; +add.f32 f93, %54, %92; +add.f32 f94, %56, %93; +sub.f32 f95, %54, %92; +sub.f32 f96, %56, %93; +add.f32 f97, %60, %86; +add.f32 f98, %61, %88; +sub.f32 f99, %60, %86; +sub.f32 f100, %61, %88; +add.f32 f101, %65, %81; +add.f32 f102, %67, %83; +sub.f32 f103, %65, %81; +sub.f32 f104, %67, %83; +add.f32 f105, %70, %76; +add.f32 f106, %72, %77; +sub.f32 f107, %70, %76; +sub.f32 f108, %72, %77; +add.f32 f109, %44, f89; +add.f32 f110, %45, f90; +add.f32 f111, f109, f93; +add.f32 f112, f110, f94; +add.f32 f113, f111, f97; +add.f32 f114, f112, f98; +add.f32 f115, f113, f101; +add.f32 f116, f114, f102; +add.f32 f117, f115, f105; +add.f32 f118, f116, f106; +fma.rn.f32 f119, f89, 0f3F575C64, %44; +fma.rn.f32 f120, f92, 0f3F0A6770, 0f00000000; +fma.rn.f32 f121, f90, 0f3F575C64, %45; +fma.rn.f32 f122, f91, 0f3F0A6770, 0f00000000; +fma.rn.f32 f123, f93, 0f3ED4B147, f119; +fma.rn.f32 f124, f96, 0f3F68DDA4, f120; +fma.rn.f32 f125, f94, 0f3ED4B147, f121; +fma.rn.f32 f126, f95, 0f3F68DDA4, f122; +fma.rn.f32 f127, f97, 0fBE11BAFB, f123; +fma.rn.f32 f128, f100, 0f3F7D64F0, f124; +fma.rn.f32 f129, f98, 0fBE11BAFB, f125; +fma.rn.f32 f130, f99, 0f3F7D64F0, f126; +fma.rn.f32 f131, f101, 0fBF27A4F4, f127; +fma.rn.f32 f132, f104, 0f3F4178CE, f128; +fma.rn.f32 f133, f102, 0fBF27A4F4, 
f129; +fma.rn.f32 f134, f103, 0f3F4178CE, f130; +fma.rn.f32 f135, f105, 0fBF75A155, f131; +fma.rn.f32 f136, f108, 0f3E903F40, f132; +fma.rn.f32 f137, f106, 0fBF75A155, f133; +fma.rn.f32 f138, f107, 0f3E903F40, f134; +sub.f32 f139, f135, f136; +add.f32 f140, f138, f137; +add.f32 f141, f136, f135; +sub.f32 f142, f137, f138; +fma.rn.f32 f143, f89, 0f3ED4B147, %44; +fma.rn.f32 f144, f92, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f145, f90, 0f3ED4B147, %45; +fma.rn.f32 f146, f91, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f147, f93, 0fBF27A4F4, f143; +fma.rn.f32 f148, f96, 0f3F4178CE, f144; +fma.rn.f32 f149, f94, 0fBF27A4F4, f145; +fma.rn.f32 f150, f95, 0f3F4178CE, f146; +fma.rn.f32 f151, f97, 0fBF75A155, f147; +fma.rn.f32 f152, f100, 0fBE903F40, f148; +fma.rn.f32 f153, f98, 0fBF75A155, f149; +fma.rn.f32 f154, f99, 0fBE903F40, f150; +fma.rn.f32 f155, f101, 0fBE11BAFB, f151; +fma.rn.f32 f156, f104, 0fBF7D64F0, f152; +fma.rn.f32 f157, f102, 0fBE11BAFB, f153; +fma.rn.f32 f158, f103, 0fBF7D64F0, f154; +fma.rn.f32 f159, f105, 0f3F575C64, f155; +fma.rn.f32 f160, f108, 0fBF0A6770, f156; +fma.rn.f32 f161, f106, 0f3F575C64, f157; +fma.rn.f32 f162, f107, 0fBF0A6770, f158; +sub.f32 f163, f159, f160; +add.f32 f164, f162, f161; +add.f32 f165, f160, f159; +sub.f32 f166, f161, f162; +fma.rn.f32 f167, f89, 0fBE11BAFB, %44; +fma.rn.f32 f168, f92, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f169, f90, 0fBE11BAFB, %45; +fma.rn.f32 f170, f91, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f171, f93, 0fBF75A155, f167; +fma.rn.f32 f172, f96, 0fBE903F40, f168; +fma.rn.f32 f173, f94, 0fBF75A155, f169; +fma.rn.f32 f174, f95, 0fBE903F40, f170; +fma.rn.f32 f175, f97, 0f3ED4B147, f171; +fma.rn.f32 f176, f100, 0fBF68DDA4, f172; +fma.rn.f32 f177, f98, 0f3ED4B147, f173; +fma.rn.f32 f178, f99, 0fBF68DDA4, f174; +fma.rn.f32 f179, f101, 0f3F575C64, f175; +fma.rn.f32 f180, f104, 0f3F0A6770, f176; +fma.rn.f32 f181, f102, 0f3F575C64, f177; +fma.rn.f32 f182, f103, 0f3F0A6770, f178; +fma.rn.f32 f183, f105, 0fBF27A4F4, f179; +fma.rn.f32 
f184, f108, 0f3F4178CE, f180; +fma.rn.f32 f185, f106, 0fBF27A4F4, f181; +fma.rn.f32 f186, f107, 0f3F4178CE, f182; +sub.f32 f187, f183, f184; +add.f32 f188, f186, f185; +add.f32 f189, f184, f183; +sub.f32 f190, f185, f186; +fma.rn.f32 f191, f89, 0fBF27A4F4, %44; +fma.rn.f32 f192, f92, 0f3F4178CE, 0f00000000; +fma.rn.f32 f193, f90, 0fBF27A4F4, %45; +fma.rn.f32 f194, f91, 0f3F4178CE, 0f00000000; +fma.rn.f32 f195, f93, 0fBE11BAFB, f191; +fma.rn.f32 f196, f96, 0fBF7D64F0, f192; +fma.rn.f32 f197, f94, 0fBE11BAFB, f193; +fma.rn.f32 f198, f95, 0fBF7D64F0, f194; +fma.rn.f32 f199, f97, 0f3F575C64, f195; +fma.rn.f32 f200, f100, 0f3F0A6770, f196; +fma.rn.f32 f201, f98, 0f3F575C64, f197; +fma.rn.f32 f202, f99, 0f3F0A6770, f198; +fma.rn.f32 f203, f101, 0fBF75A155, f199; +fma.rn.f32 f204, f104, 0f3E903F40, f200; +fma.rn.f32 f205, f102, 0fBF75A155, f201; +fma.rn.f32 f206, f103, 0f3E903F40, f202; +fma.rn.f32 f207, f105, 0f3ED4B147, f203; +fma.rn.f32 f208, f108, 0fBF68DDA4, f204; +fma.rn.f32 f209, f106, 0f3ED4B147, f205; +fma.rn.f32 f210, f107, 0fBF68DDA4, f206; +sub.f32 f211, f207, f208; +add.f32 f212, f210, f209; +add.f32 f213, f208, f207; +sub.f32 f214, f209, f210; +fma.rn.f32 f215, f89, 0fBF75A155, %44; +fma.rn.f32 f216, f92, 0f3E903F40, 0f00000000; +fma.rn.f32 f217, f90, 0fBF75A155, %45; +fma.rn.f32 f218, f91, 0f3E903F40, 0f00000000; +fma.rn.f32 f219, f93, 0f3F575C64, f215; +fma.rn.f32 f220, f96, 0fBF0A6770, f216; +fma.rn.f32 f221, f94, 0f3F575C64, f217; +fma.rn.f32 f222, f95, 0fBF0A6770, f218; +fma.rn.f32 f223, f97, 0fBF27A4F4, f219; +fma.rn.f32 f224, f100, 0f3F4178CE, f220; +fma.rn.f32 f225, f98, 0fBF27A4F4, f221; +fma.rn.f32 f226, f99, 0f3F4178CE, f222; +fma.rn.f32 f227, f101, 0f3ED4B147, f223; +fma.rn.f32 f228, f104, 0fBF68DDA4, f224; +fma.rn.f32 f229, f102, 0f3ED4B147, f225; +fma.rn.f32 f230, f103, 0fBF68DDA4, f226; +fma.rn.f32 f231, f105, 0fBE11BAFB, f227; +fma.rn.f32 f232, f108, 0f3F7D64F0, f228; +fma.rn.f32 f233, f106, 0fBE11BAFB, f229; +fma.rn.f32 f234, f107, 
0f3F7D64F0, f230; +sub.f32 f235, f231, f232; +add.f32 f236, f234, f233; +add.f32 f237, f232, f231; +sub.f32 f238, f233, f234; +add.f32 f239, %52, %100; +add.f32 f240, %53, %101; +sub.f32 f241, %52, %100; +sub.f32 f242, %53, %101; +add.f32 f243, %57, %94; +add.f32 f244, %59, %96; +sub.f32 f245, %57, %94; +sub.f32 f246, %59, %96; +add.f32 f247, %62, %89; +add.f32 f248, %64, %91; +sub.f32 f249, %62, %89; +sub.f32 f250, %64, %91; +add.f32 f251, %68, %84; +add.f32 f252, %69, %85; +sub.f32 f253, %68, %84; +sub.f32 f254, %69, %85; +add.f32 f255, %73, %78; +add.f32 f256, %75, %80; +sub.f32 f257, %73, %78; +sub.f32 f258, %75, %80; +add.f32 f259, %46, f239; +add.f32 f260, %48, f240; +add.f32 f261, f259, f243; +add.f32 f262, f260, f244; +add.f32 f263, f261, f247; +add.f32 f264, f262, f248; +add.f32 f265, f263, f251; +add.f32 f266, f264, f252; +add.f32 f267, f265, f255; +add.f32 f268, f266, f256; +fma.rn.f32 f269, f239, 0f3F575C64, %46; +fma.rn.f32 f270, f242, 0f3F0A6770, 0f00000000; +fma.rn.f32 f271, f240, 0f3F575C64, %48; +fma.rn.f32 f272, f241, 0f3F0A6770, 0f00000000; +fma.rn.f32 f273, f243, 0f3ED4B147, f269; +fma.rn.f32 f274, f246, 0f3F68DDA4, f270; +fma.rn.f32 f275, f244, 0f3ED4B147, f271; +fma.rn.f32 f276, f245, 0f3F68DDA4, f272; +fma.rn.f32 f277, f247, 0fBE11BAFB, f273; +fma.rn.f32 f278, f250, 0f3F7D64F0, f274; +fma.rn.f32 f279, f248, 0fBE11BAFB, f275; +fma.rn.f32 f280, f249, 0f3F7D64F0, f276; +fma.rn.f32 f281, f251, 0fBF27A4F4, f277; +fma.rn.f32 f282, f254, 0f3F4178CE, f278; +fma.rn.f32 f283, f252, 0fBF27A4F4, f279; +fma.rn.f32 f284, f253, 0f3F4178CE, f280; +fma.rn.f32 f285, f255, 0fBF75A155, f281; +fma.rn.f32 f286, f258, 0f3E903F40, f282; +fma.rn.f32 f287, f256, 0fBF75A155, f283; +fma.rn.f32 f288, f257, 0f3E903F40, f284; +sub.f32 f289, f285, f286; +add.f32 f290, f288, f287; +add.f32 f291, f286, f285; +sub.f32 f292, f287, f288; +fma.rn.f32 f293, f239, 0f3ED4B147, %46; +fma.rn.f32 f294, f242, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f295, f240, 0f3ED4B147, %48; +fma.rn.f32 
f296, f241, 0f3F68DDA4, 0f00000000; +fma.rn.f32 f297, f243, 0fBF27A4F4, f293; +fma.rn.f32 f298, f246, 0f3F4178CE, f294; +fma.rn.f32 f299, f244, 0fBF27A4F4, f295; +fma.rn.f32 f300, f245, 0f3F4178CE, f296; +fma.rn.f32 f301, f247, 0fBF75A155, f297; +fma.rn.f32 f302, f250, 0fBE903F40, f298; +fma.rn.f32 f303, f248, 0fBF75A155, f299; +fma.rn.f32 f304, f249, 0fBE903F40, f300; +fma.rn.f32 f305, f251, 0fBE11BAFB, f301; +fma.rn.f32 f306, f254, 0fBF7D64F0, f302; +fma.rn.f32 f307, f252, 0fBE11BAFB, f303; +fma.rn.f32 f308, f253, 0fBF7D64F0, f304; +fma.rn.f32 f309, f255, 0f3F575C64, f305; +fma.rn.f32 f310, f258, 0fBF0A6770, f306; +fma.rn.f32 f311, f256, 0f3F575C64, f307; +fma.rn.f32 f312, f257, 0fBF0A6770, f308; +sub.f32 f313, f309, f310; +add.f32 f314, f312, f311; +add.f32 f315, f310, f309; +sub.f32 f316, f311, f312; +fma.rn.f32 f317, f239, 0fBE11BAFB, %46; +fma.rn.f32 f318, f242, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f319, f240, 0fBE11BAFB, %48; +fma.rn.f32 f320, f241, 0f3F7D64F0, 0f00000000; +fma.rn.f32 f321, f243, 0fBF75A155, f317; +fma.rn.f32 f322, f246, 0fBE903F40, f318; +fma.rn.f32 f323, f244, 0fBF75A155, f319; +fma.rn.f32 f324, f245, 0fBE903F40, f320; +fma.rn.f32 f325, f247, 0f3ED4B147, f321; +fma.rn.f32 f326, f250, 0fBF68DDA4, f322; +fma.rn.f32 f327, f248, 0f3ED4B147, f323; +fma.rn.f32 f328, f249, 0fBF68DDA4, f324; +fma.rn.f32 f329, f251, 0f3F575C64, f325; +fma.rn.f32 f330, f254, 0f3F0A6770, f326; +fma.rn.f32 f331, f252, 0f3F575C64, f327; +fma.rn.f32 f332, f253, 0f3F0A6770, f328; +fma.rn.f32 f333, f255, 0fBF27A4F4, f329; +fma.rn.f32 f334, f258, 0f3F4178CE, f330; +fma.rn.f32 f335, f256, 0fBF27A4F4, f331; +fma.rn.f32 f336, f257, 0f3F4178CE, f332; +sub.f32 f337, f333, f334; +add.f32 f338, f336, f335; +add.f32 f339, f334, f333; +sub.f32 f340, f335, f336; +fma.rn.f32 f341, f239, 0fBF27A4F4, %46; +fma.rn.f32 f342, f242, 0f3F4178CE, 0f00000000; +fma.rn.f32 f343, f240, 0fBF27A4F4, %48; +fma.rn.f32 f344, f241, 0f3F4178CE, 0f00000000; +fma.rn.f32 f345, f243, 0fBE11BAFB, f341; 
+fma.rn.f32 f346, f246, 0fBF7D64F0, f342; +fma.rn.f32 f347, f244, 0fBE11BAFB, f343; +fma.rn.f32 f348, f245, 0fBF7D64F0, f344; +fma.rn.f32 f349, f247, 0f3F575C64, f345; +fma.rn.f32 f350, f250, 0f3F0A6770, f346; +fma.rn.f32 f351, f248, 0f3F575C64, f347; +fma.rn.f32 f352, f249, 0f3F0A6770, f348; +fma.rn.f32 f353, f251, 0fBF75A155, f349; +fma.rn.f32 f354, f254, 0f3E903F40, f350; +fma.rn.f32 f355, f252, 0fBF75A155, f351; +fma.rn.f32 f356, f253, 0f3E903F40, f352; +fma.rn.f32 f357, f255, 0f3ED4B147, f353; +fma.rn.f32 f358, f258, 0fBF68DDA4, f354; +fma.rn.f32 f359, f256, 0f3ED4B147, f355; +fma.rn.f32 f360, f257, 0fBF68DDA4, f356; +sub.f32 f361, f357, f358; +add.f32 f362, f360, f359; +add.f32 f363, f358, f357; +sub.f32 f364, f359, f360; +fma.rn.f32 f365, f239, 0fBF75A155, %46; +fma.rn.f32 f366, f242, 0f3E903F40, 0f00000000; +fma.rn.f32 f367, f240, 0fBF75A155, %48; +fma.rn.f32 f368, f241, 0f3E903F40, 0f00000000; +fma.rn.f32 f369, f243, 0f3F575C64, f365; +fma.rn.f32 f370, f246, 0fBF0A6770, f366; +fma.rn.f32 f371, f244, 0f3F575C64, f367; +fma.rn.f32 f372, f245, 0fBF0A6770, f368; +fma.rn.f32 f373, f247, 0fBF27A4F4, f369; +fma.rn.f32 f374, f250, 0f3F4178CE, f370; +fma.rn.f32 f375, f248, 0fBF27A4F4, f371; +fma.rn.f32 f376, f249, 0f3F4178CE, f372; +fma.rn.f32 f377, f251, 0f3ED4B147, f373; +fma.rn.f32 f378, f254, 0fBF68DDA4, f374; +fma.rn.f32 f379, f252, 0f3ED4B147, f375; +fma.rn.f32 f380, f253, 0fBF68DDA4, f376; +fma.rn.f32 f381, f255, 0fBE11BAFB, f377; +fma.rn.f32 f382, f258, 0f3F7D64F0, f378; +fma.rn.f32 f383, f256, 0fBE11BAFB, f379; +fma.rn.f32 f384, f257, 0f3F7D64F0, f380; +sub.f32 f385, f381, f382; +add.f32 f386, f384, f383; +add.f32 f387, f382, f381; +sub.f32 f388, f383, f384; +mul.f32 f389, f289, 0f3F75A155; +mul.f32 f390, f290, 0f3E903F40; +sub.f32 f391, f389, f390; +mul.f32 f392, f290, 0f3F75A155; +fma.rn.f32 f393, f289, 0f3E903F40, f392; +mul.f32 f394, f313, 0f3F575C64; +mul.f32 f395, f314, 0f3F0A6770; +sub.f32 f396, f394, f395; +mul.f32 f397, f314, 0f3F575C64; 
+fma.rn.f32 f398, f313, 0f3F0A6770, f397; +mul.f32 f399, f337, 0f3F27A4F4; +mul.f32 f400, f338, 0f3F4178CE; +sub.f32 f401, f399, f400; +mul.f32 f402, f338, 0f3F27A4F4; +fma.rn.f32 f403, f337, 0f3F4178CE, f402; +mul.f32 f404, f361, 0f3ED4B147; +mul.f32 f405, f362, 0f3F68DDA4; +sub.f32 f406, f404, f405; +mul.f32 f407, f362, 0f3ED4B147; +fma.rn.f32 f408, f361, 0f3F68DDA4, f407; +mul.f32 f409, f385, 0f3E11BAFB; +mul.f32 f410, f386, 0f3F7D64F0; +sub.f32 f411, f409, f410; +mul.f32 f412, f386, 0f3E11BAFB; +fma.rn.f32 f413, f385, 0f3F7D64F0, f412; +mul.f32 f414, f387, 0fBE11BAFB; +mul.f32 f415, f388, 0f3F7D64F0; +sub.f32 f416, f414, f415; +mul.f32 f417, f388, 0fBE11BAFB; +fma.rn.f32 f418, f387, 0f3F7D64F0, f417; +mul.f32 f419, f363, 0fBED4B147; +mul.f32 f420, f364, 0f3F68DDA4; +sub.f32 f421, f419, f420; +mul.f32 f422, f364, 0fBED4B147; +fma.rn.f32 f423, f363, 0f3F68DDA4, f422; +mul.f32 f424, f339, 0fBF27A4F4; +mul.f32 f425, f340, 0f3F4178CE; +sub.f32 f426, f424, f425; +mul.f32 f427, f340, 0fBF27A4F4; +fma.rn.f32 f428, f339, 0f3F4178CE, f427; +mul.f32 f429, f315, 0fBF575C64; +mul.f32 f430, f316, 0f3F0A6770; +sub.f32 f431, f429, f430; +mul.f32 f432, f316, 0fBF575C64; +fma.rn.f32 f433, f315, 0f3F0A6770, f432; +mul.f32 f434, f291, 0fBF75A155; +mul.f32 f435, f292, 0f3E903F40; +sub.f32 f436, f434, f435; +mul.f32 f437, f292, 0fBF75A155; +fma.rn.f32 f438, f291, 0f3E903F40, f437; +add.f32 %1, f118, f268; +add.f32 %0, f117, f267; +add.f32 %3, f140, f393; +add.f32 %2, f139, f391; +add.f32 %5, f164, f398; +add.f32 %4, f163, f396; +add.f32 %7, f188, f403; +add.f32 %6, f187, f401; +add.f32 %9, f212, f408; +add.f32 %8, f211, f406; +add.f32 %11, f236, f413; +add.f32 %10, f235, f411; +add.f32 %13, f238, f418; +add.f32 %12, f237, f416; +add.f32 %15, f214, f423; +add.f32 %14, f213, f421; +add.f32 %17, f190, f428; +add.f32 %16, f189, f426; +add.f32 %19, f166, f433; +add.f32 %18, f165, f431; +add.f32 %21, f142, f438; +add.f32 %20, f141, f436; +sub.f32 %23, f118, f268; +sub.f32 %22, f117, f267; 
+sub.f32 %25, f140, f393; +sub.f32 %24, f139, f391; +sub.f32 %27, f164, f398; +sub.f32 %26, f163, f396; +sub.f32 %29, f188, f403; +sub.f32 %28, f187, f401; +sub.f32 %31, f212, f408; +sub.f32 %30, f211, f406; +sub.f32 %33, f236, f413; +sub.f32 %32, f235, f411; +sub.f32 %35, f238, f418; +sub.f32 %34, f237, f416; +sub.f32 %37, f214, f423; +sub.f32 %36, f213, f421; +sub.f32 %39, f190, f428; +sub.f32 %38, f189, f426; +sub.f32 %41, f166, f433; +sub.f32 %40, f165, f431; +sub.f32 %43, f142, f438; +sub.f32 %42, f141, f436; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), 
"f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..72698cbf50972 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_fwd.hpp.inc @@ -0,0 +1,410 @@ +#ifndef CUFFTDX_FFT_22_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_22_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<412, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<483>; +.reg .b64 rd<2>; +add.f64 fd89, %49, %97; +add.f64 fd90, %51, %99; +sub.f64 fd91, %49, %97; +sub.f64 fd92, %51, %99; +add.f64 fd93, %54, %92; +add.f64 fd94, %56, %93; +sub.f64 fd95, %54, %92; +sub.f64 fd96, %56, %93; +add.f64 fd97, %60, %86; +add.f64 fd98, %61, %88; +sub.f64 fd99, %60, %86; +sub.f64 fd100, %61, %88; +add.f64 fd101, %65, %81; +add.f64 fd102, %67, %83; +sub.f64 fd103, %65, %81; +sub.f64 fd104, %67, %83; +add.f64 fd105, %70, %76; +add.f64 fd106, %72, %77; +sub.f64 fd107, %70, %76; +sub.f64 fd108, %72, %77; +add.f64 fd109, %44, fd89; +add.f64 fd110, %45, fd90; +add.f64 fd111, fd109, fd93; +add.f64 fd112, fd110, fd94; +add.f64 fd113, fd111, fd97; +add.f64 fd114, fd112, fd98; +add.f64 fd115, fd113, fd101; +add.f64 fd116, fd114, fd102; +add.f64 fd117, fd115, fd105; +add.f64 fd118, fd116, fd106; +fma.rn.f64 fd119, fd89, 0d3FEAEB8C8764F0BA, %44; +fma.rn.f64 fd120, fd92, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd121, fd90, 
0d3FEAEB8C8764F0BA, %45; +fma.rn.f64 fd122, fd91, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd123, fd93, 0d3FDA9628D9C712B6, fd119; +fma.rn.f64 fd124, fd96, 0dBFED1BB48EEE2C13, fd120; +fma.rn.f64 fd125, fd94, 0d3FDA9628D9C712B6, fd121; +fma.rn.f64 fd126, fd95, 0dBFED1BB48EEE2C13, fd122; +fma.rn.f64 fd127, fd97, 0dBFC2375F640F44DB, fd123; +fma.rn.f64 fd128, fd100, 0dBFEFAC9E043842EF, fd124; +fma.rn.f64 fd129, fd98, 0dBFC2375F640F44DB, fd125; +fma.rn.f64 fd130, fd99, 0dBFEFAC9E043842EF, fd126; +fma.rn.f64 fd131, fd101, 0dBFE4F49E7F775887, fd127; +fma.rn.f64 fd132, fd104, 0dBFE82F19BB3A28A1, fd128; +fma.rn.f64 fd133, fd102, 0dBFE4F49E7F775887, fd129; +fma.rn.f64 fd134, fd103, 0dBFE82F19BB3A28A1, fd130; +fma.rn.f64 fd135, fd105, 0dBFEEB42A9BCD5057, fd131; +fma.rn.f64 fd136, fd108, 0dBFD207E7FD768DBF, fd132; +fma.rn.f64 fd137, fd106, 0dBFEEB42A9BCD5057, fd133; +fma.rn.f64 fd138, fd107, 0dBFD207E7FD768DBF, fd134; +sub.f64 fd139, fd135, fd136; +add.f64 fd140, fd138, fd137; +add.f64 fd141, fd136, fd135; +sub.f64 fd142, fd137, fd138; +fma.rn.f64 fd143, fd89, 0d3FDA9628D9C712B6, %44; +fma.rn.f64 fd144, fd92, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd145, fd90, 0d3FDA9628D9C712B6, %45; +fma.rn.f64 fd146, fd91, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd147, fd93, 0dBFE4F49E7F775887, fd143; +fma.rn.f64 fd148, fd96, 0dBFE82F19BB3A28A1, fd144; +fma.rn.f64 fd149, fd94, 0dBFE4F49E7F775887, fd145; +fma.rn.f64 fd150, fd95, 0dBFE82F19BB3A28A1, fd146; +fma.rn.f64 fd151, fd97, 0dBFEEB42A9BCD5057, fd147; +fma.rn.f64 fd152, fd100, 0d3FD207E7FD768DBF, fd148; +fma.rn.f64 fd153, fd98, 0dBFEEB42A9BCD5057, fd149; +fma.rn.f64 fd154, fd99, 0d3FD207E7FD768DBF, fd150; +fma.rn.f64 fd155, fd101, 0dBFC2375F640F44DB, fd151; +fma.rn.f64 fd156, fd104, 0d3FEFAC9E043842EF, fd152; +fma.rn.f64 fd157, fd102, 0dBFC2375F640F44DB, fd153; +fma.rn.f64 fd158, fd103, 0d3FEFAC9E043842EF, fd154; +fma.rn.f64 fd159, fd105, 0d3FEAEB8C8764F0BA, fd155; +fma.rn.f64 fd160, fd108, 
0d3FE14CEDF8BB580B, fd156; +fma.rn.f64 fd161, fd106, 0d3FEAEB8C8764F0BA, fd157; +fma.rn.f64 fd162, fd107, 0d3FE14CEDF8BB580B, fd158; +sub.f64 fd163, fd159, fd160; +add.f64 fd164, fd162, fd161; +add.f64 fd165, fd160, fd159; +sub.f64 fd166, fd161, fd162; +fma.rn.f64 fd167, fd89, 0dBFC2375F640F44DB, %44; +fma.rn.f64 fd168, fd92, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd169, fd90, 0dBFC2375F640F44DB, %45; +fma.rn.f64 fd170, fd91, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd171, fd93, 0dBFEEB42A9BCD5057, fd167; +fma.rn.f64 fd172, fd96, 0d3FD207E7FD768DBF, fd168; +fma.rn.f64 fd173, fd94, 0dBFEEB42A9BCD5057, fd169; +fma.rn.f64 fd174, fd95, 0d3FD207E7FD768DBF, fd170; +fma.rn.f64 fd175, fd97, 0d3FDA9628D9C712B6, fd171; +fma.rn.f64 fd176, fd100, 0d3FED1BB48EEE2C13, fd172; +fma.rn.f64 fd177, fd98, 0d3FDA9628D9C712B6, fd173; +fma.rn.f64 fd178, fd99, 0d3FED1BB48EEE2C13, fd174; +fma.rn.f64 fd179, fd101, 0d3FEAEB8C8764F0BA, fd175; +fma.rn.f64 fd180, fd104, 0dBFE14CEDF8BB580B, fd176; +fma.rn.f64 fd181, fd102, 0d3FEAEB8C8764F0BA, fd177; +fma.rn.f64 fd182, fd103, 0dBFE14CEDF8BB580B, fd178; +fma.rn.f64 fd183, fd105, 0dBFE4F49E7F775887, fd179; +fma.rn.f64 fd184, fd108, 0dBFE82F19BB3A28A1, fd180; +fma.rn.f64 fd185, fd106, 0dBFE4F49E7F775887, fd181; +fma.rn.f64 fd186, fd107, 0dBFE82F19BB3A28A1, fd182; +sub.f64 fd187, fd183, fd184; +add.f64 fd188, fd186, fd185; +add.f64 fd189, fd184, fd183; +sub.f64 fd190, fd185, fd186; +fma.rn.f64 fd191, fd89, 0dBFE4F49E7F775887, %44; +fma.rn.f64 fd192, fd92, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd193, fd90, 0dBFE4F49E7F775887, %45; +fma.rn.f64 fd194, fd91, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd195, fd93, 0dBFC2375F640F44DB, fd191; +fma.rn.f64 fd196, fd96, 0d3FEFAC9E043842EF, fd192; +fma.rn.f64 fd197, fd94, 0dBFC2375F640F44DB, fd193; +fma.rn.f64 fd198, fd95, 0d3FEFAC9E043842EF, fd194; +fma.rn.f64 fd199, fd97, 0d3FEAEB8C8764F0BA, fd195; +fma.rn.f64 fd200, fd100, 0dBFE14CEDF8BB580B, fd196; +fma.rn.f64 
fd201, fd98, 0d3FEAEB8C8764F0BA, fd197; +fma.rn.f64 fd202, fd99, 0dBFE14CEDF8BB580B, fd198; +fma.rn.f64 fd203, fd101, 0dBFEEB42A9BCD5057, fd199; +fma.rn.f64 fd204, fd104, 0dBFD207E7FD768DBF, fd200; +fma.rn.f64 fd205, fd102, 0dBFEEB42A9BCD5057, fd201; +fma.rn.f64 fd206, fd103, 0dBFD207E7FD768DBF, fd202; +fma.rn.f64 fd207, fd105, 0d3FDA9628D9C712B6, fd203; +fma.rn.f64 fd208, fd108, 0d3FED1BB48EEE2C13, fd204; +fma.rn.f64 fd209, fd106, 0d3FDA9628D9C712B6, fd205; +fma.rn.f64 fd210, fd107, 0d3FED1BB48EEE2C13, fd206; +sub.f64 fd211, fd207, fd208; +add.f64 fd212, fd210, fd209; +add.f64 fd213, fd208, fd207; +sub.f64 fd214, fd209, fd210; +fma.rn.f64 fd215, fd89, 0dBFEEB42A9BCD5057, %44; +fma.rn.f64 fd216, fd92, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd217, fd90, 0dBFEEB42A9BCD5057, %45; +fma.rn.f64 fd218, fd91, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd219, fd93, 0d3FEAEB8C8764F0BA, fd215; +fma.rn.f64 fd220, fd96, 0d3FE14CEDF8BB580B, fd216; +fma.rn.f64 fd221, fd94, 0d3FEAEB8C8764F0BA, fd217; +fma.rn.f64 fd222, fd95, 0d3FE14CEDF8BB580B, fd218; +fma.rn.f64 fd223, fd97, 0dBFE4F49E7F775887, fd219; +fma.rn.f64 fd224, fd100, 0dBFE82F19BB3A28A1, fd220; +fma.rn.f64 fd225, fd98, 0dBFE4F49E7F775887, fd221; +fma.rn.f64 fd226, fd99, 0dBFE82F19BB3A28A1, fd222; +fma.rn.f64 fd227, fd101, 0d3FDA9628D9C712B6, fd223; +fma.rn.f64 fd228, fd104, 0d3FED1BB48EEE2C13, fd224; +fma.rn.f64 fd229, fd102, 0d3FDA9628D9C712B6, fd225; +fma.rn.f64 fd230, fd103, 0d3FED1BB48EEE2C13, fd226; +fma.rn.f64 fd231, fd105, 0dBFC2375F640F44DB, fd227; +fma.rn.f64 fd232, fd108, 0dBFEFAC9E043842EF, fd228; +fma.rn.f64 fd233, fd106, 0dBFC2375F640F44DB, fd229; +fma.rn.f64 fd234, fd107, 0dBFEFAC9E043842EF, fd230; +sub.f64 fd235, fd231, fd232; +add.f64 fd236, fd234, fd233; +add.f64 fd237, fd232, fd231; +sub.f64 fd238, fd233, fd234; +add.f64 fd239, %52, %100; +add.f64 fd240, %53, %101; +sub.f64 fd241, %52, %100; +sub.f64 fd242, %53, %101; +add.f64 fd243, %57, %94; +add.f64 fd244, %59, %96; +sub.f64 
fd245, %57, %94; +sub.f64 fd246, %59, %96; +add.f64 fd247, %62, %89; +add.f64 fd248, %64, %91; +sub.f64 fd249, %62, %89; +sub.f64 fd250, %64, %91; +add.f64 fd251, %68, %84; +add.f64 fd252, %69, %85; +sub.f64 fd253, %68, %84; +sub.f64 fd254, %69, %85; +add.f64 fd255, %73, %78; +add.f64 fd256, %75, %80; +sub.f64 fd257, %73, %78; +sub.f64 fd258, %75, %80; +add.f64 fd259, %46, fd239; +add.f64 fd260, %48, fd240; +add.f64 fd261, fd259, fd243; +add.f64 fd262, fd260, fd244; +add.f64 fd263, fd261, fd247; +add.f64 fd264, fd262, fd248; +add.f64 fd265, fd263, fd251; +add.f64 fd266, fd264, fd252; +add.f64 fd267, fd265, fd255; +add.f64 fd268, fd266, fd256; +fma.rn.f64 fd269, fd239, 0d3FEAEB8C8764F0BA, %46; +fma.rn.f64 fd270, fd242, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd271, fd240, 0d3FEAEB8C8764F0BA, %48; +fma.rn.f64 fd272, fd241, 0dBFE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd273, fd243, 0d3FDA9628D9C712B6, fd269; +fma.rn.f64 fd274, fd246, 0dBFED1BB48EEE2C13, fd270; +fma.rn.f64 fd275, fd244, 0d3FDA9628D9C712B6, fd271; +fma.rn.f64 fd276, fd245, 0dBFED1BB48EEE2C13, fd272; +fma.rn.f64 fd277, fd247, 0dBFC2375F640F44DB, fd273; +fma.rn.f64 fd278, fd250, 0dBFEFAC9E043842EF, fd274; +fma.rn.f64 fd279, fd248, 0dBFC2375F640F44DB, fd275; +fma.rn.f64 fd280, fd249, 0dBFEFAC9E043842EF, fd276; +fma.rn.f64 fd281, fd251, 0dBFE4F49E7F775887, fd277; +fma.rn.f64 fd282, fd254, 0dBFE82F19BB3A28A1, fd278; +fma.rn.f64 fd283, fd252, 0dBFE4F49E7F775887, fd279; +fma.rn.f64 fd284, fd253, 0dBFE82F19BB3A28A1, fd280; +fma.rn.f64 fd285, fd255, 0dBFEEB42A9BCD5057, fd281; +fma.rn.f64 fd286, fd258, 0dBFD207E7FD768DBF, fd282; +fma.rn.f64 fd287, fd256, 0dBFEEB42A9BCD5057, fd283; +fma.rn.f64 fd288, fd257, 0dBFD207E7FD768DBF, fd284; +sub.f64 fd289, fd285, fd286; +add.f64 fd290, fd288, fd287; +add.f64 fd291, fd286, fd285; +sub.f64 fd292, fd287, fd288; +fma.rn.f64 fd293, fd239, 0d3FDA9628D9C712B6, %46; +fma.rn.f64 fd294, fd242, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd295, fd240, 
0d3FDA9628D9C712B6, %48; +fma.rn.f64 fd296, fd241, 0dBFED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd297, fd243, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd298, fd246, 0dBFE82F19BB3A28A1, fd294; +fma.rn.f64 fd299, fd244, 0dBFE4F49E7F775887, fd295; +fma.rn.f64 fd300, fd245, 0dBFE82F19BB3A28A1, fd296; +fma.rn.f64 fd301, fd247, 0dBFEEB42A9BCD5057, fd297; +fma.rn.f64 fd302, fd250, 0d3FD207E7FD768DBF, fd298; +fma.rn.f64 fd303, fd248, 0dBFEEB42A9BCD5057, fd299; +fma.rn.f64 fd304, fd249, 0d3FD207E7FD768DBF, fd300; +fma.rn.f64 fd305, fd251, 0dBFC2375F640F44DB, fd301; +fma.rn.f64 fd306, fd254, 0d3FEFAC9E043842EF, fd302; +fma.rn.f64 fd307, fd252, 0dBFC2375F640F44DB, fd303; +fma.rn.f64 fd308, fd253, 0d3FEFAC9E043842EF, fd304; +fma.rn.f64 fd309, fd255, 0d3FEAEB8C8764F0BA, fd305; +fma.rn.f64 fd310, fd258, 0d3FE14CEDF8BB580B, fd306; +fma.rn.f64 fd311, fd256, 0d3FEAEB8C8764F0BA, fd307; +fma.rn.f64 fd312, fd257, 0d3FE14CEDF8BB580B, fd308; +sub.f64 fd313, fd309, fd310; +add.f64 fd314, fd312, fd311; +add.f64 fd315, fd310, fd309; +sub.f64 fd316, fd311, fd312; +fma.rn.f64 fd317, fd239, 0dBFC2375F640F44DB, %46; +fma.rn.f64 fd318, fd242, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd319, fd240, 0dBFC2375F640F44DB, %48; +fma.rn.f64 fd320, fd241, 0dBFEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd321, fd243, 0dBFEEB42A9BCD5057, fd317; +fma.rn.f64 fd322, fd246, 0d3FD207E7FD768DBF, fd318; +fma.rn.f64 fd323, fd244, 0dBFEEB42A9BCD5057, fd319; +fma.rn.f64 fd324, fd245, 0d3FD207E7FD768DBF, fd320; +fma.rn.f64 fd325, fd247, 0d3FDA9628D9C712B6, fd321; +fma.rn.f64 fd326, fd250, 0d3FED1BB48EEE2C13, fd322; +fma.rn.f64 fd327, fd248, 0d3FDA9628D9C712B6, fd323; +fma.rn.f64 fd328, fd249, 0d3FED1BB48EEE2C13, fd324; +fma.rn.f64 fd329, fd251, 0d3FEAEB8C8764F0BA, fd325; +fma.rn.f64 fd330, fd254, 0dBFE14CEDF8BB580B, fd326; +fma.rn.f64 fd331, fd252, 0d3FEAEB8C8764F0BA, fd327; +fma.rn.f64 fd332, fd253, 0dBFE14CEDF8BB580B, fd328; +fma.rn.f64 fd333, fd255, 0dBFE4F49E7F775887, fd329; +fma.rn.f64 
fd334, fd258, 0dBFE82F19BB3A28A1, fd330; +fma.rn.f64 fd335, fd256, 0dBFE4F49E7F775887, fd331; +fma.rn.f64 fd336, fd257, 0dBFE82F19BB3A28A1, fd332; +sub.f64 fd337, fd333, fd334; +add.f64 fd338, fd336, fd335; +add.f64 fd339, fd334, fd333; +sub.f64 fd340, fd335, fd336; +fma.rn.f64 fd341, fd239, 0dBFE4F49E7F775887, %46; +fma.rn.f64 fd342, fd242, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd343, fd240, 0dBFE4F49E7F775887, %48; +fma.rn.f64 fd344, fd241, 0dBFE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd345, fd243, 0dBFC2375F640F44DB, fd341; +fma.rn.f64 fd346, fd246, 0d3FEFAC9E043842EF, fd342; +fma.rn.f64 fd347, fd244, 0dBFC2375F640F44DB, fd343; +fma.rn.f64 fd348, fd245, 0d3FEFAC9E043842EF, fd344; +fma.rn.f64 fd349, fd247, 0d3FEAEB8C8764F0BA, fd345; +fma.rn.f64 fd350, fd250, 0dBFE14CEDF8BB580B, fd346; +fma.rn.f64 fd351, fd248, 0d3FEAEB8C8764F0BA, fd347; +fma.rn.f64 fd352, fd249, 0dBFE14CEDF8BB580B, fd348; +fma.rn.f64 fd353, fd251, 0dBFEEB42A9BCD5057, fd349; +fma.rn.f64 fd354, fd254, 0dBFD207E7FD768DBF, fd350; +fma.rn.f64 fd355, fd252, 0dBFEEB42A9BCD5057, fd351; +fma.rn.f64 fd356, fd253, 0dBFD207E7FD768DBF, fd352; +fma.rn.f64 fd357, fd255, 0d3FDA9628D9C712B6, fd353; +fma.rn.f64 fd358, fd258, 0d3FED1BB48EEE2C13, fd354; +fma.rn.f64 fd359, fd256, 0d3FDA9628D9C712B6, fd355; +fma.rn.f64 fd360, fd257, 0d3FED1BB48EEE2C13, fd356; +sub.f64 fd361, fd357, fd358; +add.f64 fd362, fd360, fd359; +add.f64 fd363, fd358, fd357; +sub.f64 fd364, fd359, fd360; +fma.rn.f64 fd365, fd239, 0dBFEEB42A9BCD5057, %46; +fma.rn.f64 fd366, fd242, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd367, fd240, 0dBFEEB42A9BCD5057, %48; +fma.rn.f64 fd368, fd241, 0dBFD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd369, fd243, 0d3FEAEB8C8764F0BA, fd365; +fma.rn.f64 fd370, fd246, 0d3FE14CEDF8BB580B, fd366; +fma.rn.f64 fd371, fd244, 0d3FEAEB8C8764F0BA, fd367; +fma.rn.f64 fd372, fd245, 0d3FE14CEDF8BB580B, fd368; +fma.rn.f64 fd373, fd247, 0dBFE4F49E7F775887, fd369; +fma.rn.f64 fd374, fd250, 
0dBFE82F19BB3A28A1, fd370; +fma.rn.f64 fd375, fd248, 0dBFE4F49E7F775887, fd371; +fma.rn.f64 fd376, fd249, 0dBFE82F19BB3A28A1, fd372; +fma.rn.f64 fd377, fd251, 0d3FDA9628D9C712B6, fd373; +fma.rn.f64 fd378, fd254, 0d3FED1BB48EEE2C13, fd374; +fma.rn.f64 fd379, fd252, 0d3FDA9628D9C712B6, fd375; +fma.rn.f64 fd380, fd253, 0d3FED1BB48EEE2C13, fd376; +fma.rn.f64 fd381, fd255, 0dBFC2375F640F44DB, fd377; +fma.rn.f64 fd382, fd258, 0dBFEFAC9E043842EF, fd378; +fma.rn.f64 fd383, fd256, 0dBFC2375F640F44DB, fd379; +fma.rn.f64 fd384, fd257, 0dBFEFAC9E043842EF, fd380; +sub.f64 fd385, fd381, fd382; +add.f64 fd386, fd384, fd383; +add.f64 fd387, fd382, fd381; +sub.f64 fd388, fd383, fd384; +mul.f64 fd389, fd289, 0d3FEEB42A9BCD5057; +mul.f64 fd390, fd290, 0dBFD207E7FD768DBF; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd290, 0d3FEEB42A9BCD5057; +fma.rn.f64 fd393, fd289, 0dBFD207E7FD768DBF, fd392; +mul.f64 fd394, fd313, 0d3FEAEB8C8764F0BA; +mul.f64 fd395, fd314, 0dBFE14CEDF8BB580B; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd314, 0d3FEAEB8C8764F0BA; +fma.rn.f64 fd398, fd313, 0dBFE14CEDF8BB580B, fd397; +mul.f64 fd399, fd337, 0d3FE4F49E7F775887; +mul.f64 fd400, fd338, 0dBFE82F19BB3A28A1; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd338, 0d3FE4F49E7F775887; +fma.rn.f64 fd403, fd337, 0dBFE82F19BB3A28A1, fd402; +mul.f64 fd404, fd361, 0d3FDA9628D9C712B6; +mul.f64 fd405, fd362, 0dBFED1BB48EEE2C13; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd362, 0d3FDA9628D9C712B6; +fma.rn.f64 fd408, fd361, 0dBFED1BB48EEE2C13, fd407; +mul.f64 fd409, fd385, 0d3FC2375F640F44DB; +mul.f64 fd410, fd386, 0dBFEFAC9E043842EF; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd386, 0d3FC2375F640F44DB; +fma.rn.f64 fd413, fd385, 0dBFEFAC9E043842EF, fd412; +mul.f64 fd414, fd387, 0dBFC2375F640F44DB; +mul.f64 fd415, fd388, 0dBFEFAC9E043842EF; +sub.f64 fd416, fd414, fd415; +mul.f64 fd417, fd388, 0dBFC2375F640F44DB; +fma.rn.f64 fd418, fd387, 0dBFEFAC9E043842EF, fd417; +mul.f64 fd419, fd363, 0dBFDA9628D9C712B6; 
+mul.f64 fd420, fd364, 0dBFED1BB48EEE2C13; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd364, 0dBFDA9628D9C712B6; +fma.rn.f64 fd423, fd363, 0dBFED1BB48EEE2C13, fd422; +mul.f64 fd424, fd339, 0dBFE4F49E7F775887; +mul.f64 fd425, fd340, 0dBFE82F19BB3A28A1; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd340, 0dBFE4F49E7F775887; +fma.rn.f64 fd428, fd339, 0dBFE82F19BB3A28A1, fd427; +mul.f64 fd429, fd315, 0dBFEAEB8C8764F0BA; +mul.f64 fd430, fd316, 0dBFE14CEDF8BB580B; +sub.f64 fd431, fd429, fd430; +mul.f64 fd432, fd316, 0dBFEAEB8C8764F0BA; +fma.rn.f64 fd433, fd315, 0dBFE14CEDF8BB580B, fd432; +mul.f64 fd434, fd291, 0dBFEEB42A9BCD5057; +mul.f64 fd435, fd292, 0dBFD207E7FD768DBF; +sub.f64 fd436, fd434, fd435; +mul.f64 fd437, fd292, 0dBFEEB42A9BCD5057; +fma.rn.f64 fd438, fd291, 0dBFD207E7FD768DBF, fd437; +add.f64 %1, fd118, fd268; +add.f64 %0, fd117, fd267; +add.f64 %3, fd140, fd393; +add.f64 %2, fd139, fd391; +add.f64 %5, fd164, fd398; +add.f64 %4, fd163, fd396; +add.f64 %7, fd188, fd403; +add.f64 %6, fd187, fd401; +add.f64 %9, fd212, fd408; +add.f64 %8, fd211, fd406; +add.f64 %11, fd236, fd413; +add.f64 %10, fd235, fd411; +add.f64 %13, fd238, fd418; +add.f64 %12, fd237, fd416; +add.f64 %15, fd214, fd423; +add.f64 %14, fd213, fd421; +add.f64 %17, fd190, fd428; +add.f64 %16, fd189, fd426; +add.f64 %19, fd166, fd433; +add.f64 %18, fd165, fd431; +add.f64 %21, fd142, fd438; +add.f64 %20, fd141, fd436; +sub.f64 %23, fd118, fd268; +sub.f64 %22, fd117, fd267; +sub.f64 %25, fd140, fd393; +sub.f64 %24, fd139, fd391; +sub.f64 %27, fd164, fd398; +sub.f64 %26, fd163, fd396; +sub.f64 %29, fd188, fd403; +sub.f64 %28, fd187, fd401; +sub.f64 %31, fd212, fd408; +sub.f64 %30, fd211, fd406; +sub.f64 %33, fd236, fd413; +sub.f64 %32, fd235, fd411; +sub.f64 %35, fd238, fd418; +sub.f64 %34, fd237, fd416; +sub.f64 %37, fd214, fd423; +sub.f64 %36, fd213, fd421; +sub.f64 %39, fd190, fd428; +sub.f64 %38, fd189, fd426; +sub.f64 %41, fd166, fd433; +sub.f64 %40, fd165, fd431; +sub.f64 %43, fd142, fd438; 
+sub.f64 %42, fd141, fd436; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..20ec60122633f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_22_fp64_inv.hpp.inc @@ -0,0 +1,410 @@ +#ifndef CUFFTDX_FFT_22_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_22_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<583, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<483>; +.reg .b64 rd<2>; +add.f64 fd89, %49, %97; +add.f64 fd90, %51, %99; +sub.f64 fd91, %49, %97; +sub.f64 fd92, %51, %99; +add.f64 fd93, %54, %92; +add.f64 fd94, %56, %93; +sub.f64 fd95, %54, %92; +sub.f64 fd96, %56, %93; +add.f64 fd97, %60, %86; +add.f64 fd98, %61, %88; +sub.f64 fd99, %60, %86; +sub.f64 fd100, %61, %88; +add.f64 fd101, %65, %81; +add.f64 fd102, %67, %83; +sub.f64 fd103, %65, %81; +sub.f64 fd104, %67, %83; +add.f64 fd105, %70, %76; +add.f64 fd106, %72, %77; +sub.f64 fd107, %70, %76; +sub.f64 fd108, %72, %77; +add.f64 fd109, %44, fd89; +add.f64 fd110, %45, fd90; +add.f64 fd111, fd109, fd93; +add.f64 fd112, fd110, fd94; +add.f64 fd113, fd111, fd97; +add.f64 fd114, fd112, fd98; +add.f64 fd115, fd113, fd101; +add.f64 fd116, fd114, fd102; +add.f64 fd117, fd115, fd105; +add.f64 fd118, fd116, fd106; +fma.rn.f64 fd119, fd89, 0d3FEAEB8C8764F0BA, %44; +fma.rn.f64 fd120, fd92, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd121, fd90, 0d3FEAEB8C8764F0BA, %45; +fma.rn.f64 fd122, fd91, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd123, fd93, 0d3FDA9628D9C712B6, fd119; +fma.rn.f64 fd124, fd96, 0d3FED1BB48EEE2C13, fd120; +fma.rn.f64 fd125, fd94, 0d3FDA9628D9C712B6, fd121; +fma.rn.f64 fd126, fd95, 0d3FED1BB48EEE2C13, fd122; +fma.rn.f64 fd127, fd97, 0dBFC2375F640F44DB, fd123; +fma.rn.f64 fd128, fd100, 0d3FEFAC9E043842EF, fd124; +fma.rn.f64 fd129, fd98, 
0dBFC2375F640F44DB, fd125; +fma.rn.f64 fd130, fd99, 0d3FEFAC9E043842EF, fd126; +fma.rn.f64 fd131, fd101, 0dBFE4F49E7F775887, fd127; +fma.rn.f64 fd132, fd104, 0d3FE82F19BB3A28A1, fd128; +fma.rn.f64 fd133, fd102, 0dBFE4F49E7F775887, fd129; +fma.rn.f64 fd134, fd103, 0d3FE82F19BB3A28A1, fd130; +fma.rn.f64 fd135, fd105, 0dBFEEB42A9BCD5057, fd131; +fma.rn.f64 fd136, fd108, 0d3FD207E7FD768DBF, fd132; +fma.rn.f64 fd137, fd106, 0dBFEEB42A9BCD5057, fd133; +fma.rn.f64 fd138, fd107, 0d3FD207E7FD768DBF, fd134; +sub.f64 fd139, fd135, fd136; +add.f64 fd140, fd138, fd137; +add.f64 fd141, fd136, fd135; +sub.f64 fd142, fd137, fd138; +fma.rn.f64 fd143, fd89, 0d3FDA9628D9C712B6, %44; +fma.rn.f64 fd144, fd92, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd145, fd90, 0d3FDA9628D9C712B6, %45; +fma.rn.f64 fd146, fd91, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd147, fd93, 0dBFE4F49E7F775887, fd143; +fma.rn.f64 fd148, fd96, 0d3FE82F19BB3A28A1, fd144; +fma.rn.f64 fd149, fd94, 0dBFE4F49E7F775887, fd145; +fma.rn.f64 fd150, fd95, 0d3FE82F19BB3A28A1, fd146; +fma.rn.f64 fd151, fd97, 0dBFEEB42A9BCD5057, fd147; +fma.rn.f64 fd152, fd100, 0dBFD207E7FD768DBF, fd148; +fma.rn.f64 fd153, fd98, 0dBFEEB42A9BCD5057, fd149; +fma.rn.f64 fd154, fd99, 0dBFD207E7FD768DBF, fd150; +fma.rn.f64 fd155, fd101, 0dBFC2375F640F44DB, fd151; +fma.rn.f64 fd156, fd104, 0dBFEFAC9E043842EF, fd152; +fma.rn.f64 fd157, fd102, 0dBFC2375F640F44DB, fd153; +fma.rn.f64 fd158, fd103, 0dBFEFAC9E043842EF, fd154; +fma.rn.f64 fd159, fd105, 0d3FEAEB8C8764F0BA, fd155; +fma.rn.f64 fd160, fd108, 0dBFE14CEDF8BB580B, fd156; +fma.rn.f64 fd161, fd106, 0d3FEAEB8C8764F0BA, fd157; +fma.rn.f64 fd162, fd107, 0dBFE14CEDF8BB580B, fd158; +sub.f64 fd163, fd159, fd160; +add.f64 fd164, fd162, fd161; +add.f64 fd165, fd160, fd159; +sub.f64 fd166, fd161, fd162; +fma.rn.f64 fd167, fd89, 0dBFC2375F640F44DB, %44; +fma.rn.f64 fd168, fd92, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd169, fd90, 0dBFC2375F640F44DB, %45; +fma.rn.f64 fd170, 
fd91, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd171, fd93, 0dBFEEB42A9BCD5057, fd167; +fma.rn.f64 fd172, fd96, 0dBFD207E7FD768DBF, fd168; +fma.rn.f64 fd173, fd94, 0dBFEEB42A9BCD5057, fd169; +fma.rn.f64 fd174, fd95, 0dBFD207E7FD768DBF, fd170; +fma.rn.f64 fd175, fd97, 0d3FDA9628D9C712B6, fd171; +fma.rn.f64 fd176, fd100, 0dBFED1BB48EEE2C13, fd172; +fma.rn.f64 fd177, fd98, 0d3FDA9628D9C712B6, fd173; +fma.rn.f64 fd178, fd99, 0dBFED1BB48EEE2C13, fd174; +fma.rn.f64 fd179, fd101, 0d3FEAEB8C8764F0BA, fd175; +fma.rn.f64 fd180, fd104, 0d3FE14CEDF8BB580B, fd176; +fma.rn.f64 fd181, fd102, 0d3FEAEB8C8764F0BA, fd177; +fma.rn.f64 fd182, fd103, 0d3FE14CEDF8BB580B, fd178; +fma.rn.f64 fd183, fd105, 0dBFE4F49E7F775887, fd179; +fma.rn.f64 fd184, fd108, 0d3FE82F19BB3A28A1, fd180; +fma.rn.f64 fd185, fd106, 0dBFE4F49E7F775887, fd181; +fma.rn.f64 fd186, fd107, 0d3FE82F19BB3A28A1, fd182; +sub.f64 fd187, fd183, fd184; +add.f64 fd188, fd186, fd185; +add.f64 fd189, fd184, fd183; +sub.f64 fd190, fd185, fd186; +fma.rn.f64 fd191, fd89, 0dBFE4F49E7F775887, %44; +fma.rn.f64 fd192, fd92, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd193, fd90, 0dBFE4F49E7F775887, %45; +fma.rn.f64 fd194, fd91, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd195, fd93, 0dBFC2375F640F44DB, fd191; +fma.rn.f64 fd196, fd96, 0dBFEFAC9E043842EF, fd192; +fma.rn.f64 fd197, fd94, 0dBFC2375F640F44DB, fd193; +fma.rn.f64 fd198, fd95, 0dBFEFAC9E043842EF, fd194; +fma.rn.f64 fd199, fd97, 0d3FEAEB8C8764F0BA, fd195; +fma.rn.f64 fd200, fd100, 0d3FE14CEDF8BB580B, fd196; +fma.rn.f64 fd201, fd98, 0d3FEAEB8C8764F0BA, fd197; +fma.rn.f64 fd202, fd99, 0d3FE14CEDF8BB580B, fd198; +fma.rn.f64 fd203, fd101, 0dBFEEB42A9BCD5057, fd199; +fma.rn.f64 fd204, fd104, 0d3FD207E7FD768DBF, fd200; +fma.rn.f64 fd205, fd102, 0dBFEEB42A9BCD5057, fd201; +fma.rn.f64 fd206, fd103, 0d3FD207E7FD768DBF, fd202; +fma.rn.f64 fd207, fd105, 0d3FDA9628D9C712B6, fd203; +fma.rn.f64 fd208, fd108, 0dBFED1BB48EEE2C13, fd204; +fma.rn.f64 fd209, fd106, 
0d3FDA9628D9C712B6, fd205; +fma.rn.f64 fd210, fd107, 0dBFED1BB48EEE2C13, fd206; +sub.f64 fd211, fd207, fd208; +add.f64 fd212, fd210, fd209; +add.f64 fd213, fd208, fd207; +sub.f64 fd214, fd209, fd210; +fma.rn.f64 fd215, fd89, 0dBFEEB42A9BCD5057, %44; +fma.rn.f64 fd216, fd92, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd217, fd90, 0dBFEEB42A9BCD5057, %45; +fma.rn.f64 fd218, fd91, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd219, fd93, 0d3FEAEB8C8764F0BA, fd215; +fma.rn.f64 fd220, fd96, 0dBFE14CEDF8BB580B, fd216; +fma.rn.f64 fd221, fd94, 0d3FEAEB8C8764F0BA, fd217; +fma.rn.f64 fd222, fd95, 0dBFE14CEDF8BB580B, fd218; +fma.rn.f64 fd223, fd97, 0dBFE4F49E7F775887, fd219; +fma.rn.f64 fd224, fd100, 0d3FE82F19BB3A28A1, fd220; +fma.rn.f64 fd225, fd98, 0dBFE4F49E7F775887, fd221; +fma.rn.f64 fd226, fd99, 0d3FE82F19BB3A28A1, fd222; +fma.rn.f64 fd227, fd101, 0d3FDA9628D9C712B6, fd223; +fma.rn.f64 fd228, fd104, 0dBFED1BB48EEE2C13, fd224; +fma.rn.f64 fd229, fd102, 0d3FDA9628D9C712B6, fd225; +fma.rn.f64 fd230, fd103, 0dBFED1BB48EEE2C13, fd226; +fma.rn.f64 fd231, fd105, 0dBFC2375F640F44DB, fd227; +fma.rn.f64 fd232, fd108, 0d3FEFAC9E043842EF, fd228; +fma.rn.f64 fd233, fd106, 0dBFC2375F640F44DB, fd229; +fma.rn.f64 fd234, fd107, 0d3FEFAC9E043842EF, fd230; +sub.f64 fd235, fd231, fd232; +add.f64 fd236, fd234, fd233; +add.f64 fd237, fd232, fd231; +sub.f64 fd238, fd233, fd234; +add.f64 fd239, %52, %100; +add.f64 fd240, %53, %101; +sub.f64 fd241, %52, %100; +sub.f64 fd242, %53, %101; +add.f64 fd243, %57, %94; +add.f64 fd244, %59, %96; +sub.f64 fd245, %57, %94; +sub.f64 fd246, %59, %96; +add.f64 fd247, %62, %89; +add.f64 fd248, %64, %91; +sub.f64 fd249, %62, %89; +sub.f64 fd250, %64, %91; +add.f64 fd251, %68, %84; +add.f64 fd252, %69, %85; +sub.f64 fd253, %68, %84; +sub.f64 fd254, %69, %85; +add.f64 fd255, %73, %78; +add.f64 fd256, %75, %80; +sub.f64 fd257, %73, %78; +sub.f64 fd258, %75, %80; +add.f64 fd259, %46, fd239; +add.f64 fd260, %48, fd240; +add.f64 fd261, fd259, 
fd243; +add.f64 fd262, fd260, fd244; +add.f64 fd263, fd261, fd247; +add.f64 fd264, fd262, fd248; +add.f64 fd265, fd263, fd251; +add.f64 fd266, fd264, fd252; +add.f64 fd267, fd265, fd255; +add.f64 fd268, fd266, fd256; +fma.rn.f64 fd269, fd239, 0d3FEAEB8C8764F0BA, %46; +fma.rn.f64 fd270, fd242, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd271, fd240, 0d3FEAEB8C8764F0BA, %48; +fma.rn.f64 fd272, fd241, 0d3FE14CEDF8BB580B, 0d0000000000000000; +fma.rn.f64 fd273, fd243, 0d3FDA9628D9C712B6, fd269; +fma.rn.f64 fd274, fd246, 0d3FED1BB48EEE2C13, fd270; +fma.rn.f64 fd275, fd244, 0d3FDA9628D9C712B6, fd271; +fma.rn.f64 fd276, fd245, 0d3FED1BB48EEE2C13, fd272; +fma.rn.f64 fd277, fd247, 0dBFC2375F640F44DB, fd273; +fma.rn.f64 fd278, fd250, 0d3FEFAC9E043842EF, fd274; +fma.rn.f64 fd279, fd248, 0dBFC2375F640F44DB, fd275; +fma.rn.f64 fd280, fd249, 0d3FEFAC9E043842EF, fd276; +fma.rn.f64 fd281, fd251, 0dBFE4F49E7F775887, fd277; +fma.rn.f64 fd282, fd254, 0d3FE82F19BB3A28A1, fd278; +fma.rn.f64 fd283, fd252, 0dBFE4F49E7F775887, fd279; +fma.rn.f64 fd284, fd253, 0d3FE82F19BB3A28A1, fd280; +fma.rn.f64 fd285, fd255, 0dBFEEB42A9BCD5057, fd281; +fma.rn.f64 fd286, fd258, 0d3FD207E7FD768DBF, fd282; +fma.rn.f64 fd287, fd256, 0dBFEEB42A9BCD5057, fd283; +fma.rn.f64 fd288, fd257, 0d3FD207E7FD768DBF, fd284; +sub.f64 fd289, fd285, fd286; +add.f64 fd290, fd288, fd287; +add.f64 fd291, fd286, fd285; +sub.f64 fd292, fd287, fd288; +fma.rn.f64 fd293, fd239, 0d3FDA9628D9C712B6, %46; +fma.rn.f64 fd294, fd242, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd295, fd240, 0d3FDA9628D9C712B6, %48; +fma.rn.f64 fd296, fd241, 0d3FED1BB48EEE2C13, 0d0000000000000000; +fma.rn.f64 fd297, fd243, 0dBFE4F49E7F775887, fd293; +fma.rn.f64 fd298, fd246, 0d3FE82F19BB3A28A1, fd294; +fma.rn.f64 fd299, fd244, 0dBFE4F49E7F775887, fd295; +fma.rn.f64 fd300, fd245, 0d3FE82F19BB3A28A1, fd296; +fma.rn.f64 fd301, fd247, 0dBFEEB42A9BCD5057, fd297; +fma.rn.f64 fd302, fd250, 0dBFD207E7FD768DBF, fd298; +fma.rn.f64 fd303, fd248, 
0dBFEEB42A9BCD5057, fd299; +fma.rn.f64 fd304, fd249, 0dBFD207E7FD768DBF, fd300; +fma.rn.f64 fd305, fd251, 0dBFC2375F640F44DB, fd301; +fma.rn.f64 fd306, fd254, 0dBFEFAC9E043842EF, fd302; +fma.rn.f64 fd307, fd252, 0dBFC2375F640F44DB, fd303; +fma.rn.f64 fd308, fd253, 0dBFEFAC9E043842EF, fd304; +fma.rn.f64 fd309, fd255, 0d3FEAEB8C8764F0BA, fd305; +fma.rn.f64 fd310, fd258, 0dBFE14CEDF8BB580B, fd306; +fma.rn.f64 fd311, fd256, 0d3FEAEB8C8764F0BA, fd307; +fma.rn.f64 fd312, fd257, 0dBFE14CEDF8BB580B, fd308; +sub.f64 fd313, fd309, fd310; +add.f64 fd314, fd312, fd311; +add.f64 fd315, fd310, fd309; +sub.f64 fd316, fd311, fd312; +fma.rn.f64 fd317, fd239, 0dBFC2375F640F44DB, %46; +fma.rn.f64 fd318, fd242, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd319, fd240, 0dBFC2375F640F44DB, %48; +fma.rn.f64 fd320, fd241, 0d3FEFAC9E043842EF, 0d0000000000000000; +fma.rn.f64 fd321, fd243, 0dBFEEB42A9BCD5057, fd317; +fma.rn.f64 fd322, fd246, 0dBFD207E7FD768DBF, fd318; +fma.rn.f64 fd323, fd244, 0dBFEEB42A9BCD5057, fd319; +fma.rn.f64 fd324, fd245, 0dBFD207E7FD768DBF, fd320; +fma.rn.f64 fd325, fd247, 0d3FDA9628D9C712B6, fd321; +fma.rn.f64 fd326, fd250, 0dBFED1BB48EEE2C13, fd322; +fma.rn.f64 fd327, fd248, 0d3FDA9628D9C712B6, fd323; +fma.rn.f64 fd328, fd249, 0dBFED1BB48EEE2C13, fd324; +fma.rn.f64 fd329, fd251, 0d3FEAEB8C8764F0BA, fd325; +fma.rn.f64 fd330, fd254, 0d3FE14CEDF8BB580B, fd326; +fma.rn.f64 fd331, fd252, 0d3FEAEB8C8764F0BA, fd327; +fma.rn.f64 fd332, fd253, 0d3FE14CEDF8BB580B, fd328; +fma.rn.f64 fd333, fd255, 0dBFE4F49E7F775887, fd329; +fma.rn.f64 fd334, fd258, 0d3FE82F19BB3A28A1, fd330; +fma.rn.f64 fd335, fd256, 0dBFE4F49E7F775887, fd331; +fma.rn.f64 fd336, fd257, 0d3FE82F19BB3A28A1, fd332; +sub.f64 fd337, fd333, fd334; +add.f64 fd338, fd336, fd335; +add.f64 fd339, fd334, fd333; +sub.f64 fd340, fd335, fd336; +fma.rn.f64 fd341, fd239, 0dBFE4F49E7F775887, %46; +fma.rn.f64 fd342, fd242, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd343, fd240, 0dBFE4F49E7F775887, %48; 
+fma.rn.f64 fd344, fd241, 0d3FE82F19BB3A28A1, 0d0000000000000000; +fma.rn.f64 fd345, fd243, 0dBFC2375F640F44DB, fd341; +fma.rn.f64 fd346, fd246, 0dBFEFAC9E043842EF, fd342; +fma.rn.f64 fd347, fd244, 0dBFC2375F640F44DB, fd343; +fma.rn.f64 fd348, fd245, 0dBFEFAC9E043842EF, fd344; +fma.rn.f64 fd349, fd247, 0d3FEAEB8C8764F0BA, fd345; +fma.rn.f64 fd350, fd250, 0d3FE14CEDF8BB580B, fd346; +fma.rn.f64 fd351, fd248, 0d3FEAEB8C8764F0BA, fd347; +fma.rn.f64 fd352, fd249, 0d3FE14CEDF8BB580B, fd348; +fma.rn.f64 fd353, fd251, 0dBFEEB42A9BCD5057, fd349; +fma.rn.f64 fd354, fd254, 0d3FD207E7FD768DBF, fd350; +fma.rn.f64 fd355, fd252, 0dBFEEB42A9BCD5057, fd351; +fma.rn.f64 fd356, fd253, 0d3FD207E7FD768DBF, fd352; +fma.rn.f64 fd357, fd255, 0d3FDA9628D9C712B6, fd353; +fma.rn.f64 fd358, fd258, 0dBFED1BB48EEE2C13, fd354; +fma.rn.f64 fd359, fd256, 0d3FDA9628D9C712B6, fd355; +fma.rn.f64 fd360, fd257, 0dBFED1BB48EEE2C13, fd356; +sub.f64 fd361, fd357, fd358; +add.f64 fd362, fd360, fd359; +add.f64 fd363, fd358, fd357; +sub.f64 fd364, fd359, fd360; +fma.rn.f64 fd365, fd239, 0dBFEEB42A9BCD5057, %46; +fma.rn.f64 fd366, fd242, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd367, fd240, 0dBFEEB42A9BCD5057, %48; +fma.rn.f64 fd368, fd241, 0d3FD207E7FD768DBF, 0d0000000000000000; +fma.rn.f64 fd369, fd243, 0d3FEAEB8C8764F0BA, fd365; +fma.rn.f64 fd370, fd246, 0dBFE14CEDF8BB580B, fd366; +fma.rn.f64 fd371, fd244, 0d3FEAEB8C8764F0BA, fd367; +fma.rn.f64 fd372, fd245, 0dBFE14CEDF8BB580B, fd368; +fma.rn.f64 fd373, fd247, 0dBFE4F49E7F775887, fd369; +fma.rn.f64 fd374, fd250, 0d3FE82F19BB3A28A1, fd370; +fma.rn.f64 fd375, fd248, 0dBFE4F49E7F775887, fd371; +fma.rn.f64 fd376, fd249, 0d3FE82F19BB3A28A1, fd372; +fma.rn.f64 fd377, fd251, 0d3FDA9628D9C712B6, fd373; +fma.rn.f64 fd378, fd254, 0dBFED1BB48EEE2C13, fd374; +fma.rn.f64 fd379, fd252, 0d3FDA9628D9C712B6, fd375; +fma.rn.f64 fd380, fd253, 0dBFED1BB48EEE2C13, fd376; +fma.rn.f64 fd381, fd255, 0dBFC2375F640F44DB, fd377; +fma.rn.f64 fd382, fd258, 
0d3FEFAC9E043842EF, fd378; +fma.rn.f64 fd383, fd256, 0dBFC2375F640F44DB, fd379; +fma.rn.f64 fd384, fd257, 0d3FEFAC9E043842EF, fd380; +sub.f64 fd385, fd381, fd382; +add.f64 fd386, fd384, fd383; +add.f64 fd387, fd382, fd381; +sub.f64 fd388, fd383, fd384; +mul.f64 fd389, fd289, 0d3FEEB42A9BCD5057; +mul.f64 fd390, fd290, 0d3FD207E7FD768DBF; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd290, 0d3FEEB42A9BCD5057; +fma.rn.f64 fd393, fd289, 0d3FD207E7FD768DBF, fd392; +mul.f64 fd394, fd313, 0d3FEAEB8C8764F0BA; +mul.f64 fd395, fd314, 0d3FE14CEDF8BB580B; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd314, 0d3FEAEB8C8764F0BA; +fma.rn.f64 fd398, fd313, 0d3FE14CEDF8BB580B, fd397; +mul.f64 fd399, fd337, 0d3FE4F49E7F775887; +mul.f64 fd400, fd338, 0d3FE82F19BB3A28A1; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd338, 0d3FE4F49E7F775887; +fma.rn.f64 fd403, fd337, 0d3FE82F19BB3A28A1, fd402; +mul.f64 fd404, fd361, 0d3FDA9628D9C712B6; +mul.f64 fd405, fd362, 0d3FED1BB48EEE2C13; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd362, 0d3FDA9628D9C712B6; +fma.rn.f64 fd408, fd361, 0d3FED1BB48EEE2C13, fd407; +mul.f64 fd409, fd385, 0d3FC2375F640F44DB; +mul.f64 fd410, fd386, 0d3FEFAC9E043842EF; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd386, 0d3FC2375F640F44DB; +fma.rn.f64 fd413, fd385, 0d3FEFAC9E043842EF, fd412; +mul.f64 fd414, fd387, 0dBFC2375F640F44DB; +mul.f64 fd415, fd388, 0d3FEFAC9E043842EF; +sub.f64 fd416, fd414, fd415; +mul.f64 fd417, fd388, 0dBFC2375F640F44DB; +fma.rn.f64 fd418, fd387, 0d3FEFAC9E043842EF, fd417; +mul.f64 fd419, fd363, 0dBFDA9628D9C712B6; +mul.f64 fd420, fd364, 0d3FED1BB48EEE2C13; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd364, 0dBFDA9628D9C712B6; +fma.rn.f64 fd423, fd363, 0d3FED1BB48EEE2C13, fd422; +mul.f64 fd424, fd339, 0dBFE4F49E7F775887; +mul.f64 fd425, fd340, 0d3FE82F19BB3A28A1; +sub.f64 fd426, fd424, fd425; +mul.f64 fd427, fd340, 0dBFE4F49E7F775887; +fma.rn.f64 fd428, fd339, 0d3FE82F19BB3A28A1, fd427; +mul.f64 fd429, fd315, 0dBFEAEB8C8764F0BA; 
+mul.f64 fd430, fd316, 0d3FE14CEDF8BB580B; +sub.f64 fd431, fd429, fd430; +mul.f64 fd432, fd316, 0dBFEAEB8C8764F0BA; +fma.rn.f64 fd433, fd315, 0d3FE14CEDF8BB580B, fd432; +mul.f64 fd434, fd291, 0dBFEEB42A9BCD5057; +mul.f64 fd435, fd292, 0d3FD207E7FD768DBF; +sub.f64 fd436, fd434, fd435; +mul.f64 fd437, fd292, 0dBFEEB42A9BCD5057; +fma.rn.f64 fd438, fd291, 0d3FD207E7FD768DBF, fd437; +add.f64 %1, fd118, fd268; +add.f64 %0, fd117, fd267; +add.f64 %3, fd140, fd393; +add.f64 %2, fd139, fd391; +add.f64 %5, fd164, fd398; +add.f64 %4, fd163, fd396; +add.f64 %7, fd188, fd403; +add.f64 %6, fd187, fd401; +add.f64 %9, fd212, fd408; +add.f64 %8, fd211, fd406; +add.f64 %11, fd236, fd413; +add.f64 %10, fd235, fd411; +add.f64 %13, fd238, fd418; +add.f64 %12, fd237, fd416; +add.f64 %15, fd214, fd423; +add.f64 %14, fd213, fd421; +add.f64 %17, fd190, fd428; +add.f64 %16, fd189, fd426; +add.f64 %19, fd166, fd433; +add.f64 %18, fd165, fd431; +add.f64 %21, fd142, fd438; +add.f64 %20, fd141, fd436; +sub.f64 %23, fd118, fd268; +sub.f64 %22, fd117, fd267; +sub.f64 %25, fd140, fd393; +sub.f64 %24, fd139, fd391; +sub.f64 %27, fd164, fd398; +sub.f64 %26, fd163, fd396; +sub.f64 %29, fd188, fd403; +sub.f64 %28, fd187, fd401; +sub.f64 %31, fd212, fd408; +sub.f64 %30, fd211, fd406; +sub.f64 %33, fd236, fd413; +sub.f64 %32, fd235, fd411; +sub.f64 %35, fd238, fd418; +sub.f64 %34, fd237, fd416; +sub.f64 %37, fd214, fd423; +sub.f64 %36, fd213, fd421; +sub.f64 %39, fd190, fd428; +sub.f64 %38, fd189, fd426; +sub.f64 %41, fd166, fd433; +sub.f64 %40, fd165, fd431; +sub.f64 %43, fd142, fd438; +sub.f64 %42, fd141, fd436; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), 
"=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..470da42b85cea --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_fwd.hpp.inc @@ -0,0 +1,5266 @@ +#ifndef CUFFTDX_FFT_23_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_23_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<755, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<507>; +.reg .b32 r<3257>; +.reg .f64 fd<485>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %48, %90; +} +{ +add.f16x2 r4, %49, %91; +} +{ +sub.f16x2 r7, %48, %90; +} +{ +sub.f16x2 r10, %49, %91; +} +{ +add.f16x2 r13, %50, %88; +} +{ +add.f16x2 r16, %51, %89; +} +{ +sub.f16x2 r19, %50, %88; +} +{ +sub.f16x2 r22, %51, %89; +} +{ +add.f16x2 r25, %52, %86; +} +{ +add.f16x2 r28, %53, %87; +} +{ +sub.f16x2 r31, %52, %86; +} +{ +sub.f16x2 r34, %53, %87; +} +{ +add.f16x2 r37, %54, %84; +} +{ +add.f16x2 r40, %55, %85; +} +{ +sub.f16x2 r43, %54, %84; +} +{ +sub.f16x2 r46, %55, %85; +} +{ +add.f16x2 r49, %56, %82; +} +{ +add.f16x2 r52, %57, %83; +} +{ +sub.f16x2 r55, %56, %82; +} +{ +sub.f16x2 r58, %57, %83; +} +{ +add.f16x2 r61, %58, %80; +} +{ +add.f16x2 r64, %59, %81; +} +{ +sub.f16x2 r67, %58, %80; +} +{ +sub.f16x2 r70, %59, %81; +} +{ +add.f16x2 r73, %60, %78; +} +{ +add.f16x2 r76, %61, %79; +} +{ +sub.f16x2 r79, %60, %78; +} +{ +sub.f16x2 r82, %61, %79; +} +{ +add.f16x2 r85, %62, %76; +} +{ +add.f16x2 r88, %63, %77; +} +{ +sub.f16x2 r91, %62, %76; +} +{ +sub.f16x2 r94, %63, %77; +} +{ +add.f16x2 r97, %64, %74; +} +{ +add.f16x2 r100, %65, %75; +} +{ +sub.f16x2 r103, %64, %74; +} +{ +sub.f16x2 r106, %65, %75; +} +{ +add.f16x2 r109, %66, %72; +} +{ +add.f16x2 r112, %67, %73; +} +{ +sub.f16x2 r115, %66, %72; +} +{ +sub.f16x2 r118, %67, %73; +} +{ +add.f16x2 r121, %68, %70; +} +{ +add.f16x2 r124, %69, %71; +} +{ +sub.f16x2 r127, %68, %70; +} +{ +sub.f16x2 r130, %69, %71; +} +{ +add.f16x2 r133, %46, r1; +} +{ +add.f16x2 r136, %47, r4; +} +{ +add.f16x2 r139, r133, r13; +} +{ +add.f16x2 r142, r136, r16; +} +{ +add.f16x2 r145, r139, r25; +} +{ +add.f16x2 r148, r142, r28; +} +{ +add.f16x2 r151, r145, r37; +} +{ +add.f16x2 r154, r148, r40; +} +{ +add.f16x2 r157, r151, r49; +} +{ +add.f16x2 r160, r154, r52; +} +{ +add.f16x2 r163, r157, r61; +} +{ +add.f16x2 
r166, r160, r64; +} +{ +add.f16x2 r169, r163, r73; +} +{ +add.f16x2 r172, r166, r76; +} +{ +add.f16x2 r175, r169, r85; +} +{ +add.f16x2 r178, r172, r88; +} +{ +add.f16x2 r181, r175, r97; +} +{ +add.f16x2 r184, r178, r100; +} +{ +add.f16x2 r187, r181, r109; +} +{ +add.f16x2 r190, r184, r112; +} +{ +add.f16x2 %0, r187, r121; +} +{ +add.f16x2 %1, r190, r124; +} +mov.u32 r2980, 0; +cvt.rn.f16.s32 rs1, r2980; +mov.b32 r211, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r2980; +mov.b32 r223, {rs2, rs2}; +mov.f64 fd447, 0d3FEED037EA3D2DBB; +{ +cvt.rn.f16.f64 rs3, fd447; +} +mov.b32 r203, {rs3, rs3}; +{ +mul.f16x2 r201, r1, r203; +} +{ +add.f16x2 r204, %46, r201; +} +mov.f64 fd424, 0dBFD14459AD2BE466; +{ +cvt.rn.f16.f64 rs4, fd424; +} +mov.b32 r209, {rs4, rs4}; +{ +mul.f16x2 r207, r10, r209; +} +{ +add.f16x2 r210, r211, r207; +} +{ +cvt.rn.f16.f64 rs5, fd447; +} +mov.b32 r215, {rs5, rs5}; +{ +mul.f16x2 r213, r4, r215; +} +{ +add.f16x2 r216, %47, r213; +} +{ +cvt.rn.f16.f64 rs6, fd424; +} +mov.b32 r221, {rs6, rs6}; +{ +mul.f16x2 r219, r7, r221; +} +{ +add.f16x2 r222, r223, r219; +} +mov.f64 fd455, 0d3FEB57675CF309EE; +{ +cvt.rn.f16.f64 rs7, fd455; +} +mov.b32 r227, {rs7, rs7}; +{ +mul.f16x2 r225, r13, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd332, 0dBFE0A06E851DB7CA; +{ +cvt.rn.f16.f64 rs8, fd332; +} +mov.b32 r233, {rs8, rs8}; +{ +mul.f16x2 r231, r22, r233; +} +{ +add.f16x2 r234, r210, r231; +} +{ +cvt.rn.f16.f64 rs9, fd455; +} +mov.b32 r239, {rs9, rs9}; +{ +mul.f16x2 r237, r16, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs10, fd332; +} +mov.b32 r245, {rs10, rs10}; +{ +mul.f16x2 r243, r19, r245; +} +{ +add.f16x2 r246, r222, r243; +} +mov.f64 fd463, 0d3FE5D779B07CFEF7; +{ +cvt.rn.f16.f64 rs11, fd463; +} +mov.b32 r251, {rs11, rs11}; +{ +mul.f16x2 r249, r25, r251; +} +{ +add.f16x2 r252, r228, r249; +} +mov.f64 fd384, 0dBFE763021AAA15DA; +{ +cvt.rn.f16.f64 rs12, fd384; +} +mov.b32 r257, {rs12, rs12}; +{ +mul.f16x2 r255, r34, r257; +} +{ +add.f16x2 r258, 
r234, r255; +} +{ +cvt.rn.f16.f64 rs13, fd463; +} +mov.b32 r263, {rs13, rs13}; +{ +mul.f16x2 r261, r28, r263; +} +{ +add.f16x2 r264, r240, r261; +} +{ +cvt.rn.f16.f64 rs14, fd384; +} +mov.b32 r269, {rs14, rs14}; +{ +mul.f16x2 r267, r31, r269; +} +{ +add.f16x2 r270, r246, r267; +} +mov.f64 fd471, 0d3FDD71B4A0C5A6C8; +{ +cvt.rn.f16.f64 rs15, fd471; +} +mov.b32 r275, {rs15, rs15}; +{ +mul.f16x2 r273, r37, r275; +} +{ +add.f16x2 r276, r252, r273; +} +mov.f64 fd416, 0dBFEC698E42F47B09; +{ +cvt.rn.f16.f64 rs16, fd416; +} +mov.b32 r281, {rs16, rs16}; +{ +mul.f16x2 r279, r46, r281; +} +{ +add.f16x2 r282, r258, r279; +} +{ +cvt.rn.f16.f64 rs17, fd471; +} +mov.b32 r287, {rs17, rs17}; +{ +mul.f16x2 r285, r40, r287; +} +{ +add.f16x2 r288, r264, r285; +} +{ +cvt.rn.f16.f64 rs18, fd416; +} +mov.b32 r293, {rs18, rs18}; +{ +mul.f16x2 r291, r43, r293; +} +{ +add.f16x2 r294, r270, r291; +} +mov.f64 fd479, 0d3FCA0AD8BD1E2882; +{ +cvt.rn.f16.f64 rs19, fd479; +} +mov.b32 r299, {rs19, rs19}; +{ +mul.f16x2 r297, r49, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd280, 0dBFEF54A827142577; +{ +cvt.rn.f16.f64 rs20, fd280; +} +mov.b32 r305, {rs20, rs20}; +{ +mul.f16x2 r303, r58, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs21, fd479; +} +mov.b32 r311, {rs21, rs21}; +{ +mul.f16x2 r309, r52, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs22, fd280; +} +mov.b32 r317, {rs22, rs22}; +{ +mul.f16x2 r315, r55, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd483, 0dBFB17855B599F3B9; +{ +cvt.rn.f16.f64 rs23, fd483; +} +mov.b32 r323, {rs23, rs23}; +{ +mul.f16x2 r321, r61, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd484, 0dBFEFECE70DFD3EFB; +{ +cvt.rn.f16.f64 rs24, fd484; +} +mov.b32 r329, {rs24, rs24}; +{ +mul.f16x2 r327, r70, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs25, fd483; +} +mov.b32 r335, {rs25, rs25}; +{ +mul.f16x2 r333, r64, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs26, fd484; +} 
+mov.b32 r341, {rs26, rs26}; +{ +mul.f16x2 r339, r67, r341; +} +{ +add.f16x2 r342, r318, r339; +} +mov.f64 fd475, 0dBFD56EAAE597C776; +{ +cvt.rn.f16.f64 rs27, fd475; +} +mov.b32 r347, {rs27, rs27}; +{ +mul.f16x2 r345, r73, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd476, 0dBFEE270060999288; +{ +cvt.rn.f16.f64 rs28, fd476; +} +mov.b32 r353, {rs28, rs28}; +{ +mul.f16x2 r351, r82, r353; +} +{ +add.f16x2 r354, r330, r351; +} +{ +cvt.rn.f16.f64 rs29, fd475; +} +mov.b32 r359, {rs29, rs29}; +{ +mul.f16x2 r357, r76, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs30, fd476; +} +mov.b32 r365, {rs30, rs30}; +{ +mul.f16x2 r363, r79, r365; +} +{ +add.f16x2 r366, r342, r363; +} +mov.f64 fd467, 0dBFE2742A4A775CFB; +{ +cvt.rn.f16.f64 rs31, fd467; +} +mov.b32 r371, {rs31, rs31}; +{ +mul.f16x2 r369, r85, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd468, 0dBFEA249E0B897CA9; +{ +cvt.rn.f16.f64 rs32, fd468; +} +mov.b32 r377, {rs32, rs32}; +{ +mul.f16x2 r375, r94, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +cvt.rn.f16.f64 rs33, fd467; +} +mov.b32 r383, {rs33, rs33}; +{ +mul.f16x2 r381, r88, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs34, fd468; +} +mov.b32 r389, {rs34, rs34}; +{ +mul.f16x2 r387, r91, r389; +} +{ +add.f16x2 r390, r366, r387; +} +mov.f64 fd459, 0dBFE8D2A07C16D46F; +{ +cvt.rn.f16.f64 rs35, fd459; +} +mov.b32 r395, {rs35, rs35}; +{ +mul.f16x2 r393, r97, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd460, 0dBFE431DF5838F7EF; +{ +cvt.rn.f16.f64 rs36, fd460; +} +mov.b32 r401, {rs36, rs36}; +{ +mul.f16x2 r399, r106, r401; +} +{ +add.f16x2 r402, r378, r399; +} +{ +cvt.rn.f16.f64 rs37, fd459; +} +mov.b32 r407, {rs37, rs37}; +{ +mul.f16x2 r405, r100, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs38, fd460; +} +mov.b32 r413, {rs38, rs38}; +{ +mul.f16x2 r411, r103, r413; +} +{ +add.f16x2 r414, r390, r411; +} +mov.f64 fd451, 0dBFED59CB83EF99BC; +{ +cvt.rn.f16.f64 rs39, fd451; +} +mov.b32 r419, 
{rs39, rs39}; +{ +mul.f16x2 r417, r109, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 fd452, 0dBFD97F6748E524B2; +{ +cvt.rn.f16.f64 rs40, fd452; +} +mov.b32 r425, {rs40, rs40}; +{ +mul.f16x2 r423, r118, r425; +} +{ +add.f16x2 r426, r402, r423; +} +{ +cvt.rn.f16.f64 rs41, fd451; +} +mov.b32 r431, {rs41, rs41}; +{ +mul.f16x2 r429, r112, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs42, fd452; +} +mov.b32 r437, {rs42, rs42}; +{ +mul.f16x2 r435, r115, r437; +} +{ +add.f16x2 r438, r414, r435; +} +mov.f64 fd443, 0dBFEFB3B3035AA6CD; +{ +cvt.rn.f16.f64 rs43, fd443; +} +mov.b32 r443, {rs43, rs43}; +{ +mul.f16x2 r441, r121, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd444, 0dBFC16DE8A4564F0A; +{ +cvt.rn.f16.f64 rs44, fd444; +} +mov.b32 r449, {rs44, rs44}; +{ +mul.f16x2 r447, r130, r449; +} +{ +add.f16x2 r450, r426, r447; +} +{ +cvt.rn.f16.f64 rs45, fd443; +} +mov.b32 r455, {rs45, rs45}; +{ +mul.f16x2 r453, r124, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs46, fd444; +} +mov.b32 r461, {rs46, rs46}; +{ +mul.f16x2 r459, r127, r461; +} +{ +add.f16x2 r462, r438, r459; +} +{ +sub.f16x2 %2, r444, r450; +} +{ +add.f16x2 %3, r456, r462; +} +{ +add.f16x2 %44, r444, r450; +} +{ +sub.f16x2 %45, r456, r462; +} +cvt.rn.f16.s32 rs47, r2980; +mov.b32 r489, {rs47, rs47}; +cvt.rn.f16.s32 rs48, r2980; +mov.b32 r501, {rs48, rs48}; +{ +cvt.rn.f16.f64 rs49, fd455; +} +mov.b32 r481, {rs49, rs49}; +{ +mul.f16x2 r479, r1, r481; +} +{ +add.f16x2 r482, %46, r479; +} +{ +cvt.rn.f16.f64 rs50, fd332; +} +mov.b32 r487, {rs50, rs50}; +{ +mul.f16x2 r485, r10, r487; +} +{ +add.f16x2 r488, r489, r485; +} +{ +cvt.rn.f16.f64 rs51, fd455; +} +mov.b32 r493, {rs51, rs51}; +{ +mul.f16x2 r491, r4, r493; +} +{ +add.f16x2 r494, %47, r491; +} +{ +cvt.rn.f16.f64 rs52, fd332; +} +mov.b32 r499, {rs52, rs52}; +{ +mul.f16x2 r497, r7, r499; +} +{ +add.f16x2 r500, r501, r497; +} +{ +cvt.rn.f16.f64 rs53, fd471; +} +mov.b32 r505, {rs53, rs53}; +{ +mul.f16x2 r503, r13, 
r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs54, fd416; +} +mov.b32 r511, {rs54, rs54}; +{ +mul.f16x2 r509, r22, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs55, fd471; +} +mov.b32 r517, {rs55, rs55}; +{ +mul.f16x2 r515, r16, r517; +} +{ +add.f16x2 r518, r494, r515; +} +{ +cvt.rn.f16.f64 rs56, fd416; +} +mov.b32 r523, {rs56, rs56}; +{ +mul.f16x2 r521, r19, r523; +} +{ +add.f16x2 r524, r500, r521; +} +{ +cvt.rn.f16.f64 rs57, fd483; +} +mov.b32 r529, {rs57, rs57}; +{ +mul.f16x2 r527, r25, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs58, fd484; +} +mov.b32 r535, {rs58, rs58}; +{ +mul.f16x2 r533, r34, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs59, fd483; +} +mov.b32 r541, {rs59, rs59}; +{ +mul.f16x2 r539, r28, r541; +} +{ +add.f16x2 r542, r518, r539; +} +{ +cvt.rn.f16.f64 rs60, fd484; +} +mov.b32 r547, {rs60, rs60}; +{ +mul.f16x2 r545, r31, r547; +} +{ +add.f16x2 r548, r524, r545; +} +{ +cvt.rn.f16.f64 rs61, fd467; +} +mov.b32 r553, {rs61, rs61}; +{ +mul.f16x2 r551, r37, r553; +} +{ +add.f16x2 r554, r530, r551; +} +{ +cvt.rn.f16.f64 rs62, fd468; +} +mov.b32 r559, {rs62, rs62}; +{ +mul.f16x2 r557, r46, r559; +} +{ +add.f16x2 r560, r536, r557; +} +{ +cvt.rn.f16.f64 rs63, fd467; +} +mov.b32 r565, {rs63, rs63}; +{ +mul.f16x2 r563, r40, r565; +} +{ +add.f16x2 r566, r542, r563; +} +{ +cvt.rn.f16.f64 rs64, fd468; +} +mov.b32 r571, {rs64, rs64}; +{ +mul.f16x2 r569, r43, r571; +} +{ +add.f16x2 r572, r548, r569; +} +{ +cvt.rn.f16.f64 rs65, fd451; +} +mov.b32 r577, {rs65, rs65}; +{ +mul.f16x2 r575, r49, r577; +} +{ +add.f16x2 r578, r554, r575; +} +{ +cvt.rn.f16.f64 rs66, fd452; +} +mov.b32 r583, {rs66, rs66}; +{ +mul.f16x2 r581, r58, r583; +} +{ +add.f16x2 r584, r560, r581; +} +{ +cvt.rn.f16.f64 rs67, fd451; +} +mov.b32 r589, {rs67, rs67}; +{ +mul.f16x2 r587, r52, r589; +} +{ +add.f16x2 r590, r566, r587; +} +{ +cvt.rn.f16.f64 rs68, fd452; +} +mov.b32 r595, {rs68, rs68}; +{ +mul.f16x2 r593, r55, 
r595; +} +{ +add.f16x2 r596, r572, r593; +} +{ +cvt.rn.f16.f64 rs69, fd443; +} +mov.b32 r601, {rs69, rs69}; +{ +mul.f16x2 r599, r61, r601; +} +{ +add.f16x2 r602, r578, r599; +} +mov.f64 fd388, 0d3FC16DE8A4564F0A; +{ +cvt.rn.f16.f64 rs70, fd388; +} +mov.b32 r607, {rs70, rs70}; +{ +mul.f16x2 r605, r70, r607; +} +{ +add.f16x2 r608, r584, r605; +} +{ +cvt.rn.f16.f64 rs71, fd443; +} +mov.b32 r613, {rs71, rs71}; +{ +mul.f16x2 r611, r64, r613; +} +{ +add.f16x2 r614, r590, r611; +} +{ +cvt.rn.f16.f64 rs72, fd388; +} +mov.b32 r619, {rs72, rs72}; +{ +mul.f16x2 r617, r67, r619; +} +{ +add.f16x2 r620, r596, r617; +} +{ +cvt.rn.f16.f64 rs73, fd459; +} +mov.b32 r625, {rs73, rs73}; +{ +mul.f16x2 r623, r73, r625; +} +{ +add.f16x2 r626, r602, r623; +} +mov.f64 fd420, 0d3FE431DF5838F7EF; +{ +cvt.rn.f16.f64 rs74, fd420; +} +mov.b32 r631, {rs74, rs74}; +{ +mul.f16x2 r629, r82, r631; +} +{ +add.f16x2 r632, r608, r629; +} +{ +cvt.rn.f16.f64 rs75, fd459; +} +mov.b32 r637, {rs75, rs75}; +{ +mul.f16x2 r635, r76, r637; +} +{ +add.f16x2 r638, r614, r635; +} +{ +cvt.rn.f16.f64 rs76, fd420; +} +mov.b32 r643, {rs76, rs76}; +{ +mul.f16x2 r641, r79, r643; +} +{ +add.f16x2 r644, r620, r641; +} +{ +cvt.rn.f16.f64 rs77, fd475; +} +mov.b32 r649, {rs77, rs77}; +{ +mul.f16x2 r647, r85, r649; +} +{ +add.f16x2 r650, r626, r647; +} +mov.f64 fd316, 0d3FEE270060999288; +{ +cvt.rn.f16.f64 rs78, fd316; +} +mov.b32 r655, {rs78, rs78}; +{ +mul.f16x2 r653, r94, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs79, fd475; +} +mov.b32 r661, {rs79, rs79}; +{ +mul.f16x2 r659, r88, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs80, fd316; +} +mov.b32 r667, {rs80, rs80}; +{ +mul.f16x2 r665, r91, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs81, fd479; +} +mov.b32 r673, {rs81, rs81}; +{ +mul.f16x2 r671, r97, r673; +} +{ +add.f16x2 r674, r650, r671; +} +mov.f64 fd480, 0d3FEF54A827142577; +{ +cvt.rn.f16.f64 rs82, fd480; +} +mov.b32 r679, {rs82, rs82}; +{ +mul.f16x2 
r677, r106, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs83, fd479; +} +mov.b32 r685, {rs83, rs83}; +{ +mul.f16x2 r683, r100, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs84, fd480; +} +mov.b32 r691, {rs84, rs84}; +{ +mul.f16x2 r689, r103, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs85, fd463; +} +mov.b32 r697, {rs85, rs85}; +{ +mul.f16x2 r695, r109, r697; +} +{ +add.f16x2 r698, r674, r695; +} +mov.f64 fd464, 0d3FE763021AAA15DA; +{ +cvt.rn.f16.f64 rs86, fd464; +} +mov.b32 r703, {rs86, rs86}; +{ +mul.f16x2 r701, r118, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs87, fd463; +} +mov.b32 r709, {rs87, rs87}; +{ +mul.f16x2 r707, r112, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs88, fd464; +} +mov.b32 r715, {rs88, rs88}; +{ +mul.f16x2 r713, r115, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs89, fd447; +} +mov.b32 r721, {rs89, rs89}; +{ +mul.f16x2 r719, r121, r721; +} +{ +add.f16x2 r722, r698, r719; +} +mov.f64 fd448, 0d3FD14459AD2BE466; +{ +cvt.rn.f16.f64 rs90, fd448; +} +mov.b32 r727, {rs90, rs90}; +{ +mul.f16x2 r725, r130, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs91, fd447; +} +mov.b32 r733, {rs91, rs91}; +{ +mul.f16x2 r731, r124, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs92, fd448; +} +mov.b32 r739, {rs92, rs92}; +{ +mul.f16x2 r737, r127, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +sub.f16x2 %4, r722, r728; +} +{ +add.f16x2 %5, r734, r740; +} +{ +add.f16x2 %42, r722, r728; +} +{ +sub.f16x2 %43, r734, r740; +} +cvt.rn.f16.s32 rs93, r2980; +mov.b32 r767, {rs93, rs93}; +cvt.rn.f16.s32 rs94, r2980; +mov.b32 r779, {rs94, rs94}; +{ +cvt.rn.f16.f64 rs95, fd463; +} +mov.b32 r759, {rs95, rs95}; +{ +mul.f16x2 r757, r1, r759; +} +{ +add.f16x2 r760, %46, r757; +} +{ +cvt.rn.f16.f64 rs96, fd384; +} +mov.b32 r765, {rs96, rs96}; +{ +mul.f16x2 r763, r10, r765; +} +{ +add.f16x2 r766, r767, r763; +} +{ 
+cvt.rn.f16.f64 rs97, fd463; +} +mov.b32 r771, {rs97, rs97}; +{ +mul.f16x2 r769, r4, r771; +} +{ +add.f16x2 r772, %47, r769; +} +{ +cvt.rn.f16.f64 rs98, fd384; +} +mov.b32 r777, {rs98, rs98}; +{ +mul.f16x2 r775, r7, r777; +} +{ +add.f16x2 r778, r779, r775; +} +{ +cvt.rn.f16.f64 rs99, fd483; +} +mov.b32 r783, {rs99, rs99}; +{ +mul.f16x2 r781, r13, r783; +} +{ +add.f16x2 r784, r760, r781; +} +{ +cvt.rn.f16.f64 rs100, fd484; +} +mov.b32 r789, {rs100, rs100}; +{ +mul.f16x2 r787, r22, r789; +} +{ +add.f16x2 r790, r766, r787; +} +{ +cvt.rn.f16.f64 rs101, fd483; +} +mov.b32 r795, {rs101, rs101}; +{ +mul.f16x2 r793, r16, r795; +} +{ +add.f16x2 r796, r772, r793; +} +{ +cvt.rn.f16.f64 rs102, fd484; +} +mov.b32 r801, {rs102, rs102}; +{ +mul.f16x2 r799, r19, r801; +} +{ +add.f16x2 r802, r778, r799; +} +{ +cvt.rn.f16.f64 rs103, fd459; +} +mov.b32 r807, {rs103, rs103}; +{ +mul.f16x2 r805, r25, r807; +} +{ +add.f16x2 r808, r784, r805; +} +{ +cvt.rn.f16.f64 rs104, fd460; +} +mov.b32 r813, {rs104, rs104}; +{ +mul.f16x2 r811, r34, r813; +} +{ +add.f16x2 r814, r790, r811; +} +{ +cvt.rn.f16.f64 rs105, fd459; +} +mov.b32 r819, {rs105, rs105}; +{ +mul.f16x2 r817, r28, r819; +} +{ +add.f16x2 r820, r796, r817; +} +{ +cvt.rn.f16.f64 rs106, fd460; +} +mov.b32 r825, {rs106, rs106}; +{ +mul.f16x2 r823, r31, r825; +} +{ +add.f16x2 r826, r802, r823; +} +{ +cvt.rn.f16.f64 rs107, fd443; +} +mov.b32 r831, {rs107, rs107}; +{ +mul.f16x2 r829, r37, r831; +} +{ +add.f16x2 r832, r808, r829; +} +{ +cvt.rn.f16.f64 rs108, fd388; +} +mov.b32 r837, {rs108, rs108}; +{ +mul.f16x2 r835, r46, r837; +} +{ +add.f16x2 r838, r814, r835; +} +{ +cvt.rn.f16.f64 rs109, fd443; +} +mov.b32 r843, {rs109, rs109}; +{ +mul.f16x2 r841, r40, r843; +} +{ +add.f16x2 r844, r820, r841; +} +{ +cvt.rn.f16.f64 rs110, fd388; +} +mov.b32 r849, {rs110, rs110}; +{ +mul.f16x2 r847, r43, r849; +} +{ +add.f16x2 r850, r826, r847; +} +{ +cvt.rn.f16.f64 rs111, fd467; +} +mov.b32 r855, {rs111, rs111}; +{ +mul.f16x2 r853, r49, r855; +} +{ 
+add.f16x2 r856, r832, r853; +} +mov.f64 fd188, 0d3FEA249E0B897CA9; +{ +cvt.rn.f16.f64 rs112, fd188; +} +mov.b32 r861, {rs112, rs112}; +{ +mul.f16x2 r859, r58, r861; +} +{ +add.f16x2 r862, r838, r859; +} +{ +cvt.rn.f16.f64 rs113, fd467; +} +mov.b32 r867, {rs113, rs113}; +{ +mul.f16x2 r865, r52, r867; +} +{ +add.f16x2 r868, r844, r865; +} +{ +cvt.rn.f16.f64 rs114, fd188; +} +mov.b32 r873, {rs114, rs114}; +{ +mul.f16x2 r871, r55, r873; +} +{ +add.f16x2 r874, r850, r871; +} +{ +cvt.rn.f16.f64 rs115, fd479; +} +mov.b32 r879, {rs115, rs115}; +{ +mul.f16x2 r877, r61, r879; +} +{ +add.f16x2 r880, r856, r877; +} +{ +cvt.rn.f16.f64 rs116, fd480; +} +mov.b32 r885, {rs116, rs116}; +{ +mul.f16x2 r883, r70, r885; +} +{ +add.f16x2 r886, r862, r883; +} +{ +cvt.rn.f16.f64 rs117, fd479; +} +mov.b32 r891, {rs117, rs117}; +{ +mul.f16x2 r889, r64, r891; +} +{ +add.f16x2 r892, r868, r889; +} +{ +cvt.rn.f16.f64 rs118, fd480; +} +mov.b32 r897, {rs118, rs118}; +{ +mul.f16x2 r895, r67, r897; +} +{ +add.f16x2 r898, r874, r895; +} +{ +cvt.rn.f16.f64 rs119, fd455; +} +mov.b32 r903, {rs119, rs119}; +{ +mul.f16x2 r901, r73, r903; +} +{ +add.f16x2 r904, r880, r901; +} +mov.f64 fd456, 0d3FE0A06E851DB7CA; +{ +cvt.rn.f16.f64 rs120, fd456; +} +mov.b32 r909, {rs120, rs120}; +{ +mul.f16x2 r907, r82, r909; +} +{ +add.f16x2 r910, r886, r907; +} +{ +cvt.rn.f16.f64 rs121, fd455; +} +mov.b32 r915, {rs121, rs121}; +{ +mul.f16x2 r913, r76, r915; +} +{ +add.f16x2 r916, r892, r913; +} +{ +cvt.rn.f16.f64 rs122, fd456; +} +mov.b32 r921, {rs122, rs122}; +{ +mul.f16x2 r919, r79, r921; +} +{ +add.f16x2 r922, r898, r919; +} +{ +cvt.rn.f16.f64 rs123, fd447; +} +mov.b32 r927, {rs123, rs123}; +{ +mul.f16x2 r925, r85, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs124, fd424; +} +mov.b32 r933, {rs124, rs124}; +{ +mul.f16x2 r931, r94, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs125, fd447; +} +mov.b32 r939, {rs125, rs125}; +{ +mul.f16x2 r937, r88, r939; +} +{ +add.f16x2 r940, r916, 
r937; +} +{ +cvt.rn.f16.f64 rs126, fd424; +} +mov.b32 r945, {rs126, rs126}; +{ +mul.f16x2 r943, r91, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs127, fd471; +} +mov.b32 r951, {rs127, rs127}; +{ +mul.f16x2 r949, r97, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs128, fd416; +} +mov.b32 r957, {rs128, rs128}; +{ +mul.f16x2 r955, r106, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs129, fd471; +} +mov.b32 r963, {rs129, rs129}; +{ +mul.f16x2 r961, r100, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs130, fd416; +} +mov.b32 r969, {rs130, rs130}; +{ +mul.f16x2 r967, r103, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs131, fd475; +} +mov.b32 r975, {rs131, rs131}; +{ +mul.f16x2 r973, r109, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs132, fd476; +} +mov.b32 r981, {rs132, rs132}; +{ +mul.f16x2 r979, r118, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs133, fd475; +} +mov.b32 r987, {rs133, rs133}; +{ +mul.f16x2 r985, r112, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs134, fd476; +} +mov.b32 r993, {rs134, rs134}; +{ +mul.f16x2 r991, r115, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs135, fd451; +} +mov.b32 r999, {rs135, rs135}; +{ +mul.f16x2 r997, r121, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs136, fd452; +} +mov.b32 r1005, {rs136, rs136}; +{ +mul.f16x2 r1003, r130, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs137, fd451; +} +mov.b32 r1011, {rs137, rs137}; +{ +mul.f16x2 r1009, r124, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs138, fd452; +} +mov.b32 r1017, {rs138, rs138}; +{ +mul.f16x2 r1015, r127, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +sub.f16x2 %6, r1000, r1006; +} +{ +add.f16x2 %7, r1012, r1018; +} +{ +add.f16x2 %40, r1000, r1006; +} +{ +sub.f16x2 %41, r1012, r1018; +} +cvt.rn.f16.s32 rs139, r2980; +mov.b32 r1045, {rs139, 
rs139}; +cvt.rn.f16.s32 rs140, r2980; +mov.b32 r1057, {rs140, rs140}; +{ +cvt.rn.f16.f64 rs141, fd471; +} +mov.b32 r1037, {rs141, rs141}; +{ +mul.f16x2 r1035, r1, r1037; +} +{ +add.f16x2 r1038, %46, r1035; +} +{ +cvt.rn.f16.f64 rs142, fd416; +} +mov.b32 r1043, {rs142, rs142}; +{ +mul.f16x2 r1041, r10, r1043; +} +{ +add.f16x2 r1044, r1045, r1041; +} +{ +cvt.rn.f16.f64 rs143, fd471; +} +mov.b32 r1049, {rs143, rs143}; +{ +mul.f16x2 r1047, r4, r1049; +} +{ +add.f16x2 r1050, %47, r1047; +} +{ +cvt.rn.f16.f64 rs144, fd416; +} +mov.b32 r1055, {rs144, rs144}; +{ +mul.f16x2 r1053, r7, r1055; +} +{ +add.f16x2 r1056, r1057, r1053; +} +{ +cvt.rn.f16.f64 rs145, fd467; +} +mov.b32 r1061, {rs145, rs145}; +{ +mul.f16x2 r1059, r13, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs146, fd468; +} +mov.b32 r1067, {rs146, rs146}; +{ +mul.f16x2 r1065, r22, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +cvt.rn.f16.f64 rs147, fd467; +} +mov.b32 r1073, {rs147, rs147}; +{ +mul.f16x2 r1071, r16, r1073; +} +{ +add.f16x2 r1074, r1050, r1071; +} +{ +cvt.rn.f16.f64 rs148, fd468; +} +mov.b32 r1079, {rs148, rs148}; +{ +mul.f16x2 r1077, r19, r1079; +} +{ +add.f16x2 r1080, r1056, r1077; +} +{ +cvt.rn.f16.f64 rs149, fd443; +} +mov.b32 r1085, {rs149, rs149}; +{ +mul.f16x2 r1083, r25, r1085; +} +{ +add.f16x2 r1086, r1062, r1083; +} +{ +cvt.rn.f16.f64 rs150, fd388; +} +mov.b32 r1091, {rs150, rs150}; +{ +mul.f16x2 r1089, r34, r1091; +} +{ +add.f16x2 r1092, r1068, r1089; +} +{ +cvt.rn.f16.f64 rs151, fd443; +} +mov.b32 r1097, {rs151, rs151}; +{ +mul.f16x2 r1095, r28, r1097; +} +{ +add.f16x2 r1098, r1074, r1095; +} +{ +cvt.rn.f16.f64 rs152, fd388; +} +mov.b32 r1103, {rs152, rs152}; +{ +mul.f16x2 r1101, r31, r1103; +} +{ +add.f16x2 r1104, r1080, r1101; +} +{ +cvt.rn.f16.f64 rs153, fd475; +} +mov.b32 r1109, {rs153, rs153}; +{ +mul.f16x2 r1107, r37, r1109; +} +{ +add.f16x2 r1110, r1086, r1107; +} +{ +cvt.rn.f16.f64 rs154, fd316; +} +mov.b32 r1115, {rs154, rs154}; +{ +mul.f16x2 r1113, 
r46, r1115; +} +{ +add.f16x2 r1116, r1092, r1113; +} +{ +cvt.rn.f16.f64 rs155, fd475; +} +mov.b32 r1121, {rs155, rs155}; +{ +mul.f16x2 r1119, r40, r1121; +} +{ +add.f16x2 r1122, r1098, r1119; +} +{ +cvt.rn.f16.f64 rs156, fd316; +} +mov.b32 r1127, {rs156, rs156}; +{ +mul.f16x2 r1125, r43, r1127; +} +{ +add.f16x2 r1128, r1104, r1125; +} +{ +cvt.rn.f16.f64 rs157, fd463; +} +mov.b32 r1133, {rs157, rs157}; +{ +mul.f16x2 r1131, r49, r1133; +} +{ +add.f16x2 r1134, r1110, r1131; +} +{ +cvt.rn.f16.f64 rs158, fd464; +} +mov.b32 r1139, {rs158, rs158}; +{ +mul.f16x2 r1137, r58, r1139; +} +{ +add.f16x2 r1140, r1116, r1137; +} +{ +cvt.rn.f16.f64 rs159, fd463; +} +mov.b32 r1145, {rs159, rs159}; +{ +mul.f16x2 r1143, r52, r1145; +} +{ +add.f16x2 r1146, r1122, r1143; +} +{ +cvt.rn.f16.f64 rs160, fd464; +} +mov.b32 r1151, {rs160, rs160}; +{ +mul.f16x2 r1149, r55, r1151; +} +{ +add.f16x2 r1152, r1128, r1149; +} +{ +cvt.rn.f16.f64 rs161, fd447; +} +mov.b32 r1157, {rs161, rs161}; +{ +mul.f16x2 r1155, r61, r1157; +} +{ +add.f16x2 r1158, r1134, r1155; +} +{ +cvt.rn.f16.f64 rs162, fd424; +} +mov.b32 r1163, {rs162, rs162}; +{ +mul.f16x2 r1161, r70, r1163; +} +{ +add.f16x2 r1164, r1140, r1161; +} +{ +cvt.rn.f16.f64 rs163, fd447; +} +mov.b32 r1169, {rs163, rs163}; +{ +mul.f16x2 r1167, r64, r1169; +} +{ +add.f16x2 r1170, r1146, r1167; +} +{ +cvt.rn.f16.f64 rs164, fd424; +} +mov.b32 r1175, {rs164, rs164}; +{ +mul.f16x2 r1173, r67, r1175; +} +{ +add.f16x2 r1176, r1152, r1173; +} +{ +cvt.rn.f16.f64 rs165, fd479; +} +mov.b32 r1181, {rs165, rs165}; +{ +mul.f16x2 r1179, r73, r1181; +} +{ +add.f16x2 r1182, r1158, r1179; +} +{ +cvt.rn.f16.f64 rs166, fd280; +} +mov.b32 r1187, {rs166, rs166}; +{ +mul.f16x2 r1185, r82, r1187; +} +{ +add.f16x2 r1188, r1164, r1185; +} +{ +cvt.rn.f16.f64 rs167, fd479; +} +mov.b32 r1193, {rs167, rs167}; +{ +mul.f16x2 r1191, r76, r1193; +} +{ +add.f16x2 r1194, r1170, r1191; +} +{ +cvt.rn.f16.f64 rs168, fd280; +} +mov.b32 r1199, {rs168, rs168}; +{ +mul.f16x2 r1197, r79, r1199; 
+} +{ +add.f16x2 r1200, r1176, r1197; +} +{ +cvt.rn.f16.f64 rs169, fd459; +} +mov.b32 r1205, {rs169, rs169}; +{ +mul.f16x2 r1203, r85, r1205; +} +{ +add.f16x2 r1206, r1182, r1203; +} +{ +cvt.rn.f16.f64 rs170, fd460; +} +mov.b32 r1211, {rs170, rs170}; +{ +mul.f16x2 r1209, r94, r1211; +} +{ +add.f16x2 r1212, r1188, r1209; +} +{ +cvt.rn.f16.f64 rs171, fd459; +} +mov.b32 r1217, {rs171, rs171}; +{ +mul.f16x2 r1215, r88, r1217; +} +{ +add.f16x2 r1218, r1194, r1215; +} +{ +cvt.rn.f16.f64 rs172, fd460; +} +mov.b32 r1223, {rs172, rs172}; +{ +mul.f16x2 r1221, r91, r1223; +} +{ +add.f16x2 r1224, r1200, r1221; +} +{ +cvt.rn.f16.f64 rs173, fd451; +} +mov.b32 r1229, {rs173, rs173}; +{ +mul.f16x2 r1227, r97, r1229; +} +{ +add.f16x2 r1230, r1206, r1227; +} +mov.f64 fd368, 0d3FD97F6748E524B2; +{ +cvt.rn.f16.f64 rs174, fd368; +} +mov.b32 r1235, {rs174, rs174}; +{ +mul.f16x2 r1233, r106, r1235; +} +{ +add.f16x2 r1236, r1212, r1233; +} +{ +cvt.rn.f16.f64 rs175, fd451; +} +mov.b32 r1241, {rs175, rs175}; +{ +mul.f16x2 r1239, r100, r1241; +} +{ +add.f16x2 r1242, r1218, r1239; +} +{ +cvt.rn.f16.f64 rs176, fd368; +} +mov.b32 r1247, {rs176, rs176}; +{ +mul.f16x2 r1245, r103, r1247; +} +{ +add.f16x2 r1248, r1224, r1245; +} +{ +cvt.rn.f16.f64 rs177, fd483; +} +mov.b32 r1253, {rs177, rs177}; +{ +mul.f16x2 r1251, r109, r1253; +} +{ +add.f16x2 r1254, r1230, r1251; +} +mov.f64 fd412, 0d3FEFECE70DFD3EFB; +{ +cvt.rn.f16.f64 rs178, fd412; +} +mov.b32 r1259, {rs178, rs178}; +{ +mul.f16x2 r1257, r118, r1259; +} +{ +add.f16x2 r1260, r1236, r1257; +} +{ +cvt.rn.f16.f64 rs179, fd483; +} +mov.b32 r1265, {rs179, rs179}; +{ +mul.f16x2 r1263, r112, r1265; +} +{ +add.f16x2 r1266, r1242, r1263; +} +{ +cvt.rn.f16.f64 rs180, fd412; +} +mov.b32 r1271, {rs180, rs180}; +{ +mul.f16x2 r1269, r115, r1271; +} +{ +add.f16x2 r1272, r1248, r1269; +} +{ +cvt.rn.f16.f64 rs181, fd455; +} +mov.b32 r1277, {rs181, rs181}; +{ +mul.f16x2 r1275, r121, r1277; +} +{ +add.f16x2 r1278, r1254, r1275; +} +{ +cvt.rn.f16.f64 rs182, fd456; 
+} +mov.b32 r1283, {rs182, rs182}; +{ +mul.f16x2 r1281, r130, r1283; +} +{ +add.f16x2 r1284, r1260, r1281; +} +{ +cvt.rn.f16.f64 rs183, fd455; +} +mov.b32 r1289, {rs183, rs183}; +{ +mul.f16x2 r1287, r124, r1289; +} +{ +add.f16x2 r1290, r1266, r1287; +} +{ +cvt.rn.f16.f64 rs184, fd456; +} +mov.b32 r1295, {rs184, rs184}; +{ +mul.f16x2 r1293, r127, r1295; +} +{ +add.f16x2 r1296, r1272, r1293; +} +{ +sub.f16x2 %8, r1278, r1284; +} +{ +add.f16x2 %9, r1290, r1296; +} +{ +add.f16x2 %38, r1278, r1284; +} +{ +sub.f16x2 %39, r1290, r1296; +} +cvt.rn.f16.s32 rs185, r2980; +mov.b32 r1323, {rs185, rs185}; +cvt.rn.f16.s32 rs186, r2980; +mov.b32 r1335, {rs186, rs186}; +{ +cvt.rn.f16.f64 rs187, fd479; +} +mov.b32 r1315, {rs187, rs187}; +{ +mul.f16x2 r1313, r1, r1315; +} +{ +add.f16x2 r1316, %46, r1313; +} +{ +cvt.rn.f16.f64 rs188, fd280; +} +mov.b32 r1321, {rs188, rs188}; +{ +mul.f16x2 r1319, r10, r1321; +} +{ +add.f16x2 r1322, r1323, r1319; +} +{ +cvt.rn.f16.f64 rs189, fd479; +} +mov.b32 r1327, {rs189, rs189}; +{ +mul.f16x2 r1325, r4, r1327; +} +{ +add.f16x2 r1328, %47, r1325; +} +{ +cvt.rn.f16.f64 rs190, fd280; +} +mov.b32 r1333, {rs190, rs190}; +{ +mul.f16x2 r1331, r7, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +cvt.rn.f16.f64 rs191, fd451; +} +mov.b32 r1339, {rs191, rs191}; +{ +mul.f16x2 r1337, r13, r1339; +} +{ +add.f16x2 r1340, r1316, r1337; +} +{ +cvt.rn.f16.f64 rs192, fd452; +} +mov.b32 r1345, {rs192, rs192}; +{ +mul.f16x2 r1343, r22, r1345; +} +{ +add.f16x2 r1346, r1322, r1343; +} +{ +cvt.rn.f16.f64 rs193, fd451; +} +mov.b32 r1351, {rs193, rs193}; +{ +mul.f16x2 r1349, r16, r1351; +} +{ +add.f16x2 r1352, r1328, r1349; +} +{ +cvt.rn.f16.f64 rs194, fd452; +} +mov.b32 r1357, {rs194, rs194}; +{ +mul.f16x2 r1355, r19, r1357; +} +{ +add.f16x2 r1358, r1334, r1355; +} +{ +cvt.rn.f16.f64 rs195, fd467; +} +mov.b32 r1363, {rs195, rs195}; +{ +mul.f16x2 r1361, r25, r1363; +} +{ +add.f16x2 r1364, r1340, r1361; +} +{ +cvt.rn.f16.f64 rs196, fd188; +} +mov.b32 r1369, {rs196, 
rs196}; +{ +mul.f16x2 r1367, r34, r1369; +} +{ +add.f16x2 r1370, r1346, r1367; +} +{ +cvt.rn.f16.f64 rs197, fd467; +} +mov.b32 r1375, {rs197, rs197}; +{ +mul.f16x2 r1373, r28, r1375; +} +{ +add.f16x2 r1376, r1352, r1373; +} +{ +cvt.rn.f16.f64 rs198, fd188; +} +mov.b32 r1381, {rs198, rs198}; +{ +mul.f16x2 r1379, r31, r1381; +} +{ +add.f16x2 r1382, r1358, r1379; +} +{ +cvt.rn.f16.f64 rs199, fd463; +} +mov.b32 r1387, {rs199, rs199}; +{ +mul.f16x2 r1385, r37, r1387; +} +{ +add.f16x2 r1388, r1364, r1385; +} +{ +cvt.rn.f16.f64 rs200, fd464; +} +mov.b32 r1393, {rs200, rs200}; +{ +mul.f16x2 r1391, r46, r1393; +} +{ +add.f16x2 r1394, r1370, r1391; +} +{ +cvt.rn.f16.f64 rs201, fd463; +} +mov.b32 r1399, {rs201, rs201}; +{ +mul.f16x2 r1397, r40, r1399; +} +{ +add.f16x2 r1400, r1376, r1397; +} +{ +cvt.rn.f16.f64 rs202, fd464; +} +mov.b32 r1405, {rs202, rs202}; +{ +mul.f16x2 r1403, r43, r1405; +} +{ +add.f16x2 r1406, r1382, r1403; +} +{ +cvt.rn.f16.f64 rs203, fd455; +} +mov.b32 r1411, {rs203, rs203}; +{ +mul.f16x2 r1409, r49, r1411; +} +{ +add.f16x2 r1412, r1388, r1409; +} +{ +cvt.rn.f16.f64 rs204, fd332; +} +mov.b32 r1417, {rs204, rs204}; +{ +mul.f16x2 r1415, r58, r1417; +} +{ +add.f16x2 r1418, r1394, r1415; +} +{ +cvt.rn.f16.f64 rs205, fd455; +} +mov.b32 r1423, {rs205, rs205}; +{ +mul.f16x2 r1421, r52, r1423; +} +{ +add.f16x2 r1424, r1400, r1421; +} +{ +cvt.rn.f16.f64 rs206, fd332; +} +mov.b32 r1429, {rs206, rs206}; +{ +mul.f16x2 r1427, r55, r1429; +} +{ +add.f16x2 r1430, r1406, r1427; +} +{ +cvt.rn.f16.f64 rs207, fd475; +} +mov.b32 r1435, {rs207, rs207}; +{ +mul.f16x2 r1433, r61, r1435; +} +{ +add.f16x2 r1436, r1412, r1433; +} +{ +cvt.rn.f16.f64 rs208, fd476; +} +mov.b32 r1441, {rs208, rs208}; +{ +mul.f16x2 r1439, r70, r1441; +} +{ +add.f16x2 r1442, r1418, r1439; +} +{ +cvt.rn.f16.f64 rs209, fd475; +} +mov.b32 r1447, {rs209, rs209}; +{ +mul.f16x2 r1445, r64, r1447; +} +{ +add.f16x2 r1448, r1424, r1445; +} +{ +cvt.rn.f16.f64 rs210, fd476; +} +mov.b32 r1453, {rs210, rs210}; +{ 
+mul.f16x2 r1451, r67, r1453; +} +{ +add.f16x2 r1454, r1430, r1451; +} +{ +cvt.rn.f16.f64 rs211, fd443; +} +mov.b32 r1459, {rs211, rs211}; +{ +mul.f16x2 r1457, r73, r1459; +} +{ +add.f16x2 r1460, r1436, r1457; +} +{ +cvt.rn.f16.f64 rs212, fd388; +} +mov.b32 r1465, {rs212, rs212}; +{ +mul.f16x2 r1463, r82, r1465; +} +{ +add.f16x2 r1466, r1442, r1463; +} +{ +cvt.rn.f16.f64 rs213, fd443; +} +mov.b32 r1471, {rs213, rs213}; +{ +mul.f16x2 r1469, r76, r1471; +} +{ +add.f16x2 r1472, r1448, r1469; +} +{ +cvt.rn.f16.f64 rs214, fd388; +} +mov.b32 r1477, {rs214, rs214}; +{ +mul.f16x2 r1475, r79, r1477; +} +{ +add.f16x2 r1478, r1454, r1475; +} +{ +cvt.rn.f16.f64 rs215, fd483; +} +mov.b32 r1483, {rs215, rs215}; +{ +mul.f16x2 r1481, r85, r1483; +} +{ +add.f16x2 r1484, r1460, r1481; +} +{ +cvt.rn.f16.f64 rs216, fd412; +} +mov.b32 r1489, {rs216, rs216}; +{ +mul.f16x2 r1487, r94, r1489; +} +{ +add.f16x2 r1490, r1466, r1487; +} +{ +cvt.rn.f16.f64 rs217, fd483; +} +mov.b32 r1495, {rs217, rs217}; +{ +mul.f16x2 r1493, r88, r1495; +} +{ +add.f16x2 r1496, r1472, r1493; +} +{ +cvt.rn.f16.f64 rs218, fd412; +} +mov.b32 r1501, {rs218, rs218}; +{ +mul.f16x2 r1499, r91, r1501; +} +{ +add.f16x2 r1502, r1478, r1499; +} +{ +cvt.rn.f16.f64 rs219, fd447; +} +mov.b32 r1507, {rs219, rs219}; +{ +mul.f16x2 r1505, r97, r1507; +} +{ +add.f16x2 r1508, r1484, r1505; +} +{ +cvt.rn.f16.f64 rs220, fd448; +} +mov.b32 r1513, {rs220, rs220}; +{ +mul.f16x2 r1511, r106, r1513; +} +{ +add.f16x2 r1514, r1490, r1511; +} +{ +cvt.rn.f16.f64 rs221, fd447; +} +mov.b32 r1519, {rs221, rs221}; +{ +mul.f16x2 r1517, r100, r1519; +} +{ +add.f16x2 r1520, r1496, r1517; +} +{ +cvt.rn.f16.f64 rs222, fd448; +} +mov.b32 r1525, {rs222, rs222}; +{ +mul.f16x2 r1523, r103, r1525; +} +{ +add.f16x2 r1526, r1502, r1523; +} +{ +cvt.rn.f16.f64 rs223, fd471; +} +mov.b32 r1531, {rs223, rs223}; +{ +mul.f16x2 r1529, r109, r1531; +} +{ +add.f16x2 r1532, r1508, r1529; +} +{ +cvt.rn.f16.f64 rs224, fd416; +} +mov.b32 r1537, {rs224, rs224}; +{ 
+mul.f16x2 r1535, r118, r1537; +} +{ +add.f16x2 r1538, r1514, r1535; +} +{ +cvt.rn.f16.f64 rs225, fd471; +} +mov.b32 r1543, {rs225, rs225}; +{ +mul.f16x2 r1541, r112, r1543; +} +{ +add.f16x2 r1544, r1520, r1541; +} +{ +cvt.rn.f16.f64 rs226, fd416; +} +mov.b32 r1549, {rs226, rs226}; +{ +mul.f16x2 r1547, r115, r1549; +} +{ +add.f16x2 r1550, r1526, r1547; +} +{ +cvt.rn.f16.f64 rs227, fd459; +} +mov.b32 r1555, {rs227, rs227}; +{ +mul.f16x2 r1553, r121, r1555; +} +{ +add.f16x2 r1556, r1532, r1553; +} +{ +cvt.rn.f16.f64 rs228, fd460; +} +mov.b32 r1561, {rs228, rs228}; +{ +mul.f16x2 r1559, r130, r1561; +} +{ +add.f16x2 r1562, r1538, r1559; +} +{ +cvt.rn.f16.f64 rs229, fd459; +} +mov.b32 r1567, {rs229, rs229}; +{ +mul.f16x2 r1565, r124, r1567; +} +{ +add.f16x2 r1568, r1544, r1565; +} +{ +cvt.rn.f16.f64 rs230, fd460; +} +mov.b32 r1573, {rs230, rs230}; +{ +mul.f16x2 r1571, r127, r1573; +} +{ +add.f16x2 r1574, r1550, r1571; +} +{ +sub.f16x2 %10, r1556, r1562; +} +{ +add.f16x2 %11, r1568, r1574; +} +{ +add.f16x2 %36, r1556, r1562; +} +{ +sub.f16x2 %37, r1568, r1574; +} +cvt.rn.f16.s32 rs231, r2980; +mov.b32 r1601, {rs231, rs231}; +cvt.rn.f16.s32 rs232, r2980; +mov.b32 r1613, {rs232, rs232}; +{ +cvt.rn.f16.f64 rs233, fd483; +} +mov.b32 r1593, {rs233, rs233}; +{ +mul.f16x2 r1591, r1, r1593; +} +{ +add.f16x2 r1594, %46, r1591; +} +{ +cvt.rn.f16.f64 rs234, fd484; +} +mov.b32 r1599, {rs234, rs234}; +{ +mul.f16x2 r1597, r10, r1599; +} +{ +add.f16x2 r1600, r1601, r1597; +} +{ +cvt.rn.f16.f64 rs235, fd483; +} +mov.b32 r1605, {rs235, rs235}; +{ +mul.f16x2 r1603, r4, r1605; +} +{ +add.f16x2 r1606, %47, r1603; +} +{ +cvt.rn.f16.f64 rs236, fd484; +} +mov.b32 r1611, {rs236, rs236}; +{ +mul.f16x2 r1609, r7, r1611; +} +{ +add.f16x2 r1612, r1613, r1609; +} +{ +cvt.rn.f16.f64 rs237, fd443; +} +mov.b32 r1617, {rs237, rs237}; +{ +mul.f16x2 r1615, r13, r1617; +} +{ +add.f16x2 r1618, r1594, r1615; +} +{ +cvt.rn.f16.f64 rs238, fd388; +} +mov.b32 r1623, {rs238, rs238}; +{ +mul.f16x2 r1621, r22, 
r1623; +} +{ +add.f16x2 r1624, r1600, r1621; +} +{ +cvt.rn.f16.f64 rs239, fd443; +} +mov.b32 r1629, {rs239, rs239}; +{ +mul.f16x2 r1627, r16, r1629; +} +{ +add.f16x2 r1630, r1606, r1627; +} +{ +cvt.rn.f16.f64 rs240, fd388; +} +mov.b32 r1635, {rs240, rs240}; +{ +mul.f16x2 r1633, r19, r1635; +} +{ +add.f16x2 r1636, r1612, r1633; +} +{ +cvt.rn.f16.f64 rs241, fd479; +} +mov.b32 r1641, {rs241, rs241}; +{ +mul.f16x2 r1639, r25, r1641; +} +{ +add.f16x2 r1642, r1618, r1639; +} +{ +cvt.rn.f16.f64 rs242, fd480; +} +mov.b32 r1647, {rs242, rs242}; +{ +mul.f16x2 r1645, r34, r1647; +} +{ +add.f16x2 r1648, r1624, r1645; +} +{ +cvt.rn.f16.f64 rs243, fd479; +} +mov.b32 r1653, {rs243, rs243}; +{ +mul.f16x2 r1651, r28, r1653; +} +{ +add.f16x2 r1654, r1630, r1651; +} +{ +cvt.rn.f16.f64 rs244, fd480; +} +mov.b32 r1659, {rs244, rs244}; +{ +mul.f16x2 r1657, r31, r1659; +} +{ +add.f16x2 r1660, r1636, r1657; +} +{ +cvt.rn.f16.f64 rs245, fd447; +} +mov.b32 r1665, {rs245, rs245}; +{ +mul.f16x2 r1663, r37, r1665; +} +{ +add.f16x2 r1666, r1642, r1663; +} +{ +cvt.rn.f16.f64 rs246, fd424; +} +mov.b32 r1671, {rs246, rs246}; +{ +mul.f16x2 r1669, r46, r1671; +} +{ +add.f16x2 r1672, r1648, r1669; +} +{ +cvt.rn.f16.f64 rs247, fd447; +} +mov.b32 r1677, {rs247, rs247}; +{ +mul.f16x2 r1675, r40, r1677; +} +{ +add.f16x2 r1678, r1654, r1675; +} +{ +cvt.rn.f16.f64 rs248, fd424; +} +mov.b32 r1683, {rs248, rs248}; +{ +mul.f16x2 r1681, r43, r1683; +} +{ +add.f16x2 r1684, r1660, r1681; +} +{ +cvt.rn.f16.f64 rs249, fd475; +} +mov.b32 r1689, {rs249, rs249}; +{ +mul.f16x2 r1687, r49, r1689; +} +{ +add.f16x2 r1690, r1666, r1687; +} +{ +cvt.rn.f16.f64 rs250, fd476; +} +mov.b32 r1695, {rs250, rs250}; +{ +mul.f16x2 r1693, r58, r1695; +} +{ +add.f16x2 r1696, r1672, r1693; +} +{ +cvt.rn.f16.f64 rs251, fd475; +} +mov.b32 r1701, {rs251, rs251}; +{ +mul.f16x2 r1699, r52, r1701; +} +{ +add.f16x2 r1702, r1678, r1699; +} +{ +cvt.rn.f16.f64 rs252, fd476; +} +mov.b32 r1707, {rs252, rs252}; +{ +mul.f16x2 r1705, r55, r1707; +} 
+{ +add.f16x2 r1708, r1684, r1705; +} +{ +cvt.rn.f16.f64 rs253, fd451; +} +mov.b32 r1713, {rs253, rs253}; +{ +mul.f16x2 r1711, r61, r1713; +} +{ +add.f16x2 r1714, r1690, r1711; +} +{ +cvt.rn.f16.f64 rs254, fd368; +} +mov.b32 r1719, {rs254, rs254}; +{ +mul.f16x2 r1717, r70, r1719; +} +{ +add.f16x2 r1720, r1696, r1717; +} +{ +cvt.rn.f16.f64 rs255, fd451; +} +mov.b32 r1725, {rs255, rs255}; +{ +mul.f16x2 r1723, r64, r1725; +} +{ +add.f16x2 r1726, r1702, r1723; +} +{ +cvt.rn.f16.f64 rs256, fd368; +} +mov.b32 r1731, {rs256, rs256}; +{ +mul.f16x2 r1729, r67, r1731; +} +{ +add.f16x2 r1732, r1708, r1729; +} +{ +cvt.rn.f16.f64 rs257, fd471; +} +mov.b32 r1737, {rs257, rs257}; +{ +mul.f16x2 r1735, r73, r1737; +} +{ +add.f16x2 r1738, r1714, r1735; +} +mov.f64 fd472, 0d3FEC698E42F47B09; +{ +cvt.rn.f16.f64 rs258, fd472; +} +mov.b32 r1743, {rs258, rs258}; +{ +mul.f16x2 r1741, r82, r1743; +} +{ +add.f16x2 r1744, r1720, r1741; +} +{ +cvt.rn.f16.f64 rs259, fd471; +} +mov.b32 r1749, {rs259, rs259}; +{ +mul.f16x2 r1747, r76, r1749; +} +{ +add.f16x2 r1750, r1726, r1747; +} +{ +cvt.rn.f16.f64 rs260, fd472; +} +mov.b32 r1755, {rs260, rs260}; +{ +mul.f16x2 r1753, r79, r1755; +} +{ +add.f16x2 r1756, r1732, r1753; +} +{ +cvt.rn.f16.f64 rs261, fd455; +} +mov.b32 r1761, {rs261, rs261}; +{ +mul.f16x2 r1759, r85, r1761; +} +{ +add.f16x2 r1762, r1738, r1759; +} +{ +cvt.rn.f16.f64 rs262, fd332; +} +mov.b32 r1767, {rs262, rs262}; +{ +mul.f16x2 r1765, r94, r1767; +} +{ +add.f16x2 r1768, r1744, r1765; +} +{ +cvt.rn.f16.f64 rs263, fd455; +} +mov.b32 r1773, {rs263, rs263}; +{ +mul.f16x2 r1771, r88, r1773; +} +{ +add.f16x2 r1774, r1750, r1771; +} +{ +cvt.rn.f16.f64 rs264, fd332; +} +mov.b32 r1779, {rs264, rs264}; +{ +mul.f16x2 r1777, r91, r1779; +} +{ +add.f16x2 r1780, r1756, r1777; +} +{ +cvt.rn.f16.f64 rs265, fd467; +} +mov.b32 r1785, {rs265, rs265}; +{ +mul.f16x2 r1783, r97, r1785; +} +{ +add.f16x2 r1786, r1762, r1783; +} +{ +cvt.rn.f16.f64 rs266, fd468; +} +mov.b32 r1791, {rs266, rs266}; +{ 
+mul.f16x2 r1789, r106, r1791; +} +{ +add.f16x2 r1792, r1768, r1789; +} +{ +cvt.rn.f16.f64 rs267, fd467; +} +mov.b32 r1797, {rs267, rs267}; +{ +mul.f16x2 r1795, r100, r1797; +} +{ +add.f16x2 r1798, r1774, r1795; +} +{ +cvt.rn.f16.f64 rs268, fd468; +} +mov.b32 r1803, {rs268, rs268}; +{ +mul.f16x2 r1801, r103, r1803; +} +{ +add.f16x2 r1804, r1780, r1801; +} +{ +cvt.rn.f16.f64 rs269, fd459; +} +mov.b32 r1809, {rs269, rs269}; +{ +mul.f16x2 r1807, r109, r1809; +} +{ +add.f16x2 r1810, r1786, r1807; +} +{ +cvt.rn.f16.f64 rs270, fd420; +} +mov.b32 r1815, {rs270, rs270}; +{ +mul.f16x2 r1813, r118, r1815; +} +{ +add.f16x2 r1816, r1792, r1813; +} +{ +cvt.rn.f16.f64 rs271, fd459; +} +mov.b32 r1821, {rs271, rs271}; +{ +mul.f16x2 r1819, r112, r1821; +} +{ +add.f16x2 r1822, r1798, r1819; +} +{ +cvt.rn.f16.f64 rs272, fd420; +} +mov.b32 r1827, {rs272, rs272}; +{ +mul.f16x2 r1825, r115, r1827; +} +{ +add.f16x2 r1828, r1804, r1825; +} +{ +cvt.rn.f16.f64 rs273, fd463; +} +mov.b32 r1833, {rs273, rs273}; +{ +mul.f16x2 r1831, r121, r1833; +} +{ +add.f16x2 r1834, r1810, r1831; +} +{ +cvt.rn.f16.f64 rs274, fd464; +} +mov.b32 r1839, {rs274, rs274}; +{ +mul.f16x2 r1837, r130, r1839; +} +{ +add.f16x2 r1840, r1816, r1837; +} +{ +cvt.rn.f16.f64 rs275, fd463; +} +mov.b32 r1845, {rs275, rs275}; +{ +mul.f16x2 r1843, r124, r1845; +} +{ +add.f16x2 r1846, r1822, r1843; +} +{ +cvt.rn.f16.f64 rs276, fd464; +} +mov.b32 r1851, {rs276, rs276}; +{ +mul.f16x2 r1849, r127, r1851; +} +{ +add.f16x2 r1852, r1828, r1849; +} +{ +sub.f16x2 %12, r1834, r1840; +} +{ +add.f16x2 %13, r1846, r1852; +} +{ +add.f16x2 %34, r1834, r1840; +} +{ +sub.f16x2 %35, r1846, r1852; +} +cvt.rn.f16.s32 rs277, r2980; +mov.b32 r1879, {rs277, rs277}; +cvt.rn.f16.s32 rs278, r2980; +mov.b32 r1891, {rs278, rs278}; +{ +cvt.rn.f16.f64 rs279, fd475; +} +mov.b32 r1871, {rs279, rs279}; +{ +mul.f16x2 r1869, r1, r1871; +} +{ +add.f16x2 r1872, %46, r1869; +} +{ +cvt.rn.f16.f64 rs280, fd476; +} +mov.b32 r1877, {rs280, rs280}; +{ +mul.f16x2 r1875, 
r10, r1877; +} +{ +add.f16x2 r1878, r1879, r1875; +} +{ +cvt.rn.f16.f64 rs281, fd475; +} +mov.b32 r1883, {rs281, rs281}; +{ +mul.f16x2 r1881, r4, r1883; +} +{ +add.f16x2 r1884, %47, r1881; +} +{ +cvt.rn.f16.f64 rs282, fd476; +} +mov.b32 r1889, {rs282, rs282}; +{ +mul.f16x2 r1887, r7, r1889; +} +{ +add.f16x2 r1890, r1891, r1887; +} +{ +cvt.rn.f16.f64 rs283, fd459; +} +mov.b32 r1895, {rs283, rs283}; +{ +mul.f16x2 r1893, r13, r1895; +} +{ +add.f16x2 r1896, r1872, r1893; +} +{ +cvt.rn.f16.f64 rs284, fd420; +} +mov.b32 r1901, {rs284, rs284}; +{ +mul.f16x2 r1899, r22, r1901; +} +{ +add.f16x2 r1902, r1878, r1899; +} +{ +cvt.rn.f16.f64 rs285, fd459; +} +mov.b32 r1907, {rs285, rs285}; +{ +mul.f16x2 r1905, r16, r1907; +} +{ +add.f16x2 r1908, r1884, r1905; +} +{ +cvt.rn.f16.f64 rs286, fd420; +} +mov.b32 r1913, {rs286, rs286}; +{ +mul.f16x2 r1911, r19, r1913; +} +{ +add.f16x2 r1914, r1890, r1911; +} +{ +cvt.rn.f16.f64 rs287, fd455; +} +mov.b32 r1919, {rs287, rs287}; +{ +mul.f16x2 r1917, r25, r1919; +} +{ +add.f16x2 r1920, r1896, r1917; +} +{ +cvt.rn.f16.f64 rs288, fd456; +} +mov.b32 r1925, {rs288, rs288}; +{ +mul.f16x2 r1923, r34, r1925; +} +{ +add.f16x2 r1926, r1902, r1923; +} +{ +cvt.rn.f16.f64 rs289, fd455; +} +mov.b32 r1931, {rs289, rs289}; +{ +mul.f16x2 r1929, r28, r1931; +} +{ +add.f16x2 r1932, r1908, r1929; +} +{ +cvt.rn.f16.f64 rs290, fd456; +} +mov.b32 r1937, {rs290, rs290}; +{ +mul.f16x2 r1935, r31, r1937; +} +{ +add.f16x2 r1938, r1914, r1935; +} +{ +cvt.rn.f16.f64 rs291, fd479; +} +mov.b32 r1943, {rs291, rs291}; +{ +mul.f16x2 r1941, r37, r1943; +} +{ +add.f16x2 r1944, r1920, r1941; +} +{ +cvt.rn.f16.f64 rs292, fd280; +} +mov.b32 r1949, {rs292, rs292}; +{ +mul.f16x2 r1947, r46, r1949; +} +{ +add.f16x2 r1950, r1926, r1947; +} +{ +cvt.rn.f16.f64 rs293, fd479; +} +mov.b32 r1955, {rs293, rs293}; +{ +mul.f16x2 r1953, r40, r1955; +} +{ +add.f16x2 r1956, r1932, r1953; +} +{ +cvt.rn.f16.f64 rs294, fd280; +} +mov.b32 r1961, {rs294, rs294}; +{ +mul.f16x2 r1959, r43, r1961; +} 
+{ +add.f16x2 r1962, r1938, r1959; +} +{ +cvt.rn.f16.f64 rs295, fd443; +} +mov.b32 r1967, {rs295, rs295}; +{ +mul.f16x2 r1965, r49, r1967; +} +{ +add.f16x2 r1968, r1944, r1965; +} +{ +cvt.rn.f16.f64 rs296, fd388; +} +mov.b32 r1973, {rs296, rs296}; +{ +mul.f16x2 r1971, r58, r1973; +} +{ +add.f16x2 r1974, r1950, r1971; +} +{ +cvt.rn.f16.f64 rs297, fd443; +} +mov.b32 r1979, {rs297, rs297}; +{ +mul.f16x2 r1977, r52, r1979; +} +{ +add.f16x2 r1980, r1956, r1977; +} +{ +cvt.rn.f16.f64 rs298, fd388; +} +mov.b32 r1985, {rs298, rs298}; +{ +mul.f16x2 r1983, r55, r1985; +} +{ +add.f16x2 r1986, r1962, r1983; +} +{ +cvt.rn.f16.f64 rs299, fd471; +} +mov.b32 r1991, {rs299, rs299}; +{ +mul.f16x2 r1989, r61, r1991; +} +{ +add.f16x2 r1992, r1968, r1989; +} +{ +cvt.rn.f16.f64 rs300, fd472; +} +mov.b32 r1997, {rs300, rs300}; +{ +mul.f16x2 r1995, r70, r1997; +} +{ +add.f16x2 r1998, r1974, r1995; +} +{ +cvt.rn.f16.f64 rs301, fd471; +} +mov.b32 r2003, {rs301, rs301}; +{ +mul.f16x2 r2001, r64, r2003; +} +{ +add.f16x2 r2004, r1980, r2001; +} +{ +cvt.rn.f16.f64 rs302, fd472; +} +mov.b32 r2009, {rs302, rs302}; +{ +mul.f16x2 r2007, r67, r2009; +} +{ +add.f16x2 r2010, r1986, r2007; +} +{ +cvt.rn.f16.f64 rs303, fd463; +} +mov.b32 r2015, {rs303, rs303}; +{ +mul.f16x2 r2013, r73, r2015; +} +{ +add.f16x2 r2016, r1992, r2013; +} +{ +cvt.rn.f16.f64 rs304, fd384; +} +mov.b32 r2021, {rs304, rs304}; +{ +mul.f16x2 r2019, r82, r2021; +} +{ +add.f16x2 r2022, r1998, r2019; +} +{ +cvt.rn.f16.f64 rs305, fd463; +} +mov.b32 r2027, {rs305, rs305}; +{ +mul.f16x2 r2025, r76, r2027; +} +{ +add.f16x2 r2028, r2004, r2025; +} +{ +cvt.rn.f16.f64 rs306, fd384; +} +mov.b32 r2033, {rs306, rs306}; +{ +mul.f16x2 r2031, r79, r2033; +} +{ +add.f16x2 r2034, r2010, r2031; +} +{ +cvt.rn.f16.f64 rs307, fd451; +} +mov.b32 r2039, {rs307, rs307}; +{ +mul.f16x2 r2037, r85, r2039; +} +{ +add.f16x2 r2040, r2016, r2037; +} +{ +cvt.rn.f16.f64 rs308, fd452; +} +mov.b32 r2045, {rs308, rs308}; +{ +mul.f16x2 r2043, r94, r2045; +} +{ 
+add.f16x2 r2046, r2022, r2043; +} +{ +cvt.rn.f16.f64 rs309, fd451; +} +mov.b32 r2051, {rs309, rs309}; +{ +mul.f16x2 r2049, r88, r2051; +} +{ +add.f16x2 r2052, r2028, r2049; +} +{ +cvt.rn.f16.f64 rs310, fd452; +} +mov.b32 r2057, {rs310, rs310}; +{ +mul.f16x2 r2055, r91, r2057; +} +{ +add.f16x2 r2058, r2034, r2055; +} +{ +cvt.rn.f16.f64 rs311, fd483; +} +mov.b32 r2063, {rs311, rs311}; +{ +mul.f16x2 r2061, r97, r2063; +} +{ +add.f16x2 r2064, r2040, r2061; +} +{ +cvt.rn.f16.f64 rs312, fd412; +} +mov.b32 r2069, {rs312, rs312}; +{ +mul.f16x2 r2067, r106, r2069; +} +{ +add.f16x2 r2070, r2046, r2067; +} +{ +cvt.rn.f16.f64 rs313, fd483; +} +mov.b32 r2075, {rs313, rs313}; +{ +mul.f16x2 r2073, r100, r2075; +} +{ +add.f16x2 r2076, r2052, r2073; +} +{ +cvt.rn.f16.f64 rs314, fd412; +} +mov.b32 r2081, {rs314, rs314}; +{ +mul.f16x2 r2079, r103, r2081; +} +{ +add.f16x2 r2082, r2058, r2079; +} +{ +cvt.rn.f16.f64 rs315, fd447; +} +mov.b32 r2087, {rs315, rs315}; +{ +mul.f16x2 r2085, r109, r2087; +} +{ +add.f16x2 r2088, r2064, r2085; +} +{ +cvt.rn.f16.f64 rs316, fd424; +} +mov.b32 r2093, {rs316, rs316}; +{ +mul.f16x2 r2091, r118, r2093; +} +{ +add.f16x2 r2094, r2070, r2091; +} +{ +cvt.rn.f16.f64 rs317, fd447; +} +mov.b32 r2099, {rs317, rs317}; +{ +mul.f16x2 r2097, r112, r2099; +} +{ +add.f16x2 r2100, r2076, r2097; +} +{ +cvt.rn.f16.f64 rs318, fd424; +} +mov.b32 r2105, {rs318, rs318}; +{ +mul.f16x2 r2103, r115, r2105; +} +{ +add.f16x2 r2106, r2082, r2103; +} +{ +cvt.rn.f16.f64 rs319, fd467; +} +mov.b32 r2111, {rs319, rs319}; +{ +mul.f16x2 r2109, r121, r2111; +} +{ +add.f16x2 r2112, r2088, r2109; +} +{ +cvt.rn.f16.f64 rs320, fd468; +} +mov.b32 r2117, {rs320, rs320}; +{ +mul.f16x2 r2115, r130, r2117; +} +{ +add.f16x2 r2118, r2094, r2115; +} +{ +cvt.rn.f16.f64 rs321, fd467; +} +mov.b32 r2123, {rs321, rs321}; +{ +mul.f16x2 r2121, r124, r2123; +} +{ +add.f16x2 r2124, r2100, r2121; +} +{ +cvt.rn.f16.f64 rs322, fd468; +} +mov.b32 r2129, {rs322, rs322}; +{ +mul.f16x2 r2127, r127, r2129; +} +{ 
+add.f16x2 r2130, r2106, r2127; +} +{ +sub.f16x2 %14, r2112, r2118; +} +{ +add.f16x2 %15, r2124, r2130; +} +{ +add.f16x2 %32, r2112, r2118; +} +{ +sub.f16x2 %33, r2124, r2130; +} +cvt.rn.f16.s32 rs323, r2980; +mov.b32 r2157, {rs323, rs323}; +cvt.rn.f16.s32 rs324, r2980; +mov.b32 r2169, {rs324, rs324}; +{ +cvt.rn.f16.f64 rs325, fd467; +} +mov.b32 r2149, {rs325, rs325}; +{ +mul.f16x2 r2147, r1, r2149; +} +{ +add.f16x2 r2150, %46, r2147; +} +{ +cvt.rn.f16.f64 rs326, fd468; +} +mov.b32 r2155, {rs326, rs326}; +{ +mul.f16x2 r2153, r10, r2155; +} +{ +add.f16x2 r2156, r2157, r2153; +} +{ +cvt.rn.f16.f64 rs327, fd467; +} +mov.b32 r2161, {rs327, rs327}; +{ +mul.f16x2 r2159, r4, r2161; +} +{ +add.f16x2 r2162, %47, r2159; +} +{ +cvt.rn.f16.f64 rs328, fd468; +} +mov.b32 r2167, {rs328, rs328}; +{ +mul.f16x2 r2165, r7, r2167; +} +{ +add.f16x2 r2168, r2169, r2165; +} +{ +cvt.rn.f16.f64 rs329, fd475; +} +mov.b32 r2173, {rs329, rs329}; +{ +mul.f16x2 r2171, r13, r2173; +} +{ +add.f16x2 r2174, r2150, r2171; +} +{ +cvt.rn.f16.f64 rs330, fd316; +} +mov.b32 r2179, {rs330, rs330}; +{ +mul.f16x2 r2177, r22, r2179; +} +{ +add.f16x2 r2180, r2156, r2177; +} +{ +cvt.rn.f16.f64 rs331, fd475; +} +mov.b32 r2185, {rs331, rs331}; +{ +mul.f16x2 r2183, r16, r2185; +} +{ +add.f16x2 r2186, r2162, r2183; +} +{ +cvt.rn.f16.f64 rs332, fd316; +} +mov.b32 r2191, {rs332, rs332}; +{ +mul.f16x2 r2189, r19, r2191; +} +{ +add.f16x2 r2192, r2168, r2189; +} +{ +cvt.rn.f16.f64 rs333, fd447; +} +mov.b32 r2197, {rs333, rs333}; +{ +mul.f16x2 r2195, r25, r2197; +} +{ +add.f16x2 r2198, r2174, r2195; +} +{ +cvt.rn.f16.f64 rs334, fd424; +} +mov.b32 r2203, {rs334, rs334}; +{ +mul.f16x2 r2201, r34, r2203; +} +{ +add.f16x2 r2204, r2180, r2201; +} +{ +cvt.rn.f16.f64 rs335, fd447; +} +mov.b32 r2209, {rs335, rs335}; +{ +mul.f16x2 r2207, r28, r2209; +} +{ +add.f16x2 r2210, r2186, r2207; +} +{ +cvt.rn.f16.f64 rs336, fd424; +} +mov.b32 r2215, {rs336, rs336}; +{ +mul.f16x2 r2213, r31, r2215; +} +{ +add.f16x2 r2216, r2192, r2213; +} 
+{ +cvt.rn.f16.f64 rs337, fd459; +} +mov.b32 r2221, {rs337, rs337}; +{ +mul.f16x2 r2219, r37, r2221; +} +{ +add.f16x2 r2222, r2198, r2219; +} +{ +cvt.rn.f16.f64 rs338, fd460; +} +mov.b32 r2227, {rs338, rs338}; +{ +mul.f16x2 r2225, r46, r2227; +} +{ +add.f16x2 r2228, r2204, r2225; +} +{ +cvt.rn.f16.f64 rs339, fd459; +} +mov.b32 r2233, {rs339, rs339}; +{ +mul.f16x2 r2231, r40, r2233; +} +{ +add.f16x2 r2234, r2210, r2231; +} +{ +cvt.rn.f16.f64 rs340, fd460; +} +mov.b32 r2239, {rs340, rs340}; +{ +mul.f16x2 r2237, r43, r2239; +} +{ +add.f16x2 r2240, r2216, r2237; +} +{ +cvt.rn.f16.f64 rs341, fd483; +} +mov.b32 r2245, {rs341, rs341}; +{ +mul.f16x2 r2243, r49, r2245; +} +{ +add.f16x2 r2246, r2222, r2243; +} +{ +cvt.rn.f16.f64 rs342, fd412; +} +mov.b32 r2251, {rs342, rs342}; +{ +mul.f16x2 r2249, r58, r2251; +} +{ +add.f16x2 r2252, r2228, r2249; +} +{ +cvt.rn.f16.f64 rs343, fd483; +} +mov.b32 r2257, {rs343, rs343}; +{ +mul.f16x2 r2255, r52, r2257; +} +{ +add.f16x2 r2258, r2234, r2255; +} +{ +cvt.rn.f16.f64 rs344, fd412; +} +mov.b32 r2263, {rs344, rs344}; +{ +mul.f16x2 r2261, r55, r2263; +} +{ +add.f16x2 r2264, r2240, r2261; +} +{ +cvt.rn.f16.f64 rs345, fd455; +} +mov.b32 r2269, {rs345, rs345}; +{ +mul.f16x2 r2267, r61, r2269; +} +{ +add.f16x2 r2270, r2246, r2267; +} +{ +cvt.rn.f16.f64 rs346, fd332; +} +mov.b32 r2275, {rs346, rs346}; +{ +mul.f16x2 r2273, r70, r2275; +} +{ +add.f16x2 r2276, r2252, r2273; +} +{ +cvt.rn.f16.f64 rs347, fd455; +} +mov.b32 r2281, {rs347, rs347}; +{ +mul.f16x2 r2279, r64, r2281; +} +{ +add.f16x2 r2282, r2258, r2279; +} +{ +cvt.rn.f16.f64 rs348, fd332; +} +mov.b32 r2287, {rs348, rs348}; +{ +mul.f16x2 r2285, r67, r2287; +} +{ +add.f16x2 r2288, r2264, r2285; +} +{ +cvt.rn.f16.f64 rs349, fd451; +} +mov.b32 r2293, {rs349, rs349}; +{ +mul.f16x2 r2291, r73, r2293; +} +{ +add.f16x2 r2294, r2270, r2291; +} +{ +cvt.rn.f16.f64 rs350, fd452; +} +mov.b32 r2299, {rs350, rs350}; +{ +mul.f16x2 r2297, r82, r2299; +} +{ +add.f16x2 r2300, r2276, r2297; +} +{ 
+cvt.rn.f16.f64 rs351, fd451; +} +mov.b32 r2305, {rs351, rs351}; +{ +mul.f16x2 r2303, r76, r2305; +} +{ +add.f16x2 r2306, r2282, r2303; +} +{ +cvt.rn.f16.f64 rs352, fd452; +} +mov.b32 r2311, {rs352, rs352}; +{ +mul.f16x2 r2309, r79, r2311; +} +{ +add.f16x2 r2312, r2288, r2309; +} +{ +cvt.rn.f16.f64 rs353, fd479; +} +mov.b32 r2317, {rs353, rs353}; +{ +mul.f16x2 r2315, r85, r2317; +} +{ +add.f16x2 r2318, r2294, r2315; +} +{ +cvt.rn.f16.f64 rs354, fd480; +} +mov.b32 r2323, {rs354, rs354}; +{ +mul.f16x2 r2321, r94, r2323; +} +{ +add.f16x2 r2324, r2300, r2321; +} +{ +cvt.rn.f16.f64 rs355, fd479; +} +mov.b32 r2329, {rs355, rs355}; +{ +mul.f16x2 r2327, r88, r2329; +} +{ +add.f16x2 r2330, r2306, r2327; +} +{ +cvt.rn.f16.f64 rs356, fd480; +} +mov.b32 r2335, {rs356, rs356}; +{ +mul.f16x2 r2333, r91, r2335; +} +{ +add.f16x2 r2336, r2312, r2333; +} +{ +cvt.rn.f16.f64 rs357, fd463; +} +mov.b32 r2341, {rs357, rs357}; +{ +mul.f16x2 r2339, r97, r2341; +} +{ +add.f16x2 r2342, r2318, r2339; +} +{ +cvt.rn.f16.f64 rs358, fd384; +} +mov.b32 r2347, {rs358, rs358}; +{ +mul.f16x2 r2345, r106, r2347; +} +{ +add.f16x2 r2348, r2324, r2345; +} +{ +cvt.rn.f16.f64 rs359, fd463; +} +mov.b32 r2353, {rs359, rs359}; +{ +mul.f16x2 r2351, r100, r2353; +} +{ +add.f16x2 r2354, r2330, r2351; +} +{ +cvt.rn.f16.f64 rs360, fd384; +} +mov.b32 r2359, {rs360, rs360}; +{ +mul.f16x2 r2357, r103, r2359; +} +{ +add.f16x2 r2360, r2336, r2357; +} +{ +cvt.rn.f16.f64 rs361, fd443; +} +mov.b32 r2365, {rs361, rs361}; +{ +mul.f16x2 r2363, r109, r2365; +} +{ +add.f16x2 r2366, r2342, r2363; +} +{ +cvt.rn.f16.f64 rs362, fd444; +} +mov.b32 r2371, {rs362, rs362}; +{ +mul.f16x2 r2369, r118, r2371; +} +{ +add.f16x2 r2372, r2348, r2369; +} +{ +cvt.rn.f16.f64 rs363, fd443; +} +mov.b32 r2377, {rs363, rs363}; +{ +mul.f16x2 r2375, r112, r2377; +} +{ +add.f16x2 r2378, r2354, r2375; +} +{ +cvt.rn.f16.f64 rs364, fd444; +} +mov.b32 r2383, {rs364, rs364}; +{ +mul.f16x2 r2381, r115, r2383; +} +{ +add.f16x2 r2384, r2360, r2381; +} +{ 
+cvt.rn.f16.f64 rs365, fd471; +} +mov.b32 r2389, {rs365, rs365}; +{ +mul.f16x2 r2387, r121, r2389; +} +{ +add.f16x2 r2390, r2366, r2387; +} +{ +cvt.rn.f16.f64 rs366, fd472; +} +mov.b32 r2395, {rs366, rs366}; +{ +mul.f16x2 r2393, r130, r2395; +} +{ +add.f16x2 r2396, r2372, r2393; +} +{ +cvt.rn.f16.f64 rs367, fd471; +} +mov.b32 r2401, {rs367, rs367}; +{ +mul.f16x2 r2399, r124, r2401; +} +{ +add.f16x2 r2402, r2378, r2399; +} +{ +cvt.rn.f16.f64 rs368, fd472; +} +mov.b32 r2407, {rs368, rs368}; +{ +mul.f16x2 r2405, r127, r2407; +} +{ +add.f16x2 r2408, r2384, r2405; +} +{ +sub.f16x2 %16, r2390, r2396; +} +{ +add.f16x2 %17, r2402, r2408; +} +{ +add.f16x2 %30, r2390, r2396; +} +{ +sub.f16x2 %31, r2402, r2408; +} +cvt.rn.f16.s32 rs369, r2980; +mov.b32 r2435, {rs369, rs369}; +cvt.rn.f16.s32 rs370, r2980; +mov.b32 r2447, {rs370, rs370}; +{ +cvt.rn.f16.f64 rs371, fd459; +} +mov.b32 r2427, {rs371, rs371}; +{ +mul.f16x2 r2425, r1, r2427; +} +{ +add.f16x2 r2428, %46, r2425; +} +{ +cvt.rn.f16.f64 rs372, fd460; +} +mov.b32 r2433, {rs372, rs372}; +{ +mul.f16x2 r2431, r10, r2433; +} +{ +add.f16x2 r2434, r2435, r2431; +} +{ +cvt.rn.f16.f64 rs373, fd459; +} +mov.b32 r2439, {rs373, rs373}; +{ +mul.f16x2 r2437, r4, r2439; +} +{ +add.f16x2 r2440, %47, r2437; +} +{ +cvt.rn.f16.f64 rs374, fd460; +} +mov.b32 r2445, {rs374, rs374}; +{ +mul.f16x2 r2443, r7, r2445; +} +{ +add.f16x2 r2446, r2447, r2443; +} +{ +cvt.rn.f16.f64 rs375, fd479; +} +mov.b32 r2451, {rs375, rs375}; +{ +mul.f16x2 r2449, r13, r2451; +} +{ +add.f16x2 r2452, r2428, r2449; +} +{ +cvt.rn.f16.f64 rs376, fd480; +} +mov.b32 r2457, {rs376, rs376}; +{ +mul.f16x2 r2455, r22, r2457; +} +{ +add.f16x2 r2458, r2434, r2455; +} +{ +cvt.rn.f16.f64 rs377, fd479; +} +mov.b32 r2463, {rs377, rs377}; +{ +mul.f16x2 r2461, r16, r2463; +} +{ +add.f16x2 r2464, r2440, r2461; +} +{ +cvt.rn.f16.f64 rs378, fd480; +} +mov.b32 r2469, {rs378, rs378}; +{ +mul.f16x2 r2467, r19, r2469; +} +{ +add.f16x2 r2470, r2446, r2467; +} +{ +cvt.rn.f16.f64 rs379, fd471; 
+} +mov.b32 r2475, {rs379, rs379}; +{ +mul.f16x2 r2473, r25, r2475; +} +{ +add.f16x2 r2476, r2452, r2473; +} +{ +cvt.rn.f16.f64 rs380, fd416; +} +mov.b32 r2481, {rs380, rs380}; +{ +mul.f16x2 r2479, r34, r2481; +} +{ +add.f16x2 r2482, r2458, r2479; +} +{ +cvt.rn.f16.f64 rs381, fd471; +} +mov.b32 r2487, {rs381, rs381}; +{ +mul.f16x2 r2485, r28, r2487; +} +{ +add.f16x2 r2488, r2464, r2485; +} +{ +cvt.rn.f16.f64 rs382, fd416; +} +mov.b32 r2493, {rs382, rs382}; +{ +mul.f16x2 r2491, r31, r2493; +} +{ +add.f16x2 r2494, r2470, r2491; +} +{ +cvt.rn.f16.f64 rs383, fd451; +} +mov.b32 r2499, {rs383, rs383}; +{ +mul.f16x2 r2497, r37, r2499; +} +{ +add.f16x2 r2500, r2476, r2497; +} +{ +cvt.rn.f16.f64 rs384, fd368; +} +mov.b32 r2505, {rs384, rs384}; +{ +mul.f16x2 r2503, r46, r2505; +} +{ +add.f16x2 r2506, r2482, r2503; +} +{ +cvt.rn.f16.f64 rs385, fd451; +} +mov.b32 r2511, {rs385, rs385}; +{ +mul.f16x2 r2509, r40, r2511; +} +{ +add.f16x2 r2512, r2488, r2509; +} +{ +cvt.rn.f16.f64 rs386, fd368; +} +mov.b32 r2517, {rs386, rs386}; +{ +mul.f16x2 r2515, r43, r2517; +} +{ +add.f16x2 r2518, r2494, r2515; +} +{ +cvt.rn.f16.f64 rs387, fd447; +} +mov.b32 r2523, {rs387, rs387}; +{ +mul.f16x2 r2521, r49, r2523; +} +{ +add.f16x2 r2524, r2500, r2521; +} +{ +cvt.rn.f16.f64 rs388, fd448; +} +mov.b32 r2529, {rs388, rs388}; +{ +mul.f16x2 r2527, r58, r2529; +} +{ +add.f16x2 r2530, r2506, r2527; +} +{ +cvt.rn.f16.f64 rs389, fd447; +} +mov.b32 r2535, {rs389, rs389}; +{ +mul.f16x2 r2533, r52, r2535; +} +{ +add.f16x2 r2536, r2512, r2533; +} +{ +cvt.rn.f16.f64 rs390, fd448; +} +mov.b32 r2541, {rs390, rs390}; +{ +mul.f16x2 r2539, r55, r2541; +} +{ +add.f16x2 r2542, r2518, r2539; +} +{ +cvt.rn.f16.f64 rs391, fd467; +} +mov.b32 r2547, {rs391, rs391}; +{ +mul.f16x2 r2545, r61, r2547; +} +{ +add.f16x2 r2548, r2524, r2545; +} +{ +cvt.rn.f16.f64 rs392, fd468; +} +mov.b32 r2553, {rs392, rs392}; +{ +mul.f16x2 r2551, r70, r2553; +} +{ +add.f16x2 r2554, r2530, r2551; +} +{ +cvt.rn.f16.f64 rs393, fd467; +} +mov.b32 
r2559, {rs393, rs393}; +{ +mul.f16x2 r2557, r64, r2559; +} +{ +add.f16x2 r2560, r2536, r2557; +} +{ +cvt.rn.f16.f64 rs394, fd468; +} +mov.b32 r2565, {rs394, rs394}; +{ +mul.f16x2 r2563, r67, r2565; +} +{ +add.f16x2 r2566, r2542, r2563; +} +{ +cvt.rn.f16.f64 rs395, fd483; +} +mov.b32 r2571, {rs395, rs395}; +{ +mul.f16x2 r2569, r73, r2571; +} +{ +add.f16x2 r2572, r2548, r2569; +} +{ +cvt.rn.f16.f64 rs396, fd412; +} +mov.b32 r2577, {rs396, rs396}; +{ +mul.f16x2 r2575, r82, r2577; +} +{ +add.f16x2 r2578, r2554, r2575; +} +{ +cvt.rn.f16.f64 rs397, fd483; +} +mov.b32 r2583, {rs397, rs397}; +{ +mul.f16x2 r2581, r76, r2583; +} +{ +add.f16x2 r2584, r2560, r2581; +} +{ +cvt.rn.f16.f64 rs398, fd412; +} +mov.b32 r2589, {rs398, rs398}; +{ +mul.f16x2 r2587, r79, r2589; +} +{ +add.f16x2 r2590, r2566, r2587; +} +{ +cvt.rn.f16.f64 rs399, fd463; +} +mov.b32 r2595, {rs399, rs399}; +{ +mul.f16x2 r2593, r85, r2595; +} +{ +add.f16x2 r2596, r2572, r2593; +} +{ +cvt.rn.f16.f64 rs400, fd384; +} +mov.b32 r2601, {rs400, rs400}; +{ +mul.f16x2 r2599, r94, r2601; +} +{ +add.f16x2 r2602, r2578, r2599; +} +{ +cvt.rn.f16.f64 rs401, fd463; +} +mov.b32 r2607, {rs401, rs401}; +{ +mul.f16x2 r2605, r88, r2607; +} +{ +add.f16x2 r2608, r2584, r2605; +} +{ +cvt.rn.f16.f64 rs402, fd384; +} +mov.b32 r2613, {rs402, rs402}; +{ +mul.f16x2 r2611, r91, r2613; +} +{ +add.f16x2 r2614, r2590, r2611; +} +{ +cvt.rn.f16.f64 rs403, fd443; +} +mov.b32 r2619, {rs403, rs403}; +{ +mul.f16x2 r2617, r97, r2619; +} +{ +add.f16x2 r2620, r2596, r2617; +} +{ +cvt.rn.f16.f64 rs404, fd388; +} +mov.b32 r2625, {rs404, rs404}; +{ +mul.f16x2 r2623, r106, r2625; +} +{ +add.f16x2 r2626, r2602, r2623; +} +{ +cvt.rn.f16.f64 rs405, fd443; +} +mov.b32 r2631, {rs405, rs405}; +{ +mul.f16x2 r2629, r100, r2631; +} +{ +add.f16x2 r2632, r2608, r2629; +} +{ +cvt.rn.f16.f64 rs406, fd388; +} +mov.b32 r2637, {rs406, rs406}; +{ +mul.f16x2 r2635, r103, r2637; +} +{ +add.f16x2 r2638, r2614, r2635; +} +{ +cvt.rn.f16.f64 rs407, fd455; +} +mov.b32 r2643, 
{rs407, rs407}; +{ +mul.f16x2 r2641, r109, r2643; +} +{ +add.f16x2 r2644, r2620, r2641; +} +{ +cvt.rn.f16.f64 rs408, fd456; +} +mov.b32 r2649, {rs408, rs408}; +{ +mul.f16x2 r2647, r118, r2649; +} +{ +add.f16x2 r2650, r2626, r2647; +} +{ +cvt.rn.f16.f64 rs409, fd455; +} +mov.b32 r2655, {rs409, rs409}; +{ +mul.f16x2 r2653, r112, r2655; +} +{ +add.f16x2 r2656, r2632, r2653; +} +{ +cvt.rn.f16.f64 rs410, fd456; +} +mov.b32 r2661, {rs410, rs410}; +{ +mul.f16x2 r2659, r115, r2661; +} +{ +add.f16x2 r2662, r2638, r2659; +} +{ +cvt.rn.f16.f64 rs411, fd475; +} +mov.b32 r2667, {rs411, rs411}; +{ +mul.f16x2 r2665, r121, r2667; +} +{ +add.f16x2 r2668, r2644, r2665; +} +{ +cvt.rn.f16.f64 rs412, fd476; +} +mov.b32 r2673, {rs412, rs412}; +{ +mul.f16x2 r2671, r130, r2673; +} +{ +add.f16x2 r2674, r2650, r2671; +} +{ +cvt.rn.f16.f64 rs413, fd475; +} +mov.b32 r2679, {rs413, rs413}; +{ +mul.f16x2 r2677, r124, r2679; +} +{ +add.f16x2 r2680, r2656, r2677; +} +{ +cvt.rn.f16.f64 rs414, fd476; +} +mov.b32 r2685, {rs414, rs414}; +{ +mul.f16x2 r2683, r127, r2685; +} +{ +add.f16x2 r2686, r2662, r2683; +} +{ +sub.f16x2 %18, r2668, r2674; +} +{ +add.f16x2 %19, r2680, r2686; +} +{ +add.f16x2 %28, r2668, r2674; +} +{ +sub.f16x2 %29, r2680, r2686; +} +cvt.rn.f16.s32 rs415, r2980; +mov.b32 r2713, {rs415, rs415}; +cvt.rn.f16.s32 rs416, r2980; +mov.b32 r2725, {rs416, rs416}; +{ +cvt.rn.f16.f64 rs417, fd451; +} +mov.b32 r2705, {rs417, rs417}; +{ +mul.f16x2 r2703, r1, r2705; +} +{ +add.f16x2 r2706, %46, r2703; +} +{ +cvt.rn.f16.f64 rs418, fd452; +} +mov.b32 r2711, {rs418, rs418}; +{ +mul.f16x2 r2709, r10, r2711; +} +{ +add.f16x2 r2712, r2713, r2709; +} +{ +cvt.rn.f16.f64 rs419, fd451; +} +mov.b32 r2717, {rs419, rs419}; +{ +mul.f16x2 r2715, r4, r2717; +} +{ +add.f16x2 r2718, %47, r2715; +} +{ +cvt.rn.f16.f64 rs420, fd452; +} +mov.b32 r2723, {rs420, rs420}; +{ +mul.f16x2 r2721, r7, r2723; +} +{ +add.f16x2 r2724, r2725, r2721; +} +{ +cvt.rn.f16.f64 rs421, fd463; +} +mov.b32 r2729, {rs421, rs421}; +{ 
+mul.f16x2 r2727, r13, r2729; +} +{ +add.f16x2 r2730, r2706, r2727; +} +{ +cvt.rn.f16.f64 rs422, fd464; +} +mov.b32 r2735, {rs422, rs422}; +{ +mul.f16x2 r2733, r22, r2735; +} +{ +add.f16x2 r2736, r2712, r2733; +} +{ +cvt.rn.f16.f64 rs423, fd463; +} +mov.b32 r2741, {rs423, rs423}; +{ +mul.f16x2 r2739, r16, r2741; +} +{ +add.f16x2 r2742, r2718, r2739; +} +{ +cvt.rn.f16.f64 rs424, fd464; +} +mov.b32 r2747, {rs424, rs424}; +{ +mul.f16x2 r2745, r19, r2747; +} +{ +add.f16x2 r2748, r2724, r2745; +} +{ +cvt.rn.f16.f64 rs425, fd475; +} +mov.b32 r2753, {rs425, rs425}; +{ +mul.f16x2 r2751, r25, r2753; +} +{ +add.f16x2 r2754, r2730, r2751; +} +{ +cvt.rn.f16.f64 rs426, fd476; +} +mov.b32 r2759, {rs426, rs426}; +{ +mul.f16x2 r2757, r34, r2759; +} +{ +add.f16x2 r2760, r2736, r2757; +} +{ +cvt.rn.f16.f64 rs427, fd475; +} +mov.b32 r2765, {rs427, rs427}; +{ +mul.f16x2 r2763, r28, r2765; +} +{ +add.f16x2 r2766, r2742, r2763; +} +{ +cvt.rn.f16.f64 rs428, fd476; +} +mov.b32 r2771, {rs428, rs428}; +{ +mul.f16x2 r2769, r31, r2771; +} +{ +add.f16x2 r2772, r2748, r2769; +} +{ +cvt.rn.f16.f64 rs429, fd483; +} +mov.b32 r2777, {rs429, rs429}; +{ +mul.f16x2 r2775, r37, r2777; +} +{ +add.f16x2 r2778, r2754, r2775; +} +{ +cvt.rn.f16.f64 rs430, fd412; +} +mov.b32 r2783, {rs430, rs430}; +{ +mul.f16x2 r2781, r46, r2783; +} +{ +add.f16x2 r2784, r2760, r2781; +} +{ +cvt.rn.f16.f64 rs431, fd483; +} +mov.b32 r2789, {rs431, rs431}; +{ +mul.f16x2 r2787, r40, r2789; +} +{ +add.f16x2 r2790, r2766, r2787; +} +{ +cvt.rn.f16.f64 rs432, fd412; +} +mov.b32 r2795, {rs432, rs432}; +{ +mul.f16x2 r2793, r43, r2795; +} +{ +add.f16x2 r2796, r2772, r2793; +} +{ +cvt.rn.f16.f64 rs433, fd471; +} +mov.b32 r2801, {rs433, rs433}; +{ +mul.f16x2 r2799, r49, r2801; +} +{ +add.f16x2 r2802, r2778, r2799; +} +{ +cvt.rn.f16.f64 rs434, fd416; +} +mov.b32 r2807, {rs434, rs434}; +{ +mul.f16x2 r2805, r58, r2807; +} +{ +add.f16x2 r2808, r2784, r2805; +} +{ +cvt.rn.f16.f64 rs435, fd471; +} +mov.b32 r2813, {rs435, rs435}; +{ +mul.f16x2 
r2811, r52, r2813; +} +{ +add.f16x2 r2814, r2790, r2811; +} +{ +cvt.rn.f16.f64 rs436, fd416; +} +mov.b32 r2819, {rs436, rs436}; +{ +mul.f16x2 r2817, r55, r2819; +} +{ +add.f16x2 r2820, r2796, r2817; +} +{ +cvt.rn.f16.f64 rs437, fd459; +} +mov.b32 r2825, {rs437, rs437}; +{ +mul.f16x2 r2823, r61, r2825; +} +{ +add.f16x2 r2826, r2802, r2823; +} +{ +cvt.rn.f16.f64 rs438, fd420; +} +mov.b32 r2831, {rs438, rs438}; +{ +mul.f16x2 r2829, r70, r2831; +} +{ +add.f16x2 r2832, r2808, r2829; +} +{ +cvt.rn.f16.f64 rs439, fd459; +} +mov.b32 r2837, {rs439, rs439}; +{ +mul.f16x2 r2835, r64, r2837; +} +{ +add.f16x2 r2838, r2814, r2835; +} +{ +cvt.rn.f16.f64 rs440, fd420; +} +mov.b32 r2843, {rs440, rs440}; +{ +mul.f16x2 r2841, r67, r2843; +} +{ +add.f16x2 r2844, r2820, r2841; +} +{ +cvt.rn.f16.f64 rs441, fd447; +} +mov.b32 r2849, {rs441, rs441}; +{ +mul.f16x2 r2847, r73, r2849; +} +{ +add.f16x2 r2850, r2826, r2847; +} +{ +cvt.rn.f16.f64 rs442, fd424; +} +mov.b32 r2855, {rs442, rs442}; +{ +mul.f16x2 r2853, r82, r2855; +} +{ +add.f16x2 r2856, r2832, r2853; +} +{ +cvt.rn.f16.f64 rs443, fd447; +} +mov.b32 r2861, {rs443, rs443}; +{ +mul.f16x2 r2859, r76, r2861; +} +{ +add.f16x2 r2862, r2838, r2859; +} +{ +cvt.rn.f16.f64 rs444, fd424; +} +mov.b32 r2867, {rs444, rs444}; +{ +mul.f16x2 r2865, r79, r2867; +} +{ +add.f16x2 r2868, r2844, r2865; +} +{ +cvt.rn.f16.f64 rs445, fd443; +} +mov.b32 r2873, {rs445, rs445}; +{ +mul.f16x2 r2871, r85, r2873; +} +{ +add.f16x2 r2874, r2850, r2871; +} +{ +cvt.rn.f16.f64 rs446, fd444; +} +mov.b32 r2879, {rs446, rs446}; +{ +mul.f16x2 r2877, r94, r2879; +} +{ +add.f16x2 r2880, r2856, r2877; +} +{ +cvt.rn.f16.f64 rs447, fd443; +} +mov.b32 r2885, {rs447, rs447}; +{ +mul.f16x2 r2883, r88, r2885; +} +{ +add.f16x2 r2886, r2862, r2883; +} +{ +cvt.rn.f16.f64 rs448, fd444; +} +mov.b32 r2891, {rs448, rs448}; +{ +mul.f16x2 r2889, r91, r2891; +} +{ +add.f16x2 r2892, r2868, r2889; +} +{ +cvt.rn.f16.f64 rs449, fd455; +} +mov.b32 r2897, {rs449, rs449}; +{ +mul.f16x2 r2895, r97, 
r2897; +} +{ +add.f16x2 r2898, r2874, r2895; +} +{ +cvt.rn.f16.f64 rs450, fd456; +} +mov.b32 r2903, {rs450, rs450}; +{ +mul.f16x2 r2901, r106, r2903; +} +{ +add.f16x2 r2904, r2880, r2901; +} +{ +cvt.rn.f16.f64 rs451, fd455; +} +mov.b32 r2909, {rs451, rs451}; +{ +mul.f16x2 r2907, r100, r2909; +} +{ +add.f16x2 r2910, r2886, r2907; +} +{ +cvt.rn.f16.f64 rs452, fd456; +} +mov.b32 r2915, {rs452, rs452}; +{ +mul.f16x2 r2913, r103, r2915; +} +{ +add.f16x2 r2916, r2892, r2913; +} +{ +cvt.rn.f16.f64 rs453, fd467; +} +mov.b32 r2921, {rs453, rs453}; +{ +mul.f16x2 r2919, r109, r2921; +} +{ +add.f16x2 r2922, r2898, r2919; +} +{ +cvt.rn.f16.f64 rs454, fd468; +} +mov.b32 r2927, {rs454, rs454}; +{ +mul.f16x2 r2925, r118, r2927; +} +{ +add.f16x2 r2928, r2904, r2925; +} +{ +cvt.rn.f16.f64 rs455, fd467; +} +mov.b32 r2933, {rs455, rs455}; +{ +mul.f16x2 r2931, r112, r2933; +} +{ +add.f16x2 r2934, r2910, r2931; +} +{ +cvt.rn.f16.f64 rs456, fd468; +} +mov.b32 r2939, {rs456, rs456}; +{ +mul.f16x2 r2937, r115, r2939; +} +{ +add.f16x2 r2940, r2916, r2937; +} +{ +cvt.rn.f16.f64 rs457, fd479; +} +mov.b32 r2945, {rs457, rs457}; +{ +mul.f16x2 r2943, r121, r2945; +} +{ +add.f16x2 r2946, r2922, r2943; +} +{ +cvt.rn.f16.f64 rs458, fd480; +} +mov.b32 r2951, {rs458, rs458}; +{ +mul.f16x2 r2949, r130, r2951; +} +{ +add.f16x2 r2952, r2928, r2949; +} +{ +cvt.rn.f16.f64 rs459, fd479; +} +mov.b32 r2957, {rs459, rs459}; +{ +mul.f16x2 r2955, r124, r2957; +} +{ +add.f16x2 r2958, r2934, r2955; +} +{ +cvt.rn.f16.f64 rs460, fd480; +} +mov.b32 r2963, {rs460, rs460}; +{ +mul.f16x2 r2961, r127, r2963; +} +{ +add.f16x2 r2964, r2940, r2961; +} +{ +sub.f16x2 %20, r2946, r2952; +} +{ +add.f16x2 %21, r2958, r2964; +} +{ +add.f16x2 %26, r2946, r2952; +} +{ +sub.f16x2 %27, r2958, r2964; +} +cvt.rn.f16.s32 rs461, r2980; +mov.b32 r2991, {rs461, rs461}; +cvt.rn.f16.s32 rs462, r2980; +mov.b32 r3003, {rs462, rs462}; +{ +cvt.rn.f16.f64 rs463, fd443; +} +mov.b32 r2983, {rs463, rs463}; +{ +mul.f16x2 r2981, r1, r2983; +} +{ 
+add.f16x2 r2984, %46, r2981; +} +{ +cvt.rn.f16.f64 rs464, fd444; +} +mov.b32 r2989, {rs464, rs464}; +{ +mul.f16x2 r2987, r10, r2989; +} +{ +add.f16x2 r2990, r2991, r2987; +} +{ +cvt.rn.f16.f64 rs465, fd443; +} +mov.b32 r2995, {rs465, rs465}; +{ +mul.f16x2 r2993, r4, r2995; +} +{ +add.f16x2 r2996, %47, r2993; +} +{ +cvt.rn.f16.f64 rs466, fd444; +} +mov.b32 r3001, {rs466, rs466}; +{ +mul.f16x2 r2999, r7, r3001; +} +{ +add.f16x2 r3002, r3003, r2999; +} +{ +cvt.rn.f16.f64 rs467, fd447; +} +mov.b32 r3007, {rs467, rs467}; +{ +mul.f16x2 r3005, r13, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs468, fd448; +} +mov.b32 r3013, {rs468, rs468}; +{ +mul.f16x2 r3011, r22, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs469, fd447; +} +mov.b32 r3019, {rs469, rs469}; +{ +mul.f16x2 r3017, r16, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs470, fd448; +} +mov.b32 r3025, {rs470, rs470}; +{ +mul.f16x2 r3023, r19, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs471, fd451; +} +mov.b32 r3031, {rs471, rs471}; +{ +mul.f16x2 r3029, r25, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs472, fd452; +} +mov.b32 r3037, {rs472, rs472}; +{ +mul.f16x2 r3035, r34, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} +{ +cvt.rn.f16.f64 rs473, fd451; +} +mov.b32 r3043, {rs473, rs473}; +{ +mul.f16x2 r3041, r28, r3043; +} +{ +add.f16x2 r3044, r3020, r3041; +} +{ +cvt.rn.f16.f64 rs474, fd452; +} +mov.b32 r3049, {rs474, rs474}; +{ +mul.f16x2 r3047, r31, r3049; +} +{ +add.f16x2 r3050, r3026, r3047; +} +{ +cvt.rn.f16.f64 rs475, fd455; +} +mov.b32 r3055, {rs475, rs475}; +{ +mul.f16x2 r3053, r37, r3055; +} +{ +add.f16x2 r3056, r3032, r3053; +} +{ +cvt.rn.f16.f64 rs476, fd456; +} +mov.b32 r3061, {rs476, rs476}; +{ +mul.f16x2 r3059, r46, r3061; +} +{ +add.f16x2 r3062, r3038, r3059; +} +{ +cvt.rn.f16.f64 rs477, fd455; +} +mov.b32 r3067, {rs477, rs477}; +{ +mul.f16x2 r3065, r40, r3067; +} +{ +add.f16x2 r3068, 
r3044, r3065; +} +{ +cvt.rn.f16.f64 rs478, fd456; +} +mov.b32 r3073, {rs478, rs478}; +{ +mul.f16x2 r3071, r43, r3073; +} +{ +add.f16x2 r3074, r3050, r3071; +} +{ +cvt.rn.f16.f64 rs479, fd459; +} +mov.b32 r3079, {rs479, rs479}; +{ +mul.f16x2 r3077, r49, r3079; +} +{ +add.f16x2 r3080, r3056, r3077; +} +{ +cvt.rn.f16.f64 rs480, fd460; +} +mov.b32 r3085, {rs480, rs480}; +{ +mul.f16x2 r3083, r58, r3085; +} +{ +add.f16x2 r3086, r3062, r3083; +} +{ +cvt.rn.f16.f64 rs481, fd459; +} +mov.b32 r3091, {rs481, rs481}; +{ +mul.f16x2 r3089, r52, r3091; +} +{ +add.f16x2 r3092, r3068, r3089; +} +{ +cvt.rn.f16.f64 rs482, fd460; +} +mov.b32 r3097, {rs482, rs482}; +{ +mul.f16x2 r3095, r55, r3097; +} +{ +add.f16x2 r3098, r3074, r3095; +} +{ +cvt.rn.f16.f64 rs483, fd463; +} +mov.b32 r3103, {rs483, rs483}; +{ +mul.f16x2 r3101, r61, r3103; +} +{ +add.f16x2 r3104, r3080, r3101; +} +{ +cvt.rn.f16.f64 rs484, fd464; +} +mov.b32 r3109, {rs484, rs484}; +{ +mul.f16x2 r3107, r70, r3109; +} +{ +add.f16x2 r3110, r3086, r3107; +} +{ +cvt.rn.f16.f64 rs485, fd463; +} +mov.b32 r3115, {rs485, rs485}; +{ +mul.f16x2 r3113, r64, r3115; +} +{ +add.f16x2 r3116, r3092, r3113; +} +{ +cvt.rn.f16.f64 rs486, fd464; +} +mov.b32 r3121, {rs486, rs486}; +{ +mul.f16x2 r3119, r67, r3121; +} +{ +add.f16x2 r3122, r3098, r3119; +} +{ +cvt.rn.f16.f64 rs487, fd467; +} +mov.b32 r3127, {rs487, rs487}; +{ +mul.f16x2 r3125, r73, r3127; +} +{ +add.f16x2 r3128, r3104, r3125; +} +{ +cvt.rn.f16.f64 rs488, fd468; +} +mov.b32 r3133, {rs488, rs488}; +{ +mul.f16x2 r3131, r82, r3133; +} +{ +add.f16x2 r3134, r3110, r3131; +} +{ +cvt.rn.f16.f64 rs489, fd467; +} +mov.b32 r3139, {rs489, rs489}; +{ +mul.f16x2 r3137, r76, r3139; +} +{ +add.f16x2 r3140, r3116, r3137; +} +{ +cvt.rn.f16.f64 rs490, fd468; +} +mov.b32 r3145, {rs490, rs490}; +{ +mul.f16x2 r3143, r79, r3145; +} +{ +add.f16x2 r3146, r3122, r3143; +} +{ +cvt.rn.f16.f64 rs491, fd471; +} +mov.b32 r3151, {rs491, rs491}; +{ +mul.f16x2 r3149, r85, r3151; +} +{ +add.f16x2 r3152, r3128, 
r3149; +} +{ +cvt.rn.f16.f64 rs492, fd472; +} +mov.b32 r3157, {rs492, rs492}; +{ +mul.f16x2 r3155, r94, r3157; +} +{ +add.f16x2 r3158, r3134, r3155; +} +{ +cvt.rn.f16.f64 rs493, fd471; +} +mov.b32 r3163, {rs493, rs493}; +{ +mul.f16x2 r3161, r88, r3163; +} +{ +add.f16x2 r3164, r3140, r3161; +} +{ +cvt.rn.f16.f64 rs494, fd472; +} +mov.b32 r3169, {rs494, rs494}; +{ +mul.f16x2 r3167, r91, r3169; +} +{ +add.f16x2 r3170, r3146, r3167; +} +{ +cvt.rn.f16.f64 rs495, fd475; +} +mov.b32 r3175, {rs495, rs495}; +{ +mul.f16x2 r3173, r97, r3175; +} +{ +add.f16x2 r3176, r3152, r3173; +} +{ +cvt.rn.f16.f64 rs496, fd476; +} +mov.b32 r3181, {rs496, rs496}; +{ +mul.f16x2 r3179, r106, r3181; +} +{ +add.f16x2 r3182, r3158, r3179; +} +{ +cvt.rn.f16.f64 rs497, fd475; +} +mov.b32 r3187, {rs497, rs497}; +{ +mul.f16x2 r3185, r100, r3187; +} +{ +add.f16x2 r3188, r3164, r3185; +} +{ +cvt.rn.f16.f64 rs498, fd476; +} +mov.b32 r3193, {rs498, rs498}; +{ +mul.f16x2 r3191, r103, r3193; +} +{ +add.f16x2 r3194, r3170, r3191; +} +{ +cvt.rn.f16.f64 rs499, fd479; +} +mov.b32 r3199, {rs499, rs499}; +{ +mul.f16x2 r3197, r109, r3199; +} +{ +add.f16x2 r3200, r3176, r3197; +} +{ +cvt.rn.f16.f64 rs500, fd480; +} +mov.b32 r3205, {rs500, rs500}; +{ +mul.f16x2 r3203, r118, r3205; +} +{ +add.f16x2 r3206, r3182, r3203; +} +{ +cvt.rn.f16.f64 rs501, fd479; +} +mov.b32 r3211, {rs501, rs501}; +{ +mul.f16x2 r3209, r112, r3211; +} +{ +add.f16x2 r3212, r3188, r3209; +} +{ +cvt.rn.f16.f64 rs502, fd480; +} +mov.b32 r3217, {rs502, rs502}; +{ +mul.f16x2 r3215, r115, r3217; +} +{ +add.f16x2 r3218, r3194, r3215; +} +{ +cvt.rn.f16.f64 rs503, fd483; +} +mov.b32 r3223, {rs503, rs503}; +{ +mul.f16x2 r3221, r121, r3223; +} +{ +add.f16x2 r3224, r3200, r3221; +} +{ +cvt.rn.f16.f64 rs504, fd484; +} +mov.b32 r3229, {rs504, rs504}; +{ +mul.f16x2 r3227, r130, r3229; +} +{ +add.f16x2 r3230, r3206, r3227; +} +{ +cvt.rn.f16.f64 rs505, fd483; +} +mov.b32 r3235, {rs505, rs505}; +{ +mul.f16x2 r3233, r124, r3235; +} +{ +add.f16x2 r3236, r3212, 
r3233; +} +{ +cvt.rn.f16.f64 rs506, fd484; +} +mov.b32 r3241, {rs506, rs506}; +{ +mul.f16x2 r3239, r127, r3241; +} +{ +add.f16x2 r3242, r3218, r3239; +} +{ +sub.f16x2 %22, r3224, r3230; +} +{ +add.f16x2 %23, r3236, r3242; +} +{ +add.f16x2 %24, r3224, r3230; +} +{ +sub.f16x2 %25, r3236, r3242; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), 
"r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..d021860fd628a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp16_inv.hpp.inc @@ -0,0 +1,5266 @@ +#ifndef CUFFTDX_FFT_23_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_23_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<957, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ 
+ +asm volatile (R"({ +.reg .b16 rs<507>; +.reg .b32 r<3257>; +.reg .f64 fd<485>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %48, %90; +} +{ +add.f16x2 r4, %49, %91; +} +{ +sub.f16x2 r7, %48, %90; +} +{ +sub.f16x2 r10, %49, %91; +} +{ +add.f16x2 r13, %50, %88; +} +{ +add.f16x2 r16, %51, %89; +} +{ +sub.f16x2 r19, %50, %88; +} +{ +sub.f16x2 r22, %51, %89; +} +{ +add.f16x2 r25, %52, %86; +} +{ +add.f16x2 r28, %53, %87; +} +{ +sub.f16x2 r31, %52, %86; +} +{ +sub.f16x2 r34, %53, %87; +} +{ +add.f16x2 r37, %54, %84; +} +{ +add.f16x2 r40, %55, %85; +} +{ +sub.f16x2 r43, %54, %84; +} +{ +sub.f16x2 r46, %55, %85; +} +{ +add.f16x2 r49, %56, %82; +} +{ +add.f16x2 r52, %57, %83; +} +{ +sub.f16x2 r55, %56, %82; +} +{ +sub.f16x2 r58, %57, %83; +} +{ +add.f16x2 r61, %58, %80; +} +{ +add.f16x2 r64, %59, %81; +} +{ +sub.f16x2 r67, %58, %80; +} +{ +sub.f16x2 r70, %59, %81; +} +{ +add.f16x2 r73, %60, %78; +} +{ +add.f16x2 r76, %61, %79; +} +{ +sub.f16x2 r79, %60, %78; +} +{ +sub.f16x2 r82, %61, %79; +} +{ +add.f16x2 r85, %62, %76; +} +{ +add.f16x2 r88, %63, %77; +} +{ +sub.f16x2 r91, %62, %76; +} +{ +sub.f16x2 r94, %63, %77; +} +{ +add.f16x2 r97, %64, %74; +} +{ +add.f16x2 r100, %65, %75; +} +{ +sub.f16x2 r103, %64, %74; +} +{ +sub.f16x2 r106, %65, %75; +} +{ +add.f16x2 r109, %66, %72; +} +{ +add.f16x2 r112, %67, %73; +} +{ +sub.f16x2 r115, %66, %72; +} +{ +sub.f16x2 r118, %67, %73; +} +{ +add.f16x2 r121, %68, %70; +} +{ +add.f16x2 r124, %69, %71; +} +{ +sub.f16x2 r127, %68, %70; +} +{ +sub.f16x2 r130, %69, %71; +} +{ +add.f16x2 r133, %46, r1; +} +{ +add.f16x2 r136, %47, r4; +} +{ +add.f16x2 r139, r133, r13; +} +{ +add.f16x2 r142, r136, r16; +} +{ +add.f16x2 r145, r139, r25; +} +{ +add.f16x2 r148, r142, r28; +} +{ +add.f16x2 r151, r145, r37; +} +{ +add.f16x2 r154, r148, r40; +} +{ +add.f16x2 r157, r151, r49; +} +{ +add.f16x2 r160, r154, r52; +} +{ +add.f16x2 r163, r157, r61; +} +{ +add.f16x2 r166, r160, r64; +} +{ +add.f16x2 r169, r163, r73; +} +{ +add.f16x2 r172, r166, r76; +} +{ 
+add.f16x2 r175, r169, r85; +} +{ +add.f16x2 r178, r172, r88; +} +{ +add.f16x2 r181, r175, r97; +} +{ +add.f16x2 r184, r178, r100; +} +{ +add.f16x2 r187, r181, r109; +} +{ +add.f16x2 r190, r184, r112; +} +{ +add.f16x2 %0, r187, r121; +} +{ +add.f16x2 %1, r190, r124; +} +mov.u32 r2980, 0; +cvt.rn.f16.s32 rs1, r2980; +mov.b32 r211, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r2980; +mov.b32 r223, {rs2, rs2}; +mov.f64 fd447, 0d3FEED037EA3D2DBB; +{ +cvt.rn.f16.f64 rs3, fd447; +} +mov.b32 r203, {rs3, rs3}; +{ +mul.f16x2 r201, r1, r203; +} +{ +add.f16x2 r204, %46, r201; +} +mov.f64 fd424, 0d3FD14459AD2BE466; +{ +cvt.rn.f16.f64 rs4, fd424; +} +mov.b32 r209, {rs4, rs4}; +{ +mul.f16x2 r207, r10, r209; +} +{ +add.f16x2 r210, r211, r207; +} +{ +cvt.rn.f16.f64 rs5, fd447; +} +mov.b32 r215, {rs5, rs5}; +{ +mul.f16x2 r213, r4, r215; +} +{ +add.f16x2 r216, %47, r213; +} +{ +cvt.rn.f16.f64 rs6, fd424; +} +mov.b32 r221, {rs6, rs6}; +{ +mul.f16x2 r219, r7, r221; +} +{ +add.f16x2 r222, r223, r219; +} +mov.f64 fd455, 0d3FEB57675CF309EE; +{ +cvt.rn.f16.f64 rs7, fd455; +} +mov.b32 r227, {rs7, rs7}; +{ +mul.f16x2 r225, r13, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd332, 0d3FE0A06E851DB7CA; +{ +cvt.rn.f16.f64 rs8, fd332; +} +mov.b32 r233, {rs8, rs8}; +{ +mul.f16x2 r231, r22, r233; +} +{ +add.f16x2 r234, r210, r231; +} +{ +cvt.rn.f16.f64 rs9, fd455; +} +mov.b32 r239, {rs9, rs9}; +{ +mul.f16x2 r237, r16, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs10, fd332; +} +mov.b32 r245, {rs10, rs10}; +{ +mul.f16x2 r243, r19, r245; +} +{ +add.f16x2 r246, r222, r243; +} +mov.f64 fd463, 0d3FE5D779B07CFEF7; +{ +cvt.rn.f16.f64 rs11, fd463; +} +mov.b32 r251, {rs11, rs11}; +{ +mul.f16x2 r249, r25, r251; +} +{ +add.f16x2 r252, r228, r249; +} +mov.f64 fd384, 0d3FE763021AAA15DA; +{ +cvt.rn.f16.f64 rs12, fd384; +} +mov.b32 r257, {rs12, rs12}; +{ +mul.f16x2 r255, r34, r257; +} +{ +add.f16x2 r258, r234, r255; +} +{ +cvt.rn.f16.f64 rs13, fd463; +} +mov.b32 r263, {rs13, rs13}; +{ +mul.f16x2 
r261, r28, r263; +} +{ +add.f16x2 r264, r240, r261; +} +{ +cvt.rn.f16.f64 rs14, fd384; +} +mov.b32 r269, {rs14, rs14}; +{ +mul.f16x2 r267, r31, r269; +} +{ +add.f16x2 r270, r246, r267; +} +mov.f64 fd471, 0d3FDD71B4A0C5A6C8; +{ +cvt.rn.f16.f64 rs15, fd471; +} +mov.b32 r275, {rs15, rs15}; +{ +mul.f16x2 r273, r37, r275; +} +{ +add.f16x2 r276, r252, r273; +} +mov.f64 fd416, 0d3FEC698E42F47B09; +{ +cvt.rn.f16.f64 rs16, fd416; +} +mov.b32 r281, {rs16, rs16}; +{ +mul.f16x2 r279, r46, r281; +} +{ +add.f16x2 r282, r258, r279; +} +{ +cvt.rn.f16.f64 rs17, fd471; +} +mov.b32 r287, {rs17, rs17}; +{ +mul.f16x2 r285, r40, r287; +} +{ +add.f16x2 r288, r264, r285; +} +{ +cvt.rn.f16.f64 rs18, fd416; +} +mov.b32 r293, {rs18, rs18}; +{ +mul.f16x2 r291, r43, r293; +} +{ +add.f16x2 r294, r270, r291; +} +mov.f64 fd479, 0d3FCA0AD8BD1E2882; +{ +cvt.rn.f16.f64 rs19, fd479; +} +mov.b32 r299, {rs19, rs19}; +{ +mul.f16x2 r297, r49, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd280, 0d3FEF54A827142577; +{ +cvt.rn.f16.f64 rs20, fd280; +} +mov.b32 r305, {rs20, rs20}; +{ +mul.f16x2 r303, r58, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs21, fd479; +} +mov.b32 r311, {rs21, rs21}; +{ +mul.f16x2 r309, r52, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs22, fd280; +} +mov.b32 r317, {rs22, rs22}; +{ +mul.f16x2 r315, r55, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd483, 0dBFB17855B599F3B9; +{ +cvt.rn.f16.f64 rs23, fd483; +} +mov.b32 r323, {rs23, rs23}; +{ +mul.f16x2 r321, r61, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd484, 0d3FEFECE70DFD3EFB; +{ +cvt.rn.f16.f64 rs24, fd484; +} +mov.b32 r329, {rs24, rs24}; +{ +mul.f16x2 r327, r70, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs25, fd483; +} +mov.b32 r335, {rs25, rs25}; +{ +mul.f16x2 r333, r64, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs26, fd484; +} +mov.b32 r341, {rs26, rs26}; +{ +mul.f16x2 r339, r67, r341; +} +{ +add.f16x2 r342, r318, r339; +} 
+mov.f64 fd475, 0dBFD56EAAE597C776; +{ +cvt.rn.f16.f64 rs27, fd475; +} +mov.b32 r347, {rs27, rs27}; +{ +mul.f16x2 r345, r73, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd476, 0d3FEE270060999288; +{ +cvt.rn.f16.f64 rs28, fd476; +} +mov.b32 r353, {rs28, rs28}; +{ +mul.f16x2 r351, r82, r353; +} +{ +add.f16x2 r354, r330, r351; +} +{ +cvt.rn.f16.f64 rs29, fd475; +} +mov.b32 r359, {rs29, rs29}; +{ +mul.f16x2 r357, r76, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs30, fd476; +} +mov.b32 r365, {rs30, rs30}; +{ +mul.f16x2 r363, r79, r365; +} +{ +add.f16x2 r366, r342, r363; +} +mov.f64 fd467, 0dBFE2742A4A775CFB; +{ +cvt.rn.f16.f64 rs31, fd467; +} +mov.b32 r371, {rs31, rs31}; +{ +mul.f16x2 r369, r85, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd468, 0d3FEA249E0B897CA9; +{ +cvt.rn.f16.f64 rs32, fd468; +} +mov.b32 r377, {rs32, rs32}; +{ +mul.f16x2 r375, r94, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +cvt.rn.f16.f64 rs33, fd467; +} +mov.b32 r383, {rs33, rs33}; +{ +mul.f16x2 r381, r88, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs34, fd468; +} +mov.b32 r389, {rs34, rs34}; +{ +mul.f16x2 r387, r91, r389; +} +{ +add.f16x2 r390, r366, r387; +} +mov.f64 fd459, 0dBFE8D2A07C16D46F; +{ +cvt.rn.f16.f64 rs35, fd459; +} +mov.b32 r395, {rs35, rs35}; +{ +mul.f16x2 r393, r97, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd460, 0d3FE431DF5838F7EF; +{ +cvt.rn.f16.f64 rs36, fd460; +} +mov.b32 r401, {rs36, rs36}; +{ +mul.f16x2 r399, r106, r401; +} +{ +add.f16x2 r402, r378, r399; +} +{ +cvt.rn.f16.f64 rs37, fd459; +} +mov.b32 r407, {rs37, rs37}; +{ +mul.f16x2 r405, r100, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs38, fd460; +} +mov.b32 r413, {rs38, rs38}; +{ +mul.f16x2 r411, r103, r413; +} +{ +add.f16x2 r414, r390, r411; +} +mov.f64 fd451, 0dBFED59CB83EF99BC; +{ +cvt.rn.f16.f64 rs39, fd451; +} +mov.b32 r419, {rs39, rs39}; +{ +mul.f16x2 r417, r109, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 
fd452, 0d3FD97F6748E524B2; +{ +cvt.rn.f16.f64 rs40, fd452; +} +mov.b32 r425, {rs40, rs40}; +{ +mul.f16x2 r423, r118, r425; +} +{ +add.f16x2 r426, r402, r423; +} +{ +cvt.rn.f16.f64 rs41, fd451; +} +mov.b32 r431, {rs41, rs41}; +{ +mul.f16x2 r429, r112, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs42, fd452; +} +mov.b32 r437, {rs42, rs42}; +{ +mul.f16x2 r435, r115, r437; +} +{ +add.f16x2 r438, r414, r435; +} +mov.f64 fd443, 0dBFEFB3B3035AA6CD; +{ +cvt.rn.f16.f64 rs43, fd443; +} +mov.b32 r443, {rs43, rs43}; +{ +mul.f16x2 r441, r121, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd444, 0d3FC16DE8A4564F0A; +{ +cvt.rn.f16.f64 rs44, fd444; +} +mov.b32 r449, {rs44, rs44}; +{ +mul.f16x2 r447, r130, r449; +} +{ +add.f16x2 r450, r426, r447; +} +{ +cvt.rn.f16.f64 rs45, fd443; +} +mov.b32 r455, {rs45, rs45}; +{ +mul.f16x2 r453, r124, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs46, fd444; +} +mov.b32 r461, {rs46, rs46}; +{ +mul.f16x2 r459, r127, r461; +} +{ +add.f16x2 r462, r438, r459; +} +{ +sub.f16x2 %2, r444, r450; +} +{ +add.f16x2 %3, r456, r462; +} +{ +add.f16x2 %44, r444, r450; +} +{ +sub.f16x2 %45, r456, r462; +} +cvt.rn.f16.s32 rs47, r2980; +mov.b32 r489, {rs47, rs47}; +cvt.rn.f16.s32 rs48, r2980; +mov.b32 r501, {rs48, rs48}; +{ +cvt.rn.f16.f64 rs49, fd455; +} +mov.b32 r481, {rs49, rs49}; +{ +mul.f16x2 r479, r1, r481; +} +{ +add.f16x2 r482, %46, r479; +} +{ +cvt.rn.f16.f64 rs50, fd332; +} +mov.b32 r487, {rs50, rs50}; +{ +mul.f16x2 r485, r10, r487; +} +{ +add.f16x2 r488, r489, r485; +} +{ +cvt.rn.f16.f64 rs51, fd455; +} +mov.b32 r493, {rs51, rs51}; +{ +mul.f16x2 r491, r4, r493; +} +{ +add.f16x2 r494, %47, r491; +} +{ +cvt.rn.f16.f64 rs52, fd332; +} +mov.b32 r499, {rs52, rs52}; +{ +mul.f16x2 r497, r7, r499; +} +{ +add.f16x2 r500, r501, r497; +} +{ +cvt.rn.f16.f64 rs53, fd471; +} +mov.b32 r505, {rs53, rs53}; +{ +mul.f16x2 r503, r13, r505; +} +{ +add.f16x2 r506, r482, r503; +} +{ +cvt.rn.f16.f64 rs54, fd416; +} +mov.b32 r511, 
{rs54, rs54}; +{ +mul.f16x2 r509, r22, r511; +} +{ +add.f16x2 r512, r488, r509; +} +{ +cvt.rn.f16.f64 rs55, fd471; +} +mov.b32 r517, {rs55, rs55}; +{ +mul.f16x2 r515, r16, r517; +} +{ +add.f16x2 r518, r494, r515; +} +{ +cvt.rn.f16.f64 rs56, fd416; +} +mov.b32 r523, {rs56, rs56}; +{ +mul.f16x2 r521, r19, r523; +} +{ +add.f16x2 r524, r500, r521; +} +{ +cvt.rn.f16.f64 rs57, fd483; +} +mov.b32 r529, {rs57, rs57}; +{ +mul.f16x2 r527, r25, r529; +} +{ +add.f16x2 r530, r506, r527; +} +{ +cvt.rn.f16.f64 rs58, fd484; +} +mov.b32 r535, {rs58, rs58}; +{ +mul.f16x2 r533, r34, r535; +} +{ +add.f16x2 r536, r512, r533; +} +{ +cvt.rn.f16.f64 rs59, fd483; +} +mov.b32 r541, {rs59, rs59}; +{ +mul.f16x2 r539, r28, r541; +} +{ +add.f16x2 r542, r518, r539; +} +{ +cvt.rn.f16.f64 rs60, fd484; +} +mov.b32 r547, {rs60, rs60}; +{ +mul.f16x2 r545, r31, r547; +} +{ +add.f16x2 r548, r524, r545; +} +{ +cvt.rn.f16.f64 rs61, fd467; +} +mov.b32 r553, {rs61, rs61}; +{ +mul.f16x2 r551, r37, r553; +} +{ +add.f16x2 r554, r530, r551; +} +{ +cvt.rn.f16.f64 rs62, fd468; +} +mov.b32 r559, {rs62, rs62}; +{ +mul.f16x2 r557, r46, r559; +} +{ +add.f16x2 r560, r536, r557; +} +{ +cvt.rn.f16.f64 rs63, fd467; +} +mov.b32 r565, {rs63, rs63}; +{ +mul.f16x2 r563, r40, r565; +} +{ +add.f16x2 r566, r542, r563; +} +{ +cvt.rn.f16.f64 rs64, fd468; +} +mov.b32 r571, {rs64, rs64}; +{ +mul.f16x2 r569, r43, r571; +} +{ +add.f16x2 r572, r548, r569; +} +{ +cvt.rn.f16.f64 rs65, fd451; +} +mov.b32 r577, {rs65, rs65}; +{ +mul.f16x2 r575, r49, r577; +} +{ +add.f16x2 r578, r554, r575; +} +{ +cvt.rn.f16.f64 rs66, fd452; +} +mov.b32 r583, {rs66, rs66}; +{ +mul.f16x2 r581, r58, r583; +} +{ +add.f16x2 r584, r560, r581; +} +{ +cvt.rn.f16.f64 rs67, fd451; +} +mov.b32 r589, {rs67, rs67}; +{ +mul.f16x2 r587, r52, r589; +} +{ +add.f16x2 r590, r566, r587; +} +{ +cvt.rn.f16.f64 rs68, fd452; +} +mov.b32 r595, {rs68, rs68}; +{ +mul.f16x2 r593, r55, r595; +} +{ +add.f16x2 r596, r572, r593; +} +{ +cvt.rn.f16.f64 rs69, fd443; +} +mov.b32 r601, 
{rs69, rs69}; +{ +mul.f16x2 r599, r61, r601; +} +{ +add.f16x2 r602, r578, r599; +} +mov.f64 fd388, 0dBFC16DE8A4564F0A; +{ +cvt.rn.f16.f64 rs70, fd388; +} +mov.b32 r607, {rs70, rs70}; +{ +mul.f16x2 r605, r70, r607; +} +{ +add.f16x2 r608, r584, r605; +} +{ +cvt.rn.f16.f64 rs71, fd443; +} +mov.b32 r613, {rs71, rs71}; +{ +mul.f16x2 r611, r64, r613; +} +{ +add.f16x2 r614, r590, r611; +} +{ +cvt.rn.f16.f64 rs72, fd388; +} +mov.b32 r619, {rs72, rs72}; +{ +mul.f16x2 r617, r67, r619; +} +{ +add.f16x2 r620, r596, r617; +} +{ +cvt.rn.f16.f64 rs73, fd459; +} +mov.b32 r625, {rs73, rs73}; +{ +mul.f16x2 r623, r73, r625; +} +{ +add.f16x2 r626, r602, r623; +} +mov.f64 fd420, 0dBFE431DF5838F7EF; +{ +cvt.rn.f16.f64 rs74, fd420; +} +mov.b32 r631, {rs74, rs74}; +{ +mul.f16x2 r629, r82, r631; +} +{ +add.f16x2 r632, r608, r629; +} +{ +cvt.rn.f16.f64 rs75, fd459; +} +mov.b32 r637, {rs75, rs75}; +{ +mul.f16x2 r635, r76, r637; +} +{ +add.f16x2 r638, r614, r635; +} +{ +cvt.rn.f16.f64 rs76, fd420; +} +mov.b32 r643, {rs76, rs76}; +{ +mul.f16x2 r641, r79, r643; +} +{ +add.f16x2 r644, r620, r641; +} +{ +cvt.rn.f16.f64 rs77, fd475; +} +mov.b32 r649, {rs77, rs77}; +{ +mul.f16x2 r647, r85, r649; +} +{ +add.f16x2 r650, r626, r647; +} +mov.f64 fd316, 0dBFEE270060999288; +{ +cvt.rn.f16.f64 rs78, fd316; +} +mov.b32 r655, {rs78, rs78}; +{ +mul.f16x2 r653, r94, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs79, fd475; +} +mov.b32 r661, {rs79, rs79}; +{ +mul.f16x2 r659, r88, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs80, fd316; +} +mov.b32 r667, {rs80, rs80}; +{ +mul.f16x2 r665, r91, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs81, fd479; +} +mov.b32 r673, {rs81, rs81}; +{ +mul.f16x2 r671, r97, r673; +} +{ +add.f16x2 r674, r650, r671; +} +mov.f64 fd480, 0dBFEF54A827142577; +{ +cvt.rn.f16.f64 rs82, fd480; +} +mov.b32 r679, {rs82, rs82}; +{ +mul.f16x2 r677, r106, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs83, fd479; +} 
+mov.b32 r685, {rs83, rs83}; +{ +mul.f16x2 r683, r100, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs84, fd480; +} +mov.b32 r691, {rs84, rs84}; +{ +mul.f16x2 r689, r103, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs85, fd463; +} +mov.b32 r697, {rs85, rs85}; +{ +mul.f16x2 r695, r109, r697; +} +{ +add.f16x2 r698, r674, r695; +} +mov.f64 fd464, 0dBFE763021AAA15DA; +{ +cvt.rn.f16.f64 rs86, fd464; +} +mov.b32 r703, {rs86, rs86}; +{ +mul.f16x2 r701, r118, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs87, fd463; +} +mov.b32 r709, {rs87, rs87}; +{ +mul.f16x2 r707, r112, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs88, fd464; +} +mov.b32 r715, {rs88, rs88}; +{ +mul.f16x2 r713, r115, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs89, fd447; +} +mov.b32 r721, {rs89, rs89}; +{ +mul.f16x2 r719, r121, r721; +} +{ +add.f16x2 r722, r698, r719; +} +mov.f64 fd448, 0dBFD14459AD2BE466; +{ +cvt.rn.f16.f64 rs90, fd448; +} +mov.b32 r727, {rs90, rs90}; +{ +mul.f16x2 r725, r130, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs91, fd447; +} +mov.b32 r733, {rs91, rs91}; +{ +mul.f16x2 r731, r124, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs92, fd448; +} +mov.b32 r739, {rs92, rs92}; +{ +mul.f16x2 r737, r127, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +sub.f16x2 %4, r722, r728; +} +{ +add.f16x2 %5, r734, r740; +} +{ +add.f16x2 %42, r722, r728; +} +{ +sub.f16x2 %43, r734, r740; +} +cvt.rn.f16.s32 rs93, r2980; +mov.b32 r767, {rs93, rs93}; +cvt.rn.f16.s32 rs94, r2980; +mov.b32 r779, {rs94, rs94}; +{ +cvt.rn.f16.f64 rs95, fd463; +} +mov.b32 r759, {rs95, rs95}; +{ +mul.f16x2 r757, r1, r759; +} +{ +add.f16x2 r760, %46, r757; +} +{ +cvt.rn.f16.f64 rs96, fd384; +} +mov.b32 r765, {rs96, rs96}; +{ +mul.f16x2 r763, r10, r765; +} +{ +add.f16x2 r766, r767, r763; +} +{ +cvt.rn.f16.f64 rs97, fd463; +} +mov.b32 r771, {rs97, rs97}; +{ +mul.f16x2 r769, r4, r771; +} +{ 
+add.f16x2 r772, %47, r769; +} +{ +cvt.rn.f16.f64 rs98, fd384; +} +mov.b32 r777, {rs98, rs98}; +{ +mul.f16x2 r775, r7, r777; +} +{ +add.f16x2 r778, r779, r775; +} +{ +cvt.rn.f16.f64 rs99, fd483; +} +mov.b32 r783, {rs99, rs99}; +{ +mul.f16x2 r781, r13, r783; +} +{ +add.f16x2 r784, r760, r781; +} +{ +cvt.rn.f16.f64 rs100, fd484; +} +mov.b32 r789, {rs100, rs100}; +{ +mul.f16x2 r787, r22, r789; +} +{ +add.f16x2 r790, r766, r787; +} +{ +cvt.rn.f16.f64 rs101, fd483; +} +mov.b32 r795, {rs101, rs101}; +{ +mul.f16x2 r793, r16, r795; +} +{ +add.f16x2 r796, r772, r793; +} +{ +cvt.rn.f16.f64 rs102, fd484; +} +mov.b32 r801, {rs102, rs102}; +{ +mul.f16x2 r799, r19, r801; +} +{ +add.f16x2 r802, r778, r799; +} +{ +cvt.rn.f16.f64 rs103, fd459; +} +mov.b32 r807, {rs103, rs103}; +{ +mul.f16x2 r805, r25, r807; +} +{ +add.f16x2 r808, r784, r805; +} +{ +cvt.rn.f16.f64 rs104, fd460; +} +mov.b32 r813, {rs104, rs104}; +{ +mul.f16x2 r811, r34, r813; +} +{ +add.f16x2 r814, r790, r811; +} +{ +cvt.rn.f16.f64 rs105, fd459; +} +mov.b32 r819, {rs105, rs105}; +{ +mul.f16x2 r817, r28, r819; +} +{ +add.f16x2 r820, r796, r817; +} +{ +cvt.rn.f16.f64 rs106, fd460; +} +mov.b32 r825, {rs106, rs106}; +{ +mul.f16x2 r823, r31, r825; +} +{ +add.f16x2 r826, r802, r823; +} +{ +cvt.rn.f16.f64 rs107, fd443; +} +mov.b32 r831, {rs107, rs107}; +{ +mul.f16x2 r829, r37, r831; +} +{ +add.f16x2 r832, r808, r829; +} +{ +cvt.rn.f16.f64 rs108, fd388; +} +mov.b32 r837, {rs108, rs108}; +{ +mul.f16x2 r835, r46, r837; +} +{ +add.f16x2 r838, r814, r835; +} +{ +cvt.rn.f16.f64 rs109, fd443; +} +mov.b32 r843, {rs109, rs109}; +{ +mul.f16x2 r841, r40, r843; +} +{ +add.f16x2 r844, r820, r841; +} +{ +cvt.rn.f16.f64 rs110, fd388; +} +mov.b32 r849, {rs110, rs110}; +{ +mul.f16x2 r847, r43, r849; +} +{ +add.f16x2 r850, r826, r847; +} +{ +cvt.rn.f16.f64 rs111, fd467; +} +mov.b32 r855, {rs111, rs111}; +{ +mul.f16x2 r853, r49, r855; +} +{ +add.f16x2 r856, r832, r853; +} +mov.f64 fd188, 0dBFEA249E0B897CA9; +{ +cvt.rn.f16.f64 rs112, fd188; +} 
+mov.b32 r861, {rs112, rs112}; +{ +mul.f16x2 r859, r58, r861; +} +{ +add.f16x2 r862, r838, r859; +} +{ +cvt.rn.f16.f64 rs113, fd467; +} +mov.b32 r867, {rs113, rs113}; +{ +mul.f16x2 r865, r52, r867; +} +{ +add.f16x2 r868, r844, r865; +} +{ +cvt.rn.f16.f64 rs114, fd188; +} +mov.b32 r873, {rs114, rs114}; +{ +mul.f16x2 r871, r55, r873; +} +{ +add.f16x2 r874, r850, r871; +} +{ +cvt.rn.f16.f64 rs115, fd479; +} +mov.b32 r879, {rs115, rs115}; +{ +mul.f16x2 r877, r61, r879; +} +{ +add.f16x2 r880, r856, r877; +} +{ +cvt.rn.f16.f64 rs116, fd480; +} +mov.b32 r885, {rs116, rs116}; +{ +mul.f16x2 r883, r70, r885; +} +{ +add.f16x2 r886, r862, r883; +} +{ +cvt.rn.f16.f64 rs117, fd479; +} +mov.b32 r891, {rs117, rs117}; +{ +mul.f16x2 r889, r64, r891; +} +{ +add.f16x2 r892, r868, r889; +} +{ +cvt.rn.f16.f64 rs118, fd480; +} +mov.b32 r897, {rs118, rs118}; +{ +mul.f16x2 r895, r67, r897; +} +{ +add.f16x2 r898, r874, r895; +} +{ +cvt.rn.f16.f64 rs119, fd455; +} +mov.b32 r903, {rs119, rs119}; +{ +mul.f16x2 r901, r73, r903; +} +{ +add.f16x2 r904, r880, r901; +} +mov.f64 fd456, 0dBFE0A06E851DB7CA; +{ +cvt.rn.f16.f64 rs120, fd456; +} +mov.b32 r909, {rs120, rs120}; +{ +mul.f16x2 r907, r82, r909; +} +{ +add.f16x2 r910, r886, r907; +} +{ +cvt.rn.f16.f64 rs121, fd455; +} +mov.b32 r915, {rs121, rs121}; +{ +mul.f16x2 r913, r76, r915; +} +{ +add.f16x2 r916, r892, r913; +} +{ +cvt.rn.f16.f64 rs122, fd456; +} +mov.b32 r921, {rs122, rs122}; +{ +mul.f16x2 r919, r79, r921; +} +{ +add.f16x2 r922, r898, r919; +} +{ +cvt.rn.f16.f64 rs123, fd447; +} +mov.b32 r927, {rs123, rs123}; +{ +mul.f16x2 r925, r85, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs124, fd424; +} +mov.b32 r933, {rs124, rs124}; +{ +mul.f16x2 r931, r94, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs125, fd447; +} +mov.b32 r939, {rs125, rs125}; +{ +mul.f16x2 r937, r88, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs126, fd424; +} +mov.b32 r945, {rs126, rs126}; +{ +mul.f16x2 r943, r91, 
r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs127, fd471; +} +mov.b32 r951, {rs127, rs127}; +{ +mul.f16x2 r949, r97, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs128, fd416; +} +mov.b32 r957, {rs128, rs128}; +{ +mul.f16x2 r955, r106, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs129, fd471; +} +mov.b32 r963, {rs129, rs129}; +{ +mul.f16x2 r961, r100, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs130, fd416; +} +mov.b32 r969, {rs130, rs130}; +{ +mul.f16x2 r967, r103, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs131, fd475; +} +mov.b32 r975, {rs131, rs131}; +{ +mul.f16x2 r973, r109, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs132, fd476; +} +mov.b32 r981, {rs132, rs132}; +{ +mul.f16x2 r979, r118, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs133, fd475; +} +mov.b32 r987, {rs133, rs133}; +{ +mul.f16x2 r985, r112, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs134, fd476; +} +mov.b32 r993, {rs134, rs134}; +{ +mul.f16x2 r991, r115, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs135, fd451; +} +mov.b32 r999, {rs135, rs135}; +{ +mul.f16x2 r997, r121, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs136, fd452; +} +mov.b32 r1005, {rs136, rs136}; +{ +mul.f16x2 r1003, r130, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs137, fd451; +} +mov.b32 r1011, {rs137, rs137}; +{ +mul.f16x2 r1009, r124, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs138, fd452; +} +mov.b32 r1017, {rs138, rs138}; +{ +mul.f16x2 r1015, r127, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +sub.f16x2 %6, r1000, r1006; +} +{ +add.f16x2 %7, r1012, r1018; +} +{ +add.f16x2 %40, r1000, r1006; +} +{ +sub.f16x2 %41, r1012, r1018; +} +cvt.rn.f16.s32 rs139, r2980; +mov.b32 r1045, {rs139, rs139}; +cvt.rn.f16.s32 rs140, r2980; +mov.b32 r1057, {rs140, rs140}; +{ +cvt.rn.f16.f64 rs141, 
fd471; +} +mov.b32 r1037, {rs141, rs141}; +{ +mul.f16x2 r1035, r1, r1037; +} +{ +add.f16x2 r1038, %46, r1035; +} +{ +cvt.rn.f16.f64 rs142, fd416; +} +mov.b32 r1043, {rs142, rs142}; +{ +mul.f16x2 r1041, r10, r1043; +} +{ +add.f16x2 r1044, r1045, r1041; +} +{ +cvt.rn.f16.f64 rs143, fd471; +} +mov.b32 r1049, {rs143, rs143}; +{ +mul.f16x2 r1047, r4, r1049; +} +{ +add.f16x2 r1050, %47, r1047; +} +{ +cvt.rn.f16.f64 rs144, fd416; +} +mov.b32 r1055, {rs144, rs144}; +{ +mul.f16x2 r1053, r7, r1055; +} +{ +add.f16x2 r1056, r1057, r1053; +} +{ +cvt.rn.f16.f64 rs145, fd467; +} +mov.b32 r1061, {rs145, rs145}; +{ +mul.f16x2 r1059, r13, r1061; +} +{ +add.f16x2 r1062, r1038, r1059; +} +{ +cvt.rn.f16.f64 rs146, fd468; +} +mov.b32 r1067, {rs146, rs146}; +{ +mul.f16x2 r1065, r22, r1067; +} +{ +add.f16x2 r1068, r1044, r1065; +} +{ +cvt.rn.f16.f64 rs147, fd467; +} +mov.b32 r1073, {rs147, rs147}; +{ +mul.f16x2 r1071, r16, r1073; +} +{ +add.f16x2 r1074, r1050, r1071; +} +{ +cvt.rn.f16.f64 rs148, fd468; +} +mov.b32 r1079, {rs148, rs148}; +{ +mul.f16x2 r1077, r19, r1079; +} +{ +add.f16x2 r1080, r1056, r1077; +} +{ +cvt.rn.f16.f64 rs149, fd443; +} +mov.b32 r1085, {rs149, rs149}; +{ +mul.f16x2 r1083, r25, r1085; +} +{ +add.f16x2 r1086, r1062, r1083; +} +{ +cvt.rn.f16.f64 rs150, fd388; +} +mov.b32 r1091, {rs150, rs150}; +{ +mul.f16x2 r1089, r34, r1091; +} +{ +add.f16x2 r1092, r1068, r1089; +} +{ +cvt.rn.f16.f64 rs151, fd443; +} +mov.b32 r1097, {rs151, rs151}; +{ +mul.f16x2 r1095, r28, r1097; +} +{ +add.f16x2 r1098, r1074, r1095; +} +{ +cvt.rn.f16.f64 rs152, fd388; +} +mov.b32 r1103, {rs152, rs152}; +{ +mul.f16x2 r1101, r31, r1103; +} +{ +add.f16x2 r1104, r1080, r1101; +} +{ +cvt.rn.f16.f64 rs153, fd475; +} +mov.b32 r1109, {rs153, rs153}; +{ +mul.f16x2 r1107, r37, r1109; +} +{ +add.f16x2 r1110, r1086, r1107; +} +{ +cvt.rn.f16.f64 rs154, fd316; +} +mov.b32 r1115, {rs154, rs154}; +{ +mul.f16x2 r1113, r46, r1115; +} +{ +add.f16x2 r1116, r1092, r1113; +} +{ +cvt.rn.f16.f64 rs155, fd475; +} +mov.b32 
r1121, {rs155, rs155}; +{ +mul.f16x2 r1119, r40, r1121; +} +{ +add.f16x2 r1122, r1098, r1119; +} +{ +cvt.rn.f16.f64 rs156, fd316; +} +mov.b32 r1127, {rs156, rs156}; +{ +mul.f16x2 r1125, r43, r1127; +} +{ +add.f16x2 r1128, r1104, r1125; +} +{ +cvt.rn.f16.f64 rs157, fd463; +} +mov.b32 r1133, {rs157, rs157}; +{ +mul.f16x2 r1131, r49, r1133; +} +{ +add.f16x2 r1134, r1110, r1131; +} +{ +cvt.rn.f16.f64 rs158, fd464; +} +mov.b32 r1139, {rs158, rs158}; +{ +mul.f16x2 r1137, r58, r1139; +} +{ +add.f16x2 r1140, r1116, r1137; +} +{ +cvt.rn.f16.f64 rs159, fd463; +} +mov.b32 r1145, {rs159, rs159}; +{ +mul.f16x2 r1143, r52, r1145; +} +{ +add.f16x2 r1146, r1122, r1143; +} +{ +cvt.rn.f16.f64 rs160, fd464; +} +mov.b32 r1151, {rs160, rs160}; +{ +mul.f16x2 r1149, r55, r1151; +} +{ +add.f16x2 r1152, r1128, r1149; +} +{ +cvt.rn.f16.f64 rs161, fd447; +} +mov.b32 r1157, {rs161, rs161}; +{ +mul.f16x2 r1155, r61, r1157; +} +{ +add.f16x2 r1158, r1134, r1155; +} +{ +cvt.rn.f16.f64 rs162, fd424; +} +mov.b32 r1163, {rs162, rs162}; +{ +mul.f16x2 r1161, r70, r1163; +} +{ +add.f16x2 r1164, r1140, r1161; +} +{ +cvt.rn.f16.f64 rs163, fd447; +} +mov.b32 r1169, {rs163, rs163}; +{ +mul.f16x2 r1167, r64, r1169; +} +{ +add.f16x2 r1170, r1146, r1167; +} +{ +cvt.rn.f16.f64 rs164, fd424; +} +mov.b32 r1175, {rs164, rs164}; +{ +mul.f16x2 r1173, r67, r1175; +} +{ +add.f16x2 r1176, r1152, r1173; +} +{ +cvt.rn.f16.f64 rs165, fd479; +} +mov.b32 r1181, {rs165, rs165}; +{ +mul.f16x2 r1179, r73, r1181; +} +{ +add.f16x2 r1182, r1158, r1179; +} +{ +cvt.rn.f16.f64 rs166, fd280; +} +mov.b32 r1187, {rs166, rs166}; +{ +mul.f16x2 r1185, r82, r1187; +} +{ +add.f16x2 r1188, r1164, r1185; +} +{ +cvt.rn.f16.f64 rs167, fd479; +} +mov.b32 r1193, {rs167, rs167}; +{ +mul.f16x2 r1191, r76, r1193; +} +{ +add.f16x2 r1194, r1170, r1191; +} +{ +cvt.rn.f16.f64 rs168, fd280; +} +mov.b32 r1199, {rs168, rs168}; +{ +mul.f16x2 r1197, r79, r1199; +} +{ +add.f16x2 r1200, r1176, r1197; +} +{ +cvt.rn.f16.f64 rs169, fd459; +} +mov.b32 r1205, 
{rs169, rs169}; +{ +mul.f16x2 r1203, r85, r1205; +} +{ +add.f16x2 r1206, r1182, r1203; +} +{ +cvt.rn.f16.f64 rs170, fd460; +} +mov.b32 r1211, {rs170, rs170}; +{ +mul.f16x2 r1209, r94, r1211; +} +{ +add.f16x2 r1212, r1188, r1209; +} +{ +cvt.rn.f16.f64 rs171, fd459; +} +mov.b32 r1217, {rs171, rs171}; +{ +mul.f16x2 r1215, r88, r1217; +} +{ +add.f16x2 r1218, r1194, r1215; +} +{ +cvt.rn.f16.f64 rs172, fd460; +} +mov.b32 r1223, {rs172, rs172}; +{ +mul.f16x2 r1221, r91, r1223; +} +{ +add.f16x2 r1224, r1200, r1221; +} +{ +cvt.rn.f16.f64 rs173, fd451; +} +mov.b32 r1229, {rs173, rs173}; +{ +mul.f16x2 r1227, r97, r1229; +} +{ +add.f16x2 r1230, r1206, r1227; +} +mov.f64 fd368, 0dBFD97F6748E524B2; +{ +cvt.rn.f16.f64 rs174, fd368; +} +mov.b32 r1235, {rs174, rs174}; +{ +mul.f16x2 r1233, r106, r1235; +} +{ +add.f16x2 r1236, r1212, r1233; +} +{ +cvt.rn.f16.f64 rs175, fd451; +} +mov.b32 r1241, {rs175, rs175}; +{ +mul.f16x2 r1239, r100, r1241; +} +{ +add.f16x2 r1242, r1218, r1239; +} +{ +cvt.rn.f16.f64 rs176, fd368; +} +mov.b32 r1247, {rs176, rs176}; +{ +mul.f16x2 r1245, r103, r1247; +} +{ +add.f16x2 r1248, r1224, r1245; +} +{ +cvt.rn.f16.f64 rs177, fd483; +} +mov.b32 r1253, {rs177, rs177}; +{ +mul.f16x2 r1251, r109, r1253; +} +{ +add.f16x2 r1254, r1230, r1251; +} +mov.f64 fd412, 0dBFEFECE70DFD3EFB; +{ +cvt.rn.f16.f64 rs178, fd412; +} +mov.b32 r1259, {rs178, rs178}; +{ +mul.f16x2 r1257, r118, r1259; +} +{ +add.f16x2 r1260, r1236, r1257; +} +{ +cvt.rn.f16.f64 rs179, fd483; +} +mov.b32 r1265, {rs179, rs179}; +{ +mul.f16x2 r1263, r112, r1265; +} +{ +add.f16x2 r1266, r1242, r1263; +} +{ +cvt.rn.f16.f64 rs180, fd412; +} +mov.b32 r1271, {rs180, rs180}; +{ +mul.f16x2 r1269, r115, r1271; +} +{ +add.f16x2 r1272, r1248, r1269; +} +{ +cvt.rn.f16.f64 rs181, fd455; +} +mov.b32 r1277, {rs181, rs181}; +{ +mul.f16x2 r1275, r121, r1277; +} +{ +add.f16x2 r1278, r1254, r1275; +} +{ +cvt.rn.f16.f64 rs182, fd456; +} +mov.b32 r1283, {rs182, rs182}; +{ +mul.f16x2 r1281, r130, r1283; +} +{ +add.f16x2 r1284, 
r1260, r1281; +} +{ +cvt.rn.f16.f64 rs183, fd455; +} +mov.b32 r1289, {rs183, rs183}; +{ +mul.f16x2 r1287, r124, r1289; +} +{ +add.f16x2 r1290, r1266, r1287; +} +{ +cvt.rn.f16.f64 rs184, fd456; +} +mov.b32 r1295, {rs184, rs184}; +{ +mul.f16x2 r1293, r127, r1295; +} +{ +add.f16x2 r1296, r1272, r1293; +} +{ +sub.f16x2 %8, r1278, r1284; +} +{ +add.f16x2 %9, r1290, r1296; +} +{ +add.f16x2 %38, r1278, r1284; +} +{ +sub.f16x2 %39, r1290, r1296; +} +cvt.rn.f16.s32 rs185, r2980; +mov.b32 r1323, {rs185, rs185}; +cvt.rn.f16.s32 rs186, r2980; +mov.b32 r1335, {rs186, rs186}; +{ +cvt.rn.f16.f64 rs187, fd479; +} +mov.b32 r1315, {rs187, rs187}; +{ +mul.f16x2 r1313, r1, r1315; +} +{ +add.f16x2 r1316, %46, r1313; +} +{ +cvt.rn.f16.f64 rs188, fd280; +} +mov.b32 r1321, {rs188, rs188}; +{ +mul.f16x2 r1319, r10, r1321; +} +{ +add.f16x2 r1322, r1323, r1319; +} +{ +cvt.rn.f16.f64 rs189, fd479; +} +mov.b32 r1327, {rs189, rs189}; +{ +mul.f16x2 r1325, r4, r1327; +} +{ +add.f16x2 r1328, %47, r1325; +} +{ +cvt.rn.f16.f64 rs190, fd280; +} +mov.b32 r1333, {rs190, rs190}; +{ +mul.f16x2 r1331, r7, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +cvt.rn.f16.f64 rs191, fd451; +} +mov.b32 r1339, {rs191, rs191}; +{ +mul.f16x2 r1337, r13, r1339; +} +{ +add.f16x2 r1340, r1316, r1337; +} +{ +cvt.rn.f16.f64 rs192, fd452; +} +mov.b32 r1345, {rs192, rs192}; +{ +mul.f16x2 r1343, r22, r1345; +} +{ +add.f16x2 r1346, r1322, r1343; +} +{ +cvt.rn.f16.f64 rs193, fd451; +} +mov.b32 r1351, {rs193, rs193}; +{ +mul.f16x2 r1349, r16, r1351; +} +{ +add.f16x2 r1352, r1328, r1349; +} +{ +cvt.rn.f16.f64 rs194, fd452; +} +mov.b32 r1357, {rs194, rs194}; +{ +mul.f16x2 r1355, r19, r1357; +} +{ +add.f16x2 r1358, r1334, r1355; +} +{ +cvt.rn.f16.f64 rs195, fd467; +} +mov.b32 r1363, {rs195, rs195}; +{ +mul.f16x2 r1361, r25, r1363; +} +{ +add.f16x2 r1364, r1340, r1361; +} +{ +cvt.rn.f16.f64 rs196, fd188; +} +mov.b32 r1369, {rs196, rs196}; +{ +mul.f16x2 r1367, r34, r1369; +} +{ +add.f16x2 r1370, r1346, r1367; +} +{ 
+cvt.rn.f16.f64 rs197, fd467; +} +mov.b32 r1375, {rs197, rs197}; +{ +mul.f16x2 r1373, r28, r1375; +} +{ +add.f16x2 r1376, r1352, r1373; +} +{ +cvt.rn.f16.f64 rs198, fd188; +} +mov.b32 r1381, {rs198, rs198}; +{ +mul.f16x2 r1379, r31, r1381; +} +{ +add.f16x2 r1382, r1358, r1379; +} +{ +cvt.rn.f16.f64 rs199, fd463; +} +mov.b32 r1387, {rs199, rs199}; +{ +mul.f16x2 r1385, r37, r1387; +} +{ +add.f16x2 r1388, r1364, r1385; +} +{ +cvt.rn.f16.f64 rs200, fd464; +} +mov.b32 r1393, {rs200, rs200}; +{ +mul.f16x2 r1391, r46, r1393; +} +{ +add.f16x2 r1394, r1370, r1391; +} +{ +cvt.rn.f16.f64 rs201, fd463; +} +mov.b32 r1399, {rs201, rs201}; +{ +mul.f16x2 r1397, r40, r1399; +} +{ +add.f16x2 r1400, r1376, r1397; +} +{ +cvt.rn.f16.f64 rs202, fd464; +} +mov.b32 r1405, {rs202, rs202}; +{ +mul.f16x2 r1403, r43, r1405; +} +{ +add.f16x2 r1406, r1382, r1403; +} +{ +cvt.rn.f16.f64 rs203, fd455; +} +mov.b32 r1411, {rs203, rs203}; +{ +mul.f16x2 r1409, r49, r1411; +} +{ +add.f16x2 r1412, r1388, r1409; +} +{ +cvt.rn.f16.f64 rs204, fd332; +} +mov.b32 r1417, {rs204, rs204}; +{ +mul.f16x2 r1415, r58, r1417; +} +{ +add.f16x2 r1418, r1394, r1415; +} +{ +cvt.rn.f16.f64 rs205, fd455; +} +mov.b32 r1423, {rs205, rs205}; +{ +mul.f16x2 r1421, r52, r1423; +} +{ +add.f16x2 r1424, r1400, r1421; +} +{ +cvt.rn.f16.f64 rs206, fd332; +} +mov.b32 r1429, {rs206, rs206}; +{ +mul.f16x2 r1427, r55, r1429; +} +{ +add.f16x2 r1430, r1406, r1427; +} +{ +cvt.rn.f16.f64 rs207, fd475; +} +mov.b32 r1435, {rs207, rs207}; +{ +mul.f16x2 r1433, r61, r1435; +} +{ +add.f16x2 r1436, r1412, r1433; +} +{ +cvt.rn.f16.f64 rs208, fd476; +} +mov.b32 r1441, {rs208, rs208}; +{ +mul.f16x2 r1439, r70, r1441; +} +{ +add.f16x2 r1442, r1418, r1439; +} +{ +cvt.rn.f16.f64 rs209, fd475; +} +mov.b32 r1447, {rs209, rs209}; +{ +mul.f16x2 r1445, r64, r1447; +} +{ +add.f16x2 r1448, r1424, r1445; +} +{ +cvt.rn.f16.f64 rs210, fd476; +} +mov.b32 r1453, {rs210, rs210}; +{ +mul.f16x2 r1451, r67, r1453; +} +{ +add.f16x2 r1454, r1430, r1451; +} +{ 
+cvt.rn.f16.f64 rs211, fd443; +} +mov.b32 r1459, {rs211, rs211}; +{ +mul.f16x2 r1457, r73, r1459; +} +{ +add.f16x2 r1460, r1436, r1457; +} +{ +cvt.rn.f16.f64 rs212, fd388; +} +mov.b32 r1465, {rs212, rs212}; +{ +mul.f16x2 r1463, r82, r1465; +} +{ +add.f16x2 r1466, r1442, r1463; +} +{ +cvt.rn.f16.f64 rs213, fd443; +} +mov.b32 r1471, {rs213, rs213}; +{ +mul.f16x2 r1469, r76, r1471; +} +{ +add.f16x2 r1472, r1448, r1469; +} +{ +cvt.rn.f16.f64 rs214, fd388; +} +mov.b32 r1477, {rs214, rs214}; +{ +mul.f16x2 r1475, r79, r1477; +} +{ +add.f16x2 r1478, r1454, r1475; +} +{ +cvt.rn.f16.f64 rs215, fd483; +} +mov.b32 r1483, {rs215, rs215}; +{ +mul.f16x2 r1481, r85, r1483; +} +{ +add.f16x2 r1484, r1460, r1481; +} +{ +cvt.rn.f16.f64 rs216, fd412; +} +mov.b32 r1489, {rs216, rs216}; +{ +mul.f16x2 r1487, r94, r1489; +} +{ +add.f16x2 r1490, r1466, r1487; +} +{ +cvt.rn.f16.f64 rs217, fd483; +} +mov.b32 r1495, {rs217, rs217}; +{ +mul.f16x2 r1493, r88, r1495; +} +{ +add.f16x2 r1496, r1472, r1493; +} +{ +cvt.rn.f16.f64 rs218, fd412; +} +mov.b32 r1501, {rs218, rs218}; +{ +mul.f16x2 r1499, r91, r1501; +} +{ +add.f16x2 r1502, r1478, r1499; +} +{ +cvt.rn.f16.f64 rs219, fd447; +} +mov.b32 r1507, {rs219, rs219}; +{ +mul.f16x2 r1505, r97, r1507; +} +{ +add.f16x2 r1508, r1484, r1505; +} +{ +cvt.rn.f16.f64 rs220, fd448; +} +mov.b32 r1513, {rs220, rs220}; +{ +mul.f16x2 r1511, r106, r1513; +} +{ +add.f16x2 r1514, r1490, r1511; +} +{ +cvt.rn.f16.f64 rs221, fd447; +} +mov.b32 r1519, {rs221, rs221}; +{ +mul.f16x2 r1517, r100, r1519; +} +{ +add.f16x2 r1520, r1496, r1517; +} +{ +cvt.rn.f16.f64 rs222, fd448; +} +mov.b32 r1525, {rs222, rs222}; +{ +mul.f16x2 r1523, r103, r1525; +} +{ +add.f16x2 r1526, r1502, r1523; +} +{ +cvt.rn.f16.f64 rs223, fd471; +} +mov.b32 r1531, {rs223, rs223}; +{ +mul.f16x2 r1529, r109, r1531; +} +{ +add.f16x2 r1532, r1508, r1529; +} +{ +cvt.rn.f16.f64 rs224, fd416; +} +mov.b32 r1537, {rs224, rs224}; +{ +mul.f16x2 r1535, r118, r1537; +} +{ +add.f16x2 r1538, r1514, r1535; +} +{ 
+cvt.rn.f16.f64 rs225, fd471; +} +mov.b32 r1543, {rs225, rs225}; +{ +mul.f16x2 r1541, r112, r1543; +} +{ +add.f16x2 r1544, r1520, r1541; +} +{ +cvt.rn.f16.f64 rs226, fd416; +} +mov.b32 r1549, {rs226, rs226}; +{ +mul.f16x2 r1547, r115, r1549; +} +{ +add.f16x2 r1550, r1526, r1547; +} +{ +cvt.rn.f16.f64 rs227, fd459; +} +mov.b32 r1555, {rs227, rs227}; +{ +mul.f16x2 r1553, r121, r1555; +} +{ +add.f16x2 r1556, r1532, r1553; +} +{ +cvt.rn.f16.f64 rs228, fd460; +} +mov.b32 r1561, {rs228, rs228}; +{ +mul.f16x2 r1559, r130, r1561; +} +{ +add.f16x2 r1562, r1538, r1559; +} +{ +cvt.rn.f16.f64 rs229, fd459; +} +mov.b32 r1567, {rs229, rs229}; +{ +mul.f16x2 r1565, r124, r1567; +} +{ +add.f16x2 r1568, r1544, r1565; +} +{ +cvt.rn.f16.f64 rs230, fd460; +} +mov.b32 r1573, {rs230, rs230}; +{ +mul.f16x2 r1571, r127, r1573; +} +{ +add.f16x2 r1574, r1550, r1571; +} +{ +sub.f16x2 %10, r1556, r1562; +} +{ +add.f16x2 %11, r1568, r1574; +} +{ +add.f16x2 %36, r1556, r1562; +} +{ +sub.f16x2 %37, r1568, r1574; +} +cvt.rn.f16.s32 rs231, r2980; +mov.b32 r1601, {rs231, rs231}; +cvt.rn.f16.s32 rs232, r2980; +mov.b32 r1613, {rs232, rs232}; +{ +cvt.rn.f16.f64 rs233, fd483; +} +mov.b32 r1593, {rs233, rs233}; +{ +mul.f16x2 r1591, r1, r1593; +} +{ +add.f16x2 r1594, %46, r1591; +} +{ +cvt.rn.f16.f64 rs234, fd484; +} +mov.b32 r1599, {rs234, rs234}; +{ +mul.f16x2 r1597, r10, r1599; +} +{ +add.f16x2 r1600, r1601, r1597; +} +{ +cvt.rn.f16.f64 rs235, fd483; +} +mov.b32 r1605, {rs235, rs235}; +{ +mul.f16x2 r1603, r4, r1605; +} +{ +add.f16x2 r1606, %47, r1603; +} +{ +cvt.rn.f16.f64 rs236, fd484; +} +mov.b32 r1611, {rs236, rs236}; +{ +mul.f16x2 r1609, r7, r1611; +} +{ +add.f16x2 r1612, r1613, r1609; +} +{ +cvt.rn.f16.f64 rs237, fd443; +} +mov.b32 r1617, {rs237, rs237}; +{ +mul.f16x2 r1615, r13, r1617; +} +{ +add.f16x2 r1618, r1594, r1615; +} +{ +cvt.rn.f16.f64 rs238, fd388; +} +mov.b32 r1623, {rs238, rs238}; +{ +mul.f16x2 r1621, r22, r1623; +} +{ +add.f16x2 r1624, r1600, r1621; +} +{ +cvt.rn.f16.f64 rs239, 
fd443; +} +mov.b32 r1629, {rs239, rs239}; +{ +mul.f16x2 r1627, r16, r1629; +} +{ +add.f16x2 r1630, r1606, r1627; +} +{ +cvt.rn.f16.f64 rs240, fd388; +} +mov.b32 r1635, {rs240, rs240}; +{ +mul.f16x2 r1633, r19, r1635; +} +{ +add.f16x2 r1636, r1612, r1633; +} +{ +cvt.rn.f16.f64 rs241, fd479; +} +mov.b32 r1641, {rs241, rs241}; +{ +mul.f16x2 r1639, r25, r1641; +} +{ +add.f16x2 r1642, r1618, r1639; +} +{ +cvt.rn.f16.f64 rs242, fd480; +} +mov.b32 r1647, {rs242, rs242}; +{ +mul.f16x2 r1645, r34, r1647; +} +{ +add.f16x2 r1648, r1624, r1645; +} +{ +cvt.rn.f16.f64 rs243, fd479; +} +mov.b32 r1653, {rs243, rs243}; +{ +mul.f16x2 r1651, r28, r1653; +} +{ +add.f16x2 r1654, r1630, r1651; +} +{ +cvt.rn.f16.f64 rs244, fd480; +} +mov.b32 r1659, {rs244, rs244}; +{ +mul.f16x2 r1657, r31, r1659; +} +{ +add.f16x2 r1660, r1636, r1657; +} +{ +cvt.rn.f16.f64 rs245, fd447; +} +mov.b32 r1665, {rs245, rs245}; +{ +mul.f16x2 r1663, r37, r1665; +} +{ +add.f16x2 r1666, r1642, r1663; +} +{ +cvt.rn.f16.f64 rs246, fd424; +} +mov.b32 r1671, {rs246, rs246}; +{ +mul.f16x2 r1669, r46, r1671; +} +{ +add.f16x2 r1672, r1648, r1669; +} +{ +cvt.rn.f16.f64 rs247, fd447; +} +mov.b32 r1677, {rs247, rs247}; +{ +mul.f16x2 r1675, r40, r1677; +} +{ +add.f16x2 r1678, r1654, r1675; +} +{ +cvt.rn.f16.f64 rs248, fd424; +} +mov.b32 r1683, {rs248, rs248}; +{ +mul.f16x2 r1681, r43, r1683; +} +{ +add.f16x2 r1684, r1660, r1681; +} +{ +cvt.rn.f16.f64 rs249, fd475; +} +mov.b32 r1689, {rs249, rs249}; +{ +mul.f16x2 r1687, r49, r1689; +} +{ +add.f16x2 r1690, r1666, r1687; +} +{ +cvt.rn.f16.f64 rs250, fd476; +} +mov.b32 r1695, {rs250, rs250}; +{ +mul.f16x2 r1693, r58, r1695; +} +{ +add.f16x2 r1696, r1672, r1693; +} +{ +cvt.rn.f16.f64 rs251, fd475; +} +mov.b32 r1701, {rs251, rs251}; +{ +mul.f16x2 r1699, r52, r1701; +} +{ +add.f16x2 r1702, r1678, r1699; +} +{ +cvt.rn.f16.f64 rs252, fd476; +} +mov.b32 r1707, {rs252, rs252}; +{ +mul.f16x2 r1705, r55, r1707; +} +{ +add.f16x2 r1708, r1684, r1705; +} +{ +cvt.rn.f16.f64 rs253, fd451; +} 
+mov.b32 r1713, {rs253, rs253}; +{ +mul.f16x2 r1711, r61, r1713; +} +{ +add.f16x2 r1714, r1690, r1711; +} +{ +cvt.rn.f16.f64 rs254, fd368; +} +mov.b32 r1719, {rs254, rs254}; +{ +mul.f16x2 r1717, r70, r1719; +} +{ +add.f16x2 r1720, r1696, r1717; +} +{ +cvt.rn.f16.f64 rs255, fd451; +} +mov.b32 r1725, {rs255, rs255}; +{ +mul.f16x2 r1723, r64, r1725; +} +{ +add.f16x2 r1726, r1702, r1723; +} +{ +cvt.rn.f16.f64 rs256, fd368; +} +mov.b32 r1731, {rs256, rs256}; +{ +mul.f16x2 r1729, r67, r1731; +} +{ +add.f16x2 r1732, r1708, r1729; +} +{ +cvt.rn.f16.f64 rs257, fd471; +} +mov.b32 r1737, {rs257, rs257}; +{ +mul.f16x2 r1735, r73, r1737; +} +{ +add.f16x2 r1738, r1714, r1735; +} +mov.f64 fd472, 0dBFEC698E42F47B09; +{ +cvt.rn.f16.f64 rs258, fd472; +} +mov.b32 r1743, {rs258, rs258}; +{ +mul.f16x2 r1741, r82, r1743; +} +{ +add.f16x2 r1744, r1720, r1741; +} +{ +cvt.rn.f16.f64 rs259, fd471; +} +mov.b32 r1749, {rs259, rs259}; +{ +mul.f16x2 r1747, r76, r1749; +} +{ +add.f16x2 r1750, r1726, r1747; +} +{ +cvt.rn.f16.f64 rs260, fd472; +} +mov.b32 r1755, {rs260, rs260}; +{ +mul.f16x2 r1753, r79, r1755; +} +{ +add.f16x2 r1756, r1732, r1753; +} +{ +cvt.rn.f16.f64 rs261, fd455; +} +mov.b32 r1761, {rs261, rs261}; +{ +mul.f16x2 r1759, r85, r1761; +} +{ +add.f16x2 r1762, r1738, r1759; +} +{ +cvt.rn.f16.f64 rs262, fd332; +} +mov.b32 r1767, {rs262, rs262}; +{ +mul.f16x2 r1765, r94, r1767; +} +{ +add.f16x2 r1768, r1744, r1765; +} +{ +cvt.rn.f16.f64 rs263, fd455; +} +mov.b32 r1773, {rs263, rs263}; +{ +mul.f16x2 r1771, r88, r1773; +} +{ +add.f16x2 r1774, r1750, r1771; +} +{ +cvt.rn.f16.f64 rs264, fd332; +} +mov.b32 r1779, {rs264, rs264}; +{ +mul.f16x2 r1777, r91, r1779; +} +{ +add.f16x2 r1780, r1756, r1777; +} +{ +cvt.rn.f16.f64 rs265, fd467; +} +mov.b32 r1785, {rs265, rs265}; +{ +mul.f16x2 r1783, r97, r1785; +} +{ +add.f16x2 r1786, r1762, r1783; +} +{ +cvt.rn.f16.f64 rs266, fd468; +} +mov.b32 r1791, {rs266, rs266}; +{ +mul.f16x2 r1789, r106, r1791; +} +{ +add.f16x2 r1792, r1768, r1789; +} +{ 
+cvt.rn.f16.f64 rs267, fd467; +} +mov.b32 r1797, {rs267, rs267}; +{ +mul.f16x2 r1795, r100, r1797; +} +{ +add.f16x2 r1798, r1774, r1795; +} +{ +cvt.rn.f16.f64 rs268, fd468; +} +mov.b32 r1803, {rs268, rs268}; +{ +mul.f16x2 r1801, r103, r1803; +} +{ +add.f16x2 r1804, r1780, r1801; +} +{ +cvt.rn.f16.f64 rs269, fd459; +} +mov.b32 r1809, {rs269, rs269}; +{ +mul.f16x2 r1807, r109, r1809; +} +{ +add.f16x2 r1810, r1786, r1807; +} +{ +cvt.rn.f16.f64 rs270, fd420; +} +mov.b32 r1815, {rs270, rs270}; +{ +mul.f16x2 r1813, r118, r1815; +} +{ +add.f16x2 r1816, r1792, r1813; +} +{ +cvt.rn.f16.f64 rs271, fd459; +} +mov.b32 r1821, {rs271, rs271}; +{ +mul.f16x2 r1819, r112, r1821; +} +{ +add.f16x2 r1822, r1798, r1819; +} +{ +cvt.rn.f16.f64 rs272, fd420; +} +mov.b32 r1827, {rs272, rs272}; +{ +mul.f16x2 r1825, r115, r1827; +} +{ +add.f16x2 r1828, r1804, r1825; +} +{ +cvt.rn.f16.f64 rs273, fd463; +} +mov.b32 r1833, {rs273, rs273}; +{ +mul.f16x2 r1831, r121, r1833; +} +{ +add.f16x2 r1834, r1810, r1831; +} +{ +cvt.rn.f16.f64 rs274, fd464; +} +mov.b32 r1839, {rs274, rs274}; +{ +mul.f16x2 r1837, r130, r1839; +} +{ +add.f16x2 r1840, r1816, r1837; +} +{ +cvt.rn.f16.f64 rs275, fd463; +} +mov.b32 r1845, {rs275, rs275}; +{ +mul.f16x2 r1843, r124, r1845; +} +{ +add.f16x2 r1846, r1822, r1843; +} +{ +cvt.rn.f16.f64 rs276, fd464; +} +mov.b32 r1851, {rs276, rs276}; +{ +mul.f16x2 r1849, r127, r1851; +} +{ +add.f16x2 r1852, r1828, r1849; +} +{ +sub.f16x2 %12, r1834, r1840; +} +{ +add.f16x2 %13, r1846, r1852; +} +{ +add.f16x2 %34, r1834, r1840; +} +{ +sub.f16x2 %35, r1846, r1852; +} +cvt.rn.f16.s32 rs277, r2980; +mov.b32 r1879, {rs277, rs277}; +cvt.rn.f16.s32 rs278, r2980; +mov.b32 r1891, {rs278, rs278}; +{ +cvt.rn.f16.f64 rs279, fd475; +} +mov.b32 r1871, {rs279, rs279}; +{ +mul.f16x2 r1869, r1, r1871; +} +{ +add.f16x2 r1872, %46, r1869; +} +{ +cvt.rn.f16.f64 rs280, fd476; +} +mov.b32 r1877, {rs280, rs280}; +{ +mul.f16x2 r1875, r10, r1877; +} +{ +add.f16x2 r1878, r1879, r1875; +} +{ +cvt.rn.f16.f64 
rs281, fd475; +} +mov.b32 r1883, {rs281, rs281}; +{ +mul.f16x2 r1881, r4, r1883; +} +{ +add.f16x2 r1884, %47, r1881; +} +{ +cvt.rn.f16.f64 rs282, fd476; +} +mov.b32 r1889, {rs282, rs282}; +{ +mul.f16x2 r1887, r7, r1889; +} +{ +add.f16x2 r1890, r1891, r1887; +} +{ +cvt.rn.f16.f64 rs283, fd459; +} +mov.b32 r1895, {rs283, rs283}; +{ +mul.f16x2 r1893, r13, r1895; +} +{ +add.f16x2 r1896, r1872, r1893; +} +{ +cvt.rn.f16.f64 rs284, fd420; +} +mov.b32 r1901, {rs284, rs284}; +{ +mul.f16x2 r1899, r22, r1901; +} +{ +add.f16x2 r1902, r1878, r1899; +} +{ +cvt.rn.f16.f64 rs285, fd459; +} +mov.b32 r1907, {rs285, rs285}; +{ +mul.f16x2 r1905, r16, r1907; +} +{ +add.f16x2 r1908, r1884, r1905; +} +{ +cvt.rn.f16.f64 rs286, fd420; +} +mov.b32 r1913, {rs286, rs286}; +{ +mul.f16x2 r1911, r19, r1913; +} +{ +add.f16x2 r1914, r1890, r1911; +} +{ +cvt.rn.f16.f64 rs287, fd455; +} +mov.b32 r1919, {rs287, rs287}; +{ +mul.f16x2 r1917, r25, r1919; +} +{ +add.f16x2 r1920, r1896, r1917; +} +{ +cvt.rn.f16.f64 rs288, fd456; +} +mov.b32 r1925, {rs288, rs288}; +{ +mul.f16x2 r1923, r34, r1925; +} +{ +add.f16x2 r1926, r1902, r1923; +} +{ +cvt.rn.f16.f64 rs289, fd455; +} +mov.b32 r1931, {rs289, rs289}; +{ +mul.f16x2 r1929, r28, r1931; +} +{ +add.f16x2 r1932, r1908, r1929; +} +{ +cvt.rn.f16.f64 rs290, fd456; +} +mov.b32 r1937, {rs290, rs290}; +{ +mul.f16x2 r1935, r31, r1937; +} +{ +add.f16x2 r1938, r1914, r1935; +} +{ +cvt.rn.f16.f64 rs291, fd479; +} +mov.b32 r1943, {rs291, rs291}; +{ +mul.f16x2 r1941, r37, r1943; +} +{ +add.f16x2 r1944, r1920, r1941; +} +{ +cvt.rn.f16.f64 rs292, fd280; +} +mov.b32 r1949, {rs292, rs292}; +{ +mul.f16x2 r1947, r46, r1949; +} +{ +add.f16x2 r1950, r1926, r1947; +} +{ +cvt.rn.f16.f64 rs293, fd479; +} +mov.b32 r1955, {rs293, rs293}; +{ +mul.f16x2 r1953, r40, r1955; +} +{ +add.f16x2 r1956, r1932, r1953; +} +{ +cvt.rn.f16.f64 rs294, fd280; +} +mov.b32 r1961, {rs294, rs294}; +{ +mul.f16x2 r1959, r43, r1961; +} +{ +add.f16x2 r1962, r1938, r1959; +} +{ +cvt.rn.f16.f64 rs295, fd443; 
+} +mov.b32 r1967, {rs295, rs295}; +{ +mul.f16x2 r1965, r49, r1967; +} +{ +add.f16x2 r1968, r1944, r1965; +} +{ +cvt.rn.f16.f64 rs296, fd388; +} +mov.b32 r1973, {rs296, rs296}; +{ +mul.f16x2 r1971, r58, r1973; +} +{ +add.f16x2 r1974, r1950, r1971; +} +{ +cvt.rn.f16.f64 rs297, fd443; +} +mov.b32 r1979, {rs297, rs297}; +{ +mul.f16x2 r1977, r52, r1979; +} +{ +add.f16x2 r1980, r1956, r1977; +} +{ +cvt.rn.f16.f64 rs298, fd388; +} +mov.b32 r1985, {rs298, rs298}; +{ +mul.f16x2 r1983, r55, r1985; +} +{ +add.f16x2 r1986, r1962, r1983; +} +{ +cvt.rn.f16.f64 rs299, fd471; +} +mov.b32 r1991, {rs299, rs299}; +{ +mul.f16x2 r1989, r61, r1991; +} +{ +add.f16x2 r1992, r1968, r1989; +} +{ +cvt.rn.f16.f64 rs300, fd472; +} +mov.b32 r1997, {rs300, rs300}; +{ +mul.f16x2 r1995, r70, r1997; +} +{ +add.f16x2 r1998, r1974, r1995; +} +{ +cvt.rn.f16.f64 rs301, fd471; +} +mov.b32 r2003, {rs301, rs301}; +{ +mul.f16x2 r2001, r64, r2003; +} +{ +add.f16x2 r2004, r1980, r2001; +} +{ +cvt.rn.f16.f64 rs302, fd472; +} +mov.b32 r2009, {rs302, rs302}; +{ +mul.f16x2 r2007, r67, r2009; +} +{ +add.f16x2 r2010, r1986, r2007; +} +{ +cvt.rn.f16.f64 rs303, fd463; +} +mov.b32 r2015, {rs303, rs303}; +{ +mul.f16x2 r2013, r73, r2015; +} +{ +add.f16x2 r2016, r1992, r2013; +} +{ +cvt.rn.f16.f64 rs304, fd384; +} +mov.b32 r2021, {rs304, rs304}; +{ +mul.f16x2 r2019, r82, r2021; +} +{ +add.f16x2 r2022, r1998, r2019; +} +{ +cvt.rn.f16.f64 rs305, fd463; +} +mov.b32 r2027, {rs305, rs305}; +{ +mul.f16x2 r2025, r76, r2027; +} +{ +add.f16x2 r2028, r2004, r2025; +} +{ +cvt.rn.f16.f64 rs306, fd384; +} +mov.b32 r2033, {rs306, rs306}; +{ +mul.f16x2 r2031, r79, r2033; +} +{ +add.f16x2 r2034, r2010, r2031; +} +{ +cvt.rn.f16.f64 rs307, fd451; +} +mov.b32 r2039, {rs307, rs307}; +{ +mul.f16x2 r2037, r85, r2039; +} +{ +add.f16x2 r2040, r2016, r2037; +} +{ +cvt.rn.f16.f64 rs308, fd452; +} +mov.b32 r2045, {rs308, rs308}; +{ +mul.f16x2 r2043, r94, r2045; +} +{ +add.f16x2 r2046, r2022, r2043; +} +{ +cvt.rn.f16.f64 rs309, fd451; +} +mov.b32 
r2051, {rs309, rs309}; +{ +mul.f16x2 r2049, r88, r2051; +} +{ +add.f16x2 r2052, r2028, r2049; +} +{ +cvt.rn.f16.f64 rs310, fd452; +} +mov.b32 r2057, {rs310, rs310}; +{ +mul.f16x2 r2055, r91, r2057; +} +{ +add.f16x2 r2058, r2034, r2055; +} +{ +cvt.rn.f16.f64 rs311, fd483; +} +mov.b32 r2063, {rs311, rs311}; +{ +mul.f16x2 r2061, r97, r2063; +} +{ +add.f16x2 r2064, r2040, r2061; +} +{ +cvt.rn.f16.f64 rs312, fd412; +} +mov.b32 r2069, {rs312, rs312}; +{ +mul.f16x2 r2067, r106, r2069; +} +{ +add.f16x2 r2070, r2046, r2067; +} +{ +cvt.rn.f16.f64 rs313, fd483; +} +mov.b32 r2075, {rs313, rs313}; +{ +mul.f16x2 r2073, r100, r2075; +} +{ +add.f16x2 r2076, r2052, r2073; +} +{ +cvt.rn.f16.f64 rs314, fd412; +} +mov.b32 r2081, {rs314, rs314}; +{ +mul.f16x2 r2079, r103, r2081; +} +{ +add.f16x2 r2082, r2058, r2079; +} +{ +cvt.rn.f16.f64 rs315, fd447; +} +mov.b32 r2087, {rs315, rs315}; +{ +mul.f16x2 r2085, r109, r2087; +} +{ +add.f16x2 r2088, r2064, r2085; +} +{ +cvt.rn.f16.f64 rs316, fd424; +} +mov.b32 r2093, {rs316, rs316}; +{ +mul.f16x2 r2091, r118, r2093; +} +{ +add.f16x2 r2094, r2070, r2091; +} +{ +cvt.rn.f16.f64 rs317, fd447; +} +mov.b32 r2099, {rs317, rs317}; +{ +mul.f16x2 r2097, r112, r2099; +} +{ +add.f16x2 r2100, r2076, r2097; +} +{ +cvt.rn.f16.f64 rs318, fd424; +} +mov.b32 r2105, {rs318, rs318}; +{ +mul.f16x2 r2103, r115, r2105; +} +{ +add.f16x2 r2106, r2082, r2103; +} +{ +cvt.rn.f16.f64 rs319, fd467; +} +mov.b32 r2111, {rs319, rs319}; +{ +mul.f16x2 r2109, r121, r2111; +} +{ +add.f16x2 r2112, r2088, r2109; +} +{ +cvt.rn.f16.f64 rs320, fd468; +} +mov.b32 r2117, {rs320, rs320}; +{ +mul.f16x2 r2115, r130, r2117; +} +{ +add.f16x2 r2118, r2094, r2115; +} +{ +cvt.rn.f16.f64 rs321, fd467; +} +mov.b32 r2123, {rs321, rs321}; +{ +mul.f16x2 r2121, r124, r2123; +} +{ +add.f16x2 r2124, r2100, r2121; +} +{ +cvt.rn.f16.f64 rs322, fd468; +} +mov.b32 r2129, {rs322, rs322}; +{ +mul.f16x2 r2127, r127, r2129; +} +{ +add.f16x2 r2130, r2106, r2127; +} +{ +sub.f16x2 %14, r2112, r2118; +} +{ 
+add.f16x2 %15, r2124, r2130; +} +{ +add.f16x2 %32, r2112, r2118; +} +{ +sub.f16x2 %33, r2124, r2130; +} +cvt.rn.f16.s32 rs323, r2980; +mov.b32 r2157, {rs323, rs323}; +cvt.rn.f16.s32 rs324, r2980; +mov.b32 r2169, {rs324, rs324}; +{ +cvt.rn.f16.f64 rs325, fd467; +} +mov.b32 r2149, {rs325, rs325}; +{ +mul.f16x2 r2147, r1, r2149; +} +{ +add.f16x2 r2150, %46, r2147; +} +{ +cvt.rn.f16.f64 rs326, fd468; +} +mov.b32 r2155, {rs326, rs326}; +{ +mul.f16x2 r2153, r10, r2155; +} +{ +add.f16x2 r2156, r2157, r2153; +} +{ +cvt.rn.f16.f64 rs327, fd467; +} +mov.b32 r2161, {rs327, rs327}; +{ +mul.f16x2 r2159, r4, r2161; +} +{ +add.f16x2 r2162, %47, r2159; +} +{ +cvt.rn.f16.f64 rs328, fd468; +} +mov.b32 r2167, {rs328, rs328}; +{ +mul.f16x2 r2165, r7, r2167; +} +{ +add.f16x2 r2168, r2169, r2165; +} +{ +cvt.rn.f16.f64 rs329, fd475; +} +mov.b32 r2173, {rs329, rs329}; +{ +mul.f16x2 r2171, r13, r2173; +} +{ +add.f16x2 r2174, r2150, r2171; +} +{ +cvt.rn.f16.f64 rs330, fd316; +} +mov.b32 r2179, {rs330, rs330}; +{ +mul.f16x2 r2177, r22, r2179; +} +{ +add.f16x2 r2180, r2156, r2177; +} +{ +cvt.rn.f16.f64 rs331, fd475; +} +mov.b32 r2185, {rs331, rs331}; +{ +mul.f16x2 r2183, r16, r2185; +} +{ +add.f16x2 r2186, r2162, r2183; +} +{ +cvt.rn.f16.f64 rs332, fd316; +} +mov.b32 r2191, {rs332, rs332}; +{ +mul.f16x2 r2189, r19, r2191; +} +{ +add.f16x2 r2192, r2168, r2189; +} +{ +cvt.rn.f16.f64 rs333, fd447; +} +mov.b32 r2197, {rs333, rs333}; +{ +mul.f16x2 r2195, r25, r2197; +} +{ +add.f16x2 r2198, r2174, r2195; +} +{ +cvt.rn.f16.f64 rs334, fd424; +} +mov.b32 r2203, {rs334, rs334}; +{ +mul.f16x2 r2201, r34, r2203; +} +{ +add.f16x2 r2204, r2180, r2201; +} +{ +cvt.rn.f16.f64 rs335, fd447; +} +mov.b32 r2209, {rs335, rs335}; +{ +mul.f16x2 r2207, r28, r2209; +} +{ +add.f16x2 r2210, r2186, r2207; +} +{ +cvt.rn.f16.f64 rs336, fd424; +} +mov.b32 r2215, {rs336, rs336}; +{ +mul.f16x2 r2213, r31, r2215; +} +{ +add.f16x2 r2216, r2192, r2213; +} +{ +cvt.rn.f16.f64 rs337, fd459; +} +mov.b32 r2221, {rs337, rs337}; +{ 
+mul.f16x2 r2219, r37, r2221; +} +{ +add.f16x2 r2222, r2198, r2219; +} +{ +cvt.rn.f16.f64 rs338, fd460; +} +mov.b32 r2227, {rs338, rs338}; +{ +mul.f16x2 r2225, r46, r2227; +} +{ +add.f16x2 r2228, r2204, r2225; +} +{ +cvt.rn.f16.f64 rs339, fd459; +} +mov.b32 r2233, {rs339, rs339}; +{ +mul.f16x2 r2231, r40, r2233; +} +{ +add.f16x2 r2234, r2210, r2231; +} +{ +cvt.rn.f16.f64 rs340, fd460; +} +mov.b32 r2239, {rs340, rs340}; +{ +mul.f16x2 r2237, r43, r2239; +} +{ +add.f16x2 r2240, r2216, r2237; +} +{ +cvt.rn.f16.f64 rs341, fd483; +} +mov.b32 r2245, {rs341, rs341}; +{ +mul.f16x2 r2243, r49, r2245; +} +{ +add.f16x2 r2246, r2222, r2243; +} +{ +cvt.rn.f16.f64 rs342, fd412; +} +mov.b32 r2251, {rs342, rs342}; +{ +mul.f16x2 r2249, r58, r2251; +} +{ +add.f16x2 r2252, r2228, r2249; +} +{ +cvt.rn.f16.f64 rs343, fd483; +} +mov.b32 r2257, {rs343, rs343}; +{ +mul.f16x2 r2255, r52, r2257; +} +{ +add.f16x2 r2258, r2234, r2255; +} +{ +cvt.rn.f16.f64 rs344, fd412; +} +mov.b32 r2263, {rs344, rs344}; +{ +mul.f16x2 r2261, r55, r2263; +} +{ +add.f16x2 r2264, r2240, r2261; +} +{ +cvt.rn.f16.f64 rs345, fd455; +} +mov.b32 r2269, {rs345, rs345}; +{ +mul.f16x2 r2267, r61, r2269; +} +{ +add.f16x2 r2270, r2246, r2267; +} +{ +cvt.rn.f16.f64 rs346, fd332; +} +mov.b32 r2275, {rs346, rs346}; +{ +mul.f16x2 r2273, r70, r2275; +} +{ +add.f16x2 r2276, r2252, r2273; +} +{ +cvt.rn.f16.f64 rs347, fd455; +} +mov.b32 r2281, {rs347, rs347}; +{ +mul.f16x2 r2279, r64, r2281; +} +{ +add.f16x2 r2282, r2258, r2279; +} +{ +cvt.rn.f16.f64 rs348, fd332; +} +mov.b32 r2287, {rs348, rs348}; +{ +mul.f16x2 r2285, r67, r2287; +} +{ +add.f16x2 r2288, r2264, r2285; +} +{ +cvt.rn.f16.f64 rs349, fd451; +} +mov.b32 r2293, {rs349, rs349}; +{ +mul.f16x2 r2291, r73, r2293; +} +{ +add.f16x2 r2294, r2270, r2291; +} +{ +cvt.rn.f16.f64 rs350, fd452; +} +mov.b32 r2299, {rs350, rs350}; +{ +mul.f16x2 r2297, r82, r2299; +} +{ +add.f16x2 r2300, r2276, r2297; +} +{ +cvt.rn.f16.f64 rs351, fd451; +} +mov.b32 r2305, {rs351, rs351}; +{ +mul.f16x2 
r2303, r76, r2305; +} +{ +add.f16x2 r2306, r2282, r2303; +} +{ +cvt.rn.f16.f64 rs352, fd452; +} +mov.b32 r2311, {rs352, rs352}; +{ +mul.f16x2 r2309, r79, r2311; +} +{ +add.f16x2 r2312, r2288, r2309; +} +{ +cvt.rn.f16.f64 rs353, fd479; +} +mov.b32 r2317, {rs353, rs353}; +{ +mul.f16x2 r2315, r85, r2317; +} +{ +add.f16x2 r2318, r2294, r2315; +} +{ +cvt.rn.f16.f64 rs354, fd480; +} +mov.b32 r2323, {rs354, rs354}; +{ +mul.f16x2 r2321, r94, r2323; +} +{ +add.f16x2 r2324, r2300, r2321; +} +{ +cvt.rn.f16.f64 rs355, fd479; +} +mov.b32 r2329, {rs355, rs355}; +{ +mul.f16x2 r2327, r88, r2329; +} +{ +add.f16x2 r2330, r2306, r2327; +} +{ +cvt.rn.f16.f64 rs356, fd480; +} +mov.b32 r2335, {rs356, rs356}; +{ +mul.f16x2 r2333, r91, r2335; +} +{ +add.f16x2 r2336, r2312, r2333; +} +{ +cvt.rn.f16.f64 rs357, fd463; +} +mov.b32 r2341, {rs357, rs357}; +{ +mul.f16x2 r2339, r97, r2341; +} +{ +add.f16x2 r2342, r2318, r2339; +} +{ +cvt.rn.f16.f64 rs358, fd384; +} +mov.b32 r2347, {rs358, rs358}; +{ +mul.f16x2 r2345, r106, r2347; +} +{ +add.f16x2 r2348, r2324, r2345; +} +{ +cvt.rn.f16.f64 rs359, fd463; +} +mov.b32 r2353, {rs359, rs359}; +{ +mul.f16x2 r2351, r100, r2353; +} +{ +add.f16x2 r2354, r2330, r2351; +} +{ +cvt.rn.f16.f64 rs360, fd384; +} +mov.b32 r2359, {rs360, rs360}; +{ +mul.f16x2 r2357, r103, r2359; +} +{ +add.f16x2 r2360, r2336, r2357; +} +{ +cvt.rn.f16.f64 rs361, fd443; +} +mov.b32 r2365, {rs361, rs361}; +{ +mul.f16x2 r2363, r109, r2365; +} +{ +add.f16x2 r2366, r2342, r2363; +} +{ +cvt.rn.f16.f64 rs362, fd444; +} +mov.b32 r2371, {rs362, rs362}; +{ +mul.f16x2 r2369, r118, r2371; +} +{ +add.f16x2 r2372, r2348, r2369; +} +{ +cvt.rn.f16.f64 rs363, fd443; +} +mov.b32 r2377, {rs363, rs363}; +{ +mul.f16x2 r2375, r112, r2377; +} +{ +add.f16x2 r2378, r2354, r2375; +} +{ +cvt.rn.f16.f64 rs364, fd444; +} +mov.b32 r2383, {rs364, rs364}; +{ +mul.f16x2 r2381, r115, r2383; +} +{ +add.f16x2 r2384, r2360, r2381; +} +{ +cvt.rn.f16.f64 rs365, fd471; +} +mov.b32 r2389, {rs365, rs365}; +{ +mul.f16x2 
r2387, r121, r2389; +} +{ +add.f16x2 r2390, r2366, r2387; +} +{ +cvt.rn.f16.f64 rs366, fd472; +} +mov.b32 r2395, {rs366, rs366}; +{ +mul.f16x2 r2393, r130, r2395; +} +{ +add.f16x2 r2396, r2372, r2393; +} +{ +cvt.rn.f16.f64 rs367, fd471; +} +mov.b32 r2401, {rs367, rs367}; +{ +mul.f16x2 r2399, r124, r2401; +} +{ +add.f16x2 r2402, r2378, r2399; +} +{ +cvt.rn.f16.f64 rs368, fd472; +} +mov.b32 r2407, {rs368, rs368}; +{ +mul.f16x2 r2405, r127, r2407; +} +{ +add.f16x2 r2408, r2384, r2405; +} +{ +sub.f16x2 %16, r2390, r2396; +} +{ +add.f16x2 %17, r2402, r2408; +} +{ +add.f16x2 %30, r2390, r2396; +} +{ +sub.f16x2 %31, r2402, r2408; +} +cvt.rn.f16.s32 rs369, r2980; +mov.b32 r2435, {rs369, rs369}; +cvt.rn.f16.s32 rs370, r2980; +mov.b32 r2447, {rs370, rs370}; +{ +cvt.rn.f16.f64 rs371, fd459; +} +mov.b32 r2427, {rs371, rs371}; +{ +mul.f16x2 r2425, r1, r2427; +} +{ +add.f16x2 r2428, %46, r2425; +} +{ +cvt.rn.f16.f64 rs372, fd460; +} +mov.b32 r2433, {rs372, rs372}; +{ +mul.f16x2 r2431, r10, r2433; +} +{ +add.f16x2 r2434, r2435, r2431; +} +{ +cvt.rn.f16.f64 rs373, fd459; +} +mov.b32 r2439, {rs373, rs373}; +{ +mul.f16x2 r2437, r4, r2439; +} +{ +add.f16x2 r2440, %47, r2437; +} +{ +cvt.rn.f16.f64 rs374, fd460; +} +mov.b32 r2445, {rs374, rs374}; +{ +mul.f16x2 r2443, r7, r2445; +} +{ +add.f16x2 r2446, r2447, r2443; +} +{ +cvt.rn.f16.f64 rs375, fd479; +} +mov.b32 r2451, {rs375, rs375}; +{ +mul.f16x2 r2449, r13, r2451; +} +{ +add.f16x2 r2452, r2428, r2449; +} +{ +cvt.rn.f16.f64 rs376, fd480; +} +mov.b32 r2457, {rs376, rs376}; +{ +mul.f16x2 r2455, r22, r2457; +} +{ +add.f16x2 r2458, r2434, r2455; +} +{ +cvt.rn.f16.f64 rs377, fd479; +} +mov.b32 r2463, {rs377, rs377}; +{ +mul.f16x2 r2461, r16, r2463; +} +{ +add.f16x2 r2464, r2440, r2461; +} +{ +cvt.rn.f16.f64 rs378, fd480; +} +mov.b32 r2469, {rs378, rs378}; +{ +mul.f16x2 r2467, r19, r2469; +} +{ +add.f16x2 r2470, r2446, r2467; +} +{ +cvt.rn.f16.f64 rs379, fd471; +} +mov.b32 r2475, {rs379, rs379}; +{ +mul.f16x2 r2473, r25, r2475; +} +{ 
+add.f16x2 r2476, r2452, r2473; +} +{ +cvt.rn.f16.f64 rs380, fd416; +} +mov.b32 r2481, {rs380, rs380}; +{ +mul.f16x2 r2479, r34, r2481; +} +{ +add.f16x2 r2482, r2458, r2479; +} +{ +cvt.rn.f16.f64 rs381, fd471; +} +mov.b32 r2487, {rs381, rs381}; +{ +mul.f16x2 r2485, r28, r2487; +} +{ +add.f16x2 r2488, r2464, r2485; +} +{ +cvt.rn.f16.f64 rs382, fd416; +} +mov.b32 r2493, {rs382, rs382}; +{ +mul.f16x2 r2491, r31, r2493; +} +{ +add.f16x2 r2494, r2470, r2491; +} +{ +cvt.rn.f16.f64 rs383, fd451; +} +mov.b32 r2499, {rs383, rs383}; +{ +mul.f16x2 r2497, r37, r2499; +} +{ +add.f16x2 r2500, r2476, r2497; +} +{ +cvt.rn.f16.f64 rs384, fd368; +} +mov.b32 r2505, {rs384, rs384}; +{ +mul.f16x2 r2503, r46, r2505; +} +{ +add.f16x2 r2506, r2482, r2503; +} +{ +cvt.rn.f16.f64 rs385, fd451; +} +mov.b32 r2511, {rs385, rs385}; +{ +mul.f16x2 r2509, r40, r2511; +} +{ +add.f16x2 r2512, r2488, r2509; +} +{ +cvt.rn.f16.f64 rs386, fd368; +} +mov.b32 r2517, {rs386, rs386}; +{ +mul.f16x2 r2515, r43, r2517; +} +{ +add.f16x2 r2518, r2494, r2515; +} +{ +cvt.rn.f16.f64 rs387, fd447; +} +mov.b32 r2523, {rs387, rs387}; +{ +mul.f16x2 r2521, r49, r2523; +} +{ +add.f16x2 r2524, r2500, r2521; +} +{ +cvt.rn.f16.f64 rs388, fd448; +} +mov.b32 r2529, {rs388, rs388}; +{ +mul.f16x2 r2527, r58, r2529; +} +{ +add.f16x2 r2530, r2506, r2527; +} +{ +cvt.rn.f16.f64 rs389, fd447; +} +mov.b32 r2535, {rs389, rs389}; +{ +mul.f16x2 r2533, r52, r2535; +} +{ +add.f16x2 r2536, r2512, r2533; +} +{ +cvt.rn.f16.f64 rs390, fd448; +} +mov.b32 r2541, {rs390, rs390}; +{ +mul.f16x2 r2539, r55, r2541; +} +{ +add.f16x2 r2542, r2518, r2539; +} +{ +cvt.rn.f16.f64 rs391, fd467; +} +mov.b32 r2547, {rs391, rs391}; +{ +mul.f16x2 r2545, r61, r2547; +} +{ +add.f16x2 r2548, r2524, r2545; +} +{ +cvt.rn.f16.f64 rs392, fd468; +} +mov.b32 r2553, {rs392, rs392}; +{ +mul.f16x2 r2551, r70, r2553; +} +{ +add.f16x2 r2554, r2530, r2551; +} +{ +cvt.rn.f16.f64 rs393, fd467; +} +mov.b32 r2559, {rs393, rs393}; +{ +mul.f16x2 r2557, r64, r2559; +} +{ +add.f16x2 
r2560, r2536, r2557; +} +{ +cvt.rn.f16.f64 rs394, fd468; +} +mov.b32 r2565, {rs394, rs394}; +{ +mul.f16x2 r2563, r67, r2565; +} +{ +add.f16x2 r2566, r2542, r2563; +} +{ +cvt.rn.f16.f64 rs395, fd483; +} +mov.b32 r2571, {rs395, rs395}; +{ +mul.f16x2 r2569, r73, r2571; +} +{ +add.f16x2 r2572, r2548, r2569; +} +{ +cvt.rn.f16.f64 rs396, fd412; +} +mov.b32 r2577, {rs396, rs396}; +{ +mul.f16x2 r2575, r82, r2577; +} +{ +add.f16x2 r2578, r2554, r2575; +} +{ +cvt.rn.f16.f64 rs397, fd483; +} +mov.b32 r2583, {rs397, rs397}; +{ +mul.f16x2 r2581, r76, r2583; +} +{ +add.f16x2 r2584, r2560, r2581; +} +{ +cvt.rn.f16.f64 rs398, fd412; +} +mov.b32 r2589, {rs398, rs398}; +{ +mul.f16x2 r2587, r79, r2589; +} +{ +add.f16x2 r2590, r2566, r2587; +} +{ +cvt.rn.f16.f64 rs399, fd463; +} +mov.b32 r2595, {rs399, rs399}; +{ +mul.f16x2 r2593, r85, r2595; +} +{ +add.f16x2 r2596, r2572, r2593; +} +{ +cvt.rn.f16.f64 rs400, fd384; +} +mov.b32 r2601, {rs400, rs400}; +{ +mul.f16x2 r2599, r94, r2601; +} +{ +add.f16x2 r2602, r2578, r2599; +} +{ +cvt.rn.f16.f64 rs401, fd463; +} +mov.b32 r2607, {rs401, rs401}; +{ +mul.f16x2 r2605, r88, r2607; +} +{ +add.f16x2 r2608, r2584, r2605; +} +{ +cvt.rn.f16.f64 rs402, fd384; +} +mov.b32 r2613, {rs402, rs402}; +{ +mul.f16x2 r2611, r91, r2613; +} +{ +add.f16x2 r2614, r2590, r2611; +} +{ +cvt.rn.f16.f64 rs403, fd443; +} +mov.b32 r2619, {rs403, rs403}; +{ +mul.f16x2 r2617, r97, r2619; +} +{ +add.f16x2 r2620, r2596, r2617; +} +{ +cvt.rn.f16.f64 rs404, fd388; +} +mov.b32 r2625, {rs404, rs404}; +{ +mul.f16x2 r2623, r106, r2625; +} +{ +add.f16x2 r2626, r2602, r2623; +} +{ +cvt.rn.f16.f64 rs405, fd443; +} +mov.b32 r2631, {rs405, rs405}; +{ +mul.f16x2 r2629, r100, r2631; +} +{ +add.f16x2 r2632, r2608, r2629; +} +{ +cvt.rn.f16.f64 rs406, fd388; +} +mov.b32 r2637, {rs406, rs406}; +{ +mul.f16x2 r2635, r103, r2637; +} +{ +add.f16x2 r2638, r2614, r2635; +} +{ +cvt.rn.f16.f64 rs407, fd455; +} +mov.b32 r2643, {rs407, rs407}; +{ +mul.f16x2 r2641, r109, r2643; +} +{ +add.f16x2 r2644, 
r2620, r2641; +} +{ +cvt.rn.f16.f64 rs408, fd456; +} +mov.b32 r2649, {rs408, rs408}; +{ +mul.f16x2 r2647, r118, r2649; +} +{ +add.f16x2 r2650, r2626, r2647; +} +{ +cvt.rn.f16.f64 rs409, fd455; +} +mov.b32 r2655, {rs409, rs409}; +{ +mul.f16x2 r2653, r112, r2655; +} +{ +add.f16x2 r2656, r2632, r2653; +} +{ +cvt.rn.f16.f64 rs410, fd456; +} +mov.b32 r2661, {rs410, rs410}; +{ +mul.f16x2 r2659, r115, r2661; +} +{ +add.f16x2 r2662, r2638, r2659; +} +{ +cvt.rn.f16.f64 rs411, fd475; +} +mov.b32 r2667, {rs411, rs411}; +{ +mul.f16x2 r2665, r121, r2667; +} +{ +add.f16x2 r2668, r2644, r2665; +} +{ +cvt.rn.f16.f64 rs412, fd476; +} +mov.b32 r2673, {rs412, rs412}; +{ +mul.f16x2 r2671, r130, r2673; +} +{ +add.f16x2 r2674, r2650, r2671; +} +{ +cvt.rn.f16.f64 rs413, fd475; +} +mov.b32 r2679, {rs413, rs413}; +{ +mul.f16x2 r2677, r124, r2679; +} +{ +add.f16x2 r2680, r2656, r2677; +} +{ +cvt.rn.f16.f64 rs414, fd476; +} +mov.b32 r2685, {rs414, rs414}; +{ +mul.f16x2 r2683, r127, r2685; +} +{ +add.f16x2 r2686, r2662, r2683; +} +{ +sub.f16x2 %18, r2668, r2674; +} +{ +add.f16x2 %19, r2680, r2686; +} +{ +add.f16x2 %28, r2668, r2674; +} +{ +sub.f16x2 %29, r2680, r2686; +} +cvt.rn.f16.s32 rs415, r2980; +mov.b32 r2713, {rs415, rs415}; +cvt.rn.f16.s32 rs416, r2980; +mov.b32 r2725, {rs416, rs416}; +{ +cvt.rn.f16.f64 rs417, fd451; +} +mov.b32 r2705, {rs417, rs417}; +{ +mul.f16x2 r2703, r1, r2705; +} +{ +add.f16x2 r2706, %46, r2703; +} +{ +cvt.rn.f16.f64 rs418, fd452; +} +mov.b32 r2711, {rs418, rs418}; +{ +mul.f16x2 r2709, r10, r2711; +} +{ +add.f16x2 r2712, r2713, r2709; +} +{ +cvt.rn.f16.f64 rs419, fd451; +} +mov.b32 r2717, {rs419, rs419}; +{ +mul.f16x2 r2715, r4, r2717; +} +{ +add.f16x2 r2718, %47, r2715; +} +{ +cvt.rn.f16.f64 rs420, fd452; +} +mov.b32 r2723, {rs420, rs420}; +{ +mul.f16x2 r2721, r7, r2723; +} +{ +add.f16x2 r2724, r2725, r2721; +} +{ +cvt.rn.f16.f64 rs421, fd463; +} +mov.b32 r2729, {rs421, rs421}; +{ +mul.f16x2 r2727, r13, r2729; +} +{ +add.f16x2 r2730, r2706, r2727; +} +{ 
+cvt.rn.f16.f64 rs422, fd464; +} +mov.b32 r2735, {rs422, rs422}; +{ +mul.f16x2 r2733, r22, r2735; +} +{ +add.f16x2 r2736, r2712, r2733; +} +{ +cvt.rn.f16.f64 rs423, fd463; +} +mov.b32 r2741, {rs423, rs423}; +{ +mul.f16x2 r2739, r16, r2741; +} +{ +add.f16x2 r2742, r2718, r2739; +} +{ +cvt.rn.f16.f64 rs424, fd464; +} +mov.b32 r2747, {rs424, rs424}; +{ +mul.f16x2 r2745, r19, r2747; +} +{ +add.f16x2 r2748, r2724, r2745; +} +{ +cvt.rn.f16.f64 rs425, fd475; +} +mov.b32 r2753, {rs425, rs425}; +{ +mul.f16x2 r2751, r25, r2753; +} +{ +add.f16x2 r2754, r2730, r2751; +} +{ +cvt.rn.f16.f64 rs426, fd476; +} +mov.b32 r2759, {rs426, rs426}; +{ +mul.f16x2 r2757, r34, r2759; +} +{ +add.f16x2 r2760, r2736, r2757; +} +{ +cvt.rn.f16.f64 rs427, fd475; +} +mov.b32 r2765, {rs427, rs427}; +{ +mul.f16x2 r2763, r28, r2765; +} +{ +add.f16x2 r2766, r2742, r2763; +} +{ +cvt.rn.f16.f64 rs428, fd476; +} +mov.b32 r2771, {rs428, rs428}; +{ +mul.f16x2 r2769, r31, r2771; +} +{ +add.f16x2 r2772, r2748, r2769; +} +{ +cvt.rn.f16.f64 rs429, fd483; +} +mov.b32 r2777, {rs429, rs429}; +{ +mul.f16x2 r2775, r37, r2777; +} +{ +add.f16x2 r2778, r2754, r2775; +} +{ +cvt.rn.f16.f64 rs430, fd412; +} +mov.b32 r2783, {rs430, rs430}; +{ +mul.f16x2 r2781, r46, r2783; +} +{ +add.f16x2 r2784, r2760, r2781; +} +{ +cvt.rn.f16.f64 rs431, fd483; +} +mov.b32 r2789, {rs431, rs431}; +{ +mul.f16x2 r2787, r40, r2789; +} +{ +add.f16x2 r2790, r2766, r2787; +} +{ +cvt.rn.f16.f64 rs432, fd412; +} +mov.b32 r2795, {rs432, rs432}; +{ +mul.f16x2 r2793, r43, r2795; +} +{ +add.f16x2 r2796, r2772, r2793; +} +{ +cvt.rn.f16.f64 rs433, fd471; +} +mov.b32 r2801, {rs433, rs433}; +{ +mul.f16x2 r2799, r49, r2801; +} +{ +add.f16x2 r2802, r2778, r2799; +} +{ +cvt.rn.f16.f64 rs434, fd416; +} +mov.b32 r2807, {rs434, rs434}; +{ +mul.f16x2 r2805, r58, r2807; +} +{ +add.f16x2 r2808, r2784, r2805; +} +{ +cvt.rn.f16.f64 rs435, fd471; +} +mov.b32 r2813, {rs435, rs435}; +{ +mul.f16x2 r2811, r52, r2813; +} +{ +add.f16x2 r2814, r2790, r2811; +} +{ 
+cvt.rn.f16.f64 rs436, fd416; +} +mov.b32 r2819, {rs436, rs436}; +{ +mul.f16x2 r2817, r55, r2819; +} +{ +add.f16x2 r2820, r2796, r2817; +} +{ +cvt.rn.f16.f64 rs437, fd459; +} +mov.b32 r2825, {rs437, rs437}; +{ +mul.f16x2 r2823, r61, r2825; +} +{ +add.f16x2 r2826, r2802, r2823; +} +{ +cvt.rn.f16.f64 rs438, fd420; +} +mov.b32 r2831, {rs438, rs438}; +{ +mul.f16x2 r2829, r70, r2831; +} +{ +add.f16x2 r2832, r2808, r2829; +} +{ +cvt.rn.f16.f64 rs439, fd459; +} +mov.b32 r2837, {rs439, rs439}; +{ +mul.f16x2 r2835, r64, r2837; +} +{ +add.f16x2 r2838, r2814, r2835; +} +{ +cvt.rn.f16.f64 rs440, fd420; +} +mov.b32 r2843, {rs440, rs440}; +{ +mul.f16x2 r2841, r67, r2843; +} +{ +add.f16x2 r2844, r2820, r2841; +} +{ +cvt.rn.f16.f64 rs441, fd447; +} +mov.b32 r2849, {rs441, rs441}; +{ +mul.f16x2 r2847, r73, r2849; +} +{ +add.f16x2 r2850, r2826, r2847; +} +{ +cvt.rn.f16.f64 rs442, fd424; +} +mov.b32 r2855, {rs442, rs442}; +{ +mul.f16x2 r2853, r82, r2855; +} +{ +add.f16x2 r2856, r2832, r2853; +} +{ +cvt.rn.f16.f64 rs443, fd447; +} +mov.b32 r2861, {rs443, rs443}; +{ +mul.f16x2 r2859, r76, r2861; +} +{ +add.f16x2 r2862, r2838, r2859; +} +{ +cvt.rn.f16.f64 rs444, fd424; +} +mov.b32 r2867, {rs444, rs444}; +{ +mul.f16x2 r2865, r79, r2867; +} +{ +add.f16x2 r2868, r2844, r2865; +} +{ +cvt.rn.f16.f64 rs445, fd443; +} +mov.b32 r2873, {rs445, rs445}; +{ +mul.f16x2 r2871, r85, r2873; +} +{ +add.f16x2 r2874, r2850, r2871; +} +{ +cvt.rn.f16.f64 rs446, fd444; +} +mov.b32 r2879, {rs446, rs446}; +{ +mul.f16x2 r2877, r94, r2879; +} +{ +add.f16x2 r2880, r2856, r2877; +} +{ +cvt.rn.f16.f64 rs447, fd443; +} +mov.b32 r2885, {rs447, rs447}; +{ +mul.f16x2 r2883, r88, r2885; +} +{ +add.f16x2 r2886, r2862, r2883; +} +{ +cvt.rn.f16.f64 rs448, fd444; +} +mov.b32 r2891, {rs448, rs448}; +{ +mul.f16x2 r2889, r91, r2891; +} +{ +add.f16x2 r2892, r2868, r2889; +} +{ +cvt.rn.f16.f64 rs449, fd455; +} +mov.b32 r2897, {rs449, rs449}; +{ +mul.f16x2 r2895, r97, r2897; +} +{ +add.f16x2 r2898, r2874, r2895; +} +{ 
+cvt.rn.f16.f64 rs450, fd456; +} +mov.b32 r2903, {rs450, rs450}; +{ +mul.f16x2 r2901, r106, r2903; +} +{ +add.f16x2 r2904, r2880, r2901; +} +{ +cvt.rn.f16.f64 rs451, fd455; +} +mov.b32 r2909, {rs451, rs451}; +{ +mul.f16x2 r2907, r100, r2909; +} +{ +add.f16x2 r2910, r2886, r2907; +} +{ +cvt.rn.f16.f64 rs452, fd456; +} +mov.b32 r2915, {rs452, rs452}; +{ +mul.f16x2 r2913, r103, r2915; +} +{ +add.f16x2 r2916, r2892, r2913; +} +{ +cvt.rn.f16.f64 rs453, fd467; +} +mov.b32 r2921, {rs453, rs453}; +{ +mul.f16x2 r2919, r109, r2921; +} +{ +add.f16x2 r2922, r2898, r2919; +} +{ +cvt.rn.f16.f64 rs454, fd468; +} +mov.b32 r2927, {rs454, rs454}; +{ +mul.f16x2 r2925, r118, r2927; +} +{ +add.f16x2 r2928, r2904, r2925; +} +{ +cvt.rn.f16.f64 rs455, fd467; +} +mov.b32 r2933, {rs455, rs455}; +{ +mul.f16x2 r2931, r112, r2933; +} +{ +add.f16x2 r2934, r2910, r2931; +} +{ +cvt.rn.f16.f64 rs456, fd468; +} +mov.b32 r2939, {rs456, rs456}; +{ +mul.f16x2 r2937, r115, r2939; +} +{ +add.f16x2 r2940, r2916, r2937; +} +{ +cvt.rn.f16.f64 rs457, fd479; +} +mov.b32 r2945, {rs457, rs457}; +{ +mul.f16x2 r2943, r121, r2945; +} +{ +add.f16x2 r2946, r2922, r2943; +} +{ +cvt.rn.f16.f64 rs458, fd480; +} +mov.b32 r2951, {rs458, rs458}; +{ +mul.f16x2 r2949, r130, r2951; +} +{ +add.f16x2 r2952, r2928, r2949; +} +{ +cvt.rn.f16.f64 rs459, fd479; +} +mov.b32 r2957, {rs459, rs459}; +{ +mul.f16x2 r2955, r124, r2957; +} +{ +add.f16x2 r2958, r2934, r2955; +} +{ +cvt.rn.f16.f64 rs460, fd480; +} +mov.b32 r2963, {rs460, rs460}; +{ +mul.f16x2 r2961, r127, r2963; +} +{ +add.f16x2 r2964, r2940, r2961; +} +{ +sub.f16x2 %20, r2946, r2952; +} +{ +add.f16x2 %21, r2958, r2964; +} +{ +add.f16x2 %26, r2946, r2952; +} +{ +sub.f16x2 %27, r2958, r2964; +} +cvt.rn.f16.s32 rs461, r2980; +mov.b32 r2991, {rs461, rs461}; +cvt.rn.f16.s32 rs462, r2980; +mov.b32 r3003, {rs462, rs462}; +{ +cvt.rn.f16.f64 rs463, fd443; +} +mov.b32 r2983, {rs463, rs463}; +{ +mul.f16x2 r2981, r1, r2983; +} +{ +add.f16x2 r2984, %46, r2981; +} +{ +cvt.rn.f16.f64 
rs464, fd444; +} +mov.b32 r2989, {rs464, rs464}; +{ +mul.f16x2 r2987, r10, r2989; +} +{ +add.f16x2 r2990, r2991, r2987; +} +{ +cvt.rn.f16.f64 rs465, fd443; +} +mov.b32 r2995, {rs465, rs465}; +{ +mul.f16x2 r2993, r4, r2995; +} +{ +add.f16x2 r2996, %47, r2993; +} +{ +cvt.rn.f16.f64 rs466, fd444; +} +mov.b32 r3001, {rs466, rs466}; +{ +mul.f16x2 r2999, r7, r3001; +} +{ +add.f16x2 r3002, r3003, r2999; +} +{ +cvt.rn.f16.f64 rs467, fd447; +} +mov.b32 r3007, {rs467, rs467}; +{ +mul.f16x2 r3005, r13, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs468, fd448; +} +mov.b32 r3013, {rs468, rs468}; +{ +mul.f16x2 r3011, r22, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs469, fd447; +} +mov.b32 r3019, {rs469, rs469}; +{ +mul.f16x2 r3017, r16, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs470, fd448; +} +mov.b32 r3025, {rs470, rs470}; +{ +mul.f16x2 r3023, r19, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs471, fd451; +} +mov.b32 r3031, {rs471, rs471}; +{ +mul.f16x2 r3029, r25, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs472, fd452; +} +mov.b32 r3037, {rs472, rs472}; +{ +mul.f16x2 r3035, r34, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} +{ +cvt.rn.f16.f64 rs473, fd451; +} +mov.b32 r3043, {rs473, rs473}; +{ +mul.f16x2 r3041, r28, r3043; +} +{ +add.f16x2 r3044, r3020, r3041; +} +{ +cvt.rn.f16.f64 rs474, fd452; +} +mov.b32 r3049, {rs474, rs474}; +{ +mul.f16x2 r3047, r31, r3049; +} +{ +add.f16x2 r3050, r3026, r3047; +} +{ +cvt.rn.f16.f64 rs475, fd455; +} +mov.b32 r3055, {rs475, rs475}; +{ +mul.f16x2 r3053, r37, r3055; +} +{ +add.f16x2 r3056, r3032, r3053; +} +{ +cvt.rn.f16.f64 rs476, fd456; +} +mov.b32 r3061, {rs476, rs476}; +{ +mul.f16x2 r3059, r46, r3061; +} +{ +add.f16x2 r3062, r3038, r3059; +} +{ +cvt.rn.f16.f64 rs477, fd455; +} +mov.b32 r3067, {rs477, rs477}; +{ +mul.f16x2 r3065, r40, r3067; +} +{ +add.f16x2 r3068, r3044, r3065; +} +{ +cvt.rn.f16.f64 rs478, fd456; 
+} +mov.b32 r3073, {rs478, rs478}; +{ +mul.f16x2 r3071, r43, r3073; +} +{ +add.f16x2 r3074, r3050, r3071; +} +{ +cvt.rn.f16.f64 rs479, fd459; +} +mov.b32 r3079, {rs479, rs479}; +{ +mul.f16x2 r3077, r49, r3079; +} +{ +add.f16x2 r3080, r3056, r3077; +} +{ +cvt.rn.f16.f64 rs480, fd460; +} +mov.b32 r3085, {rs480, rs480}; +{ +mul.f16x2 r3083, r58, r3085; +} +{ +add.f16x2 r3086, r3062, r3083; +} +{ +cvt.rn.f16.f64 rs481, fd459; +} +mov.b32 r3091, {rs481, rs481}; +{ +mul.f16x2 r3089, r52, r3091; +} +{ +add.f16x2 r3092, r3068, r3089; +} +{ +cvt.rn.f16.f64 rs482, fd460; +} +mov.b32 r3097, {rs482, rs482}; +{ +mul.f16x2 r3095, r55, r3097; +} +{ +add.f16x2 r3098, r3074, r3095; +} +{ +cvt.rn.f16.f64 rs483, fd463; +} +mov.b32 r3103, {rs483, rs483}; +{ +mul.f16x2 r3101, r61, r3103; +} +{ +add.f16x2 r3104, r3080, r3101; +} +{ +cvt.rn.f16.f64 rs484, fd464; +} +mov.b32 r3109, {rs484, rs484}; +{ +mul.f16x2 r3107, r70, r3109; +} +{ +add.f16x2 r3110, r3086, r3107; +} +{ +cvt.rn.f16.f64 rs485, fd463; +} +mov.b32 r3115, {rs485, rs485}; +{ +mul.f16x2 r3113, r64, r3115; +} +{ +add.f16x2 r3116, r3092, r3113; +} +{ +cvt.rn.f16.f64 rs486, fd464; +} +mov.b32 r3121, {rs486, rs486}; +{ +mul.f16x2 r3119, r67, r3121; +} +{ +add.f16x2 r3122, r3098, r3119; +} +{ +cvt.rn.f16.f64 rs487, fd467; +} +mov.b32 r3127, {rs487, rs487}; +{ +mul.f16x2 r3125, r73, r3127; +} +{ +add.f16x2 r3128, r3104, r3125; +} +{ +cvt.rn.f16.f64 rs488, fd468; +} +mov.b32 r3133, {rs488, rs488}; +{ +mul.f16x2 r3131, r82, r3133; +} +{ +add.f16x2 r3134, r3110, r3131; +} +{ +cvt.rn.f16.f64 rs489, fd467; +} +mov.b32 r3139, {rs489, rs489}; +{ +mul.f16x2 r3137, r76, r3139; +} +{ +add.f16x2 r3140, r3116, r3137; +} +{ +cvt.rn.f16.f64 rs490, fd468; +} +mov.b32 r3145, {rs490, rs490}; +{ +mul.f16x2 r3143, r79, r3145; +} +{ +add.f16x2 r3146, r3122, r3143; +} +{ +cvt.rn.f16.f64 rs491, fd471; +} +mov.b32 r3151, {rs491, rs491}; +{ +mul.f16x2 r3149, r85, r3151; +} +{ +add.f16x2 r3152, r3128, r3149; +} +{ +cvt.rn.f16.f64 rs492, fd472; +} +mov.b32 
r3157, {rs492, rs492}; +{ +mul.f16x2 r3155, r94, r3157; +} +{ +add.f16x2 r3158, r3134, r3155; +} +{ +cvt.rn.f16.f64 rs493, fd471; +} +mov.b32 r3163, {rs493, rs493}; +{ +mul.f16x2 r3161, r88, r3163; +} +{ +add.f16x2 r3164, r3140, r3161; +} +{ +cvt.rn.f16.f64 rs494, fd472; +} +mov.b32 r3169, {rs494, rs494}; +{ +mul.f16x2 r3167, r91, r3169; +} +{ +add.f16x2 r3170, r3146, r3167; +} +{ +cvt.rn.f16.f64 rs495, fd475; +} +mov.b32 r3175, {rs495, rs495}; +{ +mul.f16x2 r3173, r97, r3175; +} +{ +add.f16x2 r3176, r3152, r3173; +} +{ +cvt.rn.f16.f64 rs496, fd476; +} +mov.b32 r3181, {rs496, rs496}; +{ +mul.f16x2 r3179, r106, r3181; +} +{ +add.f16x2 r3182, r3158, r3179; +} +{ +cvt.rn.f16.f64 rs497, fd475; +} +mov.b32 r3187, {rs497, rs497}; +{ +mul.f16x2 r3185, r100, r3187; +} +{ +add.f16x2 r3188, r3164, r3185; +} +{ +cvt.rn.f16.f64 rs498, fd476; +} +mov.b32 r3193, {rs498, rs498}; +{ +mul.f16x2 r3191, r103, r3193; +} +{ +add.f16x2 r3194, r3170, r3191; +} +{ +cvt.rn.f16.f64 rs499, fd479; +} +mov.b32 r3199, {rs499, rs499}; +{ +mul.f16x2 r3197, r109, r3199; +} +{ +add.f16x2 r3200, r3176, r3197; +} +{ +cvt.rn.f16.f64 rs500, fd480; +} +mov.b32 r3205, {rs500, rs500}; +{ +mul.f16x2 r3203, r118, r3205; +} +{ +add.f16x2 r3206, r3182, r3203; +} +{ +cvt.rn.f16.f64 rs501, fd479; +} +mov.b32 r3211, {rs501, rs501}; +{ +mul.f16x2 r3209, r112, r3211; +} +{ +add.f16x2 r3212, r3188, r3209; +} +{ +cvt.rn.f16.f64 rs502, fd480; +} +mov.b32 r3217, {rs502, rs502}; +{ +mul.f16x2 r3215, r115, r3217; +} +{ +add.f16x2 r3218, r3194, r3215; +} +{ +cvt.rn.f16.f64 rs503, fd483; +} +mov.b32 r3223, {rs503, rs503}; +{ +mul.f16x2 r3221, r121, r3223; +} +{ +add.f16x2 r3224, r3200, r3221; +} +{ +cvt.rn.f16.f64 rs504, fd484; +} +mov.b32 r3229, {rs504, rs504}; +{ +mul.f16x2 r3227, r130, r3229; +} +{ +add.f16x2 r3230, r3206, r3227; +} +{ +cvt.rn.f16.f64 rs505, fd483; +} +mov.b32 r3235, {rs505, rs505}; +{ +mul.f16x2 r3233, r124, r3235; +} +{ +add.f16x2 r3236, r3212, r3233; +} +{ +cvt.rn.f16.f64 rs506, fd484; +} +mov.b32 
r3241, {rs506, rs506}; +{ +mul.f16x2 r3239, r127, r3241; +} +{ +add.f16x2 r3242, r3218, r3239; +} +{ +sub.f16x2 %22, r3224, r3230; +} +{ +add.f16x2 %23, r3236, r3242; +} +{ +add.f16x2 %24, r3224, r3230; +} +{ +sub.f16x2 %25, r3236, r3242; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), 
"r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..73f9a2e9b8d9e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_fwd.hpp.inc @@ -0,0 +1,610 @@ +#ifndef CUFFTDX_FFT_23_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_23_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<9, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<758>; +.reg .b64 rd<4>; +add.f32 f93, 
%48, %91; +sub.f32 f95, %48, %91; +add.f32 f757, %50, %92; +sub.f32 f96, %50, %92; +add.f32 f97, %51, %89; +sub.f32 f99, %51, %89; +add.f32 f755, %93, %90; +sub.f32 f100, %93, %90; +add.f32 f101, %53, %87; +sub.f32 f103, %53, %87; +add.f32 f753, %54, %94; +sub.f32 f104, %54, %94; +add.f32 f105, %55, %85; +sub.f32 f107, %55, %85; +add.f32 f750, %95, %96; +sub.f32 f108, %95, %96; +add.f32 f109, %57, %83; +sub.f32 f111, %57, %83; +add.f32 f748, %97, %84; +sub.f32 f112, %97, %84; +add.f32 f113, %59, %81; +sub.f32 f115, %59, %81; +add.f32 f746, %60, %98; +sub.f32 f116, %60, %98; +add.f32 f117, %61, %79; +sub.f32 f119, %61, %79; +add.f32 f743, %100, %99; +sub.f32 f120, %100, %99; +add.f32 f121, %63, %77; +sub.f32 f123, %63, %77; +add.f32 f741, %101, %78; +sub.f32 f124, %101, %78; +add.f32 f125, %65, %75; +sub.f32 f127, %65, %75; +add.f32 f739, %66, %102; +sub.f32 f128, %66, %102; +add.f32 f129, %67, %73; +sub.f32 f131, %67, %73; +add.f32 f736, %103, %104; +sub.f32 f132, %103, %104; +add.f32 f133, %69, %71; +sub.f32 f135, %69, %71; +add.f32 f734, %105, %72; +sub.f32 f136, %105, %72; +add.f32 f137, %46, f93; +add.f32 f139, f137, f97; +add.f32 f733, %47, f757; +add.f32 f140, f733, f755; +add.f32 f141, f139, f101; +add.f32 f142, f140, f753; +add.f32 f143, f141, f105; +add.f32 f144, f142, f750; +add.f32 f145, f143, f109; +add.f32 f146, f144, f748; +add.f32 f147, f145, f113; +add.f32 f148, f146, f746; +add.f32 f149, f147, f117; +add.f32 f150, f148, f743; +add.f32 f151, f149, f121; +add.f32 f152, f150, f741; +add.f32 f153, f151, f125; +add.f32 f154, f152, f739; +add.f32 f155, f153, f129; +add.f32 f156, f154, f736; +fma.rn.f32 f157, f93, 0f3F7681BF, %46; +fma.rn.f32 f161, f97, 0f3F5ABB3B, f157; +fma.rn.f32 f732, f96, 0fBE8A22CD, 0f00000000; +fma.rn.f32 f162, f100, 0fBF050374, f732; +fma.rn.f32 f731, f757, 0f3F7681BF, %47; +fma.rn.f32 f163, f755, 0f3F5ABB3B, f731; +fma.rn.f32 f730, f95, 0fBE8A22CD, 0f00000000; +fma.rn.f32 f164, f99, 0fBF050374, f730; +fma.rn.f32 f165, f101, 
0f3F2EBBCE, f161; +fma.rn.f32 f166, f104, 0fBF3B1811, f162; +fma.rn.f32 f167, f753, 0f3F2EBBCE, f163; +fma.rn.f32 f168, f103, 0fBF3B1811, f164; +fma.rn.f32 f169, f105, 0f3EEB8DA5, f165; +fma.rn.f32 f170, f108, 0fBF634C72, f166; +fma.rn.f32 f171, f750, 0f3EEB8DA5, f167; +fma.rn.f32 f172, f107, 0fBF634C72, f168; +fma.rn.f32 f173, f109, 0f3E5056C6, f169; +fma.rn.f32 f174, f112, 0fBF7AA541, f170; +fma.rn.f32 f175, f748, 0f3E5056C6, f171; +fma.rn.f32 f176, f111, 0fBF7AA541, f172; +fma.rn.f32 f177, f113, 0fBD8BC2AE, f173; +fma.rn.f32 f178, f116, 0fBF7F6738, f174; +fma.rn.f32 f179, f746, 0fBD8BC2AE, f175; +fma.rn.f32 f180, f115, 0fBF7F6738, f176; +fma.rn.f32 f181, f117, 0fBEAB7557, f177; +fma.rn.f32 f182, f120, 0fBF713803, f178; +fma.rn.f32 f183, f743, 0fBEAB7557, f179; +fma.rn.f32 f184, f119, 0fBF713803, f180; +fma.rn.f32 f185, f121, 0fBF13A152, f181; +fma.rn.f32 f186, f124, 0fBF5124F0, f182; +fma.rn.f32 f187, f741, 0fBF13A152, f183; +fma.rn.f32 f188, f123, 0fBF5124F0, f184; +fma.rn.f32 f189, f125, 0fBF469504, f185; +fma.rn.f32 f190, f128, 0fBF218EFB, f186; +fma.rn.f32 f191, f739, 0fBF469504, f187; +fma.rn.f32 f192, f127, 0fBF218EFB, f188; +fma.rn.f32 f193, f129, 0fBF6ACE5C, f189; +fma.rn.f32 f194, f132, 0fBECBFB3A, f190; +fma.rn.f32 f195, f736, 0fBF6ACE5C, f191; +fma.rn.f32 f196, f131, 0fBECBFB3A, f192; +fma.rn.f32 f197, f133, 0fBF7D9D98, f193; +fma.rn.f32 f198, f136, 0fBE0B6F45, f194; +fma.rn.f32 f199, f734, 0fBF7D9D98, f195; +fma.rn.f32 f200, f135, 0fBE0B6F45, f196; +fma.rn.f32 f201, f93, 0f3F5ABB3B, %46; +fma.rn.f32 f205, f97, 0f3EEB8DA5, f201; +fma.rn.f32 f729, f96, 0fBF050374, 0f00000000; +fma.rn.f32 f206, f100, 0fBF634C72, f729; +fma.rn.f32 f728, f757, 0f3F5ABB3B, %47; +fma.rn.f32 f207, f755, 0f3EEB8DA5, f728; +fma.rn.f32 f727, f95, 0fBF050374, 0f00000000; +fma.rn.f32 f208, f99, 0fBF634C72, f727; +fma.rn.f32 f209, f101, 0fBD8BC2AE, f205; +fma.rn.f32 f210, f104, 0fBF7F6738, f206; +fma.rn.f32 f211, f753, 0fBD8BC2AE, f207; +fma.rn.f32 f212, f103, 0fBF7F6738, f208; 
+fma.rn.f32 f213, f105, 0fBF13A152, f209; +fma.rn.f32 f214, f108, 0fBF5124F0, f210; +fma.rn.f32 f215, f750, 0fBF13A152, f211; +fma.rn.f32 f216, f107, 0fBF5124F0, f212; +fma.rn.f32 f217, f109, 0fBF6ACE5C, f213; +fma.rn.f32 f218, f112, 0fBECBFB3A, f214; +fma.rn.f32 f219, f748, 0fBF6ACE5C, f215; +fma.rn.f32 f220, f111, 0fBECBFB3A, f216; +fma.rn.f32 f221, f113, 0fBF7D9D98, f217; +fma.rn.f32 f222, f116, 0f3E0B6F45, f218; +fma.rn.f32 f223, f746, 0fBF7D9D98, f219; +fma.rn.f32 f224, f115, 0f3E0B6F45, f220; +fma.rn.f32 f225, f117, 0fBF469504, f221; +fma.rn.f32 f226, f120, 0f3F218EFB, f222; +fma.rn.f32 f227, f743, 0fBF469504, f223; +fma.rn.f32 f228, f119, 0f3F218EFB, f224; +fma.rn.f32 f229, f121, 0fBEAB7557, f225; +fma.rn.f32 f230, f124, 0f3F713803, f226; +fma.rn.f32 f231, f741, 0fBEAB7557, f227; +fma.rn.f32 f232, f123, 0f3F713803, f228; +fma.rn.f32 f233, f125, 0f3E5056C6, f229; +fma.rn.f32 f234, f128, 0f3F7AA541, f230; +fma.rn.f32 f235, f739, 0f3E5056C6, f231; +fma.rn.f32 f236, f127, 0f3F7AA541, f232; +fma.rn.f32 f237, f129, 0f3F2EBBCE, f233; +fma.rn.f32 f238, f132, 0f3F3B1811, f234; +fma.rn.f32 f239, f736, 0f3F2EBBCE, f235; +fma.rn.f32 f240, f131, 0f3F3B1811, f236; +fma.rn.f32 f241, f133, 0f3F7681BF, f237; +fma.rn.f32 f242, f136, 0f3E8A22CD, f238; +fma.rn.f32 f243, f734, 0f3F7681BF, f239; +fma.rn.f32 f244, f135, 0f3E8A22CD, f240; +fma.rn.f32 f245, f93, 0f3F2EBBCE, %46; +fma.rn.f32 f249, f97, 0fBD8BC2AE, f245; +fma.rn.f32 f726, f96, 0fBF3B1811, 0f00000000; +fma.rn.f32 f250, f100, 0fBF7F6738, f726; +fma.rn.f32 f725, f757, 0f3F2EBBCE, %47; +fma.rn.f32 f251, f755, 0fBD8BC2AE, f725; +fma.rn.f32 f724, f95, 0fBF3B1811, 0f00000000; +fma.rn.f32 f252, f99, 0fBF7F6738, f724; +fma.rn.f32 f253, f101, 0fBF469504, f249; +fma.rn.f32 f254, f104, 0fBF218EFB, f250; +fma.rn.f32 f255, f753, 0fBF469504, f251; +fma.rn.f32 f256, f103, 0fBF218EFB, f252; +fma.rn.f32 f257, f105, 0fBF7D9D98, f253; +fma.rn.f32 f258, f108, 0f3E0B6F45, f254; +fma.rn.f32 f259, f750, 0fBF7D9D98, f255; +fma.rn.f32 f260, 
f107, 0f3E0B6F45, f256; +fma.rn.f32 f261, f109, 0fBF13A152, f257; +fma.rn.f32 f262, f112, 0f3F5124F0, f258; +fma.rn.f32 f263, f748, 0fBF13A152, f259; +fma.rn.f32 f264, f111, 0f3F5124F0, f260; +fma.rn.f32 f265, f113, 0f3E5056C6, f261; +fma.rn.f32 f266, f116, 0f3F7AA541, f262; +fma.rn.f32 f267, f746, 0f3E5056C6, f263; +fma.rn.f32 f268, f115, 0f3F7AA541, f264; +fma.rn.f32 f269, f117, 0f3F5ABB3B, f265; +fma.rn.f32 f270, f120, 0f3F050374, f266; +fma.rn.f32 f271, f743, 0f3F5ABB3B, f267; +fma.rn.f32 f272, f119, 0f3F050374, f268; +fma.rn.f32 f273, f121, 0f3F7681BF, f269; +fma.rn.f32 f274, f124, 0fBE8A22CD, f270; +fma.rn.f32 f275, f741, 0f3F7681BF, f271; +fma.rn.f32 f276, f123, 0fBE8A22CD, f272; +fma.rn.f32 f277, f125, 0f3EEB8DA5, f273; +fma.rn.f32 f278, f128, 0fBF634C72, f274; +fma.rn.f32 f279, f739, 0f3EEB8DA5, f275; +fma.rn.f32 f280, f127, 0fBF634C72, f276; +fma.rn.f32 f281, f129, 0fBEAB7557, f277; +fma.rn.f32 f282, f132, 0fBF713803, f278; +fma.rn.f32 f283, f736, 0fBEAB7557, f279; +fma.rn.f32 f284, f131, 0fBF713803, f280; +fma.rn.f32 f285, f133, 0fBF6ACE5C, f281; +fma.rn.f32 f286, f136, 0fBECBFB3A, f282; +fma.rn.f32 f287, f734, 0fBF6ACE5C, f283; +fma.rn.f32 f288, f135, 0fBECBFB3A, f284; +fma.rn.f32 f289, f93, 0f3EEB8DA5, %46; +fma.rn.f32 f293, f97, 0fBF13A152, f289; +fma.rn.f32 f723, f96, 0fBF634C72, 0f00000000; +fma.rn.f32 f294, f100, 0fBF5124F0, f723; +fma.rn.f32 f722, f757, 0f3EEB8DA5, %47; +fma.rn.f32 f295, f755, 0fBF13A152, f722; +fma.rn.f32 f721, f95, 0fBF634C72, 0f00000000; +fma.rn.f32 f296, f99, 0fBF5124F0, f721; +fma.rn.f32 f297, f101, 0fBF7D9D98, f293; +fma.rn.f32 f298, f104, 0f3E0B6F45, f294; +fma.rn.f32 f299, f753, 0fBF7D9D98, f295; +fma.rn.f32 f300, f103, 0f3E0B6F45, f296; +fma.rn.f32 f301, f105, 0fBEAB7557, f297; +fma.rn.f32 f302, f108, 0f3F713803, f298; +fma.rn.f32 f303, f750, 0fBEAB7557, f299; +fma.rn.f32 f304, f107, 0f3F713803, f300; +fma.rn.f32 f305, f109, 0f3F2EBBCE, f301; +fma.rn.f32 f306, f112, 0f3F3B1811, f302; +fma.rn.f32 f307, f748, 0f3F2EBBCE, 
f303; +fma.rn.f32 f308, f111, 0f3F3B1811, f304; +fma.rn.f32 f309, f113, 0f3F7681BF, f305; +fma.rn.f32 f310, f116, 0fBE8A22CD, f306; +fma.rn.f32 f311, f746, 0f3F7681BF, f307; +fma.rn.f32 f312, f115, 0fBE8A22CD, f308; +fma.rn.f32 f313, f117, 0f3E5056C6, f309; +fma.rn.f32 f314, f120, 0fBF7AA541, f310; +fma.rn.f32 f315, f743, 0f3E5056C6, f311; +fma.rn.f32 f316, f119, 0fBF7AA541, f312; +fma.rn.f32 f317, f121, 0fBF469504, f313; +fma.rn.f32 f318, f124, 0fBF218EFB, f314; +fma.rn.f32 f319, f741, 0fBF469504, f315; +fma.rn.f32 f320, f123, 0fBF218EFB, f316; +fma.rn.f32 f321, f125, 0fBF6ACE5C, f317; +fma.rn.f32 f322, f128, 0f3ECBFB3A, f318; +fma.rn.f32 f323, f739, 0fBF6ACE5C, f319; +fma.rn.f32 f324, f127, 0f3ECBFB3A, f320; +fma.rn.f32 f325, f129, 0fBD8BC2AE, f321; +fma.rn.f32 f326, f132, 0f3F7F6738, f322; +fma.rn.f32 f327, f736, 0fBD8BC2AE, f323; +fma.rn.f32 f328, f131, 0f3F7F6738, f324; +fma.rn.f32 f329, f133, 0f3F5ABB3B, f325; +fma.rn.f32 f330, f136, 0f3F050374, f326; +fma.rn.f32 f331, f734, 0f3F5ABB3B, f327; +fma.rn.f32 f332, f135, 0f3F050374, f328; +fma.rn.f32 f333, f93, 0f3E5056C6, %46; +fma.rn.f32 f337, f97, 0fBF6ACE5C, f333; +fma.rn.f32 f720, f96, 0fBF7AA541, 0f00000000; +fma.rn.f32 f338, f100, 0fBECBFB3A, f720; +fma.rn.f32 f719, f757, 0f3E5056C6, %47; +fma.rn.f32 f339, f755, 0fBF6ACE5C, f719; +fma.rn.f32 f718, f95, 0fBF7AA541, 0f00000000; +fma.rn.f32 f340, f99, 0fBECBFB3A, f718; +fma.rn.f32 f341, f101, 0fBF13A152, f337; +fma.rn.f32 f342, f104, 0f3F5124F0, f338; +fma.rn.f32 f343, f753, 0fBF13A152, f339; +fma.rn.f32 f344, f103, 0f3F5124F0, f340; +fma.rn.f32 f345, f105, 0f3F2EBBCE, f341; +fma.rn.f32 f346, f108, 0f3F3B1811, f342; +fma.rn.f32 f347, f750, 0f3F2EBBCE, f343; +fma.rn.f32 f348, f107, 0f3F3B1811, f344; +fma.rn.f32 f349, f109, 0f3F5ABB3B, f345; +fma.rn.f32 f350, f112, 0fBF050374, f346; +fma.rn.f32 f351, f748, 0f3F5ABB3B, f347; +fma.rn.f32 f352, f111, 0fBF050374, f348; +fma.rn.f32 f353, f113, 0fBEAB7557, f349; +fma.rn.f32 f354, f116, 0fBF713803, f350; +fma.rn.f32 
f355, f746, 0fBEAB7557, f351; +fma.rn.f32 f356, f115, 0fBF713803, f352; +fma.rn.f32 f357, f117, 0fBF7D9D98, f353; +fma.rn.f32 f358, f120, 0f3E0B6F45, f354; +fma.rn.f32 f359, f743, 0fBF7D9D98, f355; +fma.rn.f32 f360, f119, 0f3E0B6F45, f356; +fma.rn.f32 f361, f121, 0fBD8BC2AE, f357; +fma.rn.f32 f362, f124, 0f3F7F6738, f358; +fma.rn.f32 f363, f741, 0fBD8BC2AE, f359; +fma.rn.f32 f364, f123, 0f3F7F6738, f360; +fma.rn.f32 f365, f125, 0f3F7681BF, f361; +fma.rn.f32 f366, f128, 0f3E8A22CD, f362; +fma.rn.f32 f367, f739, 0f3F7681BF, f363; +fma.rn.f32 f368, f127, 0f3E8A22CD, f364; +fma.rn.f32 f369, f129, 0f3EEB8DA5, f365; +fma.rn.f32 f370, f132, 0fBF634C72, f366; +fma.rn.f32 f371, f736, 0f3EEB8DA5, f367; +fma.rn.f32 f372, f131, 0fBF634C72, f368; +fma.rn.f32 f373, f133, 0fBF469504, f369; +fma.rn.f32 f374, f136, 0fBF218EFB, f370; +fma.rn.f32 f375, f734, 0fBF469504, f371; +fma.rn.f32 f376, f135, 0fBF218EFB, f372; +fma.rn.f32 f377, f93, 0fBD8BC2AE, %46; +fma.rn.f32 f381, f97, 0fBF7D9D98, f377; +fma.rn.f32 f717, f96, 0fBF7F6738, 0f00000000; +fma.rn.f32 f382, f100, 0f3E0B6F45, f717; +fma.rn.f32 f716, f757, 0fBD8BC2AE, %47; +fma.rn.f32 f383, f755, 0fBF7D9D98, f716; +fma.rn.f32 f715, f95, 0fBF7F6738, 0f00000000; +fma.rn.f32 f384, f99, 0f3E0B6F45, f715; +fma.rn.f32 f385, f101, 0f3E5056C6, f381; +fma.rn.f32 f386, f104, 0f3F7AA541, f382; +fma.rn.f32 f387, f753, 0f3E5056C6, f383; +fma.rn.f32 f388, f103, 0f3F7AA541, f384; +fma.rn.f32 f389, f105, 0f3F7681BF, f385; +fma.rn.f32 f390, f108, 0fBE8A22CD, f386; +fma.rn.f32 f391, f750, 0f3F7681BF, f387; +fma.rn.f32 f392, f107, 0fBE8A22CD, f388; +fma.rn.f32 f393, f109, 0fBEAB7557, f389; +fma.rn.f32 f394, f112, 0fBF713803, f390; +fma.rn.f32 f395, f748, 0fBEAB7557, f391; +fma.rn.f32 f396, f111, 0fBF713803, f392; +fma.rn.f32 f397, f113, 0fBF6ACE5C, f393; +fma.rn.f32 f398, f116, 0f3ECBFB3A, f394; +fma.rn.f32 f399, f746, 0fBF6ACE5C, f395; +fma.rn.f32 f400, f115, 0f3ECBFB3A, f396; +fma.rn.f32 f401, f117, 0f3EEB8DA5, f397; +fma.rn.f32 f402, f120, 
0f3F634C72, f398; +fma.rn.f32 f403, f743, 0f3EEB8DA5, f399; +fma.rn.f32 f404, f119, 0f3F634C72, f400; +fma.rn.f32 f405, f121, 0f3F5ABB3B, f401; +fma.rn.f32 f406, f124, 0fBF050374, f402; +fma.rn.f32 f407, f741, 0f3F5ABB3B, f403; +fma.rn.f32 f408, f123, 0fBF050374, f404; +fma.rn.f32 f409, f125, 0fBF13A152, f405; +fma.rn.f32 f410, f128, 0fBF5124F0, f406; +fma.rn.f32 f411, f739, 0fBF13A152, f407; +fma.rn.f32 f412, f127, 0fBF5124F0, f408; +fma.rn.f32 f413, f129, 0fBF469504, f409; +fma.rn.f32 f414, f132, 0f3F218EFB, f410; +fma.rn.f32 f415, f736, 0fBF469504, f411; +fma.rn.f32 f416, f131, 0f3F218EFB, f412; +fma.rn.f32 f417, f133, 0f3F2EBBCE, f413; +fma.rn.f32 f418, f136, 0f3F3B1811, f414; +fma.rn.f32 f419, f734, 0f3F2EBBCE, f415; +fma.rn.f32 f420, f135, 0f3F3B1811, f416; +fma.rn.f32 f421, f93, 0fBEAB7557, %46; +fma.rn.f32 f425, f97, 0fBF469504, f421; +fma.rn.f32 f714, f96, 0fBF713803, 0f00000000; +fma.rn.f32 f426, f100, 0f3F218EFB, f714; +fma.rn.f32 f713, f757, 0fBEAB7557, %47; +fma.rn.f32 f427, f755, 0fBF469504, f713; +fma.rn.f32 f712, f95, 0fBF713803, 0f00000000; +fma.rn.f32 f428, f99, 0f3F218EFB, f712; +fma.rn.f32 f429, f101, 0f3F5ABB3B, f425; +fma.rn.f32 f430, f104, 0f3F050374, f426; +fma.rn.f32 f431, f753, 0f3F5ABB3B, f427; +fma.rn.f32 f432, f103, 0f3F050374, f428; +fma.rn.f32 f433, f105, 0f3E5056C6, f429; +fma.rn.f32 f434, f108, 0fBF7AA541, f430; +fma.rn.f32 f435, f750, 0f3E5056C6, f431; +fma.rn.f32 f436, f107, 0fBF7AA541, f432; +fma.rn.f32 f437, f109, 0fBF7D9D98, f433; +fma.rn.f32 f438, f112, 0f3E0B6F45, f434; +fma.rn.f32 f439, f748, 0fBF7D9D98, f435; +fma.rn.f32 f440, f111, 0f3E0B6F45, f436; +fma.rn.f32 f441, f113, 0f3EEB8DA5, f437; +fma.rn.f32 f442, f116, 0f3F634C72, f438; +fma.rn.f32 f443, f746, 0f3EEB8DA5, f439; +fma.rn.f32 f444, f115, 0f3F634C72, f440; +fma.rn.f32 f445, f117, 0f3F2EBBCE, f441; +fma.rn.f32 f446, f120, 0fBF3B1811, f442; +fma.rn.f32 f447, f743, 0f3F2EBBCE, f443; +fma.rn.f32 f448, f119, 0fBF3B1811, f444; +fma.rn.f32 f449, f121, 0fBF6ACE5C, f445; 
+fma.rn.f32 f450, f124, 0fBECBFB3A, f446; +fma.rn.f32 f451, f741, 0fBF6ACE5C, f447; +fma.rn.f32 f452, f123, 0fBECBFB3A, f448; +fma.rn.f32 f453, f125, 0fBD8BC2AE, f449; +fma.rn.f32 f454, f128, 0f3F7F6738, f450; +fma.rn.f32 f455, f739, 0fBD8BC2AE, f451; +fma.rn.f32 f456, f127, 0f3F7F6738, f452; +fma.rn.f32 f457, f129, 0f3F7681BF, f453; +fma.rn.f32 f458, f132, 0fBE8A22CD, f454; +fma.rn.f32 f459, f736, 0f3F7681BF, f455; +fma.rn.f32 f460, f131, 0fBE8A22CD, f456; +fma.rn.f32 f461, f133, 0fBF13A152, f457; +fma.rn.f32 f462, f136, 0fBF5124F0, f458; +fma.rn.f32 f463, f734, 0fBF13A152, f459; +fma.rn.f32 f464, f135, 0fBF5124F0, f460; +fma.rn.f32 f465, f93, 0fBF13A152, %46; +fma.rn.f32 f469, f97, 0fBEAB7557, f465; +fma.rn.f32 f711, f96, 0fBF5124F0, 0f00000000; +fma.rn.f32 f470, f100, 0f3F713803, f711; +fma.rn.f32 f710, f757, 0fBF13A152, %47; +fma.rn.f32 f471, f755, 0fBEAB7557, f710; +fma.rn.f32 f709, f95, 0fBF5124F0, 0f00000000; +fma.rn.f32 f472, f99, 0f3F713803, f709; +fma.rn.f32 f473, f101, 0f3F7681BF, f469; +fma.rn.f32 f474, f104, 0fBE8A22CD, f470; +fma.rn.f32 f475, f753, 0f3F7681BF, f471; +fma.rn.f32 f476, f103, 0fBE8A22CD, f472; +fma.rn.f32 f477, f105, 0fBF469504, f473; +fma.rn.f32 f478, f108, 0fBF218EFB, f474; +fma.rn.f32 f479, f750, 0fBF469504, f475; +fma.rn.f32 f480, f107, 0fBF218EFB, f476; +fma.rn.f32 f481, f109, 0fBD8BC2AE, f477; +fma.rn.f32 f482, f112, 0f3F7F6738, f478; +fma.rn.f32 f483, f748, 0fBD8BC2AE, f479; +fma.rn.f32 f484, f111, 0f3F7F6738, f480; +fma.rn.f32 f485, f113, 0f3F5ABB3B, f481; +fma.rn.f32 f486, f116, 0fBF050374, f482; +fma.rn.f32 f487, f746, 0f3F5ABB3B, f483; +fma.rn.f32 f488, f115, 0fBF050374, f484; +fma.rn.f32 f489, f117, 0fBF6ACE5C, f485; +fma.rn.f32 f490, f120, 0fBECBFB3A, f486; +fma.rn.f32 f491, f743, 0fBF6ACE5C, f487; +fma.rn.f32 f492, f119, 0fBECBFB3A, f488; +fma.rn.f32 f493, f121, 0f3E5056C6, f489; +fma.rn.f32 f494, f124, 0f3F7AA541, f490; +fma.rn.f32 f495, f741, 0f3E5056C6, f491; +fma.rn.f32 f496, f123, 0f3F7AA541, f492; +fma.rn.f32 f497, 
f125, 0f3F2EBBCE, f493; +fma.rn.f32 f498, f128, 0fBF3B1811, f494; +fma.rn.f32 f499, f739, 0f3F2EBBCE, f495; +fma.rn.f32 f500, f127, 0fBF3B1811, f496; +fma.rn.f32 f501, f129, 0fBF7D9D98, f497; +fma.rn.f32 f502, f132, 0fBE0B6F45, f498; +fma.rn.f32 f503, f736, 0fBF7D9D98, f499; +fma.rn.f32 f504, f131, 0fBE0B6F45, f500; +fma.rn.f32 f505, f133, 0f3EEB8DA5, f501; +fma.rn.f32 f506, f136, 0f3F634C72, f502; +fma.rn.f32 f507, f734, 0f3EEB8DA5, f503; +fma.rn.f32 f508, f135, 0f3F634C72, f504; +fma.rn.f32 f509, f93, 0fBF469504, %46; +fma.rn.f32 f513, f97, 0f3E5056C6, f509; +fma.rn.f32 f708, f96, 0fBF218EFB, 0f00000000; +fma.rn.f32 f514, f100, 0f3F7AA541, f708; +fma.rn.f32 f707, f757, 0fBF469504, %47; +fma.rn.f32 f515, f755, 0f3E5056C6, f707; +fma.rn.f32 f706, f95, 0fBF218EFB, 0f00000000; +fma.rn.f32 f516, f99, 0f3F7AA541, f706; +fma.rn.f32 f517, f101, 0f3EEB8DA5, f513; +fma.rn.f32 f518, f104, 0fBF634C72, f514; +fma.rn.f32 f519, f753, 0f3EEB8DA5, f515; +fma.rn.f32 f520, f103, 0fBF634C72, f516; +fma.rn.f32 f521, f105, 0fBF6ACE5C, f517; +fma.rn.f32 f522, f108, 0f3ECBFB3A, f518; +fma.rn.f32 f523, f750, 0fBF6ACE5C, f519; +fma.rn.f32 f524, f107, 0f3ECBFB3A, f520; +fma.rn.f32 f525, f109, 0f3F7681BF, f521; +fma.rn.f32 f526, f112, 0f3E8A22CD, f522; +fma.rn.f32 f527, f748, 0f3F7681BF, f523; +fma.rn.f32 f528, f111, 0f3E8A22CD, f524; +fma.rn.f32 f529, f113, 0fBF13A152, f525; +fma.rn.f32 f530, f116, 0fBF5124F0, f526; +fma.rn.f32 f531, f746, 0fBF13A152, f527; +fma.rn.f32 f532, f115, 0fBF5124F0, f528; +fma.rn.f32 f533, f117, 0fBD8BC2AE, f529; +fma.rn.f32 f534, f120, 0f3F7F6738, f530; +fma.rn.f32 f535, f743, 0fBD8BC2AE, f531; +fma.rn.f32 f536, f119, 0f3F7F6738, f532; +fma.rn.f32 f537, f121, 0f3F2EBBCE, f533; +fma.rn.f32 f538, f124, 0fBF3B1811, f534; +fma.rn.f32 f539, f741, 0f3F2EBBCE, f535; +fma.rn.f32 f540, f123, 0fBF3B1811, f536; +fma.rn.f32 f541, f125, 0fBF7D9D98, f537; +fma.rn.f32 f542, f128, 0f3E0B6F45, f538; +fma.rn.f32 f543, f739, 0fBF7D9D98, f539; +fma.rn.f32 f544, f127, 0f3E0B6F45, 
f540; +fma.rn.f32 f545, f129, 0f3F5ABB3B, f541; +fma.rn.f32 f546, f132, 0f3F050374, f542; +fma.rn.f32 f547, f736, 0f3F5ABB3B, f543; +fma.rn.f32 f548, f131, 0f3F050374, f544; +fma.rn.f32 f549, f133, 0fBEAB7557, f545; +fma.rn.f32 f550, f136, 0fBF713803, f546; +fma.rn.f32 f551, f734, 0fBEAB7557, f547; +fma.rn.f32 f552, f135, 0fBF713803, f548; +fma.rn.f32 f553, f93, 0fBF6ACE5C, %46; +fma.rn.f32 f557, f97, 0f3F2EBBCE, f553; +fma.rn.f32 f705, f96, 0fBECBFB3A, 0f00000000; +fma.rn.f32 f558, f100, 0f3F3B1811, f705; +fma.rn.f32 f704, f757, 0fBF6ACE5C, %47; +fma.rn.f32 f559, f755, 0f3F2EBBCE, f704; +fma.rn.f32 f703, f95, 0fBECBFB3A, 0f00000000; +fma.rn.f32 f560, f99, 0f3F3B1811, f703; +fma.rn.f32 f561, f101, 0fBEAB7557, f557; +fma.rn.f32 f562, f104, 0fBF713803, f558; +fma.rn.f32 f563, f753, 0fBEAB7557, f559; +fma.rn.f32 f564, f103, 0fBF713803, f560; +fma.rn.f32 f565, f105, 0fBD8BC2AE, f561; +fma.rn.f32 f566, f108, 0f3F7F6738, f562; +fma.rn.f32 f567, f750, 0fBD8BC2AE, f563; +fma.rn.f32 f568, f107, 0f3F7F6738, f564; +fma.rn.f32 f569, f109, 0f3EEB8DA5, f565; +fma.rn.f32 f570, f112, 0fBF634C72, f566; +fma.rn.f32 f571, f748, 0f3EEB8DA5, f567; +fma.rn.f32 f572, f111, 0fBF634C72, f568; +fma.rn.f32 f573, f113, 0fBF469504, f569; +fma.rn.f32 f574, f116, 0f3F218EFB, f570; +fma.rn.f32 f575, f746, 0fBF469504, f571; +fma.rn.f32 f576, f115, 0f3F218EFB, f572; +fma.rn.f32 f577, f117, 0f3F7681BF, f573; +fma.rn.f32 f578, f120, 0fBE8A22CD, f574; +fma.rn.f32 f579, f743, 0f3F7681BF, f575; +fma.rn.f32 f580, f119, 0fBE8A22CD, f576; +fma.rn.f32 f581, f121, 0fBF7D9D98, f577; +fma.rn.f32 f582, f124, 0fBE0B6F45, f578; +fma.rn.f32 f583, f741, 0fBF7D9D98, f579; +fma.rn.f32 f584, f123, 0fBE0B6F45, f580; +fma.rn.f32 f585, f125, 0f3F5ABB3B, f581; +fma.rn.f32 f586, f128, 0f3F050374, f582; +fma.rn.f32 f587, f739, 0f3F5ABB3B, f583; +fma.rn.f32 f588, f127, 0f3F050374, f584; +fma.rn.f32 f589, f129, 0fBF13A152, f585; +fma.rn.f32 f590, f132, 0fBF5124F0, f586; +fma.rn.f32 f591, f736, 0fBF13A152, f587; +fma.rn.f32 
f592, f131, 0fBF5124F0, f588; +fma.rn.f32 f593, f133, 0f3E5056C6, f589; +fma.rn.f32 f594, f136, 0f3F7AA541, f590; +fma.rn.f32 f595, f734, 0f3E5056C6, f591; +fma.rn.f32 f596, f135, 0f3F7AA541, f592; +fma.rn.f32 f597, f93, 0fBF7D9D98, %46; +fma.rn.f32 f598, f96, 0fBE0B6F45, 0f00000000; +fma.rn.f32 f599, f757, 0fBF7D9D98, %47; +fma.rn.f32 f600, f95, 0fBE0B6F45, 0f00000000; +fma.rn.f32 f601, f97, 0f3F7681BF, f597; +fma.rn.f32 f602, f100, 0f3E8A22CD, f598; +fma.rn.f32 f603, f755, 0f3F7681BF, f599; +fma.rn.f32 f604, f99, 0f3E8A22CD, f600; +fma.rn.f32 f605, f101, 0fBF6ACE5C, f601; +fma.rn.f32 f606, f104, 0fBECBFB3A, f602; +fma.rn.f32 f607, f753, 0fBF6ACE5C, f603; +fma.rn.f32 f608, f103, 0fBECBFB3A, f604; +fma.rn.f32 f609, f105, 0f3F5ABB3B, f605; +fma.rn.f32 f610, f108, 0f3F050374, f606; +fma.rn.f32 f611, f750, 0f3F5ABB3B, f607; +fma.rn.f32 f612, f107, 0f3F050374, f608; +fma.rn.f32 f613, f109, 0fBF469504, f609; +fma.rn.f32 f614, f112, 0fBF218EFB, f610; +fma.rn.f32 f615, f748, 0fBF469504, f611; +fma.rn.f32 f616, f111, 0fBF218EFB, f612; +fma.rn.f32 f617, f113, 0f3F2EBBCE, f613; +fma.rn.f32 f618, f116, 0f3F3B1811, f614; +fma.rn.f32 f619, f746, 0f3F2EBBCE, f615; +fma.rn.f32 f620, f115, 0f3F3B1811, f616; +fma.rn.f32 f621, f117, 0fBF13A152, f617; +fma.rn.f32 f622, f120, 0fBF5124F0, f618; +fma.rn.f32 f623, f743, 0fBF13A152, f619; +fma.rn.f32 f624, f119, 0fBF5124F0, f620; +fma.rn.f32 f625, f121, 0f3EEB8DA5, f621; +fma.rn.f32 f626, f124, 0f3F634C72, f622; +fma.rn.f32 f627, f741, 0f3EEB8DA5, f623; +fma.rn.f32 f628, f123, 0f3F634C72, f624; +fma.rn.f32 f629, f125, 0fBEAB7557, f625; +fma.rn.f32 f630, f128, 0fBF713803, f626; +fma.rn.f32 f631, f739, 0fBEAB7557, f627; +fma.rn.f32 f632, f127, 0fBF713803, f628; +fma.rn.f32 f633, f129, 0f3E5056C6, f629; +fma.rn.f32 f634, f132, 0f3F7AA541, f630; +fma.rn.f32 f635, f736, 0f3E5056C6, f631; +fma.rn.f32 f636, f131, 0f3F7AA541, f632; +fma.rn.f32 f637, f133, 0fBD8BC2AE, f633; +fma.rn.f32 f638, f136, 0fBF7F6738, f634; +fma.rn.f32 f639, f734, 
0fBD8BC2AE, f635; +fma.rn.f32 f640, f135, 0fBF7F6738, f636; +add.f32 %1, f156, f734; +add.f32 %0, f155, f133; +sub.f32 %2, f197, f198; +add.f32 %3, f199, f200; +sub.f32 %4, f241, f242; +add.f32 %5, f243, f244; +add.f32 %7, f287, f288; +sub.f32 %6, f285, f286; +add.f32 %9, f331, f332; +sub.f32 %8, f329, f330; +add.f32 %11, f375, f376; +sub.f32 %10, f373, f374; +sub.f32 %12, f417, f418; +add.f32 %13, f419, f420; +sub.f32 %14, f461, f462; +add.f32 %15, f463, f464; +sub.f32 %16, f505, f506; +add.f32 %17, f507, f508; +sub.f32 %18, f549, f550; +add.f32 %19, f551, f552; +add.f32 %21, f595, f596; +sub.f32 %20, f593, f594; +add.f32 %23, f639, f640; +sub.f32 %22, f637, f638; +sub.f32 %25, f639, f640; +add.f32 %24, f637, f638; +sub.f32 %27, f595, f596; +add.f32 %26, f593, f594; +sub.f32 %29, f551, f552; +add.f32 %28, f549, f550; +sub.f32 %31, f507, f508; +add.f32 %30, f505, f506; +sub.f32 %33, f463, f464; +add.f32 %32, f461, f462; +sub.f32 %35, f419, f420; +add.f32 %34, f417, f418; +sub.f32 %37, f375, f376; +add.f32 %36, f373, f374; +sub.f32 %39, f331, f332; +add.f32 %38, f329, f330; +sub.f32 %41, f287, f288; +add.f32 %40, f285, f286; +sub.f32 %43, f243, f244; +add.f32 %42, f241, f242; +sub.f32 %45, f199, f200; +add.f32 %44, f197, f198; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), 
"=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[2].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[19].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[14].y), "f"(rmem[10].y), "f"(rmem[13].y), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..a13dc737b5590 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp32_inv.hpp.inc @@ -0,0 +1,610 @@ +#ifndef CUFFTDX_FFT_23_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_23_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<211, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<758>; +.reg .b64 rd<4>; +add.f32 f93, %48, %91; +sub.f32 f95, %48, %91; +add.f32 f757, %50, %92; +sub.f32 f96, %50, %92; +add.f32 f97, %51, %89; +sub.f32 f99, %51, %89; +add.f32 
f755, %93, %90; +sub.f32 f100, %93, %90; +add.f32 f101, %53, %87; +sub.f32 f103, %53, %87; +add.f32 f753, %54, %94; +sub.f32 f104, %54, %94; +add.f32 f105, %55, %85; +sub.f32 f107, %55, %85; +add.f32 f750, %95, %96; +sub.f32 f108, %95, %96; +add.f32 f109, %57, %83; +sub.f32 f111, %57, %83; +add.f32 f748, %97, %84; +sub.f32 f112, %97, %84; +add.f32 f113, %59, %81; +sub.f32 f115, %59, %81; +add.f32 f746, %60, %98; +sub.f32 f116, %60, %98; +add.f32 f117, %61, %79; +sub.f32 f119, %61, %79; +add.f32 f743, %100, %99; +sub.f32 f120, %100, %99; +add.f32 f121, %63, %77; +sub.f32 f123, %63, %77; +add.f32 f741, %101, %78; +sub.f32 f124, %101, %78; +add.f32 f125, %65, %75; +sub.f32 f127, %65, %75; +add.f32 f739, %66, %102; +sub.f32 f128, %66, %102; +add.f32 f129, %67, %73; +sub.f32 f131, %67, %73; +add.f32 f736, %103, %104; +sub.f32 f132, %103, %104; +add.f32 f133, %69, %71; +sub.f32 f135, %69, %71; +add.f32 f734, %105, %72; +sub.f32 f136, %105, %72; +add.f32 f137, %46, f93; +add.f32 f139, f137, f97; +add.f32 f733, %47, f757; +add.f32 f140, f733, f755; +add.f32 f141, f139, f101; +add.f32 f142, f140, f753; +add.f32 f143, f141, f105; +add.f32 f144, f142, f750; +add.f32 f145, f143, f109; +add.f32 f146, f144, f748; +add.f32 f147, f145, f113; +add.f32 f148, f146, f746; +add.f32 f149, f147, f117; +add.f32 f150, f148, f743; +add.f32 f151, f149, f121; +add.f32 f152, f150, f741; +add.f32 f153, f151, f125; +add.f32 f154, f152, f739; +add.f32 f155, f153, f129; +add.f32 f156, f154, f736; +fma.rn.f32 f157, f93, 0f3F7681BF, %46; +fma.rn.f32 f161, f97, 0f3F5ABB3B, f157; +fma.rn.f32 f732, f96, 0f3E8A22CD, 0f00000000; +fma.rn.f32 f162, f100, 0f3F050374, f732; +fma.rn.f32 f731, f757, 0f3F7681BF, %47; +fma.rn.f32 f163, f755, 0f3F5ABB3B, f731; +fma.rn.f32 f730, f95, 0f3E8A22CD, 0f00000000; +fma.rn.f32 f164, f99, 0f3F050374, f730; +fma.rn.f32 f165, f101, 0f3F2EBBCE, f161; +fma.rn.f32 f166, f104, 0f3F3B1811, f162; +fma.rn.f32 f167, f753, 0f3F2EBBCE, f163; +fma.rn.f32 f168, f103, 0f3F3B1811, f164; 
+fma.rn.f32 f169, f105, 0f3EEB8DA5, f165; +fma.rn.f32 f170, f108, 0f3F634C72, f166; +fma.rn.f32 f171, f750, 0f3EEB8DA5, f167; +fma.rn.f32 f172, f107, 0f3F634C72, f168; +fma.rn.f32 f173, f109, 0f3E5056C6, f169; +fma.rn.f32 f174, f112, 0f3F7AA541, f170; +fma.rn.f32 f175, f748, 0f3E5056C6, f171; +fma.rn.f32 f176, f111, 0f3F7AA541, f172; +fma.rn.f32 f177, f113, 0fBD8BC2AE, f173; +fma.rn.f32 f178, f116, 0f3F7F6738, f174; +fma.rn.f32 f179, f746, 0fBD8BC2AE, f175; +fma.rn.f32 f180, f115, 0f3F7F6738, f176; +fma.rn.f32 f181, f117, 0fBEAB7557, f177; +fma.rn.f32 f182, f120, 0f3F713803, f178; +fma.rn.f32 f183, f743, 0fBEAB7557, f179; +fma.rn.f32 f184, f119, 0f3F713803, f180; +fma.rn.f32 f185, f121, 0fBF13A152, f181; +fma.rn.f32 f186, f124, 0f3F5124F0, f182; +fma.rn.f32 f187, f741, 0fBF13A152, f183; +fma.rn.f32 f188, f123, 0f3F5124F0, f184; +fma.rn.f32 f189, f125, 0fBF469504, f185; +fma.rn.f32 f190, f128, 0f3F218EFB, f186; +fma.rn.f32 f191, f739, 0fBF469504, f187; +fma.rn.f32 f192, f127, 0f3F218EFB, f188; +fma.rn.f32 f193, f129, 0fBF6ACE5C, f189; +fma.rn.f32 f194, f132, 0f3ECBFB3A, f190; +fma.rn.f32 f195, f736, 0fBF6ACE5C, f191; +fma.rn.f32 f196, f131, 0f3ECBFB3A, f192; +fma.rn.f32 f197, f133, 0fBF7D9D98, f193; +fma.rn.f32 f198, f136, 0f3E0B6F45, f194; +fma.rn.f32 f199, f734, 0fBF7D9D98, f195; +fma.rn.f32 f200, f135, 0f3E0B6F45, f196; +fma.rn.f32 f201, f93, 0f3F5ABB3B, %46; +fma.rn.f32 f205, f97, 0f3EEB8DA5, f201; +fma.rn.f32 f729, f96, 0f3F050374, 0f00000000; +fma.rn.f32 f206, f100, 0f3F634C72, f729; +fma.rn.f32 f728, f757, 0f3F5ABB3B, %47; +fma.rn.f32 f207, f755, 0f3EEB8DA5, f728; +fma.rn.f32 f727, f95, 0f3F050374, 0f00000000; +fma.rn.f32 f208, f99, 0f3F634C72, f727; +fma.rn.f32 f209, f101, 0fBD8BC2AE, f205; +fma.rn.f32 f210, f104, 0f3F7F6738, f206; +fma.rn.f32 f211, f753, 0fBD8BC2AE, f207; +fma.rn.f32 f212, f103, 0f3F7F6738, f208; +fma.rn.f32 f213, f105, 0fBF13A152, f209; +fma.rn.f32 f214, f108, 0f3F5124F0, f210; +fma.rn.f32 f215, f750, 0fBF13A152, f211; +fma.rn.f32 f216, 
f107, 0f3F5124F0, f212; +fma.rn.f32 f217, f109, 0fBF6ACE5C, f213; +fma.rn.f32 f218, f112, 0f3ECBFB3A, f214; +fma.rn.f32 f219, f748, 0fBF6ACE5C, f215; +fma.rn.f32 f220, f111, 0f3ECBFB3A, f216; +fma.rn.f32 f221, f113, 0fBF7D9D98, f217; +fma.rn.f32 f222, f116, 0fBE0B6F45, f218; +fma.rn.f32 f223, f746, 0fBF7D9D98, f219; +fma.rn.f32 f224, f115, 0fBE0B6F45, f220; +fma.rn.f32 f225, f117, 0fBF469504, f221; +fma.rn.f32 f226, f120, 0fBF218EFB, f222; +fma.rn.f32 f227, f743, 0fBF469504, f223; +fma.rn.f32 f228, f119, 0fBF218EFB, f224; +fma.rn.f32 f229, f121, 0fBEAB7557, f225; +fma.rn.f32 f230, f124, 0fBF713803, f226; +fma.rn.f32 f231, f741, 0fBEAB7557, f227; +fma.rn.f32 f232, f123, 0fBF713803, f228; +fma.rn.f32 f233, f125, 0f3E5056C6, f229; +fma.rn.f32 f234, f128, 0fBF7AA541, f230; +fma.rn.f32 f235, f739, 0f3E5056C6, f231; +fma.rn.f32 f236, f127, 0fBF7AA541, f232; +fma.rn.f32 f237, f129, 0f3F2EBBCE, f233; +fma.rn.f32 f238, f132, 0fBF3B1811, f234; +fma.rn.f32 f239, f736, 0f3F2EBBCE, f235; +fma.rn.f32 f240, f131, 0fBF3B1811, f236; +fma.rn.f32 f241, f133, 0f3F7681BF, f237; +fma.rn.f32 f242, f136, 0fBE8A22CD, f238; +fma.rn.f32 f243, f734, 0f3F7681BF, f239; +fma.rn.f32 f244, f135, 0fBE8A22CD, f240; +fma.rn.f32 f245, f93, 0f3F2EBBCE, %46; +fma.rn.f32 f249, f97, 0fBD8BC2AE, f245; +fma.rn.f32 f726, f96, 0f3F3B1811, 0f00000000; +fma.rn.f32 f250, f100, 0f3F7F6738, f726; +fma.rn.f32 f725, f757, 0f3F2EBBCE, %47; +fma.rn.f32 f251, f755, 0fBD8BC2AE, f725; +fma.rn.f32 f724, f95, 0f3F3B1811, 0f00000000; +fma.rn.f32 f252, f99, 0f3F7F6738, f724; +fma.rn.f32 f253, f101, 0fBF469504, f249; +fma.rn.f32 f254, f104, 0f3F218EFB, f250; +fma.rn.f32 f255, f753, 0fBF469504, f251; +fma.rn.f32 f256, f103, 0f3F218EFB, f252; +fma.rn.f32 f257, f105, 0fBF7D9D98, f253; +fma.rn.f32 f258, f108, 0fBE0B6F45, f254; +fma.rn.f32 f259, f750, 0fBF7D9D98, f255; +fma.rn.f32 f260, f107, 0fBE0B6F45, f256; +fma.rn.f32 f261, f109, 0fBF13A152, f257; +fma.rn.f32 f262, f112, 0fBF5124F0, f258; +fma.rn.f32 f263, f748, 0fBF13A152, 
f259; +fma.rn.f32 f264, f111, 0fBF5124F0, f260; +fma.rn.f32 f265, f113, 0f3E5056C6, f261; +fma.rn.f32 f266, f116, 0fBF7AA541, f262; +fma.rn.f32 f267, f746, 0f3E5056C6, f263; +fma.rn.f32 f268, f115, 0fBF7AA541, f264; +fma.rn.f32 f269, f117, 0f3F5ABB3B, f265; +fma.rn.f32 f270, f120, 0fBF050374, f266; +fma.rn.f32 f271, f743, 0f3F5ABB3B, f267; +fma.rn.f32 f272, f119, 0fBF050374, f268; +fma.rn.f32 f273, f121, 0f3F7681BF, f269; +fma.rn.f32 f274, f124, 0f3E8A22CD, f270; +fma.rn.f32 f275, f741, 0f3F7681BF, f271; +fma.rn.f32 f276, f123, 0f3E8A22CD, f272; +fma.rn.f32 f277, f125, 0f3EEB8DA5, f273; +fma.rn.f32 f278, f128, 0f3F634C72, f274; +fma.rn.f32 f279, f739, 0f3EEB8DA5, f275; +fma.rn.f32 f280, f127, 0f3F634C72, f276; +fma.rn.f32 f281, f129, 0fBEAB7557, f277; +fma.rn.f32 f282, f132, 0f3F713803, f278; +fma.rn.f32 f283, f736, 0fBEAB7557, f279; +fma.rn.f32 f284, f131, 0f3F713803, f280; +fma.rn.f32 f285, f133, 0fBF6ACE5C, f281; +fma.rn.f32 f286, f136, 0f3ECBFB3A, f282; +fma.rn.f32 f287, f734, 0fBF6ACE5C, f283; +fma.rn.f32 f288, f135, 0f3ECBFB3A, f284; +fma.rn.f32 f289, f93, 0f3EEB8DA5, %46; +fma.rn.f32 f293, f97, 0fBF13A152, f289; +fma.rn.f32 f723, f96, 0f3F634C72, 0f00000000; +fma.rn.f32 f294, f100, 0f3F5124F0, f723; +fma.rn.f32 f722, f757, 0f3EEB8DA5, %47; +fma.rn.f32 f295, f755, 0fBF13A152, f722; +fma.rn.f32 f721, f95, 0f3F634C72, 0f00000000; +fma.rn.f32 f296, f99, 0f3F5124F0, f721; +fma.rn.f32 f297, f101, 0fBF7D9D98, f293; +fma.rn.f32 f298, f104, 0fBE0B6F45, f294; +fma.rn.f32 f299, f753, 0fBF7D9D98, f295; +fma.rn.f32 f300, f103, 0fBE0B6F45, f296; +fma.rn.f32 f301, f105, 0fBEAB7557, f297; +fma.rn.f32 f302, f108, 0fBF713803, f298; +fma.rn.f32 f303, f750, 0fBEAB7557, f299; +fma.rn.f32 f304, f107, 0fBF713803, f300; +fma.rn.f32 f305, f109, 0f3F2EBBCE, f301; +fma.rn.f32 f306, f112, 0fBF3B1811, f302; +fma.rn.f32 f307, f748, 0f3F2EBBCE, f303; +fma.rn.f32 f308, f111, 0fBF3B1811, f304; +fma.rn.f32 f309, f113, 0f3F7681BF, f305; +fma.rn.f32 f310, f116, 0f3E8A22CD, f306; +fma.rn.f32 
f311, f746, 0f3F7681BF, f307; +fma.rn.f32 f312, f115, 0f3E8A22CD, f308; +fma.rn.f32 f313, f117, 0f3E5056C6, f309; +fma.rn.f32 f314, f120, 0f3F7AA541, f310; +fma.rn.f32 f315, f743, 0f3E5056C6, f311; +fma.rn.f32 f316, f119, 0f3F7AA541, f312; +fma.rn.f32 f317, f121, 0fBF469504, f313; +fma.rn.f32 f318, f124, 0f3F218EFB, f314; +fma.rn.f32 f319, f741, 0fBF469504, f315; +fma.rn.f32 f320, f123, 0f3F218EFB, f316; +fma.rn.f32 f321, f125, 0fBF6ACE5C, f317; +fma.rn.f32 f322, f128, 0fBECBFB3A, f318; +fma.rn.f32 f323, f739, 0fBF6ACE5C, f319; +fma.rn.f32 f324, f127, 0fBECBFB3A, f320; +fma.rn.f32 f325, f129, 0fBD8BC2AE, f321; +fma.rn.f32 f326, f132, 0fBF7F6738, f322; +fma.rn.f32 f327, f736, 0fBD8BC2AE, f323; +fma.rn.f32 f328, f131, 0fBF7F6738, f324; +fma.rn.f32 f329, f133, 0f3F5ABB3B, f325; +fma.rn.f32 f330, f136, 0fBF050374, f326; +fma.rn.f32 f331, f734, 0f3F5ABB3B, f327; +fma.rn.f32 f332, f135, 0fBF050374, f328; +fma.rn.f32 f333, f93, 0f3E5056C6, %46; +fma.rn.f32 f337, f97, 0fBF6ACE5C, f333; +fma.rn.f32 f720, f96, 0f3F7AA541, 0f00000000; +fma.rn.f32 f338, f100, 0f3ECBFB3A, f720; +fma.rn.f32 f719, f757, 0f3E5056C6, %47; +fma.rn.f32 f339, f755, 0fBF6ACE5C, f719; +fma.rn.f32 f718, f95, 0f3F7AA541, 0f00000000; +fma.rn.f32 f340, f99, 0f3ECBFB3A, f718; +fma.rn.f32 f341, f101, 0fBF13A152, f337; +fma.rn.f32 f342, f104, 0fBF5124F0, f338; +fma.rn.f32 f343, f753, 0fBF13A152, f339; +fma.rn.f32 f344, f103, 0fBF5124F0, f340; +fma.rn.f32 f345, f105, 0f3F2EBBCE, f341; +fma.rn.f32 f346, f108, 0fBF3B1811, f342; +fma.rn.f32 f347, f750, 0f3F2EBBCE, f343; +fma.rn.f32 f348, f107, 0fBF3B1811, f344; +fma.rn.f32 f349, f109, 0f3F5ABB3B, f345; +fma.rn.f32 f350, f112, 0f3F050374, f346; +fma.rn.f32 f351, f748, 0f3F5ABB3B, f347; +fma.rn.f32 f352, f111, 0f3F050374, f348; +fma.rn.f32 f353, f113, 0fBEAB7557, f349; +fma.rn.f32 f354, f116, 0f3F713803, f350; +fma.rn.f32 f355, f746, 0fBEAB7557, f351; +fma.rn.f32 f356, f115, 0f3F713803, f352; +fma.rn.f32 f357, f117, 0fBF7D9D98, f353; +fma.rn.f32 f358, f120, 
0fBE0B6F45, f354; +fma.rn.f32 f359, f743, 0fBF7D9D98, f355; +fma.rn.f32 f360, f119, 0fBE0B6F45, f356; +fma.rn.f32 f361, f121, 0fBD8BC2AE, f357; +fma.rn.f32 f362, f124, 0fBF7F6738, f358; +fma.rn.f32 f363, f741, 0fBD8BC2AE, f359; +fma.rn.f32 f364, f123, 0fBF7F6738, f360; +fma.rn.f32 f365, f125, 0f3F7681BF, f361; +fma.rn.f32 f366, f128, 0fBE8A22CD, f362; +fma.rn.f32 f367, f739, 0f3F7681BF, f363; +fma.rn.f32 f368, f127, 0fBE8A22CD, f364; +fma.rn.f32 f369, f129, 0f3EEB8DA5, f365; +fma.rn.f32 f370, f132, 0f3F634C72, f366; +fma.rn.f32 f371, f736, 0f3EEB8DA5, f367; +fma.rn.f32 f372, f131, 0f3F634C72, f368; +fma.rn.f32 f373, f133, 0fBF469504, f369; +fma.rn.f32 f374, f136, 0f3F218EFB, f370; +fma.rn.f32 f375, f734, 0fBF469504, f371; +fma.rn.f32 f376, f135, 0f3F218EFB, f372; +fma.rn.f32 f377, f93, 0fBD8BC2AE, %46; +fma.rn.f32 f381, f97, 0fBF7D9D98, f377; +fma.rn.f32 f717, f96, 0f3F7F6738, 0f00000000; +fma.rn.f32 f382, f100, 0fBE0B6F45, f717; +fma.rn.f32 f716, f757, 0fBD8BC2AE, %47; +fma.rn.f32 f383, f755, 0fBF7D9D98, f716; +fma.rn.f32 f715, f95, 0f3F7F6738, 0f00000000; +fma.rn.f32 f384, f99, 0fBE0B6F45, f715; +fma.rn.f32 f385, f101, 0f3E5056C6, f381; +fma.rn.f32 f386, f104, 0fBF7AA541, f382; +fma.rn.f32 f387, f753, 0f3E5056C6, f383; +fma.rn.f32 f388, f103, 0fBF7AA541, f384; +fma.rn.f32 f389, f105, 0f3F7681BF, f385; +fma.rn.f32 f390, f108, 0f3E8A22CD, f386; +fma.rn.f32 f391, f750, 0f3F7681BF, f387; +fma.rn.f32 f392, f107, 0f3E8A22CD, f388; +fma.rn.f32 f393, f109, 0fBEAB7557, f389; +fma.rn.f32 f394, f112, 0f3F713803, f390; +fma.rn.f32 f395, f748, 0fBEAB7557, f391; +fma.rn.f32 f396, f111, 0f3F713803, f392; +fma.rn.f32 f397, f113, 0fBF6ACE5C, f393; +fma.rn.f32 f398, f116, 0fBECBFB3A, f394; +fma.rn.f32 f399, f746, 0fBF6ACE5C, f395; +fma.rn.f32 f400, f115, 0fBECBFB3A, f396; +fma.rn.f32 f401, f117, 0f3EEB8DA5, f397; +fma.rn.f32 f402, f120, 0fBF634C72, f398; +fma.rn.f32 f403, f743, 0f3EEB8DA5, f399; +fma.rn.f32 f404, f119, 0fBF634C72, f400; +fma.rn.f32 f405, f121, 0f3F5ABB3B, f401; 
+fma.rn.f32 f406, f124, 0f3F050374, f402; +fma.rn.f32 f407, f741, 0f3F5ABB3B, f403; +fma.rn.f32 f408, f123, 0f3F050374, f404; +fma.rn.f32 f409, f125, 0fBF13A152, f405; +fma.rn.f32 f410, f128, 0f3F5124F0, f406; +fma.rn.f32 f411, f739, 0fBF13A152, f407; +fma.rn.f32 f412, f127, 0f3F5124F0, f408; +fma.rn.f32 f413, f129, 0fBF469504, f409; +fma.rn.f32 f414, f132, 0fBF218EFB, f410; +fma.rn.f32 f415, f736, 0fBF469504, f411; +fma.rn.f32 f416, f131, 0fBF218EFB, f412; +fma.rn.f32 f417, f133, 0f3F2EBBCE, f413; +fma.rn.f32 f418, f136, 0fBF3B1811, f414; +fma.rn.f32 f419, f734, 0f3F2EBBCE, f415; +fma.rn.f32 f420, f135, 0fBF3B1811, f416; +fma.rn.f32 f421, f93, 0fBEAB7557, %46; +fma.rn.f32 f425, f97, 0fBF469504, f421; +fma.rn.f32 f714, f96, 0f3F713803, 0f00000000; +fma.rn.f32 f426, f100, 0fBF218EFB, f714; +fma.rn.f32 f713, f757, 0fBEAB7557, %47; +fma.rn.f32 f427, f755, 0fBF469504, f713; +fma.rn.f32 f712, f95, 0f3F713803, 0f00000000; +fma.rn.f32 f428, f99, 0fBF218EFB, f712; +fma.rn.f32 f429, f101, 0f3F5ABB3B, f425; +fma.rn.f32 f430, f104, 0fBF050374, f426; +fma.rn.f32 f431, f753, 0f3F5ABB3B, f427; +fma.rn.f32 f432, f103, 0fBF050374, f428; +fma.rn.f32 f433, f105, 0f3E5056C6, f429; +fma.rn.f32 f434, f108, 0f3F7AA541, f430; +fma.rn.f32 f435, f750, 0f3E5056C6, f431; +fma.rn.f32 f436, f107, 0f3F7AA541, f432; +fma.rn.f32 f437, f109, 0fBF7D9D98, f433; +fma.rn.f32 f438, f112, 0fBE0B6F45, f434; +fma.rn.f32 f439, f748, 0fBF7D9D98, f435; +fma.rn.f32 f440, f111, 0fBE0B6F45, f436; +fma.rn.f32 f441, f113, 0f3EEB8DA5, f437; +fma.rn.f32 f442, f116, 0fBF634C72, f438; +fma.rn.f32 f443, f746, 0f3EEB8DA5, f439; +fma.rn.f32 f444, f115, 0fBF634C72, f440; +fma.rn.f32 f445, f117, 0f3F2EBBCE, f441; +fma.rn.f32 f446, f120, 0f3F3B1811, f442; +fma.rn.f32 f447, f743, 0f3F2EBBCE, f443; +fma.rn.f32 f448, f119, 0f3F3B1811, f444; +fma.rn.f32 f449, f121, 0fBF6ACE5C, f445; +fma.rn.f32 f450, f124, 0f3ECBFB3A, f446; +fma.rn.f32 f451, f741, 0fBF6ACE5C, f447; +fma.rn.f32 f452, f123, 0f3ECBFB3A, f448; +fma.rn.f32 f453, 
f125, 0fBD8BC2AE, f449; +fma.rn.f32 f454, f128, 0fBF7F6738, f450; +fma.rn.f32 f455, f739, 0fBD8BC2AE, f451; +fma.rn.f32 f456, f127, 0fBF7F6738, f452; +fma.rn.f32 f457, f129, 0f3F7681BF, f453; +fma.rn.f32 f458, f132, 0f3E8A22CD, f454; +fma.rn.f32 f459, f736, 0f3F7681BF, f455; +fma.rn.f32 f460, f131, 0f3E8A22CD, f456; +fma.rn.f32 f461, f133, 0fBF13A152, f457; +fma.rn.f32 f462, f136, 0f3F5124F0, f458; +fma.rn.f32 f463, f734, 0fBF13A152, f459; +fma.rn.f32 f464, f135, 0f3F5124F0, f460; +fma.rn.f32 f465, f93, 0fBF13A152, %46; +fma.rn.f32 f469, f97, 0fBEAB7557, f465; +fma.rn.f32 f711, f96, 0f3F5124F0, 0f00000000; +fma.rn.f32 f470, f100, 0fBF713803, f711; +fma.rn.f32 f710, f757, 0fBF13A152, %47; +fma.rn.f32 f471, f755, 0fBEAB7557, f710; +fma.rn.f32 f709, f95, 0f3F5124F0, 0f00000000; +fma.rn.f32 f472, f99, 0fBF713803, f709; +fma.rn.f32 f473, f101, 0f3F7681BF, f469; +fma.rn.f32 f474, f104, 0f3E8A22CD, f470; +fma.rn.f32 f475, f753, 0f3F7681BF, f471; +fma.rn.f32 f476, f103, 0f3E8A22CD, f472; +fma.rn.f32 f477, f105, 0fBF469504, f473; +fma.rn.f32 f478, f108, 0f3F218EFB, f474; +fma.rn.f32 f479, f750, 0fBF469504, f475; +fma.rn.f32 f480, f107, 0f3F218EFB, f476; +fma.rn.f32 f481, f109, 0fBD8BC2AE, f477; +fma.rn.f32 f482, f112, 0fBF7F6738, f478; +fma.rn.f32 f483, f748, 0fBD8BC2AE, f479; +fma.rn.f32 f484, f111, 0fBF7F6738, f480; +fma.rn.f32 f485, f113, 0f3F5ABB3B, f481; +fma.rn.f32 f486, f116, 0f3F050374, f482; +fma.rn.f32 f487, f746, 0f3F5ABB3B, f483; +fma.rn.f32 f488, f115, 0f3F050374, f484; +fma.rn.f32 f489, f117, 0fBF6ACE5C, f485; +fma.rn.f32 f490, f120, 0f3ECBFB3A, f486; +fma.rn.f32 f491, f743, 0fBF6ACE5C, f487; +fma.rn.f32 f492, f119, 0f3ECBFB3A, f488; +fma.rn.f32 f493, f121, 0f3E5056C6, f489; +fma.rn.f32 f494, f124, 0fBF7AA541, f490; +fma.rn.f32 f495, f741, 0f3E5056C6, f491; +fma.rn.f32 f496, f123, 0fBF7AA541, f492; +fma.rn.f32 f497, f125, 0f3F2EBBCE, f493; +fma.rn.f32 f498, f128, 0f3F3B1811, f494; +fma.rn.f32 f499, f739, 0f3F2EBBCE, f495; +fma.rn.f32 f500, f127, 0f3F3B1811, 
f496; +fma.rn.f32 f501, f129, 0fBF7D9D98, f497; +fma.rn.f32 f502, f132, 0f3E0B6F45, f498; +fma.rn.f32 f503, f736, 0fBF7D9D98, f499; +fma.rn.f32 f504, f131, 0f3E0B6F45, f500; +fma.rn.f32 f505, f133, 0f3EEB8DA5, f501; +fma.rn.f32 f506, f136, 0fBF634C72, f502; +fma.rn.f32 f507, f734, 0f3EEB8DA5, f503; +fma.rn.f32 f508, f135, 0fBF634C72, f504; +fma.rn.f32 f509, f93, 0fBF469504, %46; +fma.rn.f32 f513, f97, 0f3E5056C6, f509; +fma.rn.f32 f708, f96, 0f3F218EFB, 0f00000000; +fma.rn.f32 f514, f100, 0fBF7AA541, f708; +fma.rn.f32 f707, f757, 0fBF469504, %47; +fma.rn.f32 f515, f755, 0f3E5056C6, f707; +fma.rn.f32 f706, f95, 0f3F218EFB, 0f00000000; +fma.rn.f32 f516, f99, 0fBF7AA541, f706; +fma.rn.f32 f517, f101, 0f3EEB8DA5, f513; +fma.rn.f32 f518, f104, 0f3F634C72, f514; +fma.rn.f32 f519, f753, 0f3EEB8DA5, f515; +fma.rn.f32 f520, f103, 0f3F634C72, f516; +fma.rn.f32 f521, f105, 0fBF6ACE5C, f517; +fma.rn.f32 f522, f108, 0fBECBFB3A, f518; +fma.rn.f32 f523, f750, 0fBF6ACE5C, f519; +fma.rn.f32 f524, f107, 0fBECBFB3A, f520; +fma.rn.f32 f525, f109, 0f3F7681BF, f521; +fma.rn.f32 f526, f112, 0fBE8A22CD, f522; +fma.rn.f32 f527, f748, 0f3F7681BF, f523; +fma.rn.f32 f528, f111, 0fBE8A22CD, f524; +fma.rn.f32 f529, f113, 0fBF13A152, f525; +fma.rn.f32 f530, f116, 0f3F5124F0, f526; +fma.rn.f32 f531, f746, 0fBF13A152, f527; +fma.rn.f32 f532, f115, 0f3F5124F0, f528; +fma.rn.f32 f533, f117, 0fBD8BC2AE, f529; +fma.rn.f32 f534, f120, 0fBF7F6738, f530; +fma.rn.f32 f535, f743, 0fBD8BC2AE, f531; +fma.rn.f32 f536, f119, 0fBF7F6738, f532; +fma.rn.f32 f537, f121, 0f3F2EBBCE, f533; +fma.rn.f32 f538, f124, 0f3F3B1811, f534; +fma.rn.f32 f539, f741, 0f3F2EBBCE, f535; +fma.rn.f32 f540, f123, 0f3F3B1811, f536; +fma.rn.f32 f541, f125, 0fBF7D9D98, f537; +fma.rn.f32 f542, f128, 0fBE0B6F45, f538; +fma.rn.f32 f543, f739, 0fBF7D9D98, f539; +fma.rn.f32 f544, f127, 0fBE0B6F45, f540; +fma.rn.f32 f545, f129, 0f3F5ABB3B, f541; +fma.rn.f32 f546, f132, 0fBF050374, f542; +fma.rn.f32 f547, f736, 0f3F5ABB3B, f543; +fma.rn.f32 
f548, f131, 0fBF050374, f544; +fma.rn.f32 f549, f133, 0fBEAB7557, f545; +fma.rn.f32 f550, f136, 0f3F713803, f546; +fma.rn.f32 f551, f734, 0fBEAB7557, f547; +fma.rn.f32 f552, f135, 0f3F713803, f548; +fma.rn.f32 f553, f93, 0fBF6ACE5C, %46; +fma.rn.f32 f557, f97, 0f3F2EBBCE, f553; +fma.rn.f32 f705, f96, 0f3ECBFB3A, 0f00000000; +fma.rn.f32 f558, f100, 0fBF3B1811, f705; +fma.rn.f32 f704, f757, 0fBF6ACE5C, %47; +fma.rn.f32 f559, f755, 0f3F2EBBCE, f704; +fma.rn.f32 f703, f95, 0f3ECBFB3A, 0f00000000; +fma.rn.f32 f560, f99, 0fBF3B1811, f703; +fma.rn.f32 f561, f101, 0fBEAB7557, f557; +fma.rn.f32 f562, f104, 0f3F713803, f558; +fma.rn.f32 f563, f753, 0fBEAB7557, f559; +fma.rn.f32 f564, f103, 0f3F713803, f560; +fma.rn.f32 f565, f105, 0fBD8BC2AE, f561; +fma.rn.f32 f566, f108, 0fBF7F6738, f562; +fma.rn.f32 f567, f750, 0fBD8BC2AE, f563; +fma.rn.f32 f568, f107, 0fBF7F6738, f564; +fma.rn.f32 f569, f109, 0f3EEB8DA5, f565; +fma.rn.f32 f570, f112, 0f3F634C72, f566; +fma.rn.f32 f571, f748, 0f3EEB8DA5, f567; +fma.rn.f32 f572, f111, 0f3F634C72, f568; +fma.rn.f32 f573, f113, 0fBF469504, f569; +fma.rn.f32 f574, f116, 0fBF218EFB, f570; +fma.rn.f32 f575, f746, 0fBF469504, f571; +fma.rn.f32 f576, f115, 0fBF218EFB, f572; +fma.rn.f32 f577, f117, 0f3F7681BF, f573; +fma.rn.f32 f578, f120, 0f3E8A22CD, f574; +fma.rn.f32 f579, f743, 0f3F7681BF, f575; +fma.rn.f32 f580, f119, 0f3E8A22CD, f576; +fma.rn.f32 f581, f121, 0fBF7D9D98, f577; +fma.rn.f32 f582, f124, 0f3E0B6F45, f578; +fma.rn.f32 f583, f741, 0fBF7D9D98, f579; +fma.rn.f32 f584, f123, 0f3E0B6F45, f580; +fma.rn.f32 f585, f125, 0f3F5ABB3B, f581; +fma.rn.f32 f586, f128, 0fBF050374, f582; +fma.rn.f32 f587, f739, 0f3F5ABB3B, f583; +fma.rn.f32 f588, f127, 0fBF050374, f584; +fma.rn.f32 f589, f129, 0fBF13A152, f585; +fma.rn.f32 f590, f132, 0f3F5124F0, f586; +fma.rn.f32 f591, f736, 0fBF13A152, f587; +fma.rn.f32 f592, f131, 0f3F5124F0, f588; +fma.rn.f32 f593, f133, 0f3E5056C6, f589; +fma.rn.f32 f594, f136, 0fBF7AA541, f590; +fma.rn.f32 f595, f734, 
0f3E5056C6, f591; +fma.rn.f32 f596, f135, 0fBF7AA541, f592; +fma.rn.f32 f597, f93, 0fBF7D9D98, %46; +fma.rn.f32 f598, f96, 0f3E0B6F45, 0f00000000; +fma.rn.f32 f599, f757, 0fBF7D9D98, %47; +fma.rn.f32 f600, f95, 0f3E0B6F45, 0f00000000; +fma.rn.f32 f601, f97, 0f3F7681BF, f597; +fma.rn.f32 f602, f100, 0fBE8A22CD, f598; +fma.rn.f32 f603, f755, 0f3F7681BF, f599; +fma.rn.f32 f604, f99, 0fBE8A22CD, f600; +fma.rn.f32 f605, f101, 0fBF6ACE5C, f601; +fma.rn.f32 f606, f104, 0f3ECBFB3A, f602; +fma.rn.f32 f607, f753, 0fBF6ACE5C, f603; +fma.rn.f32 f608, f103, 0f3ECBFB3A, f604; +fma.rn.f32 f609, f105, 0f3F5ABB3B, f605; +fma.rn.f32 f610, f108, 0fBF050374, f606; +fma.rn.f32 f611, f750, 0f3F5ABB3B, f607; +fma.rn.f32 f612, f107, 0fBF050374, f608; +fma.rn.f32 f613, f109, 0fBF469504, f609; +fma.rn.f32 f614, f112, 0f3F218EFB, f610; +fma.rn.f32 f615, f748, 0fBF469504, f611; +fma.rn.f32 f616, f111, 0f3F218EFB, f612; +fma.rn.f32 f617, f113, 0f3F2EBBCE, f613; +fma.rn.f32 f618, f116, 0fBF3B1811, f614; +fma.rn.f32 f619, f746, 0f3F2EBBCE, f615; +fma.rn.f32 f620, f115, 0fBF3B1811, f616; +fma.rn.f32 f621, f117, 0fBF13A152, f617; +fma.rn.f32 f622, f120, 0f3F5124F0, f618; +fma.rn.f32 f623, f743, 0fBF13A152, f619; +fma.rn.f32 f624, f119, 0f3F5124F0, f620; +fma.rn.f32 f625, f121, 0f3EEB8DA5, f621; +fma.rn.f32 f626, f124, 0fBF634C72, f622; +fma.rn.f32 f627, f741, 0f3EEB8DA5, f623; +fma.rn.f32 f628, f123, 0fBF634C72, f624; +fma.rn.f32 f629, f125, 0fBEAB7557, f625; +fma.rn.f32 f630, f128, 0f3F713803, f626; +fma.rn.f32 f631, f739, 0fBEAB7557, f627; +fma.rn.f32 f632, f127, 0f3F713803, f628; +fma.rn.f32 f633, f129, 0f3E5056C6, f629; +fma.rn.f32 f634, f132, 0fBF7AA541, f630; +fma.rn.f32 f635, f736, 0f3E5056C6, f631; +fma.rn.f32 f636, f131, 0fBF7AA541, f632; +fma.rn.f32 f637, f133, 0fBD8BC2AE, f633; +fma.rn.f32 f638, f136, 0f3F7F6738, f634; +fma.rn.f32 f639, f734, 0fBD8BC2AE, f635; +fma.rn.f32 f640, f135, 0f3F7F6738, f636; +add.f32 %1, f156, f734; +add.f32 %0, f155, f133; +sub.f32 %2, f197, f198; +add.f32 
%3, f199, f200; +sub.f32 %4, f241, f242; +add.f32 %5, f243, f244; +add.f32 %7, f287, f288; +sub.f32 %6, f285, f286; +add.f32 %9, f331, f332; +sub.f32 %8, f329, f330; +add.f32 %11, f375, f376; +sub.f32 %10, f373, f374; +sub.f32 %12, f417, f418; +add.f32 %13, f419, f420; +sub.f32 %14, f461, f462; +add.f32 %15, f463, f464; +sub.f32 %16, f505, f506; +add.f32 %17, f507, f508; +sub.f32 %18, f549, f550; +add.f32 %19, f551, f552; +add.f32 %21, f595, f596; +sub.f32 %20, f593, f594; +add.f32 %23, f639, f640; +sub.f32 %22, f637, f638; +sub.f32 %25, f639, f640; +add.f32 %24, f637, f638; +sub.f32 %27, f595, f596; +add.f32 %26, f593, f594; +sub.f32 %29, f551, f552; +add.f32 %28, f549, f550; +sub.f32 %31, f507, f508; +add.f32 %30, f505, f506; +sub.f32 %33, f463, f464; +add.f32 %32, f461, f462; +sub.f32 %35, f419, f420; +add.f32 %34, f417, f418; +sub.f32 %37, f375, f376; +add.f32 %36, f373, f374; +sub.f32 %39, f331, f332; +add.f32 %38, f329, f330; +sub.f32 %41, f287, f288; +add.f32 %40, f285, f286; +sub.f32 %43, f243, f244; +add.f32 %42, f241, f242; +sub.f32 %45, f199, f200; +add.f32 %44, f197, f198; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), 
"f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[2].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[19].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[14].y), "f"(rmem[10].y), "f"(rmem[13].y), "f"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..079375c650215 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_fwd.hpp.inc @@ -0,0 +1,610 @@ +#ifndef CUFFTDX_FFT_23_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_23_FP64_FWD_PTX_HPP + + + /* Explicit specialization cufftdx_private_function<413, double, 1>: a single straight-line inline-PTX block (asm volatile) working in double precision (.f64 registers fd<758>). It reads the x and y members of rmem[0] through rmem[22] via the input operands (a few y members are listed twice, so operand numbers above %92 alias earlier inputs) and overwrites the same 46 members via the output operands %0 to %45. The smem parameter is not referenced by this body. Outputs %0 and %1 are the plain sums of all x and all y inputs. Every other output pair k and 23-k is produced by a final sub/add of the same two fma accumulation chains (for example %2 and %44 both come from fd197 and fd198). NOTE(review): per the include guard CUFFTDX_FFT_23_FP64_FWD_PTX_HPP this is presumably the forward 23-point kernel, and the 0d hex literals look like cosine and sine twiddle factors for multiples of 2*pi/23 - confirm against the cuFFTDx generator. Generated code: operand numbering and instruction order are tightly coupled, do not edit by hand. */+template<> __forceinline__ __device__ void cufftdx_private_function<413, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<758>; +.reg .b64 rd<4>; +add.f64 fd93, %48, %91; +sub.f64 fd95, %48, %91; +add.f64 fd757, %50, %92; +sub.f64 fd96, %50, %92; +add.f64 fd97, %51, %89; +sub.f64 fd99, %51, %89; +add.f64 fd755, %93, %90; +sub.f64 fd100, %93, %90; +add.f64 fd101, %53, %87; +sub.f64 fd103, %53, %87; +add.f64 fd753, %54, %94; +sub.f64 fd104, %54,
%94; +add.f64 fd105, %55, %85; +sub.f64 fd107, %55, %85; +add.f64 fd750, %95, %96; +sub.f64 fd108, %95, %96; +add.f64 fd109, %57, %83; +sub.f64 fd111, %57, %83; +add.f64 fd748, %97, %84; +sub.f64 fd112, %97, %84; +add.f64 fd113, %59, %81; +sub.f64 fd115, %59, %81; +add.f64 fd746, %60, %98; +sub.f64 fd116, %60, %98; +add.f64 fd117, %61, %79; +sub.f64 fd119, %61, %79; +add.f64 fd743, %100, %99; +sub.f64 fd120, %100, %99; +add.f64 fd121, %63, %77; +sub.f64 fd123, %63, %77; +add.f64 fd741, %101, %78; +sub.f64 fd124, %101, %78; +add.f64 fd125, %65, %75; +sub.f64 fd127, %65, %75; +add.f64 fd739, %66, %102; +sub.f64 fd128, %66, %102; +add.f64 fd129, %67, %73; +sub.f64 fd131, %67, %73; +add.f64 fd736, %103, %104; +sub.f64 fd132, %103, %104; +add.f64 fd133, %69, %71; +sub.f64 fd135, %69, %71; +add.f64 fd734, %105, %72; +sub.f64 fd136, %105, %72; +add.f64 fd137, %46, fd93; +add.f64 fd139, fd137, fd97; +add.f64 fd733, %47, fd757; +add.f64 fd140, fd733, fd755; +add.f64 fd141, fd139, fd101; +add.f64 fd142, fd140, fd753; +add.f64 fd143, fd141, fd105; +add.f64 fd144, fd142, fd750; +add.f64 fd145, fd143, fd109; +add.f64 fd146, fd144, fd748; +add.f64 fd147, fd145, fd113; +add.f64 fd148, fd146, fd746; +add.f64 fd149, fd147, fd117; +add.f64 fd150, fd148, fd743; +add.f64 fd151, fd149, fd121; +add.f64 fd152, fd150, fd741; +add.f64 fd153, fd151, fd125; +add.f64 fd154, fd152, fd739; +add.f64 fd155, fd153, fd129; +add.f64 fd156, fd154, fd736; +fma.rn.f64 fd157, fd93, 0d3FEED037EA3D2DBB, %46; +fma.rn.f64 fd161, fd97, 0d3FEB57675CF309EE, fd157; +fma.rn.f64 fd732, fd96, 0dBFD14459AD2BE466, 0d0000000000000000; +fma.rn.f64 fd162, fd100, 0dBFE0A06E851DB7CA, fd732; +fma.rn.f64 fd731, fd757, 0d3FEED037EA3D2DBB, %47; +fma.rn.f64 fd163, fd755, 0d3FEB57675CF309EE, fd731; +fma.rn.f64 fd730, fd95, 0dBFD14459AD2BE466, 0d0000000000000000; +fma.rn.f64 fd164, fd99, 0dBFE0A06E851DB7CA, fd730; +fma.rn.f64 fd165, fd101, 0d3FE5D779B07CFEF7, fd161; +fma.rn.f64 fd166, fd104, 0dBFE763021AAA15DA, fd162;
+fma.rn.f64 fd167, fd753, 0d3FE5D779B07CFEF7, fd163; +fma.rn.f64 fd168, fd103, 0dBFE763021AAA15DA, fd164; +fma.rn.f64 fd169, fd105, 0d3FDD71B4A0C5A6C8, fd165; +fma.rn.f64 fd170, fd108, 0dBFEC698E42F47B09, fd166; +fma.rn.f64 fd171, fd750, 0d3FDD71B4A0C5A6C8, fd167; +fma.rn.f64 fd172, fd107, 0dBFEC698E42F47B09, fd168; +fma.rn.f64 fd173, fd109, 0d3FCA0AD8BD1E2882, fd169; +fma.rn.f64 fd174, fd112, 0dBFEF54A827142577, fd170; +fma.rn.f64 fd175, fd748, 0d3FCA0AD8BD1E2882, fd171; +fma.rn.f64 fd176, fd111, 0dBFEF54A827142577, fd172; +fma.rn.f64 fd177, fd113, 0dBFB17855B599F3B9, fd173; +fma.rn.f64 fd178, fd116, 0dBFEFECE70DFD3EFB, fd174; +fma.rn.f64 fd179, fd746, 0dBFB17855B599F3B9, fd175; +fma.rn.f64 fd180, fd115, 0dBFEFECE70DFD3EFB, fd176; +fma.rn.f64 fd181, fd117, 0dBFD56EAAE597C776, fd177; +fma.rn.f64 fd182, fd120, 0dBFEE270060999288, fd178; +fma.rn.f64 fd183, fd743, 0dBFD56EAAE597C776, fd179; +fma.rn.f64 fd184, fd119, 0dBFEE270060999288, fd180; +fma.rn.f64 fd185, fd121, 0dBFE2742A4A775CFB, fd181; +fma.rn.f64 fd186, fd124, 0dBFEA249E0B897CA9, fd182; +fma.rn.f64 fd187, fd741, 0dBFE2742A4A775CFB, fd183; +fma.rn.f64 fd188, fd123, 0dBFEA249E0B897CA9, fd184; +fma.rn.f64 fd189, fd125, 0dBFE8D2A07C16D46F, fd185; +fma.rn.f64 fd190, fd128, 0dBFE431DF5838F7EF, fd186; +fma.rn.f64 fd191, fd739, 0dBFE8D2A07C16D46F, fd187; +fma.rn.f64 fd192, fd127, 0dBFE431DF5838F7EF, fd188; +fma.rn.f64 fd193, fd129, 0dBFED59CB83EF99BC, fd189; +fma.rn.f64 fd194, fd132, 0dBFD97F6748E524B2, fd190; +fma.rn.f64 fd195, fd736, 0dBFED59CB83EF99BC, fd191; +fma.rn.f64 fd196, fd131, 0dBFD97F6748E524B2, fd192; +fma.rn.f64 fd197, fd133, 0dBFEFB3B3035AA6CD, fd193; +fma.rn.f64 fd198, fd136, 0dBFC16DE8A4564F0A, fd194; +fma.rn.f64 fd199, fd734, 0dBFEFB3B3035AA6CD, fd195; +fma.rn.f64 fd200, fd135, 0dBFC16DE8A4564F0A, fd196; +fma.rn.f64 fd201, fd93, 0d3FEB57675CF309EE, %46; +fma.rn.f64 fd205, fd97, 0d3FDD71B4A0C5A6C8, fd201; +fma.rn.f64 fd729, fd96, 0dBFE0A06E851DB7CA, 0d0000000000000000; +fma.rn.f64 fd206, fd100,
0dBFEC698E42F47B09, fd729; +fma.rn.f64 fd728, fd757, 0d3FEB57675CF309EE, %47; +fma.rn.f64 fd207, fd755, 0d3FDD71B4A0C5A6C8, fd728; +fma.rn.f64 fd727, fd95, 0dBFE0A06E851DB7CA, 0d0000000000000000; +fma.rn.f64 fd208, fd99, 0dBFEC698E42F47B09, fd727; +fma.rn.f64 fd209, fd101, 0dBFB17855B599F3B9, fd205; +fma.rn.f64 fd210, fd104, 0dBFEFECE70DFD3EFB, fd206; +fma.rn.f64 fd211, fd753, 0dBFB17855B599F3B9, fd207; +fma.rn.f64 fd212, fd103, 0dBFEFECE70DFD3EFB, fd208; +fma.rn.f64 fd213, fd105, 0dBFE2742A4A775CFB, fd209; +fma.rn.f64 fd214, fd108, 0dBFEA249E0B897CA9, fd210; +fma.rn.f64 fd215, fd750, 0dBFE2742A4A775CFB, fd211; +fma.rn.f64 fd216, fd107, 0dBFEA249E0B897CA9, fd212; +fma.rn.f64 fd217, fd109, 0dBFED59CB83EF99BC, fd213; +fma.rn.f64 fd218, fd112, 0dBFD97F6748E524B2, fd214; +fma.rn.f64 fd219, fd748, 0dBFED59CB83EF99BC, fd215; +fma.rn.f64 fd220, fd111, 0dBFD97F6748E524B2, fd216; +fma.rn.f64 fd221, fd113, 0dBFEFB3B3035AA6CD, fd217; +fma.rn.f64 fd222, fd116, 0d3FC16DE8A4564F0A, fd218; +fma.rn.f64 fd223, fd746, 0dBFEFB3B3035AA6CD, fd219; +fma.rn.f64 fd224, fd115, 0d3FC16DE8A4564F0A, fd220; +fma.rn.f64 fd225, fd117, 0dBFE8D2A07C16D46F, fd221; +fma.rn.f64 fd226, fd120, 0d3FE431DF5838F7EF, fd222; +fma.rn.f64 fd227, fd743, 0dBFE8D2A07C16D46F, fd223; +fma.rn.f64 fd228, fd119, 0d3FE431DF5838F7EF, fd224; +fma.rn.f64 fd229, fd121, 0dBFD56EAAE597C776, fd225; +fma.rn.f64 fd230, fd124, 0d3FEE270060999288, fd226; +fma.rn.f64 fd231, fd741, 0dBFD56EAAE597C776, fd227; +fma.rn.f64 fd232, fd123, 0d3FEE270060999288, fd228; +fma.rn.f64 fd233, fd125, 0d3FCA0AD8BD1E2882, fd229; +fma.rn.f64 fd234, fd128, 0d3FEF54A827142577, fd230; +fma.rn.f64 fd235, fd739, 0d3FCA0AD8BD1E2882, fd231; +fma.rn.f64 fd236, fd127, 0d3FEF54A827142577, fd232; +fma.rn.f64 fd237, fd129, 0d3FE5D779B07CFEF7, fd233; +fma.rn.f64 fd238, fd132, 0d3FE763021AAA15DA, fd234; +fma.rn.f64 fd239, fd736, 0d3FE5D779B07CFEF7, fd235; +fma.rn.f64 fd240, fd131, 0d3FE763021AAA15DA, fd236; +fma.rn.f64 fd241, fd133, 0d3FEED037EA3D2DBB, fd237;
+fma.rn.f64 fd242, fd136, 0d3FD14459AD2BE466, fd238; +fma.rn.f64 fd243, fd734, 0d3FEED037EA3D2DBB, fd239; +fma.rn.f64 fd244, fd135, 0d3FD14459AD2BE466, fd240; +fma.rn.f64 fd245, fd93, 0d3FE5D779B07CFEF7, %46; +fma.rn.f64 fd249, fd97, 0dBFB17855B599F3B9, fd245; +fma.rn.f64 fd726, fd96, 0dBFE763021AAA15DA, 0d0000000000000000; +fma.rn.f64 fd250, fd100, 0dBFEFECE70DFD3EFB, fd726; +fma.rn.f64 fd725, fd757, 0d3FE5D779B07CFEF7, %47; +fma.rn.f64 fd251, fd755, 0dBFB17855B599F3B9, fd725; +fma.rn.f64 fd724, fd95, 0dBFE763021AAA15DA, 0d0000000000000000; +fma.rn.f64 fd252, fd99, 0dBFEFECE70DFD3EFB, fd724; +fma.rn.f64 fd253, fd101, 0dBFE8D2A07C16D46F, fd249; +fma.rn.f64 fd254, fd104, 0dBFE431DF5838F7EF, fd250; +fma.rn.f64 fd255, fd753, 0dBFE8D2A07C16D46F, fd251; +fma.rn.f64 fd256, fd103, 0dBFE431DF5838F7EF, fd252; +fma.rn.f64 fd257, fd105, 0dBFEFB3B3035AA6CD, fd253; +fma.rn.f64 fd258, fd108, 0d3FC16DE8A4564F0A, fd254; +fma.rn.f64 fd259, fd750, 0dBFEFB3B3035AA6CD, fd255; +fma.rn.f64 fd260, fd107, 0d3FC16DE8A4564F0A, fd256; +fma.rn.f64 fd261, fd109, 0dBFE2742A4A775CFB, fd257; +fma.rn.f64 fd262, fd112, 0d3FEA249E0B897CA9, fd258; +fma.rn.f64 fd263, fd748, 0dBFE2742A4A775CFB, fd259; +fma.rn.f64 fd264, fd111, 0d3FEA249E0B897CA9, fd260; +fma.rn.f64 fd265, fd113, 0d3FCA0AD8BD1E2882, fd261; +fma.rn.f64 fd266, fd116, 0d3FEF54A827142577, fd262; +fma.rn.f64 fd267, fd746, 0d3FCA0AD8BD1E2882, fd263; +fma.rn.f64 fd268, fd115, 0d3FEF54A827142577, fd264; +fma.rn.f64 fd269, fd117, 0d3FEB57675CF309EE, fd265; +fma.rn.f64 fd270, fd120, 0d3FE0A06E851DB7CA, fd266; +fma.rn.f64 fd271, fd743, 0d3FEB57675CF309EE, fd267; +fma.rn.f64 fd272, fd119, 0d3FE0A06E851DB7CA, fd268; +fma.rn.f64 fd273, fd121, 0d3FEED037EA3D2DBB, fd269; +fma.rn.f64 fd274, fd124, 0dBFD14459AD2BE466, fd270; +fma.rn.f64 fd275, fd741, 0d3FEED037EA3D2DBB, fd271; +fma.rn.f64 fd276, fd123, 0dBFD14459AD2BE466, fd272; +fma.rn.f64 fd277, fd125, 0d3FDD71B4A0C5A6C8, fd273; +fma.rn.f64 fd278, fd128, 0dBFEC698E42F47B09, fd274; +fma.rn.f64 fd279,
fd739, 0d3FDD71B4A0C5A6C8, fd275; +fma.rn.f64 fd280, fd127, 0dBFEC698E42F47B09, fd276; +fma.rn.f64 fd281, fd129, 0dBFD56EAAE597C776, fd277; +fma.rn.f64 fd282, fd132, 0dBFEE270060999288, fd278; +fma.rn.f64 fd283, fd736, 0dBFD56EAAE597C776, fd279; +fma.rn.f64 fd284, fd131, 0dBFEE270060999288, fd280; +fma.rn.f64 fd285, fd133, 0dBFED59CB83EF99BC, fd281; +fma.rn.f64 fd286, fd136, 0dBFD97F6748E524B2, fd282; +fma.rn.f64 fd287, fd734, 0dBFED59CB83EF99BC, fd283; +fma.rn.f64 fd288, fd135, 0dBFD97F6748E524B2, fd284; +fma.rn.f64 fd289, fd93, 0d3FDD71B4A0C5A6C8, %46; +fma.rn.f64 fd293, fd97, 0dBFE2742A4A775CFB, fd289; +fma.rn.f64 fd723, fd96, 0dBFEC698E42F47B09, 0d0000000000000000; +fma.rn.f64 fd294, fd100, 0dBFEA249E0B897CA9, fd723; +fma.rn.f64 fd722, fd757, 0d3FDD71B4A0C5A6C8, %47; +fma.rn.f64 fd295, fd755, 0dBFE2742A4A775CFB, fd722; +fma.rn.f64 fd721, fd95, 0dBFEC698E42F47B09, 0d0000000000000000; +fma.rn.f64 fd296, fd99, 0dBFEA249E0B897CA9, fd721; +fma.rn.f64 fd297, fd101, 0dBFEFB3B3035AA6CD, fd293; +fma.rn.f64 fd298, fd104, 0d3FC16DE8A4564F0A, fd294; +fma.rn.f64 fd299, fd753, 0dBFEFB3B3035AA6CD, fd295; +fma.rn.f64 fd300, fd103, 0d3FC16DE8A4564F0A, fd296; +fma.rn.f64 fd301, fd105, 0dBFD56EAAE597C776, fd297; +fma.rn.f64 fd302, fd108, 0d3FEE270060999288, fd298; +fma.rn.f64 fd303, fd750, 0dBFD56EAAE597C776, fd299; +fma.rn.f64 fd304, fd107, 0d3FEE270060999288, fd300; +fma.rn.f64 fd305, fd109, 0d3FE5D779B07CFEF7, fd301; +fma.rn.f64 fd306, fd112, 0d3FE763021AAA15DA, fd302; +fma.rn.f64 fd307, fd748, 0d3FE5D779B07CFEF7, fd303; +fma.rn.f64 fd308, fd111, 0d3FE763021AAA15DA, fd304; +fma.rn.f64 fd309, fd113, 0d3FEED037EA3D2DBB, fd305; +fma.rn.f64 fd310, fd116, 0dBFD14459AD2BE466, fd306; +fma.rn.f64 fd311, fd746, 0d3FEED037EA3D2DBB, fd307; +fma.rn.f64 fd312, fd115, 0dBFD14459AD2BE466, fd308; +fma.rn.f64 fd313, fd117, 0d3FCA0AD8BD1E2882, fd309; +fma.rn.f64 fd314, fd120, 0dBFEF54A827142577, fd310; +fma.rn.f64 fd315, fd743, 0d3FCA0AD8BD1E2882, fd311; +fma.rn.f64 fd316, fd119,
0dBFEF54A827142577, fd312; +fma.rn.f64 fd317, fd121, 0dBFE8D2A07C16D46F, fd313; +fma.rn.f64 fd318, fd124, 0dBFE431DF5838F7EF, fd314; +fma.rn.f64 fd319, fd741, 0dBFE8D2A07C16D46F, fd315; +fma.rn.f64 fd320, fd123, 0dBFE431DF5838F7EF, fd316; +fma.rn.f64 fd321, fd125, 0dBFED59CB83EF99BC, fd317; +fma.rn.f64 fd322, fd128, 0d3FD97F6748E524B2, fd318; +fma.rn.f64 fd323, fd739, 0dBFED59CB83EF99BC, fd319; +fma.rn.f64 fd324, fd127, 0d3FD97F6748E524B2, fd320; +fma.rn.f64 fd325, fd129, 0dBFB17855B599F3B9, fd321; +fma.rn.f64 fd326, fd132, 0d3FEFECE70DFD3EFB, fd322; +fma.rn.f64 fd327, fd736, 0dBFB17855B599F3B9, fd323; +fma.rn.f64 fd328, fd131, 0d3FEFECE70DFD3EFB, fd324; +fma.rn.f64 fd329, fd133, 0d3FEB57675CF309EE, fd325; +fma.rn.f64 fd330, fd136, 0d3FE0A06E851DB7CA, fd326; +fma.rn.f64 fd331, fd734, 0d3FEB57675CF309EE, fd327; +fma.rn.f64 fd332, fd135, 0d3FE0A06E851DB7CA, fd328; +fma.rn.f64 fd333, fd93, 0d3FCA0AD8BD1E2882, %46; +fma.rn.f64 fd337, fd97, 0dBFED59CB83EF99BC, fd333; +fma.rn.f64 fd720, fd96, 0dBFEF54A827142577, 0d0000000000000000; +fma.rn.f64 fd338, fd100, 0dBFD97F6748E524B2, fd720; +fma.rn.f64 fd719, fd757, 0d3FCA0AD8BD1E2882, %47; +fma.rn.f64 fd339, fd755, 0dBFED59CB83EF99BC, fd719; +fma.rn.f64 fd718, fd95, 0dBFEF54A827142577, 0d0000000000000000; +fma.rn.f64 fd340, fd99, 0dBFD97F6748E524B2, fd718; +fma.rn.f64 fd341, fd101, 0dBFE2742A4A775CFB, fd337; +fma.rn.f64 fd342, fd104, 0d3FEA249E0B897CA9, fd338; +fma.rn.f64 fd343, fd753, 0dBFE2742A4A775CFB, fd339; +fma.rn.f64 fd344, fd103, 0d3FEA249E0B897CA9, fd340; +fma.rn.f64 fd345, fd105, 0d3FE5D779B07CFEF7, fd341; +fma.rn.f64 fd346, fd108, 0d3FE763021AAA15DA, fd342; +fma.rn.f64 fd347, fd750, 0d3FE5D779B07CFEF7, fd343; +fma.rn.f64 fd348, fd107, 0d3FE763021AAA15DA, fd344; +fma.rn.f64 fd349, fd109, 0d3FEB57675CF309EE, fd345; +fma.rn.f64 fd350, fd112, 0dBFE0A06E851DB7CA, fd346; +fma.rn.f64 fd351, fd748, 0d3FEB57675CF309EE, fd347; +fma.rn.f64 fd352, fd111, 0dBFE0A06E851DB7CA, fd348; +fma.rn.f64 fd353, fd113, 0dBFD56EAAE597C776,
fd349; +fma.rn.f64 fd354, fd116, 0dBFEE270060999288, fd350; +fma.rn.f64 fd355, fd746, 0dBFD56EAAE597C776, fd351; +fma.rn.f64 fd356, fd115, 0dBFEE270060999288, fd352; +fma.rn.f64 fd357, fd117, 0dBFEFB3B3035AA6CD, fd353; +fma.rn.f64 fd358, fd120, 0d3FC16DE8A4564F0A, fd354; +fma.rn.f64 fd359, fd743, 0dBFEFB3B3035AA6CD, fd355; +fma.rn.f64 fd360, fd119, 0d3FC16DE8A4564F0A, fd356; +fma.rn.f64 fd361, fd121, 0dBFB17855B599F3B9, fd357; +fma.rn.f64 fd362, fd124, 0d3FEFECE70DFD3EFB, fd358; +fma.rn.f64 fd363, fd741, 0dBFB17855B599F3B9, fd359; +fma.rn.f64 fd364, fd123, 0d3FEFECE70DFD3EFB, fd360; +fma.rn.f64 fd365, fd125, 0d3FEED037EA3D2DBB, fd361; +fma.rn.f64 fd366, fd128, 0d3FD14459AD2BE466, fd362; +fma.rn.f64 fd367, fd739, 0d3FEED037EA3D2DBB, fd363; +fma.rn.f64 fd368, fd127, 0d3FD14459AD2BE466, fd364; +fma.rn.f64 fd369, fd129, 0d3FDD71B4A0C5A6C8, fd365; +fma.rn.f64 fd370, fd132, 0dBFEC698E42F47B09, fd366; +fma.rn.f64 fd371, fd736, 0d3FDD71B4A0C5A6C8, fd367; +fma.rn.f64 fd372, fd131, 0dBFEC698E42F47B09, fd368; +fma.rn.f64 fd373, fd133, 0dBFE8D2A07C16D46F, fd369; +fma.rn.f64 fd374, fd136, 0dBFE431DF5838F7EF, fd370; +fma.rn.f64 fd375, fd734, 0dBFE8D2A07C16D46F, fd371; +fma.rn.f64 fd376, fd135, 0dBFE431DF5838F7EF, fd372; +fma.rn.f64 fd377, fd93, 0dBFB17855B599F3B9, %46; +fma.rn.f64 fd381, fd97, 0dBFEFB3B3035AA6CD, fd377; +fma.rn.f64 fd717, fd96, 0dBFEFECE70DFD3EFB, 0d0000000000000000; +fma.rn.f64 fd382, fd100, 0d3FC16DE8A4564F0A, fd717; +fma.rn.f64 fd716, fd757, 0dBFB17855B599F3B9, %47; +fma.rn.f64 fd383, fd755, 0dBFEFB3B3035AA6CD, fd716; +fma.rn.f64 fd715, fd95, 0dBFEFECE70DFD3EFB, 0d0000000000000000; +fma.rn.f64 fd384, fd99, 0d3FC16DE8A4564F0A, fd715; +fma.rn.f64 fd385, fd101, 0d3FCA0AD8BD1E2882, fd381; +fma.rn.f64 fd386, fd104, 0d3FEF54A827142577, fd382; +fma.rn.f64 fd387, fd753, 0d3FCA0AD8BD1E2882, fd383; +fma.rn.f64 fd388, fd103, 0d3FEF54A827142577, fd384; +fma.rn.f64 fd389, fd105, 0d3FEED037EA3D2DBB, fd385; +fma.rn.f64 fd390, fd108, 0dBFD14459AD2BE466, fd386; +fma.rn.f64
fd391, fd750, 0d3FEED037EA3D2DBB, fd387; +fma.rn.f64 fd392, fd107, 0dBFD14459AD2BE466, fd388; +fma.rn.f64 fd393, fd109, 0dBFD56EAAE597C776, fd389; +fma.rn.f64 fd394, fd112, 0dBFEE270060999288, fd390; +fma.rn.f64 fd395, fd748, 0dBFD56EAAE597C776, fd391; +fma.rn.f64 fd396, fd111, 0dBFEE270060999288, fd392; +fma.rn.f64 fd397, fd113, 0dBFED59CB83EF99BC, fd393; +fma.rn.f64 fd398, fd116, 0d3FD97F6748E524B2, fd394; +fma.rn.f64 fd399, fd746, 0dBFED59CB83EF99BC, fd395; +fma.rn.f64 fd400, fd115, 0d3FD97F6748E524B2, fd396; +fma.rn.f64 fd401, fd117, 0d3FDD71B4A0C5A6C8, fd397; +fma.rn.f64 fd402, fd120, 0d3FEC698E42F47B09, fd398; +fma.rn.f64 fd403, fd743, 0d3FDD71B4A0C5A6C8, fd399; +fma.rn.f64 fd404, fd119, 0d3FEC698E42F47B09, fd400; +fma.rn.f64 fd405, fd121, 0d3FEB57675CF309EE, fd401; +fma.rn.f64 fd406, fd124, 0dBFE0A06E851DB7CA, fd402; +fma.rn.f64 fd407, fd741, 0d3FEB57675CF309EE, fd403; +fma.rn.f64 fd408, fd123, 0dBFE0A06E851DB7CA, fd404; +fma.rn.f64 fd409, fd125, 0dBFE2742A4A775CFB, fd405; +fma.rn.f64 fd410, fd128, 0dBFEA249E0B897CA9, fd406; +fma.rn.f64 fd411, fd739, 0dBFE2742A4A775CFB, fd407; +fma.rn.f64 fd412, fd127, 0dBFEA249E0B897CA9, fd408; +fma.rn.f64 fd413, fd129, 0dBFE8D2A07C16D46F, fd409; +fma.rn.f64 fd414, fd132, 0d3FE431DF5838F7EF, fd410; +fma.rn.f64 fd415, fd736, 0dBFE8D2A07C16D46F, fd411; +fma.rn.f64 fd416, fd131, 0d3FE431DF5838F7EF, fd412; +fma.rn.f64 fd417, fd133, 0d3FE5D779B07CFEF7, fd413; +fma.rn.f64 fd418, fd136, 0d3FE763021AAA15DA, fd414; +fma.rn.f64 fd419, fd734, 0d3FE5D779B07CFEF7, fd415; +fma.rn.f64 fd420, fd135, 0d3FE763021AAA15DA, fd416; +fma.rn.f64 fd421, fd93, 0dBFD56EAAE597C776, %46; +fma.rn.f64 fd425, fd97, 0dBFE8D2A07C16D46F, fd421; +fma.rn.f64 fd714, fd96, 0dBFEE270060999288, 0d0000000000000000; +fma.rn.f64 fd426, fd100, 0d3FE431DF5838F7EF, fd714; +fma.rn.f64 fd713, fd757, 0dBFD56EAAE597C776, %47; +fma.rn.f64 fd427, fd755, 0dBFE8D2A07C16D46F, fd713; +fma.rn.f64 fd712, fd95, 0dBFEE270060999288, 0d0000000000000000; +fma.rn.f64 fd428, fd99,
0d3FE431DF5838F7EF, fd712; +fma.rn.f64 fd429, fd101, 0d3FEB57675CF309EE, fd425; +fma.rn.f64 fd430, fd104, 0d3FE0A06E851DB7CA, fd426; +fma.rn.f64 fd431, fd753, 0d3FEB57675CF309EE, fd427; +fma.rn.f64 fd432, fd103, 0d3FE0A06E851DB7CA, fd428; +fma.rn.f64 fd433, fd105, 0d3FCA0AD8BD1E2882, fd429; +fma.rn.f64 fd434, fd108, 0dBFEF54A827142577, fd430; +fma.rn.f64 fd435, fd750, 0d3FCA0AD8BD1E2882, fd431; +fma.rn.f64 fd436, fd107, 0dBFEF54A827142577, fd432; +fma.rn.f64 fd437, fd109, 0dBFEFB3B3035AA6CD, fd433; +fma.rn.f64 fd438, fd112, 0d3FC16DE8A4564F0A, fd434; +fma.rn.f64 fd439, fd748, 0dBFEFB3B3035AA6CD, fd435; +fma.rn.f64 fd440, fd111, 0d3FC16DE8A4564F0A, fd436; +fma.rn.f64 fd441, fd113, 0d3FDD71B4A0C5A6C8, fd437; +fma.rn.f64 fd442, fd116, 0d3FEC698E42F47B09, fd438; +fma.rn.f64 fd443, fd746, 0d3FDD71B4A0C5A6C8, fd439; +fma.rn.f64 fd444, fd115, 0d3FEC698E42F47B09, fd440; +fma.rn.f64 fd445, fd117, 0d3FE5D779B07CFEF7, fd441; +fma.rn.f64 fd446, fd120, 0dBFE763021AAA15DA, fd442; +fma.rn.f64 fd447, fd743, 0d3FE5D779B07CFEF7, fd443; +fma.rn.f64 fd448, fd119, 0dBFE763021AAA15DA, fd444; +fma.rn.f64 fd449, fd121, 0dBFED59CB83EF99BC, fd445; +fma.rn.f64 fd450, fd124, 0dBFD97F6748E524B2, fd446; +fma.rn.f64 fd451, fd741, 0dBFED59CB83EF99BC, fd447; +fma.rn.f64 fd452, fd123, 0dBFD97F6748E524B2, fd448; +fma.rn.f64 fd453, fd125, 0dBFB17855B599F3B9, fd449; +fma.rn.f64 fd454, fd128, 0d3FEFECE70DFD3EFB, fd450; +fma.rn.f64 fd455, fd739, 0dBFB17855B599F3B9, fd451; +fma.rn.f64 fd456, fd127, 0d3FEFECE70DFD3EFB, fd452; +fma.rn.f64 fd457, fd129, 0d3FEED037EA3D2DBB, fd453; +fma.rn.f64 fd458, fd132, 0dBFD14459AD2BE466, fd454; +fma.rn.f64 fd459, fd736, 0d3FEED037EA3D2DBB, fd455; +fma.rn.f64 fd460, fd131, 0dBFD14459AD2BE466, fd456; +fma.rn.f64 fd461, fd133, 0dBFE2742A4A775CFB, fd457; +fma.rn.f64 fd462, fd136, 0dBFEA249E0B897CA9, fd458; +fma.rn.f64 fd463, fd734, 0dBFE2742A4A775CFB, fd459; +fma.rn.f64 fd464, fd135, 0dBFEA249E0B897CA9, fd460; +fma.rn.f64 fd465, fd93, 0dBFE2742A4A775CFB, %46; +fma.rn.f64
fd469, fd97, 0dBFD56EAAE597C776, fd465; +fma.rn.f64 fd711, fd96, 0dBFEA249E0B897CA9, 0d0000000000000000; +fma.rn.f64 fd470, fd100, 0d3FEE270060999288, fd711; +fma.rn.f64 fd710, fd757, 0dBFE2742A4A775CFB, %47; +fma.rn.f64 fd471, fd755, 0dBFD56EAAE597C776, fd710; +fma.rn.f64 fd709, fd95, 0dBFEA249E0B897CA9, 0d0000000000000000; +fma.rn.f64 fd472, fd99, 0d3FEE270060999288, fd709; +fma.rn.f64 fd473, fd101, 0d3FEED037EA3D2DBB, fd469; +fma.rn.f64 fd474, fd104, 0dBFD14459AD2BE466, fd470; +fma.rn.f64 fd475, fd753, 0d3FEED037EA3D2DBB, fd471; +fma.rn.f64 fd476, fd103, 0dBFD14459AD2BE466, fd472; +fma.rn.f64 fd477, fd105, 0dBFE8D2A07C16D46F, fd473; +fma.rn.f64 fd478, fd108, 0dBFE431DF5838F7EF, fd474; +fma.rn.f64 fd479, fd750, 0dBFE8D2A07C16D46F, fd475; +fma.rn.f64 fd480, fd107, 0dBFE431DF5838F7EF, fd476; +fma.rn.f64 fd481, fd109, 0dBFB17855B599F3B9, fd477; +fma.rn.f64 fd482, fd112, 0d3FEFECE70DFD3EFB, fd478; +fma.rn.f64 fd483, fd748, 0dBFB17855B599F3B9, fd479; +fma.rn.f64 fd484, fd111, 0d3FEFECE70DFD3EFB, fd480; +fma.rn.f64 fd485, fd113, 0d3FEB57675CF309EE, fd481; +fma.rn.f64 fd486, fd116, 0dBFE0A06E851DB7CA, fd482; +fma.rn.f64 fd487, fd746, 0d3FEB57675CF309EE, fd483; +fma.rn.f64 fd488, fd115, 0dBFE0A06E851DB7CA, fd484; +fma.rn.f64 fd489, fd117, 0dBFED59CB83EF99BC, fd485; +fma.rn.f64 fd490, fd120, 0dBFD97F6748E524B2, fd486; +fma.rn.f64 fd491, fd743, 0dBFED59CB83EF99BC, fd487; +fma.rn.f64 fd492, fd119, 0dBFD97F6748E524B2, fd488; +fma.rn.f64 fd493, fd121, 0d3FCA0AD8BD1E2882, fd489; +fma.rn.f64 fd494, fd124, 0d3FEF54A827142577, fd490; +fma.rn.f64 fd495, fd741, 0d3FCA0AD8BD1E2882, fd491; +fma.rn.f64 fd496, fd123, 0d3FEF54A827142577, fd492; +fma.rn.f64 fd497, fd125, 0d3FE5D779B07CFEF7, fd493; +fma.rn.f64 fd498, fd128, 0dBFE763021AAA15DA, fd494; +fma.rn.f64 fd499, fd739, 0d3FE5D779B07CFEF7, fd495; +fma.rn.f64 fd500, fd127, 0dBFE763021AAA15DA, fd496; +fma.rn.f64 fd501, fd129, 0dBFEFB3B3035AA6CD, fd497; +fma.rn.f64 fd502, fd132, 0dBFC16DE8A4564F0A, fd498; +fma.rn.f64 fd503, fd736,
0dBFEFB3B3035AA6CD, fd499; +fma.rn.f64 fd504, fd131, 0dBFC16DE8A4564F0A, fd500; +fma.rn.f64 fd505, fd133, 0d3FDD71B4A0C5A6C8, fd501; +fma.rn.f64 fd506, fd136, 0d3FEC698E42F47B09, fd502; +fma.rn.f64 fd507, fd734, 0d3FDD71B4A0C5A6C8, fd503; +fma.rn.f64 fd508, fd135, 0d3FEC698E42F47B09, fd504; +fma.rn.f64 fd509, fd93, 0dBFE8D2A07C16D46F, %46; +fma.rn.f64 fd513, fd97, 0d3FCA0AD8BD1E2882, fd509; +fma.rn.f64 fd708, fd96, 0dBFE431DF5838F7EF, 0d0000000000000000; +fma.rn.f64 fd514, fd100, 0d3FEF54A827142577, fd708; +fma.rn.f64 fd707, fd757, 0dBFE8D2A07C16D46F, %47; +fma.rn.f64 fd515, fd755, 0d3FCA0AD8BD1E2882, fd707; +fma.rn.f64 fd706, fd95, 0dBFE431DF5838F7EF, 0d0000000000000000; +fma.rn.f64 fd516, fd99, 0d3FEF54A827142577, fd706; +fma.rn.f64 fd517, fd101, 0d3FDD71B4A0C5A6C8, fd513; +fma.rn.f64 fd518, fd104, 0dBFEC698E42F47B09, fd514; +fma.rn.f64 fd519, fd753, 0d3FDD71B4A0C5A6C8, fd515; +fma.rn.f64 fd520, fd103, 0dBFEC698E42F47B09, fd516; +fma.rn.f64 fd521, fd105, 0dBFED59CB83EF99BC, fd517; +fma.rn.f64 fd522, fd108, 0d3FD97F6748E524B2, fd518; +fma.rn.f64 fd523, fd750, 0dBFED59CB83EF99BC, fd519; +fma.rn.f64 fd524, fd107, 0d3FD97F6748E524B2, fd520; +fma.rn.f64 fd525, fd109, 0d3FEED037EA3D2DBB, fd521; +fma.rn.f64 fd526, fd112, 0d3FD14459AD2BE466, fd522; +fma.rn.f64 fd527, fd748, 0d3FEED037EA3D2DBB, fd523; +fma.rn.f64 fd528, fd111, 0d3FD14459AD2BE466, fd524; +fma.rn.f64 fd529, fd113, 0dBFE2742A4A775CFB, fd525; +fma.rn.f64 fd530, fd116, 0dBFEA249E0B897CA9, fd526; +fma.rn.f64 fd531, fd746, 0dBFE2742A4A775CFB, fd527; +fma.rn.f64 fd532, fd115, 0dBFEA249E0B897CA9, fd528; +fma.rn.f64 fd533, fd117, 0dBFB17855B599F3B9, fd529; +fma.rn.f64 fd534, fd120, 0d3FEFECE70DFD3EFB, fd530; +fma.rn.f64 fd535, fd743, 0dBFB17855B599F3B9, fd531; +fma.rn.f64 fd536, fd119, 0d3FEFECE70DFD3EFB, fd532; +fma.rn.f64 fd537, fd121, 0d3FE5D779B07CFEF7, fd533; +fma.rn.f64 fd538, fd124, 0dBFE763021AAA15DA, fd534; +fma.rn.f64 fd539, fd741, 0d3FE5D779B07CFEF7, fd535; +fma.rn.f64 fd540, fd123, 0dBFE763021AAA15DA,
fd536; +fma.rn.f64 fd541, fd125, 0dBFEFB3B3035AA6CD, fd537; +fma.rn.f64 fd542, fd128, 0d3FC16DE8A4564F0A, fd538; +fma.rn.f64 fd543, fd739, 0dBFEFB3B3035AA6CD, fd539; +fma.rn.f64 fd544, fd127, 0d3FC16DE8A4564F0A, fd540; +fma.rn.f64 fd545, fd129, 0d3FEB57675CF309EE, fd541; +fma.rn.f64 fd546, fd132, 0d3FE0A06E851DB7CA, fd542; +fma.rn.f64 fd547, fd736, 0d3FEB57675CF309EE, fd543; +fma.rn.f64 fd548, fd131, 0d3FE0A06E851DB7CA, fd544; +fma.rn.f64 fd549, fd133, 0dBFD56EAAE597C776, fd545; +fma.rn.f64 fd550, fd136, 0dBFEE270060999288, fd546; +fma.rn.f64 fd551, fd734, 0dBFD56EAAE597C776, fd547; +fma.rn.f64 fd552, fd135, 0dBFEE270060999288, fd548; +fma.rn.f64 fd553, fd93, 0dBFED59CB83EF99BC, %46; +fma.rn.f64 fd557, fd97, 0d3FE5D779B07CFEF7, fd553; +fma.rn.f64 fd705, fd96, 0dBFD97F6748E524B2, 0d0000000000000000; +fma.rn.f64 fd558, fd100, 0d3FE763021AAA15DA, fd705; +fma.rn.f64 fd704, fd757, 0dBFED59CB83EF99BC, %47; +fma.rn.f64 fd559, fd755, 0d3FE5D779B07CFEF7, fd704; +fma.rn.f64 fd703, fd95, 0dBFD97F6748E524B2, 0d0000000000000000; +fma.rn.f64 fd560, fd99, 0d3FE763021AAA15DA, fd703; +fma.rn.f64 fd561, fd101, 0dBFD56EAAE597C776, fd557; +fma.rn.f64 fd562, fd104, 0dBFEE270060999288, fd558; +fma.rn.f64 fd563, fd753, 0dBFD56EAAE597C776, fd559; +fma.rn.f64 fd564, fd103, 0dBFEE270060999288, fd560; +fma.rn.f64 fd565, fd105, 0dBFB17855B599F3B9, fd561; +fma.rn.f64 fd566, fd108, 0d3FEFECE70DFD3EFB, fd562; +fma.rn.f64 fd567, fd750, 0dBFB17855B599F3B9, fd563; +fma.rn.f64 fd568, fd107, 0d3FEFECE70DFD3EFB, fd564; +fma.rn.f64 fd569, fd109, 0d3FDD71B4A0C5A6C8, fd565; +fma.rn.f64 fd570, fd112, 0dBFEC698E42F47B09, fd566; +fma.rn.f64 fd571, fd748, 0d3FDD71B4A0C5A6C8, fd567; +fma.rn.f64 fd572, fd111, 0dBFEC698E42F47B09, fd568; +fma.rn.f64 fd573, fd113, 0dBFE8D2A07C16D46F, fd569; +fma.rn.f64 fd574, fd116, 0d3FE431DF5838F7EF, fd570; +fma.rn.f64 fd575, fd746, 0dBFE8D2A07C16D46F, fd571; +fma.rn.f64 fd576, fd115, 0d3FE431DF5838F7EF, fd572; +fma.rn.f64 fd577, fd117, 0d3FEED037EA3D2DBB, fd573; +fma.rn.f64
fd578, fd120, 0dBFD14459AD2BE466, fd574; +fma.rn.f64 fd579, fd743, 0d3FEED037EA3D2DBB, fd575; +fma.rn.f64 fd580, fd119, 0dBFD14459AD2BE466, fd576; +fma.rn.f64 fd581, fd121, 0dBFEFB3B3035AA6CD, fd577; +fma.rn.f64 fd582, fd124, 0dBFC16DE8A4564F0A, fd578; +fma.rn.f64 fd583, fd741, 0dBFEFB3B3035AA6CD, fd579; +fma.rn.f64 fd584, fd123, 0dBFC16DE8A4564F0A, fd580; +fma.rn.f64 fd585, fd125, 0d3FEB57675CF309EE, fd581; +fma.rn.f64 fd586, fd128, 0d3FE0A06E851DB7CA, fd582; +fma.rn.f64 fd587, fd739, 0d3FEB57675CF309EE, fd583; +fma.rn.f64 fd588, fd127, 0d3FE0A06E851DB7CA, fd584; +fma.rn.f64 fd589, fd129, 0dBFE2742A4A775CFB, fd585; +fma.rn.f64 fd590, fd132, 0dBFEA249E0B897CA9, fd586; +fma.rn.f64 fd591, fd736, 0dBFE2742A4A775CFB, fd587; +fma.rn.f64 fd592, fd131, 0dBFEA249E0B897CA9, fd588; +fma.rn.f64 fd593, fd133, 0d3FCA0AD8BD1E2882, fd589; +fma.rn.f64 fd594, fd136, 0d3FEF54A827142577, fd590; +fma.rn.f64 fd595, fd734, 0d3FCA0AD8BD1E2882, fd591; +fma.rn.f64 fd596, fd135, 0d3FEF54A827142577, fd592; +fma.rn.f64 fd597, fd93, 0dBFEFB3B3035AA6CD, %46; +fma.rn.f64 fd598, fd96, 0dBFC16DE8A4564F0A, 0d0000000000000000; +fma.rn.f64 fd599, fd757, 0dBFEFB3B3035AA6CD, %47; +fma.rn.f64 fd600, fd95, 0dBFC16DE8A4564F0A, 0d0000000000000000; +fma.rn.f64 fd601, fd97, 0d3FEED037EA3D2DBB, fd597; +fma.rn.f64 fd602, fd100, 0d3FD14459AD2BE466, fd598; +fma.rn.f64 fd603, fd755, 0d3FEED037EA3D2DBB, fd599; +fma.rn.f64 fd604, fd99, 0d3FD14459AD2BE466, fd600; +fma.rn.f64 fd605, fd101, 0dBFED59CB83EF99BC, fd601; +fma.rn.f64 fd606, fd104, 0dBFD97F6748E524B2, fd602; +fma.rn.f64 fd607, fd753, 0dBFED59CB83EF99BC, fd603; +fma.rn.f64 fd608, fd103, 0dBFD97F6748E524B2, fd604; +fma.rn.f64 fd609, fd105, 0d3FEB57675CF309EE, fd605; +fma.rn.f64 fd610, fd108, 0d3FE0A06E851DB7CA, fd606; +fma.rn.f64 fd611, fd750, 0d3FEB57675CF309EE, fd607; +fma.rn.f64 fd612, fd107, 0d3FE0A06E851DB7CA, fd608; +fma.rn.f64 fd613, fd109, 0dBFE8D2A07C16D46F, fd609; +fma.rn.f64 fd614, fd112, 0dBFE431DF5838F7EF, fd610; +fma.rn.f64 fd615, fd748,
0dBFE8D2A07C16D46F, fd611; +fma.rn.f64 fd616, fd111, 0dBFE431DF5838F7EF, fd612; +fma.rn.f64 fd617, fd113, 0d3FE5D779B07CFEF7, fd613; +fma.rn.f64 fd618, fd116, 0d3FE763021AAA15DA, fd614; +fma.rn.f64 fd619, fd746, 0d3FE5D779B07CFEF7, fd615; +fma.rn.f64 fd620, fd115, 0d3FE763021AAA15DA, fd616; +fma.rn.f64 fd621, fd117, 0dBFE2742A4A775CFB, fd617; +fma.rn.f64 fd622, fd120, 0dBFEA249E0B897CA9, fd618; +fma.rn.f64 fd623, fd743, 0dBFE2742A4A775CFB, fd619; +fma.rn.f64 fd624, fd119, 0dBFEA249E0B897CA9, fd620; +fma.rn.f64 fd625, fd121, 0d3FDD71B4A0C5A6C8, fd621; +fma.rn.f64 fd626, fd124, 0d3FEC698E42F47B09, fd622; +fma.rn.f64 fd627, fd741, 0d3FDD71B4A0C5A6C8, fd623; +fma.rn.f64 fd628, fd123, 0d3FEC698E42F47B09, fd624; +fma.rn.f64 fd629, fd125, 0dBFD56EAAE597C776, fd625; +fma.rn.f64 fd630, fd128, 0dBFEE270060999288, fd626; +fma.rn.f64 fd631, fd739, 0dBFD56EAAE597C776, fd627; +fma.rn.f64 fd632, fd127, 0dBFEE270060999288, fd628; +fma.rn.f64 fd633, fd129, 0d3FCA0AD8BD1E2882, fd629; +fma.rn.f64 fd634, fd132, 0d3FEF54A827142577, fd630; +fma.rn.f64 fd635, fd736, 0d3FCA0AD8BD1E2882, fd631; +fma.rn.f64 fd636, fd131, 0d3FEF54A827142577, fd632; +fma.rn.f64 fd637, fd133, 0dBFB17855B599F3B9, fd633; +fma.rn.f64 fd638, fd136, 0dBFEFECE70DFD3EFB, fd634; +fma.rn.f64 fd639, fd734, 0dBFB17855B599F3B9, fd635; +fma.rn.f64 fd640, fd135, 0dBFEFECE70DFD3EFB, fd636; +add.f64 %1, fd156, fd734; +add.f64 %0, fd155, fd133; +sub.f64 %2, fd197, fd198; +add.f64 %3, fd199, fd200; +sub.f64 %4, fd241, fd242; +add.f64 %5, fd243, fd244; +add.f64 %7, fd287, fd288; +sub.f64 %6, fd285, fd286; +add.f64 %9, fd331, fd332; +sub.f64 %8, fd329, fd330; +add.f64 %11, fd375, fd376; +sub.f64 %10, fd373, fd374; +sub.f64 %12, fd417, fd418; +add.f64 %13, fd419, fd420; +sub.f64 %14, fd461, fd462; +add.f64 %15, fd463, fd464; +sub.f64 %16, fd505, fd506; +add.f64 %17, fd507, fd508; +sub.f64 %18, fd549, fd550; +add.f64 %19, fd551, fd552; +add.f64 %21, fd595, fd596; +sub.f64 %20, fd593, fd594; +add.f64 %23, fd639, fd640; +sub.f64 %22,
fd637, fd638; +sub.f64 %25, fd639, fd640; +add.f64 %24, fd637, fd638; +sub.f64 %27, fd595, fd596; +add.f64 %26, fd593, fd594; +sub.f64 %29, fd551, fd552; +add.f64 %28, fd549, fd550; +sub.f64 %31, fd507, fd508; +add.f64 %30, fd505, fd506; +sub.f64 %33, fd463, fd464; +add.f64 %32, fd461, fd462; +sub.f64 %35, fd419, fd420; +add.f64 %34, fd417, fd418; +sub.f64 %37, fd375, fd376; +add.f64 %36, fd373, fd374; +sub.f64 %39, fd331, fd332; +add.f64 %38, fd329, fd330; +sub.f64 %41, fd287, fd288; +add.f64 %40, fd285, fd286; +sub.f64 %43, fd243, fd244; +add.f64 %42, fd241, fd242; +sub.f64 %45, fd199, fd200; +add.f64 %44, fd197, fd198; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y),
"d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[2].y), "d"(rmem[20].y), "d"(rmem[4].y), "d"(rmem[19].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[16].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[14].y), "d"(rmem[10].y), "d"(rmem[13].y), "d"(rmem[11].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..ee0cae82c4530 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_23_fp64_inv.hpp.inc @@ -0,0 +1,610 @@ +#ifndef CUFFTDX_FFT_23_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_23_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<584, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<758>; +.reg .b64 rd<4>; +add.f64 fd93, %48, %91; +sub.f64 fd95, %48, %91; +add.f64 fd757, %50, %92; +sub.f64 fd96, %50, %92; +add.f64 fd97, %51, %89; +sub.f64 fd99, %51, %89; +add.f64 fd755, %93, %90; +sub.f64 fd100, %93, %90; +add.f64 fd101, %53, %87; +sub.f64 fd103, %53, %87; +add.f64 fd753, %54, %94; +sub.f64 fd104, %54, %94; +add.f64 fd105, %55, %85; +sub.f64 fd107, %55, %85; +add.f64 fd750, %95, %96; +sub.f64 fd108, %95, %96; +add.f64 fd109, %57, %83; +sub.f64 fd111, %57, %83; +add.f64 fd748, %97, %84; +sub.f64 fd112, %97, %84; +add.f64 fd113, %59, %81; +sub.f64 fd115, %59, %81; +add.f64 fd746, %60, %98; +sub.f64 fd116, %60, %98; +add.f64 fd117, %61, %79; +sub.f64 fd119, %61, %79; +add.f64 fd743, %100, %99; +sub.f64 fd120, %100, %99; +add.f64 fd121, %63, %77; +sub.f64
fd123, %63, %77; +add.f64 fd741, %101, %78; +sub.f64 fd124, %101, %78; +add.f64 fd125, %65, %75; +sub.f64 fd127, %65, %75; +add.f64 fd739, %66, %102; +sub.f64 fd128, %66, %102; +add.f64 fd129, %67, %73; +sub.f64 fd131, %67, %73; +add.f64 fd736, %103, %104; +sub.f64 fd132, %103, %104; +add.f64 fd133, %69, %71; +sub.f64 fd135, %69, %71; +add.f64 fd734, %105, %72; +sub.f64 fd136, %105, %72; +add.f64 fd137, %46, fd93; +add.f64 fd139, fd137, fd97; +add.f64 fd733, %47, fd757; +add.f64 fd140, fd733, fd755; +add.f64 fd141, fd139, fd101; +add.f64 fd142, fd140, fd753; +add.f64 fd143, fd141, fd105; +add.f64 fd144, fd142, fd750; +add.f64 fd145, fd143, fd109; +add.f64 fd146, fd144, fd748; +add.f64 fd147, fd145, fd113; +add.f64 fd148, fd146, fd746; +add.f64 fd149, fd147, fd117; +add.f64 fd150, fd148, fd743; +add.f64 fd151, fd149, fd121; +add.f64 fd152, fd150, fd741; +add.f64 fd153, fd151, fd125; +add.f64 fd154, fd152, fd739; +add.f64 fd155, fd153, fd129; +add.f64 fd156, fd154, fd736; +fma.rn.f64 fd157, fd93, 0d3FEED037EA3D2DBB, %46; +fma.rn.f64 fd161, fd97, 0d3FEB57675CF309EE, fd157; +fma.rn.f64 fd732, fd96, 0d3FD14459AD2BE466, 0d0000000000000000; +fma.rn.f64 fd162, fd100, 0d3FE0A06E851DB7CA, fd732; +fma.rn.f64 fd731, fd757, 0d3FEED037EA3D2DBB, %47; +fma.rn.f64 fd163, fd755, 0d3FEB57675CF309EE, fd731; +fma.rn.f64 fd730, fd95, 0d3FD14459AD2BE466, 0d0000000000000000; +fma.rn.f64 fd164, fd99, 0d3FE0A06E851DB7CA, fd730; +fma.rn.f64 fd165, fd101, 0d3FE5D779B07CFEF7, fd161; +fma.rn.f64 fd166, fd104, 0d3FE763021AAA15DA, fd162; +fma.rn.f64 fd167, fd753, 0d3FE5D779B07CFEF7, fd163; +fma.rn.f64 fd168, fd103, 0d3FE763021AAA15DA, fd164; +fma.rn.f64 fd169, fd105, 0d3FDD71B4A0C5A6C8, fd165; +fma.rn.f64 fd170, fd108, 0d3FEC698E42F47B09, fd166; +fma.rn.f64 fd171, fd750, 0d3FDD71B4A0C5A6C8, fd167; +fma.rn.f64 fd172, fd107, 0d3FEC698E42F47B09, fd168; +fma.rn.f64 fd173, fd109, 0d3FCA0AD8BD1E2882, fd169; +fma.rn.f64 fd174, fd112, 0d3FEF54A827142577, fd170; +fma.rn.f64 fd175, fd748, 
0d3FCA0AD8BD1E2882, fd171; +fma.rn.f64 fd176, fd111, 0d3FEF54A827142577, fd172; +fma.rn.f64 fd177, fd113, 0dBFB17855B599F3B9, fd173; +fma.rn.f64 fd178, fd116, 0d3FEFECE70DFD3EFB, fd174; +fma.rn.f64 fd179, fd746, 0dBFB17855B599F3B9, fd175; +fma.rn.f64 fd180, fd115, 0d3FEFECE70DFD3EFB, fd176; +fma.rn.f64 fd181, fd117, 0dBFD56EAAE597C776, fd177; +fma.rn.f64 fd182, fd120, 0d3FEE270060999288, fd178; +fma.rn.f64 fd183, fd743, 0dBFD56EAAE597C776, fd179; +fma.rn.f64 fd184, fd119, 0d3FEE270060999288, fd180; +fma.rn.f64 fd185, fd121, 0dBFE2742A4A775CFB, fd181; +fma.rn.f64 fd186, fd124, 0d3FEA249E0B897CA9, fd182; +fma.rn.f64 fd187, fd741, 0dBFE2742A4A775CFB, fd183; +fma.rn.f64 fd188, fd123, 0d3FEA249E0B897CA9, fd184; +fma.rn.f64 fd189, fd125, 0dBFE8D2A07C16D46F, fd185; +fma.rn.f64 fd190, fd128, 0d3FE431DF5838F7EF, fd186; +fma.rn.f64 fd191, fd739, 0dBFE8D2A07C16D46F, fd187; +fma.rn.f64 fd192, fd127, 0d3FE431DF5838F7EF, fd188; +fma.rn.f64 fd193, fd129, 0dBFED59CB83EF99BC, fd189; +fma.rn.f64 fd194, fd132, 0d3FD97F6748E524B2, fd190; +fma.rn.f64 fd195, fd736, 0dBFED59CB83EF99BC, fd191; +fma.rn.f64 fd196, fd131, 0d3FD97F6748E524B2, fd192; +fma.rn.f64 fd197, fd133, 0dBFEFB3B3035AA6CD, fd193; +fma.rn.f64 fd198, fd136, 0d3FC16DE8A4564F0A, fd194; +fma.rn.f64 fd199, fd734, 0dBFEFB3B3035AA6CD, fd195; +fma.rn.f64 fd200, fd135, 0d3FC16DE8A4564F0A, fd196; +fma.rn.f64 fd201, fd93, 0d3FEB57675CF309EE, %46; +fma.rn.f64 fd205, fd97, 0d3FDD71B4A0C5A6C8, fd201; +fma.rn.f64 fd729, fd96, 0d3FE0A06E851DB7CA, 0d0000000000000000; +fma.rn.f64 fd206, fd100, 0d3FEC698E42F47B09, fd729; +fma.rn.f64 fd728, fd757, 0d3FEB57675CF309EE, %47; +fma.rn.f64 fd207, fd755, 0d3FDD71B4A0C5A6C8, fd728; +fma.rn.f64 fd727, fd95, 0d3FE0A06E851DB7CA, 0d0000000000000000; +fma.rn.f64 fd208, fd99, 0d3FEC698E42F47B09, fd727; +fma.rn.f64 fd209, fd101, 0dBFB17855B599F3B9, fd205; +fma.rn.f64 fd210, fd104, 0d3FEFECE70DFD3EFB, fd206; +fma.rn.f64 fd211, fd753, 0dBFB17855B599F3B9, fd207; +fma.rn.f64 fd212, fd103, 0d3FEFECE70DFD3EFB, 
fd208; +fma.rn.f64 fd213, fd105, 0dBFE2742A4A775CFB, fd209; +fma.rn.f64 fd214, fd108, 0d3FEA249E0B897CA9, fd210; +fma.rn.f64 fd215, fd750, 0dBFE2742A4A775CFB, fd211; +fma.rn.f64 fd216, fd107, 0d3FEA249E0B897CA9, fd212; +fma.rn.f64 fd217, fd109, 0dBFED59CB83EF99BC, fd213; +fma.rn.f64 fd218, fd112, 0d3FD97F6748E524B2, fd214; +fma.rn.f64 fd219, fd748, 0dBFED59CB83EF99BC, fd215; +fma.rn.f64 fd220, fd111, 0d3FD97F6748E524B2, fd216; +fma.rn.f64 fd221, fd113, 0dBFEFB3B3035AA6CD, fd217; +fma.rn.f64 fd222, fd116, 0dBFC16DE8A4564F0A, fd218; +fma.rn.f64 fd223, fd746, 0dBFEFB3B3035AA6CD, fd219; +fma.rn.f64 fd224, fd115, 0dBFC16DE8A4564F0A, fd220; +fma.rn.f64 fd225, fd117, 0dBFE8D2A07C16D46F, fd221; +fma.rn.f64 fd226, fd120, 0dBFE431DF5838F7EF, fd222; +fma.rn.f64 fd227, fd743, 0dBFE8D2A07C16D46F, fd223; +fma.rn.f64 fd228, fd119, 0dBFE431DF5838F7EF, fd224; +fma.rn.f64 fd229, fd121, 0dBFD56EAAE597C776, fd225; +fma.rn.f64 fd230, fd124, 0dBFEE270060999288, fd226; +fma.rn.f64 fd231, fd741, 0dBFD56EAAE597C776, fd227; +fma.rn.f64 fd232, fd123, 0dBFEE270060999288, fd228; +fma.rn.f64 fd233, fd125, 0d3FCA0AD8BD1E2882, fd229; +fma.rn.f64 fd234, fd128, 0dBFEF54A827142577, fd230; +fma.rn.f64 fd235, fd739, 0d3FCA0AD8BD1E2882, fd231; +fma.rn.f64 fd236, fd127, 0dBFEF54A827142577, fd232; +fma.rn.f64 fd237, fd129, 0d3FE5D779B07CFEF7, fd233; +fma.rn.f64 fd238, fd132, 0dBFE763021AAA15DA, fd234; +fma.rn.f64 fd239, fd736, 0d3FE5D779B07CFEF7, fd235; +fma.rn.f64 fd240, fd131, 0dBFE763021AAA15DA, fd236; +fma.rn.f64 fd241, fd133, 0d3FEED037EA3D2DBB, fd237; +fma.rn.f64 fd242, fd136, 0dBFD14459AD2BE466, fd238; +fma.rn.f64 fd243, fd734, 0d3FEED037EA3D2DBB, fd239; +fma.rn.f64 fd244, fd135, 0dBFD14459AD2BE466, fd240; +fma.rn.f64 fd245, fd93, 0d3FE5D779B07CFEF7, %46; +fma.rn.f64 fd249, fd97, 0dBFB17855B599F3B9, fd245; +fma.rn.f64 fd726, fd96, 0d3FE763021AAA15DA, 0d0000000000000000; +fma.rn.f64 fd250, fd100, 0d3FEFECE70DFD3EFB, fd726; +fma.rn.f64 fd725, fd757, 0d3FE5D779B07CFEF7, %47; +fma.rn.f64 fd251, fd755, 
0dBFB17855B599F3B9, fd725; +fma.rn.f64 fd724, fd95, 0d3FE763021AAA15DA, 0d0000000000000000; +fma.rn.f64 fd252, fd99, 0d3FEFECE70DFD3EFB, fd724; +fma.rn.f64 fd253, fd101, 0dBFE8D2A07C16D46F, fd249; +fma.rn.f64 fd254, fd104, 0d3FE431DF5838F7EF, fd250; +fma.rn.f64 fd255, fd753, 0dBFE8D2A07C16D46F, fd251; +fma.rn.f64 fd256, fd103, 0d3FE431DF5838F7EF, fd252; +fma.rn.f64 fd257, fd105, 0dBFEFB3B3035AA6CD, fd253; +fma.rn.f64 fd258, fd108, 0dBFC16DE8A4564F0A, fd254; +fma.rn.f64 fd259, fd750, 0dBFEFB3B3035AA6CD, fd255; +fma.rn.f64 fd260, fd107, 0dBFC16DE8A4564F0A, fd256; +fma.rn.f64 fd261, fd109, 0dBFE2742A4A775CFB, fd257; +fma.rn.f64 fd262, fd112, 0dBFEA249E0B897CA9, fd258; +fma.rn.f64 fd263, fd748, 0dBFE2742A4A775CFB, fd259; +fma.rn.f64 fd264, fd111, 0dBFEA249E0B897CA9, fd260; +fma.rn.f64 fd265, fd113, 0d3FCA0AD8BD1E2882, fd261; +fma.rn.f64 fd266, fd116, 0dBFEF54A827142577, fd262; +fma.rn.f64 fd267, fd746, 0d3FCA0AD8BD1E2882, fd263; +fma.rn.f64 fd268, fd115, 0dBFEF54A827142577, fd264; +fma.rn.f64 fd269, fd117, 0d3FEB57675CF309EE, fd265; +fma.rn.f64 fd270, fd120, 0dBFE0A06E851DB7CA, fd266; +fma.rn.f64 fd271, fd743, 0d3FEB57675CF309EE, fd267; +fma.rn.f64 fd272, fd119, 0dBFE0A06E851DB7CA, fd268; +fma.rn.f64 fd273, fd121, 0d3FEED037EA3D2DBB, fd269; +fma.rn.f64 fd274, fd124, 0d3FD14459AD2BE466, fd270; +fma.rn.f64 fd275, fd741, 0d3FEED037EA3D2DBB, fd271; +fma.rn.f64 fd276, fd123, 0d3FD14459AD2BE466, fd272; +fma.rn.f64 fd277, fd125, 0d3FDD71B4A0C5A6C8, fd273; +fma.rn.f64 fd278, fd128, 0d3FEC698E42F47B09, fd274; +fma.rn.f64 fd279, fd739, 0d3FDD71B4A0C5A6C8, fd275; +fma.rn.f64 fd280, fd127, 0d3FEC698E42F47B09, fd276; +fma.rn.f64 fd281, fd129, 0dBFD56EAAE597C776, fd277; +fma.rn.f64 fd282, fd132, 0d3FEE270060999288, fd278; +fma.rn.f64 fd283, fd736, 0dBFD56EAAE597C776, fd279; +fma.rn.f64 fd284, fd131, 0d3FEE270060999288, fd280; +fma.rn.f64 fd285, fd133, 0dBFED59CB83EF99BC, fd281; +fma.rn.f64 fd286, fd136, 0d3FD97F6748E524B2, fd282; +fma.rn.f64 fd287, fd734, 0dBFED59CB83EF99BC, fd283; 
+fma.rn.f64 fd288, fd135, 0d3FD97F6748E524B2, fd284; +fma.rn.f64 fd289, fd93, 0d3FDD71B4A0C5A6C8, %46; +fma.rn.f64 fd293, fd97, 0dBFE2742A4A775CFB, fd289; +fma.rn.f64 fd723, fd96, 0d3FEC698E42F47B09, 0d0000000000000000; +fma.rn.f64 fd294, fd100, 0d3FEA249E0B897CA9, fd723; +fma.rn.f64 fd722, fd757, 0d3FDD71B4A0C5A6C8, %47; +fma.rn.f64 fd295, fd755, 0dBFE2742A4A775CFB, fd722; +fma.rn.f64 fd721, fd95, 0d3FEC698E42F47B09, 0d0000000000000000; +fma.rn.f64 fd296, fd99, 0d3FEA249E0B897CA9, fd721; +fma.rn.f64 fd297, fd101, 0dBFEFB3B3035AA6CD, fd293; +fma.rn.f64 fd298, fd104, 0dBFC16DE8A4564F0A, fd294; +fma.rn.f64 fd299, fd753, 0dBFEFB3B3035AA6CD, fd295; +fma.rn.f64 fd300, fd103, 0dBFC16DE8A4564F0A, fd296; +fma.rn.f64 fd301, fd105, 0dBFD56EAAE597C776, fd297; +fma.rn.f64 fd302, fd108, 0dBFEE270060999288, fd298; +fma.rn.f64 fd303, fd750, 0dBFD56EAAE597C776, fd299; +fma.rn.f64 fd304, fd107, 0dBFEE270060999288, fd300; +fma.rn.f64 fd305, fd109, 0d3FE5D779B07CFEF7, fd301; +fma.rn.f64 fd306, fd112, 0dBFE763021AAA15DA, fd302; +fma.rn.f64 fd307, fd748, 0d3FE5D779B07CFEF7, fd303; +fma.rn.f64 fd308, fd111, 0dBFE763021AAA15DA, fd304; +fma.rn.f64 fd309, fd113, 0d3FEED037EA3D2DBB, fd305; +fma.rn.f64 fd310, fd116, 0d3FD14459AD2BE466, fd306; +fma.rn.f64 fd311, fd746, 0d3FEED037EA3D2DBB, fd307; +fma.rn.f64 fd312, fd115, 0d3FD14459AD2BE466, fd308; +fma.rn.f64 fd313, fd117, 0d3FCA0AD8BD1E2882, fd309; +fma.rn.f64 fd314, fd120, 0d3FEF54A827142577, fd310; +fma.rn.f64 fd315, fd743, 0d3FCA0AD8BD1E2882, fd311; +fma.rn.f64 fd316, fd119, 0d3FEF54A827142577, fd312; +fma.rn.f64 fd317, fd121, 0dBFE8D2A07C16D46F, fd313; +fma.rn.f64 fd318, fd124, 0d3FE431DF5838F7EF, fd314; +fma.rn.f64 fd319, fd741, 0dBFE8D2A07C16D46F, fd315; +fma.rn.f64 fd320, fd123, 0d3FE431DF5838F7EF, fd316; +fma.rn.f64 fd321, fd125, 0dBFED59CB83EF99BC, fd317; +fma.rn.f64 fd322, fd128, 0dBFD97F6748E524B2, fd318; +fma.rn.f64 fd323, fd739, 0dBFED59CB83EF99BC, fd319; +fma.rn.f64 fd324, fd127, 0dBFD97F6748E524B2, fd320; +fma.rn.f64 fd325, 
fd129, 0dBFB17855B599F3B9, fd321; +fma.rn.f64 fd326, fd132, 0dBFEFECE70DFD3EFB, fd322; +fma.rn.f64 fd327, fd736, 0dBFB17855B599F3B9, fd323; +fma.rn.f64 fd328, fd131, 0dBFEFECE70DFD3EFB, fd324; +fma.rn.f64 fd329, fd133, 0d3FEB57675CF309EE, fd325; +fma.rn.f64 fd330, fd136, 0dBFE0A06E851DB7CA, fd326; +fma.rn.f64 fd331, fd734, 0d3FEB57675CF309EE, fd327; +fma.rn.f64 fd332, fd135, 0dBFE0A06E851DB7CA, fd328; +fma.rn.f64 fd333, fd93, 0d3FCA0AD8BD1E2882, %46; +fma.rn.f64 fd337, fd97, 0dBFED59CB83EF99BC, fd333; +fma.rn.f64 fd720, fd96, 0d3FEF54A827142577, 0d0000000000000000; +fma.rn.f64 fd338, fd100, 0d3FD97F6748E524B2, fd720; +fma.rn.f64 fd719, fd757, 0d3FCA0AD8BD1E2882, %47; +fma.rn.f64 fd339, fd755, 0dBFED59CB83EF99BC, fd719; +fma.rn.f64 fd718, fd95, 0d3FEF54A827142577, 0d0000000000000000; +fma.rn.f64 fd340, fd99, 0d3FD97F6748E524B2, fd718; +fma.rn.f64 fd341, fd101, 0dBFE2742A4A775CFB, fd337; +fma.rn.f64 fd342, fd104, 0dBFEA249E0B897CA9, fd338; +fma.rn.f64 fd343, fd753, 0dBFE2742A4A775CFB, fd339; +fma.rn.f64 fd344, fd103, 0dBFEA249E0B897CA9, fd340; +fma.rn.f64 fd345, fd105, 0d3FE5D779B07CFEF7, fd341; +fma.rn.f64 fd346, fd108, 0dBFE763021AAA15DA, fd342; +fma.rn.f64 fd347, fd750, 0d3FE5D779B07CFEF7, fd343; +fma.rn.f64 fd348, fd107, 0dBFE763021AAA15DA, fd344; +fma.rn.f64 fd349, fd109, 0d3FEB57675CF309EE, fd345; +fma.rn.f64 fd350, fd112, 0d3FE0A06E851DB7CA, fd346; +fma.rn.f64 fd351, fd748, 0d3FEB57675CF309EE, fd347; +fma.rn.f64 fd352, fd111, 0d3FE0A06E851DB7CA, fd348; +fma.rn.f64 fd353, fd113, 0dBFD56EAAE597C776, fd349; +fma.rn.f64 fd354, fd116, 0d3FEE270060999288, fd350; +fma.rn.f64 fd355, fd746, 0dBFD56EAAE597C776, fd351; +fma.rn.f64 fd356, fd115, 0d3FEE270060999288, fd352; +fma.rn.f64 fd357, fd117, 0dBFEFB3B3035AA6CD, fd353; +fma.rn.f64 fd358, fd120, 0dBFC16DE8A4564F0A, fd354; +fma.rn.f64 fd359, fd743, 0dBFEFB3B3035AA6CD, fd355; +fma.rn.f64 fd360, fd119, 0dBFC16DE8A4564F0A, fd356; +fma.rn.f64 fd361, fd121, 0dBFB17855B599F3B9, fd357; +fma.rn.f64 fd362, fd124, 
0dBFEFECE70DFD3EFB, fd358; +fma.rn.f64 fd363, fd741, 0dBFB17855B599F3B9, fd359; +fma.rn.f64 fd364, fd123, 0dBFEFECE70DFD3EFB, fd360; +fma.rn.f64 fd365, fd125, 0d3FEED037EA3D2DBB, fd361; +fma.rn.f64 fd366, fd128, 0dBFD14459AD2BE466, fd362; +fma.rn.f64 fd367, fd739, 0d3FEED037EA3D2DBB, fd363; +fma.rn.f64 fd368, fd127, 0dBFD14459AD2BE466, fd364; +fma.rn.f64 fd369, fd129, 0d3FDD71B4A0C5A6C8, fd365; +fma.rn.f64 fd370, fd132, 0d3FEC698E42F47B09, fd366; +fma.rn.f64 fd371, fd736, 0d3FDD71B4A0C5A6C8, fd367; +fma.rn.f64 fd372, fd131, 0d3FEC698E42F47B09, fd368; +fma.rn.f64 fd373, fd133, 0dBFE8D2A07C16D46F, fd369; +fma.rn.f64 fd374, fd136, 0d3FE431DF5838F7EF, fd370; +fma.rn.f64 fd375, fd734, 0dBFE8D2A07C16D46F, fd371; +fma.rn.f64 fd376, fd135, 0d3FE431DF5838F7EF, fd372; +fma.rn.f64 fd377, fd93, 0dBFB17855B599F3B9, %46; +fma.rn.f64 fd381, fd97, 0dBFEFB3B3035AA6CD, fd377; +fma.rn.f64 fd717, fd96, 0d3FEFECE70DFD3EFB, 0d0000000000000000; +fma.rn.f64 fd382, fd100, 0dBFC16DE8A4564F0A, fd717; +fma.rn.f64 fd716, fd757, 0dBFB17855B599F3B9, %47; +fma.rn.f64 fd383, fd755, 0dBFEFB3B3035AA6CD, fd716; +fma.rn.f64 fd715, fd95, 0d3FEFECE70DFD3EFB, 0d0000000000000000; +fma.rn.f64 fd384, fd99, 0dBFC16DE8A4564F0A, fd715; +fma.rn.f64 fd385, fd101, 0d3FCA0AD8BD1E2882, fd381; +fma.rn.f64 fd386, fd104, 0dBFEF54A827142577, fd382; +fma.rn.f64 fd387, fd753, 0d3FCA0AD8BD1E2882, fd383; +fma.rn.f64 fd388, fd103, 0dBFEF54A827142577, fd384; +fma.rn.f64 fd389, fd105, 0d3FEED037EA3D2DBB, fd385; +fma.rn.f64 fd390, fd108, 0d3FD14459AD2BE466, fd386; +fma.rn.f64 fd391, fd750, 0d3FEED037EA3D2DBB, fd387; +fma.rn.f64 fd392, fd107, 0d3FD14459AD2BE466, fd388; +fma.rn.f64 fd393, fd109, 0dBFD56EAAE597C776, fd389; +fma.rn.f64 fd394, fd112, 0d3FEE270060999288, fd390; +fma.rn.f64 fd395, fd748, 0dBFD56EAAE597C776, fd391; +fma.rn.f64 fd396, fd111, 0d3FEE270060999288, fd392; +fma.rn.f64 fd397, fd113, 0dBFED59CB83EF99BC, fd393; +fma.rn.f64 fd398, fd116, 0dBFD97F6748E524B2, fd394; +fma.rn.f64 fd399, fd746, 0dBFED59CB83EF99BC, 
fd395; +fma.rn.f64 fd400, fd115, 0dBFD97F6748E524B2, fd396; +fma.rn.f64 fd401, fd117, 0d3FDD71B4A0C5A6C8, fd397; +fma.rn.f64 fd402, fd120, 0dBFEC698E42F47B09, fd398; +fma.rn.f64 fd403, fd743, 0d3FDD71B4A0C5A6C8, fd399; +fma.rn.f64 fd404, fd119, 0dBFEC698E42F47B09, fd400; +fma.rn.f64 fd405, fd121, 0d3FEB57675CF309EE, fd401; +fma.rn.f64 fd406, fd124, 0d3FE0A06E851DB7CA, fd402; +fma.rn.f64 fd407, fd741, 0d3FEB57675CF309EE, fd403; +fma.rn.f64 fd408, fd123, 0d3FE0A06E851DB7CA, fd404; +fma.rn.f64 fd409, fd125, 0dBFE2742A4A775CFB, fd405; +fma.rn.f64 fd410, fd128, 0d3FEA249E0B897CA9, fd406; +fma.rn.f64 fd411, fd739, 0dBFE2742A4A775CFB, fd407; +fma.rn.f64 fd412, fd127, 0d3FEA249E0B897CA9, fd408; +fma.rn.f64 fd413, fd129, 0dBFE8D2A07C16D46F, fd409; +fma.rn.f64 fd414, fd132, 0dBFE431DF5838F7EF, fd410; +fma.rn.f64 fd415, fd736, 0dBFE8D2A07C16D46F, fd411; +fma.rn.f64 fd416, fd131, 0dBFE431DF5838F7EF, fd412; +fma.rn.f64 fd417, fd133, 0d3FE5D779B07CFEF7, fd413; +fma.rn.f64 fd418, fd136, 0dBFE763021AAA15DA, fd414; +fma.rn.f64 fd419, fd734, 0d3FE5D779B07CFEF7, fd415; +fma.rn.f64 fd420, fd135, 0dBFE763021AAA15DA, fd416; +fma.rn.f64 fd421, fd93, 0dBFD56EAAE597C776, %46; +fma.rn.f64 fd425, fd97, 0dBFE8D2A07C16D46F, fd421; +fma.rn.f64 fd714, fd96, 0d3FEE270060999288, 0d0000000000000000; +fma.rn.f64 fd426, fd100, 0dBFE431DF5838F7EF, fd714; +fma.rn.f64 fd713, fd757, 0dBFD56EAAE597C776, %47; +fma.rn.f64 fd427, fd755, 0dBFE8D2A07C16D46F, fd713; +fma.rn.f64 fd712, fd95, 0d3FEE270060999288, 0d0000000000000000; +fma.rn.f64 fd428, fd99, 0dBFE431DF5838F7EF, fd712; +fma.rn.f64 fd429, fd101, 0d3FEB57675CF309EE, fd425; +fma.rn.f64 fd430, fd104, 0dBFE0A06E851DB7CA, fd426; +fma.rn.f64 fd431, fd753, 0d3FEB57675CF309EE, fd427; +fma.rn.f64 fd432, fd103, 0dBFE0A06E851DB7CA, fd428; +fma.rn.f64 fd433, fd105, 0d3FCA0AD8BD1E2882, fd429; +fma.rn.f64 fd434, fd108, 0d3FEF54A827142577, fd430; +fma.rn.f64 fd435, fd750, 0d3FCA0AD8BD1E2882, fd431; +fma.rn.f64 fd436, fd107, 0d3FEF54A827142577, fd432; +fma.rn.f64 
fd437, fd109, 0dBFEFB3B3035AA6CD, fd433; +fma.rn.f64 fd438, fd112, 0dBFC16DE8A4564F0A, fd434; +fma.rn.f64 fd439, fd748, 0dBFEFB3B3035AA6CD, fd435; +fma.rn.f64 fd440, fd111, 0dBFC16DE8A4564F0A, fd436; +fma.rn.f64 fd441, fd113, 0d3FDD71B4A0C5A6C8, fd437; +fma.rn.f64 fd442, fd116, 0dBFEC698E42F47B09, fd438; +fma.rn.f64 fd443, fd746, 0d3FDD71B4A0C5A6C8, fd439; +fma.rn.f64 fd444, fd115, 0dBFEC698E42F47B09, fd440; +fma.rn.f64 fd445, fd117, 0d3FE5D779B07CFEF7, fd441; +fma.rn.f64 fd446, fd120, 0d3FE763021AAA15DA, fd442; +fma.rn.f64 fd447, fd743, 0d3FE5D779B07CFEF7, fd443; +fma.rn.f64 fd448, fd119, 0d3FE763021AAA15DA, fd444; +fma.rn.f64 fd449, fd121, 0dBFED59CB83EF99BC, fd445; +fma.rn.f64 fd450, fd124, 0d3FD97F6748E524B2, fd446; +fma.rn.f64 fd451, fd741, 0dBFED59CB83EF99BC, fd447; +fma.rn.f64 fd452, fd123, 0d3FD97F6748E524B2, fd448; +fma.rn.f64 fd453, fd125, 0dBFB17855B599F3B9, fd449; +fma.rn.f64 fd454, fd128, 0dBFEFECE70DFD3EFB, fd450; +fma.rn.f64 fd455, fd739, 0dBFB17855B599F3B9, fd451; +fma.rn.f64 fd456, fd127, 0dBFEFECE70DFD3EFB, fd452; +fma.rn.f64 fd457, fd129, 0d3FEED037EA3D2DBB, fd453; +fma.rn.f64 fd458, fd132, 0d3FD14459AD2BE466, fd454; +fma.rn.f64 fd459, fd736, 0d3FEED037EA3D2DBB, fd455; +fma.rn.f64 fd460, fd131, 0d3FD14459AD2BE466, fd456; +fma.rn.f64 fd461, fd133, 0dBFE2742A4A775CFB, fd457; +fma.rn.f64 fd462, fd136, 0d3FEA249E0B897CA9, fd458; +fma.rn.f64 fd463, fd734, 0dBFE2742A4A775CFB, fd459; +fma.rn.f64 fd464, fd135, 0d3FEA249E0B897CA9, fd460; +fma.rn.f64 fd465, fd93, 0dBFE2742A4A775CFB, %46; +fma.rn.f64 fd469, fd97, 0dBFD56EAAE597C776, fd465; +fma.rn.f64 fd711, fd96, 0d3FEA249E0B897CA9, 0d0000000000000000; +fma.rn.f64 fd470, fd100, 0dBFEE270060999288, fd711; +fma.rn.f64 fd710, fd757, 0dBFE2742A4A775CFB, %47; +fma.rn.f64 fd471, fd755, 0dBFD56EAAE597C776, fd710; +fma.rn.f64 fd709, fd95, 0d3FEA249E0B897CA9, 0d0000000000000000; +fma.rn.f64 fd472, fd99, 0dBFEE270060999288, fd709; +fma.rn.f64 fd473, fd101, 0d3FEED037EA3D2DBB, fd469; +fma.rn.f64 fd474, fd104, 
0d3FD14459AD2BE466, fd470; +fma.rn.f64 fd475, fd753, 0d3FEED037EA3D2DBB, fd471; +fma.rn.f64 fd476, fd103, 0d3FD14459AD2BE466, fd472; +fma.rn.f64 fd477, fd105, 0dBFE8D2A07C16D46F, fd473; +fma.rn.f64 fd478, fd108, 0d3FE431DF5838F7EF, fd474; +fma.rn.f64 fd479, fd750, 0dBFE8D2A07C16D46F, fd475; +fma.rn.f64 fd480, fd107, 0d3FE431DF5838F7EF, fd476; +fma.rn.f64 fd481, fd109, 0dBFB17855B599F3B9, fd477; +fma.rn.f64 fd482, fd112, 0dBFEFECE70DFD3EFB, fd478; +fma.rn.f64 fd483, fd748, 0dBFB17855B599F3B9, fd479; +fma.rn.f64 fd484, fd111, 0dBFEFECE70DFD3EFB, fd480; +fma.rn.f64 fd485, fd113, 0d3FEB57675CF309EE, fd481; +fma.rn.f64 fd486, fd116, 0d3FE0A06E851DB7CA, fd482; +fma.rn.f64 fd487, fd746, 0d3FEB57675CF309EE, fd483; +fma.rn.f64 fd488, fd115, 0d3FE0A06E851DB7CA, fd484; +fma.rn.f64 fd489, fd117, 0dBFED59CB83EF99BC, fd485; +fma.rn.f64 fd490, fd120, 0d3FD97F6748E524B2, fd486; +fma.rn.f64 fd491, fd743, 0dBFED59CB83EF99BC, fd487; +fma.rn.f64 fd492, fd119, 0d3FD97F6748E524B2, fd488; +fma.rn.f64 fd493, fd121, 0d3FCA0AD8BD1E2882, fd489; +fma.rn.f64 fd494, fd124, 0dBFEF54A827142577, fd490; +fma.rn.f64 fd495, fd741, 0d3FCA0AD8BD1E2882, fd491; +fma.rn.f64 fd496, fd123, 0dBFEF54A827142577, fd492; +fma.rn.f64 fd497, fd125, 0d3FE5D779B07CFEF7, fd493; +fma.rn.f64 fd498, fd128, 0d3FE763021AAA15DA, fd494; +fma.rn.f64 fd499, fd739, 0d3FE5D779B07CFEF7, fd495; +fma.rn.f64 fd500, fd127, 0d3FE763021AAA15DA, fd496; +fma.rn.f64 fd501, fd129, 0dBFEFB3B3035AA6CD, fd497; +fma.rn.f64 fd502, fd132, 0d3FC16DE8A4564F0A, fd498; +fma.rn.f64 fd503, fd736, 0dBFEFB3B3035AA6CD, fd499; +fma.rn.f64 fd504, fd131, 0d3FC16DE8A4564F0A, fd500; +fma.rn.f64 fd505, fd133, 0d3FDD71B4A0C5A6C8, fd501; +fma.rn.f64 fd506, fd136, 0dBFEC698E42F47B09, fd502; +fma.rn.f64 fd507, fd734, 0d3FDD71B4A0C5A6C8, fd503; +fma.rn.f64 fd508, fd135, 0dBFEC698E42F47B09, fd504; +fma.rn.f64 fd509, fd93, 0dBFE8D2A07C16D46F, %46; +fma.rn.f64 fd513, fd97, 0d3FCA0AD8BD1E2882, fd509; +fma.rn.f64 fd708, fd96, 0d3FE431DF5838F7EF, 0d0000000000000000; 
+fma.rn.f64 fd514, fd100, 0dBFEF54A827142577, fd708; +fma.rn.f64 fd707, fd757, 0dBFE8D2A07C16D46F, %47; +fma.rn.f64 fd515, fd755, 0d3FCA0AD8BD1E2882, fd707; +fma.rn.f64 fd706, fd95, 0d3FE431DF5838F7EF, 0d0000000000000000; +fma.rn.f64 fd516, fd99, 0dBFEF54A827142577, fd706; +fma.rn.f64 fd517, fd101, 0d3FDD71B4A0C5A6C8, fd513; +fma.rn.f64 fd518, fd104, 0d3FEC698E42F47B09, fd514; +fma.rn.f64 fd519, fd753, 0d3FDD71B4A0C5A6C8, fd515; +fma.rn.f64 fd520, fd103, 0d3FEC698E42F47B09, fd516; +fma.rn.f64 fd521, fd105, 0dBFED59CB83EF99BC, fd517; +fma.rn.f64 fd522, fd108, 0dBFD97F6748E524B2, fd518; +fma.rn.f64 fd523, fd750, 0dBFED59CB83EF99BC, fd519; +fma.rn.f64 fd524, fd107, 0dBFD97F6748E524B2, fd520; +fma.rn.f64 fd525, fd109, 0d3FEED037EA3D2DBB, fd521; +fma.rn.f64 fd526, fd112, 0dBFD14459AD2BE466, fd522; +fma.rn.f64 fd527, fd748, 0d3FEED037EA3D2DBB, fd523; +fma.rn.f64 fd528, fd111, 0dBFD14459AD2BE466, fd524; +fma.rn.f64 fd529, fd113, 0dBFE2742A4A775CFB, fd525; +fma.rn.f64 fd530, fd116, 0d3FEA249E0B897CA9, fd526; +fma.rn.f64 fd531, fd746, 0dBFE2742A4A775CFB, fd527; +fma.rn.f64 fd532, fd115, 0d3FEA249E0B897CA9, fd528; +fma.rn.f64 fd533, fd117, 0dBFB17855B599F3B9, fd529; +fma.rn.f64 fd534, fd120, 0dBFEFECE70DFD3EFB, fd530; +fma.rn.f64 fd535, fd743, 0dBFB17855B599F3B9, fd531; +fma.rn.f64 fd536, fd119, 0dBFEFECE70DFD3EFB, fd532; +fma.rn.f64 fd537, fd121, 0d3FE5D779B07CFEF7, fd533; +fma.rn.f64 fd538, fd124, 0d3FE763021AAA15DA, fd534; +fma.rn.f64 fd539, fd741, 0d3FE5D779B07CFEF7, fd535; +fma.rn.f64 fd540, fd123, 0d3FE763021AAA15DA, fd536; +fma.rn.f64 fd541, fd125, 0dBFEFB3B3035AA6CD, fd537; +fma.rn.f64 fd542, fd128, 0dBFC16DE8A4564F0A, fd538; +fma.rn.f64 fd543, fd739, 0dBFEFB3B3035AA6CD, fd539; +fma.rn.f64 fd544, fd127, 0dBFC16DE8A4564F0A, fd540; +fma.rn.f64 fd545, fd129, 0d3FEB57675CF309EE, fd541; +fma.rn.f64 fd546, fd132, 0dBFE0A06E851DB7CA, fd542; +fma.rn.f64 fd547, fd736, 0d3FEB57675CF309EE, fd543; +fma.rn.f64 fd548, fd131, 0dBFE0A06E851DB7CA, fd544; +fma.rn.f64 fd549, fd133, 
0dBFD56EAAE597C776, fd545; +fma.rn.f64 fd550, fd136, 0d3FEE270060999288, fd546; +fma.rn.f64 fd551, fd734, 0dBFD56EAAE597C776, fd547; +fma.rn.f64 fd552, fd135, 0d3FEE270060999288, fd548; +fma.rn.f64 fd553, fd93, 0dBFED59CB83EF99BC, %46; +fma.rn.f64 fd557, fd97, 0d3FE5D779B07CFEF7, fd553; +fma.rn.f64 fd705, fd96, 0d3FD97F6748E524B2, 0d0000000000000000; +fma.rn.f64 fd558, fd100, 0dBFE763021AAA15DA, fd705; +fma.rn.f64 fd704, fd757, 0dBFED59CB83EF99BC, %47; +fma.rn.f64 fd559, fd755, 0d3FE5D779B07CFEF7, fd704; +fma.rn.f64 fd703, fd95, 0d3FD97F6748E524B2, 0d0000000000000000; +fma.rn.f64 fd560, fd99, 0dBFE763021AAA15DA, fd703; +fma.rn.f64 fd561, fd101, 0dBFD56EAAE597C776, fd557; +fma.rn.f64 fd562, fd104, 0d3FEE270060999288, fd558; +fma.rn.f64 fd563, fd753, 0dBFD56EAAE597C776, fd559; +fma.rn.f64 fd564, fd103, 0d3FEE270060999288, fd560; +fma.rn.f64 fd565, fd105, 0dBFB17855B599F3B9, fd561; +fma.rn.f64 fd566, fd108, 0dBFEFECE70DFD3EFB, fd562; +fma.rn.f64 fd567, fd750, 0dBFB17855B599F3B9, fd563; +fma.rn.f64 fd568, fd107, 0dBFEFECE70DFD3EFB, fd564; +fma.rn.f64 fd569, fd109, 0d3FDD71B4A0C5A6C8, fd565; +fma.rn.f64 fd570, fd112, 0d3FEC698E42F47B09, fd566; +fma.rn.f64 fd571, fd748, 0d3FDD71B4A0C5A6C8, fd567; +fma.rn.f64 fd572, fd111, 0d3FEC698E42F47B09, fd568; +fma.rn.f64 fd573, fd113, 0dBFE8D2A07C16D46F, fd569; +fma.rn.f64 fd574, fd116, 0dBFE431DF5838F7EF, fd570; +fma.rn.f64 fd575, fd746, 0dBFE8D2A07C16D46F, fd571; +fma.rn.f64 fd576, fd115, 0dBFE431DF5838F7EF, fd572; +fma.rn.f64 fd577, fd117, 0d3FEED037EA3D2DBB, fd573; +fma.rn.f64 fd578, fd120, 0d3FD14459AD2BE466, fd574; +fma.rn.f64 fd579, fd743, 0d3FEED037EA3D2DBB, fd575; +fma.rn.f64 fd580, fd119, 0d3FD14459AD2BE466, fd576; +fma.rn.f64 fd581, fd121, 0dBFEFB3B3035AA6CD, fd577; +fma.rn.f64 fd582, fd124, 0d3FC16DE8A4564F0A, fd578; +fma.rn.f64 fd583, fd741, 0dBFEFB3B3035AA6CD, fd579; +fma.rn.f64 fd584, fd123, 0d3FC16DE8A4564F0A, fd580; +fma.rn.f64 fd585, fd125, 0d3FEB57675CF309EE, fd581; +fma.rn.f64 fd586, fd128, 0dBFE0A06E851DB7CA, 
fd582; +fma.rn.f64 fd587, fd739, 0d3FEB57675CF309EE, fd583; +fma.rn.f64 fd588, fd127, 0dBFE0A06E851DB7CA, fd584; +fma.rn.f64 fd589, fd129, 0dBFE2742A4A775CFB, fd585; +fma.rn.f64 fd590, fd132, 0d3FEA249E0B897CA9, fd586; +fma.rn.f64 fd591, fd736, 0dBFE2742A4A775CFB, fd587; +fma.rn.f64 fd592, fd131, 0d3FEA249E0B897CA9, fd588; +fma.rn.f64 fd593, fd133, 0d3FCA0AD8BD1E2882, fd589; +fma.rn.f64 fd594, fd136, 0dBFEF54A827142577, fd590; +fma.rn.f64 fd595, fd734, 0d3FCA0AD8BD1E2882, fd591; +fma.rn.f64 fd596, fd135, 0dBFEF54A827142577, fd592; +fma.rn.f64 fd597, fd93, 0dBFEFB3B3035AA6CD, %46; +fma.rn.f64 fd598, fd96, 0d3FC16DE8A4564F0A, 0d0000000000000000; +fma.rn.f64 fd599, fd757, 0dBFEFB3B3035AA6CD, %47; +fma.rn.f64 fd600, fd95, 0d3FC16DE8A4564F0A, 0d0000000000000000; +fma.rn.f64 fd601, fd97, 0d3FEED037EA3D2DBB, fd597; +fma.rn.f64 fd602, fd100, 0dBFD14459AD2BE466, fd598; +fma.rn.f64 fd603, fd755, 0d3FEED037EA3D2DBB, fd599; +fma.rn.f64 fd604, fd99, 0dBFD14459AD2BE466, fd600; +fma.rn.f64 fd605, fd101, 0dBFED59CB83EF99BC, fd601; +fma.rn.f64 fd606, fd104, 0d3FD97F6748E524B2, fd602; +fma.rn.f64 fd607, fd753, 0dBFED59CB83EF99BC, fd603; +fma.rn.f64 fd608, fd103, 0d3FD97F6748E524B2, fd604; +fma.rn.f64 fd609, fd105, 0d3FEB57675CF309EE, fd605; +fma.rn.f64 fd610, fd108, 0dBFE0A06E851DB7CA, fd606; +fma.rn.f64 fd611, fd750, 0d3FEB57675CF309EE, fd607; +fma.rn.f64 fd612, fd107, 0dBFE0A06E851DB7CA, fd608; +fma.rn.f64 fd613, fd109, 0dBFE8D2A07C16D46F, fd609; +fma.rn.f64 fd614, fd112, 0d3FE431DF5838F7EF, fd610; +fma.rn.f64 fd615, fd748, 0dBFE8D2A07C16D46F, fd611; +fma.rn.f64 fd616, fd111, 0d3FE431DF5838F7EF, fd612; +fma.rn.f64 fd617, fd113, 0d3FE5D779B07CFEF7, fd613; +fma.rn.f64 fd618, fd116, 0dBFE763021AAA15DA, fd614; +fma.rn.f64 fd619, fd746, 0d3FE5D779B07CFEF7, fd615; +fma.rn.f64 fd620, fd115, 0dBFE763021AAA15DA, fd616; +fma.rn.f64 fd621, fd117, 0dBFE2742A4A775CFB, fd617; +fma.rn.f64 fd622, fd120, 0d3FEA249E0B897CA9, fd618; +fma.rn.f64 fd623, fd743, 0dBFE2742A4A775CFB, fd619; +fma.rn.f64 
fd624, fd119, 0d3FEA249E0B897CA9, fd620; +fma.rn.f64 fd625, fd121, 0d3FDD71B4A0C5A6C8, fd621; +fma.rn.f64 fd626, fd124, 0dBFEC698E42F47B09, fd622; +fma.rn.f64 fd627, fd741, 0d3FDD71B4A0C5A6C8, fd623; +fma.rn.f64 fd628, fd123, 0dBFEC698E42F47B09, fd624; +fma.rn.f64 fd629, fd125, 0dBFD56EAAE597C776, fd625; +fma.rn.f64 fd630, fd128, 0d3FEE270060999288, fd626; +fma.rn.f64 fd631, fd739, 0dBFD56EAAE597C776, fd627; +fma.rn.f64 fd632, fd127, 0d3FEE270060999288, fd628; +fma.rn.f64 fd633, fd129, 0d3FCA0AD8BD1E2882, fd629; +fma.rn.f64 fd634, fd132, 0dBFEF54A827142577, fd630; +fma.rn.f64 fd635, fd736, 0d3FCA0AD8BD1E2882, fd631; +fma.rn.f64 fd636, fd131, 0dBFEF54A827142577, fd632; +fma.rn.f64 fd637, fd133, 0dBFB17855B599F3B9, fd633; +fma.rn.f64 fd638, fd136, 0d3FEFECE70DFD3EFB, fd634; +fma.rn.f64 fd639, fd734, 0dBFB17855B599F3B9, fd635; +fma.rn.f64 fd640, fd135, 0d3FEFECE70DFD3EFB, fd636; +add.f64 %1, fd156, fd734; +add.f64 %0, fd155, fd133; +sub.f64 %2, fd197, fd198; +add.f64 %3, fd199, fd200; +sub.f64 %4, fd241, fd242; +add.f64 %5, fd243, fd244; +add.f64 %7, fd287, fd288; +sub.f64 %6, fd285, fd286; +add.f64 %9, fd331, fd332; +sub.f64 %8, fd329, fd330; +add.f64 %11, fd375, fd376; +sub.f64 %10, fd373, fd374; +sub.f64 %12, fd417, fd418; +add.f64 %13, fd419, fd420; +sub.f64 %14, fd461, fd462; +add.f64 %15, fd463, fd464; +sub.f64 %16, fd505, fd506; +add.f64 %17, fd507, fd508; +sub.f64 %18, fd549, fd550; +add.f64 %19, fd551, fd552; +add.f64 %21, fd595, fd596; +sub.f64 %20, fd593, fd594; +add.f64 %23, fd639, fd640; +sub.f64 %22, fd637, fd638; +sub.f64 %25, fd639, fd640; +add.f64 %24, fd637, fd638; +sub.f64 %27, fd595, fd596; +add.f64 %26, fd593, fd594; +sub.f64 %29, fd551, fd552; +add.f64 %28, fd549, fd550; +sub.f64 %31, fd507, fd508; +add.f64 %30, fd505, fd506; +sub.f64 %33, fd463, fd464; +add.f64 %32, fd461, fd462; +sub.f64 %35, fd419, fd420; +add.f64 %34, fd417, fd418; +sub.f64 %37, fd375, fd376; +add.f64 %36, fd373, fd374; +sub.f64 %39, fd331, fd332; +add.f64 %38, fd329, fd330; 
+sub.f64 %41, fd287, fd288; +add.f64 %40, fd285, fd286; +sub.f64 %43, fd243, fd244; +add.f64 %42, fd241, fd242; +sub.f64 %45, fd199, fd200; +add.f64 %44, fd197, fd198; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[2].y), "d"(rmem[20].y), "d"(rmem[4].y), "d"(rmem[19].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[16].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[14].y), "d"(rmem[10].y), "d"(rmem[13].y), "d"(rmem[11].y)); +}; + + +#endif 
diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..b32fd7e0141e4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_fwd.hpp.inc @@ -0,0 +1,8155 @@ +#ifndef CUFFTDX_FFT_2401_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_2401_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<922, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<138>; +.reg .b32 r<3494>; +.reg .b64 rd<8>; +mov.u32 r3468, %tid.y; +mov.u32 r3469, %14; +mad.lo.s32 r3470, r3468, 19208, r3469; +mov.u32 r3471, %tid.x; +mov.f32 f126, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1, {low, high}; +} +mov.f32 f128, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2, {low, high}; +} +mov.f32 f114, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r3, {low, high}; +} +mov.f32 f116, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r4, {low, high}; +} +mov.f32 f122, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5, {low, high}; +} +mov.f32 f124, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; +} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} +{ +add.f16x2 r111, %15, r108; +} +{ +add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ +mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 
r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, 
%26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 r465, r462, r4; +} +{ +add.f16x2 r468, r459, r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ 
+add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; +} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ +sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} +{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} +{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r3471, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3472, rd3; +mul.lo.s32 r3473, r3472, 343; +sub.s32 r3474, r3471, 
r3473; +cvt.rn.f32.u32 f129, r3474; +mul.f32 f130, f129, 0f3B2B805B; +cos.approx.f32 f21, f130; +sin.approx.f32 f131, f130; +neg.f32 f22, f131; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f105, 0fBF800000; +mov.f32 f106, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +mad.lo.s32 r3475, r3472, 19208, r3470; +barrier.sync 0; +mad.lo.s32 r3476, r3474, 56, r3475; +st.shared.v2.f32 [r3476], {r30, r48}; +st.shared.v2.f32 [r3476+8], {r711, r718}; +st.shared.v2.f32 [r3476+16], {r748, r755}; +st.shared.v2.f32 [r3476+24], {r785, r792}; +st.shared.v2.f32 [r3476+32], {r822, r829}; +st.shared.v2.f32 [r3476+40], {r859, r866}; +st.shared.v2.f32 [r3476+48], {r896, r903}; +barrier.sync 0; +mad.lo.s32 r3477, r3474, -48, r3476; +ld.shared.u32 r942, [r3477]; +ld.shared.u32 r960, [r3477+4]; +ld.shared.u32 r939, [r3477+2744]; +ld.shared.u32 r957, [r3477+2748]; +ld.shared.u32 r945, [r3477+5488]; +ld.shared.u32 r963, [r3477+5492]; +ld.shared.u32 r951, [r3477+8232]; +ld.shared.u32 r969, [r3477+8236]; +ld.shared.u32 r952, [r3477+10976]; +ld.shared.u32 r970, [r3477+10980]; +ld.shared.u32 r946, [r3477+13720]; +ld.shared.u32 r964, [r3477+13724]; +ld.shared.u32 r940, [r3477+16464]; +ld.shared.u32 r958, [r3477+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r925, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 r953, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ +add.f16x2 r968, r969, r970; +} +{ +add.f16x2 r971, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ +mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 r1025, r998, 
r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 r1079, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, r1127; +} +{ +sub.f16x2 r1133, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; +} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 r1187, r1160, 
r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ +sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, r1289; +} +{ +add.f16x2 r1295, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} +{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 r1349, r1322, 
r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ +mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, r1484, 
r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 r1565, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ +mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, r1613; +} +{ +sub.f16x2 r1619, r1592, r1616; +} +mul.wide.u32 rd4, r3474, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r3478, rd5; +sub.s32 r3479, r3474, r3478; +shr.u32 r3480, r3479, 1; +add.s32 r3481, r3480, r3478; +shr.u32 r3482, r3481, 2; +cvt.rn.f32.u32 f132, r3482; +mul.f32 f133, f132, 0f3C961050; +cos.approx.f32 f57, f133; +sin.approx.f32 f134, f133; +neg.f32 f58, f134; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1622, {low, high}; +} +mul.lo.s32 r3483, r3482, 7; +sub.s32 r3484, r3474, r3483; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1625, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1627, 
{high, high}; +} +{ +mul.f16x2 r1629, r1349, r1627; +} +{ +neg.f16x2 r1632, r1629; +} +{ +fma.rn.f16x2 r1634, r1025, r1625, r1632; +} +{ +mul.f16x2 r1638, r1025, r1627; +} +{ +fma.rn.f16x2 r1641, r1349, r1625, r1638; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1647, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1649, {low, high}; +} +{ +mul.f16x2 r1650, r1647, r1649; +} +{ +mul.f16x2 r1653, r1622, r1645; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1656, {high, low}; +} +{ +fma.rn.f16x2 r1658, r1650, r1656, r1653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1662, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1664, {high, high}; +} +{ +mul.f16x2 r1666, r1457, r1664; +} +{ +neg.f16x2 r1669, r1666; +} +{ +fma.rn.f16x2 r1671, r1133, r1662, r1669; +} +{ +mul.f16x2 r1675, r1133, r1664; +} +{ +fma.rn.f16x2 r1678, r1457, r1662, r1675; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1684, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1686, {low, high}; +} +{ +mul.f16x2 r1687, r1684, r1686; +} +{ +mul.f16x2 r1690, r1658, r1682; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1693, {high, low}; +} +{ +fma.rn.f16x2 r1695, r1687, r1693, r1690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1699, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1701, {high, high}; +} +{ +mul.f16x2 r1703, r1565, r1701; +} +{ +neg.f16x2 r1706, r1703; +} +{ +fma.rn.f16x2 r1708, r1241, r1699, r1706; +} +{ +mul.f16x2 r1712, r1241, r1701; +} +{ +fma.rn.f16x2 r1715, r1565, r1699, r1712; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1622; +mov.b32 r1719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1721, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1723, {low, high}; +} +{ +mul.f16x2 r1724, r1721, r1723; +} +{ +mul.f16x2 r1727, r1695, r1719; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1730, {high, low}; +} +{ +fma.rn.f16x2 r1732, r1724, r1730, r1727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1738, {high, high}; +} +{ +mul.f16x2 r1740, r1619, r1738; +} +{ +neg.f16x2 r1743, r1740; +} +{ +fma.rn.f16x2 r1745, r1295, r1736, r1743; +} +{ +mul.f16x2 r1749, r1295, r1738; +} +{ +fma.rn.f16x2 r1752, r1619, r1736, r1749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1758, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1758, r1760; +} +{ +mul.f16x2 r1764, r1732, r1756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1767, {high, low}; +} +{ +fma.rn.f16x2 r1769, r1761, r1767, r1764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1775, {high, high}; +} +{ +mul.f16x2 r1777, r1511, r1775; +} +{ +neg.f16x2 r1780, r1777; +} +{ +fma.rn.f16x2 r1782, r1187, r1773, r1780; +} +{ +mul.f16x2 r1786, r1187, r1775; +} +{ +fma.rn.f16x2 r1789, r1511, r1773, r1786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1795, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 
r1798, r1795, r1797; +} +{ +mul.f16x2 r1801, r1769, r1793; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1804, {high, low}; +} +{ +fma.rn.f16x2 r1806, r1798, r1804, r1801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1812, {high, high}; +} +{ +mul.f16x2 r1814, r1403, r1812; +} +{ +neg.f16x2 r1817, r1814; +} +{ +fma.rn.f16x2 r1819, r1079, r1810, r1817; +} +{ +mul.f16x2 r1823, r1079, r1812; +} +{ +fma.rn.f16x2 r1826, r1403, r1810, r1823; +} +shl.b32 r3485, r3484, 3; +add.s32 r3486, r3475, r3485; +barrier.sync 0; +mad.lo.s32 r3487, r3482, 392, r3486; +st.shared.u32 [r3487], r953; +st.shared.u32 [r3487+4], r971; +st.shared.u32 [r3487+56], r1634; +st.shared.u32 [r3487+60], r1641; +st.shared.u32 [r3487+112], r1671; +st.shared.u32 [r3487+116], r1678; +st.shared.u32 [r3487+168], r1708; +st.shared.u32 [r3487+172], r1715; +st.shared.u32 [r3487+224], r1745; +st.shared.u32 [r3487+228], r1752; +st.shared.u32 [r3487+280], r1782; +st.shared.u32 [r3487+284], r1789; +st.shared.u32 [r3487+336], r1819; +st.shared.u32 [r3487+340], r1826; +barrier.sync 0; +ld.shared.u32 r1865, [r3477]; +ld.shared.u32 r1883, [r3477+4]; +ld.shared.u32 r1862, [r3477+2744]; +ld.shared.u32 r1880, [r3477+2748]; +ld.shared.u32 r1868, [r3477+5488]; +ld.shared.u32 r1886, [r3477+5492]; +ld.shared.u32 r1874, [r3477+8232]; +ld.shared.u32 r1892, [r3477+8236]; +ld.shared.u32 r1875, [r3477+10976]; +ld.shared.u32 r1893, [r3477+10980]; +ld.shared.u32 r1869, [r3477+13720]; +ld.shared.u32 r1887, [r3477+13724]; +ld.shared.u32 r1863, [r3477+16464]; +ld.shared.u32 r1881, [r3477+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1848, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 
r1849, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1854, {low, high}; +} +{ +neg.f16x2 r1855, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1858, {low, high}; +} +{ +neg.f16x2 r1859, r1858; +} +{ +add.f16x2 r1861, r1862, r1863; +} +{ +add.f16x2 r1864, r1865, r1861; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1864, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 r1876, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1882, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1862, r1863; +} +{ +mul.f16x2 r1900, r1897, r1847; +} +{ +add.f16x2 r1903, r1865, r1900; +} +{ +add.f16x2 r1906, r1868, r1869; +} +{ +mul.f16x2 r1909, r1906, r1849; +} +{ +add.f16x2 r1912, r1903, r1909; +} +{ +add.f16x2 r1915, r1874, r1875; +} +{ +mul.f16x2 r1918, r1915, r1851; +} +{ +add.f16x2 r1921, r1912, r1918; +} +{ +sub.f16x2 r1924, r1880, r1881; +} +{ +mul.f16x2 r1927, r1924, r1848; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1850; +} +{ +add.f16x2 r1936, r1927, r1933; +} +{ +sub.f16x2 r1939, r1892, r1893; +} +{ +mul.f16x2 r1942, r1939, r1852; +} +{ +add.f16x2 r1945, r1936, r1942; +} +{ +sub.f16x2 r1948, r1921, 
r1945; +} +{ +add.f16x2 r1951, r1862, r1863; +} +{ +mul.f16x2 r1954, r1951, r1847; +} +{ +add.f16x2 r1957, r1865, r1954; +} +{ +add.f16x2 r1960, r1868, r1869; +} +{ +mul.f16x2 r1963, r1960, r1849; +} +{ +add.f16x2 r1966, r1957, r1963; +} +{ +add.f16x2 r1969, r1874, r1875; +} +{ +mul.f16x2 r1972, r1969, r1851; +} +{ +add.f16x2 r1975, r1966, r1972; +} +{ +sub.f16x2 r1978, r1880, r1881; +} +{ +mul.f16x2 r1981, r1978, r1848; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1850; +} +{ +add.f16x2 r1990, r1981, r1987; +} +{ +sub.f16x2 r1993, r1892, r1893; +} +{ +mul.f16x2 r1996, r1993, r1852; +} +{ +add.f16x2 r1999, r1990, r1996; +} +{ +add.f16x2 r2002, r1975, r1999; +} +{ +add.f16x2 r2005, r1862, r1863; +} +{ +mul.f16x2 r2008, r2005, r1849; +} +{ +add.f16x2 r2011, r1865, r2008; +} +{ +add.f16x2 r2014, r1868, r1869; +} +{ +mul.f16x2 r2017, r2014, r1853; +} +{ +add.f16x2 r2020, r2011, r2017; +} +{ +add.f16x2 r2023, r1874, r1875; +} +{ +mul.f16x2 r2026, r2023, r1857; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 r2032, r1880, r1881; +} +{ +mul.f16x2 r2035, r2032, r1850; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1855; +} +{ +add.f16x2 r2044, r2035, r2041; +} +{ +sub.f16x2 r2047, r1892, r1893; +} +{ +mul.f16x2 r2050, r2047, r1859; +} +{ +add.f16x2 r2053, r2044, r2050; +} +{ +sub.f16x2 r2056, r2029, r2053; +} +{ +add.f16x2 r2059, r1862, r1863; +} +{ +mul.f16x2 r2062, r2059, r1849; +} +{ +add.f16x2 r2065, r1865, r2062; +} +{ +add.f16x2 r2068, r1868, r1869; +} +{ +mul.f16x2 r2071, r2068, r1853; +} +{ +add.f16x2 r2074, r2065, r2071; +} +{ +add.f16x2 r2077, r1874, r1875; +} +{ +mul.f16x2 r2080, r2077, r1857; +} +{ +add.f16x2 r2083, r2074, r2080; +} +{ +sub.f16x2 r2086, r1880, r1881; +} +{ +mul.f16x2 r2089, r2086, r1850; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1855; +} +{ +add.f16x2 r2098, r2089, r2095; +} +{ +sub.f16x2 r2101, r1892, r1893; +} +{ +mul.f16x2 r2104, r2101, r1859; +} +{ +add.f16x2 
r2107, r2098, r2104; +} +{ +add.f16x2 r2110, r2083, r2107; +} +{ +add.f16x2 r2113, r1862, r1863; +} +{ +mul.f16x2 r2116, r2113, r1851; +} +{ +add.f16x2 r2119, r1865, r2116; +} +{ +add.f16x2 r2122, r1868, r1869; +} +{ +mul.f16x2 r2125, r2122, r1857; +} +{ +add.f16x2 r2128, r2119, r2125; +} +{ +add.f16x2 r2131, r1874, r1875; +} +{ +mul.f16x2 r2134, r2131, r1849; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 r2140, r1880, r1881; +} +{ +mul.f16x2 r2143, r2140, r1852; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1859; +} +{ +add.f16x2 r2152, r2143, r2149; +} +{ +sub.f16x2 r2155, r1892, r1893; +} +{ +mul.f16x2 r2158, r2155, r1850; +} +{ +add.f16x2 r2161, r2152, r2158; +} +{ +sub.f16x2 r2164, r2137, r2161; +} +{ +add.f16x2 r2167, r1862, r1863; +} +{ +mul.f16x2 r2170, r2167, r1851; +} +{ +add.f16x2 r2173, r1865, r2170; +} +{ +add.f16x2 r2176, r1868, r1869; +} +{ +mul.f16x2 r2179, r2176, r1857; +} +{ +add.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r1874, r1875; +} +{ +mul.f16x2 r2188, r2185, r1849; +} +{ +add.f16x2 r2191, r2182, r2188; +} +{ +sub.f16x2 r2194, r1880, r1881; +} +{ +mul.f16x2 r2197, r2194, r1852; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1859; +} +{ +add.f16x2 r2206, r2197, r2203; +} +{ +sub.f16x2 r2209, r1892, r1893; +} +{ +mul.f16x2 r2212, r2209, r1850; +} +{ +add.f16x2 r2215, r2206, r2212; +} +{ +add.f16x2 r2218, r2191, r2215; +} +{ +add.f16x2 r2221, r1880, r1881; +} +{ +mul.f16x2 r2224, r2221, r1847; +} +{ +add.f16x2 r2227, r1883, r2224; +} +{ +add.f16x2 r2230, r1886, r1887; +} +{ +mul.f16x2 r2233, r2230, r1849; +} +{ +add.f16x2 r2236, r2227, r2233; +} +{ +add.f16x2 r2239, r1892, r1893; +} +{ +mul.f16x2 r2242, r2239, r1851; +} +{ +add.f16x2 r2245, r2236, r2242; +} +{ +sub.f16x2 r2248, r1862, r1863; +} +{ +mul.f16x2 r2251, r2248, r1848; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1850; +} +{ +add.f16x2 r2260, r2251, r2257; +} +{ +sub.f16x2 r2263, r1874, r1875; +} 
+{ +mul.f16x2 r2266, r2263, r1852; +} +{ +add.f16x2 r2269, r2260, r2266; +} +{ +add.f16x2 r2272, r2245, r2269; +} +{ +add.f16x2 r2275, r1880, r1881; +} +{ +mul.f16x2 r2278, r2275, r1847; +} +{ +add.f16x2 r2281, r1883, r2278; +} +{ +add.f16x2 r2284, r1886, r1887; +} +{ +mul.f16x2 r2287, r2284, r1849; +} +{ +add.f16x2 r2290, r2281, r2287; +} +{ +add.f16x2 r2293, r1892, r1893; +} +{ +mul.f16x2 r2296, r2293, r1851; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r1862, r1863; +} +{ +mul.f16x2 r2305, r2302, r1848; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1850; +} +{ +add.f16x2 r2314, r2305, r2311; +} +{ +sub.f16x2 r2317, r1874, r1875; +} +{ +mul.f16x2 r2320, r2317, r1852; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 r2326, r2299, r2323; +} +{ +add.f16x2 r2329, r1880, r1881; +} +{ +mul.f16x2 r2332, r2329, r1849; +} +{ +add.f16x2 r2335, r1883, r2332; +} +{ +add.f16x2 r2338, r1886, r1887; +} +{ +mul.f16x2 r2341, r2338, r1853; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +add.f16x2 r2347, r1892, r1893; +} +{ +mul.f16x2 r2350, r2347, r1857; +} +{ +add.f16x2 r2353, r2344, r2350; +} +{ +sub.f16x2 r2356, r1862, r1863; +} +{ +mul.f16x2 r2359, r2356, r1850; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1855; +} +{ +add.f16x2 r2368, r2359, r2365; +} +{ +sub.f16x2 r2371, r1874, r1875; +} +{ +mul.f16x2 r2374, r2371, r1859; +} +{ +add.f16x2 r2377, r2368, r2374; +} +{ +add.f16x2 r2380, r2353, r2377; +} +{ +add.f16x2 r2383, r1880, r1881; +} +{ +mul.f16x2 r2386, r2383, r1849; +} +{ +add.f16x2 r2389, r1883, r2386; +} +{ +add.f16x2 r2392, r1886, r1887; +} +{ +mul.f16x2 r2395, r2392, r1853; +} +{ +add.f16x2 r2398, r2389, r2395; +} +{ +add.f16x2 r2401, r1892, r1893; +} +{ +mul.f16x2 r2404, r2401, r1857; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +sub.f16x2 r2410, r1862, r1863; +} +{ +mul.f16x2 r2413, r2410, r1850; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1855; +} +{ +add.f16x2 r2422, 
r2413, r2419; +} +{ +sub.f16x2 r2425, r1874, r1875; +} +{ +mul.f16x2 r2428, r2425, r1859; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +sub.f16x2 r2434, r2407, r2431; +} +{ +add.f16x2 r2437, r1880, r1881; +} +{ +mul.f16x2 r2440, r2437, r1851; +} +{ +add.f16x2 r2443, r1883, r2440; +} +{ +add.f16x2 r2446, r1886, r1887; +} +{ +mul.f16x2 r2449, r2446, r1857; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +add.f16x2 r2455, r1892, r1893; +} +{ +mul.f16x2 r2458, r2455, r1849; +} +{ +add.f16x2 r2461, r2452, r2458; +} +{ +sub.f16x2 r2464, r1862, r1863; +} +{ +mul.f16x2 r2467, r2464, r1852; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1859; +} +{ +add.f16x2 r2476, r2467, r2473; +} +{ +sub.f16x2 r2479, r1874, r1875; +} +{ +mul.f16x2 r2482, r2479, r1850; +} +{ +add.f16x2 r2485, r2476, r2482; +} +{ +add.f16x2 r2488, r2461, r2485; +} +{ +add.f16x2 r2491, r1880, r1881; +} +{ +mul.f16x2 r2494, r2491, r1851; +} +{ +add.f16x2 r2497, r1883, r2494; +} +{ +add.f16x2 r2500, r1886, r1887; +} +{ +mul.f16x2 r2503, r2500, r1857; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1892, r1893; +} +{ +mul.f16x2 r2512, r2509, r1849; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +sub.f16x2 r2518, r1862, r1863; +} +{ +mul.f16x2 r2521, r2518, r1852; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1859; +} +{ +add.f16x2 r2530, r2521, r2527; +} +{ +sub.f16x2 r2533, r1874, r1875; +} +{ +mul.f16x2 r2536, r2533, r1850; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2515, r2539; +} +mul.wide.u32 rd6, r3474, 1402438301; +shr.u64 rd7, rd6, 36; +cvt.u32.u64 r3488, rd7; +cvt.rn.f32.u32 f135, r3488; +mul.f32 f136, f135, 0f3E034E46; +cos.approx.f32 f93, f136; +sin.approx.f32 f137, f136; +neg.f32 f94, f137; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r2545, {low, high}; +} +mul.lo.s32 r3489, r3488, 49; +sub.s32 r3490, r3474, r3489; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2548, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2550, {high, high}; +} +{ +mul.f16x2 r2552, r2272, r2550; +} +{ +neg.f16x2 r2555, r2552; +} +{ +fma.rn.f16x2 r2557, r1948, r2548, r2555; +} +{ +mul.f16x2 r2561, r1948, r2550; +} +{ +fma.rn.f16x2 r2564, r2272, r2548, r2561; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2568, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2570, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2572, {low, high}; +} +{ +mul.f16x2 r2573, r2570, r2572; +} +{ +mul.f16x2 r2576, r2545, r2568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2579, {high, low}; +} +{ +fma.rn.f16x2 r2581, r2573, r2579, r2576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2587, {high, high}; +} +{ +mul.f16x2 r2589, r2380, r2587; +} +{ +neg.f16x2 r2592, r2589; +} +{ +fma.rn.f16x2 r2594, r2056, r2585, r2592; +} +{ +mul.f16x2 r2598, r2056, r2587; +} +{ +fma.rn.f16x2 r2601, r2380, r2585, r2598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2605, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2607, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2609, {low, high}; +} +{ +mul.f16x2 r2610, r2607, r2609; +} +{ +mul.f16x2 r2613, r2581, r2605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2616, {high, low}; +} +{ +fma.rn.f16x2 r2618, r2610, r2616, r2613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2624, {high, high}; +} +{ +mul.f16x2 r2626, r2488, r2624; +} +{ +neg.f16x2 r2629, r2626; +} +{ +fma.rn.f16x2 r2631, r2164, r2622, r2629; +} +{ +mul.f16x2 r2635, r2164, r2624; +} +{ 
+fma.rn.f16x2 r2638, r2488, r2622, r2635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2642, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2644, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2646, {low, high}; +} +{ +mul.f16x2 r2647, r2644, r2646; +} +{ +mul.f16x2 r2650, r2618, r2642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2653, {high, low}; +} +{ +fma.rn.f16x2 r2655, r2647, r2653, r2650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2661, {high, high}; +} +{ +mul.f16x2 r2663, r2542, r2661; +} +{ +neg.f16x2 r2666, r2663; +} +{ +fma.rn.f16x2 r2668, r2218, r2659, r2666; +} +{ +mul.f16x2 r2672, r2218, r2661; +} +{ +fma.rn.f16x2 r2675, r2542, r2659, r2672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2679, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2681, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2683, {low, high}; +} +{ +mul.f16x2 r2684, r2681, r2683; +} +{ +mul.f16x2 r2687, r2655, r2679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2690, {high, low}; +} +{ +fma.rn.f16x2 r2692, r2684, r2690, r2687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2698, {high, high}; +} +{ +mul.f16x2 r2700, r2434, r2698; +} +{ +neg.f16x2 r2703, r2700; +} +{ +fma.rn.f16x2 r2705, r2110, r2696, r2703; +} +{ +mul.f16x2 r2709, r2110, r2698; +} +{ +fma.rn.f16x2 r2712, r2434, r2696, r2709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2718, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2720, {low, high}; +} +{ +mul.f16x2 r2721, r2718, r2720; +} +{ +mul.f16x2 r2724, r2692, r2716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2727, {high, low}; +} +{ +fma.rn.f16x2 r2729, r2721, r2727, r2724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2729; +mov.b32 r2733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2729; +mov.b32 r2735, {high, high}; +} +{ +mul.f16x2 r2737, r2326, r2735; +} +{ +neg.f16x2 r2740, r2737; +} +{ +fma.rn.f16x2 r2742, r2002, r2733, r2740; +} +{ +mul.f16x2 r2746, r2002, r2735; +} +{ +fma.rn.f16x2 r2749, r2326, r2733, r2746; +} +shl.b32 r3491, r3490, 3; +add.s32 r3492, r3475, r3491; +barrier.sync 0; +mad.lo.s32 r3493, r3488, 2744, r3492; +st.shared.u32 [r3493], r1876; +st.shared.u32 [r3493+4], r1894; +st.shared.u32 [r3493+392], r2557; +st.shared.u32 [r3493+396], r2564; +st.shared.u32 [r3493+784], r2594; +st.shared.u32 [r3493+788], r2601; +st.shared.u32 [r3493+1176], r2631; +st.shared.u32 [r3493+1180], r2638; +st.shared.u32 [r3493+1568], r2668; +st.shared.u32 [r3493+1572], r2675; +st.shared.u32 [r3493+1960], r2705; +st.shared.u32 [r3493+1964], r2712; +st.shared.u32 [r3493+2352], r2742; +st.shared.u32 [r3493+2356], r2749; +barrier.sync 0; +ld.shared.u32 r2788, [r3477]; +ld.shared.u32 r2806, [r3477+4]; +ld.shared.u32 r2785, [r3477+2744]; +ld.shared.u32 r2803, [r3477+2748]; +ld.shared.u32 r2791, [r3477+5488]; +ld.shared.u32 r2809, [r3477+5492]; +ld.shared.u32 r2797, [r3477+8232]; +ld.shared.u32 r2815, [r3477+8236]; +ld.shared.u32 r2798, [r3477+10976]; +ld.shared.u32 r2816, [r3477+10980]; +ld.shared.u32 r2792, [r3477+13720]; +ld.shared.u32 r2810, [r3477+13724]; +ld.shared.u32 r2786, [r3477+16464]; +ld.shared.u32 r2804, [r3477+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2771, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r2773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2774, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2777, {low, high}; +} +{ +neg.f16x2 r2778, r2777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2780, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2781, {low, high}; +} +{ +neg.f16x2 r2782, r2781; +} +{ +add.f16x2 r2784, r2785, r2786; +} +{ +add.f16x2 r2787, r2788, r2784; +} +{ +add.f16x2 r2790, r2791, r2792; +} +{ +add.f16x2 r2793, r2787, r2790; +} +{ +add.f16x2 r2796, r2797, r2798; +} +{ +add.f16x2 %0, r2793, r2796; +} +{ +add.f16x2 r2802, r2803, r2804; +} +{ +add.f16x2 r2805, r2806, r2802; +} +{ +add.f16x2 r2808, r2809, r2810; +} +{ +add.f16x2 r2811, r2805, r2808; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 %1, r2811, r2814; +} +{ +add.f16x2 r2820, r2785, r2786; +} +{ +mul.f16x2 r2823, r2820, r2770; +} +{ +add.f16x2 r2826, r2788, r2823; +} +{ +add.f16x2 r2829, r2791, r2792; +} +{ +mul.f16x2 r2832, r2829, r2772; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2797, r2798; +} +{ +mul.f16x2 r2841, r2838, r2774; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r2803, r2804; +} +{ +mul.f16x2 r2850, r2847, r2771; +} +{ +sub.f16x2 r2853, r2809, r2810; +} +{ +mul.f16x2 r2856, r2853, r2773; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2815, r2816; +} +{ +mul.f16x2 
r2865, r2862, r2775; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 %2, r2844, r2868; +} +{ +add.f16x2 r2874, r2785, r2786; +} +{ +mul.f16x2 r2877, r2874, r2770; +} +{ +add.f16x2 r2880, r2788, r2877; +} +{ +add.f16x2 r2883, r2791, r2792; +} +{ +mul.f16x2 r2886, r2883, r2772; +} +{ +add.f16x2 r2889, r2880, r2886; +} +{ +add.f16x2 r2892, r2797, r2798; +} +{ +mul.f16x2 r2895, r2892, r2774; +} +{ +add.f16x2 r2898, r2889, r2895; +} +{ +sub.f16x2 r2901, r2803, r2804; +} +{ +mul.f16x2 r2904, r2901, r2771; +} +{ +sub.f16x2 r2907, r2809, r2810; +} +{ +mul.f16x2 r2910, r2907, r2773; +} +{ +add.f16x2 r2913, r2904, r2910; +} +{ +sub.f16x2 r2916, r2815, r2816; +} +{ +mul.f16x2 r2919, r2916, r2775; +} +{ +add.f16x2 r2922, r2913, r2919; +} +{ +add.f16x2 %12, r2898, r2922; +} +{ +add.f16x2 r2928, r2785, r2786; +} +{ +mul.f16x2 r2931, r2928, r2772; +} +{ +add.f16x2 r2934, r2788, r2931; +} +{ +add.f16x2 r2937, r2791, r2792; +} +{ +mul.f16x2 r2940, r2937, r2776; +} +{ +add.f16x2 r2943, r2934, r2940; +} +{ +add.f16x2 r2946, r2797, r2798; +} +{ +mul.f16x2 r2949, r2946, r2780; +} +{ +add.f16x2 r2952, r2943, r2949; +} +{ +sub.f16x2 r2955, r2803, r2804; +} +{ +mul.f16x2 r2958, r2955, r2773; +} +{ +sub.f16x2 r2961, r2809, r2810; +} +{ +mul.f16x2 r2964, r2961, r2778; +} +{ +add.f16x2 r2967, r2958, r2964; +} +{ +sub.f16x2 r2970, r2815, r2816; +} +{ +mul.f16x2 r2973, r2970, r2782; +} +{ +add.f16x2 r2976, r2967, r2973; +} +{ +sub.f16x2 %4, r2952, r2976; +} +{ +add.f16x2 r2982, r2785, r2786; +} +{ +mul.f16x2 r2985, r2982, r2772; +} +{ +add.f16x2 r2988, r2788, r2985; +} +{ +add.f16x2 r2991, r2791, r2792; +} +{ +mul.f16x2 r2994, r2991, r2776; +} +{ +add.f16x2 r2997, r2988, r2994; +} +{ +add.f16x2 r3000, r2797, r2798; +} +{ +mul.f16x2 r3003, r3000, r2780; +} +{ +add.f16x2 r3006, r2997, r3003; +} +{ +sub.f16x2 r3009, r2803, r2804; +} +{ +mul.f16x2 r3012, r3009, r2773; +} +{ +sub.f16x2 r3015, r2809, r2810; +} +{ +mul.f16x2 r3018, r3015, r2778; +} +{ +add.f16x2 r3021, r3012, r3018; +} +{ 
+sub.f16x2 r3024, r2815, r2816; +} +{ +mul.f16x2 r3027, r3024, r2782; +} +{ +add.f16x2 r3030, r3021, r3027; +} +{ +add.f16x2 %10, r3006, r3030; +} +{ +add.f16x2 r3036, r2785, r2786; +} +{ +mul.f16x2 r3039, r3036, r2774; +} +{ +add.f16x2 r3042, r2788, r3039; +} +{ +add.f16x2 r3045, r2791, r2792; +} +{ +mul.f16x2 r3048, r3045, r2780; +} +{ +add.f16x2 r3051, r3042, r3048; +} +{ +add.f16x2 r3054, r2797, r2798; +} +{ +mul.f16x2 r3057, r3054, r2772; +} +{ +add.f16x2 r3060, r3051, r3057; +} +{ +sub.f16x2 r3063, r2803, r2804; +} +{ +mul.f16x2 r3066, r3063, r2775; +} +{ +sub.f16x2 r3069, r2809, r2810; +} +{ +mul.f16x2 r3072, r3069, r2782; +} +{ +add.f16x2 r3075, r3066, r3072; +} +{ +sub.f16x2 r3078, r2815, r2816; +} +{ +mul.f16x2 r3081, r3078, r2773; +} +{ +add.f16x2 r3084, r3075, r3081; +} +{ +sub.f16x2 %6, r3060, r3084; +} +{ +add.f16x2 r3090, r2785, r2786; +} +{ +mul.f16x2 r3093, r3090, r2774; +} +{ +add.f16x2 r3096, r2788, r3093; +} +{ +add.f16x2 r3099, r2791, r2792; +} +{ +mul.f16x2 r3102, r3099, r2780; +} +{ +add.f16x2 r3105, r3096, r3102; +} +{ +add.f16x2 r3108, r2797, r2798; +} +{ +mul.f16x2 r3111, r3108, r2772; +} +{ +add.f16x2 r3114, r3105, r3111; +} +{ +sub.f16x2 r3117, r2803, r2804; +} +{ +mul.f16x2 r3120, r3117, r2775; +} +{ +sub.f16x2 r3123, r2809, r2810; +} +{ +mul.f16x2 r3126, r3123, r2782; +} +{ +add.f16x2 r3129, r3120, r3126; +} +{ +sub.f16x2 r3132, r2815, r2816; +} +{ +mul.f16x2 r3135, r3132, r2773; +} +{ +add.f16x2 r3138, r3129, r3135; +} +{ +add.f16x2 %8, r3114, r3138; +} +{ +add.f16x2 r3144, r2803, r2804; +} +{ +mul.f16x2 r3147, r3144, r2770; +} +{ +add.f16x2 r3150, r2806, r3147; +} +{ +add.f16x2 r3153, r2809, r2810; +} +{ +mul.f16x2 r3156, r3153, r2772; +} +{ +add.f16x2 r3159, r3150, r3156; +} +{ +add.f16x2 r3162, r2815, r2816; +} +{ +mul.f16x2 r3165, r3162, r2774; +} +{ +add.f16x2 r3168, r3159, r3165; +} +{ +sub.f16x2 r3171, r2785, r2786; +} +{ +mul.f16x2 r3174, r3171, r2771; +} +{ +sub.f16x2 r3177, r2791, r2792; +} +{ +mul.f16x2 r3180, r3177, r2773; 
+} +{ +add.f16x2 r3183, r3174, r3180; +} +{ +sub.f16x2 r3186, r2797, r2798; +} +{ +mul.f16x2 r3189, r3186, r2775; +} +{ +add.f16x2 r3192, r3183, r3189; +} +{ +add.f16x2 %3, r3168, r3192; +} +{ +add.f16x2 r3198, r2803, r2804; +} +{ +mul.f16x2 r3201, r3198, r2770; +} +{ +add.f16x2 r3204, r2806, r3201; +} +{ +add.f16x2 r3207, r2809, r2810; +} +{ +mul.f16x2 r3210, r3207, r2772; +} +{ +add.f16x2 r3213, r3204, r3210; +} +{ +add.f16x2 r3216, r2815, r2816; +} +{ +mul.f16x2 r3219, r3216, r2774; +} +{ +add.f16x2 r3222, r3213, r3219; +} +{ +sub.f16x2 r3225, r2785, r2786; +} +{ +mul.f16x2 r3228, r3225, r2771; +} +{ +sub.f16x2 r3231, r2791, r2792; +} +{ +mul.f16x2 r3234, r3231, r2773; +} +{ +add.f16x2 r3237, r3228, r3234; +} +{ +sub.f16x2 r3240, r2797, r2798; +} +{ +mul.f16x2 r3243, r3240, r2775; +} +{ +add.f16x2 r3246, r3237, r3243; +} +{ +sub.f16x2 %13, r3222, r3246; +} +{ +add.f16x2 r3252, r2803, r2804; +} +{ +mul.f16x2 r3255, r3252, r2772; +} +{ +add.f16x2 r3258, r2806, r3255; +} +{ +add.f16x2 r3261, r2809, r2810; +} +{ +mul.f16x2 r3264, r3261, r2776; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +add.f16x2 r3270, r2815, r2816; +} +{ +mul.f16x2 r3273, r3270, r2780; +} +{ +add.f16x2 r3276, r3267, r3273; +} +{ +sub.f16x2 r3279, r2785, r2786; +} +{ +mul.f16x2 r3282, r3279, r2773; +} +{ +sub.f16x2 r3285, r2791, r2792; +} +{ +mul.f16x2 r3288, r3285, r2778; +} +{ +add.f16x2 r3291, r3282, r3288; +} +{ +sub.f16x2 r3294, r2797, r2798; +} +{ +mul.f16x2 r3297, r3294, r2782; +} +{ +add.f16x2 r3300, r3291, r3297; +} +{ +add.f16x2 %5, r3276, r3300; +} +{ +add.f16x2 r3306, r2803, r2804; +} +{ +mul.f16x2 r3309, r3306, r2772; +} +{ +add.f16x2 r3312, r2806, r3309; +} +{ +add.f16x2 r3315, r2809, r2810; +} +{ +mul.f16x2 r3318, r3315, r2776; +} +{ +add.f16x2 r3321, r3312, r3318; +} +{ +add.f16x2 r3324, r2815, r2816; +} +{ +mul.f16x2 r3327, r3324, r2780; +} +{ +add.f16x2 r3330, r3321, r3327; +} +{ +sub.f16x2 r3333, r2785, r2786; +} +{ +mul.f16x2 r3336, r3333, r2773; +} +{ +sub.f16x2 r3339, r2791, 
r2792; +} +{ +mul.f16x2 r3342, r3339, r2778; +} +{ +add.f16x2 r3345, r3336, r3342; +} +{ +sub.f16x2 r3348, r2797, r2798; +} +{ +mul.f16x2 r3351, r3348, r2782; +} +{ +add.f16x2 r3354, r3345, r3351; +} +{ +sub.f16x2 %11, r3330, r3354; +} +{ +add.f16x2 r3360, r2803, r2804; +} +{ +mul.f16x2 r3363, r3360, r2774; +} +{ +add.f16x2 r3366, r2806, r3363; +} +{ +add.f16x2 r3369, r2809, r2810; +} +{ +mul.f16x2 r3372, r3369, r2780; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r2815, r2816; +} +{ +mul.f16x2 r3381, r3378, r2772; +} +{ +add.f16x2 r3384, r3375, r3381; +} +{ +sub.f16x2 r3387, r2785, r2786; +} +{ +mul.f16x2 r3390, r3387, r2775; +} +{ +sub.f16x2 r3393, r2791, r2792; +} +{ +mul.f16x2 r3396, r3393, r2782; +} +{ +add.f16x2 r3399, r3390, r3396; +} +{ +sub.f16x2 r3402, r2797, r2798; +} +{ +mul.f16x2 r3405, r3402, r2773; +} +{ +add.f16x2 r3408, r3399, r3405; +} +{ +add.f16x2 %7, r3384, r3408; +} +{ +add.f16x2 r3414, r2803, r2804; +} +{ +mul.f16x2 r3417, r3414, r2774; +} +{ +add.f16x2 r3420, r2806, r3417; +} +{ +add.f16x2 r3423, r2809, r2810; +} +{ +mul.f16x2 r3426, r3423, r2780; +} +{ +add.f16x2 r3429, r3420, r3426; +} +{ +add.f16x2 r3432, r2815, r2816; +} +{ +mul.f16x2 r3435, r3432, r2772; +} +{ +add.f16x2 r3438, r3429, r3435; +} +{ +sub.f16x2 r3441, r2785, r2786; +} +{ +mul.f16x2 r3444, r3441, r2775; +} +{ +sub.f16x2 r3447, r2791, r2792; +} +{ +mul.f16x2 r3450, r3447, r2782; +} +{ +add.f16x2 r3453, r3444, r3450; +} +{ +sub.f16x2 r3456, r2797, r2798; +} +{ +mul.f16x2 r3459, r3456, r2773; +} +{ +add.f16x2 r3462, r3453, r3459; +} +{ +sub.f16x2 %9, r3438, r3462; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<923, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<138>; +.reg .b32 r<3494>; +.reg .b64 rd<8>; +mov.u32 r3468, %tid.y; +mov.u32 r3469, %14; +mad.lo.s32 r3470, r3468, 9604, r3469; +mov.u32 r3471, %tid.x; +mov.f32 f126, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1, {low, high}; +} +mov.f32 f128, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2, {low, high}; +} +mov.f32 f114, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r3, {low, high}; +} +mov.f32 f116, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r4, {low, high}; +} +mov.f32 f122, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r5, {low, high}; +} +mov.f32 f124, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 
r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; +} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} +{ +add.f16x2 r111, %15, r108; +} +{ +add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ +mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 
r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, %26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, 
r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 r465, r462, r4; +} +{ +add.f16x2 r468, r459, r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; 
+} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ +sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} +{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} +{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r3471, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3472, rd3; +mul.lo.s32 r3473, r3472, 343; +sub.s32 r3474, r3471, r3473; +cvt.rn.f32.u32 f129, r3474; +mul.f32 f130, f129, 
0f3B2B805B; +cos.approx.f32 f21, f130; +sin.approx.f32 f131, f130; +neg.f32 f22, f131; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f105, 0fBF800000; +mov.f32 f106, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 
r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +mad.lo.s32 r3475, r3472, 9604, r3470; +barrier.sync 0; +mad.lo.s32 r3476, r3474, 28, r3475; +st.shared.u32 [r3476], r30; +st.shared.u32 [r3476+4], r711; +st.shared.u32 [r3476+8], r748; +st.shared.u32 [r3476+12], r785; +st.shared.u32 [r3476+16], r822; +st.shared.u32 [r3476+20], r859; +st.shared.u32 [r3476+24], r896; +barrier.sync 0; +mad.lo.s32 r3477, r3474, -24, r3476; +ld.shared.u32 r942, [r3477]; +ld.shared.u32 r939, [r3477+1372]; +ld.shared.u32 r945, [r3477+2744]; +ld.shared.u32 r951, [r3477+4116]; +ld.shared.u32 r952, [r3477+5488]; +ld.shared.u32 r946, [r3477+6860]; +ld.shared.u32 r940, [r3477+8232]; +barrier.sync 0; +st.shared.u32 [r3476], r48; +st.shared.u32 [r3476+4], r718; +st.shared.u32 [r3476+8], r755; +st.shared.u32 [r3476+12], r792; +st.shared.u32 [r3476+16], r829; +st.shared.u32 [r3476+20], r866; +st.shared.u32 [r3476+24], r903; +barrier.sync 0; +ld.shared.u32 r960, [r3477]; +ld.shared.u32 r957, [r3477+1372]; +ld.shared.u32 r963, [r3477+2744]; +ld.shared.u32 r969, [r3477+4116]; +ld.shared.u32 r970, [r3477+5488]; +ld.shared.u32 r964, [r3477+6860]; +ld.shared.u32 r958, [r3477+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r924, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 r953, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ +add.f16x2 r968, r969, r970; +} +{ +add.f16x2 r971, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ +mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, 
r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 r1025, r998, r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 r1079, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, r1127; +} +{ +sub.f16x2 r1133, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; 
+} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 r1187, r1160, r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ +sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, r1289; +} +{ +add.f16x2 r1295, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} 
+{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 r1349, r1322, r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ 
+mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, r1484, r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 r1565, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ +mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, r1613; +} +{ +sub.f16x2 r1619, r1592, r1616; +} +mul.wide.u32 rd4, r3474, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r3478, rd5; +sub.s32 r3479, r3474, r3478; +shr.u32 r3480, r3479, 1; +add.s32 r3481, r3480, r3478; +shr.u32 r3482, r3481, 2; +cvt.rn.f32.u32 f132, r3482; +mul.f32 f133, f132, 0f3C961050; +cos.approx.f32 f57, f133; +sin.approx.f32 f134, f133; +neg.f32 f58, f134; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1622, {low, high}; +} +mul.lo.s32 r3483, r3482, 7; +sub.s32 r3484, r3474, r3483; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; 
+mov.b32 r1625, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1627, {high, high}; +} +{ +mul.f16x2 r1629, r1349, r1627; +} +{ +neg.f16x2 r1632, r1629; +} +{ +fma.rn.f16x2 r1634, r1025, r1625, r1632; +} +{ +mul.f16x2 r1638, r1025, r1627; +} +{ +fma.rn.f16x2 r1641, r1349, r1625, r1638; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1647, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1649, {low, high}; +} +{ +mul.f16x2 r1650, r1647, r1649; +} +{ +mul.f16x2 r1653, r1622, r1645; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1656, {high, low}; +} +{ +fma.rn.f16x2 r1658, r1650, r1656, r1653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1662, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1664, {high, high}; +} +{ +mul.f16x2 r1666, r1457, r1664; +} +{ +neg.f16x2 r1669, r1666; +} +{ +fma.rn.f16x2 r1671, r1133, r1662, r1669; +} +{ +mul.f16x2 r1675, r1133, r1664; +} +{ +fma.rn.f16x2 r1678, r1457, r1662, r1675; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1684, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1686, {low, high}; +} +{ +mul.f16x2 r1687, r1684, r1686; +} +{ +mul.f16x2 r1690, r1658, r1682; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1693, {high, low}; +} +{ +fma.rn.f16x2 r1695, r1687, r1693, r1690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1699, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1701, {high, high}; +} +{ +mul.f16x2 r1703, r1565, r1701; +} +{ +neg.f16x2 r1706, r1703; +} +{ +fma.rn.f16x2 r1708, r1241, r1699, r1706; +} +{ +mul.f16x2 r1712, 
r1241, r1701; +} +{ +fma.rn.f16x2 r1715, r1565, r1699, r1712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1721, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1723, {low, high}; +} +{ +mul.f16x2 r1724, r1721, r1723; +} +{ +mul.f16x2 r1727, r1695, r1719; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1730, {high, low}; +} +{ +fma.rn.f16x2 r1732, r1724, r1730, r1727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1738, {high, high}; +} +{ +mul.f16x2 r1740, r1619, r1738; +} +{ +neg.f16x2 r1743, r1740; +} +{ +fma.rn.f16x2 r1745, r1295, r1736, r1743; +} +{ +mul.f16x2 r1749, r1295, r1738; +} +{ +fma.rn.f16x2 r1752, r1619, r1736, r1749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1758, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1758, r1760; +} +{ +mul.f16x2 r1764, r1732, r1756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1767, {high, low}; +} +{ +fma.rn.f16x2 r1769, r1761, r1767, r1764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1775, {high, high}; +} +{ +mul.f16x2 r1777, r1511, r1775; +} +{ +neg.f16x2 r1780, r1777; +} +{ +fma.rn.f16x2 r1782, r1187, r1773, r1780; +} +{ +mul.f16x2 r1786, r1187, r1775; +} +{ +fma.rn.f16x2 r1789, r1511, r1773, r1786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1795, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1795, r1797; +} +{ +mul.f16x2 r1801, r1769, r1793; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1804, {high, low}; +} +{ +fma.rn.f16x2 r1806, r1798, r1804, r1801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1812, {high, high}; +} +{ +mul.f16x2 r1814, r1403, r1812; +} +{ +neg.f16x2 r1817, r1814; +} +{ +fma.rn.f16x2 r1819, r1079, r1810, r1817; +} +{ +mul.f16x2 r1823, r1079, r1812; +} +{ +fma.rn.f16x2 r1826, r1403, r1810, r1823; +} +shl.b32 r3485, r3484, 2; +add.s32 r3486, r3475, r3485; +barrier.sync 0; +mad.lo.s32 r3487, r3482, 196, r3486; +st.shared.u32 [r3487], r953; +st.shared.u32 [r3487+28], r1634; +st.shared.u32 [r3487+56], r1671; +st.shared.u32 [r3487+84], r1708; +st.shared.u32 [r3487+112], r1745; +st.shared.u32 [r3487+140], r1782; +st.shared.u32 [r3487+168], r1819; +barrier.sync 0; +ld.shared.u32 r1865, [r3477]; +ld.shared.u32 r1862, [r3477+1372]; +ld.shared.u32 r1868, [r3477+2744]; +ld.shared.u32 r1874, [r3477+4116]; +ld.shared.u32 r1875, [r3477+5488]; +ld.shared.u32 r1869, [r3477+6860]; +ld.shared.u32 r1863, [r3477+8232]; +barrier.sync 0; +st.shared.u32 [r3487], r971; +st.shared.u32 [r3487+28], r1641; +st.shared.u32 [r3487+56], r1678; +st.shared.u32 [r3487+84], r1715; +st.shared.u32 [r3487+112], r1752; +st.shared.u32 [r3487+140], r1789; +st.shared.u32 [r3487+168], r1826; +barrier.sync 0; +ld.shared.u32 r1883, [r3477]; +ld.shared.u32 r1880, [r3477+1372]; +ld.shared.u32 r1886, [r3477+2744]; +ld.shared.u32 r1892, [r3477+4116]; +ld.shared.u32 r1893, [r3477+5488]; +ld.shared.u32 r1887, [r3477+6860]; +ld.shared.u32 r1881, [r3477+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; 
+mov.b32 r1848, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r1849, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1854, {low, high}; +} +{ +neg.f16x2 r1855, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1858, {low, high}; +} +{ +neg.f16x2 r1859, r1858; +} +{ +add.f16x2 r1861, r1862, r1863; +} +{ +add.f16x2 r1864, r1865, r1861; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1864, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 r1876, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1882, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1862, r1863; +} +{ +mul.f16x2 r1900, r1897, r1847; +} +{ +add.f16x2 r1903, r1865, r1900; +} +{ +add.f16x2 r1906, r1868, r1869; +} +{ +mul.f16x2 r1909, r1906, r1849; +} +{ +add.f16x2 r1912, r1903, r1909; +} +{ +add.f16x2 r1915, r1874, r1875; +} +{ +mul.f16x2 r1918, r1915, r1851; +} +{ +add.f16x2 r1921, r1912, r1918; +} +{ +sub.f16x2 r1924, r1880, r1881; +} +{ +mul.f16x2 r1927, r1924, r1848; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1850; +} +{ +add.f16x2 r1936, r1927, r1933; +} +{ +sub.f16x2 r1939, 
r1892, r1893; +} +{ +mul.f16x2 r1942, r1939, r1852; +} +{ +add.f16x2 r1945, r1936, r1942; +} +{ +sub.f16x2 r1948, r1921, r1945; +} +{ +add.f16x2 r1951, r1862, r1863; +} +{ +mul.f16x2 r1954, r1951, r1847; +} +{ +add.f16x2 r1957, r1865, r1954; +} +{ +add.f16x2 r1960, r1868, r1869; +} +{ +mul.f16x2 r1963, r1960, r1849; +} +{ +add.f16x2 r1966, r1957, r1963; +} +{ +add.f16x2 r1969, r1874, r1875; +} +{ +mul.f16x2 r1972, r1969, r1851; +} +{ +add.f16x2 r1975, r1966, r1972; +} +{ +sub.f16x2 r1978, r1880, r1881; +} +{ +mul.f16x2 r1981, r1978, r1848; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1850; +} +{ +add.f16x2 r1990, r1981, r1987; +} +{ +sub.f16x2 r1993, r1892, r1893; +} +{ +mul.f16x2 r1996, r1993, r1852; +} +{ +add.f16x2 r1999, r1990, r1996; +} +{ +add.f16x2 r2002, r1975, r1999; +} +{ +add.f16x2 r2005, r1862, r1863; +} +{ +mul.f16x2 r2008, r2005, r1849; +} +{ +add.f16x2 r2011, r1865, r2008; +} +{ +add.f16x2 r2014, r1868, r1869; +} +{ +mul.f16x2 r2017, r2014, r1853; +} +{ +add.f16x2 r2020, r2011, r2017; +} +{ +add.f16x2 r2023, r1874, r1875; +} +{ +mul.f16x2 r2026, r2023, r1857; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 r2032, r1880, r1881; +} +{ +mul.f16x2 r2035, r2032, r1850; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1855; +} +{ +add.f16x2 r2044, r2035, r2041; +} +{ +sub.f16x2 r2047, r1892, r1893; +} +{ +mul.f16x2 r2050, r2047, r1859; +} +{ +add.f16x2 r2053, r2044, r2050; +} +{ +sub.f16x2 r2056, r2029, r2053; +} +{ +add.f16x2 r2059, r1862, r1863; +} +{ +mul.f16x2 r2062, r2059, r1849; +} +{ +add.f16x2 r2065, r1865, r2062; +} +{ +add.f16x2 r2068, r1868, r1869; +} +{ +mul.f16x2 r2071, r2068, r1853; +} +{ +add.f16x2 r2074, r2065, r2071; +} +{ +add.f16x2 r2077, r1874, r1875; +} +{ +mul.f16x2 r2080, r2077, r1857; +} +{ +add.f16x2 r2083, r2074, r2080; +} +{ +sub.f16x2 r2086, r1880, r1881; +} +{ +mul.f16x2 r2089, r2086, r1850; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1855; +} +{ 
+add.f16x2 r2098, r2089, r2095; +} +{ +sub.f16x2 r2101, r1892, r1893; +} +{ +mul.f16x2 r2104, r2101, r1859; +} +{ +add.f16x2 r2107, r2098, r2104; +} +{ +add.f16x2 r2110, r2083, r2107; +} +{ +add.f16x2 r2113, r1862, r1863; +} +{ +mul.f16x2 r2116, r2113, r1851; +} +{ +add.f16x2 r2119, r1865, r2116; +} +{ +add.f16x2 r2122, r1868, r1869; +} +{ +mul.f16x2 r2125, r2122, r1857; +} +{ +add.f16x2 r2128, r2119, r2125; +} +{ +add.f16x2 r2131, r1874, r1875; +} +{ +mul.f16x2 r2134, r2131, r1849; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 r2140, r1880, r1881; +} +{ +mul.f16x2 r2143, r2140, r1852; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1859; +} +{ +add.f16x2 r2152, r2143, r2149; +} +{ +sub.f16x2 r2155, r1892, r1893; +} +{ +mul.f16x2 r2158, r2155, r1850; +} +{ +add.f16x2 r2161, r2152, r2158; +} +{ +sub.f16x2 r2164, r2137, r2161; +} +{ +add.f16x2 r2167, r1862, r1863; +} +{ +mul.f16x2 r2170, r2167, r1851; +} +{ +add.f16x2 r2173, r1865, r2170; +} +{ +add.f16x2 r2176, r1868, r1869; +} +{ +mul.f16x2 r2179, r2176, r1857; +} +{ +add.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r1874, r1875; +} +{ +mul.f16x2 r2188, r2185, r1849; +} +{ +add.f16x2 r2191, r2182, r2188; +} +{ +sub.f16x2 r2194, r1880, r1881; +} +{ +mul.f16x2 r2197, r2194, r1852; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1859; +} +{ +add.f16x2 r2206, r2197, r2203; +} +{ +sub.f16x2 r2209, r1892, r1893; +} +{ +mul.f16x2 r2212, r2209, r1850; +} +{ +add.f16x2 r2215, r2206, r2212; +} +{ +add.f16x2 r2218, r2191, r2215; +} +{ +add.f16x2 r2221, r1880, r1881; +} +{ +mul.f16x2 r2224, r2221, r1847; +} +{ +add.f16x2 r2227, r1883, r2224; +} +{ +add.f16x2 r2230, r1886, r1887; +} +{ +mul.f16x2 r2233, r2230, r1849; +} +{ +add.f16x2 r2236, r2227, r2233; +} +{ +add.f16x2 r2239, r1892, r1893; +} +{ +mul.f16x2 r2242, r2239, r1851; +} +{ +add.f16x2 r2245, r2236, r2242; +} +{ +sub.f16x2 r2248, r1862, r1863; +} +{ +mul.f16x2 r2251, r2248, r1848; +} +{ +sub.f16x2 r2254, 
r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1850; +} +{ +add.f16x2 r2260, r2251, r2257; +} +{ +sub.f16x2 r2263, r1874, r1875; +} +{ +mul.f16x2 r2266, r2263, r1852; +} +{ +add.f16x2 r2269, r2260, r2266; +} +{ +add.f16x2 r2272, r2245, r2269; +} +{ +add.f16x2 r2275, r1880, r1881; +} +{ +mul.f16x2 r2278, r2275, r1847; +} +{ +add.f16x2 r2281, r1883, r2278; +} +{ +add.f16x2 r2284, r1886, r1887; +} +{ +mul.f16x2 r2287, r2284, r1849; +} +{ +add.f16x2 r2290, r2281, r2287; +} +{ +add.f16x2 r2293, r1892, r1893; +} +{ +mul.f16x2 r2296, r2293, r1851; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r1862, r1863; +} +{ +mul.f16x2 r2305, r2302, r1848; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1850; +} +{ +add.f16x2 r2314, r2305, r2311; +} +{ +sub.f16x2 r2317, r1874, r1875; +} +{ +mul.f16x2 r2320, r2317, r1852; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 r2326, r2299, r2323; +} +{ +add.f16x2 r2329, r1880, r1881; +} +{ +mul.f16x2 r2332, r2329, r1849; +} +{ +add.f16x2 r2335, r1883, r2332; +} +{ +add.f16x2 r2338, r1886, r1887; +} +{ +mul.f16x2 r2341, r2338, r1853; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +add.f16x2 r2347, r1892, r1893; +} +{ +mul.f16x2 r2350, r2347, r1857; +} +{ +add.f16x2 r2353, r2344, r2350; +} +{ +sub.f16x2 r2356, r1862, r1863; +} +{ +mul.f16x2 r2359, r2356, r1850; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1855; +} +{ +add.f16x2 r2368, r2359, r2365; +} +{ +sub.f16x2 r2371, r1874, r1875; +} +{ +mul.f16x2 r2374, r2371, r1859; +} +{ +add.f16x2 r2377, r2368, r2374; +} +{ +add.f16x2 r2380, r2353, r2377; +} +{ +add.f16x2 r2383, r1880, r1881; +} +{ +mul.f16x2 r2386, r2383, r1849; +} +{ +add.f16x2 r2389, r1883, r2386; +} +{ +add.f16x2 r2392, r1886, r1887; +} +{ +mul.f16x2 r2395, r2392, r1853; +} +{ +add.f16x2 r2398, r2389, r2395; +} +{ +add.f16x2 r2401, r1892, r1893; +} +{ +mul.f16x2 r2404, r2401, r1857; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +sub.f16x2 r2410, r1862, r1863; +} +{ 
+mul.f16x2 r2413, r2410, r1850; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1855; +} +{ +add.f16x2 r2422, r2413, r2419; +} +{ +sub.f16x2 r2425, r1874, r1875; +} +{ +mul.f16x2 r2428, r2425, r1859; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +sub.f16x2 r2434, r2407, r2431; +} +{ +add.f16x2 r2437, r1880, r1881; +} +{ +mul.f16x2 r2440, r2437, r1851; +} +{ +add.f16x2 r2443, r1883, r2440; +} +{ +add.f16x2 r2446, r1886, r1887; +} +{ +mul.f16x2 r2449, r2446, r1857; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +add.f16x2 r2455, r1892, r1893; +} +{ +mul.f16x2 r2458, r2455, r1849; +} +{ +add.f16x2 r2461, r2452, r2458; +} +{ +sub.f16x2 r2464, r1862, r1863; +} +{ +mul.f16x2 r2467, r2464, r1852; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1859; +} +{ +add.f16x2 r2476, r2467, r2473; +} +{ +sub.f16x2 r2479, r1874, r1875; +} +{ +mul.f16x2 r2482, r2479, r1850; +} +{ +add.f16x2 r2485, r2476, r2482; +} +{ +add.f16x2 r2488, r2461, r2485; +} +{ +add.f16x2 r2491, r1880, r1881; +} +{ +mul.f16x2 r2494, r2491, r1851; +} +{ +add.f16x2 r2497, r1883, r2494; +} +{ +add.f16x2 r2500, r1886, r1887; +} +{ +mul.f16x2 r2503, r2500, r1857; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1892, r1893; +} +{ +mul.f16x2 r2512, r2509, r1849; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +sub.f16x2 r2518, r1862, r1863; +} +{ +mul.f16x2 r2521, r2518, r1852; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1859; +} +{ +add.f16x2 r2530, r2521, r2527; +} +{ +sub.f16x2 r2533, r1874, r1875; +} +{ +mul.f16x2 r2536, r2533, r1850; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2515, r2539; +} +mul.wide.u32 rd6, r3474, 1402438301; +shr.u64 rd7, rd6, 36; +cvt.u32.u64 r3488, rd7; +cvt.rn.f32.u32 f135, r3488; +mul.f32 f136, f135, 0f3E034E46; +cos.approx.f32 f93, f136; +sin.approx.f32 f137, f136; +neg.f32 f94, f137; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r2545, {low, high}; 
+} +mul.lo.s32 r3489, r3488, 49; +sub.s32 r3490, r3474, r3489; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2550, {high, high}; +} +{ +mul.f16x2 r2552, r2272, r2550; +} +{ +neg.f16x2 r2555, r2552; +} +{ +fma.rn.f16x2 r2557, r1948, r2548, r2555; +} +{ +mul.f16x2 r2561, r1948, r2550; +} +{ +fma.rn.f16x2 r2564, r2272, r2548, r2561; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2568, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2570, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2572, {low, high}; +} +{ +mul.f16x2 r2573, r2570, r2572; +} +{ +mul.f16x2 r2576, r2545, r2568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2579, {high, low}; +} +{ +fma.rn.f16x2 r2581, r2573, r2579, r2576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2587, {high, high}; +} +{ +mul.f16x2 r2589, r2380, r2587; +} +{ +neg.f16x2 r2592, r2589; +} +{ +fma.rn.f16x2 r2594, r2056, r2585, r2592; +} +{ +mul.f16x2 r2598, r2056, r2587; +} +{ +fma.rn.f16x2 r2601, r2380, r2585, r2598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2605, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2607, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2609, {low, high}; +} +{ +mul.f16x2 r2610, r2607, r2609; +} +{ +mul.f16x2 r2613, r2581, r2605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2581; +mov.b32 r2616, {high, low}; +} +{ +fma.rn.f16x2 r2618, r2610, r2616, r2613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2624, {high, high}; +} +{ +mul.f16x2 r2626, 
r2488, r2624; +} +{ +neg.f16x2 r2629, r2626; +} +{ +fma.rn.f16x2 r2631, r2164, r2622, r2629; +} +{ +mul.f16x2 r2635, r2164, r2624; +} +{ +fma.rn.f16x2 r2638, r2488, r2622, r2635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2642, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2644, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2646, {low, high}; +} +{ +mul.f16x2 r2647, r2644, r2646; +} +{ +mul.f16x2 r2650, r2618, r2642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2618; +mov.b32 r2653, {high, low}; +} +{ +fma.rn.f16x2 r2655, r2647, r2653, r2650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2661, {high, high}; +} +{ +mul.f16x2 r2663, r2542, r2661; +} +{ +neg.f16x2 r2666, r2663; +} +{ +fma.rn.f16x2 r2668, r2218, r2659, r2666; +} +{ +mul.f16x2 r2672, r2218, r2661; +} +{ +fma.rn.f16x2 r2675, r2542, r2659, r2672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2679, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2681, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2683, {low, high}; +} +{ +mul.f16x2 r2684, r2681, r2683; +} +{ +mul.f16x2 r2687, r2655, r2679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2655; +mov.b32 r2690, {high, low}; +} +{ +fma.rn.f16x2 r2692, r2684, r2690, r2687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2698, {high, high}; +} +{ +mul.f16x2 r2700, r2434, r2698; +} +{ +neg.f16x2 r2703, r2700; +} +{ +fma.rn.f16x2 r2705, r2110, r2696, r2703; +} +{ +mul.f16x2 r2709, r2110, r2698; +} +{ +fma.rn.f16x2 r2712, r2434, r2696, r2709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2716, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2545; +mov.b32 r2718, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2720, {low, high}; +} +{ +mul.f16x2 r2721, r2718, r2720; +} +{ +mul.f16x2 r2724, r2692, r2716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2692; +mov.b32 r2727, {high, low}; +} +{ +fma.rn.f16x2 r2729, r2721, r2727, r2724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2729; +mov.b32 r2733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2729; +mov.b32 r2735, {high, high}; +} +{ +mul.f16x2 r2737, r2326, r2735; +} +{ +neg.f16x2 r2740, r2737; +} +{ +fma.rn.f16x2 r2742, r2002, r2733, r2740; +} +{ +mul.f16x2 r2746, r2002, r2735; +} +{ +fma.rn.f16x2 r2749, r2326, r2733, r2746; +} +shl.b32 r3491, r3490, 2; +add.s32 r3492, r3475, r3491; +barrier.sync 0; +mad.lo.s32 r3493, r3488, 1372, r3492; +st.shared.u32 [r3493], r1876; +st.shared.u32 [r3493+196], r2557; +st.shared.u32 [r3493+392], r2594; +st.shared.u32 [r3493+588], r2631; +st.shared.u32 [r3493+784], r2668; +st.shared.u32 [r3493+980], r2705; +st.shared.u32 [r3493+1176], r2742; +barrier.sync 0; +ld.shared.u32 r2788, [r3477]; +ld.shared.u32 r2785, [r3477+1372]; +ld.shared.u32 r2791, [r3477+2744]; +ld.shared.u32 r2797, [r3477+4116]; +ld.shared.u32 r2798, [r3477+5488]; +ld.shared.u32 r2792, [r3477+6860]; +ld.shared.u32 r2786, [r3477+8232]; +barrier.sync 0; +st.shared.u32 [r3493], r1894; +st.shared.u32 [r3493+196], r2564; +st.shared.u32 [r3493+392], r2601; +st.shared.u32 [r3493+588], r2638; +st.shared.u32 [r3493+784], r2675; +st.shared.u32 [r3493+980], r2712; +st.shared.u32 [r3493+1176], r2749; +barrier.sync 0; +ld.shared.u32 r2806, [r3477]; +ld.shared.u32 r2803, [r3477+1372]; +ld.shared.u32 r2809, [r3477+2744]; +ld.shared.u32 r2815, [r3477+4116]; +ld.shared.u32 r2816, [r3477+5488]; +ld.shared.u32 r2810, [r3477+6860]; +ld.shared.u32 r2804, [r3477+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; 
+cvt.rn.f16.f32 high, f126; +mov.b32 r2770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2771, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r2773, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2774, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2777, {low, high}; +} +{ +neg.f16x2 r2778, r2777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2780, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2781, {low, high}; +} +{ +neg.f16x2 r2782, r2781; +} +{ +add.f16x2 r2784, r2785, r2786; +} +{ +add.f16x2 r2787, r2788, r2784; +} +{ +add.f16x2 r2790, r2791, r2792; +} +{ +add.f16x2 r2793, r2787, r2790; +} +{ +add.f16x2 r2796, r2797, r2798; +} +{ +add.f16x2 %0, r2793, r2796; +} +{ +add.f16x2 r2802, r2803, r2804; +} +{ +add.f16x2 r2805, r2806, r2802; +} +{ +add.f16x2 r2808, r2809, r2810; +} +{ +add.f16x2 r2811, r2805, r2808; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 %1, r2811, r2814; +} +{ +add.f16x2 r2820, r2785, r2786; +} +{ +mul.f16x2 r2823, r2820, r2770; +} +{ +add.f16x2 r2826, r2788, r2823; +} +{ +add.f16x2 r2829, r2791, r2792; +} +{ +mul.f16x2 r2832, r2829, r2772; +} +{ +add.f16x2 r2835, r2826, r2832; +} +{ +add.f16x2 r2838, r2797, r2798; +} +{ +mul.f16x2 r2841, r2838, r2774; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r2803, r2804; +} +{ +mul.f16x2 r2850, r2847, r2771; +} 
+{ +sub.f16x2 r2853, r2809, r2810; +} +{ +mul.f16x2 r2856, r2853, r2773; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2815, r2816; +} +{ +mul.f16x2 r2865, r2862, r2775; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +sub.f16x2 %2, r2844, r2868; +} +{ +add.f16x2 r2874, r2785, r2786; +} +{ +mul.f16x2 r2877, r2874, r2770; +} +{ +add.f16x2 r2880, r2788, r2877; +} +{ +add.f16x2 r2883, r2791, r2792; +} +{ +mul.f16x2 r2886, r2883, r2772; +} +{ +add.f16x2 r2889, r2880, r2886; +} +{ +add.f16x2 r2892, r2797, r2798; +} +{ +mul.f16x2 r2895, r2892, r2774; +} +{ +add.f16x2 r2898, r2889, r2895; +} +{ +sub.f16x2 r2901, r2803, r2804; +} +{ +mul.f16x2 r2904, r2901, r2771; +} +{ +sub.f16x2 r2907, r2809, r2810; +} +{ +mul.f16x2 r2910, r2907, r2773; +} +{ +add.f16x2 r2913, r2904, r2910; +} +{ +sub.f16x2 r2916, r2815, r2816; +} +{ +mul.f16x2 r2919, r2916, r2775; +} +{ +add.f16x2 r2922, r2913, r2919; +} +{ +add.f16x2 %12, r2898, r2922; +} +{ +add.f16x2 r2928, r2785, r2786; +} +{ +mul.f16x2 r2931, r2928, r2772; +} +{ +add.f16x2 r2934, r2788, r2931; +} +{ +add.f16x2 r2937, r2791, r2792; +} +{ +mul.f16x2 r2940, r2937, r2776; +} +{ +add.f16x2 r2943, r2934, r2940; +} +{ +add.f16x2 r2946, r2797, r2798; +} +{ +mul.f16x2 r2949, r2946, r2780; +} +{ +add.f16x2 r2952, r2943, r2949; +} +{ +sub.f16x2 r2955, r2803, r2804; +} +{ +mul.f16x2 r2958, r2955, r2773; +} +{ +sub.f16x2 r2961, r2809, r2810; +} +{ +mul.f16x2 r2964, r2961, r2778; +} +{ +add.f16x2 r2967, r2958, r2964; +} +{ +sub.f16x2 r2970, r2815, r2816; +} +{ +mul.f16x2 r2973, r2970, r2782; +} +{ +add.f16x2 r2976, r2967, r2973; +} +{ +sub.f16x2 %4, r2952, r2976; +} +{ +add.f16x2 r2982, r2785, r2786; +} +{ +mul.f16x2 r2985, r2982, r2772; +} +{ +add.f16x2 r2988, r2788, r2985; +} +{ +add.f16x2 r2991, r2791, r2792; +} +{ +mul.f16x2 r2994, r2991, r2776; +} +{ +add.f16x2 r2997, r2988, r2994; +} +{ +add.f16x2 r3000, r2797, r2798; +} +{ +mul.f16x2 r3003, r3000, r2780; +} +{ +add.f16x2 r3006, r2997, r3003; +} +{ +sub.f16x2 r3009, r2803, 
r2804; +} +{ +mul.f16x2 r3012, r3009, r2773; +} +{ +sub.f16x2 r3015, r2809, r2810; +} +{ +mul.f16x2 r3018, r3015, r2778; +} +{ +add.f16x2 r3021, r3012, r3018; +} +{ +sub.f16x2 r3024, r2815, r2816; +} +{ +mul.f16x2 r3027, r3024, r2782; +} +{ +add.f16x2 r3030, r3021, r3027; +} +{ +add.f16x2 %10, r3006, r3030; +} +{ +add.f16x2 r3036, r2785, r2786; +} +{ +mul.f16x2 r3039, r3036, r2774; +} +{ +add.f16x2 r3042, r2788, r3039; +} +{ +add.f16x2 r3045, r2791, r2792; +} +{ +mul.f16x2 r3048, r3045, r2780; +} +{ +add.f16x2 r3051, r3042, r3048; +} +{ +add.f16x2 r3054, r2797, r2798; +} +{ +mul.f16x2 r3057, r3054, r2772; +} +{ +add.f16x2 r3060, r3051, r3057; +} +{ +sub.f16x2 r3063, r2803, r2804; +} +{ +mul.f16x2 r3066, r3063, r2775; +} +{ +sub.f16x2 r3069, r2809, r2810; +} +{ +mul.f16x2 r3072, r3069, r2782; +} +{ +add.f16x2 r3075, r3066, r3072; +} +{ +sub.f16x2 r3078, r2815, r2816; +} +{ +mul.f16x2 r3081, r3078, r2773; +} +{ +add.f16x2 r3084, r3075, r3081; +} +{ +sub.f16x2 %6, r3060, r3084; +} +{ +add.f16x2 r3090, r2785, r2786; +} +{ +mul.f16x2 r3093, r3090, r2774; +} +{ +add.f16x2 r3096, r2788, r3093; +} +{ +add.f16x2 r3099, r2791, r2792; +} +{ +mul.f16x2 r3102, r3099, r2780; +} +{ +add.f16x2 r3105, r3096, r3102; +} +{ +add.f16x2 r3108, r2797, r2798; +} +{ +mul.f16x2 r3111, r3108, r2772; +} +{ +add.f16x2 r3114, r3105, r3111; +} +{ +sub.f16x2 r3117, r2803, r2804; +} +{ +mul.f16x2 r3120, r3117, r2775; +} +{ +sub.f16x2 r3123, r2809, r2810; +} +{ +mul.f16x2 r3126, r3123, r2782; +} +{ +add.f16x2 r3129, r3120, r3126; +} +{ +sub.f16x2 r3132, r2815, r2816; +} +{ +mul.f16x2 r3135, r3132, r2773; +} +{ +add.f16x2 r3138, r3129, r3135; +} +{ +add.f16x2 %8, r3114, r3138; +} +{ +add.f16x2 r3144, r2803, r2804; +} +{ +mul.f16x2 r3147, r3144, r2770; +} +{ +add.f16x2 r3150, r2806, r3147; +} +{ +add.f16x2 r3153, r2809, r2810; +} +{ +mul.f16x2 r3156, r3153, r2772; +} +{ +add.f16x2 r3159, r3150, r3156; +} +{ +add.f16x2 r3162, r2815, r2816; +} +{ +mul.f16x2 r3165, r3162, r2774; +} +{ +add.f16x2 r3168, 
r3159, r3165; +} +{ +sub.f16x2 r3171, r2785, r2786; +} +{ +mul.f16x2 r3174, r3171, r2771; +} +{ +sub.f16x2 r3177, r2791, r2792; +} +{ +mul.f16x2 r3180, r3177, r2773; +} +{ +add.f16x2 r3183, r3174, r3180; +} +{ +sub.f16x2 r3186, r2797, r2798; +} +{ +mul.f16x2 r3189, r3186, r2775; +} +{ +add.f16x2 r3192, r3183, r3189; +} +{ +add.f16x2 %3, r3168, r3192; +} +{ +add.f16x2 r3198, r2803, r2804; +} +{ +mul.f16x2 r3201, r3198, r2770; +} +{ +add.f16x2 r3204, r2806, r3201; +} +{ +add.f16x2 r3207, r2809, r2810; +} +{ +mul.f16x2 r3210, r3207, r2772; +} +{ +add.f16x2 r3213, r3204, r3210; +} +{ +add.f16x2 r3216, r2815, r2816; +} +{ +mul.f16x2 r3219, r3216, r2774; +} +{ +add.f16x2 r3222, r3213, r3219; +} +{ +sub.f16x2 r3225, r2785, r2786; +} +{ +mul.f16x2 r3228, r3225, r2771; +} +{ +sub.f16x2 r3231, r2791, r2792; +} +{ +mul.f16x2 r3234, r3231, r2773; +} +{ +add.f16x2 r3237, r3228, r3234; +} +{ +sub.f16x2 r3240, r2797, r2798; +} +{ +mul.f16x2 r3243, r3240, r2775; +} +{ +add.f16x2 r3246, r3237, r3243; +} +{ +sub.f16x2 %13, r3222, r3246; +} +{ +add.f16x2 r3252, r2803, r2804; +} +{ +mul.f16x2 r3255, r3252, r2772; +} +{ +add.f16x2 r3258, r2806, r3255; +} +{ +add.f16x2 r3261, r2809, r2810; +} +{ +mul.f16x2 r3264, r3261, r2776; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +add.f16x2 r3270, r2815, r2816; +} +{ +mul.f16x2 r3273, r3270, r2780; +} +{ +add.f16x2 r3276, r3267, r3273; +} +{ +sub.f16x2 r3279, r2785, r2786; +} +{ +mul.f16x2 r3282, r3279, r2773; +} +{ +sub.f16x2 r3285, r2791, r2792; +} +{ +mul.f16x2 r3288, r3285, r2778; +} +{ +add.f16x2 r3291, r3282, r3288; +} +{ +sub.f16x2 r3294, r2797, r2798; +} +{ +mul.f16x2 r3297, r3294, r2782; +} +{ +add.f16x2 r3300, r3291, r3297; +} +{ +add.f16x2 %5, r3276, r3300; +} +{ +add.f16x2 r3306, r2803, r2804; +} +{ +mul.f16x2 r3309, r3306, r2772; +} +{ +add.f16x2 r3312, r2806, r3309; +} +{ +add.f16x2 r3315, r2809, r2810; +} +{ +mul.f16x2 r3318, r3315, r2776; +} +{ +add.f16x2 r3321, r3312, r3318; +} +{ +add.f16x2 r3324, r2815, r2816; +} +{ +mul.f16x2 
r3327, r3324, r2780; +} +{ +add.f16x2 r3330, r3321, r3327; +} +{ +sub.f16x2 r3333, r2785, r2786; +} +{ +mul.f16x2 r3336, r3333, r2773; +} +{ +sub.f16x2 r3339, r2791, r2792; +} +{ +mul.f16x2 r3342, r3339, r2778; +} +{ +add.f16x2 r3345, r3336, r3342; +} +{ +sub.f16x2 r3348, r2797, r2798; +} +{ +mul.f16x2 r3351, r3348, r2782; +} +{ +add.f16x2 r3354, r3345, r3351; +} +{ +sub.f16x2 %11, r3330, r3354; +} +{ +add.f16x2 r3360, r2803, r2804; +} +{ +mul.f16x2 r3363, r3360, r2774; +} +{ +add.f16x2 r3366, r2806, r3363; +} +{ +add.f16x2 r3369, r2809, r2810; +} +{ +mul.f16x2 r3372, r3369, r2780; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r2815, r2816; +} +{ +mul.f16x2 r3381, r3378, r2772; +} +{ +add.f16x2 r3384, r3375, r3381; +} +{ +sub.f16x2 r3387, r2785, r2786; +} +{ +mul.f16x2 r3390, r3387, r2775; +} +{ +sub.f16x2 r3393, r2791, r2792; +} +{ +mul.f16x2 r3396, r3393, r2782; +} +{ +add.f16x2 r3399, r3390, r3396; +} +{ +sub.f16x2 r3402, r2797, r2798; +} +{ +mul.f16x2 r3405, r3402, r2773; +} +{ +add.f16x2 r3408, r3399, r3405; +} +{ +add.f16x2 %7, r3384, r3408; +} +{ +add.f16x2 r3414, r2803, r2804; +} +{ +mul.f16x2 r3417, r3414, r2774; +} +{ +add.f16x2 r3420, r2806, r3417; +} +{ +add.f16x2 r3423, r2809, r2810; +} +{ +mul.f16x2 r3426, r3423, r2780; +} +{ +add.f16x2 r3429, r3420, r3426; +} +{ +add.f16x2 r3432, r2815, r2816; +} +{ +mul.f16x2 r3435, r3432, r2772; +} +{ +add.f16x2 r3438, r3429, r3435; +} +{ +sub.f16x2 r3441, r2785, r2786; +} +{ +mul.f16x2 r3444, r3441, r2775; +} +{ +sub.f16x2 r3447, r2791, r2792; +} +{ +mul.f16x2 r3450, r3447, r2782; +} +{ +add.f16x2 r3453, r3444, r3450; +} +{ +sub.f16x2 r3456, r2797, r2798; +} +{ +mul.f16x2 r3459, r3456, r2773; +} +{ +add.f16x2 r3462, r3453, r3459; +} +{ +sub.f16x2 %9, r3438, r3462; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), 
"=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..6d7ed769452fd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp16_inv.hpp.inc @@ -0,0 +1,8179 @@ +#ifndef CUFFTDX_FFT_2401_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_2401_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1124, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<138>; +.reg .b32 r<3502>; +.reg .b64 rd<8>; +mov.u32 r3476, %tid.y; +mov.u32 r3477, %14; +mad.lo.s32 r3478, r3476, 19208, r3477; +mov.u32 r3479, %tid.x; +mov.f32 f126, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1, {low, high}; +} +mov.f32 f128, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f114, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, 
f114; +mov.b32 r5, {low, high}; +} +mov.f32 f116, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f122, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9, {low, high}; +} +mov.f32 f124, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ +add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} +{ +add.f16x2 r107, %17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ 
+add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ +add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} +{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ +add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ +add.f16x2 r284, r275, r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ 
+add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ +add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ +add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ +mul.f16x2 r461, r458, r3; +} +{ +sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 
r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ +add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ +mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ +mul.f16x2 r638, r635, r7; +} +{ +add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 
r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r3479, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3480, rd3; +mul.lo.s32 r3481, r3480, 343; +sub.s32 r3482, r3479, r3481; +cvt.rn.f32.u32 f129, r3482; +mul.f32 f130, f129, 0f3B2B805B; +cos.approx.f32 f21, f130; +sin.approx.f32 f131, f130; +neg.f32 f22, f131; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f105, 0fBF800000; +mov.f32 f106, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, r726, r728; +} +{ +mul.f16x2 r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +mad.lo.s32 r3483, r3480, 19208, r3478; +barrier.sync 0; +mad.lo.s32 r3484, r3482, 56, r3483; +st.shared.v2.f32 [r3484], {r32, r50}; +st.shared.v2.f32 [r3484+8], {r711, r720}; +st.shared.v2.f32 [r3484+16], {r748, r757}; +st.shared.v2.f32 [r3484+24], {r785, r794}; +st.shared.v2.f32 [r3484+32], {r822, r831}; +st.shared.v2.f32 [r3484+40], {r859, r868}; +st.shared.v2.f32 [r3484+48], {r896, r905}; +barrier.sync 0; +mad.lo.s32 r3485, 
r3482, -48, r3484; +ld.shared.u32 r946, [r3485]; +ld.shared.u32 r964, [r3485+4]; +ld.shared.u32 r943, [r3485+2744]; +ld.shared.u32 r961, [r3485+2748]; +ld.shared.u32 r949, [r3485+5488]; +ld.shared.u32 r967, [r3485+5492]; +ld.shared.u32 r955, [r3485+8232]; +ld.shared.u32 r973, [r3485+8236]; +ld.shared.u32 r956, [r3485+10976]; +ld.shared.u32 r974, [r3485+10980]; +ld.shared.u32 r950, [r3485+13720]; +ld.shared.u32 r968, [r3485+13724]; +ld.shared.u32 r944, [r3485+16464]; +ld.shared.u32 r962, [r3485+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ +add.f16x2 r945, r946, r942; +} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 r957, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 
r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 r975, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ +add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; +} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ 
+mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 r1191, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ +add.f16x2 r1272, r1263, r1269; +} +{ +sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ 
+mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 r1407, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ +add.f16x2 r1434, r1425, r1431; +} +{ +sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ 
+mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 r1461, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 r1515, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 r1569, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ +add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ 
+mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 r1623, r1596, r1620; +} +mul.wide.u32 rd4, r3482, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r3486, rd5; +sub.s32 r3487, r3482, r3486; +shr.u32 r3488, r3487, 1; +add.s32 r3489, r3488, r3486; +shr.u32 r3490, r3489, 2; +cvt.rn.f32.u32 f132, r3490; +mul.f32 f133, f132, 0f3C961050; +cos.approx.f32 f57, f133; +sin.approx.f32 f134, f133; +neg.f32 f58, f134; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1626, {low, high}; +} +mul.lo.s32 r3491, r3490, 7; +sub.s32 r3492, r3482, r3491; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1631, {high, high}; +} +{ +mul.f16x2 r1633, r1353, r1631; +} +{ +fma.rn.f16x2 r1636, r1029, r1629, r1633; +} +{ +mul.f16x2 r1640, r1029, r1631; +} +{ +neg.f16x2 r1643, r1640; +} +{ +fma.rn.f16x2 r1645, r1353, r1629, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1651, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1653, {low, high}; +} +{ +mul.f16x2 r1654, r1651, r1653; +} +{ +mul.f16x2 r1657, r1626, r1649; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1660, {high, low}; +} +{ +fma.rn.f16x2 r1662, r1654, r1660, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1668, {high, high}; +} +{ +mul.f16x2 r1670, r1461, r1668; +} +{ +fma.rn.f16x2 r1673, r1137, r1666, r1670; +} +{ +mul.f16x2 r1677, r1137, r1668; +} +{ +neg.f16x2 r1680, r1677; +} +{ +fma.rn.f16x2 r1682, r1461, r1666, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; 
+mov.b32 r1688, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1688, r1690; +} +{ +mul.f16x2 r1694, r1662, r1686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1697, {high, low}; +} +{ +fma.rn.f16x2 r1699, r1691, r1697, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1705, {high, high}; +} +{ +mul.f16x2 r1707, r1569, r1705; +} +{ +fma.rn.f16x2 r1710, r1245, r1703, r1707; +} +{ +mul.f16x2 r1714, r1245, r1705; +} +{ +neg.f16x2 r1717, r1714; +} +{ +fma.rn.f16x2 r1719, r1569, r1703, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1725, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1725, r1727; +} +{ +mul.f16x2 r1731, r1699, r1723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1734, {high, low}; +} +{ +fma.rn.f16x2 r1736, r1728, r1734, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1742, {high, high}; +} +{ +mul.f16x2 r1744, r1623, r1742; +} +{ +fma.rn.f16x2 r1747, r1299, r1740, r1744; +} +{ +mul.f16x2 r1751, r1299, r1742; +} +{ +neg.f16x2 r1754, r1751; +} +{ +fma.rn.f16x2 r1756, r1623, r1740, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1762, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1764, {low, high}; +} +{ +mul.f16x2 r1765, r1762, r1764; +} +{ +mul.f16x2 r1768, r1736, r1760; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1736; +mov.b32 r1771, {high, low}; +} +{ +fma.rn.f16x2 r1773, r1765, r1771, r1768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1779, {high, high}; +} +{ +mul.f16x2 r1781, r1515, r1779; +} +{ +fma.rn.f16x2 r1784, r1191, r1777, r1781; +} +{ +mul.f16x2 r1788, r1191, r1779; +} +{ +neg.f16x2 r1791, r1788; +} +{ +fma.rn.f16x2 r1793, r1515, r1777, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1801, {low, high}; +} +{ +mul.f16x2 r1802, r1799, r1801; +} +{ +mul.f16x2 r1805, r1773, r1797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1808, {high, low}; +} +{ +fma.rn.f16x2 r1810, r1802, r1808, r1805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1816, {high, high}; +} +{ +mul.f16x2 r1818, r1407, r1816; +} +{ +fma.rn.f16x2 r1821, r1083, r1814, r1818; +} +{ +mul.f16x2 r1825, r1083, r1816; +} +{ +neg.f16x2 r1828, r1825; +} +{ +fma.rn.f16x2 r1830, r1407, r1814, r1828; +} +shl.b32 r3493, r3492, 3; +add.s32 r3494, r3483, r3493; +barrier.sync 0; +mad.lo.s32 r3495, r3490, 392, r3494; +st.shared.u32 [r3495], r957; +st.shared.u32 [r3495+4], r975; +st.shared.u32 [r3495+56], r1636; +st.shared.u32 [r3495+60], r1645; +st.shared.u32 [r3495+112], r1673; +st.shared.u32 [r3495+116], r1682; +st.shared.u32 [r3495+168], r1710; +st.shared.u32 [r3495+172], r1719; +st.shared.u32 [r3495+224], r1747; +st.shared.u32 [r3495+228], r1756; +st.shared.u32 [r3495+280], r1784; +st.shared.u32 [r3495+284], r1793; +st.shared.u32 [r3495+336], r1821; +st.shared.u32 [r3495+340], r1830; +barrier.sync 0; +ld.shared.u32 r1871, [r3485]; +ld.shared.u32 r1889, 
[r3485+4]; +ld.shared.u32 r1868, [r3485+2744]; +ld.shared.u32 r1886, [r3485+2748]; +ld.shared.u32 r1874, [r3485+5488]; +ld.shared.u32 r1892, [r3485+5492]; +ld.shared.u32 r1880, [r3485+8232]; +ld.shared.u32 r1898, [r3485+8236]; +ld.shared.u32 r1881, [r3485+10976]; +ld.shared.u32 r1899, [r3485+10980]; +ld.shared.u32 r1875, [r3485+13720]; +ld.shared.u32 r1893, [r3485+13724]; +ld.shared.u32 r1869, [r3485+16464]; +ld.shared.u32 r1887, [r3485+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1852, {low, high}; +} +{ +neg.f16x2 r1853, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r1855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1856, {low, high}; +} +{ +neg.f16x2 r1857, r1856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1859, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1860, {low, high}; +} +{ +neg.f16x2 r1861, r1860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1864, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1866, {low, high}; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1871, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 r1876, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1876, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1892, 
r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1898, r1899; +} +{ +add.f16x2 r1900, r1894, r1897; +} +{ +add.f16x2 r1903, r1868, r1869; +} +{ +mul.f16x2 r1906, r1903, r1851; +} +{ +add.f16x2 r1909, r1871, r1906; +} +{ +add.f16x2 r1912, r1874, r1875; +} +{ +mul.f16x2 r1915, r1912, r1855; +} +{ +add.f16x2 r1918, r1909, r1915; +} +{ +add.f16x2 r1921, r1880, r1881; +} +{ +mul.f16x2 r1924, r1921, r1859; +} +{ +add.f16x2 r1927, r1918, r1924; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1853; +} +{ +sub.f16x2 r1936, r1892, r1893; +} +{ +mul.f16x2 r1939, r1936, r1857; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1898, r1899; +} +{ +mul.f16x2 r1948, r1945, r1861; +} +{ +add.f16x2 r1951, r1942, r1948; +} +{ +sub.f16x2 r1954, r1927, r1951; +} +{ +add.f16x2 r1957, r1868, r1869; +} +{ +mul.f16x2 r1960, r1957, r1851; +} +{ +add.f16x2 r1963, r1871, r1960; +} +{ +add.f16x2 r1966, r1874, r1875; +} +{ +mul.f16x2 r1969, r1966, r1855; +} +{ +add.f16x2 r1972, r1963, r1969; +} +{ +add.f16x2 r1975, r1880, r1881; +} +{ +mul.f16x2 r1978, r1975, r1859; +} +{ +add.f16x2 r1981, r1972, r1978; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1853; +} +{ +sub.f16x2 r1990, r1892, r1893; +} +{ +mul.f16x2 r1993, r1990, r1857; +} +{ +add.f16x2 r1996, r1987, r1993; +} +{ +sub.f16x2 r1999, r1898, r1899; +} +{ +mul.f16x2 r2002, r1999, r1861; +} +{ +add.f16x2 r2005, r1996, r2002; +} +{ +add.f16x2 r2008, r1981, r2005; +} +{ +add.f16x2 r2011, r1868, r1869; +} +{ +mul.f16x2 r2014, r2011, r1855; +} +{ +add.f16x2 r2017, r1871, r2014; +} +{ +add.f16x2 r2020, r1874, r1875; +} +{ +mul.f16x2 r2023, r2020, r1863; +} +{ +add.f16x2 r2026, r2017, r2023; +} +{ +add.f16x2 r2029, r1880, r1881; +} +{ +mul.f16x2 r2032, r2029, r1865; +} +{ +add.f16x2 r2035, r2026, r2032; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1857; +} +{ +sub.f16x2 r2044, r1892, r1893; +} +{ +mul.f16x2 r2047, r2044, r1864; +} +{ +add.f16x2 
r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1898, r1899; +} +{ +mul.f16x2 r2056, r2053, r1866; +} +{ +add.f16x2 r2059, r2050, r2056; +} +{ +sub.f16x2 r2062, r2035, r2059; +} +{ +add.f16x2 r2065, r1868, r1869; +} +{ +mul.f16x2 r2068, r2065, r1855; +} +{ +add.f16x2 r2071, r1871, r2068; +} +{ +add.f16x2 r2074, r1874, r1875; +} +{ +mul.f16x2 r2077, r2074, r1863; +} +{ +add.f16x2 r2080, r2071, r2077; +} +{ +add.f16x2 r2083, r1880, r1881; +} +{ +mul.f16x2 r2086, r2083, r1865; +} +{ +add.f16x2 r2089, r2080, r2086; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1857; +} +{ +sub.f16x2 r2098, r1892, r1893; +} +{ +mul.f16x2 r2101, r2098, r1864; +} +{ +add.f16x2 r2104, r2095, r2101; +} +{ +sub.f16x2 r2107, r1898, r1899; +} +{ +mul.f16x2 r2110, r2107, r1866; +} +{ +add.f16x2 r2113, r2104, r2110; +} +{ +add.f16x2 r2116, r2089, r2113; +} +{ +add.f16x2 r2119, r1868, r1869; +} +{ +mul.f16x2 r2122, r2119, r1859; +} +{ +add.f16x2 r2125, r1871, r2122; +} +{ +add.f16x2 r2128, r1874, r1875; +} +{ +mul.f16x2 r2131, r2128, r1865; +} +{ +add.f16x2 r2134, r2125, r2131; +} +{ +add.f16x2 r2137, r1880, r1881; +} +{ +mul.f16x2 r2140, r2137, r1855; +} +{ +add.f16x2 r2143, r2134, r2140; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1861; +} +{ +sub.f16x2 r2152, r1892, r1893; +} +{ +mul.f16x2 r2155, r2152, r1866; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1898, r1899; +} +{ +mul.f16x2 r2164, r2161, r1857; +} +{ +add.f16x2 r2167, r2158, r2164; +} +{ +sub.f16x2 r2170, r2143, r2167; +} +{ +add.f16x2 r2173, r1868, r1869; +} +{ +mul.f16x2 r2176, r2173, r1859; +} +{ +add.f16x2 r2179, r1871, r2176; +} +{ +add.f16x2 r2182, r1874, r1875; +} +{ +mul.f16x2 r2185, r2182, r1865; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r1880, r1881; +} +{ +mul.f16x2 r2194, r2191, r1855; +} +{ +add.f16x2 r2197, r2188, r2194; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1861; +} +{ +sub.f16x2 r2206, r1892, r1893; +} 
+{ +mul.f16x2 r2209, r2206, r1866; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +sub.f16x2 r2215, r1898, r1899; +} +{ +mul.f16x2 r2218, r2215, r1857; +} +{ +add.f16x2 r2221, r2212, r2218; +} +{ +add.f16x2 r2224, r2197, r2221; +} +{ +add.f16x2 r2227, r1886, r1887; +} +{ +mul.f16x2 r2230, r2227, r1851; +} +{ +add.f16x2 r2233, r1889, r2230; +} +{ +add.f16x2 r2236, r1892, r1893; +} +{ +mul.f16x2 r2239, r2236, r1855; +} +{ +add.f16x2 r2242, r2233, r2239; +} +{ +add.f16x2 r2245, r1898, r1899; +} +{ +mul.f16x2 r2248, r2245, r1859; +} +{ +add.f16x2 r2251, r2242, r2248; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1853; +} +{ +sub.f16x2 r2260, r1874, r1875; +} +{ +mul.f16x2 r2263, r2260, r1857; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +sub.f16x2 r2269, r1880, r1881; +} +{ +mul.f16x2 r2272, r2269, r1861; +} +{ +add.f16x2 r2275, r2266, r2272; +} +{ +add.f16x2 r2278, r2251, r2275; +} +{ +add.f16x2 r2281, r1886, r1887; +} +{ +mul.f16x2 r2284, r2281, r1851; +} +{ +add.f16x2 r2287, r1889, r2284; +} +{ +add.f16x2 r2290, r1892, r1893; +} +{ +mul.f16x2 r2293, r2290, r1855; +} +{ +add.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r1898, r1899; +} +{ +mul.f16x2 r2302, r2299, r1859; +} +{ +add.f16x2 r2305, r2296, r2302; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1853; +} +{ +sub.f16x2 r2314, r1874, r1875; +} +{ +mul.f16x2 r2317, r2314, r1857; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r1880, r1881; +} +{ +mul.f16x2 r2326, r2323, r1861; +} +{ +add.f16x2 r2329, r2320, r2326; +} +{ +sub.f16x2 r2332, r2305, r2329; +} +{ +add.f16x2 r2335, r1886, r1887; +} +{ +mul.f16x2 r2338, r2335, r1855; +} +{ +add.f16x2 r2341, r1889, r2338; +} +{ +add.f16x2 r2344, r1892, r1893; +} +{ +mul.f16x2 r2347, r2344, r1863; +} +{ +add.f16x2 r2350, r2341, r2347; +} +{ +add.f16x2 r2353, r1898, r1899; +} +{ +mul.f16x2 r2356, r2353, r1865; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, 
r2362, r1857; +} +{ +sub.f16x2 r2368, r1874, r1875; +} +{ +mul.f16x2 r2371, r2368, r1864; +} +{ +add.f16x2 r2374, r2365, r2371; +} +{ +sub.f16x2 r2377, r1880, r1881; +} +{ +mul.f16x2 r2380, r2377, r1866; +} +{ +add.f16x2 r2383, r2374, r2380; +} +{ +add.f16x2 r2386, r2359, r2383; +} +{ +add.f16x2 r2389, r1886, r1887; +} +{ +mul.f16x2 r2392, r2389, r1855; +} +{ +add.f16x2 r2395, r1889, r2392; +} +{ +add.f16x2 r2398, r1892, r1893; +} +{ +mul.f16x2 r2401, r2398, r1863; +} +{ +add.f16x2 r2404, r2395, r2401; +} +{ +add.f16x2 r2407, r1898, r1899; +} +{ +mul.f16x2 r2410, r2407, r1865; +} +{ +add.f16x2 r2413, r2404, r2410; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1857; +} +{ +sub.f16x2 r2422, r1874, r1875; +} +{ +mul.f16x2 r2425, r2422, r1864; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r1880, r1881; +} +{ +mul.f16x2 r2434, r2431, r1866; +} +{ +add.f16x2 r2437, r2428, r2434; +} +{ +sub.f16x2 r2440, r2413, r2437; +} +{ +add.f16x2 r2443, r1886, r1887; +} +{ +mul.f16x2 r2446, r2443, r1859; +} +{ +add.f16x2 r2449, r1889, r2446; +} +{ +add.f16x2 r2452, r1892, r1893; +} +{ +mul.f16x2 r2455, r2452, r1865; +} +{ +add.f16x2 r2458, r2449, r2455; +} +{ +add.f16x2 r2461, r1898, r1899; +} +{ +mul.f16x2 r2464, r2461, r1855; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1861; +} +{ +sub.f16x2 r2476, r1874, r1875; +} +{ +mul.f16x2 r2479, r2476, r1866; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +sub.f16x2 r2485, r1880, r1881; +} +{ +mul.f16x2 r2488, r2485, r1857; +} +{ +add.f16x2 r2491, r2482, r2488; +} +{ +add.f16x2 r2494, r2467, r2491; +} +{ +add.f16x2 r2497, r1886, r1887; +} +{ +mul.f16x2 r2500, r2497, r1859; +} +{ +add.f16x2 r2503, r1889, r2500; +} +{ +add.f16x2 r2506, r1892, r1893; +} +{ +mul.f16x2 r2509, r2506, r1865; +} +{ +add.f16x2 r2512, r2503, r2509; +} +{ +add.f16x2 r2515, r1898, r1899; +} +{ +mul.f16x2 r2518, r2515, r1855; +} +{ +add.f16x2 r2521, r2512, r2518; +} +{ 
+sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1861; +} +{ +sub.f16x2 r2530, r1874, r1875; +} +{ +mul.f16x2 r2533, r2530, r1866; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r1880, r1881; +} +{ +mul.f16x2 r2542, r2539, r1857; +} +{ +add.f16x2 r2545, r2536, r2542; +} +{ +sub.f16x2 r2548, r2521, r2545; +} +mul.wide.u32 rd6, r3482, 1402438301; +shr.u64 rd7, rd6, 36; +cvt.u32.u64 r3496, rd7; +cvt.rn.f32.u32 f135, r3496; +mul.f32 f136, f135, 0f3E034E46; +cos.approx.f32 f93, f136; +sin.approx.f32 f137, f136; +neg.f32 f94, f137; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r2551, {low, high}; +} +mul.lo.s32 r3497, r3496, 49; +sub.s32 r3498, r3482, r3497; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2556, {high, high}; +} +{ +mul.f16x2 r2558, r2278, r2556; +} +{ +fma.rn.f16x2 r2561, r1954, r2554, r2558; +} +{ +mul.f16x2 r2565, r1954, r2556; +} +{ +neg.f16x2 r2568, r2565; +} +{ +fma.rn.f16x2 r2570, r2278, r2554, r2568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2576, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2578, {low, high}; +} +{ +mul.f16x2 r2579, r2576, r2578; +} +{ +mul.f16x2 r2582, r2551, r2574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2585, {high, low}; +} +{ +fma.rn.f16x2 r2587, r2579, r2585, r2582; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2587; +mov.b32 r2591, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2587; +mov.b32 r2593, {high, high}; +} +{ +mul.f16x2 r2595, r2386, r2593; +} +{ +fma.rn.f16x2 r2598, r2062, r2591, r2595; +} +{ +mul.f16x2 r2602, r2062, r2593; +} +{ +neg.f16x2 r2605, r2602; +} +{ +fma.rn.f16x2 r2607, r2386, r2591, r2605; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2551; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2613, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2615, {low, high}; +} +{ +mul.f16x2 r2616, r2613, r2615; +} +{ +mul.f16x2 r2619, r2587, r2611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2587; +mov.b32 r2622, {high, low}; +} +{ +fma.rn.f16x2 r2624, r2616, r2622, r2619; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2628, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2630, {high, high}; +} +{ +mul.f16x2 r2632, r2494, r2630; +} +{ +fma.rn.f16x2 r2635, r2170, r2628, r2632; +} +{ +mul.f16x2 r2639, r2170, r2630; +} +{ +neg.f16x2 r2642, r2639; +} +{ +fma.rn.f16x2 r2644, r2494, r2628, r2642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2650, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2650, r2652; +} +{ +mul.f16x2 r2656, r2624, r2648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2659, {high, low}; +} +{ +fma.rn.f16x2 r2661, r2653, r2659, r2656; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2665, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2667, {high, high}; +} +{ +mul.f16x2 r2669, r2548, r2667; +} +{ +fma.rn.f16x2 r2672, r2224, r2665, r2669; +} +{ +mul.f16x2 r2676, r2224, r2667; +} +{ +neg.f16x2 r2679, r2676; +} +{ +fma.rn.f16x2 r2681, r2548, r2665, r2679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2687, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2689, {low, high}; 
+} +{ +mul.f16x2 r2690, r2687, r2689; +} +{ +mul.f16x2 r2693, r2661, r2685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2696, {high, low}; +} +{ +fma.rn.f16x2 r2698, r2690, r2696, r2693; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2704, {high, high}; +} +{ +mul.f16x2 r2706, r2440, r2704; +} +{ +fma.rn.f16x2 r2709, r2116, r2702, r2706; +} +{ +mul.f16x2 r2713, r2116, r2704; +} +{ +neg.f16x2 r2716, r2713; +} +{ +fma.rn.f16x2 r2718, r2440, r2702, r2716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2724, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2726, {low, high}; +} +{ +mul.f16x2 r2727, r2724, r2726; +} +{ +mul.f16x2 r2730, r2698, r2722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2733, {high, low}; +} +{ +fma.rn.f16x2 r2735, r2727, r2733, r2730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2735; +mov.b32 r2739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2735; +mov.b32 r2741, {high, high}; +} +{ +mul.f16x2 r2743, r2332, r2741; +} +{ +fma.rn.f16x2 r2746, r2008, r2739, r2743; +} +{ +mul.f16x2 r2750, r2008, r2741; +} +{ +neg.f16x2 r2753, r2750; +} +{ +fma.rn.f16x2 r2755, r2332, r2739, r2753; +} +shl.b32 r3499, r3498, 3; +add.s32 r3500, r3483, r3499; +barrier.sync 0; +mad.lo.s32 r3501, r3496, 2744, r3500; +st.shared.u32 [r3501], r1882; +st.shared.u32 [r3501+4], r1900; +st.shared.u32 [r3501+392], r2561; +st.shared.u32 [r3501+396], r2570; +st.shared.u32 [r3501+784], r2598; +st.shared.u32 [r3501+788], r2607; +st.shared.u32 [r3501+1176], r2635; +st.shared.u32 [r3501+1180], r2644; +st.shared.u32 [r3501+1568], r2672; +st.shared.u32 [r3501+1572], r2681; +st.shared.u32 [r3501+1960], r2709; +st.shared.u32 [r3501+1964], r2718; 
+st.shared.u32 [r3501+2352], r2746; +st.shared.u32 [r3501+2356], r2755; +barrier.sync 0; +ld.shared.u32 r2796, [r3485]; +ld.shared.u32 r2814, [r3485+4]; +ld.shared.u32 r2793, [r3485+2744]; +ld.shared.u32 r2811, [r3485+2748]; +ld.shared.u32 r2799, [r3485+5488]; +ld.shared.u32 r2817, [r3485+5492]; +ld.shared.u32 r2805, [r3485+8232]; +ld.shared.u32 r2823, [r3485+8236]; +ld.shared.u32 r2806, [r3485+10976]; +ld.shared.u32 r2824, [r3485+10980]; +ld.shared.u32 r2800, [r3485+13720]; +ld.shared.u32 r2818, [r3485+13724]; +ld.shared.u32 r2794, [r3485+16464]; +ld.shared.u32 r2812, [r3485+16468]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2777, {low, high}; +} +{ +neg.f16x2 r2778, r2777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r2780, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r2781, {low, high}; +} +{ +neg.f16x2 r2782, r2781; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2784, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2785, {low, high}; +} +{ +neg.f16x2 r2786, r2785; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2788, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2789, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2790, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2791, {low, high}; +} +{ +add.f16x2 r2792, r2793, r2794; +} +{ +add.f16x2 r2795, r2796, r2792; +} +{ +add.f16x2 r2798, r2799, r2800; +} +{ +add.f16x2 r2801, r2795, r2798; +} +{ +add.f16x2 r2804, r2805, r2806; +} +{ 
+add.f16x2 %0, r2801, r2804; +} +{ +add.f16x2 r2810, r2811, r2812; +} +{ +add.f16x2 r2813, r2814, r2810; +} +{ +add.f16x2 r2816, r2817, r2818; +} +{ +add.f16x2 r2819, r2813, r2816; +} +{ +add.f16x2 r2822, r2823, r2824; +} +{ +add.f16x2 %1, r2819, r2822; +} +{ +add.f16x2 r2828, r2793, r2794; +} +{ +mul.f16x2 r2831, r2828, r2776; +} +{ +add.f16x2 r2834, r2796, r2831; +} +{ +add.f16x2 r2837, r2799, r2800; +} +{ +mul.f16x2 r2840, r2837, r2780; +} +{ +add.f16x2 r2843, r2834, r2840; +} +{ +add.f16x2 r2846, r2805, r2806; +} +{ +mul.f16x2 r2849, r2846, r2784; +} +{ +add.f16x2 r2852, r2843, r2849; +} +{ +sub.f16x2 r2855, r2811, r2812; +} +{ +mul.f16x2 r2858, r2855, r2778; +} +{ +sub.f16x2 r2861, r2817, r2818; +} +{ +mul.f16x2 r2864, r2861, r2782; +} +{ +add.f16x2 r2867, r2858, r2864; +} +{ +sub.f16x2 r2870, r2823, r2824; +} +{ +mul.f16x2 r2873, r2870, r2786; +} +{ +add.f16x2 r2876, r2867, r2873; +} +{ +sub.f16x2 %2, r2852, r2876; +} +{ +add.f16x2 r2882, r2793, r2794; +} +{ +mul.f16x2 r2885, r2882, r2776; +} +{ +add.f16x2 r2888, r2796, r2885; +} +{ +add.f16x2 r2891, r2799, r2800; +} +{ +mul.f16x2 r2894, r2891, r2780; +} +{ +add.f16x2 r2897, r2888, r2894; +} +{ +add.f16x2 r2900, r2805, r2806; +} +{ +mul.f16x2 r2903, r2900, r2784; +} +{ +add.f16x2 r2906, r2897, r2903; +} +{ +sub.f16x2 r2909, r2811, r2812; +} +{ +mul.f16x2 r2912, r2909, r2778; +} +{ +sub.f16x2 r2915, r2817, r2818; +} +{ +mul.f16x2 r2918, r2915, r2782; +} +{ +add.f16x2 r2921, r2912, r2918; +} +{ +sub.f16x2 r2924, r2823, r2824; +} +{ +mul.f16x2 r2927, r2924, r2786; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +add.f16x2 %12, r2906, r2930; +} +{ +add.f16x2 r2936, r2793, r2794; +} +{ +mul.f16x2 r2939, r2936, r2780; +} +{ +add.f16x2 r2942, r2796, r2939; +} +{ +add.f16x2 r2945, r2799, r2800; +} +{ +mul.f16x2 r2948, r2945, r2788; +} +{ +add.f16x2 r2951, r2942, r2948; +} +{ +add.f16x2 r2954, r2805, r2806; +} +{ +mul.f16x2 r2957, r2954, r2790; +} +{ +add.f16x2 r2960, r2951, r2957; +} +{ +sub.f16x2 r2963, r2811, r2812; +} 
+{ +mul.f16x2 r2966, r2963, r2782; +} +{ +sub.f16x2 r2969, r2817, r2818; +} +{ +mul.f16x2 r2972, r2969, r2789; +} +{ +add.f16x2 r2975, r2966, r2972; +} +{ +sub.f16x2 r2978, r2823, r2824; +} +{ +mul.f16x2 r2981, r2978, r2791; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 %4, r2960, r2984; +} +{ +add.f16x2 r2990, r2793, r2794; +} +{ +mul.f16x2 r2993, r2990, r2780; +} +{ +add.f16x2 r2996, r2796, r2993; +} +{ +add.f16x2 r2999, r2799, r2800; +} +{ +mul.f16x2 r3002, r2999, r2788; +} +{ +add.f16x2 r3005, r2996, r3002; +} +{ +add.f16x2 r3008, r2805, r2806; +} +{ +mul.f16x2 r3011, r3008, r2790; +} +{ +add.f16x2 r3014, r3005, r3011; +} +{ +sub.f16x2 r3017, r2811, r2812; +} +{ +mul.f16x2 r3020, r3017, r2782; +} +{ +sub.f16x2 r3023, r2817, r2818; +} +{ +mul.f16x2 r3026, r3023, r2789; +} +{ +add.f16x2 r3029, r3020, r3026; +} +{ +sub.f16x2 r3032, r2823, r2824; +} +{ +mul.f16x2 r3035, r3032, r2791; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +add.f16x2 %10, r3014, r3038; +} +{ +add.f16x2 r3044, r2793, r2794; +} +{ +mul.f16x2 r3047, r3044, r2784; +} +{ +add.f16x2 r3050, r2796, r3047; +} +{ +add.f16x2 r3053, r2799, r2800; +} +{ +mul.f16x2 r3056, r3053, r2790; +} +{ +add.f16x2 r3059, r3050, r3056; +} +{ +add.f16x2 r3062, r2805, r2806; +} +{ +mul.f16x2 r3065, r3062, r2780; +} +{ +add.f16x2 r3068, r3059, r3065; +} +{ +sub.f16x2 r3071, r2811, r2812; +} +{ +mul.f16x2 r3074, r3071, r2786; +} +{ +sub.f16x2 r3077, r2817, r2818; +} +{ +mul.f16x2 r3080, r3077, r2791; +} +{ +add.f16x2 r3083, r3074, r3080; +} +{ +sub.f16x2 r3086, r2823, r2824; +} +{ +mul.f16x2 r3089, r3086, r2782; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 %6, r3068, r3092; +} +{ +add.f16x2 r3098, r2793, r2794; +} +{ +mul.f16x2 r3101, r3098, r2784; +} +{ +add.f16x2 r3104, r2796, r3101; +} +{ +add.f16x2 r3107, r2799, r2800; +} +{ +mul.f16x2 r3110, r3107, r2790; +} +{ +add.f16x2 r3113, r3104, r3110; +} +{ +add.f16x2 r3116, r2805, r2806; +} +{ +mul.f16x2 r3119, r3116, r2780; +} +{ +add.f16x2 r3122, r3113, 
r3119; +} +{ +sub.f16x2 r3125, r2811, r2812; +} +{ +mul.f16x2 r3128, r3125, r2786; +} +{ +sub.f16x2 r3131, r2817, r2818; +} +{ +mul.f16x2 r3134, r3131, r2791; +} +{ +add.f16x2 r3137, r3128, r3134; +} +{ +sub.f16x2 r3140, r2823, r2824; +} +{ +mul.f16x2 r3143, r3140, r2782; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +add.f16x2 %8, r3122, r3146; +} +{ +add.f16x2 r3152, r2811, r2812; +} +{ +mul.f16x2 r3155, r3152, r2776; +} +{ +add.f16x2 r3158, r2814, r3155; +} +{ +add.f16x2 r3161, r2817, r2818; +} +{ +mul.f16x2 r3164, r3161, r2780; +} +{ +add.f16x2 r3167, r3158, r3164; +} +{ +add.f16x2 r3170, r2823, r2824; +} +{ +mul.f16x2 r3173, r3170, r2784; +} +{ +add.f16x2 r3176, r3167, r3173; +} +{ +sub.f16x2 r3179, r2793, r2794; +} +{ +mul.f16x2 r3182, r3179, r2778; +} +{ +sub.f16x2 r3185, r2799, r2800; +} +{ +mul.f16x2 r3188, r3185, r2782; +} +{ +add.f16x2 r3191, r3182, r3188; +} +{ +sub.f16x2 r3194, r2805, r2806; +} +{ +mul.f16x2 r3197, r3194, r2786; +} +{ +add.f16x2 r3200, r3191, r3197; +} +{ +add.f16x2 %3, r3176, r3200; +} +{ +add.f16x2 r3206, r2811, r2812; +} +{ +mul.f16x2 r3209, r3206, r2776; +} +{ +add.f16x2 r3212, r2814, r3209; +} +{ +add.f16x2 r3215, r2817, r2818; +} +{ +mul.f16x2 r3218, r3215, r2780; +} +{ +add.f16x2 r3221, r3212, r3218; +} +{ +add.f16x2 r3224, r2823, r2824; +} +{ +mul.f16x2 r3227, r3224, r2784; +} +{ +add.f16x2 r3230, r3221, r3227; +} +{ +sub.f16x2 r3233, r2793, r2794; +} +{ +mul.f16x2 r3236, r3233, r2778; +} +{ +sub.f16x2 r3239, r2799, r2800; +} +{ +mul.f16x2 r3242, r3239, r2782; +} +{ +add.f16x2 r3245, r3236, r3242; +} +{ +sub.f16x2 r3248, r2805, r2806; +} +{ +mul.f16x2 r3251, r3248, r2786; +} +{ +add.f16x2 r3254, r3245, r3251; +} +{ +sub.f16x2 %13, r3230, r3254; +} +{ +add.f16x2 r3260, r2811, r2812; +} +{ +mul.f16x2 r3263, r3260, r2780; +} +{ +add.f16x2 r3266, r2814, r3263; +} +{ +add.f16x2 r3269, r2817, r2818; +} +{ +mul.f16x2 r3272, r3269, r2788; +} +{ +add.f16x2 r3275, r3266, r3272; +} +{ +add.f16x2 r3278, r2823, r2824; +} +{ +mul.f16x2 r3281, 
r3278, r2790; +} +{ +add.f16x2 r3284, r3275, r3281; +} +{ +sub.f16x2 r3287, r2793, r2794; +} +{ +mul.f16x2 r3290, r3287, r2782; +} +{ +sub.f16x2 r3293, r2799, r2800; +} +{ +mul.f16x2 r3296, r3293, r2789; +} +{ +add.f16x2 r3299, r3290, r3296; +} +{ +sub.f16x2 r3302, r2805, r2806; +} +{ +mul.f16x2 r3305, r3302, r2791; +} +{ +add.f16x2 r3308, r3299, r3305; +} +{ +add.f16x2 %5, r3284, r3308; +} +{ +add.f16x2 r3314, r2811, r2812; +} +{ +mul.f16x2 r3317, r3314, r2780; +} +{ +add.f16x2 r3320, r2814, r3317; +} +{ +add.f16x2 r3323, r2817, r2818; +} +{ +mul.f16x2 r3326, r3323, r2788; +} +{ +add.f16x2 r3329, r3320, r3326; +} +{ +add.f16x2 r3332, r2823, r2824; +} +{ +mul.f16x2 r3335, r3332, r2790; +} +{ +add.f16x2 r3338, r3329, r3335; +} +{ +sub.f16x2 r3341, r2793, r2794; +} +{ +mul.f16x2 r3344, r3341, r2782; +} +{ +sub.f16x2 r3347, r2799, r2800; +} +{ +mul.f16x2 r3350, r3347, r2789; +} +{ +add.f16x2 r3353, r3344, r3350; +} +{ +sub.f16x2 r3356, r2805, r2806; +} +{ +mul.f16x2 r3359, r3356, r2791; +} +{ +add.f16x2 r3362, r3353, r3359; +} +{ +sub.f16x2 %11, r3338, r3362; +} +{ +add.f16x2 r3368, r2811, r2812; +} +{ +mul.f16x2 r3371, r3368, r2784; +} +{ +add.f16x2 r3374, r2814, r3371; +} +{ +add.f16x2 r3377, r2817, r2818; +} +{ +mul.f16x2 r3380, r3377, r2790; +} +{ +add.f16x2 r3383, r3374, r3380; +} +{ +add.f16x2 r3386, r2823, r2824; +} +{ +mul.f16x2 r3389, r3386, r2780; +} +{ +add.f16x2 r3392, r3383, r3389; +} +{ +sub.f16x2 r3395, r2793, r2794; +} +{ +mul.f16x2 r3398, r3395, r2786; +} +{ +sub.f16x2 r3401, r2799, r2800; +} +{ +mul.f16x2 r3404, r3401, r2791; +} +{ +add.f16x2 r3407, r3398, r3404; +} +{ +sub.f16x2 r3410, r2805, r2806; +} +{ +mul.f16x2 r3413, r3410, r2782; +} +{ +add.f16x2 r3416, r3407, r3413; +} +{ +add.f16x2 %7, r3392, r3416; +} +{ +add.f16x2 r3422, r2811, r2812; +} +{ +mul.f16x2 r3425, r3422, r2784; +} +{ +add.f16x2 r3428, r2814, r3425; +} +{ +add.f16x2 r3431, r2817, r2818; +} +{ +mul.f16x2 r3434, r3431, r2790; +} +{ +add.f16x2 r3437, r3428, r3434; +} +{ +add.f16x2 
r3440, r2823, r2824; +} +{ +mul.f16x2 r3443, r3440, r2780; +} +{ +add.f16x2 r3446, r3437, r3443; +} +{ +sub.f16x2 r3449, r2793, r2794; +} +{ +mul.f16x2 r3452, r3449, r2786; +} +{ +sub.f16x2 r3455, r2799, r2800; +} +{ +mul.f16x2 r3458, r3455, r2791; +} +{ +add.f16x2 r3461, r3452, r3458; +} +{ +sub.f16x2 r3464, r2805, r2806; +} +{ +mul.f16x2 r3467, r3464, r2782; +} +{ +add.f16x2 r3470, r3461, r3467; +} +{ +sub.f16x2 %9, r3446, r3470; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1125, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<138>; +.reg .b32 r<3502>; +.reg .b64 rd<8>; +mov.u32 r3476, %tid.y; +mov.u32 r3477, %14; +mad.lo.s32 r3478, r3476, 9604, r3477; +mov.u32 r3479, %tid.x; +mov.f32 f126, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1, {low, high}; +} +mov.f32 f128, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f114, 
0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r5, {low, high}; +} +mov.f32 f116, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f122, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r9, {low, high}; +} +mov.f32 f124, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ +add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} +{ +add.f16x2 r107, 
%17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ +add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ +add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} +{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ +add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ +add.f16x2 r284, r275, 
r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ +add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ +add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ +mul.f16x2 r461, r458, r3; +} +{ 
+sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ +add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ +mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ +mul.f16x2 r638, r635, r7; +} +{ 
+add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r3479, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3480, rd3; +mul.lo.s32 r3481, r3480, 343; +sub.s32 r3482, r3479, r3481; +cvt.rn.f32.u32 f129, r3482; +mul.f32 f130, f129, 0f3B2B805B; +cos.approx.f32 f21, f130; +sin.approx.f32 f131, f130; +neg.f32 f22, f131; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f105, 0fBF800000; +mov.f32 f106, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, r726, r728; +} +{ +mul.f16x2 r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +mad.lo.s32 r3483, r3480, 9604, r3478; +barrier.sync 0; +mad.lo.s32 r3484, r3482, 28, r3483; +st.shared.u32 [r3484], r32; +st.shared.u32 [r3484+4], r711; +st.shared.u32 [r3484+8], r748; +st.shared.u32 [r3484+12], r785; +st.shared.u32 [r3484+16], r822; +st.shared.u32 [r3484+20], r859; +st.shared.u32 [r3484+24], r896; +barrier.sync 0; +mad.lo.s32 r3485, 
r3482, -24, r3484; +ld.shared.u32 r946, [r3485]; +ld.shared.u32 r943, [r3485+1372]; +ld.shared.u32 r949, [r3485+2744]; +ld.shared.u32 r955, [r3485+4116]; +ld.shared.u32 r956, [r3485+5488]; +ld.shared.u32 r950, [r3485+6860]; +ld.shared.u32 r944, [r3485+8232]; +barrier.sync 0; +st.shared.u32 [r3484], r50; +st.shared.u32 [r3484+4], r720; +st.shared.u32 [r3484+8], r757; +st.shared.u32 [r3484+12], r794; +st.shared.u32 [r3484+16], r831; +st.shared.u32 [r3484+20], r868; +st.shared.u32 [r3484+24], r905; +barrier.sync 0; +ld.shared.u32 r964, [r3485]; +ld.shared.u32 r961, [r3485+1372]; +ld.shared.u32 r967, [r3485+2744]; +ld.shared.u32 r973, [r3485+4116]; +ld.shared.u32 r974, [r3485+5488]; +ld.shared.u32 r968, [r3485+6860]; +ld.shared.u32 r962, [r3485+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ 
+add.f16x2 r945, r946, r942; +} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 r957, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 r975, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ +add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; +} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ 
+add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ +mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 r1191, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ 
+add.f16x2 r1272, r1263, r1269; +} +{ +sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ +mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 r1407, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ 
+add.f16x2 r1434, r1425, r1431; +} +{ +sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ +mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 r1461, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 r1515, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 r1569, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ +add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ 
+add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ +mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 r1623, r1596, r1620; +} +mul.wide.u32 rd4, r3482, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r3486, rd5; +sub.s32 r3487, r3482, r3486; +shr.u32 r3488, r3487, 1; +add.s32 r3489, r3488, r3486; +shr.u32 r3490, r3489, 2; +cvt.rn.f32.u32 f132, r3490; +mul.f32 f133, f132, 0f3C961050; +cos.approx.f32 f57, f133; +sin.approx.f32 f134, f133; +neg.f32 f58, f134; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1626, {low, high}; +} +mul.lo.s32 r3491, r3490, 7; +sub.s32 r3492, r3482, r3491; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1631, {high, high}; +} +{ +mul.f16x2 r1633, r1353, r1631; +} +{ +fma.rn.f16x2 r1636, r1029, r1629, r1633; +} +{ +mul.f16x2 r1640, r1029, r1631; +} +{ +neg.f16x2 r1643, r1640; +} +{ +fma.rn.f16x2 r1645, r1353, r1629, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1651, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1653, {low, high}; +} +{ +mul.f16x2 r1654, r1651, r1653; +} +{ +mul.f16x2 r1657, r1626, r1649; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1660, {high, low}; +} +{ +fma.rn.f16x2 r1662, r1654, r1660, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1668, {high, high}; +} +{ +mul.f16x2 r1670, r1461, r1668; +} +{ +fma.rn.f16x2 r1673, r1137, r1666, r1670; +} 
+{ +mul.f16x2 r1677, r1137, r1668; +} +{ +neg.f16x2 r1680, r1677; +} +{ +fma.rn.f16x2 r1682, r1461, r1666, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1688, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1688, r1690; +} +{ +mul.f16x2 r1694, r1662, r1686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1697, {high, low}; +} +{ +fma.rn.f16x2 r1699, r1691, r1697, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1705, {high, high}; +} +{ +mul.f16x2 r1707, r1569, r1705; +} +{ +fma.rn.f16x2 r1710, r1245, r1703, r1707; +} +{ +mul.f16x2 r1714, r1245, r1705; +} +{ +neg.f16x2 r1717, r1714; +} +{ +fma.rn.f16x2 r1719, r1569, r1703, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1725, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1725, r1727; +} +{ +mul.f16x2 r1731, r1699, r1723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1734, {high, low}; +} +{ +fma.rn.f16x2 r1736, r1728, r1734, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1742, {high, high}; +} +{ +mul.f16x2 r1744, r1623, r1742; +} +{ +fma.rn.f16x2 r1747, r1299, r1740, r1744; +} +{ +mul.f16x2 r1751, r1299, r1742; +} +{ +neg.f16x2 r1754, r1751; +} +{ +fma.rn.f16x2 r1756, r1623, r1740, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; 
+mov.b32 r1762, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1764, {low, high}; +} +{ +mul.f16x2 r1765, r1762, r1764; +} +{ +mul.f16x2 r1768, r1736, r1760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1771, {high, low}; +} +{ +fma.rn.f16x2 r1773, r1765, r1771, r1768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1779, {high, high}; +} +{ +mul.f16x2 r1781, r1515, r1779; +} +{ +fma.rn.f16x2 r1784, r1191, r1777, r1781; +} +{ +mul.f16x2 r1788, r1191, r1779; +} +{ +neg.f16x2 r1791, r1788; +} +{ +fma.rn.f16x2 r1793, r1515, r1777, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r1801, {low, high}; +} +{ +mul.f16x2 r1802, r1799, r1801; +} +{ +mul.f16x2 r1805, r1773, r1797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1808, {high, low}; +} +{ +fma.rn.f16x2 r1810, r1802, r1808, r1805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1816, {high, high}; +} +{ +mul.f16x2 r1818, r1407, r1816; +} +{ +fma.rn.f16x2 r1821, r1083, r1814, r1818; +} +{ +mul.f16x2 r1825, r1083, r1816; +} +{ +neg.f16x2 r1828, r1825; +} +{ +fma.rn.f16x2 r1830, r1407, r1814, r1828; +} +shl.b32 r3493, r3492, 2; +add.s32 r3494, r3483, r3493; +barrier.sync 0; +mad.lo.s32 r3495, r3490, 196, r3494; +st.shared.u32 [r3495], r957; +st.shared.u32 [r3495+28], r1636; +st.shared.u32 [r3495+56], r1673; +st.shared.u32 [r3495+84], r1710; +st.shared.u32 [r3495+112], r1747; +st.shared.u32 [r3495+140], r1784; +st.shared.u32 [r3495+168], r1821; +barrier.sync 0; +ld.shared.u32 r1871, [r3485]; 
+ld.shared.u32 r1868, [r3485+1372]; +ld.shared.u32 r1874, [r3485+2744]; +ld.shared.u32 r1880, [r3485+4116]; +ld.shared.u32 r1881, [r3485+5488]; +ld.shared.u32 r1875, [r3485+6860]; +ld.shared.u32 r1869, [r3485+8232]; +barrier.sync 0; +st.shared.u32 [r3495], r975; +st.shared.u32 [r3495+28], r1645; +st.shared.u32 [r3495+56], r1682; +st.shared.u32 [r3495+84], r1719; +st.shared.u32 [r3495+112], r1756; +st.shared.u32 [r3495+140], r1793; +st.shared.u32 [r3495+168], r1830; +barrier.sync 0; +ld.shared.u32 r1889, [r3485]; +ld.shared.u32 r1886, [r3485+1372]; +ld.shared.u32 r1892, [r3485+2744]; +ld.shared.u32 r1898, [r3485+4116]; +ld.shared.u32 r1899, [r3485+5488]; +ld.shared.u32 r1893, [r3485+6860]; +ld.shared.u32 r1887, [r3485+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1852, {low, high}; +} +{ +neg.f16x2 r1853, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r1855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r1856, {low, high}; +} +{ +neg.f16x2 r1857, r1856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1859, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1860, {low, high}; +} +{ +neg.f16x2 r1861, r1860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r1863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r1864, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r1865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r1866, {low, high}; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 
r1870, r1871, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 r1876, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1876, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1898, r1899; +} +{ +add.f16x2 r1900, r1894, r1897; +} +{ +add.f16x2 r1903, r1868, r1869; +} +{ +mul.f16x2 r1906, r1903, r1851; +} +{ +add.f16x2 r1909, r1871, r1906; +} +{ +add.f16x2 r1912, r1874, r1875; +} +{ +mul.f16x2 r1915, r1912, r1855; +} +{ +add.f16x2 r1918, r1909, r1915; +} +{ +add.f16x2 r1921, r1880, r1881; +} +{ +mul.f16x2 r1924, r1921, r1859; +} +{ +add.f16x2 r1927, r1918, r1924; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1853; +} +{ +sub.f16x2 r1936, r1892, r1893; +} +{ +mul.f16x2 r1939, r1936, r1857; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1898, r1899; +} +{ +mul.f16x2 r1948, r1945, r1861; +} +{ +add.f16x2 r1951, r1942, r1948; +} +{ +sub.f16x2 r1954, r1927, r1951; +} +{ +add.f16x2 r1957, r1868, r1869; +} +{ +mul.f16x2 r1960, r1957, r1851; +} +{ +add.f16x2 r1963, r1871, r1960; +} +{ +add.f16x2 r1966, r1874, r1875; +} +{ +mul.f16x2 r1969, r1966, r1855; +} +{ +add.f16x2 r1972, r1963, r1969; +} +{ +add.f16x2 r1975, r1880, r1881; +} +{ +mul.f16x2 r1978, r1975, r1859; +} +{ +add.f16x2 r1981, r1972, r1978; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1853; +} +{ +sub.f16x2 r1990, r1892, r1893; +} +{ +mul.f16x2 r1993, r1990, r1857; +} +{ +add.f16x2 r1996, r1987, r1993; +} +{ +sub.f16x2 r1999, r1898, r1899; +} +{ +mul.f16x2 r2002, r1999, r1861; +} +{ +add.f16x2 r2005, r1996, r2002; +} +{ +add.f16x2 r2008, r1981, r2005; +} +{ +add.f16x2 r2011, r1868, r1869; +} +{ +mul.f16x2 r2014, r2011, r1855; +} +{ +add.f16x2 r2017, r1871, r2014; +} +{ +add.f16x2 r2020, r1874, r1875; +} +{ +mul.f16x2 r2023, r2020, r1863; +} +{ +add.f16x2 r2026, r2017, r2023; +} 
+{ +add.f16x2 r2029, r1880, r1881; +} +{ +mul.f16x2 r2032, r2029, r1865; +} +{ +add.f16x2 r2035, r2026, r2032; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1857; +} +{ +sub.f16x2 r2044, r1892, r1893; +} +{ +mul.f16x2 r2047, r2044, r1864; +} +{ +add.f16x2 r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1898, r1899; +} +{ +mul.f16x2 r2056, r2053, r1866; +} +{ +add.f16x2 r2059, r2050, r2056; +} +{ +sub.f16x2 r2062, r2035, r2059; +} +{ +add.f16x2 r2065, r1868, r1869; +} +{ +mul.f16x2 r2068, r2065, r1855; +} +{ +add.f16x2 r2071, r1871, r2068; +} +{ +add.f16x2 r2074, r1874, r1875; +} +{ +mul.f16x2 r2077, r2074, r1863; +} +{ +add.f16x2 r2080, r2071, r2077; +} +{ +add.f16x2 r2083, r1880, r1881; +} +{ +mul.f16x2 r2086, r2083, r1865; +} +{ +add.f16x2 r2089, r2080, r2086; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1857; +} +{ +sub.f16x2 r2098, r1892, r1893; +} +{ +mul.f16x2 r2101, r2098, r1864; +} +{ +add.f16x2 r2104, r2095, r2101; +} +{ +sub.f16x2 r2107, r1898, r1899; +} +{ +mul.f16x2 r2110, r2107, r1866; +} +{ +add.f16x2 r2113, r2104, r2110; +} +{ +add.f16x2 r2116, r2089, r2113; +} +{ +add.f16x2 r2119, r1868, r1869; +} +{ +mul.f16x2 r2122, r2119, r1859; +} +{ +add.f16x2 r2125, r1871, r2122; +} +{ +add.f16x2 r2128, r1874, r1875; +} +{ +mul.f16x2 r2131, r2128, r1865; +} +{ +add.f16x2 r2134, r2125, r2131; +} +{ +add.f16x2 r2137, r1880, r1881; +} +{ +mul.f16x2 r2140, r2137, r1855; +} +{ +add.f16x2 r2143, r2134, r2140; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1861; +} +{ +sub.f16x2 r2152, r1892, r1893; +} +{ +mul.f16x2 r2155, r2152, r1866; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1898, r1899; +} +{ +mul.f16x2 r2164, r2161, r1857; +} +{ +add.f16x2 r2167, r2158, r2164; +} +{ +sub.f16x2 r2170, r2143, r2167; +} +{ +add.f16x2 r2173, r1868, r1869; +} +{ +mul.f16x2 r2176, r2173, r1859; +} +{ +add.f16x2 r2179, r1871, r2176; +} +{ +add.f16x2 r2182, r1874, r1875; +} +{ +mul.f16x2 r2185, 
r2182, r1865; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r1880, r1881; +} +{ +mul.f16x2 r2194, r2191, r1855; +} +{ +add.f16x2 r2197, r2188, r2194; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1861; +} +{ +sub.f16x2 r2206, r1892, r1893; +} +{ +mul.f16x2 r2209, r2206, r1866; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +sub.f16x2 r2215, r1898, r1899; +} +{ +mul.f16x2 r2218, r2215, r1857; +} +{ +add.f16x2 r2221, r2212, r2218; +} +{ +add.f16x2 r2224, r2197, r2221; +} +{ +add.f16x2 r2227, r1886, r1887; +} +{ +mul.f16x2 r2230, r2227, r1851; +} +{ +add.f16x2 r2233, r1889, r2230; +} +{ +add.f16x2 r2236, r1892, r1893; +} +{ +mul.f16x2 r2239, r2236, r1855; +} +{ +add.f16x2 r2242, r2233, r2239; +} +{ +add.f16x2 r2245, r1898, r1899; +} +{ +mul.f16x2 r2248, r2245, r1859; +} +{ +add.f16x2 r2251, r2242, r2248; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1853; +} +{ +sub.f16x2 r2260, r1874, r1875; +} +{ +mul.f16x2 r2263, r2260, r1857; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +sub.f16x2 r2269, r1880, r1881; +} +{ +mul.f16x2 r2272, r2269, r1861; +} +{ +add.f16x2 r2275, r2266, r2272; +} +{ +add.f16x2 r2278, r2251, r2275; +} +{ +add.f16x2 r2281, r1886, r1887; +} +{ +mul.f16x2 r2284, r2281, r1851; +} +{ +add.f16x2 r2287, r1889, r2284; +} +{ +add.f16x2 r2290, r1892, r1893; +} +{ +mul.f16x2 r2293, r2290, r1855; +} +{ +add.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r1898, r1899; +} +{ +mul.f16x2 r2302, r2299, r1859; +} +{ +add.f16x2 r2305, r2296, r2302; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1853; +} +{ +sub.f16x2 r2314, r1874, r1875; +} +{ +mul.f16x2 r2317, r2314, r1857; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r1880, r1881; +} +{ +mul.f16x2 r2326, r2323, r1861; +} +{ +add.f16x2 r2329, r2320, r2326; +} +{ +sub.f16x2 r2332, r2305, r2329; +} +{ +add.f16x2 r2335, r1886, r1887; +} +{ +mul.f16x2 r2338, r2335, r1855; +} +{ +add.f16x2 r2341, r1889, r2338; +} +{ 
+add.f16x2 r2344, r1892, r1893; +} +{ +mul.f16x2 r2347, r2344, r1863; +} +{ +add.f16x2 r2350, r2341, r2347; +} +{ +add.f16x2 r2353, r1898, r1899; +} +{ +mul.f16x2 r2356, r2353, r1865; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1857; +} +{ +sub.f16x2 r2368, r1874, r1875; +} +{ +mul.f16x2 r2371, r2368, r1864; +} +{ +add.f16x2 r2374, r2365, r2371; +} +{ +sub.f16x2 r2377, r1880, r1881; +} +{ +mul.f16x2 r2380, r2377, r1866; +} +{ +add.f16x2 r2383, r2374, r2380; +} +{ +add.f16x2 r2386, r2359, r2383; +} +{ +add.f16x2 r2389, r1886, r1887; +} +{ +mul.f16x2 r2392, r2389, r1855; +} +{ +add.f16x2 r2395, r1889, r2392; +} +{ +add.f16x2 r2398, r1892, r1893; +} +{ +mul.f16x2 r2401, r2398, r1863; +} +{ +add.f16x2 r2404, r2395, r2401; +} +{ +add.f16x2 r2407, r1898, r1899; +} +{ +mul.f16x2 r2410, r2407, r1865; +} +{ +add.f16x2 r2413, r2404, r2410; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1857; +} +{ +sub.f16x2 r2422, r1874, r1875; +} +{ +mul.f16x2 r2425, r2422, r1864; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r1880, r1881; +} +{ +mul.f16x2 r2434, r2431, r1866; +} +{ +add.f16x2 r2437, r2428, r2434; +} +{ +sub.f16x2 r2440, r2413, r2437; +} +{ +add.f16x2 r2443, r1886, r1887; +} +{ +mul.f16x2 r2446, r2443, r1859; +} +{ +add.f16x2 r2449, r1889, r2446; +} +{ +add.f16x2 r2452, r1892, r1893; +} +{ +mul.f16x2 r2455, r2452, r1865; +} +{ +add.f16x2 r2458, r2449, r2455; +} +{ +add.f16x2 r2461, r1898, r1899; +} +{ +mul.f16x2 r2464, r2461, r1855; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1861; +} +{ +sub.f16x2 r2476, r1874, r1875; +} +{ +mul.f16x2 r2479, r2476, r1866; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +sub.f16x2 r2485, r1880, r1881; +} +{ +mul.f16x2 r2488, r2485, r1857; +} +{ +add.f16x2 r2491, r2482, r2488; +} +{ +add.f16x2 r2494, r2467, r2491; +} +{ +add.f16x2 r2497, r1886, r1887; +} +{ +mul.f16x2 r2500, 
r2497, r1859; +} +{ +add.f16x2 r2503, r1889, r2500; +} +{ +add.f16x2 r2506, r1892, r1893; +} +{ +mul.f16x2 r2509, r2506, r1865; +} +{ +add.f16x2 r2512, r2503, r2509; +} +{ +add.f16x2 r2515, r1898, r1899; +} +{ +mul.f16x2 r2518, r2515, r1855; +} +{ +add.f16x2 r2521, r2512, r2518; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1861; +} +{ +sub.f16x2 r2530, r1874, r1875; +} +{ +mul.f16x2 r2533, r2530, r1866; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r1880, r1881; +} +{ +mul.f16x2 r2542, r2539, r1857; +} +{ +add.f16x2 r2545, r2536, r2542; +} +{ +sub.f16x2 r2548, r2521, r2545; +} +mul.wide.u32 rd6, r3482, 1402438301; +shr.u64 rd7, rd6, 36; +cvt.u32.u64 r3496, rd7; +cvt.rn.f32.u32 f135, r3496; +mul.f32 f136, f135, 0f3E034E46; +cos.approx.f32 f93, f136; +sin.approx.f32 f137, f136; +neg.f32 f94, f137; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r2551, {low, high}; +} +mul.lo.s32 r3497, r3496, 49; +sub.s32 r3498, r3482, r3497; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2554, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2556, {high, high}; +} +{ +mul.f16x2 r2558, r2278, r2556; +} +{ +fma.rn.f16x2 r2561, r1954, r2554, r2558; +} +{ +mul.f16x2 r2565, r1954, r2556; +} +{ +neg.f16x2 r2568, r2565; +} +{ +fma.rn.f16x2 r2570, r2278, r2554, r2568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2576, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2578, {low, high}; +} +{ +mul.f16x2 r2579, r2576, r2578; +} +{ +mul.f16x2 r2582, r2551, r2574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2585, {high, low}; +} +{ +fma.rn.f16x2 r2587, r2579, r2585, r2582; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2587; +mov.b32 r2591, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2587; +mov.b32 r2593, {high, high}; +} +{ +mul.f16x2 r2595, r2386, r2593; +} +{ +fma.rn.f16x2 r2598, r2062, r2591, r2595; +} +{ +mul.f16x2 r2602, r2062, r2593; +} +{ +neg.f16x2 r2605, r2602; +} +{ +fma.rn.f16x2 r2607, r2386, r2591, r2605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2613, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2615, {low, high}; +} +{ +mul.f16x2 r2616, r2613, r2615; +} +{ +mul.f16x2 r2619, r2587, r2611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2587; +mov.b32 r2622, {high, low}; +} +{ +fma.rn.f16x2 r2624, r2616, r2622, r2619; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2628, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2630, {high, high}; +} +{ +mul.f16x2 r2632, r2494, r2630; +} +{ +fma.rn.f16x2 r2635, r2170, r2628, r2632; +} +{ +mul.f16x2 r2639, r2170, r2630; +} +{ +neg.f16x2 r2642, r2639; +} +{ +fma.rn.f16x2 r2644, r2494, r2628, r2642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2650, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2652, {low, high}; +} +{ +mul.f16x2 r2653, r2650, r2652; +} +{ +mul.f16x2 r2656, r2624, r2648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2624; +mov.b32 r2659, {high, low}; +} +{ +fma.rn.f16x2 r2661, r2653, r2659, r2656; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2665, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2667, {high, high}; +} +{ +mul.f16x2 r2669, r2548, r2667; +} +{ +fma.rn.f16x2 r2672, r2224, r2665, r2669; +} +{ +mul.f16x2 r2676, r2224, r2667; +} +{ +neg.f16x2 r2679, r2676; +} +{ +fma.rn.f16x2 r2681, r2548, r2665, r2679; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2687, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2689, {low, high}; +} +{ +mul.f16x2 r2690, r2687, r2689; +} +{ +mul.f16x2 r2693, r2661, r2685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2661; +mov.b32 r2696, {high, low}; +} +{ +fma.rn.f16x2 r2698, r2690, r2696, r2693; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2704, {high, high}; +} +{ +mul.f16x2 r2706, r2440, r2704; +} +{ +fma.rn.f16x2 r2709, r2116, r2702, r2706; +} +{ +mul.f16x2 r2713, r2116, r2704; +} +{ +neg.f16x2 r2716, r2713; +} +{ +fma.rn.f16x2 r2718, r2440, r2702, r2716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2551; +mov.b32 r2724, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f105; +cvt.rn.f16.f32 high, f106; +mov.b32 r2726, {low, high}; +} +{ +mul.f16x2 r2727, r2724, r2726; +} +{ +mul.f16x2 r2730, r2698, r2722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2698; +mov.b32 r2733, {high, low}; +} +{ +fma.rn.f16x2 r2735, r2727, r2733, r2730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2735; +mov.b32 r2739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2735; +mov.b32 r2741, {high, high}; +} +{ +mul.f16x2 r2743, r2332, r2741; +} +{ +fma.rn.f16x2 r2746, r2008, r2739, r2743; +} +{ +mul.f16x2 r2750, r2008, r2741; +} +{ +neg.f16x2 r2753, r2750; +} +{ +fma.rn.f16x2 r2755, r2332, r2739, r2753; +} +shl.b32 r3499, r3498, 2; +add.s32 r3500, r3483, r3499; +barrier.sync 0; +mad.lo.s32 r3501, r3496, 1372, r3500; +st.shared.u32 [r3501], r1882; +st.shared.u32 [r3501+196], r2561; +st.shared.u32 [r3501+392], r2598; +st.shared.u32 [r3501+588], r2635; 
+st.shared.u32 [r3501+784], r2672; +st.shared.u32 [r3501+980], r2709; +st.shared.u32 [r3501+1176], r2746; +barrier.sync 0; +ld.shared.u32 r2796, [r3485]; +ld.shared.u32 r2793, [r3485+1372]; +ld.shared.u32 r2799, [r3485+2744]; +ld.shared.u32 r2805, [r3485+4116]; +ld.shared.u32 r2806, [r3485+5488]; +ld.shared.u32 r2800, [r3485+6860]; +ld.shared.u32 r2794, [r3485+8232]; +barrier.sync 0; +st.shared.u32 [r3501], r1900; +st.shared.u32 [r3501+196], r2570; +st.shared.u32 [r3501+392], r2607; +st.shared.u32 [r3501+588], r2644; +st.shared.u32 [r3501+784], r2681; +st.shared.u32 [r3501+980], r2718; +st.shared.u32 [r3501+1176], r2755; +barrier.sync 0; +ld.shared.u32 r2814, [r3485]; +ld.shared.u32 r2811, [r3485+1372]; +ld.shared.u32 r2817, [r3485+2744]; +ld.shared.u32 r2823, [r3485+4116]; +ld.shared.u32 r2824, [r3485+5488]; +ld.shared.u32 r2818, [r3485+6860]; +ld.shared.u32 r2812, [r3485+8232]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2777, {low, high}; +} +{ +neg.f16x2 r2778, r2777; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f114; +cvt.rn.f16.f32 high, f114; +mov.b32 r2780, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f116; +cvt.rn.f16.f32 high, f116; +mov.b32 r2781, {low, high}; +} +{ +neg.f16x2 r2782, r2781; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2784, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2785, {low, high}; +} +{ +neg.f16x2 r2786, r2785; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f122; +cvt.rn.f16.f32 high, f122; +mov.b32 r2788, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f124; +cvt.rn.f16.f32 high, f124; +mov.b32 r2789, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f126; +cvt.rn.f16.f32 high, f126; +mov.b32 r2790, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f128; +cvt.rn.f16.f32 high, f128; +mov.b32 r2791, {low, high}; +} +{ +add.f16x2 r2792, r2793, r2794; +} +{ +add.f16x2 r2795, r2796, r2792; +} +{ +add.f16x2 r2798, r2799, r2800; +} +{ +add.f16x2 r2801, r2795, r2798; +} +{ +add.f16x2 r2804, r2805, r2806; +} +{ +add.f16x2 %0, r2801, r2804; +} +{ +add.f16x2 r2810, r2811, r2812; +} +{ +add.f16x2 r2813, r2814, r2810; +} +{ +add.f16x2 r2816, r2817, r2818; +} +{ +add.f16x2 r2819, r2813, r2816; +} +{ +add.f16x2 r2822, r2823, r2824; +} +{ +add.f16x2 %1, r2819, r2822; +} +{ +add.f16x2 r2828, r2793, r2794; +} +{ +mul.f16x2 r2831, r2828, r2776; +} +{ +add.f16x2 r2834, r2796, r2831; +} +{ +add.f16x2 r2837, r2799, r2800; +} +{ +mul.f16x2 r2840, r2837, r2780; +} +{ +add.f16x2 r2843, r2834, r2840; +} +{ +add.f16x2 r2846, r2805, r2806; +} +{ +mul.f16x2 r2849, r2846, r2784; +} +{ +add.f16x2 r2852, r2843, r2849; +} +{ +sub.f16x2 r2855, r2811, r2812; +} +{ +mul.f16x2 r2858, r2855, r2778; +} +{ +sub.f16x2 r2861, r2817, r2818; +} +{ +mul.f16x2 r2864, r2861, r2782; +} +{ +add.f16x2 r2867, r2858, r2864; +} +{ +sub.f16x2 r2870, r2823, r2824; +} +{ +mul.f16x2 r2873, r2870, r2786; +} +{ +add.f16x2 r2876, r2867, r2873; +} +{ +sub.f16x2 %2, r2852, r2876; +} +{ +add.f16x2 r2882, r2793, r2794; +} +{ +mul.f16x2 r2885, r2882, r2776; +} +{ +add.f16x2 r2888, r2796, r2885; +} +{ +add.f16x2 r2891, r2799, r2800; +} +{ +mul.f16x2 r2894, r2891, r2780; +} +{ +add.f16x2 r2897, r2888, r2894; +} +{ +add.f16x2 r2900, r2805, r2806; +} +{ +mul.f16x2 r2903, r2900, r2784; +} +{ +add.f16x2 r2906, r2897, r2903; +} +{ +sub.f16x2 r2909, r2811, r2812; +} +{ +mul.f16x2 r2912, r2909, r2778; +} +{ +sub.f16x2 r2915, r2817, r2818; +} +{ +mul.f16x2 r2918, r2915, r2782; +} +{ +add.f16x2 r2921, r2912, r2918; +} +{ +sub.f16x2 r2924, r2823, r2824; +} +{ +mul.f16x2 r2927, r2924, r2786; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +add.f16x2 %12, r2906, r2930; +} +{ +add.f16x2 r2936, r2793, r2794; +} +{ +mul.f16x2 r2939, r2936, r2780; +} 
+{ +add.f16x2 r2942, r2796, r2939; +} +{ +add.f16x2 r2945, r2799, r2800; +} +{ +mul.f16x2 r2948, r2945, r2788; +} +{ +add.f16x2 r2951, r2942, r2948; +} +{ +add.f16x2 r2954, r2805, r2806; +} +{ +mul.f16x2 r2957, r2954, r2790; +} +{ +add.f16x2 r2960, r2951, r2957; +} +{ +sub.f16x2 r2963, r2811, r2812; +} +{ +mul.f16x2 r2966, r2963, r2782; +} +{ +sub.f16x2 r2969, r2817, r2818; +} +{ +mul.f16x2 r2972, r2969, r2789; +} +{ +add.f16x2 r2975, r2966, r2972; +} +{ +sub.f16x2 r2978, r2823, r2824; +} +{ +mul.f16x2 r2981, r2978, r2791; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 %4, r2960, r2984; +} +{ +add.f16x2 r2990, r2793, r2794; +} +{ +mul.f16x2 r2993, r2990, r2780; +} +{ +add.f16x2 r2996, r2796, r2993; +} +{ +add.f16x2 r2999, r2799, r2800; +} +{ +mul.f16x2 r3002, r2999, r2788; +} +{ +add.f16x2 r3005, r2996, r3002; +} +{ +add.f16x2 r3008, r2805, r2806; +} +{ +mul.f16x2 r3011, r3008, r2790; +} +{ +add.f16x2 r3014, r3005, r3011; +} +{ +sub.f16x2 r3017, r2811, r2812; +} +{ +mul.f16x2 r3020, r3017, r2782; +} +{ +sub.f16x2 r3023, r2817, r2818; +} +{ +mul.f16x2 r3026, r3023, r2789; +} +{ +add.f16x2 r3029, r3020, r3026; +} +{ +sub.f16x2 r3032, r2823, r2824; +} +{ +mul.f16x2 r3035, r3032, r2791; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +add.f16x2 %10, r3014, r3038; +} +{ +add.f16x2 r3044, r2793, r2794; +} +{ +mul.f16x2 r3047, r3044, r2784; +} +{ +add.f16x2 r3050, r2796, r3047; +} +{ +add.f16x2 r3053, r2799, r2800; +} +{ +mul.f16x2 r3056, r3053, r2790; +} +{ +add.f16x2 r3059, r3050, r3056; +} +{ +add.f16x2 r3062, r2805, r2806; +} +{ +mul.f16x2 r3065, r3062, r2780; +} +{ +add.f16x2 r3068, r3059, r3065; +} +{ +sub.f16x2 r3071, r2811, r2812; +} +{ +mul.f16x2 r3074, r3071, r2786; +} +{ +sub.f16x2 r3077, r2817, r2818; +} +{ +mul.f16x2 r3080, r3077, r2791; +} +{ +add.f16x2 r3083, r3074, r3080; +} +{ +sub.f16x2 r3086, r2823, r2824; +} +{ +mul.f16x2 r3089, r3086, r2782; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 %6, r3068, r3092; +} +{ +add.f16x2 r3098, r2793, 
r2794; +} +{ +mul.f16x2 r3101, r3098, r2784; +} +{ +add.f16x2 r3104, r2796, r3101; +} +{ +add.f16x2 r3107, r2799, r2800; +} +{ +mul.f16x2 r3110, r3107, r2790; +} +{ +add.f16x2 r3113, r3104, r3110; +} +{ +add.f16x2 r3116, r2805, r2806; +} +{ +mul.f16x2 r3119, r3116, r2780; +} +{ +add.f16x2 r3122, r3113, r3119; +} +{ +sub.f16x2 r3125, r2811, r2812; +} +{ +mul.f16x2 r3128, r3125, r2786; +} +{ +sub.f16x2 r3131, r2817, r2818; +} +{ +mul.f16x2 r3134, r3131, r2791; +} +{ +add.f16x2 r3137, r3128, r3134; +} +{ +sub.f16x2 r3140, r2823, r2824; +} +{ +mul.f16x2 r3143, r3140, r2782; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +add.f16x2 %8, r3122, r3146; +} +{ +add.f16x2 r3152, r2811, r2812; +} +{ +mul.f16x2 r3155, r3152, r2776; +} +{ +add.f16x2 r3158, r2814, r3155; +} +{ +add.f16x2 r3161, r2817, r2818; +} +{ +mul.f16x2 r3164, r3161, r2780; +} +{ +add.f16x2 r3167, r3158, r3164; +} +{ +add.f16x2 r3170, r2823, r2824; +} +{ +mul.f16x2 r3173, r3170, r2784; +} +{ +add.f16x2 r3176, r3167, r3173; +} +{ +sub.f16x2 r3179, r2793, r2794; +} +{ +mul.f16x2 r3182, r3179, r2778; +} +{ +sub.f16x2 r3185, r2799, r2800; +} +{ +mul.f16x2 r3188, r3185, r2782; +} +{ +add.f16x2 r3191, r3182, r3188; +} +{ +sub.f16x2 r3194, r2805, r2806; +} +{ +mul.f16x2 r3197, r3194, r2786; +} +{ +add.f16x2 r3200, r3191, r3197; +} +{ +add.f16x2 %3, r3176, r3200; +} +{ +add.f16x2 r3206, r2811, r2812; +} +{ +mul.f16x2 r3209, r3206, r2776; +} +{ +add.f16x2 r3212, r2814, r3209; +} +{ +add.f16x2 r3215, r2817, r2818; +} +{ +mul.f16x2 r3218, r3215, r2780; +} +{ +add.f16x2 r3221, r3212, r3218; +} +{ +add.f16x2 r3224, r2823, r2824; +} +{ +mul.f16x2 r3227, r3224, r2784; +} +{ +add.f16x2 r3230, r3221, r3227; +} +{ +sub.f16x2 r3233, r2793, r2794; +} +{ +mul.f16x2 r3236, r3233, r2778; +} +{ +sub.f16x2 r3239, r2799, r2800; +} +{ +mul.f16x2 r3242, r3239, r2782; +} +{ +add.f16x2 r3245, r3236, r3242; +} +{ +sub.f16x2 r3248, r2805, r2806; +} +{ +mul.f16x2 r3251, r3248, r2786; +} +{ +add.f16x2 r3254, r3245, r3251; +} +{ +sub.f16x2 %13, 
r3230, r3254; +} +{ +add.f16x2 r3260, r2811, r2812; +} +{ +mul.f16x2 r3263, r3260, r2780; +} +{ +add.f16x2 r3266, r2814, r3263; +} +{ +add.f16x2 r3269, r2817, r2818; +} +{ +mul.f16x2 r3272, r3269, r2788; +} +{ +add.f16x2 r3275, r3266, r3272; +} +{ +add.f16x2 r3278, r2823, r2824; +} +{ +mul.f16x2 r3281, r3278, r2790; +} +{ +add.f16x2 r3284, r3275, r3281; +} +{ +sub.f16x2 r3287, r2793, r2794; +} +{ +mul.f16x2 r3290, r3287, r2782; +} +{ +sub.f16x2 r3293, r2799, r2800; +} +{ +mul.f16x2 r3296, r3293, r2789; +} +{ +add.f16x2 r3299, r3290, r3296; +} +{ +sub.f16x2 r3302, r2805, r2806; +} +{ +mul.f16x2 r3305, r3302, r2791; +} +{ +add.f16x2 r3308, r3299, r3305; +} +{ +add.f16x2 %5, r3284, r3308; +} +{ +add.f16x2 r3314, r2811, r2812; +} +{ +mul.f16x2 r3317, r3314, r2780; +} +{ +add.f16x2 r3320, r2814, r3317; +} +{ +add.f16x2 r3323, r2817, r2818; +} +{ +mul.f16x2 r3326, r3323, r2788; +} +{ +add.f16x2 r3329, r3320, r3326; +} +{ +add.f16x2 r3332, r2823, r2824; +} +{ +mul.f16x2 r3335, r3332, r2790; +} +{ +add.f16x2 r3338, r3329, r3335; +} +{ +sub.f16x2 r3341, r2793, r2794; +} +{ +mul.f16x2 r3344, r3341, r2782; +} +{ +sub.f16x2 r3347, r2799, r2800; +} +{ +mul.f16x2 r3350, r3347, r2789; +} +{ +add.f16x2 r3353, r3344, r3350; +} +{ +sub.f16x2 r3356, r2805, r2806; +} +{ +mul.f16x2 r3359, r3356, r2791; +} +{ +add.f16x2 r3362, r3353, r3359; +} +{ +sub.f16x2 %11, r3338, r3362; +} +{ +add.f16x2 r3368, r2811, r2812; +} +{ +mul.f16x2 r3371, r3368, r2784; +} +{ +add.f16x2 r3374, r2814, r3371; +} +{ +add.f16x2 r3377, r2817, r2818; +} +{ +mul.f16x2 r3380, r3377, r2790; +} +{ +add.f16x2 r3383, r3374, r3380; +} +{ +add.f16x2 r3386, r2823, r2824; +} +{ +mul.f16x2 r3389, r3386, r2780; +} +{ +add.f16x2 r3392, r3383, r3389; +} +{ +sub.f16x2 r3395, r2793, r2794; +} +{ +mul.f16x2 r3398, r3395, r2786; +} +{ +sub.f16x2 r3401, r2799, r2800; +} +{ +mul.f16x2 r3404, r3401, r2791; +} +{ +add.f16x2 r3407, r3398, r3404; +} +{ +sub.f16x2 r3410, r2805, r2806; +} +{ +mul.f16x2 r3413, r3410, r2782; +} +{ 
+add.f16x2 r3416, r3407, r3413; +} +{ +add.f16x2 %7, r3392, r3416; +} +{ +add.f16x2 r3422, r2811, r2812; +} +{ +mul.f16x2 r3425, r3422, r2784; +} +{ +add.f16x2 r3428, r2814, r3425; +} +{ +add.f16x2 r3431, r2817, r2818; +} +{ +mul.f16x2 r3434, r3431, r2790; +} +{ +add.f16x2 r3437, r3428, r3434; +} +{ +add.f16x2 r3440, r2823, r2824; +} +{ +mul.f16x2 r3443, r3440, r2780; +} +{ +add.f16x2 r3446, r3437, r3443; +} +{ +sub.f16x2 r3449, r2793, r2794; +} +{ +mul.f16x2 r3452, r3449, r2786; +} +{ +sub.f16x2 r3455, r2799, r2800; +} +{ +mul.f16x2 r3458, r3455, r2791; +} +{ +add.f16x2 r3461, r3452, r3458; +} +{ +sub.f16x2 r3464, r2805, r2806; +} +{ +mul.f16x2 r3467, r3464, r2782; +} +{ +add.f16x2 r3470, r3461, r3467; +} +{ +sub.f16x2 %9, r3446, r3470; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..e7132ae6b221f --- 
/dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_fwd.hpp.inc @@ -0,0 +1,1296 @@ +#ifndef CUFFTDX_FFT_2401_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_2401_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<176, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<642>; +.reg .b32 r<27>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 19208, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %20, %34; +add.f32 f30, %18, f29; +add.f32 f31, %23, %31; +add.f32 f32, f31, f30; +add.f32 f33, %26, %28; +add.f32 f34, %22, %35; +add.f32 f35, %19, f34; +add.f32 f36, %25, %33; +add.f32 f37, f36, f35; +add.f32 f38, %27, %30; +fma.rn.f32 f39, f29, 0f3F1F9D07, %18; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %22, %35; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %25, %33; +mul.f32 f47, f46, 0fBF7994E0; +sub.f32 f48, f47, f45; +sub.f32 f49, %27, %30; +mul.f32 f50, f49, 0f3EDE2602; +sub.f32 f51, f48, f50; +sub.f32 f52, f43, f51; +add.f32 f53, f51, f43; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %18, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f44, 0f3F7994E0; +mul.f32 f60, f46, 0f3EDE2602; +sub.f32 f61, f60, f59; +fma.rn.f32 f62, f49, 0f3F48261C, f61; +sub.f32 f63, f58, f62; +add.f32 f64, f62, f58; +mul.f32 f65, f29, 0f3F66A5E5; +sub.f32 f66, %18, f65; +fma.rn.f32 f67, f31, 0f3F1F9D07, f66; +mul.f32 f68, f33, 0f3E63DC87; +sub.f32 f69, f67, f68; +mul.f32 f70, f44, 0f3EDE2602; +mul.f32 f71, f46, 0f3F48261C; +sub.f32 f72, f71, f70; +mul.f32 f73, f49, 0f3F7994E0; +sub.f32 f74, f72, f73; +sub.f32 f75, f69, f74; +add.f32 f76, f74, f69; +fma.rn.f32 f77, f34, 0f3F1F9D07, %19; +mul.f32 f78, f36, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f38, 0f3F66A5E5; +sub.f32 
f81, f79, f80; +sub.f32 f82, %20, %34; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %23, %31; +mul.f32 f85, f84, 0fBF7994E0; +sub.f32 f86, f85, f83; +sub.f32 f87, %26, %28; +mul.f32 f88, f87, 0f3EDE2602; +sub.f32 f89, f86, f88; +add.f32 f90, f89, f81; +sub.f32 f91, f81, f89; +mul.f32 f92, f34, 0f3E63DC87; +sub.f32 f93, %19, f92; +mul.f32 f94, f36, 0f3F66A5E5; +sub.f32 f95, f93, f94; +fma.rn.f32 f96, f38, 0f3F1F9D07, f95; +mul.f32 f97, f82, 0f3F7994E0; +mul.f32 f98, f84, 0f3EDE2602; +sub.f32 f99, f98, f97; +fma.rn.f32 f100, f87, 0f3F48261C, f99; +add.f32 f101, f100, f96; +sub.f32 f102, f96, f100; +mul.f32 f103, f34, 0f3F66A5E5; +sub.f32 f104, %19, f103; +fma.rn.f32 f105, f36, 0f3F1F9D07, f104; +mul.f32 f106, f38, 0f3E63DC87; +sub.f32 f107, f105, f106; +mul.f32 f108, f82, 0f3EDE2602; +mul.f32 f109, f84, 0f3F48261C; +sub.f32 f110, f109, f108; +mul.f32 f111, f87, 0f3F7994E0; +sub.f32 f112, f110, f111; +add.f32 f113, f112, f107; +sub.f32 f114, f107, f112; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 19208, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f115, f116}, [rd6]; +mul.f32 f119, f115, f52; +mul.f32 f120, f116, f90; +mul.f32 f121, f115, f90; +mul.f32 f122, f115, f115; +mul.f32 f123, f116, f116; +sub.f32 f124, f122, f123; +mul.f32 f125, f116, f115; +fma.rn.f32 f126, f116, f115, f125; +mul.f32 f127, f124, f63; +mul.f32 f128, f126, f101; +mul.f32 f129, f124, f101; +mul.f32 f130, f115, f124; +mul.f32 f131, f116, f126; +sub.f32 f132, f130, f131; +mul.f32 f133, f115, f126; +fma.rn.f32 f134, f116, f124, f133; +mul.f32 f135, f132, f75; +mul.f32 f136, f134, f113; +mul.f32 f137, f132, f113; +mul.f32 f138, f115, f132; +mul.f32 f139, f116, f134; +sub.f32 f140, f138, f139; +mul.f32 f141, f115, f134; +fma.rn.f32 f142, f116, f132, f141; +mul.f32 f143, f140, f76; +mul.f32 f144, f142, f114; +mul.f32 f145, f140, f114; +mul.f32 f146, 
f115, f140; +mul.f32 f147, f116, f142; +sub.f32 f148, f146, f147; +mul.f32 f149, f115, f142; +fma.rn.f32 f150, f116, f140, f149; +mul.f32 f151, f148, f64; +mul.f32 f152, f150, f102; +mul.f32 f153, f148, f102; +mul.f32 f154, f115, f148; +mul.f32 f155, f116, f150; +sub.f32 f156, f154, f155; +mul.f32 f157, f115, f150; +fma.rn.f32 f158, f116, f148, f157; +mul.f32 f159, f156, f53; +mul.f32 f160, f158, f91; +mul.f32 f161, f156, f91; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +add.f32 f162, f38, f37; +add.f32 f163, f33, f32; +st.shared.v2.f32 [r9], {f163, f162}; +fma.rn.f32 f164, f116, f52, f121; +sub.f32 f165, f119, f120; +st.shared.v2.f32 [r9+8], {f165, f164}; +fma.rn.f32 f166, f126, f63, f129; +sub.f32 f167, f127, f128; +st.shared.v2.f32 [r9+16], {f167, f166}; +sub.f32 f168, f135, f136; +fma.rn.f32 f169, f134, f75, f137; +st.shared.v2.f32 [r9+24], {f168, f169}; +fma.rn.f32 f170, f142, f76, f145; +sub.f32 f171, f143, f144; +st.shared.v2.f32 [r9+32], {f171, f170}; +fma.rn.f32 f172, f150, f64, f153; +sub.f32 f173, f151, f152; +st.shared.v2.f32 [r9+40], {f173, f172}; +fma.rn.f32 f174, f158, f53, f161; +sub.f32 f175, f159, f160; +st.shared.v2.f32 [r9+48], {f175, f174}; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.v2.f32 {f176, f177}, [r10]; +ld.shared.v2.f32 {f180, f181}, [r10+2744]; +ld.shared.v2.f32 {f184, f185}, [r10+5488]; +ld.shared.v2.f32 {f188, f189}, [r10+8232]; +ld.shared.v2.f32 {f192, f193}, [r10+10976]; +ld.shared.v2.f32 {f196, f197}, [r10+13720]; +ld.shared.v2.f32 {f200, f201}, [r10+16464]; +add.f32 f204, f180, f200; +add.f32 f205, f176, f204; +add.f32 f206, f184, f196; +add.f32 f207, f206, f205; +add.f32 f208, f188, f192; +add.f32 f209, f181, f201; +add.f32 f210, f177, f209; +add.f32 f211, f185, f197; +add.f32 f212, f211, f210; +add.f32 f213, f189, f193; +fma.rn.f32 f214, f204, 0f3F1F9D07, f176; +mul.f32 f215, f206, 0f3E63DC87; +sub.f32 f216, f214, f215; +mul.f32 f217, f208, 0f3F66A5E5; +sub.f32 f218, f216, f217; +sub.f32 f219, f181, f201; 
+mul.f32 f220, f219, 0f3F48261C; +sub.f32 f221, f185, f197; +mul.f32 f222, f221, 0fBF7994E0; +sub.f32 f223, f222, f220; +sub.f32 f224, f189, f193; +mul.f32 f225, f224, 0f3EDE2602; +sub.f32 f226, f223, f225; +sub.f32 f227, f218, f226; +add.f32 f228, f226, f218; +mul.f32 f229, f204, 0f3E63DC87; +sub.f32 f230, f176, f229; +mul.f32 f231, f206, 0f3F66A5E5; +sub.f32 f232, f230, f231; +fma.rn.f32 f233, f208, 0f3F1F9D07, f232; +mul.f32 f234, f219, 0f3F7994E0; +mul.f32 f235, f221, 0f3EDE2602; +sub.f32 f236, f235, f234; +fma.rn.f32 f237, f224, 0f3F48261C, f236; +sub.f32 f238, f233, f237; +add.f32 f239, f237, f233; +mul.f32 f240, f204, 0f3F66A5E5; +sub.f32 f241, f176, f240; +fma.rn.f32 f242, f206, 0f3F1F9D07, f241; +mul.f32 f243, f208, 0f3E63DC87; +sub.f32 f244, f242, f243; +mul.f32 f245, f219, 0f3EDE2602; +mul.f32 f246, f221, 0f3F48261C; +sub.f32 f247, f246, f245; +mul.f32 f248, f224, 0f3F7994E0; +sub.f32 f249, f247, f248; +sub.f32 f250, f244, f249; +add.f32 f251, f249, f244; +fma.rn.f32 f252, f209, 0f3F1F9D07, f177; +mul.f32 f253, f211, 0f3E63DC87; +sub.f32 f254, f252, f253; +mul.f32 f255, f213, 0f3F66A5E5; +sub.f32 f256, f254, f255; +sub.f32 f257, f180, f200; +mul.f32 f258, f257, 0f3F48261C; +sub.f32 f259, f184, f196; +mul.f32 f260, f259, 0fBF7994E0; +sub.f32 f261, f260, f258; +sub.f32 f262, f188, f192; +mul.f32 f263, f262, 0f3EDE2602; +sub.f32 f264, f261, f263; +add.f32 f265, f264, f256; +sub.f32 f266, f256, f264; +mul.f32 f267, f209, 0f3E63DC87; +sub.f32 f268, f177, f267; +mul.f32 f269, f211, 0f3F66A5E5; +sub.f32 f270, f268, f269; +fma.rn.f32 f271, f213, 0f3F1F9D07, f270; +mul.f32 f272, f257, 0f3F7994E0; +mul.f32 f273, f259, 0f3EDE2602; +sub.f32 f274, f273, f272; +fma.rn.f32 f275, f262, 0f3F48261C, f274; +add.f32 f276, f275, f271; +sub.f32 f277, f271, f275; +mul.f32 f278, f209, 0f3F66A5E5; +sub.f32 f279, f177, f278; +fma.rn.f32 f280, f211, 0f3F1F9D07, f279; +mul.f32 f281, f213, 0f3E63DC87; +sub.f32 f282, f280, f281; +mul.f32 f283, f257, 0f3EDE2602; +mul.f32 f284, f259, 
0f3F48261C; +sub.f32 f285, f284, f283; +mul.f32 f286, f262, 0f3F7994E0; +sub.f32 f287, f285, f286; +add.f32 f288, f287, f282; +sub.f32 f289, f282, f287; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f290, f291}, [rd11]; +mul.f32 f294, f290, f227; +mul.f32 f295, f291, f265; +mul.f32 f296, f290, f265; +mul.f32 f297, f290, f290; +mul.f32 f298, f291, f291; +sub.f32 f299, f297, f298; +mul.f32 f300, f291, f290; +fma.rn.f32 f301, f291, f290, f300; +mul.f32 f302, f299, f238; +mul.f32 f303, f301, f276; +mul.f32 f304, f299, f276; +mul.f32 f305, f290, f299; +mul.f32 f306, f291, f301; +sub.f32 f307, f305, f306; +mul.f32 f308, f290, f301; +fma.rn.f32 f309, f291, f299, f308; +mul.f32 f310, f307, f250; +mul.f32 f311, f309, f288; +mul.f32 f312, f307, f288; +mul.f32 f313, f290, f307; +mul.f32 f314, f291, f309; +sub.f32 f315, f313, f314; +mul.f32 f316, f290, f309; +fma.rn.f32 f317, f291, f307, f316; +mul.f32 f318, f315, f251; +mul.f32 f319, f317, f289; +mul.f32 f320, f315, f289; +mul.f32 f321, f290, f315; +mul.f32 f322, f291, f317; +sub.f32 f323, f321, f322; +mul.f32 f324, f290, f317; +fma.rn.f32 f325, f291, f315, f324; +mul.f32 f326, f323, f239; +mul.f32 f327, f325, f277; +mul.f32 f328, f323, f277; +mul.f32 f329, f290, f323; +mul.f32 f330, f291, f325; +sub.f32 f331, f329, f330; +mul.f32 f332, f290, f325; +fma.rn.f32 f333, f291, f323, f332; +mul.f32 f334, f331, f228; +mul.f32 f335, f333, f266; +mul.f32 f336, f331, f266; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +add.f32 f337, f213, f212; +add.f32 f338, f208, f207; +st.shared.v2.f32 [r20], {f338, f337}; +fma.rn.f32 f339, f291, f227, f296; +sub.f32 f340, f294, f295; +st.shared.v2.f32 [r20+56], {f340, f339}; +fma.rn.f32 
f341, f301, f238, f304; +sub.f32 f342, f302, f303; +st.shared.v2.f32 [r20+112], {f342, f341}; +fma.rn.f32 f343, f309, f250, f312; +sub.f32 f344, f310, f311; +st.shared.v2.f32 [r20+168], {f344, f343}; +sub.f32 f345, f318, f319; +fma.rn.f32 f346, f317, f251, f320; +st.shared.v2.f32 [r20+224], {f345, f346}; +fma.rn.f32 f347, f325, f239, f328; +sub.f32 f348, f326, f327; +st.shared.v2.f32 [r20+280], {f348, f347}; +fma.rn.f32 f349, f333, f228, f336; +sub.f32 f350, f334, f335; +st.shared.v2.f32 [r20+336], {f350, f349}; +barrier.sync 0; +ld.shared.v2.f32 {f351, f352}, [r10]; +ld.shared.v2.f32 {f355, f356}, [r10+2744]; +ld.shared.v2.f32 {f359, f360}, [r10+5488]; +ld.shared.v2.f32 {f363, f364}, [r10+8232]; +ld.shared.v2.f32 {f367, f368}, [r10+10976]; +ld.shared.v2.f32 {f371, f372}, [r10+13720]; +ld.shared.v2.f32 {f375, f376}, [r10+16464]; +add.f32 f379, f355, f375; +add.f32 f380, f351, f379; +add.f32 f381, f359, f371; +add.f32 f382, f381, f380; +add.f32 f383, f363, f367; +add.f32 f384, f356, f376; +add.f32 f385, f352, f384; +add.f32 f386, f360, f372; +add.f32 f387, f386, f385; +add.f32 f388, f364, f368; +fma.rn.f32 f389, f379, 0f3F1F9D07, f351; +mul.f32 f390, f381, 0f3E63DC87; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, 0f3F66A5E5; +sub.f32 f393, f391, f392; +sub.f32 f394, f356, f376; +mul.f32 f395, f394, 0f3F48261C; +sub.f32 f396, f360, f372; +mul.f32 f397, f396, 0fBF7994E0; +sub.f32 f398, f397, f395; +sub.f32 f399, f364, f368; +mul.f32 f400, f399, 0f3EDE2602; +sub.f32 f401, f398, f400; +sub.f32 f402, f393, f401; +add.f32 f403, f401, f393; +mul.f32 f404, f379, 0f3E63DC87; +sub.f32 f405, f351, f404; +mul.f32 f406, f381, 0f3F66A5E5; +sub.f32 f407, f405, f406; +fma.rn.f32 f408, f383, 0f3F1F9D07, f407; +mul.f32 f409, f394, 0f3F7994E0; +mul.f32 f410, f396, 0f3EDE2602; +sub.f32 f411, f410, f409; +fma.rn.f32 f412, f399, 0f3F48261C, f411; +sub.f32 f413, f408, f412; +add.f32 f414, f412, f408; +mul.f32 f415, f379, 0f3F66A5E5; +sub.f32 f416, f351, f415; +fma.rn.f32 f417, f381, 
0f3F1F9D07, f416; +mul.f32 f418, f383, 0f3E63DC87; +sub.f32 f419, f417, f418; +mul.f32 f420, f394, 0f3EDE2602; +mul.f32 f421, f396, 0f3F48261C; +sub.f32 f422, f421, f420; +mul.f32 f423, f399, 0f3F7994E0; +sub.f32 f424, f422, f423; +sub.f32 f425, f419, f424; +add.f32 f426, f424, f419; +fma.rn.f32 f427, f384, 0f3F1F9D07, f352; +mul.f32 f428, f386, 0f3E63DC87; +sub.f32 f429, f427, f428; +mul.f32 f430, f388, 0f3F66A5E5; +sub.f32 f431, f429, f430; +sub.f32 f432, f355, f375; +mul.f32 f433, f432, 0f3F48261C; +sub.f32 f434, f359, f371; +mul.f32 f435, f434, 0fBF7994E0; +sub.f32 f436, f435, f433; +sub.f32 f437, f363, f367; +mul.f32 f438, f437, 0f3EDE2602; +sub.f32 f439, f436, f438; +add.f32 f440, f439, f431; +sub.f32 f441, f431, f439; +mul.f32 f442, f384, 0f3E63DC87; +sub.f32 f443, f352, f442; +mul.f32 f444, f386, 0f3F66A5E5; +sub.f32 f445, f443, f444; +fma.rn.f32 f446, f388, 0f3F1F9D07, f445; +mul.f32 f447, f432, 0f3F7994E0; +mul.f32 f448, f434, 0f3EDE2602; +sub.f32 f449, f448, f447; +fma.rn.f32 f450, f437, 0f3F48261C, f449; +add.f32 f451, f450, f446; +sub.f32 f452, f446, f450; +mul.f32 f453, f384, 0f3F66A5E5; +sub.f32 f454, f352, f453; +fma.rn.f32 f455, f386, 0f3F1F9D07, f454; +mul.f32 f456, f388, 0f3E63DC87; +sub.f32 f457, f455, f456; +mul.f32 f458, f432, 0f3EDE2602; +mul.f32 f459, f434, 0f3F48261C; +sub.f32 f460, f459, f458; +mul.f32 f461, f437, 0f3F7994E0; +sub.f32 f462, f460, f461; +add.f32 f463, f462, f457; +sub.f32 f464, f457, f462; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 8; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f465, f466}, [rd16]; +mul.f32 f469, f465, f402; +mul.f32 f470, f466, f440; +mul.f32 f471, f465, f440; +mul.f32 f472, f465, f465; +mul.f32 f473, f466, f466; +sub.f32 f474, f472, f473; +mul.f32 f475, f466, f465; +fma.rn.f32 f476, f466, f465, f475; +mul.f32 f477, f474, f413; +mul.f32 f478, f476, f451; +mul.f32 f479, 
f474, f451; +mul.f32 f480, f465, f474; +mul.f32 f481, f466, f476; +sub.f32 f482, f480, f481; +mul.f32 f483, f465, f476; +fma.rn.f32 f484, f466, f474, f483; +mul.f32 f485, f482, f425; +mul.f32 f486, f484, f463; +mul.f32 f487, f482, f463; +mul.f32 f488, f465, f482; +mul.f32 f489, f466, f484; +sub.f32 f490, f488, f489; +mul.f32 f491, f465, f484; +fma.rn.f32 f492, f466, f482, f491; +mul.f32 f493, f490, f426; +mul.f32 f494, f492, f464; +mul.f32 f495, f490, f464; +mul.f32 f496, f465, f490; +mul.f32 f497, f466, f492; +sub.f32 f498, f496, f497; +mul.f32 f499, f465, f492; +fma.rn.f32 f500, f466, f490, f499; +mul.f32 f501, f498, f414; +mul.f32 f502, f500, f452; +mul.f32 f503, f498, f452; +mul.f32 f504, f465, f498; +mul.f32 f505, f466, f500; +sub.f32 f506, f504, f505; +mul.f32 f507, f465, f500; +fma.rn.f32 f508, f466, f498, f507; +mul.f32 f509, f506, f403; +mul.f32 f510, f508, f441; +mul.f32 f511, f506, f441; +shl.b32 r24, r23, 3; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 2744, r25; +add.f32 f512, f388, f387; +add.f32 f513, f383, f382; +st.shared.v2.f32 [r26], {f513, f512}; +fma.rn.f32 f514, f466, f402, f471; +sub.f32 f515, f469, f470; +st.shared.v2.f32 [r26+392], {f515, f514}; +fma.rn.f32 f516, f476, f413, f479; +sub.f32 f517, f477, f478; +st.shared.v2.f32 [r26+784], {f517, f516}; +fma.rn.f32 f518, f484, f425, f487; +sub.f32 f519, f485, f486; +st.shared.v2.f32 [r26+1176], {f519, f518}; +fma.rn.f32 f520, f492, f426, f495; +sub.f32 f521, f493, f494; +st.shared.v2.f32 [r26+1568], {f521, f520}; +sub.f32 f522, f501, f502; +fma.rn.f32 f523, f500, f414, f503; +st.shared.v2.f32 [r26+1960], {f522, f523}; +fma.rn.f32 f524, f508, f403, f511; +sub.f32 f525, f509, f510; +st.shared.v2.f32 [r26+2352], {f525, f524}; +barrier.sync 0; +ld.shared.v2.f32 {f526, f527}, [r10]; +ld.shared.v2.f32 {f530, f531}, [r10+2744]; +ld.shared.v2.f32 {f534, f535}, [r10+5488]; +ld.shared.v2.f32 {f538, f539}, [r10+8232]; +ld.shared.v2.f32 {f542, f543}, [r10+10976]; +ld.shared.v2.f32 {f546, 
f547}, [r10+13720]; +ld.shared.v2.f32 {f550, f551}, [r10+16464]; +add.f32 f554, f530, f550; +add.f32 f555, f526, f554; +add.f32 f556, f534, f546; +add.f32 f557, f556, f555; +add.f32 f558, f538, f542; +add.f32 f559, f531, f551; +add.f32 f560, f527, f559; +add.f32 f561, f535, f547; +add.f32 f562, f561, f560; +add.f32 f563, f539, f543; +fma.rn.f32 f564, f554, 0f3F1F9D07, f526; +mul.f32 f565, f556, 0f3E63DC87; +sub.f32 f566, f564, f565; +mul.f32 f567, f558, 0f3F66A5E5; +sub.f32 f568, f566, f567; +sub.f32 f569, f531, f551; +mul.f32 f570, f569, 0f3F48261C; +sub.f32 f571, f535, f547; +mul.f32 f572, f571, 0fBF7994E0; +sub.f32 f573, f572, f570; +sub.f32 f574, f539, f543; +mul.f32 f575, f574, 0f3EDE2602; +sub.f32 f576, f573, f575; +mul.f32 f577, f554, 0f3E63DC87; +sub.f32 f578, f526, f577; +mul.f32 f579, f556, 0f3F66A5E5; +sub.f32 f580, f578, f579; +fma.rn.f32 f581, f558, 0f3F1F9D07, f580; +mul.f32 f582, f569, 0f3F7994E0; +mul.f32 f583, f571, 0f3EDE2602; +sub.f32 f584, f583, f582; +fma.rn.f32 f585, f574, 0f3F48261C, f584; +mul.f32 f586, f554, 0f3F66A5E5; +sub.f32 f587, f526, f586; +fma.rn.f32 f588, f556, 0f3F1F9D07, f587; +mul.f32 f589, f558, 0f3E63DC87; +sub.f32 f590, f588, f589; +mul.f32 f591, f569, 0f3EDE2602; +mul.f32 f592, f571, 0f3F48261C; +sub.f32 f593, f592, f591; +mul.f32 f594, f574, 0f3F7994E0; +sub.f32 f595, f593, f594; +fma.rn.f32 f596, f559, 0f3F1F9D07, f527; +mul.f32 f597, f561, 0f3E63DC87; +sub.f32 f598, f596, f597; +mul.f32 f599, f563, 0f3F66A5E5; +sub.f32 f600, f598, f599; +sub.f32 f601, f530, f550; +mul.f32 f602, f601, 0f3F48261C; +sub.f32 f603, f534, f546; +mul.f32 f604, f603, 0fBF7994E0; +sub.f32 f605, f604, f602; +sub.f32 f606, f538, f542; +mul.f32 f607, f606, 0f3EDE2602; +sub.f32 f608, f605, f607; +mul.f32 f609, f559, 0f3E63DC87; +sub.f32 f610, f527, f609; +mul.f32 f611, f561, 0f3F66A5E5; +sub.f32 f612, f610, f611; +fma.rn.f32 f613, f563, 0f3F1F9D07, f612; +mul.f32 f614, f601, 0f3F7994E0; +mul.f32 f615, f603, 0f3EDE2602; +sub.f32 f616, f615, f614; 
+fma.rn.f32 f617, f606, 0f3F48261C, f616; +mul.f32 f618, f559, 0f3F66A5E5; +sub.f32 f619, f527, f618; +fma.rn.f32 f620, f561, 0f3F1F9D07, f619; +mul.f32 f621, f563, 0f3E63DC87; +sub.f32 f622, f620, f621; +mul.f32 f623, f601, 0f3EDE2602; +mul.f32 f624, f603, 0f3F48261C; +sub.f32 f625, f624, f623; +mul.f32 f626, f606, 0f3F7994E0; +sub.f32 f627, f625, f626; +add.f32 %1, f563, f562; +add.f32 %0, f558, f557; +add.f32 %3, f608, f600; +sub.f32 %2, f568, f576; +add.f32 %5, f617, f613; +sub.f32 %4, f581, f585; +add.f32 %7, f627, f622; +sub.f32 %6, f590, f595; +sub.f32 %9, f622, f627; +add.f32 %8, f595, f590; +sub.f32 %11, f613, f617; +add.f32 %10, f585, f581; +sub.f32 %13, f600, f608; +add.f32 %12, f576, f568; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<177, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<600>; +.reg .b32 r<27>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 9604, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %20, %34; +add.f32 f30, %18, f29; +add.f32 f31, %23, %31; +add.f32 f32, f31, f30; +add.f32 f33, %26, %28; +add.f32 f34, f33, f32; +add.f32 f35, %22, %35; +add.f32 f36, %19, f35; +add.f32 f37, %25, %33; +add.f32 f38, f37, f36; +add.f32 f39, %27, %30; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %18; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, 
f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %22, %35; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %25, %33; +mul.f32 f49, f48, 0fBF7994E0; +sub.f32 f50, f49, f47; +sub.f32 f51, %27, %30; +mul.f32 f52, f51, 0f3EDE2602; +sub.f32 f53, f50, f52; +sub.f32 f54, f45, f53; +add.f32 f55, f53, f45; +mul.f32 f56, f29, 0f3E63DC87; +sub.f32 f57, %18, f56; +mul.f32 f58, f31, 0f3F66A5E5; +sub.f32 f59, f57, f58; +fma.rn.f32 f60, f33, 0f3F1F9D07, f59; +mul.f32 f61, f46, 0f3F7994E0; +mul.f32 f62, f48, 0f3EDE2602; +sub.f32 f63, f62, f61; +fma.rn.f32 f64, f51, 0f3F48261C, f63; +sub.f32 f65, f60, f64; +add.f32 f66, f64, f60; +mul.f32 f67, f29, 0f3F66A5E5; +sub.f32 f68, %18, f67; +fma.rn.f32 f69, f31, 0f3F1F9D07, f68; +mul.f32 f70, f33, 0f3E63DC87; +sub.f32 f71, f69, f70; +mul.f32 f72, f46, 0f3EDE2602; +mul.f32 f73, f48, 0f3F48261C; +sub.f32 f74, f73, f72; +mul.f32 f75, f51, 0f3F7994E0; +sub.f32 f76, f74, f75; +sub.f32 f77, f71, f76; +add.f32 f78, f76, f71; +fma.rn.f32 f79, f35, 0f3F1F9D07, %19; +mul.f32 f80, f37, 0f3E63DC87; +sub.f32 f81, f79, f80; +mul.f32 f82, f39, 0f3F66A5E5; +sub.f32 f83, f81, f82; +sub.f32 f84, %20, %34; +mul.f32 f85, f84, 0f3F48261C; +sub.f32 f86, %23, %31; +mul.f32 f87, f86, 0fBF7994E0; +sub.f32 f88, f87, f85; +sub.f32 f89, %26, %28; +mul.f32 f90, f89, 0f3EDE2602; +sub.f32 f91, f88, f90; +add.f32 f92, f91, f83; +sub.f32 f93, f83, f91; +mul.f32 f94, f35, 0f3E63DC87; +sub.f32 f95, %19, f94; +mul.f32 f96, f37, 0f3F66A5E5; +sub.f32 f97, f95, f96; +fma.rn.f32 f98, f39, 0f3F1F9D07, f97; +mul.f32 f99, f84, 0f3F7994E0; +mul.f32 f100, f86, 0f3EDE2602; +sub.f32 f101, f100, f99; +fma.rn.f32 f102, f89, 0f3F48261C, f101; +add.f32 f103, f102, f98; +sub.f32 f104, f98, f102; +mul.f32 f105, f35, 0f3F66A5E5; +sub.f32 f106, %19, f105; +fma.rn.f32 f107, f37, 0f3F1F9D07, f106; +mul.f32 f108, f39, 0f3E63DC87; +sub.f32 f109, f107, f108; +mul.f32 f110, f84, 0f3EDE2602; +mul.f32 f111, f86, 0f3F48261C; +sub.f32 f112, f111, f110; +mul.f32 f113, f89, 
0f3F7994E0; +sub.f32 f114, f112, f113; +add.f32 f115, f114, f109; +sub.f32 f116, f109, f114; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f117, f118}, [rd6]; +mul.f32 f121, f117, f54; +mul.f32 f122, f118, f92; +sub.f32 f123, f121, f122; +mul.f32 f124, f117, f92; +fma.rn.f32 f125, f118, f54, f124; +mul.f32 f126, f117, f117; +mul.f32 f127, f118, f118; +sub.f32 f128, f126, f127; +mul.f32 f129, f118, f117; +fma.rn.f32 f130, f118, f117, f129; +mul.f32 f131, f128, f65; +mul.f32 f132, f130, f103; +sub.f32 f133, f131, f132; +mul.f32 f134, f128, f103; +fma.rn.f32 f135, f130, f65, f134; +mul.f32 f136, f117, f128; +mul.f32 f137, f118, f130; +sub.f32 f138, f136, f137; +mul.f32 f139, f117, f130; +fma.rn.f32 f140, f118, f128, f139; +mul.f32 f141, f138, f77; +mul.f32 f142, f140, f115; +sub.f32 f143, f141, f142; +mul.f32 f144, f138, f115; +fma.rn.f32 f145, f140, f77, f144; +mul.f32 f146, f117, f138; +mul.f32 f147, f118, f140; +sub.f32 f148, f146, f147; +mul.f32 f149, f117, f140; +fma.rn.f32 f150, f118, f138, f149; +mul.f32 f151, f148, f78; +mul.f32 f152, f150, f116; +sub.f32 f153, f151, f152; +mul.f32 f154, f148, f116; +fma.rn.f32 f155, f150, f78, f154; +mul.f32 f156, f117, f148; +mul.f32 f157, f118, f150; +sub.f32 f158, f156, f157; +mul.f32 f159, f117, f150; +fma.rn.f32 f160, f118, f148, f159; +mul.f32 f161, f158, f66; +mul.f32 f162, f160, f104; +sub.f32 f163, f161, f162; +mul.f32 f164, f158, f104; +fma.rn.f32 f165, f160, f66, f164; +mul.f32 f166, f117, f158; +mul.f32 f167, f118, f160; +sub.f32 f168, f166, f167; +mul.f32 f169, f117, f160; +fma.rn.f32 f170, f118, f158, f169; +mul.f32 f171, f168, f55; +mul.f32 f172, f170, f93; +sub.f32 f173, f171, f172; +mul.f32 f174, f168, f93; +fma.rn.f32 f175, f170, f55, f174; +mad.lo.s32 r8, r5, 9604, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 28, r8; +st.shared.f32 [r9], f34; 
+st.shared.f32 [r9+4], f123; +st.shared.f32 [r9+8], f133; +st.shared.f32 [r9+12], f143; +st.shared.f32 [r9+16], f153; +st.shared.f32 [r9+20], f163; +st.shared.f32 [r9+24], f173; +barrier.sync 0; +mad.lo.s32 r10, r7, -24, r9; +ld.shared.f32 f176, [r10]; +ld.shared.f32 f177, [r10+1372]; +ld.shared.f32 f178, [r10+2744]; +ld.shared.f32 f179, [r10+4116]; +ld.shared.f32 f180, [r10+5488]; +ld.shared.f32 f181, [r10+6860]; +ld.shared.f32 f182, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r9], f40; +st.shared.f32 [r9+4], f125; +st.shared.f32 [r9+8], f135; +st.shared.f32 [r9+12], f145; +st.shared.f32 [r9+16], f155; +st.shared.f32 [r9+20], f165; +st.shared.f32 [r9+24], f175; +barrier.sync 0; +ld.shared.f32 f183, [r10]; +ld.shared.f32 f184, [r10+1372]; +ld.shared.f32 f185, [r10+2744]; +ld.shared.f32 f186, [r10+4116]; +ld.shared.f32 f187, [r10+5488]; +ld.shared.f32 f188, [r10+6860]; +ld.shared.f32 f189, [r10+8232]; +add.f32 f190, f177, f182; +add.f32 f191, f176, f190; +add.f32 f192, f178, f181; +add.f32 f193, f192, f191; +add.f32 f194, f179, f180; +add.f32 f195, f194, f193; +add.f32 f196, f184, f189; +add.f32 f197, f183, f196; +add.f32 f198, f185, f188; +add.f32 f199, f198, f197; +add.f32 f200, f186, f187; +add.f32 f201, f200, f199; +fma.rn.f32 f202, f190, 0f3F1F9D07, f176; +mul.f32 f203, f192, 0f3E63DC87; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, 0f3F66A5E5; +sub.f32 f206, f204, f205; +sub.f32 f207, f184, f189; +mul.f32 f208, f207, 0f3F48261C; +sub.f32 f209, f185, f188; +mul.f32 f210, f209, 0fBF7994E0; +sub.f32 f211, f210, f208; +sub.f32 f212, f186, f187; +mul.f32 f213, f212, 0f3EDE2602; +sub.f32 f214, f211, f213; +sub.f32 f215, f206, f214; +add.f32 f216, f214, f206; +mul.f32 f217, f190, 0f3E63DC87; +sub.f32 f218, f176, f217; +mul.f32 f219, f192, 0f3F66A5E5; +sub.f32 f220, f218, f219; +fma.rn.f32 f221, f194, 0f3F1F9D07, f220; +mul.f32 f222, f207, 0f3F7994E0; +mul.f32 f223, f209, 0f3EDE2602; +sub.f32 f224, f223, f222; +fma.rn.f32 f225, f212, 0f3F48261C, f224; +sub.f32 
f226, f221, f225; +add.f32 f227, f225, f221; +mul.f32 f228, f190, 0f3F66A5E5; +sub.f32 f229, f176, f228; +fma.rn.f32 f230, f192, 0f3F1F9D07, f229; +mul.f32 f231, f194, 0f3E63DC87; +sub.f32 f232, f230, f231; +mul.f32 f233, f207, 0f3EDE2602; +mul.f32 f234, f209, 0f3F48261C; +sub.f32 f235, f234, f233; +mul.f32 f236, f212, 0f3F7994E0; +sub.f32 f237, f235, f236; +sub.f32 f238, f232, f237; +add.f32 f239, f237, f232; +fma.rn.f32 f240, f196, 0f3F1F9D07, f183; +mul.f32 f241, f198, 0f3E63DC87; +sub.f32 f242, f240, f241; +mul.f32 f243, f200, 0f3F66A5E5; +sub.f32 f244, f242, f243; +sub.f32 f245, f177, f182; +mul.f32 f246, f245, 0f3F48261C; +sub.f32 f247, f178, f181; +mul.f32 f248, f247, 0fBF7994E0; +sub.f32 f249, f248, f246; +sub.f32 f250, f179, f180; +mul.f32 f251, f250, 0f3EDE2602; +sub.f32 f252, f249, f251; +add.f32 f253, f252, f244; +sub.f32 f254, f244, f252; +mul.f32 f255, f196, 0f3E63DC87; +sub.f32 f256, f183, f255; +mul.f32 f257, f198, 0f3F66A5E5; +sub.f32 f258, f256, f257; +fma.rn.f32 f259, f200, 0f3F1F9D07, f258; +mul.f32 f260, f245, 0f3F7994E0; +mul.f32 f261, f247, 0f3EDE2602; +sub.f32 f262, f261, f260; +fma.rn.f32 f263, f250, 0f3F48261C, f262; +add.f32 f264, f263, f259; +sub.f32 f265, f259, f263; +mul.f32 f266, f196, 0f3F66A5E5; +sub.f32 f267, f183, f266; +fma.rn.f32 f268, f198, 0f3F1F9D07, f267; +mul.f32 f269, f200, 0f3E63DC87; +sub.f32 f270, f268, f269; +mul.f32 f271, f245, 0f3EDE2602; +mul.f32 f272, f247, 0f3F48261C; +sub.f32 f273, f272, f271; +mul.f32 f274, f250, 0f3F7994E0; +sub.f32 f275, f273, f274; +add.f32 f276, f275, f270; +sub.f32 f277, f270, f275; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f278, f279}, [rd11]; +mul.f32 f282, f278, f215; +mul.f32 f283, f279, f253; +sub.f32 f284, f282, f283; 
+mul.f32 f285, f278, f253; +fma.rn.f32 f286, f279, f215, f285; +mul.f32 f287, f278, f278; +mul.f32 f288, f279, f279; +sub.f32 f289, f287, f288; +mul.f32 f290, f279, f278; +fma.rn.f32 f291, f279, f278, f290; +mul.f32 f292, f289, f226; +mul.f32 f293, f291, f264; +sub.f32 f294, f292, f293; +mul.f32 f295, f289, f264; +fma.rn.f32 f296, f291, f226, f295; +mul.f32 f297, f278, f289; +mul.f32 f298, f279, f291; +sub.f32 f299, f297, f298; +mul.f32 f300, f278, f291; +fma.rn.f32 f301, f279, f289, f300; +mul.f32 f302, f299, f238; +mul.f32 f303, f301, f276; +sub.f32 f304, f302, f303; +mul.f32 f305, f299, f276; +fma.rn.f32 f306, f301, f238, f305; +mul.f32 f307, f278, f299; +mul.f32 f308, f279, f301; +sub.f32 f309, f307, f308; +mul.f32 f310, f278, f301; +fma.rn.f32 f311, f279, f299, f310; +mul.f32 f312, f309, f239; +mul.f32 f313, f311, f277; +sub.f32 f314, f312, f313; +mul.f32 f315, f309, f277; +fma.rn.f32 f316, f311, f239, f315; +mul.f32 f317, f278, f309; +mul.f32 f318, f279, f311; +sub.f32 f319, f317, f318; +mul.f32 f320, f278, f311; +fma.rn.f32 f321, f279, f309, f320; +mul.f32 f322, f319, f227; +mul.f32 f323, f321, f265; +sub.f32 f324, f322, f323; +mul.f32 f325, f319, f265; +fma.rn.f32 f326, f321, f227, f325; +mul.f32 f327, f278, f319; +mul.f32 f328, f279, f321; +sub.f32 f329, f327, f328; +mul.f32 f330, f278, f321; +fma.rn.f32 f331, f279, f319, f330; +mul.f32 f332, f329, f216; +mul.f32 f333, f331, f254; +sub.f32 f334, f332, f333; +mul.f32 f335, f329, f254; +fma.rn.f32 f336, f331, f216, f335; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 196, r19; +st.shared.f32 [r20], f195; +st.shared.f32 [r20+28], f284; +st.shared.f32 [r20+56], f294; +st.shared.f32 [r20+84], f304; +st.shared.f32 [r20+112], f314; +st.shared.f32 [r20+140], f324; +st.shared.f32 [r20+168], f334; +barrier.sync 0; +ld.shared.f32 f337, [r10]; +ld.shared.f32 f338, [r10+1372]; +ld.shared.f32 f339, [r10+2744]; +ld.shared.f32 f340, [r10+4116]; +ld.shared.f32 f341, [r10+5488]; 
+ld.shared.f32 f342, [r10+6860]; +ld.shared.f32 f343, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r20], f201; +st.shared.f32 [r20+28], f286; +st.shared.f32 [r20+56], f296; +st.shared.f32 [r20+84], f306; +st.shared.f32 [r20+112], f316; +st.shared.f32 [r20+140], f326; +st.shared.f32 [r20+168], f336; +barrier.sync 0; +ld.shared.f32 f344, [r10]; +ld.shared.f32 f345, [r10+1372]; +ld.shared.f32 f346, [r10+2744]; +ld.shared.f32 f347, [r10+4116]; +ld.shared.f32 f348, [r10+5488]; +ld.shared.f32 f349, [r10+6860]; +ld.shared.f32 f350, [r10+8232]; +add.f32 f351, f338, f343; +add.f32 f352, f337, f351; +add.f32 f353, f339, f342; +add.f32 f354, f353, f352; +add.f32 f355, f340, f341; +add.f32 f356, f355, f354; +add.f32 f357, f345, f350; +add.f32 f358, f344, f357; +add.f32 f359, f346, f349; +add.f32 f360, f359, f358; +add.f32 f361, f347, f348; +add.f32 f362, f361, f360; +fma.rn.f32 f363, f351, 0f3F1F9D07, f337; +mul.f32 f364, f353, 0f3E63DC87; +sub.f32 f365, f363, f364; +mul.f32 f366, f355, 0f3F66A5E5; +sub.f32 f367, f365, f366; +sub.f32 f368, f345, f350; +mul.f32 f369, f368, 0f3F48261C; +sub.f32 f370, f346, f349; +mul.f32 f371, f370, 0fBF7994E0; +sub.f32 f372, f371, f369; +sub.f32 f373, f347, f348; +mul.f32 f374, f373, 0f3EDE2602; +sub.f32 f375, f372, f374; +sub.f32 f376, f367, f375; +add.f32 f377, f375, f367; +mul.f32 f378, f351, 0f3E63DC87; +sub.f32 f379, f337, f378; +mul.f32 f380, f353, 0f3F66A5E5; +sub.f32 f381, f379, f380; +fma.rn.f32 f382, f355, 0f3F1F9D07, f381; +mul.f32 f383, f368, 0f3F7994E0; +mul.f32 f384, f370, 0f3EDE2602; +sub.f32 f385, f384, f383; +fma.rn.f32 f386, f373, 0f3F48261C, f385; +sub.f32 f387, f382, f386; +add.f32 f388, f386, f382; +mul.f32 f389, f351, 0f3F66A5E5; +sub.f32 f390, f337, f389; +fma.rn.f32 f391, f353, 0f3F1F9D07, f390; +mul.f32 f392, f355, 0f3E63DC87; +sub.f32 f393, f391, f392; +mul.f32 f394, f368, 0f3EDE2602; +mul.f32 f395, f370, 0f3F48261C; +sub.f32 f396, f395, f394; +mul.f32 f397, f373, 0f3F7994E0; +sub.f32 f398, f396, f397; +sub.f32 f399, 
f393, f398; +add.f32 f400, f398, f393; +fma.rn.f32 f401, f357, 0f3F1F9D07, f344; +mul.f32 f402, f359, 0f3E63DC87; +sub.f32 f403, f401, f402; +mul.f32 f404, f361, 0f3F66A5E5; +sub.f32 f405, f403, f404; +sub.f32 f406, f338, f343; +mul.f32 f407, f406, 0f3F48261C; +sub.f32 f408, f339, f342; +mul.f32 f409, f408, 0fBF7994E0; +sub.f32 f410, f409, f407; +sub.f32 f411, f340, f341; +mul.f32 f412, f411, 0f3EDE2602; +sub.f32 f413, f410, f412; +add.f32 f414, f413, f405; +sub.f32 f415, f405, f413; +mul.f32 f416, f357, 0f3E63DC87; +sub.f32 f417, f344, f416; +mul.f32 f418, f359, 0f3F66A5E5; +sub.f32 f419, f417, f418; +fma.rn.f32 f420, f361, 0f3F1F9D07, f419; +mul.f32 f421, f406, 0f3F7994E0; +mul.f32 f422, f408, 0f3EDE2602; +sub.f32 f423, f422, f421; +fma.rn.f32 f424, f411, 0f3F48261C, f423; +add.f32 f425, f424, f420; +sub.f32 f426, f420, f424; +mul.f32 f427, f357, 0f3F66A5E5; +sub.f32 f428, f344, f427; +fma.rn.f32 f429, f359, 0f3F1F9D07, f428; +mul.f32 f430, f361, 0f3E63DC87; +sub.f32 f431, f429, f430; +mul.f32 f432, f406, 0f3EDE2602; +mul.f32 f433, f408, 0f3F48261C; +sub.f32 f434, f433, f432; +mul.f32 f435, f411, 0f3F7994E0; +sub.f32 f436, f434, f435; +add.f32 f437, f436, f431; +sub.f32 f438, f431, f436; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 8; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f439, f440}, [rd16]; +mul.f32 f443, f439, f376; +mul.f32 f444, f440, f414; +sub.f32 f445, f443, f444; +mul.f32 f446, f439, f414; +fma.rn.f32 f447, f440, f376, f446; +mul.f32 f448, f439, f439; +mul.f32 f449, f440, f440; +sub.f32 f450, f448, f449; +mul.f32 f451, f440, f439; +fma.rn.f32 f452, f440, f439, f451; +mul.f32 f453, f450, f387; +mul.f32 f454, f452, f425; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, f425; +fma.rn.f32 f457, f452, f387, f456; +mul.f32 f458, f439, f450; +mul.f32 f459, f440, f452; +sub.f32 f460, f458, f459; +mul.f32 f461, f439, f452; 
+fma.rn.f32 f462, f440, f450, f461; +mul.f32 f463, f460, f399; +mul.f32 f464, f462, f437; +sub.f32 f465, f463, f464; +mul.f32 f466, f460, f437; +fma.rn.f32 f467, f462, f399, f466; +mul.f32 f468, f439, f460; +mul.f32 f469, f440, f462; +sub.f32 f470, f468, f469; +mul.f32 f471, f439, f462; +fma.rn.f32 f472, f440, f460, f471; +mul.f32 f473, f470, f400; +mul.f32 f474, f472, f438; +sub.f32 f475, f473, f474; +mul.f32 f476, f470, f438; +fma.rn.f32 f477, f472, f400, f476; +mul.f32 f478, f439, f470; +mul.f32 f479, f440, f472; +sub.f32 f480, f478, f479; +mul.f32 f481, f439, f472; +fma.rn.f32 f482, f440, f470, f481; +mul.f32 f483, f480, f388; +mul.f32 f484, f482, f426; +sub.f32 f485, f483, f484; +mul.f32 f486, f480, f426; +fma.rn.f32 f487, f482, f388, f486; +mul.f32 f488, f439, f480; +mul.f32 f489, f440, f482; +sub.f32 f490, f488, f489; +mul.f32 f491, f439, f482; +fma.rn.f32 f492, f440, f480, f491; +mul.f32 f493, f490, f377; +mul.f32 f494, f492, f415; +sub.f32 f495, f493, f494; +mul.f32 f496, f490, f415; +fma.rn.f32 f497, f492, f377, f496; +shl.b32 r24, r23, 2; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 1372, r25; +st.shared.f32 [r26], f356; +st.shared.f32 [r26+196], f445; +st.shared.f32 [r26+392], f455; +st.shared.f32 [r26+588], f465; +st.shared.f32 [r26+784], f475; +st.shared.f32 [r26+980], f485; +st.shared.f32 [r26+1176], f495; +barrier.sync 0; +ld.shared.f32 f498, [r10]; +ld.shared.f32 f499, [r10+1372]; +ld.shared.f32 f500, [r10+2744]; +ld.shared.f32 f501, [r10+4116]; +ld.shared.f32 f502, [r10+5488]; +ld.shared.f32 f503, [r10+6860]; +ld.shared.f32 f504, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r26], f362; +st.shared.f32 [r26+196], f447; +st.shared.f32 [r26+392], f457; +st.shared.f32 [r26+588], f467; +st.shared.f32 [r26+784], f477; +st.shared.f32 [r26+980], f487; +st.shared.f32 [r26+1176], f497; +barrier.sync 0; +ld.shared.f32 f505, [r10]; +ld.shared.f32 f506, [r10+1372]; +ld.shared.f32 f507, [r10+2744]; +ld.shared.f32 f508, [r10+4116]; +ld.shared.f32 
f509, [r10+5488]; +ld.shared.f32 f510, [r10+6860]; +ld.shared.f32 f511, [r10+8232]; +add.f32 f512, f499, f504; +add.f32 f513, f498, f512; +add.f32 f514, f500, f503; +add.f32 f515, f514, f513; +add.f32 f516, f501, f502; +add.f32 f517, f506, f511; +add.f32 f518, f505, f517; +add.f32 f519, f507, f510; +add.f32 f520, f519, f518; +add.f32 f521, f508, f509; +fma.rn.f32 f522, f512, 0f3F1F9D07, f498; +mul.f32 f523, f514, 0f3E63DC87; +sub.f32 f524, f522, f523; +mul.f32 f525, f516, 0f3F66A5E5; +sub.f32 f526, f524, f525; +sub.f32 f527, f506, f511; +mul.f32 f528, f527, 0f3F48261C; +sub.f32 f529, f507, f510; +mul.f32 f530, f529, 0fBF7994E0; +sub.f32 f531, f530, f528; +sub.f32 f532, f508, f509; +mul.f32 f533, f532, 0f3EDE2602; +sub.f32 f534, f531, f533; +mul.f32 f535, f512, 0f3E63DC87; +sub.f32 f536, f498, f535; +mul.f32 f537, f514, 0f3F66A5E5; +sub.f32 f538, f536, f537; +fma.rn.f32 f539, f516, 0f3F1F9D07, f538; +mul.f32 f540, f527, 0f3F7994E0; +mul.f32 f541, f529, 0f3EDE2602; +sub.f32 f542, f541, f540; +fma.rn.f32 f543, f532, 0f3F48261C, f542; +mul.f32 f544, f512, 0f3F66A5E5; +sub.f32 f545, f498, f544; +fma.rn.f32 f546, f514, 0f3F1F9D07, f545; +mul.f32 f547, f516, 0f3E63DC87; +sub.f32 f548, f546, f547; +mul.f32 f549, f527, 0f3EDE2602; +mul.f32 f550, f529, 0f3F48261C; +sub.f32 f551, f550, f549; +mul.f32 f552, f532, 0f3F7994E0; +sub.f32 f553, f551, f552; +fma.rn.f32 f554, f517, 0f3F1F9D07, f505; +mul.f32 f555, f519, 0f3E63DC87; +sub.f32 f556, f554, f555; +mul.f32 f557, f521, 0f3F66A5E5; +sub.f32 f558, f556, f557; +sub.f32 f559, f499, f504; +mul.f32 f560, f559, 0f3F48261C; +sub.f32 f561, f500, f503; +mul.f32 f562, f561, 0fBF7994E0; +sub.f32 f563, f562, f560; +sub.f32 f564, f501, f502; +mul.f32 f565, f564, 0f3EDE2602; +sub.f32 f566, f563, f565; +mul.f32 f567, f517, 0f3E63DC87; +sub.f32 f568, f505, f567; +mul.f32 f569, f519, 0f3F66A5E5; +sub.f32 f570, f568, f569; +fma.rn.f32 f571, f521, 0f3F1F9D07, f570; +mul.f32 f572, f559, 0f3F7994E0; +mul.f32 f573, f561, 0f3EDE2602; +sub.f32 
f574, f573, f572; +fma.rn.f32 f575, f564, 0f3F48261C, f574; +mul.f32 f576, f517, 0f3F66A5E5; +sub.f32 f577, f505, f576; +fma.rn.f32 f578, f519, 0f3F1F9D07, f577; +mul.f32 f579, f521, 0f3E63DC87; +sub.f32 f580, f578, f579; +mul.f32 f581, f559, 0f3EDE2602; +mul.f32 f582, f561, 0f3F48261C; +sub.f32 f583, f582, f581; +mul.f32 f584, f564, 0f3F7994E0; +sub.f32 f585, f583, f584; +add.f32 %0, f516, f515; +add.f32 %1, f521, f520; +add.f32 %3, f566, f558; +sub.f32 %2, f526, f534; +sub.f32 %4, f539, f543; +add.f32 %5, f575, f571; +sub.f32 %6, f548, f553; +add.f32 %7, f585, f580; +add.f32 %8, f553, f548; +sub.f32 %9, f580, f585; +add.f32 %10, f543, f539; +sub.f32 %11, f571, f575; +sub.f32 %13, f558, f566; +add.f32 %12, f534, f526; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..d7709ce9cd7fe --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp32_inv.hpp.inc @@ -0,0 +1,1264 @@ +#ifndef CUFFTDX_FFT_2401_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_2401_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<378, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<626>; +.reg .b32 r<27>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 19208, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %20, %34; +add.f32 f30, %18, f29; +add.f32 f31, %23, %31; +add.f32 f32, f31, f30; +add.f32 f33, %26, %28; +add.f32 f34, %22, %35; +add.f32 f35, %19, f34; +add.f32 f36, %25, %33; +add.f32 f37, f36, f35; +add.f32 f38, %27, %30; +fma.rn.f32 f39, f29, 0f3F1F9D07, %18; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %22, %35; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %25, %33; +fma.rn.f32 f47, f46, 0f3F7994E0, f45; +sub.f32 f48, %27, %30; +fma.rn.f32 f49, f48, 0f3EDE2602, f47; +sub.f32 f50, f43, f49; +add.f32 f51, f49, f43; +mul.f32 f52, f29, 0f3E63DC87; +sub.f32 f53, %18, f52; +mul.f32 f54, f31, 0f3F66A5E5; +sub.f32 f55, f53, f54; +fma.rn.f32 f56, f33, 0f3F1F9D07, f55; +mul.f32 f57, f44, 0f3F7994E0; +mul.f32 f58, f46, 0f3EDE2602; +sub.f32 f59, f57, f58; +mul.f32 f60, f48, 0f3F48261C; +sub.f32 f61, f59, f60; +sub.f32 f62, f56, f61; +add.f32 f63, f61, f56; +mul.f32 f64, f29, 0f3F66A5E5; +sub.f32 f65, %18, f64; +fma.rn.f32 f66, f31, 0f3F1F9D07, f65; +mul.f32 f67, f33, 0f3E63DC87; +sub.f32 f68, f66, f67; +mul.f32 f69, f44, 0f3EDE2602; +mul.f32 f70, f46, 0f3F48261C; +sub.f32 f71, f69, f70; +fma.rn.f32 f72, f48, 0f3F7994E0, f71; +sub.f32 f73, f68, f72; +add.f32 f74, f72, f68; +fma.rn.f32 f75, f34, 0f3F1F9D07, %19; +mul.f32 f76, f36, 0f3E63DC87; +sub.f32 f77, f75, f76; +mul.f32 f78, f38, 0f3F66A5E5; +sub.f32 f79, f77, f78; +sub.f32 f80, %20, %34; +mul.f32 f81, f80, 0f3F48261C; +sub.f32 f82, %23, %31; +fma.rn.f32 f83, f82, 0f3F7994E0, f81; +sub.f32 f84, %26, %28; +fma.rn.f32 f85, f84, 0f3EDE2602, f83; +add.f32 f86, f85, f79; +sub.f32 f87, f79, f85; +mul.f32 f88, f34, 0f3E63DC87; +sub.f32 f89, %19, f88; +mul.f32 f90, f36, 0f3F66A5E5; 
+sub.f32 f91, f89, f90; +fma.rn.f32 f92, f38, 0f3F1F9D07, f91; +mul.f32 f93, f80, 0f3F7994E0; +mul.f32 f94, f82, 0f3EDE2602; +sub.f32 f95, f93, f94; +mul.f32 f96, f84, 0f3F48261C; +sub.f32 f97, f95, f96; +add.f32 f98, f97, f92; +sub.f32 f99, f92, f97; +mul.f32 f100, f34, 0f3F66A5E5; +sub.f32 f101, %19, f100; +fma.rn.f32 f102, f36, 0f3F1F9D07, f101; +mul.f32 f103, f38, 0f3E63DC87; +sub.f32 f104, f102, f103; +mul.f32 f105, f80, 0f3EDE2602; +mul.f32 f106, f82, 0f3F48261C; +sub.f32 f107, f105, f106; +fma.rn.f32 f108, f84, 0f3F7994E0, f107; +add.f32 f109, f108, f104; +sub.f32 f110, f104, f108; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 19208, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f111, f112}, [rd6]; +mul.f32 f115, f86, f112; +mul.f32 f116, f50, f112; +mul.f32 f117, f111, f86; +mul.f32 f118, f111, f111; +mul.f32 f119, f112, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f112, f111; +fma.rn.f32 f122, f112, f111, f121; +mul.f32 f123, f98, f122; +mul.f32 f124, f62, f122; +mul.f32 f125, f120, f98; +mul.f32 f126, f111, f120; +mul.f32 f127, f112, f122; +sub.f32 f128, f126, f127; +mul.f32 f129, f111, f122; +fma.rn.f32 f130, f112, f120, f129; +mul.f32 f131, f109, f130; +mul.f32 f132, f73, f130; +mul.f32 f133, f128, f109; +mul.f32 f134, f111, f128; +mul.f32 f135, f112, f130; +sub.f32 f136, f134, f135; +mul.f32 f137, f111, f130; +fma.rn.f32 f138, f112, f128, f137; +mul.f32 f139, f110, f138; +mul.f32 f140, f74, f138; +mul.f32 f141, f136, f110; +mul.f32 f142, f111, f136; +mul.f32 f143, f112, f138; +sub.f32 f144, f142, f143; +mul.f32 f145, f111, f138; +fma.rn.f32 f146, f112, f136, f145; +mul.f32 f147, f99, f146; +mul.f32 f148, f63, f146; +mul.f32 f149, f144, f99; +mul.f32 f150, f111, f144; +mul.f32 f151, f112, f146; +sub.f32 f152, f150, f151; +mul.f32 f153, f111, f146; +fma.rn.f32 f154, f112, f144, f153; +mul.f32 f155, f87, 
f154; +mul.f32 f156, f51, f154; +mul.f32 f157, f152, f87; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +add.f32 f158, f38, f37; +add.f32 f159, f33, f32; +st.shared.v2.f32 [r9], {f159, f158}; +fma.rn.f32 f160, f111, f50, f115; +sub.f32 f161, f117, f116; +st.shared.v2.f32 [r9+8], {f160, f161}; +fma.rn.f32 f162, f120, f62, f123; +sub.f32 f163, f125, f124; +st.shared.v2.f32 [r9+16], {f162, f163}; +sub.f32 f164, f133, f132; +fma.rn.f32 f165, f128, f73, f131; +st.shared.v2.f32 [r9+24], {f165, f164}; +fma.rn.f32 f166, f136, f74, f139; +sub.f32 f167, f141, f140; +st.shared.v2.f32 [r9+32], {f166, f167}; +fma.rn.f32 f168, f144, f63, f147; +sub.f32 f169, f149, f148; +st.shared.v2.f32 [r9+40], {f168, f169}; +fma.rn.f32 f170, f152, f51, f155; +sub.f32 f171, f157, f156; +st.shared.v2.f32 [r9+48], {f170, f171}; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.v2.f32 {f172, f173}, [r10]; +ld.shared.v2.f32 {f176, f177}, [r10+2744]; +ld.shared.v2.f32 {f180, f181}, [r10+5488]; +ld.shared.v2.f32 {f184, f185}, [r10+8232]; +ld.shared.v2.f32 {f188, f189}, [r10+10976]; +ld.shared.v2.f32 {f192, f193}, [r10+13720]; +ld.shared.v2.f32 {f196, f197}, [r10+16464]; +add.f32 f200, f176, f196; +add.f32 f201, f172, f200; +add.f32 f202, f180, f192; +add.f32 f203, f202, f201; +add.f32 f204, f184, f188; +add.f32 f205, f177, f197; +add.f32 f206, f173, f205; +add.f32 f207, f181, f193; +add.f32 f208, f207, f206; +add.f32 f209, f185, f189; +fma.rn.f32 f210, f200, 0f3F1F9D07, f172; +mul.f32 f211, f202, 0f3E63DC87; +sub.f32 f212, f210, f211; +mul.f32 f213, f204, 0f3F66A5E5; +sub.f32 f214, f212, f213; +sub.f32 f215, f177, f197; +mul.f32 f216, f215, 0f3F48261C; +sub.f32 f217, f181, f193; +fma.rn.f32 f218, f217, 0f3F7994E0, f216; +sub.f32 f219, f185, f189; +fma.rn.f32 f220, f219, 0f3EDE2602, f218; +sub.f32 f221, f214, f220; +add.f32 f222, f220, f214; +mul.f32 f223, f200, 0f3E63DC87; +sub.f32 f224, f172, f223; +mul.f32 f225, f202, 0f3F66A5E5; +sub.f32 f226, f224, f225; +fma.rn.f32 f227, f204, 
0f3F1F9D07, f226; +mul.f32 f228, f215, 0f3F7994E0; +mul.f32 f229, f217, 0f3EDE2602; +sub.f32 f230, f228, f229; +mul.f32 f231, f219, 0f3F48261C; +sub.f32 f232, f230, f231; +sub.f32 f233, f227, f232; +add.f32 f234, f232, f227; +mul.f32 f235, f200, 0f3F66A5E5; +sub.f32 f236, f172, f235; +fma.rn.f32 f237, f202, 0f3F1F9D07, f236; +mul.f32 f238, f204, 0f3E63DC87; +sub.f32 f239, f237, f238; +mul.f32 f240, f215, 0f3EDE2602; +mul.f32 f241, f217, 0f3F48261C; +sub.f32 f242, f240, f241; +fma.rn.f32 f243, f219, 0f3F7994E0, f242; +sub.f32 f244, f239, f243; +add.f32 f245, f243, f239; +fma.rn.f32 f246, f205, 0f3F1F9D07, f173; +mul.f32 f247, f207, 0f3E63DC87; +sub.f32 f248, f246, f247; +mul.f32 f249, f209, 0f3F66A5E5; +sub.f32 f250, f248, f249; +sub.f32 f251, f176, f196; +mul.f32 f252, f251, 0f3F48261C; +sub.f32 f253, f180, f192; +fma.rn.f32 f254, f253, 0f3F7994E0, f252; +sub.f32 f255, f184, f188; +fma.rn.f32 f256, f255, 0f3EDE2602, f254; +add.f32 f257, f256, f250; +sub.f32 f258, f250, f256; +mul.f32 f259, f205, 0f3E63DC87; +sub.f32 f260, f173, f259; +mul.f32 f261, f207, 0f3F66A5E5; +sub.f32 f262, f260, f261; +fma.rn.f32 f263, f209, 0f3F1F9D07, f262; +mul.f32 f264, f251, 0f3F7994E0; +mul.f32 f265, f253, 0f3EDE2602; +sub.f32 f266, f264, f265; +mul.f32 f267, f255, 0f3F48261C; +sub.f32 f268, f266, f267; +add.f32 f269, f268, f263; +sub.f32 f270, f263, f268; +mul.f32 f271, f205, 0f3F66A5E5; +sub.f32 f272, f173, f271; +fma.rn.f32 f273, f207, 0f3F1F9D07, f272; +mul.f32 f274, f209, 0f3E63DC87; +sub.f32 f275, f273, f274; +mul.f32 f276, f251, 0f3EDE2602; +mul.f32 f277, f253, 0f3F48261C; +sub.f32 f278, f276, f277; +fma.rn.f32 f279, f255, 0f3F7994E0, f278; +add.f32 f280, f279, f275; +sub.f32 f281, f275, f279; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, 
rd9; +ld.global.v2.f32 {f282, f283}, [rd11]; +mul.f32 f286, f257, f283; +mul.f32 f287, f221, f283; +mul.f32 f288, f282, f257; +mul.f32 f289, f282, f282; +mul.f32 f290, f283, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f283, f282; +fma.rn.f32 f293, f283, f282, f292; +mul.f32 f294, f269, f293; +mul.f32 f295, f233, f293; +mul.f32 f296, f291, f269; +mul.f32 f297, f282, f291; +mul.f32 f298, f283, f293; +sub.f32 f299, f297, f298; +mul.f32 f300, f282, f293; +fma.rn.f32 f301, f283, f291, f300; +mul.f32 f302, f280, f301; +mul.f32 f303, f244, f301; +mul.f32 f304, f299, f280; +mul.f32 f305, f282, f299; +mul.f32 f306, f283, f301; +sub.f32 f307, f305, f306; +mul.f32 f308, f282, f301; +fma.rn.f32 f309, f283, f299, f308; +mul.f32 f310, f281, f309; +mul.f32 f311, f245, f309; +mul.f32 f312, f307, f281; +mul.f32 f313, f282, f307; +mul.f32 f314, f283, f309; +sub.f32 f315, f313, f314; +mul.f32 f316, f282, f309; +fma.rn.f32 f317, f283, f307, f316; +mul.f32 f318, f270, f317; +mul.f32 f319, f234, f317; +mul.f32 f320, f315, f270; +mul.f32 f321, f282, f315; +mul.f32 f322, f283, f317; +sub.f32 f323, f321, f322; +mul.f32 f324, f282, f317; +fma.rn.f32 f325, f283, f315, f324; +mul.f32 f326, f258, f325; +mul.f32 f327, f222, f325; +mul.f32 f328, f323, f258; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +add.f32 f329, f209, f208; +add.f32 f330, f204, f203; +st.shared.v2.f32 [r20], {f330, f329}; +fma.rn.f32 f331, f282, f221, f286; +sub.f32 f332, f288, f287; +st.shared.v2.f32 [r20+56], {f331, f332}; +fma.rn.f32 f333, f291, f233, f294; +sub.f32 f334, f296, f295; +st.shared.v2.f32 [r20+112], {f333, f334}; +fma.rn.f32 f335, f299, f244, f302; +sub.f32 f336, f304, f303; +st.shared.v2.f32 [r20+168], {f335, f336}; +sub.f32 f337, f312, f311; +fma.rn.f32 f338, f307, f245, f310; +st.shared.v2.f32 [r20+224], {f338, f337}; +fma.rn.f32 f339, f315, f234, f318; +sub.f32 f340, f320, f319; +st.shared.v2.f32 [r20+280], {f339, f340}; +fma.rn.f32 f341, f323, f222, 
f326; +sub.f32 f342, f328, f327; +st.shared.v2.f32 [r20+336], {f341, f342}; +barrier.sync 0; +ld.shared.v2.f32 {f343, f344}, [r10]; +ld.shared.v2.f32 {f347, f348}, [r10+2744]; +ld.shared.v2.f32 {f351, f352}, [r10+5488]; +ld.shared.v2.f32 {f355, f356}, [r10+8232]; +ld.shared.v2.f32 {f359, f360}, [r10+10976]; +ld.shared.v2.f32 {f363, f364}, [r10+13720]; +ld.shared.v2.f32 {f367, f368}, [r10+16464]; +add.f32 f371, f347, f367; +add.f32 f372, f343, f371; +add.f32 f373, f351, f363; +add.f32 f374, f373, f372; +add.f32 f375, f355, f359; +add.f32 f376, f348, f368; +add.f32 f377, f344, f376; +add.f32 f378, f352, f364; +add.f32 f379, f378, f377; +add.f32 f380, f356, f360; +fma.rn.f32 f381, f371, 0f3F1F9D07, f343; +mul.f32 f382, f373, 0f3E63DC87; +sub.f32 f383, f381, f382; +mul.f32 f384, f375, 0f3F66A5E5; +sub.f32 f385, f383, f384; +sub.f32 f386, f348, f368; +mul.f32 f387, f386, 0f3F48261C; +sub.f32 f388, f352, f364; +fma.rn.f32 f389, f388, 0f3F7994E0, f387; +sub.f32 f390, f356, f360; +fma.rn.f32 f391, f390, 0f3EDE2602, f389; +sub.f32 f392, f385, f391; +add.f32 f393, f391, f385; +mul.f32 f394, f371, 0f3E63DC87; +sub.f32 f395, f343, f394; +mul.f32 f396, f373, 0f3F66A5E5; +sub.f32 f397, f395, f396; +fma.rn.f32 f398, f375, 0f3F1F9D07, f397; +mul.f32 f399, f386, 0f3F7994E0; +mul.f32 f400, f388, 0f3EDE2602; +sub.f32 f401, f399, f400; +mul.f32 f402, f390, 0f3F48261C; +sub.f32 f403, f401, f402; +sub.f32 f404, f398, f403; +add.f32 f405, f403, f398; +mul.f32 f406, f371, 0f3F66A5E5; +sub.f32 f407, f343, f406; +fma.rn.f32 f408, f373, 0f3F1F9D07, f407; +mul.f32 f409, f375, 0f3E63DC87; +sub.f32 f410, f408, f409; +mul.f32 f411, f386, 0f3EDE2602; +mul.f32 f412, f388, 0f3F48261C; +sub.f32 f413, f411, f412; +fma.rn.f32 f414, f390, 0f3F7994E0, f413; +sub.f32 f415, f410, f414; +add.f32 f416, f414, f410; +fma.rn.f32 f417, f376, 0f3F1F9D07, f344; +mul.f32 f418, f378, 0f3E63DC87; +sub.f32 f419, f417, f418; +mul.f32 f420, f380, 0f3F66A5E5; +sub.f32 f421, f419, f420; +sub.f32 f422, f347, f367; 
+mul.f32 f423, f422, 0f3F48261C; +sub.f32 f424, f351, f363; +fma.rn.f32 f425, f424, 0f3F7994E0, f423; +sub.f32 f426, f355, f359; +fma.rn.f32 f427, f426, 0f3EDE2602, f425; +add.f32 f428, f427, f421; +sub.f32 f429, f421, f427; +mul.f32 f430, f376, 0f3E63DC87; +sub.f32 f431, f344, f430; +mul.f32 f432, f378, 0f3F66A5E5; +sub.f32 f433, f431, f432; +fma.rn.f32 f434, f380, 0f3F1F9D07, f433; +mul.f32 f435, f422, 0f3F7994E0; +mul.f32 f436, f424, 0f3EDE2602; +sub.f32 f437, f435, f436; +mul.f32 f438, f426, 0f3F48261C; +sub.f32 f439, f437, f438; +add.f32 f440, f439, f434; +sub.f32 f441, f434, f439; +mul.f32 f442, f376, 0f3F66A5E5; +sub.f32 f443, f344, f442; +fma.rn.f32 f444, f378, 0f3F1F9D07, f443; +mul.f32 f445, f380, 0f3E63DC87; +sub.f32 f446, f444, f445; +mul.f32 f447, f422, 0f3EDE2602; +mul.f32 f448, f424, 0f3F48261C; +sub.f32 f449, f447, f448; +fma.rn.f32 f450, f426, 0f3F7994E0, f449; +add.f32 f451, f450, f446; +sub.f32 f452, f446, f450; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 8; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f453, f454}, [rd16]; +mul.f32 f457, f428, f454; +mul.f32 f458, f392, f454; +mul.f32 f459, f453, f428; +mul.f32 f460, f453, f453; +mul.f32 f461, f454, f454; +sub.f32 f462, f460, f461; +mul.f32 f463, f454, f453; +fma.rn.f32 f464, f454, f453, f463; +mul.f32 f465, f440, f464; +mul.f32 f466, f404, f464; +mul.f32 f467, f462, f440; +mul.f32 f468, f453, f462; +mul.f32 f469, f454, f464; +sub.f32 f470, f468, f469; +mul.f32 f471, f453, f464; +fma.rn.f32 f472, f454, f462, f471; +mul.f32 f473, f451, f472; +mul.f32 f474, f415, f472; +mul.f32 f475, f470, f451; +mul.f32 f476, f453, f470; +mul.f32 f477, f454, f472; +sub.f32 f478, f476, f477; +mul.f32 f479, f453, f472; +fma.rn.f32 f480, f454, f470, f479; +mul.f32 f481, f452, f480; +mul.f32 f482, f416, f480; +mul.f32 f483, f478, f452; +mul.f32 f484, f453, f478; +mul.f32 f485, f454, 
f480; +sub.f32 f486, f484, f485; +mul.f32 f487, f453, f480; +fma.rn.f32 f488, f454, f478, f487; +mul.f32 f489, f441, f488; +mul.f32 f490, f405, f488; +mul.f32 f491, f486, f441; +mul.f32 f492, f453, f486; +mul.f32 f493, f454, f488; +sub.f32 f494, f492, f493; +mul.f32 f495, f453, f488; +fma.rn.f32 f496, f454, f486, f495; +mul.f32 f497, f429, f496; +mul.f32 f498, f393, f496; +mul.f32 f499, f494, f429; +shl.b32 r24, r23, 3; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 2744, r25; +add.f32 f500, f380, f379; +add.f32 f501, f375, f374; +st.shared.v2.f32 [r26], {f501, f500}; +fma.rn.f32 f502, f453, f392, f457; +sub.f32 f503, f459, f458; +st.shared.v2.f32 [r26+392], {f502, f503}; +fma.rn.f32 f504, f462, f404, f465; +sub.f32 f505, f467, f466; +st.shared.v2.f32 [r26+784], {f504, f505}; +fma.rn.f32 f506, f470, f415, f473; +sub.f32 f507, f475, f474; +st.shared.v2.f32 [r26+1176], {f506, f507}; +fma.rn.f32 f508, f478, f416, f481; +sub.f32 f509, f483, f482; +st.shared.v2.f32 [r26+1568], {f508, f509}; +sub.f32 f510, f491, f490; +fma.rn.f32 f511, f486, f405, f489; +st.shared.v2.f32 [r26+1960], {f511, f510}; +fma.rn.f32 f512, f494, f393, f497; +sub.f32 f513, f499, f498; +st.shared.v2.f32 [r26+2352], {f512, f513}; +barrier.sync 0; +ld.shared.v2.f32 {f514, f515}, [r10]; +ld.shared.v2.f32 {f518, f519}, [r10+2744]; +ld.shared.v2.f32 {f522, f523}, [r10+5488]; +ld.shared.v2.f32 {f526, f527}, [r10+8232]; +ld.shared.v2.f32 {f530, f531}, [r10+10976]; +ld.shared.v2.f32 {f534, f535}, [r10+13720]; +ld.shared.v2.f32 {f538, f539}, [r10+16464]; +add.f32 f542, f518, f538; +add.f32 f543, f514, f542; +add.f32 f544, f522, f534; +add.f32 f545, f544, f543; +add.f32 f546, f526, f530; +add.f32 f547, f519, f539; +add.f32 f548, f515, f547; +add.f32 f549, f523, f535; +add.f32 f550, f549, f548; +add.f32 f551, f527, f531; +fma.rn.f32 f552, f542, 0f3F1F9D07, f514; +mul.f32 f553, f544, 0f3E63DC87; +sub.f32 f554, f552, f553; +mul.f32 f555, f546, 0f3F66A5E5; +sub.f32 f556, f554, f555; +sub.f32 f557, 
f519, f539; +mul.f32 f558, f557, 0f3F48261C; +sub.f32 f559, f523, f535; +fma.rn.f32 f560, f559, 0f3F7994E0, f558; +sub.f32 f561, f527, f531; +fma.rn.f32 f562, f561, 0f3EDE2602, f560; +mul.f32 f563, f542, 0f3E63DC87; +sub.f32 f564, f514, f563; +mul.f32 f565, f544, 0f3F66A5E5; +sub.f32 f566, f564, f565; +fma.rn.f32 f567, f546, 0f3F1F9D07, f566; +mul.f32 f568, f557, 0f3F7994E0; +mul.f32 f569, f559, 0f3EDE2602; +sub.f32 f570, f568, f569; +mul.f32 f571, f561, 0f3F48261C; +sub.f32 f572, f570, f571; +mul.f32 f573, f542, 0f3F66A5E5; +sub.f32 f574, f514, f573; +fma.rn.f32 f575, f544, 0f3F1F9D07, f574; +mul.f32 f576, f546, 0f3E63DC87; +sub.f32 f577, f575, f576; +mul.f32 f578, f557, 0f3EDE2602; +mul.f32 f579, f559, 0f3F48261C; +sub.f32 f580, f578, f579; +fma.rn.f32 f581, f561, 0f3F7994E0, f580; +fma.rn.f32 f582, f547, 0f3F1F9D07, f515; +mul.f32 f583, f549, 0f3E63DC87; +sub.f32 f584, f582, f583; +mul.f32 f585, f551, 0f3F66A5E5; +sub.f32 f586, f584, f585; +sub.f32 f587, f518, f538; +mul.f32 f588, f587, 0f3F48261C; +sub.f32 f589, f522, f534; +fma.rn.f32 f590, f589, 0f3F7994E0, f588; +sub.f32 f591, f526, f530; +fma.rn.f32 f592, f591, 0f3EDE2602, f590; +mul.f32 f593, f547, 0f3E63DC87; +sub.f32 f594, f515, f593; +mul.f32 f595, f549, 0f3F66A5E5; +sub.f32 f596, f594, f595; +fma.rn.f32 f597, f551, 0f3F1F9D07, f596; +mul.f32 f598, f587, 0f3F7994E0; +mul.f32 f599, f589, 0f3EDE2602; +sub.f32 f600, f598, f599; +mul.f32 f601, f591, 0f3F48261C; +sub.f32 f602, f600, f601; +mul.f32 f603, f547, 0f3F66A5E5; +sub.f32 f604, f515, f603; +fma.rn.f32 f605, f549, 0f3F1F9D07, f604; +mul.f32 f606, f551, 0f3E63DC87; +sub.f32 f607, f605, f606; +mul.f32 f608, f587, 0f3EDE2602; +mul.f32 f609, f589, 0f3F48261C; +sub.f32 f610, f608, f609; +fma.rn.f32 f611, f591, 0f3F7994E0, f610; +add.f32 %1, f551, f550; +add.f32 %0, f546, f545; +add.f32 %3, f592, f586; +sub.f32 %2, f556, f562; +add.f32 %5, f602, f597; +sub.f32 %4, f567, f572; +add.f32 %7, f611, f607; +sub.f32 %6, f577, f581; +sub.f32 %9, f607, f611; 
+add.f32 %8, f581, f577; +sub.f32 %11, f597, f602; +add.f32 %10, f572, f567; +sub.f32 %13, f586, f592; +add.f32 %12, f562, f556; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<379, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<584>; +.reg .b32 r<27>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 9604, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %20, %34; +add.f32 f30, %18, f29; +add.f32 f31, %23, %31; +add.f32 f32, f31, f30; +add.f32 f33, %26, %28; +add.f32 f34, f33, f32; +add.f32 f35, %22, %35; +add.f32 f36, %19, f35; +add.f32 f37, %25, %33; +add.f32 f38, f37, f36; +add.f32 f39, %27, %30; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %18; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %22, %35; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %25, %33; +fma.rn.f32 f49, f48, 0f3F7994E0, f47; +sub.f32 f50, %27, %30; +fma.rn.f32 f51, f50, 0f3EDE2602, f49; +sub.f32 f52, f45, f51; +add.f32 f53, f51, f45; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %18, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f46, 0f3F7994E0; +mul.f32 f60, f48, 0f3EDE2602; +sub.f32 f61, f59, f60; +mul.f32 f62, f50, 0f3F48261C; +sub.f32 f63, f61, 
f62; +sub.f32 f64, f58, f63; +add.f32 f65, f63, f58; +mul.f32 f66, f29, 0f3F66A5E5; +sub.f32 f67, %18, f66; +fma.rn.f32 f68, f31, 0f3F1F9D07, f67; +mul.f32 f69, f33, 0f3E63DC87; +sub.f32 f70, f68, f69; +mul.f32 f71, f46, 0f3EDE2602; +mul.f32 f72, f48, 0f3F48261C; +sub.f32 f73, f71, f72; +fma.rn.f32 f74, f50, 0f3F7994E0, f73; +sub.f32 f75, f70, f74; +add.f32 f76, f74, f70; +fma.rn.f32 f77, f35, 0f3F1F9D07, %19; +mul.f32 f78, f37, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f39, 0f3F66A5E5; +sub.f32 f81, f79, f80; +sub.f32 f82, %20, %34; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %23, %31; +fma.rn.f32 f85, f84, 0f3F7994E0, f83; +sub.f32 f86, %26, %28; +fma.rn.f32 f87, f86, 0f3EDE2602, f85; +add.f32 f88, f87, f81; +sub.f32 f89, f81, f87; +mul.f32 f90, f35, 0f3E63DC87; +sub.f32 f91, %19, f90; +mul.f32 f92, f37, 0f3F66A5E5; +sub.f32 f93, f91, f92; +fma.rn.f32 f94, f39, 0f3F1F9D07, f93; +mul.f32 f95, f82, 0f3F7994E0; +mul.f32 f96, f84, 0f3EDE2602; +sub.f32 f97, f95, f96; +mul.f32 f98, f86, 0f3F48261C; +sub.f32 f99, f97, f98; +add.f32 f100, f99, f94; +sub.f32 f101, f94, f99; +mul.f32 f102, f35, 0f3F66A5E5; +sub.f32 f103, %19, f102; +fma.rn.f32 f104, f37, 0f3F1F9D07, f103; +mul.f32 f105, f39, 0f3E63DC87; +sub.f32 f106, f104, f105; +mul.f32 f107, f82, 0f3EDE2602; +mul.f32 f108, f84, 0f3F48261C; +sub.f32 f109, f107, f108; +fma.rn.f32 f110, f86, 0f3F7994E0, f109; +add.f32 f111, f110, f106; +sub.f32 f112, f106, f110; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f113, f114}, [rd6]; +mul.f32 f117, f88, f114; +fma.rn.f32 f118, f113, f52, f117; +mul.f32 f119, f52, f114; +mul.f32 f120, f113, f88; +sub.f32 f121, f120, f119; +mul.f32 f122, f113, f113; +mul.f32 f123, f114, f114; +sub.f32 f124, f122, f123; +mul.f32 f125, f114, f113; +fma.rn.f32 f126, f114, f113, f125; +mul.f32 f127, f100, f126; +fma.rn.f32 
f128, f124, f64, f127; +mul.f32 f129, f64, f126; +mul.f32 f130, f124, f100; +sub.f32 f131, f130, f129; +mul.f32 f132, f113, f124; +mul.f32 f133, f114, f126; +sub.f32 f134, f132, f133; +mul.f32 f135, f113, f126; +fma.rn.f32 f136, f114, f124, f135; +mul.f32 f137, f111, f136; +fma.rn.f32 f138, f134, f75, f137; +mul.f32 f139, f75, f136; +mul.f32 f140, f134, f111; +sub.f32 f141, f140, f139; +mul.f32 f142, f113, f134; +mul.f32 f143, f114, f136; +sub.f32 f144, f142, f143; +mul.f32 f145, f113, f136; +fma.rn.f32 f146, f114, f134, f145; +mul.f32 f147, f112, f146; +fma.rn.f32 f148, f144, f76, f147; +mul.f32 f149, f76, f146; +mul.f32 f150, f144, f112; +sub.f32 f151, f150, f149; +mul.f32 f152, f113, f144; +mul.f32 f153, f114, f146; +sub.f32 f154, f152, f153; +mul.f32 f155, f113, f146; +fma.rn.f32 f156, f114, f144, f155; +mul.f32 f157, f101, f156; +fma.rn.f32 f158, f154, f65, f157; +mul.f32 f159, f65, f156; +mul.f32 f160, f154, f101; +sub.f32 f161, f160, f159; +mul.f32 f162, f113, f154; +mul.f32 f163, f114, f156; +sub.f32 f164, f162, f163; +mul.f32 f165, f113, f156; +fma.rn.f32 f166, f114, f154, f165; +mul.f32 f167, f89, f166; +fma.rn.f32 f168, f164, f53, f167; +mul.f32 f169, f53, f166; +mul.f32 f170, f164, f89; +sub.f32 f171, f170, f169; +mad.lo.s32 r8, r5, 9604, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 28, r8; +st.shared.f32 [r9], f34; +st.shared.f32 [r9+4], f118; +st.shared.f32 [r9+8], f128; +st.shared.f32 [r9+12], f138; +st.shared.f32 [r9+16], f148; +st.shared.f32 [r9+20], f158; +st.shared.f32 [r9+24], f168; +barrier.sync 0; +mad.lo.s32 r10, r7, -24, r9; +ld.shared.f32 f172, [r10]; +ld.shared.f32 f173, [r10+1372]; +ld.shared.f32 f174, [r10+2744]; +ld.shared.f32 f175, [r10+4116]; +ld.shared.f32 f176, [r10+5488]; +ld.shared.f32 f177, [r10+6860]; +ld.shared.f32 f178, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r9], f40; +st.shared.f32 [r9+4], f121; +st.shared.f32 [r9+8], f131; +st.shared.f32 [r9+12], f141; +st.shared.f32 [r9+16], f151; +st.shared.f32 [r9+20], f161; 
+st.shared.f32 [r9+24], f171; +barrier.sync 0; +ld.shared.f32 f179, [r10]; +ld.shared.f32 f180, [r10+1372]; +ld.shared.f32 f181, [r10+2744]; +ld.shared.f32 f182, [r10+4116]; +ld.shared.f32 f183, [r10+5488]; +ld.shared.f32 f184, [r10+6860]; +ld.shared.f32 f185, [r10+8232]; +add.f32 f186, f173, f178; +add.f32 f187, f172, f186; +add.f32 f188, f174, f177; +add.f32 f189, f188, f187; +add.f32 f190, f175, f176; +add.f32 f191, f190, f189; +add.f32 f192, f180, f185; +add.f32 f193, f179, f192; +add.f32 f194, f181, f184; +add.f32 f195, f194, f193; +add.f32 f196, f182, f183; +add.f32 f197, f196, f195; +fma.rn.f32 f198, f186, 0f3F1F9D07, f172; +mul.f32 f199, f188, 0f3E63DC87; +sub.f32 f200, f198, f199; +mul.f32 f201, f190, 0f3F66A5E5; +sub.f32 f202, f200, f201; +sub.f32 f203, f180, f185; +mul.f32 f204, f203, 0f3F48261C; +sub.f32 f205, f181, f184; +fma.rn.f32 f206, f205, 0f3F7994E0, f204; +sub.f32 f207, f182, f183; +fma.rn.f32 f208, f207, 0f3EDE2602, f206; +sub.f32 f209, f202, f208; +add.f32 f210, f208, f202; +mul.f32 f211, f186, 0f3E63DC87; +sub.f32 f212, f172, f211; +mul.f32 f213, f188, 0f3F66A5E5; +sub.f32 f214, f212, f213; +fma.rn.f32 f215, f190, 0f3F1F9D07, f214; +mul.f32 f216, f203, 0f3F7994E0; +mul.f32 f217, f205, 0f3EDE2602; +sub.f32 f218, f216, f217; +mul.f32 f219, f207, 0f3F48261C; +sub.f32 f220, f218, f219; +sub.f32 f221, f215, f220; +add.f32 f222, f220, f215; +mul.f32 f223, f186, 0f3F66A5E5; +sub.f32 f224, f172, f223; +fma.rn.f32 f225, f188, 0f3F1F9D07, f224; +mul.f32 f226, f190, 0f3E63DC87; +sub.f32 f227, f225, f226; +mul.f32 f228, f203, 0f3EDE2602; +mul.f32 f229, f205, 0f3F48261C; +sub.f32 f230, f228, f229; +fma.rn.f32 f231, f207, 0f3F7994E0, f230; +sub.f32 f232, f227, f231; +add.f32 f233, f231, f227; +fma.rn.f32 f234, f192, 0f3F1F9D07, f179; +mul.f32 f235, f194, 0f3E63DC87; +sub.f32 f236, f234, f235; +mul.f32 f237, f196, 0f3F66A5E5; +sub.f32 f238, f236, f237; +sub.f32 f239, f173, f178; +mul.f32 f240, f239, 0f3F48261C; +sub.f32 f241, f174, f177; +fma.rn.f32 f242, 
f241, 0f3F7994E0, f240; +sub.f32 f243, f175, f176; +fma.rn.f32 f244, f243, 0f3EDE2602, f242; +add.f32 f245, f244, f238; +sub.f32 f246, f238, f244; +mul.f32 f247, f192, 0f3E63DC87; +sub.f32 f248, f179, f247; +mul.f32 f249, f194, 0f3F66A5E5; +sub.f32 f250, f248, f249; +fma.rn.f32 f251, f196, 0f3F1F9D07, f250; +mul.f32 f252, f239, 0f3F7994E0; +mul.f32 f253, f241, 0f3EDE2602; +sub.f32 f254, f252, f253; +mul.f32 f255, f243, 0f3F48261C; +sub.f32 f256, f254, f255; +add.f32 f257, f256, f251; +sub.f32 f258, f251, f256; +mul.f32 f259, f192, 0f3F66A5E5; +sub.f32 f260, f179, f259; +fma.rn.f32 f261, f194, 0f3F1F9D07, f260; +mul.f32 f262, f196, 0f3E63DC87; +sub.f32 f263, f261, f262; +mul.f32 f264, f239, 0f3EDE2602; +mul.f32 f265, f241, 0f3F48261C; +sub.f32 f266, f264, f265; +fma.rn.f32 f267, f243, 0f3F7994E0, f266; +add.f32 f268, f267, f263; +sub.f32 f269, f263, f267; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f270, f271}, [rd11]; +mul.f32 f274, f245, f271; +fma.rn.f32 f275, f270, f209, f274; +mul.f32 f276, f209, f271; +mul.f32 f277, f270, f245; +sub.f32 f278, f277, f276; +mul.f32 f279, f270, f270; +mul.f32 f280, f271, f271; +sub.f32 f281, f279, f280; +mul.f32 f282, f271, f270; +fma.rn.f32 f283, f271, f270, f282; +mul.f32 f284, f257, f283; +fma.rn.f32 f285, f281, f221, f284; +mul.f32 f286, f221, f283; +mul.f32 f287, f281, f257; +sub.f32 f288, f287, f286; +mul.f32 f289, f270, f281; +mul.f32 f290, f271, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f270, f283; +fma.rn.f32 f293, f271, f281, f292; +mul.f32 f294, f268, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f268; +sub.f32 f298, f297, f296; +mul.f32 f299, f270, f291; +mul.f32 f300, f271, f293; +sub.f32 f301, f299, f300; 
+mul.f32 f302, f270, f293; +fma.rn.f32 f303, f271, f291, f302; +mul.f32 f304, f269, f303; +fma.rn.f32 f305, f301, f233, f304; +mul.f32 f306, f233, f303; +mul.f32 f307, f301, f269; +sub.f32 f308, f307, f306; +mul.f32 f309, f270, f301; +mul.f32 f310, f271, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f270, f303; +fma.rn.f32 f313, f271, f301, f312; +mul.f32 f314, f258, f313; +fma.rn.f32 f315, f311, f222, f314; +mul.f32 f316, f222, f313; +mul.f32 f317, f311, f258; +sub.f32 f318, f317, f316; +mul.f32 f319, f270, f311; +mul.f32 f320, f271, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f270, f313; +fma.rn.f32 f323, f271, f311, f322; +mul.f32 f324, f246, f323; +fma.rn.f32 f325, f321, f210, f324; +mul.f32 f326, f210, f323; +mul.f32 f327, f321, f246; +sub.f32 f328, f327, f326; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 196, r19; +st.shared.f32 [r20], f191; +st.shared.f32 [r20+28], f275; +st.shared.f32 [r20+56], f285; +st.shared.f32 [r20+84], f295; +st.shared.f32 [r20+112], f305; +st.shared.f32 [r20+140], f315; +st.shared.f32 [r20+168], f325; +barrier.sync 0; +ld.shared.f32 f329, [r10]; +ld.shared.f32 f330, [r10+1372]; +ld.shared.f32 f331, [r10+2744]; +ld.shared.f32 f332, [r10+4116]; +ld.shared.f32 f333, [r10+5488]; +ld.shared.f32 f334, [r10+6860]; +ld.shared.f32 f335, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r20], f197; +st.shared.f32 [r20+28], f278; +st.shared.f32 [r20+56], f288; +st.shared.f32 [r20+84], f298; +st.shared.f32 [r20+112], f308; +st.shared.f32 [r20+140], f318; +st.shared.f32 [r20+168], f328; +barrier.sync 0; +ld.shared.f32 f336, [r10]; +ld.shared.f32 f337, [r10+1372]; +ld.shared.f32 f338, [r10+2744]; +ld.shared.f32 f339, [r10+4116]; +ld.shared.f32 f340, [r10+5488]; +ld.shared.f32 f341, [r10+6860]; +ld.shared.f32 f342, [r10+8232]; +add.f32 f343, f330, f335; +add.f32 f344, f329, f343; +add.f32 f345, f331, f334; +add.f32 f346, f345, f344; +add.f32 f347, f332, f333; +add.f32 f348, f347, f346; +add.f32 f349, f337, 
f342; +add.f32 f350, f336, f349; +add.f32 f351, f338, f341; +add.f32 f352, f351, f350; +add.f32 f353, f339, f340; +add.f32 f354, f353, f352; +fma.rn.f32 f355, f343, 0f3F1F9D07, f329; +mul.f32 f356, f345, 0f3E63DC87; +sub.f32 f357, f355, f356; +mul.f32 f358, f347, 0f3F66A5E5; +sub.f32 f359, f357, f358; +sub.f32 f360, f337, f342; +mul.f32 f361, f360, 0f3F48261C; +sub.f32 f362, f338, f341; +fma.rn.f32 f363, f362, 0f3F7994E0, f361; +sub.f32 f364, f339, f340; +fma.rn.f32 f365, f364, 0f3EDE2602, f363; +sub.f32 f366, f359, f365; +add.f32 f367, f365, f359; +mul.f32 f368, f343, 0f3E63DC87; +sub.f32 f369, f329, f368; +mul.f32 f370, f345, 0f3F66A5E5; +sub.f32 f371, f369, f370; +fma.rn.f32 f372, f347, 0f3F1F9D07, f371; +mul.f32 f373, f360, 0f3F7994E0; +mul.f32 f374, f362, 0f3EDE2602; +sub.f32 f375, f373, f374; +mul.f32 f376, f364, 0f3F48261C; +sub.f32 f377, f375, f376; +sub.f32 f378, f372, f377; +add.f32 f379, f377, f372; +mul.f32 f380, f343, 0f3F66A5E5; +sub.f32 f381, f329, f380; +fma.rn.f32 f382, f345, 0f3F1F9D07, f381; +mul.f32 f383, f347, 0f3E63DC87; +sub.f32 f384, f382, f383; +mul.f32 f385, f360, 0f3EDE2602; +mul.f32 f386, f362, 0f3F48261C; +sub.f32 f387, f385, f386; +fma.rn.f32 f388, f364, 0f3F7994E0, f387; +sub.f32 f389, f384, f388; +add.f32 f390, f388, f384; +fma.rn.f32 f391, f349, 0f3F1F9D07, f336; +mul.f32 f392, f351, 0f3E63DC87; +sub.f32 f393, f391, f392; +mul.f32 f394, f353, 0f3F66A5E5; +sub.f32 f395, f393, f394; +sub.f32 f396, f330, f335; +mul.f32 f397, f396, 0f3F48261C; +sub.f32 f398, f331, f334; +fma.rn.f32 f399, f398, 0f3F7994E0, f397; +sub.f32 f400, f332, f333; +fma.rn.f32 f401, f400, 0f3EDE2602, f399; +add.f32 f402, f401, f395; +sub.f32 f403, f395, f401; +mul.f32 f404, f349, 0f3E63DC87; +sub.f32 f405, f336, f404; +mul.f32 f406, f351, 0f3F66A5E5; +sub.f32 f407, f405, f406; +fma.rn.f32 f408, f353, 0f3F1F9D07, f407; +mul.f32 f409, f396, 0f3F7994E0; +mul.f32 f410, f398, 0f3EDE2602; +sub.f32 f411, f409, f410; +mul.f32 f412, f400, 0f3F48261C; +sub.f32 f413, f411, 
f412; +add.f32 f414, f413, f408; +sub.f32 f415, f408, f413; +mul.f32 f416, f349, 0f3F66A5E5; +sub.f32 f417, f336, f416; +fma.rn.f32 f418, f351, 0f3F1F9D07, f417; +mul.f32 f419, f353, 0f3E63DC87; +sub.f32 f420, f418, f419; +mul.f32 f421, f396, 0f3EDE2602; +mul.f32 f422, f398, 0f3F48261C; +sub.f32 f423, f421, f422; +fma.rn.f32 f424, f400, 0f3F7994E0, f423; +add.f32 f425, f424, f420; +sub.f32 f426, f420, f424; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 8; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f427, f428}, [rd16]; +mul.f32 f431, f402, f428; +fma.rn.f32 f432, f427, f366, f431; +mul.f32 f433, f366, f428; +mul.f32 f434, f427, f402; +sub.f32 f435, f434, f433; +mul.f32 f436, f427, f427; +mul.f32 f437, f428, f428; +sub.f32 f438, f436, f437; +mul.f32 f439, f428, f427; +fma.rn.f32 f440, f428, f427, f439; +mul.f32 f441, f414, f440; +fma.rn.f32 f442, f438, f378, f441; +mul.f32 f443, f378, f440; +mul.f32 f444, f438, f414; +sub.f32 f445, f444, f443; +mul.f32 f446, f427, f438; +mul.f32 f447, f428, f440; +sub.f32 f448, f446, f447; +mul.f32 f449, f427, f440; +fma.rn.f32 f450, f428, f438, f449; +mul.f32 f451, f425, f450; +fma.rn.f32 f452, f448, f389, f451; +mul.f32 f453, f389, f450; +mul.f32 f454, f448, f425; +sub.f32 f455, f454, f453; +mul.f32 f456, f427, f448; +mul.f32 f457, f428, f450; +sub.f32 f458, f456, f457; +mul.f32 f459, f427, f450; +fma.rn.f32 f460, f428, f448, f459; +mul.f32 f461, f426, f460; +fma.rn.f32 f462, f458, f390, f461; +mul.f32 f463, f390, f460; +mul.f32 f464, f458, f426; +sub.f32 f465, f464, f463; +mul.f32 f466, f427, f458; +mul.f32 f467, f428, f460; +sub.f32 f468, f466, f467; +mul.f32 f469, f427, f460; +fma.rn.f32 f470, f428, f458, f469; +mul.f32 f471, f415, f470; +fma.rn.f32 f472, f468, f379, f471; +mul.f32 f473, f379, f470; +mul.f32 f474, f468, f415; +sub.f32 f475, f474, f473; +mul.f32 f476, f427, f468; +mul.f32 f477, 
f428, f470; +sub.f32 f478, f476, f477; +mul.f32 f479, f427, f470; +fma.rn.f32 f480, f428, f468, f479; +mul.f32 f481, f403, f480; +fma.rn.f32 f482, f478, f367, f481; +mul.f32 f483, f367, f480; +mul.f32 f484, f478, f403; +sub.f32 f485, f484, f483; +shl.b32 r24, r23, 2; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 1372, r25; +st.shared.f32 [r26], f348; +st.shared.f32 [r26+196], f432; +st.shared.f32 [r26+392], f442; +st.shared.f32 [r26+588], f452; +st.shared.f32 [r26+784], f462; +st.shared.f32 [r26+980], f472; +st.shared.f32 [r26+1176], f482; +barrier.sync 0; +ld.shared.f32 f486, [r10]; +ld.shared.f32 f487, [r10+1372]; +ld.shared.f32 f488, [r10+2744]; +ld.shared.f32 f489, [r10+4116]; +ld.shared.f32 f490, [r10+5488]; +ld.shared.f32 f491, [r10+6860]; +ld.shared.f32 f492, [r10+8232]; +barrier.sync 0; +st.shared.f32 [r26], f354; +st.shared.f32 [r26+196], f435; +st.shared.f32 [r26+392], f445; +st.shared.f32 [r26+588], f455; +st.shared.f32 [r26+784], f465; +st.shared.f32 [r26+980], f475; +st.shared.f32 [r26+1176], f485; +barrier.sync 0; +ld.shared.f32 f493, [r10]; +ld.shared.f32 f494, [r10+1372]; +ld.shared.f32 f495, [r10+2744]; +ld.shared.f32 f496, [r10+4116]; +ld.shared.f32 f497, [r10+5488]; +ld.shared.f32 f498, [r10+6860]; +ld.shared.f32 f499, [r10+8232]; +add.f32 f500, f487, f492; +add.f32 f501, f486, f500; +add.f32 f502, f488, f491; +add.f32 f503, f502, f501; +add.f32 f504, f489, f490; +add.f32 f505, f494, f499; +add.f32 f506, f493, f505; +add.f32 f507, f495, f498; +add.f32 f508, f507, f506; +add.f32 f509, f496, f497; +fma.rn.f32 f510, f500, 0f3F1F9D07, f486; +mul.f32 f511, f502, 0f3E63DC87; +sub.f32 f512, f510, f511; +mul.f32 f513, f504, 0f3F66A5E5; +sub.f32 f514, f512, f513; +sub.f32 f515, f494, f499; +mul.f32 f516, f515, 0f3F48261C; +sub.f32 f517, f495, f498; +fma.rn.f32 f518, f517, 0f3F7994E0, f516; +sub.f32 f519, f496, f497; +fma.rn.f32 f520, f519, 0f3EDE2602, f518; +mul.f32 f521, f500, 0f3E63DC87; +sub.f32 f522, f486, f521; +mul.f32 f523, f502, 
0f3F66A5E5; +sub.f32 f524, f522, f523; +fma.rn.f32 f525, f504, 0f3F1F9D07, f524; +mul.f32 f526, f515, 0f3F7994E0; +mul.f32 f527, f517, 0f3EDE2602; +sub.f32 f528, f526, f527; +mul.f32 f529, f519, 0f3F48261C; +sub.f32 f530, f528, f529; +mul.f32 f531, f500, 0f3F66A5E5; +sub.f32 f532, f486, f531; +fma.rn.f32 f533, f502, 0f3F1F9D07, f532; +mul.f32 f534, f504, 0f3E63DC87; +sub.f32 f535, f533, f534; +mul.f32 f536, f515, 0f3EDE2602; +mul.f32 f537, f517, 0f3F48261C; +sub.f32 f538, f536, f537; +fma.rn.f32 f539, f519, 0f3F7994E0, f538; +fma.rn.f32 f540, f505, 0f3F1F9D07, f493; +mul.f32 f541, f507, 0f3E63DC87; +sub.f32 f542, f540, f541; +mul.f32 f543, f509, 0f3F66A5E5; +sub.f32 f544, f542, f543; +sub.f32 f545, f487, f492; +mul.f32 f546, f545, 0f3F48261C; +sub.f32 f547, f488, f491; +fma.rn.f32 f548, f547, 0f3F7994E0, f546; +sub.f32 f549, f489, f490; +fma.rn.f32 f550, f549, 0f3EDE2602, f548; +mul.f32 f551, f505, 0f3E63DC87; +sub.f32 f552, f493, f551; +mul.f32 f553, f507, 0f3F66A5E5; +sub.f32 f554, f552, f553; +fma.rn.f32 f555, f509, 0f3F1F9D07, f554; +mul.f32 f556, f545, 0f3F7994E0; +mul.f32 f557, f547, 0f3EDE2602; +sub.f32 f558, f556, f557; +mul.f32 f559, f549, 0f3F48261C; +sub.f32 f560, f558, f559; +mul.f32 f561, f505, 0f3F66A5E5; +sub.f32 f562, f493, f561; +fma.rn.f32 f563, f507, 0f3F1F9D07, f562; +mul.f32 f564, f509, 0f3E63DC87; +sub.f32 f565, f563, f564; +mul.f32 f566, f545, 0f3EDE2602; +mul.f32 f567, f547, 0f3F48261C; +sub.f32 f568, f566, f567; +fma.rn.f32 f569, f549, 0f3F7994E0, f568; +add.f32 %0, f504, f503; +add.f32 %1, f509, f508; +add.f32 %3, f550, f544; +sub.f32 %2, f514, f520; +sub.f32 %4, f525, f530; +add.f32 %5, f560, f555; +sub.f32 %6, f535, f539; +add.f32 %7, f569, f565; +add.f32 %8, f539, f535; +sub.f32 %9, f565, f569; +add.f32 %10, f530, f525; +sub.f32 %11, f555, f560; +sub.f32 %13, f544, f550; +add.f32 %12, f520, f514; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), 
"=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_2401), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..206371a653328 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_fwd.hpp.inc @@ -0,0 +1,1272 @@ +#ifndef CUFFTDX_FFT_2401_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_2401_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<551, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<639>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 38416, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %20, %34; +add.f64 fd30, %18, fd29; +add.f64 fd31, %23, %31; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %26, %28; +add.f64 fd34, %22, %35; +add.f64 fd35, %19, fd34; +add.f64 fd36, %25, %33; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %27, %30; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %22, %35; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %25, %33; +mul.f64 fd47, fd46, 0dBFEF329C0558E969; +sub.f64 fd48, fd47, fd45; +sub.f64 fd49, %27, 
%30; +mul.f64 fd50, fd49, 0d3FDBC4C04D71ABC1; +sub.f64 fd51, fd48, fd50; +sub.f64 fd52, fd43, fd51; +add.f64 fd53, fd51, fd43; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %18, fd54; +mul.f64 fd56, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd44, 0d3FEF329C0558E969; +mul.f64 fd60, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd60, fd59; +fma.rn.f64 fd62, fd49, 0d3FE904C37505DE4B, fd61; +sub.f64 fd63, fd58, fd62; +add.f64 fd64, fd62, fd58; +mul.f64 fd65, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd66, %18, fd65; +fma.rn.f64 fd67, fd31, 0d3FE3F3A0E28BEDD1, fd66; +mul.f64 fd68, fd33, 0d3FCC7B90E3024582; +sub.f64 fd69, fd67, fd68; +mul.f64 fd70, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd71, fd46, 0d3FE904C37505DE4B; +sub.f64 fd72, fd71, fd70; +mul.f64 fd73, fd49, 0d3FEF329C0558E969; +sub.f64 fd74, fd72, fd73; +sub.f64 fd75, fd69, fd74; +add.f64 fd76, fd74, fd69; +fma.rn.f64 fd77, fd34, 0d3FE3F3A0E28BEDD1, %19; +mul.f64 fd78, fd36, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %20, %34; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %23, %31; +mul.f64 fd85, fd84, 0dBFEF329C0558E969; +sub.f64 fd86, fd85, fd83; +sub.f64 fd87, %26, %28; +mul.f64 fd88, fd87, 0d3FDBC4C04D71ABC1; +sub.f64 fd89, fd86, fd88; +add.f64 fd90, fd89, fd81; +sub.f64 fd91, fd81, fd89; +mul.f64 fd92, fd34, 0d3FCC7B90E3024582; +sub.f64 fd93, %19, fd92; +mul.f64 fd94, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd95, fd93, fd94; +fma.rn.f64 fd96, fd38, 0d3FE3F3A0E28BEDD1, fd95; +mul.f64 fd97, fd82, 0d3FEF329C0558E969; +mul.f64 fd98, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd99, fd98, fd97; +fma.rn.f64 fd100, fd87, 0d3FE904C37505DE4B, fd99; +add.f64 fd101, fd100, fd96; +sub.f64 fd102, fd96, fd100; +mul.f64 fd103, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd104, %19, fd103; +fma.rn.f64 fd105, fd36, 0d3FE3F3A0E28BEDD1, fd104; +mul.f64 fd106, fd38, 0d3FCC7B90E3024582; 
+sub.f64 fd107, fd105, fd106; +mul.f64 fd108, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd109, fd84, 0d3FE904C37505DE4B; +sub.f64 fd110, fd109, fd108; +mul.f64 fd111, fd87, 0d3FEF329C0558E969; +sub.f64 fd112, fd110, fd111; +add.f64 fd113, fd112, fd107; +sub.f64 fd114, fd107, fd112; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 38416, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd115, fd116}, [rd6]; +mul.f64 fd119, fd115, fd52; +mul.f64 fd120, fd116, fd90; +mul.f64 fd121, fd115, fd90; +mul.f64 fd122, fd115, fd115; +mul.f64 fd123, fd116, fd116; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd116, fd115; +fma.rn.f64 fd126, fd116, fd115, fd125; +mul.f64 fd127, fd124, fd63; +mul.f64 fd128, fd126, fd101; +mul.f64 fd129, fd124, fd101; +mul.f64 fd130, fd115, fd124; +mul.f64 fd131, fd116, fd126; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd115, fd126; +fma.rn.f64 fd134, fd116, fd124, fd133; +mul.f64 fd135, fd132, fd75; +mul.f64 fd136, fd134, fd113; +mul.f64 fd137, fd132, fd113; +ld.global.v2.f64 {fd138, fd139}, [rd6+5488]; +mul.f64 fd142, fd138, fd76; +mul.f64 fd143, fd139, fd114; +mul.f64 fd144, fd138, fd114; +mul.f64 fd145, fd115, fd138; +mul.f64 fd146, fd116, fd139; +sub.f64 fd147, fd145, fd146; +mul.f64 fd148, fd115, fd139; +fma.rn.f64 fd149, fd116, fd138, fd148; +mul.f64 fd150, fd147, fd64; +mul.f64 fd151, fd149, fd102; +mul.f64 fd152, fd147, fd102; +mul.f64 fd153, fd115, fd147; +mul.f64 fd154, fd116, fd149; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd115, fd149; +fma.rn.f64 fd157, fd116, fd147, fd156; +mul.f64 fd158, fd155, fd53; +mul.f64 fd159, fd157, fd91; +mul.f64 fd160, fd155, fd91; +barrier.sync 0; +mad.lo.s32 r9, r7, 112, r8; +add.f64 fd161, fd38, fd37; +add.f64 fd162, fd33, fd32; +st.shared.v2.f64 [r9], {fd162, fd161}; +fma.rn.f64 fd163, fd116, fd52, fd121; +sub.f64 fd164, fd119, fd120; +st.shared.v2.f64 [r9+16], 
{fd164, fd163}; +fma.rn.f64 fd165, fd126, fd63, fd129; +sub.f64 fd166, fd127, fd128; +st.shared.v2.f64 [r9+32], {fd166, fd165}; +sub.f64 fd167, fd135, fd136; +fma.rn.f64 fd168, fd134, fd75, fd137; +st.shared.v2.f64 [r9+48], {fd167, fd168}; +fma.rn.f64 fd169, fd139, fd76, fd144; +sub.f64 fd170, fd142, fd143; +st.shared.v2.f64 [r9+64], {fd170, fd169}; +fma.rn.f64 fd171, fd149, fd64, fd152; +sub.f64 fd172, fd150, fd151; +st.shared.v2.f64 [r9+80], {fd172, fd171}; +sub.f64 fd173, fd158, fd159; +fma.rn.f64 fd174, fd157, fd53, fd160; +st.shared.v2.f64 [r9+96], {fd173, fd174}; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.v2.f64 {fd175, fd176}, [r10]; +ld.shared.v2.f64 {fd179, fd180}, [r10+5488]; +ld.shared.v2.f64 {fd183, fd184}, [r10+10976]; +ld.shared.v2.f64 {fd187, fd188}, [r10+16464]; +ld.shared.v2.f64 {fd191, fd192}, [r10+21952]; +ld.shared.v2.f64 {fd195, fd196}, [r10+27440]; +ld.shared.v2.f64 {fd199, fd200}, [r10+32928]; +add.f64 fd203, fd179, fd199; +add.f64 fd204, fd175, fd203; +add.f64 fd205, fd183, fd195; +add.f64 fd206, fd205, fd204; +add.f64 fd207, fd187, fd191; +add.f64 fd208, fd180, fd200; +add.f64 fd209, fd176, fd208; +add.f64 fd210, fd184, fd196; +add.f64 fd211, fd210, fd209; +add.f64 fd212, fd188, fd192; +fma.rn.f64 fd213, fd203, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd214, fd205, 0d3FCC7B90E3024582; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd207, 0d3FECD4BCA9CB5C71; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, fd180, fd200; +mul.f64 fd219, fd218, 0d3FE904C37505DE4B; +sub.f64 fd220, fd184, fd196; +mul.f64 fd221, fd220, 0dBFEF329C0558E969; +sub.f64 fd222, fd221, fd219; +sub.f64 fd223, fd188, fd192; +mul.f64 fd224, fd223, 0d3FDBC4C04D71ABC1; +sub.f64 fd225, fd222, fd224; +sub.f64 fd226, fd217, fd225; +add.f64 fd227, fd225, fd217; +mul.f64 fd228, fd203, 0d3FCC7B90E3024582; +sub.f64 fd229, fd175, fd228; +mul.f64 fd230, fd205, 0d3FECD4BCA9CB5C71; +sub.f64 fd231, fd229, fd230; +fma.rn.f64 fd232, fd207, 0d3FE3F3A0E28BEDD1, fd231; +mul.f64 fd233, 
fd218, 0d3FEF329C0558E969; +mul.f64 fd234, fd220, 0d3FDBC4C04D71ABC1; +sub.f64 fd235, fd234, fd233; +fma.rn.f64 fd236, fd223, 0d3FE904C37505DE4B, fd235; +sub.f64 fd237, fd232, fd236; +add.f64 fd238, fd236, fd232; +mul.f64 fd239, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd240, fd175, fd239; +fma.rn.f64 fd241, fd205, 0d3FE3F3A0E28BEDD1, fd240; +mul.f64 fd242, fd207, 0d3FCC7B90E3024582; +sub.f64 fd243, fd241, fd242; +mul.f64 fd244, fd218, 0d3FDBC4C04D71ABC1; +mul.f64 fd245, fd220, 0d3FE904C37505DE4B; +sub.f64 fd246, fd245, fd244; +mul.f64 fd247, fd223, 0d3FEF329C0558E969; +sub.f64 fd248, fd246, fd247; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +fma.rn.f64 fd251, fd208, 0d3FE3F3A0E28BEDD1, fd176; +mul.f64 fd252, fd210, 0d3FCC7B90E3024582; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd212, 0d3FECD4BCA9CB5C71; +sub.f64 fd255, fd253, fd254; +sub.f64 fd256, fd179, fd199; +mul.f64 fd257, fd256, 0d3FE904C37505DE4B; +sub.f64 fd258, fd183, fd195; +mul.f64 fd259, fd258, 0dBFEF329C0558E969; +sub.f64 fd260, fd259, fd257; +sub.f64 fd261, fd187, fd191; +mul.f64 fd262, fd261, 0d3FDBC4C04D71ABC1; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd263, fd255; +sub.f64 fd265, fd255, fd263; +mul.f64 fd266, fd208, 0d3FCC7B90E3024582; +sub.f64 fd267, fd176, fd266; +mul.f64 fd268, fd210, 0d3FECD4BCA9CB5C71; +sub.f64 fd269, fd267, fd268; +fma.rn.f64 fd270, fd212, 0d3FE3F3A0E28BEDD1, fd269; +mul.f64 fd271, fd256, 0d3FEF329C0558E969; +mul.f64 fd272, fd258, 0d3FDBC4C04D71ABC1; +sub.f64 fd273, fd272, fd271; +fma.rn.f64 fd274, fd261, 0d3FE904C37505DE4B, fd273; +add.f64 fd275, fd274, fd270; +sub.f64 fd276, fd270, fd274; +mul.f64 fd277, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd278, fd176, fd277; +fma.rn.f64 fd279, fd210, 0d3FE3F3A0E28BEDD1, fd278; +mul.f64 fd280, fd212, 0d3FCC7B90E3024582; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd256, 0d3FDBC4C04D71ABC1; +mul.f64 fd283, fd258, 0d3FE904C37505DE4B; +sub.f64 fd284, fd283, fd282; +mul.f64 fd285, fd261, 0d3FEF329C0558E969; +sub.f64 
fd286, fd284, fd285; +add.f64 fd287, fd286, fd281; +sub.f64 fd288, fd281, fd286; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd289, fd290}, [rd11]; +mul.f64 fd293, fd289, fd226; +mul.f64 fd294, fd290, fd264; +mul.f64 fd295, fd289, fd264; +mul.f64 fd296, fd289, fd289; +mul.f64 fd297, fd290, fd290; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd290, fd289; +fma.rn.f64 fd300, fd290, fd289, fd299; +mul.f64 fd301, fd298, fd237; +mul.f64 fd302, fd300, fd275; +mul.f64 fd303, fd298, fd275; +mul.f64 fd304, fd289, fd298; +mul.f64 fd305, fd290, fd300; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd289, fd300; +fma.rn.f64 fd308, fd290, fd298, fd307; +mul.f64 fd309, fd306, fd249; +mul.f64 fd310, fd308, fd287; +mul.f64 fd311, fd306, fd287; +ld.global.v2.f64 {fd312, fd313}, [rd11+784]; +mul.f64 fd316, fd312, fd250; +mul.f64 fd317, fd313, fd288; +mul.f64 fd318, fd312, fd288; +mul.f64 fd319, fd289, fd312; +mul.f64 fd320, fd290, fd313; +sub.f64 fd321, fd319, fd320; +mul.f64 fd322, fd289, fd313; +fma.rn.f64 fd323, fd290, fd312, fd322; +mul.f64 fd324, fd321, fd238; +mul.f64 fd325, fd323, fd276; +mul.f64 fd326, fd321, fd276; +mul.f64 fd327, fd289, fd321; +mul.f64 fd328, fd290, fd323; +sub.f64 fd329, fd327, fd328; +mul.f64 fd330, fd289, fd323; +fma.rn.f64 fd331, fd290, fd321, fd330; +mul.f64 fd332, fd329, fd227; +mul.f64 fd333, fd331, fd265; +mul.f64 fd334, fd329, fd265; +shl.b32 r18, r17, 4; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 784, r19; +add.f64 fd335, fd212, fd211; +add.f64 fd336, fd207, fd206; +st.shared.v2.f64 [r20], {fd336, fd335}; +fma.rn.f64 fd337, fd290, fd226, fd295; +sub.f64 fd338, fd293, fd294; +st.shared.v2.f64 [r20+112], {fd338, fd337}; +fma.rn.f64 fd339, fd300, fd237, fd303; 
+sub.f64 fd340, fd301, fd302; +st.shared.v2.f64 [r20+224], {fd340, fd339}; +fma.rn.f64 fd341, fd308, fd249, fd311; +sub.f64 fd342, fd309, fd310; +st.shared.v2.f64 [r20+336], {fd342, fd341}; +fma.rn.f64 fd343, fd313, fd250, fd318; +sub.f64 fd344, fd316, fd317; +st.shared.v2.f64 [r20+448], {fd344, fd343}; +fma.rn.f64 fd345, fd323, fd238, fd326; +sub.f64 fd346, fd324, fd325; +st.shared.v2.f64 [r20+560], {fd346, fd345}; +fma.rn.f64 fd347, fd331, fd227, fd334; +sub.f64 fd348, fd332, fd333; +st.shared.v2.f64 [r20+672], {fd348, fd347}; +barrier.sync 0; +ld.shared.v2.f64 {fd349, fd350}, [r10]; +ld.shared.v2.f64 {fd353, fd354}, [r10+5488]; +ld.shared.v2.f64 {fd357, fd358}, [r10+10976]; +ld.shared.v2.f64 {fd361, fd362}, [r10+16464]; +ld.shared.v2.f64 {fd365, fd366}, [r10+21952]; +ld.shared.v2.f64 {fd369, fd370}, [r10+27440]; +ld.shared.v2.f64 {fd373, fd374}, [r10+32928]; +add.f64 fd377, fd353, fd373; +add.f64 fd378, fd349, fd377; +add.f64 fd379, fd357, fd369; +add.f64 fd380, fd379, fd378; +add.f64 fd381, fd361, fd365; +add.f64 fd382, fd354, fd374; +add.f64 fd383, fd350, fd382; +add.f64 fd384, fd358, fd370; +add.f64 fd385, fd384, fd383; +add.f64 fd386, fd362, fd366; +fma.rn.f64 fd387, fd377, 0d3FE3F3A0E28BEDD1, fd349; +mul.f64 fd388, fd379, 0d3FCC7B90E3024582; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd381, 0d3FECD4BCA9CB5C71; +sub.f64 fd391, fd389, fd390; +sub.f64 fd392, fd354, fd374; +mul.f64 fd393, fd392, 0d3FE904C37505DE4B; +sub.f64 fd394, fd358, fd370; +mul.f64 fd395, fd394, 0dBFEF329C0558E969; +sub.f64 fd396, fd395, fd393; +sub.f64 fd397, fd362, fd366; +mul.f64 fd398, fd397, 0d3FDBC4C04D71ABC1; +sub.f64 fd399, fd396, fd398; +sub.f64 fd400, fd391, fd399; +add.f64 fd401, fd399, fd391; +mul.f64 fd402, fd377, 0d3FCC7B90E3024582; +sub.f64 fd403, fd349, fd402; +mul.f64 fd404, fd379, 0d3FECD4BCA9CB5C71; +sub.f64 fd405, fd403, fd404; +fma.rn.f64 fd406, fd381, 0d3FE3F3A0E28BEDD1, fd405; +mul.f64 fd407, fd392, 0d3FEF329C0558E969; +mul.f64 fd408, fd394, 0d3FDBC4C04D71ABC1; 
+sub.f64 fd409, fd408, fd407; +fma.rn.f64 fd410, fd397, 0d3FE904C37505DE4B, fd409; +sub.f64 fd411, fd406, fd410; +add.f64 fd412, fd410, fd406; +mul.f64 fd413, fd377, 0d3FECD4BCA9CB5C71; +sub.f64 fd414, fd349, fd413; +fma.rn.f64 fd415, fd379, 0d3FE3F3A0E28BEDD1, fd414; +mul.f64 fd416, fd381, 0d3FCC7B90E3024582; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd392, 0d3FDBC4C04D71ABC1; +mul.f64 fd419, fd394, 0d3FE904C37505DE4B; +sub.f64 fd420, fd419, fd418; +mul.f64 fd421, fd397, 0d3FEF329C0558E969; +sub.f64 fd422, fd420, fd421; +sub.f64 fd423, fd417, fd422; +add.f64 fd424, fd422, fd417; +fma.rn.f64 fd425, fd382, 0d3FE3F3A0E28BEDD1, fd350; +mul.f64 fd426, fd384, 0d3FCC7B90E3024582; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd386, 0d3FECD4BCA9CB5C71; +sub.f64 fd429, fd427, fd428; +sub.f64 fd430, fd353, fd373; +mul.f64 fd431, fd430, 0d3FE904C37505DE4B; +sub.f64 fd432, fd357, fd369; +mul.f64 fd433, fd432, 0dBFEF329C0558E969; +sub.f64 fd434, fd433, fd431; +sub.f64 fd435, fd361, fd365; +mul.f64 fd436, fd435, 0d3FDBC4C04D71ABC1; +sub.f64 fd437, fd434, fd436; +add.f64 fd438, fd437, fd429; +sub.f64 fd439, fd429, fd437; +mul.f64 fd440, fd382, 0d3FCC7B90E3024582; +sub.f64 fd441, fd350, fd440; +mul.f64 fd442, fd384, 0d3FECD4BCA9CB5C71; +sub.f64 fd443, fd441, fd442; +fma.rn.f64 fd444, fd386, 0d3FE3F3A0E28BEDD1, fd443; +mul.f64 fd445, fd430, 0d3FEF329C0558E969; +mul.f64 fd446, fd432, 0d3FDBC4C04D71ABC1; +sub.f64 fd447, fd446, fd445; +fma.rn.f64 fd448, fd435, 0d3FE904C37505DE4B, fd447; +add.f64 fd449, fd448, fd444; +sub.f64 fd450, fd444, fd448; +mul.f64 fd451, fd382, 0d3FECD4BCA9CB5C71; +sub.f64 fd452, fd350, fd451; +fma.rn.f64 fd453, fd384, 0d3FE3F3A0E28BEDD1, fd452; +mul.f64 fd454, fd386, 0d3FCC7B90E3024582; +sub.f64 fd455, fd453, fd454; +mul.f64 fd456, fd430, 0d3FDBC4C04D71ABC1; +mul.f64 fd457, fd432, 0d3FE904C37505DE4B; +sub.f64 fd458, fd457, fd456; +mul.f64 fd459, fd435, 0d3FEF329C0558E969; +sub.f64 fd460, fd458, fd459; +add.f64 fd461, fd460, fd455; +sub.f64 fd462, fd455, 
fd460; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 16; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd463, fd464}, [rd16]; +mul.f64 fd467, fd463, fd400; +mul.f64 fd468, fd464, fd438; +mul.f64 fd469, fd463, fd438; +mul.f64 fd470, fd463, fd463; +mul.f64 fd471, fd464, fd464; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd464, fd463; +fma.rn.f64 fd474, fd464, fd463, fd473; +mul.f64 fd475, fd472, fd411; +mul.f64 fd476, fd474, fd449; +mul.f64 fd477, fd472, fd449; +mul.f64 fd478, fd463, fd472; +mul.f64 fd479, fd464, fd474; +sub.f64 fd480, fd478, fd479; +mul.f64 fd481, fd463, fd474; +fma.rn.f64 fd482, fd464, fd472, fd481; +mul.f64 fd483, fd480, fd423; +mul.f64 fd484, fd482, fd461; +mul.f64 fd485, fd480, fd461; +ld.global.v2.f64 {fd486, fd487}, [rd16+112]; +mul.f64 fd490, fd486, fd424; +mul.f64 fd491, fd487, fd462; +mul.f64 fd492, fd486, fd462; +mul.f64 fd493, fd463, fd486; +mul.f64 fd494, fd464, fd487; +sub.f64 fd495, fd493, fd494; +mul.f64 fd496, fd463, fd487; +fma.rn.f64 fd497, fd464, fd486, fd496; +mul.f64 fd498, fd495, fd412; +mul.f64 fd499, fd497, fd450; +mul.f64 fd500, fd495, fd450; +mul.f64 fd501, fd463, fd495; +mul.f64 fd502, fd464, fd497; +sub.f64 fd503, fd501, fd502; +mul.f64 fd504, fd463, fd497; +fma.rn.f64 fd505, fd464, fd495, fd504; +mul.f64 fd506, fd503, fd401; +mul.f64 fd507, fd505, fd439; +mul.f64 fd508, fd503, fd439; +shl.b32 r24, r23, 4; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 5488, r25; +add.f64 fd509, fd386, fd385; +add.f64 fd510, fd381, fd380; +st.shared.v2.f64 [r26], {fd510, fd509}; +fma.rn.f64 fd511, fd464, fd400, fd469; +sub.f64 fd512, fd467, fd468; +st.shared.v2.f64 [r26+784], {fd512, fd511}; +fma.rn.f64 fd513, fd474, fd411, fd477; +sub.f64 fd514, fd475, fd476; +st.shared.v2.f64 [r26+1568], {fd514, fd513}; +fma.rn.f64 fd515, fd482, fd423, fd485; +sub.f64 fd516, fd483, fd484; +st.shared.v2.f64 
[r26+2352], {fd516, fd515}; +fma.rn.f64 fd517, fd487, fd424, fd492; +sub.f64 fd518, fd490, fd491; +st.shared.v2.f64 [r26+3136], {fd518, fd517}; +fma.rn.f64 fd519, fd497, fd412, fd500; +sub.f64 fd520, fd498, fd499; +st.shared.v2.f64 [r26+3920], {fd520, fd519}; +fma.rn.f64 fd521, fd505, fd401, fd508; +sub.f64 fd522, fd506, fd507; +st.shared.v2.f64 [r26+4704], {fd522, fd521}; +barrier.sync 0; +ld.shared.v2.f64 {fd523, fd524}, [r10]; +ld.shared.v2.f64 {fd527, fd528}, [r10+5488]; +ld.shared.v2.f64 {fd531, fd532}, [r10+10976]; +ld.shared.v2.f64 {fd535, fd536}, [r10+16464]; +ld.shared.v2.f64 {fd539, fd540}, [r10+21952]; +ld.shared.v2.f64 {fd543, fd544}, [r10+27440]; +ld.shared.v2.f64 {fd547, fd548}, [r10+32928]; +add.f64 fd551, fd527, fd547; +add.f64 fd552, fd523, fd551; +add.f64 fd553, fd531, fd543; +add.f64 fd554, fd553, fd552; +add.f64 fd555, fd535, fd539; +add.f64 fd556, fd528, fd548; +add.f64 fd557, fd524, fd556; +add.f64 fd558, fd532, fd544; +add.f64 fd559, fd558, fd557; +add.f64 fd560, fd536, fd540; +fma.rn.f64 fd561, fd551, 0d3FE3F3A0E28BEDD1, fd523; +mul.f64 fd562, fd553, 0d3FCC7B90E3024582; +sub.f64 fd563, fd561, fd562; +mul.f64 fd564, fd555, 0d3FECD4BCA9CB5C71; +sub.f64 fd565, fd563, fd564; +sub.f64 fd566, fd528, fd548; +mul.f64 fd567, fd566, 0d3FE904C37505DE4B; +sub.f64 fd568, fd532, fd544; +mul.f64 fd569, fd568, 0dBFEF329C0558E969; +sub.f64 fd570, fd569, fd567; +sub.f64 fd571, fd536, fd540; +mul.f64 fd572, fd571, 0d3FDBC4C04D71ABC1; +sub.f64 fd573, fd570, fd572; +mul.f64 fd574, fd551, 0d3FCC7B90E3024582; +sub.f64 fd575, fd523, fd574; +mul.f64 fd576, fd553, 0d3FECD4BCA9CB5C71; +sub.f64 fd577, fd575, fd576; +fma.rn.f64 fd578, fd555, 0d3FE3F3A0E28BEDD1, fd577; +mul.f64 fd579, fd566, 0d3FEF329C0558E969; +mul.f64 fd580, fd568, 0d3FDBC4C04D71ABC1; +sub.f64 fd581, fd580, fd579; +fma.rn.f64 fd582, fd571, 0d3FE904C37505DE4B, fd581; +mul.f64 fd583, fd551, 0d3FECD4BCA9CB5C71; +sub.f64 fd584, fd523, fd583; +fma.rn.f64 fd585, fd553, 0d3FE3F3A0E28BEDD1, fd584; +mul.f64 
fd586, fd555, 0d3FCC7B90E3024582; +sub.f64 fd587, fd585, fd586; +mul.f64 fd588, fd566, 0d3FDBC4C04D71ABC1; +mul.f64 fd589, fd568, 0d3FE904C37505DE4B; +sub.f64 fd590, fd589, fd588; +mul.f64 fd591, fd571, 0d3FEF329C0558E969; +sub.f64 fd592, fd590, fd591; +fma.rn.f64 fd593, fd556, 0d3FE3F3A0E28BEDD1, fd524; +mul.f64 fd594, fd558, 0d3FCC7B90E3024582; +sub.f64 fd595, fd593, fd594; +mul.f64 fd596, fd560, 0d3FECD4BCA9CB5C71; +sub.f64 fd597, fd595, fd596; +sub.f64 fd598, fd527, fd547; +mul.f64 fd599, fd598, 0d3FE904C37505DE4B; +sub.f64 fd600, fd531, fd543; +mul.f64 fd601, fd600, 0dBFEF329C0558E969; +sub.f64 fd602, fd601, fd599; +sub.f64 fd603, fd535, fd539; +mul.f64 fd604, fd603, 0d3FDBC4C04D71ABC1; +sub.f64 fd605, fd602, fd604; +mul.f64 fd606, fd556, 0d3FCC7B90E3024582; +sub.f64 fd607, fd524, fd606; +mul.f64 fd608, fd558, 0d3FECD4BCA9CB5C71; +sub.f64 fd609, fd607, fd608; +fma.rn.f64 fd610, fd560, 0d3FE3F3A0E28BEDD1, fd609; +mul.f64 fd611, fd598, 0d3FEF329C0558E969; +mul.f64 fd612, fd600, 0d3FDBC4C04D71ABC1; +sub.f64 fd613, fd612, fd611; +fma.rn.f64 fd614, fd603, 0d3FE904C37505DE4B, fd613; +mul.f64 fd615, fd556, 0d3FECD4BCA9CB5C71; +sub.f64 fd616, fd524, fd615; +fma.rn.f64 fd617, fd558, 0d3FE3F3A0E28BEDD1, fd616; +mul.f64 fd618, fd560, 0d3FCC7B90E3024582; +sub.f64 fd619, fd617, fd618; +mul.f64 fd620, fd598, 0d3FDBC4C04D71ABC1; +mul.f64 fd621, fd600, 0d3FE904C37505DE4B; +sub.f64 fd622, fd621, fd620; +mul.f64 fd623, fd603, 0d3FEF329C0558E969; +sub.f64 fd624, fd622, fd623; +add.f64 %1, fd560, fd559; +add.f64 %0, fd555, fd554; +add.f64 %3, fd605, fd597; +sub.f64 %2, fd565, fd573; +add.f64 %5, fd614, fd610; +sub.f64 %4, fd578, fd582; +add.f64 %7, fd624, fd619; +sub.f64 %6, fd587, fd592; +sub.f64 %9, fd619, fd624; +add.f64 %8, fd592, fd587; +sub.f64 %11, fd610, fd614; +add.f64 %10, fd582, fd578; +sub.f64 %13, fd597, fd605; +add.f64 %12, fd573, fd565; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), 
"=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_2401), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<552, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<597>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 19208, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %20, %34; +add.f64 fd30, %18, fd29; +add.f64 fd31, %23, %31; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %26, %28; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %22, %35; +add.f64 fd36, %19, fd35; +add.f64 fd37, %25, %33; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %27, %30; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %22, %35; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %25, %33; +mul.f64 fd49, fd48, 0dBFEF329C0558E969; +sub.f64 fd50, fd49, fd47; +sub.f64 fd51, %27, %30; +mul.f64 fd52, fd51, 0d3FDBC4C04D71ABC1; +sub.f64 fd53, fd50, fd52; +sub.f64 fd54, fd45, fd53; +add.f64 fd55, fd53, fd45; +mul.f64 fd56, fd29, 0d3FCC7B90E3024582; +sub.f64 fd57, %18, fd56; +mul.f64 fd58, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd59, fd57, fd58; +fma.rn.f64 fd60, fd33, 0d3FE3F3A0E28BEDD1, fd59; +mul.f64 fd61, fd46, 0d3FEF329C0558E969; +mul.f64 fd62, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd63, fd62, fd61; +fma.rn.f64 fd64, fd51, 0d3FE904C37505DE4B, fd63; +sub.f64 fd65, fd60, fd64; +add.f64 fd66, fd64, fd60; +mul.f64 
fd67, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd68, %18, fd67; +fma.rn.f64 fd69, fd31, 0d3FE3F3A0E28BEDD1, fd68; +mul.f64 fd70, fd33, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd73, fd48, 0d3FE904C37505DE4B; +sub.f64 fd74, fd73, fd72; +mul.f64 fd75, fd51, 0d3FEF329C0558E969; +sub.f64 fd76, fd74, fd75; +sub.f64 fd77, fd71, fd76; +add.f64 fd78, fd76, fd71; +fma.rn.f64 fd79, fd35, 0d3FE3F3A0E28BEDD1, %19; +mul.f64 fd80, fd37, 0d3FCC7B90E3024582; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd83, fd81, fd82; +sub.f64 fd84, %20, %34; +mul.f64 fd85, fd84, 0d3FE904C37505DE4B; +sub.f64 fd86, %23, %31; +mul.f64 fd87, fd86, 0dBFEF329C0558E969; +sub.f64 fd88, fd87, fd85; +sub.f64 fd89, %26, %28; +mul.f64 fd90, fd89, 0d3FDBC4C04D71ABC1; +sub.f64 fd91, fd88, fd90; +add.f64 fd92, fd91, fd83; +sub.f64 fd93, fd83, fd91; +mul.f64 fd94, fd35, 0d3FCC7B90E3024582; +sub.f64 fd95, %19, fd94; +mul.f64 fd96, fd37, 0d3FECD4BCA9CB5C71; +sub.f64 fd97, fd95, fd96; +fma.rn.f64 fd98, fd39, 0d3FE3F3A0E28BEDD1, fd97; +mul.f64 fd99, fd84, 0d3FEF329C0558E969; +mul.f64 fd100, fd86, 0d3FDBC4C04D71ABC1; +sub.f64 fd101, fd100, fd99; +fma.rn.f64 fd102, fd89, 0d3FE904C37505DE4B, fd101; +add.f64 fd103, fd102, fd98; +sub.f64 fd104, fd98, fd102; +mul.f64 fd105, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd106, %19, fd105; +fma.rn.f64 fd107, fd37, 0d3FE3F3A0E28BEDD1, fd106; +mul.f64 fd108, fd39, 0d3FCC7B90E3024582; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd84, 0d3FDBC4C04D71ABC1; +mul.f64 fd111, fd86, 0d3FE904C37505DE4B; +sub.f64 fd112, fd111, fd110; +mul.f64 fd113, fd89, 0d3FEF329C0558E969; +sub.f64 fd114, fd112, fd113; +add.f64 fd115, fd114, fd109; +sub.f64 fd116, fd109, fd114; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd117, fd118}, [rd6]; +mul.f64 fd121, 
fd117, fd54; +mul.f64 fd122, fd118, fd92; +sub.f64 fd123, fd121, fd122; +mul.f64 fd124, fd117, fd92; +fma.rn.f64 fd125, fd118, fd54, fd124; +mul.f64 fd126, fd117, fd117; +mul.f64 fd127, fd118, fd118; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd118, fd117; +fma.rn.f64 fd130, fd118, fd117, fd129; +mul.f64 fd131, fd128, fd65; +mul.f64 fd132, fd130, fd103; +sub.f64 fd133, fd131, fd132; +mul.f64 fd134, fd128, fd103; +fma.rn.f64 fd135, fd130, fd65, fd134; +mul.f64 fd136, fd117, fd128; +mul.f64 fd137, fd118, fd130; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd117, fd130; +fma.rn.f64 fd140, fd118, fd128, fd139; +mul.f64 fd141, fd138, fd77; +mul.f64 fd142, fd140, fd115; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd138, fd115; +fma.rn.f64 fd145, fd140, fd77, fd144; +ld.global.v2.f64 {fd146, fd147}, [rd6+5488]; +mul.f64 fd150, fd146, fd78; +mul.f64 fd151, fd147, fd116; +sub.f64 fd152, fd150, fd151; +mul.f64 fd153, fd146, fd116; +fma.rn.f64 fd154, fd147, fd78, fd153; +mul.f64 fd155, fd117, fd146; +mul.f64 fd156, fd118, fd147; +sub.f64 fd157, fd155, fd156; +mul.f64 fd158, fd117, fd147; +fma.rn.f64 fd159, fd118, fd146, fd158; +mul.f64 fd160, fd157, fd66; +mul.f64 fd161, fd159, fd104; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd157, fd104; +fma.rn.f64 fd164, fd159, fd66, fd163; +mul.f64 fd165, fd117, fd157; +mul.f64 fd166, fd118, fd159; +sub.f64 fd167, fd165, fd166; +mul.f64 fd168, fd117, fd159; +fma.rn.f64 fd169, fd118, fd157, fd168; +mul.f64 fd170, fd167, fd55; +mul.f64 fd171, fd169, fd93; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd167, fd93; +fma.rn.f64 fd174, fd169, fd55, fd173; +mad.lo.s32 r8, r5, 19208, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +st.shared.f64 [r9], fd34; +st.shared.f64 [r9+8], fd123; +st.shared.f64 [r9+16], fd133; +st.shared.f64 [r9+24], fd143; +st.shared.f64 [r9+32], fd152; +st.shared.f64 [r9+40], fd162; +st.shared.f64 [r9+48], fd172; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.f64 fd175, [r10]; +ld.shared.f64 fd176, 
[r10+2744]; +ld.shared.f64 fd177, [r10+5488]; +ld.shared.f64 fd178, [r10+8232]; +ld.shared.f64 fd179, [r10+10976]; +ld.shared.f64 fd180, [r10+13720]; +ld.shared.f64 fd181, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r9], fd40; +st.shared.f64 [r9+8], fd125; +st.shared.f64 [r9+16], fd135; +st.shared.f64 [r9+24], fd145; +st.shared.f64 [r9+32], fd154; +st.shared.f64 [r9+40], fd164; +st.shared.f64 [r9+48], fd174; +barrier.sync 0; +ld.shared.f64 fd182, [r10]; +ld.shared.f64 fd183, [r10+2744]; +ld.shared.f64 fd184, [r10+5488]; +ld.shared.f64 fd185, [r10+8232]; +ld.shared.f64 fd186, [r10+10976]; +ld.shared.f64 fd187, [r10+13720]; +ld.shared.f64 fd188, [r10+16464]; +add.f64 fd189, fd176, fd181; +add.f64 fd190, fd175, fd189; +add.f64 fd191, fd177, fd180; +add.f64 fd192, fd191, fd190; +add.f64 fd193, fd178, fd179; +add.f64 fd194, fd193, fd192; +add.f64 fd195, fd183, fd188; +add.f64 fd196, fd182, fd195; +add.f64 fd197, fd184, fd187; +add.f64 fd198, fd197, fd196; +add.f64 fd199, fd185, fd186; +add.f64 fd200, fd199, fd198; +fma.rn.f64 fd201, fd189, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd202, fd191, 0d3FCC7B90E3024582; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd193, 0d3FECD4BCA9CB5C71; +sub.f64 fd205, fd203, fd204; +sub.f64 fd206, fd183, fd188; +mul.f64 fd207, fd206, 0d3FE904C37505DE4B; +sub.f64 fd208, fd184, fd187; +mul.f64 fd209, fd208, 0dBFEF329C0558E969; +sub.f64 fd210, fd209, fd207; +sub.f64 fd211, fd185, fd186; +mul.f64 fd212, fd211, 0d3FDBC4C04D71ABC1; +sub.f64 fd213, fd210, fd212; +sub.f64 fd214, fd205, fd213; +add.f64 fd215, fd213, fd205; +mul.f64 fd216, fd189, 0d3FCC7B90E3024582; +sub.f64 fd217, fd175, fd216; +mul.f64 fd218, fd191, 0d3FECD4BCA9CB5C71; +sub.f64 fd219, fd217, fd218; +fma.rn.f64 fd220, fd193, 0d3FE3F3A0E28BEDD1, fd219; +mul.f64 fd221, fd206, 0d3FEF329C0558E969; +mul.f64 fd222, fd208, 0d3FDBC4C04D71ABC1; +sub.f64 fd223, fd222, fd221; +fma.rn.f64 fd224, fd211, 0d3FE904C37505DE4B, fd223; +sub.f64 fd225, fd220, fd224; +add.f64 fd226, fd224, fd220; 
+mul.f64 fd227, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd228, fd175, fd227; +fma.rn.f64 fd229, fd191, 0d3FE3F3A0E28BEDD1, fd228; +mul.f64 fd230, fd193, 0d3FCC7B90E3024582; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd206, 0d3FDBC4C04D71ABC1; +mul.f64 fd233, fd208, 0d3FE904C37505DE4B; +sub.f64 fd234, fd233, fd232; +mul.f64 fd235, fd211, 0d3FEF329C0558E969; +sub.f64 fd236, fd234, fd235; +sub.f64 fd237, fd231, fd236; +add.f64 fd238, fd236, fd231; +fma.rn.f64 fd239, fd195, 0d3FE3F3A0E28BEDD1, fd182; +mul.f64 fd240, fd197, 0d3FCC7B90E3024582; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, fd176, fd181; +mul.f64 fd245, fd244, 0d3FE904C37505DE4B; +sub.f64 fd246, fd177, fd180; +mul.f64 fd247, fd246, 0dBFEF329C0558E969; +sub.f64 fd248, fd247, fd245; +sub.f64 fd249, fd178, fd179; +mul.f64 fd250, fd249, 0d3FDBC4C04D71ABC1; +sub.f64 fd251, fd248, fd250; +add.f64 fd252, fd251, fd243; +sub.f64 fd253, fd243, fd251; +mul.f64 fd254, fd195, 0d3FCC7B90E3024582; +sub.f64 fd255, fd182, fd254; +mul.f64 fd256, fd197, 0d3FECD4BCA9CB5C71; +sub.f64 fd257, fd255, fd256; +fma.rn.f64 fd258, fd199, 0d3FE3F3A0E28BEDD1, fd257; +mul.f64 fd259, fd244, 0d3FEF329C0558E969; +mul.f64 fd260, fd246, 0d3FDBC4C04D71ABC1; +sub.f64 fd261, fd260, fd259; +fma.rn.f64 fd262, fd249, 0d3FE904C37505DE4B, fd261; +add.f64 fd263, fd262, fd258; +sub.f64 fd264, fd258, fd262; +mul.f64 fd265, fd195, 0d3FECD4BCA9CB5C71; +sub.f64 fd266, fd182, fd265; +fma.rn.f64 fd267, fd197, 0d3FE3F3A0E28BEDD1, fd266; +mul.f64 fd268, fd199, 0d3FCC7B90E3024582; +sub.f64 fd269, fd267, fd268; +mul.f64 fd270, fd244, 0d3FDBC4C04D71ABC1; +mul.f64 fd271, fd246, 0d3FE904C37505DE4B; +sub.f64 fd272, fd271, fd270; +mul.f64 fd273, fd249, 0d3FEF329C0558E969; +sub.f64 fd274, fd272, fd273; +add.f64 fd275, fd274, fd269; +sub.f64 fd276, fd269, fd274; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 
r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd277, fd278}, [rd11]; +mul.f64 fd281, fd277, fd214; +mul.f64 fd282, fd278, fd252; +sub.f64 fd283, fd281, fd282; +mul.f64 fd284, fd277, fd252; +fma.rn.f64 fd285, fd278, fd214, fd284; +mul.f64 fd286, fd277, fd277; +mul.f64 fd287, fd278, fd278; +sub.f64 fd288, fd286, fd287; +mul.f64 fd289, fd278, fd277; +fma.rn.f64 fd290, fd278, fd277, fd289; +mul.f64 fd291, fd288, fd225; +mul.f64 fd292, fd290, fd263; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd288, fd263; +fma.rn.f64 fd295, fd290, fd225, fd294; +mul.f64 fd296, fd277, fd288; +mul.f64 fd297, fd278, fd290; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd277, fd290; +fma.rn.f64 fd300, fd278, fd288, fd299; +mul.f64 fd301, fd298, fd237; +mul.f64 fd302, fd300, fd275; +sub.f64 fd303, fd301, fd302; +mul.f64 fd304, fd298, fd275; +fma.rn.f64 fd305, fd300, fd237, fd304; +ld.global.v2.f64 {fd306, fd307}, [rd11+784]; +mul.f64 fd310, fd306, fd238; +mul.f64 fd311, fd307, fd276; +sub.f64 fd312, fd310, fd311; +mul.f64 fd313, fd306, fd276; +fma.rn.f64 fd314, fd307, fd238, fd313; +mul.f64 fd315, fd277, fd306; +mul.f64 fd316, fd278, fd307; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd277, fd307; +fma.rn.f64 fd319, fd278, fd306, fd318; +mul.f64 fd320, fd317, fd226; +mul.f64 fd321, fd319, fd264; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd317, fd264; +fma.rn.f64 fd324, fd319, fd226, fd323; +mul.f64 fd325, fd277, fd317; +mul.f64 fd326, fd278, fd319; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd277, fd319; +fma.rn.f64 fd329, fd278, fd317, fd328; +mul.f64 fd330, fd327, fd215; +mul.f64 fd331, fd329, fd253; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd327, fd253; +fma.rn.f64 fd334, fd329, fd215, fd333; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +st.shared.f64 [r20], fd194; +st.shared.f64 [r20+56], 
fd283; +st.shared.f64 [r20+112], fd293; +st.shared.f64 [r20+168], fd303; +st.shared.f64 [r20+224], fd312; +st.shared.f64 [r20+280], fd322; +st.shared.f64 [r20+336], fd332; +barrier.sync 0; +ld.shared.f64 fd335, [r10]; +ld.shared.f64 fd336, [r10+2744]; +ld.shared.f64 fd337, [r10+5488]; +ld.shared.f64 fd338, [r10+8232]; +ld.shared.f64 fd339, [r10+10976]; +ld.shared.f64 fd340, [r10+13720]; +ld.shared.f64 fd341, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r20], fd200; +st.shared.f64 [r20+56], fd285; +st.shared.f64 [r20+112], fd295; +st.shared.f64 [r20+168], fd305; +st.shared.f64 [r20+224], fd314; +st.shared.f64 [r20+280], fd324; +st.shared.f64 [r20+336], fd334; +barrier.sync 0; +ld.shared.f64 fd342, [r10]; +ld.shared.f64 fd343, [r10+2744]; +ld.shared.f64 fd344, [r10+5488]; +ld.shared.f64 fd345, [r10+8232]; +ld.shared.f64 fd346, [r10+10976]; +ld.shared.f64 fd347, [r10+13720]; +ld.shared.f64 fd348, [r10+16464]; +add.f64 fd349, fd336, fd341; +add.f64 fd350, fd335, fd349; +add.f64 fd351, fd337, fd340; +add.f64 fd352, fd351, fd350; +add.f64 fd353, fd338, fd339; +add.f64 fd354, fd353, fd352; +add.f64 fd355, fd343, fd348; +add.f64 fd356, fd342, fd355; +add.f64 fd357, fd344, fd347; +add.f64 fd358, fd357, fd356; +add.f64 fd359, fd345, fd346; +add.f64 fd360, fd359, fd358; +fma.rn.f64 fd361, fd349, 0d3FE3F3A0E28BEDD1, fd335; +mul.f64 fd362, fd351, 0d3FCC7B90E3024582; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd353, 0d3FECD4BCA9CB5C71; +sub.f64 fd365, fd363, fd364; +sub.f64 fd366, fd343, fd348; +mul.f64 fd367, fd366, 0d3FE904C37505DE4B; +sub.f64 fd368, fd344, fd347; +mul.f64 fd369, fd368, 0dBFEF329C0558E969; +sub.f64 fd370, fd369, fd367; +sub.f64 fd371, fd345, fd346; +mul.f64 fd372, fd371, 0d3FDBC4C04D71ABC1; +sub.f64 fd373, fd370, fd372; +sub.f64 fd374, fd365, fd373; +add.f64 fd375, fd373, fd365; +mul.f64 fd376, fd349, 0d3FCC7B90E3024582; +sub.f64 fd377, fd335, fd376; +mul.f64 fd378, fd351, 0d3FECD4BCA9CB5C71; +sub.f64 fd379, fd377, fd378; +fma.rn.f64 fd380, fd353, 
0d3FE3F3A0E28BEDD1, fd379; +mul.f64 fd381, fd366, 0d3FEF329C0558E969; +mul.f64 fd382, fd368, 0d3FDBC4C04D71ABC1; +sub.f64 fd383, fd382, fd381; +fma.rn.f64 fd384, fd371, 0d3FE904C37505DE4B, fd383; +sub.f64 fd385, fd380, fd384; +add.f64 fd386, fd384, fd380; +mul.f64 fd387, fd349, 0d3FECD4BCA9CB5C71; +sub.f64 fd388, fd335, fd387; +fma.rn.f64 fd389, fd351, 0d3FE3F3A0E28BEDD1, fd388; +mul.f64 fd390, fd353, 0d3FCC7B90E3024582; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd366, 0d3FDBC4C04D71ABC1; +mul.f64 fd393, fd368, 0d3FE904C37505DE4B; +sub.f64 fd394, fd393, fd392; +mul.f64 fd395, fd371, 0d3FEF329C0558E969; +sub.f64 fd396, fd394, fd395; +sub.f64 fd397, fd391, fd396; +add.f64 fd398, fd396, fd391; +fma.rn.f64 fd399, fd355, 0d3FE3F3A0E28BEDD1, fd342; +mul.f64 fd400, fd357, 0d3FCC7B90E3024582; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd359, 0d3FECD4BCA9CB5C71; +sub.f64 fd403, fd401, fd402; +sub.f64 fd404, fd336, fd341; +mul.f64 fd405, fd404, 0d3FE904C37505DE4B; +sub.f64 fd406, fd337, fd340; +mul.f64 fd407, fd406, 0dBFEF329C0558E969; +sub.f64 fd408, fd407, fd405; +sub.f64 fd409, fd338, fd339; +mul.f64 fd410, fd409, 0d3FDBC4C04D71ABC1; +sub.f64 fd411, fd408, fd410; +add.f64 fd412, fd411, fd403; +sub.f64 fd413, fd403, fd411; +mul.f64 fd414, fd355, 0d3FCC7B90E3024582; +sub.f64 fd415, fd342, fd414; +mul.f64 fd416, fd357, 0d3FECD4BCA9CB5C71; +sub.f64 fd417, fd415, fd416; +fma.rn.f64 fd418, fd359, 0d3FE3F3A0E28BEDD1, fd417; +mul.f64 fd419, fd404, 0d3FEF329C0558E969; +mul.f64 fd420, fd406, 0d3FDBC4C04D71ABC1; +sub.f64 fd421, fd420, fd419; +fma.rn.f64 fd422, fd409, 0d3FE904C37505DE4B, fd421; +add.f64 fd423, fd422, fd418; +sub.f64 fd424, fd418, fd422; +mul.f64 fd425, fd355, 0d3FECD4BCA9CB5C71; +sub.f64 fd426, fd342, fd425; +fma.rn.f64 fd427, fd357, 0d3FE3F3A0E28BEDD1, fd426; +mul.f64 fd428, fd359, 0d3FCC7B90E3024582; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd404, 0d3FDBC4C04D71ABC1; +mul.f64 fd431, fd406, 0d3FE904C37505DE4B; +sub.f64 fd432, fd431, fd430; +mul.f64 
fd433, fd409, 0d3FEF329C0558E969; +sub.f64 fd434, fd432, fd433; +add.f64 fd435, fd434, fd429; +sub.f64 fd436, fd429, fd434; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 16; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd437, fd438}, [rd16]; +mul.f64 fd441, fd437, fd374; +mul.f64 fd442, fd438, fd412; +sub.f64 fd443, fd441, fd442; +mul.f64 fd444, fd437, fd412; +fma.rn.f64 fd445, fd438, fd374, fd444; +mul.f64 fd446, fd437, fd437; +mul.f64 fd447, fd438, fd438; +sub.f64 fd448, fd446, fd447; +mul.f64 fd449, fd438, fd437; +fma.rn.f64 fd450, fd438, fd437, fd449; +mul.f64 fd451, fd448, fd385; +mul.f64 fd452, fd450, fd423; +sub.f64 fd453, fd451, fd452; +mul.f64 fd454, fd448, fd423; +fma.rn.f64 fd455, fd450, fd385, fd454; +mul.f64 fd456, fd437, fd448; +mul.f64 fd457, fd438, fd450; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd437, fd450; +fma.rn.f64 fd460, fd438, fd448, fd459; +mul.f64 fd461, fd458, fd397; +mul.f64 fd462, fd460, fd435; +sub.f64 fd463, fd461, fd462; +mul.f64 fd464, fd458, fd435; +fma.rn.f64 fd465, fd460, fd397, fd464; +ld.global.v2.f64 {fd466, fd467}, [rd16+112]; +mul.f64 fd470, fd466, fd398; +mul.f64 fd471, fd467, fd436; +sub.f64 fd472, fd470, fd471; +mul.f64 fd473, fd466, fd436; +fma.rn.f64 fd474, fd467, fd398, fd473; +mul.f64 fd475, fd437, fd466; +mul.f64 fd476, fd438, fd467; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd437, fd467; +fma.rn.f64 fd479, fd438, fd466, fd478; +mul.f64 fd480, fd477, fd386; +mul.f64 fd481, fd479, fd424; +sub.f64 fd482, fd480, fd481; +mul.f64 fd483, fd477, fd424; +fma.rn.f64 fd484, fd479, fd386, fd483; +mul.f64 fd485, fd437, fd477; +mul.f64 fd486, fd438, fd479; +sub.f64 fd487, fd485, fd486; +mul.f64 fd488, fd437, fd479; +fma.rn.f64 fd489, fd438, fd477, fd488; +mul.f64 fd490, fd487, fd375; +mul.f64 fd491, fd489, fd413; +sub.f64 fd492, fd490, fd491; +mul.f64 fd493, fd487, fd413; +fma.rn.f64 
fd494, fd489, fd375, fd493; +shl.b32 r24, r23, 3; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 2744, r25; +st.shared.f64 [r26], fd354; +st.shared.f64 [r26+392], fd443; +st.shared.f64 [r26+784], fd453; +st.shared.f64 [r26+1176], fd463; +st.shared.f64 [r26+1568], fd472; +st.shared.f64 [r26+1960], fd482; +st.shared.f64 [r26+2352], fd492; +barrier.sync 0; +ld.shared.f64 fd495, [r10]; +ld.shared.f64 fd496, [r10+2744]; +ld.shared.f64 fd497, [r10+5488]; +ld.shared.f64 fd498, [r10+8232]; +ld.shared.f64 fd499, [r10+10976]; +ld.shared.f64 fd500, [r10+13720]; +ld.shared.f64 fd501, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r26], fd360; +st.shared.f64 [r26+392], fd445; +st.shared.f64 [r26+784], fd455; +st.shared.f64 [r26+1176], fd465; +st.shared.f64 [r26+1568], fd474; +st.shared.f64 [r26+1960], fd484; +st.shared.f64 [r26+2352], fd494; +barrier.sync 0; +ld.shared.f64 fd502, [r10]; +ld.shared.f64 fd503, [r10+2744]; +ld.shared.f64 fd504, [r10+5488]; +ld.shared.f64 fd505, [r10+8232]; +ld.shared.f64 fd506, [r10+10976]; +ld.shared.f64 fd507, [r10+13720]; +ld.shared.f64 fd508, [r10+16464]; +add.f64 fd509, fd496, fd501; +add.f64 fd510, fd495, fd509; +add.f64 fd511, fd497, fd500; +add.f64 fd512, fd511, fd510; +add.f64 fd513, fd498, fd499; +add.f64 fd514, fd503, fd508; +add.f64 fd515, fd502, fd514; +add.f64 fd516, fd504, fd507; +add.f64 fd517, fd516, fd515; +add.f64 fd518, fd505, fd506; +fma.rn.f64 fd519, fd509, 0d3FE3F3A0E28BEDD1, fd495; +mul.f64 fd520, fd511, 0d3FCC7B90E3024582; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd513, 0d3FECD4BCA9CB5C71; +sub.f64 fd523, fd521, fd522; +sub.f64 fd524, fd503, fd508; +mul.f64 fd525, fd524, 0d3FE904C37505DE4B; +sub.f64 fd526, fd504, fd507; +mul.f64 fd527, fd526, 0dBFEF329C0558E969; +sub.f64 fd528, fd527, fd525; +sub.f64 fd529, fd505, fd506; +mul.f64 fd530, fd529, 0d3FDBC4C04D71ABC1; +sub.f64 fd531, fd528, fd530; +mul.f64 fd532, fd509, 0d3FCC7B90E3024582; +sub.f64 fd533, fd495, fd532; +mul.f64 fd534, fd511, 
0d3FECD4BCA9CB5C71; +sub.f64 fd535, fd533, fd534; +fma.rn.f64 fd536, fd513, 0d3FE3F3A0E28BEDD1, fd535; +mul.f64 fd537, fd524, 0d3FEF329C0558E969; +mul.f64 fd538, fd526, 0d3FDBC4C04D71ABC1; +sub.f64 fd539, fd538, fd537; +fma.rn.f64 fd540, fd529, 0d3FE904C37505DE4B, fd539; +mul.f64 fd541, fd509, 0d3FECD4BCA9CB5C71; +sub.f64 fd542, fd495, fd541; +fma.rn.f64 fd543, fd511, 0d3FE3F3A0E28BEDD1, fd542; +mul.f64 fd544, fd513, 0d3FCC7B90E3024582; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd524, 0d3FDBC4C04D71ABC1; +mul.f64 fd547, fd526, 0d3FE904C37505DE4B; +sub.f64 fd548, fd547, fd546; +mul.f64 fd549, fd529, 0d3FEF329C0558E969; +sub.f64 fd550, fd548, fd549; +fma.rn.f64 fd551, fd514, 0d3FE3F3A0E28BEDD1, fd502; +mul.f64 fd552, fd516, 0d3FCC7B90E3024582; +sub.f64 fd553, fd551, fd552; +mul.f64 fd554, fd518, 0d3FECD4BCA9CB5C71; +sub.f64 fd555, fd553, fd554; +sub.f64 fd556, fd496, fd501; +mul.f64 fd557, fd556, 0d3FE904C37505DE4B; +sub.f64 fd558, fd497, fd500; +mul.f64 fd559, fd558, 0dBFEF329C0558E969; +sub.f64 fd560, fd559, fd557; +sub.f64 fd561, fd498, fd499; +mul.f64 fd562, fd561, 0d3FDBC4C04D71ABC1; +sub.f64 fd563, fd560, fd562; +mul.f64 fd564, fd514, 0d3FCC7B90E3024582; +sub.f64 fd565, fd502, fd564; +mul.f64 fd566, fd516, 0d3FECD4BCA9CB5C71; +sub.f64 fd567, fd565, fd566; +fma.rn.f64 fd568, fd518, 0d3FE3F3A0E28BEDD1, fd567; +mul.f64 fd569, fd556, 0d3FEF329C0558E969; +mul.f64 fd570, fd558, 0d3FDBC4C04D71ABC1; +sub.f64 fd571, fd570, fd569; +fma.rn.f64 fd572, fd561, 0d3FE904C37505DE4B, fd571; +mul.f64 fd573, fd514, 0d3FECD4BCA9CB5C71; +sub.f64 fd574, fd502, fd573; +fma.rn.f64 fd575, fd516, 0d3FE3F3A0E28BEDD1, fd574; +mul.f64 fd576, fd518, 0d3FCC7B90E3024582; +sub.f64 fd577, fd575, fd576; +mul.f64 fd578, fd556, 0d3FDBC4C04D71ABC1; +mul.f64 fd579, fd558, 0d3FE904C37505DE4B; +sub.f64 fd580, fd579, fd578; +mul.f64 fd581, fd561, 0d3FEF329C0558E969; +sub.f64 fd582, fd580, fd581; +add.f64 %0, fd513, fd512; +add.f64 %1, fd518, fd517; +add.f64 %3, fd563, fd555; +sub.f64 %2, fd523, 
fd531; +sub.f64 %4, fd536, fd540; +add.f64 %5, fd572, fd568; +sub.f64 %6, fd545, fd550; +add.f64 %7, fd582, fd577; +add.f64 %8, fd550, fd545; +sub.f64 %9, fd577, fd582; +add.f64 %10, fd540, fd536; +sub.f64 %11, fd568, fd572; +sub.f64 %13, fd555, fd563; +add.f64 %12, fd531, fd523; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_2401), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..20c7cc8c531c8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2401_fp64_inv.hpp.inc @@ -0,0 +1,1240 @@ +#ifndef CUFFTDX_FFT_2401_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_2401_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<722, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<623>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 38416, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %20, %34; +add.f64 fd30, %18, fd29; +add.f64 fd31, %23, %31; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %26, %28; +add.f64 fd34, %22, %35; +add.f64 fd35, %19, fd34; +add.f64 fd36, %25, %33; +add.f64 
fd37, fd36, fd35; +add.f64 fd38, %27, %30; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %22, %35; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %25, %33; +fma.rn.f64 fd47, fd46, 0d3FEF329C0558E969, fd45; +sub.f64 fd48, %27, %30; +fma.rn.f64 fd49, fd48, 0d3FDBC4C04D71ABC1, fd47; +sub.f64 fd50, fd43, fd49; +add.f64 fd51, fd49, fd43; +mul.f64 fd52, fd29, 0d3FCC7B90E3024582; +sub.f64 fd53, %18, fd52; +mul.f64 fd54, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd55, fd53, fd54; +fma.rn.f64 fd56, fd33, 0d3FE3F3A0E28BEDD1, fd55; +mul.f64 fd57, fd44, 0d3FEF329C0558E969; +mul.f64 fd58, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd48, 0d3FE904C37505DE4B; +sub.f64 fd61, fd59, fd60; +sub.f64 fd62, fd56, fd61; +add.f64 fd63, fd61, fd56; +mul.f64 fd64, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd65, %18, fd64; +fma.rn.f64 fd66, fd31, 0d3FE3F3A0E28BEDD1, fd65; +mul.f64 fd67, fd33, 0d3FCC7B90E3024582; +sub.f64 fd68, fd66, fd67; +mul.f64 fd69, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd70, fd46, 0d3FE904C37505DE4B; +sub.f64 fd71, fd69, fd70; +fma.rn.f64 fd72, fd48, 0d3FEF329C0558E969, fd71; +sub.f64 fd73, fd68, fd72; +add.f64 fd74, fd72, fd68; +fma.rn.f64 fd75, fd34, 0d3FE3F3A0E28BEDD1, %19; +mul.f64 fd76, fd36, 0d3FCC7B90E3024582; +sub.f64 fd77, fd75, fd76; +mul.f64 fd78, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd79, fd77, fd78; +sub.f64 fd80, %20, %34; +mul.f64 fd81, fd80, 0d3FE904C37505DE4B; +sub.f64 fd82, %23, %31; +fma.rn.f64 fd83, fd82, 0d3FEF329C0558E969, fd81; +sub.f64 fd84, %26, %28; +fma.rn.f64 fd85, fd84, 0d3FDBC4C04D71ABC1, fd83; +add.f64 fd86, fd85, fd79; +sub.f64 fd87, fd79, fd85; +mul.f64 fd88, fd34, 0d3FCC7B90E3024582; +sub.f64 fd89, %19, fd88; +mul.f64 fd90, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd91, fd89, fd90; +fma.rn.f64 fd92, fd38, 0d3FE3F3A0E28BEDD1, fd91; +mul.f64 fd93, fd80, 0d3FEF329C0558E969; 
+mul.f64 fd94, fd82, 0d3FDBC4C04D71ABC1; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd84, 0d3FE904C37505DE4B; +sub.f64 fd97, fd95, fd96; +add.f64 fd98, fd97, fd92; +sub.f64 fd99, fd92, fd97; +mul.f64 fd100, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd101, %19, fd100; +fma.rn.f64 fd102, fd36, 0d3FE3F3A0E28BEDD1, fd101; +mul.f64 fd103, fd38, 0d3FCC7B90E3024582; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd80, 0d3FDBC4C04D71ABC1; +mul.f64 fd106, fd82, 0d3FE904C37505DE4B; +sub.f64 fd107, fd105, fd106; +fma.rn.f64 fd108, fd84, 0d3FEF329C0558E969, fd107; +add.f64 fd109, fd108, fd104; +sub.f64 fd110, fd104, fd108; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 38416, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd111, fd112}, [rd6]; +mul.f64 fd115, fd86, fd112; +mul.f64 fd116, fd50, fd112; +mul.f64 fd117, fd111, fd86; +mul.f64 fd118, fd111, fd111; +mul.f64 fd119, fd112, fd112; +sub.f64 fd120, fd118, fd119; +mul.f64 fd121, fd112, fd111; +fma.rn.f64 fd122, fd112, fd111, fd121; +mul.f64 fd123, fd98, fd122; +mul.f64 fd124, fd62, fd122; +mul.f64 fd125, fd120, fd98; +mul.f64 fd126, fd111, fd120; +mul.f64 fd127, fd112, fd122; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd111, fd122; +fma.rn.f64 fd130, fd112, fd120, fd129; +mul.f64 fd131, fd109, fd130; +mul.f64 fd132, fd73, fd130; +mul.f64 fd133, fd128, fd109; +ld.global.v2.f64 {fd134, fd135}, [rd6+5488]; +mul.f64 fd138, fd110, fd135; +mul.f64 fd139, fd74, fd135; +mul.f64 fd140, fd134, fd110; +mul.f64 fd141, fd111, fd134; +mul.f64 fd142, fd112, fd135; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd111, fd135; +fma.rn.f64 fd145, fd112, fd134, fd144; +mul.f64 fd146, fd99, fd145; +mul.f64 fd147, fd63, fd145; +mul.f64 fd148, fd143, fd99; +mul.f64 fd149, fd111, fd143; +mul.f64 fd150, fd112, fd145; +sub.f64 fd151, fd149, fd150; +mul.f64 fd152, fd111, fd145; +fma.rn.f64 fd153, fd112, fd143, 
fd152; +mul.f64 fd154, fd87, fd153; +mul.f64 fd155, fd51, fd153; +mul.f64 fd156, fd151, fd87; +barrier.sync 0; +mad.lo.s32 r9, r7, 112, r8; +add.f64 fd157, fd38, fd37; +add.f64 fd158, fd33, fd32; +st.shared.v2.f64 [r9], {fd158, fd157}; +fma.rn.f64 fd159, fd111, fd50, fd115; +sub.f64 fd160, fd117, fd116; +st.shared.v2.f64 [r9+16], {fd159, fd160}; +fma.rn.f64 fd161, fd120, fd62, fd123; +sub.f64 fd162, fd125, fd124; +st.shared.v2.f64 [r9+32], {fd161, fd162}; +sub.f64 fd163, fd133, fd132; +fma.rn.f64 fd164, fd128, fd73, fd131; +st.shared.v2.f64 [r9+48], {fd164, fd163}; +fma.rn.f64 fd165, fd134, fd74, fd138; +sub.f64 fd166, fd140, fd139; +st.shared.v2.f64 [r9+64], {fd165, fd166}; +fma.rn.f64 fd167, fd143, fd63, fd146; +sub.f64 fd168, fd148, fd147; +st.shared.v2.f64 [r9+80], {fd167, fd168}; +sub.f64 fd169, fd156, fd155; +fma.rn.f64 fd170, fd151, fd51, fd154; +st.shared.v2.f64 [r9+96], {fd170, fd169}; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.v2.f64 {fd171, fd172}, [r10]; +ld.shared.v2.f64 {fd175, fd176}, [r10+5488]; +ld.shared.v2.f64 {fd179, fd180}, [r10+10976]; +ld.shared.v2.f64 {fd183, fd184}, [r10+16464]; +ld.shared.v2.f64 {fd187, fd188}, [r10+21952]; +ld.shared.v2.f64 {fd191, fd192}, [r10+27440]; +ld.shared.v2.f64 {fd195, fd196}, [r10+32928]; +add.f64 fd199, fd175, fd195; +add.f64 fd200, fd171, fd199; +add.f64 fd201, fd179, fd191; +add.f64 fd202, fd201, fd200; +add.f64 fd203, fd183, fd187; +add.f64 fd204, fd176, fd196; +add.f64 fd205, fd172, fd204; +add.f64 fd206, fd180, fd192; +add.f64 fd207, fd206, fd205; +add.f64 fd208, fd184, fd188; +fma.rn.f64 fd209, fd199, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd210, fd201, 0d3FCC7B90E3024582; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +sub.f64 fd214, fd176, fd196; +mul.f64 fd215, fd214, 0d3FE904C37505DE4B; +sub.f64 fd216, fd180, fd192; +fma.rn.f64 fd217, fd216, 0d3FEF329C0558E969, fd215; +sub.f64 fd218, fd184, fd188; +fma.rn.f64 fd219, fd218, 
0d3FDBC4C04D71ABC1, fd217; +sub.f64 fd220, fd213, fd219; +add.f64 fd221, fd219, fd213; +mul.f64 fd222, fd199, 0d3FCC7B90E3024582; +sub.f64 fd223, fd171, fd222; +mul.f64 fd224, fd201, 0d3FECD4BCA9CB5C71; +sub.f64 fd225, fd223, fd224; +fma.rn.f64 fd226, fd203, 0d3FE3F3A0E28BEDD1, fd225; +mul.f64 fd227, fd214, 0d3FEF329C0558E969; +mul.f64 fd228, fd216, 0d3FDBC4C04D71ABC1; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd218, 0d3FE904C37505DE4B; +sub.f64 fd231, fd229, fd230; +sub.f64 fd232, fd226, fd231; +add.f64 fd233, fd231, fd226; +mul.f64 fd234, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd235, fd171, fd234; +fma.rn.f64 fd236, fd201, 0d3FE3F3A0E28BEDD1, fd235; +mul.f64 fd237, fd203, 0d3FCC7B90E3024582; +sub.f64 fd238, fd236, fd237; +mul.f64 fd239, fd214, 0d3FDBC4C04D71ABC1; +mul.f64 fd240, fd216, 0d3FE904C37505DE4B; +sub.f64 fd241, fd239, fd240; +fma.rn.f64 fd242, fd218, 0d3FEF329C0558E969, fd241; +sub.f64 fd243, fd238, fd242; +add.f64 fd244, fd242, fd238; +fma.rn.f64 fd245, fd204, 0d3FE3F3A0E28BEDD1, fd172; +mul.f64 fd246, fd206, 0d3FCC7B90E3024582; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd175, fd195; +mul.f64 fd251, fd250, 0d3FE904C37505DE4B; +sub.f64 fd252, fd179, fd191; +fma.rn.f64 fd253, fd252, 0d3FEF329C0558E969, fd251; +sub.f64 fd254, fd183, fd187; +fma.rn.f64 fd255, fd254, 0d3FDBC4C04D71ABC1, fd253; +add.f64 fd256, fd255, fd249; +sub.f64 fd257, fd249, fd255; +mul.f64 fd258, fd204, 0d3FCC7B90E3024582; +sub.f64 fd259, fd172, fd258; +mul.f64 fd260, fd206, 0d3FECD4BCA9CB5C71; +sub.f64 fd261, fd259, fd260; +fma.rn.f64 fd262, fd208, 0d3FE3F3A0E28BEDD1, fd261; +mul.f64 fd263, fd250, 0d3FEF329C0558E969; +mul.f64 fd264, fd252, 0d3FDBC4C04D71ABC1; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd254, 0d3FE904C37505DE4B; +sub.f64 fd267, fd265, fd266; +add.f64 fd268, fd267, fd262; +sub.f64 fd269, fd262, fd267; +mul.f64 fd270, fd204, 0d3FECD4BCA9CB5C71; +sub.f64 fd271, fd172, fd270; +fma.rn.f64 
fd272, fd206, 0d3FE3F3A0E28BEDD1, fd271; +mul.f64 fd273, fd208, 0d3FCC7B90E3024582; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd250, 0d3FDBC4C04D71ABC1; +mul.f64 fd276, fd252, 0d3FE904C37505DE4B; +sub.f64 fd277, fd275, fd276; +fma.rn.f64 fd278, fd254, 0d3FEF329C0558E969, fd277; +add.f64 fd279, fd278, fd274; +sub.f64 fd280, fd274, fd278; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd281, fd282}, [rd11]; +mul.f64 fd285, fd256, fd282; +mul.f64 fd286, fd220, fd282; +mul.f64 fd287, fd281, fd256; +mul.f64 fd288, fd281, fd281; +mul.f64 fd289, fd282, fd282; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd282, fd281; +fma.rn.f64 fd292, fd282, fd281, fd291; +mul.f64 fd293, fd268, fd292; +mul.f64 fd294, fd232, fd292; +mul.f64 fd295, fd290, fd268; +mul.f64 fd296, fd281, fd290; +mul.f64 fd297, fd282, fd292; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd281, fd292; +fma.rn.f64 fd300, fd282, fd290, fd299; +mul.f64 fd301, fd279, fd300; +mul.f64 fd302, fd243, fd300; +mul.f64 fd303, fd298, fd279; +ld.global.v2.f64 {fd304, fd305}, [rd11+784]; +mul.f64 fd308, fd280, fd305; +mul.f64 fd309, fd244, fd305; +mul.f64 fd310, fd304, fd280; +mul.f64 fd311, fd281, fd304; +mul.f64 fd312, fd282, fd305; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd281, fd305; +fma.rn.f64 fd315, fd282, fd304, fd314; +mul.f64 fd316, fd269, fd315; +mul.f64 fd317, fd233, fd315; +mul.f64 fd318, fd313, fd269; +mul.f64 fd319, fd281, fd313; +mul.f64 fd320, fd282, fd315; +sub.f64 fd321, fd319, fd320; +mul.f64 fd322, fd281, fd315; +fma.rn.f64 fd323, fd282, fd313, fd322; +mul.f64 fd324, fd257, fd323; +mul.f64 fd325, fd221, fd323; +mul.f64 fd326, fd321, fd257; +shl.b32 r18, r17, 4; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 784, r19; 
+add.f64 fd327, fd208, fd207; +add.f64 fd328, fd203, fd202; +st.shared.v2.f64 [r20], {fd328, fd327}; +fma.rn.f64 fd329, fd281, fd220, fd285; +sub.f64 fd330, fd287, fd286; +st.shared.v2.f64 [r20+112], {fd329, fd330}; +fma.rn.f64 fd331, fd290, fd232, fd293; +sub.f64 fd332, fd295, fd294; +st.shared.v2.f64 [r20+224], {fd331, fd332}; +fma.rn.f64 fd333, fd298, fd243, fd301; +sub.f64 fd334, fd303, fd302; +st.shared.v2.f64 [r20+336], {fd333, fd334}; +fma.rn.f64 fd335, fd304, fd244, fd308; +sub.f64 fd336, fd310, fd309; +st.shared.v2.f64 [r20+448], {fd335, fd336}; +fma.rn.f64 fd337, fd313, fd233, fd316; +sub.f64 fd338, fd318, fd317; +st.shared.v2.f64 [r20+560], {fd337, fd338}; +fma.rn.f64 fd339, fd321, fd221, fd324; +sub.f64 fd340, fd326, fd325; +st.shared.v2.f64 [r20+672], {fd339, fd340}; +barrier.sync 0; +ld.shared.v2.f64 {fd341, fd342}, [r10]; +ld.shared.v2.f64 {fd345, fd346}, [r10+5488]; +ld.shared.v2.f64 {fd349, fd350}, [r10+10976]; +ld.shared.v2.f64 {fd353, fd354}, [r10+16464]; +ld.shared.v2.f64 {fd357, fd358}, [r10+21952]; +ld.shared.v2.f64 {fd361, fd362}, [r10+27440]; +ld.shared.v2.f64 {fd365, fd366}, [r10+32928]; +add.f64 fd369, fd345, fd365; +add.f64 fd370, fd341, fd369; +add.f64 fd371, fd349, fd361; +add.f64 fd372, fd371, fd370; +add.f64 fd373, fd353, fd357; +add.f64 fd374, fd346, fd366; +add.f64 fd375, fd342, fd374; +add.f64 fd376, fd350, fd362; +add.f64 fd377, fd376, fd375; +add.f64 fd378, fd354, fd358; +fma.rn.f64 fd379, fd369, 0d3FE3F3A0E28BEDD1, fd341; +mul.f64 fd380, fd371, 0d3FCC7B90E3024582; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd373, 0d3FECD4BCA9CB5C71; +sub.f64 fd383, fd381, fd382; +sub.f64 fd384, fd346, fd366; +mul.f64 fd385, fd384, 0d3FE904C37505DE4B; +sub.f64 fd386, fd350, fd362; +fma.rn.f64 fd387, fd386, 0d3FEF329C0558E969, fd385; +sub.f64 fd388, fd354, fd358; +fma.rn.f64 fd389, fd388, 0d3FDBC4C04D71ABC1, fd387; +sub.f64 fd390, fd383, fd389; +add.f64 fd391, fd389, fd383; +mul.f64 fd392, fd369, 0d3FCC7B90E3024582; +sub.f64 fd393, fd341, 
fd392; +mul.f64 fd394, fd371, 0d3FECD4BCA9CB5C71; +sub.f64 fd395, fd393, fd394; +fma.rn.f64 fd396, fd373, 0d3FE3F3A0E28BEDD1, fd395; +mul.f64 fd397, fd384, 0d3FEF329C0558E969; +mul.f64 fd398, fd386, 0d3FDBC4C04D71ABC1; +sub.f64 fd399, fd397, fd398; +mul.f64 fd400, fd388, 0d3FE904C37505DE4B; +sub.f64 fd401, fd399, fd400; +sub.f64 fd402, fd396, fd401; +add.f64 fd403, fd401, fd396; +mul.f64 fd404, fd369, 0d3FECD4BCA9CB5C71; +sub.f64 fd405, fd341, fd404; +fma.rn.f64 fd406, fd371, 0d3FE3F3A0E28BEDD1, fd405; +mul.f64 fd407, fd373, 0d3FCC7B90E3024582; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd384, 0d3FDBC4C04D71ABC1; +mul.f64 fd410, fd386, 0d3FE904C37505DE4B; +sub.f64 fd411, fd409, fd410; +fma.rn.f64 fd412, fd388, 0d3FEF329C0558E969, fd411; +sub.f64 fd413, fd408, fd412; +add.f64 fd414, fd412, fd408; +fma.rn.f64 fd415, fd374, 0d3FE3F3A0E28BEDD1, fd342; +mul.f64 fd416, fd376, 0d3FCC7B90E3024582; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd378, 0d3FECD4BCA9CB5C71; +sub.f64 fd419, fd417, fd418; +sub.f64 fd420, fd345, fd365; +mul.f64 fd421, fd420, 0d3FE904C37505DE4B; +sub.f64 fd422, fd349, fd361; +fma.rn.f64 fd423, fd422, 0d3FEF329C0558E969, fd421; +sub.f64 fd424, fd353, fd357; +fma.rn.f64 fd425, fd424, 0d3FDBC4C04D71ABC1, fd423; +add.f64 fd426, fd425, fd419; +sub.f64 fd427, fd419, fd425; +mul.f64 fd428, fd374, 0d3FCC7B90E3024582; +sub.f64 fd429, fd342, fd428; +mul.f64 fd430, fd376, 0d3FECD4BCA9CB5C71; +sub.f64 fd431, fd429, fd430; +fma.rn.f64 fd432, fd378, 0d3FE3F3A0E28BEDD1, fd431; +mul.f64 fd433, fd420, 0d3FEF329C0558E969; +mul.f64 fd434, fd422, 0d3FDBC4C04D71ABC1; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd424, 0d3FE904C37505DE4B; +sub.f64 fd437, fd435, fd436; +add.f64 fd438, fd437, fd432; +sub.f64 fd439, fd432, fd437; +mul.f64 fd440, fd374, 0d3FECD4BCA9CB5C71; +sub.f64 fd441, fd342, fd440; +fma.rn.f64 fd442, fd376, 0d3FE3F3A0E28BEDD1, fd441; +mul.f64 fd443, fd378, 0d3FCC7B90E3024582; +sub.f64 fd444, fd442, fd443; +mul.f64 fd445, fd420, 0d3FDBC4C04D71ABC1; 
+mul.f64 fd446, fd422, 0d3FE904C37505DE4B; +sub.f64 fd447, fd445, fd446; +fma.rn.f64 fd448, fd424, 0d3FEF329C0558E969, fd447; +add.f64 fd449, fd448, fd444; +sub.f64 fd450, fd444, fd448; +mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 16; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd451, fd452}, [rd16]; +mul.f64 fd455, fd426, fd452; +mul.f64 fd456, fd390, fd452; +mul.f64 fd457, fd451, fd426; +mul.f64 fd458, fd451, fd451; +mul.f64 fd459, fd452, fd452; +sub.f64 fd460, fd458, fd459; +mul.f64 fd461, fd452, fd451; +fma.rn.f64 fd462, fd452, fd451, fd461; +mul.f64 fd463, fd438, fd462; +mul.f64 fd464, fd402, fd462; +mul.f64 fd465, fd460, fd438; +mul.f64 fd466, fd451, fd460; +mul.f64 fd467, fd452, fd462; +sub.f64 fd468, fd466, fd467; +mul.f64 fd469, fd451, fd462; +fma.rn.f64 fd470, fd452, fd460, fd469; +mul.f64 fd471, fd449, fd470; +mul.f64 fd472, fd413, fd470; +mul.f64 fd473, fd468, fd449; +ld.global.v2.f64 {fd474, fd475}, [rd16+112]; +mul.f64 fd478, fd450, fd475; +mul.f64 fd479, fd414, fd475; +mul.f64 fd480, fd474, fd450; +mul.f64 fd481, fd451, fd474; +mul.f64 fd482, fd452, fd475; +sub.f64 fd483, fd481, fd482; +mul.f64 fd484, fd451, fd475; +fma.rn.f64 fd485, fd452, fd474, fd484; +mul.f64 fd486, fd439, fd485; +mul.f64 fd487, fd403, fd485; +mul.f64 fd488, fd483, fd439; +mul.f64 fd489, fd451, fd483; +mul.f64 fd490, fd452, fd485; +sub.f64 fd491, fd489, fd490; +mul.f64 fd492, fd451, fd485; +fma.rn.f64 fd493, fd452, fd483, fd492; +mul.f64 fd494, fd427, fd493; +mul.f64 fd495, fd391, fd493; +mul.f64 fd496, fd491, fd427; +shl.b32 r24, r23, 4; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 5488, r25; +add.f64 fd497, fd378, fd377; +add.f64 fd498, fd373, fd372; +st.shared.v2.f64 [r26], {fd498, fd497}; +fma.rn.f64 fd499, fd451, fd390, fd455; +sub.f64 fd500, fd457, fd456; +st.shared.v2.f64 [r26+784], {fd499, fd500}; +fma.rn.f64 fd501, 
fd460, fd402, fd463; +sub.f64 fd502, fd465, fd464; +st.shared.v2.f64 [r26+1568], {fd501, fd502}; +fma.rn.f64 fd503, fd468, fd413, fd471; +sub.f64 fd504, fd473, fd472; +st.shared.v2.f64 [r26+2352], {fd503, fd504}; +fma.rn.f64 fd505, fd474, fd414, fd478; +sub.f64 fd506, fd480, fd479; +st.shared.v2.f64 [r26+3136], {fd505, fd506}; +fma.rn.f64 fd507, fd483, fd403, fd486; +sub.f64 fd508, fd488, fd487; +st.shared.v2.f64 [r26+3920], {fd507, fd508}; +fma.rn.f64 fd509, fd491, fd391, fd494; +sub.f64 fd510, fd496, fd495; +st.shared.v2.f64 [r26+4704], {fd509, fd510}; +barrier.sync 0; +ld.shared.v2.f64 {fd511, fd512}, [r10]; +ld.shared.v2.f64 {fd515, fd516}, [r10+5488]; +ld.shared.v2.f64 {fd519, fd520}, [r10+10976]; +ld.shared.v2.f64 {fd523, fd524}, [r10+16464]; +ld.shared.v2.f64 {fd527, fd528}, [r10+21952]; +ld.shared.v2.f64 {fd531, fd532}, [r10+27440]; +ld.shared.v2.f64 {fd535, fd536}, [r10+32928]; +add.f64 fd539, fd515, fd535; +add.f64 fd540, fd511, fd539; +add.f64 fd541, fd519, fd531; +add.f64 fd542, fd541, fd540; +add.f64 fd543, fd523, fd527; +add.f64 fd544, fd516, fd536; +add.f64 fd545, fd512, fd544; +add.f64 fd546, fd520, fd532; +add.f64 fd547, fd546, fd545; +add.f64 fd548, fd524, fd528; +fma.rn.f64 fd549, fd539, 0d3FE3F3A0E28BEDD1, fd511; +mul.f64 fd550, fd541, 0d3FCC7B90E3024582; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd543, 0d3FECD4BCA9CB5C71; +sub.f64 fd553, fd551, fd552; +sub.f64 fd554, fd516, fd536; +mul.f64 fd555, fd554, 0d3FE904C37505DE4B; +sub.f64 fd556, fd520, fd532; +fma.rn.f64 fd557, fd556, 0d3FEF329C0558E969, fd555; +sub.f64 fd558, fd524, fd528; +fma.rn.f64 fd559, fd558, 0d3FDBC4C04D71ABC1, fd557; +mul.f64 fd560, fd539, 0d3FCC7B90E3024582; +sub.f64 fd561, fd511, fd560; +mul.f64 fd562, fd541, 0d3FECD4BCA9CB5C71; +sub.f64 fd563, fd561, fd562; +fma.rn.f64 fd564, fd543, 0d3FE3F3A0E28BEDD1, fd563; +mul.f64 fd565, fd554, 0d3FEF329C0558E969; +mul.f64 fd566, fd556, 0d3FDBC4C04D71ABC1; +sub.f64 fd567, fd565, fd566; +mul.f64 fd568, fd558, 0d3FE904C37505DE4B; 
+sub.f64 fd569, fd567, fd568; +mul.f64 fd570, fd539, 0d3FECD4BCA9CB5C71; +sub.f64 fd571, fd511, fd570; +fma.rn.f64 fd572, fd541, 0d3FE3F3A0E28BEDD1, fd571; +mul.f64 fd573, fd543, 0d3FCC7B90E3024582; +sub.f64 fd574, fd572, fd573; +mul.f64 fd575, fd554, 0d3FDBC4C04D71ABC1; +mul.f64 fd576, fd556, 0d3FE904C37505DE4B; +sub.f64 fd577, fd575, fd576; +fma.rn.f64 fd578, fd558, 0d3FEF329C0558E969, fd577; +fma.rn.f64 fd579, fd544, 0d3FE3F3A0E28BEDD1, fd512; +mul.f64 fd580, fd546, 0d3FCC7B90E3024582; +sub.f64 fd581, fd579, fd580; +mul.f64 fd582, fd548, 0d3FECD4BCA9CB5C71; +sub.f64 fd583, fd581, fd582; +sub.f64 fd584, fd515, fd535; +mul.f64 fd585, fd584, 0d3FE904C37505DE4B; +sub.f64 fd586, fd519, fd531; +fma.rn.f64 fd587, fd586, 0d3FEF329C0558E969, fd585; +sub.f64 fd588, fd523, fd527; +fma.rn.f64 fd589, fd588, 0d3FDBC4C04D71ABC1, fd587; +mul.f64 fd590, fd544, 0d3FCC7B90E3024582; +sub.f64 fd591, fd512, fd590; +mul.f64 fd592, fd546, 0d3FECD4BCA9CB5C71; +sub.f64 fd593, fd591, fd592; +fma.rn.f64 fd594, fd548, 0d3FE3F3A0E28BEDD1, fd593; +mul.f64 fd595, fd584, 0d3FEF329C0558E969; +mul.f64 fd596, fd586, 0d3FDBC4C04D71ABC1; +sub.f64 fd597, fd595, fd596; +mul.f64 fd598, fd588, 0d3FE904C37505DE4B; +sub.f64 fd599, fd597, fd598; +mul.f64 fd600, fd544, 0d3FECD4BCA9CB5C71; +sub.f64 fd601, fd512, fd600; +fma.rn.f64 fd602, fd546, 0d3FE3F3A0E28BEDD1, fd601; +mul.f64 fd603, fd548, 0d3FCC7B90E3024582; +sub.f64 fd604, fd602, fd603; +mul.f64 fd605, fd584, 0d3FDBC4C04D71ABC1; +mul.f64 fd606, fd586, 0d3FE904C37505DE4B; +sub.f64 fd607, fd605, fd606; +fma.rn.f64 fd608, fd588, 0d3FEF329C0558E969, fd607; +add.f64 %1, fd548, fd547; +add.f64 %0, fd543, fd542; +add.f64 %3, fd589, fd583; +sub.f64 %2, fd553, fd559; +add.f64 %5, fd599, fd594; +sub.f64 %4, fd564, fd569; +add.f64 %7, fd608, fd604; +sub.f64 %6, fd574, fd578; +sub.f64 %9, fd604, fd608; +add.f64 %8, fd578, fd574; +sub.f64 %11, fd594, fd599; +add.f64 %10, fd569, fd564; +sub.f64 %13, fd583, fd589; +add.f64 %12, fd559, fd553; +})" + : "=d"(rmem[0].x), 
"=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_2401), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<723, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<581>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 19208, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %20, %34; +add.f64 fd30, %18, fd29; +add.f64 fd31, %23, %31; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %26, %28; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %22, %35; +add.f64 fd36, %19, fd35; +add.f64 fd37, %25, %33; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %27, %30; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %22, %35; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %25, %33; +fma.rn.f64 fd49, fd48, 0d3FEF329C0558E969, fd47; +sub.f64 fd50, %27, %30; +fma.rn.f64 fd51, fd50, 0d3FDBC4C04D71ABC1, fd49; +sub.f64 fd52, fd45, fd51; +add.f64 fd53, fd51, fd45; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %18, fd54; +mul.f64 fd56, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd46, 0d3FEF329C0558E969; +mul.f64 fd60, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd59, fd60; +mul.f64 fd62, fd50, 0d3FE904C37505DE4B; 
+sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd58, fd63; +add.f64 fd65, fd63, fd58; +mul.f64 fd66, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd67, %18, fd66; +fma.rn.f64 fd68, fd31, 0d3FE3F3A0E28BEDD1, fd67; +mul.f64 fd69, fd33, 0d3FCC7B90E3024582; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd72, fd48, 0d3FE904C37505DE4B; +sub.f64 fd73, fd71, fd72; +fma.rn.f64 fd74, fd50, 0d3FEF329C0558E969, fd73; +sub.f64 fd75, fd70, fd74; +add.f64 fd76, fd74, fd70; +fma.rn.f64 fd77, fd35, 0d3FE3F3A0E28BEDD1, %19; +mul.f64 fd78, fd37, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %20, %34; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %23, %31; +fma.rn.f64 fd85, fd84, 0d3FEF329C0558E969, fd83; +sub.f64 fd86, %26, %28; +fma.rn.f64 fd87, fd86, 0d3FDBC4C04D71ABC1, fd85; +add.f64 fd88, fd87, fd81; +sub.f64 fd89, fd81, fd87; +mul.f64 fd90, fd35, 0d3FCC7B90E3024582; +sub.f64 fd91, %19, fd90; +mul.f64 fd92, fd37, 0d3FECD4BCA9CB5C71; +sub.f64 fd93, fd91, fd92; +fma.rn.f64 fd94, fd39, 0d3FE3F3A0E28BEDD1, fd93; +mul.f64 fd95, fd82, 0d3FEF329C0558E969; +mul.f64 fd96, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd86, 0d3FE904C37505DE4B; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd99, fd94; +sub.f64 fd101, fd94, fd99; +mul.f64 fd102, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd103, %19, fd102; +fma.rn.f64 fd104, fd37, 0d3FE3F3A0E28BEDD1, fd103; +mul.f64 fd105, fd39, 0d3FCC7B90E3024582; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd108, fd84, 0d3FE904C37505DE4B; +sub.f64 fd109, fd107, fd108; +fma.rn.f64 fd110, fd86, 0d3FEF329C0558E969, fd109; +add.f64 fd111, fd110, fd106; +sub.f64 fd112, fd106, fd110; +mul.wide.u32 rd2, r4, -1089394037; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 343; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd113, 
fd114}, [rd6]; +mul.f64 fd117, fd88, fd114; +fma.rn.f64 fd118, fd113, fd52, fd117; +mul.f64 fd119, fd52, fd114; +mul.f64 fd120, fd113, fd88; +sub.f64 fd121, fd120, fd119; +mul.f64 fd122, fd113, fd113; +mul.f64 fd123, fd114, fd114; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd114, fd113; +fma.rn.f64 fd126, fd114, fd113, fd125; +mul.f64 fd127, fd100, fd126; +fma.rn.f64 fd128, fd124, fd64, fd127; +mul.f64 fd129, fd64, fd126; +mul.f64 fd130, fd124, fd100; +sub.f64 fd131, fd130, fd129; +mul.f64 fd132, fd113, fd124; +mul.f64 fd133, fd114, fd126; +sub.f64 fd134, fd132, fd133; +mul.f64 fd135, fd113, fd126; +fma.rn.f64 fd136, fd114, fd124, fd135; +mul.f64 fd137, fd111, fd136; +fma.rn.f64 fd138, fd134, fd75, fd137; +mul.f64 fd139, fd75, fd136; +mul.f64 fd140, fd134, fd111; +sub.f64 fd141, fd140, fd139; +ld.global.v2.f64 {fd142, fd143}, [rd6+5488]; +mul.f64 fd146, fd112, fd143; +fma.rn.f64 fd147, fd142, fd76, fd146; +mul.f64 fd148, fd76, fd143; +mul.f64 fd149, fd142, fd112; +sub.f64 fd150, fd149, fd148; +mul.f64 fd151, fd113, fd142; +mul.f64 fd152, fd114, fd143; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd113, fd143; +fma.rn.f64 fd155, fd114, fd142, fd154; +mul.f64 fd156, fd101, fd155; +fma.rn.f64 fd157, fd153, fd65, fd156; +mul.f64 fd158, fd65, fd155; +mul.f64 fd159, fd153, fd101; +sub.f64 fd160, fd159, fd158; +mul.f64 fd161, fd113, fd153; +mul.f64 fd162, fd114, fd155; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd113, fd155; +fma.rn.f64 fd165, fd114, fd153, fd164; +mul.f64 fd166, fd89, fd165; +fma.rn.f64 fd167, fd163, fd53, fd166; +mul.f64 fd168, fd53, fd165; +mul.f64 fd169, fd163, fd89; +sub.f64 fd170, fd169, fd168; +mad.lo.s32 r8, r5, 19208, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +st.shared.f64 [r9], fd34; +st.shared.f64 [r9+8], fd118; +st.shared.f64 [r9+16], fd128; +st.shared.f64 [r9+24], fd138; +st.shared.f64 [r9+32], fd147; +st.shared.f64 [r9+40], fd157; +st.shared.f64 [r9+48], fd167; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.f64 
fd171, [r10]; +ld.shared.f64 fd172, [r10+2744]; +ld.shared.f64 fd173, [r10+5488]; +ld.shared.f64 fd174, [r10+8232]; +ld.shared.f64 fd175, [r10+10976]; +ld.shared.f64 fd176, [r10+13720]; +ld.shared.f64 fd177, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r9], fd40; +st.shared.f64 [r9+8], fd121; +st.shared.f64 [r9+16], fd131; +st.shared.f64 [r9+24], fd141; +st.shared.f64 [r9+32], fd150; +st.shared.f64 [r9+40], fd160; +st.shared.f64 [r9+48], fd170; +barrier.sync 0; +ld.shared.f64 fd178, [r10]; +ld.shared.f64 fd179, [r10+2744]; +ld.shared.f64 fd180, [r10+5488]; +ld.shared.f64 fd181, [r10+8232]; +ld.shared.f64 fd182, [r10+10976]; +ld.shared.f64 fd183, [r10+13720]; +ld.shared.f64 fd184, [r10+16464]; +add.f64 fd185, fd172, fd177; +add.f64 fd186, fd171, fd185; +add.f64 fd187, fd173, fd176; +add.f64 fd188, fd187, fd186; +add.f64 fd189, fd174, fd175; +add.f64 fd190, fd189, fd188; +add.f64 fd191, fd179, fd184; +add.f64 fd192, fd178, fd191; +add.f64 fd193, fd180, fd183; +add.f64 fd194, fd193, fd192; +add.f64 fd195, fd181, fd182; +add.f64 fd196, fd195, fd194; +fma.rn.f64 fd197, fd185, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd198, fd187, 0d3FCC7B90E3024582; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd201, fd199, fd200; +sub.f64 fd202, fd179, fd184; +mul.f64 fd203, fd202, 0d3FE904C37505DE4B; +sub.f64 fd204, fd180, fd183; +fma.rn.f64 fd205, fd204, 0d3FEF329C0558E969, fd203; +sub.f64 fd206, fd181, fd182; +fma.rn.f64 fd207, fd206, 0d3FDBC4C04D71ABC1, fd205; +sub.f64 fd208, fd201, fd207; +add.f64 fd209, fd207, fd201; +mul.f64 fd210, fd185, 0d3FCC7B90E3024582; +sub.f64 fd211, fd171, fd210; +mul.f64 fd212, fd187, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +fma.rn.f64 fd214, fd189, 0d3FE3F3A0E28BEDD1, fd213; +mul.f64 fd215, fd202, 0d3FEF329C0558E969; +mul.f64 fd216, fd204, 0d3FDBC4C04D71ABC1; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd206, 0d3FE904C37505DE4B; +sub.f64 fd219, fd217, fd218; +sub.f64 fd220, fd214, fd219; +add.f64 fd221, 
fd219, fd214; +mul.f64 fd222, fd185, 0d3FECD4BCA9CB5C71; +sub.f64 fd223, fd171, fd222; +fma.rn.f64 fd224, fd187, 0d3FE3F3A0E28BEDD1, fd223; +mul.f64 fd225, fd189, 0d3FCC7B90E3024582; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd202, 0d3FDBC4C04D71ABC1; +mul.f64 fd228, fd204, 0d3FE904C37505DE4B; +sub.f64 fd229, fd227, fd228; +fma.rn.f64 fd230, fd206, 0d3FEF329C0558E969, fd229; +sub.f64 fd231, fd226, fd230; +add.f64 fd232, fd230, fd226; +fma.rn.f64 fd233, fd191, 0d3FE3F3A0E28BEDD1, fd178; +mul.f64 fd234, fd193, 0d3FCC7B90E3024582; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd195, 0d3FECD4BCA9CB5C71; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, fd172, fd177; +mul.f64 fd239, fd238, 0d3FE904C37505DE4B; +sub.f64 fd240, fd173, fd176; +fma.rn.f64 fd241, fd240, 0d3FEF329C0558E969, fd239; +sub.f64 fd242, fd174, fd175; +fma.rn.f64 fd243, fd242, 0d3FDBC4C04D71ABC1, fd241; +add.f64 fd244, fd243, fd237; +sub.f64 fd245, fd237, fd243; +mul.f64 fd246, fd191, 0d3FCC7B90E3024582; +sub.f64 fd247, fd178, fd246; +mul.f64 fd248, fd193, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +fma.rn.f64 fd250, fd195, 0d3FE3F3A0E28BEDD1, fd249; +mul.f64 fd251, fd238, 0d3FEF329C0558E969; +mul.f64 fd252, fd240, 0d3FDBC4C04D71ABC1; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd242, 0d3FE904C37505DE4B; +sub.f64 fd255, fd253, fd254; +add.f64 fd256, fd255, fd250; +sub.f64 fd257, fd250, fd255; +mul.f64 fd258, fd191, 0d3FECD4BCA9CB5C71; +sub.f64 fd259, fd178, fd258; +fma.rn.f64 fd260, fd193, 0d3FE3F3A0E28BEDD1, fd259; +mul.f64 fd261, fd195, 0d3FCC7B90E3024582; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd238, 0d3FDBC4C04D71ABC1; +mul.f64 fd264, fd240, 0d3FE904C37505DE4B; +sub.f64 fd265, fd263, fd264; +fma.rn.f64 fd266, fd242, 0d3FEF329C0558E969, fd265; +add.f64 fd267, fd266, fd262; +sub.f64 fd268, fd262, fd266; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 
r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd269, fd270}, [rd11]; +mul.f64 fd273, fd244, fd270; +fma.rn.f64 fd274, fd269, fd208, fd273; +mul.f64 fd275, fd208, fd270; +mul.f64 fd276, fd269, fd244; +sub.f64 fd277, fd276, fd275; +mul.f64 fd278, fd269, fd269; +mul.f64 fd279, fd270, fd270; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd270, fd269; +fma.rn.f64 fd282, fd270, fd269, fd281; +mul.f64 fd283, fd256, fd282; +fma.rn.f64 fd284, fd280, fd220, fd283; +mul.f64 fd285, fd220, fd282; +mul.f64 fd286, fd280, fd256; +sub.f64 fd287, fd286, fd285; +mul.f64 fd288, fd269, fd280; +mul.f64 fd289, fd270, fd282; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd269, fd282; +fma.rn.f64 fd292, fd270, fd280, fd291; +mul.f64 fd293, fd267, fd292; +fma.rn.f64 fd294, fd290, fd231, fd293; +mul.f64 fd295, fd231, fd292; +mul.f64 fd296, fd290, fd267; +sub.f64 fd297, fd296, fd295; +ld.global.v2.f64 {fd298, fd299}, [rd11+784]; +mul.f64 fd302, fd268, fd299; +fma.rn.f64 fd303, fd298, fd232, fd302; +mul.f64 fd304, fd232, fd299; +mul.f64 fd305, fd298, fd268; +sub.f64 fd306, fd305, fd304; +mul.f64 fd307, fd269, fd298; +mul.f64 fd308, fd270, fd299; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd269, fd299; +fma.rn.f64 fd311, fd270, fd298, fd310; +mul.f64 fd312, fd257, fd311; +fma.rn.f64 fd313, fd309, fd221, fd312; +mul.f64 fd314, fd221, fd311; +mul.f64 fd315, fd309, fd257; +sub.f64 fd316, fd315, fd314; +mul.f64 fd317, fd269, fd309; +mul.f64 fd318, fd270, fd311; +sub.f64 fd319, fd317, fd318; +mul.f64 fd320, fd269, fd311; +fma.rn.f64 fd321, fd270, fd309, fd320; +mul.f64 fd322, fd245, fd321; +fma.rn.f64 fd323, fd319, fd209, fd322; +mul.f64 fd324, fd209, fd321; +mul.f64 fd325, fd319, fd245; +sub.f64 fd326, fd325, fd324; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +st.shared.f64 [r20], fd190; +st.shared.f64 [r20+56], fd274; +st.shared.f64 [r20+112], fd284; 
+st.shared.f64 [r20+168], fd294; +st.shared.f64 [r20+224], fd303; +st.shared.f64 [r20+280], fd313; +st.shared.f64 [r20+336], fd323; +barrier.sync 0; +ld.shared.f64 fd327, [r10]; +ld.shared.f64 fd328, [r10+2744]; +ld.shared.f64 fd329, [r10+5488]; +ld.shared.f64 fd330, [r10+8232]; +ld.shared.f64 fd331, [r10+10976]; +ld.shared.f64 fd332, [r10+13720]; +ld.shared.f64 fd333, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r20], fd196; +st.shared.f64 [r20+56], fd277; +st.shared.f64 [r20+112], fd287; +st.shared.f64 [r20+168], fd297; +st.shared.f64 [r20+224], fd306; +st.shared.f64 [r20+280], fd316; +st.shared.f64 [r20+336], fd326; +barrier.sync 0; +ld.shared.f64 fd334, [r10]; +ld.shared.f64 fd335, [r10+2744]; +ld.shared.f64 fd336, [r10+5488]; +ld.shared.f64 fd337, [r10+8232]; +ld.shared.f64 fd338, [r10+10976]; +ld.shared.f64 fd339, [r10+13720]; +ld.shared.f64 fd340, [r10+16464]; +add.f64 fd341, fd328, fd333; +add.f64 fd342, fd327, fd341; +add.f64 fd343, fd329, fd332; +add.f64 fd344, fd343, fd342; +add.f64 fd345, fd330, fd331; +add.f64 fd346, fd345, fd344; +add.f64 fd347, fd335, fd340; +add.f64 fd348, fd334, fd347; +add.f64 fd349, fd336, fd339; +add.f64 fd350, fd349, fd348; +add.f64 fd351, fd337, fd338; +add.f64 fd352, fd351, fd350; +fma.rn.f64 fd353, fd341, 0d3FE3F3A0E28BEDD1, fd327; +mul.f64 fd354, fd343, 0d3FCC7B90E3024582; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd345, 0d3FECD4BCA9CB5C71; +sub.f64 fd357, fd355, fd356; +sub.f64 fd358, fd335, fd340; +mul.f64 fd359, fd358, 0d3FE904C37505DE4B; +sub.f64 fd360, fd336, fd339; +fma.rn.f64 fd361, fd360, 0d3FEF329C0558E969, fd359; +sub.f64 fd362, fd337, fd338; +fma.rn.f64 fd363, fd362, 0d3FDBC4C04D71ABC1, fd361; +sub.f64 fd364, fd357, fd363; +add.f64 fd365, fd363, fd357; +mul.f64 fd366, fd341, 0d3FCC7B90E3024582; +sub.f64 fd367, fd327, fd366; +mul.f64 fd368, fd343, 0d3FECD4BCA9CB5C71; +sub.f64 fd369, fd367, fd368; +fma.rn.f64 fd370, fd345, 0d3FE3F3A0E28BEDD1, fd369; +mul.f64 fd371, fd358, 0d3FEF329C0558E969; +mul.f64 fd372, 
fd360, 0d3FDBC4C04D71ABC1; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd362, 0d3FE904C37505DE4B; +sub.f64 fd375, fd373, fd374; +sub.f64 fd376, fd370, fd375; +add.f64 fd377, fd375, fd370; +mul.f64 fd378, fd341, 0d3FECD4BCA9CB5C71; +sub.f64 fd379, fd327, fd378; +fma.rn.f64 fd380, fd343, 0d3FE3F3A0E28BEDD1, fd379; +mul.f64 fd381, fd345, 0d3FCC7B90E3024582; +sub.f64 fd382, fd380, fd381; +mul.f64 fd383, fd358, 0d3FDBC4C04D71ABC1; +mul.f64 fd384, fd360, 0d3FE904C37505DE4B; +sub.f64 fd385, fd383, fd384; +fma.rn.f64 fd386, fd362, 0d3FEF329C0558E969, fd385; +sub.f64 fd387, fd382, fd386; +add.f64 fd388, fd386, fd382; +fma.rn.f64 fd389, fd347, 0d3FE3F3A0E28BEDD1, fd334; +mul.f64 fd390, fd349, 0d3FCC7B90E3024582; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd351, 0d3FECD4BCA9CB5C71; +sub.f64 fd393, fd391, fd392; +sub.f64 fd394, fd328, fd333; +mul.f64 fd395, fd394, 0d3FE904C37505DE4B; +sub.f64 fd396, fd329, fd332; +fma.rn.f64 fd397, fd396, 0d3FEF329C0558E969, fd395; +sub.f64 fd398, fd330, fd331; +fma.rn.f64 fd399, fd398, 0d3FDBC4C04D71ABC1, fd397; +add.f64 fd400, fd399, fd393; +sub.f64 fd401, fd393, fd399; +mul.f64 fd402, fd347, 0d3FCC7B90E3024582; +sub.f64 fd403, fd334, fd402; +mul.f64 fd404, fd349, 0d3FECD4BCA9CB5C71; +sub.f64 fd405, fd403, fd404; +fma.rn.f64 fd406, fd351, 0d3FE3F3A0E28BEDD1, fd405; +mul.f64 fd407, fd394, 0d3FEF329C0558E969; +mul.f64 fd408, fd396, 0d3FDBC4C04D71ABC1; +sub.f64 fd409, fd407, fd408; +mul.f64 fd410, fd398, 0d3FE904C37505DE4B; +sub.f64 fd411, fd409, fd410; +add.f64 fd412, fd411, fd406; +sub.f64 fd413, fd406, fd411; +mul.f64 fd414, fd347, 0d3FECD4BCA9CB5C71; +sub.f64 fd415, fd334, fd414; +fma.rn.f64 fd416, fd349, 0d3FE3F3A0E28BEDD1, fd415; +mul.f64 fd417, fd351, 0d3FCC7B90E3024582; +sub.f64 fd418, fd416, fd417; +mul.f64 fd419, fd394, 0d3FDBC4C04D71ABC1; +mul.f64 fd420, fd396, 0d3FE904C37505DE4B; +sub.f64 fd421, fd419, fd420; +fma.rn.f64 fd422, fd398, 0d3FEF329C0558E969, fd421; +add.f64 fd423, fd422, fd418; +sub.f64 fd424, fd418, fd422; 
+mul.wide.u32 rd12, r7, 1402438301; +shr.u64 rd13, rd12, 36; +cvt.u32.u64 r21, rd13; +mul.lo.s32 r22, r21, 49; +sub.s32 r23, r7, r22; +mul.wide.u32 rd14, r21, 16; +mov.u64 rd15, %17; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd425, fd426}, [rd16]; +mul.f64 fd429, fd400, fd426; +fma.rn.f64 fd430, fd425, fd364, fd429; +mul.f64 fd431, fd364, fd426; +mul.f64 fd432, fd425, fd400; +sub.f64 fd433, fd432, fd431; +mul.f64 fd434, fd425, fd425; +mul.f64 fd435, fd426, fd426; +sub.f64 fd436, fd434, fd435; +mul.f64 fd437, fd426, fd425; +fma.rn.f64 fd438, fd426, fd425, fd437; +mul.f64 fd439, fd412, fd438; +fma.rn.f64 fd440, fd436, fd376, fd439; +mul.f64 fd441, fd376, fd438; +mul.f64 fd442, fd436, fd412; +sub.f64 fd443, fd442, fd441; +mul.f64 fd444, fd425, fd436; +mul.f64 fd445, fd426, fd438; +sub.f64 fd446, fd444, fd445; +mul.f64 fd447, fd425, fd438; +fma.rn.f64 fd448, fd426, fd436, fd447; +mul.f64 fd449, fd423, fd448; +fma.rn.f64 fd450, fd446, fd387, fd449; +mul.f64 fd451, fd387, fd448; +mul.f64 fd452, fd446, fd423; +sub.f64 fd453, fd452, fd451; +ld.global.v2.f64 {fd454, fd455}, [rd16+112]; +mul.f64 fd458, fd424, fd455; +fma.rn.f64 fd459, fd454, fd388, fd458; +mul.f64 fd460, fd388, fd455; +mul.f64 fd461, fd454, fd424; +sub.f64 fd462, fd461, fd460; +mul.f64 fd463, fd425, fd454; +mul.f64 fd464, fd426, fd455; +sub.f64 fd465, fd463, fd464; +mul.f64 fd466, fd425, fd455; +fma.rn.f64 fd467, fd426, fd454, fd466; +mul.f64 fd468, fd413, fd467; +fma.rn.f64 fd469, fd465, fd377, fd468; +mul.f64 fd470, fd377, fd467; +mul.f64 fd471, fd465, fd413; +sub.f64 fd472, fd471, fd470; +mul.f64 fd473, fd425, fd465; +mul.f64 fd474, fd426, fd467; +sub.f64 fd475, fd473, fd474; +mul.f64 fd476, fd425, fd467; +fma.rn.f64 fd477, fd426, fd465, fd476; +mul.f64 fd478, fd401, fd477; +fma.rn.f64 fd479, fd475, fd365, fd478; +mul.f64 fd480, fd365, fd477; +mul.f64 fd481, fd475, fd401; +sub.f64 fd482, fd481, fd480; +shl.b32 r24, r23, 3; +add.s32 r25, r8, r24; +barrier.sync 0; +mad.lo.s32 r26, r21, 2744, r25; 
+st.shared.f64 [r26], fd346; +st.shared.f64 [r26+392], fd430; +st.shared.f64 [r26+784], fd440; +st.shared.f64 [r26+1176], fd450; +st.shared.f64 [r26+1568], fd459; +st.shared.f64 [r26+1960], fd469; +st.shared.f64 [r26+2352], fd479; +barrier.sync 0; +ld.shared.f64 fd483, [r10]; +ld.shared.f64 fd484, [r10+2744]; +ld.shared.f64 fd485, [r10+5488]; +ld.shared.f64 fd486, [r10+8232]; +ld.shared.f64 fd487, [r10+10976]; +ld.shared.f64 fd488, [r10+13720]; +ld.shared.f64 fd489, [r10+16464]; +barrier.sync 0; +st.shared.f64 [r26], fd352; +st.shared.f64 [r26+392], fd433; +st.shared.f64 [r26+784], fd443; +st.shared.f64 [r26+1176], fd453; +st.shared.f64 [r26+1568], fd462; +st.shared.f64 [r26+1960], fd472; +st.shared.f64 [r26+2352], fd482; +barrier.sync 0; +ld.shared.f64 fd490, [r10]; +ld.shared.f64 fd491, [r10+2744]; +ld.shared.f64 fd492, [r10+5488]; +ld.shared.f64 fd493, [r10+8232]; +ld.shared.f64 fd494, [r10+10976]; +ld.shared.f64 fd495, [r10+13720]; +ld.shared.f64 fd496, [r10+16464]; +add.f64 fd497, fd484, fd489; +add.f64 fd498, fd483, fd497; +add.f64 fd499, fd485, fd488; +add.f64 fd500, fd499, fd498; +add.f64 fd501, fd486, fd487; +add.f64 fd502, fd491, fd496; +add.f64 fd503, fd490, fd502; +add.f64 fd504, fd492, fd495; +add.f64 fd505, fd504, fd503; +add.f64 fd506, fd493, fd494; +fma.rn.f64 fd507, fd497, 0d3FE3F3A0E28BEDD1, fd483; +mul.f64 fd508, fd499, 0d3FCC7B90E3024582; +sub.f64 fd509, fd507, fd508; +mul.f64 fd510, fd501, 0d3FECD4BCA9CB5C71; +sub.f64 fd511, fd509, fd510; +sub.f64 fd512, fd491, fd496; +mul.f64 fd513, fd512, 0d3FE904C37505DE4B; +sub.f64 fd514, fd492, fd495; +fma.rn.f64 fd515, fd514, 0d3FEF329C0558E969, fd513; +sub.f64 fd516, fd493, fd494; +fma.rn.f64 fd517, fd516, 0d3FDBC4C04D71ABC1, fd515; +mul.f64 fd518, fd497, 0d3FCC7B90E3024582; +sub.f64 fd519, fd483, fd518; +mul.f64 fd520, fd499, 0d3FECD4BCA9CB5C71; +sub.f64 fd521, fd519, fd520; +fma.rn.f64 fd522, fd501, 0d3FE3F3A0E28BEDD1, fd521; +mul.f64 fd523, fd512, 0d3FEF329C0558E969; +mul.f64 fd524, fd514, 
0d3FDBC4C04D71ABC1; +sub.f64 fd525, fd523, fd524; +mul.f64 fd526, fd516, 0d3FE904C37505DE4B; +sub.f64 fd527, fd525, fd526; +mul.f64 fd528, fd497, 0d3FECD4BCA9CB5C71; +sub.f64 fd529, fd483, fd528; +fma.rn.f64 fd530, fd499, 0d3FE3F3A0E28BEDD1, fd529; +mul.f64 fd531, fd501, 0d3FCC7B90E3024582; +sub.f64 fd532, fd530, fd531; +mul.f64 fd533, fd512, 0d3FDBC4C04D71ABC1; +mul.f64 fd534, fd514, 0d3FE904C37505DE4B; +sub.f64 fd535, fd533, fd534; +fma.rn.f64 fd536, fd516, 0d3FEF329C0558E969, fd535; +fma.rn.f64 fd537, fd502, 0d3FE3F3A0E28BEDD1, fd490; +mul.f64 fd538, fd504, 0d3FCC7B90E3024582; +sub.f64 fd539, fd537, fd538; +mul.f64 fd540, fd506, 0d3FECD4BCA9CB5C71; +sub.f64 fd541, fd539, fd540; +sub.f64 fd542, fd484, fd489; +mul.f64 fd543, fd542, 0d3FE904C37505DE4B; +sub.f64 fd544, fd485, fd488; +fma.rn.f64 fd545, fd544, 0d3FEF329C0558E969, fd543; +sub.f64 fd546, fd486, fd487; +fma.rn.f64 fd547, fd546, 0d3FDBC4C04D71ABC1, fd545; +mul.f64 fd548, fd502, 0d3FCC7B90E3024582; +sub.f64 fd549, fd490, fd548; +mul.f64 fd550, fd504, 0d3FECD4BCA9CB5C71; +sub.f64 fd551, fd549, fd550; +fma.rn.f64 fd552, fd506, 0d3FE3F3A0E28BEDD1, fd551; +mul.f64 fd553, fd542, 0d3FEF329C0558E969; +mul.f64 fd554, fd544, 0d3FDBC4C04D71ABC1; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd546, 0d3FE904C37505DE4B; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd502, 0d3FECD4BCA9CB5C71; +sub.f64 fd559, fd490, fd558; +fma.rn.f64 fd560, fd504, 0d3FE3F3A0E28BEDD1, fd559; +mul.f64 fd561, fd506, 0d3FCC7B90E3024582; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd542, 0d3FDBC4C04D71ABC1; +mul.f64 fd564, fd544, 0d3FE904C37505DE4B; +sub.f64 fd565, fd563, fd564; +fma.rn.f64 fd566, fd546, 0d3FEF329C0558E969, fd565; +add.f64 %0, fd501, fd500; +add.f64 %1, fd506, fd505; +add.f64 %3, fd547, fd541; +sub.f64 %2, fd511, fd517; +sub.f64 %4, fd522, fd527; +add.f64 %5, fd557, fd552; +sub.f64 %6, fd532, fd536; +add.f64 %7, fd566, fd562; +add.f64 %8, fd536, fd532; +sub.f64 %9, fd562, fd566; +add.f64 %10, fd527, fd522; +sub.f64 %11, 
fd552, fd557; +sub.f64 %13, fd541, fd547; +add.f64 %12, fd517, fd511; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_2401), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..3e0de50c4224f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_fwd.hpp.inc @@ -0,0 +1,21189 @@ +#ifndef CUFFTDX_FFT_243_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_243_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<878, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<536>; +.reg .b32 r<5779>; +.reg .b64 rd<4>; +mov.u32 r5715, %54; +mov.u32 r5778, %tid.y; +mad.lo.s32 r5716, r5778, 1944, r5715; +mov.u32 r5717, %tid.x; +mov.f32 f530, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1, {low, high}; +} +mov.f32 f532, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %74, %58; +} +{ +add.f16x2 r8, %83, r5; +} +{ +add.f16x2 r11, %91, %73; +} +{ 
+add.f16x2 r14, %95, r11; +} +{ +add.f16x2 r17, %74, %58; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %83, r20; +} +{ +sub.f16x2 r26, %91, %73; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %74, %58; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %83, r38; +} +{ +sub.f16x2 r44, %91, %73; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %91, %73; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %95, r56; +} +{ +sub.f16x2 r62, %74, %58; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %91, %73; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %95, r74; +} +{ +sub.f16x2 r80, %74, %58; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %98, %81; +} +{ +add.f16x2 r96, %102, r93; +} +{ +add.f16x2 r99, %57, %96; +} +{ +add.f16x2 r102, %63, r99; +} +{ +add.f16x2 r105, %98, %81; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %102, r108; +} +{ +sub.f16x2 r114, %57, %96; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %98, %81; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %102, r126; +} +{ +sub.f16x2 r132, %57, %96; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %57, %96; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %63, r144; +} +{ +sub.f16x2 r150, %98, %81; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %57, %96; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %63, r162; +} +{ +sub.f16x2 r168, %98, %81; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %64, %104; +} +{ +add.f16x2 r184, %72, r181; +} +{ +add.f16x2 r187, %77, %61; +} +{ +add.f16x2 r190, %88, r187; +} +{ +add.f16x2 r193, %64, %104; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %72, r196; +} +{ +sub.f16x2 r202, %77, %61; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %64, %104; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %72, r214; +} +{ +sub.f16x2 r220, %77, %61; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %77, %61; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %88, r232; +} +{ +sub.f16x2 r238, %64, %104; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %77, %61; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %88, r250; +} +{ +sub.f16x2 r256, %64, %104; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f490, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r265, {low, high}; +} +mov.f32 f492, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r266, {low, high}; +} +mov.f32 f494, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r267, {low, high}; +} +mov.f32 f496, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r268, {low, high}; +} +mov.f32 f502, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r271, {low, high}; +} +mov.f32 f504, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 
r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, 
r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %78, %60; +} +{ +add.f16x2 r616, %86, r613; +} +{ +add.f16x2 r619, %93, %76; +} +{ +add.f16x2 r622, %100, r619; +} +{ +add.f16x2 r625, %78, %60; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %86, r628; +} +{ +sub.f16x2 r634, %93, %76; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %78, %60; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %86, r646; +} +{ +sub.f16x2 r652, %93, %76; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %93, %76; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %100, r664; +} +{ +sub.f16x2 r670, %78, %60; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %93, %76; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %100, r682; +} +{ +sub.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %101, %85; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %59, %99; +} +{ 
+add.f16x2 r710, %67, r707; +} +{ +add.f16x2 r713, %101, %85; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %59, %99; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %101, %85; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %59, %99; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %59, %99; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %67, r752; +} +{ +sub.f16x2 r758, %101, %85; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %59, %99; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %67, r770; +} +{ +sub.f16x2 r776, %101, %85; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %68, %107; +} +{ +add.f16x2 r792, %75, r789; +} +{ +add.f16x2 r795, %84, %66; +} +{ +add.f16x2 r798, %92, r795; +} +{ +add.f16x2 r801, %68, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %75, r804; +} +{ +sub.f16x2 r810, %84, %66; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %68, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %75, r822; +} +{ +sub.f16x2 r828, %84, %66; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %84, %66; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %92, r840; +} +{ +sub.f16x2 r846, %68, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %84, %66; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %92, r858; +} +{ +sub.f16x2 r864, %68, %107; +} +{ 
+mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} 
+{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, 
r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %82, %65; +} +{ +add.f16x2 r1224, %90, r1221; +} +{ +add.f16x2 r1227, %97, %80; +} +{ +add.f16x2 r1230, %105, r1227; +} +{ +add.f16x2 r1233, %82, %65; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %90, r1236; +} +{ +sub.f16x2 r1242, %97, %80; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %82, %65; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ 
+add.f16x2 r1257, %90, r1254; +} +{ +sub.f16x2 r1260, %97, %80; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %97, %80; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %105, r1272; +} +{ +sub.f16x2 r1278, %82, %65; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %97, %80; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %105, r1290; +} +{ +sub.f16x2 r1296, %82, %65; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %106, %89; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %62, %103; +} +{ +add.f16x2 r1318, %70, r1315; +} +{ +add.f16x2 r1321, %106, %89; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %62, %103; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %106, %89; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %62, %103; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %62, %103; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %70, r1360; +} +{ +sub.f16x2 r1366, %106, %89; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %62, %103; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %70, r1378; +} +{ +sub.f16x2 r1384, %106, %89; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %71, %55; +} +{ +add.f16x2 r1400, %79, r1397; +} +{ +add.f16x2 r1403, %87, %69; +} +{ +add.f16x2 r1406, %94, r1403; +} +{ +add.f16x2 r1409, %71, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %79, r1412; +} +{ +sub.f16x2 r1418, %87, %69; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %71, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %79, r1430; +} +{ +sub.f16x2 r1436, %87, %69; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %87, %69; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %94, r1448; +} +{ +sub.f16x2 r1454, %71, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %87, %69; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %94, r1466; +} +{ +sub.f16x2 r1472, %71, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; 
+} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, 
f530; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ 
+sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1825, {low, high}; +} +mov.f32 f172, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1826, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1827, {low, high}; +} +mov.f32 f176, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1830, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1831, {low, high}; +} +mov.f32 f184, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1832, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1833, {low, high}; +} +mov.f32 f188, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1835, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1836, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1837, {low, high}; +} +mov.f32 f196, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1838, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1839, {low, high}; +} +mov.f32 f200, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1840, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1843, {low, high}; +} +mov.f32 f208, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1844, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1848, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1851, {low, high}; +} +mov.f32 f224, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1855, {low, high}; +} +mov.f32 f232, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 
r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 
r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ 
+sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, 
r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, 
r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} 
+{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ 
+add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, 
r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r5717, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5718, rd3; +mul.lo.s32 r5719, r5718, 9; +sub.s32 r5720, r5717, r5719; +cvt.rn.f32.u32 f533, r5720; +mul.f32 f534, f533, 0f3CD3D17E; +cos.approx.f32 f309, f534; +sin.approx.f32 f535, f534; +neg.f32 f310, f535; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, 
high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, 
r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ 
+fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, 
r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; 
+} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r5721, r5718, 1944, r5716; +barrier.sync 0; +mad.lo.s32 r5722, r5720, 216, r5721; +st.shared.v2.f32 [r5722], {r2140, r2146}; +st.shared.v2.f32 [r5722+8], {r2937, r2944}; +st.shared.v2.f32 [r5722+16], {r2974, r2981}; +st.shared.v2.f32 [r5722+24], {r3011, r3018}; +st.shared.v2.f32 [r5722+32], {r3048, r3055}; +st.shared.v2.f32 [r5722+40], {r3085, r3092}; +st.shared.v2.f32 [r5722+48], {r3122, r3129}; +st.shared.v2.f32 [r5722+56], {r3159, r3166}; +st.shared.v2.f32 [r5722+64], {r3196, r3203}; +st.shared.v2.f32 [r5722+72], {r3233, r3240}; +st.shared.v2.f32 [r5722+80], {r3270, r3277}; +st.shared.v2.f32 [r5722+88], {r3307, r3314}; 
+st.shared.v2.f32 [r5722+96], {r3344, r3351}; +st.shared.v2.f32 [r5722+104], {r3381, r3388}; +st.shared.v2.f32 [r5722+112], {r3418, r3425}; +st.shared.v2.f32 [r5722+120], {r3455, r3462}; +st.shared.v2.f32 [r5722+128], {r3492, r3499}; +st.shared.v2.f32 [r5722+136], {r3529, r3536}; +st.shared.v2.f32 [r5722+144], {r3566, r3573}; +st.shared.v2.f32 [r5722+152], {r3603, r3610}; +st.shared.v2.f32 [r5722+160], {r3640, r3647}; +st.shared.v2.f32 [r5722+168], {r3677, r3684}; +st.shared.v2.f32 [r5722+176], {r3714, r3721}; +st.shared.v2.f32 [r5722+184], {r3751, r3758}; +st.shared.v2.f32 [r5722+192], {r3788, r3795}; +st.shared.v2.f32 [r5722+200], {r3825, r3832}; +st.shared.v2.f32 [r5722+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r5723, r5720, -208, r5722; +ld.shared.u32 r3898, [r5723]; +ld.shared.u32 r3904, [r5723+4]; +ld.shared.u32 r4506, [r5723+72]; +ld.shared.u32 r4512, [r5723+76]; +ld.shared.u32 r5114, [r5723+144]; +ld.shared.u32 r5120, [r5723+148]; +ld.shared.u32 r3986, [r5723+216]; +ld.shared.u32 r3992, [r5723+220]; +ld.shared.u32 r4594, [r5723+288]; +ld.shared.u32 r4600, [r5723+292]; +ld.shared.u32 r5202, [r5723+360]; +ld.shared.u32 r5208, [r5723+364]; +ld.shared.u32 r4074, [r5723+432]; +ld.shared.u32 r4080, [r5723+436]; +ld.shared.u32 r4682, [r5723+504]; +ld.shared.u32 r4688, [r5723+508]; +ld.shared.u32 r5290, [r5723+576]; +ld.shared.u32 r5296, [r5723+580]; +ld.shared.u32 r3895, [r5723+648]; +ld.shared.u32 r3901, [r5723+652]; +ld.shared.u32 r4503, [r5723+720]; +ld.shared.u32 r4509, [r5723+724]; +ld.shared.u32 r5111, [r5723+792]; +ld.shared.u32 r5117, [r5723+796]; +ld.shared.u32 r3983, [r5723+864]; +ld.shared.u32 r3989, [r5723+868]; +ld.shared.u32 r4591, [r5723+936]; +ld.shared.u32 r4597, [r5723+940]; +ld.shared.u32 r5199, [r5723+1008]; +ld.shared.u32 r5205, [r5723+1012]; +ld.shared.u32 r4071, [r5723+1080]; +ld.shared.u32 r4077, [r5723+1084]; +ld.shared.u32 r4679, [r5723+1152]; +ld.shared.u32 r4685, [r5723+1156]; +ld.shared.u32 r5287, [r5723+1224]; +ld.shared.u32 
r5293, [r5723+1228]; +ld.shared.u32 r3896, [r5723+1296]; +ld.shared.u32 r3902, [r5723+1300]; +ld.shared.u32 r4504, [r5723+1368]; +ld.shared.u32 r4510, [r5723+1372]; +ld.shared.u32 r5112, [r5723+1440]; +ld.shared.u32 r5118, [r5723+1444]; +ld.shared.u32 r3984, [r5723+1512]; +ld.shared.u32 r3990, [r5723+1516]; +ld.shared.u32 r4592, [r5723+1584]; +ld.shared.u32 r4598, [r5723+1588]; +ld.shared.u32 r5200, [r5723+1656]; +ld.shared.u32 r5206, [r5723+1660]; +ld.shared.u32 r4072, [r5723+1728]; +ld.shared.u32 r4078, [r5723+1732]; +ld.shared.u32 r4680, [r5723+1800]; +ld.shared.u32 r4686, [r5723+1804]; +ld.shared.u32 r5288, [r5723+1872]; +ld.shared.u32 r5294, [r5723+1876]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 
r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 
r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 %0, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 %1, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 %18, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 %36, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 %19, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 %37, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 %6, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 %7, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, 
r4324; +} +{ +add.f16x2 %24, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 %42, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 %25, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 %43, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 %12, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 %13, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 %30, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 %48, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 %31, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ 
+add.f16x2 %49, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ 
+add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 %2, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 %3, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ 
+sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 %20, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 %38, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 %21, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 %39, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 %8, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 %9, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 %26, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 %44, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 %27, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 
r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 %45, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 %14, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 %15, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 %32, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 %50, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 %33, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 %51, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; 
+} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, 
r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 
high, f504; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 %4, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 %5, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 %22, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 %40, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 %23, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} 
+{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 %41, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 %10, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 %11, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 %28, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 %46, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 %29, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 %47, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 %16, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 %17, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, 
r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 %34, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 %52, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 %35, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 %53, r5702, r5708; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), 
"=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), 
"r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<879, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<536>; +.reg .b32 r<5779>; +.reg .b64 rd<4>; +mov.u32 r5715, %54; +mov.u32 r5778, %tid.y; +mad.lo.s32 r5716, r5778, 972, r5715; +mov.u32 r5717, %tid.x; +mov.f32 f530, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1, {low, high}; +} +mov.f32 f532, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %74, %58; +} +{ +add.f16x2 r8, %83, r5; +} +{ +add.f16x2 r11, %91, %73; +} +{ +add.f16x2 r14, %95, r11; +} +{ +add.f16x2 r17, %74, %58; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %83, r20; +} +{ +sub.f16x2 r26, %91, %73; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %74, %58; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %83, r38; +} +{ +sub.f16x2 r44, %91, %73; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %91, %73; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %95, r56; +} +{ +sub.f16x2 r62, %74, %58; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %91, %73; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %95, r74; +} +{ +sub.f16x2 r80, %74, %58; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; 
+mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %98, %81; +} +{ +add.f16x2 r96, %102, r93; +} +{ +add.f16x2 r99, %57, %96; +} +{ +add.f16x2 r102, %63, r99; +} +{ +add.f16x2 r105, %98, %81; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %102, r108; +} +{ +sub.f16x2 r114, %57, %96; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %98, %81; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %102, r126; +} +{ +sub.f16x2 r132, %57, %96; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %57, %96; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %63, r144; +} +{ +sub.f16x2 r150, %98, %81; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %57, %96; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %63, r162; +} +{ +sub.f16x2 r168, %98, %81; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %64, %104; +} +{ +add.f16x2 r184, %72, r181; +} +{ +add.f16x2 r187, %77, %61; +} +{ +add.f16x2 r190, %88, r187; +} +{ +add.f16x2 r193, %64, %104; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %72, r196; +} +{ +sub.f16x2 r202, %77, %61; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %64, %104; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %72, r214; +} +{ +sub.f16x2 r220, %77, %61; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %77, %61; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %88, r232; +} +{ +sub.f16x2 r238, %64, %104; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, 
r241; +} +{ +add.f16x2 r247, %77, %61; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %88, r250; +} +{ +sub.f16x2 r256, %64, %104; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f490, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r265, {low, high}; +} +mov.f32 f492, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r266, {low, high}; +} +mov.f32 f494, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r267, {low, high}; +} +mov.f32 f496, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r268, {low, high}; +} +mov.f32 f502, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r271, {low, high}; +} +mov.f32 f504, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r346, {low, high}; 
+} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, 
r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %78, %60; +} +{ +add.f16x2 r616, %86, r613; +} +{ +add.f16x2 r619, %93, %76; +} +{ +add.f16x2 r622, %100, r619; +} +{ +add.f16x2 r625, %78, %60; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %86, r628; +} +{ +sub.f16x2 r634, %93, %76; +} +{ +mul.f16x2 r637, r634, 
r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %78, %60; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %86, r646; +} +{ +sub.f16x2 r652, %93, %76; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %93, %76; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %100, r664; +} +{ +sub.f16x2 r670, %78, %60; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %93, %76; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %100, r682; +} +{ +sub.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %101, %85; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %59, %99; +} +{ +add.f16x2 r710, %67, r707; +} +{ +add.f16x2 r713, %101, %85; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %59, %99; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %101, %85; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %59, %99; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %59, %99; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %67, r752; +} +{ +sub.f16x2 r758, %101, %85; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %59, %99; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %67, r770; +} +{ +sub.f16x2 r776, %101, %85; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %68, %107; +} +{ +add.f16x2 r792, %75, r789; +} +{ +add.f16x2 r795, %84, %66; +} +{ +add.f16x2 r798, %92, r795; +} +{ +add.f16x2 r801, %68, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %75, r804; +} +{ +sub.f16x2 r810, %84, %66; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %68, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %75, r822; +} +{ +sub.f16x2 r828, %84, %66; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %84, %66; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %92, r840; +} +{ +sub.f16x2 r846, %68, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %84, %66; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %92, r858; +} +{ +sub.f16x2 r864, %68, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, 
r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} 
+{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ 
+sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %82, %65; +} +{ +add.f16x2 r1224, %90, r1221; +} +{ +add.f16x2 r1227, %97, %80; +} +{ +add.f16x2 r1230, %105, r1227; +} +{ +add.f16x2 r1233, %82, %65; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %90, r1236; +} +{ +sub.f16x2 r1242, %97, %80; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %82, %65; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %90, r1254; +} +{ +sub.f16x2 r1260, %97, %80; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %97, %80; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %105, r1272; +} +{ +sub.f16x2 r1278, %82, %65; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %97, %80; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %105, r1290; +} +{ +sub.f16x2 r1296, %82, %65; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %106, %89; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %62, %103; +} +{ +add.f16x2 r1318, %70, r1315; +} 
+{ +add.f16x2 r1321, %106, %89; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %62, %103; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %106, %89; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %62, %103; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %62, %103; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %70, r1360; +} +{ +sub.f16x2 r1366, %106, %89; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %62, %103; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %70, r1378; +} +{ +sub.f16x2 r1384, %106, %89; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %71, %55; +} +{ +add.f16x2 r1400, %79, r1397; +} +{ +add.f16x2 r1403, %87, %69; +} +{ +add.f16x2 r1406, %94, r1403; +} +{ +add.f16x2 r1409, %71, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %79, r1412; +} +{ +sub.f16x2 r1418, %87, %69; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %71, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %79, r1430; +} +{ +sub.f16x2 r1436, %87, %69; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %87, %69; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %94, r1448; +} +{ +sub.f16x2 r1454, %71, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %87, %69; +} +{ +mul.f16x2 r1466, r1463, r1393; 
+} +{ +add.f16x2 r1469, %94, r1466; +} +{ +sub.f16x2 r1472, %71, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ 
+add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} 
+{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1825, {low, high}; +} +mov.f32 f172, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1826, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1827, {low, high}; 
+} +mov.f32 f176, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1830, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1831, {low, high}; +} +mov.f32 f184, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1832, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1833, {low, high}; +} +mov.f32 f188, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1836, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1837, {low, high}; +} +mov.f32 f196, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1838, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1839, {low, high}; +} +mov.f32 f200, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1840, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1843, {low, high}; +} +mov.f32 f208, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1844, {low, high}; +} +mov.f32 f230, 
0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1848, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1851, {low, high}; +} +mov.f32 f224, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1855, {low, high}; +} +mov.f32 f232, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; 
+} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 
r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2397, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} 
+{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 
r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r5717, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5718, rd3; +mul.lo.s32 r5719, r5718, 9; +sub.s32 r5720, r5717, r5719; +mad.lo.s32 r5721, r5718, 972, r5716; +cvt.rn.f32.u32 f533, 
r5720; +mul.f32 f534, f533, 0f3CD3D17E; +cos.approx.f32 f309, f534; +sin.approx.f32 f535, f534; +neg.f32 f310, f535; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, 
r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; 
+} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 
r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ 
+mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; 
+mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, 
r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ 
+fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r5722, r5720, 108, r5721; +st.shared.u32 [r5722], r2140; +st.shared.u32 [r5722+4], r2937; +st.shared.u32 [r5722+8], r2974; +st.shared.u32 [r5722+12], r3011; +st.shared.u32 [r5722+16], r3048; +st.shared.u32 [r5722+20], r3085; +st.shared.u32 [r5722+24], r3122; +st.shared.u32 [r5722+28], r3159; +st.shared.u32 [r5722+32], r3196; +st.shared.u32 [r5722+36], r3233; +st.shared.u32 [r5722+40], r3270; +st.shared.u32 [r5722+44], r3307; +st.shared.u32 [r5722+48], r3344; +st.shared.u32 [r5722+52], r3381; +st.shared.u32 [r5722+56], r3418; +st.shared.u32 [r5722+60], r3455; +st.shared.u32 [r5722+64], r3492; +st.shared.u32 [r5722+68], r3529; +st.shared.u32 [r5722+72], r3566; +st.shared.u32 [r5722+76], r3603; +st.shared.u32 [r5722+80], r3640; +st.shared.u32 [r5722+84], r3677; +st.shared.u32 [r5722+88], r3714; +st.shared.u32 [r5722+92], r3751; +st.shared.u32 [r5722+96], r3788; +st.shared.u32 [r5722+100], r3825; +st.shared.u32 [r5722+104], r3862; +barrier.sync 0; +mad.lo.s32 r5723, r5720, -104, r5722; +ld.shared.u32 r3898, [r5723]; +ld.shared.u32 r4506, [r5723+36]; +ld.shared.u32 r5114, [r5723+72]; +ld.shared.u32 r3986, [r5723+108]; +ld.shared.u32 r4594, [r5723+144]; +ld.shared.u32 r5202, [r5723+180]; +ld.shared.u32 r4074, [r5723+216]; +ld.shared.u32 r4682, [r5723+252]; +ld.shared.u32 r5290, [r5723+288]; +ld.shared.u32 r3895, [r5723+324]; +ld.shared.u32 r4503, [r5723+360]; +ld.shared.u32 r5111, [r5723+396]; +ld.shared.u32 r3983, [r5723+432]; +ld.shared.u32 r4591, [r5723+468]; +ld.shared.u32 r5199, [r5723+504]; +ld.shared.u32 r4071, [r5723+540]; 
+ld.shared.u32 r4679, [r5723+576]; +ld.shared.u32 r5287, [r5723+612]; +ld.shared.u32 r3896, [r5723+648]; +ld.shared.u32 r4504, [r5723+684]; +ld.shared.u32 r5112, [r5723+720]; +ld.shared.u32 r3984, [r5723+756]; +ld.shared.u32 r4592, [r5723+792]; +ld.shared.u32 r5200, [r5723+828]; +ld.shared.u32 r4072, [r5723+864]; +ld.shared.u32 r4680, [r5723+900]; +ld.shared.u32 r5288, [r5723+936]; +barrier.sync 0; +st.shared.u32 [r5722], r2146; +st.shared.u32 [r5722+4], r2944; +st.shared.u32 [r5722+8], r2981; +st.shared.u32 [r5722+12], r3018; +st.shared.u32 [r5722+16], r3055; +st.shared.u32 [r5722+20], r3092; +st.shared.u32 [r5722+24], r3129; +st.shared.u32 [r5722+28], r3166; +st.shared.u32 [r5722+32], r3203; +st.shared.u32 [r5722+36], r3240; +st.shared.u32 [r5722+40], r3277; +st.shared.u32 [r5722+44], r3314; +st.shared.u32 [r5722+48], r3351; +st.shared.u32 [r5722+52], r3388; +st.shared.u32 [r5722+56], r3425; +st.shared.u32 [r5722+60], r3462; +st.shared.u32 [r5722+64], r3499; +st.shared.u32 [r5722+68], r3536; +st.shared.u32 [r5722+72], r3573; +st.shared.u32 [r5722+76], r3610; +st.shared.u32 [r5722+80], r3647; +st.shared.u32 [r5722+84], r3684; +st.shared.u32 [r5722+88], r3721; +st.shared.u32 [r5722+92], r3758; +st.shared.u32 [r5722+96], r3795; +st.shared.u32 [r5722+100], r3832; +st.shared.u32 [r5722+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r5723]; +ld.shared.u32 r4512, [r5723+36]; +ld.shared.u32 r5120, [r5723+72]; +ld.shared.u32 r3992, [r5723+108]; +ld.shared.u32 r4600, [r5723+144]; +ld.shared.u32 r5208, [r5723+180]; +ld.shared.u32 r4080, [r5723+216]; +ld.shared.u32 r4688, [r5723+252]; +ld.shared.u32 r5296, [r5723+288]; +ld.shared.u32 r3901, [r5723+324]; +ld.shared.u32 r4509, [r5723+360]; +ld.shared.u32 r5117, [r5723+396]; +ld.shared.u32 r3989, [r5723+432]; +ld.shared.u32 r4597, [r5723+468]; +ld.shared.u32 r5205, [r5723+504]; +ld.shared.u32 r4077, [r5723+540]; +ld.shared.u32 r4685, [r5723+576]; +ld.shared.u32 r5293, [r5723+612]; +ld.shared.u32 r3902, [r5723+648]; 
+ld.shared.u32 r4510, [r5723+684]; +ld.shared.u32 r5118, [r5723+720]; +ld.shared.u32 r3990, [r5723+756]; +ld.shared.u32 r4598, [r5723+792]; +ld.shared.u32 r5206, [r5723+828]; +ld.shared.u32 r4078, [r5723+864]; +ld.shared.u32 r4686, [r5723+900]; +ld.shared.u32 r5294, [r5723+936]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, 
r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, 
r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, 
r4073; +} +{ +add.f16x2 %0, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 %1, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 %18, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 %36, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 %19, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 %37, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 %6, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 %7, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 %24, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 %42, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ 
+sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 %25, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 %43, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 %12, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 %13, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 %30, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 %48, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 %31, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 %49, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, 
r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 
r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; 
+cvt.rn.f16.f32 high, f496; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 %2, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 %3, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 %20, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 %38, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, 
r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 %21, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 %39, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 %8, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 %9, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 %26, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 %44, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 %27, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 %45, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ 
+add.f16x2 %14, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 %15, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 %32, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 %50, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 %33, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 %51, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ 
+sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ 
+add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, 
r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 %4, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 %5, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 %22, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 %40, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 %23, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 %41, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5539, {low, high}; +} +{ 
+neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 %10, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 %11, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 %28, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 %46, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 %29, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 %47, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 %16, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 %17, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 %34, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 %52, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, 
r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 %35, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 %53, r5702, r5708; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), 
"=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<880, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<171>; +.reg .b32 r<2100>; +.reg .b64 rd<6>; +mov.u32 r2079, %tid.y; +mov.u32 r2080, %18; +mad.lo.s32 r2081, r2079, 1944, r2080; +mov.u32 r2082, %tid.x; +mov.f32 f162, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1, {low, high}; +} +mov.f32 f164, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, 
r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r265, {low, high}; +} +mov.f32 f92, 0fBF248DBB; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r266, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r267, {low, high}; +} +mov.f32 f96, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r268, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r271, {low, high}; +} +mov.f32 f104, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, 
r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; 
+cvt.rn.f16.f32 high, f164; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2082, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r2083, rd3; +sub.s32 r2084, r2082, r2083; +shr.u32 r2085, r2084, 1; +add.s32 r2086, r2085, r2083; +shr.u32 r2087, r2086, 4; +mul.lo.s32 r2088, r2087, 27; +sub.s32 r2089, r2082, r2088; +cvt.rn.f32.u32 f165, r2089; +mul.f32 f166, f165, 0f3CD3D17E; +cos.approx.f32 f57, f166; +sin.approx.f32 f167, f166; +neg.f32 f58, f167; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 
r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r2090, r2087, 1944, r2081; +barrier.sync 0; +mad.lo.s32 r2091, r2089, 72, r2090; +st.shared.v2.f32 [r2091], {r352, r358}; +st.shared.v2.f32 [r2091+8], {r621, r628}; +st.shared.v2.f32 
[r2091+16], {r658, r665}; +st.shared.v2.f32 [r2091+24], {r695, r702}; +st.shared.v2.f32 [r2091+32], {r732, r739}; +st.shared.v2.f32 [r2091+40], {r769, r776}; +st.shared.v2.f32 [r2091+48], {r806, r813}; +st.shared.v2.f32 [r2091+56], {r843, r850}; +st.shared.v2.f32 [r2091+64], {r880, r887}; +barrier.sync 0; +shl.b32 r2092, r2089, 6; +sub.s32 r2093, r2091, r2092; +ld.shared.u32 r916, [r2093]; +ld.shared.u32 r922, [r2093+4]; +ld.shared.u32 r1004, [r2093+216]; +ld.shared.u32 r1010, [r2093+220]; +ld.shared.u32 r1092, [r2093+432]; +ld.shared.u32 r1098, [r2093+436]; +ld.shared.u32 r913, [r2093+648]; +ld.shared.u32 r919, [r2093+652]; +ld.shared.u32 r1001, [r2093+864]; +ld.shared.u32 r1007, [r2093+868]; +ld.shared.u32 r1089, [r2093+1080]; +ld.shared.u32 r1095, [r2093+1084]; +ld.shared.u32 r914, [r2093+1296]; +ld.shared.u32 r920, [r2093+1300]; +ld.shared.u32 r1002, [r2093+1512]; +ld.shared.u32 r1008, [r2093+1516]; +ld.shared.u32 r1090, [r2093+1728]; +ld.shared.u32 r1096, [r2093+1732]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, 
r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, 
r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ 
+sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; 
+} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ 
+sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2089, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2094, rd5; +cvt.rn.f32.u32 f168, r2094; +mul.f32 f169, f168, 0f3E6E4BAE; +cos.approx.f32 f133, f169; +sin.approx.f32 f170, f169; +neg.f32 f134, f170; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +mul.lo.s32 r2095, r2094, 9; +sub.s32 r2096, r2089, r2095; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +shl.b32 r2097, r2096, 3; +add.s32 r2098, r2090, r2097; +barrier.sync 0; +mad.lo.s32 r2099, r2094, 648, r2098; +st.shared.u32 [r2099], r1259; +st.shared.u32 [r2099+4], r1265; +st.shared.u32 [r2099+72], r1528; +st.shared.u32 [r2099+76], r1535; +st.shared.u32 [r2099+144], r1565; +st.shared.u32 [r2099+148], r1572; +st.shared.u32 [r2099+216], r1602; +st.shared.u32 [r2099+220], r1609; +st.shared.u32 [r2099+288], r1639; +st.shared.u32 [r2099+292], r1646; +st.shared.u32 [r2099+360], r1676; +st.shared.u32 [r2099+364], r1683; +st.shared.u32 [r2099+432], r1713; +st.shared.u32 [r2099+436], r1720; +st.shared.u32 [r2099+504], r1750; +st.shared.u32 [r2099+508], r1757; +st.shared.u32 [r2099+576], r1787; +st.shared.u32 [r2099+580], r1794; +barrier.sync 0; +ld.shared.u32 r1823, [r2093]; +ld.shared.u32 r1829, [r2093+4]; +ld.shared.u32 r1911, [r2093+216]; +ld.shared.u32 r1917, 
[r2093+220]; +ld.shared.u32 r1999, [r2093+432]; +ld.shared.u32 r2005, [r2093+436]; +ld.shared.u32 r1820, [r2093+648]; +ld.shared.u32 r1826, [r2093+652]; +ld.shared.u32 r1908, [r2093+864]; +ld.shared.u32 r1914, [r2093+868]; +ld.shared.u32 r1996, [r2093+1080]; +ld.shared.u32 r2002, [r2093+1084]; +ld.shared.u32 r1821, [r2093+1296]; +ld.shared.u32 r1827, [r2093+1300]; +ld.shared.u32 r1909, [r2093+1512]; +ld.shared.u32 r1915, [r2093+1516]; +ld.shared.u32 r1997, [r2093+1728]; +ld.shared.u32 r2003, [r2093+1732]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 %0, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 %1, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 %6, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 %12, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 %7, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 %13, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, 
f164; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 %2, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 %3, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 %8, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 %14, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 %9, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 %15, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 %4, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 %5, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 %10, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 %16, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, 
r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 %11, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 %17, r2067, r2073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<881, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<171>; +.reg .b32 r<2100>; +.reg .b64 rd<6>; +mov.u32 r2079, %tid.y; +mov.u32 r2080, %18; +mad.lo.s32 r2081, r2079, 972, r2080; +mov.u32 r2082, %tid.x; +mov.f32 f162, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; 
+cvt.rn.f16.f32 high, f162; +mov.b32 r1, {low, high}; +} +mov.f32 f164, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 
r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r265, {low, high}; +} +mov.f32 f92, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r266, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r267, {low, high}; +} +mov.f32 f96, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, 
f96; +mov.b32 r268, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r271, {low, high}; +} +mov.f32 f104, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ 
+mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ 
+sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2082, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r2083, rd3; +sub.s32 r2084, r2082, r2083; +shr.u32 r2085, r2084, 1; +add.s32 r2086, r2085, r2083; +shr.u32 r2087, r2086, 4; +mul.lo.s32 r2088, r2087, 27; +sub.s32 r2089, r2082, r2088; +mad.lo.s32 r2090, r2087, 972, r2081; +cvt.rn.f32.u32 f165, r2089; +mul.f32 f166, f165, 0f3CD3D17E; +cos.approx.f32 f57, f166; +sin.approx.f32 f167, f166; +neg.f32 f58, f167; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ 
+mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ 
+mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r2091, r2089, 36, r2090; +st.shared.u32 [r2091], r352; +st.shared.u32 [r2091+4], r621; +st.shared.u32 [r2091+8], r658; +st.shared.u32 [r2091+12], r695; +st.shared.u32 [r2091+16], r732; +st.shared.u32 [r2091+20], r769; +st.shared.u32 [r2091+24], r806; +st.shared.u32 [r2091+28], r843; +st.shared.u32 [r2091+32], r880; +barrier.sync 0; +shl.b32 r2092, r2089, 5; +sub.s32 r2093, r2091, r2092; +ld.shared.u32 r916, [r2093]; +ld.shared.u32 r1004, [r2093+108]; 
+ld.shared.u32 r1092, [r2093+216]; +ld.shared.u32 r913, [r2093+324]; +ld.shared.u32 r1001, [r2093+432]; +ld.shared.u32 r1089, [r2093+540]; +ld.shared.u32 r914, [r2093+648]; +ld.shared.u32 r1002, [r2093+756]; +ld.shared.u32 r1090, [r2093+864]; +barrier.sync 0; +st.shared.u32 [r2091], r358; +st.shared.u32 [r2091+4], r628; +st.shared.u32 [r2091+8], r665; +st.shared.u32 [r2091+12], r702; +st.shared.u32 [r2091+16], r739; +st.shared.u32 [r2091+20], r776; +st.shared.u32 [r2091+24], r813; +st.shared.u32 [r2091+28], r850; +st.shared.u32 [r2091+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r2093]; +ld.shared.u32 r1010, [r2093+108]; +ld.shared.u32 r1098, [r2093+216]; +ld.shared.u32 r919, [r2093+324]; +ld.shared.u32 r1007, [r2093+432]; +ld.shared.u32 r1095, [r2093+540]; +ld.shared.u32 r920, [r2093+648]; +ld.shared.u32 r1008, [r2093+756]; +ld.shared.u32 r1096, [r2093+864]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; 
+} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, 
r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; 
+} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ 
+add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ 
+add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2089, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2094, rd5; +mul.lo.s32 r2095, r2094, 9; +sub.s32 r2096, r2089, r2095; +shl.b32 r2097, r2096, 2; +add.s32 r2098, r2090, r2097; +cvt.rn.f32.u32 f168, r2094; +mul.f32 f169, f168, 0f3E6E4BAE; +cos.approx.f32 f133, f169; +sin.approx.f32 f170, f169; +neg.f32 f134, f170; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; 
+mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ 
+mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +barrier.sync 0; +mad.lo.s32 r2099, r2094, 324, r2098; +st.shared.u32 [r2099], r1259; +st.shared.u32 [r2099+36], r1528; +st.shared.u32 [r2099+72], r1565; +st.shared.u32 [r2099+108], r1602; +st.shared.u32 [r2099+144], r1639; +st.shared.u32 [r2099+180], r1676; +st.shared.u32 [r2099+216], r1713; +st.shared.u32 [r2099+252], r1750; +st.shared.u32 [r2099+288], r1787; +barrier.sync 0; +ld.shared.u32 r1823, [r2093]; +ld.shared.u32 r1911, [r2093+108]; +ld.shared.u32 r1999, [r2093+216]; +ld.shared.u32 r1820, [r2093+324]; +ld.shared.u32 r1908, [r2093+432]; +ld.shared.u32 r1996, [r2093+540]; +ld.shared.u32 r1821, [r2093+648]; +ld.shared.u32 r1909, [r2093+756]; +ld.shared.u32 r1997, [r2093+864]; +barrier.sync 0; +st.shared.u32 [r2099], r1265; +st.shared.u32 [r2099+36], r1535; +st.shared.u32 [r2099+72], r1572; +st.shared.u32 [r2099+108], r1609; +st.shared.u32 [r2099+144], r1646; +st.shared.u32 [r2099+180], r1683; +st.shared.u32 
[r2099+216], r1720; +st.shared.u32 [r2099+252], r1757; +st.shared.u32 [r2099+288], r1794; +barrier.sync 0; +ld.shared.u32 r1829, [r2093]; +ld.shared.u32 r1917, [r2093+108]; +ld.shared.u32 r2005, [r2093+216]; +ld.shared.u32 r1826, [r2093+324]; +ld.shared.u32 r1914, [r2093+432]; +ld.shared.u32 r2002, [r2093+540]; +ld.shared.u32 r1827, [r2093+648]; +ld.shared.u32 r1915, [r2093+756]; +ld.shared.u32 r2003, [r2093+864]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 %0, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 %1, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 %6, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 %12, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 %7, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 %13, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, 
r1908, r1909; +} +{ +add.f16x2 %2, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 %3, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 %8, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 %14, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 %9, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 %15, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 %4, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 %5, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 %10, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 %16, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ 
+sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 %11, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 %17, r2067, r2073; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<882, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<65>; +.reg .b32 r<782>; +.reg .b64 rd<10>; +mov.u32 r749, %tid.y; +mov.u32 r750, %6; +mad.lo.s32 r751, r749, 1944, r750; +mov.u32 r752, %tid.x; +mov.f32 f50, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r1, {low, high}; +} +mov.f32 f52, 0fBF5DB3D7; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r752, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r753, rd3; +mul.lo.s32 r754, r753, 81; +sub.s32 r755, r752, r754; +mad.lo.s32 r756, r753, 1944, r751; +cvt.rn.f32.u32 f53, r755; +mul.f32 f54, f53, 0f3CD3D17E; +cos.approx.f32 f5, f54; +sin.approx.f32 f55, f54; +neg.f32 f6, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r757, r755, 24, r756; +st.shared.v2.f32 [r757], {r8, r14}; +st.shared.v2.f32 [r757+8], {r101, r108}; +st.shared.v2.f32 [r757+16], {r138, r145}; +barrier.sync 0; +shl.b32 r758, r755, 4; +sub.s32 r759, r757, r758; +ld.shared.u32 r174, [r759]; +ld.shared.u32 r180, [r759+4]; +ld.shared.u32 r171, [r759+648]; +ld.shared.u32 r177, [r759+652]; +ld.shared.u32 r172, [r759+1296]; +ld.shared.u32 r178, [r759+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ 
+mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r755, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r760, rd5; +mul.lo.s32 r761, r760, 3; +sub.s32 r762, r755, r761; +shl.b32 r763, r762, 3; +add.s32 r764, r756, r763; +cvt.rn.f32.u32 f56, r760; +mul.f32 f57, f56, 0f3D9EDD1F; +cos.approx.f32 f17, f57; +sin.approx.f32 f58, f57; +neg.f32 f18, f58; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r765, r760, 72, r764; +st.shared.u32 [r765], r173; +st.shared.u32 [r765+4], 
r179; +st.shared.u32 [r765+24], r266; +st.shared.u32 [r765+28], r273; +st.shared.u32 [r765+48], r303; +st.shared.u32 [r765+52], r310; +barrier.sync 0; +ld.shared.u32 r339, [r759]; +ld.shared.u32 r345, [r759+4]; +ld.shared.u32 r336, [r759+648]; +ld.shared.u32 r342, [r759+652]; +ld.shared.u32 r337, [r759+1296]; +ld.shared.u32 r343, [r759+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r755, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r766, rd7; +mul.lo.s32 r767, r766, 9; +sub.s32 r768, r755, r767; +shl.b32 r769, r768, 3; +add.s32 r770, r756, r769; +cvt.rn.f32.u32 f59, r766; +mul.f32 f60, f59, 0f3E6E4BAE; +cos.approx.f32 f29, f60; +sin.approx.f32 f61, f60; +neg.f32 f30, f61; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r771, r766, 216, r770; +st.shared.u32 [r771], r338; +st.shared.u32 [r771+4], r344; +st.shared.u32 [r771+72], r431; +st.shared.u32 [r771+76], r438; +st.shared.u32 [r771+144], r468; +st.shared.u32 [r771+148], r475; +barrier.sync 0; +ld.shared.u32 r504, [r759]; +ld.shared.u32 r510, [r759+4]; +ld.shared.u32 r501, [r759+648]; +ld.shared.u32 r507, [r759+652]; +ld.shared.u32 r502, [r759+1296]; +ld.shared.u32 r508, [r759+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ 
+add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r755, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r772, rd9; +sub.s32 r773, r755, r772; +shr.u32 r774, r773, 1; +add.s32 r775, r774, r772; +shr.u32 r776, r775, 4; +mul.lo.s32 r777, r776, 27; +sub.s32 r778, r755, r777; +shl.b32 r779, r778, 3; +add.s32 r780, r756, r779; +cvt.rn.f32.u32 f62, r776; +mul.f32 f63, f62, 0f3F32B8C2; +cos.approx.f32 f41, f63; +sin.approx.f32 f64, f63; +neg.f32 f42, f64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, 
f46; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r781, r776, 648, r780; +st.shared.u32 [r781], r503; +st.shared.u32 [r781+4], r509; +st.shared.u32 [r781+216], r596; +st.shared.u32 [r781+220], r603; +st.shared.u32 [r781+432], r633; +st.shared.u32 [r781+436], r640; +barrier.sync 0; +ld.shared.u32 r669, [r759]; +ld.shared.u32 r675, [r759+4]; +ld.shared.u32 r666, [r759+648]; +ld.shared.u32 r672, [r759+652]; +ld.shared.u32 r667, [r759+1296]; +ld.shared.u32 r673, [r759+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 %0, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 %1, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 %2, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ +mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 %4, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 
r725, r722, r663; +} +{ +sub.f16x2 %3, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 %5, r737, r743; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<883, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<65>; +.reg .b32 r<782>; +.reg .b64 rd<10>; +mov.u32 r749, %tid.y; +mov.u32 r750, %6; +mad.lo.s32 r751, r749, 972, r750; +mov.u32 r752, %tid.x; +mov.f32 f50, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r1, {low, high}; +} +mov.f32 f52, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; 
+} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r752, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r753, rd3; +mul.lo.s32 r754, r753, 81; +sub.s32 r755, r752, r754; +mad.lo.s32 r756, r753, 972, r751; +cvt.rn.f32.u32 f53, r755; +mul.f32 f54, f53, 0f3CD3D17E; +cos.approx.f32 f5, f54; +sin.approx.f32 f55, f54; +neg.f32 f6, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r757, r755, 12, r756; +st.shared.u32 [r757], r8; +st.shared.u32 [r757+4], r101; +st.shared.u32 [r757+8], r138; +barrier.sync 0; +shl.b32 r758, r755, 3; +sub.s32 r759, r757, r758; 
+ld.shared.u32 r174, [r759]; +ld.shared.u32 r171, [r759+324]; +ld.shared.u32 r172, [r759+648]; +barrier.sync 0; +st.shared.u32 [r757], r14; +st.shared.u32 [r757+4], r108; +st.shared.u32 [r757+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r759]; +ld.shared.u32 r177, [r759+324]; +ld.shared.u32 r178, [r759+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r755, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r760, rd5; +mul.lo.s32 r761, r760, 3; +sub.s32 r762, r755, r761; +shl.b32 r763, r762, 2; +add.s32 r764, r756, r763; +cvt.rn.f32.u32 f56, r760; +mul.f32 f57, f56, 0f3D9EDD1F; +cos.approx.f32 f17, f57; +sin.approx.f32 f58, f57; +neg.f32 f18, f58; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r765, r760, 36, r764; +st.shared.u32 [r765], r173; +st.shared.u32 [r765+12], r266; +st.shared.u32 [r765+24], r303; +barrier.sync 0; +ld.shared.u32 r339, [r759]; +ld.shared.u32 r336, [r759+324]; +ld.shared.u32 r337, [r759+648]; +barrier.sync 0; +st.shared.u32 [r765], r179; +st.shared.u32 [r765+12], r273; +st.shared.u32 [r765+24], r310; +barrier.sync 0; +ld.shared.u32 r345, [r759]; +ld.shared.u32 r342, [r759+324]; +ld.shared.u32 r343, [r759+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ 
+add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r755, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r766, rd7; +mul.lo.s32 r767, r766, 9; +sub.s32 r768, r755, r767; +shl.b32 r769, r768, 2; +add.s32 r770, r756, r769; +cvt.rn.f32.u32 f59, r766; +mul.f32 f60, f59, 0f3E6E4BAE; +cos.approx.f32 f29, f60; +sin.approx.f32 f61, f60; +neg.f32 f30, f61; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r771, r766, 108, r770; +st.shared.u32 [r771], r338; +st.shared.u32 [r771+36], r431; +st.shared.u32 [r771+72], r468; +barrier.sync 0; +ld.shared.u32 r504, [r759]; +ld.shared.u32 r501, [r759+324]; +ld.shared.u32 r502, [r759+648]; +barrier.sync 0; +st.shared.u32 [r771], r344; +st.shared.u32 [r771+36], r438; +st.shared.u32 [r771+72], r475; +barrier.sync 0; +ld.shared.u32 r510, [r759]; +ld.shared.u32 r507, [r759+324]; +ld.shared.u32 r508, [r759+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, 
r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r755, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r772, rd9; +sub.s32 r773, r755, r772; +shr.u32 r774, r773, 1; +add.s32 r775, r774, r772; +shr.u32 r776, r775, 4; +mul.lo.s32 r777, r776, 27; +sub.s32 r778, r755, r777; +shl.b32 r779, r778, 2; +add.s32 r780, r756, r779; +cvt.rn.f32.u32 f62, r776; +mul.f32 f63, f62, 0f3F32B8C2; +cos.approx.f32 f41, f63; +sin.approx.f32 f64, f63; +neg.f32 f42, f64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r781, r776, 324, r780; +st.shared.u32 [r781], r503; +st.shared.u32 
[r781+108], r596; +st.shared.u32 [r781+216], r633; +barrier.sync 0; +ld.shared.u32 r669, [r759]; +ld.shared.u32 r666, [r759+324]; +ld.shared.u32 r667, [r759+648]; +barrier.sync 0; +st.shared.u32 [r781], r509; +st.shared.u32 [r781+108], r603; +st.shared.u32 [r781+216], r640; +barrier.sync 0; +ld.shared.u32 r675, [r759]; +ld.shared.u32 r672, [r759+324]; +ld.shared.u32 r673, [r759+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 %0, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 %1, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 %2, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ +mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 %4, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 r725, r722, r663; +} +{ +sub.f16x2 %3, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 %5, r737, r743; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..39574c4dd9914 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp16_inv.hpp.inc @@ -0,0 +1,20799 @@ +#ifndef CUFFTDX_FFT_243_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_243_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1080, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<536>; +.reg .b32 r<5689>; +.reg .b64 rd<4>; +mov.u32 r5625, %54; +mov.u32 r5688, %tid.y; +mad.lo.s32 r5626, r5688, 1944, r5625; +mov.u32 r5627, %tid.x; +mov.f32 f530, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1, {low, high}; +} +mov.f32 f532, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %106, %91; +} +{ +add.f16x2 r6, %61, r3; +} +{ +add.f16x2 r9, %70, %102; +} +{ +add.f16x2 r12, %76, r9; +} +{ +add.f16x2 r15, %106, %91; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %61, r18; +} +{ +sub.f16x2 r24, %70, %102; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %106, %91; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %61, r36; +} +{ +sub.f16x2 r42, %70, %102; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %70, %102; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %76, r54; +} +{ +sub.f16x2 r60, %106, %91; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %70, %102; +} +{ +mul.f16x2 r72, r69, r1; +} +{ 
+add.f16x2 r75, %76, r72; +} +{ +sub.f16x2 r78, %106, %91; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %105, %90; +} +{ +add.f16x2 r92, %60, r89; +} +{ +add.f16x2 r95, %69, %103; +} +{ +add.f16x2 r98, %75, r95; +} +{ +add.f16x2 r101, %105, %90; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %60, r104; +} +{ +sub.f16x2 r110, %69, %103; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %105, %90; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %60, r122; +} +{ +sub.f16x2 r128, %69, %103; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %69, %103; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %75, r140; +} +{ +sub.f16x2 r146, %105, %90; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %69, %103; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %75, r158; +} +{ +sub.f16x2 r164, %105, %90; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %104, %89; +} +{ +add.f16x2 r178, %59, r175; +} +{ +add.f16x2 r181, %67, %101; +} +{ +add.f16x2 r184, %74, r181; +} +{ +add.f16x2 r187, %104, %89; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %59, r190; +} +{ +sub.f16x2 r196, %67, %101; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %104, %89; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %59, r208; +} +{ +sub.f16x2 r214, %67, %101; +} +{ 
+mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %67, %101; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %74, r226; +} +{ +sub.f16x2 r232, %104, %89; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %67, %101; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %74, r244; +} +{ +sub.f16x2 r250, %104, %89; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f490, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r259, {low, high}; +} +mov.f32 f492, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r260, {low, high}; +} +mov.f32 f494, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r261, {low, high}; +} +mov.f32 f496, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r262, {low, high}; +} +mov.f32 f502, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r265, {low, high}; +} +mov.f32 f504, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; 
+} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, 
r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %85, %68; +} +{ +add.f16x2 r602, %88, r599; +} +{ +add.f16x2 r605, %97, %83; +} +{ +add.f16x2 r608, %100, r605; +} +{ +add.f16x2 r611, 
%85, %68; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %88, r614; +} +{ +sub.f16x2 r620, %97, %83; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %85, %68; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %88, r632; +} +{ +sub.f16x2 r638, %97, %83; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %97, %83; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %100, r650; +} +{ +sub.f16x2 r656, %85, %68; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %97, %83; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %100, r668; +} +{ +sub.f16x2 r674, %85, %68; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %84, %66; +} +{ +add.f16x2 r688, %87, r685; +} +{ +add.f16x2 r691, %96, %81; +} +{ +add.f16x2 r694, %99, r691; +} +{ +add.f16x2 r697, %84, %66; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %87, r700; +} +{ +sub.f16x2 r706, %96, %81; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %84, %66; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %87, r718; +} +{ +sub.f16x2 r724, %96, %81; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %96, %81; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %99, r736; +} +{ +sub.f16x2 r742, %84, %66; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %96, %81; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %99, r754; +} +{ +sub.f16x2 r760, %84, %66; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %82, %65; +} +{ +add.f16x2 r774, %86, r771; +} +{ +add.f16x2 r777, %94, %80; +} +{ +add.f16x2 r780, %98, r777; +} +{ +add.f16x2 r783, %82, %65; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %86, r786; +} +{ +sub.f16x2 r792, %94, %80; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %82, %65; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %86, r804; +} +{ +sub.f16x2 r810, %94, %80; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %94, %80; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %98, r822; +} +{ +sub.f16x2 r828, %82, %65; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %94, %80; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %98, r840; +} +{ +sub.f16x2 r846, %82, %65; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, 
r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1022, {low, high}; +} 
+{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; 
+} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %58, %95; +} +{ +add.f16x2 r1198, %64, r1195; +} +{ +add.f16x2 r1201, %73, %56; +} +{ +add.f16x2 r1204, %79, r1201; +} +{ +add.f16x2 r1207, %58, %95; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %64, r1210; +} +{ +sub.f16x2 r1216, %73, %56; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %58, %95; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %64, r1228; +} +{ +sub.f16x2 r1234, %73, %56; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %73, %56; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %79, r1246; +} +{ +sub.f16x2 r1252, %58, %95; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %73, %56; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %79, r1264; +} +{ +sub.f16x2 r1270, %58, %95; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %57, %93; +} +{ +add.f16x2 r1284, %63, r1281; +} +{ +add.f16x2 r1287, %72, %108; +} +{ +add.f16x2 r1290, %78, r1287; +} +{ +add.f16x2 r1293, %57, %93; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, 
%63, r1296; +} +{ +sub.f16x2 r1302, %72, %108; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %57, %93; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %63, r1314; +} +{ +sub.f16x2 r1320, %72, %108; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %72, %108; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %78, r1332; +} +{ +sub.f16x2 r1338, %57, %93; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %72, %108; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %78, r1350; +} +{ +sub.f16x2 r1356, %57, %93; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %55, %92; +} +{ +add.f16x2 r1370, %62, r1367; +} +{ +add.f16x2 r1373, %71, %107; +} +{ +add.f16x2 r1376, %77, r1373; +} +{ +add.f16x2 r1379, %55, %92; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %62, r1382; +} +{ +sub.f16x2 r1388, %71, %107; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %55, %92; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %62, r1400; +} +{ +sub.f16x2 r1406, %71, %107; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %71, %107; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %77, r1418; +} +{ +sub.f16x2 r1424, %55, %92; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %71, %107; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %77, r1436; +} +{ +sub.f16x2 r1442, %55, %92; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 
r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 
r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ 
+mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1789, {low, high}; +} +mov.f32 f172, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1790, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1791, {low, high}; +} +mov.f32 f176, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; 
+mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1794, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1795, {low, high}; +} +mov.f32 f184, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1796, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1797, {low, high}; +} +mov.f32 f188, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1800, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1801, {low, high}; +} +mov.f32 f196, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1802, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1803, {low, high}; +} +mov.f32 f200, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1804, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1807, {low, high}; +} +mov.f32 f208, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1808, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1812, {low, 
high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1815, {low, high}; +} +mov.f32 f224, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1819, {low, high}; +} +mov.f32 f232, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, 
r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} 
+{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ 
+mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ 
+mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} 
+{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ 
+add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r5627, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5628, rd3; +mul.lo.s32 r5629, r5628, 9; +sub.s32 r5630, r5627, r5629; +cvt.rn.f32.u32 f533, r5630; +mul.f32 f534, f533, 0f3CD3D17E; +cos.approx.f32 f309, f534; +sin.approx.f32 f535, f534; +neg.f32 f310, f535; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, 
r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 
r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ 
+mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; 
+mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, 
r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ 
+fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, 
r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, 
r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r5631, r5628, 1944, r5626; +barrier.sync 0; +mad.lo.s32 r5632, r5630, 216, r5631; +st.shared.v2.f32 [r5632], 
{r2102, r2108}; +st.shared.v2.f32 [r5632+8], {r2881, r2890}; +st.shared.v2.f32 [r5632+16], {r2918, r2927}; +st.shared.v2.f32 [r5632+24], {r2955, r2964}; +st.shared.v2.f32 [r5632+32], {r2992, r3001}; +st.shared.v2.f32 [r5632+40], {r3029, r3038}; +st.shared.v2.f32 [r5632+48], {r3066, r3075}; +st.shared.v2.f32 [r5632+56], {r3103, r3112}; +st.shared.v2.f32 [r5632+64], {r3140, r3149}; +st.shared.v2.f32 [r5632+72], {r3177, r3186}; +st.shared.v2.f32 [r5632+80], {r3214, r3223}; +st.shared.v2.f32 [r5632+88], {r3251, r3260}; +st.shared.v2.f32 [r5632+96], {r3288, r3297}; +st.shared.v2.f32 [r5632+104], {r3325, r3334}; +st.shared.v2.f32 [r5632+112], {r3362, r3371}; +st.shared.v2.f32 [r5632+120], {r3399, r3408}; +st.shared.v2.f32 [r5632+128], {r3436, r3445}; +st.shared.v2.f32 [r5632+136], {r3473, r3482}; +st.shared.v2.f32 [r5632+144], {r3510, r3519}; +st.shared.v2.f32 [r5632+152], {r3547, r3556}; +st.shared.v2.f32 [r5632+160], {r3584, r3593}; +st.shared.v2.f32 [r5632+168], {r3621, r3630}; +st.shared.v2.f32 [r5632+176], {r3658, r3667}; +st.shared.v2.f32 [r5632+184], {r3695, r3704}; +st.shared.v2.f32 [r5632+192], {r3732, r3741}; +st.shared.v2.f32 [r5632+200], {r3769, r3778}; +st.shared.v2.f32 [r5632+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r5633, r5630, -208, r5632; +ld.shared.u32 r3842, [r5633]; +ld.shared.u32 r3848, [r5633+4]; +ld.shared.u32 r4438, [r5633+72]; +ld.shared.u32 r4444, [r5633+76]; +ld.shared.u32 r5034, [r5633+144]; +ld.shared.u32 r5040, [r5633+148]; +ld.shared.u32 r3928, [r5633+216]; +ld.shared.u32 r3934, [r5633+220]; +ld.shared.u32 r4524, [r5633+288]; +ld.shared.u32 r4530, [r5633+292]; +ld.shared.u32 r5120, [r5633+360]; +ld.shared.u32 r5126, [r5633+364]; +ld.shared.u32 r4014, [r5633+432]; +ld.shared.u32 r4020, [r5633+436]; +ld.shared.u32 r4610, [r5633+504]; +ld.shared.u32 r4616, [r5633+508]; +ld.shared.u32 r5206, [r5633+576]; +ld.shared.u32 r5212, [r5633+580]; +ld.shared.u32 r3839, [r5633+648]; +ld.shared.u32 r3845, [r5633+652]; +ld.shared.u32 r4435, 
[r5633+720]; +ld.shared.u32 r4441, [r5633+724]; +ld.shared.u32 r5031, [r5633+792]; +ld.shared.u32 r5037, [r5633+796]; +ld.shared.u32 r3925, [r5633+864]; +ld.shared.u32 r3931, [r5633+868]; +ld.shared.u32 r4521, [r5633+936]; +ld.shared.u32 r4527, [r5633+940]; +ld.shared.u32 r5117, [r5633+1008]; +ld.shared.u32 r5123, [r5633+1012]; +ld.shared.u32 r4011, [r5633+1080]; +ld.shared.u32 r4017, [r5633+1084]; +ld.shared.u32 r4607, [r5633+1152]; +ld.shared.u32 r4613, [r5633+1156]; +ld.shared.u32 r5203, [r5633+1224]; +ld.shared.u32 r5209, [r5633+1228]; +ld.shared.u32 r3840, [r5633+1296]; +ld.shared.u32 r3846, [r5633+1300]; +ld.shared.u32 r4436, [r5633+1368]; +ld.shared.u32 r4442, [r5633+1372]; +ld.shared.u32 r5032, [r5633+1440]; +ld.shared.u32 r5038, [r5633+1444]; +ld.shared.u32 r3926, [r5633+1512]; +ld.shared.u32 r3932, [r5633+1516]; +ld.shared.u32 r4522, [r5633+1584]; +ld.shared.u32 r4528, [r5633+1588]; +ld.shared.u32 r5118, [r5633+1656]; +ld.shared.u32 r5124, [r5633+1660]; +ld.shared.u32 r4012, [r5633+1728]; +ld.shared.u32 r4018, [r5633+1732]; +ld.shared.u32 r4608, [r5633+1800]; +ld.shared.u32 r4614, [r5633+1804]; +ld.shared.u32 r5204, [r5633+1872]; +ld.shared.u32 r5210, [r5633+1876]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ 
+sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4009, {low, high}; 
+} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, 
r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 %0, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 %1, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 %18, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 %36, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 %19, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 %37, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 
high, f532; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 %6, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 %7, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 %24, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 %42, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 %25, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 %43, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 %12, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 %13, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 %30, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 %48, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ 
+add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 %31, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 %49, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} 
+{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, 
r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 %2, r4437, 
r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 %3, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 %20, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 %38, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 %21, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 %39, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 %8, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 %9, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 %26, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 %44, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} 
+{ +sub.f16x2 %27, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 %45, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 %14, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 %15, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 %32, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 %50, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 %33, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 %51, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ 
+add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, 
r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 %4, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 %5, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 %22, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 %40, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 %23, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, 
r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 %41, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 %10, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 %11, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 %28, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 %46, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 %29, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 %47, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 %16, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 %17, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, 
r5539; +} +{ +add.f16x2 %34, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 %52, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 %35, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 %53, r5612, r5618; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), 
"=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[24].y)), 
"r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1081, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<536>; +.reg .b32 r<5689>; +.reg .b64 rd<4>; +mov.u32 r5625, %54; +mov.u32 r5688, %tid.y; +mad.lo.s32 r5626, r5688, 972, r5625; +mov.u32 r5627, %tid.x; +mov.f32 f530, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1, {low, high}; +} +mov.f32 f532, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %106, %91; +} +{ +add.f16x2 r6, %61, r3; +} +{ +add.f16x2 r9, %70, %102; +} +{ +add.f16x2 r12, %76, r9; +} +{ +add.f16x2 r15, %106, %91; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %61, r18; +} +{ +sub.f16x2 r24, %70, %102; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %106, %91; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %61, r36; +} +{ +sub.f16x2 r42, %70, %102; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %70, %102; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %76, r54; +} +{ +sub.f16x2 r60, %106, %91; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %70, %102; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %76, r72; +} +{ +sub.f16x2 r78, %106, %91; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %105, %90; +} +{ +add.f16x2 
r92, %60, r89; +} +{ +add.f16x2 r95, %69, %103; +} +{ +add.f16x2 r98, %75, r95; +} +{ +add.f16x2 r101, %105, %90; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %60, r104; +} +{ +sub.f16x2 r110, %69, %103; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %105, %90; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %60, r122; +} +{ +sub.f16x2 r128, %69, %103; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %69, %103; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %75, r140; +} +{ +sub.f16x2 r146, %105, %90; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %69, %103; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %75, r158; +} +{ +sub.f16x2 r164, %105, %90; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %104, %89; +} +{ +add.f16x2 r178, %59, r175; +} +{ +add.f16x2 r181, %67, %101; +} +{ +add.f16x2 r184, %74, r181; +} +{ +add.f16x2 r187, %104, %89; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %59, r190; +} +{ +sub.f16x2 r196, %67, %101; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %104, %89; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %59, r208; +} +{ +sub.f16x2 r214, %67, %101; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %67, %101; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %74, r226; +} +{ +sub.f16x2 r232, %104, %89; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %67, %101; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %74, r244; +} +{ +sub.f16x2 
r250, %104, %89; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f490, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r259, {low, high}; +} +mov.f32 f492, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r260, {low, high}; +} +mov.f32 f494, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r261, {low, high}; +} +mov.f32 f496, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r262, {low, high}; +} +mov.f32 f502, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r265, {low, high}; +} +mov.f32 f504, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, 
r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; 
+} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %85, %68; +} +{ +add.f16x2 r602, %88, r599; +} +{ +add.f16x2 r605, %97, %83; +} +{ +add.f16x2 r608, %100, r605; +} +{ +add.f16x2 r611, %85, %68; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %88, r614; +} +{ +sub.f16x2 r620, %97, %83; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %85, %68; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %88, r632; +} +{ +sub.f16x2 r638, %97, %83; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; 
+} +{ +add.f16x2 r647, %97, %83; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %100, r650; +} +{ +sub.f16x2 r656, %85, %68; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %97, %83; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %100, r668; +} +{ +sub.f16x2 r674, %85, %68; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %84, %66; +} +{ +add.f16x2 r688, %87, r685; +} +{ +add.f16x2 r691, %96, %81; +} +{ +add.f16x2 r694, %99, r691; +} +{ +add.f16x2 r697, %84, %66; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %87, r700; +} +{ +sub.f16x2 r706, %96, %81; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %84, %66; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %87, r718; +} +{ +sub.f16x2 r724, %96, %81; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %96, %81; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %99, r736; +} +{ +sub.f16x2 r742, %84, %66; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %96, %81; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %99, r754; +} +{ +sub.f16x2 r760, %84, %66; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %82, %65; +} +{ +add.f16x2 r774, %86, r771; +} +{ +add.f16x2 r777, %94, %80; +} +{ +add.f16x2 r780, %98, r777; +} +{ +add.f16x2 r783, %82, %65; +} +{ +mul.f16x2 r786, 
r783, r769; +} +{ +add.f16x2 r789, %86, r786; +} +{ +sub.f16x2 r792, %94, %80; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %82, %65; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %86, r804; +} +{ +sub.f16x2 r810, %94, %80; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %94, %80; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %98, r822; +} +{ +sub.f16x2 r828, %82, %65; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %94, %80; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %98, r840; +} +{ +sub.f16x2 r846, %82, %65; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 
r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, 
r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 
r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %58, %95; +} +{ +add.f16x2 r1198, %64, r1195; +} +{ +add.f16x2 r1201, %73, %56; +} +{ +add.f16x2 r1204, %79, r1201; +} +{ +add.f16x2 r1207, %58, %95; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %64, r1210; +} +{ +sub.f16x2 r1216, %73, %56; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %58, %95; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %64, r1228; +} +{ +sub.f16x2 r1234, %73, %56; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %73, %56; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %79, r1246; +} +{ +sub.f16x2 r1252, %58, %95; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %73, %56; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %79, r1264; +} +{ +sub.f16x2 r1270, %58, %95; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %57, %93; +} +{ +add.f16x2 r1284, %63, r1281; +} +{ +add.f16x2 r1287, %72, %108; +} +{ +add.f16x2 r1290, %78, r1287; +} +{ +add.f16x2 r1293, %57, %93; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %63, r1296; +} +{ +sub.f16x2 r1302, %72, %108; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %57, %93; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %63, r1314; +} +{ +sub.f16x2 r1320, %72, %108; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %72, %108; +} +{ +mul.f16x2 
r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %78, r1332; +} +{ +sub.f16x2 r1338, %57, %93; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %72, %108; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %78, r1350; +} +{ +sub.f16x2 r1356, %57, %93; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %55, %92; +} +{ +add.f16x2 r1370, %62, r1367; +} +{ +add.f16x2 r1373, %71, %107; +} +{ +add.f16x2 r1376, %77, r1373; +} +{ +add.f16x2 r1379, %55, %92; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %62, r1382; +} +{ +sub.f16x2 r1388, %71, %107; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %55, %92; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %62, r1400; +} +{ +sub.f16x2 r1406, %71, %107; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %71, %107; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %77, r1418; +} +{ +sub.f16x2 r1424, %55, %92; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %71, %107; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %77, r1436; +} +{ +sub.f16x2 r1442, %55, %92; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 
r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ 
+add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1789, {low, high}; +} +mov.f32 f172, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1790, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1791, {low, high}; +} +mov.f32 f176, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r1794, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1795, {low, high}; +} +mov.f32 f184, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 
high, f184; +mov.b32 r1796, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1797, {low, high}; +} +mov.f32 f188, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r1800, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1801, {low, high}; +} +mov.f32 f196, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1802, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1803, {low, high}; +} +mov.f32 f200, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1804, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1807, {low, high}; +} +mov.f32 f208, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1808, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r1812, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1815, {low, high}; +} +mov.f32 f224, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 
r1819, {low, high}; +} +mov.f32 f232, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} 
+{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ 
+mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; 
+mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ 
+sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2528, {low, high}; +} +{ 
+add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ 
+sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ 
+add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r5627, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5628, rd3; +mul.lo.s32 r5629, r5628, 9; +sub.s32 r5630, r5627, r5629; +mad.lo.s32 r5631, r5628, 972, r5626; +cvt.rn.f32.u32 f533, r5630; +mul.f32 f534, f533, 0f3CD3D17E; +cos.approx.f32 f309, f534; +sin.approx.f32 f535, f534; +neg.f32 f310, f535; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; 
+cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, 
r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, 
r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, 
high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, 
r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ 
+fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r5632, r5630, 108, r5631; +st.shared.u32 [r5632], r2102; +st.shared.u32 [r5632+4], r2881; +st.shared.u32 [r5632+8], r2918; +st.shared.u32 [r5632+12], r2955; +st.shared.u32 [r5632+16], r2992; +st.shared.u32 [r5632+20], r3029; +st.shared.u32 [r5632+24], r3066; +st.shared.u32 [r5632+28], r3103; +st.shared.u32 [r5632+32], r3140; +st.shared.u32 [r5632+36], r3177; +st.shared.u32 [r5632+40], r3214; +st.shared.u32 [r5632+44], r3251; +st.shared.u32 
[r5632+48], r3288; +st.shared.u32 [r5632+52], r3325; +st.shared.u32 [r5632+56], r3362; +st.shared.u32 [r5632+60], r3399; +st.shared.u32 [r5632+64], r3436; +st.shared.u32 [r5632+68], r3473; +st.shared.u32 [r5632+72], r3510; +st.shared.u32 [r5632+76], r3547; +st.shared.u32 [r5632+80], r3584; +st.shared.u32 [r5632+84], r3621; +st.shared.u32 [r5632+88], r3658; +st.shared.u32 [r5632+92], r3695; +st.shared.u32 [r5632+96], r3732; +st.shared.u32 [r5632+100], r3769; +st.shared.u32 [r5632+104], r3806; +barrier.sync 0; +mad.lo.s32 r5633, r5630, -104, r5632; +ld.shared.u32 r3842, [r5633]; +ld.shared.u32 r4438, [r5633+36]; +ld.shared.u32 r5034, [r5633+72]; +ld.shared.u32 r3928, [r5633+108]; +ld.shared.u32 r4524, [r5633+144]; +ld.shared.u32 r5120, [r5633+180]; +ld.shared.u32 r4014, [r5633+216]; +ld.shared.u32 r4610, [r5633+252]; +ld.shared.u32 r5206, [r5633+288]; +ld.shared.u32 r3839, [r5633+324]; +ld.shared.u32 r4435, [r5633+360]; +ld.shared.u32 r5031, [r5633+396]; +ld.shared.u32 r3925, [r5633+432]; +ld.shared.u32 r4521, [r5633+468]; +ld.shared.u32 r5117, [r5633+504]; +ld.shared.u32 r4011, [r5633+540]; +ld.shared.u32 r4607, [r5633+576]; +ld.shared.u32 r5203, [r5633+612]; +ld.shared.u32 r3840, [r5633+648]; +ld.shared.u32 r4436, [r5633+684]; +ld.shared.u32 r5032, [r5633+720]; +ld.shared.u32 r3926, [r5633+756]; +ld.shared.u32 r4522, [r5633+792]; +ld.shared.u32 r5118, [r5633+828]; +ld.shared.u32 r4012, [r5633+864]; +ld.shared.u32 r4608, [r5633+900]; +ld.shared.u32 r5204, [r5633+936]; +barrier.sync 0; +st.shared.u32 [r5632], r2108; +st.shared.u32 [r5632+4], r2890; +st.shared.u32 [r5632+8], r2927; +st.shared.u32 [r5632+12], r2964; +st.shared.u32 [r5632+16], r3001; +st.shared.u32 [r5632+20], r3038; +st.shared.u32 [r5632+24], r3075; +st.shared.u32 [r5632+28], r3112; +st.shared.u32 [r5632+32], r3149; +st.shared.u32 [r5632+36], r3186; +st.shared.u32 [r5632+40], r3223; +st.shared.u32 [r5632+44], r3260; +st.shared.u32 [r5632+48], r3297; +st.shared.u32 [r5632+52], r3334; +st.shared.u32 
[r5632+56], r3371; +st.shared.u32 [r5632+60], r3408; +st.shared.u32 [r5632+64], r3445; +st.shared.u32 [r5632+68], r3482; +st.shared.u32 [r5632+72], r3519; +st.shared.u32 [r5632+76], r3556; +st.shared.u32 [r5632+80], r3593; +st.shared.u32 [r5632+84], r3630; +st.shared.u32 [r5632+88], r3667; +st.shared.u32 [r5632+92], r3704; +st.shared.u32 [r5632+96], r3741; +st.shared.u32 [r5632+100], r3778; +st.shared.u32 [r5632+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r5633]; +ld.shared.u32 r4444, [r5633+36]; +ld.shared.u32 r5040, [r5633+72]; +ld.shared.u32 r3934, [r5633+108]; +ld.shared.u32 r4530, [r5633+144]; +ld.shared.u32 r5126, [r5633+180]; +ld.shared.u32 r4020, [r5633+216]; +ld.shared.u32 r4616, [r5633+252]; +ld.shared.u32 r5212, [r5633+288]; +ld.shared.u32 r3845, [r5633+324]; +ld.shared.u32 r4441, [r5633+360]; +ld.shared.u32 r5037, [r5633+396]; +ld.shared.u32 r3931, [r5633+432]; +ld.shared.u32 r4527, [r5633+468]; +ld.shared.u32 r5123, [r5633+504]; +ld.shared.u32 r4017, [r5633+540]; +ld.shared.u32 r4613, [r5633+576]; +ld.shared.u32 r5209, [r5633+612]; +ld.shared.u32 r3846, [r5633+648]; +ld.shared.u32 r4442, [r5633+684]; +ld.shared.u32 r5038, [r5633+720]; +ld.shared.u32 r3932, [r5633+756]; +ld.shared.u32 r4528, [r5633+792]; +ld.shared.u32 r5124, [r5633+828]; +ld.shared.u32 r4018, [r5633+864]; +ld.shared.u32 r4614, [r5633+900]; +ld.shared.u32 r5210, [r5633+936]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ 
+add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, 
r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 %0, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 %1, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 %18, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 %36, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 %19, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 %37, 
r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 %6, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 %7, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 %24, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 %42, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 %25, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 %43, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 %12, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 %13, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 %30, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; 
+} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 %48, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 %31, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 %49, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 
r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; 
+mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 %2, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 %3, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 %20, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 %38, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 %21, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 %39, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 %8, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 %9, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 %26, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 %44, 
r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 %27, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 %45, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 %14, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 %15, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 %32, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 %50, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 %33, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 %51, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, 
r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 
r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f490; +cvt.rn.f16.f32 high, f490; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f492; +cvt.rn.f16.f32 high, f492; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f494; +cvt.rn.f16.f32 high, f494; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f496; +cvt.rn.f16.f32 high, f496; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f502; +cvt.rn.f16.f32 high, f502; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f504; +cvt.rn.f16.f32 high, f504; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 %4, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 %5, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 %22, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 %40, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ 
+add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 %23, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 %41, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 %10, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 %11, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 %28, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 %46, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 %29, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 %47, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f530; +cvt.rn.f16.f32 high, f530; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f532; +cvt.rn.f16.f32 high, f532; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 %16, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 
%17, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 %34, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 %52, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 %35, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 %53, r5612, r5618; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), 
"=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), 
"r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1082, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<171>; +.reg .b32 r<2070>; +.reg .b64 rd<6>; +mov.u32 r2049, %tid.y; +mov.u32 r2050, %18; +mad.lo.s32 r2051, r2049, 1944, r2050; +mov.u32 r2052, %tid.x; +mov.f32 f162, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1, {low, high}; +} +mov.f32 f164, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; 
+} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r259, {low, high}; +} +mov.f32 f92, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r260, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r261, {low, high}; +} +mov.f32 f96, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r262, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r265, {low, high}; +} +mov.f32 f104, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} 
+{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2052, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r2053, rd3; +sub.s32 r2054, r2052, r2053; +shr.u32 r2055, r2054, 1; +add.s32 r2056, r2055, r2053; +shr.u32 r2057, r2056, 4; +mul.lo.s32 r2058, r2057, 27; +sub.s32 r2059, r2052, r2058; +cvt.rn.f32.u32 f165, r2059; +mul.f32 f166, f165, 0f3CD3D17E; +cos.approx.f32 f57, f166; +sin.approx.f32 f167, f166; +neg.f32 f58, f167; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; 
+mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, 
r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} 
+{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ 
+mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r2060, r2057, 1944, r2051; +barrier.sync 0; +mad.lo.s32 r2061, r2059, 72, r2060; +st.shared.v2.f32 [r2061], {r344, r350}; +st.shared.v2.f32 [r2061+8], {r607, r616}; +st.shared.v2.f32 [r2061+16], {r644, r653}; +st.shared.v2.f32 [r2061+24], {r681, r690}; +st.shared.v2.f32 [r2061+32], {r718, r727}; +st.shared.v2.f32 [r2061+40], {r755, r764}; +st.shared.v2.f32 [r2061+48], {r792, r801}; +st.shared.v2.f32 [r2061+56], {r829, r838}; +st.shared.v2.f32 [r2061+64], {r866, r875}; +barrier.sync 0; +shl.b32 r2062, r2059, 6; +sub.s32 r2063, r2061, r2062; +ld.shared.u32 r902, [r2063]; +ld.shared.u32 r908, [r2063+4]; +ld.shared.u32 r988, [r2063+216]; +ld.shared.u32 r994, [r2063+220]; +ld.shared.u32 r1074, [r2063+432]; +ld.shared.u32 r1080, [r2063+436]; +ld.shared.u32 r899, [r2063+648]; +ld.shared.u32 r905, [r2063+652]; +ld.shared.u32 r985, [r2063+864]; +ld.shared.u32 r991, [r2063+868]; +ld.shared.u32 r1071, [r2063+1080]; +ld.shared.u32 r1077, [r2063+1084]; +ld.shared.u32 r900, [r2063+1296]; +ld.shared.u32 r906, [r2063+1300]; +ld.shared.u32 r986, [r2063+1512]; +ld.shared.u32 r992, [r2063+1516]; +ld.shared.u32 r1072, [r2063+1728]; +ld.shared.u32 r1078, [r2063+1732]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, 
r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, 
f164; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, 
r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ 
+mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2059, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2064, rd5; +cvt.rn.f32.u32 f168, r2064; +mul.f32 f169, f168, 0f3E6E4BAE; +cos.approx.f32 f133, f169; +sin.approx.f32 f170, f169; +neg.f32 f134, f170; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +mul.lo.s32 r2065, r2064, 9; +sub.s32 r2066, r2059, r2065; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, 
r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ 
+fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +shl.b32 r2067, r2066, 3; +add.s32 r2068, r2060, r2067; +barrier.sync 0; +mad.lo.s32 r2069, r2064, 648, r2068; +st.shared.u32 [r2069], r1239; +st.shared.u32 [r2069+4], r1245; +st.shared.u32 [r2069+72], r1502; +st.shared.u32 [r2069+76], r1511; +st.shared.u32 [r2069+144], r1539; +st.shared.u32 [r2069+148], r1548; +st.shared.u32 [r2069+216], r1576; +st.shared.u32 [r2069+220], r1585; +st.shared.u32 [r2069+288], r1613; +st.shared.u32 [r2069+292], r1622; +st.shared.u32 [r2069+360], r1650; +st.shared.u32 [r2069+364], r1659; +st.shared.u32 [r2069+432], r1687; +st.shared.u32 [r2069+436], r1696; +st.shared.u32 [r2069+504], r1724; +st.shared.u32 [r2069+508], r1733; +st.shared.u32 [r2069+576], r1761; 
+st.shared.u32 [r2069+580], r1770; +barrier.sync 0; +ld.shared.u32 r1797, [r2063]; +ld.shared.u32 r1803, [r2063+4]; +ld.shared.u32 r1883, [r2063+216]; +ld.shared.u32 r1889, [r2063+220]; +ld.shared.u32 r1969, [r2063+432]; +ld.shared.u32 r1975, [r2063+436]; +ld.shared.u32 r1794, [r2063+648]; +ld.shared.u32 r1800, [r2063+652]; +ld.shared.u32 r1880, [r2063+864]; +ld.shared.u32 r1886, [r2063+868]; +ld.shared.u32 r1966, [r2063+1080]; +ld.shared.u32 r1972, [r2063+1084]; +ld.shared.u32 r1795, [r2063+1296]; +ld.shared.u32 r1801, [r2063+1300]; +ld.shared.u32 r1881, [r2063+1512]; +ld.shared.u32 r1887, [r2063+1516]; +ld.shared.u32 r1967, [r2063+1728]; +ld.shared.u32 r1973, [r2063+1732]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 %0, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 %1, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 %6, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 %12, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 %7, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 %13, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 %2, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 %3, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 %8, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 %14, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 %9, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 %15, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 %4, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 %5, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 %10, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, 
r2004, r1964; +} +{ +sub.f16x2 %16, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 %11, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 %17, r2037, r2043; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1083, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<171>; +.reg .b32 r<2070>; +.reg .b64 rd<6>; +mov.u32 r2049, %tid.y; +mov.u32 r2050, %18; +mad.lo.s32 r2051, r2049, 972, r2050; +mov.u32 r2052, 
%tid.x; +mov.f32 f162, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1, {low, high}; +} +mov.f32 f164, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 
r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r259, {low, high}; +} +mov.f32 f92, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r260, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r261, {low, high}; +} +mov.f32 f96, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 
high, f96; +mov.b32 r262, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r265, {low, high}; +} +mov.f32 f104, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ 
+sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, 
r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2052, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r2053, rd3; +sub.s32 r2054, r2052, r2053; +shr.u32 r2055, r2054, 1; +add.s32 r2056, r2055, r2053; +shr.u32 r2057, r2056, 4; +mul.lo.s32 r2058, r2057, 27; +sub.s32 r2059, r2052, r2058; +mad.lo.s32 r2060, r2057, 972, r2051; +cvt.rn.f32.u32 f165, r2059; +mul.f32 f166, f165, 0f3CD3D17E; +cos.approx.f32 f57, f166; +sin.approx.f32 f167, f166; +neg.f32 f58, f167; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 
r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, 
r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; 
+mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; +mad.lo.s32 r2061, r2059, 36, r2060; +st.shared.u32 [r2061], r344; +st.shared.u32 [r2061+4], r607; +st.shared.u32 [r2061+8], r644; +st.shared.u32 [r2061+12], r681; +st.shared.u32 [r2061+16], r718; +st.shared.u32 [r2061+20], r755; +st.shared.u32 [r2061+24], r792; +st.shared.u32 [r2061+28], r829; +st.shared.u32 [r2061+32], r866; +barrier.sync 0; +shl.b32 r2062, r2059, 5; +sub.s32 r2063, r2061, r2062; +ld.shared.u32 r902, [r2063]; +ld.shared.u32 r988, [r2063+108]; +ld.shared.u32 r1074, [r2063+216]; +ld.shared.u32 r899, [r2063+324]; +ld.shared.u32 r985, 
[r2063+432]; +ld.shared.u32 r1071, [r2063+540]; +ld.shared.u32 r900, [r2063+648]; +ld.shared.u32 r986, [r2063+756]; +ld.shared.u32 r1072, [r2063+864]; +barrier.sync 0; +st.shared.u32 [r2061], r350; +st.shared.u32 [r2061+4], r616; +st.shared.u32 [r2061+8], r653; +st.shared.u32 [r2061+12], r690; +st.shared.u32 [r2061+16], r727; +st.shared.u32 [r2061+20], r764; +st.shared.u32 [r2061+24], r801; +st.shared.u32 [r2061+28], r838; +st.shared.u32 [r2061+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r2063]; +ld.shared.u32 r994, [r2063+108]; +ld.shared.u32 r1080, [r2063+216]; +ld.shared.u32 r905, [r2063+324]; +ld.shared.u32 r991, [r2063+432]; +ld.shared.u32 r1077, [r2063+540]; +ld.shared.u32 r906, [r2063+648]; +ld.shared.u32 r992, [r2063+756]; +ld.shared.u32 r1078, [r2063+864]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 
r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, 
r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2059, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2064, rd5; +mul.lo.s32 r2065, r2064, 9; +sub.s32 r2066, r2059, r2065; +shl.b32 r2067, r2066, 2; 
+add.s32 r2068, r2060, r2067; +cvt.rn.f32.u32 f168, r2064; +mul.f32 f169, f168, 0f3E6E4BAE; +cos.approx.f32 f133, f169; +sin.approx.f32 f170, f169; +neg.f32 f134, f170; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, 
r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, 
r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +barrier.sync 0; +mad.lo.s32 r2069, r2064, 324, r2068; +st.shared.u32 [r2069], r1239; +st.shared.u32 [r2069+36], r1502; +st.shared.u32 [r2069+72], r1539; +st.shared.u32 [r2069+108], r1576; +st.shared.u32 [r2069+144], r1613; +st.shared.u32 [r2069+180], r1650; +st.shared.u32 [r2069+216], r1687; +st.shared.u32 [r2069+252], r1724; +st.shared.u32 [r2069+288], r1761; +barrier.sync 0; +ld.shared.u32 r1797, [r2063]; +ld.shared.u32 r1883, [r2063+108]; +ld.shared.u32 r1969, [r2063+216]; +ld.shared.u32 r1794, [r2063+324]; +ld.shared.u32 r1880, [r2063+432]; +ld.shared.u32 r1966, [r2063+540]; +ld.shared.u32 r1795, [r2063+648]; +ld.shared.u32 r1881, [r2063+756]; +ld.shared.u32 r1967, [r2063+864]; +barrier.sync 0; +st.shared.u32 [r2069], r1245; +st.shared.u32 [r2069+36], r1511; +st.shared.u32 [r2069+72], r1548; +st.shared.u32 [r2069+108], r1585; +st.shared.u32 [r2069+144], r1622; +st.shared.u32 [r2069+180], r1659; +st.shared.u32 [r2069+216], r1696; +st.shared.u32 [r2069+252], r1733; +st.shared.u32 [r2069+288], r1770; +barrier.sync 0; +ld.shared.u32 r1803, [r2063]; +ld.shared.u32 r1889, [r2063+108]; +ld.shared.u32 r1975, [r2063+216]; +ld.shared.u32 r1800, [r2063+324]; +ld.shared.u32 r1886, [r2063+432]; +ld.shared.u32 r1972, [r2063+540]; 
+ld.shared.u32 r1801, [r2063+648]; +ld.shared.u32 r1887, [r2063+756]; +ld.shared.u32 r1973, [r2063+864]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 %0, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 %1, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 %6, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 %12, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 %7, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 %13, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 %2, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 %3, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 %8, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, 
r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 %14, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 %9, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 %15, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f162; +cvt.rn.f16.f32 high, f162; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f164; +cvt.rn.f16.f32 high, f164; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 %4, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 %5, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 %10, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 %16, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 %11, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 %17, r2037, r2043; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1084, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<65>; +.reg .b32 r<772>; +.reg .b64 rd<10>; +mov.u32 r739, %tid.y; +mov.u32 r740, %6; +mad.lo.s32 r741, r739, 1944, r740; +mov.u32 r742, %tid.x; +mov.f32 f50, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r1, {low, high}; +} +mov.f32 f52, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ 
+add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r742, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r743, rd3; +mul.lo.s32 r744, r743, 81; +sub.s32 r745, r742, r744; +mad.lo.s32 r746, r743, 1944, r741; +cvt.rn.f32.u32 f53, r745; +mul.f32 f54, f53, 0f3CD3D17E; +cos.approx.f32 f5, f54; +sin.approx.f32 f55, f54; +neg.f32 f6, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} 
+{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r747, r745, 24, r746; +st.shared.v2.f32 [r747], {r6, r12}; +st.shared.v2.f32 [r747+8], {r97, r106}; +st.shared.v2.f32 [r747+16], {r134, r143}; +barrier.sync 0; +shl.b32 r748, r745, 4; +sub.s32 r749, r747, r748; +ld.shared.u32 r170, [r749]; +ld.shared.u32 r176, [r749+4]; +ld.shared.u32 r167, [r749+648]; +ld.shared.u32 r173, [r749+652]; +ld.shared.u32 r168, [r749+1296]; +ld.shared.u32 r174, [r749+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r745, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r750, rd5; +mul.lo.s32 r751, r750, 3; +sub.s32 r752, r745, r751; +shl.b32 r753, r752, 3; +add.s32 r754, r746, 
r753; +cvt.rn.f32.u32 f56, r750; +mul.f32 f57, f56, 0f3D9EDD1F; +cos.approx.f32 f17, f57; +sin.approx.f32 f58, f57; +neg.f32 f18, f58; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r755, r750, 72, r754; +st.shared.u32 [r755], r169; +st.shared.u32 [r755+4], r175; +st.shared.u32 [r755+24], r260; +st.shared.u32 [r755+28], r269; +st.shared.u32 [r755+48], r297; +st.shared.u32 [r755+52], r306; +barrier.sync 0; +ld.shared.u32 r333, [r749]; +ld.shared.u32 r339, [r749+4]; +ld.shared.u32 r330, [r749+648]; +ld.shared.u32 r336, [r749+652]; +ld.shared.u32 r331, [r749+1296]; +ld.shared.u32 r337, [r749+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r327, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r745, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r756, rd7; +mul.lo.s32 r757, r756, 9; +sub.s32 r758, r745, r757; +shl.b32 r759, r758, 3; +add.s32 r760, r746, r759; +cvt.rn.f32.u32 f59, r756; +mul.f32 f60, f59, 0f3E6E4BAE; +cos.approx.f32 f29, f60; +sin.approx.f32 f61, f60; +neg.f32 f30, f61; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; 
+mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r761, r756, 216, r760; +st.shared.u32 [r761], r332; +st.shared.u32 [r761+4], r338; +st.shared.u32 [r761+72], r423; +st.shared.u32 [r761+76], r432; +st.shared.u32 [r761+144], r460; +st.shared.u32 [r761+148], r469; +barrier.sync 0; +ld.shared.u32 r496, [r749]; +ld.shared.u32 r502, [r749+4]; +ld.shared.u32 r493, [r749+648]; +ld.shared.u32 r499, [r749+652]; +ld.shared.u32 r494, [r749+1296]; +ld.shared.u32 r500, [r749+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ 
+add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r745, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r762, rd9; +sub.s32 r763, r745, r762; +shr.u32 r764, r763, 1; +add.s32 r765, r764, r762; +shr.u32 r766, r765, 4; +mul.lo.s32 r767, r766, 27; +sub.s32 r768, r745, r767; +shl.b32 r769, r768, 3; +add.s32 r770, r746, r769; +cvt.rn.f32.u32 f62, r766; +mul.f32 f63, f62, 0f3F32B8C2; +cos.approx.f32 f41, f63; +sin.approx.f32 f64, f63; +neg.f32 f42, f64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ 
+neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r771, r766, 648, r770; +st.shared.u32 [r771], r495; +st.shared.u32 [r771+4], r501; +st.shared.u32 [r771+216], r586; +st.shared.u32 [r771+220], r595; +st.shared.u32 [r771+432], r623; +st.shared.u32 [r771+436], r632; +barrier.sync 0; +ld.shared.u32 r659, [r749]; +ld.shared.u32 r665, [r749+4]; +ld.shared.u32 r656, [r749+648]; +ld.shared.u32 r662, [r749+652]; +ld.shared.u32 r657, [r749+1296]; +ld.shared.u32 r663, [r749+1300]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 %0, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 %1, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 %2, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 %4, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 %3, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; +} +{ +add.f16x2 %5, r727, r733; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1085, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<65>; +.reg .b32 r<772>; +.reg .b64 rd<10>; +mov.u32 r739, %tid.y; +mov.u32 r740, %6; +mad.lo.s32 r741, r739, 972, r740; +mov.u32 r742, %tid.x; +mov.f32 f50, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r1, {low, high}; +} +mov.f32 f52, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r742, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r743, rd3; +mul.lo.s32 r744, r743, 81; +sub.s32 r745, r742, r744; +mad.lo.s32 r746, r743, 972, r741; +cvt.rn.f32.u32 f53, r745; +mul.f32 f54, f53, 0f3CD3D17E; +cos.approx.f32 f5, f54; +sin.approx.f32 f55, f54; +neg.f32 f6, f55; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, 
high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f45, 0fBF800000; +mov.f32 f46, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r747, r745, 12, r746; +st.shared.u32 [r747], r6; +st.shared.u32 [r747+4], r97; +st.shared.u32 [r747+8], r134; +barrier.sync 0; +shl.b32 r748, r745, 3; +sub.s32 r749, r747, r748; +ld.shared.u32 r170, [r749]; +ld.shared.u32 r167, [r749+324]; +ld.shared.u32 r168, [r749+648]; +barrier.sync 0; +st.shared.u32 [r747], r12; +st.shared.u32 [r747+4], r106; +st.shared.u32 [r747+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r749]; +ld.shared.u32 r173, [r749+324]; +ld.shared.u32 r174, [r749+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} 
+{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r745, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r750, rd5; +mul.lo.s32 r751, r750, 3; +sub.s32 r752, r745, r751; +shl.b32 r753, r752, 2; +add.s32 r754, r746, r753; +cvt.rn.f32.u32 f56, r750; +mul.f32 f57, f56, 0f3D9EDD1F; +cos.approx.f32 f17, f57; +sin.approx.f32 f58, f57; +neg.f32 f18, f58; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r277, {low, 
high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r755, r750, 36, r754; +st.shared.u32 [r755], r169; +st.shared.u32 [r755+12], r260; +st.shared.u32 [r755+24], r297; +barrier.sync 0; +ld.shared.u32 r333, [r749]; +ld.shared.u32 r330, [r749+324]; +ld.shared.u32 r331, [r749+648]; +barrier.sync 0; +st.shared.u32 [r755], r175; +st.shared.u32 [r755+12], r269; +st.shared.u32 [r755+24], r306; +barrier.sync 0; +ld.shared.u32 r339, [r749]; +ld.shared.u32 r336, [r749+324]; +ld.shared.u32 r337, [r749+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ 
+sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r745, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r756, rd7; +mul.lo.s32 r757, r756, 9; +sub.s32 r758, r745, r757; +shl.b32 r759, r758, 2; +add.s32 r760, r746, r759; +cvt.rn.f32.u32 f59, r756; +mul.f32 f60, f59, 0f3E6E4BAE; +cos.approx.f32 f29, f60; +sin.approx.f32 f61, f60; +neg.f32 f30, f61; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r761, r756, 108, r760; +st.shared.u32 [r761], r332; +st.shared.u32 [r761+36], r423; +st.shared.u32 [r761+72], r460; 
+barrier.sync 0; +ld.shared.u32 r496, [r749]; +ld.shared.u32 r493, [r749+324]; +ld.shared.u32 r494, [r749+648]; +barrier.sync 0; +st.shared.u32 [r761], r338; +st.shared.u32 [r761+36], r432; +st.shared.u32 [r761+72], r469; +barrier.sync 0; +ld.shared.u32 r502, [r749]; +ld.shared.u32 r499, [r749+324]; +ld.shared.u32 r500, [r749+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r745, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r762, rd9; +sub.s32 r763, r745, r762; +shr.u32 r764, r763, 1; +add.s32 r765, r764, r762; +shr.u32 r766, r765, 4; +mul.lo.s32 r767, r766, 27; +sub.s32 r768, r745, r767; +shl.b32 r769, r768, 2; +add.s32 r770, r746, r769; +cvt.rn.f32.u32 f62, r766; +mul.f32 f63, f62, 0f3F32B8C2; +cos.approx.f32 f41, f63; +sin.approx.f32 f64, f63; +neg.f32 f42, f64; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f45; +cvt.rn.f16.f32 high, f46; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ +neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r771, r766, 324, r770; +st.shared.u32 [r771], r495; +st.shared.u32 [r771+108], r586; +st.shared.u32 [r771+216], r623; +barrier.sync 0; +ld.shared.u32 r659, [r749]; +ld.shared.u32 r656, [r749+324]; +ld.shared.u32 r657, [r749+648]; +barrier.sync 0; +st.shared.u32 [r771], r501; +st.shared.u32 [r771+108], r595; +st.shared.u32 [r771+216], r632; +barrier.sync 0; +ld.shared.u32 r665, [r749]; +ld.shared.u32 r662, [r749+324]; +ld.shared.u32 r663, [r749+648]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 
%0, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 %1, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 %2, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 %4, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 %3, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; +} +{ +add.f16x2 %5, r727, r733; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..b6e9f66cb4e43 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_fwd.hpp.inc @@ -0,0 +1,4176 @@ +#ifndef CUFFTDX_FFT_243_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_243_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<132, float, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1626>; +.reg .b32 r<14>; +.reg .b64 rd<11>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1944, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1625, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1624, %57, f1625; +mul.f32 f119, f1625, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1623, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1622, %63, f1623; +mul.f32 f135, f1623, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1621, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1620, %69, f1621; +mul.f32 f151, f1621, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f1619, f133, 0f3F441B7D; +sub.f32 f159, f1619, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1617, f149, 0f3E31D0D4; +mul.f32 f1618, f155, 0fBF7C1C5C; +sub.f32 f164, f1617, f1618; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1615, f134, 0f3E31D0D4; +mul.f32 f1616, f140, 0fBF7C1C5C; +sub.f32 f169, f1615, f1616; +mul.f32 
f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1613, f150, 0fBF708FB2; +mul.f32 f1614, f156, 0fBEAF1D44; +sub.f32 f174, f1613, f1614; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1612, f1622, f1620; +sub.f32 f183, f1622, f1620; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1611, f1624, f1612; +mul.f32 f187, f1612, 0f3F000000; +sub.f32 f188, f1624, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1610, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1609, f123, f1610; +mul.f32 f203, f1610, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1608, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1607, f124, f1608; +mul.f32 f219, f1608, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1604, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1602, %112, f1604; +mul.f32 f235, f1604, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 
f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1599, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1597, %115, f1599; +mul.f32 f251, f1599, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1594, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1592, %118, f1594; +mul.f32 f267, f1594, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1591, f249, 0f3F441B7D; +sub.f32 f275, f1591, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1590, f265, 0f3E31D0D4; +sub.f32 f280, f1590, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1588, f250, 0f3E31D0D4; +mul.f32 f1589, f256, 0fBF7C1C5C; +sub.f32 f285, f1588, f1589; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1586, f266, 0fBF708FB2; +mul.f32 f1587, f272, 0fBEAF1D44; +sub.f32 f290, f1586, f1587; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1585, f1597, f1592; +sub.f32 f299, f1597, f1592; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1584, f1602, f1585; +mul.f32 f303, f1585, 0f3F000000; 
+sub.f32 f304, f1602, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1583, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1582, f239, f1583; +mul.f32 f319, f1583, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1581, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1580, f240, f1581; +mul.f32 f335, f1581, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1577, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1575, %121, f1577; +mul.f32 f351, f1577, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1572, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1570, %124, f1572; +mul.f32 f367, f1572, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; 
+add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1568, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1566, %126, f1568; +mul.f32 f383, f1568, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1565, f365, 0f3F441B7D; +sub.f32 f391, f1565, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1564, f381, 0f3E31D0D4; +sub.f32 f396, f1564, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1562, f366, 0f3E31D0D4; +mul.f32 f1563, f372, 0fBF7C1C5C; +sub.f32 f401, f1562, f1563; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1560, f382, 0fBF708FB2; +mul.f32 f1561, f388, 0fBEAF1D44; +sub.f32 f406, f1560, f1561; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1559, f1570, f1566; +sub.f32 f415, f1570, f1566; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1558, f1575, f1559; +mul.f32 f419, f1559, 0f3F000000; +sub.f32 f420, f1575, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1557, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1556, f355, f1557; +mul.f32 f435, f1557, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 
f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1555, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1554, f356, f1555; +mul.f32 f451, f1555, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1582, 0fBE6C2691; +mul.f32 f1553, f310, 0f3F791978; +sub.f32 f459, f1553, f458; +mul.f32 f460, f1582, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1551, f426, 0f3F64C51C; +mul.f32 f1552, f1556, 0fBEE5C902; +sub.f32 f464, f1551, f1552; +mul.f32 f465, f1556, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1549, f326, 0f3F64C51C; +mul.f32 f1550, f1580, 0fBEE5C902; +sub.f32 f469, f1549, f1550; +mul.f32 f470, f1580, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1547, f442, 0f3F18DF63; +mul.f32 f1548, f1554, 0fBF4D57F2; +sub.f32 f474, f1547, f1548; +mul.f32 f475, f1554, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1545, f301, 0f3F441B7D; +mul.f32 f1546, f307, 0fBF248DBB; +sub.f32 f479, f1545, f1546; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1544, f417, 0f3E31D0D4; +sub.f32 f484, f1544, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f1543, f317, 0f3F18DF63; +sub.f32 f489, f1543, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1542, f433, 0fBE92D7E0; +sub.f32 f494, f1542, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1541, f333, 0f3ECACAF8; +sub.f32 f499, f1541, f498; +mul.f32 f500, f339, 
0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1540, f449, 0fBF2FAD88; +sub.f32 f504, f1540, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1539, f302, 0f3E31D0D4; +sub.f32 f509, f1539, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1537, f418, 0fBF708FB2; +mul.f32 f1538, f424, 0fBEAF1D44; +sub.f32 f514, f1537, f1538; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1535, f318, 0fBD6E2946; +mul.f32 f1536, f324, 0fBF7F9120; +sub.f32 f519, f1535, f1536; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1533, f434, 0fBF7E44DE; +mul.f32 f1534, f440, 0f3DEDC21F; +sub.f32 f524, f1533, f1534; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1532, f334, 0fBE92D7E0; +sub.f32 f529, f1532, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f1531, f450, 0fBF55E287; +sub.f32 f534, f1531, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1530, f1584, f1558; +sub.f32 f541, f1584, f1558; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1530, 0f3F000000; +sub.f32 f546, f1611, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f1529, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1528, f1609, f1529; +mul.f32 f561, f1529, 0f3F000000; +sub.f32 f562, 
f1609, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1527, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1526, f1607, f1527; +mul.f32 f577, f1527, 0f3F000000; +sub.f32 f578, f1607, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1525, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1524, f191, f1525; +mul.f32 f593, f1525, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1523, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1522, f207, f1523; +mul.f32 f609, f1523, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1521, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f1520, f223, f1521; +mul.f32 f625, f1521, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; 
+add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f1519, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1518, f192, f1519; +mul.f32 f641, f1519, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1517, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1516, f208, f1517; +mul.f32 f657, f1517, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1515, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1514, f224, f1515; +mul.f32 f673, f1515, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f1528; +mul.f32 f685, f679, f1528; +mul.f32 f1512, f679, f679; +mul.f32 f1513, f680, f680; +sub.f32 f688, f1512, f1513; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f1526; +mul.f32 f693, f688, f1526; +mul.f32 f695, f680, f690; +mul.f32 f1511, f679, f688; +sub.f32 f696, f1511, f695; +mul.f32 f1510, 
f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f1524; +mul.f32 f701, f696, f1524; +mul.f32 f1508, f679, f696; +mul.f32 f1509, f680, f698; +sub.f32 f704, f1508, f1509; +mul.f32 f1507, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f1522; +mul.f32 f709, f704, f1522; +mul.f32 f711, f680, f706; +mul.f32 f1506, f679, f704; +sub.f32 f712, f1506, f711; +mul.f32 f1505, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f1520; +mul.f32 f717, f712, f1520; +mul.f32 f719, f680, f714; +mul.f32 f1504, f679, f712; +sub.f32 f720, f1504, f719; +mul.f32 f1503, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f1518; +mul.f32 f725, f720, f1518; +mul.f32 f1501, f679, f720; +mul.f32 f1502, f680, f722; +sub.f32 f728, f1501, f1502; +mul.f32 f1500, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f1516; +mul.f32 f733, f728, f1516; +mul.f32 f735, f680, f730; +mul.f32 f1499, f679, f728; +sub.f32 f736, f1499, f735; +mul.f32 f1498, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f1514; +mul.f32 f741, f736, f1514; +mul.f32 f743, f680, f738; +mul.f32 f1497, f679, f736; +sub.f32 f744, f1497, f743; +mul.f32 f1496, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f1494, f679, f744; +mul.f32 f1495, f680, f746; +sub.f32 f752, f1494, f1495; +mul.f32 f1493, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f1492, f679, f752; +sub.f32 f760, f1492, f759; +mul.f32 f1491, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f1489, f679, f760; 
+mul.f32 f1490, f680, f762; +sub.f32 f768, f1489, f1490; +mul.f32 f1488, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f1487, f679, f768; +sub.f32 f776, f1487, f775; +mul.f32 f1486, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1485, f679, f776; +sub.f32 f784, f1485, f783; +mul.f32 f1484, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f1482, f679, f784; +mul.f32 f1483, f680, f786; +sub.f32 f792, f1482, f1483; +mul.f32 f1481, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f1480, f679, f792; +sub.f32 f800, f1480, f799; +mul.f32 f1479, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f1478, f679, f800; +sub.f32 f808, f1478, f807; +mul.f32 f1477, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 f813, f808, f677; +mul.f32 f1475, f679, f808; +mul.f32 f1476, f680, f810; +sub.f32 f816, f1475, f1476; +mul.f32 f1474, f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f1473, f679, f816; +sub.f32 f824, f1473, f823; +mul.f32 f1472, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f1470, f679, f824; +mul.f32 f1471, f680, f826; +sub.f32 f832, f1470, f1471; +mul.f32 f1469, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, 
f582; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f1468, f679, f832; +sub.f32 f840, f1468, f839; +mul.f32 f1467, f832, f576; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1466, f679, f840; +sub.f32 f848, f1466, f847; +mul.f32 f1465, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f1463, f679, f848; +mul.f32 f1464, f680, f850; +sub.f32 f856, f1463, f1464; +mul.f32 f1462, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f1461, f679, f856; +sub.f32 f864, f1461, f863; +mul.f32 f1460, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1459, f679, f864; +sub.f32 f872, f1459, f871; +mul.f32 f1458, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f1456, f679, f872; +mul.f32 f1457, f680, f874; +sub.f32 f880, f1456, f1457; +mul.f32 f1455, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 f1454, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +add.f32 f886, f1611, f1530; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r9], {f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f1455, f684; +st.shared.v2.f32 [r9+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f1510, f692; +st.shared.v2.f32 [r9+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f1507, f700; +st.shared.v2.f32 [r9+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f1505, f708; +st.shared.v2.f32 
[r9+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f1503, f716; +st.shared.v2.f32 [r9+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; +sub.f32 f899, f1500, f724; +st.shared.v2.f32 [r9+48], {f899, f898}; +sub.f32 f900, f1498, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r9+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f1496, f740; +st.shared.v2.f32 [r9+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f1493, f748; +st.shared.v2.f32 [r9+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f1491, f756; +st.shared.v2.f32 [r9+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f1488, f764; +st.shared.v2.f32 [r9+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f1486, f772; +st.shared.v2.f32 [r9+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f1484, f780; +st.shared.v2.f32 [r9+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f1481, f788; +st.shared.v2.f32 [r9+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f1479, f796; +st.shared.v2.f32 [r9+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f1477, f804; +st.shared.v2.f32 [r9+128], {f919, f918}; +fma.rn.f32 f920, f810, f671, f813; +sub.f32 f921, f1474, f812; +st.shared.v2.f32 [r9+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, f821; +sub.f32 f923, f1472, f820; +st.shared.v2.f32 [r9+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f1469, f828; +st.shared.v2.f32 [r9+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f1467, f836; +st.shared.v2.f32 [r9+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, f1465, f844; +st.shared.v2.f32 [r9+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f1462, f852; +st.shared.v2.f32 [r9+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f1460, f860; 
+st.shared.v2.f32 [r9+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f1458, f868; +st.shared.v2.f32 [r9+192], {f935, f934}; +fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f1454, f876; +st.shared.v2.f32 [r9+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r9+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+72]; +ld.shared.v2.f32 {f948, f949}, [r10+144]; +ld.shared.v2.f32 {f952, f953}, [r10+216]; +ld.shared.v2.f32 {f956, f957}, [r10+288]; +ld.shared.v2.f32 {f960, f961}, [r10+360]; +ld.shared.v2.f32 {f964, f965}, [r10+432]; +ld.shared.v2.f32 {f968, f969}, [r10+504]; +ld.shared.v2.f32 {f972, f973}, [r10+576]; +ld.shared.v2.f32 {f976, f977}, [r10+648]; +ld.shared.v2.f32 {f980, f981}, [r10+720]; +ld.shared.v2.f32 {f984, f985}, [r10+792]; +ld.shared.v2.f32 {f988, f989}, [r10+864]; +ld.shared.v2.f32 {f992, f993}, [r10+936]; +ld.shared.v2.f32 {f996, f997}, [r10+1008]; +ld.shared.v2.f32 {f1000, f1001}, [r10+1080]; +ld.shared.v2.f32 {f1004, f1005}, [r10+1152]; +ld.shared.v2.f32 {f1008, f1009}, [r10+1224]; +ld.shared.v2.f32 {f1012, f1013}, [r10+1296]; +ld.shared.v2.f32 {f1016, f1017}, [r10+1368]; +ld.shared.v2.f32 {f1020, f1021}, [r10+1440]; +ld.shared.v2.f32 {f1024, f1025}, [r10+1512]; +ld.shared.v2.f32 {f1028, f1029}, [r10+1584]; +ld.shared.v2.f32 {f1032, f1033}, [r10+1656]; +ld.shared.v2.f32 {f1036, f1037}, [r10+1728]; +ld.shared.v2.f32 {f1040, f1041}, [r10+1800]; +ld.shared.v2.f32 {f1044, f1045}, [r10+1872]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f1453, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0f3F5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f1452, f941, f1453; +mul.f32 f1058, f1453, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; 
+mul.f32 f1061, f1060, 0f3F5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f1451, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f1450, f953, f1451; +mul.f32 f1074, f1451, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0f3F5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f1449, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f1448, f965, f1449; +mul.f32 f1090, f1449, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0f3F5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f1446, f1072, 0f3F441B7D; +mul.f32 f1447, f1078, 0fBF248DBB; +sub.f32 f1098, f1446, f1447; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0fBF248DBB, f1099; +mul.f32 f1444, f1088, 0f3E31D0D4; +mul.f32 f1445, f1094, 0fBF7C1C5C; +sub.f32 f1103, f1444, f1445; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0fBF7C1C5C, f1104; +mul.f32 f1442, f1073, 0f3E31D0D4; +mul.f32 f1443, f1079, 0fBF7C1C5C; +sub.f32 f1108, f1442, f1443; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0fBF7C1C5C, f1109; +mul.f32 f1112, f1095, 0fBEAF1D44; +mul.f32 f1441, f1089, 0fBF708FB2; +sub.f32 f1113, f1441, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0fBEAF1D44, f1114; +add.f32 f1116, f1065, f1081; +mul.f32 f1118, f1116, 0f3F000000; +sub.f32 f1119, f1049, f1118; +add.f32 f1440, f1450, f1448; +sub.f32 f1120, f1450, f1448; +mul.f32 f1121, f1120, 0f3F5DB3D7; 
+mul.f32 f1122, f1440, 0f3F000000; +sub.f32 f1123, f1452, f1122; +sub.f32 f1124, f1065, f1081; +mul.f32 f1125, f1124, 0f3F5DB3D7; +add.f32 f1126, f1098, f1103; +mul.f32 f1128, f1126, 0f3F000000; +sub.f32 f1129, f1056, f1128; +add.f32 f1439, f1100, f1105; +sub.f32 f1130, f1100, f1105; +mul.f32 f1131, f1130, 0f3F5DB3D7; +mul.f32 f1132, f1439, 0f3F000000; +sub.f32 f1133, f1062, f1132; +sub.f32 f1134, f1098, f1103; +mul.f32 f1135, f1134, 0f3F5DB3D7; +add.f32 f1136, f1108, f1113; +mul.f32 f1138, f1136, 0f3F000000; +sub.f32 f1139, f1057, f1138; +add.f32 f1438, f1110, f1115; +sub.f32 f1140, f1110, f1115; +mul.f32 f1141, f1140, 0f3F5DB3D7; +mul.f32 f1142, f1438, 0f3F000000; +sub.f32 f1143, f1063, f1142; +sub.f32 f1144, f1108, f1113; +mul.f32 f1145, f1144, 0f3F5DB3D7; +add.f32 f1146, f980, f1016; +add.f32 f1147, f944, f1146; +mul.f32 f1150, f1146, 0f3F000000; +sub.f32 f1151, f944, f1150; +add.f32 f1437, f981, f1017; +sub.f32 f1152, f981, f1017; +mul.f32 f1153, f1152, 0f3F5DB3D7; +add.f32 f1154, f1153, f1151; +sub.f32 f1155, f1151, f1153; +add.f32 f1436, f945, f1437; +mul.f32 f1156, f1437, 0f3F000000; +sub.f32 f1157, f945, f1156; +sub.f32 f1158, f980, f1016; +mul.f32 f1159, f1158, 0f3F5DB3D7; +sub.f32 f1160, f1157, f1159; +add.f32 f1161, f1159, f1157; +add.f32 f1162, f992, f1028; +add.f32 f1163, f956, f1162; +mul.f32 f1166, f1162, 0f3F000000; +sub.f32 f1167, f956, f1166; +add.f32 f1435, f993, f1029; +sub.f32 f1168, f993, f1029; +mul.f32 f1169, f1168, 0f3F5DB3D7; +add.f32 f1170, f1169, f1167; +sub.f32 f1171, f1167, f1169; +add.f32 f1434, f957, f1435; +mul.f32 f1172, f1435, 0f3F000000; +sub.f32 f1173, f957, f1172; +sub.f32 f1174, f992, f1028; +mul.f32 f1175, f1174, 0f3F5DB3D7; +sub.f32 f1176, f1173, f1175; +add.f32 f1177, f1175, f1173; +add.f32 f1178, f1004, f1040; +add.f32 f1179, f968, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f968, f1182; +add.f32 f1433, f1005, f1041; +sub.f32 f1184, f1005, f1041; +mul.f32 f1185, f1184, 0f3F5DB3D7; +add.f32 f1186, f1185, 
f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f1432, f969, f1433; +mul.f32 f1188, f1433, 0f3F000000; +sub.f32 f1189, f969, f1188; +sub.f32 f1190, f1004, f1040; +mul.f32 f1191, f1190, 0f3F5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +mul.f32 f1195, f1176, 0fBF248DBB; +mul.f32 f1431, f1170, 0f3F441B7D; +sub.f32 f1196, f1431, f1195; +mul.f32 f1197, f1176, 0f3F441B7D; +fma.rn.f32 f1198, f1170, 0fBF248DBB, f1197; +mul.f32 f1200, f1192, 0fBF7C1C5C; +mul.f32 f1430, f1186, 0f3E31D0D4; +sub.f32 f1201, f1430, f1200; +mul.f32 f1202, f1192, 0f3E31D0D4; +fma.rn.f32 f1203, f1186, 0fBF7C1C5C, f1202; +mul.f32 f1205, f1177, 0fBF7C1C5C; +mul.f32 f1429, f1171, 0f3E31D0D4; +sub.f32 f1206, f1429, f1205; +mul.f32 f1207, f1177, 0f3E31D0D4; +fma.rn.f32 f1208, f1171, 0fBF7C1C5C, f1207; +mul.f32 f1210, f1193, 0fBEAF1D44; +mul.f32 f1428, f1187, 0fBF708FB2; +sub.f32 f1211, f1428, f1210; +mul.f32 f1212, f1193, 0fBF708FB2; +fma.rn.f32 f1213, f1187, 0fBEAF1D44, f1212; +add.f32 f1214, f1163, f1179; +mul.f32 f1216, f1214, 0f3F000000; +sub.f32 f1217, f1147, f1216; +add.f32 f1427, f1434, f1432; +sub.f32 f1218, f1434, f1432; +mul.f32 f1219, f1218, 0f3F5DB3D7; +mul.f32 f1220, f1427, 0f3F000000; +sub.f32 f1221, f1436, f1220; +sub.f32 f1222, f1163, f1179; +mul.f32 f1223, f1222, 0f3F5DB3D7; +add.f32 f1224, f1196, f1201; +mul.f32 f1226, f1224, 0f3F000000; +sub.f32 f1227, f1154, f1226; +add.f32 f1426, f1198, f1203; +sub.f32 f1228, f1198, f1203; +mul.f32 f1229, f1228, 0f3F5DB3D7; +mul.f32 f1230, f1426, 0f3F000000; +sub.f32 f1231, f1160, f1230; +sub.f32 f1232, f1196, f1201; +mul.f32 f1233, f1232, 0f3F5DB3D7; +add.f32 f1234, f1206, f1211; +mul.f32 f1236, f1234, 0f3F000000; +sub.f32 f1237, f1155, f1236; +add.f32 f1425, f1208, f1213; +sub.f32 f1238, f1208, f1213; +mul.f32 f1239, f1238, 0f3F5DB3D7; +mul.f32 f1240, f1425, 0f3F000000; +sub.f32 f1241, f1161, f1240; +sub.f32 f1242, f1206, f1211; +mul.f32 f1243, f1242, 0f3F5DB3D7; +add.f32 f1244, f984, f1020; +add.f32 f1245, f948, f1244; 
+mul.f32 f1248, f1244, 0f3F000000; +sub.f32 f1249, f948, f1248; +add.f32 f1424, f985, f1021; +sub.f32 f1250, f985, f1021; +mul.f32 f1251, f1250, 0f3F5DB3D7; +add.f32 f1252, f1251, f1249; +sub.f32 f1253, f1249, f1251; +add.f32 f1423, f949, f1424; +mul.f32 f1254, f1424, 0f3F000000; +sub.f32 f1255, f949, f1254; +sub.f32 f1256, f984, f1020; +mul.f32 f1257, f1256, 0f3F5DB3D7; +sub.f32 f1258, f1255, f1257; +add.f32 f1259, f1257, f1255; +add.f32 f1260, f996, f1032; +add.f32 f1261, f960, f1260; +mul.f32 f1264, f1260, 0f3F000000; +sub.f32 f1265, f960, f1264; +add.f32 f1422, f997, f1033; +sub.f32 f1266, f997, f1033; +mul.f32 f1267, f1266, 0f3F5DB3D7; +add.f32 f1268, f1267, f1265; +sub.f32 f1269, f1265, f1267; +add.f32 f1421, f961, f1422; +mul.f32 f1270, f1422, 0f3F000000; +sub.f32 f1271, f961, f1270; +sub.f32 f1272, f996, f1032; +mul.f32 f1273, f1272, 0f3F5DB3D7; +sub.f32 f1274, f1271, f1273; +add.f32 f1275, f1273, f1271; +add.f32 f1276, f1008, f1044; +add.f32 f1277, f972, f1276; +mul.f32 f1280, f1276, 0f3F000000; +sub.f32 f1281, f972, f1280; +add.f32 f1420, f1009, f1045; +sub.f32 f1282, f1009, f1045; +mul.f32 f1283, f1282, 0f3F5DB3D7; +add.f32 f1284, f1283, f1281; +sub.f32 f1285, f1281, f1283; +add.f32 f1419, f973, f1420; +mul.f32 f1286, f1420, 0f3F000000; +sub.f32 f1287, f973, f1286; +sub.f32 f1288, f1008, f1044; +mul.f32 f1289, f1288, 0f3F5DB3D7; +sub.f32 f1290, f1287, f1289; +add.f32 f1291, f1289, f1287; +mul.f32 f1293, f1274, 0fBF248DBB; +mul.f32 f1418, f1268, 0f3F441B7D; +sub.f32 f1294, f1418, f1293; +mul.f32 f1295, f1274, 0f3F441B7D; +fma.rn.f32 f1296, f1268, 0fBF248DBB, f1295; +mul.f32 f1298, f1290, 0fBF7C1C5C; +mul.f32 f1417, f1284, 0f3E31D0D4; +sub.f32 f1299, f1417, f1298; +mul.f32 f1300, f1290, 0f3E31D0D4; +fma.rn.f32 f1301, f1284, 0fBF7C1C5C, f1300; +mul.f32 f1415, f1269, 0f3E31D0D4; +mul.f32 f1416, f1275, 0fBF7C1C5C; +sub.f32 f1304, f1415, f1416; +mul.f32 f1305, f1275, 0f3E31D0D4; +fma.rn.f32 f1306, f1269, 0fBF7C1C5C, f1305; +mul.f32 f1413, f1285, 0fBF708FB2; 
+mul.f32 f1414, f1291, 0fBEAF1D44; +sub.f32 f1309, f1413, f1414; +mul.f32 f1310, f1291, 0fBF708FB2; +fma.rn.f32 f1311, f1285, 0fBEAF1D44, f1310; +add.f32 f1312, f1261, f1277; +mul.f32 f1314, f1312, 0f3F000000; +sub.f32 f1315, f1245, f1314; +add.f32 f1412, f1421, f1419; +sub.f32 f1316, f1421, f1419; +mul.f32 f1317, f1316, 0f3F5DB3D7; +mul.f32 f1318, f1412, 0f3F000000; +sub.f32 f1319, f1423, f1318; +sub.f32 f1320, f1261, f1277; +mul.f32 f1321, f1320, 0f3F5DB3D7; +add.f32 f1322, f1294, f1299; +mul.f32 f1324, f1322, 0f3F000000; +sub.f32 f1325, f1252, f1324; +add.f32 f1411, f1296, f1301; +sub.f32 f1326, f1296, f1301; +mul.f32 f1327, f1326, 0f3F5DB3D7; +mul.f32 f1328, f1411, 0f3F000000; +sub.f32 f1329, f1258, f1328; +sub.f32 f1330, f1294, f1299; +mul.f32 f1331, f1330, 0f3F5DB3D7; +add.f32 f1332, f1304, f1309; +mul.f32 f1334, f1332, 0f3F000000; +sub.f32 f1335, f1253, f1334; +add.f32 f1410, f1306, f1311; +sub.f32 f1336, f1306, f1311; +mul.f32 f1337, f1336, 0f3F5DB3D7; +mul.f32 f1338, f1410, 0f3F000000; +sub.f32 f1339, f1259, f1338; +sub.f32 f1340, f1304, f1309; +mul.f32 f1341, f1340, 0f3F5DB3D7; +add.f32 %1, f1452, f1440; +add.f32 %0, f1049, f1116; +add.f32 %3, f1436, f1427; +add.f32 %2, f1147, f1214; +add.f32 %5, f1423, f1412; +add.f32 %4, f1245, f1312; +add.f32 %7, f1062, f1439; +add.f32 %6, f1056, f1126; +add.f32 %9, f1160, f1426; +add.f32 %8, f1154, f1224; +add.f32 %11, f1258, f1411; +add.f32 %10, f1252, f1322; +add.f32 %13, f1063, f1438; +add.f32 %12, f1057, f1136; +add.f32 %15, f1161, f1425; +add.f32 %14, f1155, f1234; +add.f32 %17, f1259, f1410; +add.f32 %16, f1253, f1332; +add.f32 %18, f1121, f1119; +sub.f32 %19, f1123, f1125; +sub.f32 %21, f1221, f1223; +add.f32 %20, f1219, f1217; +sub.f32 %23, f1319, f1321; +add.f32 %22, f1317, f1315; +sub.f32 %25, f1133, f1135; +add.f32 %24, f1131, f1129; +add.f32 %26, f1229, f1227; +sub.f32 %27, f1231, f1233; +add.f32 %28, f1327, f1325; +sub.f32 %29, f1329, f1331; +add.f32 %30, f1141, f1139; +sub.f32 %31, f1143, f1145; +sub.f32 
%33, f1241, f1243; +add.f32 %32, f1239, f1237; +sub.f32 %35, f1339, f1341; +add.f32 %34, f1337, f1335; +add.f32 %37, f1125, f1123; +sub.f32 %36, f1119, f1121; +add.f32 %39, f1223, f1221; +sub.f32 %38, f1217, f1219; +add.f32 %41, f1321, f1319; +sub.f32 %40, f1315, f1317; +add.f32 %43, f1135, f1133; +sub.f32 %42, f1129, f1131; +add.f32 %45, f1233, f1231; +sub.f32 %44, f1227, f1229; +add.f32 %47, f1331, f1329; +sub.f32 %46, f1325, f1327; +add.f32 %49, f1145, f1143; +sub.f32 %48, f1139, f1141; +add.f32 %51, f1243, f1241; +sub.f32 %50, f1237, f1239; +add.f32 %53, f1341, f1339; +sub.f32 %52, f1335, f1337; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), 
"f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<133, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1590>; +.reg .b32 r<14>; +.reg .b64 rd<8>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 972, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1581, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1580, %57, f1581; +mul.f32 f119, f1581, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1579, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1578, %63, f1579; +mul.f32 f135, f1579, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, 
%86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1577, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1576, %69, f1577; +mul.f32 f151, f1577, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f1575, f133, 0f3F441B7D; +sub.f32 f159, f1575, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1573, f149, 0f3E31D0D4; +mul.f32 f1574, f155, 0fBF7C1C5C; +sub.f32 f164, f1573, f1574; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1571, f134, 0f3E31D0D4; +mul.f32 f1572, f140, 0fBF7C1C5C; +sub.f32 f169, f1571, f1572; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1569, f150, 0fBF708FB2; +mul.f32 f1570, f156, 0fBEAF1D44; +sub.f32 f174, f1569, f1570; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1568, f1578, f1576; +sub.f32 f183, f1578, f1576; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1567, f1580, f1568; +mul.f32 f187, f1568, 0f3F000000; +sub.f32 f188, f1580, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1566, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1565, f123, f1566; +mul.f32 f203, f1566, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; 
+sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1564, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1563, f124, f1564; +mul.f32 f219, f1564, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1560, %110, %111; +sub.f32 f231, %110, %111; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1558, %112, f1560; +mul.f32 f235, f1560, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1555, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1553, %115, f1555; +mul.f32 f251, f1555, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1550, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1548, %118, f1550; +mul.f32 f267, f1550, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1547, f249, 0f3F441B7D; +sub.f32 f275, f1547, f274; +mul.f32 f276, f255, 
0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1546, f265, 0f3E31D0D4; +sub.f32 f280, f1546, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1544, f250, 0f3E31D0D4; +mul.f32 f1545, f256, 0fBF7C1C5C; +sub.f32 f285, f1544, f1545; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1542, f266, 0fBF708FB2; +mul.f32 f1543, f272, 0fBEAF1D44; +sub.f32 f290, f1542, f1543; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1541, f1553, f1548; +sub.f32 f299, f1553, f1548; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1540, f1558, f1541; +mul.f32 f303, f1541, 0f3F000000; +sub.f32 f304, f1558, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1539, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1538, f239, f1539; +mul.f32 f319, f1539, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1537, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1536, f240, f1537; +mul.f32 f335, f1537, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, 
f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1533, %120, %119; +sub.f32 f347, %120, %119; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1531, %121, f1533; +mul.f32 f351, f1533, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1528, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1526, %124, f1528; +mul.f32 f367, f1528, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1524, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1522, %126, f1524; +mul.f32 f383, f1524, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1521, f365, 0f3F441B7D; +sub.f32 f391, f1521, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1520, f381, 0f3E31D0D4; +sub.f32 f396, f1520, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1518, f366, 0f3E31D0D4; +mul.f32 f1519, f372, 0fBF7C1C5C; +sub.f32 f401, f1518, f1519; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1516, f382, 0fBF708FB2; +mul.f32 f1517, f388, 0fBEAF1D44; +sub.f32 f406, f1516, f1517; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, 
f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1515, f1526, f1522; +sub.f32 f415, f1526, f1522; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1514, f1531, f1515; +mul.f32 f419, f1515, 0f3F000000; +sub.f32 f420, f1531, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1513, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1512, f355, f1513; +mul.f32 f435, f1513, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1511, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1510, f356, f1511; +mul.f32 f451, f1511, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1538, 0fBE6C2691; +mul.f32 f1509, f310, 0f3F791978; +sub.f32 f459, f1509, f458; +mul.f32 f460, f1538, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1507, f426, 0f3F64C51C; +mul.f32 f1508, f1512, 0fBEE5C902; +sub.f32 f464, f1507, f1508; +mul.f32 f465, f1512, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1505, f326, 0f3F64C51C; +mul.f32 f1506, f1536, 0fBEE5C902; +sub.f32 f469, f1505, f1506; +mul.f32 f470, f1536, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1503, f442, 0f3F18DF63; +mul.f32 f1504, f1510, 0fBF4D57F2; +sub.f32 f474, 
f1503, f1504; +mul.f32 f475, f1510, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1501, f301, 0f3F441B7D; +mul.f32 f1502, f307, 0fBF248DBB; +sub.f32 f479, f1501, f1502; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1500, f417, 0f3E31D0D4; +sub.f32 f484, f1500, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f1499, f317, 0f3F18DF63; +sub.f32 f489, f1499, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1498, f433, 0fBE92D7E0; +sub.f32 f494, f1498, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1497, f333, 0f3ECACAF8; +sub.f32 f499, f1497, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1496, f449, 0fBF2FAD88; +sub.f32 f504, f1496, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1495, f302, 0f3E31D0D4; +sub.f32 f509, f1495, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1493, f418, 0fBF708FB2; +mul.f32 f1494, f424, 0fBEAF1D44; +sub.f32 f514, f1493, f1494; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1491, f318, 0fBD6E2946; +mul.f32 f1492, f324, 0fBF7F9120; +sub.f32 f519, f1491, f1492; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1489, f434, 0fBF7E44DE; +mul.f32 f1490, f440, 0f3DEDC21F; +sub.f32 f524, f1489, f1490; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1488, f334, 0fBE92D7E0; +sub.f32 f529, f1488, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 
0f3F0CAC9F; +mul.f32 f1487, f450, 0fBF55E287; +sub.f32 f534, f1487, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1486, f1540, f1514; +sub.f32 f543, f1540, f1514; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1485, f1567, f1486; +mul.f32 f547, f1486, 0f3F000000; +sub.f32 f548, f1567, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1484, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1483, f1565, f1484; +mul.f32 f563, f1484, 0f3F000000; +sub.f32 f564, f1565, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1482, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1481, f1563, f1482; +mul.f32 f579, f1482, 0f3F000000; +sub.f32 f580, f1563, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1480, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1479, f191, f1480; +mul.f32 f595, f1480, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; 
+add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1478, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1477, f207, f1478; +mul.f32 f611, f1478, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1476, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1475, f223, f1476; +mul.f32 f627, f1476, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1474, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1473, f192, f1474; +mul.f32 f643, f1474, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1472, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1471, f208, f1472; +mul.f32 f659, f1472, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1470, f531, f536; +sub.f32 f671, 
f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1469, f224, f1470; +mul.f32 f675, f1470, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f686, f682, f1483; +mul.f32 f1468, f681, f554; +sub.f32 f687, f1468, f686; +mul.f32 f688, f681, f1483; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f691, f682, f682; +mul.f32 f1467, f681, f681; +sub.f32 f692, f1467, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f696, f694, f1481; +mul.f32 f1466, f692, f570; +sub.f32 f697, f1466, f696; +mul.f32 f698, f692, f1481; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f1465, f681, f692; +sub.f32 f702, f1465, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f1479; +mul.f32 f1464, f702, f586; +sub.f32 f707, f1464, f706; +mul.f32 f708, f702, f1479; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f1462, f681, f702; +mul.f32 f1463, f682, f704; +sub.f32 f712, f1462, f1463; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f1460, f712, f602; +mul.f32 f1461, f714, f1477; +sub.f32 f717, f1460, f1461; +mul.f32 f718, f712, f1477; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f1458, f681, f712; +mul.f32 f1459, f682, f714; +sub.f32 f722, f1458, f1459; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f1456, f722, f618; +mul.f32 f1457, f724, f1475; +sub.f32 f727, f1456, f1457; +mul.f32 f728, f722, f1475; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f731, f682, f724; +mul.f32 f1455, f681, f722; +sub.f32 f732, f1455, f731; +mul.f32 f733, f681, f724; 
+fma.rn.f32 f734, f682, f722, f733; +mul.f32 f736, f734, f1473; +mul.f32 f1454, f732, f634; +sub.f32 f737, f1454, f736; +mul.f32 f738, f732, f1473; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f741, f682, f734; +mul.f32 f1453, f681, f732; +sub.f32 f742, f1453, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f1471; +mul.f32 f1452, f742, f650; +sub.f32 f747, f1452, f746; +mul.f32 f748, f742, f1471; +fma.rn.f32 f749, f744, f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f1451, f681, f742; +sub.f32 f752, f1451, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f1469; +mul.f32 f1450, f752, f666; +sub.f32 f757, f1450, f756; +mul.f32 f758, f752, f1469; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f1448, f681, f752; +mul.f32 f1449, f682, f754; +sub.f32 f762, f1448, f1449; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f1446, f762, f545; +mul.f32 f1447, f764, f551; +sub.f32 f767, f1446, f1447; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f1444, f681, f762; +mul.f32 f1445, f682, f764; +sub.f32 f772, f1444, f1445; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f776, f774, f567; +mul.f32 f1443, f772, f561; +sub.f32 f777, f1443, f776; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f781, f682, f774; +mul.f32 f1442, f681, f772; +sub.f32 f782, f1442, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f786, f784, f583; +mul.f32 f1441, f782, f577; +sub.f32 f787, f1441, f786; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 f1440, f681, f782; +sub.f32 f792, f1440, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f1439, f792, f593; +sub.f32 f797, f1439, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f1438, f681, 
f792; +sub.f32 f802, f1438, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f1436, f802, f609; +mul.f32 f1437, f804, f615; +sub.f32 f807, f1436, f1437; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f1434, f681, f802; +mul.f32 f1435, f682, f804; +sub.f32 f812, f1434, f1435; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f1432, f812, f625; +mul.f32 f1433, f814, f631; +sub.f32 f817, f1432, f1433; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f1430, f681, f812; +mul.f32 f1431, f682, f814; +sub.f32 f822, f1430, f1431; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f826, f824, f647; +mul.f32 f1429, f822, f641; +sub.f32 f827, f1429, f826; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f831, f682, f824; +mul.f32 f1428, f681, f822; +sub.f32 f832, f1428, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f1427, f832, f657; +sub.f32 f837, f1427, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f1426, f681, f832; +sub.f32 f842, f1426, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f1425, f842, f673; +sub.f32 f847, f1425, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f1423, f681, f842; +mul.f32 f1424, f682, f844; +sub.f32 f852, f1423, f1424; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f1421, f852, f546; +mul.f32 f1422, f854, f552; +sub.f32 f857, f1421, f1422; +mul.f32 f858, f852, f552; +fma.rn.f32 f859, f854, f546, f858; +mul.f32 f1419, f681, f852; +mul.f32 f1420, f682, f854; +sub.f32 f862, f1419, f1420; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f1417, f862, f562; +mul.f32 f1418, f864, f568; +sub.f32 f867, f1417, f1418; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, 
f562, f868; +mul.f32 f871, f682, f864; +mul.f32 f1416, f681, f862; +sub.f32 f872, f1416, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f876, f874, f584; +mul.f32 f1415, f872, f578; +sub.f32 f877, f1415, f876; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f881, f682, f874; +mul.f32 f1414, f681, f872; +sub.f32 f882, f1414, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, f600; +mul.f32 f1413, f882, f594; +sub.f32 f887, f1413, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f1412, f681, f882; +sub.f32 f892, f1412, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f1410, f892, f610; +mul.f32 f1411, f894, f616; +sub.f32 f897, f1410, f1411; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f1408, f681, f892; +mul.f32 f1409, f682, f894; +sub.f32 f902, f1408, f1409; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f1406, f902, f626; +mul.f32 f1407, f904, f632; +sub.f32 f907, f1406, f1407; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, f908; +mul.f32 f1404, f681, f902; +mul.f32 f1405, f682, f904; +sub.f32 f912, f1404, f1405; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f916, f914, f648; +mul.f32 f1403, f912, f642; +sub.f32 f917, f1403, f916; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f921, f682, f914; +mul.f32 f1402, f681, f912; +sub.f32 f922, f1402, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f926, f924, f664; +mul.f32 f1401, f922, f658; +sub.f32 f927, f1401, f926; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f1400, f681, f922; +sub.f32 f932, f1400, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f1399, f932, f674; +sub.f32 f937, f1399, 
f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 r8, r5, 972, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f687; +st.shared.f32 [r9+8], f697; +st.shared.f32 [r9+12], f707; +st.shared.f32 [r9+16], f717; +st.shared.f32 [r9+20], f727; +st.shared.f32 [r9+24], f737; +st.shared.f32 [r9+28], f747; +st.shared.f32 [r9+32], f757; +st.shared.f32 [r9+36], f767; +st.shared.f32 [r9+40], f777; +st.shared.f32 [r9+44], f787; +st.shared.f32 [r9+48], f797; +st.shared.f32 [r9+52], f807; +st.shared.f32 [r9+56], f817; +st.shared.f32 [r9+60], f827; +st.shared.f32 [r9+64], f837; +st.shared.f32 [r9+68], f847; +st.shared.f32 [r9+72], f857; +st.shared.f32 [r9+76], f867; +st.shared.f32 [r9+80], f877; +st.shared.f32 [r9+84], f887; +st.shared.f32 [r9+88], f897; +st.shared.f32 [r9+92], f907; +st.shared.f32 [r9+96], f917; +st.shared.f32 [r9+100], f927; +st.shared.f32 [r9+104], f937; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+36]; +ld.shared.f32 f942, [r10+72]; +ld.shared.f32 f943, [r10+108]; +ld.shared.f32 f944, [r10+144]; +ld.shared.f32 f945, [r10+180]; +ld.shared.f32 f946, [r10+216]; +ld.shared.f32 f947, [r10+252]; +ld.shared.f32 f948, [r10+288]; +ld.shared.f32 f949, [r10+324]; +ld.shared.f32 f950, [r10+360]; +ld.shared.f32 f951, [r10+396]; +ld.shared.f32 f952, [r10+432]; +ld.shared.f32 f953, [r10+468]; +ld.shared.f32 f954, [r10+504]; +ld.shared.f32 f955, [r10+540]; +ld.shared.f32 f956, [r10+576]; +ld.shared.f32 f957, [r10+612]; +ld.shared.f32 f958, [r10+648]; +ld.shared.f32 f959, [r10+684]; +ld.shared.f32 f960, [r10+720]; +ld.shared.f32 f961, [r10+756]; +ld.shared.f32 f962, [r10+792]; +ld.shared.f32 f963, [r10+828]; +ld.shared.f32 f964, [r10+864]; +ld.shared.f32 f965, [r10+900]; +ld.shared.f32 f966, [r10+936]; +barrier.sync 0; +st.shared.f32 [r9], f1485; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; 
+st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r10]; +ld.shared.f32 f968, [r10+36]; +ld.shared.f32 f969, [r10+72]; +ld.shared.f32 f970, [r10+108]; +ld.shared.f32 f971, [r10+144]; +ld.shared.f32 f972, [r10+180]; +ld.shared.f32 f973, [r10+216]; +ld.shared.f32 f974, [r10+252]; +ld.shared.f32 f975, [r10+288]; +ld.shared.f32 f976, [r10+324]; +ld.shared.f32 f977, [r10+360]; +ld.shared.f32 f978, [r10+396]; +ld.shared.f32 f979, [r10+432]; +ld.shared.f32 f980, [r10+468]; +ld.shared.f32 f981, [r10+504]; +ld.shared.f32 f982, [r10+540]; +ld.shared.f32 f983, [r10+576]; +ld.shared.f32 f984, [r10+612]; +ld.shared.f32 f985, [r10+648]; +ld.shared.f32 f986, [r10+684]; +ld.shared.f32 f987, [r10+720]; +ld.shared.f32 f988, [r10+756]; +ld.shared.f32 f989, [r10+792]; +ld.shared.f32 f990, [r10+828]; +ld.shared.f32 f991, [r10+864]; +ld.shared.f32 f992, [r10+900]; +ld.shared.f32 f993, [r10+936]; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +add.f32 f1398, f976, f985; +sub.f32 f1000, f976, f985; +mul.f32 f1001, f1000, 0f3F5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +add.f32 f1397, f967, f1398; +mul.f32 f1004, f1398, 0f3F000000; +sub.f32 f1005, f967, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0f3F5DB3D7; +sub.f32 
f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +add.f32 f1396, f979, f988; +sub.f32 f1016, f979, f988; +mul.f32 f1017, f1016, 0f3F5DB3D7; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f1395, f970, f1396; +mul.f32 f1020, f1396, 0f3F000000; +sub.f32 f1021, f970, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0f3F5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +sub.f32 f1031, f946, f1030; +add.f32 f1394, f982, f991; +sub.f32 f1032, f982, f991; +mul.f32 f1033, f1032, 0f3F5DB3D7; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f1393, f973, f1394; +mul.f32 f1036, f1394, 0f3F000000; +sub.f32 f1037, f973, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0f3F5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f1391, f1018, 0f3F441B7D; +mul.f32 f1392, f1024, 0fBF248DBB; +sub.f32 f1044, f1391, f1392; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0fBF248DBB, f1045; +mul.f32 f1048, f1040, 0fBF7C1C5C; +mul.f32 f1390, f1034, 0f3E31D0D4; +sub.f32 f1049, f1390, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0fBF7C1C5C, f1050; +mul.f32 f1053, f1025, 0fBF7C1C5C; +mul.f32 f1389, f1019, 0f3E31D0D4; +sub.f32 f1054, f1389, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0fBF7C1C5C, f1055; +mul.f32 f1058, f1041, 0fBEAF1D44; +mul.f32 f1388, f1035, 0fBF708FB2; +sub.f32 f1059, f1388, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0fBEAF1D44, f1060; +add.f32 f1062, f1011, f1027; +mul.f32 f1064, f1062, 0f3F000000; +sub.f32 f1065, f995, f1064; +add.f32 f1387, f1395, f1393; +sub.f32 f1066, f1395, f1393; +mul.f32 f1067, f1066, 0f3F5DB3D7; +mul.f32 f1068, f1387, 0f3F000000; +sub.f32 f1069, f1397, 
f1068; +sub.f32 f1070, f1011, f1027; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1044, f1049; +mul.f32 f1074, f1072, 0f3F000000; +sub.f32 f1075, f1002, f1074; +add.f32 f1386, f1046, f1051; +sub.f32 f1076, f1046, f1051; +mul.f32 f1077, f1076, 0f3F5DB3D7; +mul.f32 f1078, f1386, 0f3F000000; +sub.f32 f1079, f1008, f1078; +sub.f32 f1080, f1044, f1049; +mul.f32 f1081, f1080, 0f3F5DB3D7; +add.f32 f1082, f1054, f1059; +mul.f32 f1084, f1082, 0f3F000000; +sub.f32 f1085, f1003, f1084; +add.f32 f1385, f1056, f1061; +sub.f32 f1086, f1056, f1061; +mul.f32 f1087, f1086, 0f3F5DB3D7; +mul.f32 f1088, f1385, 0f3F000000; +sub.f32 f1089, f1009, f1088; +sub.f32 f1090, f1054, f1059; +mul.f32 f1091, f1090, 0f3F5DB3D7; +add.f32 f1092, f950, f959; +add.f32 f1093, f941, f1092; +mul.f32 f1096, f1092, 0f3F000000; +sub.f32 f1097, f941, f1096; +add.f32 f1384, f977, f986; +sub.f32 f1098, f977, f986; +mul.f32 f1099, f1098, 0f3F5DB3D7; +add.f32 f1100, f1099, f1097; +sub.f32 f1101, f1097, f1099; +add.f32 f1383, f968, f1384; +mul.f32 f1102, f1384, 0f3F000000; +sub.f32 f1103, f968, f1102; +sub.f32 f1104, f950, f959; +mul.f32 f1105, f1104, 0f3F5DB3D7; +sub.f32 f1106, f1103, f1105; +add.f32 f1107, f1105, f1103; +add.f32 f1108, f953, f962; +add.f32 f1109, f944, f1108; +mul.f32 f1112, f1108, 0f3F000000; +sub.f32 f1113, f944, f1112; +add.f32 f1382, f980, f989; +sub.f32 f1114, f980, f989; +mul.f32 f1115, f1114, 0f3F5DB3D7; +add.f32 f1116, f1115, f1113; +sub.f32 f1117, f1113, f1115; +add.f32 f1381, f971, f1382; +mul.f32 f1118, f1382, 0f3F000000; +sub.f32 f1119, f971, f1118; +sub.f32 f1120, f953, f962; +mul.f32 f1121, f1120, 0f3F5DB3D7; +sub.f32 f1122, f1119, f1121; +add.f32 f1123, f1121, f1119; +add.f32 f1124, f956, f965; +add.f32 f1125, f947, f1124; +mul.f32 f1128, f1124, 0f3F000000; +sub.f32 f1129, f947, f1128; +add.f32 f1380, f983, f992; +sub.f32 f1130, f983, f992; +mul.f32 f1131, f1130, 0f3F5DB3D7; +add.f32 f1132, f1131, f1129; +sub.f32 f1133, f1129, f1131; +add.f32 f1379, f974, f1380; +mul.f32 
f1134, f1380, 0f3F000000; +sub.f32 f1135, f974, f1134; +sub.f32 f1136, f956, f965; +mul.f32 f1137, f1136, 0f3F5DB3D7; +sub.f32 f1138, f1135, f1137; +add.f32 f1139, f1137, f1135; +mul.f32 f1141, f1122, 0fBF248DBB; +mul.f32 f1378, f1116, 0f3F441B7D; +sub.f32 f1142, f1378, f1141; +mul.f32 f1143, f1122, 0f3F441B7D; +fma.rn.f32 f1144, f1116, 0fBF248DBB, f1143; +mul.f32 f1146, f1138, 0fBF7C1C5C; +mul.f32 f1377, f1132, 0f3E31D0D4; +sub.f32 f1147, f1377, f1146; +mul.f32 f1148, f1138, 0f3E31D0D4; +fma.rn.f32 f1149, f1132, 0fBF7C1C5C, f1148; +mul.f32 f1151, f1123, 0fBF7C1C5C; +mul.f32 f1376, f1117, 0f3E31D0D4; +sub.f32 f1152, f1376, f1151; +mul.f32 f1153, f1123, 0f3E31D0D4; +fma.rn.f32 f1154, f1117, 0fBF7C1C5C, f1153; +mul.f32 f1374, f1133, 0fBF708FB2; +mul.f32 f1375, f1139, 0fBEAF1D44; +sub.f32 f1157, f1374, f1375; +mul.f32 f1158, f1139, 0fBF708FB2; +fma.rn.f32 f1159, f1133, 0fBEAF1D44, f1158; +add.f32 f1160, f1109, f1125; +mul.f32 f1162, f1160, 0f3F000000; +sub.f32 f1163, f1093, f1162; +add.f32 f1373, f1381, f1379; +sub.f32 f1164, f1381, f1379; +mul.f32 f1165, f1164, 0f3F5DB3D7; +mul.f32 f1166, f1373, 0f3F000000; +sub.f32 f1167, f1383, f1166; +sub.f32 f1168, f1109, f1125; +mul.f32 f1169, f1168, 0f3F5DB3D7; +add.f32 f1170, f1142, f1147; +mul.f32 f1172, f1170, 0f3F000000; +sub.f32 f1173, f1100, f1172; +add.f32 f1372, f1144, f1149; +sub.f32 f1174, f1144, f1149; +mul.f32 f1175, f1174, 0f3F5DB3D7; +mul.f32 f1176, f1372, 0f3F000000; +sub.f32 f1177, f1106, f1176; +sub.f32 f1178, f1142, f1147; +mul.f32 f1179, f1178, 0f3F5DB3D7; +add.f32 f1180, f1152, f1157; +mul.f32 f1182, f1180, 0f3F000000; +sub.f32 f1183, f1101, f1182; +add.f32 f1371, f1154, f1159; +sub.f32 f1184, f1154, f1159; +mul.f32 f1185, f1184, 0f3F5DB3D7; +mul.f32 f1186, f1371, 0f3F000000; +sub.f32 f1187, f1107, f1186; +sub.f32 f1188, f1152, f1157; +mul.f32 f1189, f1188, 0f3F5DB3D7; +add.f32 f1190, f951, f960; +add.f32 f1191, f942, f1190; +mul.f32 f1194, f1190, 0f3F000000; +sub.f32 f1195, f942, f1194; +add.f32 f1370, 
f978, f987; +sub.f32 f1196, f978, f987; +mul.f32 f1197, f1196, 0f3F5DB3D7; +add.f32 f1198, f1197, f1195; +sub.f32 f1199, f1195, f1197; +add.f32 f1369, f969, f1370; +mul.f32 f1200, f1370, 0f3F000000; +sub.f32 f1201, f969, f1200; +sub.f32 f1202, f951, f960; +mul.f32 f1203, f1202, 0f3F5DB3D7; +sub.f32 f1204, f1201, f1203; +add.f32 f1205, f1203, f1201; +add.f32 f1206, f954, f963; +add.f32 f1207, f945, f1206; +mul.f32 f1210, f1206, 0f3F000000; +sub.f32 f1211, f945, f1210; +add.f32 f1368, f981, f990; +sub.f32 f1212, f981, f990; +mul.f32 f1213, f1212, 0f3F5DB3D7; +add.f32 f1214, f1213, f1211; +sub.f32 f1215, f1211, f1213; +add.f32 f1367, f972, f1368; +mul.f32 f1216, f1368, 0f3F000000; +sub.f32 f1217, f972, f1216; +sub.f32 f1218, f954, f963; +mul.f32 f1219, f1218, 0f3F5DB3D7; +sub.f32 f1220, f1217, f1219; +add.f32 f1221, f1219, f1217; +add.f32 f1222, f957, f966; +add.f32 f1223, f948, f1222; +mul.f32 f1226, f1222, 0f3F000000; +sub.f32 f1227, f948, f1226; +add.f32 f1366, f984, f993; +sub.f32 f1228, f984, f993; +mul.f32 f1229, f1228, 0f3F5DB3D7; +add.f32 f1230, f1229, f1227; +sub.f32 f1231, f1227, f1229; +add.f32 f1365, f975, f1366; +mul.f32 f1232, f1366, 0f3F000000; +sub.f32 f1233, f975, f1232; +sub.f32 f1234, f957, f966; +mul.f32 f1235, f1234, 0f3F5DB3D7; +sub.f32 f1236, f1233, f1235; +add.f32 f1237, f1235, f1233; +mul.f32 f1363, f1214, 0f3F441B7D; +mul.f32 f1364, f1220, 0fBF248DBB; +sub.f32 f1240, f1363, f1364; +mul.f32 f1241, f1220, 0f3F441B7D; +fma.rn.f32 f1242, f1214, 0fBF248DBB, f1241; +mul.f32 f1361, f1230, 0f3E31D0D4; +mul.f32 f1362, f1236, 0fBF7C1C5C; +sub.f32 f1245, f1361, f1362; +mul.f32 f1246, f1236, 0f3E31D0D4; +fma.rn.f32 f1247, f1230, 0fBF7C1C5C, f1246; +mul.f32 f1359, f1215, 0f3E31D0D4; +mul.f32 f1360, f1221, 0fBF7C1C5C; +sub.f32 f1250, f1359, f1360; +mul.f32 f1251, f1221, 0f3E31D0D4; +fma.rn.f32 f1252, f1215, 0fBF7C1C5C, f1251; +mul.f32 f1357, f1231, 0fBF708FB2; +mul.f32 f1358, f1237, 0fBEAF1D44; +sub.f32 f1255, f1357, f1358; +mul.f32 f1256, f1237, 
0fBF708FB2; +fma.rn.f32 f1257, f1231, 0fBEAF1D44, f1256; +add.f32 f1258, f1207, f1223; +mul.f32 f1260, f1258, 0f3F000000; +sub.f32 f1261, f1191, f1260; +add.f32 f1356, f1367, f1365; +sub.f32 f1262, f1367, f1365; +mul.f32 f1263, f1262, 0f3F5DB3D7; +mul.f32 f1264, f1356, 0f3F000000; +sub.f32 f1265, f1369, f1264; +sub.f32 f1266, f1207, f1223; +mul.f32 f1267, f1266, 0f3F5DB3D7; +add.f32 f1268, f1240, f1245; +mul.f32 f1270, f1268, 0f3F000000; +sub.f32 f1271, f1198, f1270; +add.f32 f1355, f1242, f1247; +sub.f32 f1272, f1242, f1247; +mul.f32 f1273, f1272, 0f3F5DB3D7; +mul.f32 f1274, f1355, 0f3F000000; +sub.f32 f1275, f1204, f1274; +sub.f32 f1276, f1240, f1245; +mul.f32 f1277, f1276, 0f3F5DB3D7; +add.f32 f1278, f1250, f1255; +mul.f32 f1280, f1278, 0f3F000000; +sub.f32 f1281, f1199, f1280; +add.f32 f1354, f1252, f1257; +sub.f32 f1282, f1252, f1257; +mul.f32 f1283, f1282, 0f3F5DB3D7; +mul.f32 f1284, f1354, 0f3F000000; +sub.f32 f1285, f1205, f1284; +sub.f32 f1286, f1250, f1255; +mul.f32 f1583, f1356, 0f3F000000; +sub.f32 f1582, f1369, f1583; +mul.f32 f1287, f1286, 0f3F5DB3D7; +add.f32 %0, f995, f1062; +mul.f32 f1585, f1072, 0f3F000000; +sub.f32 f1584, f1002, f1585; +add.f32 %1, f1397, f1387; +mul.f32 f1587, f1385, 0f3F000000; +sub.f32 f1586, f1009, f1587; +mul.f32 f1589, f1386, 0f3F000000; +sub.f32 f1588, f1008, f1589; +add.f32 %2, f1093, f1160; +add.f32 %3, f1383, f1373; +add.f32 %4, f1191, f1258; +add.f32 %5, f1369, f1356; +add.f32 %7, f1008, f1386; +add.f32 %6, f1002, f1072; +add.f32 %9, f1106, f1372; +add.f32 %8, f1100, f1170; +add.f32 %11, f1204, f1355; +add.f32 %10, f1198, f1268; +add.f32 %13, f1009, f1385; +add.f32 %12, f1003, f1082; +add.f32 %15, f1107, f1371; +add.f32 %14, f1101, f1180; +add.f32 %17, f1205, f1354; +add.f32 %16, f1199, f1278; +sub.f32 %19, f1069, f1071; +add.f32 %18, f1067, f1065; +sub.f32 %21, f1167, f1169; +add.f32 %20, f1165, f1163; +add.f32 %22, f1263, f1261; +sub.f32 %23, f1582, f1267; +sub.f32 %25, f1588, f1081; +add.f32 %24, f1077, f1584; 
+sub.f32 %27, f1177, f1179; +add.f32 %26, f1175, f1173; +add.f32 %28, f1273, f1271; +sub.f32 %29, f1275, f1277; +add.f32 %30, f1087, f1085; +sub.f32 %31, f1586, f1091; +add.f32 %32, f1185, f1183; +sub.f32 %33, f1187, f1189; +add.f32 %34, f1283, f1281; +sub.f32 %35, f1285, f1287; +sub.f32 %36, f1065, f1067; +add.f32 %37, f1071, f1069; +sub.f32 %38, f1163, f1165; +add.f32 %39, f1169, f1167; +sub.f32 %40, f1261, f1263; +add.f32 %41, f1267, f1582; +add.f32 %43, f1081, f1588; +sub.f32 %42, f1584, f1077; +add.f32 %45, f1179, f1177; +sub.f32 %44, f1173, f1175; +add.f32 %47, f1277, f1275; +sub.f32 %46, f1271, f1273; +add.f32 %49, f1091, f1586; +sub.f32 %48, f1085, f1087; +add.f32 %51, f1189, f1187; +sub.f32 %50, f1183, f1185; +add.f32 %53, f1287, f1285; +sub.f32 %52, f1281, f1283; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), 
"f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[20].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<134, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<547>; +.reg .b32 r<22>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1944, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; 
+mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, 
f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 1944, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, 
f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r13, r11, 72, r12; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r13], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r13+8], {f217, f216}; +fma.rn.f32 f218, f162, f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r13+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r13+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r13+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r13+40], {f224, f225}; +fma.rn.f32 f226, f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r13+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r13+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; +st.shared.v2.f32 [r13+64], {f231, f230}; +barrier.sync 0; +shl.b32 r14, r11, 6; +sub.s32 r15, r13, r14; +ld.shared.v2.f32 {f232, f233}, [r15]; +ld.shared.v2.f32 {f236, f237}, [r15+216]; +ld.shared.v2.f32 {f240, f241}, [r15+432]; +ld.shared.v2.f32 {f244, f245}, [r15+648]; +ld.shared.v2.f32 {f248, f249}, [r15+864]; +ld.shared.v2.f32 {f252, f253}, [r15+1080]; +ld.shared.v2.f32 {f256, f257}, [r15+1296]; +ld.shared.v2.f32 {f260, f261}, [r15+1512]; +ld.shared.v2.f32 {f264, f265}, [r15+1728]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 
0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0f3F5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0f3F5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0f3F5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0f3F5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0f3F5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0f3F5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0fBF248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0fBF248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0fBF7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0fBF7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0fBF7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0fBF7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0fBEAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0fBEAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; 
+sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0f3F5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0f3F5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f382, f351; +mul.f32 f387, f383, f353; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f391, f367; +mul.f32 f395, f393, f369; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f399, f342; +mul.f32 f403, f401, f348; +mul.f32 f404, f399, 
f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f415, f374; +mul.f32 f419, f417, f380; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f423, f343; +mul.f32 f427, f425, f349; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f431, f359; +mul.f32 f435, f433, f365; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f439, f375; +mul.f32 f443, f441, f381; +mul.f32 f444, f439, f381; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 648, r20; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r21], {f446, f445}; +fma.rn.f32 f447, f383, f351, f388; +sub.f32 f448, f386, f387; +st.shared.v2.f32 [r21+72], {f448, f447}; +fma.rn.f32 f449, f393, f367, f396; +sub.f32 f450, f394, f395; +st.shared.v2.f32 [r21+144], {f450, f449}; +fma.rn.f32 f451, f401, f342, f404; +sub.f32 f452, f402, f403; +st.shared.v2.f32 [r21+216], {f452, f451}; +fma.rn.f32 f453, f409, f358, f412; +sub.f32 f454, f410, f411; +st.shared.v2.f32 [r21+288], {f454, f453}; +fma.rn.f32 f455, f417, f374, f420; +sub.f32 f456, f418, f419; +st.shared.v2.f32 [r21+360], {f456, f455}; +fma.rn.f32 f457, f425, f343, f428; +sub.f32 f458, f426, f427; +st.shared.v2.f32 [r21+432], {f458, f457}; +sub.f32 f459, f434, f435; +fma.rn.f32 f460, 
f433, f359, f436; +st.shared.v2.f32 [r21+504], {f459, f460}; +fma.rn.f32 f461, f441, f375, f444; +sub.f32 f462, f442, f443; +st.shared.v2.f32 [r21+576], {f462, f461}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r15]; +ld.shared.v2.f32 {f467, f468}, [r15+216]; +ld.shared.v2.f32 {f471, f472}, [r15+432]; +ld.shared.v2.f32 {f475, f476}, [r15+648]; +ld.shared.v2.f32 {f479, f480}, [r15+864]; +ld.shared.v2.f32 {f483, f484}, [r15+1080]; +ld.shared.v2.f32 {f487, f488}, [r15+1296]; +ld.shared.v2.f32 {f491, f492}, [r15+1512]; +ld.shared.v2.f32 {f495, f496}, [r15+1728]; +add.f32 f499, f475, f487; +add.f32 f500, f476, f488; +mul.f32 f501, f499, 0f3F000000; +sub.f32 f502, f463, f501; +sub.f32 f503, f476, f488; +mul.f32 f504, f503, 0f3F5DB3D7; +mul.f32 f505, f500, 0f3F000000; +sub.f32 f506, f464, f505; +sub.f32 f507, f475, f487; +mul.f32 f508, f507, 0f3F5DB3D7; +add.f32 f509, f479, f491; +add.f32 f510, f480, f492; +mul.f32 f511, f509, 0f3F000000; +sub.f32 f512, f467, f511; +sub.f32 f513, f480, f492; +mul.f32 f514, f513, 0f3F5DB3D7; +mul.f32 f515, f510, 0f3F000000; +sub.f32 f516, f468, f515; +sub.f32 f517, f479, f491; +mul.f32 f518, f517, 0f3F5DB3D7; +add.f32 f519, f483, f495; +add.f32 f520, f484, f496; +mul.f32 f521, f519, 0f3F000000; +sub.f32 f522, f471, f521; +sub.f32 f523, f484, f496; +mul.f32 f524, f523, 0f3F5DB3D7; +mul.f32 f525, f520, 0f3F000000; +sub.f32 f526, f472, f525; +sub.f32 f527, f483, f495; +mul.f32 f528, f527, 0f3F5DB3D7; +add.f32 %1, f464, f500; +add.f32 %0, f463, f499; +add.f32 %3, f468, f510; +add.f32 %2, f467, f509; +add.f32 %5, f472, f520; +add.f32 %4, f471, f519; +sub.f32 %7, f506, f508; +add.f32 %6, f504, f502; +sub.f32 %9, f516, f518; +add.f32 %8, f514, f512; +sub.f32 %11, f526, f528; +add.f32 %10, f524, f522; +add.f32 %13, f508, f506; +sub.f32 %12, f502, f504; +add.f32 %15, f518, f516; +sub.f32 %14, f512, f514; +add.f32 %17, f528, f526; +sub.f32 %16, f522, f524; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), 
"=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<135, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<511>; +.reg .b32 r<22>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 972, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; 
+sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; 
+mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; +mul.f32 f208, f206, f120; +sub.f32 f209, 
f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r12, r9, 972, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 36, r12; +st.shared.f32 [r13], f106; +st.shared.f32 [r13+4], f159; +st.shared.f32 [r13+8], f169; +st.shared.f32 [r13+12], f179; +st.shared.f32 [r13+16], f189; +st.shared.f32 [r13+20], f199; +st.shared.f32 [r13+24], f209; +st.shared.f32 [r13+28], f219; +st.shared.f32 [r13+32], f229; +barrier.sync 0; +shl.b32 r14, r11, 5; +sub.s32 r15, r13, r14; +ld.shared.f32 f232, [r15]; +ld.shared.f32 f233, [r15+108]; +ld.shared.f32 f234, [r15+216]; +ld.shared.f32 f235, [r15+324]; +ld.shared.f32 f236, [r15+432]; +ld.shared.f32 f237, [r15+540]; +ld.shared.f32 f238, [r15+648]; +ld.shared.f32 f239, [r15+756]; +ld.shared.f32 f240, [r15+864]; +barrier.sync 0; +st.shared.f32 [r13], f108; +st.shared.f32 [r13+4], f161; +st.shared.f32 [r13+8], f171; +st.shared.f32 [r13+12], f181; +st.shared.f32 [r13+16], f191; +st.shared.f32 [r13+20], f201; +st.shared.f32 [r13+24], f211; +st.shared.f32 [r13+28], f221; +st.shared.f32 [r13+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r15]; +ld.shared.f32 f242, [r15+108]; +ld.shared.f32 f243, [r15+216]; +ld.shared.f32 f244, [r15+324]; +ld.shared.f32 f245, [r15+432]; +ld.shared.f32 f246, [r15+540]; +ld.shared.f32 f247, [r15+648]; +ld.shared.f32 f248, [r15+756]; +ld.shared.f32 f249, [r15+864]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; 
+add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0f3F5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0f3F5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0f3F5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0f3F5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0f3F5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0fBF248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0fBF248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0fBF7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0fBF7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0fBF7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0fBF7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0fBEAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0fBEAF1D44, f316; +add.f32 
f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0f3F5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0f3F5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f366, f335; +mul.f32 f371, f367, f337; +sub.f32 f372, f370, f371; +mul.f32 f373, f366, f337; +fma.rn.f32 f374, f367, f335, f373; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f377, f351; +mul.f32 f381, f379, f353; +sub.f32 f382, f380, f381; +mul.f32 f383, f377, f353; 
+fma.rn.f32 f384, f379, f351, f383; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f387, f326; +mul.f32 f391, f389, f332; +sub.f32 f392, f390, f391; +mul.f32 f393, f387, f332; +fma.rn.f32 f394, f389, f326, f393; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f397, f342; +mul.f32 f401, f399, f348; +sub.f32 f402, f400, f401; +mul.f32 f403, f397, f348; +fma.rn.f32 f404, f399, f342, f403; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +sub.f32 f412, f410, f411; +mul.f32 f413, f407, f364; +fma.rn.f32 f414, f409, f358, f413; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f417, f327; +mul.f32 f421, f419, f333; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f333; +fma.rn.f32 f424, f419, f327, f423; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f427, f343; +mul.f32 f431, f429, f349; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f349; +fma.rn.f32 f434, f429, f343, f433; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f437, f359; +mul.f32 f441, f439, f365; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f365; +fma.rn.f32 f444, f439, f359, f443; +shl.b32 r19, r18, 2; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 324, r20; +st.shared.f32 [r21], f319; +st.shared.f32 [r21+36], f372; +st.shared.f32 [r21+72], f382; +st.shared.f32 [r21+108], f392; +st.shared.f32 
[r21+144], f402; +st.shared.f32 [r21+180], f412; +st.shared.f32 [r21+216], f422; +st.shared.f32 [r21+252], f432; +st.shared.f32 [r21+288], f442; +barrier.sync 0; +ld.shared.f32 f445, [r15]; +ld.shared.f32 f446, [r15+108]; +ld.shared.f32 f447, [r15+216]; +ld.shared.f32 f448, [r15+324]; +ld.shared.f32 f449, [r15+432]; +ld.shared.f32 f450, [r15+540]; +ld.shared.f32 f451, [r15+648]; +ld.shared.f32 f452, [r15+756]; +ld.shared.f32 f453, [r15+864]; +barrier.sync 0; +st.shared.f32 [r21], f321; +st.shared.f32 [r21+36], f374; +st.shared.f32 [r21+72], f384; +st.shared.f32 [r21+108], f394; +st.shared.f32 [r21+144], f404; +st.shared.f32 [r21+180], f414; +st.shared.f32 [r21+216], f424; +st.shared.f32 [r21+252], f434; +st.shared.f32 [r21+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r15]; +ld.shared.f32 f455, [r15+108]; +ld.shared.f32 f456, [r15+216]; +ld.shared.f32 f457, [r15+324]; +ld.shared.f32 f458, [r15+432]; +ld.shared.f32 f459, [r15+540]; +ld.shared.f32 f460, [r15+648]; +ld.shared.f32 f461, [r15+756]; +ld.shared.f32 f462, [r15+864]; +add.f32 f463, f448, f451; +add.f32 f464, f457, f460; +mul.f32 f465, f463, 0f3F000000; +sub.f32 f466, f445, f465; +sub.f32 f467, f457, f460; +mul.f32 f468, f467, 0f3F5DB3D7; +mul.f32 f469, f464, 0f3F000000; +sub.f32 f470, f454, f469; +sub.f32 f471, f448, f451; +mul.f32 f472, f471, 0f3F5DB3D7; +add.f32 f473, f449, f452; +add.f32 f474, f458, f461; +mul.f32 f475, f473, 0f3F000000; +sub.f32 f476, f446, f475; +sub.f32 f477, f458, f461; +mul.f32 f478, f477, 0f3F5DB3D7; +mul.f32 f479, f474, 0f3F000000; +sub.f32 f480, f455, f479; +sub.f32 f481, f449, f452; +mul.f32 f482, f481, 0f3F5DB3D7; +add.f32 f483, f450, f453; +add.f32 f484, f459, f462; +mul.f32 f485, f483, 0f3F000000; +sub.f32 f486, f447, f485; +sub.f32 f487, f459, f462; +mul.f32 f488, f487, 0f3F5DB3D7; +mul.f32 f489, f484, 0f3F000000; +sub.f32 f490, f456, f489; +sub.f32 f491, f450, f453; +mul.f32 f492, f491, 0f3F5DB3D7; +add.f32 %0, f445, f463; +add.f32 %1, f454, f464; +add.f32 %2, f446, 
f473; +add.f32 %3, f455, f474; +add.f32 %4, f447, f483; +add.f32 %5, f456, f484; +add.f32 %6, f468, f466; +sub.f32 %7, f470, f472; +add.f32 %8, f478, f476; +sub.f32 %9, f480, f482; +add.f32 %10, f488, f486; +sub.f32 %11, f490, f492; +sub.f32 %12, f466, f468; +add.f32 %13, f472, f470; +sub.f32 %14, f476, f478; +add.f32 %15, f482, f480; +sub.f32 %16, f486, f488; +add.f32 %17, f492, f490; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<136, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<217>; +.reg .b32 r<34>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1944, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %13, %16; +add.f32 f14, %15, %17; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %11, f15; +sub.f32 f17, %15, %17; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %12, f21; +sub.f32 f23, %13, %16; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; 
+add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %12, f14; +add.f32 f43, %11, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r9+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r9+16], {f46, f47}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+648]; +ld.shared.v2.f32 {f56, f57}, [r11+1296]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0f3F5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f74, f66; +mul.f32 f79, f75, f72; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f83, f67; +mul.f32 f87, f85, f73; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f75, f66, f80; +sub.f32 f92, f78, f79; +st.shared.v2.f32 [r17+24], {f92, f91}; +fma.rn.f32 f93, 
f85, f67, f88; +sub.f32 f94, f86, f87; +st.shared.v2.f32 [r17+48], {f94, f93}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+648]; +ld.shared.v2.f32 {f103, f104}, [r11+1296]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f121, f113; +mul.f32 f126, f122, f119; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f130, f114; +mul.f32 f134, f132, f120; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f122, f113, f127; +sub.f32 f139, f125, f126; +st.shared.v2.f32 [r23+72], {f139, f138}; +fma.rn.f32 f140, f132, f114, f135; +sub.f32 f141, f133, f134; +st.shared.v2.f32 [r23+144], {f141, f140}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+648]; +ld.shared.v2.f32 {f150, f151}, [r11+1296]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; 
+sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f168, f160; +mul.f32 f173, f169, f166; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f177, f161; +mul.f32 f181, f179, f167; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f169, f160, f174; +sub.f32 f186, f172, f173; +st.shared.v2.f32 [r33+216], {f186, f185}; +fma.rn.f32 f187, f179, f161, f182; +sub.f32 f188, f180, f181; +st.shared.v2.f32 [r33+432], {f188, f187}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 {f193, f194}, [r11+648]; +ld.shared.v2.f32 {f197, f198}, [r11+1296]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0f3F5DB3D7; +mul.f32 f207, f202, 0f3F000000; +sub.f32 f208, f190, f207; +sub.f32 f209, f193, f197; +mul.f32 f210, f209, 0f3F5DB3D7; +add.f32 %1, f190, f202; +add.f32 %0, f189, f201; +sub.f32 %3, f208, f210; +add.f32 %2, f206, f204; +add.f32 %5, f210, f208; +sub.f32 %4, f204, f206; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), 
"f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<137, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<193>; +.reg .b32 r<34>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 972, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %13, %16; +add.f32 f14, %11, f13; +add.f32 f15, %15, %17; +add.f32 f16, %12, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %11, f17; +sub.f32 f19, %15, %17; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %12, f23; +sub.f32 f25, %13, %16; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 972, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f35; +st.shared.f32 [r9+8], f45; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+324]; +ld.shared.f32 f50, [r11+648]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+324]; +ld.shared.f32 f53, [r11+648]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, 
f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0f3F5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0f3F5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f70, f62; +mul.f32 f75, f71, f68; +sub.f32 f76, f74, f75; +mul.f32 f77, f70, f68; +fma.rn.f32 f78, f71, f62, f77; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f81, f63; +mul.f32 f85, f83, f69; +sub.f32 f86, f84, f85; +mul.f32 f87, f81, f69; +fma.rn.f32 f88, f83, f63, f87; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f76; +st.shared.f32 [r17+24], f86; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+324]; +ld.shared.f32 f91, [r11+648]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+324]; +ld.shared.f32 f94, [r11+648]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0f3F5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0f3F5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 
r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f111, f103; +mul.f32 f116, f112, f109; +sub.f32 f117, f115, f116; +mul.f32 f118, f111, f109; +fma.rn.f32 f119, f112, f103, f118; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f122, f104; +mul.f32 f126, f124, f110; +sub.f32 f127, f125, f126; +mul.f32 f128, f122, f110; +fma.rn.f32 f129, f124, f104, f128; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f117; +st.shared.f32 [r23+72], f127; +barrier.sync 0; +ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+324]; +ld.shared.f32 f132, [r11+648]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+324]; +ld.shared.f32 f135, [r11+648]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0f3F5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0f3F5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f152, f144; +mul.f32 f157, f153, f150; +sub.f32 f158, f156, f157; +mul.f32 
f159, f152, f150; +fma.rn.f32 f160, f153, f144, f159; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, f153, f152, f164; +mul.f32 f166, f163, f145; +mul.f32 f167, f165, f151; +sub.f32 f168, f166, f167; +mul.f32 f169, f163, f151; +fma.rn.f32 f170, f165, f145, f169; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f158; +st.shared.f32 [r33+216], f168; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+324]; +ld.shared.f32 f173, [r11+648]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; +ld.shared.f32 f175, [r11+324]; +ld.shared.f32 f176, [r11+648]; +add.f32 f177, f172, f173; +add.f32 f178, f175, f176; +mul.f32 f179, f177, 0f3F000000; +sub.f32 f180, f171, f179; +sub.f32 f181, f175, f176; +mul.f32 f182, f181, 0f3F5DB3D7; +mul.f32 f183, f178, 0f3F000000; +sub.f32 f184, f174, f183; +sub.f32 f185, f172, f173; +mul.f32 f186, f185, 0f3F5DB3D7; +add.f32 %0, f171, f177; +add.f32 %1, f174, f178; +add.f32 %2, f182, f180; +sub.f32 %3, f184, f186; +sub.f32 %4, f180, f182; +add.f32 %5, f186, f184; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..b3a2835b1f5bb --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp32_inv.hpp.inc @@ -0,0 +1,4176 @@ +#ifndef CUFFTDX_FFT_243_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_243_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<334, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1626>; +.reg .b32 r<14>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1944, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1625, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1624, %57, f1625; +mul.f32 f119, f1625, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1623, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1622, %63, f1623; +mul.f32 f135, f1623, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1621, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1620, %69, f1621; +mul.f32 f151, f1621, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1619, f133, 0f3F441B7D; +sub.f32 f159, f1619, f158; +mul.f32 f160, f139, 0f3F441B7D; 
+fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1617, f149, 0f3E31D0D4; +mul.f32 f1618, f155, 0f3F7C1C5C; +sub.f32 f164, f1617, f1618; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f1615, f134, 0f3E31D0D4; +mul.f32 f1616, f140, 0f3F7C1C5C; +sub.f32 f169, f1615, f1616; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1613, f150, 0fBF708FB2; +mul.f32 f1614, f156, 0f3EAF1D44; +sub.f32 f174, f1613, f1614; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1612, f1622, f1620; +sub.f32 f183, f1622, f1620; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1611, f1624, f1612; +mul.f32 f187, f1612, 0f3F000000; +sub.f32 f188, f1624, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1610, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1609, f123, f1610; +mul.f32 f203, f1610, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1608, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1607, f124, f1608; +mul.f32 f219, f1608, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; 
+mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1604, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1602, %112, f1604; +mul.f32 f235, f1604, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1599, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1597, %115, f1599; +mul.f32 f251, f1599, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1594, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1592, %118, f1594; +mul.f32 f267, f1594, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1591, f249, 0f3F441B7D; +sub.f32 f275, f1591, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1590, f265, 0f3E31D0D4; +sub.f32 f280, f1590, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1588, f250, 0f3E31D0D4; +mul.f32 f1589, f256, 0f3F7C1C5C; +sub.f32 f285, f1588, f1589; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1586, f266, 0fBF708FB2; +mul.f32 f1587, f272, 0f3EAF1D44; +sub.f32 f290, f1586, f1587; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; 
+add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1585, f1597, f1592; +sub.f32 f299, f1597, f1592; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1584, f1602, f1585; +mul.f32 f303, f1585, 0f3F000000; +sub.f32 f304, f1602, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1583, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1582, f239, f1583; +mul.f32 f319, f1583, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1581, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1580, f240, f1581; +mul.f32 f335, f1581, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1577, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1575, %121, f1577; +mul.f32 f351, f1577, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1572, %123, %122; +sub.f32 f363, %123, 
%122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1570, %124, f1572; +mul.f32 f367, f1572, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1568, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1566, %126, f1568; +mul.f32 f383, f1568, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1565, f365, 0f3F441B7D; +sub.f32 f391, f1565, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1564, f381, 0f3E31D0D4; +sub.f32 f396, f1564, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1562, f366, 0f3E31D0D4; +mul.f32 f1563, f372, 0f3F7C1C5C; +sub.f32 f401, f1562, f1563; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1560, f382, 0fBF708FB2; +mul.f32 f1561, f388, 0f3EAF1D44; +sub.f32 f406, f1560, f1561; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1559, f1570, f1566; +sub.f32 f415, f1570, f1566; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1558, f1575, f1559; +mul.f32 f419, f1559, 0f3F000000; +sub.f32 f420, f1575, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 
f430, f349, f429; +add.f32 f1557, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1556, f355, f1557; +mul.f32 f435, f1557, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1555, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1554, f356, f1555; +mul.f32 f451, f1555, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1582, 0f3E6C2691; +mul.f32 f1553, f310, 0f3F791978; +sub.f32 f459, f1553, f458; +mul.f32 f460, f1582, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1551, f426, 0f3F64C51C; +mul.f32 f1552, f1556, 0f3EE5C902; +sub.f32 f464, f1551, f1552; +mul.f32 f465, f1556, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f1549, f326, 0f3F64C51C; +mul.f32 f1550, f1580, 0f3EE5C902; +sub.f32 f469, f1549, f1550; +mul.f32 f470, f1580, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1547, f442, 0f3F18DF63; +mul.f32 f1548, f1554, 0f3F4D57F2; +sub.f32 f474, f1547, f1548; +mul.f32 f475, f1554, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1545, f301, 0f3F441B7D; +mul.f32 f1546, f307, 0f3F248DBB; +sub.f32 f479, f1545, f1546; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1544, f417, 0f3E31D0D4; +sub.f32 f484, f1544, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1543, f317, 0f3F18DF63; +sub.f32 f489, f1543, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 
f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1542, f433, 0fBE92D7E0; +sub.f32 f494, f1542, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1541, f333, 0f3ECACAF8; +sub.f32 f499, f1541, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f1540, f449, 0fBF2FAD88; +sub.f32 f504, f1540, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1539, f302, 0f3E31D0D4; +sub.f32 f509, f1539, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1537, f418, 0fBF708FB2; +mul.f32 f1538, f424, 0f3EAF1D44; +sub.f32 f514, f1537, f1538; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1535, f318, 0fBD6E2946; +mul.f32 f1536, f324, 0f3F7F9120; +sub.f32 f519, f1535, f1536; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1533, f434, 0fBF7E44DE; +mul.f32 f1534, f440, 0fBDEDC21F; +sub.f32 f524, f1533, f1534; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1532, f334, 0fBE92D7E0; +sub.f32 f529, f1532, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1531, f450, 0fBF55E287; +sub.f32 f534, f1531, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1530, f1584, f1558; +sub.f32 f541, f1584, f1558; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1530, 0f3F000000; +sub.f32 f546, f1611, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, 
f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f1529, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1528, f1609, f1529; +mul.f32 f561, f1529, 0f3F000000; +sub.f32 f562, f1609, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1527, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1526, f1607, f1527; +mul.f32 f577, f1527, 0f3F000000; +sub.f32 f578, f1607, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1525, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1524, f191, f1525; +mul.f32 f593, f1525, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1523, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1522, f207, f1523; +mul.f32 f609, f1523, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1521, f501, f506; +sub.f32 f621, f501, f506; 
+mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f1520, f223, f1521; +mul.f32 f625, f1521, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f1519, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1518, f192, f1519; +mul.f32 f641, f1519, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1517, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1516, f208, f1517; +mul.f32 f657, f1517, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1515, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1514, f224, f1515; +mul.f32 f673, f1515, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f1528, f680; +mul.f32 f685, 
f679, f1528; +mul.f32 f1512, f679, f679; +mul.f32 f1513, f680, f680; +sub.f32 f688, f1512, f1513; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f1526, f690; +mul.f32 f693, f688, f1526; +mul.f32 f695, f680, f690; +mul.f32 f1511, f679, f688; +sub.f32 f696, f1511, f695; +mul.f32 f1510, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f699, f1524, f698; +mul.f32 f701, f696, f1524; +mul.f32 f1508, f679, f696; +mul.f32 f1509, f680, f698; +sub.f32 f704, f1508, f1509; +mul.f32 f1507, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f707, f1522, f706; +mul.f32 f709, f704, f1522; +mul.f32 f711, f680, f706; +mul.f32 f1506, f679, f704; +sub.f32 f712, f1506, f711; +mul.f32 f1505, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f1520, f714; +mul.f32 f717, f712, f1520; +mul.f32 f719, f680, f714; +mul.f32 f1504, f679, f712; +sub.f32 f720, f1504, f719; +mul.f32 f1503, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f1518, f722; +mul.f32 f725, f720, f1518; +mul.f32 f1501, f679, f720; +mul.f32 f1502, f680, f722; +sub.f32 f728, f1501, f1502; +mul.f32 f1500, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f1516, f730; +mul.f32 f733, f728, f1516; +mul.f32 f735, f680, f730; +mul.f32 f1499, f679, f728; +sub.f32 f736, f1499, f735; +mul.f32 f1498, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f1514, f738; +mul.f32 f741, f736, f1514; +mul.f32 f743, f680, f738; +mul.f32 f1497, f679, f736; +sub.f32 f744, f1497, f743; +mul.f32 f1496, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f1494, f679, f744; +mul.f32 f1495, f680, f746; +sub.f32 f752, f1494, f1495; +mul.f32 f1493, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, 
f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f1492, f679, f752; +sub.f32 f760, f1492, f759; +mul.f32 f1491, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; +mul.f32 f765, f760, f581; +mul.f32 f1489, f679, f760; +mul.f32 f1490, f680, f762; +sub.f32 f768, f1489, f1490; +mul.f32 f1488, f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f1487, f679, f768; +sub.f32 f776, f1487, f775; +mul.f32 f1486, f591, f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1485, f679, f776; +sub.f32 f784, f1485, f783; +mul.f32 f1484, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f1482, f679, f784; +mul.f32 f1483, f680, f786; +sub.f32 f792, f1482, f1483; +mul.f32 f1481, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f1480, f679, f792; +sub.f32 f800, f1480, f799; +mul.f32 f1479, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f1478, f679, f800; +sub.f32 f808, f1478, f807; +mul.f32 f1477, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f1475, f679, f808; +mul.f32 f1476, f680, f810; +sub.f32 f816, f1475, f1476; +mul.f32 f1474, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f1473, f679, f816; +sub.f32 f824, f1473, f823; +mul.f32 f1472, f544, 
f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f1470, f679, f824; +mul.f32 f1471, f680, f826; +sub.f32 f832, f1470, f1471; +mul.f32 f1469, f560, f826; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f1468, f679, f832; +sub.f32 f840, f1468, f839; +mul.f32 f1467, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1466, f679, f840; +sub.f32 f848, f1466, f847; +mul.f32 f1465, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f1463, f679, f848; +mul.f32 f1464, f680, f850; +sub.f32 f856, f1463, f1464; +mul.f32 f1462, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f1461, f679, f856; +sub.f32 f864, f1461, f863; +mul.f32 f1460, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1459, f679, f864; +sub.f32 f872, f1459, f871; +mul.f32 f1458, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, f874; +mul.f32 f877, f872, f662; +mul.f32 f1456, f679, f872; +mul.f32 f1457, f680, f874; +sub.f32 f880, f1456, f1457; +mul.f32 f1455, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f1454, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +add.f32 f886, f1611, f1530; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r9], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f1454; 
+st.shared.v2.f32 [r9+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f1510; +st.shared.v2.f32 [r9+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f1507; +st.shared.v2.f32 [r9+24], {f892, f893}; +fma.rn.f32 f894, f704, f600, f707; +sub.f32 f895, f709, f1505; +st.shared.v2.f32 [r9+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 f897, f717, f1503; +st.shared.v2.f32 [r9+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f1500; +st.shared.v2.f32 [r9+48], {f898, f899}; +sub.f32 f900, f733, f1498; +fma.rn.f32 f901, f728, f648, f731; +st.shared.v2.f32 [r9+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f1496; +st.shared.v2.f32 [r9+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f1493; +st.shared.v2.f32 [r9+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f1491; +st.shared.v2.f32 [r9+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f1488; +st.shared.v2.f32 [r9+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f1486; +st.shared.v2.f32 [r9+96], {f910, f911}; +fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f1484; +st.shared.v2.f32 [r9+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f1481; +st.shared.v2.f32 [r9+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, f797, f1479; +st.shared.v2.f32 [r9+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f1477; +st.shared.v2.f32 [r9+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f1474; +st.shared.v2.f32 [r9+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f1472; +st.shared.v2.f32 [r9+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f1469; +st.shared.v2.f32 [r9+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, 
f1467; +st.shared.v2.f32 [r9+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f1465; +st.shared.v2.f32 [r9+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f1462; +st.shared.v2.f32 [r9+176], {f930, f931}; +fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f1460; +st.shared.v2.f32 [r9+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; +sub.f32 f935, f869, f1458; +st.shared.v2.f32 [r9+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f1455; +st.shared.v2.f32 [r9+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; +sub.f32 f939, f885, f884; +st.shared.v2.f32 [r9+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+72]; +ld.shared.v2.f32 {f948, f949}, [r10+144]; +ld.shared.v2.f32 {f952, f953}, [r10+216]; +ld.shared.v2.f32 {f956, f957}, [r10+288]; +ld.shared.v2.f32 {f960, f961}, [r10+360]; +ld.shared.v2.f32 {f964, f965}, [r10+432]; +ld.shared.v2.f32 {f968, f969}, [r10+504]; +ld.shared.v2.f32 {f972, f973}, [r10+576]; +ld.shared.v2.f32 {f976, f977}, [r10+648]; +ld.shared.v2.f32 {f980, f981}, [r10+720]; +ld.shared.v2.f32 {f984, f985}, [r10+792]; +ld.shared.v2.f32 {f988, f989}, [r10+864]; +ld.shared.v2.f32 {f992, f993}, [r10+936]; +ld.shared.v2.f32 {f996, f997}, [r10+1008]; +ld.shared.v2.f32 {f1000, f1001}, [r10+1080]; +ld.shared.v2.f32 {f1004, f1005}, [r10+1152]; +ld.shared.v2.f32 {f1008, f1009}, [r10+1224]; +ld.shared.v2.f32 {f1012, f1013}, [r10+1296]; +ld.shared.v2.f32 {f1016, f1017}, [r10+1368]; +ld.shared.v2.f32 {f1020, f1021}, [r10+1440]; +ld.shared.v2.f32 {f1024, f1025}, [r10+1512]; +ld.shared.v2.f32 {f1028, f1029}, [r10+1584]; +ld.shared.v2.f32 {f1032, f1033}, [r10+1656]; +ld.shared.v2.f32 {f1036, f1037}, [r10+1728]; +ld.shared.v2.f32 {f1040, f1041}, [r10+1800]; +ld.shared.v2.f32 {f1044, f1045}, [r10+1872]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, 
f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f1453, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0fBF5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f1452, f941, f1453; +mul.f32 f1058, f1453, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0fBF5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f1451, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f1450, f953, f1451; +mul.f32 f1074, f1451, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0fBF5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f1449, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f1448, f965, f1449; +mul.f32 f1090, f1449, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0fBF5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f1446, f1072, 0f3F441B7D; +mul.f32 f1447, f1078, 0f3F248DBB; +sub.f32 f1098, f1446, f1447; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099; +mul.f32 f1444, f1088, 0f3E31D0D4; +mul.f32 f1445, f1094, 0f3F7C1C5C; +sub.f32 f1103, f1444, f1445; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104; +mul.f32 f1442, f1073, 0f3E31D0D4; +mul.f32 f1443, f1079, 0f3F7C1C5C; +sub.f32 f1108, f1442, f1443; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109; +mul.f32 f1112, f1095, 0f3EAF1D44; +mul.f32 f1441, 
f1089, 0fBF708FB2; +sub.f32 f1113, f1441, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114; +add.f32 f1116, f1065, f1081; +mul.f32 f1118, f1116, 0f3F000000; +sub.f32 f1119, f1049, f1118; +add.f32 f1440, f1450, f1448; +sub.f32 f1120, f1450, f1448; +mul.f32 f1121, f1120, 0fBF5DB3D7; +mul.f32 f1122, f1440, 0f3F000000; +sub.f32 f1123, f1452, f1122; +sub.f32 f1124, f1065, f1081; +mul.f32 f1125, f1124, 0fBF5DB3D7; +add.f32 f1126, f1098, f1103; +mul.f32 f1128, f1126, 0f3F000000; +sub.f32 f1129, f1056, f1128; +add.f32 f1439, f1100, f1105; +sub.f32 f1130, f1100, f1105; +mul.f32 f1131, f1130, 0fBF5DB3D7; +mul.f32 f1132, f1439, 0f3F000000; +sub.f32 f1133, f1062, f1132; +sub.f32 f1134, f1098, f1103; +mul.f32 f1135, f1134, 0fBF5DB3D7; +add.f32 f1136, f1108, f1113; +mul.f32 f1138, f1136, 0f3F000000; +sub.f32 f1139, f1057, f1138; +add.f32 f1438, f1110, f1115; +sub.f32 f1140, f1110, f1115; +mul.f32 f1141, f1140, 0fBF5DB3D7; +mul.f32 f1142, f1438, 0f3F000000; +sub.f32 f1143, f1063, f1142; +sub.f32 f1144, f1108, f1113; +mul.f32 f1145, f1144, 0fBF5DB3D7; +add.f32 f1146, f980, f1016; +add.f32 f1147, f944, f1146; +mul.f32 f1150, f1146, 0f3F000000; +sub.f32 f1151, f944, f1150; +add.f32 f1437, f981, f1017; +sub.f32 f1152, f981, f1017; +mul.f32 f1153, f1152, 0fBF5DB3D7; +add.f32 f1154, f1153, f1151; +sub.f32 f1155, f1151, f1153; +add.f32 f1436, f945, f1437; +mul.f32 f1156, f1437, 0f3F000000; +sub.f32 f1157, f945, f1156; +sub.f32 f1158, f980, f1016; +mul.f32 f1159, f1158, 0fBF5DB3D7; +sub.f32 f1160, f1157, f1159; +add.f32 f1161, f1159, f1157; +add.f32 f1162, f992, f1028; +add.f32 f1163, f956, f1162; +mul.f32 f1166, f1162, 0f3F000000; +sub.f32 f1167, f956, f1166; +add.f32 f1435, f993, f1029; +sub.f32 f1168, f993, f1029; +mul.f32 f1169, f1168, 0fBF5DB3D7; +add.f32 f1170, f1169, f1167; +sub.f32 f1171, f1167, f1169; +add.f32 f1434, f957, f1435; +mul.f32 f1172, f1435, 0f3F000000; +sub.f32 f1173, f957, f1172; +sub.f32 f1174, f992, f1028; +mul.f32 f1175, 
f1174, 0fBF5DB3D7; +sub.f32 f1176, f1173, f1175; +add.f32 f1177, f1175, f1173; +add.f32 f1178, f1004, f1040; +add.f32 f1179, f968, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f968, f1182; +add.f32 f1433, f1005, f1041; +sub.f32 f1184, f1005, f1041; +mul.f32 f1185, f1184, 0fBF5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f1432, f969, f1433; +mul.f32 f1188, f1433, 0f3F000000; +sub.f32 f1189, f969, f1188; +sub.f32 f1190, f1004, f1040; +mul.f32 f1191, f1190, 0fBF5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +mul.f32 f1195, f1176, 0f3F248DBB; +mul.f32 f1431, f1170, 0f3F441B7D; +sub.f32 f1196, f1431, f1195; +mul.f32 f1197, f1176, 0f3F441B7D; +fma.rn.f32 f1198, f1170, 0f3F248DBB, f1197; +mul.f32 f1200, f1192, 0f3F7C1C5C; +mul.f32 f1430, f1186, 0f3E31D0D4; +sub.f32 f1201, f1430, f1200; +mul.f32 f1202, f1192, 0f3E31D0D4; +fma.rn.f32 f1203, f1186, 0f3F7C1C5C, f1202; +mul.f32 f1205, f1177, 0f3F7C1C5C; +mul.f32 f1429, f1171, 0f3E31D0D4; +sub.f32 f1206, f1429, f1205; +mul.f32 f1207, f1177, 0f3E31D0D4; +fma.rn.f32 f1208, f1171, 0f3F7C1C5C, f1207; +mul.f32 f1210, f1193, 0f3EAF1D44; +mul.f32 f1428, f1187, 0fBF708FB2; +sub.f32 f1211, f1428, f1210; +mul.f32 f1212, f1193, 0fBF708FB2; +fma.rn.f32 f1213, f1187, 0f3EAF1D44, f1212; +add.f32 f1214, f1163, f1179; +mul.f32 f1216, f1214, 0f3F000000; +sub.f32 f1217, f1147, f1216; +add.f32 f1427, f1434, f1432; +sub.f32 f1218, f1434, f1432; +mul.f32 f1219, f1218, 0fBF5DB3D7; +mul.f32 f1220, f1427, 0f3F000000; +sub.f32 f1221, f1436, f1220; +sub.f32 f1222, f1163, f1179; +mul.f32 f1223, f1222, 0fBF5DB3D7; +add.f32 f1224, f1196, f1201; +mul.f32 f1226, f1224, 0f3F000000; +sub.f32 f1227, f1154, f1226; +add.f32 f1426, f1198, f1203; +sub.f32 f1228, f1198, f1203; +mul.f32 f1229, f1228, 0fBF5DB3D7; +mul.f32 f1230, f1426, 0f3F000000; +sub.f32 f1231, f1160, f1230; +sub.f32 f1232, f1196, f1201; +mul.f32 f1233, f1232, 0fBF5DB3D7; +add.f32 f1234, f1206, f1211; +mul.f32 f1236, f1234, 0f3F000000; 
+sub.f32 f1237, f1155, f1236; +add.f32 f1425, f1208, f1213; +sub.f32 f1238, f1208, f1213; +mul.f32 f1239, f1238, 0fBF5DB3D7; +mul.f32 f1240, f1425, 0f3F000000; +sub.f32 f1241, f1161, f1240; +sub.f32 f1242, f1206, f1211; +mul.f32 f1243, f1242, 0fBF5DB3D7; +add.f32 f1244, f984, f1020; +add.f32 f1245, f948, f1244; +mul.f32 f1248, f1244, 0f3F000000; +sub.f32 f1249, f948, f1248; +add.f32 f1424, f985, f1021; +sub.f32 f1250, f985, f1021; +mul.f32 f1251, f1250, 0fBF5DB3D7; +add.f32 f1252, f1251, f1249; +sub.f32 f1253, f1249, f1251; +add.f32 f1423, f949, f1424; +mul.f32 f1254, f1424, 0f3F000000; +sub.f32 f1255, f949, f1254; +sub.f32 f1256, f984, f1020; +mul.f32 f1257, f1256, 0fBF5DB3D7; +sub.f32 f1258, f1255, f1257; +add.f32 f1259, f1257, f1255; +add.f32 f1260, f996, f1032; +add.f32 f1261, f960, f1260; +mul.f32 f1264, f1260, 0f3F000000; +sub.f32 f1265, f960, f1264; +add.f32 f1422, f997, f1033; +sub.f32 f1266, f997, f1033; +mul.f32 f1267, f1266, 0fBF5DB3D7; +add.f32 f1268, f1267, f1265; +sub.f32 f1269, f1265, f1267; +add.f32 f1421, f961, f1422; +mul.f32 f1270, f1422, 0f3F000000; +sub.f32 f1271, f961, f1270; +sub.f32 f1272, f996, f1032; +mul.f32 f1273, f1272, 0fBF5DB3D7; +sub.f32 f1274, f1271, f1273; +add.f32 f1275, f1273, f1271; +add.f32 f1276, f1008, f1044; +add.f32 f1277, f972, f1276; +mul.f32 f1280, f1276, 0f3F000000; +sub.f32 f1281, f972, f1280; +add.f32 f1420, f1009, f1045; +sub.f32 f1282, f1009, f1045; +mul.f32 f1283, f1282, 0fBF5DB3D7; +add.f32 f1284, f1283, f1281; +sub.f32 f1285, f1281, f1283; +add.f32 f1419, f973, f1420; +mul.f32 f1286, f1420, 0f3F000000; +sub.f32 f1287, f973, f1286; +sub.f32 f1288, f1008, f1044; +mul.f32 f1289, f1288, 0fBF5DB3D7; +sub.f32 f1290, f1287, f1289; +add.f32 f1291, f1289, f1287; +mul.f32 f1293, f1274, 0f3F248DBB; +mul.f32 f1418, f1268, 0f3F441B7D; +sub.f32 f1294, f1418, f1293; +mul.f32 f1295, f1274, 0f3F441B7D; +fma.rn.f32 f1296, f1268, 0f3F248DBB, f1295; +mul.f32 f1298, f1290, 0f3F7C1C5C; +mul.f32 f1417, f1284, 0f3E31D0D4; +sub.f32 
f1299, f1417, f1298; +mul.f32 f1300, f1290, 0f3E31D0D4; +fma.rn.f32 f1301, f1284, 0f3F7C1C5C, f1300; +mul.f32 f1415, f1269, 0f3E31D0D4; +mul.f32 f1416, f1275, 0f3F7C1C5C; +sub.f32 f1304, f1415, f1416; +mul.f32 f1305, f1275, 0f3E31D0D4; +fma.rn.f32 f1306, f1269, 0f3F7C1C5C, f1305; +mul.f32 f1413, f1285, 0fBF708FB2; +mul.f32 f1414, f1291, 0f3EAF1D44; +sub.f32 f1309, f1413, f1414; +mul.f32 f1310, f1291, 0fBF708FB2; +fma.rn.f32 f1311, f1285, 0f3EAF1D44, f1310; +add.f32 f1312, f1261, f1277; +mul.f32 f1314, f1312, 0f3F000000; +sub.f32 f1315, f1245, f1314; +add.f32 f1412, f1421, f1419; +sub.f32 f1316, f1421, f1419; +mul.f32 f1317, f1316, 0fBF5DB3D7; +mul.f32 f1318, f1412, 0f3F000000; +sub.f32 f1319, f1423, f1318; +sub.f32 f1320, f1261, f1277; +mul.f32 f1321, f1320, 0fBF5DB3D7; +add.f32 f1322, f1294, f1299; +mul.f32 f1324, f1322, 0f3F000000; +sub.f32 f1325, f1252, f1324; +add.f32 f1411, f1296, f1301; +sub.f32 f1326, f1296, f1301; +mul.f32 f1327, f1326, 0fBF5DB3D7; +mul.f32 f1328, f1411, 0f3F000000; +sub.f32 f1329, f1258, f1328; +sub.f32 f1330, f1294, f1299; +mul.f32 f1331, f1330, 0fBF5DB3D7; +add.f32 f1332, f1304, f1309; +mul.f32 f1334, f1332, 0f3F000000; +sub.f32 f1335, f1253, f1334; +add.f32 f1410, f1306, f1311; +sub.f32 f1336, f1306, f1311; +mul.f32 f1337, f1336, 0fBF5DB3D7; +mul.f32 f1338, f1410, 0f3F000000; +sub.f32 f1339, f1259, f1338; +sub.f32 f1340, f1304, f1309; +mul.f32 f1341, f1340, 0fBF5DB3D7; +add.f32 %1, f1452, f1440; +add.f32 %0, f1049, f1116; +add.f32 %3, f1436, f1427; +add.f32 %2, f1147, f1214; +add.f32 %5, f1423, f1412; +add.f32 %4, f1245, f1312; +add.f32 %7, f1062, f1439; +add.f32 %6, f1056, f1126; +add.f32 %9, f1160, f1426; +add.f32 %8, f1154, f1224; +add.f32 %11, f1258, f1411; +add.f32 %10, f1252, f1322; +add.f32 %13, f1063, f1438; +add.f32 %12, f1057, f1136; +add.f32 %15, f1161, f1425; +add.f32 %14, f1155, f1234; +add.f32 %17, f1259, f1410; +add.f32 %16, f1253, f1332; +add.f32 %18, f1121, f1119; +sub.f32 %19, f1123, f1125; +sub.f32 %21, f1221, f1223; 
+add.f32 %20, f1219, f1217; +sub.f32 %23, f1319, f1321; +add.f32 %22, f1317, f1315; +sub.f32 %25, f1133, f1135; +add.f32 %24, f1131, f1129; +add.f32 %26, f1229, f1227; +sub.f32 %27, f1231, f1233; +add.f32 %28, f1327, f1325; +sub.f32 %29, f1329, f1331; +add.f32 %30, f1141, f1139; +sub.f32 %31, f1143, f1145; +sub.f32 %33, f1241, f1243; +add.f32 %32, f1239, f1237; +sub.f32 %35, f1339, f1341; +add.f32 %34, f1337, f1335; +add.f32 %37, f1125, f1123; +sub.f32 %36, f1119, f1121; +add.f32 %39, f1223, f1221; +sub.f32 %38, f1217, f1219; +add.f32 %41, f1321, f1319; +sub.f32 %40, f1315, f1317; +add.f32 %43, f1135, f1133; +sub.f32 %42, f1129, f1131; +add.f32 %45, f1233, f1231; +sub.f32 %44, f1227, f1229; +add.f32 %47, f1331, f1329; +sub.f32 %46, f1325, f1327; +add.f32 %49, f1145, f1143; +sub.f32 %48, f1139, f1141; +add.f32 %51, f1243, f1241; +sub.f32 %50, f1237, f1239; +add.f32 %53, f1341, f1339; +sub.f32 %52, f1335, f1337; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), 
"f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<335, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1555>; +.reg .b32 r<14>; +.reg .b64 rd<8>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 972, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1546, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1545, %57, f1546; +mul.f32 f119, f1546, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1544, %81, %99; +sub.f32 f131, %81, 
%99; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1543, %63, f1544; +mul.f32 f135, f1544, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1542, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1541, %69, f1542; +mul.f32 f151, f1542, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1540, f133, 0f3F441B7D; +sub.f32 f159, f1540, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1538, f149, 0f3E31D0D4; +mul.f32 f1539, f155, 0f3F7C1C5C; +sub.f32 f164, f1538, f1539; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f1536, f134, 0f3E31D0D4; +mul.f32 f1537, f140, 0f3F7C1C5C; +sub.f32 f169, f1536, f1537; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1534, f150, 0fBF708FB2; +mul.f32 f1535, f156, 0f3EAF1D44; +sub.f32 f174, f1534, f1535; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1533, f1543, f1541; +sub.f32 f183, f1543, f1541; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1532, f1545, f1533; +mul.f32 f187, f1533, 0f3F000000; +sub.f32 f188, f1545, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, 
f117, f197; +add.f32 f1531, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1530, f123, f1531; +mul.f32 f203, f1531, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1529, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1528, f124, f1529; +mul.f32 f219, f1529, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1525, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1523, %112, f1525; +mul.f32 f235, f1525, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1520, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1518, %115, f1520; +mul.f32 f251, f1520, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1515, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1513, 
%118, f1515; +mul.f32 f267, f1515, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1512, f249, 0f3F441B7D; +sub.f32 f275, f1512, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1511, f265, 0f3E31D0D4; +sub.f32 f280, f1511, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1509, f250, 0f3E31D0D4; +mul.f32 f1510, f256, 0f3F7C1C5C; +sub.f32 f285, f1509, f1510; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1507, f266, 0fBF708FB2; +mul.f32 f1508, f272, 0f3EAF1D44; +sub.f32 f290, f1507, f1508; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1506, f1518, f1513; +sub.f32 f299, f1518, f1513; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1505, f1523, f1506; +mul.f32 f303, f1506, 0f3F000000; +sub.f32 f304, f1523, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1504, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1503, f239, f1504; +mul.f32 f319, f1504, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1502, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; 
+add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1501, f240, f1502; +mul.f32 f335, f1502, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1498, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1496, %121, f1498; +mul.f32 f351, f1498, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1493, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1491, %124, f1493; +mul.f32 f367, f1493, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1489, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1487, %126, f1489; +mul.f32 f383, f1489, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1486, f365, 0f3F441B7D; +sub.f32 f391, f1486, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1485, f381, 0f3E31D0D4; +sub.f32 f396, f1485, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1483, f366, 0f3E31D0D4; 
+mul.f32 f1484, f372, 0f3F7C1C5C; +sub.f32 f401, f1483, f1484; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1481, f382, 0fBF708FB2; +mul.f32 f1482, f388, 0f3EAF1D44; +sub.f32 f406, f1481, f1482; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1480, f1491, f1487; +sub.f32 f415, f1491, f1487; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1479, f1496, f1480; +mul.f32 f419, f1480, 0f3F000000; +sub.f32 f420, f1496, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1478, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1477, f355, f1478; +mul.f32 f435, f1478, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1476, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1475, f356, f1476; +mul.f32 f451, f1476, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1503, 0f3E6C2691; +mul.f32 f1474, f310, 0f3F791978; +sub.f32 f459, f1474, f458; +mul.f32 f460, f1503, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1472, f426, 0f3F64C51C; +mul.f32 f1473, f1477, 0f3EE5C902; +sub.f32 f464, f1472, f1473; +mul.f32 f465, f1477, 0f3F64C51C; 
+fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f1470, f326, 0f3F64C51C; +mul.f32 f1471, f1501, 0f3EE5C902; +sub.f32 f469, f1470, f1471; +mul.f32 f470, f1501, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1468, f442, 0f3F18DF63; +mul.f32 f1469, f1475, 0f3F4D57F2; +sub.f32 f474, f1468, f1469; +mul.f32 f475, f1475, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1466, f301, 0f3F441B7D; +mul.f32 f1467, f307, 0f3F248DBB; +sub.f32 f479, f1466, f1467; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1465, f417, 0f3E31D0D4; +sub.f32 f484, f1465, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1464, f317, 0f3F18DF63; +sub.f32 f489, f1464, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1463, f433, 0fBE92D7E0; +sub.f32 f494, f1463, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1462, f333, 0f3ECACAF8; +sub.f32 f499, f1462, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f1461, f449, 0fBF2FAD88; +sub.f32 f504, f1461, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1460, f302, 0f3E31D0D4; +sub.f32 f509, f1460, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1458, f418, 0fBF708FB2; +mul.f32 f1459, f424, 0f3EAF1D44; +sub.f32 f514, f1458, f1459; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1456, f318, 0fBD6E2946; +mul.f32 f1457, f324, 0f3F7F9120; +sub.f32 f519, f1456, f1457; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1454, f434, 0fBF7E44DE; +mul.f32 f1455, f440, 0fBDEDC21F; 
+sub.f32 f524, f1454, f1455; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1453, f334, 0fBE92D7E0; +sub.f32 f529, f1453, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1452, f450, 0fBF55E287; +sub.f32 f534, f1452, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1451, f1505, f1479; +sub.f32 f543, f1505, f1479; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1450, f1532, f1451; +mul.f32 f547, f1451, 0f3F000000; +sub.f32 f548, f1532, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1449, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1448, f1530, f1449; +mul.f32 f563, f1449, 0f3F000000; +sub.f32 f564, f1530, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1447, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1446, f1528, f1447; +mul.f32 f579, f1447, 0f3F000000; +sub.f32 f580, f1528, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1445, f481, f486; +sub.f32 f591, f481, 
f486; +mul.f32 f592, f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1444, f191, f1445; +mul.f32 f595, f1445, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1443, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1442, f207, f1443; +mul.f32 f611, f1443, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1441, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1440, f223, f1441; +mul.f32 f627, f1441, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1439, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1438, f192, f1439; +mul.f32 f643, f1439, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1437, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1436, f208, f1437; +mul.f32 f659, f1437, 0f3F000000; 
+sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1435, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1434, f224, f1435; +mul.f32 f675, f1435, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f1448, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f1448; +sub.f32 f689, f688, f687; +mul.f32 f691, f682, f682; +mul.f32 f1433, f681, f681; +sub.f32 f692, f1433, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f1446, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f1446; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f1432, f681, f692; +sub.f32 f702, f1432, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f1444, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f1444; +sub.f32 f709, f708, f707; +mul.f32 f1430, f681, f702; +mul.f32 f1431, f682, f704; +sub.f32 f712, f1430, f1431; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f1442, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f1442; +sub.f32 f719, f718, f717; +mul.f32 f1428, f681, f712; +mul.f32 f1429, f682, f714; +sub.f32 f722, f1428, f1429; +mul.f32 f723, f681, f714; 
+fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f1440, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f1440; +sub.f32 f729, f728, f727; +mul.f32 f731, f682, f724; +mul.f32 f1427, f681, f722; +sub.f32 f732, f1427, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f1438, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f1438; +sub.f32 f739, f738, f737; +mul.f32 f741, f682, f734; +mul.f32 f1426, f681, f732; +sub.f32 f742, f1426, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f1436, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f1436; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f1425, f681, f742; +sub.f32 f752, f1425, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f1434, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f1434; +sub.f32 f759, f758, f757; +mul.f32 f1423, f681, f752; +mul.f32 f1424, f682, f754; +sub.f32 f762, f1423, f1424; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f1421, f681, f762; +mul.f32 f1422, f682, f764; +sub.f32 f772, f1421, f1422; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f781, f682, f774; +mul.f32 f1420, f681, f772; +sub.f32 f782, f1420, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; +mul.f32 f791, f682, f784; +mul.f32 f1419, f681, f782; +sub.f32 
f792, f1419, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f1418, f681, f792; +sub.f32 f802, f1418, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f1416, f681, f802; +mul.f32 f1417, f682, f804; +sub.f32 f812, f1416, f1417; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f1414, f681, f812; +mul.f32 f1415, f682, f814; +sub.f32 f822, f1414, f1415; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 f829, f828, f827; +mul.f32 f831, f682, f824; +mul.f32 f1413, f681, f822; +sub.f32 f832, f1413, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, f682, f834; +mul.f32 f1412, f681, f832; +sub.f32 f842, f1412, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, f848, f847; +mul.f32 f1410, f681, f842; +mul.f32 f1411, f682, f844; +sub.f32 f852, f1410, f1411; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f1408, f681, f852; 
+mul.f32 f1409, f682, f854; +sub.f32 f862, f1408, f1409; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f871, f682, f864; +mul.f32 f1407, f681, f862; +sub.f32 f872, f1407, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f881, f682, f874; +mul.f32 f1406, f681, f872; +sub.f32 f882, f1406, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f1405, f681, f882; +sub.f32 f892, f1405, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; +mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f1403, f681, f892; +mul.f32 f1404, f682, f894; +sub.f32 f902, f1403, f1404; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; +sub.f32 f909, f908, f907; +mul.f32 f1401, f681, f902; +mul.f32 f1402, f682, f904; +sub.f32 f912, f1401, f1402; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; +mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f921, f682, f914; +mul.f32 f1400, f681, f912; +sub.f32 f922, f1400, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; +mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, 
f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f1399, f681, f922; +sub.f32 f932, f1399, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r8, r5, 972, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f686; +st.shared.f32 [r9+8], f696; +st.shared.f32 [r9+12], f706; +st.shared.f32 [r9+16], f716; +st.shared.f32 [r9+20], f726; +st.shared.f32 [r9+24], f736; +st.shared.f32 [r9+28], f746; +st.shared.f32 [r9+32], f756; +st.shared.f32 [r9+36], f766; +st.shared.f32 [r9+40], f776; +st.shared.f32 [r9+44], f786; +st.shared.f32 [r9+48], f796; +st.shared.f32 [r9+52], f806; +st.shared.f32 [r9+56], f816; +st.shared.f32 [r9+60], f826; +st.shared.f32 [r9+64], f836; +st.shared.f32 [r9+68], f846; +st.shared.f32 [r9+72], f856; +st.shared.f32 [r9+76], f866; +st.shared.f32 [r9+80], f876; +st.shared.f32 [r9+84], f886; +st.shared.f32 [r9+88], f896; +st.shared.f32 [r9+92], f906; +st.shared.f32 [r9+96], f916; +st.shared.f32 [r9+100], f926; +st.shared.f32 [r9+104], f936; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+36]; +ld.shared.f32 f942, [r10+72]; +ld.shared.f32 f943, [r10+108]; +ld.shared.f32 f944, [r10+144]; +ld.shared.f32 f945, [r10+180]; +ld.shared.f32 f946, [r10+216]; +ld.shared.f32 f947, [r10+252]; +ld.shared.f32 f948, [r10+288]; +ld.shared.f32 f949, [r10+324]; +ld.shared.f32 f950, [r10+360]; +ld.shared.f32 f951, [r10+396]; +ld.shared.f32 f952, [r10+432]; +ld.shared.f32 f953, [r10+468]; +ld.shared.f32 f954, [r10+504]; +ld.shared.f32 f955, [r10+540]; +ld.shared.f32 f956, [r10+576]; +ld.shared.f32 f957, [r10+612]; +ld.shared.f32 f958, [r10+648]; +ld.shared.f32 f959, [r10+684]; +ld.shared.f32 f960, [r10+720]; +ld.shared.f32 f961, [r10+756]; +ld.shared.f32 f962, [r10+792]; +ld.shared.f32 f963, [r10+828]; 
+ld.shared.f32 f964, [r10+864]; +ld.shared.f32 f965, [r10+900]; +ld.shared.f32 f966, [r10+936]; +barrier.sync 0; +st.shared.f32 [r9], f1450; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r10]; +ld.shared.f32 f968, [r10+36]; +ld.shared.f32 f969, [r10+72]; +ld.shared.f32 f970, [r10+108]; +ld.shared.f32 f971, [r10+144]; +ld.shared.f32 f972, [r10+180]; +ld.shared.f32 f973, [r10+216]; +ld.shared.f32 f974, [r10+252]; +ld.shared.f32 f975, [r10+288]; +ld.shared.f32 f976, [r10+324]; +ld.shared.f32 f977, [r10+360]; +ld.shared.f32 f978, [r10+396]; +ld.shared.f32 f979, [r10+432]; +ld.shared.f32 f980, [r10+468]; +ld.shared.f32 f981, [r10+504]; +ld.shared.f32 f982, [r10+540]; +ld.shared.f32 f983, [r10+576]; +ld.shared.f32 f984, [r10+612]; +ld.shared.f32 f985, [r10+648]; +ld.shared.f32 f986, [r10+684]; +ld.shared.f32 f987, [r10+720]; +ld.shared.f32 f988, [r10+756]; +ld.shared.f32 f989, [r10+792]; +ld.shared.f32 f990, [r10+828]; +ld.shared.f32 f991, [r10+864]; +ld.shared.f32 f992, [r10+900]; +ld.shared.f32 f993, [r10+936]; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +add.f32 f1398, f976, f985; +sub.f32 f1000, f976, f985; +mul.f32 f1001, f1000, 
0fBF5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +add.f32 f1397, f967, f1398; +mul.f32 f1004, f1398, 0f3F000000; +sub.f32 f1005, f967, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0fBF5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +add.f32 f1396, f979, f988; +sub.f32 f1016, f979, f988; +mul.f32 f1017, f1016, 0fBF5DB3D7; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f1395, f970, f1396; +mul.f32 f1020, f1396, 0f3F000000; +sub.f32 f1021, f970, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0fBF5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +sub.f32 f1031, f946, f1030; +add.f32 f1394, f982, f991; +sub.f32 f1032, f982, f991; +mul.f32 f1033, f1032, 0fBF5DB3D7; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f1393, f973, f1394; +mul.f32 f1036, f1394, 0f3F000000; +sub.f32 f1037, f973, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0fBF5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f1391, f1018, 0f3F441B7D; +mul.f32 f1392, f1024, 0f3F248DBB; +sub.f32 f1044, f1391, f1392; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045; +mul.f32 f1048, f1040, 0f3F7C1C5C; +mul.f32 f1390, f1034, 0f3E31D0D4; +sub.f32 f1049, f1390, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050; +mul.f32 f1053, f1025, 0f3F7C1C5C; +mul.f32 f1389, f1019, 0f3E31D0D4; +sub.f32 f1054, f1389, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055; +mul.f32 f1058, f1041, 0f3EAF1D44; +mul.f32 f1388, f1035, 0fBF708FB2; +sub.f32 f1059, f1388, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060; +add.f32 f1062, 
f1011, f1027; +mul.f32 f1064, f1062, 0f3F000000; +sub.f32 f1065, f995, f1064; +add.f32 f1387, f1395, f1393; +sub.f32 f1066, f1395, f1393; +mul.f32 f1067, f1066, 0fBF5DB3D7; +mul.f32 f1068, f1387, 0f3F000000; +sub.f32 f1069, f1397, f1068; +sub.f32 f1070, f1011, f1027; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1044, f1049; +mul.f32 f1074, f1072, 0f3F000000; +sub.f32 f1075, f1002, f1074; +add.f32 f1386, f1046, f1051; +sub.f32 f1076, f1046, f1051; +mul.f32 f1077, f1076, 0fBF5DB3D7; +mul.f32 f1078, f1386, 0f3F000000; +sub.f32 f1079, f1008, f1078; +sub.f32 f1080, f1044, f1049; +mul.f32 f1081, f1080, 0fBF5DB3D7; +add.f32 f1082, f1054, f1059; +mul.f32 f1084, f1082, 0f3F000000; +sub.f32 f1085, f1003, f1084; +add.f32 f1385, f1056, f1061; +sub.f32 f1086, f1056, f1061; +mul.f32 f1087, f1086, 0fBF5DB3D7; +mul.f32 f1088, f1385, 0f3F000000; +sub.f32 f1089, f1009, f1088; +sub.f32 f1090, f1054, f1059; +mul.f32 f1091, f1090, 0fBF5DB3D7; +add.f32 f1092, f950, f959; +add.f32 f1093, f941, f1092; +mul.f32 f1096, f1092, 0f3F000000; +sub.f32 f1097, f941, f1096; +add.f32 f1384, f977, f986; +sub.f32 f1098, f977, f986; +mul.f32 f1099, f1098, 0fBF5DB3D7; +add.f32 f1100, f1099, f1097; +sub.f32 f1101, f1097, f1099; +add.f32 f1383, f968, f1384; +mul.f32 f1102, f1384, 0f3F000000; +sub.f32 f1103, f968, f1102; +sub.f32 f1104, f950, f959; +mul.f32 f1105, f1104, 0fBF5DB3D7; +sub.f32 f1106, f1103, f1105; +add.f32 f1107, f1105, f1103; +add.f32 f1108, f953, f962; +add.f32 f1109, f944, f1108; +mul.f32 f1112, f1108, 0f3F000000; +sub.f32 f1113, f944, f1112; +add.f32 f1382, f980, f989; +sub.f32 f1114, f980, f989; +mul.f32 f1115, f1114, 0fBF5DB3D7; +add.f32 f1116, f1115, f1113; +sub.f32 f1117, f1113, f1115; +add.f32 f1381, f971, f1382; +mul.f32 f1118, f1382, 0f3F000000; +sub.f32 f1119, f971, f1118; +sub.f32 f1120, f953, f962; +mul.f32 f1121, f1120, 0fBF5DB3D7; +sub.f32 f1122, f1119, f1121; +add.f32 f1123, f1121, f1119; +add.f32 f1124, f956, f965; +add.f32 f1125, f947, f1124; +mul.f32 f1128, f1124, 
0f3F000000; +sub.f32 f1129, f947, f1128; +add.f32 f1380, f983, f992; +sub.f32 f1130, f983, f992; +mul.f32 f1131, f1130, 0fBF5DB3D7; +add.f32 f1132, f1131, f1129; +sub.f32 f1133, f1129, f1131; +add.f32 f1379, f974, f1380; +mul.f32 f1134, f1380, 0f3F000000; +sub.f32 f1135, f974, f1134; +sub.f32 f1136, f956, f965; +mul.f32 f1137, f1136, 0fBF5DB3D7; +sub.f32 f1138, f1135, f1137; +add.f32 f1139, f1137, f1135; +mul.f32 f1141, f1122, 0f3F248DBB; +mul.f32 f1378, f1116, 0f3F441B7D; +sub.f32 f1142, f1378, f1141; +mul.f32 f1143, f1122, 0f3F441B7D; +fma.rn.f32 f1144, f1116, 0f3F248DBB, f1143; +mul.f32 f1146, f1138, 0f3F7C1C5C; +mul.f32 f1377, f1132, 0f3E31D0D4; +sub.f32 f1147, f1377, f1146; +mul.f32 f1148, f1138, 0f3E31D0D4; +fma.rn.f32 f1149, f1132, 0f3F7C1C5C, f1148; +mul.f32 f1151, f1123, 0f3F7C1C5C; +mul.f32 f1376, f1117, 0f3E31D0D4; +sub.f32 f1152, f1376, f1151; +mul.f32 f1153, f1123, 0f3E31D0D4; +fma.rn.f32 f1154, f1117, 0f3F7C1C5C, f1153; +mul.f32 f1374, f1133, 0fBF708FB2; +mul.f32 f1375, f1139, 0f3EAF1D44; +sub.f32 f1157, f1374, f1375; +mul.f32 f1158, f1139, 0fBF708FB2; +fma.rn.f32 f1159, f1133, 0f3EAF1D44, f1158; +add.f32 f1160, f1109, f1125; +mul.f32 f1162, f1160, 0f3F000000; +sub.f32 f1163, f1093, f1162; +add.f32 f1373, f1381, f1379; +sub.f32 f1164, f1381, f1379; +mul.f32 f1165, f1164, 0fBF5DB3D7; +mul.f32 f1166, f1373, 0f3F000000; +sub.f32 f1167, f1383, f1166; +sub.f32 f1168, f1109, f1125; +mul.f32 f1169, f1168, 0fBF5DB3D7; +add.f32 f1170, f1142, f1147; +mul.f32 f1172, f1170, 0f3F000000; +sub.f32 f1173, f1100, f1172; +add.f32 f1372, f1144, f1149; +sub.f32 f1174, f1144, f1149; +mul.f32 f1175, f1174, 0fBF5DB3D7; +mul.f32 f1176, f1372, 0f3F000000; +sub.f32 f1177, f1106, f1176; +sub.f32 f1178, f1142, f1147; +mul.f32 f1179, f1178, 0fBF5DB3D7; +add.f32 f1180, f1152, f1157; +mul.f32 f1182, f1180, 0f3F000000; +sub.f32 f1183, f1101, f1182; +add.f32 f1371, f1154, f1159; +sub.f32 f1184, f1154, f1159; +mul.f32 f1185, f1184, 0fBF5DB3D7; +mul.f32 f1186, f1371, 0f3F000000; 
+sub.f32 f1187, f1107, f1186; +sub.f32 f1188, f1152, f1157; +mul.f32 f1189, f1188, 0fBF5DB3D7; +add.f32 f1190, f951, f960; +add.f32 f1191, f942, f1190; +mul.f32 f1194, f1190, 0f3F000000; +sub.f32 f1195, f942, f1194; +add.f32 f1370, f978, f987; +sub.f32 f1196, f978, f987; +mul.f32 f1197, f1196, 0fBF5DB3D7; +add.f32 f1198, f1197, f1195; +sub.f32 f1199, f1195, f1197; +add.f32 f1369, f969, f1370; +mul.f32 f1200, f1370, 0f3F000000; +sub.f32 f1201, f969, f1200; +sub.f32 f1202, f951, f960; +mul.f32 f1203, f1202, 0fBF5DB3D7; +sub.f32 f1204, f1201, f1203; +add.f32 f1205, f1203, f1201; +add.f32 f1206, f954, f963; +add.f32 f1207, f945, f1206; +mul.f32 f1210, f1206, 0f3F000000; +sub.f32 f1211, f945, f1210; +add.f32 f1368, f981, f990; +sub.f32 f1212, f981, f990; +mul.f32 f1213, f1212, 0fBF5DB3D7; +add.f32 f1214, f1213, f1211; +sub.f32 f1215, f1211, f1213; +add.f32 f1367, f972, f1368; +mul.f32 f1216, f1368, 0f3F000000; +sub.f32 f1217, f972, f1216; +sub.f32 f1218, f954, f963; +mul.f32 f1219, f1218, 0fBF5DB3D7; +sub.f32 f1220, f1217, f1219; +add.f32 f1221, f1219, f1217; +add.f32 f1222, f957, f966; +add.f32 f1223, f948, f1222; +mul.f32 f1226, f1222, 0f3F000000; +sub.f32 f1227, f948, f1226; +add.f32 f1366, f984, f993; +sub.f32 f1228, f984, f993; +mul.f32 f1229, f1228, 0fBF5DB3D7; +add.f32 f1230, f1229, f1227; +sub.f32 f1231, f1227, f1229; +add.f32 f1365, f975, f1366; +mul.f32 f1232, f1366, 0f3F000000; +sub.f32 f1233, f975, f1232; +sub.f32 f1234, f957, f966; +mul.f32 f1235, f1234, 0fBF5DB3D7; +sub.f32 f1236, f1233, f1235; +add.f32 f1237, f1235, f1233; +mul.f32 f1363, f1214, 0f3F441B7D; +mul.f32 f1364, f1220, 0f3F248DBB; +sub.f32 f1240, f1363, f1364; +mul.f32 f1241, f1220, 0f3F441B7D; +fma.rn.f32 f1242, f1214, 0f3F248DBB, f1241; +mul.f32 f1361, f1230, 0f3E31D0D4; +mul.f32 f1362, f1236, 0f3F7C1C5C; +sub.f32 f1245, f1361, f1362; +mul.f32 f1246, f1236, 0f3E31D0D4; +fma.rn.f32 f1247, f1230, 0f3F7C1C5C, f1246; +mul.f32 f1359, f1215, 0f3E31D0D4; +mul.f32 f1360, f1221, 0f3F7C1C5C; +sub.f32 
f1250, f1359, f1360; +mul.f32 f1251, f1221, 0f3E31D0D4; +fma.rn.f32 f1252, f1215, 0f3F7C1C5C, f1251; +mul.f32 f1357, f1231, 0fBF708FB2; +mul.f32 f1358, f1237, 0f3EAF1D44; +sub.f32 f1255, f1357, f1358; +mul.f32 f1256, f1237, 0fBF708FB2; +fma.rn.f32 f1257, f1231, 0f3EAF1D44, f1256; +add.f32 f1258, f1207, f1223; +mul.f32 f1260, f1258, 0f3F000000; +sub.f32 f1261, f1191, f1260; +add.f32 f1356, f1367, f1365; +sub.f32 f1262, f1367, f1365; +mul.f32 f1263, f1262, 0fBF5DB3D7; +mul.f32 f1264, f1356, 0f3F000000; +sub.f32 f1265, f1369, f1264; +sub.f32 f1266, f1207, f1223; +mul.f32 f1267, f1266, 0fBF5DB3D7; +add.f32 f1268, f1240, f1245; +mul.f32 f1270, f1268, 0f3F000000; +sub.f32 f1271, f1198, f1270; +add.f32 f1355, f1242, f1247; +sub.f32 f1272, f1242, f1247; +mul.f32 f1273, f1272, 0fBF5DB3D7; +mul.f32 f1274, f1355, 0f3F000000; +sub.f32 f1275, f1204, f1274; +sub.f32 f1276, f1240, f1245; +mul.f32 f1277, f1276, 0fBF5DB3D7; +add.f32 f1278, f1250, f1255; +mul.f32 f1280, f1278, 0f3F000000; +sub.f32 f1281, f1199, f1280; +add.f32 f1354, f1252, f1257; +sub.f32 f1282, f1252, f1257; +mul.f32 f1283, f1282, 0fBF5DB3D7; +mul.f32 f1284, f1354, 0f3F000000; +sub.f32 f1285, f1205, f1284; +sub.f32 f1286, f1250, f1255; +mul.f32 f1548, f1356, 0f3F000000; +sub.f32 f1547, f1369, f1548; +mul.f32 f1287, f1286, 0fBF5DB3D7; +add.f32 %0, f995, f1062; +mul.f32 f1550, f1072, 0f3F000000; +sub.f32 f1549, f1002, f1550; +add.f32 %1, f1397, f1387; +mul.f32 f1552, f1385, 0f3F000000; +sub.f32 f1551, f1009, f1552; +mul.f32 f1554, f1386, 0f3F000000; +sub.f32 f1553, f1008, f1554; +add.f32 %2, f1093, f1160; +add.f32 %3, f1383, f1373; +add.f32 %4, f1191, f1258; +add.f32 %5, f1369, f1356; +add.f32 %7, f1008, f1386; +add.f32 %6, f1002, f1072; +add.f32 %9, f1106, f1372; +add.f32 %8, f1100, f1170; +add.f32 %11, f1204, f1355; +add.f32 %10, f1198, f1268; +add.f32 %13, f1009, f1385; +add.f32 %12, f1003, f1082; +add.f32 %15, f1107, f1371; +add.f32 %14, f1101, f1180; +add.f32 %17, f1205, f1354; +add.f32 %16, f1199, f1278; 
+sub.f32 %19, f1069, f1071; +add.f32 %18, f1067, f1065; +sub.f32 %21, f1167, f1169; +add.f32 %20, f1165, f1163; +add.f32 %22, f1263, f1261; +sub.f32 %23, f1547, f1267; +sub.f32 %25, f1553, f1081; +add.f32 %24, f1077, f1549; +sub.f32 %27, f1177, f1179; +add.f32 %26, f1175, f1173; +add.f32 %28, f1273, f1271; +sub.f32 %29, f1275, f1277; +add.f32 %30, f1087, f1085; +sub.f32 %31, f1551, f1091; +add.f32 %32, f1185, f1183; +sub.f32 %33, f1187, f1189; +add.f32 %34, f1283, f1281; +sub.f32 %35, f1285, f1287; +sub.f32 %36, f1065, f1067; +add.f32 %37, f1071, f1069; +sub.f32 %38, f1163, f1165; +add.f32 %39, f1169, f1167; +sub.f32 %40, f1261, f1263; +add.f32 %41, f1267, f1547; +add.f32 %43, f1081, f1553; +sub.f32 %42, f1549, f1077; +add.f32 %45, f1179, f1177; +sub.f32 %44, f1173, f1175; +add.f32 %47, f1277, f1275; +sub.f32 %46, f1271, f1273; +add.f32 %49, f1091, f1551; +sub.f32 %48, f1085, f1087; +add.f32 %51, f1189, f1187; +sub.f32 %50, f1183, f1185; +add.f32 %53, f1287, f1285; +sub.f32 %52, f1281, f1283; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_243), 
"f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<336, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<547>; +.reg .b32 r<22>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1944, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 
0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; 
+sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 1944, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, 
f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r13, r11, 72, r12; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r13], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r13+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r13+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r13+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r13+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r13+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r13+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r13+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r13+64], {f230, f231}; +barrier.sync 0; +shl.b32 r14, r11, 6; +sub.s32 r15, r13, r14; +ld.shared.v2.f32 {f232, f233}, [r15]; +ld.shared.v2.f32 {f236, f237}, [r15+216]; +ld.shared.v2.f32 {f240, f241}, [r15+432]; +ld.shared.v2.f32 {f244, f245}, [r15+648]; +ld.shared.v2.f32 {f248, f249}, [r15+864]; +ld.shared.v2.f32 {f252, f253}, [r15+1080]; +ld.shared.v2.f32 {f256, f257}, [r15+1296]; 
+ld.shared.v2.f32 {f260, f261}, [r15+1512]; +ld.shared.v2.f32 {f264, f265}, [r15+1728]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0fBF5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0fBF5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0fBF5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0fBF5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0fBF5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0fBF5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0f3F248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0f3F248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0f3F7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0f3F7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0f3F7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0f3F7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; 
+mul.f32 f332, f315, 0f3EAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0f3EAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0fBF5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0fBF5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f353, f383; +mul.f32 f387, f351, f383; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f369, f393; +mul.f32 f395, f367, f393; +mul.f32 f396, f391, f369; 
+mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f348, f401; +mul.f32 f403, f342, f401; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f364, f409; +mul.f32 f411, f358, f409; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f380, f417; +mul.f32 f419, f374, f417; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f349, f425; +mul.f32 f427, f343, f425; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f365, f433; +mul.f32 f435, f359, f433; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f381, f441; +mul.f32 f443, f375, f441; +mul.f32 f444, f439, f381; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 648, r20; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r21], {f446, f445}; +fma.rn.f32 f447, f382, f351, f386; +sub.f32 f448, f388, f387; +st.shared.v2.f32 [r21+72], {f447, f448}; +fma.rn.f32 f449, f391, f367, f394; +sub.f32 f450, f396, f395; +st.shared.v2.f32 [r21+144], {f449, f450}; +fma.rn.f32 f451, f399, f342, f402; +sub.f32 f452, f404, f403; +st.shared.v2.f32 [r21+216], {f451, f452}; +fma.rn.f32 f453, f407, f358, f410; +sub.f32 f454, f412, f411; +st.shared.v2.f32 [r21+288], {f453, f454}; +fma.rn.f32 f455, f415, f374, f418; 
+sub.f32 f456, f420, f419; +st.shared.v2.f32 [r21+360], {f455, f456}; +fma.rn.f32 f457, f423, f343, f426; +sub.f32 f458, f428, f427; +st.shared.v2.f32 [r21+432], {f457, f458}; +sub.f32 f459, f436, f435; +fma.rn.f32 f460, f431, f359, f434; +st.shared.v2.f32 [r21+504], {f460, f459}; +fma.rn.f32 f461, f439, f375, f442; +sub.f32 f462, f444, f443; +st.shared.v2.f32 [r21+576], {f461, f462}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r15]; +ld.shared.v2.f32 {f467, f468}, [r15+216]; +ld.shared.v2.f32 {f471, f472}, [r15+432]; +ld.shared.v2.f32 {f475, f476}, [r15+648]; +ld.shared.v2.f32 {f479, f480}, [r15+864]; +ld.shared.v2.f32 {f483, f484}, [r15+1080]; +ld.shared.v2.f32 {f487, f488}, [r15+1296]; +ld.shared.v2.f32 {f491, f492}, [r15+1512]; +ld.shared.v2.f32 {f495, f496}, [r15+1728]; +add.f32 f499, f475, f487; +add.f32 f500, f476, f488; +mul.f32 f501, f499, 0f3F000000; +sub.f32 f502, f463, f501; +sub.f32 f503, f476, f488; +mul.f32 f504, f503, 0fBF5DB3D7; +mul.f32 f505, f500, 0f3F000000; +sub.f32 f506, f464, f505; +sub.f32 f507, f475, f487; +mul.f32 f508, f507, 0fBF5DB3D7; +add.f32 f509, f479, f491; +add.f32 f510, f480, f492; +mul.f32 f511, f509, 0f3F000000; +sub.f32 f512, f467, f511; +sub.f32 f513, f480, f492; +mul.f32 f514, f513, 0fBF5DB3D7; +mul.f32 f515, f510, 0f3F000000; +sub.f32 f516, f468, f515; +sub.f32 f517, f479, f491; +mul.f32 f518, f517, 0fBF5DB3D7; +add.f32 f519, f483, f495; +add.f32 f520, f484, f496; +mul.f32 f521, f519, 0f3F000000; +sub.f32 f522, f471, f521; +sub.f32 f523, f484, f496; +mul.f32 f524, f523, 0fBF5DB3D7; +mul.f32 f525, f520, 0f3F000000; +sub.f32 f526, f472, f525; +sub.f32 f527, f483, f495; +mul.f32 f528, f527, 0fBF5DB3D7; +add.f32 %1, f464, f500; +add.f32 %0, f463, f499; +add.f32 %3, f468, f510; +add.f32 %2, f467, f509; +add.f32 %5, f472, f520; +add.f32 %4, f471, f519; +sub.f32 %7, f506, f508; +add.f32 %6, f504, f502; +sub.f32 %9, f516, f518; +add.f32 %8, f514, f512; +sub.f32 %11, f526, f528; +add.f32 %10, f524, f522; +add.f32 %13, f508, 
f506; +sub.f32 %12, f502, f504; +add.f32 %15, f518, f516; +sub.f32 %14, f512, f514; +add.f32 %17, f528, f526; +sub.f32 %16, f522, f524; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<337, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<511>; +.reg .b32 r<22>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 972, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, 
%34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 
f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; +mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, 
f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r12, r9, 972, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 36, r12; +st.shared.f32 [r13], f106; +st.shared.f32 [r13+4], f158; +st.shared.f32 [r13+8], f168; +st.shared.f32 [r13+12], f178; +st.shared.f32 [r13+16], f188; +st.shared.f32 [r13+20], f198; +st.shared.f32 [r13+24], f208; +st.shared.f32 [r13+28], f218; +st.shared.f32 [r13+32], f228; +barrier.sync 0; +shl.b32 r14, r11, 5; +sub.s32 r15, r13, r14; +ld.shared.f32 f232, [r15]; +ld.shared.f32 f233, [r15+108]; +ld.shared.f32 f234, [r15+216]; +ld.shared.f32 f235, [r15+324]; +ld.shared.f32 f236, [r15+432]; +ld.shared.f32 f237, [r15+540]; +ld.shared.f32 f238, [r15+648]; +ld.shared.f32 f239, [r15+756]; +ld.shared.f32 f240, [r15+864]; +barrier.sync 0; +st.shared.f32 [r13], f108; +st.shared.f32 [r13+4], f161; +st.shared.f32 [r13+8], f171; +st.shared.f32 [r13+12], f181; +st.shared.f32 [r13+16], f191; +st.shared.f32 [r13+20], f201; +st.shared.f32 [r13+24], f211; +st.shared.f32 [r13+28], f221; +st.shared.f32 [r13+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r15]; +ld.shared.f32 f242, [r15+108]; +ld.shared.f32 f243, [r15+216]; +ld.shared.f32 f244, 
[r15+324]; +ld.shared.f32 f245, [r15+432]; +ld.shared.f32 f246, [r15+540]; +ld.shared.f32 f247, [r15+648]; +ld.shared.f32 f248, [r15+756]; +ld.shared.f32 f249, [r15+864]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0fBF5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0fBF5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0fBF5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0fBF5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0fBF5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0f3F248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0f3F248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0f3F7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0f3F7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0f3F7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 
0f3E31D0D4; +fma.rn.f32 f312, f275, 0f3F7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0f3EAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0f3EAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0fBF5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0fBF5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f337, f367; +fma.rn.f32 f371, f366, f335, f370; +mul.f32 f372, f335, f367; +mul.f32 f373, f366, f337; +sub.f32 f374, f373, f372; +mul.f32 f375, f366, 
f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f353, f379; +fma.rn.f32 f381, f377, f351, f380; +mul.f32 f382, f351, f379; +mul.f32 f383, f377, f353; +sub.f32 f384, f383, f382; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f332, f389; +fma.rn.f32 f391, f387, f326, f390; +mul.f32 f392, f326, f389; +mul.f32 f393, f387, f332; +sub.f32 f394, f393, f392; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f348, f399; +fma.rn.f32 f401, f397, f342, f400; +mul.f32 f402, f342, f399; +mul.f32 f403, f397, f348; +sub.f32 f404, f403, f402; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f364, f409; +fma.rn.f32 f411, f407, f358, f410; +mul.f32 f412, f358, f409; +mul.f32 f413, f407, f364; +sub.f32 f414, f413, f412; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f333, f419; +fma.rn.f32 f421, f417, f327, f420; +mul.f32 f422, f327, f419; +mul.f32 f423, f417, f333; +sub.f32 f424, f423, f422; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f349, f429; +fma.rn.f32 f431, f427, f343, f430; +mul.f32 f432, f343, f429; +mul.f32 f433, f427, f349; +sub.f32 f434, f433, f432; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f365, f439; +fma.rn.f32 f441, f437, f359, f440; +mul.f32 f442, f359, f439; +mul.f32 f443, f437, f365; +sub.f32 f444, f443, f442; 
+shl.b32 r19, r18, 2; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 324, r20; +st.shared.f32 [r21], f319; +st.shared.f32 [r21+36], f371; +st.shared.f32 [r21+72], f381; +st.shared.f32 [r21+108], f391; +st.shared.f32 [r21+144], f401; +st.shared.f32 [r21+180], f411; +st.shared.f32 [r21+216], f421; +st.shared.f32 [r21+252], f431; +st.shared.f32 [r21+288], f441; +barrier.sync 0; +ld.shared.f32 f445, [r15]; +ld.shared.f32 f446, [r15+108]; +ld.shared.f32 f447, [r15+216]; +ld.shared.f32 f448, [r15+324]; +ld.shared.f32 f449, [r15+432]; +ld.shared.f32 f450, [r15+540]; +ld.shared.f32 f451, [r15+648]; +ld.shared.f32 f452, [r15+756]; +ld.shared.f32 f453, [r15+864]; +barrier.sync 0; +st.shared.f32 [r21], f321; +st.shared.f32 [r21+36], f374; +st.shared.f32 [r21+72], f384; +st.shared.f32 [r21+108], f394; +st.shared.f32 [r21+144], f404; +st.shared.f32 [r21+180], f414; +st.shared.f32 [r21+216], f424; +st.shared.f32 [r21+252], f434; +st.shared.f32 [r21+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r15]; +ld.shared.f32 f455, [r15+108]; +ld.shared.f32 f456, [r15+216]; +ld.shared.f32 f457, [r15+324]; +ld.shared.f32 f458, [r15+432]; +ld.shared.f32 f459, [r15+540]; +ld.shared.f32 f460, [r15+648]; +ld.shared.f32 f461, [r15+756]; +ld.shared.f32 f462, [r15+864]; +add.f32 f463, f448, f451; +add.f32 f464, f457, f460; +mul.f32 f465, f463, 0f3F000000; +sub.f32 f466, f445, f465; +sub.f32 f467, f457, f460; +mul.f32 f468, f467, 0fBF5DB3D7; +mul.f32 f469, f464, 0f3F000000; +sub.f32 f470, f454, f469; +sub.f32 f471, f448, f451; +mul.f32 f472, f471, 0fBF5DB3D7; +add.f32 f473, f449, f452; +add.f32 f474, f458, f461; +mul.f32 f475, f473, 0f3F000000; +sub.f32 f476, f446, f475; +sub.f32 f477, f458, f461; +mul.f32 f478, f477, 0fBF5DB3D7; +mul.f32 f479, f474, 0f3F000000; +sub.f32 f480, f455, f479; +sub.f32 f481, f449, f452; +mul.f32 f482, f481, 0fBF5DB3D7; +add.f32 f483, f450, f453; +add.f32 f484, f459, f462; +mul.f32 f485, f483, 0f3F000000; +sub.f32 f486, f447, f485; +sub.f32 f487, 
f459, f462; +mul.f32 f488, f487, 0fBF5DB3D7; +mul.f32 f489, f484, 0f3F000000; +sub.f32 f490, f456, f489; +sub.f32 f491, f450, f453; +mul.f32 f492, f491, 0fBF5DB3D7; +add.f32 %0, f445, f463; +add.f32 %1, f454, f464; +add.f32 %2, f446, f473; +add.f32 %3, f455, f474; +add.f32 %4, f447, f483; +add.f32 %5, f456, f484; +add.f32 %6, f468, f466; +sub.f32 %7, f470, f472; +add.f32 %8, f478, f476; +sub.f32 %9, f480, f482; +add.f32 %10, f488, f486; +sub.f32 %11, f490, f492; +sub.f32 %12, f466, f468; +add.f32 %13, f472, f470; +sub.f32 %14, f476, f478; +add.f32 %15, f482, f480; +sub.f32 %16, f486, f488; +add.f32 %17, f492, f490; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_243), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<338, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<217>; +.reg .b32 r<34>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1944, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %13, %16; +add.f32 f14, %15, %17; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %11, f15; +sub.f32 f17, %15, %17; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %12, f21; +sub.f32 f23, %13, %16; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, 
f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %12, f14; +add.f32 f43, %11, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r9+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r9+16], {f47, f46}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+648]; +ld.shared.v2.f32 {f56, f57}, [r11+1296]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0fBF5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f72, f75; +mul.f32 f79, f66, f75; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f73, f85; +mul.f32 f87, f67, f85; +mul.f32 f88, f83, f73; +barrier.sync 
0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f74, f66, f78; +sub.f32 f92, f80, f79; +st.shared.v2.f32 [r17+24], {f91, f92}; +fma.rn.f32 f93, f83, f67, f86; +sub.f32 f94, f88, f87; +st.shared.v2.f32 [r17+48], {f93, f94}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+648]; +ld.shared.v2.f32 {f103, f104}, [r11+1296]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f119, f122; +mul.f32 f126, f113, f122; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f120, f132; +mul.f32 f134, f114, f132; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f121, f113, f125; +sub.f32 f139, f127, f126; +st.shared.v2.f32 [r23+72], {f138, f139}; +fma.rn.f32 f140, f130, f114, f133; +sub.f32 f141, f135, f134; +st.shared.v2.f32 [r23+144], {f140, f141}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+648]; +ld.shared.v2.f32 {f150, f151}, [r11+1296]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; 
+mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; +sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f166, f169; +mul.f32 f173, f160, f169; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f167, f179; +mul.f32 f181, f161, f179; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f168, f160, f172; +sub.f32 f186, f174, f173; +st.shared.v2.f32 [r33+216], {f185, f186}; +fma.rn.f32 f187, f177, f161, f180; +sub.f32 f188, f182, f181; +st.shared.v2.f32 [r33+432], {f187, f188}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 {f193, f194}, [r11+648]; +ld.shared.v2.f32 {f197, f198}, [r11+1296]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0fBF5DB3D7; +mul.f32 f207, f202, 0f3F000000; +sub.f32 f208, f190, f207; +sub.f32 f209, f193, f197; +mul.f32 f210, f209, 0fBF5DB3D7; +add.f32 %1, f190, f202; +add.f32 %0, f189, f201; +sub.f32 %3, f208, f210; +add.f32 %2, f206, f204; +add.f32 %5, f210, f208; +sub.f32 %4, f204, f206; +})" + : 
"=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<339, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<193>; +.reg .b32 r<34>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 972, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %13, %16; +add.f32 f14, %11, f13; +add.f32 f15, %15, %17; +add.f32 f16, %12, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %11, f17; +sub.f32 f19, %15, %17; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %12, f23; +sub.f32 f25, %13, %16; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 972, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; +sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f34; +st.shared.f32 [r9+8], f44; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+324]; +ld.shared.f32 f50, [r11+648]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 
[r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+324]; +ld.shared.f32 f53, [r11+648]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0fBF5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0fBF5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f68, f71; +fma.rn.f32 f75, f70, f62, f74; +mul.f32 f76, f62, f71; +mul.f32 f77, f70, f68; +sub.f32 f78, f77, f76; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f69, f83; +fma.rn.f32 f85, f81, f63, f84; +mul.f32 f86, f63, f83; +mul.f32 f87, f81, f69; +sub.f32 f88, f87, f86; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f75; +st.shared.f32 [r17+24], f85; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+324]; +ld.shared.f32 f91, [r11+648]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+324]; +ld.shared.f32 f94, [r11+648]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0fBF5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, 
f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0fBF5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f109, f112; +fma.rn.f32 f116, f111, f103, f115; +mul.f32 f117, f103, f112; +mul.f32 f118, f111, f109; +sub.f32 f119, f118, f117; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f110, f124; +fma.rn.f32 f126, f122, f104, f125; +mul.f32 f127, f104, f124; +mul.f32 f128, f122, f110; +sub.f32 f129, f128, f127; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f116; +st.shared.f32 [r23+72], f126; +barrier.sync 0; +ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+324]; +ld.shared.f32 f132, [r11+648]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+324]; +ld.shared.f32 f135, [r11+648]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0fBF5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0fBF5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, 
r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f150, f153; +fma.rn.f32 f157, f152, f144, f156; +mul.f32 f158, f144, f153; +mul.f32 f159, f152, f150; +sub.f32 f160, f159, f158; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, f153, f152, f164; +mul.f32 f166, f151, f165; +fma.rn.f32 f167, f163, f145, f166; +mul.f32 f168, f145, f165; +mul.f32 f169, f163, f151; +sub.f32 f170, f169, f168; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f157; +st.shared.f32 [r33+216], f167; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+324]; +ld.shared.f32 f173, [r11+648]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; +ld.shared.f32 f175, [r11+324]; +ld.shared.f32 f176, [r11+648]; +add.f32 f177, f172, f173; +add.f32 f178, f175, f176; +mul.f32 f179, f177, 0f3F000000; +sub.f32 f180, f171, f179; +sub.f32 f181, f175, f176; +mul.f32 f182, f181, 0fBF5DB3D7; +mul.f32 f183, f178, 0f3F000000; +sub.f32 f184, f174, f183; +sub.f32 f185, f172, f173; +mul.f32 f186, f185, 0fBF5DB3D7; +add.f32 %0, f171, f177; +add.f32 %1, f174, f178; +add.f32 %2, f182, f180; +sub.f32 %3, f184, f186; +sub.f32 %4, f180, f182; +add.f32 %5, f186, f184; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..c1dbeee41d181 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_fwd.hpp.inc @@ -0,0 +1,4120 @@ +#ifndef CUFFTDX_FFT_243_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_243_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<516, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1622>; +.reg .b64 rd<11>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 3888, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1621, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1620, %57, fd1621; +mul.f64 fd119, fd1621, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1619, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1618, %63, fd1619; +mul.f64 fd135, fd1619, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1617, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, 
fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1616, %69, fd1617; +mul.f64 fd151, fd1617, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd1615, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1615, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1613, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1614, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1613, fd1614; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1611, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1612, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1611, fd1612; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1609, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1610, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1609, fd1610; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1608, fd1618, fd1616; +sub.f64 fd183, fd1618, fd1616; +mul.f64 fd184, fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1607, fd1620, fd1608; +mul.f64 fd187, fd1608, 0d3FE0000000000000; +sub.f64 fd188, fd1620, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1606, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1605, fd123, 
fd1606; +mul.f64 fd203, fd1606, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1604, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1603, fd124, fd1604; +mul.f64 fd219, fd1604, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1600, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1598, %112, fd1600; +mul.f64 fd235, fd1600, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1595, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1593, %115, fd1595; +mul.f64 fd251, fd1595, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1590, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 
0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1588, %118, fd1590; +mul.f64 fd267, fd1590, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1587, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1587, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1586, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1586, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1584, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1585, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1584, fd1585; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1582, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1583, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1582, fd1583; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1581, fd1593, fd1588; +sub.f64 fd299, fd1593, fd1588; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1580, fd1598, fd1581; +mul.f64 fd303, fd1581, 0d3FE0000000000000; +sub.f64 fd304, fd1598, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1579, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, 
fd314, fd316; +add.f64 fd1578, fd239, fd1579; +mul.f64 fd319, fd1579, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1577, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1576, fd240, fd1577; +mul.f64 fd335, fd1577, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1573, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1571, %121, fd1573; +mul.f64 fd351, fd1573, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1568, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1566, %124, fd1568; +mul.f64 fd367, fd1568, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1564, %125, %109; +sub.f64 fd379, 
%125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1562, %126, fd1564; +mul.f64 fd383, fd1564, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1561, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1561, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1560, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1560, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1558, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1559, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1558, fd1559; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1556, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1557, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1556, fd1557; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1555, fd1566, fd1562; +sub.f64 fd415, fd1566, fd1562; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1554, fd1571, fd1555; +mul.f64 fd419, fd1555, 0d3FE0000000000000; +sub.f64 fd420, fd1571, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1553, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 
fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1552, fd355, fd1553; +mul.f64 fd435, fd1553, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1551, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1550, fd356, fd1551; +mul.f64 fd451, fd1551, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1578, 0dBFCD84D223638000; +mul.f64 fd1549, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1549, fd458; +mul.f64 fd460, fd1578, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1547, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1548, fd1552, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1547, fd1548; +mul.f64 fd465, fd1552, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1545, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1546, fd1576, 0dBFDCB920325BAFA6; +sub.f64 fd469, fd1545, fd1546; +mul.f64 fd470, fd1576, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1543, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1544, fd1550, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1543, fd1544; +mul.f64 fd475, fd1550, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1541, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1542, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1541, fd1542; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1540, fd417, 0d3FC63A1A7E0B738A; 
+sub.f64 fd484, fd1540, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1539, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1539, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1538, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1538, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1537, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1537, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1536, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1536, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1535, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1535, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1533, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1534, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1533, fd1534; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1531, fd318, 0dBFADC528B5343A86; +mul.f64 fd1532, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1531, fd1532; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1529, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1530, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1529, fd1530; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1528, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1528, fd528; +mul.f64 fd530, fd340, 
0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1527, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1527, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1526, fd1580, fd1554; +sub.f64 fd541, fd1580, fd1554; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1526, 0d3FE0000000000000; +sub.f64 fd546, fd1607, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0d3FEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1525, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1524, fd1605, fd1525; +mul.f64 fd561, fd1525, 0d3FE0000000000000; +sub.f64 fd562, fd1605, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0d3FEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1523, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1522, fd1603, fd1523; +mul.f64 fd577, fd1523, 0d3FE0000000000000; +sub.f64 fd578, fd1603, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0d3FEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1521, fd481, 
fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 0d3FEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1520, fd191, fd1521; +mul.f64 fd593, fd1521, 0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 0d3FEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1519, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0d3FEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1518, fd207, fd1519; +mul.f64 fd609, fd1519, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0d3FEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1517, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0d3FEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1516, fd223, fd1517; +mul.f64 fd625, fd1517, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0d3FEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1515, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0d3FEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1514, fd192, fd1515; +mul.f64 fd641, fd1515, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 fd643, fd509, fd514; +mul.f64 fd644, fd643, 0d3FEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 
fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1513, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 fd654, fd653, 0d3FEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 fd1512, fd208, fd1513; +mul.f64 fd657, fd1513, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 0d3FEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1511, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0d3FEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1510, fd224, fd1511; +mul.f64 fd673, fd1511, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0d3FEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 3888, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd684, fd680, fd1524; +mul.f64 fd685, fd679, fd1524; +mul.f64 fd1508, fd679, fd679; +mul.f64 fd1509, fd680, fd680; +sub.f64 fd688, fd1508, fd1509; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd692, fd690, fd1522; +mul.f64 fd693, fd688, fd1522; +mul.f64 fd695, fd680, fd690; +mul.f64 fd1507, fd679, fd688; +sub.f64 fd696, fd1507, fd695; +mul.f64 fd1506, fd688, fd568; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd700, fd698, fd1520; +mul.f64 fd701, fd696, fd1520; +mul.f64 fd1504, fd679, fd696; +mul.f64 fd1505, fd680, fd698; +sub.f64 fd704, fd1504, fd1505; +mul.f64 
fd1503, fd696, fd584; +mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd708, fd706, fd1518; +mul.f64 fd709, fd704, fd1518; +mul.f64 fd711, fd680, fd706; +mul.f64 fd1502, fd679, fd704; +sub.f64 fd712, fd1502, fd711; +mul.f64 fd1501, fd704, fd600; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd716, fd714, fd1516; +mul.f64 fd717, fd712, fd1516; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1500, fd679, fd712; +sub.f64 fd720, fd1500, fd719; +mul.f64 fd1499, fd712, fd616; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd724, fd722, fd1514; +mul.f64 fd725, fd720, fd1514; +mul.f64 fd1497, fd679, fd720; +mul.f64 fd1498, fd680, fd722; +sub.f64 fd728, fd1497, fd1498; +mul.f64 fd1496, fd720, fd632; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd732, fd730, fd1512; +mul.f64 fd733, fd728, fd1512; +mul.f64 fd735, fd680, fd730; +mul.f64 fd1495, fd679, fd728; +sub.f64 fd736, fd1495, fd735; +mul.f64 fd1494, fd728, fd648; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd740, fd738, fd1510; +mul.f64 fd741, fd736, fd1510; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1493, fd679, fd736; +sub.f64 fd744, fd1493, fd743; +mul.f64 fd1492, fd736, fd664; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd748, fd746, fd549; +mul.f64 fd749, fd744, fd549; +mul.f64 fd1490, fd679, fd744; +mul.f64 fd1491, fd680, fd746; +sub.f64 fd752, fd1490, fd1491; +mul.f64 fd1489, fd744, fd543; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd756, fd754, fd565; +mul.f64 fd757, fd752, fd565; +mul.f64 fd759, fd680, fd754; +mul.f64 fd1488, fd679, fd752; +sub.f64 fd760, fd1488, fd759; +mul.f64 fd1487, fd752, fd559; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, fd752, fd761; +mul.f64 fd764, fd762, fd581; +mul.f64 fd765, fd760, fd581; +mul.f64 fd1485, fd679, fd760; +mul.f64 fd1486, fd680, fd762; 
+sub.f64 fd768, fd1485, fd1486; +mul.f64 fd1484, fd760, fd575; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd772, fd770, fd597; +mul.f64 fd773, fd768, fd597; +mul.f64 fd775, fd680, fd770; +mul.f64 fd1483, fd679, fd768; +sub.f64 fd776, fd1483, fd775; +mul.f64 fd1482, fd768, fd591; +mul.f64 fd777, fd679, fd770; +fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd776, fd607; +mul.f64 fd780, fd778, fd613; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+144]; +mul.f64 fd787, fd783, fd629; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1481, fd679, fd782; +sub.f64 fd791, fd1481, fd790; +mul.f64 fd1480, fd782, fd623; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd795, fd793, fd645; +mul.f64 fd796, fd791, fd645; +mul.f64 fd1478, fd679, fd791; +mul.f64 fd1479, fd680, fd793; +sub.f64 fd799, fd1478, fd1479; +mul.f64 fd1477, fd791, fd639; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd803, fd801, fd661; +mul.f64 fd804, fd799, fd661; +mul.f64 fd806, fd680, fd801; +mul.f64 fd1476, fd679, fd799; +sub.f64 fd807, fd1476, fd806; +mul.f64 fd1475, fd799, fd655; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd811, fd809, fd677; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1474, fd679, fd807; +sub.f64 fd815, fd1474, fd814; +mul.f64 fd1473, fd807, fd671; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd819, fd817, fd550; +mul.f64 fd820, fd815, fd550; +mul.f64 fd1471, fd679, fd815; +mul.f64 fd1472, fd680, fd817; +sub.f64 fd823, fd1471, fd1472; +mul.f64 fd1470, fd815, fd544; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd827, fd825, fd566; +mul.f64 fd828, fd823, fd566; +mul.f64 fd830, fd680, fd825; +mul.f64 fd1469, fd679, fd823; +sub.f64 fd831, fd1469, fd830; +mul.f64 fd1468, fd823, fd560; +mul.f64 fd832, fd679, 
fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; +mul.f64 fd835, fd833, fd582; +mul.f64 fd836, fd831, fd582; +mul.f64 fd1466, fd679, fd831; +mul.f64 fd1467, fd680, fd833; +sub.f64 fd839, fd1466, fd1467; +mul.f64 fd1465, fd831, fd576; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd843, fd841, fd598; +mul.f64 fd844, fd839, fd598; +mul.f64 fd1463, fd679, fd839; +mul.f64 fd1464, fd680, fd841; +sub.f64 fd847, fd1463, fd1464; +mul.f64 fd1462, fd839, fd592; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd851, fd849, fd614; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1461, fd679, fd847; +sub.f64 fd855, fd1461, fd854; +mul.f64 fd1460, fd847, fd608; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd859, fd857, fd630; +mul.f64 fd860, fd855, fd630; +mul.f64 fd1458, fd679, fd855; +mul.f64 fd1459, fd680, fd857; +sub.f64 fd863, fd1458, fd1459; +mul.f64 fd1457, fd855, fd624; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd867, fd865, fd646; +mul.f64 fd868, fd863, fd646; +mul.f64 fd870, fd680, fd865; +mul.f64 fd1456, fd679, fd863; +sub.f64 fd871, fd1456, fd870; +mul.f64 fd1455, fd863, fd640; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd875, fd873, fd662; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1454, fd679, fd871; +sub.f64 fd879, fd1454, fd878; +mul.f64 fd1453, fd679, fd552; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1452, fd871, fd656; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd879, fd672; +mul.f64 fd883, fd881, fd678; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r9, r7, 432, r8; +add.f64 fd885, fd1607, fd1526; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r9], {fd886, fd885}; +fma.rn.f64 fd887, fd680, fd552, fd685; +sub.f64 fd888, fd1453, fd684; +st.shared.v2.f64 [r9+16], {fd888, fd887}; +fma.rn.f64 fd889, fd690, fd568, fd693; 
+sub.f64 fd890, fd1506, fd692; +st.shared.v2.f64 [r9+32], {fd890, fd889}; +fma.rn.f64 fd891, fd698, fd584, fd701; +sub.f64 fd892, fd1503, fd700; +st.shared.v2.f64 [r9+48], {fd892, fd891}; +fma.rn.f64 fd893, fd706, fd600, fd709; +sub.f64 fd894, fd1501, fd708; +st.shared.v2.f64 [r9+64], {fd894, fd893}; +fma.rn.f64 fd895, fd714, fd616, fd717; +sub.f64 fd896, fd1499, fd716; +st.shared.v2.f64 [r9+80], {fd896, fd895}; +fma.rn.f64 fd897, fd722, fd632, fd725; +sub.f64 fd898, fd1496, fd724; +st.shared.v2.f64 [r9+96], {fd898, fd897}; +sub.f64 fd899, fd1494, fd732; +fma.rn.f64 fd900, fd730, fd648, fd733; +st.shared.v2.f64 [r9+112], {fd899, fd900}; +fma.rn.f64 fd901, fd738, fd664, fd741; +sub.f64 fd902, fd1492, fd740; +st.shared.v2.f64 [r9+128], {fd902, fd901}; +fma.rn.f64 fd903, fd746, fd543, fd749; +sub.f64 fd904, fd1489, fd748; +st.shared.v2.f64 [r9+144], {fd904, fd903}; +fma.rn.f64 fd905, fd754, fd559, fd757; +sub.f64 fd906, fd1487, fd756; +st.shared.v2.f64 [r9+160], {fd906, fd905}; +fma.rn.f64 fd907, fd762, fd575, fd765; +sub.f64 fd908, fd1484, fd764; +st.shared.v2.f64 [r9+176], {fd908, fd907}; +fma.rn.f64 fd909, fd770, fd591, fd773; +sub.f64 fd910, fd1482, fd772; +st.shared.v2.f64 [r9+192], {fd910, fd909}; +fma.rn.f64 fd911, fd778, fd607, fd781; +sub.f64 fd912, fd779, fd780; +st.shared.v2.f64 [r9+208], {fd912, fd911}; +fma.rn.f64 fd913, fd783, fd623, fd788; +sub.f64 fd914, fd1480, fd787; +st.shared.v2.f64 [r9+224], {fd914, fd913}; +fma.rn.f64 fd915, fd793, fd639, fd796; +sub.f64 fd916, fd1477, fd795; +st.shared.v2.f64 [r9+240], {fd916, fd915}; +fma.rn.f64 fd917, fd801, fd655, fd804; +sub.f64 fd918, fd1475, fd803; +st.shared.v2.f64 [r9+256], {fd918, fd917}; +fma.rn.f64 fd919, fd809, fd671, fd812; +sub.f64 fd920, fd1473, fd811; +st.shared.v2.f64 [r9+272], {fd920, fd919}; +fma.rn.f64 fd921, fd817, fd544, fd820; +sub.f64 fd922, fd1470, fd819; +st.shared.v2.f64 [r9+288], {fd922, fd921}; +fma.rn.f64 fd923, fd825, fd560, fd828; +sub.f64 fd924, fd1468, fd827; +st.shared.v2.f64 
[r9+304], {fd924, fd923}; +sub.f64 fd925, fd1465, fd835; +fma.rn.f64 fd926, fd833, fd576, fd836; +st.shared.v2.f64 [r9+320], {fd925, fd926}; +fma.rn.f64 fd927, fd841, fd592, fd844; +sub.f64 fd928, fd1462, fd843; +st.shared.v2.f64 [r9+336], {fd928, fd927}; +fma.rn.f64 fd929, fd849, fd608, fd852; +sub.f64 fd930, fd1460, fd851; +st.shared.v2.f64 [r9+352], {fd930, fd929}; +fma.rn.f64 fd931, fd857, fd624, fd860; +sub.f64 fd932, fd1457, fd859; +st.shared.v2.f64 [r9+368], {fd932, fd931}; +fma.rn.f64 fd933, fd865, fd640, fd868; +sub.f64 fd934, fd1455, fd867; +st.shared.v2.f64 [r9+384], {fd934, fd933}; +fma.rn.f64 fd935, fd873, fd656, fd876; +sub.f64 fd936, fd1452, fd875; +st.shared.v2.f64 [r9+400], {fd936, fd935}; +fma.rn.f64 fd937, fd881, fd672, fd884; +sub.f64 fd938, fd882, fd883; +st.shared.v2.f64 [r9+416], {fd938, fd937}; +barrier.sync 0; +mad.lo.s32 r10, r7, -416, r9; +ld.shared.v2.f64 {fd939, fd940}, [r10]; +ld.shared.v2.f64 {fd943, fd944}, [r10+144]; +ld.shared.v2.f64 {fd947, fd948}, [r10+288]; +ld.shared.v2.f64 {fd951, fd952}, [r10+432]; +ld.shared.v2.f64 {fd955, fd956}, [r10+576]; +ld.shared.v2.f64 {fd959, fd960}, [r10+720]; +ld.shared.v2.f64 {fd963, fd964}, [r10+864]; +ld.shared.v2.f64 {fd967, fd968}, [r10+1008]; +ld.shared.v2.f64 {fd971, fd972}, [r10+1152]; +ld.shared.v2.f64 {fd975, fd976}, [r10+1296]; +ld.shared.v2.f64 {fd979, fd980}, [r10+1440]; +ld.shared.v2.f64 {fd983, fd984}, [r10+1584]; +ld.shared.v2.f64 {fd987, fd988}, [r10+1728]; +ld.shared.v2.f64 {fd991, fd992}, [r10+1872]; +ld.shared.v2.f64 {fd995, fd996}, [r10+2016]; +ld.shared.v2.f64 {fd999, fd1000}, [r10+2160]; +ld.shared.v2.f64 {fd1003, fd1004}, [r10+2304]; +ld.shared.v2.f64 {fd1007, fd1008}, [r10+2448]; +ld.shared.v2.f64 {fd1011, fd1012}, [r10+2592]; +ld.shared.v2.f64 {fd1015, fd1016}, [r10+2736]; +ld.shared.v2.f64 {fd1019, fd1020}, [r10+2880]; +ld.shared.v2.f64 {fd1023, fd1024}, [r10+3024]; +ld.shared.v2.f64 {fd1027, fd1028}, [r10+3168]; +ld.shared.v2.f64 {fd1031, fd1032}, [r10+3312]; 
+ld.shared.v2.f64 {fd1035, fd1036}, [r10+3456]; +ld.shared.v2.f64 {fd1039, fd1040}, [r10+3600]; +ld.shared.v2.f64 {fd1043, fd1044}, [r10+3744]; +add.f64 fd1047, fd975, fd1011; +add.f64 fd1048, fd939, fd1047; +mul.f64 fd1051, fd1047, 0d3FE0000000000000; +sub.f64 fd1052, fd939, fd1051; +add.f64 fd1451, fd976, fd1012; +sub.f64 fd1053, fd976, fd1012; +mul.f64 fd1054, fd1053, 0d3FEBB67AE8584CAA; +add.f64 fd1055, fd1054, fd1052; +sub.f64 fd1056, fd1052, fd1054; +add.f64 fd1450, fd940, fd1451; +mul.f64 fd1057, fd1451, 0d3FE0000000000000; +sub.f64 fd1058, fd940, fd1057; +sub.f64 fd1059, fd975, fd1011; +mul.f64 fd1060, fd1059, 0d3FEBB67AE8584CAA; +sub.f64 fd1061, fd1058, fd1060; +add.f64 fd1062, fd1060, fd1058; +add.f64 fd1063, fd987, fd1023; +add.f64 fd1064, fd951, fd1063; +mul.f64 fd1067, fd1063, 0d3FE0000000000000; +sub.f64 fd1068, fd951, fd1067; +add.f64 fd1449, fd988, fd1024; +sub.f64 fd1069, fd988, fd1024; +mul.f64 fd1070, fd1069, 0d3FEBB67AE8584CAA; +add.f64 fd1071, fd1070, fd1068; +sub.f64 fd1072, fd1068, fd1070; +add.f64 fd1448, fd952, fd1449; +mul.f64 fd1073, fd1449, 0d3FE0000000000000; +sub.f64 fd1074, fd952, fd1073; +sub.f64 fd1075, fd987, fd1023; +mul.f64 fd1076, fd1075, 0d3FEBB67AE8584CAA; +sub.f64 fd1077, fd1074, fd1076; +add.f64 fd1078, fd1076, fd1074; +add.f64 fd1079, fd999, fd1035; +add.f64 fd1080, fd963, fd1079; +mul.f64 fd1083, fd1079, 0d3FE0000000000000; +sub.f64 fd1084, fd963, fd1083; +add.f64 fd1447, fd1000, fd1036; +sub.f64 fd1085, fd1000, fd1036; +mul.f64 fd1086, fd1085, 0d3FEBB67AE8584CAA; +add.f64 fd1087, fd1086, fd1084; +sub.f64 fd1088, fd1084, fd1086; +add.f64 fd1446, fd964, fd1447; +mul.f64 fd1089, fd1447, 0d3FE0000000000000; +sub.f64 fd1090, fd964, fd1089; +sub.f64 fd1091, fd999, fd1035; +mul.f64 fd1092, fd1091, 0d3FEBB67AE8584CAA; +sub.f64 fd1093, fd1090, fd1092; +add.f64 fd1094, fd1092, fd1090; +mul.f64 fd1096, fd1077, 0dBFE491B7523C161D; +mul.f64 fd1445, fd1071, 0d3FE8836FA2CF5039; +sub.f64 fd1097, fd1445, fd1096; +mul.f64 fd1098, fd1077, 
0d3FE8836FA2CF5039; +fma.rn.f64 fd1099, fd1071, 0dBFE491B7523C161D, fd1098; +mul.f64 fd1101, fd1093, 0dBFEF838B8C811C17; +mul.f64 fd1444, fd1087, 0d3FC63A1A7E0B738A; +sub.f64 fd1102, fd1444, fd1101; +mul.f64 fd1103, fd1093, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1104, fd1087, 0dBFEF838B8C811C17, fd1103; +mul.f64 fd1106, fd1078, 0dBFEF838B8C811C17; +mul.f64 fd1443, fd1072, 0d3FC63A1A7E0B738A; +sub.f64 fd1107, fd1443, fd1106; +mul.f64 fd1108, fd1078, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1109, fd1072, 0dBFEF838B8C811C17, fd1108; +mul.f64 fd1441, fd1088, 0dBFEE11F642522D1C; +mul.f64 fd1442, fd1094, 0dBFD5E3A8748A0BF5; +sub.f64 fd1112, fd1441, fd1442; +mul.f64 fd1113, fd1094, 0dBFEE11F642522D1C; +fma.rn.f64 fd1114, fd1088, 0dBFD5E3A8748A0BF5, fd1113; +add.f64 fd1115, fd1064, fd1080; +mul.f64 fd1117, fd1115, 0d3FE0000000000000; +sub.f64 fd1118, fd1048, fd1117; +add.f64 fd1440, fd1448, fd1446; +sub.f64 fd1119, fd1448, fd1446; +mul.f64 fd1120, fd1119, 0d3FEBB67AE8584CAA; +mul.f64 fd1121, fd1440, 0d3FE0000000000000; +sub.f64 fd1122, fd1450, fd1121; +sub.f64 fd1123, fd1064, fd1080; +mul.f64 fd1124, fd1123, 0d3FEBB67AE8584CAA; +add.f64 fd1125, fd1097, fd1102; +mul.f64 fd1127, fd1125, 0d3FE0000000000000; +sub.f64 fd1128, fd1055, fd1127; +add.f64 fd1439, fd1099, fd1104; +sub.f64 fd1129, fd1099, fd1104; +mul.f64 fd1130, fd1129, 0d3FEBB67AE8584CAA; +mul.f64 fd1131, fd1439, 0d3FE0000000000000; +sub.f64 fd1132, fd1061, fd1131; +sub.f64 fd1133, fd1097, fd1102; +mul.f64 fd1134, fd1133, 0d3FEBB67AE8584CAA; +add.f64 fd1135, fd1107, fd1112; +mul.f64 fd1137, fd1135, 0d3FE0000000000000; +sub.f64 fd1138, fd1056, fd1137; +add.f64 fd1438, fd1109, fd1114; +sub.f64 fd1139, fd1109, fd1114; +mul.f64 fd1140, fd1139, 0d3FEBB67AE8584CAA; +mul.f64 fd1141, fd1438, 0d3FE0000000000000; +sub.f64 fd1142, fd1062, fd1141; +sub.f64 fd1143, fd1107, fd1112; +mul.f64 fd1144, fd1143, 0d3FEBB67AE8584CAA; +add.f64 fd1145, fd979, fd1015; +add.f64 fd1146, fd943, fd1145; +mul.f64 fd1149, fd1145, 0d3FE0000000000000; +sub.f64 
fd1150, fd943, fd1149; +add.f64 fd1437, fd980, fd1016; +sub.f64 fd1151, fd980, fd1016; +mul.f64 fd1152, fd1151, 0d3FEBB67AE8584CAA; +add.f64 fd1153, fd1152, fd1150; +sub.f64 fd1154, fd1150, fd1152; +add.f64 fd1436, fd944, fd1437; +mul.f64 fd1155, fd1437, 0d3FE0000000000000; +sub.f64 fd1156, fd944, fd1155; +sub.f64 fd1157, fd979, fd1015; +mul.f64 fd1158, fd1157, 0d3FEBB67AE8584CAA; +sub.f64 fd1159, fd1156, fd1158; +add.f64 fd1160, fd1158, fd1156; +add.f64 fd1161, fd991, fd1027; +add.f64 fd1162, fd955, fd1161; +mul.f64 fd1165, fd1161, 0d3FE0000000000000; +sub.f64 fd1166, fd955, fd1165; +add.f64 fd1435, fd992, fd1028; +sub.f64 fd1167, fd992, fd1028; +mul.f64 fd1168, fd1167, 0d3FEBB67AE8584CAA; +add.f64 fd1169, fd1168, fd1166; +sub.f64 fd1170, fd1166, fd1168; +add.f64 fd1434, fd956, fd1435; +mul.f64 fd1171, fd1435, 0d3FE0000000000000; +sub.f64 fd1172, fd956, fd1171; +sub.f64 fd1173, fd991, fd1027; +mul.f64 fd1174, fd1173, 0d3FEBB67AE8584CAA; +sub.f64 fd1175, fd1172, fd1174; +add.f64 fd1176, fd1174, fd1172; +add.f64 fd1177, fd1003, fd1039; +add.f64 fd1178, fd967, fd1177; +mul.f64 fd1181, fd1177, 0d3FE0000000000000; +sub.f64 fd1182, fd967, fd1181; +add.f64 fd1433, fd1004, fd1040; +sub.f64 fd1183, fd1004, fd1040; +mul.f64 fd1184, fd1183, 0d3FEBB67AE8584CAA; +add.f64 fd1185, fd1184, fd1182; +sub.f64 fd1186, fd1182, fd1184; +add.f64 fd1432, fd968, fd1433; +mul.f64 fd1187, fd1433, 0d3FE0000000000000; +sub.f64 fd1188, fd968, fd1187; +sub.f64 fd1189, fd1003, fd1039; +mul.f64 fd1190, fd1189, 0d3FEBB67AE8584CAA; +sub.f64 fd1191, fd1188, fd1190; +add.f64 fd1192, fd1190, fd1188; +mul.f64 fd1430, fd1169, 0d3FE8836FA2CF5039; +mul.f64 fd1431, fd1175, 0dBFE491B7523C161D; +sub.f64 fd1195, fd1430, fd1431; +mul.f64 fd1196, fd1175, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1197, fd1169, 0dBFE491B7523C161D, fd1196; +mul.f64 fd1428, fd1185, 0d3FC63A1A7E0B738A; +mul.f64 fd1429, fd1191, 0dBFEF838B8C811C17; +sub.f64 fd1200, fd1428, fd1429; +mul.f64 fd1201, fd1191, 0d3FC63A1A7E0B738A; +fma.rn.f64 
fd1202, fd1185, 0dBFEF838B8C811C17, fd1201; +mul.f64 fd1426, fd1170, 0d3FC63A1A7E0B738A; +mul.f64 fd1427, fd1176, 0dBFEF838B8C811C17; +sub.f64 fd1205, fd1426, fd1427; +mul.f64 fd1206, fd1176, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1207, fd1170, 0dBFEF838B8C811C17, fd1206; +mul.f64 fd1424, fd1186, 0dBFEE11F642522D1C; +mul.f64 fd1425, fd1192, 0dBFD5E3A8748A0BF5; +sub.f64 fd1210, fd1424, fd1425; +mul.f64 fd1211, fd1192, 0dBFEE11F642522D1C; +fma.rn.f64 fd1212, fd1186, 0dBFD5E3A8748A0BF5, fd1211; +add.f64 fd1213, fd1162, fd1178; +mul.f64 fd1215, fd1213, 0d3FE0000000000000; +sub.f64 fd1216, fd1146, fd1215; +add.f64 fd1423, fd1434, fd1432; +sub.f64 fd1217, fd1434, fd1432; +mul.f64 fd1218, fd1217, 0d3FEBB67AE8584CAA; +mul.f64 fd1219, fd1423, 0d3FE0000000000000; +sub.f64 fd1220, fd1436, fd1219; +sub.f64 fd1221, fd1162, fd1178; +mul.f64 fd1222, fd1221, 0d3FEBB67AE8584CAA; +add.f64 fd1223, fd1195, fd1200; +mul.f64 fd1225, fd1223, 0d3FE0000000000000; +sub.f64 fd1226, fd1153, fd1225; +add.f64 fd1422, fd1197, fd1202; +sub.f64 fd1227, fd1197, fd1202; +mul.f64 fd1228, fd1227, 0d3FEBB67AE8584CAA; +mul.f64 fd1229, fd1422, 0d3FE0000000000000; +sub.f64 fd1230, fd1159, fd1229; +sub.f64 fd1231, fd1195, fd1200; +mul.f64 fd1232, fd1231, 0d3FEBB67AE8584CAA; +add.f64 fd1233, fd1205, fd1210; +mul.f64 fd1235, fd1233, 0d3FE0000000000000; +sub.f64 fd1236, fd1154, fd1235; +add.f64 fd1421, fd1207, fd1212; +sub.f64 fd1237, fd1207, fd1212; +mul.f64 fd1238, fd1237, 0d3FEBB67AE8584CAA; +mul.f64 fd1239, fd1421, 0d3FE0000000000000; +sub.f64 fd1240, fd1160, fd1239; +sub.f64 fd1241, fd1205, fd1210; +mul.f64 fd1242, fd1241, 0d3FEBB67AE8584CAA; +add.f64 fd1243, fd983, fd1019; +add.f64 fd1244, fd947, fd1243; +mul.f64 fd1247, fd1243, 0d3FE0000000000000; +sub.f64 fd1248, fd947, fd1247; +add.f64 fd1420, fd984, fd1020; +sub.f64 fd1249, fd984, fd1020; +mul.f64 fd1250, fd1249, 0d3FEBB67AE8584CAA; +add.f64 fd1251, fd1250, fd1248; +sub.f64 fd1252, fd1248, fd1250; +add.f64 fd1419, fd948, fd1420; +mul.f64 fd1253, fd1420, 
0d3FE0000000000000; +sub.f64 fd1254, fd948, fd1253; +sub.f64 fd1255, fd983, fd1019; +mul.f64 fd1256, fd1255, 0d3FEBB67AE8584CAA; +sub.f64 fd1257, fd1254, fd1256; +add.f64 fd1258, fd1256, fd1254; +add.f64 fd1259, fd995, fd1031; +add.f64 fd1260, fd959, fd1259; +mul.f64 fd1263, fd1259, 0d3FE0000000000000; +sub.f64 fd1264, fd959, fd1263; +add.f64 fd1418, fd996, fd1032; +sub.f64 fd1265, fd996, fd1032; +mul.f64 fd1266, fd1265, 0d3FEBB67AE8584CAA; +add.f64 fd1267, fd1266, fd1264; +sub.f64 fd1268, fd1264, fd1266; +add.f64 fd1417, fd960, fd1418; +mul.f64 fd1269, fd1418, 0d3FE0000000000000; +sub.f64 fd1270, fd960, fd1269; +sub.f64 fd1271, fd995, fd1031; +mul.f64 fd1272, fd1271, 0d3FEBB67AE8584CAA; +sub.f64 fd1273, fd1270, fd1272; +add.f64 fd1274, fd1272, fd1270; +add.f64 fd1275, fd1007, fd1043; +add.f64 fd1276, fd971, fd1275; +mul.f64 fd1279, fd1275, 0d3FE0000000000000; +sub.f64 fd1280, fd971, fd1279; +add.f64 fd1416, fd1008, fd1044; +sub.f64 fd1281, fd1008, fd1044; +mul.f64 fd1282, fd1281, 0d3FEBB67AE8584CAA; +add.f64 fd1283, fd1282, fd1280; +sub.f64 fd1284, fd1280, fd1282; +add.f64 fd1415, fd972, fd1416; +mul.f64 fd1285, fd1416, 0d3FE0000000000000; +sub.f64 fd1286, fd972, fd1285; +sub.f64 fd1287, fd1007, fd1043; +mul.f64 fd1288, fd1287, 0d3FEBB67AE8584CAA; +sub.f64 fd1289, fd1286, fd1288; +add.f64 fd1290, fd1288, fd1286; +mul.f64 fd1413, fd1267, 0d3FE8836FA2CF5039; +mul.f64 fd1414, fd1273, 0dBFE491B7523C161D; +sub.f64 fd1293, fd1413, fd1414; +mul.f64 fd1294, fd1273, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1295, fd1267, 0dBFE491B7523C161D, fd1294; +mul.f64 fd1297, fd1289, 0dBFEF838B8C811C17; +mul.f64 fd1412, fd1283, 0d3FC63A1A7E0B738A; +sub.f64 fd1298, fd1412, fd1297; +mul.f64 fd1299, fd1289, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1300, fd1283, 0dBFEF838B8C811C17, fd1299; +mul.f64 fd1302, fd1274, 0dBFEF838B8C811C17; +mul.f64 fd1411, fd1268, 0d3FC63A1A7E0B738A; +sub.f64 fd1303, fd1411, fd1302; +mul.f64 fd1304, fd1274, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1305, fd1268, 0dBFEF838B8C811C17, 
fd1304; +mul.f64 fd1307, fd1290, 0dBFD5E3A8748A0BF5; +mul.f64 fd1410, fd1284, 0dBFEE11F642522D1C; +sub.f64 fd1308, fd1410, fd1307; +mul.f64 fd1309, fd1290, 0dBFEE11F642522D1C; +fma.rn.f64 fd1310, fd1284, 0dBFD5E3A8748A0BF5, fd1309; +add.f64 fd1311, fd1260, fd1276; +mul.f64 fd1313, fd1311, 0d3FE0000000000000; +sub.f64 fd1314, fd1244, fd1313; +add.f64 fd1409, fd1417, fd1415; +sub.f64 fd1315, fd1417, fd1415; +mul.f64 fd1316, fd1315, 0d3FEBB67AE8584CAA; +mul.f64 fd1317, fd1409, 0d3FE0000000000000; +sub.f64 fd1318, fd1419, fd1317; +sub.f64 fd1319, fd1260, fd1276; +mul.f64 fd1320, fd1319, 0d3FEBB67AE8584CAA; +add.f64 fd1321, fd1293, fd1298; +mul.f64 fd1323, fd1321, 0d3FE0000000000000; +sub.f64 fd1324, fd1251, fd1323; +add.f64 fd1408, fd1295, fd1300; +sub.f64 fd1325, fd1295, fd1300; +mul.f64 fd1326, fd1325, 0d3FEBB67AE8584CAA; +mul.f64 fd1327, fd1408, 0d3FE0000000000000; +sub.f64 fd1328, fd1257, fd1327; +sub.f64 fd1329, fd1293, fd1298; +mul.f64 fd1330, fd1329, 0d3FEBB67AE8584CAA; +add.f64 fd1331, fd1303, fd1308; +mul.f64 fd1333, fd1331, 0d3FE0000000000000; +sub.f64 fd1334, fd1252, fd1333; +add.f64 fd1407, fd1305, fd1310; +sub.f64 fd1335, fd1305, fd1310; +mul.f64 fd1336, fd1335, 0d3FEBB67AE8584CAA; +mul.f64 fd1337, fd1407, 0d3FE0000000000000; +sub.f64 fd1338, fd1258, fd1337; +sub.f64 fd1339, fd1303, fd1308; +mul.f64 fd1340, fd1339, 0d3FEBB67AE8584CAA; +add.f64 %1, fd1450, fd1440; +add.f64 %0, fd1048, fd1115; +add.f64 %3, fd1436, fd1423; +add.f64 %2, fd1146, fd1213; +add.f64 %5, fd1419, fd1409; +add.f64 %4, fd1244, fd1311; +add.f64 %7, fd1061, fd1439; +add.f64 %6, fd1055, fd1125; +add.f64 %9, fd1159, fd1422; +add.f64 %8, fd1153, fd1223; +add.f64 %11, fd1257, fd1408; +add.f64 %10, fd1251, fd1321; +add.f64 %13, fd1062, fd1438; +add.f64 %12, fd1056, fd1135; +add.f64 %15, fd1160, fd1421; +add.f64 %14, fd1154, fd1233; +add.f64 %17, fd1258, fd1407; +add.f64 %16, fd1252, fd1331; +sub.f64 %19, fd1122, fd1124; +add.f64 %18, fd1120, fd1118; +add.f64 %20, fd1218, fd1216; +sub.f64 %21, 
fd1220, fd1222; +add.f64 %22, fd1316, fd1314; +sub.f64 %23, fd1318, fd1320; +add.f64 %24, fd1130, fd1128; +sub.f64 %25, fd1132, fd1134; +add.f64 %26, fd1228, fd1226; +sub.f64 %27, fd1230, fd1232; +sub.f64 %29, fd1328, fd1330; +add.f64 %28, fd1326, fd1324; +sub.f64 %31, fd1142, fd1144; +add.f64 %30, fd1140, fd1138; +add.f64 %32, fd1238, fd1236; +sub.f64 %33, fd1240, fd1242; +add.f64 %34, fd1336, fd1334; +sub.f64 %35, fd1338, fd1340; +add.f64 %37, fd1124, fd1122; +sub.f64 %36, fd1118, fd1120; +add.f64 %39, fd1222, fd1220; +sub.f64 %38, fd1216, fd1218; +add.f64 %41, fd1320, fd1318; +sub.f64 %40, fd1314, fd1316; +add.f64 %43, fd1134, fd1132; +sub.f64 %42, fd1128, fd1130; +add.f64 %45, fd1232, fd1230; +sub.f64 %44, fd1226, fd1228; +add.f64 %47, fd1330, fd1328; +sub.f64 %46, fd1324, fd1326; +add.f64 %49, fd1144, fd1142; +sub.f64 %48, fd1138, fd1140; +add.f64 %51, fd1242, fd1240; +sub.f64 %50, fd1236, fd1238; +add.f64 %53, fd1340, fd1338; +sub.f64 %52, fd1334, fd1336; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_243), "d"(rmem[0].x), "d"(rmem[0].y), 
"d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<515, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1589>; +.reg .b64 rd<8>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1944, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1580, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1579, %57, fd1580; +mul.f64 fd119, fd1580, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 
fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1578, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1577, %63, fd1578; +mul.f64 fd135, fd1578, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1576, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1575, %69, fd1576; +mul.f64 fd151, fd1576, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd1574, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1574, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1572, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1573, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1572, fd1573; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1570, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1571, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1570, fd1571; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1568, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1569, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1568, fd1569; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 
fd1567, fd1577, fd1575; +sub.f64 fd183, fd1577, fd1575; +mul.f64 fd184, fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1566, fd1579, fd1567; +mul.f64 fd187, fd1567, 0d3FE0000000000000; +sub.f64 fd188, fd1579, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1565, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1564, fd123, fd1565; +mul.f64 fd203, fd1565, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1563, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1562, fd124, fd1563; +mul.f64 fd219, fd1563, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1559, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1557, %112, fd1559; +mul.f64 fd235, fd1559, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 
fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1554, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1552, %115, fd1554; +mul.f64 fd251, fd1554, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1549, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1547, %118, fd1549; +mul.f64 fd267, fd1549, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1546, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1546, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1545, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1545, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1543, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1544, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1543, fd1544; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1541, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1542, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1541, fd1542; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; 
+sub.f64 fd298, fd226, fd297; +add.f64 fd1540, fd1552, fd1547; +sub.f64 fd299, fd1552, fd1547; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1539, fd1557, fd1540; +mul.f64 fd303, fd1540, 0d3FE0000000000000; +sub.f64 fd304, fd1557, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1538, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1537, fd239, fd1538; +mul.f64 fd319, fd1538, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1536, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1535, fd240, fd1536; +mul.f64 fd335, fd1536, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1532, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1530, %121, fd1532; +mul.f64 fd351, fd1532, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; 
+add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1527, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1525, %124, fd1527; +mul.f64 fd367, fd1527, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1523, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1521, %126, fd1523; +mul.f64 fd383, fd1523, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1520, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1520, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1519, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1519, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1517, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1518, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1517, fd1518; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1515, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1516, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1515, fd1516; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 
fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1514, fd1525, fd1521; +sub.f64 fd415, fd1525, fd1521; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1513, fd1530, fd1514; +mul.f64 fd419, fd1514, 0d3FE0000000000000; +sub.f64 fd420, fd1530, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1512, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1511, fd355, fd1512; +mul.f64 fd435, fd1512, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1510, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1509, fd356, fd1510; +mul.f64 fd451, fd1510, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1537, 0dBFCD84D223638000; +mul.f64 fd1508, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1508, fd458; +mul.f64 fd460, fd1537, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1506, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1507, fd1511, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1506, fd1507; +mul.f64 fd465, fd1511, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1504, 
fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1505, fd1535, 0dBFDCB920325BAFA6; +sub.f64 fd469, fd1504, fd1505; +mul.f64 fd470, fd1535, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1502, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1503, fd1509, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1502, fd1503; +mul.f64 fd475, fd1509, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1500, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1501, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1500, fd1501; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1499, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1499, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1498, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1498, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1497, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1497, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1496, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1496, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1495, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1495, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1494, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1494, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1492, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1493, fd424, 
0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1492, fd1493; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1490, fd318, 0dBFADC528B5343A86; +mul.f64 fd1491, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1490, fd1491; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1488, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1489, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1488, fd1489; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1487, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1487, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1486, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1486, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1485, fd1539, fd1513; +sub.f64 fd543, fd1539, fd1513; +mul.f64 fd544, fd543, 0d3FEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1484, fd1566, fd1485; +mul.f64 fd547, fd1485, 0d3FE0000000000000; +sub.f64 fd548, fd1566, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0d3FEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1483, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0d3FEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, fd560; +add.f64 fd1482, fd1564, fd1483; +mul.f64 fd563, fd1483, 0d3FE0000000000000; +sub.f64 fd564, fd1564, fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 
fd566, fd565, 0d3FEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, fd573; +add.f64 fd1481, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1480, fd1562, fd1481; +mul.f64 fd579, fd1481, 0d3FE0000000000000; +sub.f64 fd580, fd1562, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0d3FEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1479, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1478, fd191, fd1479; +mul.f64 fd595, fd1479, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0d3FEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1477, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0d3FEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1476, fd207, fd1477; +mul.f64 fd611, fd1477, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0d3FEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 fd1475, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0d3FEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, 
fd624; +add.f64 fd1474, fd223, fd1475; +mul.f64 fd627, fd1475, 0d3FE0000000000000; +sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0d3FEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1473, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0d3FEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1472, fd192, fd1473; +mul.f64 fd643, fd1473, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0d3FEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1471, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0d3FEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1470, fd208, fd1471; +mul.f64 fd659, fd1471, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0d3FEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1469, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0d3FEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1468, fd224, fd1469; +mul.f64 fd675, fd1469, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, 
r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd686, fd682, fd1482; +mul.f64 fd1467, fd681, fd554; +sub.f64 fd687, fd1467, fd686; +mul.f64 fd688, fd681, fd1482; +fma.rn.f64 fd689, fd682, fd554, fd688; +mul.f64 fd691, fd682, fd682; +mul.f64 fd1466, fd681, fd681; +sub.f64 fd692, fd1466, fd691; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd696, fd694, fd1480; +mul.f64 fd1465, fd692, fd570; +sub.f64 fd697, fd1465, fd696; +mul.f64 fd698, fd692, fd1480; +fma.rn.f64 fd699, fd694, fd570, fd698; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1464, fd681, fd692; +sub.f64 fd702, fd1464, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd706, fd704, fd1478; +mul.f64 fd1463, fd702, fd586; +sub.f64 fd707, fd1463, fd706; +mul.f64 fd708, fd702, fd1478; +fma.rn.f64 fd709, fd704, fd586, fd708; +mul.f64 fd1461, fd681, fd702; +mul.f64 fd1462, fd682, fd704; +sub.f64 fd712, fd1461, fd1462; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd1459, fd712, fd602; +mul.f64 fd1460, fd714, fd1476; +sub.f64 fd717, fd1459, fd1460; +mul.f64 fd718, fd712, fd1476; +fma.rn.f64 fd719, fd714, fd602, fd718; +mul.f64 fd1457, fd681, fd712; +mul.f64 fd1458, fd682, fd714; +sub.f64 fd722, fd1457, fd1458; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd1455, fd722, fd618; +mul.f64 fd1456, fd724, fd1474; +sub.f64 fd727, fd1455, fd1456; +mul.f64 fd728, fd722, fd1474; +fma.rn.f64 fd729, fd724, fd618, fd728; +mul.f64 fd731, fd682, fd724; +mul.f64 fd1454, fd681, fd722; +sub.f64 fd732, fd1454, fd731; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd736, fd734, fd1472; +mul.f64 fd1453, fd732, fd634; +sub.f64 fd737, fd1453, fd736; +mul.f64 fd738, fd732, fd1472; +fma.rn.f64 fd739, fd734, fd634, fd738; +mul.f64 fd741, fd682, fd734; +mul.f64 fd1452, fd681, fd732; +sub.f64 fd742, 
fd1452, fd741; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, fd743; +mul.f64 fd746, fd744, fd1470; +mul.f64 fd1451, fd742, fd650; +sub.f64 fd747, fd1451, fd746; +mul.f64 fd748, fd742, fd1470; +fma.rn.f64 fd749, fd744, fd650, fd748; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1450, fd681, fd742; +sub.f64 fd752, fd1450, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd756, fd754, fd1468; +mul.f64 fd1449, fd752, fd666; +sub.f64 fd757, fd1449, fd756; +mul.f64 fd758, fd752, fd1468; +fma.rn.f64 fd759, fd754, fd666, fd758; +mul.f64 fd1447, fd681, fd752; +mul.f64 fd1448, fd682, fd754; +sub.f64 fd762, fd1447, fd1448; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd1445, fd762, fd545; +mul.f64 fd1446, fd764, fd551; +sub.f64 fd767, fd1445, fd1446; +mul.f64 fd768, fd762, fd551; +fma.rn.f64 fd769, fd764, fd545, fd768; +mul.f64 fd1443, fd681, fd762; +mul.f64 fd1444, fd682, fd764; +sub.f64 fd772, fd1443, fd1444; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd776, fd774, fd567; +mul.f64 fd1442, fd772, fd561; +sub.f64 fd777, fd1442, fd776; +mul.f64 fd778, fd772, fd567; +fma.rn.f64 fd779, fd774, fd561, fd778; +mul.f64 fd781, fd682, fd774; +mul.f64 fd1441, fd681, fd772; +sub.f64 fd782, fd1441, fd781; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd786, fd784, fd583; +mul.f64 fd1440, fd782, fd577; +sub.f64 fd787, fd1440, fd786; +mul.f64 fd788, fd782, fd583; +fma.rn.f64 fd789, fd784, fd577, fd788; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1439, fd681, fd782; +sub.f64 fd792, fd1439, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd796, fd794, fd599; +mul.f64 fd1438, fd792, fd593; +sub.f64 fd797, fd1438, fd796; +mul.f64 fd798, fd792, fd599; +fma.rn.f64 fd799, fd794, fd593, fd798; +mul.f64 fd801, fd682, fd794; +mul.f64 fd1437, fd681, fd792; +sub.f64 fd802, fd1437, fd801; +mul.f64 fd803, fd681, fd794; 
+fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd1435, fd802, fd609; +mul.f64 fd1436, fd804, fd615; +sub.f64 fd807, fd1435, fd1436; +mul.f64 fd808, fd802, fd615; +fma.rn.f64 fd809, fd804, fd609, fd808; +ld.global.v2.f64 {fd810, fd811}, [rd6+144]; +mul.f64 fd815, fd811, fd631; +mul.f64 fd1434, fd810, fd625; +sub.f64 fd816, fd1434, fd815; +mul.f64 fd817, fd810, fd631; +fma.rn.f64 fd818, fd811, fd625, fd817; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1433, fd681, fd810; +sub.f64 fd821, fd1433, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd1431, fd821, fd641; +mul.f64 fd1432, fd823, fd647; +sub.f64 fd826, fd1431, fd1432; +mul.f64 fd827, fd821, fd647; +fma.rn.f64 fd828, fd823, fd641, fd827; +mul.f64 fd1429, fd681, fd821; +mul.f64 fd1430, fd682, fd823; +sub.f64 fd831, fd1429, fd1430; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd1427, fd831, fd657; +mul.f64 fd1428, fd833, fd663; +sub.f64 fd836, fd1427, fd1428; +mul.f64 fd837, fd831, fd663; +fma.rn.f64 fd838, fd833, fd657, fd837; +mul.f64 fd840, fd682, fd833; +mul.f64 fd1426, fd681, fd831; +sub.f64 fd841, fd1426, fd840; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd845, fd843, fd679; +mul.f64 fd1425, fd841, fd673; +sub.f64 fd846, fd1425, fd845; +mul.f64 fd847, fd841, fd679; +fma.rn.f64 fd848, fd843, fd673, fd847; +mul.f64 fd850, fd682, fd843; +mul.f64 fd1424, fd681, fd841; +sub.f64 fd851, fd1424, fd850; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd855, fd853, fd552; +mul.f64 fd1423, fd851, fd546; +sub.f64 fd856, fd1423, fd855; +mul.f64 fd857, fd851, fd552; +fma.rn.f64 fd858, fd853, fd546, fd857; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1422, fd681, fd851; +sub.f64 fd861, fd1422, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd865, fd863, fd568; +mul.f64 fd1421, fd861, fd562; +sub.f64 fd866, fd1421, fd865; +mul.f64 fd867, fd861, fd568; 
+fma.rn.f64 fd868, fd863, fd562, fd867; +mul.f64 fd1419, fd681, fd861; +mul.f64 fd1420, fd682, fd863; +sub.f64 fd871, fd1419, fd1420; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, fd861, fd872; +mul.f64 fd1417, fd871, fd578; +mul.f64 fd1418, fd873, fd584; +sub.f64 fd876, fd1417, fd1418; +mul.f64 fd877, fd871, fd584; +fma.rn.f64 fd878, fd873, fd578, fd877; +mul.f64 fd1415, fd681, fd871; +mul.f64 fd1416, fd682, fd873; +sub.f64 fd881, fd1415, fd1416; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd1413, fd881, fd594; +mul.f64 fd1414, fd883, fd600; +sub.f64 fd886, fd1413, fd1414; +mul.f64 fd887, fd881, fd600; +fma.rn.f64 fd888, fd883, fd594, fd887; +mul.f64 fd890, fd682, fd883; +mul.f64 fd1412, fd681, fd881; +sub.f64 fd891, fd1412, fd890; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd895, fd893, fd616; +mul.f64 fd1411, fd891, fd610; +sub.f64 fd896, fd1411, fd895; +mul.f64 fd897, fd891, fd616; +fma.rn.f64 fd898, fd893, fd610, fd897; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1410, fd681, fd891; +sub.f64 fd901, fd1410, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd905, fd903, fd632; +mul.f64 fd1409, fd901, fd626; +sub.f64 fd906, fd1409, fd905; +mul.f64 fd907, fd901, fd632; +fma.rn.f64 fd908, fd903, fd626, fd907; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1408, fd681, fd901; +sub.f64 fd911, fd1408, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd1406, fd911, fd642; +mul.f64 fd1407, fd913, fd648; +sub.f64 fd916, fd1406, fd1407; +mul.f64 fd917, fd911, fd648; +fma.rn.f64 fd918, fd913, fd642, fd917; +mul.f64 fd1404, fd681, fd911; +mul.f64 fd1405, fd682, fd913; +sub.f64 fd921, fd1404, fd1405; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd1402, fd921, fd658; +mul.f64 fd1403, fd923, fd664; +sub.f64 fd926, fd1402, fd1403; +mul.f64 fd927, fd921, fd664; +fma.rn.f64 fd928, fd923, fd658, fd927; 
+mul.f64 fd1400, fd681, fd921; +mul.f64 fd1401, fd682, fd923; +sub.f64 fd931, fd1400, fd1401; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd935, fd933, fd680; +mul.f64 fd1399, fd931, fd674; +sub.f64 fd936, fd1399, fd935; +mul.f64 fd937, fd931, fd680; +fma.rn.f64 fd938, fd933, fd674, fd937; +mad.lo.s32 r8, r5, 1944, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +st.shared.f64 [r9], fd538; +st.shared.f64 [r9+8], fd687; +st.shared.f64 [r9+16], fd697; +st.shared.f64 [r9+24], fd707; +st.shared.f64 [r9+32], fd717; +st.shared.f64 [r9+40], fd727; +st.shared.f64 [r9+48], fd737; +st.shared.f64 [r9+56], fd747; +st.shared.f64 [r9+64], fd757; +st.shared.f64 [r9+72], fd767; +st.shared.f64 [r9+80], fd777; +st.shared.f64 [r9+88], fd787; +st.shared.f64 [r9+96], fd797; +st.shared.f64 [r9+104], fd807; +st.shared.f64 [r9+112], fd816; +st.shared.f64 [r9+120], fd826; +st.shared.f64 [r9+128], fd836; +st.shared.f64 [r9+136], fd846; +st.shared.f64 [r9+144], fd856; +st.shared.f64 [r9+152], fd866; +st.shared.f64 [r9+160], fd876; +st.shared.f64 [r9+168], fd886; +st.shared.f64 [r9+176], fd896; +st.shared.f64 [r9+184], fd906; +st.shared.f64 [r9+192], fd916; +st.shared.f64 [r9+200], fd926; +st.shared.f64 [r9+208], fd936; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.f64 fd939, [r10]; +ld.shared.f64 fd940, [r10+72]; +ld.shared.f64 fd941, [r10+144]; +ld.shared.f64 fd942, [r10+216]; +ld.shared.f64 fd943, [r10+288]; +ld.shared.f64 fd944, [r10+360]; +ld.shared.f64 fd945, [r10+432]; +ld.shared.f64 fd946, [r10+504]; +ld.shared.f64 fd947, [r10+576]; +ld.shared.f64 fd948, [r10+648]; +ld.shared.f64 fd949, [r10+720]; +ld.shared.f64 fd950, [r10+792]; +ld.shared.f64 fd951, [r10+864]; +ld.shared.f64 fd952, [r10+936]; +ld.shared.f64 fd953, [r10+1008]; +ld.shared.f64 fd954, [r10+1080]; +ld.shared.f64 fd955, [r10+1152]; +ld.shared.f64 fd956, [r10+1224]; +ld.shared.f64 fd957, [r10+1296]; +ld.shared.f64 fd958, [r10+1368]; +ld.shared.f64 fd959, [r10+1440]; 
+ld.shared.f64 fd960, [r10+1512]; +ld.shared.f64 fd961, [r10+1584]; +ld.shared.f64 fd962, [r10+1656]; +ld.shared.f64 fd963, [r10+1728]; +ld.shared.f64 fd964, [r10+1800]; +ld.shared.f64 fd965, [r10+1872]; +barrier.sync 0; +st.shared.f64 [r9], fd1484; +st.shared.f64 [r9+8], fd689; +st.shared.f64 [r9+16], fd699; +st.shared.f64 [r9+24], fd709; +st.shared.f64 [r9+32], fd719; +st.shared.f64 [r9+40], fd729; +st.shared.f64 [r9+48], fd739; +st.shared.f64 [r9+56], fd749; +st.shared.f64 [r9+64], fd759; +st.shared.f64 [r9+72], fd769; +st.shared.f64 [r9+80], fd779; +st.shared.f64 [r9+88], fd789; +st.shared.f64 [r9+96], fd799; +st.shared.f64 [r9+104], fd809; +st.shared.f64 [r9+112], fd818; +st.shared.f64 [r9+120], fd828; +st.shared.f64 [r9+128], fd838; +st.shared.f64 [r9+136], fd848; +st.shared.f64 [r9+144], fd858; +st.shared.f64 [r9+152], fd868; +st.shared.f64 [r9+160], fd878; +st.shared.f64 [r9+168], fd888; +st.shared.f64 [r9+176], fd898; +st.shared.f64 [r9+184], fd908; +st.shared.f64 [r9+192], fd918; +st.shared.f64 [r9+200], fd928; +st.shared.f64 [r9+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r10]; +ld.shared.f64 fd967, [r10+72]; +ld.shared.f64 fd968, [r10+144]; +ld.shared.f64 fd969, [r10+216]; +ld.shared.f64 fd970, [r10+288]; +ld.shared.f64 fd971, [r10+360]; +ld.shared.f64 fd972, [r10+432]; +ld.shared.f64 fd973, [r10+504]; +ld.shared.f64 fd974, [r10+576]; +ld.shared.f64 fd975, [r10+648]; +ld.shared.f64 fd976, [r10+720]; +ld.shared.f64 fd977, [r10+792]; +ld.shared.f64 fd978, [r10+864]; +ld.shared.f64 fd979, [r10+936]; +ld.shared.f64 fd980, [r10+1008]; +ld.shared.f64 fd981, [r10+1080]; +ld.shared.f64 fd982, [r10+1152]; +ld.shared.f64 fd983, [r10+1224]; +ld.shared.f64 fd984, [r10+1296]; +ld.shared.f64 fd985, [r10+1368]; +ld.shared.f64 fd986, [r10+1440]; +ld.shared.f64 fd987, [r10+1512]; +ld.shared.f64 fd988, [r10+1584]; +ld.shared.f64 fd989, [r10+1656]; +ld.shared.f64 fd990, [r10+1728]; +ld.shared.f64 fd991, [r10+1800]; +ld.shared.f64 fd992, [r10+1872]; +add.f64 
fd993, fd948, fd957; +add.f64 fd994, fd939, fd993; +mul.f64 fd997, fd993, 0d3FE0000000000000; +sub.f64 fd998, fd939, fd997; +add.f64 fd1398, fd975, fd984; +sub.f64 fd999, fd975, fd984; +mul.f64 fd1000, fd999, 0d3FEBB67AE8584CAA; +add.f64 fd1001, fd1000, fd998; +sub.f64 fd1002, fd998, fd1000; +add.f64 fd1397, fd966, fd1398; +mul.f64 fd1003, fd1398, 0d3FE0000000000000; +sub.f64 fd1004, fd966, fd1003; +sub.f64 fd1005, fd948, fd957; +mul.f64 fd1006, fd1005, 0d3FEBB67AE8584CAA; +sub.f64 fd1007, fd1004, fd1006; +add.f64 fd1008, fd1006, fd1004; +add.f64 fd1009, fd951, fd960; +add.f64 fd1010, fd942, fd1009; +mul.f64 fd1013, fd1009, 0d3FE0000000000000; +sub.f64 fd1014, fd942, fd1013; +add.f64 fd1396, fd978, fd987; +sub.f64 fd1015, fd978, fd987; +mul.f64 fd1016, fd1015, 0d3FEBB67AE8584CAA; +add.f64 fd1017, fd1016, fd1014; +sub.f64 fd1018, fd1014, fd1016; +add.f64 fd1395, fd969, fd1396; +mul.f64 fd1019, fd1396, 0d3FE0000000000000; +sub.f64 fd1020, fd969, fd1019; +sub.f64 fd1021, fd951, fd960; +mul.f64 fd1022, fd1021, 0d3FEBB67AE8584CAA; +sub.f64 fd1023, fd1020, fd1022; +add.f64 fd1024, fd1022, fd1020; +add.f64 fd1025, fd954, fd963; +add.f64 fd1026, fd945, fd1025; +mul.f64 fd1029, fd1025, 0d3FE0000000000000; +sub.f64 fd1030, fd945, fd1029; +add.f64 fd1394, fd981, fd990; +sub.f64 fd1031, fd981, fd990; +mul.f64 fd1032, fd1031, 0d3FEBB67AE8584CAA; +add.f64 fd1033, fd1032, fd1030; +sub.f64 fd1034, fd1030, fd1032; +add.f64 fd1393, fd972, fd1394; +mul.f64 fd1035, fd1394, 0d3FE0000000000000; +sub.f64 fd1036, fd972, fd1035; +sub.f64 fd1037, fd954, fd963; +mul.f64 fd1038, fd1037, 0d3FEBB67AE8584CAA; +sub.f64 fd1039, fd1036, fd1038; +add.f64 fd1040, fd1038, fd1036; +mul.f64 fd1042, fd1023, 0dBFE491B7523C161D; +mul.f64 fd1392, fd1017, 0d3FE8836FA2CF5039; +sub.f64 fd1043, fd1392, fd1042; +mul.f64 fd1044, fd1023, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1045, fd1017, 0dBFE491B7523C161D, fd1044; +mul.f64 fd1390, fd1033, 0d3FC63A1A7E0B738A; +mul.f64 fd1391, fd1039, 0dBFEF838B8C811C17; +sub.f64 
fd1048, fd1390, fd1391; +mul.f64 fd1049, fd1039, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1050, fd1033, 0dBFEF838B8C811C17, fd1049; +mul.f64 fd1388, fd1018, 0d3FC63A1A7E0B738A; +mul.f64 fd1389, fd1024, 0dBFEF838B8C811C17; +sub.f64 fd1053, fd1388, fd1389; +mul.f64 fd1054, fd1024, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1055, fd1018, 0dBFEF838B8C811C17, fd1054; +mul.f64 fd1386, fd1034, 0dBFEE11F642522D1C; +mul.f64 fd1387, fd1040, 0dBFD5E3A8748A0BF5; +sub.f64 fd1058, fd1386, fd1387; +mul.f64 fd1059, fd1040, 0dBFEE11F642522D1C; +fma.rn.f64 fd1060, fd1034, 0dBFD5E3A8748A0BF5, fd1059; +add.f64 fd1061, fd1010, fd1026; +mul.f64 fd1063, fd1061, 0d3FE0000000000000; +sub.f64 fd1064, fd994, fd1063; +add.f64 fd1385, fd1395, fd1393; +sub.f64 fd1065, fd1395, fd1393; +mul.f64 fd1066, fd1065, 0d3FEBB67AE8584CAA; +mul.f64 fd1067, fd1385, 0d3FE0000000000000; +sub.f64 fd1068, fd1397, fd1067; +sub.f64 fd1069, fd1010, fd1026; +mul.f64 fd1070, fd1069, 0d3FEBB67AE8584CAA; +add.f64 fd1071, fd1043, fd1048; +mul.f64 fd1073, fd1071, 0d3FE0000000000000; +sub.f64 fd1074, fd1001, fd1073; +add.f64 fd1384, fd1045, fd1050; +sub.f64 fd1075, fd1045, fd1050; +mul.f64 fd1076, fd1075, 0d3FEBB67AE8584CAA; +mul.f64 fd1077, fd1384, 0d3FE0000000000000; +sub.f64 fd1078, fd1007, fd1077; +sub.f64 fd1079, fd1043, fd1048; +mul.f64 fd1080, fd1079, 0d3FEBB67AE8584CAA; +add.f64 fd1081, fd1053, fd1058; +mul.f64 fd1083, fd1081, 0d3FE0000000000000; +sub.f64 fd1084, fd1002, fd1083; +add.f64 fd1383, fd1055, fd1060; +sub.f64 fd1085, fd1055, fd1060; +mul.f64 fd1086, fd1085, 0d3FEBB67AE8584CAA; +mul.f64 fd1087, fd1383, 0d3FE0000000000000; +sub.f64 fd1088, fd1008, fd1087; +sub.f64 fd1089, fd1053, fd1058; +mul.f64 fd1090, fd1089, 0d3FEBB67AE8584CAA; +add.f64 fd1091, fd949, fd958; +add.f64 fd1092, fd940, fd1091; +mul.f64 fd1095, fd1091, 0d3FE0000000000000; +sub.f64 fd1096, fd940, fd1095; +add.f64 fd1382, fd976, fd985; +sub.f64 fd1097, fd976, fd985; +mul.f64 fd1098, fd1097, 0d3FEBB67AE8584CAA; +add.f64 fd1099, fd1098, fd1096; +sub.f64 
fd1100, fd1096, fd1098; +add.f64 fd1381, fd967, fd1382; +mul.f64 fd1101, fd1382, 0d3FE0000000000000; +sub.f64 fd1102, fd967, fd1101; +sub.f64 fd1103, fd949, fd958; +mul.f64 fd1104, fd1103, 0d3FEBB67AE8584CAA; +sub.f64 fd1105, fd1102, fd1104; +add.f64 fd1106, fd1104, fd1102; +add.f64 fd1107, fd952, fd961; +add.f64 fd1108, fd943, fd1107; +mul.f64 fd1111, fd1107, 0d3FE0000000000000; +sub.f64 fd1112, fd943, fd1111; +add.f64 fd1380, fd979, fd988; +sub.f64 fd1113, fd979, fd988; +mul.f64 fd1114, fd1113, 0d3FEBB67AE8584CAA; +add.f64 fd1115, fd1114, fd1112; +sub.f64 fd1116, fd1112, fd1114; +add.f64 fd1379, fd970, fd1380; +mul.f64 fd1117, fd1380, 0d3FE0000000000000; +sub.f64 fd1118, fd970, fd1117; +sub.f64 fd1119, fd952, fd961; +mul.f64 fd1120, fd1119, 0d3FEBB67AE8584CAA; +sub.f64 fd1121, fd1118, fd1120; +add.f64 fd1122, fd1120, fd1118; +add.f64 fd1123, fd955, fd964; +add.f64 fd1124, fd946, fd1123; +mul.f64 fd1127, fd1123, 0d3FE0000000000000; +sub.f64 fd1128, fd946, fd1127; +add.f64 fd1378, fd982, fd991; +sub.f64 fd1129, fd982, fd991; +mul.f64 fd1130, fd1129, 0d3FEBB67AE8584CAA; +add.f64 fd1131, fd1130, fd1128; +sub.f64 fd1132, fd1128, fd1130; +add.f64 fd1377, fd973, fd1378; +mul.f64 fd1133, fd1378, 0d3FE0000000000000; +sub.f64 fd1134, fd973, fd1133; +sub.f64 fd1135, fd955, fd964; +mul.f64 fd1136, fd1135, 0d3FEBB67AE8584CAA; +sub.f64 fd1137, fd1134, fd1136; +add.f64 fd1138, fd1136, fd1134; +mul.f64 fd1375, fd1115, 0d3FE8836FA2CF5039; +mul.f64 fd1376, fd1121, 0dBFE491B7523C161D; +sub.f64 fd1141, fd1375, fd1376; +mul.f64 fd1142, fd1121, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1143, fd1115, 0dBFE491B7523C161D, fd1142; +mul.f64 fd1373, fd1131, 0d3FC63A1A7E0B738A; +mul.f64 fd1374, fd1137, 0dBFEF838B8C811C17; +sub.f64 fd1146, fd1373, fd1374; +mul.f64 fd1147, fd1137, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1148, fd1131, 0dBFEF838B8C811C17, fd1147; +mul.f64 fd1150, fd1122, 0dBFEF838B8C811C17; +mul.f64 fd1372, fd1116, 0d3FC63A1A7E0B738A; +sub.f64 fd1151, fd1372, fd1150; +mul.f64 fd1152, fd1122, 
0d3FC63A1A7E0B738A; +fma.rn.f64 fd1153, fd1116, 0dBFEF838B8C811C17, fd1152; +mul.f64 fd1155, fd1138, 0dBFD5E3A8748A0BF5; +mul.f64 fd1371, fd1132, 0dBFEE11F642522D1C; +sub.f64 fd1156, fd1371, fd1155; +mul.f64 fd1157, fd1138, 0dBFEE11F642522D1C; +fma.rn.f64 fd1158, fd1132, 0dBFD5E3A8748A0BF5, fd1157; +add.f64 fd1159, fd1108, fd1124; +mul.f64 fd1161, fd1159, 0d3FE0000000000000; +sub.f64 fd1162, fd1092, fd1161; +add.f64 fd1370, fd1379, fd1377; +sub.f64 fd1163, fd1379, fd1377; +mul.f64 fd1164, fd1163, 0d3FEBB67AE8584CAA; +mul.f64 fd1165, fd1370, 0d3FE0000000000000; +sub.f64 fd1166, fd1381, fd1165; +sub.f64 fd1167, fd1108, fd1124; +mul.f64 fd1168, fd1167, 0d3FEBB67AE8584CAA; +add.f64 fd1169, fd1141, fd1146; +mul.f64 fd1171, fd1169, 0d3FE0000000000000; +sub.f64 fd1172, fd1099, fd1171; +add.f64 fd1369, fd1143, fd1148; +sub.f64 fd1173, fd1143, fd1148; +mul.f64 fd1174, fd1173, 0d3FEBB67AE8584CAA; +mul.f64 fd1175, fd1369, 0d3FE0000000000000; +sub.f64 fd1176, fd1105, fd1175; +sub.f64 fd1177, fd1141, fd1146; +mul.f64 fd1178, fd1177, 0d3FEBB67AE8584CAA; +add.f64 fd1179, fd1151, fd1156; +mul.f64 fd1181, fd1179, 0d3FE0000000000000; +sub.f64 fd1182, fd1100, fd1181; +add.f64 fd1368, fd1153, fd1158; +sub.f64 fd1183, fd1153, fd1158; +mul.f64 fd1184, fd1183, 0d3FEBB67AE8584CAA; +mul.f64 fd1185, fd1368, 0d3FE0000000000000; +sub.f64 fd1186, fd1106, fd1185; +sub.f64 fd1187, fd1151, fd1156; +mul.f64 fd1188, fd1187, 0d3FEBB67AE8584CAA; +add.f64 fd1189, fd950, fd959; +add.f64 fd1190, fd941, fd1189; +mul.f64 fd1193, fd1189, 0d3FE0000000000000; +sub.f64 fd1194, fd941, fd1193; +add.f64 fd1367, fd977, fd986; +sub.f64 fd1195, fd977, fd986; +mul.f64 fd1196, fd1195, 0d3FEBB67AE8584CAA; +add.f64 fd1197, fd1196, fd1194; +sub.f64 fd1198, fd1194, fd1196; +add.f64 fd1366, fd968, fd1367; +mul.f64 fd1199, fd1367, 0d3FE0000000000000; +sub.f64 fd1200, fd968, fd1199; +sub.f64 fd1201, fd950, fd959; +mul.f64 fd1202, fd1201, 0d3FEBB67AE8584CAA; +sub.f64 fd1203, fd1200, fd1202; +add.f64 fd1204, fd1202, fd1200; 
+add.f64 fd1205, fd953, fd962; +add.f64 fd1206, fd944, fd1205; +mul.f64 fd1209, fd1205, 0d3FE0000000000000; +sub.f64 fd1210, fd944, fd1209; +add.f64 fd1365, fd980, fd989; +sub.f64 fd1211, fd980, fd989; +mul.f64 fd1212, fd1211, 0d3FEBB67AE8584CAA; +add.f64 fd1213, fd1212, fd1210; +sub.f64 fd1214, fd1210, fd1212; +add.f64 fd1364, fd971, fd1365; +mul.f64 fd1215, fd1365, 0d3FE0000000000000; +sub.f64 fd1216, fd971, fd1215; +sub.f64 fd1217, fd953, fd962; +mul.f64 fd1218, fd1217, 0d3FEBB67AE8584CAA; +sub.f64 fd1219, fd1216, fd1218; +add.f64 fd1220, fd1218, fd1216; +add.f64 fd1221, fd956, fd965; +add.f64 fd1222, fd947, fd1221; +mul.f64 fd1225, fd1221, 0d3FE0000000000000; +sub.f64 fd1226, fd947, fd1225; +add.f64 fd1363, fd983, fd992; +sub.f64 fd1227, fd983, fd992; +mul.f64 fd1228, fd1227, 0d3FEBB67AE8584CAA; +add.f64 fd1229, fd1228, fd1226; +sub.f64 fd1230, fd1226, fd1228; +add.f64 fd1362, fd974, fd1363; +mul.f64 fd1231, fd1363, 0d3FE0000000000000; +sub.f64 fd1232, fd974, fd1231; +sub.f64 fd1233, fd956, fd965; +mul.f64 fd1234, fd1233, 0d3FEBB67AE8584CAA; +sub.f64 fd1235, fd1232, fd1234; +add.f64 fd1236, fd1234, fd1232; +mul.f64 fd1238, fd1219, 0dBFE491B7523C161D; +mul.f64 fd1361, fd1213, 0d3FE8836FA2CF5039; +sub.f64 fd1239, fd1361, fd1238; +mul.f64 fd1240, fd1219, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1241, fd1213, 0dBFE491B7523C161D, fd1240; +mul.f64 fd1243, fd1235, 0dBFEF838B8C811C17; +mul.f64 fd1360, fd1229, 0d3FC63A1A7E0B738A; +sub.f64 fd1244, fd1360, fd1243; +mul.f64 fd1245, fd1235, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1246, fd1229, 0dBFEF838B8C811C17, fd1245; +mul.f64 fd1248, fd1220, 0dBFEF838B8C811C17; +mul.f64 fd1359, fd1214, 0d3FC63A1A7E0B738A; +sub.f64 fd1249, fd1359, fd1248; +mul.f64 fd1250, fd1220, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1251, fd1214, 0dBFEF838B8C811C17, fd1250; +mul.f64 fd1253, fd1236, 0dBFD5E3A8748A0BF5; +mul.f64 fd1358, fd1230, 0dBFEE11F642522D1C; +sub.f64 fd1254, fd1358, fd1253; +mul.f64 fd1255, fd1236, 0dBFEE11F642522D1C; +fma.rn.f64 fd1256, fd1230, 
0dBFD5E3A8748A0BF5, fd1255; +add.f64 fd1257, fd1206, fd1222; +mul.f64 fd1259, fd1257, 0d3FE0000000000000; +sub.f64 fd1260, fd1190, fd1259; +add.f64 fd1357, fd1364, fd1362; +sub.f64 fd1261, fd1364, fd1362; +mul.f64 fd1262, fd1261, 0d3FEBB67AE8584CAA; +mul.f64 fd1263, fd1357, 0d3FE0000000000000; +sub.f64 fd1264, fd1366, fd1263; +sub.f64 fd1265, fd1206, fd1222; +mul.f64 fd1266, fd1265, 0d3FEBB67AE8584CAA; +add.f64 fd1267, fd1239, fd1244; +mul.f64 fd1269, fd1267, 0d3FE0000000000000; +sub.f64 fd1270, fd1197, fd1269; +add.f64 fd1356, fd1241, fd1246; +sub.f64 fd1271, fd1241, fd1246; +mul.f64 fd1272, fd1271, 0d3FEBB67AE8584CAA; +mul.f64 fd1273, fd1356, 0d3FE0000000000000; +sub.f64 fd1274, fd1203, fd1273; +sub.f64 fd1275, fd1239, fd1244; +mul.f64 fd1276, fd1275, 0d3FEBB67AE8584CAA; +add.f64 fd1277, fd1249, fd1254; +mul.f64 fd1279, fd1277, 0d3FE0000000000000; +sub.f64 fd1280, fd1198, fd1279; +add.f64 fd1355, fd1251, fd1256; +sub.f64 fd1281, fd1251, fd1256; +mul.f64 fd1282, fd1281, 0d3FEBB67AE8584CAA; +mul.f64 fd1283, fd1355, 0d3FE0000000000000; +sub.f64 fd1284, fd1204, fd1283; +sub.f64 fd1285, fd1249, fd1254; +mul.f64 fd1582, fd1179, 0d3FE0000000000000; +sub.f64 fd1581, fd1100, fd1582; +mul.f64 fd1286, fd1285, 0d3FEBB67AE8584CAA; +add.f64 %0, fd994, fd1061; +mul.f64 fd1584, fd1385, 0d3FE0000000000000; +sub.f64 fd1583, fd1397, fd1584; +add.f64 %1, fd1397, fd1385; +mul.f64 fd1586, fd1169, 0d3FE0000000000000; +sub.f64 fd1585, fd1099, fd1586; +mul.f64 fd1588, fd1368, 0d3FE0000000000000; +sub.f64 fd1587, fd1106, fd1588; +add.f64 %2, fd1092, fd1159; +add.f64 %3, fd1381, fd1370; +add.f64 %4, fd1190, fd1257; +add.f64 %5, fd1366, fd1357; +add.f64 %7, fd1007, fd1384; +add.f64 %6, fd1001, fd1071; +add.f64 %9, fd1105, fd1369; +add.f64 %8, fd1099, fd1169; +add.f64 %11, fd1203, fd1356; +add.f64 %10, fd1197, fd1267; +add.f64 %13, fd1008, fd1383; +add.f64 %12, fd1002, fd1081; +add.f64 %15, fd1106, fd1368; +add.f64 %14, fd1100, fd1179; +add.f64 %17, fd1204, fd1355; +add.f64 %16, fd1198, 
fd1277; +add.f64 %18, fd1066, fd1064; +sub.f64 %19, fd1583, fd1070; +add.f64 %20, fd1164, fd1162; +sub.f64 %21, fd1166, fd1168; +sub.f64 %23, fd1264, fd1266; +add.f64 %22, fd1262, fd1260; +add.f64 %24, fd1076, fd1074; +sub.f64 %25, fd1078, fd1080; +add.f64 %26, fd1174, fd1585; +sub.f64 %27, fd1176, fd1178; +add.f64 %28, fd1272, fd1270; +sub.f64 %29, fd1274, fd1276; +sub.f64 %31, fd1088, fd1090; +add.f64 %30, fd1086, fd1084; +sub.f64 %33, fd1587, fd1188; +add.f64 %32, fd1184, fd1581; +sub.f64 %35, fd1284, fd1286; +add.f64 %34, fd1282, fd1280; +sub.f64 %36, fd1064, fd1066; +add.f64 %37, fd1070, fd1583; +sub.f64 %38, fd1162, fd1164; +add.f64 %39, fd1168, fd1166; +sub.f64 %40, fd1260, fd1262; +add.f64 %41, fd1266, fd1264; +add.f64 %43, fd1080, fd1078; +sub.f64 %42, fd1074, fd1076; +add.f64 %45, fd1178, fd1176; +sub.f64 %44, fd1585, fd1174; +add.f64 %47, fd1276, fd1274; +sub.f64 %46, fd1270, fd1272; +add.f64 %49, fd1090, fd1088; +sub.f64 %48, fd1084, fd1086; +add.f64 %51, fd1188, fd1587; +sub.f64 %50, fd1581, fd1184; +add.f64 %53, fd1286, fd1284; +sub.f64 %52, fd1280, fd1282; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), 
"=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_243), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<517, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<545>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 3888, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, 
fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 
0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 3888, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 
fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+432]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r13, r11, 144, r12; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r13], {fd214, fd213}; +fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r13+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r13+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; +fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r13+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r13+64], 
{fd222, fd221}; +fma.rn.f64 fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 [r13+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r13+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r13+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r13+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r14, r11, 7; +sub.s32 r15, r13, r14; +ld.shared.v2.f64 {fd231, fd232}, [r15]; +ld.shared.v2.f64 {fd235, fd236}, [r15+432]; +ld.shared.v2.f64 {fd239, fd240}, [r15+864]; +ld.shared.v2.f64 {fd243, fd244}, [r15+1296]; +ld.shared.v2.f64 {fd247, fd248}, [r15+1728]; +ld.shared.v2.f64 {fd251, fd252}, [r15+2160]; +ld.shared.v2.f64 {fd255, fd256}, [r15+2592]; +ld.shared.v2.f64 {fd259, fd260}, [r15+3024]; +ld.shared.v2.f64 {fd263, fd264}, [r15+3456]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, 
fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0d3FEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0dBFE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0dBFE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0dBFD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0dBFD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 
fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0d3FEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0d3FEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd381, fd350; +mul.f64 fd386, fd382, fd352; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd390, fd366; +mul.f64 fd394, fd392, fd368; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd398, fd341; +mul.f64 fd402, fd400, fd347; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd406, fd357; +mul.f64 fd410, fd408, fd363; +mul.f64 
fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+48]; +mul.f64 fd416, fd412, fd373; +mul.f64 fd417, fd413, fd379; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd421, fd342; +mul.f64 fd425, fd423, fd348; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd429, fd358; +mul.f64 fd433, fd431, fd364; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd437, fd374; +mul.f64 fd441, fd439, fd380; +mul.f64 fd442, fd437, fd380; +shl.b32 r19, r18, 4; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 1296, r20; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r21], {fd444, fd443}; +fma.rn.f64 fd445, fd382, fd350, fd387; +sub.f64 fd446, fd385, fd386; +st.shared.v2.f64 [r21+144], {fd446, fd445}; +fma.rn.f64 fd447, fd392, fd366, fd395; +sub.f64 fd448, fd393, fd394; +st.shared.v2.f64 [r21+288], {fd448, fd447}; +fma.rn.f64 fd449, fd400, fd341, fd403; +sub.f64 fd450, fd401, fd402; +st.shared.v2.f64 [r21+432], {fd450, fd449}; +fma.rn.f64 fd451, fd408, fd357, fd411; +sub.f64 fd452, fd409, fd410; +st.shared.v2.f64 [r21+576], {fd452, fd451}; +fma.rn.f64 fd453, fd413, fd373, fd418; +sub.f64 fd454, fd416, fd417; +st.shared.v2.f64 [r21+720], {fd454, fd453}; +fma.rn.f64 fd455, fd423, fd342, fd426; +sub.f64 fd456, fd424, fd425; +st.shared.v2.f64 [r21+864], {fd456, fd455}; +fma.rn.f64 fd457, fd431, fd358, fd434; +sub.f64 fd458, fd432, fd433; +st.shared.v2.f64 [r21+1008], {fd458, fd457}; +fma.rn.f64 fd459, fd439, fd374, fd442; +sub.f64 fd460, fd440, fd441; +st.shared.v2.f64 
[r21+1152], {fd460, fd459}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r15]; +ld.shared.v2.f64 {fd465, fd466}, [r15+432]; +ld.shared.v2.f64 {fd469, fd470}, [r15+864]; +ld.shared.v2.f64 {fd473, fd474}, [r15+1296]; +ld.shared.v2.f64 {fd477, fd478}, [r15+1728]; +ld.shared.v2.f64 {fd481, fd482}, [r15+2160]; +ld.shared.v2.f64 {fd485, fd486}, [r15+2592]; +ld.shared.v2.f64 {fd489, fd490}, [r15+3024]; +ld.shared.v2.f64 {fd493, fd494}, [r15+3456]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd474, fd486; +mul.f64 fd499, fd497, 0d3FE0000000000000; +sub.f64 fd500, fd461, fd499; +sub.f64 fd501, fd474, fd486; +mul.f64 fd502, fd501, 0d3FEBB67AE8584CAA; +mul.f64 fd503, fd498, 0d3FE0000000000000; +sub.f64 fd504, fd462, fd503; +sub.f64 fd505, fd473, fd485; +mul.f64 fd506, fd505, 0d3FEBB67AE8584CAA; +add.f64 fd507, fd477, fd489; +add.f64 fd508, fd478, fd490; +mul.f64 fd509, fd507, 0d3FE0000000000000; +sub.f64 fd510, fd465, fd509; +sub.f64 fd511, fd478, fd490; +mul.f64 fd512, fd511, 0d3FEBB67AE8584CAA; +mul.f64 fd513, fd508, 0d3FE0000000000000; +sub.f64 fd514, fd466, fd513; +sub.f64 fd515, fd477, fd489; +mul.f64 fd516, fd515, 0d3FEBB67AE8584CAA; +add.f64 fd517, fd481, fd493; +add.f64 fd518, fd482, fd494; +mul.f64 fd519, fd517, 0d3FE0000000000000; +sub.f64 fd520, fd469, fd519; +sub.f64 fd521, fd482, fd494; +mul.f64 fd522, fd521, 0d3FEBB67AE8584CAA; +mul.f64 fd523, fd518, 0d3FE0000000000000; +sub.f64 fd524, fd470, fd523; +sub.f64 fd525, fd481, fd493; +mul.f64 fd526, fd525, 0d3FEBB67AE8584CAA; +add.f64 %1, fd462, fd498; +add.f64 %0, fd461, fd497; +add.f64 %3, fd466, fd508; +add.f64 %2, fd465, fd507; +add.f64 %5, fd470, fd518; +add.f64 %4, fd469, fd517; +sub.f64 %7, fd504, fd506; +add.f64 %6, fd502, fd500; +sub.f64 %9, fd514, fd516; +add.f64 %8, fd512, fd510; +sub.f64 %11, fd524, fd526; +add.f64 %10, fd522, fd520; +add.f64 %13, fd506, fd504; +sub.f64 %12, fd500, fd502; +add.f64 %15, fd516, fd514; +sub.f64 %14, fd510, fd512; +add.f64 %17, fd526, fd524; +sub.f64 %16, fd520, 
fd522; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +/* cufftdx_private_function<518, double, 1>: generated inline-PTX step that transforms the nine complex doubles held in rmem[0..8] in place; all arithmetic lives inside the asm block. Operand map: %0-%17 are the write-only outputs rmem[0..8].x/.y; %18 is smem, used as a 32-bit base address for st.shared/ld.shared; %19 and %20 are the addresses of the tables lut_dp_9_243 and lut_dp_9_27, read with ld.global.v2.f64; %21-%43 are the incoming rmem values. The .y inputs of rmem[1], rmem[2], rmem[4], rmem[5] and rmem[7] are listed twice and the body references only the second copy of each pair (%25, %28, %33, %36, %41), so the duplicates must stay in place to keep the operand numbering valid. The body combines add/sub with multiplies by 0.5 (0d3FE0000000000000) and by about 0.8660254 (0d3FEBB67AE8584CAA, presumably sqrt(3)/2), multiplies intermediate results by factor pairs loaded from the two tables, and twice exchanges data through shared memory at offsets derived from %tid.x and %tid.y; each exchange is done as two scalar st.shared.f64/ld.shared.f64 rounds separated by barriers, for eight barrier.sync 0 instructions in total. NOTE(review): this looks like a radix-9 stage of a larger double-precision FFT, and because of the barriers it presumably has to be reached by every thread of the block together - confirm against the cuFFTDx generator output before editing by hand. */ template<> __forceinline__ __device__ void cufftdx_private_function<518, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<509>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1944, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, 
%34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; 
+sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; +sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, 
fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+432]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r12, r9, 1944, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 72, r12; +st.shared.f64 [r13], fd106; +st.shared.f64 [r13+8], fd159; +st.shared.f64 [r13+16], fd169; +st.shared.f64 [r13+24], fd179; +st.shared.f64 [r13+32], fd189; +st.shared.f64 [r13+40], fd198; +st.shared.f64 [r13+48], fd208; +st.shared.f64 [r13+56], fd218; +st.shared.f64 [r13+64], fd228; +barrier.sync 0; +shl.b32 r14, r11, 6; +sub.s32 r15, r13, r14; +ld.shared.f64 fd231, [r15]; +ld.shared.f64 fd232, [r15+216]; +ld.shared.f64 fd233, [r15+432]; 
+ld.shared.f64 fd234, [r15+648]; +ld.shared.f64 fd235, [r15+864]; +ld.shared.f64 fd236, [r15+1080]; +ld.shared.f64 fd237, [r15+1296]; +ld.shared.f64 fd238, [r15+1512]; +ld.shared.f64 fd239, [r15+1728]; +barrier.sync 0; +st.shared.f64 [r13], fd108; +st.shared.f64 [r13+8], fd161; +st.shared.f64 [r13+16], fd171; +st.shared.f64 [r13+24], fd181; +st.shared.f64 [r13+32], fd191; +st.shared.f64 [r13+40], fd200; +st.shared.f64 [r13+48], fd210; +st.shared.f64 [r13+56], fd220; +st.shared.f64 [r13+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r15]; +ld.shared.f64 fd241, [r15+216]; +ld.shared.f64 fd242, [r15+432]; +ld.shared.f64 fd243, [r15+648]; +ld.shared.f64 fd244, [r15+864]; +ld.shared.f64 fd245, [r15+1080]; +ld.shared.f64 fd246, [r15+1296]; +ld.shared.f64 fd247, [r15+1512]; +ld.shared.f64 fd248, [r15+1728]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, 
fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0d3FEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0dBFE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0dBFE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0dBFEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0dBFEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0dBFEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0dBFEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0dBFD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0dBFD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0d3FEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0d3FEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; 
+sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd365, fd334; +mul.f64 fd370, fd366, fd336; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd365, fd336; +fma.rn.f64 fd373, fd366, fd334, fd372; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd376, fd350; +mul.f64 fd380, fd378, fd352; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, fd352; +fma.rn.f64 fd383, fd378, fd350, fd382; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd386, fd325; +mul.f64 fd390, fd388, fd331; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd386, fd331; +fma.rn.f64 fd393, fd388, fd325, fd392; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, 
fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd396, fd341; +mul.f64 fd400, fd398, fd347; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd396, fd347; +fma.rn.f64 fd403, fd398, fd341, fd402; +ld.global.v2.f64 {fd404, fd405}, [rd11+48]; +mul.f64 fd408, fd404, fd357; +mul.f64 fd409, fd405, fd363; +sub.f64 fd410, fd408, fd409; +mul.f64 fd411, fd404, fd363; +fma.rn.f64 fd412, fd405, fd357, fd411; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd415, fd326; +mul.f64 fd419, fd417, fd332; +sub.f64 fd420, fd418, fd419; +mul.f64 fd421, fd415, fd332; +fma.rn.f64 fd422, fd417, fd326, fd421; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd425, fd342; +mul.f64 fd429, fd427, fd348; +sub.f64 fd430, fd428, fd429; +mul.f64 fd431, fd425, fd348; +fma.rn.f64 fd432, fd427, fd342, fd431; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd435, fd358; +mul.f64 fd439, fd437, fd364; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd435, fd364; +fma.rn.f64 fd442, fd437, fd358, fd441; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 648, r20; +st.shared.f64 [r21], fd318; +st.shared.f64 [r21+72], fd371; +st.shared.f64 [r21+144], fd381; +st.shared.f64 [r21+216], fd391; +st.shared.f64 [r21+288], fd401; +st.shared.f64 [r21+360], fd410; +st.shared.f64 [r21+432], fd420; +st.shared.f64 [r21+504], fd430; +st.shared.f64 [r21+576], fd440; +barrier.sync 0; +ld.shared.f64 fd443, [r15]; +ld.shared.f64 fd444, [r15+216]; +ld.shared.f64 fd445, [r15+432]; +ld.shared.f64 fd446, [r15+648]; +ld.shared.f64 fd447, [r15+864]; +ld.shared.f64 fd448, 
[r15+1080]; +ld.shared.f64 fd449, [r15+1296]; +ld.shared.f64 fd450, [r15+1512]; +ld.shared.f64 fd451, [r15+1728]; +barrier.sync 0; +st.shared.f64 [r21], fd320; +st.shared.f64 [r21+72], fd373; +st.shared.f64 [r21+144], fd383; +st.shared.f64 [r21+216], fd393; +st.shared.f64 [r21+288], fd403; +st.shared.f64 [r21+360], fd412; +st.shared.f64 [r21+432], fd422; +st.shared.f64 [r21+504], fd432; +st.shared.f64 [r21+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r15]; +ld.shared.f64 fd453, [r15+216]; +ld.shared.f64 fd454, [r15+432]; +ld.shared.f64 fd455, [r15+648]; +ld.shared.f64 fd456, [r15+864]; +ld.shared.f64 fd457, [r15+1080]; +ld.shared.f64 fd458, [r15+1296]; +ld.shared.f64 fd459, [r15+1512]; +ld.shared.f64 fd460, [r15+1728]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd455, fd458; +mul.f64 fd463, fd461, 0d3FE0000000000000; +sub.f64 fd464, fd443, fd463; +sub.f64 fd465, fd455, fd458; +mul.f64 fd466, fd465, 0d3FEBB67AE8584CAA; +mul.f64 fd467, fd462, 0d3FE0000000000000; +sub.f64 fd468, fd452, fd467; +sub.f64 fd469, fd446, fd449; +mul.f64 fd470, fd469, 0d3FEBB67AE8584CAA; +add.f64 fd471, fd447, fd450; +add.f64 fd472, fd456, fd459; +mul.f64 fd473, fd471, 0d3FE0000000000000; +sub.f64 fd474, fd444, fd473; +sub.f64 fd475, fd456, fd459; +mul.f64 fd476, fd475, 0d3FEBB67AE8584CAA; +mul.f64 fd477, fd472, 0d3FE0000000000000; +sub.f64 fd478, fd453, fd477; +sub.f64 fd479, fd447, fd450; +mul.f64 fd480, fd479, 0d3FEBB67AE8584CAA; +add.f64 fd481, fd448, fd451; +add.f64 fd482, fd457, fd460; +mul.f64 fd483, fd481, 0d3FE0000000000000; +sub.f64 fd484, fd445, fd483; +sub.f64 fd485, fd457, fd460; +mul.f64 fd486, fd485, 0d3FEBB67AE8584CAA; +mul.f64 fd487, fd482, 0d3FE0000000000000; +sub.f64 fd488, fd454, fd487; +sub.f64 fd489, fd448, fd451; +mul.f64 fd490, fd489, 0d3FEBB67AE8584CAA; +add.f64 %0, fd443, fd461; +add.f64 %1, fd452, fd462; +add.f64 %2, fd444, fd471; +add.f64 %3, fd453, fd472; +add.f64 %4, fd445, fd481; +add.f64 %5, fd454, fd482; +add.f64 %6, fd466, fd464; +sub.f64 %7, 
fd468, fd470; +add.f64 %8, fd476, fd474; +sub.f64 %9, fd478, fd480; +add.f64 %10, fd486, fd484; +sub.f64 %11, fd488, fd490; +sub.f64 %12, fd464, fd466; +add.f64 %13, fd470, fd468; +sub.f64 %14, fd474, fd476; +add.f64 %15, fd480, fd478; +sub.f64 %16, fd484, fd486; +add.f64 %17, fd490, fd488; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +/* cufftdx_private_function<520, double, 1>: generated inline-PTX step that transforms the three complex doubles held in rmem[0..2] in place; all arithmetic lives inside the asm block. Operand map: %0-%5 are the write-only outputs rmem[0..2].x/.y; %6 is smem, used as a 32-bit base address for st.shared/ld.shared; %7-%10 are the addresses of the tables lut_dp_3_243, lut_dp_3_81, lut_dp_3_27 and lut_dp_3_9, each read with ld.global.v2.f64; %11-%17 are the incoming rmem values. rmem[1].y is listed twice (%14, %15) and the body references only %15, so the duplicate must stay in place to keep the operand numbering valid. The body applies the same three-input add/sub pattern five times, using 0.5 (0d3FE0000000000000) and about 0.8660254 (0d3FEBB67AE8584CAA, presumably sqrt(3)/2). After each of the first four passes, two of the three results are multiplied by factor pairs loaded from one of the tables and all three are exchanged through shared memory in two scalar st.shared.f64/ld.shared.f64 rounds separated by barriers, for sixteen barrier.sync 0 instructions in total. Shared-memory offsets are derived from %tid.y and from %tid.x, which a multiply/shift sequence splits into r5 and r7 = tid.x - 81 * r5. NOTE(review): this looks like a radix-3 decomposition of a 243-point double-precision FFT, and because of the barriers it presumably has to be reached by every thread of the block together - confirm against the cuFFTDx generator output before editing by hand. */ template<> __forceinline__ __device__ void cufftdx_private_function<520, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<189>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1944, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %13, %16; +add.f64 fd14, %11, fd13; +add.f64 fd15, %15, %17; +add.f64 fd16, %12, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %11, fd17; +sub.f64 fd19, %15, %17; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %12, fd23; +sub.f64 fd25, %13, %16; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd4, r7, 16; 
+mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+1296]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd35; +st.shared.f64 [r9+16], fd44; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+648]; +ld.shared.f64 fd49, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+648]; +ld.shared.f64 fd52, [r11+1296]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd69, fd61; +mul.f64 fd74, fd70, fd67; +sub.f64 fd75, fd73, fd74; +mul.f64 fd76, fd69, fd67; +fma.rn.f64 fd77, fd70, fd61, fd76; +ld.global.v2.f64 {fd78, fd79}, [rd11+432]; +mul.f64 fd82, fd78, fd62; +mul.f64 fd83, fd79, fd68; +sub.f64 fd84, fd82, fd83; +mul.f64 fd85, fd78, fd68; +fma.rn.f64 
fd86, fd79, fd62, fd85; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd75; +st.shared.f64 [r17+48], fd84; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+648]; +ld.shared.f64 fd89, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+648]; +ld.shared.f64 fd92, [r11+1296]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0d3FEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0d3FEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd109, fd101; +mul.f64 fd114, fd110, fd107; +sub.f64 fd115, fd113, fd114; +mul.f64 fd116, fd109, fd107; +fma.rn.f64 fd117, fd110, fd101, fd116; +ld.global.v2.f64 {fd118, fd119}, [rd16+144]; +mul.f64 fd122, fd118, fd102; +mul.f64 fd123, fd119, fd108; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd118, fd108; +fma.rn.f64 fd126, fd119, fd102, fd125; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd115; +st.shared.f64 [r23+144], fd124; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+648]; +ld.shared.f64 fd129, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; 
+st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+648]; +ld.shared.f64 fd132, [r11+1296]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0d3FEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, fd129; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd149, fd141; +mul.f64 fd154, fd150, fd147; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd149, fd147; +fma.rn.f64 fd157, fd150, fd141, fd156; +ld.global.v2.f64 {fd158, fd159}, [rd21+48]; +mul.f64 fd162, fd158, fd142; +mul.f64 fd163, fd159, fd148; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd158, fd148; +fma.rn.f64 fd166, fd159, fd142, fd165; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd155; +st.shared.f64 [r33+432], fd164; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+648]; +ld.shared.f64 fd169, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+648]; +ld.shared.f64 fd172, [r11+1296]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd171, fd172; +mul.f64 fd175, fd173, 
0d3FE0000000000000; +sub.f64 fd176, fd167, fd175; +sub.f64 fd177, fd171, fd172; +mul.f64 fd178, fd177, 0d3FEBB67AE8584CAA; +mul.f64 fd179, fd174, 0d3FE0000000000000; +sub.f64 fd180, fd170, fd179; +sub.f64 fd181, fd168, fd169; +mul.f64 fd182, fd181, 0d3FEBB67AE8584CAA; +add.f64 %0, fd167, fd173; +add.f64 %1, fd170, fd174; +add.f64 %2, fd178, fd176; +sub.f64 %3, fd180, fd182; +sub.f64 %4, fd176, fd178; +add.f64 %5, fd182, fd180; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<519, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<213>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 3888, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %13, %16; +add.f64 fd14, %15, %17; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %11, fd15; +sub.f64 fd17, %15, %17; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %12, fd21; +sub.f64 fd23, %13, %16; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 3888, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+1296]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 
fd41, %12, fd14; +add.f64 fd42, %11, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r9+16], {fd44, fd43}; +fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r9+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+1296]; +ld.shared.v2.f64 {fd55, fd56}, [r11+2592]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0d3FEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd73, fd65; +mul.f64 fd78, fd74, fd71; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+432]; +mul.f64 fd84, fd80, fd66; +mul.f64 fd85, fd81, fd72; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd74, fd65, fd79; +sub.f64 fd90, fd77, fd78; +st.shared.v2.f64 [r17+48], {fd90, fd89}; +fma.rn.f64 fd91, fd81, fd66, fd86; +sub.f64 fd92, fd84, fd85; +st.shared.v2.f64 [r17+96], {fd92, fd91}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+1296]; +ld.shared.v2.f64 {fd101, fd102}, [r11+2592]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 
0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd119, fd111; +mul.f64 fd124, fd120, fd117; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+144]; +mul.f64 fd130, fd126, fd112; +mul.f64 fd131, fd127, fd118; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, fd120, fd111, fd125; +sub.f64 fd136, fd123, fd124; +st.shared.v2.f64 [r23+144], {fd136, fd135}; +fma.rn.f64 fd137, fd127, fd112, fd132; +sub.f64 fd138, fd130, fd131; +st.shared.v2.f64 [r23+288], {fd138, fd137}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+1296]; +ld.shared.v2.f64 {fd147, fd148}, [r11+2592]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0d3FEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 
r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd165, fd157; +mul.f64 fd170, fd166, fd163; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+48]; +mul.f64 fd176, fd172, fd158; +mul.f64 fd177, fd173, fd164; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd166, fd157, fd171; +sub.f64 fd182, fd169, fd170; +st.shared.v2.f64 [r33+432], {fd182, fd181}; +fma.rn.f64 fd183, fd173, fd158, fd178; +sub.f64 fd184, fd176, fd177; +st.shared.v2.f64 [r33+864], {fd184, fd183}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+1296]; +ld.shared.v2.f64 {fd193, fd194}, [r11+2592]; +add.f64 fd197, fd189, fd193; +add.f64 fd198, fd190, fd194; +mul.f64 fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 0d3FEBB67AE8584CAA; +mul.f64 fd203, fd198, 0d3FE0000000000000; +sub.f64 fd204, fd186, fd203; +sub.f64 fd205, fd189, fd193; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +add.f64 %1, fd186, fd198; +add.f64 %0, fd185, fd197; +sub.f64 %3, fd204, fd206; +add.f64 %2, fd202, fd200; +add.f64 %5, fd206, fd204; +sub.f64 %4, fd200, fd202; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..0d77132528ec0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_243_fp64_inv.hpp.inc @@ -0,0 +1,4120 @@ +#ifndef CUFFTDX_FFT_243_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_243_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<687, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1622>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 3888, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1621, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1620, %57, fd1621; +mul.f64 fd119, fd1621, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1619, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1618, %63, fd1619; +mul.f64 fd135, fd1619, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, 
fd145; +add.f64 fd1617, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1616, %69, fd1617; +mul.f64 fd151, fd1617, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1615, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1615, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd1613, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1614, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1613, fd1614; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1611, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1612, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1611, fd1612; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1609, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1610, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1609, fd1610; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1608, fd1618, fd1616; +sub.f64 fd183, fd1618, fd1616; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1607, fd1620, fd1608; +mul.f64 fd187, fd1608, 0d3FE0000000000000; +sub.f64 fd188, fd1620, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1606, fd161, fd166; +sub.f64 fd199, fd161, fd166; 
+mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1605, fd123, fd1606; +mul.f64 fd203, fd1606, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1604, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1603, fd124, fd1604; +mul.f64 fd219, fd1604, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1600, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1598, %112, fd1600; +mul.f64 fd235, fd1600, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1595, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1593, %115, fd1595; +mul.f64 fd251, fd1595, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 
0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1590, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1588, %118, fd1590; +mul.f64 fd267, fd1590, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1587, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1587, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1586, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1586, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1584, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1585, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1584, fd1585; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1582, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1583, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1582, fd1583; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1581, fd1593, fd1588; +sub.f64 fd299, fd1593, fd1588; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1580, fd1598, fd1581; +mul.f64 fd303, fd1581, 0d3FE0000000000000; +sub.f64 fd304, fd1598, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1579, 
fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1578, fd239, fd1579; +mul.f64 fd319, fd1579, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1577, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1576, fd240, fd1577; +mul.f64 fd335, fd1577, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1573, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1571, %121, fd1573; +mul.f64 fd351, fd1573, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1568, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1566, %124, fd1568; +mul.f64 fd367, fd1568, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 
fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1564, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1562, %126, fd1564; +mul.f64 fd383, fd1564, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1561, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1561, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1560, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1560, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1558, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1559, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1558, fd1559; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1556, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1557, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1556, fd1557; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1555, fd1566, fd1562; +sub.f64 fd415, fd1566, fd1562; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1554, fd1571, fd1555; +mul.f64 fd419, fd1555, 0d3FE0000000000000; +sub.f64 fd420, fd1571, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 
fd430, fd349, fd429; +add.f64 fd1553, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1552, fd355, fd1553; +mul.f64 fd435, fd1553, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1551, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1550, fd356, fd1551; +mul.f64 fd451, fd1551, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1578, 0d3FCD84D223638000; +mul.f64 fd1549, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1549, fd458; +mul.f64 fd460, fd1578, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1547, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1548, fd1552, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd1547, fd1548; +mul.f64 fd465, fd1552, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1545, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1546, fd1576, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1545, fd1546; +mul.f64 fd470, fd1576, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1543, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1544, fd1550, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1543, fd1544; +mul.f64 fd475, fd1550, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1541, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1542, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1541, fd1542; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; 
+fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1540, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1540, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1539, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1539, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1538, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1538, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1537, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1537, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1536, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1536, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1535, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1535, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1533, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1534, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1533, fd1534; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1531, fd318, 0dBFADC528B5343A86; +mul.f64 fd1532, fd324, 0d3FEFF223F3635CE3; +sub.f64 fd519, fd1531, fd1532; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1529, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1530, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1529, fd1530; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 
fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1528, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1528, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1527, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1527, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1526, fd1580, fd1554; +sub.f64 fd541, fd1580, fd1554; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1526, 0d3FE0000000000000; +sub.f64 fd546, fd1607, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0dBFEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1525, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1524, fd1605, fd1525; +mul.f64 fd561, fd1525, 0d3FE0000000000000; +sub.f64 fd562, fd1605, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0dBFEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1523, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1522, fd1603, fd1523; +mul.f64 fd577, fd1523, 0d3FE0000000000000; +sub.f64 fd578, fd1603, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0dBFEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, 
fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1521, fd481, fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 0dBFEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1520, fd191, fd1521; +mul.f64 fd593, fd1521, 0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 0dBFEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1519, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0dBFEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1518, fd207, fd1519; +mul.f64 fd609, fd1519, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0dBFEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1517, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0dBFEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1516, fd223, fd1517; +mul.f64 fd625, fd1517, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0dBFEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1515, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0dBFEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1514, fd192, fd1515; +mul.f64 fd641, fd1515, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 
fd643, fd509, fd514; +mul.f64 fd644, fd643, 0dBFEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1513, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 fd654, fd653, 0dBFEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 fd1512, fd208, fd1513; +mul.f64 fd657, fd1513, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 0dBFEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1511, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0dBFEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1510, fd224, fd1511; +mul.f64 fd673, fd1511, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0dBFEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 3888, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd683, fd1524, fd680; +mul.f64 fd685, fd679, fd1524; +mul.f64 fd1508, fd679, fd679; +mul.f64 fd1509, fd680, fd680; +sub.f64 fd688, fd1508, fd1509; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd691, fd1522, fd690; +mul.f64 fd693, fd688, fd1522; +mul.f64 fd695, fd680, fd690; +mul.f64 fd1507, fd679, fd688; +sub.f64 fd696, fd1507, fd695; +mul.f64 fd1506, fd568, fd690; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd699, fd1520, fd698; 
+mul.f64 fd701, fd696, fd1520; +mul.f64 fd1504, fd679, fd696; +mul.f64 fd1505, fd680, fd698; +sub.f64 fd704, fd1504, fd1505; +mul.f64 fd1503, fd584, fd698; +mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd707, fd1518, fd706; +mul.f64 fd709, fd704, fd1518; +mul.f64 fd711, fd680, fd706; +mul.f64 fd1502, fd679, fd704; +sub.f64 fd712, fd1502, fd711; +mul.f64 fd1501, fd600, fd706; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd715, fd1516, fd714; +mul.f64 fd717, fd712, fd1516; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1500, fd679, fd712; +sub.f64 fd720, fd1500, fd719; +mul.f64 fd1499, fd616, fd714; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd723, fd1514, fd722; +mul.f64 fd725, fd720, fd1514; +mul.f64 fd1497, fd679, fd720; +mul.f64 fd1498, fd680, fd722; +sub.f64 fd728, fd1497, fd1498; +mul.f64 fd1496, fd632, fd722; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd731, fd1512, fd730; +mul.f64 fd733, fd728, fd1512; +mul.f64 fd735, fd680, fd730; +mul.f64 fd1495, fd679, fd728; +sub.f64 fd736, fd1495, fd735; +mul.f64 fd1494, fd648, fd730; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd739, fd1510, fd738; +mul.f64 fd741, fd736, fd1510; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1493, fd679, fd736; +sub.f64 fd744, fd1493, fd743; +mul.f64 fd1492, fd664, fd738; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd747, fd549, fd746; +mul.f64 fd749, fd744, fd549; +mul.f64 fd1490, fd679, fd744; +mul.f64 fd1491, fd680, fd746; +sub.f64 fd752, fd1490, fd1491; +mul.f64 fd1489, fd543, fd746; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd755, fd565, fd754; +mul.f64 fd757, fd752, fd565; +mul.f64 fd759, fd680, fd754; +mul.f64 fd1488, fd679, fd752; +sub.f64 fd760, fd1488, fd759; +mul.f64 fd1487, fd559, fd754; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, 
fd752, fd761; +mul.f64 fd763, fd581, fd762; +mul.f64 fd765, fd760, fd581; +mul.f64 fd1485, fd679, fd760; +mul.f64 fd1486, fd680, fd762; +sub.f64 fd768, fd1485, fd1486; +mul.f64 fd1484, fd575, fd762; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd771, fd597, fd770; +mul.f64 fd773, fd768, fd597; +mul.f64 fd775, fd680, fd770; +mul.f64 fd1483, fd679, fd768; +sub.f64 fd776, fd1483, fd775; +mul.f64 fd1482, fd591, fd770; +mul.f64 fd777, fd679, fd770; +fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd613, fd778; +mul.f64 fd780, fd607, fd778; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+144]; +mul.f64 fd786, fd629, fd783; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1481, fd679, fd782; +sub.f64 fd791, fd1481, fd790; +mul.f64 fd1480, fd623, fd783; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd794, fd645, fd793; +mul.f64 fd796, fd791, fd645; +mul.f64 fd1478, fd679, fd791; +mul.f64 fd1479, fd680, fd793; +sub.f64 fd799, fd1478, fd1479; +mul.f64 fd1477, fd639, fd793; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd802, fd661, fd801; +mul.f64 fd804, fd799, fd661; +mul.f64 fd806, fd680, fd801; +mul.f64 fd1476, fd679, fd799; +sub.f64 fd807, fd1476, fd806; +mul.f64 fd1475, fd655, fd801; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd810, fd677, fd809; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1474, fd679, fd807; +sub.f64 fd815, fd1474, fd814; +mul.f64 fd1473, fd671, fd809; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd818, fd550, fd817; +mul.f64 fd820, fd815, fd550; +mul.f64 fd1471, fd679, fd815; +mul.f64 fd1472, fd680, fd817; +sub.f64 fd823, fd1471, fd1472; +mul.f64 fd1470, fd544, fd817; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd826, fd566, fd825; +mul.f64 fd828, fd823, fd566; +mul.f64 
fd830, fd680, fd825; +mul.f64 fd1469, fd679, fd823; +sub.f64 fd831, fd1469, fd830; +mul.f64 fd1468, fd560, fd825; +mul.f64 fd832, fd679, fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; +mul.f64 fd834, fd582, fd833; +mul.f64 fd836, fd831, fd582; +mul.f64 fd1466, fd679, fd831; +mul.f64 fd1467, fd680, fd833; +sub.f64 fd839, fd1466, fd1467; +mul.f64 fd1465, fd576, fd833; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd842, fd598, fd841; +mul.f64 fd844, fd839, fd598; +mul.f64 fd1463, fd679, fd839; +mul.f64 fd1464, fd680, fd841; +sub.f64 fd847, fd1463, fd1464; +mul.f64 fd1462, fd592, fd841; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd850, fd614, fd849; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1461, fd679, fd847; +sub.f64 fd855, fd1461, fd854; +mul.f64 fd1460, fd608, fd849; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd858, fd630, fd857; +mul.f64 fd860, fd855, fd630; +mul.f64 fd1458, fd679, fd855; +mul.f64 fd1459, fd680, fd857; +sub.f64 fd863, fd1458, fd1459; +mul.f64 fd1457, fd624, fd857; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd866, fd646, fd865; +mul.f64 fd868, fd863, fd646; +mul.f64 fd870, fd680, fd865; +mul.f64 fd1456, fd679, fd863; +sub.f64 fd871, fd1456, fd870; +mul.f64 fd1455, fd640, fd865; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd874, fd662, fd873; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1454, fd679, fd871; +sub.f64 fd879, fd1454, fd878; +mul.f64 fd1453, fd656, fd873; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1452, fd552, fd680; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd678, fd881; +mul.f64 fd883, fd672, fd881; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r9, r7, 432, r8; +add.f64 fd885, fd1607, fd1526; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r9], {fd886, fd885}; +fma.rn.f64 fd887, 
fd679, fd552, fd683; +sub.f64 fd888, fd685, fd1452; +st.shared.v2.f64 [r9+16], {fd887, fd888}; +fma.rn.f64 fd889, fd688, fd568, fd691; +sub.f64 fd890, fd693, fd1506; +st.shared.v2.f64 [r9+32], {fd889, fd890}; +fma.rn.f64 fd891, fd696, fd584, fd699; +sub.f64 fd892, fd701, fd1503; +st.shared.v2.f64 [r9+48], {fd891, fd892}; +fma.rn.f64 fd893, fd704, fd600, fd707; +sub.f64 fd894, fd709, fd1501; +st.shared.v2.f64 [r9+64], {fd893, fd894}; +fma.rn.f64 fd895, fd712, fd616, fd715; +sub.f64 fd896, fd717, fd1499; +st.shared.v2.f64 [r9+80], {fd895, fd896}; +fma.rn.f64 fd897, fd720, fd632, fd723; +sub.f64 fd898, fd725, fd1496; +st.shared.v2.f64 [r9+96], {fd897, fd898}; +sub.f64 fd899, fd733, fd1494; +fma.rn.f64 fd900, fd728, fd648, fd731; +st.shared.v2.f64 [r9+112], {fd900, fd899}; +fma.rn.f64 fd901, fd736, fd664, fd739; +sub.f64 fd902, fd741, fd1492; +st.shared.v2.f64 [r9+128], {fd901, fd902}; +fma.rn.f64 fd903, fd744, fd543, fd747; +sub.f64 fd904, fd749, fd1489; +st.shared.v2.f64 [r9+144], {fd903, fd904}; +fma.rn.f64 fd905, fd752, fd559, fd755; +sub.f64 fd906, fd757, fd1487; +st.shared.v2.f64 [r9+160], {fd905, fd906}; +fma.rn.f64 fd907, fd760, fd575, fd763; +sub.f64 fd908, fd765, fd1484; +st.shared.v2.f64 [r9+176], {fd907, fd908}; +fma.rn.f64 fd909, fd768, fd591, fd771; +sub.f64 fd910, fd773, fd1482; +st.shared.v2.f64 [r9+192], {fd909, fd910}; +fma.rn.f64 fd911, fd776, fd607, fd779; +sub.f64 fd912, fd781, fd780; +st.shared.v2.f64 [r9+208], {fd911, fd912}; +fma.rn.f64 fd913, fd782, fd623, fd786; +sub.f64 fd914, fd788, fd1480; +st.shared.v2.f64 [r9+224], {fd913, fd914}; +fma.rn.f64 fd915, fd791, fd639, fd794; +sub.f64 fd916, fd796, fd1477; +st.shared.v2.f64 [r9+240], {fd915, fd916}; +fma.rn.f64 fd917, fd799, fd655, fd802; +sub.f64 fd918, fd804, fd1475; +st.shared.v2.f64 [r9+256], {fd917, fd918}; +fma.rn.f64 fd919, fd807, fd671, fd810; +sub.f64 fd920, fd812, fd1473; +st.shared.v2.f64 [r9+272], {fd919, fd920}; +fma.rn.f64 fd921, fd815, fd544, fd818; +sub.f64 fd922, fd820, fd1470; 
+st.shared.v2.f64 [r9+288], {fd921, fd922}; +fma.rn.f64 fd923, fd823, fd560, fd826; +sub.f64 fd924, fd828, fd1468; +st.shared.v2.f64 [r9+304], {fd923, fd924}; +sub.f64 fd925, fd836, fd1465; +fma.rn.f64 fd926, fd831, fd576, fd834; +st.shared.v2.f64 [r9+320], {fd926, fd925}; +fma.rn.f64 fd927, fd839, fd592, fd842; +sub.f64 fd928, fd844, fd1462; +st.shared.v2.f64 [r9+336], {fd927, fd928}; +fma.rn.f64 fd929, fd847, fd608, fd850; +sub.f64 fd930, fd852, fd1460; +st.shared.v2.f64 [r9+352], {fd929, fd930}; +fma.rn.f64 fd931, fd855, fd624, fd858; +sub.f64 fd932, fd860, fd1457; +st.shared.v2.f64 [r9+368], {fd931, fd932}; +fma.rn.f64 fd933, fd863, fd640, fd866; +sub.f64 fd934, fd868, fd1455; +st.shared.v2.f64 [r9+384], {fd933, fd934}; +fma.rn.f64 fd935, fd871, fd656, fd874; +sub.f64 fd936, fd876, fd1453; +st.shared.v2.f64 [r9+400], {fd935, fd936}; +fma.rn.f64 fd937, fd879, fd672, fd882; +sub.f64 fd938, fd884, fd883; +st.shared.v2.f64 [r9+416], {fd937, fd938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -416, r9; +ld.shared.v2.f64 {fd939, fd940}, [r10]; +ld.shared.v2.f64 {fd943, fd944}, [r10+144]; +ld.shared.v2.f64 {fd947, fd948}, [r10+288]; +ld.shared.v2.f64 {fd951, fd952}, [r10+432]; +ld.shared.v2.f64 {fd955, fd956}, [r10+576]; +ld.shared.v2.f64 {fd959, fd960}, [r10+720]; +ld.shared.v2.f64 {fd963, fd964}, [r10+864]; +ld.shared.v2.f64 {fd967, fd968}, [r10+1008]; +ld.shared.v2.f64 {fd971, fd972}, [r10+1152]; +ld.shared.v2.f64 {fd975, fd976}, [r10+1296]; +ld.shared.v2.f64 {fd979, fd980}, [r10+1440]; +ld.shared.v2.f64 {fd983, fd984}, [r10+1584]; +ld.shared.v2.f64 {fd987, fd988}, [r10+1728]; +ld.shared.v2.f64 {fd991, fd992}, [r10+1872]; +ld.shared.v2.f64 {fd995, fd996}, [r10+2016]; +ld.shared.v2.f64 {fd999, fd1000}, [r10+2160]; +ld.shared.v2.f64 {fd1003, fd1004}, [r10+2304]; +ld.shared.v2.f64 {fd1007, fd1008}, [r10+2448]; +ld.shared.v2.f64 {fd1011, fd1012}, [r10+2592]; +ld.shared.v2.f64 {fd1015, fd1016}, [r10+2736]; +ld.shared.v2.f64 {fd1019, fd1020}, [r10+2880]; +ld.shared.v2.f64 
{fd1023, fd1024}, [r10+3024]; +ld.shared.v2.f64 {fd1027, fd1028}, [r10+3168]; +ld.shared.v2.f64 {fd1031, fd1032}, [r10+3312]; +ld.shared.v2.f64 {fd1035, fd1036}, [r10+3456]; +ld.shared.v2.f64 {fd1039, fd1040}, [r10+3600]; +ld.shared.v2.f64 {fd1043, fd1044}, [r10+3744]; +add.f64 fd1047, fd975, fd1011; +add.f64 fd1048, fd939, fd1047; +mul.f64 fd1051, fd1047, 0d3FE0000000000000; +sub.f64 fd1052, fd939, fd1051; +add.f64 fd1451, fd976, fd1012; +sub.f64 fd1053, fd976, fd1012; +mul.f64 fd1054, fd1053, 0dBFEBB67AE8584CAA; +add.f64 fd1055, fd1054, fd1052; +sub.f64 fd1056, fd1052, fd1054; +add.f64 fd1450, fd940, fd1451; +mul.f64 fd1057, fd1451, 0d3FE0000000000000; +sub.f64 fd1058, fd940, fd1057; +sub.f64 fd1059, fd975, fd1011; +mul.f64 fd1060, fd1059, 0dBFEBB67AE8584CAA; +sub.f64 fd1061, fd1058, fd1060; +add.f64 fd1062, fd1060, fd1058; +add.f64 fd1063, fd987, fd1023; +add.f64 fd1064, fd951, fd1063; +mul.f64 fd1067, fd1063, 0d3FE0000000000000; +sub.f64 fd1068, fd951, fd1067; +add.f64 fd1449, fd988, fd1024; +sub.f64 fd1069, fd988, fd1024; +mul.f64 fd1070, fd1069, 0dBFEBB67AE8584CAA; +add.f64 fd1071, fd1070, fd1068; +sub.f64 fd1072, fd1068, fd1070; +add.f64 fd1448, fd952, fd1449; +mul.f64 fd1073, fd1449, 0d3FE0000000000000; +sub.f64 fd1074, fd952, fd1073; +sub.f64 fd1075, fd987, fd1023; +mul.f64 fd1076, fd1075, 0dBFEBB67AE8584CAA; +sub.f64 fd1077, fd1074, fd1076; +add.f64 fd1078, fd1076, fd1074; +add.f64 fd1079, fd999, fd1035; +add.f64 fd1080, fd963, fd1079; +mul.f64 fd1083, fd1079, 0d3FE0000000000000; +sub.f64 fd1084, fd963, fd1083; +add.f64 fd1447, fd1000, fd1036; +sub.f64 fd1085, fd1000, fd1036; +mul.f64 fd1086, fd1085, 0dBFEBB67AE8584CAA; +add.f64 fd1087, fd1086, fd1084; +sub.f64 fd1088, fd1084, fd1086; +add.f64 fd1446, fd964, fd1447; +mul.f64 fd1089, fd1447, 0d3FE0000000000000; +sub.f64 fd1090, fd964, fd1089; +sub.f64 fd1091, fd999, fd1035; +mul.f64 fd1092, fd1091, 0dBFEBB67AE8584CAA; +sub.f64 fd1093, fd1090, fd1092; +add.f64 fd1094, fd1092, fd1090; +mul.f64 fd1096, 
fd1077, 0d3FE491B7523C161D; +mul.f64 fd1445, fd1071, 0d3FE8836FA2CF5039; +sub.f64 fd1097, fd1445, fd1096; +mul.f64 fd1098, fd1077, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1099, fd1071, 0d3FE491B7523C161D, fd1098; +mul.f64 fd1101, fd1093, 0d3FEF838B8C811C17; +mul.f64 fd1444, fd1087, 0d3FC63A1A7E0B738A; +sub.f64 fd1102, fd1444, fd1101; +mul.f64 fd1103, fd1093, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1104, fd1087, 0d3FEF838B8C811C17, fd1103; +mul.f64 fd1106, fd1078, 0d3FEF838B8C811C17; +mul.f64 fd1443, fd1072, 0d3FC63A1A7E0B738A; +sub.f64 fd1107, fd1443, fd1106; +mul.f64 fd1108, fd1078, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1109, fd1072, 0d3FEF838B8C811C17, fd1108; +mul.f64 fd1441, fd1088, 0dBFEE11F642522D1C; +mul.f64 fd1442, fd1094, 0d3FD5E3A8748A0BF5; +sub.f64 fd1112, fd1441, fd1442; +mul.f64 fd1113, fd1094, 0dBFEE11F642522D1C; +fma.rn.f64 fd1114, fd1088, 0d3FD5E3A8748A0BF5, fd1113; +add.f64 fd1115, fd1064, fd1080; +mul.f64 fd1117, fd1115, 0d3FE0000000000000; +sub.f64 fd1118, fd1048, fd1117; +add.f64 fd1440, fd1448, fd1446; +sub.f64 fd1119, fd1448, fd1446; +mul.f64 fd1120, fd1119, 0dBFEBB67AE8584CAA; +mul.f64 fd1121, fd1440, 0d3FE0000000000000; +sub.f64 fd1122, fd1450, fd1121; +sub.f64 fd1123, fd1064, fd1080; +mul.f64 fd1124, fd1123, 0dBFEBB67AE8584CAA; +add.f64 fd1125, fd1097, fd1102; +mul.f64 fd1127, fd1125, 0d3FE0000000000000; +sub.f64 fd1128, fd1055, fd1127; +add.f64 fd1439, fd1099, fd1104; +sub.f64 fd1129, fd1099, fd1104; +mul.f64 fd1130, fd1129, 0dBFEBB67AE8584CAA; +mul.f64 fd1131, fd1439, 0d3FE0000000000000; +sub.f64 fd1132, fd1061, fd1131; +sub.f64 fd1133, fd1097, fd1102; +mul.f64 fd1134, fd1133, 0dBFEBB67AE8584CAA; +add.f64 fd1135, fd1107, fd1112; +mul.f64 fd1137, fd1135, 0d3FE0000000000000; +sub.f64 fd1138, fd1056, fd1137; +add.f64 fd1438, fd1109, fd1114; +sub.f64 fd1139, fd1109, fd1114; +mul.f64 fd1140, fd1139, 0dBFEBB67AE8584CAA; +mul.f64 fd1141, fd1438, 0d3FE0000000000000; +sub.f64 fd1142, fd1062, fd1141; +sub.f64 fd1143, fd1107, fd1112; +mul.f64 fd1144, fd1143, 
0dBFEBB67AE8584CAA; +add.f64 fd1145, fd979, fd1015; +add.f64 fd1146, fd943, fd1145; +mul.f64 fd1149, fd1145, 0d3FE0000000000000; +sub.f64 fd1150, fd943, fd1149; +add.f64 fd1437, fd980, fd1016; +sub.f64 fd1151, fd980, fd1016; +mul.f64 fd1152, fd1151, 0dBFEBB67AE8584CAA; +add.f64 fd1153, fd1152, fd1150; +sub.f64 fd1154, fd1150, fd1152; +add.f64 fd1436, fd944, fd1437; +mul.f64 fd1155, fd1437, 0d3FE0000000000000; +sub.f64 fd1156, fd944, fd1155; +sub.f64 fd1157, fd979, fd1015; +mul.f64 fd1158, fd1157, 0dBFEBB67AE8584CAA; +sub.f64 fd1159, fd1156, fd1158; +add.f64 fd1160, fd1158, fd1156; +add.f64 fd1161, fd991, fd1027; +add.f64 fd1162, fd955, fd1161; +mul.f64 fd1165, fd1161, 0d3FE0000000000000; +sub.f64 fd1166, fd955, fd1165; +add.f64 fd1435, fd992, fd1028; +sub.f64 fd1167, fd992, fd1028; +mul.f64 fd1168, fd1167, 0dBFEBB67AE8584CAA; +add.f64 fd1169, fd1168, fd1166; +sub.f64 fd1170, fd1166, fd1168; +add.f64 fd1434, fd956, fd1435; +mul.f64 fd1171, fd1435, 0d3FE0000000000000; +sub.f64 fd1172, fd956, fd1171; +sub.f64 fd1173, fd991, fd1027; +mul.f64 fd1174, fd1173, 0dBFEBB67AE8584CAA; +sub.f64 fd1175, fd1172, fd1174; +add.f64 fd1176, fd1174, fd1172; +add.f64 fd1177, fd1003, fd1039; +add.f64 fd1178, fd967, fd1177; +mul.f64 fd1181, fd1177, 0d3FE0000000000000; +sub.f64 fd1182, fd967, fd1181; +add.f64 fd1433, fd1004, fd1040; +sub.f64 fd1183, fd1004, fd1040; +mul.f64 fd1184, fd1183, 0dBFEBB67AE8584CAA; +add.f64 fd1185, fd1184, fd1182; +sub.f64 fd1186, fd1182, fd1184; +add.f64 fd1432, fd968, fd1433; +mul.f64 fd1187, fd1433, 0d3FE0000000000000; +sub.f64 fd1188, fd968, fd1187; +sub.f64 fd1189, fd1003, fd1039; +mul.f64 fd1190, fd1189, 0dBFEBB67AE8584CAA; +sub.f64 fd1191, fd1188, fd1190; +add.f64 fd1192, fd1190, fd1188; +mul.f64 fd1430, fd1169, 0d3FE8836FA2CF5039; +mul.f64 fd1431, fd1175, 0d3FE491B7523C161D; +sub.f64 fd1195, fd1430, fd1431; +mul.f64 fd1196, fd1175, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1197, fd1169, 0d3FE491B7523C161D, fd1196; +mul.f64 fd1428, fd1185, 0d3FC63A1A7E0B738A; 
+mul.f64 fd1429, fd1191, 0d3FEF838B8C811C17; +sub.f64 fd1200, fd1428, fd1429; +mul.f64 fd1201, fd1191, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1202, fd1185, 0d3FEF838B8C811C17, fd1201; +mul.f64 fd1426, fd1170, 0d3FC63A1A7E0B738A; +mul.f64 fd1427, fd1176, 0d3FEF838B8C811C17; +sub.f64 fd1205, fd1426, fd1427; +mul.f64 fd1206, fd1176, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1207, fd1170, 0d3FEF838B8C811C17, fd1206; +mul.f64 fd1424, fd1186, 0dBFEE11F642522D1C; +mul.f64 fd1425, fd1192, 0d3FD5E3A8748A0BF5; +sub.f64 fd1210, fd1424, fd1425; +mul.f64 fd1211, fd1192, 0dBFEE11F642522D1C; +fma.rn.f64 fd1212, fd1186, 0d3FD5E3A8748A0BF5, fd1211; +add.f64 fd1213, fd1162, fd1178; +mul.f64 fd1215, fd1213, 0d3FE0000000000000; +sub.f64 fd1216, fd1146, fd1215; +add.f64 fd1423, fd1434, fd1432; +sub.f64 fd1217, fd1434, fd1432; +mul.f64 fd1218, fd1217, 0dBFEBB67AE8584CAA; +mul.f64 fd1219, fd1423, 0d3FE0000000000000; +sub.f64 fd1220, fd1436, fd1219; +sub.f64 fd1221, fd1162, fd1178; +mul.f64 fd1222, fd1221, 0dBFEBB67AE8584CAA; +add.f64 fd1223, fd1195, fd1200; +mul.f64 fd1225, fd1223, 0d3FE0000000000000; +sub.f64 fd1226, fd1153, fd1225; +add.f64 fd1422, fd1197, fd1202; +sub.f64 fd1227, fd1197, fd1202; +mul.f64 fd1228, fd1227, 0dBFEBB67AE8584CAA; +mul.f64 fd1229, fd1422, 0d3FE0000000000000; +sub.f64 fd1230, fd1159, fd1229; +sub.f64 fd1231, fd1195, fd1200; +mul.f64 fd1232, fd1231, 0dBFEBB67AE8584CAA; +add.f64 fd1233, fd1205, fd1210; +mul.f64 fd1235, fd1233, 0d3FE0000000000000; +sub.f64 fd1236, fd1154, fd1235; +add.f64 fd1421, fd1207, fd1212; +sub.f64 fd1237, fd1207, fd1212; +mul.f64 fd1238, fd1237, 0dBFEBB67AE8584CAA; +mul.f64 fd1239, fd1421, 0d3FE0000000000000; +sub.f64 fd1240, fd1160, fd1239; +sub.f64 fd1241, fd1205, fd1210; +mul.f64 fd1242, fd1241, 0dBFEBB67AE8584CAA; +add.f64 fd1243, fd983, fd1019; +add.f64 fd1244, fd947, fd1243; +mul.f64 fd1247, fd1243, 0d3FE0000000000000; +sub.f64 fd1248, fd947, fd1247; +add.f64 fd1420, fd984, fd1020; +sub.f64 fd1249, fd984, fd1020; +mul.f64 fd1250, fd1249, 
0dBFEBB67AE8584CAA; +add.f64 fd1251, fd1250, fd1248; +sub.f64 fd1252, fd1248, fd1250; +add.f64 fd1419, fd948, fd1420; +mul.f64 fd1253, fd1420, 0d3FE0000000000000; +sub.f64 fd1254, fd948, fd1253; +sub.f64 fd1255, fd983, fd1019; +mul.f64 fd1256, fd1255, 0dBFEBB67AE8584CAA; +sub.f64 fd1257, fd1254, fd1256; +add.f64 fd1258, fd1256, fd1254; +add.f64 fd1259, fd995, fd1031; +add.f64 fd1260, fd959, fd1259; +mul.f64 fd1263, fd1259, 0d3FE0000000000000; +sub.f64 fd1264, fd959, fd1263; +add.f64 fd1418, fd996, fd1032; +sub.f64 fd1265, fd996, fd1032; +mul.f64 fd1266, fd1265, 0dBFEBB67AE8584CAA; +add.f64 fd1267, fd1266, fd1264; +sub.f64 fd1268, fd1264, fd1266; +add.f64 fd1417, fd960, fd1418; +mul.f64 fd1269, fd1418, 0d3FE0000000000000; +sub.f64 fd1270, fd960, fd1269; +sub.f64 fd1271, fd995, fd1031; +mul.f64 fd1272, fd1271, 0dBFEBB67AE8584CAA; +sub.f64 fd1273, fd1270, fd1272; +add.f64 fd1274, fd1272, fd1270; +add.f64 fd1275, fd1007, fd1043; +add.f64 fd1276, fd971, fd1275; +mul.f64 fd1279, fd1275, 0d3FE0000000000000; +sub.f64 fd1280, fd971, fd1279; +add.f64 fd1416, fd1008, fd1044; +sub.f64 fd1281, fd1008, fd1044; +mul.f64 fd1282, fd1281, 0dBFEBB67AE8584CAA; +add.f64 fd1283, fd1282, fd1280; +sub.f64 fd1284, fd1280, fd1282; +add.f64 fd1415, fd972, fd1416; +mul.f64 fd1285, fd1416, 0d3FE0000000000000; +sub.f64 fd1286, fd972, fd1285; +sub.f64 fd1287, fd1007, fd1043; +mul.f64 fd1288, fd1287, 0dBFEBB67AE8584CAA; +sub.f64 fd1289, fd1286, fd1288; +add.f64 fd1290, fd1288, fd1286; +mul.f64 fd1413, fd1267, 0d3FE8836FA2CF5039; +mul.f64 fd1414, fd1273, 0d3FE491B7523C161D; +sub.f64 fd1293, fd1413, fd1414; +mul.f64 fd1294, fd1273, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1295, fd1267, 0d3FE491B7523C161D, fd1294; +mul.f64 fd1297, fd1289, 0d3FEF838B8C811C17; +mul.f64 fd1412, fd1283, 0d3FC63A1A7E0B738A; +sub.f64 fd1298, fd1412, fd1297; +mul.f64 fd1299, fd1289, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1300, fd1283, 0d3FEF838B8C811C17, fd1299; +mul.f64 fd1302, fd1274, 0d3FEF838B8C811C17; +mul.f64 fd1411, fd1268, 
0d3FC63A1A7E0B738A; +sub.f64 fd1303, fd1411, fd1302; +mul.f64 fd1304, fd1274, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1305, fd1268, 0d3FEF838B8C811C17, fd1304; +mul.f64 fd1307, fd1290, 0d3FD5E3A8748A0BF5; +mul.f64 fd1410, fd1284, 0dBFEE11F642522D1C; +sub.f64 fd1308, fd1410, fd1307; +mul.f64 fd1309, fd1290, 0dBFEE11F642522D1C; +fma.rn.f64 fd1310, fd1284, 0d3FD5E3A8748A0BF5, fd1309; +add.f64 fd1311, fd1260, fd1276; +mul.f64 fd1313, fd1311, 0d3FE0000000000000; +sub.f64 fd1314, fd1244, fd1313; +add.f64 fd1409, fd1417, fd1415; +sub.f64 fd1315, fd1417, fd1415; +mul.f64 fd1316, fd1315, 0dBFEBB67AE8584CAA; +mul.f64 fd1317, fd1409, 0d3FE0000000000000; +sub.f64 fd1318, fd1419, fd1317; +sub.f64 fd1319, fd1260, fd1276; +mul.f64 fd1320, fd1319, 0dBFEBB67AE8584CAA; +add.f64 fd1321, fd1293, fd1298; +mul.f64 fd1323, fd1321, 0d3FE0000000000000; +sub.f64 fd1324, fd1251, fd1323; +add.f64 fd1408, fd1295, fd1300; +sub.f64 fd1325, fd1295, fd1300; +mul.f64 fd1326, fd1325, 0dBFEBB67AE8584CAA; +mul.f64 fd1327, fd1408, 0d3FE0000000000000; +sub.f64 fd1328, fd1257, fd1327; +sub.f64 fd1329, fd1293, fd1298; +mul.f64 fd1330, fd1329, 0dBFEBB67AE8584CAA; +add.f64 fd1331, fd1303, fd1308; +mul.f64 fd1333, fd1331, 0d3FE0000000000000; +sub.f64 fd1334, fd1252, fd1333; +add.f64 fd1407, fd1305, fd1310; +sub.f64 fd1335, fd1305, fd1310; +mul.f64 fd1336, fd1335, 0dBFEBB67AE8584CAA; +mul.f64 fd1337, fd1407, 0d3FE0000000000000; +sub.f64 fd1338, fd1258, fd1337; +sub.f64 fd1339, fd1303, fd1308; +mul.f64 fd1340, fd1339, 0dBFEBB67AE8584CAA; +add.f64 %1, fd1450, fd1440; +add.f64 %0, fd1048, fd1115; +add.f64 %3, fd1436, fd1423; +add.f64 %2, fd1146, fd1213; +add.f64 %5, fd1419, fd1409; +add.f64 %4, fd1244, fd1311; +add.f64 %7, fd1061, fd1439; +add.f64 %6, fd1055, fd1125; +add.f64 %9, fd1159, fd1422; +add.f64 %8, fd1153, fd1223; +add.f64 %11, fd1257, fd1408; +add.f64 %10, fd1251, fd1321; +add.f64 %13, fd1062, fd1438; +add.f64 %12, fd1056, fd1135; +add.f64 %15, fd1160, fd1421; +add.f64 %14, fd1154, fd1233; +add.f64 %17, 
fd1258, fd1407; +add.f64 %16, fd1252, fd1331; +sub.f64 %19, fd1122, fd1124; +add.f64 %18, fd1120, fd1118; +add.f64 %20, fd1218, fd1216; +sub.f64 %21, fd1220, fd1222; +add.f64 %22, fd1316, fd1314; +sub.f64 %23, fd1318, fd1320; +add.f64 %24, fd1130, fd1128; +sub.f64 %25, fd1132, fd1134; +add.f64 %26, fd1228, fd1226; +sub.f64 %27, fd1230, fd1232; +sub.f64 %29, fd1328, fd1330; +add.f64 %28, fd1326, fd1324; +sub.f64 %31, fd1142, fd1144; +add.f64 %30, fd1140, fd1138; +add.f64 %32, fd1238, fd1236; +sub.f64 %33, fd1240, fd1242; +add.f64 %34, fd1336, fd1334; +sub.f64 %35, fd1338, fd1340; +add.f64 %37, fd1124, fd1122; +sub.f64 %36, fd1118, fd1120; +add.f64 %39, fd1222, fd1220; +sub.f64 %38, fd1216, fd1218; +add.f64 %41, fd1320, fd1318; +sub.f64 %40, fd1314, fd1316; +add.f64 %43, fd1134, fd1132; +sub.f64 %42, fd1128, fd1130; +add.f64 %45, fd1232, fd1230; +sub.f64 %44, fd1226, fd1228; +add.f64 %47, fd1330, fd1328; +sub.f64 %46, fd1324, fd1326; +add.f64 %49, fd1144, fd1142; +sub.f64 %48, fd1138, fd1140; +add.f64 %51, fd1242, fd1240; +sub.f64 %50, fd1236, fd1238; +add.f64 %53, fd1340, fd1338; +sub.f64 %52, fd1334, fd1336; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), 
"=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_243), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<686, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1553>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1944, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1544, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1543, %57, fd1544; +mul.f64 fd119, fd1544, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, 
%74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1542, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1541, %63, fd1542; +mul.f64 fd135, fd1542, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1540, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1539, %69, fd1540; +mul.f64 fd151, fd1540, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1538, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1538, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd1536, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1537, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1536, fd1537; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1534, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1535, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1534, fd1535; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1532, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1533, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1532, fd1533; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, 
fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1531, fd1541, fd1539; +sub.f64 fd183, fd1541, fd1539; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1530, fd1543, fd1531; +mul.f64 fd187, fd1531, 0d3FE0000000000000; +sub.f64 fd188, fd1543, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1529, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1528, fd123, fd1529; +mul.f64 fd203, fd1529, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1527, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1526, fd124, fd1527; +mul.f64 fd219, fd1527, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1523, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1521, %112, fd1523; +mul.f64 fd235, fd1523, 0d3FE0000000000000; +sub.f64 fd236, 
%112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1518, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1516, %115, fd1518; +mul.f64 fd251, fd1518, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1513, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1511, %118, fd1513; +mul.f64 fd267, fd1513, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1510, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1510, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1509, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1509, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1507, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1508, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1507, fd1508; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1505, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1506, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1505, fd1506; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; 
+fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1504, fd1516, fd1511; +sub.f64 fd299, fd1516, fd1511; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1503, fd1521, fd1504; +mul.f64 fd303, fd1504, 0d3FE0000000000000; +sub.f64 fd304, fd1521, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1502, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1501, fd239, fd1502; +mul.f64 fd319, fd1502, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1500, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1499, fd240, fd1500; +mul.f64 fd335, fd1500, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1496, %120, %119; +sub.f64 fd347, %120, %119; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1494, %121, fd1496; +mul.f64 fd351, 
fd1496, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1491, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1489, %124, fd1491; +mul.f64 fd367, fd1491, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1487, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1485, %126, fd1487; +mul.f64 fd383, fd1487, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1484, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1484, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1483, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1483, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1481, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1482, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1481, fd1482; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1479, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1480, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1479, fd1480; +mul.f64 
fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1478, fd1489, fd1485; +sub.f64 fd415, fd1489, fd1485; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1477, fd1494, fd1478; +mul.f64 fd419, fd1478, 0d3FE0000000000000; +sub.f64 fd420, fd1494, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1476, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1475, fd355, fd1476; +mul.f64 fd435, fd1476, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1474, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1473, fd356, fd1474; +mul.f64 fd451, fd1474, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1501, 0d3FCD84D223638000; +mul.f64 fd1472, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1472, fd458; +mul.f64 fd460, fd1501, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1470, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1471, fd1475, 
0d3FDCB920325BAFA6; +sub.f64 fd464, fd1470, fd1471; +mul.f64 fd465, fd1475, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1468, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1469, fd1499, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1468, fd1469; +mul.f64 fd470, fd1499, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1466, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1467, fd1473, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1466, fd1467; +mul.f64 fd475, fd1473, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1464, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1465, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1464, fd1465; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1463, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1463, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1462, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1462, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1461, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1461, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1460, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1460, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1459, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1459, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1458, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1458, fd508; +mul.f64 
fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1456, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1457, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1456, fd1457; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1454, fd318, 0dBFADC528B5343A86; +mul.f64 fd1455, fd324, 0d3FEFF223F3635CE3; +sub.f64 fd519, fd1454, fd1455; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1452, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1453, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1452, fd1453; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1451, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1451, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1450, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1450, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1449, fd1503, fd1477; +sub.f64 fd543, fd1503, fd1477; +mul.f64 fd544, fd543, 0dBFEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1448, fd1530, fd1449; +mul.f64 fd547, fd1449, 0d3FE0000000000000; +sub.f64 fd548, fd1530, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0dBFEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1447, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0dBFEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, 
fd560; +add.f64 fd1446, fd1528, fd1447; +mul.f64 fd563, fd1447, 0d3FE0000000000000; +sub.f64 fd564, fd1528, fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 fd566, fd565, 0dBFEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, fd573; +add.f64 fd1445, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1444, fd1526, fd1445; +mul.f64 fd579, fd1445, 0d3FE0000000000000; +sub.f64 fd580, fd1526, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0dBFEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1443, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1442, fd191, fd1443; +mul.f64 fd595, fd1443, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0dBFEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1441, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0dBFEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1440, fd207, fd1441; +mul.f64 fd611, fd1441, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0dBFEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 
fd1439, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0dBFEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +add.f64 fd1438, fd223, fd1439; +mul.f64 fd627, fd1439, 0d3FE0000000000000; +sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0dBFEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1437, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0dBFEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1436, fd192, fd1437; +mul.f64 fd643, fd1437, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0dBFEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1435, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0dBFEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1434, fd208, fd1435; +mul.f64 fd659, fd1435, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0dBFEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1433, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0dBFEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1432, fd224, fd1433; +mul.f64 fd675, fd1433, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, 
fd676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd685, fd1446, fd682; +fma.rn.f64 fd686, fd681, fd554, fd685; +mul.f64 fd687, fd554, fd682; +mul.f64 fd688, fd681, fd1446; +sub.f64 fd689, fd688, fd687; +mul.f64 fd691, fd682, fd682; +mul.f64 fd1431, fd681, fd681; +sub.f64 fd692, fd1431, fd691; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd695, fd1444, fd694; +fma.rn.f64 fd696, fd692, fd570, fd695; +mul.f64 fd697, fd570, fd694; +mul.f64 fd698, fd692, fd1444; +sub.f64 fd699, fd698, fd697; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1430, fd681, fd692; +sub.f64 fd702, fd1430, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd705, fd1442, fd704; +fma.rn.f64 fd706, fd702, fd586, fd705; +mul.f64 fd707, fd586, fd704; +mul.f64 fd708, fd702, fd1442; +sub.f64 fd709, fd708, fd707; +mul.f64 fd1428, fd681, fd702; +mul.f64 fd1429, fd682, fd704; +sub.f64 fd712, fd1428, fd1429; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd715, fd1440, fd714; +fma.rn.f64 fd716, fd712, fd602, fd715; +mul.f64 fd717, fd602, fd714; +mul.f64 fd718, fd712, fd1440; +sub.f64 fd719, fd718, fd717; +mul.f64 fd1426, fd681, fd712; +mul.f64 fd1427, fd682, fd714; +sub.f64 fd722, fd1426, fd1427; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd725, fd1438, fd724; +fma.rn.f64 fd726, fd722, fd618, fd725; +mul.f64 fd727, fd618, fd724; +mul.f64 fd728, fd722, fd1438; +sub.f64 fd729, fd728, fd727; +mul.f64 fd731, fd682, fd724; +mul.f64 fd1425, fd681, fd722; +sub.f64 fd732, fd1425, fd731; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd735, fd1436, fd734; +fma.rn.f64 fd736, fd732, fd634, fd735; +mul.f64 fd737, fd634, fd734; +mul.f64 
fd738, fd732, fd1436; +sub.f64 fd739, fd738, fd737; +mul.f64 fd741, fd682, fd734; +mul.f64 fd1424, fd681, fd732; +sub.f64 fd742, fd1424, fd741; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, fd743; +mul.f64 fd745, fd1434, fd744; +fma.rn.f64 fd746, fd742, fd650, fd745; +mul.f64 fd747, fd650, fd744; +mul.f64 fd748, fd742, fd1434; +sub.f64 fd749, fd748, fd747; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1423, fd681, fd742; +sub.f64 fd752, fd1423, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd755, fd1432, fd754; +fma.rn.f64 fd756, fd752, fd666, fd755; +mul.f64 fd757, fd666, fd754; +mul.f64 fd758, fd752, fd1432; +sub.f64 fd759, fd758, fd757; +mul.f64 fd1421, fd681, fd752; +mul.f64 fd1422, fd682, fd754; +sub.f64 fd762, fd1421, fd1422; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd765, fd551, fd764; +fma.rn.f64 fd766, fd762, fd545, fd765; +mul.f64 fd767, fd545, fd764; +mul.f64 fd768, fd762, fd551; +sub.f64 fd769, fd768, fd767; +mul.f64 fd1419, fd681, fd762; +mul.f64 fd1420, fd682, fd764; +sub.f64 fd772, fd1419, fd1420; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd775, fd567, fd774; +fma.rn.f64 fd776, fd772, fd561, fd775; +mul.f64 fd777, fd561, fd774; +mul.f64 fd778, fd772, fd567; +sub.f64 fd779, fd778, fd777; +mul.f64 fd781, fd682, fd774; +mul.f64 fd1418, fd681, fd772; +sub.f64 fd782, fd1418, fd781; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd785, fd583, fd784; +fma.rn.f64 fd786, fd782, fd577, fd785; +mul.f64 fd787, fd577, fd784; +mul.f64 fd788, fd782, fd583; +sub.f64 fd789, fd788, fd787; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1417, fd681, fd782; +sub.f64 fd792, fd1417, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd795, fd599, fd794; +fma.rn.f64 fd796, fd792, fd593, fd795; +mul.f64 fd797, fd593, fd794; +mul.f64 fd798, fd792, fd599; +sub.f64 fd799, fd798, fd797; 
+mul.f64 fd801, fd682, fd794; +mul.f64 fd1416, fd681, fd792; +sub.f64 fd802, fd1416, fd801; +mul.f64 fd803, fd681, fd794; +fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd805, fd615, fd804; +fma.rn.f64 fd806, fd802, fd609, fd805; +mul.f64 fd807, fd609, fd804; +mul.f64 fd808, fd802, fd615; +sub.f64 fd809, fd808, fd807; +ld.global.v2.f64 {fd810, fd811}, [rd6+144]; +mul.f64 fd814, fd631, fd811; +fma.rn.f64 fd815, fd810, fd625, fd814; +mul.f64 fd816, fd625, fd811; +mul.f64 fd817, fd810, fd631; +sub.f64 fd818, fd817, fd816; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1415, fd681, fd810; +sub.f64 fd821, fd1415, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd824, fd647, fd823; +fma.rn.f64 fd825, fd821, fd641, fd824; +mul.f64 fd826, fd641, fd823; +mul.f64 fd827, fd821, fd647; +sub.f64 fd828, fd827, fd826; +mul.f64 fd1413, fd681, fd821; +mul.f64 fd1414, fd682, fd823; +sub.f64 fd831, fd1413, fd1414; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd834, fd663, fd833; +fma.rn.f64 fd835, fd831, fd657, fd834; +mul.f64 fd836, fd657, fd833; +mul.f64 fd837, fd831, fd663; +sub.f64 fd838, fd837, fd836; +mul.f64 fd840, fd682, fd833; +mul.f64 fd1412, fd681, fd831; +sub.f64 fd841, fd1412, fd840; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd844, fd679, fd843; +fma.rn.f64 fd845, fd841, fd673, fd844; +mul.f64 fd846, fd673, fd843; +mul.f64 fd847, fd841, fd679; +sub.f64 fd848, fd847, fd846; +mul.f64 fd850, fd682, fd843; +mul.f64 fd1411, fd681, fd841; +sub.f64 fd851, fd1411, fd850; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd854, fd552, fd853; +fma.rn.f64 fd855, fd851, fd546, fd854; +mul.f64 fd856, fd546, fd853; +mul.f64 fd857, fd851, fd552; +sub.f64 fd858, fd857, fd856; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1410, fd681, fd851; +sub.f64 fd861, fd1410, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd864, 
fd568, fd863; +fma.rn.f64 fd865, fd861, fd562, fd864; +mul.f64 fd866, fd562, fd863; +mul.f64 fd867, fd861, fd568; +sub.f64 fd868, fd867, fd866; +mul.f64 fd1408, fd681, fd861; +mul.f64 fd1409, fd682, fd863; +sub.f64 fd871, fd1408, fd1409; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, fd861, fd872; +mul.f64 fd874, fd584, fd873; +fma.rn.f64 fd875, fd871, fd578, fd874; +mul.f64 fd876, fd578, fd873; +mul.f64 fd877, fd871, fd584; +sub.f64 fd878, fd877, fd876; +mul.f64 fd1406, fd681, fd871; +mul.f64 fd1407, fd682, fd873; +sub.f64 fd881, fd1406, fd1407; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd884, fd600, fd883; +fma.rn.f64 fd885, fd881, fd594, fd884; +mul.f64 fd886, fd594, fd883; +mul.f64 fd887, fd881, fd600; +sub.f64 fd888, fd887, fd886; +mul.f64 fd890, fd682, fd883; +mul.f64 fd1405, fd681, fd881; +sub.f64 fd891, fd1405, fd890; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd894, fd616, fd893; +fma.rn.f64 fd895, fd891, fd610, fd894; +mul.f64 fd896, fd610, fd893; +mul.f64 fd897, fd891, fd616; +sub.f64 fd898, fd897, fd896; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1404, fd681, fd891; +sub.f64 fd901, fd1404, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd904, fd632, fd903; +fma.rn.f64 fd905, fd901, fd626, fd904; +mul.f64 fd906, fd626, fd903; +mul.f64 fd907, fd901, fd632; +sub.f64 fd908, fd907, fd906; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1403, fd681, fd901; +sub.f64 fd911, fd1403, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd914, fd648, fd913; +fma.rn.f64 fd915, fd911, fd642, fd914; +mul.f64 fd916, fd642, fd913; +mul.f64 fd917, fd911, fd648; +sub.f64 fd918, fd917, fd916; +mul.f64 fd1401, fd681, fd911; +mul.f64 fd1402, fd682, fd913; +sub.f64 fd921, fd1401, fd1402; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd924, fd664, fd923; +fma.rn.f64 fd925, fd921, fd658, fd924; 
+mul.f64 fd926, fd658, fd923; +mul.f64 fd927, fd921, fd664; +sub.f64 fd928, fd927, fd926; +mul.f64 fd1399, fd681, fd921; +mul.f64 fd1400, fd682, fd923; +sub.f64 fd931, fd1399, fd1400; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd934, fd680, fd933; +fma.rn.f64 fd935, fd931, fd674, fd934; +mul.f64 fd936, fd674, fd933; +mul.f64 fd937, fd931, fd680; +sub.f64 fd938, fd937, fd936; +mad.lo.s32 r8, r5, 1944, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +st.shared.f64 [r9], fd538; +st.shared.f64 [r9+8], fd686; +st.shared.f64 [r9+16], fd696; +st.shared.f64 [r9+24], fd706; +st.shared.f64 [r9+32], fd716; +st.shared.f64 [r9+40], fd726; +st.shared.f64 [r9+48], fd736; +st.shared.f64 [r9+56], fd746; +st.shared.f64 [r9+64], fd756; +st.shared.f64 [r9+72], fd766; +st.shared.f64 [r9+80], fd776; +st.shared.f64 [r9+88], fd786; +st.shared.f64 [r9+96], fd796; +st.shared.f64 [r9+104], fd806; +st.shared.f64 [r9+112], fd815; +st.shared.f64 [r9+120], fd825; +st.shared.f64 [r9+128], fd835; +st.shared.f64 [r9+136], fd845; +st.shared.f64 [r9+144], fd855; +st.shared.f64 [r9+152], fd865; +st.shared.f64 [r9+160], fd875; +st.shared.f64 [r9+168], fd885; +st.shared.f64 [r9+176], fd895; +st.shared.f64 [r9+184], fd905; +st.shared.f64 [r9+192], fd915; +st.shared.f64 [r9+200], fd925; +st.shared.f64 [r9+208], fd935; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.f64 fd939, [r10]; +ld.shared.f64 fd940, [r10+72]; +ld.shared.f64 fd941, [r10+144]; +ld.shared.f64 fd942, [r10+216]; +ld.shared.f64 fd943, [r10+288]; +ld.shared.f64 fd944, [r10+360]; +ld.shared.f64 fd945, [r10+432]; +ld.shared.f64 fd946, [r10+504]; +ld.shared.f64 fd947, [r10+576]; +ld.shared.f64 fd948, [r10+648]; +ld.shared.f64 fd949, [r10+720]; +ld.shared.f64 fd950, [r10+792]; +ld.shared.f64 fd951, [r10+864]; +ld.shared.f64 fd952, [r10+936]; +ld.shared.f64 fd953, [r10+1008]; +ld.shared.f64 fd954, [r10+1080]; +ld.shared.f64 fd955, [r10+1152]; +ld.shared.f64 fd956, [r10+1224]; +ld.shared.f64 fd957, 
[r10+1296]; +ld.shared.f64 fd958, [r10+1368]; +ld.shared.f64 fd959, [r10+1440]; +ld.shared.f64 fd960, [r10+1512]; +ld.shared.f64 fd961, [r10+1584]; +ld.shared.f64 fd962, [r10+1656]; +ld.shared.f64 fd963, [r10+1728]; +ld.shared.f64 fd964, [r10+1800]; +ld.shared.f64 fd965, [r10+1872]; +barrier.sync 0; +st.shared.f64 [r9], fd1448; +st.shared.f64 [r9+8], fd689; +st.shared.f64 [r9+16], fd699; +st.shared.f64 [r9+24], fd709; +st.shared.f64 [r9+32], fd719; +st.shared.f64 [r9+40], fd729; +st.shared.f64 [r9+48], fd739; +st.shared.f64 [r9+56], fd749; +st.shared.f64 [r9+64], fd759; +st.shared.f64 [r9+72], fd769; +st.shared.f64 [r9+80], fd779; +st.shared.f64 [r9+88], fd789; +st.shared.f64 [r9+96], fd799; +st.shared.f64 [r9+104], fd809; +st.shared.f64 [r9+112], fd818; +st.shared.f64 [r9+120], fd828; +st.shared.f64 [r9+128], fd838; +st.shared.f64 [r9+136], fd848; +st.shared.f64 [r9+144], fd858; +st.shared.f64 [r9+152], fd868; +st.shared.f64 [r9+160], fd878; +st.shared.f64 [r9+168], fd888; +st.shared.f64 [r9+176], fd898; +st.shared.f64 [r9+184], fd908; +st.shared.f64 [r9+192], fd918; +st.shared.f64 [r9+200], fd928; +st.shared.f64 [r9+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r10]; +ld.shared.f64 fd967, [r10+72]; +ld.shared.f64 fd968, [r10+144]; +ld.shared.f64 fd969, [r10+216]; +ld.shared.f64 fd970, [r10+288]; +ld.shared.f64 fd971, [r10+360]; +ld.shared.f64 fd972, [r10+432]; +ld.shared.f64 fd973, [r10+504]; +ld.shared.f64 fd974, [r10+576]; +ld.shared.f64 fd975, [r10+648]; +ld.shared.f64 fd976, [r10+720]; +ld.shared.f64 fd977, [r10+792]; +ld.shared.f64 fd978, [r10+864]; +ld.shared.f64 fd979, [r10+936]; +ld.shared.f64 fd980, [r10+1008]; +ld.shared.f64 fd981, [r10+1080]; +ld.shared.f64 fd982, [r10+1152]; +ld.shared.f64 fd983, [r10+1224]; +ld.shared.f64 fd984, [r10+1296]; +ld.shared.f64 fd985, [r10+1368]; +ld.shared.f64 fd986, [r10+1440]; +ld.shared.f64 fd987, [r10+1512]; +ld.shared.f64 fd988, [r10+1584]; +ld.shared.f64 fd989, [r10+1656]; +ld.shared.f64 fd990, [r10+1728]; 
+ld.shared.f64 fd991, [r10+1800]; +ld.shared.f64 fd992, [r10+1872]; +add.f64 fd993, fd948, fd957; +add.f64 fd994, fd939, fd993; +mul.f64 fd997, fd993, 0d3FE0000000000000; +sub.f64 fd998, fd939, fd997; +add.f64 fd1398, fd975, fd984; +sub.f64 fd999, fd975, fd984; +mul.f64 fd1000, fd999, 0dBFEBB67AE8584CAA; +add.f64 fd1001, fd1000, fd998; +sub.f64 fd1002, fd998, fd1000; +add.f64 fd1397, fd966, fd1398; +mul.f64 fd1003, fd1398, 0d3FE0000000000000; +sub.f64 fd1004, fd966, fd1003; +sub.f64 fd1005, fd948, fd957; +mul.f64 fd1006, fd1005, 0dBFEBB67AE8584CAA; +sub.f64 fd1007, fd1004, fd1006; +add.f64 fd1008, fd1006, fd1004; +add.f64 fd1009, fd951, fd960; +add.f64 fd1010, fd942, fd1009; +mul.f64 fd1013, fd1009, 0d3FE0000000000000; +sub.f64 fd1014, fd942, fd1013; +add.f64 fd1396, fd978, fd987; +sub.f64 fd1015, fd978, fd987; +mul.f64 fd1016, fd1015, 0dBFEBB67AE8584CAA; +add.f64 fd1017, fd1016, fd1014; +sub.f64 fd1018, fd1014, fd1016; +add.f64 fd1395, fd969, fd1396; +mul.f64 fd1019, fd1396, 0d3FE0000000000000; +sub.f64 fd1020, fd969, fd1019; +sub.f64 fd1021, fd951, fd960; +mul.f64 fd1022, fd1021, 0dBFEBB67AE8584CAA; +sub.f64 fd1023, fd1020, fd1022; +add.f64 fd1024, fd1022, fd1020; +add.f64 fd1025, fd954, fd963; +add.f64 fd1026, fd945, fd1025; +mul.f64 fd1029, fd1025, 0d3FE0000000000000; +sub.f64 fd1030, fd945, fd1029; +add.f64 fd1394, fd981, fd990; +sub.f64 fd1031, fd981, fd990; +mul.f64 fd1032, fd1031, 0dBFEBB67AE8584CAA; +add.f64 fd1033, fd1032, fd1030; +sub.f64 fd1034, fd1030, fd1032; +add.f64 fd1393, fd972, fd1394; +mul.f64 fd1035, fd1394, 0d3FE0000000000000; +sub.f64 fd1036, fd972, fd1035; +sub.f64 fd1037, fd954, fd963; +mul.f64 fd1038, fd1037, 0dBFEBB67AE8584CAA; +sub.f64 fd1039, fd1036, fd1038; +add.f64 fd1040, fd1038, fd1036; +mul.f64 fd1042, fd1023, 0d3FE491B7523C161D; +mul.f64 fd1392, fd1017, 0d3FE8836FA2CF5039; +sub.f64 fd1043, fd1392, fd1042; +mul.f64 fd1044, fd1023, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1045, fd1017, 0d3FE491B7523C161D, fd1044; +mul.f64 fd1390, fd1033, 
0d3FC63A1A7E0B738A; +mul.f64 fd1391, fd1039, 0d3FEF838B8C811C17; +sub.f64 fd1048, fd1390, fd1391; +mul.f64 fd1049, fd1039, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1050, fd1033, 0d3FEF838B8C811C17, fd1049; +mul.f64 fd1388, fd1018, 0d3FC63A1A7E0B738A; +mul.f64 fd1389, fd1024, 0d3FEF838B8C811C17; +sub.f64 fd1053, fd1388, fd1389; +mul.f64 fd1054, fd1024, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1055, fd1018, 0d3FEF838B8C811C17, fd1054; +mul.f64 fd1386, fd1034, 0dBFEE11F642522D1C; +mul.f64 fd1387, fd1040, 0d3FD5E3A8748A0BF5; +sub.f64 fd1058, fd1386, fd1387; +mul.f64 fd1059, fd1040, 0dBFEE11F642522D1C; +fma.rn.f64 fd1060, fd1034, 0d3FD5E3A8748A0BF5, fd1059; +add.f64 fd1061, fd1010, fd1026; +mul.f64 fd1063, fd1061, 0d3FE0000000000000; +sub.f64 fd1064, fd994, fd1063; +add.f64 fd1385, fd1395, fd1393; +sub.f64 fd1065, fd1395, fd1393; +mul.f64 fd1066, fd1065, 0dBFEBB67AE8584CAA; +mul.f64 fd1067, fd1385, 0d3FE0000000000000; +sub.f64 fd1068, fd1397, fd1067; +sub.f64 fd1069, fd1010, fd1026; +mul.f64 fd1070, fd1069, 0dBFEBB67AE8584CAA; +add.f64 fd1071, fd1043, fd1048; +mul.f64 fd1073, fd1071, 0d3FE0000000000000; +sub.f64 fd1074, fd1001, fd1073; +add.f64 fd1384, fd1045, fd1050; +sub.f64 fd1075, fd1045, fd1050; +mul.f64 fd1076, fd1075, 0dBFEBB67AE8584CAA; +mul.f64 fd1077, fd1384, 0d3FE0000000000000; +sub.f64 fd1078, fd1007, fd1077; +sub.f64 fd1079, fd1043, fd1048; +mul.f64 fd1080, fd1079, 0dBFEBB67AE8584CAA; +add.f64 fd1081, fd1053, fd1058; +mul.f64 fd1083, fd1081, 0d3FE0000000000000; +sub.f64 fd1084, fd1002, fd1083; +add.f64 fd1383, fd1055, fd1060; +sub.f64 fd1085, fd1055, fd1060; +mul.f64 fd1086, fd1085, 0dBFEBB67AE8584CAA; +mul.f64 fd1087, fd1383, 0d3FE0000000000000; +sub.f64 fd1088, fd1008, fd1087; +sub.f64 fd1089, fd1053, fd1058; +mul.f64 fd1090, fd1089, 0dBFEBB67AE8584CAA; +add.f64 fd1091, fd949, fd958; +add.f64 fd1092, fd940, fd1091; +mul.f64 fd1095, fd1091, 0d3FE0000000000000; +sub.f64 fd1096, fd940, fd1095; +add.f64 fd1382, fd976, fd985; +sub.f64 fd1097, fd976, fd985; +mul.f64 fd1098, 
fd1097, 0dBFEBB67AE8584CAA; +add.f64 fd1099, fd1098, fd1096; +sub.f64 fd1100, fd1096, fd1098; +add.f64 fd1381, fd967, fd1382; +mul.f64 fd1101, fd1382, 0d3FE0000000000000; +sub.f64 fd1102, fd967, fd1101; +sub.f64 fd1103, fd949, fd958; +mul.f64 fd1104, fd1103, 0dBFEBB67AE8584CAA; +sub.f64 fd1105, fd1102, fd1104; +add.f64 fd1106, fd1104, fd1102; +add.f64 fd1107, fd952, fd961; +add.f64 fd1108, fd943, fd1107; +mul.f64 fd1111, fd1107, 0d3FE0000000000000; +sub.f64 fd1112, fd943, fd1111; +add.f64 fd1380, fd979, fd988; +sub.f64 fd1113, fd979, fd988; +mul.f64 fd1114, fd1113, 0dBFEBB67AE8584CAA; +add.f64 fd1115, fd1114, fd1112; +sub.f64 fd1116, fd1112, fd1114; +add.f64 fd1379, fd970, fd1380; +mul.f64 fd1117, fd1380, 0d3FE0000000000000; +sub.f64 fd1118, fd970, fd1117; +sub.f64 fd1119, fd952, fd961; +mul.f64 fd1120, fd1119, 0dBFEBB67AE8584CAA; +sub.f64 fd1121, fd1118, fd1120; +add.f64 fd1122, fd1120, fd1118; +add.f64 fd1123, fd955, fd964; +add.f64 fd1124, fd946, fd1123; +mul.f64 fd1127, fd1123, 0d3FE0000000000000; +sub.f64 fd1128, fd946, fd1127; +add.f64 fd1378, fd982, fd991; +sub.f64 fd1129, fd982, fd991; +mul.f64 fd1130, fd1129, 0dBFEBB67AE8584CAA; +add.f64 fd1131, fd1130, fd1128; +sub.f64 fd1132, fd1128, fd1130; +add.f64 fd1377, fd973, fd1378; +mul.f64 fd1133, fd1378, 0d3FE0000000000000; +sub.f64 fd1134, fd973, fd1133; +sub.f64 fd1135, fd955, fd964; +mul.f64 fd1136, fd1135, 0dBFEBB67AE8584CAA; +sub.f64 fd1137, fd1134, fd1136; +add.f64 fd1138, fd1136, fd1134; +mul.f64 fd1375, fd1115, 0d3FE8836FA2CF5039; +mul.f64 fd1376, fd1121, 0d3FE491B7523C161D; +sub.f64 fd1141, fd1375, fd1376; +mul.f64 fd1142, fd1121, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1143, fd1115, 0d3FE491B7523C161D, fd1142; +mul.f64 fd1373, fd1131, 0d3FC63A1A7E0B738A; +mul.f64 fd1374, fd1137, 0d3FEF838B8C811C17; +sub.f64 fd1146, fd1373, fd1374; +mul.f64 fd1147, fd1137, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1148, fd1131, 0d3FEF838B8C811C17, fd1147; +mul.f64 fd1150, fd1122, 0d3FEF838B8C811C17; +mul.f64 fd1372, fd1116, 
0d3FC63A1A7E0B738A; +sub.f64 fd1151, fd1372, fd1150; +mul.f64 fd1152, fd1122, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1153, fd1116, 0d3FEF838B8C811C17, fd1152; +mul.f64 fd1155, fd1138, 0d3FD5E3A8748A0BF5; +mul.f64 fd1371, fd1132, 0dBFEE11F642522D1C; +sub.f64 fd1156, fd1371, fd1155; +mul.f64 fd1157, fd1138, 0dBFEE11F642522D1C; +fma.rn.f64 fd1158, fd1132, 0d3FD5E3A8748A0BF5, fd1157; +add.f64 fd1159, fd1108, fd1124; +mul.f64 fd1161, fd1159, 0d3FE0000000000000; +sub.f64 fd1162, fd1092, fd1161; +add.f64 fd1370, fd1379, fd1377; +sub.f64 fd1163, fd1379, fd1377; +mul.f64 fd1164, fd1163, 0dBFEBB67AE8584CAA; +mul.f64 fd1165, fd1370, 0d3FE0000000000000; +sub.f64 fd1166, fd1381, fd1165; +sub.f64 fd1167, fd1108, fd1124; +mul.f64 fd1168, fd1167, 0dBFEBB67AE8584CAA; +add.f64 fd1169, fd1141, fd1146; +mul.f64 fd1171, fd1169, 0d3FE0000000000000; +sub.f64 fd1172, fd1099, fd1171; +add.f64 fd1369, fd1143, fd1148; +sub.f64 fd1173, fd1143, fd1148; +mul.f64 fd1174, fd1173, 0dBFEBB67AE8584CAA; +mul.f64 fd1175, fd1369, 0d3FE0000000000000; +sub.f64 fd1176, fd1105, fd1175; +sub.f64 fd1177, fd1141, fd1146; +mul.f64 fd1178, fd1177, 0dBFEBB67AE8584CAA; +add.f64 fd1179, fd1151, fd1156; +mul.f64 fd1181, fd1179, 0d3FE0000000000000; +sub.f64 fd1182, fd1100, fd1181; +add.f64 fd1368, fd1153, fd1158; +sub.f64 fd1183, fd1153, fd1158; +mul.f64 fd1184, fd1183, 0dBFEBB67AE8584CAA; +mul.f64 fd1185, fd1368, 0d3FE0000000000000; +sub.f64 fd1186, fd1106, fd1185; +sub.f64 fd1187, fd1151, fd1156; +mul.f64 fd1188, fd1187, 0dBFEBB67AE8584CAA; +add.f64 fd1189, fd950, fd959; +add.f64 fd1190, fd941, fd1189; +mul.f64 fd1193, fd1189, 0d3FE0000000000000; +sub.f64 fd1194, fd941, fd1193; +add.f64 fd1367, fd977, fd986; +sub.f64 fd1195, fd977, fd986; +mul.f64 fd1196, fd1195, 0dBFEBB67AE8584CAA; +add.f64 fd1197, fd1196, fd1194; +sub.f64 fd1198, fd1194, fd1196; +add.f64 fd1366, fd968, fd1367; +mul.f64 fd1199, fd1367, 0d3FE0000000000000; +sub.f64 fd1200, fd968, fd1199; +sub.f64 fd1201, fd950, fd959; +mul.f64 fd1202, fd1201, 
0dBFEBB67AE8584CAA; +sub.f64 fd1203, fd1200, fd1202; +add.f64 fd1204, fd1202, fd1200; +add.f64 fd1205, fd953, fd962; +add.f64 fd1206, fd944, fd1205; +mul.f64 fd1209, fd1205, 0d3FE0000000000000; +sub.f64 fd1210, fd944, fd1209; +add.f64 fd1365, fd980, fd989; +sub.f64 fd1211, fd980, fd989; +mul.f64 fd1212, fd1211, 0dBFEBB67AE8584CAA; +add.f64 fd1213, fd1212, fd1210; +sub.f64 fd1214, fd1210, fd1212; +add.f64 fd1364, fd971, fd1365; +mul.f64 fd1215, fd1365, 0d3FE0000000000000; +sub.f64 fd1216, fd971, fd1215; +sub.f64 fd1217, fd953, fd962; +mul.f64 fd1218, fd1217, 0dBFEBB67AE8584CAA; +sub.f64 fd1219, fd1216, fd1218; +add.f64 fd1220, fd1218, fd1216; +add.f64 fd1221, fd956, fd965; +add.f64 fd1222, fd947, fd1221; +mul.f64 fd1225, fd1221, 0d3FE0000000000000; +sub.f64 fd1226, fd947, fd1225; +add.f64 fd1363, fd983, fd992; +sub.f64 fd1227, fd983, fd992; +mul.f64 fd1228, fd1227, 0dBFEBB67AE8584CAA; +add.f64 fd1229, fd1228, fd1226; +sub.f64 fd1230, fd1226, fd1228; +add.f64 fd1362, fd974, fd1363; +mul.f64 fd1231, fd1363, 0d3FE0000000000000; +sub.f64 fd1232, fd974, fd1231; +sub.f64 fd1233, fd956, fd965; +mul.f64 fd1234, fd1233, 0dBFEBB67AE8584CAA; +sub.f64 fd1235, fd1232, fd1234; +add.f64 fd1236, fd1234, fd1232; +mul.f64 fd1238, fd1219, 0d3FE491B7523C161D; +mul.f64 fd1361, fd1213, 0d3FE8836FA2CF5039; +sub.f64 fd1239, fd1361, fd1238; +mul.f64 fd1240, fd1219, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1241, fd1213, 0d3FE491B7523C161D, fd1240; +mul.f64 fd1243, fd1235, 0d3FEF838B8C811C17; +mul.f64 fd1360, fd1229, 0d3FC63A1A7E0B738A; +sub.f64 fd1244, fd1360, fd1243; +mul.f64 fd1245, fd1235, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1246, fd1229, 0d3FEF838B8C811C17, fd1245; +mul.f64 fd1248, fd1220, 0d3FEF838B8C811C17; +mul.f64 fd1359, fd1214, 0d3FC63A1A7E0B738A; +sub.f64 fd1249, fd1359, fd1248; +mul.f64 fd1250, fd1220, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1251, fd1214, 0d3FEF838B8C811C17, fd1250; +mul.f64 fd1253, fd1236, 0d3FD5E3A8748A0BF5; +mul.f64 fd1358, fd1230, 0dBFEE11F642522D1C; +sub.f64 fd1254, 
fd1358, fd1253; +mul.f64 fd1255, fd1236, 0dBFEE11F642522D1C; +fma.rn.f64 fd1256, fd1230, 0d3FD5E3A8748A0BF5, fd1255; +add.f64 fd1257, fd1206, fd1222; +mul.f64 fd1259, fd1257, 0d3FE0000000000000; +sub.f64 fd1260, fd1190, fd1259; +add.f64 fd1357, fd1364, fd1362; +sub.f64 fd1261, fd1364, fd1362; +mul.f64 fd1262, fd1261, 0dBFEBB67AE8584CAA; +mul.f64 fd1263, fd1357, 0d3FE0000000000000; +sub.f64 fd1264, fd1366, fd1263; +sub.f64 fd1265, fd1206, fd1222; +mul.f64 fd1266, fd1265, 0dBFEBB67AE8584CAA; +add.f64 fd1267, fd1239, fd1244; +mul.f64 fd1269, fd1267, 0d3FE0000000000000; +sub.f64 fd1270, fd1197, fd1269; +add.f64 fd1356, fd1241, fd1246; +sub.f64 fd1271, fd1241, fd1246; +mul.f64 fd1272, fd1271, 0dBFEBB67AE8584CAA; +mul.f64 fd1273, fd1356, 0d3FE0000000000000; +sub.f64 fd1274, fd1203, fd1273; +sub.f64 fd1275, fd1239, fd1244; +mul.f64 fd1276, fd1275, 0dBFEBB67AE8584CAA; +add.f64 fd1277, fd1249, fd1254; +mul.f64 fd1279, fd1277, 0d3FE0000000000000; +sub.f64 fd1280, fd1198, fd1279; +add.f64 fd1355, fd1251, fd1256; +sub.f64 fd1281, fd1251, fd1256; +mul.f64 fd1282, fd1281, 0dBFEBB67AE8584CAA; +mul.f64 fd1283, fd1355, 0d3FE0000000000000; +sub.f64 fd1284, fd1204, fd1283; +sub.f64 fd1285, fd1249, fd1254; +mul.f64 fd1546, fd1179, 0d3FE0000000000000; +sub.f64 fd1545, fd1100, fd1546; +mul.f64 fd1286, fd1285, 0dBFEBB67AE8584CAA; +add.f64 %0, fd994, fd1061; +mul.f64 fd1548, fd1385, 0d3FE0000000000000; +sub.f64 fd1547, fd1397, fd1548; +add.f64 %1, fd1397, fd1385; +mul.f64 fd1550, fd1179, 0d3FE0000000000000; +sub.f64 fd1549, fd1100, fd1550; +mul.f64 fd1552, fd1169, 0d3FE0000000000000; +sub.f64 fd1551, fd1099, fd1552; +add.f64 %2, fd1092, fd1159; +add.f64 %3, fd1381, fd1370; +add.f64 %4, fd1190, fd1257; +add.f64 %5, fd1366, fd1357; +add.f64 %7, fd1007, fd1384; +add.f64 %6, fd1001, fd1071; +add.f64 %9, fd1105, fd1369; +add.f64 %8, fd1099, fd1169; +add.f64 %11, fd1203, fd1356; +add.f64 %10, fd1197, fd1267; +add.f64 %13, fd1008, fd1383; +add.f64 %12, fd1002, fd1081; +add.f64 %15, fd1106, 
fd1368; +add.f64 %14, fd1100, fd1179; +add.f64 %17, fd1204, fd1355; +add.f64 %16, fd1198, fd1277; +add.f64 %18, fd1066, fd1064; +sub.f64 %19, fd1547, fd1070; +add.f64 %20, fd1164, fd1162; +sub.f64 %21, fd1166, fd1168; +sub.f64 %23, fd1264, fd1266; +add.f64 %22, fd1262, fd1260; +add.f64 %24, fd1076, fd1074; +sub.f64 %25, fd1078, fd1080; +add.f64 %26, fd1174, fd1551; +sub.f64 %27, fd1176, fd1178; +add.f64 %28, fd1272, fd1270; +sub.f64 %29, fd1274, fd1276; +sub.f64 %31, fd1088, fd1090; +add.f64 %30, fd1086, fd1084; +sub.f64 %33, fd1186, fd1188; +add.f64 %32, fd1184, fd1549; +sub.f64 %35, fd1284, fd1286; +add.f64 %34, fd1282, fd1280; +sub.f64 %36, fd1064, fd1066; +add.f64 %37, fd1070, fd1547; +sub.f64 %38, fd1162, fd1164; +add.f64 %39, fd1168, fd1166; +sub.f64 %40, fd1260, fd1262; +add.f64 %41, fd1266, fd1264; +add.f64 %43, fd1080, fd1078; +sub.f64 %42, fd1074, fd1076; +add.f64 %45, fd1178, fd1176; +sub.f64 %44, fd1551, fd1174; +add.f64 %47, fd1276, fd1274; +sub.f64 %46, fd1270, fd1272; +add.f64 %49, fd1090, fd1088; +sub.f64 %48, fd1084, fd1086; +add.f64 %51, fd1188, fd1186; +sub.f64 %50, fd1549, fd1184; +add.f64 %53, fd1286, fd1284; +sub.f64 %52, fd1280, fd1282; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), 
"=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_243), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[20].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<688, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<545>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 3888, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, 
fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 
0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 3888, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; 
+fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+432]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r13, r11, 144, r12; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r13], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; +st.shared.v2.f64 [r13+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, fd165, fd164; +st.shared.v2.f64 [r13+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r13+48], {fd220, 
fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; +sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r13+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r13+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r13+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r13+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r13+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r14, r11, 7; +sub.s32 r15, r13, r14; +ld.shared.v2.f64 {fd231, fd232}, [r15]; +ld.shared.v2.f64 {fd235, fd236}, [r15+432]; +ld.shared.v2.f64 {fd239, fd240}, [r15+864]; +ld.shared.v2.f64 {fd243, fd244}, [r15+1296]; +ld.shared.v2.f64 {fd247, fd248}, [r15+1728]; +ld.shared.v2.f64 {fd251, fd252}, [r15+2160]; +ld.shared.v2.f64 {fd255, fd256}, [r15+2592]; +ld.shared.v2.f64 {fd259, fd260}, [r15+3024]; +ld.shared.v2.f64 {fd263, fd264}, [r15+3456]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0dBFEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, 
fd247, fd259; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0dBFEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0d3FE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0d3FE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0d3FD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0d3FD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; 
+add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0dBFEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0dBFEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd352, fd382; +mul.f64 fd386, fd350, fd382; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd368, fd392; +mul.f64 fd394, fd366, fd392; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd347, fd400; +mul.f64 fd402, fd341, fd400; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; 
+fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd363, fd408; +mul.f64 fd410, fd357, fd408; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+48]; +mul.f64 fd416, fd379, fd413; +mul.f64 fd417, fd373, fd413; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd348, fd423; +mul.f64 fd425, fd342, fd423; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd364, fd431; +mul.f64 fd433, fd358, fd431; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd380, fd439; +mul.f64 fd441, fd374, fd439; +mul.f64 fd442, fd437, fd380; +shl.b32 r19, r18, 4; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 1296, r20; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r21], {fd444, fd443}; +fma.rn.f64 fd445, fd381, fd350, fd385; +sub.f64 fd446, fd387, fd386; +st.shared.v2.f64 [r21+144], {fd445, fd446}; +fma.rn.f64 fd447, fd390, fd366, fd393; +sub.f64 fd448, fd395, fd394; +st.shared.v2.f64 [r21+288], {fd447, fd448}; +fma.rn.f64 fd449, fd398, fd341, fd401; +sub.f64 fd450, fd403, fd402; +st.shared.v2.f64 [r21+432], {fd449, fd450}; +fma.rn.f64 fd451, fd406, fd357, fd409; +sub.f64 fd452, fd411, fd410; +st.shared.v2.f64 [r21+576], {fd451, fd452}; +fma.rn.f64 fd453, fd412, fd373, fd416; +sub.f64 fd454, fd418, fd417; +st.shared.v2.f64 [r21+720], {fd453, fd454}; +fma.rn.f64 fd455, fd421, fd342, fd424; +sub.f64 fd456, fd426, fd425; +st.shared.v2.f64 [r21+864], {fd455, fd456}; +fma.rn.f64 fd457, fd429, fd358, fd432; +sub.f64 fd458, fd434, fd433; +st.shared.v2.f64 
[r21+1008], {fd457, fd458}; +fma.rn.f64 fd459, fd437, fd374, fd440; +sub.f64 fd460, fd442, fd441; +st.shared.v2.f64 [r21+1152], {fd459, fd460}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r15]; +ld.shared.v2.f64 {fd465, fd466}, [r15+432]; +ld.shared.v2.f64 {fd469, fd470}, [r15+864]; +ld.shared.v2.f64 {fd473, fd474}, [r15+1296]; +ld.shared.v2.f64 {fd477, fd478}, [r15+1728]; +ld.shared.v2.f64 {fd481, fd482}, [r15+2160]; +ld.shared.v2.f64 {fd485, fd486}, [r15+2592]; +ld.shared.v2.f64 {fd489, fd490}, [r15+3024]; +ld.shared.v2.f64 {fd493, fd494}, [r15+3456]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd474, fd486; +mul.f64 fd499, fd497, 0d3FE0000000000000; +sub.f64 fd500, fd461, fd499; +sub.f64 fd501, fd474, fd486; +mul.f64 fd502, fd501, 0dBFEBB67AE8584CAA; +mul.f64 fd503, fd498, 0d3FE0000000000000; +sub.f64 fd504, fd462, fd503; +sub.f64 fd505, fd473, fd485; +mul.f64 fd506, fd505, 0dBFEBB67AE8584CAA; +add.f64 fd507, fd477, fd489; +add.f64 fd508, fd478, fd490; +mul.f64 fd509, fd507, 0d3FE0000000000000; +sub.f64 fd510, fd465, fd509; +sub.f64 fd511, fd478, fd490; +mul.f64 fd512, fd511, 0dBFEBB67AE8584CAA; +mul.f64 fd513, fd508, 0d3FE0000000000000; +sub.f64 fd514, fd466, fd513; +sub.f64 fd515, fd477, fd489; +mul.f64 fd516, fd515, 0dBFEBB67AE8584CAA; +add.f64 fd517, fd481, fd493; +add.f64 fd518, fd482, fd494; +mul.f64 fd519, fd517, 0d3FE0000000000000; +sub.f64 fd520, fd469, fd519; +sub.f64 fd521, fd482, fd494; +mul.f64 fd522, fd521, 0dBFEBB67AE8584CAA; +mul.f64 fd523, fd518, 0d3FE0000000000000; +sub.f64 fd524, fd470, fd523; +sub.f64 fd525, fd481, fd493; +mul.f64 fd526, fd525, 0dBFEBB67AE8584CAA; +add.f64 %1, fd462, fd498; +add.f64 %0, fd461, fd497; +add.f64 %3, fd466, fd508; +add.f64 %2, fd465, fd507; +add.f64 %5, fd470, fd518; +add.f64 %4, fd469, fd517; +sub.f64 %7, fd504, fd506; +add.f64 %6, fd502, fd500; +sub.f64 %9, fd514, fd516; +add.f64 %8, fd512, fd510; +sub.f64 %11, fd524, fd526; +add.f64 %10, fd522, fd520; +add.f64 %13, fd506, fd504; +sub.f64 %12, fd500, 
fd502; +add.f64 %15, fd516, fd514; +sub.f64 %14, fd510, fd512; +add.f64 %17, fd526, fd524; +sub.f64 %16, fd520, fd522; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<689, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<509>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1944, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, 
%39; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, 
fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 fd159, fd122, fd154; +mul.f64 fd160, fd153, fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, 
fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+432]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 fd220, fd219, fd218; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r12, r9, 1944, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 72, r12; +st.shared.f64 [r13], fd106; +st.shared.f64 [r13+8], fd158; +st.shared.f64 [r13+16], fd168; +st.shared.f64 [r13+24], fd178; +st.shared.f64 [r13+32], fd188; +st.shared.f64 [r13+40], fd197; +st.shared.f64 [r13+48], fd207; +st.shared.f64 [r13+56], fd217; +st.shared.f64 [r13+64], fd227; +barrier.sync 0; +shl.b32 r14, r11, 6; +sub.s32 r15, 
r13, r14; +ld.shared.f64 fd231, [r15]; +ld.shared.f64 fd232, [r15+216]; +ld.shared.f64 fd233, [r15+432]; +ld.shared.f64 fd234, [r15+648]; +ld.shared.f64 fd235, [r15+864]; +ld.shared.f64 fd236, [r15+1080]; +ld.shared.f64 fd237, [r15+1296]; +ld.shared.f64 fd238, [r15+1512]; +ld.shared.f64 fd239, [r15+1728]; +barrier.sync 0; +st.shared.f64 [r13], fd108; +st.shared.f64 [r13+8], fd161; +st.shared.f64 [r13+16], fd171; +st.shared.f64 [r13+24], fd181; +st.shared.f64 [r13+32], fd191; +st.shared.f64 [r13+40], fd200; +st.shared.f64 [r13+48], fd210; +st.shared.f64 [r13+56], fd220; +st.shared.f64 [r13+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r15]; +ld.shared.f64 fd241, [r15+216]; +ld.shared.f64 fd242, [r15+432]; +ld.shared.f64 fd243, [r15+648]; +ld.shared.f64 fd244, [r15+864]; +ld.shared.f64 fd245, [r15+1080]; +ld.shared.f64 fd246, [r15+1296]; +ld.shared.f64 fd247, [r15+1512]; +ld.shared.f64 fd248, [r15+1728]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 
fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0dBFEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0d3FE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0d3FE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0d3FEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0d3FEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0d3FEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0d3FEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0d3FD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0d3FD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0dBFEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0dBFEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; 
+add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r11, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 9; +sub.s32 r18, r11, r17; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd336, fd366; +fma.rn.f64 fd370, fd365, fd334, fd369; +mul.f64 fd371, fd334, fd366; +mul.f64 fd372, fd365, fd336; +sub.f64 fd373, fd372, fd371; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd352, fd378; +fma.rn.f64 fd380, fd376, fd350, fd379; +mul.f64 fd381, fd350, fd378; +mul.f64 fd382, fd376, fd352; +sub.f64 fd383, fd382, fd381; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd331, fd388; +fma.rn.f64 fd390, fd386, fd325, fd389; +mul.f64 fd391, fd325, fd388; +mul.f64 fd392, fd386, fd331; +sub.f64 
fd393, fd392, fd391; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd347, fd398; +fma.rn.f64 fd400, fd396, fd341, fd399; +mul.f64 fd401, fd341, fd398; +mul.f64 fd402, fd396, fd347; +sub.f64 fd403, fd402, fd401; +ld.global.v2.f64 {fd404, fd405}, [rd11+48]; +mul.f64 fd408, fd363, fd405; +fma.rn.f64 fd409, fd404, fd357, fd408; +mul.f64 fd410, fd357, fd405; +mul.f64 fd411, fd404, fd363; +sub.f64 fd412, fd411, fd410; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd332, fd417; +fma.rn.f64 fd419, fd415, fd326, fd418; +mul.f64 fd420, fd326, fd417; +mul.f64 fd421, fd415, fd332; +sub.f64 fd422, fd421, fd420; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd348, fd427; +fma.rn.f64 fd429, fd425, fd342, fd428; +mul.f64 fd430, fd342, fd427; +mul.f64 fd431, fd425, fd348; +sub.f64 fd432, fd431, fd430; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd364, fd437; +fma.rn.f64 fd439, fd435, fd358, fd438; +mul.f64 fd440, fd358, fd437; +mul.f64 fd441, fd435, fd364; +sub.f64 fd442, fd441, fd440; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +barrier.sync 0; +mad.lo.s32 r21, r16, 648, r20; +st.shared.f64 [r21], fd318; +st.shared.f64 [r21+72], fd370; +st.shared.f64 [r21+144], fd380; +st.shared.f64 [r21+216], fd390; +st.shared.f64 [r21+288], fd400; +st.shared.f64 [r21+360], fd409; +st.shared.f64 [r21+432], fd419; +st.shared.f64 [r21+504], fd429; +st.shared.f64 [r21+576], fd439; +barrier.sync 0; +ld.shared.f64 fd443, [r15]; +ld.shared.f64 fd444, [r15+216]; +ld.shared.f64 fd445, 
[r15+432]; +ld.shared.f64 fd446, [r15+648]; +ld.shared.f64 fd447, [r15+864]; +ld.shared.f64 fd448, [r15+1080]; +ld.shared.f64 fd449, [r15+1296]; +ld.shared.f64 fd450, [r15+1512]; +ld.shared.f64 fd451, [r15+1728]; +barrier.sync 0; +st.shared.f64 [r21], fd320; +st.shared.f64 [r21+72], fd373; +st.shared.f64 [r21+144], fd383; +st.shared.f64 [r21+216], fd393; +st.shared.f64 [r21+288], fd403; +st.shared.f64 [r21+360], fd412; +st.shared.f64 [r21+432], fd422; +st.shared.f64 [r21+504], fd432; +st.shared.f64 [r21+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r15]; +ld.shared.f64 fd453, [r15+216]; +ld.shared.f64 fd454, [r15+432]; +ld.shared.f64 fd455, [r15+648]; +ld.shared.f64 fd456, [r15+864]; +ld.shared.f64 fd457, [r15+1080]; +ld.shared.f64 fd458, [r15+1296]; +ld.shared.f64 fd459, [r15+1512]; +ld.shared.f64 fd460, [r15+1728]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd455, fd458; +mul.f64 fd463, fd461, 0d3FE0000000000000; +sub.f64 fd464, fd443, fd463; +sub.f64 fd465, fd455, fd458; +mul.f64 fd466, fd465, 0dBFEBB67AE8584CAA; +mul.f64 fd467, fd462, 0d3FE0000000000000; +sub.f64 fd468, fd452, fd467; +sub.f64 fd469, fd446, fd449; +mul.f64 fd470, fd469, 0dBFEBB67AE8584CAA; +add.f64 fd471, fd447, fd450; +add.f64 fd472, fd456, fd459; +mul.f64 fd473, fd471, 0d3FE0000000000000; +sub.f64 fd474, fd444, fd473; +sub.f64 fd475, fd456, fd459; +mul.f64 fd476, fd475, 0dBFEBB67AE8584CAA; +mul.f64 fd477, fd472, 0d3FE0000000000000; +sub.f64 fd478, fd453, fd477; +sub.f64 fd479, fd447, fd450; +mul.f64 fd480, fd479, 0dBFEBB67AE8584CAA; +add.f64 fd481, fd448, fd451; +add.f64 fd482, fd457, fd460; +mul.f64 fd483, fd481, 0d3FE0000000000000; +sub.f64 fd484, fd445, fd483; +sub.f64 fd485, fd457, fd460; +mul.f64 fd486, fd485, 0dBFEBB67AE8584CAA; +mul.f64 fd487, fd482, 0d3FE0000000000000; +sub.f64 fd488, fd454, fd487; +sub.f64 fd489, fd448, fd451; +mul.f64 fd490, fd489, 0dBFEBB67AE8584CAA; +add.f64 %0, fd443, fd461; +add.f64 %1, fd452, fd462; +add.f64 %2, fd444, fd471; +add.f64 %3, fd453, fd472; 
+add.f64 %4, fd445, fd481; +add.f64 %5, fd454, fd482; +add.f64 %6, fd466, fd464; +sub.f64 %7, fd468, fd470; +add.f64 %8, fd476, fd474; +sub.f64 %9, fd478, fd480; +add.f64 %10, fd486, fd484; +sub.f64 %11, fd488, fd490; +sub.f64 %12, fd464, fd466; +add.f64 %13, fd470, fd468; +sub.f64 %14, fd474, fd476; +add.f64 %15, fd480, fd478; +sub.f64 %16, fd484, fd486; +add.f64 %17, fd490, fd488; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_243), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<691, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<189>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1944, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %13, %16; +add.f64 fd14, %11, fd13; +add.f64 fd15, %15, %17; +add.f64 fd16, %12, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %11, fd17; +sub.f64 fd19, %15, %17; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %12, fd23; +sub.f64 fd25, %13, %16; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 
r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1944, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+1296]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd34; +st.shared.f64 [r9+16], fd43; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+648]; +ld.shared.f64 fd49, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+648]; +ld.shared.f64 fd52, [r11+1296]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd67, fd70; +fma.rn.f64 fd74, fd69, fd61, fd73; +mul.f64 fd75, fd61, fd70; +mul.f64 fd76, fd69, fd67; +sub.f64 fd77, fd76, fd75; +ld.global.v2.f64 {fd78, fd79}, [rd11+432]; +mul.f64 fd82, fd68, fd79; 
+fma.rn.f64 fd83, fd78, fd62, fd82; +mul.f64 fd84, fd62, fd79; +mul.f64 fd85, fd78, fd68; +sub.f64 fd86, fd85, fd84; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd74; +st.shared.f64 [r17+48], fd83; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+648]; +ld.shared.f64 fd89, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+648]; +ld.shared.f64 fd92, [r11+1296]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0dBFEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0dBFEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd107, fd110; +fma.rn.f64 fd114, fd109, fd101, fd113; +mul.f64 fd115, fd101, fd110; +mul.f64 fd116, fd109, fd107; +sub.f64 fd117, fd116, fd115; +ld.global.v2.f64 {fd118, fd119}, [rd16+144]; +mul.f64 fd122, fd108, fd119; +fma.rn.f64 fd123, fd118, fd102, fd122; +mul.f64 fd124, fd102, fd119; +mul.f64 fd125, fd118, fd108; +sub.f64 fd126, fd125, fd124; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd114; +st.shared.f64 [r23+144], fd123; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+648]; +ld.shared.f64 fd129, 
[r11+1296]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; +st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+648]; +ld.shared.f64 fd132, [r11+1296]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0dBFEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, fd129; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd147, fd150; +fma.rn.f64 fd154, fd149, fd141, fd153; +mul.f64 fd155, fd141, fd150; +mul.f64 fd156, fd149, fd147; +sub.f64 fd157, fd156, fd155; +ld.global.v2.f64 {fd158, fd159}, [rd21+48]; +mul.f64 fd162, fd148, fd159; +fma.rn.f64 fd163, fd158, fd142, fd162; +mul.f64 fd164, fd142, fd159; +mul.f64 fd165, fd158, fd148; +sub.f64 fd166, fd165, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd154; +st.shared.f64 [r33+432], fd163; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+648]; +ld.shared.f64 fd169, [r11+1296]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+648]; +ld.shared.f64 fd172, 
[r11+1296]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd171, fd172; +mul.f64 fd175, fd173, 0d3FE0000000000000; +sub.f64 fd176, fd167, fd175; +sub.f64 fd177, fd171, fd172; +mul.f64 fd178, fd177, 0dBFEBB67AE8584CAA; +mul.f64 fd179, fd174, 0d3FE0000000000000; +sub.f64 fd180, fd170, fd179; +sub.f64 fd181, fd168, fd169; +mul.f64 fd182, fd181, 0dBFEBB67AE8584CAA; +add.f64 %0, fd167, fd173; +add.f64 %1, fd170, fd174; +add.f64 %2, fd178, fd176; +sub.f64 %3, fd180, fd182; +sub.f64 %4, fd176, fd178; +add.f64 %5, fd182, fd180; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<690, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<213>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 3888, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %13, %16; +add.f64 fd14, %15, %17; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %11, fd15; +sub.f64 fd17, %15, %17; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %12, fd21; +sub.f64 fd23, %13, %16; +mul.f64 fd24, fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 3888, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+1296]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, 
fd20, fd35; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %12, fd14; +add.f64 fd42, %11, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r9+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r9+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+1296]; +ld.shared.v2.f64 {fd55, fd56}, [r11+2592]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0dBFEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd71, fd74; +mul.f64 fd78, fd65, fd74; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+432]; +mul.f64 fd84, fd72, fd81; +mul.f64 fd85, fd66, fd81; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd73, fd65, fd77; +sub.f64 fd90, fd79, fd78; +st.shared.v2.f64 [r17+48], {fd89, fd90}; +fma.rn.f64 fd91, fd80, fd66, fd84; +sub.f64 fd92, fd86, fd85; +st.shared.v2.f64 [r17+96], {fd91, fd92}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+1296]; +ld.shared.v2.f64 {fd101, fd102}, [r11+2592]; 
+add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd117, fd120; +mul.f64 fd124, fd111, fd120; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+144]; +mul.f64 fd130, fd118, fd127; +mul.f64 fd131, fd112, fd127; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, fd119, fd111, fd123; +sub.f64 fd136, fd125, fd124; +st.shared.v2.f64 [r23+144], {fd135, fd136}; +fma.rn.f64 fd137, fd126, fd112, fd130; +sub.f64 fd138, fd132, fd131; +st.shared.v2.f64 [r23+288], {fd137, fd138}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+1296]; +ld.shared.v2.f64 {fd147, fd148}, [r11+2592]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0dBFEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 
rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd163, fd166; +mul.f64 fd170, fd157, fd166; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+48]; +mul.f64 fd176, fd164, fd173; +mul.f64 fd177, fd158, fd173; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd165, fd157, fd169; +sub.f64 fd182, fd171, fd170; +st.shared.v2.f64 [r33+432], {fd181, fd182}; +fma.rn.f64 fd183, fd172, fd158, fd176; +sub.f64 fd184, fd178, fd177; +st.shared.v2.f64 [r33+864], {fd183, fd184}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+1296]; +ld.shared.v2.f64 {fd193, fd194}, [r11+2592]; +add.f64 fd197, fd189, fd193; +add.f64 fd198, fd190, fd194; +mul.f64 fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 0dBFEBB67AE8584CAA; +mul.f64 fd203, fd198, 0d3FE0000000000000; +sub.f64 fd204, fd186, fd203; +sub.f64 fd205, fd189, fd193; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +add.f64 %1, fd186, fd198; +add.f64 %0, fd185, fd197; +sub.f64 %3, fd204, fd206; +add.f64 %2, fd202, fd200; +add.f64 %5, fd206, fd204; +sub.f64 %4, fd200, fd202; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..168b52d55cf2d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_fwd.hpp.inc @@ -0,0 +1,1827 @@ +#ifndef CUFFTDX_FFT_24_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_24_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<756, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<163>; +.reg .b32 r<1575>; +.reg .f64 fd<148>; +.reg .b64 rd<3>; +mov.f64 fd120, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd120; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd119, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd119; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %53, %51; +} +{ +add.f16x2 r4, %48, r1; +} +{ +add.f16x2 r7, %49, %52; +} +{ +add.f16x2 r10, %50, r7; +} +{ +add.f16x2 r13, %53, %51; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %48, r16; +} +{ +sub.f16x2 r22, %49, %52; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %53, %51; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %48, r34; +} +{ +sub.f16x2 r40, %49, %52; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %49, %52; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %50, r52; +} +{ +sub.f16x2 r58, %53, %51; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %49, %52; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %50, r70; +} +{ +sub.f16x2 r76, %53, %51; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd120; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, 
fd119; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %59, %57; +} +{ +add.f16x2 r88, %54, r85; +} +{ +add.f16x2 r91, %55, %58; +} +{ +add.f16x2 r94, %56, r91; +} +{ +add.f16x2 r97, %59, %57; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %54, r100; +} +{ +sub.f16x2 r106, %55, %58; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %59, %57; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %54, r118; +} +{ +sub.f16x2 r124, %55, %58; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %55, %58; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %56, r136; +} +{ +sub.f16x2 r142, %59, %57; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %55, %58; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %56, r154; +} +{ +sub.f16x2 r160, %59, %57; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd107, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs9, fd107; +} +{ +cvt.rn.f16.f64 rs10, fd119; +} +{ +cvt.rn.f16.f64 rs11, fd120; +} +{ +cvt.rn.f16.f64 rs12, fd119; +} +mov.b32 r183, {rs9, rs9}; +{ +mul.f16x2 r169, r112, r183; +} +mov.b32 r180, {rs10, rs10}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs11, rs11}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs12, rs12}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ +fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 r201, r4, r88; +} +{ +add.f16x2 r204, r10, r94; +} +{ +sub.f16x2 r207, r4, r88; +} +{ +sub.f16x2 r210, r10, r94; +} +{ +add.f16x2 r213, r28, r175; +} +{ +add.f16x2 r216, r64, r181; +} +{ +sub.f16x2 r219, r28, r175; +} +{ +sub.f16x2 r222, r64, r181; +} +{ +add.f16x2 r225, r46, r191; +} +{ +add.f16x2 r228, r82, r197; +} 
+{ +sub.f16x2 r231, r46, r191; +} +{ +sub.f16x2 r234, r82, r197; +} +{ +cvt.rn.f16.f64 rs19, fd120; +} +mov.b32 r308, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd119; +} +{ +neg.f16 rs21, rs20; +} +mov.b32 r317, {rs21, rs21}; +{ +add.f16x2 r237, %62, %60; +} +{ +add.f16x2 r240, %63, r237; +} +{ +add.f16x2 r243, %64, %61; +} +{ +add.f16x2 r246, %65, r243; +} +{ +add.f16x2 r249, %62, %60; +} +{ +mul.f16x2 r252, r249, r308; +} +{ +add.f16x2 r255, %63, r252; +} +{ +sub.f16x2 r258, %64, %61; +} +{ +mul.f16x2 r261, r258, r317; +} +{ +add.f16x2 r264, r255, r261; +} +{ +add.f16x2 r267, %62, %60; +} +{ +mul.f16x2 r270, r267, r308; +} +{ +add.f16x2 r273, %63, r270; +} +{ +sub.f16x2 r276, %64, %61; +} +{ +mul.f16x2 r279, r276, r317; +} +{ +sub.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %64, %61; +} +{ +mul.f16x2 r288, r285, r308; +} +{ +add.f16x2 r291, %65, r288; +} +{ +sub.f16x2 r294, %62, %60; +} +{ +mul.f16x2 r297, r294, r317; +} +{ +sub.f16x2 r300, r291, r297; +} +{ +add.f16x2 r303, %64, %61; +} +{ +mul.f16x2 r306, r303, r308; +} +{ +add.f16x2 r309, %65, r306; +} +{ +sub.f16x2 r312, %62, %60; +} +{ +mul.f16x2 r315, r312, r317; +} +{ +add.f16x2 r318, r309, r315; +} +{ +cvt.rn.f16.f64 rs23, fd120; +} +mov.b32 r392, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs24, fd119; +} +{ +neg.f16 rs25, rs24; +} +mov.b32 r401, {rs25, rs25}; +{ +add.f16x2 r321, %68, %66; +} +{ +add.f16x2 r324, %69, r321; +} +{ +add.f16x2 r327, %70, %67; +} +{ +add.f16x2 r330, %71, r327; +} +{ +add.f16x2 r333, %68, %66; +} +{ +mul.f16x2 r336, r333, r392; +} +{ +add.f16x2 r339, %69, r336; +} +{ +sub.f16x2 r342, %70, %67; +} +{ +mul.f16x2 r345, r342, r401; +} +{ +add.f16x2 r348, r339, r345; +} +{ +add.f16x2 r351, %68, %66; +} +{ +mul.f16x2 r354, r351, r392; +} +{ +add.f16x2 r357, %69, r354; +} +{ +sub.f16x2 r360, %70, %67; +} +{ +mul.f16x2 r363, r360, r401; +} +{ +sub.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %70, %67; +} +{ +mul.f16x2 r372, r369, r392; +} +{ +add.f16x2 r375, %71, r372; +} +{ +sub.f16x2 r378, 
%68, %66; +} +{ +mul.f16x2 r381, r378, r401; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %70, %67; +} +{ +mul.f16x2 r390, r387, r392; +} +{ +add.f16x2 r393, %71, r390; +} +{ +sub.f16x2 r396, %68, %66; +} +{ +mul.f16x2 r399, r396, r401; +} +{ +add.f16x2 r402, r393, r399; +} +{ +cvt.rn.f16.f64 rs27, fd107; +} +{ +cvt.rn.f16.f64 rs28, fd119; +} +{ +cvt.rn.f16.f64 rs29, fd120; +} +{ +cvt.rn.f16.f64 rs30, fd119; +} +mov.b32 r419, {rs27, rs27}; +{ +mul.f16x2 r405, r348, r419; +} +mov.b32 r416, {rs28, rs28}; +{ +mul.f16x2 r408, r384, r416; +} +{ +sub.f16x2 r411, r405, r408; +} +{ +mul.f16x2 r414, r348, r416; +} +{ +fma.rn.f16x2 r417, r384, r419, r414; +} +mov.b32 r435, {rs29, rs29}; +{ +mul.f16x2 r421, r366, r435; +} +mov.b32 r432, {rs30, rs30}; +mov.f64 fd147, 0d3FEBB67AE8584CAA; +{ +mul.f16x2 r424, r402, r432; +} +{ +sub.f16x2 r427, r421, r424; +} +{ +mul.f16x2 r430, r366, r432; +} +{ +fma.rn.f16x2 r433, r402, r435, r430; +} +{ +add.f16x2 r437, r240, r324; +} +{ +add.f16x2 r440, r246, r330; +} +{ +sub.f16x2 r443, r240, r324; +} +{ +sub.f16x2 r446, r246, r330; +} +{ +add.f16x2 r449, r264, r411; +} +{ +add.f16x2 r452, r300, r417; +} +{ +sub.f16x2 r455, r264, r411; +} +{ +sub.f16x2 r458, r300, r417; +} +{ +add.f16x2 r461, r282, r427; +} +{ +add.f16x2 r464, r318, r433; +} +{ +sub.f16x2 r467, r282, r427; +} +{ +sub.f16x2 r470, r318, r433; +} +{ +cvt.rn.f16.f64 rs37, fd147; +} +{ +cvt.rn.f16.f64 rs38, fd120; +} +{ +cvt.rn.f16.f64 rs39, fd107; +} +{ +cvt.rn.f16.f64 rs40, fd119; +} +{ +cvt.rn.f16.f64 rs43, fd120; +} +{ +cvt.rn.f16.f64 rs44, fd119; +} +{ +cvt.rn.f16.f64 rs45, fd119; +} +{ +cvt.rn.f16.f64 rs46, fd120; +} +mov.b32 r487, {rs37, rs37}; +{ +mul.f16x2 r473, r449, r487; +} +mov.b32 r484, {rs38, rs38}; +{ +mul.f16x2 r476, r452, r484; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r449, r484; +} +{ +fma.rn.f16x2 r485, r452, r487, r482; +} +mov.b32 r503, {rs39, rs39}; +{ +mul.f16x2 r489, r461, r503; +} +mov.b32 r500, {rs40, rs40}; +{ +mul.f16x2 
r492, r464, r500; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r461, r500; +} +{ +fma.rn.f16x2 r501, r464, r503, r498; +} +{ +neg.f16x2 r505, r443; +} +mov.b32 r521, {rs43, rs43}; +{ +mul.f16x2 r507, r455, r521; +} +mov.b32 r518, {rs44, rs44}; +{ +mul.f16x2 r510, r458, r518; +} +{ +sub.f16x2 r513, r507, r510; +} +{ +mul.f16x2 r516, r455, r518; +} +{ +fma.rn.f16x2 r519, r458, r521, r516; +} +mov.b32 r537, {rs45, rs45}; +{ +mul.f16x2 r523, r467, r537; +} +mov.b32 r534, {rs46, rs46}; +{ +mul.f16x2 r526, r470, r534; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r467, r534; +} +{ +fma.rn.f16x2 r535, r470, r537, r532; +} +{ +add.f16x2 r539, r201, r437; +} +{ +add.f16x2 r542, r204, r440; +} +{ +sub.f16x2 r545, r201, r437; +} +{ +sub.f16x2 r548, r204, r440; +} +{ +add.f16x2 r551, r213, r479; +} +{ +add.f16x2 r554, r216, r485; +} +{ +sub.f16x2 r557, r213, r479; +} +{ +sub.f16x2 r560, r216, r485; +} +{ +add.f16x2 r563, r225, r495; +} +{ +add.f16x2 r566, r228, r501; +} +{ +sub.f16x2 r569, r225, r495; +} +{ +sub.f16x2 r572, r228, r501; +} +{ +add.f16x2 r575, r207, r446; +} +{ +add.f16x2 r578, r210, r505; +} +{ +sub.f16x2 r581, r207, r446; +} +{ +sub.f16x2 r584, r210, r505; +} +{ +add.f16x2 r587, r219, r513; +} +{ +add.f16x2 r590, r222, r519; +} +{ +sub.f16x2 r593, r219, r513; +} +{ +sub.f16x2 r596, r222, r519; +} +{ +add.f16x2 r599, r231, r529; +} +{ +add.f16x2 r602, r234, r535; +} +{ +sub.f16x2 r605, r231, r529; +} +{ +sub.f16x2 r608, r234, r535; +} +{ +cvt.rn.f16.f64 rs59, fd120; +} +mov.b32 r682, {rs59, rs59}; +{ +cvt.rn.f16.f64 rs60, fd119; +} +{ +neg.f16 rs61, rs60; +} +mov.b32 r691, {rs61, rs61}; +{ +add.f16x2 r611, %74, %72; +} +{ +add.f16x2 r614, %75, r611; +} +{ +add.f16x2 r617, %76, %73; +} +{ +add.f16x2 r620, %77, r617; +} +{ +add.f16x2 r623, %74, %72; +} +{ +mul.f16x2 r626, r623, r682; +} +{ +add.f16x2 r629, %75, r626; +} +{ +sub.f16x2 r632, %76, %73; +} +{ +mul.f16x2 r635, r632, r691; +} +{ +add.f16x2 r638, r629, r635; +} +{ +add.f16x2 
r641, %74, %72; +} +{ +mul.f16x2 r644, r641, r682; +} +{ +add.f16x2 r647, %75, r644; +} +{ +sub.f16x2 r650, %76, %73; +} +{ +mul.f16x2 r653, r650, r691; +} +{ +sub.f16x2 r656, r647, r653; +} +{ +add.f16x2 r659, %76, %73; +} +{ +mul.f16x2 r662, r659, r682; +} +{ +add.f16x2 r665, %77, r662; +} +{ +sub.f16x2 r668, %74, %72; +} +{ +mul.f16x2 r671, r668, r691; +} +{ +sub.f16x2 r674, r665, r671; +} +{ +add.f16x2 r677, %76, %73; +} +{ +mul.f16x2 r680, r677, r682; +} +{ +add.f16x2 r683, %77, r680; +} +{ +sub.f16x2 r686, %74, %72; +} +{ +mul.f16x2 r689, r686, r691; +} +{ +add.f16x2 r692, r683, r689; +} +{ +cvt.rn.f16.f64 rs63, fd120; +} +mov.b32 r766, {rs63, rs63}; +{ +cvt.rn.f16.f64 rs64, fd119; +} +{ +neg.f16 rs65, rs64; +} +mov.b32 r775, {rs65, rs65}; +{ +add.f16x2 r695, %80, %78; +} +{ +add.f16x2 r698, %81, r695; +} +{ +add.f16x2 r701, %82, %79; +} +{ +add.f16x2 r704, %83, r701; +} +{ +add.f16x2 r707, %80, %78; +} +{ +mul.f16x2 r710, r707, r766; +} +{ +add.f16x2 r713, %81, r710; +} +{ +sub.f16x2 r716, %82, %79; +} +{ +mul.f16x2 r719, r716, r775; +} +{ +add.f16x2 r722, r713, r719; +} +{ +add.f16x2 r725, %80, %78; +} +{ +mul.f16x2 r728, r725, r766; +} +{ +add.f16x2 r731, %81, r728; +} +{ +sub.f16x2 r734, %82, %79; +} +{ +mul.f16x2 r737, r734, r775; +} +{ +sub.f16x2 r740, r731, r737; +} +{ +add.f16x2 r743, %82, %79; +} +{ +mul.f16x2 r746, r743, r766; +} +{ +add.f16x2 r749, %83, r746; +} +{ +sub.f16x2 r752, %80, %78; +} +{ +mul.f16x2 r755, r752, r775; +} +{ +sub.f16x2 r758, r749, r755; +} +{ +add.f16x2 r761, %82, %79; +} +{ +mul.f16x2 r764, r761, r766; +} +{ +add.f16x2 r767, %83, r764; +} +{ +sub.f16x2 r770, %80, %78; +} +{ +mul.f16x2 r773, r770, r775; +} +{ +add.f16x2 r776, r767, r773; +} +{ +cvt.rn.f16.f64 rs67, fd107; +} +{ +cvt.rn.f16.f64 rs68, fd119; +} +{ +cvt.rn.f16.f64 rs69, fd120; +} +{ +cvt.rn.f16.f64 rs70, fd119; +} +mov.b32 r793, {rs67, rs67}; +{ +mul.f16x2 r779, r722, r793; +} +mov.b32 r790, {rs68, rs68}; +{ +mul.f16x2 r782, r758, r790; +} +{ +sub.f16x2 r785, 
r779, r782; +} +{ +mul.f16x2 r788, r722, r790; +} +{ +fma.rn.f16x2 r791, r758, r793, r788; +} +mov.b32 r809, {rs69, rs69}; +{ +mul.f16x2 r795, r740, r809; +} +mov.b32 r806, {rs70, rs70}; +{ +mul.f16x2 r798, r776, r806; +} +{ +sub.f16x2 r801, r795, r798; +} +{ +mul.f16x2 r804, r740, r806; +} +{ +fma.rn.f16x2 r807, r776, r809, r804; +} +{ +add.f16x2 r811, r614, r698; +} +{ +add.f16x2 r814, r620, r704; +} +{ +sub.f16x2 r817, r614, r698; +} +{ +sub.f16x2 r820, r620, r704; +} +{ +add.f16x2 r823, r638, r785; +} +{ +add.f16x2 r826, r674, r791; +} +{ +sub.f16x2 r829, r638, r785; +} +{ +sub.f16x2 r832, r674, r791; +} +{ +add.f16x2 r835, r656, r801; +} +{ +add.f16x2 r838, r692, r807; +} +{ +sub.f16x2 r841, r656, r801; +} +{ +sub.f16x2 r844, r692, r807; +} +{ +cvt.rn.f16.f64 rs77, fd120; +} +mov.b32 r918, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs78, fd119; +} +{ +neg.f16 rs79, rs78; +} +mov.b32 r927, {rs79, rs79}; +{ +add.f16x2 r847, %89, %87; +} +{ +add.f16x2 r850, %84, r847; +} +{ +add.f16x2 r853, %85, %88; +} +{ +add.f16x2 r856, %86, r853; +} +{ +add.f16x2 r859, %89, %87; +} +{ +mul.f16x2 r862, r859, r918; +} +{ +add.f16x2 r865, %84, r862; +} +{ +sub.f16x2 r868, %85, %88; +} +{ +mul.f16x2 r871, r868, r927; +} +{ +add.f16x2 r874, r865, r871; +} +{ +add.f16x2 r877, %89, %87; +} +{ +mul.f16x2 r880, r877, r918; +} +{ +add.f16x2 r883, %84, r880; +} +{ +sub.f16x2 r886, %85, %88; +} +{ +mul.f16x2 r889, r886, r927; +} +{ +sub.f16x2 r892, r883, r889; +} +{ +add.f16x2 r895, %85, %88; +} +{ +mul.f16x2 r898, r895, r918; +} +{ +add.f16x2 r901, %86, r898; +} +{ +sub.f16x2 r904, %89, %87; +} +{ +mul.f16x2 r907, r904, r927; +} +{ +sub.f16x2 r910, r901, r907; +} +{ +add.f16x2 r913, %85, %88; +} +{ +mul.f16x2 r916, r913, r918; +} +{ +add.f16x2 r919, %86, r916; +} +{ +sub.f16x2 r922, %89, %87; +} +{ +mul.f16x2 r925, r922, r927; +} +{ +add.f16x2 r928, r919, r925; +} +{ +cvt.rn.f16.f64 rs81, fd120; +} +mov.b32 r1002, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs82, fd119; +} +{ +neg.f16 rs83, rs82; +} 
+mov.b32 r1011, {rs83, rs83}; +{ +add.f16x2 r931, %95, %93; +} +{ +add.f16x2 r934, %90, r931; +} +{ +add.f16x2 r937, %91, %94; +} +{ +add.f16x2 r940, %92, r937; +} +{ +add.f16x2 r943, %95, %93; +} +{ +mul.f16x2 r946, r943, r1002; +} +{ +add.f16x2 r949, %90, r946; +} +{ +sub.f16x2 r952, %91, %94; +} +{ +mul.f16x2 r955, r952, r1011; +} +{ +add.f16x2 r958, r949, r955; +} +{ +add.f16x2 r961, %95, %93; +} +{ +mul.f16x2 r964, r961, r1002; +} +{ +add.f16x2 r967, %90, r964; +} +{ +sub.f16x2 r970, %91, %94; +} +{ +mul.f16x2 r973, r970, r1011; +} +{ +sub.f16x2 r976, r967, r973; +} +{ +add.f16x2 r979, %91, %94; +} +{ +mul.f16x2 r982, r979, r1002; +} +{ +add.f16x2 r985, %92, r982; +} +{ +sub.f16x2 r988, %95, %93; +} +{ +mul.f16x2 r991, r988, r1011; +} +{ +sub.f16x2 r994, r985, r991; +} +{ +add.f16x2 r997, %91, %94; +} +{ +mul.f16x2 r1000, r997, r1002; +} +{ +add.f16x2 r1003, %92, r1000; +} +{ +sub.f16x2 r1006, %95, %93; +} +{ +mul.f16x2 r1009, r1006, r1011; +} +{ +add.f16x2 r1012, r1003, r1009; +} +{ +cvt.rn.f16.f64 rs85, fd107; +} +{ +cvt.rn.f16.f64 rs86, fd119; +} +{ +cvt.rn.f16.f64 rs87, fd120; +} +{ +cvt.rn.f16.f64 rs88, fd119; +} +mov.b32 r1029, {rs85, rs85}; +{ +mul.f16x2 r1015, r958, r1029; +} +mov.b32 r1026, {rs86, rs86}; +{ +mul.f16x2 r1018, r994, r1026; +} +{ +sub.f16x2 r1021, r1015, r1018; +} +{ +mul.f16x2 r1024, r958, r1026; +} +{ +fma.rn.f16x2 r1027, r994, r1029, r1024; +} +mov.b32 r1045, {rs87, rs87}; +{ +mul.f16x2 r1031, r976, r1045; +} +mov.b32 r1042, {rs88, rs88}; +{ +mul.f16x2 r1034, r1012, r1042; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r976, r1042; +} +{ +fma.rn.f16x2 r1043, r1012, r1045, r1040; +} +{ +add.f16x2 r1047, r850, r934; +} +{ +add.f16x2 r1050, r856, r940; +} +{ +sub.f16x2 r1053, r850, r934; +} +{ +sub.f16x2 r1056, r856, r940; +} +{ +add.f16x2 r1059, r874, r1021; +} +{ +add.f16x2 r1062, r910, r1027; +} +{ +sub.f16x2 r1065, r874, r1021; +} +{ +sub.f16x2 r1068, r910, r1027; +} +{ +add.f16x2 r1071, r892, r1037; +} +{ +add.f16x2 
r1074, r928, r1043; +} +{ +sub.f16x2 r1077, r892, r1037; +} +{ +sub.f16x2 r1080, r928, r1043; +} +{ +cvt.rn.f16.f64 rs95, fd147; +} +{ +cvt.rn.f16.f64 rs96, fd120; +} +{ +cvt.rn.f16.f64 rs97, fd107; +} +{ +cvt.rn.f16.f64 rs98, fd119; +} +{ +cvt.rn.f16.f64 rs101, fd120; +} +{ +cvt.rn.f16.f64 rs102, fd119; +} +{ +cvt.rn.f16.f64 rs103, fd119; +} +{ +cvt.rn.f16.f64 rs104, fd120; +} +mov.b32 r1097, {rs95, rs95}; +{ +mul.f16x2 r1083, r1059, r1097; +} +mov.b32 r1094, {rs96, rs96}; +{ +mul.f16x2 r1086, r1062, r1094; +} +{ +sub.f16x2 r1089, r1083, r1086; +} +{ +mul.f16x2 r1092, r1059, r1094; +} +{ +fma.rn.f16x2 r1095, r1062, r1097, r1092; +} +mov.b32 r1113, {rs97, rs97}; +{ +mul.f16x2 r1099, r1071, r1113; +} +mov.b32 r1110, {rs98, rs98}; +{ +mul.f16x2 r1102, r1074, r1110; +} +{ +sub.f16x2 r1105, r1099, r1102; +} +{ +mul.f16x2 r1108, r1071, r1110; +} +{ +fma.rn.f16x2 r1111, r1074, r1113, r1108; +} +{ +neg.f16x2 r1115, r1053; +} +mov.b32 r1131, {rs101, rs101}; +{ +mul.f16x2 r1117, r1065, r1131; +} +mov.b32 r1128, {rs102, rs102}; +{ +mul.f16x2 r1120, r1068, r1128; +} +{ +sub.f16x2 r1123, r1117, r1120; +} +{ +mul.f16x2 r1126, r1065, r1128; +} +{ +fma.rn.f16x2 r1129, r1068, r1131, r1126; +} +mov.b32 r1147, {rs103, rs103}; +{ +mul.f16x2 r1133, r1077, r1147; +} +mov.b32 r1144, {rs104, rs104}; +{ +mul.f16x2 r1136, r1080, r1144; +} +{ +sub.f16x2 r1139, r1133, r1136; +} +{ +mul.f16x2 r1142, r1077, r1144; +} +{ +fma.rn.f16x2 r1145, r1080, r1147, r1142; +} +{ +add.f16x2 r1149, r811, r1047; +} +{ +add.f16x2 r1152, r814, r1050; +} +{ +sub.f16x2 r1155, r811, r1047; +} +{ +sub.f16x2 r1158, r814, r1050; +} +{ +add.f16x2 r1161, r823, r1089; +} +{ +add.f16x2 r1164, r826, r1095; +} +{ +sub.f16x2 r1167, r823, r1089; +} +{ +sub.f16x2 r1170, r826, r1095; +} +{ +add.f16x2 r1173, r835, r1105; +} +{ +add.f16x2 r1176, r838, r1111; +} +{ +sub.f16x2 r1179, r835, r1105; +} +{ +sub.f16x2 r1182, r838, r1111; +} +{ +add.f16x2 r1185, r817, r1056; +} +{ +add.f16x2 r1188, r820, r1115; +} +{ +sub.f16x2 r1191, 
r817, r1056; +} +{ +sub.f16x2 r1194, r820, r1115; +} +{ +add.f16x2 r1197, r829, r1123; +} +{ +add.f16x2 r1200, r832, r1129; +} +{ +sub.f16x2 r1203, r829, r1123; +} +{ +sub.f16x2 r1206, r832, r1129; +} +{ +add.f16x2 r1209, r841, r1139; +} +{ +add.f16x2 r1212, r844, r1145; +} +{ +sub.f16x2 r1215, r841, r1139; +} +{ +sub.f16x2 r1218, r844, r1145; +} +mov.f64 fd101, 0d3FEEE8DD4748BF15; +{ +cvt.rn.f16.f64 rs117, fd101; +} +mov.f64 fd122, 0dBFD0907DC1930690; +{ +cvt.rn.f16.f64 rs118, fd122; +} +{ +cvt.rn.f16.f64 rs119, fd147; +} +{ +cvt.rn.f16.f64 rs120, fd120; +} +mov.f64 fd105, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs121, fd105; +} +mov.f64 fd118, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs122, fd118; +} +{ +cvt.rn.f16.f64 rs123, fd107; +} +{ +cvt.rn.f16.f64 rs124, fd119; +} +mov.f64 fd109, 0d3FD0907DC1930690; +{ +cvt.rn.f16.f64 rs125, fd109; +} +mov.f64 fd121, 0dBFEEE8DD4748BF15; +{ +cvt.rn.f16.f64 rs126, fd121; +} +{ +cvt.rn.f16.f64 rs129, fd122; +} +{ +cvt.rn.f16.f64 rs130, fd121; +} +{ +cvt.rn.f16.f64 rs131, fd120; +} +{ +cvt.rn.f16.f64 rs132, fd119; +} +{ +cvt.rn.f16.f64 rs133, fd118; +} +{ +cvt.rn.f16.f64 rs134, fd118; +} +{ +cvt.rn.f16.f64 rs135, fd119; +} +{ +cvt.rn.f16.f64 rs136, fd120; +} +{ +cvt.rn.f16.f64 rs137, fd121; +} +{ +cvt.rn.f16.f64 rs138, fd122; +} +mov.b32 r1235, {rs117, rs117}; +{ +mul.f16x2 r1221, r1161, r1235; +} +mov.b32 r1232, {rs118, rs118}; +{ +mul.f16x2 r1224, r1164, r1232; +} +{ +sub.f16x2 r1227, r1221, r1224; +} +{ +mul.f16x2 r1230, r1161, r1232; +} +{ +fma.rn.f16x2 r1233, r1164, r1235, r1230; +} +mov.b32 r1251, {rs119, rs119}; +{ +mul.f16x2 r1237, r1173, r1251; +} +mov.b32 r1248, {rs120, rs120}; +{ +mul.f16x2 r1240, r1176, r1248; +} +{ +sub.f16x2 r1243, r1237, r1240; +} +{ +mul.f16x2 r1246, r1173, r1248; +} +{ +fma.rn.f16x2 r1249, r1176, r1251, r1246; +} +mov.b32 r1267, {rs121, rs121}; +{ +mul.f16x2 r1253, r1185, r1267; +} +mov.b32 r1264, {rs122, rs122}; +{ +mul.f16x2 r1256, r1188, r1264; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ 
+mul.f16x2 r1262, r1185, r1264; +} +{ +fma.rn.f16x2 r1265, r1188, r1267, r1262; +} +mov.b32 r1283, {rs123, rs123}; +{ +mul.f16x2 r1269, r1197, r1283; +} +mov.b32 r1280, {rs124, rs124}; +{ +mul.f16x2 r1272, r1200, r1280; +} +{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1197, r1280; +} +{ +fma.rn.f16x2 r1281, r1200, r1283, r1278; +} +mov.b32 r1299, {rs125, rs125}; +{ +mul.f16x2 r1285, r1209, r1299; +} +mov.b32 r1296, {rs126, rs126}; +{ +mul.f16x2 r1288, r1212, r1296; +} +{ +sub.f16x2 r1291, r1285, r1288; +} +{ +mul.f16x2 r1294, r1209, r1296; +} +{ +fma.rn.f16x2 r1297, r1212, r1299, r1294; +} +{ +neg.f16x2 r1301, r1155; +} +mov.b32 r1317, {rs129, rs129}; +{ +mul.f16x2 r1303, r1167, r1317; +} +mov.b32 r1314, {rs130, rs130}; +{ +mul.f16x2 r1306, r1170, r1314; +} +{ +sub.f16x2 r1309, r1303, r1306; +} +{ +mul.f16x2 r1312, r1167, r1314; +} +{ +fma.rn.f16x2 r1315, r1170, r1317, r1312; +} +mov.b32 r1333, {rs131, rs131}; +{ +mul.f16x2 r1319, r1179, r1333; +} +mov.b32 r1330, {rs132, rs132}; +{ +mul.f16x2 r1322, r1182, r1330; +} +{ +sub.f16x2 r1325, r1319, r1322; +} +{ +mul.f16x2 r1328, r1179, r1330; +} +{ +fma.rn.f16x2 r1331, r1182, r1333, r1328; +} +mov.b32 r1349, {rs133, rs133}; +{ +mul.f16x2 r1335, r1191, r1349; +} +mov.b32 r1346, {rs134, rs134}; +{ +mul.f16x2 r1338, r1194, r1346; +} +{ +sub.f16x2 r1341, r1335, r1338; +} +{ +mul.f16x2 r1344, r1191, r1346; +} +{ +fma.rn.f16x2 r1347, r1194, r1349, r1344; +} +mov.b32 r1365, {rs135, rs135}; +{ +mul.f16x2 r1351, r1203, r1365; +} +mov.b32 r1362, {rs136, rs136}; +{ +mul.f16x2 r1354, r1206, r1362; +} +{ +sub.f16x2 r1357, r1351, r1354; +} +{ +mul.f16x2 r1360, r1203, r1362; +} +{ +fma.rn.f16x2 r1363, r1206, r1365, r1360; +} +mov.b32 r1381, {rs137, rs137}; +{ +mul.f16x2 r1367, r1215, r1381; +} +mov.b32 r1378, {rs138, rs138}; +{ +mul.f16x2 r1370, r1218, r1378; +} +{ +sub.f16x2 r1373, r1367, r1370; +} +{ +mul.f16x2 r1376, r1215, r1378; +} +{ +fma.rn.f16x2 r1379, r1218, r1381, r1376; +} +{ +add.f16x2 %0, r539, r1149; +} +{ 
+add.f16x2 %1, r542, r1152; +} +{ +sub.f16x2 %24, r539, r1149; +} +{ +sub.f16x2 %25, r542, r1152; +} +{ +add.f16x2 %2, r551, r1227; +} +{ +add.f16x2 %3, r554, r1233; +} +{ +sub.f16x2 %26, r551, r1227; +} +{ +sub.f16x2 %27, r554, r1233; +} +{ +add.f16x2 %4, r563, r1243; +} +{ +add.f16x2 %5, r566, r1249; +} +{ +sub.f16x2 %28, r563, r1243; +} +{ +sub.f16x2 %29, r566, r1249; +} +{ +add.f16x2 %6, r575, r1259; +} +{ +add.f16x2 %7, r578, r1265; +} +{ +sub.f16x2 %30, r575, r1259; +} +{ +sub.f16x2 %31, r578, r1265; +} +{ +add.f16x2 %8, r587, r1275; +} +{ +add.f16x2 %9, r590, r1281; +} +{ +sub.f16x2 %32, r587, r1275; +} +{ +sub.f16x2 %33, r590, r1281; +} +{ +add.f16x2 %10, r599, r1291; +} +{ +add.f16x2 %11, r602, r1297; +} +{ +sub.f16x2 %34, r599, r1291; +} +{ +sub.f16x2 %35, r602, r1297; +} +{ +add.f16x2 %12, r545, r1158; +} +{ +add.f16x2 %13, r548, r1301; +} +{ +sub.f16x2 %36, r545, r1158; +} +{ +sub.f16x2 %37, r548, r1301; +} +{ +add.f16x2 %14, r557, r1309; +} +{ +add.f16x2 %15, r560, r1315; +} +{ +sub.f16x2 %38, r557, r1309; +} +{ +sub.f16x2 %39, r560, r1315; +} +{ +add.f16x2 %16, r569, r1325; +} +{ +add.f16x2 %17, r572, r1331; +} +{ +sub.f16x2 %40, r569, r1325; +} +{ +sub.f16x2 %41, r572, r1331; +} +{ +add.f16x2 %18, r581, r1341; +} +{ +add.f16x2 %19, r584, r1347; +} +{ +sub.f16x2 %42, r581, r1341; +} +{ +sub.f16x2 %43, r584, r1347; +} +{ +add.f16x2 %20, r593, r1357; +} +{ +add.f16x2 %21, r596, r1363; +} +{ +sub.f16x2 %44, r593, r1357; +} +{ +sub.f16x2 %45, r596, r1363; +} +{ +add.f16x2 %22, r605, r1373; +} +{ +add.f16x2 %23, r608, r1379; +} +{ +sub.f16x2 %46, r605, r1373; +} +{ +sub.f16x2 %47, r608, r1379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[6].y)), 
"r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[15].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..c0181aeb6f9a8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp16_inv.hpp.inc @@ -0,0 +1,1803 @@ +#ifndef CUFFTDX_FFT_24_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_24_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<958, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<147>; +.reg .b32 r<1575>; +.reg .f64 fd<147>; +.reg .b64 rd<3>; +mov.f64 fd115, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd115; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd119, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd119; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %50, %48; +} +{ +add.f16x2 r4, %51, r1; +} +{ +add.f16x2 r7, %52, %49; +} +{ +add.f16x2 r10, %53, r7; +} +{ +add.f16x2 r13, %50, %48; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %51, r16; 
+} +{ +sub.f16x2 r22, %52, %49; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %50, %48; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %51, r34; +} +{ +sub.f16x2 r40, %52, %49; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %52, %49; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %53, r52; +} +{ +sub.f16x2 r58, %50, %48; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %52, %49; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %53, r70; +} +{ +sub.f16x2 r76, %50, %48; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd115; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd119; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %55, %59; +} +{ +add.f16x2 r88, %56, r85; +} +{ +add.f16x2 r91, %57, %54; +} +{ +add.f16x2 r94, %58, r91; +} +{ +add.f16x2 r97, %55, %59; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %56, r100; +} +{ +sub.f16x2 r106, %57, %54; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %55, %59; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %56, r118; +} +{ +sub.f16x2 r124, %57, %54; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %57, %54; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %58, r136; +} +{ +sub.f16x2 r142, %55, %59; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %57, %54; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %58, r154; +} +{ +sub.f16x2 r160, %55, %59; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd120, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs5, fd120; +} +mov.f64 fd116, 0d3FEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs6, fd116; +} +{ +cvt.rn.f16.f64 rs7, fd115; +} +{ +cvt.rn.f16.f64 rs8, fd116; +} +mov.b32 r183, {rs5, rs5}; +{ +mul.f16x2 r169, 
r112, r183; +} +mov.b32 r180, {rs6, rs6}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs7, rs7}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs8, rs8}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ +fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 r201, r4, r88; +} +{ +add.f16x2 r204, r10, r94; +} +{ +sub.f16x2 r207, r4, r88; +} +{ +sub.f16x2 r210, r10, r94; +} +{ +add.f16x2 r213, r28, r175; +} +{ +add.f16x2 r216, r64, r181; +} +{ +sub.f16x2 r219, r28, r175; +} +{ +sub.f16x2 r222, r64, r181; +} +{ +add.f16x2 r225, r46, r191; +} +{ +add.f16x2 r228, r82, r197; +} +{ +sub.f16x2 r231, r46, r191; +} +{ +sub.f16x2 r234, r82, r197; +} +{ +cvt.rn.f16.f64 rs15, fd115; +} +mov.b32 r308, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd119; +} +mov.b32 r317, {rs16, rs16}; +{ +add.f16x2 r237, %61, %65; +} +{ +add.f16x2 r240, %62, r237; +} +{ +add.f16x2 r243, %63, %60; +} +{ +add.f16x2 r246, %64, r243; +} +{ +add.f16x2 r249, %61, %65; +} +{ +mul.f16x2 r252, r249, r308; +} +{ +add.f16x2 r255, %62, r252; +} +{ +sub.f16x2 r258, %63, %60; +} +{ +mul.f16x2 r261, r258, r317; +} +{ +add.f16x2 r264, r255, r261; +} +{ +add.f16x2 r267, %61, %65; +} +{ +mul.f16x2 r270, r267, r308; +} +{ +add.f16x2 r273, %62, r270; +} +{ +sub.f16x2 r276, %63, %60; +} +{ +mul.f16x2 r279, r276, r317; +} +{ +sub.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %63, %60; +} +{ +mul.f16x2 r288, r285, r308; +} +{ +add.f16x2 r291, %64, r288; +} +{ +sub.f16x2 r294, %61, %65; +} +{ +mul.f16x2 r297, r294, r317; +} +{ +sub.f16x2 r300, r291, r297; +} +{ +add.f16x2 r303, %63, %60; +} +{ +mul.f16x2 r306, r303, r308; +} +{ +add.f16x2 r309, %64, r306; +} +{ +sub.f16x2 r312, %61, %65; +} +{ +mul.f16x2 r315, r312, r317; +} +{ +add.f16x2 r318, r309, r315; +} +{ +cvt.rn.f16.f64 rs17, fd115; +} +mov.b32 r392, {rs17, rs17}; +{ +cvt.rn.f16.f64 
rs18, fd119; +} +mov.b32 r401, {rs18, rs18}; +{ +add.f16x2 r321, %70, %68; +} +{ +add.f16x2 r324, %71, r321; +} +{ +add.f16x2 r327, %66, %69; +} +{ +add.f16x2 r330, %67, r327; +} +{ +add.f16x2 r333, %70, %68; +} +{ +mul.f16x2 r336, r333, r392; +} +{ +add.f16x2 r339, %71, r336; +} +{ +sub.f16x2 r342, %66, %69; +} +{ +mul.f16x2 r345, r342, r401; +} +{ +add.f16x2 r348, r339, r345; +} +{ +add.f16x2 r351, %70, %68; +} +{ +mul.f16x2 r354, r351, r392; +} +{ +add.f16x2 r357, %71, r354; +} +{ +sub.f16x2 r360, %66, %69; +} +{ +mul.f16x2 r363, r360, r401; +} +{ +sub.f16x2 r366, r357, r363; +} +{ +add.f16x2 r369, %66, %69; +} +{ +mul.f16x2 r372, r369, r392; +} +{ +add.f16x2 r375, %67, r372; +} +{ +sub.f16x2 r378, %70, %68; +} +{ +mul.f16x2 r381, r378, r401; +} +{ +sub.f16x2 r384, r375, r381; +} +{ +add.f16x2 r387, %66, %69; +} +{ +mul.f16x2 r390, r387, r392; +} +{ +add.f16x2 r393, %67, r390; +} +{ +sub.f16x2 r396, %70, %68; +} +{ +mul.f16x2 r399, r396, r401; +} +{ +add.f16x2 r402, r393, r399; +} +{ +cvt.rn.f16.f64 rs19, fd120; +} +{ +cvt.rn.f16.f64 rs20, fd116; +} +{ +cvt.rn.f16.f64 rs21, fd115; +} +{ +cvt.rn.f16.f64 rs22, fd116; +} +mov.b32 r419, {rs19, rs19}; +{ +mul.f16x2 r405, r348, r419; +} +mov.b32 r416, {rs20, rs20}; +{ +mul.f16x2 r408, r384, r416; +} +{ +sub.f16x2 r411, r405, r408; +} +{ +mul.f16x2 r414, r348, r416; +} +{ +fma.rn.f16x2 r417, r384, r419, r414; +} +mov.b32 r435, {rs21, rs21}; +{ +mul.f16x2 r421, r366, r435; +} +mov.b32 r432, {rs22, rs22}; +{ +mul.f16x2 r424, r402, r432; +} +{ +sub.f16x2 r427, r421, r424; +} +{ +mul.f16x2 r430, r366, r432; +} +{ +fma.rn.f16x2 r433, r402, r435, r430; +} +{ +add.f16x2 r437, r240, r324; +} +{ +add.f16x2 r440, r246, r330; +} +{ +sub.f16x2 r443, r240, r324; +} +{ +sub.f16x2 r446, r246, r330; +} +{ +add.f16x2 r449, r264, r411; +} +{ +add.f16x2 r452, r300, r417; +} +{ +sub.f16x2 r455, r264, r411; +} +{ +sub.f16x2 r458, r300, r417; +} +{ +add.f16x2 r461, r282, r427; +} +{ +add.f16x2 r464, r318, r433; +} +{ +sub.f16x2 r467, r282, 
r427; +} +{ +sub.f16x2 r470, r318, r433; +} +{ +cvt.rn.f16.f64 rs29, fd116; +} +{ +cvt.rn.f16.f64 rs30, fd120; +} +{ +cvt.rn.f16.f64 rs31, fd120; +} +{ +cvt.rn.f16.f64 rs32, fd116; +} +{ +cvt.rn.f16.f64 rs35, fd115; +} +{ +cvt.rn.f16.f64 rs36, fd116; +} +{ +cvt.rn.f16.f64 rs37, fd119; +} +{ +cvt.rn.f16.f64 rs38, fd120; +} +mov.b32 r487, {rs29, rs29}; +{ +mul.f16x2 r473, r449, r487; +} +mov.b32 r484, {rs30, rs30}; +{ +mul.f16x2 r476, r452, r484; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r449, r484; +} +{ +fma.rn.f16x2 r485, r452, r487, r482; +} +mov.b32 r503, {rs31, rs31}; +{ +mul.f16x2 r489, r461, r503; +} +mov.b32 r500, {rs32, rs32}; +{ +mul.f16x2 r492, r464, r500; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r461, r500; +} +{ +fma.rn.f16x2 r501, r464, r503, r498; +} +{ +neg.f16x2 r505, r446; +} +mov.b32 r521, {rs35, rs35}; +{ +mul.f16x2 r507, r455, r521; +} +mov.b32 r518, {rs36, rs36}; +{ +mul.f16x2 r510, r458, r518; +} +{ +sub.f16x2 r513, r507, r510; +} +{ +mul.f16x2 r516, r455, r518; +} +{ +fma.rn.f16x2 r519, r458, r521, r516; +} +mov.b32 r537, {rs37, rs37}; +{ +mul.f16x2 r523, r467, r537; +} +mov.b32 r534, {rs38, rs38}; +{ +mul.f16x2 r526, r470, r534; +} +{ +sub.f16x2 r529, r523, r526; +} +{ +mul.f16x2 r532, r467, r534; +} +{ +fma.rn.f16x2 r535, r470, r537, r532; +} +{ +add.f16x2 r539, r201, r437; +} +{ +add.f16x2 r542, r204, r440; +} +{ +sub.f16x2 r545, r201, r437; +} +{ +sub.f16x2 r548, r204, r440; +} +{ +add.f16x2 r551, r213, r479; +} +{ +add.f16x2 r554, r216, r485; +} +{ +sub.f16x2 r557, r213, r479; +} +{ +sub.f16x2 r560, r216, r485; +} +{ +add.f16x2 r563, r225, r495; +} +{ +add.f16x2 r566, r228, r501; +} +{ +sub.f16x2 r569, r225, r495; +} +{ +sub.f16x2 r572, r228, r501; +} +{ +add.f16x2 r575, r207, r505; +} +{ +add.f16x2 r578, r210, r443; +} +{ +sub.f16x2 r581, r207, r505; +} +{ +sub.f16x2 r584, r210, r443; +} +{ +add.f16x2 r587, r219, r513; +} +{ +add.f16x2 r590, r222, r519; +} +{ +sub.f16x2 r593, r219, r513; +} +{ +sub.f16x2 
r596, r222, r519; +} +{ +add.f16x2 r599, r231, r529; +} +{ +add.f16x2 r602, r234, r535; +} +{ +sub.f16x2 r605, r231, r529; +} +{ +sub.f16x2 r608, r234, r535; +} +{ +cvt.rn.f16.f64 rs51, fd115; +} +mov.b32 r682, {rs51, rs51}; +{ +cvt.rn.f16.f64 rs52, fd119; +} +mov.b32 r691, {rs52, rs52}; +{ +add.f16x2 r611, %74, %72; +} +{ +add.f16x2 r614, %75, r611; +} +{ +add.f16x2 r617, %76, %73; +} +{ +add.f16x2 r620, %77, r617; +} +{ +add.f16x2 r623, %74, %72; +} +{ +mul.f16x2 r626, r623, r682; +} +{ +add.f16x2 r629, %75, r626; +} +{ +sub.f16x2 r632, %76, %73; +} +{ +mul.f16x2 r635, r632, r691; +} +{ +add.f16x2 r638, r629, r635; +} +{ +add.f16x2 r641, %74, %72; +} +{ +mul.f16x2 r644, r641, r682; +} +{ +add.f16x2 r647, %75, r644; +} +{ +sub.f16x2 r650, %76, %73; +} +{ +mul.f16x2 r653, r650, r691; +} +{ +sub.f16x2 r656, r647, r653; +} +{ +add.f16x2 r659, %76, %73; +} +{ +mul.f16x2 r662, r659, r682; +} +{ +add.f16x2 r665, %77, r662; +} +{ +sub.f16x2 r668, %74, %72; +} +{ +mul.f16x2 r671, r668, r691; +} +{ +sub.f16x2 r674, r665, r671; +} +{ +add.f16x2 r677, %76, %73; +} +{ +mul.f16x2 r680, r677, r682; +} +{ +add.f16x2 r683, %77, r680; +} +{ +sub.f16x2 r686, %74, %72; +} +{ +mul.f16x2 r689, r686, r691; +} +{ +add.f16x2 r692, r683, r689; +} +{ +cvt.rn.f16.f64 rs53, fd115; +} +mov.b32 r766, {rs53, rs53}; +{ +cvt.rn.f16.f64 rs54, fd119; +} +mov.b32 r775, {rs54, rs54}; +{ +add.f16x2 r695, %83, %81; +} +{ +add.f16x2 r698, %78, r695; +} +{ +add.f16x2 r701, %79, %82; +} +{ +add.f16x2 r704, %80, r701; +} +{ +add.f16x2 r707, %83, %81; +} +{ +mul.f16x2 r710, r707, r766; +} +{ +add.f16x2 r713, %78, r710; +} +{ +sub.f16x2 r716, %79, %82; +} +{ +mul.f16x2 r719, r716, r775; +} +{ +add.f16x2 r722, r713, r719; +} +{ +add.f16x2 r725, %83, %81; +} +{ +mul.f16x2 r728, r725, r766; +} +{ +add.f16x2 r731, %78, r728; +} +{ +sub.f16x2 r734, %79, %82; +} +{ +mul.f16x2 r737, r734, r775; +} +{ +sub.f16x2 r740, r731, r737; +} +{ +add.f16x2 r743, %79, %82; +} +{ +mul.f16x2 r746, r743, r766; +} +{ +add.f16x2 
r749, %80, r746; +} +{ +sub.f16x2 r752, %83, %81; +} +{ +mul.f16x2 r755, r752, r775; +} +{ +sub.f16x2 r758, r749, r755; +} +{ +add.f16x2 r761, %79, %82; +} +{ +mul.f16x2 r764, r761, r766; +} +{ +add.f16x2 r767, %80, r764; +} +{ +sub.f16x2 r770, %83, %81; +} +{ +mul.f16x2 r773, r770, r775; +} +{ +add.f16x2 r776, r767, r773; +} +{ +cvt.rn.f16.f64 rs55, fd120; +} +{ +cvt.rn.f16.f64 rs56, fd116; +} +{ +cvt.rn.f16.f64 rs57, fd115; +} +{ +cvt.rn.f16.f64 rs58, fd116; +} +mov.b32 r793, {rs55, rs55}; +{ +mul.f16x2 r779, r722, r793; +} +mov.b32 r790, {rs56, rs56}; +{ +mul.f16x2 r782, r758, r790; +} +{ +sub.f16x2 r785, r779, r782; +} +{ +mul.f16x2 r788, r722, r790; +} +{ +fma.rn.f16x2 r791, r758, r793, r788; +} +mov.b32 r809, {rs57, rs57}; +{ +mul.f16x2 r795, r740, r809; +} +mov.b32 r806, {rs58, rs58}; +{ +mul.f16x2 r798, r776, r806; +} +{ +sub.f16x2 r801, r795, r798; +} +{ +mul.f16x2 r804, r740, r806; +} +{ +fma.rn.f16x2 r807, r776, r809, r804; +} +{ +add.f16x2 r811, r614, r698; +} +{ +add.f16x2 r814, r620, r704; +} +{ +sub.f16x2 r817, r614, r698; +} +{ +sub.f16x2 r820, r620, r704; +} +{ +add.f16x2 r823, r638, r785; +} +{ +add.f16x2 r826, r674, r791; +} +{ +sub.f16x2 r829, r638, r785; +} +{ +sub.f16x2 r832, r674, r791; +} +{ +add.f16x2 r835, r656, r801; +} +{ +add.f16x2 r838, r692, r807; +} +{ +sub.f16x2 r841, r656, r801; +} +{ +sub.f16x2 r844, r692, r807; +} +{ +cvt.rn.f16.f64 rs65, fd115; +} +mov.b32 r918, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs66, fd119; +} +mov.b32 r927, {rs66, rs66}; +{ +add.f16x2 r847, %84, %88; +} +{ +add.f16x2 r850, %85, r847; +} +{ +add.f16x2 r853, %86, %89; +} +{ +add.f16x2 r856, %87, r853; +} +{ +add.f16x2 r859, %84, %88; +} +{ +mul.f16x2 r862, r859, r918; +} +{ +add.f16x2 r865, %85, r862; +} +{ +sub.f16x2 r868, %86, %89; +} +{ +mul.f16x2 r871, r868, r927; +} +{ +add.f16x2 r874, r865, r871; +} +{ +add.f16x2 r877, %84, %88; +} +{ +mul.f16x2 r880, r877, r918; +} +{ +add.f16x2 r883, %85, r880; +} +{ +sub.f16x2 r886, %86, %89; +} +{ +mul.f16x2 r889, r886, 
r927; +} +{ +sub.f16x2 r892, r883, r889; +} +{ +add.f16x2 r895, %86, %89; +} +{ +mul.f16x2 r898, r895, r918; +} +{ +add.f16x2 r901, %87, r898; +} +{ +sub.f16x2 r904, %84, %88; +} +{ +mul.f16x2 r907, r904, r927; +} +{ +sub.f16x2 r910, r901, r907; +} +{ +add.f16x2 r913, %86, %89; +} +{ +mul.f16x2 r916, r913, r918; +} +{ +add.f16x2 r919, %87, r916; +} +{ +sub.f16x2 r922, %84, %88; +} +{ +mul.f16x2 r925, r922, r927; +} +{ +add.f16x2 r928, r919, r925; +} +{ +cvt.rn.f16.f64 rs67, fd115; +} +mov.b32 r1002, {rs67, rs67}; +{ +cvt.rn.f16.f64 rs68, fd119; +} +mov.b32 r1011, {rs68, rs68}; +{ +add.f16x2 r931, %92, %90; +} +{ +add.f16x2 r934, %93, r931; +} +{ +add.f16x2 r937, %94, %91; +} +{ +add.f16x2 r940, %95, r937; +} +{ +add.f16x2 r943, %92, %90; +} +{ +mul.f16x2 r946, r943, r1002; +} +{ +add.f16x2 r949, %93, r946; +} +{ +sub.f16x2 r952, %94, %91; +} +{ +mul.f16x2 r955, r952, r1011; +} +{ +add.f16x2 r958, r949, r955; +} +{ +add.f16x2 r961, %92, %90; +} +{ +mul.f16x2 r964, r961, r1002; +} +{ +add.f16x2 r967, %93, r964; +} +{ +sub.f16x2 r970, %94, %91; +} +{ +mul.f16x2 r973, r970, r1011; +} +{ +sub.f16x2 r976, r967, r973; +} +{ +add.f16x2 r979, %94, %91; +} +{ +mul.f16x2 r982, r979, r1002; +} +{ +add.f16x2 r985, %95, r982; +} +{ +sub.f16x2 r988, %92, %90; +} +{ +mul.f16x2 r991, r988, r1011; +} +{ +sub.f16x2 r994, r985, r991; +} +{ +add.f16x2 r997, %94, %91; +} +{ +mul.f16x2 r1000, r997, r1002; +} +{ +add.f16x2 r1003, %95, r1000; +} +{ +sub.f16x2 r1006, %92, %90; +} +{ +mul.f16x2 r1009, r1006, r1011; +} +{ +add.f16x2 r1012, r1003, r1009; +} +{ +cvt.rn.f16.f64 rs69, fd120; +} +{ +cvt.rn.f16.f64 rs70, fd116; +} +{ +cvt.rn.f16.f64 rs71, fd115; +} +{ +cvt.rn.f16.f64 rs72, fd116; +} +mov.b32 r1029, {rs69, rs69}; +{ +mul.f16x2 r1015, r958, r1029; +} +mov.b32 r1026, {rs70, rs70}; +{ +mul.f16x2 r1018, r994, r1026; +} +{ +sub.f16x2 r1021, r1015, r1018; +} +{ +mul.f16x2 r1024, r958, r1026; +} +{ +fma.rn.f16x2 r1027, r994, r1029, r1024; +} +mov.b32 r1045, {rs71, rs71}; +{ +mul.f16x2 
r1031, r976, r1045; +} +mov.b32 r1042, {rs72, rs72}; +{ +mul.f16x2 r1034, r1012, r1042; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r976, r1042; +} +{ +fma.rn.f16x2 r1043, r1012, r1045, r1040; +} +{ +add.f16x2 r1047, r850, r934; +} +{ +add.f16x2 r1050, r856, r940; +} +{ +sub.f16x2 r1053, r850, r934; +} +{ +sub.f16x2 r1056, r856, r940; +} +{ +add.f16x2 r1059, r874, r1021; +} +{ +add.f16x2 r1062, r910, r1027; +} +{ +sub.f16x2 r1065, r874, r1021; +} +{ +sub.f16x2 r1068, r910, r1027; +} +{ +add.f16x2 r1071, r892, r1037; +} +{ +add.f16x2 r1074, r928, r1043; +} +{ +sub.f16x2 r1077, r892, r1037; +} +{ +sub.f16x2 r1080, r928, r1043; +} +{ +cvt.rn.f16.f64 rs79, fd116; +} +{ +cvt.rn.f16.f64 rs80, fd120; +} +{ +cvt.rn.f16.f64 rs81, fd120; +} +{ +cvt.rn.f16.f64 rs82, fd116; +} +{ +cvt.rn.f16.f64 rs85, fd115; +} +{ +cvt.rn.f16.f64 rs86, fd116; +} +{ +cvt.rn.f16.f64 rs87, fd119; +} +{ +cvt.rn.f16.f64 rs88, fd120; +} +mov.b32 r1097, {rs79, rs79}; +{ +mul.f16x2 r1083, r1059, r1097; +} +mov.b32 r1094, {rs80, rs80}; +{ +mul.f16x2 r1086, r1062, r1094; +} +{ +sub.f16x2 r1089, r1083, r1086; +} +{ +mul.f16x2 r1092, r1059, r1094; +} +{ +fma.rn.f16x2 r1095, r1062, r1097, r1092; +} +mov.b32 r1113, {rs81, rs81}; +{ +mul.f16x2 r1099, r1071, r1113; +} +mov.b32 r1110, {rs82, rs82}; +{ +mul.f16x2 r1102, r1074, r1110; +} +{ +sub.f16x2 r1105, r1099, r1102; +} +{ +mul.f16x2 r1108, r1071, r1110; +} +{ +fma.rn.f16x2 r1111, r1074, r1113, r1108; +} +{ +neg.f16x2 r1115, r1056; +} +mov.b32 r1131, {rs85, rs85}; +{ +mul.f16x2 r1117, r1065, r1131; +} +mov.b32 r1128, {rs86, rs86}; +{ +mul.f16x2 r1120, r1068, r1128; +} +{ +sub.f16x2 r1123, r1117, r1120; +} +{ +mul.f16x2 r1126, r1065, r1128; +} +{ +fma.rn.f16x2 r1129, r1068, r1131, r1126; +} +mov.b32 r1147, {rs87, rs87}; +{ +mul.f16x2 r1133, r1077, r1147; +} +mov.b32 r1144, {rs88, rs88}; +{ +mul.f16x2 r1136, r1080, r1144; +} +{ +sub.f16x2 r1139, r1133, r1136; +} +{ +mul.f16x2 r1142, r1077, r1144; +} +{ +fma.rn.f16x2 r1145, r1080, r1147, 
r1142; +} +{ +add.f16x2 r1149, r811, r1047; +} +{ +add.f16x2 r1152, r814, r1050; +} +{ +sub.f16x2 r1155, r811, r1047; +} +{ +sub.f16x2 r1158, r814, r1050; +} +{ +add.f16x2 r1161, r823, r1089; +} +{ +add.f16x2 r1164, r826, r1095; +} +{ +sub.f16x2 r1167, r823, r1089; +} +{ +sub.f16x2 r1170, r826, r1095; +} +{ +add.f16x2 r1173, r835, r1105; +} +{ +add.f16x2 r1176, r838, r1111; +} +{ +sub.f16x2 r1179, r835, r1105; +} +{ +sub.f16x2 r1182, r838, r1111; +} +{ +add.f16x2 r1185, r817, r1115; +} +{ +add.f16x2 r1188, r820, r1053; +} +{ +sub.f16x2 r1191, r817, r1115; +} +{ +sub.f16x2 r1194, r820, r1053; +} +{ +add.f16x2 r1197, r829, r1123; +} +{ +add.f16x2 r1200, r832, r1129; +} +{ +sub.f16x2 r1203, r829, r1123; +} +{ +sub.f16x2 r1206, r832, r1129; +} +{ +add.f16x2 r1209, r841, r1139; +} +{ +add.f16x2 r1212, r844, r1145; +} +{ +sub.f16x2 r1215, r841, r1139; +} +{ +sub.f16x2 r1218, r844, r1145; +} +mov.f64 fd114, 0d3FEEE8DD4748BF15; +{ +cvt.rn.f16.f64 rs101, fd114; +} +mov.f64 fd122, 0d3FD0907DC1930690; +{ +cvt.rn.f16.f64 rs102, fd122; +} +{ +cvt.rn.f16.f64 rs103, fd116; +} +{ +cvt.rn.f16.f64 rs104, fd120; +} +mov.f64 fd118, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs105, fd118; +} +{ +cvt.rn.f16.f64 rs106, fd118; +} +{ +cvt.rn.f16.f64 rs107, fd120; +} +{ +cvt.rn.f16.f64 rs108, fd116; +} +{ +cvt.rn.f16.f64 rs109, fd122; +} +{ +cvt.rn.f16.f64 rs110, fd114; +} +mov.f64 fd113, 0dBFD0907DC1930690; +{ +cvt.rn.f16.f64 rs113, fd113; +} +{ +cvt.rn.f16.f64 rs114, fd114; +} +{ +cvt.rn.f16.f64 rs115, fd115; +} +{ +cvt.rn.f16.f64 rs116, fd116; +} +mov.f64 fd117, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs117, fd117; +} +{ +cvt.rn.f16.f64 rs118, fd118; +} +{ +cvt.rn.f16.f64 rs119, fd119; +} +{ +cvt.rn.f16.f64 rs120, fd120; +} +mov.f64 fd121, 0dBFEEE8DD4748BF15; +{ +cvt.rn.f16.f64 rs121, fd121; +} +{ +cvt.rn.f16.f64 rs122, fd122; +} +mov.b32 r1235, {rs101, rs101}; +{ +mul.f16x2 r1221, r1161, r1235; +} +mov.b32 r1232, {rs102, rs102}; +{ +mul.f16x2 r1224, r1164, r1232; +} +{ +sub.f16x2 r1227, 
r1221, r1224; +} +{ +mul.f16x2 r1230, r1161, r1232; +} +{ +fma.rn.f16x2 r1233, r1164, r1235, r1230; +} +mov.b32 r1251, {rs103, rs103}; +{ +mul.f16x2 r1237, r1173, r1251; +} +mov.b32 r1248, {rs104, rs104}; +{ +mul.f16x2 r1240, r1176, r1248; +} +{ +sub.f16x2 r1243, r1237, r1240; +} +{ +mul.f16x2 r1246, r1173, r1248; +} +{ +fma.rn.f16x2 r1249, r1176, r1251, r1246; +} +mov.b32 r1267, {rs105, rs105}; +{ +mul.f16x2 r1253, r1185, r1267; +} +mov.b32 r1264, {rs106, rs106}; +{ +mul.f16x2 r1256, r1188, r1264; +} +{ +sub.f16x2 r1259, r1253, r1256; +} +{ +mul.f16x2 r1262, r1185, r1264; +} +{ +fma.rn.f16x2 r1265, r1188, r1267, r1262; +} +mov.b32 r1283, {rs107, rs107}; +{ +mul.f16x2 r1269, r1197, r1283; +} +mov.b32 r1280, {rs108, rs108}; +{ +mul.f16x2 r1272, r1200, r1280; +} +{ +sub.f16x2 r1275, r1269, r1272; +} +{ +mul.f16x2 r1278, r1197, r1280; +} +{ +fma.rn.f16x2 r1281, r1200, r1283, r1278; +} +mov.b32 r1299, {rs109, rs109}; +{ +mul.f16x2 r1285, r1209, r1299; +} +mov.b32 r1296, {rs110, rs110}; +{ +mul.f16x2 r1288, r1212, r1296; +} +{ +sub.f16x2 r1291, r1285, r1288; +} +{ +mul.f16x2 r1294, r1209, r1296; +} +{ +fma.rn.f16x2 r1297, r1212, r1299, r1294; +} +{ +neg.f16x2 r1301, r1158; +} +mov.b32 r1317, {rs113, rs113}; +{ +mul.f16x2 r1303, r1167, r1317; +} +mov.b32 r1314, {rs114, rs114}; +{ +mul.f16x2 r1306, r1170, r1314; +} +{ +sub.f16x2 r1309, r1303, r1306; +} +{ +mul.f16x2 r1312, r1167, r1314; +} +{ +fma.rn.f16x2 r1315, r1170, r1317, r1312; +} +mov.b32 r1333, {rs115, rs115}; +{ +mul.f16x2 r1319, r1179, r1333; +} +mov.b32 r1330, {rs116, rs116}; +{ +mul.f16x2 r1322, r1182, r1330; +} +{ +sub.f16x2 r1325, r1319, r1322; +} +{ +mul.f16x2 r1328, r1179, r1330; +} +{ +fma.rn.f16x2 r1331, r1182, r1333, r1328; +} +mov.b32 r1349, {rs117, rs117}; +{ +mul.f16x2 r1335, r1191, r1349; +} +mov.b32 r1346, {rs118, rs118}; +{ +mul.f16x2 r1338, r1194, r1346; +} +{ +sub.f16x2 r1341, r1335, r1338; +} +{ +mul.f16x2 r1344, r1191, r1346; +} +{ +fma.rn.f16x2 r1347, r1194, r1349, r1344; +} +mov.b32 r1365, 
{rs119, rs119}; +{ +mul.f16x2 r1351, r1203, r1365; +} +mov.b32 r1362, {rs120, rs120}; +{ +mul.f16x2 r1354, r1206, r1362; +} +{ +sub.f16x2 r1357, r1351, r1354; +} +{ +mul.f16x2 r1360, r1203, r1362; +} +{ +fma.rn.f16x2 r1363, r1206, r1365, r1360; +} +mov.b32 r1381, {rs121, rs121}; +{ +mul.f16x2 r1367, r1215, r1381; +} +mov.b32 r1378, {rs122, rs122}; +{ +mul.f16x2 r1370, r1218, r1378; +} +{ +sub.f16x2 r1373, r1367, r1370; +} +{ +mul.f16x2 r1376, r1215, r1378; +} +{ +fma.rn.f16x2 r1379, r1218, r1381, r1376; +} +{ +add.f16x2 %0, r539, r1149; +} +{ +add.f16x2 %1, r542, r1152; +} +{ +sub.f16x2 %24, r539, r1149; +} +{ +sub.f16x2 %25, r542, r1152; +} +{ +add.f16x2 %2, r551, r1227; +} +{ +add.f16x2 %3, r554, r1233; +} +{ +sub.f16x2 %26, r551, r1227; +} +{ +sub.f16x2 %27, r554, r1233; +} +{ +add.f16x2 %4, r563, r1243; +} +{ +add.f16x2 %5, r566, r1249; +} +{ +sub.f16x2 %28, r563, r1243; +} +{ +sub.f16x2 %29, r566, r1249; +} +{ +add.f16x2 %6, r575, r1259; +} +{ +add.f16x2 %7, r578, r1265; +} +{ +sub.f16x2 %30, r575, r1259; +} +{ +sub.f16x2 %31, r578, r1265; +} +{ +add.f16x2 %8, r587, r1275; +} +{ +add.f16x2 %9, r590, r1281; +} +{ +sub.f16x2 %32, r587, r1275; +} +{ +sub.f16x2 %33, r590, r1281; +} +{ +add.f16x2 %10, r599, r1291; +} +{ +add.f16x2 %11, r602, r1297; +} +{ +sub.f16x2 %34, r599, r1291; +} +{ +sub.f16x2 %35, r602, r1297; +} +{ +add.f16x2 %12, r545, r1301; +} +{ +add.f16x2 %13, r548, r1155; +} +{ +sub.f16x2 %36, r545, r1301; +} +{ +sub.f16x2 %37, r548, r1155; +} +{ +add.f16x2 %14, r557, r1309; +} +{ +add.f16x2 %15, r560, r1315; +} +{ +sub.f16x2 %38, r557, r1309; +} +{ +sub.f16x2 %39, r560, r1315; +} +{ +add.f16x2 %16, r569, r1325; +} +{ +add.f16x2 %17, r572, r1331; +} +{ +sub.f16x2 %40, r569, r1325; +} +{ +sub.f16x2 %41, r572, r1331; +} +{ +add.f16x2 %18, r581, r1341; +} +{ +add.f16x2 %19, r584, r1347; +} +{ +sub.f16x2 %42, r581, r1341; +} +{ +sub.f16x2 %43, r584, r1347; +} +{ +add.f16x2 %20, r593, r1357; +} +{ +add.f16x2 %21, r596, r1363; +} +{ +sub.f16x2 %44, r593, 
r1357; +} +{ +sub.f16x2 %45, r596, r1363; +} +{ +add.f16x2 %22, r605, r1373; +} +{ +add.f16x2 %23, r608, r1379; +} +{ +sub.f16x2 %46, r605, r1373; +} +{ +sub.f16x2 %47, r608, r1379; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)): "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].y)), 
"r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..e6b69a550b787 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_fwd.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_24_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_24_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<10, float, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<498>; +.reg .b64 rd<2>; +add.f32 f97, %69, %90; +add.f32 f98, %48, f97; +add.f32 f99, %71, %92; +add.f32 f100, %49, f99; +mul.f32 f101, f97, 0f3F000000; +sub.f32 f102, %48, f101; +sub.f32 f103, %71, %92; +mul.f32 f104, f103, 0f3F5DB3D7; +add.f32 f105, f104, f102; +sub.f32 f106, f102, f104; +mul.f32 f107, f99, 0f3F000000; +sub.f32 f108, %49, f107; +sub.f32 f109, %69, %90; +mul.f32 f110, f109, 0f3F5DB3D7; +sub.f32 f111, f108, f110; +add.f32 f112, f110, f108; +add.f32 f113, %80, %101; +add.f32 f114, %58, f113; +add.f32 f115, %81, %103; +add.f32 f116, %60, f115; +mul.f32 f117, f113, 0f3F000000; +sub.f32 f118, %58, f117; +sub.f32 f119, %81, %103; +mul.f32 f120, f119, 0f3F5DB3D7; +add.f32 f121, f120, f118; +sub.f32 f122, f118, f120; +mul.f32 f123, f115, 0f3F000000; +sub.f32 f124, %60, f123; +sub.f32 f125, %80, %101; +mul.f32 f126, f125, 0f3F5DB3D7; +sub.f32 f127, f124, f126; +add.f32 f128, f126, f124; +mul.f32 f129, f121, 0f3F000000; +mul.f32 f130, f127, 0fBF5DB3D7; +sub.f32 f131, f129, f130; +mul.f32 f132, f127, 0f3F000000; +fma.rn.f32 f133, f121, 0fBF5DB3D7, f132; +mul.f32 f134, f122, 0fBF000000; +mul.f32 f135, f128, 0fBF5DB3D7; +sub.f32 f136, f134, f135; +mul.f32 f137, f128, 0fBF000000; +fma.rn.f32 f138, f122, 0fBF5DB3D7, f137; +add.f32 f139, f98, f114; +add.f32 f140, f100, f116; +sub.f32 f141, f98, f114; +sub.f32 f142, f100, f116; +add.f32 f143, f105, f131; +add.f32 f144, f111, f133; +sub.f32 f145, f105, f131; +sub.f32 f146, f111, f133; +add.f32 f147, f106, f136; +add.f32 f148, f112, f138; +sub.f32 f149, f106, f136; +sub.f32 f150, f112, f138; +add.f32 f151, %74, %96; +add.f32 f152, %53, f151; +add.f32 f153, %76, %97; +add.f32 f154, %55, f153; +mul.f32 f155, f151, 0f3F000000; +sub.f32 f156, %53, f155; +sub.f32 f157, %76, %97; +mul.f32 f158, f157, 0f3F5DB3D7; +add.f32 f159, f158, f156; +sub.f32 f160, f156, f158; +mul.f32 f161, f153, 0f3F000000; +sub.f32 f162, %55, f161; +sub.f32 f163, %74, %96; +mul.f32 f164, f163, 
0f3F5DB3D7; +sub.f32 f165, f162, f164; +add.f32 f166, f164, f162; +add.f32 f167, %85, %106; +add.f32 f168, %64, f167; +add.f32 f169, %87, %108; +add.f32 f170, %65, f169; +mul.f32 f171, f167, 0f3F000000; +sub.f32 f172, %64, f171; +sub.f32 f173, %87, %108; +mul.f32 f174, f173, 0f3F5DB3D7; +add.f32 f175, f174, f172; +sub.f32 f176, f172, f174; +mul.f32 f177, f169, 0f3F000000; +sub.f32 f178, %65, f177; +sub.f32 f179, %85, %106; +mul.f32 f180, f179, 0f3F5DB3D7; +sub.f32 f181, f178, f180; +add.f32 f182, f180, f178; +mul.f32 f183, f175, 0f3F000000; +mul.f32 f184, f181, 0fBF5DB3D7; +sub.f32 f185, f183, f184; +mul.f32 f186, f181, 0f3F000000; +fma.rn.f32 f187, f175, 0fBF5DB3D7, f186; +mul.f32 f188, f176, 0fBF000000; +mul.f32 f189, f182, 0fBF5DB3D7; +sub.f32 f190, f188, f189; +mul.f32 f191, f182, 0fBF000000; +fma.rn.f32 f192, f176, 0fBF5DB3D7, f191; +add.f32 f193, f152, f168; +add.f32 f194, f154, f170; +sub.f32 f195, f152, f168; +sub.f32 f196, f154, f170; +add.f32 f197, f159, f185; +add.f32 f198, f165, f187; +sub.f32 f199, f159, f185; +sub.f32 f200, f165, f187; +add.f32 f201, f160, f190; +add.f32 f202, f166, f192; +sub.f32 f203, f160, f190; +sub.f32 f204, f166, f192; +mul.f32 f205, f197, 0f3F5DB3D7; +mul.f32 f206, f198, 0fBF000000; +sub.f32 f207, f205, f206; +mul.f32 f208, f198, 0f3F5DB3D7; +fma.rn.f32 f209, f197, 0fBF000000, f208; +mul.f32 f210, f201, 0f3F000000; +mul.f32 f211, f202, 0fBF5DB3D7; +sub.f32 f212, f210, f211; +mul.f32 f213, f202, 0f3F000000; +fma.rn.f32 f214, f201, 0fBF5DB3D7, f213; +mul.f32 f215, f199, 0fBF000000; +mul.f32 f216, f200, 0fBF5DB3D7; +sub.f32 f217, f215, f216; +mul.f32 f218, f200, 0fBF000000; +fma.rn.f32 f219, f199, 0fBF5DB3D7, f218; +mul.f32 f220, f203, 0fBF5DB3D7; +mul.f32 f221, f204, 0fBF000000; +sub.f32 f222, f220, f221; +mul.f32 f223, f204, 0fBF5DB3D7; +fma.rn.f32 f224, f203, 0fBF000000, f223; +add.f32 f225, f139, f193; +add.f32 f226, f140, f194; +sub.f32 f227, f139, f193; +sub.f32 f228, f140, f194; +add.f32 f229, f143, f207; +add.f32 f230, 
f144, f209; +sub.f32 f231, f143, f207; +sub.f32 f232, f144, f209; +add.f32 f233, f147, f212; +add.f32 f234, f148, f214; +sub.f32 f235, f147, f212; +sub.f32 f236, f148, f214; +add.f32 f237, f141, f196; +sub.f32 f238, f142, f195; +sub.f32 f239, f141, f196; +add.f32 f240, f142, f195; +add.f32 f241, f145, f217; +add.f32 f242, f146, f219; +sub.f32 f243, f145, f217; +sub.f32 f244, f146, f219; +add.f32 f245, f149, f222; +add.f32 f246, f150, f224; +sub.f32 f247, f149, f222; +sub.f32 f248, f150, f224; +add.f32 f249, %72, %93; +add.f32 f250, %50, f249; +add.f32 f251, %73, %95; +add.f32 f252, %52, f251; +mul.f32 f253, f249, 0f3F000000; +sub.f32 f254, %50, f253; +sub.f32 f255, %73, %95; +mul.f32 f256, f255, 0f3F5DB3D7; +add.f32 f257, f256, f254; +sub.f32 f258, f254, f256; +mul.f32 f259, f251, 0f3F000000; +sub.f32 f260, %52, f259; +sub.f32 f261, %72, %93; +mul.f32 f262, f261, 0f3F5DB3D7; +sub.f32 f263, f260, f262; +add.f32 f264, f262, f260; +add.f32 f265, %82, %104; +add.f32 f266, %61, f265; +add.f32 f267, %84, %105; +add.f32 f268, %63, f267; +mul.f32 f269, f265, 0f3F000000; +sub.f32 f270, %61, f269; +sub.f32 f271, %84, %105; +mul.f32 f272, f271, 0f3F5DB3D7; +add.f32 f273, f272, f270; +sub.f32 f274, f270, f272; +mul.f32 f275, f267, 0f3F000000; +sub.f32 f276, %63, f275; +sub.f32 f277, %82, %104; +mul.f32 f278, f277, 0f3F5DB3D7; +sub.f32 f279, f276, f278; +add.f32 f280, f278, f276; +mul.f32 f281, f273, 0f3F000000; +mul.f32 f282, f279, 0fBF5DB3D7; +sub.f32 f283, f281, f282; +mul.f32 f284, f279, 0f3F000000; +fma.rn.f32 f285, f273, 0fBF5DB3D7, f284; +mul.f32 f286, f274, 0fBF000000; +mul.f32 f287, f280, 0fBF5DB3D7; +sub.f32 f288, f286, f287; +mul.f32 f289, f280, 0fBF000000; +fma.rn.f32 f290, f274, 0fBF5DB3D7, f289; +add.f32 f291, f250, f266; +add.f32 f292, f252, f268; +sub.f32 f293, f250, f266; +sub.f32 f294, f252, f268; +add.f32 f295, f257, f283; +add.f32 f296, f263, f285; +sub.f32 f297, f257, f283; +sub.f32 f298, f263, f285; +add.f32 f299, f258, f288; +add.f32 f300, f264, f290; 
+sub.f32 f301, f258, f288; +sub.f32 f302, f264, f290; +add.f32 f303, %77, %98; +add.f32 f304, %56, f303; +add.f32 f305, %79, %100; +add.f32 f306, %57, f305; +mul.f32 f307, f303, 0f3F000000; +sub.f32 f308, %56, f307; +sub.f32 f309, %79, %100; +mul.f32 f310, f309, 0f3F5DB3D7; +add.f32 f311, f310, f308; +sub.f32 f312, f308, f310; +mul.f32 f313, f305, 0f3F000000; +sub.f32 f314, %57, f313; +sub.f32 f315, %77, %98; +mul.f32 f316, f315, 0f3F5DB3D7; +sub.f32 f317, f314, f316; +add.f32 f318, f316, f314; +add.f32 f319, %88, %109; +add.f32 f320, %66, f319; +add.f32 f321, %89, %110; +add.f32 f322, %68, f321; +mul.f32 f323, f319, 0f3F000000; +sub.f32 f324, %66, f323; +sub.f32 f325, %89, %110; +mul.f32 f326, f325, 0f3F5DB3D7; +add.f32 f327, f326, f324; +sub.f32 f328, f324, f326; +mul.f32 f329, f321, 0f3F000000; +sub.f32 f330, %68, f329; +sub.f32 f331, %88, %109; +mul.f32 f332, f331, 0f3F5DB3D7; +sub.f32 f333, f330, f332; +add.f32 f334, f332, f330; +mul.f32 f335, f327, 0f3F000000; +mul.f32 f336, f333, 0fBF5DB3D7; +sub.f32 f337, f335, f336; +mul.f32 f338, f333, 0f3F000000; +fma.rn.f32 f339, f327, 0fBF5DB3D7, f338; +mul.f32 f340, f328, 0fBF000000; +mul.f32 f341, f334, 0fBF5DB3D7; +sub.f32 f342, f340, f341; +mul.f32 f343, f334, 0fBF000000; +fma.rn.f32 f344, f328, 0fBF5DB3D7, f343; +add.f32 f345, f304, f320; +add.f32 f346, f306, f322; +sub.f32 f347, f304, f320; +sub.f32 f348, f306, f322; +add.f32 f349, f311, f337; +add.f32 f350, f317, f339; +sub.f32 f351, f311, f337; +sub.f32 f352, f317, f339; +add.f32 f353, f312, f342; +add.f32 f354, f318, f344; +sub.f32 f355, f312, f342; +sub.f32 f356, f318, f344; +mul.f32 f357, f349, 0f3F5DB3D7; +mul.f32 f358, f350, 0fBF000000; +sub.f32 f359, f357, f358; +mul.f32 f360, f350, 0f3F5DB3D7; +fma.rn.f32 f361, f349, 0fBF000000, f360; +mul.f32 f362, f353, 0f3F000000; +mul.f32 f363, f354, 0fBF5DB3D7; +sub.f32 f364, f362, f363; +mul.f32 f365, f354, 0f3F000000; +fma.rn.f32 f366, f353, 0fBF5DB3D7, f365; +mul.f32 f367, f351, 0fBF000000; +mul.f32 f368, f352, 
0fBF5DB3D7; +sub.f32 f369, f367, f368; +mul.f32 f370, f352, 0fBF000000; +fma.rn.f32 f371, f351, 0fBF5DB3D7, f370; +mul.f32 f372, f355, 0fBF5DB3D7; +mul.f32 f373, f356, 0fBF000000; +sub.f32 f374, f372, f373; +mul.f32 f375, f356, 0fBF5DB3D7; +fma.rn.f32 f376, f355, 0fBF000000, f375; +add.f32 f377, f291, f345; +add.f32 f378, f292, f346; +sub.f32 f379, f291, f345; +sub.f32 f380, f292, f346; +add.f32 f381, f295, f359; +add.f32 f382, f296, f361; +sub.f32 f383, f295, f359; +sub.f32 f384, f296, f361; +add.f32 f385, f299, f364; +add.f32 f386, f300, f366; +sub.f32 f387, f299, f364; +sub.f32 f388, f300, f366; +add.f32 f389, f293, f348; +sub.f32 f390, f294, f347; +sub.f32 f391, f293, f348; +add.f32 f392, f294, f347; +add.f32 f393, f297, f369; +add.f32 f394, f298, f371; +sub.f32 f395, f297, f369; +sub.f32 f396, f298, f371; +add.f32 f397, f301, f374; +add.f32 f398, f302, f376; +sub.f32 f399, f301, f374; +sub.f32 f400, f302, f376; +mul.f32 f401, f381, 0f3F7746EA; +mul.f32 f402, f382, 0fBE8483EE; +sub.f32 f403, f401, f402; +mul.f32 f404, f382, 0f3F7746EA; +fma.rn.f32 f405, f381, 0fBE8483EE, f404; +mul.f32 f406, f385, 0f3F5DB3D7; +mul.f32 f407, f386, 0fBF000000; +sub.f32 f408, f406, f407; +mul.f32 f409, f386, 0f3F5DB3D7; +fma.rn.f32 f410, f385, 0fBF000000, f409; +mul.f32 f411, f389, 0f3F3504F3; +mul.f32 f412, f390, 0fBF3504F3; +sub.f32 f413, f411, f412; +mul.f32 f414, f390, 0f3F3504F3; +fma.rn.f32 f415, f389, 0fBF3504F3, f414; +mul.f32 f416, f393, 0f3F000000; +mul.f32 f417, f394, 0fBF5DB3D7; +sub.f32 f418, f416, f417; +mul.f32 f419, f394, 0f3F000000; +fma.rn.f32 f420, f393, 0fBF5DB3D7, f419; +mul.f32 f421, f397, 0f3E8483EE; +mul.f32 f422, f398, 0fBF7746EA; +sub.f32 f423, f421, f422; +mul.f32 f424, f398, 0f3E8483EE; +fma.rn.f32 f425, f397, 0fBF7746EA, f424; +mul.f32 f426, f383, 0fBE8483EE; +mul.f32 f427, f384, 0fBF7746EA; +sub.f32 f428, f426, f427; +mul.f32 f429, f384, 0fBE8483EE; +fma.rn.f32 f430, f383, 0fBF7746EA, f429; +mul.f32 f431, f387, 0fBF000000; +mul.f32 f432, f388, 
0fBF5DB3D7; +sub.f32 f433, f431, f432; +mul.f32 f434, f388, 0fBF000000; +fma.rn.f32 f435, f387, 0fBF5DB3D7, f434; +mul.f32 f436, f391, 0fBF3504F3; +mul.f32 f437, f392, 0fBF3504F3; +sub.f32 f438, f436, f437; +add.f32 f439, f436, f437; +mul.f32 f440, f395, 0fBF5DB3D7; +mul.f32 f441, f396, 0fBF000000; +sub.f32 f442, f440, f441; +mul.f32 f443, f396, 0fBF5DB3D7; +fma.rn.f32 f444, f395, 0fBF000000, f443; +mul.f32 f445, f399, 0fBF7746EA; +mul.f32 f446, f400, 0fBE8483EE; +sub.f32 f447, f445, f446; +mul.f32 f448, f400, 0fBF7746EA; +fma.rn.f32 f449, f399, 0fBE8483EE, f448; +add.f32 %1, f226, f378; +add.f32 %0, f225, f377; +add.f32 %3, f230, f405; +add.f32 %2, f229, f403; +add.f32 %5, f234, f410; +add.f32 %4, f233, f408; +add.f32 %7, f238, f415; +add.f32 %6, f237, f413; +add.f32 %9, f242, f420; +add.f32 %8, f241, f418; +add.f32 %11, f246, f425; +add.f32 %10, f245, f423; +sub.f32 %13, f228, f379; +add.f32 %12, f227, f380; +add.f32 %15, f232, f430; +add.f32 %14, f231, f428; +add.f32 %17, f236, f435; +add.f32 %16, f235, f433; +add.f32 %19, f240, f439; +add.f32 %18, f239, f438; +add.f32 %21, f244, f444; +add.f32 %20, f243, f442; +add.f32 %23, f248, f449; +add.f32 %22, f247, f447; +sub.f32 %25, f226, f378; +sub.f32 %24, f225, f377; +sub.f32 %27, f230, f405; +sub.f32 %26, f229, f403; +sub.f32 %29, f234, f410; +sub.f32 %28, f233, f408; +sub.f32 %31, f238, f415; +sub.f32 %30, f237, f413; +sub.f32 %33, f242, f420; +sub.f32 %32, f241, f418; +sub.f32 %35, f246, f425; +sub.f32 %34, f245, f423; +add.f32 %37, f228, f379; +sub.f32 %36, f227, f380; +sub.f32 %39, f232, f430; +sub.f32 %38, f231, f428; +sub.f32 %41, f236, f435; +sub.f32 %40, f235, f433; +sub.f32 %43, f240, f439; +sub.f32 %42, f239, f438; +sub.f32 %45, f244, f444; +sub.f32 %44, f243, f442; +sub.f32 %47, f248, f449; +sub.f32 %46, f247, f447; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), 
"=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..2b1fe2366c8ea --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp32_inv.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_24_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_24_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<212, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<498>; +.reg .b64 rd<2>; +add.f32 f97, %69, %90; +add.f32 f98, %48, f97; +add.f32 f99, %71, %92; +add.f32 f100, %49, f99; +mul.f32 f101, f97, 0f3F000000; +sub.f32 f102, %48, f101; +sub.f32 f103, %71, %92; +mul.f32 f104, f103, 0fBF5DB3D7; +add.f32 f105, f104, f102; +sub.f32 f106, f102, f104; +mul.f32 f107, f99, 0f3F000000; +sub.f32 f108, %49, f107; +sub.f32 f109, %69, %90; +mul.f32 f110, f109, 0fBF5DB3D7; +sub.f32 f111, f108, f110; +add.f32 f112, f110, f108; +add.f32 f113, %80, %101; +add.f32 f114, %58, f113; +add.f32 f115, %81, %103; +add.f32 f116, %60, f115; +mul.f32 f117, f113, 0f3F000000; +sub.f32 f118, %58, f117; +sub.f32 f119, %81, %103; +mul.f32 f120, f119, 0fBF5DB3D7; +add.f32 f121, f120, f118; +sub.f32 f122, f118, f120; +mul.f32 f123, f115, 0f3F000000; +sub.f32 f124, %60, f123; +sub.f32 f125, %80, %101; +mul.f32 f126, f125, 0fBF5DB3D7; +sub.f32 f127, f124, f126; +add.f32 f128, f126, f124; +mul.f32 f129, f121, 0f3F000000; +mul.f32 f130, f127, 0f3F5DB3D7; +sub.f32 f131, f129, f130; +mul.f32 f132, f127, 0f3F000000; +fma.rn.f32 f133, f121, 0f3F5DB3D7, f132; +mul.f32 f134, f122, 0fBF000000; +mul.f32 f135, f128, 0f3F5DB3D7; +sub.f32 f136, f134, f135; +mul.f32 f137, f128, 0fBF000000; +fma.rn.f32 f138, f122, 0f3F5DB3D7, f137; +add.f32 f139, f98, f114; +add.f32 f140, f100, f116; +sub.f32 f141, f98, f114; +sub.f32 f142, f100, f116; +add.f32 f143, f105, f131; +add.f32 f144, f111, 
f133; +sub.f32 f145, f105, f131; +sub.f32 f146, f111, f133; +add.f32 f147, f106, f136; +add.f32 f148, f112, f138; +sub.f32 f149, f106, f136; +sub.f32 f150, f112, f138; +add.f32 f151, %74, %96; +add.f32 f152, %53, f151; +add.f32 f153, %76, %97; +add.f32 f154, %55, f153; +mul.f32 f155, f151, 0f3F000000; +sub.f32 f156, %53, f155; +sub.f32 f157, %76, %97; +mul.f32 f158, f157, 0fBF5DB3D7; +add.f32 f159, f158, f156; +sub.f32 f160, f156, f158; +mul.f32 f161, f153, 0f3F000000; +sub.f32 f162, %55, f161; +sub.f32 f163, %74, %96; +mul.f32 f164, f163, 0fBF5DB3D7; +sub.f32 f165, f162, f164; +add.f32 f166, f164, f162; +add.f32 f167, %85, %106; +add.f32 f168, %64, f167; +add.f32 f169, %87, %108; +add.f32 f170, %65, f169; +mul.f32 f171, f167, 0f3F000000; +sub.f32 f172, %64, f171; +sub.f32 f173, %87, %108; +mul.f32 f174, f173, 0fBF5DB3D7; +add.f32 f175, f174, f172; +sub.f32 f176, f172, f174; +mul.f32 f177, f169, 0f3F000000; +sub.f32 f178, %65, f177; +sub.f32 f179, %85, %106; +mul.f32 f180, f179, 0fBF5DB3D7; +sub.f32 f181, f178, f180; +add.f32 f182, f180, f178; +mul.f32 f183, f175, 0f3F000000; +mul.f32 f184, f181, 0f3F5DB3D7; +sub.f32 f185, f183, f184; +mul.f32 f186, f181, 0f3F000000; +fma.rn.f32 f187, f175, 0f3F5DB3D7, f186; +mul.f32 f188, f176, 0fBF000000; +mul.f32 f189, f182, 0f3F5DB3D7; +sub.f32 f190, f188, f189; +mul.f32 f191, f182, 0fBF000000; +fma.rn.f32 f192, f176, 0f3F5DB3D7, f191; +add.f32 f193, f152, f168; +add.f32 f194, f154, f170; +sub.f32 f195, f152, f168; +sub.f32 f196, f154, f170; +add.f32 f197, f159, f185; +add.f32 f198, f165, f187; +sub.f32 f199, f159, f185; +sub.f32 f200, f165, f187; +add.f32 f201, f160, f190; +add.f32 f202, f166, f192; +sub.f32 f203, f160, f190; +sub.f32 f204, f166, f192; +mul.f32 f205, f197, 0f3F5DB3D7; +mul.f32 f206, f198, 0f3F000000; +sub.f32 f207, f205, f206; +mul.f32 f208, f198, 0f3F5DB3D7; +fma.rn.f32 f209, f197, 0f3F000000, f208; +mul.f32 f210, f201, 0f3F000000; +mul.f32 f211, f202, 0f3F5DB3D7; +sub.f32 f212, f210, f211; +mul.f32 f213, 
f202, 0f3F000000; +fma.rn.f32 f214, f201, 0f3F5DB3D7, f213; +mul.f32 f215, f199, 0fBF000000; +mul.f32 f216, f200, 0f3F5DB3D7; +sub.f32 f217, f215, f216; +mul.f32 f218, f200, 0fBF000000; +fma.rn.f32 f219, f199, 0f3F5DB3D7, f218; +mul.f32 f220, f203, 0fBF5DB3D7; +mul.f32 f221, f204, 0f3F000000; +sub.f32 f222, f220, f221; +mul.f32 f223, f204, 0fBF5DB3D7; +fma.rn.f32 f224, f203, 0f3F000000, f223; +add.f32 f225, f139, f193; +add.f32 f226, f140, f194; +sub.f32 f227, f139, f193; +sub.f32 f228, f140, f194; +add.f32 f229, f143, f207; +add.f32 f230, f144, f209; +sub.f32 f231, f143, f207; +sub.f32 f232, f144, f209; +add.f32 f233, f147, f212; +add.f32 f234, f148, f214; +sub.f32 f235, f147, f212; +sub.f32 f236, f148, f214; +sub.f32 f237, f141, f196; +add.f32 f238, f142, f195; +add.f32 f239, f141, f196; +sub.f32 f240, f142, f195; +add.f32 f241, f145, f217; +add.f32 f242, f146, f219; +sub.f32 f243, f145, f217; +sub.f32 f244, f146, f219; +add.f32 f245, f149, f222; +add.f32 f246, f150, f224; +sub.f32 f247, f149, f222; +sub.f32 f248, f150, f224; +add.f32 f249, %72, %93; +add.f32 f250, %50, f249; +add.f32 f251, %73, %95; +add.f32 f252, %52, f251; +mul.f32 f253, f249, 0f3F000000; +sub.f32 f254, %50, f253; +sub.f32 f255, %73, %95; +mul.f32 f256, f255, 0fBF5DB3D7; +add.f32 f257, f256, f254; +sub.f32 f258, f254, f256; +mul.f32 f259, f251, 0f3F000000; +sub.f32 f260, %52, f259; +sub.f32 f261, %72, %93; +mul.f32 f262, f261, 0fBF5DB3D7; +sub.f32 f263, f260, f262; +add.f32 f264, f262, f260; +add.f32 f265, %82, %104; +add.f32 f266, %61, f265; +add.f32 f267, %84, %105; +add.f32 f268, %63, f267; +mul.f32 f269, f265, 0f3F000000; +sub.f32 f270, %61, f269; +sub.f32 f271, %84, %105; +mul.f32 f272, f271, 0fBF5DB3D7; +add.f32 f273, f272, f270; +sub.f32 f274, f270, f272; +mul.f32 f275, f267, 0f3F000000; +sub.f32 f276, %63, f275; +sub.f32 f277, %82, %104; +mul.f32 f278, f277, 0fBF5DB3D7; +sub.f32 f279, f276, f278; +add.f32 f280, f278, f276; +mul.f32 f281, f273, 0f3F000000; +mul.f32 f282, f279, 
0f3F5DB3D7; +sub.f32 f283, f281, f282; +mul.f32 f284, f279, 0f3F000000; +fma.rn.f32 f285, f273, 0f3F5DB3D7, f284; +mul.f32 f286, f274, 0fBF000000; +mul.f32 f287, f280, 0f3F5DB3D7; +sub.f32 f288, f286, f287; +mul.f32 f289, f280, 0fBF000000; +fma.rn.f32 f290, f274, 0f3F5DB3D7, f289; +add.f32 f291, f250, f266; +add.f32 f292, f252, f268; +sub.f32 f293, f250, f266; +sub.f32 f294, f252, f268; +add.f32 f295, f257, f283; +add.f32 f296, f263, f285; +sub.f32 f297, f257, f283; +sub.f32 f298, f263, f285; +add.f32 f299, f258, f288; +add.f32 f300, f264, f290; +sub.f32 f301, f258, f288; +sub.f32 f302, f264, f290; +add.f32 f303, %77, %98; +add.f32 f304, %56, f303; +add.f32 f305, %79, %100; +add.f32 f306, %57, f305; +mul.f32 f307, f303, 0f3F000000; +sub.f32 f308, %56, f307; +sub.f32 f309, %79, %100; +mul.f32 f310, f309, 0fBF5DB3D7; +add.f32 f311, f310, f308; +sub.f32 f312, f308, f310; +mul.f32 f313, f305, 0f3F000000; +sub.f32 f314, %57, f313; +sub.f32 f315, %77, %98; +mul.f32 f316, f315, 0fBF5DB3D7; +sub.f32 f317, f314, f316; +add.f32 f318, f316, f314; +add.f32 f319, %88, %109; +add.f32 f320, %66, f319; +add.f32 f321, %89, %110; +add.f32 f322, %68, f321; +mul.f32 f323, f319, 0f3F000000; +sub.f32 f324, %66, f323; +sub.f32 f325, %89, %110; +mul.f32 f326, f325, 0fBF5DB3D7; +add.f32 f327, f326, f324; +sub.f32 f328, f324, f326; +mul.f32 f329, f321, 0f3F000000; +sub.f32 f330, %68, f329; +sub.f32 f331, %88, %109; +mul.f32 f332, f331, 0fBF5DB3D7; +sub.f32 f333, f330, f332; +add.f32 f334, f332, f330; +mul.f32 f335, f327, 0f3F000000; +mul.f32 f336, f333, 0f3F5DB3D7; +sub.f32 f337, f335, f336; +mul.f32 f338, f333, 0f3F000000; +fma.rn.f32 f339, f327, 0f3F5DB3D7, f338; +mul.f32 f340, f328, 0fBF000000; +mul.f32 f341, f334, 0f3F5DB3D7; +sub.f32 f342, f340, f341; +mul.f32 f343, f334, 0fBF000000; +fma.rn.f32 f344, f328, 0f3F5DB3D7, f343; +add.f32 f345, f304, f320; +add.f32 f346, f306, f322; +sub.f32 f347, f304, f320; +sub.f32 f348, f306, f322; +add.f32 f349, f311, f337; +add.f32 f350, f317, f339; 
+sub.f32 f351, f311, f337; +sub.f32 f352, f317, f339; +add.f32 f353, f312, f342; +add.f32 f354, f318, f344; +sub.f32 f355, f312, f342; +sub.f32 f356, f318, f344; +mul.f32 f357, f349, 0f3F5DB3D7; +mul.f32 f358, f350, 0f3F000000; +sub.f32 f359, f357, f358; +mul.f32 f360, f350, 0f3F5DB3D7; +fma.rn.f32 f361, f349, 0f3F000000, f360; +mul.f32 f362, f353, 0f3F000000; +mul.f32 f363, f354, 0f3F5DB3D7; +sub.f32 f364, f362, f363; +mul.f32 f365, f354, 0f3F000000; +fma.rn.f32 f366, f353, 0f3F5DB3D7, f365; +mul.f32 f367, f351, 0fBF000000; +mul.f32 f368, f352, 0f3F5DB3D7; +sub.f32 f369, f367, f368; +mul.f32 f370, f352, 0fBF000000; +fma.rn.f32 f371, f351, 0f3F5DB3D7, f370; +mul.f32 f372, f355, 0fBF5DB3D7; +mul.f32 f373, f356, 0f3F000000; +sub.f32 f374, f372, f373; +mul.f32 f375, f356, 0fBF5DB3D7; +fma.rn.f32 f376, f355, 0f3F000000, f375; +add.f32 f377, f291, f345; +add.f32 f378, f292, f346; +sub.f32 f379, f291, f345; +sub.f32 f380, f292, f346; +add.f32 f381, f295, f359; +add.f32 f382, f296, f361; +sub.f32 f383, f295, f359; +sub.f32 f384, f296, f361; +add.f32 f385, f299, f364; +add.f32 f386, f300, f366; +sub.f32 f387, f299, f364; +sub.f32 f388, f300, f366; +sub.f32 f389, f293, f348; +add.f32 f390, f294, f347; +add.f32 f391, f293, f348; +sub.f32 f392, f294, f347; +add.f32 f393, f297, f369; +add.f32 f394, f298, f371; +sub.f32 f395, f297, f369; +sub.f32 f396, f298, f371; +add.f32 f397, f301, f374; +add.f32 f398, f302, f376; +sub.f32 f399, f301, f374; +sub.f32 f400, f302, f376; +mul.f32 f401, f381, 0f3F7746EA; +mul.f32 f402, f382, 0f3E8483EE; +sub.f32 f403, f401, f402; +mul.f32 f404, f382, 0f3F7746EA; +fma.rn.f32 f405, f381, 0f3E8483EE, f404; +mul.f32 f406, f385, 0f3F5DB3D7; +mul.f32 f407, f386, 0f3F000000; +sub.f32 f408, f406, f407; +mul.f32 f409, f386, 0f3F5DB3D7; +fma.rn.f32 f410, f385, 0f3F000000, f409; +mul.f32 f411, f389, 0f3F3504F3; +mul.f32 f412, f390, 0f3F3504F3; +sub.f32 f413, f411, f412; +add.f32 f414, f411, f412; +mul.f32 f415, f393, 0f3F000000; +mul.f32 f416, f394, 
0f3F5DB3D7; +sub.f32 f417, f415, f416; +mul.f32 f418, f394, 0f3F000000; +fma.rn.f32 f419, f393, 0f3F5DB3D7, f418; +mul.f32 f420, f397, 0f3E8483EE; +mul.f32 f421, f398, 0f3F7746EA; +sub.f32 f422, f420, f421; +mul.f32 f423, f398, 0f3E8483EE; +fma.rn.f32 f424, f397, 0f3F7746EA, f423; +mul.f32 f425, f383, 0fBE8483EE; +mul.f32 f426, f384, 0f3F7746EA; +sub.f32 f427, f425, f426; +mul.f32 f428, f384, 0fBE8483EE; +fma.rn.f32 f429, f383, 0f3F7746EA, f428; +mul.f32 f430, f387, 0fBF000000; +mul.f32 f431, f388, 0f3F5DB3D7; +sub.f32 f432, f430, f431; +mul.f32 f433, f388, 0fBF000000; +fma.rn.f32 f434, f387, 0f3F5DB3D7, f433; +mul.f32 f435, f391, 0fBF3504F3; +mul.f32 f436, f392, 0f3F3504F3; +sub.f32 f437, f435, f436; +mul.f32 f438, f392, 0fBF3504F3; +fma.rn.f32 f439, f391, 0f3F3504F3, f438; +mul.f32 f440, f395, 0fBF5DB3D7; +mul.f32 f441, f396, 0f3F000000; +sub.f32 f442, f440, f441; +mul.f32 f443, f396, 0fBF5DB3D7; +fma.rn.f32 f444, f395, 0f3F000000, f443; +mul.f32 f445, f399, 0fBF7746EA; +mul.f32 f446, f400, 0f3E8483EE; +sub.f32 f447, f445, f446; +mul.f32 f448, f400, 0fBF7746EA; +fma.rn.f32 f449, f399, 0f3E8483EE, f448; +add.f32 %1, f226, f378; +add.f32 %0, f225, f377; +add.f32 %3, f230, f405; +add.f32 %2, f229, f403; +add.f32 %5, f234, f410; +add.f32 %4, f233, f408; +add.f32 %7, f238, f414; +add.f32 %6, f237, f413; +add.f32 %9, f242, f419; +add.f32 %8, f241, f417; +add.f32 %11, f246, f424; +add.f32 %10, f245, f422; +add.f32 %13, f228, f379; +sub.f32 %12, f227, f380; +add.f32 %15, f232, f429; +add.f32 %14, f231, f427; +add.f32 %17, f236, f434; +add.f32 %16, f235, f432; +add.f32 %19, f240, f439; +add.f32 %18, f239, f437; +add.f32 %21, f244, f444; +add.f32 %20, f243, f442; +add.f32 %23, f248, f449; +add.f32 %22, f247, f447; +sub.f32 %25, f226, f378; +sub.f32 %24, f225, f377; +sub.f32 %27, f230, f405; +sub.f32 %26, f229, f403; +sub.f32 %29, f234, f410; +sub.f32 %28, f233, f408; +sub.f32 %31, f238, f414; +sub.f32 %30, f237, f413; +sub.f32 %33, f242, f419; +sub.f32 %32, f241, f417; 
+sub.f32 %35, f246, f424; +sub.f32 %34, f245, f422; +sub.f32 %37, f228, f379; +add.f32 %36, f227, f380; +sub.f32 %39, f232, f429; +sub.f32 %38, f231, f427; +sub.f32 %41, f236, f434; +sub.f32 %40, f235, f432; +sub.f32 %43, f240, f439; +sub.f32 %42, f239, f437; +sub.f32 %45, f244, f444; +sub.f32 %44, f243, f442; +sub.f32 %47, f248, f449; +sub.f32 %46, f247, f447; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), 
"f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..228e3bb12abce --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_fwd.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_24_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_24_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<414, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<498>; +.reg .b64 rd<2>; +add.f64 fd97, %69, %90; +add.f64 fd98, %48, fd97; +add.f64 fd99, %71, %92; +add.f64 fd100, %49, fd99; +mul.f64 fd101, fd97, 0d3FE0000000000000; +sub.f64 fd102, %48, fd101; +sub.f64 fd103, %71, %92; +mul.f64 fd104, fd103, 0d3FEBB67AE8584CAA; +add.f64 fd105, fd104, fd102; +sub.f64 fd106, fd102, fd104; +mul.f64 fd107, fd99, 0d3FE0000000000000; +sub.f64 fd108, %49, fd107; +sub.f64 fd109, %69, %90; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +sub.f64 fd111, fd108, fd110; +add.f64 fd112, fd110, fd108; +add.f64 fd113, %80, %101; +add.f64 fd114, %58, fd113; +add.f64 fd115, %81, %103; +add.f64 fd116, %60, fd115; +mul.f64 fd117, fd113, 0d3FE0000000000000; +sub.f64 fd118, %58, fd117; +sub.f64 fd119, %81, %103; +mul.f64 fd120, fd119, 0d3FEBB67AE8584CAA; +add.f64 fd121, fd120, fd118; +sub.f64 fd122, fd118, fd120; +mul.f64 fd123, fd115, 0d3FE0000000000000; +sub.f64 fd124, %60, fd123; +sub.f64 fd125, %80, %101; +mul.f64 fd126, fd125, 
0d3FEBB67AE8584CAA; +sub.f64 fd127, fd124, fd126; +add.f64 fd128, fd126, fd124; +mul.f64 fd129, fd121, 0d3FE0000000000000; +mul.f64 fd130, fd127, 0dBFEBB67AE8584CAA; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd127, 0d3FE0000000000000; +fma.rn.f64 fd133, fd121, 0dBFEBB67AE8584CAA, fd132; +mul.f64 fd134, fd122, 0dBFE0000000000000; +mul.f64 fd135, fd128, 0dBFEBB67AE8584CAA; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd128, 0dBFE0000000000000; +fma.rn.f64 fd138, fd122, 0dBFEBB67AE8584CAA, fd137; +add.f64 fd139, fd98, fd114; +add.f64 fd140, fd100, fd116; +sub.f64 fd141, fd98, fd114; +sub.f64 fd142, fd100, fd116; +add.f64 fd143, fd105, fd131; +add.f64 fd144, fd111, fd133; +sub.f64 fd145, fd105, fd131; +sub.f64 fd146, fd111, fd133; +add.f64 fd147, fd106, fd136; +add.f64 fd148, fd112, fd138; +sub.f64 fd149, fd106, fd136; +sub.f64 fd150, fd112, fd138; +add.f64 fd151, %74, %96; +add.f64 fd152, %53, fd151; +add.f64 fd153, %76, %97; +add.f64 fd154, %55, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, %53, fd155; +sub.f64 fd157, %76, %97; +mul.f64 fd158, fd157, 0d3FEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, %55, fd161; +sub.f64 fd163, %74, %96; +mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, %85, %106; +add.f64 fd168, %64, fd167; +add.f64 fd169, %87, %108; +add.f64 fd170, %65, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, %64, fd171; +sub.f64 fd173, %87, %108; +mul.f64 fd174, fd173, 0d3FEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, %65, fd177; +sub.f64 fd179, %85, %106; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0dBFEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; 
+mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0dBFEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0dBFEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0dBFEBB67AE8584CAA, fd191; +add.f64 fd193, fd152, fd168; +add.f64 fd194, fd154, fd170; +sub.f64 fd195, fd152, fd168; +sub.f64 fd196, fd154, fd170; +add.f64 fd197, fd159, fd185; +add.f64 fd198, fd165, fd187; +sub.f64 fd199, fd159, fd185; +sub.f64 fd200, fd165, fd187; +add.f64 fd201, fd160, fd190; +add.f64 fd202, fd166, fd192; +sub.f64 fd203, fd160, fd190; +sub.f64 fd204, fd166, fd192; +mul.f64 fd205, fd197, 0d3FEBB67AE8584CAA; +mul.f64 fd206, fd198, 0dBFE0000000000000; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd198, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd209, fd197, 0dBFE0000000000000, fd208; +mul.f64 fd210, fd201, 0d3FE0000000000000; +mul.f64 fd211, fd202, 0dBFEBB67AE8584CAA; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, 0d3FE0000000000000; +fma.rn.f64 fd214, fd201, 0dBFEBB67AE8584CAA, fd213; +mul.f64 fd215, fd199, 0dBFE0000000000000; +mul.f64 fd216, fd200, 0dBFEBB67AE8584CAA; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd200, 0dBFE0000000000000; +fma.rn.f64 fd219, fd199, 0dBFEBB67AE8584CAA, fd218; +mul.f64 fd220, fd203, 0dBFEBB67AE8584CAA; +mul.f64 fd221, fd204, 0dBFE0000000000000; +sub.f64 fd222, fd220, fd221; +mul.f64 fd223, fd204, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd224, fd203, 0dBFE0000000000000, fd223; +add.f64 fd225, fd139, fd193; +add.f64 fd226, fd140, fd194; +sub.f64 fd227, fd139, fd193; +sub.f64 fd228, fd140, fd194; +add.f64 fd229, fd143, fd207; +add.f64 fd230, fd144, fd209; +sub.f64 fd231, fd143, fd207; +sub.f64 fd232, fd144, fd209; +add.f64 fd233, fd147, fd212; +add.f64 fd234, fd148, fd214; +sub.f64 fd235, fd147, fd212; +sub.f64 fd236, fd148, fd214; +add.f64 fd237, fd141, fd196; +sub.f64 fd238, fd142, fd195; +sub.f64 fd239, fd141, fd196; +add.f64 fd240, fd142, fd195; 
+add.f64 fd241, fd145, fd217; +add.f64 fd242, fd146, fd219; +sub.f64 fd243, fd145, fd217; +sub.f64 fd244, fd146, fd219; +add.f64 fd245, fd149, fd222; +add.f64 fd246, fd150, fd224; +sub.f64 fd247, fd149, fd222; +sub.f64 fd248, fd150, fd224; +add.f64 fd249, %72, %93; +add.f64 fd250, %50, fd249; +add.f64 fd251, %73, %95; +add.f64 fd252, %52, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, %50, fd253; +sub.f64 fd255, %73, %95; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, %52, fd259; +sub.f64 fd261, %72, %93; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, %82, %104; +add.f64 fd266, %61, fd265; +add.f64 fd267, %84, %105; +add.f64 fd268, %63, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, %61, fd269; +sub.f64 fd271, %84, %105; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, %63, fd275; +sub.f64 fd277, %82, %104; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +mul.f64 fd281, fd273, 0d3FE0000000000000; +mul.f64 fd282, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd283, fd281, fd282; +mul.f64 fd284, fd279, 0d3FE0000000000000; +fma.rn.f64 fd285, fd273, 0dBFEBB67AE8584CAA, fd284; +mul.f64 fd286, fd274, 0dBFE0000000000000; +mul.f64 fd287, fd280, 0dBFEBB67AE8584CAA; +sub.f64 fd288, fd286, fd287; +mul.f64 fd289, fd280, 0dBFE0000000000000; +fma.rn.f64 fd290, fd274, 0dBFEBB67AE8584CAA, fd289; +add.f64 fd291, fd250, fd266; +add.f64 fd292, fd252, fd268; +sub.f64 fd293, fd250, fd266; +sub.f64 fd294, fd252, fd268; +add.f64 fd295, fd257, fd283; +add.f64 fd296, fd263, fd285; +sub.f64 fd297, fd257, fd283; +sub.f64 fd298, fd263, fd285; +add.f64 fd299, fd258, fd288; +add.f64 fd300, fd264, fd290; +sub.f64 fd301, 
fd258, fd288; +sub.f64 fd302, fd264, fd290; +add.f64 fd303, %77, %98; +add.f64 fd304, %56, fd303; +add.f64 fd305, %79, %100; +add.f64 fd306, %57, fd305; +mul.f64 fd307, fd303, 0d3FE0000000000000; +sub.f64 fd308, %56, fd307; +sub.f64 fd309, %79, %100; +mul.f64 fd310, fd309, 0d3FEBB67AE8584CAA; +add.f64 fd311, fd310, fd308; +sub.f64 fd312, fd308, fd310; +mul.f64 fd313, fd305, 0d3FE0000000000000; +sub.f64 fd314, %57, fd313; +sub.f64 fd315, %77, %98; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +sub.f64 fd317, fd314, fd316; +add.f64 fd318, fd316, fd314; +add.f64 fd319, %88, %109; +add.f64 fd320, %66, fd319; +add.f64 fd321, %89, %110; +add.f64 fd322, %68, fd321; +mul.f64 fd323, fd319, 0d3FE0000000000000; +sub.f64 fd324, %66, fd323; +sub.f64 fd325, %89, %110; +mul.f64 fd326, fd325, 0d3FEBB67AE8584CAA; +add.f64 fd327, fd326, fd324; +sub.f64 fd328, fd324, fd326; +mul.f64 fd329, fd321, 0d3FE0000000000000; +sub.f64 fd330, %68, fd329; +sub.f64 fd331, %88, %109; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +sub.f64 fd333, fd330, fd332; +add.f64 fd334, fd332, fd330; +mul.f64 fd335, fd327, 0d3FE0000000000000; +mul.f64 fd336, fd333, 0dBFEBB67AE8584CAA; +sub.f64 fd337, fd335, fd336; +mul.f64 fd338, fd333, 0d3FE0000000000000; +fma.rn.f64 fd339, fd327, 0dBFEBB67AE8584CAA, fd338; +mul.f64 fd340, fd328, 0dBFE0000000000000; +mul.f64 fd341, fd334, 0dBFEBB67AE8584CAA; +sub.f64 fd342, fd340, fd341; +mul.f64 fd343, fd334, 0dBFE0000000000000; +fma.rn.f64 fd344, fd328, 0dBFEBB67AE8584CAA, fd343; +add.f64 fd345, fd304, fd320; +add.f64 fd346, fd306, fd322; +sub.f64 fd347, fd304, fd320; +sub.f64 fd348, fd306, fd322; +add.f64 fd349, fd311, fd337; +add.f64 fd350, fd317, fd339; +sub.f64 fd351, fd311, fd337; +sub.f64 fd352, fd317, fd339; +add.f64 fd353, fd312, fd342; +add.f64 fd354, fd318, fd344; +sub.f64 fd355, fd312, fd342; +sub.f64 fd356, fd318, fd344; +mul.f64 fd357, fd349, 0d3FEBB67AE8584CAA; +mul.f64 fd358, fd350, 0dBFE0000000000000; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd350, 
0d3FEBB67AE8584CAA; +fma.rn.f64 fd361, fd349, 0dBFE0000000000000, fd360; +mul.f64 fd362, fd353, 0d3FE0000000000000; +mul.f64 fd363, fd354, 0dBFEBB67AE8584CAA; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd354, 0d3FE0000000000000; +fma.rn.f64 fd366, fd353, 0dBFEBB67AE8584CAA, fd365; +mul.f64 fd367, fd351, 0dBFE0000000000000; +mul.f64 fd368, fd352, 0dBFEBB67AE8584CAA; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd352, 0dBFE0000000000000; +fma.rn.f64 fd371, fd351, 0dBFEBB67AE8584CAA, fd370; +mul.f64 fd372, fd355, 0dBFEBB67AE8584CAA; +mul.f64 fd373, fd356, 0dBFE0000000000000; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd356, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd376, fd355, 0dBFE0000000000000, fd375; +add.f64 fd377, fd291, fd345; +add.f64 fd378, fd292, fd346; +sub.f64 fd379, fd291, fd345; +sub.f64 fd380, fd292, fd346; +add.f64 fd381, fd295, fd359; +add.f64 fd382, fd296, fd361; +sub.f64 fd383, fd295, fd359; +sub.f64 fd384, fd296, fd361; +add.f64 fd385, fd299, fd364; +add.f64 fd386, fd300, fd366; +sub.f64 fd387, fd299, fd364; +sub.f64 fd388, fd300, fd366; +add.f64 fd389, fd293, fd348; +sub.f64 fd390, fd294, fd347; +sub.f64 fd391, fd293, fd348; +add.f64 fd392, fd294, fd347; +add.f64 fd393, fd297, fd369; +add.f64 fd394, fd298, fd371; +sub.f64 fd395, fd297, fd369; +sub.f64 fd396, fd298, fd371; +add.f64 fd397, fd301, fd374; +add.f64 fd398, fd302, fd376; +sub.f64 fd399, fd301, fd374; +sub.f64 fd400, fd302, fd376; +mul.f64 fd401, fd381, 0d3FEEE8DD4748BF15; +mul.f64 fd402, fd382, 0dBFD0907DC1930690; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd382, 0d3FEEE8DD4748BF15; +fma.rn.f64 fd405, fd381, 0dBFD0907DC1930690, fd404; +mul.f64 fd406, fd385, 0d3FEBB67AE8584CAA; +mul.f64 fd407, fd386, 0dBFE0000000000000; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd386, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd410, fd385, 0dBFE0000000000000, fd409; +mul.f64 fd411, fd389, 0d3FE6A09E667F3BCD; +mul.f64 fd412, fd390, 0dBFE6A09E667F3BCD; +sub.f64 fd413, fd411, fd412; +mul.f64 fd414, fd390, 
0d3FE6A09E667F3BCD; +fma.rn.f64 fd415, fd389, 0dBFE6A09E667F3BCD, fd414; +mul.f64 fd416, fd393, 0d3FE0000000000000; +mul.f64 fd417, fd394, 0dBFEBB67AE8584CAA; +sub.f64 fd418, fd416, fd417; +mul.f64 fd419, fd394, 0d3FE0000000000000; +fma.rn.f64 fd420, fd393, 0dBFEBB67AE8584CAA, fd419; +mul.f64 fd421, fd397, 0d3FD0907DC1930690; +mul.f64 fd422, fd398, 0dBFEEE8DD4748BF15; +sub.f64 fd423, fd421, fd422; +mul.f64 fd424, fd398, 0d3FD0907DC1930690; +fma.rn.f64 fd425, fd397, 0dBFEEE8DD4748BF15, fd424; +mul.f64 fd426, fd383, 0dBFD0907DC1930690; +mul.f64 fd427, fd384, 0dBFEEE8DD4748BF15; +sub.f64 fd428, fd426, fd427; +mul.f64 fd429, fd384, 0dBFD0907DC1930690; +fma.rn.f64 fd430, fd383, 0dBFEEE8DD4748BF15, fd429; +mul.f64 fd431, fd387, 0dBFE0000000000000; +mul.f64 fd432, fd388, 0dBFEBB67AE8584CAA; +sub.f64 fd433, fd431, fd432; +mul.f64 fd434, fd388, 0dBFE0000000000000; +fma.rn.f64 fd435, fd387, 0dBFEBB67AE8584CAA, fd434; +mul.f64 fd436, fd391, 0dBFE6A09E667F3BCD; +mul.f64 fd437, fd392, 0dBFE6A09E667F3BCD; +sub.f64 fd438, fd436, fd437; +add.f64 fd439, fd436, fd437; +mul.f64 fd440, fd395, 0dBFEBB67AE8584CAA; +mul.f64 fd441, fd396, 0dBFE0000000000000; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd396, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd444, fd395, 0dBFE0000000000000, fd443; +mul.f64 fd445, fd399, 0dBFEEE8DD4748BF15; +mul.f64 fd446, fd400, 0dBFD0907DC1930690; +sub.f64 fd447, fd445, fd446; +mul.f64 fd448, fd400, 0dBFEEE8DD4748BF15; +fma.rn.f64 fd449, fd399, 0dBFD0907DC1930690, fd448; +add.f64 %1, fd226, fd378; +add.f64 %0, fd225, fd377; +add.f64 %3, fd230, fd405; +add.f64 %2, fd229, fd403; +add.f64 %5, fd234, fd410; +add.f64 %4, fd233, fd408; +add.f64 %7, fd238, fd415; +add.f64 %6, fd237, fd413; +add.f64 %9, fd242, fd420; +add.f64 %8, fd241, fd418; +add.f64 %11, fd246, fd425; +add.f64 %10, fd245, fd423; +sub.f64 %13, fd228, fd379; +add.f64 %12, fd227, fd380; +add.f64 %15, fd232, fd430; +add.f64 %14, fd231, fd428; +add.f64 %17, fd236, fd435; +add.f64 %16, fd235, fd433; +add.f64 %19, 
fd240, fd439; +add.f64 %18, fd239, fd438; +add.f64 %21, fd244, fd444; +add.f64 %20, fd243, fd442; +add.f64 %23, fd248, fd449; +add.f64 %22, fd247, fd447; +sub.f64 %25, fd226, fd378; +sub.f64 %24, fd225, fd377; +sub.f64 %27, fd230, fd405; +sub.f64 %26, fd229, fd403; +sub.f64 %29, fd234, fd410; +sub.f64 %28, fd233, fd408; +sub.f64 %31, fd238, fd415; +sub.f64 %30, fd237, fd413; +sub.f64 %33, fd242, fd420; +sub.f64 %32, fd241, fd418; +sub.f64 %35, fd246, fd425; +sub.f64 %34, fd245, fd423; +add.f64 %37, fd228, fd379; +sub.f64 %36, fd227, fd380; +sub.f64 %39, fd232, fd430; +sub.f64 %38, fd231, fd428; +sub.f64 %41, fd236, fd435; +sub.f64 %40, fd235, fd433; +sub.f64 %43, fd240, fd439; +sub.f64 %42, fd239, fd438; +sub.f64 %45, fd244, fd444; +sub.f64 %44, fd243, fd442; +sub.f64 %47, fd248, fd449; +sub.f64 %46, fd247, fd447; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), 
"d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..a854649d3129e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_24_fp64_inv.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_24_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_24_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<585, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<498>; +.reg .b64 rd<2>; +add.f64 fd97, %69, %90; +add.f64 fd98, %48, fd97; +add.f64 fd99, %71, %92; +add.f64 fd100, %49, fd99; +mul.f64 fd101, fd97, 0d3FE0000000000000; +sub.f64 fd102, %48, fd101; +sub.f64 fd103, %71, %92; +mul.f64 fd104, fd103, 0dBFEBB67AE8584CAA; +add.f64 fd105, fd104, fd102; +sub.f64 fd106, fd102, fd104; +mul.f64 fd107, fd99, 0d3FE0000000000000; +sub.f64 fd108, %49, fd107; +sub.f64 fd109, %69, %90; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +sub.f64 fd111, fd108, fd110; +add.f64 fd112, 
fd110, fd108; +add.f64 fd113, %80, %101; +add.f64 fd114, %58, fd113; +add.f64 fd115, %81, %103; +add.f64 fd116, %60, fd115; +mul.f64 fd117, fd113, 0d3FE0000000000000; +sub.f64 fd118, %58, fd117; +sub.f64 fd119, %81, %103; +mul.f64 fd120, fd119, 0dBFEBB67AE8584CAA; +add.f64 fd121, fd120, fd118; +sub.f64 fd122, fd118, fd120; +mul.f64 fd123, fd115, 0d3FE0000000000000; +sub.f64 fd124, %60, fd123; +sub.f64 fd125, %80, %101; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +sub.f64 fd127, fd124, fd126; +add.f64 fd128, fd126, fd124; +mul.f64 fd129, fd121, 0d3FE0000000000000; +mul.f64 fd130, fd127, 0d3FEBB67AE8584CAA; +sub.f64 fd131, fd129, fd130; +mul.f64 fd132, fd127, 0d3FE0000000000000; +fma.rn.f64 fd133, fd121, 0d3FEBB67AE8584CAA, fd132; +mul.f64 fd134, fd122, 0dBFE0000000000000; +mul.f64 fd135, fd128, 0d3FEBB67AE8584CAA; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd128, 0dBFE0000000000000; +fma.rn.f64 fd138, fd122, 0d3FEBB67AE8584CAA, fd137; +add.f64 fd139, fd98, fd114; +add.f64 fd140, fd100, fd116; +sub.f64 fd141, fd98, fd114; +sub.f64 fd142, fd100, fd116; +add.f64 fd143, fd105, fd131; +add.f64 fd144, fd111, fd133; +sub.f64 fd145, fd105, fd131; +sub.f64 fd146, fd111, fd133; +add.f64 fd147, fd106, fd136; +add.f64 fd148, fd112, fd138; +sub.f64 fd149, fd106, fd136; +sub.f64 fd150, fd112, fd138; +add.f64 fd151, %74, %96; +add.f64 fd152, %53, fd151; +add.f64 fd153, %76, %97; +add.f64 fd154, %55, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, %53, fd155; +sub.f64 fd157, %76, %97; +mul.f64 fd158, fd157, 0dBFEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, %55, fd161; +sub.f64 fd163, %74, %96; +mul.f64 fd164, fd163, 0dBFEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, %85, %106; +add.f64 fd168, %64, fd167; +add.f64 fd169, %87, %108; +add.f64 fd170, %65, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, %64, fd171; 
+sub.f64 fd173, %87, %108; +mul.f64 fd174, fd173, 0dBFEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, %65, fd177; +sub.f64 fd179, %85, %106; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0d3FEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0d3FEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0d3FEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0d3FEBB67AE8584CAA, fd191; +add.f64 fd193, fd152, fd168; +add.f64 fd194, fd154, fd170; +sub.f64 fd195, fd152, fd168; +sub.f64 fd196, fd154, fd170; +add.f64 fd197, fd159, fd185; +add.f64 fd198, fd165, fd187; +sub.f64 fd199, fd159, fd185; +sub.f64 fd200, fd165, fd187; +add.f64 fd201, fd160, fd190; +add.f64 fd202, fd166, fd192; +sub.f64 fd203, fd160, fd190; +sub.f64 fd204, fd166, fd192; +mul.f64 fd205, fd197, 0d3FEBB67AE8584CAA; +mul.f64 fd206, fd198, 0d3FE0000000000000; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd198, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd209, fd197, 0d3FE0000000000000, fd208; +mul.f64 fd210, fd201, 0d3FE0000000000000; +mul.f64 fd211, fd202, 0d3FEBB67AE8584CAA; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd202, 0d3FE0000000000000; +fma.rn.f64 fd214, fd201, 0d3FEBB67AE8584CAA, fd213; +mul.f64 fd215, fd199, 0dBFE0000000000000; +mul.f64 fd216, fd200, 0d3FEBB67AE8584CAA; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd200, 0dBFE0000000000000; +fma.rn.f64 fd219, fd199, 0d3FEBB67AE8584CAA, fd218; +mul.f64 fd220, fd203, 0dBFEBB67AE8584CAA; +mul.f64 fd221, fd204, 0d3FE0000000000000; +sub.f64 fd222, fd220, fd221; +mul.f64 fd223, fd204, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd224, fd203, 0d3FE0000000000000, fd223; +add.f64 fd225, fd139, fd193; 
+add.f64 fd226, fd140, fd194; +sub.f64 fd227, fd139, fd193; +sub.f64 fd228, fd140, fd194; +add.f64 fd229, fd143, fd207; +add.f64 fd230, fd144, fd209; +sub.f64 fd231, fd143, fd207; +sub.f64 fd232, fd144, fd209; +add.f64 fd233, fd147, fd212; +add.f64 fd234, fd148, fd214; +sub.f64 fd235, fd147, fd212; +sub.f64 fd236, fd148, fd214; +sub.f64 fd237, fd141, fd196; +add.f64 fd238, fd142, fd195; +add.f64 fd239, fd141, fd196; +sub.f64 fd240, fd142, fd195; +add.f64 fd241, fd145, fd217; +add.f64 fd242, fd146, fd219; +sub.f64 fd243, fd145, fd217; +sub.f64 fd244, fd146, fd219; +add.f64 fd245, fd149, fd222; +add.f64 fd246, fd150, fd224; +sub.f64 fd247, fd149, fd222; +sub.f64 fd248, fd150, fd224; +add.f64 fd249, %72, %93; +add.f64 fd250, %50, fd249; +add.f64 fd251, %73, %95; +add.f64 fd252, %52, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, %50, fd253; +sub.f64 fd255, %73, %95; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, %52, fd259; +sub.f64 fd261, %72, %93; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, %82, %104; +add.f64 fd266, %61, fd265; +add.f64 fd267, %84, %105; +add.f64 fd268, %63, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, %61, fd269; +sub.f64 fd271, %84, %105; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, %63, fd275; +sub.f64 fd277, %82, %104; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +mul.f64 fd281, fd273, 0d3FE0000000000000; +mul.f64 fd282, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd283, fd281, fd282; +mul.f64 fd284, fd279, 0d3FE0000000000000; +fma.rn.f64 fd285, fd273, 0d3FEBB67AE8584CAA, fd284; +mul.f64 fd286, fd274, 0dBFE0000000000000; +mul.f64 fd287, fd280, 
0d3FEBB67AE8584CAA; +sub.f64 fd288, fd286, fd287; +mul.f64 fd289, fd280, 0dBFE0000000000000; +fma.rn.f64 fd290, fd274, 0d3FEBB67AE8584CAA, fd289; +add.f64 fd291, fd250, fd266; +add.f64 fd292, fd252, fd268; +sub.f64 fd293, fd250, fd266; +sub.f64 fd294, fd252, fd268; +add.f64 fd295, fd257, fd283; +add.f64 fd296, fd263, fd285; +sub.f64 fd297, fd257, fd283; +sub.f64 fd298, fd263, fd285; +add.f64 fd299, fd258, fd288; +add.f64 fd300, fd264, fd290; +sub.f64 fd301, fd258, fd288; +sub.f64 fd302, fd264, fd290; +add.f64 fd303, %77, %98; +add.f64 fd304, %56, fd303; +add.f64 fd305, %79, %100; +add.f64 fd306, %57, fd305; +mul.f64 fd307, fd303, 0d3FE0000000000000; +sub.f64 fd308, %56, fd307; +sub.f64 fd309, %79, %100; +mul.f64 fd310, fd309, 0dBFEBB67AE8584CAA; +add.f64 fd311, fd310, fd308; +sub.f64 fd312, fd308, fd310; +mul.f64 fd313, fd305, 0d3FE0000000000000; +sub.f64 fd314, %57, fd313; +sub.f64 fd315, %77, %98; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +sub.f64 fd317, fd314, fd316; +add.f64 fd318, fd316, fd314; +add.f64 fd319, %88, %109; +add.f64 fd320, %66, fd319; +add.f64 fd321, %89, %110; +add.f64 fd322, %68, fd321; +mul.f64 fd323, fd319, 0d3FE0000000000000; +sub.f64 fd324, %66, fd323; +sub.f64 fd325, %89, %110; +mul.f64 fd326, fd325, 0dBFEBB67AE8584CAA; +add.f64 fd327, fd326, fd324; +sub.f64 fd328, fd324, fd326; +mul.f64 fd329, fd321, 0d3FE0000000000000; +sub.f64 fd330, %68, fd329; +sub.f64 fd331, %88, %109; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +sub.f64 fd333, fd330, fd332; +add.f64 fd334, fd332, fd330; +mul.f64 fd335, fd327, 0d3FE0000000000000; +mul.f64 fd336, fd333, 0d3FEBB67AE8584CAA; +sub.f64 fd337, fd335, fd336; +mul.f64 fd338, fd333, 0d3FE0000000000000; +fma.rn.f64 fd339, fd327, 0d3FEBB67AE8584CAA, fd338; +mul.f64 fd340, fd328, 0dBFE0000000000000; +mul.f64 fd341, fd334, 0d3FEBB67AE8584CAA; +sub.f64 fd342, fd340, fd341; +mul.f64 fd343, fd334, 0dBFE0000000000000; +fma.rn.f64 fd344, fd328, 0d3FEBB67AE8584CAA, fd343; +add.f64 fd345, fd304, fd320; +add.f64 fd346, 
fd306, fd322; +sub.f64 fd347, fd304, fd320; +sub.f64 fd348, fd306, fd322; +add.f64 fd349, fd311, fd337; +add.f64 fd350, fd317, fd339; +sub.f64 fd351, fd311, fd337; +sub.f64 fd352, fd317, fd339; +add.f64 fd353, fd312, fd342; +add.f64 fd354, fd318, fd344; +sub.f64 fd355, fd312, fd342; +sub.f64 fd356, fd318, fd344; +mul.f64 fd357, fd349, 0d3FEBB67AE8584CAA; +mul.f64 fd358, fd350, 0d3FE0000000000000; +sub.f64 fd359, fd357, fd358; +mul.f64 fd360, fd350, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd361, fd349, 0d3FE0000000000000, fd360; +mul.f64 fd362, fd353, 0d3FE0000000000000; +mul.f64 fd363, fd354, 0d3FEBB67AE8584CAA; +sub.f64 fd364, fd362, fd363; +mul.f64 fd365, fd354, 0d3FE0000000000000; +fma.rn.f64 fd366, fd353, 0d3FEBB67AE8584CAA, fd365; +mul.f64 fd367, fd351, 0dBFE0000000000000; +mul.f64 fd368, fd352, 0d3FEBB67AE8584CAA; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd352, 0dBFE0000000000000; +fma.rn.f64 fd371, fd351, 0d3FEBB67AE8584CAA, fd370; +mul.f64 fd372, fd355, 0dBFEBB67AE8584CAA; +mul.f64 fd373, fd356, 0d3FE0000000000000; +sub.f64 fd374, fd372, fd373; +mul.f64 fd375, fd356, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd376, fd355, 0d3FE0000000000000, fd375; +add.f64 fd377, fd291, fd345; +add.f64 fd378, fd292, fd346; +sub.f64 fd379, fd291, fd345; +sub.f64 fd380, fd292, fd346; +add.f64 fd381, fd295, fd359; +add.f64 fd382, fd296, fd361; +sub.f64 fd383, fd295, fd359; +sub.f64 fd384, fd296, fd361; +add.f64 fd385, fd299, fd364; +add.f64 fd386, fd300, fd366; +sub.f64 fd387, fd299, fd364; +sub.f64 fd388, fd300, fd366; +sub.f64 fd389, fd293, fd348; +add.f64 fd390, fd294, fd347; +add.f64 fd391, fd293, fd348; +sub.f64 fd392, fd294, fd347; +add.f64 fd393, fd297, fd369; +add.f64 fd394, fd298, fd371; +sub.f64 fd395, fd297, fd369; +sub.f64 fd396, fd298, fd371; +add.f64 fd397, fd301, fd374; +add.f64 fd398, fd302, fd376; +sub.f64 fd399, fd301, fd374; +sub.f64 fd400, fd302, fd376; +mul.f64 fd401, fd381, 0d3FEEE8DD4748BF15; +mul.f64 fd402, fd382, 0d3FD0907DC1930690; +sub.f64 fd403, fd401, fd402; 
+mul.f64 fd404, fd382, 0d3FEEE8DD4748BF15; +fma.rn.f64 fd405, fd381, 0d3FD0907DC1930690, fd404; +mul.f64 fd406, fd385, 0d3FEBB67AE8584CAA; +mul.f64 fd407, fd386, 0d3FE0000000000000; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd386, 0d3FEBB67AE8584CAA; +fma.rn.f64 fd410, fd385, 0d3FE0000000000000, fd409; +mul.f64 fd411, fd389, 0d3FE6A09E667F3BCD; +mul.f64 fd412, fd390, 0d3FE6A09E667F3BCD; +sub.f64 fd413, fd411, fd412; +add.f64 fd414, fd411, fd412; +mul.f64 fd415, fd393, 0d3FE0000000000000; +mul.f64 fd416, fd394, 0d3FEBB67AE8584CAA; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd394, 0d3FE0000000000000; +fma.rn.f64 fd419, fd393, 0d3FEBB67AE8584CAA, fd418; +mul.f64 fd420, fd397, 0d3FD0907DC1930690; +mul.f64 fd421, fd398, 0d3FEEE8DD4748BF15; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd398, 0d3FD0907DC1930690; +fma.rn.f64 fd424, fd397, 0d3FEEE8DD4748BF15, fd423; +mul.f64 fd425, fd383, 0dBFD0907DC1930690; +mul.f64 fd426, fd384, 0d3FEEE8DD4748BF15; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd384, 0dBFD0907DC1930690; +fma.rn.f64 fd429, fd383, 0d3FEEE8DD4748BF15, fd428; +mul.f64 fd430, fd387, 0dBFE0000000000000; +mul.f64 fd431, fd388, 0d3FEBB67AE8584CAA; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd388, 0dBFE0000000000000; +fma.rn.f64 fd434, fd387, 0d3FEBB67AE8584CAA, fd433; +mul.f64 fd435, fd391, 0dBFE6A09E667F3BCD; +mul.f64 fd436, fd392, 0d3FE6A09E667F3BCD; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd392, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd439, fd391, 0d3FE6A09E667F3BCD, fd438; +mul.f64 fd440, fd395, 0dBFEBB67AE8584CAA; +mul.f64 fd441, fd396, 0d3FE0000000000000; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd396, 0dBFEBB67AE8584CAA; +fma.rn.f64 fd444, fd395, 0d3FE0000000000000, fd443; +mul.f64 fd445, fd399, 0dBFEEE8DD4748BF15; +mul.f64 fd446, fd400, 0d3FD0907DC1930690; +sub.f64 fd447, fd445, fd446; +mul.f64 fd448, fd400, 0dBFEEE8DD4748BF15; +fma.rn.f64 fd449, fd399, 0d3FD0907DC1930690, fd448; +add.f64 %1, fd226, fd378; +add.f64 %0, fd225, fd377; 
+add.f64 %3, fd230, fd405; +add.f64 %2, fd229, fd403; +add.f64 %5, fd234, fd410; +add.f64 %4, fd233, fd408; +add.f64 %7, fd238, fd414; +add.f64 %6, fd237, fd413; +add.f64 %9, fd242, fd419; +add.f64 %8, fd241, fd417; +add.f64 %11, fd246, fd424; +add.f64 %10, fd245, fd422; +add.f64 %13, fd228, fd379; +sub.f64 %12, fd227, fd380; +add.f64 %15, fd232, fd429; +add.f64 %14, fd231, fd427; +add.f64 %17, fd236, fd434; +add.f64 %16, fd235, fd432; +add.f64 %19, fd240, fd439; +add.f64 %18, fd239, fd437; +add.f64 %21, fd244, fd444; +add.f64 %20, fd243, fd442; +add.f64 %23, fd248, fd449; +add.f64 %22, fd247, fd447; +sub.f64 %25, fd226, fd378; +sub.f64 %24, fd225, fd377; +sub.f64 %27, fd230, fd405; +sub.f64 %26, fd229, fd403; +sub.f64 %29, fd234, fd410; +sub.f64 %28, fd233, fd408; +sub.f64 %31, fd238, fd414; +sub.f64 %30, fd237, fd413; +sub.f64 %33, fd242, fd419; +sub.f64 %32, fd241, fd417; +sub.f64 %35, fd246, fd424; +sub.f64 %34, fd245, fd422; +sub.f64 %37, fd228, fd379; +add.f64 %36, fd227, fd380; +sub.f64 %39, fd232, fd429; +sub.f64 %38, fd231, fd427; +sub.f64 %41, fd236, fd434; +sub.f64 %40, fd235, fd432; +sub.f64 %43, fd240, fd439; +sub.f64 %42, fd239, fd437; +sub.f64 %45, fd244, fd444; +sub.f64 %44, fd243, fd442; +sub.f64 %47, fd248, fd449; +sub.f64 %46, fd247, fd447; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), 
"=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..f0177864afe48 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_fwd.hpp.inc @@ -0,0 +1,18730 @@ +#ifndef CUFFTDX_FFT_256_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_256_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<807, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<270>; +.reg .b32 r<1803>; +.reg .b64 
rd<2>; +mov.u32 r1791, %tid.y; +shl.b32 r1792, r1791, 10; +mov.u32 r1793, %32; +add.s32 r1794, r1793, r1792; +mov.u32 r1795, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, {low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ 
+mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; 
+cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ 
+add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1796, r1795, 15; +shl.b32 r1797, r1795, 6; +and.b32 r1798, r1797, -1024; +add.s32 r1799, r1794, r1798; +cvt.rn.f32.u32 f267, r1796; +mul.f32 f268, f267, 0f3CC90FDB; +cos.approx.f32 f117, f268; +sin.approx.f32 f269, f268; +neg.f32 f118, f269; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, 
r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, 
r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; 
+} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1800, r1797, 960; +add.s32 r1801, r1799, r1800; +st.shared.v4.f32 [r1801], {r521, r629, r666, r703}; +st.shared.v4.f32 [r1801+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r1801+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r1801+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r1802, r1796, -60, r1801; +ld.shared.u32 r1176, [r1802]; +ld.shared.u32 r1372, [r1802+64]; +ld.shared.u32 r1226, [r1802+128]; +ld.shared.u32 r1422, [r1802+192]; +ld.shared.u32 r1188, [r1802+256]; +ld.shared.u32 r1384, 
[r1802+320]; +ld.shared.u32 r1238, [r1802+384]; +ld.shared.u32 r1434, [r1802+448]; +ld.shared.u32 r1177, [r1802+512]; +ld.shared.u32 r1373, [r1802+576]; +ld.shared.u32 r1227, [r1802+640]; +ld.shared.u32 r1423, [r1802+704]; +ld.shared.u32 r1189, [r1802+768]; +ld.shared.u32 r1385, [r1802+832]; +ld.shared.u32 r1239, [r1802+896]; +ld.shared.u32 r1435, [r1802+960]; +barrier.sync 0; +st.shared.v4.f32 [r1801], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1801+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1801+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1801+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1802]; +ld.shared.u32 r1375, [r1802+64]; +ld.shared.u32 r1229, [r1802+128]; +ld.shared.u32 r1425, [r1802+192]; +ld.shared.u32 r1191, [r1802+256]; +ld.shared.u32 r1387, [r1802+320]; +ld.shared.u32 r1241, [r1802+384]; +ld.shared.u32 r1437, [r1802+448]; +ld.shared.u32 r1180, [r1802+512]; +ld.shared.u32 r1376, [r1802+576]; +ld.shared.u32 r1230, [r1802+640]; +ld.shared.u32 r1426, [r1802+704]; +ld.shared.u32 r1192, [r1802+768]; +ld.shared.u32 r1388, [r1802+832]; +ld.shared.u32 r1242, [r1802+896]; +ld.shared.u32 r1438, [r1802+960]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, 
r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 
r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 
r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 %0, r1323, r1519; +} +{ +add.f16x2 %1, r1326, r1522; +} +{ +sub.f16x2 %16, r1323, r1519; +} +{ +sub.f16x2 %17, r1326, r1522; +} +{ +add.f16x2 %2, r1335, r1603; +} +{ +add.f16x2 %3, r1338, r1609; +} +{ +sub.f16x2 %18, r1335, r1603; +} +{ +sub.f16x2 %19, r1338, r1609; +} +{ +add.f16x2 %4, r1347, r1619; +} +{ +add.f16x2 %5, r1350, r1625; +} +{ +sub.f16x2 %20, r1347, r1619; +} +{ +sub.f16x2 %21, r1350, r1625; +} +{ +add.f16x2 %6, r1359, r1635; +} +{ +add.f16x2 %7, r1362, r1641; +} +{ +sub.f16x2 %22, r1359, r1635; +} +{ +sub.f16x2 %23, r1362, r1641; +} +{ +add.f16x2 %8, r1329, r1528; +} +{ +add.f16x2 %9, r1332, r1645; +} +{ +sub.f16x2 %24, r1329, r1528; +} +{ +sub.f16x2 %25, r1332, 
r1645; +} +{ +add.f16x2 %10, r1341, r1653; +} +{ +add.f16x2 %11, r1344, r1659; +} +{ +sub.f16x2 %26, r1341, r1653; +} +{ +sub.f16x2 %27, r1344, r1659; +} +{ +add.f16x2 %12, r1353, r1669; +} +{ +add.f16x2 %13, r1356, r1675; +} +{ +sub.f16x2 %28, r1353, r1669; +} +{ +sub.f16x2 %29, r1356, r1675; +} +{ +add.f16x2 %14, r1365, r1685; +} +{ +add.f16x2 %15, r1368, r1691; +} +{ +sub.f16x2 %30, r1365, r1685; +} +{ +sub.f16x2 %31, r1368, r1691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), 
"r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<808, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<1037>; +.reg .b64 rd<2>; +mov.u32 r1017, %tid.y; +shl.b32 r1018, r1017, 11; +mov.u32 r1019, %16; +add.s32 r1020, r1019, r1018; +mov.u32 r1021, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f48, 0f3F3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1022, r1021, 31; +shl.b32 r1023, r1021, 6; +and.b32 r1024, r1023, -2048; +add.s32 r1025, r1020, r1024; +cvt.rn.f32.u32 f93, r1022; +mul.f32 f94, f93, 0f3CC90FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} 
+{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1026, r1023, 1984; +add.s32 r1027, r1025, r1026; +st.shared.v4.f32 [r1027], {r149, r152, r209, r216}; +st.shared.v4.f32 [r1027+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1027+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1027+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1028, r1022, -56, r1027; +ld.shared.u32 r460, [r1028]; +ld.shared.u32 r463, [r1028+4]; +ld.shared.u32 r510, [r1028+256]; +ld.shared.u32 r513, [r1028+260]; +ld.shared.u32 r472, [r1028+512]; +ld.shared.u32 r475, [r1028+516]; +ld.shared.u32 r522, [r1028+768]; +ld.shared.u32 r525, [r1028+772]; +ld.shared.u32 r461, [r1028+1024]; +ld.shared.u32 r464, [r1028+1028]; +ld.shared.u32 r511, [r1028+1280]; +ld.shared.u32 r514, [r1028+1284]; +ld.shared.u32 r473, [r1028+1536]; +ld.shared.u32 r476, [r1028+1540]; +ld.shared.u32 r523, [r1028+1792]; +ld.shared.u32 r526, 
[r1028+1796]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 
r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1029, r1021, 24; +bfe.u32 r1030, r1021, 3, 2; +cvt.rn.f32.u32 f96, r1030; +mul.f32 f97, f96, 0f3E490FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ 
+mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, 
{low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1031, r1021, 3; +and.b32 r1032, r1031, 56; +add.s32 r1033, r1025, r1032; +barrier.sync 0; +and.b32 r1034, r1023, 1536; +add.s32 r1035, r1033, r1034; +st.shared.u32 [r1035], r607; +st.shared.u32 [r1035+4], r610; +st.shared.u32 [r1035+64], r667; +st.shared.u32 [r1035+68], r674; +st.shared.u32 [r1035+128], r704; +st.shared.u32 [r1035+132], r711; +st.shared.u32 [r1035+192], r741; +st.shared.u32 [r1035+196], r748; +st.shared.u32 [r1035+256], r778; +st.shared.u32 [r1035+260], r785; +st.shared.u32 [r1035+320], r815; +st.shared.u32 [r1035+324], r822; +st.shared.u32 [r1035+384], r852; +st.shared.u32 [r1035+388], r859; +st.shared.u32 [r1035+448], r889; +st.shared.u32 [r1035+452], r896; +barrier.sync 0; +mad.lo.s32 r1036, r1029, -56, r1035; +ld.shared.u32 r918, [r1036]; +ld.shared.u32 r921, [r1036+4]; +ld.shared.u32 r968, [r1036+256]; +ld.shared.u32 r971, [r1036+260]; +ld.shared.u32 r930, [r1036+512]; +ld.shared.u32 r933, [r1036+516]; +ld.shared.u32 r980, [r1036+768]; +ld.shared.u32 r983, [r1036+772]; +ld.shared.u32 r919, [r1036+1024]; +ld.shared.u32 r922, [r1036+1028]; +ld.shared.u32 r969, [r1036+1280]; +ld.shared.u32 r972, [r1036+1284]; +ld.shared.u32 r931, [r1036+1536]; +ld.shared.u32 r934, [r1036+1540]; +ld.shared.u32 r981, [r1036+1792]; +ld.shared.u32 r984, [r1036+1796]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 %0, r917, r929; +} +{ +add.f16x2 %1, r920, r932; +} +{ +sub.f16x2 %8, r917, r929; +} +{ +sub.f16x2 %9, r920, r932; +} +{ +add.f16x2 %4, r923, r938; 
+} +{ +add.f16x2 %5, r926, r941; +} +{ +sub.f16x2 %12, r923, r938; +} +{ +sub.f16x2 %13, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 %2, r967, r979; +} +{ +add.f16x2 %3, r970, r982; +} +{ +sub.f16x2 %10, r967, r979; +} +{ +sub.f16x2 %11, r970, r982; +} +{ +add.f16x2 %6, r973, r988; +} +{ +add.f16x2 %7, r976, r991; +} +{ +sub.f16x2 %14, r973, r988; +} +{ +sub.f16x2 %15, r976, r991; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<809, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<1037>; +.reg .b64 rd<2>; +mov.u32 r1017, %tid.y; +shl.b32 r1018, r1017, 10; 
+mov.u32 r1019, %16; +add.s32 r1020, r1019, r1018; +mov.u32 r1021, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f48, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ 
+mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1022, r1021, 31; +shl.b32 r1023, r1021, 5; +and.b32 r1024, r1023, -1024; +add.s32 r1025, r1020, r1024; +cvt.rn.f32.u32 f93, r1022; +mul.f32 f94, f93, 0f3CC90FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ 
+neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1026, r1023, 992; +add.s32 r1027, r1025, r1026; +st.shared.v4.f32 [r1027], {r149, r209, r246, r283}; +st.shared.v4.f32 [r1027+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1028, r1022, -28, r1027; +ld.shared.u32 r460, [r1028]; +ld.shared.u32 r510, [r1028+128]; +ld.shared.u32 r472, [r1028+256]; +ld.shared.u32 r522, [r1028+384]; +ld.shared.u32 r461, [r1028+512]; +ld.shared.u32 r511, [r1028+640]; +ld.shared.u32 r473, [r1028+768]; +ld.shared.u32 r523, [r1028+896]; +barrier.sync 0; +st.shared.v4.f32 [r1027], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1027+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1028]; +ld.shared.u32 r513, [r1028+128]; +ld.shared.u32 r475, [r1028+256]; +ld.shared.u32 r525, [r1028+384]; +ld.shared.u32 r464, [r1028+512]; +ld.shared.u32 r514, [r1028+640]; +ld.shared.u32 r476, [r1028+768]; +ld.shared.u32 r526, [r1028+896]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ 
+add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1029, r1021, 24; +bfe.u32 r1030, r1021, 3, 2; +shl.b32 r1031, r1021, 2; +and.b32 r1032, r1031, 28; +add.s32 r1033, r1025, r1032; +cvt.rn.f32.u32 f96, r1030; +mul.f32 f97, f96, 
0f3E490FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 
r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1034, r1023, 768; +add.s32 r1035, r1033, r1034; +st.shared.u32 [r1035], r607; +st.shared.u32 [r1035+32], r667; +st.shared.u32 [r1035+64], r704; +st.shared.u32 [r1035+96], r741; +st.shared.u32 [r1035+128], r778; +st.shared.u32 [r1035+160], r815; +st.shared.u32 [r1035+192], r852; +st.shared.u32 [r1035+224], r889; +barrier.sync 0; +mad.lo.s32 r1036, r1029, -28, r1035; +ld.shared.u32 r918, [r1036]; +ld.shared.u32 r968, [r1036+128]; 
+ld.shared.u32 r930, [r1036+256]; +ld.shared.u32 r980, [r1036+384]; +ld.shared.u32 r919, [r1036+512]; +ld.shared.u32 r969, [r1036+640]; +ld.shared.u32 r931, [r1036+768]; +ld.shared.u32 r981, [r1036+896]; +barrier.sync 0; +st.shared.u32 [r1035], r610; +st.shared.u32 [r1035+32], r674; +st.shared.u32 [r1035+64], r711; +st.shared.u32 [r1035+96], r748; +st.shared.u32 [r1035+128], r785; +st.shared.u32 [r1035+160], r822; +st.shared.u32 [r1035+192], r859; +st.shared.u32 [r1035+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1036]; +ld.shared.u32 r971, [r1036+128]; +ld.shared.u32 r933, [r1036+256]; +ld.shared.u32 r983, [r1036+384]; +ld.shared.u32 r922, [r1036+512]; +ld.shared.u32 r972, [r1036+640]; +ld.shared.u32 r934, [r1036+768]; +ld.shared.u32 r984, [r1036+896]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 %0, r917, r929; +} +{ +add.f16x2 %1, r920, r932; +} +{ +sub.f16x2 %8, r917, r929; +} +{ +sub.f16x2 %9, r920, r932; +} +{ +add.f16x2 %4, r923, r938; +} +{ +add.f16x2 %5, r926, r941; +} +{ +sub.f16x2 %12, r923, r938; +} +{ +sub.f16x2 %13, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 %2, r967, r979; +} +{ +add.f16x2 %3, r970, r982; +} +{ +sub.f16x2 %10, r967, r979; +} +{ +sub.f16x2 %11, r970, r982; +} +{ +add.f16x2 %6, r973, r988; +} +{ +add.f16x2 %7, r976, r991; +} +{ +sub.f16x2 %14, r973, r988; +} +{ +sub.f16x2 %15, r976, r991; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<810, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<270>; +.reg .b32 r<1803>; +.reg .b64 rd<2>; +mov.u32 r1791, %tid.y; +shl.b32 r1792, r1791, 11; +mov.u32 r1793, %32; +add.s32 r1794, r1793, r1792; +mov.u32 r1795, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, 
%61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, {low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ 
+add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ 
+add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ 
+fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, 
r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1796, r1795, 15; +shl.b32 r1797, r1795, 7; +and.b32 r1798, r1797, -2048; +add.s32 r1799, r1794, r1798; +cvt.rn.f32.u32 f267, r1796; +mul.f32 f268, f267, 0f3CC90FDB; +cos.approx.f32 f117, f268; +sin.approx.f32 f269, f268; +neg.f32 f118, f269; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, 
r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, 
r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 
r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, 
f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1800, r1797, 1920; +add.s32 r1801, r1799, r1800; +st.shared.v4.f32 [r1801], {r521, r524, r629, r636}; +st.shared.v4.f32 [r1801+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r1801+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r1801+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r1801+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r1801+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r1801+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r1801+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r1802, r1796, -120, r1801; +ld.shared.u32 r1176, [r1802]; +ld.shared.u32 r1179, [r1802+4]; +ld.shared.u32 r1372, [r1802+128]; +ld.shared.u32 r1375, [r1802+132]; +ld.shared.u32 r1226, [r1802+256]; +ld.shared.u32 r1229, [r1802+260]; +ld.shared.u32 r1422, [r1802+384]; +ld.shared.u32 r1425, [r1802+388]; +ld.shared.u32 r1188, [r1802+512]; +ld.shared.u32 r1191, [r1802+516]; +ld.shared.u32 r1384, [r1802+640]; +ld.shared.u32 r1387, [r1802+644]; +ld.shared.u32 r1238, [r1802+768]; +ld.shared.u32 r1241, [r1802+772]; +ld.shared.u32 r1434, [r1802+896]; +ld.shared.u32 r1437, [r1802+900]; +ld.shared.u32 r1177, [r1802+1024]; +ld.shared.u32 r1180, [r1802+1028]; +ld.shared.u32 r1373, [r1802+1152]; +ld.shared.u32 r1376, [r1802+1156]; +ld.shared.u32 r1227, [r1802+1280]; +ld.shared.u32 r1230, [r1802+1284]; +ld.shared.u32 r1423, 
[r1802+1408]; +ld.shared.u32 r1426, [r1802+1412]; +ld.shared.u32 r1189, [r1802+1536]; +ld.shared.u32 r1192, [r1802+1540]; +ld.shared.u32 r1385, [r1802+1664]; +ld.shared.u32 r1388, [r1802+1668]; +ld.shared.u32 r1239, [r1802+1792]; +ld.shared.u32 r1242, [r1802+1796]; +ld.shared.u32 r1435, [r1802+1920]; +ld.shared.u32 r1438, [r1802+1924]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, 
r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; 
+} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 %0, r1323, r1519; +} +{ +add.f16x2 %1, r1326, r1522; +} +{ +sub.f16x2 %16, r1323, r1519; +} +{ +sub.f16x2 %17, r1326, r1522; +} +{ +add.f16x2 %2, r1335, r1603; +} +{ +add.f16x2 %3, r1338, r1609; +} +{ +sub.f16x2 %18, r1335, r1603; +} +{ +sub.f16x2 %19, r1338, r1609; +} +{ +add.f16x2 %4, r1347, r1619; +} +{ +add.f16x2 %5, r1350, r1625; +} +{ +sub.f16x2 %20, r1347, r1619; +} +{ +sub.f16x2 %21, r1350, r1625; +} +{ +add.f16x2 %6, r1359, r1635; +} +{ +add.f16x2 %7, r1362, r1641; +} +{ +sub.f16x2 %22, r1359, r1635; +} +{ +sub.f16x2 %23, r1362, r1641; +} +{ +add.f16x2 %8, r1329, r1528; +} +{ +add.f16x2 %9, r1332, r1645; +} +{ +sub.f16x2 %24, r1329, r1528; +} +{ +sub.f16x2 %25, r1332, r1645; +} +{ +add.f16x2 %10, r1341, r1653; +} +{ +add.f16x2 %11, r1344, r1659; +} +{ +sub.f16x2 %26, r1341, r1653; +} +{ +sub.f16x2 %27, r1344, r1659; +} +{ +add.f16x2 %12, r1353, r1669; +} +{ +add.f16x2 %13, r1356, r1675; +} +{ +sub.f16x2 %28, r1353, r1669; +} +{ +sub.f16x2 %29, r1356, r1675; +} +{ +add.f16x2 %14, r1365, r1685; +} +{ +add.f16x2 %15, r1368, r1691; +} +{ +sub.f16x2 %30, r1365, r1685; +} +{ +sub.f16x2 %31, r1368, r1691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<811, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<540>; +.reg .b32 r<3723>; +.reg .b64 rd<3>; +mov.u32 r3647, %tid.y; +shl.b32 r3648, r3647, 11; +mov.u32 r3649, %64; +add.s32 r3650, r3649, 
r3648; +mov.u32 r3651, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f508, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r101, {low, high}; +} +mov.f32 f518, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; 
+} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, 
f518; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ 
+add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ 
+add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; 
+cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 
r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 
high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, 
r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} 
+{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} 
+{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3652, r3651, 7; +shl.b32 r3653, r3651, 8; +and.b32 r3654, r3653, -2048; +add.s32 r3655, r3650, r3654; +cvt.rn.f32.u32 f535, r3652; +mul.f32 f536, f535, 0f3CC90FDB; +cos.approx.f32 f357, f536; +sin.approx.f32 f537, f536; +neg.f32 f358, f537; +mov.f32 f539, 0fBF800000; +mov.f32 f538, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 
r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, 
r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; 
+mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, 
r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 
r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 
r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ 
+mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3656, r3653, 1792; +add.s32 r3657, r3655, r3656; +st.shared.v4.f32 [r3657], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r3657+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r3657+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r3657+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r3657+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r3657+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r3657+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r3657+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r3657+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r3657+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r3657+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r3657+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r3657+192], {r2576, r2583, 
r2613, r2620}; +st.shared.v4.f32 [r3657+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r3657+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r3657+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r3658, r3652, -248, r3657; +ld.shared.u32 r2864, [r3658]; +ld.shared.u32 r2867, [r3658+4]; +ld.shared.u32 r3060, [r3658+64]; +ld.shared.u32 r3063, [r3658+68]; +ld.shared.u32 r3256, [r3658+128]; +ld.shared.u32 r3259, [r3658+132]; +ld.shared.u32 r3452, [r3658+192]; +ld.shared.u32 r3455, [r3658+196]; +ld.shared.u32 r2914, [r3658+256]; +ld.shared.u32 r2917, [r3658+260]; +ld.shared.u32 r3110, [r3658+320]; +ld.shared.u32 r3113, [r3658+324]; +ld.shared.u32 r3306, [r3658+384]; +ld.shared.u32 r3309, [r3658+388]; +ld.shared.u32 r3502, [r3658+448]; +ld.shared.u32 r3505, [r3658+452]; +ld.shared.u32 r2876, [r3658+512]; +ld.shared.u32 r2879, [r3658+516]; +ld.shared.u32 r3072, [r3658+576]; +ld.shared.u32 r3075, [r3658+580]; +ld.shared.u32 r3268, [r3658+640]; +ld.shared.u32 r3271, [r3658+644]; +ld.shared.u32 r3464, [r3658+704]; +ld.shared.u32 r3467, [r3658+708]; +ld.shared.u32 r2926, [r3658+768]; +ld.shared.u32 r2929, [r3658+772]; +ld.shared.u32 r3122, [r3658+832]; +ld.shared.u32 r3125, [r3658+836]; +ld.shared.u32 r3318, [r3658+896]; +ld.shared.u32 r3321, [r3658+900]; +ld.shared.u32 r3514, [r3658+960]; +ld.shared.u32 r3517, [r3658+964]; +ld.shared.u32 r2865, [r3658+1024]; +ld.shared.u32 r2868, [r3658+1028]; +ld.shared.u32 r3061, [r3658+1088]; +ld.shared.u32 r3064, [r3658+1092]; +ld.shared.u32 r3257, [r3658+1152]; +ld.shared.u32 r3260, [r3658+1156]; +ld.shared.u32 r3453, [r3658+1216]; +ld.shared.u32 r3456, [r3658+1220]; +ld.shared.u32 r2915, [r3658+1280]; +ld.shared.u32 r2918, [r3658+1284]; +ld.shared.u32 r3111, [r3658+1344]; +ld.shared.u32 r3114, [r3658+1348]; +ld.shared.u32 r3307, [r3658+1408]; +ld.shared.u32 r3310, [r3658+1412]; +ld.shared.u32 r3503, [r3658+1472]; +ld.shared.u32 r3506, [r3658+1476]; +ld.shared.u32 r2877, [r3658+1536]; +ld.shared.u32 r2880, 
[r3658+1540]; +ld.shared.u32 r3073, [r3658+1600]; +ld.shared.u32 r3076, [r3658+1604]; +ld.shared.u32 r3269, [r3658+1664]; +ld.shared.u32 r3272, [r3658+1668]; +ld.shared.u32 r3465, [r3658+1728]; +ld.shared.u32 r3468, [r3658+1732]; +ld.shared.u32 r2927, [r3658+1792]; +ld.shared.u32 r2930, [r3658+1796]; +ld.shared.u32 r3123, [r3658+1856]; +ld.shared.u32 r3126, [r3658+1860]; +ld.shared.u32 r3319, [r3658+1920]; +ld.shared.u32 r3322, [r3658+1924]; +ld.shared.u32 r3515, [r3658+1984]; +ld.shared.u32 r3518, [r3658+1988]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 
r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 %0, r2889, r2939; +} +{ +add.f16x2 %1, r2892, r2942; +} +{ +sub.f16x2 %32, r2889, r2939; +} +{ +sub.f16x2 %33, r2892, r2942; +} +{ +add.f16x2 %8, r2901, r2983; +} +{ +add.f16x2 %9, r2904, r2989; +} +{ +sub.f16x2 %40, r2901, r2983; +} +{ +sub.f16x2 %41, r2904, r2989; +} +{ +add.f16x2 %16, r2895, r2948; +} +{ +add.f16x2 %17, r2898, r2993; +} +{ +sub.f16x2 %48, r2895, r2948; +} +{ +sub.f16x2 %49, r2898, r2993; +} +{ +add.f16x2 %24, r2907, r3001; +} +{ +add.f16x2 %25, r2910, r3007; +} +{ +sub.f16x2 %56, r2907, r3001; +} +{ +sub.f16x2 %57, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; 
+} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 %2, r3085, r3135; +} +{ +add.f16x2 %3, r3088, r3138; +} +{ +sub.f16x2 %34, r3085, r3135; +} +{ +sub.f16x2 %35, r3088, r3138; +} +{ +add.f16x2 %10, r3097, r3179; +} +{ +add.f16x2 %11, r3100, r3185; +} +{ +sub.f16x2 %42, r3097, r3179; +} +{ +sub.f16x2 %43, r3100, r3185; +} +{ +add.f16x2 %18, r3091, r3144; +} +{ +add.f16x2 %19, r3094, r3189; +} +{ +sub.f16x2 %50, r3091, r3144; +} +{ +sub.f16x2 %51, r3094, r3189; +} +{ +add.f16x2 %26, r3103, r3197; +} +{ +add.f16x2 %27, r3106, r3203; +} +{ +sub.f16x2 %58, r3103, r3197; +} +{ +sub.f16x2 %59, r3106, r3203; +} +{ +add.f16x2 
r3255, r3256, r3257; +} +{ +add.f16x2 r3258, r3259, r3260; +} +{ +sub.f16x2 r3261, r3256, r3257; +} +{ +sub.f16x2 r3264, r3259, r3260; +} +{ +add.f16x2 r3267, r3268, r3269; +} +{ +add.f16x2 r3270, r3271, r3272; +} +{ +sub.f16x2 r3273, r3268, r3269; +} +{ +sub.f16x2 r3276, r3271, r3272; +} +{ +neg.f16x2 r3279, r3273; +} +{ +add.f16x2 r3281, r3255, r3267; +} +{ +add.f16x2 r3284, r3258, r3270; +} +{ +sub.f16x2 r3287, r3255, r3267; +} +{ +sub.f16x2 r3290, r3258, r3270; +} +{ +add.f16x2 r3293, r3261, r3276; +} +{ +add.f16x2 r3296, r3264, r3279; +} +{ +sub.f16x2 r3299, r3261, r3276; +} +{ +sub.f16x2 r3302, r3264, r3279; +} +{ +add.f16x2 r3305, r3306, r3307; +} +{ +add.f16x2 r3308, r3309, r3310; +} +{ +sub.f16x2 r3311, r3306, r3307; +} +{ +sub.f16x2 r3314, r3309, r3310; +} +{ +add.f16x2 r3317, r3318, r3319; +} +{ +add.f16x2 r3320, r3321, r3322; +} +{ +sub.f16x2 r3323, r3318, r3319; +} +{ +sub.f16x2 r3326, r3321, r3322; +} +{ +neg.f16x2 r3329, r3323; +} +{ +add.f16x2 r3331, r3305, r3317; +} +{ +add.f16x2 r3334, r3308, r3320; +} +{ +sub.f16x2 r3337, r3305, r3317; +} +{ +sub.f16x2 r3340, r3308, r3320; +} +{ +add.f16x2 r3343, r3311, r3326; +} +{ +add.f16x2 r3346, r3314, r3329; +} +{ +sub.f16x2 r3349, r3311, r3326; +} +{ +sub.f16x2 r3352, r3314, r3329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3360, {low, high}; +} +{ +mul.f16x2 r3369, r3343, r3355; +} +{ +mul.f16x2 r3372, r3346, r3356; +} +{ +sub.f16x2 r3375, r3369, r3372; +} +{ +mul.f16x2 r3378, r3343, r3356; +} +{ +fma.rn.f16x2 r3381, r3346, r3355, r3378; +} +{ +neg.f16x2 r3385, r3337; +} +{ +mul.f16x2 r3387, r3349, r3359; +} +{ +mul.f16x2 
r3390, r3352, r3360; +} +{ +sub.f16x2 r3393, r3387, r3390; +} +{ +mul.f16x2 r3396, r3349, r3360; +} +{ +fma.rn.f16x2 r3399, r3352, r3359, r3396; +} +{ +add.f16x2 %4, r3281, r3331; +} +{ +add.f16x2 %5, r3284, r3334; +} +{ +sub.f16x2 %36, r3281, r3331; +} +{ +sub.f16x2 %37, r3284, r3334; +} +{ +add.f16x2 %12, r3293, r3375; +} +{ +add.f16x2 %13, r3296, r3381; +} +{ +sub.f16x2 %44, r3293, r3375; +} +{ +sub.f16x2 %45, r3296, r3381; +} +{ +add.f16x2 %20, r3287, r3340; +} +{ +add.f16x2 %21, r3290, r3385; +} +{ +sub.f16x2 %52, r3287, r3340; +} +{ +sub.f16x2 %53, r3290, r3385; +} +{ +add.f16x2 %28, r3299, r3393; +} +{ +add.f16x2 %29, r3302, r3399; +} +{ +sub.f16x2 %60, r3299, r3393; +} +{ +sub.f16x2 %61, r3302, r3399; +} +{ +add.f16x2 r3451, r3452, r3453; +} +{ +add.f16x2 r3454, r3455, r3456; +} +{ +sub.f16x2 r3457, r3452, r3453; +} +{ +sub.f16x2 r3460, r3455, r3456; +} +{ +add.f16x2 r3463, r3464, r3465; +} +{ +add.f16x2 r3466, r3467, r3468; +} +{ +sub.f16x2 r3469, r3464, r3465; +} +{ +sub.f16x2 r3472, r3467, r3468; +} +{ +neg.f16x2 r3475, r3469; +} +{ +add.f16x2 r3477, r3451, r3463; +} +{ +add.f16x2 r3480, r3454, r3466; +} +{ +sub.f16x2 r3483, r3451, r3463; +} +{ +sub.f16x2 r3486, r3454, r3466; +} +{ +add.f16x2 r3489, r3457, r3472; +} +{ +add.f16x2 r3492, r3460, r3475; +} +{ +sub.f16x2 r3495, r3457, r3472; +} +{ +sub.f16x2 r3498, r3460, r3475; +} +{ +add.f16x2 r3501, r3502, r3503; +} +{ +add.f16x2 r3504, r3505, r3506; +} +{ +sub.f16x2 r3507, r3502, r3503; +} +{ +sub.f16x2 r3510, r3505, r3506; +} +{ +add.f16x2 r3513, r3514, r3515; +} +{ +add.f16x2 r3516, r3517, r3518; +} +{ +sub.f16x2 r3519, r3514, r3515; +} +{ +sub.f16x2 r3522, r3517, r3518; +} +{ +neg.f16x2 r3525, r3519; +} +{ +add.f16x2 r3527, r3501, r3513; +} +{ +add.f16x2 r3530, r3504, r3516; +} +{ +sub.f16x2 r3533, r3501, r3513; +} +{ +sub.f16x2 r3536, r3504, r3516; +} +{ +add.f16x2 r3539, r3507, r3522; +} +{ +add.f16x2 r3542, r3510, r3525; +} +{ +sub.f16x2 r3545, r3507, r3522; +} +{ +sub.f16x2 r3548, r3510, r3525; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3551, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3552, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3555, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3556, {low, high}; +} +{ +mul.f16x2 r3565, r3539, r3551; +} +{ +mul.f16x2 r3568, r3542, r3552; +} +{ +sub.f16x2 r3571, r3565, r3568; +} +{ +mul.f16x2 r3574, r3539, r3552; +} +{ +fma.rn.f16x2 r3577, r3542, r3551, r3574; +} +{ +neg.f16x2 r3581, r3533; +} +{ +mul.f16x2 r3583, r3545, r3555; +} +{ +mul.f16x2 r3586, r3548, r3556; +} +{ +sub.f16x2 r3589, r3583, r3586; +} +{ +mul.f16x2 r3592, r3545, r3556; +} +{ +fma.rn.f16x2 r3595, r3548, r3555, r3592; +} +{ +add.f16x2 %6, r3477, r3527; +} +{ +add.f16x2 %7, r3480, r3530; +} +{ +sub.f16x2 %38, r3477, r3527; +} +{ +sub.f16x2 %39, r3480, r3530; +} +{ +add.f16x2 %14, r3489, r3571; +} +{ +add.f16x2 %15, r3492, r3577; +} +{ +sub.f16x2 %46, r3489, r3571; +} +{ +sub.f16x2 %47, r3492, r3577; +} +{ +add.f16x2 %22, r3483, r3536; +} +{ +add.f16x2 %23, r3486, r3581; +} +{ +sub.f16x2 %54, r3483, r3536; +} +{ +sub.f16x2 %55, r3486, r3581; +} +{ +add.f16x2 %30, r3495, r3589; +} +{ +add.f16x2 %31, r3498, r3595; +} +{ +sub.f16x2 %62, r3495, r3589; +} +{ +sub.f16x2 %63, r3498, r3595; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), 
"=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), 
"r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + /* cufftdx_private_function<812, __half2, 1>: inline-PTX FFT pass over the four complex<__half2> values rmem[0..3]; each .x/.y member is one packed f16x2 register, read as inputs %9-%16 and overwritten as outputs %0-%7. Structure visible in the PTX: one 4-input add/sub butterfly, then three rounds of { twiddle multiply, barrier.sync 0, st.shared, barrier.sync 0, ld.shared, butterfly }, the last butterfly writing the outputs. Twiddles are computed on the fly with cos.approx/sin.approx from (tid.x & 63), ((tid.x >> 2) & 15) and ((tid.x >> 4) & 3), scaled by 0f3CC90FDB, 0f3DC90FDB and 0f3EC90FDB (approximately 2*pi/256, 2*pi/64 and 2*pi/16); the second and third powers of each base twiddle are formed by f16x2 complex multiplication. smem (%8) is used as a 32-bit shared-memory base address: each tid.y value is offset by tid.y << 11 (2048 bytes) and the x/y parts are exchanged together as adjacent 4-byte words. Presumably this is a 256-point radix-4 transform spread over 64 tid.x lanes -- TODO confirm against the cuFFTDx generator. NOTE(review): the body executes barrier.sync 0 several times, so it presumably has to be reached by every thread participating in barrier 0; the asm statement also declares no "memory" clobber although it reads and writes shared memory -- verify that this is intended. */ +template<> __forceinline__ __device__ void cufftdx_private_function<812, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<570>; +.reg .b64 rd<2>; +mov.u32 r543, %tid.y; +shl.b32 r544, r543, 11; +mov.u32 r545, %8; +add.s32 r546, r545, r544; +mov.u32 r547, %tid.x; 
+{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r548, r547, 63; +shl.b32 r549, r547, 5; +and.b32 r550, r549, -2048; +add.s32 r551, r546, r550; +cvt.rn.f32.u32 f31, r548; +mul.f32 f32, f31, 0f3CC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, 
r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r552, r549, 2016; +add.s32 r553, r551, r552; +st.shared.v4.f32 [r553], {r27, r30, r63, r70}; +st.shared.v4.f32 [r553+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r554, r548, -24, r553; +ld.shared.u32 r166, [r554]; +ld.shared.u32 r169, [r554+4]; +ld.shared.u32 r178, [r554+512]; +ld.shared.u32 r181, [r554+516]; +ld.shared.u32 r167, [r554+1024]; +ld.shared.u32 r170, [r554+1028]; +ld.shared.u32 r179, [r554+1536]; +ld.shared.u32 r182, [r554+1540]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r555, r547, 60; +bfe.u32 r556, r547, 2, 
4; +cvt.rn.f32.u32 f34, r556; +mul.f32 f35, f34, 0f3DC90FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +shl.b32 r557, r547, 3; +and.b32 r558, r557, 24; +add.s32 r559, r551, r558; +barrier.sync 0; +and.b32 r560, r549, 1920; +add.s32 r561, r559, r560; +st.shared.u32 [r561], r191; +st.shared.u32 [r561+4], r194; +st.shared.u32 [r561+32], r227; +st.shared.u32 [r561+36], r234; +st.shared.u32 [r561+64], r264; +st.shared.u32 [r561+68], r271; +st.shared.u32 [r561+96], r301; +st.shared.u32 [r561+100], r308; +barrier.sync 0; +mad.lo.s32 r562, r555, -24, r561; +ld.shared.u32 r330, [r562]; +ld.shared.u32 r333, [r562+4]; +ld.shared.u32 r342, [r562+512]; +ld.shared.u32 r345, [r562+516]; +ld.shared.u32 r331, [r562+1024]; +ld.shared.u32 r334, [r562+1028]; +ld.shared.u32 r343, [r562+1536]; +ld.shared.u32 r346, [r562+1540]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r563, r547, 48; +bfe.u32 r564, r547, 4, 2; +cvt.rn.f32.u32 f37, r564; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 
r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} +and.b32 r565, r557, 120; +add.s32 r566, r551, r565; +barrier.sync 0; +and.b32 r567, r549, 1536; +add.s32 r568, r566, r567; 
+st.shared.u32 [r568], r355; +st.shared.u32 [r568+4], r358; +st.shared.u32 [r568+128], r391; +st.shared.u32 [r568+132], r398; +st.shared.u32 [r568+256], r428; +st.shared.u32 [r568+260], r435; +st.shared.u32 [r568+384], r465; +st.shared.u32 [r568+388], r472; +barrier.sync 0; +mad.lo.s32 r569, r563, -24, r568; +ld.shared.u32 r494, [r569]; +ld.shared.u32 r497, [r569+4]; +ld.shared.u32 r506, [r569+512]; +ld.shared.u32 r509, [r569+516]; +ld.shared.u32 r495, [r569+1024]; +ld.shared.u32 r498, [r569+1028]; +ld.shared.u32 r507, [r569+1536]; +ld.shared.u32 r510, [r569+1540]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r511; +} +{ +add.f16x2 %0, r493, r505; +} +{ +add.f16x2 %1, r496, r508; +} +{ +sub.f16x2 %4, r493, r505; +} +{ +sub.f16x2 %5, r496, r508; +} +{ +add.f16x2 %2, r499, r514; +} +{ +add.f16x2 %3, r502, r517; +} +{ +sub.f16x2 %6, r499, r514; +} +{ +sub.f16x2 %7, r502, r517; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + /* cufftdx_private_function<813, __half2, 1>: inline-PTX FFT pass over the four complex<__half2> values rmem[0..3]; each .x/.y member is one packed f16x2 register, read as inputs %9-%16 and overwritten as outputs %0-%7. Structure visible in the PTX: one 4-input add/sub butterfly, then three rounds of twiddle multiply followed by a shared-memory exchange and another butterfly, the last butterfly writing the outputs. Each exchange is done in two passes over the same 4-byte slots -- first the registers derived from the .x inputs are stored and reloaded, then, after a further barrier.sync 0, the registers derived from the .y inputs -- so every round issues four barrier.sync 0 instructions. Twiddles are computed on the fly with cos.approx/sin.approx from (tid.x & 63), ((tid.x >> 2) & 15) and ((tid.x >> 4) & 3), scaled by 0f3CC90FDB, 0f3DC90FDB and 0f3EC90FDB (approximately 2*pi/256, 2*pi/64 and 2*pi/16); the second and third powers of each base twiddle are formed by f16x2 complex multiplication. smem (%8) is used as a 32-bit shared-memory base address and each tid.y value is offset by tid.y << 10 (1024 bytes). Presumably this is a 256-point radix-4 transform spread over 64 tid.x lanes -- TODO confirm against the cuFFTDx generator. NOTE(review): because of the barriers the body presumably has to be reached by every thread participating in barrier 0; the asm statement also declares no "memory" clobber although it reads and writes shared memory -- verify that this is intended. */ +template<> __forceinline__ __device__ void cufftdx_private_function<813, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<570>; +.reg .b64 rd<2>; +mov.u32 r543, %tid.y; +shl.b32 r544, r543, 10; +mov.u32 r545, %8; +add.s32 r546, r545, r544; 
+mov.u32 r547, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r548, r547, 63; +shl.b32 r549, r547, 4; +and.b32 r550, r549, -1024; +add.s32 r551, r546, r550; +cvt.rn.f32.u32 f31, r548; +mul.f32 f32, f31, 0f3CC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ 
+mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r552, r549, 1008; +add.s32 r553, r551, r552; +st.shared.v4.f32 [r553], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r554, r548, -12, r553; +ld.shared.u32 r166, [r554]; +ld.shared.u32 r178, [r554+256]; +ld.shared.u32 r167, [r554+512]; +ld.shared.u32 r179, [r554+768]; +barrier.sync 0; +st.shared.v4.f32 [r553], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r554]; +ld.shared.u32 r181, [r554+256]; +ld.shared.u32 r170, [r554+512]; +ld.shared.u32 r182, [r554+768]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} 
+and.b32 r555, r547, 60; +bfe.u32 r556, r547, 2, 4; +shl.b32 r557, r547, 2; +and.b32 r558, r557, 12; +add.s32 r559, r551, r558; +cvt.rn.f32.u32 f34, r556; +mul.f32 f35, f34, 0f3DC90FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 
r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +barrier.sync 0; +and.b32 r560, r549, 960; +add.s32 r561, r559, r560; +st.shared.u32 [r561], r191; +st.shared.u32 [r561+16], r227; +st.shared.u32 [r561+32], r264; +st.shared.u32 [r561+48], r301; +barrier.sync 0; +mad.lo.s32 r562, r555, -12, r561; +ld.shared.u32 r330, [r562]; +ld.shared.u32 r342, [r562+256]; +ld.shared.u32 r331, [r562+512]; +ld.shared.u32 r343, [r562+768]; +barrier.sync 0; +st.shared.u32 [r561], r194; +st.shared.u32 [r561+16], r234; +st.shared.u32 [r561+32], r271; +st.shared.u32 [r561+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r562]; +ld.shared.u32 r345, [r562+256]; +ld.shared.u32 r334, [r562+512]; +ld.shared.u32 r346, [r562+768]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r563, r547, 48; +bfe.u32 r564, r547, 4, 2; +and.b32 r565, r557, 60; +add.s32 r566, r551, r565; +cvt.rn.f32.u32 f37, r564; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} 
+barrier.sync 0; +and.b32 r567, r549, 768; +add.s32 r568, r566, r567; +st.shared.u32 [r568], r355; +st.shared.u32 [r568+64], r391; +st.shared.u32 [r568+128], r428; +st.shared.u32 [r568+192], r465; +barrier.sync 0; +mad.lo.s32 r569, r563, -12, r568; +ld.shared.u32 r494, [r569]; +ld.shared.u32 r506, [r569+256]; +ld.shared.u32 r495, [r569+512]; +ld.shared.u32 r507, [r569+768]; +barrier.sync 0; +st.shared.u32 [r568], r358; +st.shared.u32 [r568+64], r398; +st.shared.u32 [r568+128], r435; +st.shared.u32 [r568+192], r472; +barrier.sync 0; +ld.shared.u32 r497, [r569]; +ld.shared.u32 r509, [r569+256]; +ld.shared.u32 r498, [r569+512]; +ld.shared.u32 r510, [r569+768]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r511; +} +{ +add.f16x2 %0, r493, r505; +} +{ +add.f16x2 %1, r496, r508; +} +{ +sub.f16x2 %4, r493, r505; +} +{ +sub.f16x2 %5, r496, r508; +} +{ +add.f16x2 %2, r499, r514; +} +{ +add.f16x2 %3, r502, r517; +} +{ +sub.f16x2 %6, r499, r514; +} +{ +sub.f16x2 %7, r502, r517; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<814, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<540>; +.reg .b32 r<3723>; +.reg .b64 rd<3>; 
+mov.u32 r3647, %tid.y; +shl.b32 r3648, r3647, 10; +mov.u32 r3649, %64; +add.s32 r3650, r3649, r3648; +mov.u32 r3651, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f508, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r101, {low, high}; +} +mov.f32 f518, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 
r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r298, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 
r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ 
+sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, 
r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ 
+sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ 
+mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; 
+cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, 
r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, 
r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, 
r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3652, r3651, 7; +shl.b32 r3653, r3651, 7; +and.b32 r3654, r3653, -1024; +add.s32 r3655, r3650, r3654; +cvt.rn.f32.u32 f535, r3652; +mul.f32 f536, f535, 0f3CC90FDB; +cos.approx.f32 f357, f536; +sin.approx.f32 f537, f536; +neg.f32 f358, f537; +mov.f32 f539, 0fBF800000; +mov.f32 f538, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, 
r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; 
+mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, 
r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 
r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 
r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ 
+mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 
r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; 
+cvt.rn.f16.f32 high, f538; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, 
r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3656, r3653, 896; +add.s32 r3657, r3655, r3656; +st.shared.v4.f32 [r3657], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r3657+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r3657+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r3657+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r3657+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r3657+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r3657+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r3657+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r3658, r3652, -124, r3657; +ld.shared.u32 r2864, [r3658]; +ld.shared.u32 r3060, [r3658+32]; +ld.shared.u32 r3256, [r3658+64]; +ld.shared.u32 r3452, [r3658+96]; 
+ld.shared.u32 r2914, [r3658+128]; +ld.shared.u32 r3110, [r3658+160]; +ld.shared.u32 r3306, [r3658+192]; +ld.shared.u32 r3502, [r3658+224]; +ld.shared.u32 r2876, [r3658+256]; +ld.shared.u32 r3072, [r3658+288]; +ld.shared.u32 r3268, [r3658+320]; +ld.shared.u32 r3464, [r3658+352]; +ld.shared.u32 r2926, [r3658+384]; +ld.shared.u32 r3122, [r3658+416]; +ld.shared.u32 r3318, [r3658+448]; +ld.shared.u32 r3514, [r3658+480]; +ld.shared.u32 r2865, [r3658+512]; +ld.shared.u32 r3061, [r3658+544]; +ld.shared.u32 r3257, [r3658+576]; +ld.shared.u32 r3453, [r3658+608]; +ld.shared.u32 r2915, [r3658+640]; +ld.shared.u32 r3111, [r3658+672]; +ld.shared.u32 r3307, [r3658+704]; +ld.shared.u32 r3503, [r3658+736]; +ld.shared.u32 r2877, [r3658+768]; +ld.shared.u32 r3073, [r3658+800]; +ld.shared.u32 r3269, [r3658+832]; +ld.shared.u32 r3465, [r3658+864]; +ld.shared.u32 r2927, [r3658+896]; +ld.shared.u32 r3123, [r3658+928]; +ld.shared.u32 r3319, [r3658+960]; +ld.shared.u32 r3515, [r3658+992]; +barrier.sync 0; +st.shared.v4.f32 [r3657], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3657+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3657+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3657+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3657+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3657+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3657+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3657+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3658]; +ld.shared.u32 r3063, [r3658+32]; +ld.shared.u32 r3259, [r3658+64]; +ld.shared.u32 r3455, [r3658+96]; +ld.shared.u32 r2917, [r3658+128]; +ld.shared.u32 r3113, [r3658+160]; +ld.shared.u32 r3309, [r3658+192]; +ld.shared.u32 r3505, [r3658+224]; +ld.shared.u32 r2879, [r3658+256]; +ld.shared.u32 r3075, [r3658+288]; +ld.shared.u32 r3271, [r3658+320]; +ld.shared.u32 r3467, [r3658+352]; +ld.shared.u32 r2929, [r3658+384]; +ld.shared.u32 r3125, [r3658+416]; +ld.shared.u32 r3321, 
[r3658+448]; +ld.shared.u32 r3517, [r3658+480]; +ld.shared.u32 r2868, [r3658+512]; +ld.shared.u32 r3064, [r3658+544]; +ld.shared.u32 r3260, [r3658+576]; +ld.shared.u32 r3456, [r3658+608]; +ld.shared.u32 r2918, [r3658+640]; +ld.shared.u32 r3114, [r3658+672]; +ld.shared.u32 r3310, [r3658+704]; +ld.shared.u32 r3506, [r3658+736]; +ld.shared.u32 r2880, [r3658+768]; +ld.shared.u32 r3076, [r3658+800]; +ld.shared.u32 r3272, [r3658+832]; +ld.shared.u32 r3468, [r3658+864]; +ld.shared.u32 r2930, [r3658+896]; +ld.shared.u32 r3126, [r3658+928]; +ld.shared.u32 r3322, [r3658+960]; +ld.shared.u32 r3518, [r3658+992]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r2963, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 %0, r2889, r2939; +} +{ +add.f16x2 %1, r2892, r2942; +} +{ +sub.f16x2 %32, r2889, r2939; +} +{ +sub.f16x2 %33, r2892, r2942; +} +{ +add.f16x2 %8, r2901, r2983; +} +{ +add.f16x2 %9, r2904, r2989; +} +{ +sub.f16x2 %40, r2901, r2983; +} +{ +sub.f16x2 %41, r2904, r2989; +} +{ +add.f16x2 %16, r2895, r2948; +} +{ +add.f16x2 %17, r2898, r2993; +} +{ +sub.f16x2 %48, r2895, r2948; +} +{ +sub.f16x2 %49, r2898, r2993; +} +{ +add.f16x2 %24, r2907, r3001; +} +{ +add.f16x2 %25, r2910, r3007; +} +{ +sub.f16x2 %56, r2907, r3001; +} +{ +sub.f16x2 %57, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, 
r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 %2, r3085, r3135; +} +{ +add.f16x2 %3, r3088, r3138; +} +{ +sub.f16x2 %34, r3085, r3135; +} +{ +sub.f16x2 %35, r3088, r3138; +} +{ +add.f16x2 %10, r3097, r3179; +} +{ +add.f16x2 %11, r3100, r3185; +} +{ +sub.f16x2 %42, r3097, r3179; +} +{ +sub.f16x2 %43, r3100, r3185; +} +{ +add.f16x2 %18, r3091, r3144; +} +{ +add.f16x2 %19, r3094, r3189; +} +{ +sub.f16x2 %50, r3091, r3144; +} +{ +sub.f16x2 %51, r3094, r3189; +} +{ +add.f16x2 %26, r3103, r3197; +} +{ +add.f16x2 %27, r3106, r3203; +} 
+{ +sub.f16x2 %58, r3103, r3197; +} +{ +sub.f16x2 %59, r3106, r3203; +} +{ +add.f16x2 r3255, r3256, r3257; +} +{ +add.f16x2 r3258, r3259, r3260; +} +{ +sub.f16x2 r3261, r3256, r3257; +} +{ +sub.f16x2 r3264, r3259, r3260; +} +{ +add.f16x2 r3267, r3268, r3269; +} +{ +add.f16x2 r3270, r3271, r3272; +} +{ +sub.f16x2 r3273, r3268, r3269; +} +{ +sub.f16x2 r3276, r3271, r3272; +} +{ +neg.f16x2 r3279, r3273; +} +{ +add.f16x2 r3281, r3255, r3267; +} +{ +add.f16x2 r3284, r3258, r3270; +} +{ +sub.f16x2 r3287, r3255, r3267; +} +{ +sub.f16x2 r3290, r3258, r3270; +} +{ +add.f16x2 r3293, r3261, r3276; +} +{ +add.f16x2 r3296, r3264, r3279; +} +{ +sub.f16x2 r3299, r3261, r3276; +} +{ +sub.f16x2 r3302, r3264, r3279; +} +{ +add.f16x2 r3305, r3306, r3307; +} +{ +add.f16x2 r3308, r3309, r3310; +} +{ +sub.f16x2 r3311, r3306, r3307; +} +{ +sub.f16x2 r3314, r3309, r3310; +} +{ +add.f16x2 r3317, r3318, r3319; +} +{ +add.f16x2 r3320, r3321, r3322; +} +{ +sub.f16x2 r3323, r3318, r3319; +} +{ +sub.f16x2 r3326, r3321, r3322; +} +{ +neg.f16x2 r3329, r3323; +} +{ +add.f16x2 r3331, r3305, r3317; +} +{ +add.f16x2 r3334, r3308, r3320; +} +{ +sub.f16x2 r3337, r3305, r3317; +} +{ +sub.f16x2 r3340, r3308, r3320; +} +{ +add.f16x2 r3343, r3311, r3326; +} +{ +add.f16x2 r3346, r3314, r3329; +} +{ +sub.f16x2 r3349, r3311, r3326; +} +{ +sub.f16x2 r3352, r3314, r3329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3360, {low, high}; +} +{ +mul.f16x2 r3369, r3343, r3355; +} +{ +mul.f16x2 r3372, r3346, r3356; +} +{ +sub.f16x2 r3375, r3369, r3372; +} +{ +mul.f16x2 r3378, r3343, r3356; +} +{ +fma.rn.f16x2 r3381, r3346, r3355, r3378; +} 
+{ +neg.f16x2 r3385, r3337; +} +{ +mul.f16x2 r3387, r3349, r3359; +} +{ +mul.f16x2 r3390, r3352, r3360; +} +{ +sub.f16x2 r3393, r3387, r3390; +} +{ +mul.f16x2 r3396, r3349, r3360; +} +{ +fma.rn.f16x2 r3399, r3352, r3359, r3396; +} +{ +add.f16x2 %4, r3281, r3331; +} +{ +add.f16x2 %5, r3284, r3334; +} +{ +sub.f16x2 %36, r3281, r3331; +} +{ +sub.f16x2 %37, r3284, r3334; +} +{ +add.f16x2 %12, r3293, r3375; +} +{ +add.f16x2 %13, r3296, r3381; +} +{ +sub.f16x2 %44, r3293, r3375; +} +{ +sub.f16x2 %45, r3296, r3381; +} +{ +add.f16x2 %20, r3287, r3340; +} +{ +add.f16x2 %21, r3290, r3385; +} +{ +sub.f16x2 %52, r3287, r3340; +} +{ +sub.f16x2 %53, r3290, r3385; +} +{ +add.f16x2 %28, r3299, r3393; +} +{ +add.f16x2 %29, r3302, r3399; +} +{ +sub.f16x2 %60, r3299, r3393; +} +{ +sub.f16x2 %61, r3302, r3399; +} +{ +add.f16x2 r3451, r3452, r3453; +} +{ +add.f16x2 r3454, r3455, r3456; +} +{ +sub.f16x2 r3457, r3452, r3453; +} +{ +sub.f16x2 r3460, r3455, r3456; +} +{ +add.f16x2 r3463, r3464, r3465; +} +{ +add.f16x2 r3466, r3467, r3468; +} +{ +sub.f16x2 r3469, r3464, r3465; +} +{ +sub.f16x2 r3472, r3467, r3468; +} +{ +neg.f16x2 r3475, r3469; +} +{ +add.f16x2 r3477, r3451, r3463; +} +{ +add.f16x2 r3480, r3454, r3466; +} +{ +sub.f16x2 r3483, r3451, r3463; +} +{ +sub.f16x2 r3486, r3454, r3466; +} +{ +add.f16x2 r3489, r3457, r3472; +} +{ +add.f16x2 r3492, r3460, r3475; +} +{ +sub.f16x2 r3495, r3457, r3472; +} +{ +sub.f16x2 r3498, r3460, r3475; +} +{ +add.f16x2 r3501, r3502, r3503; +} +{ +add.f16x2 r3504, r3505, r3506; +} +{ +sub.f16x2 r3507, r3502, r3503; +} +{ +sub.f16x2 r3510, r3505, r3506; +} +{ +add.f16x2 r3513, r3514, r3515; +} +{ +add.f16x2 r3516, r3517, r3518; +} +{ +sub.f16x2 r3519, r3514, r3515; +} +{ +sub.f16x2 r3522, r3517, r3518; +} +{ +neg.f16x2 r3525, r3519; +} +{ +add.f16x2 r3527, r3501, r3513; +} +{ +add.f16x2 r3530, r3504, r3516; +} +{ +sub.f16x2 r3533, r3501, r3513; +} +{ +sub.f16x2 r3536, r3504, r3516; +} +{ +add.f16x2 r3539, r3507, r3522; +} +{ +add.f16x2 r3542, r3510, 
r3525; +} +{ +sub.f16x2 r3545, r3507, r3522; +} +{ +sub.f16x2 r3548, r3510, r3525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f508; +cvt.rn.f16.f32 high, f508; +mov.b32 r3551, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3552, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3555, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3556, {low, high}; +} +{ +mul.f16x2 r3565, r3539, r3551; +} +{ +mul.f16x2 r3568, r3542, r3552; +} +{ +sub.f16x2 r3571, r3565, r3568; +} +{ +mul.f16x2 r3574, r3539, r3552; +} +{ +fma.rn.f16x2 r3577, r3542, r3551, r3574; +} +{ +neg.f16x2 r3581, r3533; +} +{ +mul.f16x2 r3583, r3545, r3555; +} +{ +mul.f16x2 r3586, r3548, r3556; +} +{ +sub.f16x2 r3589, r3583, r3586; +} +{ +mul.f16x2 r3592, r3545, r3556; +} +{ +fma.rn.f16x2 r3595, r3548, r3555, r3592; +} +{ +add.f16x2 %6, r3477, r3527; +} +{ +add.f16x2 %7, r3480, r3530; +} +{ +sub.f16x2 %38, r3477, r3527; +} +{ +sub.f16x2 %39, r3480, r3530; +} +{ +add.f16x2 %14, r3489, r3571; +} +{ +add.f16x2 %15, r3492, r3577; +} +{ +sub.f16x2 %46, r3489, r3571; +} +{ +sub.f16x2 %47, r3492, r3577; +} +{ +add.f16x2 %22, r3483, r3536; +} +{ +add.f16x2 %23, r3486, r3581; +} +{ +sub.f16x2 %54, r3483, r3536; +} +{ +sub.f16x2 %55, r3486, r3581; +} +{ +add.f16x2 %30, r3495, r3589; +} +{ +add.f16x2 %31, r3498, r3595; +} +{ +sub.f16x2 %62, r3495, r3589; +} +{ +sub.f16x2 %63, r3498, r3595; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), 
"=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), 
"r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<815, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<64>; +.reg .b32 r<433>; +.reg .b64 rd<2>; +mov.u32 r377, %tid.y; +shl.b32 r378, r377, 11; 
+mov.u32 r379, %4; +add.s32 r380, r379, r378; +mov.u32 r381, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r382, r381, 127; +shl.b32 r383, r381, 4; +and.b32 r384, r383, -2048; +add.s32 r385, r380, r384; +cvt.rn.f32.u32 f43, r382; +mul.f32 f44, f43, 0f3CC90FDB; +cos.approx.f32 f1, f44; +sin.approx.f32 f45, f44; +neg.f32 f2, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r386, r383, 2032; +add.s32 r387, r385, r386; +st.shared.v2.f32 [r387], {r1, r4}; +st.shared.v2.f32 [r387+8], {r25, r32}; +barrier.sync 0; +shl.b32 r388, r381, 3; +and.b32 r389, r388, 1016; +sub.s32 r390, r387, r389; +ld.shared.u32 r54, [r390]; +ld.shared.u32 r57, [r390+4]; +ld.shared.u32 r55, [r390+1024]; +ld.shared.u32 r58, [r390+1028]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r391, r381, 1, 6; +cvt.rn.f32.u32 f46, r391; +mul.f32 f47, f46, 0f3D490FDB; +cos.approx.f32 f7, f47; +sin.approx.f32 f48, f47; +neg.f32 f8, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r392, r388, 8; +add.s32 r393, r385, r392; 
+barrier.sync 0; +and.b32 r394, r383, 2016; +add.s32 r395, r393, r394; +st.shared.u32 [r395], r53; +st.shared.u32 [r395+4], r56; +st.shared.u32 [r395+16], r77; +st.shared.u32 [r395+20], r84; +barrier.sync 0; +and.b32 r396, r388, 1008; +sub.s32 r397, r395, r396; +ld.shared.u32 r106, [r397]; +ld.shared.u32 r109, [r397+4]; +ld.shared.u32 r107, [r397+1024]; +ld.shared.u32 r110, [r397+1028]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r398, r381, 2, 5; +cvt.rn.f32.u32 f49, r398; +mul.f32 f50, f49, 0f3DC90FDB; +cos.approx.f32 f13, f50; +sin.approx.f32 f51, f50; +neg.f32 f14, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r399, r388, 24; +add.s32 r400, r385, r399; +barrier.sync 0; +and.b32 r401, r383, 1984; +add.s32 r402, r400, r401; +st.shared.u32 [r402], r105; +st.shared.u32 [r402+4], r108; +st.shared.u32 [r402+32], r129; +st.shared.u32 [r402+36], r136; +barrier.sync 0; +and.b32 r403, r388, 992; +sub.s32 r404, r402, r403; +ld.shared.u32 r158, [r404]; +ld.shared.u32 r161, [r404+4]; +ld.shared.u32 r159, [r404+1024]; +ld.shared.u32 r162, [r404+1028]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r405, r381, 3, 4; +cvt.rn.f32.u32 f52, r405; +mul.f32 f53, f52, 0f3E490FDB; +cos.approx.f32 f19, f53; +sin.approx.f32 f54, f53; +neg.f32 f20, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +and.b32 r406, r388, 56; +add.s32 r407, r385, r406; +barrier.sync 0; +and.b32 r408, r383, 1920; +add.s32 r409, r407, r408; +st.shared.u32 [r409], r157; +st.shared.u32 [r409+4], r160; +st.shared.u32 [r409+64], r181; +st.shared.u32 [r409+68], r188; +barrier.sync 0; +and.b32 r410, r388, 960; +sub.s32 r411, r409, r410; +ld.shared.u32 r210, [r411]; +ld.shared.u32 r213, [r411+4]; +ld.shared.u32 r211, [r411+1024]; +ld.shared.u32 r214, [r411+1028]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r412, r381, 4, 3; +cvt.rn.f32.u32 f55, r412; +mul.f32 f56, f55, 0f3EC90FDB; +cos.approx.f32 f25, f56; +sin.approx.f32 f57, f56; +neg.f32 f26, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +and.b32 r413, r388, 120; +add.s32 r414, r385, r413; +barrier.sync 0; +and.b32 r415, r383, 1792; +add.s32 r416, r414, r415; +st.shared.u32 [r416], r209; +st.shared.u32 [r416+4], r212; +st.shared.u32 [r416+128], r233; +st.shared.u32 [r416+132], r240; +barrier.sync 0; +and.b32 r417, r388, 896; +sub.s32 r418, r416, r417; +ld.shared.u32 r262, [r418]; +ld.shared.u32 r265, [r418+4]; +ld.shared.u32 r263, [r418+1024]; +ld.shared.u32 r266, 
[r418+1028]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r419, r381, 5, 2; +cvt.rn.f32.u32 f58, r419; +mul.f32 f59, f58, 0f3F490FDB; +cos.approx.f32 f31, f59; +sin.approx.f32 f60, f59; +neg.f32 f32, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} +{ +mul.f16x2 r289, r267, r278; +} +{ +fma.rn.f16x2 r292, r270, r276, r289; +} +and.b32 r420, r388, 248; +add.s32 r421, r385, r420; +barrier.sync 0; +and.b32 r422, r383, 1536; +add.s32 r423, r421, r422; +st.shared.u32 [r423], r261; +st.shared.u32 [r423+4], r264; +st.shared.u32 [r423+256], r285; +st.shared.u32 [r423+260], r292; +barrier.sync 0; +and.b32 r424, r388, 768; +sub.s32 r425, r423, r424; +ld.shared.u32 r314, [r425]; +ld.shared.u32 r317, [r425+4]; +ld.shared.u32 r315, [r425+1024]; +ld.shared.u32 r318, [r425+1028]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r426, r381, 6, 1; +cvt.rn.f32.u32 f61, r426; +mul.f32 f62, f61, 0f3FC90FDB; +cos.approx.f32 f37, f62; +sin.approx.f32 f63, f62; +neg.f32 f38, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +neg.f16x2 r335, r332; +} +{ +fma.rn.f16x2 r337, r319, r328, r335; +} +{ +mul.f16x2 r341, r319, r330; +} +{ +fma.rn.f16x2 r344, r322, r328, r341; +} +and.b32 r427, 
r388, 504; +add.s32 r428, r385, r427; +barrier.sync 0; +and.b32 r429, r383, 1024; +add.s32 r430, r428, r429; +st.shared.u32 [r430], r313; +st.shared.u32 [r430+4], r316; +st.shared.u32 [r430+512], r337; +st.shared.u32 [r430+516], r344; +barrier.sync 0; +and.b32 r431, r388, 512; +sub.s32 r432, r430, r431; +ld.shared.u32 r366, [r432]; +ld.shared.u32 r369, [r432+4]; +ld.shared.u32 r367, [r432+1024]; +ld.shared.u32 r370, [r432+1028]; +{ +add.f16x2 %0, r366, r367; +} +{ +add.f16x2 %1, r369, r370; +} +{ +sub.f16x2 %2, r366, r367; +} +{ +sub.f16x2 %3, r369, r370; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<816, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<64>; +.reg .b32 r<433>; +.reg .b64 rd<2>; +mov.u32 r377, %tid.y; +shl.b32 r378, r377, 10; +mov.u32 r379, %4; +add.s32 r380, r379, r378; +mov.u32 r381, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r382, r381, 127; +shl.b32 r383, r381, 3; +and.b32 r384, r383, -1024; +add.s32 r385, r380, r384; +cvt.rn.f32.u32 f43, r382; +mul.f32 f44, f43, 0f3CC90FDB; +cos.approx.f32 f1, f44; +sin.approx.f32 f45, f44; +neg.f32 f2, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; 
+and.b32 r386, r383, 1016; +add.s32 r387, r385, r386; +st.shared.v2.f32 [r387], {r1, r25}; +barrier.sync 0; +shl.b32 r388, r381, 2; +and.b32 r389, r388, 508; +sub.s32 r390, r387, r389; +ld.shared.u32 r54, [r390]; +ld.shared.u32 r55, [r390+512]; +barrier.sync 0; +st.shared.v2.f32 [r387], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r390]; +ld.shared.u32 r58, [r390+512]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r391, r381, 1, 6; +and.b32 r392, r388, 4; +add.s32 r393, r385, r392; +cvt.rn.f32.u32 f46, r391; +mul.f32 f47, f46, 0f3D490FDB; +cos.approx.f32 f7, f47; +sin.approx.f32 f48, f47; +neg.f32 f8, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r394, r383, 1008; +add.s32 r395, r393, r394; +st.shared.u32 [r395], r53; +st.shared.u32 [r395+8], r77; +barrier.sync 0; +and.b32 r396, r388, 504; +sub.s32 r397, r395, r396; +ld.shared.u32 r106, [r397]; +ld.shared.u32 r107, [r397+512]; +barrier.sync 0; +st.shared.u32 [r395], r56; +st.shared.u32 [r395+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r397]; +ld.shared.u32 r110, [r397+512]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r398, r381, 2, 5; +and.b32 r399, r388, 12; +add.s32 r400, r385, r399; +cvt.rn.f32.u32 f49, r398; +mul.f32 f50, f49, 0f3DC90FDB; +cos.approx.f32 f13, f50; +sin.approx.f32 f51, f50; +neg.f32 f14, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 
r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r401, r383, 992; +add.s32 r402, r400, r401; +st.shared.u32 [r402], r105; +st.shared.u32 [r402+16], r129; +barrier.sync 0; +and.b32 r403, r388, 496; +sub.s32 r404, r402, r403; +ld.shared.u32 r158, [r404]; +ld.shared.u32 r159, [r404+512]; +barrier.sync 0; +st.shared.u32 [r402], r108; +st.shared.u32 [r402+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r404]; +ld.shared.u32 r162, [r404+512]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r405, r381, 3, 4; +and.b32 r406, r388, 28; +add.s32 r407, r385, r406; +cvt.rn.f32.u32 f52, r405; +mul.f32 f53, f52, 0f3E490FDB; +cos.approx.f32 f19, f53; +sin.approx.f32 f54, f53; +neg.f32 f20, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +barrier.sync 0; +and.b32 r408, r383, 960; +add.s32 r409, r407, r408; +st.shared.u32 [r409], r157; +st.shared.u32 [r409+32], r181; +barrier.sync 0; +and.b32 r410, r388, 480; +sub.s32 r411, r409, r410; +ld.shared.u32 r210, [r411]; +ld.shared.u32 r211, [r411+512]; +barrier.sync 0; +st.shared.u32 [r409], r160; +st.shared.u32 [r409+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r411]; 
+ld.shared.u32 r214, [r411+512]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r412, r381, 4, 3; +and.b32 r413, r388, 60; +add.s32 r414, r385, r413; +cvt.rn.f32.u32 f55, r412; +mul.f32 f56, f55, 0f3EC90FDB; +cos.approx.f32 f25, f56; +sin.approx.f32 f57, f56; +neg.f32 f26, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +barrier.sync 0; +and.b32 r415, r383, 896; +add.s32 r416, r414, r415; +st.shared.u32 [r416], r209; +st.shared.u32 [r416+64], r233; +barrier.sync 0; +and.b32 r417, r388, 448; +sub.s32 r418, r416, r417; +ld.shared.u32 r262, [r418]; +ld.shared.u32 r263, [r418+512]; +barrier.sync 0; +st.shared.u32 [r416], r212; +st.shared.u32 [r416+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r418]; +ld.shared.u32 r266, [r418+512]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r419, r381, 5, 2; +and.b32 r420, r388, 124; +add.s32 r421, r385, r420; +cvt.rn.f32.u32 f58, r419; +mul.f32 f59, f58, 0f3F490FDB; +cos.approx.f32 f31, f59; +sin.approx.f32 f60, f59; +neg.f32 f32, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} 
+{ +mul.f16x2 r289, r267, r278; +} +{ +fma.rn.f16x2 r292, r270, r276, r289; +} +barrier.sync 0; +and.b32 r422, r383, 768; +add.s32 r423, r421, r422; +st.shared.u32 [r423], r261; +st.shared.u32 [r423+128], r285; +barrier.sync 0; +and.b32 r424, r388, 384; +sub.s32 r425, r423, r424; +ld.shared.u32 r314, [r425]; +ld.shared.u32 r315, [r425+512]; +barrier.sync 0; +st.shared.u32 [r423], r264; +st.shared.u32 [r423+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r425]; +ld.shared.u32 r318, [r425+512]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r426, r381, 6, 1; +and.b32 r427, r388, 252; +add.s32 r428, r385, r427; +cvt.rn.f32.u32 f61, r426; +mul.f32 f62, f61, 0f3FC90FDB; +cos.approx.f32 f37, f62; +sin.approx.f32 f63, f62; +neg.f32 f38, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +neg.f16x2 r335, r332; +} +{ +fma.rn.f16x2 r337, r319, r328, r335; +} +{ +mul.f16x2 r341, r319, r330; +} +{ +fma.rn.f16x2 r344, r322, r328, r341; +} +barrier.sync 0; +and.b32 r429, r383, 512; +add.s32 r430, r428, r429; +st.shared.u32 [r430], r313; +st.shared.u32 [r430+256], r337; +barrier.sync 0; +and.b32 r431, r388, 256; +sub.s32 r432, r430, r431; +ld.shared.u32 r366, [r432]; +ld.shared.u32 r367, [r432+512]; +barrier.sync 0; +st.shared.u32 [r430], r316; +st.shared.u32 [r430+256], r344; +barrier.sync 0; +ld.shared.u32 r369, [r432]; +ld.shared.u32 r370, [r432+512]; +{ +add.f16x2 %0, r366, r367; +} +{ +add.f16x2 %1, r369, r370; +} +{ +sub.f16x2 %2, r366, r367; +} +{ +sub.f16x2 %3, r369, r370; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..03e005599a392 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp16_inv.hpp.inc @@ -0,0 +1,18730 @@ +#ifndef CUFFTDX_FFT_256_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_256_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1009, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<270>; +.reg .b32 r<1803>; +.reg .b64 rd<2>; +mov.u32 r1791, %tid.y; +shl.b32 r1792, r1791, 10; +mov.u32 r1793, %32; +add.s32 r1794, r1793, r1792; +mov.u32 r1795, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ 
+sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ 
+add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, 
r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 
r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1796, r1795, 15; +shl.b32 r1797, r1795, 6; +and.b32 r1798, r1797, -1024; 
+add.s32 r1799, r1794, r1798; +cvt.rn.f32.u32 f267, r1796; +mul.f32 f268, f267, 0f3CC90FDB; +cos.approx.f32 f117, f268; +sin.approx.f32 f269, f268; +neg.f32 f118, f269; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; 
+mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, 
r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ 
+fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1800, r1797, 960; +add.s32 r1801, r1799, r1800; +st.shared.v4.f32 [r1801], {r521, r627, r664, r701}; +st.shared.v4.f32 [r1801+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r1801+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r1801+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r1802, r1796, -60, r1801; +ld.shared.u32 r1176, [r1802]; +ld.shared.u32 r1372, [r1802+64]; +ld.shared.u32 r1226, [r1802+128]; +ld.shared.u32 r1422, [r1802+192]; +ld.shared.u32 r1188, [r1802+256]; +ld.shared.u32 r1384, [r1802+320]; +ld.shared.u32 r1238, [r1802+384]; +ld.shared.u32 r1434, [r1802+448]; +ld.shared.u32 r1177, [r1802+512]; +ld.shared.u32 r1373, [r1802+576]; +ld.shared.u32 r1227, [r1802+640]; +ld.shared.u32 r1423, [r1802+704]; +ld.shared.u32 r1189, [r1802+768]; +ld.shared.u32 r1385, [r1802+832]; +ld.shared.u32 r1239, [r1802+896]; +ld.shared.u32 r1435, [r1802+960]; +barrier.sync 0; +st.shared.v4.f32 [r1801], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1801+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1801+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1801+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1802]; +ld.shared.u32 r1375, [r1802+64]; +ld.shared.u32 r1229, [r1802+128]; +ld.shared.u32 r1425, [r1802+192]; +ld.shared.u32 r1191, [r1802+256]; +ld.shared.u32 r1387, [r1802+320]; +ld.shared.u32 r1241, [r1802+384]; +ld.shared.u32 r1437, [r1802+448]; +ld.shared.u32 r1180, [r1802+512]; +ld.shared.u32 r1376, [r1802+576]; +ld.shared.u32 r1230, [r1802+640]; +ld.shared.u32 r1426, 
[r1802+704]; +ld.shared.u32 r1192, [r1802+768]; +ld.shared.u32 r1388, [r1802+832]; +ld.shared.u32 r1242, [r1802+896]; +ld.shared.u32 r1438, [r1802+960]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ 
+mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, 
r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 
r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, 
r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 %0, r1323, r1519; +} +{ +add.f16x2 %1, r1326, r1522; +} +{ +sub.f16x2 %16, r1323, r1519; +} +{ +sub.f16x2 %17, r1326, r1522; +} +{ +add.f16x2 %2, r1335, r1603; +} +{ +add.f16x2 %3, r1338, r1609; +} +{ +sub.f16x2 %18, r1335, r1603; +} +{ +sub.f16x2 %19, r1338, r1609; +} +{ +add.f16x2 %4, r1347, r1619; +} +{ +add.f16x2 %5, r1350, r1625; +} +{ +sub.f16x2 %20, r1347, r1619; +} +{ +sub.f16x2 %21, r1350, r1625; +} +{ +add.f16x2 %6, r1359, r1635; +} +{ +add.f16x2 %7, r1362, r1641; +} +{ +sub.f16x2 %22, r1359, r1635; +} +{ +sub.f16x2 %23, r1362, r1641; +} +{ +add.f16x2 %8, r1329, r1645; +} +{ +add.f16x2 %9, r1332, r1525; +} +{ +sub.f16x2 %24, r1329, r1645; +} +{ +sub.f16x2 %25, r1332, r1525; +} +{ +add.f16x2 %10, r1341, r1653; +} +{ +add.f16x2 %11, r1344, r1659; +} +{ +sub.f16x2 %26, r1341, r1653; +} +{ +sub.f16x2 %27, r1344, r1659; +} +{ +add.f16x2 %12, r1353, r1669; +} +{ +add.f16x2 %13, r1356, r1675; +} +{ +sub.f16x2 %28, r1353, r1669; +} +{ +sub.f16x2 %29, r1356, r1675; +} +{ +add.f16x2 %14, r1365, r1685; +} +{ +add.f16x2 %15, r1368, r1691; +} +{ +sub.f16x2 %30, r1365, r1685; +} +{ +sub.f16x2 %31, r1368, r1691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), 
"=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1010, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<1037>; +.reg .b64 rd<2>; +mov.u32 r1017, %tid.y; +shl.b32 r1018, r1017, 11; +mov.u32 r1019, %16; +add.s32 r1020, r1019, r1018; +mov.u32 r1021, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ 
+add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; 
+} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1022, r1021, 31; +shl.b32 r1023, r1021, 6; +and.b32 r1024, r1023, -2048; +add.s32 r1025, r1020, r1024; +cvt.rn.f32.u32 f93, r1022; +mul.f32 f94, f93, 0f3CC90FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, 
low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ 
+neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1026, r1023, 1984; +add.s32 r1027, r1025, r1026; +st.shared.v4.f32 [r1027], {r149, r152, r207, r216}; +st.shared.v4.f32 [r1027+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1027+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1027+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1028, r1022, -56, r1027; +ld.shared.u32 r460, [r1028]; +ld.shared.u32 r463, [r1028+4]; +ld.shared.u32 r510, [r1028+256]; +ld.shared.u32 r513, [r1028+260]; +ld.shared.u32 r472, [r1028+512]; +ld.shared.u32 r475, [r1028+516]; +ld.shared.u32 r522, [r1028+768]; +ld.shared.u32 r525, [r1028+772]; +ld.shared.u32 r461, [r1028+1024]; +ld.shared.u32 r464, [r1028+1028]; +ld.shared.u32 r511, [r1028+1280]; +ld.shared.u32 r514, [r1028+1284]; +ld.shared.u32 r473, [r1028+1536]; +ld.shared.u32 r476, [r1028+1540]; +ld.shared.u32 r523, [r1028+1792]; +ld.shared.u32 r526, [r1028+1796]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, 
r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1029, r1021, 24; +bfe.u32 r1030, r1021, 3, 2; +cvt.rn.f32.u32 f96, r1030; +mul.f32 f97, f96, 0f3E490FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, 
high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; 
+mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1031, r1021, 3; +and.b32 r1032, r1031, 56; +add.s32 r1033, r1025, r1032; +barrier.sync 0; +and.b32 r1034, r1023, 1536; +add.s32 r1035, r1033, r1034; +st.shared.u32 [r1035], r607; +st.shared.u32 [r1035+4], r610; +st.shared.u32 [r1035+64], r665; +st.shared.u32 [r1035+68], r674; +st.shared.u32 [r1035+128], r702; +st.shared.u32 [r1035+132], r711; +st.shared.u32 [r1035+192], r739; +st.shared.u32 [r1035+196], r748; +st.shared.u32 [r1035+256], r776; +st.shared.u32 [r1035+260], r785; +st.shared.u32 [r1035+320], r813; +st.shared.u32 [r1035+324], r822; +st.shared.u32 [r1035+384], r850; +st.shared.u32 [r1035+388], r859; +st.shared.u32 [r1035+448], r887; +st.shared.u32 [r1035+452], r896; +barrier.sync 0; +mad.lo.s32 r1036, r1029, -56, r1035; +ld.shared.u32 r918, [r1036]; +ld.shared.u32 r921, [r1036+4]; 
+ld.shared.u32 r968, [r1036+256]; +ld.shared.u32 r971, [r1036+260]; +ld.shared.u32 r930, [r1036+512]; +ld.shared.u32 r933, [r1036+516]; +ld.shared.u32 r980, [r1036+768]; +ld.shared.u32 r983, [r1036+772]; +ld.shared.u32 r919, [r1036+1024]; +ld.shared.u32 r922, [r1036+1028]; +ld.shared.u32 r969, [r1036+1280]; +ld.shared.u32 r972, [r1036+1284]; +ld.shared.u32 r931, [r1036+1536]; +ld.shared.u32 r934, [r1036+1540]; +ld.shared.u32 r981, [r1036+1792]; +ld.shared.u32 r984, [r1036+1796]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 %0, r917, r929; +} +{ +add.f16x2 %1, r920, r932; +} +{ +sub.f16x2 %8, r917, r929; +} +{ +sub.f16x2 %9, r920, r932; +} +{ +add.f16x2 %4, r923, r941; +} +{ +add.f16x2 %5, r926, r935; +} +{ +sub.f16x2 %12, r923, r941; +} +{ +sub.f16x2 %13, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 %2, r967, r979; +} +{ +add.f16x2 %3, r970, r982; +} +{ +sub.f16x2 %10, r967, r979; +} +{ +sub.f16x2 %11, r970, r982; +} +{ +add.f16x2 %6, r973, r991; +} +{ +add.f16x2 %7, r976, r985; +} +{ +sub.f16x2 %14, r973, r991; +} +{ +sub.f16x2 %15, r976, r985; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1011, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<1037>; +.reg .b64 rd<2>; +mov.u32 r1017, %tid.y; +shl.b32 r1018, r1017, 10; +mov.u32 r1019, %16; +add.s32 r1020, r1019, r1018; +mov.u32 r1021, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; 
+} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1022, r1021, 31; +shl.b32 r1023, r1021, 5; +and.b32 r1024, r1023, -1024; +add.s32 r1025, r1020, r1024; +cvt.rn.f32.u32 f93, r1022; +mul.f32 f94, f93, 0f3CC90FDB; +cos.approx.f32 f29, f94; +sin.approx.f32 f95, f94; +neg.f32 f30, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; 
+mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1026, r1023, 992; +add.s32 r1027, r1025, r1026; +st.shared.v4.f32 [r1027], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1027+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r1028, r1022, -28, r1027; +ld.shared.u32 r460, [r1028]; +ld.shared.u32 r510, [r1028+128]; +ld.shared.u32 r472, [r1028+256]; +ld.shared.u32 r522, [r1028+384]; +ld.shared.u32 r461, [r1028+512]; +ld.shared.u32 r511, [r1028+640]; +ld.shared.u32 r473, [r1028+768]; +ld.shared.u32 r523, [r1028+896]; +barrier.sync 0; +st.shared.v4.f32 [r1027], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1027+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1028]; +ld.shared.u32 r513, [r1028+128]; +ld.shared.u32 r475, 
[r1028+256]; +ld.shared.u32 r525, [r1028+384]; +ld.shared.u32 r464, [r1028+512]; +ld.shared.u32 r514, [r1028+640]; +ld.shared.u32 r476, [r1028+768]; +ld.shared.u32 r526, [r1028+896]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 
r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1029, r1021, 24; +bfe.u32 r1030, r1021, 3, 2; +shl.b32 r1031, r1021, 2; +and.b32 r1032, r1031, 28; +add.s32 r1033, r1025, r1032; +cvt.rn.f32.u32 f96, r1030; +mul.f32 f97, f96, 0f3E490FDB; +cos.approx.f32 f75, f97; +sin.approx.f32 f98, f97; +neg.f32 f76, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1034, r1023, 768; +add.s32 r1035, r1033, r1034; +st.shared.u32 [r1035], r607; +st.shared.u32 [r1035+32], r665; +st.shared.u32 [r1035+64], r702; +st.shared.u32 [r1035+96], r739; +st.shared.u32 [r1035+128], r776; +st.shared.u32 [r1035+160], r813; +st.shared.u32 [r1035+192], r850; +st.shared.u32 [r1035+224], r887; +barrier.sync 0; +mad.lo.s32 r1036, r1029, -28, r1035; +ld.shared.u32 r918, [r1036]; +ld.shared.u32 r968, [r1036+128]; +ld.shared.u32 r930, [r1036+256]; +ld.shared.u32 r980, [r1036+384]; +ld.shared.u32 r919, [r1036+512]; +ld.shared.u32 r969, [r1036+640]; +ld.shared.u32 r931, [r1036+768]; +ld.shared.u32 r981, [r1036+896]; +barrier.sync 0; +st.shared.u32 [r1035], r610; +st.shared.u32 [r1035+32], r674; +st.shared.u32 [r1035+64], r711; +st.shared.u32 [r1035+96], r748; +st.shared.u32 [r1035+128], r785; +st.shared.u32 [r1035+160], r822; +st.shared.u32 [r1035+192], r859; +st.shared.u32 [r1035+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1036]; +ld.shared.u32 r971, [r1036+128]; +ld.shared.u32 r933, [r1036+256]; +ld.shared.u32 r983, [r1036+384]; +ld.shared.u32 r922, [r1036+512]; +ld.shared.u32 r972, [r1036+640]; +ld.shared.u32 r934, [r1036+768]; +ld.shared.u32 r984, [r1036+896]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ 
+neg.f16x2 r941, r938; +} +{ +add.f16x2 %0, r917, r929; +} +{ +add.f16x2 %1, r920, r932; +} +{ +sub.f16x2 %8, r917, r929; +} +{ +sub.f16x2 %9, r920, r932; +} +{ +add.f16x2 %4, r923, r941; +} +{ +add.f16x2 %5, r926, r935; +} +{ +sub.f16x2 %12, r923, r941; +} +{ +sub.f16x2 %13, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 %2, r967, r979; +} +{ +add.f16x2 %3, r970, r982; +} +{ +sub.f16x2 %10, r967, r979; +} +{ +sub.f16x2 %11, r970, r982; +} +{ +add.f16x2 %6, r973, r991; +} +{ +add.f16x2 %7, r976, r985; +} +{ +sub.f16x2 %14, r973, r991; +} +{ +sub.f16x2 %15, r976, r985; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1012, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<270>; +.reg .b32 r<1803>; +.reg .b64 rd<2>; +mov.u32 r1791, %tid.y; +shl.b32 r1792, r1791, 11; +mov.u32 r1793, %32; +add.s32 r1794, r1793, r1792; +mov.u32 r1795, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ 
+mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 
r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1796, r1795, 15; +shl.b32 r1797, r1795, 7; +and.b32 r1798, r1797, -2048; +add.s32 r1799, r1794, r1798; +cvt.rn.f32.u32 f267, r1796; +mul.f32 f268, f267, 0f3CC90FDB; +cos.approx.f32 f117, f268; +sin.approx.f32 f269, f268; +neg.f32 f118, f269; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, 
r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1800, r1797, 1920; +add.s32 r1801, r1799, r1800; +st.shared.v4.f32 [r1801], {r521, r524, r627, r636}; +st.shared.v4.f32 [r1801+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r1801+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r1801+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r1801+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r1801+80], {r960, r969, r997, r1006}; 
+st.shared.v4.f32 [r1801+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r1801+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r1802, r1796, -120, r1801; +ld.shared.u32 r1176, [r1802]; +ld.shared.u32 r1179, [r1802+4]; +ld.shared.u32 r1372, [r1802+128]; +ld.shared.u32 r1375, [r1802+132]; +ld.shared.u32 r1226, [r1802+256]; +ld.shared.u32 r1229, [r1802+260]; +ld.shared.u32 r1422, [r1802+384]; +ld.shared.u32 r1425, [r1802+388]; +ld.shared.u32 r1188, [r1802+512]; +ld.shared.u32 r1191, [r1802+516]; +ld.shared.u32 r1384, [r1802+640]; +ld.shared.u32 r1387, [r1802+644]; +ld.shared.u32 r1238, [r1802+768]; +ld.shared.u32 r1241, [r1802+772]; +ld.shared.u32 r1434, [r1802+896]; +ld.shared.u32 r1437, [r1802+900]; +ld.shared.u32 r1177, [r1802+1024]; +ld.shared.u32 r1180, [r1802+1028]; +ld.shared.u32 r1373, [r1802+1152]; +ld.shared.u32 r1376, [r1802+1156]; +ld.shared.u32 r1227, [r1802+1280]; +ld.shared.u32 r1230, [r1802+1284]; +ld.shared.u32 r1423, [r1802+1408]; +ld.shared.u32 r1426, [r1802+1412]; +ld.shared.u32 r1189, [r1802+1536]; +ld.shared.u32 r1192, [r1802+1540]; +ld.shared.u32 r1385, [r1802+1664]; +ld.shared.u32 r1388, [r1802+1668]; +ld.shared.u32 r1239, [r1802+1792]; +ld.shared.u32 r1242, [r1802+1796]; +ld.shared.u32 r1435, [r1802+1920]; +ld.shared.u32 r1438, [r1802+1924]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 
r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} 
+{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; 
+} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 %0, r1323, r1519; +} +{ +add.f16x2 %1, r1326, r1522; +} +{ +sub.f16x2 %16, r1323, r1519; +} +{ +sub.f16x2 %17, r1326, r1522; +} +{ +add.f16x2 %2, r1335, r1603; +} +{ +add.f16x2 %3, r1338, r1609; +} +{ +sub.f16x2 %18, r1335, r1603; +} +{ +sub.f16x2 %19, r1338, r1609; +} +{ +add.f16x2 %4, r1347, r1619; +} +{ +add.f16x2 %5, r1350, r1625; +} +{ +sub.f16x2 %20, r1347, r1619; +} +{ +sub.f16x2 %21, r1350, r1625; +} +{ +add.f16x2 %6, r1359, r1635; +} +{ +add.f16x2 %7, r1362, r1641; +} +{ +sub.f16x2 %22, r1359, r1635; +} +{ +sub.f16x2 %23, r1362, r1641; +} 
+{ +add.f16x2 %8, r1329, r1645; +} +{ +add.f16x2 %9, r1332, r1525; +} +{ +sub.f16x2 %24, r1329, r1645; +} +{ +sub.f16x2 %25, r1332, r1525; +} +{ +add.f16x2 %10, r1341, r1653; +} +{ +add.f16x2 %11, r1344, r1659; +} +{ +sub.f16x2 %26, r1341, r1653; +} +{ +sub.f16x2 %27, r1344, r1659; +} +{ +add.f16x2 %12, r1353, r1669; +} +{ +add.f16x2 %13, r1356, r1675; +} +{ +sub.f16x2 %28, r1353, r1669; +} +{ +sub.f16x2 %29, r1356, r1675; +} +{ +add.f16x2 %14, r1365, r1685; +} +{ +add.f16x2 %15, r1368, r1691; +} +{ +sub.f16x2 %30, r1365, r1685; +} +{ +sub.f16x2 %31, r1368, r1691; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), 
"r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1013, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<540>; +.reg .b32 r<3723>; +.reg .b64 rd<3>; +mov.u32 r3647, %tid.y; +shl.b32 r3648, r3647, 11; +mov.u32 r3649, %64; +add.s32 r3650, r3649, r3648; +mov.u32 r3651, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 
r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f518, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r102, {low, high}; +} +mov.f32 f516, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, 
r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ 
+add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, 
r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} 
+{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, 
r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} 
+{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; 
+cvt.rn.f16.f32 high, f518; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, 
r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; 
+mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, 
r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, 
r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3652, r3651, 7; +shl.b32 r3653, r3651, 8; +and.b32 r3654, r3653, -2048; +add.s32 r3655, r3650, r3654; +cvt.rn.f32.u32 f535, r3652; +mul.f32 f536, f535, 0f3CC90FDB; +cos.approx.f32 f357, f536; +sin.approx.f32 f537, f536; +neg.f32 f358, f537; +mov.f32 f539, 0fBF800000; +mov.f32 f538, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 
r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; 
+cvt.rn.f16.f32 high, f538; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, 
r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, 
r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, 
high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, 
r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ 
+fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, 
r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, 
r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r3656, r3653, 1792; +add.s32 r3657, r3655, r3656; +st.shared.v4.f32 [r3657], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r3657+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r3657+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r3657+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r3657+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r3657+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r3657+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r3657+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r3657+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r3657+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r3657+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r3657+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r3657+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r3657+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r3657+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r3657+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r3658, r3652, -248, r3657; +ld.shared.u32 r2864, [r3658]; +ld.shared.u32 r2867, [r3658+4]; +ld.shared.u32 r3060, [r3658+64]; +ld.shared.u32 r3063, [r3658+68]; +ld.shared.u32 r3256, [r3658+128]; +ld.shared.u32 r3259, [r3658+132]; +ld.shared.u32 r3452, [r3658+192]; +ld.shared.u32 r3455, [r3658+196]; +ld.shared.u32 r2914, [r3658+256]; +ld.shared.u32 r2917, [r3658+260]; +ld.shared.u32 r3110, [r3658+320]; +ld.shared.u32 r3113, [r3658+324]; +ld.shared.u32 r3306, [r3658+384]; +ld.shared.u32 r3309, [r3658+388]; +ld.shared.u32 r3502, [r3658+448]; +ld.shared.u32 r3505, [r3658+452]; +ld.shared.u32 r2876, [r3658+512]; +ld.shared.u32 r2879, [r3658+516]; +ld.shared.u32 r3072, [r3658+576]; +ld.shared.u32 r3075, [r3658+580]; +ld.shared.u32 r3268, [r3658+640]; +ld.shared.u32 r3271, 
[r3658+644]; +ld.shared.u32 r3464, [r3658+704]; +ld.shared.u32 r3467, [r3658+708]; +ld.shared.u32 r2926, [r3658+768]; +ld.shared.u32 r2929, [r3658+772]; +ld.shared.u32 r3122, [r3658+832]; +ld.shared.u32 r3125, [r3658+836]; +ld.shared.u32 r3318, [r3658+896]; +ld.shared.u32 r3321, [r3658+900]; +ld.shared.u32 r3514, [r3658+960]; +ld.shared.u32 r3517, [r3658+964]; +ld.shared.u32 r2865, [r3658+1024]; +ld.shared.u32 r2868, [r3658+1028]; +ld.shared.u32 r3061, [r3658+1088]; +ld.shared.u32 r3064, [r3658+1092]; +ld.shared.u32 r3257, [r3658+1152]; +ld.shared.u32 r3260, [r3658+1156]; +ld.shared.u32 r3453, [r3658+1216]; +ld.shared.u32 r3456, [r3658+1220]; +ld.shared.u32 r2915, [r3658+1280]; +ld.shared.u32 r2918, [r3658+1284]; +ld.shared.u32 r3111, [r3658+1344]; +ld.shared.u32 r3114, [r3658+1348]; +ld.shared.u32 r3307, [r3658+1408]; +ld.shared.u32 r3310, [r3658+1412]; +ld.shared.u32 r3503, [r3658+1472]; +ld.shared.u32 r3506, [r3658+1476]; +ld.shared.u32 r2877, [r3658+1536]; +ld.shared.u32 r2880, [r3658+1540]; +ld.shared.u32 r3073, [r3658+1600]; +ld.shared.u32 r3076, [r3658+1604]; +ld.shared.u32 r3269, [r3658+1664]; +ld.shared.u32 r3272, [r3658+1668]; +ld.shared.u32 r3465, [r3658+1728]; +ld.shared.u32 r3468, [r3658+1732]; +ld.shared.u32 r2927, [r3658+1792]; +ld.shared.u32 r2930, [r3658+1796]; +ld.shared.u32 r3123, [r3658+1856]; +ld.shared.u32 r3126, [r3658+1860]; +ld.shared.u32 r3319, [r3658+1920]; +ld.shared.u32 r3322, [r3658+1924]; +ld.shared.u32 r3515, [r3658+1984]; +ld.shared.u32 r3518, [r3658+1988]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; 
+} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 %0, r2889, r2939; +} +{ +add.f16x2 %1, r2892, r2942; +} +{ +sub.f16x2 %32, r2889, r2939; +} +{ +sub.f16x2 %33, r2892, r2942; +} +{ +add.f16x2 %8, r2901, r2983; +} +{ +add.f16x2 %9, r2904, r2989; +} +{ +sub.f16x2 %40, r2901, r2983; +} +{ +sub.f16x2 %41, r2904, r2989; +} +{ +add.f16x2 %16, r2895, r2993; +} +{ 
+add.f16x2 %17, r2898, r2945; +} +{ +sub.f16x2 %48, r2895, r2993; +} +{ +sub.f16x2 %49, r2898, r2945; +} +{ +add.f16x2 %24, r2907, r3001; +} +{ +add.f16x2 %25, r2910, r3007; +} +{ +sub.f16x2 %56, r2907, r3001; +} +{ +sub.f16x2 %57, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, 
r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 %2, r3085, r3135; +} +{ +add.f16x2 %3, r3088, r3138; +} +{ +sub.f16x2 %34, r3085, r3135; +} +{ +sub.f16x2 %35, r3088, r3138; +} +{ +add.f16x2 %10, r3097, r3179; +} +{ +add.f16x2 %11, r3100, r3185; +} +{ +sub.f16x2 %42, r3097, r3179; +} +{ +sub.f16x2 %43, r3100, r3185; +} +{ +add.f16x2 %18, r3091, r3189; +} +{ +add.f16x2 %19, r3094, r3141; +} +{ +sub.f16x2 %50, r3091, r3189; +} +{ +sub.f16x2 %51, r3094, r3141; +} +{ +add.f16x2 %26, r3103, r3197; +} +{ +add.f16x2 %27, r3106, r3203; +} +{ +sub.f16x2 %58, r3103, r3197; +} +{ +sub.f16x2 %59, r3106, r3203; +} +{ +add.f16x2 r3255, r3256, r3257; +} +{ +add.f16x2 r3258, r3259, r3260; +} +{ +sub.f16x2 r3261, r3256, r3257; +} +{ +sub.f16x2 r3264, r3259, r3260; +} +{ +add.f16x2 r3267, r3268, r3269; +} +{ +add.f16x2 r3270, r3271, r3272; +} +{ +sub.f16x2 r3273, r3268, r3269; +} +{ +sub.f16x2 r3276, r3271, r3272; +} +{ +neg.f16x2 r3279, r3276; +} +{ +add.f16x2 r3281, r3255, r3267; +} +{ +add.f16x2 r3284, r3258, r3270; +} +{ +sub.f16x2 r3287, r3255, r3267; +} +{ +sub.f16x2 r3290, r3258, r3270; +} +{ +add.f16x2 r3293, r3261, r3279; +} +{ +add.f16x2 r3296, r3264, r3273; +} +{ +sub.f16x2 r3299, r3261, r3279; +} +{ +sub.f16x2 r3302, r3264, r3273; +} +{ +add.f16x2 r3305, r3306, r3307; +} +{ +add.f16x2 r3308, r3309, r3310; +} +{ +sub.f16x2 r3311, r3306, r3307; +} +{ +sub.f16x2 r3314, r3309, r3310; +} +{ +add.f16x2 r3317, r3318, r3319; +} +{ +add.f16x2 r3320, r3321, r3322; +} +{ +sub.f16x2 r3323, r3318, r3319; +} +{ +sub.f16x2 r3326, r3321, r3322; +} +{ +neg.f16x2 r3329, r3326; +} +{ +add.f16x2 r3331, r3305, r3317; +} +{ 
+add.f16x2 r3334, r3308, r3320; +} +{ +sub.f16x2 r3337, r3305, r3317; +} +{ +sub.f16x2 r3340, r3308, r3320; +} +{ +add.f16x2 r3343, r3311, r3329; +} +{ +add.f16x2 r3346, r3314, r3323; +} +{ +sub.f16x2 r3349, r3311, r3329; +} +{ +sub.f16x2 r3352, r3314, r3323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3360, {low, high}; +} +{ +mul.f16x2 r3369, r3343, r3355; +} +{ +mul.f16x2 r3372, r3346, r3356; +} +{ +sub.f16x2 r3375, r3369, r3372; +} +{ +mul.f16x2 r3378, r3343, r3356; +} +{ +fma.rn.f16x2 r3381, r3346, r3355, r3378; +} +{ +neg.f16x2 r3385, r3340; +} +{ +mul.f16x2 r3387, r3349, r3359; +} +{ +mul.f16x2 r3390, r3352, r3360; +} +{ +sub.f16x2 r3393, r3387, r3390; +} +{ +mul.f16x2 r3396, r3349, r3360; +} +{ +fma.rn.f16x2 r3399, r3352, r3359, r3396; +} +{ +add.f16x2 %4, r3281, r3331; +} +{ +add.f16x2 %5, r3284, r3334; +} +{ +sub.f16x2 %36, r3281, r3331; +} +{ +sub.f16x2 %37, r3284, r3334; +} +{ +add.f16x2 %12, r3293, r3375; +} +{ +add.f16x2 %13, r3296, r3381; +} +{ +sub.f16x2 %44, r3293, r3375; +} +{ +sub.f16x2 %45, r3296, r3381; +} +{ +add.f16x2 %20, r3287, r3385; +} +{ +add.f16x2 %21, r3290, r3337; +} +{ +sub.f16x2 %52, r3287, r3385; +} +{ +sub.f16x2 %53, r3290, r3337; +} +{ +add.f16x2 %28, r3299, r3393; +} +{ +add.f16x2 %29, r3302, r3399; +} +{ +sub.f16x2 %60, r3299, r3393; +} +{ +sub.f16x2 %61, r3302, r3399; +} +{ +add.f16x2 r3451, r3452, r3453; +} +{ +add.f16x2 r3454, r3455, r3456; +} +{ +sub.f16x2 r3457, r3452, r3453; +} +{ +sub.f16x2 r3460, r3455, r3456; +} +{ +add.f16x2 r3463, r3464, r3465; +} +{ +add.f16x2 r3466, r3467, r3468; +} +{ +sub.f16x2 r3469, r3464, r3465; +} +{ +sub.f16x2 
r3472, r3467, r3468; +} +{ +neg.f16x2 r3475, r3472; +} +{ +add.f16x2 r3477, r3451, r3463; +} +{ +add.f16x2 r3480, r3454, r3466; +} +{ +sub.f16x2 r3483, r3451, r3463; +} +{ +sub.f16x2 r3486, r3454, r3466; +} +{ +add.f16x2 r3489, r3457, r3475; +} +{ +add.f16x2 r3492, r3460, r3469; +} +{ +sub.f16x2 r3495, r3457, r3475; +} +{ +sub.f16x2 r3498, r3460, r3469; +} +{ +add.f16x2 r3501, r3502, r3503; +} +{ +add.f16x2 r3504, r3505, r3506; +} +{ +sub.f16x2 r3507, r3502, r3503; +} +{ +sub.f16x2 r3510, r3505, r3506; +} +{ +add.f16x2 r3513, r3514, r3515; +} +{ +add.f16x2 r3516, r3517, r3518; +} +{ +sub.f16x2 r3519, r3514, r3515; +} +{ +sub.f16x2 r3522, r3517, r3518; +} +{ +neg.f16x2 r3525, r3522; +} +{ +add.f16x2 r3527, r3501, r3513; +} +{ +add.f16x2 r3530, r3504, r3516; +} +{ +sub.f16x2 r3533, r3501, r3513; +} +{ +sub.f16x2 r3536, r3504, r3516; +} +{ +add.f16x2 r3539, r3507, r3525; +} +{ +add.f16x2 r3542, r3510, r3519; +} +{ +sub.f16x2 r3545, r3507, r3525; +} +{ +sub.f16x2 r3548, r3510, r3519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3551, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3552, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3555, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3556, {low, high}; +} +{ +mul.f16x2 r3565, r3539, r3551; +} +{ +mul.f16x2 r3568, r3542, r3552; +} +{ +sub.f16x2 r3571, r3565, r3568; +} +{ +mul.f16x2 r3574, r3539, r3552; +} +{ +fma.rn.f16x2 r3577, r3542, r3551, r3574; +} +{ +neg.f16x2 r3581, r3536; +} +{ +mul.f16x2 r3583, r3545, r3555; +} +{ +mul.f16x2 r3586, r3548, r3556; +} +{ +sub.f16x2 r3589, r3583, r3586; +} +{ +mul.f16x2 r3592, r3545, r3556; +} +{ +fma.rn.f16x2 r3595, r3548, r3555, r3592; +} +{ +add.f16x2 %6, r3477, r3527; +} +{ +add.f16x2 %7, r3480, r3530; +} +{ +sub.f16x2 %38, r3477, r3527; +} +{ +sub.f16x2 
%39, r3480, r3530; +} +{ +add.f16x2 %14, r3489, r3571; +} +{ +add.f16x2 %15, r3492, r3577; +} +{ +sub.f16x2 %46, r3489, r3571; +} +{ +sub.f16x2 %47, r3492, r3577; +} +{ +add.f16x2 %22, r3483, r3581; +} +{ +add.f16x2 %23, r3486, r3533; +} +{ +sub.f16x2 %54, r3483, r3581; +} +{ +sub.f16x2 %55, r3486, r3533; +} +{ +add.f16x2 %30, r3495, r3589; +} +{ +add.f16x2 %31, r3498, r3595; +} +{ +sub.f16x2 %62, r3495, r3589; +} +{ +sub.f16x2 %63, r3498, r3595; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), 
"=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), 
"r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1014, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<570>; +.reg .b64 rd<2>; +mov.u32 r543, %tid.y; +shl.b32 r544, r543, 11; +mov.u32 r545, %8; +add.s32 r546, r545, r544; +mov.u32 r547, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r548, r547, 63; +shl.b32 r549, r547, 5; +and.b32 r550, r549, -2048; +add.s32 r551, r546, r550; +cvt.rn.f32.u32 f31, r548; +mul.f32 f32, f31, 0f3CC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 
r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r552, r549, 2016; +add.s32 r553, r551, r552; +st.shared.v4.f32 [r553], {r27, r30, r61, r70}; +st.shared.v4.f32 [r553+16], {r98, r107, r135, 
r144}; +barrier.sync 0; +mad.lo.s32 r554, r548, -24, r553; +ld.shared.u32 r166, [r554]; +ld.shared.u32 r169, [r554+4]; +ld.shared.u32 r178, [r554+512]; +ld.shared.u32 r181, [r554+516]; +ld.shared.u32 r167, [r554+1024]; +ld.shared.u32 r170, [r554+1028]; +ld.shared.u32 r179, [r554+1536]; +ld.shared.u32 r182, [r554+1540]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r555, r547, 60; +bfe.u32 r556, r547, 2, 4; +cvt.rn.f32.u32 f34, r556; +mul.f32 f35, f34, 0f3DC90FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ 
+fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +shl.b32 r557, r547, 3; +and.b32 r558, r557, 24; +add.s32 r559, r551, r558; +barrier.sync 0; +and.b32 r560, r549, 1920; +add.s32 r561, r559, r560; +st.shared.u32 [r561], r191; +st.shared.u32 [r561+4], r194; +st.shared.u32 [r561+32], r225; +st.shared.u32 [r561+36], r234; +st.shared.u32 [r561+64], r262; +st.shared.u32 [r561+68], r271; +st.shared.u32 [r561+96], r299; +st.shared.u32 [r561+100], r308; +barrier.sync 0; +mad.lo.s32 r562, r555, -24, r561; +ld.shared.u32 r330, [r562]; +ld.shared.u32 r333, [r562+4]; +ld.shared.u32 r342, [r562+512]; +ld.shared.u32 r345, [r562+516]; +ld.shared.u32 r331, [r562+1024]; +ld.shared.u32 r334, [r562+1028]; +ld.shared.u32 r343, [r562+1536]; +ld.shared.u32 r346, [r562+1540]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, 
r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r563, r547, 48; +bfe.u32 r564, r547, 4, 2; +cvt.rn.f32.u32 f37, r564; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, 
r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +and.b32 r565, r557, 120; +add.s32 r566, r551, r565; +barrier.sync 0; +and.b32 r567, r549, 1536; +add.s32 r568, r566, r567; +st.shared.u32 [r568], r355; +st.shared.u32 [r568+4], r358; +st.shared.u32 [r568+128], r389; +st.shared.u32 [r568+132], r398; +st.shared.u32 [r568+256], r426; +st.shared.u32 [r568+260], r435; +st.shared.u32 [r568+384], r463; +st.shared.u32 [r568+388], r472; +barrier.sync 0; +mad.lo.s32 r569, r563, -24, r568; +ld.shared.u32 r494, [r569]; +ld.shared.u32 r497, [r569+4]; +ld.shared.u32 r506, [r569+512]; +ld.shared.u32 r509, [r569+516]; +ld.shared.u32 r495, [r569+1024]; +ld.shared.u32 r498, [r569+1028]; +ld.shared.u32 r507, [r569+1536]; +ld.shared.u32 r510, [r569+1540]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r514; +} +{ +add.f16x2 %0, r493, r505; +} +{ +add.f16x2 %1, r496, r508; +} +{ +sub.f16x2 %4, r493, r505; +} +{ +sub.f16x2 %5, r496, r508; +} +{ +add.f16x2 %2, r499, 
r517; +} +{ +add.f16x2 %3, r502, r511; +} +{ +sub.f16x2 %6, r499, r517; +} +{ +sub.f16x2 %7, r502, r511; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1015, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<570>; +.reg .b64 rd<2>; +mov.u32 r543, %tid.y; +shl.b32 r544, r543, 10; +mov.u32 r545, %8; +add.s32 r546, r545, r544; +mov.u32 r547, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r548, r547, 63; +shl.b32 r549, r547, 4; +and.b32 r550, r549, -1024; +add.s32 r551, r546, r550; +cvt.rn.f32.u32 f31, r548; +mul.f32 f32, f31, 0f3CC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, 
{high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f27, 0fBF800000; +mov.f32 f28, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r552, r549, 1008; +add.s32 r553, r551, r552; +st.shared.v4.f32 [r553], {r27, r61, r98, r135}; +barrier.sync 0; 
+mad.lo.s32 r554, r548, -12, r553; +ld.shared.u32 r166, [r554]; +ld.shared.u32 r178, [r554+256]; +ld.shared.u32 r167, [r554+512]; +ld.shared.u32 r179, [r554+768]; +barrier.sync 0; +st.shared.v4.f32 [r553], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r554]; +ld.shared.u32 r181, [r554+256]; +ld.shared.u32 r170, [r554+512]; +ld.shared.u32 r182, [r554+768]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r555, r547, 60; +bfe.u32 r556, r547, 2, 4; +shl.b32 r557, r547, 2; +and.b32 r558, r557, 12; +add.s32 r559, r551, r558; +cvt.rn.f32.u32 f34, r556; +mul.f32 f35, f34, 0f3DC90FDB; +cos.approx.f32 f11, f35; +sin.approx.f32 f36, f35; +neg.f32 f12, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ 
+mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +barrier.sync 0; +and.b32 r560, r549, 960; +add.s32 r561, r559, r560; +st.shared.u32 [r561], r191; +st.shared.u32 [r561+16], r225; +st.shared.u32 [r561+32], r262; +st.shared.u32 [r561+48], r299; +barrier.sync 0; +mad.lo.s32 r562, r555, -12, r561; +ld.shared.u32 r330, [r562]; +ld.shared.u32 r342, [r562+256]; +ld.shared.u32 r331, [r562+512]; +ld.shared.u32 r343, [r562+768]; +barrier.sync 0; +st.shared.u32 [r561], r194; +st.shared.u32 [r561+16], r234; +st.shared.u32 [r561+32], r271; +st.shared.u32 [r561+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r562]; +ld.shared.u32 r345, [r562+256]; +ld.shared.u32 r334, [r562+512]; +ld.shared.u32 r346, 
[r562+768]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r563, r547, 48; +bfe.u32 r564, r547, 4, 2; +and.b32 r565, r557, 60; +add.s32 r566, r551, r565; +cvt.rn.f32.u32 f37, r564; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f21, f38; +sin.approx.f32 f39, f38; +neg.f32 f22, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, 
r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f27; +cvt.rn.f16.f32 high, f28; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +barrier.sync 0; +and.b32 r567, r549, 768; +add.s32 r568, r566, r567; +st.shared.u32 [r568], r355; +st.shared.u32 [r568+64], r389; +st.shared.u32 [r568+128], r426; +st.shared.u32 [r568+192], r463; +barrier.sync 0; +mad.lo.s32 r569, r563, -12, r568; +ld.shared.u32 r494, [r569]; +ld.shared.u32 r506, [r569+256]; +ld.shared.u32 r495, [r569+512]; +ld.shared.u32 r507, [r569+768]; +barrier.sync 0; +st.shared.u32 [r568], r358; +st.shared.u32 [r568+64], r398; +st.shared.u32 [r568+128], r435; +st.shared.u32 [r568+192], r472; +barrier.sync 0; +ld.shared.u32 r497, [r569]; +ld.shared.u32 r509, [r569+256]; +ld.shared.u32 r498, [r569+512]; +ld.shared.u32 r510, [r569+768]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r514; +} +{ +add.f16x2 %0, r493, r505; +} +{ +add.f16x2 %1, r496, r508; +} 
+{ +sub.f16x2 %4, r493, r505; +} +{ +sub.f16x2 %5, r496, r508; +} +{ +add.f16x2 %2, r499, r517; +} +{ +add.f16x2 %3, r502, r511; +} +{ +sub.f16x2 %6, r499, r517; +} +{ +sub.f16x2 %7, r502, r511; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1016, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<540>; +.reg .b32 r<3723>; +.reg .b64 rd<3>; +mov.u32 r3647, %tid.y; +shl.b32 r3648, r3647, 10; +mov.u32 r3649, %64; +add.s32 r3650, r3649, r3648; +mov.u32 r3651, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, 
r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f518, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r102, {low, high}; +} +mov.f32 f516, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; 
+} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 
r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, 
r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ 
+add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, 
r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, 
f518; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r1019, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} 
+{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, 
r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, 
r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3652, r3651, 7; +shl.b32 r3653, r3651, 7; +and.b32 r3654, r3653, -1024; +add.s32 r3655, r3650, r3654; +cvt.rn.f32.u32 f535, r3652; +mul.f32 f536, f535, 0f3CC90FDB; +cos.approx.f32 f357, f536; +sin.approx.f32 f537, f536; +neg.f32 f358, f537; +mov.f32 f539, 0fBF800000; +mov.f32 f538, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 
r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; 
+mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ 
+mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, 
r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ 
+fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, 
r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, 
r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f539; +cvt.rn.f16.f32 high, f538; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; 
+mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r3656, r3653, 896; +add.s32 r3657, r3655, r3656; +st.shared.v4.f32 [r3657], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r3657+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r3657+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r3657+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r3657+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r3657+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r3657+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r3657+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r3658, r3652, -124, r3657; +ld.shared.u32 r2864, [r3658]; +ld.shared.u32 r3060, [r3658+32]; +ld.shared.u32 r3256, [r3658+64]; +ld.shared.u32 r3452, [r3658+96]; +ld.shared.u32 r2914, [r3658+128]; +ld.shared.u32 r3110, [r3658+160]; +ld.shared.u32 r3306, [r3658+192]; +ld.shared.u32 r3502, [r3658+224]; +ld.shared.u32 r2876, [r3658+256]; +ld.shared.u32 r3072, [r3658+288]; +ld.shared.u32 r3268, [r3658+320]; +ld.shared.u32 r3464, [r3658+352]; +ld.shared.u32 r2926, [r3658+384]; +ld.shared.u32 r3122, [r3658+416]; +ld.shared.u32 r3318, [r3658+448]; +ld.shared.u32 r3514, [r3658+480]; +ld.shared.u32 r2865, [r3658+512]; +ld.shared.u32 r3061, [r3658+544]; +ld.shared.u32 r3257, [r3658+576]; +ld.shared.u32 r3453, [r3658+608]; +ld.shared.u32 r2915, [r3658+640]; +ld.shared.u32 r3111, [r3658+672]; +ld.shared.u32 r3307, [r3658+704]; +ld.shared.u32 r3503, [r3658+736]; +ld.shared.u32 r2877, [r3658+768]; +ld.shared.u32 r3073, [r3658+800]; +ld.shared.u32 r3269, [r3658+832]; +ld.shared.u32 r3465, [r3658+864]; +ld.shared.u32 r2927, [r3658+896]; +ld.shared.u32 r3123, [r3658+928]; +ld.shared.u32 r3319, [r3658+960]; +ld.shared.u32 r3515, [r3658+992]; +barrier.sync 0; +st.shared.v4.f32 
[r3657], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3657+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3657+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3657+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3657+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3657+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3657+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3657+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3658]; +ld.shared.u32 r3063, [r3658+32]; +ld.shared.u32 r3259, [r3658+64]; +ld.shared.u32 r3455, [r3658+96]; +ld.shared.u32 r2917, [r3658+128]; +ld.shared.u32 r3113, [r3658+160]; +ld.shared.u32 r3309, [r3658+192]; +ld.shared.u32 r3505, [r3658+224]; +ld.shared.u32 r2879, [r3658+256]; +ld.shared.u32 r3075, [r3658+288]; +ld.shared.u32 r3271, [r3658+320]; +ld.shared.u32 r3467, [r3658+352]; +ld.shared.u32 r2929, [r3658+384]; +ld.shared.u32 r3125, [r3658+416]; +ld.shared.u32 r3321, [r3658+448]; +ld.shared.u32 r3517, [r3658+480]; +ld.shared.u32 r2868, [r3658+512]; +ld.shared.u32 r3064, [r3658+544]; +ld.shared.u32 r3260, [r3658+576]; +ld.shared.u32 r3456, [r3658+608]; +ld.shared.u32 r2918, [r3658+640]; +ld.shared.u32 r3114, [r3658+672]; +ld.shared.u32 r3310, [r3658+704]; +ld.shared.u32 r3506, [r3658+736]; +ld.shared.u32 r2880, [r3658+768]; +ld.shared.u32 r3076, [r3658+800]; +ld.shared.u32 r3272, [r3658+832]; +ld.shared.u32 r3468, [r3658+864]; +ld.shared.u32 r2930, [r3658+896]; +ld.shared.u32 r3126, [r3658+928]; +ld.shared.u32 r3322, [r3658+960]; +ld.shared.u32 r3518, [r3658+992]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; 
+} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 %0, r2889, r2939; +} +{ +add.f16x2 %1, r2892, r2942; +} +{ +sub.f16x2 %32, r2889, r2939; +} +{ +sub.f16x2 %33, r2892, r2942; +} +{ +add.f16x2 %8, r2901, r2983; +} +{ +add.f16x2 %9, r2904, r2989; +} +{ +sub.f16x2 %40, r2901, r2983; +} +{ 
+sub.f16x2 %41, r2904, r2989; +} +{ +add.f16x2 %16, r2895, r2993; +} +{ +add.f16x2 %17, r2898, r2945; +} +{ +sub.f16x2 %48, r2895, r2993; +} +{ +sub.f16x2 %49, r2898, r2945; +} +{ +add.f16x2 %24, r2907, r3001; +} +{ +add.f16x2 %25, r2910, r3007; +} +{ +sub.f16x2 %56, r2907, r3001; +} +{ +sub.f16x2 %57, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; 
+cvt.rn.f16.f32 high, f518; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 %2, r3085, r3135; +} +{ +add.f16x2 %3, r3088, r3138; +} +{ +sub.f16x2 %34, r3085, r3135; +} +{ +sub.f16x2 %35, r3088, r3138; +} +{ +add.f16x2 %10, r3097, r3179; +} +{ +add.f16x2 %11, r3100, r3185; +} +{ +sub.f16x2 %42, r3097, r3179; +} +{ +sub.f16x2 %43, r3100, r3185; +} +{ +add.f16x2 %18, r3091, r3189; +} +{ +add.f16x2 %19, r3094, r3141; +} +{ +sub.f16x2 %50, r3091, r3189; +} +{ +sub.f16x2 %51, r3094, r3141; +} +{ +add.f16x2 %26, r3103, r3197; +} +{ +add.f16x2 %27, r3106, r3203; +} +{ +sub.f16x2 %58, r3103, r3197; +} +{ +sub.f16x2 %59, r3106, r3203; +} +{ +add.f16x2 r3255, r3256, r3257; +} +{ +add.f16x2 r3258, r3259, r3260; +} +{ +sub.f16x2 r3261, r3256, r3257; +} +{ +sub.f16x2 r3264, r3259, r3260; +} +{ +add.f16x2 r3267, r3268, r3269; +} +{ +add.f16x2 r3270, r3271, r3272; +} +{ +sub.f16x2 r3273, r3268, r3269; +} +{ +sub.f16x2 r3276, r3271, r3272; +} +{ +neg.f16x2 r3279, r3276; +} +{ +add.f16x2 r3281, r3255, r3267; +} +{ +add.f16x2 r3284, r3258, r3270; +} +{ +sub.f16x2 r3287, r3255, r3267; +} +{ +sub.f16x2 r3290, r3258, r3270; +} +{ +add.f16x2 r3293, r3261, r3279; +} +{ +add.f16x2 r3296, r3264, r3273; +} +{ +sub.f16x2 r3299, r3261, r3279; +} +{ +sub.f16x2 r3302, r3264, r3273; +} +{ +add.f16x2 r3305, r3306, r3307; +} +{ +add.f16x2 r3308, r3309, r3310; +} +{ +sub.f16x2 r3311, r3306, r3307; +} +{ +sub.f16x2 r3314, r3309, r3310; +} +{ +add.f16x2 r3317, r3318, r3319; +} +{ +add.f16x2 r3320, r3321, r3322; +} +{ +sub.f16x2 r3323, r3318, r3319; +} +{ +sub.f16x2 r3326, r3321, 
r3322; +} +{ +neg.f16x2 r3329, r3326; +} +{ +add.f16x2 r3331, r3305, r3317; +} +{ +add.f16x2 r3334, r3308, r3320; +} +{ +sub.f16x2 r3337, r3305, r3317; +} +{ +sub.f16x2 r3340, r3308, r3320; +} +{ +add.f16x2 r3343, r3311, r3329; +} +{ +add.f16x2 r3346, r3314, r3323; +} +{ +sub.f16x2 r3349, r3311, r3329; +} +{ +sub.f16x2 r3352, r3314, r3323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3360, {low, high}; +} +{ +mul.f16x2 r3369, r3343, r3355; +} +{ +mul.f16x2 r3372, r3346, r3356; +} +{ +sub.f16x2 r3375, r3369, r3372; +} +{ +mul.f16x2 r3378, r3343, r3356; +} +{ +fma.rn.f16x2 r3381, r3346, r3355, r3378; +} +{ +neg.f16x2 r3385, r3340; +} +{ +mul.f16x2 r3387, r3349, r3359; +} +{ +mul.f16x2 r3390, r3352, r3360; +} +{ +sub.f16x2 r3393, r3387, r3390; +} +{ +mul.f16x2 r3396, r3349, r3360; +} +{ +fma.rn.f16x2 r3399, r3352, r3359, r3396; +} +{ +add.f16x2 %4, r3281, r3331; +} +{ +add.f16x2 %5, r3284, r3334; +} +{ +sub.f16x2 %36, r3281, r3331; +} +{ +sub.f16x2 %37, r3284, r3334; +} +{ +add.f16x2 %12, r3293, r3375; +} +{ +add.f16x2 %13, r3296, r3381; +} +{ +sub.f16x2 %44, r3293, r3375; +} +{ +sub.f16x2 %45, r3296, r3381; +} +{ +add.f16x2 %20, r3287, r3385; +} +{ +add.f16x2 %21, r3290, r3337; +} +{ +sub.f16x2 %52, r3287, r3385; +} +{ +sub.f16x2 %53, r3290, r3337; +} +{ +add.f16x2 %28, r3299, r3393; +} +{ +add.f16x2 %29, r3302, r3399; +} +{ +sub.f16x2 %60, r3299, r3393; +} +{ +sub.f16x2 %61, r3302, r3399; +} +{ +add.f16x2 r3451, r3452, r3453; +} +{ +add.f16x2 r3454, r3455, r3456; +} +{ +sub.f16x2 r3457, r3452, r3453; +} +{ +sub.f16x2 r3460, r3455, r3456; +} +{ +add.f16x2 r3463, r3464, r3465; +} +{ 
+add.f16x2 r3466, r3467, r3468; +} +{ +sub.f16x2 r3469, r3464, r3465; +} +{ +sub.f16x2 r3472, r3467, r3468; +} +{ +neg.f16x2 r3475, r3472; +} +{ +add.f16x2 r3477, r3451, r3463; +} +{ +add.f16x2 r3480, r3454, r3466; +} +{ +sub.f16x2 r3483, r3451, r3463; +} +{ +sub.f16x2 r3486, r3454, r3466; +} +{ +add.f16x2 r3489, r3457, r3475; +} +{ +add.f16x2 r3492, r3460, r3469; +} +{ +sub.f16x2 r3495, r3457, r3475; +} +{ +sub.f16x2 r3498, r3460, r3469; +} +{ +add.f16x2 r3501, r3502, r3503; +} +{ +add.f16x2 r3504, r3505, r3506; +} +{ +sub.f16x2 r3507, r3502, r3503; +} +{ +sub.f16x2 r3510, r3505, r3506; +} +{ +add.f16x2 r3513, r3514, r3515; +} +{ +add.f16x2 r3516, r3517, r3518; +} +{ +sub.f16x2 r3519, r3514, r3515; +} +{ +sub.f16x2 r3522, r3517, r3518; +} +{ +neg.f16x2 r3525, r3522; +} +{ +add.f16x2 r3527, r3501, r3513; +} +{ +add.f16x2 r3530, r3504, r3516; +} +{ +sub.f16x2 r3533, r3501, r3513; +} +{ +sub.f16x2 r3536, r3504, r3516; +} +{ +add.f16x2 r3539, r3507, r3525; +} +{ +add.f16x2 r3542, r3510, r3519; +} +{ +sub.f16x2 r3545, r3507, r3525; +} +{ +sub.f16x2 r3548, r3510, r3519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3551, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3552, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f516; +cvt.rn.f16.f32 high, f516; +mov.b32 r3555, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f518; +cvt.rn.f16.f32 high, f518; +mov.b32 r3556, {low, high}; +} +{ +mul.f16x2 r3565, r3539, r3551; +} +{ +mul.f16x2 r3568, r3542, r3552; +} +{ +sub.f16x2 r3571, r3565, r3568; +} +{ +mul.f16x2 r3574, r3539, r3552; +} +{ +fma.rn.f16x2 r3577, r3542, r3551, r3574; +} +{ +neg.f16x2 r3581, r3536; +} +{ +mul.f16x2 r3583, r3545, r3555; +} +{ +mul.f16x2 r3586, r3548, r3556; +} +{ +sub.f16x2 r3589, r3583, r3586; +} +{ +mul.f16x2 r3592, r3545, r3556; +} +{ +fma.rn.f16x2 r3595, r3548, r3555, r3592; +} +{ +add.f16x2 %6, r3477, r3527; +} 
+{ +add.f16x2 %7, r3480, r3530; +} +{ +sub.f16x2 %38, r3477, r3527; +} +{ +sub.f16x2 %39, r3480, r3530; +} +{ +add.f16x2 %14, r3489, r3571; +} +{ +add.f16x2 %15, r3492, r3577; +} +{ +sub.f16x2 %46, r3489, r3571; +} +{ +sub.f16x2 %47, r3492, r3577; +} +{ +add.f16x2 %22, r3483, r3581; +} +{ +add.f16x2 %23, r3486, r3533; +} +{ +sub.f16x2 %54, r3483, r3581; +} +{ +sub.f16x2 %55, r3486, r3533; +} +{ +add.f16x2 %30, r3495, r3589; +} +{ +add.f16x2 %31, r3498, r3595; +} +{ +sub.f16x2 %62, r3495, r3589; +} +{ +sub.f16x2 %63, r3498, r3595; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), 
"=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), 
"r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1017, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<64>; +.reg .b32 r<433>; +.reg .b64 rd<2>; +mov.u32 r377, %tid.y; +shl.b32 r378, r377, 11; +mov.u32 r379, %4; +add.s32 r380, r379, r378; +mov.u32 r381, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r382, r381, 127; +shl.b32 r383, r381, 4; +and.b32 r384, r383, -2048; +add.s32 r385, r380, r384; +cvt.rn.f32.u32 f43, r382; +mul.f32 f44, f43, 0f3CC90FDB; +cos.approx.f32 f1, f44; +sin.approx.f32 f45, f44; +neg.f32 f2, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r386, r383, 2032; +add.s32 r387, r385, r386; +st.shared.v2.f32 [r387], {r1, r4}; +st.shared.v2.f32 [r387+8], {r23, r32}; +barrier.sync 
0; +shl.b32 r388, r381, 3; +and.b32 r389, r388, 1016; +sub.s32 r390, r387, r389; +ld.shared.u32 r54, [r390]; +ld.shared.u32 r57, [r390+4]; +ld.shared.u32 r55, [r390+1024]; +ld.shared.u32 r58, [r390+1028]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r391, r381, 1, 6; +cvt.rn.f32.u32 f46, r391; +mul.f32 f47, f46, 0f3D490FDB; +cos.approx.f32 f7, f47; +sin.approx.f32 f48, f47; +neg.f32 f8, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r392, r388, 8; +add.s32 r393, r385, r392; +barrier.sync 0; +and.b32 r394, r383, 2016; +add.s32 r395, r393, r394; +st.shared.u32 [r395], r53; +st.shared.u32 [r395+4], r56; +st.shared.u32 [r395+16], r75; +st.shared.u32 [r395+20], r84; +barrier.sync 0; +and.b32 r396, r388, 1008; +sub.s32 r397, r395, r396; +ld.shared.u32 r106, [r397]; +ld.shared.u32 r109, [r397+4]; +ld.shared.u32 r107, [r397+1024]; +ld.shared.u32 r110, [r397+1028]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r398, r381, 2, 5; +cvt.rn.f32.u32 f49, r398; +mul.f32 f50, f49, 0f3DC90FDB; +cos.approx.f32 f13, f50; +sin.approx.f32 f51, f50; +neg.f32 f14, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, 
r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r399, r388, 24; +add.s32 r400, r385, r399; +barrier.sync 0; +and.b32 r401, r383, 1984; +add.s32 r402, r400, r401; +st.shared.u32 [r402], r105; +st.shared.u32 [r402+4], r108; +st.shared.u32 [r402+32], r127; +st.shared.u32 [r402+36], r136; +barrier.sync 0; +and.b32 r403, r388, 992; +sub.s32 r404, r402, r403; +ld.shared.u32 r158, [r404]; +ld.shared.u32 r161, [r404+4]; +ld.shared.u32 r159, [r404+1024]; +ld.shared.u32 r162, [r404+1028]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r405, r381, 3, 4; +cvt.rn.f32.u32 f52, r405; +mul.f32 f53, f52, 0f3E490FDB; +cos.approx.f32 f19, f53; +sin.approx.f32 f54, f53; +neg.f32 f20, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +and.b32 r406, r388, 56; +add.s32 r407, r385, r406; +barrier.sync 0; +and.b32 r408, r383, 1920; +add.s32 r409, r407, r408; +st.shared.u32 [r409], r157; +st.shared.u32 [r409+4], r160; +st.shared.u32 [r409+64], r179; +st.shared.u32 [r409+68], r188; +barrier.sync 0; +and.b32 r410, r388, 960; +sub.s32 r411, r409, r410; +ld.shared.u32 r210, [r411]; +ld.shared.u32 r213, [r411+4]; +ld.shared.u32 r211, [r411+1024]; +ld.shared.u32 r214, [r411+1028]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r412, r381, 4, 3; +cvt.rn.f32.u32 f55, r412; +mul.f32 f56, f55, 
0f3EC90FDB; +cos.approx.f32 f25, f56; +sin.approx.f32 f57, f56; +neg.f32 f26, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +and.b32 r413, r388, 120; +add.s32 r414, r385, r413; +barrier.sync 0; +and.b32 r415, r383, 1792; +add.s32 r416, r414, r415; +st.shared.u32 [r416], r209; +st.shared.u32 [r416+4], r212; +st.shared.u32 [r416+128], r231; +st.shared.u32 [r416+132], r240; +barrier.sync 0; +and.b32 r417, r388, 896; +sub.s32 r418, r416, r417; +ld.shared.u32 r262, [r418]; +ld.shared.u32 r265, [r418+4]; +ld.shared.u32 r263, [r418+1024]; +ld.shared.u32 r266, [r418+1028]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r419, r381, 5, 2; +cvt.rn.f32.u32 f58, r419; +mul.f32 f59, f58, 0f3F490FDB; +cos.approx.f32 f31, f59; +sin.approx.f32 f60, f59; +neg.f32 f32, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +and.b32 r420, r388, 248; +add.s32 r421, r385, r420; +barrier.sync 0; +and.b32 r422, r383, 1536; +add.s32 r423, r421, r422; +st.shared.u32 [r423], r261; +st.shared.u32 [r423+4], r264; +st.shared.u32 [r423+256], r283; +st.shared.u32 [r423+260], 
r292; +barrier.sync 0; +and.b32 r424, r388, 768; +sub.s32 r425, r423, r424; +ld.shared.u32 r314, [r425]; +ld.shared.u32 r317, [r425+4]; +ld.shared.u32 r315, [r425+1024]; +ld.shared.u32 r318, [r425+1028]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r426, r381, 6, 1; +cvt.rn.f32.u32 f61, r426; +mul.f32 f62, f61, 0f3FC90FDB; +cos.approx.f32 f37, f62; +sin.approx.f32 f63, f62; +neg.f32 f38, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +fma.rn.f16x2 r335, r319, r328, r332; +} +{ +mul.f16x2 r339, r319, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r322, r328, r342; +} +and.b32 r427, r388, 504; +add.s32 r428, r385, r427; +barrier.sync 0; +and.b32 r429, r383, 1024; +add.s32 r430, r428, r429; +st.shared.u32 [r430], r313; +st.shared.u32 [r430+4], r316; +st.shared.u32 [r430+512], r335; +st.shared.u32 [r430+516], r344; +barrier.sync 0; +and.b32 r431, r388, 512; +sub.s32 r432, r430, r431; +ld.shared.u32 r366, [r432]; +ld.shared.u32 r369, [r432+4]; +ld.shared.u32 r367, [r432+1024]; +ld.shared.u32 r370, [r432+1028]; +{ +add.f16x2 %0, r366, r367; +} +{ +add.f16x2 %1, r369, r370; +} +{ +sub.f16x2 %2, r366, r367; +} +{ +sub.f16x2 %3, r369, r370; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1018, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ 
+.reg .f32 f<64>; +.reg .b32 r<433>; +.reg .b64 rd<2>; +mov.u32 r377, %tid.y; +shl.b32 r378, r377, 10; +mov.u32 r379, %4; +add.s32 r380, r379, r378; +mov.u32 r381, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r382, r381, 127; +shl.b32 r383, r381, 3; +and.b32 r384, r383, -1024; +add.s32 r385, r380, r384; +cvt.rn.f32.u32 f43, r382; +mul.f32 f44, f43, 0f3CC90FDB; +cos.approx.f32 f1, f44; +sin.approx.f32 f45, f44; +neg.f32 f2, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r386, r383, 1016; +add.s32 r387, r385, r386; +st.shared.v2.f32 [r387], {r1, r23}; +barrier.sync 0; +shl.b32 r388, r381, 2; +and.b32 r389, r388, 508; +sub.s32 r390, r387, r389; +ld.shared.u32 r54, [r390]; +ld.shared.u32 r55, [r390+512]; +barrier.sync 0; +st.shared.v2.f32 [r387], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r390]; +ld.shared.u32 r58, [r390+512]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r391, r381, 1, 6; +and.b32 r392, r388, 4; +add.s32 r393, r385, r392; +cvt.rn.f32.u32 f46, r391; +mul.f32 f47, f46, 0f3D490FDB; +cos.approx.f32 f7, f47; +sin.approx.f32 f48, f47; +neg.f32 f8, f48; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ 
+fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r394, r383, 1008; +add.s32 r395, r393, r394; +st.shared.u32 [r395], r53; +st.shared.u32 [r395+8], r75; +barrier.sync 0; +and.b32 r396, r388, 504; +sub.s32 r397, r395, r396; +ld.shared.u32 r106, [r397]; +ld.shared.u32 r107, [r397+512]; +barrier.sync 0; +st.shared.u32 [r395], r56; +st.shared.u32 [r395+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r397]; +ld.shared.u32 r110, [r397+512]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r398, r381, 2, 5; +and.b32 r399, r388, 12; +add.s32 r400, r385, r399; +cvt.rn.f32.u32 f49, r398; +mul.f32 f50, f49, 0f3DC90FDB; +cos.approx.f32 f13, f50; +sin.approx.f32 f51, f50; +neg.f32 f14, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r401, r383, 992; +add.s32 r402, r400, r401; +st.shared.u32 [r402], r105; +st.shared.u32 [r402+16], r127; +barrier.sync 0; +and.b32 r403, r388, 496; +sub.s32 r404, r402, r403; +ld.shared.u32 r158, [r404]; +ld.shared.u32 r159, [r404+512]; +barrier.sync 0; +st.shared.u32 [r402], r108; +st.shared.u32 [r402+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r404]; +ld.shared.u32 r162, [r404+512]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r405, r381, 3, 4; +and.b32 r406, r388, 28; +add.s32 r407, r385, 
r406; +cvt.rn.f32.u32 f52, r405; +mul.f32 f53, f52, 0f3E490FDB; +cos.approx.f32 f19, f53; +sin.approx.f32 f54, f53; +neg.f32 f20, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +barrier.sync 0; +and.b32 r408, r383, 960; +add.s32 r409, r407, r408; +st.shared.u32 [r409], r157; +st.shared.u32 [r409+32], r179; +barrier.sync 0; +and.b32 r410, r388, 480; +sub.s32 r411, r409, r410; +ld.shared.u32 r210, [r411]; +ld.shared.u32 r211, [r411+512]; +barrier.sync 0; +st.shared.u32 [r409], r160; +st.shared.u32 [r409+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r411]; +ld.shared.u32 r214, [r411+512]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r412, r381, 4, 3; +and.b32 r413, r388, 60; +add.s32 r414, r385, r413; +cvt.rn.f32.u32 f55, r412; +mul.f32 f56, f55, 0f3EC90FDB; +cos.approx.f32 f25, f56; +sin.approx.f32 f57, f56; +neg.f32 f26, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +barrier.sync 0; +and.b32 r415, r383, 896; +add.s32 r416, r414, r415; +st.shared.u32 [r416], r209; +st.shared.u32 [r416+64], r231; +barrier.sync 0; +and.b32 r417, r388, 
448; +sub.s32 r418, r416, r417; +ld.shared.u32 r262, [r418]; +ld.shared.u32 r263, [r418+512]; +barrier.sync 0; +st.shared.u32 [r416], r212; +st.shared.u32 [r416+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r418]; +ld.shared.u32 r266, [r418+512]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r419, r381, 5, 2; +and.b32 r420, r388, 124; +add.s32 r421, r385, r420; +cvt.rn.f32.u32 f58, r419; +mul.f32 f59, f58, 0f3F490FDB; +cos.approx.f32 f31, f59; +sin.approx.f32 f60, f59; +neg.f32 f32, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +barrier.sync 0; +and.b32 r422, r383, 768; +add.s32 r423, r421, r422; +st.shared.u32 [r423], r261; +st.shared.u32 [r423+128], r283; +barrier.sync 0; +and.b32 r424, r388, 384; +sub.s32 r425, r423, r424; +ld.shared.u32 r314, [r425]; +ld.shared.u32 r315, [r425+512]; +barrier.sync 0; +st.shared.u32 [r423], r264; +st.shared.u32 [r423+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r425]; +ld.shared.u32 r318, [r425+512]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r426, r381, 6, 1; +and.b32 r427, r388, 252; +add.s32 r428, r385, r427; +cvt.rn.f32.u32 f61, r426; +mul.f32 f62, f61, 0f3FC90FDB; +cos.approx.f32 f37, f62; +sin.approx.f32 f63, f62; +neg.f32 f38, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; 
+mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +fma.rn.f16x2 r335, r319, r328, r332; +} +{ +mul.f16x2 r339, r319, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r322, r328, r342; +} +barrier.sync 0; +and.b32 r429, r383, 512; +add.s32 r430, r428, r429; +st.shared.u32 [r430], r313; +st.shared.u32 [r430+256], r335; +barrier.sync 0; +and.b32 r431, r388, 256; +sub.s32 r432, r430, r431; +ld.shared.u32 r366, [r432]; +ld.shared.u32 r367, [r432+512]; +barrier.sync 0; +st.shared.u32 [r430], r316; +st.shared.u32 [r430+256], r344; +barrier.sync 0; +ld.shared.u32 r369, [r432]; +ld.shared.u32 r370, [r432+512]; +{ +add.f16x2 %0, r366, r367; +} +{ +add.f16x2 %1, r369, r370; +} +{ +sub.f16x2 %2, r366, r367; +} +{ +sub.f16x2 %3, r369, r370; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..46a0a913d51bc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_fwd.hpp.inc @@ -0,0 +1,5027 @@ +#ifndef CUFFTDX_FFT_256_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_256_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<61, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<596>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 
r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; 
+add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; 
+add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, 
f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; 
+mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 960; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+64]; +ld.shared.f32 f391, [r13+128]; +ld.shared.f32 f392, [r13+192]; +ld.shared.f32 f393, [r13+256]; +ld.shared.f32 f394, [r13+320]; +ld.shared.f32 f395, [r13+384]; +ld.shared.f32 f396, [r13+448]; +ld.shared.f32 f397, [r13+512]; +ld.shared.f32 f398, [r13+576]; +ld.shared.f32 f399, [r13+640]; +ld.shared.f32 f400, [r13+704]; +ld.shared.f32 
f401, [r13+768]; +ld.shared.f32 f402, [r13+832]; +ld.shared.f32 f403, [r13+896]; +ld.shared.f32 f404, [r13+960]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+64]; +ld.shared.f32 f407, [r13+128]; +ld.shared.f32 f408, [r13+192]; +ld.shared.f32 f409, [r13+256]; +ld.shared.f32 f410, [r13+320]; +ld.shared.f32 f411, [r13+384]; +ld.shared.f32 f412, [r13+448]; +ld.shared.f32 f413, [r13+512]; +ld.shared.f32 f414, [r13+576]; +ld.shared.f32 f415, [r13+640]; +ld.shared.f32 f416, [r13+704]; +ld.shared.f32 f417, [r13+768]; +ld.shared.f32 f418, [r13+832]; +ld.shared.f32 f419, [r13+896]; +ld.shared.f32 f420, [r13+960]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; 
+add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 
f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 %0, f462, f519; +add.f32 %1, f463, f520; +add.f32 %3, f467, f539; +add.f32 %2, f466, f537; +add.f32 %5, f471, f544; +add.f32 %4, f470, f542; +add.f32 %7, f475, f549; +add.f32 %6, f474, f547; +sub.f32 %9, f465, f521; +add.f32 %8, f464, f522; +add.f32 %11, f469, f554; +add.f32 %10, f468, f552; +add.f32 %13, f473, f558; +add.f32 %12, f472, f557; +add.f32 %15, f477, f563; +add.f32 %14, f476, f561; +sub.f32 %16, f462, f519; +sub.f32 %17, f463, f520; +sub.f32 %19, f467, f539; +sub.f32 %18, f466, f537; +sub.f32 %21, f471, f544; +sub.f32 %20, f470, f542; +sub.f32 %23, f475, f549; +sub.f32 %22, f474, f547; +add.f32 %25, f465, f521; +sub.f32 %24, f464, f522; +sub.f32 %27, f469, f554; +sub.f32 %26, f468, f552; +sub.f32 %29, f473, f558; +sub.f32 %28, f472, f557; +sub.f32 %31, f477, f563; +sub.f32 %30, f476, f561; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<62, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<381>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; 
+sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, 
f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 1984; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+256]; +ld.shared.v2.f32 {f167, f168}, [r13+512]; +ld.shared.v2.f32 {f171, f172}, [r13+768]; +ld.shared.v2.f32 {f175, f176}, [r13+1024]; +ld.shared.v2.f32 {f179, f180}, [r13+1280]; +ld.shared.v2.f32 {f183, f184}, [r13+1536]; +ld.shared.v2.f32 {f187, f188}, [r13+1792]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; 
+add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 24; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, 
f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 1536; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+256]; +ld.shared.v2.f32 {f325, f326}, [r19+512]; +ld.shared.v2.f32 {f329, f330}, [r19+768]; +ld.shared.v2.f32 {f333, f334}, [r19+1024]; +ld.shared.v2.f32 {f337, f338}, [r19+1280]; +ld.shared.v2.f32 {f341, f342}, [r19+1536]; +ld.shared.v2.f32 {f345, f346}, [r19+1792]; 
+add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f321, f337; +add.f32 f358, f322, f338; +sub.f32 f359, f321, f337; +sub.f32 f360, f322, f338; +add.f32 f361, f329, f345; +add.f32 f362, f330, f346; +sub.f32 f363, f329, f345; +sub.f32 f364, f330, f346; +add.f32 %1, f350, f354; +add.f32 %0, f349, f353; +add.f32 %3, f358, f362; +add.f32 %2, f357, f361; +sub.f32 %5, f352, f355; +add.f32 %4, f351, f356; +sub.f32 %7, f360, f363; +add.f32 %6, f359, f364; +sub.f32 %9, f350, f354; +sub.f32 %8, f349, f353; +sub.f32 %11, f358, f362; +sub.f32 %10, f357, f361; +add.f32 %13, f352, f355; +sub.f32 %12, f351, f356; +add.f32 %15, f360, f363; +sub.f32 %14, f359, f364; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<63, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<349>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, 
%36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; 
+sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 992; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+128]; +ld.shared.f32 f161, [r13+256]; +ld.shared.f32 f162, [r13+384]; +ld.shared.f32 f163, [r13+512]; +ld.shared.f32 f164, [r13+640]; +ld.shared.f32 f165, [r13+768]; +ld.shared.f32 f166, [r13+896]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+128]; +ld.shared.f32 f169, [r13+256]; +ld.shared.f32 f170, 
[r13+384]; +ld.shared.f32 f171, [r13+512]; +ld.shared.f32 f172, [r13+640]; +ld.shared.f32 f173, [r13+768]; +ld.shared.f32 f174, [r13+896]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 24; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; 
+mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 768; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 
[r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+128]; +ld.shared.f32 f303, [r20+256]; +ld.shared.f32 f304, [r20+384]; +ld.shared.f32 f305, [r20+512]; +ld.shared.f32 f306, [r20+640]; +ld.shared.f32 f307, [r20+768]; +ld.shared.f32 f308, [r20+896]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+128]; +ld.shared.f32 f311, [r20+256]; +ld.shared.f32 f312, [r20+384]; +ld.shared.f32 f313, [r20+512]; +ld.shared.f32 f314, [r20+640]; +ld.shared.f32 f315, [r20+768]; +ld.shared.f32 f316, [r20+896]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f302, f306; +add.f32 f326, f310, f314; +sub.f32 f327, f302, f306; +sub.f32 f328, f310, f314; +add.f32 f329, f304, f308; +add.f32 f330, f312, f316; +sub.f32 f331, f304, f308; +sub.f32 f332, f312, f316; +add.f32 %0, f317, f321; +add.f32 %1, f318, f322; +add.f32 %2, f325, f329; +add.f32 %3, f326, f330; +sub.f32 %5, f320, f323; +add.f32 %4, f319, f324; +sub.f32 %7, f328, f331; +add.f32 %6, f327, f332; +sub.f32 %8, f317, f321; +sub.f32 %9, f318, f322; +sub.f32 %10, f325, f329; +sub.f32 %11, f326, f330; +add.f32 %13, f320, f323; +sub.f32 %12, f319, f324; +add.f32 %15, f328, f331; +sub.f32 %14, f327, f332; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), 
"=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<64, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<769>; +.reg .b32 r<26>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f761, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f759, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f758, f761, f759; +sub.f32 f76, f761, f759; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f757, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f754, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f752, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f751, f754, f752; +sub.f32 f92, f754, f752; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f750, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f750, 0fBF3504F3; +mul.f32 f749, f93, 0f3F3504F3; +sub.f32 f99, f749, f98; +mul.f32 f100, f750, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f748, f758, f751; +sub.f32 f109, f758, f751; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f747, f757, f101; +sub.f32 f113, 
f757, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f746, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f745, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f743, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f740, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f739, f743, f740; +sub.f32 f133, f743, f740; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f738, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f736, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f734, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f733, f736, f734; +sub.f32 f149, f736, f734; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f732, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f732, 0fBF3504F3; +mul.f32 f731, f150, 0f3F3504F3; +sub.f32 f156, f731, f155; +mul.f32 f157, f732, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f730, f739, f733; +sub.f32 f166, f739, f733; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f729, f738, f158; +sub.f32 f170, f738, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f728, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f727, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f725, f167, 0f3F6C835E; +mul.f32 f726, f729, 0fBEC3EF15; +sub.f32 f181, f725, f726; +mul.f32 f182, f729, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f723, f171, 0f3F3504F3; +mul.f32 f724, f728, 
0fBF3504F3; +sub.f32 f186, f723, f724; +mul.f32 f187, f728, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f721, f175, 0f3EC3EF15; +mul.f32 f722, f727, 0fBF6C835E; +sub.f32 f191, f721, f722; +mul.f32 f192, f727, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f719, f169, 0fBEC3EF15; +mul.f32 f720, f170, 0fBF6C835E; +sub.f32 f196, f719, f720; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f717, f177, 0fBF6C835E; +mul.f32 f718, f178, 0fBEC3EF15; +sub.f32 f205, f717, f718; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f716, f747, f183; +sub.f32 f213, f747, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f715, f746, f188; +sub.f32 f217, f746, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f714, f745, f193; +sub.f32 f221, f745, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f713, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f712, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f711, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f710, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f716; +mul.f32 f244, f238, f716; +mul.f32 f246, f239, f239; +mul.f32 f709, f238, f238; +sub.f32 f247, f709, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f715; +mul.f32 f252, 
f247, f715; +mul.f32 f707, f238, f247; +mul.f32 f708, f239, f249; +sub.f32 f255, f707, f708; +mul.f32 f706, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f714; +mul.f32 f260, f255, f714; +mul.f32 f262, f239, f257; +mul.f32 f705, f238, f255; +sub.f32 f263, f705, f262; +mul.f32 f704, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f713; +mul.f32 f268, f263, f713; +mul.f32 f270, f239, f265; +mul.f32 f703, f238, f263; +sub.f32 f271, f703, f270; +mul.f32 f702, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f712; +mul.f32 f276, f271, f712; +mul.f32 f700, f238, f271; +mul.f32 f701, f239, f273; +sub.f32 f279, f700, f701; +mul.f32 f699, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f711; +mul.f32 f284, f279, f711; +mul.f32 f286, f239, f281; +mul.f32 f698, f238, f279; +sub.f32 f287, f698, f286; +mul.f32 f697, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f710; +mul.f32 f292, f287, f710; +mul.f32 f294, f239, f289; +mul.f32 f696, f238, f287; +sub.f32 f295, f696, f294; +mul.f32 f695, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f694, f748, f730; +mul.f32 f299, f297, f694; +mul.f32 f300, f295, f694; +mul.f32 f692, f238, f295; +mul.f32 f693, f239, f297; +sub.f32 f303, f692, f693; +sub.f32 f691, f106, f163; +mul.f32 f690, f295, f691; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f689, f238, f303; +sub.f32 f311, f689, f310; +mul.f32 f688, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f686, f238, f311; +mul.f32 f687, f239, f313; +sub.f32 f319, f686, f687; +mul.f32 f685, f311, f216; +mul.f32 f320, f238, 
f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f684, f238, f319; +sub.f32 f327, f684, f326; +mul.f32 f683, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f682, f238, f327; +sub.f32 f335, f682, f334; +mul.f32 f681, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f679, f238, f335; +mul.f32 f680, f239, f337; +sub.f32 f343, f679, f680; +mul.f32 f678, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f677, f238, f343; +sub.f32 f351, f677, f350; +mul.f32 f676, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f675, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 15; +sub.f32 f764, f748, f730; +mul.f32 f763, f297, f764; +mov.u32 r25, %tid.x; +shl.b32 r24, r25, 7; +barrier.sync 0; +and.b32 r11, r24, 1920; +add.s32 r12, r9, r11; +sub.f32 f768, f748, f730; +mul.f32 f767, f297, f768; +add.f32 f357, f748, f730; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +sub.f32 f766, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 15; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 15; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f676, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f706, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f704, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f702, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f699, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, 
f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f697, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f695, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f766, f300; +sub.f32 f374, f690, f767; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f688, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f685, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f683, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f681, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f678, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f675, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +mad.lo.s32 r13, r22, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+128]; +ld.shared.v2.f32 {f397, f398}, [r13+256]; +ld.shared.v2.f32 {f401, f402}, [r13+384]; +ld.shared.v2.f32 {f405, f406}, [r13+512]; +ld.shared.v2.f32 {f409, f410}, [r13+640]; +ld.shared.v2.f32 {f413, f414}, [r13+768]; +ld.shared.v2.f32 {f417, f418}, [r13+896]; +ld.shared.v2.f32 {f421, f422}, [r13+1024]; +ld.shared.v2.f32 {f425, f426}, [r13+1152]; +ld.shared.v2.f32 {f429, f430}, [r13+1280]; +ld.shared.v2.f32 {f433, f434}, [r13+1408]; +ld.shared.v2.f32 {f437, f438}, [r13+1536]; +ld.shared.v2.f32 {f441, f442}, [r13+1664]; +ld.shared.v2.f32 {f445, f446}, [r13+1792]; +ld.shared.v2.f32 {f449, f450}, [r13+1920]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f674, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f673, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f672, f674, f673; +sub.f32 f464, f674, f673; +add.f32 f465, f455, 
f460; +sub.f32 f467, f455, f460; +sub.f32 f671, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f670, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f669, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f668, f670, f669; +sub.f32 f480, f670, f669; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f667, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f665, f481, 0f3F3504F3; +mul.f32 f666, f667, 0fBF3504F3; +sub.f32 f487, f665, f666; +mul.f32 f488, f667, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f664, f672, f668; +sub.f32 f497, f672, f668; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f663, f671, f489; +sub.f32 f501, f671, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f662, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f661, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f660, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f659, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f658, f660, f659; +sub.f32 f521, f660, f659; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f657, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f656, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f655, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f654, f656, f655; +sub.f32 f537, f656, f655; 
+add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f653, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f651, f538, 0f3F3504F3; +mul.f32 f652, f653, 0fBF3504F3; +sub.f32 f544, f651, f652; +mul.f32 f545, f653, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f650, f658, f654; +sub.f32 f554, f658, f654; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f649, f657, f546; +sub.f32 f558, f657, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f648, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f647, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f649, 0fBEC3EF15; +mul.f32 f646, f555, 0f3F6C835E; +sub.f32 f569, f646, f568; +mul.f32 f570, f649, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f648, 0fBF3504F3; +mul.f32 f645, f559, 0f3F3504F3; +sub.f32 f574, f645, f573; +mul.f32 f575, f648, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f643, f563, 0f3EC3EF15; +mul.f32 f644, f647, 0fBF6C835E; +sub.f32 f579, f643, f644; +mul.f32 f580, f647, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f641, f557, 0fBEC3EF15; +mul.f32 f642, f558, 0fBF6C835E; +sub.f32 f584, f641, f642; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f640, f565, 0fBF6C835E; +sub.f32 f593, f640, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 %1, f664, f650; +add.f32 %0, f494, f551; +add.f32 %2, f498, f569; +add.f32 %3, f663, f571; +add.f32 %4, f502, f574; +add.f32 %5, f662, f576; +add.f32 %6, f506, f579; +add.f32 %7, f661, f581; +sub.f32 
%9, f497, f553; +add.f32 %8, f496, f554; +add.f32 %11, f501, f586; +add.f32 %10, f500, f584; +add.f32 %13, f505, f590; +add.f32 %12, f504, f589; +add.f32 %14, f508, f593; +add.f32 %15, f509, f595; +sub.f32 %17, f664, f650; +sub.f32 %16, f494, f551; +sub.f32 %19, f663, f571; +sub.f32 %18, f498, f569; +sub.f32 %21, f662, f576; +sub.f32 %20, f502, f574; +sub.f32 %23, f661, f581; +sub.f32 %22, f506, f579; +add.f32 %25, f497, f553; +sub.f32 %24, f496, f554; +sub.f32 %27, f501, f586; +sub.f32 %26, f500, f584; +sub.f32 %29, f505, f590; +sub.f32 %28, f504, f589; +sub.f32 %31, f509, f595; +sub.f32 %30, f508, f593; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<65, float, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1554>; +.reg .b32 r<24>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1549, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1547, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1546, f1549, f1547; +sub.f32 f140, f1549, f1547; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1545, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1542, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1540, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1539, f1542, f1540; +sub.f32 f156, f1542, f1540; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1538, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1538, 0fBF3504F3; +mul.f32 f1537, f157, 0f3F3504F3; +sub.f32 f163, f1537, f162; +mul.f32 f164, f1538, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1536, f1546, f1539; +sub.f32 f173, f1546, f1539; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1535, f1545, f165; +sub.f32 f177, f1545, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1534, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1533, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1531, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1528, %137, %136; 
+sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1527, f1531, f1528; +sub.f32 f197, f1531, f1528; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1526, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1524, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1522, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1521, f1524, f1522; +sub.f32 f213, f1524, f1522; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1520, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1520, 0fBF3504F3; +mul.f32 f1519, f214, 0f3F3504F3; +sub.f32 f220, f1519, f219; +mul.f32 f221, f1520, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1518, f1527, f1521; +sub.f32 f230, f1527, f1521; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1517, f1526, f222; +sub.f32 f234, f1526, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1516, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1515, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1513, f231, 0f3F6C835E; +mul.f32 f1514, f1517, 0fBEC3EF15; +sub.f32 f245, f1513, f1514; +mul.f32 f246, f1517, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1511, f235, 0f3F3504F3; +mul.f32 f1512, f1516, 0fBF3504F3; +sub.f32 f250, f1511, f1512; +mul.f32 f251, f1516, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1509, f239, 0f3EC3EF15; +mul.f32 f1510, f1515, 0fBF6C835E; +sub.f32 f255, f1509, f1510; +mul.f32 f256, f1515, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1507, f233, 0fBEC3EF15; +mul.f32 f1508, f234, 
0fBF6C835E; +sub.f32 f260, f1507, f1508; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1505, f241, 0fBF6C835E; +mul.f32 f1506, f242, 0fBEC3EF15; +sub.f32 f269, f1505, f1506; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1504, f1536, f1518; +sub.f32 f275, f1536, f1518; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1503, f1535, f247; +sub.f32 f279, f1535, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1502, f1534, f252; +sub.f32 f283, f1534, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1501, f1533, f257; +sub.f32 f287, f1533, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1500, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1499, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1498, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1497, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1494, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1492, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1491, f1494, f1492; +sub.f32 f315, f1494, f1492; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1490, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1488, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1485, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; 
+add.f32 f1484, f1488, f1485; +sub.f32 f331, f1488, f1485; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1483, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1481, f332, 0f3F3504F3; +mul.f32 f1482, f1483, 0fBF3504F3; +sub.f32 f338, f1481, f1482; +mul.f32 f339, f1483, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1480, f1491, f1484; +sub.f32 f348, f1491, f1484; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1479, f1490, f340; +sub.f32 f352, f1490, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1478, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1477, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1475, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1473, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1472, f1475, f1473; +sub.f32 f372, f1475, f1473; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1471, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1468, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1467, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1466, f1468, f1467; +sub.f32 f388, f1468, f1467; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1465, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1463, f389, 0f3F3504F3; +mul.f32 f1464, f1465, 0fBF3504F3; +sub.f32 f395, f1463, f1464; +mul.f32 f396, f1465, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 
0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1462, f1472, f1466; +sub.f32 f405, f1472, f1466; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1461, f1471, f397; +sub.f32 f409, f1471, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1460, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1459, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1461, 0fBEC3EF15; +mul.f32 f1458, f406, 0f3F6C835E; +sub.f32 f420, f1458, f419; +mul.f32 f421, f1461, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1460, 0fBF3504F3; +mul.f32 f1457, f410, 0f3F3504F3; +sub.f32 f425, f1457, f424; +mul.f32 f426, f1460, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1455, f414, 0f3EC3EF15; +mul.f32 f1456, f1459, 0fBF6C835E; +sub.f32 f430, f1455, f1456; +mul.f32 f431, f1459, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1453, f408, 0fBEC3EF15; +mul.f32 f1454, f409, 0fBF6C835E; +sub.f32 f435, f1453, f1454; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1452, f416, 0fBF6C835E; +sub.f32 f444, f1452, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1451, f1480, f1462; +sub.f32 f450, f1480, f1462; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1450, f1479, f422; +sub.f32 f454, f1479, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1449, f1478, f427; +sub.f32 f458, f1478, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1448, f1477, f432; +sub.f32 f462, f1477, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 
f1447, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1446, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1445, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1444, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1450, 0fBE47C5C2; +mul.f32 f1443, f451, 0f3F7B14BE; +sub.f32 f481, f1443, f480; +mul.f32 f482, f1450, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1449, 0fBEC3EF15; +mul.f32 f1442, f455, 0f3F6C835E; +sub.f32 f486, f1442, f485; +mul.f32 f487, f1449, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1448, 0fBF0E39DA; +mul.f32 f1441, f459, 0f3F54DB31; +sub.f32 f491, f1441, f490; +mul.f32 f492, f1448, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1447, 0fBF3504F3; +mul.f32 f1440, f463, 0f3F3504F3; +sub.f32 f496, f1440, f495; +mul.f32 f497, f1447, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1438, f467, 0f3F0E39DA; +mul.f32 f1439, f1446, 0fBF54DB31; +sub.f32 f501, f1438, f1439; +mul.f32 f502, f1446, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1436, f471, 0f3EC3EF15; +mul.f32 f1437, f1445, 0fBF6C835E; +sub.f32 f506, f1436, f1437; +mul.f32 f507, f1445, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1434, f475, 0f3E47C5C2; +mul.f32 f1435, f1444, 0fBF7B14BE; +sub.f32 f511, f1434, f1435; +mul.f32 f512, f1444, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1432, f453, 0fBE47C5C2; +mul.f32 f1433, f454, 0fBF7B14BE; +sub.f32 f516, f1432, f1433; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1431, f457, 0fBEC3EF15; +sub.f32 f521, f1431, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1430, f461, 0fBF0E39DA; 
+sub.f32 f526, f1430, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1428, f469, 0fBF54DB31; +mul.f32 f1429, f470, 0fBF0E39DA; +sub.f32 f535, f1428, f1429; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1427, f473, 0fBF6C835E; +sub.f32 f540, f1427, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1426, f477, 0fBF7B14BE; +sub.f32 f545, f1426, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1425, f1503, f483; +sub.f32 f553, f1503, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1424, f1502, f488; +sub.f32 f557, f1502, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1423, f1501, f493; +sub.f32 f561, f1501, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1422, f1500, f498; +sub.f32 f565, f1500, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f1421, f1499, f503; +sub.f32 f569, f1499, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f1420, f1498, f508; +sub.f32 f573, f1498, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f1419, f1497, f513; +sub.f32 f577, f1497, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f1418, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f1417, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f1416, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f1415, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, 
f531; +add.f32 f1414, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1413, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1412, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1411, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f1425; +mul.f32 f616, f610, f1425; +mul.f32 f618, f611, f611; +mul.f32 f1410, f610, f610; +sub.f32 f619, f1410, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f1424; +mul.f32 f624, f619, f1424; +mul.f32 f626, f611, f621; +mul.f32 f1409, f610, f619; +sub.f32 f627, f1409, f626; +mul.f32 f1408, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f1423; +mul.f32 f632, f627, f1423; +mul.f32 f1406, f610, f627; +mul.f32 f1407, f611, f629; +sub.f32 f635, f1406, f1407; +mul.f32 f1405, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f1422; +mul.f32 f640, f635, f1422; +mul.f32 f642, f611, f637; +mul.f32 f1404, f610, f635; +sub.f32 f643, f1404, f642; +mul.f32 f1403, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f1421; +mul.f32 f648, f643, f1421; +mul.f32 f1401, f610, f643; +mul.f32 f1402, f611, f645; +sub.f32 f651, f1401, f1402; +mul.f32 f1400, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f1420; +mul.f32 f656, f651, f1420; +mul.f32 f658, f611, f653; +mul.f32 f1399, f610, f651; +sub.f32 f659, f1399, f658; +mul.f32 f1398, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, 
f660; +mul.f32 f663, f661, f1419; +mul.f32 f664, f659, f1419; +mul.f32 f666, f611, f661; +mul.f32 f1397, f610, f659; +sub.f32 f667, f1397, f666; +mul.f32 f1396, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f1418; +mul.f32 f672, f667, f1418; +mul.f32 f1394, f610, f667; +mul.f32 f1395, f611, f669; +sub.f32 f675, f1394, f1395; +mul.f32 f1393, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f1417; +mul.f32 f680, f675, f1417; +mul.f32 f682, f611, f677; +mul.f32 f1392, f610, f675; +sub.f32 f683, f1392, f682; +mul.f32 f1391, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f1416; +mul.f32 f688, f683, f1416; +mul.f32 f690, f611, f685; +mul.f32 f1390, f610, f683; +sub.f32 f691, f1390, f690; +mul.f32 f1389, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f1415; +mul.f32 f696, f691, f1415; +mul.f32 f1387, f610, f691; +mul.f32 f1388, f611, f693; +sub.f32 f699, f1387, f1388; +mul.f32 f1386, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f1414; +mul.f32 f704, f699, f1414; +mul.f32 f706, f611, f701; +mul.f32 f1385, f610, f699; +sub.f32 f707, f1385, f706; +mul.f32 f1384, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f1413; +mul.f32 f712, f707, f1413; +mul.f32 f1382, f610, f707; +mul.f32 f1383, f611, f709; +sub.f32 f715, f1382, f1383; +mul.f32 f1381, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f1412; +mul.f32 f720, f715, f1412; +mul.f32 f722, f611, f717; +mul.f32 f1380, f610, f715; +sub.f32 f723, f1380, f722; +mul.f32 f1379, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f1411; +mul.f32 f728, f723, f1411; +mul.f32 f730, f611, f725; +mul.f32 f1378, f610, f723; +sub.f32 f731, f1378, f730; +mul.f32 
f1377, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1376, f1504, f1451; +mul.f32 f735, f733, f1376; +mul.f32 f736, f731, f1376; +mul.f32 f1374, f610, f731; +mul.f32 f1375, f611, f733; +sub.f32 f739, f1374, f1375; +sub.f32 f1373, f272, f447; +mul.f32 f1372, f731, f1373; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1371, f610, f739; +sub.f32 f747, f1371, f746; +mul.f32 f1370, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1369, f610, f747; +sub.f32 f755, f1369, f754; +mul.f32 f1368, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f1366, f610, f755; +mul.f32 f1367, f611, f757; +sub.f32 f763, f1366, f1367; +mul.f32 f1365, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1364, f610, f763; +sub.f32 f771, f1364, f770; +mul.f32 f1363, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f1361, f610, f771; +mul.f32 f1362, f611, f773; +sub.f32 f779, f1361, f1362; +mul.f32 f1360, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1359, f610, f779; +sub.f32 f787, f1359, f786; +mul.f32 f1358, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1357, f610, f787; +sub.f32 f795, f1357, f794; +mul.f32 f1356, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; 
+mul.f32 f800, f795, f581; +mul.f32 f1354, f610, f795; +mul.f32 f1355, f611, f797; +sub.f32 f803, f1354, f1355; +mul.f32 f1353, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1352, f610, f803; +sub.f32 f811, f1352, f810; +mul.f32 f1351, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1350, f610, f811; +sub.f32 f819, f1350, f818; +mul.f32 f1349, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f1347, f610, f819; +mul.f32 f1348, f611, f821; +sub.f32 f827, f1347, f1348; +mul.f32 f1346, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1345, f610, f827; +sub.f32 f835, f1345, f834; +mul.f32 f1344, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f1342, f610, f835; +mul.f32 f1343, f611, f837; +sub.f32 f843, f1342, f1343; +mul.f32 f1341, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1340, f610, f843; +sub.f32 f851, f1340, f850; +mul.f32 f1339, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f1338, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r17, %tid.x; +shl.b32 r16, r17, 8; +barrier.sync 0; +and.b32 r11, r16, 1792; +add.s32 r12, r9, r11; +sub.f32 f1552, f1504, f1451; +mul.f32 f1551, f733, f1552; +add.f32 f857, f1504, f1451; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +sub.f32 f1553, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r21, 
%tid.x; +and.b32 r20, r21, 7; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 7; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f1339, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f1408, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f1405, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f1403, f639; +sub.f32 f867, f1400, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f1398, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f1396, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f1393, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f1391, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f1389, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f1386, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f1384, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f1381, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f1379, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f1377, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f1553, f736; +sub.f32 f890, f1372, f1551; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f1370, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f1368, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f1365, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f1363, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f1360, f775; +st.shared.v4.f32 [r12+160], {f898, 
f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f1358, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f1356, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f1353, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f1351, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f1349, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f1346, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f1344, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f1341, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f1338, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +mad.lo.s32 r13, r22, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+64]; +ld.shared.v2.f32 {f929, f930}, [r13+128]; +ld.shared.v2.f32 {f933, f934}, [r13+192]; +ld.shared.v2.f32 {f937, f938}, [r13+256]; +ld.shared.v2.f32 {f941, f942}, [r13+320]; +ld.shared.v2.f32 {f945, f946}, [r13+384]; +ld.shared.v2.f32 {f949, f950}, [r13+448]; +ld.shared.v2.f32 {f953, f954}, [r13+512]; +ld.shared.v2.f32 {f957, f958}, [r13+576]; +ld.shared.v2.f32 {f961, f962}, [r13+640]; +ld.shared.v2.f32 {f965, f966}, [r13+704]; +ld.shared.v2.f32 {f969, f970}, [r13+768]; +ld.shared.v2.f32 {f973, f974}, [r13+832]; +ld.shared.v2.f32 {f977, f978}, [r13+896]; +ld.shared.v2.f32 {f981, f982}, [r13+960]; +ld.shared.v2.f32 {f985, f986}, [r13+1024]; +ld.shared.v2.f32 {f989, f990}, [r13+1088]; +ld.shared.v2.f32 {f993, f994}, [r13+1152]; +ld.shared.v2.f32 {f997, f998}, [r13+1216]; +ld.shared.v2.f32 {f1001, f1002}, [r13+1280]; +ld.shared.v2.f32 {f1005, f1006}, [r13+1344]; +ld.shared.v2.f32 {f1009, f1010}, [r13+1408]; +ld.shared.v2.f32 
{f1013, f1014}, [r13+1472]; +ld.shared.v2.f32 {f1017, f1018}, [r13+1536]; +ld.shared.v2.f32 {f1021, f1022}, [r13+1600]; +ld.shared.v2.f32 {f1025, f1026}, [r13+1664]; +ld.shared.v2.f32 {f1029, f1030}, [r13+1728]; +ld.shared.v2.f32 {f1033, f1034}, [r13+1792]; +ld.shared.v2.f32 {f1037, f1038}, [r13+1856]; +ld.shared.v2.f32 {f1041, f1042}, [r13+1920]; +ld.shared.v2.f32 {f1045, f1046}, [r13+1984]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1337, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1336, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1335, f1337, f1336; +sub.f32 f1060, f1337, f1336; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f1334, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f1333, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1332, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1331, f1333, f1332; +sub.f32 f1076, f1333, f1332; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f1330, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f1330, 0fBF3504F3; +mul.f32 f1329, f1077, 0f3F3504F3; +sub.f32 f1083, f1329, f1082; +mul.f32 f1084, f1330, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f925, f989; +sub.f32 f1092, f925, f989; +add.f32 f1328, f926, f990; +sub.f32 f1093, f926, f990; +add.f32 f1094, f957, f1021; +sub.f32 f1096, f957, f1021; +add.f32 f1327, f958, f1022; +sub.f32 f1097, f958, f1022; +add.f32 f1098, f1090, f1094; +sub.f32 f1100, f1090, f1094; +add.f32 f1326, f1328, f1327; +sub.f32 f1101, f1328, f1327; 
+add.f32 f1102, f1092, f1097; +sub.f32 f1104, f1092, f1097; +sub.f32 f1325, f1093, f1096; +add.f32 f1105, f1093, f1096; +add.f32 f1106, f941, f1005; +sub.f32 f1108, f941, f1005; +add.f32 f1324, f942, f1006; +sub.f32 f1109, f942, f1006; +add.f32 f1110, f973, f1037; +sub.f32 f1112, f973, f1037; +add.f32 f1323, f974, f1038; +sub.f32 f1113, f974, f1038; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f1322, f1324, f1323; +sub.f32 f1117, f1324, f1323; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f1321, f1109, f1112; +add.f32 f1121, f1109, f1112; +mul.f32 f1123, f1321, 0fBF3504F3; +mul.f32 f1320, f1118, 0f3F3504F3; +sub.f32 f1124, f1320, f1123; +mul.f32 f1125, f1321, 0f3F3504F3; +fma.rn.f32 f1126, f1118, 0fBF3504F3, f1125; +mul.f32 f1127, f1120, 0fBF3504F3; +mul.f32 f1128, f1121, 0fBF3504F3; +sub.f32 f1129, f1127, f1128; +add.f32 f1130, f1127, f1128; +add.f32 f1131, f929, f993; +sub.f32 f1133, f929, f993; +add.f32 f1319, f930, f994; +sub.f32 f1134, f930, f994; +add.f32 f1135, f961, f1025; +sub.f32 f1137, f961, f1025; +add.f32 f1318, f962, f1026; +sub.f32 f1138, f962, f1026; +add.f32 f1139, f1131, f1135; +sub.f32 f1141, f1131, f1135; +add.f32 f1317, f1319, f1318; +sub.f32 f1142, f1319, f1318; +add.f32 f1143, f1133, f1138; +sub.f32 f1145, f1133, f1138; +sub.f32 f1316, f1134, f1137; +add.f32 f1146, f1134, f1137; +add.f32 f1147, f945, f1009; +sub.f32 f1149, f945, f1009; +add.f32 f1315, f946, f1010; +sub.f32 f1150, f946, f1010; +add.f32 f1151, f977, f1041; +sub.f32 f1153, f977, f1041; +add.f32 f1314, f978, f1042; +sub.f32 f1154, f978, f1042; +add.f32 f1155, f1147, f1151; +sub.f32 f1157, f1147, f1151; +add.f32 f1313, f1315, f1314; +sub.f32 f1158, f1315, f1314; +add.f32 f1159, f1149, f1154; +sub.f32 f1161, f1149, f1154; +sub.f32 f1312, f1150, f1153; +add.f32 f1162, f1150, f1153; +mul.f32 f1310, f1159, 0f3F3504F3; +mul.f32 f1311, f1312, 0fBF3504F3; +sub.f32 f1165, f1310, f1311; +mul.f32 f1166, f1312, 0f3F3504F3; +fma.rn.f32 f1167, 
f1159, 0fBF3504F3, f1166; +mul.f32 f1168, f1161, 0fBF3504F3; +mul.f32 f1169, f1162, 0fBF3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +add.f32 f1172, f933, f997; +sub.f32 f1174, f933, f997; +add.f32 f1309, f934, f998; +sub.f32 f1175, f934, f998; +add.f32 f1176, f965, f1029; +sub.f32 f1178, f965, f1029; +add.f32 f1308, f966, f1030; +sub.f32 f1179, f966, f1030; +add.f32 f1180, f1172, f1176; +sub.f32 f1182, f1172, f1176; +add.f32 f1307, f1309, f1308; +sub.f32 f1183, f1309, f1308; +add.f32 f1184, f1174, f1179; +sub.f32 f1186, f1174, f1179; +sub.f32 f1306, f1175, f1178; +add.f32 f1187, f1175, f1178; +add.f32 f1188, f949, f1013; +sub.f32 f1190, f949, f1013; +add.f32 f1305, f950, f1014; +sub.f32 f1191, f950, f1014; +add.f32 f1192, f981, f1045; +sub.f32 f1194, f981, f1045; +add.f32 f1304, f982, f1046; +sub.f32 f1195, f982, f1046; +add.f32 f1196, f1188, f1192; +sub.f32 f1198, f1188, f1192; +add.f32 f1303, f1305, f1304; +sub.f32 f1199, f1305, f1304; +add.f32 f1200, f1190, f1195; +sub.f32 f1202, f1190, f1195; +sub.f32 f1302, f1191, f1194; +add.f32 f1203, f1191, f1194; +mul.f32 f1205, f1302, 0fBF3504F3; +mul.f32 f1301, f1200, 0f3F3504F3; +sub.f32 f1206, f1301, f1205; +mul.f32 f1207, f1302, 0f3F3504F3; +fma.rn.f32 f1208, f1200, 0fBF3504F3, f1207; +mul.f32 f1209, f1202, 0fBF3504F3; +mul.f32 f1210, f1203, 0fBF3504F3; +sub.f32 f1211, f1209, f1210; +add.f32 f1212, f1209, f1210; +add.f32 %1, f1335, f1331; +add.f32 %0, f1057, f1073; +add.f32 %3, f1326, f1322; +add.f32 %2, f1098, f1114; +add.f32 %5, f1317, f1313; +add.f32 %4, f1139, f1155; +add.f32 %6, f1180, f1196; +add.f32 %7, f1307, f1303; +add.f32 %8, f1061, f1083; +add.f32 %9, f1334, f1085; +add.f32 %10, f1102, f1124; +add.f32 %11, f1325, f1126; +add.f32 %13, f1316, f1167; +add.f32 %12, f1143, f1165; +add.f32 %15, f1306, f1208; +add.f32 %14, f1184, f1206; +sub.f32 %17, f1060, f1075; +add.f32 %16, f1059, f1076; +add.f32 %18, f1100, f1117; +sub.f32 %19, f1101, f1116; +add.f32 %20, f1141, f1158; +sub.f32 %21, 
f1142, f1157; +add.f32 %22, f1182, f1199; +sub.f32 %23, f1183, f1198; +add.f32 %24, f1063, f1088; +add.f32 %25, f1064, f1089; +add.f32 %27, f1105, f1130; +add.f32 %26, f1104, f1129; +add.f32 %29, f1146, f1171; +add.f32 %28, f1145, f1170; +add.f32 %30, f1186, f1211; +add.f32 %31, f1187, f1212; +sub.f32 %33, f1335, f1331; +sub.f32 %32, f1057, f1073; +sub.f32 %35, f1326, f1322; +sub.f32 %34, f1098, f1114; +sub.f32 %37, f1317, f1313; +sub.f32 %36, f1139, f1155; +sub.f32 %39, f1307, f1303; +sub.f32 %38, f1180, f1196; +sub.f32 %41, f1334, f1085; +sub.f32 %40, f1061, f1083; +sub.f32 %43, f1325, f1126; +sub.f32 %42, f1102, f1124; +sub.f32 %45, f1316, f1167; +sub.f32 %44, f1143, f1165; +sub.f32 %47, f1306, f1208; +sub.f32 %46, f1184, f1206; +add.f32 %49, f1060, f1075; +sub.f32 %48, f1059, f1076; +add.f32 %51, f1101, f1116; +sub.f32 %50, f1100, f1117; +add.f32 %53, f1142, f1157; +sub.f32 %52, f1141, f1158; +add.f32 %55, f1183, f1198; +sub.f32 %54, f1182, f1199; +sub.f32 %57, f1064, f1089; +sub.f32 %56, f1063, f1088; +sub.f32 %59, f1105, f1130; +sub.f32 %58, f1104, f1129; +sub.f32 %61, f1146, f1171; +sub.f32 %60, f1145, f1170; +sub.f32 %63, f1187, f1212; +sub.f32 %62, f1186, f1211; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), 
"=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + +/* cufftdx_private_function<66, float, 1>: inline-PTX routine that reads the four complex values rmem[0..3] and writes its results back to the same four slots (outputs %0..%7). Flow of the asm: an add/sub butterfly over the four inputs, then three exchange stages, then a last butterfly whose results are the outputs. Each exchange stage loads one complex factor w from a global table (lut_sp_4_256 at index tid.x & 63, then lut_sp_4_64 at bits 2..5 of tid.x, then lut_sp_4_16 at bits 4..5 of tid.x), multiplies three of the four butterfly results by w, w*w and w*w*w, stores all four (x, y) pairs to shared memory and loads four pairs back at offsets 0, 512, 1024 and 1536 bytes. The shared window starts at smem, offset by (tid.y << 11) and by ((tid.x << 5) & -2048); barrier.sync 0 is issued before and after the stores of every stage (six barriers in all). Input operands %15 and %18 duplicate rmem[1].y and rmem[2].y and are not referenced by the asm text. NOTE(review): the shape (tid.x & 63 lanes, 4 values each, tables named 4_256 / 4_64 / 4_16) looks like a 256-point radix-4 FFT; confirm against the generator. NOTE(review): no clobber list is given although the asm stores to shared memory; confirm that relying on volatile alone is intended. */ +template<> __forceinline__ __device__ void cufftdx_private_function<66, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<216>; +.reg .b32 r<28>; +.reg .b64 rd<12>; 
+mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 2016; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+512]; +ld.shared.v2.f32 {f70, f71}, [r13+1024]; +ld.shared.v2.f32 {f74, f75}, [r13+1536]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; 
+sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1920; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+512]; +ld.shared.v2.f32 {f131, f132}, [r20+1024]; +ld.shared.v2.f32 {f135, f136}, [r20+1536]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 48; +bfe.u32 r22, r5, 4, 2; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 
f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 1536; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; +st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+512]; +ld.shared.v2.f32 {f192, f193}, [r27+1024]; +ld.shared.v2.f32 {f196, f197}, [r27+1536]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +add.f32 %1, f201, f205; +add.f32 %0, f200, f204; +sub.f32 %3, f203, f206; +add.f32 %2, f202, f207; +sub.f32 %5, f201, f205; +sub.f32 %4, f200, f204; +add.f32 %7, f203, f206; +sub.f32 %6, f202, f207; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), 
"f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + +/* cufftdx_private_function<67, float, 1>: inline-PTX routine that reads the four complex values rmem[0..3] and writes its results back to the same four slots (outputs %0..%7). Flow of the asm: an add/sub butterfly over the four inputs, then three exchange stages, then a last butterfly whose results are the outputs. Each exchange stage loads one complex factor w from a global table (lut_sp_4_256 at index tid.x & 63, then lut_sp_4_64 at bits 2..5 of tid.x, then lut_sp_4_16 at bits 4..5 of tid.x) and multiplies three of the four butterfly results by w, w*w and w*w*w. The exchange is done one component at a time: the four x (real) components are stored to shared memory and four floats are loaded back at offsets 0, 256, 512 and 768 bytes, and only after that are the four y (imaginary) components stored to the same addresses and loaded back the same way. The shared window starts at smem, offset by (tid.y << 10) and by ((tid.x << 4) & -1024); barrier.sync 0 separates every store phase from the neighbouring loads (twelve barriers in all). Input operands %15 and %18 duplicate rmem[1].y and rmem[2].y and are not referenced by the asm text. NOTE(review): the shape (tid.x & 63 lanes, 4 values each, tables named 4_256 / 4_64 / 4_16) looks like a 256-point radix-4 FFT that trades extra barriers for a smaller shared buffer; confirm against the generator. NOTE(review): no clobber list is given although the asm stores to shared memory; confirm that relying on volatile alone is intended. */ +template<> __forceinline__ __device__ void cufftdx_private_function<67, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<192>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1008; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+256]; +ld.shared.f32 f64, [r13+512]; +ld.shared.f32 f65, [r13+768]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; 
+barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+256]; +ld.shared.f32 f68, [r13+512]; +ld.shared.f32 f69, [r13+768]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 960; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+256]; +ld.shared.f32 f117, [r21+512]; +ld.shared.f32 f118, [r21+768]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+256]; +ld.shared.f32 f121, 
[r21+512]; +ld.shared.f32 f122, [r21+768]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 48; +bfe.u32 r23, r5, 4, 2; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 768; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+256]; +ld.shared.f32 f170, [r28+512]; +ld.shared.f32 f171, [r28+768]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+256]; +ld.shared.f32 f174, 
[r28+512]; +ld.shared.f32 f175, [r28+768]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 %0, f176, f180; +add.f32 %1, f177, f181; +sub.f32 %3, f179, f182; +add.f32 %2, f178, f183; +sub.f32 %4, f176, f180; +sub.f32 %5, f177, f181; +add.f32 %7, f179, f182; +sub.f32 %6, f178, f183; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<68, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1503>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1501, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1499, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1498, f1501, f1499; +sub.f32 f140, f1501, f1499; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1497, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1494, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1492, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1491, f1494, f1492; +sub.f32 f156, f1494, f1492; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1490, f148, f151; +add.f32 f160, f148, f151; 
+mul.f32 f162, f1490, 0fBF3504F3; +mul.f32 f1489, f157, 0f3F3504F3; +sub.f32 f163, f1489, f162; +mul.f32 f164, f1490, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1488, f1498, f1491; +sub.f32 f173, f1498, f1491; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1487, f1497, f165; +sub.f32 f177, f1497, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1486, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1485, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1483, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1480, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1479, f1483, f1480; +sub.f32 f197, f1483, f1480; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1478, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1476, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1474, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1473, f1476, f1474; +sub.f32 f213, f1476, f1474; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1472, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1472, 0fBF3504F3; +mul.f32 f1471, f214, 0f3F3504F3; +sub.f32 f220, f1471, f219; +mul.f32 f221, f1472, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1470, f1479, f1473; +sub.f32 f230, 
f1479, f1473; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1469, f1478, f222; +sub.f32 f234, f1478, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1468, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1467, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1465, f231, 0f3F6C835E; +mul.f32 f1466, f1469, 0fBEC3EF15; +sub.f32 f245, f1465, f1466; +mul.f32 f246, f1469, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1463, f235, 0f3F3504F3; +mul.f32 f1464, f1468, 0fBF3504F3; +sub.f32 f250, f1463, f1464; +mul.f32 f251, f1468, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1461, f239, 0f3EC3EF15; +mul.f32 f1462, f1467, 0fBF6C835E; +sub.f32 f255, f1461, f1462; +mul.f32 f256, f1467, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1459, f233, 0fBEC3EF15; +mul.f32 f1460, f234, 0fBF6C835E; +sub.f32 f260, f1459, f1460; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1457, f241, 0fBF6C835E; +mul.f32 f1458, f242, 0fBEC3EF15; +sub.f32 f269, f1457, f1458; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1456, f1488, f1470; +sub.f32 f275, f1488, f1470; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1455, f1487, f247; +sub.f32 f279, f1487, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1454, f1486, f252; +sub.f32 f283, f1486, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1453, f1485, f257; +sub.f32 f287, f1485, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1452, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1451, f177, f262; +sub.f32 f295, f177, f262; 
+add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1450, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1449, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1446, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1444, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1443, f1446, f1444; +sub.f32 f315, f1446, f1444; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1442, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1440, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1437, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1436, f1440, f1437; +sub.f32 f331, f1440, f1437; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1435, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1433, f332, 0f3F3504F3; +mul.f32 f1434, f1435, 0fBF3504F3; +sub.f32 f338, f1433, f1434; +mul.f32 f339, f1435, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1432, f1443, f1436; +sub.f32 f348, f1443, f1436; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1431, f1442, f340; +sub.f32 f352, f1442, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1430, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1429, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1427, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1425, 
%147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1424, f1427, f1425; +sub.f32 f372, f1427, f1425; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1423, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1420, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1419, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1418, f1420, f1419; +sub.f32 f388, f1420, f1419; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1417, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1415, f389, 0f3F3504F3; +mul.f32 f1416, f1417, 0fBF3504F3; +sub.f32 f395, f1415, f1416; +mul.f32 f396, f1417, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1414, f1424, f1418; +sub.f32 f405, f1424, f1418; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1413, f1423, f397; +sub.f32 f409, f1423, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1412, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1411, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1413, 0fBEC3EF15; +mul.f32 f1410, f406, 0f3F6C835E; +sub.f32 f420, f1410, f419; +mul.f32 f421, f1413, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1412, 0fBF3504F3; +mul.f32 f1409, f410, 0f3F3504F3; +sub.f32 f425, f1409, f424; +mul.f32 f426, f1412, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1407, f414, 0f3EC3EF15; +mul.f32 f1408, f1411, 0fBF6C835E; +sub.f32 f430, f1407, f1408; +mul.f32 f431, f1411, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1405, f408, 0fBEC3EF15; +mul.f32 f1406, 
f409, 0fBF6C835E; +sub.f32 f435, f1405, f1406; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1404, f416, 0fBF6C835E; +sub.f32 f444, f1404, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1403, f1432, f1414; +sub.f32 f450, f1432, f1414; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1402, f1431, f422; +sub.f32 f454, f1431, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1401, f1430, f427; +sub.f32 f458, f1430, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1400, f1429, f432; +sub.f32 f462, f1429, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1399, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1398, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1397, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1396, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1402, 0fBE47C5C2; +mul.f32 f1395, f451, 0f3F7B14BE; +sub.f32 f481, f1395, f480; +mul.f32 f482, f1402, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1401, 0fBEC3EF15; +mul.f32 f1394, f455, 0f3F6C835E; +sub.f32 f486, f1394, f485; +mul.f32 f487, f1401, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1400, 0fBF0E39DA; +mul.f32 f1393, f459, 0f3F54DB31; +sub.f32 f491, f1393, f490; +mul.f32 f492, f1400, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1399, 0fBF3504F3; +mul.f32 f1392, f463, 0f3F3504F3; +sub.f32 f496, f1392, f495; +mul.f32 f497, f1399, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1390, 
f467, 0f3F0E39DA; +mul.f32 f1391, f1398, 0fBF54DB31; +sub.f32 f501, f1390, f1391; +mul.f32 f502, f1398, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1388, f471, 0f3EC3EF15; +mul.f32 f1389, f1397, 0fBF6C835E; +sub.f32 f506, f1388, f1389; +mul.f32 f507, f1397, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1386, f475, 0f3E47C5C2; +mul.f32 f1387, f1396, 0fBF7B14BE; +sub.f32 f511, f1386, f1387; +mul.f32 f512, f1396, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1384, f453, 0fBE47C5C2; +mul.f32 f1385, f454, 0fBF7B14BE; +sub.f32 f516, f1384, f1385; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1383, f457, 0fBEC3EF15; +sub.f32 f521, f1383, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1382, f461, 0fBF0E39DA; +sub.f32 f526, f1382, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1380, f469, 0fBF54DB31; +mul.f32 f1381, f470, 0fBF0E39DA; +sub.f32 f535, f1380, f1381; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1379, f473, 0fBF6C835E; +sub.f32 f540, f1379, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1378, f477, 0fBF7B14BE; +sub.f32 f545, f1378, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1377, f1456, f1403; +sub.f32 f551, f1456, f1403; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1376, f1455, f483; +sub.f32 f555, f1455, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1375, f1454, f488; +sub.f32 f559, f1454, f488; +add.f32 f560, 
f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1374, f1453, f493; +sub.f32 f563, f1453, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1373, f1452, f498; +sub.f32 f567, f1452, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f1372, f1451, f503; +sub.f32 f571, f1451, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f1371, f1450, f508; +sub.f32 f575, f1450, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f1370, f1449, f513; +sub.f32 f579, f1449, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f1369, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f1368, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f1367, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f1366, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f1365, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1364, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1363, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1362, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f1376; +mul.f32 f1361, f612, f552; +sub.f32 f618, f1361, f617; +mul.f32 f619, f612, f1376; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f1359, f612, f612; +mul.f32 f1360, f613, f613; +sub.f32 f623, f1359, f1360; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f1357, f623, f556; +mul.f32 f1358, f625, f1375; +sub.f32 f628, f1357, f1358; +mul.f32 f629, f623, f1375; 
+fma.rn.f32 f630, f625, f556, f629; +mul.f32 f1355, f612, f623; +mul.f32 f1356, f613, f625; +sub.f32 f633, f1355, f1356; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f1353, f633, f560; +mul.f32 f1354, f635, f1374; +sub.f32 f638, f1353, f1354; +mul.f32 f639, f633, f1374; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f1352, f612, f633; +sub.f32 f643, f1352, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f1373; +mul.f32 f1351, f643, f564; +sub.f32 f648, f1351, f647; +mul.f32 f649, f643, f1373; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f1350, f612, f643; +sub.f32 f653, f1350, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f1372; +mul.f32 f1349, f653, f568; +sub.f32 f658, f1349, f657; +mul.f32 f659, f653, f1372; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f1348, f612, f653; +sub.f32 f663, f1348, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f1346, f663, f572; +mul.f32 f1347, f665, f1371; +sub.f32 f668, f1346, f1347; +mul.f32 f669, f663, f1371; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f1344, f612, f663; +mul.f32 f1345, f613, f665; +sub.f32 f673, f1344, f1345; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f1342, f673, f576; +mul.f32 f1343, f675, f1370; +sub.f32 f678, f1342, f1343; +mul.f32 f679, f673, f1370; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f1340, f612, f673; +mul.f32 f1341, f613, f675; +sub.f32 f683, f1340, f1341; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f1369; +mul.f32 f1339, f683, f580; +sub.f32 f688, f1339, f687; +mul.f32 f689, f683, f1369; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f1338, f612, f683; +sub.f32 f693, f1338, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f1368; +mul.f32 
f1337, f693, f584; +sub.f32 f698, f1337, f697; +mul.f32 f699, f693, f1368; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f1336, f612, f693; +sub.f32 f703, f1336, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f1367; +mul.f32 f1335, f703, f588; +sub.f32 f708, f1335, f707; +mul.f32 f709, f703, f1367; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f1333, f612, f703; +mul.f32 f1334, f613, f705; +sub.f32 f713, f1333, f1334; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f1331, f713, f592; +mul.f32 f1332, f715, f1366; +sub.f32 f718, f1331, f1332; +mul.f32 f719, f713, f1366; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f1329, f612, f713; +mul.f32 f1330, f613, f715; +sub.f32 f723, f1329, f1330; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f1327, f723, f596; +mul.f32 f1328, f725, f1365; +sub.f32 f728, f1327, f1328; +mul.f32 f729, f723, f1365; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f1326, f612, f723; +sub.f32 f733, f1326, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f1364; +mul.f32 f1325, f733, f600; +sub.f32 f738, f1325, f737; +mul.f32 f739, f733, f1364; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f1324, f612, f733; +sub.f32 f743, f1324, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f1363; +mul.f32 f1323, f743, f604; +sub.f32 f748, f1323, f747; +mul.f32 f749, f743, f1363; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f1322, f612, f743; +sub.f32 f753, f1322, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f1362; +mul.f32 f1321, f753, f608; +sub.f32 f758, f1321, f757; +mul.f32 f759, f753, f1362; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f1319, f612, f753; +mul.f32 f1320, f613, f755; +sub.f32 f763, f1319, f1320; +mul.f32 f764, f612, f755; 
+fma.rn.f32 f765, f613, f753, f764; +mul.f32 f1317, f763, f550; +mul.f32 f1318, f765, f551; +sub.f32 f768, f1317, f1318; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f1315, f612, f763; +mul.f32 f1316, f613, f765; +sub.f32 f773, f1315, f1316; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f1314, f773, f554; +sub.f32 f778, f1314, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f1313, f612, f773; +sub.f32 f783, f1313, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f1312, f783, f558; +sub.f32 f788, f1312, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f1311, f612, f783; +sub.f32 f793, f1311, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f1310, f793, f562; +sub.f32 f798, f1310, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f1309, f612, f793; +sub.f32 f803, f1309, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f1307, f803, f566; +mul.f32 f1308, f805, f567; +sub.f32 f808, f1307, f1308; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f1305, f612, f803; +mul.f32 f1306, f613, f805; +sub.f32 f813, f1305, f1306; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f1303, f813, f570; +mul.f32 f1304, f815, f571; +sub.f32 f818, f1303, f1304; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f1301, f612, f813; +mul.f32 f1302, f613, f815; +sub.f32 f823, f1301, f1302; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f1300, f823, f574; +sub.f32 f828, f1300, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f1299, f612, 
f823; +sub.f32 f833, f1299, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f1298, f833, f578; +sub.f32 f838, f1298, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f1297, f612, f833; +sub.f32 f843, f1297, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f1296, f843, f582; +sub.f32 f848, f1296, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f1294, f612, f843; +mul.f32 f1295, f613, f845; +sub.f32 f853, f1294, f1295; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f1292, f853, f586; +mul.f32 f1293, f855, f587; +sub.f32 f858, f1292, f1293; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f1290, f612, f853; +mul.f32 f1291, f613, f855; +sub.f32 f863, f1290, f1291; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f1288, f863, f590; +mul.f32 f1289, f865, f591; +sub.f32 f868, f1288, f1289; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f1287, f612, f863; +sub.f32 f873, f1287, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f1286, f873, f594; +sub.f32 f878, f1286, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f1285, f612, f873; +sub.f32 f883, f1285, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f1284, f883, f598; +sub.f32 f888, f1284, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f1283, f612, f883; +sub.f32 f893, f1283, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f1281, f893, f602; +mul.f32 f1282, f895, f603; +sub.f32 f898, f1281, f1282; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, 
f899; +mul.f32 f1279, f612, f893; +mul.f32 f1280, f613, f895; +sub.f32 f903, f1279, f1280; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f1277, f903, f606; +mul.f32 f1278, f905, f607; +sub.f32 f908, f1277, f1278; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f1275, f612, f903; +mul.f32 f1276, f613, f905; +sub.f32 f913, f1275, f1276; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f1274, f913, f610; +sub.f32 f918, f1274, f917; +mov.u32 r17, %tid.x; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +and.b32 r14, r17, 7; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 896; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+32]; +ld.shared.f32 f923, [r13+64]; +ld.shared.f32 f924, [r13+96]; +ld.shared.f32 f925, [r13+128]; +ld.shared.f32 f926, [r13+160]; +ld.shared.f32 f927, [r13+192]; +ld.shared.f32 f928, [r13+224]; +ld.shared.f32 f929, [r13+256]; +ld.shared.f32 f930, [r13+288]; +ld.shared.f32 f931, [r13+320]; +ld.shared.f32 f932, [r13+352]; +ld.shared.f32 f933, [r13+384]; +ld.shared.f32 f934, [r13+416]; +ld.shared.f32 f935, [r13+448]; +ld.shared.f32 f936, [r13+480]; +ld.shared.f32 f937, [r13+512]; +ld.shared.f32 f938, [r13+544]; +ld.shared.f32 f939, [r13+576]; +ld.shared.f32 f940, [r13+608]; +ld.shared.f32 f941, [r13+640]; +ld.shared.f32 f942, [r13+672]; +ld.shared.f32 f943, [r13+704]; +ld.shared.f32 
f944, [r13+736]; +ld.shared.f32 f945, [r13+768]; +ld.shared.f32 f946, [r13+800]; +ld.shared.f32 f947, [r13+832]; +ld.shared.f32 f948, [r13+864]; +ld.shared.f32 f949, [r13+896]; +ld.shared.f32 f950, [r13+928]; +ld.shared.f32 f951, [r13+960]; +ld.shared.f32 f952, [r13+992]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1377, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+32]; +ld.shared.f32 f955, [r13+64]; +ld.shared.f32 f956, [r13+96]; +ld.shared.f32 f957, [r13+128]; +ld.shared.f32 f958, [r13+160]; +ld.shared.f32 f959, [r13+192]; +ld.shared.f32 f960, [r13+224]; +ld.shared.f32 f961, [r13+256]; +ld.shared.f32 f962, [r13+288]; +ld.shared.f32 f963, [r13+320]; +ld.shared.f32 f964, [r13+352]; +ld.shared.f32 f965, [r13+384]; +ld.shared.f32 f966, [r13+416]; +ld.shared.f32 f967, [r13+448]; +ld.shared.f32 f968, [r13+480]; +ld.shared.f32 f969, [r13+512]; +ld.shared.f32 f970, [r13+544]; +ld.shared.f32 f971, [r13+576]; +ld.shared.f32 f972, [r13+608]; +ld.shared.f32 f973, [r13+640]; +ld.shared.f32 f974, [r13+672]; +ld.shared.f32 f975, [r13+704]; +ld.shared.f32 f976, [r13+736]; +ld.shared.f32 f977, [r13+768]; +ld.shared.f32 f978, [r13+800]; +ld.shared.f32 f979, [r13+832]; +ld.shared.f32 f980, [r13+864]; +ld.shared.f32 f981, [r13+896]; +ld.shared.f32 f982, [r13+928]; +ld.shared.f32 f983, [r13+960]; +ld.shared.f32 f984, [r13+992]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1273, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1272, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; 
+sub.f32 f995, f985, f989; +add.f32 f1271, f1273, f1272; +sub.f32 f996, f1273, f1272; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f1270, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1269, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f1268, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1267, f1269, f1268; +sub.f32 f1012, f1269, f1268; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f1266, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f1266, 0fBF3504F3; +mul.f32 f1265, f1013, 0f3F3504F3; +sub.f32 f1019, f1265, f1018; +mul.f32 f1020, f1266, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f922, f938; +sub.f32 f1028, f922, f938; +add.f32 f1264, f954, f970; +sub.f32 f1029, f954, f970; +add.f32 f1030, f930, f946; +sub.f32 f1032, f930, f946; +add.f32 f1263, f962, f978; +sub.f32 f1033, f962, f978; +add.f32 f1034, f1026, f1030; +sub.f32 f1036, f1026, f1030; +add.f32 f1262, f1264, f1263; +sub.f32 f1037, f1264, f1263; +add.f32 f1038, f1028, f1033; +sub.f32 f1040, f1028, f1033; +sub.f32 f1261, f1029, f1032; +add.f32 f1041, f1029, f1032; +add.f32 f1042, f926, f942; +sub.f32 f1044, f926, f942; +add.f32 f1260, f958, f974; +sub.f32 f1045, f958, f974; +add.f32 f1046, f934, f950; +sub.f32 f1048, f934, f950; +add.f32 f1259, f966, f982; +sub.f32 f1049, f966, f982; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1258, f1260, f1259; +sub.f32 f1053, f1260, f1259; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f1257, f1045, f1048; +add.f32 f1057, f1045, f1048; +mul.f32 f1059, f1257, 0fBF3504F3; +mul.f32 f1256, f1054, 0f3F3504F3; +sub.f32 f1060, f1256, f1059; 
+mul.f32 f1061, f1257, 0f3F3504F3; +fma.rn.f32 f1062, f1054, 0fBF3504F3, f1061; +mul.f32 f1063, f1056, 0fBF3504F3; +mul.f32 f1064, f1057, 0fBF3504F3; +sub.f32 f1065, f1063, f1064; +add.f32 f1066, f1063, f1064; +add.f32 f1067, f923, f939; +sub.f32 f1069, f923, f939; +add.f32 f1255, f955, f971; +sub.f32 f1070, f955, f971; +add.f32 f1071, f931, f947; +sub.f32 f1073, f931, f947; +add.f32 f1254, f963, f979; +sub.f32 f1074, f963, f979; +add.f32 f1075, f1067, f1071; +sub.f32 f1077, f1067, f1071; +add.f32 f1253, f1255, f1254; +sub.f32 f1078, f1255, f1254; +add.f32 f1079, f1069, f1074; +sub.f32 f1081, f1069, f1074; +sub.f32 f1252, f1070, f1073; +add.f32 f1082, f1070, f1073; +add.f32 f1083, f927, f943; +sub.f32 f1085, f927, f943; +add.f32 f1251, f959, f975; +sub.f32 f1086, f959, f975; +add.f32 f1087, f935, f951; +sub.f32 f1089, f935, f951; +add.f32 f1250, f967, f983; +sub.f32 f1090, f967, f983; +add.f32 f1091, f1083, f1087; +sub.f32 f1093, f1083, f1087; +add.f32 f1249, f1251, f1250; +sub.f32 f1094, f1251, f1250; +add.f32 f1095, f1085, f1090; +sub.f32 f1097, f1085, f1090; +sub.f32 f1248, f1086, f1089; +add.f32 f1098, f1086, f1089; +mul.f32 f1246, f1095, 0f3F3504F3; +mul.f32 f1247, f1248, 0fBF3504F3; +sub.f32 f1101, f1246, f1247; +mul.f32 f1102, f1248, 0f3F3504F3; +fma.rn.f32 f1103, f1095, 0fBF3504F3, f1102; +mul.f32 f1104, f1097, 0fBF3504F3; +mul.f32 f1105, f1098, 0fBF3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +add.f32 f1108, f924, f940; +sub.f32 f1110, f924, f940; +add.f32 f1245, f956, f972; +sub.f32 f1111, f956, f972; +add.f32 f1112, f932, f948; +sub.f32 f1114, f932, f948; +add.f32 f1244, f964, f980; +sub.f32 f1115, f964, f980; +add.f32 f1116, f1108, f1112; +sub.f32 f1118, f1108, f1112; +add.f32 f1243, f1245, f1244; +sub.f32 f1119, f1245, f1244; +add.f32 f1120, f1110, f1115; +sub.f32 f1122, f1110, f1115; +sub.f32 f1242, f1111, f1114; +add.f32 f1123, f1111, f1114; +add.f32 f1124, f928, f944; +sub.f32 f1126, f928, f944; +add.f32 f1241, f960, f976; 
+sub.f32 f1127, f960, f976; +add.f32 f1128, f936, f952; +sub.f32 f1130, f936, f952; +add.f32 f1240, f968, f984; +sub.f32 f1131, f968, f984; +add.f32 f1132, f1124, f1128; +sub.f32 f1134, f1124, f1128; +add.f32 f1239, f1241, f1240; +sub.f32 f1135, f1241, f1240; +add.f32 f1136, f1126, f1131; +sub.f32 f1138, f1126, f1131; +sub.f32 f1238, f1127, f1130; +add.f32 f1139, f1127, f1130; +mul.f32 f1141, f1238, 0fBF3504F3; +mul.f32 f1237, f1136, 0f3F3504F3; +sub.f32 f1142, f1237, f1141; +mul.f32 f1143, f1238, 0f3F3504F3; +fma.rn.f32 f1144, f1136, 0fBF3504F3, f1143; +mul.f32 f1145, f1138, 0fBF3504F3; +mul.f32 f1146, f1139, 0fBF3504F3; +sub.f32 f1147, f1145, f1146; +add.f32 f1148, f1145, f1146; +add.f32 %0, f993, f1009; +add.f32 %1, f1271, f1267; +add.f32 %2, f1034, f1050; +add.f32 %3, f1262, f1258; +add.f32 %4, f1075, f1091; +add.f32 %5, f1253, f1249; +add.f32 %7, f1243, f1239; +add.f32 %6, f1116, f1132; +add.f32 %8, f997, f1019; +add.f32 %9, f1270, f1021; +add.f32 %10, f1038, f1060; +add.f32 %11, f1261, f1062; +add.f32 %13, f1252, f1103; +add.f32 %12, f1079, f1101; +add.f32 %15, f1242, f1144; +add.f32 %14, f1120, f1142; +sub.f32 %17, f996, f1011; +add.f32 %16, f995, f1012; +add.f32 %18, f1036, f1053; +sub.f32 %19, f1037, f1052; +add.f32 %20, f1077, f1094; +sub.f32 %21, f1078, f1093; +add.f32 %22, f1118, f1135; +sub.f32 %23, f1119, f1134; +add.f32 %24, f999, f1024; +add.f32 %25, f1000, f1025; +add.f32 %27, f1041, f1066; +add.f32 %26, f1040, f1065; +add.f32 %29, f1082, f1107; +add.f32 %28, f1081, f1106; +add.f32 %30, f1122, f1147; +add.f32 %31, f1123, f1148; +sub.f32 %32, f993, f1009; +sub.f32 %33, f1271, f1267; +sub.f32 %34, f1034, f1050; +sub.f32 %35, f1262, f1258; +sub.f32 %36, f1075, f1091; +sub.f32 %37, f1253, f1249; +sub.f32 %38, f1116, f1132; +sub.f32 %39, f1243, f1239; +sub.f32 %41, f1270, f1021; +sub.f32 %40, f997, f1019; +sub.f32 %43, f1261, f1062; +sub.f32 %42, f1038, f1060; +sub.f32 %45, f1252, f1103; +sub.f32 %44, f1079, f1101; +sub.f32 %47, f1242, f1144; +sub.f32 
%46, f1120, f1142; +add.f32 %49, f996, f1011; +sub.f32 %48, f995, f1012; +add.f32 %51, f1037, f1052; +sub.f32 %50, f1036, f1053; +add.f32 %53, f1078, f1093; +sub.f32 %52, f1077, f1094; +add.f32 %55, f1119, f1134; +sub.f32 %54, f1118, f1135; +sub.f32 %57, f1000, f1025; +sub.f32 %56, f999, f1024; +sub.f32 %59, f1041, f1066; +sub.f32 %58, f1040, f1065; +sub.f32 %61, f1082, f1107; +sub.f32 %60, f1081, f1106; +sub.f32 %63, f1123, f1148; +sub.f32 %62, f1122, f1147; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), 
"f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<69, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<160>; +.reg .b32 r<56>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %12, %14; +sub.f32 f10, %13, %15; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 2032; +add.s32 r11, r8, r10; +add.f32 f18, %13, %15; +add.f32 f19, %12, %14; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 1016; +sub.s32 r13, 
r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+1024]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2016; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 1008; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+1024]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1984; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 992; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+1024]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 120; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1920; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, 
f83}; +barrier.sync 0; +and.b32 r33, r9, 960; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+1024]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 3; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1792; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 896; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+1024]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f116, f114; +mul.f32 f121, f117, f115; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1536; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f117, f114, f122; +sub.f32 f126, f120, f121; +st.shared.v2.f32 [r46+256], {f126, f125}; +barrier.sync 0; +and.b32 r47, r9, 768; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+1024]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f137, f135; +mul.f32 f142, f138, f136; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 1024; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; 
+add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f138, f135, f143; +sub.f32 f147, f141, f142; +st.shared.v2.f32 [r53+512], {f147, f146}; +barrier.sync 0; +and.b32 r54, r9, 512; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+1024]; +add.f32 %1, f149, f153; +add.f32 %0, f148, f152; +sub.f32 %3, f149, f153; +sub.f32 %2, f148, f152; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<70, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<132>; +.reg .b32 r<56>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %12, %14; +add.f32 f10, %13, %15; +sub.f32 f11, %12, %14; +sub.f32 f12, %13, %15; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 1016; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 508; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+512]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+512]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; 
+ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 1008; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 504; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+512]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+512]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 992; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 496; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+512]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+512]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 120; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 960; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 480; +sub.s32 r34, r32, r33; 
+ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+512]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+512]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 3; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 896; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 448; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+512]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+512]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f98, f96; +mul.f32 f103, f99, f97; +sub.f32 f104, f102, f103; +mul.f32 f105, f98, f97; +fma.rn.f32 f106, f99, f96, f105; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 768; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f104; +barrier.sync 0; +and.b32 r47, r11, 384; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+512]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+512]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, 
%11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f115, f113; +mul.f32 f120, f116, f114; +sub.f32 f121, f119, f120; +mul.f32 f122, f115, f114; +fma.rn.f32 f123, f116, f113, f122; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 512; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f121; +barrier.sync 0; +and.b32 r54, r11, 256; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+512]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+512]; +add.f32 %0, f124, f125; +add.f32 %1, f126, f127; +sub.f32 %2, f124, f125; +sub.f32 %3, f126, f127; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..f4db50e48b4bd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp32_inv.hpp.inc @@ -0,0 +1,5015 @@ +#ifndef CUFFTDX_FFT_256_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_256_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<263, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<596>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; 
+sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; 
+sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, 
f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; 
+mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 
f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 960; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+64]; +ld.shared.f32 f391, [r13+128]; +ld.shared.f32 f392, [r13+192]; +ld.shared.f32 f393, [r13+256]; +ld.shared.f32 f394, [r13+320]; +ld.shared.f32 f395, [r13+384]; +ld.shared.f32 f396, [r13+448]; +ld.shared.f32 f397, [r13+512]; +ld.shared.f32 f398, [r13+576]; +ld.shared.f32 f399, [r13+640]; +ld.shared.f32 f400, [r13+704]; +ld.shared.f32 f401, [r13+768]; +ld.shared.f32 f402, [r13+832]; +ld.shared.f32 f403, 
[r13+896]; +ld.shared.f32 f404, [r13+960]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+64]; +ld.shared.f32 f407, [r13+128]; +ld.shared.f32 f408, [r13+192]; +ld.shared.f32 f409, [r13+256]; +ld.shared.f32 f410, [r13+320]; +ld.shared.f32 f411, [r13+384]; +ld.shared.f32 f412, [r13+448]; +ld.shared.f32 f413, [r13+512]; +ld.shared.f32 f414, [r13+576]; +ld.shared.f32 f415, [r13+640]; +ld.shared.f32 f416, [r13+704]; +ld.shared.f32 f417, [r13+768]; +ld.shared.f32 f418, [r13+832]; +ld.shared.f32 f419, [r13+896]; +ld.shared.f32 f420, [r13+960]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, 
f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 
0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 %0, f462, f519; +add.f32 %1, f463, f520; +add.f32 %3, f467, f539; +add.f32 %2, f466, f537; +add.f32 %5, f471, f543; +add.f32 %4, f470, f542; +add.f32 %7, f475, f548; +add.f32 %6, f474, f546; +add.f32 %9, f465, f521; +sub.f32 %8, f464, f522; +add.f32 %11, f469, f553; +add.f32 %10, f468, f551; +add.f32 %13, f473, f558; +add.f32 %12, f472, f556; +add.f32 %15, f477, f563; +add.f32 %14, f476, f561; +sub.f32 %16, f462, f519; +sub.f32 %17, f463, f520; +sub.f32 %19, f467, f539; +sub.f32 %18, f466, f537; +sub.f32 %21, f471, f543; +sub.f32 %20, f470, f542; +sub.f32 %23, f475, f548; +sub.f32 %22, f474, f546; +sub.f32 %25, f465, f521; +add.f32 %24, f464, f522; +sub.f32 %27, f469, f553; +sub.f32 %26, f468, f551; +sub.f32 %29, f473, f558; +sub.f32 %28, f472, f556; +sub.f32 %31, f477, f563; +sub.f32 %30, f476, f561; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), 
"=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<264, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<381>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, 
f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 
f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 1984; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+256]; +ld.shared.v2.f32 {f167, f168}, [r13+512]; +ld.shared.v2.f32 {f171, f172}, [r13+768]; +ld.shared.v2.f32 {f175, f176}, [r13+1024]; +ld.shared.v2.f32 {f179, f180}, [r13+1280]; +ld.shared.v2.f32 {f183, f184}, [r13+1536]; +ld.shared.v2.f32 {f187, f188}, [r13+1792]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, 
f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 24; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; 
+mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 1536; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+256]; +ld.shared.v2.f32 {f325, f326}, [r19+512]; +ld.shared.v2.f32 {f329, f330}, [r19+768]; +ld.shared.v2.f32 {f333, f334}, [r19+1024]; +ld.shared.v2.f32 {f337, f338}, [r19+1280]; +ld.shared.v2.f32 {f341, f342}, [r19+1536]; +ld.shared.v2.f32 {f345, f346}, [r19+1792]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 
f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f321, f337; +add.f32 f358, f322, f338; +sub.f32 f359, f321, f337; +sub.f32 f360, f322, f338; +add.f32 f361, f329, f345; +add.f32 f362, f330, f346; +sub.f32 f363, f329, f345; +sub.f32 f364, f330, f346; +add.f32 %1, f350, f354; +add.f32 %0, f349, f353; +add.f32 %3, f358, f362; +add.f32 %2, f357, f361; +add.f32 %5, f352, f355; +sub.f32 %4, f351, f356; +add.f32 %7, f360, f363; +sub.f32 %6, f359, f364; +sub.f32 %9, f350, f354; +sub.f32 %8, f349, f353; +sub.f32 %11, f358, f362; +sub.f32 %10, f357, f361; +sub.f32 %13, f352, f355; +add.f32 %12, f351, f356; +sub.f32 %15, f360, f363; +add.f32 %14, f359, f364; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<265, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<349>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 
f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 
f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 992; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+128]; +ld.shared.f32 f161, [r13+256]; +ld.shared.f32 f162, [r13+384]; +ld.shared.f32 f163, [r13+512]; +ld.shared.f32 f164, [r13+640]; +ld.shared.f32 f165, [r13+768]; +ld.shared.f32 f166, [r13+896]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+128]; +ld.shared.f32 f169, [r13+256]; +ld.shared.f32 f170, [r13+384]; +ld.shared.f32 f171, [r13+512]; +ld.shared.f32 f172, [r13+640]; 
+ld.shared.f32 f173, [r13+768]; +ld.shared.f32 f174, [r13+896]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 24; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; 
+mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 768; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; 
+st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+128]; +ld.shared.f32 f303, [r20+256]; +ld.shared.f32 f304, [r20+384]; +ld.shared.f32 f305, [r20+512]; +ld.shared.f32 f306, [r20+640]; +ld.shared.f32 f307, [r20+768]; +ld.shared.f32 f308, [r20+896]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+128]; +ld.shared.f32 f311, [r20+256]; +ld.shared.f32 f312, [r20+384]; +ld.shared.f32 f313, [r20+512]; +ld.shared.f32 f314, [r20+640]; +ld.shared.f32 f315, [r20+768]; +ld.shared.f32 f316, [r20+896]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f302, f306; +add.f32 f326, f310, f314; +sub.f32 f327, f302, f306; +sub.f32 f328, f310, f314; +add.f32 f329, f304, f308; +add.f32 f330, f312, f316; +sub.f32 f331, f304, f308; +sub.f32 f332, f312, f316; +add.f32 %0, f317, f321; +add.f32 %1, f318, f322; +add.f32 %2, f325, f329; +add.f32 %3, f326, f330; +add.f32 %5, f320, f323; +sub.f32 %4, f319, f324; +add.f32 %7, f328, f331; +sub.f32 %6, f327, f332; +sub.f32 %8, f317, f321; +sub.f32 %9, f318, f322; +sub.f32 %10, f325, f329; +sub.f32 %11, f326, f330; +sub.f32 %13, f320, f323; +add.f32 %12, f319, f324; +sub.f32 %15, f328, f331; +add.f32 %14, f327, f332; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), 
"l"(lut_sp_8_256), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<266, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<763>; +.reg .b32 r<20>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f757, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f755, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f754, f757, f755; +sub.f32 f76, f757, f755; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f753, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f750, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f748, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f747, f750, f748; +sub.f32 f92, f750, f748; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f746, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f746, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f744, f95, 0fBF3504F3; +mul.f32 f745, f96, 0f3F3504F3; +sub.f32 f103, f744, f745; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f743, f754, f747; +sub.f32 f109, f754, f747; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f742, f753, f100; +sub.f32 f113, f753, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f741, f76, 
f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f740, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f738, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f735, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f734, f738, f735; +sub.f32 f133, f738, f735; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f733, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f731, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f729, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f728, f731, f729; +sub.f32 f149, f731, f729; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f727, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f727, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f725, f152, 0fBF3504F3; +mul.f32 f726, f153, 0f3F3504F3; +sub.f32 f160, f725, f726; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f724, f734, f728; +sub.f32 f166, f734, f728; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f723, f733, f157; +sub.f32 f170, f733, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f722, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f721, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f719, f167, 0f3F6C835E; +mul.f32 f720, f723, 0f3EC3EF15; +sub.f32 f181, f719, f720; +mul.f32 f182, f723, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f722, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f721, 
0f3F6C835E; +mul.f32 f718, f175, 0f3EC3EF15; +sub.f32 f190, f718, f189; +mul.f32 f191, f721, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f717, f169, 0fBEC3EF15; +sub.f32 f195, f717, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f715, f173, 0fBF3504F3; +mul.f32 f716, f174, 0f3F3504F3; +sub.f32 f200, f715, f716; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f713, f177, 0fBF6C835E; +mul.f32 f714, f178, 0f3EC3EF15; +sub.f32 f205, f713, f714; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f712, f742, f183; +sub.f32 f213, f742, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f711, f741, f187; +sub.f32 f217, f741, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f710, f740, f192; +sub.f32 f221, f740, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f709, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f708, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f707, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f706, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f712, f239; +mul.f32 f244, f238, f712; +mul.f32 f246, f239, f239; +mul.f32 f705, f238, f238; +sub.f32 f247, f705, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f711, f249; +mul.f32 f252, f247, f711; +mul.f32 f703, f238, f247; +mul.f32 f704, f239, f249; +sub.f32 f255, f703, 
f704; +mul.f32 f702, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f710, f257; +mul.f32 f260, f255, f710; +mul.f32 f262, f239, f257; +mul.f32 f701, f238, f255; +sub.f32 f263, f701, f262; +mul.f32 f700, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f709, f265; +mul.f32 f268, f263, f709; +mul.f32 f270, f239, f265; +mul.f32 f699, f238, f263; +sub.f32 f271, f699, f270; +mul.f32 f698, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f708, f273; +mul.f32 f276, f271, f708; +mul.f32 f696, f238, f271; +mul.f32 f697, f239, f273; +sub.f32 f279, f696, f697; +mul.f32 f695, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f707, f281; +mul.f32 f284, f279, f707; +mul.f32 f286, f239, f281; +mul.f32 f694, f238, f279; +sub.f32 f287, f694, f286; +mul.f32 f693, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f706, f289; +mul.f32 f292, f287, f706; +mul.f32 f294, f239, f289; +mul.f32 f692, f238, f287; +sub.f32 f295, f692, f294; +mul.f32 f691, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f690, f743, f724; +mul.f32 f298, f690, f297; +mul.f32 f300, f295, f690; +mul.f32 f688, f238, f295; +mul.f32 f689, f239, f297; +sub.f32 f303, f688, f689; +sub.f32 f687, f106, f163; +mul.f32 f686, f687, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f685, f238, f303; +sub.f32 f311, f685, f310; +mul.f32 f684, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f682, f238, f311; +mul.f32 f683, f239, f313; +sub.f32 f319, f682, f683; +mul.f32 f681, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, 
f221; +mul.f32 f326, f239, f321; +mul.f32 f680, f238, f319; +sub.f32 f327, f680, f326; +mul.f32 f679, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f678, f238, f327; +sub.f32 f335, f678, f334; +mul.f32 f677, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f675, f238, f335; +mul.f32 f676, f239, f337; +sub.f32 f343, f675, f676; +mul.f32 f674, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f673, f238, f343; +sub.f32 f351, f673, f350; +mul.f32 f672, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f671, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 1920; +add.s32 r12, r9, r11; +sub.f32 f762, f743, f724; +mul.f32 f761, f295, f762; +add.f32 f357, f743, f724; +sub.f32 f760, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 15; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f671; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f702; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f700; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f698; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f695; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f693; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f691; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f760, f298; +sub.f32 f374, f761, f686; +fma.rn.f32 f375, f303, f212, 
f306; +sub.f32 f376, f308, f684; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f681; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f679; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f677; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f674; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f672; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +mad.lo.s32 r13, r18, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+128]; +ld.shared.v2.f32 {f397, f398}, [r13+256]; +ld.shared.v2.f32 {f401, f402}, [r13+384]; +ld.shared.v2.f32 {f405, f406}, [r13+512]; +ld.shared.v2.f32 {f409, f410}, [r13+640]; +ld.shared.v2.f32 {f413, f414}, [r13+768]; +ld.shared.v2.f32 {f417, f418}, [r13+896]; +ld.shared.v2.f32 {f421, f422}, [r13+1024]; +ld.shared.v2.f32 {f425, f426}, [r13+1152]; +ld.shared.v2.f32 {f429, f430}, [r13+1280]; +ld.shared.v2.f32 {f433, f434}, [r13+1408]; +ld.shared.v2.f32 {f437, f438}, [r13+1536]; +ld.shared.v2.f32 {f441, f442}, [r13+1664]; +ld.shared.v2.f32 {f445, f446}, [r13+1792]; +ld.shared.v2.f32 {f449, f450}, [r13+1920]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f670, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f669, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f668, f670, f669; +sub.f32 f464, f670, f669; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f667, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f666, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f665, f414, f446; +sub.f32 
f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f664, f666, f665; +sub.f32 f480, f666, f665; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f663, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f663, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f662, f483, 0fBF3504F3; +sub.f32 f491, f662, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f661, f668, f664; +sub.f32 f497, f668, f664; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f660, f667, f488; +sub.f32 f501, f667, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f659, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f658, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f657, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f656, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f655, f657, f656; +sub.f32 f521, f657, f656; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f654, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f653, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f652, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f651, f653, f652; +sub.f32 f537, f653, f652; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f650, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f650, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f649, f540, 
0fBF3504F3; +sub.f32 f548, f649, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f648, f655, f651; +sub.f32 f554, f655, f651; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f647, f654, f545; +sub.f32 f558, f654, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f646, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f645, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f647, 0f3EC3EF15; +mul.f32 f644, f555, 0f3F6C835E; +sub.f32 f569, f644, f568; +mul.f32 f570, f647, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f646, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f645, 0f3F6C835E; +mul.f32 f643, f563, 0f3EC3EF15; +sub.f32 f578, f643, f577; +mul.f32 f579, f645, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f642, f557, 0fBEC3EF15; +sub.f32 f583, f642, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f641, f561, 0fBF3504F3; +sub.f32 f588, f641, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f640, f565, 0fBF6C835E; +sub.f32 f593, f640, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 %1, f661, f648; +add.f32 %0, f494, f551; +add.f32 %2, f498, f569; +add.f32 %3, f660, f571; +add.f32 %4, f502, f574; +add.f32 %5, f659, f575; +add.f32 %6, f506, f578; +add.f32 %7, f658, f580; +add.f32 %9, f497, f553; +sub.f32 %8, f496, f554; +add.f32 %11, f501, f585; +add.f32 %10, f500, f583; +add.f32 %13, f505, f590; +add.f32 %12, f504, f588; +add.f32 %14, f508, f593; +add.f32 %15, f509, f595; +sub.f32 %17, f661, f648; +sub.f32 %16, f494, f551; +sub.f32 %19, f660, f571; +sub.f32 
%18, f498, f569; +sub.f32 %21, f659, f575; +sub.f32 %20, f502, f574; +sub.f32 %23, f658, f580; +sub.f32 %22, f506, f578; +sub.f32 %25, f497, f553; +add.f32 %24, f496, f554; +sub.f32 %27, f501, f585; +sub.f32 %26, f500, f583; +sub.f32 %29, f505, f590; +sub.f32 %28, f504, f588; +sub.f32 %31, f509, f595; +sub.f32 %30, f508, f593; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<267, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1552>; +.reg .b32 r<20>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1546, %67, %130; 
+sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1544, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1543, f1546, f1544; +sub.f32 f140, f1546, f1544; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1542, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1539, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1537, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1536, f1539, f1537; +sub.f32 f156, f1539, f1537; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1535, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1535, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1533, f159, 0fBF3504F3; +mul.f32 f1534, f160, 0f3F3504F3; +sub.f32 f167, f1533, f1534; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1532, f1543, f1536; +sub.f32 f173, f1543, f1536; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1531, f1542, f164; +sub.f32 f177, f1542, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1530, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1529, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1527, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1524, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1523, f1527, f1524; +sub.f32 f197, f1527, f1524; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1522, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 
f204, %78, %110; +add.f32 f1520, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1518, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1517, f1520, f1518; +sub.f32 f213, f1520, f1518; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1516, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1516, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1514, f216, 0fBF3504F3; +mul.f32 f1515, f217, 0f3F3504F3; +sub.f32 f224, f1514, f1515; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1513, f1523, f1517; +sub.f32 f230, f1523, f1517; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1512, f1522, f221; +sub.f32 f234, f1522, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1511, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1510, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1508, f231, 0f3F6C835E; +mul.f32 f1509, f1512, 0f3EC3EF15; +sub.f32 f245, f1508, f1509; +mul.f32 f246, f1512, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1511, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1510, 0f3F6C835E; +mul.f32 f1507, f239, 0f3EC3EF15; +sub.f32 f254, f1507, f253; +mul.f32 f255, f1510, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1506, f233, 0fBEC3EF15; +sub.f32 f259, f1506, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1504, f237, 0fBF3504F3; +mul.f32 f1505, f238, 0f3F3504F3; +sub.f32 f264, f1504, f1505; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1502, f241, 0fBF6C835E; +mul.f32 f1503, f242, 
0f3EC3EF15; +sub.f32 f269, f1502, f1503; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1501, f1532, f1513; +sub.f32 f275, f1532, f1513; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1500, f1531, f247; +sub.f32 f279, f1531, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1499, f1530, f251; +sub.f32 f283, f1530, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1498, f1529, f256; +sub.f32 f287, f1529, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1497, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1496, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1495, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1494, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1491, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1489, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1488, f1491, f1489; +sub.f32 f315, f1491, f1489; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1487, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1485, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1482, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1481, f1485, f1482; +sub.f32 f331, f1485, f1482; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1480, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1480, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 
f341, f335, 0f3F3504F3; +mul.f32 f1479, f334, 0fBF3504F3; +sub.f32 f342, f1479, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1478, f1488, f1481; +sub.f32 f348, f1488, f1481; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1477, f1487, f339; +sub.f32 f352, f1487, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1476, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1475, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1473, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1471, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1470, f1473, f1471; +sub.f32 f372, f1473, f1471; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1469, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1466, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1465, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1464, f1466, f1465; +sub.f32 f388, f1466, f1465; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1463, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1463, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1461, f391, 0fBF3504F3; +mul.f32 f1462, f392, 0f3F3504F3; +sub.f32 f399, f1461, f1462; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1460, f1470, f1464; +sub.f32 f405, f1470, f1464; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1459, f1469, f396; +sub.f32 f409, f1469, f396; +sub.f32 
f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1458, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1457, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1459, 0f3EC3EF15; +mul.f32 f1456, f406, 0f3F6C835E; +sub.f32 f420, f1456, f419; +mul.f32 f421, f1459, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1458, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1457, 0f3F6C835E; +mul.f32 f1455, f414, 0f3EC3EF15; +sub.f32 f429, f1455, f428; +mul.f32 f430, f1457, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1454, f408, 0fBEC3EF15; +sub.f32 f434, f1454, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1453, f412, 0fBF3504F3; +sub.f32 f439, f1453, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1452, f416, 0fBF6C835E; +sub.f32 f444, f1452, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1451, f1478, f1460; +sub.f32 f450, f1478, f1460; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1450, f1477, f422; +sub.f32 f454, f1477, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1449, f1476, f426; +sub.f32 f458, f1476, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1448, f1475, f431; +sub.f32 f462, f1475, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1447, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1446, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1445, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, 
f359, f444; +add.f32 f1444, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1450, 0f3E47C5C2; +mul.f32 f1443, f451, 0f3F7B14BE; +sub.f32 f481, f1443, f480; +mul.f32 f482, f1450, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1449, 0f3EC3EF15; +mul.f32 f1442, f455, 0f3F6C835E; +sub.f32 f486, f1442, f485; +mul.f32 f487, f1449, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1448, 0f3F0E39DA; +mul.f32 f1441, f459, 0f3F54DB31; +sub.f32 f491, f1441, f490; +mul.f32 f492, f1448, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1447, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1446, 0f3F54DB31; +mul.f32 f1440, f467, 0f3F0E39DA; +sub.f32 f500, f1440, f499; +mul.f32 f501, f1446, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1445, 0f3F6C835E; +mul.f32 f1439, f471, 0f3EC3EF15; +sub.f32 f505, f1439, f504; +mul.f32 f506, f1445, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1444, 0f3F7B14BE; +mul.f32 f1438, f475, 0f3E47C5C2; +sub.f32 f510, f1438, f509; +mul.f32 f511, f1444, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1437, f453, 0fBE47C5C2; +sub.f32 f515, f1437, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1435, f457, 0fBEC3EF15; +mul.f32 f1436, f458, 0f3F6C835E; +sub.f32 f520, f1435, f1436; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1433, f461, 0fBF0E39DA; +mul.f32 f1434, f462, 0f3F54DB31; +sub.f32 f525, f1433, f1434; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1431, f465, 0fBF3504F3; +mul.f32 f1432, f466, 0f3F3504F3; +sub.f32 f530, f1431, f1432; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1429, f469, 0fBF54DB31; +mul.f32 f1430, f470, 0f3F0E39DA; +sub.f32 f535, f1429, 
f1430; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1428, f473, 0fBF6C835E; +sub.f32 f540, f1428, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1427, f477, 0fBF7B14BE; +sub.f32 f545, f1427, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1426, f1500, f483; +sub.f32 f553, f1500, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1425, f1499, f488; +sub.f32 f557, f1499, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1424, f1498, f493; +sub.f32 f561, f1498, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1423, f1497, f497; +sub.f32 f565, f1497, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f1422, f1496, f502; +sub.f32 f569, f1496, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f1421, f1495, f507; +sub.f32 f573, f1495, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f1420, f1494, f512; +sub.f32 f577, f1494, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f1419, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f1418, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f1417, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f1416, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f1415, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1414, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1413, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 
f608, f302, f545; +add.f32 f1412, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +and.b32 r14, r15, 7; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f1426, f611; +mul.f32 f616, f610, f1426; +mul.f32 f618, f611, f611; +mul.f32 f1411, f610, f610; +sub.f32 f619, f1411, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f1425, f621; +mul.f32 f624, f619, f1425; +mul.f32 f626, f611, f621; +mul.f32 f1410, f610, f619; +sub.f32 f627, f1410, f626; +mul.f32 f1409, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f1424, f629; +mul.f32 f632, f627, f1424; +mul.f32 f1407, f610, f627; +mul.f32 f1408, f611, f629; +sub.f32 f635, f1407, f1408; +mul.f32 f1406, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f1423, f637; +mul.f32 f640, f635, f1423; +mul.f32 f642, f611, f637; +mul.f32 f1405, f610, f635; +sub.f32 f643, f1405, f642; +mul.f32 f1404, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f1422, f645; +mul.f32 f648, f643, f1422; +mul.f32 f1402, f610, f643; +mul.f32 f1403, f611, f645; +sub.f32 f651, f1402, f1403; +mul.f32 f1401, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f1421, f653; +mul.f32 f656, f651, f1421; +mul.f32 f658, f611, f653; +mul.f32 f1400, f610, f651; +sub.f32 f659, f1400, f658; +mul.f32 f1399, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f1420, f661; +mul.f32 f664, f659, f1420; +mul.f32 f666, f611, f661; +mul.f32 f1398, f610, f659; +sub.f32 f667, f1398, f666; +mul.f32 f1397, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f1419, f669; +mul.f32 f672, f667, f1419; +mul.f32 f1395, f610, 
f667; +mul.f32 f1396, f611, f669; +sub.f32 f675, f1395, f1396; +mul.f32 f1394, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f1418, f677; +mul.f32 f680, f675, f1418; +mul.f32 f682, f611, f677; +mul.f32 f1393, f610, f675; +sub.f32 f683, f1393, f682; +mul.f32 f1392, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f1417, f685; +mul.f32 f688, f683, f1417; +mul.f32 f690, f611, f685; +mul.f32 f1391, f610, f683; +sub.f32 f691, f1391, f690; +mul.f32 f1390, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f1416, f693; +mul.f32 f696, f691, f1416; +mul.f32 f1388, f610, f691; +mul.f32 f1389, f611, f693; +sub.f32 f699, f1388, f1389; +mul.f32 f1387, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f1415, f701; +mul.f32 f704, f699, f1415; +mul.f32 f706, f611, f701; +mul.f32 f1386, f610, f699; +sub.f32 f707, f1386, f706; +mul.f32 f1385, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f1414, f709; +mul.f32 f712, f707, f1414; +mul.f32 f1383, f610, f707; +mul.f32 f1384, f611, f709; +sub.f32 f715, f1383, f1384; +mul.f32 f1382, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f1413, f717; +mul.f32 f720, f715, f1413; +mul.f32 f722, f611, f717; +mul.f32 f1381, f610, f715; +sub.f32 f723, f1381, f722; +mul.f32 f1380, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f1412, f725; +mul.f32 f728, f723, f1412; +mul.f32 f730, f611, f725; +mul.f32 f1379, f610, f723; +sub.f32 f731, f1379, f730; +mul.f32 f1378, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1377, f1501, f1451; +mul.f32 f734, f1377, f733; +mul.f32 f736, f731, f1377; +mul.f32 f1375, f610, f731; +mul.f32 f1376, f611, f733; +sub.f32 f739, f1375, f1376; +sub.f32 f1374, f272, f447; +mul.f32 f1373, f1374, f733; 
+mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1372, f610, f739; +sub.f32 f747, f1372, f746; +mul.f32 f1371, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1370, f610, f747; +sub.f32 f755, f1370, f754; +mul.f32 f1369, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f1367, f610, f755; +mul.f32 f1368, f611, f757; +sub.f32 f763, f1367, f1368; +mul.f32 f1366, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1365, f610, f763; +sub.f32 f771, f1365, f770; +mul.f32 f1364, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f1362, f610, f771; +mul.f32 f1363, f611, f773; +sub.f32 f779, f1362, f1363; +mul.f32 f1361, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1360, f610, f779; +sub.f32 f787, f1360, f786; +mul.f32 f1359, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1358, f610, f787; +sub.f32 f795, f1358, f794; +mul.f32 f1357, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f1355, f610, f795; +mul.f32 f1356, f611, f797; +sub.f32 f803, f1355, f1356; +mul.f32 f1354, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1353, f610, 
f803; +sub.f32 f811, f1353, f810; +mul.f32 f1352, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1351, f610, f811; +sub.f32 f819, f1351, f818; +mul.f32 f1350, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f1348, f610, f819; +mul.f32 f1349, f611, f821; +sub.f32 f827, f1348, f1349; +mul.f32 f1347, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1346, f610, f827; +sub.f32 f835, f1346, f834; +mul.f32 f1345, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f1343, f610, f835; +mul.f32 f1344, f611, f837; +sub.f32 f843, f1343, f1344; +mul.f32 f1342, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1341, f610, f843; +sub.f32 f851, f1341, f850; +mul.f32 f1340, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f1339, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 1792; +add.s32 r12, r9, r11; +sub.f32 f1551, f1501, f1451; +mul.f32 f1550, f731, f1551; +add.f32 f857, f1501, f1451; +sub.f32 f1549, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 7; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 7; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f1339; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f1409; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f1406; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, 
f638; +sub.f32 f866, f640, f1404; +sub.f32 f867, f648, f1401; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f1399; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f1397; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f1394; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f1392; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f1390; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f1387; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f1385; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f1382; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f1380; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f1378; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f1549, f734; +sub.f32 f890, f1550, f1373; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f1371; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f1369; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f1366; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f1364; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f1361; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f1359; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f1357; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f1354; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f1352; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, 
f811, f588, f814; +sub.f32 f910, f816, f1350; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f1347; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f1345; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f1342; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f1340; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +mad.lo.s32 r13, r18, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+64]; +ld.shared.v2.f32 {f929, f930}, [r13+128]; +ld.shared.v2.f32 {f933, f934}, [r13+192]; +ld.shared.v2.f32 {f937, f938}, [r13+256]; +ld.shared.v2.f32 {f941, f942}, [r13+320]; +ld.shared.v2.f32 {f945, f946}, [r13+384]; +ld.shared.v2.f32 {f949, f950}, [r13+448]; +ld.shared.v2.f32 {f953, f954}, [r13+512]; +ld.shared.v2.f32 {f957, f958}, [r13+576]; +ld.shared.v2.f32 {f961, f962}, [r13+640]; +ld.shared.v2.f32 {f965, f966}, [r13+704]; +ld.shared.v2.f32 {f969, f970}, [r13+768]; +ld.shared.v2.f32 {f973, f974}, [r13+832]; +ld.shared.v2.f32 {f977, f978}, [r13+896]; +ld.shared.v2.f32 {f981, f982}, [r13+960]; +ld.shared.v2.f32 {f985, f986}, [r13+1024]; +ld.shared.v2.f32 {f989, f990}, [r13+1088]; +ld.shared.v2.f32 {f993, f994}, [r13+1152]; +ld.shared.v2.f32 {f997, f998}, [r13+1216]; +ld.shared.v2.f32 {f1001, f1002}, [r13+1280]; +ld.shared.v2.f32 {f1005, f1006}, [r13+1344]; +ld.shared.v2.f32 {f1009, f1010}, [r13+1408]; +ld.shared.v2.f32 {f1013, f1014}, [r13+1472]; +ld.shared.v2.f32 {f1017, f1018}, [r13+1536]; +ld.shared.v2.f32 {f1021, f1022}, [r13+1600]; +ld.shared.v2.f32 {f1025, f1026}, [r13+1664]; +ld.shared.v2.f32 {f1029, f1030}, [r13+1728]; +ld.shared.v2.f32 {f1033, f1034}, [r13+1792]; +ld.shared.v2.f32 {f1037, f1038}, [r13+1856]; +ld.shared.v2.f32 {f1041, f1042}, [r13+1920]; +ld.shared.v2.f32 {f1045, f1046}, [r13+1984]; +add.f32 
f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1338, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1337, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1336, f1338, f1337; +sub.f32 f1060, f1338, f1337; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f1335, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f1334, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1333, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1332, f1334, f1333; +sub.f32 f1076, f1334, f1333; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f1331, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f1331, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f1329, f1079, 0fBF3504F3; +mul.f32 f1330, f1080, 0f3F3504F3; +sub.f32 f1087, f1329, f1330; +mul.f32 f1088, f1080, 0fBF3504F3; +fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f925, f989; +sub.f32 f1092, f925, f989; +add.f32 f1328, f926, f990; +sub.f32 f1093, f926, f990; +add.f32 f1094, f957, f1021; +sub.f32 f1096, f957, f1021; +add.f32 f1327, f958, f1022; +sub.f32 f1097, f958, f1022; +add.f32 f1098, f1090, f1094; +sub.f32 f1100, f1090, f1094; +add.f32 f1326, f1328, f1327; +sub.f32 f1101, f1328, f1327; +sub.f32 f1102, f1092, f1097; +add.f32 f1104, f1092, f1097; +add.f32 f1325, f1093, f1096; +sub.f32 f1105, f1093, f1096; +add.f32 f1106, f941, f1005; +sub.f32 f1108, f941, f1005; +add.f32 f1324, f942, f1006; +sub.f32 f1109, f942, f1006; +add.f32 f1110, f973, f1037; +sub.f32 f1112, f973, f1037; +add.f32 f1323, f974, f1038; +sub.f32 f1113, f974, f1038; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, 
f1110; +add.f32 f1322, f1324, f1323; +sub.f32 f1117, f1324, f1323; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f1321, f1109, f1112; +sub.f32 f1121, f1109, f1112; +mul.f32 f1122, f1118, 0f3F3504F3; +mul.f32 f1123, f1321, 0f3F3504F3; +sub.f32 f1124, f1122, f1123; +add.f32 f1125, f1122, f1123; +mul.f32 f1127, f1121, 0f3F3504F3; +mul.f32 f1320, f1120, 0fBF3504F3; +sub.f32 f1128, f1320, f1127; +mul.f32 f1129, f1121, 0fBF3504F3; +fma.rn.f32 f1130, f1120, 0f3F3504F3, f1129; +add.f32 f1131, f929, f993; +sub.f32 f1133, f929, f993; +add.f32 f1319, f930, f994; +sub.f32 f1134, f930, f994; +add.f32 f1135, f961, f1025; +sub.f32 f1137, f961, f1025; +add.f32 f1318, f962, f1026; +sub.f32 f1138, f962, f1026; +add.f32 f1139, f1131, f1135; +sub.f32 f1141, f1131, f1135; +add.f32 f1317, f1319, f1318; +sub.f32 f1142, f1319, f1318; +sub.f32 f1143, f1133, f1138; +add.f32 f1145, f1133, f1138; +add.f32 f1316, f1134, f1137; +sub.f32 f1146, f1134, f1137; +add.f32 f1147, f945, f1009; +sub.f32 f1149, f945, f1009; +add.f32 f1315, f946, f1010; +sub.f32 f1150, f946, f1010; +add.f32 f1151, f977, f1041; +sub.f32 f1153, f977, f1041; +add.f32 f1314, f978, f1042; +sub.f32 f1154, f978, f1042; +add.f32 f1155, f1147, f1151; +sub.f32 f1157, f1147, f1151; +add.f32 f1313, f1315, f1314; +sub.f32 f1158, f1315, f1314; +sub.f32 f1159, f1149, f1154; +add.f32 f1161, f1149, f1154; +add.f32 f1312, f1150, f1153; +sub.f32 f1162, f1150, f1153; +mul.f32 f1163, f1159, 0f3F3504F3; +mul.f32 f1164, f1312, 0f3F3504F3; +sub.f32 f1165, f1163, f1164; +add.f32 f1166, f1163, f1164; +mul.f32 f1168, f1162, 0f3F3504F3; +mul.f32 f1311, f1161, 0fBF3504F3; +sub.f32 f1169, f1311, f1168; +mul.f32 f1170, f1162, 0fBF3504F3; +fma.rn.f32 f1171, f1161, 0f3F3504F3, f1170; +add.f32 f1172, f933, f997; +sub.f32 f1174, f933, f997; +add.f32 f1310, f934, f998; +sub.f32 f1175, f934, f998; +add.f32 f1176, f965, f1029; +sub.f32 f1178, f965, f1029; +add.f32 f1309, f966, f1030; +sub.f32 f1179, f966, f1030; +add.f32 f1180, f1172, 
f1176; +sub.f32 f1182, f1172, f1176; +add.f32 f1308, f1310, f1309; +sub.f32 f1183, f1310, f1309; +sub.f32 f1184, f1174, f1179; +add.f32 f1186, f1174, f1179; +add.f32 f1307, f1175, f1178; +sub.f32 f1187, f1175, f1178; +add.f32 f1188, f949, f1013; +sub.f32 f1190, f949, f1013; +add.f32 f1306, f950, f1014; +sub.f32 f1191, f950, f1014; +add.f32 f1192, f981, f1045; +sub.f32 f1194, f981, f1045; +add.f32 f1305, f982, f1046; +sub.f32 f1195, f982, f1046; +add.f32 f1196, f1188, f1192; +sub.f32 f1198, f1188, f1192; +add.f32 f1304, f1306, f1305; +sub.f32 f1199, f1306, f1305; +sub.f32 f1200, f1190, f1195; +add.f32 f1202, f1190, f1195; +add.f32 f1303, f1191, f1194; +sub.f32 f1203, f1191, f1194; +mul.f32 f1204, f1200, 0f3F3504F3; +mul.f32 f1205, f1303, 0f3F3504F3; +sub.f32 f1206, f1204, f1205; +add.f32 f1207, f1204, f1205; +mul.f32 f1301, f1202, 0fBF3504F3; +mul.f32 f1302, f1203, 0f3F3504F3; +sub.f32 f1210, f1301, f1302; +mul.f32 f1211, f1203, 0fBF3504F3; +fma.rn.f32 f1212, f1202, 0f3F3504F3, f1211; +add.f32 %1, f1336, f1332; +add.f32 %0, f1057, f1073; +add.f32 %3, f1326, f1322; +add.f32 %2, f1098, f1114; +add.f32 %5, f1317, f1313; +add.f32 %4, f1139, f1155; +add.f32 %6, f1180, f1196; +add.f32 %7, f1308, f1304; +add.f32 %8, f1061, f1083; +add.f32 %9, f1335, f1084; +add.f32 %10, f1102, f1124; +add.f32 %11, f1325, f1125; +add.f32 %13, f1316, f1166; +add.f32 %12, f1143, f1165; +add.f32 %15, f1307, f1207; +add.f32 %14, f1184, f1206; +add.f32 %17, f1060, f1075; +sub.f32 %16, f1059, f1076; +sub.f32 %18, f1100, f1117; +add.f32 %19, f1101, f1116; +sub.f32 %20, f1141, f1158; +add.f32 %21, f1142, f1157; +sub.f32 %22, f1182, f1199; +add.f32 %23, f1183, f1198; +add.f32 %24, f1063, f1087; +add.f32 %25, f1064, f1089; +add.f32 %27, f1105, f1130; +add.f32 %26, f1104, f1128; +add.f32 %29, f1146, f1171; +add.f32 %28, f1145, f1169; +add.f32 %30, f1186, f1210; +add.f32 %31, f1187, f1212; +sub.f32 %33, f1336, f1332; +sub.f32 %32, f1057, f1073; +sub.f32 %35, f1326, f1322; +sub.f32 %34, f1098, f1114; 
+sub.f32 %37, f1317, f1313; +sub.f32 %36, f1139, f1155; +sub.f32 %39, f1308, f1304; +sub.f32 %38, f1180, f1196; +sub.f32 %41, f1335, f1084; +sub.f32 %40, f1061, f1083; +sub.f32 %43, f1325, f1125; +sub.f32 %42, f1102, f1124; +sub.f32 %45, f1316, f1166; +sub.f32 %44, f1143, f1165; +sub.f32 %47, f1307, f1207; +sub.f32 %46, f1184, f1206; +sub.f32 %49, f1060, f1075; +add.f32 %48, f1059, f1076; +sub.f32 %51, f1101, f1116; +add.f32 %50, f1100, f1117; +sub.f32 %53, f1142, f1157; +add.f32 %52, f1141, f1158; +sub.f32 %55, f1183, f1198; +add.f32 %54, f1182, f1199; +sub.f32 %57, f1064, f1089; +sub.f32 %56, f1063, f1087; +sub.f32 %59, f1105, f1130; +sub.f32 %58, f1104, f1128; +sub.f32 %61, f1146, f1171; +sub.f32 %60, f1145, f1169; +sub.f32 %63, f1187, f1212; +sub.f32 %62, f1186, f1210; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), 
"f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + +/* cufftdx_private_function<268, float, 1>: explicit specialization whose whole body is one asm volatile PTX block (appears to be machine-generated; do not hand-edit the register numbering). It declares 216 f32, 28 b32 and 12 b64 scratch registers. Data flow: reads the four complex values rmem[0..3] as inputs and overwrites all eight .x/.y components as outputs. smem is passed as a 32-bit 'r' operand and used as the base for st.shared/ld.shared addresses; the base is advanced by tid.y << 11 (2048 bytes per %tid.y value) plus an offset derived from %tid.x. Shape of the computation: three passes each perform a 4-point add/sub butterfly, load one 8-byte (two-float) entry from lut_sp_4_256, lut_sp_4_64 or lut_sp_4_16 (index taken from bits of %tid.x), form the square and cube of that entry in registers, combine three of the four butterfly results with the entry, its square and its cube via mul/fma/sub sequences, and then exchange values through shared memory; a final butterfly produces the outputs. The block executes barrier.sync 0 six times, so presumably every thread that participates in barrier 0 must reach this call - TODO confirm with the caller. NOTE(review): the LUT names suggest radix-4 stages of a 256-point single-precision FFT; transform size and direction are not visible in this block - confirm against the generator. Input operands %15 and %18 (the repeated rmem[1].y / rmem[2].y entries in the operand list) are never referenced by the PTX text. */ +template<> __forceinline__ __device__ void cufftdx_private_function<268, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<216>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17; +add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29,
f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -2048; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 2016; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+512]; +ld.shared.v2.f32 {f70, f71}, [r13+1024]; +ld.shared.v2.f32 {f74, f75}, [r13+1536]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32
f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 1920; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+512]; +ld.shared.v2.f32 {f131, f132}, [r20+1024]; +ld.shared.v2.f32 {f135, f136}, [r20+1536]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 48; +bfe.u32 r22, r5, 4, 2; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164;
+fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 1536; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180, f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+512]; +ld.shared.v2.f32 {f192, f193}, [r27+1024]; +ld.shared.v2.f32 {f196, f197}, [r27+1536]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +add.f32 %1, f201, f205; +add.f32 %0, f200, f204; +add.f32 %3, f203, f206; +sub.f32 %2, f202, f207; +sub.f32 %5, f201, f205; +sub.f32 %4, f200, f204; +sub.f32 %7, f203, f206; +add.f32 %6, f202, f207; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + +/* cufftdx_private_function<269, float, 1>: explicit specialization with the same operand list (outputs rmem[0..3].x/.y, then smem, lut_sp_4_256, lut_sp_4_64, lut_sp_4_16 and the rmem inputs) and the same three-pass butterfly / LUT-entry / square / cube structure as the <268> specialization, again as a single asm volatile PTX block (192 f32, 29 b32 and 12 b64 scratch registers). The visible differences are in the shared-memory exchange: the per-%tid.y offset is tid.y << 10 (1024 bytes rather than 2048), and each exchange is done in two halves - the first store/load round moves one float per value (the .x-derived results), then after another barrier the same shared addresses are reused for the second float (the .y-derived results). Because of that split the block executes barrier.sync 0 twelve times instead of six; presumably every thread that participates in barrier 0 must reach this call - TODO confirm with the caller. NOTE(review): this looks like a lower-shared-memory variant of the same transform as <268>; how the generator picks between the two ids is not visible here - confirm before relying on it. Input operands %15 and %18 are never referenced by the PTX text. */ +template<> __forceinline__ __device__ void cufftdx_private_function<269, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<192>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %12, %17;
+add.f32 f18, %13, %19; +sub.f32 f19, %12, %17; +sub.f32 f20, %13, %19; +add.f32 f21, %14, %20; +add.f32 f22, %16, %21; +sub.f32 f23, %14, %20; +sub.f32 f24, %16, %21; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1008; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+256]; +ld.shared.f32 f64, [r13+512]; +ld.shared.f32 f65, [r13+768]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+256]; +ld.shared.f32 f68, [r13+512]; +ld.shared.f32 f69, [r13+768]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71,
f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 960; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+256]; +ld.shared.f32 f117, [r21+512]; +ld.shared.f32 f118, [r21+768]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+256]; +ld.shared.f32 f121, [r21+512]; +ld.shared.f32 f122, [r21+768]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126,
f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 48; +bfe.u32 r23, r5, 4, 2; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; +fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 768; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], f154; +st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+256]; +ld.shared.f32 f170, [r28+512]; +ld.shared.f32 f171, [r28+768]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+256]; +ld.shared.f32 f174, [r28+512]; +ld.shared.f32 f175, [r28+768]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 %0, f176, f180; +add.f32 %1, f177, f181; +add.f32 %3, f179, f182; +sub.f32 %2, f178, f183; +sub.f32 %4, f176, f180; +sub.f32 %5, f177, f181;
+sub.f32 %7, f179, f182; +add.f32 %6, f178, f183; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<270, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1456>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1454, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1452, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1451, f1454, f1452; +sub.f32 f140, f1454, f1452; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1450, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1447, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1445, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1444, f1447, f1445; +sub.f32 f156, f1447, f1445; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1443, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1443, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1441, f159, 0fBF3504F3; +mul.f32 f1442, f160, 0f3F3504F3; +sub.f32 f167, f1441, f1442; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1440, f1451, f1444; +sub.f32 f173, f1451, f1444;
+add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1439, f1450, f164; +sub.f32 f177, f1450, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1438, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1437, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1435, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1432, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1431, f1435, f1432; +sub.f32 f197, f1435, f1432; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1430, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1428, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1426, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1425, f1428, f1426; +sub.f32 f213, f1428, f1426; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1424, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1424, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1422, f216, 0fBF3504F3; +mul.f32 f1423, f217, 0f3F3504F3; +sub.f32 f224, f1422, f1423; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1421, f1431, f1425; +sub.f32 f230, f1431, f1425; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1420, f1430, f221; +sub.f32 f234, f1430, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1419, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1418, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1416, f231, 0f3F6C835E; +mul.f32 f1417, f1420, 
0f3EC3EF15; +sub.f32 f245, f1416, f1417; +mul.f32 f246, f1420, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1419, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1418, 0f3F6C835E; +mul.f32 f1415, f239, 0f3EC3EF15; +sub.f32 f254, f1415, f253; +mul.f32 f255, f1418, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1414, f233, 0fBEC3EF15; +sub.f32 f259, f1414, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1412, f237, 0fBF3504F3; +mul.f32 f1413, f238, 0f3F3504F3; +sub.f32 f264, f1412, f1413; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1410, f241, 0fBF6C835E; +mul.f32 f1411, f242, 0f3EC3EF15; +sub.f32 f269, f1410, f1411; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1409, f1440, f1421; +sub.f32 f275, f1440, f1421; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1408, f1439, f247; +sub.f32 f279, f1439, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1407, f1438, f251; +sub.f32 f283, f1438, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1406, f1437, f256; +sub.f32 f287, f1437, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1405, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1404, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1403, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1402, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1399, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1397, %85, %142; 
+sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1396, f1399, f1397; +sub.f32 f315, f1399, f1397; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1395, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1393, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1390, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1389, f1393, f1390; +sub.f32 f331, f1393, f1390; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1388, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1388, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1387, f334, 0fBF3504F3; +sub.f32 f342, f1387, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1386, f1396, f1389; +sub.f32 f348, f1396, f1389; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1385, f1395, f339; +sub.f32 f352, f1395, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1384, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1383, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1381, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1379, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1378, f1381, f1379; +sub.f32 f372, f1381, f1379; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1377, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1374, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, 
%96, %128; +add.f32 f1373, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1372, f1374, f1373; +sub.f32 f388, f1374, f1373; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1371, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1371, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1369, f391, 0fBF3504F3; +mul.f32 f1370, f392, 0f3F3504F3; +sub.f32 f399, f1369, f1370; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1368, f1378, f1372; +sub.f32 f405, f1378, f1372; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1367, f1377, f396; +sub.f32 f409, f1377, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1366, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1365, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1367, 0f3EC3EF15; +mul.f32 f1364, f406, 0f3F6C835E; +sub.f32 f420, f1364, f419; +mul.f32 f421, f1367, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1366, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1365, 0f3F6C835E; +mul.f32 f1363, f414, 0f3EC3EF15; +sub.f32 f429, f1363, f428; +mul.f32 f430, f1365, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1362, f408, 0fBEC3EF15; +sub.f32 f434, f1362, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1361, f412, 0fBF3504F3; +sub.f32 f439, f1361, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1360, f416, 0fBF6C835E; +sub.f32 f444, f1360, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; 
+add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1359, f1386, f1368; +sub.f32 f450, f1386, f1368; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1358, f1385, f422; +sub.f32 f454, f1385, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1357, f1384, f426; +sub.f32 f458, f1384, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1356, f1383, f431; +sub.f32 f462, f1383, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1355, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1354, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1353, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1352, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1358, 0f3E47C5C2; +mul.f32 f1351, f451, 0f3F7B14BE; +sub.f32 f481, f1351, f480; +mul.f32 f482, f1358, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1357, 0f3EC3EF15; +mul.f32 f1350, f455, 0f3F6C835E; +sub.f32 f486, f1350, f485; +mul.f32 f487, f1357, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1356, 0f3F0E39DA; +mul.f32 f1349, f459, 0f3F54DB31; +sub.f32 f491, f1349, f490; +mul.f32 f492, f1356, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1355, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1354, 0f3F54DB31; +mul.f32 f1348, f467, 0f3F0E39DA; +sub.f32 f500, f1348, f499; +mul.f32 f501, f1354, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1353, 0f3F6C835E; +mul.f32 f1347, f471, 0f3EC3EF15; +sub.f32 f505, f1347, f504; +mul.f32 f506, f1353, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1352, 0f3F7B14BE; +mul.f32 f1346, f475, 0f3E47C5C2; +sub.f32 f510, f1346, f509; +mul.f32 f511, f1352, 0f3E47C5C2; 
+fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1345, f453, 0fBE47C5C2; +sub.f32 f515, f1345, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1343, f457, 0fBEC3EF15; +mul.f32 f1344, f458, 0f3F6C835E; +sub.f32 f520, f1343, f1344; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1341, f461, 0fBF0E39DA; +mul.f32 f1342, f462, 0f3F54DB31; +sub.f32 f525, f1341, f1342; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1339, f465, 0fBF3504F3; +mul.f32 f1340, f466, 0f3F3504F3; +sub.f32 f530, f1339, f1340; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1337, f469, 0fBF54DB31; +mul.f32 f1338, f470, 0f3F0E39DA; +sub.f32 f535, f1337, f1338; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1336, f473, 0fBF6C835E; +sub.f32 f540, f1336, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1335, f477, 0fBF7B14BE; +sub.f32 f545, f1335, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1334, f1409, f1359; +sub.f32 f551, f1409, f1359; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1333, f1408, f483; +sub.f32 f555, f1408, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1332, f1407, f488; +sub.f32 f559, f1407, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1331, f1406, f493; +sub.f32 f563, f1406, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1330, f1405, f497; +sub.f32 f567, f1405, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f1329, f1404, f502; +sub.f32 f571, f1404, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f1328, f1403, f507; +sub.f32 
f575, f1403, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f1327, f1402, f512; +sub.f32 f579, f1402, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f1326, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f1325, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f1324, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f1323, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f1322, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1321, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1320, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1319, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f1333, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f1333; +sub.f32 f620, f619, f618; +mul.f32 f1317, f612, f612; +mul.f32 f1318, f613, f613; +sub.f32 f623, f1317, f1318; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f1332, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f1332; +sub.f32 f630, f629, f628; +mul.f32 f1315, f612, f623; +mul.f32 f1316, f613, f625; +sub.f32 f633, f1315, f1316; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f1331, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f1331; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f1314, f612, f633; +sub.f32 f643, f1314, f642; +mul.f32 
f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f1330, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f1330; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f1313, f612, f643; +sub.f32 f653, f1313, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f1329, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f1329; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f1312, f612, f653; +sub.f32 f663, f1312, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f1328, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f1328; +sub.f32 f670, f669, f668; +mul.f32 f1310, f612, f663; +mul.f32 f1311, f613, f665; +sub.f32 f673, f1310, f1311; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f1327, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f1327; +sub.f32 f680, f679, f678; +mul.f32 f1308, f612, f673; +mul.f32 f1309, f613, f675; +sub.f32 f683, f1308, f1309; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f1326, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f1326; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f1307, f612, f683; +sub.f32 f693, f1307, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f1325, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f1325; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f1306, f612, f693; +sub.f32 f703, f1306, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f1324, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f1324; +sub.f32 f710, f709, f708; +mul.f32 f1304, f612, f703; +mul.f32 f1305, 
f613, f705; +sub.f32 f713, f1304, f1305; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f1323, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f1323; +sub.f32 f720, f719, f718; +mul.f32 f1302, f612, f713; +mul.f32 f1303, f613, f715; +sub.f32 f723, f1302, f1303; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f1322, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f1322; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f1301, f612, f723; +sub.f32 f733, f1301, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f1321, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f1321; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f1300, f612, f733; +sub.f32 f743, f1300, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f1320, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f1320; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f1299, f612, f743; +sub.f32 f753, f1299, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f1319, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f1319; +sub.f32 f760, f759, f758; +mul.f32 f1297, f612, f753; +mul.f32 f1298, f613, f755; +sub.f32 f763, f1297, f1298; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f1295, f612, f763; +mul.f32 f1296, f613, f765; +sub.f32 f773, f1295, f1296; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, 
f778; +mul.f32 f782, f613, f775; +mul.f32 f1294, f612, f773; +sub.f32 f783, f1294, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f1293, f612, f783; +sub.f32 f793, f1293, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f1292, f612, f793; +sub.f32 f803, f1292, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f1290, f612, f803; +mul.f32 f1291, f613, f805; +sub.f32 f813, f1290, f1291; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f1288, f612, f813; +mul.f32 f1289, f613, f815; +sub.f32 f823, f1288, f1289; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f1287, f612, f823; +sub.f32 f833, f1287, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f1286, f612, f833; +sub.f32 f843, f1286, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 
f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f1284, f612, f843; +mul.f32 f1285, f613, f845; +sub.f32 f853, f1284, f1285; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f1282, f612, f853; +mul.f32 f1283, f613, f855; +sub.f32 f863, f1282, f1283; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f1281, f612, f863; +sub.f32 f873, f1281, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f1280, f612, f873; +sub.f32 f883, f1280, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f1279, f612, f883; +sub.f32 f893, f1279, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f1277, f612, f893; +mul.f32 f1278, f613, f895; +sub.f32 f903, f1277, f1278; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f1275, f612, f903; +mul.f32 f1276, f613, f905; +sub.f32 f913, f1275, f1276; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mov.u32 r17, %tid.x; +mul.f32 f916, f611, f915; +fma.rn.f32 
f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +and.b32 r14, r17, 7; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -1024; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 896; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+32]; +ld.shared.f32 f923, [r13+64]; +ld.shared.f32 f924, [r13+96]; +ld.shared.f32 f925, [r13+128]; +ld.shared.f32 f926, [r13+160]; +ld.shared.f32 f927, [r13+192]; +ld.shared.f32 f928, [r13+224]; +ld.shared.f32 f929, [r13+256]; +ld.shared.f32 f930, [r13+288]; +ld.shared.f32 f931, [r13+320]; +ld.shared.f32 f932, [r13+352]; +ld.shared.f32 f933, [r13+384]; +ld.shared.f32 f934, [r13+416]; +ld.shared.f32 f935, [r13+448]; +ld.shared.f32 f936, [r13+480]; +ld.shared.f32 f937, [r13+512]; +ld.shared.f32 f938, [r13+544]; +ld.shared.f32 f939, [r13+576]; +ld.shared.f32 f940, [r13+608]; +ld.shared.f32 f941, [r13+640]; +ld.shared.f32 f942, [r13+672]; +ld.shared.f32 f943, [r13+704]; +ld.shared.f32 f944, [r13+736]; +ld.shared.f32 f945, [r13+768]; +ld.shared.f32 f946, [r13+800]; +ld.shared.f32 f947, [r13+832]; +ld.shared.f32 f948, [r13+864]; +ld.shared.f32 f949, [r13+896]; +ld.shared.f32 f950, [r13+928]; +ld.shared.f32 f951, [r13+960]; +ld.shared.f32 f952, [r13+992]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1334, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; 
+st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+32]; +ld.shared.f32 f955, [r13+64]; +ld.shared.f32 f956, [r13+96]; +ld.shared.f32 f957, [r13+128]; +ld.shared.f32 f958, [r13+160]; +ld.shared.f32 f959, [r13+192]; +ld.shared.f32 f960, [r13+224]; +ld.shared.f32 f961, [r13+256]; +ld.shared.f32 f962, [r13+288]; +ld.shared.f32 f963, [r13+320]; +ld.shared.f32 f964, [r13+352]; +ld.shared.f32 f965, [r13+384]; +ld.shared.f32 f966, [r13+416]; +ld.shared.f32 f967, [r13+448]; +ld.shared.f32 f968, [r13+480]; +ld.shared.f32 f969, [r13+512]; +ld.shared.f32 f970, [r13+544]; +ld.shared.f32 f971, [r13+576]; +ld.shared.f32 f972, [r13+608]; +ld.shared.f32 f973, [r13+640]; +ld.shared.f32 f974, [r13+672]; +ld.shared.f32 f975, [r13+704]; +ld.shared.f32 f976, [r13+736]; +ld.shared.f32 f977, [r13+768]; +ld.shared.f32 f978, [r13+800]; +ld.shared.f32 f979, [r13+832]; +ld.shared.f32 f980, [r13+864]; +ld.shared.f32 f981, [r13+896]; +ld.shared.f32 f982, [r13+928]; +ld.shared.f32 f983, [r13+960]; +ld.shared.f32 f984, [r13+992]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1274, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1273, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f1272, f1274, f1273; +sub.f32 f996, f1274, f1273; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f1271, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1270, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f1269, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1268, f1270, 
f1269; +sub.f32 f1012, f1270, f1269; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f1267, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f1267, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f1265, f1015, 0fBF3504F3; +mul.f32 f1266, f1016, 0f3F3504F3; +sub.f32 f1023, f1265, f1266; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f922, f938; +sub.f32 f1028, f922, f938; +add.f32 f1264, f954, f970; +sub.f32 f1029, f954, f970; +add.f32 f1030, f930, f946; +sub.f32 f1032, f930, f946; +add.f32 f1263, f962, f978; +sub.f32 f1033, f962, f978; +add.f32 f1034, f1026, f1030; +sub.f32 f1036, f1026, f1030; +add.f32 f1262, f1264, f1263; +sub.f32 f1037, f1264, f1263; +sub.f32 f1038, f1028, f1033; +add.f32 f1040, f1028, f1033; +add.f32 f1261, f1029, f1032; +sub.f32 f1041, f1029, f1032; +add.f32 f1042, f926, f942; +sub.f32 f1044, f926, f942; +add.f32 f1260, f958, f974; +sub.f32 f1045, f958, f974; +add.f32 f1046, f934, f950; +sub.f32 f1048, f934, f950; +add.f32 f1259, f966, f982; +sub.f32 f1049, f966, f982; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1258, f1260, f1259; +sub.f32 f1053, f1260, f1259; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f1257, f1045, f1048; +sub.f32 f1057, f1045, f1048; +mul.f32 f1058, f1054, 0f3F3504F3; +mul.f32 f1059, f1257, 0f3F3504F3; +sub.f32 f1060, f1058, f1059; +add.f32 f1061, f1058, f1059; +mul.f32 f1063, f1057, 0f3F3504F3; +mul.f32 f1256, f1056, 0fBF3504F3; +sub.f32 f1064, f1256, f1063; +mul.f32 f1065, f1057, 0fBF3504F3; +fma.rn.f32 f1066, f1056, 0f3F3504F3, f1065; +add.f32 f1067, f923, f939; +sub.f32 f1069, f923, f939; +add.f32 f1255, f955, f971; +sub.f32 f1070, f955, f971; +add.f32 f1071, f931, f947; +sub.f32 f1073, f931, f947; +add.f32 f1254, f963, f979; +sub.f32 f1074, f963, f979; +add.f32 f1075, f1067, f1071; +sub.f32 f1077, f1067, f1071; +add.f32 
f1253, f1255, f1254; +sub.f32 f1078, f1255, f1254; +sub.f32 f1079, f1069, f1074; +add.f32 f1081, f1069, f1074; +add.f32 f1252, f1070, f1073; +sub.f32 f1082, f1070, f1073; +add.f32 f1083, f927, f943; +sub.f32 f1085, f927, f943; +add.f32 f1251, f959, f975; +sub.f32 f1086, f959, f975; +add.f32 f1087, f935, f951; +sub.f32 f1089, f935, f951; +add.f32 f1250, f967, f983; +sub.f32 f1090, f967, f983; +add.f32 f1091, f1083, f1087; +sub.f32 f1093, f1083, f1087; +add.f32 f1249, f1251, f1250; +sub.f32 f1094, f1251, f1250; +sub.f32 f1095, f1085, f1090; +add.f32 f1097, f1085, f1090; +add.f32 f1248, f1086, f1089; +sub.f32 f1098, f1086, f1089; +mul.f32 f1099, f1095, 0f3F3504F3; +mul.f32 f1100, f1248, 0f3F3504F3; +sub.f32 f1101, f1099, f1100; +add.f32 f1102, f1099, f1100; +mul.f32 f1104, f1098, 0f3F3504F3; +mul.f32 f1247, f1097, 0fBF3504F3; +sub.f32 f1105, f1247, f1104; +mul.f32 f1106, f1098, 0fBF3504F3; +fma.rn.f32 f1107, f1097, 0f3F3504F3, f1106; +add.f32 f1108, f924, f940; +sub.f32 f1110, f924, f940; +add.f32 f1246, f956, f972; +sub.f32 f1111, f956, f972; +add.f32 f1112, f932, f948; +sub.f32 f1114, f932, f948; +add.f32 f1245, f964, f980; +sub.f32 f1115, f964, f980; +add.f32 f1116, f1108, f1112; +sub.f32 f1118, f1108, f1112; +add.f32 f1244, f1246, f1245; +sub.f32 f1119, f1246, f1245; +sub.f32 f1120, f1110, f1115; +add.f32 f1122, f1110, f1115; +add.f32 f1243, f1111, f1114; +sub.f32 f1123, f1111, f1114; +add.f32 f1124, f928, f944; +sub.f32 f1126, f928, f944; +add.f32 f1242, f960, f976; +sub.f32 f1127, f960, f976; +add.f32 f1128, f936, f952; +sub.f32 f1130, f936, f952; +add.f32 f1241, f968, f984; +sub.f32 f1131, f968, f984; +add.f32 f1132, f1124, f1128; +sub.f32 f1134, f1124, f1128; +add.f32 f1240, f1242, f1241; +sub.f32 f1135, f1242, f1241; +sub.f32 f1136, f1126, f1131; +add.f32 f1138, f1126, f1131; +add.f32 f1239, f1127, f1130; +sub.f32 f1139, f1127, f1130; +mul.f32 f1140, f1136, 0f3F3504F3; +mul.f32 f1141, f1239, 0f3F3504F3; +sub.f32 f1142, f1140, f1141; +add.f32 f1143, f1140, 
f1141; +mul.f32 f1237, f1138, 0fBF3504F3; +mul.f32 f1238, f1139, 0f3F3504F3; +sub.f32 f1146, f1237, f1238; +mul.f32 f1147, f1139, 0fBF3504F3; +fma.rn.f32 f1148, f1138, 0f3F3504F3, f1147; +add.f32 %0, f993, f1009; +add.f32 %1, f1272, f1268; +add.f32 %2, f1034, f1050; +add.f32 %3, f1262, f1258; +add.f32 %4, f1075, f1091; +add.f32 %5, f1253, f1249; +add.f32 %7, f1244, f1240; +add.f32 %6, f1116, f1132; +add.f32 %8, f997, f1019; +add.f32 %9, f1271, f1020; +add.f32 %10, f1038, f1060; +add.f32 %11, f1261, f1061; +add.f32 %13, f1252, f1102; +add.f32 %12, f1079, f1101; +add.f32 %15, f1243, f1143; +add.f32 %14, f1120, f1142; +add.f32 %17, f996, f1011; +sub.f32 %16, f995, f1012; +sub.f32 %18, f1036, f1053; +add.f32 %19, f1037, f1052; +sub.f32 %20, f1077, f1094; +add.f32 %21, f1078, f1093; +sub.f32 %22, f1118, f1135; +add.f32 %23, f1119, f1134; +add.f32 %24, f999, f1023; +add.f32 %25, f1000, f1025; +add.f32 %27, f1041, f1066; +add.f32 %26, f1040, f1064; +add.f32 %29, f1082, f1107; +add.f32 %28, f1081, f1105; +add.f32 %30, f1122, f1146; +add.f32 %31, f1123, f1148; +sub.f32 %32, f993, f1009; +sub.f32 %33, f1272, f1268; +sub.f32 %34, f1034, f1050; +sub.f32 %35, f1262, f1258; +sub.f32 %36, f1075, f1091; +sub.f32 %37, f1253, f1249; +sub.f32 %38, f1116, f1132; +sub.f32 %39, f1244, f1240; +sub.f32 %41, f1271, f1020; +sub.f32 %40, f997, f1019; +sub.f32 %43, f1261, f1061; +sub.f32 %42, f1038, f1060; +sub.f32 %45, f1252, f1102; +sub.f32 %44, f1079, f1101; +sub.f32 %47, f1243, f1143; +sub.f32 %46, f1120, f1142; +sub.f32 %49, f996, f1011; +add.f32 %48, f995, f1012; +sub.f32 %51, f1037, f1052; +add.f32 %50, f1036, f1053; +sub.f32 %53, f1078, f1093; +add.f32 %52, f1077, f1094; +sub.f32 %55, f1119, f1134; +add.f32 %54, f1118, f1135; +sub.f32 %57, f1000, f1025; +sub.f32 %56, f999, f1023; +sub.f32 %59, f1041, f1066; +sub.f32 %58, f1040, f1064; +sub.f32 %61, f1082, f1107; +sub.f32 %60, f1081, f1105; +sub.f32 %63, f1123, f1148; +sub.f32 %62, f1122, f1146; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), 
"f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<271, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<160>; +.reg .b32 r<56>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %12, %14; +sub.f32 f10, %13, %15; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 2032; +add.s32 r11, r8, r10; +add.f32 f18, %13, %15; +add.f32 f19, %12, %14; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 1016; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+1024]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2016; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, 
f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 1008; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+1024]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1984; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 992; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+1024]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 120; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1920; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 960; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+1024]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 3; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1792; 
+add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 896; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+1024]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f115, f117; +mul.f32 f121, f114, f117; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1536; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f116, f114, f120; +sub.f32 f126, f122, f121; +st.shared.v2.f32 [r46+256], {f125, f126}; +barrier.sync 0; +and.b32 r47, r9, 768; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+1024]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f136, f138; +mul.f32 f142, f135, f138; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 1024; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f137, f135, f141; +sub.f32 f147, f143, f142; +st.shared.v2.f32 [r53+512], {f146, f147}; +barrier.sync 0; +and.b32 r54, r9, 512; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+1024]; +add.f32 %1, f149, f153; +add.f32 %0, f148, f152; +sub.f32 %3, f149, f153; +sub.f32 %2, f148, f152; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), 
"=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<272, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<132>; +.reg .b32 r<56>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %12, %14; +add.f32 f10, %13, %15; +sub.f32 f11, %12, %14; +sub.f32 f12, %13, %15; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 1016; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 508; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+512]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+512]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 1008; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 504; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+512]; +barrier.sync 0; 
+st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+512]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 992; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 496; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+512]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+512]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 120; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 960; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 480; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+512]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+512]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 3; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, 
f79, f85; +mul.f32 f87, f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 896; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 448; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+512]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+512]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f97, f99; +fma.rn.f32 f103, f98, f96, f102; +mul.f32 f104, f96, f99; +mul.f32 f105, f98, f97; +sub.f32 f106, f105, f104; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 768; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f103; +barrier.sync 0; +and.b32 r47, r11, 384; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+512]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+512]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f114, f116; +fma.rn.f32 f120, f115, f113, f119; +mul.f32 f121, f113, f116; +mul.f32 f122, f115, f114; +sub.f32 f123, f122, f121; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 512; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f120; +barrier.sync 0; +and.b32 r54, r11, 256; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; 
+ld.shared.f32 f125, [r55+512]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+512]; +add.f32 %0, f124, f125; +add.f32 %1, f126, f127; +sub.f32 %2, f124, f125; +sub.f32 %3, f126, f127; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..0e48a0afbb037 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_fwd.hpp.inc @@ -0,0 +1,2765 @@ +#ifndef CUFFTDX_FFT_256_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_256_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<460, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<347>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 
fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, 
fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+512]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1984; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+256]; +ld.shared.f64 fd160, [r13+512]; +ld.shared.f64 fd161, [r13+768]; +ld.shared.f64 fd162, [r13+1024]; +ld.shared.f64 fd163, [r13+1280]; +ld.shared.f64 fd164, [r13+1536]; +ld.shared.f64 fd165, [r13+1792]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; 
+ld.shared.f64 fd167, [r13+256]; +ld.shared.f64 fd168, [r13+512]; +ld.shared.f64 fd169, [r13+768]; +ld.shared.f64 fd170, [r13+1024]; +ld.shared.f64 fd171, [r13+1280]; +ld.shared.f64 fd172, [r13+1536]; +ld.shared.f64 fd173, [r13+1792]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, 
fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 24; +bfe.u32 r15, r5, 3, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+64]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; 
+mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1536; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+256]; +ld.shared.f64 fd301, [r21+512]; +ld.shared.f64 fd302, [r21+768]; +ld.shared.f64 fd303, [r21+1024]; +ld.shared.f64 fd304, [r21+1280]; +ld.shared.f64 fd305, [r21+1536]; +ld.shared.f64 fd306, [r21+1792]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+256]; +ld.shared.f64 fd309, [r21+512]; +ld.shared.f64 fd310, [r21+768]; +ld.shared.f64 fd311, [r21+1024]; +ld.shared.f64 fd312, [r21+1280]; +ld.shared.f64 fd313, [r21+1536]; +ld.shared.f64 fd314, [r21+1792]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd300, fd304; +add.f64 fd324, fd308, fd312; +sub.f64 fd325, fd300, fd304; +sub.f64 fd326, fd308, fd312; +add.f64 fd327, fd302, fd306; +add.f64 fd328, fd310, fd314; +sub.f64 fd329, fd302, fd306; +sub.f64 fd330, fd310, fd314; +add.f64 %0, fd315, fd319; +add.f64 %1, fd316, fd320; +add.f64 %2, fd323, fd327; +add.f64 %3, 
fd324, fd328; +sub.f64 %5, fd318, fd321; +add.f64 %4, fd317, fd322; +sub.f64 %7, fd326, fd329; +add.f64 %6, fd325, fd330; +sub.f64 %8, fd315, fd319; +sub.f64 %9, fd316, fd320; +sub.f64 %10, fd323, fd327; +sub.f64 %11, fd324, fd328; +add.f64 %13, fd318, fd321; +sub.f64 %12, fd317, fd322; +add.f64 %15, fd326, fd329; +sub.f64 %14, fd325, fd330; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<461, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<761>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd754, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd752, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd751, fd754, fd752; +sub.f64 fd76, fd754, fd752; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd750, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd747, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd745, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd744, 
fd747, fd745; +sub.f64 fd92, fd747, fd745; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd743, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd743, 0dBFE6A09E667F3BCD; +mul.f64 fd742, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd742, fd98; +mul.f64 fd100, fd743, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd741, fd751, fd744; +sub.f64 fd109, fd751, fd744; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd740, fd750, fd101; +sub.f64 fd113, fd750, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd739, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd738, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd736, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd733, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd732, fd736, fd733; +sub.f64 fd133, fd736, fd733; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd731, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd729, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd727, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd726, fd729, fd727; +sub.f64 fd149, fd729, fd727; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd725, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd725, 0dBFE6A09E667F3BCD; +mul.f64 fd724, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd724, fd155; +mul.f64 fd157, fd725, 
0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd723, fd732, fd726; +sub.f64 fd166, fd732, fd726; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd722, fd731, fd158; +sub.f64 fd170, fd731, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd721, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd720, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd718, fd167, 0d3FED906BCF328D46; +mul.f64 fd719, fd722, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd718, fd719; +mul.f64 fd182, fd722, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd716, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd717, fd721, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd716, fd717; +mul.f64 fd187, fd721, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd714, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd715, fd720, 0dBFED906BCF328D46; +sub.f64 fd191, fd714, fd715; +mul.f64 fd192, fd720, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd712, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd713, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd712, fd713; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd710, fd177, 0dBFED906BCF328D46; +mul.f64 fd711, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd710, fd711; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd709, fd740, fd183; 
+sub.f64 fd213, fd740, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd708, fd739, fd188; +sub.f64 fd217, fd739, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd707, fd738, fd193; +sub.f64 fd221, fd738, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd706, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd705, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd704, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd703, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd709; +mul.f64 fd244, fd238, fd709; +mul.f64 fd246, fd239, fd239; +mul.f64 fd702, fd238, fd238; +sub.f64 fd247, fd702, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd708; +mul.f64 fd252, fd247, fd708; +mul.f64 fd700, fd238, fd247; +mul.f64 fd701, fd239, fd249; +sub.f64 fd255, fd700, fd701; +mul.f64 fd699, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd707; +mul.f64 fd260, fd255, fd707; +mul.f64 fd262, fd239, fd257; +mul.f64 fd698, fd238, fd255; +sub.f64 fd263, fd698, fd262; +mul.f64 fd697, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd706; +mul.f64 fd268, fd263, fd706; +mul.f64 fd270, fd239, fd265; +mul.f64 fd696, fd238, fd263; +sub.f64 fd271, fd696, fd270; +mul.f64 fd695, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, 
fd273, fd705; +mul.f64 fd276, fd271, fd705; +mul.f64 fd693, fd238, fd271; +mul.f64 fd694, fd239, fd273; +sub.f64 fd279, fd693, fd694; +mul.f64 fd692, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd704; +mul.f64 fd284, fd279, fd704; +mul.f64 fd286, fd239, fd281; +mul.f64 fd691, fd238, fd279; +sub.f64 fd287, fd691, fd286; +mul.f64 fd690, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd703; +mul.f64 fd292, fd287, fd703; +mul.f64 fd294, fd239, fd289; +mul.f64 fd689, fd238, fd287; +sub.f64 fd295, fd689, fd294; +mul.f64 fd688, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd687, fd741, fd723; +sub.f64 fd686, fd106, fd163; +mul.f64 fd298, fd295, fd686; +mul.f64 fd299, fd297, fd687; +mul.f64 fd300, fd295, fd687; +ld.global.v2.f64 {fd301, fd302}, [rd5+256]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd684, fd238, fd301; +mul.f64 fd685, fd239, fd302; +sub.f64 fd310, fd684, fd685; +mul.f64 fd683, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd682, fd238, fd310; +sub.f64 fd318, fd682, fd317; +mul.f64 fd681, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd679, fd238, fd318; +mul.f64 fd680, fd239, fd320; +sub.f64 fd326, fd679, fd680; +mul.f64 fd678, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd676, fd238, fd326; +mul.f64 fd677, fd239, fd328; +sub.f64 fd334, fd676, fd677; +mul.f64 fd675, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 
fd341, fd239, fd336; +mul.f64 fd674, fd238, fd334; +sub.f64 fd342, fd674, fd341; +mul.f64 fd673, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd671, fd238, fd342; +mul.f64 fd672, fd239, fd344; +sub.f64 fd350, fd671, fd672; +mul.f64 fd670, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd669, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 15; +sub.f64 fd758, fd741, fd723; +mul.f64 fd757, fd297, fd758; +mov.u32 r23, %tid.x; +shl.b32 r22, r23, 8; +barrier.sync 0; +and.b32 r11, r22, 3840; +add.s32 r12, r9, r11; +sub.f64 fd760, fd741, fd723; +mul.f64 fd759, fd297, fd760; +add.f64 fd356, fd741, fd723; +sub.f64 fd756, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 15; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd670, fd243; +st.shared.v2.f64 [r12+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd699, fd251; +st.shared.v2.f64 [r12+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd697, fd259; +st.shared.v2.f64 [r12+48], {fd363, fd362}; +sub.f64 fd364, fd695, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r12+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd692, fd275; +st.shared.v2.f64 [r12+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd690, fd283; +st.shared.v2.f64 [r12+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd688, fd291; +st.shared.v2.f64 [r12+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd756, fd300; +sub.f64 fd373, fd298, fd759; +st.shared.v2.f64 [r12+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; 
+sub.f64 fd375, fd683, fd306; +st.shared.v2.f64 [r12+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd681, fd314; +st.shared.v2.f64 [r12+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd678, fd322; +st.shared.v2.f64 [r12+176], {fd379, fd378}; +sub.f64 fd380, fd675, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r12+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd673, fd338; +st.shared.v2.f64 [r12+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd669, fd346; +st.shared.v2.f64 [r12+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r12+240], {fd387, fd386}; +barrier.sync 0; +mad.lo.s32 r13, r20, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+256]; +ld.shared.v2.f64 {fd396, fd397}, [r13+512]; +ld.shared.v2.f64 {fd400, fd401}, [r13+768]; +ld.shared.v2.f64 {fd404, fd405}, [r13+1024]; +ld.shared.v2.f64 {fd408, fd409}, [r13+1280]; +ld.shared.v2.f64 {fd412, fd413}, [r13+1536]; +ld.shared.v2.f64 {fd416, fd417}, [r13+1792]; +ld.shared.v2.f64 {fd420, fd421}, [r13+2048]; +ld.shared.v2.f64 {fd424, fd425}, [r13+2304]; +ld.shared.v2.f64 {fd428, fd429}, [r13+2560]; +ld.shared.v2.f64 {fd432, fd433}, [r13+2816]; +ld.shared.v2.f64 {fd436, fd437}, [r13+3072]; +ld.shared.v2.f64 {fd440, fd441}, [r13+3328]; +ld.shared.v2.f64 {fd444, fd445}, [r13+3584]; +ld.shared.v2.f64 {fd448, fd449}, [r13+3840]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd668, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd667, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd666, fd668, fd667; +sub.f64 fd463, fd668, fd667; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd665, fd455, fd458; 
+add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd664, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd663, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd662, fd664, fd663; +sub.f64 fd479, fd664, fd663; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd661, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd661, 0dBFE6A09E667F3BCD; +mul.f64 fd660, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd660, fd485; +mul.f64 fd487, fd661, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd659, fd666, fd662; +sub.f64 fd496, fd666, fd662; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd658, fd665, fd488; +sub.f64 fd500, fd665, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd657, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd656, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd655, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd654, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd653, fd655, fd654; +sub.f64 fd520, fd655, fd654; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd652, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd651, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, 
fd416, fd448; +add.f64 fd650, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd649, fd651, fd650; +sub.f64 fd536, fd651, fd650; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd648, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd648, 0dBFE6A09E667F3BCD; +mul.f64 fd647, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd647, fd542; +mul.f64 fd544, fd648, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; +sub.f64 fd548, fd546, fd547; +add.f64 fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd646, fd653, fd649; +sub.f64 fd553, fd653, fd649; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd645, fd652, fd545; +sub.f64 fd557, fd652, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, fd519, fd536; +sub.f64 fd644, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd643, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd641, fd554, 0d3FED906BCF328D46; +mul.f64 fd642, fd645, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd641, fd642; +mul.f64 fd569, fd645, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd644, 0dBFE6A09E667F3BCD; +mul.f64 fd640, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd640, fd572; +mul.f64 fd574, fd644, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd643, 0dBFED906BCF328D46; +mul.f64 fd639, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd639, fd577; +mul.f64 fd579, fd643, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd638, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd638, fd582; +mul.f64 fd584, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; 
+mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd637, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd637, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 %0, fd493, fd550; +add.f64 %1, fd659, fd646; +add.f64 %2, fd497, fd568; +add.f64 %3, fd658, fd570; +add.f64 %5, fd657, fd575; +add.f64 %4, fd501, fd573; +add.f64 %7, fd656, fd580; +add.f64 %6, fd505, fd578; +add.f64 %8, fd495, fd553; +sub.f64 %9, fd496, fd552; +add.f64 %10, fd499, fd583; +add.f64 %11, fd500, fd585; +add.f64 %12, fd503, fd588; +add.f64 %13, fd504, fd589; +add.f64 %14, fd507, fd592; +add.f64 %15, fd508, fd594; +sub.f64 %17, fd659, fd646; +sub.f64 %16, fd493, fd550; +sub.f64 %19, fd658, fd570; +sub.f64 %18, fd497, fd568; +sub.f64 %21, fd657, fd575; +sub.f64 %20, fd501, fd573; +sub.f64 %23, fd656, fd580; +sub.f64 %22, fd505, fd578; +add.f64 %25, fd496, fd552; +sub.f64 %24, fd495, fd553; +sub.f64 %27, fd500, fd585; +sub.f64 %26, fd499, fd583; +sub.f64 %29, fd504, fd589; +sub.f64 %28, fd503, fd588; +sub.f64 %31, fd508, fd594; +sub.f64 %30, fd507, fd592; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), 
"d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<462, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<595>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, 
fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 
fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, 
fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, 
fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+256]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, 
fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1920; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+128]; +ld.shared.f64 fd390, [r13+256]; +ld.shared.f64 fd391, [r13+384]; +ld.shared.f64 fd392, [r13+512]; +ld.shared.f64 fd393, [r13+640]; +ld.shared.f64 fd394, [r13+768]; +ld.shared.f64 fd395, [r13+896]; +ld.shared.f64 fd396, [r13+1024]; +ld.shared.f64 fd397, [r13+1152]; +ld.shared.f64 fd398, [r13+1280]; 
+ld.shared.f64 fd399, [r13+1408]; +ld.shared.f64 fd400, [r13+1536]; +ld.shared.f64 fd401, [r13+1664]; +ld.shared.f64 fd402, [r13+1792]; +ld.shared.f64 fd403, [r13+1920]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+128]; +ld.shared.f64 fd406, [r13+256]; +ld.shared.f64 fd407, [r13+384]; +ld.shared.f64 fd408, [r13+512]; +ld.shared.f64 fd409, [r13+640]; +ld.shared.f64 fd410, [r13+768]; +ld.shared.f64 fd411, [r13+896]; +ld.shared.f64 fd412, [r13+1024]; +ld.shared.f64 fd413, [r13+1152]; +ld.shared.f64 fd414, [r13+1280]; +ld.shared.f64 fd415, [r13+1408]; +ld.shared.f64 fd416, [r13+1536]; +ld.shared.f64 fd417, [r13+1664]; +ld.shared.f64 fd418, [r13+1792]; +ld.shared.f64 fd419, [r13+1920]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 
fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 
0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 %0, fd461, fd518; +add.f64 %1, fd462, fd519; +add.f64 %3, fd466, fd538; +add.f64 %2, 
fd465, fd536; +add.f64 %5, fd470, fd543; +add.f64 %4, fd469, fd541; +add.f64 %7, fd474, fd548; +add.f64 %6, fd473, fd546; +sub.f64 %9, fd464, fd520; +add.f64 %8, fd463, fd521; +add.f64 %11, fd468, fd553; +add.f64 %10, fd467, fd551; +add.f64 %13, fd472, fd557; +add.f64 %12, fd471, fd556; +add.f64 %15, fd476, fd562; +add.f64 %14, fd475, fd560; +sub.f64 %16, fd461, fd518; +sub.f64 %17, fd462, fd519; +sub.f64 %19, fd466, fd538; +sub.f64 %18, fd465, fd536; +sub.f64 %21, fd470, fd543; +sub.f64 %20, fd469, fd541; +sub.f64 %23, fd474, fd548; +sub.f64 %22, fd473, fd546; +add.f64 %25, fd464, fd520; +sub.f64 %24, fd463, fd521; +sub.f64 %27, fd468, fd553; +sub.f64 %26, fd467, fd551; +sub.f64 %29, fd472, fd557; +sub.f64 %28, fd471, fd556; +sub.f64 %31, fd476, fd562; +sub.f64 %30, fd475, fd560; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), 
"d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<463, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<213>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+1024]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 4032; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; 
+ld.shared.v2.f64 {fd65, fd66}, [r13+1024]; +ld.shared.v2.f64 {fd69, fd70}, [r13+2048]; +ld.shared.v2.f64 {fd73, fd74}, [r13+3072]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+256]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3840; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+1024]; +ld.shared.v2.f64 {fd129, fd130}, [r20+2048]; +ld.shared.v2.f64 {fd133, fd134}, [r20+3072]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, 
fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 48; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+64]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 3072; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+1024]; +ld.shared.v2.f64 {fd189, fd190}, [r26+2048]; +ld.shared.v2.f64 {fd193, fd194}, [r26+3072]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +add.f64 %1, fd198, fd202; +add.f64 %0, fd197, fd201; 
+sub.f64 %3, fd200, fd203; +add.f64 %2, fd199, fd204; +sub.f64 %5, fd198, fd202; +sub.f64 %4, fd197, fd201; +add.f64 %7, fd200, fd203; +sub.f64 %6, fd199, fd204; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<465, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<379>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; 
+add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+512]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; 
+mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 3968; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+512]; +ld.shared.v2.f64 {fd166, fd167}, [r13+1024]; +ld.shared.v2.f64 {fd170, fd171}, [r13+1536]; +ld.shared.v2.f64 {fd174, fd175}, [r13+2048]; +ld.shared.v2.f64 {fd178, fd179}, [r13+2560]; +ld.shared.v2.f64 {fd182, fd183}, [r13+3072]; +ld.shared.v2.f64 {fd186, fd187}, [r13+3584]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 
fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 24; +bfe.u32 r15, r5, 3, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; 
+mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+64]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3072; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+512]; +ld.shared.v2.f64 {fd323, fd324}, [r20+1024]; 
+ld.shared.v2.f64 {fd327, fd328}, [r20+1536]; +ld.shared.v2.f64 {fd331, fd332}, [r20+2048]; +ld.shared.v2.f64 {fd335, fd336}, [r20+2560]; +ld.shared.v2.f64 {fd339, fd340}, [r20+3072]; +ld.shared.v2.f64 {fd343, fd344}, [r20+3584]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd319, fd335; +add.f64 fd356, fd320, fd336; +sub.f64 fd357, fd319, fd335; +sub.f64 fd358, fd320, fd336; +add.f64 fd359, fd327, fd343; +add.f64 fd360, fd328, fd344; +sub.f64 fd361, fd327, fd343; +sub.f64 fd362, fd328, fd344; +add.f64 %1, fd348, fd352; +add.f64 %0, fd347, fd351; +add.f64 %3, fd356, fd360; +add.f64 %2, fd355, fd359; +sub.f64 %5, fd350, fd353; +add.f64 %4, fd349, fd354; +sub.f64 %7, fd358, fd361; +add.f64 %6, fd357, fd362; +sub.f64 %9, fd348, fd352; +sub.f64 %8, fd347, fd351; +sub.f64 %11, fd356, fd360; +sub.f64 %10, fd355, fd359; +add.f64 %13, fd350, fd353; +sub.f64 %12, fd349, fd354; +add.f64 %15, fd358, fd361; +sub.f64 %14, fd357, fd362; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<464, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 
fd<189>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+1024]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2016; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+512]; +ld.shared.f64 fd63, [r13+1024]; +ld.shared.f64 fd64, [r13+1536]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+512]; +ld.shared.f64 fd67, [r13+1024]; +ld.shared.f64 fd68, [r13+1536]; +add.f64 fd69, fd61, fd63; +add.f64 
fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+256]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1920; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+512]; +ld.shared.f64 fd115, [r21+1024]; +ld.shared.f64 fd116, [r21+1536]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+512]; +ld.shared.f64 fd119, [r21+1024]; +ld.shared.f64 fd120, [r21+1536]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, 
fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 48; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+64]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 1536; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 [r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+512]; +ld.shared.f64 fd167, [r27+1024]; +ld.shared.f64 fd168, [r27+1536]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+512]; +ld.shared.f64 fd171, [r27+1024]; +ld.shared.f64 fd172, [r27+1536]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 
fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 %0, fd173, fd177; +add.f64 %1, fd174, fd178; +sub.f64 %3, fd176, fd179; +add.f64 %2, fd175, fd180; +sub.f64 %4, fd173, fd177; +sub.f64 %5, fd174, fd178; +add.f64 %7, fd176, fd179; +sub.f64 %6, fd175, fd180; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<467, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<56>; +.reg .f64 fd<160>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %12, %14; +sub.f64 fd10, %13, %15; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 4064; +add.s32 r11, r8, r10; +add.f64 fd18, %13, %15; +add.f64 fd19, %12, %14; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 2032; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+2048]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 
{fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4032; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 2016; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+2048]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 3968; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 1984; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+2048]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 4; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3840; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 1920; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, 
fd90}, [r34+2048]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 112; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3584; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 1792; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+2048]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd116, fd114; +mul.f64 fd121, fd117, fd115; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3072; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd117, fd114, fd122; +sub.f64 fd126, fd120, fd121; +st.shared.v2.f64 [r46+512], {fd126, fd125}; +barrier.sync 0; +and.b32 r47, r9, 1536; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+2048]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd137, fd135; +mul.f64 fd142, fd138, fd136; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 2048; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; 
+st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd138, fd135, fd143; +sub.f64 fd147, fd141, fd142; +st.shared.v2.f64 [r53+1024], {fd147, fd146}; +barrier.sync 0; +and.b32 r54, r9, 1024; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+2048]; +add.f64 %1, fd149, fd153; +add.f64 %0, fd148, fd152; +sub.f64 %3, fd149, fd153; +sub.f64 %2, fd148, fd152; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<466, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<56>; +.reg .f64 fd<132>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %12, %14; +add.f64 fd10, %13, %15; +sub.f64 fd11, %12, %14; +sub.f64 fd12, %13, %15; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 2032; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 1016; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+1024]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+1024]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 16; 
+mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2016; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 1008; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+1024]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+1024]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1984; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 992; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+1024]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+1024]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 4; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1920; 
+add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 960; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+1024]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+1024]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 112; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1792; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 896; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+1024]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+1024]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd98, fd96; +mul.f64 fd103, fd99, fd97; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd98, fd97; +fma.rn.f64 fd106, fd99, fd96, fd105; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1536; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd104; +barrier.sync 0; +and.b32 r47, r11, 768; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+1024]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; 
+ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+1024]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd115, fd113; +mul.f64 fd120, fd116, fd114; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd115, fd114; +fma.rn.f64 fd123, fd116, fd113, fd122; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 1024; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd121; +barrier.sync 0; +and.b32 r54, r11, 512; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+1024]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+1024]; +add.f64 %0, fd124, fd125; +add.f64 %1, fd126, fd127; +sub.f64 %2, fd124, fd125; +sub.f64 %3, fd126, fd127; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..c5efb2ed4ba3a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_256_fp64_inv.hpp.inc @@ -0,0 +1,2759 @@ +#ifndef CUFFTDX_FFT_256_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_256_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<631, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<347>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; 
+ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+512]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -2048; +add.s32 r10, r4, 
r9; +barrier.sync 0; +and.b32 r11, r8, 1984; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+256]; +ld.shared.f64 fd160, [r13+512]; +ld.shared.f64 fd161, [r13+768]; +ld.shared.f64 fd162, [r13+1024]; +ld.shared.f64 fd163, [r13+1280]; +ld.shared.f64 fd164, [r13+1536]; +ld.shared.f64 fd165, [r13+1792]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+256]; +ld.shared.f64 fd168, [r13+512]; +ld.shared.f64 fd169, [r13+768]; +ld.shared.f64 fd170, [r13+1024]; +ld.shared.f64 fd171, [r13+1280]; +ld.shared.f64 fd172, [r13+1536]; +ld.shared.f64 fd173, [r13+1792]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, 
fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 24; +bfe.u32 r15, r5, 3, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, 
fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+64]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1536; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+256]; +ld.shared.f64 fd301, [r21+512]; +ld.shared.f64 fd302, [r21+768]; +ld.shared.f64 fd303, [r21+1024]; +ld.shared.f64 fd304, [r21+1280]; +ld.shared.f64 fd305, [r21+1536]; +ld.shared.f64 fd306, [r21+1792]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, 
[r21+256]; +ld.shared.f64 fd309, [r21+512]; +ld.shared.f64 fd310, [r21+768]; +ld.shared.f64 fd311, [r21+1024]; +ld.shared.f64 fd312, [r21+1280]; +ld.shared.f64 fd313, [r21+1536]; +ld.shared.f64 fd314, [r21+1792]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd300, fd304; +add.f64 fd324, fd308, fd312; +sub.f64 fd325, fd300, fd304; +sub.f64 fd326, fd308, fd312; +add.f64 fd327, fd302, fd306; +add.f64 fd328, fd310, fd314; +sub.f64 fd329, fd302, fd306; +sub.f64 fd330, fd310, fd314; +add.f64 %0, fd315, fd319; +add.f64 %1, fd316, fd320; +add.f64 %2, fd323, fd327; +add.f64 %3, fd324, fd328; +add.f64 %5, fd318, fd321; +sub.f64 %4, fd317, fd322; +add.f64 %7, fd326, fd329; +sub.f64 %6, fd325, fd330; +sub.f64 %8, fd315, fd319; +sub.f64 %9, fd316, fd320; +sub.f64 %10, fd323, fd327; +sub.f64 %11, fd324, fd328; +sub.f64 %13, fd318, fd321; +add.f64 %12, fd317, fd322; +sub.f64 %15, fd326, fd329; +add.f64 %14, fd325, fd330; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<632, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<20>; +.reg .f64 fd<763>; +.reg .b64 
rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd757, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd755, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd754, fd757, fd755; +sub.f64 fd76, fd757, fd755; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd753, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd750, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd748, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd747, fd750, fd748; +sub.f64 fd92, fd750, fd748; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd746, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd746, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd744, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd745, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd744, fd745; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd743, fd754, fd747; +sub.f64 fd109, fd754, fd747; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd742, fd753, fd100; +sub.f64 fd113, fd753, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd741, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd740, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd738, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd735, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, 
fd122, fd126; +add.f64 fd734, fd738, fd735; +sub.f64 fd133, fd738, fd735; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd733, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd731, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd729, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd728, fd731, fd729; +sub.f64 fd149, fd731, fd729; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd727, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd727, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd725, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd726, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd725, fd726; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd724, fd734, fd728; +sub.f64 fd166, fd734, fd728; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd723, fd733, fd157; +sub.f64 fd170, fd733, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd722, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd721, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd719, fd167, 0d3FED906BCF328D46; +mul.f64 fd720, fd723, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd719, fd720; +mul.f64 fd182, fd723, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd722, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd721, 0d3FED906BCF328D46; +mul.f64 fd718, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd718, fd189; +mul.f64 fd191, fd721, 0d3FD87DE2A6AEA963; 
+fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd717, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd717, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd715, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd716, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd715, fd716; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd713, fd177, 0dBFED906BCF328D46; +mul.f64 fd714, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd713, fd714; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd712, fd742, fd183; +sub.f64 fd213, fd742, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd711, fd741, fd187; +sub.f64 fd217, fd741, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd710, fd740, fd192; +sub.f64 fd221, fd740, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd709, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd708, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd707, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd706, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd712, fd239; +mul.f64 fd244, fd238, fd712; +mul.f64 fd246, fd239, fd239; +mul.f64 fd705, fd238, fd238; +sub.f64 fd247, fd705, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, 
fd238, fd248; +mul.f64 fd250, fd711, fd249; +mul.f64 fd252, fd247, fd711; +mul.f64 fd703, fd238, fd247; +mul.f64 fd704, fd239, fd249; +sub.f64 fd255, fd703, fd704; +mul.f64 fd702, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd710, fd257; +mul.f64 fd260, fd255, fd710; +mul.f64 fd262, fd239, fd257; +mul.f64 fd701, fd238, fd255; +sub.f64 fd263, fd701, fd262; +mul.f64 fd700, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd709, fd265; +mul.f64 fd268, fd263, fd709; +mul.f64 fd270, fd239, fd265; +mul.f64 fd699, fd238, fd263; +sub.f64 fd271, fd699, fd270; +mul.f64 fd698, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd708, fd273; +mul.f64 fd276, fd271, fd708; +mul.f64 fd696, fd238, fd271; +mul.f64 fd697, fd239, fd273; +sub.f64 fd279, fd696, fd697; +mul.f64 fd695, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd707, fd281; +mul.f64 fd284, fd279, fd707; +mul.f64 fd286, fd239, fd281; +mul.f64 fd694, fd238, fd279; +sub.f64 fd287, fd694, fd286; +mul.f64 fd693, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd706, fd289; +mul.f64 fd292, fd287, fd706; +mul.f64 fd294, fd239, fd289; +mul.f64 fd692, fd238, fd287; +sub.f64 fd295, fd692, fd294; +mul.f64 fd691, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd690, fd743, fd724; +mul.f64 fd298, fd690, fd297; +sub.f64 fd689, fd106, fd163; +mul.f64 fd299, fd689, fd297; +mul.f64 fd300, fd295, fd690; +ld.global.v2.f64 {fd301, fd302}, [rd5+256]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd687, fd238, fd301; +mul.f64 fd688, fd239, fd302; +sub.f64 fd310, fd687, fd688; +mul.f64 fd686, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 
fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd685, fd238, fd310; +sub.f64 fd318, fd685, fd317; +mul.f64 fd684, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd682, fd238, fd318; +mul.f64 fd683, fd239, fd320; +sub.f64 fd326, fd682, fd683; +mul.f64 fd681, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd679, fd238, fd326; +mul.f64 fd680, fd239, fd328; +sub.f64 fd334, fd679, fd680; +mul.f64 fd678, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd677, fd238, fd334; +sub.f64 fd342, fd677, fd341; +mul.f64 fd676, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd674, fd238, fd342; +mul.f64 fd675, fd239, fd344; +sub.f64 fd350, fd674, fd675; +mul.f64 fd673, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd672, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 3840; +add.s32 r12, r9, r11; +sub.f64 fd762, fd743, fd724; +mul.f64 fd761, fd295, fd762; +add.f64 fd356, fd743, fd724; +sub.f64 fd760, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 15; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd672; +st.shared.v2.f64 [r12+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd702; +st.shared.v2.f64 [r12+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd700; +st.shared.v2.f64 
[r12+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd698; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r12+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd695; +st.shared.v2.f64 [r12+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd693; +st.shared.v2.f64 [r12+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd691; +st.shared.v2.f64 [r12+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd760, fd298; +sub.f64 fd373, fd761, fd299; +st.shared.v2.f64 [r12+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd686; +st.shared.v2.f64 [r12+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd684; +st.shared.v2.f64 [r12+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd681; +st.shared.v2.f64 [r12+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd678; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r12+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd676; +st.shared.v2.f64 [r12+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd673; +st.shared.v2.f64 [r12+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r12+240], {fd386, fd387}; +barrier.sync 0; +mad.lo.s32 r13, r18, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+256]; +ld.shared.v2.f64 {fd396, fd397}, [r13+512]; +ld.shared.v2.f64 {fd400, fd401}, [r13+768]; +ld.shared.v2.f64 {fd404, fd405}, [r13+1024]; +ld.shared.v2.f64 {fd408, fd409}, [r13+1280]; +ld.shared.v2.f64 {fd412, fd413}, [r13+1536]; +ld.shared.v2.f64 {fd416, fd417}, [r13+1792]; +ld.shared.v2.f64 {fd420, fd421}, [r13+2048]; +ld.shared.v2.f64 {fd424, fd425}, [r13+2304]; +ld.shared.v2.f64 {fd428, fd429}, [r13+2560]; +ld.shared.v2.f64 {fd432, fd433}, [r13+2816]; 
+ld.shared.v2.f64 {fd436, fd437}, [r13+3072]; +ld.shared.v2.f64 {fd440, fd441}, [r13+3328]; +ld.shared.v2.f64 {fd444, fd445}, [r13+3584]; +ld.shared.v2.f64 {fd448, fd449}, [r13+3840]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd671, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd670, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd669, fd671, fd670; +sub.f64 fd463, fd671, fd670; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd668, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd667, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd666, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd665, fd667, fd666; +sub.f64 fd479, fd667, fd666; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd664, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd664, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd663, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd663, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd662, fd669, fd665; +sub.f64 fd496, fd669, fd665; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd661, fd668, fd487; +sub.f64 fd500, fd668, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd660, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd659, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, 
fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd658, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd657, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd656, fd658, fd657; +sub.f64 fd520, fd658, fd657; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd655, fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd654, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd653, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd652, fd654, fd653; +sub.f64 fd536, fd654, fd653; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd651, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd651, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd650, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd650, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd649, fd656, fd652; +sub.f64 fd553, fd656, fd652; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd648, fd655, fd544; +sub.f64 fd557, fd655, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd647, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd646, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd644, fd554, 0d3FED906BCF328D46; +mul.f64 fd645, fd648, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd644, fd645; +mul.f64 fd569, fd648, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; 
+mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd647, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd642, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd643, fd646, 0d3FED906BCF328D46; +sub.f64 fd577, fd642, fd643; +mul.f64 fd578, fd646, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd640, fd556, 0dBFD87DE2A6AEA963; +mul.f64 fd641, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd640, fd641; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd638, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd639, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd638, fd639; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd637, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd637, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; +add.f64 %0, fd493, fd550; +add.f64 %1, fd662, fd649; +add.f64 %2, fd497, fd568; +add.f64 %3, fd661, fd570; +add.f64 %5, fd660, fd574; +add.f64 %4, fd501, fd573; +add.f64 %7, fd659, fd579; +add.f64 %6, fd505, fd577; +sub.f64 %8, fd495, fd553; +add.f64 %9, fd496, fd552; +add.f64 %10, fd499, fd582; +add.f64 %11, fd500, fd584; +add.f64 %12, fd503, fd587; +add.f64 %13, fd504, fd589; +add.f64 %14, fd507, fd592; +add.f64 %15, fd508, fd594; +sub.f64 %17, fd662, fd649; +sub.f64 %16, fd493, fd550; +sub.f64 %19, fd661, fd570; +sub.f64 %18, fd497, fd568; +sub.f64 %21, fd660, fd574; +sub.f64 %20, fd501, fd573; +sub.f64 %23, fd659, fd579; +sub.f64 %22, fd505, fd577; +sub.f64 %25, fd496, fd552; +add.f64 %24, fd495, fd553; +sub.f64 %27, fd500, fd584; +sub.f64 %26, fd499, fd582; +sub.f64 %29, fd504, fd589; +sub.f64 %28, fd503, fd587; +sub.f64 %31, fd508, fd594; +sub.f64 %30, fd507, fd592; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), 
"=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<633, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<595>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 
fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; 
+sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, 
fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, 
fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+256]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, 
fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1920; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, 
fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+128]; +ld.shared.f64 fd390, [r13+256]; +ld.shared.f64 fd391, [r13+384]; +ld.shared.f64 fd392, [r13+512]; +ld.shared.f64 fd393, [r13+640]; +ld.shared.f64 fd394, [r13+768]; +ld.shared.f64 fd395, [r13+896]; +ld.shared.f64 fd396, [r13+1024]; +ld.shared.f64 fd397, [r13+1152]; +ld.shared.f64 fd398, [r13+1280]; +ld.shared.f64 fd399, [r13+1408]; +ld.shared.f64 fd400, [r13+1536]; +ld.shared.f64 fd401, [r13+1664]; +ld.shared.f64 fd402, [r13+1792]; +ld.shared.f64 fd403, [r13+1920]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+128]; +ld.shared.f64 fd406, [r13+256]; +ld.shared.f64 fd407, [r13+384]; +ld.shared.f64 fd408, [r13+512]; +ld.shared.f64 fd409, [r13+640]; +ld.shared.f64 fd410, [r13+768]; +ld.shared.f64 fd411, [r13+896]; +ld.shared.f64 fd412, [r13+1024]; +ld.shared.f64 fd413, [r13+1152]; +ld.shared.f64 fd414, [r13+1280]; +ld.shared.f64 fd415, [r13+1408]; +ld.shared.f64 fd416, [r13+1536]; +ld.shared.f64 fd417, [r13+1664]; +ld.shared.f64 fd418, [r13+1792]; +ld.shared.f64 fd419, [r13+1920]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; 
+add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, 
fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 
fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 %0, fd461, fd518; +add.f64 %1, fd462, fd519; +add.f64 %3, fd466, fd538; +add.f64 %2, fd465, fd536; +add.f64 %5, fd470, fd542; +add.f64 %4, fd469, fd541; +add.f64 %7, fd474, fd547; +add.f64 %6, fd473, fd545; +add.f64 %9, fd464, fd520; +sub.f64 %8, fd463, fd521; +add.f64 %11, fd468, fd552; +add.f64 %10, fd467, fd550; +add.f64 %13, fd472, fd557; +add.f64 %12, fd471, fd555; +add.f64 %15, fd476, fd562; +add.f64 %14, fd475, fd560; +sub.f64 %16, fd461, fd518; +sub.f64 %17, fd462, fd519; +sub.f64 %19, fd466, fd538; +sub.f64 %18, fd465, fd536; +sub.f64 %21, fd470, fd542; +sub.f64 %20, fd469, fd541; +sub.f64 %23, fd474, fd547; +sub.f64 %22, fd473, fd545; +sub.f64 %25, fd464, fd520; +add.f64 %24, fd463, fd521; +sub.f64 %27, fd468, fd552; +sub.f64 %26, fd467, fd550; +sub.f64 %29, fd472, fd557; +sub.f64 %28, fd471, fd555; +sub.f64 %31, fd476, fd562; +sub.f64 %30, fd475, fd560; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): 
"r"(smem), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<634, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<27>; +.reg .f64 fd<213>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+1024]; 
+mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 4032; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+1024]; +ld.shared.v2.f64 {fd69, fd70}, [r13+2048]; +ld.shared.v2.f64 {fd73, fd74}, [r13+3072]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+256]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3840; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; 
+sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+1024]; +ld.shared.v2.f64 {fd129, fd130}, [r20+2048]; +ld.shared.v2.f64 {fd133, fd134}, [r20+3072]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 48; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+64]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 3072; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; 
+fma.rn.f64 fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+1024]; +ld.shared.v2.f64 {fd189, fd190}, [r26+2048]; +ld.shared.v2.f64 {fd193, fd194}, [r26+3072]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +add.f64 %1, fd198, fd202; +add.f64 %0, fd197, fd201; +add.f64 %3, fd200, fd203; +sub.f64 %2, fd199, fd204; +sub.f64 %5, fd198, fd202; +sub.f64 %4, fd197, fd201; +sub.f64 %7, fd200, fd203; +add.f64 %6, fd199, fd204; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + +/* Machine-generated inline-PTX specialization (id 636, double). Operands: %0-%15 write rmem[0..7].x/.y, %16 is smem (32-bit shared-memory address; the asm adds tid.y << 12 to it), %17/%18 are lut_dp_8_256 and lut_dp_8_32, and %19 onward read rmem[0..7] (the .y of elements 1, 2, 4 and 5 is bound twice, so the %N indices in the PTX skip numbers and must stay in sync with the constraint list). Flow visible in the PTX: two rounds of 8-input butterfly, multiplication by factors built from two entries of that round's table, and an exchange of eight complex values through shared memory between a pair of barrier.sync 0 instructions (four barriers in total), followed by two 4-input butterflies that write the outputs. NOTE(review): presumably a 256-point double-precision FFT done as radix-8, radix-8, radix-4, and every thread using barrier 0 must reach this call - confirm against the cuFFTDx generator and the call sites. */ +template<> __forceinline__ __device__ void cufftdx_private_function<636, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<379>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 
fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 
fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+512]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 3968; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+512]; +ld.shared.v2.f64 {fd166, fd167}, [r13+1024]; +ld.shared.v2.f64 {fd170, fd171}, [r13+1536]; +ld.shared.v2.f64 {fd174, fd175}, [r13+2048]; +ld.shared.v2.f64 {fd178, fd179}, [r13+2560]; +ld.shared.v2.f64 {fd182, fd183}, 
[r13+3072]; +ld.shared.v2.f64 {fd186, fd187}, [r13+3584]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 24; +bfe.u32 r15, r5, 3, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, 
fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+64]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3072; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 
fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+512]; +ld.shared.v2.f64 {fd323, fd324}, [r20+1024]; +ld.shared.v2.f64 {fd327, fd328}, [r20+1536]; +ld.shared.v2.f64 {fd331, fd332}, [r20+2048]; +ld.shared.v2.f64 {fd335, fd336}, [r20+2560]; +ld.shared.v2.f64 {fd339, fd340}, [r20+3072]; +ld.shared.v2.f64 {fd343, fd344}, [r20+3584]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd319, fd335; +add.f64 fd356, fd320, fd336; +sub.f64 fd357, fd319, fd335; +sub.f64 fd358, fd320, fd336; +add.f64 fd359, fd327, fd343; +add.f64 fd360, fd328, fd344; +sub.f64 fd361, fd327, fd343; +sub.f64 fd362, fd328, fd344; +add.f64 %1, fd348, fd352; +add.f64 %0, fd347, fd351; +add.f64 %3, fd356, fd360; +add.f64 %2, fd355, fd359; +add.f64 %5, fd350, fd353; +sub.f64 %4, fd349, fd354; +add.f64 %7, fd358, fd361; +sub.f64 %6, fd357, fd362; +sub.f64 %9, fd348, fd352; +sub.f64 %8, fd347, fd351; +sub.f64 %11, fd356, fd360; +sub.f64 %10, fd355, fd359; +sub.f64 %13, fd350, fd353; +add.f64 %12, fd349, fd354; +sub.f64 %15, fd358, fd361; +add.f64 %14, fd357, fd362; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), 
"=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_256), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<635, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<189>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %12, %17; +add.f64 fd18, %13, %19; +sub.f64 fd19, %12, %17; +sub.f64 fd20, %13, %19; +add.f64 fd21, %14, %20; +add.f64 fd22, %16, %21; +sub.f64 fd23, %14, %20; +sub.f64 fd24, %16, %21; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+1024]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, 
r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2016; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+512]; +ld.shared.f64 fd63, [r13+1024]; +ld.shared.f64 fd64, [r13+1536]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+512]; +ld.shared.f64 fd67, [r13+1024]; +ld.shared.f64 fd68, [r13+1536]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 60; +bfe.u32 r15, r5, 2, 4; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+256]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1920; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 
[r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+512]; +ld.shared.f64 fd115, [r21+1024]; +ld.shared.f64 fd116, [r21+1536]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+512]; +ld.shared.f64 fd119, [r21+1024]; +ld.shared.f64 fd120, [r21+1536]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 48; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+64]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 1536; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; 
+st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+512]; +ld.shared.f64 fd167, [r27+1024]; +ld.shared.f64 fd168, [r27+1536]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+512]; +ld.shared.f64 fd171, [r27+1024]; +ld.shared.f64 fd172, [r27+1536]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 %0, fd173, fd177; +add.f64 %1, fd174, fd178; +add.f64 %3, fd176, fd179; +sub.f64 %2, fd175, fd180; +sub.f64 %4, fd173, fd177; +sub.f64 %5, fd174, fd178; +sub.f64 %7, fd176, fd179; +add.f64 %6, fd175, fd180; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<638, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<56>; +.reg .f64 fd<160>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %12, %14; +sub.f64 fd10, %13, %15; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, 
fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 4064; +add.s32 r11, r8, r10; +add.f64 fd18, %13, %15; +add.f64 fd19, %12, %14; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 2032; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+2048]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4032; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 2016; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+2048]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 3968; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 1984; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+2048]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 4; 
+mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3840; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 1920; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+2048]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 112; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3584; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; +sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 1792; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+2048]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd115, fd117; +mul.f64 fd121, fd114, fd117; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3072; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd116, fd114, fd120; +sub.f64 fd126, fd122, fd121; +st.shared.v2.f64 [r46+512], 
{fd125, fd126}; +barrier.sync 0; +and.b32 r47, r9, 1536; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+2048]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd136, fd138; +mul.f64 fd142, fd135, fd138; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 2048; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd137, fd135, fd141; +sub.f64 fd147, fd143, fd142; +st.shared.v2.f64 [r53+1024], {fd146, fd147}; +barrier.sync 0; +and.b32 r54, r9, 1024; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+2048]; +add.f64 %1, fd149, fd153; +add.f64 %0, fd148, fd152; +sub.f64 %3, fd149, fd153; +sub.f64 %2, fd148, fd152; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<637, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<56>; +.reg .f64 fd<132>; +.reg .b64 rd<24>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %12, %14; +add.f64 fd10, %13, %15; +sub.f64 fd11, %12, %14; +sub.f64 fd12, %13, %15; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 
fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 2032; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 1016; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+1024]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+1024]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 6; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2016; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 1008; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+1024]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+1024]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 5; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 1984; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 992; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+1024]; 
+barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+1024]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 4; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1920; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 960; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+1024]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+1024]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 112; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1792; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 896; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+1024]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+1024]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 2; +mul.wide.u32 rd18, r42, 16; 
+mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd97, fd99; +fma.rn.f64 fd103, fd98, fd96, fd102; +mul.f64 fd104, fd96, fd99; +mul.f64 fd105, fd98, fd97; +sub.f64 fd106, fd105, fd104; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1536; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd103; +barrier.sync 0; +and.b32 r47, r11, 768; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+1024]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+1024]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 1; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd114, fd116; +fma.rn.f64 fd120, fd115, fd113, fd119; +mul.f64 fd121, fd113, fd116; +mul.f64 fd122, fd115, fd114; +sub.f64 fd123, fd122, fd121; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 1024; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd120; +barrier.sync 0; +and.b32 r54, r11, 512; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+1024]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+1024]; +add.f64 %0, fd124, fd125; +add.f64 %1, fd126, fd127; +sub.f64 %2, fd124, fd125; +sub.f64 %3, fd126, fd127; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..35b4019596f74 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_fwd.hpp.inc @@ -0,0 +1,5652 @@ +#ifndef CUFFTDX_FFT_25_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_25_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<901, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<129>; +.reg .b32 r<3427>; +.reg .f64 fd<109>; +.reg .b64 rd<3>; +mov.f64 fd107, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd107; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd108, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd108; +} +mov.b32 r228, {rs2, rs2}; +mov.f64 fd105, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd105; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd106, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd106; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd107; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd108; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %56, %53; +} +{ +add.f16x2 r4, %57, r1; +} +{ +add.f16x2 r7, %59, %55; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %51, %58; +} +{ +add.f16x2 r16, %52, r13; +} +{ +add.f16x2 r19, %54, %50; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %56, %53; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %57, r28; +} +{ +add.f16x2 r34, %59, %55; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %51, %58; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %54, %50; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, 
%56, %53; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %57, r64; +} +{ +add.f16x2 r70, %59, %55; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %51, %58; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %54, %50; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %56, %53; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %57, r100; +} +{ +add.f16x2 r106, %59, %55; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %51, %58; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %54, %50; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %56, %53; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %57, r136; +} +{ +add.f16x2 r142, %59, %55; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %51, %58; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %54, %50; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %51, %58; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %52, r172; +} +{ +add.f16x2 r178, %54, %50; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %56, %53; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %59, %55; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %51, %58; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %52, r208; +} +{ +add.f16x2 r214, %54, %50; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %56, %53; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %59, %55; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, 
r220, r235; +} +{ +add.f16x2 r241, %51, %58; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %52, r244; +} +{ +add.f16x2 r250, %54, %50; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %56, %53; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %59, %55; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %51, %58; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %52, r280; +} +{ +add.f16x2 r286, %54, %50; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %56, %53; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %59, %55; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs9, fd107; +} +mov.b32 r522, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd108; +} +mov.b32 r540, {rs10, rs10}; +{ +cvt.rn.f16.f64 rs11, fd105; +} +mov.b32 r594, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd106; +} +mov.b32 r612, {rs12, rs12}; +{ +cvt.rn.f16.f64 rs13, fd107; +} +mov.b32 r603, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd108; +} +{ +neg.f16 rs15, rs14; +} +mov.b32 r618, {rs15, rs15}; +{ +add.f16x2 r313, %64, %61; +} +{ +add.f16x2 r316, %65, r313; +} +{ +add.f16x2 r319, %67, %63; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %69, %66; +} +{ +add.f16x2 r328, %60, r325; +} +{ +add.f16x2 r331, %62, %68; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %64, %61; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %65, r340; +} +{ +add.f16x2 r346, %67, %63; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %69, %66; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %62, %68; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %64, %61; +} +{ +mul.f16x2 r376, r373, r522; +} +{ 
+add.f16x2 r379, %65, r376; +} +{ +add.f16x2 r382, %67, %63; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %69, %66; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %62, %68; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %64, %61; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %65, r412; +} +{ +add.f16x2 r418, %67, %63; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %69, %66; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %62, %68; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %64, %61; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %65, r448; +} +{ +add.f16x2 r454, %67, %63; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %69, %66; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %62, %68; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %69, %66; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %60, r484; +} +{ +add.f16x2 r490, %62, %68; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %64, %61; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %67, %63; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %69, %66; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %60, r520; +} +{ +add.f16x2 r526, %62, %68; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %64, %61; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %67, %63; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 
r553, %69, %66; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %60, r556; +} +{ +add.f16x2 r562, %62, %68; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %64, %61; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %67, %63; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %69, %66; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %60, r592; +} +{ +add.f16x2 r598, %62, %68; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %64, %61; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %67, %63; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs17, fd107; +} +mov.b32 r834, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd108; +} +mov.b32 r852, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd105; +} +mov.b32 r906, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd106; +} +mov.b32 r924, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd107; +} +mov.b32 r915, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd108; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r930, {rs23, rs23}; +{ +add.f16x2 r625, %71, %78; +} +{ +add.f16x2 r628, %72, r625; +} +{ +add.f16x2 r631, %74, %70; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %76, %73; +} +{ +add.f16x2 r640, %77, r637; +} +{ +add.f16x2 r643, %79, %75; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %71, %78; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %72, r652; +} +{ +add.f16x2 r658, %74, %70; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %76, %73; +} +{ +mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %79, %75; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %71, %78; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %72, r688; 
+} +{ +add.f16x2 r694, %74, %70; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %76, %73; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %79, %75; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 r721, %71, %78; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %72, r724; +} +{ +add.f16x2 r730, %74, %70; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %76, %73; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %79, %75; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %71, %78; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %72, r760; +} +{ +add.f16x2 r766, %74, %70; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %76, %73; +} +{ +mul.f16x2 r778, r775, r924; +} +{ +sub.f16x2 r781, %79, %75; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %76, %73; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %77, r796; +} +{ +add.f16x2 r802, %79, %75; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %71, %78; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %74, %70; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %76, %73; +} +{ +mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %77, r832; +} +{ +add.f16x2 r838, %79, %75; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 r844, r835, r841; +} +{ +sub.f16x2 r847, %71, %78; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %74, %70; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %76, %73; +} +{ 
+mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %77, r868; +} +{ +add.f16x2 r874, %79, %75; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %71, %78; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %74, %70; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %76, %73; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %77, r904; +} +{ +add.f16x2 r910, %79, %75; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %71, %78; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %74, %70; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +{ +cvt.rn.f16.f64 rs25, fd107; +} +mov.b32 r1146, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd108; +} +mov.b32 r1164, {rs26, rs26}; +{ +cvt.rn.f16.f64 rs27, fd105; +} +mov.b32 r1218, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs28, fd106; +} +mov.b32 r1236, {rs28, rs28}; +{ +cvt.rn.f16.f64 rs29, fd107; +} +mov.b32 r1227, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd108; +} +{ +neg.f16 rs31, rs30; +} +mov.b32 r1242, {rs31, rs31}; +{ +add.f16x2 r937, %89, %86; +} +{ +add.f16x2 r940, %80, r937; +} +{ +add.f16x2 r943, %82, %88; +} +{ +add.f16x2 r946, r940, r943; +} +{ +add.f16x2 r949, %84, %81; +} +{ +add.f16x2 r952, %85, r949; +} +{ +add.f16x2 r955, %87, %83; +} +{ +add.f16x2 r958, r952, r955; +} +{ +add.f16x2 r961, %89, %86; +} +{ +mul.f16x2 r964, r961, r1146; +} +{ +add.f16x2 r967, %80, r964; +} +{ +add.f16x2 r970, %82, %88; +} +{ +mul.f16x2 r973, r970, r1218; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %84, %81; +} +{ +mul.f16x2 r982, r979, r1164; +} +{ +sub.f16x2 r985, %87, %83; +} +{ +mul.f16x2 r988, r985, r1236; +} +{ +add.f16x2 r991, r982, r988; +} +{ +sub.f16x2 r994, r976, r991; +} +{ +add.f16x2 r997, %89, %86; +} +{ +mul.f16x2 r1000, r997, r1146; +} +{ +add.f16x2 r1003, %80, r1000; +} +{ 
+add.f16x2 r1006, %82, %88; +} +{ +mul.f16x2 r1009, r1006, r1218; +} +{ +add.f16x2 r1012, r1003, r1009; +} +{ +sub.f16x2 r1015, %84, %81; +} +{ +mul.f16x2 r1018, r1015, r1164; +} +{ +sub.f16x2 r1021, %87, %83; +} +{ +mul.f16x2 r1024, r1021, r1236; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1012, r1027; +} +{ +add.f16x2 r1033, %89, %86; +} +{ +mul.f16x2 r1036, r1033, r1218; +} +{ +add.f16x2 r1039, %80, r1036; +} +{ +add.f16x2 r1042, %82, %88; +} +{ +mul.f16x2 r1045, r1042, r1227; +} +{ +add.f16x2 r1048, r1039, r1045; +} +{ +sub.f16x2 r1051, %84, %81; +} +{ +mul.f16x2 r1054, r1051, r1236; +} +{ +sub.f16x2 r1057, %87, %83; +} +{ +mul.f16x2 r1060, r1057, r1242; +} +{ +add.f16x2 r1063, r1054, r1060; +} +{ +sub.f16x2 r1066, r1048, r1063; +} +{ +add.f16x2 r1069, %89, %86; +} +{ +mul.f16x2 r1072, r1069, r1218; +} +{ +add.f16x2 r1075, %80, r1072; +} +{ +add.f16x2 r1078, %82, %88; +} +{ +mul.f16x2 r1081, r1078, r1227; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %84, %81; +} +{ +mul.f16x2 r1090, r1087, r1236; +} +{ +sub.f16x2 r1093, %87, %83; +} +{ +mul.f16x2 r1096, r1093, r1242; +} +{ +add.f16x2 r1099, r1090, r1096; +} +{ +add.f16x2 r1102, r1084, r1099; +} +{ +add.f16x2 r1105, %84, %81; +} +{ +mul.f16x2 r1108, r1105, r1146; +} +{ +add.f16x2 r1111, %85, r1108; +} +{ +add.f16x2 r1114, %87, %83; +} +{ +mul.f16x2 r1117, r1114, r1218; +} +{ +add.f16x2 r1120, r1111, r1117; +} +{ +sub.f16x2 r1123, %89, %86; +} +{ +mul.f16x2 r1126, r1123, r1164; +} +{ +sub.f16x2 r1129, %82, %88; +} +{ +mul.f16x2 r1132, r1129, r1236; +} +{ +add.f16x2 r1135, r1126, r1132; +} +{ +add.f16x2 r1138, r1120, r1135; +} +{ +add.f16x2 r1141, %84, %81; +} +{ +mul.f16x2 r1144, r1141, r1146; +} +{ +add.f16x2 r1147, %85, r1144; +} +{ +add.f16x2 r1150, %87, %83; +} +{ +mul.f16x2 r1153, r1150, r1218; +} +{ +add.f16x2 r1156, r1147, r1153; +} +{ +sub.f16x2 r1159, %89, %86; +} +{ +mul.f16x2 r1162, r1159, r1164; +} +{ +sub.f16x2 r1165, %82, %88; +} +{ +mul.f16x2 r1168, r1165, r1236; 
+} +{ +add.f16x2 r1171, r1162, r1168; +} +{ +sub.f16x2 r1174, r1156, r1171; +} +{ +add.f16x2 r1177, %84, %81; +} +{ +mul.f16x2 r1180, r1177, r1218; +} +{ +add.f16x2 r1183, %85, r1180; +} +{ +add.f16x2 r1186, %87, %83; +} +{ +mul.f16x2 r1189, r1186, r1227; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %89, %86; +} +{ +mul.f16x2 r1198, r1195, r1236; +} +{ +sub.f16x2 r1201, %82, %88; +} +{ +mul.f16x2 r1204, r1201, r1242; +} +{ +add.f16x2 r1207, r1198, r1204; +} +{ +add.f16x2 r1210, r1192, r1207; +} +{ +add.f16x2 r1213, %84, %81; +} +{ +mul.f16x2 r1216, r1213, r1218; +} +{ +add.f16x2 r1219, %85, r1216; +} +{ +add.f16x2 r1222, %87, %83; +} +{ +mul.f16x2 r1225, r1222, r1227; +} +{ +add.f16x2 r1228, r1219, r1225; +} +{ +sub.f16x2 r1231, %89, %86; +} +{ +mul.f16x2 r1234, r1231, r1236; +} +{ +sub.f16x2 r1237, %82, %88; +} +{ +mul.f16x2 r1240, r1237, r1242; +} +{ +add.f16x2 r1243, r1234, r1240; +} +{ +sub.f16x2 r1246, r1228, r1243; +} +{ +cvt.rn.f16.f64 rs33, fd107; +} +mov.b32 r1458, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd108; +} +mov.b32 r1476, {rs34, rs34}; +{ +cvt.rn.f16.f64 rs35, fd105; +} +mov.b32 r1530, {rs35, rs35}; +{ +cvt.rn.f16.f64 rs36, fd106; +} +mov.b32 r1548, {rs36, rs36}; +{ +cvt.rn.f16.f64 rs37, fd107; +} +mov.b32 r1539, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd108; +} +{ +neg.f16 rs39, rs38; +} +mov.b32 r1554, {rs39, rs39}; +{ +add.f16x2 r1249, %96, %93; +} +{ +add.f16x2 r1252, %97, r1249; +} +{ +add.f16x2 r1255, %99, %95; +} +{ +add.f16x2 r1258, r1252, r1255; +} +{ +add.f16x2 r1261, %91, %98; +} +{ +add.f16x2 r1264, %92, r1261; +} +{ +add.f16x2 r1267, %94, %90; +} +{ +add.f16x2 r1270, r1264, r1267; +} +{ +add.f16x2 r1273, %96, %93; +} +{ +mul.f16x2 r1276, r1273, r1458; +} +{ +add.f16x2 r1279, %97, r1276; +} +{ +add.f16x2 r1282, %99, %95; +} +{ +mul.f16x2 r1285, r1282, r1530; +} +{ +add.f16x2 r1288, r1279, r1285; +} +{ +sub.f16x2 r1291, %91, %98; +} +{ +mul.f16x2 r1294, r1291, r1476; +} +{ +sub.f16x2 r1297, %94, %90; +} +{ +mul.f16x2 
r1300, r1297, r1548; +} +{ +add.f16x2 r1303, r1294, r1300; +} +{ +sub.f16x2 r1306, r1288, r1303; +} +{ +add.f16x2 r1309, %96, %93; +} +{ +mul.f16x2 r1312, r1309, r1458; +} +{ +add.f16x2 r1315, %97, r1312; +} +{ +add.f16x2 r1318, %99, %95; +} +{ +mul.f16x2 r1321, r1318, r1530; +} +{ +add.f16x2 r1324, r1315, r1321; +} +{ +sub.f16x2 r1327, %91, %98; +} +{ +mul.f16x2 r1330, r1327, r1476; +} +{ +sub.f16x2 r1333, %94, %90; +} +{ +mul.f16x2 r1336, r1333, r1548; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +add.f16x2 r1342, r1324, r1339; +} +{ +add.f16x2 r1345, %96, %93; +} +{ +mul.f16x2 r1348, r1345, r1530; +} +{ +add.f16x2 r1351, %97, r1348; +} +{ +add.f16x2 r1354, %99, %95; +} +{ +mul.f16x2 r1357, r1354, r1539; +} +{ +add.f16x2 r1360, r1351, r1357; +} +{ +sub.f16x2 r1363, %91, %98; +} +{ +mul.f16x2 r1366, r1363, r1548; +} +{ +sub.f16x2 r1369, %94, %90; +} +{ +mul.f16x2 r1372, r1369, r1554; +} +{ +add.f16x2 r1375, r1366, r1372; +} +{ +sub.f16x2 r1378, r1360, r1375; +} +{ +add.f16x2 r1381, %96, %93; +} +{ +mul.f16x2 r1384, r1381, r1530; +} +{ +add.f16x2 r1387, %97, r1384; +} +{ +add.f16x2 r1390, %99, %95; +} +{ +mul.f16x2 r1393, r1390, r1539; +} +{ +add.f16x2 r1396, r1387, r1393; +} +{ +sub.f16x2 r1399, %91, %98; +} +{ +mul.f16x2 r1402, r1399, r1548; +} +{ +sub.f16x2 r1405, %94, %90; +} +{ +mul.f16x2 r1408, r1405, r1554; +} +{ +add.f16x2 r1411, r1402, r1408; +} +{ +add.f16x2 r1414, r1396, r1411; +} +{ +add.f16x2 r1417, %91, %98; +} +{ +mul.f16x2 r1420, r1417, r1458; +} +{ +add.f16x2 r1423, %92, r1420; +} +{ +add.f16x2 r1426, %94, %90; +} +{ +mul.f16x2 r1429, r1426, r1530; +} +{ +add.f16x2 r1432, r1423, r1429; +} +{ +sub.f16x2 r1435, %96, %93; +} +{ +mul.f16x2 r1438, r1435, r1476; +} +{ +sub.f16x2 r1441, %99, %95; +} +{ +mul.f16x2 r1444, r1441, r1548; +} +{ +add.f16x2 r1447, r1438, r1444; +} +{ +add.f16x2 r1450, r1432, r1447; +} +{ +add.f16x2 r1453, %91, %98; +} +{ +mul.f16x2 r1456, r1453, r1458; +} +{ +add.f16x2 r1459, %92, r1456; +} +{ +add.f16x2 r1462, %94, %90; +} +{ 
+mul.f16x2 r1465, r1462, r1530; +} +{ +add.f16x2 r1468, r1459, r1465; +} +{ +sub.f16x2 r1471, %96, %93; +} +{ +mul.f16x2 r1474, r1471, r1476; +} +{ +sub.f16x2 r1477, %99, %95; +} +{ +mul.f16x2 r1480, r1477, r1548; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +sub.f16x2 r1486, r1468, r1483; +} +{ +add.f16x2 r1489, %91, %98; +} +{ +mul.f16x2 r1492, r1489, r1530; +} +{ +add.f16x2 r1495, %92, r1492; +} +{ +add.f16x2 r1498, %94, %90; +} +{ +mul.f16x2 r1501, r1498, r1539; +} +{ +add.f16x2 r1504, r1495, r1501; +} +{ +sub.f16x2 r1507, %96, %93; +} +{ +mul.f16x2 r1510, r1507, r1548; +} +{ +sub.f16x2 r1513, %99, %95; +} +{ +mul.f16x2 r1516, r1513, r1554; +} +{ +add.f16x2 r1519, r1510, r1516; +} +{ +add.f16x2 r1522, r1504, r1519; +} +{ +add.f16x2 r1525, %91, %98; +} +{ +mul.f16x2 r1528, r1525, r1530; +} +{ +add.f16x2 r1531, %92, r1528; +} +{ +add.f16x2 r1534, %94, %90; +} +{ +mul.f16x2 r1537, r1534, r1539; +} +{ +add.f16x2 r1540, r1531, r1537; +} +{ +sub.f16x2 r1543, %96, %93; +} +{ +mul.f16x2 r1546, r1543, r1548; +} +{ +sub.f16x2 r1549, %99, %95; +} +{ +mul.f16x2 r1552, r1549, r1554; +} +{ +add.f16x2 r1555, r1546, r1552; +} +{ +sub.f16x2 r1558, r1540, r1555; +} +mov.f64 fd31, 0d3FEEFEA21D101EE0; +{ +cvt.rn.f16.f64 rs41, fd31; +} +mov.f64 fd32, 0dBFCFD511FA1C0796; +{ +cvt.rn.f16.f64 rs42, fd32; +} +mov.f64 fd33, 0d3FEC0AB44E81C059; +{ +cvt.rn.f16.f64 rs43, fd33; +} +mov.f64 fd34, 0dBFDED50D5CBFA951; +{ +cvt.rn.f16.f64 rs44, fd34; +} +mov.f64 fd35, 0d3FE753B603D2B816; +{ +cvt.rn.f16.f64 rs45, fd35; +} +mov.f64 fd36, 0dBFE5E7CF55112014; +{ +cvt.rn.f16.f64 rs46, fd36; +} +mov.f64 fd37, 0d3FE1257E3C182B51; +{ +cvt.rn.f16.f64 rs47, fd37; +} +mov.f64 fd38, 0dBFEB04BBFF642E86; +{ +cvt.rn.f16.f64 rs48, fd38; +} +mov.f64 fd41, 0d3FB0130A1BE09379; +{ +cvt.rn.f16.f64 rs51, fd41; +} +mov.f64 fd42, 0dBFEFEFD5BFE443FE; +{ +cvt.rn.f16.f64 rs52, fd42; +} +mov.f64 fd45, 0dBFDB3FF7C925819C; +{ +cvt.rn.f16.f64 rs55, fd45; +} +mov.f64 fd46, 0dBFECF457DCDC158C; +{ +cvt.rn.f16.f64 rs56, fd46; +} 
+mov.f64 fd61, 0dBFE465C6FEB501BC; +{ +cvt.rn.f16.f64 rs57, fd61; +} +mov.f64 fd48, 0dBFE8A80B635B6BEA; +{ +cvt.rn.f16.f64 rs58, fd48; +} +mov.f64 fd53, 0dBFEFBF675480D903; +{ +cvt.rn.f16.f64 rs63, fd53; +} +mov.f64 fd54, 0dBFC00AEB5DA15BE0; +{ +cvt.rn.f16.f64 rs64, fd54; +} +{ +cvt.rn.f16.f64 rs71, fd61; +} +mov.f64 fd62, 0d3FE8A80B635B6BEA; +{ +cvt.rn.f16.f64 rs72, fd62; +} +mov.b32 r1575, {rs41, rs41}; +{ +mul.f16x2 r1561, r370, r1575; +} +mov.b32 r1572, {rs42, rs42}; +{ +mul.f16x2 r1564, r514, r1572; +} +{ +sub.f16x2 r1567, r1561, r1564; +} +{ +mul.f16x2 r1570, r370, r1572; +} +{ +fma.rn.f16x2 r1573, r514, r1575, r1570; +} +mov.b32 r1639, {rs43, rs43}; +{ +mul.f16x2 r1577, r682, r1639; +} +mov.b32 r1636, {rs44, rs44}; +{ +mul.f16x2 r1580, r826, r1636; +} +{ +sub.f16x2 r1583, r1577, r1580; +} +{ +mul.f16x2 r1586, r682, r1636; +} +{ +fma.rn.f16x2 r1589, r826, r1639, r1586; +} +mov.b32 r1703, {rs45, rs45}; +{ +mul.f16x2 r1593, r994, r1703; +} +mov.b32 r1700, {rs46, rs46}; +{ +mul.f16x2 r1596, r1138, r1700; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r994, r1700; +} +{ +fma.rn.f16x2 r1605, r1138, r1703, r1602; +} +mov.b32 r1767, {rs47, rs47}; +{ +mul.f16x2 r1609, r1306, r1767; +} +mov.b32 r1764, {rs48, rs48}; +{ +mul.f16x2 r1612, r1450, r1764; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1306, r1764; +} +{ +fma.rn.f16x2 r1621, r1450, r1767, r1618; +} +{ +mul.f16x2 r1625, r442, r1639; +} +{ +mul.f16x2 r1628, r586, r1636; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r442, r1636; +} +{ +fma.rn.f16x2 r1637, r586, r1639, r1634; +} +{ +mul.f16x2 r1641, r754, r1767; +} +{ +mul.f16x2 r1644, r898, r1764; +} +{ +sub.f16x2 r1647, r1641, r1644; +} +{ +mul.f16x2 r1650, r754, r1764; +} +{ +fma.rn.f16x2 r1653, r898, r1767, r1650; +} +mov.b32 r1719, {rs51, rs51}; +{ +mul.f16x2 r1657, r1066, r1719; +} +mov.b32 r1716, {rs52, rs52}; +{ +mul.f16x2 r1660, r1210, r1716; +} +{ +sub.f16x2 r1663, r1657, r1660; +} +{ +mul.f16x2 r1666, 
r1066, r1716; +} +{ +fma.rn.f16x2 r1669, r1210, r1719, r1666; +} +mov.b32 r1783, {rs55, rs55}; +{ +mul.f16x2 r1673, r1378, r1783; +} +mov.b32 r1780, {rs56, rs56}; +{ +mul.f16x2 r1676, r1522, r1780; +} +{ +sub.f16x2 r1679, r1673, r1676; +} +{ +mul.f16x2 r1682, r1378, r1780; +} +{ +fma.rn.f16x2 r1685, r1522, r1783, r1682; +} +{ +mul.f16x2 r1689, r478, r1703; +} +{ +mul.f16x2 r1692, r622, r1700; +} +{ +sub.f16x2 r1695, r1689, r1692; +} +{ +mul.f16x2 r1698, r478, r1700; +} +{ +fma.rn.f16x2 r1701, r622, r1703, r1698; +} +{ +mul.f16x2 r1705, r790, r1719; +} +{ +mul.f16x2 r1708, r934, r1716; +} +{ +sub.f16x2 r1711, r1705, r1708; +} +{ +mul.f16x2 r1714, r790, r1716; +} +{ +fma.rn.f16x2 r1717, r934, r1719, r1714; +} +mov.b32 r1735, {rs57, rs57}; +{ +mul.f16x2 r1721, r1102, r1735; +} +mov.b32 r1732, {rs58, rs58}; +{ +mul.f16x2 r1724, r1246, r1732; +} +{ +sub.f16x2 r1727, r1721, r1724; +} +{ +mul.f16x2 r1730, r1102, r1732; +} +{ +fma.rn.f16x2 r1733, r1246, r1735, r1730; +} +mov.b32 r1799, {rs63, rs63}; +{ +mul.f16x2 r1737, r1414, r1799; +} +mov.b32 r1796, {rs64, rs64}; +{ +mul.f16x2 r1740, r1558, r1796; +} +{ +sub.f16x2 r1743, r1737, r1740; +} +{ +mul.f16x2 r1746, r1414, r1796; +} +{ +fma.rn.f16x2 r1749, r1558, r1799, r1746; +} +{ +mul.f16x2 r1753, r406, r1767; +} +{ +mul.f16x2 r1756, r550, r1764; +} +{ +sub.f16x2 r1759, r1753, r1756; +} +{ +mul.f16x2 r1762, r406, r1764; +} +{ +fma.rn.f16x2 r1765, r550, r1767, r1762; +} +{ +mul.f16x2 r1769, r718, r1783; +} +{ +mul.f16x2 r1772, r862, r1780; +} +{ +sub.f16x2 r1775, r1769, r1772; +} +{ +mul.f16x2 r1778, r718, r1780; +} +{ +fma.rn.f16x2 r1781, r862, r1783, r1778; +} +{ +mul.f16x2 r1785, r1030, r1799; +} +{ +mul.f16x2 r1788, r1174, r1796; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1030, r1796; +} +{ +fma.rn.f16x2 r1797, r1174, r1799, r1794; +} +mov.b32 r1815, {rs71, rs71}; +{ +mul.f16x2 r1801, r1342, r1815; +} +mov.b32 r1812, {rs72, rs72}; +{ +mul.f16x2 r1804, r1486, r1812; +} +{ +sub.f16x2 r1807, r1801, r1804; 
+} +{ +mul.f16x2 r1810, r1342, r1812; +} +{ +fma.rn.f16x2 r1813, r1486, r1815, r1810; +} +{ +cvt.rn.f16.f64 rs89, fd107; +} +mov.b32 r2026, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd108; +} +mov.b32 r2044, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd105; +} +mov.b32 r2098, {rs91, rs91}; +{ +cvt.rn.f16.f64 rs92, fd106; +} +mov.b32 r2116, {rs92, rs92}; +{ +cvt.rn.f16.f64 rs93, fd107; +} +mov.b32 r2107, {rs93, rs93}; +{ +cvt.rn.f16.f64 rs94, fd108; +} +{ +neg.f16 rs95, rs94; +} +mov.b32 r2122, {rs95, rs95}; +{ +add.f16x2 r1817, r322, r1258; +} +{ +add.f16x2 r1820, r10, r1817; +} +{ +add.f16x2 r1823, r634, r946; +} +{ +add.f16x2 %0, r1820, r1823; +} +{ +add.f16x2 r1829, r334, r1270; +} +{ +add.f16x2 r1832, r22, r1829; +} +{ +add.f16x2 r1835, r646, r958; +} +{ +add.f16x2 %1, r1832, r1835; +} +{ +add.f16x2 r1841, r322, r1258; +} +{ +mul.f16x2 r1844, r1841, r2026; +} +{ +add.f16x2 r1847, r10, r1844; +} +{ +add.f16x2 r1850, r634, r946; +} +{ +mul.f16x2 r1853, r1850, r2098; +} +{ +add.f16x2 r1856, r1847, r1853; +} +{ +sub.f16x2 r1859, r334, r1270; +} +{ +mul.f16x2 r1862, r1859, r2044; +} +{ +sub.f16x2 r1865, r646, r958; +} +{ +mul.f16x2 r1868, r1865, r2116; +} +{ +add.f16x2 r1871, r1862, r1868; +} +{ +sub.f16x2 %10, r1856, r1871; +} +{ +add.f16x2 r1877, r322, r1258; +} +{ +mul.f16x2 r1880, r1877, r2026; +} +{ +add.f16x2 r1883, r10, r1880; +} +{ +add.f16x2 r1886, r634, r946; +} +{ +mul.f16x2 r1889, r1886, r2098; +} +{ +add.f16x2 r1892, r1883, r1889; +} +{ +sub.f16x2 r1895, r334, r1270; +} +{ +mul.f16x2 r1898, r1895, r2044; +} +{ +sub.f16x2 r1901, r646, r958; +} +{ +mul.f16x2 r1904, r1901, r2116; +} +{ +add.f16x2 r1907, r1898, r1904; +} +{ +add.f16x2 %40, r1892, r1907; +} +{ +add.f16x2 r1913, r322, r1258; +} +{ +mul.f16x2 r1916, r1913, r2098; +} +{ +add.f16x2 r1919, r10, r1916; +} +{ +add.f16x2 r1922, r634, r946; +} +{ +mul.f16x2 r1925, r1922, r2107; +} +{ +add.f16x2 r1928, r1919, r1925; +} +{ +sub.f16x2 r1931, r334, r1270; +} +{ +mul.f16x2 r1934, r1931, r2116; +} +{ +sub.f16x2 
r1937, r646, r958; +} +{ +mul.f16x2 r1940, r1937, r2122; +} +{ +add.f16x2 r1943, r1934, r1940; +} +{ +sub.f16x2 %20, r1928, r1943; +} +{ +add.f16x2 r1949, r322, r1258; +} +{ +mul.f16x2 r1952, r1949, r2098; +} +{ +add.f16x2 r1955, r10, r1952; +} +{ +add.f16x2 r1958, r634, r946; +} +{ +mul.f16x2 r1961, r1958, r2107; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r334, r1270; +} +{ +mul.f16x2 r1970, r1967, r2116; +} +{ +sub.f16x2 r1973, r646, r958; +} +{ +mul.f16x2 r1976, r1973, r2122; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +add.f16x2 %30, r1964, r1979; +} +{ +add.f16x2 r1985, r334, r1270; +} +{ +mul.f16x2 r1988, r1985, r2026; +} +{ +add.f16x2 r1991, r22, r1988; +} +{ +add.f16x2 r1994, r646, r958; +} +{ +mul.f16x2 r1997, r1994, r2098; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r322, r1258; +} +{ +mul.f16x2 r2006, r2003, r2044; +} +{ +sub.f16x2 r2009, r634, r946; +} +{ +mul.f16x2 r2012, r2009, r2116; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 %11, r2000, r2015; +} +{ +add.f16x2 r2021, r334, r1270; +} +{ +mul.f16x2 r2024, r2021, r2026; +} +{ +add.f16x2 r2027, r22, r2024; +} +{ +add.f16x2 r2030, r646, r958; +} +{ +mul.f16x2 r2033, r2030, r2098; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r322, r1258; +} +{ +mul.f16x2 r2042, r2039, r2044; +} +{ +sub.f16x2 r2045, r634, r946; +} +{ +mul.f16x2 r2048, r2045, r2116; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 %41, r2036, r2051; +} +{ +add.f16x2 r2057, r334, r1270; +} +{ +mul.f16x2 r2060, r2057, r2098; +} +{ +add.f16x2 r2063, r22, r2060; +} +{ +add.f16x2 r2066, r646, r958; +} +{ +mul.f16x2 r2069, r2066, r2107; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r322, r1258; +} +{ +mul.f16x2 r2078, r2075, r2116; +} +{ +sub.f16x2 r2081, r634, r946; +} +{ +mul.f16x2 r2084, r2081, r2122; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 %21, r2072, r2087; +} +{ +add.f16x2 r2093, r334, r1270; +} +{ +mul.f16x2 r2096, r2093, r2098; +} +{ 
+add.f16x2 r2099, r22, r2096; +} +{ +add.f16x2 r2102, r646, r958; +} +{ +mul.f16x2 r2105, r2102, r2107; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r322, r1258; +} +{ +mul.f16x2 r2114, r2111, r2116; +} +{ +sub.f16x2 r2117, r634, r946; +} +{ +mul.f16x2 r2120, r2117, r2122; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +sub.f16x2 %31, r2108, r2123; +} +{ +cvt.rn.f16.f64 rs97, fd107; +} +mov.b32 r2338, {rs97, rs97}; +{ +cvt.rn.f16.f64 rs98, fd108; +} +mov.b32 r2356, {rs98, rs98}; +{ +cvt.rn.f16.f64 rs99, fd105; +} +mov.b32 r2410, {rs99, rs99}; +{ +cvt.rn.f16.f64 rs100, fd106; +} +mov.b32 r2428, {rs100, rs100}; +{ +cvt.rn.f16.f64 rs101, fd107; +} +mov.b32 r2419, {rs101, rs101}; +{ +cvt.rn.f16.f64 rs102, fd108; +} +{ +neg.f16 rs103, rs102; +} +mov.b32 r2434, {rs103, rs103}; +{ +add.f16x2 r2129, r1567, r1615; +} +{ +add.f16x2 r2132, r58, r2129; +} +{ +add.f16x2 r2135, r1583, r1599; +} +{ +add.f16x2 %2, r2132, r2135; +} +{ +add.f16x2 r2141, r1573, r1621; +} +{ +add.f16x2 r2144, r202, r2141; +} +{ +add.f16x2 r2147, r1589, r1605; +} +{ +add.f16x2 %3, r2144, r2147; +} +{ +add.f16x2 r2153, r1567, r1615; +} +{ +mul.f16x2 r2156, r2153, r2338; +} +{ +add.f16x2 r2159, r58, r2156; +} +{ +add.f16x2 r2162, r1583, r1599; +} +{ +mul.f16x2 r2165, r2162, r2410; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r1573, r1621; +} +{ +mul.f16x2 r2174, r2171, r2356; +} +{ +sub.f16x2 r2177, r1589, r1605; +} +{ +mul.f16x2 r2180, r2177, r2428; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +sub.f16x2 %12, r2168, r2183; +} +{ +add.f16x2 r2189, r1567, r1615; +} +{ +mul.f16x2 r2192, r2189, r2338; +} +{ +add.f16x2 r2195, r58, r2192; +} +{ +add.f16x2 r2198, r1583, r1599; +} +{ +mul.f16x2 r2201, r2198, r2410; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r1573, r1621; +} +{ +mul.f16x2 r2210, r2207, r2356; +} +{ +sub.f16x2 r2213, r1589, r1605; +} +{ +mul.f16x2 r2216, r2213, r2428; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +add.f16x2 %42, r2204, r2219; +} +{ 
+add.f16x2 r2225, r1567, r1615; +} +{ +mul.f16x2 r2228, r2225, r2410; +} +{ +add.f16x2 r2231, r58, r2228; +} +{ +add.f16x2 r2234, r1583, r1599; +} +{ +mul.f16x2 r2237, r2234, r2419; +} +{ +add.f16x2 r2240, r2231, r2237; +} +{ +sub.f16x2 r2243, r1573, r1621; +} +{ +mul.f16x2 r2246, r2243, r2428; +} +{ +sub.f16x2 r2249, r1589, r1605; +} +{ +mul.f16x2 r2252, r2249, r2434; +} +{ +add.f16x2 r2255, r2246, r2252; +} +{ +sub.f16x2 %22, r2240, r2255; +} +{ +add.f16x2 r2261, r1567, r1615; +} +{ +mul.f16x2 r2264, r2261, r2410; +} +{ +add.f16x2 r2267, r58, r2264; +} +{ +add.f16x2 r2270, r1583, r1599; +} +{ +mul.f16x2 r2273, r2270, r2419; +} +{ +add.f16x2 r2276, r2267, r2273; +} +{ +sub.f16x2 r2279, r1573, r1621; +} +{ +mul.f16x2 r2282, r2279, r2428; +} +{ +sub.f16x2 r2285, r1589, r1605; +} +{ +mul.f16x2 r2288, r2285, r2434; +} +{ +add.f16x2 r2291, r2282, r2288; +} +{ +add.f16x2 %32, r2276, r2291; +} +{ +add.f16x2 r2297, r1573, r1621; +} +{ +mul.f16x2 r2300, r2297, r2338; +} +{ +add.f16x2 r2303, r202, r2300; +} +{ +add.f16x2 r2306, r1589, r1605; +} +{ +mul.f16x2 r2309, r2306, r2410; +} +{ +add.f16x2 r2312, r2303, r2309; +} +{ +sub.f16x2 r2315, r1567, r1615; +} +{ +mul.f16x2 r2318, r2315, r2356; +} +{ +sub.f16x2 r2321, r1583, r1599; +} +{ +mul.f16x2 r2324, r2321, r2428; +} +{ +add.f16x2 r2327, r2318, r2324; +} +{ +add.f16x2 %13, r2312, r2327; +} +{ +add.f16x2 r2333, r1573, r1621; +} +{ +mul.f16x2 r2336, r2333, r2338; +} +{ +add.f16x2 r2339, r202, r2336; +} +{ +add.f16x2 r2342, r1589, r1605; +} +{ +mul.f16x2 r2345, r2342, r2410; +} +{ +add.f16x2 r2348, r2339, r2345; +} +{ +sub.f16x2 r2351, r1567, r1615; +} +{ +mul.f16x2 r2354, r2351, r2356; +} +{ +sub.f16x2 r2357, r1583, r1599; +} +{ +mul.f16x2 r2360, r2357, r2428; +} +{ +add.f16x2 r2363, r2354, r2360; +} +{ +sub.f16x2 %43, r2348, r2363; +} +{ +add.f16x2 r2369, r1573, r1621; +} +{ +mul.f16x2 r2372, r2369, r2410; +} +{ +add.f16x2 r2375, r202, r2372; +} +{ +add.f16x2 r2378, r1589, r1605; +} +{ +mul.f16x2 r2381, r2378, r2419; +} +{ 
+add.f16x2 r2384, r2375, r2381; +} +{ +sub.f16x2 r2387, r1567, r1615; +} +{ +mul.f16x2 r2390, r2387, r2428; +} +{ +sub.f16x2 r2393, r1583, r1599; +} +{ +mul.f16x2 r2396, r2393, r2434; +} +{ +add.f16x2 r2399, r2390, r2396; +} +{ +add.f16x2 %23, r2384, r2399; +} +{ +add.f16x2 r2405, r1573, r1621; +} +{ +mul.f16x2 r2408, r2405, r2410; +} +{ +add.f16x2 r2411, r202, r2408; +} +{ +add.f16x2 r2414, r1589, r1605; +} +{ +mul.f16x2 r2417, r2414, r2419; +} +{ +add.f16x2 r2420, r2411, r2417; +} +{ +sub.f16x2 r2423, r1567, r1615; +} +{ +mul.f16x2 r2426, r2423, r2428; +} +{ +sub.f16x2 r2429, r1583, r1599; +} +{ +mul.f16x2 r2432, r2429, r2434; +} +{ +add.f16x2 r2435, r2426, r2432; +} +{ +sub.f16x2 %33, r2420, r2435; +} +{ +cvt.rn.f16.f64 rs105, fd107; +} +mov.b32 r2650, {rs105, rs105}; +{ +cvt.rn.f16.f64 rs106, fd108; +} +mov.b32 r2668, {rs106, rs106}; +{ +cvt.rn.f16.f64 rs107, fd105; +} +mov.b32 r2722, {rs107, rs107}; +{ +cvt.rn.f16.f64 rs108, fd106; +} +mov.b32 r2740, {rs108, rs108}; +{ +cvt.rn.f16.f64 rs109, fd107; +} +mov.b32 r2731, {rs109, rs109}; +{ +cvt.rn.f16.f64 rs110, fd108; +} +{ +neg.f16 rs111, rs110; +} +mov.b32 r2746, {rs111, rs111}; +{ +add.f16x2 r2441, r1631, r1679; +} +{ +add.f16x2 r2444, r130, r2441; +} +{ +add.f16x2 r2447, r1647, r1663; +} +{ +add.f16x2 %4, r2444, r2447; +} +{ +add.f16x2 r2453, r1637, r1685; +} +{ +add.f16x2 r2456, r274, r2453; +} +{ +add.f16x2 r2459, r1653, r1669; +} +{ +add.f16x2 %5, r2456, r2459; +} +{ +add.f16x2 r2465, r1631, r1679; +} +{ +mul.f16x2 r2468, r2465, r2650; +} +{ +add.f16x2 r2471, r130, r2468; +} +{ +add.f16x2 r2474, r1647, r1663; +} +{ +mul.f16x2 r2477, r2474, r2722; +} +{ +add.f16x2 r2480, r2471, r2477; +} +{ +sub.f16x2 r2483, r1637, r1685; +} +{ +mul.f16x2 r2486, r2483, r2668; +} +{ +sub.f16x2 r2489, r1653, r1669; +} +{ +mul.f16x2 r2492, r2489, r2740; +} +{ +add.f16x2 r2495, r2486, r2492; +} +{ +sub.f16x2 %14, r2480, r2495; +} +{ +add.f16x2 r2501, r1631, r1679; +} +{ +mul.f16x2 r2504, r2501, r2650; +} +{ +add.f16x2 r2507, 
r130, r2504; +} +{ +add.f16x2 r2510, r1647, r1663; +} +{ +mul.f16x2 r2513, r2510, r2722; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +sub.f16x2 r2519, r1637, r1685; +} +{ +mul.f16x2 r2522, r2519, r2668; +} +{ +sub.f16x2 r2525, r1653, r1669; +} +{ +mul.f16x2 r2528, r2525, r2740; +} +{ +add.f16x2 r2531, r2522, r2528; +} +{ +add.f16x2 %44, r2516, r2531; +} +{ +add.f16x2 r2537, r1631, r1679; +} +{ +mul.f16x2 r2540, r2537, r2722; +} +{ +add.f16x2 r2543, r130, r2540; +} +{ +add.f16x2 r2546, r1647, r1663; +} +{ +mul.f16x2 r2549, r2546, r2731; +} +{ +add.f16x2 r2552, r2543, r2549; +} +{ +sub.f16x2 r2555, r1637, r1685; +} +{ +mul.f16x2 r2558, r2555, r2740; +} +{ +sub.f16x2 r2561, r1653, r1669; +} +{ +mul.f16x2 r2564, r2561, r2746; +} +{ +add.f16x2 r2567, r2558, r2564; +} +{ +sub.f16x2 %24, r2552, r2567; +} +{ +add.f16x2 r2573, r1631, r1679; +} +{ +mul.f16x2 r2576, r2573, r2722; +} +{ +add.f16x2 r2579, r130, r2576; +} +{ +add.f16x2 r2582, r1647, r1663; +} +{ +mul.f16x2 r2585, r2582, r2731; +} +{ +add.f16x2 r2588, r2579, r2585; +} +{ +sub.f16x2 r2591, r1637, r1685; +} +{ +mul.f16x2 r2594, r2591, r2740; +} +{ +sub.f16x2 r2597, r1653, r1669; +} +{ +mul.f16x2 r2600, r2597, r2746; +} +{ +add.f16x2 r2603, r2594, r2600; +} +{ +add.f16x2 %34, r2588, r2603; +} +{ +add.f16x2 r2609, r1637, r1685; +} +{ +mul.f16x2 r2612, r2609, r2650; +} +{ +add.f16x2 r2615, r274, r2612; +} +{ +add.f16x2 r2618, r1653, r1669; +} +{ +mul.f16x2 r2621, r2618, r2722; +} +{ +add.f16x2 r2624, r2615, r2621; +} +{ +sub.f16x2 r2627, r1631, r1679; +} +{ +mul.f16x2 r2630, r2627, r2668; +} +{ +sub.f16x2 r2633, r1647, r1663; +} +{ +mul.f16x2 r2636, r2633, r2740; +} +{ +add.f16x2 r2639, r2630, r2636; +} +{ +add.f16x2 %15, r2624, r2639; +} +{ +add.f16x2 r2645, r1637, r1685; +} +{ +mul.f16x2 r2648, r2645, r2650; +} +{ +add.f16x2 r2651, r274, r2648; +} +{ +add.f16x2 r2654, r1653, r1669; +} +{ +mul.f16x2 r2657, r2654, r2722; +} +{ +add.f16x2 r2660, r2651, r2657; +} +{ +sub.f16x2 r2663, r1631, r1679; +} +{ +mul.f16x2 
r2666, r2663, r2668; +} +{ +sub.f16x2 r2669, r1647, r1663; +} +{ +mul.f16x2 r2672, r2669, r2740; +} +{ +add.f16x2 r2675, r2666, r2672; +} +{ +sub.f16x2 %45, r2660, r2675; +} +{ +add.f16x2 r2681, r1637, r1685; +} +{ +mul.f16x2 r2684, r2681, r2722; +} +{ +add.f16x2 r2687, r274, r2684; +} +{ +add.f16x2 r2690, r1653, r1669; +} +{ +mul.f16x2 r2693, r2690, r2731; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +sub.f16x2 r2699, r1631, r1679; +} +{ +mul.f16x2 r2702, r2699, r2740; +} +{ +sub.f16x2 r2705, r1647, r1663; +} +{ +mul.f16x2 r2708, r2705, r2746; +} +{ +add.f16x2 r2711, r2702, r2708; +} +{ +add.f16x2 %25, r2696, r2711; +} +{ +add.f16x2 r2717, r1637, r1685; +} +{ +mul.f16x2 r2720, r2717, r2722; +} +{ +add.f16x2 r2723, r274, r2720; +} +{ +add.f16x2 r2726, r1653, r1669; +} +{ +mul.f16x2 r2729, r2726, r2731; +} +{ +add.f16x2 r2732, r2723, r2729; +} +{ +sub.f16x2 r2735, r1631, r1679; +} +{ +mul.f16x2 r2738, r2735, r2740; +} +{ +sub.f16x2 r2741, r1647, r1663; +} +{ +mul.f16x2 r2744, r2741, r2746; +} +{ +add.f16x2 r2747, r2738, r2744; +} +{ +sub.f16x2 %35, r2732, r2747; +} +{ +cvt.rn.f16.f64 rs113, fd107; +} +mov.b32 r2962, {rs113, rs113}; +{ +cvt.rn.f16.f64 rs114, fd108; +} +mov.b32 r2980, {rs114, rs114}; +{ +cvt.rn.f16.f64 rs115, fd105; +} +mov.b32 r3034, {rs115, rs115}; +{ +cvt.rn.f16.f64 rs116, fd106; +} +mov.b32 r3052, {rs116, rs116}; +{ +cvt.rn.f16.f64 rs117, fd107; +} +mov.b32 r3043, {rs117, rs117}; +{ +cvt.rn.f16.f64 rs118, fd108; +} +{ +neg.f16 rs119, rs118; +} +mov.b32 r3058, {rs119, rs119}; +{ +add.f16x2 r2753, r1695, r1743; +} +{ +add.f16x2 r2756, r166, r2753; +} +{ +add.f16x2 r2759, r1711, r1727; +} +{ +add.f16x2 %6, r2756, r2759; +} +{ +add.f16x2 r2765, r1701, r1749; +} +{ +add.f16x2 r2768, r310, r2765; +} +{ +add.f16x2 r2771, r1717, r1733; +} +{ +add.f16x2 %7, r2768, r2771; +} +{ +add.f16x2 r2777, r1695, r1743; +} +{ +mul.f16x2 r2780, r2777, r2962; +} +{ +add.f16x2 r2783, r166, r2780; +} +{ +add.f16x2 r2786, r1711, r1727; +} +{ +mul.f16x2 r2789, r2786, r3034; 
+} +{ +add.f16x2 r2792, r2783, r2789; +} +{ +sub.f16x2 r2795, r1701, r1749; +} +{ +mul.f16x2 r2798, r2795, r2980; +} +{ +sub.f16x2 r2801, r1717, r1733; +} +{ +mul.f16x2 r2804, r2801, r3052; +} +{ +add.f16x2 r2807, r2798, r2804; +} +{ +sub.f16x2 %16, r2792, r2807; +} +{ +add.f16x2 r2813, r1695, r1743; +} +{ +mul.f16x2 r2816, r2813, r2962; +} +{ +add.f16x2 r2819, r166, r2816; +} +{ +add.f16x2 r2822, r1711, r1727; +} +{ +mul.f16x2 r2825, r2822, r3034; +} +{ +add.f16x2 r2828, r2819, r2825; +} +{ +sub.f16x2 r2831, r1701, r1749; +} +{ +mul.f16x2 r2834, r2831, r2980; +} +{ +sub.f16x2 r2837, r1717, r1733; +} +{ +mul.f16x2 r2840, r2837, r3052; +} +{ +add.f16x2 r2843, r2834, r2840; +} +{ +add.f16x2 %46, r2828, r2843; +} +{ +add.f16x2 r2849, r1695, r1743; +} +{ +mul.f16x2 r2852, r2849, r3034; +} +{ +add.f16x2 r2855, r166, r2852; +} +{ +add.f16x2 r2858, r1711, r1727; +} +{ +mul.f16x2 r2861, r2858, r3043; +} +{ +add.f16x2 r2864, r2855, r2861; +} +{ +sub.f16x2 r2867, r1701, r1749; +} +{ +mul.f16x2 r2870, r2867, r3052; +} +{ +sub.f16x2 r2873, r1717, r1733; +} +{ +mul.f16x2 r2876, r2873, r3058; +} +{ +add.f16x2 r2879, r2870, r2876; +} +{ +sub.f16x2 %26, r2864, r2879; +} +{ +add.f16x2 r2885, r1695, r1743; +} +{ +mul.f16x2 r2888, r2885, r3034; +} +{ +add.f16x2 r2891, r166, r2888; +} +{ +add.f16x2 r2894, r1711, r1727; +} +{ +mul.f16x2 r2897, r2894, r3043; +} +{ +add.f16x2 r2900, r2891, r2897; +} +{ +sub.f16x2 r2903, r1701, r1749; +} +{ +mul.f16x2 r2906, r2903, r3052; +} +{ +sub.f16x2 r2909, r1717, r1733; +} +{ +mul.f16x2 r2912, r2909, r3058; +} +{ +add.f16x2 r2915, r2906, r2912; +} +{ +add.f16x2 %36, r2900, r2915; +} +{ +add.f16x2 r2921, r1701, r1749; +} +{ +mul.f16x2 r2924, r2921, r2962; +} +{ +add.f16x2 r2927, r310, r2924; +} +{ +add.f16x2 r2930, r1717, r1733; +} +{ +mul.f16x2 r2933, r2930, r3034; +} +{ +add.f16x2 r2936, r2927, r2933; +} +{ +sub.f16x2 r2939, r1695, r1743; +} +{ +mul.f16x2 r2942, r2939, r2980; +} +{ +sub.f16x2 r2945, r1711, r1727; +} +{ +mul.f16x2 r2948, r2945, 
r3052; +} +{ +add.f16x2 r2951, r2942, r2948; +} +{ +add.f16x2 %17, r2936, r2951; +} +{ +add.f16x2 r2957, r1701, r1749; +} +{ +mul.f16x2 r2960, r2957, r2962; +} +{ +add.f16x2 r2963, r310, r2960; +} +{ +add.f16x2 r2966, r1717, r1733; +} +{ +mul.f16x2 r2969, r2966, r3034; +} +{ +add.f16x2 r2972, r2963, r2969; +} +{ +sub.f16x2 r2975, r1695, r1743; +} +{ +mul.f16x2 r2978, r2975, r2980; +} +{ +sub.f16x2 r2981, r1711, r1727; +} +{ +mul.f16x2 r2984, r2981, r3052; +} +{ +add.f16x2 r2987, r2978, r2984; +} +{ +sub.f16x2 %47, r2972, r2987; +} +{ +add.f16x2 r2993, r1701, r1749; +} +{ +mul.f16x2 r2996, r2993, r3034; +} +{ +add.f16x2 r2999, r310, r2996; +} +{ +add.f16x2 r3002, r1717, r1733; +} +{ +mul.f16x2 r3005, r3002, r3043; +} +{ +add.f16x2 r3008, r2999, r3005; +} +{ +sub.f16x2 r3011, r1695, r1743; +} +{ +mul.f16x2 r3014, r3011, r3052; +} +{ +sub.f16x2 r3017, r1711, r1727; +} +{ +mul.f16x2 r3020, r3017, r3058; +} +{ +add.f16x2 r3023, r3014, r3020; +} +{ +add.f16x2 %27, r3008, r3023; +} +{ +add.f16x2 r3029, r1701, r1749; +} +{ +mul.f16x2 r3032, r3029, r3034; +} +{ +add.f16x2 r3035, r310, r3032; +} +{ +add.f16x2 r3038, r1717, r1733; +} +{ +mul.f16x2 r3041, r3038, r3043; +} +{ +add.f16x2 r3044, r3035, r3041; +} +{ +sub.f16x2 r3047, r1695, r1743; +} +{ +mul.f16x2 r3050, r3047, r3052; +} +{ +sub.f16x2 r3053, r1711, r1727; +} +{ +mul.f16x2 r3056, r3053, r3058; +} +{ +add.f16x2 r3059, r3050, r3056; +} +{ +sub.f16x2 %37, r3044, r3059; +} +{ +cvt.rn.f16.f64 rs121, fd107; +} +mov.b32 r3274, {rs121, rs121}; +{ +cvt.rn.f16.f64 rs122, fd108; +} +mov.b32 r3292, {rs122, rs122}; +{ +cvt.rn.f16.f64 rs123, fd105; +} +mov.b32 r3346, {rs123, rs123}; +{ +cvt.rn.f16.f64 rs124, fd106; +} +mov.b32 r3364, {rs124, rs124}; +{ +cvt.rn.f16.f64 rs125, fd107; +} +mov.b32 r3355, {rs125, rs125}; +{ +cvt.rn.f16.f64 rs126, fd108; +} +{ +neg.f16 rs127, rs126; +} +mov.b32 r3370, {rs127, rs127}; +{ +add.f16x2 r3065, r1759, r1807; +} +{ +add.f16x2 r3068, r94, r3065; +} +{ +add.f16x2 r3071, r1775, r1791; +} +{ 
+add.f16x2 %8, r3068, r3071; +} +{ +add.f16x2 r3077, r1765, r1813; +} +{ +add.f16x2 r3080, r238, r3077; +} +{ +add.f16x2 r3083, r1781, r1797; +} +{ +add.f16x2 %9, r3080, r3083; +} +{ +add.f16x2 r3089, r1759, r1807; +} +{ +mul.f16x2 r3092, r3089, r3274; +} +{ +add.f16x2 r3095, r94, r3092; +} +{ +add.f16x2 r3098, r1775, r1791; +} +{ +mul.f16x2 r3101, r3098, r3346; +} +{ +add.f16x2 r3104, r3095, r3101; +} +{ +sub.f16x2 r3107, r1765, r1813; +} +{ +mul.f16x2 r3110, r3107, r3292; +} +{ +sub.f16x2 r3113, r1781, r1797; +} +{ +mul.f16x2 r3116, r3113, r3364; +} +{ +add.f16x2 r3119, r3110, r3116; +} +{ +sub.f16x2 %18, r3104, r3119; +} +{ +add.f16x2 r3125, r1759, r1807; +} +{ +mul.f16x2 r3128, r3125, r3274; +} +{ +add.f16x2 r3131, r94, r3128; +} +{ +add.f16x2 r3134, r1775, r1791; +} +{ +mul.f16x2 r3137, r3134, r3346; +} +{ +add.f16x2 r3140, r3131, r3137; +} +{ +sub.f16x2 r3143, r1765, r1813; +} +{ +mul.f16x2 r3146, r3143, r3292; +} +{ +sub.f16x2 r3149, r1781, r1797; +} +{ +mul.f16x2 r3152, r3149, r3364; +} +{ +add.f16x2 r3155, r3146, r3152; +} +{ +add.f16x2 %48, r3140, r3155; +} +{ +add.f16x2 r3161, r1759, r1807; +} +{ +mul.f16x2 r3164, r3161, r3346; +} +{ +add.f16x2 r3167, r94, r3164; +} +{ +add.f16x2 r3170, r1775, r1791; +} +{ +mul.f16x2 r3173, r3170, r3355; +} +{ +add.f16x2 r3176, r3167, r3173; +} +{ +sub.f16x2 r3179, r1765, r1813; +} +{ +mul.f16x2 r3182, r3179, r3364; +} +{ +sub.f16x2 r3185, r1781, r1797; +} +{ +mul.f16x2 r3188, r3185, r3370; +} +{ +add.f16x2 r3191, r3182, r3188; +} +{ +sub.f16x2 %28, r3176, r3191; +} +{ +add.f16x2 r3197, r1759, r1807; +} +{ +mul.f16x2 r3200, r3197, r3346; +} +{ +add.f16x2 r3203, r94, r3200; +} +{ +add.f16x2 r3206, r1775, r1791; +} +{ +mul.f16x2 r3209, r3206, r3355; +} +{ +add.f16x2 r3212, r3203, r3209; +} +{ +sub.f16x2 r3215, r1765, r1813; +} +{ +mul.f16x2 r3218, r3215, r3364; +} +{ +sub.f16x2 r3221, r1781, r1797; +} +{ +mul.f16x2 r3224, r3221, r3370; +} +{ +add.f16x2 r3227, r3218, r3224; +} +{ +add.f16x2 %38, r3212, r3227; +} +{ 
+add.f16x2 r3233, r1765, r1813; +} +{ +mul.f16x2 r3236, r3233, r3274; +} +{ +add.f16x2 r3239, r238, r3236; +} +{ +add.f16x2 r3242, r1781, r1797; +} +{ +mul.f16x2 r3245, r3242, r3346; +} +{ +add.f16x2 r3248, r3239, r3245; +} +{ +sub.f16x2 r3251, r1759, r1807; +} +{ +mul.f16x2 r3254, r3251, r3292; +} +{ +sub.f16x2 r3257, r1775, r1791; +} +{ +mul.f16x2 r3260, r3257, r3364; +} +{ +add.f16x2 r3263, r3254, r3260; +} +{ +add.f16x2 %19, r3248, r3263; +} +{ +add.f16x2 r3269, r1765, r1813; +} +{ +mul.f16x2 r3272, r3269, r3274; +} +{ +add.f16x2 r3275, r238, r3272; +} +{ +add.f16x2 r3278, r1781, r1797; +} +{ +mul.f16x2 r3281, r3278, r3346; +} +{ +add.f16x2 r3284, r3275, r3281; +} +{ +sub.f16x2 r3287, r1759, r1807; +} +{ +mul.f16x2 r3290, r3287, r3292; +} +{ +sub.f16x2 r3293, r1775, r1791; +} +{ +mul.f16x2 r3296, r3293, r3364; +} +{ +add.f16x2 r3299, r3290, r3296; +} +{ +sub.f16x2 %49, r3284, r3299; +} +{ +add.f16x2 r3305, r1765, r1813; +} +{ +mul.f16x2 r3308, r3305, r3346; +} +{ +add.f16x2 r3311, r238, r3308; +} +{ +add.f16x2 r3314, r1781, r1797; +} +{ +mul.f16x2 r3317, r3314, r3355; +} +{ +add.f16x2 r3320, r3311, r3317; +} +{ +sub.f16x2 r3323, r1759, r1807; +} +{ +mul.f16x2 r3326, r3323, r3364; +} +{ +sub.f16x2 r3329, r1775, r1791; +} +{ +mul.f16x2 r3332, r3329, r3370; +} +{ +add.f16x2 r3335, r3326, r3332; +} +{ +add.f16x2 %29, r3320, r3335; +} +{ +add.f16x2 r3341, r1765, r1813; +} +{ +mul.f16x2 r3344, r3341, r3346; +} +{ +add.f16x2 r3347, r238, r3344; +} +{ +add.f16x2 r3350, r1781, r1797; +} +{ +mul.f16x2 r3353, r3350, r3355; +} +{ +add.f16x2 r3356, r3347, r3353; +} +{ +sub.f16x2 r3359, r1759, r1807; +} +{ +mul.f16x2 r3362, r3359, r3364; +} +{ +sub.f16x2 r3365, r1775, r1791; +} +{ +mul.f16x2 r3368, r3365, r3370; +} +{ +add.f16x2 r3371, r3362, r3368; +} +{ +sub.f16x2 %39, r3356, r3371; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[1].x)), 
"r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<902, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<803>; +.reg .b64 rd<4>; +mov.u32 r792, %tid.x; +mov.f32 f34, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r1, {low, high}; +} +mov.f32 f36, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r2, {low, high}; +} +mov.f32 f30, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r3, {low, high}; +} +mov.f32 f32, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ 
+add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r792, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r793, rd3; +mul.lo.s32 r794, r793, 5; +sub.s32 r795, r792, r794; +cvt.rn.f32.u32 f37, r795; +mul.f32 f38, f37, 0f3E80ADFD; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +mov.u32 r796, %tid.y; +mov.u32 r797, %10; +mad.lo.s32 r798, r796, 200, r797; +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 
r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +mad.lo.s32 r799, r793, 200, r798; +barrier.sync 0; +mad.lo.s32 r800, r795, 40, r799; +st.shared.v2.f32 [r800], {r18, r30}; +st.shared.v2.f32 [r800+8], {r333, r340}; +st.shared.v2.f32 [r800+16], {r370, r377}; +st.shared.v2.f32 [r800+24], {r407, r414}; +st.shared.v2.f32 [r800+32], {r444, r451}; +barrier.sync 0; +shl.b32 r801, r795, 5; +sub.s32 r802, r800, r801; +ld.shared.u32 r484, [r802]; +ld.shared.u32 r496, [r802+4]; +ld.shared.u32 r481, [r802+40]; +ld.shared.u32 r493, [r802+44]; +ld.shared.u32 r487, [r802+80]; +ld.shared.u32 r499, [r802+84]; +ld.shared.u32 r488, [r802+120]; +ld.shared.u32 r500, [r802+124]; +ld.shared.u32 r482, [r802+160]; +ld.shared.u32 r494, [r802+164]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r475, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 %0, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 %1, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 %2, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 %8, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 %4, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; 
+} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 %6, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 %3, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 %9, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 %5, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 %7, r771, r786; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<903, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<803>; +.reg .b64 rd<4>; +mov.u32 r792, %tid.y; +mov.u32 r793, %10; +mad.lo.s32 r794, r792, 100, r793; +mov.u32 r795, %tid.x; +mov.f32 f34, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r1, {low, high}; +} +mov.f32 f36, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r2, {low, high}; +} +mov.f32 f30, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r3, {low, high}; +} +mov.f32 f32, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} 
+{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, 
%18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r795, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r796, rd3; +mul.lo.s32 r797, r796, 5; +sub.s32 r798, r795, r797; +mad.lo.s32 r799, r796, 100, r794; +cvt.rn.f32.u32 f37, r798; +mul.f32 f38, f37, 0f3E80ADFD; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, 
{high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ 
+fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +barrier.sync 0; +mad.lo.s32 r800, r798, 20, r799; +st.shared.u32 [r800], r18; +st.shared.u32 [r800+4], r333; +st.shared.u32 [r800+8], r370; +st.shared.u32 [r800+12], r407; +st.shared.u32 [r800+16], r444; +barrier.sync 0; +shl.b32 r801, r798, 4; +sub.s32 r802, r800, r801; +ld.shared.u32 r484, [r802]; +ld.shared.u32 r481, [r802+20]; +ld.shared.u32 r487, [r802+40]; +ld.shared.u32 r488, [r802+60]; +ld.shared.u32 r482, [r802+80]; +barrier.sync 0; +st.shared.u32 [r800], r30; +st.shared.u32 [r800+4], r340; +st.shared.u32 [r800+8], r377; +st.shared.u32 [r800+12], r414; +st.shared.u32 [r800+16], r451; +barrier.sync 0; +ld.shared.u32 r496, [r802]; +ld.shared.u32 r493, [r802+20]; +ld.shared.u32 r499, [r802+40]; +ld.shared.u32 r500, [r802+60]; +ld.shared.u32 r494, [r802+80]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 %0, r483, r486; +} +{ 
+add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 %1, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 %2, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 %8, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 %4, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 %6, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 
r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 %3, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 %9, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 %5, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 %7, r771, r786; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..b140a8f7c35d2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp16_inv.hpp.inc @@ -0,0 +1,5694 @@ +#ifndef CUFFTDX_FFT_25_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_25_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1103, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<149>; +.reg .b32 r<3427>; +.reg .f64 fd<109>; +.reg .b64 rd<3>; +mov.f64 fd107, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd107; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd108, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd108; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd105, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd105; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd106, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd106; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd107; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd108; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %51, %58; +} +{ +add.f16x2 r4, %52, r1; +} +{ +add.f16x2 r7, %54, %50; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %56, %53; +} +{ +add.f16x2 r16, %57, r13; +} +{ +add.f16x2 r19, %59, %55; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %51, %58; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %52, r28; +} +{ +add.f16x2 r34, %54, %50; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ 
+sub.f16x2 r43, %56, %53; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %59, %55; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %51, %58; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %52, r64; +} +{ +add.f16x2 r70, %54, %50; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %56, %53; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %59, %55; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %51, %58; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %52, r100; +} +{ +add.f16x2 r106, %54, %50; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %56, %53; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %59, %55; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %51, %58; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %52, r136; +} +{ +add.f16x2 r142, %54, %50; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %56, %53; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %59, %55; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %56, %53; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %57, r172; +} +{ +add.f16x2 r178, %59, %55; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %51, %58; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %54, %50; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %56, %53; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %57, r208; +} +{ +add.f16x2 r214, %59, %55; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, 
r211, r217; +} +{ +sub.f16x2 r223, %51, %58; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %54, %50; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %56, %53; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %57, r244; +} +{ +add.f16x2 r250, %59, %55; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %51, %58; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %54, %50; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %56, %53; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %57, r280; +} +{ +add.f16x2 r286, %59, %55; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %51, %58; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %54, %50; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs11, fd107; +} +mov.b32 r522, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd108; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r540, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs15, fd105; +} +mov.b32 r594, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd106; +} +{ +neg.f16 rs17, rs16; +} +mov.b32 r612, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs19, fd107; +} +mov.b32 r603, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd108; +} +mov.b32 r618, {rs20, rs20}; +{ +add.f16x2 r313, %62, %69; +} +{ +add.f16x2 r316, %63, r313; +} +{ +add.f16x2 r319, %65, %61; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %67, %64; +} +{ +add.f16x2 r328, %68, r325; +} +{ +add.f16x2 r331, %60, %66; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %62, %69; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %63, r340; +} +{ +add.f16x2 r346, %65, %61; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %67, %64; +} +{ 
+mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %60, %66; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %62, %69; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %63, r376; +} +{ +add.f16x2 r382, %65, %61; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %67, %64; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %60, %66; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %62, %69; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %63, r412; +} +{ +add.f16x2 r418, %65, %61; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %67, %64; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %60, %66; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %62, %69; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %63, r448; +} +{ +add.f16x2 r454, %65, %61; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %67, %64; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %60, %66; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %67, %64; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %68, r484; +} +{ +add.f16x2 r490, %60, %66; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %62, %69; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %65, %61; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %67, %64; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %68, r520; +} +{ +add.f16x2 r526, %60, %66; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 
r532, r523, r529; +} +{ +sub.f16x2 r535, %62, %69; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %65, %61; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %67, %64; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %68, r556; +} +{ +add.f16x2 r562, %60, %66; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %62, %69; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %65, %61; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %67, %64; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %68, r592; +} +{ +add.f16x2 r598, %60, %66; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %62, %69; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %65, %61; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs21, fd107; +} +mov.b32 r834, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd108; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r852, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd105; +} +mov.b32 r906, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd106; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r924, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd107; +} +mov.b32 r915, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd108; +} +mov.b32 r930, {rs30, rs30}; +{ +add.f16x2 r625, %73, %71; +} +{ +add.f16x2 r628, %75, r625; +} +{ +add.f16x2 r631, %77, %74; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %79, %76; +} +{ +add.f16x2 r640, %70, r637; +} +{ +add.f16x2 r643, %72, %78; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %73, %71; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %75, r652; +} +{ +add.f16x2 r658, %77, %74; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %79, %76; +} +{ 
+mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %72, %78; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %73, %71; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %75, r688; +} +{ +add.f16x2 r694, %77, %74; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %79, %76; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %72, %78; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 r721, %73, %71; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %75, r724; +} +{ +add.f16x2 r730, %77, %74; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %79, %76; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %72, %78; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %73, %71; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %75, r760; +} +{ +add.f16x2 r766, %77, %74; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %79, %76; +} +{ +mul.f16x2 r778, r775, r924; +} +{ +sub.f16x2 r781, %72, %78; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %79, %76; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %70, r796; +} +{ +add.f16x2 r802, %72, %78; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %73, %71; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %77, %74; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %79, %76; +} +{ +mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %70, r832; +} +{ +add.f16x2 r838, %72, %78; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 
r844, r835, r841; +} +{ +sub.f16x2 r847, %73, %71; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %77, %74; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %79, %76; +} +{ +mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %70, r868; +} +{ +add.f16x2 r874, %72, %78; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %73, %71; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %77, %74; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %79, %76; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %70, r904; +} +{ +add.f16x2 r910, %72, %78; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %73, %71; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %77, %74; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +{ +cvt.rn.f16.f64 rs31, fd107; +} +mov.b32 r1146, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd108; +} +{ +neg.f16 rs33, rs32; +} +mov.b32 r1164, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs35, fd105; +} +mov.b32 r1218, {rs35, rs35}; +{ +cvt.rn.f16.f64 rs36, fd106; +} +{ +neg.f16 rs37, rs36; +} +mov.b32 r1236, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs39, fd107; +} +mov.b32 r1227, {rs39, rs39}; +{ +cvt.rn.f16.f64 rs40, fd108; +} +mov.b32 r1242, {rs40, rs40}; +{ +add.f16x2 r937, %85, %83; +} +{ +add.f16x2 r940, %87, r937; +} +{ +add.f16x2 r943, %89, %86; +} +{ +add.f16x2 r946, r940, r943; +} +{ +add.f16x2 r949, %81, %88; +} +{ +add.f16x2 r952, %82, r949; +} +{ +add.f16x2 r955, %84, %80; +} +{ +add.f16x2 r958, r952, r955; +} +{ +add.f16x2 r961, %85, %83; +} +{ +mul.f16x2 r964, r961, r1146; +} +{ +add.f16x2 r967, %87, r964; +} +{ +add.f16x2 r970, %89, %86; +} +{ +mul.f16x2 r973, r970, r1218; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %81, %88; 
+} +{ +mul.f16x2 r982, r979, r1164; +} +{ +sub.f16x2 r985, %84, %80; +} +{ +mul.f16x2 r988, r985, r1236; +} +{ +add.f16x2 r991, r982, r988; +} +{ +sub.f16x2 r994, r976, r991; +} +{ +add.f16x2 r997, %85, %83; +} +{ +mul.f16x2 r1000, r997, r1146; +} +{ +add.f16x2 r1003, %87, r1000; +} +{ +add.f16x2 r1006, %89, %86; +} +{ +mul.f16x2 r1009, r1006, r1218; +} +{ +add.f16x2 r1012, r1003, r1009; +} +{ +sub.f16x2 r1015, %81, %88; +} +{ +mul.f16x2 r1018, r1015, r1164; +} +{ +sub.f16x2 r1021, %84, %80; +} +{ +mul.f16x2 r1024, r1021, r1236; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1012, r1027; +} +{ +add.f16x2 r1033, %85, %83; +} +{ +mul.f16x2 r1036, r1033, r1218; +} +{ +add.f16x2 r1039, %87, r1036; +} +{ +add.f16x2 r1042, %89, %86; +} +{ +mul.f16x2 r1045, r1042, r1227; +} +{ +add.f16x2 r1048, r1039, r1045; +} +{ +sub.f16x2 r1051, %81, %88; +} +{ +mul.f16x2 r1054, r1051, r1236; +} +{ +sub.f16x2 r1057, %84, %80; +} +{ +mul.f16x2 r1060, r1057, r1242; +} +{ +add.f16x2 r1063, r1054, r1060; +} +{ +sub.f16x2 r1066, r1048, r1063; +} +{ +add.f16x2 r1069, %85, %83; +} +{ +mul.f16x2 r1072, r1069, r1218; +} +{ +add.f16x2 r1075, %87, r1072; +} +{ +add.f16x2 r1078, %89, %86; +} +{ +mul.f16x2 r1081, r1078, r1227; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %81, %88; +} +{ +mul.f16x2 r1090, r1087, r1236; +} +{ +sub.f16x2 r1093, %84, %80; +} +{ +mul.f16x2 r1096, r1093, r1242; +} +{ +add.f16x2 r1099, r1090, r1096; +} +{ +add.f16x2 r1102, r1084, r1099; +} +{ +add.f16x2 r1105, %81, %88; +} +{ +mul.f16x2 r1108, r1105, r1146; +} +{ +add.f16x2 r1111, %82, r1108; +} +{ +add.f16x2 r1114, %84, %80; +} +{ +mul.f16x2 r1117, r1114, r1218; +} +{ +add.f16x2 r1120, r1111, r1117; +} +{ +sub.f16x2 r1123, %85, %83; +} +{ +mul.f16x2 r1126, r1123, r1164; +} +{ +sub.f16x2 r1129, %89, %86; +} +{ +mul.f16x2 r1132, r1129, r1236; +} +{ +add.f16x2 r1135, r1126, r1132; +} +{ +add.f16x2 r1138, r1120, r1135; +} +{ +add.f16x2 r1141, %81, %88; +} +{ +mul.f16x2 r1144, r1141, r1146; +} 
+{ +add.f16x2 r1147, %82, r1144; +} +{ +add.f16x2 r1150, %84, %80; +} +{ +mul.f16x2 r1153, r1150, r1218; +} +{ +add.f16x2 r1156, r1147, r1153; +} +{ +sub.f16x2 r1159, %85, %83; +} +{ +mul.f16x2 r1162, r1159, r1164; +} +{ +sub.f16x2 r1165, %89, %86; +} +{ +mul.f16x2 r1168, r1165, r1236; +} +{ +add.f16x2 r1171, r1162, r1168; +} +{ +sub.f16x2 r1174, r1156, r1171; +} +{ +add.f16x2 r1177, %81, %88; +} +{ +mul.f16x2 r1180, r1177, r1218; +} +{ +add.f16x2 r1183, %82, r1180; +} +{ +add.f16x2 r1186, %84, %80; +} +{ +mul.f16x2 r1189, r1186, r1227; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %85, %83; +} +{ +mul.f16x2 r1198, r1195, r1236; +} +{ +sub.f16x2 r1201, %89, %86; +} +{ +mul.f16x2 r1204, r1201, r1242; +} +{ +add.f16x2 r1207, r1198, r1204; +} +{ +add.f16x2 r1210, r1192, r1207; +} +{ +add.f16x2 r1213, %81, %88; +} +{ +mul.f16x2 r1216, r1213, r1218; +} +{ +add.f16x2 r1219, %82, r1216; +} +{ +add.f16x2 r1222, %84, %80; +} +{ +mul.f16x2 r1225, r1222, r1227; +} +{ +add.f16x2 r1228, r1219, r1225; +} +{ +sub.f16x2 r1231, %85, %83; +} +{ +mul.f16x2 r1234, r1231, r1236; +} +{ +sub.f16x2 r1237, %89, %86; +} +{ +mul.f16x2 r1240, r1237, r1242; +} +{ +add.f16x2 r1243, r1234, r1240; +} +{ +sub.f16x2 r1246, r1228, r1243; +} +{ +cvt.rn.f16.f64 rs41, fd107; +} +mov.b32 r1458, {rs41, rs41}; +{ +cvt.rn.f16.f64 rs42, fd108; +} +{ +neg.f16 rs43, rs42; +} +mov.b32 r1476, {rs43, rs43}; +{ +cvt.rn.f16.f64 rs45, fd105; +} +mov.b32 r1530, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs46, fd106; +} +{ +neg.f16 rs47, rs46; +} +mov.b32 r1548, {rs47, rs47}; +{ +cvt.rn.f16.f64 rs49, fd107; +} +mov.b32 r1539, {rs49, rs49}; +{ +cvt.rn.f16.f64 rs50, fd108; +} +mov.b32 r1554, {rs50, rs50}; +{ +add.f16x2 r1249, %97, %95; +} +{ +add.f16x2 r1252, %99, r1249; +} +{ +add.f16x2 r1255, %91, %98; +} +{ +add.f16x2 r1258, r1252, r1255; +} +{ +add.f16x2 r1261, %93, %90; +} +{ +add.f16x2 r1264, %94, r1261; +} +{ +add.f16x2 r1267, %96, %92; +} +{ +add.f16x2 r1270, r1264, r1267; +} +{ +add.f16x2 r1273, %97, 
%95; +} +{ +mul.f16x2 r1276, r1273, r1458; +} +{ +add.f16x2 r1279, %99, r1276; +} +{ +add.f16x2 r1282, %91, %98; +} +{ +mul.f16x2 r1285, r1282, r1530; +} +{ +add.f16x2 r1288, r1279, r1285; +} +{ +sub.f16x2 r1291, %93, %90; +} +{ +mul.f16x2 r1294, r1291, r1476; +} +{ +sub.f16x2 r1297, %96, %92; +} +{ +mul.f16x2 r1300, r1297, r1548; +} +{ +add.f16x2 r1303, r1294, r1300; +} +{ +sub.f16x2 r1306, r1288, r1303; +} +{ +add.f16x2 r1309, %97, %95; +} +{ +mul.f16x2 r1312, r1309, r1458; +} +{ +add.f16x2 r1315, %99, r1312; +} +{ +add.f16x2 r1318, %91, %98; +} +{ +mul.f16x2 r1321, r1318, r1530; +} +{ +add.f16x2 r1324, r1315, r1321; +} +{ +sub.f16x2 r1327, %93, %90; +} +{ +mul.f16x2 r1330, r1327, r1476; +} +{ +sub.f16x2 r1333, %96, %92; +} +{ +mul.f16x2 r1336, r1333, r1548; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +add.f16x2 r1342, r1324, r1339; +} +{ +add.f16x2 r1345, %97, %95; +} +{ +mul.f16x2 r1348, r1345, r1530; +} +{ +add.f16x2 r1351, %99, r1348; +} +{ +add.f16x2 r1354, %91, %98; +} +{ +mul.f16x2 r1357, r1354, r1539; +} +{ +add.f16x2 r1360, r1351, r1357; +} +{ +sub.f16x2 r1363, %93, %90; +} +{ +mul.f16x2 r1366, r1363, r1548; +} +{ +sub.f16x2 r1369, %96, %92; +} +{ +mul.f16x2 r1372, r1369, r1554; +} +{ +add.f16x2 r1375, r1366, r1372; +} +{ +sub.f16x2 r1378, r1360, r1375; +} +{ +add.f16x2 r1381, %97, %95; +} +{ +mul.f16x2 r1384, r1381, r1530; +} +{ +add.f16x2 r1387, %99, r1384; +} +{ +add.f16x2 r1390, %91, %98; +} +{ +mul.f16x2 r1393, r1390, r1539; +} +{ +add.f16x2 r1396, r1387, r1393; +} +{ +sub.f16x2 r1399, %93, %90; +} +{ +mul.f16x2 r1402, r1399, r1548; +} +{ +sub.f16x2 r1405, %96, %92; +} +{ +mul.f16x2 r1408, r1405, r1554; +} +{ +add.f16x2 r1411, r1402, r1408; +} +{ +add.f16x2 r1414, r1396, r1411; +} +{ +add.f16x2 r1417, %93, %90; +} +{ +mul.f16x2 r1420, r1417, r1458; +} +{ +add.f16x2 r1423, %94, r1420; +} +{ +add.f16x2 r1426, %96, %92; +} +{ +mul.f16x2 r1429, r1426, r1530; +} +{ +add.f16x2 r1432, r1423, r1429; +} +{ +sub.f16x2 r1435, %97, %95; +} +{ +mul.f16x2 r1438, 
r1435, r1476; +} +{ +sub.f16x2 r1441, %91, %98; +} +{ +mul.f16x2 r1444, r1441, r1548; +} +{ +add.f16x2 r1447, r1438, r1444; +} +{ +add.f16x2 r1450, r1432, r1447; +} +{ +add.f16x2 r1453, %93, %90; +} +{ +mul.f16x2 r1456, r1453, r1458; +} +{ +add.f16x2 r1459, %94, r1456; +} +{ +add.f16x2 r1462, %96, %92; +} +{ +mul.f16x2 r1465, r1462, r1530; +} +{ +add.f16x2 r1468, r1459, r1465; +} +{ +sub.f16x2 r1471, %97, %95; +} +{ +mul.f16x2 r1474, r1471, r1476; +} +{ +sub.f16x2 r1477, %91, %98; +} +{ +mul.f16x2 r1480, r1477, r1548; +} +{ +add.f16x2 r1483, r1474, r1480; +} +{ +sub.f16x2 r1486, r1468, r1483; +} +{ +add.f16x2 r1489, %93, %90; +} +{ +mul.f16x2 r1492, r1489, r1530; +} +{ +add.f16x2 r1495, %94, r1492; +} +{ +add.f16x2 r1498, %96, %92; +} +{ +mul.f16x2 r1501, r1498, r1539; +} +{ +add.f16x2 r1504, r1495, r1501; +} +{ +sub.f16x2 r1507, %97, %95; +} +{ +mul.f16x2 r1510, r1507, r1548; +} +{ +sub.f16x2 r1513, %91, %98; +} +{ +mul.f16x2 r1516, r1513, r1554; +} +{ +add.f16x2 r1519, r1510, r1516; +} +{ +add.f16x2 r1522, r1504, r1519; +} +{ +add.f16x2 r1525, %93, %90; +} +{ +mul.f16x2 r1528, r1525, r1530; +} +{ +add.f16x2 r1531, %94, r1528; +} +{ +add.f16x2 r1534, %96, %92; +} +{ +mul.f16x2 r1537, r1534, r1539; +} +{ +add.f16x2 r1540, r1531, r1537; +} +{ +sub.f16x2 r1543, %97, %95; +} +{ +mul.f16x2 r1546, r1543, r1548; +} +{ +sub.f16x2 r1549, %91, %98; +} +{ +mul.f16x2 r1552, r1549, r1554; +} +{ +add.f16x2 r1555, r1546, r1552; +} +{ +sub.f16x2 r1558, r1540, r1555; +} +mov.f64 fd31, 0d3FEEFEA21D101EE0; +{ +cvt.rn.f16.f64 rs51, fd31; +} +mov.f64 fd32, 0d3FCFD511FA1C0796; +{ +cvt.rn.f16.f64 rs52, fd32; +} +mov.f64 fd33, 0d3FEC0AB44E81C059; +{ +cvt.rn.f16.f64 rs53, fd33; +} +mov.f64 fd34, 0d3FDED50D5CBFA951; +{ +cvt.rn.f16.f64 rs54, fd34; +} +mov.f64 fd35, 0d3FE753B603D2B816; +{ +cvt.rn.f16.f64 rs55, fd35; +} +mov.f64 fd36, 0d3FE5E7CF55112014; +{ +cvt.rn.f16.f64 rs56, fd36; +} +mov.f64 fd37, 0d3FE1257E3C182B51; +{ +cvt.rn.f16.f64 rs57, fd37; +} +mov.f64 fd38, 0d3FEB04BBFF642E86; +{ 
+cvt.rn.f16.f64 rs58, fd38; +} +mov.f64 fd41, 0d3FB0130A1BE09379; +{ +cvt.rn.f16.f64 rs61, fd41; +} +mov.f64 fd42, 0d3FEFEFD5BFE443FE; +{ +cvt.rn.f16.f64 rs62, fd42; +} +mov.f64 fd45, 0dBFDB3FF7C925819C; +{ +cvt.rn.f16.f64 rs65, fd45; +} +mov.f64 fd46, 0d3FECF457DCDC158C; +{ +cvt.rn.f16.f64 rs66, fd46; +} +mov.f64 fd61, 0dBFE465C6FEB501BC; +{ +cvt.rn.f16.f64 rs67, fd61; +} +mov.f64 fd48, 0d3FE8A80B635B6BEA; +{ +cvt.rn.f16.f64 rs68, fd48; +} +mov.f64 fd53, 0dBFEFBF675480D903; +{ +cvt.rn.f16.f64 rs73, fd53; +} +mov.f64 fd54, 0d3FC00AEB5DA15BE0; +{ +cvt.rn.f16.f64 rs74, fd54; +} +{ +cvt.rn.f16.f64 rs81, fd61; +} +mov.f64 fd62, 0dBFE8A80B635B6BEA; +{ +cvt.rn.f16.f64 rs82, fd62; +} +mov.b32 r1575, {rs51, rs51}; +{ +mul.f16x2 r1561, r370, r1575; +} +mov.b32 r1572, {rs52, rs52}; +{ +mul.f16x2 r1564, r514, r1572; +} +{ +sub.f16x2 r1567, r1561, r1564; +} +{ +mul.f16x2 r1570, r370, r1572; +} +{ +fma.rn.f16x2 r1573, r514, r1575, r1570; +} +mov.b32 r1639, {rs53, rs53}; +{ +mul.f16x2 r1577, r682, r1639; +} +mov.b32 r1636, {rs54, rs54}; +{ +mul.f16x2 r1580, r826, r1636; +} +{ +sub.f16x2 r1583, r1577, r1580; +} +{ +mul.f16x2 r1586, r682, r1636; +} +{ +fma.rn.f16x2 r1589, r826, r1639, r1586; +} +mov.b32 r1703, {rs55, rs55}; +{ +mul.f16x2 r1593, r994, r1703; +} +mov.b32 r1700, {rs56, rs56}; +{ +mul.f16x2 r1596, r1138, r1700; +} +{ +sub.f16x2 r1599, r1593, r1596; +} +{ +mul.f16x2 r1602, r994, r1700; +} +{ +fma.rn.f16x2 r1605, r1138, r1703, r1602; +} +mov.b32 r1767, {rs57, rs57}; +{ +mul.f16x2 r1609, r1306, r1767; +} +mov.b32 r1764, {rs58, rs58}; +{ +mul.f16x2 r1612, r1450, r1764; +} +{ +sub.f16x2 r1615, r1609, r1612; +} +{ +mul.f16x2 r1618, r1306, r1764; +} +{ +fma.rn.f16x2 r1621, r1450, r1767, r1618; +} +{ +mul.f16x2 r1625, r442, r1639; +} +{ +mul.f16x2 r1628, r586, r1636; +} +{ +sub.f16x2 r1631, r1625, r1628; +} +{ +mul.f16x2 r1634, r442, r1636; +} +{ +fma.rn.f16x2 r1637, r586, r1639, r1634; +} +{ +mul.f16x2 r1641, r754, r1767; +} +{ +mul.f16x2 r1644, r898, r1764; +} +{ +sub.f16x2 
r1647, r1641, r1644; +} +{ +mul.f16x2 r1650, r754, r1764; +} +{ +fma.rn.f16x2 r1653, r898, r1767, r1650; +} +mov.b32 r1719, {rs61, rs61}; +{ +mul.f16x2 r1657, r1066, r1719; +} +mov.b32 r1716, {rs62, rs62}; +{ +mul.f16x2 r1660, r1210, r1716; +} +{ +sub.f16x2 r1663, r1657, r1660; +} +{ +mul.f16x2 r1666, r1066, r1716; +} +{ +fma.rn.f16x2 r1669, r1210, r1719, r1666; +} +mov.b32 r1783, {rs65, rs65}; +{ +mul.f16x2 r1673, r1378, r1783; +} +mov.b32 r1780, {rs66, rs66}; +{ +mul.f16x2 r1676, r1522, r1780; +} +{ +sub.f16x2 r1679, r1673, r1676; +} +{ +mul.f16x2 r1682, r1378, r1780; +} +{ +fma.rn.f16x2 r1685, r1522, r1783, r1682; +} +{ +mul.f16x2 r1689, r478, r1703; +} +{ +mul.f16x2 r1692, r622, r1700; +} +{ +sub.f16x2 r1695, r1689, r1692; +} +{ +mul.f16x2 r1698, r478, r1700; +} +{ +fma.rn.f16x2 r1701, r622, r1703, r1698; +} +{ +mul.f16x2 r1705, r790, r1719; +} +{ +mul.f16x2 r1708, r934, r1716; +} +{ +sub.f16x2 r1711, r1705, r1708; +} +{ +mul.f16x2 r1714, r790, r1716; +} +{ +fma.rn.f16x2 r1717, r934, r1719, r1714; +} +mov.b32 r1735, {rs67, rs67}; +{ +mul.f16x2 r1721, r1102, r1735; +} +mov.b32 r1732, {rs68, rs68}; +{ +mul.f16x2 r1724, r1246, r1732; +} +{ +sub.f16x2 r1727, r1721, r1724; +} +{ +mul.f16x2 r1730, r1102, r1732; +} +{ +fma.rn.f16x2 r1733, r1246, r1735, r1730; +} +mov.b32 r1799, {rs73, rs73}; +{ +mul.f16x2 r1737, r1414, r1799; +} +mov.b32 r1796, {rs74, rs74}; +{ +mul.f16x2 r1740, r1558, r1796; +} +{ +sub.f16x2 r1743, r1737, r1740; +} +{ +mul.f16x2 r1746, r1414, r1796; +} +{ +fma.rn.f16x2 r1749, r1558, r1799, r1746; +} +{ +mul.f16x2 r1753, r406, r1767; +} +{ +mul.f16x2 r1756, r550, r1764; +} +{ +sub.f16x2 r1759, r1753, r1756; +} +{ +mul.f16x2 r1762, r406, r1764; +} +{ +fma.rn.f16x2 r1765, r550, r1767, r1762; +} +{ +mul.f16x2 r1769, r718, r1783; +} +{ +mul.f16x2 r1772, r862, r1780; +} +{ +sub.f16x2 r1775, r1769, r1772; +} +{ +mul.f16x2 r1778, r718, r1780; +} +{ +fma.rn.f16x2 r1781, r862, r1783, r1778; +} +{ +mul.f16x2 r1785, r1030, r1799; +} +{ +mul.f16x2 r1788, r1174, 
r1796; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1030, r1796; +} +{ +fma.rn.f16x2 r1797, r1174, r1799, r1794; +} +mov.b32 r1815, {rs81, rs81}; +{ +mul.f16x2 r1801, r1342, r1815; +} +mov.b32 r1812, {rs82, rs82}; +{ +mul.f16x2 r1804, r1486, r1812; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r1342, r1812; +} +{ +fma.rn.f16x2 r1813, r1486, r1815, r1810; +} +{ +cvt.rn.f16.f64 rs99, fd107; +} +mov.b32 r2026, {rs99, rs99}; +{ +cvt.rn.f16.f64 rs100, fd108; +} +{ +neg.f16 rs101, rs100; +} +mov.b32 r2044, {rs101, rs101}; +{ +cvt.rn.f16.f64 rs103, fd105; +} +mov.b32 r2098, {rs103, rs103}; +{ +cvt.rn.f16.f64 rs104, fd106; +} +{ +neg.f16 rs105, rs104; +} +mov.b32 r2116, {rs105, rs105}; +{ +cvt.rn.f16.f64 rs107, fd107; +} +mov.b32 r2107, {rs107, rs107}; +{ +cvt.rn.f16.f64 rs108, fd108; +} +mov.b32 r2122, {rs108, rs108}; +{ +add.f16x2 r1817, r322, r1258; +} +{ +add.f16x2 r1820, r10, r1817; +} +{ +add.f16x2 r1823, r634, r946; +} +{ +add.f16x2 %0, r1820, r1823; +} +{ +add.f16x2 r1829, r334, r1270; +} +{ +add.f16x2 r1832, r22, r1829; +} +{ +add.f16x2 r1835, r646, r958; +} +{ +add.f16x2 %1, r1832, r1835; +} +{ +add.f16x2 r1841, r322, r1258; +} +{ +mul.f16x2 r1844, r1841, r2026; +} +{ +add.f16x2 r1847, r10, r1844; +} +{ +add.f16x2 r1850, r634, r946; +} +{ +mul.f16x2 r1853, r1850, r2098; +} +{ +add.f16x2 r1856, r1847, r1853; +} +{ +sub.f16x2 r1859, r334, r1270; +} +{ +mul.f16x2 r1862, r1859, r2044; +} +{ +sub.f16x2 r1865, r646, r958; +} +{ +mul.f16x2 r1868, r1865, r2116; +} +{ +add.f16x2 r1871, r1862, r1868; +} +{ +sub.f16x2 %10, r1856, r1871; +} +{ +add.f16x2 r1877, r322, r1258; +} +{ +mul.f16x2 r1880, r1877, r2026; +} +{ +add.f16x2 r1883, r10, r1880; +} +{ +add.f16x2 r1886, r634, r946; +} +{ +mul.f16x2 r1889, r1886, r2098; +} +{ +add.f16x2 r1892, r1883, r1889; +} +{ +sub.f16x2 r1895, r334, r1270; +} +{ +mul.f16x2 r1898, r1895, r2044; +} +{ +sub.f16x2 r1901, r646, r958; +} +{ +mul.f16x2 r1904, r1901, r2116; +} +{ +add.f16x2 r1907, r1898, r1904; 
+} +{ +add.f16x2 %40, r1892, r1907; +} +{ +add.f16x2 r1913, r322, r1258; +} +{ +mul.f16x2 r1916, r1913, r2098; +} +{ +add.f16x2 r1919, r10, r1916; +} +{ +add.f16x2 r1922, r634, r946; +} +{ +mul.f16x2 r1925, r1922, r2107; +} +{ +add.f16x2 r1928, r1919, r1925; +} +{ +sub.f16x2 r1931, r334, r1270; +} +{ +mul.f16x2 r1934, r1931, r2116; +} +{ +sub.f16x2 r1937, r646, r958; +} +{ +mul.f16x2 r1940, r1937, r2122; +} +{ +add.f16x2 r1943, r1934, r1940; +} +{ +sub.f16x2 %20, r1928, r1943; +} +{ +add.f16x2 r1949, r322, r1258; +} +{ +mul.f16x2 r1952, r1949, r2098; +} +{ +add.f16x2 r1955, r10, r1952; +} +{ +add.f16x2 r1958, r634, r946; +} +{ +mul.f16x2 r1961, r1958, r2107; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r334, r1270; +} +{ +mul.f16x2 r1970, r1967, r2116; +} +{ +sub.f16x2 r1973, r646, r958; +} +{ +mul.f16x2 r1976, r1973, r2122; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +add.f16x2 %30, r1964, r1979; +} +{ +add.f16x2 r1985, r334, r1270; +} +{ +mul.f16x2 r1988, r1985, r2026; +} +{ +add.f16x2 r1991, r22, r1988; +} +{ +add.f16x2 r1994, r646, r958; +} +{ +mul.f16x2 r1997, r1994, r2098; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r322, r1258; +} +{ +mul.f16x2 r2006, r2003, r2044; +} +{ +sub.f16x2 r2009, r634, r946; +} +{ +mul.f16x2 r2012, r2009, r2116; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 %11, r2000, r2015; +} +{ +add.f16x2 r2021, r334, r1270; +} +{ +mul.f16x2 r2024, r2021, r2026; +} +{ +add.f16x2 r2027, r22, r2024; +} +{ +add.f16x2 r2030, r646, r958; +} +{ +mul.f16x2 r2033, r2030, r2098; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r322, r1258; +} +{ +mul.f16x2 r2042, r2039, r2044; +} +{ +sub.f16x2 r2045, r634, r946; +} +{ +mul.f16x2 r2048, r2045, r2116; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 %41, r2036, r2051; +} +{ +add.f16x2 r2057, r334, r1270; +} +{ +mul.f16x2 r2060, r2057, r2098; +} +{ +add.f16x2 r2063, r22, r2060; +} +{ +add.f16x2 r2066, r646, r958; +} +{ +mul.f16x2 r2069, r2066, 
r2107; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r322, r1258; +} +{ +mul.f16x2 r2078, r2075, r2116; +} +{ +sub.f16x2 r2081, r634, r946; +} +{ +mul.f16x2 r2084, r2081, r2122; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 %21, r2072, r2087; +} +{ +add.f16x2 r2093, r334, r1270; +} +{ +mul.f16x2 r2096, r2093, r2098; +} +{ +add.f16x2 r2099, r22, r2096; +} +{ +add.f16x2 r2102, r646, r958; +} +{ +mul.f16x2 r2105, r2102, r2107; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r322, r1258; +} +{ +mul.f16x2 r2114, r2111, r2116; +} +{ +sub.f16x2 r2117, r634, r946; +} +{ +mul.f16x2 r2120, r2117, r2122; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +sub.f16x2 %31, r2108, r2123; +} +{ +cvt.rn.f16.f64 rs109, fd107; +} +mov.b32 r2338, {rs109, rs109}; +{ +cvt.rn.f16.f64 rs110, fd108; +} +{ +neg.f16 rs111, rs110; +} +mov.b32 r2356, {rs111, rs111}; +{ +cvt.rn.f16.f64 rs113, fd105; +} +mov.b32 r2410, {rs113, rs113}; +{ +cvt.rn.f16.f64 rs114, fd106; +} +{ +neg.f16 rs115, rs114; +} +mov.b32 r2428, {rs115, rs115}; +{ +cvt.rn.f16.f64 rs117, fd107; +} +mov.b32 r2419, {rs117, rs117}; +{ +cvt.rn.f16.f64 rs118, fd108; +} +mov.b32 r2434, {rs118, rs118}; +{ +add.f16x2 r2129, r1567, r1615; +} +{ +add.f16x2 r2132, r58, r2129; +} +{ +add.f16x2 r2135, r1583, r1599; +} +{ +add.f16x2 %2, r2132, r2135; +} +{ +add.f16x2 r2141, r1573, r1621; +} +{ +add.f16x2 r2144, r202, r2141; +} +{ +add.f16x2 r2147, r1589, r1605; +} +{ +add.f16x2 %3, r2144, r2147; +} +{ +add.f16x2 r2153, r1567, r1615; +} +{ +mul.f16x2 r2156, r2153, r2338; +} +{ +add.f16x2 r2159, r58, r2156; +} +{ +add.f16x2 r2162, r1583, r1599; +} +{ +mul.f16x2 r2165, r2162, r2410; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r1573, r1621; +} +{ +mul.f16x2 r2174, r2171, r2356; +} +{ +sub.f16x2 r2177, r1589, r1605; +} +{ +mul.f16x2 r2180, r2177, r2428; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +sub.f16x2 %12, r2168, r2183; +} +{ +add.f16x2 r2189, r1567, r1615; +} +{ +mul.f16x2 r2192, r2189, 
r2338; +} +{ +add.f16x2 r2195, r58, r2192; +} +{ +add.f16x2 r2198, r1583, r1599; +} +{ +mul.f16x2 r2201, r2198, r2410; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r1573, r1621; +} +{ +mul.f16x2 r2210, r2207, r2356; +} +{ +sub.f16x2 r2213, r1589, r1605; +} +{ +mul.f16x2 r2216, r2213, r2428; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +add.f16x2 %42, r2204, r2219; +} +{ +add.f16x2 r2225, r1567, r1615; +} +{ +mul.f16x2 r2228, r2225, r2410; +} +{ +add.f16x2 r2231, r58, r2228; +} +{ +add.f16x2 r2234, r1583, r1599; +} +{ +mul.f16x2 r2237, r2234, r2419; +} +{ +add.f16x2 r2240, r2231, r2237; +} +{ +sub.f16x2 r2243, r1573, r1621; +} +{ +mul.f16x2 r2246, r2243, r2428; +} +{ +sub.f16x2 r2249, r1589, r1605; +} +{ +mul.f16x2 r2252, r2249, r2434; +} +{ +add.f16x2 r2255, r2246, r2252; +} +{ +sub.f16x2 %22, r2240, r2255; +} +{ +add.f16x2 r2261, r1567, r1615; +} +{ +mul.f16x2 r2264, r2261, r2410; +} +{ +add.f16x2 r2267, r58, r2264; +} +{ +add.f16x2 r2270, r1583, r1599; +} +{ +mul.f16x2 r2273, r2270, r2419; +} +{ +add.f16x2 r2276, r2267, r2273; +} +{ +sub.f16x2 r2279, r1573, r1621; +} +{ +mul.f16x2 r2282, r2279, r2428; +} +{ +sub.f16x2 r2285, r1589, r1605; +} +{ +mul.f16x2 r2288, r2285, r2434; +} +{ +add.f16x2 r2291, r2282, r2288; +} +{ +add.f16x2 %32, r2276, r2291; +} +{ +add.f16x2 r2297, r1573, r1621; +} +{ +mul.f16x2 r2300, r2297, r2338; +} +{ +add.f16x2 r2303, r202, r2300; +} +{ +add.f16x2 r2306, r1589, r1605; +} +{ +mul.f16x2 r2309, r2306, r2410; +} +{ +add.f16x2 r2312, r2303, r2309; +} +{ +sub.f16x2 r2315, r1567, r1615; +} +{ +mul.f16x2 r2318, r2315, r2356; +} +{ +sub.f16x2 r2321, r1583, r1599; +} +{ +mul.f16x2 r2324, r2321, r2428; +} +{ +add.f16x2 r2327, r2318, r2324; +} +{ +add.f16x2 %13, r2312, r2327; +} +{ +add.f16x2 r2333, r1573, r1621; +} +{ +mul.f16x2 r2336, r2333, r2338; +} +{ +add.f16x2 r2339, r202, r2336; +} +{ +add.f16x2 r2342, r1589, r1605; +} +{ +mul.f16x2 r2345, r2342, r2410; +} +{ +add.f16x2 r2348, r2339, r2345; +} +{ +sub.f16x2 r2351, r1567, 
r1615; +} +{ +mul.f16x2 r2354, r2351, r2356; +} +{ +sub.f16x2 r2357, r1583, r1599; +} +{ +mul.f16x2 r2360, r2357, r2428; +} +{ +add.f16x2 r2363, r2354, r2360; +} +{ +sub.f16x2 %43, r2348, r2363; +} +{ +add.f16x2 r2369, r1573, r1621; +} +{ +mul.f16x2 r2372, r2369, r2410; +} +{ +add.f16x2 r2375, r202, r2372; +} +{ +add.f16x2 r2378, r1589, r1605; +} +{ +mul.f16x2 r2381, r2378, r2419; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +sub.f16x2 r2387, r1567, r1615; +} +{ +mul.f16x2 r2390, r2387, r2428; +} +{ +sub.f16x2 r2393, r1583, r1599; +} +{ +mul.f16x2 r2396, r2393, r2434; +} +{ +add.f16x2 r2399, r2390, r2396; +} +{ +add.f16x2 %23, r2384, r2399; +} +{ +add.f16x2 r2405, r1573, r1621; +} +{ +mul.f16x2 r2408, r2405, r2410; +} +{ +add.f16x2 r2411, r202, r2408; +} +{ +add.f16x2 r2414, r1589, r1605; +} +{ +mul.f16x2 r2417, r2414, r2419; +} +{ +add.f16x2 r2420, r2411, r2417; +} +{ +sub.f16x2 r2423, r1567, r1615; +} +{ +mul.f16x2 r2426, r2423, r2428; +} +{ +sub.f16x2 r2429, r1583, r1599; +} +{ +mul.f16x2 r2432, r2429, r2434; +} +{ +add.f16x2 r2435, r2426, r2432; +} +{ +sub.f16x2 %33, r2420, r2435; +} +{ +cvt.rn.f16.f64 rs119, fd107; +} +mov.b32 r2650, {rs119, rs119}; +{ +cvt.rn.f16.f64 rs120, fd108; +} +{ +neg.f16 rs121, rs120; +} +mov.b32 r2668, {rs121, rs121}; +{ +cvt.rn.f16.f64 rs123, fd105; +} +mov.b32 r2722, {rs123, rs123}; +{ +cvt.rn.f16.f64 rs124, fd106; +} +{ +neg.f16 rs125, rs124; +} +mov.b32 r2740, {rs125, rs125}; +{ +cvt.rn.f16.f64 rs127, fd107; +} +mov.b32 r2731, {rs127, rs127}; +{ +cvt.rn.f16.f64 rs128, fd108; +} +mov.b32 r2746, {rs128, rs128}; +{ +add.f16x2 r2441, r1631, r1679; +} +{ +add.f16x2 r2444, r130, r2441; +} +{ +add.f16x2 r2447, r1647, r1663; +} +{ +add.f16x2 %4, r2444, r2447; +} +{ +add.f16x2 r2453, r1637, r1685; +} +{ +add.f16x2 r2456, r274, r2453; +} +{ +add.f16x2 r2459, r1653, r1669; +} +{ +add.f16x2 %5, r2456, r2459; +} +{ +add.f16x2 r2465, r1631, r1679; +} +{ +mul.f16x2 r2468, r2465, r2650; +} +{ +add.f16x2 r2471, r130, r2468; +} +{ +add.f16x2 r2474, 
r1647, r1663; +} +{ +mul.f16x2 r2477, r2474, r2722; +} +{ +add.f16x2 r2480, r2471, r2477; +} +{ +sub.f16x2 r2483, r1637, r1685; +} +{ +mul.f16x2 r2486, r2483, r2668; +} +{ +sub.f16x2 r2489, r1653, r1669; +} +{ +mul.f16x2 r2492, r2489, r2740; +} +{ +add.f16x2 r2495, r2486, r2492; +} +{ +sub.f16x2 %14, r2480, r2495; +} +{ +add.f16x2 r2501, r1631, r1679; +} +{ +mul.f16x2 r2504, r2501, r2650; +} +{ +add.f16x2 r2507, r130, r2504; +} +{ +add.f16x2 r2510, r1647, r1663; +} +{ +mul.f16x2 r2513, r2510, r2722; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +sub.f16x2 r2519, r1637, r1685; +} +{ +mul.f16x2 r2522, r2519, r2668; +} +{ +sub.f16x2 r2525, r1653, r1669; +} +{ +mul.f16x2 r2528, r2525, r2740; +} +{ +add.f16x2 r2531, r2522, r2528; +} +{ +add.f16x2 %44, r2516, r2531; +} +{ +add.f16x2 r2537, r1631, r1679; +} +{ +mul.f16x2 r2540, r2537, r2722; +} +{ +add.f16x2 r2543, r130, r2540; +} +{ +add.f16x2 r2546, r1647, r1663; +} +{ +mul.f16x2 r2549, r2546, r2731; +} +{ +add.f16x2 r2552, r2543, r2549; +} +{ +sub.f16x2 r2555, r1637, r1685; +} +{ +mul.f16x2 r2558, r2555, r2740; +} +{ +sub.f16x2 r2561, r1653, r1669; +} +{ +mul.f16x2 r2564, r2561, r2746; +} +{ +add.f16x2 r2567, r2558, r2564; +} +{ +sub.f16x2 %24, r2552, r2567; +} +{ +add.f16x2 r2573, r1631, r1679; +} +{ +mul.f16x2 r2576, r2573, r2722; +} +{ +add.f16x2 r2579, r130, r2576; +} +{ +add.f16x2 r2582, r1647, r1663; +} +{ +mul.f16x2 r2585, r2582, r2731; +} +{ +add.f16x2 r2588, r2579, r2585; +} +{ +sub.f16x2 r2591, r1637, r1685; +} +{ +mul.f16x2 r2594, r2591, r2740; +} +{ +sub.f16x2 r2597, r1653, r1669; +} +{ +mul.f16x2 r2600, r2597, r2746; +} +{ +add.f16x2 r2603, r2594, r2600; +} +{ +add.f16x2 %34, r2588, r2603; +} +{ +add.f16x2 r2609, r1637, r1685; +} +{ +mul.f16x2 r2612, r2609, r2650; +} +{ +add.f16x2 r2615, r274, r2612; +} +{ +add.f16x2 r2618, r1653, r1669; +} +{ +mul.f16x2 r2621, r2618, r2722; +} +{ +add.f16x2 r2624, r2615, r2621; +} +{ +sub.f16x2 r2627, r1631, r1679; +} +{ +mul.f16x2 r2630, r2627, r2668; +} +{ +sub.f16x2 
r2633, r1647, r1663; +} +{ +mul.f16x2 r2636, r2633, r2740; +} +{ +add.f16x2 r2639, r2630, r2636; +} +{ +add.f16x2 %15, r2624, r2639; +} +{ +add.f16x2 r2645, r1637, r1685; +} +{ +mul.f16x2 r2648, r2645, r2650; +} +{ +add.f16x2 r2651, r274, r2648; +} +{ +add.f16x2 r2654, r1653, r1669; +} +{ +mul.f16x2 r2657, r2654, r2722; +} +{ +add.f16x2 r2660, r2651, r2657; +} +{ +sub.f16x2 r2663, r1631, r1679; +} +{ +mul.f16x2 r2666, r2663, r2668; +} +{ +sub.f16x2 r2669, r1647, r1663; +} +{ +mul.f16x2 r2672, r2669, r2740; +} +{ +add.f16x2 r2675, r2666, r2672; +} +{ +sub.f16x2 %45, r2660, r2675; +} +{ +add.f16x2 r2681, r1637, r1685; +} +{ +mul.f16x2 r2684, r2681, r2722; +} +{ +add.f16x2 r2687, r274, r2684; +} +{ +add.f16x2 r2690, r1653, r1669; +} +{ +mul.f16x2 r2693, r2690, r2731; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +sub.f16x2 r2699, r1631, r1679; +} +{ +mul.f16x2 r2702, r2699, r2740; +} +{ +sub.f16x2 r2705, r1647, r1663; +} +{ +mul.f16x2 r2708, r2705, r2746; +} +{ +add.f16x2 r2711, r2702, r2708; +} +{ +add.f16x2 %25, r2696, r2711; +} +{ +add.f16x2 r2717, r1637, r1685; +} +{ +mul.f16x2 r2720, r2717, r2722; +} +{ +add.f16x2 r2723, r274, r2720; +} +{ +add.f16x2 r2726, r1653, r1669; +} +{ +mul.f16x2 r2729, r2726, r2731; +} +{ +add.f16x2 r2732, r2723, r2729; +} +{ +sub.f16x2 r2735, r1631, r1679; +} +{ +mul.f16x2 r2738, r2735, r2740; +} +{ +sub.f16x2 r2741, r1647, r1663; +} +{ +mul.f16x2 r2744, r2741, r2746; +} +{ +add.f16x2 r2747, r2738, r2744; +} +{ +sub.f16x2 %35, r2732, r2747; +} +{ +cvt.rn.f16.f64 rs129, fd107; +} +mov.b32 r2962, {rs129, rs129}; +{ +cvt.rn.f16.f64 rs130, fd108; +} +{ +neg.f16 rs131, rs130; +} +mov.b32 r2980, {rs131, rs131}; +{ +cvt.rn.f16.f64 rs133, fd105; +} +mov.b32 r3034, {rs133, rs133}; +{ +cvt.rn.f16.f64 rs134, fd106; +} +{ +neg.f16 rs135, rs134; +} +mov.b32 r3052, {rs135, rs135}; +{ +cvt.rn.f16.f64 rs137, fd107; +} +mov.b32 r3043, {rs137, rs137}; +{ +cvt.rn.f16.f64 rs138, fd108; +} +mov.b32 r3058, {rs138, rs138}; +{ +add.f16x2 r2753, r1695, r1743; +} 
+{ +add.f16x2 r2756, r166, r2753; +} +{ +add.f16x2 r2759, r1711, r1727; +} +{ +add.f16x2 %6, r2756, r2759; +} +{ +add.f16x2 r2765, r1701, r1749; +} +{ +add.f16x2 r2768, r310, r2765; +} +{ +add.f16x2 r2771, r1717, r1733; +} +{ +add.f16x2 %7, r2768, r2771; +} +{ +add.f16x2 r2777, r1695, r1743; +} +{ +mul.f16x2 r2780, r2777, r2962; +} +{ +add.f16x2 r2783, r166, r2780; +} +{ +add.f16x2 r2786, r1711, r1727; +} +{ +mul.f16x2 r2789, r2786, r3034; +} +{ +add.f16x2 r2792, r2783, r2789; +} +{ +sub.f16x2 r2795, r1701, r1749; +} +{ +mul.f16x2 r2798, r2795, r2980; +} +{ +sub.f16x2 r2801, r1717, r1733; +} +{ +mul.f16x2 r2804, r2801, r3052; +} +{ +add.f16x2 r2807, r2798, r2804; +} +{ +sub.f16x2 %16, r2792, r2807; +} +{ +add.f16x2 r2813, r1695, r1743; +} +{ +mul.f16x2 r2816, r2813, r2962; +} +{ +add.f16x2 r2819, r166, r2816; +} +{ +add.f16x2 r2822, r1711, r1727; +} +{ +mul.f16x2 r2825, r2822, r3034; +} +{ +add.f16x2 r2828, r2819, r2825; +} +{ +sub.f16x2 r2831, r1701, r1749; +} +{ +mul.f16x2 r2834, r2831, r2980; +} +{ +sub.f16x2 r2837, r1717, r1733; +} +{ +mul.f16x2 r2840, r2837, r3052; +} +{ +add.f16x2 r2843, r2834, r2840; +} +{ +add.f16x2 %46, r2828, r2843; +} +{ +add.f16x2 r2849, r1695, r1743; +} +{ +mul.f16x2 r2852, r2849, r3034; +} +{ +add.f16x2 r2855, r166, r2852; +} +{ +add.f16x2 r2858, r1711, r1727; +} +{ +mul.f16x2 r2861, r2858, r3043; +} +{ +add.f16x2 r2864, r2855, r2861; +} +{ +sub.f16x2 r2867, r1701, r1749; +} +{ +mul.f16x2 r2870, r2867, r3052; +} +{ +sub.f16x2 r2873, r1717, r1733; +} +{ +mul.f16x2 r2876, r2873, r3058; +} +{ +add.f16x2 r2879, r2870, r2876; +} +{ +sub.f16x2 %26, r2864, r2879; +} +{ +add.f16x2 r2885, r1695, r1743; +} +{ +mul.f16x2 r2888, r2885, r3034; +} +{ +add.f16x2 r2891, r166, r2888; +} +{ +add.f16x2 r2894, r1711, r1727; +} +{ +mul.f16x2 r2897, r2894, r3043; +} +{ +add.f16x2 r2900, r2891, r2897; +} +{ +sub.f16x2 r2903, r1701, r1749; +} +{ +mul.f16x2 r2906, r2903, r3052; +} +{ +sub.f16x2 r2909, r1717, r1733; +} +{ +mul.f16x2 r2912, r2909, r3058; +} +{ 
+add.f16x2 r2915, r2906, r2912; +} +{ +add.f16x2 %36, r2900, r2915; +} +{ +add.f16x2 r2921, r1701, r1749; +} +{ +mul.f16x2 r2924, r2921, r2962; +} +{ +add.f16x2 r2927, r310, r2924; +} +{ +add.f16x2 r2930, r1717, r1733; +} +{ +mul.f16x2 r2933, r2930, r3034; +} +{ +add.f16x2 r2936, r2927, r2933; +} +{ +sub.f16x2 r2939, r1695, r1743; +} +{ +mul.f16x2 r2942, r2939, r2980; +} +{ +sub.f16x2 r2945, r1711, r1727; +} +{ +mul.f16x2 r2948, r2945, r3052; +} +{ +add.f16x2 r2951, r2942, r2948; +} +{ +add.f16x2 %17, r2936, r2951; +} +{ +add.f16x2 r2957, r1701, r1749; +} +{ +mul.f16x2 r2960, r2957, r2962; +} +{ +add.f16x2 r2963, r310, r2960; +} +{ +add.f16x2 r2966, r1717, r1733; +} +{ +mul.f16x2 r2969, r2966, r3034; +} +{ +add.f16x2 r2972, r2963, r2969; +} +{ +sub.f16x2 r2975, r1695, r1743; +} +{ +mul.f16x2 r2978, r2975, r2980; +} +{ +sub.f16x2 r2981, r1711, r1727; +} +{ +mul.f16x2 r2984, r2981, r3052; +} +{ +add.f16x2 r2987, r2978, r2984; +} +{ +sub.f16x2 %47, r2972, r2987; +} +{ +add.f16x2 r2993, r1701, r1749; +} +{ +mul.f16x2 r2996, r2993, r3034; +} +{ +add.f16x2 r2999, r310, r2996; +} +{ +add.f16x2 r3002, r1717, r1733; +} +{ +mul.f16x2 r3005, r3002, r3043; +} +{ +add.f16x2 r3008, r2999, r3005; +} +{ +sub.f16x2 r3011, r1695, r1743; +} +{ +mul.f16x2 r3014, r3011, r3052; +} +{ +sub.f16x2 r3017, r1711, r1727; +} +{ +mul.f16x2 r3020, r3017, r3058; +} +{ +add.f16x2 r3023, r3014, r3020; +} +{ +add.f16x2 %27, r3008, r3023; +} +{ +add.f16x2 r3029, r1701, r1749; +} +{ +mul.f16x2 r3032, r3029, r3034; +} +{ +add.f16x2 r3035, r310, r3032; +} +{ +add.f16x2 r3038, r1717, r1733; +} +{ +mul.f16x2 r3041, r3038, r3043; +} +{ +add.f16x2 r3044, r3035, r3041; +} +{ +sub.f16x2 r3047, r1695, r1743; +} +{ +mul.f16x2 r3050, r3047, r3052; +} +{ +sub.f16x2 r3053, r1711, r1727; +} +{ +mul.f16x2 r3056, r3053, r3058; +} +{ +add.f16x2 r3059, r3050, r3056; +} +{ +sub.f16x2 %37, r3044, r3059; +} +{ +cvt.rn.f16.f64 rs139, fd107; +} +mov.b32 r3274, {rs139, rs139}; +{ +cvt.rn.f16.f64 rs140, fd108; +} +{ +neg.f16 
rs141, rs140; +} +mov.b32 r3292, {rs141, rs141}; +{ +cvt.rn.f16.f64 rs143, fd105; +} +mov.b32 r3346, {rs143, rs143}; +{ +cvt.rn.f16.f64 rs144, fd106; +} +{ +neg.f16 rs145, rs144; +} +mov.b32 r3364, {rs145, rs145}; +{ +cvt.rn.f16.f64 rs147, fd107; +} +mov.b32 r3355, {rs147, rs147}; +{ +cvt.rn.f16.f64 rs148, fd108; +} +mov.b32 r3370, {rs148, rs148}; +{ +add.f16x2 r3065, r1759, r1807; +} +{ +add.f16x2 r3068, r94, r3065; +} +{ +add.f16x2 r3071, r1775, r1791; +} +{ +add.f16x2 %8, r3068, r3071; +} +{ +add.f16x2 r3077, r1765, r1813; +} +{ +add.f16x2 r3080, r238, r3077; +} +{ +add.f16x2 r3083, r1781, r1797; +} +{ +add.f16x2 %9, r3080, r3083; +} +{ +add.f16x2 r3089, r1759, r1807; +} +{ +mul.f16x2 r3092, r3089, r3274; +} +{ +add.f16x2 r3095, r94, r3092; +} +{ +add.f16x2 r3098, r1775, r1791; +} +{ +mul.f16x2 r3101, r3098, r3346; +} +{ +add.f16x2 r3104, r3095, r3101; +} +{ +sub.f16x2 r3107, r1765, r1813; +} +{ +mul.f16x2 r3110, r3107, r3292; +} +{ +sub.f16x2 r3113, r1781, r1797; +} +{ +mul.f16x2 r3116, r3113, r3364; +} +{ +add.f16x2 r3119, r3110, r3116; +} +{ +sub.f16x2 %18, r3104, r3119; +} +{ +add.f16x2 r3125, r1759, r1807; +} +{ +mul.f16x2 r3128, r3125, r3274; +} +{ +add.f16x2 r3131, r94, r3128; +} +{ +add.f16x2 r3134, r1775, r1791; +} +{ +mul.f16x2 r3137, r3134, r3346; +} +{ +add.f16x2 r3140, r3131, r3137; +} +{ +sub.f16x2 r3143, r1765, r1813; +} +{ +mul.f16x2 r3146, r3143, r3292; +} +{ +sub.f16x2 r3149, r1781, r1797; +} +{ +mul.f16x2 r3152, r3149, r3364; +} +{ +add.f16x2 r3155, r3146, r3152; +} +{ +add.f16x2 %48, r3140, r3155; +} +{ +add.f16x2 r3161, r1759, r1807; +} +{ +mul.f16x2 r3164, r3161, r3346; +} +{ +add.f16x2 r3167, r94, r3164; +} +{ +add.f16x2 r3170, r1775, r1791; +} +{ +mul.f16x2 r3173, r3170, r3355; +} +{ +add.f16x2 r3176, r3167, r3173; +} +{ +sub.f16x2 r3179, r1765, r1813; +} +{ +mul.f16x2 r3182, r3179, r3364; +} +{ +sub.f16x2 r3185, r1781, r1797; +} +{ +mul.f16x2 r3188, r3185, r3370; +} +{ +add.f16x2 r3191, r3182, r3188; +} +{ +sub.f16x2 %28, r3176, r3191; 
+} +{ +add.f16x2 r3197, r1759, r1807; +} +{ +mul.f16x2 r3200, r3197, r3346; +} +{ +add.f16x2 r3203, r94, r3200; +} +{ +add.f16x2 r3206, r1775, r1791; +} +{ +mul.f16x2 r3209, r3206, r3355; +} +{ +add.f16x2 r3212, r3203, r3209; +} +{ +sub.f16x2 r3215, r1765, r1813; +} +{ +mul.f16x2 r3218, r3215, r3364; +} +{ +sub.f16x2 r3221, r1781, r1797; +} +{ +mul.f16x2 r3224, r3221, r3370; +} +{ +add.f16x2 r3227, r3218, r3224; +} +{ +add.f16x2 %38, r3212, r3227; +} +{ +add.f16x2 r3233, r1765, r1813; +} +{ +mul.f16x2 r3236, r3233, r3274; +} +{ +add.f16x2 r3239, r238, r3236; +} +{ +add.f16x2 r3242, r1781, r1797; +} +{ +mul.f16x2 r3245, r3242, r3346; +} +{ +add.f16x2 r3248, r3239, r3245; +} +{ +sub.f16x2 r3251, r1759, r1807; +} +{ +mul.f16x2 r3254, r3251, r3292; +} +{ +sub.f16x2 r3257, r1775, r1791; +} +{ +mul.f16x2 r3260, r3257, r3364; +} +{ +add.f16x2 r3263, r3254, r3260; +} +{ +add.f16x2 %19, r3248, r3263; +} +{ +add.f16x2 r3269, r1765, r1813; +} +{ +mul.f16x2 r3272, r3269, r3274; +} +{ +add.f16x2 r3275, r238, r3272; +} +{ +add.f16x2 r3278, r1781, r1797; +} +{ +mul.f16x2 r3281, r3278, r3346; +} +{ +add.f16x2 r3284, r3275, r3281; +} +{ +sub.f16x2 r3287, r1759, r1807; +} +{ +mul.f16x2 r3290, r3287, r3292; +} +{ +sub.f16x2 r3293, r1775, r1791; +} +{ +mul.f16x2 r3296, r3293, r3364; +} +{ +add.f16x2 r3299, r3290, r3296; +} +{ +sub.f16x2 %49, r3284, r3299; +} +{ +add.f16x2 r3305, r1765, r1813; +} +{ +mul.f16x2 r3308, r3305, r3346; +} +{ +add.f16x2 r3311, r238, r3308; +} +{ +add.f16x2 r3314, r1781, r1797; +} +{ +mul.f16x2 r3317, r3314, r3355; +} +{ +add.f16x2 r3320, r3311, r3317; +} +{ +sub.f16x2 r3323, r1759, r1807; +} +{ +mul.f16x2 r3326, r3323, r3364; +} +{ +sub.f16x2 r3329, r1775, r1791; +} +{ +mul.f16x2 r3332, r3329, r3370; +} +{ +add.f16x2 r3335, r3326, r3332; +} +{ +add.f16x2 %29, r3320, r3335; +} +{ +add.f16x2 r3341, r1765, r1813; +} +{ +mul.f16x2 r3344, r3341, r3346; +} +{ +add.f16x2 r3347, r238, r3344; +} +{ +add.f16x2 r3350, r1781, r1797; +} +{ +mul.f16x2 r3353, r3350, r3355; 
+} +{ +add.f16x2 r3356, r3347, r3353; +} +{ +sub.f16x2 r3359, r1759, r1807; +} +{ +mul.f16x2 r3362, r3359, r3364; +} +{ +sub.f16x2 r3365, r1775, r1791; +} +{ +mul.f16x2 r3368, r3365, r3370; +} +{ +add.f16x2 r3371, r3362, r3368; +} +{ +sub.f16x2 %39, r3356, r3371; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1104, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<807>; +.reg .b64 rd<4>; +mov.u32 r796, %tid.x; +mov.f32 f34, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r1, {low, high}; +} +mov.f32 f36, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r2, {low, 
high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f30, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r5, {low, high}; +} +mov.f32 f32, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ 
+mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 
rd2, r796, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r797, rd3; +mul.lo.s32 r798, r797, 5; +sub.s32 r799, r796, r798; +cvt.rn.f32.u32 f37, r799; +mul.f32 f38, f37, 0f3E80ADFD; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +mov.u32 r800, %tid.y; +mov.u32 r801, %10; +mad.lo.s32 r802, r800, 200, r801; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ 
+mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +mad.lo.s32 r803, r797, 200, r802; +barrier.sync 0; +mad.lo.s32 r804, r799, 40, r803; +st.shared.v2.f32 [r804], {r20, r32}; +st.shared.v2.f32 [r804+8], {r333, r342}; +st.shared.v2.f32 [r804+16], {r370, r379}; +st.shared.v2.f32 [r804+24], {r407, r416}; +st.shared.v2.f32 [r804+32], {r444, r453}; +barrier.sync 0; +shl.b32 r805, r799, 5; +sub.s32 r806, r804, r805; +ld.shared.u32 r488, [r806]; +ld.shared.u32 r500, [r806+4]; +ld.shared.u32 r485, [r806+40]; +ld.shared.u32 r497, [r806+44]; +ld.shared.u32 r491, [r806+80]; +ld.shared.u32 r503, [r806+84]; +ld.shared.u32 r492, [r806+120]; +ld.shared.u32 r504, [r806+124]; +ld.shared.u32 r486, [r806+160]; 
+ld.shared.u32 r498, [r806+164]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 %0, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 %1, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 %2, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 %8, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ 
+mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 %4, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 %6, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 %3, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 %9, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 %5, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 
r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 %7, r775, r790; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1105, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<40>; +.reg .b32 r<807>; +.reg .b64 rd<4>; +mov.u32 r796, %tid.y; +mov.u32 r797, %10; +mad.lo.s32 r798, r796, 100, r797; +mov.u32 r799, %tid.x; +mov.f32 f34, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r1, {low, high}; +} +mov.f32 f36, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f30, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; +cvt.rn.f16.f32 high, f30; +mov.b32 r5, {low, high}; +} +mov.f32 f32, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; 
+cvt.rn.f16.f32 high, f34; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ 
+add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r799, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r800, rd3; +mul.lo.s32 r801, r800, 5; +sub.s32 r802, r799, r801; +mad.lo.s32 r803, r800, 100, r798; +cvt.rn.f32.u32 f37, r802; +mul.f32 f38, f37, 0f3E80ADFD; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 
r323, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, 
r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +barrier.sync 0; +mad.lo.s32 r804, r802, 20, r803; +st.shared.u32 [r804], r20; +st.shared.u32 [r804+4], r333; +st.shared.u32 [r804+8], r370; +st.shared.u32 [r804+12], r407; +st.shared.u32 [r804+16], r444; +barrier.sync 0; +shl.b32 r805, r802, 4; +sub.s32 r806, r804, r805; +ld.shared.u32 r488, [r806]; +ld.shared.u32 r485, [r806+20]; +ld.shared.u32 r491, [r806+40]; +ld.shared.u32 r492, [r806+60]; +ld.shared.u32 r486, [r806+80]; +barrier.sync 0; +st.shared.u32 [r804], r32; +st.shared.u32 [r804+4], r342; +st.shared.u32 [r804+8], r379; +st.shared.u32 [r804+12], r416; +st.shared.u32 [r804+16], r453; +barrier.sync 0; +ld.shared.u32 r500, [r806]; +ld.shared.u32 r497, [r806+20]; +ld.shared.u32 r503, [r806+40]; +ld.shared.u32 r504, [r806+60]; +ld.shared.u32 r498, [r806+80]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f30; 
+cvt.rn.f16.f32 high, f30; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f32; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f34; +cvt.rn.f16.f32 high, f34; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f36; +cvt.rn.f16.f32 high, f36; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 %0, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 %1, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 %2, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 %8, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 %4, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, 
r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 %6, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 %3, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 %9, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 %5, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; 
+} +{ +sub.f16x2 %7, r775, r790; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..f9efe9fb166e3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_fwd.hpp.inc @@ -0,0 +1,878 @@ +#ifndef CUFFTDX_FFT_25_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_25_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<155, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<621>; +.reg .b64 rd<2>; +add.f32 f101, %63, %103; +add.f32 f102, %50, f101; +add.f32 f103, %76, %90; +add.f32 f104, f103, f102; +add.f32 f105, %65, %105; +add.f32 f106, %51, f105; +add.f32 f107, %78, %91; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %50; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %65, %105; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %78, %91; +mul.f32 f115, f114, 0fBF167918; +sub.f32 f116, f115, f113; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 
0f3F4F1BBD; +sub.f32 f120, %50, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f127, f105, 0f3E9E377A, %51; +mul.f32 f128, f107, 0f3F4F1BBD; +sub.f32 f129, f127, f128; +sub.f32 f130, %63, %103; +mul.f32 f131, f130, 0f3F737871; +sub.f32 f132, %76, %90; +mul.f32 f133, f132, 0fBF167918; +sub.f32 f134, f133, f131; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %51, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %66, %106; +add.f32 f146, %52, f145; +add.f32 f147, %79, %92; +add.f32 f148, f147, f146; +add.f32 f149, %67, %107; +add.f32 f150, %54, f149; +add.f32 f151, %81, %94; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %52; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %67, %107; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %81, %94; +mul.f32 f159, f158, 0fBF167918; +sub.f32 f160, f159, f157; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %52, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +fma.rn.f32 f171, f149, 0f3E9E377A, %54; +mul.f32 f172, f151, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %66, %106; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %79, %92; +mul.f32 f177, f176, 0fBF167918; +sub.f32 f178, f177, f175; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %54, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; 
+sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %68, %108; +add.f32 f190, %55, f189; +add.f32 f191, %82, %95; +add.f32 f192, f191, f190; +add.f32 f193, %70, %110; +add.f32 f194, %57, f193; +add.f32 f195, %83, %97; +add.f32 f196, f195, f194; +fma.rn.f32 f197, f189, 0f3E9E377A, %55; +mul.f32 f198, f191, 0f3F4F1BBD; +sub.f32 f199, f197, f198; +sub.f32 f200, %70, %110; +mul.f32 f201, f200, 0f3F737871; +sub.f32 f202, %83, %97; +mul.f32 f203, f202, 0fBF167918; +sub.f32 f204, f203, f201; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %55, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f215, f193, 0f3E9E377A, %57; +mul.f32 f216, f195, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, %68, %108; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, %82, %95; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %57, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %71, %111; +add.f32 f234, %58, f233; +add.f32 f235, %84, %98; +add.f32 f236, f235, f234; +add.f32 f237, %73, %113; +add.f32 f238, %59, f237; +add.f32 f239, %86, %99; +add.f32 f240, f239, f238; +fma.rn.f32 f241, f233, 0f3E9E377A, %58; +mul.f32 f242, f235, 0f3F4F1BBD; +sub.f32 f243, f241, f242; +sub.f32 f244, %73, %113; +mul.f32 f245, f244, 0f3F737871; +sub.f32 f246, %86, %99; +mul.f32 f247, f246, 0fBF167918; +sub.f32 f248, f247, f245; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %58, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 
f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +fma.rn.f32 f259, f237, 0f3E9E377A, %59; +mul.f32 f260, f239, 0f3F4F1BBD; +sub.f32 f261, f259, f260; +sub.f32 f262, %71, %111; +mul.f32 f263, f262, 0f3F737871; +sub.f32 f264, %84, %98; +mul.f32 f265, f264, 0fBF167918; +sub.f32 f266, f265, f263; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %59, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %74, %114; +add.f32 f278, %60, f277; +add.f32 f279, %87, %100; +add.f32 f280, f279, f278; +add.f32 f281, %75, %115; +add.f32 f282, %62, f281; +add.f32 f283, %89, %102; +add.f32 f284, f283, f282; +fma.rn.f32 f285, f277, 0f3E9E377A, %60; +mul.f32 f286, f279, 0f3F4F1BBD; +sub.f32 f287, f285, f286; +sub.f32 f288, %75, %115; +mul.f32 f289, f288, 0f3F737871; +sub.f32 f290, %89, %102; +mul.f32 f291, f290, 0fBF167918; +sub.f32 f292, f291, f289; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %60, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +fma.rn.f32 f303, f281, 0f3E9E377A, %62; +mul.f32 f304, f283, 0f3F4F1BBD; +sub.f32 f305, f303, f304; +sub.f32 f306, %74, %114; +mul.f32 f307, f306, 0f3F737871; +sub.f32 f308, %87, %100; +mul.f32 f309, f308, 0fBF167918; +sub.f32 f310, f309, f307; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %62, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mul.f32 
f321, f161, 0f3F77F511; +mul.f32 f322, f179, 0fBE7EA890; +sub.f32 f323, f321, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f326, f205, 0f3F6055A2; +mul.f32 f327, f223, 0fBEF6A86B; +sub.f32 f328, f326, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f331, f249, 0f3F3A9DB0; +mul.f32 f332, f267, 0fBF2F3E7B; +sub.f32 f333, f331, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f336, f293, 0f3F092BF2; +mul.f32 f337, f311, 0fBF5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f341, f169, 0f3F6055A2; +mul.f32 f342, f187, 0fBEF6A86B; +sub.f32 f343, f341, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f346, f213, 0f3F092BF2; +mul.f32 f347, f231, 0fBF5825E0; +sub.f32 f348, f346, f347; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f351, f257, 0f3D809851; +mul.f32 f352, f275, 0fBF7F7EAE; +sub.f32 f353, f351, f352; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f356, f301, 0fBED9FFBE; +mul.f32 f357, f319, 0fBF67A2BF; +sub.f32 f358, f356, f357; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f361, f170, 0f3F3A9DB0; +mul.f32 f362, f188, 0fBF2F3E7B; +sub.f32 f363, f361, f362; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f366, f214, 0f3D809851; +mul.f32 f367, f232, 0fBF7F7EAE; +sub.f32 f368, f366, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f371, f258, 0fBF232E38; +mul.f32 f372, f276, 0fBF45405B; +sub.f32 f373, f371, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f376, f302, 0fBF7DFB3B; +mul.f32 f377, f320, 0fBE00575B; +sub.f32 f378, f376, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 
0fBE00575B, f379; +mul.f32 f381, f162, 0f3F092BF2; +mul.f32 f382, f180, 0fBF5825E0; +sub.f32 f383, f381, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f386, f206, 0fBED9FFBE; +mul.f32 f387, f224, 0fBF67A2BF; +sub.f32 f388, f386, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f391, f250, 0fBF7DFB3B; +mul.f32 f392, f268, 0fBE00575B; +sub.f32 f393, f391, f392; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f396, f294, 0fBF232E38; +mul.f32 f397, f312, 0f3F45405B; +sub.f32 f398, f396, f397; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f402, f104, f401; +add.f32 f403, f192, f236; +add.f32 f404, f152, f284; +add.f32 f405, f108, f404; +add.f32 f406, f196, f240; +fma.rn.f32 f407, f401, 0f3E9E377A, f104; +mul.f32 f408, f403, 0f3F4F1BBD; +sub.f32 f409, f407, f408; +sub.f32 f410, f152, f284; +mul.f32 f411, f410, 0f3F737871; +sub.f32 f412, f196, f240; +mul.f32 f413, f412, 0fBF167918; +sub.f32 f414, f413, f411; +mul.f32 f415, f401, 0f3F4F1BBD; +sub.f32 f416, f104, f415; +fma.rn.f32 f417, f403, 0f3E9E377A, f416; +mul.f32 f418, f410, 0f3F167918; +mul.f32 f419, f412, 0f3F737871; +sub.f32 f420, f419, f418; +fma.rn.f32 f421, f404, 0f3E9E377A, f108; +mul.f32 f422, f406, 0f3F4F1BBD; +sub.f32 f423, f421, f422; +sub.f32 f424, f148, f280; +mul.f32 f425, f424, 0f3F737871; +sub.f32 f426, f192, f236; +mul.f32 f427, f426, 0fBF167918; +sub.f32 f428, f427, f425; +mul.f32 f429, f404, 0f3F4F1BBD; +sub.f32 f430, f108, f429; +fma.rn.f32 f431, f406, 0f3E9E377A, f430; +mul.f32 f432, f424, 0f3F167918; +mul.f32 f433, f426, 0f3F737871; +sub.f32 f434, f433, f432; +add.f32 f435, f323, f338; +add.f32 f436, f117, f435; +add.f32 f437, f328, f333; +add.f32 f438, f325, f340; +add.f32 f439, f135, f438; +add.f32 f440, f330, f335; +fma.rn.f32 f441, f435, 0f3E9E377A, f117; +mul.f32 f442, f437, 0f3F4F1BBD; +sub.f32 f443, f441, 
f442; +sub.f32 f444, f325, f340; +mul.f32 f445, f444, 0f3F737871; +sub.f32 f446, f330, f335; +mul.f32 f447, f446, 0fBF167918; +sub.f32 f448, f447, f445; +mul.f32 f449, f435, 0f3F4F1BBD; +sub.f32 f450, f117, f449; +fma.rn.f32 f451, f437, 0f3E9E377A, f450; +mul.f32 f452, f444, 0f3F167918; +mul.f32 f453, f446, 0f3F737871; +sub.f32 f454, f453, f452; +fma.rn.f32 f455, f438, 0f3E9E377A, f135; +mul.f32 f456, f440, 0f3F4F1BBD; +sub.f32 f457, f455, f456; +sub.f32 f458, f323, f338; +mul.f32 f459, f458, 0f3F737871; +sub.f32 f460, f328, f333; +mul.f32 f461, f460, 0fBF167918; +sub.f32 f462, f461, f459; +mul.f32 f463, f438, 0f3F4F1BBD; +sub.f32 f464, f135, f463; +fma.rn.f32 f465, f440, 0f3E9E377A, f464; +mul.f32 f466, f458, 0f3F167918; +mul.f32 f467, f460, 0f3F737871; +sub.f32 f468, f467, f466; +add.f32 f469, f343, f358; +add.f32 f470, f125, f469; +add.f32 f471, f348, f353; +add.f32 f472, f345, f360; +add.f32 f473, f143, f472; +add.f32 f474, f350, f355; +fma.rn.f32 f475, f469, 0f3E9E377A, f125; +mul.f32 f476, f471, 0f3F4F1BBD; +sub.f32 f477, f475, f476; +sub.f32 f478, f345, f360; +mul.f32 f479, f478, 0f3F737871; +sub.f32 f480, f350, f355; +mul.f32 f481, f480, 0fBF167918; +sub.f32 f482, f481, f479; +mul.f32 f483, f469, 0f3F4F1BBD; +sub.f32 f484, f125, f483; +fma.rn.f32 f485, f471, 0f3E9E377A, f484; +mul.f32 f486, f478, 0f3F167918; +mul.f32 f487, f480, 0f3F737871; +sub.f32 f488, f487, f486; +fma.rn.f32 f489, f472, 0f3E9E377A, f143; +mul.f32 f490, f474, 0f3F4F1BBD; +sub.f32 f491, f489, f490; +sub.f32 f492, f343, f358; +mul.f32 f493, f492, 0f3F737871; +sub.f32 f494, f348, f353; +mul.f32 f495, f494, 0fBF167918; +sub.f32 f496, f495, f493; +mul.f32 f497, f472, 0f3F4F1BBD; +sub.f32 f498, f143, f497; +fma.rn.f32 f499, f474, 0f3E9E377A, f498; +mul.f32 f500, f492, 0f3F167918; +mul.f32 f501, f494, 0f3F737871; +sub.f32 f502, f501, f500; +add.f32 f503, f363, f378; +add.f32 f504, f126, f503; +add.f32 f505, f368, f373; +add.f32 f506, f365, f380; +add.f32 f507, f144, f506; +add.f32 f508, f370, 
f375; +fma.rn.f32 f509, f503, 0f3E9E377A, f126; +mul.f32 f510, f505, 0f3F4F1BBD; +sub.f32 f511, f509, f510; +sub.f32 f512, f365, f380; +mul.f32 f513, f512, 0f3F737871; +sub.f32 f514, f370, f375; +mul.f32 f515, f514, 0fBF167918; +sub.f32 f516, f515, f513; +mul.f32 f517, f503, 0f3F4F1BBD; +sub.f32 f518, f126, f517; +fma.rn.f32 f519, f505, 0f3E9E377A, f518; +mul.f32 f520, f512, 0f3F167918; +mul.f32 f521, f514, 0f3F737871; +sub.f32 f522, f521, f520; +fma.rn.f32 f523, f506, 0f3E9E377A, f144; +mul.f32 f524, f508, 0f3F4F1BBD; +sub.f32 f525, f523, f524; +sub.f32 f526, f363, f378; +mul.f32 f527, f526, 0f3F737871; +sub.f32 f528, f368, f373; +mul.f32 f529, f528, 0fBF167918; +sub.f32 f530, f529, f527; +mul.f32 f531, f506, 0f3F4F1BBD; +sub.f32 f532, f144, f531; +fma.rn.f32 f533, f508, 0f3E9E377A, f532; +mul.f32 f534, f526, 0f3F167918; +mul.f32 f535, f528, 0f3F737871; +sub.f32 f536, f535, f534; +add.f32 f537, f383, f398; +add.f32 f538, f118, f537; +add.f32 f539, f388, f393; +add.f32 f540, f385, f400; +add.f32 f541, f136, f540; +add.f32 f542, f390, f395; +fma.rn.f32 f543, f537, 0f3E9E377A, f118; +mul.f32 f544, f539, 0f3F4F1BBD; +sub.f32 f545, f543, f544; +sub.f32 f546, f385, f400; +mul.f32 f547, f546, 0f3F737871; +sub.f32 f548, f390, f395; +mul.f32 f549, f548, 0fBF167918; +sub.f32 f550, f549, f547; +mul.f32 f551, f537, 0f3F4F1BBD; +sub.f32 f552, f118, f551; +fma.rn.f32 f553, f539, 0f3E9E377A, f552; +mul.f32 f554, f546, 0f3F167918; +mul.f32 f555, f548, 0f3F737871; +sub.f32 f556, f555, f554; +fma.rn.f32 f557, f540, 0f3E9E377A, f136; +mul.f32 f558, f542, 0f3F4F1BBD; +sub.f32 f559, f557, f558; +sub.f32 f560, f383, f398; +mul.f32 f561, f560, 0f3F737871; +sub.f32 f562, f388, f393; +mul.f32 f563, f562, 0fBF167918; +sub.f32 f564, f563, f561; +mul.f32 f565, f540, 0f3F4F1BBD; +sub.f32 f566, f136, f565; +fma.rn.f32 f567, f542, 0f3E9E377A, f566; +mul.f32 f568, f560, 0f3F167918; +mul.f32 f569, f562, 0f3F737871; +sub.f32 f570, f569, f568; +add.f32 %1, f406, f405; +add.f32 %0, f403, f402; 
+add.f32 %3, f440, f439; +add.f32 %2, f437, f436; +add.f32 %5, f474, f473; +add.f32 %4, f471, f470; +add.f32 %7, f508, f507; +add.f32 %6, f505, f504; +add.f32 %9, f542, f541; +add.f32 %8, f539, f538; +add.f32 %11, f428, f423; +sub.f32 %10, f409, f414; +add.f32 %13, f462, f457; +sub.f32 %12, f443, f448; +add.f32 %15, f496, f491; +sub.f32 %14, f477, f482; +add.f32 %17, f530, f525; +sub.f32 %16, f511, f516; +add.f32 %19, f564, f559; +sub.f32 %18, f545, f550; +add.f32 %21, f434, f431; +sub.f32 %20, f417, f420; +add.f32 %23, f468, f465; +sub.f32 %22, f451, f454; +add.f32 %25, f502, f499; +sub.f32 %24, f485, f488; +add.f32 %27, f536, f533; +sub.f32 %26, f519, f522; +add.f32 %29, f570, f567; +sub.f32 %28, f553, f556; +sub.f32 %31, f431, f434; +add.f32 %30, f420, f417; +sub.f32 %33, f465, f468; +add.f32 %32, f454, f451; +sub.f32 %35, f499, f502; +add.f32 %34, f488, f485; +sub.f32 %37, f533, f536; +add.f32 %36, f522, f519; +sub.f32 %39, f567, f570; +add.f32 %38, f556, f553; +sub.f32 %41, f423, f428; +add.f32 %40, f414, f409; +sub.f32 %43, f457, f462; +add.f32 %42, f448, f443; +sub.f32 %45, f491, f496; +add.f32 %44, f482, f477; +sub.f32 %47, f525, f530; +add.f32 %46, f516, f511; +sub.f32 %49, f559, f564; +add.f32 %48, f550, f545; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), 
"=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<156, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<168>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 200, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %14, %22; +add.f32 f22, %12, f21; +add.f32 f23, %17, %20; +add.f32 f24, %16, %23; +add.f32 f25, %13, f24; +add.f32 f26, %19, %21; +fma.rn.f32 f27, f21, 0f3E9E377A, %12; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %16, %23; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %19, %21; +mul.f32 f33, f32, 0fBF167918; +sub.f32 f34, f33, f31; +sub.f32 f35, f29, f34; +add.f32 f36, f34, f29; +mul.f32 f37, f21, 
0f3F4F1BBD; +sub.f32 f38, %12, f37; +fma.rn.f32 f39, f23, 0f3E9E377A, f38; +mul.f32 f40, f30, 0f3F167918; +mul.f32 f41, f32, 0f3F737871; +sub.f32 f42, f41, f40; +sub.f32 f43, f39, f42; +add.f32 f44, f42, f39; +fma.rn.f32 f45, f24, 0f3E9E377A, %13; +mul.f32 f46, f26, 0f3F4F1BBD; +sub.f32 f47, f45, f46; +sub.f32 f48, %14, %22; +mul.f32 f49, f48, 0f3F737871; +sub.f32 f50, %17, %20; +mul.f32 f51, f50, 0fBF167918; +sub.f32 f52, f51, f49; +add.f32 f53, f52, f47; +sub.f32 f54, f47, f52; +mul.f32 f55, f24, 0f3F4F1BBD; +sub.f32 f56, %13, f55; +fma.rn.f32 f57, f26, 0f3E9E377A, f56; +mul.f32 f58, f48, 0f3F167918; +mul.f32 f59, f50, 0f3F737871; +sub.f32 f60, f59, f58; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 200, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f63, f35; +mul.f32 f68, f64, f53; +mul.f32 f69, f63, f53; +mul.f32 f70, f63, f63; +mul.f32 f71, f64, f64; +sub.f32 f72, f70, f71; +mul.f32 f73, f64, f63; +fma.rn.f32 f74, f64, f63, f73; +mul.f32 f75, f72, f43; +mul.f32 f76, f74, f61; +mul.f32 f77, f72, f61; +mul.f32 f78, f63, f72; +mul.f32 f79, f64, f74; +sub.f32 f80, f78, f79; +mul.f32 f81, f63, f74; +fma.rn.f32 f82, f64, f72, f81; +mul.f32 f83, f80, f44; +mul.f32 f84, f82, f62; +mul.f32 f85, f80, f62; +mul.f32 f86, f63, f80; +mul.f32 f87, f64, f82; +sub.f32 f88, f86, f87; +mul.f32 f89, f63, f82; +fma.rn.f32 f90, f64, f80, f89; +mul.f32 f91, f88, f36; +mul.f32 f92, f90, f54; +mul.f32 f93, f88, f54; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f94, f26, f25; +add.f32 f95, f23, f22; +st.shared.v2.f32 [r9], {f95, f94}; +fma.rn.f32 f96, f64, f35, f69; +sub.f32 f97, f67, f68; +st.shared.v2.f32 [r9+8], {f97, f96}; +fma.rn.f32 f98, f74, f43, f77; +sub.f32 f99, f75, f76; +st.shared.v2.f32 [r9+16], {f99, f98}; +sub.f32 f100, f83, f84; +fma.rn.f32 
f101, f82, f44, f85; +st.shared.v2.f32 [r9+24], {f100, f101}; +fma.rn.f32 f102, f90, f36, f93; +sub.f32 f103, f91, f92; +st.shared.v2.f32 [r9+32], {f103, f102}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f104, f105}, [r11]; +ld.shared.v2.f32 {f108, f109}, [r11+40]; +ld.shared.v2.f32 {f112, f113}, [r11+80]; +ld.shared.v2.f32 {f116, f117}, [r11+120]; +ld.shared.v2.f32 {f120, f121}, [r11+160]; +add.f32 f124, f108, f120; +add.f32 f125, f104, f124; +add.f32 f126, f112, f116; +add.f32 f127, f109, f121; +add.f32 f128, f105, f127; +add.f32 f129, f113, f117; +fma.rn.f32 f130, f124, 0f3E9E377A, f104; +mul.f32 f131, f126, 0f3F4F1BBD; +sub.f32 f132, f130, f131; +sub.f32 f133, f109, f121; +mul.f32 f134, f133, 0f3F737871; +sub.f32 f135, f113, f117; +mul.f32 f136, f135, 0fBF167918; +sub.f32 f137, f136, f134; +mul.f32 f138, f124, 0f3F4F1BBD; +sub.f32 f139, f104, f138; +fma.rn.f32 f140, f126, 0f3E9E377A, f139; +mul.f32 f141, f133, 0f3F167918; +mul.f32 f142, f135, 0f3F737871; +sub.f32 f143, f142, f141; +fma.rn.f32 f144, f127, 0f3E9E377A, f105; +mul.f32 f145, f129, 0f3F4F1BBD; +sub.f32 f146, f144, f145; +sub.f32 f147, f108, f120; +mul.f32 f148, f147, 0f3F737871; +sub.f32 f149, f112, f116; +mul.f32 f150, f149, 0fBF167918; +sub.f32 f151, f150, f148; +mul.f32 f152, f127, 0f3F4F1BBD; +sub.f32 f153, f105, f152; +fma.rn.f32 f154, f129, 0f3E9E377A, f153; +mul.f32 f155, f147, 0f3F167918; +mul.f32 f156, f149, 0f3F737871; +sub.f32 f157, f156, f155; +add.f32 %1, f129, f128; +add.f32 %0, f126, f125; +add.f32 %3, f151, f146; +sub.f32 %2, f132, f137; +add.f32 %5, f157, f154; +sub.f32 %4, f140, f143; +sub.f32 %7, f154, f157; +add.f32 %6, f143, f140; +sub.f32 %9, f146, f151; +add.f32 %8, f137, f132; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), 
"f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<157, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<158>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 100, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %14, %22; +add.f32 f22, %12, f21; +add.f32 f23, %17, %20; +add.f32 f24, f23, f22; +add.f32 f25, %16, %23; +add.f32 f26, %13, f25; +add.f32 f27, %19, %21; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %12; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %16, %23; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %19, %21; +mul.f32 f35, f34, 0fBF167918; +sub.f32 f36, f35, f33; +sub.f32 f37, f31, f36; +add.f32 f38, f36, f31; +mul.f32 f39, f21, 0f3F4F1BBD; +sub.f32 f40, %12, f39; +fma.rn.f32 f41, f23, 0f3E9E377A, f40; +mul.f32 f42, f32, 0f3F167918; +mul.f32 f43, f34, 0f3F737871; +sub.f32 f44, f43, f42; +sub.f32 f45, f41, f44; +add.f32 f46, f44, f41; +fma.rn.f32 f47, f25, 0f3E9E377A, %13; +mul.f32 f48, f27, 0f3F4F1BBD; +sub.f32 f49, f47, f48; +sub.f32 f50, %14, %22; +mul.f32 f51, f50, 0f3F737871; +sub.f32 f52, %17, %20; +mul.f32 f53, f52, 0fBF167918; +sub.f32 f54, f53, f51; +add.f32 f55, f54, f49; +sub.f32 f56, f49, f54; +mul.f32 f57, f25, 0f3F4F1BBD; +sub.f32 f58, %13, f57; +fma.rn.f32 f59, f27, 0f3E9E377A, f58; +mul.f32 f60, f50, 0f3F167918; +mul.f32 f61, f52, 0f3F737871; +sub.f32 f62, f61, f60; +add.f32 f63, f62, f59; +sub.f32 f64, f59, f62; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f65, f66}, [rd6]; +mul.f32 f69, f65, f37; +mul.f32 f70, f66, f55; +sub.f32 f71, f69, f70; +mul.f32 f72, f65, f55; +fma.rn.f32 f73, 
f66, f37, f72; +mul.f32 f74, f65, f65; +mul.f32 f75, f66, f66; +sub.f32 f76, f74, f75; +mul.f32 f77, f66, f65; +fma.rn.f32 f78, f66, f65, f77; +mul.f32 f79, f76, f45; +mul.f32 f80, f78, f63; +sub.f32 f81, f79, f80; +mul.f32 f82, f76, f63; +fma.rn.f32 f83, f78, f45, f82; +mul.f32 f84, f65, f76; +mul.f32 f85, f66, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f65, f78; +fma.rn.f32 f88, f66, f76, f87; +mul.f32 f89, f86, f46; +mul.f32 f90, f88, f64; +sub.f32 f91, f89, f90; +mul.f32 f92, f86, f64; +fma.rn.f32 f93, f88, f46, f92; +mul.f32 f94, f65, f86; +mul.f32 f95, f66, f88; +sub.f32 f96, f94, f95; +mul.f32 f97, f65, f88; +fma.rn.f32 f98, f66, f86, f97; +mul.f32 f99, f96, f38; +mul.f32 f100, f98, f56; +sub.f32 f101, f99, f100; +mul.f32 f102, f96, f56; +fma.rn.f32 f103, f98, f38, f102; +mad.lo.s32 r8, r5, 100, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f104, [r11]; +ld.shared.f32 f105, [r11+20]; +ld.shared.f32 f106, [r11+40]; +ld.shared.f32 f107, [r11+60]; +ld.shared.f32 f108, [r11+80]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f73; +st.shared.f32 [r9+8], f83; +st.shared.f32 [r9+12], f93; +st.shared.f32 [r9+16], f103; +barrier.sync 0; +ld.shared.f32 f109, [r11]; +ld.shared.f32 f110, [r11+20]; +ld.shared.f32 f111, [r11+40]; +ld.shared.f32 f112, [r11+60]; +ld.shared.f32 f113, [r11+80]; +add.f32 f114, f105, f108; +add.f32 f115, f104, f114; +add.f32 f116, f106, f107; +add.f32 f117, f110, f113; +add.f32 f118, f109, f117; +add.f32 f119, f111, f112; +fma.rn.f32 f120, f114, 0f3E9E377A, f104; +mul.f32 f121, f116, 0f3F4F1BBD; +sub.f32 f122, f120, f121; +sub.f32 f123, f110, f113; +mul.f32 f124, f123, 0f3F737871; +sub.f32 f125, f111, f112; +mul.f32 f126, f125, 0fBF167918; +sub.f32 f127, f126, f124; +mul.f32 f128, f114, 0f3F4F1BBD; +sub.f32 
f129, f104, f128; +fma.rn.f32 f130, f116, 0f3E9E377A, f129; +mul.f32 f131, f123, 0f3F167918; +mul.f32 f132, f125, 0f3F737871; +sub.f32 f133, f132, f131; +fma.rn.f32 f134, f117, 0f3E9E377A, f109; +mul.f32 f135, f119, 0f3F4F1BBD; +sub.f32 f136, f134, f135; +sub.f32 f137, f105, f108; +mul.f32 f138, f137, 0f3F737871; +sub.f32 f139, f106, f107; +mul.f32 f140, f139, 0fBF167918; +sub.f32 f141, f140, f138; +mul.f32 f142, f117, 0f3F4F1BBD; +sub.f32 f143, f109, f142; +fma.rn.f32 f144, f119, 0f3E9E377A, f143; +mul.f32 f145, f137, 0f3F167918; +mul.f32 f146, f139, 0f3F737871; +sub.f32 f147, f146, f145; +add.f32 %0, f116, f115; +add.f32 %1, f119, f118; +add.f32 %3, f141, f136; +sub.f32 %2, f122, f127; +sub.f32 %4, f130, f133; +add.f32 %5, f147, f144; +add.f32 %6, f133, f130; +sub.f32 %7, f144, f147; +sub.f32 %9, f136, f141; +add.f32 %8, f127, f122; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..1b9a7fda1b413 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp32_inv.hpp.inc @@ -0,0 +1,850 @@ +#ifndef CUFFTDX_FFT_25_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_25_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<357, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg 
.f32 f<601>; +.reg .b64 rd<2>; +add.f32 f101, %63, %103; +add.f32 f102, %50, f101; +add.f32 f103, %76, %90; +add.f32 f104, f103, f102; +add.f32 f105, %65, %105; +add.f32 f106, %51, f105; +add.f32 f107, %78, %91; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %50; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %65, %105; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %78, %91; +fma.rn.f32 f115, f114, 0f3F167918, f113; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %50, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +fma.rn.f32 f126, f105, 0f3E9E377A, %51; +mul.f32 f127, f107, 0f3F4F1BBD; +sub.f32 f128, f126, f127; +sub.f32 f129, %63, %103; +mul.f32 f130, f129, 0f3F737871; +sub.f32 f131, %76, %90; +fma.rn.f32 f132, f131, 0f3F167918, f130; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %51, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %66, %106; +add.f32 f144, %52, f143; +add.f32 f145, %79, %92; +add.f32 f146, f145, f144; +add.f32 f147, %67, %107; +add.f32 f148, %54, f147; +add.f32 f149, %81, %94; +add.f32 f150, f149, f148; +fma.rn.f32 f151, f143, 0f3E9E377A, %52; +mul.f32 f152, f145, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %67, %107; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %81, %94; +fma.rn.f32 f157, f156, 0f3F167918, f155; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %52, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, 
f162, f165; +add.f32 f167, f165, f162; +fma.rn.f32 f168, f147, 0f3E9E377A, %54; +mul.f32 f169, f149, 0f3F4F1BBD; +sub.f32 f170, f168, f169; +sub.f32 f171, %66, %106; +mul.f32 f172, f171, 0f3F737871; +sub.f32 f173, %79, %92; +fma.rn.f32 f174, f173, 0f3F167918, f172; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %54, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %68, %108; +add.f32 f186, %55, f185; +add.f32 f187, %82, %95; +add.f32 f188, f187, f186; +add.f32 f189, %70, %110; +add.f32 f190, %57, f189; +add.f32 f191, %83, %97; +add.f32 f192, f191, f190; +fma.rn.f32 f193, f185, 0f3E9E377A, %55; +mul.f32 f194, f187, 0f3F4F1BBD; +sub.f32 f195, f193, f194; +sub.f32 f196, %70, %110; +mul.f32 f197, f196, 0f3F737871; +sub.f32 f198, %83, %97; +fma.rn.f32 f199, f198, 0f3F167918, f197; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %55, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f210, f189, 0f3E9E377A, %57; +mul.f32 f211, f191, 0f3F4F1BBD; +sub.f32 f212, f210, f211; +sub.f32 f213, %68, %108; +mul.f32 f214, f213, 0f3F737871; +sub.f32 f215, %82, %95; +fma.rn.f32 f216, f215, 0f3F167918, f214; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %57, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %71, %111; +add.f32 f228, %58, f227; +add.f32 f229, %84, %98; +add.f32 f230, f229, f228; +add.f32 f231, %73, %113; +add.f32 f232, %59, f231; +add.f32 
f233, %86, %99; +add.f32 f234, f233, f232; +fma.rn.f32 f235, f227, 0f3E9E377A, %58; +mul.f32 f236, f229, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %73, %113; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %86, %99; +fma.rn.f32 f241, f240, 0f3F167918, f239; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %58, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +fma.rn.f32 f252, f231, 0f3E9E377A, %59; +mul.f32 f253, f233, 0f3F4F1BBD; +sub.f32 f254, f252, f253; +sub.f32 f255, %71, %111; +mul.f32 f256, f255, 0f3F737871; +sub.f32 f257, %84, %98; +fma.rn.f32 f258, f257, 0f3F167918, f256; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %59, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %74, %114; +add.f32 f270, %60, f269; +add.f32 f271, %87, %100; +add.f32 f272, f271, f270; +add.f32 f273, %75, %115; +add.f32 f274, %62, f273; +add.f32 f275, %89, %102; +add.f32 f276, f275, f274; +fma.rn.f32 f277, f269, 0f3E9E377A, %60; +mul.f32 f278, f271, 0f3F4F1BBD; +sub.f32 f279, f277, f278; +sub.f32 f280, %75, %115; +mul.f32 f281, f280, 0f3F737871; +sub.f32 f282, %89, %102; +fma.rn.f32 f283, f282, 0f3F167918, f281; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %60, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +fma.rn.f32 f294, f273, 0f3E9E377A, %62; +mul.f32 f295, f275, 0f3F4F1BBD; +sub.f32 f296, f294, f295; +sub.f32 f297, %74, %114; +mul.f32 f298, f297, 
0f3F737871; +sub.f32 f299, %87, %100; +fma.rn.f32 f300, f299, 0f3F167918, f298; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %62, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mul.f32 f311, f158, 0f3F77F511; +mul.f32 f312, f175, 0f3E7EA890; +sub.f32 f313, f311, f312; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f316, f200, 0f3F6055A2; +mul.f32 f317, f217, 0f3EF6A86B; +sub.f32 f318, f316, f317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f321, f242, 0f3F3A9DB0; +mul.f32 f322, f259, 0f3F2F3E7B; +sub.f32 f323, f321, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f326, f284, 0f3F092BF2; +mul.f32 f327, f301, 0f3F5825E0; +sub.f32 f328, f326, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f331, f166, 0f3F6055A2; +mul.f32 f332, f183, 0f3EF6A86B; +sub.f32 f333, f331, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f336, f208, 0f3F092BF2; +mul.f32 f337, f225, 0f3F5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f341, f250, 0f3D809851; +mul.f32 f342, f267, 0f3F7F7EAE; +sub.f32 f343, f341, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f346, f292, 0fBED9FFBE; +mul.f32 f347, f309, 0f3F67A2BF; +sub.f32 f348, f346, f347; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f351, f167, 0f3F3A9DB0; +mul.f32 f352, f184, 0f3F2F3E7B; +sub.f32 f353, f351, f352; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f356, f209, 0f3D809851; +mul.f32 f357, f226, 0f3F7F7EAE; +sub.f32 f358, f356, f357; +mul.f32 
f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f361, f251, 0fBF232E38; +mul.f32 f362, f268, 0f3F45405B; +sub.f32 f363, f361, f362; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f366, f293, 0fBF7DFB3B; +mul.f32 f367, f310, 0f3E00575B; +sub.f32 f368, f366, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f371, f159, 0f3F092BF2; +mul.f32 f372, f176, 0f3F5825E0; +sub.f32 f373, f371, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f376, f201, 0fBED9FFBE; +mul.f32 f377, f218, 0f3F67A2BF; +sub.f32 f378, f376, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f381, f243, 0fBF7DFB3B; +mul.f32 f382, f260, 0f3E00575B; +sub.f32 f383, f381, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f386, f285, 0fBF232E38; +mul.f32 f387, f302, 0fBF45405B; +sub.f32 f388, f386, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f392, f104, f391; +add.f32 f393, f188, f230; +add.f32 f394, f150, f276; +add.f32 f395, f108, f394; +add.f32 f396, f192, f234; +fma.rn.f32 f397, f391, 0f3E9E377A, f104; +mul.f32 f398, f393, 0f3F4F1BBD; +sub.f32 f399, f397, f398; +sub.f32 f400, f150, f276; +mul.f32 f401, f400, 0f3F737871; +sub.f32 f402, f192, f234; +fma.rn.f32 f403, f402, 0f3F167918, f401; +mul.f32 f404, f391, 0f3F4F1BBD; +sub.f32 f405, f104, f404; +fma.rn.f32 f406, f393, 0f3E9E377A, f405; +mul.f32 f407, f400, 0f3F167918; +mul.f32 f408, f402, 0f3F737871; +sub.f32 f409, f407, f408; +fma.rn.f32 f410, f394, 0f3E9E377A, f108; +mul.f32 f411, f396, 0f3F4F1BBD; +sub.f32 f412, f410, f411; +sub.f32 f413, f146, f272; +mul.f32 f414, f413, 0f3F737871; +sub.f32 f415, f188, f230; +fma.rn.f32 f416, f415, 0f3F167918, f414; +mul.f32 f417, f394, 0f3F4F1BBD; +sub.f32 f418, f108, f417; +fma.rn.f32 f419, f396, 0f3E9E377A, f418; 
+mul.f32 f420, f413, 0f3F167918; +mul.f32 f421, f415, 0f3F737871; +sub.f32 f422, f420, f421; +add.f32 f423, f313, f328; +add.f32 f424, f116, f423; +add.f32 f425, f318, f323; +add.f32 f426, f315, f330; +add.f32 f427, f133, f426; +add.f32 f428, f320, f325; +fma.rn.f32 f429, f423, 0f3E9E377A, f116; +mul.f32 f430, f425, 0f3F4F1BBD; +sub.f32 f431, f429, f430; +sub.f32 f432, f315, f330; +mul.f32 f433, f432, 0f3F737871; +sub.f32 f434, f320, f325; +fma.rn.f32 f435, f434, 0f3F167918, f433; +mul.f32 f436, f423, 0f3F4F1BBD; +sub.f32 f437, f116, f436; +fma.rn.f32 f438, f425, 0f3E9E377A, f437; +mul.f32 f439, f432, 0f3F167918; +mul.f32 f440, f434, 0f3F737871; +sub.f32 f441, f439, f440; +fma.rn.f32 f442, f426, 0f3E9E377A, f133; +mul.f32 f443, f428, 0f3F4F1BBD; +sub.f32 f444, f442, f443; +sub.f32 f445, f313, f328; +mul.f32 f446, f445, 0f3F737871; +sub.f32 f447, f318, f323; +fma.rn.f32 f448, f447, 0f3F167918, f446; +mul.f32 f449, f426, 0f3F4F1BBD; +sub.f32 f450, f133, f449; +fma.rn.f32 f451, f428, 0f3E9E377A, f450; +mul.f32 f452, f445, 0f3F167918; +mul.f32 f453, f447, 0f3F737871; +sub.f32 f454, f452, f453; +add.f32 f455, f333, f348; +add.f32 f456, f124, f455; +add.f32 f457, f338, f343; +add.f32 f458, f335, f350; +add.f32 f459, f141, f458; +add.f32 f460, f340, f345; +fma.rn.f32 f461, f455, 0f3E9E377A, f124; +mul.f32 f462, f457, 0f3F4F1BBD; +sub.f32 f463, f461, f462; +sub.f32 f464, f335, f350; +mul.f32 f465, f464, 0f3F737871; +sub.f32 f466, f340, f345; +fma.rn.f32 f467, f466, 0f3F167918, f465; +mul.f32 f468, f455, 0f3F4F1BBD; +sub.f32 f469, f124, f468; +fma.rn.f32 f470, f457, 0f3E9E377A, f469; +mul.f32 f471, f464, 0f3F167918; +mul.f32 f472, f466, 0f3F737871; +sub.f32 f473, f471, f472; +fma.rn.f32 f474, f458, 0f3E9E377A, f141; +mul.f32 f475, f460, 0f3F4F1BBD; +sub.f32 f476, f474, f475; +sub.f32 f477, f333, f348; +mul.f32 f478, f477, 0f3F737871; +sub.f32 f479, f338, f343; +fma.rn.f32 f480, f479, 0f3F167918, f478; +mul.f32 f481, f458, 0f3F4F1BBD; +sub.f32 f482, f141, f481; +fma.rn.f32 
f483, f460, 0f3E9E377A, f482; +mul.f32 f484, f477, 0f3F167918; +mul.f32 f485, f479, 0f3F737871; +sub.f32 f486, f484, f485; +add.f32 f487, f353, f368; +add.f32 f488, f125, f487; +add.f32 f489, f358, f363; +add.f32 f490, f355, f370; +add.f32 f491, f142, f490; +add.f32 f492, f360, f365; +fma.rn.f32 f493, f487, 0f3E9E377A, f125; +mul.f32 f494, f489, 0f3F4F1BBD; +sub.f32 f495, f493, f494; +sub.f32 f496, f355, f370; +mul.f32 f497, f496, 0f3F737871; +sub.f32 f498, f360, f365; +fma.rn.f32 f499, f498, 0f3F167918, f497; +mul.f32 f500, f487, 0f3F4F1BBD; +sub.f32 f501, f125, f500; +fma.rn.f32 f502, f489, 0f3E9E377A, f501; +mul.f32 f503, f496, 0f3F167918; +mul.f32 f504, f498, 0f3F737871; +sub.f32 f505, f503, f504; +fma.rn.f32 f506, f490, 0f3E9E377A, f142; +mul.f32 f507, f492, 0f3F4F1BBD; +sub.f32 f508, f506, f507; +sub.f32 f509, f353, f368; +mul.f32 f510, f509, 0f3F737871; +sub.f32 f511, f358, f363; +fma.rn.f32 f512, f511, 0f3F167918, f510; +mul.f32 f513, f490, 0f3F4F1BBD; +sub.f32 f514, f142, f513; +fma.rn.f32 f515, f492, 0f3E9E377A, f514; +mul.f32 f516, f509, 0f3F167918; +mul.f32 f517, f511, 0f3F737871; +sub.f32 f518, f516, f517; +add.f32 f519, f373, f388; +add.f32 f520, f117, f519; +add.f32 f521, f378, f383; +add.f32 f522, f375, f390; +add.f32 f523, f134, f522; +add.f32 f524, f380, f385; +fma.rn.f32 f525, f519, 0f3E9E377A, f117; +mul.f32 f526, f521, 0f3F4F1BBD; +sub.f32 f527, f525, f526; +sub.f32 f528, f375, f390; +mul.f32 f529, f528, 0f3F737871; +sub.f32 f530, f380, f385; +fma.rn.f32 f531, f530, 0f3F167918, f529; +mul.f32 f532, f519, 0f3F4F1BBD; +sub.f32 f533, f117, f532; +fma.rn.f32 f534, f521, 0f3E9E377A, f533; +mul.f32 f535, f528, 0f3F167918; +mul.f32 f536, f530, 0f3F737871; +sub.f32 f537, f535, f536; +fma.rn.f32 f538, f522, 0f3E9E377A, f134; +mul.f32 f539, f524, 0f3F4F1BBD; +sub.f32 f540, f538, f539; +sub.f32 f541, f373, f388; +mul.f32 f542, f541, 0f3F737871; +sub.f32 f543, f378, f383; +fma.rn.f32 f544, f543, 0f3F167918, f542; +mul.f32 f545, f522, 0f3F4F1BBD; +sub.f32 
f546, f134, f545; +fma.rn.f32 f547, f524, 0f3E9E377A, f546; +mul.f32 f548, f541, 0f3F167918; +mul.f32 f549, f543, 0f3F737871; +sub.f32 f550, f548, f549; +add.f32 %1, f396, f395; +add.f32 %0, f393, f392; +add.f32 %3, f428, f427; +add.f32 %2, f425, f424; +add.f32 %5, f460, f459; +add.f32 %4, f457, f456; +add.f32 %7, f492, f491; +add.f32 %6, f489, f488; +add.f32 %9, f524, f523; +add.f32 %8, f521, f520; +add.f32 %11, f416, f412; +sub.f32 %10, f399, f403; +add.f32 %13, f448, f444; +sub.f32 %12, f431, f435; +add.f32 %15, f480, f476; +sub.f32 %14, f463, f467; +add.f32 %17, f512, f508; +sub.f32 %16, f495, f499; +add.f32 %19, f544, f540; +sub.f32 %18, f527, f531; +add.f32 %21, f422, f419; +sub.f32 %20, f406, f409; +add.f32 %23, f454, f451; +sub.f32 %22, f438, f441; +add.f32 %25, f486, f483; +sub.f32 %24, f470, f473; +add.f32 %27, f518, f515; +sub.f32 %26, f502, f505; +add.f32 %29, f550, f547; +sub.f32 %28, f534, f537; +sub.f32 %31, f419, f422; +add.f32 %30, f409, f406; +sub.f32 %33, f451, f454; +add.f32 %32, f441, f438; +sub.f32 %35, f483, f486; +add.f32 %34, f473, f470; +sub.f32 %37, f515, f518; +add.f32 %36, f505, f502; +sub.f32 %39, f547, f550; +add.f32 %38, f537, f534; +sub.f32 %41, f412, f416; +add.f32 %40, f403, f399; +sub.f32 %43, f444, f448; +add.f32 %42, f435, f431; +sub.f32 %45, f476, f480; +add.f32 %44, f467, f463; +sub.f32 %47, f508, f512; +add.f32 %46, f499, f495; +sub.f32 %49, f540, f544; +add.f32 %48, f531, f527; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), 
"=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<358, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<164>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 200, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %14, %22; +add.f32 f22, %12, f21; +add.f32 f23, %17, %20; +add.f32 f24, %16, %23; +add.f32 f25, %13, f24; +add.f32 f26, %19, %21; +fma.rn.f32 f27, f21, 0f3E9E377A, %12; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 
f30, %16, %23; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %19, %21; +fma.rn.f32 f33, f32, 0f3F167918, f31; +sub.f32 f34, f29, f33; +add.f32 f35, f33, f29; +mul.f32 f36, f21, 0f3F4F1BBD; +sub.f32 f37, %12, f36; +fma.rn.f32 f38, f23, 0f3E9E377A, f37; +mul.f32 f39, f30, 0f3F167918; +mul.f32 f40, f32, 0f3F737871; +sub.f32 f41, f39, f40; +sub.f32 f42, f38, f41; +add.f32 f43, f41, f38; +fma.rn.f32 f44, f24, 0f3E9E377A, %13; +mul.f32 f45, f26, 0f3F4F1BBD; +sub.f32 f46, f44, f45; +sub.f32 f47, %14, %22; +mul.f32 f48, f47, 0f3F737871; +sub.f32 f49, %17, %20; +fma.rn.f32 f50, f49, 0f3F167918, f48; +add.f32 f51, f50, f46; +sub.f32 f52, f46, f50; +mul.f32 f53, f24, 0f3F4F1BBD; +sub.f32 f54, %13, f53; +fma.rn.f32 f55, f26, 0f3E9E377A, f54; +mul.f32 f56, f47, 0f3F167918; +mul.f32 f57, f49, 0f3F737871; +sub.f32 f58, f56, f57; +add.f32 f59, f58, f55; +sub.f32 f60, f55, f58; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 200, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f61, f62}, [rd6]; +mul.f32 f65, f51, f62; +mul.f32 f66, f34, f62; +mul.f32 f67, f61, f51; +mul.f32 f68, f61, f61; +mul.f32 f69, f62, f62; +sub.f32 f70, f68, f69; +mul.f32 f71, f62, f61; +fma.rn.f32 f72, f62, f61, f71; +mul.f32 f73, f59, f72; +mul.f32 f74, f42, f72; +mul.f32 f75, f70, f59; +mul.f32 f76, f61, f70; +mul.f32 f77, f62, f72; +sub.f32 f78, f76, f77; +mul.f32 f79, f61, f72; +fma.rn.f32 f80, f62, f70, f79; +mul.f32 f81, f60, f80; +mul.f32 f82, f43, f80; +mul.f32 f83, f78, f60; +mul.f32 f84, f61, f78; +mul.f32 f85, f62, f80; +sub.f32 f86, f84, f85; +mul.f32 f87, f61, f80; +fma.rn.f32 f88, f62, f78, f87; +mul.f32 f89, f52, f88; +mul.f32 f90, f35, f88; +mul.f32 f91, f86, f52; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f92, f26, f25; +add.f32 f93, f23, f22; +st.shared.v2.f32 [r9], {f93, f92}; +fma.rn.f32 f94, f61, f34, f65; +sub.f32 f95, f67, f66; 
+st.shared.v2.f32 [r9+8], {f94, f95}; +fma.rn.f32 f96, f70, f42, f73; +sub.f32 f97, f75, f74; +st.shared.v2.f32 [r9+16], {f96, f97}; +sub.f32 f98, f83, f82; +fma.rn.f32 f99, f78, f43, f81; +st.shared.v2.f32 [r9+24], {f99, f98}; +fma.rn.f32 f100, f86, f35, f89; +sub.f32 f101, f91, f90; +st.shared.v2.f32 [r9+32], {f100, f101}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f102, f103}, [r11]; +ld.shared.v2.f32 {f106, f107}, [r11+40]; +ld.shared.v2.f32 {f110, f111}, [r11+80]; +ld.shared.v2.f32 {f114, f115}, [r11+120]; +ld.shared.v2.f32 {f118, f119}, [r11+160]; +add.f32 f122, f106, f118; +add.f32 f123, f102, f122; +add.f32 f124, f110, f114; +add.f32 f125, f107, f119; +add.f32 f126, f103, f125; +add.f32 f127, f111, f115; +fma.rn.f32 f128, f122, 0f3E9E377A, f102; +mul.f32 f129, f124, 0f3F4F1BBD; +sub.f32 f130, f128, f129; +sub.f32 f131, f107, f119; +mul.f32 f132, f131, 0f3F737871; +sub.f32 f133, f111, f115; +fma.rn.f32 f134, f133, 0f3F167918, f132; +mul.f32 f135, f122, 0f3F4F1BBD; +sub.f32 f136, f102, f135; +fma.rn.f32 f137, f124, 0f3E9E377A, f136; +mul.f32 f138, f131, 0f3F167918; +mul.f32 f139, f133, 0f3F737871; +sub.f32 f140, f138, f139; +fma.rn.f32 f141, f125, 0f3E9E377A, f103; +mul.f32 f142, f127, 0f3F4F1BBD; +sub.f32 f143, f141, f142; +sub.f32 f144, f106, f118; +mul.f32 f145, f144, 0f3F737871; +sub.f32 f146, f110, f114; +fma.rn.f32 f147, f146, 0f3F167918, f145; +mul.f32 f148, f125, 0f3F4F1BBD; +sub.f32 f149, f103, f148; +fma.rn.f32 f150, f127, 0f3E9E377A, f149; +mul.f32 f151, f144, 0f3F167918; +mul.f32 f152, f146, 0f3F737871; +sub.f32 f153, f151, f152; +add.f32 %1, f127, f126; +add.f32 %0, f124, f123; +add.f32 %3, f147, f143; +sub.f32 %2, f130, f134; +add.f32 %5, f153, f150; +sub.f32 %4, f137, f140; +sub.f32 %7, f150, f153; +add.f32 %6, f140, f137; +sub.f32 %9, f143, f147; +add.f32 %8, f134, f130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), 
"=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<359, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 100, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %14, %22; +add.f32 f22, %12, f21; +add.f32 f23, %17, %20; +add.f32 f24, f23, f22; +add.f32 f25, %16, %23; +add.f32 f26, %13, f25; +add.f32 f27, %19, %21; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %12; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %16, %23; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %19, %21; +fma.rn.f32 f35, f34, 0f3F167918, f33; +sub.f32 f36, f31, f35; +add.f32 f37, f35, f31; +mul.f32 f38, f21, 0f3F4F1BBD; +sub.f32 f39, %12, f38; +fma.rn.f32 f40, f23, 0f3E9E377A, f39; +mul.f32 f41, f32, 0f3F167918; +mul.f32 f42, f34, 0f3F737871; +sub.f32 f43, f41, f42; +sub.f32 f44, f40, f43; +add.f32 f45, f43, f40; +fma.rn.f32 f46, f25, 0f3E9E377A, %13; +mul.f32 f47, f27, 0f3F4F1BBD; +sub.f32 f48, f46, f47; +sub.f32 f49, %14, %22; +mul.f32 f50, f49, 0f3F737871; +sub.f32 f51, %17, %20; +fma.rn.f32 f52, f51, 0f3F167918, f50; +add.f32 f53, f52, f48; +sub.f32 f54, f48, f52; +mul.f32 f55, f25, 0f3F4F1BBD; +sub.f32 f56, %13, f55; +fma.rn.f32 f57, f27, 0f3E9E377A, f56; +mul.f32 f58, f49, 0f3F167918; +mul.f32 f59, f51, 0f3F737871; +sub.f32 f60, f58, f59; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f53, 
f64; +fma.rn.f32 f68, f63, f36, f67; +mul.f32 f69, f36, f64; +mul.f32 f70, f63, f53; +sub.f32 f71, f70, f69; +mul.f32 f72, f63, f63; +mul.f32 f73, f64, f64; +sub.f32 f74, f72, f73; +mul.f32 f75, f64, f63; +fma.rn.f32 f76, f64, f63, f75; +mul.f32 f77, f61, f76; +fma.rn.f32 f78, f74, f44, f77; +mul.f32 f79, f44, f76; +mul.f32 f80, f74, f61; +sub.f32 f81, f80, f79; +mul.f32 f82, f63, f74; +mul.f32 f83, f64, f76; +sub.f32 f84, f82, f83; +mul.f32 f85, f63, f76; +fma.rn.f32 f86, f64, f74, f85; +mul.f32 f87, f62, f86; +fma.rn.f32 f88, f84, f45, f87; +mul.f32 f89, f45, f86; +mul.f32 f90, f84, f62; +sub.f32 f91, f90, f89; +mul.f32 f92, f63, f84; +mul.f32 f93, f64, f86; +sub.f32 f94, f92, f93; +mul.f32 f95, f63, f86; +fma.rn.f32 f96, f64, f84, f95; +mul.f32 f97, f54, f96; +fma.rn.f32 f98, f94, f37, f97; +mul.f32 f99, f37, f96; +mul.f32 f100, f94, f54; +sub.f32 f101, f100, f99; +mad.lo.s32 r8, r5, 100, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f68; +st.shared.f32 [r9+8], f78; +st.shared.f32 [r9+12], f88; +st.shared.f32 [r9+16], f98; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f102, [r11]; +ld.shared.f32 f103, [r11+20]; +ld.shared.f32 f104, [r11+40]; +ld.shared.f32 f105, [r11+60]; +ld.shared.f32 f106, [r11+80]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +ld.shared.f32 f107, [r11]; +ld.shared.f32 f108, [r11+20]; +ld.shared.f32 f109, [r11+40]; +ld.shared.f32 f110, [r11+60]; +ld.shared.f32 f111, [r11+80]; +add.f32 f112, f103, f106; +add.f32 f113, f102, f112; +add.f32 f114, f104, f105; +add.f32 f115, f108, f111; +add.f32 f116, f107, f115; +add.f32 f117, f109, f110; +fma.rn.f32 f118, f112, 0f3E9E377A, f102; +mul.f32 f119, f114, 0f3F4F1BBD; +sub.f32 f120, f118, f119; +sub.f32 f121, f108, f111; +mul.f32 f122, f121, 0f3F737871; +sub.f32 f123, f109, f110; +fma.rn.f32 
f124, f123, 0f3F167918, f122; +mul.f32 f125, f112, 0f3F4F1BBD; +sub.f32 f126, f102, f125; +fma.rn.f32 f127, f114, 0f3E9E377A, f126; +mul.f32 f128, f121, 0f3F167918; +mul.f32 f129, f123, 0f3F737871; +sub.f32 f130, f128, f129; +fma.rn.f32 f131, f115, 0f3E9E377A, f107; +mul.f32 f132, f117, 0f3F4F1BBD; +sub.f32 f133, f131, f132; +sub.f32 f134, f103, f106; +mul.f32 f135, f134, 0f3F737871; +sub.f32 f136, f104, f105; +fma.rn.f32 f137, f136, 0f3F167918, f135; +mul.f32 f138, f115, 0f3F4F1BBD; +sub.f32 f139, f107, f138; +fma.rn.f32 f140, f117, 0f3E9E377A, f139; +mul.f32 f141, f134, 0f3F167918; +mul.f32 f142, f136, 0f3F737871; +sub.f32 f143, f141, f142; +add.f32 %0, f114, f113; +add.f32 %1, f117, f116; +add.f32 %3, f137, f133; +sub.f32 %2, f120, f124; +sub.f32 %4, f127, f130; +add.f32 %5, f143, f140; +add.f32 %6, f130, f127; +sub.f32 %7, f140, f143; +sub.f32 %9, f133, f137; +add.f32 %8, f124, f120; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..33a47eed62de9 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_fwd.hpp.inc @@ -0,0 +1,870 @@ +#ifndef CUFFTDX_FFT_25_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_25_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<533, double, 1>(cufftdx::detail::complex 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<621>; +.reg .b64 rd<2>; +add.f64 fd101, %63, %103; +add.f64 fd102, %50, fd101; +add.f64 fd103, %76, %90; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %65, %105; +add.f64 fd106, %51, fd105; +add.f64 fd107, %78, %91; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %50; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %65, %105; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %78, %91; +mul.f64 fd115, fd114, 0dBFE2CF2304755A5E; +sub.f64 fd116, fd115, fd113; +sub.f64 fd117, fd111, fd116; +add.f64 fd118, fd116, fd111; +mul.f64 fd119, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd120, %50, fd119; +fma.rn.f64 fd121, fd103, 0d3FD3C6EF372FE950, fd120; +mul.f64 fd122, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd123, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd123, fd122; +sub.f64 fd125, fd121, fd124; +add.f64 fd126, fd124, fd121; +fma.rn.f64 fd127, fd105, 0d3FD3C6EF372FE950, %51; +mul.f64 fd128, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, %63, %103; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, %76, %90; +mul.f64 fd133, fd132, 0dBFE2CF2304755A5E; +sub.f64 fd134, fd133, fd131; +add.f64 fd135, fd134, fd129; +sub.f64 fd136, fd129, fd134; +mul.f64 fd137, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %51, fd137; +fma.rn.f64 fd139, fd107, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %66, %106; +add.f64 fd146, %52, fd145; +add.f64 fd147, %79, %92; +add.f64 fd148, fd147, fd146; +add.f64 fd149, %67, %107; +add.f64 fd150, %54, fd149; +add.f64 fd151, %81, %94; +add.f64 fd152, fd151, fd150; +fma.rn.f64 fd153, fd145, 0d3FD3C6EF372FE950, %52; +mul.f64 fd154, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd153, fd154; +sub.f64 fd156, %67, %107; +mul.f64 fd157, 
fd156, 0d3FEE6F0E134454FF; +sub.f64 fd158, %81, %94; +mul.f64 fd159, fd158, 0dBFE2CF2304755A5E; +sub.f64 fd160, fd159, fd157; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +mul.f64 fd163, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd164, %52, fd163; +fma.rn.f64 fd165, fd147, 0d3FD3C6EF372FE950, fd164; +mul.f64 fd166, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd167, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd168, fd167, fd166; +sub.f64 fd169, fd165, fd168; +add.f64 fd170, fd168, fd165; +fma.rn.f64 fd171, fd149, 0d3FD3C6EF372FE950, %54; +mul.f64 fd172, fd151, 0d3FE9E3779B97F4A8; +sub.f64 fd173, fd171, fd172; +sub.f64 fd174, %66, %106; +mul.f64 fd175, fd174, 0d3FEE6F0E134454FF; +sub.f64 fd176, %79, %92; +mul.f64 fd177, fd176, 0dBFE2CF2304755A5E; +sub.f64 fd178, fd177, fd175; +add.f64 fd179, fd178, fd173; +sub.f64 fd180, fd173, fd178; +mul.f64 fd181, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd182, %54, fd181; +fma.rn.f64 fd183, fd151, 0d3FD3C6EF372FE950, fd182; +mul.f64 fd184, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd185, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd186, fd185, fd184; +add.f64 fd187, fd186, fd183; +sub.f64 fd188, fd183, fd186; +add.f64 fd189, %68, %108; +add.f64 fd190, %55, fd189; +add.f64 fd191, %82, %95; +add.f64 fd192, fd191, fd190; +add.f64 fd193, %70, %110; +add.f64 fd194, %57, fd193; +add.f64 fd195, %83, %97; +add.f64 fd196, fd195, fd194; +fma.rn.f64 fd197, fd189, 0d3FD3C6EF372FE950, %55; +mul.f64 fd198, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd199, fd197, fd198; +sub.f64 fd200, %70, %110; +mul.f64 fd201, fd200, 0d3FEE6F0E134454FF; +sub.f64 fd202, %83, %97; +mul.f64 fd203, fd202, 0dBFE2CF2304755A5E; +sub.f64 fd204, fd203, fd201; +sub.f64 fd205, fd199, fd204; +add.f64 fd206, fd204, fd199; +mul.f64 fd207, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd208, %55, fd207; +fma.rn.f64 fd209, fd191, 0d3FD3C6EF372FE950, fd208; +mul.f64 fd210, fd200, 0d3FE2CF2304755A5E; +mul.f64 fd211, fd202, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd211, fd210; +sub.f64 fd213, fd209, fd212; +add.f64 fd214, 
fd212, fd209; +fma.rn.f64 fd215, fd193, 0d3FD3C6EF372FE950, %57; +mul.f64 fd216, fd195, 0d3FE9E3779B97F4A8; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, %68, %108; +mul.f64 fd219, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd220, %82, %95; +mul.f64 fd221, fd220, 0dBFE2CF2304755A5E; +sub.f64 fd222, fd221, fd219; +add.f64 fd223, fd222, fd217; +sub.f64 fd224, fd217, fd222; +mul.f64 fd225, fd193, 0d3FE9E3779B97F4A8; +sub.f64 fd226, %57, fd225; +fma.rn.f64 fd227, fd195, 0d3FD3C6EF372FE950, fd226; +mul.f64 fd228, fd218, 0d3FE2CF2304755A5E; +mul.f64 fd229, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd229, fd228; +add.f64 fd231, fd230, fd227; +sub.f64 fd232, fd227, fd230; +add.f64 fd233, %71, %111; +add.f64 fd234, %58, fd233; +add.f64 fd235, %84, %98; +add.f64 fd236, fd235, fd234; +add.f64 fd237, %73, %113; +add.f64 fd238, %59, fd237; +add.f64 fd239, %86, %99; +add.f64 fd240, fd239, fd238; +fma.rn.f64 fd241, fd233, 0d3FD3C6EF372FE950, %58; +mul.f64 fd242, fd235, 0d3FE9E3779B97F4A8; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, %73, %113; +mul.f64 fd245, fd244, 0d3FEE6F0E134454FF; +sub.f64 fd246, %86, %99; +mul.f64 fd247, fd246, 0dBFE2CF2304755A5E; +sub.f64 fd248, fd247, fd245; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +mul.f64 fd251, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd252, %58, fd251; +fma.rn.f64 fd253, fd235, 0d3FD3C6EF372FE950, fd252; +mul.f64 fd254, fd244, 0d3FE2CF2304755A5E; +mul.f64 fd255, fd246, 0d3FEE6F0E134454FF; +sub.f64 fd256, fd255, fd254; +sub.f64 fd257, fd253, fd256; +add.f64 fd258, fd256, fd253; +fma.rn.f64 fd259, fd237, 0d3FD3C6EF372FE950, %59; +mul.f64 fd260, fd239, 0d3FE9E3779B97F4A8; +sub.f64 fd261, fd259, fd260; +sub.f64 fd262, %71, %111; +mul.f64 fd263, fd262, 0d3FEE6F0E134454FF; +sub.f64 fd264, %84, %98; +mul.f64 fd265, fd264, 0dBFE2CF2304755A5E; +sub.f64 fd266, fd265, fd263; +add.f64 fd267, fd266, fd261; +sub.f64 fd268, fd261, fd266; +mul.f64 fd269, fd237, 0d3FE9E3779B97F4A8; +sub.f64 fd270, %59, fd269; +fma.rn.f64 fd271, fd239, 
0d3FD3C6EF372FE950, fd270; +mul.f64 fd272, fd262, 0d3FE2CF2304755A5E; +mul.f64 fd273, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd274, fd273, fd272; +add.f64 fd275, fd274, fd271; +sub.f64 fd276, fd271, fd274; +add.f64 fd277, %74, %114; +add.f64 fd278, %60, fd277; +add.f64 fd279, %87, %100; +add.f64 fd280, fd279, fd278; +add.f64 fd281, %75, %115; +add.f64 fd282, %62, fd281; +add.f64 fd283, %89, %102; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, %60; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, %75, %115; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, %89, %102; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, %60, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, %62; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, %74, %114; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, %87, %100; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, %62, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +mul.f64 fd321, fd161, 0d3FEEFEA21D101EE0; +mul.f64 fd322, fd179, 0dBFCFD511FA1C0796; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd179, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd325, fd161, 0dBFCFD511FA1C0796, fd324; +mul.f64 fd326, fd205, 0d3FEC0AB44E81C059; +mul.f64 
fd327, fd223, 0dBFDED50D5CBFA951; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd223, 0d3FEC0AB44E81C059; +fma.rn.f64 fd330, fd205, 0dBFDED50D5CBFA951, fd329; +mul.f64 fd331, fd249, 0d3FE753B603D2B816; +mul.f64 fd332, fd267, 0dBFE5E7CF55112014; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd267, 0d3FE753B603D2B816; +fma.rn.f64 fd335, fd249, 0dBFE5E7CF55112014, fd334; +mul.f64 fd336, fd293, 0d3FE1257E3C182B51; +mul.f64 fd337, fd311, 0dBFEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd311, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd293, 0dBFEB04BBFF642E86, fd339; +mul.f64 fd341, fd169, 0d3FEC0AB44E81C059; +mul.f64 fd342, fd187, 0dBFDED50D5CBFA951; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd187, 0d3FEC0AB44E81C059; +fma.rn.f64 fd345, fd169, 0dBFDED50D5CBFA951, fd344; +mul.f64 fd346, fd213, 0d3FE1257E3C182B51; +mul.f64 fd347, fd231, 0dBFEB04BBFF642E86; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd231, 0d3FE1257E3C182B51; +fma.rn.f64 fd350, fd213, 0dBFEB04BBFF642E86, fd349; +mul.f64 fd351, fd257, 0d3FB0130A1BE09379; +mul.f64 fd352, fd275, 0dBFEFEFD5BFE443FE; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd275, 0d3FB0130A1BE09379; +fma.rn.f64 fd355, fd257, 0dBFEFEFD5BFE443FE, fd354; +mul.f64 fd356, fd301, 0dBFDB3FF7C925819C; +mul.f64 fd357, fd319, 0dBFECF457DCDC158C; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd319, 0dBFDB3FF7C925819C; +fma.rn.f64 fd360, fd301, 0dBFECF457DCDC158C, fd359; +mul.f64 fd361, fd170, 0d3FE753B603D2B816; +mul.f64 fd362, fd188, 0dBFE5E7CF55112014; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd188, 0d3FE753B603D2B816; +fma.rn.f64 fd365, fd170, 0dBFE5E7CF55112014, fd364; +mul.f64 fd366, fd214, 0d3FB0130A1BE09379; +mul.f64 fd367, fd232, 0dBFEFEFD5BFE443FE; +sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd232, 0d3FB0130A1BE09379; +fma.rn.f64 fd370, fd214, 0dBFEFEFD5BFE443FE, fd369; +mul.f64 fd371, fd258, 0dBFE465C6FEB501BC; +mul.f64 fd372, fd276, 0dBFE8A80B635B6BEA; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd276, 
0dBFE465C6FEB501BC; +fma.rn.f64 fd375, fd258, 0dBFE8A80B635B6BEA, fd374; +mul.f64 fd376, fd302, 0dBFEFBF675480D903; +mul.f64 fd377, fd320, 0dBFC00AEB5DA15BE0; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd320, 0dBFEFBF675480D903; +fma.rn.f64 fd380, fd302, 0dBFC00AEB5DA15BE0, fd379; +mul.f64 fd381, fd162, 0d3FE1257E3C182B51; +mul.f64 fd382, fd180, 0dBFEB04BBFF642E86; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd180, 0d3FE1257E3C182B51; +fma.rn.f64 fd385, fd162, 0dBFEB04BBFF642E86, fd384; +mul.f64 fd386, fd206, 0dBFDB3FF7C925819C; +mul.f64 fd387, fd224, 0dBFECF457DCDC158C; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd224, 0dBFDB3FF7C925819C; +fma.rn.f64 fd390, fd206, 0dBFECF457DCDC158C, fd389; +mul.f64 fd391, fd250, 0dBFEFBF675480D903; +mul.f64 fd392, fd268, 0dBFC00AEB5DA15BE0; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd268, 0dBFEFBF675480D903; +fma.rn.f64 fd395, fd250, 0dBFC00AEB5DA15BE0, fd394; +mul.f64 fd396, fd294, 0dBFE465C6FEB501BC; +mul.f64 fd397, fd312, 0d3FE8A80B635B6BEA; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd312, 0dBFE465C6FEB501BC; +fma.rn.f64 fd400, fd294, 0d3FE8A80B635B6BEA, fd399; +add.f64 fd401, fd148, fd280; +add.f64 fd402, fd104, fd401; +add.f64 fd403, fd192, fd236; +add.f64 fd404, fd152, fd284; +add.f64 fd405, fd108, fd404; +add.f64 fd406, fd196, fd240; +fma.rn.f64 fd407, fd401, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd408, fd403, 0d3FE9E3779B97F4A8; +sub.f64 fd409, fd407, fd408; +sub.f64 fd410, fd152, fd284; +mul.f64 fd411, fd410, 0d3FEE6F0E134454FF; +sub.f64 fd412, fd196, fd240; +mul.f64 fd413, fd412, 0dBFE2CF2304755A5E; +sub.f64 fd414, fd413, fd411; +mul.f64 fd415, fd401, 0d3FE9E3779B97F4A8; +sub.f64 fd416, fd104, fd415; +fma.rn.f64 fd417, fd403, 0d3FD3C6EF372FE950, fd416; +mul.f64 fd418, fd410, 0d3FE2CF2304755A5E; +mul.f64 fd419, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd420, fd419, fd418; +fma.rn.f64 fd421, fd404, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd422, fd406, 0d3FE9E3779B97F4A8; +sub.f64 fd423, fd421, fd422; +sub.f64 fd424, 
fd148, fd280; +mul.f64 fd425, fd424, 0d3FEE6F0E134454FF; +sub.f64 fd426, fd192, fd236; +mul.f64 fd427, fd426, 0dBFE2CF2304755A5E; +sub.f64 fd428, fd427, fd425; +mul.f64 fd429, fd404, 0d3FE9E3779B97F4A8; +sub.f64 fd430, fd108, fd429; +fma.rn.f64 fd431, fd406, 0d3FD3C6EF372FE950, fd430; +mul.f64 fd432, fd424, 0d3FE2CF2304755A5E; +mul.f64 fd433, fd426, 0d3FEE6F0E134454FF; +sub.f64 fd434, fd433, fd432; +add.f64 fd435, fd323, fd338; +add.f64 fd436, fd117, fd435; +add.f64 fd437, fd328, fd333; +add.f64 fd438, fd325, fd340; +add.f64 fd439, fd135, fd438; +add.f64 fd440, fd330, fd335; +fma.rn.f64 fd441, fd435, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd442, fd437, 0d3FE9E3779B97F4A8; +sub.f64 fd443, fd441, fd442; +sub.f64 fd444, fd325, fd340; +mul.f64 fd445, fd444, 0d3FEE6F0E134454FF; +sub.f64 fd446, fd330, fd335; +mul.f64 fd447, fd446, 0dBFE2CF2304755A5E; +sub.f64 fd448, fd447, fd445; +mul.f64 fd449, fd435, 0d3FE9E3779B97F4A8; +sub.f64 fd450, fd117, fd449; +fma.rn.f64 fd451, fd437, 0d3FD3C6EF372FE950, fd450; +mul.f64 fd452, fd444, 0d3FE2CF2304755A5E; +mul.f64 fd453, fd446, 0d3FEE6F0E134454FF; +sub.f64 fd454, fd453, fd452; +fma.rn.f64 fd455, fd438, 0d3FD3C6EF372FE950, fd135; +mul.f64 fd456, fd440, 0d3FE9E3779B97F4A8; +sub.f64 fd457, fd455, fd456; +sub.f64 fd458, fd323, fd338; +mul.f64 fd459, fd458, 0d3FEE6F0E134454FF; +sub.f64 fd460, fd328, fd333; +mul.f64 fd461, fd460, 0dBFE2CF2304755A5E; +sub.f64 fd462, fd461, fd459; +mul.f64 fd463, fd438, 0d3FE9E3779B97F4A8; +sub.f64 fd464, fd135, fd463; +fma.rn.f64 fd465, fd440, 0d3FD3C6EF372FE950, fd464; +mul.f64 fd466, fd458, 0d3FE2CF2304755A5E; +mul.f64 fd467, fd460, 0d3FEE6F0E134454FF; +sub.f64 fd468, fd467, fd466; +add.f64 fd469, fd343, fd358; +add.f64 fd470, fd125, fd469; +add.f64 fd471, fd348, fd353; +add.f64 fd472, fd345, fd360; +add.f64 fd473, fd143, fd472; +add.f64 fd474, fd350, fd355; +fma.rn.f64 fd475, fd469, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd476, fd471, 0d3FE9E3779B97F4A8; +sub.f64 fd477, fd475, fd476; +sub.f64 fd478, fd345, 
fd360; +mul.f64 fd479, fd478, 0d3FEE6F0E134454FF; +sub.f64 fd480, fd350, fd355; +mul.f64 fd481, fd480, 0dBFE2CF2304755A5E; +sub.f64 fd482, fd481, fd479; +mul.f64 fd483, fd469, 0d3FE9E3779B97F4A8; +sub.f64 fd484, fd125, fd483; +fma.rn.f64 fd485, fd471, 0d3FD3C6EF372FE950, fd484; +mul.f64 fd486, fd478, 0d3FE2CF2304755A5E; +mul.f64 fd487, fd480, 0d3FEE6F0E134454FF; +sub.f64 fd488, fd487, fd486; +fma.rn.f64 fd489, fd472, 0d3FD3C6EF372FE950, fd143; +mul.f64 fd490, fd474, 0d3FE9E3779B97F4A8; +sub.f64 fd491, fd489, fd490; +sub.f64 fd492, fd343, fd358; +mul.f64 fd493, fd492, 0d3FEE6F0E134454FF; +sub.f64 fd494, fd348, fd353; +mul.f64 fd495, fd494, 0dBFE2CF2304755A5E; +sub.f64 fd496, fd495, fd493; +mul.f64 fd497, fd472, 0d3FE9E3779B97F4A8; +sub.f64 fd498, fd143, fd497; +fma.rn.f64 fd499, fd474, 0d3FD3C6EF372FE950, fd498; +mul.f64 fd500, fd492, 0d3FE2CF2304755A5E; +mul.f64 fd501, fd494, 0d3FEE6F0E134454FF; +sub.f64 fd502, fd501, fd500; +add.f64 fd503, fd363, fd378; +add.f64 fd504, fd126, fd503; +add.f64 fd505, fd368, fd373; +add.f64 fd506, fd365, fd380; +add.f64 fd507, fd144, fd506; +add.f64 fd508, fd370, fd375; +fma.rn.f64 fd509, fd503, 0d3FD3C6EF372FE950, fd126; +mul.f64 fd510, fd505, 0d3FE9E3779B97F4A8; +sub.f64 fd511, fd509, fd510; +sub.f64 fd512, fd365, fd380; +mul.f64 fd513, fd512, 0d3FEE6F0E134454FF; +sub.f64 fd514, fd370, fd375; +mul.f64 fd515, fd514, 0dBFE2CF2304755A5E; +sub.f64 fd516, fd515, fd513; +mul.f64 fd517, fd503, 0d3FE9E3779B97F4A8; +sub.f64 fd518, fd126, fd517; +fma.rn.f64 fd519, fd505, 0d3FD3C6EF372FE950, fd518; +mul.f64 fd520, fd512, 0d3FE2CF2304755A5E; +mul.f64 fd521, fd514, 0d3FEE6F0E134454FF; +sub.f64 fd522, fd521, fd520; +fma.rn.f64 fd523, fd506, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd524, fd508, 0d3FE9E3779B97F4A8; +sub.f64 fd525, fd523, fd524; +sub.f64 fd526, fd363, fd378; +mul.f64 fd527, fd526, 0d3FEE6F0E134454FF; +sub.f64 fd528, fd368, fd373; +mul.f64 fd529, fd528, 0dBFE2CF2304755A5E; +sub.f64 fd530, fd529, fd527; +mul.f64 fd531, fd506, 
0d3FE9E3779B97F4A8; +sub.f64 fd532, fd144, fd531; +fma.rn.f64 fd533, fd508, 0d3FD3C6EF372FE950, fd532; +mul.f64 fd534, fd526, 0d3FE2CF2304755A5E; +mul.f64 fd535, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd536, fd535, fd534; +add.f64 fd537, fd383, fd398; +add.f64 fd538, fd118, fd537; +add.f64 fd539, fd388, fd393; +add.f64 fd540, fd385, fd400; +add.f64 fd541, fd136, fd540; +add.f64 fd542, fd390, fd395; +fma.rn.f64 fd543, fd537, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd544, fd539, 0d3FE9E3779B97F4A8; +sub.f64 fd545, fd543, fd544; +sub.f64 fd546, fd385, fd400; +mul.f64 fd547, fd546, 0d3FEE6F0E134454FF; +sub.f64 fd548, fd390, fd395; +mul.f64 fd549, fd548, 0dBFE2CF2304755A5E; +sub.f64 fd550, fd549, fd547; +mul.f64 fd551, fd537, 0d3FE9E3779B97F4A8; +sub.f64 fd552, fd118, fd551; +fma.rn.f64 fd553, fd539, 0d3FD3C6EF372FE950, fd552; +mul.f64 fd554, fd546, 0d3FE2CF2304755A5E; +mul.f64 fd555, fd548, 0d3FEE6F0E134454FF; +sub.f64 fd556, fd555, fd554; +fma.rn.f64 fd557, fd540, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd558, fd542, 0d3FE9E3779B97F4A8; +sub.f64 fd559, fd557, fd558; +sub.f64 fd560, fd383, fd398; +mul.f64 fd561, fd560, 0d3FEE6F0E134454FF; +sub.f64 fd562, fd388, fd393; +mul.f64 fd563, fd562, 0dBFE2CF2304755A5E; +sub.f64 fd564, fd563, fd561; +mul.f64 fd565, fd540, 0d3FE9E3779B97F4A8; +sub.f64 fd566, fd136, fd565; +fma.rn.f64 fd567, fd542, 0d3FD3C6EF372FE950, fd566; +mul.f64 fd568, fd560, 0d3FE2CF2304755A5E; +mul.f64 fd569, fd562, 0d3FEE6F0E134454FF; +sub.f64 fd570, fd569, fd568; +add.f64 %1, fd406, fd405; +add.f64 %0, fd403, fd402; +add.f64 %3, fd440, fd439; +add.f64 %2, fd437, fd436; +add.f64 %5, fd474, fd473; +add.f64 %4, fd471, fd470; +add.f64 %7, fd508, fd507; +add.f64 %6, fd505, fd504; +add.f64 %9, fd542, fd541; +add.f64 %8, fd539, fd538; +add.f64 %11, fd428, fd423; +sub.f64 %10, fd409, fd414; +add.f64 %13, fd462, fd457; +sub.f64 %12, fd443, fd448; +add.f64 %15, fd496, fd491; +sub.f64 %14, fd477, fd482; +add.f64 %17, fd530, fd525; +sub.f64 %16, fd511, fd516; +add.f64 %19, 
fd564, fd559; +sub.f64 %18, fd545, fd550; +add.f64 %21, fd434, fd431; +sub.f64 %20, fd417, fd420; +add.f64 %23, fd468, fd465; +sub.f64 %22, fd451, fd454; +add.f64 %25, fd502, fd499; +sub.f64 %24, fd485, fd488; +add.f64 %27, fd536, fd533; +sub.f64 %26, fd519, fd522; +add.f64 %29, fd570, fd567; +sub.f64 %28, fd553, fd556; +sub.f64 %31, fd431, fd434; +add.f64 %30, fd420, fd417; +sub.f64 %33, fd465, fd468; +add.f64 %32, fd454, fd451; +sub.f64 %35, fd499, fd502; +add.f64 %34, fd488, fd485; +sub.f64 %37, fd533, fd536; +add.f64 %36, fd522, fd519; +sub.f64 %39, fd567, fd570; +add.f64 %38, fd556, fd553; +sub.f64 %41, fd423, fd428; +add.f64 %40, fd414, fd409; +sub.f64 %43, fd457, fd462; +add.f64 %42, fd448, fd443; +sub.f64 %45, fd491, fd496; +add.f64 %44, fd482, fd477; +sub.f64 %47, fd525, fd530; +add.f64 %46, fd516, fd511; +sub.f64 %49, fd559, fd564; +add.f64 %48, fd550, fd545; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), 
"d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + + +/* cufftdx_private_function<534, double, 1>: inline-PTX fragment that transforms the five complex doubles held in rmem[0..4] in place. Operand map: %0-%9 are the outputs rmem[0..4].x/.y; %10 is smem, used as the base address for st.shared/ld.shared; %11 is the address of lut_dp_5_25; %12-%23 are the inputs rmem[0..4].x/.y, where rmem[1].y and rmem[2].y are each listed twice (the PTX never reads %15 or %18). Flow: (1) a 5-point butterfly on the register values, using constants of magnitude ~0.30902, ~0.80902, ~0.95106 and ~0.58779, i.e. cos/sin of 72 and 36 degrees; (2) with q = tid.x / 5 and m = tid.x % 5 (the mul.wide by 0xCCCCCCCD followed by a shift of 34 is the divide-by-5), two complex values w and v are loaded from lut_dp_5_25 at byte offsets 16*m and 16*m + 80, and four of the five butterfly results are complex-multiplied by w, w*w, v and w*v - presumably the inter-stage twiddle factors, TODO confirm against the table definition; (3) the five results are stored as 16-byte complex values starting at smem + 400*tid.y + 400*q + 80*m, bracketed by two barrier.sync 0 instructions, and re-read from smem + 400*tid.y + 400*q + 16*m with an 80-byte stride; (4) a second 5-point butterfly produces the outputs. NOTE(review): this looks like a 25-point transform computed cooperatively by groups of five threads (400 bytes = 25 complex doubles per group); the transform direction is not evident from this block alone - confirm. Because barrier.sync 0 is executed unconditionally, callers presumably must reach this code with all participating threads - verify against the PTX barrier rules. */ +template<> __forceinline__ __device__ void cufftdx_private_function<534, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<167>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 400, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %14, %22; +add.f64 fd22, %12, fd21; +add.f64 fd23, %17, %20; +add.f64 fd24, %16, %23; +add.f64 fd25, %13, fd24; +add.f64 fd26, %19, %21; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %12; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %16, %23; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %19, %21; +mul.f64 fd33, fd32, 0dBFE2CF2304755A5E; +sub.f64 fd34, fd33, fd31; +sub.f64 fd35, fd29, fd34; +add.f64 fd36, fd34, fd29; +mul.f64 fd37, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd38, %12, fd37; +fma.rn.f64 fd39, fd23, 0d3FD3C6EF372FE950, fd38; +mul.f64 fd40, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd41, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd42, fd41, fd40; +sub.f64 fd43, fd39, fd42; +add.f64 fd44, fd42, fd39; +fma.rn.f64 fd45, fd24, 0d3FD3C6EF372FE950, %13; 
+mul.f64 fd46, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd47, fd45, fd46; +sub.f64 fd48, %14, %22; +mul.f64 fd49, fd48, 0d3FEE6F0E134454FF; +sub.f64 fd50, %17, %20; +mul.f64 fd51, fd50, 0dBFE2CF2304755A5E; +sub.f64 fd52, fd51, fd49; +add.f64 fd53, fd52, fd47; +sub.f64 fd54, fd47, fd52; +mul.f64 fd55, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %13, fd55; +fma.rn.f64 fd57, fd26, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd48, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd59, fd58; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 400, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd63, fd35; +mul.f64 fd68, fd64, fd53; +mul.f64 fd69, fd63, fd53; +mul.f64 fd70, fd63, fd63; +mul.f64 fd71, fd64, fd64; +sub.f64 fd72, fd70, fd71; +mul.f64 fd73, fd64, fd63; +fma.rn.f64 fd74, fd64, fd63, fd73; +mul.f64 fd75, fd72, fd43; +mul.f64 fd76, fd74, fd61; +mul.f64 fd77, fd72, fd61; +ld.global.v2.f64 {fd78, fd79}, [rd6+80]; +mul.f64 fd82, fd78, fd44; +mul.f64 fd83, fd79, fd62; +mul.f64 fd84, fd78, fd62; +mul.f64 fd85, fd63, fd78; +mul.f64 fd86, fd64, fd79; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd63, fd79; +fma.rn.f64 fd89, fd64, fd78, fd88; +mul.f64 fd90, fd87, fd36; +mul.f64 fd91, fd89, fd54; +mul.f64 fd92, fd87, fd54; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd93, fd26, fd25; +add.f64 fd94, fd23, fd22; +st.shared.v2.f64 [r9], {fd94, fd93}; +fma.rn.f64 fd95, fd64, fd35, fd69; +sub.f64 fd96, fd67, fd68; +st.shared.v2.f64 [r9+16], {fd96, fd95}; +fma.rn.f64 fd97, fd74, fd43, fd77; +sub.f64 fd98, fd75, fd76; +st.shared.v2.f64 [r9+32], {fd98, fd97}; +fma.rn.f64 fd99, fd79, fd44, fd84; +sub.f64 fd100, fd82, fd83; +st.shared.v2.f64 [r9+48], {fd100, fd99}; +fma.rn.f64 fd101, fd89, fd36, fd92; +sub.f64 fd102, fd90, fd91; 
+st.shared.v2.f64 [r9+64], {fd102, fd101}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd103, fd104}, [r11]; +ld.shared.v2.f64 {fd107, fd108}, [r11+80]; +ld.shared.v2.f64 {fd111, fd112}, [r11+160]; +ld.shared.v2.f64 {fd115, fd116}, [r11+240]; +ld.shared.v2.f64 {fd119, fd120}, [r11+320]; +add.f64 fd123, fd107, fd119; +add.f64 fd124, fd103, fd123; +add.f64 fd125, fd111, fd115; +add.f64 fd126, fd108, fd120; +add.f64 fd127, fd104, fd126; +add.f64 fd128, fd112, fd116; +fma.rn.f64 fd129, fd123, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd130, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, fd108, fd120; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, fd112, fd116; +mul.f64 fd135, fd134, 0dBFE2CF2304755A5E; +sub.f64 fd136, fd135, fd133; +mul.f64 fd137, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd138, fd103, fd137; +fma.rn.f64 fd139, fd125, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +fma.rn.f64 fd143, fd126, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd144, fd128, 0d3FE9E3779B97F4A8; +sub.f64 fd145, fd143, fd144; +sub.f64 fd146, fd107, fd119; +mul.f64 fd147, fd146, 0d3FEE6F0E134454FF; +sub.f64 fd148, fd111, fd115; +mul.f64 fd149, fd148, 0dBFE2CF2304755A5E; +sub.f64 fd150, fd149, fd147; +mul.f64 fd151, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd152, fd104, fd151; +fma.rn.f64 fd153, fd128, 0d3FD3C6EF372FE950, fd152; +mul.f64 fd154, fd146, 0d3FE2CF2304755A5E; +mul.f64 fd155, fd148, 0d3FEE6F0E134454FF; +sub.f64 fd156, fd155, fd154; +add.f64 %1, fd128, fd127; +add.f64 %0, fd125, fd124; +add.f64 %3, fd150, fd145; +sub.f64 %2, fd131, fd136; +add.f64 %5, fd156, fd153; +sub.f64 %4, fd139, fd142; +sub.f64 %7, fd153, fd156; +add.f64 %6, fd142, fd139; +sub.f64 %9, fd145, fd150; +add.f64 %8, fd136, fd131; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), 
"=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + +/* cufftdx_private_function<535, double, 1>: inline-PTX fragment that transforms the five complex doubles held in rmem[0..4] in place. Operand map: %0-%9 are the outputs rmem[0..4].x/.y; %10 is smem, used as the base address for st.shared/ld.shared; %11 is the address of lut_dp_5_25; %12-%23 are the inputs rmem[0..4].x/.y, where rmem[1].y and rmem[2].y are each listed twice (the PTX never reads %15 or %18). The arithmetic follows the same pattern as specialization 534 above: a 5-point butterfly on the registers (constants of magnitude ~0.30902, ~0.80902, ~0.95106, ~0.58779, i.e. cos/sin of 72 and 36 degrees), complex multiplies by w, w*w, v and w*v built from two lut_dp_5_25 entries at byte offsets 16*m and 16*m + 80 with m = tid.x % 5 (presumably twiddle factors - TODO confirm), a shared-memory exchange, then a second 5-point butterfly. The difference is the exchange: it moves scalar doubles in two passes. First the five real parts are stored at smem + 200*tid.y + 200*(tid.x / 5) + 40*m and re-read from smem + 200*tid.y + 200*(tid.x / 5) + 8*m with a 40-byte stride; then the five imaginary parts are written to and read from the same addresses. This uses four barrier.sync 0 instructions instead of two, and 200 rather than 400 bytes of shared memory per group of five threads. NOTE(review): presumably a reduced-shared-memory variant of the same 25-point transform; which variant is selected, and the transform direction, are not visible here - confirm. */ +template<> __forceinline__ __device__ void cufftdx_private_function<535, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<157>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 200, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %14, %22; +add.f64 fd22, %12, fd21; +add.f64 fd23, %17, %20; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %16, %23; +add.f64 fd26, %13, fd25; +add.f64 fd27, %19, %21; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %12; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %16, %23; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %19, %21; +mul.f64 fd35, fd34, 0dBFE2CF2304755A5E; +sub.f64 fd36, fd35, fd33; +sub.f64 fd37, fd31, fd36; +add.f64 fd38, fd36, fd31; +mul.f64 fd39, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd40, %12, fd39; +fma.rn.f64 fd41, fd23, 0d3FD3C6EF372FE950, fd40; +mul.f64 fd42, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd43, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd44, fd43, fd42; +sub.f64 fd45, fd41, fd44; +add.f64 fd46, fd44, fd41; +fma.rn.f64 fd47, fd25, 0d3FD3C6EF372FE950, %13; +mul.f64 fd48, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd49, fd47, fd48; +sub.f64 fd50, %14, %22; +mul.f64 fd51, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd52, %17, %20; +mul.f64 fd53, fd52, 0dBFE2CF2304755A5E; +sub.f64 fd54, fd53, fd51; +add.f64 fd55, fd54, fd49; +sub.f64 fd56, fd49, fd54; +mul.f64 fd57, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd58, %13, fd57; +fma.rn.f64 fd59, fd27, 0d3FD3C6EF372FE950, fd58; +mul.f64 fd60, fd50, 0d3FE2CF2304755A5E; +mul.f64 fd61, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd62, fd61, fd60; +add.f64 fd63, fd62, fd59; +sub.f64 fd64, 
fd59, fd62; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd65, fd66}, [rd6]; +mul.f64 fd69, fd65, fd37; +mul.f64 fd70, fd66, fd55; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd65, fd55; +fma.rn.f64 fd73, fd66, fd37, fd72; +mul.f64 fd74, fd65, fd65; +mul.f64 fd75, fd66, fd66; +sub.f64 fd76, fd74, fd75; +mul.f64 fd77, fd66, fd65; +fma.rn.f64 fd78, fd66, fd65, fd77; +mul.f64 fd79, fd76, fd45; +mul.f64 fd80, fd78, fd63; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd76, fd63; +fma.rn.f64 fd83, fd78, fd45, fd82; +ld.global.v2.f64 {fd84, fd85}, [rd6+80]; +mul.f64 fd88, fd84, fd46; +mul.f64 fd89, fd85, fd64; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd84, fd64; +fma.rn.f64 fd92, fd85, fd46, fd91; +mul.f64 fd93, fd65, fd84; +mul.f64 fd94, fd66, fd85; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd65, fd85; +fma.rn.f64 fd97, fd66, fd84, fd96; +mul.f64 fd98, fd95, fd38; +mul.f64 fd99, fd97, fd56; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd95, fd56; +fma.rn.f64 fd102, fd97, fd38, fd101; +mad.lo.s32 r8, r5, 200, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd103, [r11]; +ld.shared.f64 fd104, [r11+40]; +ld.shared.f64 fd105, [r11+80]; +ld.shared.f64 fd106, [r11+120]; +ld.shared.f64 fd107, [r11+160]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd73; +st.shared.f64 [r9+16], fd83; +st.shared.f64 [r9+24], fd92; +st.shared.f64 [r9+32], fd102; +barrier.sync 0; +ld.shared.f64 fd108, [r11]; +ld.shared.f64 fd109, [r11+40]; +ld.shared.f64 fd110, [r11+80]; +ld.shared.f64 fd111, [r11+120]; +ld.shared.f64 fd112, [r11+160]; +add.f64 fd113, fd104, fd107; +add.f64 fd114, fd103, fd113; 
+add.f64 fd115, fd105, fd106; +add.f64 fd116, fd109, fd112; +add.f64 fd117, fd108, fd116; +add.f64 fd118, fd110, fd111; +fma.rn.f64 fd119, fd113, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd120, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd121, fd119, fd120; +sub.f64 fd122, fd109, fd112; +mul.f64 fd123, fd122, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd110, fd111; +mul.f64 fd125, fd124, 0dBFE2CF2304755A5E; +sub.f64 fd126, fd125, fd123; +mul.f64 fd127, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd128, fd103, fd127; +fma.rn.f64 fd129, fd115, 0d3FD3C6EF372FE950, fd128; +mul.f64 fd130, fd122, 0d3FE2CF2304755A5E; +mul.f64 fd131, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd131, fd130; +fma.rn.f64 fd133, fd116, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd134, fd118, 0d3FE9E3779B97F4A8; +sub.f64 fd135, fd133, fd134; +sub.f64 fd136, fd104, fd107; +mul.f64 fd137, fd136, 0d3FEE6F0E134454FF; +sub.f64 fd138, fd105, fd106; +mul.f64 fd139, fd138, 0dBFE2CF2304755A5E; +sub.f64 fd140, fd139, fd137; +mul.f64 fd141, fd116, 0d3FE9E3779B97F4A8; +sub.f64 fd142, fd108, fd141; +fma.rn.f64 fd143, fd118, 0d3FD3C6EF372FE950, fd142; +mul.f64 fd144, fd136, 0d3FE2CF2304755A5E; +mul.f64 fd145, fd138, 0d3FEE6F0E134454FF; +sub.f64 fd146, fd145, fd144; +add.f64 %0, fd115, fd114; +add.f64 %1, fd118, fd117; +add.f64 %3, fd140, fd135; +sub.f64 %2, fd121, fd126; +sub.f64 %4, fd129, fd132; +add.f64 %5, fd146, fd143; +add.f64 %6, fd132, fd129; +sub.f64 %7, fd143, fd146; +sub.f64 %9, fd135, fd140; +add.f64 %8, fd126, fd121; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9c99070a1cb3c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_25_fp64_inv.hpp.inc @@ -0,0 +1,842 @@ +#ifndef CUFFTDX_FFT_25_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_25_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<704, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<601>; +.reg .b64 rd<2>; +add.f64 fd101, %63, %103; +add.f64 fd102, %50, fd101; +add.f64 fd103, %76, %90; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %65, %105; +add.f64 fd106, %51, fd105; +add.f64 fd107, %78, %91; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %50; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %65, %105; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %78, %91; +fma.rn.f64 fd115, fd114, 0d3FE2CF2304755A5E, fd113; +sub.f64 fd116, fd111, fd115; +add.f64 fd117, fd115, fd111; +mul.f64 fd118, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd119, %50, fd118; +fma.rn.f64 fd120, fd103, 0d3FD3C6EF372FE950, fd119; +mul.f64 fd121, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd122, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd120, fd123; +add.f64 fd125, fd123, fd120; +fma.rn.f64 fd126, fd105, 0d3FD3C6EF372FE950, %51; +mul.f64 fd127, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd128, fd126, fd127; +sub.f64 fd129, %63, %103; +mul.f64 fd130, fd129, 0d3FEE6F0E134454FF; +sub.f64 fd131, %76, %90; +fma.rn.f64 fd132, fd131, 0d3FE2CF2304755A5E, fd130; +add.f64 fd133, fd132, fd128; +sub.f64 fd134, fd128, fd132; +mul.f64 fd135, fd105, 0d3FE9E3779B97F4A8; +sub.f64 
fd136, %51, fd135; +fma.rn.f64 fd137, fd107, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd138, fd129, 0d3FE2CF2304755A5E; +mul.f64 fd139, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd140, fd138, fd139; +add.f64 fd141, fd140, fd137; +sub.f64 fd142, fd137, fd140; +add.f64 fd143, %66, %106; +add.f64 fd144, %52, fd143; +add.f64 fd145, %79, %92; +add.f64 fd146, fd145, fd144; +add.f64 fd147, %67, %107; +add.f64 fd148, %54, fd147; +add.f64 fd149, %81, %94; +add.f64 fd150, fd149, fd148; +fma.rn.f64 fd151, fd143, 0d3FD3C6EF372FE950, %52; +mul.f64 fd152, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd151, fd152; +sub.f64 fd154, %67, %107; +mul.f64 fd155, fd154, 0d3FEE6F0E134454FF; +sub.f64 fd156, %81, %94; +fma.rn.f64 fd157, fd156, 0d3FE2CF2304755A5E, fd155; +sub.f64 fd158, fd153, fd157; +add.f64 fd159, fd157, fd153; +mul.f64 fd160, fd143, 0d3FE9E3779B97F4A8; +sub.f64 fd161, %52, fd160; +fma.rn.f64 fd162, fd145, 0d3FD3C6EF372FE950, fd161; +mul.f64 fd163, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd164, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, fd162, fd165; +add.f64 fd167, fd165, fd162; +fma.rn.f64 fd168, fd147, 0d3FD3C6EF372FE950, %54; +mul.f64 fd169, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd170, fd168, fd169; +sub.f64 fd171, %66, %106; +mul.f64 fd172, fd171, 0d3FEE6F0E134454FF; +sub.f64 fd173, %79, %92; +fma.rn.f64 fd174, fd173, 0d3FE2CF2304755A5E, fd172; +add.f64 fd175, fd174, fd170; +sub.f64 fd176, fd170, fd174; +mul.f64 fd177, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd178, %54, fd177; +fma.rn.f64 fd179, fd149, 0d3FD3C6EF372FE950, fd178; +mul.f64 fd180, fd171, 0d3FE2CF2304755A5E; +mul.f64 fd181, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd182, fd180, fd181; +add.f64 fd183, fd182, fd179; +sub.f64 fd184, fd179, fd182; +add.f64 fd185, %68, %108; +add.f64 fd186, %55, fd185; +add.f64 fd187, %82, %95; +add.f64 fd188, fd187, fd186; +add.f64 fd189, %70, %110; +add.f64 fd190, %57, fd189; +add.f64 fd191, %83, %97; +add.f64 fd192, fd191, fd190; +fma.rn.f64 fd193, fd185, 
0d3FD3C6EF372FE950, %55; +mul.f64 fd194, fd187, 0d3FE9E3779B97F4A8; +sub.f64 fd195, fd193, fd194; +sub.f64 fd196, %70, %110; +mul.f64 fd197, fd196, 0d3FEE6F0E134454FF; +sub.f64 fd198, %83, %97; +fma.rn.f64 fd199, fd198, 0d3FE2CF2304755A5E, fd197; +sub.f64 fd200, fd195, fd199; +add.f64 fd201, fd199, fd195; +mul.f64 fd202, fd185, 0d3FE9E3779B97F4A8; +sub.f64 fd203, %55, fd202; +fma.rn.f64 fd204, fd187, 0d3FD3C6EF372FE950, fd203; +mul.f64 fd205, fd196, 0d3FE2CF2304755A5E; +mul.f64 fd206, fd198, 0d3FEE6F0E134454FF; +sub.f64 fd207, fd205, fd206; +sub.f64 fd208, fd204, fd207; +add.f64 fd209, fd207, fd204; +fma.rn.f64 fd210, fd189, 0d3FD3C6EF372FE950, %57; +mul.f64 fd211, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd212, fd210, fd211; +sub.f64 fd213, %68, %108; +mul.f64 fd214, fd213, 0d3FEE6F0E134454FF; +sub.f64 fd215, %82, %95; +fma.rn.f64 fd216, fd215, 0d3FE2CF2304755A5E, fd214; +add.f64 fd217, fd216, fd212; +sub.f64 fd218, fd212, fd216; +mul.f64 fd219, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd220, %57, fd219; +fma.rn.f64 fd221, fd191, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd213, 0d3FE2CF2304755A5E; +mul.f64 fd223, fd215, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd224, fd221; +sub.f64 fd226, fd221, fd224; +add.f64 fd227, %71, %111; +add.f64 fd228, %58, fd227; +add.f64 fd229, %84, %98; +add.f64 fd230, fd229, fd228; +add.f64 fd231, %73, %113; +add.f64 fd232, %59, fd231; +add.f64 fd233, %86, %99; +add.f64 fd234, fd233, fd232; +fma.rn.f64 fd235, fd227, 0d3FD3C6EF372FE950, %58; +mul.f64 fd236, fd229, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, %73, %113; +mul.f64 fd239, fd238, 0d3FEE6F0E134454FF; +sub.f64 fd240, %86, %99; +fma.rn.f64 fd241, fd240, 0d3FE2CF2304755A5E, fd239; +sub.f64 fd242, fd237, fd241; +add.f64 fd243, fd241, fd237; +mul.f64 fd244, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd245, %58, fd244; +fma.rn.f64 fd246, fd229, 0d3FD3C6EF372FE950, fd245; +mul.f64 fd247, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd248, fd240, 
0d3FEE6F0E134454FF; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd246, fd249; +add.f64 fd251, fd249, fd246; +fma.rn.f64 fd252, fd231, 0d3FD3C6EF372FE950, %59; +mul.f64 fd253, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd252, fd253; +sub.f64 fd255, %71, %111; +mul.f64 fd256, fd255, 0d3FEE6F0E134454FF; +sub.f64 fd257, %84, %98; +fma.rn.f64 fd258, fd257, 0d3FE2CF2304755A5E, fd256; +add.f64 fd259, fd258, fd254; +sub.f64 fd260, fd254, fd258; +mul.f64 fd261, fd231, 0d3FE9E3779B97F4A8; +sub.f64 fd262, %59, fd261; +fma.rn.f64 fd263, fd233, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd264, fd255, 0d3FE2CF2304755A5E; +mul.f64 fd265, fd257, 0d3FEE6F0E134454FF; +sub.f64 fd266, fd264, fd265; +add.f64 fd267, fd266, fd263; +sub.f64 fd268, fd263, fd266; +add.f64 fd269, %74, %114; +add.f64 fd270, %60, fd269; +add.f64 fd271, %87, %100; +add.f64 fd272, fd271, fd270; +add.f64 fd273, %75, %115; +add.f64 fd274, %62, fd273; +add.f64 fd275, %89, %102; +add.f64 fd276, fd275, fd274; +fma.rn.f64 fd277, fd269, 0d3FD3C6EF372FE950, %60; +mul.f64 fd278, fd271, 0d3FE9E3779B97F4A8; +sub.f64 fd279, fd277, fd278; +sub.f64 fd280, %75, %115; +mul.f64 fd281, fd280, 0d3FEE6F0E134454FF; +sub.f64 fd282, %89, %102; +fma.rn.f64 fd283, fd282, 0d3FE2CF2304755A5E, fd281; +sub.f64 fd284, fd279, fd283; +add.f64 fd285, fd283, fd279; +mul.f64 fd286, fd269, 0d3FE9E3779B97F4A8; +sub.f64 fd287, %60, fd286; +fma.rn.f64 fd288, fd271, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd289, fd280, 0d3FE2CF2304755A5E; +mul.f64 fd290, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd291, fd289, fd290; +sub.f64 fd292, fd288, fd291; +add.f64 fd293, fd291, fd288; +fma.rn.f64 fd294, fd273, 0d3FD3C6EF372FE950, %62; +mul.f64 fd295, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd294, fd295; +sub.f64 fd297, %74, %114; +mul.f64 fd298, fd297, 0d3FEE6F0E134454FF; +sub.f64 fd299, %87, %100; +fma.rn.f64 fd300, fd299, 0d3FE2CF2304755A5E, fd298; +add.f64 fd301, fd300, fd296; +sub.f64 fd302, fd296, fd300; +mul.f64 fd303, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd304, %62, 
fd303; +fma.rn.f64 fd305, fd275, 0d3FD3C6EF372FE950, fd304; +mul.f64 fd306, fd297, 0d3FE2CF2304755A5E; +mul.f64 fd307, fd299, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd306, fd307; +add.f64 fd309, fd308, fd305; +sub.f64 fd310, fd305, fd308; +mul.f64 fd311, fd158, 0d3FEEFEA21D101EE0; +mul.f64 fd312, fd175, 0d3FCFD511FA1C0796; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd175, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd315, fd158, 0d3FCFD511FA1C0796, fd314; +mul.f64 fd316, fd200, 0d3FEC0AB44E81C059; +mul.f64 fd317, fd217, 0d3FDED50D5CBFA951; +sub.f64 fd318, fd316, fd317; +mul.f64 fd319, fd217, 0d3FEC0AB44E81C059; +fma.rn.f64 fd320, fd200, 0d3FDED50D5CBFA951, fd319; +mul.f64 fd321, fd242, 0d3FE753B603D2B816; +mul.f64 fd322, fd259, 0d3FE5E7CF55112014; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd259, 0d3FE753B603D2B816; +fma.rn.f64 fd325, fd242, 0d3FE5E7CF55112014, fd324; +mul.f64 fd326, fd284, 0d3FE1257E3C182B51; +mul.f64 fd327, fd301, 0d3FEB04BBFF642E86; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd301, 0d3FE1257E3C182B51; +fma.rn.f64 fd330, fd284, 0d3FEB04BBFF642E86, fd329; +mul.f64 fd331, fd166, 0d3FEC0AB44E81C059; +mul.f64 fd332, fd183, 0d3FDED50D5CBFA951; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd183, 0d3FEC0AB44E81C059; +fma.rn.f64 fd335, fd166, 0d3FDED50D5CBFA951, fd334; +mul.f64 fd336, fd208, 0d3FE1257E3C182B51; +mul.f64 fd337, fd225, 0d3FEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd225, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd208, 0d3FEB04BBFF642E86, fd339; +mul.f64 fd341, fd250, 0d3FB0130A1BE09379; +mul.f64 fd342, fd267, 0d3FEFEFD5BFE443FE; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd267, 0d3FB0130A1BE09379; +fma.rn.f64 fd345, fd250, 0d3FEFEFD5BFE443FE, fd344; +mul.f64 fd346, fd292, 0dBFDB3FF7C925819C; +mul.f64 fd347, fd309, 0d3FECF457DCDC158C; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd309, 0dBFDB3FF7C925819C; +fma.rn.f64 fd350, fd292, 0d3FECF457DCDC158C, fd349; +mul.f64 fd351, fd167, 0d3FE753B603D2B816; +mul.f64 fd352, fd184, 
0d3FE5E7CF55112014; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd184, 0d3FE753B603D2B816; +fma.rn.f64 fd355, fd167, 0d3FE5E7CF55112014, fd354; +mul.f64 fd356, fd209, 0d3FB0130A1BE09379; +mul.f64 fd357, fd226, 0d3FEFEFD5BFE443FE; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd226, 0d3FB0130A1BE09379; +fma.rn.f64 fd360, fd209, 0d3FEFEFD5BFE443FE, fd359; +mul.f64 fd361, fd251, 0dBFE465C6FEB501BC; +mul.f64 fd362, fd268, 0d3FE8A80B635B6BEA; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd268, 0dBFE465C6FEB501BC; +fma.rn.f64 fd365, fd251, 0d3FE8A80B635B6BEA, fd364; +mul.f64 fd366, fd293, 0dBFEFBF675480D903; +mul.f64 fd367, fd310, 0d3FC00AEB5DA15BE0; +sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd310, 0dBFEFBF675480D903; +fma.rn.f64 fd370, fd293, 0d3FC00AEB5DA15BE0, fd369; +mul.f64 fd371, fd159, 0d3FE1257E3C182B51; +mul.f64 fd372, fd176, 0d3FEB04BBFF642E86; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd176, 0d3FE1257E3C182B51; +fma.rn.f64 fd375, fd159, 0d3FEB04BBFF642E86, fd374; +mul.f64 fd376, fd201, 0dBFDB3FF7C925819C; +mul.f64 fd377, fd218, 0d3FECF457DCDC158C; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd218, 0dBFDB3FF7C925819C; +fma.rn.f64 fd380, fd201, 0d3FECF457DCDC158C, fd379; +mul.f64 fd381, fd243, 0dBFEFBF675480D903; +mul.f64 fd382, fd260, 0d3FC00AEB5DA15BE0; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd260, 0dBFEFBF675480D903; +fma.rn.f64 fd385, fd243, 0d3FC00AEB5DA15BE0, fd384; +mul.f64 fd386, fd285, 0dBFE465C6FEB501BC; +mul.f64 fd387, fd302, 0dBFE8A80B635B6BEA; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd302, 0dBFE465C6FEB501BC; +fma.rn.f64 fd390, fd285, 0dBFE8A80B635B6BEA, fd389; +add.f64 fd391, fd146, fd272; +add.f64 fd392, fd104, fd391; +add.f64 fd393, fd188, fd230; +add.f64 fd394, fd150, fd276; +add.f64 fd395, fd108, fd394; +add.f64 fd396, fd192, fd234; +fma.rn.f64 fd397, fd391, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd398, fd393, 0d3FE9E3779B97F4A8; +sub.f64 fd399, fd397, fd398; +sub.f64 fd400, fd150, fd276; +mul.f64 fd401, fd400, 
0d3FEE6F0E134454FF; +sub.f64 fd402, fd192, fd234; +fma.rn.f64 fd403, fd402, 0d3FE2CF2304755A5E, fd401; +mul.f64 fd404, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd405, fd104, fd404; +fma.rn.f64 fd406, fd393, 0d3FD3C6EF372FE950, fd405; +mul.f64 fd407, fd400, 0d3FE2CF2304755A5E; +mul.f64 fd408, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd409, fd407, fd408; +fma.rn.f64 fd410, fd394, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd411, fd396, 0d3FE9E3779B97F4A8; +sub.f64 fd412, fd410, fd411; +sub.f64 fd413, fd146, fd272; +mul.f64 fd414, fd413, 0d3FEE6F0E134454FF; +sub.f64 fd415, fd188, fd230; +fma.rn.f64 fd416, fd415, 0d3FE2CF2304755A5E, fd414; +mul.f64 fd417, fd394, 0d3FE9E3779B97F4A8; +sub.f64 fd418, fd108, fd417; +fma.rn.f64 fd419, fd396, 0d3FD3C6EF372FE950, fd418; +mul.f64 fd420, fd413, 0d3FE2CF2304755A5E; +mul.f64 fd421, fd415, 0d3FEE6F0E134454FF; +sub.f64 fd422, fd420, fd421; +add.f64 fd423, fd313, fd328; +add.f64 fd424, fd116, fd423; +add.f64 fd425, fd318, fd323; +add.f64 fd426, fd315, fd330; +add.f64 fd427, fd133, fd426; +add.f64 fd428, fd320, fd325; +fma.rn.f64 fd429, fd423, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd430, fd425, 0d3FE9E3779B97F4A8; +sub.f64 fd431, fd429, fd430; +sub.f64 fd432, fd315, fd330; +mul.f64 fd433, fd432, 0d3FEE6F0E134454FF; +sub.f64 fd434, fd320, fd325; +fma.rn.f64 fd435, fd434, 0d3FE2CF2304755A5E, fd433; +mul.f64 fd436, fd423, 0d3FE9E3779B97F4A8; +sub.f64 fd437, fd116, fd436; +fma.rn.f64 fd438, fd425, 0d3FD3C6EF372FE950, fd437; +mul.f64 fd439, fd432, 0d3FE2CF2304755A5E; +mul.f64 fd440, fd434, 0d3FEE6F0E134454FF; +sub.f64 fd441, fd439, fd440; +fma.rn.f64 fd442, fd426, 0d3FD3C6EF372FE950, fd133; +mul.f64 fd443, fd428, 0d3FE9E3779B97F4A8; +sub.f64 fd444, fd442, fd443; +sub.f64 fd445, fd313, fd328; +mul.f64 fd446, fd445, 0d3FEE6F0E134454FF; +sub.f64 fd447, fd318, fd323; +fma.rn.f64 fd448, fd447, 0d3FE2CF2304755A5E, fd446; +mul.f64 fd449, fd426, 0d3FE9E3779B97F4A8; +sub.f64 fd450, fd133, fd449; +fma.rn.f64 fd451, fd428, 0d3FD3C6EF372FE950, fd450; +mul.f64 fd452, 
fd445, 0d3FE2CF2304755A5E; +mul.f64 fd453, fd447, 0d3FEE6F0E134454FF; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd333, fd348; +add.f64 fd456, fd124, fd455; +add.f64 fd457, fd338, fd343; +add.f64 fd458, fd335, fd350; +add.f64 fd459, fd141, fd458; +add.f64 fd460, fd340, fd345; +fma.rn.f64 fd461, fd455, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd462, fd457, 0d3FE9E3779B97F4A8; +sub.f64 fd463, fd461, fd462; +sub.f64 fd464, fd335, fd350; +mul.f64 fd465, fd464, 0d3FEE6F0E134454FF; +sub.f64 fd466, fd340, fd345; +fma.rn.f64 fd467, fd466, 0d3FE2CF2304755A5E, fd465; +mul.f64 fd468, fd455, 0d3FE9E3779B97F4A8; +sub.f64 fd469, fd124, fd468; +fma.rn.f64 fd470, fd457, 0d3FD3C6EF372FE950, fd469; +mul.f64 fd471, fd464, 0d3FE2CF2304755A5E; +mul.f64 fd472, fd466, 0d3FEE6F0E134454FF; +sub.f64 fd473, fd471, fd472; +fma.rn.f64 fd474, fd458, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd475, fd460, 0d3FE9E3779B97F4A8; +sub.f64 fd476, fd474, fd475; +sub.f64 fd477, fd333, fd348; +mul.f64 fd478, fd477, 0d3FEE6F0E134454FF; +sub.f64 fd479, fd338, fd343; +fma.rn.f64 fd480, fd479, 0d3FE2CF2304755A5E, fd478; +mul.f64 fd481, fd458, 0d3FE9E3779B97F4A8; +sub.f64 fd482, fd141, fd481; +fma.rn.f64 fd483, fd460, 0d3FD3C6EF372FE950, fd482; +mul.f64 fd484, fd477, 0d3FE2CF2304755A5E; +mul.f64 fd485, fd479, 0d3FEE6F0E134454FF; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd353, fd368; +add.f64 fd488, fd125, fd487; +add.f64 fd489, fd358, fd363; +add.f64 fd490, fd355, fd370; +add.f64 fd491, fd142, fd490; +add.f64 fd492, fd360, fd365; +fma.rn.f64 fd493, fd487, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd494, fd489, 0d3FE9E3779B97F4A8; +sub.f64 fd495, fd493, fd494; +sub.f64 fd496, fd355, fd370; +mul.f64 fd497, fd496, 0d3FEE6F0E134454FF; +sub.f64 fd498, fd360, fd365; +fma.rn.f64 fd499, fd498, 0d3FE2CF2304755A5E, fd497; +mul.f64 fd500, fd487, 0d3FE9E3779B97F4A8; +sub.f64 fd501, fd125, fd500; +fma.rn.f64 fd502, fd489, 0d3FD3C6EF372FE950, fd501; +mul.f64 fd503, fd496, 0d3FE2CF2304755A5E; +mul.f64 fd504, fd498, 
0d3FEE6F0E134454FF; +sub.f64 fd505, fd503, fd504; +fma.rn.f64 fd506, fd490, 0d3FD3C6EF372FE950, fd142; +mul.f64 fd507, fd492, 0d3FE9E3779B97F4A8; +sub.f64 fd508, fd506, fd507; +sub.f64 fd509, fd353, fd368; +mul.f64 fd510, fd509, 0d3FEE6F0E134454FF; +sub.f64 fd511, fd358, fd363; +fma.rn.f64 fd512, fd511, 0d3FE2CF2304755A5E, fd510; +mul.f64 fd513, fd490, 0d3FE9E3779B97F4A8; +sub.f64 fd514, fd142, fd513; +fma.rn.f64 fd515, fd492, 0d3FD3C6EF372FE950, fd514; +mul.f64 fd516, fd509, 0d3FE2CF2304755A5E; +mul.f64 fd517, fd511, 0d3FEE6F0E134454FF; +sub.f64 fd518, fd516, fd517; +add.f64 fd519, fd373, fd388; +add.f64 fd520, fd117, fd519; +add.f64 fd521, fd378, fd383; +add.f64 fd522, fd375, fd390; +add.f64 fd523, fd134, fd522; +add.f64 fd524, fd380, fd385; +fma.rn.f64 fd525, fd519, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd526, fd521, 0d3FE9E3779B97F4A8; +sub.f64 fd527, fd525, fd526; +sub.f64 fd528, fd375, fd390; +mul.f64 fd529, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd530, fd380, fd385; +fma.rn.f64 fd531, fd530, 0d3FE2CF2304755A5E, fd529; +mul.f64 fd532, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd533, fd117, fd532; +fma.rn.f64 fd534, fd521, 0d3FD3C6EF372FE950, fd533; +mul.f64 fd535, fd528, 0d3FE2CF2304755A5E; +mul.f64 fd536, fd530, 0d3FEE6F0E134454FF; +sub.f64 fd537, fd535, fd536; +fma.rn.f64 fd538, fd522, 0d3FD3C6EF372FE950, fd134; +mul.f64 fd539, fd524, 0d3FE9E3779B97F4A8; +sub.f64 fd540, fd538, fd539; +sub.f64 fd541, fd373, fd388; +mul.f64 fd542, fd541, 0d3FEE6F0E134454FF; +sub.f64 fd543, fd378, fd383; +fma.rn.f64 fd544, fd543, 0d3FE2CF2304755A5E, fd542; +mul.f64 fd545, fd522, 0d3FE9E3779B97F4A8; +sub.f64 fd546, fd134, fd545; +fma.rn.f64 fd547, fd524, 0d3FD3C6EF372FE950, fd546; +mul.f64 fd548, fd541, 0d3FE2CF2304755A5E; +mul.f64 fd549, fd543, 0d3FEE6F0E134454FF; +sub.f64 fd550, fd548, fd549; +add.f64 %1, fd396, fd395; +add.f64 %0, fd393, fd392; +add.f64 %3, fd428, fd427; +add.f64 %2, fd425, fd424; +add.f64 %5, fd460, fd459; +add.f64 %4, fd457, fd456; +add.f64 %7, fd492, fd491; +add.f64 
%6, fd489, fd488; +add.f64 %9, fd524, fd523; +add.f64 %8, fd521, fd520; +add.f64 %11, fd416, fd412; +sub.f64 %10, fd399, fd403; +add.f64 %13, fd448, fd444; +sub.f64 %12, fd431, fd435; +add.f64 %15, fd480, fd476; +sub.f64 %14, fd463, fd467; +add.f64 %17, fd512, fd508; +sub.f64 %16, fd495, fd499; +add.f64 %19, fd544, fd540; +sub.f64 %18, fd527, fd531; +add.f64 %21, fd422, fd419; +sub.f64 %20, fd406, fd409; +add.f64 %23, fd454, fd451; +sub.f64 %22, fd438, fd441; +add.f64 %25, fd486, fd483; +sub.f64 %24, fd470, fd473; +add.f64 %27, fd518, fd515; +sub.f64 %26, fd502, fd505; +add.f64 %29, fd550, fd547; +sub.f64 %28, fd534, fd537; +sub.f64 %31, fd419, fd422; +add.f64 %30, fd409, fd406; +sub.f64 %33, fd451, fd454; +add.f64 %32, fd441, fd438; +sub.f64 %35, fd483, fd486; +add.f64 %34, fd473, fd470; +sub.f64 %37, fd515, fd518; +add.f64 %36, fd505, fd502; +sub.f64 %39, fd547, fd550; +add.f64 %38, fd537, fd534; +sub.f64 %41, fd412, fd416; +add.f64 %40, fd403, fd399; +sub.f64 %43, fd444, fd448; +add.f64 %42, fd435, fd431; +sub.f64 %45, fd476, fd480; +add.f64 %44, fd467, fd463; +sub.f64 %47, fd508, fd512; +add.f64 %46, fd499, fd495; +sub.f64 %49, fd540, fd544; +add.f64 %48, fd531, fd527; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), 
"=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<705, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<163>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 400, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %14, %22; +add.f64 fd22, %12, fd21; +add.f64 fd23, %17, %20; +add.f64 fd24, %16, %23; +add.f64 fd25, %13, fd24; +add.f64 fd26, %19, %21; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %12; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %16, %23; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %19, %21; +fma.rn.f64 fd33, fd32, 0d3FE2CF2304755A5E, fd31; +sub.f64 fd34, fd29, fd33; +add.f64 fd35, fd33, fd29; +mul.f64 fd36, fd21, 
0d3FE9E3779B97F4A8; +sub.f64 fd37, %12, fd36; +fma.rn.f64 fd38, fd23, 0d3FD3C6EF372FE950, fd37; +mul.f64 fd39, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd40, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd41, fd39, fd40; +sub.f64 fd42, fd38, fd41; +add.f64 fd43, fd41, fd38; +fma.rn.f64 fd44, fd24, 0d3FD3C6EF372FE950, %13; +mul.f64 fd45, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd46, fd44, fd45; +sub.f64 fd47, %14, %22; +mul.f64 fd48, fd47, 0d3FEE6F0E134454FF; +sub.f64 fd49, %17, %20; +fma.rn.f64 fd50, fd49, 0d3FE2CF2304755A5E, fd48; +add.f64 fd51, fd50, fd46; +sub.f64 fd52, fd46, fd50; +mul.f64 fd53, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd54, %13, fd53; +fma.rn.f64 fd55, fd26, 0d3FD3C6EF372FE950, fd54; +mul.f64 fd56, fd47, 0d3FE2CF2304755A5E; +mul.f64 fd57, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd58, fd56, fd57; +add.f64 fd59, fd58, fd55; +sub.f64 fd60, fd55, fd58; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 400, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd61, fd62}, [rd6]; +mul.f64 fd65, fd51, fd62; +mul.f64 fd66, fd34, fd62; +mul.f64 fd67, fd61, fd51; +mul.f64 fd68, fd61, fd61; +mul.f64 fd69, fd62, fd62; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd62, fd61; +fma.rn.f64 fd72, fd62, fd61, fd71; +mul.f64 fd73, fd59, fd72; +mul.f64 fd74, fd42, fd72; +mul.f64 fd75, fd70, fd59; +ld.global.v2.f64 {fd76, fd77}, [rd6+80]; +mul.f64 fd80, fd60, fd77; +mul.f64 fd81, fd43, fd77; +mul.f64 fd82, fd76, fd60; +mul.f64 fd83, fd61, fd76; +mul.f64 fd84, fd62, fd77; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd61, fd77; +fma.rn.f64 fd87, fd62, fd76, fd86; +mul.f64 fd88, fd52, fd87; +mul.f64 fd89, fd35, fd87; +mul.f64 fd90, fd85, fd52; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd91, fd26, fd25; +add.f64 fd92, fd23, fd22; +st.shared.v2.f64 [r9], {fd92, fd91}; +fma.rn.f64 fd93, fd61, fd34, fd65; +sub.f64 fd94, fd67, fd66; +st.shared.v2.f64 [r9+16], 
{fd93, fd94}; +fma.rn.f64 fd95, fd70, fd42, fd73; +sub.f64 fd96, fd75, fd74; +st.shared.v2.f64 [r9+32], {fd95, fd96}; +fma.rn.f64 fd97, fd76, fd43, fd80; +sub.f64 fd98, fd82, fd81; +st.shared.v2.f64 [r9+48], {fd97, fd98}; +fma.rn.f64 fd99, fd85, fd35, fd88; +sub.f64 fd100, fd90, fd89; +st.shared.v2.f64 [r9+64], {fd99, fd100}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd101, fd102}, [r11]; +ld.shared.v2.f64 {fd105, fd106}, [r11+80]; +ld.shared.v2.f64 {fd109, fd110}, [r11+160]; +ld.shared.v2.f64 {fd113, fd114}, [r11+240]; +ld.shared.v2.f64 {fd117, fd118}, [r11+320]; +add.f64 fd121, fd105, fd117; +add.f64 fd122, fd101, fd121; +add.f64 fd123, fd109, fd113; +add.f64 fd124, fd106, fd118; +add.f64 fd125, fd102, fd124; +add.f64 fd126, fd110, fd114; +fma.rn.f64 fd127, fd121, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd128, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, fd106, fd118; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd110, fd114; +fma.rn.f64 fd133, fd132, 0d3FE2CF2304755A5E, fd131; +mul.f64 fd134, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd135, fd101, fd134; +fma.rn.f64 fd136, fd123, 0d3FD3C6EF372FE950, fd135; +mul.f64 fd137, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd138, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd139, fd137, fd138; +fma.rn.f64 fd140, fd124, 0d3FD3C6EF372FE950, fd102; +mul.f64 fd141, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd142, fd140, fd141; +sub.f64 fd143, fd105, fd117; +mul.f64 fd144, fd143, 0d3FEE6F0E134454FF; +sub.f64 fd145, fd109, fd113; +fma.rn.f64 fd146, fd145, 0d3FE2CF2304755A5E, fd144; +mul.f64 fd147, fd124, 0d3FE9E3779B97F4A8; +sub.f64 fd148, fd102, fd147; +fma.rn.f64 fd149, fd126, 0d3FD3C6EF372FE950, fd148; +mul.f64 fd150, fd143, 0d3FE2CF2304755A5E; +mul.f64 fd151, fd145, 0d3FEE6F0E134454FF; +sub.f64 fd152, fd150, fd151; +add.f64 %1, fd126, fd125; +add.f64 %0, fd123, fd122; +add.f64 %3, fd146, fd142; +sub.f64 %2, fd129, fd133; +add.f64 %5, fd152, fd149; +sub.f64 %4, fd136, fd139; 
+sub.f64 %7, fd149, fd152; +add.f64 %6, fd139, fd136; +sub.f64 %9, fd142, fd146; +add.f64 %8, fd133, fd129; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<706, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<153>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 200, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %14, %22; +add.f64 fd22, %12, fd21; +add.f64 fd23, %17, %20; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %16, %23; +add.f64 fd26, %13, fd25; +add.f64 fd27, %19, %21; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %12; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %16, %23; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %19, %21; +fma.rn.f64 fd35, fd34, 0d3FE2CF2304755A5E, fd33; +sub.f64 fd36, fd31, fd35; +add.f64 fd37, fd35, fd31; +mul.f64 fd38, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd39, %12, fd38; +fma.rn.f64 fd40, fd23, 0d3FD3C6EF372FE950, fd39; +mul.f64 fd41, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd42, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, fd40, fd43; +add.f64 fd45, fd43, fd40; +fma.rn.f64 fd46, fd25, 0d3FD3C6EF372FE950, %13; +mul.f64 fd47, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd48, fd46, fd47; +sub.f64 fd49, %14, %22; +mul.f64 fd50, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd51, %17, %20; +fma.rn.f64 fd52, fd51, 0d3FE2CF2304755A5E, fd50; +add.f64 fd53, fd52, fd48; +sub.f64 fd54, fd48, fd52; +mul.f64 fd55, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %13, fd55; 
+fma.rn.f64 fd57, fd27, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd49, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd51, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd58, fd59; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, -858993459; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 5; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd53, fd64; +fma.rn.f64 fd68, fd63, fd36, fd67; +mul.f64 fd69, fd36, fd64; +mul.f64 fd70, fd63, fd53; +sub.f64 fd71, fd70, fd69; +mul.f64 fd72, fd63, fd63; +mul.f64 fd73, fd64, fd64; +sub.f64 fd74, fd72, fd73; +mul.f64 fd75, fd64, fd63; +fma.rn.f64 fd76, fd64, fd63, fd75; +mul.f64 fd77, fd61, fd76; +fma.rn.f64 fd78, fd74, fd44, fd77; +mul.f64 fd79, fd44, fd76; +mul.f64 fd80, fd74, fd61; +sub.f64 fd81, fd80, fd79; +ld.global.v2.f64 {fd82, fd83}, [rd6+80]; +mul.f64 fd86, fd62, fd83; +fma.rn.f64 fd87, fd82, fd45, fd86; +mul.f64 fd88, fd45, fd83; +mul.f64 fd89, fd82, fd62; +sub.f64 fd90, fd89, fd88; +mul.f64 fd91, fd63, fd82; +mul.f64 fd92, fd64, fd83; +sub.f64 fd93, fd91, fd92; +mul.f64 fd94, fd63, fd83; +fma.rn.f64 fd95, fd64, fd82, fd94; +mul.f64 fd96, fd54, fd95; +fma.rn.f64 fd97, fd93, fd37, fd96; +mul.f64 fd98, fd37, fd95; +mul.f64 fd99, fd93, fd54; +sub.f64 fd100, fd99, fd98; +mad.lo.s32 r8, r5, 200, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd68; +st.shared.f64 [r9+16], fd78; +st.shared.f64 [r9+24], fd87; +st.shared.f64 [r9+32], fd97; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd101, [r11]; +ld.shared.f64 fd102, [r11+40]; +ld.shared.f64 fd103, [r11+80]; +ld.shared.f64 fd104, [r11+120]; +ld.shared.f64 fd105, [r11+160]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +ld.shared.f64 fd106, 
[r11]; +ld.shared.f64 fd107, [r11+40]; +ld.shared.f64 fd108, [r11+80]; +ld.shared.f64 fd109, [r11+120]; +ld.shared.f64 fd110, [r11+160]; +add.f64 fd111, fd102, fd105; +add.f64 fd112, fd101, fd111; +add.f64 fd113, fd103, fd104; +add.f64 fd114, fd107, fd110; +add.f64 fd115, fd106, fd114; +add.f64 fd116, fd108, fd109; +fma.rn.f64 fd117, fd111, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd118, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd119, fd117, fd118; +sub.f64 fd120, fd107, fd110; +mul.f64 fd121, fd120, 0d3FEE6F0E134454FF; +sub.f64 fd122, fd108, fd109; +fma.rn.f64 fd123, fd122, 0d3FE2CF2304755A5E, fd121; +mul.f64 fd124, fd111, 0d3FE9E3779B97F4A8; +sub.f64 fd125, fd101, fd124; +fma.rn.f64 fd126, fd113, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd127, fd120, 0d3FE2CF2304755A5E; +mul.f64 fd128, fd122, 0d3FEE6F0E134454FF; +sub.f64 fd129, fd127, fd128; +fma.rn.f64 fd130, fd114, 0d3FD3C6EF372FE950, fd106; +mul.f64 fd131, fd116, 0d3FE9E3779B97F4A8; +sub.f64 fd132, fd130, fd131; +sub.f64 fd133, fd102, fd105; +mul.f64 fd134, fd133, 0d3FEE6F0E134454FF; +sub.f64 fd135, fd103, fd104; +fma.rn.f64 fd136, fd135, 0d3FE2CF2304755A5E, fd134; +mul.f64 fd137, fd114, 0d3FE9E3779B97F4A8; +sub.f64 fd138, fd106, fd137; +fma.rn.f64 fd139, fd116, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd133, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd135, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd140, fd141; +add.f64 %0, fd113, fd112; +add.f64 %1, fd116, fd115; +add.f64 %3, fd136, fd132; +sub.f64 %2, fd119, fd123; +sub.f64 %4, fd126, fd129; +add.f64 %5, fd142, fd139; +add.f64 %6, fd129, fd126; +sub.f64 %7, fd139, fd142; +sub.f64 %9, fd132, fd136; +add.f64 %8, fd123, fd119; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), 
"d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..9a4c6e1d63b94 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_fwd.hpp.inc @@ -0,0 +1,3763 @@ +#ifndef CUFFTDX_FFT_26_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_26_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<757, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<363>; +.reg .b32 r<2487>; +.reg .f64 fd<339>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %54, %76; +} +{ +add.f16x2 r4, %55, %77; +} +{ +sub.f16x2 r7, %54, %76; +} +{ +sub.f16x2 r10, %55, %77; +} +{ +add.f16x2 r13, %56, %74; +} +{ +add.f16x2 r16, %57, %75; +} +{ +sub.f16x2 r19, %56, %74; +} +{ +sub.f16x2 r22, %57, %75; +} +{ +add.f16x2 r25, %58, %72; +} +{ +add.f16x2 r28, %59, %73; +} +{ +sub.f16x2 r31, %58, %72; +} +{ +sub.f16x2 r34, %59, %73; +} +{ +add.f16x2 r37, %60, %70; +} +{ +add.f16x2 r40, %61, %71; +} +{ +sub.f16x2 r43, %60, %70; +} +{ +sub.f16x2 r46, %61, %71; +} +{ +add.f16x2 r49, %62, %68; +} +{ +add.f16x2 r52, %63, %69; +} +{ +sub.f16x2 r55, %62, %68; +} +{ +sub.f16x2 r58, %63, %69; +} +{ +add.f16x2 r61, %64, %66; +} +{ +add.f16x2 r64, %65, %67; +} +{ +sub.f16x2 r67, %64, %66; +} +{ +sub.f16x2 r70, %65, %67; +} +{ +add.f16x2 r73, %52, r1; +} +{ +add.f16x2 r76, %53, r4; +} +{ +add.f16x2 r79, r73, r13; +} +{ +add.f16x2 r82, r76, r16; +} +{ +add.f16x2 r85, r79, r25; +} +{ +add.f16x2 r88, r82, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} 
+{ +add.f16x2 r106, r100, r64; +} +mov.u32 r1956, 0; +cvt.rn.f16.s32 rs1, r1956; +mov.b32 r121, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1956; +mov.b32 r133, {rs2, rs2}; +mov.f64 fd291, 0d3FEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs3, fd291; +} +mov.b32 r113, {rs3, rs3}; +{ +mul.f16x2 r111, r1, r113; +} +{ +add.f16x2 r114, %52, r111; +} +mov.f64 fd310, 0dBFDDBE064267C47C; +{ +cvt.rn.f16.f64 rs4, fd310; +} +mov.b32 r119, {rs4, rs4}; +{ +mul.f16x2 r117, r10, r119; +} +{ +add.f16x2 r120, r121, r117; +} +{ +cvt.rn.f16.f64 rs5, fd291; +} +mov.b32 r125, {rs5, rs5}; +{ +mul.f16x2 r123, r4, r125; +} +{ +add.f16x2 r126, %53, r123; +} +{ +cvt.rn.f16.f64 rs6, fd310; +} +mov.b32 r131, {rs6, rs6}; +{ +mul.f16x2 r129, r7, r131; +} +{ +add.f16x2 r132, r133, r129; +} +mov.f64 fd295, 0d3FE22D961EA71119; +{ +cvt.rn.f16.f64 rs7, fd295; +} +mov.b32 r137, {rs7, rs7}; +{ +mul.f16x2 r135, r13, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd306, 0dBFEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs8, fd306; +} +mov.b32 r143, {rs8, rs8}; +{ +mul.f16x2 r141, r22, r143; +} +{ +add.f16x2 r144, r120, r141; +} +{ +cvt.rn.f16.f64 rs9, fd295; +} +mov.b32 r149, {rs9, rs9}; +{ +mul.f16x2 r147, r16, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs10, fd306; +} +mov.b32 r155, {rs10, rs10}; +{ +mul.f16x2 r153, r19, r155; +} +{ +add.f16x2 r156, r132, r153; +} +mov.f64 fd299, 0d3FBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs11, fd299; +} +mov.b32 r161, {rs11, rs11}; +{ +mul.f16x2 r159, r25, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd302, 0dBFEFC44566966769; +{ +cvt.rn.f16.f64 rs12, fd302; +} +mov.b32 r167, {rs12, rs12}; +{ +mul.f16x2 r165, r34, r167; +} +{ +add.f16x2 r168, r144, r165; +} +{ +cvt.rn.f16.f64 rs13, fd299; +} +mov.b32 r173, {rs13, rs13}; +{ +mul.f16x2 r171, r28, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs14, fd302; +} +mov.b32 r179, {rs14, rs14}; +{ +mul.f16x2 r177, r31, r179; +} +{ +add.f16x2 r180, r156, r177; +} +mov.f64 fd303, 0dBFD6B1D8B2365DA1; +{ 
+cvt.rn.f16.f64 rs15, fd303; +} +mov.b32 r185, {rs15, rs15}; +{ +mul.f16x2 r183, r37, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd304, 0dBFEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs16, fd304; +} +mov.b32 r191, {rs16, rs16}; +{ +mul.f16x2 r189, r46, r191; +} +{ +add.f16x2 r192, r168, r189; +} +{ +cvt.rn.f16.f64 rs17, fd303; +} +mov.b32 r197, {rs17, rs17}; +{ +mul.f16x2 r195, r40, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs18, fd304; +} +mov.b32 r203, {rs18, rs18}; +{ +mul.f16x2 r201, r43, r203; +} +{ +add.f16x2 r204, r180, r201; +} +mov.f64 fd307, 0dBFE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs19, fd307; +} +mov.b32 r209, {rs19, rs19}; +{ +mul.f16x2 r207, r49, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd308, 0dBFE5384D024C2F84; +{ +cvt.rn.f16.f64 rs20, fd308; +} +mov.b32 r215, {rs20, rs20}; +{ +mul.f16x2 r213, r58, r215; +} +{ +add.f16x2 r216, r192, r213; +} +{ +cvt.rn.f16.f64 rs21, fd307; +} +mov.b32 r221, {rs21, rs21}; +{ +mul.f16x2 r219, r52, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs22, fd308; +} +mov.b32 r227, {rs22, rs22}; +{ +mul.f16x2 r225, r55, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd311, 0dBFEF11F493053D00; +{ +cvt.rn.f16.f64 rs23, fd311; +} +mov.b32 r233, {rs23, rs23}; +{ +mul.f16x2 r231, r61, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd312, 0dBFCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs24, fd312; +} +mov.b32 r239, {rs24, rs24}; +{ +mul.f16x2 r237, r70, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs25, fd311; +} +mov.b32 r245, {rs25, rs25}; +{ +mul.f16x2 r243, r64, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs26, fd312; +} +mov.b32 r251, {rs26, rs26}; +{ +mul.f16x2 r249, r67, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +sub.f16x2 r255, r234, r240; +} +{ +add.f16x2 r258, r246, r252; +} +{ +add.f16x2 r261, r234, r240; +} +{ +sub.f16x2 r264, r246, r252; +} +cvt.rn.f16.s32 rs27, r1956; +mov.b32 r279, {rs27, rs27}; +cvt.rn.f16.s32 rs28, r1956; 
+mov.b32 r291, {rs28, rs28}; +{ +cvt.rn.f16.f64 rs29, fd295; +} +mov.b32 r271, {rs29, rs29}; +{ +mul.f16x2 r269, r1, r271; +} +{ +add.f16x2 r272, %52, r269; +} +{ +cvt.rn.f16.f64 rs30, fd306; +} +mov.b32 r277, {rs30, rs30}; +{ +mul.f16x2 r275, r10, r277; +} +{ +add.f16x2 r278, r279, r275; +} +{ +cvt.rn.f16.f64 rs31, fd295; +} +mov.b32 r283, {rs31, rs31}; +{ +mul.f16x2 r281, r4, r283; +} +{ +add.f16x2 r284, %53, r281; +} +{ +cvt.rn.f16.f64 rs32, fd306; +} +mov.b32 r289, {rs32, rs32}; +{ +mul.f16x2 r287, r7, r289; +} +{ +add.f16x2 r290, r291, r287; +} +{ +cvt.rn.f16.f64 rs33, fd303; +} +mov.b32 r295, {rs33, rs33}; +{ +mul.f16x2 r293, r13, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs34, fd304; +} +mov.b32 r301, {rs34, rs34}; +{ +mul.f16x2 r299, r22, r301; +} +{ +add.f16x2 r302, r278, r299; +} +{ +cvt.rn.f16.f64 rs35, fd303; +} +mov.b32 r307, {rs35, rs35}; +{ +mul.f16x2 r305, r16, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs36, fd304; +} +mov.b32 r313, {rs36, rs36}; +{ +mul.f16x2 r311, r19, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs37, fd311; +} +mov.b32 r319, {rs37, rs37}; +{ +mul.f16x2 r317, r25, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs38, fd312; +} +mov.b32 r325, {rs38, rs38}; +{ +mul.f16x2 r323, r34, r325; +} +{ +add.f16x2 r326, r302, r323; +} +{ +cvt.rn.f16.f64 rs39, fd311; +} +mov.b32 r331, {rs39, rs39}; +{ +mul.f16x2 r329, r28, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs40, fd312; +} +mov.b32 r337, {rs40, rs40}; +{ +mul.f16x2 r335, r31, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs41, fd307; +} +mov.b32 r343, {rs41, rs41}; +{ +mul.f16x2 r341, r37, r343; +} +{ +add.f16x2 r344, r320, r341; +} +mov.f64 fd224, 0d3FE5384D024C2F84; +{ +cvt.rn.f16.f64 rs42, fd224; +} +mov.b32 r349, {rs42, rs42}; +{ +mul.f16x2 r347, r46, r349; +} +{ +add.f16x2 r350, r326, r347; +} +{ +cvt.rn.f16.f64 rs43, fd307; +} +mov.b32 r355, {rs43, rs43}; +{ +mul.f16x2 
r353, r40, r355; +} +{ +add.f16x2 r356, r332, r353; +} +{ +cvt.rn.f16.f64 rs44, fd224; +} +mov.b32 r361, {rs44, rs44}; +{ +mul.f16x2 r359, r43, r361; +} +{ +add.f16x2 r362, r338, r359; +} +{ +cvt.rn.f16.f64 rs45, fd299; +} +mov.b32 r367, {rs45, rs45}; +{ +mul.f16x2 r365, r49, r367; +} +{ +add.f16x2 r368, r344, r365; +} +mov.f64 fd288, 0d3FEFC44566966769; +{ +cvt.rn.f16.f64 rs46, fd288; +} +mov.b32 r373, {rs46, rs46}; +{ +mul.f16x2 r371, r58, r373; +} +{ +add.f16x2 r374, r350, r371; +} +{ +cvt.rn.f16.f64 rs47, fd299; +} +mov.b32 r379, {rs47, rs47}; +{ +mul.f16x2 r377, r52, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs48, fd288; +} +mov.b32 r385, {rs48, rs48}; +{ +mul.f16x2 r383, r55, r385; +} +{ +add.f16x2 r386, r362, r383; +} +{ +cvt.rn.f16.f64 rs49, fd291; +} +mov.b32 r391, {rs49, rs49}; +{ +mul.f16x2 r389, r61, r391; +} +{ +add.f16x2 r392, r368, r389; +} +mov.f64 fd272, 0d3FDDBE064267C47C; +{ +cvt.rn.f16.f64 rs50, fd272; +} +mov.b32 r397, {rs50, rs50}; +{ +mul.f16x2 r395, r70, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs51, fd291; +} +mov.b32 r403, {rs51, rs51}; +{ +mul.f16x2 r401, r64, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs52, fd272; +} +mov.b32 r409, {rs52, rs52}; +{ +mul.f16x2 r407, r67, r409; +} +{ +add.f16x2 r410, r386, r407; +} +{ +sub.f16x2 r413, r392, r398; +} +{ +add.f16x2 r416, r404, r410; +} +{ +add.f16x2 r419, r392, r398; +} +{ +sub.f16x2 r422, r404, r410; +} +cvt.rn.f16.s32 rs53, r1956; +mov.b32 r437, {rs53, rs53}; +cvt.rn.f16.s32 rs54, r1956; +mov.b32 r449, {rs54, rs54}; +{ +cvt.rn.f16.f64 rs55, fd299; +} +mov.b32 r429, {rs55, rs55}; +{ +mul.f16x2 r427, r1, r429; +} +{ +add.f16x2 r430, %52, r427; +} +{ +cvt.rn.f16.f64 rs56, fd302; +} +mov.b32 r435, {rs56, rs56}; +{ +mul.f16x2 r433, r10, r435; +} +{ +add.f16x2 r436, r437, r433; +} +{ +cvt.rn.f16.f64 rs57, fd299; +} +mov.b32 r441, {rs57, rs57}; +{ +mul.f16x2 r439, r4, r441; +} +{ +add.f16x2 r442, %53, r439; +} +{ +cvt.rn.f16.f64 rs58, 
fd302; +} +mov.b32 r447, {rs58, rs58}; +{ +mul.f16x2 r445, r7, r447; +} +{ +add.f16x2 r448, r449, r445; +} +{ +cvt.rn.f16.f64 rs59, fd311; +} +mov.b32 r453, {rs59, rs59}; +{ +mul.f16x2 r451, r13, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs60, fd312; +} +mov.b32 r459, {rs60, rs60}; +{ +mul.f16x2 r457, r22, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs61, fd311; +} +mov.b32 r465, {rs61, rs61}; +{ +mul.f16x2 r463, r16, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs62, fd312; +} +mov.b32 r471, {rs62, rs62}; +{ +mul.f16x2 r469, r19, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs63, fd303; +} +mov.b32 r477, {rs63, rs63}; +{ +mul.f16x2 r475, r25, r477; +} +{ +add.f16x2 r478, r454, r475; +} +mov.f64 fd204, 0d3FEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs64, fd204; +} +mov.b32 r483, {rs64, rs64}; +{ +mul.f16x2 r481, r34, r483; +} +{ +add.f16x2 r484, r460, r481; +} +{ +cvt.rn.f16.f64 rs65, fd303; +} +mov.b32 r489, {rs65, rs65}; +{ +mul.f16x2 r487, r28, r489; +} +{ +add.f16x2 r490, r466, r487; +} +{ +cvt.rn.f16.f64 rs66, fd204; +} +mov.b32 r495, {rs66, rs66}; +{ +mul.f16x2 r493, r31, r495; +} +{ +add.f16x2 r496, r472, r493; +} +{ +cvt.rn.f16.f64 rs67, fd291; +} +mov.b32 r501, {rs67, rs67}; +{ +mul.f16x2 r499, r37, r501; +} +{ +add.f16x2 r502, r478, r499; +} +{ +cvt.rn.f16.f64 rs68, fd272; +} +mov.b32 r507, {rs68, rs68}; +{ +mul.f16x2 r505, r46, r507; +} +{ +add.f16x2 r508, r484, r505; +} +{ +cvt.rn.f16.f64 rs69, fd291; +} +mov.b32 r513, {rs69, rs69}; +{ +mul.f16x2 r511, r40, r513; +} +{ +add.f16x2 r514, r490, r511; +} +{ +cvt.rn.f16.f64 rs70, fd272; +} +mov.b32 r519, {rs70, rs70}; +{ +mul.f16x2 r517, r43, r519; +} +{ +add.f16x2 r520, r496, r517; +} +{ +cvt.rn.f16.f64 rs71, fd295; +} +mov.b32 r525, {rs71, rs71}; +{ +mul.f16x2 r523, r49, r525; +} +{ +add.f16x2 r526, r502, r523; +} +{ +cvt.rn.f16.f64 rs72, fd306; +} +mov.b32 r531, {rs72, rs72}; +{ +mul.f16x2 r529, r58, r531; +} +{ +add.f16x2 r532, r508, 
r529; +} +{ +cvt.rn.f16.f64 rs73, fd295; +} +mov.b32 r537, {rs73, rs73}; +{ +mul.f16x2 r535, r52, r537; +} +{ +add.f16x2 r538, r514, r535; +} +{ +cvt.rn.f16.f64 rs74, fd306; +} +mov.b32 r543, {rs74, rs74}; +{ +mul.f16x2 r541, r55, r543; +} +{ +add.f16x2 r544, r520, r541; +} +{ +cvt.rn.f16.f64 rs75, fd307; +} +mov.b32 r549, {rs75, rs75}; +{ +mul.f16x2 r547, r61, r549; +} +{ +add.f16x2 r550, r526, r547; +} +{ +cvt.rn.f16.f64 rs76, fd308; +} +mov.b32 r555, {rs76, rs76}; +{ +mul.f16x2 r553, r70, r555; +} +{ +add.f16x2 r556, r532, r553; +} +{ +cvt.rn.f16.f64 rs77, fd307; +} +mov.b32 r561, {rs77, rs77}; +{ +mul.f16x2 r559, r64, r561; +} +{ +add.f16x2 r562, r538, r559; +} +{ +cvt.rn.f16.f64 rs78, fd308; +} +mov.b32 r567, {rs78, rs78}; +{ +mul.f16x2 r565, r67, r567; +} +{ +add.f16x2 r568, r544, r565; +} +{ +sub.f16x2 r571, r550, r556; +} +{ +add.f16x2 r574, r562, r568; +} +{ +add.f16x2 r577, r550, r556; +} +{ +sub.f16x2 r580, r562, r568; +} +cvt.rn.f16.s32 rs79, r1956; +mov.b32 r595, {rs79, rs79}; +cvt.rn.f16.s32 rs80, r1956; +mov.b32 r607, {rs80, rs80}; +{ +cvt.rn.f16.f64 rs81, fd303; +} +mov.b32 r587, {rs81, rs81}; +{ +mul.f16x2 r585, r1, r587; +} +{ +add.f16x2 r588, %52, r585; +} +{ +cvt.rn.f16.f64 rs82, fd304; +} +mov.b32 r593, {rs82, rs82}; +{ +mul.f16x2 r591, r10, r593; +} +{ +add.f16x2 r594, r595, r591; +} +{ +cvt.rn.f16.f64 rs83, fd303; +} +mov.b32 r599, {rs83, rs83}; +{ +mul.f16x2 r597, r4, r599; +} +{ +add.f16x2 r600, %53, r597; +} +{ +cvt.rn.f16.f64 rs84, fd304; +} +mov.b32 r605, {rs84, rs84}; +{ +mul.f16x2 r603, r7, r605; +} +{ +add.f16x2 r606, r607, r603; +} +{ +cvt.rn.f16.f64 rs85, fd307; +} +mov.b32 r611, {rs85, rs85}; +{ +mul.f16x2 r609, r13, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +cvt.rn.f16.f64 rs86, fd224; +} +mov.b32 r617, {rs86, rs86}; +{ +mul.f16x2 r615, r22, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs87, fd307; +} +mov.b32 r623, {rs87, rs87}; +{ +mul.f16x2 r621, r16, r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ 
+cvt.rn.f16.f64 rs88, fd224; +} +mov.b32 r629, {rs88, rs88}; +{ +mul.f16x2 r627, r19, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +cvt.rn.f16.f64 rs89, fd291; +} +mov.b32 r635, {rs89, rs89}; +{ +mul.f16x2 r633, r25, r635; +} +{ +add.f16x2 r636, r612, r633; +} +{ +cvt.rn.f16.f64 rs90, fd272; +} +mov.b32 r641, {rs90, rs90}; +{ +mul.f16x2 r639, r34, r641; +} +{ +add.f16x2 r642, r618, r639; +} +{ +cvt.rn.f16.f64 rs91, fd291; +} +mov.b32 r647, {rs91, rs91}; +{ +mul.f16x2 r645, r28, r647; +} +{ +add.f16x2 r648, r624, r645; +} +{ +cvt.rn.f16.f64 rs92, fd272; +} +mov.b32 r653, {rs92, rs92}; +{ +mul.f16x2 r651, r31, r653; +} +{ +add.f16x2 r654, r630, r651; +} +{ +cvt.rn.f16.f64 rs93, fd299; +} +mov.b32 r659, {rs93, rs93}; +{ +mul.f16x2 r657, r37, r659; +} +{ +add.f16x2 r660, r636, r657; +} +{ +cvt.rn.f16.f64 rs94, fd302; +} +mov.b32 r665, {rs94, rs94}; +{ +mul.f16x2 r663, r46, r665; +} +{ +add.f16x2 r666, r642, r663; +} +{ +cvt.rn.f16.f64 rs95, fd299; +} +mov.b32 r671, {rs95, rs95}; +{ +mul.f16x2 r669, r40, r671; +} +{ +add.f16x2 r672, r648, r669; +} +{ +cvt.rn.f16.f64 rs96, fd302; +} +mov.b32 r677, {rs96, rs96}; +{ +mul.f16x2 r675, r43, r677; +} +{ +add.f16x2 r678, r654, r675; +} +{ +cvt.rn.f16.f64 rs97, fd311; +} +mov.b32 r683, {rs97, rs97}; +{ +mul.f16x2 r681, r49, r683; +} +{ +add.f16x2 r684, r660, r681; +} +mov.f64 fd256, 0d3FCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs98, fd256; +} +mov.b32 r689, {rs98, rs98}; +{ +mul.f16x2 r687, r58, r689; +} +{ +add.f16x2 r690, r666, r687; +} +{ +cvt.rn.f16.f64 rs99, fd311; +} +mov.b32 r695, {rs99, rs99}; +{ +mul.f16x2 r693, r52, r695; +} +{ +add.f16x2 r696, r672, r693; +} +{ +cvt.rn.f16.f64 rs100, fd256; +} +mov.b32 r701, {rs100, rs100}; +{ +mul.f16x2 r699, r55, r701; +} +{ +add.f16x2 r702, r678, r699; +} +{ +cvt.rn.f16.f64 rs101, fd295; +} +mov.b32 r707, {rs101, rs101}; +{ +mul.f16x2 r705, r61, r707; +} +{ +add.f16x2 r708, r684, r705; +} +mov.f64 fd280, 0d3FEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs102, fd280; +} +mov.b32 r713, {rs102, 
rs102}; +{ +mul.f16x2 r711, r70, r713; +} +{ +add.f16x2 r714, r690, r711; +} +{ +cvt.rn.f16.f64 rs103, fd295; +} +mov.b32 r719, {rs103, rs103}; +{ +mul.f16x2 r717, r64, r719; +} +{ +add.f16x2 r720, r696, r717; +} +{ +cvt.rn.f16.f64 rs104, fd280; +} +mov.b32 r725, {rs104, rs104}; +{ +mul.f16x2 r723, r67, r725; +} +{ +add.f16x2 r726, r702, r723; +} +{ +sub.f16x2 r729, r708, r714; +} +{ +add.f16x2 r732, r720, r726; +} +{ +add.f16x2 r735, r708, r714; +} +{ +sub.f16x2 r738, r720, r726; +} +cvt.rn.f16.s32 rs105, r1956; +mov.b32 r753, {rs105, rs105}; +cvt.rn.f16.s32 rs106, r1956; +mov.b32 r765, {rs106, rs106}; +{ +cvt.rn.f16.f64 rs107, fd307; +} +mov.b32 r745, {rs107, rs107}; +{ +mul.f16x2 r743, r1, r745; +} +{ +add.f16x2 r746, %52, r743; +} +{ +cvt.rn.f16.f64 rs108, fd308; +} +mov.b32 r751, {rs108, rs108}; +{ +mul.f16x2 r749, r10, r751; +} +{ +add.f16x2 r752, r753, r749; +} +{ +cvt.rn.f16.f64 rs109, fd307; +} +mov.b32 r757, {rs109, rs109}; +{ +mul.f16x2 r755, r4, r757; +} +{ +add.f16x2 r758, %53, r755; +} +{ +cvt.rn.f16.f64 rs110, fd308; +} +mov.b32 r763, {rs110, rs110}; +{ +mul.f16x2 r761, r7, r763; +} +{ +add.f16x2 r764, r765, r761; +} +{ +cvt.rn.f16.f64 rs111, fd299; +} +mov.b32 r769, {rs111, rs111}; +{ +mul.f16x2 r767, r13, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs112, fd288; +} +mov.b32 r775, {rs112, rs112}; +{ +mul.f16x2 r773, r22, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs113, fd299; +} +mov.b32 r781, {rs113, rs113}; +{ +mul.f16x2 r779, r16, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs114, fd288; +} +mov.b32 r787, {rs114, rs114}; +{ +mul.f16x2 r785, r19, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs115, fd295; +} +mov.b32 r793, {rs115, rs115}; +{ +mul.f16x2 r791, r25, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs116, fd306; +} +mov.b32 r799, {rs116, rs116}; +{ +mul.f16x2 r797, r34, r799; +} +{ +add.f16x2 r800, r776, r797; +} +{ +cvt.rn.f16.f64 rs117, fd295; 
+} +mov.b32 r805, {rs117, rs117}; +{ +mul.f16x2 r803, r28, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs118, fd306; +} +mov.b32 r811, {rs118, rs118}; +{ +mul.f16x2 r809, r31, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs119, fd311; +} +mov.b32 r817, {rs119, rs119}; +{ +mul.f16x2 r815, r37, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs120, fd256; +} +mov.b32 r823, {rs120, rs120}; +{ +mul.f16x2 r821, r46, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs121, fd311; +} +mov.b32 r829, {rs121, rs121}; +{ +mul.f16x2 r827, r40, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs122, fd256; +} +mov.b32 r835, {rs122, rs122}; +{ +mul.f16x2 r833, r43, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs123, fd291; +} +mov.b32 r841, {rs123, rs123}; +{ +mul.f16x2 r839, r49, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs124, fd272; +} +mov.b32 r847, {rs124, rs124}; +{ +mul.f16x2 r845, r58, r847; +} +{ +add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs125, fd291; +} +mov.b32 r853, {rs125, rs125}; +{ +mul.f16x2 r851, r52, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs126, fd272; +} +mov.b32 r859, {rs126, rs126}; +{ +mul.f16x2 r857, r55, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs127, fd303; +} +mov.b32 r865, {rs127, rs127}; +{ +mul.f16x2 r863, r61, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs128, fd304; +} +mov.b32 r871, {rs128, rs128}; +{ +mul.f16x2 r869, r70, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs129, fd303; +} +mov.b32 r877, {rs129, rs129}; +{ +mul.f16x2 r875, r64, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs130, fd304; +} +mov.b32 r883, {rs130, rs130}; +{ +mul.f16x2 r881, r67, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +sub.f16x2 r887, r866, r872; +} +{ +add.f16x2 r890, r878, r884; +} +{ +add.f16x2 r893, r866, r872; +} +{ +sub.f16x2 r896, 
r878, r884; +} +cvt.rn.f16.s32 rs131, r1956; +mov.b32 r911, {rs131, rs131}; +cvt.rn.f16.s32 rs132, r1956; +mov.b32 r923, {rs132, rs132}; +{ +cvt.rn.f16.f64 rs133, fd311; +} +mov.b32 r903, {rs133, rs133}; +{ +mul.f16x2 r901, r1, r903; +} +{ +add.f16x2 r904, %52, r901; +} +{ +cvt.rn.f16.f64 rs134, fd312; +} +mov.b32 r909, {rs134, rs134}; +{ +mul.f16x2 r907, r10, r909; +} +{ +add.f16x2 r910, r911, r907; +} +{ +cvt.rn.f16.f64 rs135, fd311; +} +mov.b32 r915, {rs135, rs135}; +{ +mul.f16x2 r913, r4, r915; +} +{ +add.f16x2 r916, %53, r913; +} +{ +cvt.rn.f16.f64 rs136, fd312; +} +mov.b32 r921, {rs136, rs136}; +{ +mul.f16x2 r919, r7, r921; +} +{ +add.f16x2 r922, r923, r919; +} +{ +cvt.rn.f16.f64 rs137, fd291; +} +mov.b32 r927, {rs137, rs137}; +{ +mul.f16x2 r925, r13, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs138, fd272; +} +mov.b32 r933, {rs138, rs138}; +{ +mul.f16x2 r931, r22, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs139, fd291; +} +mov.b32 r939, {rs139, rs139}; +{ +mul.f16x2 r937, r16, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs140, fd272; +} +mov.b32 r945, {rs140, rs140}; +{ +mul.f16x2 r943, r19, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs141, fd307; +} +mov.b32 r951, {rs141, rs141}; +{ +mul.f16x2 r949, r25, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs142, fd308; +} +mov.b32 r957, {rs142, rs142}; +{ +mul.f16x2 r955, r34, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs143, fd307; +} +mov.b32 r963, {rs143, rs143}; +{ +mul.f16x2 r961, r28, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs144, fd308; +} +mov.b32 r969, {rs144, rs144}; +{ +mul.f16x2 r967, r31, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs145, fd295; +} +mov.b32 r975, {rs145, rs145}; +{ +mul.f16x2 r973, r37, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs146, fd280; +} +mov.b32 r981, {rs146, rs146}; +{ +mul.f16x2 r979, r46, r981; 
+} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs147, fd295; +} +mov.b32 r987, {rs147, rs147}; +{ +mul.f16x2 r985, r40, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs148, fd280; +} +mov.b32 r993, {rs148, rs148}; +{ +mul.f16x2 r991, r43, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs149, fd303; +} +mov.b32 r999, {rs149, rs149}; +{ +mul.f16x2 r997, r49, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs150, fd304; +} +mov.b32 r1005, {rs150, rs150}; +{ +mul.f16x2 r1003, r58, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs151, fd303; +} +mov.b32 r1011, {rs151, rs151}; +{ +mul.f16x2 r1009, r52, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs152, fd304; +} +mov.b32 r1017, {rs152, rs152}; +{ +mul.f16x2 r1015, r55, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs153, fd299; +} +mov.b32 r1023, {rs153, rs153}; +{ +mul.f16x2 r1021, r61, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs154, fd288; +} +mov.b32 r1029, {rs154, rs154}; +{ +mul.f16x2 r1027, r70, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 rs155, fd299; +} +mov.b32 r1035, {rs155, rs155}; +{ +mul.f16x2 r1033, r64, r1035; +} +{ +add.f16x2 r1036, r1012, r1033; +} +{ +cvt.rn.f16.f64 rs156, fd288; +} +mov.b32 r1041, {rs156, rs156}; +{ +mul.f16x2 r1039, r67, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +sub.f16x2 r1045, r1024, r1030; +} +{ +add.f16x2 r1048, r1036, r1042; +} +{ +add.f16x2 r1051, r1024, r1030; +} +{ +sub.f16x2 r1054, r1036, r1042; +} +{ +add.f16x2 r1057, %96, %88; +} +{ +add.f16x2 r1060, %99, %93; +} +{ +sub.f16x2 r1063, %96, %88; +} +{ +sub.f16x2 r1066, %99, %93; +} +{ +add.f16x2 r1069, %83, %103; +} +{ +add.f16x2 r1072, %87, %81; +} +{ +sub.f16x2 r1075, %83, %103; +} +{ +sub.f16x2 r1078, %87, %81; +} +{ +add.f16x2 r1081, %97, %91; +} +{ +add.f16x2 r1084, %101, %94; +} +{ +sub.f16x2 r1087, %97, %91; +} +{ +sub.f16x2 r1090, %101, %94; 
+} +{ +add.f16x2 r1093, %85, %78; +} +{ +add.f16x2 r1096, %89, %82; +} +{ +sub.f16x2 r1099, %85, %78; +} +{ +sub.f16x2 r1102, %89, %82; +} +{ +add.f16x2 r1105, %98, %92; +} +{ +add.f16x2 r1108, %102, %95; +} +{ +sub.f16x2 r1111, %98, %92; +} +{ +sub.f16x2 r1114, %102, %95; +} +{ +add.f16x2 r1117, %86, %80; +} +{ +add.f16x2 r1120, %90, %84; +} +{ +sub.f16x2 r1123, %86, %80; +} +{ +sub.f16x2 r1126, %90, %84; +} +{ +add.f16x2 r1129, %100, r1057; +} +{ +add.f16x2 r1132, %79, r1060; +} +{ +add.f16x2 r1135, r1129, r1069; +} +{ +add.f16x2 r1138, r1132, r1072; +} +{ +add.f16x2 r1141, r1135, r1081; +} +{ +add.f16x2 r1144, r1138, r1084; +} +{ +add.f16x2 r1147, r1141, r1093; +} +{ +add.f16x2 r1150, r1144, r1096; +} +{ +add.f16x2 r1153, r1147, r1105; +} +{ +add.f16x2 r1156, r1150, r1108; +} +{ +add.f16x2 r1159, r1153, r1117; +} +{ +add.f16x2 r1162, r1156, r1120; +} +cvt.rn.f16.s32 rs157, r1956; +mov.b32 r1177, {rs157, rs157}; +cvt.rn.f16.s32 rs158, r1956; +mov.b32 r1189, {rs158, rs158}; +{ +cvt.rn.f16.f64 rs159, fd291; +} +mov.b32 r1169, {rs159, rs159}; +{ +mul.f16x2 r1167, r1057, r1169; +} +{ +add.f16x2 r1170, %100, r1167; +} +{ +cvt.rn.f16.f64 rs160, fd310; +} +mov.b32 r1175, {rs160, rs160}; +{ +mul.f16x2 r1173, r1066, r1175; +} +{ +add.f16x2 r1176, r1177, r1173; +} +{ +cvt.rn.f16.f64 rs161, fd291; +} +mov.b32 r1181, {rs161, rs161}; +{ +mul.f16x2 r1179, r1060, r1181; +} +{ +add.f16x2 r1182, %79, r1179; +} +{ +cvt.rn.f16.f64 rs162, fd310; +} +mov.b32 r1187, {rs162, rs162}; +{ +mul.f16x2 r1185, r1063, r1187; +} +{ +add.f16x2 r1188, r1189, r1185; +} +{ +cvt.rn.f16.f64 rs163, fd295; +} +mov.b32 r1193, {rs163, rs163}; +{ +mul.f16x2 r1191, r1069, r1193; +} +{ +add.f16x2 r1194, r1170, r1191; +} +{ +cvt.rn.f16.f64 rs164, fd306; +} +mov.b32 r1199, {rs164, rs164}; +{ +mul.f16x2 r1197, r1078, r1199; +} +{ +add.f16x2 r1200, r1176, r1197; +} +{ +cvt.rn.f16.f64 rs165, fd295; +} +mov.b32 r1205, {rs165, rs165}; +{ +mul.f16x2 r1203, r1072, r1205; +} +{ +add.f16x2 r1206, r1182, r1203; +} +{ 
+cvt.rn.f16.f64 rs166, fd306; +} +mov.b32 r1211, {rs166, rs166}; +{ +mul.f16x2 r1209, r1075, r1211; +} +{ +add.f16x2 r1212, r1188, r1209; +} +{ +cvt.rn.f16.f64 rs167, fd299; +} +mov.b32 r1217, {rs167, rs167}; +{ +mul.f16x2 r1215, r1081, r1217; +} +{ +add.f16x2 r1218, r1194, r1215; +} +{ +cvt.rn.f16.f64 rs168, fd302; +} +mov.b32 r1223, {rs168, rs168}; +{ +mul.f16x2 r1221, r1090, r1223; +} +{ +add.f16x2 r1224, r1200, r1221; +} +{ +cvt.rn.f16.f64 rs169, fd299; +} +mov.b32 r1229, {rs169, rs169}; +{ +mul.f16x2 r1227, r1084, r1229; +} +{ +add.f16x2 r1230, r1206, r1227; +} +{ +cvt.rn.f16.f64 rs170, fd302; +} +mov.b32 r1235, {rs170, rs170}; +{ +mul.f16x2 r1233, r1087, r1235; +} +{ +add.f16x2 r1236, r1212, r1233; +} +{ +cvt.rn.f16.f64 rs171, fd303; +} +mov.b32 r1241, {rs171, rs171}; +{ +mul.f16x2 r1239, r1093, r1241; +} +{ +add.f16x2 r1242, r1218, r1239; +} +{ +cvt.rn.f16.f64 rs172, fd304; +} +mov.b32 r1247, {rs172, rs172}; +{ +mul.f16x2 r1245, r1102, r1247; +} +{ +add.f16x2 r1248, r1224, r1245; +} +{ +cvt.rn.f16.f64 rs173, fd303; +} +mov.b32 r1253, {rs173, rs173}; +{ +mul.f16x2 r1251, r1096, r1253; +} +{ +add.f16x2 r1254, r1230, r1251; +} +{ +cvt.rn.f16.f64 rs174, fd304; +} +mov.b32 r1259, {rs174, rs174}; +{ +mul.f16x2 r1257, r1099, r1259; +} +{ +add.f16x2 r1260, r1236, r1257; +} +{ +cvt.rn.f16.f64 rs175, fd307; +} +mov.b32 r1265, {rs175, rs175}; +{ +mul.f16x2 r1263, r1105, r1265; +} +{ +add.f16x2 r1266, r1242, r1263; +} +{ +cvt.rn.f16.f64 rs176, fd308; +} +mov.b32 r1271, {rs176, rs176}; +{ +mul.f16x2 r1269, r1114, r1271; +} +{ +add.f16x2 r1272, r1248, r1269; +} +{ +cvt.rn.f16.f64 rs177, fd307; +} +mov.b32 r1277, {rs177, rs177}; +{ +mul.f16x2 r1275, r1108, r1277; +} +{ +add.f16x2 r1278, r1254, r1275; +} +{ +cvt.rn.f16.f64 rs178, fd308; +} +mov.b32 r1283, {rs178, rs178}; +{ +mul.f16x2 r1281, r1111, r1283; +} +{ +add.f16x2 r1284, r1260, r1281; +} +{ +cvt.rn.f16.f64 rs179, fd311; +} +mov.b32 r1289, {rs179, rs179}; +{ +mul.f16x2 r1287, r1117, r1289; +} +{ +add.f16x2 r1290, 
r1266, r1287; +} +{ +cvt.rn.f16.f64 rs180, fd312; +} +mov.b32 r1295, {rs180, rs180}; +{ +mul.f16x2 r1293, r1126, r1295; +} +{ +add.f16x2 r1296, r1272, r1293; +} +{ +cvt.rn.f16.f64 rs181, fd311; +} +mov.b32 r1301, {rs181, rs181}; +{ +mul.f16x2 r1299, r1120, r1301; +} +{ +add.f16x2 r1302, r1278, r1299; +} +{ +cvt.rn.f16.f64 rs182, fd312; +} +mov.b32 r1307, {rs182, rs182}; +{ +mul.f16x2 r1305, r1123, r1307; +} +{ +add.f16x2 r1308, r1284, r1305; +} +{ +sub.f16x2 r1311, r1290, r1296; +} +{ +add.f16x2 r1314, r1302, r1308; +} +{ +add.f16x2 r1317, r1290, r1296; +} +{ +sub.f16x2 r1320, r1302, r1308; +} +cvt.rn.f16.s32 rs183, r1956; +mov.b32 r1335, {rs183, rs183}; +cvt.rn.f16.s32 rs184, r1956; +mov.b32 r1347, {rs184, rs184}; +{ +cvt.rn.f16.f64 rs185, fd295; +} +mov.b32 r1327, {rs185, rs185}; +{ +mul.f16x2 r1325, r1057, r1327; +} +{ +add.f16x2 r1328, %100, r1325; +} +{ +cvt.rn.f16.f64 rs186, fd306; +} +mov.b32 r1333, {rs186, rs186}; +{ +mul.f16x2 r1331, r1066, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +cvt.rn.f16.f64 rs187, fd295; +} +mov.b32 r1339, {rs187, rs187}; +{ +mul.f16x2 r1337, r1060, r1339; +} +{ +add.f16x2 r1340, %79, r1337; +} +{ +cvt.rn.f16.f64 rs188, fd306; +} +mov.b32 r1345, {rs188, rs188}; +{ +mul.f16x2 r1343, r1063, r1345; +} +{ +add.f16x2 r1346, r1347, r1343; +} +{ +cvt.rn.f16.f64 rs189, fd303; +} +mov.b32 r1351, {rs189, rs189}; +{ +mul.f16x2 r1349, r1069, r1351; +} +{ +add.f16x2 r1352, r1328, r1349; +} +{ +cvt.rn.f16.f64 rs190, fd304; +} +mov.b32 r1357, {rs190, rs190}; +{ +mul.f16x2 r1355, r1078, r1357; +} +{ +add.f16x2 r1358, r1334, r1355; +} +{ +cvt.rn.f16.f64 rs191, fd303; +} +mov.b32 r1363, {rs191, rs191}; +{ +mul.f16x2 r1361, r1072, r1363; +} +{ +add.f16x2 r1364, r1340, r1361; +} +{ +cvt.rn.f16.f64 rs192, fd304; +} +mov.b32 r1369, {rs192, rs192}; +{ +mul.f16x2 r1367, r1075, r1369; +} +{ +add.f16x2 r1370, r1346, r1367; +} +{ +cvt.rn.f16.f64 rs193, fd311; +} +mov.b32 r1375, {rs193, rs193}; +{ +mul.f16x2 r1373, r1081, r1375; +} +{ +add.f16x2 
r1376, r1352, r1373; +} +{ +cvt.rn.f16.f64 rs194, fd312; +} +mov.b32 r1381, {rs194, rs194}; +{ +mul.f16x2 r1379, r1090, r1381; +} +{ +add.f16x2 r1382, r1358, r1379; +} +{ +cvt.rn.f16.f64 rs195, fd311; +} +mov.b32 r1387, {rs195, rs195}; +{ +mul.f16x2 r1385, r1084, r1387; +} +{ +add.f16x2 r1388, r1364, r1385; +} +{ +cvt.rn.f16.f64 rs196, fd312; +} +mov.b32 r1393, {rs196, rs196}; +{ +mul.f16x2 r1391, r1087, r1393; +} +{ +add.f16x2 r1394, r1370, r1391; +} +{ +cvt.rn.f16.f64 rs197, fd307; +} +mov.b32 r1399, {rs197, rs197}; +{ +mul.f16x2 r1397, r1093, r1399; +} +{ +add.f16x2 r1400, r1376, r1397; +} +{ +cvt.rn.f16.f64 rs198, fd224; +} +mov.b32 r1405, {rs198, rs198}; +{ +mul.f16x2 r1403, r1102, r1405; +} +{ +add.f16x2 r1406, r1382, r1403; +} +{ +cvt.rn.f16.f64 rs199, fd307; +} +mov.b32 r1411, {rs199, rs199}; +{ +mul.f16x2 r1409, r1096, r1411; +} +{ +add.f16x2 r1412, r1388, r1409; +} +{ +cvt.rn.f16.f64 rs200, fd224; +} +mov.b32 r1417, {rs200, rs200}; +{ +mul.f16x2 r1415, r1099, r1417; +} +{ +add.f16x2 r1418, r1394, r1415; +} +{ +cvt.rn.f16.f64 rs201, fd299; +} +mov.b32 r1423, {rs201, rs201}; +{ +mul.f16x2 r1421, r1105, r1423; +} +{ +add.f16x2 r1424, r1400, r1421; +} +{ +cvt.rn.f16.f64 rs202, fd288; +} +mov.b32 r1429, {rs202, rs202}; +{ +mul.f16x2 r1427, r1114, r1429; +} +{ +add.f16x2 r1430, r1406, r1427; +} +{ +cvt.rn.f16.f64 rs203, fd299; +} +mov.b32 r1435, {rs203, rs203}; +{ +mul.f16x2 r1433, r1108, r1435; +} +{ +add.f16x2 r1436, r1412, r1433; +} +{ +cvt.rn.f16.f64 rs204, fd288; +} +mov.b32 r1441, {rs204, rs204}; +{ +mul.f16x2 r1439, r1111, r1441; +} +{ +add.f16x2 r1442, r1418, r1439; +} +{ +cvt.rn.f16.f64 rs205, fd291; +} +mov.b32 r1447, {rs205, rs205}; +{ +mul.f16x2 r1445, r1117, r1447; +} +{ +add.f16x2 r1448, r1424, r1445; +} +{ +cvt.rn.f16.f64 rs206, fd272; +} +mov.b32 r1453, {rs206, rs206}; +{ +mul.f16x2 r1451, r1126, r1453; +} +{ +add.f16x2 r1454, r1430, r1451; +} +{ +cvt.rn.f16.f64 rs207, fd291; +} +mov.b32 r1459, {rs207, rs207}; +{ +mul.f16x2 r1457, r1120, r1459; 
+} +{ +add.f16x2 r1460, r1436, r1457; +} +{ +cvt.rn.f16.f64 rs208, fd272; +} +mov.b32 r1465, {rs208, rs208}; +{ +mul.f16x2 r1463, r1123, r1465; +} +{ +add.f16x2 r1466, r1442, r1463; +} +{ +sub.f16x2 r1469, r1448, r1454; +} +{ +add.f16x2 r1472, r1460, r1466; +} +{ +add.f16x2 r1475, r1448, r1454; +} +{ +sub.f16x2 r1478, r1460, r1466; +} +cvt.rn.f16.s32 rs209, r1956; +mov.b32 r1493, {rs209, rs209}; +cvt.rn.f16.s32 rs210, r1956; +mov.b32 r1505, {rs210, rs210}; +{ +cvt.rn.f16.f64 rs211, fd299; +} +mov.b32 r1485, {rs211, rs211}; +{ +mul.f16x2 r1483, r1057, r1485; +} +{ +add.f16x2 r1486, %100, r1483; +} +{ +cvt.rn.f16.f64 rs212, fd302; +} +mov.b32 r1491, {rs212, rs212}; +{ +mul.f16x2 r1489, r1066, r1491; +} +{ +add.f16x2 r1492, r1493, r1489; +} +{ +cvt.rn.f16.f64 rs213, fd299; +} +mov.b32 r1497, {rs213, rs213}; +{ +mul.f16x2 r1495, r1060, r1497; +} +{ +add.f16x2 r1498, %79, r1495; +} +{ +cvt.rn.f16.f64 rs214, fd302; +} +mov.b32 r1503, {rs214, rs214}; +{ +mul.f16x2 r1501, r1063, r1503; +} +{ +add.f16x2 r1504, r1505, r1501; +} +{ +cvt.rn.f16.f64 rs215, fd311; +} +mov.b32 r1509, {rs215, rs215}; +{ +mul.f16x2 r1507, r1069, r1509; +} +{ +add.f16x2 r1510, r1486, r1507; +} +{ +cvt.rn.f16.f64 rs216, fd312; +} +mov.b32 r1515, {rs216, rs216}; +{ +mul.f16x2 r1513, r1078, r1515; +} +{ +add.f16x2 r1516, r1492, r1513; +} +{ +cvt.rn.f16.f64 rs217, fd311; +} +mov.b32 r1521, {rs217, rs217}; +{ +mul.f16x2 r1519, r1072, r1521; +} +{ +add.f16x2 r1522, r1498, r1519; +} +{ +cvt.rn.f16.f64 rs218, fd312; +} +mov.b32 r1527, {rs218, rs218}; +{ +mul.f16x2 r1525, r1075, r1527; +} +{ +add.f16x2 r1528, r1504, r1525; +} +{ +cvt.rn.f16.f64 rs219, fd303; +} +mov.b32 r1533, {rs219, rs219}; +{ +mul.f16x2 r1531, r1081, r1533; +} +{ +add.f16x2 r1534, r1510, r1531; +} +{ +cvt.rn.f16.f64 rs220, fd204; +} +mov.b32 r1539, {rs220, rs220}; +{ +mul.f16x2 r1537, r1090, r1539; +} +{ +add.f16x2 r1540, r1516, r1537; +} +{ +cvt.rn.f16.f64 rs221, fd303; +} +mov.b32 r1545, {rs221, rs221}; +{ +mul.f16x2 r1543, r1084, 
r1545; +} +{ +add.f16x2 r1546, r1522, r1543; +} +{ +cvt.rn.f16.f64 rs222, fd204; +} +mov.b32 r1551, {rs222, rs222}; +{ +mul.f16x2 r1549, r1087, r1551; +} +{ +add.f16x2 r1552, r1528, r1549; +} +{ +cvt.rn.f16.f64 rs223, fd291; +} +mov.b32 r1557, {rs223, rs223}; +{ +mul.f16x2 r1555, r1093, r1557; +} +{ +add.f16x2 r1558, r1534, r1555; +} +{ +cvt.rn.f16.f64 rs224, fd272; +} +mov.b32 r1563, {rs224, rs224}; +{ +mul.f16x2 r1561, r1102, r1563; +} +{ +add.f16x2 r1564, r1540, r1561; +} +{ +cvt.rn.f16.f64 rs225, fd291; +} +mov.b32 r1569, {rs225, rs225}; +{ +mul.f16x2 r1567, r1096, r1569; +} +{ +add.f16x2 r1570, r1546, r1567; +} +{ +cvt.rn.f16.f64 rs226, fd272; +} +mov.b32 r1575, {rs226, rs226}; +{ +mul.f16x2 r1573, r1099, r1575; +} +{ +add.f16x2 r1576, r1552, r1573; +} +{ +cvt.rn.f16.f64 rs227, fd295; +} +mov.b32 r1581, {rs227, rs227}; +{ +mul.f16x2 r1579, r1105, r1581; +} +{ +add.f16x2 r1582, r1558, r1579; +} +{ +cvt.rn.f16.f64 rs228, fd306; +} +mov.b32 r1587, {rs228, rs228}; +{ +mul.f16x2 r1585, r1114, r1587; +} +{ +add.f16x2 r1588, r1564, r1585; +} +{ +cvt.rn.f16.f64 rs229, fd295; +} +mov.b32 r1593, {rs229, rs229}; +{ +mul.f16x2 r1591, r1108, r1593; +} +{ +add.f16x2 r1594, r1570, r1591; +} +{ +cvt.rn.f16.f64 rs230, fd306; +} +mov.b32 r1599, {rs230, rs230}; +{ +mul.f16x2 r1597, r1111, r1599; +} +{ +add.f16x2 r1600, r1576, r1597; +} +{ +cvt.rn.f16.f64 rs231, fd307; +} +mov.b32 r1605, {rs231, rs231}; +{ +mul.f16x2 r1603, r1117, r1605; +} +{ +add.f16x2 r1606, r1582, r1603; +} +{ +cvt.rn.f16.f64 rs232, fd308; +} +mov.b32 r1611, {rs232, rs232}; +{ +mul.f16x2 r1609, r1126, r1611; +} +{ +add.f16x2 r1612, r1588, r1609; +} +{ +cvt.rn.f16.f64 rs233, fd307; +} +mov.b32 r1617, {rs233, rs233}; +{ +mul.f16x2 r1615, r1120, r1617; +} +{ +add.f16x2 r1618, r1594, r1615; +} +{ +cvt.rn.f16.f64 rs234, fd308; +} +mov.b32 r1623, {rs234, rs234}; +{ +mul.f16x2 r1621, r1123, r1623; +} +{ +add.f16x2 r1624, r1600, r1621; +} +{ +sub.f16x2 r1627, r1606, r1612; +} +{ +add.f16x2 r1630, r1618, r1624; +} +{ 
+add.f16x2 r1633, r1606, r1612; +} +{ +sub.f16x2 r1636, r1618, r1624; +} +cvt.rn.f16.s32 rs235, r1956; +mov.b32 r1651, {rs235, rs235}; +cvt.rn.f16.s32 rs236, r1956; +mov.b32 r1663, {rs236, rs236}; +{ +cvt.rn.f16.f64 rs237, fd303; +} +mov.b32 r1643, {rs237, rs237}; +{ +mul.f16x2 r1641, r1057, r1643; +} +{ +add.f16x2 r1644, %100, r1641; +} +{ +cvt.rn.f16.f64 rs238, fd304; +} +mov.b32 r1649, {rs238, rs238}; +{ +mul.f16x2 r1647, r1066, r1649; +} +{ +add.f16x2 r1650, r1651, r1647; +} +{ +cvt.rn.f16.f64 rs239, fd303; +} +mov.b32 r1655, {rs239, rs239}; +{ +mul.f16x2 r1653, r1060, r1655; +} +{ +add.f16x2 r1656, %79, r1653; +} +{ +cvt.rn.f16.f64 rs240, fd304; +} +mov.b32 r1661, {rs240, rs240}; +{ +mul.f16x2 r1659, r1063, r1661; +} +{ +add.f16x2 r1662, r1663, r1659; +} +{ +cvt.rn.f16.f64 rs241, fd307; +} +mov.b32 r1667, {rs241, rs241}; +{ +mul.f16x2 r1665, r1069, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs242, fd224; +} +mov.b32 r1673, {rs242, rs242}; +{ +mul.f16x2 r1671, r1078, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs243, fd307; +} +mov.b32 r1679, {rs243, rs243}; +{ +mul.f16x2 r1677, r1072, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs244, fd224; +} +mov.b32 r1685, {rs244, rs244}; +{ +mul.f16x2 r1683, r1075, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ +cvt.rn.f16.f64 rs245, fd291; +} +mov.b32 r1691, {rs245, rs245}; +{ +mul.f16x2 r1689, r1081, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs246, fd272; +} +mov.b32 r1697, {rs246, rs246}; +{ +mul.f16x2 r1695, r1090, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs247, fd291; +} +mov.b32 r1703, {rs247, rs247}; +{ +mul.f16x2 r1701, r1084, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs248, fd272; +} +mov.b32 r1709, {rs248, rs248}; +{ +mul.f16x2 r1707, r1087, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +{ +cvt.rn.f16.f64 rs249, fd299; +} +mov.b32 r1715, {rs249, rs249}; +{ 
+mul.f16x2 r1713, r1093, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs250, fd302; +} +mov.b32 r1721, {rs250, rs250}; +{ +mul.f16x2 r1719, r1102, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs251, fd299; +} +mov.b32 r1727, {rs251, rs251}; +{ +mul.f16x2 r1725, r1096, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs252, fd302; +} +mov.b32 r1733, {rs252, rs252}; +{ +mul.f16x2 r1731, r1099, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} +{ +cvt.rn.f16.f64 rs253, fd311; +} +mov.b32 r1739, {rs253, rs253}; +{ +mul.f16x2 r1737, r1105, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs254, fd256; +} +mov.b32 r1745, {rs254, rs254}; +{ +mul.f16x2 r1743, r1114, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs255, fd311; +} +mov.b32 r1751, {rs255, rs255}; +{ +mul.f16x2 r1749, r1108, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +cvt.rn.f16.f64 rs256, fd256; +} +mov.b32 r1757, {rs256, rs256}; +{ +mul.f16x2 r1755, r1111, r1757; +} +{ +add.f16x2 r1758, r1734, r1755; +} +{ +cvt.rn.f16.f64 rs257, fd295; +} +mov.b32 r1763, {rs257, rs257}; +{ +mul.f16x2 r1761, r1117, r1763; +} +{ +add.f16x2 r1764, r1740, r1761; +} +{ +cvt.rn.f16.f64 rs258, fd280; +} +mov.b32 r1769, {rs258, rs258}; +{ +mul.f16x2 r1767, r1126, r1769; +} +{ +add.f16x2 r1770, r1746, r1767; +} +{ +cvt.rn.f16.f64 rs259, fd295; +} +mov.b32 r1775, {rs259, rs259}; +{ +mul.f16x2 r1773, r1120, r1775; +} +{ +add.f16x2 r1776, r1752, r1773; +} +{ +cvt.rn.f16.f64 rs260, fd280; +} +mov.b32 r1781, {rs260, rs260}; +{ +mul.f16x2 r1779, r1123, r1781; +} +{ +add.f16x2 r1782, r1758, r1779; +} +{ +sub.f16x2 r1785, r1764, r1770; +} +{ +add.f16x2 r1788, r1776, r1782; +} +{ +add.f16x2 r1791, r1764, r1770; +} +{ +sub.f16x2 r1794, r1776, r1782; +} +cvt.rn.f16.s32 rs261, r1956; +mov.b32 r1809, {rs261, rs261}; +cvt.rn.f16.s32 rs262, r1956; +mov.b32 r1821, {rs262, rs262}; +{ +cvt.rn.f16.f64 rs263, fd307; +} +mov.b32 r1801, {rs263, 
rs263}; +{ +mul.f16x2 r1799, r1057, r1801; +} +{ +add.f16x2 r1802, %100, r1799; +} +{ +cvt.rn.f16.f64 rs264, fd308; +} +mov.b32 r1807, {rs264, rs264}; +{ +mul.f16x2 r1805, r1066, r1807; +} +{ +add.f16x2 r1808, r1809, r1805; +} +{ +cvt.rn.f16.f64 rs265, fd307; +} +mov.b32 r1813, {rs265, rs265}; +{ +mul.f16x2 r1811, r1060, r1813; +} +{ +add.f16x2 r1814, %79, r1811; +} +{ +cvt.rn.f16.f64 rs266, fd308; +} +mov.b32 r1819, {rs266, rs266}; +{ +mul.f16x2 r1817, r1063, r1819; +} +{ +add.f16x2 r1820, r1821, r1817; +} +{ +cvt.rn.f16.f64 rs267, fd299; +} +mov.b32 r1825, {rs267, rs267}; +{ +mul.f16x2 r1823, r1069, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs268, fd288; +} +mov.b32 r1831, {rs268, rs268}; +{ +mul.f16x2 r1829, r1078, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs269, fd299; +} +mov.b32 r1837, {rs269, rs269}; +{ +mul.f16x2 r1835, r1072, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs270, fd288; +} +mov.b32 r1843, {rs270, rs270}; +{ +mul.f16x2 r1841, r1075, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs271, fd295; +} +mov.b32 r1849, {rs271, rs271}; +{ +mul.f16x2 r1847, r1081, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs272, fd306; +} +mov.b32 r1855, {rs272, rs272}; +{ +mul.f16x2 r1853, r1090, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs273, fd295; +} +mov.b32 r1861, {rs273, rs273}; +{ +mul.f16x2 r1859, r1084, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs274, fd306; +} +mov.b32 r1867, {rs274, rs274}; +{ +mul.f16x2 r1865, r1087, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs275, fd311; +} +mov.b32 r1873, {rs275, rs275}; +{ +mul.f16x2 r1871, r1093, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs276, fd256; +} +mov.b32 r1879, {rs276, rs276}; +{ +mul.f16x2 r1877, r1102, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs277, fd311; +} +mov.b32 
r1885, {rs277, rs277}; +{ +mul.f16x2 r1883, r1096, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs278, fd256; +} +mov.b32 r1891, {rs278, rs278}; +{ +mul.f16x2 r1889, r1099, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs279, fd291; +} +mov.b32 r1897, {rs279, rs279}; +{ +mul.f16x2 r1895, r1105, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs280, fd272; +} +mov.b32 r1903, {rs280, rs280}; +{ +mul.f16x2 r1901, r1114, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs281, fd291; +} +mov.b32 r1909, {rs281, rs281}; +{ +mul.f16x2 r1907, r1108, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs282, fd272; +} +mov.b32 r1915, {rs282, rs282}; +{ +mul.f16x2 r1913, r1111, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs283, fd303; +} +mov.b32 r1921, {rs283, rs283}; +{ +mul.f16x2 r1919, r1117, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs284, fd304; +} +mov.b32 r1927, {rs284, rs284}; +{ +mul.f16x2 r1925, r1126, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs285, fd303; +} +mov.b32 r1933, {rs285, rs285}; +{ +mul.f16x2 r1931, r1120, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs286, fd304; +} +mov.b32 r1939, {rs286, rs286}; +{ +mul.f16x2 r1937, r1123, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +sub.f16x2 r1943, r1922, r1928; +} +{ +add.f16x2 r1946, r1934, r1940; +} +{ +add.f16x2 r1949, r1922, r1928; +} +{ +sub.f16x2 r1952, r1934, r1940; +} +cvt.rn.f16.s32 rs287, r1956; +mov.b32 r1967, {rs287, rs287}; +cvt.rn.f16.s32 rs288, r1956; +mov.b32 r1979, {rs288, rs288}; +{ +cvt.rn.f16.f64 rs289, fd311; +} +mov.b32 r1959, {rs289, rs289}; +{ +mul.f16x2 r1957, r1057, r1959; +} +{ +add.f16x2 r1960, %100, r1957; +} +{ +cvt.rn.f16.f64 rs290, fd312; +} +mov.b32 r1965, {rs290, rs290}; +{ +mul.f16x2 r1963, r1066, r1965; +} +{ +add.f16x2 r1966, r1967, r1963; +} +{ +cvt.rn.f16.f64 rs291, fd311; +} 
+mov.b32 r1971, {rs291, rs291}; +{ +mul.f16x2 r1969, r1060, r1971; +} +{ +add.f16x2 r1972, %79, r1969; +} +{ +cvt.rn.f16.f64 rs292, fd312; +} +mov.b32 r1977, {rs292, rs292}; +{ +mul.f16x2 r1975, r1063, r1977; +} +{ +add.f16x2 r1978, r1979, r1975; +} +{ +cvt.rn.f16.f64 rs293, fd291; +} +mov.b32 r1983, {rs293, rs293}; +{ +mul.f16x2 r1981, r1069, r1983; +} +{ +add.f16x2 r1984, r1960, r1981; +} +{ +cvt.rn.f16.f64 rs294, fd272; +} +mov.b32 r1989, {rs294, rs294}; +{ +mul.f16x2 r1987, r1078, r1989; +} +{ +add.f16x2 r1990, r1966, r1987; +} +{ +cvt.rn.f16.f64 rs295, fd291; +} +mov.b32 r1995, {rs295, rs295}; +{ +mul.f16x2 r1993, r1072, r1995; +} +{ +add.f16x2 r1996, r1972, r1993; +} +{ +cvt.rn.f16.f64 rs296, fd272; +} +mov.b32 r2001, {rs296, rs296}; +{ +mul.f16x2 r1999, r1075, r2001; +} +{ +add.f16x2 r2002, r1978, r1999; +} +{ +cvt.rn.f16.f64 rs297, fd307; +} +mov.b32 r2007, {rs297, rs297}; +{ +mul.f16x2 r2005, r1081, r2007; +} +{ +add.f16x2 r2008, r1984, r2005; +} +{ +cvt.rn.f16.f64 rs298, fd308; +} +mov.b32 r2013, {rs298, rs298}; +{ +mul.f16x2 r2011, r1090, r2013; +} +{ +add.f16x2 r2014, r1990, r2011; +} +{ +cvt.rn.f16.f64 rs299, fd307; +} +mov.b32 r2019, {rs299, rs299}; +{ +mul.f16x2 r2017, r1084, r2019; +} +{ +add.f16x2 r2020, r1996, r2017; +} +{ +cvt.rn.f16.f64 rs300, fd308; +} +mov.b32 r2025, {rs300, rs300}; +{ +mul.f16x2 r2023, r1087, r2025; +} +{ +add.f16x2 r2026, r2002, r2023; +} +{ +cvt.rn.f16.f64 rs301, fd295; +} +mov.b32 r2031, {rs301, rs301}; +{ +mul.f16x2 r2029, r1093, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs302, fd280; +} +mov.b32 r2037, {rs302, rs302}; +{ +mul.f16x2 r2035, r1102, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs303, fd295; +} +mov.b32 r2043, {rs303, rs303}; +{ +mul.f16x2 r2041, r1096, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs304, fd280; +} +mov.b32 r2049, {rs304, rs304}; +{ +mul.f16x2 r2047, r1099, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 
rs305, fd303; +} +mov.b32 r2055, {rs305, rs305}; +{ +mul.f16x2 r2053, r1105, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs306, fd304; +} +mov.b32 r2061, {rs306, rs306}; +{ +mul.f16x2 r2059, r1114, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs307, fd303; +} +mov.b32 r2067, {rs307, rs307}; +{ +mul.f16x2 r2065, r1108, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; +} +{ +cvt.rn.f16.f64 rs308, fd304; +} +mov.b32 r2073, {rs308, rs308}; +{ +mul.f16x2 r2071, r1111, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs309, fd299; +} +mov.b32 r2079, {rs309, rs309}; +{ +mul.f16x2 r2077, r1117, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs310, fd288; +} +mov.b32 r2085, {rs310, rs310}; +{ +mul.f16x2 r2083, r1126, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs311, fd299; +} +mov.b32 r2091, {rs311, rs311}; +{ +mul.f16x2 r2089, r1120, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs312, fd288; +} +mov.b32 r2097, {rs312, rs312}; +{ +mul.f16x2 r2095, r1123, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +sub.f16x2 r2101, r2080, r2086; +} +{ +add.f16x2 r2104, r2092, r2098; +} +{ +add.f16x2 r2107, r2080, r2086; +} +{ +sub.f16x2 r2110, r2092, r2098; +} +mov.f64 fd289, 0d3FEF11F493053D00; +{ +cvt.rn.f16.f64 rs313, fd289; +} +{ +cvt.rn.f16.f64 rs314, fd312; +} +{ +cvt.rn.f16.f64 rs315, fd291; +} +{ +cvt.rn.f16.f64 rs316, fd310; +} +mov.f64 fd293, 0d3FE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs317, fd293; +} +{ +cvt.rn.f16.f64 rs318, fd308; +} +{ +cvt.rn.f16.f64 rs319, fd295; +} +{ +cvt.rn.f16.f64 rs320, fd306; +} +mov.f64 fd297, 0d3FD6B1D8B2365DA1; +{ +cvt.rn.f16.f64 rs321, fd297; +} +{ +cvt.rn.f16.f64 rs322, fd304; +} +{ +cvt.rn.f16.f64 rs323, fd299; +} +{ +cvt.rn.f16.f64 rs324, fd302; +} +mov.f64 fd301, 0dBFBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs325, fd301; +} +{ +cvt.rn.f16.f64 rs326, fd302; +} +{ +cvt.rn.f16.f64 rs327, fd303; +} +{ +cvt.rn.f16.f64 rs328, 
fd304; +} +mov.f64 fd305, 0dBFE22D961EA71119; +{ +cvt.rn.f16.f64 rs329, fd305; +} +{ +cvt.rn.f16.f64 rs330, fd306; +} +{ +cvt.rn.f16.f64 rs331, fd307; +} +{ +cvt.rn.f16.f64 rs332, fd308; +} +mov.f64 fd309, 0dBFEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs333, fd309; +} +{ +cvt.rn.f16.f64 rs334, fd310; +} +{ +cvt.rn.f16.f64 rs335, fd311; +} +{ +cvt.rn.f16.f64 rs336, fd312; +} +mov.b32 r2127, {rs313, rs313}; +{ +mul.f16x2 r2113, r1311, r2127; +} +mov.b32 r2124, {rs314, rs314}; +{ +mul.f16x2 r2116, r1314, r2124; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r1311, r2124; +} +{ +fma.rn.f16x2 r2125, r1314, r2127, r2122; +} +mov.b32 r2143, {rs315, rs315}; +{ +mul.f16x2 r2129, r1469, r2143; +} +mov.b32 r2140, {rs316, rs316}; +{ +mul.f16x2 r2132, r1472, r2140; +} +{ +sub.f16x2 r2135, r2129, r2132; +} +{ +mul.f16x2 r2138, r1469, r2140; +} +{ +fma.rn.f16x2 r2141, r1472, r2143, r2138; +} +mov.b32 r2159, {rs317, rs317}; +{ +mul.f16x2 r2145, r1627, r2159; +} +mov.b32 r2156, {rs318, rs318}; +{ +mul.f16x2 r2148, r1630, r2156; +} +{ +sub.f16x2 r2151, r2145, r2148; +} +{ +mul.f16x2 r2154, r1627, r2156; +} +{ +fma.rn.f16x2 r2157, r1630, r2159, r2154; +} +mov.b32 r2175, {rs319, rs319}; +{ +mul.f16x2 r2161, r1785, r2175; +} +mov.b32 r2172, {rs320, rs320}; +{ +mul.f16x2 r2164, r1788, r2172; +} +{ +sub.f16x2 r2167, r2161, r2164; +} +{ +mul.f16x2 r2170, r1785, r2172; +} +{ +fma.rn.f16x2 r2173, r1788, r2175, r2170; +} +mov.b32 r2191, {rs321, rs321}; +{ +mul.f16x2 r2177, r1943, r2191; +} +mov.b32 r2188, {rs322, rs322}; +{ +mul.f16x2 r2180, r1946, r2188; +} +{ +sub.f16x2 r2183, r2177, r2180; +} +{ +mul.f16x2 r2186, r1943, r2188; +} +{ +fma.rn.f16x2 r2189, r1946, r2191, r2186; +} +mov.b32 r2207, {rs323, rs323}; +{ +mul.f16x2 r2193, r2101, r2207; +} +mov.b32 r2204, {rs324, rs324}; +{ +mul.f16x2 r2196, r2104, r2204; +} +{ +sub.f16x2 r2199, r2193, r2196; +} +{ +mul.f16x2 r2202, r2101, r2204; +} +{ +fma.rn.f16x2 r2205, r2104, r2207, r2202; +} +mov.b32 r2223, {rs325, rs325}; +{ +mul.f16x2 
r2209, r2107, r2223; +} +mov.b32 r2220, {rs326, rs326}; +{ +mul.f16x2 r2212, r2110, r2220; +} +{ +sub.f16x2 r2215, r2209, r2212; +} +{ +mul.f16x2 r2218, r2107, r2220; +} +{ +fma.rn.f16x2 r2221, r2110, r2223, r2218; +} +mov.b32 r2239, {rs327, rs327}; +{ +mul.f16x2 r2225, r1949, r2239; +} +mov.b32 r2236, {rs328, rs328}; +{ +mul.f16x2 r2228, r1952, r2236; +} +{ +sub.f16x2 r2231, r2225, r2228; +} +{ +mul.f16x2 r2234, r1949, r2236; +} +{ +fma.rn.f16x2 r2237, r1952, r2239, r2234; +} +mov.b32 r2255, {rs329, rs329}; +{ +mul.f16x2 r2241, r1791, r2255; +} +mov.b32 r2252, {rs330, rs330}; +{ +mul.f16x2 r2244, r1794, r2252; +} +{ +sub.f16x2 r2247, r2241, r2244; +} +{ +mul.f16x2 r2250, r1791, r2252; +} +{ +fma.rn.f16x2 r2253, r1794, r2255, r2250; +} +mov.b32 r2271, {rs331, rs331}; +{ +mul.f16x2 r2257, r1633, r2271; +} +mov.b32 r2268, {rs332, rs332}; +{ +mul.f16x2 r2260, r1636, r2268; +} +{ +sub.f16x2 r2263, r2257, r2260; +} +{ +mul.f16x2 r2266, r1633, r2268; +} +{ +fma.rn.f16x2 r2269, r1636, r2271, r2266; +} +mov.b32 r2287, {rs333, rs333}; +{ +mul.f16x2 r2273, r1475, r2287; +} +mov.b32 r2284, {rs334, rs334}; +{ +mul.f16x2 r2276, r1478, r2284; +} +{ +sub.f16x2 r2279, r2273, r2276; +} +{ +mul.f16x2 r2282, r1475, r2284; +} +{ +fma.rn.f16x2 r2285, r1478, r2287, r2282; +} +mov.b32 r2303, {rs335, rs335}; +{ +mul.f16x2 r2289, r1317, r2303; +} +mov.b32 r2300, {rs336, rs336}; +{ +mul.f16x2 r2292, r1320, r2300; +} +{ +sub.f16x2 r2295, r2289, r2292; +} +{ +mul.f16x2 r2298, r1317, r2300; +} +{ +fma.rn.f16x2 r2301, r1320, r2303, r2298; +} +{ +add.f16x2 %0, r103, r1159; +} +{ +add.f16x2 %1, r106, r1162; +} +{ +sub.f16x2 %26, r103, r1159; +} +{ +sub.f16x2 %27, r106, r1162; +} +{ +add.f16x2 %2, r255, r2119; +} +{ +add.f16x2 %3, r258, r2125; +} +{ +sub.f16x2 %28, r255, r2119; +} +{ +sub.f16x2 %29, r258, r2125; +} +{ +add.f16x2 %4, r413, r2135; +} +{ +add.f16x2 %5, r416, r2141; +} +{ +sub.f16x2 %30, r413, r2135; +} +{ +sub.f16x2 %31, r416, r2141; +} +{ +add.f16x2 %6, r571, r2151; +} +{ +add.f16x2 
%7, r574, r2157; +} +{ +sub.f16x2 %32, r571, r2151; +} +{ +sub.f16x2 %33, r574, r2157; +} +{ +add.f16x2 %8, r729, r2167; +} +{ +add.f16x2 %9, r732, r2173; +} +{ +sub.f16x2 %34, r729, r2167; +} +{ +sub.f16x2 %35, r732, r2173; +} +{ +add.f16x2 %10, r887, r2183; +} +{ +add.f16x2 %11, r890, r2189; +} +{ +sub.f16x2 %36, r887, r2183; +} +{ +sub.f16x2 %37, r890, r2189; +} +{ +add.f16x2 %12, r1045, r2199; +} +{ +add.f16x2 %13, r1048, r2205; +} +{ +sub.f16x2 %38, r1045, r2199; +} +{ +sub.f16x2 %39, r1048, r2205; +} +{ +add.f16x2 %14, r1051, r2215; +} +{ +add.f16x2 %15, r1054, r2221; +} +{ +sub.f16x2 %40, r1051, r2215; +} +{ +sub.f16x2 %41, r1054, r2221; +} +{ +add.f16x2 %16, r893, r2231; +} +{ +add.f16x2 %17, r896, r2237; +} +{ +sub.f16x2 %42, r893, r2231; +} +{ +sub.f16x2 %43, r896, r2237; +} +{ +add.f16x2 %18, r735, r2247; +} +{ +add.f16x2 %19, r738, r2253; +} +{ +sub.f16x2 %44, r735, r2247; +} +{ +sub.f16x2 %45, r738, r2253; +} +{ +add.f16x2 %20, r577, r2263; +} +{ +add.f16x2 %21, r580, r2269; +} +{ +sub.f16x2 %46, r577, r2263; +} +{ +sub.f16x2 %47, r580, r2269; +} +{ +add.f16x2 %22, r419, r2279; +} +{ +add.f16x2 %23, r422, r2285; +} +{ +sub.f16x2 %48, r419, r2279; +} +{ +sub.f16x2 %49, r422, r2285; +} +{ +add.f16x2 %24, r261, r2295; +} +{ +add.f16x2 %25, r264, r2301; +} +{ +sub.f16x2 %50, r261, r2295; +} +{ +sub.f16x2 %51, r264, r2301; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), 
"=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[1].y)), 
"r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[23].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..f5cbdcf94461a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp16_inv.hpp.inc @@ -0,0 +1,3763 @@ +#ifndef CUFFTDX_FFT_26_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_26_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<959, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<363>; +.reg .b32 r<2487>; +.reg .f64 fd<339>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %54, %76; +} +{ +add.f16x2 r4, %55, %77; +} +{ +sub.f16x2 r7, %54, %76; +} +{ +sub.f16x2 r10, %55, %77; +} +{ +add.f16x2 r13, %56, %74; +} +{ +add.f16x2 r16, %57, %75; +} +{ +sub.f16x2 r19, %56, %74; +} +{ +sub.f16x2 r22, %57, %75; +} +{ +add.f16x2 r25, %58, %72; +} +{ +add.f16x2 r28, %59, %73; +} +{ +sub.f16x2 r31, %58, %72; +} +{ +sub.f16x2 r34, %59, %73; +} +{ +add.f16x2 r37, %60, 
%70; +} +{ +add.f16x2 r40, %61, %71; +} +{ +sub.f16x2 r43, %60, %70; +} +{ +sub.f16x2 r46, %61, %71; +} +{ +add.f16x2 r49, %62, %68; +} +{ +add.f16x2 r52, %63, %69; +} +{ +sub.f16x2 r55, %62, %68; +} +{ +sub.f16x2 r58, %63, %69; +} +{ +add.f16x2 r61, %64, %66; +} +{ +add.f16x2 r64, %65, %67; +} +{ +sub.f16x2 r67, %64, %66; +} +{ +sub.f16x2 r70, %65, %67; +} +{ +add.f16x2 r73, %52, r1; +} +{ +add.f16x2 r76, %53, r4; +} +{ +add.f16x2 r79, r73, r13; +} +{ +add.f16x2 r82, r76, r16; +} +{ +add.f16x2 r85, r79, r25; +} +{ +add.f16x2 r88, r82, r28; +} +{ +add.f16x2 r91, r85, r37; +} +{ +add.f16x2 r94, r88, r40; +} +{ +add.f16x2 r97, r91, r49; +} +{ +add.f16x2 r100, r94, r52; +} +{ +add.f16x2 r103, r97, r61; +} +{ +add.f16x2 r106, r100, r64; +} +mov.u32 r1956, 0; +cvt.rn.f16.s32 rs1, r1956; +mov.b32 r121, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r1956; +mov.b32 r133, {rs2, rs2}; +mov.f64 fd291, 0d3FEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs3, fd291; +} +mov.b32 r113, {rs3, rs3}; +{ +mul.f16x2 r111, r1, r113; +} +{ +add.f16x2 r114, %52, r111; +} +mov.f64 fd310, 0d3FDDBE064267C47C; +{ +cvt.rn.f16.f64 rs4, fd310; +} +mov.b32 r119, {rs4, rs4}; +{ +mul.f16x2 r117, r10, r119; +} +{ +add.f16x2 r120, r121, r117; +} +{ +cvt.rn.f16.f64 rs5, fd291; +} +mov.b32 r125, {rs5, rs5}; +{ +mul.f16x2 r123, r4, r125; +} +{ +add.f16x2 r126, %53, r123; +} +{ +cvt.rn.f16.f64 rs6, fd310; +} +mov.b32 r131, {rs6, rs6}; +{ +mul.f16x2 r129, r7, r131; +} +{ +add.f16x2 r132, r133, r129; +} +mov.f64 fd295, 0d3FE22D961EA71119; +{ +cvt.rn.f16.f64 rs7, fd295; +} +mov.b32 r137, {rs7, rs7}; +{ +mul.f16x2 r135, r13, r137; +} +{ +add.f16x2 r138, r114, r135; +} +mov.f64 fd306, 0d3FEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs8, fd306; +} +mov.b32 r143, {rs8, rs8}; +{ +mul.f16x2 r141, r22, r143; +} +{ +add.f16x2 r144, r120, r141; +} +{ +cvt.rn.f16.f64 rs9, fd295; +} +mov.b32 r149, {rs9, rs9}; +{ +mul.f16x2 r147, r16, r149; +} +{ +add.f16x2 r150, r126, r147; +} +{ +cvt.rn.f16.f64 rs10, fd306; +} +mov.b32 r155, {rs10, rs10}; +{ 
+mul.f16x2 r153, r19, r155; +} +{ +add.f16x2 r156, r132, r153; +} +mov.f64 fd299, 0d3FBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs11, fd299; +} +mov.b32 r161, {rs11, rs11}; +{ +mul.f16x2 r159, r25, r161; +} +{ +add.f16x2 r162, r138, r159; +} +mov.f64 fd302, 0d3FEFC44566966769; +{ +cvt.rn.f16.f64 rs12, fd302; +} +mov.b32 r167, {rs12, rs12}; +{ +mul.f16x2 r165, r34, r167; +} +{ +add.f16x2 r168, r144, r165; +} +{ +cvt.rn.f16.f64 rs13, fd299; +} +mov.b32 r173, {rs13, rs13}; +{ +mul.f16x2 r171, r28, r173; +} +{ +add.f16x2 r174, r150, r171; +} +{ +cvt.rn.f16.f64 rs14, fd302; +} +mov.b32 r179, {rs14, rs14}; +{ +mul.f16x2 r177, r31, r179; +} +{ +add.f16x2 r180, r156, r177; +} +mov.f64 fd303, 0dBFD6B1D8B2365DA1; +{ +cvt.rn.f16.f64 rs15, fd303; +} +mov.b32 r185, {rs15, rs15}; +{ +mul.f16x2 r183, r37, r185; +} +{ +add.f16x2 r186, r162, r183; +} +mov.f64 fd304, 0d3FEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs16, fd304; +} +mov.b32 r191, {rs16, rs16}; +{ +mul.f16x2 r189, r46, r191; +} +{ +add.f16x2 r192, r168, r189; +} +{ +cvt.rn.f16.f64 rs17, fd303; +} +mov.b32 r197, {rs17, rs17}; +{ +mul.f16x2 r195, r40, r197; +} +{ +add.f16x2 r198, r174, r195; +} +{ +cvt.rn.f16.f64 rs18, fd304; +} +mov.b32 r203, {rs18, rs18}; +{ +mul.f16x2 r201, r43, r203; +} +{ +add.f16x2 r204, r180, r201; +} +mov.f64 fd307, 0dBFE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs19, fd307; +} +mov.b32 r209, {rs19, rs19}; +{ +mul.f16x2 r207, r49, r209; +} +{ +add.f16x2 r210, r186, r207; +} +mov.f64 fd308, 0d3FE5384D024C2F84; +{ +cvt.rn.f16.f64 rs20, fd308; +} +mov.b32 r215, {rs20, rs20}; +{ +mul.f16x2 r213, r58, r215; +} +{ +add.f16x2 r216, r192, r213; +} +{ +cvt.rn.f16.f64 rs21, fd307; +} +mov.b32 r221, {rs21, rs21}; +{ +mul.f16x2 r219, r52, r221; +} +{ +add.f16x2 r222, r198, r219; +} +{ +cvt.rn.f16.f64 rs22, fd308; +} +mov.b32 r227, {rs22, rs22}; +{ +mul.f16x2 r225, r55, r227; +} +{ +add.f16x2 r228, r204, r225; +} +mov.f64 fd311, 0dBFEF11F493053D00; +{ +cvt.rn.f16.f64 rs23, fd311; +} +mov.b32 r233, {rs23, rs23}; +{ +mul.f16x2 r231, 
r61, r233; +} +{ +add.f16x2 r234, r210, r231; +} +mov.f64 fd312, 0d3FCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs24, fd312; +} +mov.b32 r239, {rs24, rs24}; +{ +mul.f16x2 r237, r70, r239; +} +{ +add.f16x2 r240, r216, r237; +} +{ +cvt.rn.f16.f64 rs25, fd311; +} +mov.b32 r245, {rs25, rs25}; +{ +mul.f16x2 r243, r64, r245; +} +{ +add.f16x2 r246, r222, r243; +} +{ +cvt.rn.f16.f64 rs26, fd312; +} +mov.b32 r251, {rs26, rs26}; +{ +mul.f16x2 r249, r67, r251; +} +{ +add.f16x2 r252, r228, r249; +} +{ +sub.f16x2 r255, r234, r240; +} +{ +add.f16x2 r258, r246, r252; +} +{ +add.f16x2 r261, r234, r240; +} +{ +sub.f16x2 r264, r246, r252; +} +cvt.rn.f16.s32 rs27, r1956; +mov.b32 r279, {rs27, rs27}; +cvt.rn.f16.s32 rs28, r1956; +mov.b32 r291, {rs28, rs28}; +{ +cvt.rn.f16.f64 rs29, fd295; +} +mov.b32 r271, {rs29, rs29}; +{ +mul.f16x2 r269, r1, r271; +} +{ +add.f16x2 r272, %52, r269; +} +{ +cvt.rn.f16.f64 rs30, fd306; +} +mov.b32 r277, {rs30, rs30}; +{ +mul.f16x2 r275, r10, r277; +} +{ +add.f16x2 r278, r279, r275; +} +{ +cvt.rn.f16.f64 rs31, fd295; +} +mov.b32 r283, {rs31, rs31}; +{ +mul.f16x2 r281, r4, r283; +} +{ +add.f16x2 r284, %53, r281; +} +{ +cvt.rn.f16.f64 rs32, fd306; +} +mov.b32 r289, {rs32, rs32}; +{ +mul.f16x2 r287, r7, r289; +} +{ +add.f16x2 r290, r291, r287; +} +{ +cvt.rn.f16.f64 rs33, fd303; +} +mov.b32 r295, {rs33, rs33}; +{ +mul.f16x2 r293, r13, r295; +} +{ +add.f16x2 r296, r272, r293; +} +{ +cvt.rn.f16.f64 rs34, fd304; +} +mov.b32 r301, {rs34, rs34}; +{ +mul.f16x2 r299, r22, r301; +} +{ +add.f16x2 r302, r278, r299; +} +{ +cvt.rn.f16.f64 rs35, fd303; +} +mov.b32 r307, {rs35, rs35}; +{ +mul.f16x2 r305, r16, r307; +} +{ +add.f16x2 r308, r284, r305; +} +{ +cvt.rn.f16.f64 rs36, fd304; +} +mov.b32 r313, {rs36, rs36}; +{ +mul.f16x2 r311, r19, r313; +} +{ +add.f16x2 r314, r290, r311; +} +{ +cvt.rn.f16.f64 rs37, fd311; +} +mov.b32 r319, {rs37, rs37}; +{ +mul.f16x2 r317, r25, r319; +} +{ +add.f16x2 r320, r296, r317; +} +{ +cvt.rn.f16.f64 rs38, fd312; +} +mov.b32 r325, {rs38, rs38}; +{ 
+mul.f16x2 r323, r34, r325; +} +{ +add.f16x2 r326, r302, r323; +} +{ +cvt.rn.f16.f64 rs39, fd311; +} +mov.b32 r331, {rs39, rs39}; +{ +mul.f16x2 r329, r28, r331; +} +{ +add.f16x2 r332, r308, r329; +} +{ +cvt.rn.f16.f64 rs40, fd312; +} +mov.b32 r337, {rs40, rs40}; +{ +mul.f16x2 r335, r31, r337; +} +{ +add.f16x2 r338, r314, r335; +} +{ +cvt.rn.f16.f64 rs41, fd307; +} +mov.b32 r343, {rs41, rs41}; +{ +mul.f16x2 r341, r37, r343; +} +{ +add.f16x2 r344, r320, r341; +} +mov.f64 fd224, 0dBFE5384D024C2F84; +{ +cvt.rn.f16.f64 rs42, fd224; +} +mov.b32 r349, {rs42, rs42}; +{ +mul.f16x2 r347, r46, r349; +} +{ +add.f16x2 r350, r326, r347; +} +{ +cvt.rn.f16.f64 rs43, fd307; +} +mov.b32 r355, {rs43, rs43}; +{ +mul.f16x2 r353, r40, r355; +} +{ +add.f16x2 r356, r332, r353; +} +{ +cvt.rn.f16.f64 rs44, fd224; +} +mov.b32 r361, {rs44, rs44}; +{ +mul.f16x2 r359, r43, r361; +} +{ +add.f16x2 r362, r338, r359; +} +{ +cvt.rn.f16.f64 rs45, fd299; +} +mov.b32 r367, {rs45, rs45}; +{ +mul.f16x2 r365, r49, r367; +} +{ +add.f16x2 r368, r344, r365; +} +mov.f64 fd288, 0dBFEFC44566966769; +{ +cvt.rn.f16.f64 rs46, fd288; +} +mov.b32 r373, {rs46, rs46}; +{ +mul.f16x2 r371, r58, r373; +} +{ +add.f16x2 r374, r350, r371; +} +{ +cvt.rn.f16.f64 rs47, fd299; +} +mov.b32 r379, {rs47, rs47}; +{ +mul.f16x2 r377, r52, r379; +} +{ +add.f16x2 r380, r356, r377; +} +{ +cvt.rn.f16.f64 rs48, fd288; +} +mov.b32 r385, {rs48, rs48}; +{ +mul.f16x2 r383, r55, r385; +} +{ +add.f16x2 r386, r362, r383; +} +{ +cvt.rn.f16.f64 rs49, fd291; +} +mov.b32 r391, {rs49, rs49}; +{ +mul.f16x2 r389, r61, r391; +} +{ +add.f16x2 r392, r368, r389; +} +mov.f64 fd272, 0dBFDDBE064267C47C; +{ +cvt.rn.f16.f64 rs50, fd272; +} +mov.b32 r397, {rs50, rs50}; +{ +mul.f16x2 r395, r70, r397; +} +{ +add.f16x2 r398, r374, r395; +} +{ +cvt.rn.f16.f64 rs51, fd291; +} +mov.b32 r403, {rs51, rs51}; +{ +mul.f16x2 r401, r64, r403; +} +{ +add.f16x2 r404, r380, r401; +} +{ +cvt.rn.f16.f64 rs52, fd272; +} +mov.b32 r409, {rs52, rs52}; +{ +mul.f16x2 r407, r67, r409; 
+} +{ +add.f16x2 r410, r386, r407; +} +{ +sub.f16x2 r413, r392, r398; +} +{ +add.f16x2 r416, r404, r410; +} +{ +add.f16x2 r419, r392, r398; +} +{ +sub.f16x2 r422, r404, r410; +} +cvt.rn.f16.s32 rs53, r1956; +mov.b32 r437, {rs53, rs53}; +cvt.rn.f16.s32 rs54, r1956; +mov.b32 r449, {rs54, rs54}; +{ +cvt.rn.f16.f64 rs55, fd299; +} +mov.b32 r429, {rs55, rs55}; +{ +mul.f16x2 r427, r1, r429; +} +{ +add.f16x2 r430, %52, r427; +} +{ +cvt.rn.f16.f64 rs56, fd302; +} +mov.b32 r435, {rs56, rs56}; +{ +mul.f16x2 r433, r10, r435; +} +{ +add.f16x2 r436, r437, r433; +} +{ +cvt.rn.f16.f64 rs57, fd299; +} +mov.b32 r441, {rs57, rs57}; +{ +mul.f16x2 r439, r4, r441; +} +{ +add.f16x2 r442, %53, r439; +} +{ +cvt.rn.f16.f64 rs58, fd302; +} +mov.b32 r447, {rs58, rs58}; +{ +mul.f16x2 r445, r7, r447; +} +{ +add.f16x2 r448, r449, r445; +} +{ +cvt.rn.f16.f64 rs59, fd311; +} +mov.b32 r453, {rs59, rs59}; +{ +mul.f16x2 r451, r13, r453; +} +{ +add.f16x2 r454, r430, r451; +} +{ +cvt.rn.f16.f64 rs60, fd312; +} +mov.b32 r459, {rs60, rs60}; +{ +mul.f16x2 r457, r22, r459; +} +{ +add.f16x2 r460, r436, r457; +} +{ +cvt.rn.f16.f64 rs61, fd311; +} +mov.b32 r465, {rs61, rs61}; +{ +mul.f16x2 r463, r16, r465; +} +{ +add.f16x2 r466, r442, r463; +} +{ +cvt.rn.f16.f64 rs62, fd312; +} +mov.b32 r471, {rs62, rs62}; +{ +mul.f16x2 r469, r19, r471; +} +{ +add.f16x2 r472, r448, r469; +} +{ +cvt.rn.f16.f64 rs63, fd303; +} +mov.b32 r477, {rs63, rs63}; +{ +mul.f16x2 r475, r25, r477; +} +{ +add.f16x2 r478, r454, r475; +} +mov.f64 fd204, 0dBFEDEBA72EF20147; +{ +cvt.rn.f16.f64 rs64, fd204; +} +mov.b32 r483, {rs64, rs64}; +{ +mul.f16x2 r481, r34, r483; +} +{ +add.f16x2 r484, r460, r481; +} +{ +cvt.rn.f16.f64 rs65, fd303; +} +mov.b32 r489, {rs65, rs65}; +{ +mul.f16x2 r487, r28, r489; +} +{ +add.f16x2 r490, r466, r487; +} +{ +cvt.rn.f16.f64 rs66, fd204; +} +mov.b32 r495, {rs66, rs66}; +{ +mul.f16x2 r493, r31, r495; +} +{ +add.f16x2 r496, r472, r493; +} +{ +cvt.rn.f16.f64 rs67, fd291; +} +mov.b32 r501, {rs67, rs67}; +{ +mul.f16x2 
r499, r37, r501; +} +{ +add.f16x2 r502, r478, r499; +} +{ +cvt.rn.f16.f64 rs68, fd272; +} +mov.b32 r507, {rs68, rs68}; +{ +mul.f16x2 r505, r46, r507; +} +{ +add.f16x2 r508, r484, r505; +} +{ +cvt.rn.f16.f64 rs69, fd291; +} +mov.b32 r513, {rs69, rs69}; +{ +mul.f16x2 r511, r40, r513; +} +{ +add.f16x2 r514, r490, r511; +} +{ +cvt.rn.f16.f64 rs70, fd272; +} +mov.b32 r519, {rs70, rs70}; +{ +mul.f16x2 r517, r43, r519; +} +{ +add.f16x2 r520, r496, r517; +} +{ +cvt.rn.f16.f64 rs71, fd295; +} +mov.b32 r525, {rs71, rs71}; +{ +mul.f16x2 r523, r49, r525; +} +{ +add.f16x2 r526, r502, r523; +} +{ +cvt.rn.f16.f64 rs72, fd306; +} +mov.b32 r531, {rs72, rs72}; +{ +mul.f16x2 r529, r58, r531; +} +{ +add.f16x2 r532, r508, r529; +} +{ +cvt.rn.f16.f64 rs73, fd295; +} +mov.b32 r537, {rs73, rs73}; +{ +mul.f16x2 r535, r52, r537; +} +{ +add.f16x2 r538, r514, r535; +} +{ +cvt.rn.f16.f64 rs74, fd306; +} +mov.b32 r543, {rs74, rs74}; +{ +mul.f16x2 r541, r55, r543; +} +{ +add.f16x2 r544, r520, r541; +} +{ +cvt.rn.f16.f64 rs75, fd307; +} +mov.b32 r549, {rs75, rs75}; +{ +mul.f16x2 r547, r61, r549; +} +{ +add.f16x2 r550, r526, r547; +} +{ +cvt.rn.f16.f64 rs76, fd308; +} +mov.b32 r555, {rs76, rs76}; +{ +mul.f16x2 r553, r70, r555; +} +{ +add.f16x2 r556, r532, r553; +} +{ +cvt.rn.f16.f64 rs77, fd307; +} +mov.b32 r561, {rs77, rs77}; +{ +mul.f16x2 r559, r64, r561; +} +{ +add.f16x2 r562, r538, r559; +} +{ +cvt.rn.f16.f64 rs78, fd308; +} +mov.b32 r567, {rs78, rs78}; +{ +mul.f16x2 r565, r67, r567; +} +{ +add.f16x2 r568, r544, r565; +} +{ +sub.f16x2 r571, r550, r556; +} +{ +add.f16x2 r574, r562, r568; +} +{ +add.f16x2 r577, r550, r556; +} +{ +sub.f16x2 r580, r562, r568; +} +cvt.rn.f16.s32 rs79, r1956; +mov.b32 r595, {rs79, rs79}; +cvt.rn.f16.s32 rs80, r1956; +mov.b32 r607, {rs80, rs80}; +{ +cvt.rn.f16.f64 rs81, fd303; +} +mov.b32 r587, {rs81, rs81}; +{ +mul.f16x2 r585, r1, r587; +} +{ +add.f16x2 r588, %52, r585; +} +{ +cvt.rn.f16.f64 rs82, fd304; +} +mov.b32 r593, {rs82, rs82}; +{ +mul.f16x2 r591, r10, r593; 
+} +{ +add.f16x2 r594, r595, r591; +} +{ +cvt.rn.f16.f64 rs83, fd303; +} +mov.b32 r599, {rs83, rs83}; +{ +mul.f16x2 r597, r4, r599; +} +{ +add.f16x2 r600, %53, r597; +} +{ +cvt.rn.f16.f64 rs84, fd304; +} +mov.b32 r605, {rs84, rs84}; +{ +mul.f16x2 r603, r7, r605; +} +{ +add.f16x2 r606, r607, r603; +} +{ +cvt.rn.f16.f64 rs85, fd307; +} +mov.b32 r611, {rs85, rs85}; +{ +mul.f16x2 r609, r13, r611; +} +{ +add.f16x2 r612, r588, r609; +} +{ +cvt.rn.f16.f64 rs86, fd224; +} +mov.b32 r617, {rs86, rs86}; +{ +mul.f16x2 r615, r22, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs87, fd307; +} +mov.b32 r623, {rs87, rs87}; +{ +mul.f16x2 r621, r16, r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ +cvt.rn.f16.f64 rs88, fd224; +} +mov.b32 r629, {rs88, rs88}; +{ +mul.f16x2 r627, r19, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +cvt.rn.f16.f64 rs89, fd291; +} +mov.b32 r635, {rs89, rs89}; +{ +mul.f16x2 r633, r25, r635; +} +{ +add.f16x2 r636, r612, r633; +} +{ +cvt.rn.f16.f64 rs90, fd272; +} +mov.b32 r641, {rs90, rs90}; +{ +mul.f16x2 r639, r34, r641; +} +{ +add.f16x2 r642, r618, r639; +} +{ +cvt.rn.f16.f64 rs91, fd291; +} +mov.b32 r647, {rs91, rs91}; +{ +mul.f16x2 r645, r28, r647; +} +{ +add.f16x2 r648, r624, r645; +} +{ +cvt.rn.f16.f64 rs92, fd272; +} +mov.b32 r653, {rs92, rs92}; +{ +mul.f16x2 r651, r31, r653; +} +{ +add.f16x2 r654, r630, r651; +} +{ +cvt.rn.f16.f64 rs93, fd299; +} +mov.b32 r659, {rs93, rs93}; +{ +mul.f16x2 r657, r37, r659; +} +{ +add.f16x2 r660, r636, r657; +} +{ +cvt.rn.f16.f64 rs94, fd302; +} +mov.b32 r665, {rs94, rs94}; +{ +mul.f16x2 r663, r46, r665; +} +{ +add.f16x2 r666, r642, r663; +} +{ +cvt.rn.f16.f64 rs95, fd299; +} +mov.b32 r671, {rs95, rs95}; +{ +mul.f16x2 r669, r40, r671; +} +{ +add.f16x2 r672, r648, r669; +} +{ +cvt.rn.f16.f64 rs96, fd302; +} +mov.b32 r677, {rs96, rs96}; +{ +mul.f16x2 r675, r43, r677; +} +{ +add.f16x2 r678, r654, r675; +} +{ +cvt.rn.f16.f64 rs97, fd311; +} +mov.b32 r683, {rs97, rs97}; +{ +mul.f16x2 r681, r49, r683; +} +{ 
+add.f16x2 r684, r660, r681; +} +mov.f64 fd256, 0dBFCEA1E54BC48DBF; +{ +cvt.rn.f16.f64 rs98, fd256; +} +mov.b32 r689, {rs98, rs98}; +{ +mul.f16x2 r687, r58, r689; +} +{ +add.f16x2 r690, r666, r687; +} +{ +cvt.rn.f16.f64 rs99, fd311; +} +mov.b32 r695, {rs99, rs99}; +{ +mul.f16x2 r693, r52, r695; +} +{ +add.f16x2 r696, r672, r693; +} +{ +cvt.rn.f16.f64 rs100, fd256; +} +mov.b32 r701, {rs100, rs100}; +{ +mul.f16x2 r699, r55, r701; +} +{ +add.f16x2 r702, r678, r699; +} +{ +cvt.rn.f16.f64 rs101, fd295; +} +mov.b32 r707, {rs101, rs101}; +{ +mul.f16x2 r705, r61, r707; +} +{ +add.f16x2 r708, r684, r705; +} +mov.f64 fd280, 0dBFEA55E242A4C3D2; +{ +cvt.rn.f16.f64 rs102, fd280; +} +mov.b32 r713, {rs102, rs102}; +{ +mul.f16x2 r711, r70, r713; +} +{ +add.f16x2 r714, r690, r711; +} +{ +cvt.rn.f16.f64 rs103, fd295; +} +mov.b32 r719, {rs103, rs103}; +{ +mul.f16x2 r717, r64, r719; +} +{ +add.f16x2 r720, r696, r717; +} +{ +cvt.rn.f16.f64 rs104, fd280; +} +mov.b32 r725, {rs104, rs104}; +{ +mul.f16x2 r723, r67, r725; +} +{ +add.f16x2 r726, r702, r723; +} +{ +sub.f16x2 r729, r708, r714; +} +{ +add.f16x2 r732, r720, r726; +} +{ +add.f16x2 r735, r708, r714; +} +{ +sub.f16x2 r738, r720, r726; +} +cvt.rn.f16.s32 rs105, r1956; +mov.b32 r753, {rs105, rs105}; +cvt.rn.f16.s32 rs106, r1956; +mov.b32 r765, {rs106, rs106}; +{ +cvt.rn.f16.f64 rs107, fd307; +} +mov.b32 r745, {rs107, rs107}; +{ +mul.f16x2 r743, r1, r745; +} +{ +add.f16x2 r746, %52, r743; +} +{ +cvt.rn.f16.f64 rs108, fd308; +} +mov.b32 r751, {rs108, rs108}; +{ +mul.f16x2 r749, r10, r751; +} +{ +add.f16x2 r752, r753, r749; +} +{ +cvt.rn.f16.f64 rs109, fd307; +} +mov.b32 r757, {rs109, rs109}; +{ +mul.f16x2 r755, r4, r757; +} +{ +add.f16x2 r758, %53, r755; +} +{ +cvt.rn.f16.f64 rs110, fd308; +} +mov.b32 r763, {rs110, rs110}; +{ +mul.f16x2 r761, r7, r763; +} +{ +add.f16x2 r764, r765, r761; +} +{ +cvt.rn.f16.f64 rs111, fd299; +} +mov.b32 r769, {rs111, rs111}; +{ +mul.f16x2 r767, r13, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ 
+cvt.rn.f16.f64 rs112, fd288; +} +mov.b32 r775, {rs112, rs112}; +{ +mul.f16x2 r773, r22, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs113, fd299; +} +mov.b32 r781, {rs113, rs113}; +{ +mul.f16x2 r779, r16, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs114, fd288; +} +mov.b32 r787, {rs114, rs114}; +{ +mul.f16x2 r785, r19, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs115, fd295; +} +mov.b32 r793, {rs115, rs115}; +{ +mul.f16x2 r791, r25, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs116, fd306; +} +mov.b32 r799, {rs116, rs116}; +{ +mul.f16x2 r797, r34, r799; +} +{ +add.f16x2 r800, r776, r797; +} +{ +cvt.rn.f16.f64 rs117, fd295; +} +mov.b32 r805, {rs117, rs117}; +{ +mul.f16x2 r803, r28, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs118, fd306; +} +mov.b32 r811, {rs118, rs118}; +{ +mul.f16x2 r809, r31, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs119, fd311; +} +mov.b32 r817, {rs119, rs119}; +{ +mul.f16x2 r815, r37, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs120, fd256; +} +mov.b32 r823, {rs120, rs120}; +{ +mul.f16x2 r821, r46, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs121, fd311; +} +mov.b32 r829, {rs121, rs121}; +{ +mul.f16x2 r827, r40, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs122, fd256; +} +mov.b32 r835, {rs122, rs122}; +{ +mul.f16x2 r833, r43, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs123, fd291; +} +mov.b32 r841, {rs123, rs123}; +{ +mul.f16x2 r839, r49, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs124, fd272; +} +mov.b32 r847, {rs124, rs124}; +{ +mul.f16x2 r845, r58, r847; +} +{ +add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs125, fd291; +} +mov.b32 r853, {rs125, rs125}; +{ +mul.f16x2 r851, r52, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs126, fd272; +} +mov.b32 r859, {rs126, rs126}; +{ +mul.f16x2 r857, r55, r859; 
+} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs127, fd303; +} +mov.b32 r865, {rs127, rs127}; +{ +mul.f16x2 r863, r61, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs128, fd304; +} +mov.b32 r871, {rs128, rs128}; +{ +mul.f16x2 r869, r70, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs129, fd303; +} +mov.b32 r877, {rs129, rs129}; +{ +mul.f16x2 r875, r64, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs130, fd304; +} +mov.b32 r883, {rs130, rs130}; +{ +mul.f16x2 r881, r67, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +sub.f16x2 r887, r866, r872; +} +{ +add.f16x2 r890, r878, r884; +} +{ +add.f16x2 r893, r866, r872; +} +{ +sub.f16x2 r896, r878, r884; +} +cvt.rn.f16.s32 rs131, r1956; +mov.b32 r911, {rs131, rs131}; +cvt.rn.f16.s32 rs132, r1956; +mov.b32 r923, {rs132, rs132}; +{ +cvt.rn.f16.f64 rs133, fd311; +} +mov.b32 r903, {rs133, rs133}; +{ +mul.f16x2 r901, r1, r903; +} +{ +add.f16x2 r904, %52, r901; +} +{ +cvt.rn.f16.f64 rs134, fd312; +} +mov.b32 r909, {rs134, rs134}; +{ +mul.f16x2 r907, r10, r909; +} +{ +add.f16x2 r910, r911, r907; +} +{ +cvt.rn.f16.f64 rs135, fd311; +} +mov.b32 r915, {rs135, rs135}; +{ +mul.f16x2 r913, r4, r915; +} +{ +add.f16x2 r916, %53, r913; +} +{ +cvt.rn.f16.f64 rs136, fd312; +} +mov.b32 r921, {rs136, rs136}; +{ +mul.f16x2 r919, r7, r921; +} +{ +add.f16x2 r922, r923, r919; +} +{ +cvt.rn.f16.f64 rs137, fd291; +} +mov.b32 r927, {rs137, rs137}; +{ +mul.f16x2 r925, r13, r927; +} +{ +add.f16x2 r928, r904, r925; +} +{ +cvt.rn.f16.f64 rs138, fd272; +} +mov.b32 r933, {rs138, rs138}; +{ +mul.f16x2 r931, r22, r933; +} +{ +add.f16x2 r934, r910, r931; +} +{ +cvt.rn.f16.f64 rs139, fd291; +} +mov.b32 r939, {rs139, rs139}; +{ +mul.f16x2 r937, r16, r939; +} +{ +add.f16x2 r940, r916, r937; +} +{ +cvt.rn.f16.f64 rs140, fd272; +} +mov.b32 r945, {rs140, rs140}; +{ +mul.f16x2 r943, r19, r945; +} +{ +add.f16x2 r946, r922, r943; +} +{ +cvt.rn.f16.f64 rs141, fd307; +} +mov.b32 r951, {rs141, rs141}; +{ 
+mul.f16x2 r949, r25, r951; +} +{ +add.f16x2 r952, r928, r949; +} +{ +cvt.rn.f16.f64 rs142, fd308; +} +mov.b32 r957, {rs142, rs142}; +{ +mul.f16x2 r955, r34, r957; +} +{ +add.f16x2 r958, r934, r955; +} +{ +cvt.rn.f16.f64 rs143, fd307; +} +mov.b32 r963, {rs143, rs143}; +{ +mul.f16x2 r961, r28, r963; +} +{ +add.f16x2 r964, r940, r961; +} +{ +cvt.rn.f16.f64 rs144, fd308; +} +mov.b32 r969, {rs144, rs144}; +{ +mul.f16x2 r967, r31, r969; +} +{ +add.f16x2 r970, r946, r967; +} +{ +cvt.rn.f16.f64 rs145, fd295; +} +mov.b32 r975, {rs145, rs145}; +{ +mul.f16x2 r973, r37, r975; +} +{ +add.f16x2 r976, r952, r973; +} +{ +cvt.rn.f16.f64 rs146, fd280; +} +mov.b32 r981, {rs146, rs146}; +{ +mul.f16x2 r979, r46, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs147, fd295; +} +mov.b32 r987, {rs147, rs147}; +{ +mul.f16x2 r985, r40, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs148, fd280; +} +mov.b32 r993, {rs148, rs148}; +{ +mul.f16x2 r991, r43, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs149, fd303; +} +mov.b32 r999, {rs149, rs149}; +{ +mul.f16x2 r997, r49, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs150, fd304; +} +mov.b32 r1005, {rs150, rs150}; +{ +mul.f16x2 r1003, r58, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs151, fd303; +} +mov.b32 r1011, {rs151, rs151}; +{ +mul.f16x2 r1009, r52, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs152, fd304; +} +mov.b32 r1017, {rs152, rs152}; +{ +mul.f16x2 r1015, r55, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs153, fd299; +} +mov.b32 r1023, {rs153, rs153}; +{ +mul.f16x2 r1021, r61, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs154, fd288; +} +mov.b32 r1029, {rs154, rs154}; +{ +mul.f16x2 r1027, r70, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 rs155, fd299; +} +mov.b32 r1035, {rs155, rs155}; +{ +mul.f16x2 r1033, r64, r1035; +} +{ +add.f16x2 r1036, r1012, 
r1033; +} +{ +cvt.rn.f16.f64 rs156, fd288; +} +mov.b32 r1041, {rs156, rs156}; +{ +mul.f16x2 r1039, r67, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +sub.f16x2 r1045, r1024, r1030; +} +{ +add.f16x2 r1048, r1036, r1042; +} +{ +add.f16x2 r1051, r1024, r1030; +} +{ +sub.f16x2 r1054, r1036, r1042; +} +{ +add.f16x2 r1057, %96, %88; +} +{ +add.f16x2 r1060, %99, %93; +} +{ +sub.f16x2 r1063, %96, %88; +} +{ +sub.f16x2 r1066, %99, %93; +} +{ +add.f16x2 r1069, %83, %103; +} +{ +add.f16x2 r1072, %87, %81; +} +{ +sub.f16x2 r1075, %83, %103; +} +{ +sub.f16x2 r1078, %87, %81; +} +{ +add.f16x2 r1081, %97, %91; +} +{ +add.f16x2 r1084, %101, %94; +} +{ +sub.f16x2 r1087, %97, %91; +} +{ +sub.f16x2 r1090, %101, %94; +} +{ +add.f16x2 r1093, %85, %78; +} +{ +add.f16x2 r1096, %89, %82; +} +{ +sub.f16x2 r1099, %85, %78; +} +{ +sub.f16x2 r1102, %89, %82; +} +{ +add.f16x2 r1105, %98, %92; +} +{ +add.f16x2 r1108, %102, %95; +} +{ +sub.f16x2 r1111, %98, %92; +} +{ +sub.f16x2 r1114, %102, %95; +} +{ +add.f16x2 r1117, %86, %80; +} +{ +add.f16x2 r1120, %90, %84; +} +{ +sub.f16x2 r1123, %86, %80; +} +{ +sub.f16x2 r1126, %90, %84; +} +{ +add.f16x2 r1129, %100, r1057; +} +{ +add.f16x2 r1132, %79, r1060; +} +{ +add.f16x2 r1135, r1129, r1069; +} +{ +add.f16x2 r1138, r1132, r1072; +} +{ +add.f16x2 r1141, r1135, r1081; +} +{ +add.f16x2 r1144, r1138, r1084; +} +{ +add.f16x2 r1147, r1141, r1093; +} +{ +add.f16x2 r1150, r1144, r1096; +} +{ +add.f16x2 r1153, r1147, r1105; +} +{ +add.f16x2 r1156, r1150, r1108; +} +{ +add.f16x2 r1159, r1153, r1117; +} +{ +add.f16x2 r1162, r1156, r1120; +} +cvt.rn.f16.s32 rs157, r1956; +mov.b32 r1177, {rs157, rs157}; +cvt.rn.f16.s32 rs158, r1956; +mov.b32 r1189, {rs158, rs158}; +{ +cvt.rn.f16.f64 rs159, fd291; +} +mov.b32 r1169, {rs159, rs159}; +{ +mul.f16x2 r1167, r1057, r1169; +} +{ +add.f16x2 r1170, %100, r1167; +} +{ +cvt.rn.f16.f64 rs160, fd310; +} +mov.b32 r1175, {rs160, rs160}; +{ +mul.f16x2 r1173, r1066, r1175; +} +{ +add.f16x2 r1176, r1177, r1173; +} +{ 
+cvt.rn.f16.f64 rs161, fd291; +} +mov.b32 r1181, {rs161, rs161}; +{ +mul.f16x2 r1179, r1060, r1181; +} +{ +add.f16x2 r1182, %79, r1179; +} +{ +cvt.rn.f16.f64 rs162, fd310; +} +mov.b32 r1187, {rs162, rs162}; +{ +mul.f16x2 r1185, r1063, r1187; +} +{ +add.f16x2 r1188, r1189, r1185; +} +{ +cvt.rn.f16.f64 rs163, fd295; +} +mov.b32 r1193, {rs163, rs163}; +{ +mul.f16x2 r1191, r1069, r1193; +} +{ +add.f16x2 r1194, r1170, r1191; +} +{ +cvt.rn.f16.f64 rs164, fd306; +} +mov.b32 r1199, {rs164, rs164}; +{ +mul.f16x2 r1197, r1078, r1199; +} +{ +add.f16x2 r1200, r1176, r1197; +} +{ +cvt.rn.f16.f64 rs165, fd295; +} +mov.b32 r1205, {rs165, rs165}; +{ +mul.f16x2 r1203, r1072, r1205; +} +{ +add.f16x2 r1206, r1182, r1203; +} +{ +cvt.rn.f16.f64 rs166, fd306; +} +mov.b32 r1211, {rs166, rs166}; +{ +mul.f16x2 r1209, r1075, r1211; +} +{ +add.f16x2 r1212, r1188, r1209; +} +{ +cvt.rn.f16.f64 rs167, fd299; +} +mov.b32 r1217, {rs167, rs167}; +{ +mul.f16x2 r1215, r1081, r1217; +} +{ +add.f16x2 r1218, r1194, r1215; +} +{ +cvt.rn.f16.f64 rs168, fd302; +} +mov.b32 r1223, {rs168, rs168}; +{ +mul.f16x2 r1221, r1090, r1223; +} +{ +add.f16x2 r1224, r1200, r1221; +} +{ +cvt.rn.f16.f64 rs169, fd299; +} +mov.b32 r1229, {rs169, rs169}; +{ +mul.f16x2 r1227, r1084, r1229; +} +{ +add.f16x2 r1230, r1206, r1227; +} +{ +cvt.rn.f16.f64 rs170, fd302; +} +mov.b32 r1235, {rs170, rs170}; +{ +mul.f16x2 r1233, r1087, r1235; +} +{ +add.f16x2 r1236, r1212, r1233; +} +{ +cvt.rn.f16.f64 rs171, fd303; +} +mov.b32 r1241, {rs171, rs171}; +{ +mul.f16x2 r1239, r1093, r1241; +} +{ +add.f16x2 r1242, r1218, r1239; +} +{ +cvt.rn.f16.f64 rs172, fd304; +} +mov.b32 r1247, {rs172, rs172}; +{ +mul.f16x2 r1245, r1102, r1247; +} +{ +add.f16x2 r1248, r1224, r1245; +} +{ +cvt.rn.f16.f64 rs173, fd303; +} +mov.b32 r1253, {rs173, rs173}; +{ +mul.f16x2 r1251, r1096, r1253; +} +{ +add.f16x2 r1254, r1230, r1251; +} +{ +cvt.rn.f16.f64 rs174, fd304; +} +mov.b32 r1259, {rs174, rs174}; +{ +mul.f16x2 r1257, r1099, r1259; +} +{ +add.f16x2 r1260, 
r1236, r1257; +} +{ +cvt.rn.f16.f64 rs175, fd307; +} +mov.b32 r1265, {rs175, rs175}; +{ +mul.f16x2 r1263, r1105, r1265; +} +{ +add.f16x2 r1266, r1242, r1263; +} +{ +cvt.rn.f16.f64 rs176, fd308; +} +mov.b32 r1271, {rs176, rs176}; +{ +mul.f16x2 r1269, r1114, r1271; +} +{ +add.f16x2 r1272, r1248, r1269; +} +{ +cvt.rn.f16.f64 rs177, fd307; +} +mov.b32 r1277, {rs177, rs177}; +{ +mul.f16x2 r1275, r1108, r1277; +} +{ +add.f16x2 r1278, r1254, r1275; +} +{ +cvt.rn.f16.f64 rs178, fd308; +} +mov.b32 r1283, {rs178, rs178}; +{ +mul.f16x2 r1281, r1111, r1283; +} +{ +add.f16x2 r1284, r1260, r1281; +} +{ +cvt.rn.f16.f64 rs179, fd311; +} +mov.b32 r1289, {rs179, rs179}; +{ +mul.f16x2 r1287, r1117, r1289; +} +{ +add.f16x2 r1290, r1266, r1287; +} +{ +cvt.rn.f16.f64 rs180, fd312; +} +mov.b32 r1295, {rs180, rs180}; +{ +mul.f16x2 r1293, r1126, r1295; +} +{ +add.f16x2 r1296, r1272, r1293; +} +{ +cvt.rn.f16.f64 rs181, fd311; +} +mov.b32 r1301, {rs181, rs181}; +{ +mul.f16x2 r1299, r1120, r1301; +} +{ +add.f16x2 r1302, r1278, r1299; +} +{ +cvt.rn.f16.f64 rs182, fd312; +} +mov.b32 r1307, {rs182, rs182}; +{ +mul.f16x2 r1305, r1123, r1307; +} +{ +add.f16x2 r1308, r1284, r1305; +} +{ +sub.f16x2 r1311, r1290, r1296; +} +{ +add.f16x2 r1314, r1302, r1308; +} +{ +add.f16x2 r1317, r1290, r1296; +} +{ +sub.f16x2 r1320, r1302, r1308; +} +cvt.rn.f16.s32 rs183, r1956; +mov.b32 r1335, {rs183, rs183}; +cvt.rn.f16.s32 rs184, r1956; +mov.b32 r1347, {rs184, rs184}; +{ +cvt.rn.f16.f64 rs185, fd295; +} +mov.b32 r1327, {rs185, rs185}; +{ +mul.f16x2 r1325, r1057, r1327; +} +{ +add.f16x2 r1328, %100, r1325; +} +{ +cvt.rn.f16.f64 rs186, fd306; +} +mov.b32 r1333, {rs186, rs186}; +{ +mul.f16x2 r1331, r1066, r1333; +} +{ +add.f16x2 r1334, r1335, r1331; +} +{ +cvt.rn.f16.f64 rs187, fd295; +} +mov.b32 r1339, {rs187, rs187}; +{ +mul.f16x2 r1337, r1060, r1339; +} +{ +add.f16x2 r1340, %79, r1337; +} +{ +cvt.rn.f16.f64 rs188, fd306; +} +mov.b32 r1345, {rs188, rs188}; +{ +mul.f16x2 r1343, r1063, r1345; +} +{ +add.f16x2 
r1346, r1347, r1343; +} +{ +cvt.rn.f16.f64 rs189, fd303; +} +mov.b32 r1351, {rs189, rs189}; +{ +mul.f16x2 r1349, r1069, r1351; +} +{ +add.f16x2 r1352, r1328, r1349; +} +{ +cvt.rn.f16.f64 rs190, fd304; +} +mov.b32 r1357, {rs190, rs190}; +{ +mul.f16x2 r1355, r1078, r1357; +} +{ +add.f16x2 r1358, r1334, r1355; +} +{ +cvt.rn.f16.f64 rs191, fd303; +} +mov.b32 r1363, {rs191, rs191}; +{ +mul.f16x2 r1361, r1072, r1363; +} +{ +add.f16x2 r1364, r1340, r1361; +} +{ +cvt.rn.f16.f64 rs192, fd304; +} +mov.b32 r1369, {rs192, rs192}; +{ +mul.f16x2 r1367, r1075, r1369; +} +{ +add.f16x2 r1370, r1346, r1367; +} +{ +cvt.rn.f16.f64 rs193, fd311; +} +mov.b32 r1375, {rs193, rs193}; +{ +mul.f16x2 r1373, r1081, r1375; +} +{ +add.f16x2 r1376, r1352, r1373; +} +{ +cvt.rn.f16.f64 rs194, fd312; +} +mov.b32 r1381, {rs194, rs194}; +{ +mul.f16x2 r1379, r1090, r1381; +} +{ +add.f16x2 r1382, r1358, r1379; +} +{ +cvt.rn.f16.f64 rs195, fd311; +} +mov.b32 r1387, {rs195, rs195}; +{ +mul.f16x2 r1385, r1084, r1387; +} +{ +add.f16x2 r1388, r1364, r1385; +} +{ +cvt.rn.f16.f64 rs196, fd312; +} +mov.b32 r1393, {rs196, rs196}; +{ +mul.f16x2 r1391, r1087, r1393; +} +{ +add.f16x2 r1394, r1370, r1391; +} +{ +cvt.rn.f16.f64 rs197, fd307; +} +mov.b32 r1399, {rs197, rs197}; +{ +mul.f16x2 r1397, r1093, r1399; +} +{ +add.f16x2 r1400, r1376, r1397; +} +{ +cvt.rn.f16.f64 rs198, fd224; +} +mov.b32 r1405, {rs198, rs198}; +{ +mul.f16x2 r1403, r1102, r1405; +} +{ +add.f16x2 r1406, r1382, r1403; +} +{ +cvt.rn.f16.f64 rs199, fd307; +} +mov.b32 r1411, {rs199, rs199}; +{ +mul.f16x2 r1409, r1096, r1411; +} +{ +add.f16x2 r1412, r1388, r1409; +} +{ +cvt.rn.f16.f64 rs200, fd224; +} +mov.b32 r1417, {rs200, rs200}; +{ +mul.f16x2 r1415, r1099, r1417; +} +{ +add.f16x2 r1418, r1394, r1415; +} +{ +cvt.rn.f16.f64 rs201, fd299; +} +mov.b32 r1423, {rs201, rs201}; +{ +mul.f16x2 r1421, r1105, r1423; +} +{ +add.f16x2 r1424, r1400, r1421; +} +{ +cvt.rn.f16.f64 rs202, fd288; +} +mov.b32 r1429, {rs202, rs202}; +{ +mul.f16x2 r1427, r1114, r1429; 
+} +{ +add.f16x2 r1430, r1406, r1427; +} +{ +cvt.rn.f16.f64 rs203, fd299; +} +mov.b32 r1435, {rs203, rs203}; +{ +mul.f16x2 r1433, r1108, r1435; +} +{ +add.f16x2 r1436, r1412, r1433; +} +{ +cvt.rn.f16.f64 rs204, fd288; +} +mov.b32 r1441, {rs204, rs204}; +{ +mul.f16x2 r1439, r1111, r1441; +} +{ +add.f16x2 r1442, r1418, r1439; +} +{ +cvt.rn.f16.f64 rs205, fd291; +} +mov.b32 r1447, {rs205, rs205}; +{ +mul.f16x2 r1445, r1117, r1447; +} +{ +add.f16x2 r1448, r1424, r1445; +} +{ +cvt.rn.f16.f64 rs206, fd272; +} +mov.b32 r1453, {rs206, rs206}; +{ +mul.f16x2 r1451, r1126, r1453; +} +{ +add.f16x2 r1454, r1430, r1451; +} +{ +cvt.rn.f16.f64 rs207, fd291; +} +mov.b32 r1459, {rs207, rs207}; +{ +mul.f16x2 r1457, r1120, r1459; +} +{ +add.f16x2 r1460, r1436, r1457; +} +{ +cvt.rn.f16.f64 rs208, fd272; +} +mov.b32 r1465, {rs208, rs208}; +{ +mul.f16x2 r1463, r1123, r1465; +} +{ +add.f16x2 r1466, r1442, r1463; +} +{ +sub.f16x2 r1469, r1448, r1454; +} +{ +add.f16x2 r1472, r1460, r1466; +} +{ +add.f16x2 r1475, r1448, r1454; +} +{ +sub.f16x2 r1478, r1460, r1466; +} +cvt.rn.f16.s32 rs209, r1956; +mov.b32 r1493, {rs209, rs209}; +cvt.rn.f16.s32 rs210, r1956; +mov.b32 r1505, {rs210, rs210}; +{ +cvt.rn.f16.f64 rs211, fd299; +} +mov.b32 r1485, {rs211, rs211}; +{ +mul.f16x2 r1483, r1057, r1485; +} +{ +add.f16x2 r1486, %100, r1483; +} +{ +cvt.rn.f16.f64 rs212, fd302; +} +mov.b32 r1491, {rs212, rs212}; +{ +mul.f16x2 r1489, r1066, r1491; +} +{ +add.f16x2 r1492, r1493, r1489; +} +{ +cvt.rn.f16.f64 rs213, fd299; +} +mov.b32 r1497, {rs213, rs213}; +{ +mul.f16x2 r1495, r1060, r1497; +} +{ +add.f16x2 r1498, %79, r1495; +} +{ +cvt.rn.f16.f64 rs214, fd302; +} +mov.b32 r1503, {rs214, rs214}; +{ +mul.f16x2 r1501, r1063, r1503; +} +{ +add.f16x2 r1504, r1505, r1501; +} +{ +cvt.rn.f16.f64 rs215, fd311; +} +mov.b32 r1509, {rs215, rs215}; +{ +mul.f16x2 r1507, r1069, r1509; +} +{ +add.f16x2 r1510, r1486, r1507; +} +{ +cvt.rn.f16.f64 rs216, fd312; +} +mov.b32 r1515, {rs216, rs216}; +{ +mul.f16x2 r1513, r1078, 
r1515; +} +{ +add.f16x2 r1516, r1492, r1513; +} +{ +cvt.rn.f16.f64 rs217, fd311; +} +mov.b32 r1521, {rs217, rs217}; +{ +mul.f16x2 r1519, r1072, r1521; +} +{ +add.f16x2 r1522, r1498, r1519; +} +{ +cvt.rn.f16.f64 rs218, fd312; +} +mov.b32 r1527, {rs218, rs218}; +{ +mul.f16x2 r1525, r1075, r1527; +} +{ +add.f16x2 r1528, r1504, r1525; +} +{ +cvt.rn.f16.f64 rs219, fd303; +} +mov.b32 r1533, {rs219, rs219}; +{ +mul.f16x2 r1531, r1081, r1533; +} +{ +add.f16x2 r1534, r1510, r1531; +} +{ +cvt.rn.f16.f64 rs220, fd204; +} +mov.b32 r1539, {rs220, rs220}; +{ +mul.f16x2 r1537, r1090, r1539; +} +{ +add.f16x2 r1540, r1516, r1537; +} +{ +cvt.rn.f16.f64 rs221, fd303; +} +mov.b32 r1545, {rs221, rs221}; +{ +mul.f16x2 r1543, r1084, r1545; +} +{ +add.f16x2 r1546, r1522, r1543; +} +{ +cvt.rn.f16.f64 rs222, fd204; +} +mov.b32 r1551, {rs222, rs222}; +{ +mul.f16x2 r1549, r1087, r1551; +} +{ +add.f16x2 r1552, r1528, r1549; +} +{ +cvt.rn.f16.f64 rs223, fd291; +} +mov.b32 r1557, {rs223, rs223}; +{ +mul.f16x2 r1555, r1093, r1557; +} +{ +add.f16x2 r1558, r1534, r1555; +} +{ +cvt.rn.f16.f64 rs224, fd272; +} +mov.b32 r1563, {rs224, rs224}; +{ +mul.f16x2 r1561, r1102, r1563; +} +{ +add.f16x2 r1564, r1540, r1561; +} +{ +cvt.rn.f16.f64 rs225, fd291; +} +mov.b32 r1569, {rs225, rs225}; +{ +mul.f16x2 r1567, r1096, r1569; +} +{ +add.f16x2 r1570, r1546, r1567; +} +{ +cvt.rn.f16.f64 rs226, fd272; +} +mov.b32 r1575, {rs226, rs226}; +{ +mul.f16x2 r1573, r1099, r1575; +} +{ +add.f16x2 r1576, r1552, r1573; +} +{ +cvt.rn.f16.f64 rs227, fd295; +} +mov.b32 r1581, {rs227, rs227}; +{ +mul.f16x2 r1579, r1105, r1581; +} +{ +add.f16x2 r1582, r1558, r1579; +} +{ +cvt.rn.f16.f64 rs228, fd306; +} +mov.b32 r1587, {rs228, rs228}; +{ +mul.f16x2 r1585, r1114, r1587; +} +{ +add.f16x2 r1588, r1564, r1585; +} +{ +cvt.rn.f16.f64 rs229, fd295; +} +mov.b32 r1593, {rs229, rs229}; +{ +mul.f16x2 r1591, r1108, r1593; +} +{ +add.f16x2 r1594, r1570, r1591; +} +{ +cvt.rn.f16.f64 rs230, fd306; +} +mov.b32 r1599, {rs230, rs230}; +{ 
+mul.f16x2 r1597, r1111, r1599; +} +{ +add.f16x2 r1600, r1576, r1597; +} +{ +cvt.rn.f16.f64 rs231, fd307; +} +mov.b32 r1605, {rs231, rs231}; +{ +mul.f16x2 r1603, r1117, r1605; +} +{ +add.f16x2 r1606, r1582, r1603; +} +{ +cvt.rn.f16.f64 rs232, fd308; +} +mov.b32 r1611, {rs232, rs232}; +{ +mul.f16x2 r1609, r1126, r1611; +} +{ +add.f16x2 r1612, r1588, r1609; +} +{ +cvt.rn.f16.f64 rs233, fd307; +} +mov.b32 r1617, {rs233, rs233}; +{ +mul.f16x2 r1615, r1120, r1617; +} +{ +add.f16x2 r1618, r1594, r1615; +} +{ +cvt.rn.f16.f64 rs234, fd308; +} +mov.b32 r1623, {rs234, rs234}; +{ +mul.f16x2 r1621, r1123, r1623; +} +{ +add.f16x2 r1624, r1600, r1621; +} +{ +sub.f16x2 r1627, r1606, r1612; +} +{ +add.f16x2 r1630, r1618, r1624; +} +{ +add.f16x2 r1633, r1606, r1612; +} +{ +sub.f16x2 r1636, r1618, r1624; +} +cvt.rn.f16.s32 rs235, r1956; +mov.b32 r1651, {rs235, rs235}; +cvt.rn.f16.s32 rs236, r1956; +mov.b32 r1663, {rs236, rs236}; +{ +cvt.rn.f16.f64 rs237, fd303; +} +mov.b32 r1643, {rs237, rs237}; +{ +mul.f16x2 r1641, r1057, r1643; +} +{ +add.f16x2 r1644, %100, r1641; +} +{ +cvt.rn.f16.f64 rs238, fd304; +} +mov.b32 r1649, {rs238, rs238}; +{ +mul.f16x2 r1647, r1066, r1649; +} +{ +add.f16x2 r1650, r1651, r1647; +} +{ +cvt.rn.f16.f64 rs239, fd303; +} +mov.b32 r1655, {rs239, rs239}; +{ +mul.f16x2 r1653, r1060, r1655; +} +{ +add.f16x2 r1656, %79, r1653; +} +{ +cvt.rn.f16.f64 rs240, fd304; +} +mov.b32 r1661, {rs240, rs240}; +{ +mul.f16x2 r1659, r1063, r1661; +} +{ +add.f16x2 r1662, r1663, r1659; +} +{ +cvt.rn.f16.f64 rs241, fd307; +} +mov.b32 r1667, {rs241, rs241}; +{ +mul.f16x2 r1665, r1069, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs242, fd224; +} +mov.b32 r1673, {rs242, rs242}; +{ +mul.f16x2 r1671, r1078, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs243, fd307; +} +mov.b32 r1679, {rs243, rs243}; +{ +mul.f16x2 r1677, r1072, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs244, fd224; +} +mov.b32 r1685, {rs244, rs244}; 
+{ +mul.f16x2 r1683, r1075, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ +cvt.rn.f16.f64 rs245, fd291; +} +mov.b32 r1691, {rs245, rs245}; +{ +mul.f16x2 r1689, r1081, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs246, fd272; +} +mov.b32 r1697, {rs246, rs246}; +{ +mul.f16x2 r1695, r1090, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs247, fd291; +} +mov.b32 r1703, {rs247, rs247}; +{ +mul.f16x2 r1701, r1084, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs248, fd272; +} +mov.b32 r1709, {rs248, rs248}; +{ +mul.f16x2 r1707, r1087, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +{ +cvt.rn.f16.f64 rs249, fd299; +} +mov.b32 r1715, {rs249, rs249}; +{ +mul.f16x2 r1713, r1093, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs250, fd302; +} +mov.b32 r1721, {rs250, rs250}; +{ +mul.f16x2 r1719, r1102, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs251, fd299; +} +mov.b32 r1727, {rs251, rs251}; +{ +mul.f16x2 r1725, r1096, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs252, fd302; +} +mov.b32 r1733, {rs252, rs252}; +{ +mul.f16x2 r1731, r1099, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} +{ +cvt.rn.f16.f64 rs253, fd311; +} +mov.b32 r1739, {rs253, rs253}; +{ +mul.f16x2 r1737, r1105, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs254, fd256; +} +mov.b32 r1745, {rs254, rs254}; +{ +mul.f16x2 r1743, r1114, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs255, fd311; +} +mov.b32 r1751, {rs255, rs255}; +{ +mul.f16x2 r1749, r1108, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +cvt.rn.f16.f64 rs256, fd256; +} +mov.b32 r1757, {rs256, rs256}; +{ +mul.f16x2 r1755, r1111, r1757; +} +{ +add.f16x2 r1758, r1734, r1755; +} +{ +cvt.rn.f16.f64 rs257, fd295; +} +mov.b32 r1763, {rs257, rs257}; +{ +mul.f16x2 r1761, r1117, r1763; +} +{ +add.f16x2 r1764, r1740, r1761; +} +{ +cvt.rn.f16.f64 rs258, fd280; +} +mov.b32 r1769, 
{rs258, rs258}; +{ +mul.f16x2 r1767, r1126, r1769; +} +{ +add.f16x2 r1770, r1746, r1767; +} +{ +cvt.rn.f16.f64 rs259, fd295; +} +mov.b32 r1775, {rs259, rs259}; +{ +mul.f16x2 r1773, r1120, r1775; +} +{ +add.f16x2 r1776, r1752, r1773; +} +{ +cvt.rn.f16.f64 rs260, fd280; +} +mov.b32 r1781, {rs260, rs260}; +{ +mul.f16x2 r1779, r1123, r1781; +} +{ +add.f16x2 r1782, r1758, r1779; +} +{ +sub.f16x2 r1785, r1764, r1770; +} +{ +add.f16x2 r1788, r1776, r1782; +} +{ +add.f16x2 r1791, r1764, r1770; +} +{ +sub.f16x2 r1794, r1776, r1782; +} +cvt.rn.f16.s32 rs261, r1956; +mov.b32 r1809, {rs261, rs261}; +cvt.rn.f16.s32 rs262, r1956; +mov.b32 r1821, {rs262, rs262}; +{ +cvt.rn.f16.f64 rs263, fd307; +} +mov.b32 r1801, {rs263, rs263}; +{ +mul.f16x2 r1799, r1057, r1801; +} +{ +add.f16x2 r1802, %100, r1799; +} +{ +cvt.rn.f16.f64 rs264, fd308; +} +mov.b32 r1807, {rs264, rs264}; +{ +mul.f16x2 r1805, r1066, r1807; +} +{ +add.f16x2 r1808, r1809, r1805; +} +{ +cvt.rn.f16.f64 rs265, fd307; +} +mov.b32 r1813, {rs265, rs265}; +{ +mul.f16x2 r1811, r1060, r1813; +} +{ +add.f16x2 r1814, %79, r1811; +} +{ +cvt.rn.f16.f64 rs266, fd308; +} +mov.b32 r1819, {rs266, rs266}; +{ +mul.f16x2 r1817, r1063, r1819; +} +{ +add.f16x2 r1820, r1821, r1817; +} +{ +cvt.rn.f16.f64 rs267, fd299; +} +mov.b32 r1825, {rs267, rs267}; +{ +mul.f16x2 r1823, r1069, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs268, fd288; +} +mov.b32 r1831, {rs268, rs268}; +{ +mul.f16x2 r1829, r1078, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs269, fd299; +} +mov.b32 r1837, {rs269, rs269}; +{ +mul.f16x2 r1835, r1072, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs270, fd288; +} +mov.b32 r1843, {rs270, rs270}; +{ +mul.f16x2 r1841, r1075, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs271, fd295; +} +mov.b32 r1849, {rs271, rs271}; +{ +mul.f16x2 r1847, r1081, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs272, fd306; +} +mov.b32 
r1855, {rs272, rs272}; +{ +mul.f16x2 r1853, r1090, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs273, fd295; +} +mov.b32 r1861, {rs273, rs273}; +{ +mul.f16x2 r1859, r1084, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs274, fd306; +} +mov.b32 r1867, {rs274, rs274}; +{ +mul.f16x2 r1865, r1087, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs275, fd311; +} +mov.b32 r1873, {rs275, rs275}; +{ +mul.f16x2 r1871, r1093, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs276, fd256; +} +mov.b32 r1879, {rs276, rs276}; +{ +mul.f16x2 r1877, r1102, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs277, fd311; +} +mov.b32 r1885, {rs277, rs277}; +{ +mul.f16x2 r1883, r1096, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs278, fd256; +} +mov.b32 r1891, {rs278, rs278}; +{ +mul.f16x2 r1889, r1099, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs279, fd291; +} +mov.b32 r1897, {rs279, rs279}; +{ +mul.f16x2 r1895, r1105, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs280, fd272; +} +mov.b32 r1903, {rs280, rs280}; +{ +mul.f16x2 r1901, r1114, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs281, fd291; +} +mov.b32 r1909, {rs281, rs281}; +{ +mul.f16x2 r1907, r1108, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs282, fd272; +} +mov.b32 r1915, {rs282, rs282}; +{ +mul.f16x2 r1913, r1111, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs283, fd303; +} +mov.b32 r1921, {rs283, rs283}; +{ +mul.f16x2 r1919, r1117, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs284, fd304; +} +mov.b32 r1927, {rs284, rs284}; +{ +mul.f16x2 r1925, r1126, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs285, fd303; +} +mov.b32 r1933, {rs285, rs285}; +{ +mul.f16x2 r1931, r1120, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs286, 
fd304; +} +mov.b32 r1939, {rs286, rs286}; +{ +mul.f16x2 r1937, r1123, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +sub.f16x2 r1943, r1922, r1928; +} +{ +add.f16x2 r1946, r1934, r1940; +} +{ +add.f16x2 r1949, r1922, r1928; +} +{ +sub.f16x2 r1952, r1934, r1940; +} +cvt.rn.f16.s32 rs287, r1956; +mov.b32 r1967, {rs287, rs287}; +cvt.rn.f16.s32 rs288, r1956; +mov.b32 r1979, {rs288, rs288}; +{ +cvt.rn.f16.f64 rs289, fd311; +} +mov.b32 r1959, {rs289, rs289}; +{ +mul.f16x2 r1957, r1057, r1959; +} +{ +add.f16x2 r1960, %100, r1957; +} +{ +cvt.rn.f16.f64 rs290, fd312; +} +mov.b32 r1965, {rs290, rs290}; +{ +mul.f16x2 r1963, r1066, r1965; +} +{ +add.f16x2 r1966, r1967, r1963; +} +{ +cvt.rn.f16.f64 rs291, fd311; +} +mov.b32 r1971, {rs291, rs291}; +{ +mul.f16x2 r1969, r1060, r1971; +} +{ +add.f16x2 r1972, %79, r1969; +} +{ +cvt.rn.f16.f64 rs292, fd312; +} +mov.b32 r1977, {rs292, rs292}; +{ +mul.f16x2 r1975, r1063, r1977; +} +{ +add.f16x2 r1978, r1979, r1975; +} +{ +cvt.rn.f16.f64 rs293, fd291; +} +mov.b32 r1983, {rs293, rs293}; +{ +mul.f16x2 r1981, r1069, r1983; +} +{ +add.f16x2 r1984, r1960, r1981; +} +{ +cvt.rn.f16.f64 rs294, fd272; +} +mov.b32 r1989, {rs294, rs294}; +{ +mul.f16x2 r1987, r1078, r1989; +} +{ +add.f16x2 r1990, r1966, r1987; +} +{ +cvt.rn.f16.f64 rs295, fd291; +} +mov.b32 r1995, {rs295, rs295}; +{ +mul.f16x2 r1993, r1072, r1995; +} +{ +add.f16x2 r1996, r1972, r1993; +} +{ +cvt.rn.f16.f64 rs296, fd272; +} +mov.b32 r2001, {rs296, rs296}; +{ +mul.f16x2 r1999, r1075, r2001; +} +{ +add.f16x2 r2002, r1978, r1999; +} +{ +cvt.rn.f16.f64 rs297, fd307; +} +mov.b32 r2007, {rs297, rs297}; +{ +mul.f16x2 r2005, r1081, r2007; +} +{ +add.f16x2 r2008, r1984, r2005; +} +{ +cvt.rn.f16.f64 rs298, fd308; +} +mov.b32 r2013, {rs298, rs298}; +{ +mul.f16x2 r2011, r1090, r2013; +} +{ +add.f16x2 r2014, r1990, r2011; +} +{ +cvt.rn.f16.f64 rs299, fd307; +} +mov.b32 r2019, {rs299, rs299}; +{ +mul.f16x2 r2017, r1084, r2019; +} +{ +add.f16x2 r2020, r1996, r2017; +} +{ +cvt.rn.f16.f64 
rs300, fd308; +} +mov.b32 r2025, {rs300, rs300}; +{ +mul.f16x2 r2023, r1087, r2025; +} +{ +add.f16x2 r2026, r2002, r2023; +} +{ +cvt.rn.f16.f64 rs301, fd295; +} +mov.b32 r2031, {rs301, rs301}; +{ +mul.f16x2 r2029, r1093, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs302, fd280; +} +mov.b32 r2037, {rs302, rs302}; +{ +mul.f16x2 r2035, r1102, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs303, fd295; +} +mov.b32 r2043, {rs303, rs303}; +{ +mul.f16x2 r2041, r1096, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs304, fd280; +} +mov.b32 r2049, {rs304, rs304}; +{ +mul.f16x2 r2047, r1099, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 rs305, fd303; +} +mov.b32 r2055, {rs305, rs305}; +{ +mul.f16x2 r2053, r1105, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs306, fd304; +} +mov.b32 r2061, {rs306, rs306}; +{ +mul.f16x2 r2059, r1114, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs307, fd303; +} +mov.b32 r2067, {rs307, rs307}; +{ +mul.f16x2 r2065, r1108, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; +} +{ +cvt.rn.f16.f64 rs308, fd304; +} +mov.b32 r2073, {rs308, rs308}; +{ +mul.f16x2 r2071, r1111, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs309, fd299; +} +mov.b32 r2079, {rs309, rs309}; +{ +mul.f16x2 r2077, r1117, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs310, fd288; +} +mov.b32 r2085, {rs310, rs310}; +{ +mul.f16x2 r2083, r1126, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs311, fd299; +} +mov.b32 r2091, {rs311, rs311}; +{ +mul.f16x2 r2089, r1120, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs312, fd288; +} +mov.b32 r2097, {rs312, rs312}; +{ +mul.f16x2 r2095, r1123, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +sub.f16x2 r2101, r2080, r2086; +} +{ +add.f16x2 r2104, r2092, r2098; +} +{ +add.f16x2 r2107, r2080, r2086; +} +{ +sub.f16x2 r2110, r2092, 
r2098; +} +mov.f64 fd289, 0d3FEF11F493053D00; +{ +cvt.rn.f16.f64 rs313, fd289; +} +{ +cvt.rn.f16.f64 rs314, fd312; +} +{ +cvt.rn.f16.f64 rs315, fd291; +} +{ +cvt.rn.f16.f64 rs316, fd310; +} +mov.f64 fd293, 0d3FE7F3CCD0032E0C; +{ +cvt.rn.f16.f64 rs317, fd293; +} +{ +cvt.rn.f16.f64 rs318, fd308; +} +{ +cvt.rn.f16.f64 rs319, fd295; +} +{ +cvt.rn.f16.f64 rs320, fd306; +} +mov.f64 fd297, 0d3FD6B1D8B2365DA1; +{ +cvt.rn.f16.f64 rs321, fd297; +} +{ +cvt.rn.f16.f64 rs322, fd304; +} +{ +cvt.rn.f16.f64 rs323, fd299; +} +{ +cvt.rn.f16.f64 rs324, fd302; +} +mov.f64 fd301, 0dBFBEDB7DEBAA3ED8; +{ +cvt.rn.f16.f64 rs325, fd301; +} +{ +cvt.rn.f16.f64 rs326, fd302; +} +{ +cvt.rn.f16.f64 rs327, fd303; +} +{ +cvt.rn.f16.f64 rs328, fd304; +} +mov.f64 fd305, 0dBFE22D961EA71119; +{ +cvt.rn.f16.f64 rs329, fd305; +} +{ +cvt.rn.f16.f64 rs330, fd306; +} +{ +cvt.rn.f16.f64 rs331, fd307; +} +{ +cvt.rn.f16.f64 rs332, fd308; +} +mov.f64 fd309, 0dBFEC55A7E00740E9; +{ +cvt.rn.f16.f64 rs333, fd309; +} +{ +cvt.rn.f16.f64 rs334, fd310; +} +{ +cvt.rn.f16.f64 rs335, fd311; +} +{ +cvt.rn.f16.f64 rs336, fd312; +} +mov.b32 r2127, {rs313, rs313}; +{ +mul.f16x2 r2113, r1311, r2127; +} +mov.b32 r2124, {rs314, rs314}; +{ +mul.f16x2 r2116, r1314, r2124; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r1311, r2124; +} +{ +fma.rn.f16x2 r2125, r1314, r2127, r2122; +} +mov.b32 r2143, {rs315, rs315}; +{ +mul.f16x2 r2129, r1469, r2143; +} +mov.b32 r2140, {rs316, rs316}; +{ +mul.f16x2 r2132, r1472, r2140; +} +{ +sub.f16x2 r2135, r2129, r2132; +} +{ +mul.f16x2 r2138, r1469, r2140; +} +{ +fma.rn.f16x2 r2141, r1472, r2143, r2138; +} +mov.b32 r2159, {rs317, rs317}; +{ +mul.f16x2 r2145, r1627, r2159; +} +mov.b32 r2156, {rs318, rs318}; +{ +mul.f16x2 r2148, r1630, r2156; +} +{ +sub.f16x2 r2151, r2145, r2148; +} +{ +mul.f16x2 r2154, r1627, r2156; +} +{ +fma.rn.f16x2 r2157, r1630, r2159, r2154; +} +mov.b32 r2175, {rs319, rs319}; +{ +mul.f16x2 r2161, r1785, r2175; +} +mov.b32 r2172, {rs320, rs320}; +{ +mul.f16x2 
r2164, r1788, r2172; +} +{ +sub.f16x2 r2167, r2161, r2164; +} +{ +mul.f16x2 r2170, r1785, r2172; +} +{ +fma.rn.f16x2 r2173, r1788, r2175, r2170; +} +mov.b32 r2191, {rs321, rs321}; +{ +mul.f16x2 r2177, r1943, r2191; +} +mov.b32 r2188, {rs322, rs322}; +{ +mul.f16x2 r2180, r1946, r2188; +} +{ +sub.f16x2 r2183, r2177, r2180; +} +{ +mul.f16x2 r2186, r1943, r2188; +} +{ +fma.rn.f16x2 r2189, r1946, r2191, r2186; +} +mov.b32 r2207, {rs323, rs323}; +{ +mul.f16x2 r2193, r2101, r2207; +} +mov.b32 r2204, {rs324, rs324}; +{ +mul.f16x2 r2196, r2104, r2204; +} +{ +sub.f16x2 r2199, r2193, r2196; +} +{ +mul.f16x2 r2202, r2101, r2204; +} +{ +fma.rn.f16x2 r2205, r2104, r2207, r2202; +} +mov.b32 r2223, {rs325, rs325}; +{ +mul.f16x2 r2209, r2107, r2223; +} +mov.b32 r2220, {rs326, rs326}; +{ +mul.f16x2 r2212, r2110, r2220; +} +{ +sub.f16x2 r2215, r2209, r2212; +} +{ +mul.f16x2 r2218, r2107, r2220; +} +{ +fma.rn.f16x2 r2221, r2110, r2223, r2218; +} +mov.b32 r2239, {rs327, rs327}; +{ +mul.f16x2 r2225, r1949, r2239; +} +mov.b32 r2236, {rs328, rs328}; +{ +mul.f16x2 r2228, r1952, r2236; +} +{ +sub.f16x2 r2231, r2225, r2228; +} +{ +mul.f16x2 r2234, r1949, r2236; +} +{ +fma.rn.f16x2 r2237, r1952, r2239, r2234; +} +mov.b32 r2255, {rs329, rs329}; +{ +mul.f16x2 r2241, r1791, r2255; +} +mov.b32 r2252, {rs330, rs330}; +{ +mul.f16x2 r2244, r1794, r2252; +} +{ +sub.f16x2 r2247, r2241, r2244; +} +{ +mul.f16x2 r2250, r1791, r2252; +} +{ +fma.rn.f16x2 r2253, r1794, r2255, r2250; +} +mov.b32 r2271, {rs331, rs331}; +{ +mul.f16x2 r2257, r1633, r2271; +} +mov.b32 r2268, {rs332, rs332}; +{ +mul.f16x2 r2260, r1636, r2268; +} +{ +sub.f16x2 r2263, r2257, r2260; +} +{ +mul.f16x2 r2266, r1633, r2268; +} +{ +fma.rn.f16x2 r2269, r1636, r2271, r2266; +} +mov.b32 r2287, {rs333, rs333}; +{ +mul.f16x2 r2273, r1475, r2287; +} +mov.b32 r2284, {rs334, rs334}; +{ +mul.f16x2 r2276, r1478, r2284; +} +{ +sub.f16x2 r2279, r2273, r2276; +} +{ +mul.f16x2 r2282, r1475, r2284; +} +{ +fma.rn.f16x2 r2285, r1478, r2287, r2282; +} 
+mov.b32 r2303, {rs335, rs335}; +{ +mul.f16x2 r2289, r1317, r2303; +} +mov.b32 r2300, {rs336, rs336}; +{ +mul.f16x2 r2292, r1320, r2300; +} +{ +sub.f16x2 r2295, r2289, r2292; +} +{ +mul.f16x2 r2298, r1317, r2300; +} +{ +fma.rn.f16x2 r2301, r1320, r2303, r2298; +} +{ +add.f16x2 %0, r103, r1159; +} +{ +add.f16x2 %1, r106, r1162; +} +{ +sub.f16x2 %26, r103, r1159; +} +{ +sub.f16x2 %27, r106, r1162; +} +{ +add.f16x2 %2, r255, r2119; +} +{ +add.f16x2 %3, r258, r2125; +} +{ +sub.f16x2 %28, r255, r2119; +} +{ +sub.f16x2 %29, r258, r2125; +} +{ +add.f16x2 %4, r413, r2135; +} +{ +add.f16x2 %5, r416, r2141; +} +{ +sub.f16x2 %30, r413, r2135; +} +{ +sub.f16x2 %31, r416, r2141; +} +{ +add.f16x2 %6, r571, r2151; +} +{ +add.f16x2 %7, r574, r2157; +} +{ +sub.f16x2 %32, r571, r2151; +} +{ +sub.f16x2 %33, r574, r2157; +} +{ +add.f16x2 %8, r729, r2167; +} +{ +add.f16x2 %9, r732, r2173; +} +{ +sub.f16x2 %34, r729, r2167; +} +{ +sub.f16x2 %35, r732, r2173; +} +{ +add.f16x2 %10, r887, r2183; +} +{ +add.f16x2 %11, r890, r2189; +} +{ +sub.f16x2 %36, r887, r2183; +} +{ +sub.f16x2 %37, r890, r2189; +} +{ +add.f16x2 %12, r1045, r2199; +} +{ +add.f16x2 %13, r1048, r2205; +} +{ +sub.f16x2 %38, r1045, r2199; +} +{ +sub.f16x2 %39, r1048, r2205; +} +{ +add.f16x2 %14, r1051, r2215; +} +{ +add.f16x2 %15, r1054, r2221; +} +{ +sub.f16x2 %40, r1051, r2215; +} +{ +sub.f16x2 %41, r1054, r2221; +} +{ +add.f16x2 %16, r893, r2231; +} +{ +add.f16x2 %17, r896, r2237; +} +{ +sub.f16x2 %42, r893, r2231; +} +{ +sub.f16x2 %43, r896, r2237; +} +{ +add.f16x2 %18, r735, r2247; +} +{ +add.f16x2 %19, r738, r2253; +} +{ +sub.f16x2 %44, r735, r2247; +} +{ +sub.f16x2 %45, r738, r2253; +} +{ +add.f16x2 %20, r577, r2263; +} +{ +add.f16x2 %21, r580, r2269; +} +{ +sub.f16x2 %46, r577, r2263; +} +{ +sub.f16x2 %47, r580, r2269; +} +{ +add.f16x2 %22, r419, r2279; +} +{ +add.f16x2 %23, r422, r2285; +} +{ +sub.f16x2 %48, r419, r2279; +} +{ +sub.f16x2 %49, r422, r2285; +} +{ +add.f16x2 %24, r261, r2295; +} +{ +add.f16x2 %25, 
r264, r2301; +} +{ +sub.f16x2 %50, r261, r2295; +} +{ +sub.f16x2 %51, r264, r2301; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[23].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..6b106ed05553e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_fwd.hpp.inc @@ -0,0 +1,536 @@ +#ifndef CUFFTDX_FFT_26_FP32_FWD_PTX_HPP 
+#define CUFFTDX_FFT_26_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<11, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<734>; +.reg .b64 rd<7>; +add.f32 f105, %57, %101; +sub.f32 f107, %57, %101; +add.f32 f730, %105, %102; +sub.f32 f108, %105, %102; +add.f32 f109, %61, %97; +sub.f32 f111, %61, %97; +add.f32 f727, %106, %107; +sub.f32 f112, %106, %107; +add.f32 f113, %65, %93; +sub.f32 f115, %65, %93; +add.f32 f725, %66, %108; +sub.f32 f116, %66, %108; +add.f32 f117, %69, %89; +sub.f32 f119, %69, %89; +add.f32 f723, %109, %90; +sub.f32 f120, %109, %90; +add.f32 f121, %73, %85; +sub.f32 f123, %73, %85; +add.f32 f720, %110, %111; +sub.f32 f124, %110, %111; +add.f32 f125, %77, %81; +sub.f32 f127, %77, %81; +add.f32 f718, %78, %112; +sub.f32 f128, %78, %112; +add.f32 f129, %53, f105; +add.f32 f131, f129, f109; +add.f32 f717, %54, f730; +add.f32 f132, f717, f727; +add.f32 f133, f131, f113; +add.f32 f134, f132, f725; +add.f32 f135, f133, f117; +add.f32 f136, f134, f723; +add.f32 f137, f135, f121; +add.f32 f138, f136, f720; +add.f32 f139, f137, f125; +add.f32 f140, f138, f718; +fma.rn.f32 f141, f105, 0f3F62AD3F, %53; +fma.rn.f32 f145, f109, 0f3F116CB1, f141; +fma.rn.f32 f716, f108, 0fBEEDF032, 0f00000000; +fma.rn.f32 f146, f112, 0fBF52AF12, f716; +fma.rn.f32 f715, f730, 0f3F62AD3F, %54; +fma.rn.f32 f147, f727, 0f3F116CB1, f715; +fma.rn.f32 f714, f107, 0fBEEDF032, 0f00000000; +fma.rn.f32 f148, f111, 0fBF52AF12, f714; +fma.rn.f32 f149, f113, 0f3DF6DBEF, f145; +fma.rn.f32 f150, f116, 0fBF7E222B, f146; +fma.rn.f32 f151, f725, 0f3DF6DBEF, f147; +fma.rn.f32 f152, f115, 0fBF7E222B, f148; +fma.rn.f32 f153, f117, 0fBEB58EC6, f149; +fma.rn.f32 f154, f120, 0fBF6F5D39, f150; +fma.rn.f32 f155, f723, 0fBEB58EC6, f151; +fma.rn.f32 f156, f119, 0fBF6F5D39, f152; +fma.rn.f32 f157, f121, 0fBF3F9E67, f153; +fma.rn.f32 f158, f124, 0fBF29C268, f154; +fma.rn.f32 f159, f720, 0fBF3F9E67, f155; +fma.rn.f32 
f160, f123, 0fBF29C268, f156; +fma.rn.f32 f161, f125, 0fBF788FA5, f157; +fma.rn.f32 f162, f128, 0fBE750F2A, f158; +fma.rn.f32 f163, f718, 0fBF788FA5, f159; +fma.rn.f32 f164, f127, 0fBE750F2A, f160; +sub.f32 f165, f161, f162; +add.f32 f167, f162, f161; +add.f32 f713, f164, f163; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f105, 0f3F116CB1, %53; +fma.rn.f32 f173, f109, 0fBEB58EC6, f169; +fma.rn.f32 f712, f108, 0fBF52AF12, 0f00000000; +fma.rn.f32 f174, f112, 0fBF6F5D39, f712; +fma.rn.f32 f711, f730, 0f3F116CB1, %54; +fma.rn.f32 f175, f727, 0fBEB58EC6, f711; +fma.rn.f32 f710, f107, 0fBF52AF12, 0f00000000; +fma.rn.f32 f176, f111, 0fBF6F5D39, f710; +fma.rn.f32 f177, f113, 0fBF788FA5, f173; +fma.rn.f32 f178, f116, 0fBE750F2A, f174; +fma.rn.f32 f179, f725, 0fBF788FA5, f175; +fma.rn.f32 f180, f115, 0fBE750F2A, f176; +fma.rn.f32 f181, f117, 0fBF3F9E67, f177; +fma.rn.f32 f182, f120, 0f3F29C268, f178; +fma.rn.f32 f183, f723, 0fBF3F9E67, f179; +fma.rn.f32 f184, f119, 0f3F29C268, f180; +fma.rn.f32 f185, f121, 0f3DF6DBEF, f181; +fma.rn.f32 f186, f124, 0f3F7E222B, f182; +fma.rn.f32 f187, f720, 0f3DF6DBEF, f183; +fma.rn.f32 f188, f123, 0f3F7E222B, f184; +fma.rn.f32 f189, f125, 0f3F62AD3F, f185; +fma.rn.f32 f190, f128, 0f3EEDF032, f186; +fma.rn.f32 f191, f718, 0f3F62AD3F, f187; +fma.rn.f32 f192, f127, 0f3EEDF032, f188; +sub.f32 f193, f189, f190; +add.f32 f195, f190, f189; +add.f32 f709, f192, f191; +sub.f32 f196, f191, f192; +fma.rn.f32 f197, f105, 0f3DF6DBEF, %53; +fma.rn.f32 f201, f109, 0fBF788FA5, f197; +fma.rn.f32 f708, f108, 0fBF7E222B, 0f00000000; +fma.rn.f32 f202, f112, 0fBE750F2A, f708; +fma.rn.f32 f707, f730, 0f3DF6DBEF, %54; +fma.rn.f32 f203, f727, 0fBF788FA5, f707; +fma.rn.f32 f706, f107, 0fBF7E222B, 0f00000000; +fma.rn.f32 f204, f111, 0fBE750F2A, f706; +fma.rn.f32 f205, f113, 0fBEB58EC6, f201; +fma.rn.f32 f206, f116, 0f3F6F5D39, f202; +fma.rn.f32 f207, f725, 0fBEB58EC6, f203; +fma.rn.f32 f208, f115, 0f3F6F5D39, f204; +fma.rn.f32 f209, f117, 0f3F62AD3F, f205; +fma.rn.f32 
f210, f120, 0f3EEDF032, f206; +fma.rn.f32 f211, f723, 0f3F62AD3F, f207; +fma.rn.f32 f212, f119, 0f3EEDF032, f208; +fma.rn.f32 f213, f121, 0f3F116CB1, f209; +fma.rn.f32 f214, f124, 0fBF52AF12, f210; +fma.rn.f32 f215, f720, 0f3F116CB1, f211; +fma.rn.f32 f216, f123, 0fBF52AF12, f212; +fma.rn.f32 f217, f125, 0fBF3F9E67, f213; +fma.rn.f32 f218, f128, 0fBF29C268, f214; +fma.rn.f32 f219, f718, 0fBF3F9E67, f215; +fma.rn.f32 f220, f127, 0fBF29C268, f216; +sub.f32 f221, f217, f218; +add.f32 f223, f218, f217; +add.f32 f705, f220, f219; +sub.f32 f224, f219, f220; +fma.rn.f32 f225, f105, 0fBEB58EC6, %53; +fma.rn.f32 f229, f109, 0fBF3F9E67, f225; +fma.rn.f32 f704, f108, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f230, f112, 0f3F29C268, f704; +fma.rn.f32 f703, f730, 0fBEB58EC6, %54; +fma.rn.f32 f231, f727, 0fBF3F9E67, f703; +fma.rn.f32 f702, f107, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f232, f111, 0f3F29C268, f702; +fma.rn.f32 f233, f113, 0f3F62AD3F, f229; +fma.rn.f32 f234, f116, 0f3EEDF032, f230; +fma.rn.f32 f235, f725, 0f3F62AD3F, f231; +fma.rn.f32 f236, f115, 0f3EEDF032, f232; +fma.rn.f32 f237, f117, 0f3DF6DBEF, f233; +fma.rn.f32 f238, f120, 0fBF7E222B, f234; +fma.rn.f32 f239, f723, 0f3DF6DBEF, f235; +fma.rn.f32 f240, f119, 0fBF7E222B, f236; +fma.rn.f32 f241, f121, 0fBF788FA5, f237; +fma.rn.f32 f242, f124, 0f3E750F2A, f238; +fma.rn.f32 f243, f720, 0fBF788FA5, f239; +fma.rn.f32 f244, f123, 0f3E750F2A, f240; +fma.rn.f32 f245, f125, 0f3F116CB1, f241; +fma.rn.f32 f246, f128, 0f3F52AF12, f242; +fma.rn.f32 f247, f718, 0f3F116CB1, f243; +fma.rn.f32 f248, f127, 0f3F52AF12, f244; +sub.f32 f249, f245, f246; +add.f32 f251, f246, f245; +add.f32 f701, f248, f247; +sub.f32 f252, f247, f248; +fma.rn.f32 f253, f105, 0fBF3F9E67, %53; +fma.rn.f32 f257, f109, 0f3DF6DBEF, f253; +fma.rn.f32 f700, f108, 0fBF29C268, 0f00000000; +fma.rn.f32 f258, f112, 0f3F7E222B, f700; +fma.rn.f32 f699, f730, 0fBF3F9E67, %54; +fma.rn.f32 f259, f727, 0f3DF6DBEF, f699; +fma.rn.f32 f698, f107, 0fBF29C268, 0f00000000; +fma.rn.f32 
f260, f111, 0f3F7E222B, f698; +fma.rn.f32 f261, f113, 0f3F116CB1, f257; +fma.rn.f32 f262, f116, 0fBF52AF12, f258; +fma.rn.f32 f263, f725, 0f3F116CB1, f259; +fma.rn.f32 f264, f115, 0fBF52AF12, f260; +fma.rn.f32 f265, f117, 0fBF788FA5, f261; +fma.rn.f32 f266, f120, 0f3E750F2A, f262; +fma.rn.f32 f267, f723, 0fBF788FA5, f263; +fma.rn.f32 f268, f119, 0f3E750F2A, f264; +fma.rn.f32 f269, f121, 0f3F62AD3F, f265; +fma.rn.f32 f270, f124, 0f3EEDF032, f266; +fma.rn.f32 f271, f720, 0f3F62AD3F, f267; +fma.rn.f32 f272, f123, 0f3EEDF032, f268; +fma.rn.f32 f273, f125, 0fBEB58EC6, f269; +fma.rn.f32 f274, f128, 0fBF6F5D39, f270; +fma.rn.f32 f275, f718, 0fBEB58EC6, f271; +fma.rn.f32 f276, f127, 0fBF6F5D39, f272; +sub.f32 f277, f273, f274; +add.f32 f279, f274, f273; +add.f32 f697, f276, f275; +sub.f32 f280, f275, f276; +fma.rn.f32 f281, f105, 0fBF788FA5, %53; +fma.rn.f32 f282, f108, 0fBE750F2A, 0f00000000; +fma.rn.f32 f283, f730, 0fBF788FA5, %54; +fma.rn.f32 f284, f107, 0fBE750F2A, 0f00000000; +fma.rn.f32 f285, f109, 0f3F62AD3F, f281; +fma.rn.f32 f286, f112, 0f3EEDF032, f282; +fma.rn.f32 f287, f727, 0f3F62AD3F, f283; +fma.rn.f32 f288, f111, 0f3EEDF032, f284; +fma.rn.f32 f289, f113, 0fBF3F9E67, f285; +fma.rn.f32 f290, f116, 0fBF29C268, f286; +fma.rn.f32 f291, f725, 0fBF3F9E67, f287; +fma.rn.f32 f292, f115, 0fBF29C268, f288; +fma.rn.f32 f293, f117, 0f3F116CB1, f289; +fma.rn.f32 f294, f120, 0f3F52AF12, f290; +fma.rn.f32 f295, f723, 0f3F116CB1, f291; +fma.rn.f32 f296, f119, 0f3F52AF12, f292; +fma.rn.f32 f297, f121, 0fBEB58EC6, f293; +fma.rn.f32 f298, f124, 0fBF6F5D39, f294; +fma.rn.f32 f299, f720, 0fBEB58EC6, f295; +fma.rn.f32 f300, f123, 0fBF6F5D39, f296; +fma.rn.f32 f301, f125, 0f3DF6DBEF, f297; +fma.rn.f32 f302, f128, 0f3F7E222B, f298; +fma.rn.f32 f303, f718, 0f3DF6DBEF, f299; +fma.rn.f32 f304, f127, 0f3F7E222B, f300; +sub.f32 f305, f301, f302; +add.f32 f307, f302, f301; +add.f32 f696, f304, f303; +sub.f32 f308, f303, f304; +add.f32 f309, %59, %103; +sub.f32 f311, %59, %103; +add.f32 
f695, %60, %104; +sub.f32 f312, %60, %104; +add.f32 f313, %63, %99; +sub.f32 f315, %63, %99; +add.f32 f692, %114, %113; +sub.f32 f316, %114, %113; +add.f32 f317, %67, %95; +sub.f32 f319, %67, %95; +add.f32 f690, %115, %96; +sub.f32 f320, %115, %96; +add.f32 f321, %71, %91; +sub.f32 f323, %71, %91; +add.f32 f688, %72, %116; +sub.f32 f324, %72, %116; +add.f32 f325, %75, %87; +sub.f32 f327, %75, %87; +add.f32 f685, %117, %118; +sub.f32 f328, %117, %118; +add.f32 f329, %79, %83; +sub.f32 f331, %79, %83; +add.f32 f683, %119, %84; +sub.f32 f332, %119, %84; +add.f32 f333, %55, f309; +add.f32 f335, f333, f313; +add.f32 f681, %120, f695; +add.f32 f336, f681, f692; +add.f32 f337, f335, f317; +add.f32 f338, f336, f690; +add.f32 f339, f337, f321; +add.f32 f340, f338, f688; +add.f32 f341, f339, f325; +add.f32 f342, f340, f685; +add.f32 f343, f341, f329; +add.f32 f344, f342, f683; +fma.rn.f32 f345, f309, 0f3F62AD3F, %55; +fma.rn.f32 f349, f313, 0f3F116CB1, f345; +fma.rn.f32 f680, f312, 0fBEEDF032, 0f00000000; +fma.rn.f32 f350, f316, 0fBF52AF12, f680; +fma.rn.f32 f679, f695, 0f3F62AD3F, %120; +fma.rn.f32 f351, f692, 0f3F116CB1, f679; +fma.rn.f32 f678, f311, 0fBEEDF032, 0f00000000; +fma.rn.f32 f352, f315, 0fBF52AF12, f678; +fma.rn.f32 f353, f317, 0f3DF6DBEF, f349; +fma.rn.f32 f354, f320, 0fBF7E222B, f350; +fma.rn.f32 f355, f690, 0f3DF6DBEF, f351; +fma.rn.f32 f356, f319, 0fBF7E222B, f352; +fma.rn.f32 f357, f321, 0fBEB58EC6, f353; +fma.rn.f32 f358, f324, 0fBF6F5D39, f354; +fma.rn.f32 f359, f688, 0fBEB58EC6, f355; +fma.rn.f32 f360, f323, 0fBF6F5D39, f356; +fma.rn.f32 f361, f325, 0fBF3F9E67, f357; +fma.rn.f32 f362, f328, 0fBF29C268, f358; +fma.rn.f32 f363, f685, 0fBF3F9E67, f359; +fma.rn.f32 f364, f327, 0fBF29C268, f360; +fma.rn.f32 f365, f329, 0fBF788FA5, f361; +fma.rn.f32 f366, f332, 0fBE750F2A, f362; +fma.rn.f32 f367, f683, 0fBF788FA5, f363; +fma.rn.f32 f368, f331, 0fBE750F2A, f364; +sub.f32 f369, f365, f366; +add.f32 f371, f366, f365; +add.f32 f677, f368, f367; +sub.f32 f372, 
f367, f368; +fma.rn.f32 f373, f309, 0f3F116CB1, %55; +fma.rn.f32 f377, f313, 0fBEB58EC6, f373; +fma.rn.f32 f676, f312, 0fBF52AF12, 0f00000000; +fma.rn.f32 f378, f316, 0fBF6F5D39, f676; +fma.rn.f32 f675, f695, 0f3F116CB1, %120; +fma.rn.f32 f379, f692, 0fBEB58EC6, f675; +fma.rn.f32 f674, f311, 0fBF52AF12, 0f00000000; +fma.rn.f32 f380, f315, 0fBF6F5D39, f674; +fma.rn.f32 f381, f317, 0fBF788FA5, f377; +fma.rn.f32 f382, f320, 0fBE750F2A, f378; +fma.rn.f32 f383, f690, 0fBF788FA5, f379; +fma.rn.f32 f384, f319, 0fBE750F2A, f380; +fma.rn.f32 f385, f321, 0fBF3F9E67, f381; +fma.rn.f32 f386, f324, 0f3F29C268, f382; +fma.rn.f32 f387, f688, 0fBF3F9E67, f383; +fma.rn.f32 f388, f323, 0f3F29C268, f384; +fma.rn.f32 f389, f325, 0f3DF6DBEF, f385; +fma.rn.f32 f390, f328, 0f3F7E222B, f386; +fma.rn.f32 f391, f685, 0f3DF6DBEF, f387; +fma.rn.f32 f392, f327, 0f3F7E222B, f388; +fma.rn.f32 f393, f329, 0f3F62AD3F, f389; +fma.rn.f32 f394, f332, 0f3EEDF032, f390; +fma.rn.f32 f395, f683, 0f3F62AD3F, f391; +fma.rn.f32 f396, f331, 0f3EEDF032, f392; +sub.f32 f397, f393, f394; +add.f32 f399, f394, f393; +add.f32 f673, f396, f395; +sub.f32 f400, f395, f396; +fma.rn.f32 f401, f309, 0f3DF6DBEF, %55; +fma.rn.f32 f405, f313, 0fBF788FA5, f401; +fma.rn.f32 f672, f312, 0fBF7E222B, 0f00000000; +fma.rn.f32 f406, f316, 0fBE750F2A, f672; +fma.rn.f32 f671, f695, 0f3DF6DBEF, %120; +fma.rn.f32 f407, f692, 0fBF788FA5, f671; +fma.rn.f32 f670, f311, 0fBF7E222B, 0f00000000; +fma.rn.f32 f408, f315, 0fBE750F2A, f670; +fma.rn.f32 f409, f317, 0fBEB58EC6, f405; +fma.rn.f32 f410, f320, 0f3F6F5D39, f406; +fma.rn.f32 f411, f690, 0fBEB58EC6, f407; +fma.rn.f32 f412, f319, 0f3F6F5D39, f408; +fma.rn.f32 f413, f321, 0f3F62AD3F, f409; +fma.rn.f32 f414, f324, 0f3EEDF032, f410; +fma.rn.f32 f415, f688, 0f3F62AD3F, f411; +fma.rn.f32 f416, f323, 0f3EEDF032, f412; +fma.rn.f32 f417, f325, 0f3F116CB1, f413; +fma.rn.f32 f418, f328, 0fBF52AF12, f414; +fma.rn.f32 f419, f685, 0f3F116CB1, f415; +fma.rn.f32 f420, f327, 0fBF52AF12, f416; 
+fma.rn.f32 f421, f329, 0fBF3F9E67, f417; +fma.rn.f32 f422, f332, 0fBF29C268, f418; +fma.rn.f32 f423, f683, 0fBF3F9E67, f419; +fma.rn.f32 f424, f331, 0fBF29C268, f420; +sub.f32 f425, f421, f422; +add.f32 f427, f422, f421; +add.f32 f669, f424, f423; +sub.f32 f428, f423, f424; +fma.rn.f32 f429, f309, 0fBEB58EC6, %55; +fma.rn.f32 f433, f313, 0fBF3F9E67, f429; +fma.rn.f32 f668, f312, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f434, f316, 0f3F29C268, f668; +fma.rn.f32 f667, f695, 0fBEB58EC6, %120; +fma.rn.f32 f435, f692, 0fBF3F9E67, f667; +fma.rn.f32 f666, f311, 0fBF6F5D39, 0f00000000; +fma.rn.f32 f436, f315, 0f3F29C268, f666; +fma.rn.f32 f437, f317, 0f3F62AD3F, f433; +fma.rn.f32 f438, f320, 0f3EEDF032, f434; +fma.rn.f32 f439, f690, 0f3F62AD3F, f435; +fma.rn.f32 f440, f319, 0f3EEDF032, f436; +fma.rn.f32 f441, f321, 0f3DF6DBEF, f437; +fma.rn.f32 f442, f324, 0fBF7E222B, f438; +fma.rn.f32 f443, f688, 0f3DF6DBEF, f439; +fma.rn.f32 f444, f323, 0fBF7E222B, f440; +fma.rn.f32 f445, f325, 0fBF788FA5, f441; +fma.rn.f32 f446, f328, 0f3E750F2A, f442; +fma.rn.f32 f447, f685, 0fBF788FA5, f443; +fma.rn.f32 f448, f327, 0f3E750F2A, f444; +fma.rn.f32 f449, f329, 0f3F116CB1, f445; +fma.rn.f32 f450, f332, 0f3F52AF12, f446; +fma.rn.f32 f451, f683, 0f3F116CB1, f447; +fma.rn.f32 f452, f331, 0f3F52AF12, f448; +sub.f32 f453, f449, f450; +add.f32 f455, f450, f449; +add.f32 f665, f452, f451; +sub.f32 f456, f451, f452; +fma.rn.f32 f457, f309, 0fBF3F9E67, %55; +fma.rn.f32 f461, f313, 0f3DF6DBEF, f457; +fma.rn.f32 f664, f312, 0fBF29C268, 0f00000000; +fma.rn.f32 f462, f316, 0f3F7E222B, f664; +fma.rn.f32 f663, f695, 0fBF3F9E67, %120; +fma.rn.f32 f463, f692, 0f3DF6DBEF, f663; +fma.rn.f32 f662, f311, 0fBF29C268, 0f00000000; +fma.rn.f32 f464, f315, 0f3F7E222B, f662; +fma.rn.f32 f465, f317, 0f3F116CB1, f461; +fma.rn.f32 f466, f320, 0fBF52AF12, f462; +fma.rn.f32 f467, f690, 0f3F116CB1, f463; +fma.rn.f32 f468, f319, 0fBF52AF12, f464; +fma.rn.f32 f469, f321, 0fBF788FA5, f465; +fma.rn.f32 f470, f324, 0f3E750F2A, 
f466; +fma.rn.f32 f471, f688, 0fBF788FA5, f467; +fma.rn.f32 f472, f323, 0f3E750F2A, f468; +fma.rn.f32 f473, f325, 0f3F62AD3F, f469; +fma.rn.f32 f474, f328, 0f3EEDF032, f470; +fma.rn.f32 f475, f685, 0f3F62AD3F, f471; +fma.rn.f32 f476, f327, 0f3EEDF032, f472; +fma.rn.f32 f477, f329, 0fBEB58EC6, f473; +fma.rn.f32 f478, f332, 0fBF6F5D39, f474; +fma.rn.f32 f479, f683, 0fBEB58EC6, f475; +fma.rn.f32 f480, f331, 0fBF6F5D39, f476; +sub.f32 f481, f477, f478; +add.f32 f483, f478, f477; +add.f32 f661, f480, f479; +sub.f32 f484, f479, f480; +fma.rn.f32 f485, f309, 0fBF788FA5, %55; +fma.rn.f32 f486, f312, 0fBE750F2A, 0f00000000; +fma.rn.f32 f487, f695, 0fBF788FA5, %52; +fma.rn.f32 f488, f311, 0fBE750F2A, 0f00000000; +fma.rn.f32 f489, f313, 0f3F62AD3F, f485; +fma.rn.f32 f490, f316, 0f3EEDF032, f486; +fma.rn.f32 f491, f692, 0f3F62AD3F, f487; +fma.rn.f32 f492, f315, 0f3EEDF032, f488; +fma.rn.f32 f493, f317, 0fBF3F9E67, f489; +fma.rn.f32 f494, f320, 0fBF29C268, f490; +fma.rn.f32 f495, f690, 0fBF3F9E67, f491; +fma.rn.f32 f496, f319, 0fBF29C268, f492; +fma.rn.f32 f497, f321, 0f3F116CB1, f493; +fma.rn.f32 f498, f324, 0f3F52AF12, f494; +fma.rn.f32 f499, f688, 0f3F116CB1, f495; +fma.rn.f32 f500, f323, 0f3F52AF12, f496; +fma.rn.f32 f501, f325, 0fBEB58EC6, f497; +fma.rn.f32 f502, f328, 0fBF6F5D39, f498; +fma.rn.f32 f503, f685, 0fBEB58EC6, f499; +fma.rn.f32 f504, f327, 0fBF6F5D39, f500; +fma.rn.f32 f505, f329, 0f3DF6DBEF, f501; +fma.rn.f32 f506, f332, 0f3F7E222B, f502; +fma.rn.f32 f507, f683, 0f3DF6DBEF, f503; +fma.rn.f32 f508, f331, 0f3F7E222B, f504; +sub.f32 f509, f505, f506; +add.f32 f511, f506, f505; +add.f32 f660, f508, f507; +sub.f32 f512, f507, f508; +mul.f32 f514, f677, 0fBE750F2A; +mul.f32 f659, f369, 0f3F788FA5; +sub.f32 f515, f659, f514; +mul.f32 f516, f677, 0f3F788FA5; +fma.rn.f32 f517, f369, 0fBE750F2A, f516; +mul.f32 f519, f673, 0fBEEDF032; +mul.f32 f658, f397, 0f3F62AD3F; +sub.f32 f520, f658, f519; +mul.f32 f521, f673, 0f3F62AD3F; +fma.rn.f32 f522, f397, 0fBEEDF032, f521; 
+mul.f32 f656, f425, 0f3F3F9E67; +mul.f32 f657, f669, 0fBF29C268; +sub.f32 f525, f656, f657; +mul.f32 f526, f669, 0f3F3F9E67; +fma.rn.f32 f527, f425, 0fBF29C268, f526; +mul.f32 f654, f453, 0f3F116CB1; +mul.f32 f655, f665, 0fBF52AF12; +sub.f32 f530, f654, f655; +mul.f32 f531, f665, 0f3F116CB1; +fma.rn.f32 f532, f453, 0fBF52AF12, f531; +mul.f32 f652, f481, 0f3EB58EC6; +mul.f32 f653, f661, 0fBF6F5D39; +sub.f32 f535, f652, f653; +mul.f32 f536, f661, 0f3EB58EC6; +fma.rn.f32 f537, f481, 0fBF6F5D39, f536; +mul.f32 f650, f509, 0f3DF6DBEF; +mul.f32 f651, f660, 0fBF7E222B; +sub.f32 f540, f650, f651; +mul.f32 f541, f660, 0f3DF6DBEF; +fma.rn.f32 f542, f509, 0fBF7E222B, f541; +mul.f32 f544, f512, 0fBF7E222B; +mul.f32 f649, f511, 0fBDF6DBEF; +sub.f32 f545, f649, f544; +mul.f32 f546, f512, 0fBDF6DBEF; +fma.rn.f32 f547, f511, 0fBF7E222B, f546; +mul.f32 f549, f484, 0fBF6F5D39; +mul.f32 f648, f483, 0fBEB58EC6; +sub.f32 f550, f648, f549; +mul.f32 f551, f484, 0fBEB58EC6; +fma.rn.f32 f552, f483, 0fBF6F5D39, f551; +mul.f32 f554, f456, 0fBF52AF12; +mul.f32 f647, f455, 0fBF116CB1; +sub.f32 f555, f647, f554; +mul.f32 f556, f456, 0fBF116CB1; +fma.rn.f32 f557, f455, 0fBF52AF12, f556; +mul.f32 f559, f428, 0fBF29C268; +mul.f32 f646, f427, 0fBF3F9E67; +sub.f32 f560, f646, f559; +mul.f32 f561, f428, 0fBF3F9E67; +fma.rn.f32 f562, f427, 0fBF29C268, f561; +mul.f32 f564, f400, 0fBEEDF032; +mul.f32 f645, f399, 0fBF62AD3F; +sub.f32 f565, f645, f564; +mul.f32 f566, f400, 0fBF62AD3F; +fma.rn.f32 f567, f399, 0fBEEDF032, f566; +mul.f32 f643, f371, 0fBF788FA5; +mul.f32 f644, f372, 0fBE750F2A; +sub.f32 f570, f643, f644; +mul.f32 f571, f372, 0fBF788FA5; +fma.rn.f32 f572, f371, 0fBE750F2A, f571; +add.f32 %1, f140, f344; +add.f32 %0, f139, f343; +add.f32 %2, f165, f515; +add.f32 %3, f713, f517; +add.f32 %4, f193, f520; +add.f32 %5, f709, f522; +add.f32 %6, f221, f525; +add.f32 %7, f705, f527; +add.f32 %8, f249, f530; +add.f32 %9, f701, f532; +add.f32 %11, f697, f537; +add.f32 %10, f277, f535; +add.f32 %13, 
f696, f542; +add.f32 %12, f305, f540; +add.f32 %15, f308, f547; +add.f32 %14, f307, f545; +add.f32 %16, f279, f550; +add.f32 %17, f280, f552; +add.f32 %18, f251, f555; +add.f32 %19, f252, f557; +add.f32 %20, f223, f560; +add.f32 %21, f224, f562; +add.f32 %22, f195, f565; +add.f32 %23, f196, f567; +add.f32 %25, f168, f572; +add.f32 %24, f167, f570; +sub.f32 %27, f140, f344; +sub.f32 %26, f139, f343; +sub.f32 %29, f713, f517; +sub.f32 %28, f165, f515; +sub.f32 %31, f709, f522; +sub.f32 %30, f193, f520; +sub.f32 %33, f705, f527; +sub.f32 %32, f221, f525; +sub.f32 %35, f701, f532; +sub.f32 %34, f249, f530; +sub.f32 %37, f697, f537; +sub.f32 %36, f277, f535; +sub.f32 %39, f696, f542; +sub.f32 %38, f305, f540; +sub.f32 %41, f308, f547; +sub.f32 %40, f307, f545; +sub.f32 %43, f280, f552; +sub.f32 %42, f279, f550; +sub.f32 %45, f252, f557; +sub.f32 %44, f251, f555; +sub.f32 %47, f224, f562; +sub.f32 %46, f223, f560; +sub.f32 %49, f196, f567; +sub.f32 %48, f195, f565; +sub.f32 %51, f168, f572; +sub.f32 %50, f167, f570; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y): "f"(rmem[1].y), "f"(rmem[0].x), "f"(rmem[0].y), 
"f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[2].y), "f"(rmem[4].y), "f"(rmem[22].y), "f"(rmem[20].y), "f"(rmem[8].y), "f"(rmem[10].y), "f"(rmem[16].y), "f"(rmem[14].y), "f"(rmem[23].y), "f"(rmem[5].y), "f"(rmem[7].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[17].y), "f"(rmem[13].y), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..7103d1ada4a16 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp32_inv.hpp.inc @@ -0,0 +1,536 @@ +#ifndef CUFFTDX_FFT_26_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_26_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<213, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<734>; +.reg .b64 rd<7>; +add.f32 f105, %57, %101; +sub.f32 f107, %57, %101; +add.f32 f730, %105, %102; +sub.f32 f108, %105, %102; +add.f32 f109, 
%61, %97; +sub.f32 f111, %61, %97; +add.f32 f727, %106, %107; +sub.f32 f112, %106, %107; +add.f32 f113, %65, %93; +sub.f32 f115, %65, %93; +add.f32 f725, %66, %108; +sub.f32 f116, %66, %108; +add.f32 f117, %69, %89; +sub.f32 f119, %69, %89; +add.f32 f723, %109, %90; +sub.f32 f120, %109, %90; +add.f32 f121, %73, %85; +sub.f32 f123, %73, %85; +add.f32 f720, %110, %111; +sub.f32 f124, %110, %111; +add.f32 f125, %77, %81; +sub.f32 f127, %77, %81; +add.f32 f718, %78, %112; +sub.f32 f128, %78, %112; +add.f32 f129, %53, f105; +add.f32 f131, f129, f109; +add.f32 f717, %54, f730; +add.f32 f132, f717, f727; +add.f32 f133, f131, f113; +add.f32 f134, f132, f725; +add.f32 f135, f133, f117; +add.f32 f136, f134, f723; +add.f32 f137, f135, f121; +add.f32 f138, f136, f720; +add.f32 f139, f137, f125; +add.f32 f140, f138, f718; +fma.rn.f32 f141, f105, 0f3F62AD3F, %53; +fma.rn.f32 f145, f109, 0f3F116CB1, f141; +fma.rn.f32 f716, f108, 0f3EEDF032, 0f00000000; +fma.rn.f32 f146, f112, 0f3F52AF12, f716; +fma.rn.f32 f715, f730, 0f3F62AD3F, %54; +fma.rn.f32 f147, f727, 0f3F116CB1, f715; +fma.rn.f32 f714, f107, 0f3EEDF032, 0f00000000; +fma.rn.f32 f148, f111, 0f3F52AF12, f714; +fma.rn.f32 f149, f113, 0f3DF6DBEF, f145; +fma.rn.f32 f150, f116, 0f3F7E222B, f146; +fma.rn.f32 f151, f725, 0f3DF6DBEF, f147; +fma.rn.f32 f152, f115, 0f3F7E222B, f148; +fma.rn.f32 f153, f117, 0fBEB58EC6, f149; +fma.rn.f32 f154, f120, 0f3F6F5D39, f150; +fma.rn.f32 f155, f723, 0fBEB58EC6, f151; +fma.rn.f32 f156, f119, 0f3F6F5D39, f152; +fma.rn.f32 f157, f121, 0fBF3F9E67, f153; +fma.rn.f32 f158, f124, 0f3F29C268, f154; +fma.rn.f32 f159, f720, 0fBF3F9E67, f155; +fma.rn.f32 f160, f123, 0f3F29C268, f156; +fma.rn.f32 f161, f125, 0fBF788FA5, f157; +fma.rn.f32 f162, f128, 0f3E750F2A, f158; +fma.rn.f32 f163, f718, 0fBF788FA5, f159; +fma.rn.f32 f164, f127, 0f3E750F2A, f160; +sub.f32 f165, f161, f162; +add.f32 f167, f162, f161; +add.f32 f713, f164, f163; +sub.f32 f168, f163, f164; +fma.rn.f32 f169, f105, 0f3F116CB1, %53; +fma.rn.f32 
f173, f109, 0fBEB58EC6, f169; +fma.rn.f32 f712, f108, 0f3F52AF12, 0f00000000; +fma.rn.f32 f174, f112, 0f3F6F5D39, f712; +fma.rn.f32 f711, f730, 0f3F116CB1, %54; +fma.rn.f32 f175, f727, 0fBEB58EC6, f711; +fma.rn.f32 f710, f107, 0f3F52AF12, 0f00000000; +fma.rn.f32 f176, f111, 0f3F6F5D39, f710; +fma.rn.f32 f177, f113, 0fBF788FA5, f173; +fma.rn.f32 f178, f116, 0f3E750F2A, f174; +fma.rn.f32 f179, f725, 0fBF788FA5, f175; +fma.rn.f32 f180, f115, 0f3E750F2A, f176; +fma.rn.f32 f181, f117, 0fBF3F9E67, f177; +fma.rn.f32 f182, f120, 0fBF29C268, f178; +fma.rn.f32 f183, f723, 0fBF3F9E67, f179; +fma.rn.f32 f184, f119, 0fBF29C268, f180; +fma.rn.f32 f185, f121, 0f3DF6DBEF, f181; +fma.rn.f32 f186, f124, 0fBF7E222B, f182; +fma.rn.f32 f187, f720, 0f3DF6DBEF, f183; +fma.rn.f32 f188, f123, 0fBF7E222B, f184; +fma.rn.f32 f189, f125, 0f3F62AD3F, f185; +fma.rn.f32 f190, f128, 0fBEEDF032, f186; +fma.rn.f32 f191, f718, 0f3F62AD3F, f187; +fma.rn.f32 f192, f127, 0fBEEDF032, f188; +sub.f32 f193, f189, f190; +add.f32 f195, f190, f189; +add.f32 f709, f192, f191; +sub.f32 f196, f191, f192; +fma.rn.f32 f197, f105, 0f3DF6DBEF, %53; +fma.rn.f32 f201, f109, 0fBF788FA5, f197; +fma.rn.f32 f708, f108, 0f3F7E222B, 0f00000000; +fma.rn.f32 f202, f112, 0f3E750F2A, f708; +fma.rn.f32 f707, f730, 0f3DF6DBEF, %54; +fma.rn.f32 f203, f727, 0fBF788FA5, f707; +fma.rn.f32 f706, f107, 0f3F7E222B, 0f00000000; +fma.rn.f32 f204, f111, 0f3E750F2A, f706; +fma.rn.f32 f205, f113, 0fBEB58EC6, f201; +fma.rn.f32 f206, f116, 0fBF6F5D39, f202; +fma.rn.f32 f207, f725, 0fBEB58EC6, f203; +fma.rn.f32 f208, f115, 0fBF6F5D39, f204; +fma.rn.f32 f209, f117, 0f3F62AD3F, f205; +fma.rn.f32 f210, f120, 0fBEEDF032, f206; +fma.rn.f32 f211, f723, 0f3F62AD3F, f207; +fma.rn.f32 f212, f119, 0fBEEDF032, f208; +fma.rn.f32 f213, f121, 0f3F116CB1, f209; +fma.rn.f32 f214, f124, 0f3F52AF12, f210; +fma.rn.f32 f215, f720, 0f3F116CB1, f211; +fma.rn.f32 f216, f123, 0f3F52AF12, f212; +fma.rn.f32 f217, f125, 0fBF3F9E67, f213; +fma.rn.f32 f218, f128, 
0f3F29C268, f214; +fma.rn.f32 f219, f718, 0fBF3F9E67, f215; +fma.rn.f32 f220, f127, 0f3F29C268, f216; +sub.f32 f221, f217, f218; +add.f32 f223, f218, f217; +add.f32 f705, f220, f219; +sub.f32 f224, f219, f220; +fma.rn.f32 f225, f105, 0fBEB58EC6, %53; +fma.rn.f32 f229, f109, 0fBF3F9E67, f225; +fma.rn.f32 f704, f108, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f230, f112, 0fBF29C268, f704; +fma.rn.f32 f703, f730, 0fBEB58EC6, %54; +fma.rn.f32 f231, f727, 0fBF3F9E67, f703; +fma.rn.f32 f702, f107, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f232, f111, 0fBF29C268, f702; +fma.rn.f32 f233, f113, 0f3F62AD3F, f229; +fma.rn.f32 f234, f116, 0fBEEDF032, f230; +fma.rn.f32 f235, f725, 0f3F62AD3F, f231; +fma.rn.f32 f236, f115, 0fBEEDF032, f232; +fma.rn.f32 f237, f117, 0f3DF6DBEF, f233; +fma.rn.f32 f238, f120, 0f3F7E222B, f234; +fma.rn.f32 f239, f723, 0f3DF6DBEF, f235; +fma.rn.f32 f240, f119, 0f3F7E222B, f236; +fma.rn.f32 f241, f121, 0fBF788FA5, f237; +fma.rn.f32 f242, f124, 0fBE750F2A, f238; +fma.rn.f32 f243, f720, 0fBF788FA5, f239; +fma.rn.f32 f244, f123, 0fBE750F2A, f240; +fma.rn.f32 f245, f125, 0f3F116CB1, f241; +fma.rn.f32 f246, f128, 0fBF52AF12, f242; +fma.rn.f32 f247, f718, 0f3F116CB1, f243; +fma.rn.f32 f248, f127, 0fBF52AF12, f244; +sub.f32 f249, f245, f246; +add.f32 f251, f246, f245; +add.f32 f701, f248, f247; +sub.f32 f252, f247, f248; +fma.rn.f32 f253, f105, 0fBF3F9E67, %53; +fma.rn.f32 f257, f109, 0f3DF6DBEF, f253; +fma.rn.f32 f700, f108, 0f3F29C268, 0f00000000; +fma.rn.f32 f258, f112, 0fBF7E222B, f700; +fma.rn.f32 f699, f730, 0fBF3F9E67, %54; +fma.rn.f32 f259, f727, 0f3DF6DBEF, f699; +fma.rn.f32 f698, f107, 0f3F29C268, 0f00000000; +fma.rn.f32 f260, f111, 0fBF7E222B, f698; +fma.rn.f32 f261, f113, 0f3F116CB1, f257; +fma.rn.f32 f262, f116, 0f3F52AF12, f258; +fma.rn.f32 f263, f725, 0f3F116CB1, f259; +fma.rn.f32 f264, f115, 0f3F52AF12, f260; +fma.rn.f32 f265, f117, 0fBF788FA5, f261; +fma.rn.f32 f266, f120, 0fBE750F2A, f262; +fma.rn.f32 f267, f723, 0fBF788FA5, f263; +fma.rn.f32 f268, f119, 
0fBE750F2A, f264; +fma.rn.f32 f269, f121, 0f3F62AD3F, f265; +fma.rn.f32 f270, f124, 0fBEEDF032, f266; +fma.rn.f32 f271, f720, 0f3F62AD3F, f267; +fma.rn.f32 f272, f123, 0fBEEDF032, f268; +fma.rn.f32 f273, f125, 0fBEB58EC6, f269; +fma.rn.f32 f274, f128, 0f3F6F5D39, f270; +fma.rn.f32 f275, f718, 0fBEB58EC6, f271; +fma.rn.f32 f276, f127, 0f3F6F5D39, f272; +sub.f32 f277, f273, f274; +add.f32 f279, f274, f273; +add.f32 f697, f276, f275; +sub.f32 f280, f275, f276; +fma.rn.f32 f281, f105, 0fBF788FA5, %53; +fma.rn.f32 f282, f108, 0f3E750F2A, 0f00000000; +fma.rn.f32 f283, f730, 0fBF788FA5, %54; +fma.rn.f32 f284, f107, 0f3E750F2A, 0f00000000; +fma.rn.f32 f285, f109, 0f3F62AD3F, f281; +fma.rn.f32 f286, f112, 0fBEEDF032, f282; +fma.rn.f32 f287, f727, 0f3F62AD3F, f283; +fma.rn.f32 f288, f111, 0fBEEDF032, f284; +fma.rn.f32 f289, f113, 0fBF3F9E67, f285; +fma.rn.f32 f290, f116, 0f3F29C268, f286; +fma.rn.f32 f291, f725, 0fBF3F9E67, f287; +fma.rn.f32 f292, f115, 0f3F29C268, f288; +fma.rn.f32 f293, f117, 0f3F116CB1, f289; +fma.rn.f32 f294, f120, 0fBF52AF12, f290; +fma.rn.f32 f295, f723, 0f3F116CB1, f291; +fma.rn.f32 f296, f119, 0fBF52AF12, f292; +fma.rn.f32 f297, f121, 0fBEB58EC6, f293; +fma.rn.f32 f298, f124, 0f3F6F5D39, f294; +fma.rn.f32 f299, f720, 0fBEB58EC6, f295; +fma.rn.f32 f300, f123, 0f3F6F5D39, f296; +fma.rn.f32 f301, f125, 0f3DF6DBEF, f297; +fma.rn.f32 f302, f128, 0fBF7E222B, f298; +fma.rn.f32 f303, f718, 0f3DF6DBEF, f299; +fma.rn.f32 f304, f127, 0fBF7E222B, f300; +sub.f32 f305, f301, f302; +add.f32 f307, f302, f301; +add.f32 f696, f304, f303; +sub.f32 f308, f303, f304; +add.f32 f309, %59, %103; +sub.f32 f311, %59, %103; +add.f32 f695, %60, %104; +sub.f32 f312, %60, %104; +add.f32 f313, %63, %99; +sub.f32 f315, %63, %99; +add.f32 f692, %114, %113; +sub.f32 f316, %114, %113; +add.f32 f317, %67, %95; +sub.f32 f319, %67, %95; +add.f32 f690, %115, %96; +sub.f32 f320, %115, %96; +add.f32 f321, %71, %91; +sub.f32 f323, %71, %91; +add.f32 f688, %72, %116; +sub.f32 f324, %72, %116; 
+add.f32 f325, %75, %87; +sub.f32 f327, %75, %87; +add.f32 f685, %117, %118; +sub.f32 f328, %117, %118; +add.f32 f329, %79, %83; +sub.f32 f331, %79, %83; +add.f32 f683, %119, %84; +sub.f32 f332, %119, %84; +add.f32 f333, %55, f309; +add.f32 f335, f333, f313; +add.f32 f681, %120, f695; +add.f32 f336, f681, f692; +add.f32 f337, f335, f317; +add.f32 f338, f336, f690; +add.f32 f339, f337, f321; +add.f32 f340, f338, f688; +add.f32 f341, f339, f325; +add.f32 f342, f340, f685; +add.f32 f343, f341, f329; +add.f32 f344, f342, f683; +fma.rn.f32 f345, f309, 0f3F62AD3F, %55; +fma.rn.f32 f349, f313, 0f3F116CB1, f345; +fma.rn.f32 f680, f312, 0f3EEDF032, 0f00000000; +fma.rn.f32 f350, f316, 0f3F52AF12, f680; +fma.rn.f32 f679, f695, 0f3F62AD3F, %120; +fma.rn.f32 f351, f692, 0f3F116CB1, f679; +fma.rn.f32 f678, f311, 0f3EEDF032, 0f00000000; +fma.rn.f32 f352, f315, 0f3F52AF12, f678; +fma.rn.f32 f353, f317, 0f3DF6DBEF, f349; +fma.rn.f32 f354, f320, 0f3F7E222B, f350; +fma.rn.f32 f355, f690, 0f3DF6DBEF, f351; +fma.rn.f32 f356, f319, 0f3F7E222B, f352; +fma.rn.f32 f357, f321, 0fBEB58EC6, f353; +fma.rn.f32 f358, f324, 0f3F6F5D39, f354; +fma.rn.f32 f359, f688, 0fBEB58EC6, f355; +fma.rn.f32 f360, f323, 0f3F6F5D39, f356; +fma.rn.f32 f361, f325, 0fBF3F9E67, f357; +fma.rn.f32 f362, f328, 0f3F29C268, f358; +fma.rn.f32 f363, f685, 0fBF3F9E67, f359; +fma.rn.f32 f364, f327, 0f3F29C268, f360; +fma.rn.f32 f365, f329, 0fBF788FA5, f361; +fma.rn.f32 f366, f332, 0f3E750F2A, f362; +fma.rn.f32 f367, f683, 0fBF788FA5, f363; +fma.rn.f32 f368, f331, 0f3E750F2A, f364; +sub.f32 f369, f365, f366; +add.f32 f371, f366, f365; +add.f32 f677, f368, f367; +sub.f32 f372, f367, f368; +fma.rn.f32 f373, f309, 0f3F116CB1, %55; +fma.rn.f32 f377, f313, 0fBEB58EC6, f373; +fma.rn.f32 f676, f312, 0f3F52AF12, 0f00000000; +fma.rn.f32 f378, f316, 0f3F6F5D39, f676; +fma.rn.f32 f675, f695, 0f3F116CB1, %120; +fma.rn.f32 f379, f692, 0fBEB58EC6, f675; +fma.rn.f32 f674, f311, 0f3F52AF12, 0f00000000; +fma.rn.f32 f380, f315, 0f3F6F5D39, 
f674; +fma.rn.f32 f381, f317, 0fBF788FA5, f377; +fma.rn.f32 f382, f320, 0f3E750F2A, f378; +fma.rn.f32 f383, f690, 0fBF788FA5, f379; +fma.rn.f32 f384, f319, 0f3E750F2A, f380; +fma.rn.f32 f385, f321, 0fBF3F9E67, f381; +fma.rn.f32 f386, f324, 0fBF29C268, f382; +fma.rn.f32 f387, f688, 0fBF3F9E67, f383; +fma.rn.f32 f388, f323, 0fBF29C268, f384; +fma.rn.f32 f389, f325, 0f3DF6DBEF, f385; +fma.rn.f32 f390, f328, 0fBF7E222B, f386; +fma.rn.f32 f391, f685, 0f3DF6DBEF, f387; +fma.rn.f32 f392, f327, 0fBF7E222B, f388; +fma.rn.f32 f393, f329, 0f3F62AD3F, f389; +fma.rn.f32 f394, f332, 0fBEEDF032, f390; +fma.rn.f32 f395, f683, 0f3F62AD3F, f391; +fma.rn.f32 f396, f331, 0fBEEDF032, f392; +sub.f32 f397, f393, f394; +add.f32 f399, f394, f393; +add.f32 f673, f396, f395; +sub.f32 f400, f395, f396; +fma.rn.f32 f401, f309, 0f3DF6DBEF, %55; +fma.rn.f32 f405, f313, 0fBF788FA5, f401; +fma.rn.f32 f672, f312, 0f3F7E222B, 0f00000000; +fma.rn.f32 f406, f316, 0f3E750F2A, f672; +fma.rn.f32 f671, f695, 0f3DF6DBEF, %120; +fma.rn.f32 f407, f692, 0fBF788FA5, f671; +fma.rn.f32 f670, f311, 0f3F7E222B, 0f00000000; +fma.rn.f32 f408, f315, 0f3E750F2A, f670; +fma.rn.f32 f409, f317, 0fBEB58EC6, f405; +fma.rn.f32 f410, f320, 0fBF6F5D39, f406; +fma.rn.f32 f411, f690, 0fBEB58EC6, f407; +fma.rn.f32 f412, f319, 0fBF6F5D39, f408; +fma.rn.f32 f413, f321, 0f3F62AD3F, f409; +fma.rn.f32 f414, f324, 0fBEEDF032, f410; +fma.rn.f32 f415, f688, 0f3F62AD3F, f411; +fma.rn.f32 f416, f323, 0fBEEDF032, f412; +fma.rn.f32 f417, f325, 0f3F116CB1, f413; +fma.rn.f32 f418, f328, 0f3F52AF12, f414; +fma.rn.f32 f419, f685, 0f3F116CB1, f415; +fma.rn.f32 f420, f327, 0f3F52AF12, f416; +fma.rn.f32 f421, f329, 0fBF3F9E67, f417; +fma.rn.f32 f422, f332, 0f3F29C268, f418; +fma.rn.f32 f423, f683, 0fBF3F9E67, f419; +fma.rn.f32 f424, f331, 0f3F29C268, f420; +sub.f32 f425, f421, f422; +add.f32 f427, f422, f421; +add.f32 f669, f424, f423; +sub.f32 f428, f423, f424; +fma.rn.f32 f429, f309, 0fBEB58EC6, %55; +fma.rn.f32 f433, f313, 0fBF3F9E67, f429; 
+fma.rn.f32 f668, f312, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f434, f316, 0fBF29C268, f668; +fma.rn.f32 f667, f695, 0fBEB58EC6, %120; +fma.rn.f32 f435, f692, 0fBF3F9E67, f667; +fma.rn.f32 f666, f311, 0f3F6F5D39, 0f00000000; +fma.rn.f32 f436, f315, 0fBF29C268, f666; +fma.rn.f32 f437, f317, 0f3F62AD3F, f433; +fma.rn.f32 f438, f320, 0fBEEDF032, f434; +fma.rn.f32 f439, f690, 0f3F62AD3F, f435; +fma.rn.f32 f440, f319, 0fBEEDF032, f436; +fma.rn.f32 f441, f321, 0f3DF6DBEF, f437; +fma.rn.f32 f442, f324, 0f3F7E222B, f438; +fma.rn.f32 f443, f688, 0f3DF6DBEF, f439; +fma.rn.f32 f444, f323, 0f3F7E222B, f440; +fma.rn.f32 f445, f325, 0fBF788FA5, f441; +fma.rn.f32 f446, f328, 0fBE750F2A, f442; +fma.rn.f32 f447, f685, 0fBF788FA5, f443; +fma.rn.f32 f448, f327, 0fBE750F2A, f444; +fma.rn.f32 f449, f329, 0f3F116CB1, f445; +fma.rn.f32 f450, f332, 0fBF52AF12, f446; +fma.rn.f32 f451, f683, 0f3F116CB1, f447; +fma.rn.f32 f452, f331, 0fBF52AF12, f448; +sub.f32 f453, f449, f450; +add.f32 f455, f450, f449; +add.f32 f665, f452, f451; +sub.f32 f456, f451, f452; +fma.rn.f32 f457, f309, 0fBF3F9E67, %55; +fma.rn.f32 f461, f313, 0f3DF6DBEF, f457; +fma.rn.f32 f664, f312, 0f3F29C268, 0f00000000; +fma.rn.f32 f462, f316, 0fBF7E222B, f664; +fma.rn.f32 f663, f695, 0fBF3F9E67, %120; +fma.rn.f32 f463, f692, 0f3DF6DBEF, f663; +fma.rn.f32 f662, f311, 0f3F29C268, 0f00000000; +fma.rn.f32 f464, f315, 0fBF7E222B, f662; +fma.rn.f32 f465, f317, 0f3F116CB1, f461; +fma.rn.f32 f466, f320, 0f3F52AF12, f462; +fma.rn.f32 f467, f690, 0f3F116CB1, f463; +fma.rn.f32 f468, f319, 0f3F52AF12, f464; +fma.rn.f32 f469, f321, 0fBF788FA5, f465; +fma.rn.f32 f470, f324, 0fBE750F2A, f466; +fma.rn.f32 f471, f688, 0fBF788FA5, f467; +fma.rn.f32 f472, f323, 0fBE750F2A, f468; +fma.rn.f32 f473, f325, 0f3F62AD3F, f469; +fma.rn.f32 f474, f328, 0fBEEDF032, f470; +fma.rn.f32 f475, f685, 0f3F62AD3F, f471; +fma.rn.f32 f476, f327, 0fBEEDF032, f472; +fma.rn.f32 f477, f329, 0fBEB58EC6, f473; +fma.rn.f32 f478, f332, 0f3F6F5D39, f474; +fma.rn.f32 f479, 
f683, 0fBEB58EC6, f475; +fma.rn.f32 f480, f331, 0f3F6F5D39, f476; +sub.f32 f481, f477, f478; +add.f32 f483, f478, f477; +add.f32 f661, f480, f479; +sub.f32 f484, f479, f480; +fma.rn.f32 f485, f309, 0fBF788FA5, %55; +fma.rn.f32 f486, f312, 0f3E750F2A, 0f00000000; +fma.rn.f32 f487, f695, 0fBF788FA5, %52; +fma.rn.f32 f488, f311, 0f3E750F2A, 0f00000000; +fma.rn.f32 f489, f313, 0f3F62AD3F, f485; +fma.rn.f32 f490, f316, 0fBEEDF032, f486; +fma.rn.f32 f491, f692, 0f3F62AD3F, f487; +fma.rn.f32 f492, f315, 0fBEEDF032, f488; +fma.rn.f32 f493, f317, 0fBF3F9E67, f489; +fma.rn.f32 f494, f320, 0f3F29C268, f490; +fma.rn.f32 f495, f690, 0fBF3F9E67, f491; +fma.rn.f32 f496, f319, 0f3F29C268, f492; +fma.rn.f32 f497, f321, 0f3F116CB1, f493; +fma.rn.f32 f498, f324, 0fBF52AF12, f494; +fma.rn.f32 f499, f688, 0f3F116CB1, f495; +fma.rn.f32 f500, f323, 0fBF52AF12, f496; +fma.rn.f32 f501, f325, 0fBEB58EC6, f497; +fma.rn.f32 f502, f328, 0f3F6F5D39, f498; +fma.rn.f32 f503, f685, 0fBEB58EC6, f499; +fma.rn.f32 f504, f327, 0f3F6F5D39, f500; +fma.rn.f32 f505, f329, 0f3DF6DBEF, f501; +fma.rn.f32 f506, f332, 0fBF7E222B, f502; +fma.rn.f32 f507, f683, 0f3DF6DBEF, f503; +fma.rn.f32 f508, f331, 0fBF7E222B, f504; +sub.f32 f509, f505, f506; +add.f32 f511, f506, f505; +add.f32 f660, f508, f507; +sub.f32 f512, f507, f508; +mul.f32 f514, f677, 0f3E750F2A; +mul.f32 f659, f369, 0f3F788FA5; +sub.f32 f515, f659, f514; +mul.f32 f516, f677, 0f3F788FA5; +fma.rn.f32 f517, f369, 0f3E750F2A, f516; +mul.f32 f519, f673, 0f3EEDF032; +mul.f32 f658, f397, 0f3F62AD3F; +sub.f32 f520, f658, f519; +mul.f32 f521, f673, 0f3F62AD3F; +fma.rn.f32 f522, f397, 0f3EEDF032, f521; +mul.f32 f656, f425, 0f3F3F9E67; +mul.f32 f657, f669, 0f3F29C268; +sub.f32 f525, f656, f657; +mul.f32 f526, f669, 0f3F3F9E67; +fma.rn.f32 f527, f425, 0f3F29C268, f526; +mul.f32 f654, f453, 0f3F116CB1; +mul.f32 f655, f665, 0f3F52AF12; +sub.f32 f530, f654, f655; +mul.f32 f531, f665, 0f3F116CB1; +fma.rn.f32 f532, f453, 0f3F52AF12, f531; +mul.f32 f652, f481, 
0f3EB58EC6; +mul.f32 f653, f661, 0f3F6F5D39; +sub.f32 f535, f652, f653; +mul.f32 f536, f661, 0f3EB58EC6; +fma.rn.f32 f537, f481, 0f3F6F5D39, f536; +mul.f32 f650, f509, 0f3DF6DBEF; +mul.f32 f651, f660, 0f3F7E222B; +sub.f32 f540, f650, f651; +mul.f32 f541, f660, 0f3DF6DBEF; +fma.rn.f32 f542, f509, 0f3F7E222B, f541; +mul.f32 f544, f512, 0f3F7E222B; +mul.f32 f649, f511, 0fBDF6DBEF; +sub.f32 f545, f649, f544; +mul.f32 f546, f512, 0fBDF6DBEF; +fma.rn.f32 f547, f511, 0f3F7E222B, f546; +mul.f32 f549, f484, 0f3F6F5D39; +mul.f32 f648, f483, 0fBEB58EC6; +sub.f32 f550, f648, f549; +mul.f32 f551, f484, 0fBEB58EC6; +fma.rn.f32 f552, f483, 0f3F6F5D39, f551; +mul.f32 f554, f456, 0f3F52AF12; +mul.f32 f647, f455, 0fBF116CB1; +sub.f32 f555, f647, f554; +mul.f32 f556, f456, 0fBF116CB1; +fma.rn.f32 f557, f455, 0f3F52AF12, f556; +mul.f32 f559, f428, 0f3F29C268; +mul.f32 f646, f427, 0fBF3F9E67; +sub.f32 f560, f646, f559; +mul.f32 f561, f428, 0fBF3F9E67; +fma.rn.f32 f562, f427, 0f3F29C268, f561; +mul.f32 f564, f400, 0f3EEDF032; +mul.f32 f645, f399, 0fBF62AD3F; +sub.f32 f565, f645, f564; +mul.f32 f566, f400, 0fBF62AD3F; +fma.rn.f32 f567, f399, 0f3EEDF032, f566; +mul.f32 f643, f371, 0fBF788FA5; +mul.f32 f644, f372, 0f3E750F2A; +sub.f32 f570, f643, f644; +mul.f32 f571, f372, 0fBF788FA5; +fma.rn.f32 f572, f371, 0f3E750F2A, f571; +add.f32 %1, f140, f344; +add.f32 %0, f139, f343; +add.f32 %2, f165, f515; +add.f32 %3, f713, f517; +add.f32 %4, f193, f520; +add.f32 %5, f709, f522; +add.f32 %6, f221, f525; +add.f32 %7, f705, f527; +add.f32 %8, f249, f530; +add.f32 %9, f701, f532; +add.f32 %11, f697, f537; +add.f32 %10, f277, f535; +add.f32 %13, f696, f542; +add.f32 %12, f305, f540; +add.f32 %15, f308, f547; +add.f32 %14, f307, f545; +add.f32 %16, f279, f550; +add.f32 %17, f280, f552; +add.f32 %18, f251, f555; +add.f32 %19, f252, f557; +add.f32 %20, f223, f560; +add.f32 %21, f224, f562; +add.f32 %22, f195, f565; +add.f32 %23, f196, f567; +add.f32 %25, f168, f572; +add.f32 %24, f167, f570; +sub.f32 
%27, f140, f344; +sub.f32 %26, f139, f343; +sub.f32 %29, f713, f517; +sub.f32 %28, f165, f515; +sub.f32 %31, f709, f522; +sub.f32 %30, f193, f520; +sub.f32 %33, f705, f527; +sub.f32 %32, f221, f525; +sub.f32 %35, f701, f532; +sub.f32 %34, f249, f530; +sub.f32 %37, f697, f537; +sub.f32 %36, f277, f535; +sub.f32 %39, f696, f542; +sub.f32 %38, f305, f540; +sub.f32 %41, f308, f547; +sub.f32 %40, f307, f545; +sub.f32 %43, f280, f552; +sub.f32 %42, f279, f550; +sub.f32 %45, f252, f557; +sub.f32 %44, f251, f555; +sub.f32 %47, f224, f562; +sub.f32 %46, f223, f560; +sub.f32 %49, f196, f567; +sub.f32 %48, f195, f565; +sub.f32 %51, f168, f572; +sub.f32 %50, f167, f570; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y): "f"(rmem[1].y), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), 
"f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[2].y), "f"(rmem[4].y), "f"(rmem[22].y), "f"(rmem[20].y), "f"(rmem[8].y), "f"(rmem[10].y), "f"(rmem[16].y), "f"(rmem[14].y), "f"(rmem[23].y), "f"(rmem[5].y), "f"(rmem[7].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[17].y), "f"(rmem[13].y), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..0601dcf9f6469 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_fwd.hpp.inc @@ -0,0 +1,536 @@ +#ifndef CUFFTDX_FFT_26_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_26_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<415, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<734>; +.reg .b64 rd<7>; +add.f64 fd105, %57, %101; +sub.f64 fd107, %57, %101; +add.f64 fd730, %105, %102; +sub.f64 fd108, %105, %102; +add.f64 fd109, %61, %97; +sub.f64 fd111, %61, %97; +add.f64 fd727, %106, %107; +sub.f64 fd112, %106, %107; +add.f64 fd113, %65, %93; +sub.f64 fd115, %65, %93; +add.f64 fd725, %66, %108; +sub.f64 fd116, %66, %108; +add.f64 fd117, %69, %89; +sub.f64 fd119, %69, %89; +add.f64 fd723, %109, %90; +sub.f64 fd120, %109, %90; +add.f64 fd121, %73, %85; +sub.f64 fd123, 
%73, %85; +add.f64 fd720, %110, %111; +sub.f64 fd124, %110, %111; +add.f64 fd125, %77, %81; +sub.f64 fd127, %77, %81; +add.f64 fd718, %78, %112; +sub.f64 fd128, %78, %112; +add.f64 fd129, %53, fd105; +add.f64 fd131, fd129, fd109; +add.f64 fd717, %54, fd730; +add.f64 fd132, fd717, fd727; +add.f64 fd133, fd131, fd113; +add.f64 fd134, fd132, fd725; +add.f64 fd135, fd133, fd117; +add.f64 fd136, fd134, fd723; +add.f64 fd137, fd135, fd121; +add.f64 fd138, fd136, fd720; +add.f64 fd139, fd137, fd125; +add.f64 fd140, fd138, fd718; +fma.rn.f64 fd141, fd105, 0d3FEC55A7E00740E9, %53; +fma.rn.f64 fd145, fd109, 0d3FE22D961EA71119, fd141; +fma.rn.f64 fd716, fd108, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd146, fd112, 0dBFEA55E242A4C3D2, fd716; +fma.rn.f64 fd715, fd730, 0d3FEC55A7E00740E9, %54; +fma.rn.f64 fd147, fd727, 0d3FE22D961EA71119, fd715; +fma.rn.f64 fd714, fd107, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd148, fd111, 0dBFEA55E242A4C3D2, fd714; +fma.rn.f64 fd149, fd113, 0d3FBEDB7DEBAA3ED8, fd145; +fma.rn.f64 fd150, fd116, 0dBFEFC44566966769, fd146; +fma.rn.f64 fd151, fd725, 0d3FBEDB7DEBAA3ED8, fd147; +fma.rn.f64 fd152, fd115, 0dBFEFC44566966769, fd148; +fma.rn.f64 fd153, fd117, 0dBFD6B1D8B2365DA1, fd149; +fma.rn.f64 fd154, fd120, 0dBFEDEBA72EF20147, fd150; +fma.rn.f64 fd155, fd723, 0dBFD6B1D8B2365DA1, fd151; +fma.rn.f64 fd156, fd119, 0dBFEDEBA72EF20147, fd152; +fma.rn.f64 fd157, fd121, 0dBFE7F3CCD0032E0C, fd153; +fma.rn.f64 fd158, fd124, 0dBFE5384D024C2F84, fd154; +fma.rn.f64 fd159, fd720, 0dBFE7F3CCD0032E0C, fd155; +fma.rn.f64 fd160, fd123, 0dBFE5384D024C2F84, fd156; +fma.rn.f64 fd161, fd125, 0dBFEF11F493053D00, fd157; +fma.rn.f64 fd162, fd128, 0dBFCEA1E54BC48DBF, fd158; +fma.rn.f64 fd163, fd718, 0dBFEF11F493053D00, fd159; +fma.rn.f64 fd164, fd127, 0dBFCEA1E54BC48DBF, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd167, fd162, fd161; +add.f64 fd713, fd164, fd163; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 fd169, fd105, 0d3FE22D961EA71119, %53; 
+fma.rn.f64 fd173, fd109, 0dBFD6B1D8B2365DA1, fd169; +fma.rn.f64 fd712, fd108, 0dBFEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd174, fd112, 0dBFEDEBA72EF20147, fd712; +fma.rn.f64 fd711, fd730, 0d3FE22D961EA71119, %54; +fma.rn.f64 fd175, fd727, 0dBFD6B1D8B2365DA1, fd711; +fma.rn.f64 fd710, fd107, 0dBFEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd176, fd111, 0dBFEDEBA72EF20147, fd710; +fma.rn.f64 fd177, fd113, 0dBFEF11F493053D00, fd173; +fma.rn.f64 fd178, fd116, 0dBFCEA1E54BC48DBF, fd174; +fma.rn.f64 fd179, fd725, 0dBFEF11F493053D00, fd175; +fma.rn.f64 fd180, fd115, 0dBFCEA1E54BC48DBF, fd176; +fma.rn.f64 fd181, fd117, 0dBFE7F3CCD0032E0C, fd177; +fma.rn.f64 fd182, fd120, 0d3FE5384D024C2F84, fd178; +fma.rn.f64 fd183, fd723, 0dBFE7F3CCD0032E0C, fd179; +fma.rn.f64 fd184, fd119, 0d3FE5384D024C2F84, fd180; +fma.rn.f64 fd185, fd121, 0d3FBEDB7DEBAA3ED8, fd181; +fma.rn.f64 fd186, fd124, 0d3FEFC44566966769, fd182; +fma.rn.f64 fd187, fd720, 0d3FBEDB7DEBAA3ED8, fd183; +fma.rn.f64 fd188, fd123, 0d3FEFC44566966769, fd184; +fma.rn.f64 fd189, fd125, 0d3FEC55A7E00740E9, fd185; +fma.rn.f64 fd190, fd128, 0d3FDDBE064267C47C, fd186; +fma.rn.f64 fd191, fd718, 0d3FEC55A7E00740E9, fd187; +fma.rn.f64 fd192, fd127, 0d3FDDBE064267C47C, fd188; +sub.f64 fd193, fd189, fd190; +add.f64 fd195, fd190, fd189; +add.f64 fd709, fd192, fd191; +sub.f64 fd196, fd191, fd192; +fma.rn.f64 fd197, fd105, 0d3FBEDB7DEBAA3ED8, %53; +fma.rn.f64 fd201, fd109, 0dBFEF11F493053D00, fd197; +fma.rn.f64 fd708, fd108, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd202, fd112, 0dBFCEA1E54BC48DBF, fd708; +fma.rn.f64 fd707, fd730, 0d3FBEDB7DEBAA3ED8, %54; +fma.rn.f64 fd203, fd727, 0dBFEF11F493053D00, fd707; +fma.rn.f64 fd706, fd107, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd204, fd111, 0dBFCEA1E54BC48DBF, fd706; +fma.rn.f64 fd205, fd113, 0dBFD6B1D8B2365DA1, fd201; +fma.rn.f64 fd206, fd116, 0d3FEDEBA72EF20147, fd202; +fma.rn.f64 fd207, fd725, 0dBFD6B1D8B2365DA1, fd203; +fma.rn.f64 fd208, fd115, 
0d3FEDEBA72EF20147, fd204; +fma.rn.f64 fd209, fd117, 0d3FEC55A7E00740E9, fd205; +fma.rn.f64 fd210, fd120, 0d3FDDBE064267C47C, fd206; +fma.rn.f64 fd211, fd723, 0d3FEC55A7E00740E9, fd207; +fma.rn.f64 fd212, fd119, 0d3FDDBE064267C47C, fd208; +fma.rn.f64 fd213, fd121, 0d3FE22D961EA71119, fd209; +fma.rn.f64 fd214, fd124, 0dBFEA55E242A4C3D2, fd210; +fma.rn.f64 fd215, fd720, 0d3FE22D961EA71119, fd211; +fma.rn.f64 fd216, fd123, 0dBFEA55E242A4C3D2, fd212; +fma.rn.f64 fd217, fd125, 0dBFE7F3CCD0032E0C, fd213; +fma.rn.f64 fd218, fd128, 0dBFE5384D024C2F84, fd214; +fma.rn.f64 fd219, fd718, 0dBFE7F3CCD0032E0C, fd215; +fma.rn.f64 fd220, fd127, 0dBFE5384D024C2F84, fd216; +sub.f64 fd221, fd217, fd218; +add.f64 fd223, fd218, fd217; +add.f64 fd705, fd220, fd219; +sub.f64 fd224, fd219, fd220; +fma.rn.f64 fd225, fd105, 0dBFD6B1D8B2365DA1, %53; +fma.rn.f64 fd229, fd109, 0dBFE7F3CCD0032E0C, fd225; +fma.rn.f64 fd704, fd108, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd230, fd112, 0d3FE5384D024C2F84, fd704; +fma.rn.f64 fd703, fd730, 0dBFD6B1D8B2365DA1, %54; +fma.rn.f64 fd231, fd727, 0dBFE7F3CCD0032E0C, fd703; +fma.rn.f64 fd702, fd107, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd232, fd111, 0d3FE5384D024C2F84, fd702; +fma.rn.f64 fd233, fd113, 0d3FEC55A7E00740E9, fd229; +fma.rn.f64 fd234, fd116, 0d3FDDBE064267C47C, fd230; +fma.rn.f64 fd235, fd725, 0d3FEC55A7E00740E9, fd231; +fma.rn.f64 fd236, fd115, 0d3FDDBE064267C47C, fd232; +fma.rn.f64 fd237, fd117, 0d3FBEDB7DEBAA3ED8, fd233; +fma.rn.f64 fd238, fd120, 0dBFEFC44566966769, fd234; +fma.rn.f64 fd239, fd723, 0d3FBEDB7DEBAA3ED8, fd235; +fma.rn.f64 fd240, fd119, 0dBFEFC44566966769, fd236; +fma.rn.f64 fd241, fd121, 0dBFEF11F493053D00, fd237; +fma.rn.f64 fd242, fd124, 0d3FCEA1E54BC48DBF, fd238; +fma.rn.f64 fd243, fd720, 0dBFEF11F493053D00, fd239; +fma.rn.f64 fd244, fd123, 0d3FCEA1E54BC48DBF, fd240; +fma.rn.f64 fd245, fd125, 0d3FE22D961EA71119, fd241; +fma.rn.f64 fd246, fd128, 0d3FEA55E242A4C3D2, fd242; +fma.rn.f64 fd247, fd718, 
0d3FE22D961EA71119, fd243; +fma.rn.f64 fd248, fd127, 0d3FEA55E242A4C3D2, fd244; +sub.f64 fd249, fd245, fd246; +add.f64 fd251, fd246, fd245; +add.f64 fd701, fd248, fd247; +sub.f64 fd252, fd247, fd248; +fma.rn.f64 fd253, fd105, 0dBFE7F3CCD0032E0C, %53; +fma.rn.f64 fd257, fd109, 0d3FBEDB7DEBAA3ED8, fd253; +fma.rn.f64 fd700, fd108, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd258, fd112, 0d3FEFC44566966769, fd700; +fma.rn.f64 fd699, fd730, 0dBFE7F3CCD0032E0C, %54; +fma.rn.f64 fd259, fd727, 0d3FBEDB7DEBAA3ED8, fd699; +fma.rn.f64 fd698, fd107, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd260, fd111, 0d3FEFC44566966769, fd698; +fma.rn.f64 fd261, fd113, 0d3FE22D961EA71119, fd257; +fma.rn.f64 fd262, fd116, 0dBFEA55E242A4C3D2, fd258; +fma.rn.f64 fd263, fd725, 0d3FE22D961EA71119, fd259; +fma.rn.f64 fd264, fd115, 0dBFEA55E242A4C3D2, fd260; +fma.rn.f64 fd265, fd117, 0dBFEF11F493053D00, fd261; +fma.rn.f64 fd266, fd120, 0d3FCEA1E54BC48DBF, fd262; +fma.rn.f64 fd267, fd723, 0dBFEF11F493053D00, fd263; +fma.rn.f64 fd268, fd119, 0d3FCEA1E54BC48DBF, fd264; +fma.rn.f64 fd269, fd121, 0d3FEC55A7E00740E9, fd265; +fma.rn.f64 fd270, fd124, 0d3FDDBE064267C47C, fd266; +fma.rn.f64 fd271, fd720, 0d3FEC55A7E00740E9, fd267; +fma.rn.f64 fd272, fd123, 0d3FDDBE064267C47C, fd268; +fma.rn.f64 fd273, fd125, 0dBFD6B1D8B2365DA1, fd269; +fma.rn.f64 fd274, fd128, 0dBFEDEBA72EF20147, fd270; +fma.rn.f64 fd275, fd718, 0dBFD6B1D8B2365DA1, fd271; +fma.rn.f64 fd276, fd127, 0dBFEDEBA72EF20147, fd272; +sub.f64 fd277, fd273, fd274; +add.f64 fd279, fd274, fd273; +add.f64 fd697, fd276, fd275; +sub.f64 fd280, fd275, fd276; +fma.rn.f64 fd281, fd105, 0dBFEF11F493053D00, %53; +fma.rn.f64 fd282, fd108, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd283, fd730, 0dBFEF11F493053D00, %54; +fma.rn.f64 fd284, fd107, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd285, fd109, 0d3FEC55A7E00740E9, fd281; +fma.rn.f64 fd286, fd112, 0d3FDDBE064267C47C, fd282; +fma.rn.f64 fd287, fd727, 0d3FEC55A7E00740E9, 
fd283; +fma.rn.f64 fd288, fd111, 0d3FDDBE064267C47C, fd284; +fma.rn.f64 fd289, fd113, 0dBFE7F3CCD0032E0C, fd285; +fma.rn.f64 fd290, fd116, 0dBFE5384D024C2F84, fd286; +fma.rn.f64 fd291, fd725, 0dBFE7F3CCD0032E0C, fd287; +fma.rn.f64 fd292, fd115, 0dBFE5384D024C2F84, fd288; +fma.rn.f64 fd293, fd117, 0d3FE22D961EA71119, fd289; +fma.rn.f64 fd294, fd120, 0d3FEA55E242A4C3D2, fd290; +fma.rn.f64 fd295, fd723, 0d3FE22D961EA71119, fd291; +fma.rn.f64 fd296, fd119, 0d3FEA55E242A4C3D2, fd292; +fma.rn.f64 fd297, fd121, 0dBFD6B1D8B2365DA1, fd293; +fma.rn.f64 fd298, fd124, 0dBFEDEBA72EF20147, fd294; +fma.rn.f64 fd299, fd720, 0dBFD6B1D8B2365DA1, fd295; +fma.rn.f64 fd300, fd123, 0dBFEDEBA72EF20147, fd296; +fma.rn.f64 fd301, fd125, 0d3FBEDB7DEBAA3ED8, fd297; +fma.rn.f64 fd302, fd128, 0d3FEFC44566966769, fd298; +fma.rn.f64 fd303, fd718, 0d3FBEDB7DEBAA3ED8, fd299; +fma.rn.f64 fd304, fd127, 0d3FEFC44566966769, fd300; +sub.f64 fd305, fd301, fd302; +add.f64 fd307, fd302, fd301; +add.f64 fd696, fd304, fd303; +sub.f64 fd308, fd303, fd304; +add.f64 fd309, %59, %103; +sub.f64 fd311, %59, %103; +add.f64 fd695, %60, %104; +sub.f64 fd312, %60, %104; +add.f64 fd313, %63, %99; +sub.f64 fd315, %63, %99; +add.f64 fd692, %114, %113; +sub.f64 fd316, %114, %113; +add.f64 fd317, %67, %95; +sub.f64 fd319, %67, %95; +add.f64 fd690, %115, %96; +sub.f64 fd320, %115, %96; +add.f64 fd321, %71, %91; +sub.f64 fd323, %71, %91; +add.f64 fd688, %72, %116; +sub.f64 fd324, %72, %116; +add.f64 fd325, %75, %87; +sub.f64 fd327, %75, %87; +add.f64 fd685, %117, %118; +sub.f64 fd328, %117, %118; +add.f64 fd329, %79, %83; +sub.f64 fd331, %79, %83; +add.f64 fd683, %119, %84; +sub.f64 fd332, %119, %84; +add.f64 fd333, %55, fd309; +add.f64 fd335, fd333, fd313; +add.f64 fd681, %120, fd695; +add.f64 fd336, fd681, fd692; +add.f64 fd337, fd335, fd317; +add.f64 fd338, fd336, fd690; +add.f64 fd339, fd337, fd321; +add.f64 fd340, fd338, fd688; +add.f64 fd341, fd339, fd325; +add.f64 fd342, fd340, fd685; +add.f64 fd343, fd341, fd329; 
+add.f64 fd344, fd342, fd683; +fma.rn.f64 fd345, fd309, 0d3FEC55A7E00740E9, %55; +fma.rn.f64 fd349, fd313, 0d3FE22D961EA71119, fd345; +fma.rn.f64 fd680, fd312, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd350, fd316, 0dBFEA55E242A4C3D2, fd680; +fma.rn.f64 fd679, fd695, 0d3FEC55A7E00740E9, %120; +fma.rn.f64 fd351, fd692, 0d3FE22D961EA71119, fd679; +fma.rn.f64 fd678, fd311, 0dBFDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd352, fd315, 0dBFEA55E242A4C3D2, fd678; +fma.rn.f64 fd353, fd317, 0d3FBEDB7DEBAA3ED8, fd349; +fma.rn.f64 fd354, fd320, 0dBFEFC44566966769, fd350; +fma.rn.f64 fd355, fd690, 0d3FBEDB7DEBAA3ED8, fd351; +fma.rn.f64 fd356, fd319, 0dBFEFC44566966769, fd352; +fma.rn.f64 fd357, fd321, 0dBFD6B1D8B2365DA1, fd353; +fma.rn.f64 fd358, fd324, 0dBFEDEBA72EF20147, fd354; +fma.rn.f64 fd359, fd688, 0dBFD6B1D8B2365DA1, fd355; +fma.rn.f64 fd360, fd323, 0dBFEDEBA72EF20147, fd356; +fma.rn.f64 fd361, fd325, 0dBFE7F3CCD0032E0C, fd357; +fma.rn.f64 fd362, fd328, 0dBFE5384D024C2F84, fd358; +fma.rn.f64 fd363, fd685, 0dBFE7F3CCD0032E0C, fd359; +fma.rn.f64 fd364, fd327, 0dBFE5384D024C2F84, fd360; +fma.rn.f64 fd365, fd329, 0dBFEF11F493053D00, fd361; +fma.rn.f64 fd366, fd332, 0dBFCEA1E54BC48DBF, fd362; +fma.rn.f64 fd367, fd683, 0dBFEF11F493053D00, fd363; +fma.rn.f64 fd368, fd331, 0dBFCEA1E54BC48DBF, fd364; +sub.f64 fd369, fd365, fd366; +add.f64 fd371, fd366, fd365; +add.f64 fd677, fd368, fd367; +sub.f64 fd372, fd367, fd368; +fma.rn.f64 fd373, fd309, 0d3FE22D961EA71119, %55; +fma.rn.f64 fd377, fd313, 0dBFD6B1D8B2365DA1, fd373; +fma.rn.f64 fd676, fd312, 0dBFEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd378, fd316, 0dBFEDEBA72EF20147, fd676; +fma.rn.f64 fd675, fd695, 0d3FE22D961EA71119, %120; +fma.rn.f64 fd379, fd692, 0dBFD6B1D8B2365DA1, fd675; +fma.rn.f64 fd674, fd311, 0dBFEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd380, fd315, 0dBFEDEBA72EF20147, fd674; +fma.rn.f64 fd381, fd317, 0dBFEF11F493053D00, fd377; +fma.rn.f64 fd382, fd320, 0dBFCEA1E54BC48DBF, fd378; 
+fma.rn.f64 fd383, fd690, 0dBFEF11F493053D00, fd379; +fma.rn.f64 fd384, fd319, 0dBFCEA1E54BC48DBF, fd380; +fma.rn.f64 fd385, fd321, 0dBFE7F3CCD0032E0C, fd381; +fma.rn.f64 fd386, fd324, 0d3FE5384D024C2F84, fd382; +fma.rn.f64 fd387, fd688, 0dBFE7F3CCD0032E0C, fd383; +fma.rn.f64 fd388, fd323, 0d3FE5384D024C2F84, fd384; +fma.rn.f64 fd389, fd325, 0d3FBEDB7DEBAA3ED8, fd385; +fma.rn.f64 fd390, fd328, 0d3FEFC44566966769, fd386; +fma.rn.f64 fd391, fd685, 0d3FBEDB7DEBAA3ED8, fd387; +fma.rn.f64 fd392, fd327, 0d3FEFC44566966769, fd388; +fma.rn.f64 fd393, fd329, 0d3FEC55A7E00740E9, fd389; +fma.rn.f64 fd394, fd332, 0d3FDDBE064267C47C, fd390; +fma.rn.f64 fd395, fd683, 0d3FEC55A7E00740E9, fd391; +fma.rn.f64 fd396, fd331, 0d3FDDBE064267C47C, fd392; +sub.f64 fd397, fd393, fd394; +add.f64 fd399, fd394, fd393; +add.f64 fd673, fd396, fd395; +sub.f64 fd400, fd395, fd396; +fma.rn.f64 fd401, fd309, 0d3FBEDB7DEBAA3ED8, %55; +fma.rn.f64 fd405, fd313, 0dBFEF11F493053D00, fd401; +fma.rn.f64 fd672, fd312, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd406, fd316, 0dBFCEA1E54BC48DBF, fd672; +fma.rn.f64 fd671, fd695, 0d3FBEDB7DEBAA3ED8, %120; +fma.rn.f64 fd407, fd692, 0dBFEF11F493053D00, fd671; +fma.rn.f64 fd670, fd311, 0dBFEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd408, fd315, 0dBFCEA1E54BC48DBF, fd670; +fma.rn.f64 fd409, fd317, 0dBFD6B1D8B2365DA1, fd405; +fma.rn.f64 fd410, fd320, 0d3FEDEBA72EF20147, fd406; +fma.rn.f64 fd411, fd690, 0dBFD6B1D8B2365DA1, fd407; +fma.rn.f64 fd412, fd319, 0d3FEDEBA72EF20147, fd408; +fma.rn.f64 fd413, fd321, 0d3FEC55A7E00740E9, fd409; +fma.rn.f64 fd414, fd324, 0d3FDDBE064267C47C, fd410; +fma.rn.f64 fd415, fd688, 0d3FEC55A7E00740E9, fd411; +fma.rn.f64 fd416, fd323, 0d3FDDBE064267C47C, fd412; +fma.rn.f64 fd417, fd325, 0d3FE22D961EA71119, fd413; +fma.rn.f64 fd418, fd328, 0dBFEA55E242A4C3D2, fd414; +fma.rn.f64 fd419, fd685, 0d3FE22D961EA71119, fd415; +fma.rn.f64 fd420, fd327, 0dBFEA55E242A4C3D2, fd416; +fma.rn.f64 fd421, fd329, 0dBFE7F3CCD0032E0C, fd417; 
+fma.rn.f64 fd422, fd332, 0dBFE5384D024C2F84, fd418; +fma.rn.f64 fd423, fd683, 0dBFE7F3CCD0032E0C, fd419; +fma.rn.f64 fd424, fd331, 0dBFE5384D024C2F84, fd420; +sub.f64 fd425, fd421, fd422; +add.f64 fd427, fd422, fd421; +add.f64 fd669, fd424, fd423; +sub.f64 fd428, fd423, fd424; +fma.rn.f64 fd429, fd309, 0dBFD6B1D8B2365DA1, %55; +fma.rn.f64 fd433, fd313, 0dBFE7F3CCD0032E0C, fd429; +fma.rn.f64 fd668, fd312, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd434, fd316, 0d3FE5384D024C2F84, fd668; +fma.rn.f64 fd667, fd695, 0dBFD6B1D8B2365DA1, %120; +fma.rn.f64 fd435, fd692, 0dBFE7F3CCD0032E0C, fd667; +fma.rn.f64 fd666, fd311, 0dBFEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd436, fd315, 0d3FE5384D024C2F84, fd666; +fma.rn.f64 fd437, fd317, 0d3FEC55A7E00740E9, fd433; +fma.rn.f64 fd438, fd320, 0d3FDDBE064267C47C, fd434; +fma.rn.f64 fd439, fd690, 0d3FEC55A7E00740E9, fd435; +fma.rn.f64 fd440, fd319, 0d3FDDBE064267C47C, fd436; +fma.rn.f64 fd441, fd321, 0d3FBEDB7DEBAA3ED8, fd437; +fma.rn.f64 fd442, fd324, 0dBFEFC44566966769, fd438; +fma.rn.f64 fd443, fd688, 0d3FBEDB7DEBAA3ED8, fd439; +fma.rn.f64 fd444, fd323, 0dBFEFC44566966769, fd440; +fma.rn.f64 fd445, fd325, 0dBFEF11F493053D00, fd441; +fma.rn.f64 fd446, fd328, 0d3FCEA1E54BC48DBF, fd442; +fma.rn.f64 fd447, fd685, 0dBFEF11F493053D00, fd443; +fma.rn.f64 fd448, fd327, 0d3FCEA1E54BC48DBF, fd444; +fma.rn.f64 fd449, fd329, 0d3FE22D961EA71119, fd445; +fma.rn.f64 fd450, fd332, 0d3FEA55E242A4C3D2, fd446; +fma.rn.f64 fd451, fd683, 0d3FE22D961EA71119, fd447; +fma.rn.f64 fd452, fd331, 0d3FEA55E242A4C3D2, fd448; +sub.f64 fd453, fd449, fd450; +add.f64 fd455, fd450, fd449; +add.f64 fd665, fd452, fd451; +sub.f64 fd456, fd451, fd452; +fma.rn.f64 fd457, fd309, 0dBFE7F3CCD0032E0C, %55; +fma.rn.f64 fd461, fd313, 0d3FBEDB7DEBAA3ED8, fd457; +fma.rn.f64 fd664, fd312, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd462, fd316, 0d3FEFC44566966769, fd664; +fma.rn.f64 fd663, fd695, 0dBFE7F3CCD0032E0C, %120; +fma.rn.f64 fd463, fd692, 
0d3FBEDB7DEBAA3ED8, fd663; +fma.rn.f64 fd662, fd311, 0dBFE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd464, fd315, 0d3FEFC44566966769, fd662; +fma.rn.f64 fd465, fd317, 0d3FE22D961EA71119, fd461; +fma.rn.f64 fd466, fd320, 0dBFEA55E242A4C3D2, fd462; +fma.rn.f64 fd467, fd690, 0d3FE22D961EA71119, fd463; +fma.rn.f64 fd468, fd319, 0dBFEA55E242A4C3D2, fd464; +fma.rn.f64 fd469, fd321, 0dBFEF11F493053D00, fd465; +fma.rn.f64 fd470, fd324, 0d3FCEA1E54BC48DBF, fd466; +fma.rn.f64 fd471, fd688, 0dBFEF11F493053D00, fd467; +fma.rn.f64 fd472, fd323, 0d3FCEA1E54BC48DBF, fd468; +fma.rn.f64 fd473, fd325, 0d3FEC55A7E00740E9, fd469; +fma.rn.f64 fd474, fd328, 0d3FDDBE064267C47C, fd470; +fma.rn.f64 fd475, fd685, 0d3FEC55A7E00740E9, fd471; +fma.rn.f64 fd476, fd327, 0d3FDDBE064267C47C, fd472; +fma.rn.f64 fd477, fd329, 0dBFD6B1D8B2365DA1, fd473; +fma.rn.f64 fd478, fd332, 0dBFEDEBA72EF20147, fd474; +fma.rn.f64 fd479, fd683, 0dBFD6B1D8B2365DA1, fd475; +fma.rn.f64 fd480, fd331, 0dBFEDEBA72EF20147, fd476; +sub.f64 fd481, fd477, fd478; +add.f64 fd483, fd478, fd477; +add.f64 fd661, fd480, fd479; +sub.f64 fd484, fd479, fd480; +fma.rn.f64 fd485, fd309, 0dBFEF11F493053D00, %55; +fma.rn.f64 fd486, fd312, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd487, fd695, 0dBFEF11F493053D00, %52; +fma.rn.f64 fd488, fd311, 0dBFCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd489, fd313, 0d3FEC55A7E00740E9, fd485; +fma.rn.f64 fd490, fd316, 0d3FDDBE064267C47C, fd486; +fma.rn.f64 fd491, fd692, 0d3FEC55A7E00740E9, fd487; +fma.rn.f64 fd492, fd315, 0d3FDDBE064267C47C, fd488; +fma.rn.f64 fd493, fd317, 0dBFE7F3CCD0032E0C, fd489; +fma.rn.f64 fd494, fd320, 0dBFE5384D024C2F84, fd490; +fma.rn.f64 fd495, fd690, 0dBFE7F3CCD0032E0C, fd491; +fma.rn.f64 fd496, fd319, 0dBFE5384D024C2F84, fd492; +fma.rn.f64 fd497, fd321, 0d3FE22D961EA71119, fd493; +fma.rn.f64 fd498, fd324, 0d3FEA55E242A4C3D2, fd494; +fma.rn.f64 fd499, fd688, 0d3FE22D961EA71119, fd495; +fma.rn.f64 fd500, fd323, 0d3FEA55E242A4C3D2, fd496; +fma.rn.f64 
fd501, fd325, 0dBFD6B1D8B2365DA1, fd497; +fma.rn.f64 fd502, fd328, 0dBFEDEBA72EF20147, fd498; +fma.rn.f64 fd503, fd685, 0dBFD6B1D8B2365DA1, fd499; +fma.rn.f64 fd504, fd327, 0dBFEDEBA72EF20147, fd500; +fma.rn.f64 fd505, fd329, 0d3FBEDB7DEBAA3ED8, fd501; +fma.rn.f64 fd506, fd332, 0d3FEFC44566966769, fd502; +fma.rn.f64 fd507, fd683, 0d3FBEDB7DEBAA3ED8, fd503; +fma.rn.f64 fd508, fd331, 0d3FEFC44566966769, fd504; +sub.f64 fd509, fd505, fd506; +add.f64 fd511, fd506, fd505; +add.f64 fd660, fd508, fd507; +sub.f64 fd512, fd507, fd508; +mul.f64 fd514, fd677, 0dBFCEA1E54BC48DBF; +mul.f64 fd659, fd369, 0d3FEF11F493053D00; +sub.f64 fd515, fd659, fd514; +mul.f64 fd516, fd677, 0d3FEF11F493053D00; +fma.rn.f64 fd517, fd369, 0dBFCEA1E54BC48DBF, fd516; +mul.f64 fd519, fd673, 0dBFDDBE064267C47C; +mul.f64 fd658, fd397, 0d3FEC55A7E00740E9; +sub.f64 fd520, fd658, fd519; +mul.f64 fd521, fd673, 0d3FEC55A7E00740E9; +fma.rn.f64 fd522, fd397, 0dBFDDBE064267C47C, fd521; +mul.f64 fd656, fd425, 0d3FE7F3CCD0032E0C; +mul.f64 fd657, fd669, 0dBFE5384D024C2F84; +sub.f64 fd525, fd656, fd657; +mul.f64 fd526, fd669, 0d3FE7F3CCD0032E0C; +fma.rn.f64 fd527, fd425, 0dBFE5384D024C2F84, fd526; +mul.f64 fd654, fd453, 0d3FE22D961EA71119; +mul.f64 fd655, fd665, 0dBFEA55E242A4C3D2; +sub.f64 fd530, fd654, fd655; +mul.f64 fd531, fd665, 0d3FE22D961EA71119; +fma.rn.f64 fd532, fd453, 0dBFEA55E242A4C3D2, fd531; +mul.f64 fd652, fd481, 0d3FD6B1D8B2365DA1; +mul.f64 fd653, fd661, 0dBFEDEBA72EF20147; +sub.f64 fd535, fd652, fd653; +mul.f64 fd536, fd661, 0d3FD6B1D8B2365DA1; +fma.rn.f64 fd537, fd481, 0dBFEDEBA72EF20147, fd536; +mul.f64 fd650, fd509, 0d3FBEDB7DEBAA3ED8; +mul.f64 fd651, fd660, 0dBFEFC44566966769; +sub.f64 fd540, fd650, fd651; +mul.f64 fd541, fd660, 0d3FBEDB7DEBAA3ED8; +fma.rn.f64 fd542, fd509, 0dBFEFC44566966769, fd541; +mul.f64 fd544, fd512, 0dBFEFC44566966769; +mul.f64 fd649, fd511, 0dBFBEDB7DEBAA3ED8; +sub.f64 fd545, fd649, fd544; +mul.f64 fd546, fd512, 0dBFBEDB7DEBAA3ED8; +fma.rn.f64 fd547, fd511, 
0dBFEFC44566966769, fd546; +mul.f64 fd549, fd484, 0dBFEDEBA72EF20147; +mul.f64 fd648, fd483, 0dBFD6B1D8B2365DA1; +sub.f64 fd550, fd648, fd549; +mul.f64 fd551, fd484, 0dBFD6B1D8B2365DA1; +fma.rn.f64 fd552, fd483, 0dBFEDEBA72EF20147, fd551; +mul.f64 fd554, fd456, 0dBFEA55E242A4C3D2; +mul.f64 fd647, fd455, 0dBFE22D961EA71119; +sub.f64 fd555, fd647, fd554; +mul.f64 fd556, fd456, 0dBFE22D961EA71119; +fma.rn.f64 fd557, fd455, 0dBFEA55E242A4C3D2, fd556; +mul.f64 fd559, fd428, 0dBFE5384D024C2F84; +mul.f64 fd646, fd427, 0dBFE7F3CCD0032E0C; +sub.f64 fd560, fd646, fd559; +mul.f64 fd561, fd428, 0dBFE7F3CCD0032E0C; +fma.rn.f64 fd562, fd427, 0dBFE5384D024C2F84, fd561; +mul.f64 fd564, fd400, 0dBFDDBE064267C47C; +mul.f64 fd645, fd399, 0dBFEC55A7E00740E9; +sub.f64 fd565, fd645, fd564; +mul.f64 fd566, fd400, 0dBFEC55A7E00740E9; +fma.rn.f64 fd567, fd399, 0dBFDDBE064267C47C, fd566; +mul.f64 fd643, fd371, 0dBFEF11F493053D00; +mul.f64 fd644, fd372, 0dBFCEA1E54BC48DBF; +sub.f64 fd570, fd643, fd644; +mul.f64 fd571, fd372, 0dBFEF11F493053D00; +fma.rn.f64 fd572, fd371, 0dBFCEA1E54BC48DBF, fd571; +add.f64 %1, fd140, fd344; +add.f64 %0, fd139, fd343; +add.f64 %2, fd165, fd515; +add.f64 %3, fd713, fd517; +add.f64 %4, fd193, fd520; +add.f64 %5, fd709, fd522; +add.f64 %6, fd221, fd525; +add.f64 %7, fd705, fd527; +add.f64 %8, fd249, fd530; +add.f64 %9, fd701, fd532; +add.f64 %11, fd697, fd537; +add.f64 %10, fd277, fd535; +add.f64 %13, fd696, fd542; +add.f64 %12, fd305, fd540; +add.f64 %15, fd308, fd547; +add.f64 %14, fd307, fd545; +add.f64 %16, fd279, fd550; +add.f64 %17, fd280, fd552; +add.f64 %18, fd251, fd555; +add.f64 %19, fd252, fd557; +add.f64 %20, fd223, fd560; +add.f64 %21, fd224, fd562; +add.f64 %22, fd195, fd565; +add.f64 %23, fd196, fd567; +add.f64 %25, fd168, fd572; +add.f64 %24, fd167, fd570; +sub.f64 %27, fd140, fd344; +sub.f64 %26, fd139, fd343; +sub.f64 %29, fd713, fd517; +sub.f64 %28, fd165, fd515; +sub.f64 %31, fd709, fd522; +sub.f64 %30, fd193, fd520; +sub.f64 %33, fd705, 
fd527; +sub.f64 %32, fd221, fd525; +sub.f64 %35, fd701, fd532; +sub.f64 %34, fd249, fd530; +sub.f64 %37, fd697, fd537; +sub.f64 %36, fd277, fd535; +sub.f64 %39, fd696, fd542; +sub.f64 %38, fd305, fd540; +sub.f64 %41, fd308, fd547; +sub.f64 %40, fd307, fd545; +sub.f64 %43, fd280, fd552; +sub.f64 %42, fd279, fd550; +sub.f64 %45, fd252, fd557; +sub.f64 %44, fd251, fd555; +sub.f64 %47, fd224, fd562; +sub.f64 %46, fd223, fd560; +sub.f64 %49, fd196, fd567; +sub.f64 %48, fd195, fd565; +sub.f64 %51, fd168, fd572; +sub.f64 %50, fd167, fd570; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y): "d"(rmem[1].y), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), 
"d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[2].y), "d"(rmem[4].y), "d"(rmem[22].y), "d"(rmem[20].y), "d"(rmem[8].y), "d"(rmem[10].y), "d"(rmem[16].y), "d"(rmem[14].y), "d"(rmem[23].y), "d"(rmem[5].y), "d"(rmem[7].y), "d"(rmem[19].y), "d"(rmem[11].y), "d"(rmem[17].y), "d"(rmem[13].y), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e727575e93068 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_26_fp64_inv.hpp.inc @@ -0,0 +1,536 @@ +#ifndef CUFFTDX_FFT_26_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_26_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<586, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<734>; +.reg .b64 rd<7>; +add.f64 fd105, %57, %101; +sub.f64 fd107, %57, %101; +add.f64 fd730, %105, %102; +sub.f64 fd108, %105, %102; +add.f64 fd109, %61, %97; +sub.f64 fd111, %61, %97; +add.f64 fd727, %106, %107; +sub.f64 fd112, %106, %107; +add.f64 fd113, %65, %93; +sub.f64 fd115, %65, %93; +add.f64 fd725, %66, %108; +sub.f64 fd116, %66, %108; +add.f64 fd117, %69, %89; +sub.f64 fd119, %69, %89; +add.f64 fd723, %109, %90; +sub.f64 fd120, %109, %90; +add.f64 fd121, %73, %85; +sub.f64 fd123, %73, %85; +add.f64 fd720, %110, %111; +sub.f64 fd124, %110, %111; +add.f64 fd125, %77, %81; +sub.f64 fd127, %77, %81; 
+add.f64 fd718, %78, %112; +sub.f64 fd128, %78, %112; +add.f64 fd129, %53, fd105; +add.f64 fd131, fd129, fd109; +add.f64 fd717, %54, fd730; +add.f64 fd132, fd717, fd727; +add.f64 fd133, fd131, fd113; +add.f64 fd134, fd132, fd725; +add.f64 fd135, fd133, fd117; +add.f64 fd136, fd134, fd723; +add.f64 fd137, fd135, fd121; +add.f64 fd138, fd136, fd720; +add.f64 fd139, fd137, fd125; +add.f64 fd140, fd138, fd718; +fma.rn.f64 fd141, fd105, 0d3FEC55A7E00740E9, %53; +fma.rn.f64 fd145, fd109, 0d3FE22D961EA71119, fd141; +fma.rn.f64 fd716, fd108, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd146, fd112, 0d3FEA55E242A4C3D2, fd716; +fma.rn.f64 fd715, fd730, 0d3FEC55A7E00740E9, %54; +fma.rn.f64 fd147, fd727, 0d3FE22D961EA71119, fd715; +fma.rn.f64 fd714, fd107, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd148, fd111, 0d3FEA55E242A4C3D2, fd714; +fma.rn.f64 fd149, fd113, 0d3FBEDB7DEBAA3ED8, fd145; +fma.rn.f64 fd150, fd116, 0d3FEFC44566966769, fd146; +fma.rn.f64 fd151, fd725, 0d3FBEDB7DEBAA3ED8, fd147; +fma.rn.f64 fd152, fd115, 0d3FEFC44566966769, fd148; +fma.rn.f64 fd153, fd117, 0dBFD6B1D8B2365DA1, fd149; +fma.rn.f64 fd154, fd120, 0d3FEDEBA72EF20147, fd150; +fma.rn.f64 fd155, fd723, 0dBFD6B1D8B2365DA1, fd151; +fma.rn.f64 fd156, fd119, 0d3FEDEBA72EF20147, fd152; +fma.rn.f64 fd157, fd121, 0dBFE7F3CCD0032E0C, fd153; +fma.rn.f64 fd158, fd124, 0d3FE5384D024C2F84, fd154; +fma.rn.f64 fd159, fd720, 0dBFE7F3CCD0032E0C, fd155; +fma.rn.f64 fd160, fd123, 0d3FE5384D024C2F84, fd156; +fma.rn.f64 fd161, fd125, 0dBFEF11F493053D00, fd157; +fma.rn.f64 fd162, fd128, 0d3FCEA1E54BC48DBF, fd158; +fma.rn.f64 fd163, fd718, 0dBFEF11F493053D00, fd159; +fma.rn.f64 fd164, fd127, 0d3FCEA1E54BC48DBF, fd160; +sub.f64 fd165, fd161, fd162; +add.f64 fd167, fd162, fd161; +add.f64 fd713, fd164, fd163; +sub.f64 fd168, fd163, fd164; +fma.rn.f64 fd169, fd105, 0d3FE22D961EA71119, %53; +fma.rn.f64 fd173, fd109, 0dBFD6B1D8B2365DA1, fd169; +fma.rn.f64 fd712, fd108, 0d3FEA55E242A4C3D2, 0d0000000000000000; 
+fma.rn.f64 fd174, fd112, 0d3FEDEBA72EF20147, fd712; +fma.rn.f64 fd711, fd730, 0d3FE22D961EA71119, %54; +fma.rn.f64 fd175, fd727, 0dBFD6B1D8B2365DA1, fd711; +fma.rn.f64 fd710, fd107, 0d3FEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd176, fd111, 0d3FEDEBA72EF20147, fd710; +fma.rn.f64 fd177, fd113, 0dBFEF11F493053D00, fd173; +fma.rn.f64 fd178, fd116, 0d3FCEA1E54BC48DBF, fd174; +fma.rn.f64 fd179, fd725, 0dBFEF11F493053D00, fd175; +fma.rn.f64 fd180, fd115, 0d3FCEA1E54BC48DBF, fd176; +fma.rn.f64 fd181, fd117, 0dBFE7F3CCD0032E0C, fd177; +fma.rn.f64 fd182, fd120, 0dBFE5384D024C2F84, fd178; +fma.rn.f64 fd183, fd723, 0dBFE7F3CCD0032E0C, fd179; +fma.rn.f64 fd184, fd119, 0dBFE5384D024C2F84, fd180; +fma.rn.f64 fd185, fd121, 0d3FBEDB7DEBAA3ED8, fd181; +fma.rn.f64 fd186, fd124, 0dBFEFC44566966769, fd182; +fma.rn.f64 fd187, fd720, 0d3FBEDB7DEBAA3ED8, fd183; +fma.rn.f64 fd188, fd123, 0dBFEFC44566966769, fd184; +fma.rn.f64 fd189, fd125, 0d3FEC55A7E00740E9, fd185; +fma.rn.f64 fd190, fd128, 0dBFDDBE064267C47C, fd186; +fma.rn.f64 fd191, fd718, 0d3FEC55A7E00740E9, fd187; +fma.rn.f64 fd192, fd127, 0dBFDDBE064267C47C, fd188; +sub.f64 fd193, fd189, fd190; +add.f64 fd195, fd190, fd189; +add.f64 fd709, fd192, fd191; +sub.f64 fd196, fd191, fd192; +fma.rn.f64 fd197, fd105, 0d3FBEDB7DEBAA3ED8, %53; +fma.rn.f64 fd201, fd109, 0dBFEF11F493053D00, fd197; +fma.rn.f64 fd708, fd108, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd202, fd112, 0d3FCEA1E54BC48DBF, fd708; +fma.rn.f64 fd707, fd730, 0d3FBEDB7DEBAA3ED8, %54; +fma.rn.f64 fd203, fd727, 0dBFEF11F493053D00, fd707; +fma.rn.f64 fd706, fd107, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd204, fd111, 0d3FCEA1E54BC48DBF, fd706; +fma.rn.f64 fd205, fd113, 0dBFD6B1D8B2365DA1, fd201; +fma.rn.f64 fd206, fd116, 0dBFEDEBA72EF20147, fd202; +fma.rn.f64 fd207, fd725, 0dBFD6B1D8B2365DA1, fd203; +fma.rn.f64 fd208, fd115, 0dBFEDEBA72EF20147, fd204; +fma.rn.f64 fd209, fd117, 0d3FEC55A7E00740E9, fd205; +fma.rn.f64 fd210, fd120, 
0dBFDDBE064267C47C, fd206; +fma.rn.f64 fd211, fd723, 0d3FEC55A7E00740E9, fd207; +fma.rn.f64 fd212, fd119, 0dBFDDBE064267C47C, fd208; +fma.rn.f64 fd213, fd121, 0d3FE22D961EA71119, fd209; +fma.rn.f64 fd214, fd124, 0d3FEA55E242A4C3D2, fd210; +fma.rn.f64 fd215, fd720, 0d3FE22D961EA71119, fd211; +fma.rn.f64 fd216, fd123, 0d3FEA55E242A4C3D2, fd212; +fma.rn.f64 fd217, fd125, 0dBFE7F3CCD0032E0C, fd213; +fma.rn.f64 fd218, fd128, 0d3FE5384D024C2F84, fd214; +fma.rn.f64 fd219, fd718, 0dBFE7F3CCD0032E0C, fd215; +fma.rn.f64 fd220, fd127, 0d3FE5384D024C2F84, fd216; +sub.f64 fd221, fd217, fd218; +add.f64 fd223, fd218, fd217; +add.f64 fd705, fd220, fd219; +sub.f64 fd224, fd219, fd220; +fma.rn.f64 fd225, fd105, 0dBFD6B1D8B2365DA1, %53; +fma.rn.f64 fd229, fd109, 0dBFE7F3CCD0032E0C, fd225; +fma.rn.f64 fd704, fd108, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd230, fd112, 0dBFE5384D024C2F84, fd704; +fma.rn.f64 fd703, fd730, 0dBFD6B1D8B2365DA1, %54; +fma.rn.f64 fd231, fd727, 0dBFE7F3CCD0032E0C, fd703; +fma.rn.f64 fd702, fd107, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd232, fd111, 0dBFE5384D024C2F84, fd702; +fma.rn.f64 fd233, fd113, 0d3FEC55A7E00740E9, fd229; +fma.rn.f64 fd234, fd116, 0dBFDDBE064267C47C, fd230; +fma.rn.f64 fd235, fd725, 0d3FEC55A7E00740E9, fd231; +fma.rn.f64 fd236, fd115, 0dBFDDBE064267C47C, fd232; +fma.rn.f64 fd237, fd117, 0d3FBEDB7DEBAA3ED8, fd233; +fma.rn.f64 fd238, fd120, 0d3FEFC44566966769, fd234; +fma.rn.f64 fd239, fd723, 0d3FBEDB7DEBAA3ED8, fd235; +fma.rn.f64 fd240, fd119, 0d3FEFC44566966769, fd236; +fma.rn.f64 fd241, fd121, 0dBFEF11F493053D00, fd237; +fma.rn.f64 fd242, fd124, 0dBFCEA1E54BC48DBF, fd238; +fma.rn.f64 fd243, fd720, 0dBFEF11F493053D00, fd239; +fma.rn.f64 fd244, fd123, 0dBFCEA1E54BC48DBF, fd240; +fma.rn.f64 fd245, fd125, 0d3FE22D961EA71119, fd241; +fma.rn.f64 fd246, fd128, 0dBFEA55E242A4C3D2, fd242; +fma.rn.f64 fd247, fd718, 0d3FE22D961EA71119, fd243; +fma.rn.f64 fd248, fd127, 0dBFEA55E242A4C3D2, fd244; +sub.f64 fd249, fd245, 
fd246; +add.f64 fd251, fd246, fd245; +add.f64 fd701, fd248, fd247; +sub.f64 fd252, fd247, fd248; +fma.rn.f64 fd253, fd105, 0dBFE7F3CCD0032E0C, %53; +fma.rn.f64 fd257, fd109, 0d3FBEDB7DEBAA3ED8, fd253; +fma.rn.f64 fd700, fd108, 0d3FE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd258, fd112, 0dBFEFC44566966769, fd700; +fma.rn.f64 fd699, fd730, 0dBFE7F3CCD0032E0C, %54; +fma.rn.f64 fd259, fd727, 0d3FBEDB7DEBAA3ED8, fd699; +fma.rn.f64 fd698, fd107, 0d3FE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd260, fd111, 0dBFEFC44566966769, fd698; +fma.rn.f64 fd261, fd113, 0d3FE22D961EA71119, fd257; +fma.rn.f64 fd262, fd116, 0d3FEA55E242A4C3D2, fd258; +fma.rn.f64 fd263, fd725, 0d3FE22D961EA71119, fd259; +fma.rn.f64 fd264, fd115, 0d3FEA55E242A4C3D2, fd260; +fma.rn.f64 fd265, fd117, 0dBFEF11F493053D00, fd261; +fma.rn.f64 fd266, fd120, 0dBFCEA1E54BC48DBF, fd262; +fma.rn.f64 fd267, fd723, 0dBFEF11F493053D00, fd263; +fma.rn.f64 fd268, fd119, 0dBFCEA1E54BC48DBF, fd264; +fma.rn.f64 fd269, fd121, 0d3FEC55A7E00740E9, fd265; +fma.rn.f64 fd270, fd124, 0dBFDDBE064267C47C, fd266; +fma.rn.f64 fd271, fd720, 0d3FEC55A7E00740E9, fd267; +fma.rn.f64 fd272, fd123, 0dBFDDBE064267C47C, fd268; +fma.rn.f64 fd273, fd125, 0dBFD6B1D8B2365DA1, fd269; +fma.rn.f64 fd274, fd128, 0d3FEDEBA72EF20147, fd270; +fma.rn.f64 fd275, fd718, 0dBFD6B1D8B2365DA1, fd271; +fma.rn.f64 fd276, fd127, 0d3FEDEBA72EF20147, fd272; +sub.f64 fd277, fd273, fd274; +add.f64 fd279, fd274, fd273; +add.f64 fd697, fd276, fd275; +sub.f64 fd280, fd275, fd276; +fma.rn.f64 fd281, fd105, 0dBFEF11F493053D00, %53; +fma.rn.f64 fd282, fd108, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd283, fd730, 0dBFEF11F493053D00, %54; +fma.rn.f64 fd284, fd107, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd285, fd109, 0d3FEC55A7E00740E9, fd281; +fma.rn.f64 fd286, fd112, 0dBFDDBE064267C47C, fd282; +fma.rn.f64 fd287, fd727, 0d3FEC55A7E00740E9, fd283; +fma.rn.f64 fd288, fd111, 0dBFDDBE064267C47C, fd284; +fma.rn.f64 fd289, fd113, 
0dBFE7F3CCD0032E0C, fd285; +fma.rn.f64 fd290, fd116, 0d3FE5384D024C2F84, fd286; +fma.rn.f64 fd291, fd725, 0dBFE7F3CCD0032E0C, fd287; +fma.rn.f64 fd292, fd115, 0d3FE5384D024C2F84, fd288; +fma.rn.f64 fd293, fd117, 0d3FE22D961EA71119, fd289; +fma.rn.f64 fd294, fd120, 0dBFEA55E242A4C3D2, fd290; +fma.rn.f64 fd295, fd723, 0d3FE22D961EA71119, fd291; +fma.rn.f64 fd296, fd119, 0dBFEA55E242A4C3D2, fd292; +fma.rn.f64 fd297, fd121, 0dBFD6B1D8B2365DA1, fd293; +fma.rn.f64 fd298, fd124, 0d3FEDEBA72EF20147, fd294; +fma.rn.f64 fd299, fd720, 0dBFD6B1D8B2365DA1, fd295; +fma.rn.f64 fd300, fd123, 0d3FEDEBA72EF20147, fd296; +fma.rn.f64 fd301, fd125, 0d3FBEDB7DEBAA3ED8, fd297; +fma.rn.f64 fd302, fd128, 0dBFEFC44566966769, fd298; +fma.rn.f64 fd303, fd718, 0d3FBEDB7DEBAA3ED8, fd299; +fma.rn.f64 fd304, fd127, 0dBFEFC44566966769, fd300; +sub.f64 fd305, fd301, fd302; +add.f64 fd307, fd302, fd301; +add.f64 fd696, fd304, fd303; +sub.f64 fd308, fd303, fd304; +add.f64 fd309, %59, %103; +sub.f64 fd311, %59, %103; +add.f64 fd695, %60, %104; +sub.f64 fd312, %60, %104; +add.f64 fd313, %63, %99; +sub.f64 fd315, %63, %99; +add.f64 fd692, %114, %113; +sub.f64 fd316, %114, %113; +add.f64 fd317, %67, %95; +sub.f64 fd319, %67, %95; +add.f64 fd690, %115, %96; +sub.f64 fd320, %115, %96; +add.f64 fd321, %71, %91; +sub.f64 fd323, %71, %91; +add.f64 fd688, %72, %116; +sub.f64 fd324, %72, %116; +add.f64 fd325, %75, %87; +sub.f64 fd327, %75, %87; +add.f64 fd685, %117, %118; +sub.f64 fd328, %117, %118; +add.f64 fd329, %79, %83; +sub.f64 fd331, %79, %83; +add.f64 fd683, %119, %84; +sub.f64 fd332, %119, %84; +add.f64 fd333, %55, fd309; +add.f64 fd335, fd333, fd313; +add.f64 fd681, %120, fd695; +add.f64 fd336, fd681, fd692; +add.f64 fd337, fd335, fd317; +add.f64 fd338, fd336, fd690; +add.f64 fd339, fd337, fd321; +add.f64 fd340, fd338, fd688; +add.f64 fd341, fd339, fd325; +add.f64 fd342, fd340, fd685; +add.f64 fd343, fd341, fd329; +add.f64 fd344, fd342, fd683; +fma.rn.f64 fd345, fd309, 0d3FEC55A7E00740E9, %55; 
+fma.rn.f64 fd349, fd313, 0d3FE22D961EA71119, fd345; +fma.rn.f64 fd680, fd312, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd350, fd316, 0d3FEA55E242A4C3D2, fd680; +fma.rn.f64 fd679, fd695, 0d3FEC55A7E00740E9, %120; +fma.rn.f64 fd351, fd692, 0d3FE22D961EA71119, fd679; +fma.rn.f64 fd678, fd311, 0d3FDDBE064267C47C, 0d0000000000000000; +fma.rn.f64 fd352, fd315, 0d3FEA55E242A4C3D2, fd678; +fma.rn.f64 fd353, fd317, 0d3FBEDB7DEBAA3ED8, fd349; +fma.rn.f64 fd354, fd320, 0d3FEFC44566966769, fd350; +fma.rn.f64 fd355, fd690, 0d3FBEDB7DEBAA3ED8, fd351; +fma.rn.f64 fd356, fd319, 0d3FEFC44566966769, fd352; +fma.rn.f64 fd357, fd321, 0dBFD6B1D8B2365DA1, fd353; +fma.rn.f64 fd358, fd324, 0d3FEDEBA72EF20147, fd354; +fma.rn.f64 fd359, fd688, 0dBFD6B1D8B2365DA1, fd355; +fma.rn.f64 fd360, fd323, 0d3FEDEBA72EF20147, fd356; +fma.rn.f64 fd361, fd325, 0dBFE7F3CCD0032E0C, fd357; +fma.rn.f64 fd362, fd328, 0d3FE5384D024C2F84, fd358; +fma.rn.f64 fd363, fd685, 0dBFE7F3CCD0032E0C, fd359; +fma.rn.f64 fd364, fd327, 0d3FE5384D024C2F84, fd360; +fma.rn.f64 fd365, fd329, 0dBFEF11F493053D00, fd361; +fma.rn.f64 fd366, fd332, 0d3FCEA1E54BC48DBF, fd362; +fma.rn.f64 fd367, fd683, 0dBFEF11F493053D00, fd363; +fma.rn.f64 fd368, fd331, 0d3FCEA1E54BC48DBF, fd364; +sub.f64 fd369, fd365, fd366; +add.f64 fd371, fd366, fd365; +add.f64 fd677, fd368, fd367; +sub.f64 fd372, fd367, fd368; +fma.rn.f64 fd373, fd309, 0d3FE22D961EA71119, %55; +fma.rn.f64 fd377, fd313, 0dBFD6B1D8B2365DA1, fd373; +fma.rn.f64 fd676, fd312, 0d3FEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd378, fd316, 0d3FEDEBA72EF20147, fd676; +fma.rn.f64 fd675, fd695, 0d3FE22D961EA71119, %120; +fma.rn.f64 fd379, fd692, 0dBFD6B1D8B2365DA1, fd675; +fma.rn.f64 fd674, fd311, 0d3FEA55E242A4C3D2, 0d0000000000000000; +fma.rn.f64 fd380, fd315, 0d3FEDEBA72EF20147, fd674; +fma.rn.f64 fd381, fd317, 0dBFEF11F493053D00, fd377; +fma.rn.f64 fd382, fd320, 0d3FCEA1E54BC48DBF, fd378; +fma.rn.f64 fd383, fd690, 0dBFEF11F493053D00, fd379; +fma.rn.f64 fd384, fd319, 
0d3FCEA1E54BC48DBF, fd380; +fma.rn.f64 fd385, fd321, 0dBFE7F3CCD0032E0C, fd381; +fma.rn.f64 fd386, fd324, 0dBFE5384D024C2F84, fd382; +fma.rn.f64 fd387, fd688, 0dBFE7F3CCD0032E0C, fd383; +fma.rn.f64 fd388, fd323, 0dBFE5384D024C2F84, fd384; +fma.rn.f64 fd389, fd325, 0d3FBEDB7DEBAA3ED8, fd385; +fma.rn.f64 fd390, fd328, 0dBFEFC44566966769, fd386; +fma.rn.f64 fd391, fd685, 0d3FBEDB7DEBAA3ED8, fd387; +fma.rn.f64 fd392, fd327, 0dBFEFC44566966769, fd388; +fma.rn.f64 fd393, fd329, 0d3FEC55A7E00740E9, fd389; +fma.rn.f64 fd394, fd332, 0dBFDDBE064267C47C, fd390; +fma.rn.f64 fd395, fd683, 0d3FEC55A7E00740E9, fd391; +fma.rn.f64 fd396, fd331, 0dBFDDBE064267C47C, fd392; +sub.f64 fd397, fd393, fd394; +add.f64 fd399, fd394, fd393; +add.f64 fd673, fd396, fd395; +sub.f64 fd400, fd395, fd396; +fma.rn.f64 fd401, fd309, 0d3FBEDB7DEBAA3ED8, %55; +fma.rn.f64 fd405, fd313, 0dBFEF11F493053D00, fd401; +fma.rn.f64 fd672, fd312, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd406, fd316, 0d3FCEA1E54BC48DBF, fd672; +fma.rn.f64 fd671, fd695, 0d3FBEDB7DEBAA3ED8, %120; +fma.rn.f64 fd407, fd692, 0dBFEF11F493053D00, fd671; +fma.rn.f64 fd670, fd311, 0d3FEFC44566966769, 0d0000000000000000; +fma.rn.f64 fd408, fd315, 0d3FCEA1E54BC48DBF, fd670; +fma.rn.f64 fd409, fd317, 0dBFD6B1D8B2365DA1, fd405; +fma.rn.f64 fd410, fd320, 0dBFEDEBA72EF20147, fd406; +fma.rn.f64 fd411, fd690, 0dBFD6B1D8B2365DA1, fd407; +fma.rn.f64 fd412, fd319, 0dBFEDEBA72EF20147, fd408; +fma.rn.f64 fd413, fd321, 0d3FEC55A7E00740E9, fd409; +fma.rn.f64 fd414, fd324, 0dBFDDBE064267C47C, fd410; +fma.rn.f64 fd415, fd688, 0d3FEC55A7E00740E9, fd411; +fma.rn.f64 fd416, fd323, 0dBFDDBE064267C47C, fd412; +fma.rn.f64 fd417, fd325, 0d3FE22D961EA71119, fd413; +fma.rn.f64 fd418, fd328, 0d3FEA55E242A4C3D2, fd414; +fma.rn.f64 fd419, fd685, 0d3FE22D961EA71119, fd415; +fma.rn.f64 fd420, fd327, 0d3FEA55E242A4C3D2, fd416; +fma.rn.f64 fd421, fd329, 0dBFE7F3CCD0032E0C, fd417; +fma.rn.f64 fd422, fd332, 0d3FE5384D024C2F84, fd418; +fma.rn.f64 fd423, fd683, 
0dBFE7F3CCD0032E0C, fd419; +fma.rn.f64 fd424, fd331, 0d3FE5384D024C2F84, fd420; +sub.f64 fd425, fd421, fd422; +add.f64 fd427, fd422, fd421; +add.f64 fd669, fd424, fd423; +sub.f64 fd428, fd423, fd424; +fma.rn.f64 fd429, fd309, 0dBFD6B1D8B2365DA1, %55; +fma.rn.f64 fd433, fd313, 0dBFE7F3CCD0032E0C, fd429; +fma.rn.f64 fd668, fd312, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd434, fd316, 0dBFE5384D024C2F84, fd668; +fma.rn.f64 fd667, fd695, 0dBFD6B1D8B2365DA1, %120; +fma.rn.f64 fd435, fd692, 0dBFE7F3CCD0032E0C, fd667; +fma.rn.f64 fd666, fd311, 0d3FEDEBA72EF20147, 0d0000000000000000; +fma.rn.f64 fd436, fd315, 0dBFE5384D024C2F84, fd666; +fma.rn.f64 fd437, fd317, 0d3FEC55A7E00740E9, fd433; +fma.rn.f64 fd438, fd320, 0dBFDDBE064267C47C, fd434; +fma.rn.f64 fd439, fd690, 0d3FEC55A7E00740E9, fd435; +fma.rn.f64 fd440, fd319, 0dBFDDBE064267C47C, fd436; +fma.rn.f64 fd441, fd321, 0d3FBEDB7DEBAA3ED8, fd437; +fma.rn.f64 fd442, fd324, 0d3FEFC44566966769, fd438; +fma.rn.f64 fd443, fd688, 0d3FBEDB7DEBAA3ED8, fd439; +fma.rn.f64 fd444, fd323, 0d3FEFC44566966769, fd440; +fma.rn.f64 fd445, fd325, 0dBFEF11F493053D00, fd441; +fma.rn.f64 fd446, fd328, 0dBFCEA1E54BC48DBF, fd442; +fma.rn.f64 fd447, fd685, 0dBFEF11F493053D00, fd443; +fma.rn.f64 fd448, fd327, 0dBFCEA1E54BC48DBF, fd444; +fma.rn.f64 fd449, fd329, 0d3FE22D961EA71119, fd445; +fma.rn.f64 fd450, fd332, 0dBFEA55E242A4C3D2, fd446; +fma.rn.f64 fd451, fd683, 0d3FE22D961EA71119, fd447; +fma.rn.f64 fd452, fd331, 0dBFEA55E242A4C3D2, fd448; +sub.f64 fd453, fd449, fd450; +add.f64 fd455, fd450, fd449; +add.f64 fd665, fd452, fd451; +sub.f64 fd456, fd451, fd452; +fma.rn.f64 fd457, fd309, 0dBFE7F3CCD0032E0C, %55; +fma.rn.f64 fd461, fd313, 0d3FBEDB7DEBAA3ED8, fd457; +fma.rn.f64 fd664, fd312, 0d3FE5384D024C2F84, 0d0000000000000000; +fma.rn.f64 fd462, fd316, 0dBFEFC44566966769, fd664; +fma.rn.f64 fd663, fd695, 0dBFE7F3CCD0032E0C, %120; +fma.rn.f64 fd463, fd692, 0d3FBEDB7DEBAA3ED8, fd663; +fma.rn.f64 fd662, fd311, 0d3FE5384D024C2F84, 
0d0000000000000000; +fma.rn.f64 fd464, fd315, 0dBFEFC44566966769, fd662; +fma.rn.f64 fd465, fd317, 0d3FE22D961EA71119, fd461; +fma.rn.f64 fd466, fd320, 0d3FEA55E242A4C3D2, fd462; +fma.rn.f64 fd467, fd690, 0d3FE22D961EA71119, fd463; +fma.rn.f64 fd468, fd319, 0d3FEA55E242A4C3D2, fd464; +fma.rn.f64 fd469, fd321, 0dBFEF11F493053D00, fd465; +fma.rn.f64 fd470, fd324, 0dBFCEA1E54BC48DBF, fd466; +fma.rn.f64 fd471, fd688, 0dBFEF11F493053D00, fd467; +fma.rn.f64 fd472, fd323, 0dBFCEA1E54BC48DBF, fd468; +fma.rn.f64 fd473, fd325, 0d3FEC55A7E00740E9, fd469; +fma.rn.f64 fd474, fd328, 0dBFDDBE064267C47C, fd470; +fma.rn.f64 fd475, fd685, 0d3FEC55A7E00740E9, fd471; +fma.rn.f64 fd476, fd327, 0dBFDDBE064267C47C, fd472; +fma.rn.f64 fd477, fd329, 0dBFD6B1D8B2365DA1, fd473; +fma.rn.f64 fd478, fd332, 0d3FEDEBA72EF20147, fd474; +fma.rn.f64 fd479, fd683, 0dBFD6B1D8B2365DA1, fd475; +fma.rn.f64 fd480, fd331, 0d3FEDEBA72EF20147, fd476; +sub.f64 fd481, fd477, fd478; +add.f64 fd483, fd478, fd477; +add.f64 fd661, fd480, fd479; +sub.f64 fd484, fd479, fd480; +fma.rn.f64 fd485, fd309, 0dBFEF11F493053D00, %55; +fma.rn.f64 fd486, fd312, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd487, fd695, 0dBFEF11F493053D00, %52; +fma.rn.f64 fd488, fd311, 0d3FCEA1E54BC48DBF, 0d0000000000000000; +fma.rn.f64 fd489, fd313, 0d3FEC55A7E00740E9, fd485; +fma.rn.f64 fd490, fd316, 0dBFDDBE064267C47C, fd486; +fma.rn.f64 fd491, fd692, 0d3FEC55A7E00740E9, fd487; +fma.rn.f64 fd492, fd315, 0dBFDDBE064267C47C, fd488; +fma.rn.f64 fd493, fd317, 0dBFE7F3CCD0032E0C, fd489; +fma.rn.f64 fd494, fd320, 0d3FE5384D024C2F84, fd490; +fma.rn.f64 fd495, fd690, 0dBFE7F3CCD0032E0C, fd491; +fma.rn.f64 fd496, fd319, 0d3FE5384D024C2F84, fd492; +fma.rn.f64 fd497, fd321, 0d3FE22D961EA71119, fd493; +fma.rn.f64 fd498, fd324, 0dBFEA55E242A4C3D2, fd494; +fma.rn.f64 fd499, fd688, 0d3FE22D961EA71119, fd495; +fma.rn.f64 fd500, fd323, 0dBFEA55E242A4C3D2, fd496; +fma.rn.f64 fd501, fd325, 0dBFD6B1D8B2365DA1, fd497; +fma.rn.f64 fd502, fd328, 
0d3FEDEBA72EF20147, fd498; +fma.rn.f64 fd503, fd685, 0dBFD6B1D8B2365DA1, fd499; +fma.rn.f64 fd504, fd327, 0d3FEDEBA72EF20147, fd500; +fma.rn.f64 fd505, fd329, 0d3FBEDB7DEBAA3ED8, fd501; +fma.rn.f64 fd506, fd332, 0dBFEFC44566966769, fd502; +fma.rn.f64 fd507, fd683, 0d3FBEDB7DEBAA3ED8, fd503; +fma.rn.f64 fd508, fd331, 0dBFEFC44566966769, fd504; +sub.f64 fd509, fd505, fd506; +add.f64 fd511, fd506, fd505; +add.f64 fd660, fd508, fd507; +sub.f64 fd512, fd507, fd508; +mul.f64 fd514, fd677, 0d3FCEA1E54BC48DBF; +mul.f64 fd659, fd369, 0d3FEF11F493053D00; +sub.f64 fd515, fd659, fd514; +mul.f64 fd516, fd677, 0d3FEF11F493053D00; +fma.rn.f64 fd517, fd369, 0d3FCEA1E54BC48DBF, fd516; +mul.f64 fd519, fd673, 0d3FDDBE064267C47C; +mul.f64 fd658, fd397, 0d3FEC55A7E00740E9; +sub.f64 fd520, fd658, fd519; +mul.f64 fd521, fd673, 0d3FEC55A7E00740E9; +fma.rn.f64 fd522, fd397, 0d3FDDBE064267C47C, fd521; +mul.f64 fd656, fd425, 0d3FE7F3CCD0032E0C; +mul.f64 fd657, fd669, 0d3FE5384D024C2F84; +sub.f64 fd525, fd656, fd657; +mul.f64 fd526, fd669, 0d3FE7F3CCD0032E0C; +fma.rn.f64 fd527, fd425, 0d3FE5384D024C2F84, fd526; +mul.f64 fd654, fd453, 0d3FE22D961EA71119; +mul.f64 fd655, fd665, 0d3FEA55E242A4C3D2; +sub.f64 fd530, fd654, fd655; +mul.f64 fd531, fd665, 0d3FE22D961EA71119; +fma.rn.f64 fd532, fd453, 0d3FEA55E242A4C3D2, fd531; +mul.f64 fd652, fd481, 0d3FD6B1D8B2365DA1; +mul.f64 fd653, fd661, 0d3FEDEBA72EF20147; +sub.f64 fd535, fd652, fd653; +mul.f64 fd536, fd661, 0d3FD6B1D8B2365DA1; +fma.rn.f64 fd537, fd481, 0d3FEDEBA72EF20147, fd536; +mul.f64 fd650, fd509, 0d3FBEDB7DEBAA3ED8; +mul.f64 fd651, fd660, 0d3FEFC44566966769; +sub.f64 fd540, fd650, fd651; +mul.f64 fd541, fd660, 0d3FBEDB7DEBAA3ED8; +fma.rn.f64 fd542, fd509, 0d3FEFC44566966769, fd541; +mul.f64 fd544, fd512, 0d3FEFC44566966769; +mul.f64 fd649, fd511, 0dBFBEDB7DEBAA3ED8; +sub.f64 fd545, fd649, fd544; +mul.f64 fd546, fd512, 0dBFBEDB7DEBAA3ED8; +fma.rn.f64 fd547, fd511, 0d3FEFC44566966769, fd546; +mul.f64 fd549, fd484, 0d3FEDEBA72EF20147; 
+mul.f64 fd648, fd483, 0dBFD6B1D8B2365DA1; +sub.f64 fd550, fd648, fd549; +mul.f64 fd551, fd484, 0dBFD6B1D8B2365DA1; +fma.rn.f64 fd552, fd483, 0d3FEDEBA72EF20147, fd551; +mul.f64 fd554, fd456, 0d3FEA55E242A4C3D2; +mul.f64 fd647, fd455, 0dBFE22D961EA71119; +sub.f64 fd555, fd647, fd554; +mul.f64 fd556, fd456, 0dBFE22D961EA71119; +fma.rn.f64 fd557, fd455, 0d3FEA55E242A4C3D2, fd556; +mul.f64 fd559, fd428, 0d3FE5384D024C2F84; +mul.f64 fd646, fd427, 0dBFE7F3CCD0032E0C; +sub.f64 fd560, fd646, fd559; +mul.f64 fd561, fd428, 0dBFE7F3CCD0032E0C; +fma.rn.f64 fd562, fd427, 0d3FE5384D024C2F84, fd561; +mul.f64 fd564, fd400, 0d3FDDBE064267C47C; +mul.f64 fd645, fd399, 0dBFEC55A7E00740E9; +sub.f64 fd565, fd645, fd564; +mul.f64 fd566, fd400, 0dBFEC55A7E00740E9; +fma.rn.f64 fd567, fd399, 0d3FDDBE064267C47C, fd566; +mul.f64 fd643, fd371, 0dBFEF11F493053D00; +mul.f64 fd644, fd372, 0d3FCEA1E54BC48DBF; +sub.f64 fd570, fd643, fd644; +mul.f64 fd571, fd372, 0dBFEF11F493053D00; +fma.rn.f64 fd572, fd371, 0d3FCEA1E54BC48DBF, fd571; +add.f64 %1, fd140, fd344; +add.f64 %0, fd139, fd343; +add.f64 %2, fd165, fd515; +add.f64 %3, fd713, fd517; +add.f64 %4, fd193, fd520; +add.f64 %5, fd709, fd522; +add.f64 %6, fd221, fd525; +add.f64 %7, fd705, fd527; +add.f64 %8, fd249, fd530; +add.f64 %9, fd701, fd532; +add.f64 %11, fd697, fd537; +add.f64 %10, fd277, fd535; +add.f64 %13, fd696, fd542; +add.f64 %12, fd305, fd540; +add.f64 %15, fd308, fd547; +add.f64 %14, fd307, fd545; +add.f64 %16, fd279, fd550; +add.f64 %17, fd280, fd552; +add.f64 %18, fd251, fd555; +add.f64 %19, fd252, fd557; +add.f64 %20, fd223, fd560; +add.f64 %21, fd224, fd562; +add.f64 %22, fd195, fd565; +add.f64 %23, fd196, fd567; +add.f64 %25, fd168, fd572; +add.f64 %24, fd167, fd570; +sub.f64 %27, fd140, fd344; +sub.f64 %26, fd139, fd343; +sub.f64 %29, fd713, fd517; +sub.f64 %28, fd165, fd515; +sub.f64 %31, fd709, fd522; +sub.f64 %30, fd193, fd520; +sub.f64 %33, fd705, fd527; +sub.f64 %32, fd221, fd525; +sub.f64 %35, fd701, fd532; +sub.f64 
%34, fd249, fd530; +sub.f64 %37, fd697, fd537; +sub.f64 %36, fd277, fd535; +sub.f64 %39, fd696, fd542; +sub.f64 %38, fd305, fd540; +sub.f64 %41, fd308, fd547; +sub.f64 %40, fd307, fd545; +sub.f64 %43, fd280, fd552; +sub.f64 %42, fd279, fd550; +sub.f64 %45, fd252, fd557; +sub.f64 %44, fd251, fd555; +sub.f64 %47, fd224, fd562; +sub.f64 %46, fd223, fd560; +sub.f64 %49, fd196, fd567; +sub.f64 %48, fd195, fd565; +sub.f64 %51, fd168, fd572; +sub.f64 %50, fd167, fd570; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y): "d"(rmem[1].y), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), 
"d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[2].y), "d"(rmem[4].y), "d"(rmem[22].y), "d"(rmem[20].y), "d"(rmem[8].y), "d"(rmem[10].y), "d"(rmem[16].y), "d"(rmem[14].y), "d"(rmem[23].y), "d"(rmem[5].y), "d"(rmem[7].y), "d"(rmem[19].y), "d"(rmem[11].y), "d"(rmem[17].y), "d"(rmem[13].y), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..d72cd5e923f4a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_fwd.hpp.inc @@ -0,0 +1,7245 @@ +#ifndef CUFFTDX_FFT_27_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_27_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<867, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<209>; +.reg .b32 r<2771>; +.reg .f64 fd<155>; +.reg .b64 rd<3>; +mov.f64 fd153, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd153; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd154, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd154; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %59, %57; +} +{ +add.f16x2 r4, %54, r1; +} +{ +add.f16x2 r7, %55, %58; +} +{ +add.f16x2 r10, %56, r7; +} +{ +add.f16x2 r13, %59, %57; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %54, r16; +} +{ +sub.f16x2 r22, %55, %58; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %59, %57; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %54, r34; +} 
+{ +sub.f16x2 r40, %55, %58; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %55, %58; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %56, r52; +} +{ +sub.f16x2 r58, %59, %57; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %55, %58; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %56, r70; +} +{ +sub.f16x2 r76, %59, %57; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd153; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd154; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %65, %63; +} +{ +add.f16x2 r88, %60, r85; +} +{ +add.f16x2 r91, %61, %64; +} +{ +add.f16x2 r94, %62, r91; +} +{ +add.f16x2 r97, %65, %63; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %60, r100; +} +{ +sub.f16x2 r106, %61, %64; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %65, %63; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %60, r118; +} +{ +sub.f16x2 r124, %61, %64; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %61, %64; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %62, r136; +} +{ +sub.f16x2 r142, %65, %63; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %61, %64; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %62, r154; +} +{ +sub.f16x2 r160, %65, %63; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs9, fd153; +} +mov.b32 r240, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd154; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r249, {rs11, rs11}; +{ +add.f16x2 r169, %71, %69; +} +{ +add.f16x2 r172, %66, r169; +} +{ +add.f16x2 r175, %67, %70; +} +{ +add.f16x2 r178, %68, r175; +} +{ +add.f16x2 r181, %71, %69; +} +{ +mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %66, r184; +} +{ +sub.f16x2 r190, %67, %70; +} +{ 
+mul.f16x2 r193, r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %71, %69; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %66, r202; +} +{ +sub.f16x2 r208, %67, %70; +} +{ +mul.f16x2 r211, r208, r249; +} +{ +sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %67, %70; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %68, r220; +} +{ +sub.f16x2 r226, %71, %69; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %67, %70; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %68, r238; +} +{ +sub.f16x2 r244, %71, %69; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd89, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs13, fd89; +} +mov.f64 fd90, 0dBFE491B7523C161D; +{ +cvt.rn.f16.f64 rs14, fd90; +} +mov.f64 fd95, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs15, fd95; +} +mov.f64 fd96, 0dBFEF838B8C811C17; +{ +cvt.rn.f16.f64 rs16, fd96; +} +mov.f64 fd107, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs19, fd107; +} +mov.f64 fd108, 0dBFD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs20, fd108; +} +mov.b32 r267, {rs13, rs13}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs14, rs14}; +{ +mul.f16x2 r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs15, rs15}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs16, rs16}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ +mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r250, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, 
r315, r310; +} +{ +cvt.rn.f16.f64 rs29, fd153; +} +mov.b32 r388, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd154; +} +{ +neg.f16 rs31, rs30; +} +mov.b32 r397, {rs31, rs31}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 r320, r4, r317; +} +{ +add.f16x2 r323, r94, r178; +} +{ +add.f16x2 r326, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 r344, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 r398, r389, r395; +} +{ +cvt.rn.f16.f64 rs33, fd153; +} +mov.b32 r472, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd154; +} +{ +neg.f16 rs35, rs34; +} +mov.b32 r481, {rs35, rs35}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 r404, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 r410, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 r428, r419, r425; +} +{ +add.f16x2 r431, r259, r275; +} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ +add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 
r461, r458, r481; +} +{ +sub.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 r482, r473, r479; +} +{ +cvt.rn.f16.f64 rs37, fd153; +} +mov.b32 r556, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd154; +} +{ +neg.f16 rs39, rs38; +} +mov.b32 r565, {rs39, rs39}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 r488, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 r494, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 r512, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 r530, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 r548, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 r566, r557, r563; +} +{ +cvt.rn.f16.f64 rs41, fd153; +} +mov.b32 r640, {rs41, rs41}; +{ +cvt.rn.f16.f64 rs42, fd154; +} +{ +neg.f16 rs43, rs42; +} +mov.b32 r649, {rs43, rs43}; +{ +add.f16x2 r569, %73, %77; +} +{ +add.f16x2 r572, %74, r569; +} +{ +add.f16x2 r575, %75, %72; +} +{ +add.f16x2 r578, %76, r575; +} +{ +add.f16x2 r581, %73, %77; +} +{ +mul.f16x2 r584, r581, r640; +} +{ +add.f16x2 r587, %74, r584; +} +{ +sub.f16x2 r590, %75, %72; +} +{ +mul.f16x2 r593, r590, r649; +} +{ +add.f16x2 r596, r587, r593; +} +{ +add.f16x2 r599, %73, %77; +} +{ +mul.f16x2 r602, r599, r640; +} +{ +add.f16x2 r605, %74, r602; +} +{ 
+sub.f16x2 r608, %75, %72; +} +{ +mul.f16x2 r611, r608, r649; +} +{ +sub.f16x2 r614, r605, r611; +} +{ +add.f16x2 r617, %75, %72; +} +{ +mul.f16x2 r620, r617, r640; +} +{ +add.f16x2 r623, %76, r620; +} +{ +sub.f16x2 r626, %73, %77; +} +{ +mul.f16x2 r629, r626, r649; +} +{ +sub.f16x2 r632, r623, r629; +} +{ +add.f16x2 r635, %75, %72; +} +{ +mul.f16x2 r638, r635, r640; +} +{ +add.f16x2 r641, %76, r638; +} +{ +sub.f16x2 r644, %73, %77; +} +{ +mul.f16x2 r647, r644, r649; +} +{ +add.f16x2 r650, r641, r647; +} +{ +cvt.rn.f16.f64 rs45, fd153; +} +mov.b32 r724, {rs45, rs45}; +{ +cvt.rn.f16.f64 rs46, fd154; +} +{ +neg.f16 rs47, rs46; +} +mov.b32 r733, {rs47, rs47}; +{ +add.f16x2 r653, %79, %83; +} +{ +add.f16x2 r656, %80, r653; +} +{ +add.f16x2 r659, %81, %78; +} +{ +add.f16x2 r662, %82, r659; +} +{ +add.f16x2 r665, %79, %83; +} +{ +mul.f16x2 r668, r665, r724; +} +{ +add.f16x2 r671, %80, r668; +} +{ +sub.f16x2 r674, %81, %78; +} +{ +mul.f16x2 r677, r674, r733; +} +{ +add.f16x2 r680, r671, r677; +} +{ +add.f16x2 r683, %79, %83; +} +{ +mul.f16x2 r686, r683, r724; +} +{ +add.f16x2 r689, %80, r686; +} +{ +sub.f16x2 r692, %81, %78; +} +{ +mul.f16x2 r695, r692, r733; +} +{ +sub.f16x2 r698, r689, r695; +} +{ +add.f16x2 r701, %81, %78; +} +{ +mul.f16x2 r704, r701, r724; +} +{ +add.f16x2 r707, %82, r704; +} +{ +sub.f16x2 r710, %79, %83; +} +{ +mul.f16x2 r713, r710, r733; +} +{ +sub.f16x2 r716, r707, r713; +} +{ +add.f16x2 r719, %81, %78; +} +{ +mul.f16x2 r722, r719, r724; +} +{ +add.f16x2 r725, %82, r722; +} +{ +sub.f16x2 r728, %79, %83; +} +{ +mul.f16x2 r731, r728, r733; +} +{ +add.f16x2 r734, r725, r731; +} +{ +cvt.rn.f16.f64 rs49, fd153; +} +mov.b32 r808, {rs49, rs49}; +{ +cvt.rn.f16.f64 rs50, fd154; +} +{ +neg.f16 rs51, rs50; +} +mov.b32 r817, {rs51, rs51}; +{ +add.f16x2 r737, %85, %89; +} +{ +add.f16x2 r740, %86, r737; +} +{ +add.f16x2 r743, %87, %84; +} +{ +add.f16x2 r746, %88, r743; +} +{ +add.f16x2 r749, %85, %89; +} +{ +mul.f16x2 r752, r749, r808; +} +{ +add.f16x2 r755, 
%86, r752; +} +{ +sub.f16x2 r758, %87, %84; +} +{ +mul.f16x2 r761, r758, r817; +} +{ +add.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %85, %89; +} +{ +mul.f16x2 r770, r767, r808; +} +{ +add.f16x2 r773, %86, r770; +} +{ +sub.f16x2 r776, %87, %84; +} +{ +mul.f16x2 r779, r776, r817; +} +{ +sub.f16x2 r782, r773, r779; +} +{ +add.f16x2 r785, %87, %84; +} +{ +mul.f16x2 r788, r785, r808; +} +{ +add.f16x2 r791, %88, r788; +} +{ +sub.f16x2 r794, %85, %89; +} +{ +mul.f16x2 r797, r794, r817; +} +{ +sub.f16x2 r800, r791, r797; +} +{ +add.f16x2 r803, %87, %84; +} +{ +mul.f16x2 r806, r803, r808; +} +{ +add.f16x2 r809, %88, r806; +} +{ +sub.f16x2 r812, %85, %89; +} +{ +mul.f16x2 r815, r812, r817; +} +{ +add.f16x2 r818, r809, r815; +} +{ +cvt.rn.f16.f64 rs53, fd89; +} +{ +cvt.rn.f16.f64 rs54, fd90; +} +{ +cvt.rn.f16.f64 rs55, fd95; +} +{ +cvt.rn.f16.f64 rs56, fd96; +} +{ +cvt.rn.f16.f64 rs59, fd107; +} +{ +cvt.rn.f16.f64 rs60, fd108; +} +mov.b32 r835, {rs53, rs53}; +{ +mul.f16x2 r821, r680, r835; +} +mov.b32 r832, {rs54, rs54}; +{ +mul.f16x2 r824, r716, r832; +} +{ +sub.f16x2 r827, r821, r824; +} +{ +mul.f16x2 r830, r680, r832; +} +{ +fma.rn.f16x2 r833, r716, r835, r830; +} +mov.b32 r867, {rs55, rs55}; +{ +mul.f16x2 r837, r764, r867; +} +mov.b32 r864, {rs56, rs56}; +{ +mul.f16x2 r840, r800, r864; +} +{ +sub.f16x2 r843, r837, r840; +} +{ +mul.f16x2 r846, r764, r864; +} +{ +fma.rn.f16x2 r849, r800, r867, r846; +} +{ +mul.f16x2 r853, r698, r867; +} +{ +mul.f16x2 r856, r734, r864; +} +{ +sub.f16x2 r859, r853, r856; +} +{ +mul.f16x2 r862, r698, r864; +} +{ +fma.rn.f16x2 r865, r734, r867, r862; +} +mov.b32 r883, {rs59, rs59}; +{ +mul.f16x2 r869, r782, r883; +} +mov.b32 r880, {rs60, rs60}; +{ +mul.f16x2 r872, r818, r880; +} +{ +sub.f16x2 r875, r869, r872; +} +{ +mul.f16x2 r878, r782, r880; +} +{ +fma.rn.f16x2 r881, r818, r883, r878; +} +{ +cvt.rn.f16.f64 rs69, fd153; +} +mov.b32 r956, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs70, fd154; +} +{ +neg.f16 rs71, rs70; +} +mov.b32 r965, {rs71, 
rs71}; +{ +add.f16x2 r885, r656, r740; +} +{ +add.f16x2 r888, r572, r885; +} +{ +add.f16x2 r891, r662, r746; +} +{ +add.f16x2 r894, r578, r891; +} +{ +add.f16x2 r897, r656, r740; +} +{ +mul.f16x2 r900, r897, r956; +} +{ +add.f16x2 r903, r572, r900; +} +{ +sub.f16x2 r906, r662, r746; +} +{ +mul.f16x2 r909, r906, r965; +} +{ +add.f16x2 r912, r903, r909; +} +{ +add.f16x2 r915, r656, r740; +} +{ +mul.f16x2 r918, r915, r956; +} +{ +add.f16x2 r921, r572, r918; +} +{ +sub.f16x2 r924, r662, r746; +} +{ +mul.f16x2 r927, r924, r965; +} +{ +sub.f16x2 r930, r921, r927; +} +{ +add.f16x2 r933, r662, r746; +} +{ +mul.f16x2 r936, r933, r956; +} +{ +add.f16x2 r939, r578, r936; +} +{ +sub.f16x2 r942, r656, r740; +} +{ +mul.f16x2 r945, r942, r965; +} +{ +sub.f16x2 r948, r939, r945; +} +{ +add.f16x2 r951, r662, r746; +} +{ +mul.f16x2 r954, r951, r956; +} +{ +add.f16x2 r957, r578, r954; +} +{ +sub.f16x2 r960, r656, r740; +} +{ +mul.f16x2 r963, r960, r965; +} +{ +add.f16x2 r966, r957, r963; +} +{ +cvt.rn.f16.f64 rs73, fd153; +} +mov.b32 r1040, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd154; +} +{ +neg.f16 rs75, rs74; +} +mov.b32 r1049, {rs75, rs75}; +{ +add.f16x2 r969, r827, r843; +} +{ +add.f16x2 r972, r596, r969; +} +{ +add.f16x2 r975, r833, r849; +} +{ +add.f16x2 r978, r632, r975; +} +{ +add.f16x2 r981, r827, r843; +} +{ +mul.f16x2 r984, r981, r1040; +} +{ +add.f16x2 r987, r596, r984; +} +{ +sub.f16x2 r990, r833, r849; +} +{ +mul.f16x2 r993, r990, r1049; +} +{ +add.f16x2 r996, r987, r993; +} +{ +add.f16x2 r999, r827, r843; +} +{ +mul.f16x2 r1002, r999, r1040; +} +{ +add.f16x2 r1005, r596, r1002; +} +{ +sub.f16x2 r1008, r833, r849; +} +{ +mul.f16x2 r1011, r1008, r1049; +} +{ +sub.f16x2 r1014, r1005, r1011; +} +{ +add.f16x2 r1017, r833, r849; +} +{ +mul.f16x2 r1020, r1017, r1040; +} +{ +add.f16x2 r1023, r632, r1020; +} +{ +sub.f16x2 r1026, r827, r843; +} +{ +mul.f16x2 r1029, r1026, r1049; +} +{ +sub.f16x2 r1032, r1023, r1029; +} +{ +add.f16x2 r1035, r833, r849; +} +{ +mul.f16x2 r1038, 
r1035, r1040; +} +{ +add.f16x2 r1041, r632, r1038; +} +{ +sub.f16x2 r1044, r827, r843; +} +{ +mul.f16x2 r1047, r1044, r1049; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +cvt.rn.f16.f64 rs77, fd153; +} +mov.b32 r1124, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs78, fd154; +} +{ +neg.f16 rs79, rs78; +} +mov.b32 r1133, {rs79, rs79}; +{ +add.f16x2 r1053, r859, r875; +} +{ +add.f16x2 r1056, r614, r1053; +} +{ +add.f16x2 r1059, r865, r881; +} +{ +add.f16x2 r1062, r650, r1059; +} +{ +add.f16x2 r1065, r859, r875; +} +{ +mul.f16x2 r1068, r1065, r1124; +} +{ +add.f16x2 r1071, r614, r1068; +} +{ +sub.f16x2 r1074, r865, r881; +} +{ +mul.f16x2 r1077, r1074, r1133; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r859, r875; +} +{ +mul.f16x2 r1086, r1083, r1124; +} +{ +add.f16x2 r1089, r614, r1086; +} +{ +sub.f16x2 r1092, r865, r881; +} +{ +mul.f16x2 r1095, r1092, r1133; +} +{ +sub.f16x2 r1098, r1089, r1095; +} +{ +add.f16x2 r1101, r865, r881; +} +{ +mul.f16x2 r1104, r1101, r1124; +} +{ +add.f16x2 r1107, r650, r1104; +} +{ +sub.f16x2 r1110, r859, r875; +} +{ +mul.f16x2 r1113, r1110, r1133; +} +{ +sub.f16x2 r1116, r1107, r1113; +} +{ +add.f16x2 r1119, r865, r881; +} +{ +mul.f16x2 r1122, r1119, r1124; +} +{ +add.f16x2 r1125, r650, r1122; +} +{ +sub.f16x2 r1128, r859, r875; +} +{ +mul.f16x2 r1131, r1128, r1133; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +cvt.rn.f16.f64 rs81, fd153; +} +mov.b32 r1208, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs82, fd154; +} +{ +neg.f16 rs83, rs82; +} +mov.b32 r1217, {rs83, rs83}; +{ +add.f16x2 r1137, %91, %95; +} +{ +add.f16x2 r1140, %92, r1137; +} +{ +add.f16x2 r1143, %93, %90; +} +{ +add.f16x2 r1146, %94, r1143; +} +{ +add.f16x2 r1149, %91, %95; +} +{ +mul.f16x2 r1152, r1149, r1208; +} +{ +add.f16x2 r1155, %92, r1152; +} +{ +sub.f16x2 r1158, %93, %90; +} +{ +mul.f16x2 r1161, r1158, r1217; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +add.f16x2 r1167, %91, %95; +} +{ +mul.f16x2 r1170, r1167, r1208; +} +{ +add.f16x2 r1173, %92, r1170; +} +{ +sub.f16x2 
r1176, %93, %90; +} +{ +mul.f16x2 r1179, r1176, r1217; +} +{ +sub.f16x2 r1182, r1173, r1179; +} +{ +add.f16x2 r1185, %93, %90; +} +{ +mul.f16x2 r1188, r1185, r1208; +} +{ +add.f16x2 r1191, %94, r1188; +} +{ +sub.f16x2 r1194, %91, %95; +} +{ +mul.f16x2 r1197, r1194, r1217; +} +{ +sub.f16x2 r1200, r1191, r1197; +} +{ +add.f16x2 r1203, %93, %90; +} +{ +mul.f16x2 r1206, r1203, r1208; +} +{ +add.f16x2 r1209, %94, r1206; +} +{ +sub.f16x2 r1212, %91, %95; +} +{ +mul.f16x2 r1215, r1212, r1217; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +cvt.rn.f16.f64 rs85, fd153; +} +mov.b32 r1292, {rs85, rs85}; +{ +cvt.rn.f16.f64 rs86, fd154; +} +{ +neg.f16 rs87, rs86; +} +mov.b32 r1301, {rs87, rs87}; +{ +add.f16x2 r1221, %97, %101; +} +{ +add.f16x2 r1224, %98, r1221; +} +{ +add.f16x2 r1227, %99, %96; +} +{ +add.f16x2 r1230, %100, r1227; +} +{ +add.f16x2 r1233, %97, %101; +} +{ +mul.f16x2 r1236, r1233, r1292; +} +{ +add.f16x2 r1239, %98, r1236; +} +{ +sub.f16x2 r1242, %99, %96; +} +{ +mul.f16x2 r1245, r1242, r1301; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %97, %101; +} +{ +mul.f16x2 r1254, r1251, r1292; +} +{ +add.f16x2 r1257, %98, r1254; +} +{ +sub.f16x2 r1260, %99, %96; +} +{ +mul.f16x2 r1263, r1260, r1301; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %99, %96; +} +{ +mul.f16x2 r1272, r1269, r1292; +} +{ +add.f16x2 r1275, %100, r1272; +} +{ +sub.f16x2 r1278, %97, %101; +} +{ +mul.f16x2 r1281, r1278, r1301; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %99, %96; +} +{ +mul.f16x2 r1290, r1287, r1292; +} +{ +add.f16x2 r1293, %100, r1290; +} +{ +sub.f16x2 r1296, %97, %101; +} +{ +mul.f16x2 r1299, r1296, r1301; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +cvt.rn.f16.f64 rs89, fd153; +} +mov.b32 r1376, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd154; +} +{ +neg.f16 rs91, rs90; +} +mov.b32 r1385, {rs91, rs91}; +{ +add.f16x2 r1305, %103, %107; +} +{ +add.f16x2 r1308, %104, r1305; +} +{ +add.f16x2 r1311, %105, %102; +} +{ +add.f16x2 r1314, 
%106, r1311; +} +{ +add.f16x2 r1317, %103, %107; +} +{ +mul.f16x2 r1320, r1317, r1376; +} +{ +add.f16x2 r1323, %104, r1320; +} +{ +sub.f16x2 r1326, %105, %102; +} +{ +mul.f16x2 r1329, r1326, r1385; +} +{ +add.f16x2 r1332, r1323, r1329; +} +{ +add.f16x2 r1335, %103, %107; +} +{ +mul.f16x2 r1338, r1335, r1376; +} +{ +add.f16x2 r1341, %104, r1338; +} +{ +sub.f16x2 r1344, %105, %102; +} +{ +mul.f16x2 r1347, r1344, r1385; +} +{ +sub.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, %105, %102; +} +{ +mul.f16x2 r1356, r1353, r1376; +} +{ +add.f16x2 r1359, %106, r1356; +} +{ +sub.f16x2 r1362, %103, %107; +} +{ +mul.f16x2 r1365, r1362, r1385; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, %105, %102; +} +{ +mul.f16x2 r1374, r1371, r1376; +} +{ +add.f16x2 r1377, %106, r1374; +} +{ +sub.f16x2 r1380, %103, %107; +} +{ +mul.f16x2 r1383, r1380, r1385; +} +{ +add.f16x2 r1386, r1377, r1383; +} +{ +cvt.rn.f16.f64 rs93, fd89; +} +{ +cvt.rn.f16.f64 rs94, fd90; +} +{ +cvt.rn.f16.f64 rs95, fd95; +} +{ +cvt.rn.f16.f64 rs96, fd96; +} +{ +cvt.rn.f16.f64 rs99, fd107; +} +{ +cvt.rn.f16.f64 rs100, fd108; +} +mov.b32 r1403, {rs93, rs93}; +{ +mul.f16x2 r1389, r1248, r1403; +} +mov.b32 r1400, {rs94, rs94}; +{ +mul.f16x2 r1392, r1284, r1400; +} +{ +sub.f16x2 r1395, r1389, r1392; +} +{ +mul.f16x2 r1398, r1248, r1400; +} +{ +fma.rn.f16x2 r1401, r1284, r1403, r1398; +} +mov.b32 r1435, {rs95, rs95}; +{ +mul.f16x2 r1405, r1332, r1435; +} +mov.b32 r1432, {rs96, rs96}; +{ +mul.f16x2 r1408, r1368, r1432; +} +{ +sub.f16x2 r1411, r1405, r1408; +} +{ +mul.f16x2 r1414, r1332, r1432; +} +{ +fma.rn.f16x2 r1417, r1368, r1435, r1414; +} +{ +mul.f16x2 r1421, r1266, r1435; +} +{ +mul.f16x2 r1424, r1302, r1432; +} +{ +sub.f16x2 r1427, r1421, r1424; +} +{ +mul.f16x2 r1430, r1266, r1432; +} +{ +fma.rn.f16x2 r1433, r1302, r1435, r1430; +} +mov.b32 r1451, {rs99, rs99}; +{ +mul.f16x2 r1437, r1350, r1451; +} +mov.b32 r1448, {rs100, rs100}; +{ +mul.f16x2 r1440, r1386, r1448; +} +{ +sub.f16x2 r1443, 
r1437, r1440; +} +{ +mul.f16x2 r1446, r1350, r1448; +} +{ +fma.rn.f16x2 r1449, r1386, r1451, r1446; +} +{ +cvt.rn.f16.f64 rs109, fd153; +} +mov.b32 r1524, {rs109, rs109}; +{ +cvt.rn.f16.f64 rs110, fd154; +} +{ +neg.f16 rs111, rs110; +} +mov.b32 r1533, {rs111, rs111}; +{ +add.f16x2 r1453, r1224, r1308; +} +{ +add.f16x2 r1456, r1140, r1453; +} +{ +add.f16x2 r1459, r1230, r1314; +} +{ +add.f16x2 r1462, r1146, r1459; +} +{ +add.f16x2 r1465, r1224, r1308; +} +{ +mul.f16x2 r1468, r1465, r1524; +} +{ +add.f16x2 r1471, r1140, r1468; +} +{ +sub.f16x2 r1474, r1230, r1314; +} +{ +mul.f16x2 r1477, r1474, r1533; +} +{ +add.f16x2 r1480, r1471, r1477; +} +{ +add.f16x2 r1483, r1224, r1308; +} +{ +mul.f16x2 r1486, r1483, r1524; +} +{ +add.f16x2 r1489, r1140, r1486; +} +{ +sub.f16x2 r1492, r1230, r1314; +} +{ +mul.f16x2 r1495, r1492, r1533; +} +{ +sub.f16x2 r1498, r1489, r1495; +} +{ +add.f16x2 r1501, r1230, r1314; +} +{ +mul.f16x2 r1504, r1501, r1524; +} +{ +add.f16x2 r1507, r1146, r1504; +} +{ +sub.f16x2 r1510, r1224, r1308; +} +{ +mul.f16x2 r1513, r1510, r1533; +} +{ +sub.f16x2 r1516, r1507, r1513; +} +{ +add.f16x2 r1519, r1230, r1314; +} +{ +mul.f16x2 r1522, r1519, r1524; +} +{ +add.f16x2 r1525, r1146, r1522; +} +{ +sub.f16x2 r1528, r1224, r1308; +} +{ +mul.f16x2 r1531, r1528, r1533; +} +{ +add.f16x2 r1534, r1525, r1531; +} +{ +cvt.rn.f16.f64 rs113, fd153; +} +mov.b32 r1608, {rs113, rs113}; +{ +cvt.rn.f16.f64 rs114, fd154; +} +{ +neg.f16 rs115, rs114; +} +mov.b32 r1617, {rs115, rs115}; +{ +add.f16x2 r1537, r1395, r1411; +} +{ +add.f16x2 r1540, r1164, r1537; +} +{ +add.f16x2 r1543, r1401, r1417; +} +{ +add.f16x2 r1546, r1200, r1543; +} +{ +add.f16x2 r1549, r1395, r1411; +} +{ +mul.f16x2 r1552, r1549, r1608; +} +{ +add.f16x2 r1555, r1164, r1552; +} +{ +sub.f16x2 r1558, r1401, r1417; +} +{ +mul.f16x2 r1561, r1558, r1617; +} +{ +add.f16x2 r1564, r1555, r1561; +} +{ +add.f16x2 r1567, r1395, r1411; +} +{ +mul.f16x2 r1570, r1567, r1608; +} +{ +add.f16x2 r1573, r1164, r1570; +} +{ 
+sub.f16x2 r1576, r1401, r1417; +} +{ +mul.f16x2 r1579, r1576, r1617; +} +{ +sub.f16x2 r1582, r1573, r1579; +} +{ +add.f16x2 r1585, r1401, r1417; +} +{ +mul.f16x2 r1588, r1585, r1608; +} +{ +add.f16x2 r1591, r1200, r1588; +} +{ +sub.f16x2 r1594, r1395, r1411; +} +{ +mul.f16x2 r1597, r1594, r1617; +} +{ +sub.f16x2 r1600, r1591, r1597; +} +{ +add.f16x2 r1603, r1401, r1417; +} +{ +mul.f16x2 r1606, r1603, r1608; +} +{ +add.f16x2 r1609, r1200, r1606; +} +{ +sub.f16x2 r1612, r1395, r1411; +} +{ +mul.f16x2 r1615, r1612, r1617; +} +{ +add.f16x2 r1618, r1609, r1615; +} +{ +cvt.rn.f16.f64 rs117, fd153; +} +mov.b32 r1692, {rs117, rs117}; +{ +cvt.rn.f16.f64 rs118, fd154; +} +{ +neg.f16 rs119, rs118; +} +mov.b32 r1701, {rs119, rs119}; +{ +add.f16x2 r1621, r1427, r1443; +} +{ +add.f16x2 r1624, r1182, r1621; +} +{ +add.f16x2 r1627, r1433, r1449; +} +{ +add.f16x2 r1630, r1218, r1627; +} +{ +add.f16x2 r1633, r1427, r1443; +} +{ +mul.f16x2 r1636, r1633, r1692; +} +{ +add.f16x2 r1639, r1182, r1636; +} +{ +sub.f16x2 r1642, r1433, r1449; +} +{ +mul.f16x2 r1645, r1642, r1701; +} +{ +add.f16x2 r1648, r1639, r1645; +} +{ +add.f16x2 r1651, r1427, r1443; +} +{ +mul.f16x2 r1654, r1651, r1692; +} +{ +add.f16x2 r1657, r1182, r1654; +} +{ +sub.f16x2 r1660, r1433, r1449; +} +{ +mul.f16x2 r1663, r1660, r1701; +} +{ +sub.f16x2 r1666, r1657, r1663; +} +{ +add.f16x2 r1669, r1433, r1449; +} +{ +mul.f16x2 r1672, r1669, r1692; +} +{ +add.f16x2 r1675, r1218, r1672; +} +{ +sub.f16x2 r1678, r1427, r1443; +} +{ +mul.f16x2 r1681, r1678, r1701; +} +{ +sub.f16x2 r1684, r1675, r1681; +} +{ +add.f16x2 r1687, r1433, r1449; +} +{ +mul.f16x2 r1690, r1687, r1692; +} +{ +add.f16x2 r1693, r1218, r1690; +} +{ +sub.f16x2 r1696, r1427, r1443; +} +{ +mul.f16x2 r1699, r1696, r1701; +} +{ +add.f16x2 r1702, r1693, r1699; +} +mov.f64 fd85, 0d3FEF232EFF15C9E6; +{ +cvt.rn.f16.f64 rs121, fd85; +} +mov.f64 fd86, 0dBFCD84D223638000; +{ +cvt.rn.f16.f64 rs122, fd86; +} +mov.f64 fd87, 0d3FEC98A37A9A7850; +{ +cvt.rn.f16.f64 rs123, 
fd87; +} +mov.f64 fd88, 0dBFDCB920325BAFA6; +{ +cvt.rn.f16.f64 rs124, fd88; +} +{ +cvt.rn.f16.f64 rs125, fd89; +} +{ +cvt.rn.f16.f64 rs126, fd90; +} +mov.f64 fd91, 0d3FE31BEC55BC71BC; +{ +cvt.rn.f16.f64 rs127, fd91; +} +mov.f64 fd92, 0dBFE9AAFE4207DF5F; +{ +cvt.rn.f16.f64 rs128, fd92; +} +mov.f64 fd93, 0d3FD9595EF26FB670; +{ +cvt.rn.f16.f64 rs129, fd93; +} +mov.f64 fd94, 0dBFED6206BEB6C24B; +{ +cvt.rn.f16.f64 rs130, fd94; +} +{ +cvt.rn.f16.f64 rs131, fd95; +} +{ +cvt.rn.f16.f64 rs132, fd96; +} +mov.f64 fd97, 0dBFADC528B5343A86; +{ +cvt.rn.f16.f64 rs133, fd97; +} +mov.f64 fd98, 0dBFEFF223F3635CE3; +{ +cvt.rn.f16.f64 rs134, fd98; +} +mov.f64 fd99, 0dBFD25AFBF23865BF; +{ +cvt.rn.f16.f64 rs135, fd99; +} +mov.f64 fd100, 0dBFEEA7D99F29CADE; +{ +cvt.rn.f16.f64 rs136, fd100; +} +mov.f64 fd103, 0dBFE5F5B105F99707; +{ +cvt.rn.f16.f64 rs139, fd103; +} +mov.f64 fd104, 0dBFE746A51650EADE; +{ +cvt.rn.f16.f64 rs140, fd104; +} +mov.f64 fd115, 0dBFEABC50EF4734A7; +{ +cvt.rn.f16.f64 rs143, fd107; +} +{ +cvt.rn.f16.f64 rs144, fd108; +} +mov.f64 fd111, 0dBFEFC89BCEF44CF4; +{ +cvt.rn.f16.f64 rs147, fd111; +} +mov.f64 fd112, 0d3FBDB843E577175E; +{ +cvt.rn.f16.f64 rs148, fd112; +} +{ +cvt.rn.f16.f64 rs151, fd115; +} +mov.f64 fd116, 0d3FE19593DA358510; +{ +cvt.rn.f16.f64 rs152, fd116; +} +mov.b32 r1719, {rs121, rs121}; +{ +mul.f16x2 r1705, r972, r1719; +} +mov.b32 r1716, {rs122, rs122}; +{ +mul.f16x2 r1708, r978, r1716; +} +{ +sub.f16x2 r1711, r1705, r1708; +} +{ +mul.f16x2 r1714, r972, r1716; +} +{ +fma.rn.f16x2 r1717, r978, r1719, r1714; +} +mov.b32 r1751, {rs123, rs123}; +{ +mul.f16x2 r1721, r1540, r1751; +} +mov.b32 r1748, {rs124, rs124}; +{ +mul.f16x2 r1724, r1546, r1748; +} +{ +sub.f16x2 r1727, r1721, r1724; +} +{ +mul.f16x2 r1730, r1540, r1748; +} +{ +fma.rn.f16x2 r1733, r1546, r1751, r1730; +} +{ +mul.f16x2 r1737, r1056, r1751; +} +{ +mul.f16x2 r1740, r1062, r1748; +} +{ +sub.f16x2 r1743, r1737, r1740; +} +{ +mul.f16x2 r1746, r1056, r1748; +} +{ +fma.rn.f16x2 r1749, r1062, r1751, 
r1746; +} +mov.b32 r1815, {rs127, rs127}; +{ +mul.f16x2 r1753, r1624, r1815; +} +mov.b32 r1812, {rs128, rs128}; +{ +mul.f16x2 r1756, r1630, r1812; +} +{ +sub.f16x2 r1759, r1753, r1756; +} +{ +mul.f16x2 r1762, r1624, r1812; +} +{ +fma.rn.f16x2 r1765, r1630, r1815, r1762; +} +mov.b32 r1783, {rs125, rs125}; +{ +mul.f16x2 r1769, r912, r1783; +} +mov.b32 r1780, {rs126, rs126}; +{ +mul.f16x2 r1772, r948, r1780; +} +{ +sub.f16x2 r1775, r1769, r1772; +} +{ +mul.f16x2 r1778, r912, r1780; +} +{ +fma.rn.f16x2 r1781, r948, r1783, r1778; +} +mov.b32 r1879, {rs131, rs131}; +{ +mul.f16x2 r1785, r1480, r1879; +} +mov.b32 r1876, {rs132, rs132}; +{ +mul.f16x2 r1788, r1516, r1876; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1480, r1876; +} +{ +fma.rn.f16x2 r1797, r1516, r1879, r1794; +} +{ +mul.f16x2 r1801, r996, r1815; +} +{ +mul.f16x2 r1804, r1032, r1812; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r996, r1812; +} +{ +fma.rn.f16x2 r1813, r1032, r1815, r1810; +} +mov.b32 r1943, {rs135, rs135}; +{ +mul.f16x2 r1817, r1564, r1943; +} +mov.b32 r1940, {rs136, rs136}; +{ +mul.f16x2 r1820, r1600, r1940; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1564, r1940; +} +{ +fma.rn.f16x2 r1829, r1600, r1943, r1826; +} +mov.b32 r1847, {rs129, rs129}; +{ +mul.f16x2 r1833, r1080, r1847; +} +mov.b32 r1844, {rs130, rs130}; +{ +mul.f16x2 r1836, r1116, r1844; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1080, r1844; +} +{ +fma.rn.f16x2 r1845, r1116, r1847, r1842; +} +mov.b32 r1863, {rs139, rs139}; +{ +mul.f16x2 r1849, r1648, r1863; +} +mov.b32 r1860, {rs140, rs140}; +{ +mul.f16x2 r1852, r1684, r1860; +} +{ +sub.f16x2 r1855, r1849, r1852; +} +{ +mul.f16x2 r1858, r1648, r1860; +} +{ +fma.rn.f16x2 r1861, r1684, r1863, r1858; +} +{ +mul.f16x2 r1865, r930, r1879; +} +{ +mul.f16x2 r1868, r966, r1876; +} +{ +sub.f16x2 r1871, r1865, r1868; +} +{ +mul.f16x2 r1874, r930, r1876; +} +{ +fma.rn.f16x2 r1877, r966, r1879, r1874; +} +mov.b32 r1895, 
{rs143, rs143}; +{ +mul.f16x2 r1881, r1498, r1895; +} +mov.b32 r1892, {rs144, rs144}; +{ +mul.f16x2 r1884, r1534, r1892; +} +{ +sub.f16x2 r1887, r1881, r1884; +} +{ +mul.f16x2 r1890, r1498, r1892; +} +{ +fma.rn.f16x2 r1893, r1534, r1895, r1890; +} +mov.b32 r1911, {rs133, rs133}; +{ +mul.f16x2 r1897, r1014, r1911; +} +mov.b32 r1908, {rs134, rs134}; +{ +mul.f16x2 r1900, r1050, r1908; +} +{ +sub.f16x2 r1903, r1897, r1900; +} +{ +mul.f16x2 r1906, r1014, r1908; +} +{ +fma.rn.f16x2 r1909, r1050, r1911, r1906; +} +mov.b32 r1927, {rs147, rs147}; +{ +mul.f16x2 r1913, r1582, r1927; +} +mov.b32 r1924, {rs148, rs148}; +{ +mul.f16x2 r1916, r1618, r1924; +} +{ +sub.f16x2 r1919, r1913, r1916; +} +{ +mul.f16x2 r1922, r1582, r1924; +} +{ +fma.rn.f16x2 r1925, r1618, r1927, r1922; +} +{ +mul.f16x2 r1929, r1098, r1943; +} +{ +mul.f16x2 r1932, r1134, r1940; +} +{ +sub.f16x2 r1935, r1929, r1932; +} +{ +mul.f16x2 r1938, r1098, r1940; +} +{ +fma.rn.f16x2 r1941, r1134, r1943, r1938; +} +mov.b32 r1959, {rs151, rs151}; +{ +mul.f16x2 r1945, r1666, r1959; +} +mov.b32 r1956, {rs152, rs152}; +{ +mul.f16x2 r1948, r1702, r1956; +} +{ +sub.f16x2 r1951, r1945, r1948; +} +{ +mul.f16x2 r1954, r1666, r1956; +} +{ +fma.rn.f16x2 r1957, r1702, r1959, r1954; +} +{ +cvt.rn.f16.f64 rs173, fd153; +} +mov.b32 r2032, {rs173, rs173}; +{ +cvt.rn.f16.f64 rs174, fd154; +} +{ +neg.f16 rs175, rs174; +} +mov.b32 r2041, {rs175, rs175}; +{ +add.f16x2 r1961, r888, r1456; +} +{ +add.f16x2 %0, r320, r1961; +} +{ +add.f16x2 r1967, r894, r1462; +} +{ +add.f16x2 %1, r326, r1967; +} +{ +add.f16x2 r1973, r888, r1456; +} +{ +mul.f16x2 r1976, r1973, r2032; +} +{ +add.f16x2 r1979, r320, r1976; +} +{ +sub.f16x2 r1982, r894, r1462; +} +{ +mul.f16x2 r1985, r1982, r2041; +} +{ +add.f16x2 %18, r1979, r1985; +} +{ +add.f16x2 r1991, r888, r1456; +} +{ +mul.f16x2 r1994, r1991, r2032; +} +{ +add.f16x2 r1997, r320, r1994; +} +{ +sub.f16x2 r2000, r894, r1462; +} +{ +mul.f16x2 r2003, r2000, r2041; +} +{ +sub.f16x2 %36, r1997, r2003; +} +{ 
+add.f16x2 r2009, r894, r1462; +} +{ +mul.f16x2 r2012, r2009, r2032; +} +{ +add.f16x2 r2015, r326, r2012; +} +{ +sub.f16x2 r2018, r888, r1456; +} +{ +mul.f16x2 r2021, r2018, r2041; +} +{ +sub.f16x2 %19, r2015, r2021; +} +{ +add.f16x2 r2027, r894, r1462; +} +{ +mul.f16x2 r2030, r2027, r2032; +} +{ +add.f16x2 r2033, r326, r2030; +} +{ +sub.f16x2 r2036, r888, r1456; +} +{ +mul.f16x2 r2039, r2036, r2041; +} +{ +add.f16x2 %37, r2033, r2039; +} +{ +cvt.rn.f16.f64 rs177, fd153; +} +mov.b32 r2116, {rs177, rs177}; +{ +cvt.rn.f16.f64 rs178, fd154; +} +{ +neg.f16 rs179, rs178; +} +mov.b32 r2125, {rs179, rs179}; +{ +add.f16x2 r2045, r1711, r1727; +} +{ +add.f16x2 %2, r404, r2045; +} +{ +add.f16x2 r2051, r1717, r1733; +} +{ +add.f16x2 %3, r410, r2051; +} +{ +add.f16x2 r2057, r1711, r1727; +} +{ +mul.f16x2 r2060, r2057, r2116; +} +{ +add.f16x2 r2063, r404, r2060; +} +{ +sub.f16x2 r2066, r1717, r1733; +} +{ +mul.f16x2 r2069, r2066, r2125; +} +{ +add.f16x2 %20, r2063, r2069; +} +{ +add.f16x2 r2075, r1711, r1727; +} +{ +mul.f16x2 r2078, r2075, r2116; +} +{ +add.f16x2 r2081, r404, r2078; +} +{ +sub.f16x2 r2084, r1717, r1733; +} +{ +mul.f16x2 r2087, r2084, r2125; +} +{ +sub.f16x2 %38, r2081, r2087; +} +{ +add.f16x2 r2093, r1717, r1733; +} +{ +mul.f16x2 r2096, r2093, r2116; +} +{ +add.f16x2 r2099, r410, r2096; +} +{ +sub.f16x2 r2102, r1711, r1727; +} +{ +mul.f16x2 r2105, r2102, r2125; +} +{ +sub.f16x2 %21, r2099, r2105; +} +{ +add.f16x2 r2111, r1717, r1733; +} +{ +mul.f16x2 r2114, r2111, r2116; +} +{ +add.f16x2 r2117, r410, r2114; +} +{ +sub.f16x2 r2120, r1711, r1727; +} +{ +mul.f16x2 r2123, r2120, r2125; +} +{ +add.f16x2 %39, r2117, r2123; +} +{ +cvt.rn.f16.f64 rs181, fd153; +} +mov.b32 r2200, {rs181, rs181}; +{ +cvt.rn.f16.f64 rs182, fd154; +} +{ +neg.f16 rs183, rs182; +} +mov.b32 r2209, {rs183, rs183}; +{ +add.f16x2 r2129, r1743, r1759; +} +{ +add.f16x2 %4, r488, r2129; +} +{ +add.f16x2 r2135, r1749, r1765; +} +{ +add.f16x2 %5, r494, r2135; +} +{ +add.f16x2 r2141, r1743, r1759; +} 
+{ +mul.f16x2 r2144, r2141, r2200; +} +{ +add.f16x2 r2147, r488, r2144; +} +{ +sub.f16x2 r2150, r1749, r1765; +} +{ +mul.f16x2 r2153, r2150, r2209; +} +{ +add.f16x2 %22, r2147, r2153; +} +{ +add.f16x2 r2159, r1743, r1759; +} +{ +mul.f16x2 r2162, r2159, r2200; +} +{ +add.f16x2 r2165, r488, r2162; +} +{ +sub.f16x2 r2168, r1749, r1765; +} +{ +mul.f16x2 r2171, r2168, r2209; +} +{ +sub.f16x2 %40, r2165, r2171; +} +{ +add.f16x2 r2177, r1749, r1765; +} +{ +mul.f16x2 r2180, r2177, r2200; +} +{ +add.f16x2 r2183, r494, r2180; +} +{ +sub.f16x2 r2186, r1743, r1759; +} +{ +mul.f16x2 r2189, r2186, r2209; +} +{ +sub.f16x2 %23, r2183, r2189; +} +{ +add.f16x2 r2195, r1749, r1765; +} +{ +mul.f16x2 r2198, r2195, r2200; +} +{ +add.f16x2 r2201, r494, r2198; +} +{ +sub.f16x2 r2204, r1743, r1759; +} +{ +mul.f16x2 r2207, r2204, r2209; +} +{ +add.f16x2 %41, r2201, r2207; +} +{ +cvt.rn.f16.f64 rs185, fd153; +} +mov.b32 r2284, {rs185, rs185}; +{ +cvt.rn.f16.f64 rs186, fd154; +} +{ +neg.f16 rs187, rs186; +} +mov.b32 r2293, {rs187, rs187}; +{ +add.f16x2 r2213, r1775, r1791; +} +{ +add.f16x2 %6, r344, r2213; +} +{ +add.f16x2 r2219, r1781, r1797; +} +{ +add.f16x2 %7, r380, r2219; +} +{ +add.f16x2 r2225, r1775, r1791; +} +{ +mul.f16x2 r2228, r2225, r2284; +} +{ +add.f16x2 r2231, r344, r2228; +} +{ +sub.f16x2 r2234, r1781, r1797; +} +{ +mul.f16x2 r2237, r2234, r2293; +} +{ +add.f16x2 %24, r2231, r2237; +} +{ +add.f16x2 r2243, r1775, r1791; +} +{ +mul.f16x2 r2246, r2243, r2284; +} +{ +add.f16x2 r2249, r344, r2246; +} +{ +sub.f16x2 r2252, r1781, r1797; +} +{ +mul.f16x2 r2255, r2252, r2293; +} +{ +sub.f16x2 %42, r2249, r2255; +} +{ +add.f16x2 r2261, r1781, r1797; +} +{ +mul.f16x2 r2264, r2261, r2284; +} +{ +add.f16x2 r2267, r380, r2264; +} +{ +sub.f16x2 r2270, r1775, r1791; +} +{ +mul.f16x2 r2273, r2270, r2293; +} +{ +sub.f16x2 %25, r2267, r2273; +} +{ +add.f16x2 r2279, r1781, r1797; +} +{ +mul.f16x2 r2282, r2279, r2284; +} +{ +add.f16x2 r2285, r380, r2282; +} +{ +sub.f16x2 r2288, r1775, r1791; +} +{ 
+mul.f16x2 r2291, r2288, r2293; +} +{ +add.f16x2 %43, r2285, r2291; +} +{ +cvt.rn.f16.f64 rs189, fd153; +} +mov.b32 r2368, {rs189, rs189}; +{ +cvt.rn.f16.f64 rs190, fd154; +} +{ +neg.f16 rs191, rs190; +} +mov.b32 r2377, {rs191, rs191}; +{ +add.f16x2 r2297, r1807, r1823; +} +{ +add.f16x2 %8, r428, r2297; +} +{ +add.f16x2 r2303, r1813, r1829; +} +{ +add.f16x2 %9, r464, r2303; +} +{ +add.f16x2 r2309, r1807, r1823; +} +{ +mul.f16x2 r2312, r2309, r2368; +} +{ +add.f16x2 r2315, r428, r2312; +} +{ +sub.f16x2 r2318, r1813, r1829; +} +{ +mul.f16x2 r2321, r2318, r2377; +} +{ +add.f16x2 %26, r2315, r2321; +} +{ +add.f16x2 r2327, r1807, r1823; +} +{ +mul.f16x2 r2330, r2327, r2368; +} +{ +add.f16x2 r2333, r428, r2330; +} +{ +sub.f16x2 r2336, r1813, r1829; +} +{ +mul.f16x2 r2339, r2336, r2377; +} +{ +sub.f16x2 %44, r2333, r2339; +} +{ +add.f16x2 r2345, r1813, r1829; +} +{ +mul.f16x2 r2348, r2345, r2368; +} +{ +add.f16x2 r2351, r464, r2348; +} +{ +sub.f16x2 r2354, r1807, r1823; +} +{ +mul.f16x2 r2357, r2354, r2377; +} +{ +sub.f16x2 %27, r2351, r2357; +} +{ +add.f16x2 r2363, r1813, r1829; +} +{ +mul.f16x2 r2366, r2363, r2368; +} +{ +add.f16x2 r2369, r464, r2366; +} +{ +sub.f16x2 r2372, r1807, r1823; +} +{ +mul.f16x2 r2375, r2372, r2377; +} +{ +add.f16x2 %45, r2369, r2375; +} +{ +cvt.rn.f16.f64 rs193, fd153; +} +mov.b32 r2452, {rs193, rs193}; +{ +cvt.rn.f16.f64 rs194, fd154; +} +{ +neg.f16 rs195, rs194; +} +mov.b32 r2461, {rs195, rs195}; +{ +add.f16x2 r2381, r1839, r1855; +} +{ +add.f16x2 %10, r512, r2381; +} +{ +add.f16x2 r2387, r1845, r1861; +} +{ +add.f16x2 %11, r548, r2387; +} +{ +add.f16x2 r2393, r1839, r1855; +} +{ +mul.f16x2 r2396, r2393, r2452; +} +{ +add.f16x2 r2399, r512, r2396; +} +{ +sub.f16x2 r2402, r1845, r1861; +} +{ +mul.f16x2 r2405, r2402, r2461; +} +{ +add.f16x2 %28, r2399, r2405; +} +{ +add.f16x2 r2411, r1839, r1855; +} +{ +mul.f16x2 r2414, r2411, r2452; +} +{ +add.f16x2 r2417, r512, r2414; +} +{ +sub.f16x2 r2420, r1845, r1861; +} +{ +mul.f16x2 r2423, r2420, 
r2461; +} +{ +sub.f16x2 %46, r2417, r2423; +} +{ +add.f16x2 r2429, r1845, r1861; +} +{ +mul.f16x2 r2432, r2429, r2452; +} +{ +add.f16x2 r2435, r548, r2432; +} +{ +sub.f16x2 r2438, r1839, r1855; +} +{ +mul.f16x2 r2441, r2438, r2461; +} +{ +sub.f16x2 %29, r2435, r2441; +} +{ +add.f16x2 r2447, r1845, r1861; +} +{ +mul.f16x2 r2450, r2447, r2452; +} +{ +add.f16x2 r2453, r548, r2450; +} +{ +sub.f16x2 r2456, r1839, r1855; +} +{ +mul.f16x2 r2459, r2456, r2461; +} +{ +add.f16x2 %47, r2453, r2459; +} +{ +cvt.rn.f16.f64 rs197, fd153; +} +mov.b32 r2536, {rs197, rs197}; +{ +cvt.rn.f16.f64 rs198, fd154; +} +{ +neg.f16 rs199, rs198; +} +mov.b32 r2545, {rs199, rs199}; +{ +add.f16x2 r2465, r1871, r1887; +} +{ +add.f16x2 %12, r362, r2465; +} +{ +add.f16x2 r2471, r1877, r1893; +} +{ +add.f16x2 %13, r398, r2471; +} +{ +add.f16x2 r2477, r1871, r1887; +} +{ +mul.f16x2 r2480, r2477, r2536; +} +{ +add.f16x2 r2483, r362, r2480; +} +{ +sub.f16x2 r2486, r1877, r1893; +} +{ +mul.f16x2 r2489, r2486, r2545; +} +{ +add.f16x2 %30, r2483, r2489; +} +{ +add.f16x2 r2495, r1871, r1887; +} +{ +mul.f16x2 r2498, r2495, r2536; +} +{ +add.f16x2 r2501, r362, r2498; +} +{ +sub.f16x2 r2504, r1877, r1893; +} +{ +mul.f16x2 r2507, r2504, r2545; +} +{ +sub.f16x2 %48, r2501, r2507; +} +{ +add.f16x2 r2513, r1877, r1893; +} +{ +mul.f16x2 r2516, r2513, r2536; +} +{ +add.f16x2 r2519, r398, r2516; +} +{ +sub.f16x2 r2522, r1871, r1887; +} +{ +mul.f16x2 r2525, r2522, r2545; +} +{ +sub.f16x2 %31, r2519, r2525; +} +{ +add.f16x2 r2531, r1877, r1893; +} +{ +mul.f16x2 r2534, r2531, r2536; +} +{ +add.f16x2 r2537, r398, r2534; +} +{ +sub.f16x2 r2540, r1871, r1887; +} +{ +mul.f16x2 r2543, r2540, r2545; +} +{ +add.f16x2 %49, r2537, r2543; +} +{ +cvt.rn.f16.f64 rs201, fd153; +} +mov.b32 r2620, {rs201, rs201}; +{ +cvt.rn.f16.f64 rs202, fd154; +} +{ +neg.f16 rs203, rs202; +} +mov.b32 r2629, {rs203, rs203}; +{ +add.f16x2 r2549, r1903, r1919; +} +{ +add.f16x2 %14, r446, r2549; +} +{ +add.f16x2 r2555, r1909, r1925; +} +{ +add.f16x2 
%15, r482, r2555; +} +{ +add.f16x2 r2561, r1903, r1919; +} +{ +mul.f16x2 r2564, r2561, r2620; +} +{ +add.f16x2 r2567, r446, r2564; +} +{ +sub.f16x2 r2570, r1909, r1925; +} +{ +mul.f16x2 r2573, r2570, r2629; +} +{ +add.f16x2 %32, r2567, r2573; +} +{ +add.f16x2 r2579, r1903, r1919; +} +{ +mul.f16x2 r2582, r2579, r2620; +} +{ +add.f16x2 r2585, r446, r2582; +} +{ +sub.f16x2 r2588, r1909, r1925; +} +{ +mul.f16x2 r2591, r2588, r2629; +} +{ +sub.f16x2 %50, r2585, r2591; +} +{ +add.f16x2 r2597, r1909, r1925; +} +{ +mul.f16x2 r2600, r2597, r2620; +} +{ +add.f16x2 r2603, r482, r2600; +} +{ +sub.f16x2 r2606, r1903, r1919; +} +{ +mul.f16x2 r2609, r2606, r2629; +} +{ +sub.f16x2 %33, r2603, r2609; +} +{ +add.f16x2 r2615, r1909, r1925; +} +{ +mul.f16x2 r2618, r2615, r2620; +} +{ +add.f16x2 r2621, r482, r2618; +} +{ +sub.f16x2 r2624, r1903, r1919; +} +{ +mul.f16x2 r2627, r2624, r2629; +} +{ +add.f16x2 %51, r2621, r2627; +} +{ +cvt.rn.f16.f64 rs205, fd153; +} +mov.b32 r2704, {rs205, rs205}; +{ +cvt.rn.f16.f64 rs206, fd154; +} +{ +neg.f16 rs207, rs206; +} +mov.b32 r2713, {rs207, rs207}; +{ +add.f16x2 r2633, r1935, r1951; +} +{ +add.f16x2 %16, r530, r2633; +} +{ +add.f16x2 r2639, r1941, r1957; +} +{ +add.f16x2 %17, r566, r2639; +} +{ +add.f16x2 r2645, r1935, r1951; +} +{ +mul.f16x2 r2648, r2645, r2704; +} +{ +add.f16x2 r2651, r530, r2648; +} +{ +sub.f16x2 r2654, r1941, r1957; +} +{ +mul.f16x2 r2657, r2654, r2713; +} +{ +add.f16x2 %34, r2651, r2657; +} +{ +add.f16x2 r2663, r1935, r1951; +} +{ +mul.f16x2 r2666, r2663, r2704; +} +{ +add.f16x2 r2669, r530, r2666; +} +{ +sub.f16x2 r2672, r1941, r1957; +} +{ +mul.f16x2 r2675, r2672, r2713; +} +{ +sub.f16x2 %52, r2669, r2675; +} +{ +add.f16x2 r2681, r1941, r1957; +} +{ +mul.f16x2 r2684, r2681, r2704; +} +{ +add.f16x2 r2687, r566, r2684; +} +{ +sub.f16x2 r2690, r1935, r1951; +} +{ +mul.f16x2 r2693, r2690, r2713; +} +{ +sub.f16x2 %35, r2687, r2693; +} +{ +add.f16x2 r2699, r1941, r1957; +} +{ +mul.f16x2 r2702, r2699, r2704; +} +{ +add.f16x2 
r2705, r566, r2702; +} +{ +sub.f16x2 r2708, r1935, r1951; +} +{ +mul.f16x2 r2711, r2708, r2713; +} +{ +add.f16x2 %53, r2705, r2711; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[26].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<869, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<92>; +.reg .b32 r<1183>; +.reg .b64 rd<4>; +mov.u32 r1172, %tid.y; +mov.u32 r1173, %18; +mad.lo.s32 r1174, r1172, 216, r1173; +mov.u32 r1175, %tid.x; +mov.f32 f86, 0fBF000000; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1, {low, high}; +} +mov.f32 f88, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 
r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f14, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r265, {low, high}; +} +mov.f32 f16, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r266, {low, high}; +} +mov.f32 f18, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f18; +mov.b32 r267, {low, high}; +} +mov.f32 f20, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f20; +cvt.rn.f16.f32 high, f20; +mov.b32 r268, {low, high}; +} +mov.f32 f26, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r271, {low, high}; +} +mov.f32 f28, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, 
r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ 
+sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r1175, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1176, rd3; +mul.lo.s32 r1177, r1176, 3; +sub.s32 r1178, r1175, r1177; +cvt.rn.f32.u32 f89, r1178; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f57, f90; +sin.approx.f32 f91, f90; +neg.f32 f58, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, 
r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ 
+mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r1179, r1176, 216, r1174; +barrier.sync 0; +mad.lo.s32 r1180, r1178, 72, r1179; +st.shared.v2.f32 [r1180], {r352, r358}; +st.shared.v2.f32 [r1180+8], {r621, r628}; +st.shared.v2.f32 [r1180+16], {r658, r665}; +st.shared.v2.f32 [r1180+24], {r695, r702}; +st.shared.v2.f32 [r1180+32], {r732, r739}; +st.shared.v2.f32 [r1180+40], {r769, r776}; +st.shared.v2.f32 [r1180+48], {r806, r813}; +st.shared.v2.f32 [r1180+56], {r843, r850}; +st.shared.v2.f32 [r1180+64], {r880, r887}; +barrier.sync 0; +shl.b32 r1181, r1178, 6; +sub.s32 r1182, r1180, r1181; +ld.shared.u32 r916, [r1182]; +ld.shared.u32 r922, [r1182+4]; +ld.shared.u32 r1004, [r1182+24]; 
+ld.shared.u32 r1010, [r1182+28]; +ld.shared.u32 r1092, [r1182+48]; +ld.shared.u32 r1098, [r1182+52]; +ld.shared.u32 r913, [r1182+72]; +ld.shared.u32 r919, [r1182+76]; +ld.shared.u32 r1001, [r1182+96]; +ld.shared.u32 r1007, [r1182+100]; +ld.shared.u32 r1089, [r1182+120]; +ld.shared.u32 r1095, [r1182+124]; +ld.shared.u32 r914, [r1182+144]; +ld.shared.u32 r920, [r1182+148]; +ld.shared.u32 r1002, [r1182+168]; +ld.shared.u32 r1008, [r1182+172]; +ld.shared.u32 r1090, [r1182+192]; +ld.shared.u32 r1096, [r1182+196]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 %0, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 %1, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 %6, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 %12, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 %7, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 %13, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, 
r1001, r1002; +} +{ +add.f16x2 %2, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 %3, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 %8, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 %14, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 %9, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 %15, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 %4, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 %5, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 %10, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 %16, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 
r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 %11, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 %17, r1160, r1166; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<868, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<92>; +.reg .b32 r<1183>; +.reg .b64 rd<4>; +mov.u32 r1172, %tid.y; +mov.u32 r1173, %18; +mad.lo.s32 r1174, r1172, 108, r1173; +mov.u32 r1175, %tid.x; +mov.f32 f86, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1, {low, high}; +} +mov.f32 f88, 0fBF5DB3D7; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, 
%28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f14, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r265, {low, high}; +} +mov.f32 f16, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r266, {low, high}; +} +mov.f32 f18, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f18; +mov.b32 r267, {low, high}; +} +mov.f32 f20, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f20; +cvt.rn.f16.f32 high, f20; +mov.b32 r268, {low, high}; +} +mov.f32 f26, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r271, {low, high}; +} +mov.f32 f28, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, 
r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ 
+mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r1175, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1176, rd3; +mul.lo.s32 r1177, r1176, 3; +sub.s32 r1178, r1175, r1177; +mad.lo.s32 r1179, r1176, 108, r1174; +cvt.rn.f32.u32 f89, r1178; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f57, f90; +sin.approx.f32 f91, f90; +neg.f32 f58, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, 
low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r1180, r1178, 36, r1179; +st.shared.u32 [r1180], r352; +st.shared.u32 [r1180+4], r621; +st.shared.u32 [r1180+8], r658; +st.shared.u32 [r1180+12], r695; +st.shared.u32 [r1180+16], r732; +st.shared.u32 [r1180+20], r769; +st.shared.u32 [r1180+24], r806; +st.shared.u32 [r1180+28], r843; +st.shared.u32 [r1180+32], r880; +barrier.sync 0; +shl.b32 r1181, r1178, 5; +sub.s32 r1182, r1180, r1181; +ld.shared.u32 r916, [r1182]; +ld.shared.u32 r1004, [r1182+12]; +ld.shared.u32 r1092, [r1182+24]; +ld.shared.u32 r913, [r1182+36]; +ld.shared.u32 r1001, [r1182+48]; +ld.shared.u32 r1089, [r1182+60]; +ld.shared.u32 r914, [r1182+72]; +ld.shared.u32 r1002, [r1182+84]; +ld.shared.u32 r1090, [r1182+96]; +barrier.sync 0; +st.shared.u32 [r1180], 
r358; +st.shared.u32 [r1180+4], r628; +st.shared.u32 [r1180+8], r665; +st.shared.u32 [r1180+12], r702; +st.shared.u32 [r1180+16], r739; +st.shared.u32 [r1180+20], r776; +st.shared.u32 [r1180+24], r813; +st.shared.u32 [r1180+28], r850; +st.shared.u32 [r1180+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r1182]; +ld.shared.u32 r1010, [r1182+12]; +ld.shared.u32 r1098, [r1182+24]; +ld.shared.u32 r919, [r1182+36]; +ld.shared.u32 r1007, [r1182+48]; +ld.shared.u32 r1095, [r1182+60]; +ld.shared.u32 r920, [r1182+72]; +ld.shared.u32 r1008, [r1182+84]; +ld.shared.u32 r1096, [r1182+96]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 %0, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 %1, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 %6, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 %12, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 %7, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 %13, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 
r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 %2, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 %3, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 %8, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 %14, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 %9, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 %15, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 %4, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 %5, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 %10, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 %16, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, 
r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 %11, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 %17, r1160, r1166; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +/* cufftdx_private_function<870, __half2, 1>: explicit specialization whose whole body is one inline-PTX asm volatile block. Operand map: %0-%5 are the outputs rmem[0..2].x/.y, %6 is the smem argument (used as the base address for st.shared/ld.shared), %7-%12 are the same six rmem fields read as inputs. Each thread combines its three packed-half complex values with f16x2 add/sub/mul/fma, multiplies them by factors built from cos.approx/sin.approx of an angle derived from tid.x, and exchanges values through shared memory at offsets computed from tid.y*216, (tid.x/9)*216 and tid.x%9; this compute-and-exchange step happens twice before the final results are written back to rmem[0..2]. The asm executes barrier.sync 0 four times - presumably every thread of the block must therefore reach this call uniformly; TODO confirm with the caller. Literals: 0fBF000000 is -0.5 and 0fBF5DB3D7 is about -0.8660254; the angle steps 0f3E6E4BAE and 0f3F32B8C2 are about 0.23271 and 0.69813. NOTE(review): this looks like a 27-point FFT done as three radix-3 stages (the steps match 2*pi/27 and 2*pi/9) in generated cuFFTDx code - confirm against the library before relying on it, and regenerate rather than hand-edit the PTX. */ template<> __forceinline__ __device__ void cufftdx_private_function<870, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<35>; +.reg .b32 r<436>; +.reg .b64 rd<6>; +mov.u32 r419, %tid.y; +mov.u32 r420, %6; +mad.lo.s32 r421, r419, 216, r420; +mov.u32 r422, %tid.x; +mov.f32 f26, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r1,
{low, high}; +} +mov.f32 f28, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r422, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r423, rd3; +mul.lo.s32 r424, r423, 9; +sub.s32 r425, r422, r424; +mad.lo.s32 r426, r423, 216, r421; +cvt.rn.f32.u32 f29, r425; +mul.f32 f30, f29, 0f3E6E4BAE; +cos.approx.f32 f5, f30; +sin.approx.f32 f31, f30; +neg.f32 f6, f31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22,
0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r427, r425, 24, r426; +st.shared.v2.f32 [r427], {r8, r14}; +st.shared.v2.f32 [r427+8], {r101, r108}; +st.shared.v2.f32 [r427+16], {r138, r145}; +barrier.sync 0; +shl.b32 r428, r425, 4; +sub.s32 r429, r427, r428; +ld.shared.u32 r174, [r429]; +ld.shared.u32 r180, [r429+4]; +ld.shared.u32 r171, [r429+72]; +ld.shared.u32 r177, [r429+76]; +ld.shared.u32 r172, [r429+144]; +ld.shared.u32 r178, [r429+148]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224,
r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r425, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r430, rd5; +mul.lo.s32 r431, r430, 3; +sub.s32 r432, r425, r431; +shl.b32 r433, r432, 3; +add.s32 r434, r426, r433; +cvt.rn.f32.u32 f32, r430; +mul.f32 f33, f32, 0f3F32B8C2; +cos.approx.f32 f17, f33; +sin.approx.f32 f34, f33; +neg.f32 f18, f34; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r435, r430, 72, r434;
+st.shared.u32 [r435], r173; +st.shared.u32 [r435+4], r179; +st.shared.u32 [r435+24], r266; +st.shared.u32 [r435+28], r273; +st.shared.u32 [r435+48], r303; +st.shared.u32 [r435+52], r310; +barrier.sync 0; +ld.shared.u32 r339, [r429]; +ld.shared.u32 r345, [r429+4]; +ld.shared.u32 r336, [r429+72]; +ld.shared.u32 r342, [r429+76]; +ld.shared.u32 r337, [r429+144]; +ld.shared.u32 r343, [r429+148]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 %0, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 %1, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 %2, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 %4, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 %3, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 %5, r407, r413; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)),
"r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<871, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<35>; +.reg .b32 r<436>; +.reg .b64 rd<6>; +mov.u32 r419, %tid.y; +mov.u32 r420, %6; +mad.lo.s32 r421, r419, 108, r420; +mov.u32 r422, %tid.x; +mov.f32 f26, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r1, {low, high}; +} +mov.f32 f28, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r422, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r423, rd3; +mul.lo.s32 r424, r423, 9; +sub.s32 r425, r422, r424; +mad.lo.s32 r426, r423, 108, r421; +cvt.rn.f32.u32 f29, r425; +mul.f32 f30, f29, 0f3E6E4BAE; +cos.approx.f32 f5, f30; +sin.approx.f32 f31, f30; +neg.f32 f6, f31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92,
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r427, r425, 12, r426; +st.shared.u32 [r427], r8; +st.shared.u32 [r427+4], r101; +st.shared.u32 [r427+8], r138; +barrier.sync 0; +shl.b32 r428, r425, 3; +sub.s32 r429, r427, r428; +ld.shared.u32 r174, [r429]; +ld.shared.u32 r171, [r429+36]; +ld.shared.u32 r172, [r429+72]; +barrier.sync 0; +st.shared.u32 [r427], r14; +st.shared.u32 [r427+4], r108; +st.shared.u32 [r427+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r429]; +ld.shared.u32 r177, [r429+36]; +ld.shared.u32 r178, [r429+72]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 
r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r425, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r430, rd5; +mul.lo.s32 r431, r430, 3; +sub.s32 r432, r425, r431; +shl.b32 r433, r432, 2; +add.s32 r434, r426, r433; +cvt.rn.f32.u32 f32, r430; +mul.f32 f33, f32, 0f3F32B8C2; +cos.approx.f32 f17, f33; +sin.approx.f32 f34, f33; +neg.f32 f18, f34; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ 
+mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r435, r430, 36, r434; +st.shared.u32 [r435], r173; +st.shared.u32 [r435+12], r266; +st.shared.u32 [r435+24], r303; +barrier.sync 0; +ld.shared.u32 r339, [r429]; +ld.shared.u32 r336, [r429+36]; +ld.shared.u32 r337, [r429+72]; +barrier.sync 0; +st.shared.u32 [r435], r179; +st.shared.u32 [r435+12], r273; +st.shared.u32 [r435+24], r310; +barrier.sync 0; +ld.shared.u32 r345, [r429]; +ld.shared.u32 r342, [r429+36]; +ld.shared.u32 r343, [r429+72]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 %0, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 %1, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 %2, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 %4, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 %3, r389, r395; +} +{ 
+add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 %5, r407, r413; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..61e1deeec7761 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp16_inv.hpp.inc @@ -0,0 +1,7092 @@ +#ifndef CUFFTDX_FFT_27_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_27_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1069, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<155>; +.reg .b32 r<2771>; +.reg .f64 fd<155>; +.reg .b64 rd<3>; +mov.f64 fd153, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd153; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd154, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd154; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %56, %54; +} +{ +add.f16x2 r4, %57, r1; +} +{ +add.f16x2 r7, %58, %55; +} +{ +add.f16x2 r10, %59, r7; +} +{ +add.f16x2 r13, %56, %54; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %57, r16; +} +{ +sub.f16x2 r22, %58, %55; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %56, %54; +} +{ +mul.f16x2 r34, r31, 
r72; +} +{ +add.f16x2 r37, %57, r34; +} +{ +sub.f16x2 r40, %58, %55; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %58, %55; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %59, r52; +} +{ +sub.f16x2 r58, %56, %54; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %58, %55; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %59, r70; +} +{ +sub.f16x2 r76, %56, %54; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd153; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd154; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %61, %65; +} +{ +add.f16x2 r88, %62, r85; +} +{ +add.f16x2 r91, %63, %60; +} +{ +add.f16x2 r94, %64, r91; +} +{ +add.f16x2 r97, %61, %65; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %62, r100; +} +{ +sub.f16x2 r106, %63, %60; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %61, %65; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %62, r118; +} +{ +sub.f16x2 r124, %63, %60; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %63, %60; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %64, r136; +} +{ +sub.f16x2 r142, %61, %65; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %63, %60; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %64, r154; +} +{ +sub.f16x2 r160, %61, %65; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs5, fd153; +} +mov.b32 r240, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd154; +} +mov.b32 r249, {rs6, rs6}; +{ +add.f16x2 r169, %69, %67; +} +{ +add.f16x2 r172, %70, r169; +} +{ +add.f16x2 r175, %71, %68; +} +{ +add.f16x2 r178, %66, r175; +} +{ +add.f16x2 r181, %69, %67; +} +{ +mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %70, r184; +} +{ +sub.f16x2 r190, %71, %68; +} +{ +mul.f16x2 r193, 
r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %69, %67; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %70, r202; +} +{ +sub.f16x2 r208, %71, %68; +} +{ +mul.f16x2 r211, r208, r249; +} +{ +sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %71, %68; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %66, r220; +} +{ +sub.f16x2 r226, %69, %67; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %71, %68; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %66, r238; +} +{ +sub.f16x2 r244, %69, %67; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd89, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs7, fd89; +} +mov.f64 fd90, 0d3FE491B7523C161D; +{ +cvt.rn.f16.f64 rs8, fd90; +} +mov.f64 fd95, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs9, fd95; +} +mov.f64 fd96, 0d3FEF838B8C811C17; +{ +cvt.rn.f16.f64 rs10, fd96; +} +mov.f64 fd107, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs13, fd107; +} +mov.f64 fd108, 0d3FD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs14, fd108; +} +mov.b32 r267, {rs7, rs7}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs8, rs8}; +{ +mul.f16x2 r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs9, rs9}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs10, rs10}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ +mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs13, rs13}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs14, rs14}; +{ +mul.f16x2 r304, r250, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, r315, r310; +} +{ 
+cvt.rn.f16.f64 rs23, fd153; +} +mov.b32 r388, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs24, fd154; +} +mov.b32 r397, {rs24, rs24}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 r320, r4, r317; +} +{ +add.f16x2 r323, r94, r178; +} +{ +add.f16x2 r326, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 r344, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 r398, r389, r395; +} +{ +cvt.rn.f16.f64 rs25, fd153; +} +mov.b32 r472, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd154; +} +mov.b32 r481, {rs26, rs26}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 r404, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 r410, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 r428, r419, r425; +} +{ +add.f16x2 r431, r259, r275; +} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ +add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 r461, r458, r481; +} +{ +sub.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, 
r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 r482, r473, r479; +} +{ +cvt.rn.f16.f64 rs27, fd153; +} +mov.b32 r556, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs28, fd154; +} +mov.b32 r565, {rs28, rs28}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 r488, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 r494, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 r512, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 r530, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 r548, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 r566, r557, r563; +} +{ +cvt.rn.f16.f64 rs29, fd153; +} +mov.b32 r640, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd154; +} +mov.b32 r649, {rs30, rs30}; +{ +add.f16x2 r569, %73, %77; +} +{ +add.f16x2 r572, %74, r569; +} +{ +add.f16x2 r575, %75, %72; +} +{ +add.f16x2 r578, %76, r575; +} +{ +add.f16x2 r581, %73, %77; +} +{ +mul.f16x2 r584, r581, r640; +} +{ +add.f16x2 r587, %74, r584; +} +{ +sub.f16x2 r590, %75, %72; +} +{ +mul.f16x2 r593, r590, r649; +} +{ +add.f16x2 r596, r587, r593; +} +{ +add.f16x2 r599, %73, %77; +} +{ +mul.f16x2 r602, r599, r640; +} +{ +add.f16x2 r605, %74, r602; +} +{ +sub.f16x2 r608, %75, %72; +} +{ +mul.f16x2 r611, r608, r649; +} +{ +sub.f16x2 r614, r605, r611; +} +{ +add.f16x2 r617, %75, %72; +} +{ 
+mul.f16x2 r620, r617, r640; +} +{ +add.f16x2 r623, %76, r620; +} +{ +sub.f16x2 r626, %73, %77; +} +{ +mul.f16x2 r629, r626, r649; +} +{ +sub.f16x2 r632, r623, r629; +} +{ +add.f16x2 r635, %75, %72; +} +{ +mul.f16x2 r638, r635, r640; +} +{ +add.f16x2 r641, %76, r638; +} +{ +sub.f16x2 r644, %73, %77; +} +{ +mul.f16x2 r647, r644, r649; +} +{ +add.f16x2 r650, r641, r647; +} +{ +cvt.rn.f16.f64 rs31, fd153; +} +mov.b32 r724, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd154; +} +mov.b32 r733, {rs32, rs32}; +{ +add.f16x2 r653, %82, %80; +} +{ +add.f16x2 r656, %83, r653; +} +{ +add.f16x2 r659, %78, %81; +} +{ +add.f16x2 r662, %79, r659; +} +{ +add.f16x2 r665, %82, %80; +} +{ +mul.f16x2 r668, r665, r724; +} +{ +add.f16x2 r671, %83, r668; +} +{ +sub.f16x2 r674, %78, %81; +} +{ +mul.f16x2 r677, r674, r733; +} +{ +add.f16x2 r680, r671, r677; +} +{ +add.f16x2 r683, %82, %80; +} +{ +mul.f16x2 r686, r683, r724; +} +{ +add.f16x2 r689, %83, r686; +} +{ +sub.f16x2 r692, %78, %81; +} +{ +mul.f16x2 r695, r692, r733; +} +{ +sub.f16x2 r698, r689, r695; +} +{ +add.f16x2 r701, %78, %81; +} +{ +mul.f16x2 r704, r701, r724; +} +{ +add.f16x2 r707, %79, r704; +} +{ +sub.f16x2 r710, %82, %80; +} +{ +mul.f16x2 r713, r710, r733; +} +{ +sub.f16x2 r716, r707, r713; +} +{ +add.f16x2 r719, %78, %81; +} +{ +mul.f16x2 r722, r719, r724; +} +{ +add.f16x2 r725, %79, r722; +} +{ +sub.f16x2 r728, %82, %80; +} +{ +mul.f16x2 r731, r728, r733; +} +{ +add.f16x2 r734, r725, r731; +} +{ +cvt.rn.f16.f64 rs33, fd153; +} +mov.b32 r808, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd154; +} +mov.b32 r817, {rs34, rs34}; +{ +add.f16x2 r737, %86, %84; +} +{ +add.f16x2 r740, %87, r737; +} +{ +add.f16x2 r743, %88, %85; +} +{ +add.f16x2 r746, %89, r743; +} +{ +add.f16x2 r749, %86, %84; +} +{ +mul.f16x2 r752, r749, r808; +} +{ +add.f16x2 r755, %87, r752; +} +{ +sub.f16x2 r758, %88, %85; +} +{ +mul.f16x2 r761, r758, r817; +} +{ +add.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %84; +} +{ +mul.f16x2 r770, r767, r808; +} +{ 
+add.f16x2 r773, %87, r770; +} +{ +sub.f16x2 r776, %88, %85; +} +{ +mul.f16x2 r779, r776, r817; +} +{ +sub.f16x2 r782, r773, r779; +} +{ +add.f16x2 r785, %88, %85; +} +{ +mul.f16x2 r788, r785, r808; +} +{ +add.f16x2 r791, %89, r788; +} +{ +sub.f16x2 r794, %86, %84; +} +{ +mul.f16x2 r797, r794, r817; +} +{ +sub.f16x2 r800, r791, r797; +} +{ +add.f16x2 r803, %88, %85; +} +{ +mul.f16x2 r806, r803, r808; +} +{ +add.f16x2 r809, %89, r806; +} +{ +sub.f16x2 r812, %86, %84; +} +{ +mul.f16x2 r815, r812, r817; +} +{ +add.f16x2 r818, r809, r815; +} +{ +cvt.rn.f16.f64 rs35, fd89; +} +{ +cvt.rn.f16.f64 rs36, fd90; +} +{ +cvt.rn.f16.f64 rs37, fd95; +} +{ +cvt.rn.f16.f64 rs38, fd96; +} +{ +cvt.rn.f16.f64 rs41, fd107; +} +{ +cvt.rn.f16.f64 rs42, fd108; +} +mov.b32 r835, {rs35, rs35}; +{ +mul.f16x2 r821, r680, r835; +} +mov.b32 r832, {rs36, rs36}; +{ +mul.f16x2 r824, r716, r832; +} +{ +sub.f16x2 r827, r821, r824; +} +{ +mul.f16x2 r830, r680, r832; +} +{ +fma.rn.f16x2 r833, r716, r835, r830; +} +mov.b32 r867, {rs37, rs37}; +{ +mul.f16x2 r837, r764, r867; +} +mov.b32 r864, {rs38, rs38}; +{ +mul.f16x2 r840, r800, r864; +} +{ +sub.f16x2 r843, r837, r840; +} +{ +mul.f16x2 r846, r764, r864; +} +{ +fma.rn.f16x2 r849, r800, r867, r846; +} +{ +mul.f16x2 r853, r698, r867; +} +{ +mul.f16x2 r856, r734, r864; +} +{ +sub.f16x2 r859, r853, r856; +} +{ +mul.f16x2 r862, r698, r864; +} +{ +fma.rn.f16x2 r865, r734, r867, r862; +} +mov.b32 r883, {rs41, rs41}; +{ +mul.f16x2 r869, r782, r883; +} +mov.b32 r880, {rs42, rs42}; +{ +mul.f16x2 r872, r818, r880; +} +{ +sub.f16x2 r875, r869, r872; +} +{ +mul.f16x2 r878, r782, r880; +} +{ +fma.rn.f16x2 r881, r818, r883, r878; +} +{ +cvt.rn.f16.f64 rs51, fd153; +} +mov.b32 r956, {rs51, rs51}; +{ +cvt.rn.f16.f64 rs52, fd154; +} +mov.b32 r965, {rs52, rs52}; +{ +add.f16x2 r885, r656, r740; +} +{ +add.f16x2 r888, r572, r885; +} +{ +add.f16x2 r891, r662, r746; +} +{ +add.f16x2 r894, r578, r891; +} +{ +add.f16x2 r897, r656, r740; +} +{ +mul.f16x2 r900, r897, r956; +} 
+{ +add.f16x2 r903, r572, r900; +} +{ +sub.f16x2 r906, r662, r746; +} +{ +mul.f16x2 r909, r906, r965; +} +{ +add.f16x2 r912, r903, r909; +} +{ +add.f16x2 r915, r656, r740; +} +{ +mul.f16x2 r918, r915, r956; +} +{ +add.f16x2 r921, r572, r918; +} +{ +sub.f16x2 r924, r662, r746; +} +{ +mul.f16x2 r927, r924, r965; +} +{ +sub.f16x2 r930, r921, r927; +} +{ +add.f16x2 r933, r662, r746; +} +{ +mul.f16x2 r936, r933, r956; +} +{ +add.f16x2 r939, r578, r936; +} +{ +sub.f16x2 r942, r656, r740; +} +{ +mul.f16x2 r945, r942, r965; +} +{ +sub.f16x2 r948, r939, r945; +} +{ +add.f16x2 r951, r662, r746; +} +{ +mul.f16x2 r954, r951, r956; +} +{ +add.f16x2 r957, r578, r954; +} +{ +sub.f16x2 r960, r656, r740; +} +{ +mul.f16x2 r963, r960, r965; +} +{ +add.f16x2 r966, r957, r963; +} +{ +cvt.rn.f16.f64 rs53, fd153; +} +mov.b32 r1040, {rs53, rs53}; +{ +cvt.rn.f16.f64 rs54, fd154; +} +mov.b32 r1049, {rs54, rs54}; +{ +add.f16x2 r969, r827, r843; +} +{ +add.f16x2 r972, r596, r969; +} +{ +add.f16x2 r975, r833, r849; +} +{ +add.f16x2 r978, r632, r975; +} +{ +add.f16x2 r981, r827, r843; +} +{ +mul.f16x2 r984, r981, r1040; +} +{ +add.f16x2 r987, r596, r984; +} +{ +sub.f16x2 r990, r833, r849; +} +{ +mul.f16x2 r993, r990, r1049; +} +{ +add.f16x2 r996, r987, r993; +} +{ +add.f16x2 r999, r827, r843; +} +{ +mul.f16x2 r1002, r999, r1040; +} +{ +add.f16x2 r1005, r596, r1002; +} +{ +sub.f16x2 r1008, r833, r849; +} +{ +mul.f16x2 r1011, r1008, r1049; +} +{ +sub.f16x2 r1014, r1005, r1011; +} +{ +add.f16x2 r1017, r833, r849; +} +{ +mul.f16x2 r1020, r1017, r1040; +} +{ +add.f16x2 r1023, r632, r1020; +} +{ +sub.f16x2 r1026, r827, r843; +} +{ +mul.f16x2 r1029, r1026, r1049; +} +{ +sub.f16x2 r1032, r1023, r1029; +} +{ +add.f16x2 r1035, r833, r849; +} +{ +mul.f16x2 r1038, r1035, r1040; +} +{ +add.f16x2 r1041, r632, r1038; +} +{ +sub.f16x2 r1044, r827, r843; +} +{ +mul.f16x2 r1047, r1044, r1049; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +cvt.rn.f16.f64 rs55, fd153; +} +mov.b32 r1124, {rs55, rs55}; +{ 
+cvt.rn.f16.f64 rs56, fd154; +} +mov.b32 r1133, {rs56, rs56}; +{ +add.f16x2 r1053, r859, r875; +} +{ +add.f16x2 r1056, r614, r1053; +} +{ +add.f16x2 r1059, r865, r881; +} +{ +add.f16x2 r1062, r650, r1059; +} +{ +add.f16x2 r1065, r859, r875; +} +{ +mul.f16x2 r1068, r1065, r1124; +} +{ +add.f16x2 r1071, r614, r1068; +} +{ +sub.f16x2 r1074, r865, r881; +} +{ +mul.f16x2 r1077, r1074, r1133; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r859, r875; +} +{ +mul.f16x2 r1086, r1083, r1124; +} +{ +add.f16x2 r1089, r614, r1086; +} +{ +sub.f16x2 r1092, r865, r881; +} +{ +mul.f16x2 r1095, r1092, r1133; +} +{ +sub.f16x2 r1098, r1089, r1095; +} +{ +add.f16x2 r1101, r865, r881; +} +{ +mul.f16x2 r1104, r1101, r1124; +} +{ +add.f16x2 r1107, r650, r1104; +} +{ +sub.f16x2 r1110, r859, r875; +} +{ +mul.f16x2 r1113, r1110, r1133; +} +{ +sub.f16x2 r1116, r1107, r1113; +} +{ +add.f16x2 r1119, r865, r881; +} +{ +mul.f16x2 r1122, r1119, r1124; +} +{ +add.f16x2 r1125, r650, r1122; +} +{ +sub.f16x2 r1128, r859, r875; +} +{ +mul.f16x2 r1131, r1128, r1133; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +cvt.rn.f16.f64 rs57, fd153; +} +mov.b32 r1208, {rs57, rs57}; +{ +cvt.rn.f16.f64 rs58, fd154; +} +mov.b32 r1217, {rs58, rs58}; +{ +add.f16x2 r1137, %94, %92; +} +{ +add.f16x2 r1140, %95, r1137; +} +{ +add.f16x2 r1143, %90, %93; +} +{ +add.f16x2 r1146, %91, r1143; +} +{ +add.f16x2 r1149, %94, %92; +} +{ +mul.f16x2 r1152, r1149, r1208; +} +{ +add.f16x2 r1155, %95, r1152; +} +{ +sub.f16x2 r1158, %90, %93; +} +{ +mul.f16x2 r1161, r1158, r1217; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +add.f16x2 r1167, %94, %92; +} +{ +mul.f16x2 r1170, r1167, r1208; +} +{ +add.f16x2 r1173, %95, r1170; +} +{ +sub.f16x2 r1176, %90, %93; +} +{ +mul.f16x2 r1179, r1176, r1217; +} +{ +sub.f16x2 r1182, r1173, r1179; +} +{ +add.f16x2 r1185, %90, %93; +} +{ +mul.f16x2 r1188, r1185, r1208; +} +{ +add.f16x2 r1191, %91, r1188; +} +{ +sub.f16x2 r1194, %94, %92; +} +{ +mul.f16x2 r1197, r1194, r1217; +} +{ +sub.f16x2 
r1200, r1191, r1197; +} +{ +add.f16x2 r1203, %90, %93; +} +{ +mul.f16x2 r1206, r1203, r1208; +} +{ +add.f16x2 r1209, %91, r1206; +} +{ +sub.f16x2 r1212, %94, %92; +} +{ +mul.f16x2 r1215, r1212, r1217; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +cvt.rn.f16.f64 rs59, fd153; +} +mov.b32 r1292, {rs59, rs59}; +{ +cvt.rn.f16.f64 rs60, fd154; +} +mov.b32 r1301, {rs60, rs60}; +{ +add.f16x2 r1221, %98, %96; +} +{ +add.f16x2 r1224, %99, r1221; +} +{ +add.f16x2 r1227, %100, %97; +} +{ +add.f16x2 r1230, %101, r1227; +} +{ +add.f16x2 r1233, %98, %96; +} +{ +mul.f16x2 r1236, r1233, r1292; +} +{ +add.f16x2 r1239, %99, r1236; +} +{ +sub.f16x2 r1242, %100, %97; +} +{ +mul.f16x2 r1245, r1242, r1301; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %96; +} +{ +mul.f16x2 r1254, r1251, r1292; +} +{ +add.f16x2 r1257, %99, r1254; +} +{ +sub.f16x2 r1260, %100, %97; +} +{ +mul.f16x2 r1263, r1260, r1301; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %100, %97; +} +{ +mul.f16x2 r1272, r1269, r1292; +} +{ +add.f16x2 r1275, %101, r1272; +} +{ +sub.f16x2 r1278, %98, %96; +} +{ +mul.f16x2 r1281, r1278, r1301; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %100, %97; +} +{ +mul.f16x2 r1290, r1287, r1292; +} +{ +add.f16x2 r1293, %101, r1290; +} +{ +sub.f16x2 r1296, %98, %96; +} +{ +mul.f16x2 r1299, r1296, r1301; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +cvt.rn.f16.f64 rs61, fd153; +} +mov.b32 r1376, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd154; +} +mov.b32 r1385, {rs62, rs62}; +{ +add.f16x2 r1305, %107, %105; +} +{ +add.f16x2 r1308, %102, r1305; +} +{ +add.f16x2 r1311, %103, %106; +} +{ +add.f16x2 r1314, %104, r1311; +} +{ +add.f16x2 r1317, %107, %105; +} +{ +mul.f16x2 r1320, r1317, r1376; +} +{ +add.f16x2 r1323, %102, r1320; +} +{ +sub.f16x2 r1326, %103, %106; +} +{ +mul.f16x2 r1329, r1326, r1385; +} +{ +add.f16x2 r1332, r1323, r1329; +} +{ +add.f16x2 r1335, %107, %105; +} +{ +mul.f16x2 r1338, r1335, r1376; +} +{ +add.f16x2 r1341, %102, r1338; 
+} +{ +sub.f16x2 r1344, %103, %106; +} +{ +mul.f16x2 r1347, r1344, r1385; +} +{ +sub.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, %103, %106; +} +{ +mul.f16x2 r1356, r1353, r1376; +} +{ +add.f16x2 r1359, %104, r1356; +} +{ +sub.f16x2 r1362, %107, %105; +} +{ +mul.f16x2 r1365, r1362, r1385; +} +{ +sub.f16x2 r1368, r1359, r1365; +} +{ +add.f16x2 r1371, %103, %106; +} +{ +mul.f16x2 r1374, r1371, r1376; +} +{ +add.f16x2 r1377, %104, r1374; +} +{ +sub.f16x2 r1380, %107, %105; +} +{ +mul.f16x2 r1383, r1380, r1385; +} +{ +add.f16x2 r1386, r1377, r1383; +} +{ +cvt.rn.f16.f64 rs63, fd89; +} +{ +cvt.rn.f16.f64 rs64, fd90; +} +{ +cvt.rn.f16.f64 rs65, fd95; +} +{ +cvt.rn.f16.f64 rs66, fd96; +} +{ +cvt.rn.f16.f64 rs69, fd107; +} +{ +cvt.rn.f16.f64 rs70, fd108; +} +mov.b32 r1403, {rs63, rs63}; +{ +mul.f16x2 r1389, r1248, r1403; +} +mov.b32 r1400, {rs64, rs64}; +{ +mul.f16x2 r1392, r1284, r1400; +} +{ +sub.f16x2 r1395, r1389, r1392; +} +{ +mul.f16x2 r1398, r1248, r1400; +} +{ +fma.rn.f16x2 r1401, r1284, r1403, r1398; +} +mov.b32 r1435, {rs65, rs65}; +{ +mul.f16x2 r1405, r1332, r1435; +} +mov.b32 r1432, {rs66, rs66}; +{ +mul.f16x2 r1408, r1368, r1432; +} +{ +sub.f16x2 r1411, r1405, r1408; +} +{ +mul.f16x2 r1414, r1332, r1432; +} +{ +fma.rn.f16x2 r1417, r1368, r1435, r1414; +} +{ +mul.f16x2 r1421, r1266, r1435; +} +{ +mul.f16x2 r1424, r1302, r1432; +} +{ +sub.f16x2 r1427, r1421, r1424; +} +{ +mul.f16x2 r1430, r1266, r1432; +} +{ +fma.rn.f16x2 r1433, r1302, r1435, r1430; +} +mov.b32 r1451, {rs69, rs69}; +{ +mul.f16x2 r1437, r1350, r1451; +} +mov.b32 r1448, {rs70, rs70}; +{ +mul.f16x2 r1440, r1386, r1448; +} +{ +sub.f16x2 r1443, r1437, r1440; +} +{ +mul.f16x2 r1446, r1350, r1448; +} +{ +fma.rn.f16x2 r1449, r1386, r1451, r1446; +} +{ +cvt.rn.f16.f64 rs79, fd153; +} +mov.b32 r1524, {rs79, rs79}; +{ +cvt.rn.f16.f64 rs80, fd154; +} +mov.b32 r1533, {rs80, rs80}; +{ +add.f16x2 r1453, r1224, r1308; +} +{ +add.f16x2 r1456, r1140, r1453; +} +{ +add.f16x2 r1459, r1230, r1314; +} +{ 
+add.f16x2 r1462, r1146, r1459; +} +{ +add.f16x2 r1465, r1224, r1308; +} +{ +mul.f16x2 r1468, r1465, r1524; +} +{ +add.f16x2 r1471, r1140, r1468; +} +{ +sub.f16x2 r1474, r1230, r1314; +} +{ +mul.f16x2 r1477, r1474, r1533; +} +{ +add.f16x2 r1480, r1471, r1477; +} +{ +add.f16x2 r1483, r1224, r1308; +} +{ +mul.f16x2 r1486, r1483, r1524; +} +{ +add.f16x2 r1489, r1140, r1486; +} +{ +sub.f16x2 r1492, r1230, r1314; +} +{ +mul.f16x2 r1495, r1492, r1533; +} +{ +sub.f16x2 r1498, r1489, r1495; +} +{ +add.f16x2 r1501, r1230, r1314; +} +{ +mul.f16x2 r1504, r1501, r1524; +} +{ +add.f16x2 r1507, r1146, r1504; +} +{ +sub.f16x2 r1510, r1224, r1308; +} +{ +mul.f16x2 r1513, r1510, r1533; +} +{ +sub.f16x2 r1516, r1507, r1513; +} +{ +add.f16x2 r1519, r1230, r1314; +} +{ +mul.f16x2 r1522, r1519, r1524; +} +{ +add.f16x2 r1525, r1146, r1522; +} +{ +sub.f16x2 r1528, r1224, r1308; +} +{ +mul.f16x2 r1531, r1528, r1533; +} +{ +add.f16x2 r1534, r1525, r1531; +} +{ +cvt.rn.f16.f64 rs81, fd153; +} +mov.b32 r1608, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs82, fd154; +} +mov.b32 r1617, {rs82, rs82}; +{ +add.f16x2 r1537, r1395, r1411; +} +{ +add.f16x2 r1540, r1164, r1537; +} +{ +add.f16x2 r1543, r1401, r1417; +} +{ +add.f16x2 r1546, r1200, r1543; +} +{ +add.f16x2 r1549, r1395, r1411; +} +{ +mul.f16x2 r1552, r1549, r1608; +} +{ +add.f16x2 r1555, r1164, r1552; +} +{ +sub.f16x2 r1558, r1401, r1417; +} +{ +mul.f16x2 r1561, r1558, r1617; +} +{ +add.f16x2 r1564, r1555, r1561; +} +{ +add.f16x2 r1567, r1395, r1411; +} +{ +mul.f16x2 r1570, r1567, r1608; +} +{ +add.f16x2 r1573, r1164, r1570; +} +{ +sub.f16x2 r1576, r1401, r1417; +} +{ +mul.f16x2 r1579, r1576, r1617; +} +{ +sub.f16x2 r1582, r1573, r1579; +} +{ +add.f16x2 r1585, r1401, r1417; +} +{ +mul.f16x2 r1588, r1585, r1608; +} +{ +add.f16x2 r1591, r1200, r1588; +} +{ +sub.f16x2 r1594, r1395, r1411; +} +{ +mul.f16x2 r1597, r1594, r1617; +} +{ +sub.f16x2 r1600, r1591, r1597; +} +{ +add.f16x2 r1603, r1401, r1417; +} +{ +mul.f16x2 r1606, r1603, r1608; +} +{ 
+add.f16x2 r1609, r1200, r1606; +} +{ +sub.f16x2 r1612, r1395, r1411; +} +{ +mul.f16x2 r1615, r1612, r1617; +} +{ +add.f16x2 r1618, r1609, r1615; +} +{ +cvt.rn.f16.f64 rs83, fd153; +} +mov.b32 r1692, {rs83, rs83}; +{ +cvt.rn.f16.f64 rs84, fd154; +} +mov.b32 r1701, {rs84, rs84}; +{ +add.f16x2 r1621, r1427, r1443; +} +{ +add.f16x2 r1624, r1182, r1621; +} +{ +add.f16x2 r1627, r1433, r1449; +} +{ +add.f16x2 r1630, r1218, r1627; +} +{ +add.f16x2 r1633, r1427, r1443; +} +{ +mul.f16x2 r1636, r1633, r1692; +} +{ +add.f16x2 r1639, r1182, r1636; +} +{ +sub.f16x2 r1642, r1433, r1449; +} +{ +mul.f16x2 r1645, r1642, r1701; +} +{ +add.f16x2 r1648, r1639, r1645; +} +{ +add.f16x2 r1651, r1427, r1443; +} +{ +mul.f16x2 r1654, r1651, r1692; +} +{ +add.f16x2 r1657, r1182, r1654; +} +{ +sub.f16x2 r1660, r1433, r1449; +} +{ +mul.f16x2 r1663, r1660, r1701; +} +{ +sub.f16x2 r1666, r1657, r1663; +} +{ +add.f16x2 r1669, r1433, r1449; +} +{ +mul.f16x2 r1672, r1669, r1692; +} +{ +add.f16x2 r1675, r1218, r1672; +} +{ +sub.f16x2 r1678, r1427, r1443; +} +{ +mul.f16x2 r1681, r1678, r1701; +} +{ +sub.f16x2 r1684, r1675, r1681; +} +{ +add.f16x2 r1687, r1433, r1449; +} +{ +mul.f16x2 r1690, r1687, r1692; +} +{ +add.f16x2 r1693, r1218, r1690; +} +{ +sub.f16x2 r1696, r1427, r1443; +} +{ +mul.f16x2 r1699, r1696, r1701; +} +{ +add.f16x2 r1702, r1693, r1699; +} +mov.f64 fd85, 0d3FEF232EFF15C9E6; +{ +cvt.rn.f16.f64 rs85, fd85; +} +mov.f64 fd86, 0d3FCD84D223638000; +{ +cvt.rn.f16.f64 rs86, fd86; +} +mov.f64 fd87, 0d3FEC98A37A9A7850; +{ +cvt.rn.f16.f64 rs87, fd87; +} +mov.f64 fd88, 0d3FDCB920325BAFA6; +{ +cvt.rn.f16.f64 rs88, fd88; +} +{ +cvt.rn.f16.f64 rs89, fd89; +} +{ +cvt.rn.f16.f64 rs90, fd90; +} +mov.f64 fd91, 0d3FE31BEC55BC71BC; +{ +cvt.rn.f16.f64 rs91, fd91; +} +mov.f64 fd92, 0d3FE9AAFE4207DF5F; +{ +cvt.rn.f16.f64 rs92, fd92; +} +mov.f64 fd93, 0d3FD9595EF26FB670; +{ +cvt.rn.f16.f64 rs93, fd93; +} +mov.f64 fd94, 0d3FED6206BEB6C24B; +{ +cvt.rn.f16.f64 rs94, fd94; +} +{ +cvt.rn.f16.f64 rs95, fd95; +} +{ 
+cvt.rn.f16.f64 rs96, fd96; +} +mov.f64 fd97, 0dBFADC528B5343A86; +{ +cvt.rn.f16.f64 rs97, fd97; +} +mov.f64 fd98, 0d3FEFF223F3635CE3; +{ +cvt.rn.f16.f64 rs98, fd98; +} +mov.f64 fd99, 0dBFD25AFBF23865BF; +{ +cvt.rn.f16.f64 rs99, fd99; +} +mov.f64 fd100, 0d3FEEA7D99F29CADE; +{ +cvt.rn.f16.f64 rs100, fd100; +} +mov.f64 fd103, 0dBFE5F5B105F99707; +{ +cvt.rn.f16.f64 rs103, fd103; +} +mov.f64 fd104, 0d3FE746A51650EADE; +{ +cvt.rn.f16.f64 rs104, fd104; +} +mov.f64 fd115, 0dBFEABC50EF4734A7; +{ +cvt.rn.f16.f64 rs107, fd107; +} +{ +cvt.rn.f16.f64 rs108, fd108; +} +mov.f64 fd111, 0dBFEFC89BCEF44CF4; +{ +cvt.rn.f16.f64 rs111, fd111; +} +mov.f64 fd112, 0dBFBDB843E577175E; +{ +cvt.rn.f16.f64 rs112, fd112; +} +{ +cvt.rn.f16.f64 rs115, fd115; +} +mov.f64 fd116, 0dBFE19593DA358510; +{ +cvt.rn.f16.f64 rs116, fd116; +} +mov.b32 r1719, {rs85, rs85}; +{ +mul.f16x2 r1705, r972, r1719; +} +mov.b32 r1716, {rs86, rs86}; +{ +mul.f16x2 r1708, r978, r1716; +} +{ +sub.f16x2 r1711, r1705, r1708; +} +{ +mul.f16x2 r1714, r972, r1716; +} +{ +fma.rn.f16x2 r1717, r978, r1719, r1714; +} +mov.b32 r1751, {rs87, rs87}; +{ +mul.f16x2 r1721, r1540, r1751; +} +mov.b32 r1748, {rs88, rs88}; +{ +mul.f16x2 r1724, r1546, r1748; +} +{ +sub.f16x2 r1727, r1721, r1724; +} +{ +mul.f16x2 r1730, r1540, r1748; +} +{ +fma.rn.f16x2 r1733, r1546, r1751, r1730; +} +{ +mul.f16x2 r1737, r1056, r1751; +} +{ +mul.f16x2 r1740, r1062, r1748; +} +{ +sub.f16x2 r1743, r1737, r1740; +} +{ +mul.f16x2 r1746, r1056, r1748; +} +{ +fma.rn.f16x2 r1749, r1062, r1751, r1746; +} +mov.b32 r1815, {rs91, rs91}; +{ +mul.f16x2 r1753, r1624, r1815; +} +mov.b32 r1812, {rs92, rs92}; +{ +mul.f16x2 r1756, r1630, r1812; +} +{ +sub.f16x2 r1759, r1753, r1756; +} +{ +mul.f16x2 r1762, r1624, r1812; +} +{ +fma.rn.f16x2 r1765, r1630, r1815, r1762; +} +mov.b32 r1783, {rs89, rs89}; +{ +mul.f16x2 r1769, r912, r1783; +} +mov.b32 r1780, {rs90, rs90}; +{ +mul.f16x2 r1772, r948, r1780; +} +{ +sub.f16x2 r1775, r1769, r1772; +} +{ +mul.f16x2 r1778, r912, r1780; +} 
+{ +fma.rn.f16x2 r1781, r948, r1783, r1778; +} +mov.b32 r1879, {rs95, rs95}; +{ +mul.f16x2 r1785, r1480, r1879; +} +mov.b32 r1876, {rs96, rs96}; +{ +mul.f16x2 r1788, r1516, r1876; +} +{ +sub.f16x2 r1791, r1785, r1788; +} +{ +mul.f16x2 r1794, r1480, r1876; +} +{ +fma.rn.f16x2 r1797, r1516, r1879, r1794; +} +{ +mul.f16x2 r1801, r996, r1815; +} +{ +mul.f16x2 r1804, r1032, r1812; +} +{ +sub.f16x2 r1807, r1801, r1804; +} +{ +mul.f16x2 r1810, r996, r1812; +} +{ +fma.rn.f16x2 r1813, r1032, r1815, r1810; +} +mov.b32 r1943, {rs99, rs99}; +{ +mul.f16x2 r1817, r1564, r1943; +} +mov.b32 r1940, {rs100, rs100}; +{ +mul.f16x2 r1820, r1600, r1940; +} +{ +sub.f16x2 r1823, r1817, r1820; +} +{ +mul.f16x2 r1826, r1564, r1940; +} +{ +fma.rn.f16x2 r1829, r1600, r1943, r1826; +} +mov.b32 r1847, {rs93, rs93}; +{ +mul.f16x2 r1833, r1080, r1847; +} +mov.b32 r1844, {rs94, rs94}; +{ +mul.f16x2 r1836, r1116, r1844; +} +{ +sub.f16x2 r1839, r1833, r1836; +} +{ +mul.f16x2 r1842, r1080, r1844; +} +{ +fma.rn.f16x2 r1845, r1116, r1847, r1842; +} +mov.b32 r1863, {rs103, rs103}; +{ +mul.f16x2 r1849, r1648, r1863; +} +mov.b32 r1860, {rs104, rs104}; +{ +mul.f16x2 r1852, r1684, r1860; +} +{ +sub.f16x2 r1855, r1849, r1852; +} +{ +mul.f16x2 r1858, r1648, r1860; +} +{ +fma.rn.f16x2 r1861, r1684, r1863, r1858; +} +{ +mul.f16x2 r1865, r930, r1879; +} +{ +mul.f16x2 r1868, r966, r1876; +} +{ +sub.f16x2 r1871, r1865, r1868; +} +{ +mul.f16x2 r1874, r930, r1876; +} +{ +fma.rn.f16x2 r1877, r966, r1879, r1874; +} +mov.b32 r1895, {rs107, rs107}; +{ +mul.f16x2 r1881, r1498, r1895; +} +mov.b32 r1892, {rs108, rs108}; +{ +mul.f16x2 r1884, r1534, r1892; +} +{ +sub.f16x2 r1887, r1881, r1884; +} +{ +mul.f16x2 r1890, r1498, r1892; +} +{ +fma.rn.f16x2 r1893, r1534, r1895, r1890; +} +mov.b32 r1911, {rs97, rs97}; +{ +mul.f16x2 r1897, r1014, r1911; +} +mov.b32 r1908, {rs98, rs98}; +{ +mul.f16x2 r1900, r1050, r1908; +} +{ +sub.f16x2 r1903, r1897, r1900; +} +{ +mul.f16x2 r1906, r1014, r1908; +} +{ +fma.rn.f16x2 r1909, r1050, 
r1911, r1906; +} +mov.b32 r1927, {rs111, rs111}; +{ +mul.f16x2 r1913, r1582, r1927; +} +mov.b32 r1924, {rs112, rs112}; +{ +mul.f16x2 r1916, r1618, r1924; +} +{ +sub.f16x2 r1919, r1913, r1916; +} +{ +mul.f16x2 r1922, r1582, r1924; +} +{ +fma.rn.f16x2 r1925, r1618, r1927, r1922; +} +{ +mul.f16x2 r1929, r1098, r1943; +} +{ +mul.f16x2 r1932, r1134, r1940; +} +{ +sub.f16x2 r1935, r1929, r1932; +} +{ +mul.f16x2 r1938, r1098, r1940; +} +{ +fma.rn.f16x2 r1941, r1134, r1943, r1938; +} +mov.b32 r1959, {rs115, rs115}; +{ +mul.f16x2 r1945, r1666, r1959; +} +mov.b32 r1956, {rs116, rs116}; +{ +mul.f16x2 r1948, r1702, r1956; +} +{ +sub.f16x2 r1951, r1945, r1948; +} +{ +mul.f16x2 r1954, r1666, r1956; +} +{ +fma.rn.f16x2 r1957, r1702, r1959, r1954; +} +{ +cvt.rn.f16.f64 rs137, fd153; +} +mov.b32 r2032, {rs137, rs137}; +{ +cvt.rn.f16.f64 rs138, fd154; +} +mov.b32 r2041, {rs138, rs138}; +{ +add.f16x2 r1961, r888, r1456; +} +{ +add.f16x2 %0, r320, r1961; +} +{ +add.f16x2 r1967, r894, r1462; +} +{ +add.f16x2 %1, r326, r1967; +} +{ +add.f16x2 r1973, r888, r1456; +} +{ +mul.f16x2 r1976, r1973, r2032; +} +{ +add.f16x2 r1979, r320, r1976; +} +{ +sub.f16x2 r1982, r894, r1462; +} +{ +mul.f16x2 r1985, r1982, r2041; +} +{ +add.f16x2 %18, r1979, r1985; +} +{ +add.f16x2 r1991, r888, r1456; +} +{ +mul.f16x2 r1994, r1991, r2032; +} +{ +add.f16x2 r1997, r320, r1994; +} +{ +sub.f16x2 r2000, r894, r1462; +} +{ +mul.f16x2 r2003, r2000, r2041; +} +{ +sub.f16x2 %36, r1997, r2003; +} +{ +add.f16x2 r2009, r894, r1462; +} +{ +mul.f16x2 r2012, r2009, r2032; +} +{ +add.f16x2 r2015, r326, r2012; +} +{ +sub.f16x2 r2018, r888, r1456; +} +{ +mul.f16x2 r2021, r2018, r2041; +} +{ +sub.f16x2 %19, r2015, r2021; +} +{ +add.f16x2 r2027, r894, r1462; +} +{ +mul.f16x2 r2030, r2027, r2032; +} +{ +add.f16x2 r2033, r326, r2030; +} +{ +sub.f16x2 r2036, r888, r1456; +} +{ +mul.f16x2 r2039, r2036, r2041; +} +{ +add.f16x2 %37, r2033, r2039; +} +{ +cvt.rn.f16.f64 rs139, fd153; +} +mov.b32 r2116, {rs139, rs139}; +{ 
+cvt.rn.f16.f64 rs140, fd154; +} +mov.b32 r2125, {rs140, rs140}; +{ +add.f16x2 r2045, r1711, r1727; +} +{ +add.f16x2 %2, r404, r2045; +} +{ +add.f16x2 r2051, r1717, r1733; +} +{ +add.f16x2 %3, r410, r2051; +} +{ +add.f16x2 r2057, r1711, r1727; +} +{ +mul.f16x2 r2060, r2057, r2116; +} +{ +add.f16x2 r2063, r404, r2060; +} +{ +sub.f16x2 r2066, r1717, r1733; +} +{ +mul.f16x2 r2069, r2066, r2125; +} +{ +add.f16x2 %20, r2063, r2069; +} +{ +add.f16x2 r2075, r1711, r1727; +} +{ +mul.f16x2 r2078, r2075, r2116; +} +{ +add.f16x2 r2081, r404, r2078; +} +{ +sub.f16x2 r2084, r1717, r1733; +} +{ +mul.f16x2 r2087, r2084, r2125; +} +{ +sub.f16x2 %38, r2081, r2087; +} +{ +add.f16x2 r2093, r1717, r1733; +} +{ +mul.f16x2 r2096, r2093, r2116; +} +{ +add.f16x2 r2099, r410, r2096; +} +{ +sub.f16x2 r2102, r1711, r1727; +} +{ +mul.f16x2 r2105, r2102, r2125; +} +{ +sub.f16x2 %21, r2099, r2105; +} +{ +add.f16x2 r2111, r1717, r1733; +} +{ +mul.f16x2 r2114, r2111, r2116; +} +{ +add.f16x2 r2117, r410, r2114; +} +{ +sub.f16x2 r2120, r1711, r1727; +} +{ +mul.f16x2 r2123, r2120, r2125; +} +{ +add.f16x2 %39, r2117, r2123; +} +{ +cvt.rn.f16.f64 rs141, fd153; +} +mov.b32 r2200, {rs141, rs141}; +{ +cvt.rn.f16.f64 rs142, fd154; +} +mov.b32 r2209, {rs142, rs142}; +{ +add.f16x2 r2129, r1743, r1759; +} +{ +add.f16x2 %4, r488, r2129; +} +{ +add.f16x2 r2135, r1749, r1765; +} +{ +add.f16x2 %5, r494, r2135; +} +{ +add.f16x2 r2141, r1743, r1759; +} +{ +mul.f16x2 r2144, r2141, r2200; +} +{ +add.f16x2 r2147, r488, r2144; +} +{ +sub.f16x2 r2150, r1749, r1765; +} +{ +mul.f16x2 r2153, r2150, r2209; +} +{ +add.f16x2 %22, r2147, r2153; +} +{ +add.f16x2 r2159, r1743, r1759; +} +{ +mul.f16x2 r2162, r2159, r2200; +} +{ +add.f16x2 r2165, r488, r2162; +} +{ +sub.f16x2 r2168, r1749, r1765; +} +{ +mul.f16x2 r2171, r2168, r2209; +} +{ +sub.f16x2 %40, r2165, r2171; +} +{ +add.f16x2 r2177, r1749, r1765; +} +{ +mul.f16x2 r2180, r2177, r2200; +} +{ +add.f16x2 r2183, r494, r2180; +} +{ +sub.f16x2 r2186, r1743, r1759; +} +{ 
+mul.f16x2 r2189, r2186, r2209; +} +{ +sub.f16x2 %23, r2183, r2189; +} +{ +add.f16x2 r2195, r1749, r1765; +} +{ +mul.f16x2 r2198, r2195, r2200; +} +{ +add.f16x2 r2201, r494, r2198; +} +{ +sub.f16x2 r2204, r1743, r1759; +} +{ +mul.f16x2 r2207, r2204, r2209; +} +{ +add.f16x2 %41, r2201, r2207; +} +{ +cvt.rn.f16.f64 rs143, fd153; +} +mov.b32 r2284, {rs143, rs143}; +{ +cvt.rn.f16.f64 rs144, fd154; +} +mov.b32 r2293, {rs144, rs144}; +{ +add.f16x2 r2213, r1775, r1791; +} +{ +add.f16x2 %6, r344, r2213; +} +{ +add.f16x2 r2219, r1781, r1797; +} +{ +add.f16x2 %7, r380, r2219; +} +{ +add.f16x2 r2225, r1775, r1791; +} +{ +mul.f16x2 r2228, r2225, r2284; +} +{ +add.f16x2 r2231, r344, r2228; +} +{ +sub.f16x2 r2234, r1781, r1797; +} +{ +mul.f16x2 r2237, r2234, r2293; +} +{ +add.f16x2 %24, r2231, r2237; +} +{ +add.f16x2 r2243, r1775, r1791; +} +{ +mul.f16x2 r2246, r2243, r2284; +} +{ +add.f16x2 r2249, r344, r2246; +} +{ +sub.f16x2 r2252, r1781, r1797; +} +{ +mul.f16x2 r2255, r2252, r2293; +} +{ +sub.f16x2 %42, r2249, r2255; +} +{ +add.f16x2 r2261, r1781, r1797; +} +{ +mul.f16x2 r2264, r2261, r2284; +} +{ +add.f16x2 r2267, r380, r2264; +} +{ +sub.f16x2 r2270, r1775, r1791; +} +{ +mul.f16x2 r2273, r2270, r2293; +} +{ +sub.f16x2 %25, r2267, r2273; +} +{ +add.f16x2 r2279, r1781, r1797; +} +{ +mul.f16x2 r2282, r2279, r2284; +} +{ +add.f16x2 r2285, r380, r2282; +} +{ +sub.f16x2 r2288, r1775, r1791; +} +{ +mul.f16x2 r2291, r2288, r2293; +} +{ +add.f16x2 %43, r2285, r2291; +} +{ +cvt.rn.f16.f64 rs145, fd153; +} +mov.b32 r2368, {rs145, rs145}; +{ +cvt.rn.f16.f64 rs146, fd154; +} +mov.b32 r2377, {rs146, rs146}; +{ +add.f16x2 r2297, r1807, r1823; +} +{ +add.f16x2 %8, r428, r2297; +} +{ +add.f16x2 r2303, r1813, r1829; +} +{ +add.f16x2 %9, r464, r2303; +} +{ +add.f16x2 r2309, r1807, r1823; +} +{ +mul.f16x2 r2312, r2309, r2368; +} +{ +add.f16x2 r2315, r428, r2312; +} +{ +sub.f16x2 r2318, r1813, r1829; +} +{ +mul.f16x2 r2321, r2318, r2377; +} +{ +add.f16x2 %26, r2315, r2321; +} +{ +add.f16x2 
r2327, r1807, r1823; +} +{ +mul.f16x2 r2330, r2327, r2368; +} +{ +add.f16x2 r2333, r428, r2330; +} +{ +sub.f16x2 r2336, r1813, r1829; +} +{ +mul.f16x2 r2339, r2336, r2377; +} +{ +sub.f16x2 %44, r2333, r2339; +} +{ +add.f16x2 r2345, r1813, r1829; +} +{ +mul.f16x2 r2348, r2345, r2368; +} +{ +add.f16x2 r2351, r464, r2348; +} +{ +sub.f16x2 r2354, r1807, r1823; +} +{ +mul.f16x2 r2357, r2354, r2377; +} +{ +sub.f16x2 %27, r2351, r2357; +} +{ +add.f16x2 r2363, r1813, r1829; +} +{ +mul.f16x2 r2366, r2363, r2368; +} +{ +add.f16x2 r2369, r464, r2366; +} +{ +sub.f16x2 r2372, r1807, r1823; +} +{ +mul.f16x2 r2375, r2372, r2377; +} +{ +add.f16x2 %45, r2369, r2375; +} +{ +cvt.rn.f16.f64 rs147, fd153; +} +mov.b32 r2452, {rs147, rs147}; +{ +cvt.rn.f16.f64 rs148, fd154; +} +mov.b32 r2461, {rs148, rs148}; +{ +add.f16x2 r2381, r1839, r1855; +} +{ +add.f16x2 %10, r512, r2381; +} +{ +add.f16x2 r2387, r1845, r1861; +} +{ +add.f16x2 %11, r548, r2387; +} +{ +add.f16x2 r2393, r1839, r1855; +} +{ +mul.f16x2 r2396, r2393, r2452; +} +{ +add.f16x2 r2399, r512, r2396; +} +{ +sub.f16x2 r2402, r1845, r1861; +} +{ +mul.f16x2 r2405, r2402, r2461; +} +{ +add.f16x2 %28, r2399, r2405; +} +{ +add.f16x2 r2411, r1839, r1855; +} +{ +mul.f16x2 r2414, r2411, r2452; +} +{ +add.f16x2 r2417, r512, r2414; +} +{ +sub.f16x2 r2420, r1845, r1861; +} +{ +mul.f16x2 r2423, r2420, r2461; +} +{ +sub.f16x2 %46, r2417, r2423; +} +{ +add.f16x2 r2429, r1845, r1861; +} +{ +mul.f16x2 r2432, r2429, r2452; +} +{ +add.f16x2 r2435, r548, r2432; +} +{ +sub.f16x2 r2438, r1839, r1855; +} +{ +mul.f16x2 r2441, r2438, r2461; +} +{ +sub.f16x2 %29, r2435, r2441; +} +{ +add.f16x2 r2447, r1845, r1861; +} +{ +mul.f16x2 r2450, r2447, r2452; +} +{ +add.f16x2 r2453, r548, r2450; +} +{ +sub.f16x2 r2456, r1839, r1855; +} +{ +mul.f16x2 r2459, r2456, r2461; +} +{ +add.f16x2 %47, r2453, r2459; +} +{ +cvt.rn.f16.f64 rs149, fd153; +} +mov.b32 r2536, {rs149, rs149}; +{ +cvt.rn.f16.f64 rs150, fd154; +} +mov.b32 r2545, {rs150, rs150}; +{ +add.f16x2 r2465, 
r1871, r1887; +} +{ +add.f16x2 %12, r362, r2465; +} +{ +add.f16x2 r2471, r1877, r1893; +} +{ +add.f16x2 %13, r398, r2471; +} +{ +add.f16x2 r2477, r1871, r1887; +} +{ +mul.f16x2 r2480, r2477, r2536; +} +{ +add.f16x2 r2483, r362, r2480; +} +{ +sub.f16x2 r2486, r1877, r1893; +} +{ +mul.f16x2 r2489, r2486, r2545; +} +{ +add.f16x2 %30, r2483, r2489; +} +{ +add.f16x2 r2495, r1871, r1887; +} +{ +mul.f16x2 r2498, r2495, r2536; +} +{ +add.f16x2 r2501, r362, r2498; +} +{ +sub.f16x2 r2504, r1877, r1893; +} +{ +mul.f16x2 r2507, r2504, r2545; +} +{ +sub.f16x2 %48, r2501, r2507; +} +{ +add.f16x2 r2513, r1877, r1893; +} +{ +mul.f16x2 r2516, r2513, r2536; +} +{ +add.f16x2 r2519, r398, r2516; +} +{ +sub.f16x2 r2522, r1871, r1887; +} +{ +mul.f16x2 r2525, r2522, r2545; +} +{ +sub.f16x2 %31, r2519, r2525; +} +{ +add.f16x2 r2531, r1877, r1893; +} +{ +mul.f16x2 r2534, r2531, r2536; +} +{ +add.f16x2 r2537, r398, r2534; +} +{ +sub.f16x2 r2540, r1871, r1887; +} +{ +mul.f16x2 r2543, r2540, r2545; +} +{ +add.f16x2 %49, r2537, r2543; +} +{ +cvt.rn.f16.f64 rs151, fd153; +} +mov.b32 r2620, {rs151, rs151}; +{ +cvt.rn.f16.f64 rs152, fd154; +} +mov.b32 r2629, {rs152, rs152}; +{ +add.f16x2 r2549, r1903, r1919; +} +{ +add.f16x2 %14, r446, r2549; +} +{ +add.f16x2 r2555, r1909, r1925; +} +{ +add.f16x2 %15, r482, r2555; +} +{ +add.f16x2 r2561, r1903, r1919; +} +{ +mul.f16x2 r2564, r2561, r2620; +} +{ +add.f16x2 r2567, r446, r2564; +} +{ +sub.f16x2 r2570, r1909, r1925; +} +{ +mul.f16x2 r2573, r2570, r2629; +} +{ +add.f16x2 %32, r2567, r2573; +} +{ +add.f16x2 r2579, r1903, r1919; +} +{ +mul.f16x2 r2582, r2579, r2620; +} +{ +add.f16x2 r2585, r446, r2582; +} +{ +sub.f16x2 r2588, r1909, r1925; +} +{ +mul.f16x2 r2591, r2588, r2629; +} +{ +sub.f16x2 %50, r2585, r2591; +} +{ +add.f16x2 r2597, r1909, r1925; +} +{ +mul.f16x2 r2600, r2597, r2620; +} +{ +add.f16x2 r2603, r482, r2600; +} +{ +sub.f16x2 r2606, r1903, r1919; +} +{ +mul.f16x2 r2609, r2606, r2629; +} +{ +sub.f16x2 %33, r2603, r2609; +} +{ +add.f16x2 
r2615, r1909, r1925; +} +{ +mul.f16x2 r2618, r2615, r2620; +} +{ +add.f16x2 r2621, r482, r2618; +} +{ +sub.f16x2 r2624, r1903, r1919; +} +{ +mul.f16x2 r2627, r2624, r2629; +} +{ +add.f16x2 %51, r2621, r2627; +} +{ +cvt.rn.f16.f64 rs153, fd153; +} +mov.b32 r2704, {rs153, rs153}; +{ +cvt.rn.f16.f64 rs154, fd154; +} +mov.b32 r2713, {rs154, rs154}; +{ +add.f16x2 r2633, r1935, r1951; +} +{ +add.f16x2 %16, r530, r2633; +} +{ +add.f16x2 r2639, r1941, r1957; +} +{ +add.f16x2 %17, r566, r2639; +} +{ +add.f16x2 r2645, r1935, r1951; +} +{ +mul.f16x2 r2648, r2645, r2704; +} +{ +add.f16x2 r2651, r530, r2648; +} +{ +sub.f16x2 r2654, r1941, r1957; +} +{ +mul.f16x2 r2657, r2654, r2713; +} +{ +add.f16x2 %34, r2651, r2657; +} +{ +add.f16x2 r2663, r1935, r1951; +} +{ +mul.f16x2 r2666, r2663, r2704; +} +{ +add.f16x2 r2669, r530, r2666; +} +{ +sub.f16x2 r2672, r1941, r1957; +} +{ +mul.f16x2 r2675, r2672, r2713; +} +{ +sub.f16x2 %52, r2669, r2675; +} +{ +add.f16x2 r2681, r1941, r1957; +} +{ +mul.f16x2 r2684, r2681, r2704; +} +{ +add.f16x2 r2687, r566, r2684; +} +{ +sub.f16x2 r2690, r1935, r1951; +} +{ +mul.f16x2 r2693, r2690, r2713; +} +{ +sub.f16x2 %35, r2687, r2693; +} +{ +add.f16x2 r2699, r1941, r1957; +} +{ +mul.f16x2 r2702, r2699, r2704; +} +{ +add.f16x2 r2705, r566, r2702; +} +{ +sub.f16x2 r2708, r1935, r1951; +} +{ +mul.f16x2 r2711, r2708, r2713; +} +{ +add.f16x2 %53, r2705, r2711; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), 
"=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[13].y)), 
"r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[17].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1071, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<92>; +.reg .b32 r<1165>; +.reg .b64 rd<4>; +mov.u32 r1154, %tid.y; +mov.u32 r1155, %18; +mad.lo.s32 r1156, r1154, 216, r1155; +mov.u32 r1157, %tid.x; +mov.f32 f86, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1, {low, high}; +} +mov.f32 f88, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ 
+sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 
r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f14, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r259, {low, high}; +} +mov.f32 f16, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r260, {low, high}; +} +mov.f32 f18, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f18; +mov.b32 r261, {low, high}; +} +mov.f32 f20, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f20; +cvt.rn.f16.f32 high, f20; +mov.b32 r262, {low, high}; +} +mov.f32 f26, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r265, {low, high}; +} +mov.f32 f28, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 
r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ 
+add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r1157, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1158, rd3; +mul.lo.s32 r1159, r1158, 3; +sub.s32 r1160, r1157, r1159; 
+cvt.rn.f32.u32 f89, r1160; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f57, f90; +sin.approx.f32 f91, f90; +neg.f32 f58, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r1161, r1158, 216, r1156; +barrier.sync 0; +mad.lo.s32 r1162, r1160, 72, r1161; +st.shared.v2.f32 [r1162], {r344, r350}; +st.shared.v2.f32 [r1162+8], {r607, r616}; +st.shared.v2.f32 [r1162+16], {r644, r653}; +st.shared.v2.f32 [r1162+24], {r681, r690}; +st.shared.v2.f32 [r1162+32], {r718, r727}; +st.shared.v2.f32 [r1162+40], {r755, r764}; +st.shared.v2.f32 [r1162+48], {r792, r801}; +st.shared.v2.f32 [r1162+56], {r829, r838}; +st.shared.v2.f32 [r1162+64], {r866, r875}; +barrier.sync 0; +shl.b32 r1163, r1160, 6; +sub.s32 r1164, r1162, r1163; +ld.shared.u32 r902, [r1164]; +ld.shared.u32 r908, [r1164+4]; +ld.shared.u32 r988, [r1164+24]; +ld.shared.u32 r994, [r1164+28]; +ld.shared.u32 r1074, [r1164+48]; +ld.shared.u32 r1080, [r1164+52]; +ld.shared.u32 r899, [r1164+72]; +ld.shared.u32 r905, [r1164+76]; +ld.shared.u32 r985, [r1164+96]; +ld.shared.u32 r991, [r1164+100]; +ld.shared.u32 r1071, [r1164+120]; +ld.shared.u32 r1077, [r1164+124]; +ld.shared.u32 r900, [r1164+144]; +ld.shared.u32 r906, [r1164+148]; +ld.shared.u32 r986, [r1164+168]; +ld.shared.u32 r992, [r1164+172]; +ld.shared.u32 r1072, [r1164+192]; +ld.shared.u32 r1078, [r1164+196]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 %0, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 %1, r908, r904; +} +{ +add.f16x2 
r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 %6, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 %12, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 %7, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 %13, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 %2, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 %3, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 %8, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 %14, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 %9, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, 
r983; +} +{ +add.f16x2 %15, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 %4, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 %5, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 %10, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 %16, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 %11, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 %17, r1142, r1148; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), 
"r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1070, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<92>; +.reg .b32 r<1165>; +.reg .b64 rd<4>; +mov.u32 r1154, %tid.y; +mov.u32 r1155, %18; +mad.lo.s32 r1156, r1154, 108, r1155; +mov.u32 r1157, %tid.x; +mov.f32 f86, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1, {low, high}; +} +mov.f32 f88, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, 
%24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f14, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r259, {low, high}; +} +mov.f32 f16, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r260, {low, high}; +} +mov.f32 f18, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f18; +cvt.rn.f16.f32 high, f18; +mov.b32 r261, {low, high}; +} +mov.f32 f20, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f20; +cvt.rn.f16.f32 high, f20; +mov.b32 r262, {low, high}; +} +mov.f32 f26, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r265, {low, high}; +} +mov.f32 f28, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} 
+{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r1157, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1158, rd3; +mul.lo.s32 r1159, r1158, 3; +sub.s32 r1160, r1157, r1159; +mad.lo.s32 r1161, r1158, 108, r1156; +cvt.rn.f32.u32 f89, r1160; +mul.f32 f90, f89, 0f3E6E4BAE; +cos.approx.f32 f57, f90; +sin.approx.f32 f91, f90; +neg.f32 f58, f91; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; 
+mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; 
+mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 
r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; +mad.lo.s32 r1162, r1160, 36, r1161; +st.shared.u32 [r1162], r344; +st.shared.u32 [r1162+4], r607; +st.shared.u32 [r1162+8], r644; +st.shared.u32 [r1162+12], r681; +st.shared.u32 [r1162+16], r718; +st.shared.u32 [r1162+20], r755; +st.shared.u32 [r1162+24], r792; +st.shared.u32 [r1162+28], r829; +st.shared.u32 [r1162+32], r866; +barrier.sync 0; +shl.b32 r1163, r1160, 5; +sub.s32 r1164, r1162, r1163; +ld.shared.u32 r902, [r1164]; +ld.shared.u32 r988, [r1164+12]; +ld.shared.u32 r1074, [r1164+24]; +ld.shared.u32 r899, [r1164+36]; +ld.shared.u32 r985, [r1164+48]; +ld.shared.u32 r1071, [r1164+60]; +ld.shared.u32 r900, [r1164+72]; +ld.shared.u32 r986, [r1164+84]; +ld.shared.u32 r1072, [r1164+96]; +barrier.sync 0; +st.shared.u32 [r1162], r350; +st.shared.u32 [r1162+4], r616; +st.shared.u32 [r1162+8], r653; +st.shared.u32 [r1162+12], r690; +st.shared.u32 [r1162+16], r727; +st.shared.u32 [r1162+20], r764; +st.shared.u32 [r1162+24], r801; +st.shared.u32 [r1162+28], r838; +st.shared.u32 [r1162+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r1164]; +ld.shared.u32 r994, [r1164+12]; +ld.shared.u32 r1080, [r1164+24]; +ld.shared.u32 r905, [r1164+36]; +ld.shared.u32 r991, [r1164+48]; +ld.shared.u32 r1077, [r1164+60]; +ld.shared.u32 r906, [r1164+72]; +ld.shared.u32 r992, [r1164+84]; +ld.shared.u32 r1078, [r1164+96]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 %0, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 %1, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 %6, r916, r922; +} +{ 
+add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 %12, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 %7, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 %13, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 %2, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 %3, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 %8, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 %14, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 %9, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 %15, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 %4, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 %5, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 %10, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 %16, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 %11, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 %17, r1142, r1148; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), 
"r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1072, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<35>; +.reg .b32 r<430>; +.reg .b64 rd<6>; +mov.u32 r413, %tid.y; +mov.u32 r414, %6; +mad.lo.s32 r415, r413, 216, r414; +mov.u32 r416, %tid.x; +mov.f32 f26, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r1, {low, high}; +} +mov.f32 f28, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r416, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r417, rd3; +mul.lo.s32 r418, r417, 9; +sub.s32 r419, r416, r418; +mad.lo.s32 r420, r417, 216, r415; +cvt.rn.f32.u32 f29, r419; +mul.f32 f30, f29, 0f3E6E4BAE; +cos.approx.f32 
f5, f30; +sin.approx.f32 f31, f30; +neg.f32 f6, f31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r421, r419, 24, r420; +st.shared.v2.f32 [r421], {r6, r12}; +st.shared.v2.f32 [r421+8], {r97, r106}; +st.shared.v2.f32 [r421+16], {r134, r143}; +barrier.sync 0; +shl.b32 r422, r419, 4; +sub.s32 r423, r421, r422; +ld.shared.u32 r170, [r423]; +ld.shared.u32 r176, [r423+4]; +ld.shared.u32 r167, [r423+72]; +ld.shared.u32 r173, [r423+76]; +ld.shared.u32 r168, [r423+144]; +ld.shared.u32 r174, [r423+148]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 
r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r419, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r424, rd5; +mul.lo.s32 r425, r424, 3; +sub.s32 r426, r419, r425; +shl.b32 r427, r426, 3; +add.s32 r428, r420, r427; +cvt.rn.f32.u32 f32, r424; +mul.f32 f33, f32, 0f3F32B8C2; +cos.approx.f32 f17, f33; +sin.approx.f32 f34, f33; +neg.f32 f18, f34; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r429, r424, 72, r428; +st.shared.u32 [r429], r169; +st.shared.u32 [r429+4], r175; +st.shared.u32 [r429+24], r260; +st.shared.u32 [r429+28], r269; +st.shared.u32 [r429+48], r297; +st.shared.u32 [r429+52], r306; +barrier.sync 0; +ld.shared.u32 r333, [r423]; +ld.shared.u32 r339, [r423+4]; +ld.shared.u32 r330, [r423+72]; +ld.shared.u32 r336, [r423+76]; +ld.shared.u32 r331, [r423+144]; +ld.shared.u32 r337, [r423+148]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 %0, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 %1, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 %2, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 %4, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, 
r328; +} +{ +sub.f16x2 %3, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 %5, r401, r407; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1073, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<35>; +.reg .b32 r<430>; +.reg .b64 rd<6>; +mov.u32 r413, %tid.y; +mov.u32 r414, %6; +mad.lo.s32 r415, r413, 108, r414; +mov.u32 r416, %tid.x; +mov.f32 f26, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r1, {low, high}; +} +mov.f32 f28, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ 
+add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r416, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r417, rd3; +mul.lo.s32 r418, r417, 9; +sub.s32 r419, r416, r418; +mad.lo.s32 r420, r417, 108, r415; +cvt.rn.f32.u32 f29, r419; +mul.f32 f30, f29, 0f3E6E4BAE; +cos.approx.f32 f5, f30; +sin.approx.f32 f31, f30; +neg.f32 f6, f31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f21, 0fBF800000; +mov.f32 f22, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r421, r419, 12, r420; +st.shared.u32 [r421], r6; +st.shared.u32 [r421+4], r97; +st.shared.u32 [r421+8], r134; +barrier.sync 0; +shl.b32 r422, r419, 3; +sub.s32 r423, r421, r422; +ld.shared.u32 r170, [r423]; +ld.shared.u32 r167, 
[r423+36]; +ld.shared.u32 r168, [r423+72]; +barrier.sync 0; +st.shared.u32 [r421], r12; +st.shared.u32 [r421+4], r106; +st.shared.u32 [r421+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r423]; +ld.shared.u32 r173, [r423+36]; +ld.shared.u32 r174, [r423+72]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r419, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r424, rd5; +mul.lo.s32 r425, r424, 3; +sub.s32 r426, r419, r425; +shl.b32 r427, r426, 2; +add.s32 r428, r420, r427; +cvt.rn.f32.u32 f32, r424; +mul.f32 f33, f32, 0f3F32B8C2; +cos.approx.f32 f17, f33; +sin.approx.f32 f34, f33; +neg.f32 f18, f34; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r429, r424, 36, r428; +st.shared.u32 [r429], r169; +st.shared.u32 [r429+12], r260; +st.shared.u32 [r429+24], r297; +barrier.sync 0; +ld.shared.u32 r333, [r423]; +ld.shared.u32 r330, [r423+36]; +ld.shared.u32 r331, [r423+72]; +barrier.sync 0; +st.shared.u32 [r429], r175; +st.shared.u32 [r429+12], r269; +st.shared.u32 [r429+24], r306; +barrier.sync 0; +ld.shared.u32 r339, [r423]; +ld.shared.u32 r336, [r423+36]; +ld.shared.u32 r337, [r423+72]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f26; +cvt.rn.f16.f32 high, f26; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f28; +cvt.rn.f16.f32 high, f28; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 %0, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 %1, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, 
r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 %2, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 %4, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 %3, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 %5, r401, r407; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..8af0e633e0582 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_fwd.hpp.inc @@ -0,0 +1,1480 @@ +#ifndef CUFFTDX_FFT_27_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_27_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<121, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<796>; +.reg .b64 rd<5>; +add.f32 f109, %72, %90; +add.f32 f110, %54, f109; +mul.f32 f113, f109, 
0f3F000000; +sub.f32 f114, %54, f113; +add.f32 f789, %73, %91; +sub.f32 f115, %73, %91; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f788, %55, f789; +mul.f32 f119, f789, 0f3F000000; +sub.f32 f120, %55, f119; +sub.f32 f121, %72, %90; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %78, %96; +add.f32 f126, %60, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %60, f129; +add.f32 f787, %79, %97; +sub.f32 f131, %79, %97; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f786, %61, f787; +mul.f32 f135, f787, 0f3F000000; +sub.f32 f136, %61, f135; +sub.f32 f137, %78, %96; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %84, %102; +add.f32 f142, %66, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %66, f145; +add.f32 f785, %85, %103; +sub.f32 f147, %85, %103; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f784, %67, f785; +mul.f32 f151, f785, 0f3F000000; +sub.f32 f152, %67, f151; +sub.f32 f153, %84, %102; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f783, f133, 0f3F441B7D; +sub.f32 f159, f783, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f163, f155, 0fBF7C1C5C; +mul.f32 f782, f149, 0f3E31D0D4; +sub.f32 f164, f782, f163; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f168, f140, 0fBF7C1C5C; +mul.f32 f781, f134, 0f3E31D0D4; +sub.f32 f169, f781, f168; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f173, f156, 0fBEAF1D44; +mul.f32 f780, f150, 0fBF708FB2; +sub.f32 f174, f780, f173; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 
f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f779, f786, f784; +sub.f32 f183, f786, f784; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f778, f788, f779; +mul.f32 f187, f779, 0f3F000000; +sub.f32 f188, f788, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f777, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f776, f123, f777; +mul.f32 f203, f777, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f775, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f774, f124, f775; +mul.f32 f219, f775, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %74, %92; +add.f32 f226, %56, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %56, f229; +add.f32 f771, %109, %108; +sub.f32 f231, %109, %108; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f769, %110, f771; +mul.f32 f235, f771, 0f3F000000; +sub.f32 f236, %110, f235; +sub.f32 f237, %74, %92; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %80, %98; +add.f32 f242, %62, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %62, f245; +add.f32 f766, %111, %112; +sub.f32 f247, %111, %112; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, 
f248; +add.f32 f764, %113, f766; +mul.f32 f251, f766, 0f3F000000; +sub.f32 f252, %113, f251; +sub.f32 f253, %80, %98; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %86, %104; +add.f32 f258, %68, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %68, f261; +add.f32 f761, %115, %114; +sub.f32 f263, %115, %114; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f759, %116, f761; +mul.f32 f267, f761, 0f3F000000; +sub.f32 f268, %116, f267; +sub.f32 f269, %86, %104; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f758, f249, 0f3F441B7D; +sub.f32 f275, f758, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f757, f265, 0f3E31D0D4; +sub.f32 f280, f757, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f284, f256, 0fBF7C1C5C; +mul.f32 f756, f250, 0f3E31D0D4; +sub.f32 f285, f756, f284; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f289, f272, 0fBEAF1D44; +mul.f32 f755, f266, 0fBF708FB2; +sub.f32 f290, f755, f289; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f754, f764, f759; +sub.f32 f299, f764, f759; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f753, f769, f754; +mul.f32 f303, f754, 0f3F000000; +sub.f32 f304, f769, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f752, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 
f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f751, f239, f752; +mul.f32 f319, f752, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f750, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f749, f240, f750; +mul.f32 f335, f750, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %76, %94; +add.f32 f342, %58, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %58, f345; +add.f32 f746, %118, %117; +sub.f32 f347, %118, %117; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f744, %119, f746; +mul.f32 f351, f746, 0f3F000000; +sub.f32 f352, %119, f351; +sub.f32 f353, %76, %94; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %82, %100; +add.f32 f358, %64, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %64, f361; +add.f32 f741, %121, %120; +sub.f32 f363, %121, %120; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f739, %122, f741; +mul.f32 f367, f741, 0f3F000000; +sub.f32 f368, %122, f367; +sub.f32 f369, %82, %100; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %88, %106; +add.f32 f374, %70, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %70, f377; +add.f32 f737, %123, %107; +sub.f32 f379, %123, %107; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f735, %124, f737; +mul.f32 f383, f737, 0f3F000000; +sub.f32 f384, %124, f383; +sub.f32 f385, %88, %106; +mul.f32 f386, f385, 
0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f734, f365, 0f3F441B7D; +sub.f32 f391, f734, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f733, f381, 0f3E31D0D4; +sub.f32 f396, f733, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f400, f372, 0fBF7C1C5C; +mul.f32 f732, f366, 0f3E31D0D4; +sub.f32 f401, f732, f400; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f405, f388, 0fBEAF1D44; +mul.f32 f731, f382, 0fBF708FB2; +sub.f32 f406, f731, f405; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f730, f739, f735; +sub.f32 f415, f739, f735; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f729, f744, f730; +mul.f32 f419, f730, 0f3F000000; +sub.f32 f420, f744, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f728, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f727, f355, f728; +mul.f32 f435, f728, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f726, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f725, f356, f726; +mul.f32 f451, f726, 0f3F000000; +sub.f32 f452, f356, f451; 
+sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f723, f310, 0f3F791978; +mul.f32 f724, f751, 0fBE6C2691; +sub.f32 f459, f723, f724; +mul.f32 f460, f751, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f721, f426, 0f3F64C51C; +mul.f32 f722, f727, 0fBEE5C902; +sub.f32 f464, f721, f722; +mul.f32 f465, f727, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f468, f749, 0fBEE5C902; +mul.f32 f720, f326, 0f3F64C51C; +sub.f32 f469, f720, f468; +mul.f32 f470, f749, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f473, f725, 0fBF4D57F2; +mul.f32 f719, f442, 0f3F18DF63; +sub.f32 f474, f719, f473; +mul.f32 f475, f725, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f478, f307, 0fBF248DBB; +mul.f32 f718, f301, 0f3F441B7D; +sub.f32 f479, f718, f478; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f717, f417, 0f3E31D0D4; +sub.f32 f484, f717, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f716, f317, 0f3F18DF63; +sub.f32 f489, f716, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f715, f433, 0fBE92D7E0; +sub.f32 f494, f715, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f713, f333, 0f3ECACAF8; +mul.f32 f714, f339, 0fBF6B1036; +sub.f32 f499, f713, f714; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f711, f449, 0fBF2FAD88; +mul.f32 f712, f455, 0fBF3A3529; +sub.f32 f504, f711, f712; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f709, f302, 0f3E31D0D4; +mul.f32 f710, f308, 0fBF7C1C5C; +sub.f32 f509, f709, f710; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f707, f418, 0fBF708FB2; 
+mul.f32 f708, f424, 0fBEAF1D44; +sub.f32 f514, f707, f708; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f518, f324, 0fBF7F9120; +mul.f32 f706, f318, 0fBD6E2946; +sub.f32 f519, f706, f518; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f523, f440, 0f3DEDC21F; +mul.f32 f705, f434, 0fBF7E44DE; +sub.f32 f524, f705, f523; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f704, f334, 0fBE92D7E0; +sub.f32 f529, f704, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f703, f450, 0fBF55E287; +sub.f32 f534, f703, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f702, f753, f729; +sub.f32 f541, f753, f729; +mul.f32 f542, f541, 0f3F5DB3D7; +mul.f32 f543, f702, 0f3F000000; +sub.f32 f544, f778, f543; +sub.f32 f545, f294, f410; +mul.f32 f546, f545, 0f3F5DB3D7; +add.f32 f547, f459, f464; +mul.f32 f549, f547, 0f3F000000; +sub.f32 f550, f194, f549; +add.f32 f701, f461, f466; +sub.f32 f551, f461, f466; +mul.f32 f552, f551, 0f3F5DB3D7; +mul.f32 f553, f701, 0f3F000000; +sub.f32 f554, f776, f553; +sub.f32 f555, f459, f464; +mul.f32 f556, f555, 0f3F5DB3D7; +add.f32 f557, f469, f474; +mul.f32 f559, f557, 0f3F000000; +sub.f32 f560, f210, f559; +add.f32 f700, f471, f476; +sub.f32 f561, f471, f476; +mul.f32 f562, f561, 0f3F5DB3D7; +mul.f32 f563, f700, 0f3F000000; +sub.f32 f564, f774, f563; +sub.f32 f565, f469, f474; +mul.f32 f566, f565, 0f3F5DB3D7; +add.f32 f567, f479, f484; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f185, f569; +add.f32 f699, f481, f486; +sub.f32 f571, f481, f486; +mul.f32 f572, f571, 0f3F5DB3D7; +mul.f32 f573, f699, 0f3F000000; +sub.f32 f574, f191, f573; +sub.f32 f575, f479, f484; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, 
f489, f494; +mul.f32 f579, f577, 0f3F000000; +sub.f32 f580, f201, f579; +add.f32 f698, f491, f496; +sub.f32 f581, f491, f496; +mul.f32 f582, f581, 0f3F5DB3D7; +mul.f32 f583, f698, 0f3F000000; +sub.f32 f584, f207, f583; +sub.f32 f585, f489, f494; +mul.f32 f586, f585, 0f3F5DB3D7; +add.f32 f587, f499, f504; +mul.f32 f589, f587, 0f3F000000; +sub.f32 f590, f217, f589; +add.f32 f697, f501, f506; +sub.f32 f591, f501, f506; +mul.f32 f592, f591, 0f3F5DB3D7; +mul.f32 f593, f697, 0f3F000000; +sub.f32 f594, f223, f593; +sub.f32 f595, f499, f504; +mul.f32 f596, f595, 0f3F5DB3D7; +add.f32 f597, f509, f514; +mul.f32 f599, f597, 0f3F000000; +sub.f32 f600, f186, f599; +add.f32 f696, f511, f516; +sub.f32 f601, f511, f516; +mul.f32 f602, f601, 0f3F5DB3D7; +mul.f32 f603, f696, 0f3F000000; +sub.f32 f604, f192, f603; +sub.f32 f605, f509, f514; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f519, f524; +mul.f32 f609, f607, 0f3F000000; +sub.f32 f610, f202, f609; +add.f32 f695, f521, f526; +sub.f32 f611, f521, f526; +mul.f32 f612, f611, 0f3F5DB3D7; +mul.f32 f613, f695, 0f3F000000; +sub.f32 f614, f208, f613; +sub.f32 f615, f519, f524; +mul.f32 f616, f615, 0f3F5DB3D7; +add.f32 f617, f529, f534; +mul.f32 f619, f617, 0f3F000000; +sub.f32 f620, f218, f619; +add.f32 f694, f531, f536; +sub.f32 f621, f531, f536; +mul.f32 f622, f621, 0f3F5DB3D7; +mul.f32 f623, f694, 0f3F000000; +sub.f32 f624, f224, f623; +sub.f32 f625, f529, f534; +mul.f32 f791, f700, 0f3F000000; +sub.f32 f790, f774, f791; +mul.f32 f626, f625, 0f3F5DB3D7; +add.f32 %1, f778, f702; +mul.f32 f793, f537, 0f3F000000; +sub.f32 f792, f178, f793; +add.f32 %0, f178, f537; +mul.f32 f795, f701, 0f3F000000; +sub.f32 f794, f776, f795; +add.f32 %3, f776, f701; +add.f32 %2, f194, f547; +add.f32 %5, f774, f700; +add.f32 %4, f210, f557; +add.f32 %7, f191, f699; +add.f32 %6, f185, f567; +add.f32 %9, f207, f698; +add.f32 %8, f201, f577; +add.f32 %11, f223, f697; +add.f32 %10, f217, f587; +add.f32 %13, f192, f696; +add.f32 %12, f186, f597; +add.f32 
%15, f208, f695; +add.f32 %14, f202, f607; +add.f32 %17, f224, f694; +add.f32 %16, f218, f617; +add.f32 %18, f542, f792; +sub.f32 %19, f544, f546; +sub.f32 %21, f794, f556; +add.f32 %20, f552, f550; +sub.f32 %23, f790, f566; +add.f32 %22, f562, f560; +sub.f32 %25, f574, f576; +add.f32 %24, f572, f570; +add.f32 %26, f582, f580; +sub.f32 %27, f584, f586; +add.f32 %28, f592, f590; +sub.f32 %29, f594, f596; +add.f32 %30, f602, f600; +sub.f32 %31, f604, f606; +add.f32 %32, f612, f610; +sub.f32 %33, f614, f616; +sub.f32 %35, f624, f626; +add.f32 %34, f622, f620; +add.f32 %37, f546, f544; +sub.f32 %36, f792, f542; +add.f32 %39, f556, f794; +sub.f32 %38, f550, f552; +add.f32 %41, f566, f790; +sub.f32 %40, f560, f562; +add.f32 %43, f576, f574; +sub.f32 %42, f570, f572; +add.f32 %45, f586, f584; +sub.f32 %44, f580, f582; +add.f32 %47, f596, f594; +sub.f32 %46, f590, f592; +add.f32 %49, f606, f604; +sub.f32 %48, f600, f602; +add.f32 %51, f616, f614; +sub.f32 %50, f610, f612; +add.f32 %53, f626, f624; +sub.f32 %52, f620, f622; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): 
"f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[13].y), "f"(rmem[22].y), "f"(rmem[4].y), "f"(rmem[25].y), "f"(rmem[16].y), "f"(rmem[7].y), "f"(rmem[20].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<123, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 216, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 
0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; 
+sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; 
+fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r9+8], {f217, f216}; +fma.rn.f32 f218, f162, f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r9+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r9+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r9+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r9+40], {f224, f225}; +fma.rn.f32 f226, f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r9+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r9+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; +st.shared.v2.f32 [r9+64], {f231, f230}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+24]; +ld.shared.v2.f32 {f240, f241}, [r11+48]; +ld.shared.v2.f32 {f244, f245}, [r11+72]; +ld.shared.v2.f32 {f248, f249}, [r11+96]; +ld.shared.v2.f32 {f252, f253}, [r11+120]; +ld.shared.v2.f32 {f256, f257}, [r11+144]; +ld.shared.v2.f32 {f260, f261}, [r11+168]; +ld.shared.v2.f32 {f264, f265}, [r11+192]; +add.f32 f268, f244, f256; 
+add.f32 f269, f245, f257; +mul.f32 f270, f268, 0f3F000000; +sub.f32 f271, f232, f270; +sub.f32 f272, f245, f257; +mul.f32 f273, f272, 0f3F5DB3D7; +mul.f32 f274, f269, 0f3F000000; +sub.f32 f275, f233, f274; +sub.f32 f276, f244, f256; +mul.f32 f277, f276, 0f3F5DB3D7; +add.f32 f278, f248, f260; +add.f32 f279, f249, f261; +mul.f32 f280, f278, 0f3F000000; +sub.f32 f281, f236, f280; +sub.f32 f282, f249, f261; +mul.f32 f283, f282, 0f3F5DB3D7; +mul.f32 f284, f279, 0f3F000000; +sub.f32 f285, f237, f284; +sub.f32 f286, f248, f260; +mul.f32 f287, f286, 0f3F5DB3D7; +add.f32 f288, f252, f264; +add.f32 f289, f253, f265; +mul.f32 f290, f288, 0f3F000000; +sub.f32 f291, f240, f290; +sub.f32 f292, f253, f265; +mul.f32 f293, f292, 0f3F5DB3D7; +mul.f32 f294, f289, 0f3F000000; +sub.f32 f295, f241, f294; +sub.f32 f296, f252, f264; +mul.f32 f297, f296, 0f3F5DB3D7; +add.f32 %1, f233, f269; +add.f32 %0, f232, f268; +add.f32 %3, f237, f279; +add.f32 %2, f236, f278; +add.f32 %5, f241, f289; +add.f32 %4, f240, f288; +sub.f32 %7, f275, f277; +add.f32 %6, f273, f271; +sub.f32 %9, f285, f287; +add.f32 %8, f283, f281; +sub.f32 %11, f295, f297; +add.f32 %10, f293, f291; +add.f32 %13, f277, f275; +sub.f32 %12, f271, f273; +add.f32 %15, f287, f285; +sub.f32 %14, f281, f283; +add.f32 %17, f297, f295; +sub.f32 %16, f291, f293; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), 
"f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<122, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<298>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 108, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 
0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, 
f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; +mul.f32 f208, f206, f120; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r8, r5, 108, r3; +barrier.sync 0; 
+mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f159; +st.shared.f32 [r9+8], f169; +st.shared.f32 [r9+12], f179; +st.shared.f32 [r9+16], f189; +st.shared.f32 [r9+20], f199; +st.shared.f32 [r9+24], f209; +st.shared.f32 [r9+28], f219; +st.shared.f32 [r9+32], f229; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+12]; +ld.shared.f32 f234, [r11+24]; +ld.shared.f32 f235, [r11+36]; +ld.shared.f32 f236, [r11+48]; +ld.shared.f32 f237, [r11+60]; +ld.shared.f32 f238, [r11+72]; +ld.shared.f32 f239, [r11+84]; +ld.shared.f32 f240, [r11+96]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+12]; +ld.shared.f32 f243, [r11+24]; +ld.shared.f32 f244, [r11+36]; +ld.shared.f32 f245, [r11+48]; +ld.shared.f32 f246, [r11+60]; +ld.shared.f32 f247, [r11+72]; +ld.shared.f32 f248, [r11+84]; +ld.shared.f32 f249, [r11+96]; +add.f32 f250, f235, f238; +add.f32 f251, f244, f247; +mul.f32 f252, f250, 0f3F000000; +sub.f32 f253, f232, f252; +sub.f32 f254, f244, f247; +mul.f32 f255, f254, 0f3F5DB3D7; +mul.f32 f256, f251, 0f3F000000; +sub.f32 f257, f241, f256; +sub.f32 f258, f235, f238; +mul.f32 f259, f258, 0f3F5DB3D7; +add.f32 f260, f236, f239; +add.f32 f261, f245, f248; +mul.f32 f262, f260, 0f3F000000; +sub.f32 f263, f233, f262; +sub.f32 f264, f245, f248; +mul.f32 f265, f264, 0f3F5DB3D7; +mul.f32 f266, f261, 0f3F000000; +sub.f32 f267, f242, f266; +sub.f32 f268, f236, f239; +mul.f32 f269, f268, 0f3F5DB3D7; +add.f32 f270, f237, f240; +add.f32 f271, f246, f249; +mul.f32 f272, f270, 0f3F000000; +sub.f32 f273, f234, f272; +sub.f32 f274, f246, f249; +mul.f32 f275, f274, 0f3F5DB3D7; +mul.f32 f276, f271, 0f3F000000; 
+sub.f32 f277, f243, f276; +sub.f32 f278, f237, f240; +mul.f32 f279, f278, 0f3F5DB3D7; +add.f32 %0, f232, f250; +add.f32 %1, f241, f251; +add.f32 %2, f233, f260; +add.f32 %3, f242, f261; +add.f32 %4, f234, f270; +add.f32 %5, f243, f271; +add.f32 %6, f255, f253; +sub.f32 %7, f257, f259; +add.f32 %8, f265, f263; +sub.f32 %9, f267, f269; +add.f32 %10, f275, f273; +sub.f32 %11, f277, f279; +sub.f32 %12, f253, f255; +add.f32 %13, f259, f257; +sub.f32 %14, f263, f265; +add.f32 %15, f269, f267; +sub.f32 %16, f273, f275; +add.f32 %17, f279, f277; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<124, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<123>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 216, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %11, %14; +add.f32 f14, %13, %15; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %9, f15; +sub.f32 f17, %13, %15; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %10, f21; +sub.f32 f23, %11, %14; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; 
+cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %10, f14; +add.f32 f43, %9, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r9+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r9+16], {f46, f47}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+72]; +ld.shared.v2.f32 {f56, f57}, [r11+144]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0f3F5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f74, f66; +mul.f32 f79, f75, f72; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f83, f67; +mul.f32 f87, f85, f73; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; 
+st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f75, f66, f80; +sub.f32 f92, f78, f79; +st.shared.v2.f32 [r17+24], {f92, f91}; +fma.rn.f32 f93, f85, f67, f88; +sub.f32 f94, f86, f87; +st.shared.v2.f32 [r17+48], {f94, f93}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+72]; +ld.shared.v2.f32 {f103, f104}, [r11+144]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0f3F5DB3D7; +mul.f32 f113, f108, 0f3F000000; +sub.f32 f114, f96, f113; +sub.f32 f115, f99, f103; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 %1, f96, f108; +add.f32 %0, f95, f107; +sub.f32 %3, f114, f116; +add.f32 %2, f112, f110; +add.f32 %5, f116, f114; +sub.f32 %4, f110, f112; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<125, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<111>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 108, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %11, %14; +add.f32 f14, %9, f13; +add.f32 f15, %13, %15; +add.f32 f16, %10, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %9, f17; +sub.f32 f19, %13, %15; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %10, f23; +sub.f32 f25, %11, %14; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 108, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 
rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f35; +st.shared.f32 [r9+8], f45; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+36]; +ld.shared.f32 f50, [r11+72]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+36]; +ld.shared.f32 f53, [r11+72]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0f3F5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0f3F5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f70, f62; +mul.f32 f75, f71, f68; +sub.f32 f76, f74, f75; +mul.f32 f77, f70, f68; +fma.rn.f32 f78, f71, f62, f77; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f81, f63; +mul.f32 f85, f83, f69; +sub.f32 f86, f84, f85; +mul.f32 f87, f81, f69; +fma.rn.f32 f88, f83, f63, f87; 
+barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f76; +st.shared.f32 [r17+24], f86; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+36]; +ld.shared.f32 f91, [r11+72]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+36]; +ld.shared.f32 f94, [r11+72]; +add.f32 f95, f90, f91; +add.f32 f96, f93, f94; +mul.f32 f97, f95, 0f3F000000; +sub.f32 f98, f89, f97; +sub.f32 f99, f93, f94; +mul.f32 f100, f99, 0f3F5DB3D7; +mul.f32 f101, f96, 0f3F000000; +sub.f32 f102, f92, f101; +sub.f32 f103, f90, f91; +mul.f32 f104, f103, 0f3F5DB3D7; +add.f32 %0, f89, f95; +add.f32 %1, f92, f96; +add.f32 %2, f100, f98; +sub.f32 %3, f102, f104; +sub.f32 %4, f98, f100; +add.f32 %5, f104, f102; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..16b476296eeae --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp32_inv.hpp.inc @@ -0,0 +1,1480 @@ +#ifndef CUFFTDX_FFT_27_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_27_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<323, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<796>; +.reg .b64 rd<5>; +add.f32 f109, %72, %90; +add.f32 f110, %54, f109; +mul.f32 f113, f109, 0f3F000000; 
+sub.f32 f114, %54, f113; +add.f32 f789, %73, %91; +sub.f32 f115, %73, %91; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f788, %55, f789; +mul.f32 f119, f789, 0f3F000000; +sub.f32 f120, %55, f119; +sub.f32 f121, %72, %90; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %78, %96; +add.f32 f126, %60, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %60, f129; +add.f32 f787, %79, %97; +sub.f32 f131, %79, %97; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f786, %61, f787; +mul.f32 f135, f787, 0f3F000000; +sub.f32 f136, %61, f135; +sub.f32 f137, %78, %96; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %84, %102; +add.f32 f142, %66, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %66, f145; +add.f32 f785, %85, %103; +sub.f32 f147, %85, %103; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f784, %67, f785; +mul.f32 f151, f785, 0f3F000000; +sub.f32 f152, %67, f151; +sub.f32 f153, %84, %102; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f783, f133, 0f3F441B7D; +sub.f32 f159, f783, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f163, f155, 0f3F7C1C5C; +mul.f32 f782, f149, 0f3E31D0D4; +sub.f32 f164, f782, f163; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f168, f140, 0f3F7C1C5C; +mul.f32 f781, f134, 0f3E31D0D4; +sub.f32 f169, f781, f168; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f173, f156, 0f3EAF1D44; +mul.f32 f780, f150, 0fBF708FB2; +sub.f32 f174, f780, f173; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 
0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f779, f786, f784; +sub.f32 f183, f786, f784; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f778, f788, f779; +mul.f32 f187, f779, 0f3F000000; +sub.f32 f188, f788, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f777, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f776, f123, f777; +mul.f32 f203, f777, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f775, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f774, f124, f775; +mul.f32 f219, f775, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %74, %92; +add.f32 f226, %56, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %56, f229; +add.f32 f771, %109, %108; +sub.f32 f231, %109, %108; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f769, %110, f771; +mul.f32 f235, f771, 0f3F000000; +sub.f32 f236, %110, f235; +sub.f32 f237, %74, %92; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %80, %98; +add.f32 f242, %62, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %62, f245; +add.f32 f766, %111, %112; +sub.f32 f247, %111, %112; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; 
+add.f32 f764, %113, f766; +mul.f32 f251, f766, 0f3F000000; +sub.f32 f252, %113, f251; +sub.f32 f253, %80, %98; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %86, %104; +add.f32 f258, %68, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %68, f261; +add.f32 f761, %115, %114; +sub.f32 f263, %115, %114; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f759, %116, f761; +mul.f32 f267, f761, 0f3F000000; +sub.f32 f268, %116, f267; +sub.f32 f269, %86, %104; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f758, f249, 0f3F441B7D; +sub.f32 f275, f758, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f757, f265, 0f3E31D0D4; +sub.f32 f280, f757, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f284, f256, 0f3F7C1C5C; +mul.f32 f756, f250, 0f3E31D0D4; +sub.f32 f285, f756, f284; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f289, f272, 0f3EAF1D44; +mul.f32 f755, f266, 0fBF708FB2; +sub.f32 f290, f755, f289; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f754, f764, f759; +sub.f32 f299, f764, f759; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f753, f769, f754; +mul.f32 f303, f754, 0f3F000000; +sub.f32 f304, f769, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f752, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, 
f316, f314; +sub.f32 f318, f314, f316; +add.f32 f751, f239, f752; +mul.f32 f319, f752, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f750, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f749, f240, f750; +mul.f32 f335, f750, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %76, %94; +add.f32 f342, %58, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %58, f345; +add.f32 f746, %118, %117; +sub.f32 f347, %118, %117; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f744, %119, f746; +mul.f32 f351, f746, 0f3F000000; +sub.f32 f352, %119, f351; +sub.f32 f353, %76, %94; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %82, %100; +add.f32 f358, %64, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %64, f361; +add.f32 f741, %121, %120; +sub.f32 f363, %121, %120; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f739, %122, f741; +mul.f32 f367, f741, 0f3F000000; +sub.f32 f368, %122, f367; +sub.f32 f369, %82, %100; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %88, %106; +add.f32 f374, %70, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %70, f377; +add.f32 f737, %123, %107; +sub.f32 f379, %123, %107; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f735, %124, f737; +mul.f32 f383, f737, 0f3F000000; +sub.f32 f384, %124, f383; +sub.f32 f385, %88, %106; +mul.f32 f386, f385, 0fBF5DB3D7; 
+sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f734, f365, 0f3F441B7D; +sub.f32 f391, f734, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f733, f381, 0f3E31D0D4; +sub.f32 f396, f733, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f400, f372, 0f3F7C1C5C; +mul.f32 f732, f366, 0f3E31D0D4; +sub.f32 f401, f732, f400; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f405, f388, 0f3EAF1D44; +mul.f32 f731, f382, 0fBF708FB2; +sub.f32 f406, f731, f405; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f730, f739, f735; +sub.f32 f415, f739, f735; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f729, f744, f730; +mul.f32 f419, f730, 0f3F000000; +sub.f32 f420, f744, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f728, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f727, f355, f728; +mul.f32 f435, f728, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f726, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f725, f356, f726; +mul.f32 f451, f726, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, 
f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f723, f310, 0f3F791978; +mul.f32 f724, f751, 0f3E6C2691; +sub.f32 f459, f723, f724; +mul.f32 f460, f751, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f721, f426, 0f3F64C51C; +mul.f32 f722, f727, 0f3EE5C902; +sub.f32 f464, f721, f722; +mul.f32 f465, f727, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f468, f749, 0f3EE5C902; +mul.f32 f720, f326, 0f3F64C51C; +sub.f32 f469, f720, f468; +mul.f32 f470, f749, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f473, f725, 0f3F4D57F2; +mul.f32 f719, f442, 0f3F18DF63; +sub.f32 f474, f719, f473; +mul.f32 f475, f725, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f478, f307, 0f3F248DBB; +mul.f32 f718, f301, 0f3F441B7D; +sub.f32 f479, f718, f478; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f717, f417, 0f3E31D0D4; +sub.f32 f484, f717, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f716, f317, 0f3F18DF63; +sub.f32 f489, f716, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f715, f433, 0fBE92D7E0; +sub.f32 f494, f715, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f713, f333, 0f3ECACAF8; +mul.f32 f714, f339, 0f3F6B1036; +sub.f32 f499, f713, f714; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f711, f449, 0fBF2FAD88; +mul.f32 f712, f455, 0f3F3A3529; +sub.f32 f504, f711, f712; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f709, f302, 0f3E31D0D4; +mul.f32 f710, f308, 0f3F7C1C5C; +sub.f32 f509, f709, f710; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f707, f418, 0fBF708FB2; +mul.f32 f708, 
f424, 0f3EAF1D44; +sub.f32 f514, f707, f708; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f518, f324, 0f3F7F9120; +mul.f32 f706, f318, 0fBD6E2946; +sub.f32 f519, f706, f518; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f523, f440, 0fBDEDC21F; +mul.f32 f705, f434, 0fBF7E44DE; +sub.f32 f524, f705, f523; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f704, f334, 0fBE92D7E0; +sub.f32 f529, f704, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f703, f450, 0fBF55E287; +sub.f32 f534, f703, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f702, f753, f729; +sub.f32 f541, f753, f729; +mul.f32 f542, f541, 0fBF5DB3D7; +mul.f32 f543, f702, 0f3F000000; +sub.f32 f544, f778, f543; +sub.f32 f545, f294, f410; +mul.f32 f546, f545, 0fBF5DB3D7; +add.f32 f547, f459, f464; +mul.f32 f549, f547, 0f3F000000; +sub.f32 f550, f194, f549; +add.f32 f701, f461, f466; +sub.f32 f551, f461, f466; +mul.f32 f552, f551, 0fBF5DB3D7; +mul.f32 f553, f701, 0f3F000000; +sub.f32 f554, f776, f553; +sub.f32 f555, f459, f464; +mul.f32 f556, f555, 0fBF5DB3D7; +add.f32 f557, f469, f474; +mul.f32 f559, f557, 0f3F000000; +sub.f32 f560, f210, f559; +add.f32 f700, f471, f476; +sub.f32 f561, f471, f476; +mul.f32 f562, f561, 0fBF5DB3D7; +mul.f32 f563, f700, 0f3F000000; +sub.f32 f564, f774, f563; +sub.f32 f565, f469, f474; +mul.f32 f566, f565, 0fBF5DB3D7; +add.f32 f567, f479, f484; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f185, f569; +add.f32 f699, f481, f486; +sub.f32 f571, f481, f486; +mul.f32 f572, f571, 0fBF5DB3D7; +mul.f32 f573, f699, 0f3F000000; +sub.f32 f574, f191, f573; +sub.f32 f575, f479, f484; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f489, f494; 
+mul.f32 f579, f577, 0f3F000000; +sub.f32 f580, f201, f579; +add.f32 f698, f491, f496; +sub.f32 f581, f491, f496; +mul.f32 f582, f581, 0fBF5DB3D7; +mul.f32 f583, f698, 0f3F000000; +sub.f32 f584, f207, f583; +sub.f32 f585, f489, f494; +mul.f32 f586, f585, 0fBF5DB3D7; +add.f32 f587, f499, f504; +mul.f32 f589, f587, 0f3F000000; +sub.f32 f590, f217, f589; +add.f32 f697, f501, f506; +sub.f32 f591, f501, f506; +mul.f32 f592, f591, 0fBF5DB3D7; +mul.f32 f593, f697, 0f3F000000; +sub.f32 f594, f223, f593; +sub.f32 f595, f499, f504; +mul.f32 f596, f595, 0fBF5DB3D7; +add.f32 f597, f509, f514; +mul.f32 f599, f597, 0f3F000000; +sub.f32 f600, f186, f599; +add.f32 f696, f511, f516; +sub.f32 f601, f511, f516; +mul.f32 f602, f601, 0fBF5DB3D7; +mul.f32 f603, f696, 0f3F000000; +sub.f32 f604, f192, f603; +sub.f32 f605, f509, f514; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f519, f524; +mul.f32 f609, f607, 0f3F000000; +sub.f32 f610, f202, f609; +add.f32 f695, f521, f526; +sub.f32 f611, f521, f526; +mul.f32 f612, f611, 0fBF5DB3D7; +mul.f32 f613, f695, 0f3F000000; +sub.f32 f614, f208, f613; +sub.f32 f615, f519, f524; +mul.f32 f616, f615, 0fBF5DB3D7; +add.f32 f617, f529, f534; +mul.f32 f619, f617, 0f3F000000; +sub.f32 f620, f218, f619; +add.f32 f694, f531, f536; +sub.f32 f621, f531, f536; +mul.f32 f622, f621, 0fBF5DB3D7; +mul.f32 f623, f694, 0f3F000000; +sub.f32 f624, f224, f623; +sub.f32 f625, f529, f534; +mul.f32 f791, f700, 0f3F000000; +sub.f32 f790, f774, f791; +mul.f32 f626, f625, 0fBF5DB3D7; +add.f32 %1, f778, f702; +mul.f32 f793, f537, 0f3F000000; +sub.f32 f792, f178, f793; +add.f32 %0, f178, f537; +mul.f32 f795, f701, 0f3F000000; +sub.f32 f794, f776, f795; +add.f32 %3, f776, f701; +add.f32 %2, f194, f547; +add.f32 %5, f774, f700; +add.f32 %4, f210, f557; +add.f32 %7, f191, f699; +add.f32 %6, f185, f567; +add.f32 %9, f207, f698; +add.f32 %8, f201, f577; +add.f32 %11, f223, f697; +add.f32 %10, f217, f587; +add.f32 %13, f192, f696; +add.f32 %12, f186, f597; +add.f32 %15, f208, 
f695; +add.f32 %14, f202, f607; +add.f32 %17, f224, f694; +add.f32 %16, f218, f617; +add.f32 %18, f542, f792; +sub.f32 %19, f544, f546; +sub.f32 %21, f794, f556; +add.f32 %20, f552, f550; +sub.f32 %23, f790, f566; +add.f32 %22, f562, f560; +sub.f32 %25, f574, f576; +add.f32 %24, f572, f570; +add.f32 %26, f582, f580; +sub.f32 %27, f584, f586; +add.f32 %28, f592, f590; +sub.f32 %29, f594, f596; +add.f32 %30, f602, f600; +sub.f32 %31, f604, f606; +add.f32 %32, f612, f610; +sub.f32 %33, f614, f616; +sub.f32 %35, f624, f626; +add.f32 %34, f622, f620; +add.f32 %37, f546, f544; +sub.f32 %36, f792, f542; +add.f32 %39, f556, f794; +sub.f32 %38, f550, f552; +add.f32 %41, f566, f790; +sub.f32 %40, f560, f562; +add.f32 %43, f576, f574; +sub.f32 %42, f570, f572; +add.f32 %45, f586, f584; +sub.f32 %44, f580, f582; +add.f32 %47, f596, f594; +sub.f32 %46, f590, f592; +add.f32 %49, f606, f604; +sub.f32 %48, f600, f602; +add.f32 %51, f616, f614; +sub.f32 %50, f610, f612; +add.f32 %53, f626, f624; +sub.f32 %52, f620, f622; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "f"(rmem[0].x), 
"f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[13].y), "f"(rmem[22].y), "f"(rmem[4].y), "f"(rmem[25].y), "f"(rmem[16].y), "f"(rmem[7].y), "f"(rmem[20].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<325, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<316>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 216, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; 
+sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 
f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 
f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r9+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r9+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r9+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r9+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r9+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r9+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r9+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r9+64], {f230, f231}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+24]; +ld.shared.v2.f32 {f240, f241}, [r11+48]; +ld.shared.v2.f32 {f244, f245}, [r11+72]; +ld.shared.v2.f32 {f248, f249}, [r11+96]; +ld.shared.v2.f32 {f252, f253}, [r11+120]; +ld.shared.v2.f32 {f256, f257}, [r11+144]; +ld.shared.v2.f32 {f260, f261}, [r11+168]; +ld.shared.v2.f32 {f264, f265}, [r11+192]; +add.f32 f268, f244, f256; +add.f32 
f269, f245, f257; +mul.f32 f270, f268, 0f3F000000; +sub.f32 f271, f232, f270; +sub.f32 f272, f245, f257; +mul.f32 f273, f272, 0fBF5DB3D7; +mul.f32 f274, f269, 0f3F000000; +sub.f32 f275, f233, f274; +sub.f32 f276, f244, f256; +mul.f32 f277, f276, 0fBF5DB3D7; +add.f32 f278, f248, f260; +add.f32 f279, f249, f261; +mul.f32 f280, f278, 0f3F000000; +sub.f32 f281, f236, f280; +sub.f32 f282, f249, f261; +mul.f32 f283, f282, 0fBF5DB3D7; +mul.f32 f284, f279, 0f3F000000; +sub.f32 f285, f237, f284; +sub.f32 f286, f248, f260; +mul.f32 f287, f286, 0fBF5DB3D7; +add.f32 f288, f252, f264; +add.f32 f289, f253, f265; +mul.f32 f290, f288, 0f3F000000; +sub.f32 f291, f240, f290; +sub.f32 f292, f253, f265; +mul.f32 f293, f292, 0fBF5DB3D7; +mul.f32 f294, f289, 0f3F000000; +sub.f32 f295, f241, f294; +sub.f32 f296, f252, f264; +mul.f32 f297, f296, 0fBF5DB3D7; +add.f32 %1, f233, f269; +add.f32 %0, f232, f268; +add.f32 %3, f237, f279; +add.f32 %2, f236, f278; +add.f32 %5, f241, f289; +add.f32 %4, f240, f288; +sub.f32 %7, f275, f277; +add.f32 %6, f273, f271; +sub.f32 %9, f285, f287; +add.f32 %8, f283, f281; +sub.f32 %11, f295, f297; +add.f32 %10, f293, f291; +add.f32 %13, f277, f275; +sub.f32 %12, f271, f273; +add.f32 %15, f287, f285; +sub.f32 %14, f281, f283; +add.f32 %17, f297, f295; +sub.f32 %16, f291, f293; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), 
"f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<324, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<298>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 108, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; 
+fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; 
+mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; +mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r8, r5, 108, r3; +barrier.sync 0; +mad.lo.s32 r9, 
r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f158; +st.shared.f32 [r9+8], f168; +st.shared.f32 [r9+12], f178; +st.shared.f32 [r9+16], f188; +st.shared.f32 [r9+20], f198; +st.shared.f32 [r9+24], f208; +st.shared.f32 [r9+28], f218; +st.shared.f32 [r9+32], f228; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+12]; +ld.shared.f32 f234, [r11+24]; +ld.shared.f32 f235, [r11+36]; +ld.shared.f32 f236, [r11+48]; +ld.shared.f32 f237, [r11+60]; +ld.shared.f32 f238, [r11+72]; +ld.shared.f32 f239, [r11+84]; +ld.shared.f32 f240, [r11+96]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+12]; +ld.shared.f32 f243, [r11+24]; +ld.shared.f32 f244, [r11+36]; +ld.shared.f32 f245, [r11+48]; +ld.shared.f32 f246, [r11+60]; +ld.shared.f32 f247, [r11+72]; +ld.shared.f32 f248, [r11+84]; +ld.shared.f32 f249, [r11+96]; +add.f32 f250, f235, f238; +add.f32 f251, f244, f247; +mul.f32 f252, f250, 0f3F000000; +sub.f32 f253, f232, f252; +sub.f32 f254, f244, f247; +mul.f32 f255, f254, 0fBF5DB3D7; +mul.f32 f256, f251, 0f3F000000; +sub.f32 f257, f241, f256; +sub.f32 f258, f235, f238; +mul.f32 f259, f258, 0fBF5DB3D7; +add.f32 f260, f236, f239; +add.f32 f261, f245, f248; +mul.f32 f262, f260, 0f3F000000; +sub.f32 f263, f233, f262; +sub.f32 f264, f245, f248; +mul.f32 f265, f264, 0fBF5DB3D7; +mul.f32 f266, f261, 0f3F000000; +sub.f32 f267, f242, f266; +sub.f32 f268, f236, f239; +mul.f32 f269, f268, 0fBF5DB3D7; +add.f32 f270, f237, f240; +add.f32 f271, f246, f249; +mul.f32 f272, f270, 0f3F000000; +sub.f32 f273, f234, f272; +sub.f32 f274, f246, f249; +mul.f32 f275, f274, 0fBF5DB3D7; +mul.f32 f276, f271, 0f3F000000; +sub.f32 f277, 
f243, f276; +sub.f32 f278, f237, f240; +mul.f32 f279, f278, 0fBF5DB3D7; +add.f32 %0, f232, f250; +add.f32 %1, f241, f251; +add.f32 %2, f233, f260; +add.f32 %3, f242, f261; +add.f32 %4, f234, f270; +add.f32 %5, f243, f271; +add.f32 %6, f255, f253; +sub.f32 %7, f257, f259; +add.f32 %8, f265, f263; +sub.f32 %9, f267, f269; +add.f32 %10, f275, f273; +sub.f32 %11, f277, f279; +sub.f32 %12, f253, f255; +add.f32 %13, f259, f257; +sub.f32 %14, f263, f265; +add.f32 %15, f269, f267; +sub.f32 %16, f273, f275; +add.f32 %17, f279, f277; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_27), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<326, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<123>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 216, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %11, %14; +add.f32 f14, %13, %15; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %9, f15; +sub.f32 f17, %13, %15; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %10, f21; +sub.f32 f23, %11, %14; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; 
+mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %10, f14; +add.f32 f43, %9, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r9+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r9+16], {f47, f46}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+72]; +ld.shared.v2.f32 {f56, f57}, [r11+144]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0fBF5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f72, f75; +mul.f32 f79, f66, f75; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f73, f85; +mul.f32 f87, f67, f85; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; 
+fma.rn.f32 f91, f74, f66, f78; +sub.f32 f92, f80, f79; +st.shared.v2.f32 [r17+24], {f91, f92}; +fma.rn.f32 f93, f83, f67, f86; +sub.f32 f94, f88, f87; +st.shared.v2.f32 [r17+48], {f93, f94}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+72]; +ld.shared.v2.f32 {f103, f104}, [r11+144]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0fBF5DB3D7; +mul.f32 f113, f108, 0f3F000000; +sub.f32 f114, f96, f113; +sub.f32 f115, f99, f103; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 %1, f96, f108; +add.f32 %0, f95, f107; +sub.f32 %3, f114, f116; +add.f32 %2, f112, f110; +add.f32 %5, f116, f114; +sub.f32 %4, f110, f112; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<327, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<111>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 108, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %11, %14; +add.f32 f14, %9, f13; +add.f32 f15, %13, %15; +add.f32 f16, %10, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %9, f17; +sub.f32 f19, %13, %15; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %10, f23; +sub.f32 f25, %11, %14; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 108, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 
{f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; +sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f34; +st.shared.f32 [r9+8], f44; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+36]; +ld.shared.f32 f50, [r11+72]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+36]; +ld.shared.f32 f53, [r11+72]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0fBF5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0fBF5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f68, f71; +fma.rn.f32 f75, f70, f62, f74; +mul.f32 f76, f62, f71; +mul.f32 f77, f70, f68; +sub.f32 f78, f77, f76; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f69, f83; +fma.rn.f32 f85, f81, f63, f84; +mul.f32 f86, f63, f83; +mul.f32 f87, f81, f69; +sub.f32 f88, f87, f86; +barrier.sync 0; +mad.lo.s32 r17, 
r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f75; +st.shared.f32 [r17+24], f85; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+36]; +ld.shared.f32 f91, [r11+72]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+36]; +ld.shared.f32 f94, [r11+72]; +add.f32 f95, f90, f91; +add.f32 f96, f93, f94; +mul.f32 f97, f95, 0f3F000000; +sub.f32 f98, f89, f97; +sub.f32 f99, f93, f94; +mul.f32 f100, f99, 0fBF5DB3D7; +mul.f32 f101, f96, 0f3F000000; +sub.f32 f102, f92, f101; +sub.f32 f103, f90, f91; +mul.f32 f104, f103, 0fBF5DB3D7; +add.f32 %0, f89, f95; +add.f32 %1, f92, f96; +add.f32 %2, f100, f98; +sub.f32 %3, f102, f104; +sub.f32 %4, f98, f100; +add.f32 %5, f104, f102; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..1cff089215251 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_fwd.hpp.inc @@ -0,0 +1,1456 @@ +#ifndef CUFFTDX_FFT_27_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_27_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<504, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<796>; +.reg .b64 rd<5>; +add.f64 fd109, %72, %90; +add.f64 fd110, %54, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, 
%54, fd113; +add.f64 fd789, %73, %91; +sub.f64 fd115, %73, %91; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd788, %55, fd789; +mul.f64 fd119, fd789, 0d3FE0000000000000; +sub.f64 fd120, %55, fd119; +sub.f64 fd121, %72, %90; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %78, %96; +add.f64 fd126, %60, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %60, fd129; +add.f64 fd787, %79, %97; +sub.f64 fd131, %79, %97; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd786, %61, fd787; +mul.f64 fd135, fd787, 0d3FE0000000000000; +sub.f64 fd136, %61, fd135; +sub.f64 fd137, %78, %96; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %84, %102; +add.f64 fd142, %66, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %66, fd145; +add.f64 fd785, %85, %103; +sub.f64 fd147, %85, %103; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd784, %67, fd785; +mul.f64 fd151, fd785, 0d3FE0000000000000; +sub.f64 fd152, %67, fd151; +sub.f64 fd153, %84, %102; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd783, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd783, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd163, fd155, 0dBFEF838B8C811C17; +mul.f64 fd782, fd149, 0d3FC63A1A7E0B738A; +sub.f64 fd164, fd782, fd163; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd168, fd140, 0dBFEF838B8C811C17; +mul.f64 fd781, fd134, 0d3FC63A1A7E0B738A; +sub.f64 fd169, fd781, fd168; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; 
+fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd173, fd156, 0dBFD5E3A8748A0BF5; +mul.f64 fd780, fd150, 0dBFEE11F642522D1C; +sub.f64 fd174, fd780, fd173; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd779, fd786, fd784; +sub.f64 fd183, fd786, fd784; +mul.f64 fd184, fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd778, fd788, fd779; +mul.f64 fd187, fd779, 0d3FE0000000000000; +sub.f64 fd188, fd788, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd777, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd776, fd123, fd777; +mul.f64 fd203, fd777, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd775, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd774, fd124, fd775; +mul.f64 fd219, fd775, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %74, %92; +add.f64 fd226, %56, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %56, fd229; +add.f64 
fd771, %109, %108; +sub.f64 fd231, %109, %108; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd769, %110, fd771; +mul.f64 fd235, fd771, 0d3FE0000000000000; +sub.f64 fd236, %110, fd235; +sub.f64 fd237, %74, %92; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %80, %98; +add.f64 fd242, %62, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %62, fd245; +add.f64 fd766, %111, %112; +sub.f64 fd247, %111, %112; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd764, %113, fd766; +mul.f64 fd251, fd766, 0d3FE0000000000000; +sub.f64 fd252, %113, fd251; +sub.f64 fd253, %80, %98; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %86, %104; +add.f64 fd258, %68, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %68, fd261; +add.f64 fd761, %115, %114; +sub.f64 fd263, %115, %114; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd759, %116, fd761; +mul.f64 fd267, fd761, 0d3FE0000000000000; +sub.f64 fd268, %116, fd267; +sub.f64 fd269, %86, %104; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd758, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd758, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd757, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd757, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd284, fd256, 0dBFEF838B8C811C17; +mul.f64 fd756, fd250, 0d3FC63A1A7E0B738A; +sub.f64 fd285, fd756, fd284; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; 
+fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd289, fd272, 0dBFD5E3A8748A0BF5; +mul.f64 fd755, fd266, 0dBFEE11F642522D1C; +sub.f64 fd290, fd755, fd289; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd754, fd764, fd759; +sub.f64 fd299, fd764, fd759; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd753, fd769, fd754; +mul.f64 fd303, fd754, 0d3FE0000000000000; +sub.f64 fd304, fd769, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd752, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd751, fd239, fd752; +mul.f64 fd319, fd752, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd750, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd749, fd240, fd750; +mul.f64 fd335, fd750, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %76, %94; +add.f64 fd342, %58, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %58, fd345; +add.f64 
fd746, %118, %117; +sub.f64 fd347, %118, %117; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd744, %119, fd746; +mul.f64 fd351, fd746, 0d3FE0000000000000; +sub.f64 fd352, %119, fd351; +sub.f64 fd353, %76, %94; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %82, %100; +add.f64 fd358, %64, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %64, fd361; +add.f64 fd741, %121, %120; +sub.f64 fd363, %121, %120; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd739, %122, fd741; +mul.f64 fd367, fd741, 0d3FE0000000000000; +sub.f64 fd368, %122, fd367; +sub.f64 fd369, %82, %100; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %88, %106; +add.f64 fd374, %70, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %70, fd377; +add.f64 fd737, %123, %107; +sub.f64 fd379, %123, %107; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd735, %124, fd737; +mul.f64 fd383, fd737, 0d3FE0000000000000; +sub.f64 fd384, %124, fd383; +sub.f64 fd385, %88, %106; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd734, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd734, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd733, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd733, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd400, fd372, 0dBFEF838B8C811C17; +mul.f64 fd732, fd366, 0d3FC63A1A7E0B738A; +sub.f64 fd401, fd732, fd400; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; 
+fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd405, fd388, 0dBFD5E3A8748A0BF5; +mul.f64 fd731, fd382, 0dBFEE11F642522D1C; +sub.f64 fd406, fd731, fd405; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd730, fd739, fd735; +sub.f64 fd415, fd739, fd735; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd729, fd744, fd730; +mul.f64 fd419, fd730, 0d3FE0000000000000; +sub.f64 fd420, fd744, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd728, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd727, fd355, fd728; +mul.f64 fd435, fd728, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd726, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd725, fd356, fd726; +mul.f64 fd451, fd726, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd723, fd310, 0d3FEF232EFF15C9E6; +mul.f64 fd724, fd751, 0dBFCD84D223638000; +sub.f64 fd459, fd723, fd724; +mul.f64 fd460, fd751, 
0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd721, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd722, fd727, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd721, fd722; +mul.f64 fd465, fd727, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd468, fd749, 0dBFDCB920325BAFA6; +mul.f64 fd720, fd326, 0d3FEC98A37A9A7850; +sub.f64 fd469, fd720, fd468; +mul.f64 fd470, fd749, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd473, fd725, 0dBFE9AAFE4207DF5F; +mul.f64 fd719, fd442, 0d3FE31BEC55BC71BC; +sub.f64 fd474, fd719, fd473; +mul.f64 fd475, fd725, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd478, fd307, 0dBFE491B7523C161D; +mul.f64 fd718, fd301, 0d3FE8836FA2CF5039; +sub.f64 fd479, fd718, fd478; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd717, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd717, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd716, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd716, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd715, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd715, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd713, fd333, 0d3FD9595EF26FB670; +mul.f64 fd714, fd339, 0dBFED6206BEB6C24B; +sub.f64 fd499, fd713, fd714; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd711, fd449, 0dBFE5F5B105F99707; +mul.f64 fd712, fd455, 0dBFE746A51650EADE; +sub.f64 fd504, fd711, fd712; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd709, 
fd302, 0d3FC63A1A7E0B738A; +mul.f64 fd710, fd308, 0dBFEF838B8C811C17; +sub.f64 fd509, fd709, fd710; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd707, fd418, 0dBFEE11F642522D1C; +mul.f64 fd708, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd707, fd708; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd518, fd324, 0dBFEFF223F3635CE3; +mul.f64 fd706, fd318, 0dBFADC528B5343A86; +sub.f64 fd519, fd706, fd518; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd523, fd440, 0d3FBDB843E577175E; +mul.f64 fd705, fd434, 0dBFEFC89BCEF44CF4; +sub.f64 fd524, fd705, fd523; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd704, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd704, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd703, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd703, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd702, fd753, fd729; +sub.f64 fd541, fd753, fd729; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +mul.f64 fd543, fd702, 0d3FE0000000000000; +sub.f64 fd544, fd778, fd543; +sub.f64 fd545, fd294, fd410; +mul.f64 fd546, fd545, 0d3FEBB67AE8584CAA; +add.f64 fd547, fd459, fd464; +mul.f64 fd549, fd547, 0d3FE0000000000000; +sub.f64 fd550, fd194, fd549; +add.f64 fd701, fd461, fd466; +sub.f64 fd551, fd461, fd466; +mul.f64 fd552, fd551, 0d3FEBB67AE8584CAA; +mul.f64 fd553, fd701, 0d3FE0000000000000; +sub.f64 fd554, fd776, fd553; +sub.f64 fd555, fd459, fd464; +mul.f64 fd556, fd555, 0d3FEBB67AE8584CAA; +add.f64 fd557, fd469, fd474; +mul.f64 
fd559, fd557, 0d3FE0000000000000; +sub.f64 fd560, fd210, fd559; +add.f64 fd700, fd471, fd476; +sub.f64 fd561, fd471, fd476; +mul.f64 fd562, fd561, 0d3FEBB67AE8584CAA; +mul.f64 fd563, fd700, 0d3FE0000000000000; +sub.f64 fd564, fd774, fd563; +sub.f64 fd565, fd469, fd474; +mul.f64 fd566, fd565, 0d3FEBB67AE8584CAA; +add.f64 fd567, fd479, fd484; +mul.f64 fd569, fd567, 0d3FE0000000000000; +sub.f64 fd570, fd185, fd569; +add.f64 fd699, fd481, fd486; +sub.f64 fd571, fd481, fd486; +mul.f64 fd572, fd571, 0d3FEBB67AE8584CAA; +mul.f64 fd573, fd699, 0d3FE0000000000000; +sub.f64 fd574, fd191, fd573; +sub.f64 fd575, fd479, fd484; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +add.f64 fd577, fd489, fd494; +mul.f64 fd579, fd577, 0d3FE0000000000000; +sub.f64 fd580, fd201, fd579; +add.f64 fd698, fd491, fd496; +sub.f64 fd581, fd491, fd496; +mul.f64 fd582, fd581, 0d3FEBB67AE8584CAA; +mul.f64 fd583, fd698, 0d3FE0000000000000; +sub.f64 fd584, fd207, fd583; +sub.f64 fd585, fd489, fd494; +mul.f64 fd586, fd585, 0d3FEBB67AE8584CAA; +add.f64 fd587, fd499, fd504; +mul.f64 fd589, fd587, 0d3FE0000000000000; +sub.f64 fd590, fd217, fd589; +add.f64 fd697, fd501, fd506; +sub.f64 fd591, fd501, fd506; +mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +mul.f64 fd593, fd697, 0d3FE0000000000000; +sub.f64 fd594, fd223, fd593; +sub.f64 fd595, fd499, fd504; +mul.f64 fd596, fd595, 0d3FEBB67AE8584CAA; +add.f64 fd597, fd509, fd514; +mul.f64 fd599, fd597, 0d3FE0000000000000; +sub.f64 fd600, fd186, fd599; +add.f64 fd696, fd511, fd516; +sub.f64 fd601, fd511, fd516; +mul.f64 fd602, fd601, 0d3FEBB67AE8584CAA; +mul.f64 fd603, fd696, 0d3FE0000000000000; +sub.f64 fd604, fd192, fd603; +sub.f64 fd605, fd509, fd514; +mul.f64 fd606, fd605, 0d3FEBB67AE8584CAA; +add.f64 fd607, fd519, fd524; +mul.f64 fd609, fd607, 0d3FE0000000000000; +sub.f64 fd610, fd202, fd609; +add.f64 fd695, fd521, fd526; +sub.f64 fd611, fd521, fd526; +mul.f64 fd612, fd611, 0d3FEBB67AE8584CAA; +mul.f64 fd613, fd695, 0d3FE0000000000000; +sub.f64 fd614, fd208, fd613; 
+sub.f64 fd615, fd519, fd524; +mul.f64 fd616, fd615, 0d3FEBB67AE8584CAA; +add.f64 fd617, fd529, fd534; +mul.f64 fd619, fd617, 0d3FE0000000000000; +sub.f64 fd620, fd218, fd619; +add.f64 fd694, fd531, fd536; +sub.f64 fd621, fd531, fd536; +mul.f64 fd622, fd621, 0d3FEBB67AE8584CAA; +mul.f64 fd623, fd694, 0d3FE0000000000000; +sub.f64 fd624, fd224, fd623; +sub.f64 fd625, fd529, fd534; +mul.f64 fd791, fd700, 0d3FE0000000000000; +sub.f64 fd790, fd774, fd791; +mul.f64 fd626, fd625, 0d3FEBB67AE8584CAA; +add.f64 %1, fd778, fd702; +mul.f64 fd793, fd537, 0d3FE0000000000000; +sub.f64 fd792, fd178, fd793; +add.f64 %0, fd178, fd537; +mul.f64 fd795, fd701, 0d3FE0000000000000; +sub.f64 fd794, fd776, fd795; +add.f64 %3, fd776, fd701; +add.f64 %2, fd194, fd547; +add.f64 %5, fd774, fd700; +add.f64 %4, fd210, fd557; +add.f64 %7, fd191, fd699; +add.f64 %6, fd185, fd567; +add.f64 %9, fd207, fd698; +add.f64 %8, fd201, fd577; +add.f64 %11, fd223, fd697; +add.f64 %10, fd217, fd587; +add.f64 %13, fd192, fd696; +add.f64 %12, fd186, fd597; +add.f64 %15, fd208, fd695; +add.f64 %14, fd202, fd607; +add.f64 %17, fd224, fd694; +add.f64 %16, fd218, fd617; +add.f64 %18, fd542, fd792; +sub.f64 %19, fd544, fd546; +sub.f64 %21, fd794, fd556; +add.f64 %20, fd552, fd550; +sub.f64 %23, fd790, fd566; +add.f64 %22, fd562, fd560; +sub.f64 %25, fd574, fd576; +add.f64 %24, fd572, fd570; +add.f64 %26, fd582, fd580; +sub.f64 %27, fd584, fd586; +add.f64 %28, fd592, fd590; +sub.f64 %29, fd594, fd596; +add.f64 %30, fd602, fd600; +sub.f64 %31, fd604, fd606; +add.f64 %32, fd612, fd610; +sub.f64 %33, fd614, fd616; +sub.f64 %35, fd624, fd626; +add.f64 %34, fd622, fd620; +add.f64 %37, fd546, fd544; +sub.f64 %36, fd792, fd542; +add.f64 %39, fd556, fd794; +sub.f64 %38, fd550, fd552; +add.f64 %41, fd566, fd790; +sub.f64 %40, fd560, fd562; +add.f64 %43, fd576, fd574; +sub.f64 %42, fd570, fd572; +add.f64 %45, fd586, fd584; +sub.f64 %44, fd580, fd582; +add.f64 %47, fd596, fd594; +sub.f64 %46, fd590, fd592; +add.f64 %49, fd606, 
fd604; +sub.f64 %48, fd600, fd602; +add.f64 %51, fd616, fd614; +sub.f64 %50, fd610, fd612; +add.f64 %53, fd626, fd624; +sub.f64 %52, fd620, fd622; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), 
"d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[13].y), "d"(rmem[22].y), "d"(rmem[4].y), "d"(rmem[25].y), "d"(rmem[16].y), "d"(rmem[7].y), "d"(rmem[20].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + +
/*
 * cufftdx_private_function<506, double, 1>
 *
 * Explicit specialization whose entire body is a single inline-PTX block. It reads nine
 * double-precision complex values from rmem[0..8] and overwrites rmem[0..8] with the results.
 *
 * Parameters:
 *   rmem - per-thread array of complex values; elements 0..8 are both inputs and outputs.
 *   smem - 32-bit base address used for the st.shared / ld.shared accesses (operand %18).
 *
 * What the PTX does, in order:
 *   1. Radix-3 butterflies over the register inputs grouped as (0,3,6), (1,4,7), (2,5,8),
 *      then multiplies by immediate constant factors, then a second layer of radix-3
 *      butterflies across the three groups. 0d3FE0000000000000 is 0.5 and
 *      0d3FEBB67AE8584CAA is sqrt(3)/2 (~0.8660254).
 *   2. Computes tid.x / 3 (multiply by 0xAAAAAAAB, shift right 33) and tid.x % 3, loads two
 *      16-byte entries from lut_dp_9_27 (operand %19) at byte offsets (tid.x % 3) * 16 and
 *      that + 48, derives further factors from them by repeated complex multiplication, and
 *      scales eight of the nine results. The first result is stored unscaled.
 *   3. barrier.sync 0, then stores the nine complex values as consecutive 16-byte pairs at
 *      shared address smem + tid.y * 432 + (tid.x / 3) * 432 + (tid.x % 3) * 144.
 *   4. barrier.sync 0, then reloads nine complex values at a 48-byte stride starting
 *      (tid.x % 3) * 16 bytes into the same 432-byte region, and applies a final layer of
 *      radix-3 butterflies whose results are written to operands %0..%17.
 *
 * Presumably this is the 27-point, 9-elements-per-thread stage suggested by the table name
 * lut_dp_9_27 (432 bytes = 27 * 16) -- TODO confirm against the generator.
 * NOTE(review): tid.y and tid.x / 3 are both scaled by 432 bytes; this looks like it assumes
 * tid.x < 3 so that the tid.x / 3 term is zero -- confirm against the launch configuration.
 */
+template<> __forceinline__ __device__ void cufftdx_private_function<506, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<315>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 432, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; 
+add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; 
+sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 432, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+48]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, 
fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r9+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r9+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; +fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r9+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r9+64], {fd222, fd221}; +fma.rn.f64 fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 [r9+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r9+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r9+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r9+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+48]; +ld.shared.v2.f64 {fd239, fd240}, [r11+96]; +ld.shared.v2.f64 {fd243, fd244}, [r11+144]; +ld.shared.v2.f64 {fd247, fd248}, [r11+192]; +ld.shared.v2.f64 {fd251, fd252}, [r11+240]; +ld.shared.v2.f64 {fd255, fd256}, [r11+288]; +ld.shared.v2.f64 {fd259, fd260}, [r11+336]; +ld.shared.v2.f64 {fd263, fd264}, [r11+384]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd244, fd256; +mul.f64 fd269, fd267, 0d3FE0000000000000; +sub.f64 fd270, fd231, fd269; 
+sub.f64 fd271, fd244, fd256; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +mul.f64 fd273, fd268, 0d3FE0000000000000; +sub.f64 fd274, fd232, fd273; +sub.f64 fd275, fd243, fd255; +mul.f64 fd276, fd275, 0d3FEBB67AE8584CAA; +add.f64 fd277, fd247, fd259; +add.f64 fd278, fd248, fd260; +mul.f64 fd279, fd277, 0d3FE0000000000000; +sub.f64 fd280, fd235, fd279; +sub.f64 fd281, fd248, fd260; +mul.f64 fd282, fd281, 0d3FEBB67AE8584CAA; +mul.f64 fd283, fd278, 0d3FE0000000000000; +sub.f64 fd284, fd236, fd283; +sub.f64 fd285, fd247, fd259; +mul.f64 fd286, fd285, 0d3FEBB67AE8584CAA; +add.f64 fd287, fd251, fd263; +add.f64 fd288, fd252, fd264; +mul.f64 fd289, fd287, 0d3FE0000000000000; +sub.f64 fd290, fd239, fd289; +sub.f64 fd291, fd252, fd264; +mul.f64 fd292, fd291, 0d3FEBB67AE8584CAA; +mul.f64 fd293, fd288, 0d3FE0000000000000; +sub.f64 fd294, fd240, fd293; +sub.f64 fd295, fd251, fd263; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +add.f64 %1, fd232, fd268; +add.f64 %0, fd231, fd267; +add.f64 %3, fd236, fd278; +add.f64 %2, fd235, fd277; +add.f64 %5, fd240, fd288; +add.f64 %4, fd239, fd287; +sub.f64 %7, fd274, fd276; +add.f64 %6, fd272, fd270; +sub.f64 %9, fd284, fd286; +add.f64 %8, fd282, fd280; +sub.f64 %11, fd294, fd296; +add.f64 %10, fd292, fd290; +add.f64 %13, fd276, fd274; +sub.f64 %12, fd270, fd272; +add.f64 %15, fd286, fd284; +sub.f64 %14, fd280, fd282; +add.f64 %17, fd296, fd294; +sub.f64 %16, fd290, fd292; +})" + /* Operands: %0..%17 are the outputs rmem[0..8].{x,y}; %18 is smem; %19 is lut_dp_9_27; %20..%42 are the rmem inputs. The .y components of rmem[1], [2], [4], [5] and [7] are listed twice, so operand indices %23, %26, %31, %34 and %39 are never referenced by the PTX above. */ : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), 
"d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<505, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<297>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 216, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; 
+mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; 
+add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; +sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+48]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, 
fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r8, r5, 216, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd159; +st.shared.f64 [r9+16], fd169; +st.shared.f64 [r9+24], fd179; +st.shared.f64 [r9+32], fd189; +st.shared.f64 [r9+40], fd198; +st.shared.f64 [r9+48], fd208; +st.shared.f64 [r9+56], fd218; +st.shared.f64 [r9+64], fd228; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+24]; +ld.shared.f64 fd233, [r11+48]; +ld.shared.f64 fd234, [r11+72]; +ld.shared.f64 fd235, [r11+96]; +ld.shared.f64 fd236, [r11+120]; +ld.shared.f64 fd237, [r11+144]; +ld.shared.f64 fd238, [r11+168]; +ld.shared.f64 fd239, [r11+192]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+24]; +ld.shared.f64 fd242, [r11+48]; +ld.shared.f64 fd243, [r11+72]; +ld.shared.f64 fd244, [r11+96]; +ld.shared.f64 fd245, [r11+120]; +ld.shared.f64 
fd246, [r11+144]; +ld.shared.f64 fd247, [r11+168]; +ld.shared.f64 fd248, [r11+192]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd243, fd246; +mul.f64 fd251, fd249, 0d3FE0000000000000; +sub.f64 fd252, fd231, fd251; +sub.f64 fd253, fd243, fd246; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +mul.f64 fd255, fd250, 0d3FE0000000000000; +sub.f64 fd256, fd240, fd255; +sub.f64 fd257, fd234, fd237; +mul.f64 fd258, fd257, 0d3FEBB67AE8584CAA; +add.f64 fd259, fd235, fd238; +add.f64 fd260, fd244, fd247; +mul.f64 fd261, fd259, 0d3FE0000000000000; +sub.f64 fd262, fd232, fd261; +sub.f64 fd263, fd244, fd247; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +mul.f64 fd265, fd260, 0d3FE0000000000000; +sub.f64 fd266, fd241, fd265; +sub.f64 fd267, fd235, fd238; +mul.f64 fd268, fd267, 0d3FEBB67AE8584CAA; +add.f64 fd269, fd236, fd239; +add.f64 fd270, fd245, fd248; +mul.f64 fd271, fd269, 0d3FE0000000000000; +sub.f64 fd272, fd233, fd271; +sub.f64 fd273, fd245, fd248; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +mul.f64 fd275, fd270, 0d3FE0000000000000; +sub.f64 fd276, fd242, fd275; +sub.f64 fd277, fd236, fd239; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +add.f64 %0, fd231, fd249; +add.f64 %1, fd240, fd250; +add.f64 %2, fd232, fd259; +add.f64 %3, fd241, fd260; +add.f64 %4, fd233, fd269; +add.f64 %5, fd242, fd270; +add.f64 %6, fd254, fd252; +sub.f64 %7, fd256, fd258; +add.f64 %8, fd264, fd262; +sub.f64 %9, fd266, fd268; +add.f64 %10, fd274, fd272; +sub.f64 %11, fd276, fd278; +sub.f64 %12, fd252, fd254; +add.f64 %13, fd258, fd256; +sub.f64 %14, fd262, fd264; +add.f64 %15, fd268, fd266; +sub.f64 %16, fd272, fd274; +add.f64 %17, fd278, fd276; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_27), "d"(rmem[0].x), 
"d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<507, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<109>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 216, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %11, %14; +add.f64 fd14, %9, fd13; +add.f64 fd15, %13, %15; +add.f64 fd16, %10, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %9, fd17; +sub.f64 fd19, %13, %15; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %10, fd23; +sub.f64 fd25, %11, %14; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+144]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd35; +st.shared.f64 [r9+16], fd44; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+72]; +ld.shared.f64 
fd49, [r11+144]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+72]; +ld.shared.f64 fd52, [r11+144]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd69, fd61; +mul.f64 fd74, fd70, fd67; +sub.f64 fd75, fd73, fd74; +mul.f64 fd76, fd69, fd67; +fma.rn.f64 fd77, fd70, fd61, fd76; +ld.global.v2.f64 {fd78, fd79}, [rd11+48]; +mul.f64 fd82, fd78, fd62; +mul.f64 fd83, fd79, fd68; +sub.f64 fd84, fd82, fd83; +mul.f64 fd85, fd78, fd68; +fma.rn.f64 fd86, fd79, fd62, fd85; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd75; +st.shared.f64 [r17+48], fd84; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+72]; +ld.shared.f64 fd89, [r11+144]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+72]; +ld.shared.f64 fd92, [r11+144]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd91, fd92; +mul.f64 fd95, fd93, 0d3FE0000000000000; +sub.f64 fd96, fd87, fd95; +sub.f64 fd97, fd91, fd92; +mul.f64 fd98, fd97, 0d3FEBB67AE8584CAA; +mul.f64 
fd99, fd94, 0d3FE0000000000000; +sub.f64 fd100, fd90, fd99; +sub.f64 fd101, fd88, fd89; +mul.f64 fd102, fd101, 0d3FEBB67AE8584CAA; +add.f64 %0, fd87, fd93; +add.f64 %1, fd90, fd94; +add.f64 %2, fd98, fd96; +sub.f64 %3, fd100, fd102; +sub.f64 %4, fd96, fd98; +add.f64 %5, fd102, fd100; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<508, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<121>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 432, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %11, %14; +add.f64 fd14, %13, %15; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %9, fd15; +sub.f64 fd17, %13, %15; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %10, fd21; +sub.f64 fd23, %11, %14; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 432, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+144]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %10, fd14; +add.f64 fd42, %9, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r9+16], {fd44, fd43}; 
+fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r9+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+144]; +ld.shared.v2.f64 {fd55, fd56}, [r11+288]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0d3FEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd73, fd65; +mul.f64 fd78, fd74, fd71; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+48]; +mul.f64 fd84, fd80, fd66; +mul.f64 fd85, fd81, fd72; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd74, fd65, fd79; +sub.f64 fd90, fd77, fd78; +st.shared.v2.f64 [r17+48], {fd90, fd89}; +fma.rn.f64 fd91, fd81, fd66, fd86; +sub.f64 fd92, fd84, fd85; +st.shared.v2.f64 [r17+96], {fd92, fd91}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+144]; +ld.shared.v2.f64 {fd101, fd102}, [r11+288]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +mul.f64 fd111, fd106, 0d3FE0000000000000; +sub.f64 fd112, fd94, fd111; +sub.f64 
fd113, fd97, fd101; +mul.f64 fd114, fd113, 0d3FEBB67AE8584CAA; +add.f64 %1, fd94, fd106; +add.f64 %0, fd93, fd105; +sub.f64 %3, fd112, fd114; +add.f64 %2, fd110, fd108; +add.f64 %5, fd114, fd112; +sub.f64 %4, fd108, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9e943d11fde4e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_27_fp64_inv.hpp.inc @@ -0,0 +1,1456 @@ +#ifndef CUFFTDX_FFT_27_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_27_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<675, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<796>; +.reg .b64 rd<5>; +add.f64 fd109, %72, %90; +add.f64 fd110, %54, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %54, fd113; +add.f64 fd789, %73, %91; +sub.f64 fd115, %73, %91; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd788, %55, fd789; +mul.f64 fd119, fd789, 0d3FE0000000000000; +sub.f64 fd120, %55, fd119; +sub.f64 fd121, %72, %90; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %78, %96; +add.f64 fd126, %60, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %60, fd129; +add.f64 fd787, %79, %97; +sub.f64 fd131, %79, %97; +mul.f64 fd132, fd131, 
0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd786, %61, fd787; +mul.f64 fd135, fd787, 0d3FE0000000000000; +sub.f64 fd136, %61, fd135; +sub.f64 fd137, %78, %96; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %84, %102; +add.f64 fd142, %66, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %66, fd145; +add.f64 fd785, %85, %103; +sub.f64 fd147, %85, %103; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd784, %67, fd785; +mul.f64 fd151, fd785, 0d3FE0000000000000; +sub.f64 fd152, %67, fd151; +sub.f64 fd153, %84, %102; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd783, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd783, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd163, fd155, 0d3FEF838B8C811C17; +mul.f64 fd782, fd149, 0d3FC63A1A7E0B738A; +sub.f64 fd164, fd782, fd163; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd168, fd140, 0d3FEF838B8C811C17; +mul.f64 fd781, fd134, 0d3FC63A1A7E0B738A; +sub.f64 fd169, fd781, fd168; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd173, fd156, 0d3FD5E3A8748A0BF5; +mul.f64 fd780, fd150, 0dBFEE11F642522D1C; +sub.f64 fd174, fd780, fd173; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd779, fd786, fd784; +sub.f64 fd183, fd786, fd784; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd778, fd788, fd779; 
+mul.f64 fd187, fd779, 0d3FE0000000000000; +sub.f64 fd188, fd788, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd777, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd776, fd123, fd777; +mul.f64 fd203, fd777, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd775, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd774, fd124, fd775; +mul.f64 fd219, fd775, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %74, %92; +add.f64 fd226, %56, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %56, fd229; +add.f64 fd771, %109, %108; +sub.f64 fd231, %109, %108; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd769, %110, fd771; +mul.f64 fd235, fd771, 0d3FE0000000000000; +sub.f64 fd236, %110, fd235; +sub.f64 fd237, %74, %92; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %80, %98; +add.f64 fd242, %62, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %62, fd245; +add.f64 fd766, %111, %112; +sub.f64 fd247, %111, %112; +mul.f64 fd248, fd247, 
0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd764, %113, fd766; +mul.f64 fd251, fd766, 0d3FE0000000000000; +sub.f64 fd252, %113, fd251; +sub.f64 fd253, %80, %98; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %86, %104; +add.f64 fd258, %68, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %68, fd261; +add.f64 fd761, %115, %114; +sub.f64 fd263, %115, %114; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd759, %116, fd761; +mul.f64 fd267, fd761, 0d3FE0000000000000; +sub.f64 fd268, %116, fd267; +sub.f64 fd269, %86, %104; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd758, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd758, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd757, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd757, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd284, fd256, 0d3FEF838B8C811C17; +mul.f64 fd756, fd250, 0d3FC63A1A7E0B738A; +sub.f64 fd285, fd756, fd284; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd289, fd272, 0d3FD5E3A8748A0BF5; +mul.f64 fd755, fd266, 0dBFEE11F642522D1C; +sub.f64 fd290, fd755, fd289; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd754, fd764, fd759; +sub.f64 fd299, fd764, fd759; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd753, fd769, fd754; 
+mul.f64 fd303, fd754, 0d3FE0000000000000; +sub.f64 fd304, fd769, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd752, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd751, fd239, fd752; +mul.f64 fd319, fd752, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd750, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd749, fd240, fd750; +mul.f64 fd335, fd750, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %76, %94; +add.f64 fd342, %58, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %58, fd345; +add.f64 fd746, %118, %117; +sub.f64 fd347, %118, %117; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd744, %119, fd746; +mul.f64 fd351, fd746, 0d3FE0000000000000; +sub.f64 fd352, %119, fd351; +sub.f64 fd353, %76, %94; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %82, %100; +add.f64 fd358, %64, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %64, fd361; +add.f64 fd741, %121, %120; +sub.f64 fd363, %121, %120; +mul.f64 fd364, fd363, 
0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd739, %122, fd741; +mul.f64 fd367, fd741, 0d3FE0000000000000; +sub.f64 fd368, %122, fd367; +sub.f64 fd369, %82, %100; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %88, %106; +add.f64 fd374, %70, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %70, fd377; +add.f64 fd737, %123, %107; +sub.f64 fd379, %123, %107; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd735, %124, fd737; +mul.f64 fd383, fd737, 0d3FE0000000000000; +sub.f64 fd384, %124, fd383; +sub.f64 fd385, %88, %106; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd734, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd734, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd733, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd733, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd400, fd372, 0d3FEF838B8C811C17; +mul.f64 fd732, fd366, 0d3FC63A1A7E0B738A; +sub.f64 fd401, fd732, fd400; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd405, fd388, 0d3FD5E3A8748A0BF5; +mul.f64 fd731, fd382, 0dBFEE11F642522D1C; +sub.f64 fd406, fd731, fd405; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd730, fd739, fd735; +sub.f64 fd415, fd739, fd735; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd729, fd744, 
fd730; +mul.f64 fd419, fd730, 0d3FE0000000000000; +sub.f64 fd420, fd744, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd728, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd727, fd355, fd728; +mul.f64 fd435, fd728, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd726, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd725, fd356, fd726; +mul.f64 fd451, fd726, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd723, fd310, 0d3FEF232EFF15C9E6; +mul.f64 fd724, fd751, 0d3FCD84D223638000; +sub.f64 fd459, fd723, fd724; +mul.f64 fd460, fd751, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd721, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd722, fd727, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd721, fd722; +mul.f64 fd465, fd727, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd468, fd749, 0d3FDCB920325BAFA6; +mul.f64 fd720, fd326, 0d3FEC98A37A9A7850; +sub.f64 fd469, fd720, fd468; +mul.f64 fd470, fd749, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd473, fd725, 0d3FE9AAFE4207DF5F; +mul.f64 fd719, fd442, 0d3FE31BEC55BC71BC; 
+sub.f64 fd474, fd719, fd473; +mul.f64 fd475, fd725, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd478, fd307, 0d3FE491B7523C161D; +mul.f64 fd718, fd301, 0d3FE8836FA2CF5039; +sub.f64 fd479, fd718, fd478; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd717, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd717, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd716, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd716, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd715, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd715, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd713, fd333, 0d3FD9595EF26FB670; +mul.f64 fd714, fd339, 0d3FED6206BEB6C24B; +sub.f64 fd499, fd713, fd714; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd711, fd449, 0dBFE5F5B105F99707; +mul.f64 fd712, fd455, 0d3FE746A51650EADE; +sub.f64 fd504, fd711, fd712; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd709, fd302, 0d3FC63A1A7E0B738A; +mul.f64 fd710, fd308, 0d3FEF838B8C811C17; +sub.f64 fd509, fd709, fd710; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd707, fd418, 0dBFEE11F642522D1C; +mul.f64 fd708, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd707, fd708; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd518, fd324, 0d3FEFF223F3635CE3; +mul.f64 fd706, fd318, 0dBFADC528B5343A86; +sub.f64 fd519, fd706, fd518; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, 
fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd523, fd440, 0dBFBDB843E577175E; +mul.f64 fd705, fd434, 0dBFEFC89BCEF44CF4; +sub.f64 fd524, fd705, fd523; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd704, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd704, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd703, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd703, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd702, fd753, fd729; +sub.f64 fd541, fd753, fd729; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +mul.f64 fd543, fd702, 0d3FE0000000000000; +sub.f64 fd544, fd778, fd543; +sub.f64 fd545, fd294, fd410; +mul.f64 fd546, fd545, 0dBFEBB67AE8584CAA; +add.f64 fd547, fd459, fd464; +mul.f64 fd549, fd547, 0d3FE0000000000000; +sub.f64 fd550, fd194, fd549; +add.f64 fd701, fd461, fd466; +sub.f64 fd551, fd461, fd466; +mul.f64 fd552, fd551, 0dBFEBB67AE8584CAA; +mul.f64 fd553, fd701, 0d3FE0000000000000; +sub.f64 fd554, fd776, fd553; +sub.f64 fd555, fd459, fd464; +mul.f64 fd556, fd555, 0dBFEBB67AE8584CAA; +add.f64 fd557, fd469, fd474; +mul.f64 fd559, fd557, 0d3FE0000000000000; +sub.f64 fd560, fd210, fd559; +add.f64 fd700, fd471, fd476; +sub.f64 fd561, fd471, fd476; +mul.f64 fd562, fd561, 0dBFEBB67AE8584CAA; +mul.f64 fd563, fd700, 0d3FE0000000000000; +sub.f64 fd564, fd774, fd563; +sub.f64 fd565, fd469, fd474; +mul.f64 fd566, fd565, 0dBFEBB67AE8584CAA; +add.f64 fd567, fd479, fd484; +mul.f64 fd569, fd567, 0d3FE0000000000000; +sub.f64 fd570, fd185, fd569; +add.f64 fd699, fd481, fd486; +sub.f64 fd571, fd481, fd486; +mul.f64 fd572, fd571, 0dBFEBB67AE8584CAA; +mul.f64 fd573, fd699, 0d3FE0000000000000; +sub.f64 fd574, fd191, 
fd573; +sub.f64 fd575, fd479, fd484; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +add.f64 fd577, fd489, fd494; +mul.f64 fd579, fd577, 0d3FE0000000000000; +sub.f64 fd580, fd201, fd579; +add.f64 fd698, fd491, fd496; +sub.f64 fd581, fd491, fd496; +mul.f64 fd582, fd581, 0dBFEBB67AE8584CAA; +mul.f64 fd583, fd698, 0d3FE0000000000000; +sub.f64 fd584, fd207, fd583; +sub.f64 fd585, fd489, fd494; +mul.f64 fd586, fd585, 0dBFEBB67AE8584CAA; +add.f64 fd587, fd499, fd504; +mul.f64 fd589, fd587, 0d3FE0000000000000; +sub.f64 fd590, fd217, fd589; +add.f64 fd697, fd501, fd506; +sub.f64 fd591, fd501, fd506; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +mul.f64 fd593, fd697, 0d3FE0000000000000; +sub.f64 fd594, fd223, fd593; +sub.f64 fd595, fd499, fd504; +mul.f64 fd596, fd595, 0dBFEBB67AE8584CAA; +add.f64 fd597, fd509, fd514; +mul.f64 fd599, fd597, 0d3FE0000000000000; +sub.f64 fd600, fd186, fd599; +add.f64 fd696, fd511, fd516; +sub.f64 fd601, fd511, fd516; +mul.f64 fd602, fd601, 0dBFEBB67AE8584CAA; +mul.f64 fd603, fd696, 0d3FE0000000000000; +sub.f64 fd604, fd192, fd603; +sub.f64 fd605, fd509, fd514; +mul.f64 fd606, fd605, 0dBFEBB67AE8584CAA; +add.f64 fd607, fd519, fd524; +mul.f64 fd609, fd607, 0d3FE0000000000000; +sub.f64 fd610, fd202, fd609; +add.f64 fd695, fd521, fd526; +sub.f64 fd611, fd521, fd526; +mul.f64 fd612, fd611, 0dBFEBB67AE8584CAA; +mul.f64 fd613, fd695, 0d3FE0000000000000; +sub.f64 fd614, fd208, fd613; +sub.f64 fd615, fd519, fd524; +mul.f64 fd616, fd615, 0dBFEBB67AE8584CAA; +add.f64 fd617, fd529, fd534; +mul.f64 fd619, fd617, 0d3FE0000000000000; +sub.f64 fd620, fd218, fd619; +add.f64 fd694, fd531, fd536; +sub.f64 fd621, fd531, fd536; +mul.f64 fd622, fd621, 0dBFEBB67AE8584CAA; +mul.f64 fd623, fd694, 0d3FE0000000000000; +sub.f64 fd624, fd224, fd623; +sub.f64 fd625, fd529, fd534; +mul.f64 fd791, fd700, 0d3FE0000000000000; +sub.f64 fd790, fd774, fd791; +mul.f64 fd626, fd625, 0dBFEBB67AE8584CAA; +add.f64 %1, fd778, fd702; +mul.f64 fd793, fd537, 0d3FE0000000000000; +sub.f64 fd792, 
fd178, fd793; +add.f64 %0, fd178, fd537; +mul.f64 fd795, fd701, 0d3FE0000000000000; +sub.f64 fd794, fd776, fd795; +add.f64 %3, fd776, fd701; +add.f64 %2, fd194, fd547; +add.f64 %5, fd774, fd700; +add.f64 %4, fd210, fd557; +add.f64 %7, fd191, fd699; +add.f64 %6, fd185, fd567; +add.f64 %9, fd207, fd698; +add.f64 %8, fd201, fd577; +add.f64 %11, fd223, fd697; +add.f64 %10, fd217, fd587; +add.f64 %13, fd192, fd696; +add.f64 %12, fd186, fd597; +add.f64 %15, fd208, fd695; +add.f64 %14, fd202, fd607; +add.f64 %17, fd224, fd694; +add.f64 %16, fd218, fd617; +add.f64 %18, fd542, fd792; +sub.f64 %19, fd544, fd546; +sub.f64 %21, fd794, fd556; +add.f64 %20, fd552, fd550; +sub.f64 %23, fd790, fd566; +add.f64 %22, fd562, fd560; +sub.f64 %25, fd574, fd576; +add.f64 %24, fd572, fd570; +add.f64 %26, fd582, fd580; +sub.f64 %27, fd584, fd586; +add.f64 %28, fd592, fd590; +sub.f64 %29, fd594, fd596; +add.f64 %30, fd602, fd600; +sub.f64 %31, fd604, fd606; +add.f64 %32, fd612, fd610; +sub.f64 %33, fd614, fd616; +sub.f64 %35, fd624, fd626; +add.f64 %34, fd622, fd620; +add.f64 %37, fd546, fd544; +sub.f64 %36, fd792, fd542; +add.f64 %39, fd556, fd794; +sub.f64 %38, fd550, fd552; +add.f64 %41, fd566, fd790; +sub.f64 %40, fd560, fd562; +add.f64 %43, fd576, fd574; +sub.f64 %42, fd570, fd572; +add.f64 %45, fd586, fd584; +sub.f64 %44, fd580, fd582; +add.f64 %47, fd596, fd594; +sub.f64 %46, fd590, fd592; +add.f64 %49, fd606, fd604; +sub.f64 %48, fd600, fd602; +add.f64 %51, fd616, fd614; +sub.f64 %50, fd610, fd612; +add.f64 %53, fd626, fd624; +sub.f64 %52, fd620, fd622; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), 
"=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[13].y), "d"(rmem[22].y), "d"(rmem[4].y), "d"(rmem[25].y), "d"(rmem[16].y), "d"(rmem[7].y), "d"(rmem[20].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<677, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<315>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, 
r1, 432, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 
0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 432, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, 
rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+48]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; 
+st.shared.v2.f64 [r9+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, fd165, fd164; +st.shared.v2.f64 [r9+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r9+48], {fd220, fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; +sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r9+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r9+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r9+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r9+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r9+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+48]; +ld.shared.v2.f64 {fd239, fd240}, [r11+96]; +ld.shared.v2.f64 {fd243, fd244}, [r11+144]; +ld.shared.v2.f64 {fd247, fd248}, [r11+192]; +ld.shared.v2.f64 {fd251, fd252}, [r11+240]; +ld.shared.v2.f64 {fd255, fd256}, [r11+288]; +ld.shared.v2.f64 {fd259, fd260}, [r11+336]; +ld.shared.v2.f64 {fd263, fd264}, [r11+384]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd244, fd256; +mul.f64 fd269, fd267, 0d3FE0000000000000; +sub.f64 fd270, fd231, fd269; +sub.f64 fd271, fd244, fd256; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +mul.f64 fd273, fd268, 0d3FE0000000000000; +sub.f64 fd274, fd232, fd273; +sub.f64 fd275, fd243, fd255; +mul.f64 fd276, fd275, 0dBFEBB67AE8584CAA; +add.f64 fd277, fd247, fd259; +add.f64 fd278, fd248, fd260; +mul.f64 fd279, fd277, 0d3FE0000000000000; +sub.f64 fd280, fd235, fd279; +sub.f64 fd281, fd248, fd260; +mul.f64 fd282, fd281, 0dBFEBB67AE8584CAA; +mul.f64 fd283, fd278, 0d3FE0000000000000; +sub.f64 fd284, fd236, fd283; +sub.f64 fd285, fd247, fd259; +mul.f64 fd286, fd285, 0dBFEBB67AE8584CAA; 
+add.f64 fd287, fd251, fd263; +add.f64 fd288, fd252, fd264; +mul.f64 fd289, fd287, 0d3FE0000000000000; +sub.f64 fd290, fd239, fd289; +sub.f64 fd291, fd252, fd264; +mul.f64 fd292, fd291, 0dBFEBB67AE8584CAA; +mul.f64 fd293, fd288, 0d3FE0000000000000; +sub.f64 fd294, fd240, fd293; +sub.f64 fd295, fd251, fd263; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +add.f64 %1, fd232, fd268; +add.f64 %0, fd231, fd267; +add.f64 %3, fd236, fd278; +add.f64 %2, fd235, fd277; +add.f64 %5, fd240, fd288; +add.f64 %4, fd239, fd287; +sub.f64 %7, fd274, fd276; +add.f64 %6, fd272, fd270; +sub.f64 %9, fd284, fd286; +add.f64 %8, fd282, fd280; +sub.f64 %11, fd294, fd296; +add.f64 %10, fd292, fd290; +add.f64 %13, fd276, fd274; +sub.f64 %12, fd270, fd272; +add.f64 %15, fd286, fd284; +sub.f64 %14, fd280, fd282; +add.f64 %17, fd296, fd294; +sub.f64 %16, fd290, fd292; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<676, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<297>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 216, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; 
+sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 
fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 
fd159, fd122, fd154; +mul.f64 fd160, fd153, fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+48]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 fd220, fd219, fd218; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, 
fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r8, r5, 216, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd158; +st.shared.f64 [r9+16], fd168; +st.shared.f64 [r9+24], fd178; +st.shared.f64 [r9+32], fd188; +st.shared.f64 [r9+40], fd197; +st.shared.f64 [r9+48], fd207; +st.shared.f64 [r9+56], fd217; +st.shared.f64 [r9+64], fd227; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+24]; +ld.shared.f64 fd233, [r11+48]; +ld.shared.f64 fd234, [r11+72]; +ld.shared.f64 fd235, [r11+96]; +ld.shared.f64 fd236, [r11+120]; +ld.shared.f64 fd237, [r11+144]; +ld.shared.f64 fd238, [r11+168]; +ld.shared.f64 fd239, [r11+192]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+24]; +ld.shared.f64 fd242, [r11+48]; +ld.shared.f64 fd243, [r11+72]; +ld.shared.f64 fd244, [r11+96]; +ld.shared.f64 fd245, [r11+120]; +ld.shared.f64 fd246, [r11+144]; +ld.shared.f64 fd247, [r11+168]; +ld.shared.f64 fd248, [r11+192]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd243, fd246; +mul.f64 fd251, fd249, 0d3FE0000000000000; +sub.f64 fd252, fd231, fd251; +sub.f64 fd253, fd243, fd246; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +mul.f64 fd255, fd250, 0d3FE0000000000000; +sub.f64 fd256, fd240, fd255; +sub.f64 fd257, fd234, fd237; +mul.f64 fd258, fd257, 0dBFEBB67AE8584CAA; +add.f64 fd259, fd235, fd238; +add.f64 fd260, fd244, fd247; +mul.f64 fd261, fd259, 0d3FE0000000000000; +sub.f64 fd262, fd232, fd261; +sub.f64 
fd263, fd244, fd247; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +mul.f64 fd265, fd260, 0d3FE0000000000000; +sub.f64 fd266, fd241, fd265; +sub.f64 fd267, fd235, fd238; +mul.f64 fd268, fd267, 0dBFEBB67AE8584CAA; +add.f64 fd269, fd236, fd239; +add.f64 fd270, fd245, fd248; +mul.f64 fd271, fd269, 0d3FE0000000000000; +sub.f64 fd272, fd233, fd271; +sub.f64 fd273, fd245, fd248; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +mul.f64 fd275, fd270, 0d3FE0000000000000; +sub.f64 fd276, fd242, fd275; +sub.f64 fd277, fd236, fd239; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +add.f64 %0, fd231, fd249; +add.f64 %1, fd240, fd250; +add.f64 %2, fd232, fd259; +add.f64 %3, fd241, fd260; +add.f64 %4, fd233, fd269; +add.f64 %5, fd242, fd270; +add.f64 %6, fd254, fd252; +sub.f64 %7, fd256, fd258; +add.f64 %8, fd264, fd262; +sub.f64 %9, fd266, fd268; +add.f64 %10, fd274, fd272; +sub.f64 %11, fd276, fd278; +sub.f64 %12, fd252, fd254; +add.f64 %13, fd258, fd256; +sub.f64 %14, fd262, fd264; +add.f64 %15, fd268, fd266; +sub.f64 %16, fd272, fd274; +add.f64 %17, fd278, fd276; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_27), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<678, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<109>; +.reg .b64 rd<12>; 
+mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 216, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %11, %14; +add.f64 fd14, %9, fd13; +add.f64 fd15, %13, %15; +add.f64 fd16, %10, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %9, fd17; +sub.f64 fd19, %13, %15; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %10, fd23; +sub.f64 fd25, %11, %14; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 216, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+144]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd34; +st.shared.f64 [r9+16], fd43; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+72]; +ld.shared.f64 fd49, [r11+144]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+72]; +ld.shared.f64 fd52, [r11+144]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 
fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd67, fd70; +fma.rn.f64 fd74, fd69, fd61, fd73; +mul.f64 fd75, fd61, fd70; +mul.f64 fd76, fd69, fd67; +sub.f64 fd77, fd76, fd75; +ld.global.v2.f64 {fd78, fd79}, [rd11+48]; +mul.f64 fd82, fd68, fd79; +fma.rn.f64 fd83, fd78, fd62, fd82; +mul.f64 fd84, fd62, fd79; +mul.f64 fd85, fd78, fd68; +sub.f64 fd86, fd85, fd84; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd74; +st.shared.f64 [r17+48], fd83; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+72]; +ld.shared.f64 fd89, [r11+144]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+72]; +ld.shared.f64 fd92, [r11+144]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd91, fd92; +mul.f64 fd95, fd93, 0d3FE0000000000000; +sub.f64 fd96, fd87, fd95; +sub.f64 fd97, fd91, fd92; +mul.f64 fd98, fd97, 0dBFEBB67AE8584CAA; +mul.f64 fd99, fd94, 0d3FE0000000000000; +sub.f64 fd100, fd90, fd99; +sub.f64 fd101, fd88, fd89; +mul.f64 fd102, fd101, 0dBFEBB67AE8584CAA; +add.f64 %0, fd87, fd93; +add.f64 %1, fd90, fd94; +add.f64 %2, fd98, fd96; +sub.f64 %3, fd100, fd102; +sub.f64 %4, fd96, fd98; +add.f64 %5, fd102, fd100; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<679, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<121>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 432, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %11, %14; +add.f64 fd14, %13, %15; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %9, fd15; +sub.f64 fd17, %13, %15; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %10, fd21; +sub.f64 fd23, %11, %14; +mul.f64 fd24, fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 432, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+144]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, fd20, fd35; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %10, fd14; +add.f64 fd42, %9, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r9+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r9+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+144]; +ld.shared.v2.f64 {fd55, fd56}, [r11+288]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 
fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0dBFEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd71, fd74; +mul.f64 fd78, fd65, fd74; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+48]; +mul.f64 fd84, fd72, fd81; +mul.f64 fd85, fd66, fd81; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd73, fd65, fd77; +sub.f64 fd90, fd79, fd78; +st.shared.v2.f64 [r17+48], {fd89, fd90}; +fma.rn.f64 fd91, fd80, fd66, fd84; +sub.f64 fd92, fd86, fd85; +st.shared.v2.f64 [r17+96], {fd91, fd92}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+144]; +ld.shared.v2.f64 {fd101, fd102}, [r11+288]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +mul.f64 fd111, fd106, 0d3FE0000000000000; +sub.f64 fd112, fd94, fd111; +sub.f64 fd113, fd97, fd101; +mul.f64 fd114, fd113, 0dBFEBB67AE8584CAA; +add.f64 %1, fd94, fd106; +add.f64 %0, fd93, fd105; +sub.f64 %3, fd112, fd114; +add.f64 %2, fd110, fd108; +add.f64 %5, fd114, fd112; +sub.f64 %4, fd108, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..737b3bf1fd54a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_fwd.hpp.inc @@ -0,0 +1,3841 @@ +#ifndef CUFFTDX_FFT_28_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_28_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<758, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<163>; +.reg .b32 r<3515>; +.reg .f64 fd<150>; +.reg .b64 rd<3>; +mov.f64 fd99, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd99; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd113, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd113; +} +mov.b32 r447, {rs2, rs2}; +mov.f64 fd118, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs3, fd118; +} +mov.b32 r654, {rs3, rs3}; +mov.f64 fd117, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs4, fd117; +} +mov.b32 r678, {rs4, rs4}; +mov.f64 fd115, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs5, fd115; +} +mov.b32 r636, {rs5, rs5}; +mov.f64 fd116, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs6, fd116; +} +mov.b32 r663, {rs6, rs6}; +{ +cvt.rn.f16.f64 rs7, fd115; +} +mov.b32 r537, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs8, fd116; +} +{ +neg.f16 rs9, rs8; +} +mov.b32 r561, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs11, fd99; +} +mov.b32 r645, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd113; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r669, {rs13, rs13}; +{ +add.f16x2 r1, %66, %61; +} +{ +add.f16x2 r4, %69, r1; +} +{ +add.f16x2 r7, %57, %65; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %60, %56; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %63, %59; +} +{ +add.f16x2 r22, %64, r19; +} +{ +add.f16x2 r25, %68, %62; +} +{ +add.f16x2 r28, r22, r25; +} 
+{ +add.f16x2 r31, %58, %67; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %66, %61; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %69, r40; +} +{ +add.f16x2 r46, %57, %65; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %60, %56; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %63, %59; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %68, %62; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %58, %67; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %66, %61; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %69, r94; +} +{ +add.f16x2 r100, %57, %65; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %60, %56; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %63, %59; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %68, %62; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %58, %67; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %66, %61; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %69, r148; +} +{ +add.f16x2 r154, %57, %65; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %60, %56; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %63, %59; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %68, %62; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %58, %67; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %66, %61; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %69, r202; +} +{ +add.f16x2 r208, 
%57, %65; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %60, %56; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %63, %59; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %68, %62; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %58, %67; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %66, %61; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %69, r256; +} +{ +add.f16x2 r262, %57, %65; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %60, %56; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %63, %59; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %68, %62; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %58, %67; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %66, %61; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %69, r310; +} +{ +add.f16x2 r316, %57, %65; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %60, %56; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %63, %59; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %68, %62; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %58, %67; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %63, %59; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %64, r364; +} +{ +add.f16x2 r370, %68, %62; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %58, %67; +} +{ +mul.f16x2 r382, r379, r636; 
+} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %66, %61; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %57, %65; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %60, %56; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %63, %59; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %64, r418; +} +{ +add.f16x2 r424, %68, %62; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %58, %67; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %66, %61; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %57, %65; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %60, %56; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ +add.f16x2 r469, %63, %59; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %64, r472; +} +{ +add.f16x2 r478, %68, %62; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %58, %67; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %66, %61; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %57, %65; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %60, %56; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %63, %59; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %64, r526; +} +{ +add.f16x2 r532, %68, %62; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %58, %67; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %66, %61; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %57, %65; +} +{ 
+mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %60, %56; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %63, %59; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %64, r580; +} +{ +add.f16x2 r586, %68, %62; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %58, %67; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %66, %61; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %57, %65; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %60, %56; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %63, %59; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %64, r634; +} +{ +add.f16x2 r640, %68, %62; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %58, %67; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %66, %61; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %57, %65; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %60, %56; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs15, fd99; +} +mov.b32 r1104, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd113; +} +mov.b32 r1131, {rs16, rs16}; +{ +cvt.rn.f16.f64 rs17, fd118; +} +mov.b32 r1338, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd117; +} +mov.b32 r1362, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd115; +} +mov.b32 r1320, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd116; +} +mov.b32 r1347, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd115; +} +mov.b32 r1221, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd116; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1245, {rs23, rs23}; +{ 
+cvt.rn.f16.f64 rs25, fd99; +} +mov.b32 r1329, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd113; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1353, {rs27, rs27}; +{ +add.f16x2 r685, %72, %81; +} +{ +add.f16x2 r688, %75, r685; +} +{ +add.f16x2 r691, %77, %71; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 r697, %80, %76; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %83, %79; +} +{ +add.f16x2 r706, %70, r703; +} +{ +add.f16x2 r709, %74, %82; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %78, %73; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %72, %81; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %75, r724; +} +{ +add.f16x2 r730, %77, %71; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %80, %76; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %83, %79; +} +{ +mul.f16x2 r751, r748, r1131; +} +{ +sub.f16x2 r754, %74, %82; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %78, %73; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %72, %81; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %75, r778; +} +{ +add.f16x2 r784, %77, %71; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %80, %76; +} +{ +mul.f16x2 r796, r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %83, %79; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %74, %82; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %78, %73; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %72, %81; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %75, r832; +} +{ +add.f16x2 r838, %77, %71; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, 
r835, r841; +} +{ +add.f16x2 r847, %80, %76; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %83, %79; +} +{ +mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %74, %82; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %78, %73; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %72, %81; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %75, r886; +} +{ +add.f16x2 r892, %77, %71; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %80, %76; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %83, %79; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %74, %82; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %78, %73; +} +{ +mul.f16x2 r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %72, %81; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %75, r940; +} +{ +add.f16x2 r946, %77, %71; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %80, %76; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %83, %79; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ +sub.f16x2 r970, %74, %82; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %78, %73; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %72, %81; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %75, r994; +} +{ +add.f16x2 r1000, %77, %71; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %80, %76; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} 
+{ +sub.f16x2 r1018, %83, %79; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %74, %82; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %78, %73; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ +add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %83, %79; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %70, r1048; +} +{ +add.f16x2 r1054, %74, %82; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %78, %73; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %72, %81; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %77, %71; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %80, %76; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, r1090; +} +{ +add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %83, %79; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %70, r1102; +} +{ +add.f16x2 r1108, %74, %82; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %78, %73; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %72, %81; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %77, %71; +} +{ +mul.f16x2 r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %80, %76; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %83, %79; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %70, r1156; +} +{ +add.f16x2 r1162, %74, %82; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %78, %73; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %72, 
%81; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %77, %71; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %80, %76; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, r1201; +} +{ +add.f16x2 r1207, %83, %79; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %70, r1210; +} +{ +add.f16x2 r1216, %74, %82; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %73; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %72, %81; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %77, %71; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %80, %76; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 r1258, r1231, r1255; +} +{ +add.f16x2 r1261, %83, %79; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %70, r1264; +} +{ +add.f16x2 r1270, %74, %82; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %78, %73; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %72, %81; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %77, %71; +} +{ +mul.f16x2 r1297, r1294, r1353; +} +{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %80, %76; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %83, %79; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %70, r1318; +} +{ +add.f16x2 r1324, %74, %82; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %78, %73; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %72, %81; +} +{ +mul.f16x2 
r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %77, %71; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 r1357, %80, %76; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +mov.f64 fd95, 0d3FECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs29, fd95; +} +{ +cvt.rn.f16.f64 rs30, fd116; +} +{ +cvt.rn.f16.f64 rs31, fd99; +} +{ +cvt.rn.f16.f64 rs32, fd113; +} +mov.f64 fd103, 0d3FCC7B90E3024582; +{ +cvt.rn.f16.f64 rs33, fd103; +} +{ +cvt.rn.f16.f64 rs34, fd117; +} +{ +cvt.rn.f16.f64 rs35, fd118; +} +{ +cvt.rn.f16.f64 rs36, fd117; +} +mov.f64 fd114, 0dBFE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs37, fd114; +} +{ +cvt.rn.f16.f64 rs38, fd113; +} +{ +cvt.rn.f16.f64 rs39, fd115; +} +{ +cvt.rn.f16.f64 rs40, fd116; +} +mov.b32 r1383, {rs29, rs29}; +{ +mul.f16x2 r1369, r772, r1383; +} +mov.b32 r1380, {rs30, rs30}; +{ +mul.f16x2 r1372, r1096, r1380; +} +{ +sub.f16x2 r1375, r1369, r1372; +} +{ +mul.f16x2 r1378, r772, r1380; +} +{ +fma.rn.f16x2 r1381, r1096, r1383, r1378; +} +mov.b32 r1399, {rs31, rs31}; +{ +mul.f16x2 r1385, r880, r1399; +} +mov.b32 r1396, {rs32, rs32}; +{ +mul.f16x2 r1388, r1204, r1396; +} +{ +sub.f16x2 r1391, r1385, r1388; +} +{ +mul.f16x2 r1394, r880, r1396; +} +{ +fma.rn.f16x2 r1397, r1204, r1399, r1394; +} +mov.b32 r1415, {rs33, rs33}; +{ +mul.f16x2 r1401, r988, r1415; +} +mov.b32 r1412, {rs34, rs34}; +{ +mul.f16x2 r1404, r1312, r1412; +} +{ +sub.f16x2 r1407, r1401, r1404; +} +{ +mul.f16x2 r1410, r988, r1412; +} +{ +fma.rn.f16x2 r1413, r1312, r1415, r1410; +} +mov.b32 r1431, {rs35, rs35}; +{ +mul.f16x2 r1417, r1042, r1431; +} +mov.b32 r1428, {rs36, rs36}; +{ +mul.f16x2 r1420, r1366, r1428; +} +{ +sub.f16x2 r1423, r1417, r1420; +} +{ +mul.f16x2 r1426, r1042, r1428; +} +{ +fma.rn.f16x2 r1429, r1366, r1431, r1426; +} +mov.b32 r1447, {rs37, rs37}; +{ +mul.f16x2 r1433, r934, r1447; +} +mov.b32 r1444, {rs38, rs38}; +{ +mul.f16x2 r1436, r1258, r1444; +} +{ +sub.f16x2 r1439, 
r1433, r1436; +} +{ +mul.f16x2 r1442, r934, r1444; +} +{ +fma.rn.f16x2 r1445, r1258, r1447, r1442; +} +mov.b32 r1463, {rs39, rs39}; +{ +mul.f16x2 r1449, r826, r1463; +} +mov.b32 r1460, {rs40, rs40}; +{ +mul.f16x2 r1452, r1150, r1460; +} +{ +sub.f16x2 r1455, r1449, r1452; +} +{ +mul.f16x2 r1458, r826, r1460; +} +{ +fma.rn.f16x2 r1461, r1150, r1463, r1458; +} +{ +add.f16x2 r1465, r16, r700; +} +{ +add.f16x2 r1468, r34, r718; +} +{ +sub.f16x2 r1471, r16, r700; +} +{ +sub.f16x2 r1474, r34, r718; +} +{ +add.f16x2 r1477, r88, r1375; +} +{ +add.f16x2 r1480, r412, r1381; +} +{ +sub.f16x2 r1483, r88, r1375; +} +{ +sub.f16x2 r1486, r412, r1381; +} +{ +add.f16x2 r1489, r196, r1391; +} +{ +add.f16x2 r1492, r520, r1397; +} +{ +sub.f16x2 r1495, r196, r1391; +} +{ +sub.f16x2 r1498, r520, r1397; +} +{ +add.f16x2 r1501, r304, r1407; +} +{ +add.f16x2 r1504, r628, r1413; +} +{ +sub.f16x2 r1507, r304, r1407; +} +{ +sub.f16x2 r1510, r628, r1413; +} +{ +add.f16x2 r1513, r358, r1423; +} +{ +add.f16x2 r1516, r682, r1429; +} +{ +sub.f16x2 r1519, r358, r1423; +} +{ +sub.f16x2 r1522, r682, r1429; +} +{ +add.f16x2 r1525, r250, r1439; +} +{ +add.f16x2 r1528, r574, r1445; +} +{ +sub.f16x2 r1531, r250, r1439; +} +{ +sub.f16x2 r1534, r574, r1445; +} +{ +add.f16x2 r1537, r142, r1455; +} +{ +add.f16x2 r1540, r466, r1461; +} +{ +sub.f16x2 r1543, r142, r1455; +} +{ +sub.f16x2 r1546, r466, r1461; +} +{ +cvt.rn.f16.f64 rs55, fd99; +} +mov.b32 r1968, {rs55, rs55}; +{ +cvt.rn.f16.f64 rs56, fd113; +} +mov.b32 r1995, {rs56, rs56}; +{ +cvt.rn.f16.f64 rs57, fd118; +} +mov.b32 r2202, {rs57, rs57}; +{ +cvt.rn.f16.f64 rs58, fd117; +} +mov.b32 r2226, {rs58, rs58}; +{ +cvt.rn.f16.f64 rs59, fd115; +} +mov.b32 r2184, {rs59, rs59}; +{ +cvt.rn.f16.f64 rs60, fd116; +} +mov.b32 r2211, {rs60, rs60}; +{ +cvt.rn.f16.f64 rs61, fd115; +} +mov.b32 r2085, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd116; +} +{ +neg.f16 rs63, rs62; +} +mov.b32 r2109, {rs63, rs63}; +{ +cvt.rn.f16.f64 rs65, fd99; +} +mov.b32 r2193, {rs65, rs65}; +{ 
+cvt.rn.f16.f64 rs66, fd113; +} +{ +neg.f16 rs67, rs66; +} +mov.b32 r2217, {rs67, rs67}; +{ +add.f16x2 r1549, %88, %84; +} +{ +add.f16x2 r1552, %92, r1549; +} +{ +add.f16x2 r1555, %94, %89; +} +{ +add.f16x2 r1558, r1552, r1555; +} +{ +add.f16x2 r1561, %97, %93; +} +{ +add.f16x2 r1564, r1558, r1561; +} +{ +add.f16x2 r1567, %86, %96; +} +{ +add.f16x2 r1570, %87, r1567; +} +{ +add.f16x2 r1573, %91, %85; +} +{ +add.f16x2 r1576, r1570, r1573; +} +{ +add.f16x2 r1579, %95, %90; +} +{ +add.f16x2 r1582, r1576, r1579; +} +{ +add.f16x2 r1585, %88, %84; +} +{ +mul.f16x2 r1588, r1585, r1968; +} +{ +add.f16x2 r1591, %92, r1588; +} +{ +add.f16x2 r1594, %94, %89; +} +{ +mul.f16x2 r1597, r1594, r2202; +} +{ +add.f16x2 r1600, r1591, r1597; +} +{ +add.f16x2 r1603, %97, %93; +} +{ +mul.f16x2 r1606, r1603, r2184; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +sub.f16x2 r1612, %86, %96; +} +{ +mul.f16x2 r1615, r1612, r1995; +} +{ +sub.f16x2 r1618, %91, %85; +} +{ +mul.f16x2 r1621, r1618, r2226; +} +{ +add.f16x2 r1624, r1615, r1621; +} +{ +sub.f16x2 r1627, %95, %90; +} +{ +mul.f16x2 r1630, r1627, r2211; +} +{ +add.f16x2 r1633, r1624, r1630; +} +{ +sub.f16x2 r1636, r1609, r1633; +} +{ +add.f16x2 r1639, %88, %84; +} +{ +mul.f16x2 r1642, r1639, r1968; +} +{ +add.f16x2 r1645, %92, r1642; +} +{ +add.f16x2 r1648, %94, %89; +} +{ +mul.f16x2 r1651, r1648, r2202; +} +{ +add.f16x2 r1654, r1645, r1651; +} +{ +add.f16x2 r1657, %97, %93; +} +{ +mul.f16x2 r1660, r1657, r2184; +} +{ +add.f16x2 r1663, r1654, r1660; +} +{ +sub.f16x2 r1666, %86, %96; +} +{ +mul.f16x2 r1669, r1666, r1995; +} +{ +sub.f16x2 r1672, %91, %85; +} +{ +mul.f16x2 r1675, r1672, r2226; +} +{ +add.f16x2 r1678, r1669, r1675; +} +{ +sub.f16x2 r1681, %95, %90; +} +{ +mul.f16x2 r1684, r1681, r2211; +} +{ +add.f16x2 r1687, r1678, r1684; +} +{ +add.f16x2 r1690, r1663, r1687; +} +{ +add.f16x2 r1693, %88, %84; +} +{ +mul.f16x2 r1696, r1693, r2202; +} +{ +add.f16x2 r1699, %92, r1696; +} +{ +add.f16x2 r1702, %94, %89; +} +{ +mul.f16x2 r1705, 
r1702, r2085; +} +{ +add.f16x2 r1708, r1699, r1705; +} +{ +add.f16x2 r1711, %97, %93; +} +{ +mul.f16x2 r1714, r1711, r2193; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +sub.f16x2 r1720, %86, %96; +} +{ +mul.f16x2 r1723, r1720, r2226; +} +{ +sub.f16x2 r1726, %91, %85; +} +{ +mul.f16x2 r1729, r1726, r2109; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +sub.f16x2 r1735, %95, %90; +} +{ +mul.f16x2 r1738, r1735, r2217; +} +{ +add.f16x2 r1741, r1732, r1738; +} +{ +sub.f16x2 r1744, r1717, r1741; +} +{ +add.f16x2 r1747, %88, %84; +} +{ +mul.f16x2 r1750, r1747, r2202; +} +{ +add.f16x2 r1753, %92, r1750; +} +{ +add.f16x2 r1756, %94, %89; +} +{ +mul.f16x2 r1759, r1756, r2085; +} +{ +add.f16x2 r1762, r1753, r1759; +} +{ +add.f16x2 r1765, %97, %93; +} +{ +mul.f16x2 r1768, r1765, r2193; +} +{ +add.f16x2 r1771, r1762, r1768; +} +{ +sub.f16x2 r1774, %86, %96; +} +{ +mul.f16x2 r1777, r1774, r2226; +} +{ +sub.f16x2 r1780, %91, %85; +} +{ +mul.f16x2 r1783, r1780, r2109; +} +{ +add.f16x2 r1786, r1777, r1783; +} +{ +sub.f16x2 r1789, %95, %90; +} +{ +mul.f16x2 r1792, r1789, r2217; +} +{ +add.f16x2 r1795, r1786, r1792; +} +{ +add.f16x2 r1798, r1771, r1795; +} +{ +add.f16x2 r1801, %88, %84; +} +{ +mul.f16x2 r1804, r1801, r2184; +} +{ +add.f16x2 r1807, %92, r1804; +} +{ +add.f16x2 r1810, %94, %89; +} +{ +mul.f16x2 r1813, r1810, r2193; +} +{ +add.f16x2 r1816, r1807, r1813; +} +{ +add.f16x2 r1819, %97, %93; +} +{ +mul.f16x2 r1822, r1819, r2202; +} +{ +add.f16x2 r1825, r1816, r1822; +} +{ +sub.f16x2 r1828, %86, %96; +} +{ +mul.f16x2 r1831, r1828, r2211; +} +{ +sub.f16x2 r1834, %91, %85; +} +{ +mul.f16x2 r1837, r1834, r2217; +} +{ +add.f16x2 r1840, r1831, r1837; +} +{ +sub.f16x2 r1843, %95, %90; +} +{ +mul.f16x2 r1846, r1843, r2226; +} +{ +add.f16x2 r1849, r1840, r1846; +} +{ +sub.f16x2 r1852, r1825, r1849; +} +{ +add.f16x2 r1855, %88, %84; +} +{ +mul.f16x2 r1858, r1855, r2184; +} +{ +add.f16x2 r1861, %92, r1858; +} +{ +add.f16x2 r1864, %94, %89; +} +{ +mul.f16x2 r1867, r1864, r2193; +} +{ 
+add.f16x2 r1870, r1861, r1867; +} +{ +add.f16x2 r1873, %97, %93; +} +{ +mul.f16x2 r1876, r1873, r2202; +} +{ +add.f16x2 r1879, r1870, r1876; +} +{ +sub.f16x2 r1882, %86, %96; +} +{ +mul.f16x2 r1885, r1882, r2211; +} +{ +sub.f16x2 r1888, %91, %85; +} +{ +mul.f16x2 r1891, r1888, r2217; +} +{ +add.f16x2 r1894, r1885, r1891; +} +{ +sub.f16x2 r1897, %95, %90; +} +{ +mul.f16x2 r1900, r1897, r2226; +} +{ +add.f16x2 r1903, r1894, r1900; +} +{ +add.f16x2 r1906, r1879, r1903; +} +{ +add.f16x2 r1909, %86, %96; +} +{ +mul.f16x2 r1912, r1909, r1968; +} +{ +add.f16x2 r1915, %87, r1912; +} +{ +add.f16x2 r1918, %91, %85; +} +{ +mul.f16x2 r1921, r1918, r2202; +} +{ +add.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, %95, %90; +} +{ +mul.f16x2 r1930, r1927, r2184; +} +{ +add.f16x2 r1933, r1924, r1930; +} +{ +sub.f16x2 r1936, %88, %84; +} +{ +mul.f16x2 r1939, r1936, r1995; +} +{ +sub.f16x2 r1942, %94, %89; +} +{ +mul.f16x2 r1945, r1942, r2226; +} +{ +add.f16x2 r1948, r1939, r1945; +} +{ +sub.f16x2 r1951, %97, %93; +} +{ +mul.f16x2 r1954, r1951, r2211; +} +{ +add.f16x2 r1957, r1948, r1954; +} +{ +add.f16x2 r1960, r1933, r1957; +} +{ +add.f16x2 r1963, %86, %96; +} +{ +mul.f16x2 r1966, r1963, r1968; +} +{ +add.f16x2 r1969, %87, r1966; +} +{ +add.f16x2 r1972, %91, %85; +} +{ +mul.f16x2 r1975, r1972, r2202; +} +{ +add.f16x2 r1978, r1969, r1975; +} +{ +add.f16x2 r1981, %95, %90; +} +{ +mul.f16x2 r1984, r1981, r2184; +} +{ +add.f16x2 r1987, r1978, r1984; +} +{ +sub.f16x2 r1990, %88, %84; +} +{ +mul.f16x2 r1993, r1990, r1995; +} +{ +sub.f16x2 r1996, %94, %89; +} +{ +mul.f16x2 r1999, r1996, r2226; +} +{ +add.f16x2 r2002, r1993, r1999; +} +{ +sub.f16x2 r2005, %97, %93; +} +{ +mul.f16x2 r2008, r2005, r2211; +} +{ +add.f16x2 r2011, r2002, r2008; +} +{ +sub.f16x2 r2014, r1987, r2011; +} +{ +add.f16x2 r2017, %86, %96; +} +{ +mul.f16x2 r2020, r2017, r2202; +} +{ +add.f16x2 r2023, %87, r2020; +} +{ +add.f16x2 r2026, %91, %85; +} +{ +mul.f16x2 r2029, r2026, r2085; +} +{ +add.f16x2 r2032, r2023, 
r2029; +} +{ +add.f16x2 r2035, %95, %90; +} +{ +mul.f16x2 r2038, r2035, r2193; +} +{ +add.f16x2 r2041, r2032, r2038; +} +{ +sub.f16x2 r2044, %88, %84; +} +{ +mul.f16x2 r2047, r2044, r2226; +} +{ +sub.f16x2 r2050, %94, %89; +} +{ +mul.f16x2 r2053, r2050, r2109; +} +{ +add.f16x2 r2056, r2047, r2053; +} +{ +sub.f16x2 r2059, %97, %93; +} +{ +mul.f16x2 r2062, r2059, r2217; +} +{ +add.f16x2 r2065, r2056, r2062; +} +{ +add.f16x2 r2068, r2041, r2065; +} +{ +add.f16x2 r2071, %86, %96; +} +{ +mul.f16x2 r2074, r2071, r2202; +} +{ +add.f16x2 r2077, %87, r2074; +} +{ +add.f16x2 r2080, %91, %85; +} +{ +mul.f16x2 r2083, r2080, r2085; +} +{ +add.f16x2 r2086, r2077, r2083; +} +{ +add.f16x2 r2089, %95, %90; +} +{ +mul.f16x2 r2092, r2089, r2193; +} +{ +add.f16x2 r2095, r2086, r2092; +} +{ +sub.f16x2 r2098, %88, %84; +} +{ +mul.f16x2 r2101, r2098, r2226; +} +{ +sub.f16x2 r2104, %94, %89; +} +{ +mul.f16x2 r2107, r2104, r2109; +} +{ +add.f16x2 r2110, r2101, r2107; +} +{ +sub.f16x2 r2113, %97, %93; +} +{ +mul.f16x2 r2116, r2113, r2217; +} +{ +add.f16x2 r2119, r2110, r2116; +} +{ +sub.f16x2 r2122, r2095, r2119; +} +{ +add.f16x2 r2125, %86, %96; +} +{ +mul.f16x2 r2128, r2125, r2184; +} +{ +add.f16x2 r2131, %87, r2128; +} +{ +add.f16x2 r2134, %91, %85; +} +{ +mul.f16x2 r2137, r2134, r2193; +} +{ +add.f16x2 r2140, r2131, r2137; +} +{ +add.f16x2 r2143, %95, %90; +} +{ +mul.f16x2 r2146, r2143, r2202; +} +{ +add.f16x2 r2149, r2140, r2146; +} +{ +sub.f16x2 r2152, %88, %84; +} +{ +mul.f16x2 r2155, r2152, r2211; +} +{ +sub.f16x2 r2158, %94, %89; +} +{ +mul.f16x2 r2161, r2158, r2217; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +sub.f16x2 r2167, %97, %93; +} +{ +mul.f16x2 r2170, r2167, r2226; +} +{ +add.f16x2 r2173, r2164, r2170; +} +{ +add.f16x2 r2176, r2149, r2173; +} +{ +add.f16x2 r2179, %86, %96; +} +{ +mul.f16x2 r2182, r2179, r2184; +} +{ +add.f16x2 r2185, %87, r2182; +} +{ +add.f16x2 r2188, %91, %85; +} +{ +mul.f16x2 r2191, r2188, r2193; +} +{ +add.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 
r2197, %95, %90; +} +{ +mul.f16x2 r2200, r2197, r2202; +} +{ +add.f16x2 r2203, r2194, r2200; +} +{ +sub.f16x2 r2206, %88, %84; +} +{ +mul.f16x2 r2209, r2206, r2211; +} +{ +sub.f16x2 r2212, %94, %89; +} +{ +mul.f16x2 r2215, r2212, r2217; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +sub.f16x2 r2221, %97, %93; +} +{ +mul.f16x2 r2224, r2221, r2226; +} +{ +add.f16x2 r2227, r2218, r2224; +} +{ +sub.f16x2 r2230, r2203, r2227; +} +{ +cvt.rn.f16.f64 rs69, fd99; +} +mov.b32 r2652, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs70, fd113; +} +mov.b32 r2679, {rs70, rs70}; +{ +cvt.rn.f16.f64 rs71, fd118; +} +mov.b32 r2886, {rs71, rs71}; +{ +cvt.rn.f16.f64 rs72, fd117; +} +mov.b32 r2910, {rs72, rs72}; +{ +cvt.rn.f16.f64 rs73, fd115; +} +mov.b32 r2868, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd116; +} +mov.b32 r2895, {rs74, rs74}; +{ +cvt.rn.f16.f64 rs75, fd115; +} +mov.b32 r2769, {rs75, rs75}; +{ +cvt.rn.f16.f64 rs76, fd116; +} +{ +neg.f16 rs77, rs76; +} +mov.b32 r2793, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs79, fd99; +} +mov.b32 r2877, {rs79, rs79}; +{ +cvt.rn.f16.f64 rs80, fd113; +} +{ +neg.f16 rs81, rs80; +} +mov.b32 r2901, {rs81, rs81}; +{ +add.f16x2 r2233, %107, %102; +} +{ +add.f16x2 r2236, %110, r2233; +} +{ +add.f16x2 r2239, %98, %106; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, %101, %111; +} +{ +add.f16x2 r2248, r2242, r2245; +} +{ +add.f16x2 r2251, %104, %100; +} +{ +add.f16x2 r2254, %105, r2251; +} +{ +add.f16x2 r2257, %109, %103; +} +{ +add.f16x2 r2260, r2254, r2257; +} +{ +add.f16x2 r2263, %99, %108; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, %107, %102; +} +{ +mul.f16x2 r2272, r2269, r2652; +} +{ +add.f16x2 r2275, %110, r2272; +} +{ +add.f16x2 r2278, %98, %106; +} +{ +mul.f16x2 r2281, r2278, r2886; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +add.f16x2 r2287, %101, %111; +} +{ +mul.f16x2 r2290, r2287, r2868; +} +{ +add.f16x2 r2293, r2284, r2290; +} +{ +sub.f16x2 r2296, %104, %100; +} +{ +mul.f16x2 r2299, r2296, r2679; +} +{ +sub.f16x2 r2302, 
%109, %103; +} +{ +mul.f16x2 r2305, r2302, r2910; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, %99, %108; +} +{ +mul.f16x2 r2314, r2311, r2895; +} +{ +add.f16x2 r2317, r2308, r2314; +} +{ +sub.f16x2 r2320, r2293, r2317; +} +{ +add.f16x2 r2323, %107, %102; +} +{ +mul.f16x2 r2326, r2323, r2652; +} +{ +add.f16x2 r2329, %110, r2326; +} +{ +add.f16x2 r2332, %98, %106; +} +{ +mul.f16x2 r2335, r2332, r2886; +} +{ +add.f16x2 r2338, r2329, r2335; +} +{ +add.f16x2 r2341, %101, %111; +} +{ +mul.f16x2 r2344, r2341, r2868; +} +{ +add.f16x2 r2347, r2338, r2344; +} +{ +sub.f16x2 r2350, %104, %100; +} +{ +mul.f16x2 r2353, r2350, r2679; +} +{ +sub.f16x2 r2356, %109, %103; +} +{ +mul.f16x2 r2359, r2356, r2910; +} +{ +add.f16x2 r2362, r2353, r2359; +} +{ +sub.f16x2 r2365, %99, %108; +} +{ +mul.f16x2 r2368, r2365, r2895; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +add.f16x2 r2374, r2347, r2371; +} +{ +add.f16x2 r2377, %107, %102; +} +{ +mul.f16x2 r2380, r2377, r2886; +} +{ +add.f16x2 r2383, %110, r2380; +} +{ +add.f16x2 r2386, %98, %106; +} +{ +mul.f16x2 r2389, r2386, r2769; +} +{ +add.f16x2 r2392, r2383, r2389; +} +{ +add.f16x2 r2395, %101, %111; +} +{ +mul.f16x2 r2398, r2395, r2877; +} +{ +add.f16x2 r2401, r2392, r2398; +} +{ +sub.f16x2 r2404, %104, %100; +} +{ +mul.f16x2 r2407, r2404, r2910; +} +{ +sub.f16x2 r2410, %109, %103; +} +{ +mul.f16x2 r2413, r2410, r2793; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, %99, %108; +} +{ +mul.f16x2 r2422, r2419, r2901; +} +{ +add.f16x2 r2425, r2416, r2422; +} +{ +sub.f16x2 r2428, r2401, r2425; +} +{ +add.f16x2 r2431, %107, %102; +} +{ +mul.f16x2 r2434, r2431, r2886; +} +{ +add.f16x2 r2437, %110, r2434; +} +{ +add.f16x2 r2440, %98, %106; +} +{ +mul.f16x2 r2443, r2440, r2769; +} +{ +add.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, %101, %111; +} +{ +mul.f16x2 r2452, r2449, r2877; +} +{ +add.f16x2 r2455, r2446, r2452; +} +{ +sub.f16x2 r2458, %104, %100; +} +{ +mul.f16x2 r2461, r2458, r2910; +} +{ +sub.f16x2 
r2464, %109, %103; +} +{ +mul.f16x2 r2467, r2464, r2793; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +sub.f16x2 r2473, %99, %108; +} +{ +mul.f16x2 r2476, r2473, r2901; +} +{ +add.f16x2 r2479, r2470, r2476; +} +{ +add.f16x2 r2482, r2455, r2479; +} +{ +add.f16x2 r2485, %107, %102; +} +{ +mul.f16x2 r2488, r2485, r2868; +} +{ +add.f16x2 r2491, %110, r2488; +} +{ +add.f16x2 r2494, %98, %106; +} +{ +mul.f16x2 r2497, r2494, r2877; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +add.f16x2 r2503, %101, %111; +} +{ +mul.f16x2 r2506, r2503, r2886; +} +{ +add.f16x2 r2509, r2500, r2506; +} +{ +sub.f16x2 r2512, %104, %100; +} +{ +mul.f16x2 r2515, r2512, r2895; +} +{ +sub.f16x2 r2518, %109, %103; +} +{ +mul.f16x2 r2521, r2518, r2901; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, %99, %108; +} +{ +mul.f16x2 r2530, r2527, r2910; +} +{ +add.f16x2 r2533, r2524, r2530; +} +{ +sub.f16x2 r2536, r2509, r2533; +} +{ +add.f16x2 r2539, %107, %102; +} +{ +mul.f16x2 r2542, r2539, r2868; +} +{ +add.f16x2 r2545, %110, r2542; +} +{ +add.f16x2 r2548, %98, %106; +} +{ +mul.f16x2 r2551, r2548, r2877; +} +{ +add.f16x2 r2554, r2545, r2551; +} +{ +add.f16x2 r2557, %101, %111; +} +{ +mul.f16x2 r2560, r2557, r2886; +} +{ +add.f16x2 r2563, r2554, r2560; +} +{ +sub.f16x2 r2566, %104, %100; +} +{ +mul.f16x2 r2569, r2566, r2895; +} +{ +sub.f16x2 r2572, %109, %103; +} +{ +mul.f16x2 r2575, r2572, r2901; +} +{ +add.f16x2 r2578, r2569, r2575; +} +{ +sub.f16x2 r2581, %99, %108; +} +{ +mul.f16x2 r2584, r2581, r2910; +} +{ +add.f16x2 r2587, r2578, r2584; +} +{ +add.f16x2 r2590, r2563, r2587; +} +{ +add.f16x2 r2593, %104, %100; +} +{ +mul.f16x2 r2596, r2593, r2652; +} +{ +add.f16x2 r2599, %105, r2596; +} +{ +add.f16x2 r2602, %109, %103; +} +{ +mul.f16x2 r2605, r2602, r2886; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +add.f16x2 r2611, %99, %108; +} +{ +mul.f16x2 r2614, r2611, r2868; +} +{ +add.f16x2 r2617, r2608, r2614; +} +{ +sub.f16x2 r2620, %107, %102; +} +{ +mul.f16x2 r2623, r2620, r2679; +} +{ 
+sub.f16x2 r2626, %98, %106; +} +{ +mul.f16x2 r2629, r2626, r2910; +} +{ +add.f16x2 r2632, r2623, r2629; +} +{ +sub.f16x2 r2635, %101, %111; +} +{ +mul.f16x2 r2638, r2635, r2895; +} +{ +add.f16x2 r2641, r2632, r2638; +} +{ +add.f16x2 r2644, r2617, r2641; +} +{ +add.f16x2 r2647, %104, %100; +} +{ +mul.f16x2 r2650, r2647, r2652; +} +{ +add.f16x2 r2653, %105, r2650; +} +{ +add.f16x2 r2656, %109, %103; +} +{ +mul.f16x2 r2659, r2656, r2886; +} +{ +add.f16x2 r2662, r2653, r2659; +} +{ +add.f16x2 r2665, %99, %108; +} +{ +mul.f16x2 r2668, r2665, r2868; +} +{ +add.f16x2 r2671, r2662, r2668; +} +{ +sub.f16x2 r2674, %107, %102; +} +{ +mul.f16x2 r2677, r2674, r2679; +} +{ +sub.f16x2 r2680, %98, %106; +} +{ +mul.f16x2 r2683, r2680, r2910; +} +{ +add.f16x2 r2686, r2677, r2683; +} +{ +sub.f16x2 r2689, %101, %111; +} +{ +mul.f16x2 r2692, r2689, r2895; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2671, r2695; +} +{ +add.f16x2 r2701, %104, %100; +} +{ +mul.f16x2 r2704, r2701, r2886; +} +{ +add.f16x2 r2707, %105, r2704; +} +{ +add.f16x2 r2710, %109, %103; +} +{ +mul.f16x2 r2713, r2710, r2769; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +add.f16x2 r2719, %99, %108; +} +{ +mul.f16x2 r2722, r2719, r2877; +} +{ +add.f16x2 r2725, r2716, r2722; +} +{ +sub.f16x2 r2728, %107, %102; +} +{ +mul.f16x2 r2731, r2728, r2910; +} +{ +sub.f16x2 r2734, %98, %106; +} +{ +mul.f16x2 r2737, r2734, r2793; +} +{ +add.f16x2 r2740, r2731, r2737; +} +{ +sub.f16x2 r2743, %101, %111; +} +{ +mul.f16x2 r2746, r2743, r2901; +} +{ +add.f16x2 r2749, r2740, r2746; +} +{ +add.f16x2 r2752, r2725, r2749; +} +{ +add.f16x2 r2755, %104, %100; +} +{ +mul.f16x2 r2758, r2755, r2886; +} +{ +add.f16x2 r2761, %105, r2758; +} +{ +add.f16x2 r2764, %109, %103; +} +{ +mul.f16x2 r2767, r2764, r2769; +} +{ +add.f16x2 r2770, r2761, r2767; +} +{ +add.f16x2 r2773, %99, %108; +} +{ +mul.f16x2 r2776, r2773, r2877; +} +{ +add.f16x2 r2779, r2770, r2776; +} +{ +sub.f16x2 r2782, %107, %102; +} +{ +mul.f16x2 r2785, r2782, 
r2910; +} +{ +sub.f16x2 r2788, %98, %106; +} +{ +mul.f16x2 r2791, r2788, r2793; +} +{ +add.f16x2 r2794, r2785, r2791; +} +{ +sub.f16x2 r2797, %101, %111; +} +{ +mul.f16x2 r2800, r2797, r2901; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2779, r2803; +} +{ +add.f16x2 r2809, %104, %100; +} +{ +mul.f16x2 r2812, r2809, r2868; +} +{ +add.f16x2 r2815, %105, r2812; +} +{ +add.f16x2 r2818, %109, %103; +} +{ +mul.f16x2 r2821, r2818, r2877; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +add.f16x2 r2827, %99, %108; +} +{ +mul.f16x2 r2830, r2827, r2886; +} +{ +add.f16x2 r2833, r2824, r2830; +} +{ +sub.f16x2 r2836, %107, %102; +} +{ +mul.f16x2 r2839, r2836, r2895; +} +{ +sub.f16x2 r2842, %98, %106; +} +{ +mul.f16x2 r2845, r2842, r2901; +} +{ +add.f16x2 r2848, r2839, r2845; +} +{ +sub.f16x2 r2851, %101, %111; +} +{ +mul.f16x2 r2854, r2851, r2910; +} +{ +add.f16x2 r2857, r2848, r2854; +} +{ +add.f16x2 r2860, r2833, r2857; +} +{ +add.f16x2 r2863, %104, %100; +} +{ +mul.f16x2 r2866, r2863, r2868; +} +{ +add.f16x2 r2869, %105, r2866; +} +{ +add.f16x2 r2872, %109, %103; +} +{ +mul.f16x2 r2875, r2872, r2877; +} +{ +add.f16x2 r2878, r2869, r2875; +} +{ +add.f16x2 r2881, %99, %108; +} +{ +mul.f16x2 r2884, r2881, r2886; +} +{ +add.f16x2 r2887, r2878, r2884; +} +{ +sub.f16x2 r2890, %107, %102; +} +{ +mul.f16x2 r2893, r2890, r2895; +} +{ +sub.f16x2 r2896, %98, %106; +} +{ +mul.f16x2 r2899, r2896, r2901; +} +{ +add.f16x2 r2902, r2893, r2899; +} +{ +sub.f16x2 r2905, %101, %111; +} +{ +mul.f16x2 r2908, r2905, r2910; +} +{ +add.f16x2 r2911, r2902, r2908; +} +{ +sub.f16x2 r2914, r2887, r2911; +} +{ +cvt.rn.f16.f64 rs83, fd95; +} +{ +cvt.rn.f16.f64 rs84, fd116; +} +{ +cvt.rn.f16.f64 rs85, fd99; +} +{ +cvt.rn.f16.f64 rs86, fd113; +} +{ +cvt.rn.f16.f64 rs87, fd103; +} +{ +cvt.rn.f16.f64 rs88, fd117; +} +{ +cvt.rn.f16.f64 rs89, fd118; +} +{ +cvt.rn.f16.f64 rs90, fd117; +} +{ +cvt.rn.f16.f64 rs91, fd114; +} +{ +cvt.rn.f16.f64 rs92, fd113; +} +{ +cvt.rn.f16.f64 rs93, fd115; +} +{ 
+cvt.rn.f16.f64 rs94, fd116; +} +mov.b32 r2931, {rs83, rs83}; +{ +mul.f16x2 r2917, r2320, r2931; +} +mov.b32 r2928, {rs84, rs84}; +{ +mul.f16x2 r2920, r2644, r2928; +} +{ +sub.f16x2 r2923, r2917, r2920; +} +{ +mul.f16x2 r2926, r2320, r2928; +} +{ +fma.rn.f16x2 r2929, r2644, r2931, r2926; +} +mov.b32 r2947, {rs85, rs85}; +{ +mul.f16x2 r2933, r2428, r2947; +} +mov.b32 r2944, {rs86, rs86}; +{ +mul.f16x2 r2936, r2752, r2944; +} +{ +sub.f16x2 r2939, r2933, r2936; +} +{ +mul.f16x2 r2942, r2428, r2944; +} +{ +fma.rn.f16x2 r2945, r2752, r2947, r2942; +} +mov.b32 r2963, {rs87, rs87}; +{ +mul.f16x2 r2949, r2536, r2963; +} +mov.b32 r2960, {rs88, rs88}; +{ +mul.f16x2 r2952, r2860, r2960; +} +{ +sub.f16x2 r2955, r2949, r2952; +} +{ +mul.f16x2 r2958, r2536, r2960; +} +{ +fma.rn.f16x2 r2961, r2860, r2963, r2958; +} +mov.b32 r2979, {rs89, rs89}; +{ +mul.f16x2 r2965, r2590, r2979; +} +mov.b32 r2976, {rs90, rs90}; +{ +mul.f16x2 r2968, r2914, r2976; +} +{ +sub.f16x2 r2971, r2965, r2968; +} +{ +mul.f16x2 r2974, r2590, r2976; +} +{ +fma.rn.f16x2 r2977, r2914, r2979, r2974; +} +mov.b32 r2995, {rs91, rs91}; +{ +mul.f16x2 r2981, r2482, r2995; +} +mov.b32 r2992, {rs92, rs92}; +{ +mul.f16x2 r2984, r2806, r2992; +} +{ +sub.f16x2 r2987, r2981, r2984; +} +{ +mul.f16x2 r2990, r2482, r2992; +} +{ +fma.rn.f16x2 r2993, r2806, r2995, r2990; +} +mov.b32 r3011, {rs93, rs93}; +{ +mul.f16x2 r2997, r2374, r3011; +} +mov.b32 r3008, {rs94, rs94}; +mov.f64 fd149, 0d3FDBC4C04D71ABC1; +mov.f64 fd148, 0d3FE904C37505DE4B; +mov.f64 fd147, 0d3FEF329C0558E969; +{ +mul.f16x2 r3000, r2698, r3008; +} +{ +sub.f16x2 r3003, r2997, r3000; +} +{ +mul.f16x2 r3006, r2374, r3008; +} +{ +fma.rn.f16x2 r3009, r2698, r3011, r3006; +} +{ +add.f16x2 r3013, r1564, r2248; +} +{ +add.f16x2 r3016, r1582, r2266; +} +{ +sub.f16x2 r3019, r1564, r2248; +} +{ +sub.f16x2 r3022, r1582, r2266; +} +{ +add.f16x2 r3025, r1636, r2923; +} +{ +add.f16x2 r3028, r1960, r2929; +} +{ +sub.f16x2 r3031, r1636, r2923; +} +{ +sub.f16x2 r3034, r1960, 
r2929; +} +{ +add.f16x2 r3037, r1744, r2939; +} +{ +add.f16x2 r3040, r2068, r2945; +} +{ +sub.f16x2 r3043, r1744, r2939; +} +{ +sub.f16x2 r3046, r2068, r2945; +} +{ +add.f16x2 r3049, r1852, r2955; +} +{ +add.f16x2 r3052, r2176, r2961; +} +{ +sub.f16x2 r3055, r1852, r2955; +} +{ +sub.f16x2 r3058, r2176, r2961; +} +{ +add.f16x2 r3061, r1906, r2971; +} +{ +add.f16x2 r3064, r2230, r2977; +} +{ +sub.f16x2 r3067, r1906, r2971; +} +{ +sub.f16x2 r3070, r2230, r2977; +} +{ +add.f16x2 r3073, r1798, r2987; +} +{ +add.f16x2 r3076, r2122, r2993; +} +{ +sub.f16x2 r3079, r1798, r2987; +} +{ +sub.f16x2 r3082, r2122, r2993; +} +{ +add.f16x2 r3085, r1690, r3003; +} +{ +add.f16x2 r3088, r2014, r3009; +} +{ +sub.f16x2 r3091, r1690, r3003; +} +{ +sub.f16x2 r3094, r2014, r3009; +} +{ +cvt.rn.f16.f64 rs109, fd147; +} +{ +cvt.rn.f16.f64 rs110, fd118; +} +{ +cvt.rn.f16.f64 rs111, fd95; +} +{ +cvt.rn.f16.f64 rs112, fd116; +} +{ +cvt.rn.f16.f64 rs113, fd148; +} +{ +cvt.rn.f16.f64 rs114, fd114; +} +{ +cvt.rn.f16.f64 rs115, fd99; +} +{ +cvt.rn.f16.f64 rs116, fd113; +} +{ +cvt.rn.f16.f64 rs117, fd149; +} +{ +cvt.rn.f16.f64 rs118, fd115; +} +{ +cvt.rn.f16.f64 rs119, fd103; +} +{ +cvt.rn.f16.f64 rs120, fd117; +} +{ +cvt.rn.f16.f64 rs123, fd118; +} +{ +cvt.rn.f16.f64 rs124, fd117; +} +{ +cvt.rn.f16.f64 rs125, fd116; +} +{ +cvt.rn.f16.f64 rs126, fd115; +} +{ +cvt.rn.f16.f64 rs127, fd114; +} +{ +cvt.rn.f16.f64 rs128, fd113; +} +{ +cvt.rn.f16.f64 rs129, fd113; +} +{ +cvt.rn.f16.f64 rs130, fd114; +} +{ +cvt.rn.f16.f64 rs131, fd115; +} +{ +cvt.rn.f16.f64 rs132, fd116; +} +{ +cvt.rn.f16.f64 rs133, fd117; +} +{ +cvt.rn.f16.f64 rs134, fd118; +} +mov.b32 r3111, {rs109, rs109}; +{ +mul.f16x2 r3097, r3025, r3111; +} +mov.b32 r3108, {rs110, rs110}; +{ +mul.f16x2 r3100, r3028, r3108; +} +{ +sub.f16x2 r3103, r3097, r3100; +} +{ +mul.f16x2 r3106, r3025, r3108; +} +{ +fma.rn.f16x2 r3109, r3028, r3111, r3106; +} +mov.b32 r3127, {rs111, rs111}; +{ +mul.f16x2 r3113, r3037, r3127; +} +mov.b32 r3124, {rs112, rs112}; 
+{ +mul.f16x2 r3116, r3040, r3124; +} +{ +sub.f16x2 r3119, r3113, r3116; +} +{ +mul.f16x2 r3122, r3037, r3124; +} +{ +fma.rn.f16x2 r3125, r3040, r3127, r3122; +} +mov.b32 r3143, {rs113, rs113}; +{ +mul.f16x2 r3129, r3049, r3143; +} +mov.b32 r3140, {rs114, rs114}; +{ +mul.f16x2 r3132, r3052, r3140; +} +{ +sub.f16x2 r3135, r3129, r3132; +} +{ +mul.f16x2 r3138, r3049, r3140; +} +{ +fma.rn.f16x2 r3141, r3052, r3143, r3138; +} +mov.b32 r3159, {rs115, rs115}; +{ +mul.f16x2 r3145, r3061, r3159; +} +mov.b32 r3156, {rs116, rs116}; +{ +mul.f16x2 r3148, r3064, r3156; +} +{ +sub.f16x2 r3151, r3145, r3148; +} +{ +mul.f16x2 r3154, r3061, r3156; +} +{ +fma.rn.f16x2 r3157, r3064, r3159, r3154; +} +mov.b32 r3175, {rs117, rs117}; +{ +mul.f16x2 r3161, r3073, r3175; +} +mov.b32 r3172, {rs118, rs118}; +{ +mul.f16x2 r3164, r3076, r3172; +} +{ +sub.f16x2 r3167, r3161, r3164; +} +{ +mul.f16x2 r3170, r3073, r3172; +} +{ +fma.rn.f16x2 r3173, r3076, r3175, r3170; +} +mov.b32 r3191, {rs119, rs119}; +{ +mul.f16x2 r3177, r3085, r3191; +} +mov.b32 r3188, {rs120, rs120}; +{ +mul.f16x2 r3180, r3088, r3188; +} +{ +sub.f16x2 r3183, r3177, r3180; +} +{ +mul.f16x2 r3186, r3085, r3188; +} +{ +fma.rn.f16x2 r3189, r3088, r3191, r3186; +} +{ +neg.f16x2 r3193, r3019; +} +mov.b32 r3209, {rs123, rs123}; +{ +mul.f16x2 r3195, r3031, r3209; +} +mov.b32 r3206, {rs124, rs124}; +{ +mul.f16x2 r3198, r3034, r3206; +} +{ +sub.f16x2 r3201, r3195, r3198; +} +{ +mul.f16x2 r3204, r3031, r3206; +} +{ +fma.rn.f16x2 r3207, r3034, r3209, r3204; +} +mov.b32 r3225, {rs125, rs125}; +{ +mul.f16x2 r3211, r3043, r3225; +} +mov.b32 r3222, {rs126, rs126}; +{ +mul.f16x2 r3214, r3046, r3222; +} +{ +sub.f16x2 r3217, r3211, r3214; +} +{ +mul.f16x2 r3220, r3043, r3222; +} +{ +fma.rn.f16x2 r3223, r3046, r3225, r3220; +} +mov.b32 r3241, {rs127, rs127}; +{ +mul.f16x2 r3227, r3055, r3241; +} +mov.b32 r3238, {rs128, rs128}; +{ +mul.f16x2 r3230, r3058, r3238; +} +{ +sub.f16x2 r3233, r3227, r3230; +} +{ +mul.f16x2 r3236, r3055, r3238; +} +{ 
+fma.rn.f16x2 r3239, r3058, r3241, r3236; +} +mov.b32 r3257, {rs129, rs129}; +{ +mul.f16x2 r3243, r3067, r3257; +} +mov.b32 r3254, {rs130, rs130}; +{ +mul.f16x2 r3246, r3070, r3254; +} +{ +sub.f16x2 r3249, r3243, r3246; +} +{ +mul.f16x2 r3252, r3067, r3254; +} +{ +fma.rn.f16x2 r3255, r3070, r3257, r3252; +} +mov.b32 r3273, {rs131, rs131}; +{ +mul.f16x2 r3259, r3079, r3273; +} +mov.b32 r3270, {rs132, rs132}; +{ +mul.f16x2 r3262, r3082, r3270; +} +{ +sub.f16x2 r3265, r3259, r3262; +} +{ +mul.f16x2 r3268, r3079, r3270; +} +{ +fma.rn.f16x2 r3271, r3082, r3273, r3268; +} +mov.b32 r3289, {rs133, rs133}; +{ +mul.f16x2 r3275, r3091, r3289; +} +mov.b32 r3286, {rs134, rs134}; +{ +mul.f16x2 r3278, r3094, r3286; +} +{ +sub.f16x2 r3281, r3275, r3278; +} +{ +mul.f16x2 r3284, r3091, r3286; +} +{ +fma.rn.f16x2 r3287, r3094, r3289, r3284; +} +{ +add.f16x2 %0, r1465, r3013; +} +{ +add.f16x2 %1, r1468, r3016; +} +{ +sub.f16x2 %28, r1465, r3013; +} +{ +sub.f16x2 %29, r1468, r3016; +} +{ +add.f16x2 %2, r1477, r3103; +} +{ +add.f16x2 %3, r1480, r3109; +} +{ +sub.f16x2 %30, r1477, r3103; +} +{ +sub.f16x2 %31, r1480, r3109; +} +{ +add.f16x2 %4, r1489, r3119; +} +{ +add.f16x2 %5, r1492, r3125; +} +{ +sub.f16x2 %32, r1489, r3119; +} +{ +sub.f16x2 %33, r1492, r3125; +} +{ +add.f16x2 %6, r1501, r3135; +} +{ +add.f16x2 %7, r1504, r3141; +} +{ +sub.f16x2 %34, r1501, r3135; +} +{ +sub.f16x2 %35, r1504, r3141; +} +{ +add.f16x2 %8, r1513, r3151; +} +{ +add.f16x2 %9, r1516, r3157; +} +{ +sub.f16x2 %36, r1513, r3151; +} +{ +sub.f16x2 %37, r1516, r3157; +} +{ +add.f16x2 %10, r1525, r3167; +} +{ +add.f16x2 %11, r1528, r3173; +} +{ +sub.f16x2 %38, r1525, r3167; +} +{ +sub.f16x2 %39, r1528, r3173; +} +{ +add.f16x2 %12, r1537, r3183; +} +{ +add.f16x2 %13, r1540, r3189; +} +{ +sub.f16x2 %40, r1537, r3183; +} +{ +sub.f16x2 %41, r1540, r3189; +} +{ +add.f16x2 %14, r1471, r3022; +} +{ +add.f16x2 %15, r1474, r3193; +} +{ +sub.f16x2 %42, r1471, r3022; +} +{ +sub.f16x2 %43, r1474, r3193; +} +{ +add.f16x2 %16, 
r1483, r3201; +} +{ +add.f16x2 %17, r1486, r3207; +} +{ +sub.f16x2 %44, r1483, r3201; +} +{ +sub.f16x2 %45, r1486, r3207; +} +{ +add.f16x2 %18, r1495, r3217; +} +{ +add.f16x2 %19, r1498, r3223; +} +{ +sub.f16x2 %46, r1495, r3217; +} +{ +sub.f16x2 %47, r1498, r3223; +} +{ +add.f16x2 %20, r1507, r3233; +} +{ +add.f16x2 %21, r1510, r3239; +} +{ +sub.f16x2 %48, r1507, r3233; +} +{ +sub.f16x2 %49, r1510, r3239; +} +{ +add.f16x2 %22, r1519, r3249; +} +{ +add.f16x2 %23, r1522, r3255; +} +{ +sub.f16x2 %50, r1519, r3249; +} +{ +sub.f16x2 %51, r1522, r3255; +} +{ +add.f16x2 %24, r1531, r3265; +} +{ +add.f16x2 %25, r1534, r3271; +} +{ +sub.f16x2 %52, r1531, r3265; +} +{ +sub.f16x2 %53, r1534, r3271; +} +{ +add.f16x2 %26, r1543, r3281; +} +{ +add.f16x2 %27, r1546, r3287; +} +{ +sub.f16x2 %54, r1543, r3281; +} +{ +sub.f16x2 %55, r1546, r3287; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), 
"=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)): "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[25].y)), 
"r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[19].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..bb37b502e2024 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp16_inv.hpp.inc @@ -0,0 +1,3853 @@ +#ifndef CUFFTDX_FFT_28_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_28_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<960, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<171>; +.reg .b32 r<3515>; +.reg .f64 fd<147>; +.reg .b64 rd<3>; +mov.f64 fd114, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd114; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd113, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd113; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r447, {rs3, rs3}; +mov.f64 fd107, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs5, fd107; +} +mov.b32 r654, {rs5, rs5}; +mov.f64 fd117, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs6, fd117; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r678, {rs7, rs7}; +mov.f64 fd115, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs9, fd115; +} +mov.b32 r636, {rs9, rs9}; +mov.f64 fd109, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs10, fd109; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r663, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs13, 
fd115; +} +mov.b32 r537, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd109; +} +mov.b32 r561, {rs14, rs14}; +{ +cvt.rn.f16.f64 rs15, fd114; +} +mov.b32 r645, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd113; +} +mov.b32 r669, {rs16, rs16}; +{ +add.f16x2 r1, %58, %67; +} +{ +add.f16x2 r4, %61, r1; +} +{ +add.f16x2 r7, %63, %57; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %66, %62; +} +{ +add.f16x2 r16, r10, r13; +} +{ +add.f16x2 r19, %69, %65; +} +{ +add.f16x2 r22, %56, r19; +} +{ +add.f16x2 r25, %60, %68; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %64, %59; +} +{ +add.f16x2 r34, r28, r31; +} +{ +add.f16x2 r37, %58, %67; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %61, r40; +} +{ +add.f16x2 r46, %63, %57; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %66, %62; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %69, %65; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %60, %68; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %64, %59; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 r88, r61, r85; +} +{ +add.f16x2 r91, %58, %67; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %61, r94; +} +{ +add.f16x2 r100, %63, %57; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %66, %62; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %69, %65; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %60, %68; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %64, %59; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 r142, r115, r139; +} +{ +add.f16x2 r145, %58, %67; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %61, r148; +} +{ +add.f16x2 r154, %63, %57; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, 
r157; +} +{ +add.f16x2 r163, %66, %62; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %69, %65; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %60, %68; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %64, %59; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 r196, r169, r193; +} +{ +add.f16x2 r199, %58, %67; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %61, r202; +} +{ +add.f16x2 r208, %63, %57; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %66, %62; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %69, %65; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %60, %68; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %64, %59; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 r250, r223, r247; +} +{ +add.f16x2 r253, %58, %67; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %61, r256; +} +{ +add.f16x2 r262, %63, %57; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %66, %62; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %69, %65; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %60, %68; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %64, %59; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 r304, r277, r301; +} +{ +add.f16x2 r307, %58, %67; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %61, r310; +} +{ +add.f16x2 r316, %63, %57; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %66, %62; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %69, %65; +} +{ 
+mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %60, %68; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %64, %59; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 r358, r331, r355; +} +{ +add.f16x2 r361, %69, %65; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %56, r364; +} +{ +add.f16x2 r370, %60, %68; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %64, %59; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %58, %67; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %63, %57; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %66, %62; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 r412, r385, r409; +} +{ +add.f16x2 r415, %69, %65; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %56, r418; +} +{ +add.f16x2 r424, %60, %68; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %64, %59; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %58, %67; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %63, %57; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %66, %62; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 r466, r439, r463; +} +{ +add.f16x2 r469, %69, %65; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %56, r472; +} +{ +add.f16x2 r478, %60, %68; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %64, %59; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %58, %67; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %63, %57; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 
r511, %66, %62; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 r520, r493, r517; +} +{ +add.f16x2 r523, %69, %65; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %56, r526; +} +{ +add.f16x2 r532, %60, %68; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %64, %59; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %58, %67; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %63, %57; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %66, %62; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 r574, r547, r571; +} +{ +add.f16x2 r577, %69, %65; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %56, r580; +} +{ +add.f16x2 r586, %60, %68; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %64, %59; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %58, %67; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %63, %57; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %66, %62; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 r628, r601, r625; +} +{ +add.f16x2 r631, %69, %65; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %56, r634; +} +{ +add.f16x2 r640, %60, %68; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %64, %59; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %58, %67; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %63, %57; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %66, %62; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r655, r679; +} +{ +cvt.rn.f16.f64 rs17, 
fd114; +} +mov.b32 r1104, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd113; +} +{ +neg.f16 rs19, rs18; +} +mov.b32 r1131, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs21, fd107; +} +mov.b32 r1338, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd117; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r1362, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd115; +} +mov.b32 r1320, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd109; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r1347, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd115; +} +mov.b32 r1221, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd109; +} +mov.b32 r1245, {rs30, rs30}; +{ +cvt.rn.f16.f64 rs31, fd114; +} +mov.b32 r1329, {rs31, rs31}; +{ +cvt.rn.f16.f64 rs32, fd113; +} +mov.b32 r1353, {rs32, rs32}; +{ +add.f16x2 r685, %82, %77; +} +{ +add.f16x2 r688, %71, r685; +} +{ +add.f16x2 r691, %73, %81; +} +{ +add.f16x2 r694, r688, r691; +} +{ +add.f16x2 r697, %76, %72; +} +{ +add.f16x2 r700, r694, r697; +} +{ +add.f16x2 r703, %79, %75; +} +{ +add.f16x2 r706, %80, r703; +} +{ +add.f16x2 r709, %70, %78; +} +{ +add.f16x2 r712, r706, r709; +} +{ +add.f16x2 r715, %74, %83; +} +{ +add.f16x2 r718, r712, r715; +} +{ +add.f16x2 r721, %82, %77; +} +{ +mul.f16x2 r724, r721, r1104; +} +{ +add.f16x2 r727, %71, r724; +} +{ +add.f16x2 r730, %73, %81; +} +{ +mul.f16x2 r733, r730, r1338; +} +{ +add.f16x2 r736, r727, r733; +} +{ +add.f16x2 r739, %76, %72; +} +{ +mul.f16x2 r742, r739, r1320; +} +{ +add.f16x2 r745, r736, r742; +} +{ +sub.f16x2 r748, %79, %75; +} +{ +mul.f16x2 r751, r748, r1131; +} +{ +sub.f16x2 r754, %70, %78; +} +{ +mul.f16x2 r757, r754, r1362; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %83; +} +{ +mul.f16x2 r766, r763, r1347; +} +{ +add.f16x2 r769, r760, r766; +} +{ +sub.f16x2 r772, r745, r769; +} +{ +add.f16x2 r775, %82, %77; +} +{ +mul.f16x2 r778, r775, r1104; +} +{ +add.f16x2 r781, %71, r778; +} +{ +add.f16x2 r784, %73, %81; +} +{ +mul.f16x2 r787, r784, r1338; +} +{ +add.f16x2 r790, r781, r787; +} +{ +add.f16x2 r793, %76, %72; +} +{ +mul.f16x2 r796, 
r793, r1320; +} +{ +add.f16x2 r799, r790, r796; +} +{ +sub.f16x2 r802, %79, %75; +} +{ +mul.f16x2 r805, r802, r1131; +} +{ +sub.f16x2 r808, %70, %78; +} +{ +mul.f16x2 r811, r808, r1362; +} +{ +add.f16x2 r814, r805, r811; +} +{ +sub.f16x2 r817, %74, %83; +} +{ +mul.f16x2 r820, r817, r1347; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r799, r823; +} +{ +add.f16x2 r829, %82, %77; +} +{ +mul.f16x2 r832, r829, r1338; +} +{ +add.f16x2 r835, %71, r832; +} +{ +add.f16x2 r838, %73, %81; +} +{ +mul.f16x2 r841, r838, r1221; +} +{ +add.f16x2 r844, r835, r841; +} +{ +add.f16x2 r847, %76, %72; +} +{ +mul.f16x2 r850, r847, r1329; +} +{ +add.f16x2 r853, r844, r850; +} +{ +sub.f16x2 r856, %79, %75; +} +{ +mul.f16x2 r859, r856, r1362; +} +{ +sub.f16x2 r862, %70, %78; +} +{ +mul.f16x2 r865, r862, r1245; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %74, %83; +} +{ +mul.f16x2 r874, r871, r1353; +} +{ +add.f16x2 r877, r868, r874; +} +{ +sub.f16x2 r880, r853, r877; +} +{ +add.f16x2 r883, %82, %77; +} +{ +mul.f16x2 r886, r883, r1338; +} +{ +add.f16x2 r889, %71, r886; +} +{ +add.f16x2 r892, %73, %81; +} +{ +mul.f16x2 r895, r892, r1221; +} +{ +add.f16x2 r898, r889, r895; +} +{ +add.f16x2 r901, %76, %72; +} +{ +mul.f16x2 r904, r901, r1329; +} +{ +add.f16x2 r907, r898, r904; +} +{ +sub.f16x2 r910, %79, %75; +} +{ +mul.f16x2 r913, r910, r1362; +} +{ +sub.f16x2 r916, %70, %78; +} +{ +mul.f16x2 r919, r916, r1245; +} +{ +add.f16x2 r922, r913, r919; +} +{ +sub.f16x2 r925, %74, %83; +} +{ +mul.f16x2 r928, r925, r1353; +} +{ +add.f16x2 r931, r922, r928; +} +{ +add.f16x2 r934, r907, r931; +} +{ +add.f16x2 r937, %82, %77; +} +{ +mul.f16x2 r940, r937, r1320; +} +{ +add.f16x2 r943, %71, r940; +} +{ +add.f16x2 r946, %73, %81; +} +{ +mul.f16x2 r949, r946, r1329; +} +{ +add.f16x2 r952, r943, r949; +} +{ +add.f16x2 r955, %76, %72; +} +{ +mul.f16x2 r958, r955, r1338; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, %79, %75; +} +{ +mul.f16x2 r967, r964, r1347; +} +{ 
+sub.f16x2 r970, %70, %78; +} +{ +mul.f16x2 r973, r970, r1353; +} +{ +add.f16x2 r976, r967, r973; +} +{ +sub.f16x2 r979, %74, %83; +} +{ +mul.f16x2 r982, r979, r1362; +} +{ +add.f16x2 r985, r976, r982; +} +{ +sub.f16x2 r988, r961, r985; +} +{ +add.f16x2 r991, %82, %77; +} +{ +mul.f16x2 r994, r991, r1320; +} +{ +add.f16x2 r997, %71, r994; +} +{ +add.f16x2 r1000, %73, %81; +} +{ +mul.f16x2 r1003, r1000, r1329; +} +{ +add.f16x2 r1006, r997, r1003; +} +{ +add.f16x2 r1009, %76, %72; +} +{ +mul.f16x2 r1012, r1009, r1338; +} +{ +add.f16x2 r1015, r1006, r1012; +} +{ +sub.f16x2 r1018, %79, %75; +} +{ +mul.f16x2 r1021, r1018, r1347; +} +{ +sub.f16x2 r1024, %70, %78; +} +{ +mul.f16x2 r1027, r1024, r1353; +} +{ +add.f16x2 r1030, r1021, r1027; +} +{ +sub.f16x2 r1033, %74, %83; +} +{ +mul.f16x2 r1036, r1033, r1362; +} +{ +add.f16x2 r1039, r1030, r1036; +} +{ +add.f16x2 r1042, r1015, r1039; +} +{ +add.f16x2 r1045, %79, %75; +} +{ +mul.f16x2 r1048, r1045, r1104; +} +{ +add.f16x2 r1051, %80, r1048; +} +{ +add.f16x2 r1054, %70, %78; +} +{ +mul.f16x2 r1057, r1054, r1338; +} +{ +add.f16x2 r1060, r1051, r1057; +} +{ +add.f16x2 r1063, %74, %83; +} +{ +mul.f16x2 r1066, r1063, r1320; +} +{ +add.f16x2 r1069, r1060, r1066; +} +{ +sub.f16x2 r1072, %82, %77; +} +{ +mul.f16x2 r1075, r1072, r1131; +} +{ +sub.f16x2 r1078, %73, %81; +} +{ +mul.f16x2 r1081, r1078, r1362; +} +{ +add.f16x2 r1084, r1075, r1081; +} +{ +sub.f16x2 r1087, %76, %72; +} +{ +mul.f16x2 r1090, r1087, r1347; +} +{ +add.f16x2 r1093, r1084, r1090; +} +{ +add.f16x2 r1096, r1069, r1093; +} +{ +add.f16x2 r1099, %79, %75; +} +{ +mul.f16x2 r1102, r1099, r1104; +} +{ +add.f16x2 r1105, %80, r1102; +} +{ +add.f16x2 r1108, %70, %78; +} +{ +mul.f16x2 r1111, r1108, r1338; +} +{ +add.f16x2 r1114, r1105, r1111; +} +{ +add.f16x2 r1117, %74, %83; +} +{ +mul.f16x2 r1120, r1117, r1320; +} +{ +add.f16x2 r1123, r1114, r1120; +} +{ +sub.f16x2 r1126, %82, %77; +} +{ +mul.f16x2 r1129, r1126, r1131; +} +{ +sub.f16x2 r1132, %73, %81; +} +{ +mul.f16x2 
r1135, r1132, r1362; +} +{ +add.f16x2 r1138, r1129, r1135; +} +{ +sub.f16x2 r1141, %76, %72; +} +{ +mul.f16x2 r1144, r1141, r1347; +} +{ +add.f16x2 r1147, r1138, r1144; +} +{ +sub.f16x2 r1150, r1123, r1147; +} +{ +add.f16x2 r1153, %79, %75; +} +{ +mul.f16x2 r1156, r1153, r1338; +} +{ +add.f16x2 r1159, %80, r1156; +} +{ +add.f16x2 r1162, %70, %78; +} +{ +mul.f16x2 r1165, r1162, r1221; +} +{ +add.f16x2 r1168, r1159, r1165; +} +{ +add.f16x2 r1171, %74, %83; +} +{ +mul.f16x2 r1174, r1171, r1329; +} +{ +add.f16x2 r1177, r1168, r1174; +} +{ +sub.f16x2 r1180, %82, %77; +} +{ +mul.f16x2 r1183, r1180, r1362; +} +{ +sub.f16x2 r1186, %73, %81; +} +{ +mul.f16x2 r1189, r1186, r1245; +} +{ +add.f16x2 r1192, r1183, r1189; +} +{ +sub.f16x2 r1195, %76, %72; +} +{ +mul.f16x2 r1198, r1195, r1353; +} +{ +add.f16x2 r1201, r1192, r1198; +} +{ +add.f16x2 r1204, r1177, r1201; +} +{ +add.f16x2 r1207, %79, %75; +} +{ +mul.f16x2 r1210, r1207, r1338; +} +{ +add.f16x2 r1213, %80, r1210; +} +{ +add.f16x2 r1216, %70, %78; +} +{ +mul.f16x2 r1219, r1216, r1221; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %74, %83; +} +{ +mul.f16x2 r1228, r1225, r1329; +} +{ +add.f16x2 r1231, r1222, r1228; +} +{ +sub.f16x2 r1234, %82, %77; +} +{ +mul.f16x2 r1237, r1234, r1362; +} +{ +sub.f16x2 r1240, %73, %81; +} +{ +mul.f16x2 r1243, r1240, r1245; +} +{ +add.f16x2 r1246, r1237, r1243; +} +{ +sub.f16x2 r1249, %76, %72; +} +{ +mul.f16x2 r1252, r1249, r1353; +} +{ +add.f16x2 r1255, r1246, r1252; +} +{ +sub.f16x2 r1258, r1231, r1255; +} +{ +add.f16x2 r1261, %79, %75; +} +{ +mul.f16x2 r1264, r1261, r1320; +} +{ +add.f16x2 r1267, %80, r1264; +} +{ +add.f16x2 r1270, %70, %78; +} +{ +mul.f16x2 r1273, r1270, r1329; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +add.f16x2 r1279, %74, %83; +} +{ +mul.f16x2 r1282, r1279, r1338; +} +{ +add.f16x2 r1285, r1276, r1282; +} +{ +sub.f16x2 r1288, %82, %77; +} +{ +mul.f16x2 r1291, r1288, r1347; +} +{ +sub.f16x2 r1294, %73, %81; +} +{ +mul.f16x2 r1297, r1294, r1353; +} 
+{ +add.f16x2 r1300, r1291, r1297; +} +{ +sub.f16x2 r1303, %76, %72; +} +{ +mul.f16x2 r1306, r1303, r1362; +} +{ +add.f16x2 r1309, r1300, r1306; +} +{ +add.f16x2 r1312, r1285, r1309; +} +{ +add.f16x2 r1315, %79, %75; +} +{ +mul.f16x2 r1318, r1315, r1320; +} +{ +add.f16x2 r1321, %80, r1318; +} +{ +add.f16x2 r1324, %70, %78; +} +{ +mul.f16x2 r1327, r1324, r1329; +} +{ +add.f16x2 r1330, r1321, r1327; +} +{ +add.f16x2 r1333, %74, %83; +} +{ +mul.f16x2 r1336, r1333, r1338; +} +{ +add.f16x2 r1339, r1330, r1336; +} +{ +sub.f16x2 r1342, %82, %77; +} +{ +mul.f16x2 r1345, r1342, r1347; +} +{ +sub.f16x2 r1348, %73, %81; +} +{ +mul.f16x2 r1351, r1348, r1353; +} +{ +add.f16x2 r1354, r1345, r1351; +} +{ +sub.f16x2 r1357, %76, %72; +} +{ +mul.f16x2 r1360, r1357, r1362; +} +{ +add.f16x2 r1363, r1354, r1360; +} +{ +sub.f16x2 r1366, r1339, r1363; +} +mov.f64 fd110, 0d3FECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs33, fd110; +} +mov.f64 fd116, 0d3FDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs34, fd116; +} +{ +cvt.rn.f16.f64 rs35, fd114; +} +mov.f64 fd112, 0d3FE904C37505DE4B; +{ +cvt.rn.f16.f64 rs36, fd112; +} +mov.f64 fd118, 0d3FCC7B90E3024582; +{ +cvt.rn.f16.f64 rs37, fd118; +} +mov.f64 fd108, 0d3FEF329C0558E969; +{ +cvt.rn.f16.f64 rs38, fd108; +} +{ +cvt.rn.f16.f64 rs39, fd107; +} +{ +cvt.rn.f16.f64 rs40, fd108; +} +mov.f64 fd111, 0dBFE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs41, fd111; +} +{ +cvt.rn.f16.f64 rs42, fd112; +} +{ +cvt.rn.f16.f64 rs43, fd115; +} +{ +cvt.rn.f16.f64 rs44, fd116; +} +mov.b32 r1383, {rs33, rs33}; +{ +mul.f16x2 r1369, r772, r1383; +} +mov.b32 r1380, {rs34, rs34}; +{ +mul.f16x2 r1372, r1096, r1380; +} +{ +sub.f16x2 r1375, r1369, r1372; +} +{ +mul.f16x2 r1378, r772, r1380; +} +{ +fma.rn.f16x2 r1381, r1096, r1383, r1378; +} +mov.b32 r1399, {rs35, rs35}; +{ +mul.f16x2 r1385, r880, r1399; +} +mov.b32 r1396, {rs36, rs36}; +{ +mul.f16x2 r1388, r1204, r1396; +} +{ +sub.f16x2 r1391, r1385, r1388; +} +{ +mul.f16x2 r1394, r880, r1396; +} +{ +fma.rn.f16x2 r1397, r1204, r1399, r1394; +} 
+mov.b32 r1415, {rs37, rs37}; +{ +mul.f16x2 r1401, r988, r1415; +} +mov.b32 r1412, {rs38, rs38}; +{ +mul.f16x2 r1404, r1312, r1412; +} +{ +sub.f16x2 r1407, r1401, r1404; +} +{ +mul.f16x2 r1410, r988, r1412; +} +{ +fma.rn.f16x2 r1413, r1312, r1415, r1410; +} +mov.b32 r1431, {rs39, rs39}; +{ +mul.f16x2 r1417, r1042, r1431; +} +mov.b32 r1428, {rs40, rs40}; +{ +mul.f16x2 r1420, r1366, r1428; +} +{ +sub.f16x2 r1423, r1417, r1420; +} +{ +mul.f16x2 r1426, r1042, r1428; +} +{ +fma.rn.f16x2 r1429, r1366, r1431, r1426; +} +mov.b32 r1447, {rs41, rs41}; +{ +mul.f16x2 r1433, r934, r1447; +} +mov.b32 r1444, {rs42, rs42}; +{ +mul.f16x2 r1436, r1258, r1444; +} +{ +sub.f16x2 r1439, r1433, r1436; +} +{ +mul.f16x2 r1442, r934, r1444; +} +{ +fma.rn.f16x2 r1445, r1258, r1447, r1442; +} +mov.b32 r1463, {rs43, rs43}; +{ +mul.f16x2 r1449, r826, r1463; +} +mov.b32 r1460, {rs44, rs44}; +{ +mul.f16x2 r1452, r1150, r1460; +} +{ +sub.f16x2 r1455, r1449, r1452; +} +{ +mul.f16x2 r1458, r826, r1460; +} +{ +fma.rn.f16x2 r1461, r1150, r1463, r1458; +} +{ +add.f16x2 r1465, r16, r700; +} +{ +add.f16x2 r1468, r34, r718; +} +{ +sub.f16x2 r1471, r16, r700; +} +{ +sub.f16x2 r1474, r34, r718; +} +{ +add.f16x2 r1477, r88, r1375; +} +{ +add.f16x2 r1480, r412, r1381; +} +{ +sub.f16x2 r1483, r88, r1375; +} +{ +sub.f16x2 r1486, r412, r1381; +} +{ +add.f16x2 r1489, r196, r1391; +} +{ +add.f16x2 r1492, r520, r1397; +} +{ +sub.f16x2 r1495, r196, r1391; +} +{ +sub.f16x2 r1498, r520, r1397; +} +{ +add.f16x2 r1501, r304, r1407; +} +{ +add.f16x2 r1504, r628, r1413; +} +{ +sub.f16x2 r1507, r304, r1407; +} +{ +sub.f16x2 r1510, r628, r1413; +} +{ +add.f16x2 r1513, r358, r1423; +} +{ +add.f16x2 r1516, r682, r1429; +} +{ +sub.f16x2 r1519, r358, r1423; +} +{ +sub.f16x2 r1522, r682, r1429; +} +{ +add.f16x2 r1525, r250, r1439; +} +{ +add.f16x2 r1528, r574, r1445; +} +{ +sub.f16x2 r1531, r250, r1439; +} +{ +sub.f16x2 r1534, r574, r1445; +} +{ +add.f16x2 r1537, r142, r1455; +} +{ +add.f16x2 r1540, r466, r1461; +} +{ +sub.f16x2 
r1543, r142, r1455; +} +{ +sub.f16x2 r1546, r466, r1461; +} +{ +cvt.rn.f16.f64 rs59, fd114; +} +mov.b32 r1968, {rs59, rs59}; +{ +cvt.rn.f16.f64 rs60, fd113; +} +{ +neg.f16 rs61, rs60; +} +mov.b32 r1995, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs63, fd107; +} +mov.b32 r2202, {rs63, rs63}; +{ +cvt.rn.f16.f64 rs64, fd117; +} +{ +neg.f16 rs65, rs64; +} +mov.b32 r2226, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs67, fd115; +} +mov.b32 r2184, {rs67, rs67}; +{ +cvt.rn.f16.f64 rs68, fd109; +} +{ +neg.f16 rs69, rs68; +} +mov.b32 r2211, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs71, fd115; +} +mov.b32 r2085, {rs71, rs71}; +{ +cvt.rn.f16.f64 rs72, fd109; +} +mov.b32 r2109, {rs72, rs72}; +{ +cvt.rn.f16.f64 rs73, fd114; +} +mov.b32 r2193, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd113; +} +mov.b32 r2217, {rs74, rs74}; +{ +add.f16x2 r1549, %91, %86; +} +{ +add.f16x2 r1552, %93, r1549; +} +{ +add.f16x2 r1555, %96, %90; +} +{ +add.f16x2 r1558, r1552, r1555; +} +{ +add.f16x2 r1561, %85, %95; +} +{ +add.f16x2 r1564, r1558, r1561; +} +{ +add.f16x2 r1567, %88, %84; +} +{ +add.f16x2 r1570, %89, r1567; +} +{ +add.f16x2 r1573, %94, %87; +} +{ +add.f16x2 r1576, r1570, r1573; +} +{ +add.f16x2 r1579, %97, %92; +} +{ +add.f16x2 r1582, r1576, r1579; +} +{ +add.f16x2 r1585, %91, %86; +} +{ +mul.f16x2 r1588, r1585, r1968; +} +{ +add.f16x2 r1591, %93, r1588; +} +{ +add.f16x2 r1594, %96, %90; +} +{ +mul.f16x2 r1597, r1594, r2202; +} +{ +add.f16x2 r1600, r1591, r1597; +} +{ +add.f16x2 r1603, %85, %95; +} +{ +mul.f16x2 r1606, r1603, r2184; +} +{ +add.f16x2 r1609, r1600, r1606; +} +{ +sub.f16x2 r1612, %88, %84; +} +{ +mul.f16x2 r1615, r1612, r1995; +} +{ +sub.f16x2 r1618, %94, %87; +} +{ +mul.f16x2 r1621, r1618, r2226; +} +{ +add.f16x2 r1624, r1615, r1621; +} +{ +sub.f16x2 r1627, %97, %92; +} +{ +mul.f16x2 r1630, r1627, r2211; +} +{ +add.f16x2 r1633, r1624, r1630; +} +{ +sub.f16x2 r1636, r1609, r1633; +} +{ +add.f16x2 r1639, %91, %86; +} +{ +mul.f16x2 r1642, r1639, r1968; +} +{ +add.f16x2 r1645, %93, r1642; +} +{ +add.f16x2 
r1648, %96, %90; +} +{ +mul.f16x2 r1651, r1648, r2202; +} +{ +add.f16x2 r1654, r1645, r1651; +} +{ +add.f16x2 r1657, %85, %95; +} +{ +mul.f16x2 r1660, r1657, r2184; +} +{ +add.f16x2 r1663, r1654, r1660; +} +{ +sub.f16x2 r1666, %88, %84; +} +{ +mul.f16x2 r1669, r1666, r1995; +} +{ +sub.f16x2 r1672, %94, %87; +} +{ +mul.f16x2 r1675, r1672, r2226; +} +{ +add.f16x2 r1678, r1669, r1675; +} +{ +sub.f16x2 r1681, %97, %92; +} +{ +mul.f16x2 r1684, r1681, r2211; +} +{ +add.f16x2 r1687, r1678, r1684; +} +{ +add.f16x2 r1690, r1663, r1687; +} +{ +add.f16x2 r1693, %91, %86; +} +{ +mul.f16x2 r1696, r1693, r2202; +} +{ +add.f16x2 r1699, %93, r1696; +} +{ +add.f16x2 r1702, %96, %90; +} +{ +mul.f16x2 r1705, r1702, r2085; +} +{ +add.f16x2 r1708, r1699, r1705; +} +{ +add.f16x2 r1711, %85, %95; +} +{ +mul.f16x2 r1714, r1711, r2193; +} +{ +add.f16x2 r1717, r1708, r1714; +} +{ +sub.f16x2 r1720, %88, %84; +} +{ +mul.f16x2 r1723, r1720, r2226; +} +{ +sub.f16x2 r1726, %94, %87; +} +{ +mul.f16x2 r1729, r1726, r2109; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +sub.f16x2 r1735, %97, %92; +} +{ +mul.f16x2 r1738, r1735, r2217; +} +{ +add.f16x2 r1741, r1732, r1738; +} +{ +sub.f16x2 r1744, r1717, r1741; +} +{ +add.f16x2 r1747, %91, %86; +} +{ +mul.f16x2 r1750, r1747, r2202; +} +{ +add.f16x2 r1753, %93, r1750; +} +{ +add.f16x2 r1756, %96, %90; +} +{ +mul.f16x2 r1759, r1756, r2085; +} +{ +add.f16x2 r1762, r1753, r1759; +} +{ +add.f16x2 r1765, %85, %95; +} +{ +mul.f16x2 r1768, r1765, r2193; +} +{ +add.f16x2 r1771, r1762, r1768; +} +{ +sub.f16x2 r1774, %88, %84; +} +{ +mul.f16x2 r1777, r1774, r2226; +} +{ +sub.f16x2 r1780, %94, %87; +} +{ +mul.f16x2 r1783, r1780, r2109; +} +{ +add.f16x2 r1786, r1777, r1783; +} +{ +sub.f16x2 r1789, %97, %92; +} +{ +mul.f16x2 r1792, r1789, r2217; +} +{ +add.f16x2 r1795, r1786, r1792; +} +{ +add.f16x2 r1798, r1771, r1795; +} +{ +add.f16x2 r1801, %91, %86; +} +{ +mul.f16x2 r1804, r1801, r2184; +} +{ +add.f16x2 r1807, %93, r1804; +} +{ +add.f16x2 r1810, %96, %90; +} +{ 
+mul.f16x2 r1813, r1810, r2193; +} +{ +add.f16x2 r1816, r1807, r1813; +} +{ +add.f16x2 r1819, %85, %95; +} +{ +mul.f16x2 r1822, r1819, r2202; +} +{ +add.f16x2 r1825, r1816, r1822; +} +{ +sub.f16x2 r1828, %88, %84; +} +{ +mul.f16x2 r1831, r1828, r2211; +} +{ +sub.f16x2 r1834, %94, %87; +} +{ +mul.f16x2 r1837, r1834, r2217; +} +{ +add.f16x2 r1840, r1831, r1837; +} +{ +sub.f16x2 r1843, %97, %92; +} +{ +mul.f16x2 r1846, r1843, r2226; +} +{ +add.f16x2 r1849, r1840, r1846; +} +{ +sub.f16x2 r1852, r1825, r1849; +} +{ +add.f16x2 r1855, %91, %86; +} +{ +mul.f16x2 r1858, r1855, r2184; +} +{ +add.f16x2 r1861, %93, r1858; +} +{ +add.f16x2 r1864, %96, %90; +} +{ +mul.f16x2 r1867, r1864, r2193; +} +{ +add.f16x2 r1870, r1861, r1867; +} +{ +add.f16x2 r1873, %85, %95; +} +{ +mul.f16x2 r1876, r1873, r2202; +} +{ +add.f16x2 r1879, r1870, r1876; +} +{ +sub.f16x2 r1882, %88, %84; +} +{ +mul.f16x2 r1885, r1882, r2211; +} +{ +sub.f16x2 r1888, %94, %87; +} +{ +mul.f16x2 r1891, r1888, r2217; +} +{ +add.f16x2 r1894, r1885, r1891; +} +{ +sub.f16x2 r1897, %97, %92; +} +{ +mul.f16x2 r1900, r1897, r2226; +} +{ +add.f16x2 r1903, r1894, r1900; +} +{ +add.f16x2 r1906, r1879, r1903; +} +{ +add.f16x2 r1909, %88, %84; +} +{ +mul.f16x2 r1912, r1909, r1968; +} +{ +add.f16x2 r1915, %89, r1912; +} +{ +add.f16x2 r1918, %94, %87; +} +{ +mul.f16x2 r1921, r1918, r2202; +} +{ +add.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, %97, %92; +} +{ +mul.f16x2 r1930, r1927, r2184; +} +{ +add.f16x2 r1933, r1924, r1930; +} +{ +sub.f16x2 r1936, %91, %86; +} +{ +mul.f16x2 r1939, r1936, r1995; +} +{ +sub.f16x2 r1942, %96, %90; +} +{ +mul.f16x2 r1945, r1942, r2226; +} +{ +add.f16x2 r1948, r1939, r1945; +} +{ +sub.f16x2 r1951, %85, %95; +} +{ +mul.f16x2 r1954, r1951, r2211; +} +{ +add.f16x2 r1957, r1948, r1954; +} +{ +add.f16x2 r1960, r1933, r1957; +} +{ +add.f16x2 r1963, %88, %84; +} +{ +mul.f16x2 r1966, r1963, r1968; +} +{ +add.f16x2 r1969, %89, r1966; +} +{ +add.f16x2 r1972, %94, %87; +} +{ +mul.f16x2 r1975, r1972, 
r2202; +} +{ +add.f16x2 r1978, r1969, r1975; +} +{ +add.f16x2 r1981, %97, %92; +} +{ +mul.f16x2 r1984, r1981, r2184; +} +{ +add.f16x2 r1987, r1978, r1984; +} +{ +sub.f16x2 r1990, %91, %86; +} +{ +mul.f16x2 r1993, r1990, r1995; +} +{ +sub.f16x2 r1996, %96, %90; +} +{ +mul.f16x2 r1999, r1996, r2226; +} +{ +add.f16x2 r2002, r1993, r1999; +} +{ +sub.f16x2 r2005, %85, %95; +} +{ +mul.f16x2 r2008, r2005, r2211; +} +{ +add.f16x2 r2011, r2002, r2008; +} +{ +sub.f16x2 r2014, r1987, r2011; +} +{ +add.f16x2 r2017, %88, %84; +} +{ +mul.f16x2 r2020, r2017, r2202; +} +{ +add.f16x2 r2023, %89, r2020; +} +{ +add.f16x2 r2026, %94, %87; +} +{ +mul.f16x2 r2029, r2026, r2085; +} +{ +add.f16x2 r2032, r2023, r2029; +} +{ +add.f16x2 r2035, %97, %92; +} +{ +mul.f16x2 r2038, r2035, r2193; +} +{ +add.f16x2 r2041, r2032, r2038; +} +{ +sub.f16x2 r2044, %91, %86; +} +{ +mul.f16x2 r2047, r2044, r2226; +} +{ +sub.f16x2 r2050, %96, %90; +} +{ +mul.f16x2 r2053, r2050, r2109; +} +{ +add.f16x2 r2056, r2047, r2053; +} +{ +sub.f16x2 r2059, %85, %95; +} +{ +mul.f16x2 r2062, r2059, r2217; +} +{ +add.f16x2 r2065, r2056, r2062; +} +{ +add.f16x2 r2068, r2041, r2065; +} +{ +add.f16x2 r2071, %88, %84; +} +{ +mul.f16x2 r2074, r2071, r2202; +} +{ +add.f16x2 r2077, %89, r2074; +} +{ +add.f16x2 r2080, %94, %87; +} +{ +mul.f16x2 r2083, r2080, r2085; +} +{ +add.f16x2 r2086, r2077, r2083; +} +{ +add.f16x2 r2089, %97, %92; +} +{ +mul.f16x2 r2092, r2089, r2193; +} +{ +add.f16x2 r2095, r2086, r2092; +} +{ +sub.f16x2 r2098, %91, %86; +} +{ +mul.f16x2 r2101, r2098, r2226; +} +{ +sub.f16x2 r2104, %96, %90; +} +{ +mul.f16x2 r2107, r2104, r2109; +} +{ +add.f16x2 r2110, r2101, r2107; +} +{ +sub.f16x2 r2113, %85, %95; +} +{ +mul.f16x2 r2116, r2113, r2217; +} +{ +add.f16x2 r2119, r2110, r2116; +} +{ +sub.f16x2 r2122, r2095, r2119; +} +{ +add.f16x2 r2125, %88, %84; +} +{ +mul.f16x2 r2128, r2125, r2184; +} +{ +add.f16x2 r2131, %89, r2128; +} +{ +add.f16x2 r2134, %94, %87; +} +{ +mul.f16x2 r2137, r2134, r2193; +} +{ +add.f16x2 
r2140, r2131, r2137; +} +{ +add.f16x2 r2143, %97, %92; +} +{ +mul.f16x2 r2146, r2143, r2202; +} +{ +add.f16x2 r2149, r2140, r2146; +} +{ +sub.f16x2 r2152, %91, %86; +} +{ +mul.f16x2 r2155, r2152, r2211; +} +{ +sub.f16x2 r2158, %96, %90; +} +{ +mul.f16x2 r2161, r2158, r2217; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +sub.f16x2 r2167, %85, %95; +} +{ +mul.f16x2 r2170, r2167, r2226; +} +{ +add.f16x2 r2173, r2164, r2170; +} +{ +add.f16x2 r2176, r2149, r2173; +} +{ +add.f16x2 r2179, %88, %84; +} +{ +mul.f16x2 r2182, r2179, r2184; +} +{ +add.f16x2 r2185, %89, r2182; +} +{ +add.f16x2 r2188, %94, %87; +} +{ +mul.f16x2 r2191, r2188, r2193; +} +{ +add.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 r2197, %97, %92; +} +{ +mul.f16x2 r2200, r2197, r2202; +} +{ +add.f16x2 r2203, r2194, r2200; +} +{ +sub.f16x2 r2206, %91, %86; +} +{ +mul.f16x2 r2209, r2206, r2211; +} +{ +sub.f16x2 r2212, %96, %90; +} +{ +mul.f16x2 r2215, r2212, r2217; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +sub.f16x2 r2221, %85, %95; +} +{ +mul.f16x2 r2224, r2221, r2226; +} +{ +add.f16x2 r2227, r2218, r2224; +} +{ +sub.f16x2 r2230, r2203, r2227; +} +{ +cvt.rn.f16.f64 rs75, fd114; +} +mov.b32 r2652, {rs75, rs75}; +{ +cvt.rn.f16.f64 rs76, fd113; +} +{ +neg.f16 rs77, rs76; +} +mov.b32 r2679, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs79, fd107; +} +mov.b32 r2886, {rs79, rs79}; +{ +cvt.rn.f16.f64 rs80, fd117; +} +{ +neg.f16 rs81, rs80; +} +mov.b32 r2910, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs83, fd115; +} +mov.b32 r2868, {rs83, rs83}; +{ +cvt.rn.f16.f64 rs84, fd109; +} +{ +neg.f16 rs85, rs84; +} +mov.b32 r2895, {rs85, rs85}; +{ +cvt.rn.f16.f64 rs87, fd115; +} +mov.b32 r2769, {rs87, rs87}; +{ +cvt.rn.f16.f64 rs88, fd109; +} +mov.b32 r2793, {rs88, rs88}; +{ +cvt.rn.f16.f64 rs89, fd114; +} +mov.b32 r2877, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd113; +} +mov.b32 r2901, {rs90, rs90}; +{ +add.f16x2 r2233, %103, %98; +} +{ +add.f16x2 r2236, %106, r2233; +} +{ +add.f16x2 r2239, %108, %101; +} +{ +add.f16x2 r2242, r2236, r2239; +} 
+{ +add.f16x2 r2245, %111, %107; +} +{ +add.f16x2 r2248, r2242, r2245; +} +{ +add.f16x2 r2251, %100, %110; +} +{ +add.f16x2 r2254, %102, r2251; +} +{ +add.f16x2 r2257, %105, %99; +} +{ +add.f16x2 r2260, r2254, r2257; +} +{ +add.f16x2 r2263, %109, %104; +} +{ +add.f16x2 r2266, r2260, r2263; +} +{ +add.f16x2 r2269, %103, %98; +} +{ +mul.f16x2 r2272, r2269, r2652; +} +{ +add.f16x2 r2275, %106, r2272; +} +{ +add.f16x2 r2278, %108, %101; +} +{ +mul.f16x2 r2281, r2278, r2886; +} +{ +add.f16x2 r2284, r2275, r2281; +} +{ +add.f16x2 r2287, %111, %107; +} +{ +mul.f16x2 r2290, r2287, r2868; +} +{ +add.f16x2 r2293, r2284, r2290; +} +{ +sub.f16x2 r2296, %100, %110; +} +{ +mul.f16x2 r2299, r2296, r2679; +} +{ +sub.f16x2 r2302, %105, %99; +} +{ +mul.f16x2 r2305, r2302, r2910; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, %109, %104; +} +{ +mul.f16x2 r2314, r2311, r2895; +} +{ +add.f16x2 r2317, r2308, r2314; +} +{ +sub.f16x2 r2320, r2293, r2317; +} +{ +add.f16x2 r2323, %103, %98; +} +{ +mul.f16x2 r2326, r2323, r2652; +} +{ +add.f16x2 r2329, %106, r2326; +} +{ +add.f16x2 r2332, %108, %101; +} +{ +mul.f16x2 r2335, r2332, r2886; +} +{ +add.f16x2 r2338, r2329, r2335; +} +{ +add.f16x2 r2341, %111, %107; +} +{ +mul.f16x2 r2344, r2341, r2868; +} +{ +add.f16x2 r2347, r2338, r2344; +} +{ +sub.f16x2 r2350, %100, %110; +} +{ +mul.f16x2 r2353, r2350, r2679; +} +{ +sub.f16x2 r2356, %105, %99; +} +{ +mul.f16x2 r2359, r2356, r2910; +} +{ +add.f16x2 r2362, r2353, r2359; +} +{ +sub.f16x2 r2365, %109, %104; +} +{ +mul.f16x2 r2368, r2365, r2895; +} +{ +add.f16x2 r2371, r2362, r2368; +} +{ +add.f16x2 r2374, r2347, r2371; +} +{ +add.f16x2 r2377, %103, %98; +} +{ +mul.f16x2 r2380, r2377, r2886; +} +{ +add.f16x2 r2383, %106, r2380; +} +{ +add.f16x2 r2386, %108, %101; +} +{ +mul.f16x2 r2389, r2386, r2769; +} +{ +add.f16x2 r2392, r2383, r2389; +} +{ +add.f16x2 r2395, %111, %107; +} +{ +mul.f16x2 r2398, r2395, r2877; +} +{ +add.f16x2 r2401, r2392, r2398; +} +{ +sub.f16x2 r2404, %100, %110; 
+} +{ +mul.f16x2 r2407, r2404, r2910; +} +{ +sub.f16x2 r2410, %105, %99; +} +{ +mul.f16x2 r2413, r2410, r2793; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, %109, %104; +} +{ +mul.f16x2 r2422, r2419, r2901; +} +{ +add.f16x2 r2425, r2416, r2422; +} +{ +sub.f16x2 r2428, r2401, r2425; +} +{ +add.f16x2 r2431, %103, %98; +} +{ +mul.f16x2 r2434, r2431, r2886; +} +{ +add.f16x2 r2437, %106, r2434; +} +{ +add.f16x2 r2440, %108, %101; +} +{ +mul.f16x2 r2443, r2440, r2769; +} +{ +add.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, %111, %107; +} +{ +mul.f16x2 r2452, r2449, r2877; +} +{ +add.f16x2 r2455, r2446, r2452; +} +{ +sub.f16x2 r2458, %100, %110; +} +{ +mul.f16x2 r2461, r2458, r2910; +} +{ +sub.f16x2 r2464, %105, %99; +} +{ +mul.f16x2 r2467, r2464, r2793; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +sub.f16x2 r2473, %109, %104; +} +{ +mul.f16x2 r2476, r2473, r2901; +} +{ +add.f16x2 r2479, r2470, r2476; +} +{ +add.f16x2 r2482, r2455, r2479; +} +{ +add.f16x2 r2485, %103, %98; +} +{ +mul.f16x2 r2488, r2485, r2868; +} +{ +add.f16x2 r2491, %106, r2488; +} +{ +add.f16x2 r2494, %108, %101; +} +{ +mul.f16x2 r2497, r2494, r2877; +} +{ +add.f16x2 r2500, r2491, r2497; +} +{ +add.f16x2 r2503, %111, %107; +} +{ +mul.f16x2 r2506, r2503, r2886; +} +{ +add.f16x2 r2509, r2500, r2506; +} +{ +sub.f16x2 r2512, %100, %110; +} +{ +mul.f16x2 r2515, r2512, r2895; +} +{ +sub.f16x2 r2518, %105, %99; +} +{ +mul.f16x2 r2521, r2518, r2901; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, %109, %104; +} +{ +mul.f16x2 r2530, r2527, r2910; +} +{ +add.f16x2 r2533, r2524, r2530; +} +{ +sub.f16x2 r2536, r2509, r2533; +} +{ +add.f16x2 r2539, %103, %98; +} +{ +mul.f16x2 r2542, r2539, r2868; +} +{ +add.f16x2 r2545, %106, r2542; +} +{ +add.f16x2 r2548, %108, %101; +} +{ +mul.f16x2 r2551, r2548, r2877; +} +{ +add.f16x2 r2554, r2545, r2551; +} +{ +add.f16x2 r2557, %111, %107; +} +{ +mul.f16x2 r2560, r2557, r2886; +} +{ +add.f16x2 r2563, r2554, r2560; +} +{ +sub.f16x2 r2566, 
%100, %110; +} +{ +mul.f16x2 r2569, r2566, r2895; +} +{ +sub.f16x2 r2572, %105, %99; +} +{ +mul.f16x2 r2575, r2572, r2901; +} +{ +add.f16x2 r2578, r2569, r2575; +} +{ +sub.f16x2 r2581, %109, %104; +} +{ +mul.f16x2 r2584, r2581, r2910; +} +{ +add.f16x2 r2587, r2578, r2584; +} +{ +add.f16x2 r2590, r2563, r2587; +} +{ +add.f16x2 r2593, %100, %110; +} +{ +mul.f16x2 r2596, r2593, r2652; +} +{ +add.f16x2 r2599, %102, r2596; +} +{ +add.f16x2 r2602, %105, %99; +} +{ +mul.f16x2 r2605, r2602, r2886; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +add.f16x2 r2611, %109, %104; +} +{ +mul.f16x2 r2614, r2611, r2868; +} +{ +add.f16x2 r2617, r2608, r2614; +} +{ +sub.f16x2 r2620, %103, %98; +} +{ +mul.f16x2 r2623, r2620, r2679; +} +{ +sub.f16x2 r2626, %108, %101; +} +{ +mul.f16x2 r2629, r2626, r2910; +} +{ +add.f16x2 r2632, r2623, r2629; +} +{ +sub.f16x2 r2635, %111, %107; +} +{ +mul.f16x2 r2638, r2635, r2895; +} +{ +add.f16x2 r2641, r2632, r2638; +} +{ +add.f16x2 r2644, r2617, r2641; +} +{ +add.f16x2 r2647, %100, %110; +} +{ +mul.f16x2 r2650, r2647, r2652; +} +{ +add.f16x2 r2653, %102, r2650; +} +{ +add.f16x2 r2656, %105, %99; +} +{ +mul.f16x2 r2659, r2656, r2886; +} +{ +add.f16x2 r2662, r2653, r2659; +} +{ +add.f16x2 r2665, %109, %104; +} +{ +mul.f16x2 r2668, r2665, r2868; +} +{ +add.f16x2 r2671, r2662, r2668; +} +{ +sub.f16x2 r2674, %103, %98; +} +{ +mul.f16x2 r2677, r2674, r2679; +} +{ +sub.f16x2 r2680, %108, %101; +} +{ +mul.f16x2 r2683, r2680, r2910; +} +{ +add.f16x2 r2686, r2677, r2683; +} +{ +sub.f16x2 r2689, %111, %107; +} +{ +mul.f16x2 r2692, r2689, r2895; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2671, r2695; +} +{ +add.f16x2 r2701, %100, %110; +} +{ +mul.f16x2 r2704, r2701, r2886; +} +{ +add.f16x2 r2707, %102, r2704; +} +{ +add.f16x2 r2710, %105, %99; +} +{ +mul.f16x2 r2713, r2710, r2769; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +add.f16x2 r2719, %109, %104; +} +{ +mul.f16x2 r2722, r2719, r2877; +} +{ +add.f16x2 r2725, r2716, r2722; +} +{ +sub.f16x2 
r2728, %103, %98; +} +{ +mul.f16x2 r2731, r2728, r2910; +} +{ +sub.f16x2 r2734, %108, %101; +} +{ +mul.f16x2 r2737, r2734, r2793; +} +{ +add.f16x2 r2740, r2731, r2737; +} +{ +sub.f16x2 r2743, %111, %107; +} +{ +mul.f16x2 r2746, r2743, r2901; +} +{ +add.f16x2 r2749, r2740, r2746; +} +{ +add.f16x2 r2752, r2725, r2749; +} +{ +add.f16x2 r2755, %100, %110; +} +{ +mul.f16x2 r2758, r2755, r2886; +} +{ +add.f16x2 r2761, %102, r2758; +} +{ +add.f16x2 r2764, %105, %99; +} +{ +mul.f16x2 r2767, r2764, r2769; +} +{ +add.f16x2 r2770, r2761, r2767; +} +{ +add.f16x2 r2773, %109, %104; +} +{ +mul.f16x2 r2776, r2773, r2877; +} +{ +add.f16x2 r2779, r2770, r2776; +} +{ +sub.f16x2 r2782, %103, %98; +} +{ +mul.f16x2 r2785, r2782, r2910; +} +{ +sub.f16x2 r2788, %108, %101; +} +{ +mul.f16x2 r2791, r2788, r2793; +} +{ +add.f16x2 r2794, r2785, r2791; +} +{ +sub.f16x2 r2797, %111, %107; +} +{ +mul.f16x2 r2800, r2797, r2901; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2779, r2803; +} +{ +add.f16x2 r2809, %100, %110; +} +{ +mul.f16x2 r2812, r2809, r2868; +} +{ +add.f16x2 r2815, %102, r2812; +} +{ +add.f16x2 r2818, %105, %99; +} +{ +mul.f16x2 r2821, r2818, r2877; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +add.f16x2 r2827, %109, %104; +} +{ +mul.f16x2 r2830, r2827, r2886; +} +{ +add.f16x2 r2833, r2824, r2830; +} +{ +sub.f16x2 r2836, %103, %98; +} +{ +mul.f16x2 r2839, r2836, r2895; +} +{ +sub.f16x2 r2842, %108, %101; +} +{ +mul.f16x2 r2845, r2842, r2901; +} +{ +add.f16x2 r2848, r2839, r2845; +} +{ +sub.f16x2 r2851, %111, %107; +} +{ +mul.f16x2 r2854, r2851, r2910; +} +{ +add.f16x2 r2857, r2848, r2854; +} +{ +add.f16x2 r2860, r2833, r2857; +} +{ +add.f16x2 r2863, %100, %110; +} +{ +mul.f16x2 r2866, r2863, r2868; +} +{ +add.f16x2 r2869, %102, r2866; +} +{ +add.f16x2 r2872, %105, %99; +} +{ +mul.f16x2 r2875, r2872, r2877; +} +{ +add.f16x2 r2878, r2869, r2875; +} +{ +add.f16x2 r2881, %109, %104; +} +{ +mul.f16x2 r2884, r2881, r2886; +} +{ +add.f16x2 r2887, r2878, r2884; +} +{ 
+sub.f16x2 r2890, %103, %98; +} +{ +mul.f16x2 r2893, r2890, r2895; +} +{ +sub.f16x2 r2896, %108, %101; +} +{ +mul.f16x2 r2899, r2896, r2901; +} +{ +add.f16x2 r2902, r2893, r2899; +} +{ +sub.f16x2 r2905, %111, %107; +} +{ +mul.f16x2 r2908, r2905, r2910; +} +{ +add.f16x2 r2911, r2902, r2908; +} +{ +sub.f16x2 r2914, r2887, r2911; +} +{ +cvt.rn.f16.f64 rs91, fd110; +} +{ +cvt.rn.f16.f64 rs92, fd116; +} +{ +cvt.rn.f16.f64 rs93, fd114; +} +{ +cvt.rn.f16.f64 rs94, fd112; +} +{ +cvt.rn.f16.f64 rs95, fd118; +} +{ +cvt.rn.f16.f64 rs96, fd108; +} +{ +cvt.rn.f16.f64 rs97, fd107; +} +{ +cvt.rn.f16.f64 rs98, fd108; +} +{ +cvt.rn.f16.f64 rs99, fd111; +} +{ +cvt.rn.f16.f64 rs100, fd112; +} +{ +cvt.rn.f16.f64 rs101, fd115; +} +{ +cvt.rn.f16.f64 rs102, fd116; +} +mov.b32 r2931, {rs91, rs91}; +{ +mul.f16x2 r2917, r2320, r2931; +} +mov.b32 r2928, {rs92, rs92}; +{ +mul.f16x2 r2920, r2644, r2928; +} +{ +sub.f16x2 r2923, r2917, r2920; +} +{ +mul.f16x2 r2926, r2320, r2928; +} +{ +fma.rn.f16x2 r2929, r2644, r2931, r2926; +} +mov.b32 r2947, {rs93, rs93}; +{ +mul.f16x2 r2933, r2428, r2947; +} +mov.b32 r2944, {rs94, rs94}; +{ +mul.f16x2 r2936, r2752, r2944; +} +{ +sub.f16x2 r2939, r2933, r2936; +} +{ +mul.f16x2 r2942, r2428, r2944; +} +{ +fma.rn.f16x2 r2945, r2752, r2947, r2942; +} +mov.b32 r2963, {rs95, rs95}; +{ +mul.f16x2 r2949, r2536, r2963; +} +mov.b32 r2960, {rs96, rs96}; +{ +mul.f16x2 r2952, r2860, r2960; +} +{ +sub.f16x2 r2955, r2949, r2952; +} +{ +mul.f16x2 r2958, r2536, r2960; +} +{ +fma.rn.f16x2 r2961, r2860, r2963, r2958; +} +mov.b32 r2979, {rs97, rs97}; +{ +mul.f16x2 r2965, r2590, r2979; +} +mov.b32 r2976, {rs98, rs98}; +{ +mul.f16x2 r2968, r2914, r2976; +} +{ +sub.f16x2 r2971, r2965, r2968; +} +{ +mul.f16x2 r2974, r2590, r2976; +} +{ +fma.rn.f16x2 r2977, r2914, r2979, r2974; +} +mov.b32 r2995, {rs99, rs99}; +{ +mul.f16x2 r2981, r2482, r2995; +} +mov.b32 r2992, {rs100, rs100}; +{ +mul.f16x2 r2984, r2806, r2992; +} +{ +sub.f16x2 r2987, r2981, r2984; +} +{ +mul.f16x2 r2990, r2482, 
r2992; +} +{ +fma.rn.f16x2 r2993, r2806, r2995, r2990; +} +mov.b32 r3011, {rs101, rs101}; +{ +mul.f16x2 r2997, r2374, r3011; +} +mov.b32 r3008, {rs102, rs102}; +{ +mul.f16x2 r3000, r2698, r3008; +} +{ +sub.f16x2 r3003, r2997, r3000; +} +{ +mul.f16x2 r3006, r2374, r3008; +} +{ +fma.rn.f16x2 r3009, r2698, r3011, r3006; +} +{ +add.f16x2 r3013, r1564, r2248; +} +{ +add.f16x2 r3016, r1582, r2266; +} +{ +sub.f16x2 r3019, r1564, r2248; +} +{ +sub.f16x2 r3022, r1582, r2266; +} +{ +add.f16x2 r3025, r1636, r2923; +} +{ +add.f16x2 r3028, r1960, r2929; +} +{ +sub.f16x2 r3031, r1636, r2923; +} +{ +sub.f16x2 r3034, r1960, r2929; +} +{ +add.f16x2 r3037, r1744, r2939; +} +{ +add.f16x2 r3040, r2068, r2945; +} +{ +sub.f16x2 r3043, r1744, r2939; +} +{ +sub.f16x2 r3046, r2068, r2945; +} +{ +add.f16x2 r3049, r1852, r2955; +} +{ +add.f16x2 r3052, r2176, r2961; +} +{ +sub.f16x2 r3055, r1852, r2955; +} +{ +sub.f16x2 r3058, r2176, r2961; +} +{ +add.f16x2 r3061, r1906, r2971; +} +{ +add.f16x2 r3064, r2230, r2977; +} +{ +sub.f16x2 r3067, r1906, r2971; +} +{ +sub.f16x2 r3070, r2230, r2977; +} +{ +add.f16x2 r3073, r1798, r2987; +} +{ +add.f16x2 r3076, r2122, r2993; +} +{ +sub.f16x2 r3079, r1798, r2987; +} +{ +sub.f16x2 r3082, r2122, r2993; +} +{ +add.f16x2 r3085, r1690, r3003; +} +{ +add.f16x2 r3088, r2014, r3009; +} +{ +sub.f16x2 r3091, r1690, r3003; +} +{ +sub.f16x2 r3094, r2014, r3009; +} +{ +cvt.rn.f16.f64 rs117, fd108; +} +{ +cvt.rn.f16.f64 rs118, fd118; +} +{ +cvt.rn.f16.f64 rs119, fd110; +} +{ +cvt.rn.f16.f64 rs120, fd116; +} +{ +cvt.rn.f16.f64 rs121, fd112; +} +{ +cvt.rn.f16.f64 rs122, fd114; +} +{ +cvt.rn.f16.f64 rs123, fd114; +} +{ +cvt.rn.f16.f64 rs124, fd112; +} +{ +cvt.rn.f16.f64 rs125, fd116; +} +{ +cvt.rn.f16.f64 rs126, fd110; +} +{ +cvt.rn.f16.f64 rs127, fd118; +} +{ +cvt.rn.f16.f64 rs128, fd108; +} +{ +cvt.rn.f16.f64 rs131, fd107; +} +{ +cvt.rn.f16.f64 rs132, fd108; +} +{ +cvt.rn.f16.f64 rs133, fd109; +} +{ +cvt.rn.f16.f64 rs134, fd110; +} +{ +cvt.rn.f16.f64 rs135, fd111; +} 
+{ +cvt.rn.f16.f64 rs136, fd112; +} +{ +cvt.rn.f16.f64 rs137, fd113; +} +{ +cvt.rn.f16.f64 rs138, fd114; +} +{ +cvt.rn.f16.f64 rs139, fd115; +} +{ +cvt.rn.f16.f64 rs140, fd116; +} +{ +cvt.rn.f16.f64 rs141, fd117; +} +{ +cvt.rn.f16.f64 rs142, fd118; +} +mov.b32 r3111, {rs117, rs117}; +{ +mul.f16x2 r3097, r3025, r3111; +} +mov.b32 r3108, {rs118, rs118}; +{ +mul.f16x2 r3100, r3028, r3108; +} +{ +sub.f16x2 r3103, r3097, r3100; +} +{ +mul.f16x2 r3106, r3025, r3108; +} +{ +fma.rn.f16x2 r3109, r3028, r3111, r3106; +} +mov.b32 r3127, {rs119, rs119}; +{ +mul.f16x2 r3113, r3037, r3127; +} +mov.b32 r3124, {rs120, rs120}; +{ +mul.f16x2 r3116, r3040, r3124; +} +{ +sub.f16x2 r3119, r3113, r3116; +} +{ +mul.f16x2 r3122, r3037, r3124; +} +{ +fma.rn.f16x2 r3125, r3040, r3127, r3122; +} +mov.b32 r3143, {rs121, rs121}; +{ +mul.f16x2 r3129, r3049, r3143; +} +mov.b32 r3140, {rs122, rs122}; +{ +mul.f16x2 r3132, r3052, r3140; +} +{ +sub.f16x2 r3135, r3129, r3132; +} +{ +mul.f16x2 r3138, r3049, r3140; +} +{ +fma.rn.f16x2 r3141, r3052, r3143, r3138; +} +mov.b32 r3159, {rs123, rs123}; +{ +mul.f16x2 r3145, r3061, r3159; +} +mov.b32 r3156, {rs124, rs124}; +{ +mul.f16x2 r3148, r3064, r3156; +} +{ +sub.f16x2 r3151, r3145, r3148; +} +{ +mul.f16x2 r3154, r3061, r3156; +} +{ +fma.rn.f16x2 r3157, r3064, r3159, r3154; +} +mov.b32 r3175, {rs125, rs125}; +{ +mul.f16x2 r3161, r3073, r3175; +} +mov.b32 r3172, {rs126, rs126}; +{ +mul.f16x2 r3164, r3076, r3172; +} +{ +sub.f16x2 r3167, r3161, r3164; +} +{ +mul.f16x2 r3170, r3073, r3172; +} +{ +fma.rn.f16x2 r3173, r3076, r3175, r3170; +} +mov.b32 r3191, {rs127, rs127}; +{ +mul.f16x2 r3177, r3085, r3191; +} +mov.b32 r3188, {rs128, rs128}; +{ +mul.f16x2 r3180, r3088, r3188; +} +{ +sub.f16x2 r3183, r3177, r3180; +} +{ +mul.f16x2 r3186, r3085, r3188; +} +{ +fma.rn.f16x2 r3189, r3088, r3191, r3186; +} +{ +neg.f16x2 r3193, r3022; +} +mov.b32 r3209, {rs131, rs131}; +{ +mul.f16x2 r3195, r3031, r3209; +} +mov.b32 r3206, {rs132, rs132}; +{ +mul.f16x2 r3198, r3034, 
r3206; +} +{ +sub.f16x2 r3201, r3195, r3198; +} +{ +mul.f16x2 r3204, r3031, r3206; +} +{ +fma.rn.f16x2 r3207, r3034, r3209, r3204; +} +mov.b32 r3225, {rs133, rs133}; +{ +mul.f16x2 r3211, r3043, r3225; +} +mov.b32 r3222, {rs134, rs134}; +{ +mul.f16x2 r3214, r3046, r3222; +} +{ +sub.f16x2 r3217, r3211, r3214; +} +{ +mul.f16x2 r3220, r3043, r3222; +} +{ +fma.rn.f16x2 r3223, r3046, r3225, r3220; +} +mov.b32 r3241, {rs135, rs135}; +{ +mul.f16x2 r3227, r3055, r3241; +} +mov.b32 r3238, {rs136, rs136}; +{ +mul.f16x2 r3230, r3058, r3238; +} +{ +sub.f16x2 r3233, r3227, r3230; +} +{ +mul.f16x2 r3236, r3055, r3238; +} +{ +fma.rn.f16x2 r3239, r3058, r3241, r3236; +} +mov.b32 r3257, {rs137, rs137}; +{ +mul.f16x2 r3243, r3067, r3257; +} +mov.b32 r3254, {rs138, rs138}; +{ +mul.f16x2 r3246, r3070, r3254; +} +{ +sub.f16x2 r3249, r3243, r3246; +} +{ +mul.f16x2 r3252, r3067, r3254; +} +{ +fma.rn.f16x2 r3255, r3070, r3257, r3252; +} +mov.b32 r3273, {rs139, rs139}; +{ +mul.f16x2 r3259, r3079, r3273; +} +mov.b32 r3270, {rs140, rs140}; +{ +mul.f16x2 r3262, r3082, r3270; +} +{ +sub.f16x2 r3265, r3259, r3262; +} +{ +mul.f16x2 r3268, r3079, r3270; +} +{ +fma.rn.f16x2 r3271, r3082, r3273, r3268; +} +mov.b32 r3289, {rs141, rs141}; +{ +mul.f16x2 r3275, r3091, r3289; +} +mov.b32 r3286, {rs142, rs142}; +{ +mul.f16x2 r3278, r3094, r3286; +} +{ +sub.f16x2 r3281, r3275, r3278; +} +{ +mul.f16x2 r3284, r3091, r3286; +} +{ +fma.rn.f16x2 r3287, r3094, r3289, r3284; +} +{ +add.f16x2 %0, r1465, r3013; +} +{ +add.f16x2 %1, r1468, r3016; +} +{ +sub.f16x2 %28, r1465, r3013; +} +{ +sub.f16x2 %29, r1468, r3016; +} +{ +add.f16x2 %2, r1477, r3103; +} +{ +add.f16x2 %3, r1480, r3109; +} +{ +sub.f16x2 %30, r1477, r3103; +} +{ +sub.f16x2 %31, r1480, r3109; +} +{ +add.f16x2 %4, r1489, r3119; +} +{ +add.f16x2 %5, r1492, r3125; +} +{ +sub.f16x2 %32, r1489, r3119; +} +{ +sub.f16x2 %33, r1492, r3125; +} +{ +add.f16x2 %6, r1501, r3135; +} +{ +add.f16x2 %7, r1504, r3141; +} +{ +sub.f16x2 %34, r1501, r3135; +} +{ +sub.f16x2 
%35, r1504, r3141; +} +{ +add.f16x2 %8, r1513, r3151; +} +{ +add.f16x2 %9, r1516, r3157; +} +{ +sub.f16x2 %36, r1513, r3151; +} +{ +sub.f16x2 %37, r1516, r3157; +} +{ +add.f16x2 %10, r1525, r3167; +} +{ +add.f16x2 %11, r1528, r3173; +} +{ +sub.f16x2 %38, r1525, r3167; +} +{ +sub.f16x2 %39, r1528, r3173; +} +{ +add.f16x2 %12, r1537, r3183; +} +{ +add.f16x2 %13, r1540, r3189; +} +{ +sub.f16x2 %40, r1537, r3183; +} +{ +sub.f16x2 %41, r1540, r3189; +} +{ +add.f16x2 %14, r1471, r3193; +} +{ +add.f16x2 %15, r1474, r3019; +} +{ +sub.f16x2 %42, r1471, r3193; +} +{ +sub.f16x2 %43, r1474, r3019; +} +{ +add.f16x2 %16, r1483, r3201; +} +{ +add.f16x2 %17, r1486, r3207; +} +{ +sub.f16x2 %44, r1483, r3201; +} +{ +sub.f16x2 %45, r1486, r3207; +} +{ +add.f16x2 %18, r1495, r3217; +} +{ +add.f16x2 %19, r1498, r3223; +} +{ +sub.f16x2 %46, r1495, r3217; +} +{ +sub.f16x2 %47, r1498, r3223; +} +{ +add.f16x2 %20, r1507, r3233; +} +{ +add.f16x2 %21, r1510, r3239; +} +{ +sub.f16x2 %48, r1507, r3233; +} +{ +sub.f16x2 %49, r1510, r3239; +} +{ +add.f16x2 %22, r1519, r3249; +} +{ +add.f16x2 %23, r1522, r3255; +} +{ +sub.f16x2 %50, r1519, r3249; +} +{ +sub.f16x2 %51, r1522, r3255; +} +{ +add.f16x2 %24, r1531, r3265; +} +{ +add.f16x2 %25, r1534, r3271; +} +{ +sub.f16x2 %52, r1531, r3265; +} +{ +sub.f16x2 %53, r1534, r3271; +} +{ +add.f16x2 %26, r1543, r3281; +} +{ +add.f16x2 %27, r1546, r3287; +} +{ +sub.f16x2 %54, r1543, r3281; +} +{ +sub.f16x2 %55, r1546, r3287; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), 
"=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)): "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[14].x)), 
"r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[15].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..5d7010f44a827 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_fwd.hpp.inc @@ -0,0 +1,600 @@ +#ifndef CUFFTDX_FFT_28_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_28_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<12, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<697>; +.reg .b64 rd<2>; +add.f32 f113, %66, %120; +add.f32 f114, %56, f113; +add.f32 f115, %77, %109; +add.f32 f116, f115, f114; +add.f32 
f117, %88, %98; +add.f32 f118, f117, f116; +add.f32 f119, %68, %121; +add.f32 f120, %57, f119; +add.f32 f121, %79, %111; +add.f32 f122, f121, f120; +add.f32 f123, %89, %100; +add.f32 f124, f123, f122; +fma.rn.f32 f125, f113, 0f3F1F9D07, %56; +mul.f32 f126, f115, 0f3E63DC87; +sub.f32 f127, f125, f126; +mul.f32 f128, f117, 0f3F66A5E5; +sub.f32 f129, f127, f128; +sub.f32 f130, %68, %121; +mul.f32 f131, f130, 0f3F48261C; +sub.f32 f132, %79, %111; +mul.f32 f133, f132, 0fBF7994E0; +sub.f32 f134, f133, f131; +sub.f32 f135, %89, %100; +mul.f32 f136, f135, 0f3EDE2602; +sub.f32 f137, f134, f136; +sub.f32 f138, f129, f137; +add.f32 f139, f137, f129; +mul.f32 f140, f113, 0f3E63DC87; +sub.f32 f141, %56, f140; +mul.f32 f142, f115, 0f3F66A5E5; +sub.f32 f143, f141, f142; +fma.rn.f32 f144, f117, 0f3F1F9D07, f143; +mul.f32 f145, f130, 0f3F7994E0; +mul.f32 f146, f132, 0f3EDE2602; +sub.f32 f147, f146, f145; +fma.rn.f32 f148, f135, 0f3F48261C, f147; +sub.f32 f149, f144, f148; +add.f32 f150, f148, f144; +mul.f32 f151, f113, 0f3F66A5E5; +sub.f32 f152, %56, f151; +fma.rn.f32 f153, f115, 0f3F1F9D07, f152; +mul.f32 f154, f117, 0f3E63DC87; +sub.f32 f155, f153, f154; +mul.f32 f156, f130, 0f3EDE2602; +mul.f32 f157, f132, 0f3F48261C; +sub.f32 f158, f157, f156; +mul.f32 f159, f135, 0f3F7994E0; +sub.f32 f160, f158, f159; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +fma.rn.f32 f163, f119, 0f3F1F9D07, %57; +mul.f32 f164, f121, 0f3E63DC87; +sub.f32 f165, f163, f164; +mul.f32 f166, f123, 0f3F66A5E5; +sub.f32 f167, f165, f166; +sub.f32 f168, %66, %120; +mul.f32 f169, f168, 0f3F48261C; +sub.f32 f170, %77, %109; +mul.f32 f171, f170, 0fBF7994E0; +sub.f32 f172, f171, f169; +sub.f32 f173, %88, %98; +mul.f32 f174, f173, 0f3EDE2602; +sub.f32 f175, f172, f174; +add.f32 f176, f175, f167; +sub.f32 f177, f167, f175; +mul.f32 f178, f119, 0f3E63DC87; +sub.f32 f179, %57, f178; +mul.f32 f180, f121, 0f3F66A5E5; +sub.f32 f181, f179, f180; +fma.rn.f32 f182, f123, 0f3F1F9D07, f181; +mul.f32 f183, f168, 
0f3F7994E0; +mul.f32 f184, f170, 0f3EDE2602; +sub.f32 f185, f184, f183; +fma.rn.f32 f186, f173, 0f3F48261C, f185; +add.f32 f187, f186, f182; +sub.f32 f188, f182, f186; +mul.f32 f189, f119, 0f3F66A5E5; +sub.f32 f190, %57, f189; +fma.rn.f32 f191, f121, 0f3F1F9D07, f190; +mul.f32 f192, f123, 0f3E63DC87; +sub.f32 f193, f191, f192; +mul.f32 f194, f168, 0f3EDE2602; +mul.f32 f195, f170, 0f3F48261C; +sub.f32 f196, f195, f194; +mul.f32 f197, f173, 0f3F7994E0; +sub.f32 f198, f196, f197; +add.f32 f199, f198, f193; +sub.f32 f200, f193, f198; +add.f32 f201, %72, %125; +add.f32 f202, %61, f201; +add.f32 f203, %82, %114; +add.f32 f204, f203, f202; +add.f32 f205, %93, %104; +add.f32 f206, f205, f204; +add.f32 f207, %73, %127; +add.f32 f208, %63, f207; +add.f32 f209, %84, %116; +add.f32 f210, f209, f208; +add.f32 f211, %95, %105; +add.f32 f212, f211, f210; +fma.rn.f32 f213, f201, 0f3F1F9D07, %61; +mul.f32 f214, f203, 0f3E63DC87; +sub.f32 f215, f213, f214; +mul.f32 f216, f205, 0f3F66A5E5; +sub.f32 f217, f215, f216; +sub.f32 f218, %73, %127; +mul.f32 f219, f218, 0f3F48261C; +sub.f32 f220, %84, %116; +mul.f32 f221, f220, 0fBF7994E0; +sub.f32 f222, f221, f219; +sub.f32 f223, %95, %105; +mul.f32 f224, f223, 0f3EDE2602; +sub.f32 f225, f222, f224; +sub.f32 f226, f217, f225; +add.f32 f227, f225, f217; +mul.f32 f228, f201, 0f3E63DC87; +sub.f32 f229, %61, f228; +mul.f32 f230, f203, 0f3F66A5E5; +sub.f32 f231, f229, f230; +fma.rn.f32 f232, f205, 0f3F1F9D07, f231; +mul.f32 f233, f218, 0f3F7994E0; +mul.f32 f234, f220, 0f3EDE2602; +sub.f32 f235, f234, f233; +fma.rn.f32 f236, f223, 0f3F48261C, f235; +sub.f32 f237, f232, f236; +add.f32 f238, f236, f232; +mul.f32 f239, f201, 0f3F66A5E5; +sub.f32 f240, %61, f239; +fma.rn.f32 f241, f203, 0f3F1F9D07, f240; +mul.f32 f242, f205, 0f3E63DC87; +sub.f32 f243, f241, f242; +mul.f32 f244, f218, 0f3EDE2602; +mul.f32 f245, f220, 0f3F48261C; +sub.f32 f246, f245, f244; +mul.f32 f247, f223, 0f3F7994E0; +sub.f32 f248, f246, f247; +sub.f32 f249, f243, f248; +add.f32 
f250, f248, f243; +fma.rn.f32 f251, f207, 0f3F1F9D07, %63; +mul.f32 f252, f209, 0f3E63DC87; +sub.f32 f253, f251, f252; +mul.f32 f254, f211, 0f3F66A5E5; +sub.f32 f255, f253, f254; +sub.f32 f256, %72, %125; +mul.f32 f257, f256, 0f3F48261C; +sub.f32 f258, %82, %114; +mul.f32 f259, f258, 0fBF7994E0; +sub.f32 f260, f259, f257; +sub.f32 f261, %93, %104; +mul.f32 f262, f261, 0f3EDE2602; +sub.f32 f263, f260, f262; +add.f32 f264, f263, f255; +sub.f32 f265, f255, f263; +mul.f32 f266, f207, 0f3E63DC87; +sub.f32 f267, %63, f266; +mul.f32 f268, f209, 0f3F66A5E5; +sub.f32 f269, f267, f268; +fma.rn.f32 f270, f211, 0f3F1F9D07, f269; +mul.f32 f271, f256, 0f3F7994E0; +mul.f32 f272, f258, 0f3EDE2602; +sub.f32 f273, f272, f271; +fma.rn.f32 f274, f261, 0f3F48261C, f273; +add.f32 f275, f274, f270; +sub.f32 f276, f270, f274; +mul.f32 f277, f207, 0f3F66A5E5; +sub.f32 f278, %63, f277; +fma.rn.f32 f279, f209, 0f3F1F9D07, f278; +mul.f32 f280, f211, 0f3E63DC87; +sub.f32 f281, f279, f280; +mul.f32 f282, f256, 0f3EDE2602; +mul.f32 f283, f258, 0f3F48261C; +sub.f32 f284, f283, f282; +mul.f32 f285, f261, 0f3F7994E0; +sub.f32 f286, f284, f285; +add.f32 f287, f286, f281; +sub.f32 f288, f281, f286; +mul.f32 f289, f226, 0f3F66A5E5; +mul.f32 f290, f264, 0fBEDE2602; +sub.f32 f291, f289, f290; +mul.f32 f292, f264, 0f3F66A5E5; +fma.rn.f32 f293, f226, 0fBEDE2602, f292; +mul.f32 f294, f237, 0f3F1F9D07; +mul.f32 f295, f275, 0fBF48261C; +sub.f32 f296, f294, f295; +mul.f32 f297, f275, 0f3F1F9D07; +fma.rn.f32 f298, f237, 0fBF48261C, f297; +mul.f32 f299, f249, 0f3E63DC87; +mul.f32 f300, f287, 0fBF7994E0; +sub.f32 f301, f299, f300; +mul.f32 f302, f287, 0f3E63DC87; +fma.rn.f32 f303, f249, 0fBF7994E0, f302; +mul.f32 f304, f250, 0fBE63DC87; +mul.f32 f305, f288, 0fBF7994E0; +sub.f32 f306, f304, f305; +mul.f32 f307, f288, 0fBE63DC87; +fma.rn.f32 f308, f250, 0fBF7994E0, f307; +mul.f32 f309, f238, 0fBF1F9D07; +mul.f32 f310, f276, 0fBF48261C; +sub.f32 f311, f309, f310; +mul.f32 f312, f276, 0fBF1F9D07; +fma.rn.f32 f313, 
f238, 0fBF48261C, f312; +mul.f32 f314, f227, 0fBF66A5E5; +mul.f32 f315, f265, 0fBEDE2602; +sub.f32 f316, f314, f315; +mul.f32 f317, f265, 0fBF66A5E5; +fma.rn.f32 f318, f227, 0fBEDE2602, f317; +add.f32 f319, f118, f206; +add.f32 f320, f124, f212; +sub.f32 f321, f118, f206; +sub.f32 f322, f124, f212; +add.f32 f323, f138, f291; +add.f32 f324, f176, f293; +sub.f32 f325, f138, f291; +sub.f32 f326, f176, f293; +add.f32 f327, f149, f296; +add.f32 f328, f187, f298; +sub.f32 f329, f149, f296; +sub.f32 f330, f187, f298; +add.f32 f331, f161, f301; +add.f32 f332, f199, f303; +sub.f32 f333, f161, f301; +sub.f32 f334, f199, f303; +add.f32 f335, f162, f306; +add.f32 f336, f200, f308; +sub.f32 f337, f162, f306; +sub.f32 f338, f200, f308; +add.f32 f339, f150, f311; +add.f32 f340, f188, f313; +sub.f32 f341, f150, f311; +sub.f32 f342, f188, f313; +add.f32 f343, f139, f316; +add.f32 f344, f177, f318; +sub.f32 f345, f139, f316; +sub.f32 f346, f177, f318; +add.f32 f347, %69, %122; +add.f32 f348, %58, f347; +add.f32 f349, %80, %112; +add.f32 f350, f349, f348; +add.f32 f351, %90, %101; +add.f32 f352, f351, f350; +add.f32 f353, %71, %124; +add.f32 f354, %60, f353; +add.f32 f355, %81, %113; +add.f32 f356, f355, f354; +add.f32 f357, %92, %103; +add.f32 f358, f357, f356; +fma.rn.f32 f359, f347, 0f3F1F9D07, %58; +mul.f32 f360, f349, 0f3E63DC87; +sub.f32 f361, f359, f360; +mul.f32 f362, f351, 0f3F66A5E5; +sub.f32 f363, f361, f362; +sub.f32 f364, %71, %124; +mul.f32 f365, f364, 0f3F48261C; +sub.f32 f366, %81, %113; +mul.f32 f367, f366, 0fBF7994E0; +sub.f32 f368, f367, f365; +sub.f32 f369, %92, %103; +mul.f32 f370, f369, 0f3EDE2602; +sub.f32 f371, f368, f370; +sub.f32 f372, f363, f371; +add.f32 f373, f371, f363; +mul.f32 f374, f347, 0f3E63DC87; +sub.f32 f375, %58, f374; +mul.f32 f376, f349, 0f3F66A5E5; +sub.f32 f377, f375, f376; +fma.rn.f32 f378, f351, 0f3F1F9D07, f377; +mul.f32 f379, f364, 0f3F7994E0; +mul.f32 f380, f366, 0f3EDE2602; +sub.f32 f381, f380, f379; +fma.rn.f32 f382, f369, 0f3F48261C, 
f381; +sub.f32 f383, f378, f382; +add.f32 f384, f382, f378; +mul.f32 f385, f347, 0f3F66A5E5; +sub.f32 f386, %58, f385; +fma.rn.f32 f387, f349, 0f3F1F9D07, f386; +mul.f32 f388, f351, 0f3E63DC87; +sub.f32 f389, f387, f388; +mul.f32 f390, f364, 0f3EDE2602; +mul.f32 f391, f366, 0f3F48261C; +sub.f32 f392, f391, f390; +mul.f32 f393, f369, 0f3F7994E0; +sub.f32 f394, f392, f393; +sub.f32 f395, f389, f394; +add.f32 f396, f394, f389; +fma.rn.f32 f397, f353, 0f3F1F9D07, %60; +mul.f32 f398, f355, 0f3E63DC87; +sub.f32 f399, f397, f398; +mul.f32 f400, f357, 0f3F66A5E5; +sub.f32 f401, f399, f400; +sub.f32 f402, %69, %122; +mul.f32 f403, f402, 0f3F48261C; +sub.f32 f404, %80, %112; +mul.f32 f405, f404, 0fBF7994E0; +sub.f32 f406, f405, f403; +sub.f32 f407, %90, %101; +mul.f32 f408, f407, 0f3EDE2602; +sub.f32 f409, f406, f408; +add.f32 f410, f409, f401; +sub.f32 f411, f401, f409; +mul.f32 f412, f353, 0f3E63DC87; +sub.f32 f413, %60, f412; +mul.f32 f414, f355, 0f3F66A5E5; +sub.f32 f415, f413, f414; +fma.rn.f32 f416, f357, 0f3F1F9D07, f415; +mul.f32 f417, f402, 0f3F7994E0; +mul.f32 f418, f404, 0f3EDE2602; +sub.f32 f419, f418, f417; +fma.rn.f32 f420, f407, 0f3F48261C, f419; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +mul.f32 f423, f353, 0f3F66A5E5; +sub.f32 f424, %60, f423; +fma.rn.f32 f425, f355, 0f3F1F9D07, f424; +mul.f32 f426, f357, 0f3E63DC87; +sub.f32 f427, f425, f426; +mul.f32 f428, f402, 0f3EDE2602; +mul.f32 f429, f404, 0f3F48261C; +sub.f32 f430, f429, f428; +mul.f32 f431, f407, 0f3F7994E0; +sub.f32 f432, f430, f431; +add.f32 f433, f432, f427; +sub.f32 f434, f427, f432; +add.f32 f435, %74, %128; +add.f32 f436, %64, f435; +add.f32 f437, %85, %117; +add.f32 f438, f437, f436; +add.f32 f439, %96, %106; +add.f32 f440, f439, f438; +add.f32 f441, %76, %129; +add.f32 f442, %65, f441; +add.f32 f443, %87, %119; +add.f32 f444, f443, f442; +add.f32 f445, %97, %108; +add.f32 f446, f445, f444; +fma.rn.f32 f447, f435, 0f3F1F9D07, %64; +mul.f32 f448, f437, 0f3E63DC87; +sub.f32 f449, 
f447, f448; +mul.f32 f450, f439, 0f3F66A5E5; +sub.f32 f451, f449, f450; +sub.f32 f452, %76, %129; +mul.f32 f453, f452, 0f3F48261C; +sub.f32 f454, %87, %119; +mul.f32 f455, f454, 0fBF7994E0; +sub.f32 f456, f455, f453; +sub.f32 f457, %97, %108; +mul.f32 f458, f457, 0f3EDE2602; +sub.f32 f459, f456, f458; +sub.f32 f460, f451, f459; +add.f32 f461, f459, f451; +mul.f32 f462, f435, 0f3E63DC87; +sub.f32 f463, %64, f462; +mul.f32 f464, f437, 0f3F66A5E5; +sub.f32 f465, f463, f464; +fma.rn.f32 f466, f439, 0f3F1F9D07, f465; +mul.f32 f467, f452, 0f3F7994E0; +mul.f32 f468, f454, 0f3EDE2602; +sub.f32 f469, f468, f467; +fma.rn.f32 f470, f457, 0f3F48261C, f469; +sub.f32 f471, f466, f470; +add.f32 f472, f470, f466; +mul.f32 f473, f435, 0f3F66A5E5; +sub.f32 f474, %64, f473; +fma.rn.f32 f475, f437, 0f3F1F9D07, f474; +mul.f32 f476, f439, 0f3E63DC87; +sub.f32 f477, f475, f476; +mul.f32 f478, f452, 0f3EDE2602; +mul.f32 f479, f454, 0f3F48261C; +sub.f32 f480, f479, f478; +mul.f32 f481, f457, 0f3F7994E0; +sub.f32 f482, f480, f481; +sub.f32 f483, f477, f482; +add.f32 f484, f482, f477; +fma.rn.f32 f485, f441, 0f3F1F9D07, %65; +mul.f32 f486, f443, 0f3E63DC87; +sub.f32 f487, f485, f486; +mul.f32 f488, f445, 0f3F66A5E5; +sub.f32 f489, f487, f488; +sub.f32 f490, %74, %128; +mul.f32 f491, f490, 0f3F48261C; +sub.f32 f492, %85, %117; +mul.f32 f493, f492, 0fBF7994E0; +sub.f32 f494, f493, f491; +sub.f32 f495, %96, %106; +mul.f32 f496, f495, 0f3EDE2602; +sub.f32 f497, f494, f496; +add.f32 f498, f497, f489; +sub.f32 f499, f489, f497; +mul.f32 f500, f441, 0f3E63DC87; +sub.f32 f501, %65, f500; +mul.f32 f502, f443, 0f3F66A5E5; +sub.f32 f503, f501, f502; +fma.rn.f32 f504, f445, 0f3F1F9D07, f503; +mul.f32 f505, f490, 0f3F7994E0; +mul.f32 f506, f492, 0f3EDE2602; +sub.f32 f507, f506, f505; +fma.rn.f32 f508, f495, 0f3F48261C, f507; +add.f32 f509, f508, f504; +sub.f32 f510, f504, f508; +mul.f32 f511, f441, 0f3F66A5E5; +sub.f32 f512, %65, f511; +fma.rn.f32 f513, f443, 0f3F1F9D07, f512; +mul.f32 f514, f445, 
0f3E63DC87; +sub.f32 f515, f513, f514; +mul.f32 f516, f490, 0f3EDE2602; +mul.f32 f517, f492, 0f3F48261C; +sub.f32 f518, f517, f516; +mul.f32 f519, f495, 0f3F7994E0; +sub.f32 f520, f518, f519; +add.f32 f521, f520, f515; +sub.f32 f522, f515, f520; +mul.f32 f523, f460, 0f3F66A5E5; +mul.f32 f524, f498, 0fBEDE2602; +sub.f32 f525, f523, f524; +mul.f32 f526, f498, 0f3F66A5E5; +fma.rn.f32 f527, f460, 0fBEDE2602, f526; +mul.f32 f528, f471, 0f3F1F9D07; +mul.f32 f529, f509, 0fBF48261C; +sub.f32 f530, f528, f529; +mul.f32 f531, f509, 0f3F1F9D07; +fma.rn.f32 f532, f471, 0fBF48261C, f531; +mul.f32 f533, f483, 0f3E63DC87; +mul.f32 f534, f521, 0fBF7994E0; +sub.f32 f535, f533, f534; +mul.f32 f536, f521, 0f3E63DC87; +fma.rn.f32 f537, f483, 0fBF7994E0, f536; +mul.f32 f538, f484, 0fBE63DC87; +mul.f32 f539, f522, 0fBF7994E0; +sub.f32 f540, f538, f539; +mul.f32 f541, f522, 0fBE63DC87; +fma.rn.f32 f542, f484, 0fBF7994E0, f541; +mul.f32 f543, f472, 0fBF1F9D07; +mul.f32 f544, f510, 0fBF48261C; +sub.f32 f545, f543, f544; +mul.f32 f546, f510, 0fBF1F9D07; +fma.rn.f32 f547, f472, 0fBF48261C, f546; +mul.f32 f548, f461, 0fBF66A5E5; +mul.f32 f549, f499, 0fBEDE2602; +sub.f32 f550, f548, f549; +mul.f32 f551, f499, 0fBF66A5E5; +fma.rn.f32 f552, f461, 0fBEDE2602, f551; +add.f32 f553, f352, f440; +add.f32 f554, f358, f446; +sub.f32 f555, f352, f440; +sub.f32 f556, f358, f446; +add.f32 f557, f372, f525; +add.f32 f558, f410, f527; +sub.f32 f559, f372, f525; +sub.f32 f560, f410, f527; +add.f32 f561, f383, f530; +add.f32 f562, f421, f532; +sub.f32 f563, f383, f530; +sub.f32 f564, f421, f532; +add.f32 f565, f395, f535; +add.f32 f566, f433, f537; +sub.f32 f567, f395, f535; +sub.f32 f568, f433, f537; +add.f32 f569, f396, f540; +add.f32 f570, f434, f542; +sub.f32 f571, f396, f540; +sub.f32 f572, f434, f542; +add.f32 f573, f384, f545; +add.f32 f574, f422, f547; +sub.f32 f575, f384, f545; +sub.f32 f576, f422, f547; +add.f32 f577, f373, f550; +add.f32 f578, f411, f552; +sub.f32 f579, f373, f550; +sub.f32 f580, 
f411, f552; +mul.f32 f581, f557, 0f3F7994E0; +mul.f32 f582, f558, 0fBE63DC87; +sub.f32 f583, f581, f582; +mul.f32 f584, f558, 0f3F7994E0; +fma.rn.f32 f585, f557, 0fBE63DC87, f584; +mul.f32 f586, f561, 0f3F66A5E5; +mul.f32 f587, f562, 0fBEDE2602; +sub.f32 f588, f586, f587; +mul.f32 f589, f562, 0f3F66A5E5; +fma.rn.f32 f590, f561, 0fBEDE2602, f589; +mul.f32 f591, f565, 0f3F48261C; +mul.f32 f592, f566, 0fBF1F9D07; +sub.f32 f593, f591, f592; +mul.f32 f594, f566, 0f3F48261C; +fma.rn.f32 f595, f565, 0fBF1F9D07, f594; +mul.f32 f596, f569, 0f3F1F9D07; +mul.f32 f597, f570, 0fBF48261C; +sub.f32 f598, f596, f597; +mul.f32 f599, f570, 0f3F1F9D07; +fma.rn.f32 f600, f569, 0fBF48261C, f599; +mul.f32 f601, f573, 0f3EDE2602; +mul.f32 f602, f574, 0fBF66A5E5; +sub.f32 f603, f601, f602; +mul.f32 f604, f574, 0f3EDE2602; +fma.rn.f32 f605, f573, 0fBF66A5E5, f604; +mul.f32 f606, f577, 0f3E63DC87; +mul.f32 f607, f578, 0fBF7994E0; +sub.f32 f608, f606, f607; +mul.f32 f609, f578, 0f3E63DC87; +fma.rn.f32 f610, f577, 0fBF7994E0, f609; +mul.f32 f611, f559, 0fBE63DC87; +mul.f32 f612, f560, 0fBF7994E0; +sub.f32 f613, f611, f612; +mul.f32 f614, f560, 0fBE63DC87; +fma.rn.f32 f615, f559, 0fBF7994E0, f614; +mul.f32 f616, f563, 0fBEDE2602; +mul.f32 f617, f564, 0fBF66A5E5; +sub.f32 f618, f616, f617; +mul.f32 f619, f564, 0fBEDE2602; +fma.rn.f32 f620, f563, 0fBF66A5E5, f619; +mul.f32 f621, f567, 0fBF1F9D07; +mul.f32 f622, f568, 0fBF48261C; +sub.f32 f623, f621, f622; +mul.f32 f624, f568, 0fBF1F9D07; +fma.rn.f32 f625, f567, 0fBF48261C, f624; +mul.f32 f626, f571, 0fBF48261C; +mul.f32 f627, f572, 0fBF1F9D07; +sub.f32 f628, f626, f627; +mul.f32 f629, f572, 0fBF48261C; +fma.rn.f32 f630, f571, 0fBF1F9D07, f629; +mul.f32 f631, f575, 0fBF66A5E5; +mul.f32 f632, f576, 0fBEDE2602; +sub.f32 f633, f631, f632; +mul.f32 f634, f576, 0fBF66A5E5; +fma.rn.f32 f635, f575, 0fBEDE2602, f634; +mul.f32 f636, f579, 0fBF7994E0; +mul.f32 f637, f580, 0fBE63DC87; +sub.f32 f638, f636, f637; +mul.f32 f639, f580, 0fBF7994E0; +fma.rn.f32 
f640, f579, 0fBE63DC87, f639; +add.f32 %1, f320, f554; +add.f32 %0, f319, f553; +add.f32 %3, f324, f585; +add.f32 %2, f323, f583; +add.f32 %5, f328, f590; +add.f32 %4, f327, f588; +add.f32 %7, f332, f595; +add.f32 %6, f331, f593; +add.f32 %9, f336, f600; +add.f32 %8, f335, f598; +add.f32 %11, f340, f605; +add.f32 %10, f339, f603; +add.f32 %13, f344, f610; +add.f32 %12, f343, f608; +sub.f32 %15, f322, f555; +add.f32 %14, f321, f556; +add.f32 %17, f326, f615; +add.f32 %16, f325, f613; +add.f32 %19, f330, f620; +add.f32 %18, f329, f618; +add.f32 %21, f334, f625; +add.f32 %20, f333, f623; +add.f32 %23, f338, f630; +add.f32 %22, f337, f628; +add.f32 %25, f342, f635; +add.f32 %24, f341, f633; +add.f32 %27, f346, f640; +add.f32 %26, f345, f638; +sub.f32 %29, f320, f554; +sub.f32 %28, f319, f553; +sub.f32 %31, f324, f585; +sub.f32 %30, f323, f583; +sub.f32 %33, f328, f590; +sub.f32 %32, f327, f588; +sub.f32 %35, f332, f595; +sub.f32 %34, f331, f593; +sub.f32 %37, f336, f600; +sub.f32 %36, f335, f598; +sub.f32 %39, f340, f605; +sub.f32 %38, f339, f603; +sub.f32 %41, f344, f610; +sub.f32 %40, f343, f608; +add.f32 %43, f322, f555; +sub.f32 %42, f321, f556; +sub.f32 %45, f326, f615; +sub.f32 %44, f325, f613; +sub.f32 %47, f330, f620; +sub.f32 %46, f329, f618; +sub.f32 %49, f334, f625; +sub.f32 %48, f333, f623; +sub.f32 %51, f338, f630; +sub.f32 %50, f337, f628; +sub.f32 %53, f342, f635; +sub.f32 %52, f341, f633; +sub.f32 %55, f346, f640; +sub.f32 %54, f345, f638; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), 
"=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_inv.hpp.inc 
new file mode 100644 index 0000000000000..4832695840f37 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp32_inv.hpp.inc @@ -0,0 +1,584 @@ +#ifndef CUFFTDX_FFT_28_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_28_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<214, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<681>; +.reg .b64 rd<2>; +add.f32 f113, %66, %120; +add.f32 f114, %56, f113; +add.f32 f115, %77, %109; +add.f32 f116, f115, f114; +add.f32 f117, %88, %98; +add.f32 f118, f117, f116; +add.f32 f119, %68, %121; +add.f32 f120, %57, f119; +add.f32 f121, %79, %111; +add.f32 f122, f121, f120; +add.f32 f123, %89, %100; +add.f32 f124, f123, f122; +fma.rn.f32 f125, f113, 0f3F1F9D07, %56; +mul.f32 f126, f115, 0f3E63DC87; +sub.f32 f127, f125, f126; +mul.f32 f128, f117, 0f3F66A5E5; +sub.f32 f129, f127, f128; +sub.f32 f130, %68, %121; +mul.f32 f131, f130, 0f3F48261C; +sub.f32 f132, %79, %111; +fma.rn.f32 f133, f132, 0f3F7994E0, f131; +sub.f32 f134, %89, %100; +fma.rn.f32 f135, f134, 0f3EDE2602, f133; +sub.f32 f136, f129, f135; +add.f32 f137, f135, f129; +mul.f32 f138, f113, 0f3E63DC87; +sub.f32 f139, %56, f138; +mul.f32 f140, f115, 0f3F66A5E5; +sub.f32 f141, f139, f140; +fma.rn.f32 f142, f117, 0f3F1F9D07, f141; +mul.f32 f143, f130, 0f3F7994E0; +mul.f32 f144, f132, 0f3EDE2602; +sub.f32 f145, f143, f144; +mul.f32 f146, f134, 0f3F48261C; +sub.f32 f147, f145, f146; +sub.f32 f148, f142, f147; +add.f32 f149, f147, f142; +mul.f32 f150, f113, 0f3F66A5E5; +sub.f32 f151, %56, f150; +fma.rn.f32 f152, f115, 0f3F1F9D07, f151; +mul.f32 f153, f117, 0f3E63DC87; +sub.f32 f154, f152, f153; +mul.f32 f155, f130, 0f3EDE2602; +mul.f32 f156, f132, 0f3F48261C; +sub.f32 f157, f155, f156; +fma.rn.f32 f158, f134, 0f3F7994E0, f157; +sub.f32 f159, f154, f158; +add.f32 f160, f158, f154; +fma.rn.f32 f161, f119, 0f3F1F9D07, %57; +mul.f32 f162, 
f121, 0f3E63DC87; +sub.f32 f163, f161, f162; +mul.f32 f164, f123, 0f3F66A5E5; +sub.f32 f165, f163, f164; +sub.f32 f166, %66, %120; +mul.f32 f167, f166, 0f3F48261C; +sub.f32 f168, %77, %109; +fma.rn.f32 f169, f168, 0f3F7994E0, f167; +sub.f32 f170, %88, %98; +fma.rn.f32 f171, f170, 0f3EDE2602, f169; +add.f32 f172, f171, f165; +sub.f32 f173, f165, f171; +mul.f32 f174, f119, 0f3E63DC87; +sub.f32 f175, %57, f174; +mul.f32 f176, f121, 0f3F66A5E5; +sub.f32 f177, f175, f176; +fma.rn.f32 f178, f123, 0f3F1F9D07, f177; +mul.f32 f179, f166, 0f3F7994E0; +mul.f32 f180, f168, 0f3EDE2602; +sub.f32 f181, f179, f180; +mul.f32 f182, f170, 0f3F48261C; +sub.f32 f183, f181, f182; +add.f32 f184, f183, f178; +sub.f32 f185, f178, f183; +mul.f32 f186, f119, 0f3F66A5E5; +sub.f32 f187, %57, f186; +fma.rn.f32 f188, f121, 0f3F1F9D07, f187; +mul.f32 f189, f123, 0f3E63DC87; +sub.f32 f190, f188, f189; +mul.f32 f191, f166, 0f3EDE2602; +mul.f32 f192, f168, 0f3F48261C; +sub.f32 f193, f191, f192; +fma.rn.f32 f194, f170, 0f3F7994E0, f193; +add.f32 f195, f194, f190; +sub.f32 f196, f190, f194; +add.f32 f197, %72, %125; +add.f32 f198, %61, f197; +add.f32 f199, %82, %114; +add.f32 f200, f199, f198; +add.f32 f201, %93, %104; +add.f32 f202, f201, f200; +add.f32 f203, %73, %127; +add.f32 f204, %63, f203; +add.f32 f205, %84, %116; +add.f32 f206, f205, f204; +add.f32 f207, %95, %105; +add.f32 f208, f207, f206; +fma.rn.f32 f209, f197, 0f3F1F9D07, %61; +mul.f32 f210, f199, 0f3E63DC87; +sub.f32 f211, f209, f210; +mul.f32 f212, f201, 0f3F66A5E5; +sub.f32 f213, f211, f212; +sub.f32 f214, %73, %127; +mul.f32 f215, f214, 0f3F48261C; +sub.f32 f216, %84, %116; +fma.rn.f32 f217, f216, 0f3F7994E0, f215; +sub.f32 f218, %95, %105; +fma.rn.f32 f219, f218, 0f3EDE2602, f217; +sub.f32 f220, f213, f219; +add.f32 f221, f219, f213; +mul.f32 f222, f197, 0f3E63DC87; +sub.f32 f223, %61, f222; +mul.f32 f224, f199, 0f3F66A5E5; +sub.f32 f225, f223, f224; +fma.rn.f32 f226, f201, 0f3F1F9D07, f225; +mul.f32 f227, f214, 0f3F7994E0; +mul.f32 
f228, f216, 0f3EDE2602; +sub.f32 f229, f227, f228; +mul.f32 f230, f218, 0f3F48261C; +sub.f32 f231, f229, f230; +sub.f32 f232, f226, f231; +add.f32 f233, f231, f226; +mul.f32 f234, f197, 0f3F66A5E5; +sub.f32 f235, %61, f234; +fma.rn.f32 f236, f199, 0f3F1F9D07, f235; +mul.f32 f237, f201, 0f3E63DC87; +sub.f32 f238, f236, f237; +mul.f32 f239, f214, 0f3EDE2602; +mul.f32 f240, f216, 0f3F48261C; +sub.f32 f241, f239, f240; +fma.rn.f32 f242, f218, 0f3F7994E0, f241; +sub.f32 f243, f238, f242; +add.f32 f244, f242, f238; +fma.rn.f32 f245, f203, 0f3F1F9D07, %63; +mul.f32 f246, f205, 0f3E63DC87; +sub.f32 f247, f245, f246; +mul.f32 f248, f207, 0f3F66A5E5; +sub.f32 f249, f247, f248; +sub.f32 f250, %72, %125; +mul.f32 f251, f250, 0f3F48261C; +sub.f32 f252, %82, %114; +fma.rn.f32 f253, f252, 0f3F7994E0, f251; +sub.f32 f254, %93, %104; +fma.rn.f32 f255, f254, 0f3EDE2602, f253; +add.f32 f256, f255, f249; +sub.f32 f257, f249, f255; +mul.f32 f258, f203, 0f3E63DC87; +sub.f32 f259, %63, f258; +mul.f32 f260, f205, 0f3F66A5E5; +sub.f32 f261, f259, f260; +fma.rn.f32 f262, f207, 0f3F1F9D07, f261; +mul.f32 f263, f250, 0f3F7994E0; +mul.f32 f264, f252, 0f3EDE2602; +sub.f32 f265, f263, f264; +mul.f32 f266, f254, 0f3F48261C; +sub.f32 f267, f265, f266; +add.f32 f268, f267, f262; +sub.f32 f269, f262, f267; +mul.f32 f270, f203, 0f3F66A5E5; +sub.f32 f271, %63, f270; +fma.rn.f32 f272, f205, 0f3F1F9D07, f271; +mul.f32 f273, f207, 0f3E63DC87; +sub.f32 f274, f272, f273; +mul.f32 f275, f250, 0f3EDE2602; +mul.f32 f276, f252, 0f3F48261C; +sub.f32 f277, f275, f276; +fma.rn.f32 f278, f254, 0f3F7994E0, f277; +add.f32 f279, f278, f274; +sub.f32 f280, f274, f278; +mul.f32 f281, f220, 0f3F66A5E5; +mul.f32 f282, f256, 0f3EDE2602; +sub.f32 f283, f281, f282; +mul.f32 f284, f256, 0f3F66A5E5; +fma.rn.f32 f285, f220, 0f3EDE2602, f284; +mul.f32 f286, f232, 0f3F1F9D07; +mul.f32 f287, f268, 0f3F48261C; +sub.f32 f288, f286, f287; +mul.f32 f289, f268, 0f3F1F9D07; +fma.rn.f32 f290, f232, 0f3F48261C, f289; +mul.f32 f291, f243, 
0f3E63DC87; +mul.f32 f292, f279, 0f3F7994E0; +sub.f32 f293, f291, f292; +mul.f32 f294, f279, 0f3E63DC87; +fma.rn.f32 f295, f243, 0f3F7994E0, f294; +mul.f32 f296, f244, 0fBE63DC87; +mul.f32 f297, f280, 0f3F7994E0; +sub.f32 f298, f296, f297; +mul.f32 f299, f280, 0fBE63DC87; +fma.rn.f32 f300, f244, 0f3F7994E0, f299; +mul.f32 f301, f233, 0fBF1F9D07; +mul.f32 f302, f269, 0f3F48261C; +sub.f32 f303, f301, f302; +mul.f32 f304, f269, 0fBF1F9D07; +fma.rn.f32 f305, f233, 0f3F48261C, f304; +mul.f32 f306, f221, 0fBF66A5E5; +mul.f32 f307, f257, 0f3EDE2602; +sub.f32 f308, f306, f307; +mul.f32 f309, f257, 0fBF66A5E5; +fma.rn.f32 f310, f221, 0f3EDE2602, f309; +add.f32 f311, f118, f202; +add.f32 f312, f124, f208; +sub.f32 f313, f118, f202; +sub.f32 f314, f124, f208; +add.f32 f315, f136, f283; +add.f32 f316, f172, f285; +sub.f32 f317, f136, f283; +sub.f32 f318, f172, f285; +add.f32 f319, f148, f288; +add.f32 f320, f184, f290; +sub.f32 f321, f148, f288; +sub.f32 f322, f184, f290; +add.f32 f323, f159, f293; +add.f32 f324, f195, f295; +sub.f32 f325, f159, f293; +sub.f32 f326, f195, f295; +add.f32 f327, f160, f298; +add.f32 f328, f196, f300; +sub.f32 f329, f160, f298; +sub.f32 f330, f196, f300; +add.f32 f331, f149, f303; +add.f32 f332, f185, f305; +sub.f32 f333, f149, f303; +sub.f32 f334, f185, f305; +add.f32 f335, f137, f308; +add.f32 f336, f173, f310; +sub.f32 f337, f137, f308; +sub.f32 f338, f173, f310; +add.f32 f339, %69, %122; +add.f32 f340, %58, f339; +add.f32 f341, %80, %112; +add.f32 f342, f341, f340; +add.f32 f343, %90, %101; +add.f32 f344, f343, f342; +add.f32 f345, %71, %124; +add.f32 f346, %60, f345; +add.f32 f347, %81, %113; +add.f32 f348, f347, f346; +add.f32 f349, %92, %103; +add.f32 f350, f349, f348; +fma.rn.f32 f351, f339, 0f3F1F9D07, %58; +mul.f32 f352, f341, 0f3E63DC87; +sub.f32 f353, f351, f352; +mul.f32 f354, f343, 0f3F66A5E5; +sub.f32 f355, f353, f354; +sub.f32 f356, %71, %124; +mul.f32 f357, f356, 0f3F48261C; +sub.f32 f358, %81, %113; +fma.rn.f32 f359, f358, 
0f3F7994E0, f357; +sub.f32 f360, %92, %103; +fma.rn.f32 f361, f360, 0f3EDE2602, f359; +sub.f32 f362, f355, f361; +add.f32 f363, f361, f355; +mul.f32 f364, f339, 0f3E63DC87; +sub.f32 f365, %58, f364; +mul.f32 f366, f341, 0f3F66A5E5; +sub.f32 f367, f365, f366; +fma.rn.f32 f368, f343, 0f3F1F9D07, f367; +mul.f32 f369, f356, 0f3F7994E0; +mul.f32 f370, f358, 0f3EDE2602; +sub.f32 f371, f369, f370; +mul.f32 f372, f360, 0f3F48261C; +sub.f32 f373, f371, f372; +sub.f32 f374, f368, f373; +add.f32 f375, f373, f368; +mul.f32 f376, f339, 0f3F66A5E5; +sub.f32 f377, %58, f376; +fma.rn.f32 f378, f341, 0f3F1F9D07, f377; +mul.f32 f379, f343, 0f3E63DC87; +sub.f32 f380, f378, f379; +mul.f32 f381, f356, 0f3EDE2602; +mul.f32 f382, f358, 0f3F48261C; +sub.f32 f383, f381, f382; +fma.rn.f32 f384, f360, 0f3F7994E0, f383; +sub.f32 f385, f380, f384; +add.f32 f386, f384, f380; +fma.rn.f32 f387, f345, 0f3F1F9D07, %60; +mul.f32 f388, f347, 0f3E63DC87; +sub.f32 f389, f387, f388; +mul.f32 f390, f349, 0f3F66A5E5; +sub.f32 f391, f389, f390; +sub.f32 f392, %69, %122; +mul.f32 f393, f392, 0f3F48261C; +sub.f32 f394, %80, %112; +fma.rn.f32 f395, f394, 0f3F7994E0, f393; +sub.f32 f396, %90, %101; +fma.rn.f32 f397, f396, 0f3EDE2602, f395; +add.f32 f398, f397, f391; +sub.f32 f399, f391, f397; +mul.f32 f400, f345, 0f3E63DC87; +sub.f32 f401, %60, f400; +mul.f32 f402, f347, 0f3F66A5E5; +sub.f32 f403, f401, f402; +fma.rn.f32 f404, f349, 0f3F1F9D07, f403; +mul.f32 f405, f392, 0f3F7994E0; +mul.f32 f406, f394, 0f3EDE2602; +sub.f32 f407, f405, f406; +mul.f32 f408, f396, 0f3F48261C; +sub.f32 f409, f407, f408; +add.f32 f410, f409, f404; +sub.f32 f411, f404, f409; +mul.f32 f412, f345, 0f3F66A5E5; +sub.f32 f413, %60, f412; +fma.rn.f32 f414, f347, 0f3F1F9D07, f413; +mul.f32 f415, f349, 0f3E63DC87; +sub.f32 f416, f414, f415; +mul.f32 f417, f392, 0f3EDE2602; +mul.f32 f418, f394, 0f3F48261C; +sub.f32 f419, f417, f418; +fma.rn.f32 f420, f396, 0f3F7994E0, f419; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +add.f32 
f423, %74, %128; +add.f32 f424, %64, f423; +add.f32 f425, %85, %117; +add.f32 f426, f425, f424; +add.f32 f427, %96, %106; +add.f32 f428, f427, f426; +add.f32 f429, %76, %129; +add.f32 f430, %65, f429; +add.f32 f431, %87, %119; +add.f32 f432, f431, f430; +add.f32 f433, %97, %108; +add.f32 f434, f433, f432; +fma.rn.f32 f435, f423, 0f3F1F9D07, %64; +mul.f32 f436, f425, 0f3E63DC87; +sub.f32 f437, f435, f436; +mul.f32 f438, f427, 0f3F66A5E5; +sub.f32 f439, f437, f438; +sub.f32 f440, %76, %129; +mul.f32 f441, f440, 0f3F48261C; +sub.f32 f442, %87, %119; +fma.rn.f32 f443, f442, 0f3F7994E0, f441; +sub.f32 f444, %97, %108; +fma.rn.f32 f445, f444, 0f3EDE2602, f443; +sub.f32 f446, f439, f445; +add.f32 f447, f445, f439; +mul.f32 f448, f423, 0f3E63DC87; +sub.f32 f449, %64, f448; +mul.f32 f450, f425, 0f3F66A5E5; +sub.f32 f451, f449, f450; +fma.rn.f32 f452, f427, 0f3F1F9D07, f451; +mul.f32 f453, f440, 0f3F7994E0; +mul.f32 f454, f442, 0f3EDE2602; +sub.f32 f455, f453, f454; +mul.f32 f456, f444, 0f3F48261C; +sub.f32 f457, f455, f456; +sub.f32 f458, f452, f457; +add.f32 f459, f457, f452; +mul.f32 f460, f423, 0f3F66A5E5; +sub.f32 f461, %64, f460; +fma.rn.f32 f462, f425, 0f3F1F9D07, f461; +mul.f32 f463, f427, 0f3E63DC87; +sub.f32 f464, f462, f463; +mul.f32 f465, f440, 0f3EDE2602; +mul.f32 f466, f442, 0f3F48261C; +sub.f32 f467, f465, f466; +fma.rn.f32 f468, f444, 0f3F7994E0, f467; +sub.f32 f469, f464, f468; +add.f32 f470, f468, f464; +fma.rn.f32 f471, f429, 0f3F1F9D07, %65; +mul.f32 f472, f431, 0f3E63DC87; +sub.f32 f473, f471, f472; +mul.f32 f474, f433, 0f3F66A5E5; +sub.f32 f475, f473, f474; +sub.f32 f476, %74, %128; +mul.f32 f477, f476, 0f3F48261C; +sub.f32 f478, %85, %117; +fma.rn.f32 f479, f478, 0f3F7994E0, f477; +sub.f32 f480, %96, %106; +fma.rn.f32 f481, f480, 0f3EDE2602, f479; +add.f32 f482, f481, f475; +sub.f32 f483, f475, f481; +mul.f32 f484, f429, 0f3E63DC87; +sub.f32 f485, %65, f484; +mul.f32 f486, f431, 0f3F66A5E5; +sub.f32 f487, f485, f486; +fma.rn.f32 f488, f433, 0f3F1F9D07, 
f487; +mul.f32 f489, f476, 0f3F7994E0; +mul.f32 f490, f478, 0f3EDE2602; +sub.f32 f491, f489, f490; +mul.f32 f492, f480, 0f3F48261C; +sub.f32 f493, f491, f492; +add.f32 f494, f493, f488; +sub.f32 f495, f488, f493; +mul.f32 f496, f429, 0f3F66A5E5; +sub.f32 f497, %65, f496; +fma.rn.f32 f498, f431, 0f3F1F9D07, f497; +mul.f32 f499, f433, 0f3E63DC87; +sub.f32 f500, f498, f499; +mul.f32 f501, f476, 0f3EDE2602; +mul.f32 f502, f478, 0f3F48261C; +sub.f32 f503, f501, f502; +fma.rn.f32 f504, f480, 0f3F7994E0, f503; +add.f32 f505, f504, f500; +sub.f32 f506, f500, f504; +mul.f32 f507, f446, 0f3F66A5E5; +mul.f32 f508, f482, 0f3EDE2602; +sub.f32 f509, f507, f508; +mul.f32 f510, f482, 0f3F66A5E5; +fma.rn.f32 f511, f446, 0f3EDE2602, f510; +mul.f32 f512, f458, 0f3F1F9D07; +mul.f32 f513, f494, 0f3F48261C; +sub.f32 f514, f512, f513; +mul.f32 f515, f494, 0f3F1F9D07; +fma.rn.f32 f516, f458, 0f3F48261C, f515; +mul.f32 f517, f469, 0f3E63DC87; +mul.f32 f518, f505, 0f3F7994E0; +sub.f32 f519, f517, f518; +mul.f32 f520, f505, 0f3E63DC87; +fma.rn.f32 f521, f469, 0f3F7994E0, f520; +mul.f32 f522, f470, 0fBE63DC87; +mul.f32 f523, f506, 0f3F7994E0; +sub.f32 f524, f522, f523; +mul.f32 f525, f506, 0fBE63DC87; +fma.rn.f32 f526, f470, 0f3F7994E0, f525; +mul.f32 f527, f459, 0fBF1F9D07; +mul.f32 f528, f495, 0f3F48261C; +sub.f32 f529, f527, f528; +mul.f32 f530, f495, 0fBF1F9D07; +fma.rn.f32 f531, f459, 0f3F48261C, f530; +mul.f32 f532, f447, 0fBF66A5E5; +mul.f32 f533, f483, 0f3EDE2602; +sub.f32 f534, f532, f533; +mul.f32 f535, f483, 0fBF66A5E5; +fma.rn.f32 f536, f447, 0f3EDE2602, f535; +add.f32 f537, f344, f428; +add.f32 f538, f350, f434; +sub.f32 f539, f344, f428; +sub.f32 f540, f350, f434; +add.f32 f541, f362, f509; +add.f32 f542, f398, f511; +sub.f32 f543, f362, f509; +sub.f32 f544, f398, f511; +add.f32 f545, f374, f514; +add.f32 f546, f410, f516; +sub.f32 f547, f374, f514; +sub.f32 f548, f410, f516; +add.f32 f549, f385, f519; +add.f32 f550, f421, f521; +sub.f32 f551, f385, f519; +sub.f32 f552, f421, 
f521; +add.f32 f553, f386, f524; +add.f32 f554, f422, f526; +sub.f32 f555, f386, f524; +sub.f32 f556, f422, f526; +add.f32 f557, f375, f529; +add.f32 f558, f411, f531; +sub.f32 f559, f375, f529; +sub.f32 f560, f411, f531; +add.f32 f561, f363, f534; +add.f32 f562, f399, f536; +sub.f32 f563, f363, f534; +sub.f32 f564, f399, f536; +mul.f32 f565, f541, 0f3F7994E0; +mul.f32 f566, f542, 0f3E63DC87; +sub.f32 f567, f565, f566; +mul.f32 f568, f542, 0f3F7994E0; +fma.rn.f32 f569, f541, 0f3E63DC87, f568; +mul.f32 f570, f545, 0f3F66A5E5; +mul.f32 f571, f546, 0f3EDE2602; +sub.f32 f572, f570, f571; +mul.f32 f573, f546, 0f3F66A5E5; +fma.rn.f32 f574, f545, 0f3EDE2602, f573; +mul.f32 f575, f549, 0f3F48261C; +mul.f32 f576, f550, 0f3F1F9D07; +sub.f32 f577, f575, f576; +mul.f32 f578, f550, 0f3F48261C; +fma.rn.f32 f579, f549, 0f3F1F9D07, f578; +mul.f32 f580, f553, 0f3F1F9D07; +mul.f32 f581, f554, 0f3F48261C; +sub.f32 f582, f580, f581; +mul.f32 f583, f554, 0f3F1F9D07; +fma.rn.f32 f584, f553, 0f3F48261C, f583; +mul.f32 f585, f557, 0f3EDE2602; +mul.f32 f586, f558, 0f3F66A5E5; +sub.f32 f587, f585, f586; +mul.f32 f588, f558, 0f3EDE2602; +fma.rn.f32 f589, f557, 0f3F66A5E5, f588; +mul.f32 f590, f561, 0f3E63DC87; +mul.f32 f591, f562, 0f3F7994E0; +sub.f32 f592, f590, f591; +mul.f32 f593, f562, 0f3E63DC87; +fma.rn.f32 f594, f561, 0f3F7994E0, f593; +mul.f32 f595, f543, 0fBE63DC87; +mul.f32 f596, f544, 0f3F7994E0; +sub.f32 f597, f595, f596; +mul.f32 f598, f544, 0fBE63DC87; +fma.rn.f32 f599, f543, 0f3F7994E0, f598; +mul.f32 f600, f547, 0fBEDE2602; +mul.f32 f601, f548, 0f3F66A5E5; +sub.f32 f602, f600, f601; +mul.f32 f603, f548, 0fBEDE2602; +fma.rn.f32 f604, f547, 0f3F66A5E5, f603; +mul.f32 f605, f551, 0fBF1F9D07; +mul.f32 f606, f552, 0f3F48261C; +sub.f32 f607, f605, f606; +mul.f32 f608, f552, 0fBF1F9D07; +fma.rn.f32 f609, f551, 0f3F48261C, f608; +mul.f32 f610, f555, 0fBF48261C; +mul.f32 f611, f556, 0f3F1F9D07; +sub.f32 f612, f610, f611; +mul.f32 f613, f556, 0fBF48261C; +fma.rn.f32 f614, f555, 
0f3F1F9D07, f613; +mul.f32 f615, f559, 0fBF66A5E5; +mul.f32 f616, f560, 0f3EDE2602; +sub.f32 f617, f615, f616; +mul.f32 f618, f560, 0fBF66A5E5; +fma.rn.f32 f619, f559, 0f3EDE2602, f618; +mul.f32 f620, f563, 0fBF7994E0; +mul.f32 f621, f564, 0f3E63DC87; +sub.f32 f622, f620, f621; +mul.f32 f623, f564, 0fBF7994E0; +fma.rn.f32 f624, f563, 0f3E63DC87, f623; +add.f32 %1, f312, f538; +add.f32 %0, f311, f537; +add.f32 %3, f316, f569; +add.f32 %2, f315, f567; +add.f32 %5, f320, f574; +add.f32 %4, f319, f572; +add.f32 %7, f324, f579; +add.f32 %6, f323, f577; +add.f32 %9, f328, f584; +add.f32 %8, f327, f582; +add.f32 %11, f332, f589; +add.f32 %10, f331, f587; +add.f32 %13, f336, f594; +add.f32 %12, f335, f592; +add.f32 %15, f314, f539; +sub.f32 %14, f313, f540; +add.f32 %17, f318, f599; +add.f32 %16, f317, f597; +add.f32 %19, f322, f604; +add.f32 %18, f321, f602; +add.f32 %21, f326, f609; +add.f32 %20, f325, f607; +add.f32 %23, f330, f614; +add.f32 %22, f329, f612; +add.f32 %25, f334, f619; +add.f32 %24, f333, f617; +add.f32 %27, f338, f624; +add.f32 %26, f337, f622; +sub.f32 %29, f312, f538; +sub.f32 %28, f311, f537; +sub.f32 %31, f316, f569; +sub.f32 %30, f315, f567; +sub.f32 %33, f320, f574; +sub.f32 %32, f319, f572; +sub.f32 %35, f324, f579; +sub.f32 %34, f323, f577; +sub.f32 %37, f328, f584; +sub.f32 %36, f327, f582; +sub.f32 %39, f332, f589; +sub.f32 %38, f331, f587; +sub.f32 %41, f336, f594; +sub.f32 %40, f335, f592; +sub.f32 %43, f314, f539; +add.f32 %42, f313, f540; +sub.f32 %45, f318, f599; +sub.f32 %44, f317, f597; +sub.f32 %47, f322, f604; +sub.f32 %46, f321, f602; +sub.f32 %49, f326, f609; +sub.f32 %48, f325, f607; +sub.f32 %51, f330, f614; +sub.f32 %50, f329, f612; +sub.f32 %53, f334, f619; +sub.f32 %52, f333, f617; +sub.f32 %55, f338, f624; +sub.f32 %54, f337, f622; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), 
"f"(rmem[27].x), "f"(rmem[27].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..593aa81c43260 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_fwd.hpp.inc @@ -0,0 +1,600 @@ +#ifndef CUFFTDX_FFT_28_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_28_FP64_FWD_PTX_HPP + + + /* cufftdx_private_function<416, double, 1>: explicit specialization whose entire body is one asm volatile PTX block. Operands %0-%55 are the =d outputs bound to rmem[0..27].x/.y in that order; operands %56-%129 are the d inputs taken from the same rmem[0..27] entries, where several .y members appear twice in the input list, so input operand numbers are not simply 56 + 2*i. Every read of %56-%129 precedes the first write to %0-%55, and the results overwrite rmem in place. The PTX uses only add, sub, mul and fma.rn on .f64 registers; the smem parameter and the declared rd<2> registers are not referenced in this body. NOTE(review): the include guard and file name suggest a 28-point forward double-precision FFT kernel, and the meaning of the index 416 is not visible here - confirm against the cufftdx database tables. The PTX appears machine-generated, so prefer regenerating over hand-editing. */ +template<> __forceinline__ __device__ void cufftdx_private_function<416, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<697>; +.reg .b64 rd<2>; +add.f64 fd113, %66, %120; +add.f64 fd114, %56, fd113; +add.f64 fd115, %77, %109; +add.f64 fd116, fd115, fd114; +add.f64 fd117, %88, %98; +add.f64 fd118, fd117, fd116; +add.f64 fd119, %68, %121; +add.f64 fd120, %57, fd119; +add.f64 fd121, %79, %111; +add.f64 fd122, fd121, fd120; +add.f64 fd123, %89, %100; +add.f64 fd124, fd123, fd122; +fma.rn.f64 fd125, fd113, 0d3FE3F3A0E28BEDD1, %56; +mul.f64 fd126, fd115, 0d3FCC7B90E3024582; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd117, 0d3FECD4BCA9CB5C71; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, %68, %121; +mul.f64 fd131, fd130, 0d3FE904C37505DE4B; +sub.f64 fd132, %79, %111; +mul.f64 fd133, fd132, 0dBFEF329C0558E969; +sub.f64 fd134, fd133, fd131; +sub.f64 fd135, %89, %100; +mul.f64 fd136, fd135, 0d3FDBC4C04D71ABC1; +sub.f64 fd137, fd134, fd136; +sub.f64 fd138, fd129, fd137; +add.f64 fd139, fd137, fd129; +mul.f64 fd140, fd113, 0d3FCC7B90E3024582; +sub.f64 fd141, %56, fd140; +mul.f64 fd142, fd115, 0d3FECD4BCA9CB5C71; +sub.f64 fd143, fd141, fd142; +fma.rn.f64 fd144, fd117, 0d3FE3F3A0E28BEDD1, fd143; +mul.f64 fd145, fd130, 0d3FEF329C0558E969; +mul.f64 fd146, fd132, 0d3FDBC4C04D71ABC1; +sub.f64 fd147, fd146, fd145; 
+fma.rn.f64 fd148, fd135, 0d3FE904C37505DE4B, fd147; +sub.f64 fd149, fd144, fd148; +add.f64 fd150, fd148, fd144; +mul.f64 fd151, fd113, 0d3FECD4BCA9CB5C71; +sub.f64 fd152, %56, fd151; +fma.rn.f64 fd153, fd115, 0d3FE3F3A0E28BEDD1, fd152; +mul.f64 fd154, fd117, 0d3FCC7B90E3024582; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd130, 0d3FDBC4C04D71ABC1; +mul.f64 fd157, fd132, 0d3FE904C37505DE4B; +sub.f64 fd158, fd157, fd156; +mul.f64 fd159, fd135, 0d3FEF329C0558E969; +sub.f64 fd160, fd158, fd159; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +fma.rn.f64 fd163, fd119, 0d3FE3F3A0E28BEDD1, %57; +mul.f64 fd164, fd121, 0d3FCC7B90E3024582; +sub.f64 fd165, fd163, fd164; +mul.f64 fd166, fd123, 0d3FECD4BCA9CB5C71; +sub.f64 fd167, fd165, fd166; +sub.f64 fd168, %66, %120; +mul.f64 fd169, fd168, 0d3FE904C37505DE4B; +sub.f64 fd170, %77, %109; +mul.f64 fd171, fd170, 0dBFEF329C0558E969; +sub.f64 fd172, fd171, fd169; +sub.f64 fd173, %88, %98; +mul.f64 fd174, fd173, 0d3FDBC4C04D71ABC1; +sub.f64 fd175, fd172, fd174; +add.f64 fd176, fd175, fd167; +sub.f64 fd177, fd167, fd175; +mul.f64 fd178, fd119, 0d3FCC7B90E3024582; +sub.f64 fd179, %57, fd178; +mul.f64 fd180, fd121, 0d3FECD4BCA9CB5C71; +sub.f64 fd181, fd179, fd180; +fma.rn.f64 fd182, fd123, 0d3FE3F3A0E28BEDD1, fd181; +mul.f64 fd183, fd168, 0d3FEF329C0558E969; +mul.f64 fd184, fd170, 0d3FDBC4C04D71ABC1; +sub.f64 fd185, fd184, fd183; +fma.rn.f64 fd186, fd173, 0d3FE904C37505DE4B, fd185; +add.f64 fd187, fd186, fd182; +sub.f64 fd188, fd182, fd186; +mul.f64 fd189, fd119, 0d3FECD4BCA9CB5C71; +sub.f64 fd190, %57, fd189; +fma.rn.f64 fd191, fd121, 0d3FE3F3A0E28BEDD1, fd190; +mul.f64 fd192, fd123, 0d3FCC7B90E3024582; +sub.f64 fd193, fd191, fd192; +mul.f64 fd194, fd168, 0d3FDBC4C04D71ABC1; +mul.f64 fd195, fd170, 0d3FE904C37505DE4B; +sub.f64 fd196, fd195, fd194; +mul.f64 fd197, fd173, 0d3FEF329C0558E969; +sub.f64 fd198, fd196, fd197; +add.f64 fd199, fd198, fd193; +sub.f64 fd200, fd193, fd198; +add.f64 fd201, %72, %125; +add.f64 fd202, 
%61, fd201; +add.f64 fd203, %82, %114; +add.f64 fd204, fd203, fd202; +add.f64 fd205, %93, %104; +add.f64 fd206, fd205, fd204; +add.f64 fd207, %73, %127; +add.f64 fd208, %63, fd207; +add.f64 fd209, %84, %116; +add.f64 fd210, fd209, fd208; +add.f64 fd211, %95, %105; +add.f64 fd212, fd211, fd210; +fma.rn.f64 fd213, fd201, 0d3FE3F3A0E28BEDD1, %61; +mul.f64 fd214, fd203, 0d3FCC7B90E3024582; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd205, 0d3FECD4BCA9CB5C71; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, %73, %127; +mul.f64 fd219, fd218, 0d3FE904C37505DE4B; +sub.f64 fd220, %84, %116; +mul.f64 fd221, fd220, 0dBFEF329C0558E969; +sub.f64 fd222, fd221, fd219; +sub.f64 fd223, %95, %105; +mul.f64 fd224, fd223, 0d3FDBC4C04D71ABC1; +sub.f64 fd225, fd222, fd224; +sub.f64 fd226, fd217, fd225; +add.f64 fd227, fd225, fd217; +mul.f64 fd228, fd201, 0d3FCC7B90E3024582; +sub.f64 fd229, %61, fd228; +mul.f64 fd230, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd231, fd229, fd230; +fma.rn.f64 fd232, fd205, 0d3FE3F3A0E28BEDD1, fd231; +mul.f64 fd233, fd218, 0d3FEF329C0558E969; +mul.f64 fd234, fd220, 0d3FDBC4C04D71ABC1; +sub.f64 fd235, fd234, fd233; +fma.rn.f64 fd236, fd223, 0d3FE904C37505DE4B, fd235; +sub.f64 fd237, fd232, fd236; +add.f64 fd238, fd236, fd232; +mul.f64 fd239, fd201, 0d3FECD4BCA9CB5C71; +sub.f64 fd240, %61, fd239; +fma.rn.f64 fd241, fd203, 0d3FE3F3A0E28BEDD1, fd240; +mul.f64 fd242, fd205, 0d3FCC7B90E3024582; +sub.f64 fd243, fd241, fd242; +mul.f64 fd244, fd218, 0d3FDBC4C04D71ABC1; +mul.f64 fd245, fd220, 0d3FE904C37505DE4B; +sub.f64 fd246, fd245, fd244; +mul.f64 fd247, fd223, 0d3FEF329C0558E969; +sub.f64 fd248, fd246, fd247; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +fma.rn.f64 fd251, fd207, 0d3FE3F3A0E28BEDD1, %63; +mul.f64 fd252, fd209, 0d3FCC7B90E3024582; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd211, 0d3FECD4BCA9CB5C71; +sub.f64 fd255, fd253, fd254; +sub.f64 fd256, %72, %125; +mul.f64 fd257, fd256, 0d3FE904C37505DE4B; +sub.f64 fd258, %82, %114; +mul.f64 
fd259, fd258, 0dBFEF329C0558E969; +sub.f64 fd260, fd259, fd257; +sub.f64 fd261, %93, %104; +mul.f64 fd262, fd261, 0d3FDBC4C04D71ABC1; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd263, fd255; +sub.f64 fd265, fd255, fd263; +mul.f64 fd266, fd207, 0d3FCC7B90E3024582; +sub.f64 fd267, %63, fd266; +mul.f64 fd268, fd209, 0d3FECD4BCA9CB5C71; +sub.f64 fd269, fd267, fd268; +fma.rn.f64 fd270, fd211, 0d3FE3F3A0E28BEDD1, fd269; +mul.f64 fd271, fd256, 0d3FEF329C0558E969; +mul.f64 fd272, fd258, 0d3FDBC4C04D71ABC1; +sub.f64 fd273, fd272, fd271; +fma.rn.f64 fd274, fd261, 0d3FE904C37505DE4B, fd273; +add.f64 fd275, fd274, fd270; +sub.f64 fd276, fd270, fd274; +mul.f64 fd277, fd207, 0d3FECD4BCA9CB5C71; +sub.f64 fd278, %63, fd277; +fma.rn.f64 fd279, fd209, 0d3FE3F3A0E28BEDD1, fd278; +mul.f64 fd280, fd211, 0d3FCC7B90E3024582; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd256, 0d3FDBC4C04D71ABC1; +mul.f64 fd283, fd258, 0d3FE904C37505DE4B; +sub.f64 fd284, fd283, fd282; +mul.f64 fd285, fd261, 0d3FEF329C0558E969; +sub.f64 fd286, fd284, fd285; +add.f64 fd287, fd286, fd281; +sub.f64 fd288, fd281, fd286; +mul.f64 fd289, fd226, 0d3FECD4BCA9CB5C71; +mul.f64 fd290, fd264, 0dBFDBC4C04D71ABC1; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd264, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd293, fd226, 0dBFDBC4C04D71ABC1, fd292; +mul.f64 fd294, fd237, 0d3FE3F3A0E28BEDD1; +mul.f64 fd295, fd275, 0dBFE904C37505DE4B; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd275, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd298, fd237, 0dBFE904C37505DE4B, fd297; +mul.f64 fd299, fd249, 0d3FCC7B90E3024582; +mul.f64 fd300, fd287, 0dBFEF329C0558E969; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd287, 0d3FCC7B90E3024582; +fma.rn.f64 fd303, fd249, 0dBFEF329C0558E969, fd302; +mul.f64 fd304, fd250, 0dBFCC7B90E3024582; +mul.f64 fd305, fd288, 0dBFEF329C0558E969; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd288, 0dBFCC7B90E3024582; +fma.rn.f64 fd308, fd250, 0dBFEF329C0558E969, fd307; +mul.f64 fd309, fd238, 0dBFE3F3A0E28BEDD1; +mul.f64 
fd310, fd276, 0dBFE904C37505DE4B; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd276, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd313, fd238, 0dBFE904C37505DE4B, fd312; +mul.f64 fd314, fd227, 0dBFECD4BCA9CB5C71; +mul.f64 fd315, fd265, 0dBFDBC4C04D71ABC1; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd265, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd318, fd227, 0dBFDBC4C04D71ABC1, fd317; +add.f64 fd319, fd118, fd206; +add.f64 fd320, fd124, fd212; +sub.f64 fd321, fd118, fd206; +sub.f64 fd322, fd124, fd212; +add.f64 fd323, fd138, fd291; +add.f64 fd324, fd176, fd293; +sub.f64 fd325, fd138, fd291; +sub.f64 fd326, fd176, fd293; +add.f64 fd327, fd149, fd296; +add.f64 fd328, fd187, fd298; +sub.f64 fd329, fd149, fd296; +sub.f64 fd330, fd187, fd298; +add.f64 fd331, fd161, fd301; +add.f64 fd332, fd199, fd303; +sub.f64 fd333, fd161, fd301; +sub.f64 fd334, fd199, fd303; +add.f64 fd335, fd162, fd306; +add.f64 fd336, fd200, fd308; +sub.f64 fd337, fd162, fd306; +sub.f64 fd338, fd200, fd308; +add.f64 fd339, fd150, fd311; +add.f64 fd340, fd188, fd313; +sub.f64 fd341, fd150, fd311; +sub.f64 fd342, fd188, fd313; +add.f64 fd343, fd139, fd316; +add.f64 fd344, fd177, fd318; +sub.f64 fd345, fd139, fd316; +sub.f64 fd346, fd177, fd318; +add.f64 fd347, %69, %122; +add.f64 fd348, %58, fd347; +add.f64 fd349, %80, %112; +add.f64 fd350, fd349, fd348; +add.f64 fd351, %90, %101; +add.f64 fd352, fd351, fd350; +add.f64 fd353, %71, %124; +add.f64 fd354, %60, fd353; +add.f64 fd355, %81, %113; +add.f64 fd356, fd355, fd354; +add.f64 fd357, %92, %103; +add.f64 fd358, fd357, fd356; +fma.rn.f64 fd359, fd347, 0d3FE3F3A0E28BEDD1, %58; +mul.f64 fd360, fd349, 0d3FCC7B90E3024582; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd351, 0d3FECD4BCA9CB5C71; +sub.f64 fd363, fd361, fd362; +sub.f64 fd364, %71, %124; +mul.f64 fd365, fd364, 0d3FE904C37505DE4B; +sub.f64 fd366, %81, %113; +mul.f64 fd367, fd366, 0dBFEF329C0558E969; +sub.f64 fd368, fd367, fd365; +sub.f64 fd369, %92, %103; +mul.f64 fd370, fd369, 0d3FDBC4C04D71ABC1; +sub.f64 
fd371, fd368, fd370; +sub.f64 fd372, fd363, fd371; +add.f64 fd373, fd371, fd363; +mul.f64 fd374, fd347, 0d3FCC7B90E3024582; +sub.f64 fd375, %58, fd374; +mul.f64 fd376, fd349, 0d3FECD4BCA9CB5C71; +sub.f64 fd377, fd375, fd376; +fma.rn.f64 fd378, fd351, 0d3FE3F3A0E28BEDD1, fd377; +mul.f64 fd379, fd364, 0d3FEF329C0558E969; +mul.f64 fd380, fd366, 0d3FDBC4C04D71ABC1; +sub.f64 fd381, fd380, fd379; +fma.rn.f64 fd382, fd369, 0d3FE904C37505DE4B, fd381; +sub.f64 fd383, fd378, fd382; +add.f64 fd384, fd382, fd378; +mul.f64 fd385, fd347, 0d3FECD4BCA9CB5C71; +sub.f64 fd386, %58, fd385; +fma.rn.f64 fd387, fd349, 0d3FE3F3A0E28BEDD1, fd386; +mul.f64 fd388, fd351, 0d3FCC7B90E3024582; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd364, 0d3FDBC4C04D71ABC1; +mul.f64 fd391, fd366, 0d3FE904C37505DE4B; +sub.f64 fd392, fd391, fd390; +mul.f64 fd393, fd369, 0d3FEF329C0558E969; +sub.f64 fd394, fd392, fd393; +sub.f64 fd395, fd389, fd394; +add.f64 fd396, fd394, fd389; +fma.rn.f64 fd397, fd353, 0d3FE3F3A0E28BEDD1, %60; +mul.f64 fd398, fd355, 0d3FCC7B90E3024582; +sub.f64 fd399, fd397, fd398; +mul.f64 fd400, fd357, 0d3FECD4BCA9CB5C71; +sub.f64 fd401, fd399, fd400; +sub.f64 fd402, %69, %122; +mul.f64 fd403, fd402, 0d3FE904C37505DE4B; +sub.f64 fd404, %80, %112; +mul.f64 fd405, fd404, 0dBFEF329C0558E969; +sub.f64 fd406, fd405, fd403; +sub.f64 fd407, %90, %101; +mul.f64 fd408, fd407, 0d3FDBC4C04D71ABC1; +sub.f64 fd409, fd406, fd408; +add.f64 fd410, fd409, fd401; +sub.f64 fd411, fd401, fd409; +mul.f64 fd412, fd353, 0d3FCC7B90E3024582; +sub.f64 fd413, %60, fd412; +mul.f64 fd414, fd355, 0d3FECD4BCA9CB5C71; +sub.f64 fd415, fd413, fd414; +fma.rn.f64 fd416, fd357, 0d3FE3F3A0E28BEDD1, fd415; +mul.f64 fd417, fd402, 0d3FEF329C0558E969; +mul.f64 fd418, fd404, 0d3FDBC4C04D71ABC1; +sub.f64 fd419, fd418, fd417; +fma.rn.f64 fd420, fd407, 0d3FE904C37505DE4B, fd419; +add.f64 fd421, fd420, fd416; +sub.f64 fd422, fd416, fd420; +mul.f64 fd423, fd353, 0d3FECD4BCA9CB5C71; +sub.f64 fd424, %60, fd423; +fma.rn.f64 fd425, 
fd355, 0d3FE3F3A0E28BEDD1, fd424; +mul.f64 fd426, fd357, 0d3FCC7B90E3024582; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, 0d3FDBC4C04D71ABC1; +mul.f64 fd429, fd404, 0d3FE904C37505DE4B; +sub.f64 fd430, fd429, fd428; +mul.f64 fd431, fd407, 0d3FEF329C0558E969; +sub.f64 fd432, fd430, fd431; +add.f64 fd433, fd432, fd427; +sub.f64 fd434, fd427, fd432; +add.f64 fd435, %74, %128; +add.f64 fd436, %64, fd435; +add.f64 fd437, %85, %117; +add.f64 fd438, fd437, fd436; +add.f64 fd439, %96, %106; +add.f64 fd440, fd439, fd438; +add.f64 fd441, %76, %129; +add.f64 fd442, %65, fd441; +add.f64 fd443, %87, %119; +add.f64 fd444, fd443, fd442; +add.f64 fd445, %97, %108; +add.f64 fd446, fd445, fd444; +fma.rn.f64 fd447, fd435, 0d3FE3F3A0E28BEDD1, %64; +mul.f64 fd448, fd437, 0d3FCC7B90E3024582; +sub.f64 fd449, fd447, fd448; +mul.f64 fd450, fd439, 0d3FECD4BCA9CB5C71; +sub.f64 fd451, fd449, fd450; +sub.f64 fd452, %76, %129; +mul.f64 fd453, fd452, 0d3FE904C37505DE4B; +sub.f64 fd454, %87, %119; +mul.f64 fd455, fd454, 0dBFEF329C0558E969; +sub.f64 fd456, fd455, fd453; +sub.f64 fd457, %97, %108; +mul.f64 fd458, fd457, 0d3FDBC4C04D71ABC1; +sub.f64 fd459, fd456, fd458; +sub.f64 fd460, fd451, fd459; +add.f64 fd461, fd459, fd451; +mul.f64 fd462, fd435, 0d3FCC7B90E3024582; +sub.f64 fd463, %64, fd462; +mul.f64 fd464, fd437, 0d3FECD4BCA9CB5C71; +sub.f64 fd465, fd463, fd464; +fma.rn.f64 fd466, fd439, 0d3FE3F3A0E28BEDD1, fd465; +mul.f64 fd467, fd452, 0d3FEF329C0558E969; +mul.f64 fd468, fd454, 0d3FDBC4C04D71ABC1; +sub.f64 fd469, fd468, fd467; +fma.rn.f64 fd470, fd457, 0d3FE904C37505DE4B, fd469; +sub.f64 fd471, fd466, fd470; +add.f64 fd472, fd470, fd466; +mul.f64 fd473, fd435, 0d3FECD4BCA9CB5C71; +sub.f64 fd474, %64, fd473; +fma.rn.f64 fd475, fd437, 0d3FE3F3A0E28BEDD1, fd474; +mul.f64 fd476, fd439, 0d3FCC7B90E3024582; +sub.f64 fd477, fd475, fd476; +mul.f64 fd478, fd452, 0d3FDBC4C04D71ABC1; +mul.f64 fd479, fd454, 0d3FE904C37505DE4B; +sub.f64 fd480, fd479, fd478; +mul.f64 fd481, fd457, 
0d3FEF329C0558E969; +sub.f64 fd482, fd480, fd481; +sub.f64 fd483, fd477, fd482; +add.f64 fd484, fd482, fd477; +fma.rn.f64 fd485, fd441, 0d3FE3F3A0E28BEDD1, %65; +mul.f64 fd486, fd443, 0d3FCC7B90E3024582; +sub.f64 fd487, fd485, fd486; +mul.f64 fd488, fd445, 0d3FECD4BCA9CB5C71; +sub.f64 fd489, fd487, fd488; +sub.f64 fd490, %74, %128; +mul.f64 fd491, fd490, 0d3FE904C37505DE4B; +sub.f64 fd492, %85, %117; +mul.f64 fd493, fd492, 0dBFEF329C0558E969; +sub.f64 fd494, fd493, fd491; +sub.f64 fd495, %96, %106; +mul.f64 fd496, fd495, 0d3FDBC4C04D71ABC1; +sub.f64 fd497, fd494, fd496; +add.f64 fd498, fd497, fd489; +sub.f64 fd499, fd489, fd497; +mul.f64 fd500, fd441, 0d3FCC7B90E3024582; +sub.f64 fd501, %65, fd500; +mul.f64 fd502, fd443, 0d3FECD4BCA9CB5C71; +sub.f64 fd503, fd501, fd502; +fma.rn.f64 fd504, fd445, 0d3FE3F3A0E28BEDD1, fd503; +mul.f64 fd505, fd490, 0d3FEF329C0558E969; +mul.f64 fd506, fd492, 0d3FDBC4C04D71ABC1; +sub.f64 fd507, fd506, fd505; +fma.rn.f64 fd508, fd495, 0d3FE904C37505DE4B, fd507; +add.f64 fd509, fd508, fd504; +sub.f64 fd510, fd504, fd508; +mul.f64 fd511, fd441, 0d3FECD4BCA9CB5C71; +sub.f64 fd512, %65, fd511; +fma.rn.f64 fd513, fd443, 0d3FE3F3A0E28BEDD1, fd512; +mul.f64 fd514, fd445, 0d3FCC7B90E3024582; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd490, 0d3FDBC4C04D71ABC1; +mul.f64 fd517, fd492, 0d3FE904C37505DE4B; +sub.f64 fd518, fd517, fd516; +mul.f64 fd519, fd495, 0d3FEF329C0558E969; +sub.f64 fd520, fd518, fd519; +add.f64 fd521, fd520, fd515; +sub.f64 fd522, fd515, fd520; +mul.f64 fd523, fd460, 0d3FECD4BCA9CB5C71; +mul.f64 fd524, fd498, 0dBFDBC4C04D71ABC1; +sub.f64 fd525, fd523, fd524; +mul.f64 fd526, fd498, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd527, fd460, 0dBFDBC4C04D71ABC1, fd526; +mul.f64 fd528, fd471, 0d3FE3F3A0E28BEDD1; +mul.f64 fd529, fd509, 0dBFE904C37505DE4B; +sub.f64 fd530, fd528, fd529; +mul.f64 fd531, fd509, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd532, fd471, 0dBFE904C37505DE4B, fd531; +mul.f64 fd533, fd483, 0d3FCC7B90E3024582; +mul.f64 fd534, fd521, 
0dBFEF329C0558E969; +sub.f64 fd535, fd533, fd534; +mul.f64 fd536, fd521, 0d3FCC7B90E3024582; +fma.rn.f64 fd537, fd483, 0dBFEF329C0558E969, fd536; +mul.f64 fd538, fd484, 0dBFCC7B90E3024582; +mul.f64 fd539, fd522, 0dBFEF329C0558E969; +sub.f64 fd540, fd538, fd539; +mul.f64 fd541, fd522, 0dBFCC7B90E3024582; +fma.rn.f64 fd542, fd484, 0dBFEF329C0558E969, fd541; +mul.f64 fd543, fd472, 0dBFE3F3A0E28BEDD1; +mul.f64 fd544, fd510, 0dBFE904C37505DE4B; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd510, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd547, fd472, 0dBFE904C37505DE4B, fd546; +mul.f64 fd548, fd461, 0dBFECD4BCA9CB5C71; +mul.f64 fd549, fd499, 0dBFDBC4C04D71ABC1; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd499, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd552, fd461, 0dBFDBC4C04D71ABC1, fd551; +add.f64 fd553, fd352, fd440; +add.f64 fd554, fd358, fd446; +sub.f64 fd555, fd352, fd440; +sub.f64 fd556, fd358, fd446; +add.f64 fd557, fd372, fd525; +add.f64 fd558, fd410, fd527; +sub.f64 fd559, fd372, fd525; +sub.f64 fd560, fd410, fd527; +add.f64 fd561, fd383, fd530; +add.f64 fd562, fd421, fd532; +sub.f64 fd563, fd383, fd530; +sub.f64 fd564, fd421, fd532; +add.f64 fd565, fd395, fd535; +add.f64 fd566, fd433, fd537; +sub.f64 fd567, fd395, fd535; +sub.f64 fd568, fd433, fd537; +add.f64 fd569, fd396, fd540; +add.f64 fd570, fd434, fd542; +sub.f64 fd571, fd396, fd540; +sub.f64 fd572, fd434, fd542; +add.f64 fd573, fd384, fd545; +add.f64 fd574, fd422, fd547; +sub.f64 fd575, fd384, fd545; +sub.f64 fd576, fd422, fd547; +add.f64 fd577, fd373, fd550; +add.f64 fd578, fd411, fd552; +sub.f64 fd579, fd373, fd550; +sub.f64 fd580, fd411, fd552; +mul.f64 fd581, fd557, 0d3FEF329C0558E969; +mul.f64 fd582, fd558, 0dBFCC7B90E3024582; +sub.f64 fd583, fd581, fd582; +mul.f64 fd584, fd558, 0d3FEF329C0558E969; +fma.rn.f64 fd585, fd557, 0dBFCC7B90E3024582, fd584; +mul.f64 fd586, fd561, 0d3FECD4BCA9CB5C71; +mul.f64 fd587, fd562, 0dBFDBC4C04D71ABC1; +sub.f64 fd588, fd586, fd587; +mul.f64 fd589, fd562, 0d3FECD4BCA9CB5C71; 
+fma.rn.f64 fd590, fd561, 0dBFDBC4C04D71ABC1, fd589; +mul.f64 fd591, fd565, 0d3FE904C37505DE4B; +mul.f64 fd592, fd566, 0dBFE3F3A0E28BEDD1; +sub.f64 fd593, fd591, fd592; +mul.f64 fd594, fd566, 0d3FE904C37505DE4B; +fma.rn.f64 fd595, fd565, 0dBFE3F3A0E28BEDD1, fd594; +mul.f64 fd596, fd569, 0d3FE3F3A0E28BEDD1; +mul.f64 fd597, fd570, 0dBFE904C37505DE4B; +sub.f64 fd598, fd596, fd597; +mul.f64 fd599, fd570, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd600, fd569, 0dBFE904C37505DE4B, fd599; +mul.f64 fd601, fd573, 0d3FDBC4C04D71ABC1; +mul.f64 fd602, fd574, 0dBFECD4BCA9CB5C71; +sub.f64 fd603, fd601, fd602; +mul.f64 fd604, fd574, 0d3FDBC4C04D71ABC1; +fma.rn.f64 fd605, fd573, 0dBFECD4BCA9CB5C71, fd604; +mul.f64 fd606, fd577, 0d3FCC7B90E3024582; +mul.f64 fd607, fd578, 0dBFEF329C0558E969; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd578, 0d3FCC7B90E3024582; +fma.rn.f64 fd610, fd577, 0dBFEF329C0558E969, fd609; +mul.f64 fd611, fd559, 0dBFCC7B90E3024582; +mul.f64 fd612, fd560, 0dBFEF329C0558E969; +sub.f64 fd613, fd611, fd612; +mul.f64 fd614, fd560, 0dBFCC7B90E3024582; +fma.rn.f64 fd615, fd559, 0dBFEF329C0558E969, fd614; +mul.f64 fd616, fd563, 0dBFDBC4C04D71ABC1; +mul.f64 fd617, fd564, 0dBFECD4BCA9CB5C71; +sub.f64 fd618, fd616, fd617; +mul.f64 fd619, fd564, 0dBFDBC4C04D71ABC1; +fma.rn.f64 fd620, fd563, 0dBFECD4BCA9CB5C71, fd619; +mul.f64 fd621, fd567, 0dBFE3F3A0E28BEDD1; +mul.f64 fd622, fd568, 0dBFE904C37505DE4B; +sub.f64 fd623, fd621, fd622; +mul.f64 fd624, fd568, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd625, fd567, 0dBFE904C37505DE4B, fd624; +mul.f64 fd626, fd571, 0dBFE904C37505DE4B; +mul.f64 fd627, fd572, 0dBFE3F3A0E28BEDD1; +sub.f64 fd628, fd626, fd627; +mul.f64 fd629, fd572, 0dBFE904C37505DE4B; +fma.rn.f64 fd630, fd571, 0dBFE3F3A0E28BEDD1, fd629; +mul.f64 fd631, fd575, 0dBFECD4BCA9CB5C71; +mul.f64 fd632, fd576, 0dBFDBC4C04D71ABC1; +sub.f64 fd633, fd631, fd632; +mul.f64 fd634, fd576, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd635, fd575, 0dBFDBC4C04D71ABC1, fd634; +mul.f64 fd636, fd579, 
0dBFEF329C0558E969; +mul.f64 fd637, fd580, 0dBFCC7B90E3024582; +sub.f64 fd638, fd636, fd637; +mul.f64 fd639, fd580, 0dBFEF329C0558E969; +fma.rn.f64 fd640, fd579, 0dBFCC7B90E3024582, fd639; +add.f64 %1, fd320, fd554; +add.f64 %0, fd319, fd553; +add.f64 %3, fd324, fd585; +add.f64 %2, fd323, fd583; +add.f64 %5, fd328, fd590; +add.f64 %4, fd327, fd588; +add.f64 %7, fd332, fd595; +add.f64 %6, fd331, fd593; +add.f64 %9, fd336, fd600; +add.f64 %8, fd335, fd598; +add.f64 %11, fd340, fd605; +add.f64 %10, fd339, fd603; +add.f64 %13, fd344, fd610; +add.f64 %12, fd343, fd608; +sub.f64 %15, fd322, fd555; +add.f64 %14, fd321, fd556; +add.f64 %17, fd326, fd615; +add.f64 %16, fd325, fd613; +add.f64 %19, fd330, fd620; +add.f64 %18, fd329, fd618; +add.f64 %21, fd334, fd625; +add.f64 %20, fd333, fd623; +add.f64 %23, fd338, fd630; +add.f64 %22, fd337, fd628; +add.f64 %25, fd342, fd635; +add.f64 %24, fd341, fd633; +add.f64 %27, fd346, fd640; +add.f64 %26, fd345, fd638; +sub.f64 %29, fd320, fd554; +sub.f64 %28, fd319, fd553; +sub.f64 %31, fd324, fd585; +sub.f64 %30, fd323, fd583; +sub.f64 %33, fd328, fd590; +sub.f64 %32, fd327, fd588; +sub.f64 %35, fd332, fd595; +sub.f64 %34, fd331, fd593; +sub.f64 %37, fd336, fd600; +sub.f64 %36, fd335, fd598; +sub.f64 %39, fd340, fd605; +sub.f64 %38, fd339, fd603; +sub.f64 %41, fd344, fd610; +sub.f64 %40, fd343, fd608; +add.f64 %43, fd322, fd555; +sub.f64 %42, fd321, fd556; +sub.f64 %45, fd326, fd615; +sub.f64 %44, fd325, fd613; +sub.f64 %47, fd330, fd620; +sub.f64 %46, fd329, fd618; +sub.f64 %49, fd334, fd625; +sub.f64 %48, fd333, fd623; +sub.f64 %51, fd338, fd630; +sub.f64 %50, fd337, fd628; +sub.f64 %53, fd342, fd635; +sub.f64 %52, fd341, fd633; +sub.f64 %55, fd346, fd640; +sub.f64 %54, fd345, fd638; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), 
"=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y)); +}; + + +#endif 
diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9cc267a49e030 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_28_fp64_inv.hpp.inc @@ -0,0 +1,584 @@ +#ifndef CUFFTDX_FFT_28_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_28_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<587, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<681>; +.reg .b64 rd<2>; +add.f64 fd113, %66, %120; +add.f64 fd114, %56, fd113; +add.f64 fd115, %77, %109; +add.f64 fd116, fd115, fd114; +add.f64 fd117, %88, %98; +add.f64 fd118, fd117, fd116; +add.f64 fd119, %68, %121; +add.f64 fd120, %57, fd119; +add.f64 fd121, %79, %111; +add.f64 fd122, fd121, fd120; +add.f64 fd123, %89, %100; +add.f64 fd124, fd123, fd122; +fma.rn.f64 fd125, fd113, 0d3FE3F3A0E28BEDD1, %56; +mul.f64 fd126, fd115, 0d3FCC7B90E3024582; +sub.f64 fd127, fd125, fd126; +mul.f64 fd128, fd117, 0d3FECD4BCA9CB5C71; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, %68, %121; +mul.f64 fd131, fd130, 0d3FE904C37505DE4B; +sub.f64 fd132, %79, %111; +fma.rn.f64 fd133, fd132, 0d3FEF329C0558E969, fd131; +sub.f64 fd134, %89, %100; +fma.rn.f64 fd135, fd134, 0d3FDBC4C04D71ABC1, fd133; +sub.f64 fd136, fd129, fd135; +add.f64 fd137, fd135, fd129; +mul.f64 fd138, fd113, 0d3FCC7B90E3024582; +sub.f64 fd139, %56, fd138; +mul.f64 fd140, fd115, 0d3FECD4BCA9CB5C71; +sub.f64 fd141, fd139, fd140; +fma.rn.f64 fd142, fd117, 0d3FE3F3A0E28BEDD1, fd141; +mul.f64 fd143, fd130, 0d3FEF329C0558E969; +mul.f64 fd144, fd132, 0d3FDBC4C04D71ABC1; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd134, 0d3FE904C37505DE4B; +sub.f64 fd147, fd145, fd146; +sub.f64 fd148, fd142, 
fd147; +add.f64 fd149, fd147, fd142; +mul.f64 fd150, fd113, 0d3FECD4BCA9CB5C71; +sub.f64 fd151, %56, fd150; +fma.rn.f64 fd152, fd115, 0d3FE3F3A0E28BEDD1, fd151; +mul.f64 fd153, fd117, 0d3FCC7B90E3024582; +sub.f64 fd154, fd152, fd153; +mul.f64 fd155, fd130, 0d3FDBC4C04D71ABC1; +mul.f64 fd156, fd132, 0d3FE904C37505DE4B; +sub.f64 fd157, fd155, fd156; +fma.rn.f64 fd158, fd134, 0d3FEF329C0558E969, fd157; +sub.f64 fd159, fd154, fd158; +add.f64 fd160, fd158, fd154; +fma.rn.f64 fd161, fd119, 0d3FE3F3A0E28BEDD1, %57; +mul.f64 fd162, fd121, 0d3FCC7B90E3024582; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd123, 0d3FECD4BCA9CB5C71; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, %66, %120; +mul.f64 fd167, fd166, 0d3FE904C37505DE4B; +sub.f64 fd168, %77, %109; +fma.rn.f64 fd169, fd168, 0d3FEF329C0558E969, fd167; +sub.f64 fd170, %88, %98; +fma.rn.f64 fd171, fd170, 0d3FDBC4C04D71ABC1, fd169; +add.f64 fd172, fd171, fd165; +sub.f64 fd173, fd165, fd171; +mul.f64 fd174, fd119, 0d3FCC7B90E3024582; +sub.f64 fd175, %57, fd174; +mul.f64 fd176, fd121, 0d3FECD4BCA9CB5C71; +sub.f64 fd177, fd175, fd176; +fma.rn.f64 fd178, fd123, 0d3FE3F3A0E28BEDD1, fd177; +mul.f64 fd179, fd166, 0d3FEF329C0558E969; +mul.f64 fd180, fd168, 0d3FDBC4C04D71ABC1; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd170, 0d3FE904C37505DE4B; +sub.f64 fd183, fd181, fd182; +add.f64 fd184, fd183, fd178; +sub.f64 fd185, fd178, fd183; +mul.f64 fd186, fd119, 0d3FECD4BCA9CB5C71; +sub.f64 fd187, %57, fd186; +fma.rn.f64 fd188, fd121, 0d3FE3F3A0E28BEDD1, fd187; +mul.f64 fd189, fd123, 0d3FCC7B90E3024582; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd166, 0d3FDBC4C04D71ABC1; +mul.f64 fd192, fd168, 0d3FE904C37505DE4B; +sub.f64 fd193, fd191, fd192; +fma.rn.f64 fd194, fd170, 0d3FEF329C0558E969, fd193; +add.f64 fd195, fd194, fd190; +sub.f64 fd196, fd190, fd194; +add.f64 fd197, %72, %125; +add.f64 fd198, %61, fd197; +add.f64 fd199, %82, %114; +add.f64 fd200, fd199, fd198; +add.f64 fd201, %93, %104; +add.f64 fd202, fd201, fd200; +add.f64 
fd203, %73, %127; +add.f64 fd204, %63, fd203; +add.f64 fd205, %84, %116; +add.f64 fd206, fd205, fd204; +add.f64 fd207, %95, %105; +add.f64 fd208, fd207, fd206; +fma.rn.f64 fd209, fd197, 0d3FE3F3A0E28BEDD1, %61; +mul.f64 fd210, fd199, 0d3FCC7B90E3024582; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd201, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +sub.f64 fd214, %73, %127; +mul.f64 fd215, fd214, 0d3FE904C37505DE4B; +sub.f64 fd216, %84, %116; +fma.rn.f64 fd217, fd216, 0d3FEF329C0558E969, fd215; +sub.f64 fd218, %95, %105; +fma.rn.f64 fd219, fd218, 0d3FDBC4C04D71ABC1, fd217; +sub.f64 fd220, fd213, fd219; +add.f64 fd221, fd219, fd213; +mul.f64 fd222, fd197, 0d3FCC7B90E3024582; +sub.f64 fd223, %61, fd222; +mul.f64 fd224, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd225, fd223, fd224; +fma.rn.f64 fd226, fd201, 0d3FE3F3A0E28BEDD1, fd225; +mul.f64 fd227, fd214, 0d3FEF329C0558E969; +mul.f64 fd228, fd216, 0d3FDBC4C04D71ABC1; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd218, 0d3FE904C37505DE4B; +sub.f64 fd231, fd229, fd230; +sub.f64 fd232, fd226, fd231; +add.f64 fd233, fd231, fd226; +mul.f64 fd234, fd197, 0d3FECD4BCA9CB5C71; +sub.f64 fd235, %61, fd234; +fma.rn.f64 fd236, fd199, 0d3FE3F3A0E28BEDD1, fd235; +mul.f64 fd237, fd201, 0d3FCC7B90E3024582; +sub.f64 fd238, fd236, fd237; +mul.f64 fd239, fd214, 0d3FDBC4C04D71ABC1; +mul.f64 fd240, fd216, 0d3FE904C37505DE4B; +sub.f64 fd241, fd239, fd240; +fma.rn.f64 fd242, fd218, 0d3FEF329C0558E969, fd241; +sub.f64 fd243, fd238, fd242; +add.f64 fd244, fd242, fd238; +fma.rn.f64 fd245, fd203, 0d3FE3F3A0E28BEDD1, %63; +mul.f64 fd246, fd205, 0d3FCC7B90E3024582; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd207, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, %72, %125; +mul.f64 fd251, fd250, 0d3FE904C37505DE4B; +sub.f64 fd252, %82, %114; +fma.rn.f64 fd253, fd252, 0d3FEF329C0558E969, fd251; +sub.f64 fd254, %93, %104; +fma.rn.f64 fd255, fd254, 0d3FDBC4C04D71ABC1, fd253; +add.f64 fd256, fd255, fd249; +sub.f64 fd257, fd249, 
fd255; +mul.f64 fd258, fd203, 0d3FCC7B90E3024582; +sub.f64 fd259, %63, fd258; +mul.f64 fd260, fd205, 0d3FECD4BCA9CB5C71; +sub.f64 fd261, fd259, fd260; +fma.rn.f64 fd262, fd207, 0d3FE3F3A0E28BEDD1, fd261; +mul.f64 fd263, fd250, 0d3FEF329C0558E969; +mul.f64 fd264, fd252, 0d3FDBC4C04D71ABC1; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd254, 0d3FE904C37505DE4B; +sub.f64 fd267, fd265, fd266; +add.f64 fd268, fd267, fd262; +sub.f64 fd269, fd262, fd267; +mul.f64 fd270, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd271, %63, fd270; +fma.rn.f64 fd272, fd205, 0d3FE3F3A0E28BEDD1, fd271; +mul.f64 fd273, fd207, 0d3FCC7B90E3024582; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd250, 0d3FDBC4C04D71ABC1; +mul.f64 fd276, fd252, 0d3FE904C37505DE4B; +sub.f64 fd277, fd275, fd276; +fma.rn.f64 fd278, fd254, 0d3FEF329C0558E969, fd277; +add.f64 fd279, fd278, fd274; +sub.f64 fd280, fd274, fd278; +mul.f64 fd281, fd220, 0d3FECD4BCA9CB5C71; +mul.f64 fd282, fd256, 0d3FDBC4C04D71ABC1; +sub.f64 fd283, fd281, fd282; +mul.f64 fd284, fd256, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd285, fd220, 0d3FDBC4C04D71ABC1, fd284; +mul.f64 fd286, fd232, 0d3FE3F3A0E28BEDD1; +mul.f64 fd287, fd268, 0d3FE904C37505DE4B; +sub.f64 fd288, fd286, fd287; +mul.f64 fd289, fd268, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd290, fd232, 0d3FE904C37505DE4B, fd289; +mul.f64 fd291, fd243, 0d3FCC7B90E3024582; +mul.f64 fd292, fd279, 0d3FEF329C0558E969; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd279, 0d3FCC7B90E3024582; +fma.rn.f64 fd295, fd243, 0d3FEF329C0558E969, fd294; +mul.f64 fd296, fd244, 0dBFCC7B90E3024582; +mul.f64 fd297, fd280, 0d3FEF329C0558E969; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd280, 0dBFCC7B90E3024582; +fma.rn.f64 fd300, fd244, 0d3FEF329C0558E969, fd299; +mul.f64 fd301, fd233, 0dBFE3F3A0E28BEDD1; +mul.f64 fd302, fd269, 0d3FE904C37505DE4B; +sub.f64 fd303, fd301, fd302; +mul.f64 fd304, fd269, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd305, fd233, 0d3FE904C37505DE4B, fd304; +mul.f64 fd306, fd221, 0dBFECD4BCA9CB5C71; +mul.f64 fd307, 
fd257, 0d3FDBC4C04D71ABC1; +sub.f64 fd308, fd306, fd307; +mul.f64 fd309, fd257, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd310, fd221, 0d3FDBC4C04D71ABC1, fd309; +add.f64 fd311, fd118, fd202; +add.f64 fd312, fd124, fd208; +sub.f64 fd313, fd118, fd202; +sub.f64 fd314, fd124, fd208; +add.f64 fd315, fd136, fd283; +add.f64 fd316, fd172, fd285; +sub.f64 fd317, fd136, fd283; +sub.f64 fd318, fd172, fd285; +add.f64 fd319, fd148, fd288; +add.f64 fd320, fd184, fd290; +sub.f64 fd321, fd148, fd288; +sub.f64 fd322, fd184, fd290; +add.f64 fd323, fd159, fd293; +add.f64 fd324, fd195, fd295; +sub.f64 fd325, fd159, fd293; +sub.f64 fd326, fd195, fd295; +add.f64 fd327, fd160, fd298; +add.f64 fd328, fd196, fd300; +sub.f64 fd329, fd160, fd298; +sub.f64 fd330, fd196, fd300; +add.f64 fd331, fd149, fd303; +add.f64 fd332, fd185, fd305; +sub.f64 fd333, fd149, fd303; +sub.f64 fd334, fd185, fd305; +add.f64 fd335, fd137, fd308; +add.f64 fd336, fd173, fd310; +sub.f64 fd337, fd137, fd308; +sub.f64 fd338, fd173, fd310; +add.f64 fd339, %69, %122; +add.f64 fd340, %58, fd339; +add.f64 fd341, %80, %112; +add.f64 fd342, fd341, fd340; +add.f64 fd343, %90, %101; +add.f64 fd344, fd343, fd342; +add.f64 fd345, %71, %124; +add.f64 fd346, %60, fd345; +add.f64 fd347, %81, %113; +add.f64 fd348, fd347, fd346; +add.f64 fd349, %92, %103; +add.f64 fd350, fd349, fd348; +fma.rn.f64 fd351, fd339, 0d3FE3F3A0E28BEDD1, %58; +mul.f64 fd352, fd341, 0d3FCC7B90E3024582; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd343, 0d3FECD4BCA9CB5C71; +sub.f64 fd355, fd353, fd354; +sub.f64 fd356, %71, %124; +mul.f64 fd357, fd356, 0d3FE904C37505DE4B; +sub.f64 fd358, %81, %113; +fma.rn.f64 fd359, fd358, 0d3FEF329C0558E969, fd357; +sub.f64 fd360, %92, %103; +fma.rn.f64 fd361, fd360, 0d3FDBC4C04D71ABC1, fd359; +sub.f64 fd362, fd355, fd361; +add.f64 fd363, fd361, fd355; +mul.f64 fd364, fd339, 0d3FCC7B90E3024582; +sub.f64 fd365, %58, fd364; +mul.f64 fd366, fd341, 0d3FECD4BCA9CB5C71; +sub.f64 fd367, fd365, fd366; +fma.rn.f64 fd368, fd343, 
0d3FE3F3A0E28BEDD1, fd367; +mul.f64 fd369, fd356, 0d3FEF329C0558E969; +mul.f64 fd370, fd358, 0d3FDBC4C04D71ABC1; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd360, 0d3FE904C37505DE4B; +sub.f64 fd373, fd371, fd372; +sub.f64 fd374, fd368, fd373; +add.f64 fd375, fd373, fd368; +mul.f64 fd376, fd339, 0d3FECD4BCA9CB5C71; +sub.f64 fd377, %58, fd376; +fma.rn.f64 fd378, fd341, 0d3FE3F3A0E28BEDD1, fd377; +mul.f64 fd379, fd343, 0d3FCC7B90E3024582; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd356, 0d3FDBC4C04D71ABC1; +mul.f64 fd382, fd358, 0d3FE904C37505DE4B; +sub.f64 fd383, fd381, fd382; +fma.rn.f64 fd384, fd360, 0d3FEF329C0558E969, fd383; +sub.f64 fd385, fd380, fd384; +add.f64 fd386, fd384, fd380; +fma.rn.f64 fd387, fd345, 0d3FE3F3A0E28BEDD1, %60; +mul.f64 fd388, fd347, 0d3FCC7B90E3024582; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd349, 0d3FECD4BCA9CB5C71; +sub.f64 fd391, fd389, fd390; +sub.f64 fd392, %69, %122; +mul.f64 fd393, fd392, 0d3FE904C37505DE4B; +sub.f64 fd394, %80, %112; +fma.rn.f64 fd395, fd394, 0d3FEF329C0558E969, fd393; +sub.f64 fd396, %90, %101; +fma.rn.f64 fd397, fd396, 0d3FDBC4C04D71ABC1, fd395; +add.f64 fd398, fd397, fd391; +sub.f64 fd399, fd391, fd397; +mul.f64 fd400, fd345, 0d3FCC7B90E3024582; +sub.f64 fd401, %60, fd400; +mul.f64 fd402, fd347, 0d3FECD4BCA9CB5C71; +sub.f64 fd403, fd401, fd402; +fma.rn.f64 fd404, fd349, 0d3FE3F3A0E28BEDD1, fd403; +mul.f64 fd405, fd392, 0d3FEF329C0558E969; +mul.f64 fd406, fd394, 0d3FDBC4C04D71ABC1; +sub.f64 fd407, fd405, fd406; +mul.f64 fd408, fd396, 0d3FE904C37505DE4B; +sub.f64 fd409, fd407, fd408; +add.f64 fd410, fd409, fd404; +sub.f64 fd411, fd404, fd409; +mul.f64 fd412, fd345, 0d3FECD4BCA9CB5C71; +sub.f64 fd413, %60, fd412; +fma.rn.f64 fd414, fd347, 0d3FE3F3A0E28BEDD1, fd413; +mul.f64 fd415, fd349, 0d3FCC7B90E3024582; +sub.f64 fd416, fd414, fd415; +mul.f64 fd417, fd392, 0d3FDBC4C04D71ABC1; +mul.f64 fd418, fd394, 0d3FE904C37505DE4B; +sub.f64 fd419, fd417, fd418; +fma.rn.f64 fd420, fd396, 0d3FEF329C0558E969, 
fd419; +add.f64 fd421, fd420, fd416; +sub.f64 fd422, fd416, fd420; +add.f64 fd423, %74, %128; +add.f64 fd424, %64, fd423; +add.f64 fd425, %85, %117; +add.f64 fd426, fd425, fd424; +add.f64 fd427, %96, %106; +add.f64 fd428, fd427, fd426; +add.f64 fd429, %76, %129; +add.f64 fd430, %65, fd429; +add.f64 fd431, %87, %119; +add.f64 fd432, fd431, fd430; +add.f64 fd433, %97, %108; +add.f64 fd434, fd433, fd432; +fma.rn.f64 fd435, fd423, 0d3FE3F3A0E28BEDD1, %64; +mul.f64 fd436, fd425, 0d3FCC7B90E3024582; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd427, 0d3FECD4BCA9CB5C71; +sub.f64 fd439, fd437, fd438; +sub.f64 fd440, %76, %129; +mul.f64 fd441, fd440, 0d3FE904C37505DE4B; +sub.f64 fd442, %87, %119; +fma.rn.f64 fd443, fd442, 0d3FEF329C0558E969, fd441; +sub.f64 fd444, %97, %108; +fma.rn.f64 fd445, fd444, 0d3FDBC4C04D71ABC1, fd443; +sub.f64 fd446, fd439, fd445; +add.f64 fd447, fd445, fd439; +mul.f64 fd448, fd423, 0d3FCC7B90E3024582; +sub.f64 fd449, %64, fd448; +mul.f64 fd450, fd425, 0d3FECD4BCA9CB5C71; +sub.f64 fd451, fd449, fd450; +fma.rn.f64 fd452, fd427, 0d3FE3F3A0E28BEDD1, fd451; +mul.f64 fd453, fd440, 0d3FEF329C0558E969; +mul.f64 fd454, fd442, 0d3FDBC4C04D71ABC1; +sub.f64 fd455, fd453, fd454; +mul.f64 fd456, fd444, 0d3FE904C37505DE4B; +sub.f64 fd457, fd455, fd456; +sub.f64 fd458, fd452, fd457; +add.f64 fd459, fd457, fd452; +mul.f64 fd460, fd423, 0d3FECD4BCA9CB5C71; +sub.f64 fd461, %64, fd460; +fma.rn.f64 fd462, fd425, 0d3FE3F3A0E28BEDD1, fd461; +mul.f64 fd463, fd427, 0d3FCC7B90E3024582; +sub.f64 fd464, fd462, fd463; +mul.f64 fd465, fd440, 0d3FDBC4C04D71ABC1; +mul.f64 fd466, fd442, 0d3FE904C37505DE4B; +sub.f64 fd467, fd465, fd466; +fma.rn.f64 fd468, fd444, 0d3FEF329C0558E969, fd467; +sub.f64 fd469, fd464, fd468; +add.f64 fd470, fd468, fd464; +fma.rn.f64 fd471, fd429, 0d3FE3F3A0E28BEDD1, %65; +mul.f64 fd472, fd431, 0d3FCC7B90E3024582; +sub.f64 fd473, fd471, fd472; +mul.f64 fd474, fd433, 0d3FECD4BCA9CB5C71; +sub.f64 fd475, fd473, fd474; +sub.f64 fd476, %74, %128; +mul.f64 
fd477, fd476, 0d3FE904C37505DE4B; +sub.f64 fd478, %85, %117; +fma.rn.f64 fd479, fd478, 0d3FEF329C0558E969, fd477; +sub.f64 fd480, %96, %106; +fma.rn.f64 fd481, fd480, 0d3FDBC4C04D71ABC1, fd479; +add.f64 fd482, fd481, fd475; +sub.f64 fd483, fd475, fd481; +mul.f64 fd484, fd429, 0d3FCC7B90E3024582; +sub.f64 fd485, %65, fd484; +mul.f64 fd486, fd431, 0d3FECD4BCA9CB5C71; +sub.f64 fd487, fd485, fd486; +fma.rn.f64 fd488, fd433, 0d3FE3F3A0E28BEDD1, fd487; +mul.f64 fd489, fd476, 0d3FEF329C0558E969; +mul.f64 fd490, fd478, 0d3FDBC4C04D71ABC1; +sub.f64 fd491, fd489, fd490; +mul.f64 fd492, fd480, 0d3FE904C37505DE4B; +sub.f64 fd493, fd491, fd492; +add.f64 fd494, fd493, fd488; +sub.f64 fd495, fd488, fd493; +mul.f64 fd496, fd429, 0d3FECD4BCA9CB5C71; +sub.f64 fd497, %65, fd496; +fma.rn.f64 fd498, fd431, 0d3FE3F3A0E28BEDD1, fd497; +mul.f64 fd499, fd433, 0d3FCC7B90E3024582; +sub.f64 fd500, fd498, fd499; +mul.f64 fd501, fd476, 0d3FDBC4C04D71ABC1; +mul.f64 fd502, fd478, 0d3FE904C37505DE4B; +sub.f64 fd503, fd501, fd502; +fma.rn.f64 fd504, fd480, 0d3FEF329C0558E969, fd503; +add.f64 fd505, fd504, fd500; +sub.f64 fd506, fd500, fd504; +mul.f64 fd507, fd446, 0d3FECD4BCA9CB5C71; +mul.f64 fd508, fd482, 0d3FDBC4C04D71ABC1; +sub.f64 fd509, fd507, fd508; +mul.f64 fd510, fd482, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd511, fd446, 0d3FDBC4C04D71ABC1, fd510; +mul.f64 fd512, fd458, 0d3FE3F3A0E28BEDD1; +mul.f64 fd513, fd494, 0d3FE904C37505DE4B; +sub.f64 fd514, fd512, fd513; +mul.f64 fd515, fd494, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd516, fd458, 0d3FE904C37505DE4B, fd515; +mul.f64 fd517, fd469, 0d3FCC7B90E3024582; +mul.f64 fd518, fd505, 0d3FEF329C0558E969; +sub.f64 fd519, fd517, fd518; +mul.f64 fd520, fd505, 0d3FCC7B90E3024582; +fma.rn.f64 fd521, fd469, 0d3FEF329C0558E969, fd520; +mul.f64 fd522, fd470, 0dBFCC7B90E3024582; +mul.f64 fd523, fd506, 0d3FEF329C0558E969; +sub.f64 fd524, fd522, fd523; +mul.f64 fd525, fd506, 0dBFCC7B90E3024582; +fma.rn.f64 fd526, fd470, 0d3FEF329C0558E969, fd525; +mul.f64 fd527, fd459, 
0dBFE3F3A0E28BEDD1; +mul.f64 fd528, fd495, 0d3FE904C37505DE4B; +sub.f64 fd529, fd527, fd528; +mul.f64 fd530, fd495, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd531, fd459, 0d3FE904C37505DE4B, fd530; +mul.f64 fd532, fd447, 0dBFECD4BCA9CB5C71; +mul.f64 fd533, fd483, 0d3FDBC4C04D71ABC1; +sub.f64 fd534, fd532, fd533; +mul.f64 fd535, fd483, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd536, fd447, 0d3FDBC4C04D71ABC1, fd535; +add.f64 fd537, fd344, fd428; +add.f64 fd538, fd350, fd434; +sub.f64 fd539, fd344, fd428; +sub.f64 fd540, fd350, fd434; +add.f64 fd541, fd362, fd509; +add.f64 fd542, fd398, fd511; +sub.f64 fd543, fd362, fd509; +sub.f64 fd544, fd398, fd511; +add.f64 fd545, fd374, fd514; +add.f64 fd546, fd410, fd516; +sub.f64 fd547, fd374, fd514; +sub.f64 fd548, fd410, fd516; +add.f64 fd549, fd385, fd519; +add.f64 fd550, fd421, fd521; +sub.f64 fd551, fd385, fd519; +sub.f64 fd552, fd421, fd521; +add.f64 fd553, fd386, fd524; +add.f64 fd554, fd422, fd526; +sub.f64 fd555, fd386, fd524; +sub.f64 fd556, fd422, fd526; +add.f64 fd557, fd375, fd529; +add.f64 fd558, fd411, fd531; +sub.f64 fd559, fd375, fd529; +sub.f64 fd560, fd411, fd531; +add.f64 fd561, fd363, fd534; +add.f64 fd562, fd399, fd536; +sub.f64 fd563, fd363, fd534; +sub.f64 fd564, fd399, fd536; +mul.f64 fd565, fd541, 0d3FEF329C0558E969; +mul.f64 fd566, fd542, 0d3FCC7B90E3024582; +sub.f64 fd567, fd565, fd566; +mul.f64 fd568, fd542, 0d3FEF329C0558E969; +fma.rn.f64 fd569, fd541, 0d3FCC7B90E3024582, fd568; +mul.f64 fd570, fd545, 0d3FECD4BCA9CB5C71; +mul.f64 fd571, fd546, 0d3FDBC4C04D71ABC1; +sub.f64 fd572, fd570, fd571; +mul.f64 fd573, fd546, 0d3FECD4BCA9CB5C71; +fma.rn.f64 fd574, fd545, 0d3FDBC4C04D71ABC1, fd573; +mul.f64 fd575, fd549, 0d3FE904C37505DE4B; +mul.f64 fd576, fd550, 0d3FE3F3A0E28BEDD1; +sub.f64 fd577, fd575, fd576; +mul.f64 fd578, fd550, 0d3FE904C37505DE4B; +fma.rn.f64 fd579, fd549, 0d3FE3F3A0E28BEDD1, fd578; +mul.f64 fd580, fd553, 0d3FE3F3A0E28BEDD1; +mul.f64 fd581, fd554, 0d3FE904C37505DE4B; +sub.f64 fd582, fd580, fd581; 
+mul.f64 fd583, fd554, 0d3FE3F3A0E28BEDD1; +fma.rn.f64 fd584, fd553, 0d3FE904C37505DE4B, fd583; +mul.f64 fd585, fd557, 0d3FDBC4C04D71ABC1; +mul.f64 fd586, fd558, 0d3FECD4BCA9CB5C71; +sub.f64 fd587, fd585, fd586; +mul.f64 fd588, fd558, 0d3FDBC4C04D71ABC1; +fma.rn.f64 fd589, fd557, 0d3FECD4BCA9CB5C71, fd588; +mul.f64 fd590, fd561, 0d3FCC7B90E3024582; +mul.f64 fd591, fd562, 0d3FEF329C0558E969; +sub.f64 fd592, fd590, fd591; +mul.f64 fd593, fd562, 0d3FCC7B90E3024582; +fma.rn.f64 fd594, fd561, 0d3FEF329C0558E969, fd593; +mul.f64 fd595, fd543, 0dBFCC7B90E3024582; +mul.f64 fd596, fd544, 0d3FEF329C0558E969; +sub.f64 fd597, fd595, fd596; +mul.f64 fd598, fd544, 0dBFCC7B90E3024582; +fma.rn.f64 fd599, fd543, 0d3FEF329C0558E969, fd598; +mul.f64 fd600, fd547, 0dBFDBC4C04D71ABC1; +mul.f64 fd601, fd548, 0d3FECD4BCA9CB5C71; +sub.f64 fd602, fd600, fd601; +mul.f64 fd603, fd548, 0dBFDBC4C04D71ABC1; +fma.rn.f64 fd604, fd547, 0d3FECD4BCA9CB5C71, fd603; +mul.f64 fd605, fd551, 0dBFE3F3A0E28BEDD1; +mul.f64 fd606, fd552, 0d3FE904C37505DE4B; +sub.f64 fd607, fd605, fd606; +mul.f64 fd608, fd552, 0dBFE3F3A0E28BEDD1; +fma.rn.f64 fd609, fd551, 0d3FE904C37505DE4B, fd608; +mul.f64 fd610, fd555, 0dBFE904C37505DE4B; +mul.f64 fd611, fd556, 0d3FE3F3A0E28BEDD1; +sub.f64 fd612, fd610, fd611; +mul.f64 fd613, fd556, 0dBFE904C37505DE4B; +fma.rn.f64 fd614, fd555, 0d3FE3F3A0E28BEDD1, fd613; +mul.f64 fd615, fd559, 0dBFECD4BCA9CB5C71; +mul.f64 fd616, fd560, 0d3FDBC4C04D71ABC1; +sub.f64 fd617, fd615, fd616; +mul.f64 fd618, fd560, 0dBFECD4BCA9CB5C71; +fma.rn.f64 fd619, fd559, 0d3FDBC4C04D71ABC1, fd618; +mul.f64 fd620, fd563, 0dBFEF329C0558E969; +mul.f64 fd621, fd564, 0d3FCC7B90E3024582; +sub.f64 fd622, fd620, fd621; +mul.f64 fd623, fd564, 0dBFEF329C0558E969; +fma.rn.f64 fd624, fd563, 0d3FCC7B90E3024582, fd623; +add.f64 %1, fd312, fd538; +add.f64 %0, fd311, fd537; +add.f64 %3, fd316, fd569; +add.f64 %2, fd315, fd567; +add.f64 %5, fd320, fd574; +add.f64 %4, fd319, fd572; +add.f64 %7, fd324, fd579; +add.f64 %6, 
fd323, fd577; +add.f64 %9, fd328, fd584; +add.f64 %8, fd327, fd582; +add.f64 %11, fd332, fd589; +add.f64 %10, fd331, fd587; +add.f64 %13, fd336, fd594; +add.f64 %12, fd335, fd592; +add.f64 %15, fd314, fd539; +sub.f64 %14, fd313, fd540; +add.f64 %17, fd318, fd599; +add.f64 %16, fd317, fd597; +add.f64 %19, fd322, fd604; +add.f64 %18, fd321, fd602; +add.f64 %21, fd326, fd609; +add.f64 %20, fd325, fd607; +add.f64 %23, fd330, fd614; +add.f64 %22, fd329, fd612; +add.f64 %25, fd334, fd619; +add.f64 %24, fd333, fd617; +add.f64 %27, fd338, fd624; +add.f64 %26, fd337, fd622; +sub.f64 %29, fd312, fd538; +sub.f64 %28, fd311, fd537; +sub.f64 %31, fd316, fd569; +sub.f64 %30, fd315, fd567; +sub.f64 %33, fd320, fd574; +sub.f64 %32, fd319, fd572; +sub.f64 %35, fd324, fd579; +sub.f64 %34, fd323, fd577; +sub.f64 %37, fd328, fd584; +sub.f64 %36, fd327, fd582; +sub.f64 %39, fd332, fd589; +sub.f64 %38, fd331, fd587; +sub.f64 %41, fd336, fd594; +sub.f64 %40, fd335, fd592; +sub.f64 %43, fd314, fd539; +add.f64 %42, fd313, fd540; +sub.f64 %45, fd318, fd599; +sub.f64 %44, fd317, fd597; +sub.f64 %47, fd322, fd604; +sub.f64 %46, fd321, fd602; +sub.f64 %49, fd326, fd609; +sub.f64 %48, fd325, fd607; +sub.f64 %51, fd330, fd614; +sub.f64 %50, fd329, fd612; +sub.f64 %53, fd334, fd619; +sub.f64 %52, fd333, fd617; +sub.f64 %55, fd338, fd624; +sub.f64 %54, fd337, fd622; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), 
"=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..be5e6224c2bcc --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_fwd.hpp.inc @@ -0,0 +1,8377 @@ +#ifndef CUFFTDX_FFT_29_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_29_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<759, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<813>; +.reg .b32 r<5153>; +.reg .f64 fd<785>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %60, %114; +} +{ +add.f16x2 r4, %61, %115; +} +{ +sub.f16x2 r7, %60, %114; +} +{ +sub.f16x2 r10, %61, %115; +} +{ +add.f16x2 r13, %62, %112; +} +{ +add.f16x2 r16, %63, %113; +} +{ +sub.f16x2 r19, %62, %112; +} +{ +sub.f16x2 r22, %63, %113; +} +{ +add.f16x2 r25, %64, %110; +} +{ +add.f16x2 r28, %65, %111; +} +{ +sub.f16x2 r31, %64, %110; +} +{ +sub.f16x2 r34, %65, %111; +} +{ +add.f16x2 r37, %66, %108; +} +{ +add.f16x2 r40, %67, %109; +} +{ +sub.f16x2 r43, %66, %108; +} +{ +sub.f16x2 r46, %67, %109; +} +{ +add.f16x2 r49, %68, %106; +} +{ +add.f16x2 r52, %69, %107; +} +{ +sub.f16x2 r55, %68, %106; +} +{ +sub.f16x2 r58, %69, %107; +} +{ +add.f16x2 r61, %70, %104; +} +{ +add.f16x2 r64, %71, %105; +} +{ +sub.f16x2 r67, %70, %104; +} +{ +sub.f16x2 r70, %71, %105; +} +{ +add.f16x2 r73, %72, %102; +} +{ +add.f16x2 r76, %73, %103; +} +{ +sub.f16x2 r79, %72, %102; +} +{ +sub.f16x2 r82, %73, %103; +} +{ +add.f16x2 r85, %74, %100; +} +{ +add.f16x2 r88, %75, %101; +} +{ +sub.f16x2 r91, %74, %100; +} +{ +sub.f16x2 r94, %75, %101; +} +{ +add.f16x2 r97, %76, %98; +} +{ +add.f16x2 r100, %77, %99; +} +{ +sub.f16x2 r103, %76, %98; +} +{ +sub.f16x2 r106, %77, %99; +} +{ +add.f16x2 r109, %78, %96; +} +{ +add.f16x2 r112, %79, %97; +} +{ +sub.f16x2 r115, %78, %96; +} +{ +sub.f16x2 r118, %79, %97; +} +{ +add.f16x2 r121, %80, %94; +} +{ +add.f16x2 r124, %81, %95; +} +{ +sub.f16x2 r127, %80, %94; +} +{ +sub.f16x2 r130, %81, %95; +} +{ +add.f16x2 r133, %82, %92; +} +{ +add.f16x2 r136, %83, %93; +} +{ 
+sub.f16x2 r139, %82, %92; +} +{ +sub.f16x2 r142, %83, %93; +} +{ +add.f16x2 r145, %84, %90; +} +{ +add.f16x2 r148, %85, %91; +} +{ +sub.f16x2 r151, %84, %90; +} +{ +sub.f16x2 r154, %85, %91; +} +{ +add.f16x2 r157, %86, %88; +} +{ +add.f16x2 r160, %87, %89; +} +{ +sub.f16x2 r163, %86, %88; +} +{ +sub.f16x2 r166, %87, %89; +} +{ +add.f16x2 r169, %58, r1; +} +{ +add.f16x2 r172, %59, r4; +} +{ +add.f16x2 r175, r169, r13; +} +{ +add.f16x2 r178, r172, r16; +} +{ +add.f16x2 r181, r175, r25; +} +{ +add.f16x2 r184, r178, r28; +} +{ +add.f16x2 r187, r181, r37; +} +{ +add.f16x2 r190, r184, r40; +} +{ +add.f16x2 r193, r187, r49; +} +{ +add.f16x2 r196, r190, r52; +} +{ +add.f16x2 r199, r193, r61; +} +{ +add.f16x2 r202, r196, r64; +} +{ +add.f16x2 r205, r199, r73; +} +{ +add.f16x2 r208, r202, r76; +} +{ +add.f16x2 r211, r205, r85; +} +{ +add.f16x2 r214, r208, r88; +} +{ +add.f16x2 r217, r211, r97; +} +{ +add.f16x2 r220, r214, r100; +} +{ +add.f16x2 r223, r217, r109; +} +{ +add.f16x2 r226, r220, r112; +} +{ +add.f16x2 r229, r223, r121; +} +{ +add.f16x2 r232, r226, r124; +} +{ +add.f16x2 r235, r229, r133; +} +{ +add.f16x2 r238, r232, r136; +} +{ +add.f16x2 r241, r235, r145; +} +{ +add.f16x2 r244, r238, r148; +} +{ +add.f16x2 %0, r241, r157; +} +{ +add.f16x2 %1, r244, r160; +} +mov.u32 r4804, 0; +cvt.rn.f16.s32 rs1, r4804; +mov.b32 r265, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r4804; +mov.b32 r277, {rs2, rs2}; +mov.f64 fd735, 0d3FEF4079C06C0992; +{ +cvt.rn.f16.f64 rs3, fd735; +} +mov.b32 r257, {rs3, rs3}; +{ +mul.f16x2 r255, r1, r257; +} +{ +add.f16x2 r258, %58, r255; +} +mov.f64 fd708, 0dBFCB8426C12812BC; +{ +cvt.rn.f16.f64 rs4, fd708; +} +mov.b32 r263, {rs4, rs4}; +{ +mul.f16x2 r261, r10, r263; +} +{ +add.f16x2 r264, r265, r261; +} +{ +cvt.rn.f16.f64 rs5, fd735; +} +mov.b32 r269, {rs5, rs5}; +{ +mul.f16x2 r267, r4, r269; +} +{ +add.f16x2 r270, %59, r267; +} +{ +cvt.rn.f16.f64 rs6, fd708; +} +mov.b32 r275, {rs6, rs6}; +{ +mul.f16x2 r273, r7, r275; +} +{ +add.f16x2 r276, r277, r273; +} 
+mov.f64 fd743, 0d3FED0ADB9B447CCF; +{ +cvt.rn.f16.f64 rs7, fd743; +} +mov.b32 r281, {rs7, rs7}; +{ +mul.f16x2 r279, r13, r281; +} +{ +add.f16x2 r282, r258, r279; +} +mov.f64 fd636, 0dBFDADF7689C97B70; +{ +cvt.rn.f16.f64 rs8, fd636; +} +mov.b32 r287, {rs8, rs8}; +{ +mul.f16x2 r285, r22, r287; +} +{ +add.f16x2 r288, r264, r285; +} +{ +cvt.rn.f16.f64 rs9, fd743; +} +mov.b32 r293, {rs9, rs9}; +{ +mul.f16x2 r291, r16, r293; +} +{ +add.f16x2 r294, r270, r291; +} +{ +cvt.rn.f16.f64 rs10, fd636; +} +mov.b32 r299, {rs10, rs10}; +{ +mul.f16x2 r297, r19, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd751, 0d3FE979982A38E65A; +{ +cvt.rn.f16.f64 rs11, fd751; +} +mov.b32 r305, {rs11, rs11}; +{ +mul.f16x2 r303, r25, r305; +} +{ +add.f16x2 r306, r282, r303; +} +mov.f64 fd540, 0dBFE35D9650D47852; +{ +cvt.rn.f16.f64 rs12, fd540; +} +mov.b32 r311, {rs12, rs12}; +{ +mul.f16x2 r309, r34, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs13, fd751; +} +mov.b32 r317, {rs13, rs13}; +{ +mul.f16x2 r315, r28, r317; +} +{ +add.f16x2 r318, r294, r315; +} +{ +cvt.rn.f16.f64 rs14, fd540; +} +mov.b32 r323, {rs14, rs14}; +{ +mul.f16x2 r321, r31, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd759, 0d3FE4B76371208A62; +{ +cvt.rn.f16.f64 rs15, fd759; +} +mov.b32 r329, {rs15, rs15}; +{ +mul.f16x2 r327, r37, r329; +} +{ +add.f16x2 r330, r306, r327; +} +mov.f64 fd700, 0dBFE863A1ADA0CFA6; +{ +cvt.rn.f16.f64 rs16, fd700; +} +mov.b32 r335, {rs16, rs16}; +{ +mul.f16x2 r333, r46, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs17, fd759; +} +mov.b32 r341, {rs17, rs17}; +{ +mul.f16x2 r339, r40, r341; +} +{ +add.f16x2 r342, r318, r339; +} +{ +cvt.rn.f16.f64 rs18, fd700; +} +mov.b32 r347, {rs18, rs18}; +{ +mul.f16x2 r345, r43, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd767, 0d3FDDFA67657E7608; +{ +cvt.rn.f16.f64 rs19, fd767; +} +mov.b32 r353, {rs19, rs19}; +{ +mul.f16x2 r351, r49, r353; +} +{ +add.f16x2 r354, r330, r351; +} +mov.f64 fd604, 
0dBFEC45BB0D10918C; +{ +cvt.rn.f16.f64 rs20, fd604; +} +mov.b32 r359, {rs20, rs20}; +{ +mul.f16x2 r357, r58, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs21, fd767; +} +mov.b32 r365, {rs21, rs21}; +{ +mul.f16x2 r363, r52, r365; +} +{ +add.f16x2 r366, r342, r363; +} +{ +cvt.rn.f16.f64 rs22, fd604; +} +mov.b32 r371, {rs22, rs22}; +{ +mul.f16x2 r369, r55, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd775, 0d3FD11F2F2E2F1E3B; +{ +cvt.rn.f16.f64 rs23, fd775; +} +mov.b32 r377, {rs23, rs23}; +{ +mul.f16x2 r375, r61, r377; +} +{ +add.f16x2 r378, r354, r375; +} +mov.f64 fd424, 0dBFEED566CB3DCBA1; +{ +cvt.rn.f16.f64 rs24, fd424; +} +mov.b32 r383, {rs24, rs24}; +{ +mul.f16x2 r381, r70, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs25, fd775; +} +mov.b32 r389, {rs25, rs25}; +{ +mul.f16x2 r387, r64, r389; +} +{ +add.f16x2 r390, r366, r387; +} +{ +cvt.rn.f16.f64 rs26, fd424; +} +mov.b32 r395, {rs26, rs26}; +{ +mul.f16x2 r393, r67, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd783, 0d3FABB81853A18977; +{ +cvt.rn.f16.f64 rs27, fd783; +} +mov.b32 r401, {rs27, rs27}; +{ +mul.f16x2 r399, r73, r401; +} +{ +add.f16x2 r402, r378, r399; +} +mov.f64 fd692, 0dBFEFF3FC588E859D; +{ +cvt.rn.f16.f64 rs28, fd692; +} +mov.b32 r407, {rs28, rs28}; +{ +mul.f16x2 r405, r82, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs29, fd783; +} +mov.b32 r413, {rs29, rs29}; +{ +mul.f16x2 r411, r76, r413; +} +{ +add.f16x2 r414, r390, r411; +} +{ +cvt.rn.f16.f64 rs30, fd692; +} +mov.b32 r419, {rs30, rs30}; +{ +mul.f16x2 r417, r79, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 fd779, 0dBFC4B545C0234A71; +{ +cvt.rn.f16.f64 rs31, fd779; +} +mov.b32 r425, {rs31, rs31}; +{ +mul.f16x2 r423, r85, r425; +} +{ +add.f16x2 r426, r402, r423; +} +mov.f64 fd780, 0dBFEF941537248537; +{ +cvt.rn.f16.f64 rs32, fd780; +} +mov.b32 r431, {rs32, rs32}; +{ +mul.f16x2 r429, r94, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs33, fd779; +} 
+mov.b32 r437, {rs33, rs33}; +{ +mul.f16x2 r435, r88, r437; +} +{ +add.f16x2 r438, r414, r435; +} +{ +cvt.rn.f16.f64 rs34, fd780; +} +mov.b32 r443, {rs34, rs34}; +{ +mul.f16x2 r441, r91, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd771, 0dBFD7B057F20BF2E4; +{ +cvt.rn.f16.f64 rs35, fd771; +} +mov.b32 r449, {rs35, rs35}; +{ +mul.f16x2 r447, r97, r449; +} +{ +add.f16x2 r450, r426, r447; +} +mov.f64 fd772, 0dBFEDBA2D62CB789F; +{ +cvt.rn.f16.f64 rs36, fd772; +} +mov.b32 r455, {rs36, rs36}; +{ +mul.f16x2 r453, r106, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs37, fd771; +} +mov.b32 r461, {rs37, rs37}; +{ +mul.f16x2 r459, r100, r461; +} +{ +add.f16x2 r462, r438, r459; +} +{ +cvt.rn.f16.f64 rs38, fd772; +} +mov.b32 r467, {rs38, rs38}; +{ +mul.f16x2 r465, r103, r467; +} +{ +add.f16x2 r468, r444, r465; +} +mov.f64 fd763, 0dBFE1F53E93956DBF; +{ +cvt.rn.f16.f64 rs39, fd763; +} +mov.b32 r473, {rs39, rs39}; +{ +mul.f16x2 r471, r109, r473; +} +{ +add.f16x2 r474, r450, r471; +} +mov.f64 fd764, 0dBFEA7C6DA34AF89F; +{ +cvt.rn.f16.f64 rs40, fd764; +} +mov.b32 r479, {rs40, rs40}; +{ +mul.f16x2 r477, r118, r479; +} +{ +add.f16x2 r480, r456, r477; +} +{ +cvt.rn.f16.f64 rs41, fd763; +} +mov.b32 r485, {rs41, rs41}; +{ +mul.f16x2 r483, r112, r485; +} +{ +add.f16x2 r486, r462, r483; +} +{ +cvt.rn.f16.f64 rs42, fd764; +} +mov.b32 r491, {rs42, rs42}; +{ +mul.f16x2 r489, r115, r491; +} +{ +add.f16x2 r492, r468, r489; +} +mov.f64 fd755, 0dBFE73B5AE5DB4E10; +{ +cvt.rn.f16.f64 rs43, fd755; +} +mov.b32 r497, {rs43, rs43}; +{ +mul.f16x2 r495, r121, r497; +} +{ +add.f16x2 r498, r474, r495; +} +mov.f64 fd756, 0dBFE601A24BA81342; +{ +cvt.rn.f16.f64 rs44, fd756; +} +mov.b32 r503, {rs44, rs44}; +{ +mul.f16x2 r501, r130, r503; +} +{ +add.f16x2 r504, r480, r501; +} +{ +cvt.rn.f16.f64 rs45, fd755; +} +mov.b32 r509, {rs45, rs45}; +{ +mul.f16x2 r507, r124, r509; +} +{ +add.f16x2 r510, r486, r507; +} +{ +cvt.rn.f16.f64 rs46, fd756; +} +mov.b32 r515, {rs46, rs46}; +{ +mul.f16x2 
r513, r127, r515; +} +{ +add.f16x2 r516, r492, r513; +} +mov.f64 fd747, 0dBFEB6B5FBD9F7255; +{ +cvt.rn.f16.f64 rs47, fd747; +} +mov.b32 r521, {rs47, rs47}; +{ +mul.f16x2 r519, r133, r521; +} +{ +add.f16x2 r522, r498, r519; +} +mov.f64 fd748, 0dBFE07F6ACD7CDCE2; +{ +cvt.rn.f16.f64 rs48, fd748; +} +mov.b32 r527, {rs48, rs48}; +{ +mul.f16x2 r525, r142, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs49, fd747; +} +mov.b32 r533, {rs49, rs49}; +{ +mul.f16x2 r531, r136, r533; +} +{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs50, fd748; +} +mov.b32 r539, {rs50, rs50}; +{ +mul.f16x2 r537, r139, r539; +} +{ +add.f16x2 r540, r516, r537; +} +mov.f64 fd739, 0dBFEE532CBE45C954; +{ +cvt.rn.f16.f64 rs51, fd739; +} +mov.b32 r545, {rs51, rs51}; +{ +mul.f16x2 r543, r145, r545; +} +{ +add.f16x2 r546, r522, r543; +} +mov.f64 fd740, 0dBFD46F6FAF5FCB72; +{ +cvt.rn.f16.f64 rs52, fd740; +} +mov.b32 r551, {rs52, rs52}; +{ +mul.f16x2 r549, r154, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs53, fd739; +} +mov.b32 r557, {rs53, rs53}; +{ +mul.f16x2 r555, r148, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs54, fd740; +} +mov.b32 r563, {rs54, rs54}; +{ +mul.f16x2 r561, r151, r563; +} +{ +add.f16x2 r564, r540, r561; +} +mov.f64 fd731, 0dBFEFCFFA67B61650; +{ +cvt.rn.f16.f64 rs55, fd731; +} +mov.b32 r569, {rs55, rs55}; +{ +mul.f16x2 r567, r157, r569; +} +{ +add.f16x2 r570, r546, r567; +} +mov.f64 fd732, 0dBFBBADB02034D9FF; +{ +cvt.rn.f16.f64 rs56, fd732; +} +mov.b32 r575, {rs56, rs56}; +{ +mul.f16x2 r573, r166, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs57, fd731; +} +mov.b32 r581, {rs57, rs57}; +{ +mul.f16x2 r579, r160, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs58, fd732; +} +mov.b32 r587, {rs58, rs58}; +{ +mul.f16x2 r585, r163, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +sub.f16x2 %2, r570, r576; +} +{ +add.f16x2 %3, r582, r588; +} +{ +add.f16x2 %56, r570, r576; +} +{ +sub.f16x2 %57, 
r582, r588; +} +cvt.rn.f16.s32 rs59, r4804; +mov.b32 r615, {rs59, rs59}; +cvt.rn.f16.s32 rs60, r4804; +mov.b32 r627, {rs60, rs60}; +{ +cvt.rn.f16.f64 rs61, fd743; +} +mov.b32 r607, {rs61, rs61}; +{ +mul.f16x2 r605, r1, r607; +} +{ +add.f16x2 r608, %58, r605; +} +{ +cvt.rn.f16.f64 rs62, fd636; +} +mov.b32 r613, {rs62, rs62}; +{ +mul.f16x2 r611, r10, r613; +} +{ +add.f16x2 r614, r615, r611; +} +{ +cvt.rn.f16.f64 rs63, fd743; +} +mov.b32 r619, {rs63, rs63}; +{ +mul.f16x2 r617, r4, r619; +} +{ +add.f16x2 r620, %59, r617; +} +{ +cvt.rn.f16.f64 rs64, fd636; +} +mov.b32 r625, {rs64, rs64}; +{ +mul.f16x2 r623, r7, r625; +} +{ +add.f16x2 r626, r627, r623; +} +{ +cvt.rn.f16.f64 rs65, fd759; +} +mov.b32 r631, {rs65, rs65}; +{ +mul.f16x2 r629, r13, r631; +} +{ +add.f16x2 r632, r608, r629; +} +{ +cvt.rn.f16.f64 rs66, fd700; +} +mov.b32 r637, {rs66, rs66}; +{ +mul.f16x2 r635, r22, r637; +} +{ +add.f16x2 r638, r614, r635; +} +{ +cvt.rn.f16.f64 rs67, fd759; +} +mov.b32 r643, {rs67, rs67}; +{ +mul.f16x2 r641, r16, r643; +} +{ +add.f16x2 r644, r620, r641; +} +{ +cvt.rn.f16.f64 rs68, fd700; +} +mov.b32 r649, {rs68, rs68}; +{ +mul.f16x2 r647, r19, r649; +} +{ +add.f16x2 r650, r626, r647; +} +{ +cvt.rn.f16.f64 rs69, fd775; +} +mov.b32 r655, {rs69, rs69}; +{ +mul.f16x2 r653, r25, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs70, fd424; +} +mov.b32 r661, {rs70, rs70}; +{ +mul.f16x2 r659, r34, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs71, fd775; +} +mov.b32 r667, {rs71, rs71}; +{ +mul.f16x2 r665, r28, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs72, fd424; +} +mov.b32 r673, {rs72, rs72}; +{ +mul.f16x2 r671, r31, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs73, fd779; +} +mov.b32 r679, {rs73, rs73}; +{ +mul.f16x2 r677, r37, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs74, fd780; +} +mov.b32 r685, {rs74, rs74}; +{ +mul.f16x2 r683, r46, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ 
+cvt.rn.f16.f64 rs75, fd779; +} +mov.b32 r691, {rs75, rs75}; +{ +mul.f16x2 r689, r40, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs76, fd780; +} +mov.b32 r697, {rs76, rs76}; +{ +mul.f16x2 r695, r43, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs77, fd763; +} +mov.b32 r703, {rs77, rs77}; +{ +mul.f16x2 r701, r49, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs78, fd764; +} +mov.b32 r709, {rs78, rs78}; +{ +mul.f16x2 r707, r58, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs79, fd763; +} +mov.b32 r715, {rs79, rs79}; +{ +mul.f16x2 r713, r52, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs80, fd764; +} +mov.b32 r721, {rs80, rs80}; +{ +mul.f16x2 r719, r55, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs81, fd747; +} +mov.b32 r727, {rs81, rs81}; +{ +mul.f16x2 r725, r61, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs82, fd748; +} +mov.b32 r733, {rs82, rs82}; +{ +mul.f16x2 r731, r70, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs83, fd747; +} +mov.b32 r739, {rs83, rs83}; +{ +mul.f16x2 r737, r64, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs84, fd748; +} +mov.b32 r745, {rs84, rs84}; +{ +mul.f16x2 r743, r67, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +cvt.rn.f16.f64 rs85, fd731; +} +mov.b32 r751, {rs85, rs85}; +{ +mul.f16x2 r749, r73, r751; +} +{ +add.f16x2 r752, r728, r749; +} +{ +cvt.rn.f16.f64 rs86, fd732; +} +mov.b32 r757, {rs86, rs86}; +{ +mul.f16x2 r755, r82, r757; +} +{ +add.f16x2 r758, r734, r755; +} +{ +cvt.rn.f16.f64 rs87, fd731; +} +mov.b32 r763, {rs87, rs87}; +{ +mul.f16x2 r761, r76, r763; +} +{ +add.f16x2 r764, r740, r761; +} +{ +cvt.rn.f16.f64 rs88, fd732; +} +mov.b32 r769, {rs88, rs88}; +{ +mul.f16x2 r767, r79, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs89, fd739; +} +mov.b32 r775, {rs89, rs89}; +{ +mul.f16x2 r773, r85, r775; +} +{ +add.f16x2 r776, r752, r773; +} 
+mov.f64 fd660, 0d3FD46F6FAF5FCB72; +{ +cvt.rn.f16.f64 rs90, fd660; +} +mov.b32 r781, {rs90, rs90}; +{ +mul.f16x2 r779, r94, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs91, fd739; +} +mov.b32 r787, {rs91, rs91}; +{ +mul.f16x2 r785, r88, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs92, fd660; +} +mov.b32 r793, {rs92, rs92}; +{ +mul.f16x2 r791, r91, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs93, fd755; +} +mov.b32 r799, {rs93, rs93}; +{ +mul.f16x2 r797, r97, r799; +} +{ +add.f16x2 r800, r776, r797; +} +mov.f64 fd456, 0d3FE601A24BA81342; +{ +cvt.rn.f16.f64 rs94, fd456; +} +mov.b32 r805, {rs94, rs94}; +{ +mul.f16x2 r803, r106, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs95, fd755; +} +mov.b32 r811, {rs95, rs95}; +{ +mul.f16x2 r809, r100, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs96, fd456; +} +mov.b32 r817, {rs96, rs96}; +{ +mul.f16x2 r815, r103, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs97, fd771; +} +mov.b32 r823, {rs97, rs97}; +{ +mul.f16x2 r821, r109, r823; +} +{ +add.f16x2 r824, r800, r821; +} +mov.f64 fd696, 0d3FEDBA2D62CB789F; +{ +cvt.rn.f16.f64 rs98, fd696; +} +mov.b32 r829, {rs98, rs98}; +{ +mul.f16x2 r827, r118, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs99, fd771; +} +mov.b32 r835, {rs99, rs99}; +{ +mul.f16x2 r833, r112, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs100, fd696; +} +mov.b32 r841, {rs100, rs100}; +{ +mul.f16x2 r839, r115, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs101, fd783; +} +mov.b32 r847, {rs101, rs101}; +{ +mul.f16x2 r845, r121, r847; +} +{ +add.f16x2 r848, r824, r845; +} +mov.f64 fd784, 0d3FEFF3FC588E859D; +{ +cvt.rn.f16.f64 rs102, fd784; +} +mov.b32 r853, {rs102, rs102}; +{ +mul.f16x2 r851, r130, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs103, fd783; +} +mov.b32 r859, {rs103, rs103}; +{ +mul.f16x2 r857, r124, r859; +} +{ 
+add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs104, fd784; +} +mov.b32 r865, {rs104, rs104}; +{ +mul.f16x2 r863, r127, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs105, fd767; +} +mov.b32 r871, {rs105, rs105}; +{ +mul.f16x2 r869, r133, r871; +} +{ +add.f16x2 r872, r848, r869; +} +mov.f64 fd768, 0d3FEC45BB0D10918C; +{ +cvt.rn.f16.f64 rs106, fd768; +} +mov.b32 r877, {rs106, rs106}; +{ +mul.f16x2 r875, r142, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs107, fd767; +} +mov.b32 r883, {rs107, rs107}; +{ +mul.f16x2 r881, r136, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +cvt.rn.f16.f64 rs108, fd768; +} +mov.b32 r889, {rs108, rs108}; +{ +mul.f16x2 r887, r139, r889; +} +{ +add.f16x2 r890, r866, r887; +} +{ +cvt.rn.f16.f64 rs109, fd751; +} +mov.b32 r895, {rs109, rs109}; +{ +mul.f16x2 r893, r145, r895; +} +{ +add.f16x2 r896, r872, r893; +} +mov.f64 fd752, 0d3FE35D9650D47852; +{ +cvt.rn.f16.f64 rs110, fd752; +} +mov.b32 r901, {rs110, rs110}; +{ +mul.f16x2 r899, r154, r901; +} +{ +add.f16x2 r902, r878, r899; +} +{ +cvt.rn.f16.f64 rs111, fd751; +} +mov.b32 r907, {rs111, rs111}; +{ +mul.f16x2 r905, r148, r907; +} +{ +add.f16x2 r908, r884, r905; +} +{ +cvt.rn.f16.f64 rs112, fd752; +} +mov.b32 r913, {rs112, rs112}; +{ +mul.f16x2 r911, r151, r913; +} +{ +add.f16x2 r914, r890, r911; +} +{ +cvt.rn.f16.f64 rs113, fd735; +} +mov.b32 r919, {rs113, rs113}; +{ +mul.f16x2 r917, r157, r919; +} +{ +add.f16x2 r920, r896, r917; +} +mov.f64 fd736, 0d3FCB8426C12812BC; +{ +cvt.rn.f16.f64 rs114, fd736; +} +mov.b32 r925, {rs114, rs114}; +{ +mul.f16x2 r923, r166, r925; +} +{ +add.f16x2 r926, r902, r923; +} +{ +cvt.rn.f16.f64 rs115, fd735; +} +mov.b32 r931, {rs115, rs115}; +{ +mul.f16x2 r929, r160, r931; +} +{ +add.f16x2 r932, r908, r929; +} +{ +cvt.rn.f16.f64 rs116, fd736; +} +mov.b32 r937, {rs116, rs116}; +{ +mul.f16x2 r935, r163, r937; +} +{ +add.f16x2 r938, r914, r935; +} +{ +sub.f16x2 %4, r920, r926; +} +{ +add.f16x2 %5, r932, r938; +} +{ 
+add.f16x2 %54, r920, r926; +} +{ +sub.f16x2 %55, r932, r938; +} +cvt.rn.f16.s32 rs117, r4804; +mov.b32 r965, {rs117, rs117}; +cvt.rn.f16.s32 rs118, r4804; +mov.b32 r977, {rs118, rs118}; +{ +cvt.rn.f16.f64 rs119, fd751; +} +mov.b32 r957, {rs119, rs119}; +{ +mul.f16x2 r955, r1, r957; +} +{ +add.f16x2 r958, %58, r955; +} +{ +cvt.rn.f16.f64 rs120, fd540; +} +mov.b32 r963, {rs120, rs120}; +{ +mul.f16x2 r961, r10, r963; +} +{ +add.f16x2 r964, r965, r961; +} +{ +cvt.rn.f16.f64 rs121, fd751; +} +mov.b32 r969, {rs121, rs121}; +{ +mul.f16x2 r967, r4, r969; +} +{ +add.f16x2 r970, %59, r967; +} +{ +cvt.rn.f16.f64 rs122, fd540; +} +mov.b32 r975, {rs122, rs122}; +{ +mul.f16x2 r973, r7, r975; +} +{ +add.f16x2 r976, r977, r973; +} +{ +cvt.rn.f16.f64 rs123, fd775; +} +mov.b32 r981, {rs123, rs123}; +{ +mul.f16x2 r979, r13, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs124, fd424; +} +mov.b32 r987, {rs124, rs124}; +{ +mul.f16x2 r985, r22, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs125, fd775; +} +mov.b32 r993, {rs125, rs125}; +{ +mul.f16x2 r991, r16, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs126, fd424; +} +mov.b32 r999, {rs126, rs126}; +{ +mul.f16x2 r997, r19, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs127, fd771; +} +mov.b32 r1005, {rs127, rs127}; +{ +mul.f16x2 r1003, r25, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs128, fd772; +} +mov.b32 r1011, {rs128, rs128}; +{ +mul.f16x2 r1009, r34, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs129, fd771; +} +mov.b32 r1017, {rs129, rs129}; +{ +mul.f16x2 r1015, r28, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs130, fd772; +} +mov.b32 r1023, {rs130, rs130}; +{ +mul.f16x2 r1021, r31, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs131, fd747; +} +mov.b32 r1029, {rs131, rs131}; +{ +mul.f16x2 r1027, r37, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 
rs132, fd748; +} +mov.b32 r1035, {rs132, rs132}; +{ +mul.f16x2 r1033, r46, r1035; +} +{ +add.f16x2 r1036, r1012, r1033; +} +{ +cvt.rn.f16.f64 rs133, fd747; +} +mov.b32 r1041, {rs133, rs133}; +{ +mul.f16x2 r1039, r40, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +cvt.rn.f16.f64 rs134, fd748; +} +mov.b32 r1047, {rs134, rs134}; +{ +mul.f16x2 r1045, r43, r1047; +} +{ +add.f16x2 r1048, r1024, r1045; +} +{ +cvt.rn.f16.f64 rs135, fd731; +} +mov.b32 r1053, {rs135, rs135}; +{ +mul.f16x2 r1051, r49, r1053; +} +{ +add.f16x2 r1054, r1030, r1051; +} +mov.f64 fd576, 0d3FBBADB02034D9FF; +{ +cvt.rn.f16.f64 rs136, fd576; +} +mov.b32 r1059, {rs136, rs136}; +{ +mul.f16x2 r1057, r58, r1059; +} +{ +add.f16x2 r1060, r1036, r1057; +} +{ +cvt.rn.f16.f64 rs137, fd731; +} +mov.b32 r1065, {rs137, rs137}; +{ +mul.f16x2 r1063, r52, r1065; +} +{ +add.f16x2 r1066, r1042, r1063; +} +{ +cvt.rn.f16.f64 rs138, fd576; +} +mov.b32 r1071, {rs138, rs138}; +{ +mul.f16x2 r1069, r55, r1071; +} +{ +add.f16x2 r1072, r1048, r1069; +} +{ +cvt.rn.f16.f64 rs139, fd755; +} +mov.b32 r1077, {rs139, rs139}; +{ +mul.f16x2 r1075, r61, r1077; +} +{ +add.f16x2 r1078, r1054, r1075; +} +{ +cvt.rn.f16.f64 rs140, fd456; +} +mov.b32 r1083, {rs140, rs140}; +{ +mul.f16x2 r1081, r70, r1083; +} +{ +add.f16x2 r1084, r1060, r1081; +} +{ +cvt.rn.f16.f64 rs141, fd755; +} +mov.b32 r1089, {rs141, rs141}; +{ +mul.f16x2 r1087, r64, r1089; +} +{ +add.f16x2 r1090, r1066, r1087; +} +{ +cvt.rn.f16.f64 rs142, fd456; +} +mov.b32 r1095, {rs142, rs142}; +{ +mul.f16x2 r1093, r67, r1095; +} +{ +add.f16x2 r1096, r1072, r1093; +} +{ +cvt.rn.f16.f64 rs143, fd779; +} +mov.b32 r1101, {rs143, rs143}; +{ +mul.f16x2 r1099, r73, r1101; +} +{ +add.f16x2 r1102, r1078, r1099; +} +mov.f64 fd652, 0d3FEF941537248537; +{ +cvt.rn.f16.f64 rs144, fd652; +} +mov.b32 r1107, {rs144, rs144}; +{ +mul.f16x2 r1105, r82, r1107; +} +{ +add.f16x2 r1108, r1084, r1105; +} +{ +cvt.rn.f16.f64 rs145, fd779; +} +mov.b32 r1113, {rs145, rs145}; +{ +mul.f16x2 r1111, r76, r1113; 
+} +{ +add.f16x2 r1114, r1090, r1111; +} +{ +cvt.rn.f16.f64 rs146, fd652; +} +mov.b32 r1119, {rs146, rs146}; +{ +mul.f16x2 r1117, r79, r1119; +} +{ +add.f16x2 r1120, r1096, r1117; +} +{ +cvt.rn.f16.f64 rs147, fd767; +} +mov.b32 r1125, {rs147, rs147}; +{ +mul.f16x2 r1123, r85, r1125; +} +{ +add.f16x2 r1126, r1102, r1123; +} +{ +cvt.rn.f16.f64 rs148, fd768; +} +mov.b32 r1131, {rs148, rs148}; +{ +mul.f16x2 r1129, r94, r1131; +} +{ +add.f16x2 r1132, r1108, r1129; +} +{ +cvt.rn.f16.f64 rs149, fd767; +} +mov.b32 r1137, {rs149, rs149}; +{ +mul.f16x2 r1135, r88, r1137; +} +{ +add.f16x2 r1138, r1114, r1135; +} +{ +cvt.rn.f16.f64 rs150, fd768; +} +mov.b32 r1143, {rs150, rs150}; +{ +mul.f16x2 r1141, r91, r1143; +} +{ +add.f16x2 r1144, r1120, r1141; +} +{ +cvt.rn.f16.f64 rs151, fd743; +} +mov.b32 r1149, {rs151, rs151}; +{ +mul.f16x2 r1147, r97, r1149; +} +{ +add.f16x2 r1150, r1126, r1147; +} +mov.f64 fd744, 0d3FDADF7689C97B70; +{ +cvt.rn.f16.f64 rs152, fd744; +} +mov.b32 r1155, {rs152, rs152}; +{ +mul.f16x2 r1153, r106, r1155; +} +{ +add.f16x2 r1156, r1132, r1153; +} +{ +cvt.rn.f16.f64 rs153, fd743; +} +mov.b32 r1161, {rs153, rs153}; +{ +mul.f16x2 r1159, r100, r1161; +} +{ +add.f16x2 r1162, r1138, r1159; +} +{ +cvt.rn.f16.f64 rs154, fd744; +} +mov.b32 r1167, {rs154, rs154}; +{ +mul.f16x2 r1165, r103, r1167; +} +{ +add.f16x2 r1168, r1144, r1165; +} +{ +cvt.rn.f16.f64 rs155, fd735; +} +mov.b32 r1173, {rs155, rs155}; +{ +mul.f16x2 r1171, r109, r1173; +} +{ +add.f16x2 r1174, r1150, r1171; +} +{ +cvt.rn.f16.f64 rs156, fd708; +} +mov.b32 r1179, {rs156, rs156}; +{ +mul.f16x2 r1177, r118, r1179; +} +{ +add.f16x2 r1180, r1156, r1177; +} +{ +cvt.rn.f16.f64 rs157, fd735; +} +mov.b32 r1185, {rs157, rs157}; +{ +mul.f16x2 r1183, r112, r1185; +} +{ +add.f16x2 r1186, r1162, r1183; +} +{ +cvt.rn.f16.f64 rs158, fd708; +} +mov.b32 r1191, {rs158, rs158}; +{ +mul.f16x2 r1189, r115, r1191; +} +{ +add.f16x2 r1192, r1168, r1189; +} +{ +cvt.rn.f16.f64 rs159, fd759; +} +mov.b32 r1197, {rs159, rs159}; 
+{ +mul.f16x2 r1195, r121, r1197; +} +{ +add.f16x2 r1198, r1174, r1195; +} +{ +cvt.rn.f16.f64 rs160, fd700; +} +mov.b32 r1203, {rs160, rs160}; +{ +mul.f16x2 r1201, r130, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs161, fd759; +} +mov.b32 r1209, {rs161, rs161}; +{ +mul.f16x2 r1207, r124, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs162, fd700; +} +mov.b32 r1215, {rs162, rs162}; +{ +mul.f16x2 r1213, r127, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +{ +cvt.rn.f16.f64 rs163, fd783; +} +mov.b32 r1221, {rs163, rs163}; +{ +mul.f16x2 r1219, r133, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} +{ +cvt.rn.f16.f64 rs164, fd692; +} +mov.b32 r1227, {rs164, rs164}; +{ +mul.f16x2 r1225, r142, r1227; +} +{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs165, fd783; +} +mov.b32 r1233, {rs165, rs165}; +{ +mul.f16x2 r1231, r136, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs166, fd692; +} +mov.b32 r1239, {rs166, rs166}; +{ +mul.f16x2 r1237, r139, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs167, fd763; +} +mov.b32 r1245, {rs167, rs167}; +{ +mul.f16x2 r1243, r145, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs168, fd764; +} +mov.b32 r1251, {rs168, rs168}; +{ +mul.f16x2 r1249, r154, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs169, fd763; +} +mov.b32 r1257, {rs169, rs169}; +{ +mul.f16x2 r1255, r148, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs170, fd764; +} +mov.b32 r1263, {rs170, rs170}; +{ +mul.f16x2 r1261, r151, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 rs171, fd739; +} +mov.b32 r1269, {rs171, rs171}; +{ +mul.f16x2 r1267, r157, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs172, fd740; +} +mov.b32 r1275, {rs172, rs172}; +{ +mul.f16x2 r1273, r166, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs173, fd739; +} +mov.b32 r1281, {rs173, 
rs173}; +{ +mul.f16x2 r1279, r160, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs174, fd740; +} +mov.b32 r1287, {rs174, rs174}; +{ +mul.f16x2 r1285, r163, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +sub.f16x2 %6, r1270, r1276; +} +{ +add.f16x2 %7, r1282, r1288; +} +{ +add.f16x2 %52, r1270, r1276; +} +{ +sub.f16x2 %53, r1282, r1288; +} +cvt.rn.f16.s32 rs175, r4804; +mov.b32 r1315, {rs175, rs175}; +cvt.rn.f16.s32 rs176, r4804; +mov.b32 r1327, {rs176, rs176}; +{ +cvt.rn.f16.f64 rs177, fd759; +} +mov.b32 r1307, {rs177, rs177}; +{ +mul.f16x2 r1305, r1, r1307; +} +{ +add.f16x2 r1308, %58, r1305; +} +{ +cvt.rn.f16.f64 rs178, fd700; +} +mov.b32 r1313, {rs178, rs178}; +{ +mul.f16x2 r1311, r10, r1313; +} +{ +add.f16x2 r1314, r1315, r1311; +} +{ +cvt.rn.f16.f64 rs179, fd759; +} +mov.b32 r1319, {rs179, rs179}; +{ +mul.f16x2 r1317, r4, r1319; +} +{ +add.f16x2 r1320, %59, r1317; +} +{ +cvt.rn.f16.f64 rs180, fd700; +} +mov.b32 r1325, {rs180, rs180}; +{ +mul.f16x2 r1323, r7, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +cvt.rn.f16.f64 rs181, fd779; +} +mov.b32 r1331, {rs181, rs181}; +{ +mul.f16x2 r1329, r13, r1331; +} +{ +add.f16x2 r1332, r1308, r1329; +} +{ +cvt.rn.f16.f64 rs182, fd780; +} +mov.b32 r1337, {rs182, rs182}; +{ +mul.f16x2 r1335, r22, r1337; +} +{ +add.f16x2 r1338, r1314, r1335; +} +{ +cvt.rn.f16.f64 rs183, fd779; +} +mov.b32 r1343, {rs183, rs183}; +{ +mul.f16x2 r1341, r16, r1343; +} +{ +add.f16x2 r1344, r1320, r1341; +} +{ +cvt.rn.f16.f64 rs184, fd780; +} +mov.b32 r1349, {rs184, rs184}; +{ +mul.f16x2 r1347, r19, r1349; +} +{ +add.f16x2 r1350, r1326, r1347; +} +{ +cvt.rn.f16.f64 rs185, fd747; +} +mov.b32 r1355, {rs185, rs185}; +{ +mul.f16x2 r1353, r25, r1355; +} +{ +add.f16x2 r1356, r1332, r1353; +} +{ +cvt.rn.f16.f64 rs186, fd748; +} +mov.b32 r1361, {rs186, rs186}; +{ +mul.f16x2 r1359, r34, r1361; +} +{ +add.f16x2 r1362, r1338, r1359; +} +{ +cvt.rn.f16.f64 rs187, fd747; +} +mov.b32 r1367, {rs187, rs187}; +{ +mul.f16x2 r1365, r28, 
r1367; +} +{ +add.f16x2 r1368, r1344, r1365; +} +{ +cvt.rn.f16.f64 rs188, fd748; +} +mov.b32 r1373, {rs188, rs188}; +{ +mul.f16x2 r1371, r31, r1373; +} +{ +add.f16x2 r1374, r1350, r1371; +} +{ +cvt.rn.f16.f64 rs189, fd739; +} +mov.b32 r1379, {rs189, rs189}; +{ +mul.f16x2 r1377, r37, r1379; +} +{ +add.f16x2 r1380, r1356, r1377; +} +{ +cvt.rn.f16.f64 rs190, fd660; +} +mov.b32 r1385, {rs190, rs190}; +{ +mul.f16x2 r1383, r46, r1385; +} +{ +add.f16x2 r1386, r1362, r1383; +} +{ +cvt.rn.f16.f64 rs191, fd739; +} +mov.b32 r1391, {rs191, rs191}; +{ +mul.f16x2 r1389, r40, r1391; +} +{ +add.f16x2 r1392, r1368, r1389; +} +{ +cvt.rn.f16.f64 rs192, fd660; +} +mov.b32 r1397, {rs192, rs192}; +{ +mul.f16x2 r1395, r43, r1397; +} +{ +add.f16x2 r1398, r1374, r1395; +} +{ +cvt.rn.f16.f64 rs193, fd771; +} +mov.b32 r1403, {rs193, rs193}; +{ +mul.f16x2 r1401, r49, r1403; +} +{ +add.f16x2 r1404, r1380, r1401; +} +{ +cvt.rn.f16.f64 rs194, fd696; +} +mov.b32 r1409, {rs194, rs194}; +{ +mul.f16x2 r1407, r58, r1409; +} +{ +add.f16x2 r1410, r1386, r1407; +} +{ +cvt.rn.f16.f64 rs195, fd771; +} +mov.b32 r1415, {rs195, rs195}; +{ +mul.f16x2 r1413, r52, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs196, fd696; +} +mov.b32 r1421, {rs196, rs196}; +{ +mul.f16x2 r1419, r55, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs197, fd767; +} +mov.b32 r1427, {rs197, rs197}; +{ +mul.f16x2 r1425, r61, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs198, fd768; +} +mov.b32 r1433, {rs198, rs198}; +{ +mul.f16x2 r1431, r70, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs199, fd767; +} +mov.b32 r1439, {rs199, rs199}; +{ +mul.f16x2 r1437, r64, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs200, fd768; +} +mov.b32 r1445, {rs200, rs200}; +{ +mul.f16x2 r1443, r67, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs201, fd735; +} +mov.b32 r1451, {rs201, rs201}; +{ +mul.f16x2 r1449, r73, r1451; +} 
+{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs202, fd736; +} +mov.b32 r1457, {rs202, rs202}; +{ +mul.f16x2 r1455, r82, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs203, fd735; +} +mov.b32 r1463, {rs203, rs203}; +{ +mul.f16x2 r1461, r76, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs204, fd736; +} +mov.b32 r1469, {rs204, rs204}; +{ +mul.f16x2 r1467, r79, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs205, fd751; +} +mov.b32 r1475, {rs205, rs205}; +{ +mul.f16x2 r1473, r85, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs206, fd540; +} +mov.b32 r1481, {rs206, rs206}; +{ +mul.f16x2 r1479, r94, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs207, fd751; +} +mov.b32 r1487, {rs207, rs207}; +{ +mul.f16x2 r1485, r88, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs208, fd540; +} +mov.b32 r1493, {rs208, rs208}; +{ +mul.f16x2 r1491, r91, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs209, fd783; +} +mov.b32 r1499, {rs209, rs209}; +{ +mul.f16x2 r1497, r97, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs210, fd692; +} +mov.b32 r1505, {rs210, rs210}; +{ +mul.f16x2 r1503, r106, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs211, fd783; +} +mov.b32 r1511, {rs211, rs211}; +{ +mul.f16x2 r1509, r100, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs212, fd692; +} +mov.b32 r1517, {rs212, rs212}; +{ +mul.f16x2 r1515, r103, r1517; +} +{ +add.f16x2 r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs213, fd755; +} +mov.b32 r1523, {rs213, rs213}; +{ +mul.f16x2 r1521, r109, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs214, fd756; +} +mov.b32 r1529, {rs214, rs214}; +{ +mul.f16x2 r1527, r118, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs215, fd755; +} +mov.b32 r1535, {rs215, rs215}; +{ +mul.f16x2 r1533, r112, r1535; +} +{ 
+add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs216, fd756; +} +mov.b32 r1541, {rs216, rs216}; +{ +mul.f16x2 r1539, r115, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs217, fd731; +} +mov.b32 r1547, {rs217, rs217}; +{ +mul.f16x2 r1545, r121, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs218, fd576; +} +mov.b32 r1553, {rs218, rs218}; +{ +mul.f16x2 r1551, r130, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs219, fd731; +} +mov.b32 r1559, {rs219, rs219}; +{ +mul.f16x2 r1557, r124, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs220, fd576; +} +mov.b32 r1565, {rs220, rs220}; +{ +mul.f16x2 r1563, r127, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ +cvt.rn.f16.f64 rs221, fd763; +} +mov.b32 r1571, {rs221, rs221}; +{ +mul.f16x2 r1569, r133, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +mov.f64 fd632, 0d3FEA7C6DA34AF89F; +{ +cvt.rn.f16.f64 rs222, fd632; +} +mov.b32 r1577, {rs222, rs222}; +{ +mul.f16x2 r1575, r142, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs223, fd763; +} +mov.b32 r1583, {rs223, rs223}; +{ +mul.f16x2 r1581, r136, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs224, fd632; +} +mov.b32 r1589, {rs224, rs224}; +{ +mul.f16x2 r1587, r139, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs225, fd775; +} +mov.b32 r1595, {rs225, rs225}; +{ +mul.f16x2 r1593, r145, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +mov.f64 fd776, 0d3FEED566CB3DCBA1; +{ +cvt.rn.f16.f64 rs226, fd776; +} +mov.b32 r1601, {rs226, rs226}; +{ +mul.f16x2 r1599, r154, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs227, fd775; +} +mov.b32 r1607, {rs227, rs227}; +{ +mul.f16x2 r1605, r148, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs228, fd776; +} +mov.b32 r1613, {rs228, rs228}; +{ +mul.f16x2 r1611, r151, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs229, fd743; 
+} +mov.b32 r1619, {rs229, rs229}; +{ +mul.f16x2 r1617, r157, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs230, fd744; +} +mov.b32 r1625, {rs230, rs230}; +{ +mul.f16x2 r1623, r166, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs231, fd743; +} +mov.b32 r1631, {rs231, rs231}; +{ +mul.f16x2 r1629, r160, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs232, fd744; +} +mov.b32 r1637, {rs232, rs232}; +{ +mul.f16x2 r1635, r163, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +sub.f16x2 %8, r1620, r1626; +} +{ +add.f16x2 %9, r1632, r1638; +} +{ +add.f16x2 %50, r1620, r1626; +} +{ +sub.f16x2 %51, r1632, r1638; +} +cvt.rn.f16.s32 rs233, r4804; +mov.b32 r1665, {rs233, rs233}; +cvt.rn.f16.s32 rs234, r4804; +mov.b32 r1677, {rs234, rs234}; +{ +cvt.rn.f16.f64 rs235, fd767; +} +mov.b32 r1657, {rs235, rs235}; +{ +mul.f16x2 r1655, r1, r1657; +} +{ +add.f16x2 r1658, %58, r1655; +} +{ +cvt.rn.f16.f64 rs236, fd604; +} +mov.b32 r1663, {rs236, rs236}; +{ +mul.f16x2 r1661, r10, r1663; +} +{ +add.f16x2 r1664, r1665, r1661; +} +{ +cvt.rn.f16.f64 rs237, fd767; +} +mov.b32 r1669, {rs237, rs237}; +{ +mul.f16x2 r1667, r4, r1669; +} +{ +add.f16x2 r1670, %59, r1667; +} +{ +cvt.rn.f16.f64 rs238, fd604; +} +mov.b32 r1675, {rs238, rs238}; +{ +mul.f16x2 r1673, r7, r1675; +} +{ +add.f16x2 r1676, r1677, r1673; +} +{ +cvt.rn.f16.f64 rs239, fd763; +} +mov.b32 r1681, {rs239, rs239}; +{ +mul.f16x2 r1679, r13, r1681; +} +{ +add.f16x2 r1682, r1658, r1679; +} +{ +cvt.rn.f16.f64 rs240, fd764; +} +mov.b32 r1687, {rs240, rs240}; +{ +mul.f16x2 r1685, r22, r1687; +} +{ +add.f16x2 r1688, r1664, r1685; +} +{ +cvt.rn.f16.f64 rs241, fd763; +} +mov.b32 r1693, {rs241, rs241}; +{ +mul.f16x2 r1691, r16, r1693; +} +{ +add.f16x2 r1694, r1670, r1691; +} +{ +cvt.rn.f16.f64 rs242, fd764; +} +mov.b32 r1699, {rs242, rs242}; +{ +mul.f16x2 r1697, r19, r1699; +} +{ +add.f16x2 r1700, r1676, r1697; +} +{ +cvt.rn.f16.f64 rs243, fd731; +} +mov.b32 r1705, {rs243, 
rs243}; +{ +mul.f16x2 r1703, r25, r1705; +} +{ +add.f16x2 r1706, r1682, r1703; +} +{ +cvt.rn.f16.f64 rs244, fd576; +} +mov.b32 r1711, {rs244, rs244}; +{ +mul.f16x2 r1709, r34, r1711; +} +{ +add.f16x2 r1712, r1688, r1709; +} +{ +cvt.rn.f16.f64 rs245, fd731; +} +mov.b32 r1717, {rs245, rs245}; +{ +mul.f16x2 r1715, r28, r1717; +} +{ +add.f16x2 r1718, r1694, r1715; +} +{ +cvt.rn.f16.f64 rs246, fd576; +} +mov.b32 r1723, {rs246, rs246}; +{ +mul.f16x2 r1721, r31, r1723; +} +{ +add.f16x2 r1724, r1700, r1721; +} +{ +cvt.rn.f16.f64 rs247, fd771; +} +mov.b32 r1729, {rs247, rs247}; +{ +mul.f16x2 r1727, r37, r1729; +} +{ +add.f16x2 r1730, r1706, r1727; +} +{ +cvt.rn.f16.f64 rs248, fd696; +} +mov.b32 r1735, {rs248, rs248}; +{ +mul.f16x2 r1733, r46, r1735; +} +{ +add.f16x2 r1736, r1712, r1733; +} +{ +cvt.rn.f16.f64 rs249, fd771; +} +mov.b32 r1741, {rs249, rs249}; +{ +mul.f16x2 r1739, r40, r1741; +} +{ +add.f16x2 r1742, r1718, r1739; +} +{ +cvt.rn.f16.f64 rs250, fd696; +} +mov.b32 r1747, {rs250, rs250}; +{ +mul.f16x2 r1745, r43, r1747; +} +{ +add.f16x2 r1748, r1724, r1745; +} +{ +cvt.rn.f16.f64 rs251, fd759; +} +mov.b32 r1753, {rs251, rs251}; +{ +mul.f16x2 r1751, r49, r1753; +} +{ +add.f16x2 r1754, r1730, r1751; +} +mov.f64 fd760, 0d3FE863A1ADA0CFA6; +{ +cvt.rn.f16.f64 rs252, fd760; +} +mov.b32 r1759, {rs252, rs252}; +{ +mul.f16x2 r1757, r58, r1759; +} +{ +add.f16x2 r1760, r1736, r1757; +} +{ +cvt.rn.f16.f64 rs253, fd759; +} +mov.b32 r1765, {rs253, rs253}; +{ +mul.f16x2 r1763, r52, r1765; +} +{ +add.f16x2 r1766, r1742, r1763; +} +{ +cvt.rn.f16.f64 rs254, fd760; +} +mov.b32 r1771, {rs254, rs254}; +{ +mul.f16x2 r1769, r55, r1771; +} +{ +add.f16x2 r1772, r1748, r1769; +} +{ +cvt.rn.f16.f64 rs255, fd735; +} +mov.b32 r1777, {rs255, rs255}; +{ +mul.f16x2 r1775, r61, r1777; +} +{ +add.f16x2 r1778, r1754, r1775; +} +{ +cvt.rn.f16.f64 rs256, fd708; +} +mov.b32 r1783, {rs256, rs256}; +{ +mul.f16x2 r1781, r70, r1783; +} +{ +add.f16x2 r1784, r1760, r1781; +} +{ +cvt.rn.f16.f64 rs257, fd735; +} 
+mov.b32 r1789, {rs257, rs257}; +{ +mul.f16x2 r1787, r64, r1789; +} +{ +add.f16x2 r1790, r1766, r1787; +} +{ +cvt.rn.f16.f64 rs258, fd708; +} +mov.b32 r1795, {rs258, rs258}; +{ +mul.f16x2 r1793, r67, r1795; +} +{ +add.f16x2 r1796, r1772, r1793; +} +{ +cvt.rn.f16.f64 rs259, fd775; +} +mov.b32 r1801, {rs259, rs259}; +{ +mul.f16x2 r1799, r73, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs260, fd424; +} +mov.b32 r1807, {rs260, rs260}; +{ +mul.f16x2 r1805, r82, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs261, fd775; +} +mov.b32 r1813, {rs261, rs261}; +{ +mul.f16x2 r1811, r76, r1813; +} +{ +add.f16x2 r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs262, fd424; +} +mov.b32 r1819, {rs262, rs262}; +{ +mul.f16x2 r1817, r79, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ +cvt.rn.f16.f64 rs263, fd755; +} +mov.b32 r1825, {rs263, rs263}; +{ +mul.f16x2 r1823, r85, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs264, fd756; +} +mov.b32 r1831, {rs264, rs264}; +{ +mul.f16x2 r1829, r94, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs265, fd755; +} +mov.b32 r1837, {rs265, rs265}; +{ +mul.f16x2 r1835, r88, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs266, fd756; +} +mov.b32 r1843, {rs266, rs266}; +{ +mul.f16x2 r1841, r91, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs267, fd739; +} +mov.b32 r1849, {rs267, rs267}; +{ +mul.f16x2 r1847, r97, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs268, fd660; +} +mov.b32 r1855, {rs268, rs268}; +{ +mul.f16x2 r1853, r106, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs269, fd739; +} +mov.b32 r1861, {rs269, rs269}; +{ +mul.f16x2 r1859, r100, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs270, fd660; +} +mov.b32 r1867, {rs270, rs270}; +{ +mul.f16x2 r1865, r103, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs271, fd779; +} +mov.b32 
r1873, {rs271, rs271}; +{ +mul.f16x2 r1871, r109, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs272, fd652; +} +mov.b32 r1879, {rs272, rs272}; +{ +mul.f16x2 r1877, r118, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs273, fd779; +} +mov.b32 r1885, {rs273, rs273}; +{ +mul.f16x2 r1883, r112, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs274, fd652; +} +mov.b32 r1891, {rs274, rs274}; +{ +mul.f16x2 r1889, r115, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs275, fd751; +} +mov.b32 r1897, {rs275, rs275}; +{ +mul.f16x2 r1895, r121, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs276, fd752; +} +mov.b32 r1903, {rs276, rs276}; +{ +mul.f16x2 r1901, r130, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs277, fd751; +} +mov.b32 r1909, {rs277, rs277}; +{ +mul.f16x2 r1907, r124, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs278, fd752; +} +mov.b32 r1915, {rs278, rs278}; +{ +mul.f16x2 r1913, r127, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs279, fd743; +} +mov.b32 r1921, {rs279, rs279}; +{ +mul.f16x2 r1919, r133, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs280, fd636; +} +mov.b32 r1927, {rs280, rs280}; +{ +mul.f16x2 r1925, r142, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs281, fd743; +} +mov.b32 r1933, {rs281, rs281}; +{ +mul.f16x2 r1931, r136, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs282, fd636; +} +mov.b32 r1939, {rs282, rs282}; +{ +mul.f16x2 r1937, r139, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs283, fd783; +} +mov.b32 r1945, {rs283, rs283}; +{ +mul.f16x2 r1943, r145, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs284, fd692; +} +mov.b32 r1951, {rs284, rs284}; +{ +mul.f16x2 r1949, r154, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs285, fd783; +} 
+mov.b32 r1957, {rs285, rs285}; +{ +mul.f16x2 r1955, r148, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs286, fd692; +} +mov.b32 r1963, {rs286, rs286}; +{ +mul.f16x2 r1961, r151, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs287, fd747; +} +mov.b32 r1969, {rs287, rs287}; +{ +mul.f16x2 r1967, r157, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs288, fd748; +} +mov.b32 r1975, {rs288, rs288}; +{ +mul.f16x2 r1973, r166, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs289, fd747; +} +mov.b32 r1981, {rs289, rs289}; +{ +mul.f16x2 r1979, r160, r1981; +} +{ +add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs290, fd748; +} +mov.b32 r1987, {rs290, rs290}; +{ +mul.f16x2 r1985, r163, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +{ +sub.f16x2 %10, r1970, r1976; +} +{ +add.f16x2 %11, r1982, r1988; +} +{ +add.f16x2 %48, r1970, r1976; +} +{ +sub.f16x2 %49, r1982, r1988; +} +cvt.rn.f16.s32 rs291, r4804; +mov.b32 r2015, {rs291, rs291}; +cvt.rn.f16.s32 rs292, r4804; +mov.b32 r2027, {rs292, rs292}; +{ +cvt.rn.f16.f64 rs293, fd775; +} +mov.b32 r2007, {rs293, rs293}; +{ +mul.f16x2 r2005, r1, r2007; +} +{ +add.f16x2 r2008, %58, r2005; +} +{ +cvt.rn.f16.f64 rs294, fd424; +} +mov.b32 r2013, {rs294, rs294}; +{ +mul.f16x2 r2011, r10, r2013; +} +{ +add.f16x2 r2014, r2015, r2011; +} +{ +cvt.rn.f16.f64 rs295, fd775; +} +mov.b32 r2019, {rs295, rs295}; +{ +mul.f16x2 r2017, r4, r2019; +} +{ +add.f16x2 r2020, %59, r2017; +} +{ +cvt.rn.f16.f64 rs296, fd424; +} +mov.b32 r2025, {rs296, rs296}; +{ +mul.f16x2 r2023, r7, r2025; +} +{ +add.f16x2 r2026, r2027, r2023; +} +{ +cvt.rn.f16.f64 rs297, fd747; +} +mov.b32 r2031, {rs297, rs297}; +{ +mul.f16x2 r2029, r13, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs298, fd748; +} +mov.b32 r2037, {rs298, rs298}; +{ +mul.f16x2 r2035, r22, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs299, fd747; +} +mov.b32 r2043, {rs299, 
rs299}; +{ +mul.f16x2 r2041, r16, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs300, fd748; +} +mov.b32 r2049, {rs300, rs300}; +{ +mul.f16x2 r2047, r19, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 rs301, fd755; +} +mov.b32 r2055, {rs301, rs301}; +{ +mul.f16x2 r2053, r25, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs302, fd456; +} +mov.b32 r2061, {rs302, rs302}; +{ +mul.f16x2 r2059, r34, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs303, fd755; +} +mov.b32 r2067, {rs303, rs303}; +{ +mul.f16x2 r2065, r28, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; +} +{ +cvt.rn.f16.f64 rs304, fd456; +} +mov.b32 r2073, {rs304, rs304}; +{ +mul.f16x2 r2071, r31, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs305, fd767; +} +mov.b32 r2079, {rs305, rs305}; +{ +mul.f16x2 r2077, r37, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs306, fd768; +} +mov.b32 r2085, {rs306, rs306}; +{ +mul.f16x2 r2083, r46, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs307, fd767; +} +mov.b32 r2091, {rs307, rs307}; +{ +mul.f16x2 r2089, r40, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs308, fd768; +} +mov.b32 r2097, {rs308, rs308}; +{ +mul.f16x2 r2095, r43, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +cvt.rn.f16.f64 rs309, fd735; +} +mov.b32 r2103, {rs309, rs309}; +{ +mul.f16x2 r2101, r49, r2103; +} +{ +add.f16x2 r2104, r2080, r2101; +} +{ +cvt.rn.f16.f64 rs310, fd708; +} +mov.b32 r2109, {rs310, rs310}; +{ +mul.f16x2 r2107, r58, r2109; +} +{ +add.f16x2 r2110, r2086, r2107; +} +{ +cvt.rn.f16.f64 rs311, fd735; +} +mov.b32 r2115, {rs311, rs311}; +{ +mul.f16x2 r2113, r52, r2115; +} +{ +add.f16x2 r2116, r2092, r2113; +} +{ +cvt.rn.f16.f64 rs312, fd708; +} +mov.b32 r2121, {rs312, rs312}; +{ +mul.f16x2 r2119, r55, r2121; +} +{ +add.f16x2 r2122, r2098, r2119; +} +{ +cvt.rn.f16.f64 rs313, fd783; +} +mov.b32 r2127, {rs313, rs313}; +{ 
+mul.f16x2 r2125, r61, r2127; +} +{ +add.f16x2 r2128, r2104, r2125; +} +{ +cvt.rn.f16.f64 rs314, fd692; +} +mov.b32 r2133, {rs314, rs314}; +{ +mul.f16x2 r2131, r70, r2133; +} +{ +add.f16x2 r2134, r2110, r2131; +} +{ +cvt.rn.f16.f64 rs315, fd783; +} +mov.b32 r2139, {rs315, rs315}; +{ +mul.f16x2 r2137, r64, r2139; +} +{ +add.f16x2 r2140, r2116, r2137; +} +{ +cvt.rn.f16.f64 rs316, fd692; +} +mov.b32 r2145, {rs316, rs316}; +{ +mul.f16x2 r2143, r67, r2145; +} +{ +add.f16x2 r2146, r2122, r2143; +} +{ +cvt.rn.f16.f64 rs317, fd739; +} +mov.b32 r2151, {rs317, rs317}; +{ +mul.f16x2 r2149, r73, r2151; +} +{ +add.f16x2 r2152, r2128, r2149; +} +{ +cvt.rn.f16.f64 rs318, fd740; +} +mov.b32 r2157, {rs318, rs318}; +{ +mul.f16x2 r2155, r82, r2157; +} +{ +add.f16x2 r2158, r2134, r2155; +} +{ +cvt.rn.f16.f64 rs319, fd739; +} +mov.b32 r2163, {rs319, rs319}; +{ +mul.f16x2 r2161, r76, r2163; +} +{ +add.f16x2 r2164, r2140, r2161; +} +{ +cvt.rn.f16.f64 rs320, fd740; +} +mov.b32 r2169, {rs320, rs320}; +{ +mul.f16x2 r2167, r79, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs321, fd763; +} +mov.b32 r2175, {rs321, rs321}; +{ +mul.f16x2 r2173, r85, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs322, fd632; +} +mov.b32 r2181, {rs322, rs322}; +{ +mul.f16x2 r2179, r94, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs323, fd763; +} +mov.b32 r2187, {rs323, rs323}; +{ +mul.f16x2 r2185, r88, r2187; +} +{ +add.f16x2 r2188, r2164, r2185; +} +{ +cvt.rn.f16.f64 rs324, fd632; +} +mov.b32 r2193, {rs324, rs324}; +{ +mul.f16x2 r2191, r91, r2193; +} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs325, fd759; +} +mov.b32 r2199, {rs325, rs325}; +{ +mul.f16x2 r2197, r97, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs326, fd760; +} +mov.b32 r2205, {rs326, rs326}; +{ +mul.f16x2 r2203, r106, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs327, fd759; +} +mov.b32 r2211, {rs327, rs327}; +{ +mul.f16x2 
r2209, r100, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs328, fd760; +} +mov.b32 r2217, {rs328, rs328}; +{ +mul.f16x2 r2215, r103, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +cvt.rn.f16.f64 rs329, fd743; +} +mov.b32 r2223, {rs329, rs329}; +{ +mul.f16x2 r2221, r109, r2223; +} +{ +add.f16x2 r2224, r2200, r2221; +} +{ +cvt.rn.f16.f64 rs330, fd636; +} +mov.b32 r2229, {rs330, rs330}; +{ +mul.f16x2 r2227, r118, r2229; +} +{ +add.f16x2 r2230, r2206, r2227; +} +{ +cvt.rn.f16.f64 rs331, fd743; +} +mov.b32 r2235, {rs331, rs331}; +{ +mul.f16x2 r2233, r112, r2235; +} +{ +add.f16x2 r2236, r2212, r2233; +} +{ +cvt.rn.f16.f64 rs332, fd636; +} +mov.b32 r2241, {rs332, rs332}; +{ +mul.f16x2 r2239, r115, r2241; +} +{ +add.f16x2 r2242, r2218, r2239; +} +{ +cvt.rn.f16.f64 rs333, fd779; +} +mov.b32 r2247, {rs333, rs333}; +{ +mul.f16x2 r2245, r121, r2247; +} +{ +add.f16x2 r2248, r2224, r2245; +} +{ +cvt.rn.f16.f64 rs334, fd780; +} +mov.b32 r2253, {rs334, rs334}; +{ +mul.f16x2 r2251, r130, r2253; +} +{ +add.f16x2 r2254, r2230, r2251; +} +{ +cvt.rn.f16.f64 rs335, fd779; +} +mov.b32 r2259, {rs335, rs335}; +{ +mul.f16x2 r2257, r124, r2259; +} +{ +add.f16x2 r2260, r2236, r2257; +} +{ +cvt.rn.f16.f64 rs336, fd780; +} +mov.b32 r2265, {rs336, rs336}; +{ +mul.f16x2 r2263, r127, r2265; +} +{ +add.f16x2 r2266, r2242, r2263; +} +{ +cvt.rn.f16.f64 rs337, fd731; +} +mov.b32 r2271, {rs337, rs337}; +{ +mul.f16x2 r2269, r133, r2271; +} +{ +add.f16x2 r2272, r2248, r2269; +} +{ +cvt.rn.f16.f64 rs338, fd732; +} +mov.b32 r2277, {rs338, rs338}; +{ +mul.f16x2 r2275, r142, r2277; +} +{ +add.f16x2 r2278, r2254, r2275; +} +{ +cvt.rn.f16.f64 rs339, fd731; +} +mov.b32 r2283, {rs339, rs339}; +{ +mul.f16x2 r2281, r136, r2283; +} +{ +add.f16x2 r2284, r2260, r2281; +} +{ +cvt.rn.f16.f64 rs340, fd732; +} +mov.b32 r2289, {rs340, rs340}; +{ +mul.f16x2 r2287, r139, r2289; +} +{ +add.f16x2 r2290, r2266, r2287; +} +{ +cvt.rn.f16.f64 rs341, fd771; +} +mov.b32 r2295, {rs341, rs341}; +{ 
+mul.f16x2 r2293, r145, r2295; +} +{ +add.f16x2 r2296, r2272, r2293; +} +{ +cvt.rn.f16.f64 rs342, fd696; +} +mov.b32 r2301, {rs342, rs342}; +{ +mul.f16x2 r2299, r154, r2301; +} +{ +add.f16x2 r2302, r2278, r2299; +} +{ +cvt.rn.f16.f64 rs343, fd771; +} +mov.b32 r2307, {rs343, rs343}; +{ +mul.f16x2 r2305, r148, r2307; +} +{ +add.f16x2 r2308, r2284, r2305; +} +{ +cvt.rn.f16.f64 rs344, fd696; +} +mov.b32 r2313, {rs344, rs344}; +{ +mul.f16x2 r2311, r151, r2313; +} +{ +add.f16x2 r2314, r2290, r2311; +} +{ +cvt.rn.f16.f64 rs345, fd751; +} +mov.b32 r2319, {rs345, rs345}; +{ +mul.f16x2 r2317, r157, r2319; +} +{ +add.f16x2 r2320, r2296, r2317; +} +{ +cvt.rn.f16.f64 rs346, fd752; +} +mov.b32 r2325, {rs346, rs346}; +{ +mul.f16x2 r2323, r166, r2325; +} +{ +add.f16x2 r2326, r2302, r2323; +} +{ +cvt.rn.f16.f64 rs347, fd751; +} +mov.b32 r2331, {rs347, rs347}; +{ +mul.f16x2 r2329, r160, r2331; +} +{ +add.f16x2 r2332, r2308, r2329; +} +{ +cvt.rn.f16.f64 rs348, fd752; +} +mov.b32 r2337, {rs348, rs348}; +{ +mul.f16x2 r2335, r163, r2337; +} +{ +add.f16x2 r2338, r2314, r2335; +} +{ +sub.f16x2 %12, r2320, r2326; +} +{ +add.f16x2 %13, r2332, r2338; +} +{ +add.f16x2 %46, r2320, r2326; +} +{ +sub.f16x2 %47, r2332, r2338; +} +cvt.rn.f16.s32 rs349, r4804; +mov.b32 r2365, {rs349, rs349}; +cvt.rn.f16.s32 rs350, r4804; +mov.b32 r2377, {rs350, rs350}; +{ +cvt.rn.f16.f64 rs351, fd783; +} +mov.b32 r2357, {rs351, rs351}; +{ +mul.f16x2 r2355, r1, r2357; +} +{ +add.f16x2 r2358, %58, r2355; +} +{ +cvt.rn.f16.f64 rs352, fd692; +} +mov.b32 r2363, {rs352, rs352}; +{ +mul.f16x2 r2361, r10, r2363; +} +{ +add.f16x2 r2364, r2365, r2361; +} +{ +cvt.rn.f16.f64 rs353, fd783; +} +mov.b32 r2369, {rs353, rs353}; +{ +mul.f16x2 r2367, r4, r2369; +} +{ +add.f16x2 r2370, %59, r2367; +} +{ +cvt.rn.f16.f64 rs354, fd692; +} +mov.b32 r2375, {rs354, rs354}; +{ +mul.f16x2 r2373, r7, r2375; +} +{ +add.f16x2 r2376, r2377, r2373; +} +{ +cvt.rn.f16.f64 rs355, fd731; +} +mov.b32 r2381, {rs355, rs355}; +{ +mul.f16x2 r2379, r13, 
r2381; +} +{ +add.f16x2 r2382, r2358, r2379; +} +{ +cvt.rn.f16.f64 rs356, fd732; +} +mov.b32 r2387, {rs356, rs356}; +{ +mul.f16x2 r2385, r22, r2387; +} +{ +add.f16x2 r2388, r2364, r2385; +} +{ +cvt.rn.f16.f64 rs357, fd731; +} +mov.b32 r2393, {rs357, rs357}; +{ +mul.f16x2 r2391, r16, r2393; +} +{ +add.f16x2 r2394, r2370, r2391; +} +{ +cvt.rn.f16.f64 rs358, fd732; +} +mov.b32 r2399, {rs358, rs358}; +{ +mul.f16x2 r2397, r19, r2399; +} +{ +add.f16x2 r2400, r2376, r2397; +} +{ +cvt.rn.f16.f64 rs359, fd779; +} +mov.b32 r2405, {rs359, rs359}; +{ +mul.f16x2 r2403, r25, r2405; +} +{ +add.f16x2 r2406, r2382, r2403; +} +{ +cvt.rn.f16.f64 rs360, fd652; +} +mov.b32 r2411, {rs360, rs360}; +{ +mul.f16x2 r2409, r34, r2411; +} +{ +add.f16x2 r2412, r2388, r2409; +} +{ +cvt.rn.f16.f64 rs361, fd779; +} +mov.b32 r2417, {rs361, rs361}; +{ +mul.f16x2 r2415, r28, r2417; +} +{ +add.f16x2 r2418, r2394, r2415; +} +{ +cvt.rn.f16.f64 rs362, fd652; +} +mov.b32 r2423, {rs362, rs362}; +{ +mul.f16x2 r2421, r31, r2423; +} +{ +add.f16x2 r2424, r2400, r2421; +} +{ +cvt.rn.f16.f64 rs363, fd735; +} +mov.b32 r2429, {rs363, rs363}; +{ +mul.f16x2 r2427, r37, r2429; +} +{ +add.f16x2 r2430, r2406, r2427; +} +{ +cvt.rn.f16.f64 rs364, fd736; +} +mov.b32 r2435, {rs364, rs364}; +{ +mul.f16x2 r2433, r46, r2435; +} +{ +add.f16x2 r2436, r2412, r2433; +} +{ +cvt.rn.f16.f64 rs365, fd735; +} +mov.b32 r2441, {rs365, rs365}; +{ +mul.f16x2 r2439, r40, r2441; +} +{ +add.f16x2 r2442, r2418, r2439; +} +{ +cvt.rn.f16.f64 rs366, fd736; +} +mov.b32 r2447, {rs366, rs366}; +{ +mul.f16x2 r2445, r43, r2447; +} +{ +add.f16x2 r2448, r2424, r2445; +} +{ +cvt.rn.f16.f64 rs367, fd775; +} +mov.b32 r2453, {rs367, rs367}; +{ +mul.f16x2 r2451, r49, r2453; +} +{ +add.f16x2 r2454, r2430, r2451; +} +{ +cvt.rn.f16.f64 rs368, fd424; +} +mov.b32 r2459, {rs368, rs368}; +{ +mul.f16x2 r2457, r58, r2459; +} +{ +add.f16x2 r2460, r2436, r2457; +} +{ +cvt.rn.f16.f64 rs369, fd775; +} +mov.b32 r2465, {rs369, rs369}; +{ +mul.f16x2 r2463, r52, r2465; +} 
+{ +add.f16x2 r2466, r2442, r2463; +} +{ +cvt.rn.f16.f64 rs370, fd424; +} +mov.b32 r2471, {rs370, rs370}; +{ +mul.f16x2 r2469, r55, r2471; +} +{ +add.f16x2 r2472, r2448, r2469; +} +{ +cvt.rn.f16.f64 rs371, fd739; +} +mov.b32 r2477, {rs371, rs371}; +{ +mul.f16x2 r2475, r61, r2477; +} +{ +add.f16x2 r2478, r2454, r2475; +} +{ +cvt.rn.f16.f64 rs372, fd740; +} +mov.b32 r2483, {rs372, rs372}; +{ +mul.f16x2 r2481, r70, r2483; +} +{ +add.f16x2 r2484, r2460, r2481; +} +{ +cvt.rn.f16.f64 rs373, fd739; +} +mov.b32 r2489, {rs373, rs373}; +{ +mul.f16x2 r2487, r64, r2489; +} +{ +add.f16x2 r2490, r2466, r2487; +} +{ +cvt.rn.f16.f64 rs374, fd740; +} +mov.b32 r2495, {rs374, rs374}; +{ +mul.f16x2 r2493, r67, r2495; +} +{ +add.f16x2 r2496, r2472, r2493; +} +{ +cvt.rn.f16.f64 rs375, fd771; +} +mov.b32 r2501, {rs375, rs375}; +{ +mul.f16x2 r2499, r73, r2501; +} +{ +add.f16x2 r2502, r2478, r2499; +} +{ +cvt.rn.f16.f64 rs376, fd696; +} +mov.b32 r2507, {rs376, rs376}; +{ +mul.f16x2 r2505, r82, r2507; +} +{ +add.f16x2 r2508, r2484, r2505; +} +{ +cvt.rn.f16.f64 rs377, fd771; +} +mov.b32 r2513, {rs377, rs377}; +{ +mul.f16x2 r2511, r76, r2513; +} +{ +add.f16x2 r2514, r2490, r2511; +} +{ +cvt.rn.f16.f64 rs378, fd696; +} +mov.b32 r2519, {rs378, rs378}; +{ +mul.f16x2 r2517, r79, r2519; +} +{ +add.f16x2 r2520, r2496, r2517; +} +{ +cvt.rn.f16.f64 rs379, fd743; +} +mov.b32 r2525, {rs379, rs379}; +{ +mul.f16x2 r2523, r85, r2525; +} +{ +add.f16x2 r2526, r2502, r2523; +} +{ +cvt.rn.f16.f64 rs380, fd744; +} +mov.b32 r2531, {rs380, rs380}; +{ +mul.f16x2 r2529, r94, r2531; +} +{ +add.f16x2 r2532, r2508, r2529; +} +{ +cvt.rn.f16.f64 rs381, fd743; +} +mov.b32 r2537, {rs381, rs381}; +{ +mul.f16x2 r2535, r88, r2537; +} +{ +add.f16x2 r2538, r2514, r2535; +} +{ +cvt.rn.f16.f64 rs382, fd744; +} +mov.b32 r2543, {rs382, rs382}; +{ +mul.f16x2 r2541, r91, r2543; +} +{ +add.f16x2 r2544, r2520, r2541; +} +{ +cvt.rn.f16.f64 rs383, fd767; +} +mov.b32 r2549, {rs383, rs383}; +{ +mul.f16x2 r2547, r97, r2549; +} +{ 
+add.f16x2 r2550, r2526, r2547; +} +{ +cvt.rn.f16.f64 rs384, fd604; +} +mov.b32 r2555, {rs384, rs384}; +{ +mul.f16x2 r2553, r106, r2555; +} +{ +add.f16x2 r2556, r2532, r2553; +} +{ +cvt.rn.f16.f64 rs385, fd767; +} +mov.b32 r2561, {rs385, rs385}; +{ +mul.f16x2 r2559, r100, r2561; +} +{ +add.f16x2 r2562, r2538, r2559; +} +{ +cvt.rn.f16.f64 rs386, fd604; +} +mov.b32 r2567, {rs386, rs386}; +{ +mul.f16x2 r2565, r103, r2567; +} +{ +add.f16x2 r2568, r2544, r2565; +} +{ +cvt.rn.f16.f64 rs387, fd747; +} +mov.b32 r2573, {rs387, rs387}; +{ +mul.f16x2 r2571, r109, r2573; +} +{ +add.f16x2 r2574, r2550, r2571; +} +{ +cvt.rn.f16.f64 rs388, fd748; +} +mov.b32 r2579, {rs388, rs388}; +{ +mul.f16x2 r2577, r118, r2579; +} +{ +add.f16x2 r2580, r2556, r2577; +} +{ +cvt.rn.f16.f64 rs389, fd747; +} +mov.b32 r2585, {rs389, rs389}; +{ +mul.f16x2 r2583, r112, r2585; +} +{ +add.f16x2 r2586, r2562, r2583; +} +{ +cvt.rn.f16.f64 rs390, fd748; +} +mov.b32 r2591, {rs390, rs390}; +{ +mul.f16x2 r2589, r115, r2591; +} +{ +add.f16x2 r2592, r2568, r2589; +} +{ +cvt.rn.f16.f64 rs391, fd763; +} +mov.b32 r2597, {rs391, rs391}; +{ +mul.f16x2 r2595, r121, r2597; +} +{ +add.f16x2 r2598, r2574, r2595; +} +{ +cvt.rn.f16.f64 rs392, fd632; +} +mov.b32 r2603, {rs392, rs392}; +{ +mul.f16x2 r2601, r130, r2603; +} +{ +add.f16x2 r2604, r2580, r2601; +} +{ +cvt.rn.f16.f64 rs393, fd763; +} +mov.b32 r2609, {rs393, rs393}; +{ +mul.f16x2 r2607, r124, r2609; +} +{ +add.f16x2 r2610, r2586, r2607; +} +{ +cvt.rn.f16.f64 rs394, fd632; +} +mov.b32 r2615, {rs394, rs394}; +{ +mul.f16x2 r2613, r127, r2615; +} +{ +add.f16x2 r2616, r2592, r2613; +} +{ +cvt.rn.f16.f64 rs395, fd751; +} +mov.b32 r2621, {rs395, rs395}; +{ +mul.f16x2 r2619, r133, r2621; +} +{ +add.f16x2 r2622, r2598, r2619; +} +{ +cvt.rn.f16.f64 rs396, fd752; +} +mov.b32 r2627, {rs396, rs396}; +{ +mul.f16x2 r2625, r142, r2627; +} +{ +add.f16x2 r2628, r2604, r2625; +} +{ +cvt.rn.f16.f64 rs397, fd751; +} +mov.b32 r2633, {rs397, rs397}; +{ +mul.f16x2 r2631, r136, r2633; +} 
+{ +add.f16x2 r2634, r2610, r2631; +} +{ +cvt.rn.f16.f64 rs398, fd752; +} +mov.b32 r2639, {rs398, rs398}; +{ +mul.f16x2 r2637, r139, r2639; +} +{ +add.f16x2 r2640, r2616, r2637; +} +{ +cvt.rn.f16.f64 rs399, fd759; +} +mov.b32 r2645, {rs399, rs399}; +{ +mul.f16x2 r2643, r145, r2645; +} +{ +add.f16x2 r2646, r2622, r2643; +} +{ +cvt.rn.f16.f64 rs400, fd700; +} +mov.b32 r2651, {rs400, rs400}; +{ +mul.f16x2 r2649, r154, r2651; +} +{ +add.f16x2 r2652, r2628, r2649; +} +{ +cvt.rn.f16.f64 rs401, fd759; +} +mov.b32 r2657, {rs401, rs401}; +{ +mul.f16x2 r2655, r148, r2657; +} +{ +add.f16x2 r2658, r2634, r2655; +} +{ +cvt.rn.f16.f64 rs402, fd700; +} +mov.b32 r2663, {rs402, rs402}; +{ +mul.f16x2 r2661, r151, r2663; +} +{ +add.f16x2 r2664, r2640, r2661; +} +{ +cvt.rn.f16.f64 rs403, fd755; +} +mov.b32 r2669, {rs403, rs403}; +{ +mul.f16x2 r2667, r157, r2669; +} +{ +add.f16x2 r2670, r2646, r2667; +} +{ +cvt.rn.f16.f64 rs404, fd756; +} +mov.b32 r2675, {rs404, rs404}; +{ +mul.f16x2 r2673, r166, r2675; +} +{ +add.f16x2 r2676, r2652, r2673; +} +{ +cvt.rn.f16.f64 rs405, fd755; +} +mov.b32 r2681, {rs405, rs405}; +{ +mul.f16x2 r2679, r160, r2681; +} +{ +add.f16x2 r2682, r2658, r2679; +} +{ +cvt.rn.f16.f64 rs406, fd756; +} +mov.b32 r2687, {rs406, rs406}; +{ +mul.f16x2 r2685, r163, r2687; +} +{ +add.f16x2 r2688, r2664, r2685; +} +{ +sub.f16x2 %14, r2670, r2676; +} +{ +add.f16x2 %15, r2682, r2688; +} +{ +add.f16x2 %44, r2670, r2676; +} +{ +sub.f16x2 %45, r2682, r2688; +} +cvt.rn.f16.s32 rs407, r4804; +mov.b32 r2715, {rs407, rs407}; +cvt.rn.f16.s32 rs408, r4804; +mov.b32 r2727, {rs408, rs408}; +{ +cvt.rn.f16.f64 rs409, fd779; +} +mov.b32 r2707, {rs409, rs409}; +{ +mul.f16x2 r2705, r1, r2707; +} +{ +add.f16x2 r2708, %58, r2705; +} +{ +cvt.rn.f16.f64 rs410, fd780; +} +mov.b32 r2713, {rs410, rs410}; +{ +mul.f16x2 r2711, r10, r2713; +} +{ +add.f16x2 r2714, r2715, r2711; +} +{ +cvt.rn.f16.f64 rs411, fd779; +} +mov.b32 r2719, {rs411, rs411}; +{ +mul.f16x2 r2717, r4, r2719; +} +{ +add.f16x2 r2720, 
%59, r2717; +} +{ +cvt.rn.f16.f64 rs412, fd780; +} +mov.b32 r2725, {rs412, rs412}; +{ +mul.f16x2 r2723, r7, r2725; +} +{ +add.f16x2 r2726, r2727, r2723; +} +{ +cvt.rn.f16.f64 rs413, fd739; +} +mov.b32 r2731, {rs413, rs413}; +{ +mul.f16x2 r2729, r13, r2731; +} +{ +add.f16x2 r2732, r2708, r2729; +} +{ +cvt.rn.f16.f64 rs414, fd660; +} +mov.b32 r2737, {rs414, rs414}; +{ +mul.f16x2 r2735, r22, r2737; +} +{ +add.f16x2 r2738, r2714, r2735; +} +{ +cvt.rn.f16.f64 rs415, fd739; +} +mov.b32 r2743, {rs415, rs415}; +{ +mul.f16x2 r2741, r16, r2743; +} +{ +add.f16x2 r2744, r2720, r2741; +} +{ +cvt.rn.f16.f64 rs416, fd660; +} +mov.b32 r2749, {rs416, rs416}; +{ +mul.f16x2 r2747, r19, r2749; +} +{ +add.f16x2 r2750, r2726, r2747; +} +{ +cvt.rn.f16.f64 rs417, fd767; +} +mov.b32 r2755, {rs417, rs417}; +{ +mul.f16x2 r2753, r25, r2755; +} +{ +add.f16x2 r2756, r2732, r2753; +} +{ +cvt.rn.f16.f64 rs418, fd768; +} +mov.b32 r2761, {rs418, rs418}; +{ +mul.f16x2 r2759, r34, r2761; +} +{ +add.f16x2 r2762, r2738, r2759; +} +{ +cvt.rn.f16.f64 rs419, fd767; +} +mov.b32 r2767, {rs419, rs419}; +{ +mul.f16x2 r2765, r28, r2767; +} +{ +add.f16x2 r2768, r2744, r2765; +} +{ +cvt.rn.f16.f64 rs420, fd768; +} +mov.b32 r2773, {rs420, rs420}; +{ +mul.f16x2 r2771, r31, r2773; +} +{ +add.f16x2 r2774, r2750, r2771; +} +{ +cvt.rn.f16.f64 rs421, fd751; +} +mov.b32 r2779, {rs421, rs421}; +{ +mul.f16x2 r2777, r37, r2779; +} +{ +add.f16x2 r2780, r2756, r2777; +} +{ +cvt.rn.f16.f64 rs422, fd540; +} +mov.b32 r2785, {rs422, rs422}; +{ +mul.f16x2 r2783, r46, r2785; +} +{ +add.f16x2 r2786, r2762, r2783; +} +{ +cvt.rn.f16.f64 rs423, fd751; +} +mov.b32 r2791, {rs423, rs423}; +{ +mul.f16x2 r2789, r40, r2791; +} +{ +add.f16x2 r2792, r2768, r2789; +} +{ +cvt.rn.f16.f64 rs424, fd540; +} +mov.b32 r2797, {rs424, rs424}; +{ +mul.f16x2 r2795, r43, r2797; +} +{ +add.f16x2 r2798, r2774, r2795; +} +{ +cvt.rn.f16.f64 rs425, fd755; +} +mov.b32 r2803, {rs425, rs425}; +{ +mul.f16x2 r2801, r49, r2803; +} +{ +add.f16x2 r2804, r2780, r2801; 
+} +{ +cvt.rn.f16.f64 rs426, fd756; +} +mov.b32 r2809, {rs426, rs426}; +{ +mul.f16x2 r2807, r58, r2809; +} +{ +add.f16x2 r2810, r2786, r2807; +} +{ +cvt.rn.f16.f64 rs427, fd755; +} +mov.b32 r2815, {rs427, rs427}; +{ +mul.f16x2 r2813, r52, r2815; +} +{ +add.f16x2 r2816, r2792, r2813; +} +{ +cvt.rn.f16.f64 rs428, fd756; +} +mov.b32 r2821, {rs428, rs428}; +{ +mul.f16x2 r2819, r55, r2821; +} +{ +add.f16x2 r2822, r2798, r2819; +} +{ +cvt.rn.f16.f64 rs429, fd763; +} +mov.b32 r2827, {rs429, rs429}; +{ +mul.f16x2 r2825, r61, r2827; +} +{ +add.f16x2 r2828, r2804, r2825; +} +{ +cvt.rn.f16.f64 rs430, fd632; +} +mov.b32 r2833, {rs430, rs430}; +{ +mul.f16x2 r2831, r70, r2833; +} +{ +add.f16x2 r2834, r2810, r2831; +} +{ +cvt.rn.f16.f64 rs431, fd763; +} +mov.b32 r2839, {rs431, rs431}; +{ +mul.f16x2 r2837, r64, r2839; +} +{ +add.f16x2 r2840, r2816, r2837; +} +{ +cvt.rn.f16.f64 rs432, fd632; +} +mov.b32 r2845, {rs432, rs432}; +{ +mul.f16x2 r2843, r67, r2845; +} +{ +add.f16x2 r2846, r2822, r2843; +} +{ +cvt.rn.f16.f64 rs433, fd743; +} +mov.b32 r2851, {rs433, rs433}; +{ +mul.f16x2 r2849, r73, r2851; +} +{ +add.f16x2 r2852, r2828, r2849; +} +{ +cvt.rn.f16.f64 rs434, fd744; +} +mov.b32 r2857, {rs434, rs434}; +{ +mul.f16x2 r2855, r82, r2857; +} +{ +add.f16x2 r2858, r2834, r2855; +} +{ +cvt.rn.f16.f64 rs435, fd743; +} +mov.b32 r2863, {rs435, rs435}; +{ +mul.f16x2 r2861, r76, r2863; +} +{ +add.f16x2 r2864, r2840, r2861; +} +{ +cvt.rn.f16.f64 rs436, fd744; +} +mov.b32 r2869, {rs436, rs436}; +{ +mul.f16x2 r2867, r79, r2869; +} +{ +add.f16x2 r2870, r2846, r2867; +} +{ +cvt.rn.f16.f64 rs437, fd775; +} +mov.b32 r2875, {rs437, rs437}; +{ +mul.f16x2 r2873, r85, r2875; +} +{ +add.f16x2 r2876, r2852, r2873; +} +{ +cvt.rn.f16.f64 rs438, fd424; +} +mov.b32 r2881, {rs438, rs438}; +{ +mul.f16x2 r2879, r94, r2881; +} +{ +add.f16x2 r2882, r2858, r2879; +} +{ +cvt.rn.f16.f64 rs439, fd775; +} +mov.b32 r2887, {rs439, rs439}; +{ +mul.f16x2 r2885, r88, r2887; +} +{ +add.f16x2 r2888, r2864, r2885; +} +{ 
+cvt.rn.f16.f64 rs440, fd424; +} +mov.b32 r2893, {rs440, rs440}; +{ +mul.f16x2 r2891, r91, r2893; +} +{ +add.f16x2 r2894, r2870, r2891; +} +{ +cvt.rn.f16.f64 rs441, fd731; +} +mov.b32 r2899, {rs441, rs441}; +{ +mul.f16x2 r2897, r97, r2899; +} +{ +add.f16x2 r2900, r2876, r2897; +} +{ +cvt.rn.f16.f64 rs442, fd732; +} +mov.b32 r2905, {rs442, rs442}; +{ +mul.f16x2 r2903, r106, r2905; +} +{ +add.f16x2 r2906, r2882, r2903; +} +{ +cvt.rn.f16.f64 rs443, fd731; +} +mov.b32 r2911, {rs443, rs443}; +{ +mul.f16x2 r2909, r100, r2911; +} +{ +add.f16x2 r2912, r2888, r2909; +} +{ +cvt.rn.f16.f64 rs444, fd732; +} +mov.b32 r2917, {rs444, rs444}; +{ +mul.f16x2 r2915, r103, r2917; +} +{ +add.f16x2 r2918, r2894, r2915; +} +{ +cvt.rn.f16.f64 rs445, fd783; +} +mov.b32 r2923, {rs445, rs445}; +{ +mul.f16x2 r2921, r109, r2923; +} +{ +add.f16x2 r2924, r2900, r2921; +} +{ +cvt.rn.f16.f64 rs446, fd784; +} +mov.b32 r2929, {rs446, rs446}; +{ +mul.f16x2 r2927, r118, r2929; +} +{ +add.f16x2 r2930, r2906, r2927; +} +{ +cvt.rn.f16.f64 rs447, fd783; +} +mov.b32 r2935, {rs447, rs447}; +{ +mul.f16x2 r2933, r112, r2935; +} +{ +add.f16x2 r2936, r2912, r2933; +} +{ +cvt.rn.f16.f64 rs448, fd784; +} +mov.b32 r2941, {rs448, rs448}; +{ +mul.f16x2 r2939, r115, r2941; +} +{ +add.f16x2 r2942, r2918, r2939; +} +{ +cvt.rn.f16.f64 rs449, fd735; +} +mov.b32 r2947, {rs449, rs449}; +{ +mul.f16x2 r2945, r121, r2947; +} +{ +add.f16x2 r2948, r2924, r2945; +} +{ +cvt.rn.f16.f64 rs450, fd708; +} +mov.b32 r2953, {rs450, rs450}; +{ +mul.f16x2 r2951, r130, r2953; +} +{ +add.f16x2 r2954, r2930, r2951; +} +{ +cvt.rn.f16.f64 rs451, fd735; +} +mov.b32 r2959, {rs451, rs451}; +{ +mul.f16x2 r2957, r124, r2959; +} +{ +add.f16x2 r2960, r2936, r2957; +} +{ +cvt.rn.f16.f64 rs452, fd708; +} +mov.b32 r2965, {rs452, rs452}; +{ +mul.f16x2 r2963, r127, r2965; +} +{ +add.f16x2 r2966, r2942, r2963; +} +{ +cvt.rn.f16.f64 rs453, fd771; +} +mov.b32 r2971, {rs453, rs453}; +{ +mul.f16x2 r2969, r133, r2971; +} +{ +add.f16x2 r2972, r2948, r2969; +} +{ 
+cvt.rn.f16.f64 rs454, fd772; +} +mov.b32 r2977, {rs454, rs454}; +{ +mul.f16x2 r2975, r142, r2977; +} +{ +add.f16x2 r2978, r2954, r2975; +} +{ +cvt.rn.f16.f64 rs455, fd771; +} +mov.b32 r2983, {rs455, rs455}; +{ +mul.f16x2 r2981, r136, r2983; +} +{ +add.f16x2 r2984, r2960, r2981; +} +{ +cvt.rn.f16.f64 rs456, fd772; +} +mov.b32 r2989, {rs456, rs456}; +{ +mul.f16x2 r2987, r139, r2989; +} +{ +add.f16x2 r2990, r2966, r2987; +} +{ +cvt.rn.f16.f64 rs457, fd747; +} +mov.b32 r2995, {rs457, rs457}; +{ +mul.f16x2 r2993, r145, r2995; +} +{ +add.f16x2 r2996, r2972, r2993; +} +mov.f64 fd704, 0d3FE07F6ACD7CDCE2; +{ +cvt.rn.f16.f64 rs458, fd704; +} +mov.b32 r3001, {rs458, rs458}; +{ +mul.f16x2 r2999, r154, r3001; +} +{ +add.f16x2 r3002, r2978, r2999; +} +{ +cvt.rn.f16.f64 rs459, fd747; +} +mov.b32 r3007, {rs459, rs459}; +{ +mul.f16x2 r3005, r148, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs460, fd704; +} +mov.b32 r3013, {rs460, rs460}; +{ +mul.f16x2 r3011, r151, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs461, fd759; +} +mov.b32 r3019, {rs461, rs461}; +{ +mul.f16x2 r3017, r157, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs462, fd760; +} +mov.b32 r3025, {rs462, rs462}; +{ +mul.f16x2 r3023, r166, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs463, fd759; +} +mov.b32 r3031, {rs463, rs463}; +{ +mul.f16x2 r3029, r160, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs464, fd760; +} +mov.b32 r3037, {rs464, rs464}; +{ +mul.f16x2 r3035, r163, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} +{ +sub.f16x2 %16, r3020, r3026; +} +{ +add.f16x2 %17, r3032, r3038; +} +{ +add.f16x2 %42, r3020, r3026; +} +{ +sub.f16x2 %43, r3032, r3038; +} +cvt.rn.f16.s32 rs465, r4804; +mov.b32 r3065, {rs465, rs465}; +cvt.rn.f16.s32 rs466, r4804; +mov.b32 r3077, {rs466, rs466}; +{ +cvt.rn.f16.f64 rs467, fd771; +} +mov.b32 r3057, {rs467, rs467}; +{ +mul.f16x2 r3055, r1, r3057; +} +{ +add.f16x2 r3058, 
%58, r3055; +} +{ +cvt.rn.f16.f64 rs468, fd772; +} +mov.b32 r3063, {rs468, rs468}; +{ +mul.f16x2 r3061, r10, r3063; +} +{ +add.f16x2 r3064, r3065, r3061; +} +{ +cvt.rn.f16.f64 rs469, fd771; +} +mov.b32 r3069, {rs469, rs469}; +{ +mul.f16x2 r3067, r4, r3069; +} +{ +add.f16x2 r3070, %59, r3067; +} +{ +cvt.rn.f16.f64 rs470, fd772; +} +mov.b32 r3075, {rs470, rs470}; +{ +mul.f16x2 r3073, r7, r3075; +} +{ +add.f16x2 r3076, r3077, r3073; +} +{ +cvt.rn.f16.f64 rs471, fd755; +} +mov.b32 r3081, {rs471, rs471}; +{ +mul.f16x2 r3079, r13, r3081; +} +{ +add.f16x2 r3082, r3058, r3079; +} +{ +cvt.rn.f16.f64 rs472, fd456; +} +mov.b32 r3087, {rs472, rs472}; +{ +mul.f16x2 r3085, r22, r3087; +} +{ +add.f16x2 r3088, r3064, r3085; +} +{ +cvt.rn.f16.f64 rs473, fd755; +} +mov.b32 r3093, {rs473, rs473}; +{ +mul.f16x2 r3091, r16, r3093; +} +{ +add.f16x2 r3094, r3070, r3091; +} +{ +cvt.rn.f16.f64 rs474, fd456; +} +mov.b32 r3099, {rs474, rs474}; +{ +mul.f16x2 r3097, r19, r3099; +} +{ +add.f16x2 r3100, r3076, r3097; +} +{ +cvt.rn.f16.f64 rs475, fd743; +} +mov.b32 r3105, {rs475, rs475}; +{ +mul.f16x2 r3103, r25, r3105; +} +{ +add.f16x2 r3106, r3082, r3103; +} +{ +cvt.rn.f16.f64 rs476, fd744; +} +mov.b32 r3111, {rs476, rs476}; +{ +mul.f16x2 r3109, r34, r3111; +} +{ +add.f16x2 r3112, r3088, r3109; +} +{ +cvt.rn.f16.f64 rs477, fd743; +} +mov.b32 r3117, {rs477, rs477}; +{ +mul.f16x2 r3115, r28, r3117; +} +{ +add.f16x2 r3118, r3094, r3115; +} +{ +cvt.rn.f16.f64 rs478, fd744; +} +mov.b32 r3123, {rs478, rs478}; +{ +mul.f16x2 r3121, r31, r3123; +} +{ +add.f16x2 r3124, r3100, r3121; +} +{ +cvt.rn.f16.f64 rs479, fd783; +} +mov.b32 r3129, {rs479, rs479}; +{ +mul.f16x2 r3127, r37, r3129; +} +{ +add.f16x2 r3130, r3106, r3127; +} +{ +cvt.rn.f16.f64 rs480, fd692; +} +mov.b32 r3135, {rs480, rs480}; +{ +mul.f16x2 r3133, r46, r3135; +} +{ +add.f16x2 r3136, r3112, r3133; +} +{ +cvt.rn.f16.f64 rs481, fd783; +} +mov.b32 r3141, {rs481, rs481}; +{ +mul.f16x2 r3139, r40, r3141; +} +{ +add.f16x2 r3142, r3118, r3139; +} 
+{ +cvt.rn.f16.f64 rs482, fd692; +} +mov.b32 r3147, {rs482, rs482}; +{ +mul.f16x2 r3145, r43, r3147; +} +{ +add.f16x2 r3148, r3124, r3145; +} +{ +cvt.rn.f16.f64 rs483, fd739; +} +mov.b32 r3153, {rs483, rs483}; +{ +mul.f16x2 r3151, r49, r3153; +} +{ +add.f16x2 r3154, r3130, r3151; +} +{ +cvt.rn.f16.f64 rs484, fd660; +} +mov.b32 r3159, {rs484, rs484}; +{ +mul.f16x2 r3157, r58, r3159; +} +{ +add.f16x2 r3160, r3136, r3157; +} +{ +cvt.rn.f16.f64 rs485, fd739; +} +mov.b32 r3165, {rs485, rs485}; +{ +mul.f16x2 r3163, r52, r3165; +} +{ +add.f16x2 r3166, r3142, r3163; +} +{ +cvt.rn.f16.f64 rs486, fd660; +} +mov.b32 r3171, {rs486, rs486}; +{ +mul.f16x2 r3169, r55, r3171; +} +{ +add.f16x2 r3172, r3148, r3169; +} +{ +cvt.rn.f16.f64 rs487, fd759; +} +mov.b32 r3177, {rs487, rs487}; +{ +mul.f16x2 r3175, r61, r3177; +} +{ +add.f16x2 r3178, r3154, r3175; +} +{ +cvt.rn.f16.f64 rs488, fd760; +} +mov.b32 r3183, {rs488, rs488}; +{ +mul.f16x2 r3181, r70, r3183; +} +{ +add.f16x2 r3184, r3160, r3181; +} +{ +cvt.rn.f16.f64 rs489, fd759; +} +mov.b32 r3189, {rs489, rs489}; +{ +mul.f16x2 r3187, r64, r3189; +} +{ +add.f16x2 r3190, r3166, r3187; +} +{ +cvt.rn.f16.f64 rs490, fd760; +} +mov.b32 r3195, {rs490, rs490}; +{ +mul.f16x2 r3193, r67, r3195; +} +{ +add.f16x2 r3196, r3172, r3193; +} +{ +cvt.rn.f16.f64 rs491, fd767; +} +mov.b32 r3201, {rs491, rs491}; +{ +mul.f16x2 r3199, r73, r3201; +} +{ +add.f16x2 r3202, r3178, r3199; +} +{ +cvt.rn.f16.f64 rs492, fd604; +} +mov.b32 r3207, {rs492, rs492}; +{ +mul.f16x2 r3205, r82, r3207; +} +{ +add.f16x2 r3208, r3184, r3205; +} +{ +cvt.rn.f16.f64 rs493, fd767; +} +mov.b32 r3213, {rs493, rs493}; +{ +mul.f16x2 r3211, r76, r3213; +} +{ +add.f16x2 r3214, r3190, r3211; +} +{ +cvt.rn.f16.f64 rs494, fd604; +} +mov.b32 r3219, {rs494, rs494}; +{ +mul.f16x2 r3217, r79, r3219; +} +{ +add.f16x2 r3220, r3196, r3217; +} +{ +cvt.rn.f16.f64 rs495, fd731; +} +mov.b32 r3225, {rs495, rs495}; +{ +mul.f16x2 r3223, r85, r3225; +} +{ +add.f16x2 r3226, r3202, r3223; +} +{ 
+cvt.rn.f16.f64 rs496, fd732; +} +mov.b32 r3231, {rs496, rs496}; +{ +mul.f16x2 r3229, r94, r3231; +} +{ +add.f16x2 r3232, r3208, r3229; +} +{ +cvt.rn.f16.f64 rs497, fd731; +} +mov.b32 r3237, {rs497, rs497}; +{ +mul.f16x2 r3235, r88, r3237; +} +{ +add.f16x2 r3238, r3214, r3235; +} +{ +cvt.rn.f16.f64 rs498, fd732; +} +mov.b32 r3243, {rs498, rs498}; +{ +mul.f16x2 r3241, r91, r3243; +} +{ +add.f16x2 r3244, r3220, r3241; +} +{ +cvt.rn.f16.f64 rs499, fd775; +} +mov.b32 r3249, {rs499, rs499}; +{ +mul.f16x2 r3247, r97, r3249; +} +{ +add.f16x2 r3250, r3226, r3247; +} +{ +cvt.rn.f16.f64 rs500, fd776; +} +mov.b32 r3255, {rs500, rs500}; +{ +mul.f16x2 r3253, r106, r3255; +} +{ +add.f16x2 r3256, r3232, r3253; +} +{ +cvt.rn.f16.f64 rs501, fd775; +} +mov.b32 r3261, {rs501, rs501}; +{ +mul.f16x2 r3259, r100, r3261; +} +{ +add.f16x2 r3262, r3238, r3259; +} +{ +cvt.rn.f16.f64 rs502, fd776; +} +mov.b32 r3267, {rs502, rs502}; +{ +mul.f16x2 r3265, r103, r3267; +} +{ +add.f16x2 r3268, r3244, r3265; +} +{ +cvt.rn.f16.f64 rs503, fd751; +} +mov.b32 r3273, {rs503, rs503}; +{ +mul.f16x2 r3271, r109, r3273; +} +{ +add.f16x2 r3274, r3250, r3271; +} +{ +cvt.rn.f16.f64 rs504, fd540; +} +mov.b32 r3279, {rs504, rs504}; +{ +mul.f16x2 r3277, r118, r3279; +} +{ +add.f16x2 r3280, r3256, r3277; +} +{ +cvt.rn.f16.f64 rs505, fd751; +} +mov.b32 r3285, {rs505, rs505}; +{ +mul.f16x2 r3283, r112, r3285; +} +{ +add.f16x2 r3286, r3262, r3283; +} +{ +cvt.rn.f16.f64 rs506, fd540; +} +mov.b32 r3291, {rs506, rs506}; +{ +mul.f16x2 r3289, r115, r3291; +} +{ +add.f16x2 r3292, r3268, r3289; +} +{ +cvt.rn.f16.f64 rs507, fd747; +} +mov.b32 r3297, {rs507, rs507}; +{ +mul.f16x2 r3295, r121, r3297; +} +{ +add.f16x2 r3298, r3274, r3295; +} +{ +cvt.rn.f16.f64 rs508, fd748; +} +mov.b32 r3303, {rs508, rs508}; +{ +mul.f16x2 r3301, r130, r3303; +} +{ +add.f16x2 r3304, r3280, r3301; +} +{ +cvt.rn.f16.f64 rs509, fd747; +} +mov.b32 r3309, {rs509, rs509}; +{ +mul.f16x2 r3307, r124, r3309; +} +{ +add.f16x2 r3310, r3286, r3307; +} +{ 
+cvt.rn.f16.f64 rs510, fd748; +} +mov.b32 r3315, {rs510, rs510}; +{ +mul.f16x2 r3313, r127, r3315; +} +{ +add.f16x2 r3316, r3292, r3313; +} +{ +cvt.rn.f16.f64 rs511, fd779; +} +mov.b32 r3321, {rs511, rs511}; +{ +mul.f16x2 r3319, r133, r3321; +} +{ +add.f16x2 r3322, r3298, r3319; +} +{ +cvt.rn.f16.f64 rs512, fd652; +} +mov.b32 r3327, {rs512, rs512}; +{ +mul.f16x2 r3325, r142, r3327; +} +{ +add.f16x2 r3328, r3304, r3325; +} +{ +cvt.rn.f16.f64 rs513, fd779; +} +mov.b32 r3333, {rs513, rs513}; +{ +mul.f16x2 r3331, r136, r3333; +} +{ +add.f16x2 r3334, r3310, r3331; +} +{ +cvt.rn.f16.f64 rs514, fd652; +} +mov.b32 r3339, {rs514, rs514}; +{ +mul.f16x2 r3337, r139, r3339; +} +{ +add.f16x2 r3340, r3316, r3337; +} +{ +cvt.rn.f16.f64 rs515, fd735; +} +mov.b32 r3345, {rs515, rs515}; +{ +mul.f16x2 r3343, r145, r3345; +} +{ +add.f16x2 r3346, r3322, r3343; +} +{ +cvt.rn.f16.f64 rs516, fd708; +} +mov.b32 r3351, {rs516, rs516}; +{ +mul.f16x2 r3349, r154, r3351; +} +{ +add.f16x2 r3352, r3328, r3349; +} +{ +cvt.rn.f16.f64 rs517, fd735; +} +mov.b32 r3357, {rs517, rs517}; +{ +mul.f16x2 r3355, r148, r3357; +} +{ +add.f16x2 r3358, r3334, r3355; +} +{ +cvt.rn.f16.f64 rs518, fd708; +} +mov.b32 r3363, {rs518, rs518}; +{ +mul.f16x2 r3361, r151, r3363; +} +{ +add.f16x2 r3364, r3340, r3361; +} +{ +cvt.rn.f16.f64 rs519, fd763; +} +mov.b32 r3369, {rs519, rs519}; +{ +mul.f16x2 r3367, r157, r3369; +} +{ +add.f16x2 r3370, r3346, r3367; +} +{ +cvt.rn.f16.f64 rs520, fd764; +} +mov.b32 r3375, {rs520, rs520}; +{ +mul.f16x2 r3373, r166, r3375; +} +{ +add.f16x2 r3376, r3352, r3373; +} +{ +cvt.rn.f16.f64 rs521, fd763; +} +mov.b32 r3381, {rs521, rs521}; +{ +mul.f16x2 r3379, r160, r3381; +} +{ +add.f16x2 r3382, r3358, r3379; +} +{ +cvt.rn.f16.f64 rs522, fd764; +} +mov.b32 r3387, {rs522, rs522}; +{ +mul.f16x2 r3385, r163, r3387; +} +{ +add.f16x2 r3388, r3364, r3385; +} +{ +sub.f16x2 %18, r3370, r3376; +} +{ +add.f16x2 %19, r3382, r3388; +} +{ +add.f16x2 %40, r3370, r3376; +} +{ +sub.f16x2 %41, r3382, r3388; +} 
+cvt.rn.f16.s32 rs523, r4804; +mov.b32 r3415, {rs523, rs523}; +cvt.rn.f16.s32 rs524, r4804; +mov.b32 r3427, {rs524, rs524}; +{ +cvt.rn.f16.f64 rs525, fd763; +} +mov.b32 r3407, {rs525, rs525}; +{ +mul.f16x2 r3405, r1, r3407; +} +{ +add.f16x2 r3408, %58, r3405; +} +{ +cvt.rn.f16.f64 rs526, fd764; +} +mov.b32 r3413, {rs526, rs526}; +{ +mul.f16x2 r3411, r10, r3413; +} +{ +add.f16x2 r3414, r3415, r3411; +} +{ +cvt.rn.f16.f64 rs527, fd763; +} +mov.b32 r3419, {rs527, rs527}; +{ +mul.f16x2 r3417, r4, r3419; +} +{ +add.f16x2 r3420, %59, r3417; +} +{ +cvt.rn.f16.f64 rs528, fd764; +} +mov.b32 r3425, {rs528, rs528}; +{ +mul.f16x2 r3423, r7, r3425; +} +{ +add.f16x2 r3426, r3427, r3423; +} +{ +cvt.rn.f16.f64 rs529, fd771; +} +mov.b32 r3431, {rs529, rs529}; +{ +mul.f16x2 r3429, r13, r3431; +} +{ +add.f16x2 r3432, r3408, r3429; +} +{ +cvt.rn.f16.f64 rs530, fd696; +} +mov.b32 r3437, {rs530, rs530}; +{ +mul.f16x2 r3435, r22, r3437; +} +{ +add.f16x2 r3438, r3414, r3435; +} +{ +cvt.rn.f16.f64 rs531, fd771; +} +mov.b32 r3443, {rs531, rs531}; +{ +mul.f16x2 r3441, r16, r3443; +} +{ +add.f16x2 r3444, r3420, r3441; +} +{ +cvt.rn.f16.f64 rs532, fd696; +} +mov.b32 r3449, {rs532, rs532}; +{ +mul.f16x2 r3447, r19, r3449; +} +{ +add.f16x2 r3450, r3426, r3447; +} +{ +cvt.rn.f16.f64 rs533, fd735; +} +mov.b32 r3455, {rs533, rs533}; +{ +mul.f16x2 r3453, r25, r3455; +} +{ +add.f16x2 r3456, r3432, r3453; +} +{ +cvt.rn.f16.f64 rs534, fd708; +} +mov.b32 r3461, {rs534, rs534}; +{ +mul.f16x2 r3459, r34, r3461; +} +{ +add.f16x2 r3462, r3438, r3459; +} +{ +cvt.rn.f16.f64 rs535, fd735; +} +mov.b32 r3467, {rs535, rs535}; +{ +mul.f16x2 r3465, r28, r3467; +} +{ +add.f16x2 r3468, r3444, r3465; +} +{ +cvt.rn.f16.f64 rs536, fd708; +} +mov.b32 r3473, {rs536, rs536}; +{ +mul.f16x2 r3471, r31, r3473; +} +{ +add.f16x2 r3474, r3450, r3471; +} +{ +cvt.rn.f16.f64 rs537, fd755; +} +mov.b32 r3479, {rs537, rs537}; +{ +mul.f16x2 r3477, r37, r3479; +} +{ +add.f16x2 r3480, r3456, r3477; +} +{ +cvt.rn.f16.f64 rs538, fd756; +} 
+mov.b32 r3485, {rs538, rs538}; +{ +mul.f16x2 r3483, r46, r3485; +} +{ +add.f16x2 r3486, r3462, r3483; +} +{ +cvt.rn.f16.f64 rs539, fd755; +} +mov.b32 r3491, {rs539, rs539}; +{ +mul.f16x2 r3489, r40, r3491; +} +{ +add.f16x2 r3492, r3468, r3489; +} +{ +cvt.rn.f16.f64 rs540, fd756; +} +mov.b32 r3497, {rs540, rs540}; +{ +mul.f16x2 r3495, r43, r3497; +} +{ +add.f16x2 r3498, r3474, r3495; +} +{ +cvt.rn.f16.f64 rs541, fd779; +} +mov.b32 r3503, {rs541, rs541}; +{ +mul.f16x2 r3501, r49, r3503; +} +{ +add.f16x2 r3504, r3480, r3501; +} +{ +cvt.rn.f16.f64 rs542, fd652; +} +mov.b32 r3509, {rs542, rs542}; +{ +mul.f16x2 r3507, r58, r3509; +} +{ +add.f16x2 r3510, r3486, r3507; +} +{ +cvt.rn.f16.f64 rs543, fd779; +} +mov.b32 r3515, {rs543, rs543}; +{ +mul.f16x2 r3513, r52, r3515; +} +{ +add.f16x2 r3516, r3492, r3513; +} +{ +cvt.rn.f16.f64 rs544, fd652; +} +mov.b32 r3521, {rs544, rs544}; +{ +mul.f16x2 r3519, r55, r3521; +} +{ +add.f16x2 r3522, r3498, r3519; +} +{ +cvt.rn.f16.f64 rs545, fd743; +} +mov.b32 r3527, {rs545, rs545}; +{ +mul.f16x2 r3525, r61, r3527; +} +{ +add.f16x2 r3528, r3504, r3525; +} +{ +cvt.rn.f16.f64 rs546, fd636; +} +mov.b32 r3533, {rs546, rs546}; +{ +mul.f16x2 r3531, r70, r3533; +} +{ +add.f16x2 r3534, r3510, r3531; +} +{ +cvt.rn.f16.f64 rs547, fd743; +} +mov.b32 r3539, {rs547, rs547}; +{ +mul.f16x2 r3537, r64, r3539; +} +{ +add.f16x2 r3540, r3516, r3537; +} +{ +cvt.rn.f16.f64 rs548, fd636; +} +mov.b32 r3545, {rs548, rs548}; +{ +mul.f16x2 r3543, r67, r3545; +} +{ +add.f16x2 r3546, r3522, r3543; +} +{ +cvt.rn.f16.f64 rs549, fd747; +} +mov.b32 r3551, {rs549, rs549}; +{ +mul.f16x2 r3549, r73, r3551; +} +{ +add.f16x2 r3552, r3528, r3549; +} +{ +cvt.rn.f16.f64 rs550, fd748; +} +mov.b32 r3557, {rs550, rs550}; +{ +mul.f16x2 r3555, r82, r3557; +} +{ +add.f16x2 r3558, r3534, r3555; +} +{ +cvt.rn.f16.f64 rs551, fd747; +} +mov.b32 r3563, {rs551, rs551}; +{ +mul.f16x2 r3561, r76, r3563; +} +{ +add.f16x2 r3564, r3540, r3561; +} +{ +cvt.rn.f16.f64 rs552, fd748; +} +mov.b32 
r3569, {rs552, rs552}; +{ +mul.f16x2 r3567, r79, r3569; +} +{ +add.f16x2 r3570, r3546, r3567; +} +{ +cvt.rn.f16.f64 rs553, fd783; +} +mov.b32 r3575, {rs553, rs553}; +{ +mul.f16x2 r3573, r85, r3575; +} +{ +add.f16x2 r3576, r3552, r3573; +} +{ +cvt.rn.f16.f64 rs554, fd784; +} +mov.b32 r3581, {rs554, rs554}; +{ +mul.f16x2 r3579, r94, r3581; +} +{ +add.f16x2 r3582, r3558, r3579; +} +{ +cvt.rn.f16.f64 rs555, fd783; +} +mov.b32 r3587, {rs555, rs555}; +{ +mul.f16x2 r3585, r88, r3587; +} +{ +add.f16x2 r3588, r3564, r3585; +} +{ +cvt.rn.f16.f64 rs556, fd784; +} +mov.b32 r3593, {rs556, rs556}; +{ +mul.f16x2 r3591, r91, r3593; +} +{ +add.f16x2 r3594, r3570, r3591; +} +{ +cvt.rn.f16.f64 rs557, fd751; +} +mov.b32 r3599, {rs557, rs557}; +{ +mul.f16x2 r3597, r97, r3599; +} +{ +add.f16x2 r3600, r3576, r3597; +} +{ +cvt.rn.f16.f64 rs558, fd540; +} +mov.b32 r3605, {rs558, rs558}; +{ +mul.f16x2 r3603, r106, r3605; +} +{ +add.f16x2 r3606, r3582, r3603; +} +{ +cvt.rn.f16.f64 rs559, fd751; +} +mov.b32 r3611, {rs559, rs559}; +{ +mul.f16x2 r3609, r100, r3611; +} +{ +add.f16x2 r3612, r3588, r3609; +} +{ +cvt.rn.f16.f64 rs560, fd540; +} +mov.b32 r3617, {rs560, rs560}; +{ +mul.f16x2 r3615, r103, r3617; +} +{ +add.f16x2 r3618, r3594, r3615; +} +{ +cvt.rn.f16.f64 rs561, fd739; +} +mov.b32 r3623, {rs561, rs561}; +{ +mul.f16x2 r3621, r109, r3623; +} +{ +add.f16x2 r3624, r3600, r3621; +} +{ +cvt.rn.f16.f64 rs562, fd740; +} +mov.b32 r3629, {rs562, rs562}; +{ +mul.f16x2 r3627, r118, r3629; +} +{ +add.f16x2 r3630, r3606, r3627; +} +{ +cvt.rn.f16.f64 rs563, fd739; +} +mov.b32 r3635, {rs563, rs563}; +{ +mul.f16x2 r3633, r112, r3635; +} +{ +add.f16x2 r3636, r3612, r3633; +} +{ +cvt.rn.f16.f64 rs564, fd740; +} +mov.b32 r3641, {rs564, rs564}; +{ +mul.f16x2 r3639, r115, r3641; +} +{ +add.f16x2 r3642, r3618, r3639; +} +{ +cvt.rn.f16.f64 rs565, fd775; +} +mov.b32 r3647, {rs565, rs565}; +{ +mul.f16x2 r3645, r121, r3647; +} +{ +add.f16x2 r3648, r3624, r3645; +} +{ +cvt.rn.f16.f64 rs566, fd776; +} +mov.b32 
r3653, {rs566, rs566}; +{ +mul.f16x2 r3651, r130, r3653; +} +{ +add.f16x2 r3654, r3630, r3651; +} +{ +cvt.rn.f16.f64 rs567, fd775; +} +mov.b32 r3659, {rs567, rs567}; +{ +mul.f16x2 r3657, r124, r3659; +} +{ +add.f16x2 r3660, r3636, r3657; +} +{ +cvt.rn.f16.f64 rs568, fd776; +} +mov.b32 r3665, {rs568, rs568}; +{ +mul.f16x2 r3663, r127, r3665; +} +{ +add.f16x2 r3666, r3642, r3663; +} +{ +cvt.rn.f16.f64 rs569, fd759; +} +mov.b32 r3671, {rs569, rs569}; +{ +mul.f16x2 r3669, r133, r3671; +} +{ +add.f16x2 r3672, r3648, r3669; +} +{ +cvt.rn.f16.f64 rs570, fd700; +} +mov.b32 r3677, {rs570, rs570}; +{ +mul.f16x2 r3675, r142, r3677; +} +{ +add.f16x2 r3678, r3654, r3675; +} +{ +cvt.rn.f16.f64 rs571, fd759; +} +mov.b32 r3683, {rs571, rs571}; +{ +mul.f16x2 r3681, r136, r3683; +} +{ +add.f16x2 r3684, r3660, r3681; +} +{ +cvt.rn.f16.f64 rs572, fd700; +} +mov.b32 r3689, {rs572, rs572}; +{ +mul.f16x2 r3687, r139, r3689; +} +{ +add.f16x2 r3690, r3666, r3687; +} +{ +cvt.rn.f16.f64 rs573, fd731; +} +mov.b32 r3695, {rs573, rs573}; +{ +mul.f16x2 r3693, r145, r3695; +} +{ +add.f16x2 r3696, r3672, r3693; +} +{ +cvt.rn.f16.f64 rs574, fd732; +} +mov.b32 r3701, {rs574, rs574}; +{ +mul.f16x2 r3699, r154, r3701; +} +{ +add.f16x2 r3702, r3678, r3699; +} +{ +cvt.rn.f16.f64 rs575, fd731; +} +mov.b32 r3707, {rs575, rs575}; +{ +mul.f16x2 r3705, r148, r3707; +} +{ +add.f16x2 r3708, r3684, r3705; +} +{ +cvt.rn.f16.f64 rs576, fd732; +} +mov.b32 r3713, {rs576, rs576}; +{ +mul.f16x2 r3711, r151, r3713; +} +{ +add.f16x2 r3714, r3690, r3711; +} +{ +cvt.rn.f16.f64 rs577, fd767; +} +mov.b32 r3719, {rs577, rs577}; +{ +mul.f16x2 r3717, r157, r3719; +} +{ +add.f16x2 r3720, r3696, r3717; +} +{ +cvt.rn.f16.f64 rs578, fd768; +} +mov.b32 r3725, {rs578, rs578}; +{ +mul.f16x2 r3723, r166, r3725; +} +{ +add.f16x2 r3726, r3702, r3723; +} +{ +cvt.rn.f16.f64 rs579, fd767; +} +mov.b32 r3731, {rs579, rs579}; +{ +mul.f16x2 r3729, r160, r3731; +} +{ +add.f16x2 r3732, r3708, r3729; +} +{ +cvt.rn.f16.f64 rs580, fd768; +} 
+mov.b32 r3737, {rs580, rs580}; +{ +mul.f16x2 r3735, r163, r3737; +} +{ +add.f16x2 r3738, r3714, r3735; +} +{ +sub.f16x2 %20, r3720, r3726; +} +{ +add.f16x2 %21, r3732, r3738; +} +{ +add.f16x2 %38, r3720, r3726; +} +{ +sub.f16x2 %39, r3732, r3738; +} +cvt.rn.f16.s32 rs581, r4804; +mov.b32 r3765, {rs581, rs581}; +cvt.rn.f16.s32 rs582, r4804; +mov.b32 r3777, {rs582, rs582}; +{ +cvt.rn.f16.f64 rs583, fd755; +} +mov.b32 r3757, {rs583, rs583}; +{ +mul.f16x2 r3755, r1, r3757; +} +{ +add.f16x2 r3758, %58, r3755; +} +{ +cvt.rn.f16.f64 rs584, fd756; +} +mov.b32 r3763, {rs584, rs584}; +{ +mul.f16x2 r3761, r10, r3763; +} +{ +add.f16x2 r3764, r3765, r3761; +} +{ +cvt.rn.f16.f64 rs585, fd755; +} +mov.b32 r3769, {rs585, rs585}; +{ +mul.f16x2 r3767, r4, r3769; +} +{ +add.f16x2 r3770, %59, r3767; +} +{ +cvt.rn.f16.f64 rs586, fd756; +} +mov.b32 r3775, {rs586, rs586}; +{ +mul.f16x2 r3773, r7, r3775; +} +{ +add.f16x2 r3776, r3777, r3773; +} +{ +cvt.rn.f16.f64 rs587, fd783; +} +mov.b32 r3781, {rs587, rs587}; +{ +mul.f16x2 r3779, r13, r3781; +} +{ +add.f16x2 r3782, r3758, r3779; +} +{ +cvt.rn.f16.f64 rs588, fd784; +} +mov.b32 r3787, {rs588, rs588}; +{ +mul.f16x2 r3785, r22, r3787; +} +{ +add.f16x2 r3788, r3764, r3785; +} +{ +cvt.rn.f16.f64 rs589, fd783; +} +mov.b32 r3793, {rs589, rs589}; +{ +mul.f16x2 r3791, r16, r3793; +} +{ +add.f16x2 r3794, r3770, r3791; +} +{ +cvt.rn.f16.f64 rs590, fd784; +} +mov.b32 r3799, {rs590, rs590}; +{ +mul.f16x2 r3797, r19, r3799; +} +{ +add.f16x2 r3800, r3776, r3797; +} +{ +cvt.rn.f16.f64 rs591, fd759; +} +mov.b32 r3805, {rs591, rs591}; +{ +mul.f16x2 r3803, r25, r3805; +} +{ +add.f16x2 r3806, r3782, r3803; +} +{ +cvt.rn.f16.f64 rs592, fd700; +} +mov.b32 r3811, {rs592, rs592}; +{ +mul.f16x2 r3809, r34, r3811; +} +{ +add.f16x2 r3812, r3788, r3809; +} +{ +cvt.rn.f16.f64 rs593, fd759; +} +mov.b32 r3817, {rs593, rs593}; +{ +mul.f16x2 r3815, r28, r3817; +} +{ +add.f16x2 r3818, r3794, r3815; +} +{ +cvt.rn.f16.f64 rs594, fd700; +} +mov.b32 r3823, {rs594, rs594}; 
+{ +mul.f16x2 r3821, r31, r3823; +} +{ +add.f16x2 r3824, r3800, r3821; +} +{ +cvt.rn.f16.f64 rs595, fd731; +} +mov.b32 r3829, {rs595, rs595}; +{ +mul.f16x2 r3827, r37, r3829; +} +{ +add.f16x2 r3830, r3806, r3827; +} +{ +cvt.rn.f16.f64 rs596, fd576; +} +mov.b32 r3835, {rs596, rs596}; +{ +mul.f16x2 r3833, r46, r3835; +} +{ +add.f16x2 r3836, r3812, r3833; +} +{ +cvt.rn.f16.f64 rs597, fd731; +} +mov.b32 r3841, {rs597, rs597}; +{ +mul.f16x2 r3839, r40, r3841; +} +{ +add.f16x2 r3842, r3818, r3839; +} +{ +cvt.rn.f16.f64 rs598, fd576; +} +mov.b32 r3847, {rs598, rs598}; +{ +mul.f16x2 r3845, r43, r3847; +} +{ +add.f16x2 r3848, r3824, r3845; +} +{ +cvt.rn.f16.f64 rs599, fd751; +} +mov.b32 r3853, {rs599, rs599}; +{ +mul.f16x2 r3851, r49, r3853; +} +{ +add.f16x2 r3854, r3830, r3851; +} +{ +cvt.rn.f16.f64 rs600, fd752; +} +mov.b32 r3859, {rs600, rs600}; +{ +mul.f16x2 r3857, r58, r3859; +} +{ +add.f16x2 r3860, r3836, r3857; +} +{ +cvt.rn.f16.f64 rs601, fd751; +} +mov.b32 r3865, {rs601, rs601}; +{ +mul.f16x2 r3863, r52, r3865; +} +{ +add.f16x2 r3866, r3842, r3863; +} +{ +cvt.rn.f16.f64 rs602, fd752; +} +mov.b32 r3871, {rs602, rs602}; +{ +mul.f16x2 r3869, r55, r3871; +} +{ +add.f16x2 r3872, r3848, r3869; +} +{ +cvt.rn.f16.f64 rs603, fd779; +} +mov.b32 r3877, {rs603, rs603}; +{ +mul.f16x2 r3875, r61, r3877; +} +{ +add.f16x2 r3878, r3854, r3875; +} +{ +cvt.rn.f16.f64 rs604, fd780; +} +mov.b32 r3883, {rs604, rs604}; +{ +mul.f16x2 r3881, r70, r3883; +} +{ +add.f16x2 r3884, r3860, r3881; +} +{ +cvt.rn.f16.f64 rs605, fd779; +} +mov.b32 r3889, {rs605, rs605}; +{ +mul.f16x2 r3887, r64, r3889; +} +{ +add.f16x2 r3890, r3866, r3887; +} +{ +cvt.rn.f16.f64 rs606, fd780; +} +mov.b32 r3895, {rs606, rs606}; +{ +mul.f16x2 r3893, r67, r3895; +} +{ +add.f16x2 r3896, r3872, r3893; +} +{ +cvt.rn.f16.f64 rs607, fd763; +} +mov.b32 r3901, {rs607, rs607}; +{ +mul.f16x2 r3899, r73, r3901; +} +{ +add.f16x2 r3902, r3878, r3899; +} +{ +cvt.rn.f16.f64 rs608, fd632; +} +mov.b32 r3907, {rs608, rs608}; +{ 
+mul.f16x2 r3905, r82, r3907; +} +{ +add.f16x2 r3908, r3884, r3905; +} +{ +cvt.rn.f16.f64 rs609, fd763; +} +mov.b32 r3913, {rs609, rs609}; +{ +mul.f16x2 r3911, r76, r3913; +} +{ +add.f16x2 r3914, r3890, r3911; +} +{ +cvt.rn.f16.f64 rs610, fd632; +} +mov.b32 r3919, {rs610, rs610}; +{ +mul.f16x2 r3917, r79, r3919; +} +{ +add.f16x2 r3920, r3896, r3917; +} +{ +cvt.rn.f16.f64 rs611, fd735; +} +mov.b32 r3925, {rs611, rs611}; +{ +mul.f16x2 r3923, r85, r3925; +} +{ +add.f16x2 r3926, r3902, r3923; +} +{ +cvt.rn.f16.f64 rs612, fd708; +} +mov.b32 r3931, {rs612, rs612}; +{ +mul.f16x2 r3929, r94, r3931; +} +{ +add.f16x2 r3932, r3908, r3929; +} +{ +cvt.rn.f16.f64 rs613, fd735; +} +mov.b32 r3937, {rs613, rs613}; +{ +mul.f16x2 r3935, r88, r3937; +} +{ +add.f16x2 r3938, r3914, r3935; +} +{ +cvt.rn.f16.f64 rs614, fd708; +} +mov.b32 r3943, {rs614, rs614}; +{ +mul.f16x2 r3941, r91, r3943; +} +{ +add.f16x2 r3944, r3920, r3941; +} +{ +cvt.rn.f16.f64 rs615, fd747; +} +mov.b32 r3949, {rs615, rs615}; +{ +mul.f16x2 r3947, r97, r3949; +} +{ +add.f16x2 r3950, r3926, r3947; +} +{ +cvt.rn.f16.f64 rs616, fd748; +} +mov.b32 r3955, {rs616, rs616}; +{ +mul.f16x2 r3953, r106, r3955; +} +{ +add.f16x2 r3956, r3932, r3953; +} +{ +cvt.rn.f16.f64 rs617, fd747; +} +mov.b32 r3961, {rs617, rs617}; +{ +mul.f16x2 r3959, r100, r3961; +} +{ +add.f16x2 r3962, r3938, r3959; +} +{ +cvt.rn.f16.f64 rs618, fd748; +} +mov.b32 r3967, {rs618, rs618}; +{ +mul.f16x2 r3965, r103, r3967; +} +{ +add.f16x2 r3968, r3944, r3965; +} +{ +cvt.rn.f16.f64 rs619, fd775; +} +mov.b32 r3973, {rs619, rs619}; +{ +mul.f16x2 r3971, r109, r3973; +} +{ +add.f16x2 r3974, r3950, r3971; +} +{ +cvt.rn.f16.f64 rs620, fd776; +} +mov.b32 r3979, {rs620, rs620}; +{ +mul.f16x2 r3977, r118, r3979; +} +{ +add.f16x2 r3980, r3956, r3977; +} +{ +cvt.rn.f16.f64 rs621, fd775; +} +mov.b32 r3985, {rs621, rs621}; +{ +mul.f16x2 r3983, r112, r3985; +} +{ +add.f16x2 r3986, r3962, r3983; +} +{ +cvt.rn.f16.f64 rs622, fd776; +} +mov.b32 r3991, {rs622, rs622}; +{ 
+mul.f16x2 r3989, r115, r3991; +} +{ +add.f16x2 r3992, r3968, r3989; +} +{ +cvt.rn.f16.f64 rs623, fd767; +} +mov.b32 r3997, {rs623, rs623}; +{ +mul.f16x2 r3995, r121, r3997; +} +{ +add.f16x2 r3998, r3974, r3995; +} +{ +cvt.rn.f16.f64 rs624, fd604; +} +mov.b32 r4003, {rs624, rs624}; +{ +mul.f16x2 r4001, r130, r4003; +} +{ +add.f16x2 r4004, r3980, r4001; +} +{ +cvt.rn.f16.f64 rs625, fd767; +} +mov.b32 r4009, {rs625, rs625}; +{ +mul.f16x2 r4007, r124, r4009; +} +{ +add.f16x2 r4010, r3986, r4007; +} +{ +cvt.rn.f16.f64 rs626, fd604; +} +mov.b32 r4015, {rs626, rs626}; +{ +mul.f16x2 r4013, r127, r4015; +} +{ +add.f16x2 r4016, r3992, r4013; +} +{ +cvt.rn.f16.f64 rs627, fd739; +} +mov.b32 r4021, {rs627, rs627}; +{ +mul.f16x2 r4019, r133, r4021; +} +{ +add.f16x2 r4022, r3998, r4019; +} +{ +cvt.rn.f16.f64 rs628, fd660; +} +mov.b32 r4027, {rs628, rs628}; +{ +mul.f16x2 r4025, r142, r4027; +} +{ +add.f16x2 r4028, r4004, r4025; +} +{ +cvt.rn.f16.f64 rs629, fd739; +} +mov.b32 r4033, {rs629, rs629}; +{ +mul.f16x2 r4031, r136, r4033; +} +{ +add.f16x2 r4034, r4010, r4031; +} +{ +cvt.rn.f16.f64 rs630, fd660; +} +mov.b32 r4039, {rs630, rs630}; +{ +mul.f16x2 r4037, r139, r4039; +} +{ +add.f16x2 r4040, r4016, r4037; +} +{ +cvt.rn.f16.f64 rs631, fd743; +} +mov.b32 r4045, {rs631, rs631}; +{ +mul.f16x2 r4043, r145, r4045; +} +{ +add.f16x2 r4046, r4022, r4043; +} +{ +cvt.rn.f16.f64 rs632, fd744; +} +mov.b32 r4051, {rs632, rs632}; +{ +mul.f16x2 r4049, r154, r4051; +} +{ +add.f16x2 r4052, r4028, r4049; +} +{ +cvt.rn.f16.f64 rs633, fd743; +} +mov.b32 r4057, {rs633, rs633}; +{ +mul.f16x2 r4055, r148, r4057; +} +{ +add.f16x2 r4058, r4034, r4055; +} +{ +cvt.rn.f16.f64 rs634, fd744; +} +mov.b32 r4063, {rs634, rs634}; +{ +mul.f16x2 r4061, r151, r4063; +} +{ +add.f16x2 r4064, r4040, r4061; +} +{ +cvt.rn.f16.f64 rs635, fd771; +} +mov.b32 r4069, {rs635, rs635}; +{ +mul.f16x2 r4067, r157, r4069; +} +{ +add.f16x2 r4070, r4046, r4067; +} +{ +cvt.rn.f16.f64 rs636, fd772; +} +mov.b32 r4075, {rs636, rs636}; 
+{ +mul.f16x2 r4073, r166, r4075; +} +{ +add.f16x2 r4076, r4052, r4073; +} +{ +cvt.rn.f16.f64 rs637, fd771; +} +mov.b32 r4081, {rs637, rs637}; +{ +mul.f16x2 r4079, r160, r4081; +} +{ +add.f16x2 r4082, r4058, r4079; +} +{ +cvt.rn.f16.f64 rs638, fd772; +} +mov.b32 r4087, {rs638, rs638}; +{ +mul.f16x2 r4085, r163, r4087; +} +{ +add.f16x2 r4088, r4064, r4085; +} +{ +sub.f16x2 %22, r4070, r4076; +} +{ +add.f16x2 %23, r4082, r4088; +} +{ +add.f16x2 %36, r4070, r4076; +} +{ +sub.f16x2 %37, r4082, r4088; +} +cvt.rn.f16.s32 rs639, r4804; +mov.b32 r4115, {rs639, rs639}; +cvt.rn.f16.s32 rs640, r4804; +mov.b32 r4127, {rs640, rs640}; +{ +cvt.rn.f16.f64 rs641, fd747; +} +mov.b32 r4107, {rs641, rs641}; +{ +mul.f16x2 r4105, r1, r4107; +} +{ +add.f16x2 r4108, %58, r4105; +} +{ +cvt.rn.f16.f64 rs642, fd748; +} +mov.b32 r4113, {rs642, rs642}; +{ +mul.f16x2 r4111, r10, r4113; +} +{ +add.f16x2 r4114, r4115, r4111; +} +{ +cvt.rn.f16.f64 rs643, fd747; +} +mov.b32 r4119, {rs643, rs643}; +{ +mul.f16x2 r4117, r4, r4119; +} +{ +add.f16x2 r4120, %59, r4117; +} +{ +cvt.rn.f16.f64 rs644, fd748; +} +mov.b32 r4125, {rs644, rs644}; +{ +mul.f16x2 r4123, r7, r4125; +} +{ +add.f16x2 r4126, r4127, r4123; +} +{ +cvt.rn.f16.f64 rs645, fd767; +} +mov.b32 r4131, {rs645, rs645}; +{ +mul.f16x2 r4129, r13, r4131; +} +{ +add.f16x2 r4132, r4108, r4129; +} +{ +cvt.rn.f16.f64 rs646, fd768; +} +mov.b32 r4137, {rs646, rs646}; +{ +mul.f16x2 r4135, r22, r4137; +} +{ +add.f16x2 r4138, r4114, r4135; +} +{ +cvt.rn.f16.f64 rs647, fd767; +} +mov.b32 r4143, {rs647, rs647}; +{ +mul.f16x2 r4141, r16, r4143; +} +{ +add.f16x2 r4144, r4120, r4141; +} +{ +cvt.rn.f16.f64 rs648, fd768; +} +mov.b32 r4149, {rs648, rs648}; +{ +mul.f16x2 r4147, r19, r4149; +} +{ +add.f16x2 r4150, r4126, r4147; +} +{ +cvt.rn.f16.f64 rs649, fd783; +} +mov.b32 r4155, {rs649, rs649}; +{ +mul.f16x2 r4153, r25, r4155; +} +{ +add.f16x2 r4156, r4132, r4153; +} +{ +cvt.rn.f16.f64 rs650, fd692; +} +mov.b32 r4161, {rs650, rs650}; +{ +mul.f16x2 r4159, r34, 
r4161; +} +{ +add.f16x2 r4162, r4138, r4159; +} +{ +cvt.rn.f16.f64 rs651, fd783; +} +mov.b32 r4167, {rs651, rs651}; +{ +mul.f16x2 r4165, r28, r4167; +} +{ +add.f16x2 r4168, r4144, r4165; +} +{ +cvt.rn.f16.f64 rs652, fd692; +} +mov.b32 r4173, {rs652, rs652}; +{ +mul.f16x2 r4171, r31, r4173; +} +{ +add.f16x2 r4174, r4150, r4171; +} +{ +cvt.rn.f16.f64 rs653, fd763; +} +mov.b32 r4179, {rs653, rs653}; +{ +mul.f16x2 r4177, r37, r4179; +} +{ +add.f16x2 r4180, r4156, r4177; +} +{ +cvt.rn.f16.f64 rs654, fd632; +} +mov.b32 r4185, {rs654, rs654}; +{ +mul.f16x2 r4183, r46, r4185; +} +{ +add.f16x2 r4186, r4162, r4183; +} +{ +cvt.rn.f16.f64 rs655, fd763; +} +mov.b32 r4191, {rs655, rs655}; +{ +mul.f16x2 r4189, r40, r4191; +} +{ +add.f16x2 r4192, r4168, r4189; +} +{ +cvt.rn.f16.f64 rs656, fd632; +} +mov.b32 r4197, {rs656, rs656}; +{ +mul.f16x2 r4195, r43, r4197; +} +{ +add.f16x2 r4198, r4174, r4195; +} +{ +cvt.rn.f16.f64 rs657, fd743; +} +mov.b32 r4203, {rs657, rs657}; +{ +mul.f16x2 r4201, r49, r4203; +} +{ +add.f16x2 r4204, r4180, r4201; +} +{ +cvt.rn.f16.f64 rs658, fd636; +} +mov.b32 r4209, {rs658, rs658}; +{ +mul.f16x2 r4207, r58, r4209; +} +{ +add.f16x2 r4210, r4186, r4207; +} +{ +cvt.rn.f16.f64 rs659, fd743; +} +mov.b32 r4215, {rs659, rs659}; +{ +mul.f16x2 r4213, r52, r4215; +} +{ +add.f16x2 r4216, r4192, r4213; +} +{ +cvt.rn.f16.f64 rs660, fd636; +} +mov.b32 r4221, {rs660, rs660}; +{ +mul.f16x2 r4219, r55, r4221; +} +{ +add.f16x2 r4222, r4198, r4219; +} +{ +cvt.rn.f16.f64 rs661, fd731; +} +mov.b32 r4227, {rs661, rs661}; +{ +mul.f16x2 r4225, r61, r4227; +} +{ +add.f16x2 r4228, r4204, r4225; +} +{ +cvt.rn.f16.f64 rs662, fd732; +} +mov.b32 r4233, {rs662, rs662}; +{ +mul.f16x2 r4231, r70, r4233; +} +{ +add.f16x2 r4234, r4210, r4231; +} +{ +cvt.rn.f16.f64 rs663, fd731; +} +mov.b32 r4239, {rs663, rs663}; +{ +mul.f16x2 r4237, r64, r4239; +} +{ +add.f16x2 r4240, r4216, r4237; +} +{ +cvt.rn.f16.f64 rs664, fd732; +} +mov.b32 r4245, {rs664, rs664}; +{ +mul.f16x2 r4243, r67, r4245; +} 
+{ +add.f16x2 r4246, r4222, r4243; +} +{ +cvt.rn.f16.f64 rs665, fd751; +} +mov.b32 r4251, {rs665, rs665}; +{ +mul.f16x2 r4249, r73, r4251; +} +{ +add.f16x2 r4252, r4228, r4249; +} +{ +cvt.rn.f16.f64 rs666, fd752; +} +mov.b32 r4257, {rs666, rs666}; +{ +mul.f16x2 r4255, r82, r4257; +} +{ +add.f16x2 r4258, r4234, r4255; +} +{ +cvt.rn.f16.f64 rs667, fd751; +} +mov.b32 r4263, {rs667, rs667}; +{ +mul.f16x2 r4261, r76, r4263; +} +{ +add.f16x2 r4264, r4240, r4261; +} +{ +cvt.rn.f16.f64 rs668, fd752; +} +mov.b32 r4269, {rs668, rs668}; +{ +mul.f16x2 r4267, r79, r4269; +} +{ +add.f16x2 r4270, r4246, r4267; +} +{ +cvt.rn.f16.f64 rs669, fd771; +} +mov.b32 r4275, {rs669, rs669}; +{ +mul.f16x2 r4273, r85, r4275; +} +{ +add.f16x2 r4276, r4252, r4273; +} +{ +cvt.rn.f16.f64 rs670, fd772; +} +mov.b32 r4281, {rs670, rs670}; +{ +mul.f16x2 r4279, r94, r4281; +} +{ +add.f16x2 r4282, r4258, r4279; +} +{ +cvt.rn.f16.f64 rs671, fd771; +} +mov.b32 r4287, {rs671, rs671}; +{ +mul.f16x2 r4285, r88, r4287; +} +{ +add.f16x2 r4288, r4264, r4285; +} +{ +cvt.rn.f16.f64 rs672, fd772; +} +mov.b32 r4293, {rs672, rs672}; +{ +mul.f16x2 r4291, r91, r4293; +} +{ +add.f16x2 r4294, r4270, r4291; +} +{ +cvt.rn.f16.f64 rs673, fd779; +} +mov.b32 r4299, {rs673, rs673}; +{ +mul.f16x2 r4297, r97, r4299; +} +{ +add.f16x2 r4300, r4276, r4297; +} +{ +cvt.rn.f16.f64 rs674, fd652; +} +mov.b32 r4305, {rs674, rs674}; +{ +mul.f16x2 r4303, r106, r4305; +} +{ +add.f16x2 r4306, r4282, r4303; +} +{ +cvt.rn.f16.f64 rs675, fd779; +} +mov.b32 r4311, {rs675, rs675}; +{ +mul.f16x2 r4309, r100, r4311; +} +{ +add.f16x2 r4312, r4288, r4309; +} +{ +cvt.rn.f16.f64 rs676, fd652; +} +mov.b32 r4317, {rs676, rs676}; +{ +mul.f16x2 r4315, r103, r4317; +} +{ +add.f16x2 r4318, r4294, r4315; +} +{ +cvt.rn.f16.f64 rs677, fd759; +} +mov.b32 r4323, {rs677, rs677}; +{ +mul.f16x2 r4321, r109, r4323; +} +{ +add.f16x2 r4324, r4300, r4321; +} +{ +cvt.rn.f16.f64 rs678, fd700; +} +mov.b32 r4329, {rs678, rs678}; +{ +mul.f16x2 r4327, r118, r4329; +} +{ 
+add.f16x2 r4330, r4306, r4327; +} +{ +cvt.rn.f16.f64 rs679, fd759; +} +mov.b32 r4335, {rs679, rs679}; +{ +mul.f16x2 r4333, r112, r4335; +} +{ +add.f16x2 r4336, r4312, r4333; +} +{ +cvt.rn.f16.f64 rs680, fd700; +} +mov.b32 r4341, {rs680, rs680}; +{ +mul.f16x2 r4339, r115, r4341; +} +{ +add.f16x2 r4342, r4318, r4339; +} +{ +cvt.rn.f16.f64 rs681, fd739; +} +mov.b32 r4347, {rs681, rs681}; +{ +mul.f16x2 r4345, r121, r4347; +} +{ +add.f16x2 r4348, r4324, r4345; +} +{ +cvt.rn.f16.f64 rs682, fd660; +} +mov.b32 r4353, {rs682, rs682}; +{ +mul.f16x2 r4351, r130, r4353; +} +{ +add.f16x2 r4354, r4330, r4351; +} +{ +cvt.rn.f16.f64 rs683, fd739; +} +mov.b32 r4359, {rs683, rs683}; +{ +mul.f16x2 r4357, r124, r4359; +} +{ +add.f16x2 r4360, r4336, r4357; +} +{ +cvt.rn.f16.f64 rs684, fd660; +} +mov.b32 r4365, {rs684, rs684}; +{ +mul.f16x2 r4363, r127, r4365; +} +{ +add.f16x2 r4366, r4342, r4363; +} +{ +cvt.rn.f16.f64 rs685, fd735; +} +mov.b32 r4371, {rs685, rs685}; +{ +mul.f16x2 r4369, r133, r4371; +} +{ +add.f16x2 r4372, r4348, r4369; +} +{ +cvt.rn.f16.f64 rs686, fd736; +} +mov.b32 r4377, {rs686, rs686}; +{ +mul.f16x2 r4375, r142, r4377; +} +{ +add.f16x2 r4378, r4354, r4375; +} +{ +cvt.rn.f16.f64 rs687, fd735; +} +mov.b32 r4383, {rs687, rs687}; +{ +mul.f16x2 r4381, r136, r4383; +} +{ +add.f16x2 r4384, r4360, r4381; +} +{ +cvt.rn.f16.f64 rs688, fd736; +} +mov.b32 r4389, {rs688, rs688}; +{ +mul.f16x2 r4387, r139, r4389; +} +{ +add.f16x2 r4390, r4366, r4387; +} +{ +cvt.rn.f16.f64 rs689, fd755; +} +mov.b32 r4395, {rs689, rs689}; +{ +mul.f16x2 r4393, r145, r4395; +} +{ +add.f16x2 r4396, r4372, r4393; +} +{ +cvt.rn.f16.f64 rs690, fd756; +} +mov.b32 r4401, {rs690, rs690}; +{ +mul.f16x2 r4399, r154, r4401; +} +{ +add.f16x2 r4402, r4378, r4399; +} +{ +cvt.rn.f16.f64 rs691, fd755; +} +mov.b32 r4407, {rs691, rs691}; +{ +mul.f16x2 r4405, r148, r4407; +} +{ +add.f16x2 r4408, r4384, r4405; +} +{ +cvt.rn.f16.f64 rs692, fd756; +} +mov.b32 r4413, {rs692, rs692}; +{ +mul.f16x2 r4411, r151, r4413; +} 
+{ +add.f16x2 r4414, r4390, r4411; +} +{ +cvt.rn.f16.f64 rs693, fd775; +} +mov.b32 r4419, {rs693, rs693}; +{ +mul.f16x2 r4417, r157, r4419; +} +{ +add.f16x2 r4420, r4396, r4417; +} +{ +cvt.rn.f16.f64 rs694, fd776; +} +mov.b32 r4425, {rs694, rs694}; +{ +mul.f16x2 r4423, r166, r4425; +} +{ +add.f16x2 r4426, r4402, r4423; +} +{ +cvt.rn.f16.f64 rs695, fd775; +} +mov.b32 r4431, {rs695, rs695}; +{ +mul.f16x2 r4429, r160, r4431; +} +{ +add.f16x2 r4432, r4408, r4429; +} +{ +cvt.rn.f16.f64 rs696, fd776; +} +mov.b32 r4437, {rs696, rs696}; +{ +mul.f16x2 r4435, r163, r4437; +} +{ +add.f16x2 r4438, r4414, r4435; +} +{ +sub.f16x2 %24, r4420, r4426; +} +{ +add.f16x2 %25, r4432, r4438; +} +{ +add.f16x2 %34, r4420, r4426; +} +{ +sub.f16x2 %35, r4432, r4438; +} +cvt.rn.f16.s32 rs697, r4804; +mov.b32 r4465, {rs697, rs697}; +cvt.rn.f16.s32 rs698, r4804; +mov.b32 r4477, {rs698, rs698}; +{ +cvt.rn.f16.f64 rs699, fd739; +} +mov.b32 r4457, {rs699, rs699}; +{ +mul.f16x2 r4455, r1, r4457; +} +{ +add.f16x2 r4458, %58, r4455; +} +{ +cvt.rn.f16.f64 rs700, fd740; +} +mov.b32 r4463, {rs700, rs700}; +{ +mul.f16x2 r4461, r10, r4463; +} +{ +add.f16x2 r4464, r4465, r4461; +} +{ +cvt.rn.f16.f64 rs701, fd739; +} +mov.b32 r4469, {rs701, rs701}; +{ +mul.f16x2 r4467, r4, r4469; +} +{ +add.f16x2 r4470, %59, r4467; +} +{ +cvt.rn.f16.f64 rs702, fd740; +} +mov.b32 r4475, {rs702, rs702}; +{ +mul.f16x2 r4473, r7, r4475; +} +{ +add.f16x2 r4476, r4477, r4473; +} +{ +cvt.rn.f16.f64 rs703, fd751; +} +mov.b32 r4481, {rs703, rs703}; +{ +mul.f16x2 r4479, r13, r4481; +} +{ +add.f16x2 r4482, r4458, r4479; +} +{ +cvt.rn.f16.f64 rs704, fd752; +} +mov.b32 r4487, {rs704, rs704}; +{ +mul.f16x2 r4485, r22, r4487; +} +{ +add.f16x2 r4488, r4464, r4485; +} +{ +cvt.rn.f16.f64 rs705, fd751; +} +mov.b32 r4493, {rs705, rs705}; +{ +mul.f16x2 r4491, r16, r4493; +} +{ +add.f16x2 r4494, r4470, r4491; +} +{ +cvt.rn.f16.f64 rs706, fd752; +} +mov.b32 r4499, {rs706, rs706}; +{ +mul.f16x2 r4497, r19, r4499; +} +{ +add.f16x2 r4500, r4476, 
r4497; +} +{ +cvt.rn.f16.f64 rs707, fd763; +} +mov.b32 r4505, {rs707, rs707}; +{ +mul.f16x2 r4503, r25, r4505; +} +{ +add.f16x2 r4506, r4482, r4503; +} +{ +cvt.rn.f16.f64 rs708, fd764; +} +mov.b32 r4511, {rs708, rs708}; +{ +mul.f16x2 r4509, r34, r4511; +} +{ +add.f16x2 r4512, r4488, r4509; +} +{ +cvt.rn.f16.f64 rs709, fd763; +} +mov.b32 r4517, {rs709, rs709}; +{ +mul.f16x2 r4515, r28, r4517; +} +{ +add.f16x2 r4518, r4494, r4515; +} +{ +cvt.rn.f16.f64 rs710, fd764; +} +mov.b32 r4523, {rs710, rs710}; +{ +mul.f16x2 r4521, r31, r4523; +} +{ +add.f16x2 r4524, r4500, r4521; +} +{ +cvt.rn.f16.f64 rs711, fd775; +} +mov.b32 r4529, {rs711, rs711}; +{ +mul.f16x2 r4527, r37, r4529; +} +{ +add.f16x2 r4530, r4506, r4527; +} +{ +cvt.rn.f16.f64 rs712, fd776; +} +mov.b32 r4535, {rs712, rs712}; +{ +mul.f16x2 r4533, r46, r4535; +} +{ +add.f16x2 r4536, r4512, r4533; +} +{ +cvt.rn.f16.f64 rs713, fd775; +} +mov.b32 r4541, {rs713, rs713}; +{ +mul.f16x2 r4539, r40, r4541; +} +{ +add.f16x2 r4542, r4518, r4539; +} +{ +cvt.rn.f16.f64 rs714, fd776; +} +mov.b32 r4547, {rs714, rs714}; +{ +mul.f16x2 r4545, r43, r4547; +} +{ +add.f16x2 r4548, r4524, r4545; +} +{ +cvt.rn.f16.f64 rs715, fd783; +} +mov.b32 r4553, {rs715, rs715}; +{ +mul.f16x2 r4551, r49, r4553; +} +{ +add.f16x2 r4554, r4530, r4551; +} +{ +cvt.rn.f16.f64 rs716, fd692; +} +mov.b32 r4559, {rs716, rs716}; +{ +mul.f16x2 r4557, r58, r4559; +} +{ +add.f16x2 r4560, r4536, r4557; +} +{ +cvt.rn.f16.f64 rs717, fd783; +} +mov.b32 r4565, {rs717, rs717}; +{ +mul.f16x2 r4563, r52, r4565; +} +{ +add.f16x2 r4566, r4542, r4563; +} +{ +cvt.rn.f16.f64 rs718, fd692; +} +mov.b32 r4571, {rs718, rs718}; +{ +mul.f16x2 r4569, r55, r4571; +} +{ +add.f16x2 r4572, r4548, r4569; +} +{ +cvt.rn.f16.f64 rs719, fd771; +} +mov.b32 r4577, {rs719, rs719}; +{ +mul.f16x2 r4575, r61, r4577; +} +{ +add.f16x2 r4578, r4554, r4575; +} +{ +cvt.rn.f16.f64 rs720, fd696; +} +mov.b32 r4583, {rs720, rs720}; +{ +mul.f16x2 r4581, r70, r4583; +} +{ +add.f16x2 r4584, r4560, r4581; +} 
+{ +cvt.rn.f16.f64 rs721, fd771; +} +mov.b32 r4589, {rs721, rs721}; +{ +mul.f16x2 r4587, r64, r4589; +} +{ +add.f16x2 r4590, r4566, r4587; +} +{ +cvt.rn.f16.f64 rs722, fd696; +} +mov.b32 r4595, {rs722, rs722}; +{ +mul.f16x2 r4593, r67, r4595; +} +{ +add.f16x2 r4596, r4572, r4593; +} +{ +cvt.rn.f16.f64 rs723, fd759; +} +mov.b32 r4601, {rs723, rs723}; +{ +mul.f16x2 r4599, r73, r4601; +} +{ +add.f16x2 r4602, r4578, r4599; +} +{ +cvt.rn.f16.f64 rs724, fd700; +} +mov.b32 r4607, {rs724, rs724}; +{ +mul.f16x2 r4605, r82, r4607; +} +{ +add.f16x2 r4608, r4584, r4605; +} +{ +cvt.rn.f16.f64 rs725, fd759; +} +mov.b32 r4613, {rs725, rs725}; +{ +mul.f16x2 r4611, r76, r4613; +} +{ +add.f16x2 r4614, r4590, r4611; +} +{ +cvt.rn.f16.f64 rs726, fd700; +} +mov.b32 r4619, {rs726, rs726}; +{ +mul.f16x2 r4617, r79, r4619; +} +{ +add.f16x2 r4620, r4596, r4617; +} +{ +cvt.rn.f16.f64 rs727, fd747; +} +mov.b32 r4625, {rs727, rs727}; +{ +mul.f16x2 r4623, r85, r4625; +} +{ +add.f16x2 r4626, r4602, r4623; +} +{ +cvt.rn.f16.f64 rs728, fd704; +} +mov.b32 r4631, {rs728, rs728}; +{ +mul.f16x2 r4629, r94, r4631; +} +{ +add.f16x2 r4632, r4608, r4629; +} +{ +cvt.rn.f16.f64 rs729, fd747; +} +mov.b32 r4637, {rs729, rs729}; +{ +mul.f16x2 r4635, r88, r4637; +} +{ +add.f16x2 r4638, r4614, r4635; +} +{ +cvt.rn.f16.f64 rs730, fd704; +} +mov.b32 r4643, {rs730, rs730}; +{ +mul.f16x2 r4641, r91, r4643; +} +{ +add.f16x2 r4644, r4620, r4641; +} +{ +cvt.rn.f16.f64 rs731, fd735; +} +mov.b32 r4649, {rs731, rs731}; +{ +mul.f16x2 r4647, r97, r4649; +} +{ +add.f16x2 r4650, r4626, r4647; +} +{ +cvt.rn.f16.f64 rs732, fd708; +} +mov.b32 r4655, {rs732, rs732}; +{ +mul.f16x2 r4653, r106, r4655; +} +{ +add.f16x2 r4656, r4632, r4653; +} +{ +cvt.rn.f16.f64 rs733, fd735; +} +mov.b32 r4661, {rs733, rs733}; +{ +mul.f16x2 r4659, r100, r4661; +} +{ +add.f16x2 r4662, r4638, r4659; +} +{ +cvt.rn.f16.f64 rs734, fd708; +} +mov.b32 r4667, {rs734, rs734}; +{ +mul.f16x2 r4665, r103, r4667; +} +{ +add.f16x2 r4668, r4644, r4665; +} +{ 
+cvt.rn.f16.f64 rs735, fd731; +} +mov.b32 r4673, {rs735, rs735}; +{ +mul.f16x2 r4671, r109, r4673; +} +{ +add.f16x2 r4674, r4650, r4671; +} +{ +cvt.rn.f16.f64 rs736, fd732; +} +mov.b32 r4679, {rs736, rs736}; +{ +mul.f16x2 r4677, r118, r4679; +} +{ +add.f16x2 r4680, r4656, r4677; +} +{ +cvt.rn.f16.f64 rs737, fd731; +} +mov.b32 r4685, {rs737, rs737}; +{ +mul.f16x2 r4683, r112, r4685; +} +{ +add.f16x2 r4686, r4662, r4683; +} +{ +cvt.rn.f16.f64 rs738, fd732; +} +mov.b32 r4691, {rs738, rs738}; +{ +mul.f16x2 r4689, r115, r4691; +} +{ +add.f16x2 r4692, r4668, r4689; +} +{ +cvt.rn.f16.f64 rs739, fd743; +} +mov.b32 r4697, {rs739, rs739}; +{ +mul.f16x2 r4695, r121, r4697; +} +{ +add.f16x2 r4698, r4674, r4695; +} +{ +cvt.rn.f16.f64 rs740, fd744; +} +mov.b32 r4703, {rs740, rs740}; +{ +mul.f16x2 r4701, r130, r4703; +} +{ +add.f16x2 r4704, r4680, r4701; +} +{ +cvt.rn.f16.f64 rs741, fd743; +} +mov.b32 r4709, {rs741, rs741}; +{ +mul.f16x2 r4707, r124, r4709; +} +{ +add.f16x2 r4710, r4686, r4707; +} +{ +cvt.rn.f16.f64 rs742, fd744; +} +mov.b32 r4715, {rs742, rs742}; +{ +mul.f16x2 r4713, r127, r4715; +} +{ +add.f16x2 r4716, r4692, r4713; +} +{ +cvt.rn.f16.f64 rs743, fd755; +} +mov.b32 r4721, {rs743, rs743}; +{ +mul.f16x2 r4719, r133, r4721; +} +{ +add.f16x2 r4722, r4698, r4719; +} +{ +cvt.rn.f16.f64 rs744, fd756; +} +mov.b32 r4727, {rs744, rs744}; +{ +mul.f16x2 r4725, r142, r4727; +} +{ +add.f16x2 r4728, r4704, r4725; +} +{ +cvt.rn.f16.f64 rs745, fd755; +} +mov.b32 r4733, {rs745, rs745}; +{ +mul.f16x2 r4731, r136, r4733; +} +{ +add.f16x2 r4734, r4710, r4731; +} +{ +cvt.rn.f16.f64 rs746, fd756; +} +mov.b32 r4739, {rs746, rs746}; +{ +mul.f16x2 r4737, r139, r4739; +} +{ +add.f16x2 r4740, r4716, r4737; +} +{ +cvt.rn.f16.f64 rs747, fd767; +} +mov.b32 r4745, {rs747, rs747}; +{ +mul.f16x2 r4743, r145, r4745; +} +{ +add.f16x2 r4746, r4722, r4743; +} +{ +cvt.rn.f16.f64 rs748, fd768; +} +mov.b32 r4751, {rs748, rs748}; +{ +mul.f16x2 r4749, r154, r4751; +} +{ +add.f16x2 r4752, r4728, r4749; +} 
+{ +cvt.rn.f16.f64 rs749, fd767; +} +mov.b32 r4757, {rs749, rs749}; +{ +mul.f16x2 r4755, r148, r4757; +} +{ +add.f16x2 r4758, r4734, r4755; +} +{ +cvt.rn.f16.f64 rs750, fd768; +} +mov.b32 r4763, {rs750, rs750}; +{ +mul.f16x2 r4761, r151, r4763; +} +{ +add.f16x2 r4764, r4740, r4761; +} +{ +cvt.rn.f16.f64 rs751, fd779; +} +mov.b32 r4769, {rs751, rs751}; +{ +mul.f16x2 r4767, r157, r4769; +} +{ +add.f16x2 r4770, r4746, r4767; +} +{ +cvt.rn.f16.f64 rs752, fd780; +} +mov.b32 r4775, {rs752, rs752}; +{ +mul.f16x2 r4773, r166, r4775; +} +{ +add.f16x2 r4776, r4752, r4773; +} +{ +cvt.rn.f16.f64 rs753, fd779; +} +mov.b32 r4781, {rs753, rs753}; +{ +mul.f16x2 r4779, r160, r4781; +} +{ +add.f16x2 r4782, r4758, r4779; +} +{ +cvt.rn.f16.f64 rs754, fd780; +} +mov.b32 r4787, {rs754, rs754}; +{ +mul.f16x2 r4785, r163, r4787; +} +{ +add.f16x2 r4788, r4764, r4785; +} +{ +sub.f16x2 %26, r4770, r4776; +} +{ +add.f16x2 %27, r4782, r4788; +} +{ +add.f16x2 %32, r4770, r4776; +} +{ +sub.f16x2 %33, r4782, r4788; +} +cvt.rn.f16.s32 rs755, r4804; +mov.b32 r4815, {rs755, rs755}; +cvt.rn.f16.s32 rs756, r4804; +mov.b32 r4827, {rs756, rs756}; +{ +cvt.rn.f16.f64 rs757, fd731; +} +mov.b32 r4807, {rs757, rs757}; +{ +mul.f16x2 r4805, r1, r4807; +} +{ +add.f16x2 r4808, %58, r4805; +} +{ +cvt.rn.f16.f64 rs758, fd732; +} +mov.b32 r4813, {rs758, rs758}; +{ +mul.f16x2 r4811, r10, r4813; +} +{ +add.f16x2 r4814, r4815, r4811; +} +{ +cvt.rn.f16.f64 rs759, fd731; +} +mov.b32 r4819, {rs759, rs759}; +{ +mul.f16x2 r4817, r4, r4819; +} +{ +add.f16x2 r4820, %59, r4817; +} +{ +cvt.rn.f16.f64 rs760, fd732; +} +mov.b32 r4825, {rs760, rs760}; +{ +mul.f16x2 r4823, r7, r4825; +} +{ +add.f16x2 r4826, r4827, r4823; +} +{ +cvt.rn.f16.f64 rs761, fd735; +} +mov.b32 r4831, {rs761, rs761}; +{ +mul.f16x2 r4829, r13, r4831; +} +{ +add.f16x2 r4832, r4808, r4829; +} +{ +cvt.rn.f16.f64 rs762, fd736; +} +mov.b32 r4837, {rs762, rs762}; +{ +mul.f16x2 r4835, r22, r4837; +} +{ +add.f16x2 r4838, r4814, r4835; +} +{ +cvt.rn.f16.f64 rs763, 
fd735; +} +mov.b32 r4843, {rs763, rs763}; +{ +mul.f16x2 r4841, r16, r4843; +} +{ +add.f16x2 r4844, r4820, r4841; +} +{ +cvt.rn.f16.f64 rs764, fd736; +} +mov.b32 r4849, {rs764, rs764}; +{ +mul.f16x2 r4847, r19, r4849; +} +{ +add.f16x2 r4850, r4826, r4847; +} +{ +cvt.rn.f16.f64 rs765, fd739; +} +mov.b32 r4855, {rs765, rs765}; +{ +mul.f16x2 r4853, r25, r4855; +} +{ +add.f16x2 r4856, r4832, r4853; +} +{ +cvt.rn.f16.f64 rs766, fd740; +} +mov.b32 r4861, {rs766, rs766}; +{ +mul.f16x2 r4859, r34, r4861; +} +{ +add.f16x2 r4862, r4838, r4859; +} +{ +cvt.rn.f16.f64 rs767, fd739; +} +mov.b32 r4867, {rs767, rs767}; +{ +mul.f16x2 r4865, r28, r4867; +} +{ +add.f16x2 r4868, r4844, r4865; +} +{ +cvt.rn.f16.f64 rs768, fd740; +} +mov.b32 r4873, {rs768, rs768}; +{ +mul.f16x2 r4871, r31, r4873; +} +{ +add.f16x2 r4874, r4850, r4871; +} +{ +cvt.rn.f16.f64 rs769, fd743; +} +mov.b32 r4879, {rs769, rs769}; +{ +mul.f16x2 r4877, r37, r4879; +} +{ +add.f16x2 r4880, r4856, r4877; +} +{ +cvt.rn.f16.f64 rs770, fd744; +} +mov.b32 r4885, {rs770, rs770}; +{ +mul.f16x2 r4883, r46, r4885; +} +{ +add.f16x2 r4886, r4862, r4883; +} +{ +cvt.rn.f16.f64 rs771, fd743; +} +mov.b32 r4891, {rs771, rs771}; +{ +mul.f16x2 r4889, r40, r4891; +} +{ +add.f16x2 r4892, r4868, r4889; +} +{ +cvt.rn.f16.f64 rs772, fd744; +} +mov.b32 r4897, {rs772, rs772}; +{ +mul.f16x2 r4895, r43, r4897; +} +{ +add.f16x2 r4898, r4874, r4895; +} +{ +cvt.rn.f16.f64 rs773, fd747; +} +mov.b32 r4903, {rs773, rs773}; +{ +mul.f16x2 r4901, r49, r4903; +} +{ +add.f16x2 r4904, r4880, r4901; +} +{ +cvt.rn.f16.f64 rs774, fd748; +} +mov.b32 r4909, {rs774, rs774}; +{ +mul.f16x2 r4907, r58, r4909; +} +{ +add.f16x2 r4910, r4886, r4907; +} +{ +cvt.rn.f16.f64 rs775, fd747; +} +mov.b32 r4915, {rs775, rs775}; +{ +mul.f16x2 r4913, r52, r4915; +} +{ +add.f16x2 r4916, r4892, r4913; +} +{ +cvt.rn.f16.f64 rs776, fd748; +} +mov.b32 r4921, {rs776, rs776}; +{ +mul.f16x2 r4919, r55, r4921; +} +{ +add.f16x2 r4922, r4898, r4919; +} +{ +cvt.rn.f16.f64 rs777, fd751; +} 
+mov.b32 r4927, {rs777, rs777}; +{ +mul.f16x2 r4925, r61, r4927; +} +{ +add.f16x2 r4928, r4904, r4925; +} +{ +cvt.rn.f16.f64 rs778, fd752; +} +mov.b32 r4933, {rs778, rs778}; +{ +mul.f16x2 r4931, r70, r4933; +} +{ +add.f16x2 r4934, r4910, r4931; +} +{ +cvt.rn.f16.f64 rs779, fd751; +} +mov.b32 r4939, {rs779, rs779}; +{ +mul.f16x2 r4937, r64, r4939; +} +{ +add.f16x2 r4940, r4916, r4937; +} +{ +cvt.rn.f16.f64 rs780, fd752; +} +mov.b32 r4945, {rs780, rs780}; +{ +mul.f16x2 r4943, r67, r4945; +} +{ +add.f16x2 r4946, r4922, r4943; +} +{ +cvt.rn.f16.f64 rs781, fd755; +} +mov.b32 r4951, {rs781, rs781}; +{ +mul.f16x2 r4949, r73, r4951; +} +{ +add.f16x2 r4952, r4928, r4949; +} +{ +cvt.rn.f16.f64 rs782, fd756; +} +mov.b32 r4957, {rs782, rs782}; +{ +mul.f16x2 r4955, r82, r4957; +} +{ +add.f16x2 r4958, r4934, r4955; +} +{ +cvt.rn.f16.f64 rs783, fd755; +} +mov.b32 r4963, {rs783, rs783}; +{ +mul.f16x2 r4961, r76, r4963; +} +{ +add.f16x2 r4964, r4940, r4961; +} +{ +cvt.rn.f16.f64 rs784, fd756; +} +mov.b32 r4969, {rs784, rs784}; +{ +mul.f16x2 r4967, r79, r4969; +} +{ +add.f16x2 r4970, r4946, r4967; +} +{ +cvt.rn.f16.f64 rs785, fd759; +} +mov.b32 r4975, {rs785, rs785}; +{ +mul.f16x2 r4973, r85, r4975; +} +{ +add.f16x2 r4976, r4952, r4973; +} +{ +cvt.rn.f16.f64 rs786, fd760; +} +mov.b32 r4981, {rs786, rs786}; +{ +mul.f16x2 r4979, r94, r4981; +} +{ +add.f16x2 r4982, r4958, r4979; +} +{ +cvt.rn.f16.f64 rs787, fd759; +} +mov.b32 r4987, {rs787, rs787}; +{ +mul.f16x2 r4985, r88, r4987; +} +{ +add.f16x2 r4988, r4964, r4985; +} +{ +cvt.rn.f16.f64 rs788, fd760; +} +mov.b32 r4993, {rs788, rs788}; +{ +mul.f16x2 r4991, r91, r4993; +} +{ +add.f16x2 r4994, r4970, r4991; +} +{ +cvt.rn.f16.f64 rs789, fd763; +} +mov.b32 r4999, {rs789, rs789}; +{ +mul.f16x2 r4997, r97, r4999; +} +{ +add.f16x2 r5000, r4976, r4997; +} +{ +cvt.rn.f16.f64 rs790, fd764; +} +mov.b32 r5005, {rs790, rs790}; +{ +mul.f16x2 r5003, r106, r5005; +} +{ +add.f16x2 r5006, r4982, r5003; +} +{ +cvt.rn.f16.f64 rs791, fd763; +} +mov.b32 
r5011, {rs791, rs791}; +{ +mul.f16x2 r5009, r100, r5011; +} +{ +add.f16x2 r5012, r4988, r5009; +} +{ +cvt.rn.f16.f64 rs792, fd764; +} +mov.b32 r5017, {rs792, rs792}; +{ +mul.f16x2 r5015, r103, r5017; +} +{ +add.f16x2 r5018, r4994, r5015; +} +{ +cvt.rn.f16.f64 rs793, fd767; +} +mov.b32 r5023, {rs793, rs793}; +{ +mul.f16x2 r5021, r109, r5023; +} +{ +add.f16x2 r5024, r5000, r5021; +} +{ +cvt.rn.f16.f64 rs794, fd768; +} +mov.b32 r5029, {rs794, rs794}; +{ +mul.f16x2 r5027, r118, r5029; +} +{ +add.f16x2 r5030, r5006, r5027; +} +{ +cvt.rn.f16.f64 rs795, fd767; +} +mov.b32 r5035, {rs795, rs795}; +{ +mul.f16x2 r5033, r112, r5035; +} +{ +add.f16x2 r5036, r5012, r5033; +} +{ +cvt.rn.f16.f64 rs796, fd768; +} +mov.b32 r5041, {rs796, rs796}; +{ +mul.f16x2 r5039, r115, r5041; +} +{ +add.f16x2 r5042, r5018, r5039; +} +{ +cvt.rn.f16.f64 rs797, fd771; +} +mov.b32 r5047, {rs797, rs797}; +{ +mul.f16x2 r5045, r121, r5047; +} +{ +add.f16x2 r5048, r5024, r5045; +} +{ +cvt.rn.f16.f64 rs798, fd772; +} +mov.b32 r5053, {rs798, rs798}; +{ +mul.f16x2 r5051, r130, r5053; +} +{ +add.f16x2 r5054, r5030, r5051; +} +{ +cvt.rn.f16.f64 rs799, fd771; +} +mov.b32 r5059, {rs799, rs799}; +{ +mul.f16x2 r5057, r124, r5059; +} +{ +add.f16x2 r5060, r5036, r5057; +} +{ +cvt.rn.f16.f64 rs800, fd772; +} +mov.b32 r5065, {rs800, rs800}; +{ +mul.f16x2 r5063, r127, r5065; +} +{ +add.f16x2 r5066, r5042, r5063; +} +{ +cvt.rn.f16.f64 rs801, fd775; +} +mov.b32 r5071, {rs801, rs801}; +{ +mul.f16x2 r5069, r133, r5071; +} +{ +add.f16x2 r5072, r5048, r5069; +} +{ +cvt.rn.f16.f64 rs802, fd776; +} +mov.b32 r5077, {rs802, rs802}; +{ +mul.f16x2 r5075, r142, r5077; +} +{ +add.f16x2 r5078, r5054, r5075; +} +{ +cvt.rn.f16.f64 rs803, fd775; +} +mov.b32 r5083, {rs803, rs803}; +{ +mul.f16x2 r5081, r136, r5083; +} +{ +add.f16x2 r5084, r5060, r5081; +} +{ +cvt.rn.f16.f64 rs804, fd776; +} +mov.b32 r5089, {rs804, rs804}; +{ +mul.f16x2 r5087, r139, r5089; +} +{ +add.f16x2 r5090, r5066, r5087; +} +{ +cvt.rn.f16.f64 rs805, fd779; +} 
+mov.b32 r5095, {rs805, rs805}; +{ +mul.f16x2 r5093, r145, r5095; +} +{ +add.f16x2 r5096, r5072, r5093; +} +{ +cvt.rn.f16.f64 rs806, fd780; +} +mov.b32 r5101, {rs806, rs806}; +{ +mul.f16x2 r5099, r154, r5101; +} +{ +add.f16x2 r5102, r5078, r5099; +} +{ +cvt.rn.f16.f64 rs807, fd779; +} +mov.b32 r5107, {rs807, rs807}; +{ +mul.f16x2 r5105, r148, r5107; +} +{ +add.f16x2 r5108, r5084, r5105; +} +{ +cvt.rn.f16.f64 rs808, fd780; +} +mov.b32 r5113, {rs808, rs808}; +{ +mul.f16x2 r5111, r151, r5113; +} +{ +add.f16x2 r5114, r5090, r5111; +} +{ +cvt.rn.f16.f64 rs809, fd783; +} +mov.b32 r5119, {rs809, rs809}; +{ +mul.f16x2 r5117, r157, r5119; +} +{ +add.f16x2 r5120, r5096, r5117; +} +{ +cvt.rn.f16.f64 rs810, fd784; +} +mov.b32 r5125, {rs810, rs810}; +{ +mul.f16x2 r5123, r166, r5125; +} +{ +add.f16x2 r5126, r5102, r5123; +} +{ +cvt.rn.f16.f64 rs811, fd783; +} +mov.b32 r5131, {rs811, rs811}; +{ +mul.f16x2 r5129, r160, r5131; +} +{ +add.f16x2 r5132, r5108, r5129; +} +{ +cvt.rn.f16.f64 rs812, fd784; +} +mov.b32 r5137, {rs812, rs812}; +{ +mul.f16x2 r5135, r163, r5137; +} +{ +add.f16x2 r5138, r5114, r5135; +} +{ +sub.f16x2 %28, r5120, r5126; +} +{ +add.f16x2 %29, r5132, r5138; +} +{ +add.f16x2 %30, r5120, r5126; +} +{ +sub.f16x2 %31, r5132, r5138; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), 
"=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), 
"r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..69d27ba3396d6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp16_inv.hpp.inc @@ -0,0 +1,8377 @@ +#ifndef CUFFTDX_FFT_29_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_29_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<961, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<813>; +.reg .b32 r<5153>; +.reg .f64 fd<785>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %60, %114; +} +{ +add.f16x2 r4, %61, %115; +} +{ +sub.f16x2 r7, %60, %114; +} +{ +sub.f16x2 r10, %61, %115; +} +{ +add.f16x2 r13, %62, %112; +} +{ 
+add.f16x2 r16, %63, %113; +} +{ +sub.f16x2 r19, %62, %112; +} +{ +sub.f16x2 r22, %63, %113; +} +{ +add.f16x2 r25, %64, %110; +} +{ +add.f16x2 r28, %65, %111; +} +{ +sub.f16x2 r31, %64, %110; +} +{ +sub.f16x2 r34, %65, %111; +} +{ +add.f16x2 r37, %66, %108; +} +{ +add.f16x2 r40, %67, %109; +} +{ +sub.f16x2 r43, %66, %108; +} +{ +sub.f16x2 r46, %67, %109; +} +{ +add.f16x2 r49, %68, %106; +} +{ +add.f16x2 r52, %69, %107; +} +{ +sub.f16x2 r55, %68, %106; +} +{ +sub.f16x2 r58, %69, %107; +} +{ +add.f16x2 r61, %70, %104; +} +{ +add.f16x2 r64, %71, %105; +} +{ +sub.f16x2 r67, %70, %104; +} +{ +sub.f16x2 r70, %71, %105; +} +{ +add.f16x2 r73, %72, %102; +} +{ +add.f16x2 r76, %73, %103; +} +{ +sub.f16x2 r79, %72, %102; +} +{ +sub.f16x2 r82, %73, %103; +} +{ +add.f16x2 r85, %74, %100; +} +{ +add.f16x2 r88, %75, %101; +} +{ +sub.f16x2 r91, %74, %100; +} +{ +sub.f16x2 r94, %75, %101; +} +{ +add.f16x2 r97, %76, %98; +} +{ +add.f16x2 r100, %77, %99; +} +{ +sub.f16x2 r103, %76, %98; +} +{ +sub.f16x2 r106, %77, %99; +} +{ +add.f16x2 r109, %78, %96; +} +{ +add.f16x2 r112, %79, %97; +} +{ +sub.f16x2 r115, %78, %96; +} +{ +sub.f16x2 r118, %79, %97; +} +{ +add.f16x2 r121, %80, %94; +} +{ +add.f16x2 r124, %81, %95; +} +{ +sub.f16x2 r127, %80, %94; +} +{ +sub.f16x2 r130, %81, %95; +} +{ +add.f16x2 r133, %82, %92; +} +{ +add.f16x2 r136, %83, %93; +} +{ +sub.f16x2 r139, %82, %92; +} +{ +sub.f16x2 r142, %83, %93; +} +{ +add.f16x2 r145, %84, %90; +} +{ +add.f16x2 r148, %85, %91; +} +{ +sub.f16x2 r151, %84, %90; +} +{ +sub.f16x2 r154, %85, %91; +} +{ +add.f16x2 r157, %86, %88; +} +{ +add.f16x2 r160, %87, %89; +} +{ +sub.f16x2 r163, %86, %88; +} +{ +sub.f16x2 r166, %87, %89; +} +{ +add.f16x2 r169, %58, r1; +} +{ +add.f16x2 r172, %59, r4; +} +{ +add.f16x2 r175, r169, r13; +} +{ +add.f16x2 r178, r172, r16; +} +{ +add.f16x2 r181, r175, r25; +} +{ +add.f16x2 r184, r178, r28; +} +{ +add.f16x2 r187, r181, r37; +} +{ +add.f16x2 r190, r184, r40; +} +{ +add.f16x2 r193, r187, r49; +} +{ +add.f16x2 
r196, r190, r52; +} +{ +add.f16x2 r199, r193, r61; +} +{ +add.f16x2 r202, r196, r64; +} +{ +add.f16x2 r205, r199, r73; +} +{ +add.f16x2 r208, r202, r76; +} +{ +add.f16x2 r211, r205, r85; +} +{ +add.f16x2 r214, r208, r88; +} +{ +add.f16x2 r217, r211, r97; +} +{ +add.f16x2 r220, r214, r100; +} +{ +add.f16x2 r223, r217, r109; +} +{ +add.f16x2 r226, r220, r112; +} +{ +add.f16x2 r229, r223, r121; +} +{ +add.f16x2 r232, r226, r124; +} +{ +add.f16x2 r235, r229, r133; +} +{ +add.f16x2 r238, r232, r136; +} +{ +add.f16x2 r241, r235, r145; +} +{ +add.f16x2 r244, r238, r148; +} +{ +add.f16x2 %0, r241, r157; +} +{ +add.f16x2 %1, r244, r160; +} +mov.u32 r4804, 0; +cvt.rn.f16.s32 rs1, r4804; +mov.b32 r265, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r4804; +mov.b32 r277, {rs2, rs2}; +mov.f64 fd735, 0d3FEF4079C06C0992; +{ +cvt.rn.f16.f64 rs3, fd735; +} +mov.b32 r257, {rs3, rs3}; +{ +mul.f16x2 r255, r1, r257; +} +{ +add.f16x2 r258, %58, r255; +} +mov.f64 fd708, 0d3FCB8426C12812BC; +{ +cvt.rn.f16.f64 rs4, fd708; +} +mov.b32 r263, {rs4, rs4}; +{ +mul.f16x2 r261, r10, r263; +} +{ +add.f16x2 r264, r265, r261; +} +{ +cvt.rn.f16.f64 rs5, fd735; +} +mov.b32 r269, {rs5, rs5}; +{ +mul.f16x2 r267, r4, r269; +} +{ +add.f16x2 r270, %59, r267; +} +{ +cvt.rn.f16.f64 rs6, fd708; +} +mov.b32 r275, {rs6, rs6}; +{ +mul.f16x2 r273, r7, r275; +} +{ +add.f16x2 r276, r277, r273; +} +mov.f64 fd743, 0d3FED0ADB9B447CCF; +{ +cvt.rn.f16.f64 rs7, fd743; +} +mov.b32 r281, {rs7, rs7}; +{ +mul.f16x2 r279, r13, r281; +} +{ +add.f16x2 r282, r258, r279; +} +mov.f64 fd636, 0d3FDADF7689C97B70; +{ +cvt.rn.f16.f64 rs8, fd636; +} +mov.b32 r287, {rs8, rs8}; +{ +mul.f16x2 r285, r22, r287; +} +{ +add.f16x2 r288, r264, r285; +} +{ +cvt.rn.f16.f64 rs9, fd743; +} +mov.b32 r293, {rs9, rs9}; +{ +mul.f16x2 r291, r16, r293; +} +{ +add.f16x2 r294, r270, r291; +} +{ +cvt.rn.f16.f64 rs10, fd636; +} +mov.b32 r299, {rs10, rs10}; +{ +mul.f16x2 r297, r19, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd751, 0d3FE979982A38E65A; +{ 
+cvt.rn.f16.f64 rs11, fd751; +} +mov.b32 r305, {rs11, rs11}; +{ +mul.f16x2 r303, r25, r305; +} +{ +add.f16x2 r306, r282, r303; +} +mov.f64 fd540, 0d3FE35D9650D47852; +{ +cvt.rn.f16.f64 rs12, fd540; +} +mov.b32 r311, {rs12, rs12}; +{ +mul.f16x2 r309, r34, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs13, fd751; +} +mov.b32 r317, {rs13, rs13}; +{ +mul.f16x2 r315, r28, r317; +} +{ +add.f16x2 r318, r294, r315; +} +{ +cvt.rn.f16.f64 rs14, fd540; +} +mov.b32 r323, {rs14, rs14}; +{ +mul.f16x2 r321, r31, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd759, 0d3FE4B76371208A62; +{ +cvt.rn.f16.f64 rs15, fd759; +} +mov.b32 r329, {rs15, rs15}; +{ +mul.f16x2 r327, r37, r329; +} +{ +add.f16x2 r330, r306, r327; +} +mov.f64 fd700, 0d3FE863A1ADA0CFA6; +{ +cvt.rn.f16.f64 rs16, fd700; +} +mov.b32 r335, {rs16, rs16}; +{ +mul.f16x2 r333, r46, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs17, fd759; +} +mov.b32 r341, {rs17, rs17}; +{ +mul.f16x2 r339, r40, r341; +} +{ +add.f16x2 r342, r318, r339; +} +{ +cvt.rn.f16.f64 rs18, fd700; +} +mov.b32 r347, {rs18, rs18}; +{ +mul.f16x2 r345, r43, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd767, 0d3FDDFA67657E7608; +{ +cvt.rn.f16.f64 rs19, fd767; +} +mov.b32 r353, {rs19, rs19}; +{ +mul.f16x2 r351, r49, r353; +} +{ +add.f16x2 r354, r330, r351; +} +mov.f64 fd604, 0d3FEC45BB0D10918C; +{ +cvt.rn.f16.f64 rs20, fd604; +} +mov.b32 r359, {rs20, rs20}; +{ +mul.f16x2 r357, r58, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs21, fd767; +} +mov.b32 r365, {rs21, rs21}; +{ +mul.f16x2 r363, r52, r365; +} +{ +add.f16x2 r366, r342, r363; +} +{ +cvt.rn.f16.f64 rs22, fd604; +} +mov.b32 r371, {rs22, rs22}; +{ +mul.f16x2 r369, r55, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd775, 0d3FD11F2F2E2F1E3B; +{ +cvt.rn.f16.f64 rs23, fd775; +} +mov.b32 r377, {rs23, rs23}; +{ +mul.f16x2 r375, r61, r377; +} +{ +add.f16x2 r378, r354, r375; +} +mov.f64 fd424, 0d3FEED566CB3DCBA1; +{ +cvt.rn.f16.f64 
rs24, fd424; +} +mov.b32 r383, {rs24, rs24}; +{ +mul.f16x2 r381, r70, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs25, fd775; +} +mov.b32 r389, {rs25, rs25}; +{ +mul.f16x2 r387, r64, r389; +} +{ +add.f16x2 r390, r366, r387; +} +{ +cvt.rn.f16.f64 rs26, fd424; +} +mov.b32 r395, {rs26, rs26}; +{ +mul.f16x2 r393, r67, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd783, 0d3FABB81853A18977; +{ +cvt.rn.f16.f64 rs27, fd783; +} +mov.b32 r401, {rs27, rs27}; +{ +mul.f16x2 r399, r73, r401; +} +{ +add.f16x2 r402, r378, r399; +} +mov.f64 fd692, 0d3FEFF3FC588E859D; +{ +cvt.rn.f16.f64 rs28, fd692; +} +mov.b32 r407, {rs28, rs28}; +{ +mul.f16x2 r405, r82, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs29, fd783; +} +mov.b32 r413, {rs29, rs29}; +{ +mul.f16x2 r411, r76, r413; +} +{ +add.f16x2 r414, r390, r411; +} +{ +cvt.rn.f16.f64 rs30, fd692; +} +mov.b32 r419, {rs30, rs30}; +{ +mul.f16x2 r417, r79, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 fd779, 0dBFC4B545C0234A71; +{ +cvt.rn.f16.f64 rs31, fd779; +} +mov.b32 r425, {rs31, rs31}; +{ +mul.f16x2 r423, r85, r425; +} +{ +add.f16x2 r426, r402, r423; +} +mov.f64 fd780, 0d3FEF941537248537; +{ +cvt.rn.f16.f64 rs32, fd780; +} +mov.b32 r431, {rs32, rs32}; +{ +mul.f16x2 r429, r94, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs33, fd779; +} +mov.b32 r437, {rs33, rs33}; +{ +mul.f16x2 r435, r88, r437; +} +{ +add.f16x2 r438, r414, r435; +} +{ +cvt.rn.f16.f64 rs34, fd780; +} +mov.b32 r443, {rs34, rs34}; +{ +mul.f16x2 r441, r91, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd771, 0dBFD7B057F20BF2E4; +{ +cvt.rn.f16.f64 rs35, fd771; +} +mov.b32 r449, {rs35, rs35}; +{ +mul.f16x2 r447, r97, r449; +} +{ +add.f16x2 r450, r426, r447; +} +mov.f64 fd772, 0d3FEDBA2D62CB789F; +{ +cvt.rn.f16.f64 rs36, fd772; +} +mov.b32 r455, {rs36, rs36}; +{ +mul.f16x2 r453, r106, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs37, fd771; +} +mov.b32 r461, {rs37, rs37}; +{ 
+mul.f16x2 r459, r100, r461; +} +{ +add.f16x2 r462, r438, r459; +} +{ +cvt.rn.f16.f64 rs38, fd772; +} +mov.b32 r467, {rs38, rs38}; +{ +mul.f16x2 r465, r103, r467; +} +{ +add.f16x2 r468, r444, r465; +} +mov.f64 fd763, 0dBFE1F53E93956DBF; +{ +cvt.rn.f16.f64 rs39, fd763; +} +mov.b32 r473, {rs39, rs39}; +{ +mul.f16x2 r471, r109, r473; +} +{ +add.f16x2 r474, r450, r471; +} +mov.f64 fd764, 0d3FEA7C6DA34AF89F; +{ +cvt.rn.f16.f64 rs40, fd764; +} +mov.b32 r479, {rs40, rs40}; +{ +mul.f16x2 r477, r118, r479; +} +{ +add.f16x2 r480, r456, r477; +} +{ +cvt.rn.f16.f64 rs41, fd763; +} +mov.b32 r485, {rs41, rs41}; +{ +mul.f16x2 r483, r112, r485; +} +{ +add.f16x2 r486, r462, r483; +} +{ +cvt.rn.f16.f64 rs42, fd764; +} +mov.b32 r491, {rs42, rs42}; +{ +mul.f16x2 r489, r115, r491; +} +{ +add.f16x2 r492, r468, r489; +} +mov.f64 fd755, 0dBFE73B5AE5DB4E10; +{ +cvt.rn.f16.f64 rs43, fd755; +} +mov.b32 r497, {rs43, rs43}; +{ +mul.f16x2 r495, r121, r497; +} +{ +add.f16x2 r498, r474, r495; +} +mov.f64 fd756, 0d3FE601A24BA81342; +{ +cvt.rn.f16.f64 rs44, fd756; +} +mov.b32 r503, {rs44, rs44}; +{ +mul.f16x2 r501, r130, r503; +} +{ +add.f16x2 r504, r480, r501; +} +{ +cvt.rn.f16.f64 rs45, fd755; +} +mov.b32 r509, {rs45, rs45}; +{ +mul.f16x2 r507, r124, r509; +} +{ +add.f16x2 r510, r486, r507; +} +{ +cvt.rn.f16.f64 rs46, fd756; +} +mov.b32 r515, {rs46, rs46}; +{ +mul.f16x2 r513, r127, r515; +} +{ +add.f16x2 r516, r492, r513; +} +mov.f64 fd747, 0dBFEB6B5FBD9F7255; +{ +cvt.rn.f16.f64 rs47, fd747; +} +mov.b32 r521, {rs47, rs47}; +{ +mul.f16x2 r519, r133, r521; +} +{ +add.f16x2 r522, r498, r519; +} +mov.f64 fd748, 0d3FE07F6ACD7CDCE2; +{ +cvt.rn.f16.f64 rs48, fd748; +} +mov.b32 r527, {rs48, rs48}; +{ +mul.f16x2 r525, r142, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs49, fd747; +} +mov.b32 r533, {rs49, rs49}; +{ +mul.f16x2 r531, r136, r533; +} +{ +add.f16x2 r534, r510, r531; +} +{ +cvt.rn.f16.f64 rs50, fd748; +} +mov.b32 r539, {rs50, rs50}; +{ +mul.f16x2 r537, r139, r539; +} +{ 
+add.f16x2 r540, r516, r537; +} +mov.f64 fd739, 0dBFEE532CBE45C954; +{ +cvt.rn.f16.f64 rs51, fd739; +} +mov.b32 r545, {rs51, rs51}; +{ +mul.f16x2 r543, r145, r545; +} +{ +add.f16x2 r546, r522, r543; +} +mov.f64 fd740, 0d3FD46F6FAF5FCB72; +{ +cvt.rn.f16.f64 rs52, fd740; +} +mov.b32 r551, {rs52, rs52}; +{ +mul.f16x2 r549, r154, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs53, fd739; +} +mov.b32 r557, {rs53, rs53}; +{ +mul.f16x2 r555, r148, r557; +} +{ +add.f16x2 r558, r534, r555; +} +{ +cvt.rn.f16.f64 rs54, fd740; +} +mov.b32 r563, {rs54, rs54}; +{ +mul.f16x2 r561, r151, r563; +} +{ +add.f16x2 r564, r540, r561; +} +mov.f64 fd731, 0dBFEFCFFA67B61650; +{ +cvt.rn.f16.f64 rs55, fd731; +} +mov.b32 r569, {rs55, rs55}; +{ +mul.f16x2 r567, r157, r569; +} +{ +add.f16x2 r570, r546, r567; +} +mov.f64 fd732, 0d3FBBADB02034D9FF; +{ +cvt.rn.f16.f64 rs56, fd732; +} +mov.b32 r575, {rs56, rs56}; +{ +mul.f16x2 r573, r166, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs57, fd731; +} +mov.b32 r581, {rs57, rs57}; +{ +mul.f16x2 r579, r160, r581; +} +{ +add.f16x2 r582, r558, r579; +} +{ +cvt.rn.f16.f64 rs58, fd732; +} +mov.b32 r587, {rs58, rs58}; +{ +mul.f16x2 r585, r163, r587; +} +{ +add.f16x2 r588, r564, r585; +} +{ +sub.f16x2 %2, r570, r576; +} +{ +add.f16x2 %3, r582, r588; +} +{ +add.f16x2 %56, r570, r576; +} +{ +sub.f16x2 %57, r582, r588; +} +cvt.rn.f16.s32 rs59, r4804; +mov.b32 r615, {rs59, rs59}; +cvt.rn.f16.s32 rs60, r4804; +mov.b32 r627, {rs60, rs60}; +{ +cvt.rn.f16.f64 rs61, fd743; +} +mov.b32 r607, {rs61, rs61}; +{ +mul.f16x2 r605, r1, r607; +} +{ +add.f16x2 r608, %58, r605; +} +{ +cvt.rn.f16.f64 rs62, fd636; +} +mov.b32 r613, {rs62, rs62}; +{ +mul.f16x2 r611, r10, r613; +} +{ +add.f16x2 r614, r615, r611; +} +{ +cvt.rn.f16.f64 rs63, fd743; +} +mov.b32 r619, {rs63, rs63}; +{ +mul.f16x2 r617, r4, r619; +} +{ +add.f16x2 r620, %59, r617; +} +{ +cvt.rn.f16.f64 rs64, fd636; +} +mov.b32 r625, {rs64, rs64}; +{ +mul.f16x2 r623, r7, r625; +} +{ 
+add.f16x2 r626, r627, r623; +} +{ +cvt.rn.f16.f64 rs65, fd759; +} +mov.b32 r631, {rs65, rs65}; +{ +mul.f16x2 r629, r13, r631; +} +{ +add.f16x2 r632, r608, r629; +} +{ +cvt.rn.f16.f64 rs66, fd700; +} +mov.b32 r637, {rs66, rs66}; +{ +mul.f16x2 r635, r22, r637; +} +{ +add.f16x2 r638, r614, r635; +} +{ +cvt.rn.f16.f64 rs67, fd759; +} +mov.b32 r643, {rs67, rs67}; +{ +mul.f16x2 r641, r16, r643; +} +{ +add.f16x2 r644, r620, r641; +} +{ +cvt.rn.f16.f64 rs68, fd700; +} +mov.b32 r649, {rs68, rs68}; +{ +mul.f16x2 r647, r19, r649; +} +{ +add.f16x2 r650, r626, r647; +} +{ +cvt.rn.f16.f64 rs69, fd775; +} +mov.b32 r655, {rs69, rs69}; +{ +mul.f16x2 r653, r25, r655; +} +{ +add.f16x2 r656, r632, r653; +} +{ +cvt.rn.f16.f64 rs70, fd424; +} +mov.b32 r661, {rs70, rs70}; +{ +mul.f16x2 r659, r34, r661; +} +{ +add.f16x2 r662, r638, r659; +} +{ +cvt.rn.f16.f64 rs71, fd775; +} +mov.b32 r667, {rs71, rs71}; +{ +mul.f16x2 r665, r28, r667; +} +{ +add.f16x2 r668, r644, r665; +} +{ +cvt.rn.f16.f64 rs72, fd424; +} +mov.b32 r673, {rs72, rs72}; +{ +mul.f16x2 r671, r31, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs73, fd779; +} +mov.b32 r679, {rs73, rs73}; +{ +mul.f16x2 r677, r37, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs74, fd780; +} +mov.b32 r685, {rs74, rs74}; +{ +mul.f16x2 r683, r46, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs75, fd779; +} +mov.b32 r691, {rs75, rs75}; +{ +mul.f16x2 r689, r40, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs76, fd780; +} +mov.b32 r697, {rs76, rs76}; +{ +mul.f16x2 r695, r43, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs77, fd763; +} +mov.b32 r703, {rs77, rs77}; +{ +mul.f16x2 r701, r49, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs78, fd764; +} +mov.b32 r709, {rs78, rs78}; +{ +mul.f16x2 r707, r58, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs79, fd763; +} +mov.b32 r715, {rs79, rs79}; +{ +mul.f16x2 r713, r52, r715; +} +{ 
+add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs80, fd764; +} +mov.b32 r721, {rs80, rs80}; +{ +mul.f16x2 r719, r55, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs81, fd747; +} +mov.b32 r727, {rs81, rs81}; +{ +mul.f16x2 r725, r61, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs82, fd748; +} +mov.b32 r733, {rs82, rs82}; +{ +mul.f16x2 r731, r70, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs83, fd747; +} +mov.b32 r739, {rs83, rs83}; +{ +mul.f16x2 r737, r64, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs84, fd748; +} +mov.b32 r745, {rs84, rs84}; +{ +mul.f16x2 r743, r67, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +cvt.rn.f16.f64 rs85, fd731; +} +mov.b32 r751, {rs85, rs85}; +{ +mul.f16x2 r749, r73, r751; +} +{ +add.f16x2 r752, r728, r749; +} +{ +cvt.rn.f16.f64 rs86, fd732; +} +mov.b32 r757, {rs86, rs86}; +{ +mul.f16x2 r755, r82, r757; +} +{ +add.f16x2 r758, r734, r755; +} +{ +cvt.rn.f16.f64 rs87, fd731; +} +mov.b32 r763, {rs87, rs87}; +{ +mul.f16x2 r761, r76, r763; +} +{ +add.f16x2 r764, r740, r761; +} +{ +cvt.rn.f16.f64 rs88, fd732; +} +mov.b32 r769, {rs88, rs88}; +{ +mul.f16x2 r767, r79, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs89, fd739; +} +mov.b32 r775, {rs89, rs89}; +{ +mul.f16x2 r773, r85, r775; +} +{ +add.f16x2 r776, r752, r773; +} +mov.f64 fd660, 0dBFD46F6FAF5FCB72; +{ +cvt.rn.f16.f64 rs90, fd660; +} +mov.b32 r781, {rs90, rs90}; +{ +mul.f16x2 r779, r94, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs91, fd739; +} +mov.b32 r787, {rs91, rs91}; +{ +mul.f16x2 r785, r88, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs92, fd660; +} +mov.b32 r793, {rs92, rs92}; +{ +mul.f16x2 r791, r91, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs93, fd755; +} +mov.b32 r799, {rs93, rs93}; +{ +mul.f16x2 r797, r97, r799; +} +{ +add.f16x2 r800, r776, r797; +} +mov.f64 fd456, 0dBFE601A24BA81342; +{ +cvt.rn.f16.f64 rs94, fd456; 
+} +mov.b32 r805, {rs94, rs94}; +{ +mul.f16x2 r803, r106, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs95, fd755; +} +mov.b32 r811, {rs95, rs95}; +{ +mul.f16x2 r809, r100, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs96, fd456; +} +mov.b32 r817, {rs96, rs96}; +{ +mul.f16x2 r815, r103, r817; +} +{ +add.f16x2 r818, r794, r815; +} +{ +cvt.rn.f16.f64 rs97, fd771; +} +mov.b32 r823, {rs97, rs97}; +{ +mul.f16x2 r821, r109, r823; +} +{ +add.f16x2 r824, r800, r821; +} +mov.f64 fd696, 0dBFEDBA2D62CB789F; +{ +cvt.rn.f16.f64 rs98, fd696; +} +mov.b32 r829, {rs98, rs98}; +{ +mul.f16x2 r827, r118, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs99, fd771; +} +mov.b32 r835, {rs99, rs99}; +{ +mul.f16x2 r833, r112, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs100, fd696; +} +mov.b32 r841, {rs100, rs100}; +{ +mul.f16x2 r839, r115, r841; +} +{ +add.f16x2 r842, r818, r839; +} +{ +cvt.rn.f16.f64 rs101, fd783; +} +mov.b32 r847, {rs101, rs101}; +{ +mul.f16x2 r845, r121, r847; +} +{ +add.f16x2 r848, r824, r845; +} +mov.f64 fd784, 0dBFEFF3FC588E859D; +{ +cvt.rn.f16.f64 rs102, fd784; +} +mov.b32 r853, {rs102, rs102}; +{ +mul.f16x2 r851, r130, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs103, fd783; +} +mov.b32 r859, {rs103, rs103}; +{ +mul.f16x2 r857, r124, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs104, fd784; +} +mov.b32 r865, {rs104, rs104}; +{ +mul.f16x2 r863, r127, r865; +} +{ +add.f16x2 r866, r842, r863; +} +{ +cvt.rn.f16.f64 rs105, fd767; +} +mov.b32 r871, {rs105, rs105}; +{ +mul.f16x2 r869, r133, r871; +} +{ +add.f16x2 r872, r848, r869; +} +mov.f64 fd768, 0dBFEC45BB0D10918C; +{ +cvt.rn.f16.f64 rs106, fd768; +} +mov.b32 r877, {rs106, rs106}; +{ +mul.f16x2 r875, r142, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs107, fd767; +} +mov.b32 r883, {rs107, rs107}; +{ +mul.f16x2 r881, r136, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +cvt.rn.f16.f64 
rs108, fd768; +} +mov.b32 r889, {rs108, rs108}; +{ +mul.f16x2 r887, r139, r889; +} +{ +add.f16x2 r890, r866, r887; +} +{ +cvt.rn.f16.f64 rs109, fd751; +} +mov.b32 r895, {rs109, rs109}; +{ +mul.f16x2 r893, r145, r895; +} +{ +add.f16x2 r896, r872, r893; +} +mov.f64 fd752, 0dBFE35D9650D47852; +{ +cvt.rn.f16.f64 rs110, fd752; +} +mov.b32 r901, {rs110, rs110}; +{ +mul.f16x2 r899, r154, r901; +} +{ +add.f16x2 r902, r878, r899; +} +{ +cvt.rn.f16.f64 rs111, fd751; +} +mov.b32 r907, {rs111, rs111}; +{ +mul.f16x2 r905, r148, r907; +} +{ +add.f16x2 r908, r884, r905; +} +{ +cvt.rn.f16.f64 rs112, fd752; +} +mov.b32 r913, {rs112, rs112}; +{ +mul.f16x2 r911, r151, r913; +} +{ +add.f16x2 r914, r890, r911; +} +{ +cvt.rn.f16.f64 rs113, fd735; +} +mov.b32 r919, {rs113, rs113}; +{ +mul.f16x2 r917, r157, r919; +} +{ +add.f16x2 r920, r896, r917; +} +mov.f64 fd736, 0dBFCB8426C12812BC; +{ +cvt.rn.f16.f64 rs114, fd736; +} +mov.b32 r925, {rs114, rs114}; +{ +mul.f16x2 r923, r166, r925; +} +{ +add.f16x2 r926, r902, r923; +} +{ +cvt.rn.f16.f64 rs115, fd735; +} +mov.b32 r931, {rs115, rs115}; +{ +mul.f16x2 r929, r160, r931; +} +{ +add.f16x2 r932, r908, r929; +} +{ +cvt.rn.f16.f64 rs116, fd736; +} +mov.b32 r937, {rs116, rs116}; +{ +mul.f16x2 r935, r163, r937; +} +{ +add.f16x2 r938, r914, r935; +} +{ +sub.f16x2 %4, r920, r926; +} +{ +add.f16x2 %5, r932, r938; +} +{ +add.f16x2 %54, r920, r926; +} +{ +sub.f16x2 %55, r932, r938; +} +cvt.rn.f16.s32 rs117, r4804; +mov.b32 r965, {rs117, rs117}; +cvt.rn.f16.s32 rs118, r4804; +mov.b32 r977, {rs118, rs118}; +{ +cvt.rn.f16.f64 rs119, fd751; +} +mov.b32 r957, {rs119, rs119}; +{ +mul.f16x2 r955, r1, r957; +} +{ +add.f16x2 r958, %58, r955; +} +{ +cvt.rn.f16.f64 rs120, fd540; +} +mov.b32 r963, {rs120, rs120}; +{ +mul.f16x2 r961, r10, r963; +} +{ +add.f16x2 r964, r965, r961; +} +{ +cvt.rn.f16.f64 rs121, fd751; +} +mov.b32 r969, {rs121, rs121}; +{ +mul.f16x2 r967, r4, r969; +} +{ +add.f16x2 r970, %59, r967; +} +{ +cvt.rn.f16.f64 rs122, fd540; +} +mov.b32 r975, 
{rs122, rs122}; +{ +mul.f16x2 r973, r7, r975; +} +{ +add.f16x2 r976, r977, r973; +} +{ +cvt.rn.f16.f64 rs123, fd775; +} +mov.b32 r981, {rs123, rs123}; +{ +mul.f16x2 r979, r13, r981; +} +{ +add.f16x2 r982, r958, r979; +} +{ +cvt.rn.f16.f64 rs124, fd424; +} +mov.b32 r987, {rs124, rs124}; +{ +mul.f16x2 r985, r22, r987; +} +{ +add.f16x2 r988, r964, r985; +} +{ +cvt.rn.f16.f64 rs125, fd775; +} +mov.b32 r993, {rs125, rs125}; +{ +mul.f16x2 r991, r16, r993; +} +{ +add.f16x2 r994, r970, r991; +} +{ +cvt.rn.f16.f64 rs126, fd424; +} +mov.b32 r999, {rs126, rs126}; +{ +mul.f16x2 r997, r19, r999; +} +{ +add.f16x2 r1000, r976, r997; +} +{ +cvt.rn.f16.f64 rs127, fd771; +} +mov.b32 r1005, {rs127, rs127}; +{ +mul.f16x2 r1003, r25, r1005; +} +{ +add.f16x2 r1006, r982, r1003; +} +{ +cvt.rn.f16.f64 rs128, fd772; +} +mov.b32 r1011, {rs128, rs128}; +{ +mul.f16x2 r1009, r34, r1011; +} +{ +add.f16x2 r1012, r988, r1009; +} +{ +cvt.rn.f16.f64 rs129, fd771; +} +mov.b32 r1017, {rs129, rs129}; +{ +mul.f16x2 r1015, r28, r1017; +} +{ +add.f16x2 r1018, r994, r1015; +} +{ +cvt.rn.f16.f64 rs130, fd772; +} +mov.b32 r1023, {rs130, rs130}; +{ +mul.f16x2 r1021, r31, r1023; +} +{ +add.f16x2 r1024, r1000, r1021; +} +{ +cvt.rn.f16.f64 rs131, fd747; +} +mov.b32 r1029, {rs131, rs131}; +{ +mul.f16x2 r1027, r37, r1029; +} +{ +add.f16x2 r1030, r1006, r1027; +} +{ +cvt.rn.f16.f64 rs132, fd748; +} +mov.b32 r1035, {rs132, rs132}; +{ +mul.f16x2 r1033, r46, r1035; +} +{ +add.f16x2 r1036, r1012, r1033; +} +{ +cvt.rn.f16.f64 rs133, fd747; +} +mov.b32 r1041, {rs133, rs133}; +{ +mul.f16x2 r1039, r40, r1041; +} +{ +add.f16x2 r1042, r1018, r1039; +} +{ +cvt.rn.f16.f64 rs134, fd748; +} +mov.b32 r1047, {rs134, rs134}; +{ +mul.f16x2 r1045, r43, r1047; +} +{ +add.f16x2 r1048, r1024, r1045; +} +{ +cvt.rn.f16.f64 rs135, fd731; +} +mov.b32 r1053, {rs135, rs135}; +{ +mul.f16x2 r1051, r49, r1053; +} +{ +add.f16x2 r1054, r1030, r1051; +} +mov.f64 fd576, 0dBFBBADB02034D9FF; +{ +cvt.rn.f16.f64 rs136, fd576; +} +mov.b32 r1059, {rs136, 
rs136}; +{ +mul.f16x2 r1057, r58, r1059; +} +{ +add.f16x2 r1060, r1036, r1057; +} +{ +cvt.rn.f16.f64 rs137, fd731; +} +mov.b32 r1065, {rs137, rs137}; +{ +mul.f16x2 r1063, r52, r1065; +} +{ +add.f16x2 r1066, r1042, r1063; +} +{ +cvt.rn.f16.f64 rs138, fd576; +} +mov.b32 r1071, {rs138, rs138}; +{ +mul.f16x2 r1069, r55, r1071; +} +{ +add.f16x2 r1072, r1048, r1069; +} +{ +cvt.rn.f16.f64 rs139, fd755; +} +mov.b32 r1077, {rs139, rs139}; +{ +mul.f16x2 r1075, r61, r1077; +} +{ +add.f16x2 r1078, r1054, r1075; +} +{ +cvt.rn.f16.f64 rs140, fd456; +} +mov.b32 r1083, {rs140, rs140}; +{ +mul.f16x2 r1081, r70, r1083; +} +{ +add.f16x2 r1084, r1060, r1081; +} +{ +cvt.rn.f16.f64 rs141, fd755; +} +mov.b32 r1089, {rs141, rs141}; +{ +mul.f16x2 r1087, r64, r1089; +} +{ +add.f16x2 r1090, r1066, r1087; +} +{ +cvt.rn.f16.f64 rs142, fd456; +} +mov.b32 r1095, {rs142, rs142}; +{ +mul.f16x2 r1093, r67, r1095; +} +{ +add.f16x2 r1096, r1072, r1093; +} +{ +cvt.rn.f16.f64 rs143, fd779; +} +mov.b32 r1101, {rs143, rs143}; +{ +mul.f16x2 r1099, r73, r1101; +} +{ +add.f16x2 r1102, r1078, r1099; +} +mov.f64 fd652, 0dBFEF941537248537; +{ +cvt.rn.f16.f64 rs144, fd652; +} +mov.b32 r1107, {rs144, rs144}; +{ +mul.f16x2 r1105, r82, r1107; +} +{ +add.f16x2 r1108, r1084, r1105; +} +{ +cvt.rn.f16.f64 rs145, fd779; +} +mov.b32 r1113, {rs145, rs145}; +{ +mul.f16x2 r1111, r76, r1113; +} +{ +add.f16x2 r1114, r1090, r1111; +} +{ +cvt.rn.f16.f64 rs146, fd652; +} +mov.b32 r1119, {rs146, rs146}; +{ +mul.f16x2 r1117, r79, r1119; +} +{ +add.f16x2 r1120, r1096, r1117; +} +{ +cvt.rn.f16.f64 rs147, fd767; +} +mov.b32 r1125, {rs147, rs147}; +{ +mul.f16x2 r1123, r85, r1125; +} +{ +add.f16x2 r1126, r1102, r1123; +} +{ +cvt.rn.f16.f64 rs148, fd768; +} +mov.b32 r1131, {rs148, rs148}; +{ +mul.f16x2 r1129, r94, r1131; +} +{ +add.f16x2 r1132, r1108, r1129; +} +{ +cvt.rn.f16.f64 rs149, fd767; +} +mov.b32 r1137, {rs149, rs149}; +{ +mul.f16x2 r1135, r88, r1137; +} +{ +add.f16x2 r1138, r1114, r1135; +} +{ +cvt.rn.f16.f64 rs150, fd768; +} 
+mov.b32 r1143, {rs150, rs150}; +{ +mul.f16x2 r1141, r91, r1143; +} +{ +add.f16x2 r1144, r1120, r1141; +} +{ +cvt.rn.f16.f64 rs151, fd743; +} +mov.b32 r1149, {rs151, rs151}; +{ +mul.f16x2 r1147, r97, r1149; +} +{ +add.f16x2 r1150, r1126, r1147; +} +mov.f64 fd744, 0dBFDADF7689C97B70; +{ +cvt.rn.f16.f64 rs152, fd744; +} +mov.b32 r1155, {rs152, rs152}; +{ +mul.f16x2 r1153, r106, r1155; +} +{ +add.f16x2 r1156, r1132, r1153; +} +{ +cvt.rn.f16.f64 rs153, fd743; +} +mov.b32 r1161, {rs153, rs153}; +{ +mul.f16x2 r1159, r100, r1161; +} +{ +add.f16x2 r1162, r1138, r1159; +} +{ +cvt.rn.f16.f64 rs154, fd744; +} +mov.b32 r1167, {rs154, rs154}; +{ +mul.f16x2 r1165, r103, r1167; +} +{ +add.f16x2 r1168, r1144, r1165; +} +{ +cvt.rn.f16.f64 rs155, fd735; +} +mov.b32 r1173, {rs155, rs155}; +{ +mul.f16x2 r1171, r109, r1173; +} +{ +add.f16x2 r1174, r1150, r1171; +} +{ +cvt.rn.f16.f64 rs156, fd708; +} +mov.b32 r1179, {rs156, rs156}; +{ +mul.f16x2 r1177, r118, r1179; +} +{ +add.f16x2 r1180, r1156, r1177; +} +{ +cvt.rn.f16.f64 rs157, fd735; +} +mov.b32 r1185, {rs157, rs157}; +{ +mul.f16x2 r1183, r112, r1185; +} +{ +add.f16x2 r1186, r1162, r1183; +} +{ +cvt.rn.f16.f64 rs158, fd708; +} +mov.b32 r1191, {rs158, rs158}; +{ +mul.f16x2 r1189, r115, r1191; +} +{ +add.f16x2 r1192, r1168, r1189; +} +{ +cvt.rn.f16.f64 rs159, fd759; +} +mov.b32 r1197, {rs159, rs159}; +{ +mul.f16x2 r1195, r121, r1197; +} +{ +add.f16x2 r1198, r1174, r1195; +} +{ +cvt.rn.f16.f64 rs160, fd700; +} +mov.b32 r1203, {rs160, rs160}; +{ +mul.f16x2 r1201, r130, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs161, fd759; +} +mov.b32 r1209, {rs161, rs161}; +{ +mul.f16x2 r1207, r124, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs162, fd700; +} +mov.b32 r1215, {rs162, rs162}; +{ +mul.f16x2 r1213, r127, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +{ +cvt.rn.f16.f64 rs163, fd783; +} +mov.b32 r1221, {rs163, rs163}; +{ +mul.f16x2 r1219, r133, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} 
+{ +cvt.rn.f16.f64 rs164, fd692; +} +mov.b32 r1227, {rs164, rs164}; +{ +mul.f16x2 r1225, r142, r1227; +} +{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs165, fd783; +} +mov.b32 r1233, {rs165, rs165}; +{ +mul.f16x2 r1231, r136, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs166, fd692; +} +mov.b32 r1239, {rs166, rs166}; +{ +mul.f16x2 r1237, r139, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs167, fd763; +} +mov.b32 r1245, {rs167, rs167}; +{ +mul.f16x2 r1243, r145, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs168, fd764; +} +mov.b32 r1251, {rs168, rs168}; +{ +mul.f16x2 r1249, r154, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs169, fd763; +} +mov.b32 r1257, {rs169, rs169}; +{ +mul.f16x2 r1255, r148, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs170, fd764; +} +mov.b32 r1263, {rs170, rs170}; +{ +mul.f16x2 r1261, r151, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 rs171, fd739; +} +mov.b32 r1269, {rs171, rs171}; +{ +mul.f16x2 r1267, r157, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs172, fd740; +} +mov.b32 r1275, {rs172, rs172}; +{ +mul.f16x2 r1273, r166, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs173, fd739; +} +mov.b32 r1281, {rs173, rs173}; +{ +mul.f16x2 r1279, r160, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs174, fd740; +} +mov.b32 r1287, {rs174, rs174}; +{ +mul.f16x2 r1285, r163, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +sub.f16x2 %6, r1270, r1276; +} +{ +add.f16x2 %7, r1282, r1288; +} +{ +add.f16x2 %52, r1270, r1276; +} +{ +sub.f16x2 %53, r1282, r1288; +} +cvt.rn.f16.s32 rs175, r4804; +mov.b32 r1315, {rs175, rs175}; +cvt.rn.f16.s32 rs176, r4804; +mov.b32 r1327, {rs176, rs176}; +{ +cvt.rn.f16.f64 rs177, fd759; +} +mov.b32 r1307, {rs177, rs177}; +{ +mul.f16x2 r1305, r1, r1307; +} +{ +add.f16x2 r1308, %58, r1305; +} +{ +cvt.rn.f16.f64 
rs178, fd700; +} +mov.b32 r1313, {rs178, rs178}; +{ +mul.f16x2 r1311, r10, r1313; +} +{ +add.f16x2 r1314, r1315, r1311; +} +{ +cvt.rn.f16.f64 rs179, fd759; +} +mov.b32 r1319, {rs179, rs179}; +{ +mul.f16x2 r1317, r4, r1319; +} +{ +add.f16x2 r1320, %59, r1317; +} +{ +cvt.rn.f16.f64 rs180, fd700; +} +mov.b32 r1325, {rs180, rs180}; +{ +mul.f16x2 r1323, r7, r1325; +} +{ +add.f16x2 r1326, r1327, r1323; +} +{ +cvt.rn.f16.f64 rs181, fd779; +} +mov.b32 r1331, {rs181, rs181}; +{ +mul.f16x2 r1329, r13, r1331; +} +{ +add.f16x2 r1332, r1308, r1329; +} +{ +cvt.rn.f16.f64 rs182, fd780; +} +mov.b32 r1337, {rs182, rs182}; +{ +mul.f16x2 r1335, r22, r1337; +} +{ +add.f16x2 r1338, r1314, r1335; +} +{ +cvt.rn.f16.f64 rs183, fd779; +} +mov.b32 r1343, {rs183, rs183}; +{ +mul.f16x2 r1341, r16, r1343; +} +{ +add.f16x2 r1344, r1320, r1341; +} +{ +cvt.rn.f16.f64 rs184, fd780; +} +mov.b32 r1349, {rs184, rs184}; +{ +mul.f16x2 r1347, r19, r1349; +} +{ +add.f16x2 r1350, r1326, r1347; +} +{ +cvt.rn.f16.f64 rs185, fd747; +} +mov.b32 r1355, {rs185, rs185}; +{ +mul.f16x2 r1353, r25, r1355; +} +{ +add.f16x2 r1356, r1332, r1353; +} +{ +cvt.rn.f16.f64 rs186, fd748; +} +mov.b32 r1361, {rs186, rs186}; +{ +mul.f16x2 r1359, r34, r1361; +} +{ +add.f16x2 r1362, r1338, r1359; +} +{ +cvt.rn.f16.f64 rs187, fd747; +} +mov.b32 r1367, {rs187, rs187}; +{ +mul.f16x2 r1365, r28, r1367; +} +{ +add.f16x2 r1368, r1344, r1365; +} +{ +cvt.rn.f16.f64 rs188, fd748; +} +mov.b32 r1373, {rs188, rs188}; +{ +mul.f16x2 r1371, r31, r1373; +} +{ +add.f16x2 r1374, r1350, r1371; +} +{ +cvt.rn.f16.f64 rs189, fd739; +} +mov.b32 r1379, {rs189, rs189}; +{ +mul.f16x2 r1377, r37, r1379; +} +{ +add.f16x2 r1380, r1356, r1377; +} +{ +cvt.rn.f16.f64 rs190, fd660; +} +mov.b32 r1385, {rs190, rs190}; +{ +mul.f16x2 r1383, r46, r1385; +} +{ +add.f16x2 r1386, r1362, r1383; +} +{ +cvt.rn.f16.f64 rs191, fd739; +} +mov.b32 r1391, {rs191, rs191}; +{ +mul.f16x2 r1389, r40, r1391; +} +{ +add.f16x2 r1392, r1368, r1389; +} +{ +cvt.rn.f16.f64 rs192, fd660; 
+} +mov.b32 r1397, {rs192, rs192}; +{ +mul.f16x2 r1395, r43, r1397; +} +{ +add.f16x2 r1398, r1374, r1395; +} +{ +cvt.rn.f16.f64 rs193, fd771; +} +mov.b32 r1403, {rs193, rs193}; +{ +mul.f16x2 r1401, r49, r1403; +} +{ +add.f16x2 r1404, r1380, r1401; +} +{ +cvt.rn.f16.f64 rs194, fd696; +} +mov.b32 r1409, {rs194, rs194}; +{ +mul.f16x2 r1407, r58, r1409; +} +{ +add.f16x2 r1410, r1386, r1407; +} +{ +cvt.rn.f16.f64 rs195, fd771; +} +mov.b32 r1415, {rs195, rs195}; +{ +mul.f16x2 r1413, r52, r1415; +} +{ +add.f16x2 r1416, r1392, r1413; +} +{ +cvt.rn.f16.f64 rs196, fd696; +} +mov.b32 r1421, {rs196, rs196}; +{ +mul.f16x2 r1419, r55, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs197, fd767; +} +mov.b32 r1427, {rs197, rs197}; +{ +mul.f16x2 r1425, r61, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs198, fd768; +} +mov.b32 r1433, {rs198, rs198}; +{ +mul.f16x2 r1431, r70, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs199, fd767; +} +mov.b32 r1439, {rs199, rs199}; +{ +mul.f16x2 r1437, r64, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs200, fd768; +} +mov.b32 r1445, {rs200, rs200}; +{ +mul.f16x2 r1443, r67, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs201, fd735; +} +mov.b32 r1451, {rs201, rs201}; +{ +mul.f16x2 r1449, r73, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs202, fd736; +} +mov.b32 r1457, {rs202, rs202}; +{ +mul.f16x2 r1455, r82, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs203, fd735; +} +mov.b32 r1463, {rs203, rs203}; +{ +mul.f16x2 r1461, r76, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs204, fd736; +} +mov.b32 r1469, {rs204, rs204}; +{ +mul.f16x2 r1467, r79, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs205, fd751; +} +mov.b32 r1475, {rs205, rs205}; +{ +mul.f16x2 r1473, r85, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs206, fd540; +} +mov.b32 
r1481, {rs206, rs206}; +{ +mul.f16x2 r1479, r94, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs207, fd751; +} +mov.b32 r1487, {rs207, rs207}; +{ +mul.f16x2 r1485, r88, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs208, fd540; +} +mov.b32 r1493, {rs208, rs208}; +{ +mul.f16x2 r1491, r91, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs209, fd783; +} +mov.b32 r1499, {rs209, rs209}; +{ +mul.f16x2 r1497, r97, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs210, fd692; +} +mov.b32 r1505, {rs210, rs210}; +{ +mul.f16x2 r1503, r106, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs211, fd783; +} +mov.b32 r1511, {rs211, rs211}; +{ +mul.f16x2 r1509, r100, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs212, fd692; +} +mov.b32 r1517, {rs212, rs212}; +{ +mul.f16x2 r1515, r103, r1517; +} +{ +add.f16x2 r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs213, fd755; +} +mov.b32 r1523, {rs213, rs213}; +{ +mul.f16x2 r1521, r109, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs214, fd756; +} +mov.b32 r1529, {rs214, rs214}; +{ +mul.f16x2 r1527, r118, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs215, fd755; +} +mov.b32 r1535, {rs215, rs215}; +{ +mul.f16x2 r1533, r112, r1535; +} +{ +add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs216, fd756; +} +mov.b32 r1541, {rs216, rs216}; +{ +mul.f16x2 r1539, r115, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs217, fd731; +} +mov.b32 r1547, {rs217, rs217}; +{ +mul.f16x2 r1545, r121, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs218, fd576; +} +mov.b32 r1553, {rs218, rs218}; +{ +mul.f16x2 r1551, r130, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs219, fd731; +} +mov.b32 r1559, {rs219, rs219}; +{ +mul.f16x2 r1557, r124, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs220, fd576; +} +mov.b32 
r1565, {rs220, rs220}; +{ +mul.f16x2 r1563, r127, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ +cvt.rn.f16.f64 rs221, fd763; +} +mov.b32 r1571, {rs221, rs221}; +{ +mul.f16x2 r1569, r133, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +mov.f64 fd632, 0dBFEA7C6DA34AF89F; +{ +cvt.rn.f16.f64 rs222, fd632; +} +mov.b32 r1577, {rs222, rs222}; +{ +mul.f16x2 r1575, r142, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs223, fd763; +} +mov.b32 r1583, {rs223, rs223}; +{ +mul.f16x2 r1581, r136, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs224, fd632; +} +mov.b32 r1589, {rs224, rs224}; +{ +mul.f16x2 r1587, r139, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs225, fd775; +} +mov.b32 r1595, {rs225, rs225}; +{ +mul.f16x2 r1593, r145, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +mov.f64 fd776, 0dBFEED566CB3DCBA1; +{ +cvt.rn.f16.f64 rs226, fd776; +} +mov.b32 r1601, {rs226, rs226}; +{ +mul.f16x2 r1599, r154, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs227, fd775; +} +mov.b32 r1607, {rs227, rs227}; +{ +mul.f16x2 r1605, r148, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs228, fd776; +} +mov.b32 r1613, {rs228, rs228}; +{ +mul.f16x2 r1611, r151, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs229, fd743; +} +mov.b32 r1619, {rs229, rs229}; +{ +mul.f16x2 r1617, r157, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs230, fd744; +} +mov.b32 r1625, {rs230, rs230}; +{ +mul.f16x2 r1623, r166, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs231, fd743; +} +mov.b32 r1631, {rs231, rs231}; +{ +mul.f16x2 r1629, r160, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs232, fd744; +} +mov.b32 r1637, {rs232, rs232}; +{ +mul.f16x2 r1635, r163, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +sub.f16x2 %8, r1620, r1626; +} +{ +add.f16x2 %9, r1632, r1638; +} +{ +add.f16x2 %50, r1620, r1626; +} +{ 
+sub.f16x2 %51, r1632, r1638; +} +cvt.rn.f16.s32 rs233, r4804; +mov.b32 r1665, {rs233, rs233}; +cvt.rn.f16.s32 rs234, r4804; +mov.b32 r1677, {rs234, rs234}; +{ +cvt.rn.f16.f64 rs235, fd767; +} +mov.b32 r1657, {rs235, rs235}; +{ +mul.f16x2 r1655, r1, r1657; +} +{ +add.f16x2 r1658, %58, r1655; +} +{ +cvt.rn.f16.f64 rs236, fd604; +} +mov.b32 r1663, {rs236, rs236}; +{ +mul.f16x2 r1661, r10, r1663; +} +{ +add.f16x2 r1664, r1665, r1661; +} +{ +cvt.rn.f16.f64 rs237, fd767; +} +mov.b32 r1669, {rs237, rs237}; +{ +mul.f16x2 r1667, r4, r1669; +} +{ +add.f16x2 r1670, %59, r1667; +} +{ +cvt.rn.f16.f64 rs238, fd604; +} +mov.b32 r1675, {rs238, rs238}; +{ +mul.f16x2 r1673, r7, r1675; +} +{ +add.f16x2 r1676, r1677, r1673; +} +{ +cvt.rn.f16.f64 rs239, fd763; +} +mov.b32 r1681, {rs239, rs239}; +{ +mul.f16x2 r1679, r13, r1681; +} +{ +add.f16x2 r1682, r1658, r1679; +} +{ +cvt.rn.f16.f64 rs240, fd764; +} +mov.b32 r1687, {rs240, rs240}; +{ +mul.f16x2 r1685, r22, r1687; +} +{ +add.f16x2 r1688, r1664, r1685; +} +{ +cvt.rn.f16.f64 rs241, fd763; +} +mov.b32 r1693, {rs241, rs241}; +{ +mul.f16x2 r1691, r16, r1693; +} +{ +add.f16x2 r1694, r1670, r1691; +} +{ +cvt.rn.f16.f64 rs242, fd764; +} +mov.b32 r1699, {rs242, rs242}; +{ +mul.f16x2 r1697, r19, r1699; +} +{ +add.f16x2 r1700, r1676, r1697; +} +{ +cvt.rn.f16.f64 rs243, fd731; +} +mov.b32 r1705, {rs243, rs243}; +{ +mul.f16x2 r1703, r25, r1705; +} +{ +add.f16x2 r1706, r1682, r1703; +} +{ +cvt.rn.f16.f64 rs244, fd576; +} +mov.b32 r1711, {rs244, rs244}; +{ +mul.f16x2 r1709, r34, r1711; +} +{ +add.f16x2 r1712, r1688, r1709; +} +{ +cvt.rn.f16.f64 rs245, fd731; +} +mov.b32 r1717, {rs245, rs245}; +{ +mul.f16x2 r1715, r28, r1717; +} +{ +add.f16x2 r1718, r1694, r1715; +} +{ +cvt.rn.f16.f64 rs246, fd576; +} +mov.b32 r1723, {rs246, rs246}; +{ +mul.f16x2 r1721, r31, r1723; +} +{ +add.f16x2 r1724, r1700, r1721; +} +{ +cvt.rn.f16.f64 rs247, fd771; +} +mov.b32 r1729, {rs247, rs247}; +{ +mul.f16x2 r1727, r37, r1729; +} +{ +add.f16x2 r1730, r1706, r1727; +} +{ 
+cvt.rn.f16.f64 rs248, fd696; +} +mov.b32 r1735, {rs248, rs248}; +{ +mul.f16x2 r1733, r46, r1735; +} +{ +add.f16x2 r1736, r1712, r1733; +} +{ +cvt.rn.f16.f64 rs249, fd771; +} +mov.b32 r1741, {rs249, rs249}; +{ +mul.f16x2 r1739, r40, r1741; +} +{ +add.f16x2 r1742, r1718, r1739; +} +{ +cvt.rn.f16.f64 rs250, fd696; +} +mov.b32 r1747, {rs250, rs250}; +{ +mul.f16x2 r1745, r43, r1747; +} +{ +add.f16x2 r1748, r1724, r1745; +} +{ +cvt.rn.f16.f64 rs251, fd759; +} +mov.b32 r1753, {rs251, rs251}; +{ +mul.f16x2 r1751, r49, r1753; +} +{ +add.f16x2 r1754, r1730, r1751; +} +mov.f64 fd760, 0dBFE863A1ADA0CFA6; +{ +cvt.rn.f16.f64 rs252, fd760; +} +mov.b32 r1759, {rs252, rs252}; +{ +mul.f16x2 r1757, r58, r1759; +} +{ +add.f16x2 r1760, r1736, r1757; +} +{ +cvt.rn.f16.f64 rs253, fd759; +} +mov.b32 r1765, {rs253, rs253}; +{ +mul.f16x2 r1763, r52, r1765; +} +{ +add.f16x2 r1766, r1742, r1763; +} +{ +cvt.rn.f16.f64 rs254, fd760; +} +mov.b32 r1771, {rs254, rs254}; +{ +mul.f16x2 r1769, r55, r1771; +} +{ +add.f16x2 r1772, r1748, r1769; +} +{ +cvt.rn.f16.f64 rs255, fd735; +} +mov.b32 r1777, {rs255, rs255}; +{ +mul.f16x2 r1775, r61, r1777; +} +{ +add.f16x2 r1778, r1754, r1775; +} +{ +cvt.rn.f16.f64 rs256, fd708; +} +mov.b32 r1783, {rs256, rs256}; +{ +mul.f16x2 r1781, r70, r1783; +} +{ +add.f16x2 r1784, r1760, r1781; +} +{ +cvt.rn.f16.f64 rs257, fd735; +} +mov.b32 r1789, {rs257, rs257}; +{ +mul.f16x2 r1787, r64, r1789; +} +{ +add.f16x2 r1790, r1766, r1787; +} +{ +cvt.rn.f16.f64 rs258, fd708; +} +mov.b32 r1795, {rs258, rs258}; +{ +mul.f16x2 r1793, r67, r1795; +} +{ +add.f16x2 r1796, r1772, r1793; +} +{ +cvt.rn.f16.f64 rs259, fd775; +} +mov.b32 r1801, {rs259, rs259}; +{ +mul.f16x2 r1799, r73, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs260, fd424; +} +mov.b32 r1807, {rs260, rs260}; +{ +mul.f16x2 r1805, r82, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs261, fd775; +} +mov.b32 r1813, {rs261, rs261}; +{ +mul.f16x2 r1811, r76, r1813; +} +{ +add.f16x2 
r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs262, fd424; +} +mov.b32 r1819, {rs262, rs262}; +{ +mul.f16x2 r1817, r79, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ +cvt.rn.f16.f64 rs263, fd755; +} +mov.b32 r1825, {rs263, rs263}; +{ +mul.f16x2 r1823, r85, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs264, fd756; +} +mov.b32 r1831, {rs264, rs264}; +{ +mul.f16x2 r1829, r94, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs265, fd755; +} +mov.b32 r1837, {rs265, rs265}; +{ +mul.f16x2 r1835, r88, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs266, fd756; +} +mov.b32 r1843, {rs266, rs266}; +{ +mul.f16x2 r1841, r91, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs267, fd739; +} +mov.b32 r1849, {rs267, rs267}; +{ +mul.f16x2 r1847, r97, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs268, fd660; +} +mov.b32 r1855, {rs268, rs268}; +{ +mul.f16x2 r1853, r106, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs269, fd739; +} +mov.b32 r1861, {rs269, rs269}; +{ +mul.f16x2 r1859, r100, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs270, fd660; +} +mov.b32 r1867, {rs270, rs270}; +{ +mul.f16x2 r1865, r103, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs271, fd779; +} +mov.b32 r1873, {rs271, rs271}; +{ +mul.f16x2 r1871, r109, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs272, fd652; +} +mov.b32 r1879, {rs272, rs272}; +{ +mul.f16x2 r1877, r118, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs273, fd779; +} +mov.b32 r1885, {rs273, rs273}; +{ +mul.f16x2 r1883, r112, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs274, fd652; +} +mov.b32 r1891, {rs274, rs274}; +{ +mul.f16x2 r1889, r115, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs275, fd751; +} +mov.b32 r1897, {rs275, rs275}; +{ +mul.f16x2 r1895, r121, r1897; +} +{ +add.f16x2 
r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs276, fd752; +} +mov.b32 r1903, {rs276, rs276}; +{ +mul.f16x2 r1901, r130, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs277, fd751; +} +mov.b32 r1909, {rs277, rs277}; +{ +mul.f16x2 r1907, r124, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs278, fd752; +} +mov.b32 r1915, {rs278, rs278}; +{ +mul.f16x2 r1913, r127, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs279, fd743; +} +mov.b32 r1921, {rs279, rs279}; +{ +mul.f16x2 r1919, r133, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs280, fd636; +} +mov.b32 r1927, {rs280, rs280}; +{ +mul.f16x2 r1925, r142, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs281, fd743; +} +mov.b32 r1933, {rs281, rs281}; +{ +mul.f16x2 r1931, r136, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs282, fd636; +} +mov.b32 r1939, {rs282, rs282}; +{ +mul.f16x2 r1937, r139, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs283, fd783; +} +mov.b32 r1945, {rs283, rs283}; +{ +mul.f16x2 r1943, r145, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs284, fd692; +} +mov.b32 r1951, {rs284, rs284}; +{ +mul.f16x2 r1949, r154, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs285, fd783; +} +mov.b32 r1957, {rs285, rs285}; +{ +mul.f16x2 r1955, r148, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs286, fd692; +} +mov.b32 r1963, {rs286, rs286}; +{ +mul.f16x2 r1961, r151, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs287, fd747; +} +mov.b32 r1969, {rs287, rs287}; +{ +mul.f16x2 r1967, r157, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs288, fd748; +} +mov.b32 r1975, {rs288, rs288}; +{ +mul.f16x2 r1973, r166, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs289, fd747; +} +mov.b32 r1981, {rs289, rs289}; +{ +mul.f16x2 r1979, r160, r1981; +} +{ 
+add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs290, fd748; +} +mov.b32 r1987, {rs290, rs290}; +{ +mul.f16x2 r1985, r163, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +{ +sub.f16x2 %10, r1970, r1976; +} +{ +add.f16x2 %11, r1982, r1988; +} +{ +add.f16x2 %48, r1970, r1976; +} +{ +sub.f16x2 %49, r1982, r1988; +} +cvt.rn.f16.s32 rs291, r4804; +mov.b32 r2015, {rs291, rs291}; +cvt.rn.f16.s32 rs292, r4804; +mov.b32 r2027, {rs292, rs292}; +{ +cvt.rn.f16.f64 rs293, fd775; +} +mov.b32 r2007, {rs293, rs293}; +{ +mul.f16x2 r2005, r1, r2007; +} +{ +add.f16x2 r2008, %58, r2005; +} +{ +cvt.rn.f16.f64 rs294, fd424; +} +mov.b32 r2013, {rs294, rs294}; +{ +mul.f16x2 r2011, r10, r2013; +} +{ +add.f16x2 r2014, r2015, r2011; +} +{ +cvt.rn.f16.f64 rs295, fd775; +} +mov.b32 r2019, {rs295, rs295}; +{ +mul.f16x2 r2017, r4, r2019; +} +{ +add.f16x2 r2020, %59, r2017; +} +{ +cvt.rn.f16.f64 rs296, fd424; +} +mov.b32 r2025, {rs296, rs296}; +{ +mul.f16x2 r2023, r7, r2025; +} +{ +add.f16x2 r2026, r2027, r2023; +} +{ +cvt.rn.f16.f64 rs297, fd747; +} +mov.b32 r2031, {rs297, rs297}; +{ +mul.f16x2 r2029, r13, r2031; +} +{ +add.f16x2 r2032, r2008, r2029; +} +{ +cvt.rn.f16.f64 rs298, fd748; +} +mov.b32 r2037, {rs298, rs298}; +{ +mul.f16x2 r2035, r22, r2037; +} +{ +add.f16x2 r2038, r2014, r2035; +} +{ +cvt.rn.f16.f64 rs299, fd747; +} +mov.b32 r2043, {rs299, rs299}; +{ +mul.f16x2 r2041, r16, r2043; +} +{ +add.f16x2 r2044, r2020, r2041; +} +{ +cvt.rn.f16.f64 rs300, fd748; +} +mov.b32 r2049, {rs300, rs300}; +{ +mul.f16x2 r2047, r19, r2049; +} +{ +add.f16x2 r2050, r2026, r2047; +} +{ +cvt.rn.f16.f64 rs301, fd755; +} +mov.b32 r2055, {rs301, rs301}; +{ +mul.f16x2 r2053, r25, r2055; +} +{ +add.f16x2 r2056, r2032, r2053; +} +{ +cvt.rn.f16.f64 rs302, fd456; +} +mov.b32 r2061, {rs302, rs302}; +{ +mul.f16x2 r2059, r34, r2061; +} +{ +add.f16x2 r2062, r2038, r2059; +} +{ +cvt.rn.f16.f64 rs303, fd755; +} +mov.b32 r2067, {rs303, rs303}; +{ +mul.f16x2 r2065, r28, r2067; +} +{ +add.f16x2 r2068, r2044, r2065; 
+} +{ +cvt.rn.f16.f64 rs304, fd456; +} +mov.b32 r2073, {rs304, rs304}; +{ +mul.f16x2 r2071, r31, r2073; +} +{ +add.f16x2 r2074, r2050, r2071; +} +{ +cvt.rn.f16.f64 rs305, fd767; +} +mov.b32 r2079, {rs305, rs305}; +{ +mul.f16x2 r2077, r37, r2079; +} +{ +add.f16x2 r2080, r2056, r2077; +} +{ +cvt.rn.f16.f64 rs306, fd768; +} +mov.b32 r2085, {rs306, rs306}; +{ +mul.f16x2 r2083, r46, r2085; +} +{ +add.f16x2 r2086, r2062, r2083; +} +{ +cvt.rn.f16.f64 rs307, fd767; +} +mov.b32 r2091, {rs307, rs307}; +{ +mul.f16x2 r2089, r40, r2091; +} +{ +add.f16x2 r2092, r2068, r2089; +} +{ +cvt.rn.f16.f64 rs308, fd768; +} +mov.b32 r2097, {rs308, rs308}; +{ +mul.f16x2 r2095, r43, r2097; +} +{ +add.f16x2 r2098, r2074, r2095; +} +{ +cvt.rn.f16.f64 rs309, fd735; +} +mov.b32 r2103, {rs309, rs309}; +{ +mul.f16x2 r2101, r49, r2103; +} +{ +add.f16x2 r2104, r2080, r2101; +} +{ +cvt.rn.f16.f64 rs310, fd708; +} +mov.b32 r2109, {rs310, rs310}; +{ +mul.f16x2 r2107, r58, r2109; +} +{ +add.f16x2 r2110, r2086, r2107; +} +{ +cvt.rn.f16.f64 rs311, fd735; +} +mov.b32 r2115, {rs311, rs311}; +{ +mul.f16x2 r2113, r52, r2115; +} +{ +add.f16x2 r2116, r2092, r2113; +} +{ +cvt.rn.f16.f64 rs312, fd708; +} +mov.b32 r2121, {rs312, rs312}; +{ +mul.f16x2 r2119, r55, r2121; +} +{ +add.f16x2 r2122, r2098, r2119; +} +{ +cvt.rn.f16.f64 rs313, fd783; +} +mov.b32 r2127, {rs313, rs313}; +{ +mul.f16x2 r2125, r61, r2127; +} +{ +add.f16x2 r2128, r2104, r2125; +} +{ +cvt.rn.f16.f64 rs314, fd692; +} +mov.b32 r2133, {rs314, rs314}; +{ +mul.f16x2 r2131, r70, r2133; +} +{ +add.f16x2 r2134, r2110, r2131; +} +{ +cvt.rn.f16.f64 rs315, fd783; +} +mov.b32 r2139, {rs315, rs315}; +{ +mul.f16x2 r2137, r64, r2139; +} +{ +add.f16x2 r2140, r2116, r2137; +} +{ +cvt.rn.f16.f64 rs316, fd692; +} +mov.b32 r2145, {rs316, rs316}; +{ +mul.f16x2 r2143, r67, r2145; +} +{ +add.f16x2 r2146, r2122, r2143; +} +{ +cvt.rn.f16.f64 rs317, fd739; +} +mov.b32 r2151, {rs317, rs317}; +{ +mul.f16x2 r2149, r73, r2151; +} +{ +add.f16x2 r2152, r2128, r2149; +} +{ 
+cvt.rn.f16.f64 rs318, fd740; +} +mov.b32 r2157, {rs318, rs318}; +{ +mul.f16x2 r2155, r82, r2157; +} +{ +add.f16x2 r2158, r2134, r2155; +} +{ +cvt.rn.f16.f64 rs319, fd739; +} +mov.b32 r2163, {rs319, rs319}; +{ +mul.f16x2 r2161, r76, r2163; +} +{ +add.f16x2 r2164, r2140, r2161; +} +{ +cvt.rn.f16.f64 rs320, fd740; +} +mov.b32 r2169, {rs320, rs320}; +{ +mul.f16x2 r2167, r79, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs321, fd763; +} +mov.b32 r2175, {rs321, rs321}; +{ +mul.f16x2 r2173, r85, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs322, fd632; +} +mov.b32 r2181, {rs322, rs322}; +{ +mul.f16x2 r2179, r94, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs323, fd763; +} +mov.b32 r2187, {rs323, rs323}; +{ +mul.f16x2 r2185, r88, r2187; +} +{ +add.f16x2 r2188, r2164, r2185; +} +{ +cvt.rn.f16.f64 rs324, fd632; +} +mov.b32 r2193, {rs324, rs324}; +{ +mul.f16x2 r2191, r91, r2193; +} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs325, fd759; +} +mov.b32 r2199, {rs325, rs325}; +{ +mul.f16x2 r2197, r97, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs326, fd760; +} +mov.b32 r2205, {rs326, rs326}; +{ +mul.f16x2 r2203, r106, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs327, fd759; +} +mov.b32 r2211, {rs327, rs327}; +{ +mul.f16x2 r2209, r100, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs328, fd760; +} +mov.b32 r2217, {rs328, rs328}; +{ +mul.f16x2 r2215, r103, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +cvt.rn.f16.f64 rs329, fd743; +} +mov.b32 r2223, {rs329, rs329}; +{ +mul.f16x2 r2221, r109, r2223; +} +{ +add.f16x2 r2224, r2200, r2221; +} +{ +cvt.rn.f16.f64 rs330, fd636; +} +mov.b32 r2229, {rs330, rs330}; +{ +mul.f16x2 r2227, r118, r2229; +} +{ +add.f16x2 r2230, r2206, r2227; +} +{ +cvt.rn.f16.f64 rs331, fd743; +} +mov.b32 r2235, {rs331, rs331}; +{ +mul.f16x2 r2233, r112, r2235; +} +{ +add.f16x2 r2236, r2212, r2233; +} +{ 
+cvt.rn.f16.f64 rs332, fd636; +} +mov.b32 r2241, {rs332, rs332}; +{ +mul.f16x2 r2239, r115, r2241; +} +{ +add.f16x2 r2242, r2218, r2239; +} +{ +cvt.rn.f16.f64 rs333, fd779; +} +mov.b32 r2247, {rs333, rs333}; +{ +mul.f16x2 r2245, r121, r2247; +} +{ +add.f16x2 r2248, r2224, r2245; +} +{ +cvt.rn.f16.f64 rs334, fd780; +} +mov.b32 r2253, {rs334, rs334}; +{ +mul.f16x2 r2251, r130, r2253; +} +{ +add.f16x2 r2254, r2230, r2251; +} +{ +cvt.rn.f16.f64 rs335, fd779; +} +mov.b32 r2259, {rs335, rs335}; +{ +mul.f16x2 r2257, r124, r2259; +} +{ +add.f16x2 r2260, r2236, r2257; +} +{ +cvt.rn.f16.f64 rs336, fd780; +} +mov.b32 r2265, {rs336, rs336}; +{ +mul.f16x2 r2263, r127, r2265; +} +{ +add.f16x2 r2266, r2242, r2263; +} +{ +cvt.rn.f16.f64 rs337, fd731; +} +mov.b32 r2271, {rs337, rs337}; +{ +mul.f16x2 r2269, r133, r2271; +} +{ +add.f16x2 r2272, r2248, r2269; +} +{ +cvt.rn.f16.f64 rs338, fd732; +} +mov.b32 r2277, {rs338, rs338}; +{ +mul.f16x2 r2275, r142, r2277; +} +{ +add.f16x2 r2278, r2254, r2275; +} +{ +cvt.rn.f16.f64 rs339, fd731; +} +mov.b32 r2283, {rs339, rs339}; +{ +mul.f16x2 r2281, r136, r2283; +} +{ +add.f16x2 r2284, r2260, r2281; +} +{ +cvt.rn.f16.f64 rs340, fd732; +} +mov.b32 r2289, {rs340, rs340}; +{ +mul.f16x2 r2287, r139, r2289; +} +{ +add.f16x2 r2290, r2266, r2287; +} +{ +cvt.rn.f16.f64 rs341, fd771; +} +mov.b32 r2295, {rs341, rs341}; +{ +mul.f16x2 r2293, r145, r2295; +} +{ +add.f16x2 r2296, r2272, r2293; +} +{ +cvt.rn.f16.f64 rs342, fd696; +} +mov.b32 r2301, {rs342, rs342}; +{ +mul.f16x2 r2299, r154, r2301; +} +{ +add.f16x2 r2302, r2278, r2299; +} +{ +cvt.rn.f16.f64 rs343, fd771; +} +mov.b32 r2307, {rs343, rs343}; +{ +mul.f16x2 r2305, r148, r2307; +} +{ +add.f16x2 r2308, r2284, r2305; +} +{ +cvt.rn.f16.f64 rs344, fd696; +} +mov.b32 r2313, {rs344, rs344}; +{ +mul.f16x2 r2311, r151, r2313; +} +{ +add.f16x2 r2314, r2290, r2311; +} +{ +cvt.rn.f16.f64 rs345, fd751; +} +mov.b32 r2319, {rs345, rs345}; +{ +mul.f16x2 r2317, r157, r2319; +} +{ +add.f16x2 r2320, r2296, r2317; +} 
+{ +cvt.rn.f16.f64 rs346, fd752; +} +mov.b32 r2325, {rs346, rs346}; +{ +mul.f16x2 r2323, r166, r2325; +} +{ +add.f16x2 r2326, r2302, r2323; +} +{ +cvt.rn.f16.f64 rs347, fd751; +} +mov.b32 r2331, {rs347, rs347}; +{ +mul.f16x2 r2329, r160, r2331; +} +{ +add.f16x2 r2332, r2308, r2329; +} +{ +cvt.rn.f16.f64 rs348, fd752; +} +mov.b32 r2337, {rs348, rs348}; +{ +mul.f16x2 r2335, r163, r2337; +} +{ +add.f16x2 r2338, r2314, r2335; +} +{ +sub.f16x2 %12, r2320, r2326; +} +{ +add.f16x2 %13, r2332, r2338; +} +{ +add.f16x2 %46, r2320, r2326; +} +{ +sub.f16x2 %47, r2332, r2338; +} +cvt.rn.f16.s32 rs349, r4804; +mov.b32 r2365, {rs349, rs349}; +cvt.rn.f16.s32 rs350, r4804; +mov.b32 r2377, {rs350, rs350}; +{ +cvt.rn.f16.f64 rs351, fd783; +} +mov.b32 r2357, {rs351, rs351}; +{ +mul.f16x2 r2355, r1, r2357; +} +{ +add.f16x2 r2358, %58, r2355; +} +{ +cvt.rn.f16.f64 rs352, fd692; +} +mov.b32 r2363, {rs352, rs352}; +{ +mul.f16x2 r2361, r10, r2363; +} +{ +add.f16x2 r2364, r2365, r2361; +} +{ +cvt.rn.f16.f64 rs353, fd783; +} +mov.b32 r2369, {rs353, rs353}; +{ +mul.f16x2 r2367, r4, r2369; +} +{ +add.f16x2 r2370, %59, r2367; +} +{ +cvt.rn.f16.f64 rs354, fd692; +} +mov.b32 r2375, {rs354, rs354}; +{ +mul.f16x2 r2373, r7, r2375; +} +{ +add.f16x2 r2376, r2377, r2373; +} +{ +cvt.rn.f16.f64 rs355, fd731; +} +mov.b32 r2381, {rs355, rs355}; +{ +mul.f16x2 r2379, r13, r2381; +} +{ +add.f16x2 r2382, r2358, r2379; +} +{ +cvt.rn.f16.f64 rs356, fd732; +} +mov.b32 r2387, {rs356, rs356}; +{ +mul.f16x2 r2385, r22, r2387; +} +{ +add.f16x2 r2388, r2364, r2385; +} +{ +cvt.rn.f16.f64 rs357, fd731; +} +mov.b32 r2393, {rs357, rs357}; +{ +mul.f16x2 r2391, r16, r2393; +} +{ +add.f16x2 r2394, r2370, r2391; +} +{ +cvt.rn.f16.f64 rs358, fd732; +} +mov.b32 r2399, {rs358, rs358}; +{ +mul.f16x2 r2397, r19, r2399; +} +{ +add.f16x2 r2400, r2376, r2397; +} +{ +cvt.rn.f16.f64 rs359, fd779; +} +mov.b32 r2405, {rs359, rs359}; +{ +mul.f16x2 r2403, r25, r2405; +} +{ +add.f16x2 r2406, r2382, r2403; +} +{ +cvt.rn.f16.f64 rs360, 
fd652; +} +mov.b32 r2411, {rs360, rs360}; +{ +mul.f16x2 r2409, r34, r2411; +} +{ +add.f16x2 r2412, r2388, r2409; +} +{ +cvt.rn.f16.f64 rs361, fd779; +} +mov.b32 r2417, {rs361, rs361}; +{ +mul.f16x2 r2415, r28, r2417; +} +{ +add.f16x2 r2418, r2394, r2415; +} +{ +cvt.rn.f16.f64 rs362, fd652; +} +mov.b32 r2423, {rs362, rs362}; +{ +mul.f16x2 r2421, r31, r2423; +} +{ +add.f16x2 r2424, r2400, r2421; +} +{ +cvt.rn.f16.f64 rs363, fd735; +} +mov.b32 r2429, {rs363, rs363}; +{ +mul.f16x2 r2427, r37, r2429; +} +{ +add.f16x2 r2430, r2406, r2427; +} +{ +cvt.rn.f16.f64 rs364, fd736; +} +mov.b32 r2435, {rs364, rs364}; +{ +mul.f16x2 r2433, r46, r2435; +} +{ +add.f16x2 r2436, r2412, r2433; +} +{ +cvt.rn.f16.f64 rs365, fd735; +} +mov.b32 r2441, {rs365, rs365}; +{ +mul.f16x2 r2439, r40, r2441; +} +{ +add.f16x2 r2442, r2418, r2439; +} +{ +cvt.rn.f16.f64 rs366, fd736; +} +mov.b32 r2447, {rs366, rs366}; +{ +mul.f16x2 r2445, r43, r2447; +} +{ +add.f16x2 r2448, r2424, r2445; +} +{ +cvt.rn.f16.f64 rs367, fd775; +} +mov.b32 r2453, {rs367, rs367}; +{ +mul.f16x2 r2451, r49, r2453; +} +{ +add.f16x2 r2454, r2430, r2451; +} +{ +cvt.rn.f16.f64 rs368, fd424; +} +mov.b32 r2459, {rs368, rs368}; +{ +mul.f16x2 r2457, r58, r2459; +} +{ +add.f16x2 r2460, r2436, r2457; +} +{ +cvt.rn.f16.f64 rs369, fd775; +} +mov.b32 r2465, {rs369, rs369}; +{ +mul.f16x2 r2463, r52, r2465; +} +{ +add.f16x2 r2466, r2442, r2463; +} +{ +cvt.rn.f16.f64 rs370, fd424; +} +mov.b32 r2471, {rs370, rs370}; +{ +mul.f16x2 r2469, r55, r2471; +} +{ +add.f16x2 r2472, r2448, r2469; +} +{ +cvt.rn.f16.f64 rs371, fd739; +} +mov.b32 r2477, {rs371, rs371}; +{ +mul.f16x2 r2475, r61, r2477; +} +{ +add.f16x2 r2478, r2454, r2475; +} +{ +cvt.rn.f16.f64 rs372, fd740; +} +mov.b32 r2483, {rs372, rs372}; +{ +mul.f16x2 r2481, r70, r2483; +} +{ +add.f16x2 r2484, r2460, r2481; +} +{ +cvt.rn.f16.f64 rs373, fd739; +} +mov.b32 r2489, {rs373, rs373}; +{ +mul.f16x2 r2487, r64, r2489; +} +{ +add.f16x2 r2490, r2466, r2487; +} +{ +cvt.rn.f16.f64 rs374, fd740; +} 
+mov.b32 r2495, {rs374, rs374}; +{ +mul.f16x2 r2493, r67, r2495; +} +{ +add.f16x2 r2496, r2472, r2493; +} +{ +cvt.rn.f16.f64 rs375, fd771; +} +mov.b32 r2501, {rs375, rs375}; +{ +mul.f16x2 r2499, r73, r2501; +} +{ +add.f16x2 r2502, r2478, r2499; +} +{ +cvt.rn.f16.f64 rs376, fd696; +} +mov.b32 r2507, {rs376, rs376}; +{ +mul.f16x2 r2505, r82, r2507; +} +{ +add.f16x2 r2508, r2484, r2505; +} +{ +cvt.rn.f16.f64 rs377, fd771; +} +mov.b32 r2513, {rs377, rs377}; +{ +mul.f16x2 r2511, r76, r2513; +} +{ +add.f16x2 r2514, r2490, r2511; +} +{ +cvt.rn.f16.f64 rs378, fd696; +} +mov.b32 r2519, {rs378, rs378}; +{ +mul.f16x2 r2517, r79, r2519; +} +{ +add.f16x2 r2520, r2496, r2517; +} +{ +cvt.rn.f16.f64 rs379, fd743; +} +mov.b32 r2525, {rs379, rs379}; +{ +mul.f16x2 r2523, r85, r2525; +} +{ +add.f16x2 r2526, r2502, r2523; +} +{ +cvt.rn.f16.f64 rs380, fd744; +} +mov.b32 r2531, {rs380, rs380}; +{ +mul.f16x2 r2529, r94, r2531; +} +{ +add.f16x2 r2532, r2508, r2529; +} +{ +cvt.rn.f16.f64 rs381, fd743; +} +mov.b32 r2537, {rs381, rs381}; +{ +mul.f16x2 r2535, r88, r2537; +} +{ +add.f16x2 r2538, r2514, r2535; +} +{ +cvt.rn.f16.f64 rs382, fd744; +} +mov.b32 r2543, {rs382, rs382}; +{ +mul.f16x2 r2541, r91, r2543; +} +{ +add.f16x2 r2544, r2520, r2541; +} +{ +cvt.rn.f16.f64 rs383, fd767; +} +mov.b32 r2549, {rs383, rs383}; +{ +mul.f16x2 r2547, r97, r2549; +} +{ +add.f16x2 r2550, r2526, r2547; +} +{ +cvt.rn.f16.f64 rs384, fd604; +} +mov.b32 r2555, {rs384, rs384}; +{ +mul.f16x2 r2553, r106, r2555; +} +{ +add.f16x2 r2556, r2532, r2553; +} +{ +cvt.rn.f16.f64 rs385, fd767; +} +mov.b32 r2561, {rs385, rs385}; +{ +mul.f16x2 r2559, r100, r2561; +} +{ +add.f16x2 r2562, r2538, r2559; +} +{ +cvt.rn.f16.f64 rs386, fd604; +} +mov.b32 r2567, {rs386, rs386}; +{ +mul.f16x2 r2565, r103, r2567; +} +{ +add.f16x2 r2568, r2544, r2565; +} +{ +cvt.rn.f16.f64 rs387, fd747; +} +mov.b32 r2573, {rs387, rs387}; +{ +mul.f16x2 r2571, r109, r2573; +} +{ +add.f16x2 r2574, r2550, r2571; +} +{ +cvt.rn.f16.f64 rs388, fd748; +} 
+mov.b32 r2579, {rs388, rs388}; +{ +mul.f16x2 r2577, r118, r2579; +} +{ +add.f16x2 r2580, r2556, r2577; +} +{ +cvt.rn.f16.f64 rs389, fd747; +} +mov.b32 r2585, {rs389, rs389}; +{ +mul.f16x2 r2583, r112, r2585; +} +{ +add.f16x2 r2586, r2562, r2583; +} +{ +cvt.rn.f16.f64 rs390, fd748; +} +mov.b32 r2591, {rs390, rs390}; +{ +mul.f16x2 r2589, r115, r2591; +} +{ +add.f16x2 r2592, r2568, r2589; +} +{ +cvt.rn.f16.f64 rs391, fd763; +} +mov.b32 r2597, {rs391, rs391}; +{ +mul.f16x2 r2595, r121, r2597; +} +{ +add.f16x2 r2598, r2574, r2595; +} +{ +cvt.rn.f16.f64 rs392, fd632; +} +mov.b32 r2603, {rs392, rs392}; +{ +mul.f16x2 r2601, r130, r2603; +} +{ +add.f16x2 r2604, r2580, r2601; +} +{ +cvt.rn.f16.f64 rs393, fd763; +} +mov.b32 r2609, {rs393, rs393}; +{ +mul.f16x2 r2607, r124, r2609; +} +{ +add.f16x2 r2610, r2586, r2607; +} +{ +cvt.rn.f16.f64 rs394, fd632; +} +mov.b32 r2615, {rs394, rs394}; +{ +mul.f16x2 r2613, r127, r2615; +} +{ +add.f16x2 r2616, r2592, r2613; +} +{ +cvt.rn.f16.f64 rs395, fd751; +} +mov.b32 r2621, {rs395, rs395}; +{ +mul.f16x2 r2619, r133, r2621; +} +{ +add.f16x2 r2622, r2598, r2619; +} +{ +cvt.rn.f16.f64 rs396, fd752; +} +mov.b32 r2627, {rs396, rs396}; +{ +mul.f16x2 r2625, r142, r2627; +} +{ +add.f16x2 r2628, r2604, r2625; +} +{ +cvt.rn.f16.f64 rs397, fd751; +} +mov.b32 r2633, {rs397, rs397}; +{ +mul.f16x2 r2631, r136, r2633; +} +{ +add.f16x2 r2634, r2610, r2631; +} +{ +cvt.rn.f16.f64 rs398, fd752; +} +mov.b32 r2639, {rs398, rs398}; +{ +mul.f16x2 r2637, r139, r2639; +} +{ +add.f16x2 r2640, r2616, r2637; +} +{ +cvt.rn.f16.f64 rs399, fd759; +} +mov.b32 r2645, {rs399, rs399}; +{ +mul.f16x2 r2643, r145, r2645; +} +{ +add.f16x2 r2646, r2622, r2643; +} +{ +cvt.rn.f16.f64 rs400, fd700; +} +mov.b32 r2651, {rs400, rs400}; +{ +mul.f16x2 r2649, r154, r2651; +} +{ +add.f16x2 r2652, r2628, r2649; +} +{ +cvt.rn.f16.f64 rs401, fd759; +} +mov.b32 r2657, {rs401, rs401}; +{ +mul.f16x2 r2655, r148, r2657; +} +{ +add.f16x2 r2658, r2634, r2655; +} +{ +cvt.rn.f16.f64 rs402, fd700; 
+} +mov.b32 r2663, {rs402, rs402}; +{ +mul.f16x2 r2661, r151, r2663; +} +{ +add.f16x2 r2664, r2640, r2661; +} +{ +cvt.rn.f16.f64 rs403, fd755; +} +mov.b32 r2669, {rs403, rs403}; +{ +mul.f16x2 r2667, r157, r2669; +} +{ +add.f16x2 r2670, r2646, r2667; +} +{ +cvt.rn.f16.f64 rs404, fd756; +} +mov.b32 r2675, {rs404, rs404}; +{ +mul.f16x2 r2673, r166, r2675; +} +{ +add.f16x2 r2676, r2652, r2673; +} +{ +cvt.rn.f16.f64 rs405, fd755; +} +mov.b32 r2681, {rs405, rs405}; +{ +mul.f16x2 r2679, r160, r2681; +} +{ +add.f16x2 r2682, r2658, r2679; +} +{ +cvt.rn.f16.f64 rs406, fd756; +} +mov.b32 r2687, {rs406, rs406}; +{ +mul.f16x2 r2685, r163, r2687; +} +{ +add.f16x2 r2688, r2664, r2685; +} +{ +sub.f16x2 %14, r2670, r2676; +} +{ +add.f16x2 %15, r2682, r2688; +} +{ +add.f16x2 %44, r2670, r2676; +} +{ +sub.f16x2 %45, r2682, r2688; +} +cvt.rn.f16.s32 rs407, r4804; +mov.b32 r2715, {rs407, rs407}; +cvt.rn.f16.s32 rs408, r4804; +mov.b32 r2727, {rs408, rs408}; +{ +cvt.rn.f16.f64 rs409, fd779; +} +mov.b32 r2707, {rs409, rs409}; +{ +mul.f16x2 r2705, r1, r2707; +} +{ +add.f16x2 r2708, %58, r2705; +} +{ +cvt.rn.f16.f64 rs410, fd780; +} +mov.b32 r2713, {rs410, rs410}; +{ +mul.f16x2 r2711, r10, r2713; +} +{ +add.f16x2 r2714, r2715, r2711; +} +{ +cvt.rn.f16.f64 rs411, fd779; +} +mov.b32 r2719, {rs411, rs411}; +{ +mul.f16x2 r2717, r4, r2719; +} +{ +add.f16x2 r2720, %59, r2717; +} +{ +cvt.rn.f16.f64 rs412, fd780; +} +mov.b32 r2725, {rs412, rs412}; +{ +mul.f16x2 r2723, r7, r2725; +} +{ +add.f16x2 r2726, r2727, r2723; +} +{ +cvt.rn.f16.f64 rs413, fd739; +} +mov.b32 r2731, {rs413, rs413}; +{ +mul.f16x2 r2729, r13, r2731; +} +{ +add.f16x2 r2732, r2708, r2729; +} +{ +cvt.rn.f16.f64 rs414, fd660; +} +mov.b32 r2737, {rs414, rs414}; +{ +mul.f16x2 r2735, r22, r2737; +} +{ +add.f16x2 r2738, r2714, r2735; +} +{ +cvt.rn.f16.f64 rs415, fd739; +} +mov.b32 r2743, {rs415, rs415}; +{ +mul.f16x2 r2741, r16, r2743; +} +{ +add.f16x2 r2744, r2720, r2741; +} +{ +cvt.rn.f16.f64 rs416, fd660; +} +mov.b32 r2749, {rs416, 
rs416}; +{ +mul.f16x2 r2747, r19, r2749; +} +{ +add.f16x2 r2750, r2726, r2747; +} +{ +cvt.rn.f16.f64 rs417, fd767; +} +mov.b32 r2755, {rs417, rs417}; +{ +mul.f16x2 r2753, r25, r2755; +} +{ +add.f16x2 r2756, r2732, r2753; +} +{ +cvt.rn.f16.f64 rs418, fd768; +} +mov.b32 r2761, {rs418, rs418}; +{ +mul.f16x2 r2759, r34, r2761; +} +{ +add.f16x2 r2762, r2738, r2759; +} +{ +cvt.rn.f16.f64 rs419, fd767; +} +mov.b32 r2767, {rs419, rs419}; +{ +mul.f16x2 r2765, r28, r2767; +} +{ +add.f16x2 r2768, r2744, r2765; +} +{ +cvt.rn.f16.f64 rs420, fd768; +} +mov.b32 r2773, {rs420, rs420}; +{ +mul.f16x2 r2771, r31, r2773; +} +{ +add.f16x2 r2774, r2750, r2771; +} +{ +cvt.rn.f16.f64 rs421, fd751; +} +mov.b32 r2779, {rs421, rs421}; +{ +mul.f16x2 r2777, r37, r2779; +} +{ +add.f16x2 r2780, r2756, r2777; +} +{ +cvt.rn.f16.f64 rs422, fd540; +} +mov.b32 r2785, {rs422, rs422}; +{ +mul.f16x2 r2783, r46, r2785; +} +{ +add.f16x2 r2786, r2762, r2783; +} +{ +cvt.rn.f16.f64 rs423, fd751; +} +mov.b32 r2791, {rs423, rs423}; +{ +mul.f16x2 r2789, r40, r2791; +} +{ +add.f16x2 r2792, r2768, r2789; +} +{ +cvt.rn.f16.f64 rs424, fd540; +} +mov.b32 r2797, {rs424, rs424}; +{ +mul.f16x2 r2795, r43, r2797; +} +{ +add.f16x2 r2798, r2774, r2795; +} +{ +cvt.rn.f16.f64 rs425, fd755; +} +mov.b32 r2803, {rs425, rs425}; +{ +mul.f16x2 r2801, r49, r2803; +} +{ +add.f16x2 r2804, r2780, r2801; +} +{ +cvt.rn.f16.f64 rs426, fd756; +} +mov.b32 r2809, {rs426, rs426}; +{ +mul.f16x2 r2807, r58, r2809; +} +{ +add.f16x2 r2810, r2786, r2807; +} +{ +cvt.rn.f16.f64 rs427, fd755; +} +mov.b32 r2815, {rs427, rs427}; +{ +mul.f16x2 r2813, r52, r2815; +} +{ +add.f16x2 r2816, r2792, r2813; +} +{ +cvt.rn.f16.f64 rs428, fd756; +} +mov.b32 r2821, {rs428, rs428}; +{ +mul.f16x2 r2819, r55, r2821; +} +{ +add.f16x2 r2822, r2798, r2819; +} +{ +cvt.rn.f16.f64 rs429, fd763; +} +mov.b32 r2827, {rs429, rs429}; +{ +mul.f16x2 r2825, r61, r2827; +} +{ +add.f16x2 r2828, r2804, r2825; +} +{ +cvt.rn.f16.f64 rs430, fd632; +} +mov.b32 r2833, {rs430, rs430}; +{ 
+mul.f16x2 r2831, r70, r2833; +} +{ +add.f16x2 r2834, r2810, r2831; +} +{ +cvt.rn.f16.f64 rs431, fd763; +} +mov.b32 r2839, {rs431, rs431}; +{ +mul.f16x2 r2837, r64, r2839; +} +{ +add.f16x2 r2840, r2816, r2837; +} +{ +cvt.rn.f16.f64 rs432, fd632; +} +mov.b32 r2845, {rs432, rs432}; +{ +mul.f16x2 r2843, r67, r2845; +} +{ +add.f16x2 r2846, r2822, r2843; +} +{ +cvt.rn.f16.f64 rs433, fd743; +} +mov.b32 r2851, {rs433, rs433}; +{ +mul.f16x2 r2849, r73, r2851; +} +{ +add.f16x2 r2852, r2828, r2849; +} +{ +cvt.rn.f16.f64 rs434, fd744; +} +mov.b32 r2857, {rs434, rs434}; +{ +mul.f16x2 r2855, r82, r2857; +} +{ +add.f16x2 r2858, r2834, r2855; +} +{ +cvt.rn.f16.f64 rs435, fd743; +} +mov.b32 r2863, {rs435, rs435}; +{ +mul.f16x2 r2861, r76, r2863; +} +{ +add.f16x2 r2864, r2840, r2861; +} +{ +cvt.rn.f16.f64 rs436, fd744; +} +mov.b32 r2869, {rs436, rs436}; +{ +mul.f16x2 r2867, r79, r2869; +} +{ +add.f16x2 r2870, r2846, r2867; +} +{ +cvt.rn.f16.f64 rs437, fd775; +} +mov.b32 r2875, {rs437, rs437}; +{ +mul.f16x2 r2873, r85, r2875; +} +{ +add.f16x2 r2876, r2852, r2873; +} +{ +cvt.rn.f16.f64 rs438, fd424; +} +mov.b32 r2881, {rs438, rs438}; +{ +mul.f16x2 r2879, r94, r2881; +} +{ +add.f16x2 r2882, r2858, r2879; +} +{ +cvt.rn.f16.f64 rs439, fd775; +} +mov.b32 r2887, {rs439, rs439}; +{ +mul.f16x2 r2885, r88, r2887; +} +{ +add.f16x2 r2888, r2864, r2885; +} +{ +cvt.rn.f16.f64 rs440, fd424; +} +mov.b32 r2893, {rs440, rs440}; +{ +mul.f16x2 r2891, r91, r2893; +} +{ +add.f16x2 r2894, r2870, r2891; +} +{ +cvt.rn.f16.f64 rs441, fd731; +} +mov.b32 r2899, {rs441, rs441}; +{ +mul.f16x2 r2897, r97, r2899; +} +{ +add.f16x2 r2900, r2876, r2897; +} +{ +cvt.rn.f16.f64 rs442, fd732; +} +mov.b32 r2905, {rs442, rs442}; +{ +mul.f16x2 r2903, r106, r2905; +} +{ +add.f16x2 r2906, r2882, r2903; +} +{ +cvt.rn.f16.f64 rs443, fd731; +} +mov.b32 r2911, {rs443, rs443}; +{ +mul.f16x2 r2909, r100, r2911; +} +{ +add.f16x2 r2912, r2888, r2909; +} +{ +cvt.rn.f16.f64 rs444, fd732; +} +mov.b32 r2917, {rs444, rs444}; +{ 
+mul.f16x2 r2915, r103, r2917; +} +{ +add.f16x2 r2918, r2894, r2915; +} +{ +cvt.rn.f16.f64 rs445, fd783; +} +mov.b32 r2923, {rs445, rs445}; +{ +mul.f16x2 r2921, r109, r2923; +} +{ +add.f16x2 r2924, r2900, r2921; +} +{ +cvt.rn.f16.f64 rs446, fd784; +} +mov.b32 r2929, {rs446, rs446}; +{ +mul.f16x2 r2927, r118, r2929; +} +{ +add.f16x2 r2930, r2906, r2927; +} +{ +cvt.rn.f16.f64 rs447, fd783; +} +mov.b32 r2935, {rs447, rs447}; +{ +mul.f16x2 r2933, r112, r2935; +} +{ +add.f16x2 r2936, r2912, r2933; +} +{ +cvt.rn.f16.f64 rs448, fd784; +} +mov.b32 r2941, {rs448, rs448}; +{ +mul.f16x2 r2939, r115, r2941; +} +{ +add.f16x2 r2942, r2918, r2939; +} +{ +cvt.rn.f16.f64 rs449, fd735; +} +mov.b32 r2947, {rs449, rs449}; +{ +mul.f16x2 r2945, r121, r2947; +} +{ +add.f16x2 r2948, r2924, r2945; +} +{ +cvt.rn.f16.f64 rs450, fd708; +} +mov.b32 r2953, {rs450, rs450}; +{ +mul.f16x2 r2951, r130, r2953; +} +{ +add.f16x2 r2954, r2930, r2951; +} +{ +cvt.rn.f16.f64 rs451, fd735; +} +mov.b32 r2959, {rs451, rs451}; +{ +mul.f16x2 r2957, r124, r2959; +} +{ +add.f16x2 r2960, r2936, r2957; +} +{ +cvt.rn.f16.f64 rs452, fd708; +} +mov.b32 r2965, {rs452, rs452}; +{ +mul.f16x2 r2963, r127, r2965; +} +{ +add.f16x2 r2966, r2942, r2963; +} +{ +cvt.rn.f16.f64 rs453, fd771; +} +mov.b32 r2971, {rs453, rs453}; +{ +mul.f16x2 r2969, r133, r2971; +} +{ +add.f16x2 r2972, r2948, r2969; +} +{ +cvt.rn.f16.f64 rs454, fd772; +} +mov.b32 r2977, {rs454, rs454}; +{ +mul.f16x2 r2975, r142, r2977; +} +{ +add.f16x2 r2978, r2954, r2975; +} +{ +cvt.rn.f16.f64 rs455, fd771; +} +mov.b32 r2983, {rs455, rs455}; +{ +mul.f16x2 r2981, r136, r2983; +} +{ +add.f16x2 r2984, r2960, r2981; +} +{ +cvt.rn.f16.f64 rs456, fd772; +} +mov.b32 r2989, {rs456, rs456}; +{ +mul.f16x2 r2987, r139, r2989; +} +{ +add.f16x2 r2990, r2966, r2987; +} +{ +cvt.rn.f16.f64 rs457, fd747; +} +mov.b32 r2995, {rs457, rs457}; +{ +mul.f16x2 r2993, r145, r2995; +} +{ +add.f16x2 r2996, r2972, r2993; +} +mov.f64 fd704, 0dBFE07F6ACD7CDCE2; +{ +cvt.rn.f16.f64 rs458, fd704; 
+} +mov.b32 r3001, {rs458, rs458}; +{ +mul.f16x2 r2999, r154, r3001; +} +{ +add.f16x2 r3002, r2978, r2999; +} +{ +cvt.rn.f16.f64 rs459, fd747; +} +mov.b32 r3007, {rs459, rs459}; +{ +mul.f16x2 r3005, r148, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs460, fd704; +} +mov.b32 r3013, {rs460, rs460}; +{ +mul.f16x2 r3011, r151, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs461, fd759; +} +mov.b32 r3019, {rs461, rs461}; +{ +mul.f16x2 r3017, r157, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs462, fd760; +} +mov.b32 r3025, {rs462, rs462}; +{ +mul.f16x2 r3023, r166, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs463, fd759; +} +mov.b32 r3031, {rs463, rs463}; +{ +mul.f16x2 r3029, r160, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs464, fd760; +} +mov.b32 r3037, {rs464, rs464}; +{ +mul.f16x2 r3035, r163, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} +{ +sub.f16x2 %16, r3020, r3026; +} +{ +add.f16x2 %17, r3032, r3038; +} +{ +add.f16x2 %42, r3020, r3026; +} +{ +sub.f16x2 %43, r3032, r3038; +} +cvt.rn.f16.s32 rs465, r4804; +mov.b32 r3065, {rs465, rs465}; +cvt.rn.f16.s32 rs466, r4804; +mov.b32 r3077, {rs466, rs466}; +{ +cvt.rn.f16.f64 rs467, fd771; +} +mov.b32 r3057, {rs467, rs467}; +{ +mul.f16x2 r3055, r1, r3057; +} +{ +add.f16x2 r3058, %58, r3055; +} +{ +cvt.rn.f16.f64 rs468, fd772; +} +mov.b32 r3063, {rs468, rs468}; +{ +mul.f16x2 r3061, r10, r3063; +} +{ +add.f16x2 r3064, r3065, r3061; +} +{ +cvt.rn.f16.f64 rs469, fd771; +} +mov.b32 r3069, {rs469, rs469}; +{ +mul.f16x2 r3067, r4, r3069; +} +{ +add.f16x2 r3070, %59, r3067; +} +{ +cvt.rn.f16.f64 rs470, fd772; +} +mov.b32 r3075, {rs470, rs470}; +{ +mul.f16x2 r3073, r7, r3075; +} +{ +add.f16x2 r3076, r3077, r3073; +} +{ +cvt.rn.f16.f64 rs471, fd755; +} +mov.b32 r3081, {rs471, rs471}; +{ +mul.f16x2 r3079, r13, r3081; +} +{ +add.f16x2 r3082, r3058, r3079; +} +{ +cvt.rn.f16.f64 rs472, fd456; +} +mov.b32 r3087, {rs472, 
rs472}; +{ +mul.f16x2 r3085, r22, r3087; +} +{ +add.f16x2 r3088, r3064, r3085; +} +{ +cvt.rn.f16.f64 rs473, fd755; +} +mov.b32 r3093, {rs473, rs473}; +{ +mul.f16x2 r3091, r16, r3093; +} +{ +add.f16x2 r3094, r3070, r3091; +} +{ +cvt.rn.f16.f64 rs474, fd456; +} +mov.b32 r3099, {rs474, rs474}; +{ +mul.f16x2 r3097, r19, r3099; +} +{ +add.f16x2 r3100, r3076, r3097; +} +{ +cvt.rn.f16.f64 rs475, fd743; +} +mov.b32 r3105, {rs475, rs475}; +{ +mul.f16x2 r3103, r25, r3105; +} +{ +add.f16x2 r3106, r3082, r3103; +} +{ +cvt.rn.f16.f64 rs476, fd744; +} +mov.b32 r3111, {rs476, rs476}; +{ +mul.f16x2 r3109, r34, r3111; +} +{ +add.f16x2 r3112, r3088, r3109; +} +{ +cvt.rn.f16.f64 rs477, fd743; +} +mov.b32 r3117, {rs477, rs477}; +{ +mul.f16x2 r3115, r28, r3117; +} +{ +add.f16x2 r3118, r3094, r3115; +} +{ +cvt.rn.f16.f64 rs478, fd744; +} +mov.b32 r3123, {rs478, rs478}; +{ +mul.f16x2 r3121, r31, r3123; +} +{ +add.f16x2 r3124, r3100, r3121; +} +{ +cvt.rn.f16.f64 rs479, fd783; +} +mov.b32 r3129, {rs479, rs479}; +{ +mul.f16x2 r3127, r37, r3129; +} +{ +add.f16x2 r3130, r3106, r3127; +} +{ +cvt.rn.f16.f64 rs480, fd692; +} +mov.b32 r3135, {rs480, rs480}; +{ +mul.f16x2 r3133, r46, r3135; +} +{ +add.f16x2 r3136, r3112, r3133; +} +{ +cvt.rn.f16.f64 rs481, fd783; +} +mov.b32 r3141, {rs481, rs481}; +{ +mul.f16x2 r3139, r40, r3141; +} +{ +add.f16x2 r3142, r3118, r3139; +} +{ +cvt.rn.f16.f64 rs482, fd692; +} +mov.b32 r3147, {rs482, rs482}; +{ +mul.f16x2 r3145, r43, r3147; +} +{ +add.f16x2 r3148, r3124, r3145; +} +{ +cvt.rn.f16.f64 rs483, fd739; +} +mov.b32 r3153, {rs483, rs483}; +{ +mul.f16x2 r3151, r49, r3153; +} +{ +add.f16x2 r3154, r3130, r3151; +} +{ +cvt.rn.f16.f64 rs484, fd660; +} +mov.b32 r3159, {rs484, rs484}; +{ +mul.f16x2 r3157, r58, r3159; +} +{ +add.f16x2 r3160, r3136, r3157; +} +{ +cvt.rn.f16.f64 rs485, fd739; +} +mov.b32 r3165, {rs485, rs485}; +{ +mul.f16x2 r3163, r52, r3165; +} +{ +add.f16x2 r3166, r3142, r3163; +} +{ +cvt.rn.f16.f64 rs486, fd660; +} +mov.b32 r3171, {rs486, rs486}; +{ 
+mul.f16x2 r3169, r55, r3171; +} +{ +add.f16x2 r3172, r3148, r3169; +} +{ +cvt.rn.f16.f64 rs487, fd759; +} +mov.b32 r3177, {rs487, rs487}; +{ +mul.f16x2 r3175, r61, r3177; +} +{ +add.f16x2 r3178, r3154, r3175; +} +{ +cvt.rn.f16.f64 rs488, fd760; +} +mov.b32 r3183, {rs488, rs488}; +{ +mul.f16x2 r3181, r70, r3183; +} +{ +add.f16x2 r3184, r3160, r3181; +} +{ +cvt.rn.f16.f64 rs489, fd759; +} +mov.b32 r3189, {rs489, rs489}; +{ +mul.f16x2 r3187, r64, r3189; +} +{ +add.f16x2 r3190, r3166, r3187; +} +{ +cvt.rn.f16.f64 rs490, fd760; +} +mov.b32 r3195, {rs490, rs490}; +{ +mul.f16x2 r3193, r67, r3195; +} +{ +add.f16x2 r3196, r3172, r3193; +} +{ +cvt.rn.f16.f64 rs491, fd767; +} +mov.b32 r3201, {rs491, rs491}; +{ +mul.f16x2 r3199, r73, r3201; +} +{ +add.f16x2 r3202, r3178, r3199; +} +{ +cvt.rn.f16.f64 rs492, fd604; +} +mov.b32 r3207, {rs492, rs492}; +{ +mul.f16x2 r3205, r82, r3207; +} +{ +add.f16x2 r3208, r3184, r3205; +} +{ +cvt.rn.f16.f64 rs493, fd767; +} +mov.b32 r3213, {rs493, rs493}; +{ +mul.f16x2 r3211, r76, r3213; +} +{ +add.f16x2 r3214, r3190, r3211; +} +{ +cvt.rn.f16.f64 rs494, fd604; +} +mov.b32 r3219, {rs494, rs494}; +{ +mul.f16x2 r3217, r79, r3219; +} +{ +add.f16x2 r3220, r3196, r3217; +} +{ +cvt.rn.f16.f64 rs495, fd731; +} +mov.b32 r3225, {rs495, rs495}; +{ +mul.f16x2 r3223, r85, r3225; +} +{ +add.f16x2 r3226, r3202, r3223; +} +{ +cvt.rn.f16.f64 rs496, fd732; +} +mov.b32 r3231, {rs496, rs496}; +{ +mul.f16x2 r3229, r94, r3231; +} +{ +add.f16x2 r3232, r3208, r3229; +} +{ +cvt.rn.f16.f64 rs497, fd731; +} +mov.b32 r3237, {rs497, rs497}; +{ +mul.f16x2 r3235, r88, r3237; +} +{ +add.f16x2 r3238, r3214, r3235; +} +{ +cvt.rn.f16.f64 rs498, fd732; +} +mov.b32 r3243, {rs498, rs498}; +{ +mul.f16x2 r3241, r91, r3243; +} +{ +add.f16x2 r3244, r3220, r3241; +} +{ +cvt.rn.f16.f64 rs499, fd775; +} +mov.b32 r3249, {rs499, rs499}; +{ +mul.f16x2 r3247, r97, r3249; +} +{ +add.f16x2 r3250, r3226, r3247; +} +{ +cvt.rn.f16.f64 rs500, fd776; +} +mov.b32 r3255, {rs500, rs500}; +{ +mul.f16x2 
r3253, r106, r3255; +} +{ +add.f16x2 r3256, r3232, r3253; +} +{ +cvt.rn.f16.f64 rs501, fd775; +} +mov.b32 r3261, {rs501, rs501}; +{ +mul.f16x2 r3259, r100, r3261; +} +{ +add.f16x2 r3262, r3238, r3259; +} +{ +cvt.rn.f16.f64 rs502, fd776; +} +mov.b32 r3267, {rs502, rs502}; +{ +mul.f16x2 r3265, r103, r3267; +} +{ +add.f16x2 r3268, r3244, r3265; +} +{ +cvt.rn.f16.f64 rs503, fd751; +} +mov.b32 r3273, {rs503, rs503}; +{ +mul.f16x2 r3271, r109, r3273; +} +{ +add.f16x2 r3274, r3250, r3271; +} +{ +cvt.rn.f16.f64 rs504, fd540; +} +mov.b32 r3279, {rs504, rs504}; +{ +mul.f16x2 r3277, r118, r3279; +} +{ +add.f16x2 r3280, r3256, r3277; +} +{ +cvt.rn.f16.f64 rs505, fd751; +} +mov.b32 r3285, {rs505, rs505}; +{ +mul.f16x2 r3283, r112, r3285; +} +{ +add.f16x2 r3286, r3262, r3283; +} +{ +cvt.rn.f16.f64 rs506, fd540; +} +mov.b32 r3291, {rs506, rs506}; +{ +mul.f16x2 r3289, r115, r3291; +} +{ +add.f16x2 r3292, r3268, r3289; +} +{ +cvt.rn.f16.f64 rs507, fd747; +} +mov.b32 r3297, {rs507, rs507}; +{ +mul.f16x2 r3295, r121, r3297; +} +{ +add.f16x2 r3298, r3274, r3295; +} +{ +cvt.rn.f16.f64 rs508, fd748; +} +mov.b32 r3303, {rs508, rs508}; +{ +mul.f16x2 r3301, r130, r3303; +} +{ +add.f16x2 r3304, r3280, r3301; +} +{ +cvt.rn.f16.f64 rs509, fd747; +} +mov.b32 r3309, {rs509, rs509}; +{ +mul.f16x2 r3307, r124, r3309; +} +{ +add.f16x2 r3310, r3286, r3307; +} +{ +cvt.rn.f16.f64 rs510, fd748; +} +mov.b32 r3315, {rs510, rs510}; +{ +mul.f16x2 r3313, r127, r3315; +} +{ +add.f16x2 r3316, r3292, r3313; +} +{ +cvt.rn.f16.f64 rs511, fd779; +} +mov.b32 r3321, {rs511, rs511}; +{ +mul.f16x2 r3319, r133, r3321; +} +{ +add.f16x2 r3322, r3298, r3319; +} +{ +cvt.rn.f16.f64 rs512, fd652; +} +mov.b32 r3327, {rs512, rs512}; +{ +mul.f16x2 r3325, r142, r3327; +} +{ +add.f16x2 r3328, r3304, r3325; +} +{ +cvt.rn.f16.f64 rs513, fd779; +} +mov.b32 r3333, {rs513, rs513}; +{ +mul.f16x2 r3331, r136, r3333; +} +{ +add.f16x2 r3334, r3310, r3331; +} +{ +cvt.rn.f16.f64 rs514, fd652; +} +mov.b32 r3339, {rs514, rs514}; +{ 
+mul.f16x2 r3337, r139, r3339; +} +{ +add.f16x2 r3340, r3316, r3337; +} +{ +cvt.rn.f16.f64 rs515, fd735; +} +mov.b32 r3345, {rs515, rs515}; +{ +mul.f16x2 r3343, r145, r3345; +} +{ +add.f16x2 r3346, r3322, r3343; +} +{ +cvt.rn.f16.f64 rs516, fd708; +} +mov.b32 r3351, {rs516, rs516}; +{ +mul.f16x2 r3349, r154, r3351; +} +{ +add.f16x2 r3352, r3328, r3349; +} +{ +cvt.rn.f16.f64 rs517, fd735; +} +mov.b32 r3357, {rs517, rs517}; +{ +mul.f16x2 r3355, r148, r3357; +} +{ +add.f16x2 r3358, r3334, r3355; +} +{ +cvt.rn.f16.f64 rs518, fd708; +} +mov.b32 r3363, {rs518, rs518}; +{ +mul.f16x2 r3361, r151, r3363; +} +{ +add.f16x2 r3364, r3340, r3361; +} +{ +cvt.rn.f16.f64 rs519, fd763; +} +mov.b32 r3369, {rs519, rs519}; +{ +mul.f16x2 r3367, r157, r3369; +} +{ +add.f16x2 r3370, r3346, r3367; +} +{ +cvt.rn.f16.f64 rs520, fd764; +} +mov.b32 r3375, {rs520, rs520}; +{ +mul.f16x2 r3373, r166, r3375; +} +{ +add.f16x2 r3376, r3352, r3373; +} +{ +cvt.rn.f16.f64 rs521, fd763; +} +mov.b32 r3381, {rs521, rs521}; +{ +mul.f16x2 r3379, r160, r3381; +} +{ +add.f16x2 r3382, r3358, r3379; +} +{ +cvt.rn.f16.f64 rs522, fd764; +} +mov.b32 r3387, {rs522, rs522}; +{ +mul.f16x2 r3385, r163, r3387; +} +{ +add.f16x2 r3388, r3364, r3385; +} +{ +sub.f16x2 %18, r3370, r3376; +} +{ +add.f16x2 %19, r3382, r3388; +} +{ +add.f16x2 %40, r3370, r3376; +} +{ +sub.f16x2 %41, r3382, r3388; +} +cvt.rn.f16.s32 rs523, r4804; +mov.b32 r3415, {rs523, rs523}; +cvt.rn.f16.s32 rs524, r4804; +mov.b32 r3427, {rs524, rs524}; +{ +cvt.rn.f16.f64 rs525, fd763; +} +mov.b32 r3407, {rs525, rs525}; +{ +mul.f16x2 r3405, r1, r3407; +} +{ +add.f16x2 r3408, %58, r3405; +} +{ +cvt.rn.f16.f64 rs526, fd764; +} +mov.b32 r3413, {rs526, rs526}; +{ +mul.f16x2 r3411, r10, r3413; +} +{ +add.f16x2 r3414, r3415, r3411; +} +{ +cvt.rn.f16.f64 rs527, fd763; +} +mov.b32 r3419, {rs527, rs527}; +{ +mul.f16x2 r3417, r4, r3419; +} +{ +add.f16x2 r3420, %59, r3417; +} +{ +cvt.rn.f16.f64 rs528, fd764; +} +mov.b32 r3425, {rs528, rs528}; +{ +mul.f16x2 r3423, r7, 
r3425; +} +{ +add.f16x2 r3426, r3427, r3423; +} +{ +cvt.rn.f16.f64 rs529, fd771; +} +mov.b32 r3431, {rs529, rs529}; +{ +mul.f16x2 r3429, r13, r3431; +} +{ +add.f16x2 r3432, r3408, r3429; +} +{ +cvt.rn.f16.f64 rs530, fd696; +} +mov.b32 r3437, {rs530, rs530}; +{ +mul.f16x2 r3435, r22, r3437; +} +{ +add.f16x2 r3438, r3414, r3435; +} +{ +cvt.rn.f16.f64 rs531, fd771; +} +mov.b32 r3443, {rs531, rs531}; +{ +mul.f16x2 r3441, r16, r3443; +} +{ +add.f16x2 r3444, r3420, r3441; +} +{ +cvt.rn.f16.f64 rs532, fd696; +} +mov.b32 r3449, {rs532, rs532}; +{ +mul.f16x2 r3447, r19, r3449; +} +{ +add.f16x2 r3450, r3426, r3447; +} +{ +cvt.rn.f16.f64 rs533, fd735; +} +mov.b32 r3455, {rs533, rs533}; +{ +mul.f16x2 r3453, r25, r3455; +} +{ +add.f16x2 r3456, r3432, r3453; +} +{ +cvt.rn.f16.f64 rs534, fd708; +} +mov.b32 r3461, {rs534, rs534}; +{ +mul.f16x2 r3459, r34, r3461; +} +{ +add.f16x2 r3462, r3438, r3459; +} +{ +cvt.rn.f16.f64 rs535, fd735; +} +mov.b32 r3467, {rs535, rs535}; +{ +mul.f16x2 r3465, r28, r3467; +} +{ +add.f16x2 r3468, r3444, r3465; +} +{ +cvt.rn.f16.f64 rs536, fd708; +} +mov.b32 r3473, {rs536, rs536}; +{ +mul.f16x2 r3471, r31, r3473; +} +{ +add.f16x2 r3474, r3450, r3471; +} +{ +cvt.rn.f16.f64 rs537, fd755; +} +mov.b32 r3479, {rs537, rs537}; +{ +mul.f16x2 r3477, r37, r3479; +} +{ +add.f16x2 r3480, r3456, r3477; +} +{ +cvt.rn.f16.f64 rs538, fd756; +} +mov.b32 r3485, {rs538, rs538}; +{ +mul.f16x2 r3483, r46, r3485; +} +{ +add.f16x2 r3486, r3462, r3483; +} +{ +cvt.rn.f16.f64 rs539, fd755; +} +mov.b32 r3491, {rs539, rs539}; +{ +mul.f16x2 r3489, r40, r3491; +} +{ +add.f16x2 r3492, r3468, r3489; +} +{ +cvt.rn.f16.f64 rs540, fd756; +} +mov.b32 r3497, {rs540, rs540}; +{ +mul.f16x2 r3495, r43, r3497; +} +{ +add.f16x2 r3498, r3474, r3495; +} +{ +cvt.rn.f16.f64 rs541, fd779; +} +mov.b32 r3503, {rs541, rs541}; +{ +mul.f16x2 r3501, r49, r3503; +} +{ +add.f16x2 r3504, r3480, r3501; +} +{ +cvt.rn.f16.f64 rs542, fd652; +} +mov.b32 r3509, {rs542, rs542}; +{ +mul.f16x2 r3507, r58, r3509; +} 
+{ +add.f16x2 r3510, r3486, r3507; +} +{ +cvt.rn.f16.f64 rs543, fd779; +} +mov.b32 r3515, {rs543, rs543}; +{ +mul.f16x2 r3513, r52, r3515; +} +{ +add.f16x2 r3516, r3492, r3513; +} +{ +cvt.rn.f16.f64 rs544, fd652; +} +mov.b32 r3521, {rs544, rs544}; +{ +mul.f16x2 r3519, r55, r3521; +} +{ +add.f16x2 r3522, r3498, r3519; +} +{ +cvt.rn.f16.f64 rs545, fd743; +} +mov.b32 r3527, {rs545, rs545}; +{ +mul.f16x2 r3525, r61, r3527; +} +{ +add.f16x2 r3528, r3504, r3525; +} +{ +cvt.rn.f16.f64 rs546, fd636; +} +mov.b32 r3533, {rs546, rs546}; +{ +mul.f16x2 r3531, r70, r3533; +} +{ +add.f16x2 r3534, r3510, r3531; +} +{ +cvt.rn.f16.f64 rs547, fd743; +} +mov.b32 r3539, {rs547, rs547}; +{ +mul.f16x2 r3537, r64, r3539; +} +{ +add.f16x2 r3540, r3516, r3537; +} +{ +cvt.rn.f16.f64 rs548, fd636; +} +mov.b32 r3545, {rs548, rs548}; +{ +mul.f16x2 r3543, r67, r3545; +} +{ +add.f16x2 r3546, r3522, r3543; +} +{ +cvt.rn.f16.f64 rs549, fd747; +} +mov.b32 r3551, {rs549, rs549}; +{ +mul.f16x2 r3549, r73, r3551; +} +{ +add.f16x2 r3552, r3528, r3549; +} +{ +cvt.rn.f16.f64 rs550, fd748; +} +mov.b32 r3557, {rs550, rs550}; +{ +mul.f16x2 r3555, r82, r3557; +} +{ +add.f16x2 r3558, r3534, r3555; +} +{ +cvt.rn.f16.f64 rs551, fd747; +} +mov.b32 r3563, {rs551, rs551}; +{ +mul.f16x2 r3561, r76, r3563; +} +{ +add.f16x2 r3564, r3540, r3561; +} +{ +cvt.rn.f16.f64 rs552, fd748; +} +mov.b32 r3569, {rs552, rs552}; +{ +mul.f16x2 r3567, r79, r3569; +} +{ +add.f16x2 r3570, r3546, r3567; +} +{ +cvt.rn.f16.f64 rs553, fd783; +} +mov.b32 r3575, {rs553, rs553}; +{ +mul.f16x2 r3573, r85, r3575; +} +{ +add.f16x2 r3576, r3552, r3573; +} +{ +cvt.rn.f16.f64 rs554, fd784; +} +mov.b32 r3581, {rs554, rs554}; +{ +mul.f16x2 r3579, r94, r3581; +} +{ +add.f16x2 r3582, r3558, r3579; +} +{ +cvt.rn.f16.f64 rs555, fd783; +} +mov.b32 r3587, {rs555, rs555}; +{ +mul.f16x2 r3585, r88, r3587; +} +{ +add.f16x2 r3588, r3564, r3585; +} +{ +cvt.rn.f16.f64 rs556, fd784; +} +mov.b32 r3593, {rs556, rs556}; +{ +mul.f16x2 r3591, r91, r3593; +} +{ 
+add.f16x2 r3594, r3570, r3591; +} +{ +cvt.rn.f16.f64 rs557, fd751; +} +mov.b32 r3599, {rs557, rs557}; +{ +mul.f16x2 r3597, r97, r3599; +} +{ +add.f16x2 r3600, r3576, r3597; +} +{ +cvt.rn.f16.f64 rs558, fd540; +} +mov.b32 r3605, {rs558, rs558}; +{ +mul.f16x2 r3603, r106, r3605; +} +{ +add.f16x2 r3606, r3582, r3603; +} +{ +cvt.rn.f16.f64 rs559, fd751; +} +mov.b32 r3611, {rs559, rs559}; +{ +mul.f16x2 r3609, r100, r3611; +} +{ +add.f16x2 r3612, r3588, r3609; +} +{ +cvt.rn.f16.f64 rs560, fd540; +} +mov.b32 r3617, {rs560, rs560}; +{ +mul.f16x2 r3615, r103, r3617; +} +{ +add.f16x2 r3618, r3594, r3615; +} +{ +cvt.rn.f16.f64 rs561, fd739; +} +mov.b32 r3623, {rs561, rs561}; +{ +mul.f16x2 r3621, r109, r3623; +} +{ +add.f16x2 r3624, r3600, r3621; +} +{ +cvt.rn.f16.f64 rs562, fd740; +} +mov.b32 r3629, {rs562, rs562}; +{ +mul.f16x2 r3627, r118, r3629; +} +{ +add.f16x2 r3630, r3606, r3627; +} +{ +cvt.rn.f16.f64 rs563, fd739; +} +mov.b32 r3635, {rs563, rs563}; +{ +mul.f16x2 r3633, r112, r3635; +} +{ +add.f16x2 r3636, r3612, r3633; +} +{ +cvt.rn.f16.f64 rs564, fd740; +} +mov.b32 r3641, {rs564, rs564}; +{ +mul.f16x2 r3639, r115, r3641; +} +{ +add.f16x2 r3642, r3618, r3639; +} +{ +cvt.rn.f16.f64 rs565, fd775; +} +mov.b32 r3647, {rs565, rs565}; +{ +mul.f16x2 r3645, r121, r3647; +} +{ +add.f16x2 r3648, r3624, r3645; +} +{ +cvt.rn.f16.f64 rs566, fd776; +} +mov.b32 r3653, {rs566, rs566}; +{ +mul.f16x2 r3651, r130, r3653; +} +{ +add.f16x2 r3654, r3630, r3651; +} +{ +cvt.rn.f16.f64 rs567, fd775; +} +mov.b32 r3659, {rs567, rs567}; +{ +mul.f16x2 r3657, r124, r3659; +} +{ +add.f16x2 r3660, r3636, r3657; +} +{ +cvt.rn.f16.f64 rs568, fd776; +} +mov.b32 r3665, {rs568, rs568}; +{ +mul.f16x2 r3663, r127, r3665; +} +{ +add.f16x2 r3666, r3642, r3663; +} +{ +cvt.rn.f16.f64 rs569, fd759; +} +mov.b32 r3671, {rs569, rs569}; +{ +mul.f16x2 r3669, r133, r3671; +} +{ +add.f16x2 r3672, r3648, r3669; +} +{ +cvt.rn.f16.f64 rs570, fd700; +} +mov.b32 r3677, {rs570, rs570}; +{ +mul.f16x2 r3675, r142, r3677; +} 
+{ +add.f16x2 r3678, r3654, r3675; +} +{ +cvt.rn.f16.f64 rs571, fd759; +} +mov.b32 r3683, {rs571, rs571}; +{ +mul.f16x2 r3681, r136, r3683; +} +{ +add.f16x2 r3684, r3660, r3681; +} +{ +cvt.rn.f16.f64 rs572, fd700; +} +mov.b32 r3689, {rs572, rs572}; +{ +mul.f16x2 r3687, r139, r3689; +} +{ +add.f16x2 r3690, r3666, r3687; +} +{ +cvt.rn.f16.f64 rs573, fd731; +} +mov.b32 r3695, {rs573, rs573}; +{ +mul.f16x2 r3693, r145, r3695; +} +{ +add.f16x2 r3696, r3672, r3693; +} +{ +cvt.rn.f16.f64 rs574, fd732; +} +mov.b32 r3701, {rs574, rs574}; +{ +mul.f16x2 r3699, r154, r3701; +} +{ +add.f16x2 r3702, r3678, r3699; +} +{ +cvt.rn.f16.f64 rs575, fd731; +} +mov.b32 r3707, {rs575, rs575}; +{ +mul.f16x2 r3705, r148, r3707; +} +{ +add.f16x2 r3708, r3684, r3705; +} +{ +cvt.rn.f16.f64 rs576, fd732; +} +mov.b32 r3713, {rs576, rs576}; +{ +mul.f16x2 r3711, r151, r3713; +} +{ +add.f16x2 r3714, r3690, r3711; +} +{ +cvt.rn.f16.f64 rs577, fd767; +} +mov.b32 r3719, {rs577, rs577}; +{ +mul.f16x2 r3717, r157, r3719; +} +{ +add.f16x2 r3720, r3696, r3717; +} +{ +cvt.rn.f16.f64 rs578, fd768; +} +mov.b32 r3725, {rs578, rs578}; +{ +mul.f16x2 r3723, r166, r3725; +} +{ +add.f16x2 r3726, r3702, r3723; +} +{ +cvt.rn.f16.f64 rs579, fd767; +} +mov.b32 r3731, {rs579, rs579}; +{ +mul.f16x2 r3729, r160, r3731; +} +{ +add.f16x2 r3732, r3708, r3729; +} +{ +cvt.rn.f16.f64 rs580, fd768; +} +mov.b32 r3737, {rs580, rs580}; +{ +mul.f16x2 r3735, r163, r3737; +} +{ +add.f16x2 r3738, r3714, r3735; +} +{ +sub.f16x2 %20, r3720, r3726; +} +{ +add.f16x2 %21, r3732, r3738; +} +{ +add.f16x2 %38, r3720, r3726; +} +{ +sub.f16x2 %39, r3732, r3738; +} +cvt.rn.f16.s32 rs581, r4804; +mov.b32 r3765, {rs581, rs581}; +cvt.rn.f16.s32 rs582, r4804; +mov.b32 r3777, {rs582, rs582}; +{ +cvt.rn.f16.f64 rs583, fd755; +} +mov.b32 r3757, {rs583, rs583}; +{ +mul.f16x2 r3755, r1, r3757; +} +{ +add.f16x2 r3758, %58, r3755; +} +{ +cvt.rn.f16.f64 rs584, fd756; +} +mov.b32 r3763, {rs584, rs584}; +{ +mul.f16x2 r3761, r10, r3763; +} +{ +add.f16x2 r3764, 
r3765, r3761; +} +{ +cvt.rn.f16.f64 rs585, fd755; +} +mov.b32 r3769, {rs585, rs585}; +{ +mul.f16x2 r3767, r4, r3769; +} +{ +add.f16x2 r3770, %59, r3767; +} +{ +cvt.rn.f16.f64 rs586, fd756; +} +mov.b32 r3775, {rs586, rs586}; +{ +mul.f16x2 r3773, r7, r3775; +} +{ +add.f16x2 r3776, r3777, r3773; +} +{ +cvt.rn.f16.f64 rs587, fd783; +} +mov.b32 r3781, {rs587, rs587}; +{ +mul.f16x2 r3779, r13, r3781; +} +{ +add.f16x2 r3782, r3758, r3779; +} +{ +cvt.rn.f16.f64 rs588, fd784; +} +mov.b32 r3787, {rs588, rs588}; +{ +mul.f16x2 r3785, r22, r3787; +} +{ +add.f16x2 r3788, r3764, r3785; +} +{ +cvt.rn.f16.f64 rs589, fd783; +} +mov.b32 r3793, {rs589, rs589}; +{ +mul.f16x2 r3791, r16, r3793; +} +{ +add.f16x2 r3794, r3770, r3791; +} +{ +cvt.rn.f16.f64 rs590, fd784; +} +mov.b32 r3799, {rs590, rs590}; +{ +mul.f16x2 r3797, r19, r3799; +} +{ +add.f16x2 r3800, r3776, r3797; +} +{ +cvt.rn.f16.f64 rs591, fd759; +} +mov.b32 r3805, {rs591, rs591}; +{ +mul.f16x2 r3803, r25, r3805; +} +{ +add.f16x2 r3806, r3782, r3803; +} +{ +cvt.rn.f16.f64 rs592, fd700; +} +mov.b32 r3811, {rs592, rs592}; +{ +mul.f16x2 r3809, r34, r3811; +} +{ +add.f16x2 r3812, r3788, r3809; +} +{ +cvt.rn.f16.f64 rs593, fd759; +} +mov.b32 r3817, {rs593, rs593}; +{ +mul.f16x2 r3815, r28, r3817; +} +{ +add.f16x2 r3818, r3794, r3815; +} +{ +cvt.rn.f16.f64 rs594, fd700; +} +mov.b32 r3823, {rs594, rs594}; +{ +mul.f16x2 r3821, r31, r3823; +} +{ +add.f16x2 r3824, r3800, r3821; +} +{ +cvt.rn.f16.f64 rs595, fd731; +} +mov.b32 r3829, {rs595, rs595}; +{ +mul.f16x2 r3827, r37, r3829; +} +{ +add.f16x2 r3830, r3806, r3827; +} +{ +cvt.rn.f16.f64 rs596, fd576; +} +mov.b32 r3835, {rs596, rs596}; +{ +mul.f16x2 r3833, r46, r3835; +} +{ +add.f16x2 r3836, r3812, r3833; +} +{ +cvt.rn.f16.f64 rs597, fd731; +} +mov.b32 r3841, {rs597, rs597}; +{ +mul.f16x2 r3839, r40, r3841; +} +{ +add.f16x2 r3842, r3818, r3839; +} +{ +cvt.rn.f16.f64 rs598, fd576; +} +mov.b32 r3847, {rs598, rs598}; +{ +mul.f16x2 r3845, r43, r3847; +} +{ +add.f16x2 r3848, r3824, r3845; 
+} +{ +cvt.rn.f16.f64 rs599, fd751; +} +mov.b32 r3853, {rs599, rs599}; +{ +mul.f16x2 r3851, r49, r3853; +} +{ +add.f16x2 r3854, r3830, r3851; +} +{ +cvt.rn.f16.f64 rs600, fd752; +} +mov.b32 r3859, {rs600, rs600}; +{ +mul.f16x2 r3857, r58, r3859; +} +{ +add.f16x2 r3860, r3836, r3857; +} +{ +cvt.rn.f16.f64 rs601, fd751; +} +mov.b32 r3865, {rs601, rs601}; +{ +mul.f16x2 r3863, r52, r3865; +} +{ +add.f16x2 r3866, r3842, r3863; +} +{ +cvt.rn.f16.f64 rs602, fd752; +} +mov.b32 r3871, {rs602, rs602}; +{ +mul.f16x2 r3869, r55, r3871; +} +{ +add.f16x2 r3872, r3848, r3869; +} +{ +cvt.rn.f16.f64 rs603, fd779; +} +mov.b32 r3877, {rs603, rs603}; +{ +mul.f16x2 r3875, r61, r3877; +} +{ +add.f16x2 r3878, r3854, r3875; +} +{ +cvt.rn.f16.f64 rs604, fd780; +} +mov.b32 r3883, {rs604, rs604}; +{ +mul.f16x2 r3881, r70, r3883; +} +{ +add.f16x2 r3884, r3860, r3881; +} +{ +cvt.rn.f16.f64 rs605, fd779; +} +mov.b32 r3889, {rs605, rs605}; +{ +mul.f16x2 r3887, r64, r3889; +} +{ +add.f16x2 r3890, r3866, r3887; +} +{ +cvt.rn.f16.f64 rs606, fd780; +} +mov.b32 r3895, {rs606, rs606}; +{ +mul.f16x2 r3893, r67, r3895; +} +{ +add.f16x2 r3896, r3872, r3893; +} +{ +cvt.rn.f16.f64 rs607, fd763; +} +mov.b32 r3901, {rs607, rs607}; +{ +mul.f16x2 r3899, r73, r3901; +} +{ +add.f16x2 r3902, r3878, r3899; +} +{ +cvt.rn.f16.f64 rs608, fd632; +} +mov.b32 r3907, {rs608, rs608}; +{ +mul.f16x2 r3905, r82, r3907; +} +{ +add.f16x2 r3908, r3884, r3905; +} +{ +cvt.rn.f16.f64 rs609, fd763; +} +mov.b32 r3913, {rs609, rs609}; +{ +mul.f16x2 r3911, r76, r3913; +} +{ +add.f16x2 r3914, r3890, r3911; +} +{ +cvt.rn.f16.f64 rs610, fd632; +} +mov.b32 r3919, {rs610, rs610}; +{ +mul.f16x2 r3917, r79, r3919; +} +{ +add.f16x2 r3920, r3896, r3917; +} +{ +cvt.rn.f16.f64 rs611, fd735; +} +mov.b32 r3925, {rs611, rs611}; +{ +mul.f16x2 r3923, r85, r3925; +} +{ +add.f16x2 r3926, r3902, r3923; +} +{ +cvt.rn.f16.f64 rs612, fd708; +} +mov.b32 r3931, {rs612, rs612}; +{ +mul.f16x2 r3929, r94, r3931; +} +{ +add.f16x2 r3932, r3908, r3929; +} +{ 
+cvt.rn.f16.f64 rs613, fd735; +} +mov.b32 r3937, {rs613, rs613}; +{ +mul.f16x2 r3935, r88, r3937; +} +{ +add.f16x2 r3938, r3914, r3935; +} +{ +cvt.rn.f16.f64 rs614, fd708; +} +mov.b32 r3943, {rs614, rs614}; +{ +mul.f16x2 r3941, r91, r3943; +} +{ +add.f16x2 r3944, r3920, r3941; +} +{ +cvt.rn.f16.f64 rs615, fd747; +} +mov.b32 r3949, {rs615, rs615}; +{ +mul.f16x2 r3947, r97, r3949; +} +{ +add.f16x2 r3950, r3926, r3947; +} +{ +cvt.rn.f16.f64 rs616, fd748; +} +mov.b32 r3955, {rs616, rs616}; +{ +mul.f16x2 r3953, r106, r3955; +} +{ +add.f16x2 r3956, r3932, r3953; +} +{ +cvt.rn.f16.f64 rs617, fd747; +} +mov.b32 r3961, {rs617, rs617}; +{ +mul.f16x2 r3959, r100, r3961; +} +{ +add.f16x2 r3962, r3938, r3959; +} +{ +cvt.rn.f16.f64 rs618, fd748; +} +mov.b32 r3967, {rs618, rs618}; +{ +mul.f16x2 r3965, r103, r3967; +} +{ +add.f16x2 r3968, r3944, r3965; +} +{ +cvt.rn.f16.f64 rs619, fd775; +} +mov.b32 r3973, {rs619, rs619}; +{ +mul.f16x2 r3971, r109, r3973; +} +{ +add.f16x2 r3974, r3950, r3971; +} +{ +cvt.rn.f16.f64 rs620, fd776; +} +mov.b32 r3979, {rs620, rs620}; +{ +mul.f16x2 r3977, r118, r3979; +} +{ +add.f16x2 r3980, r3956, r3977; +} +{ +cvt.rn.f16.f64 rs621, fd775; +} +mov.b32 r3985, {rs621, rs621}; +{ +mul.f16x2 r3983, r112, r3985; +} +{ +add.f16x2 r3986, r3962, r3983; +} +{ +cvt.rn.f16.f64 rs622, fd776; +} +mov.b32 r3991, {rs622, rs622}; +{ +mul.f16x2 r3989, r115, r3991; +} +{ +add.f16x2 r3992, r3968, r3989; +} +{ +cvt.rn.f16.f64 rs623, fd767; +} +mov.b32 r3997, {rs623, rs623}; +{ +mul.f16x2 r3995, r121, r3997; +} +{ +add.f16x2 r3998, r3974, r3995; +} +{ +cvt.rn.f16.f64 rs624, fd604; +} +mov.b32 r4003, {rs624, rs624}; +{ +mul.f16x2 r4001, r130, r4003; +} +{ +add.f16x2 r4004, r3980, r4001; +} +{ +cvt.rn.f16.f64 rs625, fd767; +} +mov.b32 r4009, {rs625, rs625}; +{ +mul.f16x2 r4007, r124, r4009; +} +{ +add.f16x2 r4010, r3986, r4007; +} +{ +cvt.rn.f16.f64 rs626, fd604; +} +mov.b32 r4015, {rs626, rs626}; +{ +mul.f16x2 r4013, r127, r4015; +} +{ +add.f16x2 r4016, r3992, r4013; +} +{ 
+cvt.rn.f16.f64 rs627, fd739; +} +mov.b32 r4021, {rs627, rs627}; +{ +mul.f16x2 r4019, r133, r4021; +} +{ +add.f16x2 r4022, r3998, r4019; +} +{ +cvt.rn.f16.f64 rs628, fd660; +} +mov.b32 r4027, {rs628, rs628}; +{ +mul.f16x2 r4025, r142, r4027; +} +{ +add.f16x2 r4028, r4004, r4025; +} +{ +cvt.rn.f16.f64 rs629, fd739; +} +mov.b32 r4033, {rs629, rs629}; +{ +mul.f16x2 r4031, r136, r4033; +} +{ +add.f16x2 r4034, r4010, r4031; +} +{ +cvt.rn.f16.f64 rs630, fd660; +} +mov.b32 r4039, {rs630, rs630}; +{ +mul.f16x2 r4037, r139, r4039; +} +{ +add.f16x2 r4040, r4016, r4037; +} +{ +cvt.rn.f16.f64 rs631, fd743; +} +mov.b32 r4045, {rs631, rs631}; +{ +mul.f16x2 r4043, r145, r4045; +} +{ +add.f16x2 r4046, r4022, r4043; +} +{ +cvt.rn.f16.f64 rs632, fd744; +} +mov.b32 r4051, {rs632, rs632}; +{ +mul.f16x2 r4049, r154, r4051; +} +{ +add.f16x2 r4052, r4028, r4049; +} +{ +cvt.rn.f16.f64 rs633, fd743; +} +mov.b32 r4057, {rs633, rs633}; +{ +mul.f16x2 r4055, r148, r4057; +} +{ +add.f16x2 r4058, r4034, r4055; +} +{ +cvt.rn.f16.f64 rs634, fd744; +} +mov.b32 r4063, {rs634, rs634}; +{ +mul.f16x2 r4061, r151, r4063; +} +{ +add.f16x2 r4064, r4040, r4061; +} +{ +cvt.rn.f16.f64 rs635, fd771; +} +mov.b32 r4069, {rs635, rs635}; +{ +mul.f16x2 r4067, r157, r4069; +} +{ +add.f16x2 r4070, r4046, r4067; +} +{ +cvt.rn.f16.f64 rs636, fd772; +} +mov.b32 r4075, {rs636, rs636}; +{ +mul.f16x2 r4073, r166, r4075; +} +{ +add.f16x2 r4076, r4052, r4073; +} +{ +cvt.rn.f16.f64 rs637, fd771; +} +mov.b32 r4081, {rs637, rs637}; +{ +mul.f16x2 r4079, r160, r4081; +} +{ +add.f16x2 r4082, r4058, r4079; +} +{ +cvt.rn.f16.f64 rs638, fd772; +} +mov.b32 r4087, {rs638, rs638}; +{ +mul.f16x2 r4085, r163, r4087; +} +{ +add.f16x2 r4088, r4064, r4085; +} +{ +sub.f16x2 %22, r4070, r4076; +} +{ +add.f16x2 %23, r4082, r4088; +} +{ +add.f16x2 %36, r4070, r4076; +} +{ +sub.f16x2 %37, r4082, r4088; +} +cvt.rn.f16.s32 rs639, r4804; +mov.b32 r4115, {rs639, rs639}; +cvt.rn.f16.s32 rs640, r4804; +mov.b32 r4127, {rs640, rs640}; +{ +cvt.rn.f16.f64 
rs641, fd747; +} +mov.b32 r4107, {rs641, rs641}; +{ +mul.f16x2 r4105, r1, r4107; +} +{ +add.f16x2 r4108, %58, r4105; +} +{ +cvt.rn.f16.f64 rs642, fd748; +} +mov.b32 r4113, {rs642, rs642}; +{ +mul.f16x2 r4111, r10, r4113; +} +{ +add.f16x2 r4114, r4115, r4111; +} +{ +cvt.rn.f16.f64 rs643, fd747; +} +mov.b32 r4119, {rs643, rs643}; +{ +mul.f16x2 r4117, r4, r4119; +} +{ +add.f16x2 r4120, %59, r4117; +} +{ +cvt.rn.f16.f64 rs644, fd748; +} +mov.b32 r4125, {rs644, rs644}; +{ +mul.f16x2 r4123, r7, r4125; +} +{ +add.f16x2 r4126, r4127, r4123; +} +{ +cvt.rn.f16.f64 rs645, fd767; +} +mov.b32 r4131, {rs645, rs645}; +{ +mul.f16x2 r4129, r13, r4131; +} +{ +add.f16x2 r4132, r4108, r4129; +} +{ +cvt.rn.f16.f64 rs646, fd768; +} +mov.b32 r4137, {rs646, rs646}; +{ +mul.f16x2 r4135, r22, r4137; +} +{ +add.f16x2 r4138, r4114, r4135; +} +{ +cvt.rn.f16.f64 rs647, fd767; +} +mov.b32 r4143, {rs647, rs647}; +{ +mul.f16x2 r4141, r16, r4143; +} +{ +add.f16x2 r4144, r4120, r4141; +} +{ +cvt.rn.f16.f64 rs648, fd768; +} +mov.b32 r4149, {rs648, rs648}; +{ +mul.f16x2 r4147, r19, r4149; +} +{ +add.f16x2 r4150, r4126, r4147; +} +{ +cvt.rn.f16.f64 rs649, fd783; +} +mov.b32 r4155, {rs649, rs649}; +{ +mul.f16x2 r4153, r25, r4155; +} +{ +add.f16x2 r4156, r4132, r4153; +} +{ +cvt.rn.f16.f64 rs650, fd692; +} +mov.b32 r4161, {rs650, rs650}; +{ +mul.f16x2 r4159, r34, r4161; +} +{ +add.f16x2 r4162, r4138, r4159; +} +{ +cvt.rn.f16.f64 rs651, fd783; +} +mov.b32 r4167, {rs651, rs651}; +{ +mul.f16x2 r4165, r28, r4167; +} +{ +add.f16x2 r4168, r4144, r4165; +} +{ +cvt.rn.f16.f64 rs652, fd692; +} +mov.b32 r4173, {rs652, rs652}; +{ +mul.f16x2 r4171, r31, r4173; +} +{ +add.f16x2 r4174, r4150, r4171; +} +{ +cvt.rn.f16.f64 rs653, fd763; +} +mov.b32 r4179, {rs653, rs653}; +{ +mul.f16x2 r4177, r37, r4179; +} +{ +add.f16x2 r4180, r4156, r4177; +} +{ +cvt.rn.f16.f64 rs654, fd632; +} +mov.b32 r4185, {rs654, rs654}; +{ +mul.f16x2 r4183, r46, r4185; +} +{ +add.f16x2 r4186, r4162, r4183; +} +{ +cvt.rn.f16.f64 rs655, fd763; +} 
+mov.b32 r4191, {rs655, rs655}; +{ +mul.f16x2 r4189, r40, r4191; +} +{ +add.f16x2 r4192, r4168, r4189; +} +{ +cvt.rn.f16.f64 rs656, fd632; +} +mov.b32 r4197, {rs656, rs656}; +{ +mul.f16x2 r4195, r43, r4197; +} +{ +add.f16x2 r4198, r4174, r4195; +} +{ +cvt.rn.f16.f64 rs657, fd743; +} +mov.b32 r4203, {rs657, rs657}; +{ +mul.f16x2 r4201, r49, r4203; +} +{ +add.f16x2 r4204, r4180, r4201; +} +{ +cvt.rn.f16.f64 rs658, fd636; +} +mov.b32 r4209, {rs658, rs658}; +{ +mul.f16x2 r4207, r58, r4209; +} +{ +add.f16x2 r4210, r4186, r4207; +} +{ +cvt.rn.f16.f64 rs659, fd743; +} +mov.b32 r4215, {rs659, rs659}; +{ +mul.f16x2 r4213, r52, r4215; +} +{ +add.f16x2 r4216, r4192, r4213; +} +{ +cvt.rn.f16.f64 rs660, fd636; +} +mov.b32 r4221, {rs660, rs660}; +{ +mul.f16x2 r4219, r55, r4221; +} +{ +add.f16x2 r4222, r4198, r4219; +} +{ +cvt.rn.f16.f64 rs661, fd731; +} +mov.b32 r4227, {rs661, rs661}; +{ +mul.f16x2 r4225, r61, r4227; +} +{ +add.f16x2 r4228, r4204, r4225; +} +{ +cvt.rn.f16.f64 rs662, fd732; +} +mov.b32 r4233, {rs662, rs662}; +{ +mul.f16x2 r4231, r70, r4233; +} +{ +add.f16x2 r4234, r4210, r4231; +} +{ +cvt.rn.f16.f64 rs663, fd731; +} +mov.b32 r4239, {rs663, rs663}; +{ +mul.f16x2 r4237, r64, r4239; +} +{ +add.f16x2 r4240, r4216, r4237; +} +{ +cvt.rn.f16.f64 rs664, fd732; +} +mov.b32 r4245, {rs664, rs664}; +{ +mul.f16x2 r4243, r67, r4245; +} +{ +add.f16x2 r4246, r4222, r4243; +} +{ +cvt.rn.f16.f64 rs665, fd751; +} +mov.b32 r4251, {rs665, rs665}; +{ +mul.f16x2 r4249, r73, r4251; +} +{ +add.f16x2 r4252, r4228, r4249; +} +{ +cvt.rn.f16.f64 rs666, fd752; +} +mov.b32 r4257, {rs666, rs666}; +{ +mul.f16x2 r4255, r82, r4257; +} +{ +add.f16x2 r4258, r4234, r4255; +} +{ +cvt.rn.f16.f64 rs667, fd751; +} +mov.b32 r4263, {rs667, rs667}; +{ +mul.f16x2 r4261, r76, r4263; +} +{ +add.f16x2 r4264, r4240, r4261; +} +{ +cvt.rn.f16.f64 rs668, fd752; +} +mov.b32 r4269, {rs668, rs668}; +{ +mul.f16x2 r4267, r79, r4269; +} +{ +add.f16x2 r4270, r4246, r4267; +} +{ +cvt.rn.f16.f64 rs669, fd771; +} +mov.b32 
r4275, {rs669, rs669}; +{ +mul.f16x2 r4273, r85, r4275; +} +{ +add.f16x2 r4276, r4252, r4273; +} +{ +cvt.rn.f16.f64 rs670, fd772; +} +mov.b32 r4281, {rs670, rs670}; +{ +mul.f16x2 r4279, r94, r4281; +} +{ +add.f16x2 r4282, r4258, r4279; +} +{ +cvt.rn.f16.f64 rs671, fd771; +} +mov.b32 r4287, {rs671, rs671}; +{ +mul.f16x2 r4285, r88, r4287; +} +{ +add.f16x2 r4288, r4264, r4285; +} +{ +cvt.rn.f16.f64 rs672, fd772; +} +mov.b32 r4293, {rs672, rs672}; +{ +mul.f16x2 r4291, r91, r4293; +} +{ +add.f16x2 r4294, r4270, r4291; +} +{ +cvt.rn.f16.f64 rs673, fd779; +} +mov.b32 r4299, {rs673, rs673}; +{ +mul.f16x2 r4297, r97, r4299; +} +{ +add.f16x2 r4300, r4276, r4297; +} +{ +cvt.rn.f16.f64 rs674, fd652; +} +mov.b32 r4305, {rs674, rs674}; +{ +mul.f16x2 r4303, r106, r4305; +} +{ +add.f16x2 r4306, r4282, r4303; +} +{ +cvt.rn.f16.f64 rs675, fd779; +} +mov.b32 r4311, {rs675, rs675}; +{ +mul.f16x2 r4309, r100, r4311; +} +{ +add.f16x2 r4312, r4288, r4309; +} +{ +cvt.rn.f16.f64 rs676, fd652; +} +mov.b32 r4317, {rs676, rs676}; +{ +mul.f16x2 r4315, r103, r4317; +} +{ +add.f16x2 r4318, r4294, r4315; +} +{ +cvt.rn.f16.f64 rs677, fd759; +} +mov.b32 r4323, {rs677, rs677}; +{ +mul.f16x2 r4321, r109, r4323; +} +{ +add.f16x2 r4324, r4300, r4321; +} +{ +cvt.rn.f16.f64 rs678, fd700; +} +mov.b32 r4329, {rs678, rs678}; +{ +mul.f16x2 r4327, r118, r4329; +} +{ +add.f16x2 r4330, r4306, r4327; +} +{ +cvt.rn.f16.f64 rs679, fd759; +} +mov.b32 r4335, {rs679, rs679}; +{ +mul.f16x2 r4333, r112, r4335; +} +{ +add.f16x2 r4336, r4312, r4333; +} +{ +cvt.rn.f16.f64 rs680, fd700; +} +mov.b32 r4341, {rs680, rs680}; +{ +mul.f16x2 r4339, r115, r4341; +} +{ +add.f16x2 r4342, r4318, r4339; +} +{ +cvt.rn.f16.f64 rs681, fd739; +} +mov.b32 r4347, {rs681, rs681}; +{ +mul.f16x2 r4345, r121, r4347; +} +{ +add.f16x2 r4348, r4324, r4345; +} +{ +cvt.rn.f16.f64 rs682, fd660; +} +mov.b32 r4353, {rs682, rs682}; +{ +mul.f16x2 r4351, r130, r4353; +} +{ +add.f16x2 r4354, r4330, r4351; +} +{ +cvt.rn.f16.f64 rs683, fd739; +} +mov.b32 
r4359, {rs683, rs683}; +{ +mul.f16x2 r4357, r124, r4359; +} +{ +add.f16x2 r4360, r4336, r4357; +} +{ +cvt.rn.f16.f64 rs684, fd660; +} +mov.b32 r4365, {rs684, rs684}; +{ +mul.f16x2 r4363, r127, r4365; +} +{ +add.f16x2 r4366, r4342, r4363; +} +{ +cvt.rn.f16.f64 rs685, fd735; +} +mov.b32 r4371, {rs685, rs685}; +{ +mul.f16x2 r4369, r133, r4371; +} +{ +add.f16x2 r4372, r4348, r4369; +} +{ +cvt.rn.f16.f64 rs686, fd736; +} +mov.b32 r4377, {rs686, rs686}; +{ +mul.f16x2 r4375, r142, r4377; +} +{ +add.f16x2 r4378, r4354, r4375; +} +{ +cvt.rn.f16.f64 rs687, fd735; +} +mov.b32 r4383, {rs687, rs687}; +{ +mul.f16x2 r4381, r136, r4383; +} +{ +add.f16x2 r4384, r4360, r4381; +} +{ +cvt.rn.f16.f64 rs688, fd736; +} +mov.b32 r4389, {rs688, rs688}; +{ +mul.f16x2 r4387, r139, r4389; +} +{ +add.f16x2 r4390, r4366, r4387; +} +{ +cvt.rn.f16.f64 rs689, fd755; +} +mov.b32 r4395, {rs689, rs689}; +{ +mul.f16x2 r4393, r145, r4395; +} +{ +add.f16x2 r4396, r4372, r4393; +} +{ +cvt.rn.f16.f64 rs690, fd756; +} +mov.b32 r4401, {rs690, rs690}; +{ +mul.f16x2 r4399, r154, r4401; +} +{ +add.f16x2 r4402, r4378, r4399; +} +{ +cvt.rn.f16.f64 rs691, fd755; +} +mov.b32 r4407, {rs691, rs691}; +{ +mul.f16x2 r4405, r148, r4407; +} +{ +add.f16x2 r4408, r4384, r4405; +} +{ +cvt.rn.f16.f64 rs692, fd756; +} +mov.b32 r4413, {rs692, rs692}; +{ +mul.f16x2 r4411, r151, r4413; +} +{ +add.f16x2 r4414, r4390, r4411; +} +{ +cvt.rn.f16.f64 rs693, fd775; +} +mov.b32 r4419, {rs693, rs693}; +{ +mul.f16x2 r4417, r157, r4419; +} +{ +add.f16x2 r4420, r4396, r4417; +} +{ +cvt.rn.f16.f64 rs694, fd776; +} +mov.b32 r4425, {rs694, rs694}; +{ +mul.f16x2 r4423, r166, r4425; +} +{ +add.f16x2 r4426, r4402, r4423; +} +{ +cvt.rn.f16.f64 rs695, fd775; +} +mov.b32 r4431, {rs695, rs695}; +{ +mul.f16x2 r4429, r160, r4431; +} +{ +add.f16x2 r4432, r4408, r4429; +} +{ +cvt.rn.f16.f64 rs696, fd776; +} +mov.b32 r4437, {rs696, rs696}; +{ +mul.f16x2 r4435, r163, r4437; +} +{ +add.f16x2 r4438, r4414, r4435; +} +{ +sub.f16x2 %24, r4420, r4426; +} +{ 
+add.f16x2 %25, r4432, r4438; +} +{ +add.f16x2 %34, r4420, r4426; +} +{ +sub.f16x2 %35, r4432, r4438; +} +cvt.rn.f16.s32 rs697, r4804; +mov.b32 r4465, {rs697, rs697}; +cvt.rn.f16.s32 rs698, r4804; +mov.b32 r4477, {rs698, rs698}; +{ +cvt.rn.f16.f64 rs699, fd739; +} +mov.b32 r4457, {rs699, rs699}; +{ +mul.f16x2 r4455, r1, r4457; +} +{ +add.f16x2 r4458, %58, r4455; +} +{ +cvt.rn.f16.f64 rs700, fd740; +} +mov.b32 r4463, {rs700, rs700}; +{ +mul.f16x2 r4461, r10, r4463; +} +{ +add.f16x2 r4464, r4465, r4461; +} +{ +cvt.rn.f16.f64 rs701, fd739; +} +mov.b32 r4469, {rs701, rs701}; +{ +mul.f16x2 r4467, r4, r4469; +} +{ +add.f16x2 r4470, %59, r4467; +} +{ +cvt.rn.f16.f64 rs702, fd740; +} +mov.b32 r4475, {rs702, rs702}; +{ +mul.f16x2 r4473, r7, r4475; +} +{ +add.f16x2 r4476, r4477, r4473; +} +{ +cvt.rn.f16.f64 rs703, fd751; +} +mov.b32 r4481, {rs703, rs703}; +{ +mul.f16x2 r4479, r13, r4481; +} +{ +add.f16x2 r4482, r4458, r4479; +} +{ +cvt.rn.f16.f64 rs704, fd752; +} +mov.b32 r4487, {rs704, rs704}; +{ +mul.f16x2 r4485, r22, r4487; +} +{ +add.f16x2 r4488, r4464, r4485; +} +{ +cvt.rn.f16.f64 rs705, fd751; +} +mov.b32 r4493, {rs705, rs705}; +{ +mul.f16x2 r4491, r16, r4493; +} +{ +add.f16x2 r4494, r4470, r4491; +} +{ +cvt.rn.f16.f64 rs706, fd752; +} +mov.b32 r4499, {rs706, rs706}; +{ +mul.f16x2 r4497, r19, r4499; +} +{ +add.f16x2 r4500, r4476, r4497; +} +{ +cvt.rn.f16.f64 rs707, fd763; +} +mov.b32 r4505, {rs707, rs707}; +{ +mul.f16x2 r4503, r25, r4505; +} +{ +add.f16x2 r4506, r4482, r4503; +} +{ +cvt.rn.f16.f64 rs708, fd764; +} +mov.b32 r4511, {rs708, rs708}; +{ +mul.f16x2 r4509, r34, r4511; +} +{ +add.f16x2 r4512, r4488, r4509; +} +{ +cvt.rn.f16.f64 rs709, fd763; +} +mov.b32 r4517, {rs709, rs709}; +{ +mul.f16x2 r4515, r28, r4517; +} +{ +add.f16x2 r4518, r4494, r4515; +} +{ +cvt.rn.f16.f64 rs710, fd764; +} +mov.b32 r4523, {rs710, rs710}; +{ +mul.f16x2 r4521, r31, r4523; +} +{ +add.f16x2 r4524, r4500, r4521; +} +{ +cvt.rn.f16.f64 rs711, fd775; +} +mov.b32 r4529, {rs711, rs711}; +{ 
+mul.f16x2 r4527, r37, r4529; +} +{ +add.f16x2 r4530, r4506, r4527; +} +{ +cvt.rn.f16.f64 rs712, fd776; +} +mov.b32 r4535, {rs712, rs712}; +{ +mul.f16x2 r4533, r46, r4535; +} +{ +add.f16x2 r4536, r4512, r4533; +} +{ +cvt.rn.f16.f64 rs713, fd775; +} +mov.b32 r4541, {rs713, rs713}; +{ +mul.f16x2 r4539, r40, r4541; +} +{ +add.f16x2 r4542, r4518, r4539; +} +{ +cvt.rn.f16.f64 rs714, fd776; +} +mov.b32 r4547, {rs714, rs714}; +{ +mul.f16x2 r4545, r43, r4547; +} +{ +add.f16x2 r4548, r4524, r4545; +} +{ +cvt.rn.f16.f64 rs715, fd783; +} +mov.b32 r4553, {rs715, rs715}; +{ +mul.f16x2 r4551, r49, r4553; +} +{ +add.f16x2 r4554, r4530, r4551; +} +{ +cvt.rn.f16.f64 rs716, fd692; +} +mov.b32 r4559, {rs716, rs716}; +{ +mul.f16x2 r4557, r58, r4559; +} +{ +add.f16x2 r4560, r4536, r4557; +} +{ +cvt.rn.f16.f64 rs717, fd783; +} +mov.b32 r4565, {rs717, rs717}; +{ +mul.f16x2 r4563, r52, r4565; +} +{ +add.f16x2 r4566, r4542, r4563; +} +{ +cvt.rn.f16.f64 rs718, fd692; +} +mov.b32 r4571, {rs718, rs718}; +{ +mul.f16x2 r4569, r55, r4571; +} +{ +add.f16x2 r4572, r4548, r4569; +} +{ +cvt.rn.f16.f64 rs719, fd771; +} +mov.b32 r4577, {rs719, rs719}; +{ +mul.f16x2 r4575, r61, r4577; +} +{ +add.f16x2 r4578, r4554, r4575; +} +{ +cvt.rn.f16.f64 rs720, fd696; +} +mov.b32 r4583, {rs720, rs720}; +{ +mul.f16x2 r4581, r70, r4583; +} +{ +add.f16x2 r4584, r4560, r4581; +} +{ +cvt.rn.f16.f64 rs721, fd771; +} +mov.b32 r4589, {rs721, rs721}; +{ +mul.f16x2 r4587, r64, r4589; +} +{ +add.f16x2 r4590, r4566, r4587; +} +{ +cvt.rn.f16.f64 rs722, fd696; +} +mov.b32 r4595, {rs722, rs722}; +{ +mul.f16x2 r4593, r67, r4595; +} +{ +add.f16x2 r4596, r4572, r4593; +} +{ +cvt.rn.f16.f64 rs723, fd759; +} +mov.b32 r4601, {rs723, rs723}; +{ +mul.f16x2 r4599, r73, r4601; +} +{ +add.f16x2 r4602, r4578, r4599; +} +{ +cvt.rn.f16.f64 rs724, fd700; +} +mov.b32 r4607, {rs724, rs724}; +{ +mul.f16x2 r4605, r82, r4607; +} +{ +add.f16x2 r4608, r4584, r4605; +} +{ +cvt.rn.f16.f64 rs725, fd759; +} +mov.b32 r4613, {rs725, rs725}; +{ +mul.f16x2 
r4611, r76, r4613; +} +{ +add.f16x2 r4614, r4590, r4611; +} +{ +cvt.rn.f16.f64 rs726, fd700; +} +mov.b32 r4619, {rs726, rs726}; +{ +mul.f16x2 r4617, r79, r4619; +} +{ +add.f16x2 r4620, r4596, r4617; +} +{ +cvt.rn.f16.f64 rs727, fd747; +} +mov.b32 r4625, {rs727, rs727}; +{ +mul.f16x2 r4623, r85, r4625; +} +{ +add.f16x2 r4626, r4602, r4623; +} +{ +cvt.rn.f16.f64 rs728, fd704; +} +mov.b32 r4631, {rs728, rs728}; +{ +mul.f16x2 r4629, r94, r4631; +} +{ +add.f16x2 r4632, r4608, r4629; +} +{ +cvt.rn.f16.f64 rs729, fd747; +} +mov.b32 r4637, {rs729, rs729}; +{ +mul.f16x2 r4635, r88, r4637; +} +{ +add.f16x2 r4638, r4614, r4635; +} +{ +cvt.rn.f16.f64 rs730, fd704; +} +mov.b32 r4643, {rs730, rs730}; +{ +mul.f16x2 r4641, r91, r4643; +} +{ +add.f16x2 r4644, r4620, r4641; +} +{ +cvt.rn.f16.f64 rs731, fd735; +} +mov.b32 r4649, {rs731, rs731}; +{ +mul.f16x2 r4647, r97, r4649; +} +{ +add.f16x2 r4650, r4626, r4647; +} +{ +cvt.rn.f16.f64 rs732, fd708; +} +mov.b32 r4655, {rs732, rs732}; +{ +mul.f16x2 r4653, r106, r4655; +} +{ +add.f16x2 r4656, r4632, r4653; +} +{ +cvt.rn.f16.f64 rs733, fd735; +} +mov.b32 r4661, {rs733, rs733}; +{ +mul.f16x2 r4659, r100, r4661; +} +{ +add.f16x2 r4662, r4638, r4659; +} +{ +cvt.rn.f16.f64 rs734, fd708; +} +mov.b32 r4667, {rs734, rs734}; +{ +mul.f16x2 r4665, r103, r4667; +} +{ +add.f16x2 r4668, r4644, r4665; +} +{ +cvt.rn.f16.f64 rs735, fd731; +} +mov.b32 r4673, {rs735, rs735}; +{ +mul.f16x2 r4671, r109, r4673; +} +{ +add.f16x2 r4674, r4650, r4671; +} +{ +cvt.rn.f16.f64 rs736, fd732; +} +mov.b32 r4679, {rs736, rs736}; +{ +mul.f16x2 r4677, r118, r4679; +} +{ +add.f16x2 r4680, r4656, r4677; +} +{ +cvt.rn.f16.f64 rs737, fd731; +} +mov.b32 r4685, {rs737, rs737}; +{ +mul.f16x2 r4683, r112, r4685; +} +{ +add.f16x2 r4686, r4662, r4683; +} +{ +cvt.rn.f16.f64 rs738, fd732; +} +mov.b32 r4691, {rs738, rs738}; +{ +mul.f16x2 r4689, r115, r4691; +} +{ +add.f16x2 r4692, r4668, r4689; +} +{ +cvt.rn.f16.f64 rs739, fd743; +} +mov.b32 r4697, {rs739, rs739}; +{ +mul.f16x2 
r4695, r121, r4697; +} +{ +add.f16x2 r4698, r4674, r4695; +} +{ +cvt.rn.f16.f64 rs740, fd744; +} +mov.b32 r4703, {rs740, rs740}; +{ +mul.f16x2 r4701, r130, r4703; +} +{ +add.f16x2 r4704, r4680, r4701; +} +{ +cvt.rn.f16.f64 rs741, fd743; +} +mov.b32 r4709, {rs741, rs741}; +{ +mul.f16x2 r4707, r124, r4709; +} +{ +add.f16x2 r4710, r4686, r4707; +} +{ +cvt.rn.f16.f64 rs742, fd744; +} +mov.b32 r4715, {rs742, rs742}; +{ +mul.f16x2 r4713, r127, r4715; +} +{ +add.f16x2 r4716, r4692, r4713; +} +{ +cvt.rn.f16.f64 rs743, fd755; +} +mov.b32 r4721, {rs743, rs743}; +{ +mul.f16x2 r4719, r133, r4721; +} +{ +add.f16x2 r4722, r4698, r4719; +} +{ +cvt.rn.f16.f64 rs744, fd756; +} +mov.b32 r4727, {rs744, rs744}; +{ +mul.f16x2 r4725, r142, r4727; +} +{ +add.f16x2 r4728, r4704, r4725; +} +{ +cvt.rn.f16.f64 rs745, fd755; +} +mov.b32 r4733, {rs745, rs745}; +{ +mul.f16x2 r4731, r136, r4733; +} +{ +add.f16x2 r4734, r4710, r4731; +} +{ +cvt.rn.f16.f64 rs746, fd756; +} +mov.b32 r4739, {rs746, rs746}; +{ +mul.f16x2 r4737, r139, r4739; +} +{ +add.f16x2 r4740, r4716, r4737; +} +{ +cvt.rn.f16.f64 rs747, fd767; +} +mov.b32 r4745, {rs747, rs747}; +{ +mul.f16x2 r4743, r145, r4745; +} +{ +add.f16x2 r4746, r4722, r4743; +} +{ +cvt.rn.f16.f64 rs748, fd768; +} +mov.b32 r4751, {rs748, rs748}; +{ +mul.f16x2 r4749, r154, r4751; +} +{ +add.f16x2 r4752, r4728, r4749; +} +{ +cvt.rn.f16.f64 rs749, fd767; +} +mov.b32 r4757, {rs749, rs749}; +{ +mul.f16x2 r4755, r148, r4757; +} +{ +add.f16x2 r4758, r4734, r4755; +} +{ +cvt.rn.f16.f64 rs750, fd768; +} +mov.b32 r4763, {rs750, rs750}; +{ +mul.f16x2 r4761, r151, r4763; +} +{ +add.f16x2 r4764, r4740, r4761; +} +{ +cvt.rn.f16.f64 rs751, fd779; +} +mov.b32 r4769, {rs751, rs751}; +{ +mul.f16x2 r4767, r157, r4769; +} +{ +add.f16x2 r4770, r4746, r4767; +} +{ +cvt.rn.f16.f64 rs752, fd780; +} +mov.b32 r4775, {rs752, rs752}; +{ +mul.f16x2 r4773, r166, r4775; +} +{ +add.f16x2 r4776, r4752, r4773; +} +{ +cvt.rn.f16.f64 rs753, fd779; +} +mov.b32 r4781, {rs753, rs753}; +{ 
+mul.f16x2 r4779, r160, r4781; +} +{ +add.f16x2 r4782, r4758, r4779; +} +{ +cvt.rn.f16.f64 rs754, fd780; +} +mov.b32 r4787, {rs754, rs754}; +{ +mul.f16x2 r4785, r163, r4787; +} +{ +add.f16x2 r4788, r4764, r4785; +} +{ +sub.f16x2 %26, r4770, r4776; +} +{ +add.f16x2 %27, r4782, r4788; +} +{ +add.f16x2 %32, r4770, r4776; +} +{ +sub.f16x2 %33, r4782, r4788; +} +cvt.rn.f16.s32 rs755, r4804; +mov.b32 r4815, {rs755, rs755}; +cvt.rn.f16.s32 rs756, r4804; +mov.b32 r4827, {rs756, rs756}; +{ +cvt.rn.f16.f64 rs757, fd731; +} +mov.b32 r4807, {rs757, rs757}; +{ +mul.f16x2 r4805, r1, r4807; +} +{ +add.f16x2 r4808, %58, r4805; +} +{ +cvt.rn.f16.f64 rs758, fd732; +} +mov.b32 r4813, {rs758, rs758}; +{ +mul.f16x2 r4811, r10, r4813; +} +{ +add.f16x2 r4814, r4815, r4811; +} +{ +cvt.rn.f16.f64 rs759, fd731; +} +mov.b32 r4819, {rs759, rs759}; +{ +mul.f16x2 r4817, r4, r4819; +} +{ +add.f16x2 r4820, %59, r4817; +} +{ +cvt.rn.f16.f64 rs760, fd732; +} +mov.b32 r4825, {rs760, rs760}; +{ +mul.f16x2 r4823, r7, r4825; +} +{ +add.f16x2 r4826, r4827, r4823; +} +{ +cvt.rn.f16.f64 rs761, fd735; +} +mov.b32 r4831, {rs761, rs761}; +{ +mul.f16x2 r4829, r13, r4831; +} +{ +add.f16x2 r4832, r4808, r4829; +} +{ +cvt.rn.f16.f64 rs762, fd736; +} +mov.b32 r4837, {rs762, rs762}; +{ +mul.f16x2 r4835, r22, r4837; +} +{ +add.f16x2 r4838, r4814, r4835; +} +{ +cvt.rn.f16.f64 rs763, fd735; +} +mov.b32 r4843, {rs763, rs763}; +{ +mul.f16x2 r4841, r16, r4843; +} +{ +add.f16x2 r4844, r4820, r4841; +} +{ +cvt.rn.f16.f64 rs764, fd736; +} +mov.b32 r4849, {rs764, rs764}; +{ +mul.f16x2 r4847, r19, r4849; +} +{ +add.f16x2 r4850, r4826, r4847; +} +{ +cvt.rn.f16.f64 rs765, fd739; +} +mov.b32 r4855, {rs765, rs765}; +{ +mul.f16x2 r4853, r25, r4855; +} +{ +add.f16x2 r4856, r4832, r4853; +} +{ +cvt.rn.f16.f64 rs766, fd740; +} +mov.b32 r4861, {rs766, rs766}; +{ +mul.f16x2 r4859, r34, r4861; +} +{ +add.f16x2 r4862, r4838, r4859; +} +{ +cvt.rn.f16.f64 rs767, fd739; +} +mov.b32 r4867, {rs767, rs767}; +{ +mul.f16x2 r4865, r28, r4867; +} 
+{ +add.f16x2 r4868, r4844, r4865; +} +{ +cvt.rn.f16.f64 rs768, fd740; +} +mov.b32 r4873, {rs768, rs768}; +{ +mul.f16x2 r4871, r31, r4873; +} +{ +add.f16x2 r4874, r4850, r4871; +} +{ +cvt.rn.f16.f64 rs769, fd743; +} +mov.b32 r4879, {rs769, rs769}; +{ +mul.f16x2 r4877, r37, r4879; +} +{ +add.f16x2 r4880, r4856, r4877; +} +{ +cvt.rn.f16.f64 rs770, fd744; +} +mov.b32 r4885, {rs770, rs770}; +{ +mul.f16x2 r4883, r46, r4885; +} +{ +add.f16x2 r4886, r4862, r4883; +} +{ +cvt.rn.f16.f64 rs771, fd743; +} +mov.b32 r4891, {rs771, rs771}; +{ +mul.f16x2 r4889, r40, r4891; +} +{ +add.f16x2 r4892, r4868, r4889; +} +{ +cvt.rn.f16.f64 rs772, fd744; +} +mov.b32 r4897, {rs772, rs772}; +{ +mul.f16x2 r4895, r43, r4897; +} +{ +add.f16x2 r4898, r4874, r4895; +} +{ +cvt.rn.f16.f64 rs773, fd747; +} +mov.b32 r4903, {rs773, rs773}; +{ +mul.f16x2 r4901, r49, r4903; +} +{ +add.f16x2 r4904, r4880, r4901; +} +{ +cvt.rn.f16.f64 rs774, fd748; +} +mov.b32 r4909, {rs774, rs774}; +{ +mul.f16x2 r4907, r58, r4909; +} +{ +add.f16x2 r4910, r4886, r4907; +} +{ +cvt.rn.f16.f64 rs775, fd747; +} +mov.b32 r4915, {rs775, rs775}; +{ +mul.f16x2 r4913, r52, r4915; +} +{ +add.f16x2 r4916, r4892, r4913; +} +{ +cvt.rn.f16.f64 rs776, fd748; +} +mov.b32 r4921, {rs776, rs776}; +{ +mul.f16x2 r4919, r55, r4921; +} +{ +add.f16x2 r4922, r4898, r4919; +} +{ +cvt.rn.f16.f64 rs777, fd751; +} +mov.b32 r4927, {rs777, rs777}; +{ +mul.f16x2 r4925, r61, r4927; +} +{ +add.f16x2 r4928, r4904, r4925; +} +{ +cvt.rn.f16.f64 rs778, fd752; +} +mov.b32 r4933, {rs778, rs778}; +{ +mul.f16x2 r4931, r70, r4933; +} +{ +add.f16x2 r4934, r4910, r4931; +} +{ +cvt.rn.f16.f64 rs779, fd751; +} +mov.b32 r4939, {rs779, rs779}; +{ +mul.f16x2 r4937, r64, r4939; +} +{ +add.f16x2 r4940, r4916, r4937; +} +{ +cvt.rn.f16.f64 rs780, fd752; +} +mov.b32 r4945, {rs780, rs780}; +{ +mul.f16x2 r4943, r67, r4945; +} +{ +add.f16x2 r4946, r4922, r4943; +} +{ +cvt.rn.f16.f64 rs781, fd755; +} +mov.b32 r4951, {rs781, rs781}; +{ +mul.f16x2 r4949, r73, r4951; +} +{ 
+add.f16x2 r4952, r4928, r4949; +} +{ +cvt.rn.f16.f64 rs782, fd756; +} +mov.b32 r4957, {rs782, rs782}; +{ +mul.f16x2 r4955, r82, r4957; +} +{ +add.f16x2 r4958, r4934, r4955; +} +{ +cvt.rn.f16.f64 rs783, fd755; +} +mov.b32 r4963, {rs783, rs783}; +{ +mul.f16x2 r4961, r76, r4963; +} +{ +add.f16x2 r4964, r4940, r4961; +} +{ +cvt.rn.f16.f64 rs784, fd756; +} +mov.b32 r4969, {rs784, rs784}; +{ +mul.f16x2 r4967, r79, r4969; +} +{ +add.f16x2 r4970, r4946, r4967; +} +{ +cvt.rn.f16.f64 rs785, fd759; +} +mov.b32 r4975, {rs785, rs785}; +{ +mul.f16x2 r4973, r85, r4975; +} +{ +add.f16x2 r4976, r4952, r4973; +} +{ +cvt.rn.f16.f64 rs786, fd760; +} +mov.b32 r4981, {rs786, rs786}; +{ +mul.f16x2 r4979, r94, r4981; +} +{ +add.f16x2 r4982, r4958, r4979; +} +{ +cvt.rn.f16.f64 rs787, fd759; +} +mov.b32 r4987, {rs787, rs787}; +{ +mul.f16x2 r4985, r88, r4987; +} +{ +add.f16x2 r4988, r4964, r4985; +} +{ +cvt.rn.f16.f64 rs788, fd760; +} +mov.b32 r4993, {rs788, rs788}; +{ +mul.f16x2 r4991, r91, r4993; +} +{ +add.f16x2 r4994, r4970, r4991; +} +{ +cvt.rn.f16.f64 rs789, fd763; +} +mov.b32 r4999, {rs789, rs789}; +{ +mul.f16x2 r4997, r97, r4999; +} +{ +add.f16x2 r5000, r4976, r4997; +} +{ +cvt.rn.f16.f64 rs790, fd764; +} +mov.b32 r5005, {rs790, rs790}; +{ +mul.f16x2 r5003, r106, r5005; +} +{ +add.f16x2 r5006, r4982, r5003; +} +{ +cvt.rn.f16.f64 rs791, fd763; +} +mov.b32 r5011, {rs791, rs791}; +{ +mul.f16x2 r5009, r100, r5011; +} +{ +add.f16x2 r5012, r4988, r5009; +} +{ +cvt.rn.f16.f64 rs792, fd764; +} +mov.b32 r5017, {rs792, rs792}; +{ +mul.f16x2 r5015, r103, r5017; +} +{ +add.f16x2 r5018, r4994, r5015; +} +{ +cvt.rn.f16.f64 rs793, fd767; +} +mov.b32 r5023, {rs793, rs793}; +{ +mul.f16x2 r5021, r109, r5023; +} +{ +add.f16x2 r5024, r5000, r5021; +} +{ +cvt.rn.f16.f64 rs794, fd768; +} +mov.b32 r5029, {rs794, rs794}; +{ +mul.f16x2 r5027, r118, r5029; +} +{ +add.f16x2 r5030, r5006, r5027; +} +{ +cvt.rn.f16.f64 rs795, fd767; +} +mov.b32 r5035, {rs795, rs795}; +{ +mul.f16x2 r5033, r112, r5035; +} +{ 
+add.f16x2 r5036, r5012, r5033; +} +{ +cvt.rn.f16.f64 rs796, fd768; +} +mov.b32 r5041, {rs796, rs796}; +{ +mul.f16x2 r5039, r115, r5041; +} +{ +add.f16x2 r5042, r5018, r5039; +} +{ +cvt.rn.f16.f64 rs797, fd771; +} +mov.b32 r5047, {rs797, rs797}; +{ +mul.f16x2 r5045, r121, r5047; +} +{ +add.f16x2 r5048, r5024, r5045; +} +{ +cvt.rn.f16.f64 rs798, fd772; +} +mov.b32 r5053, {rs798, rs798}; +{ +mul.f16x2 r5051, r130, r5053; +} +{ +add.f16x2 r5054, r5030, r5051; +} +{ +cvt.rn.f16.f64 rs799, fd771; +} +mov.b32 r5059, {rs799, rs799}; +{ +mul.f16x2 r5057, r124, r5059; +} +{ +add.f16x2 r5060, r5036, r5057; +} +{ +cvt.rn.f16.f64 rs800, fd772; +} +mov.b32 r5065, {rs800, rs800}; +{ +mul.f16x2 r5063, r127, r5065; +} +{ +add.f16x2 r5066, r5042, r5063; +} +{ +cvt.rn.f16.f64 rs801, fd775; +} +mov.b32 r5071, {rs801, rs801}; +{ +mul.f16x2 r5069, r133, r5071; +} +{ +add.f16x2 r5072, r5048, r5069; +} +{ +cvt.rn.f16.f64 rs802, fd776; +} +mov.b32 r5077, {rs802, rs802}; +{ +mul.f16x2 r5075, r142, r5077; +} +{ +add.f16x2 r5078, r5054, r5075; +} +{ +cvt.rn.f16.f64 rs803, fd775; +} +mov.b32 r5083, {rs803, rs803}; +{ +mul.f16x2 r5081, r136, r5083; +} +{ +add.f16x2 r5084, r5060, r5081; +} +{ +cvt.rn.f16.f64 rs804, fd776; +} +mov.b32 r5089, {rs804, rs804}; +{ +mul.f16x2 r5087, r139, r5089; +} +{ +add.f16x2 r5090, r5066, r5087; +} +{ +cvt.rn.f16.f64 rs805, fd779; +} +mov.b32 r5095, {rs805, rs805}; +{ +mul.f16x2 r5093, r145, r5095; +} +{ +add.f16x2 r5096, r5072, r5093; +} +{ +cvt.rn.f16.f64 rs806, fd780; +} +mov.b32 r5101, {rs806, rs806}; +{ +mul.f16x2 r5099, r154, r5101; +} +{ +add.f16x2 r5102, r5078, r5099; +} +{ +cvt.rn.f16.f64 rs807, fd779; +} +mov.b32 r5107, {rs807, rs807}; +{ +mul.f16x2 r5105, r148, r5107; +} +{ +add.f16x2 r5108, r5084, r5105; +} +{ +cvt.rn.f16.f64 rs808, fd780; +} +mov.b32 r5113, {rs808, rs808}; +{ +mul.f16x2 r5111, r151, r5113; +} +{ +add.f16x2 r5114, r5090, r5111; +} +{ +cvt.rn.f16.f64 rs809, fd783; +} +mov.b32 r5119, {rs809, rs809}; +{ +mul.f16x2 r5117, r157, r5119; +} 
+{ +add.f16x2 r5120, r5096, r5117; +} +{ +cvt.rn.f16.f64 rs810, fd784; +} +mov.b32 r5125, {rs810, rs810}; +{ +mul.f16x2 r5123, r166, r5125; +} +{ +add.f16x2 r5126, r5102, r5123; +} +{ +cvt.rn.f16.f64 rs811, fd783; +} +mov.b32 r5131, {rs811, rs811}; +{ +mul.f16x2 r5129, r160, r5131; +} +{ +add.f16x2 r5132, r5108, r5129; +} +{ +cvt.rn.f16.f64 rs812, fd784; +} +mov.b32 r5137, {rs812, rs812}; +{ +mul.f16x2 r5135, r163, r5137; +} +{ +add.f16x2 r5138, r5114, r5135; +} +{ +sub.f16x2 %28, r5120, r5126; +} +{ +add.f16x2 %29, r5132, r5138; +} +{ +add.f16x2 %30, r5120, r5126; +} +{ +sub.f16x2 %31, r5132, r5138; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), 
"=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), 
"r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..2eb1a41b8e4c0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_fwd.hpp.inc @@ -0,0 +1,940 @@ +#ifndef CUFFTDX_FFT_29_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_29_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<13, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1133>; +.reg .b64 rd<4>; +add.f32 f117, %60, %114; +sub.f32 f119, %60, %114; +add.f32 f1131, %116, %115; +sub.f32 f120, %116, %115; +add.f32 f121, %62, %112; +sub.f32 f123, %62, %112; +add.f32 f1129, %117, %113; +sub.f32 f124, %117, %113; +add.f32 f125, %64, %110; +sub.f32 f127, %64, %110; +add.f32 f1127, %65, %118; +sub.f32 f128, %65, %118; +add.f32 f129, %66, %108; +sub.f32 f131, %66, %108; +add.f32 f1124, %120, %119; +sub.f32 f132, %120, %119; +add.f32 f133, %68, %106; +sub.f32 f135, %68, %106; +add.f32 f1122, %121, %107; +sub.f32 f136, %121, %107; +add.f32 f137, %70, %104; +sub.f32 f139, %70, %104; +add.f32 f1120, %71, %122; +sub.f32 f140, %71, %122; +add.f32 f141, %72, %102; +sub.f32 f143, %72, %102; +add.f32 f1117, %124, %123; +sub.f32 f144, %124, %123; +add.f32 f145, %74, %100; +sub.f32 f147, %74, %100; +add.f32 f1115, %125, %101; +sub.f32 
f148, %125, %101; +add.f32 f149, %76, %98; +sub.f32 f151, %76, %98; +add.f32 f1113, %77, %126; +sub.f32 f152, %77, %126; +add.f32 f153, %78, %96; +sub.f32 f155, %78, %96; +add.f32 f1110, %128, %127; +sub.f32 f156, %128, %127; +add.f32 f157, %80, %94; +sub.f32 f159, %80, %94; +add.f32 f1108, %129, %95; +sub.f32 f160, %129, %95; +add.f32 f161, %82, %92; +sub.f32 f163, %82, %92; +add.f32 f1106, %83, %130; +sub.f32 f164, %83, %130; +add.f32 f165, %84, %90; +sub.f32 f167, %84, %90; +add.f32 f1103, %132, %131; +sub.f32 f168, %132, %131; +add.f32 f169, %86, %88; +sub.f32 f171, %86, %88; +add.f32 f1101, %133, %89; +sub.f32 f172, %133, %89; +add.f32 f173, %58, f117; +add.f32 f175, f173, f121; +add.f32 f1100, %59, f1131; +add.f32 f176, f1100, f1129; +add.f32 f177, f175, f125; +add.f32 f178, f176, f1127; +add.f32 f179, f177, f129; +add.f32 f180, f178, f1124; +add.f32 f181, f179, f133; +add.f32 f182, f180, f1122; +add.f32 f183, f181, f137; +add.f32 f184, f182, f1120; +add.f32 f185, f183, f141; +add.f32 f186, f184, f1117; +add.f32 f187, f185, f145; +add.f32 f188, f186, f1115; +add.f32 f189, f187, f149; +add.f32 f190, f188, f1113; +add.f32 f191, f189, f153; +add.f32 f192, f190, f1110; +add.f32 f193, f191, f157; +add.f32 f194, f192, f1108; +add.f32 f195, f193, f161; +add.f32 f196, f194, f1106; +add.f32 f197, f195, f165; +add.f32 f198, f196, f1103; +fma.rn.f32 f199, f117, 0f3F7A03CE, %58; +fma.rn.f32 f203, f121, 0f3F6856DD, f199; +fma.rn.f32 f1099, f120, 0fBE5C2136, 0f00000000; +fma.rn.f32 f204, f124, 0fBED6FBB4, f1099; +fma.rn.f32 f1098, f1131, 0f3F7A03CE, %59; +fma.rn.f32 f205, f1129, 0f3F6856DD, f1098; +fma.rn.f32 f1097, f119, 0fBE5C2136, 0f00000000; +fma.rn.f32 f206, f123, 0fBED6FBB4, f1097; +fma.rn.f32 f207, f125, 0f3F4BCCC1, f203; +fma.rn.f32 f208, f128, 0fBF1AECB3, f204; +fma.rn.f32 f209, f1127, 0f3F4BCCC1, f205; +fma.rn.f32 f210, f127, 0fBF1AECB3, f206; +fma.rn.f32 f211, f129, 0f3F25BB1C, f207; +fma.rn.f32 f212, f132, 0fBF431D0D, f208; +fma.rn.f32 f213, f1124, 0f3F25BB1C, 
f209; +fma.rn.f32 f214, f131, 0fBF431D0D, f210; +fma.rn.f32 f215, f133, 0f3EEFD33B, f211; +fma.rn.f32 f216, f136, 0fBF622DD8, f212; +fma.rn.f32 f217, f1122, 0f3EEFD33B, f213; +fma.rn.f32 f218, f135, 0fBF622DD8, f214; +fma.rn.f32 f219, f137, 0f3E88F979, f215; +fma.rn.f32 f220, f140, 0fBF76AB36, f216; +fma.rn.f32 f221, f1120, 0f3E88F979, f217; +fma.rn.f32 f222, f139, 0fBF76AB36, f218; +fma.rn.f32 f223, f141, 0f3D5DC0C3, f219; +fma.rn.f32 f224, f144, 0fBF7F9FE3, f220; +fma.rn.f32 f225, f1117, 0f3D5DC0C3, f221; +fma.rn.f32 f226, f143, 0fBF7F9FE3, f222; +fma.rn.f32 f227, f145, 0fBE25AA2E, f223; +fma.rn.f32 f228, f148, 0fBF7CA0AA, f224; +fma.rn.f32 f229, f1115, 0fBE25AA2E, f225; +fma.rn.f32 f230, f147, 0fBF7CA0AA, f226; +fma.rn.f32 f231, f149, 0fBEBD82C0, f227; +fma.rn.f32 f232, f152, 0fBF6DD16B, f228; +fma.rn.f32 f233, f1113, 0fBEBD82C0, f229; +fma.rn.f32 f234, f151, 0fBF6DD16B, f230; +fma.rn.f32 f235, f153, 0fBF0FA9F5, f231; +fma.rn.f32 f236, f156, 0fBF53E36D, f232; +fma.rn.f32 f237, f1110, 0fBF0FA9F5, f233; +fma.rn.f32 f238, f155, 0fBF53E36D, f234; +fma.rn.f32 f239, f157, 0fBF39DAD7, f235; +fma.rn.f32 f240, f160, 0fBF300D12, f236; +fma.rn.f32 f241, f1108, 0fBF39DAD7, f237; +fma.rn.f32 f242, f159, 0fBF300D12, f238; +fma.rn.f32 f243, f161, 0fBF5B5AFE, f239; +fma.rn.f32 f244, f164, 0fBF03FB56, f240; +fma.rn.f32 f245, f1106, 0fBF5B5AFE, f241; +fma.rn.f32 f246, f163, 0fBF03FB56, f242; +fma.rn.f32 f247, f165, 0fBF729966, f243; +fma.rn.f32 f248, f168, 0fBEA37B7D, f244; +fma.rn.f32 f249, f1103, 0fBF729966, f245; +fma.rn.f32 f250, f167, 0fBEA37B7D, f246; +fma.rn.f32 f251, f169, 0fBF7E7FD3, f247; +fma.rn.f32 f252, f172, 0fBDDD6D81, f248; +fma.rn.f32 f253, f1101, 0fBF7E7FD3, f249; +fma.rn.f32 f254, f171, 0fBDDD6D81, f250; +fma.rn.f32 f255, f117, 0f3F6856DD, %58; +fma.rn.f32 f259, f121, 0f3F25BB1C, f255; +fma.rn.f32 f1096, f120, 0fBED6FBB4, 0f00000000; +fma.rn.f32 f260, f124, 0fBF431D0D, f1096; +fma.rn.f32 f1095, f1131, 0f3F6856DD, %59; +fma.rn.f32 f261, f1129, 0f3F25BB1C, f1095; 
+fma.rn.f32 f1094, f119, 0fBED6FBB4, 0f00000000; +fma.rn.f32 f262, f123, 0fBF431D0D, f1094; +fma.rn.f32 f263, f125, 0f3E88F979, f259; +fma.rn.f32 f264, f128, 0fBF76AB36, f260; +fma.rn.f32 f265, f1127, 0f3E88F979, f261; +fma.rn.f32 f266, f127, 0fBF76AB36, f262; +fma.rn.f32 f267, f129, 0fBE25AA2E, f263; +fma.rn.f32 f268, f132, 0fBF7CA0AA, f264; +fma.rn.f32 f269, f1124, 0fBE25AA2E, f265; +fma.rn.f32 f270, f131, 0fBF7CA0AA, f266; +fma.rn.f32 f271, f133, 0fBF0FA9F5, f267; +fma.rn.f32 f272, f136, 0fBF53E36D, f268; +fma.rn.f32 f273, f1122, 0fBF0FA9F5, f269; +fma.rn.f32 f274, f135, 0fBF53E36D, f270; +fma.rn.f32 f275, f137, 0fBF5B5AFE, f271; +fma.rn.f32 f276, f140, 0fBF03FB56, f272; +fma.rn.f32 f277, f1120, 0fBF5B5AFE, f273; +fma.rn.f32 f278, f139, 0fBF03FB56, f274; +fma.rn.f32 f279, f141, 0fBF7E7FD3, f275; +fma.rn.f32 f280, f144, 0fBDDD6D81, f276; +fma.rn.f32 f281, f1117, 0fBF7E7FD3, f277; +fma.rn.f32 f282, f143, 0fBDDD6D81, f278; +fma.rn.f32 f283, f145, 0fBF729966, f279; +fma.rn.f32 f284, f148, 0f3EA37B7D, f280; +fma.rn.f32 f285, f1115, 0fBF729966, f281; +fma.rn.f32 f286, f147, 0f3EA37B7D, f282; +fma.rn.f32 f287, f149, 0fBF39DAD7, f283; +fma.rn.f32 f288, f152, 0f3F300D12, f284; +fma.rn.f32 f289, f1113, 0fBF39DAD7, f285; +fma.rn.f32 f290, f151, 0f3F300D12, f286; +fma.rn.f32 f291, f153, 0fBEBD82C0, f287; +fma.rn.f32 f292, f156, 0f3F6DD16B, f288; +fma.rn.f32 f293, f1110, 0fBEBD82C0, f289; +fma.rn.f32 f294, f155, 0f3F6DD16B, f290; +fma.rn.f32 f295, f157, 0f3D5DC0C3, f291; +fma.rn.f32 f296, f160, 0f3F7F9FE3, f292; +fma.rn.f32 f297, f1108, 0f3D5DC0C3, f293; +fma.rn.f32 f298, f159, 0f3F7F9FE3, f294; +fma.rn.f32 f299, f161, 0f3EEFD33B, f295; +fma.rn.f32 f300, f164, 0f3F622DD8, f296; +fma.rn.f32 f301, f1106, 0f3EEFD33B, f297; +fma.rn.f32 f302, f163, 0f3F622DD8, f298; +fma.rn.f32 f303, f165, 0f3F4BCCC1, f299; +fma.rn.f32 f304, f168, 0f3F1AECB3, f300; +fma.rn.f32 f305, f1103, 0f3F4BCCC1, f301; +fma.rn.f32 f306, f167, 0f3F1AECB3, f302; +fma.rn.f32 f307, f169, 0f3F7A03CE, f303; 
+fma.rn.f32 f308, f172, 0f3E5C2136, f304; +fma.rn.f32 f309, f1101, 0f3F7A03CE, f305; +fma.rn.f32 f310, f171, 0f3E5C2136, f306; +fma.rn.f32 f311, f117, 0f3F4BCCC1, %58; +fma.rn.f32 f315, f121, 0f3E88F979, f311; +fma.rn.f32 f1093, f120, 0fBF1AECB3, 0f00000000; +fma.rn.f32 f316, f124, 0fBF76AB36, f1093; +fma.rn.f32 f1092, f1131, 0f3F4BCCC1, %59; +fma.rn.f32 f317, f1129, 0f3E88F979, f1092; +fma.rn.f32 f1091, f119, 0fBF1AECB3, 0f00000000; +fma.rn.f32 f318, f123, 0fBF76AB36, f1091; +fma.rn.f32 f319, f125, 0fBEBD82C0, f315; +fma.rn.f32 f320, f128, 0fBF6DD16B, f316; +fma.rn.f32 f321, f1127, 0fBEBD82C0, f317; +fma.rn.f32 f322, f127, 0fBF6DD16B, f318; +fma.rn.f32 f323, f129, 0fBF5B5AFE, f319; +fma.rn.f32 f324, f132, 0fBF03FB56, f320; +fma.rn.f32 f325, f1124, 0fBF5B5AFE, f321; +fma.rn.f32 f326, f131, 0fBF03FB56, f322; +fma.rn.f32 f327, f133, 0fBF7E7FD3, f323; +fma.rn.f32 f328, f136, 0f3DDD6D81, f324; +fma.rn.f32 f329, f1122, 0fBF7E7FD3, f325; +fma.rn.f32 f330, f135, 0f3DDD6D81, f326; +fma.rn.f32 f331, f137, 0fBF39DAD7, f327; +fma.rn.f32 f332, f140, 0f3F300D12, f328; +fma.rn.f32 f333, f1120, 0fBF39DAD7, f329; +fma.rn.f32 f334, f139, 0f3F300D12, f330; +fma.rn.f32 f335, f141, 0fBE25AA2E, f331; +fma.rn.f32 f336, f144, 0f3F7CA0AA, f332; +fma.rn.f32 f337, f1117, 0fBE25AA2E, f333; +fma.rn.f32 f338, f143, 0f3F7CA0AA, f334; +fma.rn.f32 f339, f145, 0f3EEFD33B, f335; +fma.rn.f32 f340, f148, 0f3F622DD8, f336; +fma.rn.f32 f341, f1115, 0f3EEFD33B, f337; +fma.rn.f32 f342, f147, 0f3F622DD8, f338; +fma.rn.f32 f343, f149, 0f3F6856DD, f339; +fma.rn.f32 f344, f152, 0f3ED6FBB4, f340; +fma.rn.f32 f345, f1113, 0f3F6856DD, f341; +fma.rn.f32 f346, f151, 0f3ED6FBB4, f342; +fma.rn.f32 f347, f153, 0f3F7A03CE, f343; +fma.rn.f32 f348, f156, 0fBE5C2136, f344; +fma.rn.f32 f349, f1110, 0f3F7A03CE, f345; +fma.rn.f32 f350, f155, 0fBE5C2136, f346; +fma.rn.f32 f351, f157, 0f3F25BB1C, f347; +fma.rn.f32 f352, f160, 0fBF431D0D, f348; +fma.rn.f32 f353, f1108, 0f3F25BB1C, f349; +fma.rn.f32 f354, f159, 0fBF431D0D, 
f350; +fma.rn.f32 f355, f161, 0f3D5DC0C3, f351; +fma.rn.f32 f356, f164, 0fBF7F9FE3, f352; +fma.rn.f32 f357, f1106, 0f3D5DC0C3, f353; +fma.rn.f32 f358, f163, 0fBF7F9FE3, f354; +fma.rn.f32 f359, f165, 0fBF0FA9F5, f355; +fma.rn.f32 f360, f168, 0fBF53E36D, f356; +fma.rn.f32 f361, f1103, 0fBF0FA9F5, f357; +fma.rn.f32 f362, f167, 0fBF53E36D, f358; +fma.rn.f32 f363, f169, 0fBF729966, f359; +fma.rn.f32 f364, f172, 0fBEA37B7D, f360; +fma.rn.f32 f365, f1101, 0fBF729966, f361; +fma.rn.f32 f366, f171, 0fBEA37B7D, f362; +fma.rn.f32 f367, f117, 0f3F25BB1C, %58; +fma.rn.f32 f371, f121, 0fBE25AA2E, f367; +fma.rn.f32 f1090, f120, 0fBF431D0D, 0f00000000; +fma.rn.f32 f372, f124, 0fBF7CA0AA, f1090; +fma.rn.f32 f1089, f1131, 0f3F25BB1C, %59; +fma.rn.f32 f373, f1129, 0fBE25AA2E, f1089; +fma.rn.f32 f1088, f119, 0fBF431D0D, 0f00000000; +fma.rn.f32 f374, f123, 0fBF7CA0AA, f1088; +fma.rn.f32 f375, f125, 0fBF5B5AFE, f371; +fma.rn.f32 f376, f128, 0fBF03FB56, f372; +fma.rn.f32 f377, f1127, 0fBF5B5AFE, f373; +fma.rn.f32 f378, f127, 0fBF03FB56, f374; +fma.rn.f32 f379, f129, 0fBF729966, f375; +fma.rn.f32 f380, f132, 0f3EA37B7D, f376; +fma.rn.f32 f381, f1124, 0fBF729966, f377; +fma.rn.f32 f382, f131, 0f3EA37B7D, f378; +fma.rn.f32 f383, f133, 0fBEBD82C0, f379; +fma.rn.f32 f384, f136, 0f3F6DD16B, f380; +fma.rn.f32 f385, f1122, 0fBEBD82C0, f381; +fma.rn.f32 f386, f135, 0f3F6DD16B, f382; +fma.rn.f32 f387, f137, 0f3EEFD33B, f383; +fma.rn.f32 f388, f140, 0f3F622DD8, f384; +fma.rn.f32 f389, f1120, 0f3EEFD33B, f385; +fma.rn.f32 f390, f139, 0f3F622DD8, f386; +fma.rn.f32 f391, f141, 0f3F7A03CE, f387; +fma.rn.f32 f392, f144, 0f3E5C2136, f388; +fma.rn.f32 f393, f1117, 0f3F7A03CE, f389; +fma.rn.f32 f394, f143, 0f3E5C2136, f390; +fma.rn.f32 f395, f145, 0f3F4BCCC1, f391; +fma.rn.f32 f396, f148, 0fBF1AECB3, f392; +fma.rn.f32 f397, f1115, 0f3F4BCCC1, f393; +fma.rn.f32 f398, f147, 0fBF1AECB3, f394; +fma.rn.f32 f399, f149, 0f3D5DC0C3, f395; +fma.rn.f32 f400, f152, 0fBF7F9FE3, f396; +fma.rn.f32 f401, f1113, 
0f3D5DC0C3, f397; +fma.rn.f32 f402, f151, 0fBF7F9FE3, f398; +fma.rn.f32 f403, f153, 0fBF39DAD7, f399; +fma.rn.f32 f404, f156, 0fBF300D12, f400; +fma.rn.f32 f405, f1110, 0fBF39DAD7, f401; +fma.rn.f32 f406, f155, 0fBF300D12, f402; +fma.rn.f32 f407, f157, 0fBF7E7FD3, f403; +fma.rn.f32 f408, f160, 0f3DDD6D81, f404; +fma.rn.f32 f409, f1108, 0fBF7E7FD3, f405; +fma.rn.f32 f410, f159, 0f3DDD6D81, f406; +fma.rn.f32 f411, f161, 0fBF0FA9F5, f407; +fma.rn.f32 f412, f164, 0f3F53E36D, f408; +fma.rn.f32 f413, f1106, 0fBF0FA9F5, f409; +fma.rn.f32 f414, f163, 0f3F53E36D, f410; +fma.rn.f32 f415, f165, 0f3E88F979, f411; +fma.rn.f32 f416, f168, 0f3F76AB36, f412; +fma.rn.f32 f417, f1103, 0f3E88F979, f413; +fma.rn.f32 f418, f167, 0f3F76AB36, f414; +fma.rn.f32 f419, f169, 0f3F6856DD, f415; +fma.rn.f32 f420, f172, 0f3ED6FBB4, f416; +fma.rn.f32 f421, f1101, 0f3F6856DD, f417; +fma.rn.f32 f422, f171, 0f3ED6FBB4, f418; +fma.rn.f32 f423, f117, 0f3EEFD33B, %58; +fma.rn.f32 f427, f121, 0fBF0FA9F5, f423; +fma.rn.f32 f1087, f120, 0fBF622DD8, 0f00000000; +fma.rn.f32 f428, f124, 0fBF53E36D, f1087; +fma.rn.f32 f1086, f1131, 0f3EEFD33B, %59; +fma.rn.f32 f429, f1129, 0fBF0FA9F5, f1086; +fma.rn.f32 f1085, f119, 0fBF622DD8, 0f00000000; +fma.rn.f32 f430, f123, 0fBF53E36D, f1085; +fma.rn.f32 f431, f125, 0fBF7E7FD3, f427; +fma.rn.f32 f432, f128, 0f3DDD6D81, f428; +fma.rn.f32 f433, f1127, 0fBF7E7FD3, f429; +fma.rn.f32 f434, f127, 0f3DDD6D81, f430; +fma.rn.f32 f435, f129, 0fBEBD82C0, f431; +fma.rn.f32 f436, f132, 0f3F6DD16B, f432; +fma.rn.f32 f437, f1124, 0fBEBD82C0, f433; +fma.rn.f32 f438, f131, 0f3F6DD16B, f434; +fma.rn.f32 f439, f133, 0f3F25BB1C, f435; +fma.rn.f32 f440, f136, 0f3F431D0D, f436; +fma.rn.f32 f441, f1122, 0f3F25BB1C, f437; +fma.rn.f32 f442, f135, 0f3F431D0D, f438; +fma.rn.f32 f443, f137, 0f3F7A03CE, f439; +fma.rn.f32 f444, f140, 0fBE5C2136, f440; +fma.rn.f32 f445, f1120, 0f3F7A03CE, f441; +fma.rn.f32 f446, f139, 0fBE5C2136, f442; +fma.rn.f32 f447, f141, 0f3E88F979, f443; +fma.rn.f32 f448, 
f144, 0fBF76AB36, f444; +fma.rn.f32 f449, f1117, 0f3E88F979, f445; +fma.rn.f32 f450, f143, 0fBF76AB36, f446; +fma.rn.f32 f451, f145, 0fBF39DAD7, f447; +fma.rn.f32 f452, f148, 0fBF300D12, f448; +fma.rn.f32 f453, f1115, 0fBF39DAD7, f449; +fma.rn.f32 f454, f147, 0fBF300D12, f450; +fma.rn.f32 f455, f149, 0fBF729966, f451; +fma.rn.f32 f456, f152, 0f3EA37B7D, f452; +fma.rn.f32 f457, f1113, 0fBF729966, f453; +fma.rn.f32 f458, f151, 0f3EA37B7D, f454; +fma.rn.f32 f459, f153, 0fBE25AA2E, f455; +fma.rn.f32 f460, f156, 0f3F7CA0AA, f456; +fma.rn.f32 f461, f1110, 0fBE25AA2E, f457; +fma.rn.f32 f462, f155, 0f3F7CA0AA, f458; +fma.rn.f32 f463, f157, 0f3F4BCCC1, f459; +fma.rn.f32 f464, f160, 0f3F1AECB3, f460; +fma.rn.f32 f465, f1108, 0f3F4BCCC1, f461; +fma.rn.f32 f466, f159, 0f3F1AECB3, f462; +fma.rn.f32 f467, f161, 0f3F6856DD, f463; +fma.rn.f32 f468, f164, 0fBED6FBB4, f464; +fma.rn.f32 f469, f1106, 0f3F6856DD, f465; +fma.rn.f32 f470, f163, 0fBED6FBB4, f466; +fma.rn.f32 f471, f165, 0f3D5DC0C3, f467; +fma.rn.f32 f472, f168, 0fBF7F9FE3, f468; +fma.rn.f32 f473, f1103, 0f3D5DC0C3, f469; +fma.rn.f32 f474, f167, 0fBF7F9FE3, f470; +fma.rn.f32 f475, f169, 0fBF5B5AFE, f471; +fma.rn.f32 f476, f172, 0fBF03FB56, f472; +fma.rn.f32 f477, f1101, 0fBF5B5AFE, f473; +fma.rn.f32 f478, f171, 0fBF03FB56, f474; +fma.rn.f32 f479, f117, 0f3E88F979, %58; +fma.rn.f32 f483, f121, 0fBF5B5AFE, f479; +fma.rn.f32 f1084, f120, 0fBF76AB36, 0f00000000; +fma.rn.f32 f484, f124, 0fBF03FB56, f1084; +fma.rn.f32 f1083, f1131, 0f3E88F979, %59; +fma.rn.f32 f485, f1129, 0fBF5B5AFE, f1083; +fma.rn.f32 f1082, f119, 0fBF76AB36, 0f00000000; +fma.rn.f32 f486, f123, 0fBF03FB56, f1082; +fma.rn.f32 f487, f125, 0fBF39DAD7, f483; +fma.rn.f32 f488, f128, 0f3F300D12, f484; +fma.rn.f32 f489, f1127, 0fBF39DAD7, f485; +fma.rn.f32 f490, f127, 0f3F300D12, f486; +fma.rn.f32 f491, f129, 0f3EEFD33B, f487; +fma.rn.f32 f492, f132, 0f3F622DD8, f488; +fma.rn.f32 f493, f1124, 0f3EEFD33B, f489; +fma.rn.f32 f494, f131, 0f3F622DD8, f490; +fma.rn.f32 
f495, f133, 0f3F7A03CE, f491; +fma.rn.f32 f496, f136, 0fBE5C2136, f492; +fma.rn.f32 f497, f1122, 0f3F7A03CE, f493; +fma.rn.f32 f498, f135, 0fBE5C2136, f494; +fma.rn.f32 f499, f137, 0f3D5DC0C3, f495; +fma.rn.f32 f500, f140, 0fBF7F9FE3, f496; +fma.rn.f32 f501, f1120, 0f3D5DC0C3, f497; +fma.rn.f32 f502, f139, 0fBF7F9FE3, f498; +fma.rn.f32 f503, f141, 0fBF729966, f499; +fma.rn.f32 f504, f144, 0fBEA37B7D, f500; +fma.rn.f32 f505, f1117, 0fBF729966, f501; +fma.rn.f32 f506, f143, 0fBEA37B7D, f502; +fma.rn.f32 f507, f145, 0fBF0FA9F5, f503; +fma.rn.f32 f508, f148, 0f3F53E36D, f504; +fma.rn.f32 f509, f1115, 0fBF0FA9F5, f505; +fma.rn.f32 f510, f147, 0f3F53E36D, f506; +fma.rn.f32 f511, f149, 0f3F25BB1C, f507; +fma.rn.f32 f512, f152, 0f3F431D0D, f508; +fma.rn.f32 f513, f1113, 0f3F25BB1C, f509; +fma.rn.f32 f514, f151, 0f3F431D0D, f510; +fma.rn.f32 f515, f153, 0f3F6856DD, f511; +fma.rn.f32 f516, f156, 0fBED6FBB4, f512; +fma.rn.f32 f517, f1110, 0f3F6856DD, f513; +fma.rn.f32 f518, f155, 0fBED6FBB4, f514; +fma.rn.f32 f519, f157, 0fBE25AA2E, f515; +fma.rn.f32 f520, f160, 0fBF7CA0AA, f516; +fma.rn.f32 f521, f1108, 0fBE25AA2E, f517; +fma.rn.f32 f522, f159, 0fBF7CA0AA, f518; +fma.rn.f32 f523, f161, 0fBF7E7FD3, f519; +fma.rn.f32 f524, f164, 0fBDDD6D81, f520; +fma.rn.f32 f525, f1106, 0fBF7E7FD3, f521; +fma.rn.f32 f526, f163, 0fBDDD6D81, f522; +fma.rn.f32 f527, f165, 0fBEBD82C0, f523; +fma.rn.f32 f528, f168, 0f3F6DD16B, f524; +fma.rn.f32 f529, f1103, 0fBEBD82C0, f525; +fma.rn.f32 f530, f167, 0f3F6DD16B, f526; +fma.rn.f32 f531, f169, 0f3F4BCCC1, f527; +fma.rn.f32 f532, f172, 0f3F1AECB3, f528; +fma.rn.f32 f533, f1101, 0f3F4BCCC1, f529; +fma.rn.f32 f534, f171, 0f3F1AECB3, f530; +fma.rn.f32 f535, f117, 0f3D5DC0C3, %58; +fma.rn.f32 f539, f121, 0fBF7E7FD3, f535; +fma.rn.f32 f1081, f120, 0fBF7F9FE3, 0f00000000; +fma.rn.f32 f540, f124, 0fBDDD6D81, f1081; +fma.rn.f32 f1080, f1131, 0f3D5DC0C3, %59; +fma.rn.f32 f541, f1129, 0fBF7E7FD3, f1080; +fma.rn.f32 f1079, f119, 0fBF7F9FE3, 0f00000000; 
+fma.rn.f32 f542, f123, 0fBDDD6D81, f1079; +fma.rn.f32 f543, f125, 0fBE25AA2E, f539; +fma.rn.f32 f544, f128, 0f3F7CA0AA, f540; +fma.rn.f32 f545, f1127, 0fBE25AA2E, f541; +fma.rn.f32 f546, f127, 0f3F7CA0AA, f542; +fma.rn.f32 f547, f129, 0f3F7A03CE, f543; +fma.rn.f32 f548, f132, 0f3E5C2136, f544; +fma.rn.f32 f549, f1124, 0f3F7A03CE, f545; +fma.rn.f32 f550, f131, 0f3E5C2136, f546; +fma.rn.f32 f551, f133, 0f3E88F979, f547; +fma.rn.f32 f552, f136, 0fBF76AB36, f548; +fma.rn.f32 f553, f1122, 0f3E88F979, f549; +fma.rn.f32 f554, f135, 0fBF76AB36, f550; +fma.rn.f32 f555, f137, 0fBF729966, f551; +fma.rn.f32 f556, f140, 0fBEA37B7D, f552; +fma.rn.f32 f557, f1120, 0fBF729966, f553; +fma.rn.f32 f558, f139, 0fBEA37B7D, f554; +fma.rn.f32 f559, f141, 0fBEBD82C0, f555; +fma.rn.f32 f560, f144, 0f3F6DD16B, f556; +fma.rn.f32 f561, f1117, 0fBEBD82C0, f557; +fma.rn.f32 f562, f143, 0f3F6DD16B, f558; +fma.rn.f32 f563, f145, 0f3F6856DD, f559; +fma.rn.f32 f564, f148, 0f3ED6FBB4, f560; +fma.rn.f32 f565, f1115, 0f3F6856DD, f561; +fma.rn.f32 f566, f147, 0f3ED6FBB4, f562; +fma.rn.f32 f567, f149, 0f3EEFD33B, f563; +fma.rn.f32 f568, f152, 0fBF622DD8, f564; +fma.rn.f32 f569, f1113, 0f3EEFD33B, f565; +fma.rn.f32 f570, f151, 0fBF622DD8, f566; +fma.rn.f32 f571, f153, 0fBF5B5AFE, f567; +fma.rn.f32 f572, f156, 0fBF03FB56, f568; +fma.rn.f32 f573, f1110, 0fBF5B5AFE, f569; +fma.rn.f32 f574, f155, 0fBF03FB56, f570; +fma.rn.f32 f575, f157, 0fBF0FA9F5, f571; +fma.rn.f32 f576, f160, 0f3F53E36D, f572; +fma.rn.f32 f577, f1108, 0fBF0FA9F5, f573; +fma.rn.f32 f578, f159, 0f3F53E36D, f574; +fma.rn.f32 f579, f161, 0f3F4BCCC1, f575; +fma.rn.f32 f580, f164, 0f3F1AECB3, f576; +fma.rn.f32 f581, f1106, 0f3F4BCCC1, f577; +fma.rn.f32 f582, f163, 0f3F1AECB3, f578; +fma.rn.f32 f583, f165, 0f3F25BB1C, f579; +fma.rn.f32 f584, f168, 0fBF431D0D, f580; +fma.rn.f32 f585, f1103, 0f3F25BB1C, f581; +fma.rn.f32 f586, f167, 0fBF431D0D, f582; +fma.rn.f32 f587, f169, 0fBF39DAD7, f583; +fma.rn.f32 f588, f172, 0fBF300D12, f584; +fma.rn.f32 
f589, f1101, 0fBF39DAD7, f585; +fma.rn.f32 f590, f171, 0fBF300D12, f586; +fma.rn.f32 f591, f117, 0fBE25AA2E, %58; +fma.rn.f32 f595, f121, 0fBF729966, f591; +fma.rn.f32 f1078, f120, 0fBF7CA0AA, 0f00000000; +fma.rn.f32 f596, f124, 0f3EA37B7D, f1078; +fma.rn.f32 f1077, f1131, 0fBE25AA2E, %59; +fma.rn.f32 f597, f1129, 0fBF729966, f1077; +fma.rn.f32 f1076, f119, 0fBF7CA0AA, 0f00000000; +fma.rn.f32 f598, f123, 0f3EA37B7D, f1076; +fma.rn.f32 f599, f125, 0f3EEFD33B, f595; +fma.rn.f32 f600, f128, 0f3F622DD8, f596; +fma.rn.f32 f601, f1127, 0f3EEFD33B, f597; +fma.rn.f32 f602, f127, 0f3F622DD8, f598; +fma.rn.f32 f603, f129, 0f3F4BCCC1, f599; +fma.rn.f32 f604, f132, 0fBF1AECB3, f600; +fma.rn.f32 f605, f1124, 0f3F4BCCC1, f601; +fma.rn.f32 f606, f131, 0fBF1AECB3, f602; +fma.rn.f32 f607, f133, 0fBF39DAD7, f603; +fma.rn.f32 f608, f136, 0fBF300D12, f604; +fma.rn.f32 f609, f1122, 0fBF39DAD7, f605; +fma.rn.f32 f610, f135, 0fBF300D12, f606; +fma.rn.f32 f611, f137, 0fBF0FA9F5, f607; +fma.rn.f32 f612, f140, 0f3F53E36D, f608; +fma.rn.f32 f613, f1120, 0fBF0FA9F5, f609; +fma.rn.f32 f614, f139, 0f3F53E36D, f610; +fma.rn.f32 f615, f141, 0f3F6856DD, f611; +fma.rn.f32 f616, f144, 0f3ED6FBB4, f612; +fma.rn.f32 f617, f1117, 0f3F6856DD, f613; +fma.rn.f32 f618, f143, 0f3ED6FBB4, f614; +fma.rn.f32 f619, f145, 0f3E88F979, f615; +fma.rn.f32 f620, f148, 0fBF76AB36, f616; +fma.rn.f32 f621, f1115, 0f3E88F979, f617; +fma.rn.f32 f622, f147, 0fBF76AB36, f618; +fma.rn.f32 f623, f149, 0fBF7E7FD3, f619; +fma.rn.f32 f624, f152, 0fBDDD6D81, f620; +fma.rn.f32 f625, f1113, 0fBF7E7FD3, f621; +fma.rn.f32 f626, f151, 0fBDDD6D81, f622; +fma.rn.f32 f627, f153, 0f3D5DC0C3, f623; +fma.rn.f32 f628, f156, 0f3F7F9FE3, f624; +fma.rn.f32 f629, f1110, 0f3D5DC0C3, f625; +fma.rn.f32 f630, f155, 0f3F7F9FE3, f626; +fma.rn.f32 f631, f157, 0f3F7A03CE, f627; +fma.rn.f32 f632, f160, 0fBE5C2136, f628; +fma.rn.f32 f633, f1108, 0f3F7A03CE, f629; +fma.rn.f32 f634, f159, 0fBE5C2136, f630; +fma.rn.f32 f635, f161, 0fBEBD82C0, f631; 
+fma.rn.f32 f636, f164, 0fBF6DD16B, f632; +fma.rn.f32 f637, f1106, 0fBEBD82C0, f633; +fma.rn.f32 f638, f163, 0fBF6DD16B, f634; +fma.rn.f32 f639, f165, 0fBF5B5AFE, f635; +fma.rn.f32 f640, f168, 0f3F03FB56, f636; +fma.rn.f32 f641, f1103, 0fBF5B5AFE, f637; +fma.rn.f32 f642, f167, 0f3F03FB56, f638; +fma.rn.f32 f643, f169, 0f3F25BB1C, f639; +fma.rn.f32 f644, f172, 0f3F431D0D, f640; +fma.rn.f32 f645, f1101, 0f3F25BB1C, f641; +fma.rn.f32 f646, f171, 0f3F431D0D, f642; +fma.rn.f32 f647, f117, 0fBEBD82C0, %58; +fma.rn.f32 f651, f121, 0fBF39DAD7, f647; +fma.rn.f32 f1075, f120, 0fBF6DD16B, 0f00000000; +fma.rn.f32 f652, f124, 0f3F300D12, f1075; +fma.rn.f32 f1074, f1131, 0fBEBD82C0, %59; +fma.rn.f32 f653, f1129, 0fBF39DAD7, f1074; +fma.rn.f32 f1073, f119, 0fBF6DD16B, 0f00000000; +fma.rn.f32 f654, f123, 0f3F300D12, f1073; +fma.rn.f32 f655, f125, 0f3F6856DD, f651; +fma.rn.f32 f656, f128, 0f3ED6FBB4, f652; +fma.rn.f32 f657, f1127, 0f3F6856DD, f653; +fma.rn.f32 f658, f127, 0f3ED6FBB4, f654; +fma.rn.f32 f659, f129, 0f3D5DC0C3, f655; +fma.rn.f32 f660, f132, 0fBF7F9FE3, f656; +fma.rn.f32 f661, f1124, 0f3D5DC0C3, f657; +fma.rn.f32 f662, f131, 0fBF7F9FE3, f658; +fma.rn.f32 f663, f133, 0fBF729966, f659; +fma.rn.f32 f664, f136, 0f3EA37B7D, f660; +fma.rn.f32 f665, f1122, 0fBF729966, f661; +fma.rn.f32 f666, f135, 0f3EA37B7D, f662; +fma.rn.f32 f667, f137, 0f3F25BB1C, f663; +fma.rn.f32 f668, f140, 0f3F431D0D, f664; +fma.rn.f32 f669, f1120, 0f3F25BB1C, f665; +fma.rn.f32 f670, f139, 0f3F431D0D, f666; +fma.rn.f32 f671, f141, 0f3EEFD33B, f667; +fma.rn.f32 f672, f144, 0fBF622DD8, f668; +fma.rn.f32 f673, f1117, 0f3EEFD33B, f669; +fma.rn.f32 f674, f143, 0fBF622DD8, f670; +fma.rn.f32 f675, f145, 0fBF7E7FD3, f671; +fma.rn.f32 f676, f148, 0fBDDD6D81, f672; +fma.rn.f32 f677, f1115, 0fBF7E7FD3, f673; +fma.rn.f32 f678, f147, 0fBDDD6D81, f674; +fma.rn.f32 f679, f149, 0f3E88F979, f675; +fma.rn.f32 f680, f152, 0f3F76AB36, f676; +fma.rn.f32 f681, f1113, 0f3E88F979, f677; +fma.rn.f32 f682, f151, 0f3F76AB36, 
f678; +fma.rn.f32 f683, f153, 0f3F4BCCC1, f679; +fma.rn.f32 f684, f156, 0fBF1AECB3, f680; +fma.rn.f32 f685, f1110, 0f3F4BCCC1, f681; +fma.rn.f32 f686, f155, 0fBF1AECB3, f682; +fma.rn.f32 f687, f157, 0fBF5B5AFE, f683; +fma.rn.f32 f688, f160, 0fBF03FB56, f684; +fma.rn.f32 f689, f1108, 0fBF5B5AFE, f685; +fma.rn.f32 f690, f159, 0fBF03FB56, f686; +fma.rn.f32 f691, f161, 0fBE25AA2E, f687; +fma.rn.f32 f692, f164, 0f3F7CA0AA, f688; +fma.rn.f32 f693, f1106, 0fBE25AA2E, f689; +fma.rn.f32 f694, f163, 0f3F7CA0AA, f690; +fma.rn.f32 f695, f165, 0f3F7A03CE, f691; +fma.rn.f32 f696, f168, 0fBE5C2136, f692; +fma.rn.f32 f697, f1103, 0f3F7A03CE, f693; +fma.rn.f32 f698, f167, 0fBE5C2136, f694; +fma.rn.f32 f699, f169, 0fBF0FA9F5, f695; +fma.rn.f32 f700, f172, 0fBF53E36D, f696; +fma.rn.f32 f701, f1101, 0fBF0FA9F5, f697; +fma.rn.f32 f702, f171, 0fBF53E36D, f698; +fma.rn.f32 f703, f117, 0fBF0FA9F5, %58; +fma.rn.f32 f707, f121, 0fBEBD82C0, f703; +fma.rn.f32 f1072, f120, 0fBF53E36D, 0f00000000; +fma.rn.f32 f708, f124, 0f3F6DD16B, f1072; +fma.rn.f32 f1071, f1131, 0fBF0FA9F5, %59; +fma.rn.f32 f709, f1129, 0fBEBD82C0, f1071; +fma.rn.f32 f1070, f119, 0fBF53E36D, 0f00000000; +fma.rn.f32 f710, f123, 0f3F6DD16B, f1070; +fma.rn.f32 f711, f125, 0f3F7A03CE, f707; +fma.rn.f32 f712, f128, 0fBE5C2136, f708; +fma.rn.f32 f713, f1127, 0f3F7A03CE, f709; +fma.rn.f32 f714, f127, 0fBE5C2136, f710; +fma.rn.f32 f715, f129, 0fBF39DAD7, f711; +fma.rn.f32 f716, f132, 0fBF300D12, f712; +fma.rn.f32 f717, f1124, 0fBF39DAD7, f713; +fma.rn.f32 f718, f131, 0fBF300D12, f714; +fma.rn.f32 f719, f133, 0fBE25AA2E, f715; +fma.rn.f32 f720, f136, 0f3F7CA0AA, f716; +fma.rn.f32 f721, f1122, 0fBE25AA2E, f717; +fma.rn.f32 f722, f135, 0f3F7CA0AA, f718; +fma.rn.f32 f723, f137, 0f3F6856DD, f719; +fma.rn.f32 f724, f140, 0fBED6FBB4, f720; +fma.rn.f32 f725, f1120, 0f3F6856DD, f721; +fma.rn.f32 f726, f139, 0fBED6FBB4, f722; +fma.rn.f32 f727, f141, 0fBF5B5AFE, f723; +fma.rn.f32 f728, f144, 0fBF03FB56, f724; +fma.rn.f32 f729, f1117, 
0fBF5B5AFE, f725; +fma.rn.f32 f730, f143, 0fBF03FB56, f726; +fma.rn.f32 f731, f145, 0f3D5DC0C3, f727; +fma.rn.f32 f732, f148, 0f3F7F9FE3, f728; +fma.rn.f32 f733, f1115, 0f3D5DC0C3, f729; +fma.rn.f32 f734, f147, 0f3F7F9FE3, f730; +fma.rn.f32 f735, f149, 0f3F4BCCC1, f731; +fma.rn.f32 f736, f152, 0fBF1AECB3, f732; +fma.rn.f32 f737, f1113, 0f3F4BCCC1, f733; +fma.rn.f32 f738, f151, 0fBF1AECB3, f734; +fma.rn.f32 f739, f153, 0fBF729966, f735; +fma.rn.f32 f740, f156, 0fBEA37B7D, f736; +fma.rn.f32 f741, f1110, 0fBF729966, f737; +fma.rn.f32 f742, f155, 0fBEA37B7D, f738; +fma.rn.f32 f743, f157, 0f3E88F979, f739; +fma.rn.f32 f744, f160, 0f3F76AB36, f740; +fma.rn.f32 f745, f1108, 0f3E88F979, f741; +fma.rn.f32 f746, f159, 0f3F76AB36, f742; +fma.rn.f32 f747, f161, 0f3F25BB1C, f743; +fma.rn.f32 f748, f164, 0fBF431D0D, f744; +fma.rn.f32 f749, f1106, 0f3F25BB1C, f745; +fma.rn.f32 f750, f163, 0fBF431D0D, f746; +fma.rn.f32 f751, f165, 0fBF7E7FD3, f747; +fma.rn.f32 f752, f168, 0fBDDD6D81, f748; +fma.rn.f32 f753, f1103, 0fBF7E7FD3, f749; +fma.rn.f32 f754, f167, 0fBDDD6D81, f750; +fma.rn.f32 f755, f169, 0f3EEFD33B, f751; +fma.rn.f32 f756, f172, 0f3F622DD8, f752; +fma.rn.f32 f757, f1101, 0f3EEFD33B, f753; +fma.rn.f32 f758, f171, 0f3F622DD8, f754; +fma.rn.f32 f759, f117, 0fBF39DAD7, %58; +fma.rn.f32 f763, f121, 0f3D5DC0C3, f759; +fma.rn.f32 f1069, f120, 0fBF300D12, 0f00000000; +fma.rn.f32 f764, f124, 0f3F7F9FE3, f1069; +fma.rn.f32 f1068, f1131, 0fBF39DAD7, %59; +fma.rn.f32 f765, f1129, 0f3D5DC0C3, f1068; +fma.rn.f32 f1067, f119, 0fBF300D12, 0f00000000; +fma.rn.f32 f766, f123, 0f3F7F9FE3, f1067; +fma.rn.f32 f767, f125, 0f3F25BB1C, f763; +fma.rn.f32 f768, f128, 0fBF431D0D, f764; +fma.rn.f32 f769, f1127, 0f3F25BB1C, f765; +fma.rn.f32 f770, f127, 0fBF431D0D, f766; +fma.rn.f32 f771, f129, 0fBF7E7FD3, f767; +fma.rn.f32 f772, f132, 0f3DDD6D81, f768; +fma.rn.f32 f773, f1124, 0fBF7E7FD3, f769; +fma.rn.f32 f774, f131, 0f3DDD6D81, f770; +fma.rn.f32 f775, f133, 0f3F4BCCC1, f771; +fma.rn.f32 f776, 
f136, 0f3F1AECB3, f772; +fma.rn.f32 f777, f1122, 0f3F4BCCC1, f773; +fma.rn.f32 f778, f135, 0f3F1AECB3, f774; +fma.rn.f32 f779, f137, 0fBE25AA2E, f775; +fma.rn.f32 f780, f140, 0fBF7CA0AA, f776; +fma.rn.f32 f781, f1120, 0fBE25AA2E, f777; +fma.rn.f32 f782, f139, 0fBF7CA0AA, f778; +fma.rn.f32 f783, f141, 0fBF0FA9F5, f779; +fma.rn.f32 f784, f144, 0f3F53E36D, f780; +fma.rn.f32 f785, f1117, 0fBF0FA9F5, f781; +fma.rn.f32 f786, f143, 0f3F53E36D, f782; +fma.rn.f32 f787, f145, 0f3F7A03CE, f783; +fma.rn.f32 f788, f148, 0fBE5C2136, f784; +fma.rn.f32 f789, f1115, 0f3F7A03CE, f785; +fma.rn.f32 f790, f147, 0fBE5C2136, f786; +fma.rn.f32 f791, f149, 0fBF5B5AFE, f787; +fma.rn.f32 f792, f152, 0fBF03FB56, f788; +fma.rn.f32 f793, f1113, 0fBF5B5AFE, f789; +fma.rn.f32 f794, f151, 0fBF03FB56, f790; +fma.rn.f32 f795, f153, 0f3E88F979, f791; +fma.rn.f32 f796, f156, 0f3F76AB36, f792; +fma.rn.f32 f797, f1110, 0f3E88F979, f793; +fma.rn.f32 f798, f155, 0f3F76AB36, f794; +fma.rn.f32 f799, f157, 0f3EEFD33B, f795; +fma.rn.f32 f800, f160, 0fBF622DD8, f796; +fma.rn.f32 f801, f1108, 0f3EEFD33B, f797; +fma.rn.f32 f802, f159, 0fBF622DD8, f798; +fma.rn.f32 f803, f161, 0fBF729966, f799; +fma.rn.f32 f804, f164, 0f3EA37B7D, f800; +fma.rn.f32 f805, f1106, 0fBF729966, f801; +fma.rn.f32 f806, f163, 0f3EA37B7D, f802; +fma.rn.f32 f807, f165, 0f3F6856DD, f803; +fma.rn.f32 f808, f168, 0f3ED6FBB4, f804; +fma.rn.f32 f809, f1103, 0f3F6856DD, f805; +fma.rn.f32 f810, f167, 0f3ED6FBB4, f806; +fma.rn.f32 f811, f169, 0fBEBD82C0, f807; +fma.rn.f32 f812, f172, 0fBF6DD16B, f808; +fma.rn.f32 f813, f1101, 0fBEBD82C0, f809; +fma.rn.f32 f814, f171, 0fBF6DD16B, f810; +fma.rn.f32 f815, f117, 0fBF5B5AFE, %58; +fma.rn.f32 f819, f121, 0f3EEFD33B, f815; +fma.rn.f32 f1066, f120, 0fBF03FB56, 0f00000000; +fma.rn.f32 f820, f124, 0f3F622DD8, f1066; +fma.rn.f32 f1065, f1131, 0fBF5B5AFE, %59; +fma.rn.f32 f821, f1129, 0f3EEFD33B, f1065; +fma.rn.f32 f1064, f119, 0fBF03FB56, 0f00000000; +fma.rn.f32 f822, f123, 0f3F622DD8, f1064; +fma.rn.f32 
f823, f125, 0f3D5DC0C3, f819; +fma.rn.f32 f824, f128, 0fBF7F9FE3, f820; +fma.rn.f32 f825, f1127, 0f3D5DC0C3, f821; +fma.rn.f32 f826, f127, 0fBF7F9FE3, f822; +fma.rn.f32 f827, f129, 0fBF0FA9F5, f823; +fma.rn.f32 f828, f132, 0f3F53E36D, f824; +fma.rn.f32 f829, f1124, 0fBF0FA9F5, f825; +fma.rn.f32 f830, f131, 0f3F53E36D, f826; +fma.rn.f32 f831, f133, 0f3F6856DD, f827; +fma.rn.f32 f832, f136, 0fBED6FBB4, f828; +fma.rn.f32 f833, f1122, 0f3F6856DD, f829; +fma.rn.f32 f834, f135, 0fBED6FBB4, f830; +fma.rn.f32 f835, f137, 0fBF7E7FD3, f831; +fma.rn.f32 f836, f140, 0fBDDD6D81, f832; +fma.rn.f32 f837, f1120, 0fBF7E7FD3, f833; +fma.rn.f32 f838, f139, 0fBDDD6D81, f834; +fma.rn.f32 f839, f141, 0f3F4BCCC1, f835; +fma.rn.f32 f840, f144, 0f3F1AECB3, f836; +fma.rn.f32 f841, f1117, 0f3F4BCCC1, f837; +fma.rn.f32 f842, f143, 0f3F1AECB3, f838; +fma.rn.f32 f843, f145, 0fBEBD82C0, f839; +fma.rn.f32 f844, f148, 0fBF6DD16B, f840; +fma.rn.f32 f845, f1115, 0fBEBD82C0, f841; +fma.rn.f32 f846, f147, 0fBF6DD16B, f842; +fma.rn.f32 f847, f149, 0fBE25AA2E, f843; +fma.rn.f32 f848, f152, 0f3F7CA0AA, f844; +fma.rn.f32 f849, f1113, 0fBE25AA2E, f845; +fma.rn.f32 f850, f151, 0f3F7CA0AA, f846; +fma.rn.f32 f851, f153, 0f3F25BB1C, f847; +fma.rn.f32 f852, f156, 0fBF431D0D, f848; +fma.rn.f32 f853, f1110, 0f3F25BB1C, f849; +fma.rn.f32 f854, f155, 0fBF431D0D, f850; +fma.rn.f32 f855, f157, 0fBF729966, f851; +fma.rn.f32 f856, f160, 0f3EA37B7D, f852; +fma.rn.f32 f857, f1108, 0fBF729966, f853; +fma.rn.f32 f858, f159, 0f3EA37B7D, f854; +fma.rn.f32 f859, f161, 0f3F7A03CE, f855; +fma.rn.f32 f860, f164, 0f3E5C2136, f856; +fma.rn.f32 f861, f1106, 0f3F7A03CE, f857; +fma.rn.f32 f862, f163, 0f3E5C2136, f858; +fma.rn.f32 f863, f165, 0fBF39DAD7, f859; +fma.rn.f32 f864, f168, 0fBF300D12, f860; +fma.rn.f32 f865, f1103, 0fBF39DAD7, f861; +fma.rn.f32 f866, f167, 0fBF300D12, f862; +fma.rn.f32 f867, f169, 0f3E88F979, f863; +fma.rn.f32 f868, f172, 0f3F76AB36, f864; +fma.rn.f32 f869, f1101, 0f3E88F979, f865; +fma.rn.f32 f870, f171, 
0f3F76AB36, f866; +fma.rn.f32 f871, f117, 0fBF729966, %58; +fma.rn.f32 f875, f121, 0f3F4BCCC1, f871; +fma.rn.f32 f1063, f120, 0fBEA37B7D, 0f00000000; +fma.rn.f32 f876, f124, 0f3F1AECB3, f1063; +fma.rn.f32 f1062, f1131, 0fBF729966, %59; +fma.rn.f32 f877, f1129, 0f3F4BCCC1, f1062; +fma.rn.f32 f1061, f119, 0fBEA37B7D, 0f00000000; +fma.rn.f32 f878, f123, 0f3F1AECB3, f1061; +fma.rn.f32 f879, f125, 0fBF0FA9F5, f875; +fma.rn.f32 f880, f128, 0fBF53E36D, f876; +fma.rn.f32 f881, f1127, 0fBF0FA9F5, f877; +fma.rn.f32 f882, f127, 0fBF53E36D, f878; +fma.rn.f32 f883, f129, 0f3E88F979, f879; +fma.rn.f32 f884, f132, 0f3F76AB36, f880; +fma.rn.f32 f885, f1124, 0f3E88F979, f881; +fma.rn.f32 f886, f131, 0f3F76AB36, f882; +fma.rn.f32 f887, f133, 0f3D5DC0C3, f883; +fma.rn.f32 f888, f136, 0fBF7F9FE3, f884; +fma.rn.f32 f889, f1122, 0f3D5DC0C3, f885; +fma.rn.f32 f890, f135, 0fBF7F9FE3, f886; +fma.rn.f32 f891, f137, 0fBEBD82C0, f887; +fma.rn.f32 f892, f140, 0f3F6DD16B, f888; +fma.rn.f32 f893, f1120, 0fBEBD82C0, f889; +fma.rn.f32 f894, f139, 0f3F6DD16B, f890; +fma.rn.f32 f895, f141, 0f3F25BB1C, f891; +fma.rn.f32 f896, f144, 0fBF431D0D, f892; +fma.rn.f32 f897, f1117, 0f3F25BB1C, f893; +fma.rn.f32 f898, f143, 0fBF431D0D, f894; +fma.rn.f32 f899, f145, 0fBF5B5AFE, f895; +fma.rn.f32 f900, f148, 0f3F03FB56, f896; +fma.rn.f32 f901, f1115, 0fBF5B5AFE, f897; +fma.rn.f32 f902, f147, 0f3F03FB56, f898; +fma.rn.f32 f903, f149, 0f3F7A03CE, f899; +fma.rn.f32 f904, f152, 0fBE5C2136, f900; +fma.rn.f32 f905, f1113, 0f3F7A03CE, f901; +fma.rn.f32 f906, f151, 0fBE5C2136, f902; +fma.rn.f32 f907, f153, 0fBF7E7FD3, f903; +fma.rn.f32 f908, f156, 0fBDDD6D81, f904; +fma.rn.f32 f909, f1110, 0fBF7E7FD3, f905; +fma.rn.f32 f910, f155, 0fBDDD6D81, f906; +fma.rn.f32 f911, f157, 0f3F6856DD, f907; +fma.rn.f32 f912, f160, 0f3ED6FBB4, f908; +fma.rn.f32 f913, f1108, 0f3F6856DD, f909; +fma.rn.f32 f914, f159, 0f3ED6FBB4, f910; +fma.rn.f32 f915, f161, 0fBF39DAD7, f911; +fma.rn.f32 f916, f164, 0fBF300D12, f912; +fma.rn.f32 f917, 
f1106, 0fBF39DAD7, f913; +fma.rn.f32 f918, f163, 0fBF300D12, f914; +fma.rn.f32 f919, f165, 0f3EEFD33B, f915; +fma.rn.f32 f920, f168, 0f3F622DD8, f916; +fma.rn.f32 f921, f1103, 0f3EEFD33B, f917; +fma.rn.f32 f922, f167, 0f3F622DD8, f918; +fma.rn.f32 f923, f169, 0fBE25AA2E, f919; +fma.rn.f32 f924, f172, 0fBF7CA0AA, f920; +fma.rn.f32 f925, f1101, 0fBE25AA2E, f921; +fma.rn.f32 f926, f171, 0fBF7CA0AA, f922; +fma.rn.f32 f927, f117, 0fBF7E7FD3, %58; +fma.rn.f32 f928, f120, 0fBDDD6D81, 0f00000000; +fma.rn.f32 f929, f1131, 0fBF7E7FD3, %59; +fma.rn.f32 f930, f119, 0fBDDD6D81, 0f00000000; +fma.rn.f32 f931, f121, 0f3F7A03CE, f927; +fma.rn.f32 f932, f124, 0f3E5C2136, f928; +fma.rn.f32 f933, f1129, 0f3F7A03CE, f929; +fma.rn.f32 f934, f123, 0f3E5C2136, f930; +fma.rn.f32 f935, f125, 0fBF729966, f931; +fma.rn.f32 f936, f128, 0fBEA37B7D, f932; +fma.rn.f32 f937, f1127, 0fBF729966, f933; +fma.rn.f32 f938, f127, 0fBEA37B7D, f934; +fma.rn.f32 f939, f129, 0f3F6856DD, f935; +fma.rn.f32 f940, f132, 0f3ED6FBB4, f936; +fma.rn.f32 f941, f1124, 0f3F6856DD, f937; +fma.rn.f32 f942, f131, 0f3ED6FBB4, f938; +fma.rn.f32 f943, f133, 0fBF5B5AFE, f939; +fma.rn.f32 f944, f136, 0fBF03FB56, f940; +fma.rn.f32 f945, f1122, 0fBF5B5AFE, f941; +fma.rn.f32 f946, f135, 0fBF03FB56, f942; +fma.rn.f32 f947, f137, 0f3F4BCCC1, f943; +fma.rn.f32 f948, f140, 0f3F1AECB3, f944; +fma.rn.f32 f949, f1120, 0f3F4BCCC1, f945; +fma.rn.f32 f950, f139, 0f3F1AECB3, f946; +fma.rn.f32 f951, f141, 0fBF39DAD7, f947; +fma.rn.f32 f952, f144, 0fBF300D12, f948; +fma.rn.f32 f953, f1117, 0fBF39DAD7, f949; +fma.rn.f32 f954, f143, 0fBF300D12, f950; +fma.rn.f32 f955, f145, 0f3F25BB1C, f951; +fma.rn.f32 f956, f148, 0f3F431D0D, f952; +fma.rn.f32 f957, f1115, 0f3F25BB1C, f953; +fma.rn.f32 f958, f147, 0f3F431D0D, f954; +fma.rn.f32 f959, f149, 0fBF0FA9F5, f955; +fma.rn.f32 f960, f152, 0fBF53E36D, f956; +fma.rn.f32 f961, f1113, 0fBF0FA9F5, f957; +fma.rn.f32 f962, f151, 0fBF53E36D, f958; +fma.rn.f32 f963, f153, 0f3EEFD33B, f959; +fma.rn.f32 f964, 
f156, 0f3F622DD8, f960; +fma.rn.f32 f965, f1110, 0f3EEFD33B, f961; +fma.rn.f32 f966, f155, 0f3F622DD8, f962; +fma.rn.f32 f967, f157, 0fBEBD82C0, f963; +fma.rn.f32 f968, f160, 0fBF6DD16B, f964; +fma.rn.f32 f969, f1108, 0fBEBD82C0, f965; +fma.rn.f32 f970, f159, 0fBF6DD16B, f966; +fma.rn.f32 f971, f161, 0f3E88F979, f967; +fma.rn.f32 f972, f164, 0f3F76AB36, f968; +fma.rn.f32 f973, f1106, 0f3E88F979, f969; +fma.rn.f32 f974, f163, 0f3F76AB36, f970; +fma.rn.f32 f975, f165, 0fBE25AA2E, f971; +fma.rn.f32 f976, f168, 0fBF7CA0AA, f972; +fma.rn.f32 f977, f1103, 0fBE25AA2E, f973; +fma.rn.f32 f978, f167, 0fBF7CA0AA, f974; +fma.rn.f32 f979, f169, 0f3D5DC0C3, f975; +fma.rn.f32 f980, f172, 0f3F7F9FE3, f976; +fma.rn.f32 f981, f1101, 0f3D5DC0C3, f977; +fma.rn.f32 f982, f171, 0f3F7F9FE3, f978; +add.f32 %1, f198, f1101; +add.f32 %0, f197, f169; +sub.f32 %2, f251, f252; +add.f32 %3, f253, f254; +sub.f32 %4, f307, f308; +add.f32 %5, f309, f310; +sub.f32 %6, f363, f364; +add.f32 %7, f365, f366; +sub.f32 %8, f419, f420; +add.f32 %9, f421, f422; +add.f32 %11, f477, f478; +sub.f32 %10, f475, f476; +add.f32 %13, f533, f534; +sub.f32 %12, f531, f532; +add.f32 %15, f589, f590; +sub.f32 %14, f587, f588; +sub.f32 %16, f643, f644; +add.f32 %17, f645, f646; +sub.f32 %18, f699, f700; +add.f32 %19, f701, f702; +sub.f32 %20, f755, f756; +add.f32 %21, f757, f758; +add.f32 %23, f813, f814; +sub.f32 %22, f811, f812; +add.f32 %25, f869, f870; +sub.f32 %24, f867, f868; +add.f32 %27, f925, f926; +sub.f32 %26, f923, f924; +sub.f32 %28, f979, f980; +add.f32 %29, f981, f982; +sub.f32 %31, f981, f982; +add.f32 %30, f979, f980; +sub.f32 %33, f925, f926; +add.f32 %32, f923, f924; +sub.f32 %35, f869, f870; +add.f32 %34, f867, f868; +sub.f32 %37, f813, f814; +add.f32 %36, f811, f812; +sub.f32 %39, f757, f758; +add.f32 %38, f755, f756; +sub.f32 %41, f701, f702; +add.f32 %40, f699, f700; +sub.f32 %43, f645, f646; +add.f32 %42, f643, f644; +sub.f32 %45, f589, f590; +add.f32 %44, f587, f588; +sub.f32 %47, f533, f534; 
+add.f32 %46, f531, f532; +sub.f32 %49, f477, f478; +add.f32 %48, f475, f476; +sub.f32 %51, f421, f422; +add.f32 %50, f419, f420; +sub.f32 %53, f365, f366; +add.f32 %52, f363, f364; +sub.f32 %55, f309, f310; +add.f32 %54, f307, f308; +sub.f32 %57, f253, f254; +add.f32 %56, f251, f252; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), 
"f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[1].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[25].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[23].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[11].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[13].y), "f"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..da980a40d98f7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp32_inv.hpp.inc @@ -0,0 +1,940 @@ +#ifndef CUFFTDX_FFT_29_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_29_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<215, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1133>; +.reg .b64 rd<4>; +add.f32 f117, %60, %114; +sub.f32 f119, %60, %114; +add.f32 f1131, %116, %115; +sub.f32 f120, %116, %115; +add.f32 f121, %62, %112; +sub.f32 f123, %62, %112; +add.f32 f1129, %117, %113; +sub.f32 f124, %117, %113; +add.f32 f125, %64, %110; +sub.f32 f127, %64, %110; +add.f32 f1127, %65, %118; +sub.f32 f128, %65, %118; +add.f32 f129, %66, %108; +sub.f32 f131, %66, %108; +add.f32 f1124, %120, %119; +sub.f32 f132, %120, %119; +add.f32 f133, %68, %106; +sub.f32 f135, %68, %106; +add.f32 f1122, %121, %107; +sub.f32 f136, %121, %107; +add.f32 f137, %70, %104; +sub.f32 f139, %70, %104; +add.f32 f1120, %71, %122; +sub.f32 f140, 
%71, %122; +add.f32 f141, %72, %102; +sub.f32 f143, %72, %102; +add.f32 f1117, %124, %123; +sub.f32 f144, %124, %123; +add.f32 f145, %74, %100; +sub.f32 f147, %74, %100; +add.f32 f1115, %125, %101; +sub.f32 f148, %125, %101; +add.f32 f149, %76, %98; +sub.f32 f151, %76, %98; +add.f32 f1113, %77, %126; +sub.f32 f152, %77, %126; +add.f32 f153, %78, %96; +sub.f32 f155, %78, %96; +add.f32 f1110, %128, %127; +sub.f32 f156, %128, %127; +add.f32 f157, %80, %94; +sub.f32 f159, %80, %94; +add.f32 f1108, %129, %95; +sub.f32 f160, %129, %95; +add.f32 f161, %82, %92; +sub.f32 f163, %82, %92; +add.f32 f1106, %83, %130; +sub.f32 f164, %83, %130; +add.f32 f165, %84, %90; +sub.f32 f167, %84, %90; +add.f32 f1103, %132, %131; +sub.f32 f168, %132, %131; +add.f32 f169, %86, %88; +sub.f32 f171, %86, %88; +add.f32 f1101, %133, %89; +sub.f32 f172, %133, %89; +add.f32 f173, %58, f117; +add.f32 f175, f173, f121; +add.f32 f1100, %59, f1131; +add.f32 f176, f1100, f1129; +add.f32 f177, f175, f125; +add.f32 f178, f176, f1127; +add.f32 f179, f177, f129; +add.f32 f180, f178, f1124; +add.f32 f181, f179, f133; +add.f32 f182, f180, f1122; +add.f32 f183, f181, f137; +add.f32 f184, f182, f1120; +add.f32 f185, f183, f141; +add.f32 f186, f184, f1117; +add.f32 f187, f185, f145; +add.f32 f188, f186, f1115; +add.f32 f189, f187, f149; +add.f32 f190, f188, f1113; +add.f32 f191, f189, f153; +add.f32 f192, f190, f1110; +add.f32 f193, f191, f157; +add.f32 f194, f192, f1108; +add.f32 f195, f193, f161; +add.f32 f196, f194, f1106; +add.f32 f197, f195, f165; +add.f32 f198, f196, f1103; +fma.rn.f32 f199, f117, 0f3F7A03CE, %58; +fma.rn.f32 f203, f121, 0f3F6856DD, f199; +fma.rn.f32 f1099, f120, 0f3E5C2136, 0f00000000; +fma.rn.f32 f204, f124, 0f3ED6FBB4, f1099; +fma.rn.f32 f1098, f1131, 0f3F7A03CE, %59; +fma.rn.f32 f205, f1129, 0f3F6856DD, f1098; +fma.rn.f32 f1097, f119, 0f3E5C2136, 0f00000000; +fma.rn.f32 f206, f123, 0f3ED6FBB4, f1097; +fma.rn.f32 f207, f125, 0f3F4BCCC1, f203; +fma.rn.f32 f208, f128, 0f3F1AECB3, f204; 
+fma.rn.f32 f209, f1127, 0f3F4BCCC1, f205; +fma.rn.f32 f210, f127, 0f3F1AECB3, f206; +fma.rn.f32 f211, f129, 0f3F25BB1C, f207; +fma.rn.f32 f212, f132, 0f3F431D0D, f208; +fma.rn.f32 f213, f1124, 0f3F25BB1C, f209; +fma.rn.f32 f214, f131, 0f3F431D0D, f210; +fma.rn.f32 f215, f133, 0f3EEFD33B, f211; +fma.rn.f32 f216, f136, 0f3F622DD8, f212; +fma.rn.f32 f217, f1122, 0f3EEFD33B, f213; +fma.rn.f32 f218, f135, 0f3F622DD8, f214; +fma.rn.f32 f219, f137, 0f3E88F979, f215; +fma.rn.f32 f220, f140, 0f3F76AB36, f216; +fma.rn.f32 f221, f1120, 0f3E88F979, f217; +fma.rn.f32 f222, f139, 0f3F76AB36, f218; +fma.rn.f32 f223, f141, 0f3D5DC0C3, f219; +fma.rn.f32 f224, f144, 0f3F7F9FE3, f220; +fma.rn.f32 f225, f1117, 0f3D5DC0C3, f221; +fma.rn.f32 f226, f143, 0f3F7F9FE3, f222; +fma.rn.f32 f227, f145, 0fBE25AA2E, f223; +fma.rn.f32 f228, f148, 0f3F7CA0AA, f224; +fma.rn.f32 f229, f1115, 0fBE25AA2E, f225; +fma.rn.f32 f230, f147, 0f3F7CA0AA, f226; +fma.rn.f32 f231, f149, 0fBEBD82C0, f227; +fma.rn.f32 f232, f152, 0f3F6DD16B, f228; +fma.rn.f32 f233, f1113, 0fBEBD82C0, f229; +fma.rn.f32 f234, f151, 0f3F6DD16B, f230; +fma.rn.f32 f235, f153, 0fBF0FA9F5, f231; +fma.rn.f32 f236, f156, 0f3F53E36D, f232; +fma.rn.f32 f237, f1110, 0fBF0FA9F5, f233; +fma.rn.f32 f238, f155, 0f3F53E36D, f234; +fma.rn.f32 f239, f157, 0fBF39DAD7, f235; +fma.rn.f32 f240, f160, 0f3F300D12, f236; +fma.rn.f32 f241, f1108, 0fBF39DAD7, f237; +fma.rn.f32 f242, f159, 0f3F300D12, f238; +fma.rn.f32 f243, f161, 0fBF5B5AFE, f239; +fma.rn.f32 f244, f164, 0f3F03FB56, f240; +fma.rn.f32 f245, f1106, 0fBF5B5AFE, f241; +fma.rn.f32 f246, f163, 0f3F03FB56, f242; +fma.rn.f32 f247, f165, 0fBF729966, f243; +fma.rn.f32 f248, f168, 0f3EA37B7D, f244; +fma.rn.f32 f249, f1103, 0fBF729966, f245; +fma.rn.f32 f250, f167, 0f3EA37B7D, f246; +fma.rn.f32 f251, f169, 0fBF7E7FD3, f247; +fma.rn.f32 f252, f172, 0f3DDD6D81, f248; +fma.rn.f32 f253, f1101, 0fBF7E7FD3, f249; +fma.rn.f32 f254, f171, 0f3DDD6D81, f250; +fma.rn.f32 f255, f117, 0f3F6856DD, %58; +fma.rn.f32 
f259, f121, 0f3F25BB1C, f255; +fma.rn.f32 f1096, f120, 0f3ED6FBB4, 0f00000000; +fma.rn.f32 f260, f124, 0f3F431D0D, f1096; +fma.rn.f32 f1095, f1131, 0f3F6856DD, %59; +fma.rn.f32 f261, f1129, 0f3F25BB1C, f1095; +fma.rn.f32 f1094, f119, 0f3ED6FBB4, 0f00000000; +fma.rn.f32 f262, f123, 0f3F431D0D, f1094; +fma.rn.f32 f263, f125, 0f3E88F979, f259; +fma.rn.f32 f264, f128, 0f3F76AB36, f260; +fma.rn.f32 f265, f1127, 0f3E88F979, f261; +fma.rn.f32 f266, f127, 0f3F76AB36, f262; +fma.rn.f32 f267, f129, 0fBE25AA2E, f263; +fma.rn.f32 f268, f132, 0f3F7CA0AA, f264; +fma.rn.f32 f269, f1124, 0fBE25AA2E, f265; +fma.rn.f32 f270, f131, 0f3F7CA0AA, f266; +fma.rn.f32 f271, f133, 0fBF0FA9F5, f267; +fma.rn.f32 f272, f136, 0f3F53E36D, f268; +fma.rn.f32 f273, f1122, 0fBF0FA9F5, f269; +fma.rn.f32 f274, f135, 0f3F53E36D, f270; +fma.rn.f32 f275, f137, 0fBF5B5AFE, f271; +fma.rn.f32 f276, f140, 0f3F03FB56, f272; +fma.rn.f32 f277, f1120, 0fBF5B5AFE, f273; +fma.rn.f32 f278, f139, 0f3F03FB56, f274; +fma.rn.f32 f279, f141, 0fBF7E7FD3, f275; +fma.rn.f32 f280, f144, 0f3DDD6D81, f276; +fma.rn.f32 f281, f1117, 0fBF7E7FD3, f277; +fma.rn.f32 f282, f143, 0f3DDD6D81, f278; +fma.rn.f32 f283, f145, 0fBF729966, f279; +fma.rn.f32 f284, f148, 0fBEA37B7D, f280; +fma.rn.f32 f285, f1115, 0fBF729966, f281; +fma.rn.f32 f286, f147, 0fBEA37B7D, f282; +fma.rn.f32 f287, f149, 0fBF39DAD7, f283; +fma.rn.f32 f288, f152, 0fBF300D12, f284; +fma.rn.f32 f289, f1113, 0fBF39DAD7, f285; +fma.rn.f32 f290, f151, 0fBF300D12, f286; +fma.rn.f32 f291, f153, 0fBEBD82C0, f287; +fma.rn.f32 f292, f156, 0fBF6DD16B, f288; +fma.rn.f32 f293, f1110, 0fBEBD82C0, f289; +fma.rn.f32 f294, f155, 0fBF6DD16B, f290; +fma.rn.f32 f295, f157, 0f3D5DC0C3, f291; +fma.rn.f32 f296, f160, 0fBF7F9FE3, f292; +fma.rn.f32 f297, f1108, 0f3D5DC0C3, f293; +fma.rn.f32 f298, f159, 0fBF7F9FE3, f294; +fma.rn.f32 f299, f161, 0f3EEFD33B, f295; +fma.rn.f32 f300, f164, 0fBF622DD8, f296; +fma.rn.f32 f301, f1106, 0f3EEFD33B, f297; +fma.rn.f32 f302, f163, 0fBF622DD8, f298; 
+fma.rn.f32 f303, f165, 0f3F4BCCC1, f299; +fma.rn.f32 f304, f168, 0fBF1AECB3, f300; +fma.rn.f32 f305, f1103, 0f3F4BCCC1, f301; +fma.rn.f32 f306, f167, 0fBF1AECB3, f302; +fma.rn.f32 f307, f169, 0f3F7A03CE, f303; +fma.rn.f32 f308, f172, 0fBE5C2136, f304; +fma.rn.f32 f309, f1101, 0f3F7A03CE, f305; +fma.rn.f32 f310, f171, 0fBE5C2136, f306; +fma.rn.f32 f311, f117, 0f3F4BCCC1, %58; +fma.rn.f32 f315, f121, 0f3E88F979, f311; +fma.rn.f32 f1093, f120, 0f3F1AECB3, 0f00000000; +fma.rn.f32 f316, f124, 0f3F76AB36, f1093; +fma.rn.f32 f1092, f1131, 0f3F4BCCC1, %59; +fma.rn.f32 f317, f1129, 0f3E88F979, f1092; +fma.rn.f32 f1091, f119, 0f3F1AECB3, 0f00000000; +fma.rn.f32 f318, f123, 0f3F76AB36, f1091; +fma.rn.f32 f319, f125, 0fBEBD82C0, f315; +fma.rn.f32 f320, f128, 0f3F6DD16B, f316; +fma.rn.f32 f321, f1127, 0fBEBD82C0, f317; +fma.rn.f32 f322, f127, 0f3F6DD16B, f318; +fma.rn.f32 f323, f129, 0fBF5B5AFE, f319; +fma.rn.f32 f324, f132, 0f3F03FB56, f320; +fma.rn.f32 f325, f1124, 0fBF5B5AFE, f321; +fma.rn.f32 f326, f131, 0f3F03FB56, f322; +fma.rn.f32 f327, f133, 0fBF7E7FD3, f323; +fma.rn.f32 f328, f136, 0fBDDD6D81, f324; +fma.rn.f32 f329, f1122, 0fBF7E7FD3, f325; +fma.rn.f32 f330, f135, 0fBDDD6D81, f326; +fma.rn.f32 f331, f137, 0fBF39DAD7, f327; +fma.rn.f32 f332, f140, 0fBF300D12, f328; +fma.rn.f32 f333, f1120, 0fBF39DAD7, f329; +fma.rn.f32 f334, f139, 0fBF300D12, f330; +fma.rn.f32 f335, f141, 0fBE25AA2E, f331; +fma.rn.f32 f336, f144, 0fBF7CA0AA, f332; +fma.rn.f32 f337, f1117, 0fBE25AA2E, f333; +fma.rn.f32 f338, f143, 0fBF7CA0AA, f334; +fma.rn.f32 f339, f145, 0f3EEFD33B, f335; +fma.rn.f32 f340, f148, 0fBF622DD8, f336; +fma.rn.f32 f341, f1115, 0f3EEFD33B, f337; +fma.rn.f32 f342, f147, 0fBF622DD8, f338; +fma.rn.f32 f343, f149, 0f3F6856DD, f339; +fma.rn.f32 f344, f152, 0fBED6FBB4, f340; +fma.rn.f32 f345, f1113, 0f3F6856DD, f341; +fma.rn.f32 f346, f151, 0fBED6FBB4, f342; +fma.rn.f32 f347, f153, 0f3F7A03CE, f343; +fma.rn.f32 f348, f156, 0f3E5C2136, f344; +fma.rn.f32 f349, f1110, 0f3F7A03CE, 
f345; +fma.rn.f32 f350, f155, 0f3E5C2136, f346; +fma.rn.f32 f351, f157, 0f3F25BB1C, f347; +fma.rn.f32 f352, f160, 0f3F431D0D, f348; +fma.rn.f32 f353, f1108, 0f3F25BB1C, f349; +fma.rn.f32 f354, f159, 0f3F431D0D, f350; +fma.rn.f32 f355, f161, 0f3D5DC0C3, f351; +fma.rn.f32 f356, f164, 0f3F7F9FE3, f352; +fma.rn.f32 f357, f1106, 0f3D5DC0C3, f353; +fma.rn.f32 f358, f163, 0f3F7F9FE3, f354; +fma.rn.f32 f359, f165, 0fBF0FA9F5, f355; +fma.rn.f32 f360, f168, 0f3F53E36D, f356; +fma.rn.f32 f361, f1103, 0fBF0FA9F5, f357; +fma.rn.f32 f362, f167, 0f3F53E36D, f358; +fma.rn.f32 f363, f169, 0fBF729966, f359; +fma.rn.f32 f364, f172, 0f3EA37B7D, f360; +fma.rn.f32 f365, f1101, 0fBF729966, f361; +fma.rn.f32 f366, f171, 0f3EA37B7D, f362; +fma.rn.f32 f367, f117, 0f3F25BB1C, %58; +fma.rn.f32 f371, f121, 0fBE25AA2E, f367; +fma.rn.f32 f1090, f120, 0f3F431D0D, 0f00000000; +fma.rn.f32 f372, f124, 0f3F7CA0AA, f1090; +fma.rn.f32 f1089, f1131, 0f3F25BB1C, %59; +fma.rn.f32 f373, f1129, 0fBE25AA2E, f1089; +fma.rn.f32 f1088, f119, 0f3F431D0D, 0f00000000; +fma.rn.f32 f374, f123, 0f3F7CA0AA, f1088; +fma.rn.f32 f375, f125, 0fBF5B5AFE, f371; +fma.rn.f32 f376, f128, 0f3F03FB56, f372; +fma.rn.f32 f377, f1127, 0fBF5B5AFE, f373; +fma.rn.f32 f378, f127, 0f3F03FB56, f374; +fma.rn.f32 f379, f129, 0fBF729966, f375; +fma.rn.f32 f380, f132, 0fBEA37B7D, f376; +fma.rn.f32 f381, f1124, 0fBF729966, f377; +fma.rn.f32 f382, f131, 0fBEA37B7D, f378; +fma.rn.f32 f383, f133, 0fBEBD82C0, f379; +fma.rn.f32 f384, f136, 0fBF6DD16B, f380; +fma.rn.f32 f385, f1122, 0fBEBD82C0, f381; +fma.rn.f32 f386, f135, 0fBF6DD16B, f382; +fma.rn.f32 f387, f137, 0f3EEFD33B, f383; +fma.rn.f32 f388, f140, 0fBF622DD8, f384; +fma.rn.f32 f389, f1120, 0f3EEFD33B, f385; +fma.rn.f32 f390, f139, 0fBF622DD8, f386; +fma.rn.f32 f391, f141, 0f3F7A03CE, f387; +fma.rn.f32 f392, f144, 0fBE5C2136, f388; +fma.rn.f32 f393, f1117, 0f3F7A03CE, f389; +fma.rn.f32 f394, f143, 0fBE5C2136, f390; +fma.rn.f32 f395, f145, 0f3F4BCCC1, f391; +fma.rn.f32 f396, f148, 
0f3F1AECB3, f392; +fma.rn.f32 f397, f1115, 0f3F4BCCC1, f393; +fma.rn.f32 f398, f147, 0f3F1AECB3, f394; +fma.rn.f32 f399, f149, 0f3D5DC0C3, f395; +fma.rn.f32 f400, f152, 0f3F7F9FE3, f396; +fma.rn.f32 f401, f1113, 0f3D5DC0C3, f397; +fma.rn.f32 f402, f151, 0f3F7F9FE3, f398; +fma.rn.f32 f403, f153, 0fBF39DAD7, f399; +fma.rn.f32 f404, f156, 0f3F300D12, f400; +fma.rn.f32 f405, f1110, 0fBF39DAD7, f401; +fma.rn.f32 f406, f155, 0f3F300D12, f402; +fma.rn.f32 f407, f157, 0fBF7E7FD3, f403; +fma.rn.f32 f408, f160, 0fBDDD6D81, f404; +fma.rn.f32 f409, f1108, 0fBF7E7FD3, f405; +fma.rn.f32 f410, f159, 0fBDDD6D81, f406; +fma.rn.f32 f411, f161, 0fBF0FA9F5, f407; +fma.rn.f32 f412, f164, 0fBF53E36D, f408; +fma.rn.f32 f413, f1106, 0fBF0FA9F5, f409; +fma.rn.f32 f414, f163, 0fBF53E36D, f410; +fma.rn.f32 f415, f165, 0f3E88F979, f411; +fma.rn.f32 f416, f168, 0fBF76AB36, f412; +fma.rn.f32 f417, f1103, 0f3E88F979, f413; +fma.rn.f32 f418, f167, 0fBF76AB36, f414; +fma.rn.f32 f419, f169, 0f3F6856DD, f415; +fma.rn.f32 f420, f172, 0fBED6FBB4, f416; +fma.rn.f32 f421, f1101, 0f3F6856DD, f417; +fma.rn.f32 f422, f171, 0fBED6FBB4, f418; +fma.rn.f32 f423, f117, 0f3EEFD33B, %58; +fma.rn.f32 f427, f121, 0fBF0FA9F5, f423; +fma.rn.f32 f1087, f120, 0f3F622DD8, 0f00000000; +fma.rn.f32 f428, f124, 0f3F53E36D, f1087; +fma.rn.f32 f1086, f1131, 0f3EEFD33B, %59; +fma.rn.f32 f429, f1129, 0fBF0FA9F5, f1086; +fma.rn.f32 f1085, f119, 0f3F622DD8, 0f00000000; +fma.rn.f32 f430, f123, 0f3F53E36D, f1085; +fma.rn.f32 f431, f125, 0fBF7E7FD3, f427; +fma.rn.f32 f432, f128, 0fBDDD6D81, f428; +fma.rn.f32 f433, f1127, 0fBF7E7FD3, f429; +fma.rn.f32 f434, f127, 0fBDDD6D81, f430; +fma.rn.f32 f435, f129, 0fBEBD82C0, f431; +fma.rn.f32 f436, f132, 0fBF6DD16B, f432; +fma.rn.f32 f437, f1124, 0fBEBD82C0, f433; +fma.rn.f32 f438, f131, 0fBF6DD16B, f434; +fma.rn.f32 f439, f133, 0f3F25BB1C, f435; +fma.rn.f32 f440, f136, 0fBF431D0D, f436; +fma.rn.f32 f441, f1122, 0f3F25BB1C, f437; +fma.rn.f32 f442, f135, 0fBF431D0D, f438; +fma.rn.f32 f443, 
f137, 0f3F7A03CE, f439; +fma.rn.f32 f444, f140, 0f3E5C2136, f440; +fma.rn.f32 f445, f1120, 0f3F7A03CE, f441; +fma.rn.f32 f446, f139, 0f3E5C2136, f442; +fma.rn.f32 f447, f141, 0f3E88F979, f443; +fma.rn.f32 f448, f144, 0f3F76AB36, f444; +fma.rn.f32 f449, f1117, 0f3E88F979, f445; +fma.rn.f32 f450, f143, 0f3F76AB36, f446; +fma.rn.f32 f451, f145, 0fBF39DAD7, f447; +fma.rn.f32 f452, f148, 0f3F300D12, f448; +fma.rn.f32 f453, f1115, 0fBF39DAD7, f449; +fma.rn.f32 f454, f147, 0f3F300D12, f450; +fma.rn.f32 f455, f149, 0fBF729966, f451; +fma.rn.f32 f456, f152, 0fBEA37B7D, f452; +fma.rn.f32 f457, f1113, 0fBF729966, f453; +fma.rn.f32 f458, f151, 0fBEA37B7D, f454; +fma.rn.f32 f459, f153, 0fBE25AA2E, f455; +fma.rn.f32 f460, f156, 0fBF7CA0AA, f456; +fma.rn.f32 f461, f1110, 0fBE25AA2E, f457; +fma.rn.f32 f462, f155, 0fBF7CA0AA, f458; +fma.rn.f32 f463, f157, 0f3F4BCCC1, f459; +fma.rn.f32 f464, f160, 0fBF1AECB3, f460; +fma.rn.f32 f465, f1108, 0f3F4BCCC1, f461; +fma.rn.f32 f466, f159, 0fBF1AECB3, f462; +fma.rn.f32 f467, f161, 0f3F6856DD, f463; +fma.rn.f32 f468, f164, 0f3ED6FBB4, f464; +fma.rn.f32 f469, f1106, 0f3F6856DD, f465; +fma.rn.f32 f470, f163, 0f3ED6FBB4, f466; +fma.rn.f32 f471, f165, 0f3D5DC0C3, f467; +fma.rn.f32 f472, f168, 0f3F7F9FE3, f468; +fma.rn.f32 f473, f1103, 0f3D5DC0C3, f469; +fma.rn.f32 f474, f167, 0f3F7F9FE3, f470; +fma.rn.f32 f475, f169, 0fBF5B5AFE, f471; +fma.rn.f32 f476, f172, 0f3F03FB56, f472; +fma.rn.f32 f477, f1101, 0fBF5B5AFE, f473; +fma.rn.f32 f478, f171, 0f3F03FB56, f474; +fma.rn.f32 f479, f117, 0f3E88F979, %58; +fma.rn.f32 f483, f121, 0fBF5B5AFE, f479; +fma.rn.f32 f1084, f120, 0f3F76AB36, 0f00000000; +fma.rn.f32 f484, f124, 0f3F03FB56, f1084; +fma.rn.f32 f1083, f1131, 0f3E88F979, %59; +fma.rn.f32 f485, f1129, 0fBF5B5AFE, f1083; +fma.rn.f32 f1082, f119, 0f3F76AB36, 0f00000000; +fma.rn.f32 f486, f123, 0f3F03FB56, f1082; +fma.rn.f32 f487, f125, 0fBF39DAD7, f483; +fma.rn.f32 f488, f128, 0fBF300D12, f484; +fma.rn.f32 f489, f1127, 0fBF39DAD7, f485; +fma.rn.f32 
f490, f127, 0fBF300D12, f486; +fma.rn.f32 f491, f129, 0f3EEFD33B, f487; +fma.rn.f32 f492, f132, 0fBF622DD8, f488; +fma.rn.f32 f493, f1124, 0f3EEFD33B, f489; +fma.rn.f32 f494, f131, 0fBF622DD8, f490; +fma.rn.f32 f495, f133, 0f3F7A03CE, f491; +fma.rn.f32 f496, f136, 0f3E5C2136, f492; +fma.rn.f32 f497, f1122, 0f3F7A03CE, f493; +fma.rn.f32 f498, f135, 0f3E5C2136, f494; +fma.rn.f32 f499, f137, 0f3D5DC0C3, f495; +fma.rn.f32 f500, f140, 0f3F7F9FE3, f496; +fma.rn.f32 f501, f1120, 0f3D5DC0C3, f497; +fma.rn.f32 f502, f139, 0f3F7F9FE3, f498; +fma.rn.f32 f503, f141, 0fBF729966, f499; +fma.rn.f32 f504, f144, 0f3EA37B7D, f500; +fma.rn.f32 f505, f1117, 0fBF729966, f501; +fma.rn.f32 f506, f143, 0f3EA37B7D, f502; +fma.rn.f32 f507, f145, 0fBF0FA9F5, f503; +fma.rn.f32 f508, f148, 0fBF53E36D, f504; +fma.rn.f32 f509, f1115, 0fBF0FA9F5, f505; +fma.rn.f32 f510, f147, 0fBF53E36D, f506; +fma.rn.f32 f511, f149, 0f3F25BB1C, f507; +fma.rn.f32 f512, f152, 0fBF431D0D, f508; +fma.rn.f32 f513, f1113, 0f3F25BB1C, f509; +fma.rn.f32 f514, f151, 0fBF431D0D, f510; +fma.rn.f32 f515, f153, 0f3F6856DD, f511; +fma.rn.f32 f516, f156, 0f3ED6FBB4, f512; +fma.rn.f32 f517, f1110, 0f3F6856DD, f513; +fma.rn.f32 f518, f155, 0f3ED6FBB4, f514; +fma.rn.f32 f519, f157, 0fBE25AA2E, f515; +fma.rn.f32 f520, f160, 0f3F7CA0AA, f516; +fma.rn.f32 f521, f1108, 0fBE25AA2E, f517; +fma.rn.f32 f522, f159, 0f3F7CA0AA, f518; +fma.rn.f32 f523, f161, 0fBF7E7FD3, f519; +fma.rn.f32 f524, f164, 0f3DDD6D81, f520; +fma.rn.f32 f525, f1106, 0fBF7E7FD3, f521; +fma.rn.f32 f526, f163, 0f3DDD6D81, f522; +fma.rn.f32 f527, f165, 0fBEBD82C0, f523; +fma.rn.f32 f528, f168, 0fBF6DD16B, f524; +fma.rn.f32 f529, f1103, 0fBEBD82C0, f525; +fma.rn.f32 f530, f167, 0fBF6DD16B, f526; +fma.rn.f32 f531, f169, 0f3F4BCCC1, f527; +fma.rn.f32 f532, f172, 0fBF1AECB3, f528; +fma.rn.f32 f533, f1101, 0f3F4BCCC1, f529; +fma.rn.f32 f534, f171, 0fBF1AECB3, f530; +fma.rn.f32 f535, f117, 0f3D5DC0C3, %58; +fma.rn.f32 f539, f121, 0fBF7E7FD3, f535; +fma.rn.f32 f1081, f120, 
0f3F7F9FE3, 0f00000000; +fma.rn.f32 f540, f124, 0f3DDD6D81, f1081; +fma.rn.f32 f1080, f1131, 0f3D5DC0C3, %59; +fma.rn.f32 f541, f1129, 0fBF7E7FD3, f1080; +fma.rn.f32 f1079, f119, 0f3F7F9FE3, 0f00000000; +fma.rn.f32 f542, f123, 0f3DDD6D81, f1079; +fma.rn.f32 f543, f125, 0fBE25AA2E, f539; +fma.rn.f32 f544, f128, 0fBF7CA0AA, f540; +fma.rn.f32 f545, f1127, 0fBE25AA2E, f541; +fma.rn.f32 f546, f127, 0fBF7CA0AA, f542; +fma.rn.f32 f547, f129, 0f3F7A03CE, f543; +fma.rn.f32 f548, f132, 0fBE5C2136, f544; +fma.rn.f32 f549, f1124, 0f3F7A03CE, f545; +fma.rn.f32 f550, f131, 0fBE5C2136, f546; +fma.rn.f32 f551, f133, 0f3E88F979, f547; +fma.rn.f32 f552, f136, 0f3F76AB36, f548; +fma.rn.f32 f553, f1122, 0f3E88F979, f549; +fma.rn.f32 f554, f135, 0f3F76AB36, f550; +fma.rn.f32 f555, f137, 0fBF729966, f551; +fma.rn.f32 f556, f140, 0f3EA37B7D, f552; +fma.rn.f32 f557, f1120, 0fBF729966, f553; +fma.rn.f32 f558, f139, 0f3EA37B7D, f554; +fma.rn.f32 f559, f141, 0fBEBD82C0, f555; +fma.rn.f32 f560, f144, 0fBF6DD16B, f556; +fma.rn.f32 f561, f1117, 0fBEBD82C0, f557; +fma.rn.f32 f562, f143, 0fBF6DD16B, f558; +fma.rn.f32 f563, f145, 0f3F6856DD, f559; +fma.rn.f32 f564, f148, 0fBED6FBB4, f560; +fma.rn.f32 f565, f1115, 0f3F6856DD, f561; +fma.rn.f32 f566, f147, 0fBED6FBB4, f562; +fma.rn.f32 f567, f149, 0f3EEFD33B, f563; +fma.rn.f32 f568, f152, 0f3F622DD8, f564; +fma.rn.f32 f569, f1113, 0f3EEFD33B, f565; +fma.rn.f32 f570, f151, 0f3F622DD8, f566; +fma.rn.f32 f571, f153, 0fBF5B5AFE, f567; +fma.rn.f32 f572, f156, 0f3F03FB56, f568; +fma.rn.f32 f573, f1110, 0fBF5B5AFE, f569; +fma.rn.f32 f574, f155, 0f3F03FB56, f570; +fma.rn.f32 f575, f157, 0fBF0FA9F5, f571; +fma.rn.f32 f576, f160, 0fBF53E36D, f572; +fma.rn.f32 f577, f1108, 0fBF0FA9F5, f573; +fma.rn.f32 f578, f159, 0fBF53E36D, f574; +fma.rn.f32 f579, f161, 0f3F4BCCC1, f575; +fma.rn.f32 f580, f164, 0fBF1AECB3, f576; +fma.rn.f32 f581, f1106, 0f3F4BCCC1, f577; +fma.rn.f32 f582, f163, 0fBF1AECB3, f578; +fma.rn.f32 f583, f165, 0f3F25BB1C, f579; +fma.rn.f32 f584, 
f168, 0f3F431D0D, f580; +fma.rn.f32 f585, f1103, 0f3F25BB1C, f581; +fma.rn.f32 f586, f167, 0f3F431D0D, f582; +fma.rn.f32 f587, f169, 0fBF39DAD7, f583; +fma.rn.f32 f588, f172, 0f3F300D12, f584; +fma.rn.f32 f589, f1101, 0fBF39DAD7, f585; +fma.rn.f32 f590, f171, 0f3F300D12, f586; +fma.rn.f32 f591, f117, 0fBE25AA2E, %58; +fma.rn.f32 f595, f121, 0fBF729966, f591; +fma.rn.f32 f1078, f120, 0f3F7CA0AA, 0f00000000; +fma.rn.f32 f596, f124, 0fBEA37B7D, f1078; +fma.rn.f32 f1077, f1131, 0fBE25AA2E, %59; +fma.rn.f32 f597, f1129, 0fBF729966, f1077; +fma.rn.f32 f1076, f119, 0f3F7CA0AA, 0f00000000; +fma.rn.f32 f598, f123, 0fBEA37B7D, f1076; +fma.rn.f32 f599, f125, 0f3EEFD33B, f595; +fma.rn.f32 f600, f128, 0fBF622DD8, f596; +fma.rn.f32 f601, f1127, 0f3EEFD33B, f597; +fma.rn.f32 f602, f127, 0fBF622DD8, f598; +fma.rn.f32 f603, f129, 0f3F4BCCC1, f599; +fma.rn.f32 f604, f132, 0f3F1AECB3, f600; +fma.rn.f32 f605, f1124, 0f3F4BCCC1, f601; +fma.rn.f32 f606, f131, 0f3F1AECB3, f602; +fma.rn.f32 f607, f133, 0fBF39DAD7, f603; +fma.rn.f32 f608, f136, 0f3F300D12, f604; +fma.rn.f32 f609, f1122, 0fBF39DAD7, f605; +fma.rn.f32 f610, f135, 0f3F300D12, f606; +fma.rn.f32 f611, f137, 0fBF0FA9F5, f607; +fma.rn.f32 f612, f140, 0fBF53E36D, f608; +fma.rn.f32 f613, f1120, 0fBF0FA9F5, f609; +fma.rn.f32 f614, f139, 0fBF53E36D, f610; +fma.rn.f32 f615, f141, 0f3F6856DD, f611; +fma.rn.f32 f616, f144, 0fBED6FBB4, f612; +fma.rn.f32 f617, f1117, 0f3F6856DD, f613; +fma.rn.f32 f618, f143, 0fBED6FBB4, f614; +fma.rn.f32 f619, f145, 0f3E88F979, f615; +fma.rn.f32 f620, f148, 0f3F76AB36, f616; +fma.rn.f32 f621, f1115, 0f3E88F979, f617; +fma.rn.f32 f622, f147, 0f3F76AB36, f618; +fma.rn.f32 f623, f149, 0fBF7E7FD3, f619; +fma.rn.f32 f624, f152, 0f3DDD6D81, f620; +fma.rn.f32 f625, f1113, 0fBF7E7FD3, f621; +fma.rn.f32 f626, f151, 0f3DDD6D81, f622; +fma.rn.f32 f627, f153, 0f3D5DC0C3, f623; +fma.rn.f32 f628, f156, 0fBF7F9FE3, f624; +fma.rn.f32 f629, f1110, 0f3D5DC0C3, f625; +fma.rn.f32 f630, f155, 0fBF7F9FE3, f626; +fma.rn.f32 
f631, f157, 0f3F7A03CE, f627; +fma.rn.f32 f632, f160, 0f3E5C2136, f628; +fma.rn.f32 f633, f1108, 0f3F7A03CE, f629; +fma.rn.f32 f634, f159, 0f3E5C2136, f630; +fma.rn.f32 f635, f161, 0fBEBD82C0, f631; +fma.rn.f32 f636, f164, 0f3F6DD16B, f632; +fma.rn.f32 f637, f1106, 0fBEBD82C0, f633; +fma.rn.f32 f638, f163, 0f3F6DD16B, f634; +fma.rn.f32 f639, f165, 0fBF5B5AFE, f635; +fma.rn.f32 f640, f168, 0fBF03FB56, f636; +fma.rn.f32 f641, f1103, 0fBF5B5AFE, f637; +fma.rn.f32 f642, f167, 0fBF03FB56, f638; +fma.rn.f32 f643, f169, 0f3F25BB1C, f639; +fma.rn.f32 f644, f172, 0fBF431D0D, f640; +fma.rn.f32 f645, f1101, 0f3F25BB1C, f641; +fma.rn.f32 f646, f171, 0fBF431D0D, f642; +fma.rn.f32 f647, f117, 0fBEBD82C0, %58; +fma.rn.f32 f651, f121, 0fBF39DAD7, f647; +fma.rn.f32 f1075, f120, 0f3F6DD16B, 0f00000000; +fma.rn.f32 f652, f124, 0fBF300D12, f1075; +fma.rn.f32 f1074, f1131, 0fBEBD82C0, %59; +fma.rn.f32 f653, f1129, 0fBF39DAD7, f1074; +fma.rn.f32 f1073, f119, 0f3F6DD16B, 0f00000000; +fma.rn.f32 f654, f123, 0fBF300D12, f1073; +fma.rn.f32 f655, f125, 0f3F6856DD, f651; +fma.rn.f32 f656, f128, 0fBED6FBB4, f652; +fma.rn.f32 f657, f1127, 0f3F6856DD, f653; +fma.rn.f32 f658, f127, 0fBED6FBB4, f654; +fma.rn.f32 f659, f129, 0f3D5DC0C3, f655; +fma.rn.f32 f660, f132, 0f3F7F9FE3, f656; +fma.rn.f32 f661, f1124, 0f3D5DC0C3, f657; +fma.rn.f32 f662, f131, 0f3F7F9FE3, f658; +fma.rn.f32 f663, f133, 0fBF729966, f659; +fma.rn.f32 f664, f136, 0fBEA37B7D, f660; +fma.rn.f32 f665, f1122, 0fBF729966, f661; +fma.rn.f32 f666, f135, 0fBEA37B7D, f662; +fma.rn.f32 f667, f137, 0f3F25BB1C, f663; +fma.rn.f32 f668, f140, 0fBF431D0D, f664; +fma.rn.f32 f669, f1120, 0f3F25BB1C, f665; +fma.rn.f32 f670, f139, 0fBF431D0D, f666; +fma.rn.f32 f671, f141, 0f3EEFD33B, f667; +fma.rn.f32 f672, f144, 0f3F622DD8, f668; +fma.rn.f32 f673, f1117, 0f3EEFD33B, f669; +fma.rn.f32 f674, f143, 0f3F622DD8, f670; +fma.rn.f32 f675, f145, 0fBF7E7FD3, f671; +fma.rn.f32 f676, f148, 0f3DDD6D81, f672; +fma.rn.f32 f677, f1115, 0fBF7E7FD3, f673; 
+fma.rn.f32 f678, f147, 0f3DDD6D81, f674; +fma.rn.f32 f679, f149, 0f3E88F979, f675; +fma.rn.f32 f680, f152, 0fBF76AB36, f676; +fma.rn.f32 f681, f1113, 0f3E88F979, f677; +fma.rn.f32 f682, f151, 0fBF76AB36, f678; +fma.rn.f32 f683, f153, 0f3F4BCCC1, f679; +fma.rn.f32 f684, f156, 0f3F1AECB3, f680; +fma.rn.f32 f685, f1110, 0f3F4BCCC1, f681; +fma.rn.f32 f686, f155, 0f3F1AECB3, f682; +fma.rn.f32 f687, f157, 0fBF5B5AFE, f683; +fma.rn.f32 f688, f160, 0f3F03FB56, f684; +fma.rn.f32 f689, f1108, 0fBF5B5AFE, f685; +fma.rn.f32 f690, f159, 0f3F03FB56, f686; +fma.rn.f32 f691, f161, 0fBE25AA2E, f687; +fma.rn.f32 f692, f164, 0fBF7CA0AA, f688; +fma.rn.f32 f693, f1106, 0fBE25AA2E, f689; +fma.rn.f32 f694, f163, 0fBF7CA0AA, f690; +fma.rn.f32 f695, f165, 0f3F7A03CE, f691; +fma.rn.f32 f696, f168, 0f3E5C2136, f692; +fma.rn.f32 f697, f1103, 0f3F7A03CE, f693; +fma.rn.f32 f698, f167, 0f3E5C2136, f694; +fma.rn.f32 f699, f169, 0fBF0FA9F5, f695; +fma.rn.f32 f700, f172, 0f3F53E36D, f696; +fma.rn.f32 f701, f1101, 0fBF0FA9F5, f697; +fma.rn.f32 f702, f171, 0f3F53E36D, f698; +fma.rn.f32 f703, f117, 0fBF0FA9F5, %58; +fma.rn.f32 f707, f121, 0fBEBD82C0, f703; +fma.rn.f32 f1072, f120, 0f3F53E36D, 0f00000000; +fma.rn.f32 f708, f124, 0fBF6DD16B, f1072; +fma.rn.f32 f1071, f1131, 0fBF0FA9F5, %59; +fma.rn.f32 f709, f1129, 0fBEBD82C0, f1071; +fma.rn.f32 f1070, f119, 0f3F53E36D, 0f00000000; +fma.rn.f32 f710, f123, 0fBF6DD16B, f1070; +fma.rn.f32 f711, f125, 0f3F7A03CE, f707; +fma.rn.f32 f712, f128, 0f3E5C2136, f708; +fma.rn.f32 f713, f1127, 0f3F7A03CE, f709; +fma.rn.f32 f714, f127, 0f3E5C2136, f710; +fma.rn.f32 f715, f129, 0fBF39DAD7, f711; +fma.rn.f32 f716, f132, 0f3F300D12, f712; +fma.rn.f32 f717, f1124, 0fBF39DAD7, f713; +fma.rn.f32 f718, f131, 0f3F300D12, f714; +fma.rn.f32 f719, f133, 0fBE25AA2E, f715; +fma.rn.f32 f720, f136, 0fBF7CA0AA, f716; +fma.rn.f32 f721, f1122, 0fBE25AA2E, f717; +fma.rn.f32 f722, f135, 0fBF7CA0AA, f718; +fma.rn.f32 f723, f137, 0f3F6856DD, f719; +fma.rn.f32 f724, f140, 0f3ED6FBB4, 
f720; +fma.rn.f32 f725, f1120, 0f3F6856DD, f721; +fma.rn.f32 f726, f139, 0f3ED6FBB4, f722; +fma.rn.f32 f727, f141, 0fBF5B5AFE, f723; +fma.rn.f32 f728, f144, 0f3F03FB56, f724; +fma.rn.f32 f729, f1117, 0fBF5B5AFE, f725; +fma.rn.f32 f730, f143, 0f3F03FB56, f726; +fma.rn.f32 f731, f145, 0f3D5DC0C3, f727; +fma.rn.f32 f732, f148, 0fBF7F9FE3, f728; +fma.rn.f32 f733, f1115, 0f3D5DC0C3, f729; +fma.rn.f32 f734, f147, 0fBF7F9FE3, f730; +fma.rn.f32 f735, f149, 0f3F4BCCC1, f731; +fma.rn.f32 f736, f152, 0f3F1AECB3, f732; +fma.rn.f32 f737, f1113, 0f3F4BCCC1, f733; +fma.rn.f32 f738, f151, 0f3F1AECB3, f734; +fma.rn.f32 f739, f153, 0fBF729966, f735; +fma.rn.f32 f740, f156, 0f3EA37B7D, f736; +fma.rn.f32 f741, f1110, 0fBF729966, f737; +fma.rn.f32 f742, f155, 0f3EA37B7D, f738; +fma.rn.f32 f743, f157, 0f3E88F979, f739; +fma.rn.f32 f744, f160, 0fBF76AB36, f740; +fma.rn.f32 f745, f1108, 0f3E88F979, f741; +fma.rn.f32 f746, f159, 0fBF76AB36, f742; +fma.rn.f32 f747, f161, 0f3F25BB1C, f743; +fma.rn.f32 f748, f164, 0f3F431D0D, f744; +fma.rn.f32 f749, f1106, 0f3F25BB1C, f745; +fma.rn.f32 f750, f163, 0f3F431D0D, f746; +fma.rn.f32 f751, f165, 0fBF7E7FD3, f747; +fma.rn.f32 f752, f168, 0f3DDD6D81, f748; +fma.rn.f32 f753, f1103, 0fBF7E7FD3, f749; +fma.rn.f32 f754, f167, 0f3DDD6D81, f750; +fma.rn.f32 f755, f169, 0f3EEFD33B, f751; +fma.rn.f32 f756, f172, 0fBF622DD8, f752; +fma.rn.f32 f757, f1101, 0f3EEFD33B, f753; +fma.rn.f32 f758, f171, 0fBF622DD8, f754; +fma.rn.f32 f759, f117, 0fBF39DAD7, %58; +fma.rn.f32 f763, f121, 0f3D5DC0C3, f759; +fma.rn.f32 f1069, f120, 0f3F300D12, 0f00000000; +fma.rn.f32 f764, f124, 0fBF7F9FE3, f1069; +fma.rn.f32 f1068, f1131, 0fBF39DAD7, %59; +fma.rn.f32 f765, f1129, 0f3D5DC0C3, f1068; +fma.rn.f32 f1067, f119, 0f3F300D12, 0f00000000; +fma.rn.f32 f766, f123, 0fBF7F9FE3, f1067; +fma.rn.f32 f767, f125, 0f3F25BB1C, f763; +fma.rn.f32 f768, f128, 0f3F431D0D, f764; +fma.rn.f32 f769, f1127, 0f3F25BB1C, f765; +fma.rn.f32 f770, f127, 0f3F431D0D, f766; +fma.rn.f32 f771, f129, 
0fBF7E7FD3, f767; +fma.rn.f32 f772, f132, 0fBDDD6D81, f768; +fma.rn.f32 f773, f1124, 0fBF7E7FD3, f769; +fma.rn.f32 f774, f131, 0fBDDD6D81, f770; +fma.rn.f32 f775, f133, 0f3F4BCCC1, f771; +fma.rn.f32 f776, f136, 0fBF1AECB3, f772; +fma.rn.f32 f777, f1122, 0f3F4BCCC1, f773; +fma.rn.f32 f778, f135, 0fBF1AECB3, f774; +fma.rn.f32 f779, f137, 0fBE25AA2E, f775; +fma.rn.f32 f780, f140, 0f3F7CA0AA, f776; +fma.rn.f32 f781, f1120, 0fBE25AA2E, f777; +fma.rn.f32 f782, f139, 0f3F7CA0AA, f778; +fma.rn.f32 f783, f141, 0fBF0FA9F5, f779; +fma.rn.f32 f784, f144, 0fBF53E36D, f780; +fma.rn.f32 f785, f1117, 0fBF0FA9F5, f781; +fma.rn.f32 f786, f143, 0fBF53E36D, f782; +fma.rn.f32 f787, f145, 0f3F7A03CE, f783; +fma.rn.f32 f788, f148, 0f3E5C2136, f784; +fma.rn.f32 f789, f1115, 0f3F7A03CE, f785; +fma.rn.f32 f790, f147, 0f3E5C2136, f786; +fma.rn.f32 f791, f149, 0fBF5B5AFE, f787; +fma.rn.f32 f792, f152, 0f3F03FB56, f788; +fma.rn.f32 f793, f1113, 0fBF5B5AFE, f789; +fma.rn.f32 f794, f151, 0f3F03FB56, f790; +fma.rn.f32 f795, f153, 0f3E88F979, f791; +fma.rn.f32 f796, f156, 0fBF76AB36, f792; +fma.rn.f32 f797, f1110, 0f3E88F979, f793; +fma.rn.f32 f798, f155, 0fBF76AB36, f794; +fma.rn.f32 f799, f157, 0f3EEFD33B, f795; +fma.rn.f32 f800, f160, 0f3F622DD8, f796; +fma.rn.f32 f801, f1108, 0f3EEFD33B, f797; +fma.rn.f32 f802, f159, 0f3F622DD8, f798; +fma.rn.f32 f803, f161, 0fBF729966, f799; +fma.rn.f32 f804, f164, 0fBEA37B7D, f800; +fma.rn.f32 f805, f1106, 0fBF729966, f801; +fma.rn.f32 f806, f163, 0fBEA37B7D, f802; +fma.rn.f32 f807, f165, 0f3F6856DD, f803; +fma.rn.f32 f808, f168, 0fBED6FBB4, f804; +fma.rn.f32 f809, f1103, 0f3F6856DD, f805; +fma.rn.f32 f810, f167, 0fBED6FBB4, f806; +fma.rn.f32 f811, f169, 0fBEBD82C0, f807; +fma.rn.f32 f812, f172, 0f3F6DD16B, f808; +fma.rn.f32 f813, f1101, 0fBEBD82C0, f809; +fma.rn.f32 f814, f171, 0f3F6DD16B, f810; +fma.rn.f32 f815, f117, 0fBF5B5AFE, %58; +fma.rn.f32 f819, f121, 0f3EEFD33B, f815; +fma.rn.f32 f1066, f120, 0f3F03FB56, 0f00000000; +fma.rn.f32 f820, f124, 
0fBF622DD8, f1066; +fma.rn.f32 f1065, f1131, 0fBF5B5AFE, %59; +fma.rn.f32 f821, f1129, 0f3EEFD33B, f1065; +fma.rn.f32 f1064, f119, 0f3F03FB56, 0f00000000; +fma.rn.f32 f822, f123, 0fBF622DD8, f1064; +fma.rn.f32 f823, f125, 0f3D5DC0C3, f819; +fma.rn.f32 f824, f128, 0f3F7F9FE3, f820; +fma.rn.f32 f825, f1127, 0f3D5DC0C3, f821; +fma.rn.f32 f826, f127, 0f3F7F9FE3, f822; +fma.rn.f32 f827, f129, 0fBF0FA9F5, f823; +fma.rn.f32 f828, f132, 0fBF53E36D, f824; +fma.rn.f32 f829, f1124, 0fBF0FA9F5, f825; +fma.rn.f32 f830, f131, 0fBF53E36D, f826; +fma.rn.f32 f831, f133, 0f3F6856DD, f827; +fma.rn.f32 f832, f136, 0f3ED6FBB4, f828; +fma.rn.f32 f833, f1122, 0f3F6856DD, f829; +fma.rn.f32 f834, f135, 0f3ED6FBB4, f830; +fma.rn.f32 f835, f137, 0fBF7E7FD3, f831; +fma.rn.f32 f836, f140, 0f3DDD6D81, f832; +fma.rn.f32 f837, f1120, 0fBF7E7FD3, f833; +fma.rn.f32 f838, f139, 0f3DDD6D81, f834; +fma.rn.f32 f839, f141, 0f3F4BCCC1, f835; +fma.rn.f32 f840, f144, 0fBF1AECB3, f836; +fma.rn.f32 f841, f1117, 0f3F4BCCC1, f837; +fma.rn.f32 f842, f143, 0fBF1AECB3, f838; +fma.rn.f32 f843, f145, 0fBEBD82C0, f839; +fma.rn.f32 f844, f148, 0f3F6DD16B, f840; +fma.rn.f32 f845, f1115, 0fBEBD82C0, f841; +fma.rn.f32 f846, f147, 0f3F6DD16B, f842; +fma.rn.f32 f847, f149, 0fBE25AA2E, f843; +fma.rn.f32 f848, f152, 0fBF7CA0AA, f844; +fma.rn.f32 f849, f1113, 0fBE25AA2E, f845; +fma.rn.f32 f850, f151, 0fBF7CA0AA, f846; +fma.rn.f32 f851, f153, 0f3F25BB1C, f847; +fma.rn.f32 f852, f156, 0f3F431D0D, f848; +fma.rn.f32 f853, f1110, 0f3F25BB1C, f849; +fma.rn.f32 f854, f155, 0f3F431D0D, f850; +fma.rn.f32 f855, f157, 0fBF729966, f851; +fma.rn.f32 f856, f160, 0fBEA37B7D, f852; +fma.rn.f32 f857, f1108, 0fBF729966, f853; +fma.rn.f32 f858, f159, 0fBEA37B7D, f854; +fma.rn.f32 f859, f161, 0f3F7A03CE, f855; +fma.rn.f32 f860, f164, 0fBE5C2136, f856; +fma.rn.f32 f861, f1106, 0f3F7A03CE, f857; +fma.rn.f32 f862, f163, 0fBE5C2136, f858; +fma.rn.f32 f863, f165, 0fBF39DAD7, f859; +fma.rn.f32 f864, f168, 0f3F300D12, f860; +fma.rn.f32 f865, f1103, 
0fBF39DAD7, f861; +fma.rn.f32 f866, f167, 0f3F300D12, f862; +fma.rn.f32 f867, f169, 0f3E88F979, f863; +fma.rn.f32 f868, f172, 0fBF76AB36, f864; +fma.rn.f32 f869, f1101, 0f3E88F979, f865; +fma.rn.f32 f870, f171, 0fBF76AB36, f866; +fma.rn.f32 f871, f117, 0fBF729966, %58; +fma.rn.f32 f875, f121, 0f3F4BCCC1, f871; +fma.rn.f32 f1063, f120, 0f3EA37B7D, 0f00000000; +fma.rn.f32 f876, f124, 0fBF1AECB3, f1063; +fma.rn.f32 f1062, f1131, 0fBF729966, %59; +fma.rn.f32 f877, f1129, 0f3F4BCCC1, f1062; +fma.rn.f32 f1061, f119, 0f3EA37B7D, 0f00000000; +fma.rn.f32 f878, f123, 0fBF1AECB3, f1061; +fma.rn.f32 f879, f125, 0fBF0FA9F5, f875; +fma.rn.f32 f880, f128, 0f3F53E36D, f876; +fma.rn.f32 f881, f1127, 0fBF0FA9F5, f877; +fma.rn.f32 f882, f127, 0f3F53E36D, f878; +fma.rn.f32 f883, f129, 0f3E88F979, f879; +fma.rn.f32 f884, f132, 0fBF76AB36, f880; +fma.rn.f32 f885, f1124, 0f3E88F979, f881; +fma.rn.f32 f886, f131, 0fBF76AB36, f882; +fma.rn.f32 f887, f133, 0f3D5DC0C3, f883; +fma.rn.f32 f888, f136, 0f3F7F9FE3, f884; +fma.rn.f32 f889, f1122, 0f3D5DC0C3, f885; +fma.rn.f32 f890, f135, 0f3F7F9FE3, f886; +fma.rn.f32 f891, f137, 0fBEBD82C0, f887; +fma.rn.f32 f892, f140, 0fBF6DD16B, f888; +fma.rn.f32 f893, f1120, 0fBEBD82C0, f889; +fma.rn.f32 f894, f139, 0fBF6DD16B, f890; +fma.rn.f32 f895, f141, 0f3F25BB1C, f891; +fma.rn.f32 f896, f144, 0f3F431D0D, f892; +fma.rn.f32 f897, f1117, 0f3F25BB1C, f893; +fma.rn.f32 f898, f143, 0f3F431D0D, f894; +fma.rn.f32 f899, f145, 0fBF5B5AFE, f895; +fma.rn.f32 f900, f148, 0fBF03FB56, f896; +fma.rn.f32 f901, f1115, 0fBF5B5AFE, f897; +fma.rn.f32 f902, f147, 0fBF03FB56, f898; +fma.rn.f32 f903, f149, 0f3F7A03CE, f899; +fma.rn.f32 f904, f152, 0f3E5C2136, f900; +fma.rn.f32 f905, f1113, 0f3F7A03CE, f901; +fma.rn.f32 f906, f151, 0f3E5C2136, f902; +fma.rn.f32 f907, f153, 0fBF7E7FD3, f903; +fma.rn.f32 f908, f156, 0f3DDD6D81, f904; +fma.rn.f32 f909, f1110, 0fBF7E7FD3, f905; +fma.rn.f32 f910, f155, 0f3DDD6D81, f906; +fma.rn.f32 f911, f157, 0f3F6856DD, f907; +fma.rn.f32 f912, 
f160, 0fBED6FBB4, f908; +fma.rn.f32 f913, f1108, 0f3F6856DD, f909; +fma.rn.f32 f914, f159, 0fBED6FBB4, f910; +fma.rn.f32 f915, f161, 0fBF39DAD7, f911; +fma.rn.f32 f916, f164, 0f3F300D12, f912; +fma.rn.f32 f917, f1106, 0fBF39DAD7, f913; +fma.rn.f32 f918, f163, 0f3F300D12, f914; +fma.rn.f32 f919, f165, 0f3EEFD33B, f915; +fma.rn.f32 f920, f168, 0fBF622DD8, f916; +fma.rn.f32 f921, f1103, 0f3EEFD33B, f917; +fma.rn.f32 f922, f167, 0fBF622DD8, f918; +fma.rn.f32 f923, f169, 0fBE25AA2E, f919; +fma.rn.f32 f924, f172, 0f3F7CA0AA, f920; +fma.rn.f32 f925, f1101, 0fBE25AA2E, f921; +fma.rn.f32 f926, f171, 0f3F7CA0AA, f922; +fma.rn.f32 f927, f117, 0fBF7E7FD3, %58; +fma.rn.f32 f928, f120, 0f3DDD6D81, 0f00000000; +fma.rn.f32 f929, f1131, 0fBF7E7FD3, %59; +fma.rn.f32 f930, f119, 0f3DDD6D81, 0f00000000; +fma.rn.f32 f931, f121, 0f3F7A03CE, f927; +fma.rn.f32 f932, f124, 0fBE5C2136, f928; +fma.rn.f32 f933, f1129, 0f3F7A03CE, f929; +fma.rn.f32 f934, f123, 0fBE5C2136, f930; +fma.rn.f32 f935, f125, 0fBF729966, f931; +fma.rn.f32 f936, f128, 0f3EA37B7D, f932; +fma.rn.f32 f937, f1127, 0fBF729966, f933; +fma.rn.f32 f938, f127, 0f3EA37B7D, f934; +fma.rn.f32 f939, f129, 0f3F6856DD, f935; +fma.rn.f32 f940, f132, 0fBED6FBB4, f936; +fma.rn.f32 f941, f1124, 0f3F6856DD, f937; +fma.rn.f32 f942, f131, 0fBED6FBB4, f938; +fma.rn.f32 f943, f133, 0fBF5B5AFE, f939; +fma.rn.f32 f944, f136, 0f3F03FB56, f940; +fma.rn.f32 f945, f1122, 0fBF5B5AFE, f941; +fma.rn.f32 f946, f135, 0f3F03FB56, f942; +fma.rn.f32 f947, f137, 0f3F4BCCC1, f943; +fma.rn.f32 f948, f140, 0fBF1AECB3, f944; +fma.rn.f32 f949, f1120, 0f3F4BCCC1, f945; +fma.rn.f32 f950, f139, 0fBF1AECB3, f946; +fma.rn.f32 f951, f141, 0fBF39DAD7, f947; +fma.rn.f32 f952, f144, 0f3F300D12, f948; +fma.rn.f32 f953, f1117, 0fBF39DAD7, f949; +fma.rn.f32 f954, f143, 0f3F300D12, f950; +fma.rn.f32 f955, f145, 0f3F25BB1C, f951; +fma.rn.f32 f956, f148, 0fBF431D0D, f952; +fma.rn.f32 f957, f1115, 0f3F25BB1C, f953; +fma.rn.f32 f958, f147, 0fBF431D0D, f954; +fma.rn.f32 f959, 
f149, 0fBF0FA9F5, f955; +fma.rn.f32 f960, f152, 0f3F53E36D, f956; +fma.rn.f32 f961, f1113, 0fBF0FA9F5, f957; +fma.rn.f32 f962, f151, 0f3F53E36D, f958; +fma.rn.f32 f963, f153, 0f3EEFD33B, f959; +fma.rn.f32 f964, f156, 0fBF622DD8, f960; +fma.rn.f32 f965, f1110, 0f3EEFD33B, f961; +fma.rn.f32 f966, f155, 0fBF622DD8, f962; +fma.rn.f32 f967, f157, 0fBEBD82C0, f963; +fma.rn.f32 f968, f160, 0f3F6DD16B, f964; +fma.rn.f32 f969, f1108, 0fBEBD82C0, f965; +fma.rn.f32 f970, f159, 0f3F6DD16B, f966; +fma.rn.f32 f971, f161, 0f3E88F979, f967; +fma.rn.f32 f972, f164, 0fBF76AB36, f968; +fma.rn.f32 f973, f1106, 0f3E88F979, f969; +fma.rn.f32 f974, f163, 0fBF76AB36, f970; +fma.rn.f32 f975, f165, 0fBE25AA2E, f971; +fma.rn.f32 f976, f168, 0f3F7CA0AA, f972; +fma.rn.f32 f977, f1103, 0fBE25AA2E, f973; +fma.rn.f32 f978, f167, 0f3F7CA0AA, f974; +fma.rn.f32 f979, f169, 0f3D5DC0C3, f975; +fma.rn.f32 f980, f172, 0fBF7F9FE3, f976; +fma.rn.f32 f981, f1101, 0f3D5DC0C3, f977; +fma.rn.f32 f982, f171, 0fBF7F9FE3, f978; +add.f32 %1, f198, f1101; +add.f32 %0, f197, f169; +sub.f32 %2, f251, f252; +add.f32 %3, f253, f254; +sub.f32 %4, f307, f308; +add.f32 %5, f309, f310; +sub.f32 %6, f363, f364; +add.f32 %7, f365, f366; +sub.f32 %8, f419, f420; +add.f32 %9, f421, f422; +add.f32 %11, f477, f478; +sub.f32 %10, f475, f476; +add.f32 %13, f533, f534; +sub.f32 %12, f531, f532; +add.f32 %15, f589, f590; +sub.f32 %14, f587, f588; +sub.f32 %16, f643, f644; +add.f32 %17, f645, f646; +sub.f32 %18, f699, f700; +add.f32 %19, f701, f702; +sub.f32 %20, f755, f756; +add.f32 %21, f757, f758; +add.f32 %23, f813, f814; +sub.f32 %22, f811, f812; +add.f32 %25, f869, f870; +sub.f32 %24, f867, f868; +add.f32 %27, f925, f926; +sub.f32 %26, f923, f924; +sub.f32 %28, f979, f980; +add.f32 %29, f981, f982; +sub.f32 %31, f981, f982; +add.f32 %30, f979, f980; +sub.f32 %33, f925, f926; +add.f32 %32, f923, f924; +sub.f32 %35, f869, f870; +add.f32 %34, f867, f868; +sub.f32 %37, f813, f814; +add.f32 %36, f811, f812; +sub.f32 %39, f757, 
f758; +add.f32 %38, f755, f756; +sub.f32 %41, f701, f702; +add.f32 %40, f699, f700; +sub.f32 %43, f645, f646; +add.f32 %42, f643, f644; +sub.f32 %45, f589, f590; +add.f32 %44, f587, f588; +sub.f32 %47, f533, f534; +add.f32 %46, f531, f532; +sub.f32 %49, f477, f478; +add.f32 %48, f475, f476; +sub.f32 %51, f421, f422; +add.f32 %50, f419, f420; +sub.f32 %53, f365, f366; +add.f32 %52, f363, f364; +sub.f32 %55, f309, f310; +add.f32 %54, f307, f308; +sub.f32 %57, f253, f254; +add.f32 %56, f251, f252; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), 
"f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[1].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[25].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[23].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[11].y), "f"(rmem[17].y), "f"(rmem[16].y), "f"(rmem[13].y), "f"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..a67c9f6ac8a0a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_fwd.hpp.inc @@ -0,0 +1,940 @@ +#ifndef CUFFTDX_FFT_29_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_29_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<417, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<1133>; +.reg .b64 rd<4>; +add.f64 fd117, %60, %114; +sub.f64 fd119, %60, %114; +add.f64 fd1131, %116, %115; +sub.f64 fd120, %116, %115; +add.f64 fd121, %62, %112; +sub.f64 fd123, %62, %112; +add.f64 fd1129, %117, %113; +sub.f64 fd124, %117, %113; +add.f64 fd125, %64, %110; +sub.f64 fd127, %64, %110; +add.f64 fd1127, %65, %118; +sub.f64 fd128, %65, %118; +add.f64 fd129, %66, %108; +sub.f64 fd131, %66, %108; +add.f64 fd1124, 
%120, %119; +sub.f64 fd132, %120, %119; +add.f64 fd133, %68, %106; +sub.f64 fd135, %68, %106; +add.f64 fd1122, %121, %107; +sub.f64 fd136, %121, %107; +add.f64 fd137, %70, %104; +sub.f64 fd139, %70, %104; +add.f64 fd1120, %71, %122; +sub.f64 fd140, %71, %122; +add.f64 fd141, %72, %102; +sub.f64 fd143, %72, %102; +add.f64 fd1117, %124, %123; +sub.f64 fd144, %124, %123; +add.f64 fd145, %74, %100; +sub.f64 fd147, %74, %100; +add.f64 fd1115, %125, %101; +sub.f64 fd148, %125, %101; +add.f64 fd149, %76, %98; +sub.f64 fd151, %76, %98; +add.f64 fd1113, %77, %126; +sub.f64 fd152, %77, %126; +add.f64 fd153, %78, %96; +sub.f64 fd155, %78, %96; +add.f64 fd1110, %128, %127; +sub.f64 fd156, %128, %127; +add.f64 fd157, %80, %94; +sub.f64 fd159, %80, %94; +add.f64 fd1108, %129, %95; +sub.f64 fd160, %129, %95; +add.f64 fd161, %82, %92; +sub.f64 fd163, %82, %92; +add.f64 fd1106, %83, %130; +sub.f64 fd164, %83, %130; +add.f64 fd165, %84, %90; +sub.f64 fd167, %84, %90; +add.f64 fd1103, %132, %131; +sub.f64 fd168, %132, %131; +add.f64 fd169, %86, %88; +sub.f64 fd171, %86, %88; +add.f64 fd1101, %133, %89; +sub.f64 fd172, %133, %89; +add.f64 fd173, %58, fd117; +add.f64 fd175, fd173, fd121; +add.f64 fd1100, %59, fd1131; +add.f64 fd176, fd1100, fd1129; +add.f64 fd177, fd175, fd125; +add.f64 fd178, fd176, fd1127; +add.f64 fd179, fd177, fd129; +add.f64 fd180, fd178, fd1124; +add.f64 fd181, fd179, fd133; +add.f64 fd182, fd180, fd1122; +add.f64 fd183, fd181, fd137; +add.f64 fd184, fd182, fd1120; +add.f64 fd185, fd183, fd141; +add.f64 fd186, fd184, fd1117; +add.f64 fd187, fd185, fd145; +add.f64 fd188, fd186, fd1115; +add.f64 fd189, fd187, fd149; +add.f64 fd190, fd188, fd1113; +add.f64 fd191, fd189, fd153; +add.f64 fd192, fd190, fd1110; +add.f64 fd193, fd191, fd157; +add.f64 fd194, fd192, fd1108; +add.f64 fd195, fd193, fd161; +add.f64 fd196, fd194, fd1106; +add.f64 fd197, fd195, fd165; +add.f64 fd198, fd196, fd1103; +fma.rn.f64 fd199, fd117, 0d3FEF4079C06C0992, %58; +fma.rn.f64 fd203, fd121, 
0d3FED0ADB9B447CCF, fd199; +fma.rn.f64 fd1099, fd120, 0dBFCB8426C12812BC, 0d0000000000000000; +fma.rn.f64 fd204, fd124, 0dBFDADF7689C97B70, fd1099; +fma.rn.f64 fd1098, fd1131, 0d3FEF4079C06C0992, %59; +fma.rn.f64 fd205, fd1129, 0d3FED0ADB9B447CCF, fd1098; +fma.rn.f64 fd1097, fd119, 0dBFCB8426C12812BC, 0d0000000000000000; +fma.rn.f64 fd206, fd123, 0dBFDADF7689C97B70, fd1097; +fma.rn.f64 fd207, fd125, 0d3FE979982A38E65A, fd203; +fma.rn.f64 fd208, fd128, 0dBFE35D9650D47852, fd204; +fma.rn.f64 fd209, fd1127, 0d3FE979982A38E65A, fd205; +fma.rn.f64 fd210, fd127, 0dBFE35D9650D47852, fd206; +fma.rn.f64 fd211, fd129, 0d3FE4B76371208A62, fd207; +fma.rn.f64 fd212, fd132, 0dBFE863A1ADA0CFA6, fd208; +fma.rn.f64 fd213, fd1124, 0d3FE4B76371208A62, fd209; +fma.rn.f64 fd214, fd131, 0dBFE863A1ADA0CFA6, fd210; +fma.rn.f64 fd215, fd133, 0d3FDDFA67657E7608, fd211; +fma.rn.f64 fd216, fd136, 0dBFEC45BB0D10918C, fd212; +fma.rn.f64 fd217, fd1122, 0d3FDDFA67657E7608, fd213; +fma.rn.f64 fd218, fd135, 0dBFEC45BB0D10918C, fd214; +fma.rn.f64 fd219, fd137, 0d3FD11F2F2E2F1E3B, fd215; +fma.rn.f64 fd220, fd140, 0dBFEED566CB3DCBA1, fd216; +fma.rn.f64 fd221, fd1120, 0d3FD11F2F2E2F1E3B, fd217; +fma.rn.f64 fd222, fd139, 0dBFEED566CB3DCBA1, fd218; +fma.rn.f64 fd223, fd141, 0d3FABB81853A18977, fd219; +fma.rn.f64 fd224, fd144, 0dBFEFF3FC588E859D, fd220; +fma.rn.f64 fd225, fd1117, 0d3FABB81853A18977, fd221; +fma.rn.f64 fd226, fd143, 0dBFEFF3FC588E859D, fd222; +fma.rn.f64 fd227, fd145, 0dBFC4B545C0234A71, fd223; +fma.rn.f64 fd228, fd148, 0dBFEF941537248537, fd224; +fma.rn.f64 fd229, fd1115, 0dBFC4B545C0234A71, fd225; +fma.rn.f64 fd230, fd147, 0dBFEF941537248537, fd226; +fma.rn.f64 fd231, fd149, 0dBFD7B057F20BF2E4, fd227; +fma.rn.f64 fd232, fd152, 0dBFEDBA2D62CB789F, fd228; +fma.rn.f64 fd233, fd1113, 0dBFD7B057F20BF2E4, fd229; +fma.rn.f64 fd234, fd151, 0dBFEDBA2D62CB789F, fd230; +fma.rn.f64 fd235, fd153, 0dBFE1F53E93956DBF, fd231; +fma.rn.f64 fd236, fd156, 0dBFEA7C6DA34AF89F, fd232; +fma.rn.f64 fd237, 
fd1110, 0dBFE1F53E93956DBF, fd233; +fma.rn.f64 fd238, fd155, 0dBFEA7C6DA34AF89F, fd234; +fma.rn.f64 fd239, fd157, 0dBFE73B5AE5DB4E10, fd235; +fma.rn.f64 fd240, fd160, 0dBFE601A24BA81342, fd236; +fma.rn.f64 fd241, fd1108, 0dBFE73B5AE5DB4E10, fd237; +fma.rn.f64 fd242, fd159, 0dBFE601A24BA81342, fd238; +fma.rn.f64 fd243, fd161, 0dBFEB6B5FBD9F7255, fd239; +fma.rn.f64 fd244, fd164, 0dBFE07F6ACD7CDCE2, fd240; +fma.rn.f64 fd245, fd1106, 0dBFEB6B5FBD9F7255, fd241; +fma.rn.f64 fd246, fd163, 0dBFE07F6ACD7CDCE2, fd242; +fma.rn.f64 fd247, fd165, 0dBFEE532CBE45C954, fd243; +fma.rn.f64 fd248, fd168, 0dBFD46F6FAF5FCB72, fd244; +fma.rn.f64 fd249, fd1103, 0dBFEE532CBE45C954, fd245; +fma.rn.f64 fd250, fd167, 0dBFD46F6FAF5FCB72, fd246; +fma.rn.f64 fd251, fd169, 0dBFEFCFFA67B61650, fd247; +fma.rn.f64 fd252, fd172, 0dBFBBADB02034D9FF, fd248; +fma.rn.f64 fd253, fd1101, 0dBFEFCFFA67B61650, fd249; +fma.rn.f64 fd254, fd171, 0dBFBBADB02034D9FF, fd250; +fma.rn.f64 fd255, fd117, 0d3FED0ADB9B447CCF, %58; +fma.rn.f64 fd259, fd121, 0d3FE4B76371208A62, fd255; +fma.rn.f64 fd1096, fd120, 0dBFDADF7689C97B70, 0d0000000000000000; +fma.rn.f64 fd260, fd124, 0dBFE863A1ADA0CFA6, fd1096; +fma.rn.f64 fd1095, fd1131, 0d3FED0ADB9B447CCF, %59; +fma.rn.f64 fd261, fd1129, 0d3FE4B76371208A62, fd1095; +fma.rn.f64 fd1094, fd119, 0dBFDADF7689C97B70, 0d0000000000000000; +fma.rn.f64 fd262, fd123, 0dBFE863A1ADA0CFA6, fd1094; +fma.rn.f64 fd263, fd125, 0d3FD11F2F2E2F1E3B, fd259; +fma.rn.f64 fd264, fd128, 0dBFEED566CB3DCBA1, fd260; +fma.rn.f64 fd265, fd1127, 0d3FD11F2F2E2F1E3B, fd261; +fma.rn.f64 fd266, fd127, 0dBFEED566CB3DCBA1, fd262; +fma.rn.f64 fd267, fd129, 0dBFC4B545C0234A71, fd263; +fma.rn.f64 fd268, fd132, 0dBFEF941537248537, fd264; +fma.rn.f64 fd269, fd1124, 0dBFC4B545C0234A71, fd265; +fma.rn.f64 fd270, fd131, 0dBFEF941537248537, fd266; +fma.rn.f64 fd271, fd133, 0dBFE1F53E93956DBF, fd267; +fma.rn.f64 fd272, fd136, 0dBFEA7C6DA34AF89F, fd268; +fma.rn.f64 fd273, fd1122, 0dBFE1F53E93956DBF, fd269; +fma.rn.f64 fd274, 
fd135, 0dBFEA7C6DA34AF89F, fd270; +fma.rn.f64 fd275, fd137, 0dBFEB6B5FBD9F7255, fd271; +fma.rn.f64 fd276, fd140, 0dBFE07F6ACD7CDCE2, fd272; +fma.rn.f64 fd277, fd1120, 0dBFEB6B5FBD9F7255, fd273; +fma.rn.f64 fd278, fd139, 0dBFE07F6ACD7CDCE2, fd274; +fma.rn.f64 fd279, fd141, 0dBFEFCFFA67B61650, fd275; +fma.rn.f64 fd280, fd144, 0dBFBBADB02034D9FF, fd276; +fma.rn.f64 fd281, fd1117, 0dBFEFCFFA67B61650, fd277; +fma.rn.f64 fd282, fd143, 0dBFBBADB02034D9FF, fd278; +fma.rn.f64 fd283, fd145, 0dBFEE532CBE45C954, fd279; +fma.rn.f64 fd284, fd148, 0d3FD46F6FAF5FCB72, fd280; +fma.rn.f64 fd285, fd1115, 0dBFEE532CBE45C954, fd281; +fma.rn.f64 fd286, fd147, 0d3FD46F6FAF5FCB72, fd282; +fma.rn.f64 fd287, fd149, 0dBFE73B5AE5DB4E10, fd283; +fma.rn.f64 fd288, fd152, 0d3FE601A24BA81342, fd284; +fma.rn.f64 fd289, fd1113, 0dBFE73B5AE5DB4E10, fd285; +fma.rn.f64 fd290, fd151, 0d3FE601A24BA81342, fd286; +fma.rn.f64 fd291, fd153, 0dBFD7B057F20BF2E4, fd287; +fma.rn.f64 fd292, fd156, 0d3FEDBA2D62CB789F, fd288; +fma.rn.f64 fd293, fd1110, 0dBFD7B057F20BF2E4, fd289; +fma.rn.f64 fd294, fd155, 0d3FEDBA2D62CB789F, fd290; +fma.rn.f64 fd295, fd157, 0d3FABB81853A18977, fd291; +fma.rn.f64 fd296, fd160, 0d3FEFF3FC588E859D, fd292; +fma.rn.f64 fd297, fd1108, 0d3FABB81853A18977, fd293; +fma.rn.f64 fd298, fd159, 0d3FEFF3FC588E859D, fd294; +fma.rn.f64 fd299, fd161, 0d3FDDFA67657E7608, fd295; +fma.rn.f64 fd300, fd164, 0d3FEC45BB0D10918C, fd296; +fma.rn.f64 fd301, fd1106, 0d3FDDFA67657E7608, fd297; +fma.rn.f64 fd302, fd163, 0d3FEC45BB0D10918C, fd298; +fma.rn.f64 fd303, fd165, 0d3FE979982A38E65A, fd299; +fma.rn.f64 fd304, fd168, 0d3FE35D9650D47852, fd300; +fma.rn.f64 fd305, fd1103, 0d3FE979982A38E65A, fd301; +fma.rn.f64 fd306, fd167, 0d3FE35D9650D47852, fd302; +fma.rn.f64 fd307, fd169, 0d3FEF4079C06C0992, fd303; +fma.rn.f64 fd308, fd172, 0d3FCB8426C12812BC, fd304; +fma.rn.f64 fd309, fd1101, 0d3FEF4079C06C0992, fd305; +fma.rn.f64 fd310, fd171, 0d3FCB8426C12812BC, fd306; +fma.rn.f64 fd311, fd117, 0d3FE979982A38E65A, 
%58; +fma.rn.f64 fd315, fd121, 0d3FD11F2F2E2F1E3B, fd311; +fma.rn.f64 fd1093, fd120, 0dBFE35D9650D47852, 0d0000000000000000; +fma.rn.f64 fd316, fd124, 0dBFEED566CB3DCBA1, fd1093; +fma.rn.f64 fd1092, fd1131, 0d3FE979982A38E65A, %59; +fma.rn.f64 fd317, fd1129, 0d3FD11F2F2E2F1E3B, fd1092; +fma.rn.f64 fd1091, fd119, 0dBFE35D9650D47852, 0d0000000000000000; +fma.rn.f64 fd318, fd123, 0dBFEED566CB3DCBA1, fd1091; +fma.rn.f64 fd319, fd125, 0dBFD7B057F20BF2E4, fd315; +fma.rn.f64 fd320, fd128, 0dBFEDBA2D62CB789F, fd316; +fma.rn.f64 fd321, fd1127, 0dBFD7B057F20BF2E4, fd317; +fma.rn.f64 fd322, fd127, 0dBFEDBA2D62CB789F, fd318; +fma.rn.f64 fd323, fd129, 0dBFEB6B5FBD9F7255, fd319; +fma.rn.f64 fd324, fd132, 0dBFE07F6ACD7CDCE2, fd320; +fma.rn.f64 fd325, fd1124, 0dBFEB6B5FBD9F7255, fd321; +fma.rn.f64 fd326, fd131, 0dBFE07F6ACD7CDCE2, fd322; +fma.rn.f64 fd327, fd133, 0dBFEFCFFA67B61650, fd323; +fma.rn.f64 fd328, fd136, 0d3FBBADB02034D9FF, fd324; +fma.rn.f64 fd329, fd1122, 0dBFEFCFFA67B61650, fd325; +fma.rn.f64 fd330, fd135, 0d3FBBADB02034D9FF, fd326; +fma.rn.f64 fd331, fd137, 0dBFE73B5AE5DB4E10, fd327; +fma.rn.f64 fd332, fd140, 0d3FE601A24BA81342, fd328; +fma.rn.f64 fd333, fd1120, 0dBFE73B5AE5DB4E10, fd329; +fma.rn.f64 fd334, fd139, 0d3FE601A24BA81342, fd330; +fma.rn.f64 fd335, fd141, 0dBFC4B545C0234A71, fd331; +fma.rn.f64 fd336, fd144, 0d3FEF941537248537, fd332; +fma.rn.f64 fd337, fd1117, 0dBFC4B545C0234A71, fd333; +fma.rn.f64 fd338, fd143, 0d3FEF941537248537, fd334; +fma.rn.f64 fd339, fd145, 0d3FDDFA67657E7608, fd335; +fma.rn.f64 fd340, fd148, 0d3FEC45BB0D10918C, fd336; +fma.rn.f64 fd341, fd1115, 0d3FDDFA67657E7608, fd337; +fma.rn.f64 fd342, fd147, 0d3FEC45BB0D10918C, fd338; +fma.rn.f64 fd343, fd149, 0d3FED0ADB9B447CCF, fd339; +fma.rn.f64 fd344, fd152, 0d3FDADF7689C97B70, fd340; +fma.rn.f64 fd345, fd1113, 0d3FED0ADB9B447CCF, fd341; +fma.rn.f64 fd346, fd151, 0d3FDADF7689C97B70, fd342; +fma.rn.f64 fd347, fd153, 0d3FEF4079C06C0992, fd343; +fma.rn.f64 fd348, fd156, 0dBFCB8426C12812BC, 
fd344; +fma.rn.f64 fd349, fd1110, 0d3FEF4079C06C0992, fd345; +fma.rn.f64 fd350, fd155, 0dBFCB8426C12812BC, fd346; +fma.rn.f64 fd351, fd157, 0d3FE4B76371208A62, fd347; +fma.rn.f64 fd352, fd160, 0dBFE863A1ADA0CFA6, fd348; +fma.rn.f64 fd353, fd1108, 0d3FE4B76371208A62, fd349; +fma.rn.f64 fd354, fd159, 0dBFE863A1ADA0CFA6, fd350; +fma.rn.f64 fd355, fd161, 0d3FABB81853A18977, fd351; +fma.rn.f64 fd356, fd164, 0dBFEFF3FC588E859D, fd352; +fma.rn.f64 fd357, fd1106, 0d3FABB81853A18977, fd353; +fma.rn.f64 fd358, fd163, 0dBFEFF3FC588E859D, fd354; +fma.rn.f64 fd359, fd165, 0dBFE1F53E93956DBF, fd355; +fma.rn.f64 fd360, fd168, 0dBFEA7C6DA34AF89F, fd356; +fma.rn.f64 fd361, fd1103, 0dBFE1F53E93956DBF, fd357; +fma.rn.f64 fd362, fd167, 0dBFEA7C6DA34AF89F, fd358; +fma.rn.f64 fd363, fd169, 0dBFEE532CBE45C954, fd359; +fma.rn.f64 fd364, fd172, 0dBFD46F6FAF5FCB72, fd360; +fma.rn.f64 fd365, fd1101, 0dBFEE532CBE45C954, fd361; +fma.rn.f64 fd366, fd171, 0dBFD46F6FAF5FCB72, fd362; +fma.rn.f64 fd367, fd117, 0d3FE4B76371208A62, %58; +fma.rn.f64 fd371, fd121, 0dBFC4B545C0234A71, fd367; +fma.rn.f64 fd1090, fd120, 0dBFE863A1ADA0CFA6, 0d0000000000000000; +fma.rn.f64 fd372, fd124, 0dBFEF941537248537, fd1090; +fma.rn.f64 fd1089, fd1131, 0d3FE4B76371208A62, %59; +fma.rn.f64 fd373, fd1129, 0dBFC4B545C0234A71, fd1089; +fma.rn.f64 fd1088, fd119, 0dBFE863A1ADA0CFA6, 0d0000000000000000; +fma.rn.f64 fd374, fd123, 0dBFEF941537248537, fd1088; +fma.rn.f64 fd375, fd125, 0dBFEB6B5FBD9F7255, fd371; +fma.rn.f64 fd376, fd128, 0dBFE07F6ACD7CDCE2, fd372; +fma.rn.f64 fd377, fd1127, 0dBFEB6B5FBD9F7255, fd373; +fma.rn.f64 fd378, fd127, 0dBFE07F6ACD7CDCE2, fd374; +fma.rn.f64 fd379, fd129, 0dBFEE532CBE45C954, fd375; +fma.rn.f64 fd380, fd132, 0d3FD46F6FAF5FCB72, fd376; +fma.rn.f64 fd381, fd1124, 0dBFEE532CBE45C954, fd377; +fma.rn.f64 fd382, fd131, 0d3FD46F6FAF5FCB72, fd378; +fma.rn.f64 fd383, fd133, 0dBFD7B057F20BF2E4, fd379; +fma.rn.f64 fd384, fd136, 0d3FEDBA2D62CB789F, fd380; +fma.rn.f64 fd385, fd1122, 0dBFD7B057F20BF2E4, 
fd381; +fma.rn.f64 fd386, fd135, 0d3FEDBA2D62CB789F, fd382; +fma.rn.f64 fd387, fd137, 0d3FDDFA67657E7608, fd383; +fma.rn.f64 fd388, fd140, 0d3FEC45BB0D10918C, fd384; +fma.rn.f64 fd389, fd1120, 0d3FDDFA67657E7608, fd385; +fma.rn.f64 fd390, fd139, 0d3FEC45BB0D10918C, fd386; +fma.rn.f64 fd391, fd141, 0d3FEF4079C06C0992, fd387; +fma.rn.f64 fd392, fd144, 0d3FCB8426C12812BC, fd388; +fma.rn.f64 fd393, fd1117, 0d3FEF4079C06C0992, fd389; +fma.rn.f64 fd394, fd143, 0d3FCB8426C12812BC, fd390; +fma.rn.f64 fd395, fd145, 0d3FE979982A38E65A, fd391; +fma.rn.f64 fd396, fd148, 0dBFE35D9650D47852, fd392; +fma.rn.f64 fd397, fd1115, 0d3FE979982A38E65A, fd393; +fma.rn.f64 fd398, fd147, 0dBFE35D9650D47852, fd394; +fma.rn.f64 fd399, fd149, 0d3FABB81853A18977, fd395; +fma.rn.f64 fd400, fd152, 0dBFEFF3FC588E859D, fd396; +fma.rn.f64 fd401, fd1113, 0d3FABB81853A18977, fd397; +fma.rn.f64 fd402, fd151, 0dBFEFF3FC588E859D, fd398; +fma.rn.f64 fd403, fd153, 0dBFE73B5AE5DB4E10, fd399; +fma.rn.f64 fd404, fd156, 0dBFE601A24BA81342, fd400; +fma.rn.f64 fd405, fd1110, 0dBFE73B5AE5DB4E10, fd401; +fma.rn.f64 fd406, fd155, 0dBFE601A24BA81342, fd402; +fma.rn.f64 fd407, fd157, 0dBFEFCFFA67B61650, fd403; +fma.rn.f64 fd408, fd160, 0d3FBBADB02034D9FF, fd404; +fma.rn.f64 fd409, fd1108, 0dBFEFCFFA67B61650, fd405; +fma.rn.f64 fd410, fd159, 0d3FBBADB02034D9FF, fd406; +fma.rn.f64 fd411, fd161, 0dBFE1F53E93956DBF, fd407; +fma.rn.f64 fd412, fd164, 0d3FEA7C6DA34AF89F, fd408; +fma.rn.f64 fd413, fd1106, 0dBFE1F53E93956DBF, fd409; +fma.rn.f64 fd414, fd163, 0d3FEA7C6DA34AF89F, fd410; +fma.rn.f64 fd415, fd165, 0d3FD11F2F2E2F1E3B, fd411; +fma.rn.f64 fd416, fd168, 0d3FEED566CB3DCBA1, fd412; +fma.rn.f64 fd417, fd1103, 0d3FD11F2F2E2F1E3B, fd413; +fma.rn.f64 fd418, fd167, 0d3FEED566CB3DCBA1, fd414; +fma.rn.f64 fd419, fd169, 0d3FED0ADB9B447CCF, fd415; +fma.rn.f64 fd420, fd172, 0d3FDADF7689C97B70, fd416; +fma.rn.f64 fd421, fd1101, 0d3FED0ADB9B447CCF, fd417; +fma.rn.f64 fd422, fd171, 0d3FDADF7689C97B70, fd418; +fma.rn.f64 fd423, 
fd117, 0d3FDDFA67657E7608, %58; +fma.rn.f64 fd427, fd121, 0dBFE1F53E93956DBF, fd423; +fma.rn.f64 fd1087, fd120, 0dBFEC45BB0D10918C, 0d0000000000000000; +fma.rn.f64 fd428, fd124, 0dBFEA7C6DA34AF89F, fd1087; +fma.rn.f64 fd1086, fd1131, 0d3FDDFA67657E7608, %59; +fma.rn.f64 fd429, fd1129, 0dBFE1F53E93956DBF, fd1086; +fma.rn.f64 fd1085, fd119, 0dBFEC45BB0D10918C, 0d0000000000000000; +fma.rn.f64 fd430, fd123, 0dBFEA7C6DA34AF89F, fd1085; +fma.rn.f64 fd431, fd125, 0dBFEFCFFA67B61650, fd427; +fma.rn.f64 fd432, fd128, 0d3FBBADB02034D9FF, fd428; +fma.rn.f64 fd433, fd1127, 0dBFEFCFFA67B61650, fd429; +fma.rn.f64 fd434, fd127, 0d3FBBADB02034D9FF, fd430; +fma.rn.f64 fd435, fd129, 0dBFD7B057F20BF2E4, fd431; +fma.rn.f64 fd436, fd132, 0d3FEDBA2D62CB789F, fd432; +fma.rn.f64 fd437, fd1124, 0dBFD7B057F20BF2E4, fd433; +fma.rn.f64 fd438, fd131, 0d3FEDBA2D62CB789F, fd434; +fma.rn.f64 fd439, fd133, 0d3FE4B76371208A62, fd435; +fma.rn.f64 fd440, fd136, 0d3FE863A1ADA0CFA6, fd436; +fma.rn.f64 fd441, fd1122, 0d3FE4B76371208A62, fd437; +fma.rn.f64 fd442, fd135, 0d3FE863A1ADA0CFA6, fd438; +fma.rn.f64 fd443, fd137, 0d3FEF4079C06C0992, fd439; +fma.rn.f64 fd444, fd140, 0dBFCB8426C12812BC, fd440; +fma.rn.f64 fd445, fd1120, 0d3FEF4079C06C0992, fd441; +fma.rn.f64 fd446, fd139, 0dBFCB8426C12812BC, fd442; +fma.rn.f64 fd447, fd141, 0d3FD11F2F2E2F1E3B, fd443; +fma.rn.f64 fd448, fd144, 0dBFEED566CB3DCBA1, fd444; +fma.rn.f64 fd449, fd1117, 0d3FD11F2F2E2F1E3B, fd445; +fma.rn.f64 fd450, fd143, 0dBFEED566CB3DCBA1, fd446; +fma.rn.f64 fd451, fd145, 0dBFE73B5AE5DB4E10, fd447; +fma.rn.f64 fd452, fd148, 0dBFE601A24BA81342, fd448; +fma.rn.f64 fd453, fd1115, 0dBFE73B5AE5DB4E10, fd449; +fma.rn.f64 fd454, fd147, 0dBFE601A24BA81342, fd450; +fma.rn.f64 fd455, fd149, 0dBFEE532CBE45C954, fd451; +fma.rn.f64 fd456, fd152, 0d3FD46F6FAF5FCB72, fd452; +fma.rn.f64 fd457, fd1113, 0dBFEE532CBE45C954, fd453; +fma.rn.f64 fd458, fd151, 0d3FD46F6FAF5FCB72, fd454; +fma.rn.f64 fd459, fd153, 0dBFC4B545C0234A71, fd455; +fma.rn.f64 fd460, 
fd156, 0d3FEF941537248537, fd456; +fma.rn.f64 fd461, fd1110, 0dBFC4B545C0234A71, fd457; +fma.rn.f64 fd462, fd155, 0d3FEF941537248537, fd458; +fma.rn.f64 fd463, fd157, 0d3FE979982A38E65A, fd459; +fma.rn.f64 fd464, fd160, 0d3FE35D9650D47852, fd460; +fma.rn.f64 fd465, fd1108, 0d3FE979982A38E65A, fd461; +fma.rn.f64 fd466, fd159, 0d3FE35D9650D47852, fd462; +fma.rn.f64 fd467, fd161, 0d3FED0ADB9B447CCF, fd463; +fma.rn.f64 fd468, fd164, 0dBFDADF7689C97B70, fd464; +fma.rn.f64 fd469, fd1106, 0d3FED0ADB9B447CCF, fd465; +fma.rn.f64 fd470, fd163, 0dBFDADF7689C97B70, fd466; +fma.rn.f64 fd471, fd165, 0d3FABB81853A18977, fd467; +fma.rn.f64 fd472, fd168, 0dBFEFF3FC588E859D, fd468; +fma.rn.f64 fd473, fd1103, 0d3FABB81853A18977, fd469; +fma.rn.f64 fd474, fd167, 0dBFEFF3FC588E859D, fd470; +fma.rn.f64 fd475, fd169, 0dBFEB6B5FBD9F7255, fd471; +fma.rn.f64 fd476, fd172, 0dBFE07F6ACD7CDCE2, fd472; +fma.rn.f64 fd477, fd1101, 0dBFEB6B5FBD9F7255, fd473; +fma.rn.f64 fd478, fd171, 0dBFE07F6ACD7CDCE2, fd474; +fma.rn.f64 fd479, fd117, 0d3FD11F2F2E2F1E3B, %58; +fma.rn.f64 fd483, fd121, 0dBFEB6B5FBD9F7255, fd479; +fma.rn.f64 fd1084, fd120, 0dBFEED566CB3DCBA1, 0d0000000000000000; +fma.rn.f64 fd484, fd124, 0dBFE07F6ACD7CDCE2, fd1084; +fma.rn.f64 fd1083, fd1131, 0d3FD11F2F2E2F1E3B, %59; +fma.rn.f64 fd485, fd1129, 0dBFEB6B5FBD9F7255, fd1083; +fma.rn.f64 fd1082, fd119, 0dBFEED566CB3DCBA1, 0d0000000000000000; +fma.rn.f64 fd486, fd123, 0dBFE07F6ACD7CDCE2, fd1082; +fma.rn.f64 fd487, fd125, 0dBFE73B5AE5DB4E10, fd483; +fma.rn.f64 fd488, fd128, 0d3FE601A24BA81342, fd484; +fma.rn.f64 fd489, fd1127, 0dBFE73B5AE5DB4E10, fd485; +fma.rn.f64 fd490, fd127, 0d3FE601A24BA81342, fd486; +fma.rn.f64 fd491, fd129, 0d3FDDFA67657E7608, fd487; +fma.rn.f64 fd492, fd132, 0d3FEC45BB0D10918C, fd488; +fma.rn.f64 fd493, fd1124, 0d3FDDFA67657E7608, fd489; +fma.rn.f64 fd494, fd131, 0d3FEC45BB0D10918C, fd490; +fma.rn.f64 fd495, fd133, 0d3FEF4079C06C0992, fd491; +fma.rn.f64 fd496, fd136, 0dBFCB8426C12812BC, fd492; +fma.rn.f64 fd497, 
fd1122, 0d3FEF4079C06C0992, fd493; +fma.rn.f64 fd498, fd135, 0dBFCB8426C12812BC, fd494; +fma.rn.f64 fd499, fd137, 0d3FABB81853A18977, fd495; +fma.rn.f64 fd500, fd140, 0dBFEFF3FC588E859D, fd496; +fma.rn.f64 fd501, fd1120, 0d3FABB81853A18977, fd497; +fma.rn.f64 fd502, fd139, 0dBFEFF3FC588E859D, fd498; +fma.rn.f64 fd503, fd141, 0dBFEE532CBE45C954, fd499; +fma.rn.f64 fd504, fd144, 0dBFD46F6FAF5FCB72, fd500; +fma.rn.f64 fd505, fd1117, 0dBFEE532CBE45C954, fd501; +fma.rn.f64 fd506, fd143, 0dBFD46F6FAF5FCB72, fd502; +fma.rn.f64 fd507, fd145, 0dBFE1F53E93956DBF, fd503; +fma.rn.f64 fd508, fd148, 0d3FEA7C6DA34AF89F, fd504; +fma.rn.f64 fd509, fd1115, 0dBFE1F53E93956DBF, fd505; +fma.rn.f64 fd510, fd147, 0d3FEA7C6DA34AF89F, fd506; +fma.rn.f64 fd511, fd149, 0d3FE4B76371208A62, fd507; +fma.rn.f64 fd512, fd152, 0d3FE863A1ADA0CFA6, fd508; +fma.rn.f64 fd513, fd1113, 0d3FE4B76371208A62, fd509; +fma.rn.f64 fd514, fd151, 0d3FE863A1ADA0CFA6, fd510; +fma.rn.f64 fd515, fd153, 0d3FED0ADB9B447CCF, fd511; +fma.rn.f64 fd516, fd156, 0dBFDADF7689C97B70, fd512; +fma.rn.f64 fd517, fd1110, 0d3FED0ADB9B447CCF, fd513; +fma.rn.f64 fd518, fd155, 0dBFDADF7689C97B70, fd514; +fma.rn.f64 fd519, fd157, 0dBFC4B545C0234A71, fd515; +fma.rn.f64 fd520, fd160, 0dBFEF941537248537, fd516; +fma.rn.f64 fd521, fd1108, 0dBFC4B545C0234A71, fd517; +fma.rn.f64 fd522, fd159, 0dBFEF941537248537, fd518; +fma.rn.f64 fd523, fd161, 0dBFEFCFFA67B61650, fd519; +fma.rn.f64 fd524, fd164, 0dBFBBADB02034D9FF, fd520; +fma.rn.f64 fd525, fd1106, 0dBFEFCFFA67B61650, fd521; +fma.rn.f64 fd526, fd163, 0dBFBBADB02034D9FF, fd522; +fma.rn.f64 fd527, fd165, 0dBFD7B057F20BF2E4, fd523; +fma.rn.f64 fd528, fd168, 0d3FEDBA2D62CB789F, fd524; +fma.rn.f64 fd529, fd1103, 0dBFD7B057F20BF2E4, fd525; +fma.rn.f64 fd530, fd167, 0d3FEDBA2D62CB789F, fd526; +fma.rn.f64 fd531, fd169, 0d3FE979982A38E65A, fd527; +fma.rn.f64 fd532, fd172, 0d3FE35D9650D47852, fd528; +fma.rn.f64 fd533, fd1101, 0d3FE979982A38E65A, fd529; +fma.rn.f64 fd534, fd171, 0d3FE35D9650D47852, 
fd530; +fma.rn.f64 fd535, fd117, 0d3FABB81853A18977, %58; +fma.rn.f64 fd539, fd121, 0dBFEFCFFA67B61650, fd535; +fma.rn.f64 fd1081, fd120, 0dBFEFF3FC588E859D, 0d0000000000000000; +fma.rn.f64 fd540, fd124, 0dBFBBADB02034D9FF, fd1081; +fma.rn.f64 fd1080, fd1131, 0d3FABB81853A18977, %59; +fma.rn.f64 fd541, fd1129, 0dBFEFCFFA67B61650, fd1080; +fma.rn.f64 fd1079, fd119, 0dBFEFF3FC588E859D, 0d0000000000000000; +fma.rn.f64 fd542, fd123, 0dBFBBADB02034D9FF, fd1079; +fma.rn.f64 fd543, fd125, 0dBFC4B545C0234A71, fd539; +fma.rn.f64 fd544, fd128, 0d3FEF941537248537, fd540; +fma.rn.f64 fd545, fd1127, 0dBFC4B545C0234A71, fd541; +fma.rn.f64 fd546, fd127, 0d3FEF941537248537, fd542; +fma.rn.f64 fd547, fd129, 0d3FEF4079C06C0992, fd543; +fma.rn.f64 fd548, fd132, 0d3FCB8426C12812BC, fd544; +fma.rn.f64 fd549, fd1124, 0d3FEF4079C06C0992, fd545; +fma.rn.f64 fd550, fd131, 0d3FCB8426C12812BC, fd546; +fma.rn.f64 fd551, fd133, 0d3FD11F2F2E2F1E3B, fd547; +fma.rn.f64 fd552, fd136, 0dBFEED566CB3DCBA1, fd548; +fma.rn.f64 fd553, fd1122, 0d3FD11F2F2E2F1E3B, fd549; +fma.rn.f64 fd554, fd135, 0dBFEED566CB3DCBA1, fd550; +fma.rn.f64 fd555, fd137, 0dBFEE532CBE45C954, fd551; +fma.rn.f64 fd556, fd140, 0dBFD46F6FAF5FCB72, fd552; +fma.rn.f64 fd557, fd1120, 0dBFEE532CBE45C954, fd553; +fma.rn.f64 fd558, fd139, 0dBFD46F6FAF5FCB72, fd554; +fma.rn.f64 fd559, fd141, 0dBFD7B057F20BF2E4, fd555; +fma.rn.f64 fd560, fd144, 0d3FEDBA2D62CB789F, fd556; +fma.rn.f64 fd561, fd1117, 0dBFD7B057F20BF2E4, fd557; +fma.rn.f64 fd562, fd143, 0d3FEDBA2D62CB789F, fd558; +fma.rn.f64 fd563, fd145, 0d3FED0ADB9B447CCF, fd559; +fma.rn.f64 fd564, fd148, 0d3FDADF7689C97B70, fd560; +fma.rn.f64 fd565, fd1115, 0d3FED0ADB9B447CCF, fd561; +fma.rn.f64 fd566, fd147, 0d3FDADF7689C97B70, fd562; +fma.rn.f64 fd567, fd149, 0d3FDDFA67657E7608, fd563; +fma.rn.f64 fd568, fd152, 0dBFEC45BB0D10918C, fd564; +fma.rn.f64 fd569, fd1113, 0d3FDDFA67657E7608, fd565; +fma.rn.f64 fd570, fd151, 0dBFEC45BB0D10918C, fd566; +fma.rn.f64 fd571, fd153, 0dBFEB6B5FBD9F7255, 
fd567; +fma.rn.f64 fd572, fd156, 0dBFE07F6ACD7CDCE2, fd568; +fma.rn.f64 fd573, fd1110, 0dBFEB6B5FBD9F7255, fd569; +fma.rn.f64 fd574, fd155, 0dBFE07F6ACD7CDCE2, fd570; +fma.rn.f64 fd575, fd157, 0dBFE1F53E93956DBF, fd571; +fma.rn.f64 fd576, fd160, 0d3FEA7C6DA34AF89F, fd572; +fma.rn.f64 fd577, fd1108, 0dBFE1F53E93956DBF, fd573; +fma.rn.f64 fd578, fd159, 0d3FEA7C6DA34AF89F, fd574; +fma.rn.f64 fd579, fd161, 0d3FE979982A38E65A, fd575; +fma.rn.f64 fd580, fd164, 0d3FE35D9650D47852, fd576; +fma.rn.f64 fd581, fd1106, 0d3FE979982A38E65A, fd577; +fma.rn.f64 fd582, fd163, 0d3FE35D9650D47852, fd578; +fma.rn.f64 fd583, fd165, 0d3FE4B76371208A62, fd579; +fma.rn.f64 fd584, fd168, 0dBFE863A1ADA0CFA6, fd580; +fma.rn.f64 fd585, fd1103, 0d3FE4B76371208A62, fd581; +fma.rn.f64 fd586, fd167, 0dBFE863A1ADA0CFA6, fd582; +fma.rn.f64 fd587, fd169, 0dBFE73B5AE5DB4E10, fd583; +fma.rn.f64 fd588, fd172, 0dBFE601A24BA81342, fd584; +fma.rn.f64 fd589, fd1101, 0dBFE73B5AE5DB4E10, fd585; +fma.rn.f64 fd590, fd171, 0dBFE601A24BA81342, fd586; +fma.rn.f64 fd591, fd117, 0dBFC4B545C0234A71, %58; +fma.rn.f64 fd595, fd121, 0dBFEE532CBE45C954, fd591; +fma.rn.f64 fd1078, fd120, 0dBFEF941537248537, 0d0000000000000000; +fma.rn.f64 fd596, fd124, 0d3FD46F6FAF5FCB72, fd1078; +fma.rn.f64 fd1077, fd1131, 0dBFC4B545C0234A71, %59; +fma.rn.f64 fd597, fd1129, 0dBFEE532CBE45C954, fd1077; +fma.rn.f64 fd1076, fd119, 0dBFEF941537248537, 0d0000000000000000; +fma.rn.f64 fd598, fd123, 0d3FD46F6FAF5FCB72, fd1076; +fma.rn.f64 fd599, fd125, 0d3FDDFA67657E7608, fd595; +fma.rn.f64 fd600, fd128, 0d3FEC45BB0D10918C, fd596; +fma.rn.f64 fd601, fd1127, 0d3FDDFA67657E7608, fd597; +fma.rn.f64 fd602, fd127, 0d3FEC45BB0D10918C, fd598; +fma.rn.f64 fd603, fd129, 0d3FE979982A38E65A, fd599; +fma.rn.f64 fd604, fd132, 0dBFE35D9650D47852, fd600; +fma.rn.f64 fd605, fd1124, 0d3FE979982A38E65A, fd601; +fma.rn.f64 fd606, fd131, 0dBFE35D9650D47852, fd602; +fma.rn.f64 fd607, fd133, 0dBFE73B5AE5DB4E10, fd603; +fma.rn.f64 fd608, fd136, 0dBFE601A24BA81342, 
fd604; +fma.rn.f64 fd609, fd1122, 0dBFE73B5AE5DB4E10, fd605; +fma.rn.f64 fd610, fd135, 0dBFE601A24BA81342, fd606; +fma.rn.f64 fd611, fd137, 0dBFE1F53E93956DBF, fd607; +fma.rn.f64 fd612, fd140, 0d3FEA7C6DA34AF89F, fd608; +fma.rn.f64 fd613, fd1120, 0dBFE1F53E93956DBF, fd609; +fma.rn.f64 fd614, fd139, 0d3FEA7C6DA34AF89F, fd610; +fma.rn.f64 fd615, fd141, 0d3FED0ADB9B447CCF, fd611; +fma.rn.f64 fd616, fd144, 0d3FDADF7689C97B70, fd612; +fma.rn.f64 fd617, fd1117, 0d3FED0ADB9B447CCF, fd613; +fma.rn.f64 fd618, fd143, 0d3FDADF7689C97B70, fd614; +fma.rn.f64 fd619, fd145, 0d3FD11F2F2E2F1E3B, fd615; +fma.rn.f64 fd620, fd148, 0dBFEED566CB3DCBA1, fd616; +fma.rn.f64 fd621, fd1115, 0d3FD11F2F2E2F1E3B, fd617; +fma.rn.f64 fd622, fd147, 0dBFEED566CB3DCBA1, fd618; +fma.rn.f64 fd623, fd149, 0dBFEFCFFA67B61650, fd619; +fma.rn.f64 fd624, fd152, 0dBFBBADB02034D9FF, fd620; +fma.rn.f64 fd625, fd1113, 0dBFEFCFFA67B61650, fd621; +fma.rn.f64 fd626, fd151, 0dBFBBADB02034D9FF, fd622; +fma.rn.f64 fd627, fd153, 0d3FABB81853A18977, fd623; +fma.rn.f64 fd628, fd156, 0d3FEFF3FC588E859D, fd624; +fma.rn.f64 fd629, fd1110, 0d3FABB81853A18977, fd625; +fma.rn.f64 fd630, fd155, 0d3FEFF3FC588E859D, fd626; +fma.rn.f64 fd631, fd157, 0d3FEF4079C06C0992, fd627; +fma.rn.f64 fd632, fd160, 0dBFCB8426C12812BC, fd628; +fma.rn.f64 fd633, fd1108, 0d3FEF4079C06C0992, fd629; +fma.rn.f64 fd634, fd159, 0dBFCB8426C12812BC, fd630; +fma.rn.f64 fd635, fd161, 0dBFD7B057F20BF2E4, fd631; +fma.rn.f64 fd636, fd164, 0dBFEDBA2D62CB789F, fd632; +fma.rn.f64 fd637, fd1106, 0dBFD7B057F20BF2E4, fd633; +fma.rn.f64 fd638, fd163, 0dBFEDBA2D62CB789F, fd634; +fma.rn.f64 fd639, fd165, 0dBFEB6B5FBD9F7255, fd635; +fma.rn.f64 fd640, fd168, 0d3FE07F6ACD7CDCE2, fd636; +fma.rn.f64 fd641, fd1103, 0dBFEB6B5FBD9F7255, fd637; +fma.rn.f64 fd642, fd167, 0d3FE07F6ACD7CDCE2, fd638; +fma.rn.f64 fd643, fd169, 0d3FE4B76371208A62, fd639; +fma.rn.f64 fd644, fd172, 0d3FE863A1ADA0CFA6, fd640; +fma.rn.f64 fd645, fd1101, 0d3FE4B76371208A62, fd641; +fma.rn.f64 fd646, 
fd171, 0d3FE863A1ADA0CFA6, fd642; +fma.rn.f64 fd647, fd117, 0dBFD7B057F20BF2E4, %58; +fma.rn.f64 fd651, fd121, 0dBFE73B5AE5DB4E10, fd647; +fma.rn.f64 fd1075, fd120, 0dBFEDBA2D62CB789F, 0d0000000000000000; +fma.rn.f64 fd652, fd124, 0d3FE601A24BA81342, fd1075; +fma.rn.f64 fd1074, fd1131, 0dBFD7B057F20BF2E4, %59; +fma.rn.f64 fd653, fd1129, 0dBFE73B5AE5DB4E10, fd1074; +fma.rn.f64 fd1073, fd119, 0dBFEDBA2D62CB789F, 0d0000000000000000; +fma.rn.f64 fd654, fd123, 0d3FE601A24BA81342, fd1073; +fma.rn.f64 fd655, fd125, 0d3FED0ADB9B447CCF, fd651; +fma.rn.f64 fd656, fd128, 0d3FDADF7689C97B70, fd652; +fma.rn.f64 fd657, fd1127, 0d3FED0ADB9B447CCF, fd653; +fma.rn.f64 fd658, fd127, 0d3FDADF7689C97B70, fd654; +fma.rn.f64 fd659, fd129, 0d3FABB81853A18977, fd655; +fma.rn.f64 fd660, fd132, 0dBFEFF3FC588E859D, fd656; +fma.rn.f64 fd661, fd1124, 0d3FABB81853A18977, fd657; +fma.rn.f64 fd662, fd131, 0dBFEFF3FC588E859D, fd658; +fma.rn.f64 fd663, fd133, 0dBFEE532CBE45C954, fd659; +fma.rn.f64 fd664, fd136, 0d3FD46F6FAF5FCB72, fd660; +fma.rn.f64 fd665, fd1122, 0dBFEE532CBE45C954, fd661; +fma.rn.f64 fd666, fd135, 0d3FD46F6FAF5FCB72, fd662; +fma.rn.f64 fd667, fd137, 0d3FE4B76371208A62, fd663; +fma.rn.f64 fd668, fd140, 0d3FE863A1ADA0CFA6, fd664; +fma.rn.f64 fd669, fd1120, 0d3FE4B76371208A62, fd665; +fma.rn.f64 fd670, fd139, 0d3FE863A1ADA0CFA6, fd666; +fma.rn.f64 fd671, fd141, 0d3FDDFA67657E7608, fd667; +fma.rn.f64 fd672, fd144, 0dBFEC45BB0D10918C, fd668; +fma.rn.f64 fd673, fd1117, 0d3FDDFA67657E7608, fd669; +fma.rn.f64 fd674, fd143, 0dBFEC45BB0D10918C, fd670; +fma.rn.f64 fd675, fd145, 0dBFEFCFFA67B61650, fd671; +fma.rn.f64 fd676, fd148, 0dBFBBADB02034D9FF, fd672; +fma.rn.f64 fd677, fd1115, 0dBFEFCFFA67B61650, fd673; +fma.rn.f64 fd678, fd147, 0dBFBBADB02034D9FF, fd674; +fma.rn.f64 fd679, fd149, 0d3FD11F2F2E2F1E3B, fd675; +fma.rn.f64 fd680, fd152, 0d3FEED566CB3DCBA1, fd676; +fma.rn.f64 fd681, fd1113, 0d3FD11F2F2E2F1E3B, fd677; +fma.rn.f64 fd682, fd151, 0d3FEED566CB3DCBA1, fd678; +fma.rn.f64 fd683, 
fd153, 0d3FE979982A38E65A, fd679; +fma.rn.f64 fd684, fd156, 0dBFE35D9650D47852, fd680; +fma.rn.f64 fd685, fd1110, 0d3FE979982A38E65A, fd681; +fma.rn.f64 fd686, fd155, 0dBFE35D9650D47852, fd682; +fma.rn.f64 fd687, fd157, 0dBFEB6B5FBD9F7255, fd683; +fma.rn.f64 fd688, fd160, 0dBFE07F6ACD7CDCE2, fd684; +fma.rn.f64 fd689, fd1108, 0dBFEB6B5FBD9F7255, fd685; +fma.rn.f64 fd690, fd159, 0dBFE07F6ACD7CDCE2, fd686; +fma.rn.f64 fd691, fd161, 0dBFC4B545C0234A71, fd687; +fma.rn.f64 fd692, fd164, 0d3FEF941537248537, fd688; +fma.rn.f64 fd693, fd1106, 0dBFC4B545C0234A71, fd689; +fma.rn.f64 fd694, fd163, 0d3FEF941537248537, fd690; +fma.rn.f64 fd695, fd165, 0d3FEF4079C06C0992, fd691; +fma.rn.f64 fd696, fd168, 0dBFCB8426C12812BC, fd692; +fma.rn.f64 fd697, fd1103, 0d3FEF4079C06C0992, fd693; +fma.rn.f64 fd698, fd167, 0dBFCB8426C12812BC, fd694; +fma.rn.f64 fd699, fd169, 0dBFE1F53E93956DBF, fd695; +fma.rn.f64 fd700, fd172, 0dBFEA7C6DA34AF89F, fd696; +fma.rn.f64 fd701, fd1101, 0dBFE1F53E93956DBF, fd697; +fma.rn.f64 fd702, fd171, 0dBFEA7C6DA34AF89F, fd698; +fma.rn.f64 fd703, fd117, 0dBFE1F53E93956DBF, %58; +fma.rn.f64 fd707, fd121, 0dBFD7B057F20BF2E4, fd703; +fma.rn.f64 fd1072, fd120, 0dBFEA7C6DA34AF89F, 0d0000000000000000; +fma.rn.f64 fd708, fd124, 0d3FEDBA2D62CB789F, fd1072; +fma.rn.f64 fd1071, fd1131, 0dBFE1F53E93956DBF, %59; +fma.rn.f64 fd709, fd1129, 0dBFD7B057F20BF2E4, fd1071; +fma.rn.f64 fd1070, fd119, 0dBFEA7C6DA34AF89F, 0d0000000000000000; +fma.rn.f64 fd710, fd123, 0d3FEDBA2D62CB789F, fd1070; +fma.rn.f64 fd711, fd125, 0d3FEF4079C06C0992, fd707; +fma.rn.f64 fd712, fd128, 0dBFCB8426C12812BC, fd708; +fma.rn.f64 fd713, fd1127, 0d3FEF4079C06C0992, fd709; +fma.rn.f64 fd714, fd127, 0dBFCB8426C12812BC, fd710; +fma.rn.f64 fd715, fd129, 0dBFE73B5AE5DB4E10, fd711; +fma.rn.f64 fd716, fd132, 0dBFE601A24BA81342, fd712; +fma.rn.f64 fd717, fd1124, 0dBFE73B5AE5DB4E10, fd713; +fma.rn.f64 fd718, fd131, 0dBFE601A24BA81342, fd714; +fma.rn.f64 fd719, fd133, 0dBFC4B545C0234A71, fd715; +fma.rn.f64 fd720, 
fd136, 0d3FEF941537248537, fd716; +fma.rn.f64 fd721, fd1122, 0dBFC4B545C0234A71, fd717; +fma.rn.f64 fd722, fd135, 0d3FEF941537248537, fd718; +fma.rn.f64 fd723, fd137, 0d3FED0ADB9B447CCF, fd719; +fma.rn.f64 fd724, fd140, 0dBFDADF7689C97B70, fd720; +fma.rn.f64 fd725, fd1120, 0d3FED0ADB9B447CCF, fd721; +fma.rn.f64 fd726, fd139, 0dBFDADF7689C97B70, fd722; +fma.rn.f64 fd727, fd141, 0dBFEB6B5FBD9F7255, fd723; +fma.rn.f64 fd728, fd144, 0dBFE07F6ACD7CDCE2, fd724; +fma.rn.f64 fd729, fd1117, 0dBFEB6B5FBD9F7255, fd725; +fma.rn.f64 fd730, fd143, 0dBFE07F6ACD7CDCE2, fd726; +fma.rn.f64 fd731, fd145, 0d3FABB81853A18977, fd727; +fma.rn.f64 fd732, fd148, 0d3FEFF3FC588E859D, fd728; +fma.rn.f64 fd733, fd1115, 0d3FABB81853A18977, fd729; +fma.rn.f64 fd734, fd147, 0d3FEFF3FC588E859D, fd730; +fma.rn.f64 fd735, fd149, 0d3FE979982A38E65A, fd731; +fma.rn.f64 fd736, fd152, 0dBFE35D9650D47852, fd732; +fma.rn.f64 fd737, fd1113, 0d3FE979982A38E65A, fd733; +fma.rn.f64 fd738, fd151, 0dBFE35D9650D47852, fd734; +fma.rn.f64 fd739, fd153, 0dBFEE532CBE45C954, fd735; +fma.rn.f64 fd740, fd156, 0dBFD46F6FAF5FCB72, fd736; +fma.rn.f64 fd741, fd1110, 0dBFEE532CBE45C954, fd737; +fma.rn.f64 fd742, fd155, 0dBFD46F6FAF5FCB72, fd738; +fma.rn.f64 fd743, fd157, 0d3FD11F2F2E2F1E3B, fd739; +fma.rn.f64 fd744, fd160, 0d3FEED566CB3DCBA1, fd740; +fma.rn.f64 fd745, fd1108, 0d3FD11F2F2E2F1E3B, fd741; +fma.rn.f64 fd746, fd159, 0d3FEED566CB3DCBA1, fd742; +fma.rn.f64 fd747, fd161, 0d3FE4B76371208A62, fd743; +fma.rn.f64 fd748, fd164, 0dBFE863A1ADA0CFA6, fd744; +fma.rn.f64 fd749, fd1106, 0d3FE4B76371208A62, fd745; +fma.rn.f64 fd750, fd163, 0dBFE863A1ADA0CFA6, fd746; +fma.rn.f64 fd751, fd165, 0dBFEFCFFA67B61650, fd747; +fma.rn.f64 fd752, fd168, 0dBFBBADB02034D9FF, fd748; +fma.rn.f64 fd753, fd1103, 0dBFEFCFFA67B61650, fd749; +fma.rn.f64 fd754, fd167, 0dBFBBADB02034D9FF, fd750; +fma.rn.f64 fd755, fd169, 0d3FDDFA67657E7608, fd751; +fma.rn.f64 fd756, fd172, 0d3FEC45BB0D10918C, fd752; +fma.rn.f64 fd757, fd1101, 0d3FDDFA67657E7608, 
fd753; +fma.rn.f64 fd758, fd171, 0d3FEC45BB0D10918C, fd754; +fma.rn.f64 fd759, fd117, 0dBFE73B5AE5DB4E10, %58; +fma.rn.f64 fd763, fd121, 0d3FABB81853A18977, fd759; +fma.rn.f64 fd1069, fd120, 0dBFE601A24BA81342, 0d0000000000000000; +fma.rn.f64 fd764, fd124, 0d3FEFF3FC588E859D, fd1069; +fma.rn.f64 fd1068, fd1131, 0dBFE73B5AE5DB4E10, %59; +fma.rn.f64 fd765, fd1129, 0d3FABB81853A18977, fd1068; +fma.rn.f64 fd1067, fd119, 0dBFE601A24BA81342, 0d0000000000000000; +fma.rn.f64 fd766, fd123, 0d3FEFF3FC588E859D, fd1067; +fma.rn.f64 fd767, fd125, 0d3FE4B76371208A62, fd763; +fma.rn.f64 fd768, fd128, 0dBFE863A1ADA0CFA6, fd764; +fma.rn.f64 fd769, fd1127, 0d3FE4B76371208A62, fd765; +fma.rn.f64 fd770, fd127, 0dBFE863A1ADA0CFA6, fd766; +fma.rn.f64 fd771, fd129, 0dBFEFCFFA67B61650, fd767; +fma.rn.f64 fd772, fd132, 0d3FBBADB02034D9FF, fd768; +fma.rn.f64 fd773, fd1124, 0dBFEFCFFA67B61650, fd769; +fma.rn.f64 fd774, fd131, 0d3FBBADB02034D9FF, fd770; +fma.rn.f64 fd775, fd133, 0d3FE979982A38E65A, fd771; +fma.rn.f64 fd776, fd136, 0d3FE35D9650D47852, fd772; +fma.rn.f64 fd777, fd1122, 0d3FE979982A38E65A, fd773; +fma.rn.f64 fd778, fd135, 0d3FE35D9650D47852, fd774; +fma.rn.f64 fd779, fd137, 0dBFC4B545C0234A71, fd775; +fma.rn.f64 fd780, fd140, 0dBFEF941537248537, fd776; +fma.rn.f64 fd781, fd1120, 0dBFC4B545C0234A71, fd777; +fma.rn.f64 fd782, fd139, 0dBFEF941537248537, fd778; +fma.rn.f64 fd783, fd141, 0dBFE1F53E93956DBF, fd779; +fma.rn.f64 fd784, fd144, 0d3FEA7C6DA34AF89F, fd780; +fma.rn.f64 fd785, fd1117, 0dBFE1F53E93956DBF, fd781; +fma.rn.f64 fd786, fd143, 0d3FEA7C6DA34AF89F, fd782; +fma.rn.f64 fd787, fd145, 0d3FEF4079C06C0992, fd783; +fma.rn.f64 fd788, fd148, 0dBFCB8426C12812BC, fd784; +fma.rn.f64 fd789, fd1115, 0d3FEF4079C06C0992, fd785; +fma.rn.f64 fd790, fd147, 0dBFCB8426C12812BC, fd786; +fma.rn.f64 fd791, fd149, 0dBFEB6B5FBD9F7255, fd787; +fma.rn.f64 fd792, fd152, 0dBFE07F6ACD7CDCE2, fd788; +fma.rn.f64 fd793, fd1113, 0dBFEB6B5FBD9F7255, fd789; +fma.rn.f64 fd794, fd151, 0dBFE07F6ACD7CDCE2, 
fd790; +fma.rn.f64 fd795, fd153, 0d3FD11F2F2E2F1E3B, fd791; +fma.rn.f64 fd796, fd156, 0d3FEED566CB3DCBA1, fd792; +fma.rn.f64 fd797, fd1110, 0d3FD11F2F2E2F1E3B, fd793; +fma.rn.f64 fd798, fd155, 0d3FEED566CB3DCBA1, fd794; +fma.rn.f64 fd799, fd157, 0d3FDDFA67657E7608, fd795; +fma.rn.f64 fd800, fd160, 0dBFEC45BB0D10918C, fd796; +fma.rn.f64 fd801, fd1108, 0d3FDDFA67657E7608, fd797; +fma.rn.f64 fd802, fd159, 0dBFEC45BB0D10918C, fd798; +fma.rn.f64 fd803, fd161, 0dBFEE532CBE45C954, fd799; +fma.rn.f64 fd804, fd164, 0d3FD46F6FAF5FCB72, fd800; +fma.rn.f64 fd805, fd1106, 0dBFEE532CBE45C954, fd801; +fma.rn.f64 fd806, fd163, 0d3FD46F6FAF5FCB72, fd802; +fma.rn.f64 fd807, fd165, 0d3FED0ADB9B447CCF, fd803; +fma.rn.f64 fd808, fd168, 0d3FDADF7689C97B70, fd804; +fma.rn.f64 fd809, fd1103, 0d3FED0ADB9B447CCF, fd805; +fma.rn.f64 fd810, fd167, 0d3FDADF7689C97B70, fd806; +fma.rn.f64 fd811, fd169, 0dBFD7B057F20BF2E4, fd807; +fma.rn.f64 fd812, fd172, 0dBFEDBA2D62CB789F, fd808; +fma.rn.f64 fd813, fd1101, 0dBFD7B057F20BF2E4, fd809; +fma.rn.f64 fd814, fd171, 0dBFEDBA2D62CB789F, fd810; +fma.rn.f64 fd815, fd117, 0dBFEB6B5FBD9F7255, %58; +fma.rn.f64 fd819, fd121, 0d3FDDFA67657E7608, fd815; +fma.rn.f64 fd1066, fd120, 0dBFE07F6ACD7CDCE2, 0d0000000000000000; +fma.rn.f64 fd820, fd124, 0d3FEC45BB0D10918C, fd1066; +fma.rn.f64 fd1065, fd1131, 0dBFEB6B5FBD9F7255, %59; +fma.rn.f64 fd821, fd1129, 0d3FDDFA67657E7608, fd1065; +fma.rn.f64 fd1064, fd119, 0dBFE07F6ACD7CDCE2, 0d0000000000000000; +fma.rn.f64 fd822, fd123, 0d3FEC45BB0D10918C, fd1064; +fma.rn.f64 fd823, fd125, 0d3FABB81853A18977, fd819; +fma.rn.f64 fd824, fd128, 0dBFEFF3FC588E859D, fd820; +fma.rn.f64 fd825, fd1127, 0d3FABB81853A18977, fd821; +fma.rn.f64 fd826, fd127, 0dBFEFF3FC588E859D, fd822; +fma.rn.f64 fd827, fd129, 0dBFE1F53E93956DBF, fd823; +fma.rn.f64 fd828, fd132, 0d3FEA7C6DA34AF89F, fd824; +fma.rn.f64 fd829, fd1124, 0dBFE1F53E93956DBF, fd825; +fma.rn.f64 fd830, fd131, 0d3FEA7C6DA34AF89F, fd826; +fma.rn.f64 fd831, fd133, 0d3FED0ADB9B447CCF, 
fd827; +fma.rn.f64 fd832, fd136, 0dBFDADF7689C97B70, fd828; +fma.rn.f64 fd833, fd1122, 0d3FED0ADB9B447CCF, fd829; +fma.rn.f64 fd834, fd135, 0dBFDADF7689C97B70, fd830; +fma.rn.f64 fd835, fd137, 0dBFEFCFFA67B61650, fd831; +fma.rn.f64 fd836, fd140, 0dBFBBADB02034D9FF, fd832; +fma.rn.f64 fd837, fd1120, 0dBFEFCFFA67B61650, fd833; +fma.rn.f64 fd838, fd139, 0dBFBBADB02034D9FF, fd834; +fma.rn.f64 fd839, fd141, 0d3FE979982A38E65A, fd835; +fma.rn.f64 fd840, fd144, 0d3FE35D9650D47852, fd836; +fma.rn.f64 fd841, fd1117, 0d3FE979982A38E65A, fd837; +fma.rn.f64 fd842, fd143, 0d3FE35D9650D47852, fd838; +fma.rn.f64 fd843, fd145, 0dBFD7B057F20BF2E4, fd839; +fma.rn.f64 fd844, fd148, 0dBFEDBA2D62CB789F, fd840; +fma.rn.f64 fd845, fd1115, 0dBFD7B057F20BF2E4, fd841; +fma.rn.f64 fd846, fd147, 0dBFEDBA2D62CB789F, fd842; +fma.rn.f64 fd847, fd149, 0dBFC4B545C0234A71, fd843; +fma.rn.f64 fd848, fd152, 0d3FEF941537248537, fd844; +fma.rn.f64 fd849, fd1113, 0dBFC4B545C0234A71, fd845; +fma.rn.f64 fd850, fd151, 0d3FEF941537248537, fd846; +fma.rn.f64 fd851, fd153, 0d3FE4B76371208A62, fd847; +fma.rn.f64 fd852, fd156, 0dBFE863A1ADA0CFA6, fd848; +fma.rn.f64 fd853, fd1110, 0d3FE4B76371208A62, fd849; +fma.rn.f64 fd854, fd155, 0dBFE863A1ADA0CFA6, fd850; +fma.rn.f64 fd855, fd157, 0dBFEE532CBE45C954, fd851; +fma.rn.f64 fd856, fd160, 0d3FD46F6FAF5FCB72, fd852; +fma.rn.f64 fd857, fd1108, 0dBFEE532CBE45C954, fd853; +fma.rn.f64 fd858, fd159, 0d3FD46F6FAF5FCB72, fd854; +fma.rn.f64 fd859, fd161, 0d3FEF4079C06C0992, fd855; +fma.rn.f64 fd860, fd164, 0d3FCB8426C12812BC, fd856; +fma.rn.f64 fd861, fd1106, 0d3FEF4079C06C0992, fd857; +fma.rn.f64 fd862, fd163, 0d3FCB8426C12812BC, fd858; +fma.rn.f64 fd863, fd165, 0dBFE73B5AE5DB4E10, fd859; +fma.rn.f64 fd864, fd168, 0dBFE601A24BA81342, fd860; +fma.rn.f64 fd865, fd1103, 0dBFE73B5AE5DB4E10, fd861; +fma.rn.f64 fd866, fd167, 0dBFE601A24BA81342, fd862; +fma.rn.f64 fd867, fd169, 0d3FD11F2F2E2F1E3B, fd863; +fma.rn.f64 fd868, fd172, 0d3FEED566CB3DCBA1, fd864; +fma.rn.f64 fd869, 
fd1101, 0d3FD11F2F2E2F1E3B, fd865; +fma.rn.f64 fd870, fd171, 0d3FEED566CB3DCBA1, fd866; +fma.rn.f64 fd871, fd117, 0dBFEE532CBE45C954, %58; +fma.rn.f64 fd875, fd121, 0d3FE979982A38E65A, fd871; +fma.rn.f64 fd1063, fd120, 0dBFD46F6FAF5FCB72, 0d0000000000000000; +fma.rn.f64 fd876, fd124, 0d3FE35D9650D47852, fd1063; +fma.rn.f64 fd1062, fd1131, 0dBFEE532CBE45C954, %59; +fma.rn.f64 fd877, fd1129, 0d3FE979982A38E65A, fd1062; +fma.rn.f64 fd1061, fd119, 0dBFD46F6FAF5FCB72, 0d0000000000000000; +fma.rn.f64 fd878, fd123, 0d3FE35D9650D47852, fd1061; +fma.rn.f64 fd879, fd125, 0dBFE1F53E93956DBF, fd875; +fma.rn.f64 fd880, fd128, 0dBFEA7C6DA34AF89F, fd876; +fma.rn.f64 fd881, fd1127, 0dBFE1F53E93956DBF, fd877; +fma.rn.f64 fd882, fd127, 0dBFEA7C6DA34AF89F, fd878; +fma.rn.f64 fd883, fd129, 0d3FD11F2F2E2F1E3B, fd879; +fma.rn.f64 fd884, fd132, 0d3FEED566CB3DCBA1, fd880; +fma.rn.f64 fd885, fd1124, 0d3FD11F2F2E2F1E3B, fd881; +fma.rn.f64 fd886, fd131, 0d3FEED566CB3DCBA1, fd882; +fma.rn.f64 fd887, fd133, 0d3FABB81853A18977, fd883; +fma.rn.f64 fd888, fd136, 0dBFEFF3FC588E859D, fd884; +fma.rn.f64 fd889, fd1122, 0d3FABB81853A18977, fd885; +fma.rn.f64 fd890, fd135, 0dBFEFF3FC588E859D, fd886; +fma.rn.f64 fd891, fd137, 0dBFD7B057F20BF2E4, fd887; +fma.rn.f64 fd892, fd140, 0d3FEDBA2D62CB789F, fd888; +fma.rn.f64 fd893, fd1120, 0dBFD7B057F20BF2E4, fd889; +fma.rn.f64 fd894, fd139, 0d3FEDBA2D62CB789F, fd890; +fma.rn.f64 fd895, fd141, 0d3FE4B76371208A62, fd891; +fma.rn.f64 fd896, fd144, 0dBFE863A1ADA0CFA6, fd892; +fma.rn.f64 fd897, fd1117, 0d3FE4B76371208A62, fd893; +fma.rn.f64 fd898, fd143, 0dBFE863A1ADA0CFA6, fd894; +fma.rn.f64 fd899, fd145, 0dBFEB6B5FBD9F7255, fd895; +fma.rn.f64 fd900, fd148, 0d3FE07F6ACD7CDCE2, fd896; +fma.rn.f64 fd901, fd1115, 0dBFEB6B5FBD9F7255, fd897; +fma.rn.f64 fd902, fd147, 0d3FE07F6ACD7CDCE2, fd898; +fma.rn.f64 fd903, fd149, 0d3FEF4079C06C0992, fd899; +fma.rn.f64 fd904, fd152, 0dBFCB8426C12812BC, fd900; +fma.rn.f64 fd905, fd1113, 0d3FEF4079C06C0992, fd901; +fma.rn.f64 fd906, 
fd151, 0dBFCB8426C12812BC, fd902; +fma.rn.f64 fd907, fd153, 0dBFEFCFFA67B61650, fd903; +fma.rn.f64 fd908, fd156, 0dBFBBADB02034D9FF, fd904; +fma.rn.f64 fd909, fd1110, 0dBFEFCFFA67B61650, fd905; +fma.rn.f64 fd910, fd155, 0dBFBBADB02034D9FF, fd906; +fma.rn.f64 fd911, fd157, 0d3FED0ADB9B447CCF, fd907; +fma.rn.f64 fd912, fd160, 0d3FDADF7689C97B70, fd908; +fma.rn.f64 fd913, fd1108, 0d3FED0ADB9B447CCF, fd909; +fma.rn.f64 fd914, fd159, 0d3FDADF7689C97B70, fd910; +fma.rn.f64 fd915, fd161, 0dBFE73B5AE5DB4E10, fd911; +fma.rn.f64 fd916, fd164, 0dBFE601A24BA81342, fd912; +fma.rn.f64 fd917, fd1106, 0dBFE73B5AE5DB4E10, fd913; +fma.rn.f64 fd918, fd163, 0dBFE601A24BA81342, fd914; +fma.rn.f64 fd919, fd165, 0d3FDDFA67657E7608, fd915; +fma.rn.f64 fd920, fd168, 0d3FEC45BB0D10918C, fd916; +fma.rn.f64 fd921, fd1103, 0d3FDDFA67657E7608, fd917; +fma.rn.f64 fd922, fd167, 0d3FEC45BB0D10918C, fd918; +fma.rn.f64 fd923, fd169, 0dBFC4B545C0234A71, fd919; +fma.rn.f64 fd924, fd172, 0dBFEF941537248537, fd920; +fma.rn.f64 fd925, fd1101, 0dBFC4B545C0234A71, fd921; +fma.rn.f64 fd926, fd171, 0dBFEF941537248537, fd922; +fma.rn.f64 fd927, fd117, 0dBFEFCFFA67B61650, %58; +fma.rn.f64 fd928, fd120, 0dBFBBADB02034D9FF, 0d0000000000000000; +fma.rn.f64 fd929, fd1131, 0dBFEFCFFA67B61650, %59; +fma.rn.f64 fd930, fd119, 0dBFBBADB02034D9FF, 0d0000000000000000; +fma.rn.f64 fd931, fd121, 0d3FEF4079C06C0992, fd927; +fma.rn.f64 fd932, fd124, 0d3FCB8426C12812BC, fd928; +fma.rn.f64 fd933, fd1129, 0d3FEF4079C06C0992, fd929; +fma.rn.f64 fd934, fd123, 0d3FCB8426C12812BC, fd930; +fma.rn.f64 fd935, fd125, 0dBFEE532CBE45C954, fd931; +fma.rn.f64 fd936, fd128, 0dBFD46F6FAF5FCB72, fd932; +fma.rn.f64 fd937, fd1127, 0dBFEE532CBE45C954, fd933; +fma.rn.f64 fd938, fd127, 0dBFD46F6FAF5FCB72, fd934; +fma.rn.f64 fd939, fd129, 0d3FED0ADB9B447CCF, fd935; +fma.rn.f64 fd940, fd132, 0d3FDADF7689C97B70, fd936; +fma.rn.f64 fd941, fd1124, 0d3FED0ADB9B447CCF, fd937; +fma.rn.f64 fd942, fd131, 0d3FDADF7689C97B70, fd938; +fma.rn.f64 fd943, fd133, 
0dBFEB6B5FBD9F7255, fd939; +fma.rn.f64 fd944, fd136, 0dBFE07F6ACD7CDCE2, fd940; +fma.rn.f64 fd945, fd1122, 0dBFEB6B5FBD9F7255, fd941; +fma.rn.f64 fd946, fd135, 0dBFE07F6ACD7CDCE2, fd942; +fma.rn.f64 fd947, fd137, 0d3FE979982A38E65A, fd943; +fma.rn.f64 fd948, fd140, 0d3FE35D9650D47852, fd944; +fma.rn.f64 fd949, fd1120, 0d3FE979982A38E65A, fd945; +fma.rn.f64 fd950, fd139, 0d3FE35D9650D47852, fd946; +fma.rn.f64 fd951, fd141, 0dBFE73B5AE5DB4E10, fd947; +fma.rn.f64 fd952, fd144, 0dBFE601A24BA81342, fd948; +fma.rn.f64 fd953, fd1117, 0dBFE73B5AE5DB4E10, fd949; +fma.rn.f64 fd954, fd143, 0dBFE601A24BA81342, fd950; +fma.rn.f64 fd955, fd145, 0d3FE4B76371208A62, fd951; +fma.rn.f64 fd956, fd148, 0d3FE863A1ADA0CFA6, fd952; +fma.rn.f64 fd957, fd1115, 0d3FE4B76371208A62, fd953; +fma.rn.f64 fd958, fd147, 0d3FE863A1ADA0CFA6, fd954; +fma.rn.f64 fd959, fd149, 0dBFE1F53E93956DBF, fd955; +fma.rn.f64 fd960, fd152, 0dBFEA7C6DA34AF89F, fd956; +fma.rn.f64 fd961, fd1113, 0dBFE1F53E93956DBF, fd957; +fma.rn.f64 fd962, fd151, 0dBFEA7C6DA34AF89F, fd958; +fma.rn.f64 fd963, fd153, 0d3FDDFA67657E7608, fd959; +fma.rn.f64 fd964, fd156, 0d3FEC45BB0D10918C, fd960; +fma.rn.f64 fd965, fd1110, 0d3FDDFA67657E7608, fd961; +fma.rn.f64 fd966, fd155, 0d3FEC45BB0D10918C, fd962; +fma.rn.f64 fd967, fd157, 0dBFD7B057F20BF2E4, fd963; +fma.rn.f64 fd968, fd160, 0dBFEDBA2D62CB789F, fd964; +fma.rn.f64 fd969, fd1108, 0dBFD7B057F20BF2E4, fd965; +fma.rn.f64 fd970, fd159, 0dBFEDBA2D62CB789F, fd966; +fma.rn.f64 fd971, fd161, 0d3FD11F2F2E2F1E3B, fd967; +fma.rn.f64 fd972, fd164, 0d3FEED566CB3DCBA1, fd968; +fma.rn.f64 fd973, fd1106, 0d3FD11F2F2E2F1E3B, fd969; +fma.rn.f64 fd974, fd163, 0d3FEED566CB3DCBA1, fd970; +fma.rn.f64 fd975, fd165, 0dBFC4B545C0234A71, fd971; +fma.rn.f64 fd976, fd168, 0dBFEF941537248537, fd972; +fma.rn.f64 fd977, fd1103, 0dBFC4B545C0234A71, fd973; +fma.rn.f64 fd978, fd167, 0dBFEF941537248537, fd974; +fma.rn.f64 fd979, fd169, 0d3FABB81853A18977, fd975; +fma.rn.f64 fd980, fd172, 0d3FEFF3FC588E859D, fd976; 
+fma.rn.f64 fd981, fd1101, 0d3FABB81853A18977, fd977; +fma.rn.f64 fd982, fd171, 0d3FEFF3FC588E859D, fd978; +add.f64 %1, fd198, fd1101; +add.f64 %0, fd197, fd169; +sub.f64 %2, fd251, fd252; +add.f64 %3, fd253, fd254; +sub.f64 %4, fd307, fd308; +add.f64 %5, fd309, fd310; +sub.f64 %6, fd363, fd364; +add.f64 %7, fd365, fd366; +sub.f64 %8, fd419, fd420; +add.f64 %9, fd421, fd422; +add.f64 %11, fd477, fd478; +sub.f64 %10, fd475, fd476; +add.f64 %13, fd533, fd534; +sub.f64 %12, fd531, fd532; +add.f64 %15, fd589, fd590; +sub.f64 %14, fd587, fd588; +sub.f64 %16, fd643, fd644; +add.f64 %17, fd645, fd646; +sub.f64 %18, fd699, fd700; +add.f64 %19, fd701, fd702; +sub.f64 %20, fd755, fd756; +add.f64 %21, fd757, fd758; +add.f64 %23, fd813, fd814; +sub.f64 %22, fd811, fd812; +add.f64 %25, fd869, fd870; +sub.f64 %24, fd867, fd868; +add.f64 %27, fd925, fd926; +sub.f64 %26, fd923, fd924; +sub.f64 %28, fd979, fd980; +add.f64 %29, fd981, fd982; +sub.f64 %31, fd981, fd982; +add.f64 %30, fd979, fd980; +sub.f64 %33, fd925, fd926; +add.f64 %32, fd923, fd924; +sub.f64 %35, fd869, fd870; +add.f64 %34, fd867, fd868; +sub.f64 %37, fd813, fd814; +add.f64 %36, fd811, fd812; +sub.f64 %39, fd757, fd758; +add.f64 %38, fd755, fd756; +sub.f64 %41, fd701, fd702; +add.f64 %40, fd699, fd700; +sub.f64 %43, fd645, fd646; +add.f64 %42, fd643, fd644; +sub.f64 %45, fd589, fd590; +add.f64 %44, fd587, fd588; +sub.f64 %47, fd533, fd534; +add.f64 %46, fd531, fd532; +sub.f64 %49, fd477, fd478; +add.f64 %48, fd475, fd476; +sub.f64 %51, fd421, fd422; +add.f64 %50, fd419, fd420; +sub.f64 %53, fd365, fd366; +add.f64 %52, fd363, fd364; +sub.f64 %55, fd309, fd310; +add.f64 %54, fd307, fd308; +sub.f64 %57, fd253, fd254; +add.f64 %56, fd251, fd252; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), 
"=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), "=d"(rmem[28].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[1].y), "d"(rmem[2].y), "d"(rmem[26].y), "d"(rmem[25].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[23].y), "d"(rmem[22].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[20].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[11].y), "d"(rmem[17].y), 
"d"(rmem[16].y), "d"(rmem[13].y), "d"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..23bf3a03e7dff --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_29_fp64_inv.hpp.inc @@ -0,0 +1,940 @@ +#ifndef CUFFTDX_FFT_29_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_29_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<588, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<1133>; +.reg .b64 rd<4>; +add.f64 fd117, %60, %114; +sub.f64 fd119, %60, %114; +add.f64 fd1131, %116, %115; +sub.f64 fd120, %116, %115; +add.f64 fd121, %62, %112; +sub.f64 fd123, %62, %112; +add.f64 fd1129, %117, %113; +sub.f64 fd124, %117, %113; +add.f64 fd125, %64, %110; +sub.f64 fd127, %64, %110; +add.f64 fd1127, %65, %118; +sub.f64 fd128, %65, %118; +add.f64 fd129, %66, %108; +sub.f64 fd131, %66, %108; +add.f64 fd1124, %120, %119; +sub.f64 fd132, %120, %119; +add.f64 fd133, %68, %106; +sub.f64 fd135, %68, %106; +add.f64 fd1122, %121, %107; +sub.f64 fd136, %121, %107; +add.f64 fd137, %70, %104; +sub.f64 fd139, %70, %104; +add.f64 fd1120, %71, %122; +sub.f64 fd140, %71, %122; +add.f64 fd141, %72, %102; +sub.f64 fd143, %72, %102; +add.f64 fd1117, %124, %123; +sub.f64 fd144, %124, %123; +add.f64 fd145, %74, %100; +sub.f64 fd147, %74, %100; +add.f64 fd1115, %125, %101; +sub.f64 fd148, %125, %101; +add.f64 fd149, %76, %98; +sub.f64 fd151, %76, %98; +add.f64 fd1113, %77, %126; +sub.f64 fd152, %77, %126; +add.f64 fd153, %78, %96; +sub.f64 fd155, %78, %96; +add.f64 fd1110, %128, %127; +sub.f64 fd156, %128, %127; +add.f64 fd157, %80, %94; +sub.f64 fd159, %80, %94; +add.f64 
fd1108, %129, %95; +sub.f64 fd160, %129, %95; +add.f64 fd161, %82, %92; +sub.f64 fd163, %82, %92; +add.f64 fd1106, %83, %130; +sub.f64 fd164, %83, %130; +add.f64 fd165, %84, %90; +sub.f64 fd167, %84, %90; +add.f64 fd1103, %132, %131; +sub.f64 fd168, %132, %131; +add.f64 fd169, %86, %88; +sub.f64 fd171, %86, %88; +add.f64 fd1101, %133, %89; +sub.f64 fd172, %133, %89; +add.f64 fd173, %58, fd117; +add.f64 fd175, fd173, fd121; +add.f64 fd1100, %59, fd1131; +add.f64 fd176, fd1100, fd1129; +add.f64 fd177, fd175, fd125; +add.f64 fd178, fd176, fd1127; +add.f64 fd179, fd177, fd129; +add.f64 fd180, fd178, fd1124; +add.f64 fd181, fd179, fd133; +add.f64 fd182, fd180, fd1122; +add.f64 fd183, fd181, fd137; +add.f64 fd184, fd182, fd1120; +add.f64 fd185, fd183, fd141; +add.f64 fd186, fd184, fd1117; +add.f64 fd187, fd185, fd145; +add.f64 fd188, fd186, fd1115; +add.f64 fd189, fd187, fd149; +add.f64 fd190, fd188, fd1113; +add.f64 fd191, fd189, fd153; +add.f64 fd192, fd190, fd1110; +add.f64 fd193, fd191, fd157; +add.f64 fd194, fd192, fd1108; +add.f64 fd195, fd193, fd161; +add.f64 fd196, fd194, fd1106; +add.f64 fd197, fd195, fd165; +add.f64 fd198, fd196, fd1103; +fma.rn.f64 fd199, fd117, 0d3FEF4079C06C0992, %58; +fma.rn.f64 fd203, fd121, 0d3FED0ADB9B447CCF, fd199; +fma.rn.f64 fd1099, fd120, 0d3FCB8426C12812BC, 0d0000000000000000; +fma.rn.f64 fd204, fd124, 0d3FDADF7689C97B70, fd1099; +fma.rn.f64 fd1098, fd1131, 0d3FEF4079C06C0992, %59; +fma.rn.f64 fd205, fd1129, 0d3FED0ADB9B447CCF, fd1098; +fma.rn.f64 fd1097, fd119, 0d3FCB8426C12812BC, 0d0000000000000000; +fma.rn.f64 fd206, fd123, 0d3FDADF7689C97B70, fd1097; +fma.rn.f64 fd207, fd125, 0d3FE979982A38E65A, fd203; +fma.rn.f64 fd208, fd128, 0d3FE35D9650D47852, fd204; +fma.rn.f64 fd209, fd1127, 0d3FE979982A38E65A, fd205; +fma.rn.f64 fd210, fd127, 0d3FE35D9650D47852, fd206; +fma.rn.f64 fd211, fd129, 0d3FE4B76371208A62, fd207; +fma.rn.f64 fd212, fd132, 0d3FE863A1ADA0CFA6, fd208; +fma.rn.f64 fd213, fd1124, 0d3FE4B76371208A62, fd209; +fma.rn.f64 
fd214, fd131, 0d3FE863A1ADA0CFA6, fd210; +fma.rn.f64 fd215, fd133, 0d3FDDFA67657E7608, fd211; +fma.rn.f64 fd216, fd136, 0d3FEC45BB0D10918C, fd212; +fma.rn.f64 fd217, fd1122, 0d3FDDFA67657E7608, fd213; +fma.rn.f64 fd218, fd135, 0d3FEC45BB0D10918C, fd214; +fma.rn.f64 fd219, fd137, 0d3FD11F2F2E2F1E3B, fd215; +fma.rn.f64 fd220, fd140, 0d3FEED566CB3DCBA1, fd216; +fma.rn.f64 fd221, fd1120, 0d3FD11F2F2E2F1E3B, fd217; +fma.rn.f64 fd222, fd139, 0d3FEED566CB3DCBA1, fd218; +fma.rn.f64 fd223, fd141, 0d3FABB81853A18977, fd219; +fma.rn.f64 fd224, fd144, 0d3FEFF3FC588E859D, fd220; +fma.rn.f64 fd225, fd1117, 0d3FABB81853A18977, fd221; +fma.rn.f64 fd226, fd143, 0d3FEFF3FC588E859D, fd222; +fma.rn.f64 fd227, fd145, 0dBFC4B545C0234A71, fd223; +fma.rn.f64 fd228, fd148, 0d3FEF941537248537, fd224; +fma.rn.f64 fd229, fd1115, 0dBFC4B545C0234A71, fd225; +fma.rn.f64 fd230, fd147, 0d3FEF941537248537, fd226; +fma.rn.f64 fd231, fd149, 0dBFD7B057F20BF2E4, fd227; +fma.rn.f64 fd232, fd152, 0d3FEDBA2D62CB789F, fd228; +fma.rn.f64 fd233, fd1113, 0dBFD7B057F20BF2E4, fd229; +fma.rn.f64 fd234, fd151, 0d3FEDBA2D62CB789F, fd230; +fma.rn.f64 fd235, fd153, 0dBFE1F53E93956DBF, fd231; +fma.rn.f64 fd236, fd156, 0d3FEA7C6DA34AF89F, fd232; +fma.rn.f64 fd237, fd1110, 0dBFE1F53E93956DBF, fd233; +fma.rn.f64 fd238, fd155, 0d3FEA7C6DA34AF89F, fd234; +fma.rn.f64 fd239, fd157, 0dBFE73B5AE5DB4E10, fd235; +fma.rn.f64 fd240, fd160, 0d3FE601A24BA81342, fd236; +fma.rn.f64 fd241, fd1108, 0dBFE73B5AE5DB4E10, fd237; +fma.rn.f64 fd242, fd159, 0d3FE601A24BA81342, fd238; +fma.rn.f64 fd243, fd161, 0dBFEB6B5FBD9F7255, fd239; +fma.rn.f64 fd244, fd164, 0d3FE07F6ACD7CDCE2, fd240; +fma.rn.f64 fd245, fd1106, 0dBFEB6B5FBD9F7255, fd241; +fma.rn.f64 fd246, fd163, 0d3FE07F6ACD7CDCE2, fd242; +fma.rn.f64 fd247, fd165, 0dBFEE532CBE45C954, fd243; +fma.rn.f64 fd248, fd168, 0d3FD46F6FAF5FCB72, fd244; +fma.rn.f64 fd249, fd1103, 0dBFEE532CBE45C954, fd245; +fma.rn.f64 fd250, fd167, 0d3FD46F6FAF5FCB72, fd246; +fma.rn.f64 fd251, fd169, 
0dBFEFCFFA67B61650, fd247; +fma.rn.f64 fd252, fd172, 0d3FBBADB02034D9FF, fd248; +fma.rn.f64 fd253, fd1101, 0dBFEFCFFA67B61650, fd249; +fma.rn.f64 fd254, fd171, 0d3FBBADB02034D9FF, fd250; +fma.rn.f64 fd255, fd117, 0d3FED0ADB9B447CCF, %58; +fma.rn.f64 fd259, fd121, 0d3FE4B76371208A62, fd255; +fma.rn.f64 fd1096, fd120, 0d3FDADF7689C97B70, 0d0000000000000000; +fma.rn.f64 fd260, fd124, 0d3FE863A1ADA0CFA6, fd1096; +fma.rn.f64 fd1095, fd1131, 0d3FED0ADB9B447CCF, %59; +fma.rn.f64 fd261, fd1129, 0d3FE4B76371208A62, fd1095; +fma.rn.f64 fd1094, fd119, 0d3FDADF7689C97B70, 0d0000000000000000; +fma.rn.f64 fd262, fd123, 0d3FE863A1ADA0CFA6, fd1094; +fma.rn.f64 fd263, fd125, 0d3FD11F2F2E2F1E3B, fd259; +fma.rn.f64 fd264, fd128, 0d3FEED566CB3DCBA1, fd260; +fma.rn.f64 fd265, fd1127, 0d3FD11F2F2E2F1E3B, fd261; +fma.rn.f64 fd266, fd127, 0d3FEED566CB3DCBA1, fd262; +fma.rn.f64 fd267, fd129, 0dBFC4B545C0234A71, fd263; +fma.rn.f64 fd268, fd132, 0d3FEF941537248537, fd264; +fma.rn.f64 fd269, fd1124, 0dBFC4B545C0234A71, fd265; +fma.rn.f64 fd270, fd131, 0d3FEF941537248537, fd266; +fma.rn.f64 fd271, fd133, 0dBFE1F53E93956DBF, fd267; +fma.rn.f64 fd272, fd136, 0d3FEA7C6DA34AF89F, fd268; +fma.rn.f64 fd273, fd1122, 0dBFE1F53E93956DBF, fd269; +fma.rn.f64 fd274, fd135, 0d3FEA7C6DA34AF89F, fd270; +fma.rn.f64 fd275, fd137, 0dBFEB6B5FBD9F7255, fd271; +fma.rn.f64 fd276, fd140, 0d3FE07F6ACD7CDCE2, fd272; +fma.rn.f64 fd277, fd1120, 0dBFEB6B5FBD9F7255, fd273; +fma.rn.f64 fd278, fd139, 0d3FE07F6ACD7CDCE2, fd274; +fma.rn.f64 fd279, fd141, 0dBFEFCFFA67B61650, fd275; +fma.rn.f64 fd280, fd144, 0d3FBBADB02034D9FF, fd276; +fma.rn.f64 fd281, fd1117, 0dBFEFCFFA67B61650, fd277; +fma.rn.f64 fd282, fd143, 0d3FBBADB02034D9FF, fd278; +fma.rn.f64 fd283, fd145, 0dBFEE532CBE45C954, fd279; +fma.rn.f64 fd284, fd148, 0dBFD46F6FAF5FCB72, fd280; +fma.rn.f64 fd285, fd1115, 0dBFEE532CBE45C954, fd281; +fma.rn.f64 fd286, fd147, 0dBFD46F6FAF5FCB72, fd282; +fma.rn.f64 fd287, fd149, 0dBFE73B5AE5DB4E10, fd283; +fma.rn.f64 fd288, fd152, 
0dBFE601A24BA81342, fd284; +fma.rn.f64 fd289, fd1113, 0dBFE73B5AE5DB4E10, fd285; +fma.rn.f64 fd290, fd151, 0dBFE601A24BA81342, fd286; +fma.rn.f64 fd291, fd153, 0dBFD7B057F20BF2E4, fd287; +fma.rn.f64 fd292, fd156, 0dBFEDBA2D62CB789F, fd288; +fma.rn.f64 fd293, fd1110, 0dBFD7B057F20BF2E4, fd289; +fma.rn.f64 fd294, fd155, 0dBFEDBA2D62CB789F, fd290; +fma.rn.f64 fd295, fd157, 0d3FABB81853A18977, fd291; +fma.rn.f64 fd296, fd160, 0dBFEFF3FC588E859D, fd292; +fma.rn.f64 fd297, fd1108, 0d3FABB81853A18977, fd293; +fma.rn.f64 fd298, fd159, 0dBFEFF3FC588E859D, fd294; +fma.rn.f64 fd299, fd161, 0d3FDDFA67657E7608, fd295; +fma.rn.f64 fd300, fd164, 0dBFEC45BB0D10918C, fd296; +fma.rn.f64 fd301, fd1106, 0d3FDDFA67657E7608, fd297; +fma.rn.f64 fd302, fd163, 0dBFEC45BB0D10918C, fd298; +fma.rn.f64 fd303, fd165, 0d3FE979982A38E65A, fd299; +fma.rn.f64 fd304, fd168, 0dBFE35D9650D47852, fd300; +fma.rn.f64 fd305, fd1103, 0d3FE979982A38E65A, fd301; +fma.rn.f64 fd306, fd167, 0dBFE35D9650D47852, fd302; +fma.rn.f64 fd307, fd169, 0d3FEF4079C06C0992, fd303; +fma.rn.f64 fd308, fd172, 0dBFCB8426C12812BC, fd304; +fma.rn.f64 fd309, fd1101, 0d3FEF4079C06C0992, fd305; +fma.rn.f64 fd310, fd171, 0dBFCB8426C12812BC, fd306; +fma.rn.f64 fd311, fd117, 0d3FE979982A38E65A, %58; +fma.rn.f64 fd315, fd121, 0d3FD11F2F2E2F1E3B, fd311; +fma.rn.f64 fd1093, fd120, 0d3FE35D9650D47852, 0d0000000000000000; +fma.rn.f64 fd316, fd124, 0d3FEED566CB3DCBA1, fd1093; +fma.rn.f64 fd1092, fd1131, 0d3FE979982A38E65A, %59; +fma.rn.f64 fd317, fd1129, 0d3FD11F2F2E2F1E3B, fd1092; +fma.rn.f64 fd1091, fd119, 0d3FE35D9650D47852, 0d0000000000000000; +fma.rn.f64 fd318, fd123, 0d3FEED566CB3DCBA1, fd1091; +fma.rn.f64 fd319, fd125, 0dBFD7B057F20BF2E4, fd315; +fma.rn.f64 fd320, fd128, 0d3FEDBA2D62CB789F, fd316; +fma.rn.f64 fd321, fd1127, 0dBFD7B057F20BF2E4, fd317; +fma.rn.f64 fd322, fd127, 0d3FEDBA2D62CB789F, fd318; +fma.rn.f64 fd323, fd129, 0dBFEB6B5FBD9F7255, fd319; +fma.rn.f64 fd324, fd132, 0d3FE07F6ACD7CDCE2, fd320; +fma.rn.f64 fd325, fd1124, 
0dBFEB6B5FBD9F7255, fd321; +fma.rn.f64 fd326, fd131, 0d3FE07F6ACD7CDCE2, fd322; +fma.rn.f64 fd327, fd133, 0dBFEFCFFA67B61650, fd323; +fma.rn.f64 fd328, fd136, 0dBFBBADB02034D9FF, fd324; +fma.rn.f64 fd329, fd1122, 0dBFEFCFFA67B61650, fd325; +fma.rn.f64 fd330, fd135, 0dBFBBADB02034D9FF, fd326; +fma.rn.f64 fd331, fd137, 0dBFE73B5AE5DB4E10, fd327; +fma.rn.f64 fd332, fd140, 0dBFE601A24BA81342, fd328; +fma.rn.f64 fd333, fd1120, 0dBFE73B5AE5DB4E10, fd329; +fma.rn.f64 fd334, fd139, 0dBFE601A24BA81342, fd330; +fma.rn.f64 fd335, fd141, 0dBFC4B545C0234A71, fd331; +fma.rn.f64 fd336, fd144, 0dBFEF941537248537, fd332; +fma.rn.f64 fd337, fd1117, 0dBFC4B545C0234A71, fd333; +fma.rn.f64 fd338, fd143, 0dBFEF941537248537, fd334; +fma.rn.f64 fd339, fd145, 0d3FDDFA67657E7608, fd335; +fma.rn.f64 fd340, fd148, 0dBFEC45BB0D10918C, fd336; +fma.rn.f64 fd341, fd1115, 0d3FDDFA67657E7608, fd337; +fma.rn.f64 fd342, fd147, 0dBFEC45BB0D10918C, fd338; +fma.rn.f64 fd343, fd149, 0d3FED0ADB9B447CCF, fd339; +fma.rn.f64 fd344, fd152, 0dBFDADF7689C97B70, fd340; +fma.rn.f64 fd345, fd1113, 0d3FED0ADB9B447CCF, fd341; +fma.rn.f64 fd346, fd151, 0dBFDADF7689C97B70, fd342; +fma.rn.f64 fd347, fd153, 0d3FEF4079C06C0992, fd343; +fma.rn.f64 fd348, fd156, 0d3FCB8426C12812BC, fd344; +fma.rn.f64 fd349, fd1110, 0d3FEF4079C06C0992, fd345; +fma.rn.f64 fd350, fd155, 0d3FCB8426C12812BC, fd346; +fma.rn.f64 fd351, fd157, 0d3FE4B76371208A62, fd347; +fma.rn.f64 fd352, fd160, 0d3FE863A1ADA0CFA6, fd348; +fma.rn.f64 fd353, fd1108, 0d3FE4B76371208A62, fd349; +fma.rn.f64 fd354, fd159, 0d3FE863A1ADA0CFA6, fd350; +fma.rn.f64 fd355, fd161, 0d3FABB81853A18977, fd351; +fma.rn.f64 fd356, fd164, 0d3FEFF3FC588E859D, fd352; +fma.rn.f64 fd357, fd1106, 0d3FABB81853A18977, fd353; +fma.rn.f64 fd358, fd163, 0d3FEFF3FC588E859D, fd354; +fma.rn.f64 fd359, fd165, 0dBFE1F53E93956DBF, fd355; +fma.rn.f64 fd360, fd168, 0d3FEA7C6DA34AF89F, fd356; +fma.rn.f64 fd361, fd1103, 0dBFE1F53E93956DBF, fd357; +fma.rn.f64 fd362, fd167, 0d3FEA7C6DA34AF89F, fd358; 
+fma.rn.f64 fd363, fd169, 0dBFEE532CBE45C954, fd359; +fma.rn.f64 fd364, fd172, 0d3FD46F6FAF5FCB72, fd360; +fma.rn.f64 fd365, fd1101, 0dBFEE532CBE45C954, fd361; +fma.rn.f64 fd366, fd171, 0d3FD46F6FAF5FCB72, fd362; +fma.rn.f64 fd367, fd117, 0d3FE4B76371208A62, %58; +fma.rn.f64 fd371, fd121, 0dBFC4B545C0234A71, fd367; +fma.rn.f64 fd1090, fd120, 0d3FE863A1ADA0CFA6, 0d0000000000000000; +fma.rn.f64 fd372, fd124, 0d3FEF941537248537, fd1090; +fma.rn.f64 fd1089, fd1131, 0d3FE4B76371208A62, %59; +fma.rn.f64 fd373, fd1129, 0dBFC4B545C0234A71, fd1089; +fma.rn.f64 fd1088, fd119, 0d3FE863A1ADA0CFA6, 0d0000000000000000; +fma.rn.f64 fd374, fd123, 0d3FEF941537248537, fd1088; +fma.rn.f64 fd375, fd125, 0dBFEB6B5FBD9F7255, fd371; +fma.rn.f64 fd376, fd128, 0d3FE07F6ACD7CDCE2, fd372; +fma.rn.f64 fd377, fd1127, 0dBFEB6B5FBD9F7255, fd373; +fma.rn.f64 fd378, fd127, 0d3FE07F6ACD7CDCE2, fd374; +fma.rn.f64 fd379, fd129, 0dBFEE532CBE45C954, fd375; +fma.rn.f64 fd380, fd132, 0dBFD46F6FAF5FCB72, fd376; +fma.rn.f64 fd381, fd1124, 0dBFEE532CBE45C954, fd377; +fma.rn.f64 fd382, fd131, 0dBFD46F6FAF5FCB72, fd378; +fma.rn.f64 fd383, fd133, 0dBFD7B057F20BF2E4, fd379; +fma.rn.f64 fd384, fd136, 0dBFEDBA2D62CB789F, fd380; +fma.rn.f64 fd385, fd1122, 0dBFD7B057F20BF2E4, fd381; +fma.rn.f64 fd386, fd135, 0dBFEDBA2D62CB789F, fd382; +fma.rn.f64 fd387, fd137, 0d3FDDFA67657E7608, fd383; +fma.rn.f64 fd388, fd140, 0dBFEC45BB0D10918C, fd384; +fma.rn.f64 fd389, fd1120, 0d3FDDFA67657E7608, fd385; +fma.rn.f64 fd390, fd139, 0dBFEC45BB0D10918C, fd386; +fma.rn.f64 fd391, fd141, 0d3FEF4079C06C0992, fd387; +fma.rn.f64 fd392, fd144, 0dBFCB8426C12812BC, fd388; +fma.rn.f64 fd393, fd1117, 0d3FEF4079C06C0992, fd389; +fma.rn.f64 fd394, fd143, 0dBFCB8426C12812BC, fd390; +fma.rn.f64 fd395, fd145, 0d3FE979982A38E65A, fd391; +fma.rn.f64 fd396, fd148, 0d3FE35D9650D47852, fd392; +fma.rn.f64 fd397, fd1115, 0d3FE979982A38E65A, fd393; +fma.rn.f64 fd398, fd147, 0d3FE35D9650D47852, fd394; +fma.rn.f64 fd399, fd149, 0d3FABB81853A18977, fd395; 
+fma.rn.f64 fd400, fd152, 0d3FEFF3FC588E859D, fd396; +fma.rn.f64 fd401, fd1113, 0d3FABB81853A18977, fd397; +fma.rn.f64 fd402, fd151, 0d3FEFF3FC588E859D, fd398; +fma.rn.f64 fd403, fd153, 0dBFE73B5AE5DB4E10, fd399; +fma.rn.f64 fd404, fd156, 0d3FE601A24BA81342, fd400; +fma.rn.f64 fd405, fd1110, 0dBFE73B5AE5DB4E10, fd401; +fma.rn.f64 fd406, fd155, 0d3FE601A24BA81342, fd402; +fma.rn.f64 fd407, fd157, 0dBFEFCFFA67B61650, fd403; +fma.rn.f64 fd408, fd160, 0dBFBBADB02034D9FF, fd404; +fma.rn.f64 fd409, fd1108, 0dBFEFCFFA67B61650, fd405; +fma.rn.f64 fd410, fd159, 0dBFBBADB02034D9FF, fd406; +fma.rn.f64 fd411, fd161, 0dBFE1F53E93956DBF, fd407; +fma.rn.f64 fd412, fd164, 0dBFEA7C6DA34AF89F, fd408; +fma.rn.f64 fd413, fd1106, 0dBFE1F53E93956DBF, fd409; +fma.rn.f64 fd414, fd163, 0dBFEA7C6DA34AF89F, fd410; +fma.rn.f64 fd415, fd165, 0d3FD11F2F2E2F1E3B, fd411; +fma.rn.f64 fd416, fd168, 0dBFEED566CB3DCBA1, fd412; +fma.rn.f64 fd417, fd1103, 0d3FD11F2F2E2F1E3B, fd413; +fma.rn.f64 fd418, fd167, 0dBFEED566CB3DCBA1, fd414; +fma.rn.f64 fd419, fd169, 0d3FED0ADB9B447CCF, fd415; +fma.rn.f64 fd420, fd172, 0dBFDADF7689C97B70, fd416; +fma.rn.f64 fd421, fd1101, 0d3FED0ADB9B447CCF, fd417; +fma.rn.f64 fd422, fd171, 0dBFDADF7689C97B70, fd418; +fma.rn.f64 fd423, fd117, 0d3FDDFA67657E7608, %58; +fma.rn.f64 fd427, fd121, 0dBFE1F53E93956DBF, fd423; +fma.rn.f64 fd1087, fd120, 0d3FEC45BB0D10918C, 0d0000000000000000; +fma.rn.f64 fd428, fd124, 0d3FEA7C6DA34AF89F, fd1087; +fma.rn.f64 fd1086, fd1131, 0d3FDDFA67657E7608, %59; +fma.rn.f64 fd429, fd1129, 0dBFE1F53E93956DBF, fd1086; +fma.rn.f64 fd1085, fd119, 0d3FEC45BB0D10918C, 0d0000000000000000; +fma.rn.f64 fd430, fd123, 0d3FEA7C6DA34AF89F, fd1085; +fma.rn.f64 fd431, fd125, 0dBFEFCFFA67B61650, fd427; +fma.rn.f64 fd432, fd128, 0dBFBBADB02034D9FF, fd428; +fma.rn.f64 fd433, fd1127, 0dBFEFCFFA67B61650, fd429; +fma.rn.f64 fd434, fd127, 0dBFBBADB02034D9FF, fd430; +fma.rn.f64 fd435, fd129, 0dBFD7B057F20BF2E4, fd431; +fma.rn.f64 fd436, fd132, 0dBFEDBA2D62CB789F, fd432; 
+fma.rn.f64 fd437, fd1124, 0dBFD7B057F20BF2E4, fd433; +fma.rn.f64 fd438, fd131, 0dBFEDBA2D62CB789F, fd434; +fma.rn.f64 fd439, fd133, 0d3FE4B76371208A62, fd435; +fma.rn.f64 fd440, fd136, 0dBFE863A1ADA0CFA6, fd436; +fma.rn.f64 fd441, fd1122, 0d3FE4B76371208A62, fd437; +fma.rn.f64 fd442, fd135, 0dBFE863A1ADA0CFA6, fd438; +fma.rn.f64 fd443, fd137, 0d3FEF4079C06C0992, fd439; +fma.rn.f64 fd444, fd140, 0d3FCB8426C12812BC, fd440; +fma.rn.f64 fd445, fd1120, 0d3FEF4079C06C0992, fd441; +fma.rn.f64 fd446, fd139, 0d3FCB8426C12812BC, fd442; +fma.rn.f64 fd447, fd141, 0d3FD11F2F2E2F1E3B, fd443; +fma.rn.f64 fd448, fd144, 0d3FEED566CB3DCBA1, fd444; +fma.rn.f64 fd449, fd1117, 0d3FD11F2F2E2F1E3B, fd445; +fma.rn.f64 fd450, fd143, 0d3FEED566CB3DCBA1, fd446; +fma.rn.f64 fd451, fd145, 0dBFE73B5AE5DB4E10, fd447; +fma.rn.f64 fd452, fd148, 0d3FE601A24BA81342, fd448; +fma.rn.f64 fd453, fd1115, 0dBFE73B5AE5DB4E10, fd449; +fma.rn.f64 fd454, fd147, 0d3FE601A24BA81342, fd450; +fma.rn.f64 fd455, fd149, 0dBFEE532CBE45C954, fd451; +fma.rn.f64 fd456, fd152, 0dBFD46F6FAF5FCB72, fd452; +fma.rn.f64 fd457, fd1113, 0dBFEE532CBE45C954, fd453; +fma.rn.f64 fd458, fd151, 0dBFD46F6FAF5FCB72, fd454; +fma.rn.f64 fd459, fd153, 0dBFC4B545C0234A71, fd455; +fma.rn.f64 fd460, fd156, 0dBFEF941537248537, fd456; +fma.rn.f64 fd461, fd1110, 0dBFC4B545C0234A71, fd457; +fma.rn.f64 fd462, fd155, 0dBFEF941537248537, fd458; +fma.rn.f64 fd463, fd157, 0d3FE979982A38E65A, fd459; +fma.rn.f64 fd464, fd160, 0dBFE35D9650D47852, fd460; +fma.rn.f64 fd465, fd1108, 0d3FE979982A38E65A, fd461; +fma.rn.f64 fd466, fd159, 0dBFE35D9650D47852, fd462; +fma.rn.f64 fd467, fd161, 0d3FED0ADB9B447CCF, fd463; +fma.rn.f64 fd468, fd164, 0d3FDADF7689C97B70, fd464; +fma.rn.f64 fd469, fd1106, 0d3FED0ADB9B447CCF, fd465; +fma.rn.f64 fd470, fd163, 0d3FDADF7689C97B70, fd466; +fma.rn.f64 fd471, fd165, 0d3FABB81853A18977, fd467; +fma.rn.f64 fd472, fd168, 0d3FEFF3FC588E859D, fd468; +fma.rn.f64 fd473, fd1103, 0d3FABB81853A18977, fd469; +fma.rn.f64 fd474, fd167, 
0d3FEFF3FC588E859D, fd470; +fma.rn.f64 fd475, fd169, 0dBFEB6B5FBD9F7255, fd471; +fma.rn.f64 fd476, fd172, 0d3FE07F6ACD7CDCE2, fd472; +fma.rn.f64 fd477, fd1101, 0dBFEB6B5FBD9F7255, fd473; +fma.rn.f64 fd478, fd171, 0d3FE07F6ACD7CDCE2, fd474; +fma.rn.f64 fd479, fd117, 0d3FD11F2F2E2F1E3B, %58; +fma.rn.f64 fd483, fd121, 0dBFEB6B5FBD9F7255, fd479; +fma.rn.f64 fd1084, fd120, 0d3FEED566CB3DCBA1, 0d0000000000000000; +fma.rn.f64 fd484, fd124, 0d3FE07F6ACD7CDCE2, fd1084; +fma.rn.f64 fd1083, fd1131, 0d3FD11F2F2E2F1E3B, %59; +fma.rn.f64 fd485, fd1129, 0dBFEB6B5FBD9F7255, fd1083; +fma.rn.f64 fd1082, fd119, 0d3FEED566CB3DCBA1, 0d0000000000000000; +fma.rn.f64 fd486, fd123, 0d3FE07F6ACD7CDCE2, fd1082; +fma.rn.f64 fd487, fd125, 0dBFE73B5AE5DB4E10, fd483; +fma.rn.f64 fd488, fd128, 0dBFE601A24BA81342, fd484; +fma.rn.f64 fd489, fd1127, 0dBFE73B5AE5DB4E10, fd485; +fma.rn.f64 fd490, fd127, 0dBFE601A24BA81342, fd486; +fma.rn.f64 fd491, fd129, 0d3FDDFA67657E7608, fd487; +fma.rn.f64 fd492, fd132, 0dBFEC45BB0D10918C, fd488; +fma.rn.f64 fd493, fd1124, 0d3FDDFA67657E7608, fd489; +fma.rn.f64 fd494, fd131, 0dBFEC45BB0D10918C, fd490; +fma.rn.f64 fd495, fd133, 0d3FEF4079C06C0992, fd491; +fma.rn.f64 fd496, fd136, 0d3FCB8426C12812BC, fd492; +fma.rn.f64 fd497, fd1122, 0d3FEF4079C06C0992, fd493; +fma.rn.f64 fd498, fd135, 0d3FCB8426C12812BC, fd494; +fma.rn.f64 fd499, fd137, 0d3FABB81853A18977, fd495; +fma.rn.f64 fd500, fd140, 0d3FEFF3FC588E859D, fd496; +fma.rn.f64 fd501, fd1120, 0d3FABB81853A18977, fd497; +fma.rn.f64 fd502, fd139, 0d3FEFF3FC588E859D, fd498; +fma.rn.f64 fd503, fd141, 0dBFEE532CBE45C954, fd499; +fma.rn.f64 fd504, fd144, 0d3FD46F6FAF5FCB72, fd500; +fma.rn.f64 fd505, fd1117, 0dBFEE532CBE45C954, fd501; +fma.rn.f64 fd506, fd143, 0d3FD46F6FAF5FCB72, fd502; +fma.rn.f64 fd507, fd145, 0dBFE1F53E93956DBF, fd503; +fma.rn.f64 fd508, fd148, 0dBFEA7C6DA34AF89F, fd504; +fma.rn.f64 fd509, fd1115, 0dBFE1F53E93956DBF, fd505; +fma.rn.f64 fd510, fd147, 0dBFEA7C6DA34AF89F, fd506; +fma.rn.f64 fd511, fd149, 
0d3FE4B76371208A62, fd507; +fma.rn.f64 fd512, fd152, 0dBFE863A1ADA0CFA6, fd508; +fma.rn.f64 fd513, fd1113, 0d3FE4B76371208A62, fd509; +fma.rn.f64 fd514, fd151, 0dBFE863A1ADA0CFA6, fd510; +fma.rn.f64 fd515, fd153, 0d3FED0ADB9B447CCF, fd511; +fma.rn.f64 fd516, fd156, 0d3FDADF7689C97B70, fd512; +fma.rn.f64 fd517, fd1110, 0d3FED0ADB9B447CCF, fd513; +fma.rn.f64 fd518, fd155, 0d3FDADF7689C97B70, fd514; +fma.rn.f64 fd519, fd157, 0dBFC4B545C0234A71, fd515; +fma.rn.f64 fd520, fd160, 0d3FEF941537248537, fd516; +fma.rn.f64 fd521, fd1108, 0dBFC4B545C0234A71, fd517; +fma.rn.f64 fd522, fd159, 0d3FEF941537248537, fd518; +fma.rn.f64 fd523, fd161, 0dBFEFCFFA67B61650, fd519; +fma.rn.f64 fd524, fd164, 0d3FBBADB02034D9FF, fd520; +fma.rn.f64 fd525, fd1106, 0dBFEFCFFA67B61650, fd521; +fma.rn.f64 fd526, fd163, 0d3FBBADB02034D9FF, fd522; +fma.rn.f64 fd527, fd165, 0dBFD7B057F20BF2E4, fd523; +fma.rn.f64 fd528, fd168, 0dBFEDBA2D62CB789F, fd524; +fma.rn.f64 fd529, fd1103, 0dBFD7B057F20BF2E4, fd525; +fma.rn.f64 fd530, fd167, 0dBFEDBA2D62CB789F, fd526; +fma.rn.f64 fd531, fd169, 0d3FE979982A38E65A, fd527; +fma.rn.f64 fd532, fd172, 0dBFE35D9650D47852, fd528; +fma.rn.f64 fd533, fd1101, 0d3FE979982A38E65A, fd529; +fma.rn.f64 fd534, fd171, 0dBFE35D9650D47852, fd530; +fma.rn.f64 fd535, fd117, 0d3FABB81853A18977, %58; +fma.rn.f64 fd539, fd121, 0dBFEFCFFA67B61650, fd535; +fma.rn.f64 fd1081, fd120, 0d3FEFF3FC588E859D, 0d0000000000000000; +fma.rn.f64 fd540, fd124, 0d3FBBADB02034D9FF, fd1081; +fma.rn.f64 fd1080, fd1131, 0d3FABB81853A18977, %59; +fma.rn.f64 fd541, fd1129, 0dBFEFCFFA67B61650, fd1080; +fma.rn.f64 fd1079, fd119, 0d3FEFF3FC588E859D, 0d0000000000000000; +fma.rn.f64 fd542, fd123, 0d3FBBADB02034D9FF, fd1079; +fma.rn.f64 fd543, fd125, 0dBFC4B545C0234A71, fd539; +fma.rn.f64 fd544, fd128, 0dBFEF941537248537, fd540; +fma.rn.f64 fd545, fd1127, 0dBFC4B545C0234A71, fd541; +fma.rn.f64 fd546, fd127, 0dBFEF941537248537, fd542; +fma.rn.f64 fd547, fd129, 0d3FEF4079C06C0992, fd543; +fma.rn.f64 fd548, fd132, 
0dBFCB8426C12812BC, fd544; +fma.rn.f64 fd549, fd1124, 0d3FEF4079C06C0992, fd545; +fma.rn.f64 fd550, fd131, 0dBFCB8426C12812BC, fd546; +fma.rn.f64 fd551, fd133, 0d3FD11F2F2E2F1E3B, fd547; +fma.rn.f64 fd552, fd136, 0d3FEED566CB3DCBA1, fd548; +fma.rn.f64 fd553, fd1122, 0d3FD11F2F2E2F1E3B, fd549; +fma.rn.f64 fd554, fd135, 0d3FEED566CB3DCBA1, fd550; +fma.rn.f64 fd555, fd137, 0dBFEE532CBE45C954, fd551; +fma.rn.f64 fd556, fd140, 0d3FD46F6FAF5FCB72, fd552; +fma.rn.f64 fd557, fd1120, 0dBFEE532CBE45C954, fd553; +fma.rn.f64 fd558, fd139, 0d3FD46F6FAF5FCB72, fd554; +fma.rn.f64 fd559, fd141, 0dBFD7B057F20BF2E4, fd555; +fma.rn.f64 fd560, fd144, 0dBFEDBA2D62CB789F, fd556; +fma.rn.f64 fd561, fd1117, 0dBFD7B057F20BF2E4, fd557; +fma.rn.f64 fd562, fd143, 0dBFEDBA2D62CB789F, fd558; +fma.rn.f64 fd563, fd145, 0d3FED0ADB9B447CCF, fd559; +fma.rn.f64 fd564, fd148, 0dBFDADF7689C97B70, fd560; +fma.rn.f64 fd565, fd1115, 0d3FED0ADB9B447CCF, fd561; +fma.rn.f64 fd566, fd147, 0dBFDADF7689C97B70, fd562; +fma.rn.f64 fd567, fd149, 0d3FDDFA67657E7608, fd563; +fma.rn.f64 fd568, fd152, 0d3FEC45BB0D10918C, fd564; +fma.rn.f64 fd569, fd1113, 0d3FDDFA67657E7608, fd565; +fma.rn.f64 fd570, fd151, 0d3FEC45BB0D10918C, fd566; +fma.rn.f64 fd571, fd153, 0dBFEB6B5FBD9F7255, fd567; +fma.rn.f64 fd572, fd156, 0d3FE07F6ACD7CDCE2, fd568; +fma.rn.f64 fd573, fd1110, 0dBFEB6B5FBD9F7255, fd569; +fma.rn.f64 fd574, fd155, 0d3FE07F6ACD7CDCE2, fd570; +fma.rn.f64 fd575, fd157, 0dBFE1F53E93956DBF, fd571; +fma.rn.f64 fd576, fd160, 0dBFEA7C6DA34AF89F, fd572; +fma.rn.f64 fd577, fd1108, 0dBFE1F53E93956DBF, fd573; +fma.rn.f64 fd578, fd159, 0dBFEA7C6DA34AF89F, fd574; +fma.rn.f64 fd579, fd161, 0d3FE979982A38E65A, fd575; +fma.rn.f64 fd580, fd164, 0dBFE35D9650D47852, fd576; +fma.rn.f64 fd581, fd1106, 0d3FE979982A38E65A, fd577; +fma.rn.f64 fd582, fd163, 0dBFE35D9650D47852, fd578; +fma.rn.f64 fd583, fd165, 0d3FE4B76371208A62, fd579; +fma.rn.f64 fd584, fd168, 0d3FE863A1ADA0CFA6, fd580; +fma.rn.f64 fd585, fd1103, 0d3FE4B76371208A62, fd581; 
+fma.rn.f64 fd586, fd167, 0d3FE863A1ADA0CFA6, fd582; +fma.rn.f64 fd587, fd169, 0dBFE73B5AE5DB4E10, fd583; +fma.rn.f64 fd588, fd172, 0d3FE601A24BA81342, fd584; +fma.rn.f64 fd589, fd1101, 0dBFE73B5AE5DB4E10, fd585; +fma.rn.f64 fd590, fd171, 0d3FE601A24BA81342, fd586; +fma.rn.f64 fd591, fd117, 0dBFC4B545C0234A71, %58; +fma.rn.f64 fd595, fd121, 0dBFEE532CBE45C954, fd591; +fma.rn.f64 fd1078, fd120, 0d3FEF941537248537, 0d0000000000000000; +fma.rn.f64 fd596, fd124, 0dBFD46F6FAF5FCB72, fd1078; +fma.rn.f64 fd1077, fd1131, 0dBFC4B545C0234A71, %59; +fma.rn.f64 fd597, fd1129, 0dBFEE532CBE45C954, fd1077; +fma.rn.f64 fd1076, fd119, 0d3FEF941537248537, 0d0000000000000000; +fma.rn.f64 fd598, fd123, 0dBFD46F6FAF5FCB72, fd1076; +fma.rn.f64 fd599, fd125, 0d3FDDFA67657E7608, fd595; +fma.rn.f64 fd600, fd128, 0dBFEC45BB0D10918C, fd596; +fma.rn.f64 fd601, fd1127, 0d3FDDFA67657E7608, fd597; +fma.rn.f64 fd602, fd127, 0dBFEC45BB0D10918C, fd598; +fma.rn.f64 fd603, fd129, 0d3FE979982A38E65A, fd599; +fma.rn.f64 fd604, fd132, 0d3FE35D9650D47852, fd600; +fma.rn.f64 fd605, fd1124, 0d3FE979982A38E65A, fd601; +fma.rn.f64 fd606, fd131, 0d3FE35D9650D47852, fd602; +fma.rn.f64 fd607, fd133, 0dBFE73B5AE5DB4E10, fd603; +fma.rn.f64 fd608, fd136, 0d3FE601A24BA81342, fd604; +fma.rn.f64 fd609, fd1122, 0dBFE73B5AE5DB4E10, fd605; +fma.rn.f64 fd610, fd135, 0d3FE601A24BA81342, fd606; +fma.rn.f64 fd611, fd137, 0dBFE1F53E93956DBF, fd607; +fma.rn.f64 fd612, fd140, 0dBFEA7C6DA34AF89F, fd608; +fma.rn.f64 fd613, fd1120, 0dBFE1F53E93956DBF, fd609; +fma.rn.f64 fd614, fd139, 0dBFEA7C6DA34AF89F, fd610; +fma.rn.f64 fd615, fd141, 0d3FED0ADB9B447CCF, fd611; +fma.rn.f64 fd616, fd144, 0dBFDADF7689C97B70, fd612; +fma.rn.f64 fd617, fd1117, 0d3FED0ADB9B447CCF, fd613; +fma.rn.f64 fd618, fd143, 0dBFDADF7689C97B70, fd614; +fma.rn.f64 fd619, fd145, 0d3FD11F2F2E2F1E3B, fd615; +fma.rn.f64 fd620, fd148, 0d3FEED566CB3DCBA1, fd616; +fma.rn.f64 fd621, fd1115, 0d3FD11F2F2E2F1E3B, fd617; +fma.rn.f64 fd622, fd147, 0d3FEED566CB3DCBA1, fd618; 
+fma.rn.f64 fd623, fd149, 0dBFEFCFFA67B61650, fd619; +fma.rn.f64 fd624, fd152, 0d3FBBADB02034D9FF, fd620; +fma.rn.f64 fd625, fd1113, 0dBFEFCFFA67B61650, fd621; +fma.rn.f64 fd626, fd151, 0d3FBBADB02034D9FF, fd622; +fma.rn.f64 fd627, fd153, 0d3FABB81853A18977, fd623; +fma.rn.f64 fd628, fd156, 0dBFEFF3FC588E859D, fd624; +fma.rn.f64 fd629, fd1110, 0d3FABB81853A18977, fd625; +fma.rn.f64 fd630, fd155, 0dBFEFF3FC588E859D, fd626; +fma.rn.f64 fd631, fd157, 0d3FEF4079C06C0992, fd627; +fma.rn.f64 fd632, fd160, 0d3FCB8426C12812BC, fd628; +fma.rn.f64 fd633, fd1108, 0d3FEF4079C06C0992, fd629; +fma.rn.f64 fd634, fd159, 0d3FCB8426C12812BC, fd630; +fma.rn.f64 fd635, fd161, 0dBFD7B057F20BF2E4, fd631; +fma.rn.f64 fd636, fd164, 0d3FEDBA2D62CB789F, fd632; +fma.rn.f64 fd637, fd1106, 0dBFD7B057F20BF2E4, fd633; +fma.rn.f64 fd638, fd163, 0d3FEDBA2D62CB789F, fd634; +fma.rn.f64 fd639, fd165, 0dBFEB6B5FBD9F7255, fd635; +fma.rn.f64 fd640, fd168, 0dBFE07F6ACD7CDCE2, fd636; +fma.rn.f64 fd641, fd1103, 0dBFEB6B5FBD9F7255, fd637; +fma.rn.f64 fd642, fd167, 0dBFE07F6ACD7CDCE2, fd638; +fma.rn.f64 fd643, fd169, 0d3FE4B76371208A62, fd639; +fma.rn.f64 fd644, fd172, 0dBFE863A1ADA0CFA6, fd640; +fma.rn.f64 fd645, fd1101, 0d3FE4B76371208A62, fd641; +fma.rn.f64 fd646, fd171, 0dBFE863A1ADA0CFA6, fd642; +fma.rn.f64 fd647, fd117, 0dBFD7B057F20BF2E4, %58; +fma.rn.f64 fd651, fd121, 0dBFE73B5AE5DB4E10, fd647; +fma.rn.f64 fd1075, fd120, 0d3FEDBA2D62CB789F, 0d0000000000000000; +fma.rn.f64 fd652, fd124, 0dBFE601A24BA81342, fd1075; +fma.rn.f64 fd1074, fd1131, 0dBFD7B057F20BF2E4, %59; +fma.rn.f64 fd653, fd1129, 0dBFE73B5AE5DB4E10, fd1074; +fma.rn.f64 fd1073, fd119, 0d3FEDBA2D62CB789F, 0d0000000000000000; +fma.rn.f64 fd654, fd123, 0dBFE601A24BA81342, fd1073; +fma.rn.f64 fd655, fd125, 0d3FED0ADB9B447CCF, fd651; +fma.rn.f64 fd656, fd128, 0dBFDADF7689C97B70, fd652; +fma.rn.f64 fd657, fd1127, 0d3FED0ADB9B447CCF, fd653; +fma.rn.f64 fd658, fd127, 0dBFDADF7689C97B70, fd654; +fma.rn.f64 fd659, fd129, 0d3FABB81853A18977, fd655; 
+fma.rn.f64 fd660, fd132, 0d3FEFF3FC588E859D, fd656; +fma.rn.f64 fd661, fd1124, 0d3FABB81853A18977, fd657; +fma.rn.f64 fd662, fd131, 0d3FEFF3FC588E859D, fd658; +fma.rn.f64 fd663, fd133, 0dBFEE532CBE45C954, fd659; +fma.rn.f64 fd664, fd136, 0dBFD46F6FAF5FCB72, fd660; +fma.rn.f64 fd665, fd1122, 0dBFEE532CBE45C954, fd661; +fma.rn.f64 fd666, fd135, 0dBFD46F6FAF5FCB72, fd662; +fma.rn.f64 fd667, fd137, 0d3FE4B76371208A62, fd663; +fma.rn.f64 fd668, fd140, 0dBFE863A1ADA0CFA6, fd664; +fma.rn.f64 fd669, fd1120, 0d3FE4B76371208A62, fd665; +fma.rn.f64 fd670, fd139, 0dBFE863A1ADA0CFA6, fd666; +fma.rn.f64 fd671, fd141, 0d3FDDFA67657E7608, fd667; +fma.rn.f64 fd672, fd144, 0d3FEC45BB0D10918C, fd668; +fma.rn.f64 fd673, fd1117, 0d3FDDFA67657E7608, fd669; +fma.rn.f64 fd674, fd143, 0d3FEC45BB0D10918C, fd670; +fma.rn.f64 fd675, fd145, 0dBFEFCFFA67B61650, fd671; +fma.rn.f64 fd676, fd148, 0d3FBBADB02034D9FF, fd672; +fma.rn.f64 fd677, fd1115, 0dBFEFCFFA67B61650, fd673; +fma.rn.f64 fd678, fd147, 0d3FBBADB02034D9FF, fd674; +fma.rn.f64 fd679, fd149, 0d3FD11F2F2E2F1E3B, fd675; +fma.rn.f64 fd680, fd152, 0dBFEED566CB3DCBA1, fd676; +fma.rn.f64 fd681, fd1113, 0d3FD11F2F2E2F1E3B, fd677; +fma.rn.f64 fd682, fd151, 0dBFEED566CB3DCBA1, fd678; +fma.rn.f64 fd683, fd153, 0d3FE979982A38E65A, fd679; +fma.rn.f64 fd684, fd156, 0d3FE35D9650D47852, fd680; +fma.rn.f64 fd685, fd1110, 0d3FE979982A38E65A, fd681; +fma.rn.f64 fd686, fd155, 0d3FE35D9650D47852, fd682; +fma.rn.f64 fd687, fd157, 0dBFEB6B5FBD9F7255, fd683; +fma.rn.f64 fd688, fd160, 0d3FE07F6ACD7CDCE2, fd684; +fma.rn.f64 fd689, fd1108, 0dBFEB6B5FBD9F7255, fd685; +fma.rn.f64 fd690, fd159, 0d3FE07F6ACD7CDCE2, fd686; +fma.rn.f64 fd691, fd161, 0dBFC4B545C0234A71, fd687; +fma.rn.f64 fd692, fd164, 0dBFEF941537248537, fd688; +fma.rn.f64 fd693, fd1106, 0dBFC4B545C0234A71, fd689; +fma.rn.f64 fd694, fd163, 0dBFEF941537248537, fd690; +fma.rn.f64 fd695, fd165, 0d3FEF4079C06C0992, fd691; +fma.rn.f64 fd696, fd168, 0d3FCB8426C12812BC, fd692; +fma.rn.f64 fd697, fd1103, 
0d3FEF4079C06C0992, fd693; +fma.rn.f64 fd698, fd167, 0d3FCB8426C12812BC, fd694; +fma.rn.f64 fd699, fd169, 0dBFE1F53E93956DBF, fd695; +fma.rn.f64 fd700, fd172, 0d3FEA7C6DA34AF89F, fd696; +fma.rn.f64 fd701, fd1101, 0dBFE1F53E93956DBF, fd697; +fma.rn.f64 fd702, fd171, 0d3FEA7C6DA34AF89F, fd698; +fma.rn.f64 fd703, fd117, 0dBFE1F53E93956DBF, %58; +fma.rn.f64 fd707, fd121, 0dBFD7B057F20BF2E4, fd703; +fma.rn.f64 fd1072, fd120, 0d3FEA7C6DA34AF89F, 0d0000000000000000; +fma.rn.f64 fd708, fd124, 0dBFEDBA2D62CB789F, fd1072; +fma.rn.f64 fd1071, fd1131, 0dBFE1F53E93956DBF, %59; +fma.rn.f64 fd709, fd1129, 0dBFD7B057F20BF2E4, fd1071; +fma.rn.f64 fd1070, fd119, 0d3FEA7C6DA34AF89F, 0d0000000000000000; +fma.rn.f64 fd710, fd123, 0dBFEDBA2D62CB789F, fd1070; +fma.rn.f64 fd711, fd125, 0d3FEF4079C06C0992, fd707; +fma.rn.f64 fd712, fd128, 0d3FCB8426C12812BC, fd708; +fma.rn.f64 fd713, fd1127, 0d3FEF4079C06C0992, fd709; +fma.rn.f64 fd714, fd127, 0d3FCB8426C12812BC, fd710; +fma.rn.f64 fd715, fd129, 0dBFE73B5AE5DB4E10, fd711; +fma.rn.f64 fd716, fd132, 0d3FE601A24BA81342, fd712; +fma.rn.f64 fd717, fd1124, 0dBFE73B5AE5DB4E10, fd713; +fma.rn.f64 fd718, fd131, 0d3FE601A24BA81342, fd714; +fma.rn.f64 fd719, fd133, 0dBFC4B545C0234A71, fd715; +fma.rn.f64 fd720, fd136, 0dBFEF941537248537, fd716; +fma.rn.f64 fd721, fd1122, 0dBFC4B545C0234A71, fd717; +fma.rn.f64 fd722, fd135, 0dBFEF941537248537, fd718; +fma.rn.f64 fd723, fd137, 0d3FED0ADB9B447CCF, fd719; +fma.rn.f64 fd724, fd140, 0d3FDADF7689C97B70, fd720; +fma.rn.f64 fd725, fd1120, 0d3FED0ADB9B447CCF, fd721; +fma.rn.f64 fd726, fd139, 0d3FDADF7689C97B70, fd722; +fma.rn.f64 fd727, fd141, 0dBFEB6B5FBD9F7255, fd723; +fma.rn.f64 fd728, fd144, 0d3FE07F6ACD7CDCE2, fd724; +fma.rn.f64 fd729, fd1117, 0dBFEB6B5FBD9F7255, fd725; +fma.rn.f64 fd730, fd143, 0d3FE07F6ACD7CDCE2, fd726; +fma.rn.f64 fd731, fd145, 0d3FABB81853A18977, fd727; +fma.rn.f64 fd732, fd148, 0dBFEFF3FC588E859D, fd728; +fma.rn.f64 fd733, fd1115, 0d3FABB81853A18977, fd729; +fma.rn.f64 fd734, fd147, 
0dBFEFF3FC588E859D, fd730; +fma.rn.f64 fd735, fd149, 0d3FE979982A38E65A, fd731; +fma.rn.f64 fd736, fd152, 0d3FE35D9650D47852, fd732; +fma.rn.f64 fd737, fd1113, 0d3FE979982A38E65A, fd733; +fma.rn.f64 fd738, fd151, 0d3FE35D9650D47852, fd734; +fma.rn.f64 fd739, fd153, 0dBFEE532CBE45C954, fd735; +fma.rn.f64 fd740, fd156, 0d3FD46F6FAF5FCB72, fd736; +fma.rn.f64 fd741, fd1110, 0dBFEE532CBE45C954, fd737; +fma.rn.f64 fd742, fd155, 0d3FD46F6FAF5FCB72, fd738; +fma.rn.f64 fd743, fd157, 0d3FD11F2F2E2F1E3B, fd739; +fma.rn.f64 fd744, fd160, 0dBFEED566CB3DCBA1, fd740; +fma.rn.f64 fd745, fd1108, 0d3FD11F2F2E2F1E3B, fd741; +fma.rn.f64 fd746, fd159, 0dBFEED566CB3DCBA1, fd742; +fma.rn.f64 fd747, fd161, 0d3FE4B76371208A62, fd743; +fma.rn.f64 fd748, fd164, 0d3FE863A1ADA0CFA6, fd744; +fma.rn.f64 fd749, fd1106, 0d3FE4B76371208A62, fd745; +fma.rn.f64 fd750, fd163, 0d3FE863A1ADA0CFA6, fd746; +fma.rn.f64 fd751, fd165, 0dBFEFCFFA67B61650, fd747; +fma.rn.f64 fd752, fd168, 0d3FBBADB02034D9FF, fd748; +fma.rn.f64 fd753, fd1103, 0dBFEFCFFA67B61650, fd749; +fma.rn.f64 fd754, fd167, 0d3FBBADB02034D9FF, fd750; +fma.rn.f64 fd755, fd169, 0d3FDDFA67657E7608, fd751; +fma.rn.f64 fd756, fd172, 0dBFEC45BB0D10918C, fd752; +fma.rn.f64 fd757, fd1101, 0d3FDDFA67657E7608, fd753; +fma.rn.f64 fd758, fd171, 0dBFEC45BB0D10918C, fd754; +fma.rn.f64 fd759, fd117, 0dBFE73B5AE5DB4E10, %58; +fma.rn.f64 fd763, fd121, 0d3FABB81853A18977, fd759; +fma.rn.f64 fd1069, fd120, 0d3FE601A24BA81342, 0d0000000000000000; +fma.rn.f64 fd764, fd124, 0dBFEFF3FC588E859D, fd1069; +fma.rn.f64 fd1068, fd1131, 0dBFE73B5AE5DB4E10, %59; +fma.rn.f64 fd765, fd1129, 0d3FABB81853A18977, fd1068; +fma.rn.f64 fd1067, fd119, 0d3FE601A24BA81342, 0d0000000000000000; +fma.rn.f64 fd766, fd123, 0dBFEFF3FC588E859D, fd1067; +fma.rn.f64 fd767, fd125, 0d3FE4B76371208A62, fd763; +fma.rn.f64 fd768, fd128, 0d3FE863A1ADA0CFA6, fd764; +fma.rn.f64 fd769, fd1127, 0d3FE4B76371208A62, fd765; +fma.rn.f64 fd770, fd127, 0d3FE863A1ADA0CFA6, fd766; +fma.rn.f64 fd771, fd129, 
0dBFEFCFFA67B61650, fd767; +fma.rn.f64 fd772, fd132, 0dBFBBADB02034D9FF, fd768; +fma.rn.f64 fd773, fd1124, 0dBFEFCFFA67B61650, fd769; +fma.rn.f64 fd774, fd131, 0dBFBBADB02034D9FF, fd770; +fma.rn.f64 fd775, fd133, 0d3FE979982A38E65A, fd771; +fma.rn.f64 fd776, fd136, 0dBFE35D9650D47852, fd772; +fma.rn.f64 fd777, fd1122, 0d3FE979982A38E65A, fd773; +fma.rn.f64 fd778, fd135, 0dBFE35D9650D47852, fd774; +fma.rn.f64 fd779, fd137, 0dBFC4B545C0234A71, fd775; +fma.rn.f64 fd780, fd140, 0d3FEF941537248537, fd776; +fma.rn.f64 fd781, fd1120, 0dBFC4B545C0234A71, fd777; +fma.rn.f64 fd782, fd139, 0d3FEF941537248537, fd778; +fma.rn.f64 fd783, fd141, 0dBFE1F53E93956DBF, fd779; +fma.rn.f64 fd784, fd144, 0dBFEA7C6DA34AF89F, fd780; +fma.rn.f64 fd785, fd1117, 0dBFE1F53E93956DBF, fd781; +fma.rn.f64 fd786, fd143, 0dBFEA7C6DA34AF89F, fd782; +fma.rn.f64 fd787, fd145, 0d3FEF4079C06C0992, fd783; +fma.rn.f64 fd788, fd148, 0d3FCB8426C12812BC, fd784; +fma.rn.f64 fd789, fd1115, 0d3FEF4079C06C0992, fd785; +fma.rn.f64 fd790, fd147, 0d3FCB8426C12812BC, fd786; +fma.rn.f64 fd791, fd149, 0dBFEB6B5FBD9F7255, fd787; +fma.rn.f64 fd792, fd152, 0d3FE07F6ACD7CDCE2, fd788; +fma.rn.f64 fd793, fd1113, 0dBFEB6B5FBD9F7255, fd789; +fma.rn.f64 fd794, fd151, 0d3FE07F6ACD7CDCE2, fd790; +fma.rn.f64 fd795, fd153, 0d3FD11F2F2E2F1E3B, fd791; +fma.rn.f64 fd796, fd156, 0dBFEED566CB3DCBA1, fd792; +fma.rn.f64 fd797, fd1110, 0d3FD11F2F2E2F1E3B, fd793; +fma.rn.f64 fd798, fd155, 0dBFEED566CB3DCBA1, fd794; +fma.rn.f64 fd799, fd157, 0d3FDDFA67657E7608, fd795; +fma.rn.f64 fd800, fd160, 0d3FEC45BB0D10918C, fd796; +fma.rn.f64 fd801, fd1108, 0d3FDDFA67657E7608, fd797; +fma.rn.f64 fd802, fd159, 0d3FEC45BB0D10918C, fd798; +fma.rn.f64 fd803, fd161, 0dBFEE532CBE45C954, fd799; +fma.rn.f64 fd804, fd164, 0dBFD46F6FAF5FCB72, fd800; +fma.rn.f64 fd805, fd1106, 0dBFEE532CBE45C954, fd801; +fma.rn.f64 fd806, fd163, 0dBFD46F6FAF5FCB72, fd802; +fma.rn.f64 fd807, fd165, 0d3FED0ADB9B447CCF, fd803; +fma.rn.f64 fd808, fd168, 0dBFDADF7689C97B70, fd804; 
+fma.rn.f64 fd809, fd1103, 0d3FED0ADB9B447CCF, fd805; +fma.rn.f64 fd810, fd167, 0dBFDADF7689C97B70, fd806; +fma.rn.f64 fd811, fd169, 0dBFD7B057F20BF2E4, fd807; +fma.rn.f64 fd812, fd172, 0d3FEDBA2D62CB789F, fd808; +fma.rn.f64 fd813, fd1101, 0dBFD7B057F20BF2E4, fd809; +fma.rn.f64 fd814, fd171, 0d3FEDBA2D62CB789F, fd810; +fma.rn.f64 fd815, fd117, 0dBFEB6B5FBD9F7255, %58; +fma.rn.f64 fd819, fd121, 0d3FDDFA67657E7608, fd815; +fma.rn.f64 fd1066, fd120, 0d3FE07F6ACD7CDCE2, 0d0000000000000000; +fma.rn.f64 fd820, fd124, 0dBFEC45BB0D10918C, fd1066; +fma.rn.f64 fd1065, fd1131, 0dBFEB6B5FBD9F7255, %59; +fma.rn.f64 fd821, fd1129, 0d3FDDFA67657E7608, fd1065; +fma.rn.f64 fd1064, fd119, 0d3FE07F6ACD7CDCE2, 0d0000000000000000; +fma.rn.f64 fd822, fd123, 0dBFEC45BB0D10918C, fd1064; +fma.rn.f64 fd823, fd125, 0d3FABB81853A18977, fd819; +fma.rn.f64 fd824, fd128, 0d3FEFF3FC588E859D, fd820; +fma.rn.f64 fd825, fd1127, 0d3FABB81853A18977, fd821; +fma.rn.f64 fd826, fd127, 0d3FEFF3FC588E859D, fd822; +fma.rn.f64 fd827, fd129, 0dBFE1F53E93956DBF, fd823; +fma.rn.f64 fd828, fd132, 0dBFEA7C6DA34AF89F, fd824; +fma.rn.f64 fd829, fd1124, 0dBFE1F53E93956DBF, fd825; +fma.rn.f64 fd830, fd131, 0dBFEA7C6DA34AF89F, fd826; +fma.rn.f64 fd831, fd133, 0d3FED0ADB9B447CCF, fd827; +fma.rn.f64 fd832, fd136, 0d3FDADF7689C97B70, fd828; +fma.rn.f64 fd833, fd1122, 0d3FED0ADB9B447CCF, fd829; +fma.rn.f64 fd834, fd135, 0d3FDADF7689C97B70, fd830; +fma.rn.f64 fd835, fd137, 0dBFEFCFFA67B61650, fd831; +fma.rn.f64 fd836, fd140, 0d3FBBADB02034D9FF, fd832; +fma.rn.f64 fd837, fd1120, 0dBFEFCFFA67B61650, fd833; +fma.rn.f64 fd838, fd139, 0d3FBBADB02034D9FF, fd834; +fma.rn.f64 fd839, fd141, 0d3FE979982A38E65A, fd835; +fma.rn.f64 fd840, fd144, 0dBFE35D9650D47852, fd836; +fma.rn.f64 fd841, fd1117, 0d3FE979982A38E65A, fd837; +fma.rn.f64 fd842, fd143, 0dBFE35D9650D47852, fd838; +fma.rn.f64 fd843, fd145, 0dBFD7B057F20BF2E4, fd839; +fma.rn.f64 fd844, fd148, 0d3FEDBA2D62CB789F, fd840; +fma.rn.f64 fd845, fd1115, 0dBFD7B057F20BF2E4, fd841; 
+fma.rn.f64 fd846, fd147, 0d3FEDBA2D62CB789F, fd842; +fma.rn.f64 fd847, fd149, 0dBFC4B545C0234A71, fd843; +fma.rn.f64 fd848, fd152, 0dBFEF941537248537, fd844; +fma.rn.f64 fd849, fd1113, 0dBFC4B545C0234A71, fd845; +fma.rn.f64 fd850, fd151, 0dBFEF941537248537, fd846; +fma.rn.f64 fd851, fd153, 0d3FE4B76371208A62, fd847; +fma.rn.f64 fd852, fd156, 0d3FE863A1ADA0CFA6, fd848; +fma.rn.f64 fd853, fd1110, 0d3FE4B76371208A62, fd849; +fma.rn.f64 fd854, fd155, 0d3FE863A1ADA0CFA6, fd850; +fma.rn.f64 fd855, fd157, 0dBFEE532CBE45C954, fd851; +fma.rn.f64 fd856, fd160, 0dBFD46F6FAF5FCB72, fd852; +fma.rn.f64 fd857, fd1108, 0dBFEE532CBE45C954, fd853; +fma.rn.f64 fd858, fd159, 0dBFD46F6FAF5FCB72, fd854; +fma.rn.f64 fd859, fd161, 0d3FEF4079C06C0992, fd855; +fma.rn.f64 fd860, fd164, 0dBFCB8426C12812BC, fd856; +fma.rn.f64 fd861, fd1106, 0d3FEF4079C06C0992, fd857; +fma.rn.f64 fd862, fd163, 0dBFCB8426C12812BC, fd858; +fma.rn.f64 fd863, fd165, 0dBFE73B5AE5DB4E10, fd859; +fma.rn.f64 fd864, fd168, 0d3FE601A24BA81342, fd860; +fma.rn.f64 fd865, fd1103, 0dBFE73B5AE5DB4E10, fd861; +fma.rn.f64 fd866, fd167, 0d3FE601A24BA81342, fd862; +fma.rn.f64 fd867, fd169, 0d3FD11F2F2E2F1E3B, fd863; +fma.rn.f64 fd868, fd172, 0dBFEED566CB3DCBA1, fd864; +fma.rn.f64 fd869, fd1101, 0d3FD11F2F2E2F1E3B, fd865; +fma.rn.f64 fd870, fd171, 0dBFEED566CB3DCBA1, fd866; +fma.rn.f64 fd871, fd117, 0dBFEE532CBE45C954, %58; +fma.rn.f64 fd875, fd121, 0d3FE979982A38E65A, fd871; +fma.rn.f64 fd1063, fd120, 0d3FD46F6FAF5FCB72, 0d0000000000000000; +fma.rn.f64 fd876, fd124, 0dBFE35D9650D47852, fd1063; +fma.rn.f64 fd1062, fd1131, 0dBFEE532CBE45C954, %59; +fma.rn.f64 fd877, fd1129, 0d3FE979982A38E65A, fd1062; +fma.rn.f64 fd1061, fd119, 0d3FD46F6FAF5FCB72, 0d0000000000000000; +fma.rn.f64 fd878, fd123, 0dBFE35D9650D47852, fd1061; +fma.rn.f64 fd879, fd125, 0dBFE1F53E93956DBF, fd875; +fma.rn.f64 fd880, fd128, 0d3FEA7C6DA34AF89F, fd876; +fma.rn.f64 fd881, fd1127, 0dBFE1F53E93956DBF, fd877; +fma.rn.f64 fd882, fd127, 0d3FEA7C6DA34AF89F, fd878; 
+fma.rn.f64 fd883, fd129, 0d3FD11F2F2E2F1E3B, fd879; +fma.rn.f64 fd884, fd132, 0dBFEED566CB3DCBA1, fd880; +fma.rn.f64 fd885, fd1124, 0d3FD11F2F2E2F1E3B, fd881; +fma.rn.f64 fd886, fd131, 0dBFEED566CB3DCBA1, fd882; +fma.rn.f64 fd887, fd133, 0d3FABB81853A18977, fd883; +fma.rn.f64 fd888, fd136, 0d3FEFF3FC588E859D, fd884; +fma.rn.f64 fd889, fd1122, 0d3FABB81853A18977, fd885; +fma.rn.f64 fd890, fd135, 0d3FEFF3FC588E859D, fd886; +fma.rn.f64 fd891, fd137, 0dBFD7B057F20BF2E4, fd887; +fma.rn.f64 fd892, fd140, 0dBFEDBA2D62CB789F, fd888; +fma.rn.f64 fd893, fd1120, 0dBFD7B057F20BF2E4, fd889; +fma.rn.f64 fd894, fd139, 0dBFEDBA2D62CB789F, fd890; +fma.rn.f64 fd895, fd141, 0d3FE4B76371208A62, fd891; +fma.rn.f64 fd896, fd144, 0d3FE863A1ADA0CFA6, fd892; +fma.rn.f64 fd897, fd1117, 0d3FE4B76371208A62, fd893; +fma.rn.f64 fd898, fd143, 0d3FE863A1ADA0CFA6, fd894; +fma.rn.f64 fd899, fd145, 0dBFEB6B5FBD9F7255, fd895; +fma.rn.f64 fd900, fd148, 0dBFE07F6ACD7CDCE2, fd896; +fma.rn.f64 fd901, fd1115, 0dBFEB6B5FBD9F7255, fd897; +fma.rn.f64 fd902, fd147, 0dBFE07F6ACD7CDCE2, fd898; +fma.rn.f64 fd903, fd149, 0d3FEF4079C06C0992, fd899; +fma.rn.f64 fd904, fd152, 0d3FCB8426C12812BC, fd900; +fma.rn.f64 fd905, fd1113, 0d3FEF4079C06C0992, fd901; +fma.rn.f64 fd906, fd151, 0d3FCB8426C12812BC, fd902; +fma.rn.f64 fd907, fd153, 0dBFEFCFFA67B61650, fd903; +fma.rn.f64 fd908, fd156, 0d3FBBADB02034D9FF, fd904; +fma.rn.f64 fd909, fd1110, 0dBFEFCFFA67B61650, fd905; +fma.rn.f64 fd910, fd155, 0d3FBBADB02034D9FF, fd906; +fma.rn.f64 fd911, fd157, 0d3FED0ADB9B447CCF, fd907; +fma.rn.f64 fd912, fd160, 0dBFDADF7689C97B70, fd908; +fma.rn.f64 fd913, fd1108, 0d3FED0ADB9B447CCF, fd909; +fma.rn.f64 fd914, fd159, 0dBFDADF7689C97B70, fd910; +fma.rn.f64 fd915, fd161, 0dBFE73B5AE5DB4E10, fd911; +fma.rn.f64 fd916, fd164, 0d3FE601A24BA81342, fd912; +fma.rn.f64 fd917, fd1106, 0dBFE73B5AE5DB4E10, fd913; +fma.rn.f64 fd918, fd163, 0d3FE601A24BA81342, fd914; +fma.rn.f64 fd919, fd165, 0d3FDDFA67657E7608, fd915; +fma.rn.f64 fd920, fd168, 
0dBFEC45BB0D10918C, fd916; +fma.rn.f64 fd921, fd1103, 0d3FDDFA67657E7608, fd917; +fma.rn.f64 fd922, fd167, 0dBFEC45BB0D10918C, fd918; +fma.rn.f64 fd923, fd169, 0dBFC4B545C0234A71, fd919; +fma.rn.f64 fd924, fd172, 0d3FEF941537248537, fd920; +fma.rn.f64 fd925, fd1101, 0dBFC4B545C0234A71, fd921; +fma.rn.f64 fd926, fd171, 0d3FEF941537248537, fd922; +fma.rn.f64 fd927, fd117, 0dBFEFCFFA67B61650, %58; +fma.rn.f64 fd928, fd120, 0d3FBBADB02034D9FF, 0d0000000000000000; +fma.rn.f64 fd929, fd1131, 0dBFEFCFFA67B61650, %59; +fma.rn.f64 fd930, fd119, 0d3FBBADB02034D9FF, 0d0000000000000000; +fma.rn.f64 fd931, fd121, 0d3FEF4079C06C0992, fd927; +fma.rn.f64 fd932, fd124, 0dBFCB8426C12812BC, fd928; +fma.rn.f64 fd933, fd1129, 0d3FEF4079C06C0992, fd929; +fma.rn.f64 fd934, fd123, 0dBFCB8426C12812BC, fd930; +fma.rn.f64 fd935, fd125, 0dBFEE532CBE45C954, fd931; +fma.rn.f64 fd936, fd128, 0d3FD46F6FAF5FCB72, fd932; +fma.rn.f64 fd937, fd1127, 0dBFEE532CBE45C954, fd933; +fma.rn.f64 fd938, fd127, 0d3FD46F6FAF5FCB72, fd934; +fma.rn.f64 fd939, fd129, 0d3FED0ADB9B447CCF, fd935; +fma.rn.f64 fd940, fd132, 0dBFDADF7689C97B70, fd936; +fma.rn.f64 fd941, fd1124, 0d3FED0ADB9B447CCF, fd937; +fma.rn.f64 fd942, fd131, 0dBFDADF7689C97B70, fd938; +fma.rn.f64 fd943, fd133, 0dBFEB6B5FBD9F7255, fd939; +fma.rn.f64 fd944, fd136, 0d3FE07F6ACD7CDCE2, fd940; +fma.rn.f64 fd945, fd1122, 0dBFEB6B5FBD9F7255, fd941; +fma.rn.f64 fd946, fd135, 0d3FE07F6ACD7CDCE2, fd942; +fma.rn.f64 fd947, fd137, 0d3FE979982A38E65A, fd943; +fma.rn.f64 fd948, fd140, 0dBFE35D9650D47852, fd944; +fma.rn.f64 fd949, fd1120, 0d3FE979982A38E65A, fd945; +fma.rn.f64 fd950, fd139, 0dBFE35D9650D47852, fd946; +fma.rn.f64 fd951, fd141, 0dBFE73B5AE5DB4E10, fd947; +fma.rn.f64 fd952, fd144, 0d3FE601A24BA81342, fd948; +fma.rn.f64 fd953, fd1117, 0dBFE73B5AE5DB4E10, fd949; +fma.rn.f64 fd954, fd143, 0d3FE601A24BA81342, fd950; +fma.rn.f64 fd955, fd145, 0d3FE4B76371208A62, fd951; +fma.rn.f64 fd956, fd148, 0dBFE863A1ADA0CFA6, fd952; +fma.rn.f64 fd957, fd1115, 
0d3FE4B76371208A62, fd953; +fma.rn.f64 fd958, fd147, 0dBFE863A1ADA0CFA6, fd954; +fma.rn.f64 fd959, fd149, 0dBFE1F53E93956DBF, fd955; +fma.rn.f64 fd960, fd152, 0d3FEA7C6DA34AF89F, fd956; +fma.rn.f64 fd961, fd1113, 0dBFE1F53E93956DBF, fd957; +fma.rn.f64 fd962, fd151, 0d3FEA7C6DA34AF89F, fd958; +fma.rn.f64 fd963, fd153, 0d3FDDFA67657E7608, fd959; +fma.rn.f64 fd964, fd156, 0dBFEC45BB0D10918C, fd960; +fma.rn.f64 fd965, fd1110, 0d3FDDFA67657E7608, fd961; +fma.rn.f64 fd966, fd155, 0dBFEC45BB0D10918C, fd962; +fma.rn.f64 fd967, fd157, 0dBFD7B057F20BF2E4, fd963; +fma.rn.f64 fd968, fd160, 0d3FEDBA2D62CB789F, fd964; +fma.rn.f64 fd969, fd1108, 0dBFD7B057F20BF2E4, fd965; +fma.rn.f64 fd970, fd159, 0d3FEDBA2D62CB789F, fd966; +fma.rn.f64 fd971, fd161, 0d3FD11F2F2E2F1E3B, fd967; +fma.rn.f64 fd972, fd164, 0dBFEED566CB3DCBA1, fd968; +fma.rn.f64 fd973, fd1106, 0d3FD11F2F2E2F1E3B, fd969; +fma.rn.f64 fd974, fd163, 0dBFEED566CB3DCBA1, fd970; +fma.rn.f64 fd975, fd165, 0dBFC4B545C0234A71, fd971; +fma.rn.f64 fd976, fd168, 0d3FEF941537248537, fd972; +fma.rn.f64 fd977, fd1103, 0dBFC4B545C0234A71, fd973; +fma.rn.f64 fd978, fd167, 0d3FEF941537248537, fd974; +fma.rn.f64 fd979, fd169, 0d3FABB81853A18977, fd975; +fma.rn.f64 fd980, fd172, 0dBFEFF3FC588E859D, fd976; +fma.rn.f64 fd981, fd1101, 0d3FABB81853A18977, fd977; +fma.rn.f64 fd982, fd171, 0dBFEFF3FC588E859D, fd978; +add.f64 %1, fd198, fd1101; +add.f64 %0, fd197, fd169; +sub.f64 %2, fd251, fd252; +add.f64 %3, fd253, fd254; +sub.f64 %4, fd307, fd308; +add.f64 %5, fd309, fd310; +sub.f64 %6, fd363, fd364; +add.f64 %7, fd365, fd366; +sub.f64 %8, fd419, fd420; +add.f64 %9, fd421, fd422; +add.f64 %11, fd477, fd478; +sub.f64 %10, fd475, fd476; +add.f64 %13, fd533, fd534; +sub.f64 %12, fd531, fd532; +add.f64 %15, fd589, fd590; +sub.f64 %14, fd587, fd588; +sub.f64 %16, fd643, fd644; +add.f64 %17, fd645, fd646; +sub.f64 %18, fd699, fd700; +add.f64 %19, fd701, fd702; +sub.f64 %20, fd755, fd756; +add.f64 %21, fd757, fd758; +add.f64 %23, fd813, fd814; 
+sub.f64 %22, fd811, fd812; +add.f64 %25, fd869, fd870; +sub.f64 %24, fd867, fd868; +add.f64 %27, fd925, fd926; +sub.f64 %26, fd923, fd924; +sub.f64 %28, fd979, fd980; +add.f64 %29, fd981, fd982; +sub.f64 %31, fd981, fd982; +add.f64 %30, fd979, fd980; +sub.f64 %33, fd925, fd926; +add.f64 %32, fd923, fd924; +sub.f64 %35, fd869, fd870; +add.f64 %34, fd867, fd868; +sub.f64 %37, fd813, fd814; +add.f64 %36, fd811, fd812; +sub.f64 %39, fd757, fd758; +add.f64 %38, fd755, fd756; +sub.f64 %41, fd701, fd702; +add.f64 %40, fd699, fd700; +sub.f64 %43, fd645, fd646; +add.f64 %42, fd643, fd644; +sub.f64 %45, fd589, fd590; +add.f64 %44, fd587, fd588; +sub.f64 %47, fd533, fd534; +add.f64 %46, fd531, fd532; +sub.f64 %49, fd477, fd478; +add.f64 %48, fd475, fd476; +sub.f64 %51, fd421, fd422; +add.f64 %50, fd419, fd420; +sub.f64 %53, fd365, fd366; +add.f64 %52, fd363, fd364; +sub.f64 %55, fd309, fd310; +add.f64 %54, fd307, fd308; +sub.f64 %57, fd253, fd254; +add.f64 %56, fd251, fd252; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), 
"=d"(rmem[28].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[1].y), "d"(rmem[2].y), "d"(rmem[26].y), "d"(rmem[25].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[23].y), "d"(rmem[22].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[20].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[11].y), "d"(rmem[17].y), "d"(rmem[16].y), "d"(rmem[13].y), "d"(rmem[14].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..51819c073f70d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_fwd.hpp.inc @@ -0,0 +1,28 @@ +#ifndef CUFFTDX_FFT_2_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_2_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<762, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, 
unsigned smem){ /* Body of the <762, __half2, 1> specialization (header guard: FFT_2, FP16, FWD) whose signature opens just before this point. The inline PTX is a 2-point butterfly on packed half pairs: %0 = %4 + %5 and %1 = %6 + %7 write rmem[0].x and rmem[0].y; %2 = %4 - %5 and %3 = %6 - %7 write rmem[1].x and rmem[1].y. Inputs are bound in the order %4 = rmem[0].x, %5 = rmem[1].x, %6 = rmem[0].y, %7 = rmem[1].y, so rmem[0] receives the sum and rmem[1] the difference of the two input values. smem is not referenced, and the declared r / rd registers are not used by any instruction. NOTE(review): __HALF2_TO_UI is defined elsewhere; presumably it exposes the __half2 storage as a 32-bit value for the "r" constraint - confirm. */ + +asm volatile (R"({ +.reg .b32 r<13>; +.reg .b64 rd<2>; +{ +add.f16x2 %0, %4, %5; +} +{ +add.f16x2 %1, %6, %7; +} +{ +sub.f16x2 %2, %4, %5; +} +{ +sub.f16x2 %3, %6, %7; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..4b44dc0f755c5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp16_inv.hpp.inc @@ -0,0 +1,28 @@ +#ifndef CUFFTDX_FFT_2_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_2_FP16_INV_PTX_HPP + + + +/* Generated 2-point FFT kernel for packed __half2 data (header guard: FFT_2, FP16, INV; template id 964). Replaces rmem[0] with rmem[0] + rmem[1] and rmem[1] with rmem[0] - rmem[1], computed from the input values separately for .x and .y with add.f16x2 / sub.f16x2. Outputs %0-%3 are rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y; inputs %4-%7 are rmem[0].x, rmem[1].x, rmem[0].y, rmem[1].y. The PTX text and operand bindings are the same as in the <762, __half2, 1> specialization above; only the template id differs. smem is not referenced. NOTE(review): the meaning of the integer template id is not visible in this file section. */ template<> __forceinline__ __device__ void cufftdx_private_function<964, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<13>; +.reg .b64 rd<2>; +{ +add.f16x2 %0, %4, %5; +} +{ +add.f16x2 %1, %6, %7; +} +{ +sub.f16x2 %2, %4, %5; +} +{ +sub.f16x2 %3, %6, %7; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..25a4143cfbe92 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_fwd.hpp.inc @@ -0,0 +1,20 @@ +#ifndef CUFFTDX_FFT_2_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_2_FP32_FWD_PTX_HPP + + + +/* Generated 2-point FFT kernel for float data (header guard: FFT_2, FP32, FWD; template id 16). The inline PTX is a single butterfly: rmem[0] becomes rmem[0] + rmem[1] and rmem[1] becomes rmem[0] - rmem[1], computed from the input values separately for .x and .y. Outputs %0-%3 are rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y and inputs %4-%7 are the same four values in the same order, so %0 = %4 + %6, %1 = %5 + %7, %2 = %4 - %6, %3 = %5 - %7. smem is not referenced, and the declared f / rd registers are not used by any instruction. NOTE(review): the meaning of the integer template id is not visible in this file section. */ template<> __forceinline__ __device__ void cufftdx_private_function<16, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<13>; +.reg .b64 rd<2>; +add.f32 %1, %5, %7; +add.f32 %0, %4, %6; +sub.f32 %3, %5, %7; +sub.f32 %2, %4, %6; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..3756a6d634c8c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp32_inv.hpp.inc @@ -0,0 +1,20 @@ +#ifndef CUFFTDX_FFT_2_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_2_FP32_INV_PTX_HPP + + + +/* Generated 2-point FFT kernel for float data (header guard: FFT_2, FP32, INV; template id 218). The PTX text and operand bindings are the same as in the <16, float, 1> specialization above; only the template id differs. rmem[0] receives the component-wise sum and rmem[1] the component-wise difference of the two input values (%0 = %4 + %6, %1 = %5 + %7, %2 = %4 - %6, %3 = %5 - %7). smem is not referenced. */ template<> __forceinline__ __device__ void cufftdx_private_function<218, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<13>; +.reg .b64 rd<2>; +add.f32 %1, %5, %7; +add.f32 %0, %4, %6; +sub.f32 %3, %5, %7; +sub.f32 %2, %4, %6; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..ee24d99dc54bb --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_fwd.hpp.inc @@ -0,0 +1,20 @@ +#ifndef CUFFTDX_FFT_2_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_2_FP64_FWD_PTX_HPP + + + +/* Generated 2-point FFT kernel for double data (header guard: FFT_2, FP64, FWD; template id 420). The inline PTX is a single butterfly: rmem[0] becomes rmem[0] + rmem[1] and rmem[1] becomes rmem[0] - rmem[1], computed from the input values separately for .x and .y. Outputs %0-%3 are rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y and inputs %4-%7 are the same four values in the same order, so %0 = %4 + %6, %1 = %5 + %7, %2 = %4 - %6, %3 = %5 - %7. smem is not referenced, and the declared fd / rd registers are not used by any instruction. NOTE(review): the meaning of the integer template id is not visible in this file section. */ template<> __forceinline__ __device__ void cufftdx_private_function<420, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<13>; +.reg .b64 rd<2>; +add.f64 %1, %5, %7; +add.f64 %0, %4, %6; +sub.f64 %3, %5, %7; +sub.f64 %2, %4, %6; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..a1efc221396e6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_2_fp64_inv.hpp.inc @@ -0,0 +1,20 @@ +#ifndef CUFFTDX_FFT_2_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_2_FP64_INV_PTX_HPP + + + +/* Generated 2-point FFT kernel for double data (header guard: FFT_2, FP64, INV; template id 591). The PTX text and operand bindings are the same as in the <420, double, 1> specialization above; only the template id differs. rmem[0] receives the component-wise sum and rmem[1] the component-wise difference of the two input values (%0 = %4 + %6, %1 = %5 + %7, %2 = %4 - %6, %3 = %5 - %7). smem is not referenced. */ template<> __forceinline__ __device__ void cufftdx_private_function<591, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<13>; +.reg .b64 rd<2>; +add.f64 %1, %5, %7; +add.f64 %0, %4, %6; +sub.f64 %3, %5, %7; +sub.f64 %2, %4, %6; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..2df1ec1da8b03 --- /dev/null 
+++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_fwd.hpp.inc @@ -0,0 +1,3862 @@ +#ifndef CUFFTDX_FFT_30_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_30_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<760, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<203>; +.reg .b32 r<3433>; +.reg .f64 fd<174>; +.reg .b64 rd<3>; +mov.f64 fd123, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd123; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd130, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd130; +} +mov.b32 r228, {rs2, rs2}; +mov.f64 fd135, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd135; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd136, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd136; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd123; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd130; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %66, %63; +} +{ +add.f16x2 r4, %67, r1; +} +{ +add.f16x2 r7, %69, %65; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %61, %68; +} +{ +add.f16x2 r16, %62, r13; +} +{ +add.f16x2 r19, %64, %60; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %66, %63; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %67, r28; +} +{ +add.f16x2 r34, %69, %65; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %61, %68; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %64, %60; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %66, %63; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %67, r64; +} +{ +add.f16x2 r70, %69, %65; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %61, %68; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %64, %60; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, 
r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %66, %63; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %67, r100; +} +{ +add.f16x2 r106, %69, %65; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %61, %68; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %64, %60; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %66, %63; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %67, r136; +} +{ +add.f16x2 r142, %69, %65; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %61, %68; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %64, %60; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %61, %68; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %62, r172; +} +{ +add.f16x2 r178, %64, %60; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %66, %63; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %69, %65; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %61, %68; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %62, r208; +} +{ +add.f16x2 r214, %64, %60; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %66, %63; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %69, %65; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %61, %68; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %62, r244; +} +{ +add.f16x2 r250, %64, %60; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %66, %63; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %69, %65; +} +{ 
+mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %61, %68; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %62, r280; +} +{ +add.f16x2 r286, %64, %60; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %66, %63; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %69, %65; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs9, fd123; +} +mov.b32 r522, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd130; +} +mov.b32 r540, {rs10, rs10}; +{ +cvt.rn.f16.f64 rs11, fd135; +} +mov.b32 r594, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd136; +} +mov.b32 r612, {rs12, rs12}; +{ +cvt.rn.f16.f64 rs13, fd123; +} +mov.b32 r603, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd130; +} +{ +neg.f16 rs15, rs14; +} +mov.b32 r618, {rs15, rs15}; +{ +add.f16x2 r313, %74, %71; +} +{ +add.f16x2 r316, %75, r313; +} +{ +add.f16x2 r319, %77, %73; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %79, %76; +} +{ +add.f16x2 r328, %70, r325; +} +{ +add.f16x2 r331, %72, %78; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %74, %71; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %75, r340; +} +{ +add.f16x2 r346, %77, %73; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %79, %76; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %72, %78; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %74, %71; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %75, r376; +} +{ +add.f16x2 r382, %77, %73; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %79, %76; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %72, %78; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 r403, r394, r400; +} +{ +add.f16x2 r406, 
r388, r403; +} +{ +add.f16x2 r409, %74, %71; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %75, r412; +} +{ +add.f16x2 r418, %77, %73; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %79, %76; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %72, %78; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %74, %71; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %75, r448; +} +{ +add.f16x2 r454, %77, %73; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %79, %76; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %72, %78; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %79, %76; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %70, r484; +} +{ +add.f16x2 r490, %72, %78; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %74, %71; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %77, %73; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %79, %76; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %70, r520; +} +{ +add.f16x2 r526, %72, %78; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %74, %71; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %77, %73; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %79, %76; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %70, r556; +} +{ +add.f16x2 r562, %72, %78; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %74, %71; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %77, %73; +} +{ +mul.f16x2 r580, r577, r618; +} 
+{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %79, %76; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %70, r592; +} +{ +add.f16x2 r598, %72, %78; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %74, %71; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %77, %73; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs17, fd123; +} +mov.b32 r834, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs18, fd130; +} +mov.b32 r852, {rs18, rs18}; +{ +cvt.rn.f16.f64 rs19, fd135; +} +mov.b32 r906, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd136; +} +mov.b32 r924, {rs20, rs20}; +{ +cvt.rn.f16.f64 rs21, fd123; +} +mov.b32 r915, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd130; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r930, {rs23, rs23}; +{ +add.f16x2 r625, %81, %88; +} +{ +add.f16x2 r628, %82, r625; +} +{ +add.f16x2 r631, %84, %80; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %86, %83; +} +{ +add.f16x2 r640, %87, r637; +} +{ +add.f16x2 r643, %89, %85; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %81, %88; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %82, r652; +} +{ +add.f16x2 r658, %84, %80; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %86, %83; +} +{ +mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %89, %85; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %81, %88; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %82, r688; +} +{ +add.f16x2 r694, %84, %80; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %86, %83; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %89, %85; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 
r721, %81, %88; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %82, r724; +} +{ +add.f16x2 r730, %84, %80; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %86, %83; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %89, %85; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %81, %88; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %82, r760; +} +{ +add.f16x2 r766, %84, %80; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %86, %83; +} +{ +mul.f16x2 r778, r775, r924; +} +{ +sub.f16x2 r781, %89, %85; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %86, %83; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %87, r796; +} +{ +add.f16x2 r802, %89, %85; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %81, %88; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %84, %80; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %86, %83; +} +{ +mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %87, r832; +} +{ +add.f16x2 r838, %89, %85; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 r844, r835, r841; +} +{ +sub.f16x2 r847, %81, %88; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %84, %80; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %86, %83; +} +{ +mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %87, r868; +} +{ +add.f16x2 r874, %89, %85; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %81, %88; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %84, %80; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, 
r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %86, %83; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %87, r904; +} +{ +add.f16x2 r910, %89, %85; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %81, %88; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %84, %80; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +mov.f64 fd115, 0d3FED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs25, fd115; +} +mov.f64 fd138, 0dBFDA07F921061AD1; +{ +cvt.rn.f16.f64 rs26, fd138; +} +mov.f64 fd119, 0d3FE5698496E20BD8; +{ +cvt.rn.f16.f64 rs27, fd119; +} +mov.f64 fd134, 0dBFE7C7D7A833BEC2; +{ +cvt.rn.f16.f64 rs28, fd134; +} +{ +cvt.rn.f16.f64 rs29, fd123; +} +{ +cvt.rn.f16.f64 rs30, fd130; +} +mov.f64 fd127, 0dBFBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs31, fd127; +} +mov.f64 fd128, 0dBFEFD31F94F867C6; +{ +cvt.rn.f16.f64 rs32, fd128; +} +{ +cvt.rn.f16.f64 rs35, fd135; +} +{ +cvt.rn.f16.f64 rs36, fd136; +} +mov.f64 fd139, 0dBFEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs39, fd139; +} +mov.f64 fd90, 0d3FCA9CD9AC4258F6; +{ +cvt.rn.f16.f64 rs40, fd90; +} +mov.b32 r951, {rs25, rs25}; +{ +mul.f16x2 r937, r370, r951; +} +mov.b32 r948, {rs26, rs26}; +{ +mul.f16x2 r940, r514, r948; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r370, r948; +} +{ +fma.rn.f16x2 r949, r514, r951, r946; +} +mov.b32 r983, {rs27, rs27}; +{ +mul.f16x2 r953, r682, r983; +} +mov.b32 r980, {rs28, rs28}; +{ +mul.f16x2 r956, r826, r980; +} +{ +sub.f16x2 r959, r953, r956; +} +{ +mul.f16x2 r962, r682, r980; +} +{ +fma.rn.f16x2 r965, r826, r983, r962; +} +{ +mul.f16x2 r969, r442, r983; +} +{ +mul.f16x2 r972, r586, r980; +} +{ +sub.f16x2 r975, r969, r972; +} +{ +mul.f16x2 r978, r442, r980; +} +{ +fma.rn.f16x2 r981, r586, r983, r978; +} +mov.b32 r1047, {rs31, rs31}; +{ +mul.f16x2 r985, r754, r1047; +} +mov.b32 r1044, {rs32, rs32}; +{ +mul.f16x2 r988, r898, r1044; +} +{ +sub.f16x2 r991, r985, r988; +} +{ 
+mul.f16x2 r994, r754, r1044; +} +{ +fma.rn.f16x2 r997, r898, r1047, r994; +} +mov.b32 r1015, {rs29, rs29}; +{ +mul.f16x2 r1001, r478, r1015; +} +mov.b32 r1012, {rs30, rs30}; +{ +mul.f16x2 r1004, r622, r1012; +} +{ +sub.f16x2 r1007, r1001, r1004; +} +{ +mul.f16x2 r1010, r478, r1012; +} +{ +fma.rn.f16x2 r1013, r622, r1015, r1010; +} +mov.b32 r1031, {rs35, rs35}; +{ +mul.f16x2 r1017, r790, r1031; +} +mov.b32 r1028, {rs36, rs36}; +{ +mul.f16x2 r1020, r934, r1028; +} +{ +sub.f16x2 r1023, r1017, r1020; +} +{ +mul.f16x2 r1026, r790, r1028; +} +{ +fma.rn.f16x2 r1029, r934, r1031, r1026; +} +{ +mul.f16x2 r1033, r406, r1047; +} +{ +mul.f16x2 r1036, r550, r1044; +} +{ +sub.f16x2 r1039, r1033, r1036; +} +{ +mul.f16x2 r1042, r406, r1044; +} +{ +fma.rn.f16x2 r1045, r550, r1047, r1042; +} +mov.b32 r1063, {rs39, rs39}; +{ +mul.f16x2 r1049, r718, r1063; +} +mov.b32 r1060, {rs40, rs40}; +mov.f64 fd173, 0dBFE0000000000000; +{ +mul.f16x2 r1052, r862, r1060; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r718, r1060; +} +{ +fma.rn.f16x2 r1061, r862, r1063, r1058; +} +{ +cvt.rn.f16.f64 rs53, fd173; +} +mov.b32 r1136, {rs53, rs53}; +mov.f64 fd172, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs54, fd172; +} +{ +neg.f16 rs55, rs54; +} +mov.b32 r1145, {rs55, rs55}; +{ +add.f16x2 r1065, r322, r634; +} +{ +add.f16x2 r1068, r10, r1065; +} +{ +add.f16x2 r1071, r334, r646; +} +{ +add.f16x2 r1074, r22, r1071; +} +{ +add.f16x2 r1077, r322, r634; +} +{ +mul.f16x2 r1080, r1077, r1136; +} +{ +add.f16x2 r1083, r10, r1080; +} +{ +sub.f16x2 r1086, r334, r646; +} +{ +mul.f16x2 r1089, r1086, r1145; +} +{ +add.f16x2 r1092, r1083, r1089; +} +{ +add.f16x2 r1095, r322, r634; +} +{ +mul.f16x2 r1098, r1095, r1136; +} +{ +add.f16x2 r1101, r10, r1098; +} +{ +sub.f16x2 r1104, r334, r646; +} +{ +mul.f16x2 r1107, r1104, r1145; +} +{ +sub.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r334, r646; +} +{ +mul.f16x2 r1116, r1113, r1136; +} +{ +add.f16x2 r1119, r22, r1116; +} +{ +sub.f16x2 r1122, r322, 
r634; +} +{ +mul.f16x2 r1125, r1122, r1145; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r334, r646; +} +{ +mul.f16x2 r1134, r1131, r1136; +} +{ +add.f16x2 r1137, r22, r1134; +} +{ +sub.f16x2 r1140, r322, r634; +} +{ +mul.f16x2 r1143, r1140, r1145; +} +{ +add.f16x2 r1146, r1137, r1143; +} +{ +cvt.rn.f16.f64 rs57, fd173; +} +mov.b32 r1220, {rs57, rs57}; +{ +cvt.rn.f16.f64 rs58, fd172; +} +{ +neg.f16 rs59, rs58; +} +mov.b32 r1229, {rs59, rs59}; +{ +add.f16x2 r1149, r943, r959; +} +{ +add.f16x2 r1152, r58, r1149; +} +{ +add.f16x2 r1155, r949, r965; +} +{ +add.f16x2 r1158, r202, r1155; +} +{ +add.f16x2 r1161, r943, r959; +} +{ +mul.f16x2 r1164, r1161, r1220; +} +{ +add.f16x2 r1167, r58, r1164; +} +{ +sub.f16x2 r1170, r949, r965; +} +{ +mul.f16x2 r1173, r1170, r1229; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +add.f16x2 r1179, r943, r959; +} +{ +mul.f16x2 r1182, r1179, r1220; +} +{ +add.f16x2 r1185, r58, r1182; +} +{ +sub.f16x2 r1188, r949, r965; +} +{ +mul.f16x2 r1191, r1188, r1229; +} +{ +sub.f16x2 r1194, r1185, r1191; +} +{ +add.f16x2 r1197, r949, r965; +} +{ +mul.f16x2 r1200, r1197, r1220; +} +{ +add.f16x2 r1203, r202, r1200; +} +{ +sub.f16x2 r1206, r943, r959; +} +{ +mul.f16x2 r1209, r1206, r1229; +} +{ +sub.f16x2 r1212, r1203, r1209; +} +{ +add.f16x2 r1215, r949, r965; +} +{ +mul.f16x2 r1218, r1215, r1220; +} +{ +add.f16x2 r1221, r202, r1218; +} +{ +sub.f16x2 r1224, r943, r959; +} +{ +mul.f16x2 r1227, r1224, r1229; +} +{ +add.f16x2 r1230, r1221, r1227; +} +{ +cvt.rn.f16.f64 rs61, fd173; +} +mov.b32 r1304, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd172; +} +{ +neg.f16 rs63, rs62; +} +mov.b32 r1313, {rs63, rs63}; +{ +add.f16x2 r1233, r975, r991; +} +{ +add.f16x2 r1236, r130, r1233; +} +{ +add.f16x2 r1239, r981, r997; +} +{ +add.f16x2 r1242, r274, r1239; +} +{ +add.f16x2 r1245, r975, r991; +} +{ +mul.f16x2 r1248, r1245, r1304; +} +{ +add.f16x2 r1251, r130, r1248; +} +{ +sub.f16x2 r1254, r981, r997; +} +{ +mul.f16x2 r1257, r1254, r1313; +} +{ 
+add.f16x2 r1260, r1251, r1257; +} +{ +add.f16x2 r1263, r975, r991; +} +{ +mul.f16x2 r1266, r1263, r1304; +} +{ +add.f16x2 r1269, r130, r1266; +} +{ +sub.f16x2 r1272, r981, r997; +} +{ +mul.f16x2 r1275, r1272, r1313; +} +{ +sub.f16x2 r1278, r1269, r1275; +} +{ +add.f16x2 r1281, r981, r997; +} +{ +mul.f16x2 r1284, r1281, r1304; +} +{ +add.f16x2 r1287, r274, r1284; +} +{ +sub.f16x2 r1290, r975, r991; +} +{ +mul.f16x2 r1293, r1290, r1313; +} +{ +sub.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r981, r997; +} +{ +mul.f16x2 r1302, r1299, r1304; +} +{ +add.f16x2 r1305, r274, r1302; +} +{ +sub.f16x2 r1308, r975, r991; +} +{ +mul.f16x2 r1311, r1308, r1313; +} +{ +add.f16x2 r1314, r1305, r1311; +} +{ +cvt.rn.f16.f64 rs65, fd173; +} +mov.b32 r1388, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs66, fd172; +} +{ +neg.f16 rs67, rs66; +} +mov.b32 r1397, {rs67, rs67}; +{ +add.f16x2 r1317, r1007, r1023; +} +{ +add.f16x2 r1320, r166, r1317; +} +{ +add.f16x2 r1323, r1013, r1029; +} +{ +add.f16x2 r1326, r310, r1323; +} +{ +add.f16x2 r1329, r1007, r1023; +} +{ +mul.f16x2 r1332, r1329, r1388; +} +{ +add.f16x2 r1335, r166, r1332; +} +{ +sub.f16x2 r1338, r1013, r1029; +} +{ +mul.f16x2 r1341, r1338, r1397; +} +{ +add.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, r1007, r1023; +} +{ +mul.f16x2 r1350, r1347, r1388; +} +{ +add.f16x2 r1353, r166, r1350; +} +{ +sub.f16x2 r1356, r1013, r1029; +} +{ +mul.f16x2 r1359, r1356, r1397; +} +{ +sub.f16x2 r1362, r1353, r1359; +} +{ +add.f16x2 r1365, r1013, r1029; +} +{ +mul.f16x2 r1368, r1365, r1388; +} +{ +add.f16x2 r1371, r310, r1368; +} +{ +sub.f16x2 r1374, r1007, r1023; +} +{ +mul.f16x2 r1377, r1374, r1397; +} +{ +sub.f16x2 r1380, r1371, r1377; +} +{ +add.f16x2 r1383, r1013, r1029; +} +{ +mul.f16x2 r1386, r1383, r1388; +} +{ +add.f16x2 r1389, r310, r1386; +} +{ +sub.f16x2 r1392, r1007, r1023; +} +{ +mul.f16x2 r1395, r1392, r1397; +} +{ +add.f16x2 r1398, r1389, r1395; +} +{ +cvt.rn.f16.f64 rs69, fd173; +} +mov.b32 r1472, {rs69, rs69}; +{ 
+cvt.rn.f16.f64 rs70, fd172; +} +{ +neg.f16 rs71, rs70; +} +mov.b32 r1481, {rs71, rs71}; +{ +add.f16x2 r1401, r1039, r1055; +} +{ +add.f16x2 r1404, r94, r1401; +} +{ +add.f16x2 r1407, r1045, r1061; +} +{ +add.f16x2 r1410, r238, r1407; +} +{ +add.f16x2 r1413, r1039, r1055; +} +{ +mul.f16x2 r1416, r1413, r1472; +} +{ +add.f16x2 r1419, r94, r1416; +} +{ +sub.f16x2 r1422, r1045, r1061; +} +{ +mul.f16x2 r1425, r1422, r1481; +} +{ +add.f16x2 r1428, r1419, r1425; +} +{ +add.f16x2 r1431, r1039, r1055; +} +{ +mul.f16x2 r1434, r1431, r1472; +} +{ +add.f16x2 r1437, r94, r1434; +} +{ +sub.f16x2 r1440, r1045, r1061; +} +{ +mul.f16x2 r1443, r1440, r1481; +} +{ +sub.f16x2 r1446, r1437, r1443; +} +{ +add.f16x2 r1449, r1045, r1061; +} +{ +mul.f16x2 r1452, r1449, r1472; +} +{ +add.f16x2 r1455, r238, r1452; +} +{ +sub.f16x2 r1458, r1039, r1055; +} +{ +mul.f16x2 r1461, r1458, r1481; +} +{ +sub.f16x2 r1464, r1455, r1461; +} +{ +add.f16x2 r1467, r1045, r1061; +} +{ +mul.f16x2 r1470, r1467, r1472; +} +{ +add.f16x2 r1473, r238, r1470; +} +{ +sub.f16x2 r1476, r1039, r1055; +} +{ +mul.f16x2 r1479, r1476, r1481; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +cvt.rn.f16.f64 rs73, fd123; +} +mov.b32 r1694, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd130; +} +mov.b32 r1712, {rs74, rs74}; +{ +cvt.rn.f16.f64 rs75, fd135; +} +mov.b32 r1766, {rs75, rs75}; +{ +cvt.rn.f16.f64 rs76, fd136; +} +mov.b32 r1784, {rs76, rs76}; +{ +cvt.rn.f16.f64 rs77, fd123; +} +mov.b32 r1775, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs78, fd130; +} +{ +neg.f16 rs79, rs78; +} +mov.b32 r1790, {rs79, rs79}; +{ +add.f16x2 r1485, %96, %93; +} +{ +add.f16x2 r1488, %97, r1485; +} +{ +add.f16x2 r1491, %99, %95; +} +{ +add.f16x2 r1494, r1488, r1491; +} +{ +add.f16x2 r1497, %91, %98; +} +{ +add.f16x2 r1500, %92, r1497; +} +{ +add.f16x2 r1503, %94, %90; +} +{ +add.f16x2 r1506, r1500, r1503; +} +{ +add.f16x2 r1509, %96, %93; +} +{ +mul.f16x2 r1512, r1509, r1694; +} +{ +add.f16x2 r1515, %97, r1512; +} +{ +add.f16x2 r1518, %99, %95; +} +{ 
+mul.f16x2 r1521, r1518, r1766; +} +{ +add.f16x2 r1524, r1515, r1521; +} +{ +sub.f16x2 r1527, %91, %98; +} +{ +mul.f16x2 r1530, r1527, r1712; +} +{ +sub.f16x2 r1533, %94, %90; +} +{ +mul.f16x2 r1536, r1533, r1784; +} +{ +add.f16x2 r1539, r1530, r1536; +} +{ +sub.f16x2 r1542, r1524, r1539; +} +{ +add.f16x2 r1545, %96, %93; +} +{ +mul.f16x2 r1548, r1545, r1694; +} +{ +add.f16x2 r1551, %97, r1548; +} +{ +add.f16x2 r1554, %99, %95; +} +{ +mul.f16x2 r1557, r1554, r1766; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +sub.f16x2 r1563, %91, %98; +} +{ +mul.f16x2 r1566, r1563, r1712; +} +{ +sub.f16x2 r1569, %94, %90; +} +{ +mul.f16x2 r1572, r1569, r1784; +} +{ +add.f16x2 r1575, r1566, r1572; +} +{ +add.f16x2 r1578, r1560, r1575; +} +{ +add.f16x2 r1581, %96, %93; +} +{ +mul.f16x2 r1584, r1581, r1766; +} +{ +add.f16x2 r1587, %97, r1584; +} +{ +add.f16x2 r1590, %99, %95; +} +{ +mul.f16x2 r1593, r1590, r1775; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, %91, %98; +} +{ +mul.f16x2 r1602, r1599, r1784; +} +{ +sub.f16x2 r1605, %94, %90; +} +{ +mul.f16x2 r1608, r1605, r1790; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r1596, r1611; +} +{ +add.f16x2 r1617, %96, %93; +} +{ +mul.f16x2 r1620, r1617, r1766; +} +{ +add.f16x2 r1623, %97, r1620; +} +{ +add.f16x2 r1626, %99, %95; +} +{ +mul.f16x2 r1629, r1626, r1775; +} +{ +add.f16x2 r1632, r1623, r1629; +} +{ +sub.f16x2 r1635, %91, %98; +} +{ +mul.f16x2 r1638, r1635, r1784; +} +{ +sub.f16x2 r1641, %94, %90; +} +{ +mul.f16x2 r1644, r1641, r1790; +} +{ +add.f16x2 r1647, r1638, r1644; +} +{ +add.f16x2 r1650, r1632, r1647; +} +{ +add.f16x2 r1653, %91, %98; +} +{ +mul.f16x2 r1656, r1653, r1694; +} +{ +add.f16x2 r1659, %92, r1656; +} +{ +add.f16x2 r1662, %94, %90; +} +{ +mul.f16x2 r1665, r1662, r1766; +} +{ +add.f16x2 r1668, r1659, r1665; +} +{ +sub.f16x2 r1671, %96, %93; +} +{ +mul.f16x2 r1674, r1671, r1712; +} +{ +sub.f16x2 r1677, %99, %95; +} +{ +mul.f16x2 r1680, r1677, r1784; +} +{ +add.f16x2 r1683, r1674, 
r1680; +} +{ +add.f16x2 r1686, r1668, r1683; +} +{ +add.f16x2 r1689, %91, %98; +} +{ +mul.f16x2 r1692, r1689, r1694; +} +{ +add.f16x2 r1695, %92, r1692; +} +{ +add.f16x2 r1698, %94, %90; +} +{ +mul.f16x2 r1701, r1698, r1766; +} +{ +add.f16x2 r1704, r1695, r1701; +} +{ +sub.f16x2 r1707, %96, %93; +} +{ +mul.f16x2 r1710, r1707, r1712; +} +{ +sub.f16x2 r1713, %99, %95; +} +{ +mul.f16x2 r1716, r1713, r1784; +} +{ +add.f16x2 r1719, r1710, r1716; +} +{ +sub.f16x2 r1722, r1704, r1719; +} +{ +add.f16x2 r1725, %91, %98; +} +{ +mul.f16x2 r1728, r1725, r1766; +} +{ +add.f16x2 r1731, %92, r1728; +} +{ +add.f16x2 r1734, %94, %90; +} +{ +mul.f16x2 r1737, r1734, r1775; +} +{ +add.f16x2 r1740, r1731, r1737; +} +{ +sub.f16x2 r1743, %96, %93; +} +{ +mul.f16x2 r1746, r1743, r1784; +} +{ +sub.f16x2 r1749, %99, %95; +} +{ +mul.f16x2 r1752, r1749, r1790; +} +{ +add.f16x2 r1755, r1746, r1752; +} +{ +add.f16x2 r1758, r1740, r1755; +} +{ +add.f16x2 r1761, %91, %98; +} +{ +mul.f16x2 r1764, r1761, r1766; +} +{ +add.f16x2 r1767, %92, r1764; +} +{ +add.f16x2 r1770, %94, %90; +} +{ +mul.f16x2 r1773, r1770, r1775; +} +{ +add.f16x2 r1776, r1767, r1773; +} +{ +sub.f16x2 r1779, %96, %93; +} +{ +mul.f16x2 r1782, r1779, r1784; +} +{ +sub.f16x2 r1785, %99, %95; +} +{ +mul.f16x2 r1788, r1785, r1790; +} +{ +add.f16x2 r1791, r1782, r1788; +} +{ +sub.f16x2 r1794, r1776, r1791; +} +{ +cvt.rn.f16.f64 rs81, fd123; +} +mov.b32 r2006, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs82, fd130; +} +mov.b32 r2024, {rs82, rs82}; +{ +cvt.rn.f16.f64 rs83, fd135; +} +mov.b32 r2078, {rs83, rs83}; +{ +cvt.rn.f16.f64 rs84, fd136; +} +mov.b32 r2096, {rs84, rs84}; +{ +cvt.rn.f16.f64 rs85, fd123; +} +mov.b32 r2087, {rs85, rs85}; +{ +cvt.rn.f16.f64 rs86, fd130; +} +{ +neg.f16 rs87, rs86; +} +mov.b32 r2102, {rs87, rs87}; +{ +add.f16x2 r1797, %103, %101; +} +{ +add.f16x2 r1800, %105, r1797; +} +{ +add.f16x2 r1803, %107, %104; +} +{ +add.f16x2 r1806, r1800, r1803; +} +{ +add.f16x2 r1809, %109, %106; +} +{ +add.f16x2 r1812, %100, r1809; +} 
+{ +add.f16x2 r1815, %102, %108; +} +{ +add.f16x2 r1818, r1812, r1815; +} +{ +add.f16x2 r1821, %103, %101; +} +{ +mul.f16x2 r1824, r1821, r2006; +} +{ +add.f16x2 r1827, %105, r1824; +} +{ +add.f16x2 r1830, %107, %104; +} +{ +mul.f16x2 r1833, r1830, r2078; +} +{ +add.f16x2 r1836, r1827, r1833; +} +{ +sub.f16x2 r1839, %109, %106; +} +{ +mul.f16x2 r1842, r1839, r2024; +} +{ +sub.f16x2 r1845, %102, %108; +} +{ +mul.f16x2 r1848, r1845, r2096; +} +{ +add.f16x2 r1851, r1842, r1848; +} +{ +sub.f16x2 r1854, r1836, r1851; +} +{ +add.f16x2 r1857, %103, %101; +} +{ +mul.f16x2 r1860, r1857, r2006; +} +{ +add.f16x2 r1863, %105, r1860; +} +{ +add.f16x2 r1866, %107, %104; +} +{ +mul.f16x2 r1869, r1866, r2078; +} +{ +add.f16x2 r1872, r1863, r1869; +} +{ +sub.f16x2 r1875, %109, %106; +} +{ +mul.f16x2 r1878, r1875, r2024; +} +{ +sub.f16x2 r1881, %102, %108; +} +{ +mul.f16x2 r1884, r1881, r2096; +} +{ +add.f16x2 r1887, r1878, r1884; +} +{ +add.f16x2 r1890, r1872, r1887; +} +{ +add.f16x2 r1893, %103, %101; +} +{ +mul.f16x2 r1896, r1893, r2078; +} +{ +add.f16x2 r1899, %105, r1896; +} +{ +add.f16x2 r1902, %107, %104; +} +{ +mul.f16x2 r1905, r1902, r2087; +} +{ +add.f16x2 r1908, r1899, r1905; +} +{ +sub.f16x2 r1911, %109, %106; +} +{ +mul.f16x2 r1914, r1911, r2096; +} +{ +sub.f16x2 r1917, %102, %108; +} +{ +mul.f16x2 r1920, r1917, r2102; +} +{ +add.f16x2 r1923, r1914, r1920; +} +{ +sub.f16x2 r1926, r1908, r1923; +} +{ +add.f16x2 r1929, %103, %101; +} +{ +mul.f16x2 r1932, r1929, r2078; +} +{ +add.f16x2 r1935, %105, r1932; +} +{ +add.f16x2 r1938, %107, %104; +} +{ +mul.f16x2 r1941, r1938, r2087; +} +{ +add.f16x2 r1944, r1935, r1941; +} +{ +sub.f16x2 r1947, %109, %106; +} +{ +mul.f16x2 r1950, r1947, r2096; +} +{ +sub.f16x2 r1953, %102, %108; +} +{ +mul.f16x2 r1956, r1953, r2102; +} +{ +add.f16x2 r1959, r1950, r1956; +} +{ +add.f16x2 r1962, r1944, r1959; +} +{ +add.f16x2 r1965, %109, %106; +} +{ +mul.f16x2 r1968, r1965, r2006; +} +{ +add.f16x2 r1971, %100, r1968; +} +{ +add.f16x2 r1974, %102, 
%108; +} +{ +mul.f16x2 r1977, r1974, r2078; +} +{ +add.f16x2 r1980, r1971, r1977; +} +{ +sub.f16x2 r1983, %103, %101; +} +{ +mul.f16x2 r1986, r1983, r2024; +} +{ +sub.f16x2 r1989, %107, %104; +} +{ +mul.f16x2 r1992, r1989, r2096; +} +{ +add.f16x2 r1995, r1986, r1992; +} +{ +add.f16x2 r1998, r1980, r1995; +} +{ +add.f16x2 r2001, %109, %106; +} +{ +mul.f16x2 r2004, r2001, r2006; +} +{ +add.f16x2 r2007, %100, r2004; +} +{ +add.f16x2 r2010, %102, %108; +} +{ +mul.f16x2 r2013, r2010, r2078; +} +{ +add.f16x2 r2016, r2007, r2013; +} +{ +sub.f16x2 r2019, %103, %101; +} +{ +mul.f16x2 r2022, r2019, r2024; +} +{ +sub.f16x2 r2025, %107, %104; +} +{ +mul.f16x2 r2028, r2025, r2096; +} +{ +add.f16x2 r2031, r2022, r2028; +} +{ +sub.f16x2 r2034, r2016, r2031; +} +{ +add.f16x2 r2037, %109, %106; +} +{ +mul.f16x2 r2040, r2037, r2078; +} +{ +add.f16x2 r2043, %100, r2040; +} +{ +add.f16x2 r2046, %102, %108; +} +{ +mul.f16x2 r2049, r2046, r2087; +} +{ +add.f16x2 r2052, r2043, r2049; +} +{ +sub.f16x2 r2055, %103, %101; +} +{ +mul.f16x2 r2058, r2055, r2096; +} +{ +sub.f16x2 r2061, %107, %104; +} +{ +mul.f16x2 r2064, r2061, r2102; +} +{ +add.f16x2 r2067, r2058, r2064; +} +{ +add.f16x2 r2070, r2052, r2067; +} +{ +add.f16x2 r2073, %109, %106; +} +{ +mul.f16x2 r2076, r2073, r2078; +} +{ +add.f16x2 r2079, %100, r2076; +} +{ +add.f16x2 r2082, %102, %108; +} +{ +mul.f16x2 r2085, r2082, r2087; +} +{ +add.f16x2 r2088, r2079, r2085; +} +{ +sub.f16x2 r2091, %103, %101; +} +{ +mul.f16x2 r2094, r2091, r2096; +} +{ +sub.f16x2 r2097, %107, %104; +} +{ +mul.f16x2 r2100, r2097, r2102; +} +{ +add.f16x2 r2103, r2094, r2100; +} +{ +sub.f16x2 r2106, r2088, r2103; +} +{ +cvt.rn.f16.f64 rs89, fd123; +} +mov.b32 r2318, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd130; +} +mov.b32 r2336, {rs90, rs90}; +{ +cvt.rn.f16.f64 rs91, fd135; +} +mov.b32 r2390, {rs91, rs91}; +{ +cvt.rn.f16.f64 rs92, fd136; +} +mov.b32 r2408, {rs92, rs92}; +{ +cvt.rn.f16.f64 rs93, fd123; +} +mov.b32 r2399, {rs93, rs93}; +{ +cvt.rn.f16.f64 rs94, 
fd130; +} +{ +neg.f16 rs95, rs94; +} +mov.b32 r2414, {rs95, rs95}; +{ +add.f16x2 r2109, %113, %119; +} +{ +add.f16x2 r2112, %111, r2109; +} +{ +add.f16x2 r2115, %114, %112; +} +{ +add.f16x2 r2118, r2112, r2115; +} +{ +add.f16x2 r2121, %116, %115; +} +{ +add.f16x2 r2124, %118, r2121; +} +{ +add.f16x2 r2127, %110, %117; +} +{ +add.f16x2 r2130, r2124, r2127; +} +{ +add.f16x2 r2133, %113, %119; +} +{ +mul.f16x2 r2136, r2133, r2318; +} +{ +add.f16x2 r2139, %111, r2136; +} +{ +add.f16x2 r2142, %114, %112; +} +{ +mul.f16x2 r2145, r2142, r2390; +} +{ +add.f16x2 r2148, r2139, r2145; +} +{ +sub.f16x2 r2151, %116, %115; +} +{ +mul.f16x2 r2154, r2151, r2336; +} +{ +sub.f16x2 r2157, %110, %117; +} +{ +mul.f16x2 r2160, r2157, r2408; +} +{ +add.f16x2 r2163, r2154, r2160; +} +{ +sub.f16x2 r2166, r2148, r2163; +} +{ +add.f16x2 r2169, %113, %119; +} +{ +mul.f16x2 r2172, r2169, r2318; +} +{ +add.f16x2 r2175, %111, r2172; +} +{ +add.f16x2 r2178, %114, %112; +} +{ +mul.f16x2 r2181, r2178, r2390; +} +{ +add.f16x2 r2184, r2175, r2181; +} +{ +sub.f16x2 r2187, %116, %115; +} +{ +mul.f16x2 r2190, r2187, r2336; +} +{ +sub.f16x2 r2193, %110, %117; +} +{ +mul.f16x2 r2196, r2193, r2408; +} +{ +add.f16x2 r2199, r2190, r2196; +} +{ +add.f16x2 r2202, r2184, r2199; +} +{ +add.f16x2 r2205, %113, %119; +} +{ +mul.f16x2 r2208, r2205, r2390; +} +{ +add.f16x2 r2211, %111, r2208; +} +{ +add.f16x2 r2214, %114, %112; +} +{ +mul.f16x2 r2217, r2214, r2399; +} +{ +add.f16x2 r2220, r2211, r2217; +} +{ +sub.f16x2 r2223, %116, %115; +} +{ +mul.f16x2 r2226, r2223, r2408; +} +{ +sub.f16x2 r2229, %110, %117; +} +{ +mul.f16x2 r2232, r2229, r2414; +} +{ +add.f16x2 r2235, r2226, r2232; +} +{ +sub.f16x2 r2238, r2220, r2235; +} +{ +add.f16x2 r2241, %113, %119; +} +{ +mul.f16x2 r2244, r2241, r2390; +} +{ +add.f16x2 r2247, %111, r2244; +} +{ +add.f16x2 r2250, %114, %112; +} +{ +mul.f16x2 r2253, r2250, r2399; +} +{ +add.f16x2 r2256, r2247, r2253; +} +{ +sub.f16x2 r2259, %116, %115; +} +{ +mul.f16x2 r2262, r2259, r2408; +} 
+{ +sub.f16x2 r2265, %110, %117; +} +{ +mul.f16x2 r2268, r2265, r2414; +} +{ +add.f16x2 r2271, r2262, r2268; +} +{ +add.f16x2 r2274, r2256, r2271; +} +{ +add.f16x2 r2277, %116, %115; +} +{ +mul.f16x2 r2280, r2277, r2318; +} +{ +add.f16x2 r2283, %118, r2280; +} +{ +add.f16x2 r2286, %110, %117; +} +{ +mul.f16x2 r2289, r2286, r2390; +} +{ +add.f16x2 r2292, r2283, r2289; +} +{ +sub.f16x2 r2295, %113, %119; +} +{ +mul.f16x2 r2298, r2295, r2336; +} +{ +sub.f16x2 r2301, %114, %112; +} +{ +mul.f16x2 r2304, r2301, r2408; +} +{ +add.f16x2 r2307, r2298, r2304; +} +{ +add.f16x2 r2310, r2292, r2307; +} +{ +add.f16x2 r2313, %116, %115; +} +{ +mul.f16x2 r2316, r2313, r2318; +} +{ +add.f16x2 r2319, %118, r2316; +} +{ +add.f16x2 r2322, %110, %117; +} +{ +mul.f16x2 r2325, r2322, r2390; +} +{ +add.f16x2 r2328, r2319, r2325; +} +{ +sub.f16x2 r2331, %113, %119; +} +{ +mul.f16x2 r2334, r2331, r2336; +} +{ +sub.f16x2 r2337, %114, %112; +} +{ +mul.f16x2 r2340, r2337, r2408; +} +{ +add.f16x2 r2343, r2334, r2340; +} +{ +sub.f16x2 r2346, r2328, r2343; +} +{ +add.f16x2 r2349, %116, %115; +} +{ +mul.f16x2 r2352, r2349, r2390; +} +{ +add.f16x2 r2355, %118, r2352; +} +{ +add.f16x2 r2358, %110, %117; +} +{ +mul.f16x2 r2361, r2358, r2399; +} +{ +add.f16x2 r2364, r2355, r2361; +} +{ +sub.f16x2 r2367, %113, %119; +} +{ +mul.f16x2 r2370, r2367, r2408; +} +{ +sub.f16x2 r2373, %114, %112; +} +{ +mul.f16x2 r2376, r2373, r2414; +} +{ +add.f16x2 r2379, r2370, r2376; +} +{ +add.f16x2 r2382, r2364, r2379; +} +{ +add.f16x2 r2385, %116, %115; +} +{ +mul.f16x2 r2388, r2385, r2390; +} +{ +add.f16x2 r2391, %118, r2388; +} +{ +add.f16x2 r2394, %110, %117; +} +{ +mul.f16x2 r2397, r2394, r2399; +} +{ +add.f16x2 r2400, r2391, r2397; +} +{ +sub.f16x2 r2403, %113, %119; +} +{ +mul.f16x2 r2406, r2403, r2408; +} +{ +sub.f16x2 r2409, %114, %112; +} +{ +mul.f16x2 r2412, r2409, r2414; +} +{ +add.f16x2 r2415, r2406, r2412; +} +{ +sub.f16x2 r2418, r2400, r2415; +} +{ +cvt.rn.f16.f64 rs97, fd115; +} +{ +cvt.rn.f16.f64 rs98, 
fd138; +} +{ +cvt.rn.f16.f64 rs99, fd119; +} +{ +cvt.rn.f16.f64 rs100, fd134; +} +{ +cvt.rn.f16.f64 rs101, fd123; +} +{ +cvt.rn.f16.f64 rs102, fd130; +} +{ +cvt.rn.f16.f64 rs103, fd127; +} +{ +cvt.rn.f16.f64 rs104, fd128; +} +{ +cvt.rn.f16.f64 rs107, fd135; +} +{ +cvt.rn.f16.f64 rs108, fd136; +} +{ +cvt.rn.f16.f64 rs111, fd139; +} +{ +cvt.rn.f16.f64 rs112, fd90; +} +mov.b32 r2435, {rs97, rs97}; +{ +mul.f16x2 r2421, r1854, r2435; +} +mov.b32 r2432, {rs98, rs98}; +{ +mul.f16x2 r2424, r1998, r2432; +} +{ +sub.f16x2 r2427, r2421, r2424; +} +{ +mul.f16x2 r2430, r1854, r2432; +} +{ +fma.rn.f16x2 r2433, r1998, r2435, r2430; +} +mov.b32 r2467, {rs99, rs99}; +{ +mul.f16x2 r2437, r2166, r2467; +} +mov.b32 r2464, {rs100, rs100}; +{ +mul.f16x2 r2440, r2310, r2464; +} +{ +sub.f16x2 r2443, r2437, r2440; +} +{ +mul.f16x2 r2446, r2166, r2464; +} +{ +fma.rn.f16x2 r2449, r2310, r2467, r2446; +} +{ +mul.f16x2 r2453, r1926, r2467; +} +{ +mul.f16x2 r2456, r2070, r2464; +} +{ +sub.f16x2 r2459, r2453, r2456; +} +{ +mul.f16x2 r2462, r1926, r2464; +} +{ +fma.rn.f16x2 r2465, r2070, r2467, r2462; +} +mov.b32 r2531, {rs103, rs103}; +{ +mul.f16x2 r2469, r2238, r2531; +} +mov.b32 r2528, {rs104, rs104}; +{ +mul.f16x2 r2472, r2382, r2528; +} +{ +sub.f16x2 r2475, r2469, r2472; +} +{ +mul.f16x2 r2478, r2238, r2528; +} +{ +fma.rn.f16x2 r2481, r2382, r2531, r2478; +} +mov.b32 r2499, {rs101, rs101}; +{ +mul.f16x2 r2485, r1962, r2499; +} +mov.b32 r2496, {rs102, rs102}; +{ +mul.f16x2 r2488, r2106, r2496; +} +{ +sub.f16x2 r2491, r2485, r2488; +} +{ +mul.f16x2 r2494, r1962, r2496; +} +{ +fma.rn.f16x2 r2497, r2106, r2499, r2494; +} +mov.b32 r2515, {rs107, rs107}; +{ +mul.f16x2 r2501, r2274, r2515; +} +mov.b32 r2512, {rs108, rs108}; +{ +mul.f16x2 r2504, r2418, r2512; +} +{ +sub.f16x2 r2507, r2501, r2504; +} +{ +mul.f16x2 r2510, r2274, r2512; +} +{ +fma.rn.f16x2 r2513, r2418, r2515, r2510; +} +{ +mul.f16x2 r2517, r1890, r2531; +} +{ +mul.f16x2 r2520, r2034, r2528; +} +{ +sub.f16x2 r2523, r2517, r2520; +} +{ 
+mul.f16x2 r2526, r1890, r2528; +} +{ +fma.rn.f16x2 r2529, r2034, r2531, r2526; +} +mov.b32 r2547, {rs111, rs111}; +{ +mul.f16x2 r2533, r2202, r2547; +} +mov.b32 r2544, {rs112, rs112}; +{ +mul.f16x2 r2536, r2346, r2544; +} +{ +sub.f16x2 r2539, r2533, r2536; +} +{ +mul.f16x2 r2542, r2202, r2544; +} +{ +fma.rn.f16x2 r2545, r2346, r2547, r2542; +} +{ +cvt.rn.f16.f64 rs125, fd173; +} +mov.b32 r2620, {rs125, rs125}; +{ +cvt.rn.f16.f64 rs126, fd172; +} +{ +neg.f16 rs127, rs126; +} +mov.b32 r2629, {rs127, rs127}; +{ +add.f16x2 r2549, r1806, r2118; +} +{ +add.f16x2 r2552, r1494, r2549; +} +{ +add.f16x2 r2555, r1818, r2130; +} +{ +add.f16x2 r2558, r1506, r2555; +} +{ +add.f16x2 r2561, r1806, r2118; +} +{ +mul.f16x2 r2564, r2561, r2620; +} +{ +add.f16x2 r2567, r1494, r2564; +} +{ +sub.f16x2 r2570, r1818, r2130; +} +{ +mul.f16x2 r2573, r2570, r2629; +} +{ +add.f16x2 r2576, r2567, r2573; +} +{ +add.f16x2 r2579, r1806, r2118; +} +{ +mul.f16x2 r2582, r2579, r2620; +} +{ +add.f16x2 r2585, r1494, r2582; +} +{ +sub.f16x2 r2588, r1818, r2130; +} +{ +mul.f16x2 r2591, r2588, r2629; +} +{ +sub.f16x2 r2594, r2585, r2591; +} +{ +add.f16x2 r2597, r1818, r2130; +} +{ +mul.f16x2 r2600, r2597, r2620; +} +{ +add.f16x2 r2603, r1506, r2600; +} +{ +sub.f16x2 r2606, r1806, r2118; +} +{ +mul.f16x2 r2609, r2606, r2629; +} +{ +sub.f16x2 r2612, r2603, r2609; +} +{ +add.f16x2 r2615, r1818, r2130; +} +{ +mul.f16x2 r2618, r2615, r2620; +} +{ +add.f16x2 r2621, r1506, r2618; +} +{ +sub.f16x2 r2624, r1806, r2118; +} +{ +mul.f16x2 r2627, r2624, r2629; +} +{ +add.f16x2 r2630, r2621, r2627; +} +{ +cvt.rn.f16.f64 rs129, fd173; +} +mov.b32 r2704, {rs129, rs129}; +{ +cvt.rn.f16.f64 rs130, fd172; +} +{ +neg.f16 rs131, rs130; +} +mov.b32 r2713, {rs131, rs131}; +{ +add.f16x2 r2633, r2427, r2443; +} +{ +add.f16x2 r2636, r1542, r2633; +} +{ +add.f16x2 r2639, r2433, r2449; +} +{ +add.f16x2 r2642, r1686, r2639; +} +{ +add.f16x2 r2645, r2427, r2443; +} +{ +mul.f16x2 r2648, r2645, r2704; +} +{ +add.f16x2 r2651, r1542, 
r2648; +} +{ +sub.f16x2 r2654, r2433, r2449; +} +{ +mul.f16x2 r2657, r2654, r2713; +} +{ +add.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2427, r2443; +} +{ +mul.f16x2 r2666, r2663, r2704; +} +{ +add.f16x2 r2669, r1542, r2666; +} +{ +sub.f16x2 r2672, r2433, r2449; +} +{ +mul.f16x2 r2675, r2672, r2713; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2433, r2449; +} +{ +mul.f16x2 r2684, r2681, r2704; +} +{ +add.f16x2 r2687, r1686, r2684; +} +{ +sub.f16x2 r2690, r2427, r2443; +} +{ +mul.f16x2 r2693, r2690, r2713; +} +{ +sub.f16x2 r2696, r2687, r2693; +} +{ +add.f16x2 r2699, r2433, r2449; +} +{ +mul.f16x2 r2702, r2699, r2704; +} +{ +add.f16x2 r2705, r1686, r2702; +} +{ +sub.f16x2 r2708, r2427, r2443; +} +{ +mul.f16x2 r2711, r2708, r2713; +} +{ +add.f16x2 r2714, r2705, r2711; +} +{ +cvt.rn.f16.f64 rs133, fd173; +} +mov.b32 r2788, {rs133, rs133}; +{ +cvt.rn.f16.f64 rs134, fd172; +} +{ +neg.f16 rs135, rs134; +} +mov.b32 r2797, {rs135, rs135}; +{ +add.f16x2 r2717, r2459, r2475; +} +{ +add.f16x2 r2720, r1614, r2717; +} +{ +add.f16x2 r2723, r2465, r2481; +} +{ +add.f16x2 r2726, r1758, r2723; +} +{ +add.f16x2 r2729, r2459, r2475; +} +{ +mul.f16x2 r2732, r2729, r2788; +} +{ +add.f16x2 r2735, r1614, r2732; +} +{ +sub.f16x2 r2738, r2465, r2481; +} +{ +mul.f16x2 r2741, r2738, r2797; +} +{ +add.f16x2 r2744, r2735, r2741; +} +{ +add.f16x2 r2747, r2459, r2475; +} +{ +mul.f16x2 r2750, r2747, r2788; +} +{ +add.f16x2 r2753, r1614, r2750; +} +{ +sub.f16x2 r2756, r2465, r2481; +} +{ +mul.f16x2 r2759, r2756, r2797; +} +{ +sub.f16x2 r2762, r2753, r2759; +} +{ +add.f16x2 r2765, r2465, r2481; +} +{ +mul.f16x2 r2768, r2765, r2788; +} +{ +add.f16x2 r2771, r1758, r2768; +} +{ +sub.f16x2 r2774, r2459, r2475; +} +{ +mul.f16x2 r2777, r2774, r2797; +} +{ +sub.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2465, r2481; +} +{ +mul.f16x2 r2786, r2783, r2788; +} +{ +add.f16x2 r2789, r1758, r2786; +} +{ +sub.f16x2 r2792, r2459, r2475; +} +{ +mul.f16x2 r2795, r2792, r2797; +} 
+{ +add.f16x2 r2798, r2789, r2795; +} +{ +cvt.rn.f16.f64 rs137, fd173; +} +mov.b32 r2872, {rs137, rs137}; +{ +cvt.rn.f16.f64 rs138, fd172; +} +{ +neg.f16 rs139, rs138; +} +mov.b32 r2881, {rs139, rs139}; +{ +add.f16x2 r2801, r2491, r2507; +} +{ +add.f16x2 r2804, r1650, r2801; +} +{ +add.f16x2 r2807, r2497, r2513; +} +{ +add.f16x2 r2810, r1794, r2807; +} +{ +add.f16x2 r2813, r2491, r2507; +} +{ +mul.f16x2 r2816, r2813, r2872; +} +{ +add.f16x2 r2819, r1650, r2816; +} +{ +sub.f16x2 r2822, r2497, r2513; +} +{ +mul.f16x2 r2825, r2822, r2881; +} +{ +add.f16x2 r2828, r2819, r2825; +} +{ +add.f16x2 r2831, r2491, r2507; +} +{ +mul.f16x2 r2834, r2831, r2872; +} +{ +add.f16x2 r2837, r1650, r2834; +} +{ +sub.f16x2 r2840, r2497, r2513; +} +{ +mul.f16x2 r2843, r2840, r2881; +} +{ +sub.f16x2 r2846, r2837, r2843; +} +{ +add.f16x2 r2849, r2497, r2513; +} +{ +mul.f16x2 r2852, r2849, r2872; +} +{ +add.f16x2 r2855, r1794, r2852; +} +{ +sub.f16x2 r2858, r2491, r2507; +} +{ +mul.f16x2 r2861, r2858, r2881; +} +{ +sub.f16x2 r2864, r2855, r2861; +} +{ +add.f16x2 r2867, r2497, r2513; +} +{ +mul.f16x2 r2870, r2867, r2872; +} +{ +add.f16x2 r2873, r1794, r2870; +} +{ +sub.f16x2 r2876, r2491, r2507; +} +{ +mul.f16x2 r2879, r2876, r2881; +} +{ +add.f16x2 r2882, r2873, r2879; +} +{ +cvt.rn.f16.f64 rs141, fd173; +} +mov.b32 r2956, {rs141, rs141}; +{ +cvt.rn.f16.f64 rs142, fd172; +} +{ +neg.f16 rs143, rs142; +} +mov.b32 r2965, {rs143, rs143}; +mov.f64 fd171, 0dBFCA9CD9AC4258F6; +{ +add.f16x2 r2885, r2523, r2539; +} +{ +add.f16x2 r2888, r1578, r2885; +} +{ +add.f16x2 r2891, r2529, r2545; +} +{ +add.f16x2 r2894, r1722, r2891; +} +{ +add.f16x2 r2897, r2523, r2539; +} +{ +mul.f16x2 r2900, r2897, r2956; +} +{ +add.f16x2 r2903, r1578, r2900; +} +{ +sub.f16x2 r2906, r2529, r2545; +} +{ +mul.f16x2 r2909, r2906, r2965; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +add.f16x2 r2915, r2523, r2539; +} +{ +mul.f16x2 r2918, r2915, r2956; +} +{ +add.f16x2 r2921, r1578, r2918; +} +{ +sub.f16x2 r2924, r2529, r2545; +} 
+{ +mul.f16x2 r2927, r2924, r2965; +} +{ +sub.f16x2 r2930, r2921, r2927; +} +{ +add.f16x2 r2933, r2529, r2545; +} +{ +mul.f16x2 r2936, r2933, r2956; +} +{ +add.f16x2 r2939, r1722, r2936; +} +{ +sub.f16x2 r2942, r2523, r2539; +} +{ +mul.f16x2 r2945, r2942, r2965; +} +{ +sub.f16x2 r2948, r2939, r2945; +} +{ +add.f16x2 r2951, r2529, r2545; +} +{ +mul.f16x2 r2954, r2951, r2956; +} +{ +add.f16x2 r2957, r1722, r2954; +} +{ +sub.f16x2 r2960, r2523, r2539; +} +{ +mul.f16x2 r2963, r2960, r2965; +} +{ +add.f16x2 r2966, r2957, r2963; +} +mov.f64 fd113, 0d3FEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs145, fd113; +} +{ +cvt.rn.f16.f64 rs146, fd171; +} +{ +cvt.rn.f16.f64 rs147, fd115; +} +{ +cvt.rn.f16.f64 rs148, fd138; +} +mov.f64 fd117, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs149, fd117; +} +{ +cvt.rn.f16.f64 rs150, fd136; +} +{ +cvt.rn.f16.f64 rs151, fd119; +} +{ +cvt.rn.f16.f64 rs152, fd134; +} +mov.f64 fd121, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs153, fd121; +} +{ +cvt.rn.f16.f64 rs154, fd172; +} +{ +cvt.rn.f16.f64 rs155, fd123; +} +{ +cvt.rn.f16.f64 rs156, fd130; +} +mov.f64 fd125, 0d3FBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs157, fd125; +} +{ +cvt.rn.f16.f64 rs158, fd128; +} +{ +cvt.rn.f16.f64 rs159, fd127; +} +{ +cvt.rn.f16.f64 rs160, fd128; +} +mov.f64 fd129, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs161, fd129; +} +{ +cvt.rn.f16.f64 rs162, fd130; +} +{ +cvt.rn.f16.f64 rs163, fd173; +} +{ +cvt.rn.f16.f64 rs164, fd172; +} +mov.f64 fd133, 0dBFE5698496E20BD8; +{ +cvt.rn.f16.f64 rs165, fd133; +} +{ +cvt.rn.f16.f64 rs166, fd134; +} +{ +cvt.rn.f16.f64 rs167, fd135; +} +{ +cvt.rn.f16.f64 rs168, fd136; +} +mov.f64 fd137, 0dBFED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs169, fd137; +} +{ +cvt.rn.f16.f64 rs170, fd138; +} +{ +cvt.rn.f16.f64 rs171, fd139; +} +{ +cvt.rn.f16.f64 rs172, fd171; +} +mov.b32 r2983, {rs145, rs145}; +{ +mul.f16x2 r2969, r2636, r2983; +} +mov.b32 r2980, {rs146, rs146}; +{ +mul.f16x2 r2972, r2642, r2980; +} +{ +sub.f16x2 r2975, r2969, r2972; +} +{ +mul.f16x2 r2978, r2636, 
r2980; +} +{ +fma.rn.f16x2 r2981, r2642, r2983, r2978; +} +mov.b32 r2999, {rs147, rs147}; +{ +mul.f16x2 r2985, r2720, r2999; +} +mov.b32 r2996, {rs148, rs148}; +{ +mul.f16x2 r2988, r2726, r2996; +} +{ +sub.f16x2 r2991, r2985, r2988; +} +{ +mul.f16x2 r2994, r2720, r2996; +} +{ +fma.rn.f16x2 r2997, r2726, r2999, r2994; +} +mov.b32 r3015, {rs149, rs149}; +{ +mul.f16x2 r3001, r2804, r3015; +} +mov.b32 r3012, {rs150, rs150}; +{ +mul.f16x2 r3004, r2810, r3012; +} +{ +sub.f16x2 r3007, r3001, r3004; +} +{ +mul.f16x2 r3010, r2804, r3012; +} +{ +fma.rn.f16x2 r3013, r2810, r3015, r3010; +} +mov.b32 r3031, {rs151, rs151}; +{ +mul.f16x2 r3017, r2888, r3031; +} +mov.b32 r3028, {rs152, rs152}; +{ +mul.f16x2 r3020, r2894, r3028; +} +{ +sub.f16x2 r3023, r3017, r3020; +} +{ +mul.f16x2 r3026, r2888, r3028; +} +{ +fma.rn.f16x2 r3029, r2894, r3031, r3026; +} +mov.b32 r3047, {rs153, rs153}; +{ +mul.f16x2 r3033, r2576, r3047; +} +mov.b32 r3044, {rs154, rs154}; +{ +mul.f16x2 r3036, r2612, r3044; +} +{ +sub.f16x2 r3039, r3033, r3036; +} +{ +mul.f16x2 r3042, r2576, r3044; +} +{ +fma.rn.f16x2 r3045, r2612, r3047, r3042; +} +mov.b32 r3063, {rs155, rs155}; +{ +mul.f16x2 r3049, r2660, r3063; +} +mov.b32 r3060, {rs156, rs156}; +{ +mul.f16x2 r3052, r2696, r3060; +} +{ +sub.f16x2 r3055, r3049, r3052; +} +{ +mul.f16x2 r3058, r2660, r3060; +} +{ +fma.rn.f16x2 r3061, r2696, r3063, r3058; +} +mov.b32 r3079, {rs157, rs157}; +{ +mul.f16x2 r3065, r2744, r3079; +} +mov.b32 r3076, {rs158, rs158}; +{ +mul.f16x2 r3068, r2780, r3076; +} +{ +sub.f16x2 r3071, r3065, r3068; +} +{ +mul.f16x2 r3074, r2744, r3076; +} +{ +fma.rn.f16x2 r3077, r2780, r3079, r3074; +} +mov.b32 r3095, {rs159, rs159}; +{ +mul.f16x2 r3081, r2828, r3095; +} +mov.b32 r3092, {rs160, rs160}; +{ +mul.f16x2 r3084, r2864, r3092; +} +{ +sub.f16x2 r3087, r3081, r3084; +} +{ +mul.f16x2 r3090, r2828, r3092; +} +{ +fma.rn.f16x2 r3093, r2864, r3095, r3090; +} +mov.b32 r3111, {rs161, rs161}; +{ +mul.f16x2 r3097, r2912, r3111; +} +mov.b32 r3108, {rs162, 
rs162}; +{ +mul.f16x2 r3100, r2948, r3108; +} +{ +sub.f16x2 r3103, r3097, r3100; +} +{ +mul.f16x2 r3106, r2912, r3108; +} +{ +fma.rn.f16x2 r3109, r2948, r3111, r3106; +} +mov.b32 r3127, {rs163, rs163}; +{ +mul.f16x2 r3113, r2594, r3127; +} +mov.b32 r3124, {rs164, rs164}; +{ +mul.f16x2 r3116, r2630, r3124; +} +{ +sub.f16x2 r3119, r3113, r3116; +} +{ +mul.f16x2 r3122, r2594, r3124; +} +{ +fma.rn.f16x2 r3125, r2630, r3127, r3122; +} +mov.b32 r3143, {rs165, rs165}; +{ +mul.f16x2 r3129, r2678, r3143; +} +mov.b32 r3140, {rs166, rs166}; +{ +mul.f16x2 r3132, r2714, r3140; +} +{ +sub.f16x2 r3135, r3129, r3132; +} +{ +mul.f16x2 r3138, r2678, r3140; +} +{ +fma.rn.f16x2 r3141, r2714, r3143, r3138; +} +mov.b32 r3159, {rs167, rs167}; +{ +mul.f16x2 r3145, r2762, r3159; +} +mov.b32 r3156, {rs168, rs168}; +{ +mul.f16x2 r3148, r2798, r3156; +} +{ +sub.f16x2 r3151, r3145, r3148; +} +{ +mul.f16x2 r3154, r2762, r3156; +} +{ +fma.rn.f16x2 r3157, r2798, r3159, r3154; +} +mov.b32 r3175, {rs169, rs169}; +{ +mul.f16x2 r3161, r2846, r3175; +} +mov.b32 r3172, {rs170, rs170}; +{ +mul.f16x2 r3164, r2882, r3172; +} +{ +sub.f16x2 r3167, r3161, r3164; +} +{ +mul.f16x2 r3170, r2846, r3172; +} +{ +fma.rn.f16x2 r3173, r2882, r3175, r3170; +} +mov.b32 r3191, {rs171, rs171}; +{ +mul.f16x2 r3177, r2930, r3191; +} +mov.b32 r3188, {rs172, rs172}; +{ +mul.f16x2 r3180, r2966, r3188; +} +{ +sub.f16x2 r3183, r3177, r3180; +} +{ +mul.f16x2 r3186, r2930, r3188; +} +{ +fma.rn.f16x2 r3189, r2966, r3191, r3186; +} +{ +add.f16x2 %0, r1068, r2552; +} +{ +add.f16x2 %1, r1074, r2558; +} +{ +sub.f16x2 %30, r1068, r2552; +} +{ +sub.f16x2 %31, r1074, r2558; +} +{ +add.f16x2 %2, r1152, r2975; +} +{ +add.f16x2 %3, r1158, r2981; +} +{ +sub.f16x2 %32, r1152, r2975; +} +{ +sub.f16x2 %33, r1158, r2981; +} +{ +add.f16x2 %4, r1236, r2991; +} +{ +add.f16x2 %5, r1242, r2997; +} +{ +sub.f16x2 %34, r1236, r2991; +} +{ +sub.f16x2 %35, r1242, r2997; +} +{ +add.f16x2 %6, r1320, r3007; +} +{ +add.f16x2 %7, r1326, r3013; +} +{ +sub.f16x2 
%36, r1320, r3007; +} +{ +sub.f16x2 %37, r1326, r3013; +} +{ +add.f16x2 %8, r1404, r3023; +} +{ +add.f16x2 %9, r1410, r3029; +} +{ +sub.f16x2 %38, r1404, r3023; +} +{ +sub.f16x2 %39, r1410, r3029; +} +{ +add.f16x2 %10, r1092, r3039; +} +{ +add.f16x2 %11, r1128, r3045; +} +{ +sub.f16x2 %40, r1092, r3039; +} +{ +sub.f16x2 %41, r1128, r3045; +} +{ +add.f16x2 %12, r1176, r3055; +} +{ +add.f16x2 %13, r1212, r3061; +} +{ +sub.f16x2 %42, r1176, r3055; +} +{ +sub.f16x2 %43, r1212, r3061; +} +{ +add.f16x2 %14, r1260, r3071; +} +{ +add.f16x2 %15, r1296, r3077; +} +{ +sub.f16x2 %44, r1260, r3071; +} +{ +sub.f16x2 %45, r1296, r3077; +} +{ +add.f16x2 %16, r1344, r3087; +} +{ +add.f16x2 %17, r1380, r3093; +} +{ +sub.f16x2 %46, r1344, r3087; +} +{ +sub.f16x2 %47, r1380, r3093; +} +{ +add.f16x2 %18, r1428, r3103; +} +{ +add.f16x2 %19, r1464, r3109; +} +{ +sub.f16x2 %48, r1428, r3103; +} +{ +sub.f16x2 %49, r1464, r3109; +} +{ +add.f16x2 %20, r1110, r3119; +} +{ +add.f16x2 %21, r1146, r3125; +} +{ +sub.f16x2 %50, r1110, r3119; +} +{ +sub.f16x2 %51, r1146, r3125; +} +{ +add.f16x2 %22, r1194, r3135; +} +{ +add.f16x2 %23, r1230, r3141; +} +{ +sub.f16x2 %52, r1194, r3135; +} +{ +sub.f16x2 %53, r1230, r3141; +} +{ +add.f16x2 %24, r1278, r3151; +} +{ +add.f16x2 %25, r1314, r3157; +} +{ +sub.f16x2 %54, r1278, r3151; +} +{ +sub.f16x2 %55, r1314, r3157; +} +{ +add.f16x2 %26, r1362, r3167; +} +{ +add.f16x2 %27, r1398, r3173; +} +{ +sub.f16x2 %56, r1362, r3167; +} +{ +sub.f16x2 %57, r1398, r3173; +} +{ +add.f16x2 %28, r1446, r3183; +} +{ +add.f16x2 %29, r1482, r3189; +} +{ +sub.f16x2 %58, r1446, r3183; +} +{ +sub.f16x2 %59, r1482, r3189; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), 
"=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)): "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[2].y)), 
"r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[29].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..48e8dd43fe9d4 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp16_inv.hpp.inc @@ -0,0 +1,3853 @@ +#ifndef CUFFTDX_FFT_30_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_30_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<962, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<195>; +.reg .b32 r<3433>; +.reg .f64 fd<175>; +.reg .b64 rd<3>; +mov.f64 fd123, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd123; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd74, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd74; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd135, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd135; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd72, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd72; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd123; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd74; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %61, %68; +} +{ +add.f16x2 r4, %62, r1; +} +{ +add.f16x2 r7, %64, %60; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %66, %63; +} +{ +add.f16x2 r16, %67, r13; +} +{ +add.f16x2 r19, %69, %65; +} +{ +add.f16x2 r22, r16, r19; +} +{ +add.f16x2 r25, %61, %68; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %62, r28; +} +{ +add.f16x2 r34, %64, %60; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %66, %63; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %69, %65; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 r58, r40, r55; +} +{ +add.f16x2 r61, %61, %68; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %62, r64; +} +{ +add.f16x2 r70, %64, %60; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %66, %63; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %69, %65; +} +{ +mul.f16x2 r88, r85, r300; +} +{ 
+add.f16x2 r91, r82, r88; +} +{ +add.f16x2 r94, r76, r91; +} +{ +add.f16x2 r97, %61, %68; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %62, r100; +} +{ +add.f16x2 r106, %64, %60; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %66, %63; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %69, %65; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 r130, r112, r127; +} +{ +add.f16x2 r133, %61, %68; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %62, r136; +} +{ +add.f16x2 r142, %64, %60; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %66, %63; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %69, %65; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 r166, r148, r163; +} +{ +add.f16x2 r169, %66, %63; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %67, r172; +} +{ +add.f16x2 r178, %69, %65; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %61, %68; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %64, %60; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 r202, r184, r199; +} +{ +add.f16x2 r205, %66, %63; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %67, r208; +} +{ +add.f16x2 r214, %69, %65; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %61, %68; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %64, %60; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 r238, r220, r235; +} +{ +add.f16x2 r241, %66, %63; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %67, r244; +} +{ +add.f16x2 r250, %69, %65; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %61, %68; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, 
%64, %60; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 r274, r256, r271; +} +{ +add.f16x2 r277, %66, %63; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %67, r280; +} +{ +add.f16x2 r286, %69, %65; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %61, %68; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %64, %60; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 r310, r292, r307; +} +{ +cvt.rn.f16.f64 rs11, fd123; +} +mov.b32 r522, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd74; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r540, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs15, fd135; +} +mov.b32 r594, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd72; +} +{ +neg.f16 rs17, rs16; +} +mov.b32 r612, {rs17, rs17}; +{ +cvt.rn.f16.f64 rs19, fd123; +} +mov.b32 r603, {rs19, rs19}; +{ +cvt.rn.f16.f64 rs20, fd74; +} +mov.b32 r618, {rs20, rs20}; +{ +add.f16x2 r313, %72, %79; +} +{ +add.f16x2 r316, %73, r313; +} +{ +add.f16x2 r319, %75, %71; +} +{ +add.f16x2 r322, r316, r319; +} +{ +add.f16x2 r325, %77, %74; +} +{ +add.f16x2 r328, %78, r325; +} +{ +add.f16x2 r331, %70, %76; +} +{ +add.f16x2 r334, r328, r331; +} +{ +add.f16x2 r337, %72, %79; +} +{ +mul.f16x2 r340, r337, r522; +} +{ +add.f16x2 r343, %73, r340; +} +{ +add.f16x2 r346, %75, %71; +} +{ +mul.f16x2 r349, r346, r594; +} +{ +add.f16x2 r352, r343, r349; +} +{ +sub.f16x2 r355, %77, %74; +} +{ +mul.f16x2 r358, r355, r540; +} +{ +sub.f16x2 r361, %70, %76; +} +{ +mul.f16x2 r364, r361, r612; +} +{ +add.f16x2 r367, r358, r364; +} +{ +sub.f16x2 r370, r352, r367; +} +{ +add.f16x2 r373, %72, %79; +} +{ +mul.f16x2 r376, r373, r522; +} +{ +add.f16x2 r379, %73, r376; +} +{ +add.f16x2 r382, %75, %71; +} +{ +mul.f16x2 r385, r382, r594; +} +{ +add.f16x2 r388, r379, r385; +} +{ +sub.f16x2 r391, %77, %74; +} +{ +mul.f16x2 r394, r391, r540; +} +{ +sub.f16x2 r397, %70, %76; +} +{ +mul.f16x2 r400, r397, r612; +} +{ +add.f16x2 
r403, r394, r400; +} +{ +add.f16x2 r406, r388, r403; +} +{ +add.f16x2 r409, %72, %79; +} +{ +mul.f16x2 r412, r409, r594; +} +{ +add.f16x2 r415, %73, r412; +} +{ +add.f16x2 r418, %75, %71; +} +{ +mul.f16x2 r421, r418, r603; +} +{ +add.f16x2 r424, r415, r421; +} +{ +sub.f16x2 r427, %77, %74; +} +{ +mul.f16x2 r430, r427, r612; +} +{ +sub.f16x2 r433, %70, %76; +} +{ +mul.f16x2 r436, r433, r618; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, r424, r439; +} +{ +add.f16x2 r445, %72, %79; +} +{ +mul.f16x2 r448, r445, r594; +} +{ +add.f16x2 r451, %73, r448; +} +{ +add.f16x2 r454, %75, %71; +} +{ +mul.f16x2 r457, r454, r603; +} +{ +add.f16x2 r460, r451, r457; +} +{ +sub.f16x2 r463, %77, %74; +} +{ +mul.f16x2 r466, r463, r612; +} +{ +sub.f16x2 r469, %70, %76; +} +{ +mul.f16x2 r472, r469, r618; +} +{ +add.f16x2 r475, r466, r472; +} +{ +add.f16x2 r478, r460, r475; +} +{ +add.f16x2 r481, %77, %74; +} +{ +mul.f16x2 r484, r481, r522; +} +{ +add.f16x2 r487, %78, r484; +} +{ +add.f16x2 r490, %70, %76; +} +{ +mul.f16x2 r493, r490, r594; +} +{ +add.f16x2 r496, r487, r493; +} +{ +sub.f16x2 r499, %72, %79; +} +{ +mul.f16x2 r502, r499, r540; +} +{ +sub.f16x2 r505, %75, %71; +} +{ +mul.f16x2 r508, r505, r612; +} +{ +add.f16x2 r511, r502, r508; +} +{ +add.f16x2 r514, r496, r511; +} +{ +add.f16x2 r517, %77, %74; +} +{ +mul.f16x2 r520, r517, r522; +} +{ +add.f16x2 r523, %78, r520; +} +{ +add.f16x2 r526, %70, %76; +} +{ +mul.f16x2 r529, r526, r594; +} +{ +add.f16x2 r532, r523, r529; +} +{ +sub.f16x2 r535, %72, %79; +} +{ +mul.f16x2 r538, r535, r540; +} +{ +sub.f16x2 r541, %75, %71; +} +{ +mul.f16x2 r544, r541, r612; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, r532, r547; +} +{ +add.f16x2 r553, %77, %74; +} +{ +mul.f16x2 r556, r553, r594; +} +{ +add.f16x2 r559, %78, r556; +} +{ +add.f16x2 r562, %70, %76; +} +{ +mul.f16x2 r565, r562, r603; +} +{ +add.f16x2 r568, r559, r565; +} +{ +sub.f16x2 r571, %72, %79; +} +{ +mul.f16x2 r574, r571, r612; +} +{ +sub.f16x2 r577, %75, 
%71; +} +{ +mul.f16x2 r580, r577, r618; +} +{ +add.f16x2 r583, r574, r580; +} +{ +add.f16x2 r586, r568, r583; +} +{ +add.f16x2 r589, %77, %74; +} +{ +mul.f16x2 r592, r589, r594; +} +{ +add.f16x2 r595, %78, r592; +} +{ +add.f16x2 r598, %70, %76; +} +{ +mul.f16x2 r601, r598, r603; +} +{ +add.f16x2 r604, r595, r601; +} +{ +sub.f16x2 r607, %72, %79; +} +{ +mul.f16x2 r610, r607, r612; +} +{ +sub.f16x2 r613, %75, %71; +} +{ +mul.f16x2 r616, r613, r618; +} +{ +add.f16x2 r619, r610, r616; +} +{ +sub.f16x2 r622, r604, r619; +} +{ +cvt.rn.f16.f64 rs21, fd123; +} +mov.b32 r834, {rs21, rs21}; +{ +cvt.rn.f16.f64 rs22, fd74; +} +{ +neg.f16 rs23, rs22; +} +mov.b32 r852, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs25, fd135; +} +mov.b32 r906, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd72; +} +{ +neg.f16 rs27, rs26; +} +mov.b32 r924, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs29, fd123; +} +mov.b32 r915, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd74; +} +mov.b32 r930, {rs30, rs30}; +{ +add.f16x2 r625, %83, %81; +} +{ +add.f16x2 r628, %85, r625; +} +{ +add.f16x2 r631, %87, %84; +} +{ +add.f16x2 r634, r628, r631; +} +{ +add.f16x2 r637, %89, %86; +} +{ +add.f16x2 r640, %80, r637; +} +{ +add.f16x2 r643, %82, %88; +} +{ +add.f16x2 r646, r640, r643; +} +{ +add.f16x2 r649, %83, %81; +} +{ +mul.f16x2 r652, r649, r834; +} +{ +add.f16x2 r655, %85, r652; +} +{ +add.f16x2 r658, %87, %84; +} +{ +mul.f16x2 r661, r658, r906; +} +{ +add.f16x2 r664, r655, r661; +} +{ +sub.f16x2 r667, %89, %86; +} +{ +mul.f16x2 r670, r667, r852; +} +{ +sub.f16x2 r673, %82, %88; +} +{ +mul.f16x2 r676, r673, r924; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 r682, r664, r679; +} +{ +add.f16x2 r685, %83, %81; +} +{ +mul.f16x2 r688, r685, r834; +} +{ +add.f16x2 r691, %85, r688; +} +{ +add.f16x2 r694, %87, %84; +} +{ +mul.f16x2 r697, r694, r906; +} +{ +add.f16x2 r700, r691, r697; +} +{ +sub.f16x2 r703, %89, %86; +} +{ +mul.f16x2 r706, r703, r852; +} +{ +sub.f16x2 r709, %82, %88; +} +{ +mul.f16x2 r712, r709, r924; +} +{ +add.f16x2 
r715, r706, r712; +} +{ +add.f16x2 r718, r700, r715; +} +{ +add.f16x2 r721, %83, %81; +} +{ +mul.f16x2 r724, r721, r906; +} +{ +add.f16x2 r727, %85, r724; +} +{ +add.f16x2 r730, %87, %84; +} +{ +mul.f16x2 r733, r730, r915; +} +{ +add.f16x2 r736, r727, r733; +} +{ +sub.f16x2 r739, %89, %86; +} +{ +mul.f16x2 r742, r739, r924; +} +{ +sub.f16x2 r745, %82, %88; +} +{ +mul.f16x2 r748, r745, r930; +} +{ +add.f16x2 r751, r742, r748; +} +{ +sub.f16x2 r754, r736, r751; +} +{ +add.f16x2 r757, %83, %81; +} +{ +mul.f16x2 r760, r757, r906; +} +{ +add.f16x2 r763, %85, r760; +} +{ +add.f16x2 r766, %87, %84; +} +{ +mul.f16x2 r769, r766, r915; +} +{ +add.f16x2 r772, r763, r769; +} +{ +sub.f16x2 r775, %89, %86; +} +{ +mul.f16x2 r778, r775, r924; +} +{ +sub.f16x2 r781, %82, %88; +} +{ +mul.f16x2 r784, r781, r930; +} +{ +add.f16x2 r787, r778, r784; +} +{ +add.f16x2 r790, r772, r787; +} +{ +add.f16x2 r793, %89, %86; +} +{ +mul.f16x2 r796, r793, r834; +} +{ +add.f16x2 r799, %80, r796; +} +{ +add.f16x2 r802, %82, %88; +} +{ +mul.f16x2 r805, r802, r906; +} +{ +add.f16x2 r808, r799, r805; +} +{ +sub.f16x2 r811, %83, %81; +} +{ +mul.f16x2 r814, r811, r852; +} +{ +sub.f16x2 r817, %87, %84; +} +{ +mul.f16x2 r820, r817, r924; +} +{ +add.f16x2 r823, r814, r820; +} +{ +add.f16x2 r826, r808, r823; +} +{ +add.f16x2 r829, %89, %86; +} +{ +mul.f16x2 r832, r829, r834; +} +{ +add.f16x2 r835, %80, r832; +} +{ +add.f16x2 r838, %82, %88; +} +{ +mul.f16x2 r841, r838, r906; +} +{ +add.f16x2 r844, r835, r841; +} +{ +sub.f16x2 r847, %83, %81; +} +{ +mul.f16x2 r850, r847, r852; +} +{ +sub.f16x2 r853, %87, %84; +} +{ +mul.f16x2 r856, r853, r924; +} +{ +add.f16x2 r859, r850, r856; +} +{ +sub.f16x2 r862, r844, r859; +} +{ +add.f16x2 r865, %89, %86; +} +{ +mul.f16x2 r868, r865, r906; +} +{ +add.f16x2 r871, %80, r868; +} +{ +add.f16x2 r874, %82, %88; +} +{ +mul.f16x2 r877, r874, r915; +} +{ +add.f16x2 r880, r871, r877; +} +{ +sub.f16x2 r883, %83, %81; +} +{ +mul.f16x2 r886, r883, r924; +} +{ +sub.f16x2 r889, %87, 
%84; +} +{ +mul.f16x2 r892, r889, r930; +} +{ +add.f16x2 r895, r886, r892; +} +{ +add.f16x2 r898, r880, r895; +} +{ +add.f16x2 r901, %89, %86; +} +{ +mul.f16x2 r904, r901, r906; +} +{ +add.f16x2 r907, %80, r904; +} +{ +add.f16x2 r910, %82, %88; +} +{ +mul.f16x2 r913, r910, r915; +} +{ +add.f16x2 r916, r907, r913; +} +{ +sub.f16x2 r919, %83, %81; +} +{ +mul.f16x2 r922, r919, r924; +} +{ +sub.f16x2 r925, %87, %84; +} +{ +mul.f16x2 r928, r925, r930; +} +{ +add.f16x2 r931, r922, r928; +} +{ +sub.f16x2 r934, r916, r931; +} +mov.f64 fd115, 0d3FED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs31, fd115; +} +mov.f64 fd138, 0d3FDA07F921061AD1; +{ +cvt.rn.f16.f64 rs32, fd138; +} +mov.f64 fd119, 0d3FE5698496E20BD8; +{ +cvt.rn.f16.f64 rs33, fd119; +} +mov.f64 fd134, 0d3FE7C7D7A833BEC2; +{ +cvt.rn.f16.f64 rs34, fd134; +} +{ +cvt.rn.f16.f64 rs35, fd123; +} +mov.f64 fd130, 0d3FEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs36, fd130; +} +mov.f64 fd127, 0dBFBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs37, fd127; +} +mov.f64 fd128, 0d3FEFD31F94F867C6; +{ +cvt.rn.f16.f64 rs38, fd128; +} +{ +cvt.rn.f16.f64 rs41, fd135; +} +mov.f64 fd136, 0d3FE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs42, fd136; +} +mov.f64 fd139, 0dBFEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs45, fd139; +} +mov.f64 fd90, 0dBFCA9CD9AC4258F6; +{ +cvt.rn.f16.f64 rs46, fd90; +} +mov.b32 r951, {rs31, rs31}; +{ +mul.f16x2 r937, r370, r951; +} +mov.b32 r948, {rs32, rs32}; +{ +mul.f16x2 r940, r514, r948; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r370, r948; +} +{ +fma.rn.f16x2 r949, r514, r951, r946; +} +mov.b32 r983, {rs33, rs33}; +{ +mul.f16x2 r953, r682, r983; +} +mov.b32 r980, {rs34, rs34}; +{ +mul.f16x2 r956, r826, r980; +} +{ +sub.f16x2 r959, r953, r956; +} +{ +mul.f16x2 r962, r682, r980; +} +{ +fma.rn.f16x2 r965, r826, r983, r962; +} +{ +mul.f16x2 r969, r442, r983; +} +{ +mul.f16x2 r972, r586, r980; +} +{ +sub.f16x2 r975, r969, r972; +} +{ +mul.f16x2 r978, r442, r980; +} +{ +fma.rn.f16x2 r981, r586, r983, r978; +} +mov.b32 r1047, {rs37, rs37}; 
+{ +mul.f16x2 r985, r754, r1047; +} +mov.b32 r1044, {rs38, rs38}; +{ +mul.f16x2 r988, r898, r1044; +} +{ +sub.f16x2 r991, r985, r988; +} +{ +mul.f16x2 r994, r754, r1044; +} +{ +fma.rn.f16x2 r997, r898, r1047, r994; +} +mov.b32 r1015, {rs35, rs35}; +{ +mul.f16x2 r1001, r478, r1015; +} +mov.b32 r1012, {rs36, rs36}; +{ +mul.f16x2 r1004, r622, r1012; +} +{ +sub.f16x2 r1007, r1001, r1004; +} +{ +mul.f16x2 r1010, r478, r1012; +} +{ +fma.rn.f16x2 r1013, r622, r1015, r1010; +} +mov.b32 r1031, {rs41, rs41}; +{ +mul.f16x2 r1017, r790, r1031; +} +mov.b32 r1028, {rs42, rs42}; +{ +mul.f16x2 r1020, r934, r1028; +} +{ +sub.f16x2 r1023, r1017, r1020; +} +{ +mul.f16x2 r1026, r790, r1028; +} +{ +fma.rn.f16x2 r1029, r934, r1031, r1026; +} +{ +mul.f16x2 r1033, r406, r1047; +} +{ +mul.f16x2 r1036, r550, r1044; +} +{ +sub.f16x2 r1039, r1033, r1036; +} +{ +mul.f16x2 r1042, r406, r1044; +} +{ +fma.rn.f16x2 r1045, r550, r1047, r1042; +} +mov.b32 r1063, {rs45, rs45}; +{ +mul.f16x2 r1049, r718, r1063; +} +mov.b32 r1060, {rs46, rs46}; +mov.f64 fd174, 0dBFE0000000000000; +{ +mul.f16x2 r1052, r862, r1060; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r718, r1060; +} +{ +fma.rn.f16x2 r1061, r862, r1063, r1058; +} +{ +cvt.rn.f16.f64 rs59, fd174; +} +mov.b32 r1136, {rs59, rs59}; +mov.f64 fd173, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs60, fd173; +} +mov.b32 r1145, {rs60, rs60}; +{ +add.f16x2 r1065, r322, r634; +} +{ +add.f16x2 r1068, r10, r1065; +} +{ +add.f16x2 r1071, r334, r646; +} +{ +add.f16x2 r1074, r22, r1071; +} +{ +add.f16x2 r1077, r322, r634; +} +{ +mul.f16x2 r1080, r1077, r1136; +} +{ +add.f16x2 r1083, r10, r1080; +} +{ +sub.f16x2 r1086, r334, r646; +} +{ +mul.f16x2 r1089, r1086, r1145; +} +{ +add.f16x2 r1092, r1083, r1089; +} +{ +add.f16x2 r1095, r322, r634; +} +{ +mul.f16x2 r1098, r1095, r1136; +} +{ +add.f16x2 r1101, r10, r1098; +} +{ +sub.f16x2 r1104, r334, r646; +} +{ +mul.f16x2 r1107, r1104, r1145; +} +{ +sub.f16x2 r1110, r1101, r1107; +} +{ +add.f16x2 r1113, r334, 
r646; +} +{ +mul.f16x2 r1116, r1113, r1136; +} +{ +add.f16x2 r1119, r22, r1116; +} +{ +sub.f16x2 r1122, r322, r634; +} +{ +mul.f16x2 r1125, r1122, r1145; +} +{ +sub.f16x2 r1128, r1119, r1125; +} +{ +add.f16x2 r1131, r334, r646; +} +{ +mul.f16x2 r1134, r1131, r1136; +} +{ +add.f16x2 r1137, r22, r1134; +} +{ +sub.f16x2 r1140, r322, r634; +} +{ +mul.f16x2 r1143, r1140, r1145; +} +{ +add.f16x2 r1146, r1137, r1143; +} +{ +cvt.rn.f16.f64 rs61, fd174; +} +mov.b32 r1220, {rs61, rs61}; +{ +cvt.rn.f16.f64 rs62, fd173; +} +mov.b32 r1229, {rs62, rs62}; +{ +add.f16x2 r1149, r943, r959; +} +{ +add.f16x2 r1152, r58, r1149; +} +{ +add.f16x2 r1155, r949, r965; +} +{ +add.f16x2 r1158, r202, r1155; +} +{ +add.f16x2 r1161, r943, r959; +} +{ +mul.f16x2 r1164, r1161, r1220; +} +{ +add.f16x2 r1167, r58, r1164; +} +{ +sub.f16x2 r1170, r949, r965; +} +{ +mul.f16x2 r1173, r1170, r1229; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +add.f16x2 r1179, r943, r959; +} +{ +mul.f16x2 r1182, r1179, r1220; +} +{ +add.f16x2 r1185, r58, r1182; +} +{ +sub.f16x2 r1188, r949, r965; +} +{ +mul.f16x2 r1191, r1188, r1229; +} +{ +sub.f16x2 r1194, r1185, r1191; +} +{ +add.f16x2 r1197, r949, r965; +} +{ +mul.f16x2 r1200, r1197, r1220; +} +{ +add.f16x2 r1203, r202, r1200; +} +{ +sub.f16x2 r1206, r943, r959; +} +{ +mul.f16x2 r1209, r1206, r1229; +} +{ +sub.f16x2 r1212, r1203, r1209; +} +{ +add.f16x2 r1215, r949, r965; +} +{ +mul.f16x2 r1218, r1215, r1220; +} +{ +add.f16x2 r1221, r202, r1218; +} +{ +sub.f16x2 r1224, r943, r959; +} +{ +mul.f16x2 r1227, r1224, r1229; +} +{ +add.f16x2 r1230, r1221, r1227; +} +{ +cvt.rn.f16.f64 rs63, fd174; +} +mov.b32 r1304, {rs63, rs63}; +{ +cvt.rn.f16.f64 rs64, fd173; +} +mov.b32 r1313, {rs64, rs64}; +{ +add.f16x2 r1233, r975, r991; +} +{ +add.f16x2 r1236, r130, r1233; +} +{ +add.f16x2 r1239, r981, r997; +} +{ +add.f16x2 r1242, r274, r1239; +} +{ +add.f16x2 r1245, r975, r991; +} +{ +mul.f16x2 r1248, r1245, r1304; +} +{ +add.f16x2 r1251, r130, r1248; +} +{ +sub.f16x2 r1254, r981, 
r997; +} +{ +mul.f16x2 r1257, r1254, r1313; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +add.f16x2 r1263, r975, r991; +} +{ +mul.f16x2 r1266, r1263, r1304; +} +{ +add.f16x2 r1269, r130, r1266; +} +{ +sub.f16x2 r1272, r981, r997; +} +{ +mul.f16x2 r1275, r1272, r1313; +} +{ +sub.f16x2 r1278, r1269, r1275; +} +{ +add.f16x2 r1281, r981, r997; +} +{ +mul.f16x2 r1284, r1281, r1304; +} +{ +add.f16x2 r1287, r274, r1284; +} +{ +sub.f16x2 r1290, r975, r991; +} +{ +mul.f16x2 r1293, r1290, r1313; +} +{ +sub.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r981, r997; +} +{ +mul.f16x2 r1302, r1299, r1304; +} +{ +add.f16x2 r1305, r274, r1302; +} +{ +sub.f16x2 r1308, r975, r991; +} +{ +mul.f16x2 r1311, r1308, r1313; +} +{ +add.f16x2 r1314, r1305, r1311; +} +{ +cvt.rn.f16.f64 rs65, fd174; +} +mov.b32 r1388, {rs65, rs65}; +{ +cvt.rn.f16.f64 rs66, fd173; +} +mov.b32 r1397, {rs66, rs66}; +{ +add.f16x2 r1317, r1007, r1023; +} +{ +add.f16x2 r1320, r166, r1317; +} +{ +add.f16x2 r1323, r1013, r1029; +} +{ +add.f16x2 r1326, r310, r1323; +} +{ +add.f16x2 r1329, r1007, r1023; +} +{ +mul.f16x2 r1332, r1329, r1388; +} +{ +add.f16x2 r1335, r166, r1332; +} +{ +sub.f16x2 r1338, r1013, r1029; +} +{ +mul.f16x2 r1341, r1338, r1397; +} +{ +add.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, r1007, r1023; +} +{ +mul.f16x2 r1350, r1347, r1388; +} +{ +add.f16x2 r1353, r166, r1350; +} +{ +sub.f16x2 r1356, r1013, r1029; +} +{ +mul.f16x2 r1359, r1356, r1397; +} +{ +sub.f16x2 r1362, r1353, r1359; +} +{ +add.f16x2 r1365, r1013, r1029; +} +{ +mul.f16x2 r1368, r1365, r1388; +} +{ +add.f16x2 r1371, r310, r1368; +} +{ +sub.f16x2 r1374, r1007, r1023; +} +{ +mul.f16x2 r1377, r1374, r1397; +} +{ +sub.f16x2 r1380, r1371, r1377; +} +{ +add.f16x2 r1383, r1013, r1029; +} +{ +mul.f16x2 r1386, r1383, r1388; +} +{ +add.f16x2 r1389, r310, r1386; +} +{ +sub.f16x2 r1392, r1007, r1023; +} +{ +mul.f16x2 r1395, r1392, r1397; +} +{ +add.f16x2 r1398, r1389, r1395; +} +{ +cvt.rn.f16.f64 rs67, fd174; +} +mov.b32 r1472, {rs67, 
rs67}; +{ +cvt.rn.f16.f64 rs68, fd173; +} +mov.b32 r1481, {rs68, rs68}; +{ +add.f16x2 r1401, r1039, r1055; +} +{ +add.f16x2 r1404, r94, r1401; +} +{ +add.f16x2 r1407, r1045, r1061; +} +{ +add.f16x2 r1410, r238, r1407; +} +{ +add.f16x2 r1413, r1039, r1055; +} +{ +mul.f16x2 r1416, r1413, r1472; +} +{ +add.f16x2 r1419, r94, r1416; +} +{ +sub.f16x2 r1422, r1045, r1061; +} +{ +mul.f16x2 r1425, r1422, r1481; +} +{ +add.f16x2 r1428, r1419, r1425; +} +{ +add.f16x2 r1431, r1039, r1055; +} +{ +mul.f16x2 r1434, r1431, r1472; +} +{ +add.f16x2 r1437, r94, r1434; +} +{ +sub.f16x2 r1440, r1045, r1061; +} +{ +mul.f16x2 r1443, r1440, r1481; +} +{ +sub.f16x2 r1446, r1437, r1443; +} +{ +add.f16x2 r1449, r1045, r1061; +} +{ +mul.f16x2 r1452, r1449, r1472; +} +{ +add.f16x2 r1455, r238, r1452; +} +{ +sub.f16x2 r1458, r1039, r1055; +} +{ +mul.f16x2 r1461, r1458, r1481; +} +{ +sub.f16x2 r1464, r1455, r1461; +} +{ +add.f16x2 r1467, r1045, r1061; +} +{ +mul.f16x2 r1470, r1467, r1472; +} +{ +add.f16x2 r1473, r238, r1470; +} +{ +sub.f16x2 r1476, r1039, r1055; +} +{ +mul.f16x2 r1479, r1476, r1481; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +cvt.rn.f16.f64 rs69, fd123; +} +mov.b32 r1694, {rs69, rs69}; +{ +cvt.rn.f16.f64 rs70, fd74; +} +{ +neg.f16 rs71, rs70; +} +mov.b32 r1712, {rs71, rs71}; +{ +cvt.rn.f16.f64 rs73, fd135; +} +mov.b32 r1766, {rs73, rs73}; +{ +cvt.rn.f16.f64 rs74, fd72; +} +{ +neg.f16 rs75, rs74; +} +mov.b32 r1784, {rs75, rs75}; +{ +cvt.rn.f16.f64 rs77, fd123; +} +mov.b32 r1775, {rs77, rs77}; +{ +cvt.rn.f16.f64 rs78, fd74; +} +mov.b32 r1790, {rs78, rs78}; +{ +add.f16x2 r1485, %92, %99; +} +{ +add.f16x2 r1488, %93, r1485; +} +{ +add.f16x2 r1491, %95, %91; +} +{ +add.f16x2 r1494, r1488, r1491; +} +{ +add.f16x2 r1497, %97, %94; +} +{ +add.f16x2 r1500, %98, r1497; +} +{ +add.f16x2 r1503, %90, %96; +} +{ +add.f16x2 r1506, r1500, r1503; +} +{ +add.f16x2 r1509, %92, %99; +} +{ +mul.f16x2 r1512, r1509, r1694; +} +{ +add.f16x2 r1515, %93, r1512; +} +{ +add.f16x2 r1518, %95, %91; +} +{ 
+mul.f16x2 r1521, r1518, r1766; +} +{ +add.f16x2 r1524, r1515, r1521; +} +{ +sub.f16x2 r1527, %97, %94; +} +{ +mul.f16x2 r1530, r1527, r1712; +} +{ +sub.f16x2 r1533, %90, %96; +} +{ +mul.f16x2 r1536, r1533, r1784; +} +{ +add.f16x2 r1539, r1530, r1536; +} +{ +sub.f16x2 r1542, r1524, r1539; +} +{ +add.f16x2 r1545, %92, %99; +} +{ +mul.f16x2 r1548, r1545, r1694; +} +{ +add.f16x2 r1551, %93, r1548; +} +{ +add.f16x2 r1554, %95, %91; +} +{ +mul.f16x2 r1557, r1554, r1766; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +sub.f16x2 r1563, %97, %94; +} +{ +mul.f16x2 r1566, r1563, r1712; +} +{ +sub.f16x2 r1569, %90, %96; +} +{ +mul.f16x2 r1572, r1569, r1784; +} +{ +add.f16x2 r1575, r1566, r1572; +} +{ +add.f16x2 r1578, r1560, r1575; +} +{ +add.f16x2 r1581, %92, %99; +} +{ +mul.f16x2 r1584, r1581, r1766; +} +{ +add.f16x2 r1587, %93, r1584; +} +{ +add.f16x2 r1590, %95, %91; +} +{ +mul.f16x2 r1593, r1590, r1775; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, %97, %94; +} +{ +mul.f16x2 r1602, r1599, r1784; +} +{ +sub.f16x2 r1605, %90, %96; +} +{ +mul.f16x2 r1608, r1605, r1790; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r1596, r1611; +} +{ +add.f16x2 r1617, %92, %99; +} +{ +mul.f16x2 r1620, r1617, r1766; +} +{ +add.f16x2 r1623, %93, r1620; +} +{ +add.f16x2 r1626, %95, %91; +} +{ +mul.f16x2 r1629, r1626, r1775; +} +{ +add.f16x2 r1632, r1623, r1629; +} +{ +sub.f16x2 r1635, %97, %94; +} +{ +mul.f16x2 r1638, r1635, r1784; +} +{ +sub.f16x2 r1641, %90, %96; +} +{ +mul.f16x2 r1644, r1641, r1790; +} +{ +add.f16x2 r1647, r1638, r1644; +} +{ +add.f16x2 r1650, r1632, r1647; +} +{ +add.f16x2 r1653, %97, %94; +} +{ +mul.f16x2 r1656, r1653, r1694; +} +{ +add.f16x2 r1659, %98, r1656; +} +{ +add.f16x2 r1662, %90, %96; +} +{ +mul.f16x2 r1665, r1662, r1766; +} +{ +add.f16x2 r1668, r1659, r1665; +} +{ +sub.f16x2 r1671, %92, %99; +} +{ +mul.f16x2 r1674, r1671, r1712; +} +{ +sub.f16x2 r1677, %95, %91; +} +{ +mul.f16x2 r1680, r1677, r1784; +} +{ +add.f16x2 r1683, r1674, 
r1680; +} +{ +add.f16x2 r1686, r1668, r1683; +} +{ +add.f16x2 r1689, %97, %94; +} +{ +mul.f16x2 r1692, r1689, r1694; +} +{ +add.f16x2 r1695, %98, r1692; +} +{ +add.f16x2 r1698, %90, %96; +} +{ +mul.f16x2 r1701, r1698, r1766; +} +{ +add.f16x2 r1704, r1695, r1701; +} +{ +sub.f16x2 r1707, %92, %99; +} +{ +mul.f16x2 r1710, r1707, r1712; +} +{ +sub.f16x2 r1713, %95, %91; +} +{ +mul.f16x2 r1716, r1713, r1784; +} +{ +add.f16x2 r1719, r1710, r1716; +} +{ +sub.f16x2 r1722, r1704, r1719; +} +{ +add.f16x2 r1725, %97, %94; +} +{ +mul.f16x2 r1728, r1725, r1766; +} +{ +add.f16x2 r1731, %98, r1728; +} +{ +add.f16x2 r1734, %90, %96; +} +{ +mul.f16x2 r1737, r1734, r1775; +} +{ +add.f16x2 r1740, r1731, r1737; +} +{ +sub.f16x2 r1743, %92, %99; +} +{ +mul.f16x2 r1746, r1743, r1784; +} +{ +sub.f16x2 r1749, %95, %91; +} +{ +mul.f16x2 r1752, r1749, r1790; +} +{ +add.f16x2 r1755, r1746, r1752; +} +{ +add.f16x2 r1758, r1740, r1755; +} +{ +add.f16x2 r1761, %97, %94; +} +{ +mul.f16x2 r1764, r1761, r1766; +} +{ +add.f16x2 r1767, %98, r1764; +} +{ +add.f16x2 r1770, %90, %96; +} +{ +mul.f16x2 r1773, r1770, r1775; +} +{ +add.f16x2 r1776, r1767, r1773; +} +{ +sub.f16x2 r1779, %92, %99; +} +{ +mul.f16x2 r1782, r1779, r1784; +} +{ +sub.f16x2 r1785, %95, %91; +} +{ +mul.f16x2 r1788, r1785, r1790; +} +{ +add.f16x2 r1791, r1782, r1788; +} +{ +sub.f16x2 r1794, r1776, r1791; +} +{ +cvt.rn.f16.f64 rs79, fd123; +} +mov.b32 r2006, {rs79, rs79}; +{ +cvt.rn.f16.f64 rs80, fd74; +} +{ +neg.f16 rs81, rs80; +} +mov.b32 r2024, {rs81, rs81}; +{ +cvt.rn.f16.f64 rs83, fd135; +} +mov.b32 r2078, {rs83, rs83}; +{ +cvt.rn.f16.f64 rs84, fd72; +} +{ +neg.f16 rs85, rs84; +} +mov.b32 r2096, {rs85, rs85}; +{ +cvt.rn.f16.f64 rs87, fd123; +} +mov.b32 r2087, {rs87, rs87}; +{ +cvt.rn.f16.f64 rs88, fd74; +} +mov.b32 r2102, {rs88, rs88}; +{ +add.f16x2 r1797, %103, %101; +} +{ +add.f16x2 r1800, %105, r1797; +} +{ +add.f16x2 r1803, %107, %104; +} +{ +add.f16x2 r1806, r1800, r1803; +} +{ +add.f16x2 r1809, %109, %106; +} +{ +add.f16x2 
r1812, %100, r1809; +} +{ +add.f16x2 r1815, %102, %108; +} +{ +add.f16x2 r1818, r1812, r1815; +} +{ +add.f16x2 r1821, %103, %101; +} +{ +mul.f16x2 r1824, r1821, r2006; +} +{ +add.f16x2 r1827, %105, r1824; +} +{ +add.f16x2 r1830, %107, %104; +} +{ +mul.f16x2 r1833, r1830, r2078; +} +{ +add.f16x2 r1836, r1827, r1833; +} +{ +sub.f16x2 r1839, %109, %106; +} +{ +mul.f16x2 r1842, r1839, r2024; +} +{ +sub.f16x2 r1845, %102, %108; +} +{ +mul.f16x2 r1848, r1845, r2096; +} +{ +add.f16x2 r1851, r1842, r1848; +} +{ +sub.f16x2 r1854, r1836, r1851; +} +{ +add.f16x2 r1857, %103, %101; +} +{ +mul.f16x2 r1860, r1857, r2006; +} +{ +add.f16x2 r1863, %105, r1860; +} +{ +add.f16x2 r1866, %107, %104; +} +{ +mul.f16x2 r1869, r1866, r2078; +} +{ +add.f16x2 r1872, r1863, r1869; +} +{ +sub.f16x2 r1875, %109, %106; +} +{ +mul.f16x2 r1878, r1875, r2024; +} +{ +sub.f16x2 r1881, %102, %108; +} +{ +mul.f16x2 r1884, r1881, r2096; +} +{ +add.f16x2 r1887, r1878, r1884; +} +{ +add.f16x2 r1890, r1872, r1887; +} +{ +add.f16x2 r1893, %103, %101; +} +{ +mul.f16x2 r1896, r1893, r2078; +} +{ +add.f16x2 r1899, %105, r1896; +} +{ +add.f16x2 r1902, %107, %104; +} +{ +mul.f16x2 r1905, r1902, r2087; +} +{ +add.f16x2 r1908, r1899, r1905; +} +{ +sub.f16x2 r1911, %109, %106; +} +{ +mul.f16x2 r1914, r1911, r2096; +} +{ +sub.f16x2 r1917, %102, %108; +} +{ +mul.f16x2 r1920, r1917, r2102; +} +{ +add.f16x2 r1923, r1914, r1920; +} +{ +sub.f16x2 r1926, r1908, r1923; +} +{ +add.f16x2 r1929, %103, %101; +} +{ +mul.f16x2 r1932, r1929, r2078; +} +{ +add.f16x2 r1935, %105, r1932; +} +{ +add.f16x2 r1938, %107, %104; +} +{ +mul.f16x2 r1941, r1938, r2087; +} +{ +add.f16x2 r1944, r1935, r1941; +} +{ +sub.f16x2 r1947, %109, %106; +} +{ +mul.f16x2 r1950, r1947, r2096; +} +{ +sub.f16x2 r1953, %102, %108; +} +{ +mul.f16x2 r1956, r1953, r2102; +} +{ +add.f16x2 r1959, r1950, r1956; +} +{ +add.f16x2 r1962, r1944, r1959; +} +{ +add.f16x2 r1965, %109, %106; +} +{ +mul.f16x2 r1968, r1965, r2006; +} +{ +add.f16x2 r1971, %100, r1968; +} +{ 
+add.f16x2 r1974, %102, %108; +} +{ +mul.f16x2 r1977, r1974, r2078; +} +{ +add.f16x2 r1980, r1971, r1977; +} +{ +sub.f16x2 r1983, %103, %101; +} +{ +mul.f16x2 r1986, r1983, r2024; +} +{ +sub.f16x2 r1989, %107, %104; +} +{ +mul.f16x2 r1992, r1989, r2096; +} +{ +add.f16x2 r1995, r1986, r1992; +} +{ +add.f16x2 r1998, r1980, r1995; +} +{ +add.f16x2 r2001, %109, %106; +} +{ +mul.f16x2 r2004, r2001, r2006; +} +{ +add.f16x2 r2007, %100, r2004; +} +{ +add.f16x2 r2010, %102, %108; +} +{ +mul.f16x2 r2013, r2010, r2078; +} +{ +add.f16x2 r2016, r2007, r2013; +} +{ +sub.f16x2 r2019, %103, %101; +} +{ +mul.f16x2 r2022, r2019, r2024; +} +{ +sub.f16x2 r2025, %107, %104; +} +{ +mul.f16x2 r2028, r2025, r2096; +} +{ +add.f16x2 r2031, r2022, r2028; +} +{ +sub.f16x2 r2034, r2016, r2031; +} +{ +add.f16x2 r2037, %109, %106; +} +{ +mul.f16x2 r2040, r2037, r2078; +} +{ +add.f16x2 r2043, %100, r2040; +} +{ +add.f16x2 r2046, %102, %108; +} +{ +mul.f16x2 r2049, r2046, r2087; +} +{ +add.f16x2 r2052, r2043, r2049; +} +{ +sub.f16x2 r2055, %103, %101; +} +{ +mul.f16x2 r2058, r2055, r2096; +} +{ +sub.f16x2 r2061, %107, %104; +} +{ +mul.f16x2 r2064, r2061, r2102; +} +{ +add.f16x2 r2067, r2058, r2064; +} +{ +add.f16x2 r2070, r2052, r2067; +} +{ +add.f16x2 r2073, %109, %106; +} +{ +mul.f16x2 r2076, r2073, r2078; +} +{ +add.f16x2 r2079, %100, r2076; +} +{ +add.f16x2 r2082, %102, %108; +} +{ +mul.f16x2 r2085, r2082, r2087; +} +{ +add.f16x2 r2088, r2079, r2085; +} +{ +sub.f16x2 r2091, %103, %101; +} +{ +mul.f16x2 r2094, r2091, r2096; +} +{ +sub.f16x2 r2097, %107, %104; +} +{ +mul.f16x2 r2100, r2097, r2102; +} +{ +add.f16x2 r2103, r2094, r2100; +} +{ +sub.f16x2 r2106, r2088, r2103; +} +{ +cvt.rn.f16.f64 rs89, fd123; +} +mov.b32 r2318, {rs89, rs89}; +{ +cvt.rn.f16.f64 rs90, fd74; +} +{ +neg.f16 rs91, rs90; +} +mov.b32 r2336, {rs91, rs91}; +{ +cvt.rn.f16.f64 rs93, fd135; +} +mov.b32 r2390, {rs93, rs93}; +{ +cvt.rn.f16.f64 rs94, fd72; +} +{ +neg.f16 rs95, rs94; +} +mov.b32 r2408, {rs95, rs95}; +{ 
+cvt.rn.f16.f64 rs97, fd123; +} +mov.b32 r2399, {rs97, rs97}; +{ +cvt.rn.f16.f64 rs98, fd74; +} +mov.b32 r2414, {rs98, rs98}; +{ +add.f16x2 r2109, %115, %113; +} +{ +add.f16x2 r2112, %117, r2109; +} +{ +add.f16x2 r2115, %119, %116; +} +{ +add.f16x2 r2118, r2112, r2115; +} +{ +add.f16x2 r2121, %111, %118; +} +{ +add.f16x2 r2124, %112, r2121; +} +{ +add.f16x2 r2127, %114, %110; +} +{ +add.f16x2 r2130, r2124, r2127; +} +{ +add.f16x2 r2133, %115, %113; +} +{ +mul.f16x2 r2136, r2133, r2318; +} +{ +add.f16x2 r2139, %117, r2136; +} +{ +add.f16x2 r2142, %119, %116; +} +{ +mul.f16x2 r2145, r2142, r2390; +} +{ +add.f16x2 r2148, r2139, r2145; +} +{ +sub.f16x2 r2151, %111, %118; +} +{ +mul.f16x2 r2154, r2151, r2336; +} +{ +sub.f16x2 r2157, %114, %110; +} +{ +mul.f16x2 r2160, r2157, r2408; +} +{ +add.f16x2 r2163, r2154, r2160; +} +{ +sub.f16x2 r2166, r2148, r2163; +} +{ +add.f16x2 r2169, %115, %113; +} +{ +mul.f16x2 r2172, r2169, r2318; +} +{ +add.f16x2 r2175, %117, r2172; +} +{ +add.f16x2 r2178, %119, %116; +} +{ +mul.f16x2 r2181, r2178, r2390; +} +{ +add.f16x2 r2184, r2175, r2181; +} +{ +sub.f16x2 r2187, %111, %118; +} +{ +mul.f16x2 r2190, r2187, r2336; +} +{ +sub.f16x2 r2193, %114, %110; +} +{ +mul.f16x2 r2196, r2193, r2408; +} +{ +add.f16x2 r2199, r2190, r2196; +} +{ +add.f16x2 r2202, r2184, r2199; +} +{ +add.f16x2 r2205, %115, %113; +} +{ +mul.f16x2 r2208, r2205, r2390; +} +{ +add.f16x2 r2211, %117, r2208; +} +{ +add.f16x2 r2214, %119, %116; +} +{ +mul.f16x2 r2217, r2214, r2399; +} +{ +add.f16x2 r2220, r2211, r2217; +} +{ +sub.f16x2 r2223, %111, %118; +} +{ +mul.f16x2 r2226, r2223, r2408; +} +{ +sub.f16x2 r2229, %114, %110; +} +{ +mul.f16x2 r2232, r2229, r2414; +} +{ +add.f16x2 r2235, r2226, r2232; +} +{ +sub.f16x2 r2238, r2220, r2235; +} +{ +add.f16x2 r2241, %115, %113; +} +{ +mul.f16x2 r2244, r2241, r2390; +} +{ +add.f16x2 r2247, %117, r2244; +} +{ +add.f16x2 r2250, %119, %116; +} +{ +mul.f16x2 r2253, r2250, r2399; +} +{ +add.f16x2 r2256, r2247, r2253; +} +{ +sub.f16x2 
r2259, %111, %118; +} +{ +mul.f16x2 r2262, r2259, r2408; +} +{ +sub.f16x2 r2265, %114, %110; +} +{ +mul.f16x2 r2268, r2265, r2414; +} +{ +add.f16x2 r2271, r2262, r2268; +} +{ +add.f16x2 r2274, r2256, r2271; +} +{ +add.f16x2 r2277, %111, %118; +} +{ +mul.f16x2 r2280, r2277, r2318; +} +{ +add.f16x2 r2283, %112, r2280; +} +{ +add.f16x2 r2286, %114, %110; +} +{ +mul.f16x2 r2289, r2286, r2390; +} +{ +add.f16x2 r2292, r2283, r2289; +} +{ +sub.f16x2 r2295, %115, %113; +} +{ +mul.f16x2 r2298, r2295, r2336; +} +{ +sub.f16x2 r2301, %119, %116; +} +{ +mul.f16x2 r2304, r2301, r2408; +} +{ +add.f16x2 r2307, r2298, r2304; +} +{ +add.f16x2 r2310, r2292, r2307; +} +{ +add.f16x2 r2313, %111, %118; +} +{ +mul.f16x2 r2316, r2313, r2318; +} +{ +add.f16x2 r2319, %112, r2316; +} +{ +add.f16x2 r2322, %114, %110; +} +{ +mul.f16x2 r2325, r2322, r2390; +} +{ +add.f16x2 r2328, r2319, r2325; +} +{ +sub.f16x2 r2331, %115, %113; +} +{ +mul.f16x2 r2334, r2331, r2336; +} +{ +sub.f16x2 r2337, %119, %116; +} +{ +mul.f16x2 r2340, r2337, r2408; +} +{ +add.f16x2 r2343, r2334, r2340; +} +{ +sub.f16x2 r2346, r2328, r2343; +} +{ +add.f16x2 r2349, %111, %118; +} +{ +mul.f16x2 r2352, r2349, r2390; +} +{ +add.f16x2 r2355, %112, r2352; +} +{ +add.f16x2 r2358, %114, %110; +} +{ +mul.f16x2 r2361, r2358, r2399; +} +{ +add.f16x2 r2364, r2355, r2361; +} +{ +sub.f16x2 r2367, %115, %113; +} +{ +mul.f16x2 r2370, r2367, r2408; +} +{ +sub.f16x2 r2373, %119, %116; +} +{ +mul.f16x2 r2376, r2373, r2414; +} +{ +add.f16x2 r2379, r2370, r2376; +} +{ +add.f16x2 r2382, r2364, r2379; +} +{ +add.f16x2 r2385, %111, %118; +} +{ +mul.f16x2 r2388, r2385, r2390; +} +{ +add.f16x2 r2391, %112, r2388; +} +{ +add.f16x2 r2394, %114, %110; +} +{ +mul.f16x2 r2397, r2394, r2399; +} +{ +add.f16x2 r2400, r2391, r2397; +} +{ +sub.f16x2 r2403, %115, %113; +} +{ +mul.f16x2 r2406, r2403, r2408; +} +{ +sub.f16x2 r2409, %119, %116; +} +{ +mul.f16x2 r2412, r2409, r2414; +} +{ +add.f16x2 r2415, r2406, r2412; +} +{ +sub.f16x2 r2418, r2400, r2415; +} 
+{ +cvt.rn.f16.f64 rs99, fd115; +} +{ +cvt.rn.f16.f64 rs100, fd138; +} +{ +cvt.rn.f16.f64 rs101, fd119; +} +{ +cvt.rn.f16.f64 rs102, fd134; +} +{ +cvt.rn.f16.f64 rs103, fd123; +} +{ +cvt.rn.f16.f64 rs104, fd130; +} +{ +cvt.rn.f16.f64 rs105, fd127; +} +{ +cvt.rn.f16.f64 rs106, fd128; +} +{ +cvt.rn.f16.f64 rs109, fd135; +} +{ +cvt.rn.f16.f64 rs110, fd136; +} +{ +cvt.rn.f16.f64 rs113, fd139; +} +{ +cvt.rn.f16.f64 rs114, fd90; +} +mov.b32 r2435, {rs99, rs99}; +{ +mul.f16x2 r2421, r1854, r2435; +} +mov.b32 r2432, {rs100, rs100}; +{ +mul.f16x2 r2424, r1998, r2432; +} +{ +sub.f16x2 r2427, r2421, r2424; +} +{ +mul.f16x2 r2430, r1854, r2432; +} +{ +fma.rn.f16x2 r2433, r1998, r2435, r2430; +} +mov.b32 r2467, {rs101, rs101}; +{ +mul.f16x2 r2437, r2166, r2467; +} +mov.b32 r2464, {rs102, rs102}; +{ +mul.f16x2 r2440, r2310, r2464; +} +{ +sub.f16x2 r2443, r2437, r2440; +} +{ +mul.f16x2 r2446, r2166, r2464; +} +{ +fma.rn.f16x2 r2449, r2310, r2467, r2446; +} +{ +mul.f16x2 r2453, r1926, r2467; +} +{ +mul.f16x2 r2456, r2070, r2464; +} +{ +sub.f16x2 r2459, r2453, r2456; +} +{ +mul.f16x2 r2462, r1926, r2464; +} +{ +fma.rn.f16x2 r2465, r2070, r2467, r2462; +} +mov.b32 r2531, {rs105, rs105}; +{ +mul.f16x2 r2469, r2238, r2531; +} +mov.b32 r2528, {rs106, rs106}; +{ +mul.f16x2 r2472, r2382, r2528; +} +{ +sub.f16x2 r2475, r2469, r2472; +} +{ +mul.f16x2 r2478, r2238, r2528; +} +{ +fma.rn.f16x2 r2481, r2382, r2531, r2478; +} +mov.b32 r2499, {rs103, rs103}; +{ +mul.f16x2 r2485, r1962, r2499; +} +mov.b32 r2496, {rs104, rs104}; +{ +mul.f16x2 r2488, r2106, r2496; +} +{ +sub.f16x2 r2491, r2485, r2488; +} +{ +mul.f16x2 r2494, r1962, r2496; +} +{ +fma.rn.f16x2 r2497, r2106, r2499, r2494; +} +mov.b32 r2515, {rs109, rs109}; +{ +mul.f16x2 r2501, r2274, r2515; +} +mov.b32 r2512, {rs110, rs110}; +{ +mul.f16x2 r2504, r2418, r2512; +} +{ +sub.f16x2 r2507, r2501, r2504; +} +{ +mul.f16x2 r2510, r2274, r2512; +} +{ +fma.rn.f16x2 r2513, r2418, r2515, r2510; +} +{ +mul.f16x2 r2517, r1890, r2531; +} +{ +mul.f16x2 
r2520, r2034, r2528; +} +{ +sub.f16x2 r2523, r2517, r2520; +} +{ +mul.f16x2 r2526, r1890, r2528; +} +{ +fma.rn.f16x2 r2529, r2034, r2531, r2526; +} +mov.b32 r2547, {rs113, rs113}; +{ +mul.f16x2 r2533, r2202, r2547; +} +mov.b32 r2544, {rs114, rs114}; +{ +mul.f16x2 r2536, r2346, r2544; +} +{ +sub.f16x2 r2539, r2533, r2536; +} +{ +mul.f16x2 r2542, r2202, r2544; +} +{ +fma.rn.f16x2 r2545, r2346, r2547, r2542; +} +{ +cvt.rn.f16.f64 rs127, fd174; +} +mov.b32 r2620, {rs127, rs127}; +{ +cvt.rn.f16.f64 rs128, fd173; +} +mov.b32 r2629, {rs128, rs128}; +{ +add.f16x2 r2549, r1806, r2118; +} +{ +add.f16x2 r2552, r1494, r2549; +} +{ +add.f16x2 r2555, r1818, r2130; +} +{ +add.f16x2 r2558, r1506, r2555; +} +{ +add.f16x2 r2561, r1806, r2118; +} +{ +mul.f16x2 r2564, r2561, r2620; +} +{ +add.f16x2 r2567, r1494, r2564; +} +{ +sub.f16x2 r2570, r1818, r2130; +} +{ +mul.f16x2 r2573, r2570, r2629; +} +{ +add.f16x2 r2576, r2567, r2573; +} +{ +add.f16x2 r2579, r1806, r2118; +} +{ +mul.f16x2 r2582, r2579, r2620; +} +{ +add.f16x2 r2585, r1494, r2582; +} +{ +sub.f16x2 r2588, r1818, r2130; +} +{ +mul.f16x2 r2591, r2588, r2629; +} +{ +sub.f16x2 r2594, r2585, r2591; +} +{ +add.f16x2 r2597, r1818, r2130; +} +{ +mul.f16x2 r2600, r2597, r2620; +} +{ +add.f16x2 r2603, r1506, r2600; +} +{ +sub.f16x2 r2606, r1806, r2118; +} +{ +mul.f16x2 r2609, r2606, r2629; +} +{ +sub.f16x2 r2612, r2603, r2609; +} +{ +add.f16x2 r2615, r1818, r2130; +} +{ +mul.f16x2 r2618, r2615, r2620; +} +{ +add.f16x2 r2621, r1506, r2618; +} +{ +sub.f16x2 r2624, r1806, r2118; +} +{ +mul.f16x2 r2627, r2624, r2629; +} +{ +add.f16x2 r2630, r2621, r2627; +} +{ +cvt.rn.f16.f64 rs129, fd174; +} +mov.b32 r2704, {rs129, rs129}; +{ +cvt.rn.f16.f64 rs130, fd173; +} +mov.b32 r2713, {rs130, rs130}; +{ +add.f16x2 r2633, r2427, r2443; +} +{ +add.f16x2 r2636, r1542, r2633; +} +{ +add.f16x2 r2639, r2433, r2449; +} +{ +add.f16x2 r2642, r1686, r2639; +} +{ +add.f16x2 r2645, r2427, r2443; +} +{ +mul.f16x2 r2648, r2645, r2704; +} +{ +add.f16x2 r2651, 
r1542, r2648; +} +{ +sub.f16x2 r2654, r2433, r2449; +} +{ +mul.f16x2 r2657, r2654, r2713; +} +{ +add.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2427, r2443; +} +{ +mul.f16x2 r2666, r2663, r2704; +} +{ +add.f16x2 r2669, r1542, r2666; +} +{ +sub.f16x2 r2672, r2433, r2449; +} +{ +mul.f16x2 r2675, r2672, r2713; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2433, r2449; +} +{ +mul.f16x2 r2684, r2681, r2704; +} +{ +add.f16x2 r2687, r1686, r2684; +} +{ +sub.f16x2 r2690, r2427, r2443; +} +{ +mul.f16x2 r2693, r2690, r2713; +} +{ +sub.f16x2 r2696, r2687, r2693; +} +{ +add.f16x2 r2699, r2433, r2449; +} +{ +mul.f16x2 r2702, r2699, r2704; +} +{ +add.f16x2 r2705, r1686, r2702; +} +{ +sub.f16x2 r2708, r2427, r2443; +} +{ +mul.f16x2 r2711, r2708, r2713; +} +{ +add.f16x2 r2714, r2705, r2711; +} +{ +cvt.rn.f16.f64 rs131, fd174; +} +mov.b32 r2788, {rs131, rs131}; +{ +cvt.rn.f16.f64 rs132, fd173; +} +mov.b32 r2797, {rs132, rs132}; +{ +add.f16x2 r2717, r2459, r2475; +} +{ +add.f16x2 r2720, r1614, r2717; +} +{ +add.f16x2 r2723, r2465, r2481; +} +{ +add.f16x2 r2726, r1758, r2723; +} +{ +add.f16x2 r2729, r2459, r2475; +} +{ +mul.f16x2 r2732, r2729, r2788; +} +{ +add.f16x2 r2735, r1614, r2732; +} +{ +sub.f16x2 r2738, r2465, r2481; +} +{ +mul.f16x2 r2741, r2738, r2797; +} +{ +add.f16x2 r2744, r2735, r2741; +} +{ +add.f16x2 r2747, r2459, r2475; +} +{ +mul.f16x2 r2750, r2747, r2788; +} +{ +add.f16x2 r2753, r1614, r2750; +} +{ +sub.f16x2 r2756, r2465, r2481; +} +{ +mul.f16x2 r2759, r2756, r2797; +} +{ +sub.f16x2 r2762, r2753, r2759; +} +{ +add.f16x2 r2765, r2465, r2481; +} +{ +mul.f16x2 r2768, r2765, r2788; +} +{ +add.f16x2 r2771, r1758, r2768; +} +{ +sub.f16x2 r2774, r2459, r2475; +} +{ +mul.f16x2 r2777, r2774, r2797; +} +{ +sub.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2465, r2481; +} +{ +mul.f16x2 r2786, r2783, r2788; +} +{ +add.f16x2 r2789, r1758, r2786; +} +{ +sub.f16x2 r2792, r2459, r2475; +} +{ +mul.f16x2 r2795, r2792, r2797; +} +{ +add.f16x2 r2798, 
r2789, r2795; +} +{ +cvt.rn.f16.f64 rs133, fd174; +} +mov.b32 r2872, {rs133, rs133}; +{ +cvt.rn.f16.f64 rs134, fd173; +} +mov.b32 r2881, {rs134, rs134}; +{ +add.f16x2 r2801, r2491, r2507; +} +{ +add.f16x2 r2804, r1650, r2801; +} +{ +add.f16x2 r2807, r2497, r2513; +} +{ +add.f16x2 r2810, r1794, r2807; +} +{ +add.f16x2 r2813, r2491, r2507; +} +{ +mul.f16x2 r2816, r2813, r2872; +} +{ +add.f16x2 r2819, r1650, r2816; +} +{ +sub.f16x2 r2822, r2497, r2513; +} +{ +mul.f16x2 r2825, r2822, r2881; +} +{ +add.f16x2 r2828, r2819, r2825; +} +{ +add.f16x2 r2831, r2491, r2507; +} +{ +mul.f16x2 r2834, r2831, r2872; +} +{ +add.f16x2 r2837, r1650, r2834; +} +{ +sub.f16x2 r2840, r2497, r2513; +} +{ +mul.f16x2 r2843, r2840, r2881; +} +{ +sub.f16x2 r2846, r2837, r2843; +} +{ +add.f16x2 r2849, r2497, r2513; +} +{ +mul.f16x2 r2852, r2849, r2872; +} +{ +add.f16x2 r2855, r1794, r2852; +} +{ +sub.f16x2 r2858, r2491, r2507; +} +{ +mul.f16x2 r2861, r2858, r2881; +} +{ +sub.f16x2 r2864, r2855, r2861; +} +{ +add.f16x2 r2867, r2497, r2513; +} +{ +mul.f16x2 r2870, r2867, r2872; +} +{ +add.f16x2 r2873, r1794, r2870; +} +{ +sub.f16x2 r2876, r2491, r2507; +} +{ +mul.f16x2 r2879, r2876, r2881; +} +{ +add.f16x2 r2882, r2873, r2879; +} +{ +cvt.rn.f16.f64 rs135, fd174; +} +mov.b32 r2956, {rs135, rs135}; +{ +cvt.rn.f16.f64 rs136, fd173; +} +mov.b32 r2965, {rs136, rs136}; +mov.f64 fd172, 0d3FEBB67AE8584CAA; +mov.f64 fd171, 0d3FCA9CD9AC4258F6; +{ +add.f16x2 r2885, r2523, r2539; +} +{ +add.f16x2 r2888, r1578, r2885; +} +{ +add.f16x2 r2891, r2529, r2545; +} +{ +add.f16x2 r2894, r1722, r2891; +} +{ +add.f16x2 r2897, r2523, r2539; +} +{ +mul.f16x2 r2900, r2897, r2956; +} +{ +add.f16x2 r2903, r1578, r2900; +} +{ +sub.f16x2 r2906, r2529, r2545; +} +{ +mul.f16x2 r2909, r2906, r2965; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +add.f16x2 r2915, r2523, r2539; +} +{ +mul.f16x2 r2918, r2915, r2956; +} +{ +add.f16x2 r2921, r1578, r2918; +} +{ +sub.f16x2 r2924, r2529, r2545; +} +{ +mul.f16x2 r2927, r2924, r2965; +} +{ 
+sub.f16x2 r2930, r2921, r2927; +} +{ +add.f16x2 r2933, r2529, r2545; +} +{ +mul.f16x2 r2936, r2933, r2956; +} +{ +add.f16x2 r2939, r1722, r2936; +} +{ +sub.f16x2 r2942, r2523, r2539; +} +{ +mul.f16x2 r2945, r2942, r2965; +} +{ +sub.f16x2 r2948, r2939, r2945; +} +{ +add.f16x2 r2951, r2529, r2545; +} +{ +mul.f16x2 r2954, r2951, r2956; +} +{ +add.f16x2 r2957, r1722, r2954; +} +{ +sub.f16x2 r2960, r2523, r2539; +} +{ +mul.f16x2 r2963, r2960, r2965; +} +{ +add.f16x2 r2966, r2957, r2963; +} +mov.f64 fd113, 0d3FEF4CFC327A0080; +{ +cvt.rn.f16.f64 rs137, fd113; +} +{ +cvt.rn.f16.f64 rs138, fd171; +} +{ +cvt.rn.f16.f64 rs139, fd115; +} +{ +cvt.rn.f16.f64 rs140, fd138; +} +mov.f64 fd117, 0d3FE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs141, fd117; +} +{ +cvt.rn.f16.f64 rs142, fd136; +} +{ +cvt.rn.f16.f64 rs143, fd119; +} +{ +cvt.rn.f16.f64 rs144, fd134; +} +mov.f64 fd121, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs145, fd121; +} +{ +cvt.rn.f16.f64 rs146, fd172; +} +{ +cvt.rn.f16.f64 rs147, fd123; +} +{ +cvt.rn.f16.f64 rs148, fd130; +} +mov.f64 fd125, 0d3FBAC2609B3C576C; +{ +cvt.rn.f16.f64 rs149, fd125; +} +{ +cvt.rn.f16.f64 rs150, fd128; +} +{ +cvt.rn.f16.f64 rs151, fd127; +} +{ +cvt.rn.f16.f64 rs152, fd128; +} +mov.f64 fd129, 0dBFD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs153, fd129; +} +{ +cvt.rn.f16.f64 rs154, fd130; +} +{ +cvt.rn.f16.f64 rs155, fd174; +} +{ +cvt.rn.f16.f64 rs156, fd172; +} +mov.f64 fd133, 0dBFE5698496E20BD8; +{ +cvt.rn.f16.f64 rs157, fd133; +} +{ +cvt.rn.f16.f64 rs158, fd134; +} +{ +cvt.rn.f16.f64 rs159, fd135; +} +{ +cvt.rn.f16.f64 rs160, fd136; +} +mov.f64 fd137, 0dBFED3BC3AEFF7F95; +{ +cvt.rn.f16.f64 rs161, fd137; +} +{ +cvt.rn.f16.f64 rs162, fd138; +} +{ +cvt.rn.f16.f64 rs163, fd139; +} +{ +cvt.rn.f16.f64 rs164, fd171; +} +mov.b32 r2983, {rs137, rs137}; +{ +mul.f16x2 r2969, r2636, r2983; +} +mov.b32 r2980, {rs138, rs138}; +{ +mul.f16x2 r2972, r2642, r2980; +} +{ +sub.f16x2 r2975, r2969, r2972; +} +{ +mul.f16x2 r2978, r2636, r2980; +} +{ +fma.rn.f16x2 r2981, r2642, 
r2983, r2978; +} +mov.b32 r2999, {rs139, rs139}; +{ +mul.f16x2 r2985, r2720, r2999; +} +mov.b32 r2996, {rs140, rs140}; +{ +mul.f16x2 r2988, r2726, r2996; +} +{ +sub.f16x2 r2991, r2985, r2988; +} +{ +mul.f16x2 r2994, r2720, r2996; +} +{ +fma.rn.f16x2 r2997, r2726, r2999, r2994; +} +mov.b32 r3015, {rs141, rs141}; +{ +mul.f16x2 r3001, r2804, r3015; +} +mov.b32 r3012, {rs142, rs142}; +{ +mul.f16x2 r3004, r2810, r3012; +} +{ +sub.f16x2 r3007, r3001, r3004; +} +{ +mul.f16x2 r3010, r2804, r3012; +} +{ +fma.rn.f16x2 r3013, r2810, r3015, r3010; +} +mov.b32 r3031, {rs143, rs143}; +{ +mul.f16x2 r3017, r2888, r3031; +} +mov.b32 r3028, {rs144, rs144}; +{ +mul.f16x2 r3020, r2894, r3028; +} +{ +sub.f16x2 r3023, r3017, r3020; +} +{ +mul.f16x2 r3026, r2888, r3028; +} +{ +fma.rn.f16x2 r3029, r2894, r3031, r3026; +} +mov.b32 r3047, {rs145, rs145}; +{ +mul.f16x2 r3033, r2576, r3047; +} +mov.b32 r3044, {rs146, rs146}; +{ +mul.f16x2 r3036, r2612, r3044; +} +{ +sub.f16x2 r3039, r3033, r3036; +} +{ +mul.f16x2 r3042, r2576, r3044; +} +{ +fma.rn.f16x2 r3045, r2612, r3047, r3042; +} +mov.b32 r3063, {rs147, rs147}; +{ +mul.f16x2 r3049, r2660, r3063; +} +mov.b32 r3060, {rs148, rs148}; +{ +mul.f16x2 r3052, r2696, r3060; +} +{ +sub.f16x2 r3055, r3049, r3052; +} +{ +mul.f16x2 r3058, r2660, r3060; +} +{ +fma.rn.f16x2 r3061, r2696, r3063, r3058; +} +mov.b32 r3079, {rs149, rs149}; +{ +mul.f16x2 r3065, r2744, r3079; +} +mov.b32 r3076, {rs150, rs150}; +{ +mul.f16x2 r3068, r2780, r3076; +} +{ +sub.f16x2 r3071, r3065, r3068; +} +{ +mul.f16x2 r3074, r2744, r3076; +} +{ +fma.rn.f16x2 r3077, r2780, r3079, r3074; +} +mov.b32 r3095, {rs151, rs151}; +{ +mul.f16x2 r3081, r2828, r3095; +} +mov.b32 r3092, {rs152, rs152}; +{ +mul.f16x2 r3084, r2864, r3092; +} +{ +sub.f16x2 r3087, r3081, r3084; +} +{ +mul.f16x2 r3090, r2828, r3092; +} +{ +fma.rn.f16x2 r3093, r2864, r3095, r3090; +} +mov.b32 r3111, {rs153, rs153}; +{ +mul.f16x2 r3097, r2912, r3111; +} +mov.b32 r3108, {rs154, rs154}; +{ +mul.f16x2 r3100, r2948, 
r3108; +} +{ +sub.f16x2 r3103, r3097, r3100; +} +{ +mul.f16x2 r3106, r2912, r3108; +} +{ +fma.rn.f16x2 r3109, r2948, r3111, r3106; +} +mov.b32 r3127, {rs155, rs155}; +{ +mul.f16x2 r3113, r2594, r3127; +} +mov.b32 r3124, {rs156, rs156}; +{ +mul.f16x2 r3116, r2630, r3124; +} +{ +sub.f16x2 r3119, r3113, r3116; +} +{ +mul.f16x2 r3122, r2594, r3124; +} +{ +fma.rn.f16x2 r3125, r2630, r3127, r3122; +} +mov.b32 r3143, {rs157, rs157}; +{ +mul.f16x2 r3129, r2678, r3143; +} +mov.b32 r3140, {rs158, rs158}; +{ +mul.f16x2 r3132, r2714, r3140; +} +{ +sub.f16x2 r3135, r3129, r3132; +} +{ +mul.f16x2 r3138, r2678, r3140; +} +{ +fma.rn.f16x2 r3141, r2714, r3143, r3138; +} +mov.b32 r3159, {rs159, rs159}; +{ +mul.f16x2 r3145, r2762, r3159; +} +mov.b32 r3156, {rs160, rs160}; +{ +mul.f16x2 r3148, r2798, r3156; +} +{ +sub.f16x2 r3151, r3145, r3148; +} +{ +mul.f16x2 r3154, r2762, r3156; +} +{ +fma.rn.f16x2 r3157, r2798, r3159, r3154; +} +mov.b32 r3175, {rs161, rs161}; +{ +mul.f16x2 r3161, r2846, r3175; +} +mov.b32 r3172, {rs162, rs162}; +{ +mul.f16x2 r3164, r2882, r3172; +} +{ +sub.f16x2 r3167, r3161, r3164; +} +{ +mul.f16x2 r3170, r2846, r3172; +} +{ +fma.rn.f16x2 r3173, r2882, r3175, r3170; +} +mov.b32 r3191, {rs163, rs163}; +{ +mul.f16x2 r3177, r2930, r3191; +} +mov.b32 r3188, {rs164, rs164}; +{ +mul.f16x2 r3180, r2966, r3188; +} +{ +sub.f16x2 r3183, r3177, r3180; +} +{ +mul.f16x2 r3186, r2930, r3188; +} +{ +fma.rn.f16x2 r3189, r2966, r3191, r3186; +} +{ +add.f16x2 %0, r1068, r2552; +} +{ +add.f16x2 %1, r1074, r2558; +} +{ +sub.f16x2 %30, r1068, r2552; +} +{ +sub.f16x2 %31, r1074, r2558; +} +{ +add.f16x2 %2, r1152, r2975; +} +{ +add.f16x2 %3, r1158, r2981; +} +{ +sub.f16x2 %32, r1152, r2975; +} +{ +sub.f16x2 %33, r1158, r2981; +} +{ +add.f16x2 %4, r1236, r2991; +} +{ +add.f16x2 %5, r1242, r2997; +} +{ +sub.f16x2 %34, r1236, r2991; +} +{ +sub.f16x2 %35, r1242, r2997; +} +{ +add.f16x2 %6, r1320, r3007; +} +{ +add.f16x2 %7, r1326, r3013; +} +{ +sub.f16x2 %36, r1320, r3007; +} +{ +sub.f16x2 
%37, r1326, r3013; +} +{ +add.f16x2 %8, r1404, r3023; +} +{ +add.f16x2 %9, r1410, r3029; +} +{ +sub.f16x2 %38, r1404, r3023; +} +{ +sub.f16x2 %39, r1410, r3029; +} +{ +add.f16x2 %10, r1092, r3039; +} +{ +add.f16x2 %11, r1128, r3045; +} +{ +sub.f16x2 %40, r1092, r3039; +} +{ +sub.f16x2 %41, r1128, r3045; +} +{ +add.f16x2 %12, r1176, r3055; +} +{ +add.f16x2 %13, r1212, r3061; +} +{ +sub.f16x2 %42, r1176, r3055; +} +{ +sub.f16x2 %43, r1212, r3061; +} +{ +add.f16x2 %14, r1260, r3071; +} +{ +add.f16x2 %15, r1296, r3077; +} +{ +sub.f16x2 %44, r1260, r3071; +} +{ +sub.f16x2 %45, r1296, r3077; +} +{ +add.f16x2 %16, r1344, r3087; +} +{ +add.f16x2 %17, r1380, r3093; +} +{ +sub.f16x2 %46, r1344, r3087; +} +{ +sub.f16x2 %47, r1380, r3093; +} +{ +add.f16x2 %18, r1428, r3103; +} +{ +add.f16x2 %19, r1464, r3109; +} +{ +sub.f16x2 %48, r1428, r3103; +} +{ +sub.f16x2 %49, r1464, r3109; +} +{ +add.f16x2 %20, r1110, r3119; +} +{ +add.f16x2 %21, r1146, r3125; +} +{ +sub.f16x2 %50, r1110, r3119; +} +{ +sub.f16x2 %51, r1146, r3125; +} +{ +add.f16x2 %22, r1194, r3135; +} +{ +add.f16x2 %23, r1230, r3141; +} +{ +sub.f16x2 %52, r1194, r3135; +} +{ +sub.f16x2 %53, r1230, r3141; +} +{ +add.f16x2 %24, r1278, r3151; +} +{ +add.f16x2 %25, r1314, r3157; +} +{ +sub.f16x2 %54, r1278, r3151; +} +{ +sub.f16x2 %55, r1314, r3157; +} +{ +add.f16x2 %26, r1362, r3167; +} +{ +add.f16x2 %27, r1398, r3173; +} +{ +sub.f16x2 %56, r1362, r3167; +} +{ +sub.f16x2 %57, r1398, r3173; +} +{ +add.f16x2 %28, r1446, r3183; +} +{ +add.f16x2 %29, r1482, r3189; +} +{ +sub.f16x2 %58, r1446, r3183; +} +{ +sub.f16x2 %59, r1482, r3189; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)): "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[8].x)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[17].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..8a9e5a42cdcd4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_fwd.hpp.inc @@ -0,0 +1,650 @@ +#ifndef 
CUFFTDX_FFT_30_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_30_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<14, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<755>; +.reg .b64 rd<2>; +add.f32 f121, %76, %124; +add.f32 f122, %60, f121; +add.f32 f123, %92, %108; +add.f32 f124, f123, f122; +add.f32 f125, %77, %125; +add.f32 f126, %61, f125; +add.f32 f127, %93, %109; +add.f32 f128, f127, f126; +fma.rn.f32 f129, f121, 0f3E9E377A, %60; +mul.f32 f130, f123, 0f3F4F1BBD; +sub.f32 f131, f129, f130; +sub.f32 f132, %77, %125; +mul.f32 f133, f132, 0f3F737871; +sub.f32 f134, %93, %109; +mul.f32 f135, f134, 0fBF167918; +sub.f32 f136, f135, f133; +sub.f32 f137, f131, f136; +add.f32 f138, f136, f131; +mul.f32 f139, f121, 0f3F4F1BBD; +sub.f32 f140, %60, f139; +fma.rn.f32 f141, f123, 0f3E9E377A, f140; +mul.f32 f142, f132, 0f3F167918; +mul.f32 f143, f134, 0f3F737871; +sub.f32 f144, f143, f142; +sub.f32 f145, f141, f144; +add.f32 f146, f144, f141; +fma.rn.f32 f147, f125, 0f3E9E377A, %61; +mul.f32 f148, f127, 0f3F4F1BBD; +sub.f32 f149, f147, f148; +sub.f32 f150, %76, %124; +mul.f32 f151, f150, 0f3F737871; +sub.f32 f152, %92, %108; +mul.f32 f153, f152, 0fBF167918; +sub.f32 f154, f153, f151; +add.f32 f155, f154, f149; +sub.f32 f156, f149, f154; +mul.f32 f157, f125, 0f3F4F1BBD; +sub.f32 f158, %61, f157; +fma.rn.f32 f159, f127, 0f3E9E377A, f158; +mul.f32 f160, f150, 0f3F167918; +mul.f32 f161, f152, 0f3F737871; +sub.f32 f162, f161, f160; +add.f32 f163, f162, f159; +sub.f32 f164, f159, f162; +add.f32 f165, %81, %129; +add.f32 f166, %65, f165; +add.f32 f167, %97, %113; +add.f32 f168, f167, f166; +add.f32 f169, %83, %131; +add.f32 f170, %67, f169; +add.f32 f171, %99, %115; +add.f32 f172, f171, f170; +fma.rn.f32 f173, f165, 0f3E9E377A, %65; +mul.f32 f174, f167, 0f3F4F1BBD; +sub.f32 f175, f173, f174; +sub.f32 f176, %83, %131; +mul.f32 f177, f176, 0f3F737871; +sub.f32 f178, %99, %115; +mul.f32 f179, f178, 
0fBF167918; +sub.f32 f180, f179, f177; +sub.f32 f181, f175, f180; +add.f32 f182, f180, f175; +mul.f32 f183, f165, 0f3F4F1BBD; +sub.f32 f184, %65, f183; +fma.rn.f32 f185, f167, 0f3E9E377A, f184; +mul.f32 f186, f176, 0f3F167918; +mul.f32 f187, f178, 0f3F737871; +sub.f32 f188, f187, f186; +sub.f32 f189, f185, f188; +add.f32 f190, f188, f185; +fma.rn.f32 f191, f169, 0f3E9E377A, %67; +mul.f32 f192, f171, 0f3F4F1BBD; +sub.f32 f193, f191, f192; +sub.f32 f194, %81, %129; +mul.f32 f195, f194, 0f3F737871; +sub.f32 f196, %97, %113; +mul.f32 f197, f196, 0fBF167918; +sub.f32 f198, f197, f195; +add.f32 f199, f198, f193; +sub.f32 f200, f193, f198; +mul.f32 f201, f169, 0f3F4F1BBD; +sub.f32 f202, %67, f201; +fma.rn.f32 f203, f171, 0f3E9E377A, f202; +mul.f32 f204, f194, 0f3F167918; +mul.f32 f205, f196, 0f3F737871; +sub.f32 f206, f205, f204; +add.f32 f207, f206, f203; +sub.f32 f208, f203, f206; +add.f32 f209, %86, %134; +add.f32 f210, %70, f209; +add.f32 f211, %102, %118; +add.f32 f212, f211, f210; +add.f32 f213, %88, %136; +add.f32 f214, %72, f213; +add.f32 f215, %104, %120; +add.f32 f216, f215, f214; +fma.rn.f32 f217, f209, 0f3E9E377A, %70; +mul.f32 f218, f211, 0f3F4F1BBD; +sub.f32 f219, f217, f218; +sub.f32 f220, %88, %136; +mul.f32 f221, f220, 0f3F737871; +sub.f32 f222, %104, %120; +mul.f32 f223, f222, 0fBF167918; +sub.f32 f224, f223, f221; +sub.f32 f225, f219, f224; +add.f32 f226, f224, f219; +mul.f32 f227, f209, 0f3F4F1BBD; +sub.f32 f228, %70, f227; +fma.rn.f32 f229, f211, 0f3E9E377A, f228; +mul.f32 f230, f220, 0f3F167918; +mul.f32 f231, f222, 0f3F737871; +sub.f32 f232, f231, f230; +sub.f32 f233, f229, f232; +add.f32 f234, f232, f229; +fma.rn.f32 f235, f213, 0f3E9E377A, %72; +mul.f32 f236, f215, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %86, %134; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %102, %118; +mul.f32 f241, f240, 0fBF167918; +sub.f32 f242, f241, f239; +add.f32 f243, f242, f237; +sub.f32 f244, f237, f242; +mul.f32 f245, f213, 0f3F4F1BBD; +sub.f32 f246, 
%72, f245; +fma.rn.f32 f247, f215, 0f3E9E377A, f246; +mul.f32 f248, f238, 0f3F167918; +mul.f32 f249, f240, 0f3F737871; +sub.f32 f250, f249, f248; +add.f32 f251, f250, f247; +sub.f32 f252, f247, f250; +mul.f32 f253, f181, 0f3F69DE1D; +mul.f32 f254, f199, 0fBED03FC9; +sub.f32 f255, f253, f254; +mul.f32 f256, f199, 0f3F69DE1D; +fma.rn.f32 f257, f181, 0fBED03FC9, f256; +mul.f32 f258, f225, 0f3F2B4C25; +mul.f32 f259, f243, 0fBF3E3EBD; +sub.f32 f260, f258, f259; +mul.f32 f261, f243, 0f3F2B4C25; +fma.rn.f32 f262, f225, 0fBF3E3EBD, f261; +mul.f32 f263, f189, 0f3F2B4C25; +mul.f32 f264, f207, 0fBF3E3EBD; +sub.f32 f265, f263, f264; +mul.f32 f266, f207, 0f3F2B4C25; +fma.rn.f32 f267, f189, 0fBF3E3EBD, f266; +mul.f32 f268, f233, 0fBDD61305; +mul.f32 f269, f251, 0fBF7E98FD; +sub.f32 f270, f268, f269; +mul.f32 f271, f251, 0fBDD61305; +fma.rn.f32 f272, f233, 0fBF7E98FD, f271; +mul.f32 f273, f190, 0f3E9E377A; +mul.f32 f274, f208, 0fBF737871; +sub.f32 f275, f273, f274; +mul.f32 f276, f208, 0f3E9E377A; +fma.rn.f32 f277, f190, 0fBF737871, f276; +mul.f32 f278, f234, 0fBF4F1BBD; +mul.f32 f279, f252, 0fBF167918; +sub.f32 f280, f278, f279; +mul.f32 f281, f252, 0fBF4F1BBD; +fma.rn.f32 f282, f234, 0fBF167918, f281; +mul.f32 f283, f182, 0fBDD61305; +mul.f32 f284, f200, 0fBF7E98FD; +sub.f32 f285, f283, f284; +mul.f32 f286, f200, 0fBDD61305; +fma.rn.f32 f287, f182, 0fBF7E98FD, f286; +mul.f32 f288, f226, 0fBF7A67E2; +mul.f32 f289, f244, 0f3E54E6CD; +sub.f32 f290, f288, f289; +mul.f32 f291, f244, 0fBF7A67E2; +fma.rn.f32 f292, f226, 0f3E54E6CD, f291; +add.f32 f293, f168, f212; +add.f32 f294, f124, f293; +add.f32 f295, f172, f216; +add.f32 f296, f128, f295; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f124, f297; +sub.f32 f299, f172, f216; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +mul.f32 f303, f295, 0f3F000000; +sub.f32 f304, f128, f303; +sub.f32 f305, f168, f212; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; 
+add.f32 f309, f255, f260; +add.f32 f310, f137, f309; +add.f32 f311, f257, f262; +add.f32 f312, f155, f311; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f137, f313; +sub.f32 f315, f257, f262; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +mul.f32 f319, f311, 0f3F000000; +sub.f32 f320, f155, f319; +sub.f32 f321, f255, f260; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f265, f270; +add.f32 f326, f145, f325; +add.f32 f327, f267, f272; +add.f32 f328, f163, f327; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f145, f329; +sub.f32 f331, f267, f272; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +mul.f32 f335, f327, 0f3F000000; +sub.f32 f336, f163, f335; +sub.f32 f337, f265, f270; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, f275, f280; +add.f32 f342, f146, f341; +add.f32 f343, f277, f282; +add.f32 f344, f164, f343; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, f146, f345; +sub.f32 f347, f277, f282; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +mul.f32 f351, f343, 0f3F000000; +sub.f32 f352, f164, f351; +sub.f32 f353, f275, f280; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, f285, f290; +add.f32 f358, f138, f357; +add.f32 f359, f287, f292; +add.f32 f360, f156, f359; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, f138, f361; +sub.f32 f363, f287, f292; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +mul.f32 f367, f359, 0f3F000000; +sub.f32 f368, f156, f367; +sub.f32 f369, f285, f290; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %78, %126; +add.f32 f374, %62, f373; +add.f32 f375, %94, %110; +add.f32 f376, f375, f374; +add.f32 f377, %80, %128; +add.f32 f378, %64, f377; +add.f32 f379, 
%96, %112; +add.f32 f380, f379, f378; +fma.rn.f32 f381, f373, 0f3E9E377A, %62; +mul.f32 f382, f375, 0f3F4F1BBD; +sub.f32 f383, f381, f382; +sub.f32 f384, %80, %128; +mul.f32 f385, f384, 0f3F737871; +sub.f32 f386, %96, %112; +mul.f32 f387, f386, 0fBF167918; +sub.f32 f388, f387, f385; +sub.f32 f389, f383, f388; +add.f32 f390, f388, f383; +mul.f32 f391, f373, 0f3F4F1BBD; +sub.f32 f392, %62, f391; +fma.rn.f32 f393, f375, 0f3E9E377A, f392; +mul.f32 f394, f384, 0f3F167918; +mul.f32 f395, f386, 0f3F737871; +sub.f32 f396, f395, f394; +sub.f32 f397, f393, f396; +add.f32 f398, f396, f393; +fma.rn.f32 f399, f377, 0f3E9E377A, %64; +mul.f32 f400, f379, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, %78, %126; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, %94, %110; +mul.f32 f405, f404, 0fBF167918; +sub.f32 f406, f405, f403; +add.f32 f407, f406, f401; +sub.f32 f408, f401, f406; +mul.f32 f409, f377, 0f3F4F1BBD; +sub.f32 f410, %64, f409; +fma.rn.f32 f411, f379, 0f3E9E377A, f410; +mul.f32 f412, f402, 0f3F167918; +mul.f32 f413, f404, 0f3F737871; +sub.f32 f414, f413, f412; +add.f32 f415, f414, f411; +sub.f32 f416, f411, f414; +add.f32 f417, %84, %132; +add.f32 f418, %68, f417; +add.f32 f419, %100, %116; +add.f32 f420, f419, f418; +add.f32 f421, %85, %133; +add.f32 f422, %69, f421; +add.f32 f423, %101, %117; +add.f32 f424, f423, f422; +fma.rn.f32 f425, f417, 0f3E9E377A, %68; +mul.f32 f426, f419, 0f3F4F1BBD; +sub.f32 f427, f425, f426; +sub.f32 f428, %85, %133; +mul.f32 f429, f428, 0f3F737871; +sub.f32 f430, %101, %117; +mul.f32 f431, f430, 0fBF167918; +sub.f32 f432, f431, f429; +sub.f32 f433, f427, f432; +add.f32 f434, f432, f427; +mul.f32 f435, f417, 0f3F4F1BBD; +sub.f32 f436, %68, f435; +fma.rn.f32 f437, f419, 0f3E9E377A, f436; +mul.f32 f438, f428, 0f3F167918; +mul.f32 f439, f430, 0f3F737871; +sub.f32 f440, f439, f438; +sub.f32 f441, f437, f440; +add.f32 f442, f440, f437; +fma.rn.f32 f443, f421, 0f3E9E377A, %69; +mul.f32 f444, f423, 0f3F4F1BBD; +sub.f32 f445, f443, f444; 
+sub.f32 f446, %84, %132; +mul.f32 f447, f446, 0f3F737871; +sub.f32 f448, %100, %116; +mul.f32 f449, f448, 0fBF167918; +sub.f32 f450, f449, f447; +add.f32 f451, f450, f445; +sub.f32 f452, f445, f450; +mul.f32 f453, f421, 0f3F4F1BBD; +sub.f32 f454, %69, f453; +fma.rn.f32 f455, f423, 0f3E9E377A, f454; +mul.f32 f456, f446, 0f3F167918; +mul.f32 f457, f448, 0f3F737871; +sub.f32 f458, f457, f456; +add.f32 f459, f458, f455; +sub.f32 f460, f455, f458; +add.f32 f461, %89, %137; +add.f32 f462, %73, f461; +add.f32 f463, %105, %121; +add.f32 f464, f463, f462; +add.f32 f465, %91, %138; +add.f32 f466, %75, f465; +add.f32 f467, %107, %123; +add.f32 f468, f467, f466; +fma.rn.f32 f469, f461, 0f3E9E377A, %73; +mul.f32 f470, f463, 0f3F4F1BBD; +sub.f32 f471, f469, f470; +sub.f32 f472, %91, %138; +mul.f32 f473, f472, 0f3F737871; +sub.f32 f474, %107, %123; +mul.f32 f475, f474, 0fBF167918; +sub.f32 f476, f475, f473; +sub.f32 f477, f471, f476; +add.f32 f478, f476, f471; +mul.f32 f479, f461, 0f3F4F1BBD; +sub.f32 f480, %73, f479; +fma.rn.f32 f481, f463, 0f3E9E377A, f480; +mul.f32 f482, f472, 0f3F167918; +mul.f32 f483, f474, 0f3F737871; +sub.f32 f484, f483, f482; +sub.f32 f485, f481, f484; +add.f32 f486, f484, f481; +fma.rn.f32 f487, f465, 0f3E9E377A, %75; +mul.f32 f488, f467, 0f3F4F1BBD; +sub.f32 f489, f487, f488; +sub.f32 f490, %89, %137; +mul.f32 f491, f490, 0f3F737871; +sub.f32 f492, %105, %121; +mul.f32 f493, f492, 0fBF167918; +sub.f32 f494, f493, f491; +add.f32 f495, f494, f489; +sub.f32 f496, f489, f494; +mul.f32 f497, f465, 0f3F4F1BBD; +sub.f32 f498, %75, f497; +fma.rn.f32 f499, f467, 0f3E9E377A, f498; +mul.f32 f500, f490, 0f3F167918; +mul.f32 f501, f492, 0f3F737871; +sub.f32 f502, f501, f500; +add.f32 f503, f502, f499; +sub.f32 f504, f499, f502; +mul.f32 f505, f433, 0f3F69DE1D; +mul.f32 f506, f451, 0fBED03FC9; +sub.f32 f507, f505, f506; +mul.f32 f508, f451, 0f3F69DE1D; +fma.rn.f32 f509, f433, 0fBED03FC9, f508; +mul.f32 f510, f477, 0f3F2B4C25; +mul.f32 f511, f495, 0fBF3E3EBD; 
+sub.f32 f512, f510, f511; +mul.f32 f513, f495, 0f3F2B4C25; +fma.rn.f32 f514, f477, 0fBF3E3EBD, f513; +mul.f32 f515, f441, 0f3F2B4C25; +mul.f32 f516, f459, 0fBF3E3EBD; +sub.f32 f517, f515, f516; +mul.f32 f518, f459, 0f3F2B4C25; +fma.rn.f32 f519, f441, 0fBF3E3EBD, f518; +mul.f32 f520, f485, 0fBDD61305; +mul.f32 f521, f503, 0fBF7E98FD; +sub.f32 f522, f520, f521; +mul.f32 f523, f503, 0fBDD61305; +fma.rn.f32 f524, f485, 0fBF7E98FD, f523; +mul.f32 f525, f442, 0f3E9E377A; +mul.f32 f526, f460, 0fBF737871; +sub.f32 f527, f525, f526; +mul.f32 f528, f460, 0f3E9E377A; +fma.rn.f32 f529, f442, 0fBF737871, f528; +mul.f32 f530, f486, 0fBF4F1BBD; +mul.f32 f531, f504, 0fBF167918; +sub.f32 f532, f530, f531; +mul.f32 f533, f504, 0fBF4F1BBD; +fma.rn.f32 f534, f486, 0fBF167918, f533; +mul.f32 f535, f434, 0fBDD61305; +mul.f32 f536, f452, 0fBF7E98FD; +sub.f32 f537, f535, f536; +mul.f32 f538, f452, 0fBDD61305; +fma.rn.f32 f539, f434, 0fBF7E98FD, f538; +mul.f32 f540, f478, 0fBF7A67E2; +mul.f32 f541, f496, 0f3E54E6CD; +sub.f32 f542, f540, f541; +mul.f32 f543, f496, 0fBF7A67E2; +fma.rn.f32 f544, f478, 0f3E54E6CD, f543; +add.f32 f545, f420, f464; +add.f32 f546, f376, f545; +add.f32 f547, f424, f468; +add.f32 f548, f380, f547; +mul.f32 f549, f545, 0f3F000000; +sub.f32 f550, f376, f549; +sub.f32 f551, f424, f468; +mul.f32 f552, f551, 0f3F5DB3D7; +add.f32 f553, f552, f550; +sub.f32 f554, f550, f552; +mul.f32 f555, f547, 0f3F000000; +sub.f32 f556, f380, f555; +sub.f32 f557, f420, f464; +mul.f32 f558, f557, 0f3F5DB3D7; +sub.f32 f559, f556, f558; +add.f32 f560, f558, f556; +add.f32 f561, f507, f512; +add.f32 f562, f389, f561; +add.f32 f563, f509, f514; +add.f32 f564, f407, f563; +mul.f32 f565, f561, 0f3F000000; +sub.f32 f566, f389, f565; +sub.f32 f567, f509, f514; +mul.f32 f568, f567, 0f3F5DB3D7; +add.f32 f569, f568, f566; +sub.f32 f570, f566, f568; +mul.f32 f571, f563, 0f3F000000; +sub.f32 f572, f407, f571; +sub.f32 f573, f507, f512; +mul.f32 f574, f573, 0f3F5DB3D7; +sub.f32 f575, f572, f574; 
+add.f32 f576, f574, f572; +add.f32 f577, f517, f522; +add.f32 f578, f397, f577; +add.f32 f579, f519, f524; +add.f32 f580, f415, f579; +mul.f32 f581, f577, 0f3F000000; +sub.f32 f582, f397, f581; +sub.f32 f583, f519, f524; +mul.f32 f584, f583, 0f3F5DB3D7; +add.f32 f585, f584, f582; +sub.f32 f586, f582, f584; +mul.f32 f587, f579, 0f3F000000; +sub.f32 f588, f415, f587; +sub.f32 f589, f517, f522; +mul.f32 f590, f589, 0f3F5DB3D7; +sub.f32 f591, f588, f590; +add.f32 f592, f590, f588; +add.f32 f593, f527, f532; +add.f32 f594, f398, f593; +add.f32 f595, f529, f534; +add.f32 f596, f416, f595; +mul.f32 f597, f593, 0f3F000000; +sub.f32 f598, f398, f597; +sub.f32 f599, f529, f534; +mul.f32 f600, f599, 0f3F5DB3D7; +add.f32 f601, f600, f598; +sub.f32 f602, f598, f600; +mul.f32 f603, f595, 0f3F000000; +sub.f32 f604, f416, f603; +sub.f32 f605, f527, f532; +mul.f32 f606, f605, 0f3F5DB3D7; +sub.f32 f607, f604, f606; +add.f32 f608, f606, f604; +add.f32 f609, f537, f542; +add.f32 f610, f390, f609; +add.f32 f611, f539, f544; +add.f32 f612, f408, f611; +mul.f32 f613, f609, 0f3F000000; +sub.f32 f614, f390, f613; +sub.f32 f615, f539, f544; +mul.f32 f616, f615, 0f3F5DB3D7; +add.f32 f617, f616, f614; +sub.f32 f618, f614, f616; +mul.f32 f619, f611, 0f3F000000; +sub.f32 f620, f408, f619; +sub.f32 f621, f537, f542; +mul.f32 f622, f621, 0f3F5DB3D7; +sub.f32 f623, f620, f622; +add.f32 f624, f622, f620; +mul.f32 f625, f562, 0f3F7A67E2; +mul.f32 f626, f564, 0fBE54E6CD; +sub.f32 f627, f625, f626; +mul.f32 f628, f564, 0f3F7A67E2; +fma.rn.f32 f629, f562, 0fBE54E6CD, f628; +mul.f32 f630, f578, 0f3F69DE1D; +mul.f32 f631, f580, 0fBED03FC9; +sub.f32 f632, f630, f631; +mul.f32 f633, f580, 0f3F69DE1D; +fma.rn.f32 f634, f578, 0fBED03FC9, f633; +mul.f32 f635, f594, 0f3F4F1BBD; +mul.f32 f636, f596, 0fBF167918; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, 0f3F4F1BBD; +fma.rn.f32 f639, f594, 0fBF167918, f638; +mul.f32 f640, f610, 0f3F2B4C25; +mul.f32 f641, f612, 0fBF3E3EBD; +sub.f32 f642, f640, f641; 
+mul.f32 f643, f612, 0f3F2B4C25; +fma.rn.f32 f644, f610, 0fBF3E3EBD, f643; +mul.f32 f645, f553, 0f3F000000; +mul.f32 f646, f559, 0fBF5DB3D7; +sub.f32 f647, f645, f646; +mul.f32 f648, f559, 0f3F000000; +fma.rn.f32 f649, f553, 0fBF5DB3D7, f648; +mul.f32 f650, f569, 0f3E9E377A; +mul.f32 f651, f575, 0fBF737871; +sub.f32 f652, f650, f651; +mul.f32 f653, f575, 0f3E9E377A; +fma.rn.f32 f654, f569, 0fBF737871, f653; +mul.f32 f655, f585, 0f3DD61305; +mul.f32 f656, f591, 0fBF7E98FD; +sub.f32 f657, f655, f656; +mul.f32 f658, f591, 0f3DD61305; +fma.rn.f32 f659, f585, 0fBF7E98FD, f658; +mul.f32 f660, f601, 0fBDD61305; +mul.f32 f661, f607, 0fBF7E98FD; +sub.f32 f662, f660, f661; +mul.f32 f663, f607, 0fBDD61305; +fma.rn.f32 f664, f601, 0fBF7E98FD, f663; +mul.f32 f665, f617, 0fBE9E377A; +mul.f32 f666, f623, 0fBF737871; +sub.f32 f667, f665, f666; +mul.f32 f668, f623, 0fBE9E377A; +fma.rn.f32 f669, f617, 0fBF737871, f668; +mul.f32 f670, f554, 0fBF000000; +mul.f32 f671, f560, 0fBF5DB3D7; +sub.f32 f672, f670, f671; +mul.f32 f673, f560, 0fBF000000; +fma.rn.f32 f674, f554, 0fBF5DB3D7, f673; +mul.f32 f675, f570, 0fBF2B4C25; +mul.f32 f676, f576, 0fBF3E3EBD; +sub.f32 f677, f675, f676; +mul.f32 f678, f576, 0fBF2B4C25; +fma.rn.f32 f679, f570, 0fBF3E3EBD, f678; +mul.f32 f680, f586, 0fBF4F1BBD; +mul.f32 f681, f592, 0fBF167918; +sub.f32 f682, f680, f681; +mul.f32 f683, f592, 0fBF4F1BBD; +fma.rn.f32 f684, f586, 0fBF167918, f683; +mul.f32 f685, f602, 0fBF69DE1D; +mul.f32 f686, f608, 0fBED03FC9; +sub.f32 f687, f685, f686; +mul.f32 f688, f608, 0fBF69DE1D; +fma.rn.f32 f689, f602, 0fBED03FC9, f688; +mul.f32 f690, f618, 0fBF7A67E2; +mul.f32 f691, f624, 0fBE54E6CD; +sub.f32 f692, f690, f691; +mul.f32 f693, f624, 0fBF7A67E2; +fma.rn.f32 f694, f618, 0fBE54E6CD, f693; +add.f32 %1, f296, f548; +add.f32 %0, f294, f546; +add.f32 %3, f312, f629; +add.f32 %2, f310, f627; +add.f32 %5, f328, f634; +add.f32 %4, f326, f632; +add.f32 %7, f344, f639; +add.f32 %6, f342, f637; +add.f32 %9, f360, f644; +add.f32 %8, f358, 
f642; +add.f32 %11, f307, f649; +add.f32 %10, f301, f647; +add.f32 %13, f323, f654; +add.f32 %12, f317, f652; +add.f32 %15, f339, f659; +add.f32 %14, f333, f657; +add.f32 %17, f355, f664; +add.f32 %16, f349, f662; +add.f32 %19, f371, f669; +add.f32 %18, f365, f667; +add.f32 %21, f308, f674; +add.f32 %20, f302, f672; +add.f32 %23, f324, f679; +add.f32 %22, f318, f677; +add.f32 %25, f340, f684; +add.f32 %24, f334, f682; +add.f32 %27, f356, f689; +add.f32 %26, f350, f687; +add.f32 %29, f372, f694; +add.f32 %28, f366, f692; +sub.f32 %31, f296, f548; +sub.f32 %30, f294, f546; +sub.f32 %33, f312, f629; +sub.f32 %32, f310, f627; +sub.f32 %35, f328, f634; +sub.f32 %34, f326, f632; +sub.f32 %37, f344, f639; +sub.f32 %36, f342, f637; +sub.f32 %39, f360, f644; +sub.f32 %38, f358, f642; +sub.f32 %41, f307, f649; +sub.f32 %40, f301, f647; +sub.f32 %43, f323, f654; +sub.f32 %42, f317, f652; +sub.f32 %45, f339, f659; +sub.f32 %44, f333, f657; +sub.f32 %47, f355, f664; +sub.f32 %46, f349, f662; +sub.f32 %49, f371, f669; +sub.f32 %48, f365, f667; +sub.f32 %51, f308, f674; +sub.f32 %50, f302, f672; +sub.f32 %53, f324, f679; +sub.f32 %52, f318, f677; +sub.f32 %55, f340, f684; +sub.f32 %54, f334, f682; +sub.f32 %57, f356, f689; +sub.f32 %56, f350, f687; +sub.f32 %59, f372, f694; +sub.f32 %58, f366, f692; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), 
"=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_inv.hpp.inc new file mode 100644 
index 0000000000000..ad507cea5ee2c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp32_inv.hpp.inc @@ -0,0 +1,638 @@ +#ifndef CUFFTDX_FFT_30_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_30_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<216, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<743>; +.reg .b64 rd<2>; +add.f32 f121, %76, %124; +add.f32 f122, %60, f121; +add.f32 f123, %92, %108; +add.f32 f124, f123, f122; +add.f32 f125, %77, %125; +add.f32 f126, %61, f125; +add.f32 f127, %93, %109; +add.f32 f128, f127, f126; +fma.rn.f32 f129, f121, 0f3E9E377A, %60; +mul.f32 f130, f123, 0f3F4F1BBD; +sub.f32 f131, f129, f130; +sub.f32 f132, %77, %125; +mul.f32 f133, f132, 0f3F737871; +sub.f32 f134, %93, %109; +fma.rn.f32 f135, f134, 0f3F167918, f133; +sub.f32 f136, f131, f135; +add.f32 f137, f135, f131; +mul.f32 f138, f121, 0f3F4F1BBD; +sub.f32 f139, %60, f138; +fma.rn.f32 f140, f123, 0f3E9E377A, f139; +mul.f32 f141, f132, 0f3F167918; +mul.f32 f142, f134, 0f3F737871; +sub.f32 f143, f141, f142; +sub.f32 f144, f140, f143; +add.f32 f145, f143, f140; +fma.rn.f32 f146, f125, 0f3E9E377A, %61; +mul.f32 f147, f127, 0f3F4F1BBD; +sub.f32 f148, f146, f147; +sub.f32 f149, %76, %124; +mul.f32 f150, f149, 0f3F737871; +sub.f32 f151, %92, %108; +fma.rn.f32 f152, f151, 0f3F167918, f150; +add.f32 f153, f152, f148; +sub.f32 f154, f148, f152; +mul.f32 f155, f125, 0f3F4F1BBD; +sub.f32 f156, %61, f155; +fma.rn.f32 f157, f127, 0f3E9E377A, f156; +mul.f32 f158, f149, 0f3F167918; +mul.f32 f159, f151, 0f3F737871; +sub.f32 f160, f158, f159; +add.f32 f161, f160, f157; +sub.f32 f162, f157, f160; +add.f32 f163, %81, %129; +add.f32 f164, %65, f163; +add.f32 f165, %97, %113; +add.f32 f166, f165, f164; +add.f32 f167, %83, %131; +add.f32 f168, %67, f167; +add.f32 f169, %99, %115; +add.f32 f170, f169, f168; +fma.rn.f32 f171, f163, 0f3E9E377A, %65; 
+mul.f32 f172, f165, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %83, %131; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %99, %115; +fma.rn.f32 f177, f176, 0f3F167918, f175; +sub.f32 f178, f173, f177; +add.f32 f179, f177, f173; +mul.f32 f180, f163, 0f3F4F1BBD; +sub.f32 f181, %65, f180; +fma.rn.f32 f182, f165, 0f3E9E377A, f181; +mul.f32 f183, f174, 0f3F167918; +mul.f32 f184, f176, 0f3F737871; +sub.f32 f185, f183, f184; +sub.f32 f186, f182, f185; +add.f32 f187, f185, f182; +fma.rn.f32 f188, f167, 0f3E9E377A, %67; +mul.f32 f189, f169, 0f3F4F1BBD; +sub.f32 f190, f188, f189; +sub.f32 f191, %81, %129; +mul.f32 f192, f191, 0f3F737871; +sub.f32 f193, %97, %113; +fma.rn.f32 f194, f193, 0f3F167918, f192; +add.f32 f195, f194, f190; +sub.f32 f196, f190, f194; +mul.f32 f197, f167, 0f3F4F1BBD; +sub.f32 f198, %67, f197; +fma.rn.f32 f199, f169, 0f3E9E377A, f198; +mul.f32 f200, f191, 0f3F167918; +mul.f32 f201, f193, 0f3F737871; +sub.f32 f202, f200, f201; +add.f32 f203, f202, f199; +sub.f32 f204, f199, f202; +add.f32 f205, %86, %134; +add.f32 f206, %70, f205; +add.f32 f207, %102, %118; +add.f32 f208, f207, f206; +add.f32 f209, %88, %136; +add.f32 f210, %72, f209; +add.f32 f211, %104, %120; +add.f32 f212, f211, f210; +fma.rn.f32 f213, f205, 0f3E9E377A, %70; +mul.f32 f214, f207, 0f3F4F1BBD; +sub.f32 f215, f213, f214; +sub.f32 f216, %88, %136; +mul.f32 f217, f216, 0f3F737871; +sub.f32 f218, %104, %120; +fma.rn.f32 f219, f218, 0f3F167918, f217; +sub.f32 f220, f215, f219; +add.f32 f221, f219, f215; +mul.f32 f222, f205, 0f3F4F1BBD; +sub.f32 f223, %70, f222; +fma.rn.f32 f224, f207, 0f3E9E377A, f223; +mul.f32 f225, f216, 0f3F167918; +mul.f32 f226, f218, 0f3F737871; +sub.f32 f227, f225, f226; +sub.f32 f228, f224, f227; +add.f32 f229, f227, f224; +fma.rn.f32 f230, f209, 0f3E9E377A, %72; +mul.f32 f231, f211, 0f3F4F1BBD; +sub.f32 f232, f230, f231; +sub.f32 f233, %86, %134; +mul.f32 f234, f233, 0f3F737871; +sub.f32 f235, %102, %118; +fma.rn.f32 f236, f235, 0f3F167918, f234; +add.f32 
f237, f236, f232; +sub.f32 f238, f232, f236; +mul.f32 f239, f209, 0f3F4F1BBD; +sub.f32 f240, %72, f239; +fma.rn.f32 f241, f211, 0f3E9E377A, f240; +mul.f32 f242, f233, 0f3F167918; +mul.f32 f243, f235, 0f3F737871; +sub.f32 f244, f242, f243; +add.f32 f245, f244, f241; +sub.f32 f246, f241, f244; +mul.f32 f247, f178, 0f3F69DE1D; +mul.f32 f248, f195, 0f3ED03FC9; +sub.f32 f249, f247, f248; +mul.f32 f250, f195, 0f3F69DE1D; +fma.rn.f32 f251, f178, 0f3ED03FC9, f250; +mul.f32 f252, f220, 0f3F2B4C25; +mul.f32 f253, f237, 0f3F3E3EBD; +sub.f32 f254, f252, f253; +mul.f32 f255, f237, 0f3F2B4C25; +fma.rn.f32 f256, f220, 0f3F3E3EBD, f255; +mul.f32 f257, f186, 0f3F2B4C25; +mul.f32 f258, f203, 0f3F3E3EBD; +sub.f32 f259, f257, f258; +mul.f32 f260, f203, 0f3F2B4C25; +fma.rn.f32 f261, f186, 0f3F3E3EBD, f260; +mul.f32 f262, f228, 0fBDD61305; +mul.f32 f263, f245, 0f3F7E98FD; +sub.f32 f264, f262, f263; +mul.f32 f265, f245, 0fBDD61305; +fma.rn.f32 f266, f228, 0f3F7E98FD, f265; +mul.f32 f267, f187, 0f3E9E377A; +mul.f32 f268, f204, 0f3F737871; +sub.f32 f269, f267, f268; +mul.f32 f270, f204, 0f3E9E377A; +fma.rn.f32 f271, f187, 0f3F737871, f270; +mul.f32 f272, f229, 0fBF4F1BBD; +mul.f32 f273, f246, 0f3F167918; +sub.f32 f274, f272, f273; +mul.f32 f275, f246, 0fBF4F1BBD; +fma.rn.f32 f276, f229, 0f3F167918, f275; +mul.f32 f277, f179, 0fBDD61305; +mul.f32 f278, f196, 0f3F7E98FD; +sub.f32 f279, f277, f278; +mul.f32 f280, f196, 0fBDD61305; +fma.rn.f32 f281, f179, 0f3F7E98FD, f280; +mul.f32 f282, f221, 0fBF7A67E2; +mul.f32 f283, f238, 0fBE54E6CD; +sub.f32 f284, f282, f283; +mul.f32 f285, f238, 0fBF7A67E2; +fma.rn.f32 f286, f221, 0fBE54E6CD, f285; +add.f32 f287, f166, f208; +add.f32 f288, f124, f287; +add.f32 f289, f170, f212; +add.f32 f290, f128, f289; +mul.f32 f291, f287, 0f3F000000; +sub.f32 f292, f124, f291; +sub.f32 f293, f170, f212; +mul.f32 f294, f293, 0fBF5DB3D7; +add.f32 f295, f294, f292; +sub.f32 f296, f292, f294; +mul.f32 f297, f289, 0f3F000000; +sub.f32 f298, f128, f297; +sub.f32 f299, f166, 
f208; +mul.f32 f300, f299, 0fBF5DB3D7; +sub.f32 f301, f298, f300; +add.f32 f302, f300, f298; +add.f32 f303, f249, f254; +add.f32 f304, f136, f303; +add.f32 f305, f251, f256; +add.f32 f306, f153, f305; +mul.f32 f307, f303, 0f3F000000; +sub.f32 f308, f136, f307; +sub.f32 f309, f251, f256; +mul.f32 f310, f309, 0fBF5DB3D7; +add.f32 f311, f310, f308; +sub.f32 f312, f308, f310; +mul.f32 f313, f305, 0f3F000000; +sub.f32 f314, f153, f313; +sub.f32 f315, f249, f254; +mul.f32 f316, f315, 0fBF5DB3D7; +sub.f32 f317, f314, f316; +add.f32 f318, f316, f314; +add.f32 f319, f259, f264; +add.f32 f320, f144, f319; +add.f32 f321, f261, f266; +add.f32 f322, f161, f321; +mul.f32 f323, f319, 0f3F000000; +sub.f32 f324, f144, f323; +sub.f32 f325, f261, f266; +mul.f32 f326, f325, 0fBF5DB3D7; +add.f32 f327, f326, f324; +sub.f32 f328, f324, f326; +mul.f32 f329, f321, 0f3F000000; +sub.f32 f330, f161, f329; +sub.f32 f331, f259, f264; +mul.f32 f332, f331, 0fBF5DB3D7; +sub.f32 f333, f330, f332; +add.f32 f334, f332, f330; +add.f32 f335, f269, f274; +add.f32 f336, f145, f335; +add.f32 f337, f271, f276; +add.f32 f338, f162, f337; +mul.f32 f339, f335, 0f3F000000; +sub.f32 f340, f145, f339; +sub.f32 f341, f271, f276; +mul.f32 f342, f341, 0fBF5DB3D7; +add.f32 f343, f342, f340; +sub.f32 f344, f340, f342; +mul.f32 f345, f337, 0f3F000000; +sub.f32 f346, f162, f345; +sub.f32 f347, f269, f274; +mul.f32 f348, f347, 0fBF5DB3D7; +sub.f32 f349, f346, f348; +add.f32 f350, f348, f346; +add.f32 f351, f279, f284; +add.f32 f352, f137, f351; +add.f32 f353, f281, f286; +add.f32 f354, f154, f353; +mul.f32 f355, f351, 0f3F000000; +sub.f32 f356, f137, f355; +sub.f32 f357, f281, f286; +mul.f32 f358, f357, 0fBF5DB3D7; +add.f32 f359, f358, f356; +sub.f32 f360, f356, f358; +mul.f32 f361, f353, 0f3F000000; +sub.f32 f362, f154, f361; +sub.f32 f363, f279, f284; +mul.f32 f364, f363, 0fBF5DB3D7; +sub.f32 f365, f362, f364; +add.f32 f366, f364, f362; +add.f32 f367, %78, %126; +add.f32 f368, %62, f367; +add.f32 f369, %94, %110; 
+add.f32 f370, f369, f368; +add.f32 f371, %80, %128; +add.f32 f372, %64, f371; +add.f32 f373, %96, %112; +add.f32 f374, f373, f372; +fma.rn.f32 f375, f367, 0f3E9E377A, %62; +mul.f32 f376, f369, 0f3F4F1BBD; +sub.f32 f377, f375, f376; +sub.f32 f378, %80, %128; +mul.f32 f379, f378, 0f3F737871; +sub.f32 f380, %96, %112; +fma.rn.f32 f381, f380, 0f3F167918, f379; +sub.f32 f382, f377, f381; +add.f32 f383, f381, f377; +mul.f32 f384, f367, 0f3F4F1BBD; +sub.f32 f385, %62, f384; +fma.rn.f32 f386, f369, 0f3E9E377A, f385; +mul.f32 f387, f378, 0f3F167918; +mul.f32 f388, f380, 0f3F737871; +sub.f32 f389, f387, f388; +sub.f32 f390, f386, f389; +add.f32 f391, f389, f386; +fma.rn.f32 f392, f371, 0f3E9E377A, %64; +mul.f32 f393, f373, 0f3F4F1BBD; +sub.f32 f394, f392, f393; +sub.f32 f395, %78, %126; +mul.f32 f396, f395, 0f3F737871; +sub.f32 f397, %94, %110; +fma.rn.f32 f398, f397, 0f3F167918, f396; +add.f32 f399, f398, f394; +sub.f32 f400, f394, f398; +mul.f32 f401, f371, 0f3F4F1BBD; +sub.f32 f402, %64, f401; +fma.rn.f32 f403, f373, 0f3E9E377A, f402; +mul.f32 f404, f395, 0f3F167918; +mul.f32 f405, f397, 0f3F737871; +sub.f32 f406, f404, f405; +add.f32 f407, f406, f403; +sub.f32 f408, f403, f406; +add.f32 f409, %84, %132; +add.f32 f410, %68, f409; +add.f32 f411, %100, %116; +add.f32 f412, f411, f410; +add.f32 f413, %85, %133; +add.f32 f414, %69, f413; +add.f32 f415, %101, %117; +add.f32 f416, f415, f414; +fma.rn.f32 f417, f409, 0f3E9E377A, %68; +mul.f32 f418, f411, 0f3F4F1BBD; +sub.f32 f419, f417, f418; +sub.f32 f420, %85, %133; +mul.f32 f421, f420, 0f3F737871; +sub.f32 f422, %101, %117; +fma.rn.f32 f423, f422, 0f3F167918, f421; +sub.f32 f424, f419, f423; +add.f32 f425, f423, f419; +mul.f32 f426, f409, 0f3F4F1BBD; +sub.f32 f427, %68, f426; +fma.rn.f32 f428, f411, 0f3E9E377A, f427; +mul.f32 f429, f420, 0f3F167918; +mul.f32 f430, f422, 0f3F737871; +sub.f32 f431, f429, f430; +sub.f32 f432, f428, f431; +add.f32 f433, f431, f428; +fma.rn.f32 f434, f413, 0f3E9E377A, %69; +mul.f32 f435, f415, 
0f3F4F1BBD; +sub.f32 f436, f434, f435; +sub.f32 f437, %84, %132; +mul.f32 f438, f437, 0f3F737871; +sub.f32 f439, %100, %116; +fma.rn.f32 f440, f439, 0f3F167918, f438; +add.f32 f441, f440, f436; +sub.f32 f442, f436, f440; +mul.f32 f443, f413, 0f3F4F1BBD; +sub.f32 f444, %69, f443; +fma.rn.f32 f445, f415, 0f3E9E377A, f444; +mul.f32 f446, f437, 0f3F167918; +mul.f32 f447, f439, 0f3F737871; +sub.f32 f448, f446, f447; +add.f32 f449, f448, f445; +sub.f32 f450, f445, f448; +add.f32 f451, %89, %137; +add.f32 f452, %73, f451; +add.f32 f453, %105, %121; +add.f32 f454, f453, f452; +add.f32 f455, %91, %138; +add.f32 f456, %75, f455; +add.f32 f457, %107, %123; +add.f32 f458, f457, f456; +fma.rn.f32 f459, f451, 0f3E9E377A, %73; +mul.f32 f460, f453, 0f3F4F1BBD; +sub.f32 f461, f459, f460; +sub.f32 f462, %91, %138; +mul.f32 f463, f462, 0f3F737871; +sub.f32 f464, %107, %123; +fma.rn.f32 f465, f464, 0f3F167918, f463; +sub.f32 f466, f461, f465; +add.f32 f467, f465, f461; +mul.f32 f468, f451, 0f3F4F1BBD; +sub.f32 f469, %73, f468; +fma.rn.f32 f470, f453, 0f3E9E377A, f469; +mul.f32 f471, f462, 0f3F167918; +mul.f32 f472, f464, 0f3F737871; +sub.f32 f473, f471, f472; +sub.f32 f474, f470, f473; +add.f32 f475, f473, f470; +fma.rn.f32 f476, f455, 0f3E9E377A, %75; +mul.f32 f477, f457, 0f3F4F1BBD; +sub.f32 f478, f476, f477; +sub.f32 f479, %89, %137; +mul.f32 f480, f479, 0f3F737871; +sub.f32 f481, %105, %121; +fma.rn.f32 f482, f481, 0f3F167918, f480; +add.f32 f483, f482, f478; +sub.f32 f484, f478, f482; +mul.f32 f485, f455, 0f3F4F1BBD; +sub.f32 f486, %75, f485; +fma.rn.f32 f487, f457, 0f3E9E377A, f486; +mul.f32 f488, f479, 0f3F167918; +mul.f32 f489, f481, 0f3F737871; +sub.f32 f490, f488, f489; +add.f32 f491, f490, f487; +sub.f32 f492, f487, f490; +mul.f32 f493, f424, 0f3F69DE1D; +mul.f32 f494, f441, 0f3ED03FC9; +sub.f32 f495, f493, f494; +mul.f32 f496, f441, 0f3F69DE1D; +fma.rn.f32 f497, f424, 0f3ED03FC9, f496; +mul.f32 f498, f466, 0f3F2B4C25; +mul.f32 f499, f483, 0f3F3E3EBD; +sub.f32 f500, f498, 
f499; +mul.f32 f501, f483, 0f3F2B4C25; +fma.rn.f32 f502, f466, 0f3F3E3EBD, f501; +mul.f32 f503, f432, 0f3F2B4C25; +mul.f32 f504, f449, 0f3F3E3EBD; +sub.f32 f505, f503, f504; +mul.f32 f506, f449, 0f3F2B4C25; +fma.rn.f32 f507, f432, 0f3F3E3EBD, f506; +mul.f32 f508, f474, 0fBDD61305; +mul.f32 f509, f491, 0f3F7E98FD; +sub.f32 f510, f508, f509; +mul.f32 f511, f491, 0fBDD61305; +fma.rn.f32 f512, f474, 0f3F7E98FD, f511; +mul.f32 f513, f433, 0f3E9E377A; +mul.f32 f514, f450, 0f3F737871; +sub.f32 f515, f513, f514; +mul.f32 f516, f450, 0f3E9E377A; +fma.rn.f32 f517, f433, 0f3F737871, f516; +mul.f32 f518, f475, 0fBF4F1BBD; +mul.f32 f519, f492, 0f3F167918; +sub.f32 f520, f518, f519; +mul.f32 f521, f492, 0fBF4F1BBD; +fma.rn.f32 f522, f475, 0f3F167918, f521; +mul.f32 f523, f425, 0fBDD61305; +mul.f32 f524, f442, 0f3F7E98FD; +sub.f32 f525, f523, f524; +mul.f32 f526, f442, 0fBDD61305; +fma.rn.f32 f527, f425, 0f3F7E98FD, f526; +mul.f32 f528, f467, 0fBF7A67E2; +mul.f32 f529, f484, 0fBE54E6CD; +sub.f32 f530, f528, f529; +mul.f32 f531, f484, 0fBF7A67E2; +fma.rn.f32 f532, f467, 0fBE54E6CD, f531; +add.f32 f533, f412, f454; +add.f32 f534, f370, f533; +add.f32 f535, f416, f458; +add.f32 f536, f374, f535; +mul.f32 f537, f533, 0f3F000000; +sub.f32 f538, f370, f537; +sub.f32 f539, f416, f458; +mul.f32 f540, f539, 0fBF5DB3D7; +add.f32 f541, f540, f538; +sub.f32 f542, f538, f540; +mul.f32 f543, f535, 0f3F000000; +sub.f32 f544, f374, f543; +sub.f32 f545, f412, f454; +mul.f32 f546, f545, 0fBF5DB3D7; +sub.f32 f547, f544, f546; +add.f32 f548, f546, f544; +add.f32 f549, f495, f500; +add.f32 f550, f382, f549; +add.f32 f551, f497, f502; +add.f32 f552, f399, f551; +mul.f32 f553, f549, 0f3F000000; +sub.f32 f554, f382, f553; +sub.f32 f555, f497, f502; +mul.f32 f556, f555, 0fBF5DB3D7; +add.f32 f557, f556, f554; +sub.f32 f558, f554, f556; +mul.f32 f559, f551, 0f3F000000; +sub.f32 f560, f399, f559; +sub.f32 f561, f495, f500; +mul.f32 f562, f561, 0fBF5DB3D7; +sub.f32 f563, f560, f562; +add.f32 f564, f562, 
f560; +add.f32 f565, f505, f510; +add.f32 f566, f390, f565; +add.f32 f567, f507, f512; +add.f32 f568, f407, f567; +mul.f32 f569, f565, 0f3F000000; +sub.f32 f570, f390, f569; +sub.f32 f571, f507, f512; +mul.f32 f572, f571, 0fBF5DB3D7; +add.f32 f573, f572, f570; +sub.f32 f574, f570, f572; +mul.f32 f575, f567, 0f3F000000; +sub.f32 f576, f407, f575; +sub.f32 f577, f505, f510; +mul.f32 f578, f577, 0fBF5DB3D7; +sub.f32 f579, f576, f578; +add.f32 f580, f578, f576; +add.f32 f581, f515, f520; +add.f32 f582, f391, f581; +add.f32 f583, f517, f522; +add.f32 f584, f408, f583; +mul.f32 f585, f581, 0f3F000000; +sub.f32 f586, f391, f585; +sub.f32 f587, f517, f522; +mul.f32 f588, f587, 0fBF5DB3D7; +add.f32 f589, f588, f586; +sub.f32 f590, f586, f588; +mul.f32 f591, f583, 0f3F000000; +sub.f32 f592, f408, f591; +sub.f32 f593, f515, f520; +mul.f32 f594, f593, 0fBF5DB3D7; +sub.f32 f595, f592, f594; +add.f32 f596, f594, f592; +add.f32 f597, f525, f530; +add.f32 f598, f383, f597; +add.f32 f599, f527, f532; +add.f32 f600, f400, f599; +mul.f32 f601, f597, 0f3F000000; +sub.f32 f602, f383, f601; +sub.f32 f603, f527, f532; +mul.f32 f604, f603, 0fBF5DB3D7; +add.f32 f605, f604, f602; +sub.f32 f606, f602, f604; +mul.f32 f607, f599, 0f3F000000; +sub.f32 f608, f400, f607; +sub.f32 f609, f525, f530; +mul.f32 f610, f609, 0fBF5DB3D7; +sub.f32 f611, f608, f610; +add.f32 f612, f610, f608; +mul.f32 f613, f550, 0f3F7A67E2; +mul.f32 f614, f552, 0f3E54E6CD; +sub.f32 f615, f613, f614; +mul.f32 f616, f552, 0f3F7A67E2; +fma.rn.f32 f617, f550, 0f3E54E6CD, f616; +mul.f32 f618, f566, 0f3F69DE1D; +mul.f32 f619, f568, 0f3ED03FC9; +sub.f32 f620, f618, f619; +mul.f32 f621, f568, 0f3F69DE1D; +fma.rn.f32 f622, f566, 0f3ED03FC9, f621; +mul.f32 f623, f582, 0f3F4F1BBD; +mul.f32 f624, f584, 0f3F167918; +sub.f32 f625, f623, f624; +mul.f32 f626, f584, 0f3F4F1BBD; +fma.rn.f32 f627, f582, 0f3F167918, f626; +mul.f32 f628, f598, 0f3F2B4C25; +mul.f32 f629, f600, 0f3F3E3EBD; +sub.f32 f630, f628, f629; +mul.f32 f631, f600, 
0f3F2B4C25; +fma.rn.f32 f632, f598, 0f3F3E3EBD, f631; +mul.f32 f633, f541, 0f3F000000; +mul.f32 f634, f547, 0f3F5DB3D7; +sub.f32 f635, f633, f634; +mul.f32 f636, f547, 0f3F000000; +fma.rn.f32 f637, f541, 0f3F5DB3D7, f636; +mul.f32 f638, f557, 0f3E9E377A; +mul.f32 f639, f563, 0f3F737871; +sub.f32 f640, f638, f639; +mul.f32 f641, f563, 0f3E9E377A; +fma.rn.f32 f642, f557, 0f3F737871, f641; +mul.f32 f643, f573, 0f3DD61305; +mul.f32 f644, f579, 0f3F7E98FD; +sub.f32 f645, f643, f644; +mul.f32 f646, f579, 0f3DD61305; +fma.rn.f32 f647, f573, 0f3F7E98FD, f646; +mul.f32 f648, f589, 0fBDD61305; +mul.f32 f649, f595, 0f3F7E98FD; +sub.f32 f650, f648, f649; +mul.f32 f651, f595, 0fBDD61305; +fma.rn.f32 f652, f589, 0f3F7E98FD, f651; +mul.f32 f653, f605, 0fBE9E377A; +mul.f32 f654, f611, 0f3F737871; +sub.f32 f655, f653, f654; +mul.f32 f656, f611, 0fBE9E377A; +fma.rn.f32 f657, f605, 0f3F737871, f656; +mul.f32 f658, f542, 0fBF000000; +mul.f32 f659, f548, 0f3F5DB3D7; +sub.f32 f660, f658, f659; +mul.f32 f661, f548, 0fBF000000; +fma.rn.f32 f662, f542, 0f3F5DB3D7, f661; +mul.f32 f663, f558, 0fBF2B4C25; +mul.f32 f664, f564, 0f3F3E3EBD; +sub.f32 f665, f663, f664; +mul.f32 f666, f564, 0fBF2B4C25; +fma.rn.f32 f667, f558, 0f3F3E3EBD, f666; +mul.f32 f668, f574, 0fBF4F1BBD; +mul.f32 f669, f580, 0f3F167918; +sub.f32 f670, f668, f669; +mul.f32 f671, f580, 0fBF4F1BBD; +fma.rn.f32 f672, f574, 0f3F167918, f671; +mul.f32 f673, f590, 0fBF69DE1D; +mul.f32 f674, f596, 0f3ED03FC9; +sub.f32 f675, f673, f674; +mul.f32 f676, f596, 0fBF69DE1D; +fma.rn.f32 f677, f590, 0f3ED03FC9, f676; +mul.f32 f678, f606, 0fBF7A67E2; +mul.f32 f679, f612, 0f3E54E6CD; +sub.f32 f680, f678, f679; +mul.f32 f681, f612, 0fBF7A67E2; +fma.rn.f32 f682, f606, 0f3E54E6CD, f681; +add.f32 %1, f290, f536; +add.f32 %0, f288, f534; +add.f32 %3, f306, f617; +add.f32 %2, f304, f615; +add.f32 %5, f322, f622; +add.f32 %4, f320, f620; +add.f32 %7, f338, f627; +add.f32 %6, f336, f625; +add.f32 %9, f354, f632; +add.f32 %8, f352, f630; +add.f32 %11, 
f301, f637; +add.f32 %10, f295, f635; +add.f32 %13, f317, f642; +add.f32 %12, f311, f640; +add.f32 %15, f333, f647; +add.f32 %14, f327, f645; +add.f32 %17, f349, f652; +add.f32 %16, f343, f650; +add.f32 %19, f365, f657; +add.f32 %18, f359, f655; +add.f32 %21, f302, f662; +add.f32 %20, f296, f660; +add.f32 %23, f318, f667; +add.f32 %22, f312, f665; +add.f32 %25, f334, f672; +add.f32 %24, f328, f670; +add.f32 %27, f350, f677; +add.f32 %26, f344, f675; +add.f32 %29, f366, f682; +add.f32 %28, f360, f680; +sub.f32 %31, f290, f536; +sub.f32 %30, f288, f534; +sub.f32 %33, f306, f617; +sub.f32 %32, f304, f615; +sub.f32 %35, f322, f622; +sub.f32 %34, f320, f620; +sub.f32 %37, f338, f627; +sub.f32 %36, f336, f625; +sub.f32 %39, f354, f632; +sub.f32 %38, f352, f630; +sub.f32 %41, f301, f637; +sub.f32 %40, f295, f635; +sub.f32 %43, f317, f642; +sub.f32 %42, f311, f640; +sub.f32 %45, f333, f647; +sub.f32 %44, f327, f645; +sub.f32 %47, f349, f652; +sub.f32 %46, f343, f650; +sub.f32 %49, f365, f657; +sub.f32 %48, f359, f655; +sub.f32 %51, f302, f662; +sub.f32 %50, f296, f660; +sub.f32 %53, f318, f667; +sub.f32 %52, f312, f665; +sub.f32 %55, f334, f672; +sub.f32 %54, f328, f670; +sub.f32 %57, f350, f677; +sub.f32 %56, f344, f675; +sub.f32 %59, f366, f682; +sub.f32 %58, f360, f680; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), 
"=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_fwd.hpp.inc new file mode 100644 index 
0000000000000..d496588466c8c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_fwd.hpp.inc @@ -0,0 +1,650 @@ +#ifndef CUFFTDX_FFT_30_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_30_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<418, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<755>; +.reg .b64 rd<2>; +add.f64 fd121, %76, %124; +add.f64 fd122, %60, fd121; +add.f64 fd123, %92, %108; +add.f64 fd124, fd123, fd122; +add.f64 fd125, %77, %125; +add.f64 fd126, %61, fd125; +add.f64 fd127, %93, %109; +add.f64 fd128, fd127, fd126; +fma.rn.f64 fd129, fd121, 0d3FD3C6EF372FE950, %60; +mul.f64 fd130, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, %77, %125; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, %93, %109; +mul.f64 fd135, fd134, 0dBFE2CF2304755A5E; +sub.f64 fd136, fd135, fd133; +sub.f64 fd137, fd131, fd136; +add.f64 fd138, fd136, fd131; +mul.f64 fd139, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd140, %60, fd139; +fma.rn.f64 fd141, fd123, 0d3FD3C6EF372FE950, fd140; +mul.f64 fd142, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd143, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd143, fd142; +sub.f64 fd145, fd141, fd144; +add.f64 fd146, fd144, fd141; +fma.rn.f64 fd147, fd125, 0d3FD3C6EF372FE950, %61; +mul.f64 fd148, fd127, 0d3FE9E3779B97F4A8; +sub.f64 fd149, fd147, fd148; +sub.f64 fd150, %76, %124; +mul.f64 fd151, fd150, 0d3FEE6F0E134454FF; +sub.f64 fd152, %92, %108; +mul.f64 fd153, fd152, 0dBFE2CF2304755A5E; +sub.f64 fd154, fd153, fd151; +add.f64 fd155, fd154, fd149; +sub.f64 fd156, fd149, fd154; +mul.f64 fd157, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd158, %61, fd157; +fma.rn.f64 fd159, fd127, 0d3FD3C6EF372FE950, fd158; +mul.f64 fd160, fd150, 0d3FE2CF2304755A5E; +mul.f64 fd161, fd152, 0d3FEE6F0E134454FF; +sub.f64 fd162, fd161, fd160; +add.f64 fd163, fd162, fd159; +sub.f64 fd164, fd159, 
fd162; +add.f64 fd165, %81, %129; +add.f64 fd166, %65, fd165; +add.f64 fd167, %97, %113; +add.f64 fd168, fd167, fd166; +add.f64 fd169, %83, %131; +add.f64 fd170, %67, fd169; +add.f64 fd171, %99, %115; +add.f64 fd172, fd171, fd170; +fma.rn.f64 fd173, fd165, 0d3FD3C6EF372FE950, %65; +mul.f64 fd174, fd167, 0d3FE9E3779B97F4A8; +sub.f64 fd175, fd173, fd174; +sub.f64 fd176, %83, %131; +mul.f64 fd177, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd178, %99, %115; +mul.f64 fd179, fd178, 0dBFE2CF2304755A5E; +sub.f64 fd180, fd179, fd177; +sub.f64 fd181, fd175, fd180; +add.f64 fd182, fd180, fd175; +mul.f64 fd183, fd165, 0d3FE9E3779B97F4A8; +sub.f64 fd184, %65, fd183; +fma.rn.f64 fd185, fd167, 0d3FD3C6EF372FE950, fd184; +mul.f64 fd186, fd176, 0d3FE2CF2304755A5E; +mul.f64 fd187, fd178, 0d3FEE6F0E134454FF; +sub.f64 fd188, fd187, fd186; +sub.f64 fd189, fd185, fd188; +add.f64 fd190, fd188, fd185; +fma.rn.f64 fd191, fd169, 0d3FD3C6EF372FE950, %67; +mul.f64 fd192, fd171, 0d3FE9E3779B97F4A8; +sub.f64 fd193, fd191, fd192; +sub.f64 fd194, %81, %129; +mul.f64 fd195, fd194, 0d3FEE6F0E134454FF; +sub.f64 fd196, %97, %113; +mul.f64 fd197, fd196, 0dBFE2CF2304755A5E; +sub.f64 fd198, fd197, fd195; +add.f64 fd199, fd198, fd193; +sub.f64 fd200, fd193, fd198; +mul.f64 fd201, fd169, 0d3FE9E3779B97F4A8; +sub.f64 fd202, %67, fd201; +fma.rn.f64 fd203, fd171, 0d3FD3C6EF372FE950, fd202; +mul.f64 fd204, fd194, 0d3FE2CF2304755A5E; +mul.f64 fd205, fd196, 0d3FEE6F0E134454FF; +sub.f64 fd206, fd205, fd204; +add.f64 fd207, fd206, fd203; +sub.f64 fd208, fd203, fd206; +add.f64 fd209, %86, %134; +add.f64 fd210, %70, fd209; +add.f64 fd211, %102, %118; +add.f64 fd212, fd211, fd210; +add.f64 fd213, %88, %136; +add.f64 fd214, %72, fd213; +add.f64 fd215, %104, %120; +add.f64 fd216, fd215, fd214; +fma.rn.f64 fd217, fd209, 0d3FD3C6EF372FE950, %70; +mul.f64 fd218, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd219, fd217, fd218; +sub.f64 fd220, %88, %136; +mul.f64 fd221, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd222, %104, %120; +mul.f64 
fd223, fd222, 0dBFE2CF2304755A5E; +sub.f64 fd224, fd223, fd221; +sub.f64 fd225, fd219, fd224; +add.f64 fd226, fd224, fd219; +mul.f64 fd227, fd209, 0d3FE9E3779B97F4A8; +sub.f64 fd228, %70, fd227; +fma.rn.f64 fd229, fd211, 0d3FD3C6EF372FE950, fd228; +mul.f64 fd230, fd220, 0d3FE2CF2304755A5E; +mul.f64 fd231, fd222, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd231, fd230; +sub.f64 fd233, fd229, fd232; +add.f64 fd234, fd232, fd229; +fma.rn.f64 fd235, fd213, 0d3FD3C6EF372FE950, %72; +mul.f64 fd236, fd215, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, %86, %134; +mul.f64 fd239, fd238, 0d3FEE6F0E134454FF; +sub.f64 fd240, %102, %118; +mul.f64 fd241, fd240, 0dBFE2CF2304755A5E; +sub.f64 fd242, fd241, fd239; +add.f64 fd243, fd242, fd237; +sub.f64 fd244, fd237, fd242; +mul.f64 fd245, fd213, 0d3FE9E3779B97F4A8; +sub.f64 fd246, %72, fd245; +fma.rn.f64 fd247, fd215, 0d3FD3C6EF372FE950, fd246; +mul.f64 fd248, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd249, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd250, fd249, fd248; +add.f64 fd251, fd250, fd247; +sub.f64 fd252, fd247, fd250; +mul.f64 fd253, fd181, 0d3FED3BC3AEFF7F95; +mul.f64 fd254, fd199, 0dBFDA07F921061AD1; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd199, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd257, fd181, 0dBFDA07F921061AD1, fd256; +mul.f64 fd258, fd225, 0d3FE5698496E20BD8; +mul.f64 fd259, fd243, 0dBFE7C7D7A833BEC2; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd243, 0d3FE5698496E20BD8; +fma.rn.f64 fd262, fd225, 0dBFE7C7D7A833BEC2, fd261; +mul.f64 fd263, fd189, 0d3FE5698496E20BD8; +mul.f64 fd264, fd207, 0dBFE7C7D7A833BEC2; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd207, 0d3FE5698496E20BD8; +fma.rn.f64 fd267, fd189, 0dBFE7C7D7A833BEC2, fd266; +mul.f64 fd268, fd233, 0dBFBAC2609B3C576C; +mul.f64 fd269, fd251, 0dBFEFD31F94F867C6; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd251, 0dBFBAC2609B3C576C; +fma.rn.f64 fd272, fd233, 0dBFEFD31F94F867C6, fd271; +mul.f64 fd273, fd190, 0d3FD3C6EF372FE950; +mul.f64 fd274, fd208, 
0dBFEE6F0E134454FF; +sub.f64 fd275, fd273, fd274; +mul.f64 fd276, fd208, 0d3FD3C6EF372FE950; +fma.rn.f64 fd277, fd190, 0dBFEE6F0E134454FF, fd276; +mul.f64 fd278, fd234, 0dBFE9E3779B97F4A8; +mul.f64 fd279, fd252, 0dBFE2CF2304755A5E; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd252, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd282, fd234, 0dBFE2CF2304755A5E, fd281; +mul.f64 fd283, fd182, 0dBFBAC2609B3C576C; +mul.f64 fd284, fd200, 0dBFEFD31F94F867C6; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd200, 0dBFBAC2609B3C576C; +fma.rn.f64 fd287, fd182, 0dBFEFD31F94F867C6, fd286; +mul.f64 fd288, fd226, 0dBFEF4CFC327A0080; +mul.f64 fd289, fd244, 0d3FCA9CD9AC4258F6; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd244, 0dBFEF4CFC327A0080; +fma.rn.f64 fd292, fd226, 0d3FCA9CD9AC4258F6, fd291; +add.f64 fd293, fd168, fd212; +add.f64 fd294, fd124, fd293; +add.f64 fd295, fd172, fd216; +add.f64 fd296, fd128, fd295; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd124, fd297; +sub.f64 fd299, fd172, fd216; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +mul.f64 fd303, fd295, 0d3FE0000000000000; +sub.f64 fd304, fd128, fd303; +sub.f64 fd305, fd168, fd212; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd255, fd260; +add.f64 fd310, fd137, fd309; +add.f64 fd311, fd257, fd262; +add.f64 fd312, fd155, fd311; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd137, fd313; +sub.f64 fd315, fd257, fd262; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +mul.f64 fd319, fd311, 0d3FE0000000000000; +sub.f64 fd320, fd155, fd319; +sub.f64 fd321, fd255, fd260; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd265, fd270; +add.f64 fd326, fd145, fd325; +add.f64 fd327, fd267, fd272; +add.f64 fd328, fd163, fd327; +mul.f64 fd329, fd325, 
0d3FE0000000000000; +sub.f64 fd330, fd145, fd329; +sub.f64 fd331, fd267, fd272; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +mul.f64 fd335, fd327, 0d3FE0000000000000; +sub.f64 fd336, fd163, fd335; +sub.f64 fd337, fd265, fd270; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, fd275, fd280; +add.f64 fd342, fd146, fd341; +add.f64 fd343, fd277, fd282; +add.f64 fd344, fd164, fd343; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, fd146, fd345; +sub.f64 fd347, fd277, fd282; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +mul.f64 fd351, fd343, 0d3FE0000000000000; +sub.f64 fd352, fd164, fd351; +sub.f64 fd353, fd275, fd280; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, fd285, fd290; +add.f64 fd358, fd138, fd357; +add.f64 fd359, fd287, fd292; +add.f64 fd360, fd156, fd359; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, fd138, fd361; +sub.f64 fd363, fd287, fd292; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +mul.f64 fd367, fd359, 0d3FE0000000000000; +sub.f64 fd368, fd156, fd367; +sub.f64 fd369, fd285, fd290; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %78, %126; +add.f64 fd374, %62, fd373; +add.f64 fd375, %94, %110; +add.f64 fd376, fd375, fd374; +add.f64 fd377, %80, %128; +add.f64 fd378, %64, fd377; +add.f64 fd379, %96, %112; +add.f64 fd380, fd379, fd378; +fma.rn.f64 fd381, fd373, 0d3FD3C6EF372FE950, %62; +mul.f64 fd382, fd375, 0d3FE9E3779B97F4A8; +sub.f64 fd383, fd381, fd382; +sub.f64 fd384, %80, %128; +mul.f64 fd385, fd384, 0d3FEE6F0E134454FF; +sub.f64 fd386, %96, %112; +mul.f64 fd387, fd386, 0dBFE2CF2304755A5E; +sub.f64 fd388, fd387, fd385; +sub.f64 fd389, fd383, 
fd388; +add.f64 fd390, fd388, fd383; +mul.f64 fd391, fd373, 0d3FE9E3779B97F4A8; +sub.f64 fd392, %62, fd391; +fma.rn.f64 fd393, fd375, 0d3FD3C6EF372FE950, fd392; +mul.f64 fd394, fd384, 0d3FE2CF2304755A5E; +mul.f64 fd395, fd386, 0d3FEE6F0E134454FF; +sub.f64 fd396, fd395, fd394; +sub.f64 fd397, fd393, fd396; +add.f64 fd398, fd396, fd393; +fma.rn.f64 fd399, fd377, 0d3FD3C6EF372FE950, %64; +mul.f64 fd400, fd379, 0d3FE9E3779B97F4A8; +sub.f64 fd401, fd399, fd400; +sub.f64 fd402, %78, %126; +mul.f64 fd403, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd404, %94, %110; +mul.f64 fd405, fd404, 0dBFE2CF2304755A5E; +sub.f64 fd406, fd405, fd403; +add.f64 fd407, fd406, fd401; +sub.f64 fd408, fd401, fd406; +mul.f64 fd409, fd377, 0d3FE9E3779B97F4A8; +sub.f64 fd410, %64, fd409; +fma.rn.f64 fd411, fd379, 0d3FD3C6EF372FE950, fd410; +mul.f64 fd412, fd402, 0d3FE2CF2304755A5E; +mul.f64 fd413, fd404, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd413, fd412; +add.f64 fd415, fd414, fd411; +sub.f64 fd416, fd411, fd414; +add.f64 fd417, %84, %132; +add.f64 fd418, %68, fd417; +add.f64 fd419, %100, %116; +add.f64 fd420, fd419, fd418; +add.f64 fd421, %85, %133; +add.f64 fd422, %69, fd421; +add.f64 fd423, %101, %117; +add.f64 fd424, fd423, fd422; +fma.rn.f64 fd425, fd417, 0d3FD3C6EF372FE950, %68; +mul.f64 fd426, fd419, 0d3FE9E3779B97F4A8; +sub.f64 fd427, fd425, fd426; +sub.f64 fd428, %85, %133; +mul.f64 fd429, fd428, 0d3FEE6F0E134454FF; +sub.f64 fd430, %101, %117; +mul.f64 fd431, fd430, 0dBFE2CF2304755A5E; +sub.f64 fd432, fd431, fd429; +sub.f64 fd433, fd427, fd432; +add.f64 fd434, fd432, fd427; +mul.f64 fd435, fd417, 0d3FE9E3779B97F4A8; +sub.f64 fd436, %68, fd435; +fma.rn.f64 fd437, fd419, 0d3FD3C6EF372FE950, fd436; +mul.f64 fd438, fd428, 0d3FE2CF2304755A5E; +mul.f64 fd439, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd440, fd439, fd438; +sub.f64 fd441, fd437, fd440; +add.f64 fd442, fd440, fd437; +fma.rn.f64 fd443, fd421, 0d3FD3C6EF372FE950, %69; +mul.f64 fd444, fd423, 0d3FE9E3779B97F4A8; +sub.f64 fd445, fd443, fd444; 
+sub.f64 fd446, %84, %132; +mul.f64 fd447, fd446, 0d3FEE6F0E134454FF; +sub.f64 fd448, %100, %116; +mul.f64 fd449, fd448, 0dBFE2CF2304755A5E; +sub.f64 fd450, fd449, fd447; +add.f64 fd451, fd450, fd445; +sub.f64 fd452, fd445, fd450; +mul.f64 fd453, fd421, 0d3FE9E3779B97F4A8; +sub.f64 fd454, %69, fd453; +fma.rn.f64 fd455, fd423, 0d3FD3C6EF372FE950, fd454; +mul.f64 fd456, fd446, 0d3FE2CF2304755A5E; +mul.f64 fd457, fd448, 0d3FEE6F0E134454FF; +sub.f64 fd458, fd457, fd456; +add.f64 fd459, fd458, fd455; +sub.f64 fd460, fd455, fd458; +add.f64 fd461, %89, %137; +add.f64 fd462, %73, fd461; +add.f64 fd463, %105, %121; +add.f64 fd464, fd463, fd462; +add.f64 fd465, %91, %138; +add.f64 fd466, %75, fd465; +add.f64 fd467, %107, %123; +add.f64 fd468, fd467, fd466; +fma.rn.f64 fd469, fd461, 0d3FD3C6EF372FE950, %73; +mul.f64 fd470, fd463, 0d3FE9E3779B97F4A8; +sub.f64 fd471, fd469, fd470; +sub.f64 fd472, %91, %138; +mul.f64 fd473, fd472, 0d3FEE6F0E134454FF; +sub.f64 fd474, %107, %123; +mul.f64 fd475, fd474, 0dBFE2CF2304755A5E; +sub.f64 fd476, fd475, fd473; +sub.f64 fd477, fd471, fd476; +add.f64 fd478, fd476, fd471; +mul.f64 fd479, fd461, 0d3FE9E3779B97F4A8; +sub.f64 fd480, %73, fd479; +fma.rn.f64 fd481, fd463, 0d3FD3C6EF372FE950, fd480; +mul.f64 fd482, fd472, 0d3FE2CF2304755A5E; +mul.f64 fd483, fd474, 0d3FEE6F0E134454FF; +sub.f64 fd484, fd483, fd482; +sub.f64 fd485, fd481, fd484; +add.f64 fd486, fd484, fd481; +fma.rn.f64 fd487, fd465, 0d3FD3C6EF372FE950, %75; +mul.f64 fd488, fd467, 0d3FE9E3779B97F4A8; +sub.f64 fd489, fd487, fd488; +sub.f64 fd490, %89, %137; +mul.f64 fd491, fd490, 0d3FEE6F0E134454FF; +sub.f64 fd492, %105, %121; +mul.f64 fd493, fd492, 0dBFE2CF2304755A5E; +sub.f64 fd494, fd493, fd491; +add.f64 fd495, fd494, fd489; +sub.f64 fd496, fd489, fd494; +mul.f64 fd497, fd465, 0d3FE9E3779B97F4A8; +sub.f64 fd498, %75, fd497; +fma.rn.f64 fd499, fd467, 0d3FD3C6EF372FE950, fd498; +mul.f64 fd500, fd490, 0d3FE2CF2304755A5E; +mul.f64 fd501, fd492, 0d3FEE6F0E134454FF; +sub.f64 fd502, fd501, 
fd500; +add.f64 fd503, fd502, fd499; +sub.f64 fd504, fd499, fd502; +mul.f64 fd505, fd433, 0d3FED3BC3AEFF7F95; +mul.f64 fd506, fd451, 0dBFDA07F921061AD1; +sub.f64 fd507, fd505, fd506; +mul.f64 fd508, fd451, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd509, fd433, 0dBFDA07F921061AD1, fd508; +mul.f64 fd510, fd477, 0d3FE5698496E20BD8; +mul.f64 fd511, fd495, 0dBFE7C7D7A833BEC2; +sub.f64 fd512, fd510, fd511; +mul.f64 fd513, fd495, 0d3FE5698496E20BD8; +fma.rn.f64 fd514, fd477, 0dBFE7C7D7A833BEC2, fd513; +mul.f64 fd515, fd441, 0d3FE5698496E20BD8; +mul.f64 fd516, fd459, 0dBFE7C7D7A833BEC2; +sub.f64 fd517, fd515, fd516; +mul.f64 fd518, fd459, 0d3FE5698496E20BD8; +fma.rn.f64 fd519, fd441, 0dBFE7C7D7A833BEC2, fd518; +mul.f64 fd520, fd485, 0dBFBAC2609B3C576C; +mul.f64 fd521, fd503, 0dBFEFD31F94F867C6; +sub.f64 fd522, fd520, fd521; +mul.f64 fd523, fd503, 0dBFBAC2609B3C576C; +fma.rn.f64 fd524, fd485, 0dBFEFD31F94F867C6, fd523; +mul.f64 fd525, fd442, 0d3FD3C6EF372FE950; +mul.f64 fd526, fd460, 0dBFEE6F0E134454FF; +sub.f64 fd527, fd525, fd526; +mul.f64 fd528, fd460, 0d3FD3C6EF372FE950; +fma.rn.f64 fd529, fd442, 0dBFEE6F0E134454FF, fd528; +mul.f64 fd530, fd486, 0dBFE9E3779B97F4A8; +mul.f64 fd531, fd504, 0dBFE2CF2304755A5E; +sub.f64 fd532, fd530, fd531; +mul.f64 fd533, fd504, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd534, fd486, 0dBFE2CF2304755A5E, fd533; +mul.f64 fd535, fd434, 0dBFBAC2609B3C576C; +mul.f64 fd536, fd452, 0dBFEFD31F94F867C6; +sub.f64 fd537, fd535, fd536; +mul.f64 fd538, fd452, 0dBFBAC2609B3C576C; +fma.rn.f64 fd539, fd434, 0dBFEFD31F94F867C6, fd538; +mul.f64 fd540, fd478, 0dBFEF4CFC327A0080; +mul.f64 fd541, fd496, 0d3FCA9CD9AC4258F6; +sub.f64 fd542, fd540, fd541; +mul.f64 fd543, fd496, 0dBFEF4CFC327A0080; +fma.rn.f64 fd544, fd478, 0d3FCA9CD9AC4258F6, fd543; +add.f64 fd545, fd420, fd464; +add.f64 fd546, fd376, fd545; +add.f64 fd547, fd424, fd468; +add.f64 fd548, fd380, fd547; +mul.f64 fd549, fd545, 0d3FE0000000000000; +sub.f64 fd550, fd376, fd549; +sub.f64 fd551, fd424, fd468; +mul.f64 
fd552, fd551, 0d3FEBB67AE8584CAA; +add.f64 fd553, fd552, fd550; +sub.f64 fd554, fd550, fd552; +mul.f64 fd555, fd547, 0d3FE0000000000000; +sub.f64 fd556, fd380, fd555; +sub.f64 fd557, fd420, fd464; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +sub.f64 fd559, fd556, fd558; +add.f64 fd560, fd558, fd556; +add.f64 fd561, fd507, fd512; +add.f64 fd562, fd389, fd561; +add.f64 fd563, fd509, fd514; +add.f64 fd564, fd407, fd563; +mul.f64 fd565, fd561, 0d3FE0000000000000; +sub.f64 fd566, fd389, fd565; +sub.f64 fd567, fd509, fd514; +mul.f64 fd568, fd567, 0d3FEBB67AE8584CAA; +add.f64 fd569, fd568, fd566; +sub.f64 fd570, fd566, fd568; +mul.f64 fd571, fd563, 0d3FE0000000000000; +sub.f64 fd572, fd407, fd571; +sub.f64 fd573, fd507, fd512; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +sub.f64 fd575, fd572, fd574; +add.f64 fd576, fd574, fd572; +add.f64 fd577, fd517, fd522; +add.f64 fd578, fd397, fd577; +add.f64 fd579, fd519, fd524; +add.f64 fd580, fd415, fd579; +mul.f64 fd581, fd577, 0d3FE0000000000000; +sub.f64 fd582, fd397, fd581; +sub.f64 fd583, fd519, fd524; +mul.f64 fd584, fd583, 0d3FEBB67AE8584CAA; +add.f64 fd585, fd584, fd582; +sub.f64 fd586, fd582, fd584; +mul.f64 fd587, fd579, 0d3FE0000000000000; +sub.f64 fd588, fd415, fd587; +sub.f64 fd589, fd517, fd522; +mul.f64 fd590, fd589, 0d3FEBB67AE8584CAA; +sub.f64 fd591, fd588, fd590; +add.f64 fd592, fd590, fd588; +add.f64 fd593, fd527, fd532; +add.f64 fd594, fd398, fd593; +add.f64 fd595, fd529, fd534; +add.f64 fd596, fd416, fd595; +mul.f64 fd597, fd593, 0d3FE0000000000000; +sub.f64 fd598, fd398, fd597; +sub.f64 fd599, fd529, fd534; +mul.f64 fd600, fd599, 0d3FEBB67AE8584CAA; +add.f64 fd601, fd600, fd598; +sub.f64 fd602, fd598, fd600; +mul.f64 fd603, fd595, 0d3FE0000000000000; +sub.f64 fd604, fd416, fd603; +sub.f64 fd605, fd527, fd532; +mul.f64 fd606, fd605, 0d3FEBB67AE8584CAA; +sub.f64 fd607, fd604, fd606; +add.f64 fd608, fd606, fd604; +add.f64 fd609, fd537, fd542; +add.f64 fd610, fd390, fd609; +add.f64 fd611, fd539, fd544; +add.f64 
fd612, fd408, fd611; +mul.f64 fd613, fd609, 0d3FE0000000000000; +sub.f64 fd614, fd390, fd613; +sub.f64 fd615, fd539, fd544; +mul.f64 fd616, fd615, 0d3FEBB67AE8584CAA; +add.f64 fd617, fd616, fd614; +sub.f64 fd618, fd614, fd616; +mul.f64 fd619, fd611, 0d3FE0000000000000; +sub.f64 fd620, fd408, fd619; +sub.f64 fd621, fd537, fd542; +mul.f64 fd622, fd621, 0d3FEBB67AE8584CAA; +sub.f64 fd623, fd620, fd622; +add.f64 fd624, fd622, fd620; +mul.f64 fd625, fd562, 0d3FEF4CFC327A0080; +mul.f64 fd626, fd564, 0dBFCA9CD9AC4258F6; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd564, 0d3FEF4CFC327A0080; +fma.rn.f64 fd629, fd562, 0dBFCA9CD9AC4258F6, fd628; +mul.f64 fd630, fd578, 0d3FED3BC3AEFF7F95; +mul.f64 fd631, fd580, 0dBFDA07F921061AD1; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd580, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd634, fd578, 0dBFDA07F921061AD1, fd633; +mul.f64 fd635, fd594, 0d3FE9E3779B97F4A8; +mul.f64 fd636, fd596, 0dBFE2CF2304755A5E; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd596, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd639, fd594, 0dBFE2CF2304755A5E, fd638; +mul.f64 fd640, fd610, 0d3FE5698496E20BD8; +mul.f64 fd641, fd612, 0dBFE7C7D7A833BEC2; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd612, 0d3FE5698496E20BD8; +fma.rn.f64 fd644, fd610, 0dBFE7C7D7A833BEC2, fd643; +mul.f64 fd645, fd553, 0d3FE0000000000000; +mul.f64 fd646, fd559, 0dBFEBB67AE8584CAA; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd559, 0d3FE0000000000000; +fma.rn.f64 fd649, fd553, 0dBFEBB67AE8584CAA, fd648; +mul.f64 fd650, fd569, 0d3FD3C6EF372FE950; +mul.f64 fd651, fd575, 0dBFEE6F0E134454FF; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd575, 0d3FD3C6EF372FE950; +fma.rn.f64 fd654, fd569, 0dBFEE6F0E134454FF, fd653; +mul.f64 fd655, fd585, 0d3FBAC2609B3C576C; +mul.f64 fd656, fd591, 0dBFEFD31F94F867C6; +sub.f64 fd657, fd655, fd656; +mul.f64 fd658, fd591, 0d3FBAC2609B3C576C; +fma.rn.f64 fd659, fd585, 0dBFEFD31F94F867C6, fd658; +mul.f64 fd660, fd601, 0dBFBAC2609B3C576C; +mul.f64 fd661, fd607, 
0dBFEFD31F94F867C6; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd607, 0dBFBAC2609B3C576C; +fma.rn.f64 fd664, fd601, 0dBFEFD31F94F867C6, fd663; +mul.f64 fd665, fd617, 0dBFD3C6EF372FE950; +mul.f64 fd666, fd623, 0dBFEE6F0E134454FF; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd623, 0dBFD3C6EF372FE950; +fma.rn.f64 fd669, fd617, 0dBFEE6F0E134454FF, fd668; +mul.f64 fd670, fd554, 0dBFE0000000000000; +mul.f64 fd671, fd560, 0dBFEBB67AE8584CAA; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd560, 0dBFE0000000000000; +fma.rn.f64 fd674, fd554, 0dBFEBB67AE8584CAA, fd673; +mul.f64 fd675, fd570, 0dBFE5698496E20BD8; +mul.f64 fd676, fd576, 0dBFE7C7D7A833BEC2; +sub.f64 fd677, fd675, fd676; +mul.f64 fd678, fd576, 0dBFE5698496E20BD8; +fma.rn.f64 fd679, fd570, 0dBFE7C7D7A833BEC2, fd678; +mul.f64 fd680, fd586, 0dBFE9E3779B97F4A8; +mul.f64 fd681, fd592, 0dBFE2CF2304755A5E; +sub.f64 fd682, fd680, fd681; +mul.f64 fd683, fd592, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd684, fd586, 0dBFE2CF2304755A5E, fd683; +mul.f64 fd685, fd602, 0dBFED3BC3AEFF7F95; +mul.f64 fd686, fd608, 0dBFDA07F921061AD1; +sub.f64 fd687, fd685, fd686; +mul.f64 fd688, fd608, 0dBFED3BC3AEFF7F95; +fma.rn.f64 fd689, fd602, 0dBFDA07F921061AD1, fd688; +mul.f64 fd690, fd618, 0dBFEF4CFC327A0080; +mul.f64 fd691, fd624, 0dBFCA9CD9AC4258F6; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd624, 0dBFEF4CFC327A0080; +fma.rn.f64 fd694, fd618, 0dBFCA9CD9AC4258F6, fd693; +add.f64 %1, fd296, fd548; +add.f64 %0, fd294, fd546; +add.f64 %3, fd312, fd629; +add.f64 %2, fd310, fd627; +add.f64 %5, fd328, fd634; +add.f64 %4, fd326, fd632; +add.f64 %7, fd344, fd639; +add.f64 %6, fd342, fd637; +add.f64 %9, fd360, fd644; +add.f64 %8, fd358, fd642; +add.f64 %11, fd307, fd649; +add.f64 %10, fd301, fd647; +add.f64 %13, fd323, fd654; +add.f64 %12, fd317, fd652; +add.f64 %15, fd339, fd659; +add.f64 %14, fd333, fd657; +add.f64 %17, fd355, fd664; +add.f64 %16, fd349, fd662; +add.f64 %19, fd371, fd669; +add.f64 %18, fd365, fd667; +add.f64 %21, fd308, fd674; 
+add.f64 %20, fd302, fd672; +add.f64 %23, fd324, fd679; +add.f64 %22, fd318, fd677; +add.f64 %25, fd340, fd684; +add.f64 %24, fd334, fd682; +add.f64 %27, fd356, fd689; +add.f64 %26, fd350, fd687; +add.f64 %29, fd372, fd694; +add.f64 %28, fd366, fd692; +sub.f64 %31, fd296, fd548; +sub.f64 %30, fd294, fd546; +sub.f64 %33, fd312, fd629; +sub.f64 %32, fd310, fd627; +sub.f64 %35, fd328, fd634; +sub.f64 %34, fd326, fd632; +sub.f64 %37, fd344, fd639; +sub.f64 %36, fd342, fd637; +sub.f64 %39, fd360, fd644; +sub.f64 %38, fd358, fd642; +sub.f64 %41, fd307, fd649; +sub.f64 %40, fd301, fd647; +sub.f64 %43, fd323, fd654; +sub.f64 %42, fd317, fd652; +sub.f64 %45, fd339, fd659; +sub.f64 %44, fd333, fd657; +sub.f64 %47, fd355, fd664; +sub.f64 %46, fd349, fd662; +sub.f64 %49, fd371, fd669; +sub.f64 %48, fd365, fd667; +sub.f64 %51, fd308, fd674; +sub.f64 %50, fd302, fd672; +sub.f64 %53, fd324, fd679; +sub.f64 %52, fd318, fd677; +sub.f64 %55, fd340, fd684; +sub.f64 %54, fd334, fd682; +sub.f64 %57, fd356, fd689; +sub.f64 %56, fd350, fd687; +sub.f64 %59, fd372, fd694; +sub.f64 %58, fd366, fd692; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), 
"=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), "=d"(rmem[28].y), "=d"(rmem[29].x), "=d"(rmem[29].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[28].y), "d"(rmem[29].x), "d"(rmem[29].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f83890c18e322 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_30_fp64_inv.hpp.inc @@ -0,0 +1,638 @@ +#ifndef 
CUFFTDX_FFT_30_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_30_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<589, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<743>; +.reg .b64 rd<2>; +add.f64 fd121, %76, %124; +add.f64 fd122, %60, fd121; +add.f64 fd123, %92, %108; +add.f64 fd124, fd123, fd122; +add.f64 fd125, %77, %125; +add.f64 fd126, %61, fd125; +add.f64 fd127, %93, %109; +add.f64 fd128, fd127, fd126; +fma.rn.f64 fd129, fd121, 0d3FD3C6EF372FE950, %60; +mul.f64 fd130, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, %77, %125; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, %93, %109; +fma.rn.f64 fd135, fd134, 0d3FE2CF2304755A5E, fd133; +sub.f64 fd136, fd131, fd135; +add.f64 fd137, fd135, fd131; +mul.f64 fd138, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd139, %60, fd138; +fma.rn.f64 fd140, fd123, 0d3FD3C6EF372FE950, fd139; +mul.f64 fd141, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd142, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd143, fd141, fd142; +sub.f64 fd144, fd140, fd143; +add.f64 fd145, fd143, fd140; +fma.rn.f64 fd146, fd125, 0d3FD3C6EF372FE950, %61; +mul.f64 fd147, fd127, 0d3FE9E3779B97F4A8; +sub.f64 fd148, fd146, fd147; +sub.f64 fd149, %76, %124; +mul.f64 fd150, fd149, 0d3FEE6F0E134454FF; +sub.f64 fd151, %92, %108; +fma.rn.f64 fd152, fd151, 0d3FE2CF2304755A5E, fd150; +add.f64 fd153, fd152, fd148; +sub.f64 fd154, fd148, fd152; +mul.f64 fd155, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd156, %61, fd155; +fma.rn.f64 fd157, fd127, 0d3FD3C6EF372FE950, fd156; +mul.f64 fd158, fd149, 0d3FE2CF2304755A5E; +mul.f64 fd159, fd151, 0d3FEE6F0E134454FF; +sub.f64 fd160, fd158, fd159; +add.f64 fd161, fd160, fd157; +sub.f64 fd162, fd157, fd160; +add.f64 fd163, %81, %129; +add.f64 fd164, %65, fd163; +add.f64 fd165, %97, %113; +add.f64 fd166, fd165, fd164; +add.f64 fd167, %83, %131; +add.f64 fd168, %67, fd167; +add.f64 fd169, %99, %115; +add.f64 fd170, fd169, fd168; +fma.rn.f64 
fd171, fd163, 0d3FD3C6EF372FE950, %65; +mul.f64 fd172, fd165, 0d3FE9E3779B97F4A8; +sub.f64 fd173, fd171, fd172; +sub.f64 fd174, %83, %131; +mul.f64 fd175, fd174, 0d3FEE6F0E134454FF; +sub.f64 fd176, %99, %115; +fma.rn.f64 fd177, fd176, 0d3FE2CF2304755A5E, fd175; +sub.f64 fd178, fd173, fd177; +add.f64 fd179, fd177, fd173; +mul.f64 fd180, fd163, 0d3FE9E3779B97F4A8; +sub.f64 fd181, %65, fd180; +fma.rn.f64 fd182, fd165, 0d3FD3C6EF372FE950, fd181; +mul.f64 fd183, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd184, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd185, fd183, fd184; +sub.f64 fd186, fd182, fd185; +add.f64 fd187, fd185, fd182; +fma.rn.f64 fd188, fd167, 0d3FD3C6EF372FE950, %67; +mul.f64 fd189, fd169, 0d3FE9E3779B97F4A8; +sub.f64 fd190, fd188, fd189; +sub.f64 fd191, %81, %129; +mul.f64 fd192, fd191, 0d3FEE6F0E134454FF; +sub.f64 fd193, %97, %113; +fma.rn.f64 fd194, fd193, 0d3FE2CF2304755A5E, fd192; +add.f64 fd195, fd194, fd190; +sub.f64 fd196, fd190, fd194; +mul.f64 fd197, fd167, 0d3FE9E3779B97F4A8; +sub.f64 fd198, %67, fd197; +fma.rn.f64 fd199, fd169, 0d3FD3C6EF372FE950, fd198; +mul.f64 fd200, fd191, 0d3FE2CF2304755A5E; +mul.f64 fd201, fd193, 0d3FEE6F0E134454FF; +sub.f64 fd202, fd200, fd201; +add.f64 fd203, fd202, fd199; +sub.f64 fd204, fd199, fd202; +add.f64 fd205, %86, %134; +add.f64 fd206, %70, fd205; +add.f64 fd207, %102, %118; +add.f64 fd208, fd207, fd206; +add.f64 fd209, %88, %136; +add.f64 fd210, %72, fd209; +add.f64 fd211, %104, %120; +add.f64 fd212, fd211, fd210; +fma.rn.f64 fd213, fd205, 0d3FD3C6EF372FE950, %70; +mul.f64 fd214, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd215, fd213, fd214; +sub.f64 fd216, %88, %136; +mul.f64 fd217, fd216, 0d3FEE6F0E134454FF; +sub.f64 fd218, %104, %120; +fma.rn.f64 fd219, fd218, 0d3FE2CF2304755A5E, fd217; +sub.f64 fd220, fd215, fd219; +add.f64 fd221, fd219, fd215; +mul.f64 fd222, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd223, %70, fd222; +fma.rn.f64 fd224, fd207, 0d3FD3C6EF372FE950, fd223; +mul.f64 fd225, fd216, 0d3FE2CF2304755A5E; +mul.f64 fd226, 
fd218, 0d3FEE6F0E134454FF; +sub.f64 fd227, fd225, fd226; +sub.f64 fd228, fd224, fd227; +add.f64 fd229, fd227, fd224; +fma.rn.f64 fd230, fd209, 0d3FD3C6EF372FE950, %72; +mul.f64 fd231, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd232, fd230, fd231; +sub.f64 fd233, %86, %134; +mul.f64 fd234, fd233, 0d3FEE6F0E134454FF; +sub.f64 fd235, %102, %118; +fma.rn.f64 fd236, fd235, 0d3FE2CF2304755A5E, fd234; +add.f64 fd237, fd236, fd232; +sub.f64 fd238, fd232, fd236; +mul.f64 fd239, fd209, 0d3FE9E3779B97F4A8; +sub.f64 fd240, %72, fd239; +fma.rn.f64 fd241, fd211, 0d3FD3C6EF372FE950, fd240; +mul.f64 fd242, fd233, 0d3FE2CF2304755A5E; +mul.f64 fd243, fd235, 0d3FEE6F0E134454FF; +sub.f64 fd244, fd242, fd243; +add.f64 fd245, fd244, fd241; +sub.f64 fd246, fd241, fd244; +mul.f64 fd247, fd178, 0d3FED3BC3AEFF7F95; +mul.f64 fd248, fd195, 0d3FDA07F921061AD1; +sub.f64 fd249, fd247, fd248; +mul.f64 fd250, fd195, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd251, fd178, 0d3FDA07F921061AD1, fd250; +mul.f64 fd252, fd220, 0d3FE5698496E20BD8; +mul.f64 fd253, fd237, 0d3FE7C7D7A833BEC2; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd237, 0d3FE5698496E20BD8; +fma.rn.f64 fd256, fd220, 0d3FE7C7D7A833BEC2, fd255; +mul.f64 fd257, fd186, 0d3FE5698496E20BD8; +mul.f64 fd258, fd203, 0d3FE7C7D7A833BEC2; +sub.f64 fd259, fd257, fd258; +mul.f64 fd260, fd203, 0d3FE5698496E20BD8; +fma.rn.f64 fd261, fd186, 0d3FE7C7D7A833BEC2, fd260; +mul.f64 fd262, fd228, 0dBFBAC2609B3C576C; +mul.f64 fd263, fd245, 0d3FEFD31F94F867C6; +sub.f64 fd264, fd262, fd263; +mul.f64 fd265, fd245, 0dBFBAC2609B3C576C; +fma.rn.f64 fd266, fd228, 0d3FEFD31F94F867C6, fd265; +mul.f64 fd267, fd187, 0d3FD3C6EF372FE950; +mul.f64 fd268, fd204, 0d3FEE6F0E134454FF; +sub.f64 fd269, fd267, fd268; +mul.f64 fd270, fd204, 0d3FD3C6EF372FE950; +fma.rn.f64 fd271, fd187, 0d3FEE6F0E134454FF, fd270; +mul.f64 fd272, fd229, 0dBFE9E3779B97F4A8; +mul.f64 fd273, fd246, 0d3FE2CF2304755A5E; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd246, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd276, fd229, 
0d3FE2CF2304755A5E, fd275; +mul.f64 fd277, fd179, 0dBFBAC2609B3C576C; +mul.f64 fd278, fd196, 0d3FEFD31F94F867C6; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd196, 0dBFBAC2609B3C576C; +fma.rn.f64 fd281, fd179, 0d3FEFD31F94F867C6, fd280; +mul.f64 fd282, fd221, 0dBFEF4CFC327A0080; +mul.f64 fd283, fd238, 0dBFCA9CD9AC4258F6; +sub.f64 fd284, fd282, fd283; +mul.f64 fd285, fd238, 0dBFEF4CFC327A0080; +fma.rn.f64 fd286, fd221, 0dBFCA9CD9AC4258F6, fd285; +add.f64 fd287, fd166, fd208; +add.f64 fd288, fd124, fd287; +add.f64 fd289, fd170, fd212; +add.f64 fd290, fd128, fd289; +mul.f64 fd291, fd287, 0d3FE0000000000000; +sub.f64 fd292, fd124, fd291; +sub.f64 fd293, fd170, fd212; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +add.f64 fd295, fd294, fd292; +sub.f64 fd296, fd292, fd294; +mul.f64 fd297, fd289, 0d3FE0000000000000; +sub.f64 fd298, fd128, fd297; +sub.f64 fd299, fd166, fd208; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +sub.f64 fd301, fd298, fd300; +add.f64 fd302, fd300, fd298; +add.f64 fd303, fd249, fd254; +add.f64 fd304, fd136, fd303; +add.f64 fd305, fd251, fd256; +add.f64 fd306, fd153, fd305; +mul.f64 fd307, fd303, 0d3FE0000000000000; +sub.f64 fd308, fd136, fd307; +sub.f64 fd309, fd251, fd256; +mul.f64 fd310, fd309, 0dBFEBB67AE8584CAA; +add.f64 fd311, fd310, fd308; +sub.f64 fd312, fd308, fd310; +mul.f64 fd313, fd305, 0d3FE0000000000000; +sub.f64 fd314, fd153, fd313; +sub.f64 fd315, fd249, fd254; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +sub.f64 fd317, fd314, fd316; +add.f64 fd318, fd316, fd314; +add.f64 fd319, fd259, fd264; +add.f64 fd320, fd144, fd319; +add.f64 fd321, fd261, fd266; +add.f64 fd322, fd161, fd321; +mul.f64 fd323, fd319, 0d3FE0000000000000; +sub.f64 fd324, fd144, fd323; +sub.f64 fd325, fd261, fd266; +mul.f64 fd326, fd325, 0dBFEBB67AE8584CAA; +add.f64 fd327, fd326, fd324; +sub.f64 fd328, fd324, fd326; +mul.f64 fd329, fd321, 0d3FE0000000000000; +sub.f64 fd330, fd161, fd329; +sub.f64 fd331, fd259, fd264; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +sub.f64 
fd333, fd330, fd332; +add.f64 fd334, fd332, fd330; +add.f64 fd335, fd269, fd274; +add.f64 fd336, fd145, fd335; +add.f64 fd337, fd271, fd276; +add.f64 fd338, fd162, fd337; +mul.f64 fd339, fd335, 0d3FE0000000000000; +sub.f64 fd340, fd145, fd339; +sub.f64 fd341, fd271, fd276; +mul.f64 fd342, fd341, 0dBFEBB67AE8584CAA; +add.f64 fd343, fd342, fd340; +sub.f64 fd344, fd340, fd342; +mul.f64 fd345, fd337, 0d3FE0000000000000; +sub.f64 fd346, fd162, fd345; +sub.f64 fd347, fd269, fd274; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +sub.f64 fd349, fd346, fd348; +add.f64 fd350, fd348, fd346; +add.f64 fd351, fd279, fd284; +add.f64 fd352, fd137, fd351; +add.f64 fd353, fd281, fd286; +add.f64 fd354, fd154, fd353; +mul.f64 fd355, fd351, 0d3FE0000000000000; +sub.f64 fd356, fd137, fd355; +sub.f64 fd357, fd281, fd286; +mul.f64 fd358, fd357, 0dBFEBB67AE8584CAA; +add.f64 fd359, fd358, fd356; +sub.f64 fd360, fd356, fd358; +mul.f64 fd361, fd353, 0d3FE0000000000000; +sub.f64 fd362, fd154, fd361; +sub.f64 fd363, fd279, fd284; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +sub.f64 fd365, fd362, fd364; +add.f64 fd366, fd364, fd362; +add.f64 fd367, %78, %126; +add.f64 fd368, %62, fd367; +add.f64 fd369, %94, %110; +add.f64 fd370, fd369, fd368; +add.f64 fd371, %80, %128; +add.f64 fd372, %64, fd371; +add.f64 fd373, %96, %112; +add.f64 fd374, fd373, fd372; +fma.rn.f64 fd375, fd367, 0d3FD3C6EF372FE950, %62; +mul.f64 fd376, fd369, 0d3FE9E3779B97F4A8; +sub.f64 fd377, fd375, fd376; +sub.f64 fd378, %80, %128; +mul.f64 fd379, fd378, 0d3FEE6F0E134454FF; +sub.f64 fd380, %96, %112; +fma.rn.f64 fd381, fd380, 0d3FE2CF2304755A5E, fd379; +sub.f64 fd382, fd377, fd381; +add.f64 fd383, fd381, fd377; +mul.f64 fd384, fd367, 0d3FE9E3779B97F4A8; +sub.f64 fd385, %62, fd384; +fma.rn.f64 fd386, fd369, 0d3FD3C6EF372FE950, fd385; +mul.f64 fd387, fd378, 0d3FE2CF2304755A5E; +mul.f64 fd388, fd380, 0d3FEE6F0E134454FF; +sub.f64 fd389, fd387, fd388; +sub.f64 fd390, fd386, fd389; +add.f64 fd391, fd389, fd386; +fma.rn.f64 fd392, fd371, 
0d3FD3C6EF372FE950, %64; +mul.f64 fd393, fd373, 0d3FE9E3779B97F4A8; +sub.f64 fd394, fd392, fd393; +sub.f64 fd395, %78, %126; +mul.f64 fd396, fd395, 0d3FEE6F0E134454FF; +sub.f64 fd397, %94, %110; +fma.rn.f64 fd398, fd397, 0d3FE2CF2304755A5E, fd396; +add.f64 fd399, fd398, fd394; +sub.f64 fd400, fd394, fd398; +mul.f64 fd401, fd371, 0d3FE9E3779B97F4A8; +sub.f64 fd402, %64, fd401; +fma.rn.f64 fd403, fd373, 0d3FD3C6EF372FE950, fd402; +mul.f64 fd404, fd395, 0d3FE2CF2304755A5E; +mul.f64 fd405, fd397, 0d3FEE6F0E134454FF; +sub.f64 fd406, fd404, fd405; +add.f64 fd407, fd406, fd403; +sub.f64 fd408, fd403, fd406; +add.f64 fd409, %84, %132; +add.f64 fd410, %68, fd409; +add.f64 fd411, %100, %116; +add.f64 fd412, fd411, fd410; +add.f64 fd413, %85, %133; +add.f64 fd414, %69, fd413; +add.f64 fd415, %101, %117; +add.f64 fd416, fd415, fd414; +fma.rn.f64 fd417, fd409, 0d3FD3C6EF372FE950, %68; +mul.f64 fd418, fd411, 0d3FE9E3779B97F4A8; +sub.f64 fd419, fd417, fd418; +sub.f64 fd420, %85, %133; +mul.f64 fd421, fd420, 0d3FEE6F0E134454FF; +sub.f64 fd422, %101, %117; +fma.rn.f64 fd423, fd422, 0d3FE2CF2304755A5E, fd421; +sub.f64 fd424, fd419, fd423; +add.f64 fd425, fd423, fd419; +mul.f64 fd426, fd409, 0d3FE9E3779B97F4A8; +sub.f64 fd427, %68, fd426; +fma.rn.f64 fd428, fd411, 0d3FD3C6EF372FE950, fd427; +mul.f64 fd429, fd420, 0d3FE2CF2304755A5E; +mul.f64 fd430, fd422, 0d3FEE6F0E134454FF; +sub.f64 fd431, fd429, fd430; +sub.f64 fd432, fd428, fd431; +add.f64 fd433, fd431, fd428; +fma.rn.f64 fd434, fd413, 0d3FD3C6EF372FE950, %69; +mul.f64 fd435, fd415, 0d3FE9E3779B97F4A8; +sub.f64 fd436, fd434, fd435; +sub.f64 fd437, %84, %132; +mul.f64 fd438, fd437, 0d3FEE6F0E134454FF; +sub.f64 fd439, %100, %116; +fma.rn.f64 fd440, fd439, 0d3FE2CF2304755A5E, fd438; +add.f64 fd441, fd440, fd436; +sub.f64 fd442, fd436, fd440; +mul.f64 fd443, fd413, 0d3FE9E3779B97F4A8; +sub.f64 fd444, %69, fd443; +fma.rn.f64 fd445, fd415, 0d3FD3C6EF372FE950, fd444; +mul.f64 fd446, fd437, 0d3FE2CF2304755A5E; +mul.f64 fd447, fd439, 
0d3FEE6F0E134454FF; +sub.f64 fd448, fd446, fd447; +add.f64 fd449, fd448, fd445; +sub.f64 fd450, fd445, fd448; +add.f64 fd451, %89, %137; +add.f64 fd452, %73, fd451; +add.f64 fd453, %105, %121; +add.f64 fd454, fd453, fd452; +add.f64 fd455, %91, %138; +add.f64 fd456, %75, fd455; +add.f64 fd457, %107, %123; +add.f64 fd458, fd457, fd456; +fma.rn.f64 fd459, fd451, 0d3FD3C6EF372FE950, %73; +mul.f64 fd460, fd453, 0d3FE9E3779B97F4A8; +sub.f64 fd461, fd459, fd460; +sub.f64 fd462, %91, %138; +mul.f64 fd463, fd462, 0d3FEE6F0E134454FF; +sub.f64 fd464, %107, %123; +fma.rn.f64 fd465, fd464, 0d3FE2CF2304755A5E, fd463; +sub.f64 fd466, fd461, fd465; +add.f64 fd467, fd465, fd461; +mul.f64 fd468, fd451, 0d3FE9E3779B97F4A8; +sub.f64 fd469, %73, fd468; +fma.rn.f64 fd470, fd453, 0d3FD3C6EF372FE950, fd469; +mul.f64 fd471, fd462, 0d3FE2CF2304755A5E; +mul.f64 fd472, fd464, 0d3FEE6F0E134454FF; +sub.f64 fd473, fd471, fd472; +sub.f64 fd474, fd470, fd473; +add.f64 fd475, fd473, fd470; +fma.rn.f64 fd476, fd455, 0d3FD3C6EF372FE950, %75; +mul.f64 fd477, fd457, 0d3FE9E3779B97F4A8; +sub.f64 fd478, fd476, fd477; +sub.f64 fd479, %89, %137; +mul.f64 fd480, fd479, 0d3FEE6F0E134454FF; +sub.f64 fd481, %105, %121; +fma.rn.f64 fd482, fd481, 0d3FE2CF2304755A5E, fd480; +add.f64 fd483, fd482, fd478; +sub.f64 fd484, fd478, fd482; +mul.f64 fd485, fd455, 0d3FE9E3779B97F4A8; +sub.f64 fd486, %75, fd485; +fma.rn.f64 fd487, fd457, 0d3FD3C6EF372FE950, fd486; +mul.f64 fd488, fd479, 0d3FE2CF2304755A5E; +mul.f64 fd489, fd481, 0d3FEE6F0E134454FF; +sub.f64 fd490, fd488, fd489; +add.f64 fd491, fd490, fd487; +sub.f64 fd492, fd487, fd490; +mul.f64 fd493, fd424, 0d3FED3BC3AEFF7F95; +mul.f64 fd494, fd441, 0d3FDA07F921061AD1; +sub.f64 fd495, fd493, fd494; +mul.f64 fd496, fd441, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd497, fd424, 0d3FDA07F921061AD1, fd496; +mul.f64 fd498, fd466, 0d3FE5698496E20BD8; +mul.f64 fd499, fd483, 0d3FE7C7D7A833BEC2; +sub.f64 fd500, fd498, fd499; +mul.f64 fd501, fd483, 0d3FE5698496E20BD8; +fma.rn.f64 fd502, 
fd466, 0d3FE7C7D7A833BEC2, fd501; +mul.f64 fd503, fd432, 0d3FE5698496E20BD8; +mul.f64 fd504, fd449, 0d3FE7C7D7A833BEC2; +sub.f64 fd505, fd503, fd504; +mul.f64 fd506, fd449, 0d3FE5698496E20BD8; +fma.rn.f64 fd507, fd432, 0d3FE7C7D7A833BEC2, fd506; +mul.f64 fd508, fd474, 0dBFBAC2609B3C576C; +mul.f64 fd509, fd491, 0d3FEFD31F94F867C6; +sub.f64 fd510, fd508, fd509; +mul.f64 fd511, fd491, 0dBFBAC2609B3C576C; +fma.rn.f64 fd512, fd474, 0d3FEFD31F94F867C6, fd511; +mul.f64 fd513, fd433, 0d3FD3C6EF372FE950; +mul.f64 fd514, fd450, 0d3FEE6F0E134454FF; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd450, 0d3FD3C6EF372FE950; +fma.rn.f64 fd517, fd433, 0d3FEE6F0E134454FF, fd516; +mul.f64 fd518, fd475, 0dBFE9E3779B97F4A8; +mul.f64 fd519, fd492, 0d3FE2CF2304755A5E; +sub.f64 fd520, fd518, fd519; +mul.f64 fd521, fd492, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd522, fd475, 0d3FE2CF2304755A5E, fd521; +mul.f64 fd523, fd425, 0dBFBAC2609B3C576C; +mul.f64 fd524, fd442, 0d3FEFD31F94F867C6; +sub.f64 fd525, fd523, fd524; +mul.f64 fd526, fd442, 0dBFBAC2609B3C576C; +fma.rn.f64 fd527, fd425, 0d3FEFD31F94F867C6, fd526; +mul.f64 fd528, fd467, 0dBFEF4CFC327A0080; +mul.f64 fd529, fd484, 0dBFCA9CD9AC4258F6; +sub.f64 fd530, fd528, fd529; +mul.f64 fd531, fd484, 0dBFEF4CFC327A0080; +fma.rn.f64 fd532, fd467, 0dBFCA9CD9AC4258F6, fd531; +add.f64 fd533, fd412, fd454; +add.f64 fd534, fd370, fd533; +add.f64 fd535, fd416, fd458; +add.f64 fd536, fd374, fd535; +mul.f64 fd537, fd533, 0d3FE0000000000000; +sub.f64 fd538, fd370, fd537; +sub.f64 fd539, fd416, fd458; +mul.f64 fd540, fd539, 0dBFEBB67AE8584CAA; +add.f64 fd541, fd540, fd538; +sub.f64 fd542, fd538, fd540; +mul.f64 fd543, fd535, 0d3FE0000000000000; +sub.f64 fd544, fd374, fd543; +sub.f64 fd545, fd412, fd454; +mul.f64 fd546, fd545, 0dBFEBB67AE8584CAA; +sub.f64 fd547, fd544, fd546; +add.f64 fd548, fd546, fd544; +add.f64 fd549, fd495, fd500; +add.f64 fd550, fd382, fd549; +add.f64 fd551, fd497, fd502; +add.f64 fd552, fd399, fd551; +mul.f64 fd553, fd549, 
0d3FE0000000000000; +sub.f64 fd554, fd382, fd553; +sub.f64 fd555, fd497, fd502; +mul.f64 fd556, fd555, 0dBFEBB67AE8584CAA; +add.f64 fd557, fd556, fd554; +sub.f64 fd558, fd554, fd556; +mul.f64 fd559, fd551, 0d3FE0000000000000; +sub.f64 fd560, fd399, fd559; +sub.f64 fd561, fd495, fd500; +mul.f64 fd562, fd561, 0dBFEBB67AE8584CAA; +sub.f64 fd563, fd560, fd562; +add.f64 fd564, fd562, fd560; +add.f64 fd565, fd505, fd510; +add.f64 fd566, fd390, fd565; +add.f64 fd567, fd507, fd512; +add.f64 fd568, fd407, fd567; +mul.f64 fd569, fd565, 0d3FE0000000000000; +sub.f64 fd570, fd390, fd569; +sub.f64 fd571, fd507, fd512; +mul.f64 fd572, fd571, 0dBFEBB67AE8584CAA; +add.f64 fd573, fd572, fd570; +sub.f64 fd574, fd570, fd572; +mul.f64 fd575, fd567, 0d3FE0000000000000; +sub.f64 fd576, fd407, fd575; +sub.f64 fd577, fd505, fd510; +mul.f64 fd578, fd577, 0dBFEBB67AE8584CAA; +sub.f64 fd579, fd576, fd578; +add.f64 fd580, fd578, fd576; +add.f64 fd581, fd515, fd520; +add.f64 fd582, fd391, fd581; +add.f64 fd583, fd517, fd522; +add.f64 fd584, fd408, fd583; +mul.f64 fd585, fd581, 0d3FE0000000000000; +sub.f64 fd586, fd391, fd585; +sub.f64 fd587, fd517, fd522; +mul.f64 fd588, fd587, 0dBFEBB67AE8584CAA; +add.f64 fd589, fd588, fd586; +sub.f64 fd590, fd586, fd588; +mul.f64 fd591, fd583, 0d3FE0000000000000; +sub.f64 fd592, fd408, fd591; +sub.f64 fd593, fd515, fd520; +mul.f64 fd594, fd593, 0dBFEBB67AE8584CAA; +sub.f64 fd595, fd592, fd594; +add.f64 fd596, fd594, fd592; +add.f64 fd597, fd525, fd530; +add.f64 fd598, fd383, fd597; +add.f64 fd599, fd527, fd532; +add.f64 fd600, fd400, fd599; +mul.f64 fd601, fd597, 0d3FE0000000000000; +sub.f64 fd602, fd383, fd601; +sub.f64 fd603, fd527, fd532; +mul.f64 fd604, fd603, 0dBFEBB67AE8584CAA; +add.f64 fd605, fd604, fd602; +sub.f64 fd606, fd602, fd604; +mul.f64 fd607, fd599, 0d3FE0000000000000; +sub.f64 fd608, fd400, fd607; +sub.f64 fd609, fd525, fd530; +mul.f64 fd610, fd609, 0dBFEBB67AE8584CAA; +sub.f64 fd611, fd608, fd610; +add.f64 fd612, fd610, fd608; +mul.f64 
fd613, fd550, 0d3FEF4CFC327A0080; +mul.f64 fd614, fd552, 0d3FCA9CD9AC4258F6; +sub.f64 fd615, fd613, fd614; +mul.f64 fd616, fd552, 0d3FEF4CFC327A0080; +fma.rn.f64 fd617, fd550, 0d3FCA9CD9AC4258F6, fd616; +mul.f64 fd618, fd566, 0d3FED3BC3AEFF7F95; +mul.f64 fd619, fd568, 0d3FDA07F921061AD1; +sub.f64 fd620, fd618, fd619; +mul.f64 fd621, fd568, 0d3FED3BC3AEFF7F95; +fma.rn.f64 fd622, fd566, 0d3FDA07F921061AD1, fd621; +mul.f64 fd623, fd582, 0d3FE9E3779B97F4A8; +mul.f64 fd624, fd584, 0d3FE2CF2304755A5E; +sub.f64 fd625, fd623, fd624; +mul.f64 fd626, fd584, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd627, fd582, 0d3FE2CF2304755A5E, fd626; +mul.f64 fd628, fd598, 0d3FE5698496E20BD8; +mul.f64 fd629, fd600, 0d3FE7C7D7A833BEC2; +sub.f64 fd630, fd628, fd629; +mul.f64 fd631, fd600, 0d3FE5698496E20BD8; +fma.rn.f64 fd632, fd598, 0d3FE7C7D7A833BEC2, fd631; +mul.f64 fd633, fd541, 0d3FE0000000000000; +mul.f64 fd634, fd547, 0d3FEBB67AE8584CAA; +sub.f64 fd635, fd633, fd634; +mul.f64 fd636, fd547, 0d3FE0000000000000; +fma.rn.f64 fd637, fd541, 0d3FEBB67AE8584CAA, fd636; +mul.f64 fd638, fd557, 0d3FD3C6EF372FE950; +mul.f64 fd639, fd563, 0d3FEE6F0E134454FF; +sub.f64 fd640, fd638, fd639; +mul.f64 fd641, fd563, 0d3FD3C6EF372FE950; +fma.rn.f64 fd642, fd557, 0d3FEE6F0E134454FF, fd641; +mul.f64 fd643, fd573, 0d3FBAC2609B3C576C; +mul.f64 fd644, fd579, 0d3FEFD31F94F867C6; +sub.f64 fd645, fd643, fd644; +mul.f64 fd646, fd579, 0d3FBAC2609B3C576C; +fma.rn.f64 fd647, fd573, 0d3FEFD31F94F867C6, fd646; +mul.f64 fd648, fd589, 0dBFBAC2609B3C576C; +mul.f64 fd649, fd595, 0d3FEFD31F94F867C6; +sub.f64 fd650, fd648, fd649; +mul.f64 fd651, fd595, 0dBFBAC2609B3C576C; +fma.rn.f64 fd652, fd589, 0d3FEFD31F94F867C6, fd651; +mul.f64 fd653, fd605, 0dBFD3C6EF372FE950; +mul.f64 fd654, fd611, 0d3FEE6F0E134454FF; +sub.f64 fd655, fd653, fd654; +mul.f64 fd656, fd611, 0dBFD3C6EF372FE950; +fma.rn.f64 fd657, fd605, 0d3FEE6F0E134454FF, fd656; +mul.f64 fd658, fd542, 0dBFE0000000000000; +mul.f64 fd659, fd548, 0d3FEBB67AE8584CAA; +sub.f64 
fd660, fd658, fd659; +mul.f64 fd661, fd548, 0dBFE0000000000000; +fma.rn.f64 fd662, fd542, 0d3FEBB67AE8584CAA, fd661; +mul.f64 fd663, fd558, 0dBFE5698496E20BD8; +mul.f64 fd664, fd564, 0d3FE7C7D7A833BEC2; +sub.f64 fd665, fd663, fd664; +mul.f64 fd666, fd564, 0dBFE5698496E20BD8; +fma.rn.f64 fd667, fd558, 0d3FE7C7D7A833BEC2, fd666; +mul.f64 fd668, fd574, 0dBFE9E3779B97F4A8; +mul.f64 fd669, fd580, 0d3FE2CF2304755A5E; +sub.f64 fd670, fd668, fd669; +mul.f64 fd671, fd580, 0dBFE9E3779B97F4A8; +fma.rn.f64 fd672, fd574, 0d3FE2CF2304755A5E, fd671; +mul.f64 fd673, fd590, 0dBFED3BC3AEFF7F95; +mul.f64 fd674, fd596, 0d3FDA07F921061AD1; +sub.f64 fd675, fd673, fd674; +mul.f64 fd676, fd596, 0dBFED3BC3AEFF7F95; +fma.rn.f64 fd677, fd590, 0d3FDA07F921061AD1, fd676; +mul.f64 fd678, fd606, 0dBFEF4CFC327A0080; +mul.f64 fd679, fd612, 0d3FCA9CD9AC4258F6; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd612, 0dBFEF4CFC327A0080; +fma.rn.f64 fd682, fd606, 0d3FCA9CD9AC4258F6, fd681; +add.f64 %1, fd290, fd536; +add.f64 %0, fd288, fd534; +add.f64 %3, fd306, fd617; +add.f64 %2, fd304, fd615; +add.f64 %5, fd322, fd622; +add.f64 %4, fd320, fd620; +add.f64 %7, fd338, fd627; +add.f64 %6, fd336, fd625; +add.f64 %9, fd354, fd632; +add.f64 %8, fd352, fd630; +add.f64 %11, fd301, fd637; +add.f64 %10, fd295, fd635; +add.f64 %13, fd317, fd642; +add.f64 %12, fd311, fd640; +add.f64 %15, fd333, fd647; +add.f64 %14, fd327, fd645; +add.f64 %17, fd349, fd652; +add.f64 %16, fd343, fd650; +add.f64 %19, fd365, fd657; +add.f64 %18, fd359, fd655; +add.f64 %21, fd302, fd662; +add.f64 %20, fd296, fd660; +add.f64 %23, fd318, fd667; +add.f64 %22, fd312, fd665; +add.f64 %25, fd334, fd672; +add.f64 %24, fd328, fd670; +add.f64 %27, fd350, fd677; +add.f64 %26, fd344, fd675; +add.f64 %29, fd366, fd682; +add.f64 %28, fd360, fd680; +sub.f64 %31, fd290, fd536; +sub.f64 %30, fd288, fd534; +sub.f64 %33, fd306, fd617; +sub.f64 %32, fd304, fd615; +sub.f64 %35, fd322, fd622; +sub.f64 %34, fd320, fd620; +sub.f64 %37, fd338, fd627; +sub.f64 
%36, fd336, fd625; +sub.f64 %39, fd354, fd632; +sub.f64 %38, fd352, fd630; +sub.f64 %41, fd301, fd637; +sub.f64 %40, fd295, fd635; +sub.f64 %43, fd317, fd642; +sub.f64 %42, fd311, fd640; +sub.f64 %45, fd333, fd647; +sub.f64 %44, fd327, fd645; +sub.f64 %47, fd349, fd652; +sub.f64 %46, fd343, fd650; +sub.f64 %49, fd365, fd657; +sub.f64 %48, fd359, fd655; +sub.f64 %51, fd302, fd662; +sub.f64 %50, fd296, fd660; +sub.f64 %53, fd318, fd667; +sub.f64 %52, fd312, fd665; +sub.f64 %55, fd334, fd672; +sub.f64 %54, fd328, fd670; +sub.f64 %57, fd350, fd677; +sub.f64 %56, fd344, fd675; +sub.f64 %59, fd366, fd682; +sub.f64 %58, fd360, fd680; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), "=d"(rmem[28].y), "=d"(rmem[29].x), "=d"(rmem[29].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), 
"d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[28].y), "d"(rmem[29].x), "d"(rmem[29].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..64b5ffd86d481 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_fwd.hpp.inc @@ -0,0 +1,30058 @@ +#ifndef CUFFTDX_FFT_3125_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_3125_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<913, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<603>; +.reg .b32 r<10458>; +.reg .b64 rd<6>; +mov.u32 r10392, %50; +mov.u32 r10457, %tid.y; +mad.lo.s32 r10393, r10457, 25000, r10392; +mov.u32 r10394, %tid.x; +mov.f32 f594, 0f3E9E377A; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1, {low, high}; +} +mov.f32 f596, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2, {low, high}; +} +mov.f32 f590, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3, {low, high}; +} +mov.f32 f592, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %59, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %59, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %59, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %59, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, 
%58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %59, %94; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %59, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %59, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %59, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ 
+sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %59, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 
r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, 
r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ +sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; 
+} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ +add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} +{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ 
+add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} +{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ +sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ +add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} 
+{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ +add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} +{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ +add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ 
+mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; +} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ +add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ +sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %60, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, 
r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %60, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} +{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %60, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %60, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %60, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, %93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, 
%86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, %77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ +sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1601, {low, high}; +} +mov.f32 f332, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1602, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1603, {low, high}; +} +mov.f32 f336, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1604, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1605, {low, high}; +} +mov.f32 f340, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1606, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1607, {low, high}; +} +mov.f32 f344, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1608, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1611, {low, high}; +} +mov.f32 f352, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1612, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1615, {low, high}; +} +mov.f32 f360, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1616, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1617, {low, high}; +} +mov.f32 f364, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1618, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1623, {low, high}; +} +mov.f32 f376, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1631, {low, 
high}; +} +mov.f32 f392, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; +} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; 
+} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ +mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ +fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1910, {low, high}; +} +{ 
+neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ +sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ +sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ +mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} 
+{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; +} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; +} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 
r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ +add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; +} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, 
r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} +{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ 
+mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ 
+sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ +sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ +add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, 
r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ +sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; 
+} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} +{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; +} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ +add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ +mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ +sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, 
r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3185, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; +} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; +} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; +} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 
r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ 
+add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ +sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r10394, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r10395, rd3; +mul.lo.s32 r10396, r10395, 125; +sub.s32 r10397, r10394, r10396; +cvt.rn.f32.u32 f597, r10397; +mul.f32 f598, f597, 0f3B03C498; +cos.approx.f32 f217, f598; +sin.approx.f32 f599, f598; +neg.f32 f218, f599; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f533, 0fBF800000; +mov.f32 f534, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; 
+mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; +} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ +fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3717, {low, high}; +} +{ 
+mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ +fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; 
+mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, 
r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3902, {low, high}; +} +{ +mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4050, {low, high}; +} +{ +mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ 
+fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ 
+neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ +fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ +neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +mad.lo.s32 r10398, r10395, 25000, r10393; +barrier.sync 0; +mad.lo.s32 r10399, r10397, 200, r10398; +st.shared.v2.f32 [r10399], {r1922, r1934}; +st.shared.v2.f32 [r10399+8], {r3517, r3524}; +st.shared.v2.f32 [r10399+16], {r3554, r3561}; +st.shared.v2.f32 [r10399+24], {r3591, r3598}; +st.shared.v2.f32 [r10399+32], {r3628, r3635}; +st.shared.v2.f32 [r10399+40], {r3665, r3672}; +st.shared.v2.f32 [r10399+48], {r3702, r3709}; +st.shared.v2.f32 [r10399+56], {r3739, r3746}; +st.shared.v2.f32 [r10399+64], {r3776, r3783}; +st.shared.v2.f32 [r10399+72], {r3813, r3820}; +st.shared.v2.f32 [r10399+80], {r3850, r3857}; 
+st.shared.v2.f32 [r10399+88], {r3887, r3894}; +st.shared.v2.f32 [r10399+96], {r3924, r3931}; +st.shared.v2.f32 [r10399+104], {r3961, r3968}; +st.shared.v2.f32 [r10399+112], {r3998, r4005}; +st.shared.v2.f32 [r10399+120], {r4035, r4042}; +st.shared.v2.f32 [r10399+128], {r4072, r4079}; +st.shared.v2.f32 [r10399+136], {r4109, r4116}; +st.shared.v2.f32 [r10399+144], {r4146, r4153}; +st.shared.v2.f32 [r10399+152], {r4183, r4190}; +st.shared.v2.f32 [r10399+160], {r4220, r4227}; +st.shared.v2.f32 [r10399+168], {r4257, r4264}; +st.shared.v2.f32 [r10399+176], {r4294, r4301}; +st.shared.v2.f32 [r10399+184], {r4331, r4338}; +st.shared.v2.f32 [r10399+192], {r4368, r4375}; +barrier.sync 0; +mad.lo.s32 r10400, r10397, -192, r10399; +ld.shared.u32 r4408, [r10400]; +ld.shared.u32 r4420, [r10400+4]; +ld.shared.u32 r4728, [r10400+1000]; +ld.shared.u32 r4740, [r10400+1004]; +ld.shared.u32 r5048, [r10400+2000]; +ld.shared.u32 r5060, [r10400+2004]; +ld.shared.u32 r5368, [r10400+3000]; +ld.shared.u32 r5380, [r10400+3004]; +ld.shared.u32 r5688, [r10400+4000]; +ld.shared.u32 r5700, [r10400+4004]; +ld.shared.u32 r4405, [r10400+5000]; +ld.shared.u32 r4417, [r10400+5004]; +ld.shared.u32 r4725, [r10400+6000]; +ld.shared.u32 r4737, [r10400+6004]; +ld.shared.u32 r5045, [r10400+7000]; +ld.shared.u32 r5057, [r10400+7004]; +ld.shared.u32 r5365, [r10400+8000]; +ld.shared.u32 r5377, [r10400+8004]; +ld.shared.u32 r5685, [r10400+9000]; +ld.shared.u32 r5697, [r10400+9004]; +ld.shared.u32 r4411, [r10400+10000]; +ld.shared.u32 r4423, [r10400+10004]; +ld.shared.u32 r4731, [r10400+11000]; +ld.shared.u32 r4743, [r10400+11004]; +ld.shared.u32 r5051, [r10400+12000]; +ld.shared.u32 r5063, [r10400+12004]; +ld.shared.u32 r5371, [r10400+13000]; +ld.shared.u32 r5383, [r10400+13004]; +ld.shared.u32 r5691, [r10400+14000]; +ld.shared.u32 r5703, [r10400+14004]; +ld.shared.u32 r4412, [r10400+15000]; +ld.shared.u32 r4424, [r10400+15004]; +ld.shared.u32 r4732, [r10400+16000]; +ld.shared.u32 r4744, [r10400+16004]; 
+ld.shared.u32 r5052, [r10400+17000]; +ld.shared.u32 r5064, [r10400+17004]; +ld.shared.u32 r5372, [r10400+18000]; +ld.shared.u32 r5384, [r10400+18004]; +ld.shared.u32 r5692, [r10400+19000]; +ld.shared.u32 r5704, [r10400+19004]; +ld.shared.u32 r4406, [r10400+20000]; +ld.shared.u32 r4418, [r10400+20004]; +ld.shared.u32 r4726, [r10400+21000]; +ld.shared.u32 r4738, [r10400+21004]; +ld.shared.u32 r5046, [r10400+22000]; +ld.shared.u32 r5058, [r10400+22004]; +ld.shared.u32 r5366, [r10400+23000]; +ld.shared.u32 r5378, [r10400+23004]; +ld.shared.u32 r5686, [r10400+24000]; +ld.shared.u32 r5698, [r10400+24004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 r4413, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 r4425, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ 
+add.f16x2 r4458, r4449, r4455; +} +{ +sub.f16x2 r4461, r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ +mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ +add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 r4497, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 r4533, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 r4569, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 r4605, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, 
r4420, r4611; +} +{ +add.f16x2 r4617, r4423, r4424; +} +{ +mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; +} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 r4641, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 r4677, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ +sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 r4713, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, 
r4725, r4726; +} +{ +add.f16x2 r4727, r4728, r4724; +} +{ +add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 r4733, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ +add.f16x2 r4745, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 r4781, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ +sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 r4817, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 r4853, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ 
+mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 r4886, r4877, r4883; +} +{ +add.f16x2 r4889, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} +{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ +mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 r4925, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 r4961, r4943, r4958; +} +{ +add.f16x2 r4964, r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 r4997, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 r5033, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; 
+cvt.rn.f16.f32 high, f594; +mov.b32 r5036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5038, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 r5053, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 r5065, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 r5101, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 r5137, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 
r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ +add.f16x2 r5155, r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ +sub.f16x2 r5173, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 r5209, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ +mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 r5245, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 r5281, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} 
+{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, r5308, r5042; +} +{ +add.f16x2 r5314, r5305, r5311; +} +{ +add.f16x2 r5317, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; +} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ +sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 r5353, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 r5373, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 r5385, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ 
+add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 r5421, r5403, r5418; +} +{ +add.f16x2 r5424, r5365, r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 r5457, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 r5493, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 r5529, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 r5565, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, 
r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ +mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; +} +{ +sub.f16x2 r5601, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 r5637, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 r5673, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, 
r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ +add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 r5693, r5687, r5690; +} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 r5705, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 r5741, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 r5777, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 r5813, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ 
+mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, r5837, r5843; +} +{ +add.f16x2 r5849, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ +sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ +mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 r5885, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 r5921, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} +{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 r5957, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 r5993, r5975, r5990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; 
+cvt.rn.f16.f32 high, f330; +mov.b32 r5996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r5997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r5998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r5999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6000, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6002, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6003, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6006, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6007, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6027, {low, high}; +} +{ +mul.f16x2 r6044, r4781, r5996; 
+} +{ +mul.f16x2 r6047, r4925, r5997; +} +{ +sub.f16x2 r6050, r6044, r6047; +} +{ +mul.f16x2 r6053, r4781, r5997; +} +{ +fma.rn.f16x2 r6056, r4925, r5996, r6053; +} +{ +mul.f16x2 r6060, r5101, r5998; +} +{ +mul.f16x2 r6063, r5245, r5999; +} +{ +sub.f16x2 r6066, r6060, r6063; +} +{ +mul.f16x2 r6069, r5101, r5999; +} +{ +fma.rn.f16x2 r6072, r5245, r5998, r6069; +} +{ +mul.f16x2 r6076, r5421, r6000; +} +{ +mul.f16x2 r6079, r5565, r6001; +} +{ +sub.f16x2 r6082, r6076, r6079; +} +{ +mul.f16x2 r6085, r5421, r6001; +} +{ +fma.rn.f16x2 r6088, r5565, r6000, r6085; +} +{ +mul.f16x2 r6092, r5741, r6002; +} +{ +mul.f16x2 r6095, r5885, r6003; +} +{ +sub.f16x2 r6098, r6092, r6095; +} +{ +mul.f16x2 r6101, r5741, r6003; +} +{ +fma.rn.f16x2 r6104, r5885, r6002, r6101; +} +{ +mul.f16x2 r6108, r4853, r5998; +} +{ +mul.f16x2 r6111, r4997, r5999; +} +{ +sub.f16x2 r6114, r6108, r6111; +} +{ +mul.f16x2 r6117, r4853, r5999; +} +{ +fma.rn.f16x2 r6120, r4997, r5998, r6117; +} +{ +mul.f16x2 r6124, r5173, r6002; +} +{ +mul.f16x2 r6127, r5317, r6003; +} +{ +sub.f16x2 r6130, r6124, r6127; +} +{ +mul.f16x2 r6133, r5173, r6003; +} +{ +fma.rn.f16x2 r6136, r5317, r6002, r6133; +} +{ +mul.f16x2 r6140, r5493, r6006; +} +{ +mul.f16x2 r6143, r5637, r6007; +} +{ +sub.f16x2 r6146, r6140, r6143; +} +{ +mul.f16x2 r6149, r5493, r6007; +} +{ +fma.rn.f16x2 r6152, r5637, r6006, r6149; +} +{ +mul.f16x2 r6156, r5813, r6010; +} +{ +mul.f16x2 r6159, r5957, r6011; +} +{ +sub.f16x2 r6162, r6156, r6159; +} +{ +mul.f16x2 r6165, r5813, r6011; +} +{ +fma.rn.f16x2 r6168, r5957, r6010, r6165; +} +{ +mul.f16x2 r6172, r4889, r6000; +} +{ +mul.f16x2 r6175, r5033, r6001; +} +{ +sub.f16x2 r6178, r6172, r6175; +} +{ +mul.f16x2 r6181, r4889, r6001; +} +{ +fma.rn.f16x2 r6184, r5033, r6000, r6181; +} +{ +mul.f16x2 r6188, r5209, r6006; +} +{ +mul.f16x2 r6191, r5353, r6007; +} +{ +sub.f16x2 r6194, r6188, r6191; +} +{ +mul.f16x2 r6197, r5209, r6007; +} +{ +fma.rn.f16x2 r6200, r5353, r6006, r6197; +} +{ +mul.f16x2 r6204, r5529, r6012; 
+} +{ +mul.f16x2 r6207, r5673, r6013; +} +{ +sub.f16x2 r6210, r6204, r6207; +} +{ +mul.f16x2 r6213, r5529, r6013; +} +{ +fma.rn.f16x2 r6216, r5673, r6012, r6213; +} +{ +mul.f16x2 r6220, r5849, r6018; +} +{ +mul.f16x2 r6223, r5993, r6019; +} +{ +sub.f16x2 r6226, r6220, r6223; +} +{ +mul.f16x2 r6229, r5849, r6019; +} +{ +fma.rn.f16x2 r6232, r5993, r6018, r6229; +} +{ +mul.f16x2 r6236, r4817, r6002; +} +{ +mul.f16x2 r6239, r4961, r6003; +} +{ +sub.f16x2 r6242, r6236, r6239; +} +{ +mul.f16x2 r6245, r4817, r6003; +} +{ +fma.rn.f16x2 r6248, r4961, r6002, r6245; +} +{ +mul.f16x2 r6252, r5137, r6010; +} +{ +mul.f16x2 r6255, r5281, r6011; +} +{ +sub.f16x2 r6258, r6252, r6255; +} +{ +mul.f16x2 r6261, r5137, r6011; +} +{ +fma.rn.f16x2 r6264, r5281, r6010, r6261; +} +{ +mul.f16x2 r6268, r5457, r6018; +} +{ +mul.f16x2 r6271, r5601, r6019; +} +{ +sub.f16x2 r6274, r6268, r6271; +} +{ +mul.f16x2 r6277, r5457, r6019; +} +{ +fma.rn.f16x2 r6280, r5601, r6018, r6277; +} +{ +mul.f16x2 r6284, r5777, r6026; +} +{ +mul.f16x2 r6287, r5921, r6027; +} +{ +sub.f16x2 r6290, r6284, r6287; +} +{ +mul.f16x2 r6293, r5777, r6027; +} +{ +fma.rn.f16x2 r6296, r5921, r6026, r6293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6302, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6305, {low, high}; +} +{ +neg.f16x2 r6306, r6305; +} +{ +add.f16x2 r6308, r4733, r5693; +} +{ +add.f16x2 r6311, r4413, r6308; +} +{ +add.f16x2 r6314, r5053, r5373; +} +{ +add.f16x2 
r6317, r6311, r6314; +} +{ +add.f16x2 r6320, r4745, r5705; +} +{ +add.f16x2 r6323, r4425, r6320; +} +{ +add.f16x2 r6326, r5065, r5385; +} +{ +add.f16x2 r6329, r6323, r6326; +} +{ +add.f16x2 r6332, r4733, r5693; +} +{ +mul.f16x2 r6335, r6332, r6300; +} +{ +add.f16x2 r6338, r4413, r6335; +} +{ +add.f16x2 r6341, r5053, r5373; +} +{ +mul.f16x2 r6344, r6341, r6302; +} +{ +add.f16x2 r6347, r6338, r6344; +} +{ +sub.f16x2 r6350, r4745, r5705; +} +{ +mul.f16x2 r6353, r6350, r6301; +} +{ +sub.f16x2 r6356, r5065, r5385; +} +{ +mul.f16x2 r6359, r6356, r6303; +} +{ +add.f16x2 r6362, r6353, r6359; +} +{ +sub.f16x2 r6365, r6347, r6362; +} +{ +add.f16x2 r6368, r4733, r5693; +} +{ +mul.f16x2 r6371, r6368, r6300; +} +{ +add.f16x2 r6374, r4413, r6371; +} +{ +add.f16x2 r6377, r5053, r5373; +} +{ +mul.f16x2 r6380, r6377, r6302; +} +{ +add.f16x2 r6383, r6374, r6380; +} +{ +sub.f16x2 r6386, r4745, r5705; +} +{ +mul.f16x2 r6389, r6386, r6301; +} +{ +sub.f16x2 r6392, r5065, r5385; +} +{ +mul.f16x2 r6395, r6392, r6303; +} +{ +add.f16x2 r6398, r6389, r6395; +} +{ +add.f16x2 r6401, r6383, r6398; +} +{ +add.f16x2 r6404, r4733, r5693; +} +{ +mul.f16x2 r6407, r6404, r6302; +} +{ +add.f16x2 r6410, r4413, r6407; +} +{ +add.f16x2 r6413, r5053, r5373; +} +{ +mul.f16x2 r6416, r6413, r6304; +} +{ +add.f16x2 r6419, r6410, r6416; +} +{ +sub.f16x2 r6422, r4745, r5705; +} +{ +mul.f16x2 r6425, r6422, r6303; +} +{ +sub.f16x2 r6428, r5065, r5385; +} +{ +mul.f16x2 r6431, r6428, r6306; +} +{ +add.f16x2 r6434, r6425, r6431; +} +{ +sub.f16x2 r6437, r6419, r6434; +} +{ +add.f16x2 r6440, r4733, r5693; +} +{ +mul.f16x2 r6443, r6440, r6302; +} +{ +add.f16x2 r6446, r4413, r6443; +} +{ +add.f16x2 r6449, r5053, r5373; +} +{ +mul.f16x2 r6452, r6449, r6304; +} +{ +add.f16x2 r6455, r6446, r6452; +} +{ +sub.f16x2 r6458, r4745, r5705; +} +{ +mul.f16x2 r6461, r6458, r6303; +} +{ +sub.f16x2 r6464, r5065, r5385; +} +{ +mul.f16x2 r6467, r6464, r6306; +} +{ +add.f16x2 r6470, r6461, r6467; +} +{ +add.f16x2 r6473, r6455, r6470; +} 
+{ +add.f16x2 r6476, r4745, r5705; +} +{ +mul.f16x2 r6479, r6476, r6300; +} +{ +add.f16x2 r6482, r4425, r6479; +} +{ +add.f16x2 r6485, r5065, r5385; +} +{ +mul.f16x2 r6488, r6485, r6302; +} +{ +add.f16x2 r6491, r6482, r6488; +} +{ +sub.f16x2 r6494, r4733, r5693; +} +{ +mul.f16x2 r6497, r6494, r6301; +} +{ +sub.f16x2 r6500, r5053, r5373; +} +{ +mul.f16x2 r6503, r6500, r6303; +} +{ +add.f16x2 r6506, r6497, r6503; +} +{ +add.f16x2 r6509, r6491, r6506; +} +{ +add.f16x2 r6512, r4745, r5705; +} +{ +mul.f16x2 r6515, r6512, r6300; +} +{ +add.f16x2 r6518, r4425, r6515; +} +{ +add.f16x2 r6521, r5065, r5385; +} +{ +mul.f16x2 r6524, r6521, r6302; +} +{ +add.f16x2 r6527, r6518, r6524; +} +{ +sub.f16x2 r6530, r4733, r5693; +} +{ +mul.f16x2 r6533, r6530, r6301; +} +{ +sub.f16x2 r6536, r5053, r5373; +} +{ +mul.f16x2 r6539, r6536, r6303; +} +{ +add.f16x2 r6542, r6533, r6539; +} +{ +sub.f16x2 r6545, r6527, r6542; +} +{ +add.f16x2 r6548, r4745, r5705; +} +{ +mul.f16x2 r6551, r6548, r6302; +} +{ +add.f16x2 r6554, r4425, r6551; +} +{ +add.f16x2 r6557, r5065, r5385; +} +{ +mul.f16x2 r6560, r6557, r6304; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +sub.f16x2 r6566, r4733, r5693; +} +{ +mul.f16x2 r6569, r6566, r6303; +} +{ +sub.f16x2 r6572, r5053, r5373; +} +{ +mul.f16x2 r6575, r6572, r6306; +} +{ +add.f16x2 r6578, r6569, r6575; +} +{ +add.f16x2 r6581, r6563, r6578; +} +{ +add.f16x2 r6584, r4745, r5705; +} +{ +mul.f16x2 r6587, r6584, r6302; +} +{ +add.f16x2 r6590, r4425, r6587; +} +{ +add.f16x2 r6593, r5065, r5385; +} +{ +mul.f16x2 r6596, r6593, r6304; +} +{ +add.f16x2 r6599, r6590, r6596; +} +{ +sub.f16x2 r6602, r4733, r5693; +} +{ +mul.f16x2 r6605, r6602, r6303; +} +{ +sub.f16x2 r6608, r5053, r5373; +} +{ +mul.f16x2 r6611, r6608, r6306; +} +{ +add.f16x2 r6614, r6605, r6611; +} +{ +sub.f16x2 r6617, r6599, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; 
+cvt.rn.f16.f32 high, f596; +mov.b32 r6621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6623, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6625, {low, high}; +} +{ +neg.f16x2 r6626, r6625; +} +{ +add.f16x2 r6628, r6050, r6098; +} +{ +add.f16x2 r6631, r4461, r6628; +} +{ +add.f16x2 r6634, r6066, r6082; +} +{ +add.f16x2 r6637, r6631, r6634; +} +{ +add.f16x2 r6640, r6056, r6104; +} +{ +add.f16x2 r6643, r4605, r6640; +} +{ +add.f16x2 r6646, r6072, r6088; +} +{ +add.f16x2 r6649, r6643, r6646; +} +{ +add.f16x2 r6652, r6050, r6098; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4461, r6655; +} +{ +add.f16x2 r6661, r6066, r6082; +} +{ +mul.f16x2 r6664, r6661, r6622; +} +{ +add.f16x2 r6667, r6658, r6664; +} +{ +sub.f16x2 r6670, r6056, r6104; +} +{ +mul.f16x2 r6673, r6670, r6621; +} +{ +sub.f16x2 r6676, r6072, r6088; +} +{ +mul.f16x2 r6679, r6676, r6623; +} +{ +add.f16x2 r6682, r6673, r6679; +} +{ +sub.f16x2 r6685, r6667, r6682; +} +{ +add.f16x2 r6688, r6050, r6098; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4461, r6691; +} +{ +add.f16x2 r6697, r6066, r6082; +} +{ +mul.f16x2 r6700, r6697, r6622; +} +{ +add.f16x2 r6703, r6694, r6700; +} +{ +sub.f16x2 r6706, r6056, r6104; +} +{ +mul.f16x2 r6709, r6706, r6621; +} +{ +sub.f16x2 r6712, r6072, r6088; +} +{ +mul.f16x2 r6715, r6712, r6623; +} +{ +add.f16x2 r6718, r6709, r6715; +} +{ +add.f16x2 r6721, r6703, r6718; +} +{ +add.f16x2 r6724, r6050, r6098; +} +{ +mul.f16x2 r6727, r6724, r6622; +} +{ +add.f16x2 r6730, r4461, r6727; +} +{ +add.f16x2 r6733, r6066, r6082; +} +{ +mul.f16x2 r6736, r6733, r6624; +} +{ +add.f16x2 r6739, r6730, r6736; +} +{ +sub.f16x2 
r6742, r6056, r6104; +} +{ +mul.f16x2 r6745, r6742, r6623; +} +{ +sub.f16x2 r6748, r6072, r6088; +} +{ +mul.f16x2 r6751, r6748, r6626; +} +{ +add.f16x2 r6754, r6745, r6751; +} +{ +sub.f16x2 r6757, r6739, r6754; +} +{ +add.f16x2 r6760, r6050, r6098; +} +{ +mul.f16x2 r6763, r6760, r6622; +} +{ +add.f16x2 r6766, r4461, r6763; +} +{ +add.f16x2 r6769, r6066, r6082; +} +{ +mul.f16x2 r6772, r6769, r6624; +} +{ +add.f16x2 r6775, r6766, r6772; +} +{ +sub.f16x2 r6778, r6056, r6104; +} +{ +mul.f16x2 r6781, r6778, r6623; +} +{ +sub.f16x2 r6784, r6072, r6088; +} +{ +mul.f16x2 r6787, r6784, r6626; +} +{ +add.f16x2 r6790, r6781, r6787; +} +{ +add.f16x2 r6793, r6775, r6790; +} +{ +add.f16x2 r6796, r6056, r6104; +} +{ +mul.f16x2 r6799, r6796, r6620; +} +{ +add.f16x2 r6802, r4605, r6799; +} +{ +add.f16x2 r6805, r6072, r6088; +} +{ +mul.f16x2 r6808, r6805, r6622; +} +{ +add.f16x2 r6811, r6802, r6808; +} +{ +sub.f16x2 r6814, r6050, r6098; +} +{ +mul.f16x2 r6817, r6814, r6621; +} +{ +sub.f16x2 r6820, r6066, r6082; +} +{ +mul.f16x2 r6823, r6820, r6623; +} +{ +add.f16x2 r6826, r6817, r6823; +} +{ +add.f16x2 r6829, r6811, r6826; +} +{ +add.f16x2 r6832, r6056, r6104; +} +{ +mul.f16x2 r6835, r6832, r6620; +} +{ +add.f16x2 r6838, r4605, r6835; +} +{ +add.f16x2 r6841, r6072, r6088; +} +{ +mul.f16x2 r6844, r6841, r6622; +} +{ +add.f16x2 r6847, r6838, r6844; +} +{ +sub.f16x2 r6850, r6050, r6098; +} +{ +mul.f16x2 r6853, r6850, r6621; +} +{ +sub.f16x2 r6856, r6066, r6082; +} +{ +mul.f16x2 r6859, r6856, r6623; +} +{ +add.f16x2 r6862, r6853, r6859; +} +{ +sub.f16x2 r6865, r6847, r6862; +} +{ +add.f16x2 r6868, r6056, r6104; +} +{ +mul.f16x2 r6871, r6868, r6622; +} +{ +add.f16x2 r6874, r4605, r6871; +} +{ +add.f16x2 r6877, r6072, r6088; +} +{ +mul.f16x2 r6880, r6877, r6624; +} +{ +add.f16x2 r6883, r6874, r6880; +} +{ +sub.f16x2 r6886, r6050, r6098; +} +{ +mul.f16x2 r6889, r6886, r6623; +} +{ +sub.f16x2 r6892, r6066, r6082; +} +{ +mul.f16x2 r6895, r6892, r6626; +} +{ +add.f16x2 r6898, r6889, r6895; +} 
+{ +add.f16x2 r6901, r6883, r6898; +} +{ +add.f16x2 r6904, r6056, r6104; +} +{ +mul.f16x2 r6907, r6904, r6622; +} +{ +add.f16x2 r6910, r4605, r6907; +} +{ +add.f16x2 r6913, r6072, r6088; +} +{ +mul.f16x2 r6916, r6913, r6624; +} +{ +add.f16x2 r6919, r6910, r6916; +} +{ +sub.f16x2 r6922, r6050, r6098; +} +{ +mul.f16x2 r6925, r6922, r6623; +} +{ +sub.f16x2 r6928, r6066, r6082; +} +{ +mul.f16x2 r6931, r6928, r6626; +} +{ +add.f16x2 r6934, r6925, r6931; +} +{ +sub.f16x2 r6937, r6919, r6934; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6945, {low, high}; +} +{ +neg.f16x2 r6946, r6945; +} +{ +add.f16x2 r6948, r6114, r6162; +} +{ +add.f16x2 r6951, r4533, r6948; +} +{ +add.f16x2 r6954, r6130, r6146; +} +{ +add.f16x2 r6957, r6951, r6954; +} +{ +add.f16x2 r6960, r6120, r6168; +} +{ +add.f16x2 r6963, r4677, r6960; +} +{ +add.f16x2 r6966, r6136, r6152; +} +{ +add.f16x2 r6969, r6963, r6966; +} +{ +add.f16x2 r6972, r6114, r6162; +} +{ +mul.f16x2 r6975, r6972, r6940; +} +{ +add.f16x2 r6978, r4533, r6975; +} +{ +add.f16x2 r6981, r6130, r6146; +} +{ +mul.f16x2 r6984, r6981, r6942; +} +{ +add.f16x2 r6987, r6978, r6984; +} +{ +sub.f16x2 r6990, r6120, r6168; +} +{ +mul.f16x2 r6993, r6990, r6941; +} +{ +sub.f16x2 r6996, r6136, r6152; +} +{ +mul.f16x2 r6999, r6996, r6943; +} +{ +add.f16x2 r7002, r6993, r6999; +} +{ +sub.f16x2 r7005, r6987, r7002; +} +{ +add.f16x2 r7008, r6114, r6162; +} +{ 
+mul.f16x2 r7011, r7008, r6940; +} +{ +add.f16x2 r7014, r4533, r7011; +} +{ +add.f16x2 r7017, r6130, r6146; +} +{ +mul.f16x2 r7020, r7017, r6942; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6120, r6168; +} +{ +mul.f16x2 r7029, r7026, r6941; +} +{ +sub.f16x2 r7032, r6136, r6152; +} +{ +mul.f16x2 r7035, r7032, r6943; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +add.f16x2 r7041, r7023, r7038; +} +{ +add.f16x2 r7044, r6114, r6162; +} +{ +mul.f16x2 r7047, r7044, r6942; +} +{ +add.f16x2 r7050, r4533, r7047; +} +{ +add.f16x2 r7053, r6130, r6146; +} +{ +mul.f16x2 r7056, r7053, r6944; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6120, r6168; +} +{ +mul.f16x2 r7065, r7062, r6943; +} +{ +sub.f16x2 r7068, r6136, r6152; +} +{ +mul.f16x2 r7071, r7068, r6946; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +sub.f16x2 r7077, r7059, r7074; +} +{ +add.f16x2 r7080, r6114, r6162; +} +{ +mul.f16x2 r7083, r7080, r6942; +} +{ +add.f16x2 r7086, r4533, r7083; +} +{ +add.f16x2 r7089, r6130, r6146; +} +{ +mul.f16x2 r7092, r7089, r6944; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6120, r6168; +} +{ +mul.f16x2 r7101, r7098, r6943; +} +{ +sub.f16x2 r7104, r6136, r6152; +} +{ +mul.f16x2 r7107, r7104, r6946; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +add.f16x2 r7113, r7095, r7110; +} +{ +add.f16x2 r7116, r6120, r6168; +} +{ +mul.f16x2 r7119, r7116, r6940; +} +{ +add.f16x2 r7122, r4677, r7119; +} +{ +add.f16x2 r7125, r6136, r6152; +} +{ +mul.f16x2 r7128, r7125, r6942; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6114, r6162; +} +{ +mul.f16x2 r7137, r7134, r6941; +} +{ +sub.f16x2 r7140, r6130, r6146; +} +{ +mul.f16x2 r7143, r7140, r6943; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 r7149, r7131, r7146; +} +{ +add.f16x2 r7152, r6120, r6168; +} +{ +mul.f16x2 r7155, r7152, r6940; +} +{ +add.f16x2 r7158, r4677, r7155; +} +{ +add.f16x2 r7161, r6136, r6152; +} +{ +mul.f16x2 r7164, r7161, r6942; +} +{ +add.f16x2 r7167, 
r7158, r7164; +} +{ +sub.f16x2 r7170, r6114, r6162; +} +{ +mul.f16x2 r7173, r7170, r6941; +} +{ +sub.f16x2 r7176, r6130, r6146; +} +{ +mul.f16x2 r7179, r7176, r6943; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +sub.f16x2 r7185, r7167, r7182; +} +{ +add.f16x2 r7188, r6120, r6168; +} +{ +mul.f16x2 r7191, r7188, r6942; +} +{ +add.f16x2 r7194, r4677, r7191; +} +{ +add.f16x2 r7197, r6136, r6152; +} +{ +mul.f16x2 r7200, r7197, r6944; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6114, r6162; +} +{ +mul.f16x2 r7209, r7206, r6943; +} +{ +sub.f16x2 r7212, r6130, r6146; +} +{ +mul.f16x2 r7215, r7212, r6946; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +add.f16x2 r7221, r7203, r7218; +} +{ +add.f16x2 r7224, r6120, r6168; +} +{ +mul.f16x2 r7227, r7224, r6942; +} +{ +add.f16x2 r7230, r4677, r7227; +} +{ +add.f16x2 r7233, r6136, r6152; +} +{ +mul.f16x2 r7236, r7233, r6944; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6114, r6162; +} +{ +mul.f16x2 r7245, r7242, r6943; +} +{ +sub.f16x2 r7248, r6130, r6146; +} +{ +mul.f16x2 r7251, r7248, r6946; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +sub.f16x2 r7257, r7239, r7254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7265, {low, high}; +} +{ +neg.f16x2 r7266, r7265; +} +{ +add.f16x2 r7268, r6178, r6226; +} +{ +add.f16x2 r7271, r4569, r7268; +} +{ +add.f16x2 r7274, r6194, r6210; +} +{ +add.f16x2 r7277, 
r7271, r7274; +} +{ +add.f16x2 r7280, r6184, r6232; +} +{ +add.f16x2 r7283, r4713, r7280; +} +{ +add.f16x2 r7286, r6200, r6216; +} +{ +add.f16x2 r7289, r7283, r7286; +} +{ +add.f16x2 r7292, r6178, r6226; +} +{ +mul.f16x2 r7295, r7292, r7260; +} +{ +add.f16x2 r7298, r4569, r7295; +} +{ +add.f16x2 r7301, r6194, r6210; +} +{ +mul.f16x2 r7304, r7301, r7262; +} +{ +add.f16x2 r7307, r7298, r7304; +} +{ +sub.f16x2 r7310, r6184, r6232; +} +{ +mul.f16x2 r7313, r7310, r7261; +} +{ +sub.f16x2 r7316, r6200, r6216; +} +{ +mul.f16x2 r7319, r7316, r7263; +} +{ +add.f16x2 r7322, r7313, r7319; +} +{ +sub.f16x2 r7325, r7307, r7322; +} +{ +add.f16x2 r7328, r6178, r6226; +} +{ +mul.f16x2 r7331, r7328, r7260; +} +{ +add.f16x2 r7334, r4569, r7331; +} +{ +add.f16x2 r7337, r6194, r6210; +} +{ +mul.f16x2 r7340, r7337, r7262; +} +{ +add.f16x2 r7343, r7334, r7340; +} +{ +sub.f16x2 r7346, r6184, r6232; +} +{ +mul.f16x2 r7349, r7346, r7261; +} +{ +sub.f16x2 r7352, r6200, r6216; +} +{ +mul.f16x2 r7355, r7352, r7263; +} +{ +add.f16x2 r7358, r7349, r7355; +} +{ +add.f16x2 r7361, r7343, r7358; +} +{ +add.f16x2 r7364, r6178, r6226; +} +{ +mul.f16x2 r7367, r7364, r7262; +} +{ +add.f16x2 r7370, r4569, r7367; +} +{ +add.f16x2 r7373, r6194, r6210; +} +{ +mul.f16x2 r7376, r7373, r7264; +} +{ +add.f16x2 r7379, r7370, r7376; +} +{ +sub.f16x2 r7382, r6184, r6232; +} +{ +mul.f16x2 r7385, r7382, r7263; +} +{ +sub.f16x2 r7388, r6200, r6216; +} +{ +mul.f16x2 r7391, r7388, r7266; +} +{ +add.f16x2 r7394, r7385, r7391; +} +{ +sub.f16x2 r7397, r7379, r7394; +} +{ +add.f16x2 r7400, r6178, r6226; +} +{ +mul.f16x2 r7403, r7400, r7262; +} +{ +add.f16x2 r7406, r4569, r7403; +} +{ +add.f16x2 r7409, r6194, r6210; +} +{ +mul.f16x2 r7412, r7409, r7264; +} +{ +add.f16x2 r7415, r7406, r7412; +} +{ +sub.f16x2 r7418, r6184, r6232; +} +{ +mul.f16x2 r7421, r7418, r7263; +} +{ +sub.f16x2 r7424, r6200, r6216; +} +{ +mul.f16x2 r7427, r7424, r7266; +} +{ +add.f16x2 r7430, r7421, r7427; +} +{ +add.f16x2 r7433, r7415, r7430; +} +{ 
+add.f16x2 r7436, r6184, r6232; +} +{ +mul.f16x2 r7439, r7436, r7260; +} +{ +add.f16x2 r7442, r4713, r7439; +} +{ +add.f16x2 r7445, r6200, r6216; +} +{ +mul.f16x2 r7448, r7445, r7262; +} +{ +add.f16x2 r7451, r7442, r7448; +} +{ +sub.f16x2 r7454, r6178, r6226; +} +{ +mul.f16x2 r7457, r7454, r7261; +} +{ +sub.f16x2 r7460, r6194, r6210; +} +{ +mul.f16x2 r7463, r7460, r7263; +} +{ +add.f16x2 r7466, r7457, r7463; +} +{ +add.f16x2 r7469, r7451, r7466; +} +{ +add.f16x2 r7472, r6184, r6232; +} +{ +mul.f16x2 r7475, r7472, r7260; +} +{ +add.f16x2 r7478, r4713, r7475; +} +{ +add.f16x2 r7481, r6200, r6216; +} +{ +mul.f16x2 r7484, r7481, r7262; +} +{ +add.f16x2 r7487, r7478, r7484; +} +{ +sub.f16x2 r7490, r6178, r6226; +} +{ +mul.f16x2 r7493, r7490, r7261; +} +{ +sub.f16x2 r7496, r6194, r6210; +} +{ +mul.f16x2 r7499, r7496, r7263; +} +{ +add.f16x2 r7502, r7493, r7499; +} +{ +sub.f16x2 r7505, r7487, r7502; +} +{ +add.f16x2 r7508, r6184, r6232; +} +{ +mul.f16x2 r7511, r7508, r7262; +} +{ +add.f16x2 r7514, r4713, r7511; +} +{ +add.f16x2 r7517, r6200, r6216; +} +{ +mul.f16x2 r7520, r7517, r7264; +} +{ +add.f16x2 r7523, r7514, r7520; +} +{ +sub.f16x2 r7526, r6178, r6226; +} +{ +mul.f16x2 r7529, r7526, r7263; +} +{ +sub.f16x2 r7532, r6194, r6210; +} +{ +mul.f16x2 r7535, r7532, r7266; +} +{ +add.f16x2 r7538, r7529, r7535; +} +{ +add.f16x2 r7541, r7523, r7538; +} +{ +add.f16x2 r7544, r6184, r6232; +} +{ +mul.f16x2 r7547, r7544, r7262; +} +{ +add.f16x2 r7550, r4713, r7547; +} +{ +add.f16x2 r7553, r6200, r6216; +} +{ +mul.f16x2 r7556, r7553, r7264; +} +{ +add.f16x2 r7559, r7550, r7556; +} +{ +sub.f16x2 r7562, r6178, r6226; +} +{ +mul.f16x2 r7565, r7562, r7263; +} +{ +sub.f16x2 r7568, r6194, r6210; +} +{ +mul.f16x2 r7571, r7568, r7266; +} +{ +add.f16x2 r7574, r7565, r7571; +} +{ +sub.f16x2 r7577, r7559, r7574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; 
+cvt.rn.f16.f32 high, f596; +mov.b32 r7581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7582, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7584, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7585, {low, high}; +} +{ +neg.f16x2 r7586, r7585; +} +{ +add.f16x2 r7588, r6242, r6290; +} +{ +add.f16x2 r7591, r4497, r7588; +} +{ +add.f16x2 r7594, r6258, r6274; +} +{ +add.f16x2 r7597, r7591, r7594; +} +{ +add.f16x2 r7600, r6248, r6296; +} +{ +add.f16x2 r7603, r4641, r7600; +} +{ +add.f16x2 r7606, r6264, r6280; +} +{ +add.f16x2 r7609, r7603, r7606; +} +{ +add.f16x2 r7612, r6242, r6290; +} +{ +mul.f16x2 r7615, r7612, r7580; +} +{ +add.f16x2 r7618, r4497, r7615; +} +{ +add.f16x2 r7621, r6258, r6274; +} +{ +mul.f16x2 r7624, r7621, r7582; +} +{ +add.f16x2 r7627, r7618, r7624; +} +{ +sub.f16x2 r7630, r6248, r6296; +} +{ +mul.f16x2 r7633, r7630, r7581; +} +{ +sub.f16x2 r7636, r6264, r6280; +} +{ +mul.f16x2 r7639, r7636, r7583; +} +{ +add.f16x2 r7642, r7633, r7639; +} +{ +sub.f16x2 r7645, r7627, r7642; +} +{ +add.f16x2 r7648, r6242, r6290; +} +{ +mul.f16x2 r7651, r7648, r7580; +} +{ +add.f16x2 r7654, r4497, r7651; +} +{ +add.f16x2 r7657, r6258, r6274; +} +{ +mul.f16x2 r7660, r7657, r7582; +} +{ +add.f16x2 r7663, r7654, r7660; +} +{ +sub.f16x2 r7666, r6248, r6296; +} +{ +mul.f16x2 r7669, r7666, r7581; +} +{ +sub.f16x2 r7672, r6264, r6280; +} +{ +mul.f16x2 r7675, r7672, r7583; +} +{ +add.f16x2 r7678, r7669, r7675; +} +{ +add.f16x2 r7681, r7663, r7678; +} +{ +add.f16x2 r7684, r6242, r6290; +} +{ +mul.f16x2 r7687, r7684, r7582; +} +{ +add.f16x2 r7690, r4497, r7687; +} +{ +add.f16x2 r7693, r6258, r6274; +} +{ +mul.f16x2 r7696, r7693, r7584; +} +{ +add.f16x2 r7699, r7690, r7696; +} +{ +sub.f16x2 
r7702, r6248, r6296; +} +{ +mul.f16x2 r7705, r7702, r7583; +} +{ +sub.f16x2 r7708, r6264, r6280; +} +{ +mul.f16x2 r7711, r7708, r7586; +} +{ +add.f16x2 r7714, r7705, r7711; +} +{ +sub.f16x2 r7717, r7699, r7714; +} +{ +add.f16x2 r7720, r6242, r6290; +} +{ +mul.f16x2 r7723, r7720, r7582; +} +{ +add.f16x2 r7726, r4497, r7723; +} +{ +add.f16x2 r7729, r6258, r6274; +} +{ +mul.f16x2 r7732, r7729, r7584; +} +{ +add.f16x2 r7735, r7726, r7732; +} +{ +sub.f16x2 r7738, r6248, r6296; +} +{ +mul.f16x2 r7741, r7738, r7583; +} +{ +sub.f16x2 r7744, r6264, r6280; +} +{ +mul.f16x2 r7747, r7744, r7586; +} +{ +add.f16x2 r7750, r7741, r7747; +} +{ +add.f16x2 r7753, r7735, r7750; +} +{ +add.f16x2 r7756, r6248, r6296; +} +{ +mul.f16x2 r7759, r7756, r7580; +} +{ +add.f16x2 r7762, r4641, r7759; +} +{ +add.f16x2 r7765, r6264, r6280; +} +{ +mul.f16x2 r7768, r7765, r7582; +} +{ +add.f16x2 r7771, r7762, r7768; +} +{ +sub.f16x2 r7774, r6242, r6290; +} +{ +mul.f16x2 r7777, r7774, r7581; +} +{ +sub.f16x2 r7780, r6258, r6274; +} +{ +mul.f16x2 r7783, r7780, r7583; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7771, r7786; +} +{ +add.f16x2 r7792, r6248, r6296; +} +{ +mul.f16x2 r7795, r7792, r7580; +} +{ +add.f16x2 r7798, r4641, r7795; +} +{ +add.f16x2 r7801, r6264, r6280; +} +{ +mul.f16x2 r7804, r7801, r7582; +} +{ +add.f16x2 r7807, r7798, r7804; +} +{ +sub.f16x2 r7810, r6242, r6290; +} +{ +mul.f16x2 r7813, r7810, r7581; +} +{ +sub.f16x2 r7816, r6258, r6274; +} +{ +mul.f16x2 r7819, r7816, r7583; +} +{ +add.f16x2 r7822, r7813, r7819; +} +{ +sub.f16x2 r7825, r7807, r7822; +} +{ +add.f16x2 r7828, r6248, r6296; +} +{ +mul.f16x2 r7831, r7828, r7582; +} +{ +add.f16x2 r7834, r4641, r7831; +} +{ +add.f16x2 r7837, r6264, r6280; +} +{ +mul.f16x2 r7840, r7837, r7584; +} +{ +add.f16x2 r7843, r7834, r7840; +} +{ +sub.f16x2 r7846, r6242, r6290; +} +{ +mul.f16x2 r7849, r7846, r7583; +} +{ +sub.f16x2 r7852, r6258, r6274; +} +{ +mul.f16x2 r7855, r7852, r7586; +} +{ +add.f16x2 r7858, r7849, r7855; +} 
+{ +add.f16x2 r7861, r7843, r7858; +} +{ +add.f16x2 r7864, r6248, r6296; +} +{ +mul.f16x2 r7867, r7864, r7582; +} +{ +add.f16x2 r7870, r4641, r7867; +} +{ +add.f16x2 r7873, r6264, r6280; +} +{ +mul.f16x2 r7876, r7873, r7584; +} +{ +add.f16x2 r7879, r7870, r7876; +} +{ +sub.f16x2 r7882, r6242, r6290; +} +{ +mul.f16x2 r7885, r7882, r7583; +} +{ +sub.f16x2 r7888, r6258, r6274; +} +{ +mul.f16x2 r7891, r7888, r7586; +} +{ +add.f16x2 r7894, r7885, r7891; +} +{ +sub.f16x2 r7897, r7879, r7894; +} +mul.wide.u32 rd4, r10397, 1374389535; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r10401, rd5; +cvt.rn.f32.u32 f600, r10401; +mul.f32 f601, f600, 0f3D4DE32E; +cos.approx.f32 f485, f601; +sin.approx.f32 f602, f601; +neg.f32 f486, f602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f485; +cvt.rn.f16.f32 high, f486; +mov.b32 r7900, {low, high}; +} +mul.lo.s32 r10402, r10401, 25; +sub.s32 r10403, r10397, r10402; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7903, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7905, {high, high}; +} +{ +mul.f16x2 r7907, r6649, r7905; +} +{ +neg.f16x2 r7910, r7907; +} +{ +fma.rn.f16x2 r7912, r6637, r7903, r7910; +} +{ +mul.f16x2 r7916, r6637, r7905; +} +{ +fma.rn.f16x2 r7919, r6649, r7903, r7916; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7925, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7927, {low, high}; +} +{ +mul.f16x2 r7928, r7925, r7927; +} +{ +mul.f16x2 r7931, r7900, r7923; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7934, {high, low}; +} +{ +fma.rn.f16x2 r7936, r7928, r7934, r7931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7940, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7942, {high, high}; +} +{ +mul.f16x2 r7944, r6969, r7942; +} +{ +neg.f16x2 r7947, 
r7944; +} +{ +fma.rn.f16x2 r7949, r6957, r7940, r7947; +} +{ +mul.f16x2 r7953, r6957, r7942; +} +{ +fma.rn.f16x2 r7956, r6969, r7940, r7953; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7962, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7964, {low, high}; +} +{ +mul.f16x2 r7965, r7962, r7964; +} +{ +mul.f16x2 r7968, r7936, r7960; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7971, {high, low}; +} +{ +fma.rn.f16x2 r7973, r7965, r7971, r7968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r7977, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r7979, {high, high}; +} +{ +mul.f16x2 r7981, r7289, r7979; +} +{ +neg.f16x2 r7984, r7981; +} +{ +fma.rn.f16x2 r7986, r7277, r7977, r7984; +} +{ +mul.f16x2 r7990, r7277, r7979; +} +{ +fma.rn.f16x2 r7993, r7289, r7977, r7990; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7999, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8001, {low, high}; +} +{ +mul.f16x2 r8002, r7999, r8001; +} +{ +mul.f16x2 r8005, r7973, r7997; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r8008, {high, low}; +} +{ +fma.rn.f16x2 r8010, r8002, r8008, r8005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8014, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8016, {high, high}; +} +{ +mul.f16x2 r8018, r7609, r8016; +} +{ +neg.f16x2 r8021, r8018; +} +{ +fma.rn.f16x2 r8023, r7597, r8014, r8021; +} +{ +mul.f16x2 r8027, r7597, r8016; +} +{ +fma.rn.f16x2 r8030, r7609, r8014, r8027; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8034, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7900; +mov.b32 r8036, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8038, {low, high}; +} +{ +mul.f16x2 r8039, r8036, r8038; +} +{ +mul.f16x2 r8042, r8010, r8034; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8045, {high, low}; +} +{ +fma.rn.f16x2 r8047, r8039, r8045, r8042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8051, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8053, {high, high}; +} +{ +mul.f16x2 r8055, r6509, r8053; +} +{ +neg.f16x2 r8058, r8055; +} +{ +fma.rn.f16x2 r8060, r6365, r8051, r8058; +} +{ +mul.f16x2 r8064, r6365, r8053; +} +{ +fma.rn.f16x2 r8067, r6509, r8051, r8064; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8071, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8073, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8075, {low, high}; +} +{ +mul.f16x2 r8076, r8073, r8075; +} +{ +mul.f16x2 r8079, r8047, r8071; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8082, {high, low}; +} +{ +fma.rn.f16x2 r8084, r8076, r8082, r8079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8088, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8090, {high, high}; +} +{ +mul.f16x2 r8092, r6829, r8090; +} +{ +neg.f16x2 r8095, r8092; +} +{ +fma.rn.f16x2 r8097, r6685, r8088, r8095; +} +{ +mul.f16x2 r8101, r6685, r8090; +} +{ +fma.rn.f16x2 r8104, r6829, r8088, r8101; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8108, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8110, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8112, {low, high}; +} +{ +mul.f16x2 r8113, r8110, r8112; +} +{ +mul.f16x2 r8116, r8084, r8108; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8119, {high, low}; +} +{ +fma.rn.f16x2 r8121, r8113, r8119, r8116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8125, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8127, {high, high}; +} +{ +mul.f16x2 r8129, r7149, r8127; +} +{ +neg.f16x2 r8132, r8129; +} +{ +fma.rn.f16x2 r8134, r7005, r8125, r8132; +} +{ +mul.f16x2 r8138, r7005, r8127; +} +{ +fma.rn.f16x2 r8141, r7149, r8125, r8138; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8145, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8147, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8149, {low, high}; +} +{ +mul.f16x2 r8150, r8147, r8149; +} +{ +mul.f16x2 r8153, r8121, r8145; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8156, {high, low}; +} +{ +fma.rn.f16x2 r8158, r8150, r8156, r8153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8158; +mov.b32 r8162, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8158; +mov.b32 r8164, {high, high}; +} +{ +mul.f16x2 r8166, r7469, r8164; +} +{ +neg.f16x2 r8169, r8166; +} +{ +fma.rn.f16x2 r8171, r7325, r8162, r8169; +} +{ +mul.f16x2 r8175, r7325, r8164; +} +{ +fma.rn.f16x2 r8178, r7469, r8162, r8175; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8182, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8184, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8186, {low, high}; +} +{ +mul.f16x2 r8187, r8184, r8186; +} +{ +mul.f16x2 r8190, r8158, r8182; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8158; +mov.b32 r8193, {high, low}; +} +{ +fma.rn.f16x2 r8195, r8187, r8193, r8190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8195; +mov.b32 r8199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r8195; +mov.b32 r8201, {high, high}; +} +{ +mul.f16x2 r8203, r7789, r8201; +} +{ +neg.f16x2 r8206, r8203; +} +{ +fma.rn.f16x2 r8208, r7645, r8199, r8206; +} +{ +mul.f16x2 r8212, r7645, r8201; +} +{ +fma.rn.f16x2 r8215, r7789, r8199, r8212; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8219, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8221, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8223, {low, high}; +} +{ +mul.f16x2 r8224, r8221, r8223; +} +{ +mul.f16x2 r8227, r8195, r8219; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8195; +mov.b32 r8230, {high, low}; +} +{ +fma.rn.f16x2 r8232, r8224, r8230, r8227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8236, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8238, {high, high}; +} +{ +mul.f16x2 r8240, r6581, r8238; +} +{ +neg.f16x2 r8243, r8240; +} +{ +fma.rn.f16x2 r8245, r6437, r8236, r8243; +} +{ +mul.f16x2 r8249, r6437, r8238; +} +{ +fma.rn.f16x2 r8252, r6581, r8236, r8249; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8256, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8258, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8260, {low, high}; +} +{ +mul.f16x2 r8261, r8258, r8260; +} +{ +mul.f16x2 r8264, r8232, r8256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8267, {high, low}; +} +{ +fma.rn.f16x2 r8269, r8261, r8267, r8264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8275, {high, high}; +} +{ +mul.f16x2 r8277, r6901, r8275; +} +{ +neg.f16x2 r8280, r8277; +} +{ +fma.rn.f16x2 r8282, r6757, r8273, r8280; +} +{ +mul.f16x2 r8286, r6757, r8275; +} +{ +fma.rn.f16x2 r8289, r6901, r8273, r8286; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r7900; +mov.b32 r8293, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8295, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8297, {low, high}; +} +{ +mul.f16x2 r8298, r8295, r8297; +} +{ +mul.f16x2 r8301, r8269, r8293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8304, {high, low}; +} +{ +fma.rn.f16x2 r8306, r8298, r8304, r8301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8310, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8312, {high, high}; +} +{ +mul.f16x2 r8314, r7221, r8312; +} +{ +neg.f16x2 r8317, r8314; +} +{ +fma.rn.f16x2 r8319, r7077, r8310, r8317; +} +{ +mul.f16x2 r8323, r7077, r8312; +} +{ +fma.rn.f16x2 r8326, r7221, r8310, r8323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8330, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8332, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8334, {low, high}; +} +{ +mul.f16x2 r8335, r8332, r8334; +} +{ +mul.f16x2 r8338, r8306, r8330; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8341, {high, low}; +} +{ +fma.rn.f16x2 r8343, r8335, r8341, r8338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8347, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8349, {high, high}; +} +{ +mul.f16x2 r8351, r7541, r8349; +} +{ +neg.f16x2 r8354, r8351; +} +{ +fma.rn.f16x2 r8356, r7397, r8347, r8354; +} +{ +mul.f16x2 r8360, r7397, r8349; +} +{ +fma.rn.f16x2 r8363, r7541, r8347, r8360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8367, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8369, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8371, {low, high}; 
+} +{ +mul.f16x2 r8372, r8369, r8371; +} +{ +mul.f16x2 r8375, r8343, r8367; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8378, {high, low}; +} +{ +fma.rn.f16x2 r8380, r8372, r8378, r8375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 r8384, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 r8386, {high, high}; +} +{ +mul.f16x2 r8388, r7861, r8386; +} +{ +neg.f16x2 r8391, r8388; +} +{ +fma.rn.f16x2 r8393, r7717, r8384, r8391; +} +{ +mul.f16x2 r8397, r7717, r8386; +} +{ +fma.rn.f16x2 r8400, r7861, r8384, r8397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8404, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8406, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8408, {low, high}; +} +{ +mul.f16x2 r8409, r8406, r8408; +} +{ +mul.f16x2 r8412, r8380, r8404; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 r8415, {high, low}; +} +{ +fma.rn.f16x2 r8417, r8409, r8415, r8412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8421, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8423, {high, high}; +} +{ +mul.f16x2 r8425, r6617, r8423; +} +{ +neg.f16x2 r8428, r8425; +} +{ +fma.rn.f16x2 r8430, r6473, r8421, r8428; +} +{ +mul.f16x2 r8434, r6473, r8423; +} +{ +fma.rn.f16x2 r8437, r6617, r8421, r8434; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8441, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8443, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8445, {low, high}; +} +{ +mul.f16x2 r8446, r8443, r8445; +} +{ +mul.f16x2 r8449, r8417, r8441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8452, {high, low}; +} +{ +fma.rn.f16x2 r8454, r8446, r8452, r8449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r8454; +mov.b32 r8458, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8454; +mov.b32 r8460, {high, high}; +} +{ +mul.f16x2 r8462, r6937, r8460; +} +{ +neg.f16x2 r8465, r8462; +} +{ +fma.rn.f16x2 r8467, r6793, r8458, r8465; +} +{ +mul.f16x2 r8471, r6793, r8460; +} +{ +fma.rn.f16x2 r8474, r6937, r8458, r8471; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8478, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8480, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8482, {low, high}; +} +{ +mul.f16x2 r8483, r8480, r8482; +} +{ +mul.f16x2 r8486, r8454, r8478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8454; +mov.b32 r8489, {high, low}; +} +{ +fma.rn.f16x2 r8491, r8483, r8489, r8486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8497, {high, high}; +} +{ +mul.f16x2 r8499, r7257, r8497; +} +{ +neg.f16x2 r8502, r8499; +} +{ +fma.rn.f16x2 r8504, r7113, r8495, r8502; +} +{ +mul.f16x2 r8508, r7113, r8497; +} +{ +fma.rn.f16x2 r8511, r7257, r8495, r8508; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8519, {low, high}; +} +{ +mul.f16x2 r8520, r8517, r8519; +} +{ +mul.f16x2 r8523, r8491, r8515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8526, {high, low}; +} +{ +fma.rn.f16x2 r8528, r8520, r8526, r8523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8534, {high, high}; +} +{ +mul.f16x2 r8536, r7577, r8534; +} +{ +neg.f16x2 r8539, r8536; +} +{ +fma.rn.f16x2 r8541, r7433, r8532, r8539; +} +{ +mul.f16x2 
r8545, r7433, r8534; +} +{ +fma.rn.f16x2 r8548, r7577, r8532, r8545; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8556, {low, high}; +} +{ +mul.f16x2 r8557, r8554, r8556; +} +{ +mul.f16x2 r8560, r8528, r8552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8563, {high, low}; +} +{ +fma.rn.f16x2 r8565, r8557, r8563, r8560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8571, {high, high}; +} +{ +mul.f16x2 r8573, r7897, r8571; +} +{ +neg.f16x2 r8576, r8573; +} +{ +fma.rn.f16x2 r8578, r7753, r8569, r8576; +} +{ +mul.f16x2 r8582, r7753, r8571; +} +{ +fma.rn.f16x2 r8585, r7897, r8569, r8582; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8593, {low, high}; +} +{ +mul.f16x2 r8594, r8591, r8593; +} +{ +mul.f16x2 r8597, r8565, r8589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8600, {high, low}; +} +{ +fma.rn.f16x2 r8602, r8594, r8600, r8597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8608, {high, high}; +} +{ +mul.f16x2 r8610, r6545, r8608; +} +{ +neg.f16x2 r8613, r8610; +} +{ +fma.rn.f16x2 r8615, r6401, r8606, r8613; +} +{ +mul.f16x2 r8619, r6401, r8608; +} +{ +fma.rn.f16x2 r8622, r6545, r8606, r8619; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8628, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8630, {low, high}; +} +{ +mul.f16x2 r8631, r8628, r8630; +} +{ +mul.f16x2 r8634, r8602, r8626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8637, {high, low}; +} +{ +fma.rn.f16x2 r8639, r8631, r8637, r8634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8645, {high, high}; +} +{ +mul.f16x2 r8647, r6865, r8645; +} +{ +neg.f16x2 r8650, r8647; +} +{ +fma.rn.f16x2 r8652, r6721, r8643, r8650; +} +{ +mul.f16x2 r8656, r6721, r8645; +} +{ +fma.rn.f16x2 r8659, r6865, r8643, r8656; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8667, {low, high}; +} +{ +mul.f16x2 r8668, r8665, r8667; +} +{ +mul.f16x2 r8671, r8639, r8663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8674, {high, low}; +} +{ +fma.rn.f16x2 r8676, r8668, r8674, r8671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8682, {high, high}; +} +{ +mul.f16x2 r8684, r7185, r8682; +} +{ +neg.f16x2 r8687, r8684; +} +{ +fma.rn.f16x2 r8689, r7041, r8680, r8687; +} +{ +mul.f16x2 r8693, r7041, r8682; +} +{ +fma.rn.f16x2 r8696, r7185, r8680, r8693; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8704, {low, high}; +} +{ +mul.f16x2 r8705, r8702, r8704; +} +{ +mul.f16x2 r8708, r8676, r8700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8711, {high, low}; +} 
+{ +fma.rn.f16x2 r8713, r8705, r8711, r8708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8719, {high, high}; +} +{ +mul.f16x2 r8721, r7505, r8719; +} +{ +neg.f16x2 r8724, r8721; +} +{ +fma.rn.f16x2 r8726, r7361, r8717, r8724; +} +{ +mul.f16x2 r8730, r7361, r8719; +} +{ +fma.rn.f16x2 r8733, r7505, r8717, r8730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8741, {low, high}; +} +{ +mul.f16x2 r8742, r8739, r8741; +} +{ +mul.f16x2 r8745, r8713, r8737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8748, {high, low}; +} +{ +fma.rn.f16x2 r8750, r8742, r8748, r8745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8750; +mov.b32 r8754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8750; +mov.b32 r8756, {high, high}; +} +{ +mul.f16x2 r8758, r7825, r8756; +} +{ +neg.f16x2 r8761, r8758; +} +{ +fma.rn.f16x2 r8763, r7681, r8754, r8761; +} +{ +mul.f16x2 r8767, r7681, r8756; +} +{ +fma.rn.f16x2 r8770, r7825, r8754, r8767; +} +shl.b32 r10404, r10403, 3; +add.s32 r10405, r10398, r10404; +barrier.sync 0; +mad.lo.s32 r10406, r10401, 5000, r10405; +st.shared.u32 [r10406], r6317; +st.shared.u32 [r10406+4], r6329; +st.shared.u32 [r10406+200], r7912; +st.shared.u32 [r10406+204], r7919; +st.shared.u32 [r10406+400], r7949; +st.shared.u32 [r10406+404], r7956; +st.shared.u32 [r10406+600], r7986; +st.shared.u32 [r10406+604], r7993; +st.shared.u32 [r10406+800], r8023; +st.shared.u32 [r10406+804], r8030; +st.shared.u32 [r10406+1000], r8060; +st.shared.u32 [r10406+1004], r8067; +st.shared.u32 [r10406+1200], r8097; +st.shared.u32 [r10406+1204], r8104; +st.shared.u32 [r10406+1400], r8134; +st.shared.u32 [r10406+1404], r8141; 
+st.shared.u32 [r10406+1600], r8171; +st.shared.u32 [r10406+1604], r8178; +st.shared.u32 [r10406+1800], r8208; +st.shared.u32 [r10406+1804], r8215; +st.shared.u32 [r10406+2000], r8245; +st.shared.u32 [r10406+2004], r8252; +st.shared.u32 [r10406+2200], r8282; +st.shared.u32 [r10406+2204], r8289; +st.shared.u32 [r10406+2400], r8319; +st.shared.u32 [r10406+2404], r8326; +st.shared.u32 [r10406+2600], r8356; +st.shared.u32 [r10406+2604], r8363; +st.shared.u32 [r10406+2800], r8393; +st.shared.u32 [r10406+2804], r8400; +st.shared.u32 [r10406+3000], r8430; +st.shared.u32 [r10406+3004], r8437; +st.shared.u32 [r10406+3200], r8467; +st.shared.u32 [r10406+3204], r8474; +st.shared.u32 [r10406+3400], r8504; +st.shared.u32 [r10406+3404], r8511; +st.shared.u32 [r10406+3600], r8541; +st.shared.u32 [r10406+3604], r8548; +st.shared.u32 [r10406+3800], r8578; +st.shared.u32 [r10406+3804], r8585; +st.shared.u32 [r10406+4000], r8615; +st.shared.u32 [r10406+4004], r8622; +st.shared.u32 [r10406+4200], r8652; +st.shared.u32 [r10406+4204], r8659; +st.shared.u32 [r10406+4400], r8689; +st.shared.u32 [r10406+4404], r8696; +st.shared.u32 [r10406+4600], r8726; +st.shared.u32 [r10406+4604], r8733; +st.shared.u32 [r10406+4800], r8763; +st.shared.u32 [r10406+4804], r8770; +barrier.sync 0; +ld.shared.u32 r8803, [r10400]; +ld.shared.u32 r8815, [r10400+4]; +ld.shared.u32 r9123, [r10400+1000]; +ld.shared.u32 r9135, [r10400+1004]; +ld.shared.u32 r9443, [r10400+2000]; +ld.shared.u32 r9455, [r10400+2004]; +ld.shared.u32 r9763, [r10400+3000]; +ld.shared.u32 r9775, [r10400+3004]; +ld.shared.u32 r10083, [r10400+4000]; +ld.shared.u32 r10095, [r10400+4004]; +ld.shared.u32 r8800, [r10400+5000]; +ld.shared.u32 r8812, [r10400+5004]; +ld.shared.u32 r9120, [r10400+6000]; +ld.shared.u32 r9132, [r10400+6004]; +ld.shared.u32 r9440, [r10400+7000]; +ld.shared.u32 r9452, [r10400+7004]; +ld.shared.u32 r9760, [r10400+8000]; +ld.shared.u32 r9772, [r10400+8004]; +ld.shared.u32 r10080, [r10400+9000]; +ld.shared.u32 r10092, 
[r10400+9004]; +ld.shared.u32 r8806, [r10400+10000]; +ld.shared.u32 r8818, [r10400+10004]; +ld.shared.u32 r9126, [r10400+11000]; +ld.shared.u32 r9138, [r10400+11004]; +ld.shared.u32 r9446, [r10400+12000]; +ld.shared.u32 r9458, [r10400+12004]; +ld.shared.u32 r9766, [r10400+13000]; +ld.shared.u32 r9778, [r10400+13004]; +ld.shared.u32 r10086, [r10400+14000]; +ld.shared.u32 r10098, [r10400+14004]; +ld.shared.u32 r8807, [r10400+15000]; +ld.shared.u32 r8819, [r10400+15004]; +ld.shared.u32 r9127, [r10400+16000]; +ld.shared.u32 r9139, [r10400+16004]; +ld.shared.u32 r9447, [r10400+17000]; +ld.shared.u32 r9459, [r10400+17004]; +ld.shared.u32 r9767, [r10400+18000]; +ld.shared.u32 r9779, [r10400+18004]; +ld.shared.u32 r10087, [r10400+19000]; +ld.shared.u32 r10099, [r10400+19004]; +ld.shared.u32 r8801, [r10400+20000]; +ld.shared.u32 r8813, [r10400+20004]; +ld.shared.u32 r9121, [r10400+21000]; +ld.shared.u32 r9133, [r10400+21004]; +ld.shared.u32 r9441, [r10400+22000]; +ld.shared.u32 r9453, [r10400+22004]; +ld.shared.u32 r9761, [r10400+23000]; +ld.shared.u32 r9773, [r10400+23004]; +ld.shared.u32 r10081, [r10400+24000]; +ld.shared.u32 r10093, [r10400+24004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r8793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r8794, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8795, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8796, {low, high}; +} +{ +neg.f16x2 r8797, r8796; +} +{ +add.f16x2 r8799, r8800, r8801; +} +{ +add.f16x2 r8802, r8803, r8799; +} +{ +add.f16x2 r8805, r8806, r8807; +} +{ +add.f16x2 %0, 
r8802, r8805; +} +{ +add.f16x2 r8811, r8812, r8813; +} +{ +add.f16x2 r8814, r8815, r8811; +} +{ +add.f16x2 r8817, r8818, r8819; +} +{ +add.f16x2 %1, r8814, r8817; +} +{ +add.f16x2 r8823, r8800, r8801; +} +{ +mul.f16x2 r8826, r8823, r8791; +} +{ +add.f16x2 r8829, r8803, r8826; +} +{ +add.f16x2 r8832, r8806, r8807; +} +{ +mul.f16x2 r8835, r8832, r8793; +} +{ +add.f16x2 r8838, r8829, r8835; +} +{ +sub.f16x2 r8841, r8812, r8813; +} +{ +mul.f16x2 r8844, r8841, r8792; +} +{ +sub.f16x2 r8847, r8818, r8819; +} +{ +mul.f16x2 r8850, r8847, r8794; +} +{ +add.f16x2 r8853, r8844, r8850; +} +{ +sub.f16x2 %10, r8838, r8853; +} +{ +add.f16x2 r8859, r8800, r8801; +} +{ +mul.f16x2 r8862, r8859, r8791; +} +{ +add.f16x2 r8865, r8803, r8862; +} +{ +add.f16x2 r8868, r8806, r8807; +} +{ +mul.f16x2 r8871, r8868, r8793; +} +{ +add.f16x2 r8874, r8865, r8871; +} +{ +sub.f16x2 r8877, r8812, r8813; +} +{ +mul.f16x2 r8880, r8877, r8792; +} +{ +sub.f16x2 r8883, r8818, r8819; +} +{ +mul.f16x2 r8886, r8883, r8794; +} +{ +add.f16x2 r8889, r8880, r8886; +} +{ +add.f16x2 %40, r8874, r8889; +} +{ +add.f16x2 r8895, r8800, r8801; +} +{ +mul.f16x2 r8898, r8895, r8793; +} +{ +add.f16x2 r8901, r8803, r8898; +} +{ +add.f16x2 r8904, r8806, r8807; +} +{ +mul.f16x2 r8907, r8904, r8795; +} +{ +add.f16x2 r8910, r8901, r8907; +} +{ +sub.f16x2 r8913, r8812, r8813; +} +{ +mul.f16x2 r8916, r8913, r8794; +} +{ +sub.f16x2 r8919, r8818, r8819; +} +{ +mul.f16x2 r8922, r8919, r8797; +} +{ +add.f16x2 r8925, r8916, r8922; +} +{ +sub.f16x2 %20, r8910, r8925; +} +{ +add.f16x2 r8931, r8800, r8801; +} +{ +mul.f16x2 r8934, r8931, r8793; +} +{ +add.f16x2 r8937, r8803, r8934; +} +{ +add.f16x2 r8940, r8806, r8807; +} +{ +mul.f16x2 r8943, r8940, r8795; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ +sub.f16x2 r8949, r8812, r8813; +} +{ +mul.f16x2 r8952, r8949, r8794; +} +{ +sub.f16x2 r8955, r8818, r8819; +} +{ +mul.f16x2 r8958, r8955, r8797; +} +{ +add.f16x2 r8961, r8952, r8958; +} +{ +add.f16x2 %30, r8946, r8961; +} +{ +add.f16x2 
r8967, r8812, r8813; +} +{ +mul.f16x2 r8970, r8967, r8791; +} +{ +add.f16x2 r8973, r8815, r8970; +} +{ +add.f16x2 r8976, r8818, r8819; +} +{ +mul.f16x2 r8979, r8976, r8793; +} +{ +add.f16x2 r8982, r8973, r8979; +} +{ +sub.f16x2 r8985, r8800, r8801; +} +{ +mul.f16x2 r8988, r8985, r8792; +} +{ +sub.f16x2 r8991, r8806, r8807; +} +{ +mul.f16x2 r8994, r8991, r8794; +} +{ +add.f16x2 r8997, r8988, r8994; +} +{ +add.f16x2 %11, r8982, r8997; +} +{ +add.f16x2 r9003, r8812, r8813; +} +{ +mul.f16x2 r9006, r9003, r8791; +} +{ +add.f16x2 r9009, r8815, r9006; +} +{ +add.f16x2 r9012, r8818, r8819; +} +{ +mul.f16x2 r9015, r9012, r8793; +} +{ +add.f16x2 r9018, r9009, r9015; +} +{ +sub.f16x2 r9021, r8800, r8801; +} +{ +mul.f16x2 r9024, r9021, r8792; +} +{ +sub.f16x2 r9027, r8806, r8807; +} +{ +mul.f16x2 r9030, r9027, r8794; +} +{ +add.f16x2 r9033, r9024, r9030; +} +{ +sub.f16x2 %41, r9018, r9033; +} +{ +add.f16x2 r9039, r8812, r8813; +} +{ +mul.f16x2 r9042, r9039, r8793; +} +{ +add.f16x2 r9045, r8815, r9042; +} +{ +add.f16x2 r9048, r8818, r8819; +} +{ +mul.f16x2 r9051, r9048, r8795; +} +{ +add.f16x2 r9054, r9045, r9051; +} +{ +sub.f16x2 r9057, r8800, r8801; +} +{ +mul.f16x2 r9060, r9057, r8794; +} +{ +sub.f16x2 r9063, r8806, r8807; +} +{ +mul.f16x2 r9066, r9063, r8797; +} +{ +add.f16x2 r9069, r9060, r9066; +} +{ +add.f16x2 %21, r9054, r9069; +} +{ +add.f16x2 r9075, r8812, r8813; +} +{ +mul.f16x2 r9078, r9075, r8793; +} +{ +add.f16x2 r9081, r8815, r9078; +} +{ +add.f16x2 r9084, r8818, r8819; +} +{ +mul.f16x2 r9087, r9084, r8795; +} +{ +add.f16x2 r9090, r9081, r9087; +} +{ +sub.f16x2 r9093, r8800, r8801; +} +{ +mul.f16x2 r9096, r9093, r8794; +} +{ +sub.f16x2 r9099, r8806, r8807; +} +{ +mul.f16x2 r9102, r9099, r8797; +} +{ +add.f16x2 r9105, r9096, r9102; +} +{ +sub.f16x2 %31, r9090, r9105; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; 
+mov.b32 r9112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9116, {low, high}; +} +{ +neg.f16x2 r9117, r9116; +} +{ +add.f16x2 r9119, r9120, r9121; +} +{ +add.f16x2 r9122, r9123, r9119; +} +{ +add.f16x2 r9125, r9126, r9127; +} +{ +add.f16x2 %2, r9122, r9125; +} +{ +add.f16x2 r9131, r9132, r9133; +} +{ +add.f16x2 r9134, r9135, r9131; +} +{ +add.f16x2 r9137, r9138, r9139; +} +{ +add.f16x2 %3, r9134, r9137; +} +{ +add.f16x2 r9143, r9120, r9121; +} +{ +mul.f16x2 r9146, r9143, r9111; +} +{ +add.f16x2 r9149, r9123, r9146; +} +{ +add.f16x2 r9152, r9126, r9127; +} +{ +mul.f16x2 r9155, r9152, r9113; +} +{ +add.f16x2 r9158, r9149, r9155; +} +{ +sub.f16x2 r9161, r9132, r9133; +} +{ +mul.f16x2 r9164, r9161, r9112; +} +{ +sub.f16x2 r9167, r9138, r9139; +} +{ +mul.f16x2 r9170, r9167, r9114; +} +{ +add.f16x2 r9173, r9164, r9170; +} +{ +sub.f16x2 %12, r9158, r9173; +} +{ +add.f16x2 r9179, r9120, r9121; +} +{ +mul.f16x2 r9182, r9179, r9111; +} +{ +add.f16x2 r9185, r9123, r9182; +} +{ +add.f16x2 r9188, r9126, r9127; +} +{ +mul.f16x2 r9191, r9188, r9113; +} +{ +add.f16x2 r9194, r9185, r9191; +} +{ +sub.f16x2 r9197, r9132, r9133; +} +{ +mul.f16x2 r9200, r9197, r9112; +} +{ +sub.f16x2 r9203, r9138, r9139; +} +{ +mul.f16x2 r9206, r9203, r9114; +} +{ +add.f16x2 r9209, r9200, r9206; +} +{ +add.f16x2 %42, r9194, r9209; +} +{ +add.f16x2 r9215, r9120, r9121; +} +{ +mul.f16x2 r9218, r9215, r9113; +} +{ +add.f16x2 r9221, r9123, r9218; +} +{ +add.f16x2 r9224, r9126, r9127; +} +{ +mul.f16x2 r9227, r9224, r9115; +} +{ +add.f16x2 r9230, r9221, r9227; +} +{ +sub.f16x2 r9233, r9132, r9133; +} +{ +mul.f16x2 
r9236, r9233, r9114; +} +{ +sub.f16x2 r9239, r9138, r9139; +} +{ +mul.f16x2 r9242, r9239, r9117; +} +{ +add.f16x2 r9245, r9236, r9242; +} +{ +sub.f16x2 %22, r9230, r9245; +} +{ +add.f16x2 r9251, r9120, r9121; +} +{ +mul.f16x2 r9254, r9251, r9113; +} +{ +add.f16x2 r9257, r9123, r9254; +} +{ +add.f16x2 r9260, r9126, r9127; +} +{ +mul.f16x2 r9263, r9260, r9115; +} +{ +add.f16x2 r9266, r9257, r9263; +} +{ +sub.f16x2 r9269, r9132, r9133; +} +{ +mul.f16x2 r9272, r9269, r9114; +} +{ +sub.f16x2 r9275, r9138, r9139; +} +{ +mul.f16x2 r9278, r9275, r9117; +} +{ +add.f16x2 r9281, r9272, r9278; +} +{ +add.f16x2 %32, r9266, r9281; +} +{ +add.f16x2 r9287, r9132, r9133; +} +{ +mul.f16x2 r9290, r9287, r9111; +} +{ +add.f16x2 r9293, r9135, r9290; +} +{ +add.f16x2 r9296, r9138, r9139; +} +{ +mul.f16x2 r9299, r9296, r9113; +} +{ +add.f16x2 r9302, r9293, r9299; +} +{ +sub.f16x2 r9305, r9120, r9121; +} +{ +mul.f16x2 r9308, r9305, r9112; +} +{ +sub.f16x2 r9311, r9126, r9127; +} +{ +mul.f16x2 r9314, r9311, r9114; +} +{ +add.f16x2 r9317, r9308, r9314; +} +{ +add.f16x2 %13, r9302, r9317; +} +{ +add.f16x2 r9323, r9132, r9133; +} +{ +mul.f16x2 r9326, r9323, r9111; +} +{ +add.f16x2 r9329, r9135, r9326; +} +{ +add.f16x2 r9332, r9138, r9139; +} +{ +mul.f16x2 r9335, r9332, r9113; +} +{ +add.f16x2 r9338, r9329, r9335; +} +{ +sub.f16x2 r9341, r9120, r9121; +} +{ +mul.f16x2 r9344, r9341, r9112; +} +{ +sub.f16x2 r9347, r9126, r9127; +} +{ +mul.f16x2 r9350, r9347, r9114; +} +{ +add.f16x2 r9353, r9344, r9350; +} +{ +sub.f16x2 %43, r9338, r9353; +} +{ +add.f16x2 r9359, r9132, r9133; +} +{ +mul.f16x2 r9362, r9359, r9113; +} +{ +add.f16x2 r9365, r9135, r9362; +} +{ +add.f16x2 r9368, r9138, r9139; +} +{ +mul.f16x2 r9371, r9368, r9115; +} +{ +add.f16x2 r9374, r9365, r9371; +} +{ +sub.f16x2 r9377, r9120, r9121; +} +{ +mul.f16x2 r9380, r9377, r9114; +} +{ +sub.f16x2 r9383, r9126, r9127; +} +{ +mul.f16x2 r9386, r9383, r9117; +} +{ +add.f16x2 r9389, r9380, r9386; +} +{ +add.f16x2 %23, r9374, r9389; +} +{ 
+add.f16x2 r9395, r9132, r9133; +} +{ +mul.f16x2 r9398, r9395, r9113; +} +{ +add.f16x2 r9401, r9135, r9398; +} +{ +add.f16x2 r9404, r9138, r9139; +} +{ +mul.f16x2 r9407, r9404, r9115; +} +{ +add.f16x2 r9410, r9401, r9407; +} +{ +sub.f16x2 r9413, r9120, r9121; +} +{ +mul.f16x2 r9416, r9413, r9114; +} +{ +sub.f16x2 r9419, r9126, r9127; +} +{ +mul.f16x2 r9422, r9419, r9117; +} +{ +add.f16x2 r9425, r9416, r9422; +} +{ +sub.f16x2 %33, r9410, r9425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9434, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9435, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9436, {low, high}; +} +{ +neg.f16x2 r9437, r9436; +} +{ +add.f16x2 r9439, r9440, r9441; +} +{ +add.f16x2 r9442, r9443, r9439; +} +{ +add.f16x2 r9445, r9446, r9447; +} +{ +add.f16x2 %4, r9442, r9445; +} +{ +add.f16x2 r9451, r9452, r9453; +} +{ +add.f16x2 r9454, r9455, r9451; +} +{ +add.f16x2 r9457, r9458, r9459; +} +{ +add.f16x2 %5, r9454, r9457; +} +{ +add.f16x2 r9463, r9440, r9441; +} +{ +mul.f16x2 r9466, r9463, r9431; +} +{ +add.f16x2 r9469, r9443, r9466; +} +{ +add.f16x2 r9472, r9446, r9447; +} +{ +mul.f16x2 r9475, r9472, r9433; +} +{ +add.f16x2 r9478, r9469, r9475; +} +{ +sub.f16x2 r9481, r9452, r9453; +} +{ +mul.f16x2 r9484, r9481, r9432; +} +{ +sub.f16x2 r9487, r9458, r9459; +} +{ +mul.f16x2 r9490, r9487, r9434; +} +{ +add.f16x2 r9493, r9484, r9490; +} +{ +sub.f16x2 %14, r9478, r9493; +} +{ +add.f16x2 r9499, r9440, r9441; +} +{ +mul.f16x2 r9502, r9499, r9431; +} +{ +add.f16x2 
r9505, r9443, r9502; +} +{ +add.f16x2 r9508, r9446, r9447; +} +{ +mul.f16x2 r9511, r9508, r9433; +} +{ +add.f16x2 r9514, r9505, r9511; +} +{ +sub.f16x2 r9517, r9452, r9453; +} +{ +mul.f16x2 r9520, r9517, r9432; +} +{ +sub.f16x2 r9523, r9458, r9459; +} +{ +mul.f16x2 r9526, r9523, r9434; +} +{ +add.f16x2 r9529, r9520, r9526; +} +{ +add.f16x2 %44, r9514, r9529; +} +{ +add.f16x2 r9535, r9440, r9441; +} +{ +mul.f16x2 r9538, r9535, r9433; +} +{ +add.f16x2 r9541, r9443, r9538; +} +{ +add.f16x2 r9544, r9446, r9447; +} +{ +mul.f16x2 r9547, r9544, r9435; +} +{ +add.f16x2 r9550, r9541, r9547; +} +{ +sub.f16x2 r9553, r9452, r9453; +} +{ +mul.f16x2 r9556, r9553, r9434; +} +{ +sub.f16x2 r9559, r9458, r9459; +} +{ +mul.f16x2 r9562, r9559, r9437; +} +{ +add.f16x2 r9565, r9556, r9562; +} +{ +sub.f16x2 %24, r9550, r9565; +} +{ +add.f16x2 r9571, r9440, r9441; +} +{ +mul.f16x2 r9574, r9571, r9433; +} +{ +add.f16x2 r9577, r9443, r9574; +} +{ +add.f16x2 r9580, r9446, r9447; +} +{ +mul.f16x2 r9583, r9580, r9435; +} +{ +add.f16x2 r9586, r9577, r9583; +} +{ +sub.f16x2 r9589, r9452, r9453; +} +{ +mul.f16x2 r9592, r9589, r9434; +} +{ +sub.f16x2 r9595, r9458, r9459; +} +{ +mul.f16x2 r9598, r9595, r9437; +} +{ +add.f16x2 r9601, r9592, r9598; +} +{ +add.f16x2 %34, r9586, r9601; +} +{ +add.f16x2 r9607, r9452, r9453; +} +{ +mul.f16x2 r9610, r9607, r9431; +} +{ +add.f16x2 r9613, r9455, r9610; +} +{ +add.f16x2 r9616, r9458, r9459; +} +{ +mul.f16x2 r9619, r9616, r9433; +} +{ +add.f16x2 r9622, r9613, r9619; +} +{ +sub.f16x2 r9625, r9440, r9441; +} +{ +mul.f16x2 r9628, r9625, r9432; +} +{ +sub.f16x2 r9631, r9446, r9447; +} +{ +mul.f16x2 r9634, r9631, r9434; +} +{ +add.f16x2 r9637, r9628, r9634; +} +{ +add.f16x2 %15, r9622, r9637; +} +{ +add.f16x2 r9643, r9452, r9453; +} +{ +mul.f16x2 r9646, r9643, r9431; +} +{ +add.f16x2 r9649, r9455, r9646; +} +{ +add.f16x2 r9652, r9458, r9459; +} +{ +mul.f16x2 r9655, r9652, r9433; +} +{ +add.f16x2 r9658, r9649, r9655; +} +{ +sub.f16x2 r9661, r9440, r9441; +} +{ 
+mul.f16x2 r9664, r9661, r9432; +} +{ +sub.f16x2 r9667, r9446, r9447; +} +{ +mul.f16x2 r9670, r9667, r9434; +} +{ +add.f16x2 r9673, r9664, r9670; +} +{ +sub.f16x2 %45, r9658, r9673; +} +{ +add.f16x2 r9679, r9452, r9453; +} +{ +mul.f16x2 r9682, r9679, r9433; +} +{ +add.f16x2 r9685, r9455, r9682; +} +{ +add.f16x2 r9688, r9458, r9459; +} +{ +mul.f16x2 r9691, r9688, r9435; +} +{ +add.f16x2 r9694, r9685, r9691; +} +{ +sub.f16x2 r9697, r9440, r9441; +} +{ +mul.f16x2 r9700, r9697, r9434; +} +{ +sub.f16x2 r9703, r9446, r9447; +} +{ +mul.f16x2 r9706, r9703, r9437; +} +{ +add.f16x2 r9709, r9700, r9706; +} +{ +add.f16x2 %25, r9694, r9709; +} +{ +add.f16x2 r9715, r9452, r9453; +} +{ +mul.f16x2 r9718, r9715, r9433; +} +{ +add.f16x2 r9721, r9455, r9718; +} +{ +add.f16x2 r9724, r9458, r9459; +} +{ +mul.f16x2 r9727, r9724, r9435; +} +{ +add.f16x2 r9730, r9721, r9727; +} +{ +sub.f16x2 r9733, r9440, r9441; +} +{ +mul.f16x2 r9736, r9733, r9434; +} +{ +sub.f16x2 r9739, r9446, r9447; +} +{ +mul.f16x2 r9742, r9739, r9437; +} +{ +add.f16x2 r9745, r9736, r9742; +} +{ +sub.f16x2 %35, r9730, r9745; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9754, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9755, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9756, {low, high}; +} +{ +neg.f16x2 r9757, r9756; +} +{ +add.f16x2 r9759, r9760, r9761; +} +{ +add.f16x2 r9762, r9763, r9759; +} +{ +add.f16x2 r9765, r9766, r9767; +} +{ +add.f16x2 %6, r9762, r9765; +} +{ +add.f16x2 r9771, r9772, r9773; +} +{ +add.f16x2 
r9774, r9775, r9771; +} +{ +add.f16x2 r9777, r9778, r9779; +} +{ +add.f16x2 %7, r9774, r9777; +} +{ +add.f16x2 r9783, r9760, r9761; +} +{ +mul.f16x2 r9786, r9783, r9751; +} +{ +add.f16x2 r9789, r9763, r9786; +} +{ +add.f16x2 r9792, r9766, r9767; +} +{ +mul.f16x2 r9795, r9792, r9753; +} +{ +add.f16x2 r9798, r9789, r9795; +} +{ +sub.f16x2 r9801, r9772, r9773; +} +{ +mul.f16x2 r9804, r9801, r9752; +} +{ +sub.f16x2 r9807, r9778, r9779; +} +{ +mul.f16x2 r9810, r9807, r9754; +} +{ +add.f16x2 r9813, r9804, r9810; +} +{ +sub.f16x2 %16, r9798, r9813; +} +{ +add.f16x2 r9819, r9760, r9761; +} +{ +mul.f16x2 r9822, r9819, r9751; +} +{ +add.f16x2 r9825, r9763, r9822; +} +{ +add.f16x2 r9828, r9766, r9767; +} +{ +mul.f16x2 r9831, r9828, r9753; +} +{ +add.f16x2 r9834, r9825, r9831; +} +{ +sub.f16x2 r9837, r9772, r9773; +} +{ +mul.f16x2 r9840, r9837, r9752; +} +{ +sub.f16x2 r9843, r9778, r9779; +} +{ +mul.f16x2 r9846, r9843, r9754; +} +{ +add.f16x2 r9849, r9840, r9846; +} +{ +add.f16x2 %46, r9834, r9849; +} +{ +add.f16x2 r9855, r9760, r9761; +} +{ +mul.f16x2 r9858, r9855, r9753; +} +{ +add.f16x2 r9861, r9763, r9858; +} +{ +add.f16x2 r9864, r9766, r9767; +} +{ +mul.f16x2 r9867, r9864, r9755; +} +{ +add.f16x2 r9870, r9861, r9867; +} +{ +sub.f16x2 r9873, r9772, r9773; +} +{ +mul.f16x2 r9876, r9873, r9754; +} +{ +sub.f16x2 r9879, r9778, r9779; +} +{ +mul.f16x2 r9882, r9879, r9757; +} +{ +add.f16x2 r9885, r9876, r9882; +} +{ +sub.f16x2 %26, r9870, r9885; +} +{ +add.f16x2 r9891, r9760, r9761; +} +{ +mul.f16x2 r9894, r9891, r9753; +} +{ +add.f16x2 r9897, r9763, r9894; +} +{ +add.f16x2 r9900, r9766, r9767; +} +{ +mul.f16x2 r9903, r9900, r9755; +} +{ +add.f16x2 r9906, r9897, r9903; +} +{ +sub.f16x2 r9909, r9772, r9773; +} +{ +mul.f16x2 r9912, r9909, r9754; +} +{ +sub.f16x2 r9915, r9778, r9779; +} +{ +mul.f16x2 r9918, r9915, r9757; +} +{ +add.f16x2 r9921, r9912, r9918; +} +{ +add.f16x2 %36, r9906, r9921; +} +{ +add.f16x2 r9927, r9772, r9773; +} +{ +mul.f16x2 r9930, r9927, r9751; +} +{ 
+add.f16x2 r9933, r9775, r9930; +} +{ +add.f16x2 r9936, r9778, r9779; +} +{ +mul.f16x2 r9939, r9936, r9753; +} +{ +add.f16x2 r9942, r9933, r9939; +} +{ +sub.f16x2 r9945, r9760, r9761; +} +{ +mul.f16x2 r9948, r9945, r9752; +} +{ +sub.f16x2 r9951, r9766, r9767; +} +{ +mul.f16x2 r9954, r9951, r9754; +} +{ +add.f16x2 r9957, r9948, r9954; +} +{ +add.f16x2 %17, r9942, r9957; +} +{ +add.f16x2 r9963, r9772, r9773; +} +{ +mul.f16x2 r9966, r9963, r9751; +} +{ +add.f16x2 r9969, r9775, r9966; +} +{ +add.f16x2 r9972, r9778, r9779; +} +{ +mul.f16x2 r9975, r9972, r9753; +} +{ +add.f16x2 r9978, r9969, r9975; +} +{ +sub.f16x2 r9981, r9760, r9761; +} +{ +mul.f16x2 r9984, r9981, r9752; +} +{ +sub.f16x2 r9987, r9766, r9767; +} +{ +mul.f16x2 r9990, r9987, r9754; +} +{ +add.f16x2 r9993, r9984, r9990; +} +{ +sub.f16x2 %47, r9978, r9993; +} +{ +add.f16x2 r9999, r9772, r9773; +} +{ +mul.f16x2 r10002, r9999, r9753; +} +{ +add.f16x2 r10005, r9775, r10002; +} +{ +add.f16x2 r10008, r9778, r9779; +} +{ +mul.f16x2 r10011, r10008, r9755; +} +{ +add.f16x2 r10014, r10005, r10011; +} +{ +sub.f16x2 r10017, r9760, r9761; +} +{ +mul.f16x2 r10020, r10017, r9754; +} +{ +sub.f16x2 r10023, r9766, r9767; +} +{ +mul.f16x2 r10026, r10023, r9757; +} +{ +add.f16x2 r10029, r10020, r10026; +} +{ +add.f16x2 %27, r10014, r10029; +} +{ +add.f16x2 r10035, r9772, r9773; +} +{ +mul.f16x2 r10038, r10035, r9753; +} +{ +add.f16x2 r10041, r9775, r10038; +} +{ +add.f16x2 r10044, r9778, r9779; +} +{ +mul.f16x2 r10047, r10044, r9755; +} +{ +add.f16x2 r10050, r10041, r10047; +} +{ +sub.f16x2 r10053, r9760, r9761; +} +{ +mul.f16x2 r10056, r10053, r9754; +} +{ +sub.f16x2 r10059, r9766, r9767; +} +{ +mul.f16x2 r10062, r10059, r9757; +} +{ +add.f16x2 r10065, r10056, r10062; +} +{ +sub.f16x2 %37, r10050, r10065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10072, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r10073, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r10074, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10075, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10076, {low, high}; +} +{ +neg.f16x2 r10077, r10076; +} +{ +add.f16x2 r10079, r10080, r10081; +} +{ +add.f16x2 r10082, r10083, r10079; +} +{ +add.f16x2 r10085, r10086, r10087; +} +{ +add.f16x2 %8, r10082, r10085; +} +{ +add.f16x2 r10091, r10092, r10093; +} +{ +add.f16x2 r10094, r10095, r10091; +} +{ +add.f16x2 r10097, r10098, r10099; +} +{ +add.f16x2 %9, r10094, r10097; +} +{ +add.f16x2 r10103, r10080, r10081; +} +{ +mul.f16x2 r10106, r10103, r10071; +} +{ +add.f16x2 r10109, r10083, r10106; +} +{ +add.f16x2 r10112, r10086, r10087; +} +{ +mul.f16x2 r10115, r10112, r10073; +} +{ +add.f16x2 r10118, r10109, r10115; +} +{ +sub.f16x2 r10121, r10092, r10093; +} +{ +mul.f16x2 r10124, r10121, r10072; +} +{ +sub.f16x2 r10127, r10098, r10099; +} +{ +mul.f16x2 r10130, r10127, r10074; +} +{ +add.f16x2 r10133, r10124, r10130; +} +{ +sub.f16x2 %18, r10118, r10133; +} +{ +add.f16x2 r10139, r10080, r10081; +} +{ +mul.f16x2 r10142, r10139, r10071; +} +{ +add.f16x2 r10145, r10083, r10142; +} +{ +add.f16x2 r10148, r10086, r10087; +} +{ +mul.f16x2 r10151, r10148, r10073; +} +{ +add.f16x2 r10154, r10145, r10151; +} +{ +sub.f16x2 r10157, r10092, r10093; +} +{ +mul.f16x2 r10160, r10157, r10072; +} +{ +sub.f16x2 r10163, r10098, r10099; +} +{ +mul.f16x2 r10166, r10163, r10074; +} +{ +add.f16x2 r10169, r10160, r10166; +} +{ +add.f16x2 %48, r10154, r10169; +} +{ +add.f16x2 r10175, r10080, r10081; +} +{ +mul.f16x2 r10178, r10175, r10073; +} +{ +add.f16x2 r10181, r10083, r10178; +} +{ +add.f16x2 r10184, r10086, r10087; +} +{ +mul.f16x2 r10187, r10184, r10075; +} 
+{ +add.f16x2 r10190, r10181, r10187; +} +{ +sub.f16x2 r10193, r10092, r10093; +} +{ +mul.f16x2 r10196, r10193, r10074; +} +{ +sub.f16x2 r10199, r10098, r10099; +} +{ +mul.f16x2 r10202, r10199, r10077; +} +{ +add.f16x2 r10205, r10196, r10202; +} +{ +sub.f16x2 %28, r10190, r10205; +} +{ +add.f16x2 r10211, r10080, r10081; +} +{ +mul.f16x2 r10214, r10211, r10073; +} +{ +add.f16x2 r10217, r10083, r10214; +} +{ +add.f16x2 r10220, r10086, r10087; +} +{ +mul.f16x2 r10223, r10220, r10075; +} +{ +add.f16x2 r10226, r10217, r10223; +} +{ +sub.f16x2 r10229, r10092, r10093; +} +{ +mul.f16x2 r10232, r10229, r10074; +} +{ +sub.f16x2 r10235, r10098, r10099; +} +{ +mul.f16x2 r10238, r10235, r10077; +} +{ +add.f16x2 r10241, r10232, r10238; +} +{ +add.f16x2 %38, r10226, r10241; +} +{ +add.f16x2 r10247, r10092, r10093; +} +{ +mul.f16x2 r10250, r10247, r10071; +} +{ +add.f16x2 r10253, r10095, r10250; +} +{ +add.f16x2 r10256, r10098, r10099; +} +{ +mul.f16x2 r10259, r10256, r10073; +} +{ +add.f16x2 r10262, r10253, r10259; +} +{ +sub.f16x2 r10265, r10080, r10081; +} +{ +mul.f16x2 r10268, r10265, r10072; +} +{ +sub.f16x2 r10271, r10086, r10087; +} +{ +mul.f16x2 r10274, r10271, r10074; +} +{ +add.f16x2 r10277, r10268, r10274; +} +{ +add.f16x2 %19, r10262, r10277; +} +{ +add.f16x2 r10283, r10092, r10093; +} +{ +mul.f16x2 r10286, r10283, r10071; +} +{ +add.f16x2 r10289, r10095, r10286; +} +{ +add.f16x2 r10292, r10098, r10099; +} +{ +mul.f16x2 r10295, r10292, r10073; +} +{ +add.f16x2 r10298, r10289, r10295; +} +{ +sub.f16x2 r10301, r10080, r10081; +} +{ +mul.f16x2 r10304, r10301, r10072; +} +{ +sub.f16x2 r10307, r10086, r10087; +} +{ +mul.f16x2 r10310, r10307, r10074; +} +{ +add.f16x2 r10313, r10304, r10310; +} +{ +sub.f16x2 %49, r10298, r10313; +} +{ +add.f16x2 r10319, r10092, r10093; +} +{ +mul.f16x2 r10322, r10319, r10073; +} +{ +add.f16x2 r10325, r10095, r10322; +} +{ +add.f16x2 r10328, r10098, r10099; +} +{ +mul.f16x2 r10331, r10328, r10075; +} +{ +add.f16x2 r10334, r10325, r10331; +} +{ 
+sub.f16x2 r10337, r10080, r10081; +} +{ +mul.f16x2 r10340, r10337, r10074; +} +{ +sub.f16x2 r10343, r10086, r10087; +} +{ +mul.f16x2 r10346, r10343, r10077; +} +{ +add.f16x2 r10349, r10340, r10346; +} +{ +add.f16x2 %29, r10334, r10349; +} +{ +add.f16x2 r10355, r10092, r10093; +} +{ +mul.f16x2 r10358, r10355, r10073; +} +{ +add.f16x2 r10361, r10095, r10358; +} +{ +add.f16x2 r10364, r10098, r10099; +} +{ +mul.f16x2 r10367, r10364, r10075; +} +{ +add.f16x2 r10370, r10361, r10367; +} +{ +sub.f16x2 r10373, r10080, r10081; +} +{ +mul.f16x2 r10376, r10373, r10074; +} +{ +sub.f16x2 r10379, r10086, r10087; +} +{ +mul.f16x2 r10382, r10379, r10077; +} +{ +add.f16x2 r10385, r10376, r10382; +} +{ +sub.f16x2 %39, r10370, r10385; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), 
"=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<912, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<603>; +.reg .b32 r<10458>; +.reg .b64 rd<6>; +mov.u32 r10392, %50; +mov.u32 r10457, %tid.y; +mad.lo.s32 r10393, r10457, 12500, r10392; +mov.u32 r10394, %tid.x; +mov.f32 f594, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1, {low, high}; +} +mov.f32 f596, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2, {low, high}; +} +mov.f32 f590, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3, {low, high}; +} +mov.f32 f592, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %59, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %59, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %59, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, 
r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %59, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, %58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %59, %94; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %59, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %59, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 
r270, r267, r4; +} +{ +sub.f16x2 r273, %59, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %59, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; 
+} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ 
+add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ 
+sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; +} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ +add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 
r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} +{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ +add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} +{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ +sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 
r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ +add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} +{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ +add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} +{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, 
r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ +add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ +mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; +} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ +add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ +sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %60, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %60, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} +{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %60, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %60, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %60, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 
r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, %93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, %86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, %77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ 
+sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1601, {low, high}; +} +mov.f32 f332, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1602, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1603, {low, high}; +} +mov.f32 f336, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1604, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1605, {low, high}; +} +mov.f32 f340, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1606, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1607, {low, high}; +} +mov.f32 f344, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1608, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1611, {low, high}; +} +mov.f32 f352, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1612, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1615, {low, high}; +} +mov.f32 f360, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1616, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1617, {low, high}; +} +mov.f32 f364, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1618, {low, high}; +} +mov.f32 f374, 
0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1623, {low, high}; +} +mov.f32 f376, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1631, {low, high}; +} +mov.f32 f392, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; +} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, 
r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ +mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ +fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 
r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1910, {low, high}; +} +{ +neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ +sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ 
+sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ +mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} +{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; +} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; +} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, 
r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ +add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; 
+} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} +{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, 
r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ +mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, 
r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ +sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ +sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ +add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 
r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ +sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; 
+mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; +} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} +{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; +} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ +add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ 
+mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ +sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, 
r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3185, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; +} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; 
+} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; +} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, 
r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ +add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ +sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r10394, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r10395, rd3; +mul.lo.s32 r10396, r10395, 125; +sub.s32 r10397, r10394, r10396; +mad.lo.s32 r10398, r10395, 12500, r10393; +cvt.rn.f32.u32 f597, r10397; +mul.f32 f598, f597, 0f3B03C498; +cos.approx.f32 f217, f598; +sin.approx.f32 f599, f598; +neg.f32 f218, f599; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f533, 0fBF800000; +mov.f32 f534, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; 
+cvt.rn.f16.f32 high, f534; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, 
r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; +} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ 
+fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3717, {low, high}; +} +{ +mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ +fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3902, {low, high}; +} +{ +mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 
r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4050, {low, high}; +} +{ +mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ +fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4124, {low, high}; +} +{ 
+mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ +neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; 
+mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ +fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, 
r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ +neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +barrier.sync 0; +mad.lo.s32 r10399, r10397, 100, r10398; +st.shared.u32 [r10399], r1922; +st.shared.u32 [r10399+4], r3517; +st.shared.u32 [r10399+8], r3554; +st.shared.u32 [r10399+12], r3591; 
+st.shared.u32 [r10399+16], r3628; +st.shared.u32 [r10399+20], r3665; +st.shared.u32 [r10399+24], r3702; +st.shared.u32 [r10399+28], r3739; +st.shared.u32 [r10399+32], r3776; +st.shared.u32 [r10399+36], r3813; +st.shared.u32 [r10399+40], r3850; +st.shared.u32 [r10399+44], r3887; +st.shared.u32 [r10399+48], r3924; +st.shared.u32 [r10399+52], r3961; +st.shared.u32 [r10399+56], r3998; +st.shared.u32 [r10399+60], r4035; +st.shared.u32 [r10399+64], r4072; +st.shared.u32 [r10399+68], r4109; +st.shared.u32 [r10399+72], r4146; +st.shared.u32 [r10399+76], r4183; +st.shared.u32 [r10399+80], r4220; +st.shared.u32 [r10399+84], r4257; +st.shared.u32 [r10399+88], r4294; +st.shared.u32 [r10399+92], r4331; +st.shared.u32 [r10399+96], r4368; +barrier.sync 0; +mad.lo.s32 r10400, r10397, -96, r10399; +ld.shared.u32 r4408, [r10400]; +ld.shared.u32 r4728, [r10400+500]; +ld.shared.u32 r5048, [r10400+1000]; +ld.shared.u32 r5368, [r10400+1500]; +ld.shared.u32 r5688, [r10400+2000]; +ld.shared.u32 r4405, [r10400+2500]; +ld.shared.u32 r4725, [r10400+3000]; +ld.shared.u32 r5045, [r10400+3500]; +ld.shared.u32 r5365, [r10400+4000]; +ld.shared.u32 r5685, [r10400+4500]; +ld.shared.u32 r4411, [r10400+5000]; +ld.shared.u32 r4731, [r10400+5500]; +ld.shared.u32 r5051, [r10400+6000]; +ld.shared.u32 r5371, [r10400+6500]; +ld.shared.u32 r5691, [r10400+7000]; +ld.shared.u32 r4412, [r10400+7500]; +ld.shared.u32 r4732, [r10400+8000]; +ld.shared.u32 r5052, [r10400+8500]; +ld.shared.u32 r5372, [r10400+9000]; +ld.shared.u32 r5692, [r10400+9500]; +ld.shared.u32 r4406, [r10400+10000]; +ld.shared.u32 r4726, [r10400+10500]; +ld.shared.u32 r5046, [r10400+11000]; +ld.shared.u32 r5366, [r10400+11500]; +ld.shared.u32 r5686, [r10400+12000]; +barrier.sync 0; +st.shared.u32 [r10399], r1934; +st.shared.u32 [r10399+4], r3524; +st.shared.u32 [r10399+8], r3561; +st.shared.u32 [r10399+12], r3598; +st.shared.u32 [r10399+16], r3635; +st.shared.u32 [r10399+20], r3672; +st.shared.u32 [r10399+24], r3709; +st.shared.u32 
[r10399+28], r3746; +st.shared.u32 [r10399+32], r3783; +st.shared.u32 [r10399+36], r3820; +st.shared.u32 [r10399+40], r3857; +st.shared.u32 [r10399+44], r3894; +st.shared.u32 [r10399+48], r3931; +st.shared.u32 [r10399+52], r3968; +st.shared.u32 [r10399+56], r4005; +st.shared.u32 [r10399+60], r4042; +st.shared.u32 [r10399+64], r4079; +st.shared.u32 [r10399+68], r4116; +st.shared.u32 [r10399+72], r4153; +st.shared.u32 [r10399+76], r4190; +st.shared.u32 [r10399+80], r4227; +st.shared.u32 [r10399+84], r4264; +st.shared.u32 [r10399+88], r4301; +st.shared.u32 [r10399+92], r4338; +st.shared.u32 [r10399+96], r4375; +barrier.sync 0; +ld.shared.u32 r4420, [r10400]; +ld.shared.u32 r4740, [r10400+500]; +ld.shared.u32 r5060, [r10400+1000]; +ld.shared.u32 r5380, [r10400+1500]; +ld.shared.u32 r5700, [r10400+2000]; +ld.shared.u32 r4417, [r10400+2500]; +ld.shared.u32 r4737, [r10400+3000]; +ld.shared.u32 r5057, [r10400+3500]; +ld.shared.u32 r5377, [r10400+4000]; +ld.shared.u32 r5697, [r10400+4500]; +ld.shared.u32 r4423, [r10400+5000]; +ld.shared.u32 r4743, [r10400+5500]; +ld.shared.u32 r5063, [r10400+6000]; +ld.shared.u32 r5383, [r10400+6500]; +ld.shared.u32 r5703, [r10400+7000]; +ld.shared.u32 r4424, [r10400+7500]; +ld.shared.u32 r4744, [r10400+8000]; +ld.shared.u32 r5064, [r10400+8500]; +ld.shared.u32 r5384, [r10400+9000]; +ld.shared.u32 r5704, [r10400+9500]; +ld.shared.u32 r4418, [r10400+10000]; +ld.shared.u32 r4738, [r10400+10500]; +ld.shared.u32 r5058, [r10400+11000]; +ld.shared.u32 r5378, [r10400+11500]; +ld.shared.u32 r5698, [r10400+12000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4399, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 r4413, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 r4425, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ +add.f16x2 r4458, r4449, r4455; +} +{ +sub.f16x2 r4461, r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ +mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ +add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 r4497, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 r4533, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, 
r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 r4569, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 r4605, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, r4420, r4611; +} +{ +add.f16x2 r4617, r4423, r4424; +} +{ +mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; +} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 r4641, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 r4677, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ 
+sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 r4713, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, r4725, r4726; +} +{ +add.f16x2 r4727, r4728, r4724; +} +{ +add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 r4733, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ +add.f16x2 r4745, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 r4781, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ 
+sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 r4817, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 r4853, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ +mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 r4886, r4877, r4883; +} +{ +add.f16x2 r4889, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} +{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ +mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 r4925, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 r4961, r4943, r4958; +} +{ +add.f16x2 r4964, 
r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 r4997, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 r5033, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5038, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 r5053, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 r5065, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, 
r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 r5101, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 r5137, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ +add.f16x2 r5155, r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ +sub.f16x2 r5173, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 r5209, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ 
+mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 r5245, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 r5281, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} +{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, r5308, r5042; +} +{ +add.f16x2 r5314, r5305, r5311; +} +{ +add.f16x2 r5317, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; +} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ +sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 r5353, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5359, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 r5373, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 r5385, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 r5421, r5403, r5418; +} +{ +add.f16x2 r5424, r5365, r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 r5457, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 r5493, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, 
r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 r5529, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 r5565, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ +mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; +} +{ +sub.f16x2 r5601, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 r5637, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 
r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 r5673, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ +add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 r5693, r5687, r5690; +} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 r5705, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 r5741, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 
r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 r5777, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 r5813, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ +mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, r5837, r5843; +} +{ +add.f16x2 r5849, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ +sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ +mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 r5885, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 r5921, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} 
+{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 r5957, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 r5993, r5975, r5990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r5996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r5997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r5998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r5999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6000, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6002, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6003, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6006, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6007, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6027, {low, high}; +} +{ +mul.f16x2 r6044, r4781, r5996; +} +{ +mul.f16x2 r6047, r4925, r5997; +} +{ +sub.f16x2 r6050, r6044, r6047; +} +{ +mul.f16x2 r6053, r4781, r5997; +} +{ +fma.rn.f16x2 r6056, r4925, r5996, r6053; +} +{ +mul.f16x2 r6060, r5101, r5998; +} +{ +mul.f16x2 r6063, r5245, r5999; +} +{ +sub.f16x2 r6066, r6060, r6063; +} +{ +mul.f16x2 r6069, r5101, r5999; +} +{ +fma.rn.f16x2 r6072, r5245, r5998, r6069; +} +{ +mul.f16x2 r6076, r5421, r6000; +} +{ +mul.f16x2 r6079, r5565, r6001; +} +{ +sub.f16x2 r6082, r6076, r6079; +} +{ +mul.f16x2 r6085, r5421, r6001; +} +{ +fma.rn.f16x2 r6088, r5565, r6000, r6085; +} +{ +mul.f16x2 r6092, r5741, r6002; +} +{ +mul.f16x2 r6095, r5885, r6003; +} +{ +sub.f16x2 r6098, r6092, r6095; +} +{ +mul.f16x2 r6101, r5741, r6003; +} +{ +fma.rn.f16x2 r6104, r5885, r6002, r6101; +} +{ +mul.f16x2 r6108, r4853, r5998; +} +{ +mul.f16x2 r6111, r4997, r5999; +} +{ +sub.f16x2 r6114, r6108, r6111; +} +{ +mul.f16x2 r6117, r4853, r5999; +} +{ +fma.rn.f16x2 r6120, r4997, r5998, r6117; +} +{ +mul.f16x2 r6124, r5173, r6002; +} +{ +mul.f16x2 r6127, r5317, r6003; +} +{ +sub.f16x2 r6130, r6124, 
r6127; +} +{ +mul.f16x2 r6133, r5173, r6003; +} +{ +fma.rn.f16x2 r6136, r5317, r6002, r6133; +} +{ +mul.f16x2 r6140, r5493, r6006; +} +{ +mul.f16x2 r6143, r5637, r6007; +} +{ +sub.f16x2 r6146, r6140, r6143; +} +{ +mul.f16x2 r6149, r5493, r6007; +} +{ +fma.rn.f16x2 r6152, r5637, r6006, r6149; +} +{ +mul.f16x2 r6156, r5813, r6010; +} +{ +mul.f16x2 r6159, r5957, r6011; +} +{ +sub.f16x2 r6162, r6156, r6159; +} +{ +mul.f16x2 r6165, r5813, r6011; +} +{ +fma.rn.f16x2 r6168, r5957, r6010, r6165; +} +{ +mul.f16x2 r6172, r4889, r6000; +} +{ +mul.f16x2 r6175, r5033, r6001; +} +{ +sub.f16x2 r6178, r6172, r6175; +} +{ +mul.f16x2 r6181, r4889, r6001; +} +{ +fma.rn.f16x2 r6184, r5033, r6000, r6181; +} +{ +mul.f16x2 r6188, r5209, r6006; +} +{ +mul.f16x2 r6191, r5353, r6007; +} +{ +sub.f16x2 r6194, r6188, r6191; +} +{ +mul.f16x2 r6197, r5209, r6007; +} +{ +fma.rn.f16x2 r6200, r5353, r6006, r6197; +} +{ +mul.f16x2 r6204, r5529, r6012; +} +{ +mul.f16x2 r6207, r5673, r6013; +} +{ +sub.f16x2 r6210, r6204, r6207; +} +{ +mul.f16x2 r6213, r5529, r6013; +} +{ +fma.rn.f16x2 r6216, r5673, r6012, r6213; +} +{ +mul.f16x2 r6220, r5849, r6018; +} +{ +mul.f16x2 r6223, r5993, r6019; +} +{ +sub.f16x2 r6226, r6220, r6223; +} +{ +mul.f16x2 r6229, r5849, r6019; +} +{ +fma.rn.f16x2 r6232, r5993, r6018, r6229; +} +{ +mul.f16x2 r6236, r4817, r6002; +} +{ +mul.f16x2 r6239, r4961, r6003; +} +{ +sub.f16x2 r6242, r6236, r6239; +} +{ +mul.f16x2 r6245, r4817, r6003; +} +{ +fma.rn.f16x2 r6248, r4961, r6002, r6245; +} +{ +mul.f16x2 r6252, r5137, r6010; +} +{ +mul.f16x2 r6255, r5281, r6011; +} +{ +sub.f16x2 r6258, r6252, r6255; +} +{ +mul.f16x2 r6261, r5137, r6011; +} +{ +fma.rn.f16x2 r6264, r5281, r6010, r6261; +} +{ +mul.f16x2 r6268, r5457, r6018; +} +{ +mul.f16x2 r6271, r5601, r6019; +} +{ +sub.f16x2 r6274, r6268, r6271; +} +{ +mul.f16x2 r6277, r5457, r6019; +} +{ +fma.rn.f16x2 r6280, r5601, r6018, r6277; +} +{ +mul.f16x2 r6284, r5777, r6026; +} +{ +mul.f16x2 r6287, r5921, r6027; +} +{ +sub.f16x2 r6290, r6284, 
r6287; +} +{ +mul.f16x2 r6293, r5777, r6027; +} +{ +fma.rn.f16x2 r6296, r5921, r6026, r6293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6302, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6305, {low, high}; +} +{ +neg.f16x2 r6306, r6305; +} +{ +add.f16x2 r6308, r4733, r5693; +} +{ +add.f16x2 r6311, r4413, r6308; +} +{ +add.f16x2 r6314, r5053, r5373; +} +{ +add.f16x2 r6317, r6311, r6314; +} +{ +add.f16x2 r6320, r4745, r5705; +} +{ +add.f16x2 r6323, r4425, r6320; +} +{ +add.f16x2 r6326, r5065, r5385; +} +{ +add.f16x2 r6329, r6323, r6326; +} +{ +add.f16x2 r6332, r4733, r5693; +} +{ +mul.f16x2 r6335, r6332, r6300; +} +{ +add.f16x2 r6338, r4413, r6335; +} +{ +add.f16x2 r6341, r5053, r5373; +} +{ +mul.f16x2 r6344, r6341, r6302; +} +{ +add.f16x2 r6347, r6338, r6344; +} +{ +sub.f16x2 r6350, r4745, r5705; +} +{ +mul.f16x2 r6353, r6350, r6301; +} +{ +sub.f16x2 r6356, r5065, r5385; +} +{ +mul.f16x2 r6359, r6356, r6303; +} +{ +add.f16x2 r6362, r6353, r6359; +} +{ +sub.f16x2 r6365, r6347, r6362; +} +{ +add.f16x2 r6368, r4733, r5693; +} +{ +mul.f16x2 r6371, r6368, r6300; +} +{ +add.f16x2 r6374, r4413, r6371; +} +{ +add.f16x2 r6377, r5053, r5373; +} +{ +mul.f16x2 r6380, r6377, r6302; +} +{ +add.f16x2 r6383, r6374, r6380; +} +{ +sub.f16x2 r6386, r4745, r5705; +} +{ +mul.f16x2 r6389, r6386, r6301; +} +{ +sub.f16x2 r6392, r5065, r5385; +} +{ +mul.f16x2 r6395, r6392, r6303; +} +{ +add.f16x2 r6398, r6389, r6395; +} +{ +add.f16x2 r6401, 
r6383, r6398; +} +{ +add.f16x2 r6404, r4733, r5693; +} +{ +mul.f16x2 r6407, r6404, r6302; +} +{ +add.f16x2 r6410, r4413, r6407; +} +{ +add.f16x2 r6413, r5053, r5373; +} +{ +mul.f16x2 r6416, r6413, r6304; +} +{ +add.f16x2 r6419, r6410, r6416; +} +{ +sub.f16x2 r6422, r4745, r5705; +} +{ +mul.f16x2 r6425, r6422, r6303; +} +{ +sub.f16x2 r6428, r5065, r5385; +} +{ +mul.f16x2 r6431, r6428, r6306; +} +{ +add.f16x2 r6434, r6425, r6431; +} +{ +sub.f16x2 r6437, r6419, r6434; +} +{ +add.f16x2 r6440, r4733, r5693; +} +{ +mul.f16x2 r6443, r6440, r6302; +} +{ +add.f16x2 r6446, r4413, r6443; +} +{ +add.f16x2 r6449, r5053, r5373; +} +{ +mul.f16x2 r6452, r6449, r6304; +} +{ +add.f16x2 r6455, r6446, r6452; +} +{ +sub.f16x2 r6458, r4745, r5705; +} +{ +mul.f16x2 r6461, r6458, r6303; +} +{ +sub.f16x2 r6464, r5065, r5385; +} +{ +mul.f16x2 r6467, r6464, r6306; +} +{ +add.f16x2 r6470, r6461, r6467; +} +{ +add.f16x2 r6473, r6455, r6470; +} +{ +add.f16x2 r6476, r4745, r5705; +} +{ +mul.f16x2 r6479, r6476, r6300; +} +{ +add.f16x2 r6482, r4425, r6479; +} +{ +add.f16x2 r6485, r5065, r5385; +} +{ +mul.f16x2 r6488, r6485, r6302; +} +{ +add.f16x2 r6491, r6482, r6488; +} +{ +sub.f16x2 r6494, r4733, r5693; +} +{ +mul.f16x2 r6497, r6494, r6301; +} +{ +sub.f16x2 r6500, r5053, r5373; +} +{ +mul.f16x2 r6503, r6500, r6303; +} +{ +add.f16x2 r6506, r6497, r6503; +} +{ +add.f16x2 r6509, r6491, r6506; +} +{ +add.f16x2 r6512, r4745, r5705; +} +{ +mul.f16x2 r6515, r6512, r6300; +} +{ +add.f16x2 r6518, r4425, r6515; +} +{ +add.f16x2 r6521, r5065, r5385; +} +{ +mul.f16x2 r6524, r6521, r6302; +} +{ +add.f16x2 r6527, r6518, r6524; +} +{ +sub.f16x2 r6530, r4733, r5693; +} +{ +mul.f16x2 r6533, r6530, r6301; +} +{ +sub.f16x2 r6536, r5053, r5373; +} +{ +mul.f16x2 r6539, r6536, r6303; +} +{ +add.f16x2 r6542, r6533, r6539; +} +{ +sub.f16x2 r6545, r6527, r6542; +} +{ +add.f16x2 r6548, r4745, r5705; +} +{ +mul.f16x2 r6551, r6548, r6302; +} +{ +add.f16x2 r6554, r4425, r6551; +} +{ +add.f16x2 r6557, r5065, r5385; +} +{ 
+mul.f16x2 r6560, r6557, r6304; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +sub.f16x2 r6566, r4733, r5693; +} +{ +mul.f16x2 r6569, r6566, r6303; +} +{ +sub.f16x2 r6572, r5053, r5373; +} +{ +mul.f16x2 r6575, r6572, r6306; +} +{ +add.f16x2 r6578, r6569, r6575; +} +{ +add.f16x2 r6581, r6563, r6578; +} +{ +add.f16x2 r6584, r4745, r5705; +} +{ +mul.f16x2 r6587, r6584, r6302; +} +{ +add.f16x2 r6590, r4425, r6587; +} +{ +add.f16x2 r6593, r5065, r5385; +} +{ +mul.f16x2 r6596, r6593, r6304; +} +{ +add.f16x2 r6599, r6590, r6596; +} +{ +sub.f16x2 r6602, r4733, r5693; +} +{ +mul.f16x2 r6605, r6602, r6303; +} +{ +sub.f16x2 r6608, r5053, r5373; +} +{ +mul.f16x2 r6611, r6608, r6306; +} +{ +add.f16x2 r6614, r6605, r6611; +} +{ +sub.f16x2 r6617, r6599, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6623, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6625, {low, high}; +} +{ +neg.f16x2 r6626, r6625; +} +{ +add.f16x2 r6628, r6050, r6098; +} +{ +add.f16x2 r6631, r4461, r6628; +} +{ +add.f16x2 r6634, r6066, r6082; +} +{ +add.f16x2 r6637, r6631, r6634; +} +{ +add.f16x2 r6640, r6056, r6104; +} +{ +add.f16x2 r6643, r4605, r6640; +} +{ +add.f16x2 r6646, r6072, r6088; +} +{ +add.f16x2 r6649, r6643, r6646; +} +{ +add.f16x2 r6652, r6050, r6098; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4461, r6655; +} +{ +add.f16x2 r6661, r6066, r6082; +} +{ +mul.f16x2 r6664, r6661, r6622; +} +{ +add.f16x2 r6667, r6658, r6664; +} +{ 
+sub.f16x2 r6670, r6056, r6104; +} +{ +mul.f16x2 r6673, r6670, r6621; +} +{ +sub.f16x2 r6676, r6072, r6088; +} +{ +mul.f16x2 r6679, r6676, r6623; +} +{ +add.f16x2 r6682, r6673, r6679; +} +{ +sub.f16x2 r6685, r6667, r6682; +} +{ +add.f16x2 r6688, r6050, r6098; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4461, r6691; +} +{ +add.f16x2 r6697, r6066, r6082; +} +{ +mul.f16x2 r6700, r6697, r6622; +} +{ +add.f16x2 r6703, r6694, r6700; +} +{ +sub.f16x2 r6706, r6056, r6104; +} +{ +mul.f16x2 r6709, r6706, r6621; +} +{ +sub.f16x2 r6712, r6072, r6088; +} +{ +mul.f16x2 r6715, r6712, r6623; +} +{ +add.f16x2 r6718, r6709, r6715; +} +{ +add.f16x2 r6721, r6703, r6718; +} +{ +add.f16x2 r6724, r6050, r6098; +} +{ +mul.f16x2 r6727, r6724, r6622; +} +{ +add.f16x2 r6730, r4461, r6727; +} +{ +add.f16x2 r6733, r6066, r6082; +} +{ +mul.f16x2 r6736, r6733, r6624; +} +{ +add.f16x2 r6739, r6730, r6736; +} +{ +sub.f16x2 r6742, r6056, r6104; +} +{ +mul.f16x2 r6745, r6742, r6623; +} +{ +sub.f16x2 r6748, r6072, r6088; +} +{ +mul.f16x2 r6751, r6748, r6626; +} +{ +add.f16x2 r6754, r6745, r6751; +} +{ +sub.f16x2 r6757, r6739, r6754; +} +{ +add.f16x2 r6760, r6050, r6098; +} +{ +mul.f16x2 r6763, r6760, r6622; +} +{ +add.f16x2 r6766, r4461, r6763; +} +{ +add.f16x2 r6769, r6066, r6082; +} +{ +mul.f16x2 r6772, r6769, r6624; +} +{ +add.f16x2 r6775, r6766, r6772; +} +{ +sub.f16x2 r6778, r6056, r6104; +} +{ +mul.f16x2 r6781, r6778, r6623; +} +{ +sub.f16x2 r6784, r6072, r6088; +} +{ +mul.f16x2 r6787, r6784, r6626; +} +{ +add.f16x2 r6790, r6781, r6787; +} +{ +add.f16x2 r6793, r6775, r6790; +} +{ +add.f16x2 r6796, r6056, r6104; +} +{ +mul.f16x2 r6799, r6796, r6620; +} +{ +add.f16x2 r6802, r4605, r6799; +} +{ +add.f16x2 r6805, r6072, r6088; +} +{ +mul.f16x2 r6808, r6805, r6622; +} +{ +add.f16x2 r6811, r6802, r6808; +} +{ +sub.f16x2 r6814, r6050, r6098; +} +{ +mul.f16x2 r6817, r6814, r6621; +} +{ +sub.f16x2 r6820, r6066, r6082; +} +{ +mul.f16x2 r6823, r6820, r6623; +} +{ +add.f16x2 r6826, 
r6817, r6823; +} +{ +add.f16x2 r6829, r6811, r6826; +} +{ +add.f16x2 r6832, r6056, r6104; +} +{ +mul.f16x2 r6835, r6832, r6620; +} +{ +add.f16x2 r6838, r4605, r6835; +} +{ +add.f16x2 r6841, r6072, r6088; +} +{ +mul.f16x2 r6844, r6841, r6622; +} +{ +add.f16x2 r6847, r6838, r6844; +} +{ +sub.f16x2 r6850, r6050, r6098; +} +{ +mul.f16x2 r6853, r6850, r6621; +} +{ +sub.f16x2 r6856, r6066, r6082; +} +{ +mul.f16x2 r6859, r6856, r6623; +} +{ +add.f16x2 r6862, r6853, r6859; +} +{ +sub.f16x2 r6865, r6847, r6862; +} +{ +add.f16x2 r6868, r6056, r6104; +} +{ +mul.f16x2 r6871, r6868, r6622; +} +{ +add.f16x2 r6874, r4605, r6871; +} +{ +add.f16x2 r6877, r6072, r6088; +} +{ +mul.f16x2 r6880, r6877, r6624; +} +{ +add.f16x2 r6883, r6874, r6880; +} +{ +sub.f16x2 r6886, r6050, r6098; +} +{ +mul.f16x2 r6889, r6886, r6623; +} +{ +sub.f16x2 r6892, r6066, r6082; +} +{ +mul.f16x2 r6895, r6892, r6626; +} +{ +add.f16x2 r6898, r6889, r6895; +} +{ +add.f16x2 r6901, r6883, r6898; +} +{ +add.f16x2 r6904, r6056, r6104; +} +{ +mul.f16x2 r6907, r6904, r6622; +} +{ +add.f16x2 r6910, r4605, r6907; +} +{ +add.f16x2 r6913, r6072, r6088; +} +{ +mul.f16x2 r6916, r6913, r6624; +} +{ +add.f16x2 r6919, r6910, r6916; +} +{ +sub.f16x2 r6922, r6050, r6098; +} +{ +mul.f16x2 r6925, r6922, r6623; +} +{ +sub.f16x2 r6928, r6066, r6082; +} +{ +mul.f16x2 r6931, r6928, r6626; +} +{ +add.f16x2 r6934, r6925, r6931; +} +{ +sub.f16x2 r6937, r6919, r6934; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6944, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6945, {low, high}; +} +{ +neg.f16x2 r6946, r6945; +} +{ +add.f16x2 r6948, r6114, r6162; +} +{ +add.f16x2 r6951, r4533, r6948; +} +{ +add.f16x2 r6954, r6130, r6146; +} +{ +add.f16x2 r6957, r6951, r6954; +} +{ +add.f16x2 r6960, r6120, r6168; +} +{ +add.f16x2 r6963, r4677, r6960; +} +{ +add.f16x2 r6966, r6136, r6152; +} +{ +add.f16x2 r6969, r6963, r6966; +} +{ +add.f16x2 r6972, r6114, r6162; +} +{ +mul.f16x2 r6975, r6972, r6940; +} +{ +add.f16x2 r6978, r4533, r6975; +} +{ +add.f16x2 r6981, r6130, r6146; +} +{ +mul.f16x2 r6984, r6981, r6942; +} +{ +add.f16x2 r6987, r6978, r6984; +} +{ +sub.f16x2 r6990, r6120, r6168; +} +{ +mul.f16x2 r6993, r6990, r6941; +} +{ +sub.f16x2 r6996, r6136, r6152; +} +{ +mul.f16x2 r6999, r6996, r6943; +} +{ +add.f16x2 r7002, r6993, r6999; +} +{ +sub.f16x2 r7005, r6987, r7002; +} +{ +add.f16x2 r7008, r6114, r6162; +} +{ +mul.f16x2 r7011, r7008, r6940; +} +{ +add.f16x2 r7014, r4533, r7011; +} +{ +add.f16x2 r7017, r6130, r6146; +} +{ +mul.f16x2 r7020, r7017, r6942; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6120, r6168; +} +{ +mul.f16x2 r7029, r7026, r6941; +} +{ +sub.f16x2 r7032, r6136, r6152; +} +{ +mul.f16x2 r7035, r7032, r6943; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +add.f16x2 r7041, r7023, r7038; +} +{ +add.f16x2 r7044, r6114, r6162; +} +{ +mul.f16x2 r7047, r7044, r6942; +} +{ +add.f16x2 r7050, r4533, r7047; +} +{ +add.f16x2 r7053, r6130, r6146; +} +{ +mul.f16x2 r7056, r7053, r6944; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6120, r6168; +} +{ +mul.f16x2 r7065, r7062, r6943; +} +{ +sub.f16x2 r7068, r6136, r6152; +} +{ +mul.f16x2 r7071, r7068, r6946; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +sub.f16x2 r7077, r7059, r7074; +} +{ +add.f16x2 r7080, r6114, r6162; +} +{ +mul.f16x2 r7083, r7080, r6942; +} +{ +add.f16x2 r7086, r4533, r7083; +} +{ +add.f16x2 r7089, r6130, r6146; +} +{ +mul.f16x2 r7092, r7089, r6944; +} +{ 
+add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6120, r6168; +} +{ +mul.f16x2 r7101, r7098, r6943; +} +{ +sub.f16x2 r7104, r6136, r6152; +} +{ +mul.f16x2 r7107, r7104, r6946; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +add.f16x2 r7113, r7095, r7110; +} +{ +add.f16x2 r7116, r6120, r6168; +} +{ +mul.f16x2 r7119, r7116, r6940; +} +{ +add.f16x2 r7122, r4677, r7119; +} +{ +add.f16x2 r7125, r6136, r6152; +} +{ +mul.f16x2 r7128, r7125, r6942; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6114, r6162; +} +{ +mul.f16x2 r7137, r7134, r6941; +} +{ +sub.f16x2 r7140, r6130, r6146; +} +{ +mul.f16x2 r7143, r7140, r6943; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 r7149, r7131, r7146; +} +{ +add.f16x2 r7152, r6120, r6168; +} +{ +mul.f16x2 r7155, r7152, r6940; +} +{ +add.f16x2 r7158, r4677, r7155; +} +{ +add.f16x2 r7161, r6136, r6152; +} +{ +mul.f16x2 r7164, r7161, r6942; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6114, r6162; +} +{ +mul.f16x2 r7173, r7170, r6941; +} +{ +sub.f16x2 r7176, r6130, r6146; +} +{ +mul.f16x2 r7179, r7176, r6943; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +sub.f16x2 r7185, r7167, r7182; +} +{ +add.f16x2 r7188, r6120, r6168; +} +{ +mul.f16x2 r7191, r7188, r6942; +} +{ +add.f16x2 r7194, r4677, r7191; +} +{ +add.f16x2 r7197, r6136, r6152; +} +{ +mul.f16x2 r7200, r7197, r6944; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6114, r6162; +} +{ +mul.f16x2 r7209, r7206, r6943; +} +{ +sub.f16x2 r7212, r6130, r6146; +} +{ +mul.f16x2 r7215, r7212, r6946; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +add.f16x2 r7221, r7203, r7218; +} +{ +add.f16x2 r7224, r6120, r6168; +} +{ +mul.f16x2 r7227, r7224, r6942; +} +{ +add.f16x2 r7230, r4677, r7227; +} +{ +add.f16x2 r7233, r6136, r6152; +} +{ +mul.f16x2 r7236, r7233, r6944; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6114, r6162; +} +{ +mul.f16x2 r7245, r7242, r6943; +} +{ +sub.f16x2 r7248, r6130, r6146; +} +{ +mul.f16x2 r7251, 
r7248, r6946; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +sub.f16x2 r7257, r7239, r7254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7265, {low, high}; +} +{ +neg.f16x2 r7266, r7265; +} +{ +add.f16x2 r7268, r6178, r6226; +} +{ +add.f16x2 r7271, r4569, r7268; +} +{ +add.f16x2 r7274, r6194, r6210; +} +{ +add.f16x2 r7277, r7271, r7274; +} +{ +add.f16x2 r7280, r6184, r6232; +} +{ +add.f16x2 r7283, r4713, r7280; +} +{ +add.f16x2 r7286, r6200, r6216; +} +{ +add.f16x2 r7289, r7283, r7286; +} +{ +add.f16x2 r7292, r6178, r6226; +} +{ +mul.f16x2 r7295, r7292, r7260; +} +{ +add.f16x2 r7298, r4569, r7295; +} +{ +add.f16x2 r7301, r6194, r6210; +} +{ +mul.f16x2 r7304, r7301, r7262; +} +{ +add.f16x2 r7307, r7298, r7304; +} +{ +sub.f16x2 r7310, r6184, r6232; +} +{ +mul.f16x2 r7313, r7310, r7261; +} +{ +sub.f16x2 r7316, r6200, r6216; +} +{ +mul.f16x2 r7319, r7316, r7263; +} +{ +add.f16x2 r7322, r7313, r7319; +} +{ +sub.f16x2 r7325, r7307, r7322; +} +{ +add.f16x2 r7328, r6178, r6226; +} +{ +mul.f16x2 r7331, r7328, r7260; +} +{ +add.f16x2 r7334, r4569, r7331; +} +{ +add.f16x2 r7337, r6194, r6210; +} +{ +mul.f16x2 r7340, r7337, r7262; +} +{ +add.f16x2 r7343, r7334, r7340; +} +{ +sub.f16x2 r7346, r6184, r6232; +} +{ +mul.f16x2 r7349, r7346, r7261; +} +{ +sub.f16x2 r7352, r6200, r6216; +} +{ +mul.f16x2 r7355, r7352, r7263; +} +{ +add.f16x2 r7358, r7349, r7355; +} +{ +add.f16x2 r7361, 
r7343, r7358; +} +{ +add.f16x2 r7364, r6178, r6226; +} +{ +mul.f16x2 r7367, r7364, r7262; +} +{ +add.f16x2 r7370, r4569, r7367; +} +{ +add.f16x2 r7373, r6194, r6210; +} +{ +mul.f16x2 r7376, r7373, r7264; +} +{ +add.f16x2 r7379, r7370, r7376; +} +{ +sub.f16x2 r7382, r6184, r6232; +} +{ +mul.f16x2 r7385, r7382, r7263; +} +{ +sub.f16x2 r7388, r6200, r6216; +} +{ +mul.f16x2 r7391, r7388, r7266; +} +{ +add.f16x2 r7394, r7385, r7391; +} +{ +sub.f16x2 r7397, r7379, r7394; +} +{ +add.f16x2 r7400, r6178, r6226; +} +{ +mul.f16x2 r7403, r7400, r7262; +} +{ +add.f16x2 r7406, r4569, r7403; +} +{ +add.f16x2 r7409, r6194, r6210; +} +{ +mul.f16x2 r7412, r7409, r7264; +} +{ +add.f16x2 r7415, r7406, r7412; +} +{ +sub.f16x2 r7418, r6184, r6232; +} +{ +mul.f16x2 r7421, r7418, r7263; +} +{ +sub.f16x2 r7424, r6200, r6216; +} +{ +mul.f16x2 r7427, r7424, r7266; +} +{ +add.f16x2 r7430, r7421, r7427; +} +{ +add.f16x2 r7433, r7415, r7430; +} +{ +add.f16x2 r7436, r6184, r6232; +} +{ +mul.f16x2 r7439, r7436, r7260; +} +{ +add.f16x2 r7442, r4713, r7439; +} +{ +add.f16x2 r7445, r6200, r6216; +} +{ +mul.f16x2 r7448, r7445, r7262; +} +{ +add.f16x2 r7451, r7442, r7448; +} +{ +sub.f16x2 r7454, r6178, r6226; +} +{ +mul.f16x2 r7457, r7454, r7261; +} +{ +sub.f16x2 r7460, r6194, r6210; +} +{ +mul.f16x2 r7463, r7460, r7263; +} +{ +add.f16x2 r7466, r7457, r7463; +} +{ +add.f16x2 r7469, r7451, r7466; +} +{ +add.f16x2 r7472, r6184, r6232; +} +{ +mul.f16x2 r7475, r7472, r7260; +} +{ +add.f16x2 r7478, r4713, r7475; +} +{ +add.f16x2 r7481, r6200, r6216; +} +{ +mul.f16x2 r7484, r7481, r7262; +} +{ +add.f16x2 r7487, r7478, r7484; +} +{ +sub.f16x2 r7490, r6178, r6226; +} +{ +mul.f16x2 r7493, r7490, r7261; +} +{ +sub.f16x2 r7496, r6194, r6210; +} +{ +mul.f16x2 r7499, r7496, r7263; +} +{ +add.f16x2 r7502, r7493, r7499; +} +{ +sub.f16x2 r7505, r7487, r7502; +} +{ +add.f16x2 r7508, r6184, r6232; +} +{ +mul.f16x2 r7511, r7508, r7262; +} +{ +add.f16x2 r7514, r4713, r7511; +} +{ +add.f16x2 r7517, r6200, r6216; +} +{ 
+mul.f16x2 r7520, r7517, r7264; +} +{ +add.f16x2 r7523, r7514, r7520; +} +{ +sub.f16x2 r7526, r6178, r6226; +} +{ +mul.f16x2 r7529, r7526, r7263; +} +{ +sub.f16x2 r7532, r6194, r6210; +} +{ +mul.f16x2 r7535, r7532, r7266; +} +{ +add.f16x2 r7538, r7529, r7535; +} +{ +add.f16x2 r7541, r7523, r7538; +} +{ +add.f16x2 r7544, r6184, r6232; +} +{ +mul.f16x2 r7547, r7544, r7262; +} +{ +add.f16x2 r7550, r4713, r7547; +} +{ +add.f16x2 r7553, r6200, r6216; +} +{ +mul.f16x2 r7556, r7553, r7264; +} +{ +add.f16x2 r7559, r7550, r7556; +} +{ +sub.f16x2 r7562, r6178, r6226; +} +{ +mul.f16x2 r7565, r7562, r7263; +} +{ +sub.f16x2 r7568, r6194, r6210; +} +{ +mul.f16x2 r7571, r7568, r7266; +} +{ +add.f16x2 r7574, r7565, r7571; +} +{ +sub.f16x2 r7577, r7559, r7574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7582, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7584, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7585, {low, high}; +} +{ +neg.f16x2 r7586, r7585; +} +{ +add.f16x2 r7588, r6242, r6290; +} +{ +add.f16x2 r7591, r4497, r7588; +} +{ +add.f16x2 r7594, r6258, r6274; +} +{ +add.f16x2 r7597, r7591, r7594; +} +{ +add.f16x2 r7600, r6248, r6296; +} +{ +add.f16x2 r7603, r4641, r7600; +} +{ +add.f16x2 r7606, r6264, r6280; +} +{ +add.f16x2 r7609, r7603, r7606; +} +{ +add.f16x2 r7612, r6242, r6290; +} +{ +mul.f16x2 r7615, r7612, r7580; +} +{ +add.f16x2 r7618, r4497, r7615; +} +{ +add.f16x2 r7621, r6258, r6274; +} +{ +mul.f16x2 r7624, r7621, r7582; +} +{ +add.f16x2 r7627, r7618, r7624; +} +{ 
+sub.f16x2 r7630, r6248, r6296; +} +{ +mul.f16x2 r7633, r7630, r7581; +} +{ +sub.f16x2 r7636, r6264, r6280; +} +{ +mul.f16x2 r7639, r7636, r7583; +} +{ +add.f16x2 r7642, r7633, r7639; +} +{ +sub.f16x2 r7645, r7627, r7642; +} +{ +add.f16x2 r7648, r6242, r6290; +} +{ +mul.f16x2 r7651, r7648, r7580; +} +{ +add.f16x2 r7654, r4497, r7651; +} +{ +add.f16x2 r7657, r6258, r6274; +} +{ +mul.f16x2 r7660, r7657, r7582; +} +{ +add.f16x2 r7663, r7654, r7660; +} +{ +sub.f16x2 r7666, r6248, r6296; +} +{ +mul.f16x2 r7669, r7666, r7581; +} +{ +sub.f16x2 r7672, r6264, r6280; +} +{ +mul.f16x2 r7675, r7672, r7583; +} +{ +add.f16x2 r7678, r7669, r7675; +} +{ +add.f16x2 r7681, r7663, r7678; +} +{ +add.f16x2 r7684, r6242, r6290; +} +{ +mul.f16x2 r7687, r7684, r7582; +} +{ +add.f16x2 r7690, r4497, r7687; +} +{ +add.f16x2 r7693, r6258, r6274; +} +{ +mul.f16x2 r7696, r7693, r7584; +} +{ +add.f16x2 r7699, r7690, r7696; +} +{ +sub.f16x2 r7702, r6248, r6296; +} +{ +mul.f16x2 r7705, r7702, r7583; +} +{ +sub.f16x2 r7708, r6264, r6280; +} +{ +mul.f16x2 r7711, r7708, r7586; +} +{ +add.f16x2 r7714, r7705, r7711; +} +{ +sub.f16x2 r7717, r7699, r7714; +} +{ +add.f16x2 r7720, r6242, r6290; +} +{ +mul.f16x2 r7723, r7720, r7582; +} +{ +add.f16x2 r7726, r4497, r7723; +} +{ +add.f16x2 r7729, r6258, r6274; +} +{ +mul.f16x2 r7732, r7729, r7584; +} +{ +add.f16x2 r7735, r7726, r7732; +} +{ +sub.f16x2 r7738, r6248, r6296; +} +{ +mul.f16x2 r7741, r7738, r7583; +} +{ +sub.f16x2 r7744, r6264, r6280; +} +{ +mul.f16x2 r7747, r7744, r7586; +} +{ +add.f16x2 r7750, r7741, r7747; +} +{ +add.f16x2 r7753, r7735, r7750; +} +{ +add.f16x2 r7756, r6248, r6296; +} +{ +mul.f16x2 r7759, r7756, r7580; +} +{ +add.f16x2 r7762, r4641, r7759; +} +{ +add.f16x2 r7765, r6264, r6280; +} +{ +mul.f16x2 r7768, r7765, r7582; +} +{ +add.f16x2 r7771, r7762, r7768; +} +{ +sub.f16x2 r7774, r6242, r6290; +} +{ +mul.f16x2 r7777, r7774, r7581; +} +{ +sub.f16x2 r7780, r6258, r6274; +} +{ +mul.f16x2 r7783, r7780, r7583; +} +{ +add.f16x2 r7786, 
r7777, r7783; +} +{ +add.f16x2 r7789, r7771, r7786; +} +{ +add.f16x2 r7792, r6248, r6296; +} +{ +mul.f16x2 r7795, r7792, r7580; +} +{ +add.f16x2 r7798, r4641, r7795; +} +{ +add.f16x2 r7801, r6264, r6280; +} +{ +mul.f16x2 r7804, r7801, r7582; +} +{ +add.f16x2 r7807, r7798, r7804; +} +{ +sub.f16x2 r7810, r6242, r6290; +} +{ +mul.f16x2 r7813, r7810, r7581; +} +{ +sub.f16x2 r7816, r6258, r6274; +} +{ +mul.f16x2 r7819, r7816, r7583; +} +{ +add.f16x2 r7822, r7813, r7819; +} +{ +sub.f16x2 r7825, r7807, r7822; +} +{ +add.f16x2 r7828, r6248, r6296; +} +{ +mul.f16x2 r7831, r7828, r7582; +} +{ +add.f16x2 r7834, r4641, r7831; +} +{ +add.f16x2 r7837, r6264, r6280; +} +{ +mul.f16x2 r7840, r7837, r7584; +} +{ +add.f16x2 r7843, r7834, r7840; +} +{ +sub.f16x2 r7846, r6242, r6290; +} +{ +mul.f16x2 r7849, r7846, r7583; +} +{ +sub.f16x2 r7852, r6258, r6274; +} +{ +mul.f16x2 r7855, r7852, r7586; +} +{ +add.f16x2 r7858, r7849, r7855; +} +{ +add.f16x2 r7861, r7843, r7858; +} +{ +add.f16x2 r7864, r6248, r6296; +} +{ +mul.f16x2 r7867, r7864, r7582; +} +{ +add.f16x2 r7870, r4641, r7867; +} +{ +add.f16x2 r7873, r6264, r6280; +} +{ +mul.f16x2 r7876, r7873, r7584; +} +{ +add.f16x2 r7879, r7870, r7876; +} +{ +sub.f16x2 r7882, r6242, r6290; +} +{ +mul.f16x2 r7885, r7882, r7583; +} +{ +sub.f16x2 r7888, r6258, r6274; +} +{ +mul.f16x2 r7891, r7888, r7586; +} +{ +add.f16x2 r7894, r7885, r7891; +} +{ +sub.f16x2 r7897, r7879, r7894; +} +mul.wide.u32 rd4, r10397, 1374389535; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r10401, rd5; +mul.lo.s32 r10402, r10401, 25; +sub.s32 r10403, r10397, r10402; +shl.b32 r10404, r10403, 2; +add.s32 r10405, r10398, r10404; +cvt.rn.f32.u32 f600, r10401; +mul.f32 f601, f600, 0f3D4DE32E; +cos.approx.f32 f485, f601; +sin.approx.f32 f602, f601; +neg.f32 f486, f602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f485; +cvt.rn.f16.f32 high, f486; +mov.b32 r7900, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7903, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7900; +mov.b32 r7905, {high, high}; +} +{ +mul.f16x2 r7907, r6649, r7905; +} +{ +neg.f16x2 r7910, r7907; +} +{ +fma.rn.f16x2 r7912, r6637, r7903, r7910; +} +{ +mul.f16x2 r7916, r6637, r7905; +} +{ +fma.rn.f16x2 r7919, r6649, r7903, r7916; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7923, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7925, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7927, {low, high}; +} +{ +mul.f16x2 r7928, r7925, r7927; +} +{ +mul.f16x2 r7931, r7900, r7923; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7934, {high, low}; +} +{ +fma.rn.f16x2 r7936, r7928, r7934, r7931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7940, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7942, {high, high}; +} +{ +mul.f16x2 r7944, r6969, r7942; +} +{ +neg.f16x2 r7947, r7944; +} +{ +fma.rn.f16x2 r7949, r6957, r7940, r7947; +} +{ +mul.f16x2 r7953, r6957, r7942; +} +{ +fma.rn.f16x2 r7956, r6969, r7940, r7953; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7960, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7962, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7964, {low, high}; +} +{ +mul.f16x2 r7965, r7962, r7964; +} +{ +mul.f16x2 r7968, r7936, r7960; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7936; +mov.b32 r7971, {high, low}; +} +{ +fma.rn.f16x2 r7973, r7965, r7971, r7968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r7977, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r7979, {high, high}; +} +{ +mul.f16x2 r7981, r7289, r7979; +} +{ +neg.f16x2 r7984, r7981; +} +{ +fma.rn.f16x2 r7986, r7277, r7977, r7984; +} +{ +mul.f16x2 r7990, r7277, r7979; +} +{ +fma.rn.f16x2 r7993, r7289, r7977, r7990; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7997, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r7999, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8001, {low, high}; +} +{ +mul.f16x2 r8002, r7999, r8001; +} +{ +mul.f16x2 r8005, r7973, r7997; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7973; +mov.b32 r8008, {high, low}; +} +{ +fma.rn.f16x2 r8010, r8002, r8008, r8005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8014, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8016, {high, high}; +} +{ +mul.f16x2 r8018, r7609, r8016; +} +{ +neg.f16x2 r8021, r8018; +} +{ +fma.rn.f16x2 r8023, r7597, r8014, r8021; +} +{ +mul.f16x2 r8027, r7597, r8016; +} +{ +fma.rn.f16x2 r8030, r7609, r8014, r8027; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8034, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8036, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8038, {low, high}; +} +{ +mul.f16x2 r8039, r8036, r8038; +} +{ +mul.f16x2 r8042, r8010, r8034; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8010; +mov.b32 r8045, {high, low}; +} +{ +fma.rn.f16x2 r8047, r8039, r8045, r8042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8051, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8053, {high, high}; +} +{ +mul.f16x2 r8055, r6509, r8053; +} +{ +neg.f16x2 r8058, r8055; +} +{ +fma.rn.f16x2 r8060, r6365, r8051, r8058; +} +{ +mul.f16x2 r8064, r6365, r8053; +} +{ +fma.rn.f16x2 r8067, r6509, r8051, r8064; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8071, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8073, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; 
+mov.b32 r8075, {low, high}; +} +{ +mul.f16x2 r8076, r8073, r8075; +} +{ +mul.f16x2 r8079, r8047, r8071; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8047; +mov.b32 r8082, {high, low}; +} +{ +fma.rn.f16x2 r8084, r8076, r8082, r8079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8088, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8090, {high, high}; +} +{ +mul.f16x2 r8092, r6829, r8090; +} +{ +neg.f16x2 r8095, r8092; +} +{ +fma.rn.f16x2 r8097, r6685, r8088, r8095; +} +{ +mul.f16x2 r8101, r6685, r8090; +} +{ +fma.rn.f16x2 r8104, r6829, r8088, r8101; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8108, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8110, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8112, {low, high}; +} +{ +mul.f16x2 r8113, r8110, r8112; +} +{ +mul.f16x2 r8116, r8084, r8108; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8084; +mov.b32 r8119, {high, low}; +} +{ +fma.rn.f16x2 r8121, r8113, r8119, r8116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8125, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8127, {high, high}; +} +{ +mul.f16x2 r8129, r7149, r8127; +} +{ +neg.f16x2 r8132, r8129; +} +{ +fma.rn.f16x2 r8134, r7005, r8125, r8132; +} +{ +mul.f16x2 r8138, r7005, r8127; +} +{ +fma.rn.f16x2 r8141, r7149, r8125, r8138; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8145, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8147, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8149, {low, high}; +} +{ +mul.f16x2 r8150, r8147, r8149; +} +{ +mul.f16x2 r8153, r8121, r8145; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8121; +mov.b32 r8156, {high, low}; +} +{ +fma.rn.f16x2 r8158, r8150, r8156, r8153; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r8158; +mov.b32 r8162, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8158; +mov.b32 r8164, {high, high}; +} +{ +mul.f16x2 r8166, r7469, r8164; +} +{ +neg.f16x2 r8169, r8166; +} +{ +fma.rn.f16x2 r8171, r7325, r8162, r8169; +} +{ +mul.f16x2 r8175, r7325, r8164; +} +{ +fma.rn.f16x2 r8178, r7469, r8162, r8175; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8182, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8184, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8186, {low, high}; +} +{ +mul.f16x2 r8187, r8184, r8186; +} +{ +mul.f16x2 r8190, r8158, r8182; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8158; +mov.b32 r8193, {high, low}; +} +{ +fma.rn.f16x2 r8195, r8187, r8193, r8190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8195; +mov.b32 r8199, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8195; +mov.b32 r8201, {high, high}; +} +{ +mul.f16x2 r8203, r7789, r8201; +} +{ +neg.f16x2 r8206, r8203; +} +{ +fma.rn.f16x2 r8208, r7645, r8199, r8206; +} +{ +mul.f16x2 r8212, r7645, r8201; +} +{ +fma.rn.f16x2 r8215, r7789, r8199, r8212; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8219, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8221, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8223, {low, high}; +} +{ +mul.f16x2 r8224, r8221, r8223; +} +{ +mul.f16x2 r8227, r8195, r8219; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8195; +mov.b32 r8230, {high, low}; +} +{ +fma.rn.f16x2 r8232, r8224, r8230, r8227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8236, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8238, {high, high}; +} +{ +mul.f16x2 r8240, r6581, r8238; +} +{ +neg.f16x2 r8243, r8240; +} +{ +fma.rn.f16x2 r8245, r6437, r8236, 
r8243; +} +{ +mul.f16x2 r8249, r6437, r8238; +} +{ +fma.rn.f16x2 r8252, r6581, r8236, r8249; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8256, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8258, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8260, {low, high}; +} +{ +mul.f16x2 r8261, r8258, r8260; +} +{ +mul.f16x2 r8264, r8232, r8256; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8232; +mov.b32 r8267, {high, low}; +} +{ +fma.rn.f16x2 r8269, r8261, r8267, r8264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8275, {high, high}; +} +{ +mul.f16x2 r8277, r6901, r8275; +} +{ +neg.f16x2 r8280, r8277; +} +{ +fma.rn.f16x2 r8282, r6757, r8273, r8280; +} +{ +mul.f16x2 r8286, r6757, r8275; +} +{ +fma.rn.f16x2 r8289, r6901, r8273, r8286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8293, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8295, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8297, {low, high}; +} +{ +mul.f16x2 r8298, r8295, r8297; +} +{ +mul.f16x2 r8301, r8269, r8293; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8269; +mov.b32 r8304, {high, low}; +} +{ +fma.rn.f16x2 r8306, r8298, r8304, r8301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8310, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8312, {high, high}; +} +{ +mul.f16x2 r8314, r7221, r8312; +} +{ +neg.f16x2 r8317, r8314; +} +{ +fma.rn.f16x2 r8319, r7077, r8310, r8317; +} +{ +mul.f16x2 r8323, r7077, r8312; +} +{ +fma.rn.f16x2 r8326, r7221, r8310, r8323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8330, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8332, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8334, {low, high}; +} +{ +mul.f16x2 r8335, r8332, r8334; +} +{ +mul.f16x2 r8338, r8306, r8330; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8306; +mov.b32 r8341, {high, low}; +} +{ +fma.rn.f16x2 r8343, r8335, r8341, r8338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8347, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8349, {high, high}; +} +{ +mul.f16x2 r8351, r7541, r8349; +} +{ +neg.f16x2 r8354, r8351; +} +{ +fma.rn.f16x2 r8356, r7397, r8347, r8354; +} +{ +mul.f16x2 r8360, r7397, r8349; +} +{ +fma.rn.f16x2 r8363, r7541, r8347, r8360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8367, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8369, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8371, {low, high}; +} +{ +mul.f16x2 r8372, r8369, r8371; +} +{ +mul.f16x2 r8375, r8343, r8367; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8343; +mov.b32 r8378, {high, low}; +} +{ +fma.rn.f16x2 r8380, r8372, r8378, r8375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 r8384, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 r8386, {high, high}; +} +{ +mul.f16x2 r8388, r7861, r8386; +} +{ +neg.f16x2 r8391, r8388; +} +{ +fma.rn.f16x2 r8393, r7717, r8384, r8391; +} +{ +mul.f16x2 r8397, r7717, r8386; +} +{ +fma.rn.f16x2 r8400, r7861, r8384, r8397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8404, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8406, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8408, {low, high}; +} +{ +mul.f16x2 r8409, r8406, r8408; +} +{ +mul.f16x2 r8412, r8380, r8404; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8380; +mov.b32 
r8415, {high, low}; +} +{ +fma.rn.f16x2 r8417, r8409, r8415, r8412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8421, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8423, {high, high}; +} +{ +mul.f16x2 r8425, r6617, r8423; +} +{ +neg.f16x2 r8428, r8425; +} +{ +fma.rn.f16x2 r8430, r6473, r8421, r8428; +} +{ +mul.f16x2 r8434, r6473, r8423; +} +{ +fma.rn.f16x2 r8437, r6617, r8421, r8434; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8441, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8443, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8445, {low, high}; +} +{ +mul.f16x2 r8446, r8443, r8445; +} +{ +mul.f16x2 r8449, r8417, r8441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8417; +mov.b32 r8452, {high, low}; +} +{ +fma.rn.f16x2 r8454, r8446, r8452, r8449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8454; +mov.b32 r8458, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8454; +mov.b32 r8460, {high, high}; +} +{ +mul.f16x2 r8462, r6937, r8460; +} +{ +neg.f16x2 r8465, r8462; +} +{ +fma.rn.f16x2 r8467, r6793, r8458, r8465; +} +{ +mul.f16x2 r8471, r6793, r8460; +} +{ +fma.rn.f16x2 r8474, r6937, r8458, r8471; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8478, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8480, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8482, {low, high}; +} +{ +mul.f16x2 r8483, r8480, r8482; +} +{ +mul.f16x2 r8486, r8454, r8478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8454; +mov.b32 r8489, {high, low}; +} +{ +fma.rn.f16x2 r8491, r8483, r8489, r8486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8497, {high, high}; +} +{ +mul.f16x2 
r8499, r7257, r8497; +} +{ +neg.f16x2 r8502, r8499; +} +{ +fma.rn.f16x2 r8504, r7113, r8495, r8502; +} +{ +mul.f16x2 r8508, r7113, r8497; +} +{ +fma.rn.f16x2 r8511, r7257, r8495, r8508; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8519, {low, high}; +} +{ +mul.f16x2 r8520, r8517, r8519; +} +{ +mul.f16x2 r8523, r8491, r8515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8491; +mov.b32 r8526, {high, low}; +} +{ +fma.rn.f16x2 r8528, r8520, r8526, r8523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8534, {high, high}; +} +{ +mul.f16x2 r8536, r7577, r8534; +} +{ +neg.f16x2 r8539, r8536; +} +{ +fma.rn.f16x2 r8541, r7433, r8532, r8539; +} +{ +mul.f16x2 r8545, r7433, r8534; +} +{ +fma.rn.f16x2 r8548, r7577, r8532, r8545; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8556, {low, high}; +} +{ +mul.f16x2 r8557, r8554, r8556; +} +{ +mul.f16x2 r8560, r8528, r8552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8528; +mov.b32 r8563, {high, low}; +} +{ +fma.rn.f16x2 r8565, r8557, r8563, r8560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8571, {high, high}; +} +{ +mul.f16x2 r8573, r7897, r8571; +} +{ +neg.f16x2 r8576, r8573; +} +{ +fma.rn.f16x2 r8578, r7753, r8569, r8576; +} +{ +mul.f16x2 r8582, r7753, r8571; +} +{ +fma.rn.f16x2 r8585, r7897, r8569, r8582; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8589, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8593, {low, high}; +} +{ +mul.f16x2 r8594, r8591, r8593; +} +{ +mul.f16x2 r8597, r8565, r8589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8565; +mov.b32 r8600, {high, low}; +} +{ +fma.rn.f16x2 r8602, r8594, r8600, r8597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8608, {high, high}; +} +{ +mul.f16x2 r8610, r6545, r8608; +} +{ +neg.f16x2 r8613, r8610; +} +{ +fma.rn.f16x2 r8615, r6401, r8606, r8613; +} +{ +mul.f16x2 r8619, r6401, r8608; +} +{ +fma.rn.f16x2 r8622, r6545, r8606, r8619; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8630, {low, high}; +} +{ +mul.f16x2 r8631, r8628, r8630; +} +{ +mul.f16x2 r8634, r8602, r8626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8602; +mov.b32 r8637, {high, low}; +} +{ +fma.rn.f16x2 r8639, r8631, r8637, r8634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8645, {high, high}; +} +{ +mul.f16x2 r8647, r6865, r8645; +} +{ +neg.f16x2 r8650, r8647; +} +{ +fma.rn.f16x2 r8652, r6721, r8643, r8650; +} +{ +mul.f16x2 r8656, r6721, r8645; +} +{ +fma.rn.f16x2 r8659, r6865, r8643, r8656; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8667, {low, high}; +} +{ +mul.f16x2 r8668, r8665, r8667; +} +{ 
+mul.f16x2 r8671, r8639, r8663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8639; +mov.b32 r8674, {high, low}; +} +{ +fma.rn.f16x2 r8676, r8668, r8674, r8671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8682, {high, high}; +} +{ +mul.f16x2 r8684, r7185, r8682; +} +{ +neg.f16x2 r8687, r8684; +} +{ +fma.rn.f16x2 r8689, r7041, r8680, r8687; +} +{ +mul.f16x2 r8693, r7041, r8682; +} +{ +fma.rn.f16x2 r8696, r7185, r8680, r8693; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8704, {low, high}; +} +{ +mul.f16x2 r8705, r8702, r8704; +} +{ +mul.f16x2 r8708, r8676, r8700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8676; +mov.b32 r8711, {high, low}; +} +{ +fma.rn.f16x2 r8713, r8705, r8711, r8708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8719, {high, high}; +} +{ +mul.f16x2 r8721, r7505, r8719; +} +{ +neg.f16x2 r8724, r8721; +} +{ +fma.rn.f16x2 r8726, r7361, r8717, r8724; +} +{ +mul.f16x2 r8730, r7361, r8719; +} +{ +fma.rn.f16x2 r8733, r7505, r8717, r8730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7900; +mov.b32 r8739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8741, {low, high}; +} +{ +mul.f16x2 r8742, r8739, r8741; +} +{ +mul.f16x2 r8745, r8713, r8737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8713; +mov.b32 r8748, {high, low}; +} +{ +fma.rn.f16x2 r8750, r8742, r8748, r8745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8750; +mov.b32 r8754, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r8750; +mov.b32 r8756, {high, high}; +} +{ +mul.f16x2 r8758, r7825, r8756; +} +{ +neg.f16x2 r8761, r8758; +} +{ +fma.rn.f16x2 r8763, r7681, r8754, r8761; +} +{ +mul.f16x2 r8767, r7681, r8756; +} +{ +fma.rn.f16x2 r8770, r7825, r8754, r8767; +} +barrier.sync 0; +mad.lo.s32 r10406, r10401, 2500, r10405; +st.shared.u32 [r10406], r6317; +st.shared.u32 [r10406+100], r7912; +st.shared.u32 [r10406+200], r7949; +st.shared.u32 [r10406+300], r7986; +st.shared.u32 [r10406+400], r8023; +st.shared.u32 [r10406+500], r8060; +st.shared.u32 [r10406+600], r8097; +st.shared.u32 [r10406+700], r8134; +st.shared.u32 [r10406+800], r8171; +st.shared.u32 [r10406+900], r8208; +st.shared.u32 [r10406+1000], r8245; +st.shared.u32 [r10406+1100], r8282; +st.shared.u32 [r10406+1200], r8319; +st.shared.u32 [r10406+1300], r8356; +st.shared.u32 [r10406+1400], r8393; +st.shared.u32 [r10406+1500], r8430; +st.shared.u32 [r10406+1600], r8467; +st.shared.u32 [r10406+1700], r8504; +st.shared.u32 [r10406+1800], r8541; +st.shared.u32 [r10406+1900], r8578; +st.shared.u32 [r10406+2000], r8615; +st.shared.u32 [r10406+2100], r8652; +st.shared.u32 [r10406+2200], r8689; +st.shared.u32 [r10406+2300], r8726; +st.shared.u32 [r10406+2400], r8763; +barrier.sync 0; +ld.shared.u32 r8803, [r10400]; +ld.shared.u32 r9123, [r10400+500]; +ld.shared.u32 r9443, [r10400+1000]; +ld.shared.u32 r9763, [r10400+1500]; +ld.shared.u32 r10083, [r10400+2000]; +ld.shared.u32 r8800, [r10400+2500]; +ld.shared.u32 r9120, [r10400+3000]; +ld.shared.u32 r9440, [r10400+3500]; +ld.shared.u32 r9760, [r10400+4000]; +ld.shared.u32 r10080, [r10400+4500]; +ld.shared.u32 r8806, [r10400+5000]; +ld.shared.u32 r9126, [r10400+5500]; +ld.shared.u32 r9446, [r10400+6000]; +ld.shared.u32 r9766, [r10400+6500]; +ld.shared.u32 r10086, [r10400+7000]; +ld.shared.u32 r8807, [r10400+7500]; +ld.shared.u32 r9127, [r10400+8000]; +ld.shared.u32 r9447, [r10400+8500]; +ld.shared.u32 r9767, [r10400+9000]; +ld.shared.u32 r10087, 
[r10400+9500]; +ld.shared.u32 r8801, [r10400+10000]; +ld.shared.u32 r9121, [r10400+10500]; +ld.shared.u32 r9441, [r10400+11000]; +ld.shared.u32 r9761, [r10400+11500]; +ld.shared.u32 r10081, [r10400+12000]; +barrier.sync 0; +st.shared.u32 [r10406], r6329; +st.shared.u32 [r10406+100], r7919; +st.shared.u32 [r10406+200], r7956; +st.shared.u32 [r10406+300], r7993; +st.shared.u32 [r10406+400], r8030; +st.shared.u32 [r10406+500], r8067; +st.shared.u32 [r10406+600], r8104; +st.shared.u32 [r10406+700], r8141; +st.shared.u32 [r10406+800], r8178; +st.shared.u32 [r10406+900], r8215; +st.shared.u32 [r10406+1000], r8252; +st.shared.u32 [r10406+1100], r8289; +st.shared.u32 [r10406+1200], r8326; +st.shared.u32 [r10406+1300], r8363; +st.shared.u32 [r10406+1400], r8400; +st.shared.u32 [r10406+1500], r8437; +st.shared.u32 [r10406+1600], r8474; +st.shared.u32 [r10406+1700], r8511; +st.shared.u32 [r10406+1800], r8548; +st.shared.u32 [r10406+1900], r8585; +st.shared.u32 [r10406+2000], r8622; +st.shared.u32 [r10406+2100], r8659; +st.shared.u32 [r10406+2200], r8696; +st.shared.u32 [r10406+2300], r8733; +st.shared.u32 [r10406+2400], r8770; +barrier.sync 0; +ld.shared.u32 r8815, [r10400]; +ld.shared.u32 r9135, [r10400+500]; +ld.shared.u32 r9455, [r10400+1000]; +ld.shared.u32 r9775, [r10400+1500]; +ld.shared.u32 r10095, [r10400+2000]; +ld.shared.u32 r8812, [r10400+2500]; +ld.shared.u32 r9132, [r10400+3000]; +ld.shared.u32 r9452, [r10400+3500]; +ld.shared.u32 r9772, [r10400+4000]; +ld.shared.u32 r10092, [r10400+4500]; +ld.shared.u32 r8818, [r10400+5000]; +ld.shared.u32 r9138, [r10400+5500]; +ld.shared.u32 r9458, [r10400+6000]; +ld.shared.u32 r9778, [r10400+6500]; +ld.shared.u32 r10098, [r10400+7000]; +ld.shared.u32 r8819, [r10400+7500]; +ld.shared.u32 r9139, [r10400+8000]; +ld.shared.u32 r9459, [r10400+8500]; +ld.shared.u32 r9779, [r10400+9000]; +ld.shared.u32 r10099, [r10400+9500]; +ld.shared.u32 r8813, [r10400+10000]; +ld.shared.u32 r9133, [r10400+10500]; +ld.shared.u32 r9453, 
[r10400+11000]; +ld.shared.u32 r9773, [r10400+11500]; +ld.shared.u32 r10093, [r10400+12000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r8793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r8794, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8795, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8796, {low, high}; +} +{ +neg.f16x2 r8797, r8796; +} +{ +add.f16x2 r8799, r8800, r8801; +} +{ +add.f16x2 r8802, r8803, r8799; +} +{ +add.f16x2 r8805, r8806, r8807; +} +{ +add.f16x2 %0, r8802, r8805; +} +{ +add.f16x2 r8811, r8812, r8813; +} +{ +add.f16x2 r8814, r8815, r8811; +} +{ +add.f16x2 r8817, r8818, r8819; +} +{ +add.f16x2 %1, r8814, r8817; +} +{ +add.f16x2 r8823, r8800, r8801; +} +{ +mul.f16x2 r8826, r8823, r8791; +} +{ +add.f16x2 r8829, r8803, r8826; +} +{ +add.f16x2 r8832, r8806, r8807; +} +{ +mul.f16x2 r8835, r8832, r8793; +} +{ +add.f16x2 r8838, r8829, r8835; +} +{ +sub.f16x2 r8841, r8812, r8813; +} +{ +mul.f16x2 r8844, r8841, r8792; +} +{ +sub.f16x2 r8847, r8818, r8819; +} +{ +mul.f16x2 r8850, r8847, r8794; +} +{ +add.f16x2 r8853, r8844, r8850; +} +{ +sub.f16x2 %10, r8838, r8853; +} +{ +add.f16x2 r8859, r8800, r8801; +} +{ +mul.f16x2 r8862, r8859, r8791; +} +{ +add.f16x2 r8865, r8803, r8862; +} +{ +add.f16x2 r8868, r8806, r8807; +} +{ +mul.f16x2 r8871, r8868, r8793; +} +{ +add.f16x2 r8874, r8865, r8871; +} +{ +sub.f16x2 r8877, r8812, r8813; +} +{ +mul.f16x2 r8880, r8877, r8792; +} +{ +sub.f16x2 r8883, r8818, r8819; +} +{ +mul.f16x2 r8886, r8883, r8794; +} +{ +add.f16x2 r8889, r8880, r8886; +} +{ +add.f16x2 %40, r8874, r8889; 
+} +{ +add.f16x2 r8895, r8800, r8801; +} +{ +mul.f16x2 r8898, r8895, r8793; +} +{ +add.f16x2 r8901, r8803, r8898; +} +{ +add.f16x2 r8904, r8806, r8807; +} +{ +mul.f16x2 r8907, r8904, r8795; +} +{ +add.f16x2 r8910, r8901, r8907; +} +{ +sub.f16x2 r8913, r8812, r8813; +} +{ +mul.f16x2 r8916, r8913, r8794; +} +{ +sub.f16x2 r8919, r8818, r8819; +} +{ +mul.f16x2 r8922, r8919, r8797; +} +{ +add.f16x2 r8925, r8916, r8922; +} +{ +sub.f16x2 %20, r8910, r8925; +} +{ +add.f16x2 r8931, r8800, r8801; +} +{ +mul.f16x2 r8934, r8931, r8793; +} +{ +add.f16x2 r8937, r8803, r8934; +} +{ +add.f16x2 r8940, r8806, r8807; +} +{ +mul.f16x2 r8943, r8940, r8795; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ +sub.f16x2 r8949, r8812, r8813; +} +{ +mul.f16x2 r8952, r8949, r8794; +} +{ +sub.f16x2 r8955, r8818, r8819; +} +{ +mul.f16x2 r8958, r8955, r8797; +} +{ +add.f16x2 r8961, r8952, r8958; +} +{ +add.f16x2 %30, r8946, r8961; +} +{ +add.f16x2 r8967, r8812, r8813; +} +{ +mul.f16x2 r8970, r8967, r8791; +} +{ +add.f16x2 r8973, r8815, r8970; +} +{ +add.f16x2 r8976, r8818, r8819; +} +{ +mul.f16x2 r8979, r8976, r8793; +} +{ +add.f16x2 r8982, r8973, r8979; +} +{ +sub.f16x2 r8985, r8800, r8801; +} +{ +mul.f16x2 r8988, r8985, r8792; +} +{ +sub.f16x2 r8991, r8806, r8807; +} +{ +mul.f16x2 r8994, r8991, r8794; +} +{ +add.f16x2 r8997, r8988, r8994; +} +{ +add.f16x2 %11, r8982, r8997; +} +{ +add.f16x2 r9003, r8812, r8813; +} +{ +mul.f16x2 r9006, r9003, r8791; +} +{ +add.f16x2 r9009, r8815, r9006; +} +{ +add.f16x2 r9012, r8818, r8819; +} +{ +mul.f16x2 r9015, r9012, r8793; +} +{ +add.f16x2 r9018, r9009, r9015; +} +{ +sub.f16x2 r9021, r8800, r8801; +} +{ +mul.f16x2 r9024, r9021, r8792; +} +{ +sub.f16x2 r9027, r8806, r8807; +} +{ +mul.f16x2 r9030, r9027, r8794; +} +{ +add.f16x2 r9033, r9024, r9030; +} +{ +sub.f16x2 %41, r9018, r9033; +} +{ +add.f16x2 r9039, r8812, r8813; +} +{ +mul.f16x2 r9042, r9039, r8793; +} +{ +add.f16x2 r9045, r8815, r9042; +} +{ +add.f16x2 r9048, r8818, r8819; +} +{ +mul.f16x2 r9051, r9048, 
r8795; +} +{ +add.f16x2 r9054, r9045, r9051; +} +{ +sub.f16x2 r9057, r8800, r8801; +} +{ +mul.f16x2 r9060, r9057, r8794; +} +{ +sub.f16x2 r9063, r8806, r8807; +} +{ +mul.f16x2 r9066, r9063, r8797; +} +{ +add.f16x2 r9069, r9060, r9066; +} +{ +add.f16x2 %21, r9054, r9069; +} +{ +add.f16x2 r9075, r8812, r8813; +} +{ +mul.f16x2 r9078, r9075, r8793; +} +{ +add.f16x2 r9081, r8815, r9078; +} +{ +add.f16x2 r9084, r8818, r8819; +} +{ +mul.f16x2 r9087, r9084, r8795; +} +{ +add.f16x2 r9090, r9081, r9087; +} +{ +sub.f16x2 r9093, r8800, r8801; +} +{ +mul.f16x2 r9096, r9093, r8794; +} +{ +sub.f16x2 r9099, r8806, r8807; +} +{ +mul.f16x2 r9102, r9099, r8797; +} +{ +add.f16x2 r9105, r9096, r9102; +} +{ +sub.f16x2 %31, r9090, r9105; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9116, {low, high}; +} +{ +neg.f16x2 r9117, r9116; +} +{ +add.f16x2 r9119, r9120, r9121; +} +{ +add.f16x2 r9122, r9123, r9119; +} +{ +add.f16x2 r9125, r9126, r9127; +} +{ +add.f16x2 %2, r9122, r9125; +} +{ +add.f16x2 r9131, r9132, r9133; +} +{ +add.f16x2 r9134, r9135, r9131; +} +{ +add.f16x2 r9137, r9138, r9139; +} +{ +add.f16x2 %3, r9134, r9137; +} +{ +add.f16x2 r9143, r9120, r9121; +} +{ +mul.f16x2 r9146, r9143, r9111; +} +{ +add.f16x2 r9149, r9123, r9146; +} +{ +add.f16x2 r9152, r9126, r9127; +} +{ +mul.f16x2 r9155, r9152, r9113; +} +{ +add.f16x2 r9158, r9149, r9155; +} +{ +sub.f16x2 r9161, r9132, r9133; +} +{ 
+mul.f16x2 r9164, r9161, r9112; +} +{ +sub.f16x2 r9167, r9138, r9139; +} +{ +mul.f16x2 r9170, r9167, r9114; +} +{ +add.f16x2 r9173, r9164, r9170; +} +{ +sub.f16x2 %12, r9158, r9173; +} +{ +add.f16x2 r9179, r9120, r9121; +} +{ +mul.f16x2 r9182, r9179, r9111; +} +{ +add.f16x2 r9185, r9123, r9182; +} +{ +add.f16x2 r9188, r9126, r9127; +} +{ +mul.f16x2 r9191, r9188, r9113; +} +{ +add.f16x2 r9194, r9185, r9191; +} +{ +sub.f16x2 r9197, r9132, r9133; +} +{ +mul.f16x2 r9200, r9197, r9112; +} +{ +sub.f16x2 r9203, r9138, r9139; +} +{ +mul.f16x2 r9206, r9203, r9114; +} +{ +add.f16x2 r9209, r9200, r9206; +} +{ +add.f16x2 %42, r9194, r9209; +} +{ +add.f16x2 r9215, r9120, r9121; +} +{ +mul.f16x2 r9218, r9215, r9113; +} +{ +add.f16x2 r9221, r9123, r9218; +} +{ +add.f16x2 r9224, r9126, r9127; +} +{ +mul.f16x2 r9227, r9224, r9115; +} +{ +add.f16x2 r9230, r9221, r9227; +} +{ +sub.f16x2 r9233, r9132, r9133; +} +{ +mul.f16x2 r9236, r9233, r9114; +} +{ +sub.f16x2 r9239, r9138, r9139; +} +{ +mul.f16x2 r9242, r9239, r9117; +} +{ +add.f16x2 r9245, r9236, r9242; +} +{ +sub.f16x2 %22, r9230, r9245; +} +{ +add.f16x2 r9251, r9120, r9121; +} +{ +mul.f16x2 r9254, r9251, r9113; +} +{ +add.f16x2 r9257, r9123, r9254; +} +{ +add.f16x2 r9260, r9126, r9127; +} +{ +mul.f16x2 r9263, r9260, r9115; +} +{ +add.f16x2 r9266, r9257, r9263; +} +{ +sub.f16x2 r9269, r9132, r9133; +} +{ +mul.f16x2 r9272, r9269, r9114; +} +{ +sub.f16x2 r9275, r9138, r9139; +} +{ +mul.f16x2 r9278, r9275, r9117; +} +{ +add.f16x2 r9281, r9272, r9278; +} +{ +add.f16x2 %32, r9266, r9281; +} +{ +add.f16x2 r9287, r9132, r9133; +} +{ +mul.f16x2 r9290, r9287, r9111; +} +{ +add.f16x2 r9293, r9135, r9290; +} +{ +add.f16x2 r9296, r9138, r9139; +} +{ +mul.f16x2 r9299, r9296, r9113; +} +{ +add.f16x2 r9302, r9293, r9299; +} +{ +sub.f16x2 r9305, r9120, r9121; +} +{ +mul.f16x2 r9308, r9305, r9112; +} +{ +sub.f16x2 r9311, r9126, r9127; +} +{ +mul.f16x2 r9314, r9311, r9114; +} +{ +add.f16x2 r9317, r9308, r9314; +} +{ +add.f16x2 %13, r9302, r9317; 
+} +{ +add.f16x2 r9323, r9132, r9133; +} +{ +mul.f16x2 r9326, r9323, r9111; +} +{ +add.f16x2 r9329, r9135, r9326; +} +{ +add.f16x2 r9332, r9138, r9139; +} +{ +mul.f16x2 r9335, r9332, r9113; +} +{ +add.f16x2 r9338, r9329, r9335; +} +{ +sub.f16x2 r9341, r9120, r9121; +} +{ +mul.f16x2 r9344, r9341, r9112; +} +{ +sub.f16x2 r9347, r9126, r9127; +} +{ +mul.f16x2 r9350, r9347, r9114; +} +{ +add.f16x2 r9353, r9344, r9350; +} +{ +sub.f16x2 %43, r9338, r9353; +} +{ +add.f16x2 r9359, r9132, r9133; +} +{ +mul.f16x2 r9362, r9359, r9113; +} +{ +add.f16x2 r9365, r9135, r9362; +} +{ +add.f16x2 r9368, r9138, r9139; +} +{ +mul.f16x2 r9371, r9368, r9115; +} +{ +add.f16x2 r9374, r9365, r9371; +} +{ +sub.f16x2 r9377, r9120, r9121; +} +{ +mul.f16x2 r9380, r9377, r9114; +} +{ +sub.f16x2 r9383, r9126, r9127; +} +{ +mul.f16x2 r9386, r9383, r9117; +} +{ +add.f16x2 r9389, r9380, r9386; +} +{ +add.f16x2 %23, r9374, r9389; +} +{ +add.f16x2 r9395, r9132, r9133; +} +{ +mul.f16x2 r9398, r9395, r9113; +} +{ +add.f16x2 r9401, r9135, r9398; +} +{ +add.f16x2 r9404, r9138, r9139; +} +{ +mul.f16x2 r9407, r9404, r9115; +} +{ +add.f16x2 r9410, r9401, r9407; +} +{ +sub.f16x2 r9413, r9120, r9121; +} +{ +mul.f16x2 r9416, r9413, r9114; +} +{ +sub.f16x2 r9419, r9126, r9127; +} +{ +mul.f16x2 r9422, r9419, r9117; +} +{ +add.f16x2 r9425, r9416, r9422; +} +{ +sub.f16x2 %33, r9410, r9425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9431, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9434, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9435, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, 
f596; +mov.b32 r9436, {low, high}; +} +{ +neg.f16x2 r9437, r9436; +} +{ +add.f16x2 r9439, r9440, r9441; +} +{ +add.f16x2 r9442, r9443, r9439; +} +{ +add.f16x2 r9445, r9446, r9447; +} +{ +add.f16x2 %4, r9442, r9445; +} +{ +add.f16x2 r9451, r9452, r9453; +} +{ +add.f16x2 r9454, r9455, r9451; +} +{ +add.f16x2 r9457, r9458, r9459; +} +{ +add.f16x2 %5, r9454, r9457; +} +{ +add.f16x2 r9463, r9440, r9441; +} +{ +mul.f16x2 r9466, r9463, r9431; +} +{ +add.f16x2 r9469, r9443, r9466; +} +{ +add.f16x2 r9472, r9446, r9447; +} +{ +mul.f16x2 r9475, r9472, r9433; +} +{ +add.f16x2 r9478, r9469, r9475; +} +{ +sub.f16x2 r9481, r9452, r9453; +} +{ +mul.f16x2 r9484, r9481, r9432; +} +{ +sub.f16x2 r9487, r9458, r9459; +} +{ +mul.f16x2 r9490, r9487, r9434; +} +{ +add.f16x2 r9493, r9484, r9490; +} +{ +sub.f16x2 %14, r9478, r9493; +} +{ +add.f16x2 r9499, r9440, r9441; +} +{ +mul.f16x2 r9502, r9499, r9431; +} +{ +add.f16x2 r9505, r9443, r9502; +} +{ +add.f16x2 r9508, r9446, r9447; +} +{ +mul.f16x2 r9511, r9508, r9433; +} +{ +add.f16x2 r9514, r9505, r9511; +} +{ +sub.f16x2 r9517, r9452, r9453; +} +{ +mul.f16x2 r9520, r9517, r9432; +} +{ +sub.f16x2 r9523, r9458, r9459; +} +{ +mul.f16x2 r9526, r9523, r9434; +} +{ +add.f16x2 r9529, r9520, r9526; +} +{ +add.f16x2 %44, r9514, r9529; +} +{ +add.f16x2 r9535, r9440, r9441; +} +{ +mul.f16x2 r9538, r9535, r9433; +} +{ +add.f16x2 r9541, r9443, r9538; +} +{ +add.f16x2 r9544, r9446, r9447; +} +{ +mul.f16x2 r9547, r9544, r9435; +} +{ +add.f16x2 r9550, r9541, r9547; +} +{ +sub.f16x2 r9553, r9452, r9453; +} +{ +mul.f16x2 r9556, r9553, r9434; +} +{ +sub.f16x2 r9559, r9458, r9459; +} +{ +mul.f16x2 r9562, r9559, r9437; +} +{ +add.f16x2 r9565, r9556, r9562; +} +{ +sub.f16x2 %24, r9550, r9565; +} +{ +add.f16x2 r9571, r9440, r9441; +} +{ +mul.f16x2 r9574, r9571, r9433; +} +{ +add.f16x2 r9577, r9443, r9574; +} +{ +add.f16x2 r9580, r9446, r9447; +} +{ +mul.f16x2 r9583, r9580, r9435; +} +{ +add.f16x2 r9586, r9577, r9583; +} +{ +sub.f16x2 r9589, r9452, r9453; +} +{ 
+mul.f16x2 r9592, r9589, r9434; +} +{ +sub.f16x2 r9595, r9458, r9459; +} +{ +mul.f16x2 r9598, r9595, r9437; +} +{ +add.f16x2 r9601, r9592, r9598; +} +{ +add.f16x2 %34, r9586, r9601; +} +{ +add.f16x2 r9607, r9452, r9453; +} +{ +mul.f16x2 r9610, r9607, r9431; +} +{ +add.f16x2 r9613, r9455, r9610; +} +{ +add.f16x2 r9616, r9458, r9459; +} +{ +mul.f16x2 r9619, r9616, r9433; +} +{ +add.f16x2 r9622, r9613, r9619; +} +{ +sub.f16x2 r9625, r9440, r9441; +} +{ +mul.f16x2 r9628, r9625, r9432; +} +{ +sub.f16x2 r9631, r9446, r9447; +} +{ +mul.f16x2 r9634, r9631, r9434; +} +{ +add.f16x2 r9637, r9628, r9634; +} +{ +add.f16x2 %15, r9622, r9637; +} +{ +add.f16x2 r9643, r9452, r9453; +} +{ +mul.f16x2 r9646, r9643, r9431; +} +{ +add.f16x2 r9649, r9455, r9646; +} +{ +add.f16x2 r9652, r9458, r9459; +} +{ +mul.f16x2 r9655, r9652, r9433; +} +{ +add.f16x2 r9658, r9649, r9655; +} +{ +sub.f16x2 r9661, r9440, r9441; +} +{ +mul.f16x2 r9664, r9661, r9432; +} +{ +sub.f16x2 r9667, r9446, r9447; +} +{ +mul.f16x2 r9670, r9667, r9434; +} +{ +add.f16x2 r9673, r9664, r9670; +} +{ +sub.f16x2 %45, r9658, r9673; +} +{ +add.f16x2 r9679, r9452, r9453; +} +{ +mul.f16x2 r9682, r9679, r9433; +} +{ +add.f16x2 r9685, r9455, r9682; +} +{ +add.f16x2 r9688, r9458, r9459; +} +{ +mul.f16x2 r9691, r9688, r9435; +} +{ +add.f16x2 r9694, r9685, r9691; +} +{ +sub.f16x2 r9697, r9440, r9441; +} +{ +mul.f16x2 r9700, r9697, r9434; +} +{ +sub.f16x2 r9703, r9446, r9447; +} +{ +mul.f16x2 r9706, r9703, r9437; +} +{ +add.f16x2 r9709, r9700, r9706; +} +{ +add.f16x2 %25, r9694, r9709; +} +{ +add.f16x2 r9715, r9452, r9453; +} +{ +mul.f16x2 r9718, r9715, r9433; +} +{ +add.f16x2 r9721, r9455, r9718; +} +{ +add.f16x2 r9724, r9458, r9459; +} +{ +mul.f16x2 r9727, r9724, r9435; +} +{ +add.f16x2 r9730, r9721, r9727; +} +{ +sub.f16x2 r9733, r9440, r9441; +} +{ +mul.f16x2 r9736, r9733, r9434; +} +{ +sub.f16x2 r9739, r9446, r9447; +} +{ +mul.f16x2 r9742, r9739, r9437; +} +{ +add.f16x2 r9745, r9736, r9742; +} +{ +sub.f16x2 %35, r9730, r9745; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9754, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9755, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9756, {low, high}; +} +{ +neg.f16x2 r9757, r9756; +} +{ +add.f16x2 r9759, r9760, r9761; +} +{ +add.f16x2 r9762, r9763, r9759; +} +{ +add.f16x2 r9765, r9766, r9767; +} +{ +add.f16x2 %6, r9762, r9765; +} +{ +add.f16x2 r9771, r9772, r9773; +} +{ +add.f16x2 r9774, r9775, r9771; +} +{ +add.f16x2 r9777, r9778, r9779; +} +{ +add.f16x2 %7, r9774, r9777; +} +{ +add.f16x2 r9783, r9760, r9761; +} +{ +mul.f16x2 r9786, r9783, r9751; +} +{ +add.f16x2 r9789, r9763, r9786; +} +{ +add.f16x2 r9792, r9766, r9767; +} +{ +mul.f16x2 r9795, r9792, r9753; +} +{ +add.f16x2 r9798, r9789, r9795; +} +{ +sub.f16x2 r9801, r9772, r9773; +} +{ +mul.f16x2 r9804, r9801, r9752; +} +{ +sub.f16x2 r9807, r9778, r9779; +} +{ +mul.f16x2 r9810, r9807, r9754; +} +{ +add.f16x2 r9813, r9804, r9810; +} +{ +sub.f16x2 %16, r9798, r9813; +} +{ +add.f16x2 r9819, r9760, r9761; +} +{ +mul.f16x2 r9822, r9819, r9751; +} +{ +add.f16x2 r9825, r9763, r9822; +} +{ +add.f16x2 r9828, r9766, r9767; +} +{ +mul.f16x2 r9831, r9828, r9753; +} +{ +add.f16x2 r9834, r9825, r9831; +} +{ +sub.f16x2 r9837, r9772, r9773; +} +{ +mul.f16x2 r9840, r9837, r9752; +} +{ +sub.f16x2 r9843, r9778, r9779; +} +{ +mul.f16x2 r9846, r9843, r9754; +} +{ +add.f16x2 r9849, r9840, r9846; +} +{ +add.f16x2 %46, r9834, r9849; +} +{ +add.f16x2 r9855, r9760, r9761; +} +{ +mul.f16x2 r9858, r9855, r9753; +} +{ 
+add.f16x2 r9861, r9763, r9858; +} +{ +add.f16x2 r9864, r9766, r9767; +} +{ +mul.f16x2 r9867, r9864, r9755; +} +{ +add.f16x2 r9870, r9861, r9867; +} +{ +sub.f16x2 r9873, r9772, r9773; +} +{ +mul.f16x2 r9876, r9873, r9754; +} +{ +sub.f16x2 r9879, r9778, r9779; +} +{ +mul.f16x2 r9882, r9879, r9757; +} +{ +add.f16x2 r9885, r9876, r9882; +} +{ +sub.f16x2 %26, r9870, r9885; +} +{ +add.f16x2 r9891, r9760, r9761; +} +{ +mul.f16x2 r9894, r9891, r9753; +} +{ +add.f16x2 r9897, r9763, r9894; +} +{ +add.f16x2 r9900, r9766, r9767; +} +{ +mul.f16x2 r9903, r9900, r9755; +} +{ +add.f16x2 r9906, r9897, r9903; +} +{ +sub.f16x2 r9909, r9772, r9773; +} +{ +mul.f16x2 r9912, r9909, r9754; +} +{ +sub.f16x2 r9915, r9778, r9779; +} +{ +mul.f16x2 r9918, r9915, r9757; +} +{ +add.f16x2 r9921, r9912, r9918; +} +{ +add.f16x2 %36, r9906, r9921; +} +{ +add.f16x2 r9927, r9772, r9773; +} +{ +mul.f16x2 r9930, r9927, r9751; +} +{ +add.f16x2 r9933, r9775, r9930; +} +{ +add.f16x2 r9936, r9778, r9779; +} +{ +mul.f16x2 r9939, r9936, r9753; +} +{ +add.f16x2 r9942, r9933, r9939; +} +{ +sub.f16x2 r9945, r9760, r9761; +} +{ +mul.f16x2 r9948, r9945, r9752; +} +{ +sub.f16x2 r9951, r9766, r9767; +} +{ +mul.f16x2 r9954, r9951, r9754; +} +{ +add.f16x2 r9957, r9948, r9954; +} +{ +add.f16x2 %17, r9942, r9957; +} +{ +add.f16x2 r9963, r9772, r9773; +} +{ +mul.f16x2 r9966, r9963, r9751; +} +{ +add.f16x2 r9969, r9775, r9966; +} +{ +add.f16x2 r9972, r9778, r9779; +} +{ +mul.f16x2 r9975, r9972, r9753; +} +{ +add.f16x2 r9978, r9969, r9975; +} +{ +sub.f16x2 r9981, r9760, r9761; +} +{ +mul.f16x2 r9984, r9981, r9752; +} +{ +sub.f16x2 r9987, r9766, r9767; +} +{ +mul.f16x2 r9990, r9987, r9754; +} +{ +add.f16x2 r9993, r9984, r9990; +} +{ +sub.f16x2 %47, r9978, r9993; +} +{ +add.f16x2 r9999, r9772, r9773; +} +{ +mul.f16x2 r10002, r9999, r9753; +} +{ +add.f16x2 r10005, r9775, r10002; +} +{ +add.f16x2 r10008, r9778, r9779; +} +{ +mul.f16x2 r10011, r10008, r9755; +} +{ +add.f16x2 r10014, r10005, r10011; +} +{ +sub.f16x2 r10017, 
r9760, r9761; +} +{ +mul.f16x2 r10020, r10017, r9754; +} +{ +sub.f16x2 r10023, r9766, r9767; +} +{ +mul.f16x2 r10026, r10023, r9757; +} +{ +add.f16x2 r10029, r10020, r10026; +} +{ +add.f16x2 %27, r10014, r10029; +} +{ +add.f16x2 r10035, r9772, r9773; +} +{ +mul.f16x2 r10038, r10035, r9753; +} +{ +add.f16x2 r10041, r9775, r10038; +} +{ +add.f16x2 r10044, r9778, r9779; +} +{ +mul.f16x2 r10047, r10044, r9755; +} +{ +add.f16x2 r10050, r10041, r10047; +} +{ +sub.f16x2 r10053, r9760, r9761; +} +{ +mul.f16x2 r10056, r10053, r9754; +} +{ +sub.f16x2 r10059, r9766, r9767; +} +{ +mul.f16x2 r10062, r10059, r9757; +} +{ +add.f16x2 r10065, r10056, r10062; +} +{ +sub.f16x2 %37, r10050, r10065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10071, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10072, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r10073, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r10074, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10075, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10076, {low, high}; +} +{ +neg.f16x2 r10077, r10076; +} +{ +add.f16x2 r10079, r10080, r10081; +} +{ +add.f16x2 r10082, r10083, r10079; +} +{ +add.f16x2 r10085, r10086, r10087; +} +{ +add.f16x2 %8, r10082, r10085; +} +{ +add.f16x2 r10091, r10092, r10093; +} +{ +add.f16x2 r10094, r10095, r10091; +} +{ +add.f16x2 r10097, r10098, r10099; +} +{ +add.f16x2 %9, r10094, r10097; +} +{ +add.f16x2 r10103, r10080, r10081; +} +{ +mul.f16x2 r10106, r10103, r10071; +} +{ +add.f16x2 r10109, r10083, r10106; +} +{ +add.f16x2 r10112, r10086, r10087; +} +{ +mul.f16x2 r10115, r10112, r10073; +} +{ +add.f16x2 r10118, r10109, r10115; +} +{ +sub.f16x2 r10121, r10092, 
r10093; +} +{ +mul.f16x2 r10124, r10121, r10072; +} +{ +sub.f16x2 r10127, r10098, r10099; +} +{ +mul.f16x2 r10130, r10127, r10074; +} +{ +add.f16x2 r10133, r10124, r10130; +} +{ +sub.f16x2 %18, r10118, r10133; +} +{ +add.f16x2 r10139, r10080, r10081; +} +{ +mul.f16x2 r10142, r10139, r10071; +} +{ +add.f16x2 r10145, r10083, r10142; +} +{ +add.f16x2 r10148, r10086, r10087; +} +{ +mul.f16x2 r10151, r10148, r10073; +} +{ +add.f16x2 r10154, r10145, r10151; +} +{ +sub.f16x2 r10157, r10092, r10093; +} +{ +mul.f16x2 r10160, r10157, r10072; +} +{ +sub.f16x2 r10163, r10098, r10099; +} +{ +mul.f16x2 r10166, r10163, r10074; +} +{ +add.f16x2 r10169, r10160, r10166; +} +{ +add.f16x2 %48, r10154, r10169; +} +{ +add.f16x2 r10175, r10080, r10081; +} +{ +mul.f16x2 r10178, r10175, r10073; +} +{ +add.f16x2 r10181, r10083, r10178; +} +{ +add.f16x2 r10184, r10086, r10087; +} +{ +mul.f16x2 r10187, r10184, r10075; +} +{ +add.f16x2 r10190, r10181, r10187; +} +{ +sub.f16x2 r10193, r10092, r10093; +} +{ +mul.f16x2 r10196, r10193, r10074; +} +{ +sub.f16x2 r10199, r10098, r10099; +} +{ +mul.f16x2 r10202, r10199, r10077; +} +{ +add.f16x2 r10205, r10196, r10202; +} +{ +sub.f16x2 %28, r10190, r10205; +} +{ +add.f16x2 r10211, r10080, r10081; +} +{ +mul.f16x2 r10214, r10211, r10073; +} +{ +add.f16x2 r10217, r10083, r10214; +} +{ +add.f16x2 r10220, r10086, r10087; +} +{ +mul.f16x2 r10223, r10220, r10075; +} +{ +add.f16x2 r10226, r10217, r10223; +} +{ +sub.f16x2 r10229, r10092, r10093; +} +{ +mul.f16x2 r10232, r10229, r10074; +} +{ +sub.f16x2 r10235, r10098, r10099; +} +{ +mul.f16x2 r10238, r10235, r10077; +} +{ +add.f16x2 r10241, r10232, r10238; +} +{ +add.f16x2 %38, r10226, r10241; +} +{ +add.f16x2 r10247, r10092, r10093; +} +{ +mul.f16x2 r10250, r10247, r10071; +} +{ +add.f16x2 r10253, r10095, r10250; +} +{ +add.f16x2 r10256, r10098, r10099; +} +{ +mul.f16x2 r10259, r10256, r10073; +} +{ +add.f16x2 r10262, r10253, r10259; +} +{ +sub.f16x2 r10265, r10080, r10081; +} +{ +mul.f16x2 r10268, r10265, 
r10072; +} +{ +sub.f16x2 r10271, r10086, r10087; +} +{ +mul.f16x2 r10274, r10271, r10074; +} +{ +add.f16x2 r10277, r10268, r10274; +} +{ +add.f16x2 %19, r10262, r10277; +} +{ +add.f16x2 r10283, r10092, r10093; +} +{ +mul.f16x2 r10286, r10283, r10071; +} +{ +add.f16x2 r10289, r10095, r10286; +} +{ +add.f16x2 r10292, r10098, r10099; +} +{ +mul.f16x2 r10295, r10292, r10073; +} +{ +add.f16x2 r10298, r10289, r10295; +} +{ +sub.f16x2 r10301, r10080, r10081; +} +{ +mul.f16x2 r10304, r10301, r10072; +} +{ +sub.f16x2 r10307, r10086, r10087; +} +{ +mul.f16x2 r10310, r10307, r10074; +} +{ +add.f16x2 r10313, r10304, r10310; +} +{ +sub.f16x2 %49, r10298, r10313; +} +{ +add.f16x2 r10319, r10092, r10093; +} +{ +mul.f16x2 r10322, r10319, r10073; +} +{ +add.f16x2 r10325, r10095, r10322; +} +{ +add.f16x2 r10328, r10098, r10099; +} +{ +mul.f16x2 r10331, r10328, r10075; +} +{ +add.f16x2 r10334, r10325, r10331; +} +{ +sub.f16x2 r10337, r10080, r10081; +} +{ +mul.f16x2 r10340, r10337, r10074; +} +{ +sub.f16x2 r10343, r10086, r10087; +} +{ +mul.f16x2 r10346, r10343, r10077; +} +{ +add.f16x2 r10349, r10340, r10346; +} +{ +add.f16x2 %29, r10334, r10349; +} +{ +add.f16x2 r10355, r10092, r10093; +} +{ +mul.f16x2 r10358, r10355, r10073; +} +{ +add.f16x2 r10361, r10095, r10358; +} +{ +add.f16x2 r10364, r10098, r10099; +} +{ +mul.f16x2 r10367, r10364, r10075; +} +{ +add.f16x2 r10370, r10361, r10367; +} +{ +sub.f16x2 r10373, r10080, r10081; +} +{ +mul.f16x2 r10376, r10373, r10074; +} +{ +sub.f16x2 r10379, r10086, r10087; +} +{ +mul.f16x2 r10382, r10379, r10077; +} +{ +add.f16x2 r10385, r10376, r10382; +} +{ +sub.f16x2 %39, r10370, r10385; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), 
"r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<914, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<121>; +.reg .b32 r<2234>; +.reg .b64 rd<10>; +mov.u32 r2205, %tid.x; +mov.f32 f106, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1, {low, high}; +} +mov.f32 f108, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r2, {low, high}; +} +mov.f32 f102, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r3, {low, high}; +} +mov.f32 f104, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ 
+add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ 
+mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r2205, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r2206, rd3; +mul.lo.s32 r2207, r2206, 625; +sub.s32 r2208, r2205, r2207; +cvt.rn.f32.u32 f109, r2208; +mul.f32 f110, f109, 0f3B03C498; +cos.approx.f32 f13, f110; +sin.approx.f32 f111, f110; +neg.f32 f14, f111; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +mov.u32 r2209, %tid.y; +mov.u32 r2210, %10; +mad.lo.s32 r2211, r2209, 25000, r2210; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 
r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f93, 0fBF800000; +mov.f32 f94, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +mad.lo.s32 r2212, r2206, 25000, r2211; +barrier.sync 0; +mad.lo.s32 r2213, r2208, 40, r2212; +st.shared.v2.f32 [r2213], {r18, r30}; +st.shared.v2.f32 [r2213+8], {r333, r340}; +st.shared.v2.f32 [r2213+16], {r370, r377}; +st.shared.v2.f32 [r2213+24], {r407, r414}; +st.shared.v2.f32 [r2213+32], {r444, r451}; +barrier.sync 0; +shl.b32 r2214, r2208, 5; +sub.s32 r2215, r2213, r2214; +ld.shared.u32 r484, [r2215]; +ld.shared.u32 r496, [r2215+4]; +ld.shared.u32 r481, [r2215+5000]; +ld.shared.u32 r493, [r2215+5004]; +ld.shared.u32 r487, [r2215+10000]; +ld.shared.u32 r499, [r2215+10004]; +ld.shared.u32 r488, [r2215+15000]; +ld.shared.u32 r500, [r2215+15004]; +ld.shared.u32 r482, [r2215+20000]; +ld.shared.u32 r494, [r2215+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, 
f106; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ 
+sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r2208, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r2216, rd5; +cvt.rn.f32.u32 f112, r2216; +mul.f32 f113, f112, 0f3C24B5BE; +cos.approx.f32 f37, f113; 
+sin.approx.f32 f114, f113; +neg.f32 f38, f114; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +mul.lo.s32 r2217, r2216, 5; +sub.s32 r2218, r2208, r2217; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +shl.b32 r2219, r2218, 3; +add.s32 r2220, r2212, r2219; +barrier.sync 0; +mad.lo.s32 r2221, r2216, 200, r2220; +st.shared.u32 [r2221], r489; +st.shared.u32 [r2221+4], r501; +st.shared.u32 [r2221+40], r804; +st.shared.u32 [r2221+44], r811; +st.shared.u32 [r2221+80], r841; +st.shared.u32 [r2221+84], r848; +st.shared.u32 [r2221+120], r878; +st.shared.u32 [r2221+124], r885; +st.shared.u32 [r2221+160], r915; +st.shared.u32 [r2221+164], r922; +barrier.sync 0; +ld.shared.u32 r955, [r2215]; +ld.shared.u32 r967, [r2215+4]; +ld.shared.u32 r952, [r2215+5000]; +ld.shared.u32 r964, [r2215+5004]; +ld.shared.u32 r958, [r2215+10000]; +ld.shared.u32 r970, [r2215+10004]; +ld.shared.u32 r959, [r2215+15000]; +ld.shared.u32 r971, [r2215+15004]; +ld.shared.u32 r953, [r2215+20000]; +ld.shared.u32 r965, [r2215+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r943, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 r1008, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ +add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 r1044, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ +mul.f16x2 r1068, r1065, 
r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ +sub.f16x2 r1080, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 r1116, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 r1152, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 r1188, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 r1224, r1206, r1221; +} +{ +add.f16x2 r1227, r964, r965; +} +{ +mul.f16x2 r1230, r1227, r945; 
+} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, r1248, r1254; +} +{ +sub.f16x2 r1260, r1242, r1257; +} +mul.wide.u32 rd6, r2208, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r2222, rd7; +cvt.rn.f32.u32 f115, r2222; +mul.f32 f116, f115, 0f3D4DE32E; +cos.approx.f32 f61, f116; +sin.approx.f32 f117, f116; +neg.f32 f62, f117; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1263, {low, high}; +} +mul.lo.s32 r2223, r2222, 25; +sub.s32 r2224, r2208, r2223; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1266, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1268, {high, high}; +} +{ +mul.f16x2 r1270, r1152, r1268; +} +{ +neg.f16x2 r1273, r1270; +} +{ +fma.rn.f16x2 r1275, r1008, r1266, r1273; +} +{ +mul.f16x2 r1279, r1008, r1268; +} +{ +fma.rn.f16x2 r1282, r1152, r1266, r1279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1286, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1288, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1290, {low, high}; +} +{ +mul.f16x2 r1291, r1288, r1290; +} +{ +mul.f16x2 r1294, r1263, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1297, {high, low}; +} +{ +fma.rn.f16x2 r1299, r1291, r1297, r1294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1305, {high, high}; +} +{ +mul.f16x2 r1307, r1224, r1305; +} +{ +neg.f16x2 r1310, r1307; +} +{ +fma.rn.f16x2 r1312, r1080, r1303, r1310; +} +{ +mul.f16x2 r1316, r1080, r1305; +} +{ +fma.rn.f16x2 r1319, r1224, r1303, 
r1316; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1325, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1327, {low, high}; +} +{ +mul.f16x2 r1328, r1325, r1327; +} +{ +mul.f16x2 r1331, r1299, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1334, {high, low}; +} +{ +fma.rn.f16x2 r1336, r1328, r1334, r1331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1340, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1342, {high, high}; +} +{ +mul.f16x2 r1344, r1260, r1342; +} +{ +neg.f16x2 r1347, r1344; +} +{ +fma.rn.f16x2 r1349, r1116, r1340, r1347; +} +{ +mul.f16x2 r1353, r1116, r1342; +} +{ +fma.rn.f16x2 r1356, r1260, r1340, r1353; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1360, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1362, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1364, {low, high}; +} +{ +mul.f16x2 r1365, r1362, r1364; +} +{ +mul.f16x2 r1368, r1336, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1371, {high, low}; +} +{ +fma.rn.f16x2 r1373, r1365, r1371, r1368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1377, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1379, {high, high}; +} +{ +mul.f16x2 r1381, r1188, r1379; +} +{ +neg.f16x2 r1384, r1381; +} +{ +fma.rn.f16x2 r1386, r1044, r1377, r1384; +} +{ +mul.f16x2 r1390, r1044, r1379; +} +{ +fma.rn.f16x2 r1393, r1188, r1377, r1390; +} +shl.b32 r2225, r2224, 3; +add.s32 r2226, r2212, r2225; +barrier.sync 0; +mad.lo.s32 r2227, r2222, 1000, r2226; +st.shared.u32 [r2227], r960; +st.shared.u32 [r2227+4], r972; +st.shared.u32 [r2227+200], r1275; +st.shared.u32 [r2227+204], r1282; 
+st.shared.u32 [r2227+400], r1312; +st.shared.u32 [r2227+404], r1319; +st.shared.u32 [r2227+600], r1349; +st.shared.u32 [r2227+604], r1356; +st.shared.u32 [r2227+800], r1386; +st.shared.u32 [r2227+804], r1393; +barrier.sync 0; +ld.shared.u32 r1426, [r2215]; +ld.shared.u32 r1438, [r2215+4]; +ld.shared.u32 r1423, [r2215+5000]; +ld.shared.u32 r1435, [r2215+5004]; +ld.shared.u32 r1429, [r2215+10000]; +ld.shared.u32 r1441, [r2215+10004]; +ld.shared.u32 r1430, [r2215+15000]; +ld.shared.u32 r1442, [r2215+15004]; +ld.shared.u32 r1424, [r2215+20000]; +ld.shared.u32 r1436, [r2215+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1415, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1419, {low, high}; +} +{ +neg.f16x2 r1420, r1419; +} +{ +add.f16x2 r1422, r1423, r1424; +} +{ +add.f16x2 r1425, r1426, r1422; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +add.f16x2 r1431, r1425, r1428; +} +{ +add.f16x2 r1434, r1435, r1436; +} +{ +add.f16x2 r1437, r1438, r1434; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +add.f16x2 r1443, r1437, r1440; +} +{ +add.f16x2 r1446, r1423, r1424; +} +{ +mul.f16x2 r1449, r1446, r1414; +} +{ +add.f16x2 r1452, r1426, r1449; +} +{ +add.f16x2 r1455, r1429, r1430; +} +{ +mul.f16x2 r1458, r1455, r1416; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +sub.f16x2 r1464, r1435, r1436; +} +{ +mul.f16x2 r1467, r1464, r1415; +} +{ +sub.f16x2 r1470, r1441, r1442; +} +{ +mul.f16x2 r1473, r1470, r1417; +} +{ +add.f16x2 r1476, r1467, 
r1473; +} +{ +sub.f16x2 r1479, r1461, r1476; +} +{ +add.f16x2 r1482, r1423, r1424; +} +{ +mul.f16x2 r1485, r1482, r1414; +} +{ +add.f16x2 r1488, r1426, r1485; +} +{ +add.f16x2 r1491, r1429, r1430; +} +{ +mul.f16x2 r1494, r1491, r1416; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +sub.f16x2 r1500, r1435, r1436; +} +{ +mul.f16x2 r1503, r1500, r1415; +} +{ +sub.f16x2 r1506, r1441, r1442; +} +{ +mul.f16x2 r1509, r1506, r1417; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +add.f16x2 r1515, r1497, r1512; +} +{ +add.f16x2 r1518, r1423, r1424; +} +{ +mul.f16x2 r1521, r1518, r1416; +} +{ +add.f16x2 r1524, r1426, r1521; +} +{ +add.f16x2 r1527, r1429, r1430; +} +{ +mul.f16x2 r1530, r1527, r1418; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1435, r1436; +} +{ +mul.f16x2 r1539, r1536, r1417; +} +{ +sub.f16x2 r1542, r1441, r1442; +} +{ +mul.f16x2 r1545, r1542, r1420; +} +{ +add.f16x2 r1548, r1539, r1545; +} +{ +sub.f16x2 r1551, r1533, r1548; +} +{ +add.f16x2 r1554, r1423, r1424; +} +{ +mul.f16x2 r1557, r1554, r1416; +} +{ +add.f16x2 r1560, r1426, r1557; +} +{ +add.f16x2 r1563, r1429, r1430; +} +{ +mul.f16x2 r1566, r1563, r1418; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +sub.f16x2 r1572, r1435, r1436; +} +{ +mul.f16x2 r1575, r1572, r1417; +} +{ +sub.f16x2 r1578, r1441, r1442; +} +{ +mul.f16x2 r1581, r1578, r1420; +} +{ +add.f16x2 r1584, r1575, r1581; +} +{ +add.f16x2 r1587, r1569, r1584; +} +{ +add.f16x2 r1590, r1435, r1436; +} +{ +mul.f16x2 r1593, r1590, r1414; +} +{ +add.f16x2 r1596, r1438, r1593; +} +{ +add.f16x2 r1599, r1441, r1442; +} +{ +mul.f16x2 r1602, r1599, r1416; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1423, r1424; +} +{ +mul.f16x2 r1611, r1608, r1415; +} +{ +sub.f16x2 r1614, r1429, r1430; +} +{ +mul.f16x2 r1617, r1614, r1417; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +add.f16x2 r1623, r1605, r1620; +} +{ +add.f16x2 r1626, r1435, r1436; +} +{ +mul.f16x2 r1629, r1626, r1414; +} +{ +add.f16x2 r1632, r1438, r1629; +} +{ +add.f16x2 
r1635, r1441, r1442; +} +{ +mul.f16x2 r1638, r1635, r1416; +} +{ +add.f16x2 r1641, r1632, r1638; +} +{ +sub.f16x2 r1644, r1423, r1424; +} +{ +mul.f16x2 r1647, r1644, r1415; +} +{ +sub.f16x2 r1650, r1429, r1430; +} +{ +mul.f16x2 r1653, r1650, r1417; +} +{ +add.f16x2 r1656, r1647, r1653; +} +{ +sub.f16x2 r1659, r1641, r1656; +} +{ +add.f16x2 r1662, r1435, r1436; +} +{ +mul.f16x2 r1665, r1662, r1416; +} +{ +add.f16x2 r1668, r1438, r1665; +} +{ +add.f16x2 r1671, r1441, r1442; +} +{ +mul.f16x2 r1674, r1671, r1418; +} +{ +add.f16x2 r1677, r1668, r1674; +} +{ +sub.f16x2 r1680, r1423, r1424; +} +{ +mul.f16x2 r1683, r1680, r1417; +} +{ +sub.f16x2 r1686, r1429, r1430; +} +{ +mul.f16x2 r1689, r1686, r1420; +} +{ +add.f16x2 r1692, r1683, r1689; +} +{ +add.f16x2 r1695, r1677, r1692; +} +{ +add.f16x2 r1698, r1435, r1436; +} +{ +mul.f16x2 r1701, r1698, r1416; +} +{ +add.f16x2 r1704, r1438, r1701; +} +{ +add.f16x2 r1707, r1441, r1442; +} +{ +mul.f16x2 r1710, r1707, r1418; +} +{ +add.f16x2 r1713, r1704, r1710; +} +{ +sub.f16x2 r1716, r1423, r1424; +} +{ +mul.f16x2 r1719, r1716, r1417; +} +{ +sub.f16x2 r1722, r1429, r1430; +} +{ +mul.f16x2 r1725, r1722, r1420; +} +{ +add.f16x2 r1728, r1719, r1725; +} +{ +sub.f16x2 r1731, r1713, r1728; +} +mul.wide.u32 rd8, r2208, 274877907; +shr.u64 rd9, rd8, 35; +cvt.u32.u64 r2228, rd9; +cvt.rn.f32.u32 f118, r2228; +mul.f32 f119, f118, 0f3E80ADFD; +cos.approx.f32 f85, f119; +sin.approx.f32 f120, f119; +neg.f32 f86, f120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f86; +mov.b32 r1734, {low, high}; +} +mul.lo.s32 r2229, r2228, 125; +sub.s32 r2230, r2208, r2229; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1739, {high, high}; +} +{ +mul.f16x2 r1741, r1623, r1739; +} +{ +neg.f16x2 r1744, r1741; +} +{ +fma.rn.f16x2 r1746, r1479, r1737, r1744; +} +{ +mul.f16x2 r1750, r1479, r1739; +} +{ +fma.rn.f16x2 r1753, r1623, r1737, 
r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1761, {low, high}; +} +{ +mul.f16x2 r1762, r1759, r1761; +} +{ +mul.f16x2 r1765, r1734, r1757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1768, {high, low}; +} +{ +fma.rn.f16x2 r1770, r1762, r1768, r1765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1776, {high, high}; +} +{ +mul.f16x2 r1778, r1695, r1776; +} +{ +neg.f16x2 r1781, r1778; +} +{ +fma.rn.f16x2 r1783, r1551, r1774, r1781; +} +{ +mul.f16x2 r1787, r1551, r1776; +} +{ +fma.rn.f16x2 r1790, r1695, r1774, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1798, {low, high}; +} +{ +mul.f16x2 r1799, r1796, r1798; +} +{ +mul.f16x2 r1802, r1770, r1794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1805, {high, low}; +} +{ +fma.rn.f16x2 r1807, r1799, r1805, r1802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1813, {high, high}; +} +{ +mul.f16x2 r1815, r1731, r1813; +} +{ +neg.f16x2 r1818, r1815; +} +{ +fma.rn.f16x2 r1820, r1587, r1811, r1818; +} +{ +mul.f16x2 r1824, r1587, r1813; +} +{ +fma.rn.f16x2 r1827, r1731, r1811, r1824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1833, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; 
+mov.b32 r1835, {low, high}; +} +{ +mul.f16x2 r1836, r1833, r1835; +} +{ +mul.f16x2 r1839, r1807, r1831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1842, {high, low}; +} +{ +fma.rn.f16x2 r1844, r1836, r1842, r1839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1844; +mov.b32 r1848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1844; +mov.b32 r1850, {high, high}; +} +{ +mul.f16x2 r1852, r1659, r1850; +} +{ +neg.f16x2 r1855, r1852; +} +{ +fma.rn.f16x2 r1857, r1515, r1848, r1855; +} +{ +mul.f16x2 r1861, r1515, r1850; +} +{ +fma.rn.f16x2 r1864, r1659, r1848, r1861; +} +shl.b32 r2231, r2230, 3; +add.s32 r2232, r2212, r2231; +barrier.sync 0; +mad.lo.s32 r2233, r2228, 5000, r2232; +st.shared.u32 [r2233], r1431; +st.shared.u32 [r2233+4], r1443; +st.shared.u32 [r2233+1000], r1746; +st.shared.u32 [r2233+1004], r1753; +st.shared.u32 [r2233+2000], r1783; +st.shared.u32 [r2233+2004], r1790; +st.shared.u32 [r2233+3000], r1820; +st.shared.u32 [r2233+3004], r1827; +st.shared.u32 [r2233+4000], r1857; +st.shared.u32 [r2233+4004], r1864; +barrier.sync 0; +ld.shared.u32 r1897, [r2215]; +ld.shared.u32 r1909, [r2215+4]; +ld.shared.u32 r1894, [r2215+5000]; +ld.shared.u32 r1906, [r2215+5004]; +ld.shared.u32 r1900, [r2215+10000]; +ld.shared.u32 r1912, [r2215+10004]; +ld.shared.u32 r1901, [r2215+15000]; +ld.shared.u32 r1913, [r2215+15004]; +ld.shared.u32 r1895, [r2215+20000]; +ld.shared.u32 r1907, [r2215+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1886, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1887, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1888, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1889, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1890, {low, high}; +} +{ +neg.f16x2 r1891, r1890; +} +{ +add.f16x2 r1893, r1894, r1895; +} +{ +add.f16x2 r1896, r1897, r1893; +} +{ +add.f16x2 r1899, r1900, r1901; +} +{ +add.f16x2 %0, r1896, r1899; +} +{ +add.f16x2 r1905, r1906, r1907; +} +{ +add.f16x2 r1908, r1909, r1905; +} +{ +add.f16x2 r1911, r1912, r1913; +} +{ +add.f16x2 %1, r1908, r1911; +} +{ +add.f16x2 r1917, r1894, r1895; +} +{ +mul.f16x2 r1920, r1917, r1885; +} +{ +add.f16x2 r1923, r1897, r1920; +} +{ +add.f16x2 r1926, r1900, r1901; +} +{ +mul.f16x2 r1929, r1926, r1887; +} +{ +add.f16x2 r1932, r1923, r1929; +} +{ +sub.f16x2 r1935, r1906, r1907; +} +{ +mul.f16x2 r1938, r1935, r1886; +} +{ +sub.f16x2 r1941, r1912, r1913; +} +{ +mul.f16x2 r1944, r1941, r1888; +} +{ +add.f16x2 r1947, r1938, r1944; +} +{ +sub.f16x2 %2, r1932, r1947; +} +{ +add.f16x2 r1953, r1894, r1895; +} +{ +mul.f16x2 r1956, r1953, r1885; +} +{ +add.f16x2 r1959, r1897, r1956; +} +{ +add.f16x2 r1962, r1900, r1901; +} +{ +mul.f16x2 r1965, r1962, r1887; +} +{ +add.f16x2 r1968, r1959, r1965; +} +{ +sub.f16x2 r1971, r1906, r1907; +} +{ +mul.f16x2 r1974, r1971, r1886; +} +{ +sub.f16x2 r1977, r1912, r1913; +} +{ +mul.f16x2 r1980, r1977, r1888; +} +{ +add.f16x2 r1983, r1974, r1980; +} +{ +add.f16x2 %8, r1968, r1983; +} +{ +add.f16x2 r1989, r1894, r1895; +} +{ +mul.f16x2 r1992, r1989, r1887; +} +{ +add.f16x2 r1995, r1897, r1992; +} +{ +add.f16x2 r1998, r1900, r1901; +} +{ +mul.f16x2 r2001, r1998, r1889; +} +{ +add.f16x2 r2004, r1995, r2001; +} +{ +sub.f16x2 r2007, r1906, r1907; +} +{ +mul.f16x2 r2010, r2007, r1888; +} +{ +sub.f16x2 r2013, r1912, r1913; +} +{ +mul.f16x2 r2016, r2013, r1891; +} +{ +add.f16x2 r2019, r2010, r2016; +} +{ +sub.f16x2 %4, r2004, r2019; +} +{ +add.f16x2 r2025, r1894, r1895; +} +{ +mul.f16x2 r2028, r2025, r1887; +} +{ +add.f16x2 r2031, r1897, r2028; +} +{ +add.f16x2 r2034, r1900, r1901; +} +{ +mul.f16x2 r2037, r2034, 
r1889; +} +{ +add.f16x2 r2040, r2031, r2037; +} +{ +sub.f16x2 r2043, r1906, r1907; +} +{ +mul.f16x2 r2046, r2043, r1888; +} +{ +sub.f16x2 r2049, r1912, r1913; +} +{ +mul.f16x2 r2052, r2049, r1891; +} +{ +add.f16x2 r2055, r2046, r2052; +} +{ +add.f16x2 %6, r2040, r2055; +} +{ +add.f16x2 r2061, r1906, r1907; +} +{ +mul.f16x2 r2064, r2061, r1885; +} +{ +add.f16x2 r2067, r1909, r2064; +} +{ +add.f16x2 r2070, r1912, r1913; +} +{ +mul.f16x2 r2073, r2070, r1887; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +sub.f16x2 r2079, r1894, r1895; +} +{ +mul.f16x2 r2082, r2079, r1886; +} +{ +sub.f16x2 r2085, r1900, r1901; +} +{ +mul.f16x2 r2088, r2085, r1888; +} +{ +add.f16x2 r2091, r2082, r2088; +} +{ +add.f16x2 %3, r2076, r2091; +} +{ +add.f16x2 r2097, r1906, r1907; +} +{ +mul.f16x2 r2100, r2097, r1885; +} +{ +add.f16x2 r2103, r1909, r2100; +} +{ +add.f16x2 r2106, r1912, r1913; +} +{ +mul.f16x2 r2109, r2106, r1887; +} +{ +add.f16x2 r2112, r2103, r2109; +} +{ +sub.f16x2 r2115, r1894, r1895; +} +{ +mul.f16x2 r2118, r2115, r1886; +} +{ +sub.f16x2 r2121, r1900, r1901; +} +{ +mul.f16x2 r2124, r2121, r1888; +} +{ +add.f16x2 r2127, r2118, r2124; +} +{ +sub.f16x2 %9, r2112, r2127; +} +{ +add.f16x2 r2133, r1906, r1907; +} +{ +mul.f16x2 r2136, r2133, r1887; +} +{ +add.f16x2 r2139, r1909, r2136; +} +{ +add.f16x2 r2142, r1912, r1913; +} +{ +mul.f16x2 r2145, r2142, r1889; +} +{ +add.f16x2 r2148, r2139, r2145; +} +{ +sub.f16x2 r2151, r1894, r1895; +} +{ +mul.f16x2 r2154, r2151, r1888; +} +{ +sub.f16x2 r2157, r1900, r1901; +} +{ +mul.f16x2 r2160, r2157, r1891; +} +{ +add.f16x2 r2163, r2154, r2160; +} +{ +add.f16x2 %5, r2148, r2163; +} +{ +add.f16x2 r2169, r1906, r1907; +} +{ +mul.f16x2 r2172, r2169, r1887; +} +{ +add.f16x2 r2175, r1909, r2172; +} +{ +add.f16x2 r2178, r1912, r1913; +} +{ +mul.f16x2 r2181, r2178, r1889; +} +{ +add.f16x2 r2184, r2175, r2181; +} +{ +sub.f16x2 r2187, r1894, r1895; +} +{ +mul.f16x2 r2190, r2187, r1888; +} +{ +sub.f16x2 r2193, r1900, r1901; +} +{ +mul.f16x2 r2196, 
r2193, r1891; +} +{ +add.f16x2 r2199, r2190, r2196; +} +{ +sub.f16x2 %7, r2184, r2199; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<915, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<121>; +.reg .b32 r<2234>; +.reg .b64 rd<10>; +mov.u32 r2205, %tid.y; +mov.u32 r2206, %10; +mad.lo.s32 r2207, r2205, 12500, r2206; +mov.u32 r2208, %tid.x; +mov.f32 f106, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1, {low, high}; +} +mov.f32 f108, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r2, {low, high}; +} +mov.f32 f102, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r3, {low, high}; +} +mov.f32 f104, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} 
+{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ 
+mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r2208, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r2209, rd3; +mul.lo.s32 r2210, r2209, 625; +sub.s32 r2211, r2208, r2210; +cvt.rn.f32.u32 f109, r2211; +mul.f32 f110, f109, 0f3B03C498; +cos.approx.f32 f13, f110; +sin.approx.f32 f111, f110; +neg.f32 f14, f111; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, 
r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f93, 0fBF800000; +mov.f32 f94, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; 
+cvt.rn.f16.f32 high, f94; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +mad.lo.s32 r2212, r2209, 12500, r2207; +barrier.sync 0; +mad.lo.s32 r2213, r2211, 20, r2212; +st.shared.u32 [r2213], r18; +st.shared.u32 [r2213+4], r333; +st.shared.u32 [r2213+8], r370; +st.shared.u32 [r2213+12], r407; +st.shared.u32 [r2213+16], r444; +barrier.sync 0; +shl.b32 r2214, r2211, 4; +sub.s32 r2215, r2213, r2214; +ld.shared.u32 r484, [r2215]; +ld.shared.u32 r481, [r2215+2500]; +ld.shared.u32 r487, [r2215+5000]; +ld.shared.u32 r488, [r2215+7500]; +ld.shared.u32 r482, [r2215+10000]; +barrier.sync 0; +st.shared.u32 [r2213], r30; +st.shared.u32 [r2213+4], r340; +st.shared.u32 [r2213+8], r377; +st.shared.u32 [r2213+12], r414; +st.shared.u32 [r2213+16], r451; +barrier.sync 0; +ld.shared.u32 r496, [r2215]; +ld.shared.u32 r493, [r2215+2500]; +ld.shared.u32 r499, [r2215+5000]; +ld.shared.u32 r500, [r2215+7500]; +ld.shared.u32 r494, [r2215+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; 
+mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ 
+sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r2211, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r2216, rd5; +cvt.rn.f32.u32 f112, r2216; +mul.f32 f113, f112, 0f3C24B5BE; +cos.approx.f32 f37, f113; 
+sin.approx.f32 f114, f113; +neg.f32 f38, f114; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +mul.lo.s32 r2217, r2216, 5; +sub.s32 r2218, r2211, r2217; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +shl.b32 r2219, r2218, 2; +add.s32 r2220, r2212, r2219; +barrier.sync 0; +mad.lo.s32 r2221, r2216, 100, r2220; +st.shared.u32 [r2221], r489; +st.shared.u32 [r2221+20], r804; +st.shared.u32 [r2221+40], r841; +st.shared.u32 [r2221+60], r878; +st.shared.u32 [r2221+80], r915; +barrier.sync 0; +ld.shared.u32 r955, [r2215]; +ld.shared.u32 r952, [r2215+2500]; +ld.shared.u32 r958, [r2215+5000]; +ld.shared.u32 r959, [r2215+7500]; +ld.shared.u32 r953, [r2215+10000]; +barrier.sync 0; +st.shared.u32 [r2221], r501; +st.shared.u32 [r2221+20], r811; +st.shared.u32 [r2221+40], r848; +st.shared.u32 [r2221+60], r885; +st.shared.u32 [r2221+80], r922; +barrier.sync 0; +ld.shared.u32 r967, [r2215]; +ld.shared.u32 r964, [r2215+2500]; +ld.shared.u32 r970, [r2215+5000]; +ld.shared.u32 r971, [r2215+7500]; +ld.shared.u32 r965, [r2215+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r943, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 r1008, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ +add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 r1044, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ 
+mul.f16x2 r1068, r1065, r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ +sub.f16x2 r1080, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 r1116, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 r1152, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 r1188, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 r1224, r1206, r1221; +} +{ +add.f16x2 r1227, r964, r965; +} +{ 
+mul.f16x2 r1230, r1227, r945; +} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, r1248, r1254; +} +{ +sub.f16x2 r1260, r1242, r1257; +} +mul.wide.u32 rd6, r2211, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r2222, rd7; +cvt.rn.f32.u32 f115, r2222; +mul.f32 f116, f115, 0f3D4DE32E; +cos.approx.f32 f61, f116; +sin.approx.f32 f117, f116; +neg.f32 f62, f117; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1263, {low, high}; +} +mul.lo.s32 r2223, r2222, 25; +sub.s32 r2224, r2211, r2223; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1266, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1268, {high, high}; +} +{ +mul.f16x2 r1270, r1152, r1268; +} +{ +neg.f16x2 r1273, r1270; +} +{ +fma.rn.f16x2 r1275, r1008, r1266, r1273; +} +{ +mul.f16x2 r1279, r1008, r1268; +} +{ +fma.rn.f16x2 r1282, r1152, r1266, r1279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1286, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1288, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1290, {low, high}; +} +{ +mul.f16x2 r1291, r1288, r1290; +} +{ +mul.f16x2 r1294, r1263, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1297, {high, low}; +} +{ +fma.rn.f16x2 r1299, r1291, r1297, r1294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1305, {high, high}; +} +{ +mul.f16x2 r1307, r1224, r1305; +} +{ +neg.f16x2 r1310, r1307; +} +{ +fma.rn.f16x2 r1312, r1080, r1303, r1310; +} +{ +mul.f16x2 r1316, r1080, r1305; +} +{ 
+fma.rn.f16x2 r1319, r1224, r1303, r1316; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1325, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1327, {low, high}; +} +{ +mul.f16x2 r1328, r1325, r1327; +} +{ +mul.f16x2 r1331, r1299, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1334, {high, low}; +} +{ +fma.rn.f16x2 r1336, r1328, r1334, r1331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1340, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1342, {high, high}; +} +{ +mul.f16x2 r1344, r1260, r1342; +} +{ +neg.f16x2 r1347, r1344; +} +{ +fma.rn.f16x2 r1349, r1116, r1340, r1347; +} +{ +mul.f16x2 r1353, r1116, r1342; +} +{ +fma.rn.f16x2 r1356, r1260, r1340, r1353; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1360, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1362, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1364, {low, high}; +} +{ +mul.f16x2 r1365, r1362, r1364; +} +{ +mul.f16x2 r1368, r1336, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1371, {high, low}; +} +{ +fma.rn.f16x2 r1373, r1365, r1371, r1368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1377, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1379, {high, high}; +} +{ +mul.f16x2 r1381, r1188, r1379; +} +{ +neg.f16x2 r1384, r1381; +} +{ +fma.rn.f16x2 r1386, r1044, r1377, r1384; +} +{ +mul.f16x2 r1390, r1044, r1379; +} +{ +fma.rn.f16x2 r1393, r1188, r1377, r1390; +} +shl.b32 r2225, r2224, 2; +add.s32 r2226, r2212, r2225; +barrier.sync 0; +mad.lo.s32 r2227, r2222, 500, r2226; +st.shared.u32 [r2227], r960; +st.shared.u32 [r2227+100], r1275; +st.shared.u32 [r2227+200], r1312; 
+st.shared.u32 [r2227+300], r1349; +st.shared.u32 [r2227+400], r1386; +barrier.sync 0; +ld.shared.u32 r1426, [r2215]; +ld.shared.u32 r1423, [r2215+2500]; +ld.shared.u32 r1429, [r2215+5000]; +ld.shared.u32 r1430, [r2215+7500]; +ld.shared.u32 r1424, [r2215+10000]; +barrier.sync 0; +st.shared.u32 [r2227], r972; +st.shared.u32 [r2227+100], r1282; +st.shared.u32 [r2227+200], r1319; +st.shared.u32 [r2227+300], r1356; +st.shared.u32 [r2227+400], r1393; +barrier.sync 0; +ld.shared.u32 r1438, [r2215]; +ld.shared.u32 r1435, [r2215+2500]; +ld.shared.u32 r1441, [r2215+5000]; +ld.shared.u32 r1442, [r2215+7500]; +ld.shared.u32 r1436, [r2215+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1415, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1419, {low, high}; +} +{ +neg.f16x2 r1420, r1419; +} +{ +add.f16x2 r1422, r1423, r1424; +} +{ +add.f16x2 r1425, r1426, r1422; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +add.f16x2 r1431, r1425, r1428; +} +{ +add.f16x2 r1434, r1435, r1436; +} +{ +add.f16x2 r1437, r1438, r1434; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +add.f16x2 r1443, r1437, r1440; +} +{ +add.f16x2 r1446, r1423, r1424; +} +{ +mul.f16x2 r1449, r1446, r1414; +} +{ +add.f16x2 r1452, r1426, r1449; +} +{ +add.f16x2 r1455, r1429, r1430; +} +{ +mul.f16x2 r1458, r1455, r1416; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +sub.f16x2 r1464, r1435, r1436; +} +{ +mul.f16x2 r1467, r1464, r1415; +} +{ +sub.f16x2 r1470, r1441, r1442; +} +{ 
+mul.f16x2 r1473, r1470, r1417; +} +{ +add.f16x2 r1476, r1467, r1473; +} +{ +sub.f16x2 r1479, r1461, r1476; +} +{ +add.f16x2 r1482, r1423, r1424; +} +{ +mul.f16x2 r1485, r1482, r1414; +} +{ +add.f16x2 r1488, r1426, r1485; +} +{ +add.f16x2 r1491, r1429, r1430; +} +{ +mul.f16x2 r1494, r1491, r1416; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +sub.f16x2 r1500, r1435, r1436; +} +{ +mul.f16x2 r1503, r1500, r1415; +} +{ +sub.f16x2 r1506, r1441, r1442; +} +{ +mul.f16x2 r1509, r1506, r1417; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +add.f16x2 r1515, r1497, r1512; +} +{ +add.f16x2 r1518, r1423, r1424; +} +{ +mul.f16x2 r1521, r1518, r1416; +} +{ +add.f16x2 r1524, r1426, r1521; +} +{ +add.f16x2 r1527, r1429, r1430; +} +{ +mul.f16x2 r1530, r1527, r1418; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1435, r1436; +} +{ +mul.f16x2 r1539, r1536, r1417; +} +{ +sub.f16x2 r1542, r1441, r1442; +} +{ +mul.f16x2 r1545, r1542, r1420; +} +{ +add.f16x2 r1548, r1539, r1545; +} +{ +sub.f16x2 r1551, r1533, r1548; +} +{ +add.f16x2 r1554, r1423, r1424; +} +{ +mul.f16x2 r1557, r1554, r1416; +} +{ +add.f16x2 r1560, r1426, r1557; +} +{ +add.f16x2 r1563, r1429, r1430; +} +{ +mul.f16x2 r1566, r1563, r1418; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +sub.f16x2 r1572, r1435, r1436; +} +{ +mul.f16x2 r1575, r1572, r1417; +} +{ +sub.f16x2 r1578, r1441, r1442; +} +{ +mul.f16x2 r1581, r1578, r1420; +} +{ +add.f16x2 r1584, r1575, r1581; +} +{ +add.f16x2 r1587, r1569, r1584; +} +{ +add.f16x2 r1590, r1435, r1436; +} +{ +mul.f16x2 r1593, r1590, r1414; +} +{ +add.f16x2 r1596, r1438, r1593; +} +{ +add.f16x2 r1599, r1441, r1442; +} +{ +mul.f16x2 r1602, r1599, r1416; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1423, r1424; +} +{ +mul.f16x2 r1611, r1608, r1415; +} +{ +sub.f16x2 r1614, r1429, r1430; +} +{ +mul.f16x2 r1617, r1614, r1417; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +add.f16x2 r1623, r1605, r1620; +} +{ +add.f16x2 r1626, r1435, r1436; +} +{ +mul.f16x2 r1629, 
r1626, r1414; +} +{ +add.f16x2 r1632, r1438, r1629; +} +{ +add.f16x2 r1635, r1441, r1442; +} +{ +mul.f16x2 r1638, r1635, r1416; +} +{ +add.f16x2 r1641, r1632, r1638; +} +{ +sub.f16x2 r1644, r1423, r1424; +} +{ +mul.f16x2 r1647, r1644, r1415; +} +{ +sub.f16x2 r1650, r1429, r1430; +} +{ +mul.f16x2 r1653, r1650, r1417; +} +{ +add.f16x2 r1656, r1647, r1653; +} +{ +sub.f16x2 r1659, r1641, r1656; +} +{ +add.f16x2 r1662, r1435, r1436; +} +{ +mul.f16x2 r1665, r1662, r1416; +} +{ +add.f16x2 r1668, r1438, r1665; +} +{ +add.f16x2 r1671, r1441, r1442; +} +{ +mul.f16x2 r1674, r1671, r1418; +} +{ +add.f16x2 r1677, r1668, r1674; +} +{ +sub.f16x2 r1680, r1423, r1424; +} +{ +mul.f16x2 r1683, r1680, r1417; +} +{ +sub.f16x2 r1686, r1429, r1430; +} +{ +mul.f16x2 r1689, r1686, r1420; +} +{ +add.f16x2 r1692, r1683, r1689; +} +{ +add.f16x2 r1695, r1677, r1692; +} +{ +add.f16x2 r1698, r1435, r1436; +} +{ +mul.f16x2 r1701, r1698, r1416; +} +{ +add.f16x2 r1704, r1438, r1701; +} +{ +add.f16x2 r1707, r1441, r1442; +} +{ +mul.f16x2 r1710, r1707, r1418; +} +{ +add.f16x2 r1713, r1704, r1710; +} +{ +sub.f16x2 r1716, r1423, r1424; +} +{ +mul.f16x2 r1719, r1716, r1417; +} +{ +sub.f16x2 r1722, r1429, r1430; +} +{ +mul.f16x2 r1725, r1722, r1420; +} +{ +add.f16x2 r1728, r1719, r1725; +} +{ +sub.f16x2 r1731, r1713, r1728; +} +mul.wide.u32 rd8, r2211, 274877907; +shr.u64 rd9, rd8, 35; +cvt.u32.u64 r2228, rd9; +cvt.rn.f32.u32 f118, r2228; +mul.f32 f119, f118, 0f3E80ADFD; +cos.approx.f32 f85, f119; +sin.approx.f32 f120, f119; +neg.f32 f86, f120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f86; +mov.b32 r1734, {low, high}; +} +mul.lo.s32 r2229, r2228, 125; +sub.s32 r2230, r2211, r2229; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1739, {high, high}; +} +{ +mul.f16x2 r1741, r1623, r1739; +} +{ +neg.f16x2 r1744, r1741; +} +{ +fma.rn.f16x2 r1746, r1479, r1737, r1744; +} +{ 
+mul.f16x2 r1750, r1479, r1739; +} +{ +fma.rn.f16x2 r1753, r1623, r1737, r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1759, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1761, {low, high}; +} +{ +mul.f16x2 r1762, r1759, r1761; +} +{ +mul.f16x2 r1765, r1734, r1757; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1768, {high, low}; +} +{ +fma.rn.f16x2 r1770, r1762, r1768, r1765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1774, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1776, {high, high}; +} +{ +mul.f16x2 r1778, r1695, r1776; +} +{ +neg.f16x2 r1781, r1778; +} +{ +fma.rn.f16x2 r1783, r1551, r1774, r1781; +} +{ +mul.f16x2 r1787, r1551, r1776; +} +{ +fma.rn.f16x2 r1790, r1695, r1774, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1796, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1798, {low, high}; +} +{ +mul.f16x2 r1799, r1796, r1798; +} +{ +mul.f16x2 r1802, r1770, r1794; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1770; +mov.b32 r1805, {high, low}; +} +{ +fma.rn.f16x2 r1807, r1799, r1805, r1802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1811, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1813, {high, high}; +} +{ +mul.f16x2 r1815, r1731, r1813; +} +{ +neg.f16x2 r1818, r1815; +} +{ +fma.rn.f16x2 r1820, r1587, r1811, r1818; +} +{ +mul.f16x2 r1824, r1587, r1813; +} +{ +fma.rn.f16x2 r1827, r1731, r1811, r1824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1734; +mov.b32 r1833, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1835, {low, high}; +} +{ +mul.f16x2 r1836, r1833, r1835; +} +{ +mul.f16x2 r1839, r1807, r1831; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1807; +mov.b32 r1842, {high, low}; +} +{ +fma.rn.f16x2 r1844, r1836, r1842, r1839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1844; +mov.b32 r1848, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1844; +mov.b32 r1850, {high, high}; +} +{ +mul.f16x2 r1852, r1659, r1850; +} +{ +neg.f16x2 r1855, r1852; +} +{ +fma.rn.f16x2 r1857, r1515, r1848, r1855; +} +{ +mul.f16x2 r1861, r1515, r1850; +} +{ +fma.rn.f16x2 r1864, r1659, r1848, r1861; +} +shl.b32 r2231, r2230, 2; +add.s32 r2232, r2212, r2231; +barrier.sync 0; +mad.lo.s32 r2233, r2228, 2500, r2232; +st.shared.u32 [r2233], r1431; +st.shared.u32 [r2233+500], r1746; +st.shared.u32 [r2233+1000], r1783; +st.shared.u32 [r2233+1500], r1820; +st.shared.u32 [r2233+2000], r1857; +barrier.sync 0; +ld.shared.u32 r1897, [r2215]; +ld.shared.u32 r1894, [r2215+2500]; +ld.shared.u32 r1900, [r2215+5000]; +ld.shared.u32 r1901, [r2215+7500]; +ld.shared.u32 r1895, [r2215+10000]; +barrier.sync 0; +st.shared.u32 [r2233], r1443; +st.shared.u32 [r2233+500], r1753; +st.shared.u32 [r2233+1000], r1790; +st.shared.u32 [r2233+1500], r1827; +st.shared.u32 [r2233+2000], r1864; +barrier.sync 0; +ld.shared.u32 r1909, [r2215]; +ld.shared.u32 r1906, [r2215+2500]; +ld.shared.u32 r1912, [r2215+5000]; +ld.shared.u32 r1913, [r2215+7500]; +ld.shared.u32 r1907, [r2215+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1886, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1887, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1888, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1890, {low, high}; +} +{ +neg.f16x2 r1891, r1890; +} +{ +add.f16x2 r1893, r1894, r1895; +} +{ +add.f16x2 r1896, r1897, r1893; +} +{ +add.f16x2 r1899, r1900, r1901; +} +{ +add.f16x2 %0, r1896, r1899; +} +{ +add.f16x2 r1905, r1906, r1907; +} +{ +add.f16x2 r1908, r1909, r1905; +} +{ +add.f16x2 r1911, r1912, r1913; +} +{ +add.f16x2 %1, r1908, r1911; +} +{ +add.f16x2 r1917, r1894, r1895; +} +{ +mul.f16x2 r1920, r1917, r1885; +} +{ +add.f16x2 r1923, r1897, r1920; +} +{ +add.f16x2 r1926, r1900, r1901; +} +{ +mul.f16x2 r1929, r1926, r1887; +} +{ +add.f16x2 r1932, r1923, r1929; +} +{ +sub.f16x2 r1935, r1906, r1907; +} +{ +mul.f16x2 r1938, r1935, r1886; +} +{ +sub.f16x2 r1941, r1912, r1913; +} +{ +mul.f16x2 r1944, r1941, r1888; +} +{ +add.f16x2 r1947, r1938, r1944; +} +{ +sub.f16x2 %2, r1932, r1947; +} +{ +add.f16x2 r1953, r1894, r1895; +} +{ +mul.f16x2 r1956, r1953, r1885; +} +{ +add.f16x2 r1959, r1897, r1956; +} +{ +add.f16x2 r1962, r1900, r1901; +} +{ +mul.f16x2 r1965, r1962, r1887; +} +{ +add.f16x2 r1968, r1959, r1965; +} +{ +sub.f16x2 r1971, r1906, r1907; +} +{ +mul.f16x2 r1974, r1971, r1886; +} +{ +sub.f16x2 r1977, r1912, r1913; +} +{ +mul.f16x2 r1980, r1977, r1888; +} +{ +add.f16x2 r1983, r1974, r1980; +} +{ +add.f16x2 %8, r1968, r1983; +} +{ +add.f16x2 r1989, r1894, r1895; +} +{ +mul.f16x2 r1992, r1989, r1887; +} +{ +add.f16x2 r1995, r1897, r1992; +} +{ +add.f16x2 r1998, r1900, r1901; +} +{ +mul.f16x2 r2001, r1998, r1889; +} +{ +add.f16x2 r2004, r1995, r2001; +} +{ +sub.f16x2 r2007, r1906, r1907; +} +{ +mul.f16x2 r2010, r2007, r1888; +} +{ +sub.f16x2 r2013, r1912, r1913; +} +{ +mul.f16x2 r2016, r2013, r1891; +} +{ +add.f16x2 r2019, r2010, r2016; +} +{ +sub.f16x2 %4, r2004, r2019; +} +{ +add.f16x2 r2025, r1894, r1895; +} +{ +mul.f16x2 r2028, r2025, r1887; +} +{ +add.f16x2 
r2031, r1897, r2028; +} +{ +add.f16x2 r2034, r1900, r1901; +} +{ +mul.f16x2 r2037, r2034, r1889; +} +{ +add.f16x2 r2040, r2031, r2037; +} +{ +sub.f16x2 r2043, r1906, r1907; +} +{ +mul.f16x2 r2046, r2043, r1888; +} +{ +sub.f16x2 r2049, r1912, r1913; +} +{ +mul.f16x2 r2052, r2049, r1891; +} +{ +add.f16x2 r2055, r2046, r2052; +} +{ +add.f16x2 %6, r2040, r2055; +} +{ +add.f16x2 r2061, r1906, r1907; +} +{ +mul.f16x2 r2064, r2061, r1885; +} +{ +add.f16x2 r2067, r1909, r2064; +} +{ +add.f16x2 r2070, r1912, r1913; +} +{ +mul.f16x2 r2073, r2070, r1887; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +sub.f16x2 r2079, r1894, r1895; +} +{ +mul.f16x2 r2082, r2079, r1886; +} +{ +sub.f16x2 r2085, r1900, r1901; +} +{ +mul.f16x2 r2088, r2085, r1888; +} +{ +add.f16x2 r2091, r2082, r2088; +} +{ +add.f16x2 %3, r2076, r2091; +} +{ +add.f16x2 r2097, r1906, r1907; +} +{ +mul.f16x2 r2100, r2097, r1885; +} +{ +add.f16x2 r2103, r1909, r2100; +} +{ +add.f16x2 r2106, r1912, r1913; +} +{ +mul.f16x2 r2109, r2106, r1887; +} +{ +add.f16x2 r2112, r2103, r2109; +} +{ +sub.f16x2 r2115, r1894, r1895; +} +{ +mul.f16x2 r2118, r2115, r1886; +} +{ +sub.f16x2 r2121, r1900, r1901; +} +{ +mul.f16x2 r2124, r2121, r1888; +} +{ +add.f16x2 r2127, r2118, r2124; +} +{ +sub.f16x2 %9, r2112, r2127; +} +{ +add.f16x2 r2133, r1906, r1907; +} +{ +mul.f16x2 r2136, r2133, r1887; +} +{ +add.f16x2 r2139, r1909, r2136; +} +{ +add.f16x2 r2142, r1912, r1913; +} +{ +mul.f16x2 r2145, r2142, r1889; +} +{ +add.f16x2 r2148, r2139, r2145; +} +{ +sub.f16x2 r2151, r1894, r1895; +} +{ +mul.f16x2 r2154, r2151, r1888; +} +{ +sub.f16x2 r2157, r1900, r1901; +} +{ +mul.f16x2 r2160, r2157, r1891; +} +{ +add.f16x2 r2163, r2154, r2160; +} +{ +add.f16x2 %5, r2148, r2163; +} +{ +add.f16x2 r2169, r1906, r1907; +} +{ +mul.f16x2 r2172, r2169, r1887; +} +{ +add.f16x2 r2175, r1909, r2172; +} +{ +add.f16x2 r2178, r1912, r1913; +} +{ +mul.f16x2 r2181, r2178, r1889; +} +{ +add.f16x2 r2184, r2175, r2181; +} +{ +sub.f16x2 r2187, r1894, r1895; +} +{ 
+mul.f16x2 r2190, r2187, r1888; +} +{ +sub.f16x2 r2193, r1900, r1901; +} +{ +mul.f16x2 r2196, r2193, r1891; +} +{ +add.f16x2 r2199, r2190, r2196; +} +{ +sub.f16x2 %7, r2184, r2199; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..aa14d783375ba --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp16_inv.hpp.inc @@ -0,0 +1,30238 @@ +#ifndef CUFFTDX_FFT_3125_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_3125_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1115, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<603>; +.reg .b32 r<10508>; +.reg .b64 rd<6>; +mov.u32 r10442, %50; +mov.u32 r10507, %tid.y; +mad.lo.s32 r10443, r10507, 25000, r10442; +mov.u32 r10444, %tid.x; +mov.f32 f594, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1, {low, high}; +} +mov.f32 f596, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; 
+cvt.rn.f16.f32 high, f596; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f590, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5, {low, high}; +} +mov.f32 f592, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %54; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %94, %79; +} +{ +add.f16x2 r26, %51, r23; +} +{ +add.f16x2 r29, %57, %92; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %67, %54; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %94, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %92; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %67, %54; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %94, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %92; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %54; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %94, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %92; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 
r140, r122, r137; +} +{ +add.f16x2 r143, %67, %54; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %94, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %92; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %94, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %92; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %54; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %94, %79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %92; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %54; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %94, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %92; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %54; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %94, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %92; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %54; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, 
r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, %55, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %52; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %55, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %52; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %55, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %52; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %55, %87; +} +{ 
+mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %52; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %55, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %52; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %52; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %55, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %52; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %55, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %52; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %55, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %52; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 
r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %55, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, %88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %53, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %53, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ +mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, 
r647; +} +{ +sub.f16x2 r739, %53, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ +add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %53, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ +sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %53, %86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %53, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ +mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %53, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %53, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ 
+sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %53, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %91, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %91, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, 
r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; +} +{ +add.f16x2 r1043, %91, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %91, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; +} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %91, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ +sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, 
r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ +mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ +add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %93; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, 
%84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %58, %93; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %93; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} +{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ +sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %93; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ +sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %93; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ +mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 
r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, %93; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %93; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, %70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %93; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; +} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %93; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1611, {low, high}; +} +mov.f32 f332, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1612, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; 
+cvt.rn.f16.f32 high, f334; +mov.b32 r1613, {low, high}; +} +mov.f32 f336, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1614, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1615, {low, high}; +} +mov.f32 f340, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1616, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1617, {low, high}; +} +mov.f32 f344, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1618, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1621, {low, high}; +} +mov.f32 f352, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1622, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1625, {low, high}; +} +mov.f32 f360, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1626, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1627, {low, high}; +} +mov.f32 f364, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1628, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1633, {low, high}; +} +mov.f32 f376, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1641, {low, high}; +} +mov.f32 f392, 
0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ +mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ +sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} +{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ +mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, 
r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; 
+mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ +add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, 
r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ +mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ +sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; +} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ +mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; 
+mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; +} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ +mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, 
r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} +{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ 
+sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2560, {low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 
r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ +add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; +} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ 
+add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ +mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 
r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ +add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; +} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; +} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ 
+sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ +mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ +sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3204, {low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; +} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; +} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ 
+add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; +} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ +mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ +add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, 
r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r10444, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r10445, rd3; +mul.lo.s32 r10446, r10445, 125; +sub.s32 r10447, r10444, r10446; +cvt.rn.f32.u32 f597, r10447; +mul.f32 f598, f597, 0f3B03C498; +cos.approx.f32 f217, f598; +sin.approx.f32 f599, f598; +neg.f32 f218, f599; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ +mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f533, 0fBF800000; +mov.f32 f534, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; +} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 
r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; 
+mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ +mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ 
+mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ +fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; 
+mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, {high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, 
r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4107, {low, high}; +} +{ +mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ 
+fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +mad.lo.s32 r10448, r10445, 25000, r10443; +barrier.sync 0; +mad.lo.s32 r10449, r10447, 200, r10448; +st.shared.v2.f32 [r10449], {r1934, r1946}; +st.shared.v2.f32 [r10449+8], {r3535, r3544}; +st.shared.v2.f32 [r10449+16], {r3572, r3581}; +st.shared.v2.f32 [r10449+24], {r3609, r3618}; +st.shared.v2.f32 [r10449+32], {r3646, r3655}; +st.shared.v2.f32 [r10449+40], {r3683, r3692}; +st.shared.v2.f32 [r10449+48], {r3720, r3729}; +st.shared.v2.f32 [r10449+56], {r3757, r3766}; 
+st.shared.v2.f32 [r10449+64], {r3794, r3803}; +st.shared.v2.f32 [r10449+72], {r3831, r3840}; +st.shared.v2.f32 [r10449+80], {r3868, r3877}; +st.shared.v2.f32 [r10449+88], {r3905, r3914}; +st.shared.v2.f32 [r10449+96], {r3942, r3951}; +st.shared.v2.f32 [r10449+104], {r3979, r3988}; +st.shared.v2.f32 [r10449+112], {r4016, r4025}; +st.shared.v2.f32 [r10449+120], {r4053, r4062}; +st.shared.v2.f32 [r10449+128], {r4090, r4099}; +st.shared.v2.f32 [r10449+136], {r4127, r4136}; +st.shared.v2.f32 [r10449+144], {r4164, r4173}; +st.shared.v2.f32 [r10449+152], {r4201, r4210}; +st.shared.v2.f32 [r10449+160], {r4238, r4247}; +st.shared.v2.f32 [r10449+168], {r4275, r4284}; +st.shared.v2.f32 [r10449+176], {r4312, r4321}; +st.shared.v2.f32 [r10449+184], {r4349, r4358}; +st.shared.v2.f32 [r10449+192], {r4386, r4395}; +barrier.sync 0; +mad.lo.s32 r10450, r10447, -192, r10449; +ld.shared.u32 r4430, [r10450]; +ld.shared.u32 r4442, [r10450+4]; +ld.shared.u32 r4752, [r10450+1000]; +ld.shared.u32 r4764, [r10450+1004]; +ld.shared.u32 r5074, [r10450+2000]; +ld.shared.u32 r5086, [r10450+2004]; +ld.shared.u32 r5396, [r10450+3000]; +ld.shared.u32 r5408, [r10450+3004]; +ld.shared.u32 r5718, [r10450+4000]; +ld.shared.u32 r5730, [r10450+4004]; +ld.shared.u32 r4427, [r10450+5000]; +ld.shared.u32 r4439, [r10450+5004]; +ld.shared.u32 r4749, [r10450+6000]; +ld.shared.u32 r4761, [r10450+6004]; +ld.shared.u32 r5071, [r10450+7000]; +ld.shared.u32 r5083, [r10450+7004]; +ld.shared.u32 r5393, [r10450+8000]; +ld.shared.u32 r5405, [r10450+8004]; +ld.shared.u32 r5715, [r10450+9000]; +ld.shared.u32 r5727, [r10450+9004]; +ld.shared.u32 r4433, [r10450+10000]; +ld.shared.u32 r4445, [r10450+10004]; +ld.shared.u32 r4755, [r10450+11000]; +ld.shared.u32 r4767, [r10450+11004]; +ld.shared.u32 r5077, [r10450+12000]; +ld.shared.u32 r5089, [r10450+12004]; +ld.shared.u32 r5399, [r10450+13000]; +ld.shared.u32 r5411, [r10450+13004]; +ld.shared.u32 r5721, [r10450+14000]; +ld.shared.u32 r5733, [r10450+14004]; +ld.shared.u32 
r4434, [r10450+15000]; +ld.shared.u32 r4446, [r10450+15004]; +ld.shared.u32 r4756, [r10450+16000]; +ld.shared.u32 r4768, [r10450+16004]; +ld.shared.u32 r5078, [r10450+17000]; +ld.shared.u32 r5090, [r10450+17004]; +ld.shared.u32 r5400, [r10450+18000]; +ld.shared.u32 r5412, [r10450+18004]; +ld.shared.u32 r5722, [r10450+19000]; +ld.shared.u32 r5734, [r10450+19004]; +ld.shared.u32 r4428, [r10450+20000]; +ld.shared.u32 r4440, [r10450+20004]; +ld.shared.u32 r4750, [r10450+21000]; +ld.shared.u32 r4762, [r10450+21004]; +ld.shared.u32 r5072, [r10450+22000]; +ld.shared.u32 r5084, [r10450+22004]; +ld.shared.u32 r5394, [r10450+23000]; +ld.shared.u32 r5406, [r10450+23004]; +ld.shared.u32 r5716, [r10450+24000]; +ld.shared.u32 r5728, [r10450+24004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 r4435, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 r4447, r4441, r4444; +} +{ +add.f16x2 r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, 
r4456, r4462; +} +{ +sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 r4483, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 r4519, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 r4555, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; +} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 r4591, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ +add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ 
+add.f16x2 r4624, r4615, r4621; +} +{ +add.f16x2 r4627, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} +{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 r4663, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 r4699, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, r4723, r4729; +} +{ +sub.f16x2 r4735, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 r4757, r4751, r4754; +} +{ +add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 r4769, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 r4805, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 r4841, r4823, r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 r4877, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; 
+} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 r4913, r4895, r4910; +} +{ +add.f16x2 r4916, r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ +add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 r4949, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 r4985, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 r5021, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ +mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, 
r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ +add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 r5057, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 r5079, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 r5091, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 r5127, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, 
r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 r5163, r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} +{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 r5199, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 r5235, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ +add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 r5271, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 r5304, r5295, r5301; +} +{ +sub.f16x2 r5307, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 
r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} +{ +add.f16x2 r5343, r5325, r5340; +} +{ +add.f16x2 r5346, r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 r5379, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 r5401, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 r5413, r5407, r5410; +} +{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, 
r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; +} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 r5449, r5431, r5446; +} +{ +add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 r5485, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 r5521, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; +} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 r5557, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ 
+sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, r5587; +} +{ +add.f16x2 r5593, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 r5629, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 r5665, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 r5701, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5709, {low, high}; +} +{ 
+neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 r5723, r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 r5735, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 r5771, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 r5807, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ +sub.f16x2 r5843, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} 
+{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ +add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 r5876, r5867, r5873; +} +{ +add.f16x2 r5879, r5861, r5876; +} +{ +add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 r5915, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 r5951, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 r5987, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; +} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, 
r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; +} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 r6023, r6005, r6020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r6027, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r6028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r6029, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6030, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6031, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6032, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6033, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6042, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6048, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6049, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6057, {low, high}; +} +{ +mul.f16x2 r6074, r4805, r6026; +} +{ +mul.f16x2 r6077, r4949, r6027; +} +{ +sub.f16x2 r6080, r6074, r6077; +} +{ +mul.f16x2 r6083, r4805, r6027; +} +{ +fma.rn.f16x2 r6086, r4949, r6026, r6083; +} +{ +mul.f16x2 r6090, r5127, r6028; +} +{ +mul.f16x2 r6093, r5271, r6029; +} +{ +sub.f16x2 r6096, r6090, r6093; +} +{ +mul.f16x2 r6099, r5127, r6029; +} +{ +fma.rn.f16x2 r6102, r5271, r6028, r6099; +} +{ +mul.f16x2 r6106, r5449, r6030; +} +{ +mul.f16x2 r6109, r5593, r6031; +} +{ +sub.f16x2 r6112, r6106, r6109; +} +{ +mul.f16x2 r6115, r5449, r6031; +} +{ +fma.rn.f16x2 r6118, r5593, r6030, r6115; +} +{ +mul.f16x2 r6122, r5771, r6032; +} +{ +mul.f16x2 r6125, r5915, r6033; +} +{ +sub.f16x2 r6128, r6122, r6125; +} +{ +mul.f16x2 r6131, r5771, r6033; +} +{ +fma.rn.f16x2 r6134, r5915, r6032, r6131; +} +{ +mul.f16x2 r6138, r4877, r6028; +} +{ +mul.f16x2 r6141, r5021, r6029; +} +{ +sub.f16x2 r6144, r6138, r6141; +} +{ +mul.f16x2 r6147, r4877, r6029; +} +{ +fma.rn.f16x2 r6150, r5021, r6028, r6147; +} +{ +mul.f16x2 r6154, r5199, r6032; +} +{ +mul.f16x2 r6157, r5343, r6033; +} +{ +sub.f16x2 r6160, r6154, r6157; +} +{ +mul.f16x2 r6163, r5199, r6033; +} +{ +fma.rn.f16x2 r6166, r5343, r6032, r6163; +} +{ +mul.f16x2 r6170, r5521, r6036; +} +{ +mul.f16x2 r6173, r5665, r6037; +} +{ +sub.f16x2 r6176, r6170, r6173; +} +{ +mul.f16x2 r6179, r5521, r6037; +} +{ +fma.rn.f16x2 r6182, r5665, r6036, r6179; +} +{ +mul.f16x2 r6186, r5843, r6040; +} +{ +mul.f16x2 r6189, r5987, r6041; +} +{ +sub.f16x2 r6192, r6186, r6189; +} +{ +mul.f16x2 r6195, r5843, r6041; +} +{ +fma.rn.f16x2 r6198, r5987, r6040, r6195; +} +{ +mul.f16x2 r6202, r4913, r6030; +} +{ +mul.f16x2 r6205, r5057, r6031; +} +{ +sub.f16x2 r6208, r6202, r6205; +} +{ +mul.f16x2 r6211, r4913, r6031; +} +{ +fma.rn.f16x2 r6214, 
r5057, r6030, r6211; +} +{ +mul.f16x2 r6218, r5235, r6036; +} +{ +mul.f16x2 r6221, r5379, r6037; +} +{ +sub.f16x2 r6224, r6218, r6221; +} +{ +mul.f16x2 r6227, r5235, r6037; +} +{ +fma.rn.f16x2 r6230, r5379, r6036, r6227; +} +{ +mul.f16x2 r6234, r5557, r6042; +} +{ +mul.f16x2 r6237, r5701, r6043; +} +{ +sub.f16x2 r6240, r6234, r6237; +} +{ +mul.f16x2 r6243, r5557, r6043; +} +{ +fma.rn.f16x2 r6246, r5701, r6042, r6243; +} +{ +mul.f16x2 r6250, r5879, r6048; +} +{ +mul.f16x2 r6253, r6023, r6049; +} +{ +sub.f16x2 r6256, r6250, r6253; +} +{ +mul.f16x2 r6259, r5879, r6049; +} +{ +fma.rn.f16x2 r6262, r6023, r6048, r6259; +} +{ +mul.f16x2 r6266, r4841, r6032; +} +{ +mul.f16x2 r6269, r4985, r6033; +} +{ +sub.f16x2 r6272, r6266, r6269; +} +{ +mul.f16x2 r6275, r4841, r6033; +} +{ +fma.rn.f16x2 r6278, r4985, r6032, r6275; +} +{ +mul.f16x2 r6282, r5163, r6040; +} +{ +mul.f16x2 r6285, r5307, r6041; +} +{ +sub.f16x2 r6288, r6282, r6285; +} +{ +mul.f16x2 r6291, r5163, r6041; +} +{ +fma.rn.f16x2 r6294, r5307, r6040, r6291; +} +{ +mul.f16x2 r6298, r5485, r6048; +} +{ +mul.f16x2 r6301, r5629, r6049; +} +{ +sub.f16x2 r6304, r6298, r6301; +} +{ +mul.f16x2 r6307, r5485, r6049; +} +{ +fma.rn.f16x2 r6310, r5629, r6048, r6307; +} +{ +mul.f16x2 r6314, r5807, r6056; +} +{ +mul.f16x2 r6317, r5951, r6057; +} +{ +sub.f16x2 r6320, r6314, r6317; +} +{ +mul.f16x2 r6323, r5807, r6057; +} +{ +fma.rn.f16x2 r6326, r5951, r6056, r6323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6331, {low, high}; +} +{ +neg.f16x2 r6332, r6331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6335, {low, high}; +} +{ +neg.f16x2 r6336, r6335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; 
+cvt.rn.f16.f32 high, f594; +mov.b32 r6338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6339, {low, high}; +} +{ +add.f16x2 r6340, r4757, r5723; +} +{ +add.f16x2 r6343, r4435, r6340; +} +{ +add.f16x2 r6346, r5079, r5401; +} +{ +add.f16x2 r6349, r6343, r6346; +} +{ +add.f16x2 r6352, r4769, r5735; +} +{ +add.f16x2 r6355, r4447, r6352; +} +{ +add.f16x2 r6358, r5091, r5413; +} +{ +add.f16x2 r6361, r6355, r6358; +} +{ +add.f16x2 r6364, r4757, r5723; +} +{ +mul.f16x2 r6367, r6364, r6330; +} +{ +add.f16x2 r6370, r4435, r6367; +} +{ +add.f16x2 r6373, r5079, r5401; +} +{ +mul.f16x2 r6376, r6373, r6334; +} +{ +add.f16x2 r6379, r6370, r6376; +} +{ +sub.f16x2 r6382, r4769, r5735; +} +{ +mul.f16x2 r6385, r6382, r6332; +} +{ +sub.f16x2 r6388, r5091, r5413; +} +{ +mul.f16x2 r6391, r6388, r6336; +} +{ +add.f16x2 r6394, r6385, r6391; +} +{ +sub.f16x2 r6397, r6379, r6394; +} +{ +add.f16x2 r6400, r4757, r5723; +} +{ +mul.f16x2 r6403, r6400, r6330; +} +{ +add.f16x2 r6406, r4435, r6403; +} +{ +add.f16x2 r6409, r5079, r5401; +} +{ +mul.f16x2 r6412, r6409, r6334; +} +{ +add.f16x2 r6415, r6406, r6412; +} +{ +sub.f16x2 r6418, r4769, r5735; +} +{ +mul.f16x2 r6421, r6418, r6332; +} +{ +sub.f16x2 r6424, r5091, r5413; +} +{ +mul.f16x2 r6427, r6424, r6336; +} +{ +add.f16x2 r6430, r6421, r6427; +} +{ +add.f16x2 r6433, r6415, r6430; +} +{ +add.f16x2 r6436, r4757, r5723; +} +{ +mul.f16x2 r6439, r6436, r6334; +} +{ +add.f16x2 r6442, r4435, r6439; +} +{ +add.f16x2 r6445, r5079, r5401; +} +{ +mul.f16x2 r6448, r6445, r6338; +} +{ +add.f16x2 r6451, r6442, r6448; +} +{ +sub.f16x2 r6454, r4769, r5735; +} +{ +mul.f16x2 r6457, r6454, r6336; +} +{ +sub.f16x2 r6460, r5091, r5413; +} +{ +mul.f16x2 r6463, r6460, r6339; +} +{ +add.f16x2 r6466, r6457, r6463; +} +{ +sub.f16x2 r6469, r6451, r6466; +} +{ +add.f16x2 r6472, r4757, r5723; +} +{ +mul.f16x2 r6475, r6472, r6334; +} +{ +add.f16x2 r6478, r4435, r6475; +} +{ +add.f16x2 r6481, r5079, r5401; +} +{ 
+mul.f16x2 r6484, r6481, r6338; +} +{ +add.f16x2 r6487, r6478, r6484; +} +{ +sub.f16x2 r6490, r4769, r5735; +} +{ +mul.f16x2 r6493, r6490, r6336; +} +{ +sub.f16x2 r6496, r5091, r5413; +} +{ +mul.f16x2 r6499, r6496, r6339; +} +{ +add.f16x2 r6502, r6493, r6499; +} +{ +add.f16x2 r6505, r6487, r6502; +} +{ +add.f16x2 r6508, r4769, r5735; +} +{ +mul.f16x2 r6511, r6508, r6330; +} +{ +add.f16x2 r6514, r4447, r6511; +} +{ +add.f16x2 r6517, r5091, r5413; +} +{ +mul.f16x2 r6520, r6517, r6334; +} +{ +add.f16x2 r6523, r6514, r6520; +} +{ +sub.f16x2 r6526, r4757, r5723; +} +{ +mul.f16x2 r6529, r6526, r6332; +} +{ +sub.f16x2 r6532, r5079, r5401; +} +{ +mul.f16x2 r6535, r6532, r6336; +} +{ +add.f16x2 r6538, r6529, r6535; +} +{ +add.f16x2 r6541, r6523, r6538; +} +{ +add.f16x2 r6544, r4769, r5735; +} +{ +mul.f16x2 r6547, r6544, r6330; +} +{ +add.f16x2 r6550, r4447, r6547; +} +{ +add.f16x2 r6553, r5091, r5413; +} +{ +mul.f16x2 r6556, r6553, r6334; +} +{ +add.f16x2 r6559, r6550, r6556; +} +{ +sub.f16x2 r6562, r4757, r5723; +} +{ +mul.f16x2 r6565, r6562, r6332; +} +{ +sub.f16x2 r6568, r5079, r5401; +} +{ +mul.f16x2 r6571, r6568, r6336; +} +{ +add.f16x2 r6574, r6565, r6571; +} +{ +sub.f16x2 r6577, r6559, r6574; +} +{ +add.f16x2 r6580, r4769, r5735; +} +{ +mul.f16x2 r6583, r6580, r6334; +} +{ +add.f16x2 r6586, r4447, r6583; +} +{ +add.f16x2 r6589, r5091, r5413; +} +{ +mul.f16x2 r6592, r6589, r6338; +} +{ +add.f16x2 r6595, r6586, r6592; +} +{ +sub.f16x2 r6598, r4757, r5723; +} +{ +mul.f16x2 r6601, r6598, r6336; +} +{ +sub.f16x2 r6604, r5079, r5401; +} +{ +mul.f16x2 r6607, r6604, r6339; +} +{ +add.f16x2 r6610, r6601, r6607; +} +{ +add.f16x2 r6613, r6595, r6610; +} +{ +add.f16x2 r6616, r4769, r5735; +} +{ +mul.f16x2 r6619, r6616, r6334; +} +{ +add.f16x2 r6622, r4447, r6619; +} +{ +add.f16x2 r6625, r5091, r5413; +} +{ +mul.f16x2 r6628, r6625, r6338; +} +{ +add.f16x2 r6631, r6622, r6628; +} +{ +sub.f16x2 r6634, r4757, r5723; +} +{ +mul.f16x2 r6637, r6634, r6336; +} +{ +sub.f16x2 r6640, 
r5079, r5401; +} +{ +mul.f16x2 r6643, r6640, r6339; +} +{ +add.f16x2 r6646, r6637, r6643; +} +{ +sub.f16x2 r6649, r6631, r6646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6653, {low, high}; +} +{ +neg.f16x2 r6654, r6653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6657, {low, high}; +} +{ +neg.f16x2 r6658, r6657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6660, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6661, {low, high}; +} +{ +add.f16x2 r6662, r6080, r6128; +} +{ +add.f16x2 r6665, r4483, r6662; +} +{ +add.f16x2 r6668, r6096, r6112; +} +{ +add.f16x2 r6671, r6665, r6668; +} +{ +add.f16x2 r6674, r6086, r6134; +} +{ +add.f16x2 r6677, r4627, r6674; +} +{ +add.f16x2 r6680, r6102, r6118; +} +{ +add.f16x2 r6683, r6677, r6680; +} +{ +add.f16x2 r6686, r6080, r6128; +} +{ +mul.f16x2 r6689, r6686, r6652; +} +{ +add.f16x2 r6692, r4483, r6689; +} +{ +add.f16x2 r6695, r6096, r6112; +} +{ +mul.f16x2 r6698, r6695, r6656; +} +{ +add.f16x2 r6701, r6692, r6698; +} +{ +sub.f16x2 r6704, r6086, r6134; +} +{ +mul.f16x2 r6707, r6704, r6654; +} +{ +sub.f16x2 r6710, r6102, r6118; +} +{ +mul.f16x2 r6713, r6710, r6658; +} +{ +add.f16x2 r6716, r6707, r6713; +} +{ +sub.f16x2 r6719, r6701, r6716; +} +{ +add.f16x2 r6722, r6080, r6128; +} +{ +mul.f16x2 r6725, r6722, r6652; +} +{ +add.f16x2 r6728, r4483, r6725; +} +{ +add.f16x2 r6731, r6096, r6112; +} +{ +mul.f16x2 r6734, r6731, r6656; +} +{ +add.f16x2 r6737, r6728, r6734; +} +{ +sub.f16x2 r6740, r6086, r6134; +} +{ +mul.f16x2 r6743, r6740, r6654; +} +{ +sub.f16x2 r6746, r6102, r6118; +} +{ +mul.f16x2 r6749, r6746, 
r6658; +} +{ +add.f16x2 r6752, r6743, r6749; +} +{ +add.f16x2 r6755, r6737, r6752; +} +{ +add.f16x2 r6758, r6080, r6128; +} +{ +mul.f16x2 r6761, r6758, r6656; +} +{ +add.f16x2 r6764, r4483, r6761; +} +{ +add.f16x2 r6767, r6096, r6112; +} +{ +mul.f16x2 r6770, r6767, r6660; +} +{ +add.f16x2 r6773, r6764, r6770; +} +{ +sub.f16x2 r6776, r6086, r6134; +} +{ +mul.f16x2 r6779, r6776, r6658; +} +{ +sub.f16x2 r6782, r6102, r6118; +} +{ +mul.f16x2 r6785, r6782, r6661; +} +{ +add.f16x2 r6788, r6779, r6785; +} +{ +sub.f16x2 r6791, r6773, r6788; +} +{ +add.f16x2 r6794, r6080, r6128; +} +{ +mul.f16x2 r6797, r6794, r6656; +} +{ +add.f16x2 r6800, r4483, r6797; +} +{ +add.f16x2 r6803, r6096, r6112; +} +{ +mul.f16x2 r6806, r6803, r6660; +} +{ +add.f16x2 r6809, r6800, r6806; +} +{ +sub.f16x2 r6812, r6086, r6134; +} +{ +mul.f16x2 r6815, r6812, r6658; +} +{ +sub.f16x2 r6818, r6102, r6118; +} +{ +mul.f16x2 r6821, r6818, r6661; +} +{ +add.f16x2 r6824, r6815, r6821; +} +{ +add.f16x2 r6827, r6809, r6824; +} +{ +add.f16x2 r6830, r6086, r6134; +} +{ +mul.f16x2 r6833, r6830, r6652; +} +{ +add.f16x2 r6836, r4627, r6833; +} +{ +add.f16x2 r6839, r6102, r6118; +} +{ +mul.f16x2 r6842, r6839, r6656; +} +{ +add.f16x2 r6845, r6836, r6842; +} +{ +sub.f16x2 r6848, r6080, r6128; +} +{ +mul.f16x2 r6851, r6848, r6654; +} +{ +sub.f16x2 r6854, r6096, r6112; +} +{ +mul.f16x2 r6857, r6854, r6658; +} +{ +add.f16x2 r6860, r6851, r6857; +} +{ +add.f16x2 r6863, r6845, r6860; +} +{ +add.f16x2 r6866, r6086, r6134; +} +{ +mul.f16x2 r6869, r6866, r6652; +} +{ +add.f16x2 r6872, r4627, r6869; +} +{ +add.f16x2 r6875, r6102, r6118; +} +{ +mul.f16x2 r6878, r6875, r6656; +} +{ +add.f16x2 r6881, r6872, r6878; +} +{ +sub.f16x2 r6884, r6080, r6128; +} +{ +mul.f16x2 r6887, r6884, r6654; +} +{ +sub.f16x2 r6890, r6096, r6112; +} +{ +mul.f16x2 r6893, r6890, r6658; +} +{ +add.f16x2 r6896, r6887, r6893; +} +{ +sub.f16x2 r6899, r6881, r6896; +} +{ +add.f16x2 r6902, r6086, r6134; +} +{ +mul.f16x2 r6905, r6902, r6656; +} +{ +add.f16x2 
r6908, r4627, r6905; +} +{ +add.f16x2 r6911, r6102, r6118; +} +{ +mul.f16x2 r6914, r6911, r6660; +} +{ +add.f16x2 r6917, r6908, r6914; +} +{ +sub.f16x2 r6920, r6080, r6128; +} +{ +mul.f16x2 r6923, r6920, r6658; +} +{ +sub.f16x2 r6926, r6096, r6112; +} +{ +mul.f16x2 r6929, r6926, r6661; +} +{ +add.f16x2 r6932, r6923, r6929; +} +{ +add.f16x2 r6935, r6917, r6932; +} +{ +add.f16x2 r6938, r6086, r6134; +} +{ +mul.f16x2 r6941, r6938, r6656; +} +{ +add.f16x2 r6944, r4627, r6941; +} +{ +add.f16x2 r6947, r6102, r6118; +} +{ +mul.f16x2 r6950, r6947, r6660; +} +{ +add.f16x2 r6953, r6944, r6950; +} +{ +sub.f16x2 r6956, r6080, r6128; +} +{ +mul.f16x2 r6959, r6956, r6658; +} +{ +sub.f16x2 r6962, r6096, r6112; +} +{ +mul.f16x2 r6965, r6962, r6661; +} +{ +add.f16x2 r6968, r6959, r6965; +} +{ +sub.f16x2 r6971, r6953, r6968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6975, {low, high}; +} +{ +neg.f16x2 r6976, r6975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6979, {low, high}; +} +{ +neg.f16x2 r6980, r6979; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6983, {low, high}; +} +{ +add.f16x2 r6984, r6144, r6192; +} +{ +add.f16x2 r6987, r4555, r6984; +} +{ +add.f16x2 r6990, r6160, r6176; +} +{ +add.f16x2 r6993, r6987, r6990; +} +{ +add.f16x2 r6996, r6150, r6198; +} +{ +add.f16x2 r6999, r4699, r6996; +} +{ +add.f16x2 r7002, r6166, r6182; +} +{ +add.f16x2 r7005, r6999, r7002; +} +{ +add.f16x2 r7008, r6144, r6192; +} +{ +mul.f16x2 r7011, r7008, r6974; +} +{ +add.f16x2 r7014, r4555, r7011; +} +{ +add.f16x2 r7017, 
r6160, r6176; +} +{ +mul.f16x2 r7020, r7017, r6978; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6150, r6198; +} +{ +mul.f16x2 r7029, r7026, r6976; +} +{ +sub.f16x2 r7032, r6166, r6182; +} +{ +mul.f16x2 r7035, r7032, r6980; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +sub.f16x2 r7041, r7023, r7038; +} +{ +add.f16x2 r7044, r6144, r6192; +} +{ +mul.f16x2 r7047, r7044, r6974; +} +{ +add.f16x2 r7050, r4555, r7047; +} +{ +add.f16x2 r7053, r6160, r6176; +} +{ +mul.f16x2 r7056, r7053, r6978; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6150, r6198; +} +{ +mul.f16x2 r7065, r7062, r6976; +} +{ +sub.f16x2 r7068, r6166, r6182; +} +{ +mul.f16x2 r7071, r7068, r6980; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +add.f16x2 r7077, r7059, r7074; +} +{ +add.f16x2 r7080, r6144, r6192; +} +{ +mul.f16x2 r7083, r7080, r6978; +} +{ +add.f16x2 r7086, r4555, r7083; +} +{ +add.f16x2 r7089, r6160, r6176; +} +{ +mul.f16x2 r7092, r7089, r6982; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6150, r6198; +} +{ +mul.f16x2 r7101, r7098, r6980; +} +{ +sub.f16x2 r7104, r6166, r6182; +} +{ +mul.f16x2 r7107, r7104, r6983; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +sub.f16x2 r7113, r7095, r7110; +} +{ +add.f16x2 r7116, r6144, r6192; +} +{ +mul.f16x2 r7119, r7116, r6978; +} +{ +add.f16x2 r7122, r4555, r7119; +} +{ +add.f16x2 r7125, r6160, r6176; +} +{ +mul.f16x2 r7128, r7125, r6982; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6150, r6198; +} +{ +mul.f16x2 r7137, r7134, r6980; +} +{ +sub.f16x2 r7140, r6166, r6182; +} +{ +mul.f16x2 r7143, r7140, r6983; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 r7149, r7131, r7146; +} +{ +add.f16x2 r7152, r6150, r6198; +} +{ +mul.f16x2 r7155, r7152, r6974; +} +{ +add.f16x2 r7158, r4699, r7155; +} +{ +add.f16x2 r7161, r6166, r6182; +} +{ +mul.f16x2 r7164, r7161, r6978; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6144, r6192; +} +{ +mul.f16x2 r7173, r7170, r6976; +} +{ 
+sub.f16x2 r7176, r6160, r6176; +} +{ +mul.f16x2 r7179, r7176, r6980; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +add.f16x2 r7185, r7167, r7182; +} +{ +add.f16x2 r7188, r6150, r6198; +} +{ +mul.f16x2 r7191, r7188, r6974; +} +{ +add.f16x2 r7194, r4699, r7191; +} +{ +add.f16x2 r7197, r6166, r6182; +} +{ +mul.f16x2 r7200, r7197, r6978; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6144, r6192; +} +{ +mul.f16x2 r7209, r7206, r6976; +} +{ +sub.f16x2 r7212, r6160, r6176; +} +{ +mul.f16x2 r7215, r7212, r6980; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +sub.f16x2 r7221, r7203, r7218; +} +{ +add.f16x2 r7224, r6150, r6198; +} +{ +mul.f16x2 r7227, r7224, r6978; +} +{ +add.f16x2 r7230, r4699, r7227; +} +{ +add.f16x2 r7233, r6166, r6182; +} +{ +mul.f16x2 r7236, r7233, r6982; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6144, r6192; +} +{ +mul.f16x2 r7245, r7242, r6980; +} +{ +sub.f16x2 r7248, r6160, r6176; +} +{ +mul.f16x2 r7251, r7248, r6983; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +add.f16x2 r7257, r7239, r7254; +} +{ +add.f16x2 r7260, r6150, r6198; +} +{ +mul.f16x2 r7263, r7260, r6978; +} +{ +add.f16x2 r7266, r4699, r7263; +} +{ +add.f16x2 r7269, r6166, r6182; +} +{ +mul.f16x2 r7272, r7269, r6982; +} +{ +add.f16x2 r7275, r7266, r7272; +} +{ +sub.f16x2 r7278, r6144, r6192; +} +{ +mul.f16x2 r7281, r7278, r6980; +} +{ +sub.f16x2 r7284, r6160, r6176; +} +{ +mul.f16x2 r7287, r7284, r6983; +} +{ +add.f16x2 r7290, r7281, r7287; +} +{ +sub.f16x2 r7293, r7275, r7290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7296, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7297, {low, high}; +} +{ +neg.f16x2 r7298, r7297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7301, {low, high}; +} +{ 
+neg.f16x2 r7302, r7301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7305, {low, high}; +} +{ +add.f16x2 r7306, r6208, r6256; +} +{ +add.f16x2 r7309, r4591, r7306; +} +{ +add.f16x2 r7312, r6224, r6240; +} +{ +add.f16x2 r7315, r7309, r7312; +} +{ +add.f16x2 r7318, r6214, r6262; +} +{ +add.f16x2 r7321, r4735, r7318; +} +{ +add.f16x2 r7324, r6230, r6246; +} +{ +add.f16x2 r7327, r7321, r7324; +} +{ +add.f16x2 r7330, r6208, r6256; +} +{ +mul.f16x2 r7333, r7330, r7296; +} +{ +add.f16x2 r7336, r4591, r7333; +} +{ +add.f16x2 r7339, r6224, r6240; +} +{ +mul.f16x2 r7342, r7339, r7300; +} +{ +add.f16x2 r7345, r7336, r7342; +} +{ +sub.f16x2 r7348, r6214, r6262; +} +{ +mul.f16x2 r7351, r7348, r7298; +} +{ +sub.f16x2 r7354, r6230, r6246; +} +{ +mul.f16x2 r7357, r7354, r7302; +} +{ +add.f16x2 r7360, r7351, r7357; +} +{ +sub.f16x2 r7363, r7345, r7360; +} +{ +add.f16x2 r7366, r6208, r6256; +} +{ +mul.f16x2 r7369, r7366, r7296; +} +{ +add.f16x2 r7372, r4591, r7369; +} +{ +add.f16x2 r7375, r6224, r6240; +} +{ +mul.f16x2 r7378, r7375, r7300; +} +{ +add.f16x2 r7381, r7372, r7378; +} +{ +sub.f16x2 r7384, r6214, r6262; +} +{ +mul.f16x2 r7387, r7384, r7298; +} +{ +sub.f16x2 r7390, r6230, r6246; +} +{ +mul.f16x2 r7393, r7390, r7302; +} +{ +add.f16x2 r7396, r7387, r7393; +} +{ +add.f16x2 r7399, r7381, r7396; +} +{ +add.f16x2 r7402, r6208, r6256; +} +{ +mul.f16x2 r7405, r7402, r7300; +} +{ +add.f16x2 r7408, r4591, r7405; +} +{ +add.f16x2 r7411, r6224, r6240; +} +{ +mul.f16x2 r7414, r7411, r7304; +} +{ +add.f16x2 r7417, r7408, r7414; +} +{ +sub.f16x2 r7420, r6214, r6262; +} +{ +mul.f16x2 r7423, r7420, r7302; +} +{ +sub.f16x2 r7426, r6230, r6246; +} +{ +mul.f16x2 r7429, r7426, r7305; +} +{ +add.f16x2 r7432, r7423, r7429; +} +{ +sub.f16x2 r7435, r7417, r7432; +} +{ +add.f16x2 r7438, r6208, r6256; +} +{ +mul.f16x2 r7441, r7438, r7300; +} 
+{ +add.f16x2 r7444, r4591, r7441; +} +{ +add.f16x2 r7447, r6224, r6240; +} +{ +mul.f16x2 r7450, r7447, r7304; +} +{ +add.f16x2 r7453, r7444, r7450; +} +{ +sub.f16x2 r7456, r6214, r6262; +} +{ +mul.f16x2 r7459, r7456, r7302; +} +{ +sub.f16x2 r7462, r6230, r6246; +} +{ +mul.f16x2 r7465, r7462, r7305; +} +{ +add.f16x2 r7468, r7459, r7465; +} +{ +add.f16x2 r7471, r7453, r7468; +} +{ +add.f16x2 r7474, r6214, r6262; +} +{ +mul.f16x2 r7477, r7474, r7296; +} +{ +add.f16x2 r7480, r4735, r7477; +} +{ +add.f16x2 r7483, r6230, r6246; +} +{ +mul.f16x2 r7486, r7483, r7300; +} +{ +add.f16x2 r7489, r7480, r7486; +} +{ +sub.f16x2 r7492, r6208, r6256; +} +{ +mul.f16x2 r7495, r7492, r7298; +} +{ +sub.f16x2 r7498, r6224, r6240; +} +{ +mul.f16x2 r7501, r7498, r7302; +} +{ +add.f16x2 r7504, r7495, r7501; +} +{ +add.f16x2 r7507, r7489, r7504; +} +{ +add.f16x2 r7510, r6214, r6262; +} +{ +mul.f16x2 r7513, r7510, r7296; +} +{ +add.f16x2 r7516, r4735, r7513; +} +{ +add.f16x2 r7519, r6230, r6246; +} +{ +mul.f16x2 r7522, r7519, r7300; +} +{ +add.f16x2 r7525, r7516, r7522; +} +{ +sub.f16x2 r7528, r6208, r6256; +} +{ +mul.f16x2 r7531, r7528, r7298; +} +{ +sub.f16x2 r7534, r6224, r6240; +} +{ +mul.f16x2 r7537, r7534, r7302; +} +{ +add.f16x2 r7540, r7531, r7537; +} +{ +sub.f16x2 r7543, r7525, r7540; +} +{ +add.f16x2 r7546, r6214, r6262; +} +{ +mul.f16x2 r7549, r7546, r7300; +} +{ +add.f16x2 r7552, r4735, r7549; +} +{ +add.f16x2 r7555, r6230, r6246; +} +{ +mul.f16x2 r7558, r7555, r7304; +} +{ +add.f16x2 r7561, r7552, r7558; +} +{ +sub.f16x2 r7564, r6208, r6256; +} +{ +mul.f16x2 r7567, r7564, r7302; +} +{ +sub.f16x2 r7570, r6224, r6240; +} +{ +mul.f16x2 r7573, r7570, r7305; +} +{ +add.f16x2 r7576, r7567, r7573; +} +{ +add.f16x2 r7579, r7561, r7576; +} +{ +add.f16x2 r7582, r6214, r6262; +} +{ +mul.f16x2 r7585, r7582, r7300; +} +{ +add.f16x2 r7588, r4735, r7585; +} +{ +add.f16x2 r7591, r6230, r6246; +} +{ +mul.f16x2 r7594, r7591, r7304; +} +{ +add.f16x2 r7597, r7588, r7594; +} +{ +sub.f16x2 r7600, 
r6208, r6256; +} +{ +mul.f16x2 r7603, r7600, r7302; +} +{ +sub.f16x2 r7606, r6224, r6240; +} +{ +mul.f16x2 r7609, r7606, r7305; +} +{ +add.f16x2 r7612, r7603, r7609; +} +{ +sub.f16x2 r7615, r7597, r7612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7618, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7619, {low, high}; +} +{ +neg.f16x2 r7620, r7619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7623, {low, high}; +} +{ +neg.f16x2 r7624, r7623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7627, {low, high}; +} +{ +add.f16x2 r7628, r6272, r6320; +} +{ +add.f16x2 r7631, r4519, r7628; +} +{ +add.f16x2 r7634, r6288, r6304; +} +{ +add.f16x2 r7637, r7631, r7634; +} +{ +add.f16x2 r7640, r6278, r6326; +} +{ +add.f16x2 r7643, r4663, r7640; +} +{ +add.f16x2 r7646, r6294, r6310; +} +{ +add.f16x2 r7649, r7643, r7646; +} +{ +add.f16x2 r7652, r6272, r6320; +} +{ +mul.f16x2 r7655, r7652, r7618; +} +{ +add.f16x2 r7658, r4519, r7655; +} +{ +add.f16x2 r7661, r6288, r6304; +} +{ +mul.f16x2 r7664, r7661, r7622; +} +{ +add.f16x2 r7667, r7658, r7664; +} +{ +sub.f16x2 r7670, r6278, r6326; +} +{ +mul.f16x2 r7673, r7670, r7620; +} +{ +sub.f16x2 r7676, r6294, r6310; +} +{ +mul.f16x2 r7679, r7676, r7624; +} +{ +add.f16x2 r7682, r7673, r7679; +} +{ +sub.f16x2 r7685, r7667, r7682; +} +{ +add.f16x2 r7688, r6272, r6320; +} +{ +mul.f16x2 r7691, r7688, r7618; +} +{ +add.f16x2 r7694, r4519, r7691; +} +{ +add.f16x2 r7697, r6288, r6304; +} +{ +mul.f16x2 r7700, r7697, r7622; +} +{ +add.f16x2 r7703, r7694, r7700; +} +{ +sub.f16x2 r7706, r6278, r6326; +} +{ +mul.f16x2 r7709, r7706, 
r7620; +} +{ +sub.f16x2 r7712, r6294, r6310; +} +{ +mul.f16x2 r7715, r7712, r7624; +} +{ +add.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7703, r7718; +} +{ +add.f16x2 r7724, r6272, r6320; +} +{ +mul.f16x2 r7727, r7724, r7622; +} +{ +add.f16x2 r7730, r4519, r7727; +} +{ +add.f16x2 r7733, r6288, r6304; +} +{ +mul.f16x2 r7736, r7733, r7626; +} +{ +add.f16x2 r7739, r7730, r7736; +} +{ +sub.f16x2 r7742, r6278, r6326; +} +{ +mul.f16x2 r7745, r7742, r7624; +} +{ +sub.f16x2 r7748, r6294, r6310; +} +{ +mul.f16x2 r7751, r7748, r7627; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +sub.f16x2 r7757, r7739, r7754; +} +{ +add.f16x2 r7760, r6272, r6320; +} +{ +mul.f16x2 r7763, r7760, r7622; +} +{ +add.f16x2 r7766, r4519, r7763; +} +{ +add.f16x2 r7769, r6288, r6304; +} +{ +mul.f16x2 r7772, r7769, r7626; +} +{ +add.f16x2 r7775, r7766, r7772; +} +{ +sub.f16x2 r7778, r6278, r6326; +} +{ +mul.f16x2 r7781, r7778, r7624; +} +{ +sub.f16x2 r7784, r6294, r6310; +} +{ +mul.f16x2 r7787, r7784, r7627; +} +{ +add.f16x2 r7790, r7781, r7787; +} +{ +add.f16x2 r7793, r7775, r7790; +} +{ +add.f16x2 r7796, r6278, r6326; +} +{ +mul.f16x2 r7799, r7796, r7618; +} +{ +add.f16x2 r7802, r4663, r7799; +} +{ +add.f16x2 r7805, r6294, r6310; +} +{ +mul.f16x2 r7808, r7805, r7622; +} +{ +add.f16x2 r7811, r7802, r7808; +} +{ +sub.f16x2 r7814, r6272, r6320; +} +{ +mul.f16x2 r7817, r7814, r7620; +} +{ +sub.f16x2 r7820, r6288, r6304; +} +{ +mul.f16x2 r7823, r7820, r7624; +} +{ +add.f16x2 r7826, r7817, r7823; +} +{ +add.f16x2 r7829, r7811, r7826; +} +{ +add.f16x2 r7832, r6278, r6326; +} +{ +mul.f16x2 r7835, r7832, r7618; +} +{ +add.f16x2 r7838, r4663, r7835; +} +{ +add.f16x2 r7841, r6294, r6310; +} +{ +mul.f16x2 r7844, r7841, r7622; +} +{ +add.f16x2 r7847, r7838, r7844; +} +{ +sub.f16x2 r7850, r6272, r6320; +} +{ +mul.f16x2 r7853, r7850, r7620; +} +{ +sub.f16x2 r7856, r6288, r6304; +} +{ +mul.f16x2 r7859, r7856, r7624; +} +{ +add.f16x2 r7862, r7853, r7859; +} +{ +sub.f16x2 r7865, r7847, r7862; +} +{ +add.f16x2 
r7868, r6278, r6326; +} +{ +mul.f16x2 r7871, r7868, r7622; +} +{ +add.f16x2 r7874, r4663, r7871; +} +{ +add.f16x2 r7877, r6294, r6310; +} +{ +mul.f16x2 r7880, r7877, r7626; +} +{ +add.f16x2 r7883, r7874, r7880; +} +{ +sub.f16x2 r7886, r6272, r6320; +} +{ +mul.f16x2 r7889, r7886, r7624; +} +{ +sub.f16x2 r7892, r6288, r6304; +} +{ +mul.f16x2 r7895, r7892, r7627; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7883, r7898; +} +{ +add.f16x2 r7904, r6278, r6326; +} +{ +mul.f16x2 r7907, r7904, r7622; +} +{ +add.f16x2 r7910, r4663, r7907; +} +{ +add.f16x2 r7913, r6294, r6310; +} +{ +mul.f16x2 r7916, r7913, r7626; +} +{ +add.f16x2 r7919, r7910, r7916; +} +{ +sub.f16x2 r7922, r6272, r6320; +} +{ +mul.f16x2 r7925, r7922, r7624; +} +{ +sub.f16x2 r7928, r6288, r6304; +} +{ +mul.f16x2 r7931, r7928, r7627; +} +{ +add.f16x2 r7934, r7925, r7931; +} +{ +sub.f16x2 r7937, r7919, r7934; +} +mul.wide.u32 rd4, r10447, 1374389535; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r10451, rd5; +cvt.rn.f32.u32 f600, r10451; +mul.f32 f601, f600, 0f3D4DE32E; +cos.approx.f32 f485, f601; +sin.approx.f32 f602, f601; +neg.f32 f486, f602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f485; +cvt.rn.f16.f32 high, f486; +mov.b32 r7940, {low, high}; +} +mul.lo.s32 r10452, r10451, 25; +sub.s32 r10453, r10447, r10452; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7945, {high, high}; +} +{ +mul.f16x2 r7947, r6683, r7945; +} +{ +fma.rn.f16x2 r7950, r6671, r7943, r7947; +} +{ +mul.f16x2 r7954, r6671, r7945; +} +{ +neg.f16x2 r7957, r7954; +} +{ +fma.rn.f16x2 r7959, r6683, r7943, r7957; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7963, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7965, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7967, {low, high}; +} +{ +mul.f16x2 r7968, r7965, r7967; +} 
+{ +mul.f16x2 r7971, r7940, r7963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7974, {high, low}; +} +{ +fma.rn.f16x2 r7976, r7968, r7974, r7971; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r7980, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r7982, {high, high}; +} +{ +mul.f16x2 r7984, r7005, r7982; +} +{ +fma.rn.f16x2 r7987, r6993, r7980, r7984; +} +{ +mul.f16x2 r7991, r6993, r7982; +} +{ +neg.f16x2 r7994, r7991; +} +{ +fma.rn.f16x2 r7996, r7005, r7980, r7994; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8000, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8002, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8004, {low, high}; +} +{ +mul.f16x2 r8005, r8002, r8004; +} +{ +mul.f16x2 r8008, r7976, r8000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r8011, {high, low}; +} +{ +fma.rn.f16x2 r8013, r8005, r8011, r8008; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8019, {high, high}; +} +{ +mul.f16x2 r8021, r7327, r8019; +} +{ +fma.rn.f16x2 r8024, r7315, r8017, r8021; +} +{ +mul.f16x2 r8028, r7315, r8019; +} +{ +neg.f16x2 r8031, r8028; +} +{ +fma.rn.f16x2 r8033, r7327, r8017, r8031; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8037, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8039, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8041, {low, high}; +} +{ +mul.f16x2 r8042, r8039, r8041; +} +{ +mul.f16x2 r8045, r8013, r8037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8048, {high, low}; +} +{ +fma.rn.f16x2 r8050, r8042, r8048, r8045; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8054, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8056, {high, high}; +} +{ +mul.f16x2 r8058, r7649, r8056; +} +{ +fma.rn.f16x2 r8061, r7637, r8054, r8058; +} +{ +mul.f16x2 r8065, r7637, r8056; +} +{ +neg.f16x2 r8068, r8065; +} +{ +fma.rn.f16x2 r8070, r7649, r8054, r8068; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8074, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8076, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8078, {low, high}; +} +{ +mul.f16x2 r8079, r8076, r8078; +} +{ +mul.f16x2 r8082, r8050, r8074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8085, {high, low}; +} +{ +fma.rn.f16x2 r8087, r8079, r8085, r8082; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8091, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8093, {high, high}; +} +{ +mul.f16x2 r8095, r6541, r8093; +} +{ +fma.rn.f16x2 r8098, r6397, r8091, r8095; +} +{ +mul.f16x2 r8102, r6397, r8093; +} +{ +neg.f16x2 r8105, r8102; +} +{ +fma.rn.f16x2 r8107, r6541, r8091, r8105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8115, {low, high}; +} +{ +mul.f16x2 r8116, r8113, r8115; +} +{ +mul.f16x2 r8119, r8087, r8111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8122, {high, low}; +} +{ +fma.rn.f16x2 r8124, r8116, r8122, r8119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8130, {high, high}; +} +{ +mul.f16x2 r8132, r6863, r8130; +} +{ +fma.rn.f16x2 r8135, r6719, r8128, r8132; +} +{ +mul.f16x2 r8139, r6719, r8130; +} +{ +neg.f16x2 r8142, r8139; +} +{ +fma.rn.f16x2 
r8144, r6863, r8128, r8142; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8148, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8150, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8152, {low, high}; +} +{ +mul.f16x2 r8153, r8150, r8152; +} +{ +mul.f16x2 r8156, r8124, r8148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8159, {high, low}; +} +{ +fma.rn.f16x2 r8161, r8153, r8159, r8156; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8165, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8167, {high, high}; +} +{ +mul.f16x2 r8169, r7185, r8167; +} +{ +fma.rn.f16x2 r8172, r7041, r8165, r8169; +} +{ +mul.f16x2 r8176, r7041, r8167; +} +{ +neg.f16x2 r8179, r8176; +} +{ +fma.rn.f16x2 r8181, r7185, r8165, r8179; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8185, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8187, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8189, {low, high}; +} +{ +mul.f16x2 r8190, r8187, r8189; +} +{ +mul.f16x2 r8193, r8161, r8185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8196, {high, low}; +} +{ +fma.rn.f16x2 r8198, r8190, r8196, r8193; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8202, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8204, {high, high}; +} +{ +mul.f16x2 r8206, r7507, r8204; +} +{ +fma.rn.f16x2 r8209, r7363, r8202, r8206; +} +{ +mul.f16x2 r8213, r7363, r8204; +} +{ +neg.f16x2 r8216, r8213; +} +{ +fma.rn.f16x2 r8218, r7507, r8202, r8216; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8222, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8224, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; 
+cvt.rn.f16.f32 high, f534; +mov.b32 r8226, {low, high}; +} +{ +mul.f16x2 r8227, r8224, r8226; +} +{ +mul.f16x2 r8230, r8198, r8222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8233, {high, low}; +} +{ +fma.rn.f16x2 r8235, r8227, r8233, r8230; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8239, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8241, {high, high}; +} +{ +mul.f16x2 r8243, r7829, r8241; +} +{ +fma.rn.f16x2 r8246, r7685, r8239, r8243; +} +{ +mul.f16x2 r8250, r7685, r8241; +} +{ +neg.f16x2 r8253, r8250; +} +{ +fma.rn.f16x2 r8255, r7829, r8239, r8253; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8259, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8261, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8263, {low, high}; +} +{ +mul.f16x2 r8264, r8261, r8263; +} +{ +mul.f16x2 r8267, r8235, r8259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8270, {high, low}; +} +{ +fma.rn.f16x2 r8272, r8264, r8270, r8267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8278, {high, high}; +} +{ +mul.f16x2 r8280, r6613, r8278; +} +{ +fma.rn.f16x2 r8283, r6469, r8276, r8280; +} +{ +mul.f16x2 r8287, r6469, r8278; +} +{ +neg.f16x2 r8290, r8287; +} +{ +fma.rn.f16x2 r8292, r6613, r8276, r8290; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8296, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8298, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8300, {low, high}; +} +{ +mul.f16x2 r8301, r8298, r8300; +} +{ +mul.f16x2 r8304, r8272, r8296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8307, {high, low}; +} +{ +fma.rn.f16x2 r8309, r8301, r8307, 
r8304; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8313, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8315, {high, high}; +} +{ +mul.f16x2 r8317, r6935, r8315; +} +{ +fma.rn.f16x2 r8320, r6791, r8313, r8317; +} +{ +mul.f16x2 r8324, r6791, r8315; +} +{ +neg.f16x2 r8327, r8324; +} +{ +fma.rn.f16x2 r8329, r6935, r8313, r8327; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8333, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8335, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8337, {low, high}; +} +{ +mul.f16x2 r8338, r8335, r8337; +} +{ +mul.f16x2 r8341, r8309, r8333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8344, {high, low}; +} +{ +fma.rn.f16x2 r8346, r8338, r8344, r8341; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8350, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8352, {high, high}; +} +{ +mul.f16x2 r8354, r7257, r8352; +} +{ +fma.rn.f16x2 r8357, r7113, r8350, r8354; +} +{ +mul.f16x2 r8361, r7113, r8352; +} +{ +neg.f16x2 r8364, r8361; +} +{ +fma.rn.f16x2 r8366, r7257, r8350, r8364; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8370, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8372, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8374, {low, high}; +} +{ +mul.f16x2 r8375, r8372, r8374; +} +{ +mul.f16x2 r8378, r8346, r8370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8381, {high, low}; +} +{ +fma.rn.f16x2 r8383, r8375, r8381, r8378; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8387, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8389, {high, high}; +} +{ +mul.f16x2 r8391, r7579, r8389; +} +{ +fma.rn.f16x2 r8394, r7435, r8387, 
r8391; +} +{ +mul.f16x2 r8398, r7435, r8389; +} +{ +neg.f16x2 r8401, r8398; +} +{ +fma.rn.f16x2 r8403, r7579, r8387, r8401; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8409, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8411, {low, high}; +} +{ +mul.f16x2 r8412, r8409, r8411; +} +{ +mul.f16x2 r8415, r8383, r8407; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8418, {high, low}; +} +{ +fma.rn.f16x2 r8420, r8412, r8418, r8415; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8424, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8426, {high, high}; +} +{ +mul.f16x2 r8428, r7901, r8426; +} +{ +fma.rn.f16x2 r8431, r7757, r8424, r8428; +} +{ +mul.f16x2 r8435, r7757, r8426; +} +{ +neg.f16x2 r8438, r8435; +} +{ +fma.rn.f16x2 r8440, r7901, r8424, r8438; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8444, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8446, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8448, {low, high}; +} +{ +mul.f16x2 r8449, r8446, r8448; +} +{ +mul.f16x2 r8452, r8420, r8444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8455, {high, low}; +} +{ +fma.rn.f16x2 r8457, r8449, r8455, r8452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8457; +mov.b32 r8461, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8457; +mov.b32 r8463, {high, high}; +} +{ +mul.f16x2 r8465, r6649, r8463; +} +{ +fma.rn.f16x2 r8468, r6505, r8461, r8465; +} +{ +mul.f16x2 r8472, r6505, r8463; +} +{ +neg.f16x2 r8475, r8472; +} +{ +fma.rn.f16x2 r8477, r6649, r8461, r8475; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8481, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r7940; +mov.b32 r8483, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8485, {low, high}; +} +{ +mul.f16x2 r8486, r8483, r8485; +} +{ +mul.f16x2 r8489, r8457, r8481; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8457; +mov.b32 r8492, {high, low}; +} +{ +fma.rn.f16x2 r8494, r8486, r8492, r8489; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8498, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8500, {high, high}; +} +{ +mul.f16x2 r8502, r6971, r8500; +} +{ +fma.rn.f16x2 r8505, r6827, r8498, r8502; +} +{ +mul.f16x2 r8509, r6827, r8500; +} +{ +neg.f16x2 r8512, r8509; +} +{ +fma.rn.f16x2 r8514, r6971, r8498, r8512; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8518, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8520, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8522, {low, high}; +} +{ +mul.f16x2 r8523, r8520, r8522; +} +{ +mul.f16x2 r8526, r8494, r8518; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8529, {high, low}; +} +{ +fma.rn.f16x2 r8531, r8523, r8529, r8526; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8531; +mov.b32 r8535, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8531; +mov.b32 r8537, {high, high}; +} +{ +mul.f16x2 r8539, r7293, r8537; +} +{ +fma.rn.f16x2 r8542, r7149, r8535, r8539; +} +{ +mul.f16x2 r8546, r7149, r8537; +} +{ +neg.f16x2 r8549, r8546; +} +{ +fma.rn.f16x2 r8551, r7293, r8535, r8549; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8555, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8557, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8559, {low, high}; +} +{ +mul.f16x2 r8560, r8557, r8559; +} +{ +mul.f16x2 r8563, r8531, r8555; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r8531; +mov.b32 r8566, {high, low}; +} +{ +fma.rn.f16x2 r8568, r8560, r8566, r8563; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; +mov.b32 r8572, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; +mov.b32 r8574, {high, high}; +} +{ +mul.f16x2 r8576, r7615, r8574; +} +{ +fma.rn.f16x2 r8579, r7471, r8572, r8576; +} +{ +mul.f16x2 r8583, r7471, r8574; +} +{ +neg.f16x2 r8586, r8583; +} +{ +fma.rn.f16x2 r8588, r7615, r8572, r8586; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8592, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8594, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8596, {low, high}; +} +{ +mul.f16x2 r8597, r8594, r8596; +} +{ +mul.f16x2 r8600, r8568, r8592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; +mov.b32 r8603, {high, low}; +} +{ +fma.rn.f16x2 r8605, r8597, r8603, r8600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8609, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8611, {high, high}; +} +{ +mul.f16x2 r8613, r7937, r8611; +} +{ +fma.rn.f16x2 r8616, r7793, r8609, r8613; +} +{ +mul.f16x2 r8620, r7793, r8611; +} +{ +neg.f16x2 r8623, r8620; +} +{ +fma.rn.f16x2 r8625, r7937, r8609, r8623; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8631, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8633, {low, high}; +} +{ +mul.f16x2 r8634, r8631, r8633; +} +{ +mul.f16x2 r8637, r8605, r8629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8640, {high, low}; +} +{ +fma.rn.f16x2 r8642, r8634, r8640, r8637; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8646, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8648, {high, 
high}; +} +{ +mul.f16x2 r8650, r6577, r8648; +} +{ +fma.rn.f16x2 r8653, r6433, r8646, r8650; +} +{ +mul.f16x2 r8657, r6433, r8648; +} +{ +neg.f16x2 r8660, r8657; +} +{ +fma.rn.f16x2 r8662, r6577, r8646, r8660; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8668, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8670, {low, high}; +} +{ +mul.f16x2 r8671, r8668, r8670; +} +{ +mul.f16x2 r8674, r8642, r8666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8677, {high, low}; +} +{ +fma.rn.f16x2 r8679, r8671, r8677, r8674; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8683, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8685, {high, high}; +} +{ +mul.f16x2 r8687, r6899, r8685; +} +{ +fma.rn.f16x2 r8690, r6755, r8683, r8687; +} +{ +mul.f16x2 r8694, r6755, r8685; +} +{ +neg.f16x2 r8697, r8694; +} +{ +fma.rn.f16x2 r8699, r6899, r8683, r8697; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8705, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8707, {low, high}; +} +{ +mul.f16x2 r8708, r8705, r8707; +} +{ +mul.f16x2 r8711, r8679, r8703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8714, {high, low}; +} +{ +fma.rn.f16x2 r8716, r8708, r8714, r8711; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8722, {high, high}; +} +{ +mul.f16x2 r8724, r7221, r8722; +} +{ +fma.rn.f16x2 r8727, r7077, r8720, r8724; +} +{ +mul.f16x2 r8731, r7077, r8722; +} +{ +neg.f16x2 r8734, r8731; +} +{ +fma.rn.f16x2 r8736, r7221, r8720, r8734; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r7940; +mov.b32 r8740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8744, {low, high}; +} +{ +mul.f16x2 r8745, r8742, r8744; +} +{ +mul.f16x2 r8748, r8716, r8740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8751, {high, low}; +} +{ +fma.rn.f16x2 r8753, r8745, r8751, r8748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8759, {high, high}; +} +{ +mul.f16x2 r8761, r7543, r8759; +} +{ +fma.rn.f16x2 r8764, r7399, r8757, r8761; +} +{ +mul.f16x2 r8768, r7399, r8759; +} +{ +neg.f16x2 r8771, r8768; +} +{ +fma.rn.f16x2 r8773, r7543, r8757, r8771; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8781, {low, high}; +} +{ +mul.f16x2 r8782, r8779, r8781; +} +{ +mul.f16x2 r8785, r8753, r8777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8788, {high, low}; +} +{ +fma.rn.f16x2 r8790, r8782, r8788, r8785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8790; +mov.b32 r8794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8790; +mov.b32 r8796, {high, high}; +} +{ +mul.f16x2 r8798, r7865, r8796; +} +{ +fma.rn.f16x2 r8801, r7721, r8794, r8798; +} +{ +mul.f16x2 r8805, r7721, r8796; +} +{ +neg.f16x2 r8808, r8805; +} +{ +fma.rn.f16x2 r8810, r7865, r8794, r8808; +} +shl.b32 r10454, r10453, 3; +add.s32 r10455, r10448, r10454; +barrier.sync 0; +mad.lo.s32 r10456, r10451, 5000, r10455; +st.shared.u32 [r10456], r6349; +st.shared.u32 [r10456+4], r6361; +st.shared.u32 [r10456+200], r7950; +st.shared.u32 [r10456+204], r7959; +st.shared.u32 [r10456+400], r7987; +st.shared.u32 
[r10456+404], r7996; +st.shared.u32 [r10456+600], r8024; +st.shared.u32 [r10456+604], r8033; +st.shared.u32 [r10456+800], r8061; +st.shared.u32 [r10456+804], r8070; +st.shared.u32 [r10456+1000], r8098; +st.shared.u32 [r10456+1004], r8107; +st.shared.u32 [r10456+1200], r8135; +st.shared.u32 [r10456+1204], r8144; +st.shared.u32 [r10456+1400], r8172; +st.shared.u32 [r10456+1404], r8181; +st.shared.u32 [r10456+1600], r8209; +st.shared.u32 [r10456+1604], r8218; +st.shared.u32 [r10456+1800], r8246; +st.shared.u32 [r10456+1804], r8255; +st.shared.u32 [r10456+2000], r8283; +st.shared.u32 [r10456+2004], r8292; +st.shared.u32 [r10456+2200], r8320; +st.shared.u32 [r10456+2204], r8329; +st.shared.u32 [r10456+2400], r8357; +st.shared.u32 [r10456+2404], r8366; +st.shared.u32 [r10456+2600], r8394; +st.shared.u32 [r10456+2604], r8403; +st.shared.u32 [r10456+2800], r8431; +st.shared.u32 [r10456+2804], r8440; +st.shared.u32 [r10456+3000], r8468; +st.shared.u32 [r10456+3004], r8477; +st.shared.u32 [r10456+3200], r8505; +st.shared.u32 [r10456+3204], r8514; +st.shared.u32 [r10456+3400], r8542; +st.shared.u32 [r10456+3404], r8551; +st.shared.u32 [r10456+3600], r8579; +st.shared.u32 [r10456+3604], r8588; +st.shared.u32 [r10456+3800], r8616; +st.shared.u32 [r10456+3804], r8625; +st.shared.u32 [r10456+4000], r8653; +st.shared.u32 [r10456+4004], r8662; +st.shared.u32 [r10456+4200], r8690; +st.shared.u32 [r10456+4204], r8699; +st.shared.u32 [r10456+4400], r8727; +st.shared.u32 [r10456+4404], r8736; +st.shared.u32 [r10456+4600], r8764; +st.shared.u32 [r10456+4604], r8773; +st.shared.u32 [r10456+4800], r8801; +st.shared.u32 [r10456+4804], r8810; +barrier.sync 0; +ld.shared.u32 r8845, [r10450]; +ld.shared.u32 r8857, [r10450+4]; +ld.shared.u32 r9167, [r10450+1000]; +ld.shared.u32 r9179, [r10450+1004]; +ld.shared.u32 r9489, [r10450+2000]; +ld.shared.u32 r9501, [r10450+2004]; +ld.shared.u32 r9811, [r10450+3000]; +ld.shared.u32 r9823, [r10450+3004]; +ld.shared.u32 r10133, [r10450+4000]; 
+ld.shared.u32 r10145, [r10450+4004]; +ld.shared.u32 r8842, [r10450+5000]; +ld.shared.u32 r8854, [r10450+5004]; +ld.shared.u32 r9164, [r10450+6000]; +ld.shared.u32 r9176, [r10450+6004]; +ld.shared.u32 r9486, [r10450+7000]; +ld.shared.u32 r9498, [r10450+7004]; +ld.shared.u32 r9808, [r10450+8000]; +ld.shared.u32 r9820, [r10450+8004]; +ld.shared.u32 r10130, [r10450+9000]; +ld.shared.u32 r10142, [r10450+9004]; +ld.shared.u32 r8848, [r10450+10000]; +ld.shared.u32 r8860, [r10450+10004]; +ld.shared.u32 r9170, [r10450+11000]; +ld.shared.u32 r9182, [r10450+11004]; +ld.shared.u32 r9492, [r10450+12000]; +ld.shared.u32 r9504, [r10450+12004]; +ld.shared.u32 r9814, [r10450+13000]; +ld.shared.u32 r9826, [r10450+13004]; +ld.shared.u32 r10136, [r10450+14000]; +ld.shared.u32 r10148, [r10450+14004]; +ld.shared.u32 r8849, [r10450+15000]; +ld.shared.u32 r8861, [r10450+15004]; +ld.shared.u32 r9171, [r10450+16000]; +ld.shared.u32 r9183, [r10450+16004]; +ld.shared.u32 r9493, [r10450+17000]; +ld.shared.u32 r9505, [r10450+17004]; +ld.shared.u32 r9815, [r10450+18000]; +ld.shared.u32 r9827, [r10450+18004]; +ld.shared.u32 r10137, [r10450+19000]; +ld.shared.u32 r10149, [r10450+19004]; +ld.shared.u32 r8843, [r10450+20000]; +ld.shared.u32 r8855, [r10450+20004]; +ld.shared.u32 r9165, [r10450+21000]; +ld.shared.u32 r9177, [r10450+21004]; +ld.shared.u32 r9487, [r10450+22000]; +ld.shared.u32 r9499, [r10450+22004]; +ld.shared.u32 r9809, [r10450+23000]; +ld.shared.u32 r9821, [r10450+23004]; +ld.shared.u32 r10131, [r10450+24000]; +ld.shared.u32 r10143, [r10450+24004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8831, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8832, {low, high}; +} +{ +neg.f16x2 r8833, r8832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r8835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, 
f592; +mov.b32 r8836, {low, high}; +} +{ +neg.f16x2 r8837, r8836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8839, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8840, {low, high}; +} +{ +add.f16x2 r8841, r8842, r8843; +} +{ +add.f16x2 r8844, r8845, r8841; +} +{ +add.f16x2 r8847, r8848, r8849; +} +{ +add.f16x2 %0, r8844, r8847; +} +{ +add.f16x2 r8853, r8854, r8855; +} +{ +add.f16x2 r8856, r8857, r8853; +} +{ +add.f16x2 r8859, r8860, r8861; +} +{ +add.f16x2 %1, r8856, r8859; +} +{ +add.f16x2 r8865, r8842, r8843; +} +{ +mul.f16x2 r8868, r8865, r8831; +} +{ +add.f16x2 r8871, r8845, r8868; +} +{ +add.f16x2 r8874, r8848, r8849; +} +{ +mul.f16x2 r8877, r8874, r8835; +} +{ +add.f16x2 r8880, r8871, r8877; +} +{ +sub.f16x2 r8883, r8854, r8855; +} +{ +mul.f16x2 r8886, r8883, r8833; +} +{ +sub.f16x2 r8889, r8860, r8861; +} +{ +mul.f16x2 r8892, r8889, r8837; +} +{ +add.f16x2 r8895, r8886, r8892; +} +{ +sub.f16x2 %10, r8880, r8895; +} +{ +add.f16x2 r8901, r8842, r8843; +} +{ +mul.f16x2 r8904, r8901, r8831; +} +{ +add.f16x2 r8907, r8845, r8904; +} +{ +add.f16x2 r8910, r8848, r8849; +} +{ +mul.f16x2 r8913, r8910, r8835; +} +{ +add.f16x2 r8916, r8907, r8913; +} +{ +sub.f16x2 r8919, r8854, r8855; +} +{ +mul.f16x2 r8922, r8919, r8833; +} +{ +sub.f16x2 r8925, r8860, r8861; +} +{ +mul.f16x2 r8928, r8925, r8837; +} +{ +add.f16x2 r8931, r8922, r8928; +} +{ +add.f16x2 %40, r8916, r8931; +} +{ +add.f16x2 r8937, r8842, r8843; +} +{ +mul.f16x2 r8940, r8937, r8835; +} +{ +add.f16x2 r8943, r8845, r8940; +} +{ +add.f16x2 r8946, r8848, r8849; +} +{ +mul.f16x2 r8949, r8946, r8839; +} +{ +add.f16x2 r8952, r8943, r8949; +} +{ +sub.f16x2 r8955, r8854, r8855; +} +{ +mul.f16x2 r8958, r8955, r8837; +} +{ +sub.f16x2 r8961, r8860, r8861; +} +{ +mul.f16x2 r8964, r8961, r8840; +} +{ +add.f16x2 r8967, r8958, r8964; +} +{ +sub.f16x2 %20, r8952, r8967; +} +{ +add.f16x2 r8973, r8842, r8843; +} +{ 
+mul.f16x2 r8976, r8973, r8835; +} +{ +add.f16x2 r8979, r8845, r8976; +} +{ +add.f16x2 r8982, r8848, r8849; +} +{ +mul.f16x2 r8985, r8982, r8839; +} +{ +add.f16x2 r8988, r8979, r8985; +} +{ +sub.f16x2 r8991, r8854, r8855; +} +{ +mul.f16x2 r8994, r8991, r8837; +} +{ +sub.f16x2 r8997, r8860, r8861; +} +{ +mul.f16x2 r9000, r8997, r8840; +} +{ +add.f16x2 r9003, r8994, r9000; +} +{ +add.f16x2 %30, r8988, r9003; +} +{ +add.f16x2 r9009, r8854, r8855; +} +{ +mul.f16x2 r9012, r9009, r8831; +} +{ +add.f16x2 r9015, r8857, r9012; +} +{ +add.f16x2 r9018, r8860, r8861; +} +{ +mul.f16x2 r9021, r9018, r8835; +} +{ +add.f16x2 r9024, r9015, r9021; +} +{ +sub.f16x2 r9027, r8842, r8843; +} +{ +mul.f16x2 r9030, r9027, r8833; +} +{ +sub.f16x2 r9033, r8848, r8849; +} +{ +mul.f16x2 r9036, r9033, r8837; +} +{ +add.f16x2 r9039, r9030, r9036; +} +{ +add.f16x2 %11, r9024, r9039; +} +{ +add.f16x2 r9045, r8854, r8855; +} +{ +mul.f16x2 r9048, r9045, r8831; +} +{ +add.f16x2 r9051, r8857, r9048; +} +{ +add.f16x2 r9054, r8860, r8861; +} +{ +mul.f16x2 r9057, r9054, r8835; +} +{ +add.f16x2 r9060, r9051, r9057; +} +{ +sub.f16x2 r9063, r8842, r8843; +} +{ +mul.f16x2 r9066, r9063, r8833; +} +{ +sub.f16x2 r9069, r8848, r8849; +} +{ +mul.f16x2 r9072, r9069, r8837; +} +{ +add.f16x2 r9075, r9066, r9072; +} +{ +sub.f16x2 %41, r9060, r9075; +} +{ +add.f16x2 r9081, r8854, r8855; +} +{ +mul.f16x2 r9084, r9081, r8835; +} +{ +add.f16x2 r9087, r8857, r9084; +} +{ +add.f16x2 r9090, r8860, r8861; +} +{ +mul.f16x2 r9093, r9090, r8839; +} +{ +add.f16x2 r9096, r9087, r9093; +} +{ +sub.f16x2 r9099, r8842, r8843; +} +{ +mul.f16x2 r9102, r9099, r8837; +} +{ +sub.f16x2 r9105, r8848, r8849; +} +{ +mul.f16x2 r9108, r9105, r8840; +} +{ +add.f16x2 r9111, r9102, r9108; +} +{ +add.f16x2 %21, r9096, r9111; +} +{ +add.f16x2 r9117, r8854, r8855; +} +{ +mul.f16x2 r9120, r9117, r8835; +} +{ +add.f16x2 r9123, r8857, r9120; +} +{ +add.f16x2 r9126, r8860, r8861; +} +{ +mul.f16x2 r9129, r9126, r8839; +} +{ +add.f16x2 r9132, r9123, r9129; 
+} +{ +sub.f16x2 r9135, r8842, r8843; +} +{ +mul.f16x2 r9138, r9135, r8837; +} +{ +sub.f16x2 r9141, r8848, r8849; +} +{ +mul.f16x2 r9144, r9141, r8840; +} +{ +add.f16x2 r9147, r9138, r9144; +} +{ +sub.f16x2 %31, r9132, r9147; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9153, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9154, {low, high}; +} +{ +neg.f16x2 r9155, r9154; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9158, {low, high}; +} +{ +neg.f16x2 r9159, r9158; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9161, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9162, {low, high}; +} +{ +add.f16x2 r9163, r9164, r9165; +} +{ +add.f16x2 r9166, r9167, r9163; +} +{ +add.f16x2 r9169, r9170, r9171; +} +{ +add.f16x2 %2, r9166, r9169; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 %3, r9178, r9181; +} +{ +add.f16x2 r9187, r9164, r9165; +} +{ +mul.f16x2 r9190, r9187, r9153; +} +{ +add.f16x2 r9193, r9167, r9190; +} +{ +add.f16x2 r9196, r9170, r9171; +} +{ +mul.f16x2 r9199, r9196, r9157; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +sub.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9155; +} +{ +sub.f16x2 r9211, r9182, r9183; +} +{ +mul.f16x2 r9214, r9211, r9159; +} +{ +add.f16x2 r9217, r9208, r9214; +} +{ +sub.f16x2 %12, r9202, r9217; +} +{ +add.f16x2 r9223, r9164, r9165; +} +{ +mul.f16x2 r9226, r9223, r9153; +} +{ +add.f16x2 r9229, r9167, r9226; +} +{ +add.f16x2 r9232, r9170, r9171; +} +{ +mul.f16x2 r9235, r9232, r9157; +} +{ +add.f16x2 r9238, r9229, r9235; +} +{ +sub.f16x2 r9241, r9176, r9177; +} +{ +mul.f16x2 
r9244, r9241, r9155; +} +{ +sub.f16x2 r9247, r9182, r9183; +} +{ +mul.f16x2 r9250, r9247, r9159; +} +{ +add.f16x2 r9253, r9244, r9250; +} +{ +add.f16x2 %42, r9238, r9253; +} +{ +add.f16x2 r9259, r9164, r9165; +} +{ +mul.f16x2 r9262, r9259, r9157; +} +{ +add.f16x2 r9265, r9167, r9262; +} +{ +add.f16x2 r9268, r9170, r9171; +} +{ +mul.f16x2 r9271, r9268, r9161; +} +{ +add.f16x2 r9274, r9265, r9271; +} +{ +sub.f16x2 r9277, r9176, r9177; +} +{ +mul.f16x2 r9280, r9277, r9159; +} +{ +sub.f16x2 r9283, r9182, r9183; +} +{ +mul.f16x2 r9286, r9283, r9162; +} +{ +add.f16x2 r9289, r9280, r9286; +} +{ +sub.f16x2 %22, r9274, r9289; +} +{ +add.f16x2 r9295, r9164, r9165; +} +{ +mul.f16x2 r9298, r9295, r9157; +} +{ +add.f16x2 r9301, r9167, r9298; +} +{ +add.f16x2 r9304, r9170, r9171; +} +{ +mul.f16x2 r9307, r9304, r9161; +} +{ +add.f16x2 r9310, r9301, r9307; +} +{ +sub.f16x2 r9313, r9176, r9177; +} +{ +mul.f16x2 r9316, r9313, r9159; +} +{ +sub.f16x2 r9319, r9182, r9183; +} +{ +mul.f16x2 r9322, r9319, r9162; +} +{ +add.f16x2 r9325, r9316, r9322; +} +{ +add.f16x2 %32, r9310, r9325; +} +{ +add.f16x2 r9331, r9176, r9177; +} +{ +mul.f16x2 r9334, r9331, r9153; +} +{ +add.f16x2 r9337, r9179, r9334; +} +{ +add.f16x2 r9340, r9182, r9183; +} +{ +mul.f16x2 r9343, r9340, r9157; +} +{ +add.f16x2 r9346, r9337, r9343; +} +{ +sub.f16x2 r9349, r9164, r9165; +} +{ +mul.f16x2 r9352, r9349, r9155; +} +{ +sub.f16x2 r9355, r9170, r9171; +} +{ +mul.f16x2 r9358, r9355, r9159; +} +{ +add.f16x2 r9361, r9352, r9358; +} +{ +add.f16x2 %13, r9346, r9361; +} +{ +add.f16x2 r9367, r9176, r9177; +} +{ +mul.f16x2 r9370, r9367, r9153; +} +{ +add.f16x2 r9373, r9179, r9370; +} +{ +add.f16x2 r9376, r9182, r9183; +} +{ +mul.f16x2 r9379, r9376, r9157; +} +{ +add.f16x2 r9382, r9373, r9379; +} +{ +sub.f16x2 r9385, r9164, r9165; +} +{ +mul.f16x2 r9388, r9385, r9155; +} +{ +sub.f16x2 r9391, r9170, r9171; +} +{ +mul.f16x2 r9394, r9391, r9159; +} +{ +add.f16x2 r9397, r9388, r9394; +} +{ +sub.f16x2 %43, r9382, r9397; +} +{ 
+add.f16x2 r9403, r9176, r9177; +} +{ +mul.f16x2 r9406, r9403, r9157; +} +{ +add.f16x2 r9409, r9179, r9406; +} +{ +add.f16x2 r9412, r9182, r9183; +} +{ +mul.f16x2 r9415, r9412, r9161; +} +{ +add.f16x2 r9418, r9409, r9415; +} +{ +sub.f16x2 r9421, r9164, r9165; +} +{ +mul.f16x2 r9424, r9421, r9159; +} +{ +sub.f16x2 r9427, r9170, r9171; +} +{ +mul.f16x2 r9430, r9427, r9162; +} +{ +add.f16x2 r9433, r9424, r9430; +} +{ +add.f16x2 %23, r9418, r9433; +} +{ +add.f16x2 r9439, r9176, r9177; +} +{ +mul.f16x2 r9442, r9439, r9157; +} +{ +add.f16x2 r9445, r9179, r9442; +} +{ +add.f16x2 r9448, r9182, r9183; +} +{ +mul.f16x2 r9451, r9448, r9161; +} +{ +add.f16x2 r9454, r9445, r9451; +} +{ +sub.f16x2 r9457, r9164, r9165; +} +{ +mul.f16x2 r9460, r9457, r9159; +} +{ +sub.f16x2 r9463, r9170, r9171; +} +{ +mul.f16x2 r9466, r9463, r9162; +} +{ +add.f16x2 r9469, r9460, r9466; +} +{ +sub.f16x2 %33, r9454, r9469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9476, {low, high}; +} +{ +neg.f16x2 r9477, r9476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9480, {low, high}; +} +{ +neg.f16x2 r9481, r9480; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9484, {low, high}; +} +{ +add.f16x2 r9485, r9486, r9487; +} +{ +add.f16x2 r9488, r9489, r9485; +} +{ +add.f16x2 r9491, r9492, r9493; +} +{ +add.f16x2 %4, r9488, r9491; +} +{ +add.f16x2 r9497, r9498, r9499; +} +{ +add.f16x2 r9500, r9501, r9497; +} +{ +add.f16x2 r9503, r9504, r9505; +} +{ +add.f16x2 %5, r9500, r9503; +} +{ +add.f16x2 r9509, r9486, r9487; +} +{ +mul.f16x2 r9512, 
r9509, r9475; +} +{ +add.f16x2 r9515, r9489, r9512; +} +{ +add.f16x2 r9518, r9492, r9493; +} +{ +mul.f16x2 r9521, r9518, r9479; +} +{ +add.f16x2 r9524, r9515, r9521; +} +{ +sub.f16x2 r9527, r9498, r9499; +} +{ +mul.f16x2 r9530, r9527, r9477; +} +{ +sub.f16x2 r9533, r9504, r9505; +} +{ +mul.f16x2 r9536, r9533, r9481; +} +{ +add.f16x2 r9539, r9530, r9536; +} +{ +sub.f16x2 %14, r9524, r9539; +} +{ +add.f16x2 r9545, r9486, r9487; +} +{ +mul.f16x2 r9548, r9545, r9475; +} +{ +add.f16x2 r9551, r9489, r9548; +} +{ +add.f16x2 r9554, r9492, r9493; +} +{ +mul.f16x2 r9557, r9554, r9479; +} +{ +add.f16x2 r9560, r9551, r9557; +} +{ +sub.f16x2 r9563, r9498, r9499; +} +{ +mul.f16x2 r9566, r9563, r9477; +} +{ +sub.f16x2 r9569, r9504, r9505; +} +{ +mul.f16x2 r9572, r9569, r9481; +} +{ +add.f16x2 r9575, r9566, r9572; +} +{ +add.f16x2 %44, r9560, r9575; +} +{ +add.f16x2 r9581, r9486, r9487; +} +{ +mul.f16x2 r9584, r9581, r9479; +} +{ +add.f16x2 r9587, r9489, r9584; +} +{ +add.f16x2 r9590, r9492, r9493; +} +{ +mul.f16x2 r9593, r9590, r9483; +} +{ +add.f16x2 r9596, r9587, r9593; +} +{ +sub.f16x2 r9599, r9498, r9499; +} +{ +mul.f16x2 r9602, r9599, r9481; +} +{ +sub.f16x2 r9605, r9504, r9505; +} +{ +mul.f16x2 r9608, r9605, r9484; +} +{ +add.f16x2 r9611, r9602, r9608; +} +{ +sub.f16x2 %24, r9596, r9611; +} +{ +add.f16x2 r9617, r9486, r9487; +} +{ +mul.f16x2 r9620, r9617, r9479; +} +{ +add.f16x2 r9623, r9489, r9620; +} +{ +add.f16x2 r9626, r9492, r9493; +} +{ +mul.f16x2 r9629, r9626, r9483; +} +{ +add.f16x2 r9632, r9623, r9629; +} +{ +sub.f16x2 r9635, r9498, r9499; +} +{ +mul.f16x2 r9638, r9635, r9481; +} +{ +sub.f16x2 r9641, r9504, r9505; +} +{ +mul.f16x2 r9644, r9641, r9484; +} +{ +add.f16x2 r9647, r9638, r9644; +} +{ +add.f16x2 %34, r9632, r9647; +} +{ +add.f16x2 r9653, r9498, r9499; +} +{ +mul.f16x2 r9656, r9653, r9475; +} +{ +add.f16x2 r9659, r9501, r9656; +} +{ +add.f16x2 r9662, r9504, r9505; +} +{ +mul.f16x2 r9665, r9662, r9479; +} +{ +add.f16x2 r9668, r9659, r9665; +} +{ +sub.f16x2 
r9671, r9486, r9487; +} +{ +mul.f16x2 r9674, r9671, r9477; +} +{ +sub.f16x2 r9677, r9492, r9493; +} +{ +mul.f16x2 r9680, r9677, r9481; +} +{ +add.f16x2 r9683, r9674, r9680; +} +{ +add.f16x2 %15, r9668, r9683; +} +{ +add.f16x2 r9689, r9498, r9499; +} +{ +mul.f16x2 r9692, r9689, r9475; +} +{ +add.f16x2 r9695, r9501, r9692; +} +{ +add.f16x2 r9698, r9504, r9505; +} +{ +mul.f16x2 r9701, r9698, r9479; +} +{ +add.f16x2 r9704, r9695, r9701; +} +{ +sub.f16x2 r9707, r9486, r9487; +} +{ +mul.f16x2 r9710, r9707, r9477; +} +{ +sub.f16x2 r9713, r9492, r9493; +} +{ +mul.f16x2 r9716, r9713, r9481; +} +{ +add.f16x2 r9719, r9710, r9716; +} +{ +sub.f16x2 %45, r9704, r9719; +} +{ +add.f16x2 r9725, r9498, r9499; +} +{ +mul.f16x2 r9728, r9725, r9479; +} +{ +add.f16x2 r9731, r9501, r9728; +} +{ +add.f16x2 r9734, r9504, r9505; +} +{ +mul.f16x2 r9737, r9734, r9483; +} +{ +add.f16x2 r9740, r9731, r9737; +} +{ +sub.f16x2 r9743, r9486, r9487; +} +{ +mul.f16x2 r9746, r9743, r9481; +} +{ +sub.f16x2 r9749, r9492, r9493; +} +{ +mul.f16x2 r9752, r9749, r9484; +} +{ +add.f16x2 r9755, r9746, r9752; +} +{ +add.f16x2 %25, r9740, r9755; +} +{ +add.f16x2 r9761, r9498, r9499; +} +{ +mul.f16x2 r9764, r9761, r9479; +} +{ +add.f16x2 r9767, r9501, r9764; +} +{ +add.f16x2 r9770, r9504, r9505; +} +{ +mul.f16x2 r9773, r9770, r9483; +} +{ +add.f16x2 r9776, r9767, r9773; +} +{ +sub.f16x2 r9779, r9486, r9487; +} +{ +mul.f16x2 r9782, r9779, r9481; +} +{ +sub.f16x2 r9785, r9492, r9493; +} +{ +mul.f16x2 r9788, r9785, r9484; +} +{ +add.f16x2 r9791, r9782, r9788; +} +{ +sub.f16x2 %35, r9776, r9791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9797, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9798, {low, high}; +} +{ +neg.f16x2 r9799, r9798; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9801, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; 
+cvt.rn.f16.f32 high, f592; +mov.b32 r9802, {low, high}; +} +{ +neg.f16x2 r9803, r9802; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9805, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9806, {low, high}; +} +{ +add.f16x2 r9807, r9808, r9809; +} +{ +add.f16x2 r9810, r9811, r9807; +} +{ +add.f16x2 r9813, r9814, r9815; +} +{ +add.f16x2 %6, r9810, r9813; +} +{ +add.f16x2 r9819, r9820, r9821; +} +{ +add.f16x2 r9822, r9823, r9819; +} +{ +add.f16x2 r9825, r9826, r9827; +} +{ +add.f16x2 %7, r9822, r9825; +} +{ +add.f16x2 r9831, r9808, r9809; +} +{ +mul.f16x2 r9834, r9831, r9797; +} +{ +add.f16x2 r9837, r9811, r9834; +} +{ +add.f16x2 r9840, r9814, r9815; +} +{ +mul.f16x2 r9843, r9840, r9801; +} +{ +add.f16x2 r9846, r9837, r9843; +} +{ +sub.f16x2 r9849, r9820, r9821; +} +{ +mul.f16x2 r9852, r9849, r9799; +} +{ +sub.f16x2 r9855, r9826, r9827; +} +{ +mul.f16x2 r9858, r9855, r9803; +} +{ +add.f16x2 r9861, r9852, r9858; +} +{ +sub.f16x2 %16, r9846, r9861; +} +{ +add.f16x2 r9867, r9808, r9809; +} +{ +mul.f16x2 r9870, r9867, r9797; +} +{ +add.f16x2 r9873, r9811, r9870; +} +{ +add.f16x2 r9876, r9814, r9815; +} +{ +mul.f16x2 r9879, r9876, r9801; +} +{ +add.f16x2 r9882, r9873, r9879; +} +{ +sub.f16x2 r9885, r9820, r9821; +} +{ +mul.f16x2 r9888, r9885, r9799; +} +{ +sub.f16x2 r9891, r9826, r9827; +} +{ +mul.f16x2 r9894, r9891, r9803; +} +{ +add.f16x2 r9897, r9888, r9894; +} +{ +add.f16x2 %46, r9882, r9897; +} +{ +add.f16x2 r9903, r9808, r9809; +} +{ +mul.f16x2 r9906, r9903, r9801; +} +{ +add.f16x2 r9909, r9811, r9906; +} +{ +add.f16x2 r9912, r9814, r9815; +} +{ +mul.f16x2 r9915, r9912, r9805; +} +{ +add.f16x2 r9918, r9909, r9915; +} +{ +sub.f16x2 r9921, r9820, r9821; +} +{ +mul.f16x2 r9924, r9921, r9803; +} +{ +sub.f16x2 r9927, r9826, r9827; +} +{ +mul.f16x2 r9930, r9927, r9806; +} +{ +add.f16x2 r9933, r9924, r9930; +} +{ +sub.f16x2 %26, r9918, r9933; +} +{ +add.f16x2 r9939, 
r9808, r9809; +} +{ +mul.f16x2 r9942, r9939, r9801; +} +{ +add.f16x2 r9945, r9811, r9942; +} +{ +add.f16x2 r9948, r9814, r9815; +} +{ +mul.f16x2 r9951, r9948, r9805; +} +{ +add.f16x2 r9954, r9945, r9951; +} +{ +sub.f16x2 r9957, r9820, r9821; +} +{ +mul.f16x2 r9960, r9957, r9803; +} +{ +sub.f16x2 r9963, r9826, r9827; +} +{ +mul.f16x2 r9966, r9963, r9806; +} +{ +add.f16x2 r9969, r9960, r9966; +} +{ +add.f16x2 %36, r9954, r9969; +} +{ +add.f16x2 r9975, r9820, r9821; +} +{ +mul.f16x2 r9978, r9975, r9797; +} +{ +add.f16x2 r9981, r9823, r9978; +} +{ +add.f16x2 r9984, r9826, r9827; +} +{ +mul.f16x2 r9987, r9984, r9801; +} +{ +add.f16x2 r9990, r9981, r9987; +} +{ +sub.f16x2 r9993, r9808, r9809; +} +{ +mul.f16x2 r9996, r9993, r9799; +} +{ +sub.f16x2 r9999, r9814, r9815; +} +{ +mul.f16x2 r10002, r9999, r9803; +} +{ +add.f16x2 r10005, r9996, r10002; +} +{ +add.f16x2 %17, r9990, r10005; +} +{ +add.f16x2 r10011, r9820, r9821; +} +{ +mul.f16x2 r10014, r10011, r9797; +} +{ +add.f16x2 r10017, r9823, r10014; +} +{ +add.f16x2 r10020, r9826, r9827; +} +{ +mul.f16x2 r10023, r10020, r9801; +} +{ +add.f16x2 r10026, r10017, r10023; +} +{ +sub.f16x2 r10029, r9808, r9809; +} +{ +mul.f16x2 r10032, r10029, r9799; +} +{ +sub.f16x2 r10035, r9814, r9815; +} +{ +mul.f16x2 r10038, r10035, r9803; +} +{ +add.f16x2 r10041, r10032, r10038; +} +{ +sub.f16x2 %47, r10026, r10041; +} +{ +add.f16x2 r10047, r9820, r9821; +} +{ +mul.f16x2 r10050, r10047, r9801; +} +{ +add.f16x2 r10053, r9823, r10050; +} +{ +add.f16x2 r10056, r9826, r9827; +} +{ +mul.f16x2 r10059, r10056, r9805; +} +{ +add.f16x2 r10062, r10053, r10059; +} +{ +sub.f16x2 r10065, r9808, r9809; +} +{ +mul.f16x2 r10068, r10065, r9803; +} +{ +sub.f16x2 r10071, r9814, r9815; +} +{ +mul.f16x2 r10074, r10071, r9806; +} +{ +add.f16x2 r10077, r10068, r10074; +} +{ +add.f16x2 %27, r10062, r10077; +} +{ +add.f16x2 r10083, r9820, r9821; +} +{ +mul.f16x2 r10086, r10083, r9801; +} +{ +add.f16x2 r10089, r9823, r10086; +} +{ +add.f16x2 r10092, r9826, r9827; 
+} +{ +mul.f16x2 r10095, r10092, r9805; +} +{ +add.f16x2 r10098, r10089, r10095; +} +{ +sub.f16x2 r10101, r9808, r9809; +} +{ +mul.f16x2 r10104, r10101, r9803; +} +{ +sub.f16x2 r10107, r9814, r9815; +} +{ +mul.f16x2 r10110, r10107, r9806; +} +{ +add.f16x2 r10113, r10104, r10110; +} +{ +sub.f16x2 %37, r10098, r10113; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10120, {low, high}; +} +{ +neg.f16x2 r10121, r10120; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r10123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r10124, {low, high}; +} +{ +neg.f16x2 r10125, r10124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10128, {low, high}; +} +{ +add.f16x2 r10129, r10130, r10131; +} +{ +add.f16x2 r10132, r10133, r10129; +} +{ +add.f16x2 r10135, r10136, r10137; +} +{ +add.f16x2 %8, r10132, r10135; +} +{ +add.f16x2 r10141, r10142, r10143; +} +{ +add.f16x2 r10144, r10145, r10141; +} +{ +add.f16x2 r10147, r10148, r10149; +} +{ +add.f16x2 %9, r10144, r10147; +} +{ +add.f16x2 r10153, r10130, r10131; +} +{ +mul.f16x2 r10156, r10153, r10119; +} +{ +add.f16x2 r10159, r10133, r10156; +} +{ +add.f16x2 r10162, r10136, r10137; +} +{ +mul.f16x2 r10165, r10162, r10123; +} +{ +add.f16x2 r10168, r10159, r10165; +} +{ +sub.f16x2 r10171, r10142, r10143; +} +{ +mul.f16x2 r10174, r10171, r10121; +} +{ +sub.f16x2 r10177, r10148, r10149; +} +{ +mul.f16x2 r10180, r10177, r10125; +} +{ +add.f16x2 r10183, r10174, r10180; +} +{ +sub.f16x2 %18, r10168, r10183; +} +{ +add.f16x2 r10189, r10130, r10131; +} +{ +mul.f16x2 r10192, r10189, r10119; +} +{ +add.f16x2 r10195, r10133, r10192; +} +{ 
+add.f16x2 r10198, r10136, r10137; +} +{ +mul.f16x2 r10201, r10198, r10123; +} +{ +add.f16x2 r10204, r10195, r10201; +} +{ +sub.f16x2 r10207, r10142, r10143; +} +{ +mul.f16x2 r10210, r10207, r10121; +} +{ +sub.f16x2 r10213, r10148, r10149; +} +{ +mul.f16x2 r10216, r10213, r10125; +} +{ +add.f16x2 r10219, r10210, r10216; +} +{ +add.f16x2 %48, r10204, r10219; +} +{ +add.f16x2 r10225, r10130, r10131; +} +{ +mul.f16x2 r10228, r10225, r10123; +} +{ +add.f16x2 r10231, r10133, r10228; +} +{ +add.f16x2 r10234, r10136, r10137; +} +{ +mul.f16x2 r10237, r10234, r10127; +} +{ +add.f16x2 r10240, r10231, r10237; +} +{ +sub.f16x2 r10243, r10142, r10143; +} +{ +mul.f16x2 r10246, r10243, r10125; +} +{ +sub.f16x2 r10249, r10148, r10149; +} +{ +mul.f16x2 r10252, r10249, r10128; +} +{ +add.f16x2 r10255, r10246, r10252; +} +{ +sub.f16x2 %28, r10240, r10255; +} +{ +add.f16x2 r10261, r10130, r10131; +} +{ +mul.f16x2 r10264, r10261, r10123; +} +{ +add.f16x2 r10267, r10133, r10264; +} +{ +add.f16x2 r10270, r10136, r10137; +} +{ +mul.f16x2 r10273, r10270, r10127; +} +{ +add.f16x2 r10276, r10267, r10273; +} +{ +sub.f16x2 r10279, r10142, r10143; +} +{ +mul.f16x2 r10282, r10279, r10125; +} +{ +sub.f16x2 r10285, r10148, r10149; +} +{ +mul.f16x2 r10288, r10285, r10128; +} +{ +add.f16x2 r10291, r10282, r10288; +} +{ +add.f16x2 %38, r10276, r10291; +} +{ +add.f16x2 r10297, r10142, r10143; +} +{ +mul.f16x2 r10300, r10297, r10119; +} +{ +add.f16x2 r10303, r10145, r10300; +} +{ +add.f16x2 r10306, r10148, r10149; +} +{ +mul.f16x2 r10309, r10306, r10123; +} +{ +add.f16x2 r10312, r10303, r10309; +} +{ +sub.f16x2 r10315, r10130, r10131; +} +{ +mul.f16x2 r10318, r10315, r10121; +} +{ +sub.f16x2 r10321, r10136, r10137; +} +{ +mul.f16x2 r10324, r10321, r10125; +} +{ +add.f16x2 r10327, r10318, r10324; +} +{ +add.f16x2 %19, r10312, r10327; +} +{ +add.f16x2 r10333, r10142, r10143; +} +{ +mul.f16x2 r10336, r10333, r10119; +} +{ +add.f16x2 r10339, r10145, r10336; +} +{ +add.f16x2 r10342, r10148, r10149; +} +{ 
+mul.f16x2 r10345, r10342, r10123; +} +{ +add.f16x2 r10348, r10339, r10345; +} +{ +sub.f16x2 r10351, r10130, r10131; +} +{ +mul.f16x2 r10354, r10351, r10121; +} +{ +sub.f16x2 r10357, r10136, r10137; +} +{ +mul.f16x2 r10360, r10357, r10125; +} +{ +add.f16x2 r10363, r10354, r10360; +} +{ +sub.f16x2 %49, r10348, r10363; +} +{ +add.f16x2 r10369, r10142, r10143; +} +{ +mul.f16x2 r10372, r10369, r10123; +} +{ +add.f16x2 r10375, r10145, r10372; +} +{ +add.f16x2 r10378, r10148, r10149; +} +{ +mul.f16x2 r10381, r10378, r10127; +} +{ +add.f16x2 r10384, r10375, r10381; +} +{ +sub.f16x2 r10387, r10130, r10131; +} +{ +mul.f16x2 r10390, r10387, r10125; +} +{ +sub.f16x2 r10393, r10136, r10137; +} +{ +mul.f16x2 r10396, r10393, r10128; +} +{ +add.f16x2 r10399, r10390, r10396; +} +{ +add.f16x2 %29, r10384, r10399; +} +{ +add.f16x2 r10405, r10142, r10143; +} +{ +mul.f16x2 r10408, r10405, r10123; +} +{ +add.f16x2 r10411, r10145, r10408; +} +{ +add.f16x2 r10414, r10148, r10149; +} +{ +mul.f16x2 r10417, r10414, r10127; +} +{ +add.f16x2 r10420, r10411, r10417; +} +{ +sub.f16x2 r10423, r10130, r10131; +} +{ +mul.f16x2 r10426, r10423, r10125; +} +{ +sub.f16x2 r10429, r10136, r10137; +} +{ +mul.f16x2 r10432, r10429, r10128; +} +{ +add.f16x2 r10435, r10426, r10432; +} +{ +sub.f16x2 %39, r10420, r10435; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), 
"=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), 
"r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1114, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<603>; +.reg .b32 r<10508>; +.reg .b64 rd<6>; +mov.u32 r10442, %50; +mov.u32 r10507, %tid.y; +mad.lo.s32 r10443, r10507, 12500, r10442; +mov.u32 r10444, %tid.x; +mov.f32 f594, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1, {low, high}; +} +mov.f32 f596, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f590, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5, {low, high}; +} +mov.f32 f592, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %54; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %94, %79; +} +{ +add.f16x2 r26, %51, r23; +} +{ +add.f16x2 r29, %57, %92; +} +{ +add.f16x2 r32, r26, r29; +} +{ 
+add.f16x2 r35, %67, %54; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %94, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %92; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %67, %54; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %94, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %92; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %54; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %94, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %92; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %67, %54; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %94, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %92; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %94, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %92; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %54; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %94, 
%79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %92; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %54; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %94, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %92; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %54; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %94, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %92; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %54; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, 
%55, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %52; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %55, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %52; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %55, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %52; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %55, %87; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %52; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %55, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %52; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %52; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ 
+add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %55, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %52; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %55, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %52; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %55, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %52; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %55, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, %88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %53, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %53, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ +mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, r647; +} +{ +sub.f16x2 r739, %53, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ +add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %53, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ +sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %53, 
%86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %53, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ +mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %53, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %53, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ +sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %53, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, 
r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %91, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %91, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; +} +{ +add.f16x2 r1043, %91, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %91, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; +} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 
r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %91, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ +sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ +mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ +add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 
r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %93; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, %84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %58, %93; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %93; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} +{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ 
+sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %93; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ +sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %93; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ +mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, %93; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %93; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, %70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, 
r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %93; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; +} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %93; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1611, {low, high}; +} +mov.f32 f332, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1612, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1613, {low, high}; +} +mov.f32 f336, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1614, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1615, {low, high}; +} +mov.f32 f340, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1616, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1617, {low, high}; +} +mov.f32 f344, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1618, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; 
+mov.b32 r1621, {low, high}; +} +mov.f32 f352, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1622, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1625, {low, high}; +} +mov.f32 f360, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1626, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1627, {low, high}; +} +mov.f32 f364, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1628, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1633, {low, high}; +} +mov.f32 f376, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1641, {low, high}; +} +mov.f32 f392, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ +mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ +sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} 
+{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ +mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 
r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 
r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ +add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ +mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ +sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; 
+} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ +mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; +} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, 
r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ +mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} +{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, 
r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ +sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2560, {low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ +add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; +} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, 
r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ +add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ 
+mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ +add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; +} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 
r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; +} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ +sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ 
+mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ +sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3204, {low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; +} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 
r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; +} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ +add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; +} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ 
+mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ +add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r10444, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r10445, rd3; +mul.lo.s32 r10446, r10445, 125; +sub.s32 r10447, r10444, r10446; +mad.lo.s32 r10448, r10445, 12500, r10443; +cvt.rn.f32.u32 f597, 
r10447; +mul.f32 f598, f597, 0f3B03C498; +cos.approx.f32 f217, f598; +sin.approx.f32 f599, f598; +neg.f32 f218, f599; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ +mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f533, 0fBF800000; +mov.f32 f534, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; +} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, 
r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, 
r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ +mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ +mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ +fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; 
+mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, {high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4107, {low, high}; +} +{ 
+mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; 
+mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ +fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 
r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +barrier.sync 0; +mad.lo.s32 r10449, r10447, 100, r10448; +st.shared.u32 [r10449], r1934; +st.shared.u32 [r10449+4], r3535; +st.shared.u32 [r10449+8], r3572; +st.shared.u32 [r10449+12], r3609; +st.shared.u32 [r10449+16], r3646; +st.shared.u32 [r10449+20], r3683; +st.shared.u32 [r10449+24], r3720; +st.shared.u32 [r10449+28], r3757; +st.shared.u32 [r10449+32], r3794; +st.shared.u32 [r10449+36], r3831; +st.shared.u32 [r10449+40], r3868; +st.shared.u32 [r10449+44], r3905; +st.shared.u32 [r10449+48], r3942; +st.shared.u32 [r10449+52], r3979; +st.shared.u32 [r10449+56], r4016; +st.shared.u32 [r10449+60], r4053; +st.shared.u32 [r10449+64], r4090; +st.shared.u32 [r10449+68], r4127; +st.shared.u32 [r10449+72], r4164; +st.shared.u32 [r10449+76], r4201; +st.shared.u32 [r10449+80], r4238; +st.shared.u32 [r10449+84], r4275; +st.shared.u32 [r10449+88], r4312; +st.shared.u32 [r10449+92], r4349; +st.shared.u32 [r10449+96], r4386; +barrier.sync 0; +mad.lo.s32 r10450, r10447, -96, r10449; +ld.shared.u32 r4430, [r10450]; +ld.shared.u32 r4752, [r10450+500]; +ld.shared.u32 r5074, [r10450+1000]; +ld.shared.u32 r5396, [r10450+1500]; +ld.shared.u32 r5718, [r10450+2000]; +ld.shared.u32 r4427, [r10450+2500]; +ld.shared.u32 r4749, [r10450+3000]; +ld.shared.u32 r5071, [r10450+3500]; +ld.shared.u32 r5393, [r10450+4000]; +ld.shared.u32 
r5715, [r10450+4500]; +ld.shared.u32 r4433, [r10450+5000]; +ld.shared.u32 r4755, [r10450+5500]; +ld.shared.u32 r5077, [r10450+6000]; +ld.shared.u32 r5399, [r10450+6500]; +ld.shared.u32 r5721, [r10450+7000]; +ld.shared.u32 r4434, [r10450+7500]; +ld.shared.u32 r4756, [r10450+8000]; +ld.shared.u32 r5078, [r10450+8500]; +ld.shared.u32 r5400, [r10450+9000]; +ld.shared.u32 r5722, [r10450+9500]; +ld.shared.u32 r4428, [r10450+10000]; +ld.shared.u32 r4750, [r10450+10500]; +ld.shared.u32 r5072, [r10450+11000]; +ld.shared.u32 r5394, [r10450+11500]; +ld.shared.u32 r5716, [r10450+12000]; +barrier.sync 0; +st.shared.u32 [r10449], r1946; +st.shared.u32 [r10449+4], r3544; +st.shared.u32 [r10449+8], r3581; +st.shared.u32 [r10449+12], r3618; +st.shared.u32 [r10449+16], r3655; +st.shared.u32 [r10449+20], r3692; +st.shared.u32 [r10449+24], r3729; +st.shared.u32 [r10449+28], r3766; +st.shared.u32 [r10449+32], r3803; +st.shared.u32 [r10449+36], r3840; +st.shared.u32 [r10449+40], r3877; +st.shared.u32 [r10449+44], r3914; +st.shared.u32 [r10449+48], r3951; +st.shared.u32 [r10449+52], r3988; +st.shared.u32 [r10449+56], r4025; +st.shared.u32 [r10449+60], r4062; +st.shared.u32 [r10449+64], r4099; +st.shared.u32 [r10449+68], r4136; +st.shared.u32 [r10449+72], r4173; +st.shared.u32 [r10449+76], r4210; +st.shared.u32 [r10449+80], r4247; +st.shared.u32 [r10449+84], r4284; +st.shared.u32 [r10449+88], r4321; +st.shared.u32 [r10449+92], r4358; +st.shared.u32 [r10449+96], r4395; +barrier.sync 0; +ld.shared.u32 r4442, [r10450]; +ld.shared.u32 r4764, [r10450+500]; +ld.shared.u32 r5086, [r10450+1000]; +ld.shared.u32 r5408, [r10450+1500]; +ld.shared.u32 r5730, [r10450+2000]; +ld.shared.u32 r4439, [r10450+2500]; +ld.shared.u32 r4761, [r10450+3000]; +ld.shared.u32 r5083, [r10450+3500]; +ld.shared.u32 r5405, [r10450+4000]; +ld.shared.u32 r5727, [r10450+4500]; +ld.shared.u32 r4445, [r10450+5000]; +ld.shared.u32 r4767, [r10450+5500]; +ld.shared.u32 r5089, [r10450+6000]; +ld.shared.u32 r5411, [r10450+6500]; 
+ld.shared.u32 r5733, [r10450+7000]; +ld.shared.u32 r4446, [r10450+7500]; +ld.shared.u32 r4768, [r10450+8000]; +ld.shared.u32 r5090, [r10450+8500]; +ld.shared.u32 r5412, [r10450+9000]; +ld.shared.u32 r5734, [r10450+9500]; +ld.shared.u32 r4440, [r10450+10000]; +ld.shared.u32 r4762, [r10450+10500]; +ld.shared.u32 r5084, [r10450+11000]; +ld.shared.u32 r5406, [r10450+11500]; +ld.shared.u32 r5728, [r10450+12000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 r4435, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 r4447, r4441, r4444; +} +{ +add.f16x2 r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, r4456, r4462; +} +{ +sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 r4483, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 
r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 r4519, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 r4555, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; +} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 r4591, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ +add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ +add.f16x2 r4624, r4615, r4621; +} +{ +add.f16x2 r4627, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} 
+{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 r4663, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 r4699, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, r4723, r4729; +} +{ +sub.f16x2 r4735, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r4742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 r4757, r4751, r4754; +} +{ 
+add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 r4769, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 r4805, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 r4841, r4823, r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 r4877, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; +} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 r4913, r4895, r4910; +} +{ +add.f16x2 r4916, 
r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ +add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 r4949, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 r4985, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 r5021, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ +mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ +add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 r5057, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; 
+mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 r5079, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 r5091, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 r5127, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 r5163, r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 
r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} +{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 r5199, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 r5235, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ +add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 r5271, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 r5304, r5295, r5301; +} +{ +sub.f16x2 r5307, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} 
+{ +add.f16x2 r5343, r5325, r5340; +} +{ +add.f16x2 r5346, r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 r5379, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 r5401, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 r5413, r5407, r5410; +} +{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; +} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 r5449, r5431, r5446; +} +{ 
+add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 r5485, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 r5521, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; +} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 r5557, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ +sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, r5587; +} +{ +add.f16x2 r5593, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, 
r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 r5629, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 r5665, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 r5701, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r5708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r5709, {low, high}; +} +{ +neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, 
r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 r5723, r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 r5735, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 r5771, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 r5807, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ +sub.f16x2 r5843, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} +{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ +add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 
r5876, r5867, r5873; +} +{ +add.f16x2 r5879, r5861, r5876; +} +{ +add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 r5915, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 r5951, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 r5987, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; +} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; +} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 r6023, r6005, r6020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r6026, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r6027, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r6028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r6029, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6030, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6031, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6032, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6033, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6042, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6048, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6057, {low, high}; +} +{ +mul.f16x2 r6074, r4805, r6026; +} +{ +mul.f16x2 r6077, r4949, r6027; +} +{ +sub.f16x2 
r6080, r6074, r6077; +} +{ +mul.f16x2 r6083, r4805, r6027; +} +{ +fma.rn.f16x2 r6086, r4949, r6026, r6083; +} +{ +mul.f16x2 r6090, r5127, r6028; +} +{ +mul.f16x2 r6093, r5271, r6029; +} +{ +sub.f16x2 r6096, r6090, r6093; +} +{ +mul.f16x2 r6099, r5127, r6029; +} +{ +fma.rn.f16x2 r6102, r5271, r6028, r6099; +} +{ +mul.f16x2 r6106, r5449, r6030; +} +{ +mul.f16x2 r6109, r5593, r6031; +} +{ +sub.f16x2 r6112, r6106, r6109; +} +{ +mul.f16x2 r6115, r5449, r6031; +} +{ +fma.rn.f16x2 r6118, r5593, r6030, r6115; +} +{ +mul.f16x2 r6122, r5771, r6032; +} +{ +mul.f16x2 r6125, r5915, r6033; +} +{ +sub.f16x2 r6128, r6122, r6125; +} +{ +mul.f16x2 r6131, r5771, r6033; +} +{ +fma.rn.f16x2 r6134, r5915, r6032, r6131; +} +{ +mul.f16x2 r6138, r4877, r6028; +} +{ +mul.f16x2 r6141, r5021, r6029; +} +{ +sub.f16x2 r6144, r6138, r6141; +} +{ +mul.f16x2 r6147, r4877, r6029; +} +{ +fma.rn.f16x2 r6150, r5021, r6028, r6147; +} +{ +mul.f16x2 r6154, r5199, r6032; +} +{ +mul.f16x2 r6157, r5343, r6033; +} +{ +sub.f16x2 r6160, r6154, r6157; +} +{ +mul.f16x2 r6163, r5199, r6033; +} +{ +fma.rn.f16x2 r6166, r5343, r6032, r6163; +} +{ +mul.f16x2 r6170, r5521, r6036; +} +{ +mul.f16x2 r6173, r5665, r6037; +} +{ +sub.f16x2 r6176, r6170, r6173; +} +{ +mul.f16x2 r6179, r5521, r6037; +} +{ +fma.rn.f16x2 r6182, r5665, r6036, r6179; +} +{ +mul.f16x2 r6186, r5843, r6040; +} +{ +mul.f16x2 r6189, r5987, r6041; +} +{ +sub.f16x2 r6192, r6186, r6189; +} +{ +mul.f16x2 r6195, r5843, r6041; +} +{ +fma.rn.f16x2 r6198, r5987, r6040, r6195; +} +{ +mul.f16x2 r6202, r4913, r6030; +} +{ +mul.f16x2 r6205, r5057, r6031; +} +{ +sub.f16x2 r6208, r6202, r6205; +} +{ +mul.f16x2 r6211, r4913, r6031; +} +{ +fma.rn.f16x2 r6214, r5057, r6030, r6211; +} +{ +mul.f16x2 r6218, r5235, r6036; +} +{ +mul.f16x2 r6221, r5379, r6037; +} +{ +sub.f16x2 r6224, r6218, r6221; +} +{ +mul.f16x2 r6227, r5235, r6037; +} +{ +fma.rn.f16x2 r6230, r5379, r6036, r6227; +} +{ +mul.f16x2 r6234, r5557, r6042; +} +{ +mul.f16x2 r6237, r5701, r6043; +} +{ +sub.f16x2 
r6240, r6234, r6237; +} +{ +mul.f16x2 r6243, r5557, r6043; +} +{ +fma.rn.f16x2 r6246, r5701, r6042, r6243; +} +{ +mul.f16x2 r6250, r5879, r6048; +} +{ +mul.f16x2 r6253, r6023, r6049; +} +{ +sub.f16x2 r6256, r6250, r6253; +} +{ +mul.f16x2 r6259, r5879, r6049; +} +{ +fma.rn.f16x2 r6262, r6023, r6048, r6259; +} +{ +mul.f16x2 r6266, r4841, r6032; +} +{ +mul.f16x2 r6269, r4985, r6033; +} +{ +sub.f16x2 r6272, r6266, r6269; +} +{ +mul.f16x2 r6275, r4841, r6033; +} +{ +fma.rn.f16x2 r6278, r4985, r6032, r6275; +} +{ +mul.f16x2 r6282, r5163, r6040; +} +{ +mul.f16x2 r6285, r5307, r6041; +} +{ +sub.f16x2 r6288, r6282, r6285; +} +{ +mul.f16x2 r6291, r5163, r6041; +} +{ +fma.rn.f16x2 r6294, r5307, r6040, r6291; +} +{ +mul.f16x2 r6298, r5485, r6048; +} +{ +mul.f16x2 r6301, r5629, r6049; +} +{ +sub.f16x2 r6304, r6298, r6301; +} +{ +mul.f16x2 r6307, r5485, r6049; +} +{ +fma.rn.f16x2 r6310, r5629, r6048, r6307; +} +{ +mul.f16x2 r6314, r5807, r6056; +} +{ +mul.f16x2 r6317, r5951, r6057; +} +{ +sub.f16x2 r6320, r6314, r6317; +} +{ +mul.f16x2 r6323, r5807, r6057; +} +{ +fma.rn.f16x2 r6326, r5951, r6056, r6323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6331, {low, high}; +} +{ +neg.f16x2 r6332, r6331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6335, {low, high}; +} +{ +neg.f16x2 r6336, r6335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6339, {low, high}; +} +{ +add.f16x2 r6340, r4757, r5723; +} +{ +add.f16x2 r6343, r4435, r6340; +} +{ +add.f16x2 r6346, r5079, r5401; +} +{ +add.f16x2 r6349, r6343, r6346; +} +{ 
+add.f16x2 r6352, r4769, r5735; +} +{ +add.f16x2 r6355, r4447, r6352; +} +{ +add.f16x2 r6358, r5091, r5413; +} +{ +add.f16x2 r6361, r6355, r6358; +} +{ +add.f16x2 r6364, r4757, r5723; +} +{ +mul.f16x2 r6367, r6364, r6330; +} +{ +add.f16x2 r6370, r4435, r6367; +} +{ +add.f16x2 r6373, r5079, r5401; +} +{ +mul.f16x2 r6376, r6373, r6334; +} +{ +add.f16x2 r6379, r6370, r6376; +} +{ +sub.f16x2 r6382, r4769, r5735; +} +{ +mul.f16x2 r6385, r6382, r6332; +} +{ +sub.f16x2 r6388, r5091, r5413; +} +{ +mul.f16x2 r6391, r6388, r6336; +} +{ +add.f16x2 r6394, r6385, r6391; +} +{ +sub.f16x2 r6397, r6379, r6394; +} +{ +add.f16x2 r6400, r4757, r5723; +} +{ +mul.f16x2 r6403, r6400, r6330; +} +{ +add.f16x2 r6406, r4435, r6403; +} +{ +add.f16x2 r6409, r5079, r5401; +} +{ +mul.f16x2 r6412, r6409, r6334; +} +{ +add.f16x2 r6415, r6406, r6412; +} +{ +sub.f16x2 r6418, r4769, r5735; +} +{ +mul.f16x2 r6421, r6418, r6332; +} +{ +sub.f16x2 r6424, r5091, r5413; +} +{ +mul.f16x2 r6427, r6424, r6336; +} +{ +add.f16x2 r6430, r6421, r6427; +} +{ +add.f16x2 r6433, r6415, r6430; +} +{ +add.f16x2 r6436, r4757, r5723; +} +{ +mul.f16x2 r6439, r6436, r6334; +} +{ +add.f16x2 r6442, r4435, r6439; +} +{ +add.f16x2 r6445, r5079, r5401; +} +{ +mul.f16x2 r6448, r6445, r6338; +} +{ +add.f16x2 r6451, r6442, r6448; +} +{ +sub.f16x2 r6454, r4769, r5735; +} +{ +mul.f16x2 r6457, r6454, r6336; +} +{ +sub.f16x2 r6460, r5091, r5413; +} +{ +mul.f16x2 r6463, r6460, r6339; +} +{ +add.f16x2 r6466, r6457, r6463; +} +{ +sub.f16x2 r6469, r6451, r6466; +} +{ +add.f16x2 r6472, r4757, r5723; +} +{ +mul.f16x2 r6475, r6472, r6334; +} +{ +add.f16x2 r6478, r4435, r6475; +} +{ +add.f16x2 r6481, r5079, r5401; +} +{ +mul.f16x2 r6484, r6481, r6338; +} +{ +add.f16x2 r6487, r6478, r6484; +} +{ +sub.f16x2 r6490, r4769, r5735; +} +{ +mul.f16x2 r6493, r6490, r6336; +} +{ +sub.f16x2 r6496, r5091, r5413; +} +{ +mul.f16x2 r6499, r6496, r6339; +} +{ +add.f16x2 r6502, r6493, r6499; +} +{ +add.f16x2 r6505, r6487, r6502; +} +{ +add.f16x2 r6508, 
r4769, r5735; +} +{ +mul.f16x2 r6511, r6508, r6330; +} +{ +add.f16x2 r6514, r4447, r6511; +} +{ +add.f16x2 r6517, r5091, r5413; +} +{ +mul.f16x2 r6520, r6517, r6334; +} +{ +add.f16x2 r6523, r6514, r6520; +} +{ +sub.f16x2 r6526, r4757, r5723; +} +{ +mul.f16x2 r6529, r6526, r6332; +} +{ +sub.f16x2 r6532, r5079, r5401; +} +{ +mul.f16x2 r6535, r6532, r6336; +} +{ +add.f16x2 r6538, r6529, r6535; +} +{ +add.f16x2 r6541, r6523, r6538; +} +{ +add.f16x2 r6544, r4769, r5735; +} +{ +mul.f16x2 r6547, r6544, r6330; +} +{ +add.f16x2 r6550, r4447, r6547; +} +{ +add.f16x2 r6553, r5091, r5413; +} +{ +mul.f16x2 r6556, r6553, r6334; +} +{ +add.f16x2 r6559, r6550, r6556; +} +{ +sub.f16x2 r6562, r4757, r5723; +} +{ +mul.f16x2 r6565, r6562, r6332; +} +{ +sub.f16x2 r6568, r5079, r5401; +} +{ +mul.f16x2 r6571, r6568, r6336; +} +{ +add.f16x2 r6574, r6565, r6571; +} +{ +sub.f16x2 r6577, r6559, r6574; +} +{ +add.f16x2 r6580, r4769, r5735; +} +{ +mul.f16x2 r6583, r6580, r6334; +} +{ +add.f16x2 r6586, r4447, r6583; +} +{ +add.f16x2 r6589, r5091, r5413; +} +{ +mul.f16x2 r6592, r6589, r6338; +} +{ +add.f16x2 r6595, r6586, r6592; +} +{ +sub.f16x2 r6598, r4757, r5723; +} +{ +mul.f16x2 r6601, r6598, r6336; +} +{ +sub.f16x2 r6604, r5079, r5401; +} +{ +mul.f16x2 r6607, r6604, r6339; +} +{ +add.f16x2 r6610, r6601, r6607; +} +{ +add.f16x2 r6613, r6595, r6610; +} +{ +add.f16x2 r6616, r4769, r5735; +} +{ +mul.f16x2 r6619, r6616, r6334; +} +{ +add.f16x2 r6622, r4447, r6619; +} +{ +add.f16x2 r6625, r5091, r5413; +} +{ +mul.f16x2 r6628, r6625, r6338; +} +{ +add.f16x2 r6631, r6622, r6628; +} +{ +sub.f16x2 r6634, r4757, r5723; +} +{ +mul.f16x2 r6637, r6634, r6336; +} +{ +sub.f16x2 r6640, r5079, r5401; +} +{ +mul.f16x2 r6643, r6640, r6339; +} +{ +add.f16x2 r6646, r6637, r6643; +} +{ +sub.f16x2 r6649, r6631, r6646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; 
+mov.b32 r6653, {low, high}; +} +{ +neg.f16x2 r6654, r6653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6657, {low, high}; +} +{ +neg.f16x2 r6658, r6657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6660, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6661, {low, high}; +} +{ +add.f16x2 r6662, r6080, r6128; +} +{ +add.f16x2 r6665, r4483, r6662; +} +{ +add.f16x2 r6668, r6096, r6112; +} +{ +add.f16x2 r6671, r6665, r6668; +} +{ +add.f16x2 r6674, r6086, r6134; +} +{ +add.f16x2 r6677, r4627, r6674; +} +{ +add.f16x2 r6680, r6102, r6118; +} +{ +add.f16x2 r6683, r6677, r6680; +} +{ +add.f16x2 r6686, r6080, r6128; +} +{ +mul.f16x2 r6689, r6686, r6652; +} +{ +add.f16x2 r6692, r4483, r6689; +} +{ +add.f16x2 r6695, r6096, r6112; +} +{ +mul.f16x2 r6698, r6695, r6656; +} +{ +add.f16x2 r6701, r6692, r6698; +} +{ +sub.f16x2 r6704, r6086, r6134; +} +{ +mul.f16x2 r6707, r6704, r6654; +} +{ +sub.f16x2 r6710, r6102, r6118; +} +{ +mul.f16x2 r6713, r6710, r6658; +} +{ +add.f16x2 r6716, r6707, r6713; +} +{ +sub.f16x2 r6719, r6701, r6716; +} +{ +add.f16x2 r6722, r6080, r6128; +} +{ +mul.f16x2 r6725, r6722, r6652; +} +{ +add.f16x2 r6728, r4483, r6725; +} +{ +add.f16x2 r6731, r6096, r6112; +} +{ +mul.f16x2 r6734, r6731, r6656; +} +{ +add.f16x2 r6737, r6728, r6734; +} +{ +sub.f16x2 r6740, r6086, r6134; +} +{ +mul.f16x2 r6743, r6740, r6654; +} +{ +sub.f16x2 r6746, r6102, r6118; +} +{ +mul.f16x2 r6749, r6746, r6658; +} +{ +add.f16x2 r6752, r6743, r6749; +} +{ +add.f16x2 r6755, r6737, r6752; +} +{ +add.f16x2 r6758, r6080, r6128; +} +{ +mul.f16x2 r6761, r6758, r6656; +} +{ +add.f16x2 r6764, r4483, r6761; +} +{ +add.f16x2 r6767, r6096, r6112; +} +{ +mul.f16x2 r6770, r6767, r6660; +} +{ +add.f16x2 r6773, r6764, r6770; +} +{ +sub.f16x2 
r6776, r6086, r6134; +} +{ +mul.f16x2 r6779, r6776, r6658; +} +{ +sub.f16x2 r6782, r6102, r6118; +} +{ +mul.f16x2 r6785, r6782, r6661; +} +{ +add.f16x2 r6788, r6779, r6785; +} +{ +sub.f16x2 r6791, r6773, r6788; +} +{ +add.f16x2 r6794, r6080, r6128; +} +{ +mul.f16x2 r6797, r6794, r6656; +} +{ +add.f16x2 r6800, r4483, r6797; +} +{ +add.f16x2 r6803, r6096, r6112; +} +{ +mul.f16x2 r6806, r6803, r6660; +} +{ +add.f16x2 r6809, r6800, r6806; +} +{ +sub.f16x2 r6812, r6086, r6134; +} +{ +mul.f16x2 r6815, r6812, r6658; +} +{ +sub.f16x2 r6818, r6102, r6118; +} +{ +mul.f16x2 r6821, r6818, r6661; +} +{ +add.f16x2 r6824, r6815, r6821; +} +{ +add.f16x2 r6827, r6809, r6824; +} +{ +add.f16x2 r6830, r6086, r6134; +} +{ +mul.f16x2 r6833, r6830, r6652; +} +{ +add.f16x2 r6836, r4627, r6833; +} +{ +add.f16x2 r6839, r6102, r6118; +} +{ +mul.f16x2 r6842, r6839, r6656; +} +{ +add.f16x2 r6845, r6836, r6842; +} +{ +sub.f16x2 r6848, r6080, r6128; +} +{ +mul.f16x2 r6851, r6848, r6654; +} +{ +sub.f16x2 r6854, r6096, r6112; +} +{ +mul.f16x2 r6857, r6854, r6658; +} +{ +add.f16x2 r6860, r6851, r6857; +} +{ +add.f16x2 r6863, r6845, r6860; +} +{ +add.f16x2 r6866, r6086, r6134; +} +{ +mul.f16x2 r6869, r6866, r6652; +} +{ +add.f16x2 r6872, r4627, r6869; +} +{ +add.f16x2 r6875, r6102, r6118; +} +{ +mul.f16x2 r6878, r6875, r6656; +} +{ +add.f16x2 r6881, r6872, r6878; +} +{ +sub.f16x2 r6884, r6080, r6128; +} +{ +mul.f16x2 r6887, r6884, r6654; +} +{ +sub.f16x2 r6890, r6096, r6112; +} +{ +mul.f16x2 r6893, r6890, r6658; +} +{ +add.f16x2 r6896, r6887, r6893; +} +{ +sub.f16x2 r6899, r6881, r6896; +} +{ +add.f16x2 r6902, r6086, r6134; +} +{ +mul.f16x2 r6905, r6902, r6656; +} +{ +add.f16x2 r6908, r4627, r6905; +} +{ +add.f16x2 r6911, r6102, r6118; +} +{ +mul.f16x2 r6914, r6911, r6660; +} +{ +add.f16x2 r6917, r6908, r6914; +} +{ +sub.f16x2 r6920, r6080, r6128; +} +{ +mul.f16x2 r6923, r6920, r6658; +} +{ +sub.f16x2 r6926, r6096, r6112; +} +{ +mul.f16x2 r6929, r6926, r6661; +} +{ +add.f16x2 r6932, r6923, r6929; +} 
+{ +add.f16x2 r6935, r6917, r6932; +} +{ +add.f16x2 r6938, r6086, r6134; +} +{ +mul.f16x2 r6941, r6938, r6656; +} +{ +add.f16x2 r6944, r4627, r6941; +} +{ +add.f16x2 r6947, r6102, r6118; +} +{ +mul.f16x2 r6950, r6947, r6660; +} +{ +add.f16x2 r6953, r6944, r6950; +} +{ +sub.f16x2 r6956, r6080, r6128; +} +{ +mul.f16x2 r6959, r6956, r6658; +} +{ +sub.f16x2 r6962, r6096, r6112; +} +{ +mul.f16x2 r6965, r6962, r6661; +} +{ +add.f16x2 r6968, r6959, r6965; +} +{ +sub.f16x2 r6971, r6953, r6968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6975, {low, high}; +} +{ +neg.f16x2 r6976, r6975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r6979, {low, high}; +} +{ +neg.f16x2 r6980, r6979; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r6982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r6983, {low, high}; +} +{ +add.f16x2 r6984, r6144, r6192; +} +{ +add.f16x2 r6987, r4555, r6984; +} +{ +add.f16x2 r6990, r6160, r6176; +} +{ +add.f16x2 r6993, r6987, r6990; +} +{ +add.f16x2 r6996, r6150, r6198; +} +{ +add.f16x2 r6999, r4699, r6996; +} +{ +add.f16x2 r7002, r6166, r6182; +} +{ +add.f16x2 r7005, r6999, r7002; +} +{ +add.f16x2 r7008, r6144, r6192; +} +{ +mul.f16x2 r7011, r7008, r6974; +} +{ +add.f16x2 r7014, r4555, r7011; +} +{ +add.f16x2 r7017, r6160, r6176; +} +{ +mul.f16x2 r7020, r7017, r6978; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6150, r6198; +} +{ +mul.f16x2 r7029, r7026, r6976; +} +{ +sub.f16x2 r7032, r6166, r6182; +} +{ +mul.f16x2 r7035, r7032, r6980; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +sub.f16x2 r7041, r7023, r7038; +} +{ 
+add.f16x2 r7044, r6144, r6192; +} +{ +mul.f16x2 r7047, r7044, r6974; +} +{ +add.f16x2 r7050, r4555, r7047; +} +{ +add.f16x2 r7053, r6160, r6176; +} +{ +mul.f16x2 r7056, r7053, r6978; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6150, r6198; +} +{ +mul.f16x2 r7065, r7062, r6976; +} +{ +sub.f16x2 r7068, r6166, r6182; +} +{ +mul.f16x2 r7071, r7068, r6980; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +add.f16x2 r7077, r7059, r7074; +} +{ +add.f16x2 r7080, r6144, r6192; +} +{ +mul.f16x2 r7083, r7080, r6978; +} +{ +add.f16x2 r7086, r4555, r7083; +} +{ +add.f16x2 r7089, r6160, r6176; +} +{ +mul.f16x2 r7092, r7089, r6982; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6150, r6198; +} +{ +mul.f16x2 r7101, r7098, r6980; +} +{ +sub.f16x2 r7104, r6166, r6182; +} +{ +mul.f16x2 r7107, r7104, r6983; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +sub.f16x2 r7113, r7095, r7110; +} +{ +add.f16x2 r7116, r6144, r6192; +} +{ +mul.f16x2 r7119, r7116, r6978; +} +{ +add.f16x2 r7122, r4555, r7119; +} +{ +add.f16x2 r7125, r6160, r6176; +} +{ +mul.f16x2 r7128, r7125, r6982; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6150, r6198; +} +{ +mul.f16x2 r7137, r7134, r6980; +} +{ +sub.f16x2 r7140, r6166, r6182; +} +{ +mul.f16x2 r7143, r7140, r6983; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 r7149, r7131, r7146; +} +{ +add.f16x2 r7152, r6150, r6198; +} +{ +mul.f16x2 r7155, r7152, r6974; +} +{ +add.f16x2 r7158, r4699, r7155; +} +{ +add.f16x2 r7161, r6166, r6182; +} +{ +mul.f16x2 r7164, r7161, r6978; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6144, r6192; +} +{ +mul.f16x2 r7173, r7170, r6976; +} +{ +sub.f16x2 r7176, r6160, r6176; +} +{ +mul.f16x2 r7179, r7176, r6980; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +add.f16x2 r7185, r7167, r7182; +} +{ +add.f16x2 r7188, r6150, r6198; +} +{ +mul.f16x2 r7191, r7188, r6974; +} +{ +add.f16x2 r7194, r4699, r7191; +} +{ +add.f16x2 r7197, r6166, r6182; +} +{ +mul.f16x2 r7200, 
r7197, r6978; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6144, r6192; +} +{ +mul.f16x2 r7209, r7206, r6976; +} +{ +sub.f16x2 r7212, r6160, r6176; +} +{ +mul.f16x2 r7215, r7212, r6980; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +sub.f16x2 r7221, r7203, r7218; +} +{ +add.f16x2 r7224, r6150, r6198; +} +{ +mul.f16x2 r7227, r7224, r6978; +} +{ +add.f16x2 r7230, r4699, r7227; +} +{ +add.f16x2 r7233, r6166, r6182; +} +{ +mul.f16x2 r7236, r7233, r6982; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6144, r6192; +} +{ +mul.f16x2 r7245, r7242, r6980; +} +{ +sub.f16x2 r7248, r6160, r6176; +} +{ +mul.f16x2 r7251, r7248, r6983; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +add.f16x2 r7257, r7239, r7254; +} +{ +add.f16x2 r7260, r6150, r6198; +} +{ +mul.f16x2 r7263, r7260, r6978; +} +{ +add.f16x2 r7266, r4699, r7263; +} +{ +add.f16x2 r7269, r6166, r6182; +} +{ +mul.f16x2 r7272, r7269, r6982; +} +{ +add.f16x2 r7275, r7266, r7272; +} +{ +sub.f16x2 r7278, r6144, r6192; +} +{ +mul.f16x2 r7281, r7278, r6980; +} +{ +sub.f16x2 r7284, r6160, r6176; +} +{ +mul.f16x2 r7287, r7284, r6983; +} +{ +add.f16x2 r7290, r7281, r7287; +} +{ +sub.f16x2 r7293, r7275, r7290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7296, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7297, {low, high}; +} +{ +neg.f16x2 r7298, r7297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7301, {low, high}; +} +{ +neg.f16x2 r7302, r7301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7305, {low, high}; +} +{ +add.f16x2 r7306, r6208, r6256; +} +{ +add.f16x2 r7309, r4591, 
r7306; +} +{ +add.f16x2 r7312, r6224, r6240; +} +{ +add.f16x2 r7315, r7309, r7312; +} +{ +add.f16x2 r7318, r6214, r6262; +} +{ +add.f16x2 r7321, r4735, r7318; +} +{ +add.f16x2 r7324, r6230, r6246; +} +{ +add.f16x2 r7327, r7321, r7324; +} +{ +add.f16x2 r7330, r6208, r6256; +} +{ +mul.f16x2 r7333, r7330, r7296; +} +{ +add.f16x2 r7336, r4591, r7333; +} +{ +add.f16x2 r7339, r6224, r6240; +} +{ +mul.f16x2 r7342, r7339, r7300; +} +{ +add.f16x2 r7345, r7336, r7342; +} +{ +sub.f16x2 r7348, r6214, r6262; +} +{ +mul.f16x2 r7351, r7348, r7298; +} +{ +sub.f16x2 r7354, r6230, r6246; +} +{ +mul.f16x2 r7357, r7354, r7302; +} +{ +add.f16x2 r7360, r7351, r7357; +} +{ +sub.f16x2 r7363, r7345, r7360; +} +{ +add.f16x2 r7366, r6208, r6256; +} +{ +mul.f16x2 r7369, r7366, r7296; +} +{ +add.f16x2 r7372, r4591, r7369; +} +{ +add.f16x2 r7375, r6224, r6240; +} +{ +mul.f16x2 r7378, r7375, r7300; +} +{ +add.f16x2 r7381, r7372, r7378; +} +{ +sub.f16x2 r7384, r6214, r6262; +} +{ +mul.f16x2 r7387, r7384, r7298; +} +{ +sub.f16x2 r7390, r6230, r6246; +} +{ +mul.f16x2 r7393, r7390, r7302; +} +{ +add.f16x2 r7396, r7387, r7393; +} +{ +add.f16x2 r7399, r7381, r7396; +} +{ +add.f16x2 r7402, r6208, r6256; +} +{ +mul.f16x2 r7405, r7402, r7300; +} +{ +add.f16x2 r7408, r4591, r7405; +} +{ +add.f16x2 r7411, r6224, r6240; +} +{ +mul.f16x2 r7414, r7411, r7304; +} +{ +add.f16x2 r7417, r7408, r7414; +} +{ +sub.f16x2 r7420, r6214, r6262; +} +{ +mul.f16x2 r7423, r7420, r7302; +} +{ +sub.f16x2 r7426, r6230, r6246; +} +{ +mul.f16x2 r7429, r7426, r7305; +} +{ +add.f16x2 r7432, r7423, r7429; +} +{ +sub.f16x2 r7435, r7417, r7432; +} +{ +add.f16x2 r7438, r6208, r6256; +} +{ +mul.f16x2 r7441, r7438, r7300; +} +{ +add.f16x2 r7444, r4591, r7441; +} +{ +add.f16x2 r7447, r6224, r6240; +} +{ +mul.f16x2 r7450, r7447, r7304; +} +{ +add.f16x2 r7453, r7444, r7450; +} +{ +sub.f16x2 r7456, r6214, r6262; +} +{ +mul.f16x2 r7459, r7456, r7302; +} +{ +sub.f16x2 r7462, r6230, r6246; +} +{ +mul.f16x2 r7465, r7462, r7305; +} +{ +add.f16x2 
r7468, r7459, r7465; +} +{ +add.f16x2 r7471, r7453, r7468; +} +{ +add.f16x2 r7474, r6214, r6262; +} +{ +mul.f16x2 r7477, r7474, r7296; +} +{ +add.f16x2 r7480, r4735, r7477; +} +{ +add.f16x2 r7483, r6230, r6246; +} +{ +mul.f16x2 r7486, r7483, r7300; +} +{ +add.f16x2 r7489, r7480, r7486; +} +{ +sub.f16x2 r7492, r6208, r6256; +} +{ +mul.f16x2 r7495, r7492, r7298; +} +{ +sub.f16x2 r7498, r6224, r6240; +} +{ +mul.f16x2 r7501, r7498, r7302; +} +{ +add.f16x2 r7504, r7495, r7501; +} +{ +add.f16x2 r7507, r7489, r7504; +} +{ +add.f16x2 r7510, r6214, r6262; +} +{ +mul.f16x2 r7513, r7510, r7296; +} +{ +add.f16x2 r7516, r4735, r7513; +} +{ +add.f16x2 r7519, r6230, r6246; +} +{ +mul.f16x2 r7522, r7519, r7300; +} +{ +add.f16x2 r7525, r7516, r7522; +} +{ +sub.f16x2 r7528, r6208, r6256; +} +{ +mul.f16x2 r7531, r7528, r7298; +} +{ +sub.f16x2 r7534, r6224, r6240; +} +{ +mul.f16x2 r7537, r7534, r7302; +} +{ +add.f16x2 r7540, r7531, r7537; +} +{ +sub.f16x2 r7543, r7525, r7540; +} +{ +add.f16x2 r7546, r6214, r6262; +} +{ +mul.f16x2 r7549, r7546, r7300; +} +{ +add.f16x2 r7552, r4735, r7549; +} +{ +add.f16x2 r7555, r6230, r6246; +} +{ +mul.f16x2 r7558, r7555, r7304; +} +{ +add.f16x2 r7561, r7552, r7558; +} +{ +sub.f16x2 r7564, r6208, r6256; +} +{ +mul.f16x2 r7567, r7564, r7302; +} +{ +sub.f16x2 r7570, r6224, r6240; +} +{ +mul.f16x2 r7573, r7570, r7305; +} +{ +add.f16x2 r7576, r7567, r7573; +} +{ +add.f16x2 r7579, r7561, r7576; +} +{ +add.f16x2 r7582, r6214, r6262; +} +{ +mul.f16x2 r7585, r7582, r7300; +} +{ +add.f16x2 r7588, r4735, r7585; +} +{ +add.f16x2 r7591, r6230, r6246; +} +{ +mul.f16x2 r7594, r7591, r7304; +} +{ +add.f16x2 r7597, r7588, r7594; +} +{ +sub.f16x2 r7600, r6208, r6256; +} +{ +mul.f16x2 r7603, r7600, r7302; +} +{ +sub.f16x2 r7606, r6224, r6240; +} +{ +mul.f16x2 r7609, r7606, r7305; +} +{ +add.f16x2 r7612, r7603, r7609; +} +{ +sub.f16x2 r7615, r7597, r7612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7618, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7619, {low, high}; +} +{ +neg.f16x2 r7620, r7619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r7622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r7623, {low, high}; +} +{ +neg.f16x2 r7624, r7623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r7626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r7627, {low, high}; +} +{ +add.f16x2 r7628, r6272, r6320; +} +{ +add.f16x2 r7631, r4519, r7628; +} +{ +add.f16x2 r7634, r6288, r6304; +} +{ +add.f16x2 r7637, r7631, r7634; +} +{ +add.f16x2 r7640, r6278, r6326; +} +{ +add.f16x2 r7643, r4663, r7640; +} +{ +add.f16x2 r7646, r6294, r6310; +} +{ +add.f16x2 r7649, r7643, r7646; +} +{ +add.f16x2 r7652, r6272, r6320; +} +{ +mul.f16x2 r7655, r7652, r7618; +} +{ +add.f16x2 r7658, r4519, r7655; +} +{ +add.f16x2 r7661, r6288, r6304; +} +{ +mul.f16x2 r7664, r7661, r7622; +} +{ +add.f16x2 r7667, r7658, r7664; +} +{ +sub.f16x2 r7670, r6278, r6326; +} +{ +mul.f16x2 r7673, r7670, r7620; +} +{ +sub.f16x2 r7676, r6294, r6310; +} +{ +mul.f16x2 r7679, r7676, r7624; +} +{ +add.f16x2 r7682, r7673, r7679; +} +{ +sub.f16x2 r7685, r7667, r7682; +} +{ +add.f16x2 r7688, r6272, r6320; +} +{ +mul.f16x2 r7691, r7688, r7618; +} +{ +add.f16x2 r7694, r4519, r7691; +} +{ +add.f16x2 r7697, r6288, r6304; +} +{ +mul.f16x2 r7700, r7697, r7622; +} +{ +add.f16x2 r7703, r7694, r7700; +} +{ +sub.f16x2 r7706, r6278, r6326; +} +{ +mul.f16x2 r7709, r7706, r7620; +} +{ +sub.f16x2 r7712, r6294, r6310; +} +{ +mul.f16x2 r7715, r7712, r7624; +} +{ +add.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7703, r7718; +} +{ +add.f16x2 r7724, r6272, r6320; +} +{ +mul.f16x2 r7727, r7724, r7622; +} +{ +add.f16x2 r7730, r4519, r7727; +} +{ +add.f16x2 r7733, r6288, r6304; +} +{ 
+mul.f16x2 r7736, r7733, r7626; +} +{ +add.f16x2 r7739, r7730, r7736; +} +{ +sub.f16x2 r7742, r6278, r6326; +} +{ +mul.f16x2 r7745, r7742, r7624; +} +{ +sub.f16x2 r7748, r6294, r6310; +} +{ +mul.f16x2 r7751, r7748, r7627; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +sub.f16x2 r7757, r7739, r7754; +} +{ +add.f16x2 r7760, r6272, r6320; +} +{ +mul.f16x2 r7763, r7760, r7622; +} +{ +add.f16x2 r7766, r4519, r7763; +} +{ +add.f16x2 r7769, r6288, r6304; +} +{ +mul.f16x2 r7772, r7769, r7626; +} +{ +add.f16x2 r7775, r7766, r7772; +} +{ +sub.f16x2 r7778, r6278, r6326; +} +{ +mul.f16x2 r7781, r7778, r7624; +} +{ +sub.f16x2 r7784, r6294, r6310; +} +{ +mul.f16x2 r7787, r7784, r7627; +} +{ +add.f16x2 r7790, r7781, r7787; +} +{ +add.f16x2 r7793, r7775, r7790; +} +{ +add.f16x2 r7796, r6278, r6326; +} +{ +mul.f16x2 r7799, r7796, r7618; +} +{ +add.f16x2 r7802, r4663, r7799; +} +{ +add.f16x2 r7805, r6294, r6310; +} +{ +mul.f16x2 r7808, r7805, r7622; +} +{ +add.f16x2 r7811, r7802, r7808; +} +{ +sub.f16x2 r7814, r6272, r6320; +} +{ +mul.f16x2 r7817, r7814, r7620; +} +{ +sub.f16x2 r7820, r6288, r6304; +} +{ +mul.f16x2 r7823, r7820, r7624; +} +{ +add.f16x2 r7826, r7817, r7823; +} +{ +add.f16x2 r7829, r7811, r7826; +} +{ +add.f16x2 r7832, r6278, r6326; +} +{ +mul.f16x2 r7835, r7832, r7618; +} +{ +add.f16x2 r7838, r4663, r7835; +} +{ +add.f16x2 r7841, r6294, r6310; +} +{ +mul.f16x2 r7844, r7841, r7622; +} +{ +add.f16x2 r7847, r7838, r7844; +} +{ +sub.f16x2 r7850, r6272, r6320; +} +{ +mul.f16x2 r7853, r7850, r7620; +} +{ +sub.f16x2 r7856, r6288, r6304; +} +{ +mul.f16x2 r7859, r7856, r7624; +} +{ +add.f16x2 r7862, r7853, r7859; +} +{ +sub.f16x2 r7865, r7847, r7862; +} +{ +add.f16x2 r7868, r6278, r6326; +} +{ +mul.f16x2 r7871, r7868, r7622; +} +{ +add.f16x2 r7874, r4663, r7871; +} +{ +add.f16x2 r7877, r6294, r6310; +} +{ +mul.f16x2 r7880, r7877, r7626; +} +{ +add.f16x2 r7883, r7874, r7880; +} +{ +sub.f16x2 r7886, r6272, r6320; +} +{ +mul.f16x2 r7889, r7886, r7624; +} +{ +sub.f16x2 r7892, 
r6288, r6304; +} +{ +mul.f16x2 r7895, r7892, r7627; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7883, r7898; +} +{ +add.f16x2 r7904, r6278, r6326; +} +{ +mul.f16x2 r7907, r7904, r7622; +} +{ +add.f16x2 r7910, r4663, r7907; +} +{ +add.f16x2 r7913, r6294, r6310; +} +{ +mul.f16x2 r7916, r7913, r7626; +} +{ +add.f16x2 r7919, r7910, r7916; +} +{ +sub.f16x2 r7922, r6272, r6320; +} +{ +mul.f16x2 r7925, r7922, r7624; +} +{ +sub.f16x2 r7928, r6288, r6304; +} +{ +mul.f16x2 r7931, r7928, r7627; +} +{ +add.f16x2 r7934, r7925, r7931; +} +{ +sub.f16x2 r7937, r7919, r7934; +} +mul.wide.u32 rd4, r10447, 1374389535; +shr.u64 rd5, rd4, 35; +cvt.u32.u64 r10451, rd5; +mul.lo.s32 r10452, r10451, 25; +sub.s32 r10453, r10447, r10452; +shl.b32 r10454, r10453, 2; +add.s32 r10455, r10448, r10454; +cvt.rn.f32.u32 f600, r10451; +mul.f32 f601, f600, 0f3D4DE32E; +cos.approx.f32 f485, f601; +sin.approx.f32 f602, f601; +neg.f32 f486, f602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f485; +cvt.rn.f16.f32 high, f486; +mov.b32 r7940, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7943, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7945, {high, high}; +} +{ +mul.f16x2 r7947, r6683, r7945; +} +{ +fma.rn.f16x2 r7950, r6671, r7943, r7947; +} +{ +mul.f16x2 r7954, r6671, r7945; +} +{ +neg.f16x2 r7957, r7954; +} +{ +fma.rn.f16x2 r7959, r6683, r7943, r7957; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7963, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7965, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r7967, {low, high}; +} +{ +mul.f16x2 r7968, r7965, r7967; +} +{ +mul.f16x2 r7971, r7940, r7963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r7974, {high, low}; +} +{ +fma.rn.f16x2 r7976, r7968, r7974, r7971; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r7980, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r7982, {high, high}; +} +{ +mul.f16x2 r7984, r7005, r7982; +} +{ +fma.rn.f16x2 r7987, r6993, r7980, r7984; +} +{ +mul.f16x2 r7991, r6993, r7982; +} +{ +neg.f16x2 r7994, r7991; +} +{ +fma.rn.f16x2 r7996, r7005, r7980, r7994; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8000, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8002, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8004, {low, high}; +} +{ +mul.f16x2 r8005, r8002, r8004; +} +{ +mul.f16x2 r8008, r7976, r8000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7976; +mov.b32 r8011, {high, low}; +} +{ +fma.rn.f16x2 r8013, r8005, r8011, r8008; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8017, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8019, {high, high}; +} +{ +mul.f16x2 r8021, r7327, r8019; +} +{ +fma.rn.f16x2 r8024, r7315, r8017, r8021; +} +{ +mul.f16x2 r8028, r7315, r8019; +} +{ +neg.f16x2 r8031, r8028; +} +{ +fma.rn.f16x2 r8033, r7327, r8017, r8031; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8037, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8039, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8041, {low, high}; +} +{ +mul.f16x2 r8042, r8039, r8041; +} +{ +mul.f16x2 r8045, r8013, r8037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8013; +mov.b32 r8048, {high, low}; +} +{ +fma.rn.f16x2 r8050, r8042, r8048, r8045; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8054, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8056, {high, high}; +} +{ +mul.f16x2 r8058, r7649, r8056; +} +{ +fma.rn.f16x2 r8061, r7637, r8054, r8058; +} +{ +mul.f16x2 r8065, r7637, r8056; +} +{ +neg.f16x2 r8068, r8065; +} +{ 
+fma.rn.f16x2 r8070, r7649, r8054, r8068; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8074, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8076, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8078, {low, high}; +} +{ +mul.f16x2 r8079, r8076, r8078; +} +{ +mul.f16x2 r8082, r8050, r8074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8050; +mov.b32 r8085, {high, low}; +} +{ +fma.rn.f16x2 r8087, r8079, r8085, r8082; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8091, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8093, {high, high}; +} +{ +mul.f16x2 r8095, r6541, r8093; +} +{ +fma.rn.f16x2 r8098, r6397, r8091, r8095; +} +{ +mul.f16x2 r8102, r6397, r8093; +} +{ +neg.f16x2 r8105, r8102; +} +{ +fma.rn.f16x2 r8107, r6541, r8091, r8105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8115, {low, high}; +} +{ +mul.f16x2 r8116, r8113, r8115; +} +{ +mul.f16x2 r8119, r8087, r8111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8087; +mov.b32 r8122, {high, low}; +} +{ +fma.rn.f16x2 r8124, r8116, r8122, r8119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8130, {high, high}; +} +{ +mul.f16x2 r8132, r6863, r8130; +} +{ +fma.rn.f16x2 r8135, r6719, r8128, r8132; +} +{ +mul.f16x2 r8139, r6719, r8130; +} +{ +neg.f16x2 r8142, r8139; +} +{ +fma.rn.f16x2 r8144, r6863, r8128, r8142; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8148, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8150, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8152, {low, high}; +} +{ +mul.f16x2 r8153, r8150, r8152; +} +{ +mul.f16x2 r8156, r8124, r8148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8124; +mov.b32 r8159, {high, low}; +} +{ +fma.rn.f16x2 r8161, r8153, r8159, r8156; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8165, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8167, {high, high}; +} +{ +mul.f16x2 r8169, r7185, r8167; +} +{ +fma.rn.f16x2 r8172, r7041, r8165, r8169; +} +{ +mul.f16x2 r8176, r7041, r8167; +} +{ +neg.f16x2 r8179, r8176; +} +{ +fma.rn.f16x2 r8181, r7185, r8165, r8179; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8185, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8187, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8189, {low, high}; +} +{ +mul.f16x2 r8190, r8187, r8189; +} +{ +mul.f16x2 r8193, r8161, r8185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8161; +mov.b32 r8196, {high, low}; +} +{ +fma.rn.f16x2 r8198, r8190, r8196, r8193; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8202, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8204, {high, high}; +} +{ +mul.f16x2 r8206, r7507, r8204; +} +{ +fma.rn.f16x2 r8209, r7363, r8202, r8206; +} +{ +mul.f16x2 r8213, r7363, r8204; +} +{ +neg.f16x2 r8216, r8213; +} +{ +fma.rn.f16x2 r8218, r7507, r8202, r8216; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8222, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8224, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8226, {low, high}; +} +{ +mul.f16x2 r8227, r8224, r8226; +} +{ +mul.f16x2 r8230, r8198, r8222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8198; +mov.b32 r8233, {high, low}; +} +{ +fma.rn.f16x2 r8235, 
r8227, r8233, r8230; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8239, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8241, {high, high}; +} +{ +mul.f16x2 r8243, r7829, r8241; +} +{ +fma.rn.f16x2 r8246, r7685, r8239, r8243; +} +{ +mul.f16x2 r8250, r7685, r8241; +} +{ +neg.f16x2 r8253, r8250; +} +{ +fma.rn.f16x2 r8255, r7829, r8239, r8253; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8259, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8261, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8263, {low, high}; +} +{ +mul.f16x2 r8264, r8261, r8263; +} +{ +mul.f16x2 r8267, r8235, r8259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8235; +mov.b32 r8270, {high, low}; +} +{ +fma.rn.f16x2 r8272, r8264, r8270, r8267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8278, {high, high}; +} +{ +mul.f16x2 r8280, r6613, r8278; +} +{ +fma.rn.f16x2 r8283, r6469, r8276, r8280; +} +{ +mul.f16x2 r8287, r6469, r8278; +} +{ +neg.f16x2 r8290, r8287; +} +{ +fma.rn.f16x2 r8292, r6613, r8276, r8290; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8296, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8298, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8300, {low, high}; +} +{ +mul.f16x2 r8301, r8298, r8300; +} +{ +mul.f16x2 r8304, r8272, r8296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8272; +mov.b32 r8307, {high, low}; +} +{ +fma.rn.f16x2 r8309, r8301, r8307, r8304; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8313, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8315, {high, high}; +} +{ +mul.f16x2 r8317, r6935, r8315; +} +{ +fma.rn.f16x2 r8320, 
r6791, r8313, r8317; +} +{ +mul.f16x2 r8324, r6791, r8315; +} +{ +neg.f16x2 r8327, r8324; +} +{ +fma.rn.f16x2 r8329, r6935, r8313, r8327; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8333, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8335, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8337, {low, high}; +} +{ +mul.f16x2 r8338, r8335, r8337; +} +{ +mul.f16x2 r8341, r8309, r8333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8309; +mov.b32 r8344, {high, low}; +} +{ +fma.rn.f16x2 r8346, r8338, r8344, r8341; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8350, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8352, {high, high}; +} +{ +mul.f16x2 r8354, r7257, r8352; +} +{ +fma.rn.f16x2 r8357, r7113, r8350, r8354; +} +{ +mul.f16x2 r8361, r7113, r8352; +} +{ +neg.f16x2 r8364, r8361; +} +{ +fma.rn.f16x2 r8366, r7257, r8350, r8364; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8370, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8372, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8374, {low, high}; +} +{ +mul.f16x2 r8375, r8372, r8374; +} +{ +mul.f16x2 r8378, r8346, r8370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8346; +mov.b32 r8381, {high, low}; +} +{ +fma.rn.f16x2 r8383, r8375, r8381, r8378; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8387, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8389, {high, high}; +} +{ +mul.f16x2 r8391, r7579, r8389; +} +{ +fma.rn.f16x2 r8394, r7435, r8387, r8391; +} +{ +mul.f16x2 r8398, r7435, r8389; +} +{ +neg.f16x2 r8401, r8398; +} +{ +fma.rn.f16x2 r8403, r7579, r8387, r8401; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8407, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r7940; +mov.b32 r8409, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8411, {low, high}; +} +{ +mul.f16x2 r8412, r8409, r8411; +} +{ +mul.f16x2 r8415, r8383, r8407; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8383; +mov.b32 r8418, {high, low}; +} +{ +fma.rn.f16x2 r8420, r8412, r8418, r8415; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8424, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8426, {high, high}; +} +{ +mul.f16x2 r8428, r7901, r8426; +} +{ +fma.rn.f16x2 r8431, r7757, r8424, r8428; +} +{ +mul.f16x2 r8435, r7757, r8426; +} +{ +neg.f16x2 r8438, r8435; +} +{ +fma.rn.f16x2 r8440, r7901, r8424, r8438; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8444, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8446, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8448, {low, high}; +} +{ +mul.f16x2 r8449, r8446, r8448; +} +{ +mul.f16x2 r8452, r8420, r8444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8420; +mov.b32 r8455, {high, low}; +} +{ +fma.rn.f16x2 r8457, r8449, r8455, r8452; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8457; +mov.b32 r8461, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8457; +mov.b32 r8463, {high, high}; +} +{ +mul.f16x2 r8465, r6649, r8463; +} +{ +fma.rn.f16x2 r8468, r6505, r8461, r8465; +} +{ +mul.f16x2 r8472, r6505, r8463; +} +{ +neg.f16x2 r8475, r8472; +} +{ +fma.rn.f16x2 r8477, r6649, r8461, r8475; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8481, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8483, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8485, {low, high}; +} +{ +mul.f16x2 r8486, r8483, r8485; +} +{ +mul.f16x2 r8489, r8457, r8481; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r8457; +mov.b32 r8492, {high, low}; +} +{ +fma.rn.f16x2 r8494, r8486, r8492, r8489; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8498, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8500, {high, high}; +} +{ +mul.f16x2 r8502, r6971, r8500; +} +{ +fma.rn.f16x2 r8505, r6827, r8498, r8502; +} +{ +mul.f16x2 r8509, r6827, r8500; +} +{ +neg.f16x2 r8512, r8509; +} +{ +fma.rn.f16x2 r8514, r6971, r8498, r8512; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8518, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8520, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8522, {low, high}; +} +{ +mul.f16x2 r8523, r8520, r8522; +} +{ +mul.f16x2 r8526, r8494, r8518; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8494; +mov.b32 r8529, {high, low}; +} +{ +fma.rn.f16x2 r8531, r8523, r8529, r8526; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8531; +mov.b32 r8535, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8531; +mov.b32 r8537, {high, high}; +} +{ +mul.f16x2 r8539, r7293, r8537; +} +{ +fma.rn.f16x2 r8542, r7149, r8535, r8539; +} +{ +mul.f16x2 r8546, r7149, r8537; +} +{ +neg.f16x2 r8549, r8546; +} +{ +fma.rn.f16x2 r8551, r7293, r8535, r8549; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8555, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8557, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8559, {low, high}; +} +{ +mul.f16x2 r8560, r8557, r8559; +} +{ +mul.f16x2 r8563, r8531, r8555; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8531; +mov.b32 r8566, {high, low}; +} +{ +fma.rn.f16x2 r8568, r8560, r8566, r8563; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; +mov.b32 r8572, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; 
+mov.b32 r8574, {high, high}; +} +{ +mul.f16x2 r8576, r7615, r8574; +} +{ +fma.rn.f16x2 r8579, r7471, r8572, r8576; +} +{ +mul.f16x2 r8583, r7471, r8574; +} +{ +neg.f16x2 r8586, r8583; +} +{ +fma.rn.f16x2 r8588, r7615, r8572, r8586; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8592, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8594, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8596, {low, high}; +} +{ +mul.f16x2 r8597, r8594, r8596; +} +{ +mul.f16x2 r8600, r8568, r8592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8568; +mov.b32 r8603, {high, low}; +} +{ +fma.rn.f16x2 r8605, r8597, r8603, r8600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8609, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8611, {high, high}; +} +{ +mul.f16x2 r8613, r7937, r8611; +} +{ +fma.rn.f16x2 r8616, r7793, r8609, r8613; +} +{ +mul.f16x2 r8620, r7793, r8611; +} +{ +neg.f16x2 r8623, r8620; +} +{ +fma.rn.f16x2 r8625, r7937, r8609, r8623; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8631, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8633, {low, high}; +} +{ +mul.f16x2 r8634, r8631, r8633; +} +{ +mul.f16x2 r8637, r8605, r8629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8605; +mov.b32 r8640, {high, low}; +} +{ +fma.rn.f16x2 r8642, r8634, r8640, r8637; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8646, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8648, {high, high}; +} +{ +mul.f16x2 r8650, r6577, r8648; +} +{ +fma.rn.f16x2 r8653, r6433, r8646, r8650; +} +{ +mul.f16x2 r8657, r6433, r8648; +} +{ +neg.f16x2 r8660, r8657; +} +{ +fma.rn.f16x2 r8662, r6577, r8646, r8660; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7940; +mov.b32 r8666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8668, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8670, {low, high}; +} +{ +mul.f16x2 r8671, r8668, r8670; +} +{ +mul.f16x2 r8674, r8642, r8666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8642; +mov.b32 r8677, {high, low}; +} +{ +fma.rn.f16x2 r8679, r8671, r8677, r8674; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8683, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8685, {high, high}; +} +{ +mul.f16x2 r8687, r6899, r8685; +} +{ +fma.rn.f16x2 r8690, r6755, r8683, r8687; +} +{ +mul.f16x2 r8694, r6755, r8685; +} +{ +neg.f16x2 r8697, r8694; +} +{ +fma.rn.f16x2 r8699, r6899, r8683, r8697; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8705, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8707, {low, high}; +} +{ +mul.f16x2 r8708, r8705, r8707; +} +{ +mul.f16x2 r8711, r8679, r8703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8679; +mov.b32 r8714, {high, low}; +} +{ +fma.rn.f16x2 r8716, r8708, r8714, r8711; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8720, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8722, {high, high}; +} +{ +mul.f16x2 r8724, r7221, r8722; +} +{ +fma.rn.f16x2 r8727, r7077, r8720, r8724; +} +{ +mul.f16x2 r8731, r7077, r8722; +} +{ +neg.f16x2 r8734, r8731; +} +{ +fma.rn.f16x2 r8736, r7221, r8720, r8734; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8742, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8744, {low, high}; +} +{ 
+mul.f16x2 r8745, r8742, r8744; +} +{ +mul.f16x2 r8748, r8716, r8740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8716; +mov.b32 r8751, {high, low}; +} +{ +fma.rn.f16x2 r8753, r8745, r8751, r8748; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8757, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8759, {high, high}; +} +{ +mul.f16x2 r8761, r7543, r8759; +} +{ +fma.rn.f16x2 r8764, r7399, r8757, r8761; +} +{ +mul.f16x2 r8768, r7399, r8759; +} +{ +neg.f16x2 r8771, r8768; +} +{ +fma.rn.f16x2 r8773, r7543, r8757, r8771; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7940; +mov.b32 r8779, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f533; +cvt.rn.f16.f32 high, f534; +mov.b32 r8781, {low, high}; +} +{ +mul.f16x2 r8782, r8779, r8781; +} +{ +mul.f16x2 r8785, r8753, r8777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8753; +mov.b32 r8788, {high, low}; +} +{ +fma.rn.f16x2 r8790, r8782, r8788, r8785; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8790; +mov.b32 r8794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r8790; +mov.b32 r8796, {high, high}; +} +{ +mul.f16x2 r8798, r7865, r8796; +} +{ +fma.rn.f16x2 r8801, r7721, r8794, r8798; +} +{ +mul.f16x2 r8805, r7721, r8796; +} +{ +neg.f16x2 r8808, r8805; +} +{ +fma.rn.f16x2 r8810, r7865, r8794, r8808; +} +barrier.sync 0; +mad.lo.s32 r10456, r10451, 2500, r10455; +st.shared.u32 [r10456], r6349; +st.shared.u32 [r10456+100], r7950; +st.shared.u32 [r10456+200], r7987; +st.shared.u32 [r10456+300], r8024; +st.shared.u32 [r10456+400], r8061; +st.shared.u32 [r10456+500], r8098; +st.shared.u32 [r10456+600], r8135; +st.shared.u32 [r10456+700], r8172; +st.shared.u32 [r10456+800], r8209; +st.shared.u32 [r10456+900], r8246; +st.shared.u32 [r10456+1000], r8283; +st.shared.u32 [r10456+1100], r8320; +st.shared.u32 [r10456+1200], r8357; +st.shared.u32 
[r10456+1300], r8394; +st.shared.u32 [r10456+1400], r8431; +st.shared.u32 [r10456+1500], r8468; +st.shared.u32 [r10456+1600], r8505; +st.shared.u32 [r10456+1700], r8542; +st.shared.u32 [r10456+1800], r8579; +st.shared.u32 [r10456+1900], r8616; +st.shared.u32 [r10456+2000], r8653; +st.shared.u32 [r10456+2100], r8690; +st.shared.u32 [r10456+2200], r8727; +st.shared.u32 [r10456+2300], r8764; +st.shared.u32 [r10456+2400], r8801; +barrier.sync 0; +ld.shared.u32 r8845, [r10450]; +ld.shared.u32 r9167, [r10450+500]; +ld.shared.u32 r9489, [r10450+1000]; +ld.shared.u32 r9811, [r10450+1500]; +ld.shared.u32 r10133, [r10450+2000]; +ld.shared.u32 r8842, [r10450+2500]; +ld.shared.u32 r9164, [r10450+3000]; +ld.shared.u32 r9486, [r10450+3500]; +ld.shared.u32 r9808, [r10450+4000]; +ld.shared.u32 r10130, [r10450+4500]; +ld.shared.u32 r8848, [r10450+5000]; +ld.shared.u32 r9170, [r10450+5500]; +ld.shared.u32 r9492, [r10450+6000]; +ld.shared.u32 r9814, [r10450+6500]; +ld.shared.u32 r10136, [r10450+7000]; +ld.shared.u32 r8849, [r10450+7500]; +ld.shared.u32 r9171, [r10450+8000]; +ld.shared.u32 r9493, [r10450+8500]; +ld.shared.u32 r9815, [r10450+9000]; +ld.shared.u32 r10137, [r10450+9500]; +ld.shared.u32 r8843, [r10450+10000]; +ld.shared.u32 r9165, [r10450+10500]; +ld.shared.u32 r9487, [r10450+11000]; +ld.shared.u32 r9809, [r10450+11500]; +ld.shared.u32 r10131, [r10450+12000]; +barrier.sync 0; +st.shared.u32 [r10456], r6361; +st.shared.u32 [r10456+100], r7959; +st.shared.u32 [r10456+200], r7996; +st.shared.u32 [r10456+300], r8033; +st.shared.u32 [r10456+400], r8070; +st.shared.u32 [r10456+500], r8107; +st.shared.u32 [r10456+600], r8144; +st.shared.u32 [r10456+700], r8181; +st.shared.u32 [r10456+800], r8218; +st.shared.u32 [r10456+900], r8255; +st.shared.u32 [r10456+1000], r8292; +st.shared.u32 [r10456+1100], r8329; +st.shared.u32 [r10456+1200], r8366; +st.shared.u32 [r10456+1300], r8403; +st.shared.u32 [r10456+1400], r8440; +st.shared.u32 [r10456+1500], r8477; +st.shared.u32 [r10456+1600], 
r8514; +st.shared.u32 [r10456+1700], r8551; +st.shared.u32 [r10456+1800], r8588; +st.shared.u32 [r10456+1900], r8625; +st.shared.u32 [r10456+2000], r8662; +st.shared.u32 [r10456+2100], r8699; +st.shared.u32 [r10456+2200], r8736; +st.shared.u32 [r10456+2300], r8773; +st.shared.u32 [r10456+2400], r8810; +barrier.sync 0; +ld.shared.u32 r8857, [r10450]; +ld.shared.u32 r9179, [r10450+500]; +ld.shared.u32 r9501, [r10450+1000]; +ld.shared.u32 r9823, [r10450+1500]; +ld.shared.u32 r10145, [r10450+2000]; +ld.shared.u32 r8854, [r10450+2500]; +ld.shared.u32 r9176, [r10450+3000]; +ld.shared.u32 r9498, [r10450+3500]; +ld.shared.u32 r9820, [r10450+4000]; +ld.shared.u32 r10142, [r10450+4500]; +ld.shared.u32 r8860, [r10450+5000]; +ld.shared.u32 r9182, [r10450+5500]; +ld.shared.u32 r9504, [r10450+6000]; +ld.shared.u32 r9826, [r10450+6500]; +ld.shared.u32 r10148, [r10450+7000]; +ld.shared.u32 r8861, [r10450+7500]; +ld.shared.u32 r9183, [r10450+8000]; +ld.shared.u32 r9505, [r10450+8500]; +ld.shared.u32 r9827, [r10450+9000]; +ld.shared.u32 r10149, [r10450+9500]; +ld.shared.u32 r8855, [r10450+10000]; +ld.shared.u32 r9177, [r10450+10500]; +ld.shared.u32 r9499, [r10450+11000]; +ld.shared.u32 r9821, [r10450+11500]; +ld.shared.u32 r10143, [r10450+12000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8831, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8832, {low, high}; +} +{ +neg.f16x2 r8833, r8832; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r8835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r8836, {low, high}; +} +{ +neg.f16x2 r8837, r8836; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r8839, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r8840, {low, high}; +} +{ +add.f16x2 
r8841, r8842, r8843; +} +{ +add.f16x2 r8844, r8845, r8841; +} +{ +add.f16x2 r8847, r8848, r8849; +} +{ +add.f16x2 %0, r8844, r8847; +} +{ +add.f16x2 r8853, r8854, r8855; +} +{ +add.f16x2 r8856, r8857, r8853; +} +{ +add.f16x2 r8859, r8860, r8861; +} +{ +add.f16x2 %1, r8856, r8859; +} +{ +add.f16x2 r8865, r8842, r8843; +} +{ +mul.f16x2 r8868, r8865, r8831; +} +{ +add.f16x2 r8871, r8845, r8868; +} +{ +add.f16x2 r8874, r8848, r8849; +} +{ +mul.f16x2 r8877, r8874, r8835; +} +{ +add.f16x2 r8880, r8871, r8877; +} +{ +sub.f16x2 r8883, r8854, r8855; +} +{ +mul.f16x2 r8886, r8883, r8833; +} +{ +sub.f16x2 r8889, r8860, r8861; +} +{ +mul.f16x2 r8892, r8889, r8837; +} +{ +add.f16x2 r8895, r8886, r8892; +} +{ +sub.f16x2 %10, r8880, r8895; +} +{ +add.f16x2 r8901, r8842, r8843; +} +{ +mul.f16x2 r8904, r8901, r8831; +} +{ +add.f16x2 r8907, r8845, r8904; +} +{ +add.f16x2 r8910, r8848, r8849; +} +{ +mul.f16x2 r8913, r8910, r8835; +} +{ +add.f16x2 r8916, r8907, r8913; +} +{ +sub.f16x2 r8919, r8854, r8855; +} +{ +mul.f16x2 r8922, r8919, r8833; +} +{ +sub.f16x2 r8925, r8860, r8861; +} +{ +mul.f16x2 r8928, r8925, r8837; +} +{ +add.f16x2 r8931, r8922, r8928; +} +{ +add.f16x2 %40, r8916, r8931; +} +{ +add.f16x2 r8937, r8842, r8843; +} +{ +mul.f16x2 r8940, r8937, r8835; +} +{ +add.f16x2 r8943, r8845, r8940; +} +{ +add.f16x2 r8946, r8848, r8849; +} +{ +mul.f16x2 r8949, r8946, r8839; +} +{ +add.f16x2 r8952, r8943, r8949; +} +{ +sub.f16x2 r8955, r8854, r8855; +} +{ +mul.f16x2 r8958, r8955, r8837; +} +{ +sub.f16x2 r8961, r8860, r8861; +} +{ +mul.f16x2 r8964, r8961, r8840; +} +{ +add.f16x2 r8967, r8958, r8964; +} +{ +sub.f16x2 %20, r8952, r8967; +} +{ +add.f16x2 r8973, r8842, r8843; +} +{ +mul.f16x2 r8976, r8973, r8835; +} +{ +add.f16x2 r8979, r8845, r8976; +} +{ +add.f16x2 r8982, r8848, r8849; +} +{ +mul.f16x2 r8985, r8982, r8839; +} +{ +add.f16x2 r8988, r8979, r8985; +} +{ +sub.f16x2 r8991, r8854, r8855; +} +{ +mul.f16x2 r8994, r8991, r8837; +} +{ +sub.f16x2 r8997, r8860, r8861; +} +{ 
+mul.f16x2 r9000, r8997, r8840; +} +{ +add.f16x2 r9003, r8994, r9000; +} +{ +add.f16x2 %30, r8988, r9003; +} +{ +add.f16x2 r9009, r8854, r8855; +} +{ +mul.f16x2 r9012, r9009, r8831; +} +{ +add.f16x2 r9015, r8857, r9012; +} +{ +add.f16x2 r9018, r8860, r8861; +} +{ +mul.f16x2 r9021, r9018, r8835; +} +{ +add.f16x2 r9024, r9015, r9021; +} +{ +sub.f16x2 r9027, r8842, r8843; +} +{ +mul.f16x2 r9030, r9027, r8833; +} +{ +sub.f16x2 r9033, r8848, r8849; +} +{ +mul.f16x2 r9036, r9033, r8837; +} +{ +add.f16x2 r9039, r9030, r9036; +} +{ +add.f16x2 %11, r9024, r9039; +} +{ +add.f16x2 r9045, r8854, r8855; +} +{ +mul.f16x2 r9048, r9045, r8831; +} +{ +add.f16x2 r9051, r8857, r9048; +} +{ +add.f16x2 r9054, r8860, r8861; +} +{ +mul.f16x2 r9057, r9054, r8835; +} +{ +add.f16x2 r9060, r9051, r9057; +} +{ +sub.f16x2 r9063, r8842, r8843; +} +{ +mul.f16x2 r9066, r9063, r8833; +} +{ +sub.f16x2 r9069, r8848, r8849; +} +{ +mul.f16x2 r9072, r9069, r8837; +} +{ +add.f16x2 r9075, r9066, r9072; +} +{ +sub.f16x2 %41, r9060, r9075; +} +{ +add.f16x2 r9081, r8854, r8855; +} +{ +mul.f16x2 r9084, r9081, r8835; +} +{ +add.f16x2 r9087, r8857, r9084; +} +{ +add.f16x2 r9090, r8860, r8861; +} +{ +mul.f16x2 r9093, r9090, r8839; +} +{ +add.f16x2 r9096, r9087, r9093; +} +{ +sub.f16x2 r9099, r8842, r8843; +} +{ +mul.f16x2 r9102, r9099, r8837; +} +{ +sub.f16x2 r9105, r8848, r8849; +} +{ +mul.f16x2 r9108, r9105, r8840; +} +{ +add.f16x2 r9111, r9102, r9108; +} +{ +add.f16x2 %21, r9096, r9111; +} +{ +add.f16x2 r9117, r8854, r8855; +} +{ +mul.f16x2 r9120, r9117, r8835; +} +{ +add.f16x2 r9123, r8857, r9120; +} +{ +add.f16x2 r9126, r8860, r8861; +} +{ +mul.f16x2 r9129, r9126, r8839; +} +{ +add.f16x2 r9132, r9123, r9129; +} +{ +sub.f16x2 r9135, r8842, r8843; +} +{ +mul.f16x2 r9138, r9135, r8837; +} +{ +sub.f16x2 r9141, r8848, r8849; +} +{ +mul.f16x2 r9144, r9141, r8840; +} +{ +add.f16x2 r9147, r9138, r9144; +} +{ +sub.f16x2 %31, r9132, r9147; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, 
f594; +mov.b32 r9153, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9154, {low, high}; +} +{ +neg.f16x2 r9155, r9154; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9158, {low, high}; +} +{ +neg.f16x2 r9159, r9158; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9161, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9162, {low, high}; +} +{ +add.f16x2 r9163, r9164, r9165; +} +{ +add.f16x2 r9166, r9167, r9163; +} +{ +add.f16x2 r9169, r9170, r9171; +} +{ +add.f16x2 %2, r9166, r9169; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 %3, r9178, r9181; +} +{ +add.f16x2 r9187, r9164, r9165; +} +{ +mul.f16x2 r9190, r9187, r9153; +} +{ +add.f16x2 r9193, r9167, r9190; +} +{ +add.f16x2 r9196, r9170, r9171; +} +{ +mul.f16x2 r9199, r9196, r9157; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +sub.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9155; +} +{ +sub.f16x2 r9211, r9182, r9183; +} +{ +mul.f16x2 r9214, r9211, r9159; +} +{ +add.f16x2 r9217, r9208, r9214; +} +{ +sub.f16x2 %12, r9202, r9217; +} +{ +add.f16x2 r9223, r9164, r9165; +} +{ +mul.f16x2 r9226, r9223, r9153; +} +{ +add.f16x2 r9229, r9167, r9226; +} +{ +add.f16x2 r9232, r9170, r9171; +} +{ +mul.f16x2 r9235, r9232, r9157; +} +{ +add.f16x2 r9238, r9229, r9235; +} +{ +sub.f16x2 r9241, r9176, r9177; +} +{ +mul.f16x2 r9244, r9241, r9155; +} +{ +sub.f16x2 r9247, r9182, r9183; +} +{ +mul.f16x2 r9250, r9247, r9159; +} +{ +add.f16x2 r9253, r9244, r9250; +} +{ +add.f16x2 %42, r9238, r9253; +} +{ +add.f16x2 r9259, r9164, r9165; +} +{ +mul.f16x2 r9262, r9259, r9157; +} +{ +add.f16x2 r9265, r9167, r9262; +} +{ +add.f16x2 
r9268, r9170, r9171; +} +{ +mul.f16x2 r9271, r9268, r9161; +} +{ +add.f16x2 r9274, r9265, r9271; +} +{ +sub.f16x2 r9277, r9176, r9177; +} +{ +mul.f16x2 r9280, r9277, r9159; +} +{ +sub.f16x2 r9283, r9182, r9183; +} +{ +mul.f16x2 r9286, r9283, r9162; +} +{ +add.f16x2 r9289, r9280, r9286; +} +{ +sub.f16x2 %22, r9274, r9289; +} +{ +add.f16x2 r9295, r9164, r9165; +} +{ +mul.f16x2 r9298, r9295, r9157; +} +{ +add.f16x2 r9301, r9167, r9298; +} +{ +add.f16x2 r9304, r9170, r9171; +} +{ +mul.f16x2 r9307, r9304, r9161; +} +{ +add.f16x2 r9310, r9301, r9307; +} +{ +sub.f16x2 r9313, r9176, r9177; +} +{ +mul.f16x2 r9316, r9313, r9159; +} +{ +sub.f16x2 r9319, r9182, r9183; +} +{ +mul.f16x2 r9322, r9319, r9162; +} +{ +add.f16x2 r9325, r9316, r9322; +} +{ +add.f16x2 %32, r9310, r9325; +} +{ +add.f16x2 r9331, r9176, r9177; +} +{ +mul.f16x2 r9334, r9331, r9153; +} +{ +add.f16x2 r9337, r9179, r9334; +} +{ +add.f16x2 r9340, r9182, r9183; +} +{ +mul.f16x2 r9343, r9340, r9157; +} +{ +add.f16x2 r9346, r9337, r9343; +} +{ +sub.f16x2 r9349, r9164, r9165; +} +{ +mul.f16x2 r9352, r9349, r9155; +} +{ +sub.f16x2 r9355, r9170, r9171; +} +{ +mul.f16x2 r9358, r9355, r9159; +} +{ +add.f16x2 r9361, r9352, r9358; +} +{ +add.f16x2 %13, r9346, r9361; +} +{ +add.f16x2 r9367, r9176, r9177; +} +{ +mul.f16x2 r9370, r9367, r9153; +} +{ +add.f16x2 r9373, r9179, r9370; +} +{ +add.f16x2 r9376, r9182, r9183; +} +{ +mul.f16x2 r9379, r9376, r9157; +} +{ +add.f16x2 r9382, r9373, r9379; +} +{ +sub.f16x2 r9385, r9164, r9165; +} +{ +mul.f16x2 r9388, r9385, r9155; +} +{ +sub.f16x2 r9391, r9170, r9171; +} +{ +mul.f16x2 r9394, r9391, r9159; +} +{ +add.f16x2 r9397, r9388, r9394; +} +{ +sub.f16x2 %43, r9382, r9397; +} +{ +add.f16x2 r9403, r9176, r9177; +} +{ +mul.f16x2 r9406, r9403, r9157; +} +{ +add.f16x2 r9409, r9179, r9406; +} +{ +add.f16x2 r9412, r9182, r9183; +} +{ +mul.f16x2 r9415, r9412, r9161; +} +{ +add.f16x2 r9418, r9409, r9415; +} +{ +sub.f16x2 r9421, r9164, r9165; +} +{ +mul.f16x2 r9424, r9421, r9159; +} +{ 
+sub.f16x2 r9427, r9170, r9171; +} +{ +mul.f16x2 r9430, r9427, r9162; +} +{ +add.f16x2 r9433, r9424, r9430; +} +{ +add.f16x2 %23, r9418, r9433; +} +{ +add.f16x2 r9439, r9176, r9177; +} +{ +mul.f16x2 r9442, r9439, r9157; +} +{ +add.f16x2 r9445, r9179, r9442; +} +{ +add.f16x2 r9448, r9182, r9183; +} +{ +mul.f16x2 r9451, r9448, r9161; +} +{ +add.f16x2 r9454, r9445, r9451; +} +{ +sub.f16x2 r9457, r9164, r9165; +} +{ +mul.f16x2 r9460, r9457, r9159; +} +{ +sub.f16x2 r9463, r9170, r9171; +} +{ +mul.f16x2 r9466, r9463, r9162; +} +{ +add.f16x2 r9469, r9460, r9466; +} +{ +sub.f16x2 %33, r9454, r9469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9476, {low, high}; +} +{ +neg.f16x2 r9477, r9476; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9480, {low, high}; +} +{ +neg.f16x2 r9481, r9480; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9484, {low, high}; +} +{ +add.f16x2 r9485, r9486, r9487; +} +{ +add.f16x2 r9488, r9489, r9485; +} +{ +add.f16x2 r9491, r9492, r9493; +} +{ +add.f16x2 %4, r9488, r9491; +} +{ +add.f16x2 r9497, r9498, r9499; +} +{ +add.f16x2 r9500, r9501, r9497; +} +{ +add.f16x2 r9503, r9504, r9505; +} +{ +add.f16x2 %5, r9500, r9503; +} +{ +add.f16x2 r9509, r9486, r9487; +} +{ +mul.f16x2 r9512, r9509, r9475; +} +{ +add.f16x2 r9515, r9489, r9512; +} +{ +add.f16x2 r9518, r9492, r9493; +} +{ +mul.f16x2 r9521, r9518, r9479; +} +{ +add.f16x2 r9524, r9515, r9521; +} +{ +sub.f16x2 r9527, r9498, r9499; +} +{ +mul.f16x2 r9530, r9527, r9477; +} +{ +sub.f16x2 r9533, r9504, r9505; +} +{ +mul.f16x2 r9536, 
r9533, r9481; +} +{ +add.f16x2 r9539, r9530, r9536; +} +{ +sub.f16x2 %14, r9524, r9539; +} +{ +add.f16x2 r9545, r9486, r9487; +} +{ +mul.f16x2 r9548, r9545, r9475; +} +{ +add.f16x2 r9551, r9489, r9548; +} +{ +add.f16x2 r9554, r9492, r9493; +} +{ +mul.f16x2 r9557, r9554, r9479; +} +{ +add.f16x2 r9560, r9551, r9557; +} +{ +sub.f16x2 r9563, r9498, r9499; +} +{ +mul.f16x2 r9566, r9563, r9477; +} +{ +sub.f16x2 r9569, r9504, r9505; +} +{ +mul.f16x2 r9572, r9569, r9481; +} +{ +add.f16x2 r9575, r9566, r9572; +} +{ +add.f16x2 %44, r9560, r9575; +} +{ +add.f16x2 r9581, r9486, r9487; +} +{ +mul.f16x2 r9584, r9581, r9479; +} +{ +add.f16x2 r9587, r9489, r9584; +} +{ +add.f16x2 r9590, r9492, r9493; +} +{ +mul.f16x2 r9593, r9590, r9483; +} +{ +add.f16x2 r9596, r9587, r9593; +} +{ +sub.f16x2 r9599, r9498, r9499; +} +{ +mul.f16x2 r9602, r9599, r9481; +} +{ +sub.f16x2 r9605, r9504, r9505; +} +{ +mul.f16x2 r9608, r9605, r9484; +} +{ +add.f16x2 r9611, r9602, r9608; +} +{ +sub.f16x2 %24, r9596, r9611; +} +{ +add.f16x2 r9617, r9486, r9487; +} +{ +mul.f16x2 r9620, r9617, r9479; +} +{ +add.f16x2 r9623, r9489, r9620; +} +{ +add.f16x2 r9626, r9492, r9493; +} +{ +mul.f16x2 r9629, r9626, r9483; +} +{ +add.f16x2 r9632, r9623, r9629; +} +{ +sub.f16x2 r9635, r9498, r9499; +} +{ +mul.f16x2 r9638, r9635, r9481; +} +{ +sub.f16x2 r9641, r9504, r9505; +} +{ +mul.f16x2 r9644, r9641, r9484; +} +{ +add.f16x2 r9647, r9638, r9644; +} +{ +add.f16x2 %34, r9632, r9647; +} +{ +add.f16x2 r9653, r9498, r9499; +} +{ +mul.f16x2 r9656, r9653, r9475; +} +{ +add.f16x2 r9659, r9501, r9656; +} +{ +add.f16x2 r9662, r9504, r9505; +} +{ +mul.f16x2 r9665, r9662, r9479; +} +{ +add.f16x2 r9668, r9659, r9665; +} +{ +sub.f16x2 r9671, r9486, r9487; +} +{ +mul.f16x2 r9674, r9671, r9477; +} +{ +sub.f16x2 r9677, r9492, r9493; +} +{ +mul.f16x2 r9680, r9677, r9481; +} +{ +add.f16x2 r9683, r9674, r9680; +} +{ +add.f16x2 %15, r9668, r9683; +} +{ +add.f16x2 r9689, r9498, r9499; +} +{ +mul.f16x2 r9692, r9689, r9475; +} +{ +add.f16x2 
r9695, r9501, r9692; +} +{ +add.f16x2 r9698, r9504, r9505; +} +{ +mul.f16x2 r9701, r9698, r9479; +} +{ +add.f16x2 r9704, r9695, r9701; +} +{ +sub.f16x2 r9707, r9486, r9487; +} +{ +mul.f16x2 r9710, r9707, r9477; +} +{ +sub.f16x2 r9713, r9492, r9493; +} +{ +mul.f16x2 r9716, r9713, r9481; +} +{ +add.f16x2 r9719, r9710, r9716; +} +{ +sub.f16x2 %45, r9704, r9719; +} +{ +add.f16x2 r9725, r9498, r9499; +} +{ +mul.f16x2 r9728, r9725, r9479; +} +{ +add.f16x2 r9731, r9501, r9728; +} +{ +add.f16x2 r9734, r9504, r9505; +} +{ +mul.f16x2 r9737, r9734, r9483; +} +{ +add.f16x2 r9740, r9731, r9737; +} +{ +sub.f16x2 r9743, r9486, r9487; +} +{ +mul.f16x2 r9746, r9743, r9481; +} +{ +sub.f16x2 r9749, r9492, r9493; +} +{ +mul.f16x2 r9752, r9749, r9484; +} +{ +add.f16x2 r9755, r9746, r9752; +} +{ +add.f16x2 %25, r9740, r9755; +} +{ +add.f16x2 r9761, r9498, r9499; +} +{ +mul.f16x2 r9764, r9761, r9479; +} +{ +add.f16x2 r9767, r9501, r9764; +} +{ +add.f16x2 r9770, r9504, r9505; +} +{ +mul.f16x2 r9773, r9770, r9483; +} +{ +add.f16x2 r9776, r9767, r9773; +} +{ +sub.f16x2 r9779, r9486, r9487; +} +{ +mul.f16x2 r9782, r9779, r9481; +} +{ +sub.f16x2 r9785, r9492, r9493; +} +{ +mul.f16x2 r9788, r9785, r9484; +} +{ +add.f16x2 r9791, r9782, r9788; +} +{ +sub.f16x2 %35, r9776, r9791; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9797, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9798, {low, high}; +} +{ +neg.f16x2 r9799, r9798; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r9801, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r9802, {low, high}; +} +{ +neg.f16x2 r9803, r9802; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r9805, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r9806, {low, 
high}; +} +{ +add.f16x2 r9807, r9808, r9809; +} +{ +add.f16x2 r9810, r9811, r9807; +} +{ +add.f16x2 r9813, r9814, r9815; +} +{ +add.f16x2 %6, r9810, r9813; +} +{ +add.f16x2 r9819, r9820, r9821; +} +{ +add.f16x2 r9822, r9823, r9819; +} +{ +add.f16x2 r9825, r9826, r9827; +} +{ +add.f16x2 %7, r9822, r9825; +} +{ +add.f16x2 r9831, r9808, r9809; +} +{ +mul.f16x2 r9834, r9831, r9797; +} +{ +add.f16x2 r9837, r9811, r9834; +} +{ +add.f16x2 r9840, r9814, r9815; +} +{ +mul.f16x2 r9843, r9840, r9801; +} +{ +add.f16x2 r9846, r9837, r9843; +} +{ +sub.f16x2 r9849, r9820, r9821; +} +{ +mul.f16x2 r9852, r9849, r9799; +} +{ +sub.f16x2 r9855, r9826, r9827; +} +{ +mul.f16x2 r9858, r9855, r9803; +} +{ +add.f16x2 r9861, r9852, r9858; +} +{ +sub.f16x2 %16, r9846, r9861; +} +{ +add.f16x2 r9867, r9808, r9809; +} +{ +mul.f16x2 r9870, r9867, r9797; +} +{ +add.f16x2 r9873, r9811, r9870; +} +{ +add.f16x2 r9876, r9814, r9815; +} +{ +mul.f16x2 r9879, r9876, r9801; +} +{ +add.f16x2 r9882, r9873, r9879; +} +{ +sub.f16x2 r9885, r9820, r9821; +} +{ +mul.f16x2 r9888, r9885, r9799; +} +{ +sub.f16x2 r9891, r9826, r9827; +} +{ +mul.f16x2 r9894, r9891, r9803; +} +{ +add.f16x2 r9897, r9888, r9894; +} +{ +add.f16x2 %46, r9882, r9897; +} +{ +add.f16x2 r9903, r9808, r9809; +} +{ +mul.f16x2 r9906, r9903, r9801; +} +{ +add.f16x2 r9909, r9811, r9906; +} +{ +add.f16x2 r9912, r9814, r9815; +} +{ +mul.f16x2 r9915, r9912, r9805; +} +{ +add.f16x2 r9918, r9909, r9915; +} +{ +sub.f16x2 r9921, r9820, r9821; +} +{ +mul.f16x2 r9924, r9921, r9803; +} +{ +sub.f16x2 r9927, r9826, r9827; +} +{ +mul.f16x2 r9930, r9927, r9806; +} +{ +add.f16x2 r9933, r9924, r9930; +} +{ +sub.f16x2 %26, r9918, r9933; +} +{ +add.f16x2 r9939, r9808, r9809; +} +{ +mul.f16x2 r9942, r9939, r9801; +} +{ +add.f16x2 r9945, r9811, r9942; +} +{ +add.f16x2 r9948, r9814, r9815; +} +{ +mul.f16x2 r9951, r9948, r9805; +} +{ +add.f16x2 r9954, r9945, r9951; +} +{ +sub.f16x2 r9957, r9820, r9821; +} +{ +mul.f16x2 r9960, r9957, r9803; +} +{ +sub.f16x2 r9963, 
r9826, r9827; +} +{ +mul.f16x2 r9966, r9963, r9806; +} +{ +add.f16x2 r9969, r9960, r9966; +} +{ +add.f16x2 %36, r9954, r9969; +} +{ +add.f16x2 r9975, r9820, r9821; +} +{ +mul.f16x2 r9978, r9975, r9797; +} +{ +add.f16x2 r9981, r9823, r9978; +} +{ +add.f16x2 r9984, r9826, r9827; +} +{ +mul.f16x2 r9987, r9984, r9801; +} +{ +add.f16x2 r9990, r9981, r9987; +} +{ +sub.f16x2 r9993, r9808, r9809; +} +{ +mul.f16x2 r9996, r9993, r9799; +} +{ +sub.f16x2 r9999, r9814, r9815; +} +{ +mul.f16x2 r10002, r9999, r9803; +} +{ +add.f16x2 r10005, r9996, r10002; +} +{ +add.f16x2 %17, r9990, r10005; +} +{ +add.f16x2 r10011, r9820, r9821; +} +{ +mul.f16x2 r10014, r10011, r9797; +} +{ +add.f16x2 r10017, r9823, r10014; +} +{ +add.f16x2 r10020, r9826, r9827; +} +{ +mul.f16x2 r10023, r10020, r9801; +} +{ +add.f16x2 r10026, r10017, r10023; +} +{ +sub.f16x2 r10029, r9808, r9809; +} +{ +mul.f16x2 r10032, r10029, r9799; +} +{ +sub.f16x2 r10035, r9814, r9815; +} +{ +mul.f16x2 r10038, r10035, r9803; +} +{ +add.f16x2 r10041, r10032, r10038; +} +{ +sub.f16x2 %47, r10026, r10041; +} +{ +add.f16x2 r10047, r9820, r9821; +} +{ +mul.f16x2 r10050, r10047, r9801; +} +{ +add.f16x2 r10053, r9823, r10050; +} +{ +add.f16x2 r10056, r9826, r9827; +} +{ +mul.f16x2 r10059, r10056, r9805; +} +{ +add.f16x2 r10062, r10053, r10059; +} +{ +sub.f16x2 r10065, r9808, r9809; +} +{ +mul.f16x2 r10068, r10065, r9803; +} +{ +sub.f16x2 r10071, r9814, r9815; +} +{ +mul.f16x2 r10074, r10071, r9806; +} +{ +add.f16x2 r10077, r10068, r10074; +} +{ +add.f16x2 %27, r10062, r10077; +} +{ +add.f16x2 r10083, r9820, r9821; +} +{ +mul.f16x2 r10086, r10083, r9801; +} +{ +add.f16x2 r10089, r9823, r10086; +} +{ +add.f16x2 r10092, r9826, r9827; +} +{ +mul.f16x2 r10095, r10092, r9805; +} +{ +add.f16x2 r10098, r10089, r10095; +} +{ +sub.f16x2 r10101, r9808, r9809; +} +{ +mul.f16x2 r10104, r10101, r9803; +} +{ +sub.f16x2 r10107, r9814, r9815; +} +{ +mul.f16x2 r10110, r10107, r9806; +} +{ +add.f16x2 r10113, r10104, r10110; +} +{ +sub.f16x2 %37, 
r10098, r10113; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10120, {low, high}; +} +{ +neg.f16x2 r10121, r10120; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f590; +cvt.rn.f16.f32 high, f590; +mov.b32 r10123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f592; +cvt.rn.f16.f32 high, f592; +mov.b32 r10124, {low, high}; +} +{ +neg.f16x2 r10125, r10124; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r10127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r10128, {low, high}; +} +{ +add.f16x2 r10129, r10130, r10131; +} +{ +add.f16x2 r10132, r10133, r10129; +} +{ +add.f16x2 r10135, r10136, r10137; +} +{ +add.f16x2 %8, r10132, r10135; +} +{ +add.f16x2 r10141, r10142, r10143; +} +{ +add.f16x2 r10144, r10145, r10141; +} +{ +add.f16x2 r10147, r10148, r10149; +} +{ +add.f16x2 %9, r10144, r10147; +} +{ +add.f16x2 r10153, r10130, r10131; +} +{ +mul.f16x2 r10156, r10153, r10119; +} +{ +add.f16x2 r10159, r10133, r10156; +} +{ +add.f16x2 r10162, r10136, r10137; +} +{ +mul.f16x2 r10165, r10162, r10123; +} +{ +add.f16x2 r10168, r10159, r10165; +} +{ +sub.f16x2 r10171, r10142, r10143; +} +{ +mul.f16x2 r10174, r10171, r10121; +} +{ +sub.f16x2 r10177, r10148, r10149; +} +{ +mul.f16x2 r10180, r10177, r10125; +} +{ +add.f16x2 r10183, r10174, r10180; +} +{ +sub.f16x2 %18, r10168, r10183; +} +{ +add.f16x2 r10189, r10130, r10131; +} +{ +mul.f16x2 r10192, r10189, r10119; +} +{ +add.f16x2 r10195, r10133, r10192; +} +{ +add.f16x2 r10198, r10136, r10137; +} +{ +mul.f16x2 r10201, r10198, r10123; +} +{ +add.f16x2 r10204, r10195, r10201; +} +{ +sub.f16x2 r10207, r10142, r10143; +} +{ +mul.f16x2 r10210, r10207, r10121; +} +{ +sub.f16x2 r10213, r10148, r10149; +} +{ +mul.f16x2 r10216, r10213, r10125; +} +{ +add.f16x2 
r10219, r10210, r10216; +} +{ +add.f16x2 %48, r10204, r10219; +} +{ +add.f16x2 r10225, r10130, r10131; +} +{ +mul.f16x2 r10228, r10225, r10123; +} +{ +add.f16x2 r10231, r10133, r10228; +} +{ +add.f16x2 r10234, r10136, r10137; +} +{ +mul.f16x2 r10237, r10234, r10127; +} +{ +add.f16x2 r10240, r10231, r10237; +} +{ +sub.f16x2 r10243, r10142, r10143; +} +{ +mul.f16x2 r10246, r10243, r10125; +} +{ +sub.f16x2 r10249, r10148, r10149; +} +{ +mul.f16x2 r10252, r10249, r10128; +} +{ +add.f16x2 r10255, r10246, r10252; +} +{ +sub.f16x2 %28, r10240, r10255; +} +{ +add.f16x2 r10261, r10130, r10131; +} +{ +mul.f16x2 r10264, r10261, r10123; +} +{ +add.f16x2 r10267, r10133, r10264; +} +{ +add.f16x2 r10270, r10136, r10137; +} +{ +mul.f16x2 r10273, r10270, r10127; +} +{ +add.f16x2 r10276, r10267, r10273; +} +{ +sub.f16x2 r10279, r10142, r10143; +} +{ +mul.f16x2 r10282, r10279, r10125; +} +{ +sub.f16x2 r10285, r10148, r10149; +} +{ +mul.f16x2 r10288, r10285, r10128; +} +{ +add.f16x2 r10291, r10282, r10288; +} +{ +add.f16x2 %38, r10276, r10291; +} +{ +add.f16x2 r10297, r10142, r10143; +} +{ +mul.f16x2 r10300, r10297, r10119; +} +{ +add.f16x2 r10303, r10145, r10300; +} +{ +add.f16x2 r10306, r10148, r10149; +} +{ +mul.f16x2 r10309, r10306, r10123; +} +{ +add.f16x2 r10312, r10303, r10309; +} +{ +sub.f16x2 r10315, r10130, r10131; +} +{ +mul.f16x2 r10318, r10315, r10121; +} +{ +sub.f16x2 r10321, r10136, r10137; +} +{ +mul.f16x2 r10324, r10321, r10125; +} +{ +add.f16x2 r10327, r10318, r10324; +} +{ +add.f16x2 %19, r10312, r10327; +} +{ +add.f16x2 r10333, r10142, r10143; +} +{ +mul.f16x2 r10336, r10333, r10119; +} +{ +add.f16x2 r10339, r10145, r10336; +} +{ +add.f16x2 r10342, r10148, r10149; +} +{ +mul.f16x2 r10345, r10342, r10123; +} +{ +add.f16x2 r10348, r10339, r10345; +} +{ +sub.f16x2 r10351, r10130, r10131; +} +{ +mul.f16x2 r10354, r10351, r10121; +} +{ +sub.f16x2 r10357, r10136, r10137; +} +{ +mul.f16x2 r10360, r10357, r10125; +} +{ +add.f16x2 r10363, r10354, r10360; +} +{ +sub.f16x2 
%49, r10348, r10363; +} +{ +add.f16x2 r10369, r10142, r10143; +} +{ +mul.f16x2 r10372, r10369, r10123; +} +{ +add.f16x2 r10375, r10145, r10372; +} +{ +add.f16x2 r10378, r10148, r10149; +} +{ +mul.f16x2 r10381, r10378, r10127; +} +{ +add.f16x2 r10384, r10375, r10381; +} +{ +sub.f16x2 r10387, r10130, r10131; +} +{ +mul.f16x2 r10390, r10387, r10125; +} +{ +sub.f16x2 r10393, r10136, r10137; +} +{ +mul.f16x2 r10396, r10393, r10128; +} +{ +add.f16x2 r10399, r10390, r10396; +} +{ +add.f16x2 %29, r10384, r10399; +} +{ +add.f16x2 r10405, r10142, r10143; +} +{ +mul.f16x2 r10408, r10405, r10123; +} +{ +add.f16x2 r10411, r10145, r10408; +} +{ +add.f16x2 r10414, r10148, r10149; +} +{ +mul.f16x2 r10417, r10414, r10127; +} +{ +add.f16x2 r10420, r10411, r10417; +} +{ +sub.f16x2 r10423, r10130, r10131; +} +{ +mul.f16x2 r10426, r10423, r10125; +} +{ +sub.f16x2 r10429, r10136, r10137; +} +{ +mul.f16x2 r10432, r10429, r10128; +} +{ +add.f16x2 r10435, r10426, r10432; +} +{ +sub.f16x2 %39, r10420, r10435; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), 
"=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), 
"r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1116, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<121>; +.reg .b32 r<2244>; +.reg .b64 rd<10>; +mov.u32 r2215, %tid.x; +mov.f32 f106, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1, {low, high}; +} +mov.f32 f108, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f102, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r5, {low, high}; +} +mov.f32 f104, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ 
+mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ 
+mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r2215, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r2216, rd3; +mul.lo.s32 r2217, r2216, 625; +sub.s32 r2218, r2215, r2217; +cvt.rn.f32.u32 f109, r2218; +mul.f32 f110, f109, 0f3B03C498; +cos.approx.f32 f13, f110; +sin.approx.f32 f111, f110; +neg.f32 f14, f111; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +mov.u32 r2219, %tid.y; +mov.u32 r2220, %10; +mad.lo.s32 r2221, r2219, 25000, r2220; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f93, 0fBF800000; +mov.f32 f94, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ 
+fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +mad.lo.s32 r2222, r2216, 25000, r2221; +barrier.sync 0; +mad.lo.s32 r2223, r2218, 40, r2222; +st.shared.v2.f32 [r2223], {r20, r32}; +st.shared.v2.f32 [r2223+8], {r333, r342}; +st.shared.v2.f32 [r2223+16], {r370, r379}; +st.shared.v2.f32 [r2223+24], {r407, r416}; +st.shared.v2.f32 [r2223+32], {r444, r453}; +barrier.sync 0; +shl.b32 r2224, r2218, 5; +sub.s32 r2225, r2223, r2224; +ld.shared.u32 r488, [r2225]; +ld.shared.u32 r500, [r2225+4]; +ld.shared.u32 r485, [r2225+5000]; +ld.shared.u32 r497, [r2225+5004]; +ld.shared.u32 r491, [r2225+10000]; +ld.shared.u32 r503, [r2225+10004]; +ld.shared.u32 r492, [r2225+15000]; +ld.shared.u32 r504, [r2225+15004]; +ld.shared.u32 r486, [r2225+20000]; +ld.shared.u32 r498, [r2225+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ 
+add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ 
+add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r2218, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r2226, rd5; +cvt.rn.f32.u32 f112, r2226; +mul.f32 f113, f112, 0f3C24B5BE; +cos.approx.f32 f37, f113; +sin.approx.f32 f114, f113; +neg.f32 f38, f114; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +mul.lo.s32 r2227, r2226, 5; +sub.s32 r2228, r2218, r2227; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ +fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} +{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, 
r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +shl.b32 r2229, r2228, 3; +add.s32 r2230, r2222, r2229; +barrier.sync 0; +mad.lo.s32 r2231, r2226, 200, r2230; +st.shared.u32 [r2231], r493; +st.shared.u32 [r2231+4], r505; +st.shared.u32 [r2231+40], r806; +st.shared.u32 [r2231+44], r815; +st.shared.u32 [r2231+80], r843; +st.shared.u32 [r2231+84], r852; +st.shared.u32 [r2231+120], r880; +st.shared.u32 [r2231+124], r889; +st.shared.u32 [r2231+160], r917; +st.shared.u32 [r2231+164], r926; +barrier.sync 0; +ld.shared.u32 r961, [r2225]; +ld.shared.u32 r973, [r2225+4]; +ld.shared.u32 r958, [r2225+5000]; +ld.shared.u32 r970, [r2225+5004]; +ld.shared.u32 r964, [r2225+10000]; +ld.shared.u32 r976, [r2225+10004]; +ld.shared.u32 r965, [r2225+15000]; +ld.shared.u32 r977, [r2225+15004]; +ld.shared.u32 r959, [r2225+20000]; +ld.shared.u32 r971, [r2225+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; 
+mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 r1014, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ +mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 r1086, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ 
+add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 r1122, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r1140, r1155; +} +{ +add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; +} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 r1194, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 r1230, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 r1266, r1248, r1263; +} +mul.wide.u32 rd6, r2218, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r2232, rd7; +cvt.rn.f32.u32 f115, r2232; +mul.f32 f116, f115, 0f3D4DE32E; 
+cos.approx.f32 f61, f116; +sin.approx.f32 f117, f116; +neg.f32 f62, f117; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1269, {low, high}; +} +mul.lo.s32 r2233, r2232, 25; +sub.s32 r2234, r2218, r2233; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1272, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1274, {high, high}; +} +{ +mul.f16x2 r1276, r1158, r1274; +} +{ +fma.rn.f16x2 r1279, r1014, r1272, r1276; +} +{ +mul.f16x2 r1283, r1014, r1274; +} +{ +neg.f16x2 r1286, r1283; +} +{ +fma.rn.f16x2 r1288, r1158, r1272, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1294, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1296, {low, high}; +} +{ +mul.f16x2 r1297, r1294, r1296; +} +{ +mul.f16x2 r1300, r1269, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1303, {high, low}; +} +{ +fma.rn.f16x2 r1305, r1297, r1303, r1300; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1309, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1311, {high, high}; +} +{ +mul.f16x2 r1313, r1230, r1311; +} +{ +fma.rn.f16x2 r1316, r1086, r1309, r1313; +} +{ +mul.f16x2 r1320, r1086, r1311; +} +{ +neg.f16x2 r1323, r1320; +} +{ +fma.rn.f16x2 r1325, r1230, r1309, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1329, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1331, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1331, r1333; +} +{ +mul.f16x2 r1337, r1305, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1340, {high, low}; +} +{ +fma.rn.f16x2 r1342, r1334, r1340, r1337; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1342; +mov.b32 r1346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1348, {high, high}; +} +{ +mul.f16x2 r1350, r1266, r1348; +} +{ +fma.rn.f16x2 r1353, r1122, r1346, r1350; +} +{ +mul.f16x2 r1357, r1122, r1348; +} +{ +neg.f16x2 r1360, r1357; +} +{ +fma.rn.f16x2 r1362, r1266, r1346, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1366, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1368, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1368, r1370; +} +{ +mul.f16x2 r1374, r1342, r1366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1377, {high, low}; +} +{ +fma.rn.f16x2 r1379, r1371, r1377, r1374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1385, {high, high}; +} +{ +mul.f16x2 r1387, r1194, r1385; +} +{ +fma.rn.f16x2 r1390, r1050, r1383, r1387; +} +{ +mul.f16x2 r1394, r1050, r1385; +} +{ +neg.f16x2 r1397, r1394; +} +{ +fma.rn.f16x2 r1399, r1194, r1383, r1397; +} +shl.b32 r2235, r2234, 3; +add.s32 r2236, r2222, r2235; +barrier.sync 0; +mad.lo.s32 r2237, r2232, 1000, r2236; +st.shared.u32 [r2237], r966; +st.shared.u32 [r2237+4], r978; +st.shared.u32 [r2237+200], r1279; +st.shared.u32 [r2237+204], r1288; +st.shared.u32 [r2237+400], r1316; +st.shared.u32 [r2237+404], r1325; +st.shared.u32 [r2237+600], r1353; +st.shared.u32 [r2237+604], r1362; +st.shared.u32 [r2237+800], r1390; +st.shared.u32 [r2237+804], r1399; +barrier.sync 0; +ld.shared.u32 r1434, [r2225]; +ld.shared.u32 r1446, [r2225+4]; +ld.shared.u32 r1431, [r2225+5000]; +ld.shared.u32 r1443, [r2225+5004]; +ld.shared.u32 r1437, [r2225+10000]; +ld.shared.u32 r1449, [r2225+10004]; +ld.shared.u32 r1438, [r2225+15000]; +ld.shared.u32 r1450, [r2225+15004]; +ld.shared.u32 r1432, 
[r2225+20000]; +ld.shared.u32 r1444, [r2225+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1421, {low, high}; +} +{ +neg.f16x2 r1422, r1421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1425, {low, high}; +} +{ +neg.f16x2 r1426, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1429, {low, high}; +} +{ +add.f16x2 r1430, r1431, r1432; +} +{ +add.f16x2 r1433, r1434, r1430; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +add.f16x2 r1439, r1433, r1436; +} +{ +add.f16x2 r1442, r1443, r1444; +} +{ +add.f16x2 r1445, r1446, r1442; +} +{ +add.f16x2 r1448, r1449, r1450; +} +{ +add.f16x2 r1451, r1445, r1448; +} +{ +add.f16x2 r1454, r1431, r1432; +} +{ +mul.f16x2 r1457, r1454, r1420; +} +{ +add.f16x2 r1460, r1434, r1457; +} +{ +add.f16x2 r1463, r1437, r1438; +} +{ +mul.f16x2 r1466, r1463, r1424; +} +{ +add.f16x2 r1469, r1460, r1466; +} +{ +sub.f16x2 r1472, r1443, r1444; +} +{ +mul.f16x2 r1475, r1472, r1422; +} +{ +sub.f16x2 r1478, r1449, r1450; +} +{ +mul.f16x2 r1481, r1478, r1426; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r1469, r1484; +} +{ +add.f16x2 r1490, r1431, r1432; +} +{ +mul.f16x2 r1493, r1490, r1420; +} +{ +add.f16x2 r1496, r1434, r1493; +} +{ +add.f16x2 r1499, r1437, r1438; +} +{ +mul.f16x2 r1502, r1499, r1424; +} +{ +add.f16x2 r1505, r1496, r1502; +} +{ +sub.f16x2 r1508, r1443, r1444; +} +{ +mul.f16x2 r1511, r1508, r1422; +} +{ +sub.f16x2 r1514, r1449, r1450; +} +{ +mul.f16x2 r1517, r1514, r1426; +} +{ +add.f16x2 r1520, r1511, r1517; +} +{ +add.f16x2 r1523, r1505, r1520; 
+} +{ +add.f16x2 r1526, r1431, r1432; +} +{ +mul.f16x2 r1529, r1526, r1424; +} +{ +add.f16x2 r1532, r1434, r1529; +} +{ +add.f16x2 r1535, r1437, r1438; +} +{ +mul.f16x2 r1538, r1535, r1428; +} +{ +add.f16x2 r1541, r1532, r1538; +} +{ +sub.f16x2 r1544, r1443, r1444; +} +{ +mul.f16x2 r1547, r1544, r1426; +} +{ +sub.f16x2 r1550, r1449, r1450; +} +{ +mul.f16x2 r1553, r1550, r1429; +} +{ +add.f16x2 r1556, r1547, r1553; +} +{ +sub.f16x2 r1559, r1541, r1556; +} +{ +add.f16x2 r1562, r1431, r1432; +} +{ +mul.f16x2 r1565, r1562, r1424; +} +{ +add.f16x2 r1568, r1434, r1565; +} +{ +add.f16x2 r1571, r1437, r1438; +} +{ +mul.f16x2 r1574, r1571, r1428; +} +{ +add.f16x2 r1577, r1568, r1574; +} +{ +sub.f16x2 r1580, r1443, r1444; +} +{ +mul.f16x2 r1583, r1580, r1426; +} +{ +sub.f16x2 r1586, r1449, r1450; +} +{ +mul.f16x2 r1589, r1586, r1429; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1577, r1592; +} +{ +add.f16x2 r1598, r1443, r1444; +} +{ +mul.f16x2 r1601, r1598, r1420; +} +{ +add.f16x2 r1604, r1446, r1601; +} +{ +add.f16x2 r1607, r1449, r1450; +} +{ +mul.f16x2 r1610, r1607, r1424; +} +{ +add.f16x2 r1613, r1604, r1610; +} +{ +sub.f16x2 r1616, r1431, r1432; +} +{ +mul.f16x2 r1619, r1616, r1422; +} +{ +sub.f16x2 r1622, r1437, r1438; +} +{ +mul.f16x2 r1625, r1622, r1426; +} +{ +add.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1613, r1628; +} +{ +add.f16x2 r1634, r1443, r1444; +} +{ +mul.f16x2 r1637, r1634, r1420; +} +{ +add.f16x2 r1640, r1446, r1637; +} +{ +add.f16x2 r1643, r1449, r1450; +} +{ +mul.f16x2 r1646, r1643, r1424; +} +{ +add.f16x2 r1649, r1640, r1646; +} +{ +sub.f16x2 r1652, r1431, r1432; +} +{ +mul.f16x2 r1655, r1652, r1422; +} +{ +sub.f16x2 r1658, r1437, r1438; +} +{ +mul.f16x2 r1661, r1658, r1426; +} +{ +add.f16x2 r1664, r1655, r1661; +} +{ +sub.f16x2 r1667, r1649, r1664; +} +{ +add.f16x2 r1670, r1443, r1444; +} +{ +mul.f16x2 r1673, r1670, r1424; +} +{ +add.f16x2 r1676, r1446, r1673; +} +{ +add.f16x2 r1679, r1449, r1450; +} +{ +mul.f16x2 r1682, 
r1679, r1428; +} +{ +add.f16x2 r1685, r1676, r1682; +} +{ +sub.f16x2 r1688, r1431, r1432; +} +{ +mul.f16x2 r1691, r1688, r1426; +} +{ +sub.f16x2 r1694, r1437, r1438; +} +{ +mul.f16x2 r1697, r1694, r1429; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +add.f16x2 r1703, r1685, r1700; +} +{ +add.f16x2 r1706, r1443, r1444; +} +{ +mul.f16x2 r1709, r1706, r1424; +} +{ +add.f16x2 r1712, r1446, r1709; +} +{ +add.f16x2 r1715, r1449, r1450; +} +{ +mul.f16x2 r1718, r1715, r1428; +} +{ +add.f16x2 r1721, r1712, r1718; +} +{ +sub.f16x2 r1724, r1431, r1432; +} +{ +mul.f16x2 r1727, r1724, r1426; +} +{ +sub.f16x2 r1730, r1437, r1438; +} +{ +mul.f16x2 r1733, r1730, r1429; +} +{ +add.f16x2 r1736, r1727, r1733; +} +{ +sub.f16x2 r1739, r1721, r1736; +} +mul.wide.u32 rd8, r2218, 274877907; +shr.u64 rd9, rd8, 35; +cvt.u32.u64 r2238, rd9; +cvt.rn.f32.u32 f118, r2238; +mul.f32 f119, f118, 0f3E80ADFD; +cos.approx.f32 f85, f119; +sin.approx.f32 f120, f119; +neg.f32 f86, f120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f86; +mov.b32 r1742, {low, high}; +} +mul.lo.s32 r2239, r2238, 125; +sub.s32 r2240, r2218, r2239; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1747, {high, high}; +} +{ +mul.f16x2 r1749, r1631, r1747; +} +{ +fma.rn.f16x2 r1752, r1487, r1745, r1749; +} +{ +mul.f16x2 r1756, r1487, r1747; +} +{ +neg.f16x2 r1759, r1756; +} +{ +fma.rn.f16x2 r1761, r1631, r1745, r1759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1765, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1767, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 r1770, r1767, r1769; +} +{ +mul.f16x2 r1773, r1742, r1765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1776, {high, low}; +} +{ +fma.rn.f16x2 r1778, r1770, r1776, r1773; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1784, {high, high}; +} +{ +mul.f16x2 r1786, r1703, r1784; +} +{ +fma.rn.f16x2 r1789, r1559, r1782, r1786; +} +{ +mul.f16x2 r1793, r1559, r1784; +} +{ +neg.f16x2 r1796, r1793; +} +{ +fma.rn.f16x2 r1798, r1703, r1782, r1796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1802, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1804, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1806, {low, high}; +} +{ +mul.f16x2 r1807, r1804, r1806; +} +{ +mul.f16x2 r1810, r1778, r1802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1813, {high, low}; +} +{ +fma.rn.f16x2 r1815, r1807, r1813, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1821, {high, high}; +} +{ +mul.f16x2 r1823, r1739, r1821; +} +{ +fma.rn.f16x2 r1826, r1595, r1819, r1823; +} +{ +mul.f16x2 r1830, r1595, r1821; +} +{ +neg.f16x2 r1833, r1830; +} +{ +fma.rn.f16x2 r1835, r1739, r1819, r1833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1839, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1841, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1843, {low, high}; +} +{ +mul.f16x2 r1844, r1841, r1843; +} +{ +mul.f16x2 r1847, r1815, r1839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1850, {high, low}; +} +{ +fma.rn.f16x2 r1852, r1844, r1850, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1852; +mov.b32 r1856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1852; +mov.b32 r1858, {high, high}; +} +{ +mul.f16x2 r1860, r1667, r1858; +} +{ +fma.rn.f16x2 r1863, r1523, r1856, r1860; +} +{ 
+mul.f16x2 r1867, r1523, r1858; +} +{ +neg.f16x2 r1870, r1867; +} +{ +fma.rn.f16x2 r1872, r1667, r1856, r1870; +} +shl.b32 r2241, r2240, 3; +add.s32 r2242, r2222, r2241; +barrier.sync 0; +mad.lo.s32 r2243, r2238, 5000, r2242; +st.shared.u32 [r2243], r1439; +st.shared.u32 [r2243+4], r1451; +st.shared.u32 [r2243+1000], r1752; +st.shared.u32 [r2243+1004], r1761; +st.shared.u32 [r2243+2000], r1789; +st.shared.u32 [r2243+2004], r1798; +st.shared.u32 [r2243+3000], r1826; +st.shared.u32 [r2243+3004], r1835; +st.shared.u32 [r2243+4000], r1863; +st.shared.u32 [r2243+4004], r1872; +barrier.sync 0; +ld.shared.u32 r1907, [r2225]; +ld.shared.u32 r1919, [r2225+4]; +ld.shared.u32 r1904, [r2225+5000]; +ld.shared.u32 r1916, [r2225+5004]; +ld.shared.u32 r1910, [r2225+10000]; +ld.shared.u32 r1922, [r2225+10004]; +ld.shared.u32 r1911, [r2225+15000]; +ld.shared.u32 r1923, [r2225+15004]; +ld.shared.u32 r1905, [r2225+20000]; +ld.shared.u32 r1917, [r2225+20004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1894, {low, high}; +} +{ +neg.f16x2 r1895, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1897, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1898, {low, high}; +} +{ +neg.f16x2 r1899, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1901, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1902, {low, high}; +} +{ +add.f16x2 r1903, r1904, r1905; +} +{ +add.f16x2 r1906, r1907, r1903; +} +{ +add.f16x2 r1909, r1910, r1911; +} +{ +add.f16x2 %0, r1906, r1909; +} +{ +add.f16x2 r1915, r1916, r1917; +} +{ +add.f16x2 r1918, r1919, r1915; +} +{ +add.f16x2 r1921, r1922, r1923; +} +{ +add.f16x2 %1, r1918, r1921; +} +{ +add.f16x2 
r1927, r1904, r1905; +} +{ +mul.f16x2 r1930, r1927, r1893; +} +{ +add.f16x2 r1933, r1907, r1930; +} +{ +add.f16x2 r1936, r1910, r1911; +} +{ +mul.f16x2 r1939, r1936, r1897; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1916, r1917; +} +{ +mul.f16x2 r1948, r1945, r1895; +} +{ +sub.f16x2 r1951, r1922, r1923; +} +{ +mul.f16x2 r1954, r1951, r1899; +} +{ +add.f16x2 r1957, r1948, r1954; +} +{ +sub.f16x2 %2, r1942, r1957; +} +{ +add.f16x2 r1963, r1904, r1905; +} +{ +mul.f16x2 r1966, r1963, r1893; +} +{ +add.f16x2 r1969, r1907, r1966; +} +{ +add.f16x2 r1972, r1910, r1911; +} +{ +mul.f16x2 r1975, r1972, r1897; +} +{ +add.f16x2 r1978, r1969, r1975; +} +{ +sub.f16x2 r1981, r1916, r1917; +} +{ +mul.f16x2 r1984, r1981, r1895; +} +{ +sub.f16x2 r1987, r1922, r1923; +} +{ +mul.f16x2 r1990, r1987, r1899; +} +{ +add.f16x2 r1993, r1984, r1990; +} +{ +add.f16x2 %8, r1978, r1993; +} +{ +add.f16x2 r1999, r1904, r1905; +} +{ +mul.f16x2 r2002, r1999, r1897; +} +{ +add.f16x2 r2005, r1907, r2002; +} +{ +add.f16x2 r2008, r1910, r1911; +} +{ +mul.f16x2 r2011, r2008, r1901; +} +{ +add.f16x2 r2014, r2005, r2011; +} +{ +sub.f16x2 r2017, r1916, r1917; +} +{ +mul.f16x2 r2020, r2017, r1899; +} +{ +sub.f16x2 r2023, r1922, r1923; +} +{ +mul.f16x2 r2026, r2023, r1902; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 %4, r2014, r2029; +} +{ +add.f16x2 r2035, r1904, r1905; +} +{ +mul.f16x2 r2038, r2035, r1897; +} +{ +add.f16x2 r2041, r1907, r2038; +} +{ +add.f16x2 r2044, r1910, r1911; +} +{ +mul.f16x2 r2047, r2044, r1901; +} +{ +add.f16x2 r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1916, r1917; +} +{ +mul.f16x2 r2056, r2053, r1899; +} +{ +sub.f16x2 r2059, r1922, r1923; +} +{ +mul.f16x2 r2062, r2059, r1902; +} +{ +add.f16x2 r2065, r2056, r2062; +} +{ +add.f16x2 %6, r2050, r2065; +} +{ +add.f16x2 r2071, r1916, r1917; +} +{ +mul.f16x2 r2074, r2071, r1893; +} +{ +add.f16x2 r2077, r1919, r2074; +} +{ +add.f16x2 r2080, r1922, r1923; +} +{ +mul.f16x2 r2083, r2080, r1897; +} +{ 
+add.f16x2 r2086, r2077, r2083; +} +{ +sub.f16x2 r2089, r1904, r1905; +} +{ +mul.f16x2 r2092, r2089, r1895; +} +{ +sub.f16x2 r2095, r1910, r1911; +} +{ +mul.f16x2 r2098, r2095, r1899; +} +{ +add.f16x2 r2101, r2092, r2098; +} +{ +add.f16x2 %3, r2086, r2101; +} +{ +add.f16x2 r2107, r1916, r1917; +} +{ +mul.f16x2 r2110, r2107, r1893; +} +{ +add.f16x2 r2113, r1919, r2110; +} +{ +add.f16x2 r2116, r1922, r1923; +} +{ +mul.f16x2 r2119, r2116, r1897; +} +{ +add.f16x2 r2122, r2113, r2119; +} +{ +sub.f16x2 r2125, r1904, r1905; +} +{ +mul.f16x2 r2128, r2125, r1895; +} +{ +sub.f16x2 r2131, r1910, r1911; +} +{ +mul.f16x2 r2134, r2131, r1899; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 %9, r2122, r2137; +} +{ +add.f16x2 r2143, r1916, r1917; +} +{ +mul.f16x2 r2146, r2143, r1897; +} +{ +add.f16x2 r2149, r1919, r2146; +} +{ +add.f16x2 r2152, r1922, r1923; +} +{ +mul.f16x2 r2155, r2152, r1901; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1904, r1905; +} +{ +mul.f16x2 r2164, r2161, r1899; +} +{ +sub.f16x2 r2167, r1910, r1911; +} +{ +mul.f16x2 r2170, r2167, r1902; +} +{ +add.f16x2 r2173, r2164, r2170; +} +{ +add.f16x2 %5, r2158, r2173; +} +{ +add.f16x2 r2179, r1916, r1917; +} +{ +mul.f16x2 r2182, r2179, r1897; +} +{ +add.f16x2 r2185, r1919, r2182; +} +{ +add.f16x2 r2188, r1922, r1923; +} +{ +mul.f16x2 r2191, r2188, r1901; +} +{ +add.f16x2 r2194, r2185, r2191; +} +{ +sub.f16x2 r2197, r1904, r1905; +} +{ +mul.f16x2 r2200, r2197, r1899; +} +{ +sub.f16x2 r2203, r1910, r1911; +} +{ +mul.f16x2 r2206, r2203, r1902; +} +{ +add.f16x2 r2209, r2200, r2206; +} +{ +sub.f16x2 %7, r2194, r2209; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1117, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<121>; +.reg .b32 r<2244>; +.reg .b64 rd<10>; +mov.u32 r2215, %tid.y; +mov.u32 r2216, %10; +mad.lo.s32 r2217, r2215, 12500, r2216; +mov.u32 r2218, %tid.x; +mov.f32 f106, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1, {low, high}; +} +mov.f32 f108, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f102, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r5, {low, high}; +} +mov.f32 f104, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, 
r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; 
+} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r2218, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r2219, rd3; +mul.lo.s32 r2220, r2219, 625; +sub.s32 r2221, r2218, r2220; +cvt.rn.f32.u32 f109, r2221; +mul.f32 f110, f109, 0f3B03C498; +cos.approx.f32 f13, f110; +sin.approx.f32 f111, f110; +neg.f32 f14, f111; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f93, 0fBF800000; +mov.f32 f94, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ 
+mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ 
+mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +mad.lo.s32 r2222, r2219, 12500, r2217; +barrier.sync 0; +mad.lo.s32 r2223, r2221, 20, r2222; +st.shared.u32 [r2223], r20; +st.shared.u32 [r2223+4], r333; +st.shared.u32 [r2223+8], r370; +st.shared.u32 [r2223+12], r407; +st.shared.u32 [r2223+16], r444; +barrier.sync 0; +shl.b32 r2224, r2221, 4; +sub.s32 r2225, r2223, r2224; +ld.shared.u32 r488, [r2225]; +ld.shared.u32 r485, [r2225+2500]; +ld.shared.u32 r491, [r2225+5000]; +ld.shared.u32 r492, [r2225+7500]; +ld.shared.u32 r486, [r2225+10000]; +barrier.sync 0; +st.shared.u32 [r2223], r32; +st.shared.u32 [r2223+4], r342; +st.shared.u32 [r2223+8], r379; +st.shared.u32 [r2223+12], r416; +st.shared.u32 [r2223+16], r453; +barrier.sync 0; +ld.shared.u32 r500, [r2225]; +ld.shared.u32 r497, [r2225+2500]; +ld.shared.u32 r503, [r2225+5000]; +ld.shared.u32 r504, [r2225+7500]; +ld.shared.u32 r498, [r2225+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, 
r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, 
r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r2221, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r2226, rd5; +cvt.rn.f32.u32 f112, r2226; +mul.f32 f113, f112, 0f3C24B5BE; +cos.approx.f32 f37, f113; +sin.approx.f32 f114, f113; +neg.f32 f38, f114; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +mul.lo.s32 r2227, r2226, 5; +sub.s32 r2228, r2221, r2227; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, 
r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ +fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} +{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +shl.b32 r2229, r2228, 2; +add.s32 r2230, r2222, r2229; +barrier.sync 0; +mad.lo.s32 r2231, r2226, 100, r2230; +st.shared.u32 [r2231], r493; +st.shared.u32 [r2231+20], r806; +st.shared.u32 [r2231+40], r843; +st.shared.u32 [r2231+60], r880; +st.shared.u32 [r2231+80], r917; +barrier.sync 0; +ld.shared.u32 r961, [r2225]; +ld.shared.u32 r958, [r2225+2500]; +ld.shared.u32 r964, [r2225+5000]; +ld.shared.u32 r965, [r2225+7500]; +ld.shared.u32 r959, [r2225+10000]; +barrier.sync 0; +st.shared.u32 [r2231], r505; +st.shared.u32 [r2231+20], r815; +st.shared.u32 [r2231+40], r852; +st.shared.u32 [r2231+60], r889; +st.shared.u32 [r2231+80], r926; +barrier.sync 0; +ld.shared.u32 r973, [r2225]; +ld.shared.u32 r970, [r2225+2500]; +ld.shared.u32 r976, [r2225+5000]; +ld.shared.u32 r977, [r2225+7500]; +ld.shared.u32 r971, [r2225+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 r1014, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ +mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 r1086, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 
r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ +add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 r1122, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r1140, r1155; +} +{ +add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; +} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 r1194, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 r1230, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 
r1266, r1248, r1263; +} +mul.wide.u32 rd6, r2221, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r2232, rd7; +cvt.rn.f32.u32 f115, r2232; +mul.f32 f116, f115, 0f3D4DE32E; +cos.approx.f32 f61, f116; +sin.approx.f32 f117, f116; +neg.f32 f62, f117; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1269, {low, high}; +} +mul.lo.s32 r2233, r2232, 25; +sub.s32 r2234, r2221, r2233; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1272, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1274, {high, high}; +} +{ +mul.f16x2 r1276, r1158, r1274; +} +{ +fma.rn.f16x2 r1279, r1014, r1272, r1276; +} +{ +mul.f16x2 r1283, r1014, r1274; +} +{ +neg.f16x2 r1286, r1283; +} +{ +fma.rn.f16x2 r1288, r1158, r1272, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1294, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1296, {low, high}; +} +{ +mul.f16x2 r1297, r1294, r1296; +} +{ +mul.f16x2 r1300, r1269, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1303, {high, low}; +} +{ +fma.rn.f16x2 r1305, r1297, r1303, r1300; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1309, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1311, {high, high}; +} +{ +mul.f16x2 r1313, r1230, r1311; +} +{ +fma.rn.f16x2 r1316, r1086, r1309, r1313; +} +{ +mul.f16x2 r1320, r1086, r1311; +} +{ +neg.f16x2 r1323, r1320; +} +{ +fma.rn.f16x2 r1325, r1230, r1309, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1329, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1331, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1331, r1333; +} +{ +mul.f16x2 r1337, 
r1305, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1340, {high, low}; +} +{ +fma.rn.f16x2 r1342, r1334, r1340, r1337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1348, {high, high}; +} +{ +mul.f16x2 r1350, r1266, r1348; +} +{ +fma.rn.f16x2 r1353, r1122, r1346, r1350; +} +{ +mul.f16x2 r1357, r1122, r1348; +} +{ +neg.f16x2 r1360, r1357; +} +{ +fma.rn.f16x2 r1362, r1266, r1346, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1366, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1368, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1368, r1370; +} +{ +mul.f16x2 r1374, r1342, r1366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1377, {high, low}; +} +{ +fma.rn.f16x2 r1379, r1371, r1377, r1374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1385, {high, high}; +} +{ +mul.f16x2 r1387, r1194, r1385; +} +{ +fma.rn.f16x2 r1390, r1050, r1383, r1387; +} +{ +mul.f16x2 r1394, r1050, r1385; +} +{ +neg.f16x2 r1397, r1394; +} +{ +fma.rn.f16x2 r1399, r1194, r1383, r1397; +} +shl.b32 r2235, r2234, 2; +add.s32 r2236, r2222, r2235; +barrier.sync 0; +mad.lo.s32 r2237, r2232, 500, r2236; +st.shared.u32 [r2237], r966; +st.shared.u32 [r2237+100], r1279; +st.shared.u32 [r2237+200], r1316; +st.shared.u32 [r2237+300], r1353; +st.shared.u32 [r2237+400], r1390; +barrier.sync 0; +ld.shared.u32 r1434, [r2225]; +ld.shared.u32 r1431, [r2225+2500]; +ld.shared.u32 r1437, [r2225+5000]; +ld.shared.u32 r1438, [r2225+7500]; +ld.shared.u32 r1432, [r2225+10000]; +barrier.sync 0; +st.shared.u32 [r2237], r978; +st.shared.u32 [r2237+100], r1288; +st.shared.u32 [r2237+200], r1325; +st.shared.u32 
[r2237+300], r1362; +st.shared.u32 [r2237+400], r1399; +barrier.sync 0; +ld.shared.u32 r1446, [r2225]; +ld.shared.u32 r1443, [r2225+2500]; +ld.shared.u32 r1449, [r2225+5000]; +ld.shared.u32 r1450, [r2225+7500]; +ld.shared.u32 r1444, [r2225+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1421, {low, high}; +} +{ +neg.f16x2 r1422, r1421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1425, {low, high}; +} +{ +neg.f16x2 r1426, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1429, {low, high}; +} +{ +add.f16x2 r1430, r1431, r1432; +} +{ +add.f16x2 r1433, r1434, r1430; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +add.f16x2 r1439, r1433, r1436; +} +{ +add.f16x2 r1442, r1443, r1444; +} +{ +add.f16x2 r1445, r1446, r1442; +} +{ +add.f16x2 r1448, r1449, r1450; +} +{ +add.f16x2 r1451, r1445, r1448; +} +{ +add.f16x2 r1454, r1431, r1432; +} +{ +mul.f16x2 r1457, r1454, r1420; +} +{ +add.f16x2 r1460, r1434, r1457; +} +{ +add.f16x2 r1463, r1437, r1438; +} +{ +mul.f16x2 r1466, r1463, r1424; +} +{ +add.f16x2 r1469, r1460, r1466; +} +{ +sub.f16x2 r1472, r1443, r1444; +} +{ +mul.f16x2 r1475, r1472, r1422; +} +{ +sub.f16x2 r1478, r1449, r1450; +} +{ +mul.f16x2 r1481, r1478, r1426; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r1469, r1484; +} +{ +add.f16x2 r1490, r1431, r1432; +} +{ +mul.f16x2 r1493, r1490, r1420; +} +{ +add.f16x2 r1496, r1434, r1493; +} +{ +add.f16x2 r1499, r1437, r1438; +} +{ +mul.f16x2 r1502, r1499, r1424; +} +{ +add.f16x2 r1505, r1496, r1502; +} +{ +sub.f16x2 r1508, r1443, 
r1444; +} +{ +mul.f16x2 r1511, r1508, r1422; +} +{ +sub.f16x2 r1514, r1449, r1450; +} +{ +mul.f16x2 r1517, r1514, r1426; +} +{ +add.f16x2 r1520, r1511, r1517; +} +{ +add.f16x2 r1523, r1505, r1520; +} +{ +add.f16x2 r1526, r1431, r1432; +} +{ +mul.f16x2 r1529, r1526, r1424; +} +{ +add.f16x2 r1532, r1434, r1529; +} +{ +add.f16x2 r1535, r1437, r1438; +} +{ +mul.f16x2 r1538, r1535, r1428; +} +{ +add.f16x2 r1541, r1532, r1538; +} +{ +sub.f16x2 r1544, r1443, r1444; +} +{ +mul.f16x2 r1547, r1544, r1426; +} +{ +sub.f16x2 r1550, r1449, r1450; +} +{ +mul.f16x2 r1553, r1550, r1429; +} +{ +add.f16x2 r1556, r1547, r1553; +} +{ +sub.f16x2 r1559, r1541, r1556; +} +{ +add.f16x2 r1562, r1431, r1432; +} +{ +mul.f16x2 r1565, r1562, r1424; +} +{ +add.f16x2 r1568, r1434, r1565; +} +{ +add.f16x2 r1571, r1437, r1438; +} +{ +mul.f16x2 r1574, r1571, r1428; +} +{ +add.f16x2 r1577, r1568, r1574; +} +{ +sub.f16x2 r1580, r1443, r1444; +} +{ +mul.f16x2 r1583, r1580, r1426; +} +{ +sub.f16x2 r1586, r1449, r1450; +} +{ +mul.f16x2 r1589, r1586, r1429; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1577, r1592; +} +{ +add.f16x2 r1598, r1443, r1444; +} +{ +mul.f16x2 r1601, r1598, r1420; +} +{ +add.f16x2 r1604, r1446, r1601; +} +{ +add.f16x2 r1607, r1449, r1450; +} +{ +mul.f16x2 r1610, r1607, r1424; +} +{ +add.f16x2 r1613, r1604, r1610; +} +{ +sub.f16x2 r1616, r1431, r1432; +} +{ +mul.f16x2 r1619, r1616, r1422; +} +{ +sub.f16x2 r1622, r1437, r1438; +} +{ +mul.f16x2 r1625, r1622, r1426; +} +{ +add.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1613, r1628; +} +{ +add.f16x2 r1634, r1443, r1444; +} +{ +mul.f16x2 r1637, r1634, r1420; +} +{ +add.f16x2 r1640, r1446, r1637; +} +{ +add.f16x2 r1643, r1449, r1450; +} +{ +mul.f16x2 r1646, r1643, r1424; +} +{ +add.f16x2 r1649, r1640, r1646; +} +{ +sub.f16x2 r1652, r1431, r1432; +} +{ +mul.f16x2 r1655, r1652, r1422; +} +{ +sub.f16x2 r1658, r1437, r1438; +} +{ +mul.f16x2 r1661, r1658, r1426; +} +{ +add.f16x2 r1664, r1655, r1661; +} +{ +sub.f16x2 
r1667, r1649, r1664; +} +{ +add.f16x2 r1670, r1443, r1444; +} +{ +mul.f16x2 r1673, r1670, r1424; +} +{ +add.f16x2 r1676, r1446, r1673; +} +{ +add.f16x2 r1679, r1449, r1450; +} +{ +mul.f16x2 r1682, r1679, r1428; +} +{ +add.f16x2 r1685, r1676, r1682; +} +{ +sub.f16x2 r1688, r1431, r1432; +} +{ +mul.f16x2 r1691, r1688, r1426; +} +{ +sub.f16x2 r1694, r1437, r1438; +} +{ +mul.f16x2 r1697, r1694, r1429; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +add.f16x2 r1703, r1685, r1700; +} +{ +add.f16x2 r1706, r1443, r1444; +} +{ +mul.f16x2 r1709, r1706, r1424; +} +{ +add.f16x2 r1712, r1446, r1709; +} +{ +add.f16x2 r1715, r1449, r1450; +} +{ +mul.f16x2 r1718, r1715, r1428; +} +{ +add.f16x2 r1721, r1712, r1718; +} +{ +sub.f16x2 r1724, r1431, r1432; +} +{ +mul.f16x2 r1727, r1724, r1426; +} +{ +sub.f16x2 r1730, r1437, r1438; +} +{ +mul.f16x2 r1733, r1730, r1429; +} +{ +add.f16x2 r1736, r1727, r1733; +} +{ +sub.f16x2 r1739, r1721, r1736; +} +mul.wide.u32 rd8, r2221, 274877907; +shr.u64 rd9, rd8, 35; +cvt.u32.u64 r2238, rd9; +cvt.rn.f32.u32 f118, r2238; +mul.f32 f119, f118, 0f3E80ADFD; +cos.approx.f32 f85, f119; +sin.approx.f32 f120, f119; +neg.f32 f86, f120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f85; +cvt.rn.f16.f32 high, f86; +mov.b32 r1742, {low, high}; +} +mul.lo.s32 r2239, r2238, 125; +sub.s32 r2240, r2221, r2239; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1747, {high, high}; +} +{ +mul.f16x2 r1749, r1631, r1747; +} +{ +fma.rn.f16x2 r1752, r1487, r1745, r1749; +} +{ +mul.f16x2 r1756, r1487, r1747; +} +{ +neg.f16x2 r1759, r1756; +} +{ +fma.rn.f16x2 r1761, r1631, r1745, r1759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1765, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1767, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1769, {low, high}; +} +{ +mul.f16x2 
r1770, r1767, r1769; +} +{ +mul.f16x2 r1773, r1742, r1765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1776, {high, low}; +} +{ +fma.rn.f16x2 r1778, r1770, r1776, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1784, {high, high}; +} +{ +mul.f16x2 r1786, r1703, r1784; +} +{ +fma.rn.f16x2 r1789, r1559, r1782, r1786; +} +{ +mul.f16x2 r1793, r1559, r1784; +} +{ +neg.f16x2 r1796, r1793; +} +{ +fma.rn.f16x2 r1798, r1703, r1782, r1796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1802, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1804, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1806, {low, high}; +} +{ +mul.f16x2 r1807, r1804, r1806; +} +{ +mul.f16x2 r1810, r1778, r1802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1778; +mov.b32 r1813, {high, low}; +} +{ +fma.rn.f16x2 r1815, r1807, r1813, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1821, {high, high}; +} +{ +mul.f16x2 r1823, r1739, r1821; +} +{ +fma.rn.f16x2 r1826, r1595, r1819, r1823; +} +{ +mul.f16x2 r1830, r1595, r1821; +} +{ +neg.f16x2 r1833, r1830; +} +{ +fma.rn.f16x2 r1835, r1739, r1819, r1833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1839, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1742; +mov.b32 r1841, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f93; +cvt.rn.f16.f32 high, f94; +mov.b32 r1843, {low, high}; +} +{ +mul.f16x2 r1844, r1841, r1843; +} +{ +mul.f16x2 r1847, r1815, r1839; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1815; +mov.b32 r1850, {high, low}; +} +{ +fma.rn.f16x2 r1852, r1844, r1850, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1852; +mov.b32 r1856, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1852; +mov.b32 r1858, {high, high}; +} +{ +mul.f16x2 r1860, r1667, r1858; +} +{ +fma.rn.f16x2 r1863, r1523, r1856, r1860; +} +{ +mul.f16x2 r1867, r1523, r1858; +} +{ +neg.f16x2 r1870, r1867; +} +{ +fma.rn.f16x2 r1872, r1667, r1856, r1870; +} +shl.b32 r2241, r2240, 2; +add.s32 r2242, r2222, r2241; +barrier.sync 0; +mad.lo.s32 r2243, r2238, 2500, r2242; +st.shared.u32 [r2243], r1439; +st.shared.u32 [r2243+500], r1752; +st.shared.u32 [r2243+1000], r1789; +st.shared.u32 [r2243+1500], r1826; +st.shared.u32 [r2243+2000], r1863; +barrier.sync 0; +ld.shared.u32 r1907, [r2225]; +ld.shared.u32 r1904, [r2225+2500]; +ld.shared.u32 r1910, [r2225+5000]; +ld.shared.u32 r1911, [r2225+7500]; +ld.shared.u32 r1905, [r2225+10000]; +barrier.sync 0; +st.shared.u32 [r2243], r1451; +st.shared.u32 [r2243+500], r1761; +st.shared.u32 [r2243+1000], r1798; +st.shared.u32 [r2243+1500], r1835; +st.shared.u32 [r2243+2000], r1872; +barrier.sync 0; +ld.shared.u32 r1919, [r2225]; +ld.shared.u32 r1916, [r2225+2500]; +ld.shared.u32 r1922, [r2225+5000]; +ld.shared.u32 r1923, [r2225+7500]; +ld.shared.u32 r1917, [r2225+10000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1893, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1894, {low, high}; +} +{ +neg.f16x2 r1895, r1894; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1897, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1898, {low, high}; +} +{ +neg.f16x2 r1899, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f106; +cvt.rn.f16.f32 high, f106; +mov.b32 r1901, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f108; +cvt.rn.f16.f32 high, f108; +mov.b32 r1902, {low, high}; +} +{ +add.f16x2 r1903, r1904, r1905; +} +{ +add.f16x2 r1906, r1907, r1903; +} +{ +add.f16x2 r1909, 
r1910, r1911; +} +{ +add.f16x2 %0, r1906, r1909; +} +{ +add.f16x2 r1915, r1916, r1917; +} +{ +add.f16x2 r1918, r1919, r1915; +} +{ +add.f16x2 r1921, r1922, r1923; +} +{ +add.f16x2 %1, r1918, r1921; +} +{ +add.f16x2 r1927, r1904, r1905; +} +{ +mul.f16x2 r1930, r1927, r1893; +} +{ +add.f16x2 r1933, r1907, r1930; +} +{ +add.f16x2 r1936, r1910, r1911; +} +{ +mul.f16x2 r1939, r1936, r1897; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1916, r1917; +} +{ +mul.f16x2 r1948, r1945, r1895; +} +{ +sub.f16x2 r1951, r1922, r1923; +} +{ +mul.f16x2 r1954, r1951, r1899; +} +{ +add.f16x2 r1957, r1948, r1954; +} +{ +sub.f16x2 %2, r1942, r1957; +} +{ +add.f16x2 r1963, r1904, r1905; +} +{ +mul.f16x2 r1966, r1963, r1893; +} +{ +add.f16x2 r1969, r1907, r1966; +} +{ +add.f16x2 r1972, r1910, r1911; +} +{ +mul.f16x2 r1975, r1972, r1897; +} +{ +add.f16x2 r1978, r1969, r1975; +} +{ +sub.f16x2 r1981, r1916, r1917; +} +{ +mul.f16x2 r1984, r1981, r1895; +} +{ +sub.f16x2 r1987, r1922, r1923; +} +{ +mul.f16x2 r1990, r1987, r1899; +} +{ +add.f16x2 r1993, r1984, r1990; +} +{ +add.f16x2 %8, r1978, r1993; +} +{ +add.f16x2 r1999, r1904, r1905; +} +{ +mul.f16x2 r2002, r1999, r1897; +} +{ +add.f16x2 r2005, r1907, r2002; +} +{ +add.f16x2 r2008, r1910, r1911; +} +{ +mul.f16x2 r2011, r2008, r1901; +} +{ +add.f16x2 r2014, r2005, r2011; +} +{ +sub.f16x2 r2017, r1916, r1917; +} +{ +mul.f16x2 r2020, r2017, r1899; +} +{ +sub.f16x2 r2023, r1922, r1923; +} +{ +mul.f16x2 r2026, r2023, r1902; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 %4, r2014, r2029; +} +{ +add.f16x2 r2035, r1904, r1905; +} +{ +mul.f16x2 r2038, r2035, r1897; +} +{ +add.f16x2 r2041, r1907, r2038; +} +{ +add.f16x2 r2044, r1910, r1911; +} +{ +mul.f16x2 r2047, r2044, r1901; +} +{ +add.f16x2 r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1916, r1917; +} +{ +mul.f16x2 r2056, r2053, r1899; +} +{ +sub.f16x2 r2059, r1922, r1923; +} +{ +mul.f16x2 r2062, r2059, r1902; +} +{ +add.f16x2 r2065, r2056, r2062; +} +{ +add.f16x2 %6, 
r2050, r2065; +} +{ +add.f16x2 r2071, r1916, r1917; +} +{ +mul.f16x2 r2074, r2071, r1893; +} +{ +add.f16x2 r2077, r1919, r2074; +} +{ +add.f16x2 r2080, r1922, r1923; +} +{ +mul.f16x2 r2083, r2080, r1897; +} +{ +add.f16x2 r2086, r2077, r2083; +} +{ +sub.f16x2 r2089, r1904, r1905; +} +{ +mul.f16x2 r2092, r2089, r1895; +} +{ +sub.f16x2 r2095, r1910, r1911; +} +{ +mul.f16x2 r2098, r2095, r1899; +} +{ +add.f16x2 r2101, r2092, r2098; +} +{ +add.f16x2 %3, r2086, r2101; +} +{ +add.f16x2 r2107, r1916, r1917; +} +{ +mul.f16x2 r2110, r2107, r1893; +} +{ +add.f16x2 r2113, r1919, r2110; +} +{ +add.f16x2 r2116, r1922, r1923; +} +{ +mul.f16x2 r2119, r2116, r1897; +} +{ +add.f16x2 r2122, r2113, r2119; +} +{ +sub.f16x2 r2125, r1904, r1905; +} +{ +mul.f16x2 r2128, r2125, r1895; +} +{ +sub.f16x2 r2131, r1910, r1911; +} +{ +mul.f16x2 r2134, r2131, r1899; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 %9, r2122, r2137; +} +{ +add.f16x2 r2143, r1916, r1917; +} +{ +mul.f16x2 r2146, r2143, r1897; +} +{ +add.f16x2 r2149, r1919, r2146; +} +{ +add.f16x2 r2152, r1922, r1923; +} +{ +mul.f16x2 r2155, r2152, r1901; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1904, r1905; +} +{ +mul.f16x2 r2164, r2161, r1899; +} +{ +sub.f16x2 r2167, r1910, r1911; +} +{ +mul.f16x2 r2170, r2167, r1902; +} +{ +add.f16x2 r2173, r2164, r2170; +} +{ +add.f16x2 %5, r2158, r2173; +} +{ +add.f16x2 r2179, r1916, r1917; +} +{ +mul.f16x2 r2182, r2179, r1897; +} +{ +add.f16x2 r2185, r1919, r2182; +} +{ +add.f16x2 r2188, r1922, r1923; +} +{ +mul.f16x2 r2191, r2188, r1901; +} +{ +add.f16x2 r2194, r2185, r2191; +} +{ +sub.f16x2 r2197, r1904, r1905; +} +{ +mul.f16x2 r2200, r2197, r1899; +} +{ +sub.f16x2 r2203, r1910, r1911; +} +{ +mul.f16x2 r2206, r2203, r1902; +} +{ +add.f16x2 r2209, r2200, r2206; +} +{ +sub.f16x2 %7, r2194, r2209; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..f06f3ee32d062 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_fwd.hpp.inc @@ -0,0 +1,4854 @@ +#ifndef CUFFTDX_FFT_3125_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_3125_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<167, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2449>; +.reg .b32 r<21>; +.reg .b64 rd<15>; +mov.u32 r19, %tid.y; +mov.u32 r20, %50; +mad.lo.s32 r3, r19, 25000, r20; +add.f32 f101, %63, %93; +add.f32 f103, %73, %83; +add.f32 f2448, %53, f101; +add.f32 f104, f103, f2448; +add.f32 f105, %103, %105; +add.f32 f107, %104, %84; +add.f32 f2444, %54, f105; +add.f32 f108, f107, f2444; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f2443, f101, 0f3E9E377A, %53; +sub.f32 f111, f2443, f110; +sub.f32 f112, %103, %105; +sub.f32 f114, %104, %84; +mul.f32 f2441, f112, 0f3F737871; +mul.f32 f2442, f114, 0fBF167918; +sub.f32 f116, f2442, f2441; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %53, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 
f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f2439, f105, 0f3E9E377A, %54; +mul.f32 f2440, f107, 0f3F4F1BBD; +sub.f32 f129, f2439, f2440; +sub.f32 f130, %63, %93; +sub.f32 f132, %73, %83; +mul.f32 f2437, f130, 0f3F737871; +mul.f32 f2438, f132, 0fBF167918; +sub.f32 f134, f2438, f2437; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %54, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %65, %95; +add.f32 f147, %75, %85; +add.f32 f2436, %55, f145; +add.f32 f148, f147, f2436; +add.f32 f149, %66, %96; +add.f32 f151, %108, %106; +add.f32 f2432, %107, f149; +add.f32 f152, f151, f2432; +fma.rn.f32 f2430, f145, 0f3E9E377A, %55; +mul.f32 f2431, f147, 0f3F4F1BBD; +sub.f32 f155, f2430, f2431; +sub.f32 f156, %66, %96; +sub.f32 f158, %108, %106; +mul.f32 f2428, f156, 0f3F737871; +mul.f32 f2429, f158, 0fBF167918; +sub.f32 f160, f2429, f2428; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %55, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +mul.f32 f172, f151, 0f3F4F1BBD; +fma.rn.f32 f2427, f149, 0f3E9E377A, %107; +sub.f32 f173, f2427, f172; +sub.f32 f174, %65, %95; +sub.f32 f176, %75, %85; +mul.f32 f177, f176, 0fBF167918; +mul.f32 f2426, f174, 0f3F737871; +sub.f32 f178, f177, f2426; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %107, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 
f188, f183, f186; +add.f32 f189, %67, %97; +add.f32 f191, %77, %87; +add.f32 f2425, %57, f189; +add.f32 f192, f191, f2425; +add.f32 f193, %111, %110; +add.f32 f195, %78, %112; +add.f32 f2420, %109, f193; +add.f32 f196, f195, f2420; +mul.f32 f198, f191, 0f3F4F1BBD; +fma.rn.f32 f2419, f189, 0f3E9E377A, %57; +sub.f32 f199, f2419, f198; +sub.f32 f200, %111, %110; +sub.f32 f202, %78, %112; +mul.f32 f203, f202, 0fBF167918; +mul.f32 f2418, f200, 0f3F737871; +sub.f32 f204, f203, f2418; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %57, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f2416, f193, 0f3E9E377A, %109; +mul.f32 f2417, f195, 0f3F4F1BBD; +sub.f32 f217, f2416, f2417; +sub.f32 f218, %67, %97; +sub.f32 f220, %77, %87; +mul.f32 f2414, f218, 0f3F737871; +mul.f32 f2415, f220, 0fBF167918; +sub.f32 f222, f2415, f2414; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %109, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %69, %99; +add.f32 f235, %79, %89; +add.f32 f2413, %59, f233; +add.f32 f236, f235, f2413; +add.f32 f237, %114, %113; +add.f32 f239, %115, %90; +add.f32 f2409, %60, f237; +add.f32 f240, f239, f2409; +fma.rn.f32 f2407, f233, 0f3E9E377A, %59; +mul.f32 f2408, f235, 0f3F4F1BBD; +sub.f32 f243, f2407, f2408; +sub.f32 f244, %114, %113; +sub.f32 f246, %115, %90; +mul.f32 f2405, f244, 0f3F737871; +mul.f32 f2406, f246, 0fBF167918; +sub.f32 f248, f2406, f2405; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %59, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; 
+mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +mul.f32 f260, f239, 0f3F4F1BBD; +fma.rn.f32 f2404, f237, 0f3E9E377A, %60; +sub.f32 f261, f2404, f260; +sub.f32 f262, %69, %99; +sub.f32 f264, %79, %89; +mul.f32 f2402, f262, 0f3F737871; +mul.f32 f2403, f264, 0fBF167918; +sub.f32 f266, f2403, f2402; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %60, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %71, %101; +add.f32 f279, %81, %91; +add.f32 f2401, %61, f277; +add.f32 f280, f279, f2401; +add.f32 f281, %72, %102; +add.f32 f283, %118, %116; +add.f32 f2397, %117, f281; +add.f32 f284, f283, f2397; +mul.f32 f286, f279, 0f3F4F1BBD; +fma.rn.f32 f2396, f277, 0f3E9E377A, %61; +sub.f32 f287, f2396, f286; +sub.f32 f288, %72, %102; +sub.f32 f290, %118, %116; +mul.f32 f2394, f288, 0f3F737871; +mul.f32 f2395, f290, 0fBF167918; +sub.f32 f292, f2395, f2394; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %61, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +mul.f32 f304, f283, 0f3F4F1BBD; +fma.rn.f32 f2393, f281, 0f3E9E377A, %117; +sub.f32 f305, f2393, f304; +sub.f32 f306, %71, %101; +sub.f32 f308, %81, %91; +mul.f32 f2391, f306, 0f3F737871; +mul.f32 f2392, f308, 0fBF167918; +sub.f32 f310, f2392, f2391; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %117, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mul.f32 
f322, f179, 0fBE7EA890; +mul.f32 f2390, f161, 0f3F77F511; +sub.f32 f323, f2390, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f327, f223, 0fBEF6A86B; +mul.f32 f2389, f205, 0f3F6055A2; +sub.f32 f328, f2389, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f332, f267, 0fBF2F3E7B; +mul.f32 f2388, f249, 0f3F3A9DB0; +sub.f32 f333, f2388, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f337, f311, 0fBF5825E0; +mul.f32 f2387, f293, 0f3F092BF2; +sub.f32 f338, f2387, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f342, f187, 0fBEF6A86B; +mul.f32 f2386, f169, 0f3F6055A2; +sub.f32 f343, f2386, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f2384, f213, 0f3F092BF2; +mul.f32 f2385, f231, 0fBF5825E0; +sub.f32 f348, f2384, f2385; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f2382, f257, 0f3D809851; +mul.f32 f2383, f275, 0fBF7F7EAE; +sub.f32 f353, f2382, f2383; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f2380, f301, 0fBED9FFBE; +mul.f32 f2381, f319, 0fBF67A2BF; +sub.f32 f358, f2380, f2381; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f2378, f170, 0f3F3A9DB0; +mul.f32 f2379, f188, 0fBF2F3E7B; +sub.f32 f363, f2378, f2379; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f367, f232, 0fBF7F7EAE; +mul.f32 f2377, f214, 0f3D809851; +sub.f32 f368, f2377, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f372, f276, 0fBF45405B; +mul.f32 f2376, f258, 0fBF232E38; +sub.f32 f373, f2376, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f377, f320, 0fBE00575B; +mul.f32 f2375, f302, 0fBF7DFB3B; +sub.f32 f378, f2375, f377; +mul.f32 f379, f320, 0fBF7DFB3B; 
+fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f382, f180, 0fBF5825E0; +mul.f32 f2374, f162, 0f3F092BF2; +sub.f32 f383, f2374, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f387, f224, 0fBF67A2BF; +mul.f32 f2373, f206, 0fBED9FFBE; +sub.f32 f388, f2373, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f2371, f250, 0fBF7DFB3B; +mul.f32 f2372, f268, 0fBE00575B; +sub.f32 f393, f2371, f2372; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f2369, f294, 0fBF232E38; +mul.f32 f2370, f312, 0f3F45405B; +sub.f32 f398, f2369, f2370; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f403, f192, f236; +mul.f32 f408, f403, 0f3F4F1BBD; +fma.rn.f32 f2368, f401, 0f3E9E377A, f104; +sub.f32 f409, f2368, f408; +add.f32 f2367, f152, f284; +sub.f32 f410, f152, f284; +add.f32 f2366, f196, f240; +sub.f32 f412, f196, f240; +mul.f32 f413, f412, 0fBF167918; +mul.f32 f2365, f410, 0f3F737871; +sub.f32 f414, f413, f2365; +sub.f32 f415, f409, f414; +add.f32 f416, f414, f409; +add.f32 f2364, f104, f401; +mul.f32 f417, f401, 0f3F4F1BBD; +sub.f32 f418, f104, f417; +fma.rn.f32 f419, f403, 0f3E9E377A, f418; +mul.f32 f420, f410, 0f3F167918; +mul.f32 f421, f412, 0f3F737871; +sub.f32 f422, f421, f420; +sub.f32 f423, f419, f422; +add.f32 f424, f422, f419; +fma.rn.f32 f2362, f2367, 0f3E9E377A, f108; +mul.f32 f2363, f2366, 0f3F4F1BBD; +sub.f32 f427, f2362, f2363; +sub.f32 f428, f148, f280; +sub.f32 f430, f192, f236; +mul.f32 f2360, f428, 0f3F737871; +mul.f32 f2361, f430, 0fBF167918; +sub.f32 f432, f2361, f2360; +add.f32 f433, f432, f427; +sub.f32 f434, f427, f432; +add.f32 f2359, f108, f2367; +mul.f32 f435, f2367, 0f3F4F1BBD; +sub.f32 f436, f108, f435; +fma.rn.f32 f437, f2366, 0f3E9E377A, f436; +mul.f32 f438, f428, 0f3F167918; +mul.f32 f439, f430, 0f3F737871; +sub.f32 f440, f439, f438; +add.f32 f441, f440, f437; +sub.f32 
f442, f437, f440; +add.f32 f443, f323, f338; +add.f32 f445, f328, f333; +add.f32 f2358, f117, f443; +add.f32 f446, f445, f2358; +add.f32 f447, f325, f340; +add.f32 f449, f330, f335; +add.f32 f2357, f135, f447; +add.f32 f450, f449, f2357; +fma.rn.f32 f2355, f443, 0f3E9E377A, f117; +mul.f32 f2356, f445, 0f3F4F1BBD; +sub.f32 f453, f2355, f2356; +sub.f32 f454, f325, f340; +sub.f32 f456, f330, f335; +mul.f32 f2353, f454, 0f3F737871; +mul.f32 f2354, f456, 0fBF167918; +sub.f32 f458, f2354, f2353; +sub.f32 f459, f453, f458; +add.f32 f460, f458, f453; +mul.f32 f461, f443, 0f3F4F1BBD; +sub.f32 f462, f117, f461; +fma.rn.f32 f463, f445, 0f3E9E377A, f462; +mul.f32 f464, f454, 0f3F167918; +mul.f32 f465, f456, 0f3F737871; +sub.f32 f466, f465, f464; +sub.f32 f467, f463, f466; +add.f32 f468, f466, f463; +mul.f32 f470, f449, 0f3F4F1BBD; +fma.rn.f32 f2352, f447, 0f3E9E377A, f135; +sub.f32 f471, f2352, f470; +sub.f32 f472, f323, f338; +sub.f32 f474, f328, f333; +mul.f32 f2350, f472, 0f3F737871; +mul.f32 f2351, f474, 0fBF167918; +sub.f32 f476, f2351, f2350; +add.f32 f477, f476, f471; +sub.f32 f478, f471, f476; +mul.f32 f479, f447, 0f3F4F1BBD; +sub.f32 f480, f135, f479; +fma.rn.f32 f481, f449, 0f3E9E377A, f480; +mul.f32 f482, f472, 0f3F167918; +mul.f32 f483, f474, 0f3F737871; +sub.f32 f484, f483, f482; +add.f32 f485, f484, f481; +sub.f32 f486, f481, f484; +add.f32 f487, f343, f358; +add.f32 f489, f348, f353; +add.f32 f2349, f125, f487; +add.f32 f490, f489, f2349; +add.f32 f491, f345, f360; +add.f32 f493, f350, f355; +add.f32 f2348, f143, f491; +add.f32 f494, f493, f2348; +mul.f32 f496, f489, 0f3F4F1BBD; +fma.rn.f32 f2347, f487, 0f3E9E377A, f125; +sub.f32 f497, f2347, f496; +sub.f32 f498, f345, f360; +sub.f32 f500, f350, f355; +mul.f32 f2345, f498, 0f3F737871; +mul.f32 f2346, f500, 0fBF167918; +sub.f32 f502, f2346, f2345; +sub.f32 f503, f497, f502; +add.f32 f504, f502, f497; +mul.f32 f505, f487, 0f3F4F1BBD; +sub.f32 f506, f125, f505; +fma.rn.f32 f507, f489, 0f3E9E377A, f506; +mul.f32 
f508, f498, 0f3F167918; +mul.f32 f509, f500, 0f3F737871; +sub.f32 f510, f509, f508; +sub.f32 f511, f507, f510; +add.f32 f512, f510, f507; +mul.f32 f514, f493, 0f3F4F1BBD; +fma.rn.f32 f2344, f491, 0f3E9E377A, f143; +sub.f32 f515, f2344, f514; +sub.f32 f516, f343, f358; +sub.f32 f518, f348, f353; +mul.f32 f2342, f516, 0f3F737871; +mul.f32 f2343, f518, 0fBF167918; +sub.f32 f520, f2343, f2342; +add.f32 f521, f520, f515; +sub.f32 f522, f515, f520; +mul.f32 f523, f491, 0f3F4F1BBD; +sub.f32 f524, f143, f523; +fma.rn.f32 f525, f493, 0f3E9E377A, f524; +mul.f32 f526, f516, 0f3F167918; +mul.f32 f527, f518, 0f3F737871; +sub.f32 f528, f527, f526; +add.f32 f529, f528, f525; +sub.f32 f530, f525, f528; +add.f32 f531, f363, f378; +add.f32 f533, f368, f373; +add.f32 f2341, f126, f531; +add.f32 f534, f533, f2341; +add.f32 f535, f365, f380; +add.f32 f537, f370, f375; +add.f32 f2340, f144, f535; +add.f32 f538, f537, f2340; +mul.f32 f540, f533, 0f3F4F1BBD; +fma.rn.f32 f2339, f531, 0f3E9E377A, f126; +sub.f32 f541, f2339, f540; +sub.f32 f542, f365, f380; +sub.f32 f544, f370, f375; +mul.f32 f2337, f542, 0f3F737871; +mul.f32 f2338, f544, 0fBF167918; +sub.f32 f546, f2338, f2337; +sub.f32 f547, f541, f546; +add.f32 f548, f546, f541; +mul.f32 f549, f531, 0f3F4F1BBD; +sub.f32 f550, f126, f549; +fma.rn.f32 f551, f533, 0f3E9E377A, f550; +mul.f32 f552, f542, 0f3F167918; +mul.f32 f553, f544, 0f3F737871; +sub.f32 f554, f553, f552; +sub.f32 f555, f551, f554; +add.f32 f556, f554, f551; +fma.rn.f32 f2335, f535, 0f3E9E377A, f144; +mul.f32 f2336, f537, 0f3F4F1BBD; +sub.f32 f559, f2335, f2336; +sub.f32 f560, f363, f378; +sub.f32 f562, f368, f373; +mul.f32 f2333, f560, 0f3F737871; +mul.f32 f2334, f562, 0fBF167918; +sub.f32 f564, f2334, f2333; +add.f32 f565, f564, f559; +sub.f32 f566, f559, f564; +mul.f32 f567, f535, 0f3F4F1BBD; +sub.f32 f568, f144, f567; +fma.rn.f32 f569, f537, 0f3E9E377A, f568; +mul.f32 f570, f560, 0f3F167918; +mul.f32 f571, f562, 0f3F737871; +sub.f32 f572, f571, f570; +add.f32 f573, 
f572, f569; +sub.f32 f574, f569, f572; +add.f32 f575, f383, f398; +add.f32 f577, f388, f393; +add.f32 f2332, f118, f575; +add.f32 f578, f577, f2332; +add.f32 f579, f385, f400; +add.f32 f581, f390, f395; +add.f32 f2331, f136, f579; +add.f32 f582, f581, f2331; +fma.rn.f32 f2329, f575, 0f3E9E377A, f118; +mul.f32 f2330, f577, 0f3F4F1BBD; +sub.f32 f585, f2329, f2330; +sub.f32 f586, f385, f400; +sub.f32 f588, f390, f395; +mul.f32 f2327, f586, 0f3F737871; +mul.f32 f2328, f588, 0fBF167918; +sub.f32 f590, f2328, f2327; +sub.f32 f591, f585, f590; +add.f32 f592, f590, f585; +mul.f32 f593, f575, 0f3F4F1BBD; +sub.f32 f594, f118, f593; +fma.rn.f32 f595, f577, 0f3E9E377A, f594; +mul.f32 f596, f586, 0f3F167918; +mul.f32 f597, f588, 0f3F737871; +sub.f32 f598, f597, f596; +sub.f32 f599, f595, f598; +add.f32 f600, f598, f595; +mul.f32 f602, f581, 0f3F4F1BBD; +fma.rn.f32 f2326, f579, 0f3E9E377A, f136; +sub.f32 f603, f2326, f602; +sub.f32 f604, f383, f398; +sub.f32 f606, f388, f393; +mul.f32 f2324, f604, 0f3F737871; +mul.f32 f2325, f606, 0fBF167918; +sub.f32 f608, f2325, f2324; +add.f32 f609, f608, f603; +sub.f32 f610, f603, f608; +mul.f32 f611, f579, 0f3F4F1BBD; +sub.f32 f612, f136, f611; +fma.rn.f32 f613, f581, 0f3E9E377A, f612; +mul.f32 f614, f604, 0f3F167918; +mul.f32 f615, f606, 0f3F737871; +sub.f32 f616, f615, f614; +add.f32 f617, f616, f613; +sub.f32 f618, f613, f616; +mov.u32 r18, %tid.x; +mul.wide.u32 rd2, r18, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r18, r6; +mad.lo.s32 r8, r5, 25000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f619, f620}, [rd6]; +mul.f32 f624, f620, f450; +mul.f32 f625, f619, f450; +mul.f32 f627, f620, f620; +mul.f32 f2323, f619, f619; +sub.f32 f628, f2323, f627; +mul.f32 f629, f620, f619; +fma.rn.f32 f630, f620, f619, f629; +mul.f32 f632, f630, f494; +mul.f32 f633, f628, f494; +mul.f32 f2321, f619, f628; +mul.f32 f2322, f620, f630; +sub.f32 f636, f2321, 
f2322; +mul.f32 f2320, f628, f490; +mul.f32 f637, f619, f630; +fma.rn.f32 f638, f620, f628, f637; +mul.f32 f640, f638, f538; +mul.f32 f641, f636, f538; +mul.f32 f643, f620, f638; +mul.f32 f2319, f619, f636; +sub.f32 f644, f2319, f643; +mul.f32 f2318, f636, f534; +mul.f32 f645, f619, f638; +fma.rn.f32 f646, f620, f636, f645; +mul.f32 f648, f646, f582; +mul.f32 f649, f644, f582; +mul.f32 f2316, f619, f644; +mul.f32 f2317, f620, f646; +sub.f32 f652, f2316, f2317; +mul.f32 f2315, f644, f578; +mul.f32 f653, f619, f646; +fma.rn.f32 f654, f620, f644, f653; +mul.f32 f656, f654, f433; +mul.f32 f657, f652, f433; +mul.f32 f659, f620, f654; +mul.f32 f2314, f619, f652; +sub.f32 f660, f2314, f659; +mul.f32 f2313, f652, f415; +mul.f32 f661, f619, f654; +fma.rn.f32 f662, f620, f652, f661; +mul.f32 f664, f662, f477; +mul.f32 f665, f660, f477; +mul.f32 f667, f620, f662; +mul.f32 f2312, f619, f660; +sub.f32 f668, f2312, f667; +mul.f32 f2311, f660, f459; +mul.f32 f669, f619, f662; +fma.rn.f32 f670, f620, f660, f669; +mul.f32 f672, f670, f521; +mul.f32 f673, f668, f521; +mul.f32 f2309, f619, f668; +mul.f32 f2310, f620, f670; +sub.f32 f676, f2309, f2310; +mul.f32 f2308, f668, f503; +mul.f32 f677, f619, f670; +fma.rn.f32 f678, f620, f668, f677; +mul.f32 f680, f678, f565; +mul.f32 f681, f676, f565; +mul.f32 f683, f620, f678; +mul.f32 f2307, f619, f676; +sub.f32 f684, f2307, f683; +mul.f32 f2306, f676, f547; +mul.f32 f685, f619, f678; +fma.rn.f32 f686, f620, f676, f685; +mul.f32 f688, f686, f609; +mul.f32 f689, f684, f609; +mul.f32 f691, f620, f686; +mul.f32 f2305, f619, f684; +sub.f32 f692, f2305, f691; +mul.f32 f2304, f684, f591; +mul.f32 f693, f619, f686; +fma.rn.f32 f694, f620, f684, f693; +mul.f32 f696, f694, f441; +mul.f32 f697, f692, f441; +mul.f32 f2302, f619, f692; +mul.f32 f2303, f620, f694; +sub.f32 f700, f2302, f2303; +mul.f32 f2301, f692, f423; +mul.f32 f701, f619, f694; +fma.rn.f32 f702, f620, f692, f701; +mul.f32 f704, f702, f485; +mul.f32 f705, f700, f485; +mul.f32 f707, 
f620, f702; +mul.f32 f2300, f619, f700; +sub.f32 f708, f2300, f707; +mul.f32 f2299, f700, f467; +mul.f32 f709, f619, f702; +fma.rn.f32 f710, f620, f700, f709; +mul.f32 f712, f710, f529; +mul.f32 f713, f708, f529; +mul.f32 f2297, f619, f708; +mul.f32 f2298, f620, f710; +sub.f32 f716, f2297, f2298; +mul.f32 f2296, f708, f511; +mul.f32 f717, f619, f710; +fma.rn.f32 f718, f620, f708, f717; +mul.f32 f720, f718, f573; +mul.f32 f721, f716, f573; +mul.f32 f723, f620, f718; +mul.f32 f2295, f619, f716; +sub.f32 f724, f2295, f723; +mul.f32 f2294, f716, f555; +mul.f32 f725, f619, f718; +fma.rn.f32 f726, f620, f716, f725; +mul.f32 f728, f726, f617; +mul.f32 f729, f724, f617; +mul.f32 f731, f620, f726; +mul.f32 f2293, f619, f724; +sub.f32 f732, f2293, f731; +mul.f32 f2292, f724, f599; +mul.f32 f733, f619, f726; +fma.rn.f32 f734, f620, f724, f733; +mul.f32 f736, f734, f442; +mul.f32 f737, f732, f442; +mul.f32 f2290, f619, f732; +mul.f32 f2291, f620, f734; +sub.f32 f740, f2290, f2291; +mul.f32 f2289, f732, f424; +mul.f32 f741, f619, f734; +fma.rn.f32 f742, f620, f732, f741; +mul.f32 f744, f742, f486; +mul.f32 f745, f740, f486; +mul.f32 f747, f620, f742; +mul.f32 f2288, f619, f740; +sub.f32 f748, f2288, f747; +mul.f32 f2287, f740, f468; +mul.f32 f749, f619, f742; +fma.rn.f32 f750, f620, f740, f749; +mul.f32 f752, f750, f530; +mul.f32 f753, f748, f530; +mul.f32 f755, f620, f750; +mul.f32 f2286, f619, f748; +sub.f32 f756, f2286, f755; +mul.f32 f2285, f748, f512; +mul.f32 f757, f619, f750; +fma.rn.f32 f758, f620, f748, f757; +mul.f32 f760, f758, f574; +mul.f32 f761, f756, f574; +mul.f32 f2283, f619, f756; +mul.f32 f2284, f620, f758; +sub.f32 f764, f2283, f2284; +mul.f32 f2282, f756, f556; +mul.f32 f765, f619, f758; +fma.rn.f32 f766, f620, f756, f765; +mul.f32 f768, f766, f618; +mul.f32 f769, f764, f618; +mul.f32 f771, f620, f766; +mul.f32 f2281, f619, f764; +sub.f32 f772, f2281, f771; +mul.f32 f2280, f764, f600; +mul.f32 f773, f619, f766; +fma.rn.f32 f774, f620, f764, f773; +mul.f32 
f776, f774, f434; +mul.f32 f777, f772, f434; +mul.f32 f2278, f619, f772; +mul.f32 f2279, f620, f774; +sub.f32 f780, f2278, f2279; +mul.f32 f2277, f772, f416; +mul.f32 f781, f619, f774; +fma.rn.f32 f782, f620, f772, f781; +mul.f32 f784, f782, f478; +mul.f32 f785, f780, f478; +mul.f32 f787, f620, f782; +mul.f32 f2276, f619, f780; +sub.f32 f788, f2276, f787; +mul.f32 f2275, f780, f460; +mul.f32 f789, f619, f782; +fma.rn.f32 f790, f620, f780, f789; +mul.f32 f792, f790, f522; +mul.f32 f793, f788, f522; +mul.f32 f795, f620, f790; +mul.f32 f2274, f619, f788; +sub.f32 f796, f2274, f795; +mul.f32 f2273, f788, f504; +mul.f32 f797, f619, f790; +fma.rn.f32 f798, f620, f788, f797; +mul.f32 f800, f798, f566; +mul.f32 f801, f796, f566; +mul.f32 f2271, f619, f796; +mul.f32 f2272, f620, f798; +sub.f32 f804, f2271, f2272; +mul.f32 f2270, f619, f446; +mul.f32 f805, f619, f798; +mul.f32 f2269, f796, f548; +fma.rn.f32 f806, f620, f796, f805; +mul.f32 f807, f804, f592; +mul.f32 f808, f806, f610; +mul.f32 f809, f804, f610; +barrier.sync 0; +add.f32 f810, f2366, f2359; +add.f32 f811, f403, f2364; +mad.lo.s32 r17, r7, 200, r8; +st.shared.v2.f32 [r17], {f811, f810}; +fma.rn.f32 f812, f620, f446, f625; +sub.f32 f813, f2270, f624; +st.shared.v2.f32 [r17+8], {f813, f812}; +fma.rn.f32 f814, f630, f490, f633; +sub.f32 f815, f2320, f632; +st.shared.v2.f32 [r17+16], {f815, f814}; +fma.rn.f32 f816, f638, f534, f641; +sub.f32 f817, f2318, f640; +st.shared.v2.f32 [r17+24], {f817, f816}; +fma.rn.f32 f818, f646, f578, f649; +sub.f32 f819, f2315, f648; +st.shared.v2.f32 [r17+32], {f819, f818}; +sub.f32 f820, f2313, f656; +fma.rn.f32 f821, f654, f415, f657; +st.shared.v2.f32 [r17+40], {f820, f821}; +fma.rn.f32 f822, f662, f459, f665; +sub.f32 f823, f2311, f664; +st.shared.v2.f32 [r17+48], {f823, f822}; +sub.f32 f824, f2308, f672; +fma.rn.f32 f825, f670, f503, f673; +st.shared.v2.f32 [r17+56], {f824, f825}; +fma.rn.f32 f826, f678, f547, f681; +sub.f32 f827, f2306, f680; +st.shared.v2.f32 [r17+64], {f827, 
f826}; +fma.rn.f32 f828, f686, f591, f689; +sub.f32 f829, f2304, f688; +st.shared.v2.f32 [r17+72], {f829, f828}; +fma.rn.f32 f830, f694, f423, f697; +sub.f32 f831, f2301, f696; +st.shared.v2.f32 [r17+80], {f831, f830}; +fma.rn.f32 f832, f702, f467, f705; +sub.f32 f833, f2299, f704; +st.shared.v2.f32 [r17+88], {f833, f832}; +fma.rn.f32 f834, f710, f511, f713; +sub.f32 f835, f2296, f712; +st.shared.v2.f32 [r17+96], {f835, f834}; +fma.rn.f32 f836, f718, f555, f721; +sub.f32 f837, f2294, f720; +st.shared.v2.f32 [r17+104], {f837, f836}; +fma.rn.f32 f838, f726, f599, f729; +sub.f32 f839, f2292, f728; +st.shared.v2.f32 [r17+112], {f839, f838}; +fma.rn.f32 f840, f734, f424, f737; +sub.f32 f841, f2289, f736; +st.shared.v2.f32 [r17+120], {f841, f840}; +fma.rn.f32 f842, f742, f468, f745; +sub.f32 f843, f2287, f744; +st.shared.v2.f32 [r17+128], {f843, f842}; +fma.rn.f32 f844, f750, f512, f753; +sub.f32 f845, f2285, f752; +st.shared.v2.f32 [r17+136], {f845, f844}; +fma.rn.f32 f846, f758, f556, f761; +sub.f32 f847, f2282, f760; +st.shared.v2.f32 [r17+144], {f847, f846}; +fma.rn.f32 f848, f766, f600, f769; +sub.f32 f849, f2280, f768; +st.shared.v2.f32 [r17+152], {f849, f848}; +fma.rn.f32 f850, f774, f416, f777; +sub.f32 f851, f2277, f776; +st.shared.v2.f32 [r17+160], {f851, f850}; +fma.rn.f32 f852, f782, f460, f785; +sub.f32 f853, f2275, f784; +st.shared.v2.f32 [r17+168], {f853, f852}; +fma.rn.f32 f854, f790, f504, f793; +sub.f32 f855, f2273, f792; +st.shared.v2.f32 [r17+176], {f855, f854}; +fma.rn.f32 f856, f798, f548, f801; +sub.f32 f857, f2269, f800; +st.shared.v2.f32 [r17+184], {f857, f856}; +fma.rn.f32 f858, f806, f592, f809; +sub.f32 f859, f807, f808; +st.shared.v2.f32 [r17+192], {f859, f858}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r17; +ld.shared.v2.f32 {f860, f861}, [r10]; +ld.shared.v2.f32 {f864, f865}, [r10+1000]; +ld.shared.v2.f32 {f868, f869}, [r10+2000]; +ld.shared.v2.f32 {f872, f873}, [r10+3000]; +ld.shared.v2.f32 {f876, f877}, [r10+4000]; +ld.shared.v2.f32 
{f880, f881}, [r10+5000]; +ld.shared.v2.f32 {f884, f885}, [r10+6000]; +ld.shared.v2.f32 {f888, f889}, [r10+7000]; +ld.shared.v2.f32 {f892, f893}, [r10+8000]; +ld.shared.v2.f32 {f896, f897}, [r10+9000]; +ld.shared.v2.f32 {f900, f901}, [r10+10000]; +ld.shared.v2.f32 {f904, f905}, [r10+11000]; +ld.shared.v2.f32 {f908, f909}, [r10+12000]; +ld.shared.v2.f32 {f912, f913}, [r10+13000]; +ld.shared.v2.f32 {f916, f917}, [r10+14000]; +ld.shared.v2.f32 {f920, f921}, [r10+15000]; +ld.shared.v2.f32 {f924, f925}, [r10+16000]; +ld.shared.v2.f32 {f928, f929}, [r10+17000]; +ld.shared.v2.f32 {f932, f933}, [r10+18000]; +ld.shared.v2.f32 {f936, f937}, [r10+19000]; +ld.shared.v2.f32 {f940, f941}, [r10+20000]; +ld.shared.v2.f32 {f944, f945}, [r10+21000]; +ld.shared.v2.f32 {f948, f949}, [r10+22000]; +ld.shared.v2.f32 {f952, f953}, [r10+23000]; +ld.shared.v2.f32 {f956, f957}, [r10+24000]; +add.f32 f960, f880, f940; +add.f32 f962, f900, f920; +add.f32 f2268, f860, f960; +add.f32 f963, f962, f2268; +add.f32 f964, f881, f941; +add.f32 f966, f901, f921; +add.f32 f2267, f861, f964; +add.f32 f967, f966, f2267; +fma.rn.f32 f2265, f960, 0f3E9E377A, f860; +mul.f32 f2266, f962, 0f3F4F1BBD; +sub.f32 f970, f2265, f2266; +sub.f32 f971, f881, f941; +sub.f32 f973, f901, f921; +mul.f32 f2263, f971, 0f3F737871; +mul.f32 f2264, f973, 0fBF167918; +sub.f32 f975, f2264, f2263; +sub.f32 f976, f970, f975; +add.f32 f977, f975, f970; +mul.f32 f978, f960, 0f3F4F1BBD; +sub.f32 f979, f860, f978; +fma.rn.f32 f980, f962, 0f3E9E377A, f979; +mul.f32 f981, f971, 0f3F167918; +mul.f32 f982, f973, 0f3F737871; +sub.f32 f983, f982, f981; +sub.f32 f984, f980, f983; +add.f32 f985, f983, f980; +mul.f32 f987, f966, 0f3F4F1BBD; +fma.rn.f32 f2262, f964, 0f3E9E377A, f861; +sub.f32 f988, f2262, f987; +sub.f32 f989, f880, f940; +sub.f32 f991, f900, f920; +mul.f32 f2260, f989, 0f3F737871; +mul.f32 f2261, f991, 0fBF167918; +sub.f32 f993, f2261, f2260; +add.f32 f994, f993, f988; +sub.f32 f995, f988, f993; +mul.f32 f996, f964, 0f3F4F1BBD; 
+sub.f32 f997, f861, f996; +fma.rn.f32 f998, f966, 0f3E9E377A, f997; +mul.f32 f999, f989, 0f3F167918; +mul.f32 f1000, f991, 0f3F737871; +sub.f32 f1001, f1000, f999; +add.f32 f1002, f1001, f998; +sub.f32 f1003, f998, f1001; +add.f32 f1004, f884, f944; +add.f32 f1006, f904, f924; +add.f32 f2259, f864, f1004; +add.f32 f1007, f1006, f2259; +add.f32 f1008, f885, f945; +add.f32 f1010, f905, f925; +add.f32 f2258, f865, f1008; +add.f32 f1011, f1010, f2258; +mul.f32 f1013, f1006, 0f3F4F1BBD; +fma.rn.f32 f2257, f1004, 0f3E9E377A, f864; +sub.f32 f1014, f2257, f1013; +sub.f32 f1015, f885, f945; +sub.f32 f1017, f905, f925; +mul.f32 f2255, f1015, 0f3F737871; +mul.f32 f2256, f1017, 0fBF167918; +sub.f32 f1019, f2256, f2255; +sub.f32 f1020, f1014, f1019; +add.f32 f1021, f1019, f1014; +mul.f32 f1022, f1004, 0f3F4F1BBD; +sub.f32 f1023, f864, f1022; +fma.rn.f32 f1024, f1006, 0f3E9E377A, f1023; +mul.f32 f1025, f1015, 0f3F167918; +mul.f32 f1026, f1017, 0f3F737871; +sub.f32 f1027, f1026, f1025; +sub.f32 f1028, f1024, f1027; +add.f32 f1029, f1027, f1024; +fma.rn.f32 f2253, f1008, 0f3E9E377A, f865; +mul.f32 f2254, f1010, 0f3F4F1BBD; +sub.f32 f1032, f2253, f2254; +sub.f32 f1033, f884, f944; +sub.f32 f1035, f904, f924; +mul.f32 f2251, f1033, 0f3F737871; +mul.f32 f2252, f1035, 0fBF167918; +sub.f32 f1037, f2252, f2251; +add.f32 f1038, f1037, f1032; +sub.f32 f1039, f1032, f1037; +mul.f32 f1040, f1008, 0f3F4F1BBD; +sub.f32 f1041, f865, f1040; +fma.rn.f32 f1042, f1010, 0f3E9E377A, f1041; +mul.f32 f1043, f1033, 0f3F167918; +mul.f32 f1044, f1035, 0f3F737871; +sub.f32 f1045, f1044, f1043; +add.f32 f1046, f1045, f1042; +sub.f32 f1047, f1042, f1045; +add.f32 f1048, f888, f948; +add.f32 f1050, f908, f928; +add.f32 f2250, f868, f1048; +add.f32 f1051, f1050, f2250; +add.f32 f1052, f889, f949; +add.f32 f1054, f909, f929; +add.f32 f2249, f869, f1052; +add.f32 f1055, f1054, f2249; +fma.rn.f32 f2247, f1048, 0f3E9E377A, f868; +mul.f32 f2248, f1050, 0f3F4F1BBD; +sub.f32 f1058, f2247, f2248; +sub.f32 f1059, 
f889, f949; +sub.f32 f1061, f909, f929; +mul.f32 f2245, f1059, 0f3F737871; +mul.f32 f2246, f1061, 0fBF167918; +sub.f32 f1063, f2246, f2245; +sub.f32 f1064, f1058, f1063; +add.f32 f1065, f1063, f1058; +mul.f32 f1066, f1048, 0f3F4F1BBD; +sub.f32 f1067, f868, f1066; +fma.rn.f32 f1068, f1050, 0f3E9E377A, f1067; +mul.f32 f1069, f1059, 0f3F167918; +mul.f32 f1070, f1061, 0f3F737871; +sub.f32 f1071, f1070, f1069; +sub.f32 f1072, f1068, f1071; +add.f32 f1073, f1071, f1068; +mul.f32 f1075, f1054, 0f3F4F1BBD; +fma.rn.f32 f2244, f1052, 0f3E9E377A, f869; +sub.f32 f1076, f2244, f1075; +sub.f32 f1077, f888, f948; +sub.f32 f1079, f908, f928; +mul.f32 f1080, f1079, 0fBF167918; +mul.f32 f2243, f1077, 0f3F737871; +sub.f32 f1081, f1080, f2243; +add.f32 f1082, f1081, f1076; +sub.f32 f1083, f1076, f1081; +mul.f32 f1084, f1052, 0f3F4F1BBD; +sub.f32 f1085, f869, f1084; +fma.rn.f32 f1086, f1054, 0f3E9E377A, f1085; +mul.f32 f1087, f1077, 0f3F167918; +mul.f32 f1088, f1079, 0f3F737871; +sub.f32 f1089, f1088, f1087; +add.f32 f1090, f1089, f1086; +sub.f32 f1091, f1086, f1089; +add.f32 f1092, f892, f952; +add.f32 f1094, f912, f932; +add.f32 f2242, f872, f1092; +add.f32 f1095, f1094, f2242; +add.f32 f1096, f893, f953; +add.f32 f1098, f913, f933; +add.f32 f2241, f873, f1096; +add.f32 f1099, f1098, f2241; +mul.f32 f1101, f1094, 0f3F4F1BBD; +fma.rn.f32 f2240, f1092, 0f3E9E377A, f872; +sub.f32 f1102, f2240, f1101; +sub.f32 f1103, f893, f953; +sub.f32 f1105, f913, f933; +mul.f32 f1106, f1105, 0fBF167918; +mul.f32 f2239, f1103, 0f3F737871; +sub.f32 f1107, f1106, f2239; +sub.f32 f1108, f1102, f1107; +add.f32 f1109, f1107, f1102; +mul.f32 f1110, f1092, 0f3F4F1BBD; +sub.f32 f1111, f872, f1110; +fma.rn.f32 f1112, f1094, 0f3E9E377A, f1111; +mul.f32 f1113, f1103, 0f3F167918; +mul.f32 f1114, f1105, 0f3F737871; +sub.f32 f1115, f1114, f1113; +sub.f32 f1116, f1112, f1115; +add.f32 f1117, f1115, f1112; +fma.rn.f32 f2237, f1096, 0f3E9E377A, f873; +mul.f32 f2238, f1098, 0f3F4F1BBD; +sub.f32 f1120, f2237, f2238; 
+sub.f32 f1121, f892, f952; +sub.f32 f1123, f912, f932; +mul.f32 f2235, f1121, 0f3F737871; +mul.f32 f2236, f1123, 0fBF167918; +sub.f32 f1125, f2236, f2235; +add.f32 f1126, f1125, f1120; +sub.f32 f1127, f1120, f1125; +mul.f32 f1128, f1096, 0f3F4F1BBD; +sub.f32 f1129, f873, f1128; +fma.rn.f32 f1130, f1098, 0f3E9E377A, f1129; +mul.f32 f1131, f1121, 0f3F167918; +mul.f32 f1132, f1123, 0f3F737871; +sub.f32 f1133, f1132, f1131; +add.f32 f1134, f1133, f1130; +sub.f32 f1135, f1130, f1133; +add.f32 f1136, f896, f956; +add.f32 f1138, f916, f936; +add.f32 f2234, f876, f1136; +add.f32 f1139, f1138, f2234; +add.f32 f1140, f897, f957; +add.f32 f1142, f917, f937; +add.f32 f2233, f877, f1140; +add.f32 f1143, f1142, f2233; +fma.rn.f32 f2231, f1136, 0f3E9E377A, f876; +mul.f32 f2232, f1138, 0f3F4F1BBD; +sub.f32 f1146, f2231, f2232; +sub.f32 f1147, f897, f957; +sub.f32 f1149, f917, f937; +mul.f32 f2229, f1147, 0f3F737871; +mul.f32 f2230, f1149, 0fBF167918; +sub.f32 f1151, f2230, f2229; +sub.f32 f1152, f1146, f1151; +add.f32 f1153, f1151, f1146; +mul.f32 f1154, f1136, 0f3F4F1BBD; +sub.f32 f1155, f876, f1154; +fma.rn.f32 f1156, f1138, 0f3E9E377A, f1155; +mul.f32 f1157, f1147, 0f3F167918; +mul.f32 f1158, f1149, 0f3F737871; +sub.f32 f1159, f1158, f1157; +sub.f32 f1160, f1156, f1159; +add.f32 f1161, f1159, f1156; +mul.f32 f1163, f1142, 0f3F4F1BBD; +fma.rn.f32 f2228, f1140, 0f3E9E377A, f877; +sub.f32 f1164, f2228, f1163; +sub.f32 f1165, f896, f956; +sub.f32 f1167, f916, f936; +mul.f32 f2226, f1165, 0f3F737871; +mul.f32 f2227, f1167, 0fBF167918; +sub.f32 f1169, f2227, f2226; +add.f32 f1170, f1169, f1164; +sub.f32 f1171, f1164, f1169; +mul.f32 f1172, f1140, 0f3F4F1BBD; +sub.f32 f1173, f877, f1172; +fma.rn.f32 f1174, f1142, 0f3E9E377A, f1173; +mul.f32 f1175, f1165, 0f3F167918; +mul.f32 f1176, f1167, 0f3F737871; +sub.f32 f1177, f1176, f1175; +add.f32 f1178, f1177, f1174; +sub.f32 f1179, f1174, f1177; +mul.f32 f1181, f1038, 0fBE7EA890; +mul.f32 f2225, f1020, 0f3F77F511; +sub.f32 f1182, f2225, 
f1181; +mul.f32 f1183, f1038, 0f3F77F511; +fma.rn.f32 f1184, f1020, 0fBE7EA890, f1183; +mul.f32 f1186, f1082, 0fBEF6A86B; +mul.f32 f2224, f1064, 0f3F6055A2; +sub.f32 f1187, f2224, f1186; +mul.f32 f1188, f1082, 0f3F6055A2; +fma.rn.f32 f1189, f1064, 0fBEF6A86B, f1188; +mul.f32 f1191, f1126, 0fBF2F3E7B; +mul.f32 f2223, f1108, 0f3F3A9DB0; +sub.f32 f1192, f2223, f1191; +mul.f32 f1193, f1126, 0f3F3A9DB0; +fma.rn.f32 f1194, f1108, 0fBF2F3E7B, f1193; +mul.f32 f1196, f1170, 0fBF5825E0; +mul.f32 f2222, f1152, 0f3F092BF2; +sub.f32 f1197, f2222, f1196; +mul.f32 f1198, f1170, 0f3F092BF2; +fma.rn.f32 f1199, f1152, 0fBF5825E0, f1198; +mul.f32 f1201, f1046, 0fBEF6A86B; +mul.f32 f2221, f1028, 0f3F6055A2; +sub.f32 f1202, f2221, f1201; +mul.f32 f1203, f1046, 0f3F6055A2; +fma.rn.f32 f1204, f1028, 0fBEF6A86B, f1203; +mul.f32 f2219, f1072, 0f3F092BF2; +mul.f32 f2220, f1090, 0fBF5825E0; +sub.f32 f1207, f2219, f2220; +mul.f32 f1208, f1090, 0f3F092BF2; +fma.rn.f32 f1209, f1072, 0fBF5825E0, f1208; +mul.f32 f2217, f1116, 0f3D809851; +mul.f32 f2218, f1134, 0fBF7F7EAE; +sub.f32 f1212, f2217, f2218; +mul.f32 f1213, f1134, 0f3D809851; +fma.rn.f32 f1214, f1116, 0fBF7F7EAE, f1213; +mul.f32 f2215, f1160, 0fBED9FFBE; +mul.f32 f2216, f1178, 0fBF67A2BF; +sub.f32 f1217, f2215, f2216; +mul.f32 f1218, f1178, 0fBED9FFBE; +fma.rn.f32 f1219, f1160, 0fBF67A2BF, f1218; +mul.f32 f1221, f1047, 0fBF2F3E7B; +mul.f32 f2214, f1029, 0f3F3A9DB0; +sub.f32 f1222, f2214, f1221; +mul.f32 f1223, f1047, 0f3F3A9DB0; +fma.rn.f32 f1224, f1029, 0fBF2F3E7B, f1223; +mul.f32 f1226, f1091, 0fBF7F7EAE; +mul.f32 f2213, f1073, 0f3D809851; +sub.f32 f1227, f2213, f1226; +mul.f32 f1228, f1091, 0f3D809851; +fma.rn.f32 f1229, f1073, 0fBF7F7EAE, f1228; +mul.f32 f1231, f1135, 0fBF45405B; +mul.f32 f2212, f1117, 0fBF232E38; +sub.f32 f1232, f2212, f1231; +mul.f32 f1233, f1135, 0fBF232E38; +fma.rn.f32 f1234, f1117, 0fBF45405B, f1233; +mul.f32 f1236, f1179, 0fBE00575B; +mul.f32 f2211, f1161, 0fBF7DFB3B; +sub.f32 f1237, f2211, f1236; +mul.f32 
f1238, f1179, 0fBF7DFB3B; +fma.rn.f32 f1239, f1161, 0fBE00575B, f1238; +mul.f32 f1241, f1039, 0fBF5825E0; +mul.f32 f2210, f1021, 0f3F092BF2; +sub.f32 f1242, f2210, f1241; +mul.f32 f1243, f1039, 0f3F092BF2; +fma.rn.f32 f1244, f1021, 0fBF5825E0, f1243; +mul.f32 f1246, f1083, 0fBF67A2BF; +mul.f32 f2209, f1065, 0fBED9FFBE; +sub.f32 f1247, f2209, f1246; +mul.f32 f1248, f1083, 0fBED9FFBE; +fma.rn.f32 f1249, f1065, 0fBF67A2BF, f1248; +mul.f32 f2207, f1109, 0fBF7DFB3B; +mul.f32 f2208, f1127, 0fBE00575B; +sub.f32 f1252, f2207, f2208; +mul.f32 f1253, f1127, 0fBF7DFB3B; +fma.rn.f32 f1254, f1109, 0fBE00575B, f1253; +mul.f32 f2205, f1153, 0fBF232E38; +mul.f32 f2206, f1171, 0f3F45405B; +sub.f32 f1257, f2205, f2206; +mul.f32 f1258, f1171, 0fBF232E38; +fma.rn.f32 f1259, f1153, 0f3F45405B, f1258; +add.f32 f1260, f1007, f1139; +add.f32 f1262, f1051, f1095; +mul.f32 f1267, f1262, 0f3F4F1BBD; +fma.rn.f32 f2204, f1260, 0f3E9E377A, f963; +sub.f32 f1268, f2204, f1267; +add.f32 f2203, f1011, f1143; +sub.f32 f1269, f1011, f1143; +add.f32 f2202, f1055, f1099; +sub.f32 f1271, f1055, f1099; +mul.f32 f1272, f1271, 0fBF167918; +mul.f32 f2201, f1269, 0f3F737871; +sub.f32 f1273, f1272, f2201; +sub.f32 f1274, f1268, f1273; +add.f32 f1275, f1273, f1268; +add.f32 f2200, f963, f1260; +mul.f32 f1276, f1260, 0f3F4F1BBD; +sub.f32 f1277, f963, f1276; +fma.rn.f32 f1278, f1262, 0f3E9E377A, f1277; +mul.f32 f1279, f1269, 0f3F167918; +mul.f32 f1280, f1271, 0f3F737871; +sub.f32 f1281, f1280, f1279; +sub.f32 f1282, f1278, f1281; +add.f32 f1283, f1281, f1278; +mul.f32 f1285, f2202, 0f3F4F1BBD; +fma.rn.f32 f2199, f2203, 0f3E9E377A, f967; +sub.f32 f1286, f2199, f1285; +sub.f32 f1287, f1007, f1139; +sub.f32 f1289, f1051, f1095; +mul.f32 f2197, f1287, 0f3F737871; +mul.f32 f2198, f1289, 0fBF167918; +sub.f32 f1291, f2198, f2197; +add.f32 f1292, f1291, f1286; +sub.f32 f1293, f1286, f1291; +add.f32 f2196, f967, f2203; +mul.f32 f1294, f2203, 0f3F4F1BBD; +sub.f32 f1295, f967, f1294; +fma.rn.f32 f1296, f2202, 0f3E9E377A, 
f1295; +mul.f32 f1297, f1287, 0f3F167918; +mul.f32 f1298, f1289, 0f3F737871; +sub.f32 f1299, f1298, f1297; +add.f32 f1300, f1299, f1296; +sub.f32 f1301, f1296, f1299; +add.f32 f1302, f1182, f1197; +add.f32 f1304, f1187, f1192; +add.f32 f2195, f976, f1302; +add.f32 f1305, f1304, f2195; +add.f32 f1306, f1184, f1199; +add.f32 f1308, f1189, f1194; +add.f32 f2194, f994, f1306; +add.f32 f1309, f1308, f2194; +fma.rn.f32 f2192, f1302, 0f3E9E377A, f976; +mul.f32 f2193, f1304, 0f3F4F1BBD; +sub.f32 f1312, f2192, f2193; +sub.f32 f1313, f1184, f1199; +sub.f32 f1315, f1189, f1194; +mul.f32 f2190, f1313, 0f3F737871; +mul.f32 f2191, f1315, 0fBF167918; +sub.f32 f1317, f2191, f2190; +sub.f32 f1318, f1312, f1317; +add.f32 f1319, f1317, f1312; +mul.f32 f1320, f1302, 0f3F4F1BBD; +sub.f32 f1321, f976, f1320; +fma.rn.f32 f1322, f1304, 0f3E9E377A, f1321; +mul.f32 f1323, f1313, 0f3F167918; +mul.f32 f1324, f1315, 0f3F737871; +sub.f32 f1325, f1324, f1323; +sub.f32 f1326, f1322, f1325; +add.f32 f1327, f1325, f1322; +mul.f32 f1329, f1308, 0f3F4F1BBD; +fma.rn.f32 f2189, f1306, 0f3E9E377A, f994; +sub.f32 f1330, f2189, f1329; +sub.f32 f1331, f1182, f1197; +sub.f32 f1333, f1187, f1192; +mul.f32 f2187, f1331, 0f3F737871; +mul.f32 f2188, f1333, 0fBF167918; +sub.f32 f1335, f2188, f2187; +add.f32 f1336, f1335, f1330; +sub.f32 f1337, f1330, f1335; +mul.f32 f1338, f1306, 0f3F4F1BBD; +sub.f32 f1339, f994, f1338; +fma.rn.f32 f1340, f1308, 0f3E9E377A, f1339; +mul.f32 f1341, f1331, 0f3F167918; +mul.f32 f1342, f1333, 0f3F737871; +sub.f32 f1343, f1342, f1341; +add.f32 f1344, f1343, f1340; +sub.f32 f1345, f1340, f1343; +add.f32 f1346, f1202, f1217; +add.f32 f1348, f1207, f1212; +add.f32 f2186, f984, f1346; +add.f32 f1349, f1348, f2186; +add.f32 f1350, f1204, f1219; +add.f32 f1352, f1209, f1214; +add.f32 f2185, f1002, f1350; +add.f32 f1353, f1352, f2185; +mul.f32 f1355, f1348, 0f3F4F1BBD; +fma.rn.f32 f2184, f1346, 0f3E9E377A, f984; +sub.f32 f1356, f2184, f1355; +sub.f32 f1357, f1204, f1219; +sub.f32 f1359, 
f1209, f1214; +mul.f32 f2182, f1357, 0f3F737871; +mul.f32 f2183, f1359, 0fBF167918; +sub.f32 f1361, f2183, f2182; +sub.f32 f1362, f1356, f1361; +add.f32 f1363, f1361, f1356; +mul.f32 f1364, f1346, 0f3F4F1BBD; +sub.f32 f1365, f984, f1364; +fma.rn.f32 f1366, f1348, 0f3E9E377A, f1365; +mul.f32 f1367, f1357, 0f3F167918; +mul.f32 f1368, f1359, 0f3F737871; +sub.f32 f1369, f1368, f1367; +sub.f32 f1370, f1366, f1369; +add.f32 f1371, f1369, f1366; +mul.f32 f1373, f1352, 0f3F4F1BBD; +fma.rn.f32 f2181, f1350, 0f3E9E377A, f1002; +sub.f32 f1374, f2181, f1373; +sub.f32 f1375, f1202, f1217; +sub.f32 f1377, f1207, f1212; +mul.f32 f2179, f1375, 0f3F737871; +mul.f32 f2180, f1377, 0fBF167918; +sub.f32 f1379, f2180, f2179; +add.f32 f1380, f1379, f1374; +sub.f32 f1381, f1374, f1379; +mul.f32 f1382, f1350, 0f3F4F1BBD; +sub.f32 f1383, f1002, f1382; +fma.rn.f32 f1384, f1352, 0f3E9E377A, f1383; +mul.f32 f1385, f1375, 0f3F167918; +mul.f32 f1386, f1377, 0f3F737871; +sub.f32 f1387, f1386, f1385; +add.f32 f1388, f1387, f1384; +sub.f32 f1389, f1384, f1387; +add.f32 f1390, f1222, f1237; +add.f32 f1392, f1227, f1232; +add.f32 f2178, f985, f1390; +add.f32 f1393, f1392, f2178; +add.f32 f1394, f1224, f1239; +add.f32 f1396, f1229, f1234; +add.f32 f2177, f1003, f1394; +add.f32 f1397, f1396, f2177; +mul.f32 f1399, f1392, 0f3F4F1BBD; +fma.rn.f32 f2176, f1390, 0f3E9E377A, f985; +sub.f32 f1400, f2176, f1399; +sub.f32 f1401, f1224, f1239; +sub.f32 f1403, f1229, f1234; +mul.f32 f2174, f1401, 0f3F737871; +mul.f32 f2175, f1403, 0fBF167918; +sub.f32 f1405, f2175, f2174; +sub.f32 f1406, f1400, f1405; +add.f32 f1407, f1405, f1400; +mul.f32 f1408, f1390, 0f3F4F1BBD; +sub.f32 f1409, f985, f1408; +fma.rn.f32 f1410, f1392, 0f3E9E377A, f1409; +mul.f32 f1411, f1401, 0f3F167918; +mul.f32 f1412, f1403, 0f3F737871; +sub.f32 f1413, f1412, f1411; +sub.f32 f1414, f1410, f1413; +add.f32 f1415, f1413, f1410; +fma.rn.f32 f2172, f1394, 0f3E9E377A, f1003; +mul.f32 f2173, f1396, 0f3F4F1BBD; +sub.f32 f1418, f2172, f2173; +sub.f32 
f1419, f1222, f1237; +sub.f32 f1421, f1227, f1232; +mul.f32 f2170, f1419, 0f3F737871; +mul.f32 f2171, f1421, 0fBF167918; +sub.f32 f1423, f2171, f2170; +add.f32 f1424, f1423, f1418; +sub.f32 f1425, f1418, f1423; +mul.f32 f1426, f1394, 0f3F4F1BBD; +sub.f32 f1427, f1003, f1426; +fma.rn.f32 f1428, f1396, 0f3E9E377A, f1427; +mul.f32 f1429, f1419, 0f3F167918; +mul.f32 f1430, f1421, 0f3F737871; +sub.f32 f1431, f1430, f1429; +add.f32 f1432, f1431, f1428; +sub.f32 f1433, f1428, f1431; +add.f32 f1434, f1242, f1257; +add.f32 f1436, f1247, f1252; +add.f32 f2169, f977, f1434; +add.f32 f1437, f1436, f2169; +add.f32 f1438, f1244, f1259; +add.f32 f1440, f1249, f1254; +add.f32 f2168, f995, f1438; +add.f32 f1441, f1440, f2168; +fma.rn.f32 f2166, f1434, 0f3E9E377A, f977; +mul.f32 f2167, f1436, 0f3F4F1BBD; +sub.f32 f1444, f2166, f2167; +sub.f32 f1445, f1244, f1259; +sub.f32 f1447, f1249, f1254; +mul.f32 f2164, f1445, 0f3F737871; +mul.f32 f2165, f1447, 0fBF167918; +sub.f32 f1449, f2165, f2164; +sub.f32 f1450, f1444, f1449; +add.f32 f1451, f1449, f1444; +mul.f32 f1452, f1434, 0f3F4F1BBD; +sub.f32 f1453, f977, f1452; +fma.rn.f32 f1454, f1436, 0f3E9E377A, f1453; +mul.f32 f1455, f1445, 0f3F167918; +mul.f32 f1456, f1447, 0f3F737871; +sub.f32 f1457, f1456, f1455; +sub.f32 f1458, f1454, f1457; +add.f32 f1459, f1457, f1454; +mul.f32 f1461, f1440, 0f3F4F1BBD; +fma.rn.f32 f2163, f1438, 0f3E9E377A, f995; +sub.f32 f1462, f2163, f1461; +sub.f32 f1463, f1242, f1257; +sub.f32 f1465, f1247, f1252; +mul.f32 f2161, f1463, 0f3F737871; +mul.f32 f2162, f1465, 0fBF167918; +sub.f32 f1467, f2162, f2161; +add.f32 f1468, f1467, f1462; +sub.f32 f1469, f1462, f1467; +mul.f32 f1470, f1438, 0f3F4F1BBD; +sub.f32 f1471, f995, f1470; +fma.rn.f32 f1472, f1440, 0f3E9E377A, f1471; +mul.f32 f1473, f1463, 0f3F167918; +mul.f32 f1474, f1465, 0f3F737871; +sub.f32 f1475, f1474, f1473; +add.f32 f1476, f1475, f1472; +sub.f32 f1477, f1472, f1475; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; 
+mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd12, r11, 8; +mov.u64 rd13, %52; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1478, f1479}, [rd11]; +mul.f32 f1483, f1479, f1309; +mul.f32 f1484, f1478, f1309; +mul.f32 f1486, f1479, f1479; +mul.f32 f2160, f1478, f1478; +sub.f32 f1487, f2160, f1486; +mul.f32 f1488, f1479, f1478; +fma.rn.f32 f1489, f1479, f1478, f1488; +mul.f32 f1491, f1489, f1353; +mul.f32 f1492, f1487, f1353; +mul.f32 f1494, f1479, f1489; +mul.f32 f2159, f1478, f1487; +sub.f32 f1495, f2159, f1494; +mul.f32 f2158, f1487, f1349; +mul.f32 f1496, f1478, f1489; +fma.rn.f32 f1497, f1479, f1487, f1496; +mul.f32 f1499, f1497, f1397; +mul.f32 f1500, f1495, f1397; +mul.f32 f2156, f1478, f1495; +mul.f32 f2157, f1479, f1497; +sub.f32 f1503, f2156, f2157; +mul.f32 f2155, f1495, f1393; +mul.f32 f1504, f1478, f1497; +fma.rn.f32 f1505, f1479, f1495, f1504; +mul.f32 f1507, f1505, f1441; +mul.f32 f1508, f1503, f1441; +mul.f32 f1510, f1479, f1505; +mul.f32 f2154, f1478, f1503; +sub.f32 f1511, f2154, f1510; +mul.f32 f2153, f1503, f1437; +mul.f32 f1512, f1478, f1505; +fma.rn.f32 f1513, f1479, f1503, f1512; +mul.f32 f1515, f1513, f1292; +mul.f32 f1516, f1511, f1292; +mul.f32 f2151, f1478, f1511; +mul.f32 f2152, f1479, f1513; +sub.f32 f1519, f2151, f2152; +mul.f32 f2150, f1511, f1274; +mul.f32 f1520, f1478, f1513; +fma.rn.f32 f1521, f1479, f1511, f1520; +mul.f32 f1523, f1521, f1336; +mul.f32 f1524, f1519, f1336; +mul.f32 f1526, f1479, f1521; +mul.f32 f2149, f1478, f1519; +sub.f32 f1527, f2149, f1526; +mul.f32 f2148, f1519, f1318; +mul.f32 f1528, f1478, f1521; +fma.rn.f32 f1529, f1479, f1519, f1528; +mul.f32 f1531, f1529, f1380; +mul.f32 f1532, f1527, f1380; +mul.f32 f1534, f1479, f1529; +mul.f32 f2147, f1478, f1527; +sub.f32 f1535, f2147, f1534; +mul.f32 f2146, f1527, f1362; +mul.f32 f1536, f1478, f1529; +fma.rn.f32 f1537, f1479, f1527, f1536; +mul.f32 f1539, f1537, f1424; +mul.f32 f1540, f1535, f1424; +mul.f32 f2144, f1478, f1535; +mul.f32 f2145, f1479, 
f1537; +sub.f32 f1543, f2144, f2145; +mul.f32 f2143, f1535, f1406; +mul.f32 f1544, f1478, f1537; +fma.rn.f32 f1545, f1479, f1535, f1544; +mul.f32 f1547, f1545, f1468; +mul.f32 f1548, f1543, f1468; +mul.f32 f1550, f1479, f1545; +mul.f32 f2142, f1478, f1543; +sub.f32 f1551, f2142, f1550; +mul.f32 f2141, f1543, f1450; +mul.f32 f1552, f1478, f1545; +fma.rn.f32 f1553, f1479, f1543, f1552; +mul.f32 f1555, f1553, f1300; +mul.f32 f1556, f1551, f1300; +mul.f32 f1558, f1479, f1553; +mul.f32 f2140, f1478, f1551; +sub.f32 f1559, f2140, f1558; +mul.f32 f2139, f1551, f1282; +mul.f32 f1560, f1478, f1553; +fma.rn.f32 f1561, f1479, f1551, f1560; +mul.f32 f1563, f1561, f1344; +mul.f32 f1564, f1559, f1344; +mul.f32 f2137, f1478, f1559; +mul.f32 f2138, f1479, f1561; +sub.f32 f1567, f2137, f2138; +mul.f32 f2136, f1559, f1326; +mul.f32 f1568, f1478, f1561; +fma.rn.f32 f1569, f1479, f1559, f1568; +mul.f32 f1571, f1569, f1388; +mul.f32 f1572, f1567, f1388; +mul.f32 f1574, f1479, f1569; +mul.f32 f2135, f1478, f1567; +sub.f32 f1575, f2135, f1574; +mul.f32 f2134, f1567, f1370; +mul.f32 f1576, f1478, f1569; +fma.rn.f32 f1577, f1479, f1567, f1576; +mul.f32 f1579, f1577, f1432; +mul.f32 f1580, f1575, f1432; +mul.f32 f2132, f1478, f1575; +mul.f32 f2133, f1479, f1577; +sub.f32 f1583, f2132, f2133; +mul.f32 f2131, f1575, f1414; +mul.f32 f1584, f1478, f1577; +fma.rn.f32 f1585, f1479, f1575, f1584; +mul.f32 f1587, f1585, f1476; +mul.f32 f1588, f1583, f1476; +mul.f32 f1590, f1479, f1585; +mul.f32 f2130, f1478, f1583; +sub.f32 f1591, f2130, f1590; +mul.f32 f2129, f1583, f1458; +mul.f32 f1592, f1478, f1585; +fma.rn.f32 f1593, f1479, f1583, f1592; +mul.f32 f1595, f1593, f1301; +mul.f32 f1596, f1591, f1301; +mul.f32 f1598, f1479, f1593; +mul.f32 f2128, f1478, f1591; +sub.f32 f1599, f2128, f1598; +mul.f32 f2127, f1591, f1283; +mul.f32 f1600, f1478, f1593; +fma.rn.f32 f1601, f1479, f1591, f1600; +mul.f32 f1603, f1601, f1345; +mul.f32 f1604, f1599, f1345; +mul.f32 f2125, f1478, f1599; +mul.f32 f2126, f1479, 
f1601; +sub.f32 f1607, f2125, f2126; +mul.f32 f2124, f1599, f1327; +mul.f32 f1608, f1478, f1601; +fma.rn.f32 f1609, f1479, f1599, f1608; +mul.f32 f1611, f1609, f1389; +mul.f32 f1612, f1607, f1389; +mul.f32 f1614, f1479, f1609; +mul.f32 f2123, f1478, f1607; +sub.f32 f1615, f2123, f1614; +mul.f32 f2122, f1607, f1371; +mul.f32 f1616, f1478, f1609; +fma.rn.f32 f1617, f1479, f1607, f1616; +mul.f32 f1619, f1617, f1433; +mul.f32 f1620, f1615, f1433; +mul.f32 f1622, f1479, f1617; +mul.f32 f2121, f1478, f1615; +sub.f32 f1623, f2121, f1622; +mul.f32 f2120, f1615, f1415; +mul.f32 f1624, f1478, f1617; +fma.rn.f32 f1625, f1479, f1615, f1624; +mul.f32 f1627, f1625, f1477; +mul.f32 f1628, f1623, f1477; +mul.f32 f2118, f1478, f1623; +mul.f32 f2119, f1479, f1625; +sub.f32 f1631, f2118, f2119; +mul.f32 f2117, f1623, f1459; +mul.f32 f1632, f1478, f1625; +fma.rn.f32 f1633, f1479, f1623, f1632; +mul.f32 f1635, f1633, f1293; +mul.f32 f1636, f1631, f1293; +mul.f32 f1638, f1479, f1633; +mul.f32 f2116, f1478, f1631; +sub.f32 f1639, f2116, f1638; +mul.f32 f2115, f1631, f1275; +mul.f32 f1640, f1478, f1633; +fma.rn.f32 f1641, f1479, f1631, f1640; +mul.f32 f1643, f1641, f1337; +mul.f32 f1644, f1639, f1337; +mul.f32 f2113, f1478, f1639; +mul.f32 f2114, f1479, f1641; +sub.f32 f1647, f2113, f2114; +mul.f32 f2112, f1639, f1319; +mul.f32 f1648, f1478, f1641; +fma.rn.f32 f1649, f1479, f1639, f1648; +mul.f32 f1651, f1649, f1381; +mul.f32 f1652, f1647, f1381; +mul.f32 f1654, f1479, f1649; +mul.f32 f2111, f1478, f1647; +sub.f32 f1655, f2111, f1654; +mul.f32 f2110, f1647, f1363; +mul.f32 f1656, f1478, f1649; +fma.rn.f32 f1657, f1479, f1647, f1656; +mul.f32 f1659, f1657, f1425; +mul.f32 f1660, f1655, f1425; +mul.f32 f1662, f1479, f1657; +mul.f32 f2109, f1478, f1655; +sub.f32 f1663, f2109, f1662; +mul.f32 f2108, f1478, f1305; +mul.f32 f1664, f1478, f1657; +mul.f32 f2107, f1655, f1407; +fma.rn.f32 f1665, f1479, f1655, f1664; +mul.f32 f1666, f1663, f1451; +mul.f32 f1667, f1665, f1469; +mul.f32 f1668, f1663, 
f1469; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 5000, r15; +add.f32 f1669, f2202, f2196; +add.f32 f1670, f1262, f2200; +st.shared.v2.f32 [r16], {f1670, f1669}; +fma.rn.f32 f1671, f1479, f1305, f1484; +sub.f32 f1672, f2108, f1483; +st.shared.v2.f32 [r16+200], {f1672, f1671}; +fma.rn.f32 f1673, f1489, f1349, f1492; +sub.f32 f1674, f2158, f1491; +st.shared.v2.f32 [r16+400], {f1674, f1673}; +fma.rn.f32 f1675, f1497, f1393, f1500; +sub.f32 f1676, f2155, f1499; +st.shared.v2.f32 [r16+600], {f1676, f1675}; +fma.rn.f32 f1677, f1505, f1437, f1508; +sub.f32 f1678, f2153, f1507; +st.shared.v2.f32 [r16+800], {f1678, f1677}; +fma.rn.f32 f1679, f1513, f1274, f1516; +sub.f32 f1680, f2150, f1515; +st.shared.v2.f32 [r16+1000], {f1680, f1679}; +fma.rn.f32 f1681, f1521, f1318, f1524; +sub.f32 f1682, f2148, f1523; +st.shared.v2.f32 [r16+1200], {f1682, f1681}; +fma.rn.f32 f1683, f1529, f1362, f1532; +sub.f32 f1684, f2146, f1531; +st.shared.v2.f32 [r16+1400], {f1684, f1683}; +fma.rn.f32 f1685, f1537, f1406, f1540; +sub.f32 f1686, f2143, f1539; +st.shared.v2.f32 [r16+1600], {f1686, f1685}; +sub.f32 f1687, f2141, f1547; +fma.rn.f32 f1688, f1545, f1450, f1548; +st.shared.v2.f32 [r16+1800], {f1687, f1688}; +fma.rn.f32 f1689, f1553, f1282, f1556; +sub.f32 f1690, f2139, f1555; +st.shared.v2.f32 [r16+2000], {f1690, f1689}; +fma.rn.f32 f1691, f1561, f1326, f1564; +sub.f32 f1692, f2136, f1563; +st.shared.v2.f32 [r16+2200], {f1692, f1691}; +fma.rn.f32 f1693, f1569, f1370, f1572; +sub.f32 f1694, f2134, f1571; +st.shared.v2.f32 [r16+2400], {f1694, f1693}; +fma.rn.f32 f1695, f1577, f1414, f1580; +sub.f32 f1696, f2131, f1579; +st.shared.v2.f32 [r16+2600], {f1696, f1695}; +fma.rn.f32 f1697, f1585, f1458, f1588; +sub.f32 f1698, f2129, f1587; +st.shared.v2.f32 [r16+2800], {f1698, f1697}; +fma.rn.f32 f1699, f1593, f1283, f1596; +sub.f32 f1700, f2127, f1595; +st.shared.v2.f32 [r16+3000], {f1700, f1699}; +fma.rn.f32 f1701, f1601, f1327, f1604; +sub.f32 f1702, 
f2124, f1603; +st.shared.v2.f32 [r16+3200], {f1702, f1701}; +fma.rn.f32 f1703, f1609, f1371, f1612; +sub.f32 f1704, f2122, f1611; +st.shared.v2.f32 [r16+3400], {f1704, f1703}; +fma.rn.f32 f1705, f1617, f1415, f1620; +sub.f32 f1706, f2120, f1619; +st.shared.v2.f32 [r16+3600], {f1706, f1705}; +fma.rn.f32 f1707, f1625, f1459, f1628; +sub.f32 f1708, f2117, f1627; +st.shared.v2.f32 [r16+3800], {f1708, f1707}; +fma.rn.f32 f1709, f1633, f1275, f1636; +sub.f32 f1710, f2115, f1635; +st.shared.v2.f32 [r16+4000], {f1710, f1709}; +fma.rn.f32 f1711, f1641, f1319, f1644; +sub.f32 f1712, f2112, f1643; +st.shared.v2.f32 [r16+4200], {f1712, f1711}; +fma.rn.f32 f1713, f1649, f1363, f1652; +sub.f32 f1714, f2110, f1651; +st.shared.v2.f32 [r16+4400], {f1714, f1713}; +fma.rn.f32 f1715, f1657, f1407, f1660; +sub.f32 f1716, f2107, f1659; +st.shared.v2.f32 [r16+4600], {f1716, f1715}; +fma.rn.f32 f1717, f1665, f1451, f1668; +sub.f32 f1718, f1666, f1667; +st.shared.v2.f32 [r16+4800], {f1718, f1717}; +barrier.sync 0; +ld.shared.v2.f32 {f1719, f1720}, [r10]; +ld.shared.v2.f32 {f1723, f1724}, [r10+1000]; +ld.shared.v2.f32 {f1727, f1728}, [r10+2000]; +ld.shared.v2.f32 {f1731, f1732}, [r10+3000]; +ld.shared.v2.f32 {f1735, f1736}, [r10+4000]; +ld.shared.v2.f32 {f1739, f1740}, [r10+5000]; +ld.shared.v2.f32 {f1743, f1744}, [r10+6000]; +ld.shared.v2.f32 {f1747, f1748}, [r10+7000]; +ld.shared.v2.f32 {f1751, f1752}, [r10+8000]; +ld.shared.v2.f32 {f1755, f1756}, [r10+9000]; +ld.shared.v2.f32 {f1759, f1760}, [r10+10000]; +ld.shared.v2.f32 {f1763, f1764}, [r10+11000]; +ld.shared.v2.f32 {f1767, f1768}, [r10+12000]; +ld.shared.v2.f32 {f1771, f1772}, [r10+13000]; +ld.shared.v2.f32 {f1775, f1776}, [r10+14000]; +ld.shared.v2.f32 {f1779, f1780}, [r10+15000]; +ld.shared.v2.f32 {f1783, f1784}, [r10+16000]; +ld.shared.v2.f32 {f1787, f1788}, [r10+17000]; +ld.shared.v2.f32 {f1791, f1792}, [r10+18000]; +ld.shared.v2.f32 {f1795, f1796}, [r10+19000]; +ld.shared.v2.f32 {f1799, f1800}, [r10+20000]; +ld.shared.v2.f32 
{f1803, f1804}, [r10+21000]; +ld.shared.v2.f32 {f1807, f1808}, [r10+22000]; +ld.shared.v2.f32 {f1811, f1812}, [r10+23000]; +ld.shared.v2.f32 {f1815, f1816}, [r10+24000]; +add.f32 f1819, f1739, f1799; +add.f32 f1821, f1759, f1779; +fma.rn.f32 f2105, f1819, 0f3E9E377A, f1719; +mul.f32 f2106, f1821, 0f3F4F1BBD; +sub.f32 f1827, f2105, f2106; +add.f32 f2104, f1740, f1800; +sub.f32 f1828, f1740, f1800; +add.f32 f2103, f1760, f1780; +sub.f32 f1830, f1760, f1780; +mul.f32 f2101, f1828, 0f3F737871; +mul.f32 f2102, f1830, 0fBF167918; +sub.f32 f1832, f2102, f2101; +add.f32 f2100, f1719, f1819; +mul.f32 f1833, f1819, 0f3F4F1BBD; +sub.f32 f1834, f1719, f1833; +fma.rn.f32 f1835, f1821, 0f3E9E377A, f1834; +mul.f32 f1836, f1828, 0f3F167918; +mul.f32 f1837, f1830, 0f3F737871; +sub.f32 f1838, f1837, f1836; +fma.rn.f32 f2098, f2104, 0f3E9E377A, f1720; +mul.f32 f2099, f2103, 0f3F4F1BBD; +sub.f32 f1841, f2098, f2099; +sub.f32 f1842, f1739, f1799; +sub.f32 f1844, f1759, f1779; +mul.f32 f2096, f1842, 0f3F737871; +mul.f32 f2097, f1844, 0fBF167918; +sub.f32 f1846, f2097, f2096; +add.f32 f2095, f1720, f2104; +mul.f32 f1847, f2104, 0f3F4F1BBD; +sub.f32 f1848, f1720, f1847; +fma.rn.f32 f1849, f2103, 0f3E9E377A, f1848; +mul.f32 f1850, f1842, 0f3F167918; +mul.f32 f1851, f1844, 0f3F737871; +sub.f32 f1852, f1851, f1850; +add.f32 f1853, f1743, f1803; +add.f32 f1855, f1763, f1783; +mul.f32 f1860, f1855, 0f3F4F1BBD; +fma.rn.f32 f2094, f1853, 0f3E9E377A, f1723; +sub.f32 f1861, f2094, f1860; +add.f32 f2093, f1744, f1804; +sub.f32 f1862, f1744, f1804; +add.f32 f2092, f1764, f1784; +sub.f32 f1864, f1764, f1784; +mul.f32 f2090, f1862, 0f3F737871; +mul.f32 f2091, f1864, 0fBF167918; +sub.f32 f1866, f2091, f2090; +add.f32 f2089, f1723, f1853; +mul.f32 f1867, f1853, 0f3F4F1BBD; +sub.f32 f1868, f1723, f1867; +fma.rn.f32 f1869, f1855, 0f3E9E377A, f1868; +mul.f32 f1870, f1862, 0f3F167918; +mul.f32 f1871, f1864, 0f3F737871; +sub.f32 f1872, f1871, f1870; +mul.f32 f1874, f2092, 0f3F4F1BBD; +fma.rn.f32 f2088, 
f2093, 0f3E9E377A, f1724; +sub.f32 f1875, f2088, f1874; +sub.f32 f1876, f1743, f1803; +sub.f32 f1878, f1763, f1783; +mul.f32 f2086, f1876, 0f3F737871; +mul.f32 f2087, f1878, 0fBF167918; +sub.f32 f1880, f2087, f2086; +add.f32 f2085, f1724, f2093; +mul.f32 f1881, f2093, 0f3F4F1BBD; +sub.f32 f1882, f1724, f1881; +fma.rn.f32 f1883, f2092, 0f3E9E377A, f1882; +mul.f32 f1884, f1876, 0f3F167918; +mul.f32 f1885, f1878, 0f3F737871; +sub.f32 f1886, f1885, f1884; +add.f32 f1887, f1747, f1807; +add.f32 f1889, f1767, f1787; +mul.f32 f1894, f1889, 0f3F4F1BBD; +fma.rn.f32 f2084, f1887, 0f3E9E377A, f1727; +sub.f32 f1895, f2084, f1894; +add.f32 f2083, f1748, f1808; +sub.f32 f1896, f1748, f1808; +add.f32 f2082, f1768, f1788; +sub.f32 f1898, f1768, f1788; +mul.f32 f2080, f1896, 0f3F737871; +mul.f32 f2081, f1898, 0fBF167918; +sub.f32 f1900, f2081, f2080; +add.f32 f2079, f1727, f1887; +mul.f32 f1901, f1887, 0f3F4F1BBD; +sub.f32 f1902, f1727, f1901; +fma.rn.f32 f1903, f1889, 0f3E9E377A, f1902; +mul.f32 f1904, f1896, 0f3F167918; +mul.f32 f1905, f1898, 0f3F737871; +sub.f32 f1906, f1905, f1904; +mul.f32 f1908, f2082, 0f3F4F1BBD; +fma.rn.f32 f2078, f2083, 0f3E9E377A, f1728; +sub.f32 f1909, f2078, f1908; +sub.f32 f1910, f1747, f1807; +sub.f32 f1912, f1767, f1787; +mul.f32 f2076, f1910, 0f3F737871; +mul.f32 f2077, f1912, 0fBF167918; +sub.f32 f1914, f2077, f2076; +add.f32 f2075, f1728, f2083; +mul.f32 f1915, f2083, 0f3F4F1BBD; +sub.f32 f1916, f1728, f1915; +fma.rn.f32 f1917, f2082, 0f3E9E377A, f1916; +mul.f32 f1918, f1910, 0f3F167918; +mul.f32 f1919, f1912, 0f3F737871; +sub.f32 f1920, f1919, f1918; +add.f32 f1921, f1751, f1811; +add.f32 f1923, f1771, f1791; +fma.rn.f32 f2073, f1921, 0f3E9E377A, f1731; +mul.f32 f2074, f1923, 0f3F4F1BBD; +sub.f32 f1929, f2073, f2074; +add.f32 f2072, f1752, f1812; +sub.f32 f1930, f1752, f1812; +add.f32 f2071, f1772, f1792; +sub.f32 f1932, f1772, f1792; +mul.f32 f2069, f1930, 0f3F737871; +mul.f32 f2070, f1932, 0fBF167918; +sub.f32 f1934, f2070, f2069; +add.f32 
f2068, f1731, f1921; +mul.f32 f1935, f1921, 0f3F4F1BBD; +sub.f32 f1936, f1731, f1935; +fma.rn.f32 f1937, f1923, 0f3E9E377A, f1936; +mul.f32 f1938, f1930, 0f3F167918; +mul.f32 f1939, f1932, 0f3F737871; +sub.f32 f1940, f1939, f1938; +fma.rn.f32 f2066, f2072, 0f3E9E377A, f1732; +mul.f32 f2067, f2071, 0f3F4F1BBD; +sub.f32 f1943, f2066, f2067; +sub.f32 f1944, f1751, f1811; +sub.f32 f1946, f1771, f1791; +mul.f32 f2064, f1944, 0f3F737871; +mul.f32 f2065, f1946, 0fBF167918; +sub.f32 f1948, f2065, f2064; +add.f32 f2063, f1732, f2072; +mul.f32 f1949, f2072, 0f3F4F1BBD; +sub.f32 f1950, f1732, f1949; +fma.rn.f32 f1951, f2071, 0f3E9E377A, f1950; +mul.f32 f1952, f1944, 0f3F167918; +mul.f32 f1953, f1946, 0f3F737871; +sub.f32 f1954, f1953, f1952; +add.f32 f1955, f1755, f1815; +add.f32 f1957, f1775, f1795; +mul.f32 f1962, f1957, 0f3F4F1BBD; +fma.rn.f32 f2062, f1955, 0f3E9E377A, f1735; +sub.f32 f1963, f2062, f1962; +add.f32 f2061, f1756, f1816; +sub.f32 f1964, f1756, f1816; +add.f32 f2060, f1776, f1796; +sub.f32 f1966, f1776, f1796; +mul.f32 f2058, f1964, 0f3F737871; +mul.f32 f2059, f1966, 0fBF167918; +sub.f32 f1968, f2059, f2058; +add.f32 f2057, f1735, f1955; +mul.f32 f1969, f1955, 0f3F4F1BBD; +sub.f32 f1970, f1735, f1969; +fma.rn.f32 f1971, f1957, 0f3E9E377A, f1970; +mul.f32 f1972, f1964, 0f3F167918; +mul.f32 f1973, f1966, 0f3F737871; +sub.f32 f1974, f1973, f1972; +mul.f32 f1976, f2060, 0f3F4F1BBD; +fma.rn.f32 f2056, f2061, 0f3E9E377A, f1736; +sub.f32 f1977, f2056, f1976; +sub.f32 f1978, f1755, f1815; +sub.f32 f1980, f1775, f1795; +mul.f32 f2054, f1978, 0f3F737871; +mul.f32 f2055, f1980, 0fBF167918; +sub.f32 f1982, f2055, f2054; +add.f32 f2053, f1736, f2061; +mul.f32 f1983, f2061, 0f3F4F1BBD; +sub.f32 f1984, f1736, f1983; +fma.rn.f32 f1985, f2060, 0f3E9E377A, f1984; +mul.f32 f1986, f1978, 0f3F167918; +mul.f32 f1987, f1980, 0f3F737871; +sub.f32 f1988, f1987, f1986; +add.f32 %1, f2103, f2095; +add.f32 %0, f1821, f2100; +add.f32 %3, f2092, f2085; +add.f32 %2, f1855, f2089; +add.f32 
%5, f2082, f2075; +add.f32 %4, f1889, f2079; +add.f32 %7, f2071, f2063; +add.f32 %6, f1923, f2068; +add.f32 %9, f2060, f2053; +add.f32 %8, f1957, f2057; +sub.f32 %10, f1827, f1832; +add.f32 %11, f1846, f1841; +sub.f32 %12, f1861, f1866; +add.f32 %13, f1880, f1875; +add.f32 %15, f1914, f1909; +sub.f32 %14, f1895, f1900; +add.f32 %17, f1948, f1943; +sub.f32 %16, f1929, f1934; +add.f32 %19, f1982, f1977; +sub.f32 %18, f1963, f1968; +sub.f32 %20, f1835, f1838; +add.f32 %21, f1852, f1849; +sub.f32 %22, f1869, f1872; +add.f32 %23, f1886, f1883; +sub.f32 %24, f1903, f1906; +add.f32 %25, f1920, f1917; +sub.f32 %26, f1937, f1940; +add.f32 %27, f1954, f1951; +add.f32 %29, f1988, f1985; +sub.f32 %28, f1971, f1974; +sub.f32 %31, f1849, f1852; +add.f32 %30, f1838, f1835; +sub.f32 %33, f1883, f1886; +add.f32 %32, f1872, f1869; +sub.f32 %35, f1917, f1920; +add.f32 %34, f1906, f1903; +sub.f32 %37, f1951, f1954; +add.f32 %36, f1940, f1937; +sub.f32 %39, f1985, f1988; +add.f32 %38, f1974, f1971; +sub.f32 %41, f1841, f1846; +add.f32 %40, f1832, f1827; +sub.f32 %43, f1875, f1880; +add.f32 %42, f1866, f1861; +sub.f32 %45, f1909, f1914; +add.f32 %44, f1900, f1895; +sub.f32 %47, f1943, f1948; +add.f32 %46, f1934, f1929; +sub.f32 %49, f1977, f1982; +add.f32 %48, f1968, f1963; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), 
"=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_3125), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<166, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1939>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 12500, r2; +add.f32 f101, %66, %106; +add.f32 f102, %53, f101; +add.f32 f103, %79, %93; +add.f32 f104, f103, f102; +add.f32 f105, %68, %108; +add.f32 f106, %54, f105; +add.f32 f107, %81, %94; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %53; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %68, %108; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %81, %94; 
+mul.f32 f115, f114, 0fBF167918; +sub.f32 f116, f115, f113; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %53, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f127, f105, 0f3E9E377A, %54; +mul.f32 f128, f107, 0f3F4F1BBD; +sub.f32 f129, f127, f128; +sub.f32 f130, %66, %106; +mul.f32 f131, f130, 0f3F737871; +sub.f32 f132, %79, %93; +mul.f32 f133, f132, 0fBF167918; +sub.f32 f134, f133, f131; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %54, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %69, %109; +add.f32 f146, %55, f145; +add.f32 f147, %82, %95; +add.f32 f148, f147, f146; +add.f32 f149, %70, %110; +add.f32 f150, %57, f149; +add.f32 f151, %84, %97; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %55; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %70, %110; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %84, %97; +mul.f32 f159, f158, 0fBF167918; +sub.f32 f160, f159, f157; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %55, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +fma.rn.f32 f171, f149, 0f3E9E377A, %57; +mul.f32 f172, f151, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %69, %109; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %82, %95; +mul.f32 f177, f176, 0fBF167918; +sub.f32 f178, f177, f175; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; 
+sub.f32 f182, %57, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %71, %111; +add.f32 f190, %58, f189; +add.f32 f191, %85, %98; +add.f32 f192, f191, f190; +add.f32 f193, %73, %113; +add.f32 f194, %60, f193; +add.f32 f195, %86, %100; +add.f32 f196, f195, f194; +fma.rn.f32 f197, f189, 0f3E9E377A, %58; +mul.f32 f198, f191, 0f3F4F1BBD; +sub.f32 f199, f197, f198; +sub.f32 f200, %73, %113; +mul.f32 f201, f200, 0f3F737871; +sub.f32 f202, %86, %100; +mul.f32 f203, f202, 0fBF167918; +sub.f32 f204, f203, f201; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %58, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f215, f193, 0f3E9E377A, %60; +mul.f32 f216, f195, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, %71, %111; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, %85, %98; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %60, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %74, %114; +add.f32 f234, %61, f233; +add.f32 f235, %87, %101; +add.f32 f236, f235, f234; +add.f32 f237, %76, %116; +add.f32 f238, %62, f237; +add.f32 f239, %89, %102; +add.f32 f240, f239, f238; +fma.rn.f32 f241, f233, 0f3E9E377A, %61; +mul.f32 f242, f235, 0f3F4F1BBD; +sub.f32 f243, f241, f242; +sub.f32 f244, %76, %116; +mul.f32 f245, f244, 0f3F737871; +sub.f32 f246, %89, %102; +mul.f32 f247, f246, 0fBF167918; +sub.f32 f248, f247, f245; +sub.f32 f249, f243, 
f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %61, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +fma.rn.f32 f259, f237, 0f3E9E377A, %62; +mul.f32 f260, f239, 0f3F4F1BBD; +sub.f32 f261, f259, f260; +sub.f32 f262, %74, %114; +mul.f32 f263, f262, 0f3F737871; +sub.f32 f264, %87, %101; +mul.f32 f265, f264, 0fBF167918; +sub.f32 f266, f265, f263; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %62, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %77, %117; +add.f32 f278, %63, f277; +add.f32 f279, %90, %103; +add.f32 f280, f279, f278; +add.f32 f281, %78, %118; +add.f32 f282, %65, f281; +add.f32 f283, %92, %105; +add.f32 f284, f283, f282; +fma.rn.f32 f285, f277, 0f3E9E377A, %63; +mul.f32 f286, f279, 0f3F4F1BBD; +sub.f32 f287, f285, f286; +sub.f32 f288, %78, %118; +mul.f32 f289, f288, 0f3F737871; +sub.f32 f290, %92, %105; +mul.f32 f291, f290, 0fBF167918; +sub.f32 f292, f291, f289; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %63, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +fma.rn.f32 f303, f281, 0f3E9E377A, %65; +mul.f32 f304, f283, 0f3F4F1BBD; +sub.f32 f305, f303, f304; +sub.f32 f306, %77, %117; +mul.f32 f307, f306, 0f3F737871; +sub.f32 f308, %90, %103; +mul.f32 f309, f308, 0fBF167918; +sub.f32 f310, f309, f307; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %65, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 
f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mov.u32 r4, %tid.x; +mul.f32 f321, f161, 0f3F77F511; +mul.f32 f322, f179, 0fBE7EA890; +sub.f32 f323, f321, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f326, f205, 0f3F6055A2; +mul.f32 f327, f223, 0fBEF6A86B; +sub.f32 f328, f326, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f331, f249, 0f3F3A9DB0; +mul.f32 f332, f267, 0fBF2F3E7B; +sub.f32 f333, f331, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f336, f293, 0f3F092BF2; +mul.f32 f337, f311, 0fBF5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f341, f169, 0f3F6055A2; +mul.f32 f342, f187, 0fBEF6A86B; +sub.f32 f343, f341, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f346, f213, 0f3F092BF2; +mul.f32 f347, f231, 0fBF5825E0; +sub.f32 f348, f346, f347; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f351, f257, 0f3D809851; +mul.f32 f352, f275, 0fBF7F7EAE; +sub.f32 f353, f351, f352; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f356, f301, 0fBED9FFBE; +mul.f32 f357, f319, 0fBF67A2BF; +sub.f32 f358, f356, f357; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f361, f170, 0f3F3A9DB0; +mul.f32 f362, f188, 0fBF2F3E7B; +sub.f32 f363, f361, f362; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f366, f214, 0f3D809851; +mul.f32 f367, f232, 0fBF7F7EAE; +sub.f32 f368, f366, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f371, f258, 0fBF232E38; +mul.f32 f372, f276, 0fBF45405B; +sub.f32 f373, f371, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 
0fBF45405B, f374; +mul.f32 f376, f302, 0fBF7DFB3B; +mul.f32 f377, f320, 0fBE00575B; +sub.f32 f378, f376, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f381, f162, 0f3F092BF2; +mul.f32 f382, f180, 0fBF5825E0; +sub.f32 f383, f381, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f386, f206, 0fBED9FFBE; +mul.f32 f387, f224, 0fBF67A2BF; +sub.f32 f388, f386, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f391, f250, 0fBF7DFB3B; +mul.f32 f392, f268, 0fBE00575B; +sub.f32 f393, f391, f392; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f396, f294, 0fBF232E38; +mul.f32 f397, f312, 0f3F45405B; +sub.f32 f398, f396, f397; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f402, f104, f401; +add.f32 f403, f192, f236; +add.f32 f404, f403, f402; +add.f32 f405, f152, f284; +add.f32 f406, f108, f405; +add.f32 f407, f196, f240; +add.f32 f408, f407, f406; +fma.rn.f32 f409, f401, 0f3E9E377A, f104; +mul.f32 f410, f403, 0f3F4F1BBD; +sub.f32 f411, f409, f410; +sub.f32 f412, f152, f284; +mul.f32 f413, f412, 0f3F737871; +sub.f32 f414, f196, f240; +mul.f32 f415, f414, 0fBF167918; +sub.f32 f416, f415, f413; +sub.f32 f417, f411, f416; +add.f32 f418, f416, f411; +mul.f32 f419, f401, 0f3F4F1BBD; +sub.f32 f420, f104, f419; +fma.rn.f32 f421, f403, 0f3E9E377A, f420; +mul.f32 f422, f412, 0f3F167918; +mul.f32 f423, f414, 0f3F737871; +sub.f32 f424, f423, f422; +sub.f32 f425, f421, f424; +add.f32 f426, f424, f421; +fma.rn.f32 f427, f405, 0f3E9E377A, f108; +mul.f32 f428, f407, 0f3F4F1BBD; +sub.f32 f429, f427, f428; +sub.f32 f430, f148, f280; +mul.f32 f431, f430, 0f3F737871; +sub.f32 f432, f192, f236; +mul.f32 f433, f432, 0fBF167918; +sub.f32 f434, f433, f431; +add.f32 f435, f434, f429; +sub.f32 f436, f429, f434; +mul.f32 f437, f405, 0f3F4F1BBD; +sub.f32 f438, f108, f437; 
+fma.rn.f32 f439, f407, 0f3E9E377A, f438; +mul.f32 f440, f430, 0f3F167918; +mul.f32 f441, f432, 0f3F737871; +sub.f32 f442, f441, f440; +add.f32 f443, f442, f439; +sub.f32 f444, f439, f442; +add.f32 f445, f323, f338; +add.f32 f446, f117, f445; +add.f32 f447, f328, f333; +add.f32 f448, f447, f446; +add.f32 f449, f325, f340; +add.f32 f450, f135, f449; +add.f32 f451, f330, f335; +add.f32 f452, f451, f450; +fma.rn.f32 f453, f445, 0f3E9E377A, f117; +mul.f32 f454, f447, 0f3F4F1BBD; +sub.f32 f455, f453, f454; +sub.f32 f456, f325, f340; +mul.f32 f457, f456, 0f3F737871; +sub.f32 f458, f330, f335; +mul.f32 f459, f458, 0fBF167918; +sub.f32 f460, f459, f457; +sub.f32 f461, f455, f460; +add.f32 f462, f460, f455; +mul.f32 f463, f445, 0f3F4F1BBD; +sub.f32 f464, f117, f463; +fma.rn.f32 f465, f447, 0f3E9E377A, f464; +mul.f32 f466, f456, 0f3F167918; +mul.f32 f467, f458, 0f3F737871; +sub.f32 f468, f467, f466; +sub.f32 f469, f465, f468; +add.f32 f470, f468, f465; +fma.rn.f32 f471, f449, 0f3E9E377A, f135; +mul.f32 f472, f451, 0f3F4F1BBD; +sub.f32 f473, f471, f472; +sub.f32 f474, f323, f338; +mul.f32 f475, f474, 0f3F737871; +sub.f32 f476, f328, f333; +mul.f32 f477, f476, 0fBF167918; +sub.f32 f478, f477, f475; +add.f32 f479, f478, f473; +sub.f32 f480, f473, f478; +mul.f32 f481, f449, 0f3F4F1BBD; +sub.f32 f482, f135, f481; +fma.rn.f32 f483, f451, 0f3E9E377A, f482; +mul.f32 f484, f474, 0f3F167918; +mul.f32 f485, f476, 0f3F737871; +sub.f32 f486, f485, f484; +add.f32 f487, f486, f483; +sub.f32 f488, f483, f486; +add.f32 f489, f343, f358; +add.f32 f490, f125, f489; +add.f32 f491, f348, f353; +add.f32 f492, f491, f490; +add.f32 f493, f345, f360; +add.f32 f494, f143, f493; +add.f32 f495, f350, f355; +add.f32 f496, f495, f494; +fma.rn.f32 f497, f489, 0f3E9E377A, f125; +mul.f32 f498, f491, 0f3F4F1BBD; +sub.f32 f499, f497, f498; +sub.f32 f500, f345, f360; +mul.f32 f501, f500, 0f3F737871; +sub.f32 f502, f350, f355; +mul.f32 f503, f502, 0fBF167918; +sub.f32 f504, f503, f501; +sub.f32 f505, f499, 
f504; +add.f32 f506, f504, f499; +mul.f32 f507, f489, 0f3F4F1BBD; +sub.f32 f508, f125, f507; +fma.rn.f32 f509, f491, 0f3E9E377A, f508; +mul.f32 f510, f500, 0f3F167918; +mul.f32 f511, f502, 0f3F737871; +sub.f32 f512, f511, f510; +sub.f32 f513, f509, f512; +add.f32 f514, f512, f509; +fma.rn.f32 f515, f493, 0f3E9E377A, f143; +mul.f32 f516, f495, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f343, f358; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f348, f353; +mul.f32 f521, f520, 0fBF167918; +sub.f32 f522, f521, f519; +add.f32 f523, f522, f517; +sub.f32 f524, f517, f522; +mul.f32 f525, f493, 0f3F4F1BBD; +sub.f32 f526, f143, f525; +fma.rn.f32 f527, f495, 0f3E9E377A, f526; +mul.f32 f528, f518, 0f3F167918; +mul.f32 f529, f520, 0f3F737871; +sub.f32 f530, f529, f528; +add.f32 f531, f530, f527; +sub.f32 f532, f527, f530; +add.f32 f533, f363, f378; +add.f32 f534, f126, f533; +add.f32 f535, f368, f373; +add.f32 f536, f535, f534; +add.f32 f537, f365, f380; +add.f32 f538, f144, f537; +add.f32 f539, f370, f375; +add.f32 f540, f539, f538; +fma.rn.f32 f541, f533, 0f3E9E377A, f126; +mul.f32 f542, f535, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f365, f380; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f370, f375; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +sub.f32 f549, f543, f548; +add.f32 f550, f548, f543; +mul.f32 f551, f533, 0f3F4F1BBD; +sub.f32 f552, f126, f551; +fma.rn.f32 f553, f535, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +sub.f32 f557, f553, f556; +add.f32 f558, f556, f553; +fma.rn.f32 f559, f537, 0f3E9E377A, f144; +mul.f32 f560, f539, 0f3F4F1BBD; +sub.f32 f561, f559, f560; +sub.f32 f562, f363, f378; +mul.f32 f563, f562, 0f3F737871; +sub.f32 f564, f368, f373; +mul.f32 f565, f564, 0fBF167918; +sub.f32 f566, f565, f563; +add.f32 f567, f566, f561; +sub.f32 f568, f561, f566; +mul.f32 f569, f537, 0f3F4F1BBD; +sub.f32 f570, f144, f569; +fma.rn.f32 f571, f539, 
0f3E9E377A, f570; +mul.f32 f572, f562, 0f3F167918; +mul.f32 f573, f564, 0f3F737871; +sub.f32 f574, f573, f572; +add.f32 f575, f574, f571; +sub.f32 f576, f571, f574; +add.f32 f577, f383, f398; +add.f32 f578, f118, f577; +add.f32 f579, f388, f393; +add.f32 f580, f579, f578; +add.f32 f581, f385, f400; +add.f32 f582, f136, f581; +add.f32 f583, f390, f395; +add.f32 f584, f583, f582; +fma.rn.f32 f585, f577, 0f3E9E377A, f118; +mul.f32 f586, f579, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f385, f400; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f390, f395; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +sub.f32 f593, f587, f592; +add.f32 f594, f592, f587; +mul.f32 f595, f577, 0f3F4F1BBD; +sub.f32 f596, f118, f595; +fma.rn.f32 f597, f579, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +sub.f32 f601, f597, f600; +add.f32 f602, f600, f597; +fma.rn.f32 f603, f581, 0f3E9E377A, f136; +mul.f32 f604, f583, 0f3F4F1BBD; +sub.f32 f605, f603, f604; +sub.f32 f606, f383, f398; +mul.f32 f607, f606, 0f3F737871; +sub.f32 f608, f388, f393; +mul.f32 f609, f608, 0fBF167918; +sub.f32 f610, f609, f607; +add.f32 f611, f610, f605; +sub.f32 f612, f605, f610; +mul.f32 f613, f581, 0f3F4F1BBD; +sub.f32 f614, f136, f613; +fma.rn.f32 f615, f583, 0f3E9E377A, f614; +mul.f32 f616, f606, 0f3F167918; +mul.f32 f617, f608, 0f3F737871; +sub.f32 f618, f617, f616; +add.f32 f619, f618, f615; +sub.f32 f620, f615, f618; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f621, f622}, [rd6]; +mul.f32 f625, f621, f448; +mul.f32 f626, f622, f452; +sub.f32 f627, f625, f626; +mul.f32 f628, f621, f452; +fma.rn.f32 f629, f622, f448, f628; +mul.f32 f630, f621, f621; +mul.f32 f631, f622, f622; +sub.f32 f632, f630, f631; +mul.f32 f633, f622, f621; +fma.rn.f32 f634, f622, f621, 
f633; +mul.f32 f635, f632, f492; +mul.f32 f636, f634, f496; +sub.f32 f637, f635, f636; +mul.f32 f638, f632, f496; +fma.rn.f32 f639, f634, f492, f638; +mul.f32 f640, f621, f632; +mul.f32 f641, f622, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f621, f634; +fma.rn.f32 f644, f622, f632, f643; +mul.f32 f645, f642, f536; +mul.f32 f646, f644, f540; +sub.f32 f647, f645, f646; +mul.f32 f648, f642, f540; +fma.rn.f32 f649, f644, f536, f648; +mul.f32 f650, f621, f642; +mul.f32 f651, f622, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f621, f644; +fma.rn.f32 f654, f622, f642, f653; +mul.f32 f655, f652, f580; +mul.f32 f656, f654, f584; +sub.f32 f657, f655, f656; +mul.f32 f658, f652, f584; +fma.rn.f32 f659, f654, f580, f658; +mul.f32 f660, f621, f652; +mul.f32 f661, f622, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f621, f654; +fma.rn.f32 f664, f622, f652, f663; +mul.f32 f665, f662, f417; +mul.f32 f666, f664, f435; +sub.f32 f667, f665, f666; +mul.f32 f668, f662, f435; +fma.rn.f32 f669, f664, f417, f668; +mul.f32 f670, f621, f662; +mul.f32 f671, f622, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f621, f664; +fma.rn.f32 f674, f622, f662, f673; +mul.f32 f675, f672, f461; +mul.f32 f676, f674, f479; +sub.f32 f677, f675, f676; +mul.f32 f678, f672, f479; +fma.rn.f32 f679, f674, f461, f678; +mul.f32 f680, f621, f672; +mul.f32 f681, f622, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f621, f674; +fma.rn.f32 f684, f622, f672, f683; +mul.f32 f685, f682, f505; +mul.f32 f686, f684, f523; +sub.f32 f687, f685, f686; +mul.f32 f688, f682, f523; +fma.rn.f32 f689, f684, f505, f688; +mul.f32 f690, f621, f682; +mul.f32 f691, f622, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f621, f684; +fma.rn.f32 f694, f622, f682, f693; +mul.f32 f695, f692, f549; +mul.f32 f696, f694, f567; +sub.f32 f697, f695, f696; +mul.f32 f698, f692, f567; +fma.rn.f32 f699, f694, f549, f698; +mul.f32 f700, f621, f692; +mul.f32 f701, f622, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f621, f694; +fma.rn.f32 
f704, f622, f692, f703; +mul.f32 f705, f702, f593; +mul.f32 f706, f704, f611; +sub.f32 f707, f705, f706; +mul.f32 f708, f702, f611; +fma.rn.f32 f709, f704, f593, f708; +mul.f32 f710, f621, f702; +mul.f32 f711, f622, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f621, f704; +fma.rn.f32 f714, f622, f702, f713; +mul.f32 f715, f712, f425; +mul.f32 f716, f714, f443; +sub.f32 f717, f715, f716; +mul.f32 f718, f712, f443; +fma.rn.f32 f719, f714, f425, f718; +mul.f32 f720, f621, f712; +mul.f32 f721, f622, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f621, f714; +fma.rn.f32 f724, f622, f712, f723; +mul.f32 f725, f722, f469; +mul.f32 f726, f724, f487; +sub.f32 f727, f725, f726; +mul.f32 f728, f722, f487; +fma.rn.f32 f729, f724, f469, f728; +mul.f32 f730, f621, f722; +mul.f32 f731, f622, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f621, f724; +fma.rn.f32 f734, f622, f722, f733; +mul.f32 f735, f732, f513; +mul.f32 f736, f734, f531; +sub.f32 f737, f735, f736; +mul.f32 f738, f732, f531; +fma.rn.f32 f739, f734, f513, f738; +mul.f32 f740, f621, f732; +mul.f32 f741, f622, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f621, f734; +fma.rn.f32 f744, f622, f732, f743; +mul.f32 f745, f742, f557; +mul.f32 f746, f744, f575; +sub.f32 f747, f745, f746; +mul.f32 f748, f742, f575; +fma.rn.f32 f749, f744, f557, f748; +mul.f32 f750, f621, f742; +mul.f32 f751, f622, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f621, f744; +fma.rn.f32 f754, f622, f742, f753; +mul.f32 f755, f752, f601; +mul.f32 f756, f754, f619; +sub.f32 f757, f755, f756; +mul.f32 f758, f752, f619; +fma.rn.f32 f759, f754, f601, f758; +mul.f32 f760, f621, f752; +mul.f32 f761, f622, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f621, f754; +fma.rn.f32 f764, f622, f752, f763; +mul.f32 f765, f762, f426; +mul.f32 f766, f764, f444; +sub.f32 f767, f765, f766; +mul.f32 f768, f762, f444; +fma.rn.f32 f769, f764, f426, f768; +mul.f32 f770, f621, f762; +mul.f32 f771, f622, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f621, 
f764; +fma.rn.f32 f774, f622, f762, f773; +mul.f32 f775, f772, f470; +mul.f32 f776, f774, f488; +sub.f32 f777, f775, f776; +mul.f32 f778, f772, f488; +fma.rn.f32 f779, f774, f470, f778; +mul.f32 f780, f621, f772; +mul.f32 f781, f622, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f621, f774; +fma.rn.f32 f784, f622, f772, f783; +mul.f32 f785, f782, f514; +mul.f32 f786, f784, f532; +sub.f32 f787, f785, f786; +mul.f32 f788, f782, f532; +fma.rn.f32 f789, f784, f514, f788; +mul.f32 f790, f621, f782; +mul.f32 f791, f622, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f621, f784; +fma.rn.f32 f794, f622, f782, f793; +mul.f32 f795, f792, f558; +mul.f32 f796, f794, f576; +sub.f32 f797, f795, f796; +mul.f32 f798, f792, f576; +fma.rn.f32 f799, f794, f558, f798; +mul.f32 f800, f621, f792; +mul.f32 f801, f622, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f621, f794; +fma.rn.f32 f804, f622, f792, f803; +mul.f32 f805, f802, f602; +mul.f32 f806, f804, f620; +sub.f32 f807, f805, f806; +mul.f32 f808, f802, f620; +fma.rn.f32 f809, f804, f602, f808; +mul.f32 f810, f621, f802; +mul.f32 f811, f622, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f621, f804; +fma.rn.f32 f814, f622, f802, f813; +mul.f32 f815, f812, f418; +mul.f32 f816, f814, f436; +sub.f32 f817, f815, f816; +mul.f32 f818, f812, f436; +fma.rn.f32 f819, f814, f418, f818; +mul.f32 f820, f621, f812; +mul.f32 f821, f622, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f621, f814; +fma.rn.f32 f824, f622, f812, f823; +mul.f32 f825, f822, f462; +mul.f32 f826, f824, f480; +sub.f32 f827, f825, f826; +mul.f32 f828, f822, f480; +fma.rn.f32 f829, f824, f462, f828; +mul.f32 f830, f621, f822; +mul.f32 f831, f622, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f621, f824; +fma.rn.f32 f834, f622, f822, f833; +mul.f32 f835, f832, f506; +mul.f32 f836, f834, f524; +sub.f32 f837, f835, f836; +mul.f32 f838, f832, f524; +fma.rn.f32 f839, f834, f506, f838; +mul.f32 f840, f621, f832; +mul.f32 f841, f622, f834; +sub.f32 f842, f840, f841; 
+mul.f32 f843, f621, f834; +fma.rn.f32 f844, f622, f832, f843; +mul.f32 f845, f842, f550; +mul.f32 f846, f844, f568; +sub.f32 f847, f845, f846; +mul.f32 f848, f842, f568; +fma.rn.f32 f849, f844, f550, f848; +mul.f32 f850, f621, f842; +mul.f32 f851, f622, f844; +sub.f32 f852, f850, f851; +mul.f32 f853, f621, f844; +fma.rn.f32 f854, f622, f842, f853; +mul.f32 f855, f852, f594; +mul.f32 f856, f854, f612; +sub.f32 f857, f855, f856; +mul.f32 f858, f852, f612; +fma.rn.f32 f859, f854, f594, f858; +mad.lo.s32 r8, r5, 12500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f404; +st.shared.f32 [r9+4], f627; +st.shared.f32 [r9+8], f637; +st.shared.f32 [r9+12], f647; +st.shared.f32 [r9+16], f657; +st.shared.f32 [r9+20], f667; +st.shared.f32 [r9+24], f677; +st.shared.f32 [r9+28], f687; +st.shared.f32 [r9+32], f697; +st.shared.f32 [r9+36], f707; +st.shared.f32 [r9+40], f717; +st.shared.f32 [r9+44], f727; +st.shared.f32 [r9+48], f737; +st.shared.f32 [r9+52], f747; +st.shared.f32 [r9+56], f757; +st.shared.f32 [r9+60], f767; +st.shared.f32 [r9+64], f777; +st.shared.f32 [r9+68], f787; +st.shared.f32 [r9+72], f797; +st.shared.f32 [r9+76], f807; +st.shared.f32 [r9+80], f817; +st.shared.f32 [r9+84], f827; +st.shared.f32 [r9+88], f837; +st.shared.f32 [r9+92], f847; +st.shared.f32 [r9+96], f857; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f860, [r10]; +ld.shared.f32 f861, [r10+500]; +ld.shared.f32 f862, [r10+1000]; +ld.shared.f32 f863, [r10+1500]; +ld.shared.f32 f864, [r10+2000]; +ld.shared.f32 f865, [r10+2500]; +ld.shared.f32 f866, [r10+3000]; +ld.shared.f32 f867, [r10+3500]; +ld.shared.f32 f868, [r10+4000]; +ld.shared.f32 f869, [r10+4500]; +ld.shared.f32 f870, [r10+5000]; +ld.shared.f32 f871, [r10+5500]; +ld.shared.f32 f872, [r10+6000]; +ld.shared.f32 f873, [r10+6500]; +ld.shared.f32 f874, [r10+7000]; +ld.shared.f32 f875, [r10+7500]; +ld.shared.f32 f876, [r10+8000]; +ld.shared.f32 f877, [r10+8500]; +ld.shared.f32 f878, [r10+9000]; +ld.shared.f32 
f879, [r10+9500]; +ld.shared.f32 f880, [r10+10000]; +ld.shared.f32 f881, [r10+10500]; +ld.shared.f32 f882, [r10+11000]; +ld.shared.f32 f883, [r10+11500]; +ld.shared.f32 f884, [r10+12000]; +barrier.sync 0; +st.shared.f32 [r9], f408; +st.shared.f32 [r9+4], f629; +st.shared.f32 [r9+8], f639; +st.shared.f32 [r9+12], f649; +st.shared.f32 [r9+16], f659; +st.shared.f32 [r9+20], f669; +st.shared.f32 [r9+24], f679; +st.shared.f32 [r9+28], f689; +st.shared.f32 [r9+32], f699; +st.shared.f32 [r9+36], f709; +st.shared.f32 [r9+40], f719; +st.shared.f32 [r9+44], f729; +st.shared.f32 [r9+48], f739; +st.shared.f32 [r9+52], f749; +st.shared.f32 [r9+56], f759; +st.shared.f32 [r9+60], f769; +st.shared.f32 [r9+64], f779; +st.shared.f32 [r9+68], f789; +st.shared.f32 [r9+72], f799; +st.shared.f32 [r9+76], f809; +st.shared.f32 [r9+80], f819; +st.shared.f32 [r9+84], f829; +st.shared.f32 [r9+88], f839; +st.shared.f32 [r9+92], f849; +st.shared.f32 [r9+96], f859; +barrier.sync 0; +ld.shared.f32 f885, [r10]; +ld.shared.f32 f886, [r10+500]; +ld.shared.f32 f887, [r10+1000]; +ld.shared.f32 f888, [r10+1500]; +ld.shared.f32 f889, [r10+2000]; +ld.shared.f32 f890, [r10+2500]; +ld.shared.f32 f891, [r10+3000]; +ld.shared.f32 f892, [r10+3500]; +ld.shared.f32 f893, [r10+4000]; +ld.shared.f32 f894, [r10+4500]; +ld.shared.f32 f895, [r10+5000]; +ld.shared.f32 f896, [r10+5500]; +ld.shared.f32 f897, [r10+6000]; +ld.shared.f32 f898, [r10+6500]; +ld.shared.f32 f899, [r10+7000]; +ld.shared.f32 f900, [r10+7500]; +ld.shared.f32 f901, [r10+8000]; +ld.shared.f32 f902, [r10+8500]; +ld.shared.f32 f903, [r10+9000]; +ld.shared.f32 f904, [r10+9500]; +ld.shared.f32 f905, [r10+10000]; +ld.shared.f32 f906, [r10+10500]; +ld.shared.f32 f907, [r10+11000]; +ld.shared.f32 f908, [r10+11500]; +ld.shared.f32 f909, [r10+12000]; +add.f32 f910, f865, f880; +add.f32 f911, f860, f910; +add.f32 f912, f870, f875; +add.f32 f913, f912, f911; +add.f32 f914, f890, f905; +add.f32 f915, f885, f914; +add.f32 f916, f895, f900; +add.f32 f917, 
f916, f915; +fma.rn.f32 f918, f910, 0f3E9E377A, f860; +mul.f32 f919, f912, 0f3F4F1BBD; +sub.f32 f920, f918, f919; +sub.f32 f921, f890, f905; +mul.f32 f922, f921, 0f3F737871; +sub.f32 f923, f895, f900; +mul.f32 f924, f923, 0fBF167918; +sub.f32 f925, f924, f922; +sub.f32 f926, f920, f925; +add.f32 f927, f925, f920; +mul.f32 f928, f910, 0f3F4F1BBD; +sub.f32 f929, f860, f928; +fma.rn.f32 f930, f912, 0f3E9E377A, f929; +mul.f32 f931, f921, 0f3F167918; +mul.f32 f932, f923, 0f3F737871; +sub.f32 f933, f932, f931; +sub.f32 f934, f930, f933; +add.f32 f935, f933, f930; +fma.rn.f32 f936, f914, 0f3E9E377A, f885; +mul.f32 f937, f916, 0f3F4F1BBD; +sub.f32 f938, f936, f937; +sub.f32 f939, f865, f880; +mul.f32 f940, f939, 0f3F737871; +sub.f32 f941, f870, f875; +mul.f32 f942, f941, 0fBF167918; +sub.f32 f943, f942, f940; +add.f32 f944, f943, f938; +sub.f32 f945, f938, f943; +mul.f32 f946, f914, 0f3F4F1BBD; +sub.f32 f947, f885, f946; +fma.rn.f32 f948, f916, 0f3E9E377A, f947; +mul.f32 f949, f939, 0f3F167918; +mul.f32 f950, f941, 0f3F737871; +sub.f32 f951, f950, f949; +add.f32 f952, f951, f948; +sub.f32 f953, f948, f951; +add.f32 f954, f866, f881; +add.f32 f955, f861, f954; +add.f32 f956, f871, f876; +add.f32 f957, f956, f955; +add.f32 f958, f891, f906; +add.f32 f959, f886, f958; +add.f32 f960, f896, f901; +add.f32 f961, f960, f959; +fma.rn.f32 f962, f954, 0f3E9E377A, f861; +mul.f32 f963, f956, 0f3F4F1BBD; +sub.f32 f964, f962, f963; +sub.f32 f965, f891, f906; +mul.f32 f966, f965, 0f3F737871; +sub.f32 f967, f896, f901; +mul.f32 f968, f967, 0fBF167918; +sub.f32 f969, f968, f966; +sub.f32 f970, f964, f969; +add.f32 f971, f969, f964; +mul.f32 f972, f954, 0f3F4F1BBD; +sub.f32 f973, f861, f972; +fma.rn.f32 f974, f956, 0f3E9E377A, f973; +mul.f32 f975, f965, 0f3F167918; +mul.f32 f976, f967, 0f3F737871; +sub.f32 f977, f976, f975; +sub.f32 f978, f974, f977; +add.f32 f979, f977, f974; +fma.rn.f32 f980, f958, 0f3E9E377A, f886; +mul.f32 f981, f960, 0f3F4F1BBD; +sub.f32 f982, f980, f981; +sub.f32 
f983, f866, f881; +mul.f32 f984, f983, 0f3F737871; +sub.f32 f985, f871, f876; +mul.f32 f986, f985, 0fBF167918; +sub.f32 f987, f986, f984; +add.f32 f988, f987, f982; +sub.f32 f989, f982, f987; +mul.f32 f990, f958, 0f3F4F1BBD; +sub.f32 f991, f886, f990; +fma.rn.f32 f992, f960, 0f3E9E377A, f991; +mul.f32 f993, f983, 0f3F167918; +mul.f32 f994, f985, 0f3F737871; +sub.f32 f995, f994, f993; +add.f32 f996, f995, f992; +sub.f32 f997, f992, f995; +add.f32 f998, f867, f882; +add.f32 f999, f862, f998; +add.f32 f1000, f872, f877; +add.f32 f1001, f1000, f999; +add.f32 f1002, f892, f907; +add.f32 f1003, f887, f1002; +add.f32 f1004, f897, f902; +add.f32 f1005, f1004, f1003; +fma.rn.f32 f1006, f998, 0f3E9E377A, f862; +mul.f32 f1007, f1000, 0f3F4F1BBD; +sub.f32 f1008, f1006, f1007; +sub.f32 f1009, f892, f907; +mul.f32 f1010, f1009, 0f3F737871; +sub.f32 f1011, f897, f902; +mul.f32 f1012, f1011, 0fBF167918; +sub.f32 f1013, f1012, f1010; +sub.f32 f1014, f1008, f1013; +add.f32 f1015, f1013, f1008; +mul.f32 f1016, f998, 0f3F4F1BBD; +sub.f32 f1017, f862, f1016; +fma.rn.f32 f1018, f1000, 0f3E9E377A, f1017; +mul.f32 f1019, f1009, 0f3F167918; +mul.f32 f1020, f1011, 0f3F737871; +sub.f32 f1021, f1020, f1019; +sub.f32 f1022, f1018, f1021; +add.f32 f1023, f1021, f1018; +fma.rn.f32 f1024, f1002, 0f3E9E377A, f887; +mul.f32 f1025, f1004, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f867, f882; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f872, f877; +mul.f32 f1030, f1029, 0fBF167918; +sub.f32 f1031, f1030, f1028; +add.f32 f1032, f1031, f1026; +sub.f32 f1033, f1026, f1031; +mul.f32 f1034, f1002, 0f3F4F1BBD; +sub.f32 f1035, f887, f1034; +fma.rn.f32 f1036, f1004, 0f3E9E377A, f1035; +mul.f32 f1037, f1027, 0f3F167918; +mul.f32 f1038, f1029, 0f3F737871; +sub.f32 f1039, f1038, f1037; +add.f32 f1040, f1039, f1036; +sub.f32 f1041, f1036, f1039; +add.f32 f1042, f868, f883; +add.f32 f1043, f863, f1042; +add.f32 f1044, f873, f878; +add.f32 f1045, f1044, f1043; +add.f32 f1046, f893, f908; 
+add.f32 f1047, f888, f1046; +add.f32 f1048, f898, f903; +add.f32 f1049, f1048, f1047; +fma.rn.f32 f1050, f1042, 0f3E9E377A, f863; +mul.f32 f1051, f1044, 0f3F4F1BBD; +sub.f32 f1052, f1050, f1051; +sub.f32 f1053, f893, f908; +mul.f32 f1054, f1053, 0f3F737871; +sub.f32 f1055, f898, f903; +mul.f32 f1056, f1055, 0fBF167918; +sub.f32 f1057, f1056, f1054; +sub.f32 f1058, f1052, f1057; +add.f32 f1059, f1057, f1052; +mul.f32 f1060, f1042, 0f3F4F1BBD; +sub.f32 f1061, f863, f1060; +fma.rn.f32 f1062, f1044, 0f3E9E377A, f1061; +mul.f32 f1063, f1053, 0f3F167918; +mul.f32 f1064, f1055, 0f3F737871; +sub.f32 f1065, f1064, f1063; +sub.f32 f1066, f1062, f1065; +add.f32 f1067, f1065, f1062; +fma.rn.f32 f1068, f1046, 0f3E9E377A, f888; +mul.f32 f1069, f1048, 0f3F4F1BBD; +sub.f32 f1070, f1068, f1069; +sub.f32 f1071, f868, f883; +mul.f32 f1072, f1071, 0f3F737871; +sub.f32 f1073, f873, f878; +mul.f32 f1074, f1073, 0fBF167918; +sub.f32 f1075, f1074, f1072; +add.f32 f1076, f1075, f1070; +sub.f32 f1077, f1070, f1075; +mul.f32 f1078, f1046, 0f3F4F1BBD; +sub.f32 f1079, f888, f1078; +fma.rn.f32 f1080, f1048, 0f3E9E377A, f1079; +mul.f32 f1081, f1071, 0f3F167918; +mul.f32 f1082, f1073, 0f3F737871; +sub.f32 f1083, f1082, f1081; +add.f32 f1084, f1083, f1080; +sub.f32 f1085, f1080, f1083; +add.f32 f1086, f869, f884; +add.f32 f1087, f864, f1086; +add.f32 f1088, f874, f879; +add.f32 f1089, f1088, f1087; +add.f32 f1090, f894, f909; +add.f32 f1091, f889, f1090; +add.f32 f1092, f899, f904; +add.f32 f1093, f1092, f1091; +fma.rn.f32 f1094, f1086, 0f3E9E377A, f864; +mul.f32 f1095, f1088, 0f3F4F1BBD; +sub.f32 f1096, f1094, f1095; +sub.f32 f1097, f894, f909; +mul.f32 f1098, f1097, 0f3F737871; +sub.f32 f1099, f899, f904; +mul.f32 f1100, f1099, 0fBF167918; +sub.f32 f1101, f1100, f1098; +sub.f32 f1102, f1096, f1101; +add.f32 f1103, f1101, f1096; +mul.f32 f1104, f1086, 0f3F4F1BBD; +sub.f32 f1105, f864, f1104; +fma.rn.f32 f1106, f1088, 0f3E9E377A, f1105; +mul.f32 f1107, f1097, 0f3F167918; +mul.f32 f1108, f1099, 
0f3F737871; +sub.f32 f1109, f1108, f1107; +sub.f32 f1110, f1106, f1109; +add.f32 f1111, f1109, f1106; +fma.rn.f32 f1112, f1090, 0f3E9E377A, f889; +mul.f32 f1113, f1092, 0f3F4F1BBD; +sub.f32 f1114, f1112, f1113; +sub.f32 f1115, f869, f884; +mul.f32 f1116, f1115, 0f3F737871; +sub.f32 f1117, f874, f879; +mul.f32 f1118, f1117, 0fBF167918; +sub.f32 f1119, f1118, f1116; +add.f32 f1120, f1119, f1114; +sub.f32 f1121, f1114, f1119; +mul.f32 f1122, f1090, 0f3F4F1BBD; +sub.f32 f1123, f889, f1122; +fma.rn.f32 f1124, f1092, 0f3E9E377A, f1123; +mul.f32 f1125, f1115, 0f3F167918; +mul.f32 f1126, f1117, 0f3F737871; +sub.f32 f1127, f1126, f1125; +add.f32 f1128, f1127, f1124; +sub.f32 f1129, f1124, f1127; +mul.f32 f1130, f970, 0f3F77F511; +mul.f32 f1131, f988, 0fBE7EA890; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f988, 0f3F77F511; +fma.rn.f32 f1134, f970, 0fBE7EA890, f1133; +mul.f32 f1135, f1014, 0f3F6055A2; +mul.f32 f1136, f1032, 0fBEF6A86B; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1032, 0f3F6055A2; +fma.rn.f32 f1139, f1014, 0fBEF6A86B, f1138; +mul.f32 f1140, f1058, 0f3F3A9DB0; +mul.f32 f1141, f1076, 0fBF2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f1076, 0f3F3A9DB0; +fma.rn.f32 f1144, f1058, 0fBF2F3E7B, f1143; +mul.f32 f1145, f1102, 0f3F092BF2; +mul.f32 f1146, f1120, 0fBF5825E0; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1120, 0f3F092BF2; +fma.rn.f32 f1149, f1102, 0fBF5825E0, f1148; +mul.f32 f1150, f978, 0f3F6055A2; +mul.f32 f1151, f996, 0fBEF6A86B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f996, 0f3F6055A2; +fma.rn.f32 f1154, f978, 0fBEF6A86B, f1153; +mul.f32 f1155, f1022, 0f3F092BF2; +mul.f32 f1156, f1040, 0fBF5825E0; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1040, 0f3F092BF2; +fma.rn.f32 f1159, f1022, 0fBF5825E0, f1158; +mul.f32 f1160, f1066, 0f3D809851; +mul.f32 f1161, f1084, 0fBF7F7EAE; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f1084, 0f3D809851; +fma.rn.f32 f1164, f1066, 0fBF7F7EAE, f1163; +mul.f32 f1165, f1110, 0fBED9FFBE; +mul.f32 f1166, 
f1128, 0fBF67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1128, 0fBED9FFBE; +fma.rn.f32 f1169, f1110, 0fBF67A2BF, f1168; +mul.f32 f1170, f979, 0f3F3A9DB0; +mul.f32 f1171, f997, 0fBF2F3E7B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f997, 0f3F3A9DB0; +fma.rn.f32 f1174, f979, 0fBF2F3E7B, f1173; +mul.f32 f1175, f1023, 0f3D809851; +mul.f32 f1176, f1041, 0fBF7F7EAE; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1041, 0f3D809851; +fma.rn.f32 f1179, f1023, 0fBF7F7EAE, f1178; +mul.f32 f1180, f1067, 0fBF232E38; +mul.f32 f1181, f1085, 0fBF45405B; +sub.f32 f1182, f1180, f1181; +mul.f32 f1183, f1085, 0fBF232E38; +fma.rn.f32 f1184, f1067, 0fBF45405B, f1183; +mul.f32 f1185, f1111, 0fBF7DFB3B; +mul.f32 f1186, f1129, 0fBE00575B; +sub.f32 f1187, f1185, f1186; +mul.f32 f1188, f1129, 0fBF7DFB3B; +fma.rn.f32 f1189, f1111, 0fBE00575B, f1188; +mul.f32 f1190, f971, 0f3F092BF2; +mul.f32 f1191, f989, 0fBF5825E0; +sub.f32 f1192, f1190, f1191; +mul.f32 f1193, f989, 0f3F092BF2; +fma.rn.f32 f1194, f971, 0fBF5825E0, f1193; +mul.f32 f1195, f1015, 0fBED9FFBE; +mul.f32 f1196, f1033, 0fBF67A2BF; +sub.f32 f1197, f1195, f1196; +mul.f32 f1198, f1033, 0fBED9FFBE; +fma.rn.f32 f1199, f1015, 0fBF67A2BF, f1198; +mul.f32 f1200, f1059, 0fBF7DFB3B; +mul.f32 f1201, f1077, 0fBE00575B; +sub.f32 f1202, f1200, f1201; +mul.f32 f1203, f1077, 0fBF7DFB3B; +fma.rn.f32 f1204, f1059, 0fBE00575B, f1203; +mul.f32 f1205, f1103, 0fBF232E38; +mul.f32 f1206, f1121, 0f3F45405B; +sub.f32 f1207, f1205, f1206; +mul.f32 f1208, f1121, 0fBF232E38; +fma.rn.f32 f1209, f1103, 0f3F45405B, f1208; +add.f32 f1210, f957, f1089; +add.f32 f1211, f913, f1210; +add.f32 f1212, f1001, f1045; +add.f32 f1213, f1212, f1211; +add.f32 f1214, f961, f1093; +add.f32 f1215, f917, f1214; +add.f32 f1216, f1005, f1049; +add.f32 f1217, f1216, f1215; +fma.rn.f32 f1218, f1210, 0f3E9E377A, f913; +mul.f32 f1219, f1212, 0f3F4F1BBD; +sub.f32 f1220, f1218, f1219; +sub.f32 f1221, f961, f1093; +mul.f32 f1222, f1221, 0f3F737871; +sub.f32 f1223, f1005, f1049; 
+mul.f32 f1224, f1223, 0fBF167918; +sub.f32 f1225, f1224, f1222; +sub.f32 f1226, f1220, f1225; +add.f32 f1227, f1225, f1220; +mul.f32 f1228, f1210, 0f3F4F1BBD; +sub.f32 f1229, f913, f1228; +fma.rn.f32 f1230, f1212, 0f3E9E377A, f1229; +mul.f32 f1231, f1221, 0f3F167918; +mul.f32 f1232, f1223, 0f3F737871; +sub.f32 f1233, f1232, f1231; +sub.f32 f1234, f1230, f1233; +add.f32 f1235, f1233, f1230; +fma.rn.f32 f1236, f1214, 0f3E9E377A, f917; +mul.f32 f1237, f1216, 0f3F4F1BBD; +sub.f32 f1238, f1236, f1237; +sub.f32 f1239, f957, f1089; +mul.f32 f1240, f1239, 0f3F737871; +sub.f32 f1241, f1001, f1045; +mul.f32 f1242, f1241, 0fBF167918; +sub.f32 f1243, f1242, f1240; +add.f32 f1244, f1243, f1238; +sub.f32 f1245, f1238, f1243; +mul.f32 f1246, f1214, 0f3F4F1BBD; +sub.f32 f1247, f917, f1246; +fma.rn.f32 f1248, f1216, 0f3E9E377A, f1247; +mul.f32 f1249, f1239, 0f3F167918; +mul.f32 f1250, f1241, 0f3F737871; +sub.f32 f1251, f1250, f1249; +add.f32 f1252, f1251, f1248; +sub.f32 f1253, f1248, f1251; +add.f32 f1254, f1132, f1147; +add.f32 f1255, f926, f1254; +add.f32 f1256, f1137, f1142; +add.f32 f1257, f1256, f1255; +add.f32 f1258, f1134, f1149; +add.f32 f1259, f944, f1258; +add.f32 f1260, f1139, f1144; +add.f32 f1261, f1260, f1259; +fma.rn.f32 f1262, f1254, 0f3E9E377A, f926; +mul.f32 f1263, f1256, 0f3F4F1BBD; +sub.f32 f1264, f1262, f1263; +sub.f32 f1265, f1134, f1149; +mul.f32 f1266, f1265, 0f3F737871; +sub.f32 f1267, f1139, f1144; +mul.f32 f1268, f1267, 0fBF167918; +sub.f32 f1269, f1268, f1266; +sub.f32 f1270, f1264, f1269; +add.f32 f1271, f1269, f1264; +mul.f32 f1272, f1254, 0f3F4F1BBD; +sub.f32 f1273, f926, f1272; +fma.rn.f32 f1274, f1256, 0f3E9E377A, f1273; +mul.f32 f1275, f1265, 0f3F167918; +mul.f32 f1276, f1267, 0f3F737871; +sub.f32 f1277, f1276, f1275; +sub.f32 f1278, f1274, f1277; +add.f32 f1279, f1277, f1274; +fma.rn.f32 f1280, f1258, 0f3E9E377A, f944; +mul.f32 f1281, f1260, 0f3F4F1BBD; +sub.f32 f1282, f1280, f1281; +sub.f32 f1283, f1132, f1147; +mul.f32 f1284, f1283, 
0f3F737871; +sub.f32 f1285, f1137, f1142; +mul.f32 f1286, f1285, 0fBF167918; +sub.f32 f1287, f1286, f1284; +add.f32 f1288, f1287, f1282; +sub.f32 f1289, f1282, f1287; +mul.f32 f1290, f1258, 0f3F4F1BBD; +sub.f32 f1291, f944, f1290; +fma.rn.f32 f1292, f1260, 0f3E9E377A, f1291; +mul.f32 f1293, f1283, 0f3F167918; +mul.f32 f1294, f1285, 0f3F737871; +sub.f32 f1295, f1294, f1293; +add.f32 f1296, f1295, f1292; +sub.f32 f1297, f1292, f1295; +add.f32 f1298, f1152, f1167; +add.f32 f1299, f934, f1298; +add.f32 f1300, f1157, f1162; +add.f32 f1301, f1300, f1299; +add.f32 f1302, f1154, f1169; +add.f32 f1303, f952, f1302; +add.f32 f1304, f1159, f1164; +add.f32 f1305, f1304, f1303; +fma.rn.f32 f1306, f1298, 0f3E9E377A, f934; +mul.f32 f1307, f1300, 0f3F4F1BBD; +sub.f32 f1308, f1306, f1307; +sub.f32 f1309, f1154, f1169; +mul.f32 f1310, f1309, 0f3F737871; +sub.f32 f1311, f1159, f1164; +mul.f32 f1312, f1311, 0fBF167918; +sub.f32 f1313, f1312, f1310; +sub.f32 f1314, f1308, f1313; +add.f32 f1315, f1313, f1308; +mul.f32 f1316, f1298, 0f3F4F1BBD; +sub.f32 f1317, f934, f1316; +fma.rn.f32 f1318, f1300, 0f3E9E377A, f1317; +mul.f32 f1319, f1309, 0f3F167918; +mul.f32 f1320, f1311, 0f3F737871; +sub.f32 f1321, f1320, f1319; +sub.f32 f1322, f1318, f1321; +add.f32 f1323, f1321, f1318; +fma.rn.f32 f1324, f1302, 0f3E9E377A, f952; +mul.f32 f1325, f1304, 0f3F4F1BBD; +sub.f32 f1326, f1324, f1325; +sub.f32 f1327, f1152, f1167; +mul.f32 f1328, f1327, 0f3F737871; +sub.f32 f1329, f1157, f1162; +mul.f32 f1330, f1329, 0fBF167918; +sub.f32 f1331, f1330, f1328; +add.f32 f1332, f1331, f1326; +sub.f32 f1333, f1326, f1331; +mul.f32 f1334, f1302, 0f3F4F1BBD; +sub.f32 f1335, f952, f1334; +fma.rn.f32 f1336, f1304, 0f3E9E377A, f1335; +mul.f32 f1337, f1327, 0f3F167918; +mul.f32 f1338, f1329, 0f3F737871; +sub.f32 f1339, f1338, f1337; +add.f32 f1340, f1339, f1336; +sub.f32 f1341, f1336, f1339; +add.f32 f1342, f1172, f1187; +add.f32 f1343, f935, f1342; +add.f32 f1344, f1177, f1182; +add.f32 f1345, f1344, f1343; +add.f32 
f1346, f1174, f1189; +add.f32 f1347, f953, f1346; +add.f32 f1348, f1179, f1184; +add.f32 f1349, f1348, f1347; +fma.rn.f32 f1350, f1342, 0f3E9E377A, f935; +mul.f32 f1351, f1344, 0f3F4F1BBD; +sub.f32 f1352, f1350, f1351; +sub.f32 f1353, f1174, f1189; +mul.f32 f1354, f1353, 0f3F737871; +sub.f32 f1355, f1179, f1184; +mul.f32 f1356, f1355, 0fBF167918; +sub.f32 f1357, f1356, f1354; +sub.f32 f1358, f1352, f1357; +add.f32 f1359, f1357, f1352; +mul.f32 f1360, f1342, 0f3F4F1BBD; +sub.f32 f1361, f935, f1360; +fma.rn.f32 f1362, f1344, 0f3E9E377A, f1361; +mul.f32 f1363, f1353, 0f3F167918; +mul.f32 f1364, f1355, 0f3F737871; +sub.f32 f1365, f1364, f1363; +sub.f32 f1366, f1362, f1365; +add.f32 f1367, f1365, f1362; +fma.rn.f32 f1368, f1346, 0f3E9E377A, f953; +mul.f32 f1369, f1348, 0f3F4F1BBD; +sub.f32 f1370, f1368, f1369; +sub.f32 f1371, f1172, f1187; +mul.f32 f1372, f1371, 0f3F737871; +sub.f32 f1373, f1177, f1182; +mul.f32 f1374, f1373, 0fBF167918; +sub.f32 f1375, f1374, f1372; +add.f32 f1376, f1375, f1370; +sub.f32 f1377, f1370, f1375; +mul.f32 f1378, f1346, 0f3F4F1BBD; +sub.f32 f1379, f953, f1378; +fma.rn.f32 f1380, f1348, 0f3E9E377A, f1379; +mul.f32 f1381, f1371, 0f3F167918; +mul.f32 f1382, f1373, 0f3F737871; +sub.f32 f1383, f1382, f1381; +add.f32 f1384, f1383, f1380; +sub.f32 f1385, f1380, f1383; +add.f32 f1386, f1192, f1207; +add.f32 f1387, f927, f1386; +add.f32 f1388, f1197, f1202; +add.f32 f1389, f1388, f1387; +add.f32 f1390, f1194, f1209; +add.f32 f1391, f945, f1390; +add.f32 f1392, f1199, f1204; +add.f32 f1393, f1392, f1391; +fma.rn.f32 f1394, f1386, 0f3E9E377A, f927; +mul.f32 f1395, f1388, 0f3F4F1BBD; +sub.f32 f1396, f1394, f1395; +sub.f32 f1397, f1194, f1209; +mul.f32 f1398, f1397, 0f3F737871; +sub.f32 f1399, f1199, f1204; +mul.f32 f1400, f1399, 0fBF167918; +sub.f32 f1401, f1400, f1398; +sub.f32 f1402, f1396, f1401; +add.f32 f1403, f1401, f1396; +mul.f32 f1404, f1386, 0f3F4F1BBD; +sub.f32 f1405, f927, f1404; +fma.rn.f32 f1406, f1388, 0f3E9E377A, f1405; +mul.f32 f1407, 
f1397, 0f3F167918; +mul.f32 f1408, f1399, 0f3F737871; +sub.f32 f1409, f1408, f1407; +sub.f32 f1410, f1406, f1409; +add.f32 f1411, f1409, f1406; +fma.rn.f32 f1412, f1390, 0f3E9E377A, f945; +mul.f32 f1413, f1392, 0f3F4F1BBD; +sub.f32 f1414, f1412, f1413; +sub.f32 f1415, f1192, f1207; +mul.f32 f1416, f1415, 0f3F737871; +sub.f32 f1417, f1197, f1202; +mul.f32 f1418, f1417, 0fBF167918; +sub.f32 f1419, f1418, f1416; +add.f32 f1420, f1419, f1414; +sub.f32 f1421, f1414, f1419; +mul.f32 f1422, f1390, 0f3F4F1BBD; +sub.f32 f1423, f945, f1422; +fma.rn.f32 f1424, f1392, 0f3E9E377A, f1423; +mul.f32 f1425, f1415, 0f3F167918; +mul.f32 f1426, f1417, 0f3F737871; +sub.f32 f1427, f1426, f1425; +add.f32 f1428, f1427, f1424; +sub.f32 f1429, f1424, f1427; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %52; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1430, f1431}, [rd11]; +mul.f32 f1434, f1430, f1257; +mul.f32 f1435, f1431, f1261; +sub.f32 f1436, f1434, f1435; +mul.f32 f1437, f1430, f1261; +fma.rn.f32 f1438, f1431, f1257, f1437; +mul.f32 f1439, f1430, f1430; +mul.f32 f1440, f1431, f1431; +sub.f32 f1441, f1439, f1440; +mul.f32 f1442, f1431, f1430; +fma.rn.f32 f1443, f1431, f1430, f1442; +mul.f32 f1444, f1441, f1301; +mul.f32 f1445, f1443, f1305; +sub.f32 f1446, f1444, f1445; +mul.f32 f1447, f1441, f1305; +fma.rn.f32 f1448, f1443, f1301, f1447; +mul.f32 f1449, f1430, f1441; +mul.f32 f1450, f1431, f1443; +sub.f32 f1451, f1449, f1450; +mul.f32 f1452, f1430, f1443; +fma.rn.f32 f1453, f1431, f1441, f1452; +mul.f32 f1454, f1451, f1345; +mul.f32 f1455, f1453, f1349; +sub.f32 f1456, f1454, f1455; +mul.f32 f1457, f1451, f1349; +fma.rn.f32 f1458, f1453, f1345, f1457; +mul.f32 f1459, f1430, f1451; +mul.f32 f1460, f1431, f1453; +sub.f32 f1461, f1459, f1460; +mul.f32 f1462, f1430, f1453; +fma.rn.f32 f1463, f1431, f1451, f1462; +mul.f32 f1464, f1461, f1389; +mul.f32 f1465, f1463, 
f1393; +sub.f32 f1466, f1464, f1465; +mul.f32 f1467, f1461, f1393; +fma.rn.f32 f1468, f1463, f1389, f1467; +mul.f32 f1469, f1430, f1461; +mul.f32 f1470, f1431, f1463; +sub.f32 f1471, f1469, f1470; +mul.f32 f1472, f1430, f1463; +fma.rn.f32 f1473, f1431, f1461, f1472; +mul.f32 f1474, f1471, f1226; +mul.f32 f1475, f1473, f1244; +sub.f32 f1476, f1474, f1475; +mul.f32 f1477, f1471, f1244; +fma.rn.f32 f1478, f1473, f1226, f1477; +mul.f32 f1479, f1430, f1471; +mul.f32 f1480, f1431, f1473; +sub.f32 f1481, f1479, f1480; +mul.f32 f1482, f1430, f1473; +fma.rn.f32 f1483, f1431, f1471, f1482; +mul.f32 f1484, f1481, f1270; +mul.f32 f1485, f1483, f1288; +sub.f32 f1486, f1484, f1485; +mul.f32 f1487, f1481, f1288; +fma.rn.f32 f1488, f1483, f1270, f1487; +mul.f32 f1489, f1430, f1481; +mul.f32 f1490, f1431, f1483; +sub.f32 f1491, f1489, f1490; +mul.f32 f1492, f1430, f1483; +fma.rn.f32 f1493, f1431, f1481, f1492; +mul.f32 f1494, f1491, f1314; +mul.f32 f1495, f1493, f1332; +sub.f32 f1496, f1494, f1495; +mul.f32 f1497, f1491, f1332; +fma.rn.f32 f1498, f1493, f1314, f1497; +mul.f32 f1499, f1430, f1491; +mul.f32 f1500, f1431, f1493; +sub.f32 f1501, f1499, f1500; +mul.f32 f1502, f1430, f1493; +fma.rn.f32 f1503, f1431, f1491, f1502; +mul.f32 f1504, f1501, f1358; +mul.f32 f1505, f1503, f1376; +sub.f32 f1506, f1504, f1505; +mul.f32 f1507, f1501, f1376; +fma.rn.f32 f1508, f1503, f1358, f1507; +mul.f32 f1509, f1430, f1501; +mul.f32 f1510, f1431, f1503; +sub.f32 f1511, f1509, f1510; +mul.f32 f1512, f1430, f1503; +fma.rn.f32 f1513, f1431, f1501, f1512; +mul.f32 f1514, f1511, f1402; +mul.f32 f1515, f1513, f1420; +sub.f32 f1516, f1514, f1515; +mul.f32 f1517, f1511, f1420; +fma.rn.f32 f1518, f1513, f1402, f1517; +mul.f32 f1519, f1430, f1511; +mul.f32 f1520, f1431, f1513; +sub.f32 f1521, f1519, f1520; +mul.f32 f1522, f1430, f1513; +fma.rn.f32 f1523, f1431, f1511, f1522; +mul.f32 f1524, f1521, f1234; +mul.f32 f1525, f1523, f1252; +sub.f32 f1526, f1524, f1525; +mul.f32 f1527, f1521, f1252; +fma.rn.f32 
f1528, f1523, f1234, f1527; +mul.f32 f1529, f1430, f1521; +mul.f32 f1530, f1431, f1523; +sub.f32 f1531, f1529, f1530; +mul.f32 f1532, f1430, f1523; +fma.rn.f32 f1533, f1431, f1521, f1532; +mul.f32 f1534, f1531, f1278; +mul.f32 f1535, f1533, f1296; +sub.f32 f1536, f1534, f1535; +mul.f32 f1537, f1531, f1296; +fma.rn.f32 f1538, f1533, f1278, f1537; +mul.f32 f1539, f1430, f1531; +mul.f32 f1540, f1431, f1533; +sub.f32 f1541, f1539, f1540; +mul.f32 f1542, f1430, f1533; +fma.rn.f32 f1543, f1431, f1531, f1542; +mul.f32 f1544, f1541, f1322; +mul.f32 f1545, f1543, f1340; +sub.f32 f1546, f1544, f1545; +mul.f32 f1547, f1541, f1340; +fma.rn.f32 f1548, f1543, f1322, f1547; +mul.f32 f1549, f1430, f1541; +mul.f32 f1550, f1431, f1543; +sub.f32 f1551, f1549, f1550; +mul.f32 f1552, f1430, f1543; +fma.rn.f32 f1553, f1431, f1541, f1552; +mul.f32 f1554, f1551, f1366; +mul.f32 f1555, f1553, f1384; +sub.f32 f1556, f1554, f1555; +mul.f32 f1557, f1551, f1384; +fma.rn.f32 f1558, f1553, f1366, f1557; +mul.f32 f1559, f1430, f1551; +mul.f32 f1560, f1431, f1553; +sub.f32 f1561, f1559, f1560; +mul.f32 f1562, f1430, f1553; +fma.rn.f32 f1563, f1431, f1551, f1562; +mul.f32 f1564, f1561, f1410; +mul.f32 f1565, f1563, f1428; +sub.f32 f1566, f1564, f1565; +mul.f32 f1567, f1561, f1428; +fma.rn.f32 f1568, f1563, f1410, f1567; +mul.f32 f1569, f1430, f1561; +mul.f32 f1570, f1431, f1563; +sub.f32 f1571, f1569, f1570; +mul.f32 f1572, f1430, f1563; +fma.rn.f32 f1573, f1431, f1561, f1572; +mul.f32 f1574, f1571, f1235; +mul.f32 f1575, f1573, f1253; +sub.f32 f1576, f1574, f1575; +mul.f32 f1577, f1571, f1253; +fma.rn.f32 f1578, f1573, f1235, f1577; +mul.f32 f1579, f1430, f1571; +mul.f32 f1580, f1431, f1573; +sub.f32 f1581, f1579, f1580; +mul.f32 f1582, f1430, f1573; +fma.rn.f32 f1583, f1431, f1571, f1582; +mul.f32 f1584, f1581, f1279; +mul.f32 f1585, f1583, f1297; +sub.f32 f1586, f1584, f1585; +mul.f32 f1587, f1581, f1297; +fma.rn.f32 f1588, f1583, f1279, f1587; +mul.f32 f1589, f1430, f1581; +mul.f32 f1590, 
f1431, f1583; +sub.f32 f1591, f1589, f1590; +mul.f32 f1592, f1430, f1583; +fma.rn.f32 f1593, f1431, f1581, f1592; +mul.f32 f1594, f1591, f1323; +mul.f32 f1595, f1593, f1341; +sub.f32 f1596, f1594, f1595; +mul.f32 f1597, f1591, f1341; +fma.rn.f32 f1598, f1593, f1323, f1597; +mul.f32 f1599, f1430, f1591; +mul.f32 f1600, f1431, f1593; +sub.f32 f1601, f1599, f1600; +mul.f32 f1602, f1430, f1593; +fma.rn.f32 f1603, f1431, f1591, f1602; +mul.f32 f1604, f1601, f1367; +mul.f32 f1605, f1603, f1385; +sub.f32 f1606, f1604, f1605; +mul.f32 f1607, f1601, f1385; +fma.rn.f32 f1608, f1603, f1367, f1607; +mul.f32 f1609, f1430, f1601; +mul.f32 f1610, f1431, f1603; +sub.f32 f1611, f1609, f1610; +mul.f32 f1612, f1430, f1603; +fma.rn.f32 f1613, f1431, f1601, f1612; +mul.f32 f1614, f1611, f1411; +mul.f32 f1615, f1613, f1429; +sub.f32 f1616, f1614, f1615; +mul.f32 f1617, f1611, f1429; +fma.rn.f32 f1618, f1613, f1411, f1617; +mul.f32 f1619, f1430, f1611; +mul.f32 f1620, f1431, f1613; +sub.f32 f1621, f1619, f1620; +mul.f32 f1622, f1430, f1613; +fma.rn.f32 f1623, f1431, f1611, f1622; +mul.f32 f1624, f1621, f1227; +mul.f32 f1625, f1623, f1245; +sub.f32 f1626, f1624, f1625; +mul.f32 f1627, f1621, f1245; +fma.rn.f32 f1628, f1623, f1227, f1627; +mul.f32 f1629, f1430, f1621; +mul.f32 f1630, f1431, f1623; +sub.f32 f1631, f1629, f1630; +mul.f32 f1632, f1430, f1623; +fma.rn.f32 f1633, f1431, f1621, f1632; +mul.f32 f1634, f1631, f1271; +mul.f32 f1635, f1633, f1289; +sub.f32 f1636, f1634, f1635; +mul.f32 f1637, f1631, f1289; +fma.rn.f32 f1638, f1633, f1271, f1637; +mul.f32 f1639, f1430, f1631; +mul.f32 f1640, f1431, f1633; +sub.f32 f1641, f1639, f1640; +mul.f32 f1642, f1430, f1633; +fma.rn.f32 f1643, f1431, f1631, f1642; +mul.f32 f1644, f1641, f1315; +mul.f32 f1645, f1643, f1333; +sub.f32 f1646, f1644, f1645; +mul.f32 f1647, f1641, f1333; +fma.rn.f32 f1648, f1643, f1315, f1647; +mul.f32 f1649, f1430, f1641; +mul.f32 f1650, f1431, f1643; +sub.f32 f1651, f1649, f1650; +mul.f32 f1652, f1430, f1643; 
+fma.rn.f32 f1653, f1431, f1641, f1652; +mul.f32 f1654, f1651, f1359; +mul.f32 f1655, f1653, f1377; +sub.f32 f1656, f1654, f1655; +mul.f32 f1657, f1651, f1377; +fma.rn.f32 f1658, f1653, f1359, f1657; +mul.f32 f1659, f1430, f1651; +mul.f32 f1660, f1431, f1653; +sub.f32 f1661, f1659, f1660; +mul.f32 f1662, f1430, f1653; +fma.rn.f32 f1663, f1431, f1651, f1662; +mul.f32 f1664, f1661, f1403; +mul.f32 f1665, f1663, f1421; +sub.f32 f1666, f1664, f1665; +mul.f32 f1667, f1661, f1421; +fma.rn.f32 f1668, f1663, f1403, f1667; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2500, r15; +st.shared.f32 [r16], f1213; +st.shared.f32 [r16+100], f1436; +st.shared.f32 [r16+200], f1446; +st.shared.f32 [r16+300], f1456; +st.shared.f32 [r16+400], f1466; +st.shared.f32 [r16+500], f1476; +st.shared.f32 [r16+600], f1486; +st.shared.f32 [r16+700], f1496; +st.shared.f32 [r16+800], f1506; +st.shared.f32 [r16+900], f1516; +st.shared.f32 [r16+1000], f1526; +st.shared.f32 [r16+1100], f1536; +st.shared.f32 [r16+1200], f1546; +st.shared.f32 [r16+1300], f1556; +st.shared.f32 [r16+1400], f1566; +st.shared.f32 [r16+1500], f1576; +st.shared.f32 [r16+1600], f1586; +st.shared.f32 [r16+1700], f1596; +st.shared.f32 [r16+1800], f1606; +st.shared.f32 [r16+1900], f1616; +st.shared.f32 [r16+2000], f1626; +st.shared.f32 [r16+2100], f1636; +st.shared.f32 [r16+2200], f1646; +st.shared.f32 [r16+2300], f1656; +st.shared.f32 [r16+2400], f1666; +barrier.sync 0; +ld.shared.f32 f1669, [r10]; +ld.shared.f32 f1670, [r10+500]; +ld.shared.f32 f1671, [r10+1000]; +ld.shared.f32 f1672, [r10+1500]; +ld.shared.f32 f1673, [r10+2000]; +ld.shared.f32 f1674, [r10+2500]; +ld.shared.f32 f1675, [r10+3000]; +ld.shared.f32 f1676, [r10+3500]; +ld.shared.f32 f1677, [r10+4000]; +ld.shared.f32 f1678, [r10+4500]; +ld.shared.f32 f1679, [r10+5000]; +ld.shared.f32 f1680, [r10+5500]; +ld.shared.f32 f1681, [r10+6000]; +ld.shared.f32 f1682, [r10+6500]; +ld.shared.f32 f1683, [r10+7000]; +ld.shared.f32 f1684, 
[r10+7500]; +ld.shared.f32 f1685, [r10+8000]; +ld.shared.f32 f1686, [r10+8500]; +ld.shared.f32 f1687, [r10+9000]; +ld.shared.f32 f1688, [r10+9500]; +ld.shared.f32 f1689, [r10+10000]; +ld.shared.f32 f1690, [r10+10500]; +ld.shared.f32 f1691, [r10+11000]; +ld.shared.f32 f1692, [r10+11500]; +ld.shared.f32 f1693, [r10+12000]; +barrier.sync 0; +st.shared.f32 [r16], f1217; +st.shared.f32 [r16+100], f1438; +st.shared.f32 [r16+200], f1448; +st.shared.f32 [r16+300], f1458; +st.shared.f32 [r16+400], f1468; +st.shared.f32 [r16+500], f1478; +st.shared.f32 [r16+600], f1488; +st.shared.f32 [r16+700], f1498; +st.shared.f32 [r16+800], f1508; +st.shared.f32 [r16+900], f1518; +st.shared.f32 [r16+1000], f1528; +st.shared.f32 [r16+1100], f1538; +st.shared.f32 [r16+1200], f1548; +st.shared.f32 [r16+1300], f1558; +st.shared.f32 [r16+1400], f1568; +st.shared.f32 [r16+1500], f1578; +st.shared.f32 [r16+1600], f1588; +st.shared.f32 [r16+1700], f1598; +st.shared.f32 [r16+1800], f1608; +st.shared.f32 [r16+1900], f1618; +st.shared.f32 [r16+2000], f1628; +st.shared.f32 [r16+2100], f1638; +st.shared.f32 [r16+2200], f1648; +st.shared.f32 [r16+2300], f1658; +st.shared.f32 [r16+2400], f1668; +barrier.sync 0; +ld.shared.f32 f1694, [r10]; +ld.shared.f32 f1695, [r10+500]; +ld.shared.f32 f1696, [r10+1000]; +ld.shared.f32 f1697, [r10+1500]; +ld.shared.f32 f1698, [r10+2000]; +ld.shared.f32 f1699, [r10+2500]; +ld.shared.f32 f1700, [r10+3000]; +ld.shared.f32 f1701, [r10+3500]; +ld.shared.f32 f1702, [r10+4000]; +ld.shared.f32 f1703, [r10+4500]; +ld.shared.f32 f1704, [r10+5000]; +ld.shared.f32 f1705, [r10+5500]; +ld.shared.f32 f1706, [r10+6000]; +ld.shared.f32 f1707, [r10+6500]; +ld.shared.f32 f1708, [r10+7000]; +ld.shared.f32 f1709, [r10+7500]; +ld.shared.f32 f1710, [r10+8000]; +ld.shared.f32 f1711, [r10+8500]; +ld.shared.f32 f1712, [r10+9000]; +ld.shared.f32 f1713, [r10+9500]; +ld.shared.f32 f1714, [r10+10000]; +ld.shared.f32 f1715, [r10+10500]; +ld.shared.f32 f1716, [r10+11000]; +ld.shared.f32 f1717, 
[r10+11500]; +ld.shared.f32 f1718, [r10+12000]; +add.f32 f1719, f1674, f1689; +add.f32 f1720, f1669, f1719; +add.f32 f1721, f1679, f1684; +add.f32 f1722, f1699, f1714; +add.f32 f1723, f1694, f1722; +add.f32 f1724, f1704, f1709; +fma.rn.f32 f1725, f1719, 0f3E9E377A, f1669; +mul.f32 f1726, f1721, 0f3F4F1BBD; +sub.f32 f1727, f1725, f1726; +sub.f32 f1728, f1699, f1714; +mul.f32 f1729, f1728, 0f3F737871; +sub.f32 f1730, f1704, f1709; +mul.f32 f1731, f1730, 0fBF167918; +sub.f32 f1732, f1731, f1729; +mul.f32 f1733, f1719, 0f3F4F1BBD; +sub.f32 f1734, f1669, f1733; +fma.rn.f32 f1735, f1721, 0f3E9E377A, f1734; +mul.f32 f1736, f1728, 0f3F167918; +mul.f32 f1737, f1730, 0f3F737871; +sub.f32 f1738, f1737, f1736; +fma.rn.f32 f1739, f1722, 0f3E9E377A, f1694; +mul.f32 f1740, f1724, 0f3F4F1BBD; +sub.f32 f1741, f1739, f1740; +sub.f32 f1742, f1674, f1689; +mul.f32 f1743, f1742, 0f3F737871; +sub.f32 f1744, f1679, f1684; +mul.f32 f1745, f1744, 0fBF167918; +sub.f32 f1746, f1745, f1743; +mul.f32 f1747, f1722, 0f3F4F1BBD; +sub.f32 f1748, f1694, f1747; +fma.rn.f32 f1749, f1724, 0f3E9E377A, f1748; +mul.f32 f1750, f1742, 0f3F167918; +mul.f32 f1751, f1744, 0f3F737871; +sub.f32 f1752, f1751, f1750; +add.f32 f1753, f1675, f1690; +add.f32 f1754, f1670, f1753; +add.f32 f1755, f1680, f1685; +add.f32 f1756, f1700, f1715; +add.f32 f1757, f1695, f1756; +add.f32 f1758, f1705, f1710; +fma.rn.f32 f1759, f1753, 0f3E9E377A, f1670; +mul.f32 f1760, f1755, 0f3F4F1BBD; +sub.f32 f1761, f1759, f1760; +sub.f32 f1762, f1700, f1715; +mul.f32 f1763, f1762, 0f3F737871; +sub.f32 f1764, f1705, f1710; +mul.f32 f1765, f1764, 0fBF167918; +sub.f32 f1766, f1765, f1763; +mul.f32 f1767, f1753, 0f3F4F1BBD; +sub.f32 f1768, f1670, f1767; +fma.rn.f32 f1769, f1755, 0f3E9E377A, f1768; +mul.f32 f1770, f1762, 0f3F167918; +mul.f32 f1771, f1764, 0f3F737871; +sub.f32 f1772, f1771, f1770; +fma.rn.f32 f1773, f1756, 0f3E9E377A, f1695; +mul.f32 f1774, f1758, 0f3F4F1BBD; +sub.f32 f1775, f1773, f1774; +sub.f32 f1776, f1675, f1690; +mul.f32 
f1777, f1776, 0f3F737871; +sub.f32 f1778, f1680, f1685; +mul.f32 f1779, f1778, 0fBF167918; +sub.f32 f1780, f1779, f1777; +mul.f32 f1781, f1756, 0f3F4F1BBD; +sub.f32 f1782, f1695, f1781; +fma.rn.f32 f1783, f1758, 0f3E9E377A, f1782; +mul.f32 f1784, f1776, 0f3F167918; +mul.f32 f1785, f1778, 0f3F737871; +sub.f32 f1786, f1785, f1784; +add.f32 f1787, f1676, f1691; +add.f32 f1788, f1671, f1787; +add.f32 f1789, f1681, f1686; +add.f32 f1790, f1701, f1716; +add.f32 f1791, f1696, f1790; +add.f32 f1792, f1706, f1711; +fma.rn.f32 f1793, f1787, 0f3E9E377A, f1671; +mul.f32 f1794, f1789, 0f3F4F1BBD; +sub.f32 f1795, f1793, f1794; +sub.f32 f1796, f1701, f1716; +mul.f32 f1797, f1796, 0f3F737871; +sub.f32 f1798, f1706, f1711; +mul.f32 f1799, f1798, 0fBF167918; +sub.f32 f1800, f1799, f1797; +mul.f32 f1801, f1787, 0f3F4F1BBD; +sub.f32 f1802, f1671, f1801; +fma.rn.f32 f1803, f1789, 0f3E9E377A, f1802; +mul.f32 f1804, f1796, 0f3F167918; +mul.f32 f1805, f1798, 0f3F737871; +sub.f32 f1806, f1805, f1804; +fma.rn.f32 f1807, f1790, 0f3E9E377A, f1696; +mul.f32 f1808, f1792, 0f3F4F1BBD; +sub.f32 f1809, f1807, f1808; +sub.f32 f1810, f1676, f1691; +mul.f32 f1811, f1810, 0f3F737871; +sub.f32 f1812, f1681, f1686; +mul.f32 f1813, f1812, 0fBF167918; +sub.f32 f1814, f1813, f1811; +mul.f32 f1815, f1790, 0f3F4F1BBD; +sub.f32 f1816, f1696, f1815; +fma.rn.f32 f1817, f1792, 0f3E9E377A, f1816; +mul.f32 f1818, f1810, 0f3F167918; +mul.f32 f1819, f1812, 0f3F737871; +sub.f32 f1820, f1819, f1818; +add.f32 f1821, f1677, f1692; +add.f32 f1822, f1672, f1821; +add.f32 f1823, f1682, f1687; +add.f32 f1824, f1702, f1717; +add.f32 f1825, f1697, f1824; +add.f32 f1826, f1707, f1712; +fma.rn.f32 f1827, f1821, 0f3E9E377A, f1672; +mul.f32 f1828, f1823, 0f3F4F1BBD; +sub.f32 f1829, f1827, f1828; +sub.f32 f1830, f1702, f1717; +mul.f32 f1831, f1830, 0f3F737871; +sub.f32 f1832, f1707, f1712; +mul.f32 f1833, f1832, 0fBF167918; +sub.f32 f1834, f1833, f1831; +mul.f32 f1835, f1821, 0f3F4F1BBD; +sub.f32 f1836, f1672, f1835; +fma.rn.f32 
f1837, f1823, 0f3E9E377A, f1836; +mul.f32 f1838, f1830, 0f3F167918; +mul.f32 f1839, f1832, 0f3F737871; +sub.f32 f1840, f1839, f1838; +fma.rn.f32 f1841, f1824, 0f3E9E377A, f1697; +mul.f32 f1842, f1826, 0f3F4F1BBD; +sub.f32 f1843, f1841, f1842; +sub.f32 f1844, f1677, f1692; +mul.f32 f1845, f1844, 0f3F737871; +sub.f32 f1846, f1682, f1687; +mul.f32 f1847, f1846, 0fBF167918; +sub.f32 f1848, f1847, f1845; +mul.f32 f1849, f1824, 0f3F4F1BBD; +sub.f32 f1850, f1697, f1849; +fma.rn.f32 f1851, f1826, 0f3E9E377A, f1850; +mul.f32 f1852, f1844, 0f3F167918; +mul.f32 f1853, f1846, 0f3F737871; +sub.f32 f1854, f1853, f1852; +add.f32 f1855, f1678, f1693; +add.f32 f1856, f1673, f1855; +add.f32 f1857, f1683, f1688; +add.f32 f1858, f1703, f1718; +add.f32 f1859, f1698, f1858; +add.f32 f1860, f1708, f1713; +fma.rn.f32 f1861, f1855, 0f3E9E377A, f1673; +mul.f32 f1862, f1857, 0f3F4F1BBD; +sub.f32 f1863, f1861, f1862; +sub.f32 f1864, f1703, f1718; +mul.f32 f1865, f1864, 0f3F737871; +sub.f32 f1866, f1708, f1713; +mul.f32 f1867, f1866, 0fBF167918; +sub.f32 f1868, f1867, f1865; +mul.f32 f1869, f1855, 0f3F4F1BBD; +sub.f32 f1870, f1673, f1869; +fma.rn.f32 f1871, f1857, 0f3E9E377A, f1870; +mul.f32 f1872, f1864, 0f3F167918; +mul.f32 f1873, f1866, 0f3F737871; +sub.f32 f1874, f1873, f1872; +fma.rn.f32 f1875, f1858, 0f3E9E377A, f1698; +mul.f32 f1876, f1860, 0f3F4F1BBD; +sub.f32 f1877, f1875, f1876; +sub.f32 f1878, f1678, f1693; +mul.f32 f1879, f1878, 0f3F737871; +sub.f32 f1880, f1683, f1688; +mul.f32 f1881, f1880, 0fBF167918; +sub.f32 f1882, f1881, f1879; +mul.f32 f1883, f1858, 0f3F4F1BBD; +sub.f32 f1884, f1698, f1883; +fma.rn.f32 f1885, f1860, 0f3E9E377A, f1884; +mul.f32 f1886, f1878, 0f3F167918; +mul.f32 f1887, f1880, 0f3F737871; +sub.f32 f1888, f1887, f1886; +add.f32 %0, f1721, f1720; +add.f32 %1, f1724, f1723; +add.f32 %2, f1755, f1754; +add.f32 %3, f1758, f1757; +add.f32 %4, f1789, f1788; +add.f32 %5, f1792, f1791; +add.f32 %6, f1823, f1822; +add.f32 %7, f1826, f1825; +add.f32 %8, f1857, f1856; 
+add.f32 %9, f1860, f1859; +add.f32 %11, f1746, f1741; +sub.f32 %10, f1727, f1732; +add.f32 %13, f1780, f1775; +sub.f32 %12, f1761, f1766; +add.f32 %15, f1814, f1809; +sub.f32 %14, f1795, f1800; +add.f32 %17, f1848, f1843; +sub.f32 %16, f1829, f1834; +add.f32 %19, f1882, f1877; +sub.f32 %18, f1863, f1868; +sub.f32 %20, f1735, f1738; +add.f32 %21, f1752, f1749; +sub.f32 %22, f1769, f1772; +add.f32 %23, f1786, f1783; +sub.f32 %24, f1803, f1806; +add.f32 %25, f1820, f1817; +sub.f32 %26, f1837, f1840; +add.f32 %27, f1854, f1851; +sub.f32 %28, f1871, f1874; +add.f32 %29, f1888, f1885; +add.f32 %30, f1738, f1735; +sub.f32 %31, f1749, f1752; +add.f32 %32, f1772, f1769; +sub.f32 %33, f1783, f1786; +add.f32 %34, f1806, f1803; +sub.f32 %35, f1817, f1820; +add.f32 %36, f1840, f1837; +sub.f32 %37, f1851, f1854; +add.f32 %38, f1874, f1871; +sub.f32 %39, f1885, f1888; +sub.f32 %41, f1741, f1746; +add.f32 %40, f1732, f1727; +sub.f32 %43, f1775, f1780; +add.f32 %42, f1766, f1761; +sub.f32 %45, f1809, f1814; +add.f32 %44, f1800, f1795; +sub.f32 %47, f1843, f1848; +add.f32 %46, f1834, f1829; +sub.f32 %49, f1877, f1882; +add.f32 %48, f1868, f1863; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), 
"=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_3125), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<168, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<477>; +.reg .b32 r<30>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 25000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %17, %25; +add.f32 f22, %15, f21; +add.f32 f23, %20, %23; +add.f32 f24, %19, %26; +add.f32 f25, %16, f24; +add.f32 f26, %22, %24; +fma.rn.f32 f27, f21, 0f3E9E377A, %15; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %19, %26; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %22, %24; +mul.f32 f33, f32, 0fBF167918; +sub.f32 f34, f33, f31; +sub.f32 f35, f29, f34; +add.f32 f36, f34, f29; +mul.f32 f37, f21, 0f3F4F1BBD; +sub.f32 f38, %15, f37; +fma.rn.f32 f39, f23, 
0f3E9E377A, f38; +mul.f32 f40, f30, 0f3F167918; +mul.f32 f41, f32, 0f3F737871; +sub.f32 f42, f41, f40; +sub.f32 f43, f39, f42; +add.f32 f44, f42, f39; +fma.rn.f32 f45, f24, 0f3E9E377A, %16; +mul.f32 f46, f26, 0f3F4F1BBD; +sub.f32 f47, f45, f46; +sub.f32 f48, %17, %25; +mul.f32 f49, f48, 0f3F737871; +sub.f32 f50, %20, %23; +mul.f32 f51, f50, 0fBF167918; +sub.f32 f52, f51, f49; +add.f32 f53, f52, f47; +sub.f32 f54, f47, f52; +mul.f32 f55, f24, 0f3F4F1BBD; +sub.f32 f56, %16, f55; +fma.rn.f32 f57, f26, 0f3E9E377A, f56; +mul.f32 f58, f48, 0f3F167918; +mul.f32 f59, f50, 0f3F737871; +sub.f32 f60, f59, f58; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 25000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f63, f35; +mul.f32 f68, f64, f53; +mul.f32 f69, f63, f53; +mul.f32 f70, f63, f63; +mul.f32 f71, f64, f64; +sub.f32 f72, f70, f71; +mul.f32 f73, f64, f63; +fma.rn.f32 f74, f64, f63, f73; +mul.f32 f75, f72, f43; +mul.f32 f76, f74, f61; +mul.f32 f77, f72, f61; +mul.f32 f78, f63, f72; +mul.f32 f79, f64, f74; +sub.f32 f80, f78, f79; +mul.f32 f81, f63, f74; +fma.rn.f32 f82, f64, f72, f81; +mul.f32 f83, f80, f44; +mul.f32 f84, f82, f62; +mul.f32 f85, f80, f62; +mul.f32 f86, f63, f80; +mul.f32 f87, f64, f82; +sub.f32 f88, f86, f87; +mul.f32 f89, f63, f82; +fma.rn.f32 f90, f64, f80, f89; +mul.f32 f91, f88, f36; +mul.f32 f92, f90, f54; +mul.f32 f93, f88, f54; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f94, f26, f25; +add.f32 f95, f23, f22; +st.shared.v2.f32 [r9], {f95, f94}; +fma.rn.f32 f96, f64, f35, f69; +sub.f32 f97, f67, f68; +st.shared.v2.f32 [r9+8], {f97, f96}; +fma.rn.f32 f98, f74, f43, f77; +sub.f32 f99, f75, f76; +st.shared.v2.f32 [r9+16], {f99, f98}; +sub.f32 f100, f83, f84; +fma.rn.f32 f101, f82, f44, f85; +st.shared.v2.f32 [r9+24], {f100, 
f101}; +fma.rn.f32 f102, f90, f36, f93; +sub.f32 f103, f91, f92; +st.shared.v2.f32 [r9+32], {f103, f102}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f104, f105}, [r11]; +ld.shared.v2.f32 {f108, f109}, [r11+5000]; +ld.shared.v2.f32 {f112, f113}, [r11+10000]; +ld.shared.v2.f32 {f116, f117}, [r11+15000]; +ld.shared.v2.f32 {f120, f121}, [r11+20000]; +add.f32 f124, f108, f120; +add.f32 f125, f104, f124; +add.f32 f126, f112, f116; +add.f32 f127, f109, f121; +add.f32 f128, f105, f127; +add.f32 f129, f113, f117; +fma.rn.f32 f130, f124, 0f3E9E377A, f104; +mul.f32 f131, f126, 0f3F4F1BBD; +sub.f32 f132, f130, f131; +sub.f32 f133, f109, f121; +mul.f32 f134, f133, 0f3F737871; +sub.f32 f135, f113, f117; +mul.f32 f136, f135, 0fBF167918; +sub.f32 f137, f136, f134; +sub.f32 f138, f132, f137; +add.f32 f139, f137, f132; +mul.f32 f140, f124, 0f3F4F1BBD; +sub.f32 f141, f104, f140; +fma.rn.f32 f142, f126, 0f3E9E377A, f141; +mul.f32 f143, f133, 0f3F167918; +mul.f32 f144, f135, 0f3F737871; +sub.f32 f145, f144, f143; +sub.f32 f146, f142, f145; +add.f32 f147, f145, f142; +fma.rn.f32 f148, f127, 0f3E9E377A, f105; +mul.f32 f149, f129, 0f3F4F1BBD; +sub.f32 f150, f148, f149; +sub.f32 f151, f108, f120; +mul.f32 f152, f151, 0f3F737871; +sub.f32 f153, f112, f116; +mul.f32 f154, f153, 0fBF167918; +sub.f32 f155, f154, f152; +add.f32 f156, f155, f150; +sub.f32 f157, f150, f155; +mul.f32 f158, f127, 0f3F4F1BBD; +sub.f32 f159, f105, f158; +fma.rn.f32 f160, f129, 0f3E9E377A, f159; +mul.f32 f161, f151, 0f3F167918; +mul.f32 f162, f153, 0f3F737871; +sub.f32 f163, f162, f161; +add.f32 f164, f163, f160; +sub.f32 f165, f160, f163; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f166, f167}, [rd11]; +mul.f32 f170, f166, f138; +mul.f32 f171, f167, f156; +mul.f32 f172, f166, f156; +mul.f32 f173, f166, 
f166; +mul.f32 f174, f167, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f167, f166; +fma.rn.f32 f177, f167, f166, f176; +mul.f32 f178, f175, f146; +mul.f32 f179, f177, f164; +mul.f32 f180, f175, f164; +mul.f32 f181, f166, f175; +mul.f32 f182, f167, f177; +sub.f32 f183, f181, f182; +mul.f32 f184, f166, f177; +fma.rn.f32 f185, f167, f175, f184; +mul.f32 f186, f183, f147; +mul.f32 f187, f185, f165; +mul.f32 f188, f183, f165; +mul.f32 f189, f166, f183; +mul.f32 f190, f167, f185; +sub.f32 f191, f189, f190; +mul.f32 f192, f166, f185; +fma.rn.f32 f193, f167, f183, f192; +mul.f32 f194, f191, f139; +mul.f32 f195, f193, f157; +mul.f32 f196, f191, f157; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 f197, f129, f128; +add.f32 f198, f126, f125; +st.shared.v2.f32 [r17], {f198, f197}; +fma.rn.f32 f199, f167, f138, f172; +sub.f32 f200, f170, f171; +st.shared.v2.f32 [r17+40], {f200, f199}; +fma.rn.f32 f201, f177, f146, f180; +sub.f32 f202, f178, f179; +st.shared.v2.f32 [r17+80], {f202, f201}; +fma.rn.f32 f203, f185, f147, f188; +sub.f32 f204, f186, f187; +st.shared.v2.f32 [r17+120], {f204, f203}; +fma.rn.f32 f205, f193, f139, f196; +sub.f32 f206, f194, f195; +st.shared.v2.f32 [r17+160], {f206, f205}; +barrier.sync 0; +ld.shared.v2.f32 {f207, f208}, [r11]; +ld.shared.v2.f32 {f211, f212}, [r11+5000]; +ld.shared.v2.f32 {f215, f216}, [r11+10000]; +ld.shared.v2.f32 {f219, f220}, [r11+15000]; +ld.shared.v2.f32 {f223, f224}, [r11+20000]; +add.f32 f227, f211, f223; +add.f32 f228, f207, f227; +add.f32 f229, f215, f219; +add.f32 f230, f212, f224; +add.f32 f231, f208, f230; +add.f32 f232, f216, f220; +fma.rn.f32 f233, f227, 0f3E9E377A, f207; +mul.f32 f234, f229, 0f3F4F1BBD; +sub.f32 f235, f233, f234; +sub.f32 f236, f212, f224; +mul.f32 f237, f236, 0f3F737871; +sub.f32 f238, f216, f220; +mul.f32 f239, f238, 0fBF167918; +sub.f32 f240, f239, f237; +sub.f32 f241, f235, f240; +add.f32 f242, f240, f235; +mul.f32 f243, f227, 0f3F4F1BBD; 
+sub.f32 f244, f207, f243; +fma.rn.f32 f245, f229, 0f3E9E377A, f244; +mul.f32 f246, f236, 0f3F167918; +mul.f32 f247, f238, 0f3F737871; +sub.f32 f248, f247, f246; +sub.f32 f249, f245, f248; +add.f32 f250, f248, f245; +fma.rn.f32 f251, f230, 0f3E9E377A, f208; +mul.f32 f252, f232, 0f3F4F1BBD; +sub.f32 f253, f251, f252; +sub.f32 f254, f211, f223; +mul.f32 f255, f254, 0f3F737871; +sub.f32 f256, f215, f219; +mul.f32 f257, f256, 0fBF167918; +sub.f32 f258, f257, f255; +add.f32 f259, f258, f253; +sub.f32 f260, f253, f258; +mul.f32 f261, f230, 0f3F4F1BBD; +sub.f32 f262, f208, f261; +fma.rn.f32 f263, f232, 0f3E9E377A, f262; +mul.f32 f264, f254, 0f3F167918; +mul.f32 f265, f256, 0f3F737871; +sub.f32 f266, f265, f264; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f269, f270}, [rd16]; +mul.f32 f273, f269, f241; +mul.f32 f274, f270, f259; +mul.f32 f275, f269, f259; +mul.f32 f276, f269, f269; +mul.f32 f277, f270, f270; +sub.f32 f278, f276, f277; +mul.f32 f279, f270, f269; +fma.rn.f32 f280, f270, f269, f279; +mul.f32 f281, f278, f249; +mul.f32 f282, f280, f267; +mul.f32 f283, f278, f267; +mul.f32 f284, f269, f278; +mul.f32 f285, f270, f280; +sub.f32 f286, f284, f285; +mul.f32 f287, f269, f280; +fma.rn.f32 f288, f270, f278, f287; +mul.f32 f289, f286, f250; +mul.f32 f290, f288, f268; +mul.f32 f291, f286, f268; +mul.f32 f292, f269, f286; +mul.f32 f293, f270, f288; +sub.f32 f294, f292, f293; +mul.f32 f295, f269, f288; +fma.rn.f32 f296, f270, f286, f295; +mul.f32 f297, f294, f242; +mul.f32 f298, f296, f260; +mul.f32 f299, f294, f260; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +add.f32 f300, f232, f231; +add.f32 f301, f229, f228; +st.shared.v2.f32 [r23], {f301, f300}; +fma.rn.f32 f302, f270, f241, f275; 
+sub.f32 f303, f273, f274; +st.shared.v2.f32 [r23+200], {f303, f302}; +fma.rn.f32 f304, f280, f249, f283; +sub.f32 f305, f281, f282; +st.shared.v2.f32 [r23+400], {f305, f304}; +fma.rn.f32 f306, f288, f250, f291; +sub.f32 f307, f289, f290; +st.shared.v2.f32 [r23+600], {f307, f306}; +fma.rn.f32 f308, f296, f242, f299; +sub.f32 f309, f297, f298; +st.shared.v2.f32 [r23+800], {f309, f308}; +barrier.sync 0; +ld.shared.v2.f32 {f310, f311}, [r11]; +ld.shared.v2.f32 {f314, f315}, [r11+5000]; +ld.shared.v2.f32 {f318, f319}, [r11+10000]; +ld.shared.v2.f32 {f322, f323}, [r11+15000]; +ld.shared.v2.f32 {f326, f327}, [r11+20000]; +add.f32 f330, f314, f326; +add.f32 f331, f310, f330; +add.f32 f332, f318, f322; +add.f32 f333, f315, f327; +add.f32 f334, f311, f333; +add.f32 f335, f319, f323; +fma.rn.f32 f336, f330, 0f3E9E377A, f310; +mul.f32 f337, f332, 0f3F4F1BBD; +sub.f32 f338, f336, f337; +sub.f32 f339, f315, f327; +mul.f32 f340, f339, 0f3F737871; +sub.f32 f341, f319, f323; +mul.f32 f342, f341, 0fBF167918; +sub.f32 f343, f342, f340; +sub.f32 f344, f338, f343; +add.f32 f345, f343, f338; +mul.f32 f346, f330, 0f3F4F1BBD; +sub.f32 f347, f310, f346; +fma.rn.f32 f348, f332, 0f3E9E377A, f347; +mul.f32 f349, f339, 0f3F167918; +mul.f32 f350, f341, 0f3F737871; +sub.f32 f351, f350, f349; +sub.f32 f352, f348, f351; +add.f32 f353, f351, f348; +fma.rn.f32 f354, f333, 0f3E9E377A, f311; +mul.f32 f355, f335, 0f3F4F1BBD; +sub.f32 f356, f354, f355; +sub.f32 f357, f314, f326; +mul.f32 f358, f357, 0f3F737871; +sub.f32 f359, f318, f322; +mul.f32 f360, f359, 0fBF167918; +sub.f32 f361, f360, f358; +add.f32 f362, f361, f356; +sub.f32 f363, f356, f361; +mul.f32 f364, f333, 0f3F4F1BBD; +sub.f32 f365, f311, f364; +fma.rn.f32 f366, f335, 0f3E9E377A, f365; +mul.f32 f367, f357, 0f3F167918; +mul.f32 f368, f359, 0f3F737871; +sub.f32 f369, f368, f367; +add.f32 f370, f369, f366; +sub.f32 f371, f366, f369; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; 
+sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 8; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f372, f373}, [rd21]; +mul.f32 f376, f372, f344; +mul.f32 f377, f373, f362; +mul.f32 f378, f372, f362; +mul.f32 f379, f372, f372; +mul.f32 f380, f373, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f373, f372; +fma.rn.f32 f383, f373, f372, f382; +mul.f32 f384, f381, f352; +mul.f32 f385, f383, f370; +mul.f32 f386, f381, f370; +mul.f32 f387, f372, f381; +mul.f32 f388, f373, f383; +sub.f32 f389, f387, f388; +mul.f32 f390, f372, f383; +fma.rn.f32 f391, f373, f381, f390; +mul.f32 f392, f389, f353; +mul.f32 f393, f391, f371; +mul.f32 f394, f389, f371; +mul.f32 f395, f372, f389; +mul.f32 f396, f373, f391; +sub.f32 f397, f395, f396; +mul.f32 f398, f372, f391; +fma.rn.f32 f399, f373, f389, f398; +mul.f32 f400, f397, f345; +mul.f32 f401, f399, f363; +mul.f32 f402, f397, f363; +shl.b32 r27, r26, 3; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 5000, r28; +add.f32 f403, f335, f334; +add.f32 f404, f332, f331; +st.shared.v2.f32 [r29], {f404, f403}; +fma.rn.f32 f405, f373, f344, f378; +sub.f32 f406, f376, f377; +st.shared.v2.f32 [r29+1000], {f406, f405}; +fma.rn.f32 f407, f383, f352, f386; +sub.f32 f408, f384, f385; +st.shared.v2.f32 [r29+2000], {f408, f407}; +fma.rn.f32 f409, f391, f353, f394; +sub.f32 f410, f392, f393; +st.shared.v2.f32 [r29+3000], {f410, f409}; +fma.rn.f32 f411, f399, f345, f402; +sub.f32 f412, f400, f401; +st.shared.v2.f32 [r29+4000], {f412, f411}; +barrier.sync 0; +ld.shared.v2.f32 {f413, f414}, [r11]; +ld.shared.v2.f32 {f417, f418}, [r11+5000]; +ld.shared.v2.f32 {f421, f422}, [r11+10000]; +ld.shared.v2.f32 {f425, f426}, [r11+15000]; +ld.shared.v2.f32 {f429, f430}, [r11+20000]; +add.f32 f433, f417, f429; +add.f32 f434, f413, f433; +add.f32 f435, f421, f425; +add.f32 f436, f418, f430; +add.f32 f437, f414, f436; +add.f32 f438, f422, f426; +fma.rn.f32 f439, f433, 0f3E9E377A, f413; +mul.f32 f440, f435, 0f3F4F1BBD; +sub.f32 f441, 
f439, f440; +sub.f32 f442, f418, f430; +mul.f32 f443, f442, 0f3F737871; +sub.f32 f444, f422, f426; +mul.f32 f445, f444, 0fBF167918; +sub.f32 f446, f445, f443; +mul.f32 f447, f433, 0f3F4F1BBD; +sub.f32 f448, f413, f447; +fma.rn.f32 f449, f435, 0f3E9E377A, f448; +mul.f32 f450, f442, 0f3F167918; +mul.f32 f451, f444, 0f3F737871; +sub.f32 f452, f451, f450; +fma.rn.f32 f453, f436, 0f3E9E377A, f414; +mul.f32 f454, f438, 0f3F4F1BBD; +sub.f32 f455, f453, f454; +sub.f32 f456, f417, f429; +mul.f32 f457, f456, 0f3F737871; +sub.f32 f458, f421, f425; +mul.f32 f459, f458, 0fBF167918; +sub.f32 f460, f459, f457; +mul.f32 f461, f436, 0f3F4F1BBD; +sub.f32 f462, f414, f461; +fma.rn.f32 f463, f438, 0f3E9E377A, f462; +mul.f32 f464, f456, 0f3F167918; +mul.f32 f465, f458, 0f3F737871; +sub.f32 f466, f465, f464; +add.f32 %1, f438, f437; +add.f32 %0, f435, f434; +add.f32 %3, f460, f455; +sub.f32 %2, f441, f446; +add.f32 %5, f466, f463; +sub.f32 %4, f449, f452; +sub.f32 %7, f463, f466; +add.f32 %6, f452, f449; +sub.f32 %9, f455, f460; +add.f32 %8, f446, f441; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_3125), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<169, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<437>; +.reg .b32 r<30>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 12500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %17, %25; +add.f32 f22, %15, f21; +add.f32 f23, %20, %23; +add.f32 f24, f23, f22; +add.f32 f25, %19, %26; +add.f32 f26, %16, f25; +add.f32 f27, %22, %24; +add.f32 
f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %15; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %19, %26; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %22, %24; +mul.f32 f35, f34, 0fBF167918; +sub.f32 f36, f35, f33; +sub.f32 f37, f31, f36; +add.f32 f38, f36, f31; +mul.f32 f39, f21, 0f3F4F1BBD; +sub.f32 f40, %15, f39; +fma.rn.f32 f41, f23, 0f3E9E377A, f40; +mul.f32 f42, f32, 0f3F167918; +mul.f32 f43, f34, 0f3F737871; +sub.f32 f44, f43, f42; +sub.f32 f45, f41, f44; +add.f32 f46, f44, f41; +fma.rn.f32 f47, f25, 0f3E9E377A, %16; +mul.f32 f48, f27, 0f3F4F1BBD; +sub.f32 f49, f47, f48; +sub.f32 f50, %17, %25; +mul.f32 f51, f50, 0f3F737871; +sub.f32 f52, %20, %23; +mul.f32 f53, f52, 0fBF167918; +sub.f32 f54, f53, f51; +add.f32 f55, f54, f49; +sub.f32 f56, f49, f54; +mul.f32 f57, f25, 0f3F4F1BBD; +sub.f32 f58, %16, f57; +fma.rn.f32 f59, f27, 0f3E9E377A, f58; +mul.f32 f60, f50, 0f3F167918; +mul.f32 f61, f52, 0f3F737871; +sub.f32 f62, f61, f60; +add.f32 f63, f62, f59; +sub.f32 f64, f59, f62; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f65, f66}, [rd6]; +mul.f32 f69, f65, f37; +mul.f32 f70, f66, f55; +sub.f32 f71, f69, f70; +mul.f32 f72, f65, f55; +fma.rn.f32 f73, f66, f37, f72; +mul.f32 f74, f65, f65; +mul.f32 f75, f66, f66; +sub.f32 f76, f74, f75; +mul.f32 f77, f66, f65; +fma.rn.f32 f78, f66, f65, f77; +mul.f32 f79, f76, f45; +mul.f32 f80, f78, f63; +sub.f32 f81, f79, f80; +mul.f32 f82, f76, f63; +fma.rn.f32 f83, f78, f45, f82; +mul.f32 f84, f65, f76; +mul.f32 f85, f66, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f65, f78; +fma.rn.f32 f88, f66, f76, f87; +mul.f32 f89, f86, f46; +mul.f32 f90, f88, f64; +sub.f32 f91, f89, f90; +mul.f32 f92, f86, f64; +fma.rn.f32 f93, f88, f46, f92; +mul.f32 f94, f65, f86; +mul.f32 f95, f66, f88; +sub.f32 f96, f94, f95; +mul.f32 f97, f65, f88; 
+fma.rn.f32 f98, f66, f86, f97; +mul.f32 f99, f96, f38; +mul.f32 f100, f98, f56; +sub.f32 f101, f99, f100; +mul.f32 f102, f96, f56; +fma.rn.f32 f103, f98, f38, f102; +mad.lo.s32 r8, r5, 12500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f104, [r11]; +ld.shared.f32 f105, [r11+2500]; +ld.shared.f32 f106, [r11+5000]; +ld.shared.f32 f107, [r11+7500]; +ld.shared.f32 f108, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f73; +st.shared.f32 [r9+8], f83; +st.shared.f32 [r9+12], f93; +st.shared.f32 [r9+16], f103; +barrier.sync 0; +ld.shared.f32 f109, [r11]; +ld.shared.f32 f110, [r11+2500]; +ld.shared.f32 f111, [r11+5000]; +ld.shared.f32 f112, [r11+7500]; +ld.shared.f32 f113, [r11+10000]; +add.f32 f114, f105, f108; +add.f32 f115, f104, f114; +add.f32 f116, f106, f107; +add.f32 f117, f116, f115; +add.f32 f118, f110, f113; +add.f32 f119, f109, f118; +add.f32 f120, f111, f112; +add.f32 f121, f120, f119; +fma.rn.f32 f122, f114, 0f3E9E377A, f104; +mul.f32 f123, f116, 0f3F4F1BBD; +sub.f32 f124, f122, f123; +sub.f32 f125, f110, f113; +mul.f32 f126, f125, 0f3F737871; +sub.f32 f127, f111, f112; +mul.f32 f128, f127, 0fBF167918; +sub.f32 f129, f128, f126; +sub.f32 f130, f124, f129; +add.f32 f131, f129, f124; +mul.f32 f132, f114, 0f3F4F1BBD; +sub.f32 f133, f104, f132; +fma.rn.f32 f134, f116, 0f3E9E377A, f133; +mul.f32 f135, f125, 0f3F167918; +mul.f32 f136, f127, 0f3F737871; +sub.f32 f137, f136, f135; +sub.f32 f138, f134, f137; +add.f32 f139, f137, f134; +fma.rn.f32 f140, f118, 0f3E9E377A, f109; +mul.f32 f141, f120, 0f3F4F1BBD; +sub.f32 f142, f140, f141; +sub.f32 f143, f105, f108; +mul.f32 f144, f143, 0f3F737871; +sub.f32 f145, f106, f107; +mul.f32 f146, f145, 0fBF167918; +sub.f32 f147, f146, f144; +add.f32 f148, f147, f142; +sub.f32 f149, 
f142, f147; +mul.f32 f150, f118, 0f3F4F1BBD; +sub.f32 f151, f109, f150; +fma.rn.f32 f152, f120, 0f3E9E377A, f151; +mul.f32 f153, f143, 0f3F167918; +mul.f32 f154, f145, 0f3F737871; +sub.f32 f155, f154, f153; +add.f32 f156, f155, f152; +sub.f32 f157, f152, f155; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f158, f159}, [rd11]; +mul.f32 f162, f158, f130; +mul.f32 f163, f159, f148; +sub.f32 f164, f162, f163; +mul.f32 f165, f158, f148; +fma.rn.f32 f166, f159, f130, f165; +mul.f32 f167, f158, f158; +mul.f32 f168, f159, f159; +sub.f32 f169, f167, f168; +mul.f32 f170, f159, f158; +fma.rn.f32 f171, f159, f158, f170; +mul.f32 f172, f169, f138; +mul.f32 f173, f171, f156; +sub.f32 f174, f172, f173; +mul.f32 f175, f169, f156; +fma.rn.f32 f176, f171, f138, f175; +mul.f32 f177, f158, f169; +mul.f32 f178, f159, f171; +sub.f32 f179, f177, f178; +mul.f32 f180, f158, f171; +fma.rn.f32 f181, f159, f169, f180; +mul.f32 f182, f179, f139; +mul.f32 f183, f181, f157; +sub.f32 f184, f182, f183; +mul.f32 f185, f179, f157; +fma.rn.f32 f186, f181, f139, f185; +mul.f32 f187, f158, f179; +mul.f32 f188, f159, f181; +sub.f32 f189, f187, f188; +mul.f32 f190, f158, f181; +fma.rn.f32 f191, f159, f179, f190; +mul.f32 f192, f189, f131; +mul.f32 f193, f191, f149; +sub.f32 f194, f192, f193; +mul.f32 f195, f189, f149; +fma.rn.f32 f196, f191, f131, f195; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f117; +st.shared.f32 [r17+20], f164; +st.shared.f32 [r17+40], f174; +st.shared.f32 [r17+60], f184; +st.shared.f32 [r17+80], f194; +barrier.sync 0; +ld.shared.f32 f197, [r11]; +ld.shared.f32 f198, [r11+2500]; +ld.shared.f32 f199, [r11+5000]; +ld.shared.f32 f200, [r11+7500]; +ld.shared.f32 f201, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r17], f121; +st.shared.f32 
[r17+20], f166; +st.shared.f32 [r17+40], f176; +st.shared.f32 [r17+60], f186; +st.shared.f32 [r17+80], f196; +barrier.sync 0; +ld.shared.f32 f202, [r11]; +ld.shared.f32 f203, [r11+2500]; +ld.shared.f32 f204, [r11+5000]; +ld.shared.f32 f205, [r11+7500]; +ld.shared.f32 f206, [r11+10000]; +add.f32 f207, f198, f201; +add.f32 f208, f197, f207; +add.f32 f209, f199, f200; +add.f32 f210, f209, f208; +add.f32 f211, f203, f206; +add.f32 f212, f202, f211; +add.f32 f213, f204, f205; +add.f32 f214, f213, f212; +fma.rn.f32 f215, f207, 0f3E9E377A, f197; +mul.f32 f216, f209, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, f203, f206; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, f204, f205; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +sub.f32 f223, f217, f222; +add.f32 f224, f222, f217; +mul.f32 f225, f207, 0f3F4F1BBD; +sub.f32 f226, f197, f225; +fma.rn.f32 f227, f209, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +sub.f32 f231, f227, f230; +add.f32 f232, f230, f227; +fma.rn.f32 f233, f211, 0f3E9E377A, f202; +mul.f32 f234, f213, 0f3F4F1BBD; +sub.f32 f235, f233, f234; +sub.f32 f236, f198, f201; +mul.f32 f237, f236, 0f3F737871; +sub.f32 f238, f199, f200; +mul.f32 f239, f238, 0fBF167918; +sub.f32 f240, f239, f237; +add.f32 f241, f240, f235; +sub.f32 f242, f235, f240; +mul.f32 f243, f211, 0f3F4F1BBD; +sub.f32 f244, f202, f243; +fma.rn.f32 f245, f213, 0f3E9E377A, f244; +mul.f32 f246, f236, 0f3F167918; +mul.f32 f247, f238, 0f3F737871; +sub.f32 f248, f247, f246; +add.f32 f249, f248, f245; +sub.f32 f250, f245, f248; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f251, f252}, [rd16]; +mul.f32 f255, f251, f223; +mul.f32 f256, f252, f241; +sub.f32 f257, f255, f256; +mul.f32 f258, f251, f241; +fma.rn.f32 f259, f252, f223, f258; 
+mul.f32 f260, f251, f251; +mul.f32 f261, f252, f252; +sub.f32 f262, f260, f261; +mul.f32 f263, f252, f251; +fma.rn.f32 f264, f252, f251, f263; +mul.f32 f265, f262, f231; +mul.f32 f266, f264, f249; +sub.f32 f267, f265, f266; +mul.f32 f268, f262, f249; +fma.rn.f32 f269, f264, f231, f268; +mul.f32 f270, f251, f262; +mul.f32 f271, f252, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f251, f264; +fma.rn.f32 f274, f252, f262, f273; +mul.f32 f275, f272, f232; +mul.f32 f276, f274, f250; +sub.f32 f277, f275, f276; +mul.f32 f278, f272, f250; +fma.rn.f32 f279, f274, f232, f278; +mul.f32 f280, f251, f272; +mul.f32 f281, f252, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f251, f274; +fma.rn.f32 f284, f252, f272, f283; +mul.f32 f285, f282, f224; +mul.f32 f286, f284, f242; +sub.f32 f287, f285, f286; +mul.f32 f288, f282, f242; +fma.rn.f32 f289, f284, f224, f288; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 500, r22; +st.shared.f32 [r23], f210; +st.shared.f32 [r23+100], f257; +st.shared.f32 [r23+200], f267; +st.shared.f32 [r23+300], f277; +st.shared.f32 [r23+400], f287; +barrier.sync 0; +ld.shared.f32 f290, [r11]; +ld.shared.f32 f291, [r11+2500]; +ld.shared.f32 f292, [r11+5000]; +ld.shared.f32 f293, [r11+7500]; +ld.shared.f32 f294, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r23], f214; +st.shared.f32 [r23+100], f259; +st.shared.f32 [r23+200], f269; +st.shared.f32 [r23+300], f279; +st.shared.f32 [r23+400], f289; +barrier.sync 0; +ld.shared.f32 f295, [r11]; +ld.shared.f32 f296, [r11+2500]; +ld.shared.f32 f297, [r11+5000]; +ld.shared.f32 f298, [r11+7500]; +ld.shared.f32 f299, [r11+10000]; +add.f32 f300, f291, f294; +add.f32 f301, f290, f300; +add.f32 f302, f292, f293; +add.f32 f303, f302, f301; +add.f32 f304, f296, f299; +add.f32 f305, f295, f304; +add.f32 f306, f297, f298; +add.f32 f307, f306, f305; +fma.rn.f32 f308, f300, 0f3E9E377A, f290; +mul.f32 f309, f302, 0f3F4F1BBD; +sub.f32 f310, f308, f309; +sub.f32 f311, f296, f299; +mul.f32 
f312, f311, 0f3F737871; +sub.f32 f313, f297, f298; +mul.f32 f314, f313, 0fBF167918; +sub.f32 f315, f314, f312; +sub.f32 f316, f310, f315; +add.f32 f317, f315, f310; +mul.f32 f318, f300, 0f3F4F1BBD; +sub.f32 f319, f290, f318; +fma.rn.f32 f320, f302, 0f3E9E377A, f319; +mul.f32 f321, f311, 0f3F167918; +mul.f32 f322, f313, 0f3F737871; +sub.f32 f323, f322, f321; +sub.f32 f324, f320, f323; +add.f32 f325, f323, f320; +fma.rn.f32 f326, f304, 0f3E9E377A, f295; +mul.f32 f327, f306, 0f3F4F1BBD; +sub.f32 f328, f326, f327; +sub.f32 f329, f291, f294; +mul.f32 f330, f329, 0f3F737871; +sub.f32 f331, f292, f293; +mul.f32 f332, f331, 0fBF167918; +sub.f32 f333, f332, f330; +add.f32 f334, f333, f328; +sub.f32 f335, f328, f333; +mul.f32 f336, f304, 0f3F4F1BBD; +sub.f32 f337, f295, f336; +fma.rn.f32 f338, f306, 0f3E9E377A, f337; +mul.f32 f339, f329, 0f3F167918; +mul.f32 f340, f331, 0f3F737871; +sub.f32 f341, f340, f339; +add.f32 f342, f341, f338; +sub.f32 f343, f338, f341; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 8; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f344, f345}, [rd21]; +mul.f32 f348, f344, f316; +mul.f32 f349, f345, f334; +sub.f32 f350, f348, f349; +mul.f32 f351, f344, f334; +fma.rn.f32 f352, f345, f316, f351; +mul.f32 f353, f344, f344; +mul.f32 f354, f345, f345; +sub.f32 f355, f353, f354; +mul.f32 f356, f345, f344; +fma.rn.f32 f357, f345, f344, f356; +mul.f32 f358, f355, f324; +mul.f32 f359, f357, f342; +sub.f32 f360, f358, f359; +mul.f32 f361, f355, f342; +fma.rn.f32 f362, f357, f324, f361; +mul.f32 f363, f344, f355; +mul.f32 f364, f345, f357; +sub.f32 f365, f363, f364; +mul.f32 f366, f344, f357; +fma.rn.f32 f367, f345, f355, f366; +mul.f32 f368, f365, f325; +mul.f32 f369, f367, f343; +sub.f32 f370, f368, f369; +mul.f32 f371, f365, f343; +fma.rn.f32 f372, f367, f325, f371; +mul.f32 f373, f344, f365; +mul.f32 f374, f345, f367; +sub.f32 
f375, f373, f374; +mul.f32 f376, f344, f367; +fma.rn.f32 f377, f345, f365, f376; +mul.f32 f378, f375, f317; +mul.f32 f379, f377, f335; +sub.f32 f380, f378, f379; +mul.f32 f381, f375, f335; +fma.rn.f32 f382, f377, f317, f381; +shl.b32 r27, r26, 2; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 2500, r28; +st.shared.f32 [r29], f303; +st.shared.f32 [r29+500], f350; +st.shared.f32 [r29+1000], f360; +st.shared.f32 [r29+1500], f370; +st.shared.f32 [r29+2000], f380; +barrier.sync 0; +ld.shared.f32 f383, [r11]; +ld.shared.f32 f384, [r11+2500]; +ld.shared.f32 f385, [r11+5000]; +ld.shared.f32 f386, [r11+7500]; +ld.shared.f32 f387, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r29], f307; +st.shared.f32 [r29+500], f352; +st.shared.f32 [r29+1000], f362; +st.shared.f32 [r29+1500], f372; +st.shared.f32 [r29+2000], f382; +barrier.sync 0; +ld.shared.f32 f388, [r11]; +ld.shared.f32 f389, [r11+2500]; +ld.shared.f32 f390, [r11+5000]; +ld.shared.f32 f391, [r11+7500]; +ld.shared.f32 f392, [r11+10000]; +add.f32 f393, f384, f387; +add.f32 f394, f383, f393; +add.f32 f395, f385, f386; +add.f32 f396, f389, f392; +add.f32 f397, f388, f396; +add.f32 f398, f390, f391; +fma.rn.f32 f399, f393, 0f3E9E377A, f383; +mul.f32 f400, f395, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, f389, f392; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, f390, f391; +mul.f32 f405, f404, 0fBF167918; +sub.f32 f406, f405, f403; +mul.f32 f407, f393, 0f3F4F1BBD; +sub.f32 f408, f383, f407; +fma.rn.f32 f409, f395, 0f3E9E377A, f408; +mul.f32 f410, f402, 0f3F167918; +mul.f32 f411, f404, 0f3F737871; +sub.f32 f412, f411, f410; +fma.rn.f32 f413, f396, 0f3E9E377A, f388; +mul.f32 f414, f398, 0f3F4F1BBD; +sub.f32 f415, f413, f414; +sub.f32 f416, f384, f387; +mul.f32 f417, f416, 0f3F737871; +sub.f32 f418, f385, f386; +mul.f32 f419, f418, 0fBF167918; +sub.f32 f420, f419, f417; +mul.f32 f421, f396, 0f3F4F1BBD; +sub.f32 f422, f388, f421; +fma.rn.f32 f423, f398, 0f3E9E377A, f422; +mul.f32 f424, f416, 
0f3F167918; +mul.f32 f425, f418, 0f3F737871; +sub.f32 f426, f425, f424; +add.f32 %0, f395, f394; +add.f32 %1, f398, f397; +add.f32 %3, f420, f415; +sub.f32 %2, f401, f406; +sub.f32 %4, f409, f412; +add.f32 %5, f426, f423; +add.f32 %6, f412, f409; +sub.f32 %7, f423, f426; +sub.f32 %9, f415, f420; +add.f32 %8, f406, f401; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_3125), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..0b80b14319d0b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp32_inv.hpp.inc @@ -0,0 +1,4734 @@ +#ifndef CUFFTDX_FFT_3125_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_3125_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<369, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2354>; +.reg .b32 r<21>; +.reg .b64 rd<15>; +mov.u32 r19, %tid.y; +mov.u32 r20, %50; +mad.lo.s32 r3, r19, 25000, r20; +add.f32 f101, %63, %93; +add.f32 f103, %73, %83; +add.f32 f2353, %53, f101; +add.f32 f104, f103, f2353; +add.f32 f105, %103, %105; +add.f32 f107, %104, %84; +add.f32 f2349, %54, f105; +add.f32 f108, f107, f2349; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f2348, f101, 0f3E9E377A, %53; +sub.f32 f111, f2348, f110; +sub.f32 f112, 
%103, %105; +sub.f32 f114, %104, %84; +mul.f32 f2347, f112, 0f3F737871; +fma.rn.f32 f115, f114, 0f3F167918, f2347; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %53, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +mul.f32 f127, f107, 0f3F4F1BBD; +fma.rn.f32 f2346, f105, 0f3E9E377A, %54; +sub.f32 f128, f2346, f127; +sub.f32 f129, %63, %93; +sub.f32 f131, %73, %83; +mul.f32 f2345, f129, 0f3F737871; +fma.rn.f32 f132, f131, 0f3F167918, f2345; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %54, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %65, %95; +add.f32 f145, %75, %85; +add.f32 f2344, %55, f143; +add.f32 f146, f145, f2344; +add.f32 f147, %66, %96; +add.f32 f149, %108, %106; +add.f32 f2340, %107, f147; +add.f32 f150, f149, f2340; +fma.rn.f32 f2338, f143, 0f3E9E377A, %55; +mul.f32 f2339, f145, 0f3F4F1BBD; +sub.f32 f153, f2338, f2339; +sub.f32 f154, %66, %96; +sub.f32 f156, %108, %106; +mul.f32 f2337, f154, 0f3F737871; +fma.rn.f32 f157, f156, 0f3F167918, f2337; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %55, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +mul.f32 f169, f149, 0f3F4F1BBD; +fma.rn.f32 f2336, f147, 0f3E9E377A, %107; +sub.f32 f170, f2336, f169; +sub.f32 f171, %65, %95; +sub.f32 f173, %75, %85; +mul.f32 f2335, f171, 0f3F737871; +fma.rn.f32 f174, f173, 0f3F167918, f2335; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, 
f147, 0f3F4F1BBD; +sub.f32 f178, %107, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %67, %97; +add.f32 f187, %77, %87; +add.f32 f2334, %57, f185; +add.f32 f188, f187, f2334; +add.f32 f189, %111, %110; +add.f32 f191, %78, %112; +add.f32 f2329, %109, f189; +add.f32 f192, f191, f2329; +fma.rn.f32 f2327, f185, 0f3E9E377A, %57; +mul.f32 f2328, f187, 0f3F4F1BBD; +sub.f32 f195, f2327, f2328; +sub.f32 f196, %111, %110; +sub.f32 f198, %78, %112; +mul.f32 f2326, f196, 0f3F737871; +fma.rn.f32 f199, f198, 0f3F167918, f2326; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %57, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f2324, f189, 0f3E9E377A, %109; +mul.f32 f2325, f191, 0f3F4F1BBD; +sub.f32 f212, f2324, f2325; +sub.f32 f213, %67, %97; +sub.f32 f215, %77, %87; +mul.f32 f2323, f213, 0f3F737871; +fma.rn.f32 f216, f215, 0f3F167918, f2323; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %109, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %69, %99; +add.f32 f229, %79, %89; +add.f32 f2322, %59, f227; +add.f32 f230, f229, f2322; +add.f32 f231, %114, %113; +add.f32 f233, %115, %90; +add.f32 f2318, %60, f231; +add.f32 f234, f233, f2318; +mul.f32 f236, f229, 0f3F4F1BBD; +fma.rn.f32 f2317, f227, 0f3E9E377A, %59; +sub.f32 f237, f2317, f236; +sub.f32 f238, %114, %113; +sub.f32 f240, %115, %90; +mul.f32 f2316, f238, 0f3F737871; +fma.rn.f32 f241, f240, 0f3F167918, f2316; +sub.f32 f242, f237, f241; 
+add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %59, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +mul.f32 f253, f233, 0f3F4F1BBD; +fma.rn.f32 f2315, f231, 0f3E9E377A, %60; +sub.f32 f254, f2315, f253; +sub.f32 f255, %69, %99; +sub.f32 f257, %79, %89; +mul.f32 f2314, f255, 0f3F737871; +fma.rn.f32 f258, f257, 0f3F167918, f2314; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %60, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %71, %101; +add.f32 f271, %81, %91; +add.f32 f2313, %61, f269; +add.f32 f272, f271, f2313; +add.f32 f273, %72, %102; +add.f32 f275, %118, %116; +add.f32 f2309, %117, f273; +add.f32 f276, f275, f2309; +mul.f32 f278, f271, 0f3F4F1BBD; +fma.rn.f32 f2308, f269, 0f3E9E377A, %61; +sub.f32 f279, f2308, f278; +sub.f32 f280, %72, %102; +sub.f32 f282, %118, %116; +mul.f32 f2307, f280, 0f3F737871; +fma.rn.f32 f283, f282, 0f3F167918, f2307; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %61, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +mul.f32 f295, f275, 0f3F4F1BBD; +fma.rn.f32 f2306, f273, 0f3E9E377A, %117; +sub.f32 f296, f2306, f295; +sub.f32 f297, %71, %101; +sub.f32 f299, %81, %91; +mul.f32 f2305, f297, 0f3F737871; +fma.rn.f32 f300, f299, 0f3F167918, f2305; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %117, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 
0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mul.f32 f2303, f158, 0f3F77F511; +mul.f32 f2304, f175, 0f3E7EA890; +sub.f32 f313, f2303, f2304; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f2301, f200, 0f3F6055A2; +mul.f32 f2302, f217, 0f3EF6A86B; +sub.f32 f318, f2301, f2302; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f322, f259, 0f3F2F3E7B; +mul.f32 f2300, f242, 0f3F3A9DB0; +sub.f32 f323, f2300, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f327, f301, 0f3F5825E0; +mul.f32 f2299, f284, 0f3F092BF2; +sub.f32 f328, f2299, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f332, f183, 0f3EF6A86B; +mul.f32 f2298, f166, 0f3F6055A2; +sub.f32 f333, f2298, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f337, f225, 0f3F5825E0; +mul.f32 f2297, f208, 0f3F092BF2; +sub.f32 f338, f2297, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f342, f267, 0f3F7F7EAE; +mul.f32 f2296, f250, 0f3D809851; +sub.f32 f343, f2296, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f2294, f292, 0fBED9FFBE; +mul.f32 f2295, f309, 0f3F67A2BF; +sub.f32 f348, f2294, f2295; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f2292, f167, 0f3F3A9DB0; +mul.f32 f2293, f184, 0f3F2F3E7B; +sub.f32 f353, f2292, f2293; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f2290, f209, 0f3D809851; +mul.f32 f2291, f226, 0f3F7F7EAE; +sub.f32 f358, f2290, f2291; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f2288, f251, 0fBF232E38; +mul.f32 f2289, f268, 0f3F45405B; +sub.f32 f363, f2288, f2289; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f367, f310, 
0f3E00575B; +mul.f32 f2287, f293, 0fBF7DFB3B; +sub.f32 f368, f2287, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f372, f176, 0f3F5825E0; +mul.f32 f2286, f159, 0f3F092BF2; +sub.f32 f373, f2286, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f377, f218, 0f3F67A2BF; +mul.f32 f2285, f201, 0fBED9FFBE; +sub.f32 f378, f2285, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f382, f260, 0f3E00575B; +mul.f32 f2284, f243, 0fBF7DFB3B; +sub.f32 f383, f2284, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f387, f302, 0fBF45405B; +mul.f32 f2283, f285, 0fBF232E38; +sub.f32 f388, f2283, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f393, f188, f230; +mul.f32 f398, f393, 0f3F4F1BBD; +fma.rn.f32 f2282, f391, 0f3E9E377A, f104; +sub.f32 f399, f2282, f398; +add.f32 f2281, f150, f276; +sub.f32 f400, f150, f276; +add.f32 f2280, f192, f234; +sub.f32 f402, f192, f234; +mul.f32 f2279, f400, 0f3F737871; +fma.rn.f32 f403, f402, 0f3F167918, f2279; +sub.f32 f404, f399, f403; +add.f32 f405, f403, f399; +add.f32 f2278, f104, f391; +mul.f32 f406, f391, 0f3F4F1BBD; +sub.f32 f407, f104, f406; +fma.rn.f32 f408, f393, 0f3E9E377A, f407; +mul.f32 f409, f400, 0f3F167918; +mul.f32 f410, f402, 0f3F737871; +sub.f32 f411, f409, f410; +sub.f32 f412, f408, f411; +add.f32 f413, f411, f408; +mul.f32 f415, f2280, 0f3F4F1BBD; +fma.rn.f32 f2277, f2281, 0f3E9E377A, f108; +sub.f32 f416, f2277, f415; +sub.f32 f417, f146, f272; +sub.f32 f419, f188, f230; +mul.f32 f2276, f417, 0f3F737871; +fma.rn.f32 f420, f419, 0f3F167918, f2276; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +add.f32 f2275, f108, f2281; +mul.f32 f423, f2281, 0f3F4F1BBD; +sub.f32 f424, f108, f423; +fma.rn.f32 f425, f2280, 0f3E9E377A, f424; +mul.f32 f426, f417, 0f3F167918; +mul.f32 f427, f419, 0f3F737871; 
+sub.f32 f428, f426, f427; +add.f32 f429, f428, f425; +sub.f32 f430, f425, f428; +add.f32 f431, f313, f328; +add.f32 f433, f318, f323; +add.f32 f2274, f116, f431; +add.f32 f434, f433, f2274; +add.f32 f435, f315, f330; +add.f32 f437, f320, f325; +add.f32 f2273, f133, f435; +add.f32 f438, f437, f2273; +fma.rn.f32 f2271, f431, 0f3E9E377A, f116; +mul.f32 f2272, f433, 0f3F4F1BBD; +sub.f32 f441, f2271, f2272; +sub.f32 f442, f315, f330; +sub.f32 f444, f320, f325; +mul.f32 f2270, f442, 0f3F737871; +fma.rn.f32 f445, f444, 0f3F167918, f2270; +sub.f32 f446, f441, f445; +add.f32 f447, f445, f441; +mul.f32 f448, f431, 0f3F4F1BBD; +sub.f32 f449, f116, f448; +fma.rn.f32 f450, f433, 0f3E9E377A, f449; +mul.f32 f451, f442, 0f3F167918; +mul.f32 f452, f444, 0f3F737871; +sub.f32 f453, f451, f452; +sub.f32 f454, f450, f453; +add.f32 f455, f453, f450; +mul.f32 f457, f437, 0f3F4F1BBD; +fma.rn.f32 f2269, f435, 0f3E9E377A, f133; +sub.f32 f458, f2269, f457; +sub.f32 f459, f313, f328; +sub.f32 f461, f318, f323; +mul.f32 f2268, f459, 0f3F737871; +fma.rn.f32 f462, f461, 0f3F167918, f2268; +add.f32 f463, f462, f458; +sub.f32 f464, f458, f462; +mul.f32 f465, f435, 0f3F4F1BBD; +sub.f32 f466, f133, f465; +fma.rn.f32 f467, f437, 0f3E9E377A, f466; +mul.f32 f468, f459, 0f3F167918; +mul.f32 f469, f461, 0f3F737871; +sub.f32 f470, f468, f469; +add.f32 f471, f470, f467; +sub.f32 f472, f467, f470; +add.f32 f473, f333, f348; +add.f32 f475, f338, f343; +add.f32 f2267, f124, f473; +add.f32 f476, f475, f2267; +add.f32 f477, f335, f350; +add.f32 f479, f340, f345; +add.f32 f2266, f141, f477; +add.f32 f480, f479, f2266; +fma.rn.f32 f2264, f473, 0f3E9E377A, f124; +mul.f32 f2265, f475, 0f3F4F1BBD; +sub.f32 f483, f2264, f2265; +sub.f32 f484, f335, f350; +sub.f32 f486, f340, f345; +mul.f32 f2263, f484, 0f3F737871; +fma.rn.f32 f487, f486, 0f3F167918, f2263; +sub.f32 f488, f483, f487; +add.f32 f489, f487, f483; +mul.f32 f490, f473, 0f3F4F1BBD; +sub.f32 f491, f124, f490; +fma.rn.f32 f492, f475, 0f3E9E377A, f491; 
+mul.f32 f493, f484, 0f3F167918; +mul.f32 f494, f486, 0f3F737871; +sub.f32 f495, f493, f494; +sub.f32 f496, f492, f495; +add.f32 f497, f495, f492; +fma.rn.f32 f2261, f477, 0f3E9E377A, f141; +mul.f32 f2262, f479, 0f3F4F1BBD; +sub.f32 f500, f2261, f2262; +sub.f32 f501, f333, f348; +sub.f32 f503, f338, f343; +mul.f32 f2260, f501, 0f3F737871; +fma.rn.f32 f504, f503, 0f3F167918, f2260; +add.f32 f505, f504, f500; +sub.f32 f506, f500, f504; +mul.f32 f507, f477, 0f3F4F1BBD; +sub.f32 f508, f141, f507; +fma.rn.f32 f509, f479, 0f3E9E377A, f508; +mul.f32 f510, f501, 0f3F167918; +mul.f32 f511, f503, 0f3F737871; +sub.f32 f512, f510, f511; +add.f32 f513, f512, f509; +sub.f32 f514, f509, f512; +add.f32 f515, f353, f368; +add.f32 f517, f358, f363; +add.f32 f2259, f125, f515; +add.f32 f518, f517, f2259; +add.f32 f519, f355, f370; +add.f32 f521, f360, f365; +add.f32 f2258, f142, f519; +add.f32 f522, f521, f2258; +mul.f32 f524, f517, 0f3F4F1BBD; +fma.rn.f32 f2257, f515, 0f3E9E377A, f125; +sub.f32 f525, f2257, f524; +sub.f32 f526, f355, f370; +sub.f32 f528, f360, f365; +mul.f32 f2256, f526, 0f3F737871; +fma.rn.f32 f529, f528, 0f3F167918, f2256; +sub.f32 f530, f525, f529; +add.f32 f531, f529, f525; +mul.f32 f532, f515, 0f3F4F1BBD; +sub.f32 f533, f125, f532; +fma.rn.f32 f534, f517, 0f3E9E377A, f533; +mul.f32 f535, f526, 0f3F167918; +mul.f32 f536, f528, 0f3F737871; +sub.f32 f537, f535, f536; +sub.f32 f538, f534, f537; +add.f32 f539, f537, f534; +mul.f32 f541, f521, 0f3F4F1BBD; +fma.rn.f32 f2255, f519, 0f3E9E377A, f142; +sub.f32 f542, f2255, f541; +sub.f32 f543, f353, f368; +sub.f32 f545, f358, f363; +mul.f32 f2254, f543, 0f3F737871; +fma.rn.f32 f546, f545, 0f3F167918, f2254; +add.f32 f547, f546, f542; +sub.f32 f548, f542, f546; +mul.f32 f549, f519, 0f3F4F1BBD; +sub.f32 f550, f142, f549; +fma.rn.f32 f551, f521, 0f3E9E377A, f550; +mul.f32 f552, f543, 0f3F167918; +mul.f32 f553, f545, 0f3F737871; +sub.f32 f554, f552, f553; +add.f32 f555, f554, f551; +sub.f32 f556, f551, f554; +add.f32 f557, 
f373, f388; +add.f32 f559, f378, f383; +add.f32 f2253, f117, f557; +add.f32 f560, f559, f2253; +add.f32 f561, f375, f390; +add.f32 f563, f380, f385; +add.f32 f2252, f134, f561; +add.f32 f564, f563, f2252; +mul.f32 f566, f559, 0f3F4F1BBD; +fma.rn.f32 f2251, f557, 0f3E9E377A, f117; +sub.f32 f567, f2251, f566; +sub.f32 f568, f375, f390; +sub.f32 f570, f380, f385; +mul.f32 f2250, f568, 0f3F737871; +fma.rn.f32 f571, f570, 0f3F167918, f2250; +sub.f32 f572, f567, f571; +add.f32 f573, f571, f567; +mul.f32 f574, f557, 0f3F4F1BBD; +sub.f32 f575, f117, f574; +fma.rn.f32 f576, f559, 0f3E9E377A, f575; +mul.f32 f577, f568, 0f3F167918; +mul.f32 f578, f570, 0f3F737871; +sub.f32 f579, f577, f578; +sub.f32 f580, f576, f579; +add.f32 f581, f579, f576; +mul.f32 f583, f563, 0f3F4F1BBD; +fma.rn.f32 f2249, f561, 0f3E9E377A, f134; +sub.f32 f584, f2249, f583; +sub.f32 f585, f373, f388; +sub.f32 f587, f378, f383; +mul.f32 f2248, f585, 0f3F737871; +fma.rn.f32 f588, f587, 0f3F167918, f2248; +add.f32 f589, f588, f584; +sub.f32 f590, f584, f588; +mul.f32 f591, f561, 0f3F4F1BBD; +sub.f32 f592, f134, f591; +fma.rn.f32 f593, f563, 0f3E9E377A, f592; +mul.f32 f594, f585, 0f3F167918; +mul.f32 f595, f587, 0f3F737871; +sub.f32 f596, f594, f595; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +mov.u32 r18, %tid.x; +mul.wide.u32 rd2, r18, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r18, r6; +mad.lo.s32 r8, r5, 25000, r3; +mul.wide.u32 rd13, r7, 8; +mov.u64 rd14, %51; +add.s64 rd6, rd14, rd13; +ld.global.v2.f32 {f599, f600}, [rd6]; +mul.f32 f603, f438, f600; +mul.f32 f605, f599, f438; +mul.f32 f607, f600, f600; +mul.f32 f2247, f599, f599; +sub.f32 f608, f2247, f607; +mul.f32 f609, f600, f599; +fma.rn.f32 f610, f600, f599, f609; +mul.f32 f611, f480, f610; +mul.f32 f613, f608, f480; +mul.f32 f615, f600, f610; +mul.f32 f2246, f599, f608; +sub.f32 f616, f2246, f615; +mul.f32 f2245, f476, f610; +mul.f32 f617, f599, f610; +fma.rn.f32 f618, f600, f608, 
f617; +mul.f32 f619, f522, f618; +mul.f32 f621, f616, f522; +mul.f32 f2243, f599, f616; +mul.f32 f2244, f600, f618; +sub.f32 f624, f2243, f2244; +mul.f32 f2242, f518, f618; +mul.f32 f625, f599, f618; +fma.rn.f32 f626, f600, f616, f625; +mul.f32 f627, f564, f626; +mul.f32 f629, f624, f564; +mul.f32 f631, f600, f626; +mul.f32 f2241, f599, f624; +sub.f32 f632, f2241, f631; +mul.f32 f2240, f560, f626; +mul.f32 f633, f599, f626; +fma.rn.f32 f634, f600, f624, f633; +mul.f32 f635, f421, f634; +mul.f32 f637, f632, f421; +mul.f32 f2238, f599, f632; +mul.f32 f2239, f600, f634; +sub.f32 f640, f2238, f2239; +mul.f32 f2237, f404, f634; +mul.f32 f641, f599, f634; +fma.rn.f32 f642, f600, f632, f641; +mul.f32 f643, f463, f642; +mul.f32 f645, f640, f463; +mul.f32 f647, f600, f642; +mul.f32 f2236, f599, f640; +sub.f32 f648, f2236, f647; +mul.f32 f2235, f446, f642; +mul.f32 f649, f599, f642; +fma.rn.f32 f650, f600, f640, f649; +mul.f32 f651, f505, f650; +mul.f32 f653, f648, f505; +mul.f32 f655, f600, f650; +mul.f32 f2234, f599, f648; +sub.f32 f656, f2234, f655; +mul.f32 f2233, f488, f650; +mul.f32 f657, f599, f650; +fma.rn.f32 f658, f600, f648, f657; +mul.f32 f659, f547, f658; +mul.f32 f661, f656, f547; +mul.f32 f2231, f599, f656; +mul.f32 f2232, f600, f658; +sub.f32 f664, f2231, f2232; +mul.f32 f2230, f530, f658; +mul.f32 f665, f599, f658; +fma.rn.f32 f666, f600, f656, f665; +mul.f32 f667, f589, f666; +mul.f32 f669, f664, f589; +mul.f32 f671, f600, f666; +mul.f32 f2229, f599, f664; +sub.f32 f672, f2229, f671; +mul.f32 f2228, f572, f666; +mul.f32 f673, f599, f666; +fma.rn.f32 f674, f600, f664, f673; +mul.f32 f675, f429, f674; +mul.f32 f677, f672, f429; +mul.f32 f679, f600, f674; +mul.f32 f2227, f599, f672; +sub.f32 f680, f2227, f679; +mul.f32 f2226, f412, f674; +mul.f32 f681, f599, f674; +fma.rn.f32 f682, f600, f672, f681; +mul.f32 f683, f471, f682; +mul.f32 f685, f680, f471; +mul.f32 f2224, f599, f680; +mul.f32 f2225, f600, f682; +sub.f32 f688, f2224, f2225; +mul.f32 f2223, f454, 
f682; +mul.f32 f689, f599, f682; +fma.rn.f32 f690, f600, f680, f689; +mul.f32 f691, f513, f690; +mul.f32 f693, f688, f513; +mul.f32 f695, f600, f690; +mul.f32 f2222, f599, f688; +sub.f32 f696, f2222, f695; +mul.f32 f2221, f496, f690; +mul.f32 f697, f599, f690; +fma.rn.f32 f698, f600, f688, f697; +mul.f32 f699, f555, f698; +mul.f32 f701, f696, f555; +mul.f32 f2219, f599, f696; +mul.f32 f2220, f600, f698; +sub.f32 f704, f2219, f2220; +mul.f32 f2218, f538, f698; +mul.f32 f705, f599, f698; +fma.rn.f32 f706, f600, f696, f705; +mul.f32 f707, f597, f706; +mul.f32 f709, f704, f597; +mul.f32 f711, f600, f706; +mul.f32 f2217, f599, f704; +sub.f32 f712, f2217, f711; +mul.f32 f2216, f580, f706; +mul.f32 f713, f599, f706; +fma.rn.f32 f714, f600, f704, f713; +mul.f32 f715, f430, f714; +mul.f32 f717, f712, f430; +mul.f32 f719, f600, f714; +mul.f32 f2215, f599, f712; +sub.f32 f720, f2215, f719; +mul.f32 f2214, f413, f714; +mul.f32 f721, f599, f714; +fma.rn.f32 f722, f600, f712, f721; +mul.f32 f723, f472, f722; +mul.f32 f725, f720, f472; +mul.f32 f2212, f599, f720; +mul.f32 f2213, f600, f722; +sub.f32 f728, f2212, f2213; +mul.f32 f2211, f455, f722; +mul.f32 f729, f599, f722; +fma.rn.f32 f730, f600, f720, f729; +mul.f32 f731, f514, f730; +mul.f32 f733, f728, f514; +mul.f32 f735, f600, f730; +mul.f32 f2210, f599, f728; +sub.f32 f736, f2210, f735; +mul.f32 f2209, f497, f730; +mul.f32 f737, f599, f730; +fma.rn.f32 f738, f600, f728, f737; +mul.f32 f739, f556, f738; +mul.f32 f741, f736, f556; +mul.f32 f743, f600, f738; +mul.f32 f2208, f599, f736; +sub.f32 f744, f2208, f743; +mul.f32 f2207, f539, f738; +mul.f32 f745, f599, f738; +fma.rn.f32 f746, f600, f736, f745; +mul.f32 f747, f598, f746; +mul.f32 f749, f744, f598; +mul.f32 f2205, f599, f744; +mul.f32 f2206, f600, f746; +sub.f32 f752, f2205, f2206; +mul.f32 f2204, f581, f746; +mul.f32 f753, f599, f746; +fma.rn.f32 f754, f600, f744, f753; +mul.f32 f755, f422, f754; +mul.f32 f757, f752, f422; +mul.f32 f759, f600, f754; +mul.f32 f2203, 
f599, f752; +sub.f32 f760, f2203, f759; +mul.f32 f2202, f405, f754; +mul.f32 f761, f599, f754; +fma.rn.f32 f762, f600, f752, f761; +mul.f32 f763, f464, f762; +mul.f32 f765, f760, f464; +mul.f32 f2200, f599, f760; +mul.f32 f2201, f600, f762; +sub.f32 f768, f2200, f2201; +mul.f32 f2199, f447, f762; +mul.f32 f769, f599, f762; +fma.rn.f32 f770, f600, f760, f769; +mul.f32 f771, f506, f770; +mul.f32 f773, f768, f506; +mul.f32 f775, f600, f770; +mul.f32 f2198, f599, f768; +sub.f32 f776, f2198, f775; +mul.f32 f2197, f489, f770; +mul.f32 f777, f599, f770; +fma.rn.f32 f778, f600, f768, f777; +mul.f32 f779, f548, f778; +mul.f32 f781, f776, f548; +mul.f32 f783, f600, f778; +mul.f32 f2196, f599, f776; +sub.f32 f784, f2196, f783; +mul.f32 f2195, f531, f778; +mul.f32 f785, f599, f778; +mul.f32 f2194, f434, f600; +fma.rn.f32 f786, f600, f776, f785; +mul.f32 f787, f590, f786; +mul.f32 f788, f573, f786; +mul.f32 f789, f784, f590; +barrier.sync 0; +add.f32 f790, f2280, f2275; +add.f32 f791, f393, f2278; +mad.lo.s32 r17, r7, 200, r8; +st.shared.v2.f32 [r17], {f791, f790}; +fma.rn.f32 f792, f599, f434, f603; +sub.f32 f793, f605, f2194; +st.shared.v2.f32 [r17+8], {f792, f793}; +fma.rn.f32 f794, f608, f476, f611; +sub.f32 f795, f613, f2245; +st.shared.v2.f32 [r17+16], {f794, f795}; +fma.rn.f32 f796, f616, f518, f619; +sub.f32 f797, f621, f2242; +st.shared.v2.f32 [r17+24], {f796, f797}; +fma.rn.f32 f798, f624, f560, f627; +sub.f32 f799, f629, f2240; +st.shared.v2.f32 [r17+32], {f798, f799}; +sub.f32 f800, f637, f2237; +fma.rn.f32 f801, f632, f404, f635; +st.shared.v2.f32 [r17+40], {f801, f800}; +fma.rn.f32 f802, f640, f446, f643; +sub.f32 f803, f645, f2235; +st.shared.v2.f32 [r17+48], {f802, f803}; +sub.f32 f804, f653, f2233; +fma.rn.f32 f805, f648, f488, f651; +st.shared.v2.f32 [r17+56], {f805, f804}; +fma.rn.f32 f806, f656, f530, f659; +sub.f32 f807, f661, f2230; +st.shared.v2.f32 [r17+64], {f806, f807}; +fma.rn.f32 f808, f664, f572, f667; +sub.f32 f809, f669, f2228; +st.shared.v2.f32 
[r17+72], {f808, f809}; +fma.rn.f32 f810, f672, f412, f675; +sub.f32 f811, f677, f2226; +st.shared.v2.f32 [r17+80], {f810, f811}; +fma.rn.f32 f812, f680, f454, f683; +sub.f32 f813, f685, f2223; +st.shared.v2.f32 [r17+88], {f812, f813}; +fma.rn.f32 f814, f688, f496, f691; +sub.f32 f815, f693, f2221; +st.shared.v2.f32 [r17+96], {f814, f815}; +fma.rn.f32 f816, f696, f538, f699; +sub.f32 f817, f701, f2218; +st.shared.v2.f32 [r17+104], {f816, f817}; +fma.rn.f32 f818, f704, f580, f707; +sub.f32 f819, f709, f2216; +st.shared.v2.f32 [r17+112], {f818, f819}; +fma.rn.f32 f820, f712, f413, f715; +sub.f32 f821, f717, f2214; +st.shared.v2.f32 [r17+120], {f820, f821}; +fma.rn.f32 f822, f720, f455, f723; +sub.f32 f823, f725, f2211; +st.shared.v2.f32 [r17+128], {f822, f823}; +fma.rn.f32 f824, f728, f497, f731; +sub.f32 f825, f733, f2209; +st.shared.v2.f32 [r17+136], {f824, f825}; +fma.rn.f32 f826, f736, f539, f739; +sub.f32 f827, f741, f2207; +st.shared.v2.f32 [r17+144], {f826, f827}; +fma.rn.f32 f828, f744, f581, f747; +sub.f32 f829, f749, f2204; +st.shared.v2.f32 [r17+152], {f828, f829}; +fma.rn.f32 f830, f752, f405, f755; +sub.f32 f831, f757, f2202; +st.shared.v2.f32 [r17+160], {f830, f831}; +fma.rn.f32 f832, f760, f447, f763; +sub.f32 f833, f765, f2199; +st.shared.v2.f32 [r17+168], {f832, f833}; +fma.rn.f32 f834, f768, f489, f771; +sub.f32 f835, f773, f2197; +st.shared.v2.f32 [r17+176], {f834, f835}; +fma.rn.f32 f836, f776, f531, f779; +sub.f32 f837, f781, f2195; +st.shared.v2.f32 [r17+184], {f836, f837}; +fma.rn.f32 f838, f784, f573, f787; +sub.f32 f839, f789, f788; +st.shared.v2.f32 [r17+192], {f838, f839}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r17; +ld.shared.v2.f32 {f840, f841}, [r10]; +ld.shared.v2.f32 {f844, f845}, [r10+1000]; +ld.shared.v2.f32 {f848, f849}, [r10+2000]; +ld.shared.v2.f32 {f852, f853}, [r10+3000]; +ld.shared.v2.f32 {f856, f857}, [r10+4000]; +ld.shared.v2.f32 {f860, f861}, [r10+5000]; +ld.shared.v2.f32 {f864, f865}, [r10+6000]; +ld.shared.v2.f32 
{f868, f869}, [r10+7000]; +ld.shared.v2.f32 {f872, f873}, [r10+8000]; +ld.shared.v2.f32 {f876, f877}, [r10+9000]; +ld.shared.v2.f32 {f880, f881}, [r10+10000]; +ld.shared.v2.f32 {f884, f885}, [r10+11000]; +ld.shared.v2.f32 {f888, f889}, [r10+12000]; +ld.shared.v2.f32 {f892, f893}, [r10+13000]; +ld.shared.v2.f32 {f896, f897}, [r10+14000]; +ld.shared.v2.f32 {f900, f901}, [r10+15000]; +ld.shared.v2.f32 {f904, f905}, [r10+16000]; +ld.shared.v2.f32 {f908, f909}, [r10+17000]; +ld.shared.v2.f32 {f912, f913}, [r10+18000]; +ld.shared.v2.f32 {f916, f917}, [r10+19000]; +ld.shared.v2.f32 {f920, f921}, [r10+20000]; +ld.shared.v2.f32 {f924, f925}, [r10+21000]; +ld.shared.v2.f32 {f928, f929}, [r10+22000]; +ld.shared.v2.f32 {f932, f933}, [r10+23000]; +ld.shared.v2.f32 {f936, f937}, [r10+24000]; +add.f32 f940, f860, f920; +add.f32 f942, f880, f900; +add.f32 f2193, f840, f940; +add.f32 f943, f942, f2193; +add.f32 f944, f861, f921; +add.f32 f946, f881, f901; +add.f32 f2192, f841, f944; +add.f32 f947, f946, f2192; +mul.f32 f949, f942, 0f3F4F1BBD; +fma.rn.f32 f2191, f940, 0f3E9E377A, f840; +sub.f32 f950, f2191, f949; +sub.f32 f951, f861, f921; +sub.f32 f953, f881, f901; +mul.f32 f2190, f951, 0f3F737871; +fma.rn.f32 f954, f953, 0f3F167918, f2190; +sub.f32 f955, f950, f954; +add.f32 f956, f954, f950; +mul.f32 f957, f940, 0f3F4F1BBD; +sub.f32 f958, f840, f957; +fma.rn.f32 f959, f942, 0f3E9E377A, f958; +mul.f32 f960, f951, 0f3F167918; +mul.f32 f961, f953, 0f3F737871; +sub.f32 f962, f960, f961; +sub.f32 f963, f959, f962; +add.f32 f964, f962, f959; +mul.f32 f966, f946, 0f3F4F1BBD; +fma.rn.f32 f2189, f944, 0f3E9E377A, f841; +sub.f32 f967, f2189, f966; +sub.f32 f968, f860, f920; +sub.f32 f970, f880, f900; +mul.f32 f2188, f968, 0f3F737871; +fma.rn.f32 f971, f970, 0f3F167918, f2188; +add.f32 f972, f971, f967; +sub.f32 f973, f967, f971; +mul.f32 f974, f944, 0f3F4F1BBD; +sub.f32 f975, f841, f974; +fma.rn.f32 f976, f946, 0f3E9E377A, f975; +mul.f32 f977, f968, 0f3F167918; +mul.f32 f978, f970, 
0f3F737871; +sub.f32 f979, f977, f978; +add.f32 f980, f979, f976; +sub.f32 f981, f976, f979; +add.f32 f982, f864, f924; +add.f32 f984, f884, f904; +add.f32 f2187, f844, f982; +add.f32 f985, f984, f2187; +add.f32 f986, f865, f925; +add.f32 f988, f885, f905; +add.f32 f2186, f845, f986; +add.f32 f989, f988, f2186; +fma.rn.f32 f2184, f982, 0f3E9E377A, f844; +mul.f32 f2185, f984, 0f3F4F1BBD; +sub.f32 f992, f2184, f2185; +sub.f32 f993, f865, f925; +sub.f32 f995, f885, f905; +mul.f32 f2183, f993, 0f3F737871; +fma.rn.f32 f996, f995, 0f3F167918, f2183; +sub.f32 f997, f992, f996; +add.f32 f998, f996, f992; +mul.f32 f999, f982, 0f3F4F1BBD; +sub.f32 f1000, f844, f999; +fma.rn.f32 f1001, f984, 0f3E9E377A, f1000; +mul.f32 f1002, f993, 0f3F167918; +mul.f32 f1003, f995, 0f3F737871; +sub.f32 f1004, f1002, f1003; +sub.f32 f1005, f1001, f1004; +add.f32 f1006, f1004, f1001; +mul.f32 f1008, f988, 0f3F4F1BBD; +fma.rn.f32 f2182, f986, 0f3E9E377A, f845; +sub.f32 f1009, f2182, f1008; +sub.f32 f1010, f864, f924; +sub.f32 f1012, f884, f904; +mul.f32 f2181, f1010, 0f3F737871; +fma.rn.f32 f1013, f1012, 0f3F167918, f2181; +add.f32 f1014, f1013, f1009; +sub.f32 f1015, f1009, f1013; +mul.f32 f1016, f986, 0f3F4F1BBD; +sub.f32 f1017, f845, f1016; +fma.rn.f32 f1018, f988, 0f3E9E377A, f1017; +mul.f32 f1019, f1010, 0f3F167918; +mul.f32 f1020, f1012, 0f3F737871; +sub.f32 f1021, f1019, f1020; +add.f32 f1022, f1021, f1018; +sub.f32 f1023, f1018, f1021; +add.f32 f1024, f868, f928; +add.f32 f1026, f888, f908; +add.f32 f2180, f848, f1024; +add.f32 f1027, f1026, f2180; +add.f32 f1028, f869, f929; +add.f32 f1030, f889, f909; +add.f32 f2179, f849, f1028; +add.f32 f1031, f1030, f2179; +fma.rn.f32 f2177, f1024, 0f3E9E377A, f848; +mul.f32 f2178, f1026, 0f3F4F1BBD; +sub.f32 f1034, f2177, f2178; +sub.f32 f1035, f869, f929; +sub.f32 f1037, f889, f909; +mul.f32 f2176, f1035, 0f3F737871; +fma.rn.f32 f1038, f1037, 0f3F167918, f2176; +sub.f32 f1039, f1034, f1038; +add.f32 f1040, f1038, f1034; +mul.f32 f1041, f1024, 
0f3F4F1BBD; +sub.f32 f1042, f848, f1041; +fma.rn.f32 f1043, f1026, 0f3E9E377A, f1042; +mul.f32 f1044, f1035, 0f3F167918; +mul.f32 f1045, f1037, 0f3F737871; +sub.f32 f1046, f1044, f1045; +sub.f32 f1047, f1043, f1046; +add.f32 f1048, f1046, f1043; +fma.rn.f32 f2174, f1028, 0f3E9E377A, f849; +mul.f32 f2175, f1030, 0f3F4F1BBD; +sub.f32 f1051, f2174, f2175; +sub.f32 f1052, f868, f928; +sub.f32 f1054, f888, f908; +mul.f32 f2173, f1052, 0f3F737871; +fma.rn.f32 f1055, f1054, 0f3F167918, f2173; +add.f32 f1056, f1055, f1051; +sub.f32 f1057, f1051, f1055; +mul.f32 f1058, f1028, 0f3F4F1BBD; +sub.f32 f1059, f849, f1058; +fma.rn.f32 f1060, f1030, 0f3E9E377A, f1059; +mul.f32 f1061, f1052, 0f3F167918; +mul.f32 f1062, f1054, 0f3F737871; +sub.f32 f1063, f1061, f1062; +add.f32 f1064, f1063, f1060; +sub.f32 f1065, f1060, f1063; +add.f32 f1066, f872, f932; +add.f32 f1068, f892, f912; +add.f32 f2172, f852, f1066; +add.f32 f1069, f1068, f2172; +add.f32 f1070, f873, f933; +add.f32 f1072, f893, f913; +add.f32 f2171, f853, f1070; +add.f32 f1073, f1072, f2171; +mul.f32 f1075, f1068, 0f3F4F1BBD; +fma.rn.f32 f2170, f1066, 0f3E9E377A, f852; +sub.f32 f1076, f2170, f1075; +sub.f32 f1077, f873, f933; +sub.f32 f1079, f893, f913; +mul.f32 f2169, f1077, 0f3F737871; +fma.rn.f32 f1080, f1079, 0f3F167918, f2169; +sub.f32 f1081, f1076, f1080; +add.f32 f1082, f1080, f1076; +mul.f32 f1083, f1066, 0f3F4F1BBD; +sub.f32 f1084, f852, f1083; +fma.rn.f32 f1085, f1068, 0f3E9E377A, f1084; +mul.f32 f1086, f1077, 0f3F167918; +mul.f32 f1087, f1079, 0f3F737871; +sub.f32 f1088, f1086, f1087; +sub.f32 f1089, f1085, f1088; +add.f32 f1090, f1088, f1085; +fma.rn.f32 f2167, f1070, 0f3E9E377A, f853; +mul.f32 f2168, f1072, 0f3F4F1BBD; +sub.f32 f1093, f2167, f2168; +sub.f32 f1094, f872, f932; +sub.f32 f1096, f892, f912; +mul.f32 f2166, f1094, 0f3F737871; +fma.rn.f32 f1097, f1096, 0f3F167918, f2166; +add.f32 f1098, f1097, f1093; +sub.f32 f1099, f1093, f1097; +mul.f32 f1100, f1070, 0f3F4F1BBD; +sub.f32 f1101, f853, f1100; 
+fma.rn.f32 f1102, f1072, 0f3E9E377A, f1101; +mul.f32 f1103, f1094, 0f3F167918; +mul.f32 f1104, f1096, 0f3F737871; +sub.f32 f1105, f1103, f1104; +add.f32 f1106, f1105, f1102; +sub.f32 f1107, f1102, f1105; +add.f32 f1108, f876, f936; +add.f32 f1110, f896, f916; +add.f32 f2165, f856, f1108; +add.f32 f1111, f1110, f2165; +add.f32 f1112, f877, f937; +add.f32 f1114, f897, f917; +add.f32 f2164, f857, f1112; +add.f32 f1115, f1114, f2164; +mul.f32 f1117, f1110, 0f3F4F1BBD; +fma.rn.f32 f2163, f1108, 0f3E9E377A, f856; +sub.f32 f1118, f2163, f1117; +sub.f32 f1119, f877, f937; +sub.f32 f1121, f897, f917; +mul.f32 f2162, f1119, 0f3F737871; +fma.rn.f32 f1122, f1121, 0f3F167918, f2162; +sub.f32 f1123, f1118, f1122; +add.f32 f1124, f1122, f1118; +mul.f32 f1125, f1108, 0f3F4F1BBD; +sub.f32 f1126, f856, f1125; +fma.rn.f32 f1127, f1110, 0f3E9E377A, f1126; +mul.f32 f1128, f1119, 0f3F167918; +mul.f32 f1129, f1121, 0f3F737871; +sub.f32 f1130, f1128, f1129; +sub.f32 f1131, f1127, f1130; +add.f32 f1132, f1130, f1127; +mul.f32 f1134, f1114, 0f3F4F1BBD; +fma.rn.f32 f2161, f1112, 0f3E9E377A, f857; +sub.f32 f1135, f2161, f1134; +sub.f32 f1136, f876, f936; +sub.f32 f1138, f896, f916; +mul.f32 f2160, f1136, 0f3F737871; +fma.rn.f32 f1139, f1138, 0f3F167918, f2160; +add.f32 f1140, f1139, f1135; +sub.f32 f1141, f1135, f1139; +mul.f32 f1142, f1112, 0f3F4F1BBD; +sub.f32 f1143, f857, f1142; +fma.rn.f32 f1144, f1114, 0f3E9E377A, f1143; +mul.f32 f1145, f1136, 0f3F167918; +mul.f32 f1146, f1138, 0f3F737871; +sub.f32 f1147, f1145, f1146; +add.f32 f1148, f1147, f1144; +sub.f32 f1149, f1144, f1147; +mul.f32 f1151, f1014, 0f3E7EA890; +mul.f32 f2159, f997, 0f3F77F511; +sub.f32 f1152, f2159, f1151; +mul.f32 f1153, f1014, 0f3F77F511; +fma.rn.f32 f1154, f997, 0f3E7EA890, f1153; +mul.f32 f2157, f1039, 0f3F6055A2; +mul.f32 f2158, f1056, 0f3EF6A86B; +sub.f32 f1157, f2157, f2158; +mul.f32 f1158, f1056, 0f3F6055A2; +fma.rn.f32 f1159, f1039, 0f3EF6A86B, f1158; +mul.f32 f2155, f1081, 0f3F3A9DB0; +mul.f32 f2156, f1098, 
0f3F2F3E7B; +sub.f32 f1162, f2155, f2156; +mul.f32 f1163, f1098, 0f3F3A9DB0; +fma.rn.f32 f1164, f1081, 0f3F2F3E7B, f1163; +mul.f32 f2153, f1123, 0f3F092BF2; +mul.f32 f2154, f1140, 0f3F5825E0; +sub.f32 f1167, f2153, f2154; +mul.f32 f1168, f1140, 0f3F092BF2; +fma.rn.f32 f1169, f1123, 0f3F5825E0, f1168; +mul.f32 f2151, f1005, 0f3F6055A2; +mul.f32 f2152, f1022, 0f3EF6A86B; +sub.f32 f1172, f2151, f2152; +mul.f32 f1173, f1022, 0f3F6055A2; +fma.rn.f32 f1174, f1005, 0f3EF6A86B, f1173; +mul.f32 f1176, f1064, 0f3F5825E0; +mul.f32 f2150, f1047, 0f3F092BF2; +sub.f32 f1177, f2150, f1176; +mul.f32 f1178, f1064, 0f3F092BF2; +fma.rn.f32 f1179, f1047, 0f3F5825E0, f1178; +mul.f32 f1181, f1106, 0f3F7F7EAE; +mul.f32 f2149, f1089, 0f3D809851; +sub.f32 f1182, f2149, f1181; +mul.f32 f1183, f1106, 0f3D809851; +fma.rn.f32 f1184, f1089, 0f3F7F7EAE, f1183; +mul.f32 f1186, f1148, 0f3F67A2BF; +mul.f32 f2148, f1131, 0fBED9FFBE; +sub.f32 f1187, f2148, f1186; +mul.f32 f1188, f1148, 0fBED9FFBE; +fma.rn.f32 f1189, f1131, 0f3F67A2BF, f1188; +mul.f32 f1191, f1023, 0f3F2F3E7B; +mul.f32 f2147, f1006, 0f3F3A9DB0; +sub.f32 f1192, f2147, f1191; +mul.f32 f1193, f1023, 0f3F3A9DB0; +fma.rn.f32 f1194, f1006, 0f3F2F3E7B, f1193; +mul.f32 f1196, f1065, 0f3F7F7EAE; +mul.f32 f2146, f1048, 0f3D809851; +sub.f32 f1197, f2146, f1196; +mul.f32 f1198, f1065, 0f3D809851; +fma.rn.f32 f1199, f1048, 0f3F7F7EAE, f1198; +mul.f32 f1201, f1107, 0f3F45405B; +mul.f32 f2145, f1090, 0fBF232E38; +sub.f32 f1202, f2145, f1201; +mul.f32 f1203, f1107, 0fBF232E38; +fma.rn.f32 f1204, f1090, 0f3F45405B, f1203; +mul.f32 f2143, f1132, 0fBF7DFB3B; +mul.f32 f2144, f1149, 0f3E00575B; +sub.f32 f1207, f2143, f2144; +mul.f32 f1208, f1149, 0fBF7DFB3B; +fma.rn.f32 f1209, f1132, 0f3E00575B, f1208; +mul.f32 f2141, f998, 0f3F092BF2; +mul.f32 f2142, f1015, 0f3F5825E0; +sub.f32 f1212, f2141, f2142; +mul.f32 f1213, f1015, 0f3F092BF2; +fma.rn.f32 f1214, f998, 0f3F5825E0, f1213; +mul.f32 f2139, f1040, 0fBED9FFBE; +mul.f32 f2140, f1057, 0f3F67A2BF; +sub.f32 
f1217, f2139, f2140; +mul.f32 f1218, f1057, 0fBED9FFBE; +fma.rn.f32 f1219, f1040, 0f3F67A2BF, f1218; +mul.f32 f1221, f1099, 0f3E00575B; +mul.f32 f2138, f1082, 0fBF7DFB3B; +sub.f32 f1222, f2138, f1221; +mul.f32 f1223, f1099, 0fBF7DFB3B; +fma.rn.f32 f1224, f1082, 0f3E00575B, f1223; +mul.f32 f1226, f1141, 0fBF45405B; +mul.f32 f2137, f1124, 0fBF232E38; +sub.f32 f1227, f2137, f1226; +mul.f32 f1228, f1141, 0fBF232E38; +fma.rn.f32 f1229, f1124, 0fBF45405B, f1228; +add.f32 f1230, f985, f1111; +add.f32 f1232, f1027, f1069; +fma.rn.f32 f2135, f1230, 0f3E9E377A, f943; +mul.f32 f2136, f1232, 0f3F4F1BBD; +sub.f32 f1238, f2135, f2136; +add.f32 f2134, f989, f1115; +sub.f32 f1239, f989, f1115; +add.f32 f2133, f1031, f1073; +sub.f32 f1241, f1031, f1073; +mul.f32 f2132, f1239, 0f3F737871; +fma.rn.f32 f1242, f1241, 0f3F167918, f2132; +sub.f32 f1243, f1238, f1242; +add.f32 f1244, f1242, f1238; +add.f32 f2131, f943, f1230; +mul.f32 f1245, f1230, 0f3F4F1BBD; +sub.f32 f1246, f943, f1245; +fma.rn.f32 f1247, f1232, 0f3E9E377A, f1246; +mul.f32 f1248, f1239, 0f3F167918; +mul.f32 f1249, f1241, 0f3F737871; +sub.f32 f1250, f1248, f1249; +sub.f32 f1251, f1247, f1250; +add.f32 f1252, f1250, f1247; +fma.rn.f32 f2129, f2134, 0f3E9E377A, f947; +mul.f32 f2130, f2133, 0f3F4F1BBD; +sub.f32 f1255, f2129, f2130; +sub.f32 f1256, f985, f1111; +sub.f32 f1258, f1027, f1069; +mul.f32 f2128, f1256, 0f3F737871; +fma.rn.f32 f1259, f1258, 0f3F167918, f2128; +add.f32 f1260, f1259, f1255; +sub.f32 f1261, f1255, f1259; +add.f32 f2127, f947, f2134; +mul.f32 f1262, f2134, 0f3F4F1BBD; +sub.f32 f1263, f947, f1262; +fma.rn.f32 f1264, f2133, 0f3E9E377A, f1263; +mul.f32 f1265, f1256, 0f3F167918; +mul.f32 f1266, f1258, 0f3F737871; +sub.f32 f1267, f1265, f1266; +add.f32 f1268, f1267, f1264; +sub.f32 f1269, f1264, f1267; +add.f32 f1270, f1152, f1167; +add.f32 f1272, f1157, f1162; +add.f32 f2126, f955, f1270; +add.f32 f1273, f1272, f2126; +add.f32 f1274, f1154, f1169; +add.f32 f1276, f1159, f1164; +add.f32 f2125, f972, f1274; 
+add.f32 f1277, f1276, f2125; +mul.f32 f1279, f1272, 0f3F4F1BBD; +fma.rn.f32 f2124, f1270, 0f3E9E377A, f955; +sub.f32 f1280, f2124, f1279; +sub.f32 f1281, f1154, f1169; +sub.f32 f1283, f1159, f1164; +mul.f32 f2123, f1281, 0f3F737871; +fma.rn.f32 f1284, f1283, 0f3F167918, f2123; +sub.f32 f1285, f1280, f1284; +add.f32 f1286, f1284, f1280; +mul.f32 f1287, f1270, 0f3F4F1BBD; +sub.f32 f1288, f955, f1287; +fma.rn.f32 f1289, f1272, 0f3E9E377A, f1288; +mul.f32 f1290, f1281, 0f3F167918; +mul.f32 f1291, f1283, 0f3F737871; +sub.f32 f1292, f1290, f1291; +sub.f32 f1293, f1289, f1292; +add.f32 f1294, f1292, f1289; +fma.rn.f32 f2121, f1274, 0f3E9E377A, f972; +mul.f32 f2122, f1276, 0f3F4F1BBD; +sub.f32 f1297, f2121, f2122; +sub.f32 f1298, f1152, f1167; +sub.f32 f1300, f1157, f1162; +mul.f32 f2120, f1298, 0f3F737871; +fma.rn.f32 f1301, f1300, 0f3F167918, f2120; +add.f32 f1302, f1301, f1297; +sub.f32 f1303, f1297, f1301; +mul.f32 f1304, f1274, 0f3F4F1BBD; +sub.f32 f1305, f972, f1304; +fma.rn.f32 f1306, f1276, 0f3E9E377A, f1305; +mul.f32 f1307, f1298, 0f3F167918; +mul.f32 f1308, f1300, 0f3F737871; +sub.f32 f1309, f1307, f1308; +add.f32 f1310, f1309, f1306; +sub.f32 f1311, f1306, f1309; +add.f32 f1312, f1172, f1187; +add.f32 f1314, f1177, f1182; +add.f32 f2119, f963, f1312; +add.f32 f1315, f1314, f2119; +add.f32 f1316, f1174, f1189; +add.f32 f1318, f1179, f1184; +add.f32 f2118, f980, f1316; +add.f32 f1319, f1318, f2118; +mul.f32 f1321, f1314, 0f3F4F1BBD; +fma.rn.f32 f2117, f1312, 0f3E9E377A, f963; +sub.f32 f1322, f2117, f1321; +sub.f32 f1323, f1174, f1189; +sub.f32 f1325, f1179, f1184; +mul.f32 f2116, f1323, 0f3F737871; +fma.rn.f32 f1326, f1325, 0f3F167918, f2116; +sub.f32 f1327, f1322, f1326; +add.f32 f1328, f1326, f1322; +mul.f32 f1329, f1312, 0f3F4F1BBD; +sub.f32 f1330, f963, f1329; +fma.rn.f32 f1331, f1314, 0f3E9E377A, f1330; +mul.f32 f1332, f1323, 0f3F167918; +mul.f32 f1333, f1325, 0f3F737871; +sub.f32 f1334, f1332, f1333; +sub.f32 f1335, f1331, f1334; +add.f32 f1336, f1334, 
f1331; +mul.f32 f1338, f1318, 0f3F4F1BBD; +fma.rn.f32 f2115, f1316, 0f3E9E377A, f980; +sub.f32 f1339, f2115, f1338; +sub.f32 f1340, f1172, f1187; +sub.f32 f1342, f1177, f1182; +mul.f32 f2114, f1340, 0f3F737871; +fma.rn.f32 f1343, f1342, 0f3F167918, f2114; +add.f32 f1344, f1343, f1339; +sub.f32 f1345, f1339, f1343; +mul.f32 f1346, f1316, 0f3F4F1BBD; +sub.f32 f1347, f980, f1346; +fma.rn.f32 f1348, f1318, 0f3E9E377A, f1347; +mul.f32 f1349, f1340, 0f3F167918; +mul.f32 f1350, f1342, 0f3F737871; +sub.f32 f1351, f1349, f1350; +add.f32 f1352, f1351, f1348; +sub.f32 f1353, f1348, f1351; +add.f32 f1354, f1192, f1207; +add.f32 f1356, f1197, f1202; +add.f32 f2113, f964, f1354; +add.f32 f1357, f1356, f2113; +add.f32 f1358, f1194, f1209; +add.f32 f1360, f1199, f1204; +add.f32 f2112, f981, f1358; +add.f32 f1361, f1360, f2112; +fma.rn.f32 f2110, f1354, 0f3E9E377A, f964; +mul.f32 f2111, f1356, 0f3F4F1BBD; +sub.f32 f1364, f2110, f2111; +sub.f32 f1365, f1194, f1209; +sub.f32 f1367, f1199, f1204; +mul.f32 f2109, f1365, 0f3F737871; +fma.rn.f32 f1368, f1367, 0f3F167918, f2109; +sub.f32 f1369, f1364, f1368; +add.f32 f1370, f1368, f1364; +mul.f32 f1371, f1354, 0f3F4F1BBD; +sub.f32 f1372, f964, f1371; +fma.rn.f32 f1373, f1356, 0f3E9E377A, f1372; +mul.f32 f1374, f1365, 0f3F167918; +mul.f32 f1375, f1367, 0f3F737871; +sub.f32 f1376, f1374, f1375; +sub.f32 f1377, f1373, f1376; +add.f32 f1378, f1376, f1373; +mul.f32 f1380, f1360, 0f3F4F1BBD; +fma.rn.f32 f2108, f1358, 0f3E9E377A, f981; +sub.f32 f1381, f2108, f1380; +sub.f32 f1382, f1192, f1207; +sub.f32 f1384, f1197, f1202; +mul.f32 f2107, f1382, 0f3F737871; +fma.rn.f32 f1385, f1384, 0f3F167918, f2107; +add.f32 f1386, f1385, f1381; +sub.f32 f1387, f1381, f1385; +mul.f32 f1388, f1358, 0f3F4F1BBD; +sub.f32 f1389, f981, f1388; +fma.rn.f32 f1390, f1360, 0f3E9E377A, f1389; +mul.f32 f1391, f1382, 0f3F167918; +mul.f32 f1392, f1384, 0f3F737871; +sub.f32 f1393, f1391, f1392; +add.f32 f1394, f1393, f1390; +sub.f32 f1395, f1390, f1393; +add.f32 f1396, 
f1212, f1227; +add.f32 f1398, f1217, f1222; +add.f32 f2106, f956, f1396; +add.f32 f1399, f1398, f2106; +add.f32 f1400, f1214, f1229; +add.f32 f1402, f1219, f1224; +add.f32 f2105, f973, f1400; +add.f32 f1403, f1402, f2105; +fma.rn.f32 f2103, f1396, 0f3E9E377A, f956; +mul.f32 f2104, f1398, 0f3F4F1BBD; +sub.f32 f1406, f2103, f2104; +sub.f32 f1407, f1214, f1229; +sub.f32 f1409, f1219, f1224; +mul.f32 f2102, f1407, 0f3F737871; +fma.rn.f32 f1410, f1409, 0f3F167918, f2102; +sub.f32 f1411, f1406, f1410; +add.f32 f1412, f1410, f1406; +mul.f32 f1413, f1396, 0f3F4F1BBD; +sub.f32 f1414, f956, f1413; +fma.rn.f32 f1415, f1398, 0f3E9E377A, f1414; +mul.f32 f1416, f1407, 0f3F167918; +mul.f32 f1417, f1409, 0f3F737871; +sub.f32 f1418, f1416, f1417; +sub.f32 f1419, f1415, f1418; +add.f32 f1420, f1418, f1415; +fma.rn.f32 f2100, f1400, 0f3E9E377A, f973; +mul.f32 f2101, f1402, 0f3F4F1BBD; +sub.f32 f1423, f2100, f2101; +sub.f32 f1424, f1212, f1227; +sub.f32 f1426, f1217, f1222; +mul.f32 f2099, f1424, 0f3F737871; +fma.rn.f32 f1427, f1426, 0f3F167918, f2099; +add.f32 f1428, f1427, f1423; +sub.f32 f1429, f1423, f1427; +mul.f32 f1430, f1400, 0f3F4F1BBD; +sub.f32 f1431, f973, f1430; +fma.rn.f32 f1432, f1402, 0f3E9E377A, f1431; +mul.f32 f1433, f1424, 0f3F167918; +mul.f32 f1434, f1426, 0f3F737871; +sub.f32 f1435, f1433, f1434; +add.f32 f1436, f1435, f1432; +sub.f32 f1437, f1432, f1435; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mov.u64 rd10, %52; +mul.wide.u32 rd12, r11, 8; +add.s64 rd11, rd10, rd12; +ld.global.v2.f32 {f1438, f1439}, [rd11]; +mul.f32 f1442, f1277, f1439; +mul.f32 f1444, f1438, f1277; +mul.f32 f1446, f1439, f1439; +mul.f32 f2098, f1438, f1438; +sub.f32 f1447, f2098, f1446; +mul.f32 f1448, f1439, f1438; +fma.rn.f32 f1449, f1439, f1438, f1448; +mul.f32 f1450, f1319, f1449; +mul.f32 f1452, f1447, f1319; +mul.f32 f2096, f1438, f1447; +mul.f32 f2097, f1439, f1449; +sub.f32 f1455, f2096, f2097; 
+mul.f32 f2095, f1315, f1449; +mul.f32 f1456, f1438, f1449; +fma.rn.f32 f1457, f1439, f1447, f1456; +mul.f32 f1458, f1361, f1457; +mul.f32 f1460, f1455, f1361; +mul.f32 f1462, f1439, f1457; +mul.f32 f2094, f1438, f1455; +sub.f32 f1463, f2094, f1462; +mul.f32 f2093, f1357, f1457; +mul.f32 f1464, f1438, f1457; +fma.rn.f32 f1465, f1439, f1455, f1464; +mul.f32 f1466, f1403, f1465; +mul.f32 f1468, f1463, f1403; +mul.f32 f1470, f1439, f1465; +mul.f32 f2092, f1438, f1463; +sub.f32 f1471, f2092, f1470; +mul.f32 f2091, f1399, f1465; +mul.f32 f1472, f1438, f1465; +fma.rn.f32 f1473, f1439, f1463, f1472; +mul.f32 f1474, f1260, f1473; +mul.f32 f1476, f1471, f1260; +mul.f32 f2089, f1438, f1471; +mul.f32 f2090, f1439, f1473; +sub.f32 f1479, f2089, f2090; +mul.f32 f2088, f1243, f1473; +mul.f32 f1480, f1438, f1473; +fma.rn.f32 f1481, f1439, f1471, f1480; +mul.f32 f1482, f1302, f1481; +mul.f32 f1484, f1479, f1302; +mul.f32 f1486, f1439, f1481; +mul.f32 f2087, f1438, f1479; +sub.f32 f1487, f2087, f1486; +mul.f32 f2086, f1285, f1481; +mul.f32 f1488, f1438, f1481; +fma.rn.f32 f1489, f1439, f1479, f1488; +mul.f32 f1490, f1344, f1489; +mul.f32 f1492, f1487, f1344; +mul.f32 f1494, f1439, f1489; +mul.f32 f2085, f1438, f1487; +sub.f32 f1495, f2085, f1494; +mul.f32 f2084, f1327, f1489; +mul.f32 f1496, f1438, f1489; +fma.rn.f32 f1497, f1439, f1487, f1496; +mul.f32 f1498, f1386, f1497; +mul.f32 f1500, f1495, f1386; +mul.f32 f2082, f1438, f1495; +mul.f32 f2083, f1439, f1497; +sub.f32 f1503, f2082, f2083; +mul.f32 f2081, f1369, f1497; +mul.f32 f1504, f1438, f1497; +fma.rn.f32 f1505, f1439, f1495, f1504; +mul.f32 f1506, f1428, f1505; +mul.f32 f1508, f1503, f1428; +mul.f32 f1510, f1439, f1505; +mul.f32 f2080, f1438, f1503; +sub.f32 f1511, f2080, f1510; +mul.f32 f2079, f1411, f1505; +mul.f32 f1512, f1438, f1505; +fma.rn.f32 f1513, f1439, f1503, f1512; +mul.f32 f1514, f1268, f1513; +mul.f32 f1516, f1511, f1268; +mul.f32 f2077, f1438, f1511; +mul.f32 f2078, f1439, f1513; +sub.f32 f1519, f2077, f2078; 
+mul.f32 f2076, f1251, f1513; +mul.f32 f1520, f1438, f1513; +fma.rn.f32 f1521, f1439, f1511, f1520; +mul.f32 f1522, f1310, f1521; +mul.f32 f1524, f1519, f1310; +mul.f32 f1526, f1439, f1521; +mul.f32 f2075, f1438, f1519; +sub.f32 f1527, f2075, f1526; +mul.f32 f2074, f1293, f1521; +mul.f32 f1528, f1438, f1521; +fma.rn.f32 f1529, f1439, f1519, f1528; +mul.f32 f1530, f1352, f1529; +mul.f32 f1532, f1527, f1352; +mul.f32 f1534, f1439, f1529; +mul.f32 f2073, f1438, f1527; +sub.f32 f1535, f2073, f1534; +mul.f32 f2072, f1335, f1529; +mul.f32 f1536, f1438, f1529; +fma.rn.f32 f1537, f1439, f1527, f1536; +mul.f32 f1538, f1394, f1537; +mul.f32 f1540, f1535, f1394; +mul.f32 f2070, f1438, f1535; +mul.f32 f2071, f1439, f1537; +sub.f32 f1543, f2070, f2071; +mul.f32 f2069, f1377, f1537; +mul.f32 f1544, f1438, f1537; +fma.rn.f32 f1545, f1439, f1535, f1544; +mul.f32 f1546, f1436, f1545; +mul.f32 f1548, f1543, f1436; +mul.f32 f1550, f1439, f1545; +mul.f32 f2068, f1438, f1543; +sub.f32 f1551, f2068, f1550; +mul.f32 f2067, f1419, f1545; +mul.f32 f1552, f1438, f1545; +fma.rn.f32 f1553, f1439, f1543, f1552; +mul.f32 f1554, f1269, f1553; +mul.f32 f1556, f1551, f1269; +mul.f32 f1558, f1439, f1553; +mul.f32 f2066, f1438, f1551; +sub.f32 f1559, f2066, f1558; +mul.f32 f2065, f1252, f1553; +mul.f32 f1560, f1438, f1553; +fma.rn.f32 f1561, f1439, f1551, f1560; +mul.f32 f1562, f1311, f1561; +mul.f32 f1564, f1559, f1311; +mul.f32 f2063, f1438, f1559; +mul.f32 f2064, f1439, f1561; +sub.f32 f1567, f2063, f2064; +mul.f32 f2062, f1294, f1561; +mul.f32 f1568, f1438, f1561; +fma.rn.f32 f1569, f1439, f1559, f1568; +mul.f32 f1570, f1353, f1569; +mul.f32 f1572, f1567, f1353; +mul.f32 f1574, f1439, f1569; +mul.f32 f2061, f1438, f1567; +sub.f32 f1575, f2061, f1574; +mul.f32 f2060, f1336, f1569; +mul.f32 f1576, f1438, f1569; +fma.rn.f32 f1577, f1439, f1567, f1576; +mul.f32 f1578, f1395, f1577; +mul.f32 f1580, f1575, f1395; +mul.f32 f2058, f1438, f1575; +mul.f32 f2059, f1439, f1577; +sub.f32 f1583, f2058, f2059; 
+mul.f32 f2057, f1378, f1577; +mul.f32 f1584, f1438, f1577; +fma.rn.f32 f1585, f1439, f1575, f1584; +mul.f32 f1586, f1437, f1585; +mul.f32 f1588, f1583, f1437; +mul.f32 f1590, f1439, f1585; +mul.f32 f2056, f1438, f1583; +sub.f32 f1591, f2056, f1590; +mul.f32 f2055, f1420, f1585; +mul.f32 f1592, f1438, f1585; +fma.rn.f32 f1593, f1439, f1583, f1592; +mul.f32 f1594, f1261, f1593; +mul.f32 f1596, f1591, f1261; +mul.f32 f1598, f1439, f1593; +mul.f32 f2054, f1438, f1591; +sub.f32 f1599, f2054, f1598; +mul.f32 f2053, f1244, f1593; +mul.f32 f1600, f1438, f1593; +fma.rn.f32 f1601, f1439, f1591, f1600; +mul.f32 f1602, f1303, f1601; +mul.f32 f1604, f1599, f1303; +mul.f32 f2051, f1438, f1599; +mul.f32 f2052, f1439, f1601; +sub.f32 f1607, f2051, f2052; +mul.f32 f2050, f1286, f1601; +mul.f32 f1608, f1438, f1601; +fma.rn.f32 f1609, f1439, f1599, f1608; +mul.f32 f1610, f1345, f1609; +mul.f32 f1612, f1607, f1345; +mul.f32 f1614, f1439, f1609; +mul.f32 f2049, f1438, f1607; +sub.f32 f1615, f2049, f1614; +mul.f32 f2048, f1328, f1609; +mul.f32 f1616, f1438, f1609; +fma.rn.f32 f1617, f1439, f1607, f1616; +mul.f32 f1618, f1387, f1617; +mul.f32 f1620, f1615, f1387; +mul.f32 f1622, f1439, f1617; +mul.f32 f2047, f1438, f1615; +sub.f32 f1623, f2047, f1622; +mul.f32 f2046, f1370, f1617; +mul.f32 f1624, f1438, f1617; +mul.f32 f2045, f1273, f1439; +fma.rn.f32 f1625, f1439, f1615, f1624; +mul.f32 f1626, f1429, f1625; +mul.f32 f1627, f1412, f1625; +mul.f32 f1628, f1623, f1429; +shl.b32 r14, r13, 3; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 5000, r15; +add.f32 f1629, f2133, f2127; +add.f32 f1630, f1232, f2131; +st.shared.v2.f32 [r16], {f1630, f1629}; +fma.rn.f32 f1631, f1438, f1273, f1442; +sub.f32 f1632, f1444, f2045; +st.shared.v2.f32 [r16+200], {f1631, f1632}; +fma.rn.f32 f1633, f1447, f1315, f1450; +sub.f32 f1634, f1452, f2095; +st.shared.v2.f32 [r16+400], {f1633, f1634}; +fma.rn.f32 f1635, f1455, f1357, f1458; +sub.f32 f1636, f1460, f2093; +st.shared.v2.f32 [r16+600], 
{f1635, f1636}; +fma.rn.f32 f1637, f1463, f1399, f1466; +sub.f32 f1638, f1468, f2091; +st.shared.v2.f32 [r16+800], {f1637, f1638}; +fma.rn.f32 f1639, f1471, f1243, f1474; +sub.f32 f1640, f1476, f2088; +st.shared.v2.f32 [r16+1000], {f1639, f1640}; +fma.rn.f32 f1641, f1479, f1285, f1482; +sub.f32 f1642, f1484, f2086; +st.shared.v2.f32 [r16+1200], {f1641, f1642}; +fma.rn.f32 f1643, f1487, f1327, f1490; +sub.f32 f1644, f1492, f2084; +st.shared.v2.f32 [r16+1400], {f1643, f1644}; +fma.rn.f32 f1645, f1495, f1369, f1498; +sub.f32 f1646, f1500, f2081; +st.shared.v2.f32 [r16+1600], {f1645, f1646}; +sub.f32 f1647, f1508, f2079; +fma.rn.f32 f1648, f1503, f1411, f1506; +st.shared.v2.f32 [r16+1800], {f1648, f1647}; +fma.rn.f32 f1649, f1511, f1251, f1514; +sub.f32 f1650, f1516, f2076; +st.shared.v2.f32 [r16+2000], {f1649, f1650}; +fma.rn.f32 f1651, f1519, f1293, f1522; +sub.f32 f1652, f1524, f2074; +st.shared.v2.f32 [r16+2200], {f1651, f1652}; +fma.rn.f32 f1653, f1527, f1335, f1530; +sub.f32 f1654, f1532, f2072; +st.shared.v2.f32 [r16+2400], {f1653, f1654}; +fma.rn.f32 f1655, f1535, f1377, f1538; +sub.f32 f1656, f1540, f2069; +st.shared.v2.f32 [r16+2600], {f1655, f1656}; +fma.rn.f32 f1657, f1543, f1419, f1546; +sub.f32 f1658, f1548, f2067; +st.shared.v2.f32 [r16+2800], {f1657, f1658}; +fma.rn.f32 f1659, f1551, f1252, f1554; +sub.f32 f1660, f1556, f2065; +st.shared.v2.f32 [r16+3000], {f1659, f1660}; +fma.rn.f32 f1661, f1559, f1294, f1562; +sub.f32 f1662, f1564, f2062; +st.shared.v2.f32 [r16+3200], {f1661, f1662}; +fma.rn.f32 f1663, f1567, f1336, f1570; +sub.f32 f1664, f1572, f2060; +st.shared.v2.f32 [r16+3400], {f1663, f1664}; +fma.rn.f32 f1665, f1575, f1378, f1578; +sub.f32 f1666, f1580, f2057; +st.shared.v2.f32 [r16+3600], {f1665, f1666}; +fma.rn.f32 f1667, f1583, f1420, f1586; +sub.f32 f1668, f1588, f2055; +st.shared.v2.f32 [r16+3800], {f1667, f1668}; +fma.rn.f32 f1669, f1591, f1244, f1594; +sub.f32 f1670, f1596, f2053; +st.shared.v2.f32 [r16+4000], {f1669, f1670}; +fma.rn.f32 
f1671, f1599, f1286, f1602; +sub.f32 f1672, f1604, f2050; +st.shared.v2.f32 [r16+4200], {f1671, f1672}; +fma.rn.f32 f1673, f1607, f1328, f1610; +sub.f32 f1674, f1612, f2048; +st.shared.v2.f32 [r16+4400], {f1673, f1674}; +fma.rn.f32 f1675, f1615, f1370, f1618; +sub.f32 f1676, f1620, f2046; +st.shared.v2.f32 [r16+4600], {f1675, f1676}; +fma.rn.f32 f1677, f1623, f1412, f1626; +sub.f32 f1678, f1628, f1627; +st.shared.v2.f32 [r16+4800], {f1677, f1678}; +barrier.sync 0; +ld.shared.v2.f32 {f1679, f1680}, [r10]; +ld.shared.v2.f32 {f1683, f1684}, [r10+1000]; +ld.shared.v2.f32 {f1687, f1688}, [r10+2000]; +ld.shared.v2.f32 {f1691, f1692}, [r10+3000]; +ld.shared.v2.f32 {f1695, f1696}, [r10+4000]; +ld.shared.v2.f32 {f1699, f1700}, [r10+5000]; +ld.shared.v2.f32 {f1703, f1704}, [r10+6000]; +ld.shared.v2.f32 {f1707, f1708}, [r10+7000]; +ld.shared.v2.f32 {f1711, f1712}, [r10+8000]; +ld.shared.v2.f32 {f1715, f1716}, [r10+9000]; +ld.shared.v2.f32 {f1719, f1720}, [r10+10000]; +ld.shared.v2.f32 {f1723, f1724}, [r10+11000]; +ld.shared.v2.f32 {f1727, f1728}, [r10+12000]; +ld.shared.v2.f32 {f1731, f1732}, [r10+13000]; +ld.shared.v2.f32 {f1735, f1736}, [r10+14000]; +ld.shared.v2.f32 {f1739, f1740}, [r10+15000]; +ld.shared.v2.f32 {f1743, f1744}, [r10+16000]; +ld.shared.v2.f32 {f1747, f1748}, [r10+17000]; +ld.shared.v2.f32 {f1751, f1752}, [r10+18000]; +ld.shared.v2.f32 {f1755, f1756}, [r10+19000]; +ld.shared.v2.f32 {f1759, f1760}, [r10+20000]; +ld.shared.v2.f32 {f1763, f1764}, [r10+21000]; +ld.shared.v2.f32 {f1767, f1768}, [r10+22000]; +ld.shared.v2.f32 {f1771, f1772}, [r10+23000]; +ld.shared.v2.f32 {f1775, f1776}, [r10+24000]; +add.f32 f1779, f1699, f1759; +add.f32 f1781, f1719, f1739; +mul.f32 f1786, f1781, 0f3F4F1BBD; +fma.rn.f32 f2044, f1779, 0f3E9E377A, f1679; +sub.f32 f1787, f2044, f1786; +add.f32 f2043, f1700, f1760; +sub.f32 f1788, f1700, f1760; +add.f32 f2042, f1720, f1740; +sub.f32 f1790, f1720, f1740; +mul.f32 f2041, f1788, 0f3F737871; +fma.rn.f32 f1791, f1790, 0f3F167918, f2041; 
+add.f32 f2040, f1679, f1779; +mul.f32 f1792, f1779, 0f3F4F1BBD; +sub.f32 f1793, f1679, f1792; +fma.rn.f32 f1794, f1781, 0f3E9E377A, f1793; +mul.f32 f1795, f1788, 0f3F167918; +mul.f32 f1796, f1790, 0f3F737871; +sub.f32 f1797, f1795, f1796; +mul.f32 f1799, f2042, 0f3F4F1BBD; +fma.rn.f32 f2039, f2043, 0f3E9E377A, f1680; +sub.f32 f1800, f2039, f1799; +sub.f32 f1801, f1699, f1759; +sub.f32 f1803, f1719, f1739; +mul.f32 f2038, f1801, 0f3F737871; +fma.rn.f32 f1804, f1803, 0f3F167918, f2038; +add.f32 f2037, f1680, f2043; +mul.f32 f1805, f2043, 0f3F4F1BBD; +sub.f32 f1806, f1680, f1805; +fma.rn.f32 f1807, f2042, 0f3E9E377A, f1806; +mul.f32 f1808, f1801, 0f3F167918; +mul.f32 f1809, f1803, 0f3F737871; +sub.f32 f1810, f1808, f1809; +add.f32 f1811, f1703, f1763; +add.f32 f1813, f1723, f1743; +mul.f32 f1818, f1813, 0f3F4F1BBD; +fma.rn.f32 f2036, f1811, 0f3E9E377A, f1683; +sub.f32 f1819, f2036, f1818; +add.f32 f2035, f1704, f1764; +sub.f32 f1820, f1704, f1764; +add.f32 f2034, f1724, f1744; +sub.f32 f1822, f1724, f1744; +mul.f32 f2033, f1820, 0f3F737871; +fma.rn.f32 f1823, f1822, 0f3F167918, f2033; +add.f32 f2032, f1683, f1811; +mul.f32 f1824, f1811, 0f3F4F1BBD; +sub.f32 f1825, f1683, f1824; +fma.rn.f32 f1826, f1813, 0f3E9E377A, f1825; +mul.f32 f1827, f1820, 0f3F167918; +mul.f32 f1828, f1822, 0f3F737871; +sub.f32 f1829, f1827, f1828; +fma.rn.f32 f2030, f2035, 0f3E9E377A, f1684; +mul.f32 f2031, f2034, 0f3F4F1BBD; +sub.f32 f1832, f2030, f2031; +sub.f32 f1833, f1703, f1763; +sub.f32 f1835, f1723, f1743; +mul.f32 f2029, f1833, 0f3F737871; +fma.rn.f32 f1836, f1835, 0f3F167918, f2029; +add.f32 f2028, f1684, f2035; +mul.f32 f1837, f2035, 0f3F4F1BBD; +sub.f32 f1838, f1684, f1837; +fma.rn.f32 f1839, f2034, 0f3E9E377A, f1838; +mul.f32 f1840, f1833, 0f3F167918; +mul.f32 f1841, f1835, 0f3F737871; +sub.f32 f1842, f1840, f1841; +add.f32 f1843, f1707, f1767; +add.f32 f1845, f1727, f1747; +mul.f32 f1850, f1845, 0f3F4F1BBD; +fma.rn.f32 f2027, f1843, 0f3E9E377A, f1687; +sub.f32 f1851, f2027, f1850; 
+add.f32 f2026, f1708, f1768; +sub.f32 f1852, f1708, f1768; +add.f32 f2025, f1728, f1748; +sub.f32 f1854, f1728, f1748; +mul.f32 f2024, f1852, 0f3F737871; +fma.rn.f32 f1855, f1854, 0f3F167918, f2024; +add.f32 f2023, f1687, f1843; +mul.f32 f1856, f1843, 0f3F4F1BBD; +sub.f32 f1857, f1687, f1856; +fma.rn.f32 f1858, f1845, 0f3E9E377A, f1857; +mul.f32 f1859, f1852, 0f3F167918; +mul.f32 f1860, f1854, 0f3F737871; +sub.f32 f1861, f1859, f1860; +mul.f32 f1863, f2025, 0f3F4F1BBD; +fma.rn.f32 f2022, f2026, 0f3E9E377A, f1688; +sub.f32 f1864, f2022, f1863; +sub.f32 f1865, f1707, f1767; +sub.f32 f1867, f1727, f1747; +mul.f32 f2021, f1865, 0f3F737871; +fma.rn.f32 f1868, f1867, 0f3F167918, f2021; +add.f32 f2020, f1688, f2026; +mul.f32 f1869, f2026, 0f3F4F1BBD; +sub.f32 f1870, f1688, f1869; +fma.rn.f32 f1871, f2025, 0f3E9E377A, f1870; +mul.f32 f1872, f1865, 0f3F167918; +mul.f32 f1873, f1867, 0f3F737871; +sub.f32 f1874, f1872, f1873; +add.f32 f1875, f1711, f1771; +add.f32 f1877, f1731, f1751; +mul.f32 f1882, f1877, 0f3F4F1BBD; +fma.rn.f32 f2019, f1875, 0f3E9E377A, f1691; +sub.f32 f1883, f2019, f1882; +add.f32 f2018, f1712, f1772; +sub.f32 f1884, f1712, f1772; +add.f32 f2017, f1732, f1752; +sub.f32 f1886, f1732, f1752; +mul.f32 f2016, f1884, 0f3F737871; +fma.rn.f32 f1887, f1886, 0f3F167918, f2016; +add.f32 f2015, f1691, f1875; +mul.f32 f1888, f1875, 0f3F4F1BBD; +sub.f32 f1889, f1691, f1888; +fma.rn.f32 f1890, f1877, 0f3E9E377A, f1889; +mul.f32 f1891, f1884, 0f3F167918; +mul.f32 f1892, f1886, 0f3F737871; +sub.f32 f1893, f1891, f1892; +fma.rn.f32 f2013, f2018, 0f3E9E377A, f1692; +mul.f32 f2014, f2017, 0f3F4F1BBD; +sub.f32 f1896, f2013, f2014; +sub.f32 f1897, f1711, f1771; +sub.f32 f1899, f1731, f1751; +mul.f32 f2012, f1897, 0f3F737871; +fma.rn.f32 f1900, f1899, 0f3F167918, f2012; +add.f32 f2011, f1692, f2018; +mul.f32 f1901, f2018, 0f3F4F1BBD; +sub.f32 f1902, f1692, f1901; +fma.rn.f32 f1903, f2017, 0f3E9E377A, f1902; +mul.f32 f1904, f1897, 0f3F167918; +mul.f32 f1905, f1899, 0f3F737871; 
+sub.f32 f1906, f1904, f1905; +add.f32 f1907, f1715, f1775; +add.f32 f1909, f1735, f1755; +mul.f32 f1914, f1909, 0f3F4F1BBD; +fma.rn.f32 f2010, f1907, 0f3E9E377A, f1695; +sub.f32 f1915, f2010, f1914; +add.f32 f2009, f1716, f1776; +sub.f32 f1916, f1716, f1776; +add.f32 f2008, f1736, f1756; +sub.f32 f1918, f1736, f1756; +mul.f32 f2007, f1916, 0f3F737871; +fma.rn.f32 f1919, f1918, 0f3F167918, f2007; +add.f32 f2006, f1695, f1907; +mul.f32 f1920, f1907, 0f3F4F1BBD; +sub.f32 f1921, f1695, f1920; +fma.rn.f32 f1922, f1909, 0f3E9E377A, f1921; +mul.f32 f1923, f1916, 0f3F167918; +mul.f32 f1924, f1918, 0f3F737871; +sub.f32 f1925, f1923, f1924; +mul.f32 f1927, f2008, 0f3F4F1BBD; +fma.rn.f32 f2005, f2009, 0f3E9E377A, f1696; +sub.f32 f1928, f2005, f1927; +sub.f32 f1929, f1715, f1775; +sub.f32 f1931, f1735, f1755; +mul.f32 f2004, f1929, 0f3F737871; +fma.rn.f32 f1932, f1931, 0f3F167918, f2004; +add.f32 f2003, f1696, f2009; +mul.f32 f1933, f2009, 0f3F4F1BBD; +sub.f32 f1934, f1696, f1933; +fma.rn.f32 f1935, f2008, 0f3E9E377A, f1934; +mul.f32 f1936, f1929, 0f3F167918; +mul.f32 f1937, f1931, 0f3F737871; +sub.f32 f1938, f1936, f1937; +add.f32 %1, f2042, f2037; +add.f32 %0, f1781, f2040; +add.f32 %3, f2034, f2028; +add.f32 %2, f1813, f2032; +add.f32 %5, f2025, f2020; +add.f32 %4, f1845, f2023; +add.f32 %7, f2017, f2011; +add.f32 %6, f1877, f2015; +add.f32 %9, f2008, f2003; +add.f32 %8, f1909, f2006; +sub.f32 %10, f1787, f1791; +add.f32 %11, f1804, f1800; +sub.f32 %12, f1819, f1823; +add.f32 %13, f1836, f1832; +add.f32 %15, f1868, f1864; +sub.f32 %14, f1851, f1855; +add.f32 %17, f1900, f1896; +sub.f32 %16, f1883, f1887; +sub.f32 %18, f1915, f1919; +add.f32 %19, f1932, f1928; +sub.f32 %20, f1794, f1797; +add.f32 %21, f1810, f1807; +sub.f32 %22, f1826, f1829; +add.f32 %23, f1842, f1839; +sub.f32 %24, f1858, f1861; +add.f32 %25, f1874, f1871; +add.f32 %27, f1906, f1903; +sub.f32 %26, f1890, f1893; +add.f32 %29, f1938, f1935; +sub.f32 %28, f1922, f1925; +sub.f32 %31, f1807, f1810; +add.f32 
%30, f1797, f1794; +sub.f32 %33, f1839, f1842; +add.f32 %32, f1829, f1826; +sub.f32 %35, f1871, f1874; +add.f32 %34, f1861, f1858; +sub.f32 %37, f1903, f1906; +add.f32 %36, f1893, f1890; +sub.f32 %39, f1935, f1938; +add.f32 %38, f1925, f1922; +sub.f32 %41, f1800, f1804; +add.f32 %40, f1791, f1787; +sub.f32 %43, f1832, f1836; +add.f32 %42, f1823, f1819; +sub.f32 %45, f1864, f1868; +add.f32 %44, f1855, f1851; +sub.f32 %47, f1896, f1900; +add.f32 %46, f1887, f1883; +sub.f32 %49, f1928, f1932; +add.f32 %48, f1919, f1915; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_3125), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), 
"f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<368, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1889>; +.reg .b32 r<17>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 12500, r2; +add.f32 f101, %66, %106; +add.f32 f102, %53, f101; +add.f32 f103, %79, %93; +add.f32 f104, f103, f102; +add.f32 f105, %68, %108; +add.f32 f106, %54, f105; +add.f32 f107, %81, %94; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %53; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %68, %108; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %81, %94; +fma.rn.f32 f115, f114, 0f3F167918, f113; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %53, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +fma.rn.f32 f126, f105, 0f3E9E377A, %54; +mul.f32 f127, f107, 0f3F4F1BBD; +sub.f32 f128, f126, f127; +sub.f32 f129, %66, %106; +mul.f32 f130, f129, 0f3F737871; +sub.f32 f131, %79, %93; +fma.rn.f32 f132, f131, 0f3F167918, f130; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %54, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, 
f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %69, %109; +add.f32 f144, %55, f143; +add.f32 f145, %82, %95; +add.f32 f146, f145, f144; +add.f32 f147, %70, %110; +add.f32 f148, %57, f147; +add.f32 f149, %84, %97; +add.f32 f150, f149, f148; +fma.rn.f32 f151, f143, 0f3E9E377A, %55; +mul.f32 f152, f145, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %70, %110; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %84, %97; +fma.rn.f32 f157, f156, 0f3F167918, f155; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %55, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +fma.rn.f32 f168, f147, 0f3E9E377A, %57; +mul.f32 f169, f149, 0f3F4F1BBD; +sub.f32 f170, f168, f169; +sub.f32 f171, %69, %109; +mul.f32 f172, f171, 0f3F737871; +sub.f32 f173, %82, %95; +fma.rn.f32 f174, f173, 0f3F167918, f172; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %57, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %71, %111; +add.f32 f186, %58, f185; +add.f32 f187, %85, %98; +add.f32 f188, f187, f186; +add.f32 f189, %73, %113; +add.f32 f190, %60, f189; +add.f32 f191, %86, %100; +add.f32 f192, f191, f190; +fma.rn.f32 f193, f185, 0f3E9E377A, %58; +mul.f32 f194, f187, 0f3F4F1BBD; +sub.f32 f195, f193, f194; +sub.f32 f196, %73, %113; +mul.f32 f197, f196, 0f3F737871; +sub.f32 f198, %86, %100; +fma.rn.f32 f199, f198, 0f3F167918, f197; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %58, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 
f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f210, f189, 0f3E9E377A, %60; +mul.f32 f211, f191, 0f3F4F1BBD; +sub.f32 f212, f210, f211; +sub.f32 f213, %71, %111; +mul.f32 f214, f213, 0f3F737871; +sub.f32 f215, %85, %98; +fma.rn.f32 f216, f215, 0f3F167918, f214; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %60, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %74, %114; +add.f32 f228, %61, f227; +add.f32 f229, %87, %101; +add.f32 f230, f229, f228; +add.f32 f231, %76, %116; +add.f32 f232, %62, f231; +add.f32 f233, %89, %102; +add.f32 f234, f233, f232; +fma.rn.f32 f235, f227, 0f3E9E377A, %61; +mul.f32 f236, f229, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %76, %116; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %89, %102; +fma.rn.f32 f241, f240, 0f3F167918, f239; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %61, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +fma.rn.f32 f252, f231, 0f3E9E377A, %62; +mul.f32 f253, f233, 0f3F4F1BBD; +sub.f32 f254, f252, f253; +sub.f32 f255, %74, %114; +mul.f32 f256, f255, 0f3F737871; +sub.f32 f257, %87, %101; +fma.rn.f32 f258, f257, 0f3F167918, f256; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %62, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %77, %117; +add.f32 f270, %63, f269; +add.f32 
f271, %90, %103; +add.f32 f272, f271, f270; +add.f32 f273, %78, %118; +add.f32 f274, %65, f273; +add.f32 f275, %92, %105; +add.f32 f276, f275, f274; +fma.rn.f32 f277, f269, 0f3E9E377A, %63; +mul.f32 f278, f271, 0f3F4F1BBD; +sub.f32 f279, f277, f278; +sub.f32 f280, %78, %118; +mul.f32 f281, f280, 0f3F737871; +sub.f32 f282, %92, %105; +fma.rn.f32 f283, f282, 0f3F167918, f281; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %63, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +fma.rn.f32 f294, f273, 0f3E9E377A, %65; +mul.f32 f295, f275, 0f3F4F1BBD; +sub.f32 f296, f294, f295; +sub.f32 f297, %77, %117; +mul.f32 f298, f297, 0f3F737871; +sub.f32 f299, %90, %103; +fma.rn.f32 f300, f299, 0f3F167918, f298; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %65, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mov.u32 r4, %tid.x; +mul.f32 f311, f158, 0f3F77F511; +mul.f32 f312, f175, 0f3E7EA890; +sub.f32 f313, f311, f312; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f316, f200, 0f3F6055A2; +mul.f32 f317, f217, 0f3EF6A86B; +sub.f32 f318, f316, f317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f321, f242, 0f3F3A9DB0; +mul.f32 f322, f259, 0f3F2F3E7B; +sub.f32 f323, f321, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f326, f284, 0f3F092BF2; +mul.f32 f327, f301, 0f3F5825E0; +sub.f32 f328, f326, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f331, f166, 0f3F6055A2; +mul.f32 f332, f183, 0f3EF6A86B; +sub.f32 f333, f331, f332; 
+mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f336, f208, 0f3F092BF2; +mul.f32 f337, f225, 0f3F5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f341, f250, 0f3D809851; +mul.f32 f342, f267, 0f3F7F7EAE; +sub.f32 f343, f341, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f346, f292, 0fBED9FFBE; +mul.f32 f347, f309, 0f3F67A2BF; +sub.f32 f348, f346, f347; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f351, f167, 0f3F3A9DB0; +mul.f32 f352, f184, 0f3F2F3E7B; +sub.f32 f353, f351, f352; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f356, f209, 0f3D809851; +mul.f32 f357, f226, 0f3F7F7EAE; +sub.f32 f358, f356, f357; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f361, f251, 0fBF232E38; +mul.f32 f362, f268, 0f3F45405B; +sub.f32 f363, f361, f362; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f366, f293, 0fBF7DFB3B; +mul.f32 f367, f310, 0f3E00575B; +sub.f32 f368, f366, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f371, f159, 0f3F092BF2; +mul.f32 f372, f176, 0f3F5825E0; +sub.f32 f373, f371, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f376, f201, 0fBED9FFBE; +mul.f32 f377, f218, 0f3F67A2BF; +sub.f32 f378, f376, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f381, f243, 0fBF7DFB3B; +mul.f32 f382, f260, 0f3E00575B; +sub.f32 f383, f381, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f386, f285, 0fBF232E38; +mul.f32 f387, f302, 0fBF45405B; +sub.f32 f388, f386, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f392, f104, f391; +add.f32 f393, f188, 
f230; +add.f32 f394, f393, f392; +add.f32 f395, f150, f276; +add.f32 f396, f108, f395; +add.f32 f397, f192, f234; +add.f32 f398, f397, f396; +fma.rn.f32 f399, f391, 0f3E9E377A, f104; +mul.f32 f400, f393, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, f150, f276; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, f192, f234; +fma.rn.f32 f405, f404, 0f3F167918, f403; +sub.f32 f406, f401, f405; +add.f32 f407, f405, f401; +mul.f32 f408, f391, 0f3F4F1BBD; +sub.f32 f409, f104, f408; +fma.rn.f32 f410, f393, 0f3E9E377A, f409; +mul.f32 f411, f402, 0f3F167918; +mul.f32 f412, f404, 0f3F737871; +sub.f32 f413, f411, f412; +sub.f32 f414, f410, f413; +add.f32 f415, f413, f410; +fma.rn.f32 f416, f395, 0f3E9E377A, f108; +mul.f32 f417, f397, 0f3F4F1BBD; +sub.f32 f418, f416, f417; +sub.f32 f419, f146, f272; +mul.f32 f420, f419, 0f3F737871; +sub.f32 f421, f188, f230; +fma.rn.f32 f422, f421, 0f3F167918, f420; +add.f32 f423, f422, f418; +sub.f32 f424, f418, f422; +mul.f32 f425, f395, 0f3F4F1BBD; +sub.f32 f426, f108, f425; +fma.rn.f32 f427, f397, 0f3E9E377A, f426; +mul.f32 f428, f419, 0f3F167918; +mul.f32 f429, f421, 0f3F737871; +sub.f32 f430, f428, f429; +add.f32 f431, f430, f427; +sub.f32 f432, f427, f430; +add.f32 f433, f313, f328; +add.f32 f434, f116, f433; +add.f32 f435, f318, f323; +add.f32 f436, f435, f434; +add.f32 f437, f315, f330; +add.f32 f438, f133, f437; +add.f32 f439, f320, f325; +add.f32 f440, f439, f438; +fma.rn.f32 f441, f433, 0f3E9E377A, f116; +mul.f32 f442, f435, 0f3F4F1BBD; +sub.f32 f443, f441, f442; +sub.f32 f444, f315, f330; +mul.f32 f445, f444, 0f3F737871; +sub.f32 f446, f320, f325; +fma.rn.f32 f447, f446, 0f3F167918, f445; +sub.f32 f448, f443, f447; +add.f32 f449, f447, f443; +mul.f32 f450, f433, 0f3F4F1BBD; +sub.f32 f451, f116, f450; +fma.rn.f32 f452, f435, 0f3E9E377A, f451; +mul.f32 f453, f444, 0f3F167918; +mul.f32 f454, f446, 0f3F737871; +sub.f32 f455, f453, f454; +sub.f32 f456, f452, f455; +add.f32 f457, f455, f452; +fma.rn.f32 f458, f437, 0f3E9E377A, 
f133; +mul.f32 f459, f439, 0f3F4F1BBD; +sub.f32 f460, f458, f459; +sub.f32 f461, f313, f328; +mul.f32 f462, f461, 0f3F737871; +sub.f32 f463, f318, f323; +fma.rn.f32 f464, f463, 0f3F167918, f462; +add.f32 f465, f464, f460; +sub.f32 f466, f460, f464; +mul.f32 f467, f437, 0f3F4F1BBD; +sub.f32 f468, f133, f467; +fma.rn.f32 f469, f439, 0f3E9E377A, f468; +mul.f32 f470, f461, 0f3F167918; +mul.f32 f471, f463, 0f3F737871; +sub.f32 f472, f470, f471; +add.f32 f473, f472, f469; +sub.f32 f474, f469, f472; +add.f32 f475, f333, f348; +add.f32 f476, f124, f475; +add.f32 f477, f338, f343; +add.f32 f478, f477, f476; +add.f32 f479, f335, f350; +add.f32 f480, f141, f479; +add.f32 f481, f340, f345; +add.f32 f482, f481, f480; +fma.rn.f32 f483, f475, 0f3E9E377A, f124; +mul.f32 f484, f477, 0f3F4F1BBD; +sub.f32 f485, f483, f484; +sub.f32 f486, f335, f350; +mul.f32 f487, f486, 0f3F737871; +sub.f32 f488, f340, f345; +fma.rn.f32 f489, f488, 0f3F167918, f487; +sub.f32 f490, f485, f489; +add.f32 f491, f489, f485; +mul.f32 f492, f475, 0f3F4F1BBD; +sub.f32 f493, f124, f492; +fma.rn.f32 f494, f477, 0f3E9E377A, f493; +mul.f32 f495, f486, 0f3F167918; +mul.f32 f496, f488, 0f3F737871; +sub.f32 f497, f495, f496; +sub.f32 f498, f494, f497; +add.f32 f499, f497, f494; +fma.rn.f32 f500, f479, 0f3E9E377A, f141; +mul.f32 f501, f481, 0f3F4F1BBD; +sub.f32 f502, f500, f501; +sub.f32 f503, f333, f348; +mul.f32 f504, f503, 0f3F737871; +sub.f32 f505, f338, f343; +fma.rn.f32 f506, f505, 0f3F167918, f504; +add.f32 f507, f506, f502; +sub.f32 f508, f502, f506; +mul.f32 f509, f479, 0f3F4F1BBD; +sub.f32 f510, f141, f509; +fma.rn.f32 f511, f481, 0f3E9E377A, f510; +mul.f32 f512, f503, 0f3F167918; +mul.f32 f513, f505, 0f3F737871; +sub.f32 f514, f512, f513; +add.f32 f515, f514, f511; +sub.f32 f516, f511, f514; +add.f32 f517, f353, f368; +add.f32 f518, f125, f517; +add.f32 f519, f358, f363; +add.f32 f520, f519, f518; +add.f32 f521, f355, f370; +add.f32 f522, f142, f521; +add.f32 f523, f360, f365; +add.f32 f524, f523, f522; 
+fma.rn.f32 f525, f517, 0f3E9E377A, f125; +mul.f32 f526, f519, 0f3F4F1BBD; +sub.f32 f527, f525, f526; +sub.f32 f528, f355, f370; +mul.f32 f529, f528, 0f3F737871; +sub.f32 f530, f360, f365; +fma.rn.f32 f531, f530, 0f3F167918, f529; +sub.f32 f532, f527, f531; +add.f32 f533, f531, f527; +mul.f32 f534, f517, 0f3F4F1BBD; +sub.f32 f535, f125, f534; +fma.rn.f32 f536, f519, 0f3E9E377A, f535; +mul.f32 f537, f528, 0f3F167918; +mul.f32 f538, f530, 0f3F737871; +sub.f32 f539, f537, f538; +sub.f32 f540, f536, f539; +add.f32 f541, f539, f536; +fma.rn.f32 f542, f521, 0f3E9E377A, f142; +mul.f32 f543, f523, 0f3F4F1BBD; +sub.f32 f544, f542, f543; +sub.f32 f545, f353, f368; +mul.f32 f546, f545, 0f3F737871; +sub.f32 f547, f358, f363; +fma.rn.f32 f548, f547, 0f3F167918, f546; +add.f32 f549, f548, f544; +sub.f32 f550, f544, f548; +mul.f32 f551, f521, 0f3F4F1BBD; +sub.f32 f552, f142, f551; +fma.rn.f32 f553, f523, 0f3E9E377A, f552; +mul.f32 f554, f545, 0f3F167918; +mul.f32 f555, f547, 0f3F737871; +sub.f32 f556, f554, f555; +add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f373, f388; +add.f32 f560, f117, f559; +add.f32 f561, f378, f383; +add.f32 f562, f561, f560; +add.f32 f563, f375, f390; +add.f32 f564, f134, f563; +add.f32 f565, f380, f385; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f117; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f375, f390; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f380, f385; +fma.rn.f32 f573, f572, 0f3F167918, f571; +sub.f32 f574, f569, f573; +add.f32 f575, f573, f569; +mul.f32 f576, f559, 0f3F4F1BBD; +sub.f32 f577, f117, f576; +fma.rn.f32 f578, f561, 0f3E9E377A, f577; +mul.f32 f579, f570, 0f3F167918; +mul.f32 f580, f572, 0f3F737871; +sub.f32 f581, f579, f580; +sub.f32 f582, f578, f581; +add.f32 f583, f581, f578; +fma.rn.f32 f584, f563, 0f3E9E377A, f134; +mul.f32 f585, f565, 0f3F4F1BBD; +sub.f32 f586, f584, f585; +sub.f32 f587, f373, f388; +mul.f32 f588, f587, 0f3F737871; +sub.f32 f589, 
f378, f383; +fma.rn.f32 f590, f589, 0f3F167918, f588; +add.f32 f591, f590, f586; +sub.f32 f592, f586, f590; +mul.f32 f593, f563, 0f3F4F1BBD; +sub.f32 f594, f134, f593; +fma.rn.f32 f595, f565, 0f3E9E377A, f594; +mul.f32 f596, f587, 0f3F167918; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +add.f32 f599, f598, f595; +sub.f32 f600, f595, f598; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f601, f602}, [rd6]; +mul.f32 f605, f440, f602; +fma.rn.f32 f606, f601, f436, f605; +mul.f32 f607, f436, f602; +mul.f32 f608, f601, f440; +sub.f32 f609, f608, f607; +mul.f32 f610, f601, f601; +mul.f32 f611, f602, f602; +sub.f32 f612, f610, f611; +mul.f32 f613, f602, f601; +fma.rn.f32 f614, f602, f601, f613; +mul.f32 f615, f482, f614; +fma.rn.f32 f616, f612, f478, f615; +mul.f32 f617, f478, f614; +mul.f32 f618, f612, f482; +sub.f32 f619, f618, f617; +mul.f32 f620, f601, f612; +mul.f32 f621, f602, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f601, f614; +fma.rn.f32 f624, f602, f612, f623; +mul.f32 f625, f524, f624; +fma.rn.f32 f626, f622, f520, f625; +mul.f32 f627, f520, f624; +mul.f32 f628, f622, f524; +sub.f32 f629, f628, f627; +mul.f32 f630, f601, f622; +mul.f32 f631, f602, f624; +sub.f32 f632, f630, f631; +mul.f32 f633, f601, f624; +fma.rn.f32 f634, f602, f622, f633; +mul.f32 f635, f566, f634; +fma.rn.f32 f636, f632, f562, f635; +mul.f32 f637, f562, f634; +mul.f32 f638, f632, f566; +sub.f32 f639, f638, f637; +mul.f32 f640, f601, f632; +mul.f32 f641, f602, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f601, f634; +fma.rn.f32 f644, f602, f632, f643; +mul.f32 f645, f423, f644; +fma.rn.f32 f646, f642, f406, f645; +mul.f32 f647, f406, f644; +mul.f32 f648, f642, f423; +sub.f32 f649, f648, f647; +mul.f32 f650, f601, f642; +mul.f32 f651, f602, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f601, f644; 
+fma.rn.f32 f654, f602, f642, f653; +mul.f32 f655, f465, f654; +fma.rn.f32 f656, f652, f448, f655; +mul.f32 f657, f448, f654; +mul.f32 f658, f652, f465; +sub.f32 f659, f658, f657; +mul.f32 f660, f601, f652; +mul.f32 f661, f602, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f601, f654; +fma.rn.f32 f664, f602, f652, f663; +mul.f32 f665, f507, f664; +fma.rn.f32 f666, f662, f490, f665; +mul.f32 f667, f490, f664; +mul.f32 f668, f662, f507; +sub.f32 f669, f668, f667; +mul.f32 f670, f601, f662; +mul.f32 f671, f602, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f601, f664; +fma.rn.f32 f674, f602, f662, f673; +mul.f32 f675, f549, f674; +fma.rn.f32 f676, f672, f532, f675; +mul.f32 f677, f532, f674; +mul.f32 f678, f672, f549; +sub.f32 f679, f678, f677; +mul.f32 f680, f601, f672; +mul.f32 f681, f602, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f601, f674; +fma.rn.f32 f684, f602, f672, f683; +mul.f32 f685, f591, f684; +fma.rn.f32 f686, f682, f574, f685; +mul.f32 f687, f574, f684; +mul.f32 f688, f682, f591; +sub.f32 f689, f688, f687; +mul.f32 f690, f601, f682; +mul.f32 f691, f602, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f601, f684; +fma.rn.f32 f694, f602, f682, f693; +mul.f32 f695, f431, f694; +fma.rn.f32 f696, f692, f414, f695; +mul.f32 f697, f414, f694; +mul.f32 f698, f692, f431; +sub.f32 f699, f698, f697; +mul.f32 f700, f601, f692; +mul.f32 f701, f602, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f601, f694; +fma.rn.f32 f704, f602, f692, f703; +mul.f32 f705, f473, f704; +fma.rn.f32 f706, f702, f456, f705; +mul.f32 f707, f456, f704; +mul.f32 f708, f702, f473; +sub.f32 f709, f708, f707; +mul.f32 f710, f601, f702; +mul.f32 f711, f602, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f601, f704; +fma.rn.f32 f714, f602, f702, f713; +mul.f32 f715, f515, f714; +fma.rn.f32 f716, f712, f498, f715; +mul.f32 f717, f498, f714; +mul.f32 f718, f712, f515; +sub.f32 f719, f718, f717; +mul.f32 f720, f601, f712; +mul.f32 f721, f602, f714; +sub.f32 f722, f720, f721; +mul.f32 
f723, f601, f714; +fma.rn.f32 f724, f602, f712, f723; +mul.f32 f725, f557, f724; +fma.rn.f32 f726, f722, f540, f725; +mul.f32 f727, f540, f724; +mul.f32 f728, f722, f557; +sub.f32 f729, f728, f727; +mul.f32 f730, f601, f722; +mul.f32 f731, f602, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f601, f724; +fma.rn.f32 f734, f602, f722, f733; +mul.f32 f735, f599, f734; +fma.rn.f32 f736, f732, f582, f735; +mul.f32 f737, f582, f734; +mul.f32 f738, f732, f599; +sub.f32 f739, f738, f737; +mul.f32 f740, f601, f732; +mul.f32 f741, f602, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f601, f734; +fma.rn.f32 f744, f602, f732, f743; +mul.f32 f745, f432, f744; +fma.rn.f32 f746, f742, f415, f745; +mul.f32 f747, f415, f744; +mul.f32 f748, f742, f432; +sub.f32 f749, f748, f747; +mul.f32 f750, f601, f742; +mul.f32 f751, f602, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f601, f744; +fma.rn.f32 f754, f602, f742, f753; +mul.f32 f755, f474, f754; +fma.rn.f32 f756, f752, f457, f755; +mul.f32 f757, f457, f754; +mul.f32 f758, f752, f474; +sub.f32 f759, f758, f757; +mul.f32 f760, f601, f752; +mul.f32 f761, f602, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f601, f754; +fma.rn.f32 f764, f602, f752, f763; +mul.f32 f765, f516, f764; +fma.rn.f32 f766, f762, f499, f765; +mul.f32 f767, f499, f764; +mul.f32 f768, f762, f516; +sub.f32 f769, f768, f767; +mul.f32 f770, f601, f762; +mul.f32 f771, f602, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f601, f764; +fma.rn.f32 f774, f602, f762, f773; +mul.f32 f775, f558, f774; +fma.rn.f32 f776, f772, f541, f775; +mul.f32 f777, f541, f774; +mul.f32 f778, f772, f558; +sub.f32 f779, f778, f777; +mul.f32 f780, f601, f772; +mul.f32 f781, f602, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f601, f774; +fma.rn.f32 f784, f602, f772, f783; +mul.f32 f785, f600, f784; +fma.rn.f32 f786, f782, f583, f785; +mul.f32 f787, f583, f784; +mul.f32 f788, f782, f600; +sub.f32 f789, f788, f787; +mul.f32 f790, f601, f782; +mul.f32 f791, f602, f784; +sub.f32 f792, 
f790, f791; +mul.f32 f793, f601, f784; +fma.rn.f32 f794, f602, f782, f793; +mul.f32 f795, f424, f794; +fma.rn.f32 f796, f792, f407, f795; +mul.f32 f797, f407, f794; +mul.f32 f798, f792, f424; +sub.f32 f799, f798, f797; +mul.f32 f800, f601, f792; +mul.f32 f801, f602, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f601, f794; +fma.rn.f32 f804, f602, f792, f803; +mul.f32 f805, f466, f804; +fma.rn.f32 f806, f802, f449, f805; +mul.f32 f807, f449, f804; +mul.f32 f808, f802, f466; +sub.f32 f809, f808, f807; +mul.f32 f810, f601, f802; +mul.f32 f811, f602, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f601, f804; +fma.rn.f32 f814, f602, f802, f813; +mul.f32 f815, f508, f814; +fma.rn.f32 f816, f812, f491, f815; +mul.f32 f817, f491, f814; +mul.f32 f818, f812, f508; +sub.f32 f819, f818, f817; +mul.f32 f820, f601, f812; +mul.f32 f821, f602, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f601, f814; +fma.rn.f32 f824, f602, f812, f823; +mul.f32 f825, f550, f824; +fma.rn.f32 f826, f822, f533, f825; +mul.f32 f827, f533, f824; +mul.f32 f828, f822, f550; +sub.f32 f829, f828, f827; +mul.f32 f830, f601, f822; +mul.f32 f831, f602, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f601, f824; +fma.rn.f32 f834, f602, f822, f833; +mul.f32 f835, f592, f834; +fma.rn.f32 f836, f832, f575, f835; +mul.f32 f837, f575, f834; +mul.f32 f838, f832, f592; +sub.f32 f839, f838, f837; +mad.lo.s32 r8, r5, 12500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f394; +st.shared.f32 [r9+4], f606; +st.shared.f32 [r9+8], f616; +st.shared.f32 [r9+12], f626; +st.shared.f32 [r9+16], f636; +st.shared.f32 [r9+20], f646; +st.shared.f32 [r9+24], f656; +st.shared.f32 [r9+28], f666; +st.shared.f32 [r9+32], f676; +st.shared.f32 [r9+36], f686; +st.shared.f32 [r9+40], f696; +st.shared.f32 [r9+44], f706; +st.shared.f32 [r9+48], f716; +st.shared.f32 [r9+52], f726; +st.shared.f32 [r9+56], f736; +st.shared.f32 [r9+60], f746; +st.shared.f32 [r9+64], f756; +st.shared.f32 [r9+68], f766; +st.shared.f32 
[r9+72], f776; +st.shared.f32 [r9+76], f786; +st.shared.f32 [r9+80], f796; +st.shared.f32 [r9+84], f806; +st.shared.f32 [r9+88], f816; +st.shared.f32 [r9+92], f826; +st.shared.f32 [r9+96], f836; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f840, [r10]; +ld.shared.f32 f841, [r10+500]; +ld.shared.f32 f842, [r10+1000]; +ld.shared.f32 f843, [r10+1500]; +ld.shared.f32 f844, [r10+2000]; +ld.shared.f32 f845, [r10+2500]; +ld.shared.f32 f846, [r10+3000]; +ld.shared.f32 f847, [r10+3500]; +ld.shared.f32 f848, [r10+4000]; +ld.shared.f32 f849, [r10+4500]; +ld.shared.f32 f850, [r10+5000]; +ld.shared.f32 f851, [r10+5500]; +ld.shared.f32 f852, [r10+6000]; +ld.shared.f32 f853, [r10+6500]; +ld.shared.f32 f854, [r10+7000]; +ld.shared.f32 f855, [r10+7500]; +ld.shared.f32 f856, [r10+8000]; +ld.shared.f32 f857, [r10+8500]; +ld.shared.f32 f858, [r10+9000]; +ld.shared.f32 f859, [r10+9500]; +ld.shared.f32 f860, [r10+10000]; +ld.shared.f32 f861, [r10+10500]; +ld.shared.f32 f862, [r10+11000]; +ld.shared.f32 f863, [r10+11500]; +ld.shared.f32 f864, [r10+12000]; +barrier.sync 0; +st.shared.f32 [r9], f398; +st.shared.f32 [r9+4], f609; +st.shared.f32 [r9+8], f619; +st.shared.f32 [r9+12], f629; +st.shared.f32 [r9+16], f639; +st.shared.f32 [r9+20], f649; +st.shared.f32 [r9+24], f659; +st.shared.f32 [r9+28], f669; +st.shared.f32 [r9+32], f679; +st.shared.f32 [r9+36], f689; +st.shared.f32 [r9+40], f699; +st.shared.f32 [r9+44], f709; +st.shared.f32 [r9+48], f719; +st.shared.f32 [r9+52], f729; +st.shared.f32 [r9+56], f739; +st.shared.f32 [r9+60], f749; +st.shared.f32 [r9+64], f759; +st.shared.f32 [r9+68], f769; +st.shared.f32 [r9+72], f779; +st.shared.f32 [r9+76], f789; +st.shared.f32 [r9+80], f799; +st.shared.f32 [r9+84], f809; +st.shared.f32 [r9+88], f819; +st.shared.f32 [r9+92], f829; +st.shared.f32 [r9+96], f839; +barrier.sync 0; +ld.shared.f32 f865, [r10]; +ld.shared.f32 f866, [r10+500]; +ld.shared.f32 f867, [r10+1000]; +ld.shared.f32 f868, [r10+1500]; +ld.shared.f32 f869, 
[r10+2000]; +ld.shared.f32 f870, [r10+2500]; +ld.shared.f32 f871, [r10+3000]; +ld.shared.f32 f872, [r10+3500]; +ld.shared.f32 f873, [r10+4000]; +ld.shared.f32 f874, [r10+4500]; +ld.shared.f32 f875, [r10+5000]; +ld.shared.f32 f876, [r10+5500]; +ld.shared.f32 f877, [r10+6000]; +ld.shared.f32 f878, [r10+6500]; +ld.shared.f32 f879, [r10+7000]; +ld.shared.f32 f880, [r10+7500]; +ld.shared.f32 f881, [r10+8000]; +ld.shared.f32 f882, [r10+8500]; +ld.shared.f32 f883, [r10+9000]; +ld.shared.f32 f884, [r10+9500]; +ld.shared.f32 f885, [r10+10000]; +ld.shared.f32 f886, [r10+10500]; +ld.shared.f32 f887, [r10+11000]; +ld.shared.f32 f888, [r10+11500]; +ld.shared.f32 f889, [r10+12000]; +add.f32 f890, f845, f860; +add.f32 f891, f840, f890; +add.f32 f892, f850, f855; +add.f32 f893, f892, f891; +add.f32 f894, f870, f885; +add.f32 f895, f865, f894; +add.f32 f896, f875, f880; +add.f32 f897, f896, f895; +fma.rn.f32 f898, f890, 0f3E9E377A, f840; +mul.f32 f899, f892, 0f3F4F1BBD; +sub.f32 f900, f898, f899; +sub.f32 f901, f870, f885; +mul.f32 f902, f901, 0f3F737871; +sub.f32 f903, f875, f880; +fma.rn.f32 f904, f903, 0f3F167918, f902; +sub.f32 f905, f900, f904; +add.f32 f906, f904, f900; +mul.f32 f907, f890, 0f3F4F1BBD; +sub.f32 f908, f840, f907; +fma.rn.f32 f909, f892, 0f3E9E377A, f908; +mul.f32 f910, f901, 0f3F167918; +mul.f32 f911, f903, 0f3F737871; +sub.f32 f912, f910, f911; +sub.f32 f913, f909, f912; +add.f32 f914, f912, f909; +fma.rn.f32 f915, f894, 0f3E9E377A, f865; +mul.f32 f916, f896, 0f3F4F1BBD; +sub.f32 f917, f915, f916; +sub.f32 f918, f845, f860; +mul.f32 f919, f918, 0f3F737871; +sub.f32 f920, f850, f855; +fma.rn.f32 f921, f920, 0f3F167918, f919; +add.f32 f922, f921, f917; +sub.f32 f923, f917, f921; +mul.f32 f924, f894, 0f3F4F1BBD; +sub.f32 f925, f865, f924; +fma.rn.f32 f926, f896, 0f3E9E377A, f925; +mul.f32 f927, f918, 0f3F167918; +mul.f32 f928, f920, 0f3F737871; +sub.f32 f929, f927, f928; +add.f32 f930, f929, f926; +sub.f32 f931, f926, f929; +add.f32 f932, f846, f861; +add.f32 
f933, f841, f932; +add.f32 f934, f851, f856; +add.f32 f935, f934, f933; +add.f32 f936, f871, f886; +add.f32 f937, f866, f936; +add.f32 f938, f876, f881; +add.f32 f939, f938, f937; +fma.rn.f32 f940, f932, 0f3E9E377A, f841; +mul.f32 f941, f934, 0f3F4F1BBD; +sub.f32 f942, f940, f941; +sub.f32 f943, f871, f886; +mul.f32 f944, f943, 0f3F737871; +sub.f32 f945, f876, f881; +fma.rn.f32 f946, f945, 0f3F167918, f944; +sub.f32 f947, f942, f946; +add.f32 f948, f946, f942; +mul.f32 f949, f932, 0f3F4F1BBD; +sub.f32 f950, f841, f949; +fma.rn.f32 f951, f934, 0f3E9E377A, f950; +mul.f32 f952, f943, 0f3F167918; +mul.f32 f953, f945, 0f3F737871; +sub.f32 f954, f952, f953; +sub.f32 f955, f951, f954; +add.f32 f956, f954, f951; +fma.rn.f32 f957, f936, 0f3E9E377A, f866; +mul.f32 f958, f938, 0f3F4F1BBD; +sub.f32 f959, f957, f958; +sub.f32 f960, f846, f861; +mul.f32 f961, f960, 0f3F737871; +sub.f32 f962, f851, f856; +fma.rn.f32 f963, f962, 0f3F167918, f961; +add.f32 f964, f963, f959; +sub.f32 f965, f959, f963; +mul.f32 f966, f936, 0f3F4F1BBD; +sub.f32 f967, f866, f966; +fma.rn.f32 f968, f938, 0f3E9E377A, f967; +mul.f32 f969, f960, 0f3F167918; +mul.f32 f970, f962, 0f3F737871; +sub.f32 f971, f969, f970; +add.f32 f972, f971, f968; +sub.f32 f973, f968, f971; +add.f32 f974, f847, f862; +add.f32 f975, f842, f974; +add.f32 f976, f852, f857; +add.f32 f977, f976, f975; +add.f32 f978, f872, f887; +add.f32 f979, f867, f978; +add.f32 f980, f877, f882; +add.f32 f981, f980, f979; +fma.rn.f32 f982, f974, 0f3E9E377A, f842; +mul.f32 f983, f976, 0f3F4F1BBD; +sub.f32 f984, f982, f983; +sub.f32 f985, f872, f887; +mul.f32 f986, f985, 0f3F737871; +sub.f32 f987, f877, f882; +fma.rn.f32 f988, f987, 0f3F167918, f986; +sub.f32 f989, f984, f988; +add.f32 f990, f988, f984; +mul.f32 f991, f974, 0f3F4F1BBD; +sub.f32 f992, f842, f991; +fma.rn.f32 f993, f976, 0f3E9E377A, f992; +mul.f32 f994, f985, 0f3F167918; +mul.f32 f995, f987, 0f3F737871; +sub.f32 f996, f994, f995; +sub.f32 f997, f993, f996; +add.f32 f998, f996, f993; 
+fma.rn.f32 f999, f978, 0f3E9E377A, f867; +mul.f32 f1000, f980, 0f3F4F1BBD; +sub.f32 f1001, f999, f1000; +sub.f32 f1002, f847, f862; +mul.f32 f1003, f1002, 0f3F737871; +sub.f32 f1004, f852, f857; +fma.rn.f32 f1005, f1004, 0f3F167918, f1003; +add.f32 f1006, f1005, f1001; +sub.f32 f1007, f1001, f1005; +mul.f32 f1008, f978, 0f3F4F1BBD; +sub.f32 f1009, f867, f1008; +fma.rn.f32 f1010, f980, 0f3E9E377A, f1009; +mul.f32 f1011, f1002, 0f3F167918; +mul.f32 f1012, f1004, 0f3F737871; +sub.f32 f1013, f1011, f1012; +add.f32 f1014, f1013, f1010; +sub.f32 f1015, f1010, f1013; +add.f32 f1016, f848, f863; +add.f32 f1017, f843, f1016; +add.f32 f1018, f853, f858; +add.f32 f1019, f1018, f1017; +add.f32 f1020, f873, f888; +add.f32 f1021, f868, f1020; +add.f32 f1022, f878, f883; +add.f32 f1023, f1022, f1021; +fma.rn.f32 f1024, f1016, 0f3E9E377A, f843; +mul.f32 f1025, f1018, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f873, f888; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f878, f883; +fma.rn.f32 f1030, f1029, 0f3F167918, f1028; +sub.f32 f1031, f1026, f1030; +add.f32 f1032, f1030, f1026; +mul.f32 f1033, f1016, 0f3F4F1BBD; +sub.f32 f1034, f843, f1033; +fma.rn.f32 f1035, f1018, 0f3E9E377A, f1034; +mul.f32 f1036, f1027, 0f3F167918; +mul.f32 f1037, f1029, 0f3F737871; +sub.f32 f1038, f1036, f1037; +sub.f32 f1039, f1035, f1038; +add.f32 f1040, f1038, f1035; +fma.rn.f32 f1041, f1020, 0f3E9E377A, f868; +mul.f32 f1042, f1022, 0f3F4F1BBD; +sub.f32 f1043, f1041, f1042; +sub.f32 f1044, f848, f863; +mul.f32 f1045, f1044, 0f3F737871; +sub.f32 f1046, f853, f858; +fma.rn.f32 f1047, f1046, 0f3F167918, f1045; +add.f32 f1048, f1047, f1043; +sub.f32 f1049, f1043, f1047; +mul.f32 f1050, f1020, 0f3F4F1BBD; +sub.f32 f1051, f868, f1050; +fma.rn.f32 f1052, f1022, 0f3E9E377A, f1051; +mul.f32 f1053, f1044, 0f3F167918; +mul.f32 f1054, f1046, 0f3F737871; +sub.f32 f1055, f1053, f1054; +add.f32 f1056, f1055, f1052; +sub.f32 f1057, f1052, f1055; +add.f32 f1058, f849, f864; +add.f32 f1059, f844, 
f1058; +add.f32 f1060, f854, f859; +add.f32 f1061, f1060, f1059; +add.f32 f1062, f874, f889; +add.f32 f1063, f869, f1062; +add.f32 f1064, f879, f884; +add.f32 f1065, f1064, f1063; +fma.rn.f32 f1066, f1058, 0f3E9E377A, f844; +mul.f32 f1067, f1060, 0f3F4F1BBD; +sub.f32 f1068, f1066, f1067; +sub.f32 f1069, f874, f889; +mul.f32 f1070, f1069, 0f3F737871; +sub.f32 f1071, f879, f884; +fma.rn.f32 f1072, f1071, 0f3F167918, f1070; +sub.f32 f1073, f1068, f1072; +add.f32 f1074, f1072, f1068; +mul.f32 f1075, f1058, 0f3F4F1BBD; +sub.f32 f1076, f844, f1075; +fma.rn.f32 f1077, f1060, 0f3E9E377A, f1076; +mul.f32 f1078, f1069, 0f3F167918; +mul.f32 f1079, f1071, 0f3F737871; +sub.f32 f1080, f1078, f1079; +sub.f32 f1081, f1077, f1080; +add.f32 f1082, f1080, f1077; +fma.rn.f32 f1083, f1062, 0f3E9E377A, f869; +mul.f32 f1084, f1064, 0f3F4F1BBD; +sub.f32 f1085, f1083, f1084; +sub.f32 f1086, f849, f864; +mul.f32 f1087, f1086, 0f3F737871; +sub.f32 f1088, f854, f859; +fma.rn.f32 f1089, f1088, 0f3F167918, f1087; +add.f32 f1090, f1089, f1085; +sub.f32 f1091, f1085, f1089; +mul.f32 f1092, f1062, 0f3F4F1BBD; +sub.f32 f1093, f869, f1092; +fma.rn.f32 f1094, f1064, 0f3E9E377A, f1093; +mul.f32 f1095, f1086, 0f3F167918; +mul.f32 f1096, f1088, 0f3F737871; +sub.f32 f1097, f1095, f1096; +add.f32 f1098, f1097, f1094; +sub.f32 f1099, f1094, f1097; +mul.f32 f1100, f947, 0f3F77F511; +mul.f32 f1101, f964, 0f3E7EA890; +sub.f32 f1102, f1100, f1101; +mul.f32 f1103, f964, 0f3F77F511; +fma.rn.f32 f1104, f947, 0f3E7EA890, f1103; +mul.f32 f1105, f989, 0f3F6055A2; +mul.f32 f1106, f1006, 0f3EF6A86B; +sub.f32 f1107, f1105, f1106; +mul.f32 f1108, f1006, 0f3F6055A2; +fma.rn.f32 f1109, f989, 0f3EF6A86B, f1108; +mul.f32 f1110, f1031, 0f3F3A9DB0; +mul.f32 f1111, f1048, 0f3F2F3E7B; +sub.f32 f1112, f1110, f1111; +mul.f32 f1113, f1048, 0f3F3A9DB0; +fma.rn.f32 f1114, f1031, 0f3F2F3E7B, f1113; +mul.f32 f1115, f1073, 0f3F092BF2; +mul.f32 f1116, f1090, 0f3F5825E0; +sub.f32 f1117, f1115, f1116; +mul.f32 f1118, f1090, 0f3F092BF2; 
+fma.rn.f32 f1119, f1073, 0f3F5825E0, f1118; +mul.f32 f1120, f955, 0f3F6055A2; +mul.f32 f1121, f972, 0f3EF6A86B; +sub.f32 f1122, f1120, f1121; +mul.f32 f1123, f972, 0f3F6055A2; +fma.rn.f32 f1124, f955, 0f3EF6A86B, f1123; +mul.f32 f1125, f997, 0f3F092BF2; +mul.f32 f1126, f1014, 0f3F5825E0; +sub.f32 f1127, f1125, f1126; +mul.f32 f1128, f1014, 0f3F092BF2; +fma.rn.f32 f1129, f997, 0f3F5825E0, f1128; +mul.f32 f1130, f1039, 0f3D809851; +mul.f32 f1131, f1056, 0f3F7F7EAE; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f1056, 0f3D809851; +fma.rn.f32 f1134, f1039, 0f3F7F7EAE, f1133; +mul.f32 f1135, f1081, 0fBED9FFBE; +mul.f32 f1136, f1098, 0f3F67A2BF; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1098, 0fBED9FFBE; +fma.rn.f32 f1139, f1081, 0f3F67A2BF, f1138; +mul.f32 f1140, f956, 0f3F3A9DB0; +mul.f32 f1141, f973, 0f3F2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f973, 0f3F3A9DB0; +fma.rn.f32 f1144, f956, 0f3F2F3E7B, f1143; +mul.f32 f1145, f998, 0f3D809851; +mul.f32 f1146, f1015, 0f3F7F7EAE; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1015, 0f3D809851; +fma.rn.f32 f1149, f998, 0f3F7F7EAE, f1148; +mul.f32 f1150, f1040, 0fBF232E38; +mul.f32 f1151, f1057, 0f3F45405B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f1057, 0fBF232E38; +fma.rn.f32 f1154, f1040, 0f3F45405B, f1153; +mul.f32 f1155, f1082, 0fBF7DFB3B; +mul.f32 f1156, f1099, 0f3E00575B; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1099, 0fBF7DFB3B; +fma.rn.f32 f1159, f1082, 0f3E00575B, f1158; +mul.f32 f1160, f948, 0f3F092BF2; +mul.f32 f1161, f965, 0f3F5825E0; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f965, 0f3F092BF2; +fma.rn.f32 f1164, f948, 0f3F5825E0, f1163; +mul.f32 f1165, f990, 0fBED9FFBE; +mul.f32 f1166, f1007, 0f3F67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1007, 0fBED9FFBE; +fma.rn.f32 f1169, f990, 0f3F67A2BF, f1168; +mul.f32 f1170, f1032, 0fBF7DFB3B; +mul.f32 f1171, f1049, 0f3E00575B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f1049, 0fBF7DFB3B; +fma.rn.f32 f1174, f1032, 0f3E00575B, 
f1173; +mul.f32 f1175, f1074, 0fBF232E38; +mul.f32 f1176, f1091, 0fBF45405B; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1091, 0fBF232E38; +fma.rn.f32 f1179, f1074, 0fBF45405B, f1178; +add.f32 f1180, f935, f1061; +add.f32 f1181, f893, f1180; +add.f32 f1182, f977, f1019; +add.f32 f1183, f1182, f1181; +add.f32 f1184, f939, f1065; +add.f32 f1185, f897, f1184; +add.f32 f1186, f981, f1023; +add.f32 f1187, f1186, f1185; +fma.rn.f32 f1188, f1180, 0f3E9E377A, f893; +mul.f32 f1189, f1182, 0f3F4F1BBD; +sub.f32 f1190, f1188, f1189; +sub.f32 f1191, f939, f1065; +mul.f32 f1192, f1191, 0f3F737871; +sub.f32 f1193, f981, f1023; +fma.rn.f32 f1194, f1193, 0f3F167918, f1192; +sub.f32 f1195, f1190, f1194; +add.f32 f1196, f1194, f1190; +mul.f32 f1197, f1180, 0f3F4F1BBD; +sub.f32 f1198, f893, f1197; +fma.rn.f32 f1199, f1182, 0f3E9E377A, f1198; +mul.f32 f1200, f1191, 0f3F167918; +mul.f32 f1201, f1193, 0f3F737871; +sub.f32 f1202, f1200, f1201; +sub.f32 f1203, f1199, f1202; +add.f32 f1204, f1202, f1199; +fma.rn.f32 f1205, f1184, 0f3E9E377A, f897; +mul.f32 f1206, f1186, 0f3F4F1BBD; +sub.f32 f1207, f1205, f1206; +sub.f32 f1208, f935, f1061; +mul.f32 f1209, f1208, 0f3F737871; +sub.f32 f1210, f977, f1019; +fma.rn.f32 f1211, f1210, 0f3F167918, f1209; +add.f32 f1212, f1211, f1207; +sub.f32 f1213, f1207, f1211; +mul.f32 f1214, f1184, 0f3F4F1BBD; +sub.f32 f1215, f897, f1214; +fma.rn.f32 f1216, f1186, 0f3E9E377A, f1215; +mul.f32 f1217, f1208, 0f3F167918; +mul.f32 f1218, f1210, 0f3F737871; +sub.f32 f1219, f1217, f1218; +add.f32 f1220, f1219, f1216; +sub.f32 f1221, f1216, f1219; +add.f32 f1222, f1102, f1117; +add.f32 f1223, f905, f1222; +add.f32 f1224, f1107, f1112; +add.f32 f1225, f1224, f1223; +add.f32 f1226, f1104, f1119; +add.f32 f1227, f922, f1226; +add.f32 f1228, f1109, f1114; +add.f32 f1229, f1228, f1227; +fma.rn.f32 f1230, f1222, 0f3E9E377A, f905; +mul.f32 f1231, f1224, 0f3F4F1BBD; +sub.f32 f1232, f1230, f1231; +sub.f32 f1233, f1104, f1119; +mul.f32 f1234, f1233, 0f3F737871; +sub.f32 
f1235, f1109, f1114; +fma.rn.f32 f1236, f1235, 0f3F167918, f1234; +sub.f32 f1237, f1232, f1236; +add.f32 f1238, f1236, f1232; +mul.f32 f1239, f1222, 0f3F4F1BBD; +sub.f32 f1240, f905, f1239; +fma.rn.f32 f1241, f1224, 0f3E9E377A, f1240; +mul.f32 f1242, f1233, 0f3F167918; +mul.f32 f1243, f1235, 0f3F737871; +sub.f32 f1244, f1242, f1243; +sub.f32 f1245, f1241, f1244; +add.f32 f1246, f1244, f1241; +fma.rn.f32 f1247, f1226, 0f3E9E377A, f922; +mul.f32 f1248, f1228, 0f3F4F1BBD; +sub.f32 f1249, f1247, f1248; +sub.f32 f1250, f1102, f1117; +mul.f32 f1251, f1250, 0f3F737871; +sub.f32 f1252, f1107, f1112; +fma.rn.f32 f1253, f1252, 0f3F167918, f1251; +add.f32 f1254, f1253, f1249; +sub.f32 f1255, f1249, f1253; +mul.f32 f1256, f1226, 0f3F4F1BBD; +sub.f32 f1257, f922, f1256; +fma.rn.f32 f1258, f1228, 0f3E9E377A, f1257; +mul.f32 f1259, f1250, 0f3F167918; +mul.f32 f1260, f1252, 0f3F737871; +sub.f32 f1261, f1259, f1260; +add.f32 f1262, f1261, f1258; +sub.f32 f1263, f1258, f1261; +add.f32 f1264, f1122, f1137; +add.f32 f1265, f913, f1264; +add.f32 f1266, f1127, f1132; +add.f32 f1267, f1266, f1265; +add.f32 f1268, f1124, f1139; +add.f32 f1269, f930, f1268; +add.f32 f1270, f1129, f1134; +add.f32 f1271, f1270, f1269; +fma.rn.f32 f1272, f1264, 0f3E9E377A, f913; +mul.f32 f1273, f1266, 0f3F4F1BBD; +sub.f32 f1274, f1272, f1273; +sub.f32 f1275, f1124, f1139; +mul.f32 f1276, f1275, 0f3F737871; +sub.f32 f1277, f1129, f1134; +fma.rn.f32 f1278, f1277, 0f3F167918, f1276; +sub.f32 f1279, f1274, f1278; +add.f32 f1280, f1278, f1274; +mul.f32 f1281, f1264, 0f3F4F1BBD; +sub.f32 f1282, f913, f1281; +fma.rn.f32 f1283, f1266, 0f3E9E377A, f1282; +mul.f32 f1284, f1275, 0f3F167918; +mul.f32 f1285, f1277, 0f3F737871; +sub.f32 f1286, f1284, f1285; +sub.f32 f1287, f1283, f1286; +add.f32 f1288, f1286, f1283; +fma.rn.f32 f1289, f1268, 0f3E9E377A, f930; +mul.f32 f1290, f1270, 0f3F4F1BBD; +sub.f32 f1291, f1289, f1290; +sub.f32 f1292, f1122, f1137; +mul.f32 f1293, f1292, 0f3F737871; +sub.f32 f1294, f1127, f1132; 
+fma.rn.f32 f1295, f1294, 0f3F167918, f1293; +add.f32 f1296, f1295, f1291; +sub.f32 f1297, f1291, f1295; +mul.f32 f1298, f1268, 0f3F4F1BBD; +sub.f32 f1299, f930, f1298; +fma.rn.f32 f1300, f1270, 0f3E9E377A, f1299; +mul.f32 f1301, f1292, 0f3F167918; +mul.f32 f1302, f1294, 0f3F737871; +sub.f32 f1303, f1301, f1302; +add.f32 f1304, f1303, f1300; +sub.f32 f1305, f1300, f1303; +add.f32 f1306, f1142, f1157; +add.f32 f1307, f914, f1306; +add.f32 f1308, f1147, f1152; +add.f32 f1309, f1308, f1307; +add.f32 f1310, f1144, f1159; +add.f32 f1311, f931, f1310; +add.f32 f1312, f1149, f1154; +add.f32 f1313, f1312, f1311; +fma.rn.f32 f1314, f1306, 0f3E9E377A, f914; +mul.f32 f1315, f1308, 0f3F4F1BBD; +sub.f32 f1316, f1314, f1315; +sub.f32 f1317, f1144, f1159; +mul.f32 f1318, f1317, 0f3F737871; +sub.f32 f1319, f1149, f1154; +fma.rn.f32 f1320, f1319, 0f3F167918, f1318; +sub.f32 f1321, f1316, f1320; +add.f32 f1322, f1320, f1316; +mul.f32 f1323, f1306, 0f3F4F1BBD; +sub.f32 f1324, f914, f1323; +fma.rn.f32 f1325, f1308, 0f3E9E377A, f1324; +mul.f32 f1326, f1317, 0f3F167918; +mul.f32 f1327, f1319, 0f3F737871; +sub.f32 f1328, f1326, f1327; +sub.f32 f1329, f1325, f1328; +add.f32 f1330, f1328, f1325; +fma.rn.f32 f1331, f1310, 0f3E9E377A, f931; +mul.f32 f1332, f1312, 0f3F4F1BBD; +sub.f32 f1333, f1331, f1332; +sub.f32 f1334, f1142, f1157; +mul.f32 f1335, f1334, 0f3F737871; +sub.f32 f1336, f1147, f1152; +fma.rn.f32 f1337, f1336, 0f3F167918, f1335; +add.f32 f1338, f1337, f1333; +sub.f32 f1339, f1333, f1337; +mul.f32 f1340, f1310, 0f3F4F1BBD; +sub.f32 f1341, f931, f1340; +fma.rn.f32 f1342, f1312, 0f3E9E377A, f1341; +mul.f32 f1343, f1334, 0f3F167918; +mul.f32 f1344, f1336, 0f3F737871; +sub.f32 f1345, f1343, f1344; +add.f32 f1346, f1345, f1342; +sub.f32 f1347, f1342, f1345; +add.f32 f1348, f1162, f1177; +add.f32 f1349, f906, f1348; +add.f32 f1350, f1167, f1172; +add.f32 f1351, f1350, f1349; +add.f32 f1352, f1164, f1179; +add.f32 f1353, f923, f1352; +add.f32 f1354, f1169, f1174; +add.f32 f1355, f1354, 
f1353; +fma.rn.f32 f1356, f1348, 0f3E9E377A, f906; +mul.f32 f1357, f1350, 0f3F4F1BBD; +sub.f32 f1358, f1356, f1357; +sub.f32 f1359, f1164, f1179; +mul.f32 f1360, f1359, 0f3F737871; +sub.f32 f1361, f1169, f1174; +fma.rn.f32 f1362, f1361, 0f3F167918, f1360; +sub.f32 f1363, f1358, f1362; +add.f32 f1364, f1362, f1358; +mul.f32 f1365, f1348, 0f3F4F1BBD; +sub.f32 f1366, f906, f1365; +fma.rn.f32 f1367, f1350, 0f3E9E377A, f1366; +mul.f32 f1368, f1359, 0f3F167918; +mul.f32 f1369, f1361, 0f3F737871; +sub.f32 f1370, f1368, f1369; +sub.f32 f1371, f1367, f1370; +add.f32 f1372, f1370, f1367; +fma.rn.f32 f1373, f1352, 0f3E9E377A, f923; +mul.f32 f1374, f1354, 0f3F4F1BBD; +sub.f32 f1375, f1373, f1374; +sub.f32 f1376, f1162, f1177; +mul.f32 f1377, f1376, 0f3F737871; +sub.f32 f1378, f1167, f1172; +fma.rn.f32 f1379, f1378, 0f3F167918, f1377; +add.f32 f1380, f1379, f1375; +sub.f32 f1381, f1375, f1379; +mul.f32 f1382, f1352, 0f3F4F1BBD; +sub.f32 f1383, f923, f1382; +fma.rn.f32 f1384, f1354, 0f3E9E377A, f1383; +mul.f32 f1385, f1376, 0f3F167918; +mul.f32 f1386, f1378, 0f3F737871; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1387, f1384; +sub.f32 f1389, f1384, f1387; +mul.wide.u32 rd7, r7, 1374389535; +shr.u64 rd8, rd7, 35; +cvt.u32.u64 r11, rd8; +mul.lo.s32 r12, r11, 25; +sub.s32 r13, r7, r12; +mul.wide.u32 rd9, r11, 8; +mov.u64 rd10, %52; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1390, f1391}, [rd11]; +mul.f32 f1394, f1229, f1391; +fma.rn.f32 f1395, f1390, f1225, f1394; +mul.f32 f1396, f1225, f1391; +mul.f32 f1397, f1390, f1229; +sub.f32 f1398, f1397, f1396; +mul.f32 f1399, f1390, f1390; +mul.f32 f1400, f1391, f1391; +sub.f32 f1401, f1399, f1400; +mul.f32 f1402, f1391, f1390; +fma.rn.f32 f1403, f1391, f1390, f1402; +mul.f32 f1404, f1271, f1403; +fma.rn.f32 f1405, f1401, f1267, f1404; +mul.f32 f1406, f1267, f1403; +mul.f32 f1407, f1401, f1271; +sub.f32 f1408, f1407, f1406; +mul.f32 f1409, f1390, f1401; +mul.f32 f1410, f1391, f1403; +sub.f32 f1411, f1409, f1410; +mul.f32 f1412, 
f1390, f1403; +fma.rn.f32 f1413, f1391, f1401, f1412; +mul.f32 f1414, f1313, f1413; +fma.rn.f32 f1415, f1411, f1309, f1414; +mul.f32 f1416, f1309, f1413; +mul.f32 f1417, f1411, f1313; +sub.f32 f1418, f1417, f1416; +mul.f32 f1419, f1390, f1411; +mul.f32 f1420, f1391, f1413; +sub.f32 f1421, f1419, f1420; +mul.f32 f1422, f1390, f1413; +fma.rn.f32 f1423, f1391, f1411, f1422; +mul.f32 f1424, f1355, f1423; +fma.rn.f32 f1425, f1421, f1351, f1424; +mul.f32 f1426, f1351, f1423; +mul.f32 f1427, f1421, f1355; +sub.f32 f1428, f1427, f1426; +mul.f32 f1429, f1390, f1421; +mul.f32 f1430, f1391, f1423; +sub.f32 f1431, f1429, f1430; +mul.f32 f1432, f1390, f1423; +fma.rn.f32 f1433, f1391, f1421, f1432; +mul.f32 f1434, f1212, f1433; +fma.rn.f32 f1435, f1431, f1195, f1434; +mul.f32 f1436, f1195, f1433; +mul.f32 f1437, f1431, f1212; +sub.f32 f1438, f1437, f1436; +mul.f32 f1439, f1390, f1431; +mul.f32 f1440, f1391, f1433; +sub.f32 f1441, f1439, f1440; +mul.f32 f1442, f1390, f1433; +fma.rn.f32 f1443, f1391, f1431, f1442; +mul.f32 f1444, f1254, f1443; +fma.rn.f32 f1445, f1441, f1237, f1444; +mul.f32 f1446, f1237, f1443; +mul.f32 f1447, f1441, f1254; +sub.f32 f1448, f1447, f1446; +mul.f32 f1449, f1390, f1441; +mul.f32 f1450, f1391, f1443; +sub.f32 f1451, f1449, f1450; +mul.f32 f1452, f1390, f1443; +fma.rn.f32 f1453, f1391, f1441, f1452; +mul.f32 f1454, f1296, f1453; +fma.rn.f32 f1455, f1451, f1279, f1454; +mul.f32 f1456, f1279, f1453; +mul.f32 f1457, f1451, f1296; +sub.f32 f1458, f1457, f1456; +mul.f32 f1459, f1390, f1451; +mul.f32 f1460, f1391, f1453; +sub.f32 f1461, f1459, f1460; +mul.f32 f1462, f1390, f1453; +fma.rn.f32 f1463, f1391, f1451, f1462; +mul.f32 f1464, f1338, f1463; +fma.rn.f32 f1465, f1461, f1321, f1464; +mul.f32 f1466, f1321, f1463; +mul.f32 f1467, f1461, f1338; +sub.f32 f1468, f1467, f1466; +mul.f32 f1469, f1390, f1461; +mul.f32 f1470, f1391, f1463; +sub.f32 f1471, f1469, f1470; +mul.f32 f1472, f1390, f1463; +fma.rn.f32 f1473, f1391, f1461, f1472; +mul.f32 f1474, f1380, 
f1473; +fma.rn.f32 f1475, f1471, f1363, f1474; +mul.f32 f1476, f1363, f1473; +mul.f32 f1477, f1471, f1380; +sub.f32 f1478, f1477, f1476; +mul.f32 f1479, f1390, f1471; +mul.f32 f1480, f1391, f1473; +sub.f32 f1481, f1479, f1480; +mul.f32 f1482, f1390, f1473; +fma.rn.f32 f1483, f1391, f1471, f1482; +mul.f32 f1484, f1220, f1483; +fma.rn.f32 f1485, f1481, f1203, f1484; +mul.f32 f1486, f1203, f1483; +mul.f32 f1487, f1481, f1220; +sub.f32 f1488, f1487, f1486; +mul.f32 f1489, f1390, f1481; +mul.f32 f1490, f1391, f1483; +sub.f32 f1491, f1489, f1490; +mul.f32 f1492, f1390, f1483; +fma.rn.f32 f1493, f1391, f1481, f1492; +mul.f32 f1494, f1262, f1493; +fma.rn.f32 f1495, f1491, f1245, f1494; +mul.f32 f1496, f1245, f1493; +mul.f32 f1497, f1491, f1262; +sub.f32 f1498, f1497, f1496; +mul.f32 f1499, f1390, f1491; +mul.f32 f1500, f1391, f1493; +sub.f32 f1501, f1499, f1500; +mul.f32 f1502, f1390, f1493; +fma.rn.f32 f1503, f1391, f1491, f1502; +mul.f32 f1504, f1304, f1503; +fma.rn.f32 f1505, f1501, f1287, f1504; +mul.f32 f1506, f1287, f1503; +mul.f32 f1507, f1501, f1304; +sub.f32 f1508, f1507, f1506; +mul.f32 f1509, f1390, f1501; +mul.f32 f1510, f1391, f1503; +sub.f32 f1511, f1509, f1510; +mul.f32 f1512, f1390, f1503; +fma.rn.f32 f1513, f1391, f1501, f1512; +mul.f32 f1514, f1346, f1513; +fma.rn.f32 f1515, f1511, f1329, f1514; +mul.f32 f1516, f1329, f1513; +mul.f32 f1517, f1511, f1346; +sub.f32 f1518, f1517, f1516; +mul.f32 f1519, f1390, f1511; +mul.f32 f1520, f1391, f1513; +sub.f32 f1521, f1519, f1520; +mul.f32 f1522, f1390, f1513; +fma.rn.f32 f1523, f1391, f1511, f1522; +mul.f32 f1524, f1388, f1523; +fma.rn.f32 f1525, f1521, f1371, f1524; +mul.f32 f1526, f1371, f1523; +mul.f32 f1527, f1521, f1388; +sub.f32 f1528, f1527, f1526; +mul.f32 f1529, f1390, f1521; +mul.f32 f1530, f1391, f1523; +sub.f32 f1531, f1529, f1530; +mul.f32 f1532, f1390, f1523; +fma.rn.f32 f1533, f1391, f1521, f1532; +mul.f32 f1534, f1221, f1533; +fma.rn.f32 f1535, f1531, f1204, f1534; +mul.f32 f1536, f1204, f1533; 
+mul.f32 f1537, f1531, f1221; +sub.f32 f1538, f1537, f1536; +mul.f32 f1539, f1390, f1531; +mul.f32 f1540, f1391, f1533; +sub.f32 f1541, f1539, f1540; +mul.f32 f1542, f1390, f1533; +fma.rn.f32 f1543, f1391, f1531, f1542; +mul.f32 f1544, f1263, f1543; +fma.rn.f32 f1545, f1541, f1246, f1544; +mul.f32 f1546, f1246, f1543; +mul.f32 f1547, f1541, f1263; +sub.f32 f1548, f1547, f1546; +mul.f32 f1549, f1390, f1541; +mul.f32 f1550, f1391, f1543; +sub.f32 f1551, f1549, f1550; +mul.f32 f1552, f1390, f1543; +fma.rn.f32 f1553, f1391, f1541, f1552; +mul.f32 f1554, f1305, f1553; +fma.rn.f32 f1555, f1551, f1288, f1554; +mul.f32 f1556, f1288, f1553; +mul.f32 f1557, f1551, f1305; +sub.f32 f1558, f1557, f1556; +mul.f32 f1559, f1390, f1551; +mul.f32 f1560, f1391, f1553; +sub.f32 f1561, f1559, f1560; +mul.f32 f1562, f1390, f1553; +fma.rn.f32 f1563, f1391, f1551, f1562; +mul.f32 f1564, f1347, f1563; +fma.rn.f32 f1565, f1561, f1330, f1564; +mul.f32 f1566, f1330, f1563; +mul.f32 f1567, f1561, f1347; +sub.f32 f1568, f1567, f1566; +mul.f32 f1569, f1390, f1561; +mul.f32 f1570, f1391, f1563; +sub.f32 f1571, f1569, f1570; +mul.f32 f1572, f1390, f1563; +fma.rn.f32 f1573, f1391, f1561, f1572; +mul.f32 f1574, f1389, f1573; +fma.rn.f32 f1575, f1571, f1372, f1574; +mul.f32 f1576, f1372, f1573; +mul.f32 f1577, f1571, f1389; +sub.f32 f1578, f1577, f1576; +mul.f32 f1579, f1390, f1571; +mul.f32 f1580, f1391, f1573; +sub.f32 f1581, f1579, f1580; +mul.f32 f1582, f1390, f1573; +fma.rn.f32 f1583, f1391, f1571, f1582; +mul.f32 f1584, f1213, f1583; +fma.rn.f32 f1585, f1581, f1196, f1584; +mul.f32 f1586, f1196, f1583; +mul.f32 f1587, f1581, f1213; +sub.f32 f1588, f1587, f1586; +mul.f32 f1589, f1390, f1581; +mul.f32 f1590, f1391, f1583; +sub.f32 f1591, f1589, f1590; +mul.f32 f1592, f1390, f1583; +fma.rn.f32 f1593, f1391, f1581, f1592; +mul.f32 f1594, f1255, f1593; +fma.rn.f32 f1595, f1591, f1238, f1594; +mul.f32 f1596, f1238, f1593; +mul.f32 f1597, f1591, f1255; +sub.f32 f1598, f1597, f1596; +mul.f32 f1599, 
f1390, f1591; +mul.f32 f1600, f1391, f1593; +sub.f32 f1601, f1599, f1600; +mul.f32 f1602, f1390, f1593; +fma.rn.f32 f1603, f1391, f1591, f1602; +mul.f32 f1604, f1297, f1603; +fma.rn.f32 f1605, f1601, f1280, f1604; +mul.f32 f1606, f1280, f1603; +mul.f32 f1607, f1601, f1297; +sub.f32 f1608, f1607, f1606; +mul.f32 f1609, f1390, f1601; +mul.f32 f1610, f1391, f1603; +sub.f32 f1611, f1609, f1610; +mul.f32 f1612, f1390, f1603; +fma.rn.f32 f1613, f1391, f1601, f1612; +mul.f32 f1614, f1339, f1613; +fma.rn.f32 f1615, f1611, f1322, f1614; +mul.f32 f1616, f1322, f1613; +mul.f32 f1617, f1611, f1339; +sub.f32 f1618, f1617, f1616; +mul.f32 f1619, f1390, f1611; +mul.f32 f1620, f1391, f1613; +sub.f32 f1621, f1619, f1620; +mul.f32 f1622, f1390, f1613; +fma.rn.f32 f1623, f1391, f1611, f1622; +mul.f32 f1624, f1381, f1623; +fma.rn.f32 f1625, f1621, f1364, f1624; +mul.f32 f1626, f1364, f1623; +mul.f32 f1627, f1621, f1381; +sub.f32 f1628, f1627, f1626; +shl.b32 r14, r13, 2; +add.s32 r15, r8, r14; +barrier.sync 0; +mad.lo.s32 r16, r11, 2500, r15; +st.shared.f32 [r16], f1183; +st.shared.f32 [r16+100], f1395; +st.shared.f32 [r16+200], f1405; +st.shared.f32 [r16+300], f1415; +st.shared.f32 [r16+400], f1425; +st.shared.f32 [r16+500], f1435; +st.shared.f32 [r16+600], f1445; +st.shared.f32 [r16+700], f1455; +st.shared.f32 [r16+800], f1465; +st.shared.f32 [r16+900], f1475; +st.shared.f32 [r16+1000], f1485; +st.shared.f32 [r16+1100], f1495; +st.shared.f32 [r16+1200], f1505; +st.shared.f32 [r16+1300], f1515; +st.shared.f32 [r16+1400], f1525; +st.shared.f32 [r16+1500], f1535; +st.shared.f32 [r16+1600], f1545; +st.shared.f32 [r16+1700], f1555; +st.shared.f32 [r16+1800], f1565; +st.shared.f32 [r16+1900], f1575; +st.shared.f32 [r16+2000], f1585; +st.shared.f32 [r16+2100], f1595; +st.shared.f32 [r16+2200], f1605; +st.shared.f32 [r16+2300], f1615; +st.shared.f32 [r16+2400], f1625; +barrier.sync 0; +ld.shared.f32 f1629, [r10]; +ld.shared.f32 f1630, [r10+500]; +ld.shared.f32 f1631, [r10+1000]; 
+ld.shared.f32 f1632, [r10+1500]; +ld.shared.f32 f1633, [r10+2000]; +ld.shared.f32 f1634, [r10+2500]; +ld.shared.f32 f1635, [r10+3000]; +ld.shared.f32 f1636, [r10+3500]; +ld.shared.f32 f1637, [r10+4000]; +ld.shared.f32 f1638, [r10+4500]; +ld.shared.f32 f1639, [r10+5000]; +ld.shared.f32 f1640, [r10+5500]; +ld.shared.f32 f1641, [r10+6000]; +ld.shared.f32 f1642, [r10+6500]; +ld.shared.f32 f1643, [r10+7000]; +ld.shared.f32 f1644, [r10+7500]; +ld.shared.f32 f1645, [r10+8000]; +ld.shared.f32 f1646, [r10+8500]; +ld.shared.f32 f1647, [r10+9000]; +ld.shared.f32 f1648, [r10+9500]; +ld.shared.f32 f1649, [r10+10000]; +ld.shared.f32 f1650, [r10+10500]; +ld.shared.f32 f1651, [r10+11000]; +ld.shared.f32 f1652, [r10+11500]; +ld.shared.f32 f1653, [r10+12000]; +barrier.sync 0; +st.shared.f32 [r16], f1187; +st.shared.f32 [r16+100], f1398; +st.shared.f32 [r16+200], f1408; +st.shared.f32 [r16+300], f1418; +st.shared.f32 [r16+400], f1428; +st.shared.f32 [r16+500], f1438; +st.shared.f32 [r16+600], f1448; +st.shared.f32 [r16+700], f1458; +st.shared.f32 [r16+800], f1468; +st.shared.f32 [r16+900], f1478; +st.shared.f32 [r16+1000], f1488; +st.shared.f32 [r16+1100], f1498; +st.shared.f32 [r16+1200], f1508; +st.shared.f32 [r16+1300], f1518; +st.shared.f32 [r16+1400], f1528; +st.shared.f32 [r16+1500], f1538; +st.shared.f32 [r16+1600], f1548; +st.shared.f32 [r16+1700], f1558; +st.shared.f32 [r16+1800], f1568; +st.shared.f32 [r16+1900], f1578; +st.shared.f32 [r16+2000], f1588; +st.shared.f32 [r16+2100], f1598; +st.shared.f32 [r16+2200], f1608; +st.shared.f32 [r16+2300], f1618; +st.shared.f32 [r16+2400], f1628; +barrier.sync 0; +ld.shared.f32 f1654, [r10]; +ld.shared.f32 f1655, [r10+500]; +ld.shared.f32 f1656, [r10+1000]; +ld.shared.f32 f1657, [r10+1500]; +ld.shared.f32 f1658, [r10+2000]; +ld.shared.f32 f1659, [r10+2500]; +ld.shared.f32 f1660, [r10+3000]; +ld.shared.f32 f1661, [r10+3500]; +ld.shared.f32 f1662, [r10+4000]; +ld.shared.f32 f1663, [r10+4500]; +ld.shared.f32 f1664, [r10+5000]; 
+ld.shared.f32 f1665, [r10+5500]; +ld.shared.f32 f1666, [r10+6000]; +ld.shared.f32 f1667, [r10+6500]; +ld.shared.f32 f1668, [r10+7000]; +ld.shared.f32 f1669, [r10+7500]; +ld.shared.f32 f1670, [r10+8000]; +ld.shared.f32 f1671, [r10+8500]; +ld.shared.f32 f1672, [r10+9000]; +ld.shared.f32 f1673, [r10+9500]; +ld.shared.f32 f1674, [r10+10000]; +ld.shared.f32 f1675, [r10+10500]; +ld.shared.f32 f1676, [r10+11000]; +ld.shared.f32 f1677, [r10+11500]; +ld.shared.f32 f1678, [r10+12000]; +add.f32 f1679, f1634, f1649; +add.f32 f1680, f1629, f1679; +add.f32 f1681, f1639, f1644; +add.f32 f1682, f1659, f1674; +add.f32 f1683, f1654, f1682; +add.f32 f1684, f1664, f1669; +fma.rn.f32 f1685, f1679, 0f3E9E377A, f1629; +mul.f32 f1686, f1681, 0f3F4F1BBD; +sub.f32 f1687, f1685, f1686; +sub.f32 f1688, f1659, f1674; +mul.f32 f1689, f1688, 0f3F737871; +sub.f32 f1690, f1664, f1669; +fma.rn.f32 f1691, f1690, 0f3F167918, f1689; +mul.f32 f1692, f1679, 0f3F4F1BBD; +sub.f32 f1693, f1629, f1692; +fma.rn.f32 f1694, f1681, 0f3E9E377A, f1693; +mul.f32 f1695, f1688, 0f3F167918; +mul.f32 f1696, f1690, 0f3F737871; +sub.f32 f1697, f1695, f1696; +fma.rn.f32 f1698, f1682, 0f3E9E377A, f1654; +mul.f32 f1699, f1684, 0f3F4F1BBD; +sub.f32 f1700, f1698, f1699; +sub.f32 f1701, f1634, f1649; +mul.f32 f1702, f1701, 0f3F737871; +sub.f32 f1703, f1639, f1644; +fma.rn.f32 f1704, f1703, 0f3F167918, f1702; +mul.f32 f1705, f1682, 0f3F4F1BBD; +sub.f32 f1706, f1654, f1705; +fma.rn.f32 f1707, f1684, 0f3E9E377A, f1706; +mul.f32 f1708, f1701, 0f3F167918; +mul.f32 f1709, f1703, 0f3F737871; +sub.f32 f1710, f1708, f1709; +add.f32 f1711, f1635, f1650; +add.f32 f1712, f1630, f1711; +add.f32 f1713, f1640, f1645; +add.f32 f1714, f1660, f1675; +add.f32 f1715, f1655, f1714; +add.f32 f1716, f1665, f1670; +fma.rn.f32 f1717, f1711, 0f3E9E377A, f1630; +mul.f32 f1718, f1713, 0f3F4F1BBD; +sub.f32 f1719, f1717, f1718; +sub.f32 f1720, f1660, f1675; +mul.f32 f1721, f1720, 0f3F737871; +sub.f32 f1722, f1665, f1670; +fma.rn.f32 f1723, f1722, 
0f3F167918, f1721; +mul.f32 f1724, f1711, 0f3F4F1BBD; +sub.f32 f1725, f1630, f1724; +fma.rn.f32 f1726, f1713, 0f3E9E377A, f1725; +mul.f32 f1727, f1720, 0f3F167918; +mul.f32 f1728, f1722, 0f3F737871; +sub.f32 f1729, f1727, f1728; +fma.rn.f32 f1730, f1714, 0f3E9E377A, f1655; +mul.f32 f1731, f1716, 0f3F4F1BBD; +sub.f32 f1732, f1730, f1731; +sub.f32 f1733, f1635, f1650; +mul.f32 f1734, f1733, 0f3F737871; +sub.f32 f1735, f1640, f1645; +fma.rn.f32 f1736, f1735, 0f3F167918, f1734; +mul.f32 f1737, f1714, 0f3F4F1BBD; +sub.f32 f1738, f1655, f1737; +fma.rn.f32 f1739, f1716, 0f3E9E377A, f1738; +mul.f32 f1740, f1733, 0f3F167918; +mul.f32 f1741, f1735, 0f3F737871; +sub.f32 f1742, f1740, f1741; +add.f32 f1743, f1636, f1651; +add.f32 f1744, f1631, f1743; +add.f32 f1745, f1641, f1646; +add.f32 f1746, f1661, f1676; +add.f32 f1747, f1656, f1746; +add.f32 f1748, f1666, f1671; +fma.rn.f32 f1749, f1743, 0f3E9E377A, f1631; +mul.f32 f1750, f1745, 0f3F4F1BBD; +sub.f32 f1751, f1749, f1750; +sub.f32 f1752, f1661, f1676; +mul.f32 f1753, f1752, 0f3F737871; +sub.f32 f1754, f1666, f1671; +fma.rn.f32 f1755, f1754, 0f3F167918, f1753; +mul.f32 f1756, f1743, 0f3F4F1BBD; +sub.f32 f1757, f1631, f1756; +fma.rn.f32 f1758, f1745, 0f3E9E377A, f1757; +mul.f32 f1759, f1752, 0f3F167918; +mul.f32 f1760, f1754, 0f3F737871; +sub.f32 f1761, f1759, f1760; +fma.rn.f32 f1762, f1746, 0f3E9E377A, f1656; +mul.f32 f1763, f1748, 0f3F4F1BBD; +sub.f32 f1764, f1762, f1763; +sub.f32 f1765, f1636, f1651; +mul.f32 f1766, f1765, 0f3F737871; +sub.f32 f1767, f1641, f1646; +fma.rn.f32 f1768, f1767, 0f3F167918, f1766; +mul.f32 f1769, f1746, 0f3F4F1BBD; +sub.f32 f1770, f1656, f1769; +fma.rn.f32 f1771, f1748, 0f3E9E377A, f1770; +mul.f32 f1772, f1765, 0f3F167918; +mul.f32 f1773, f1767, 0f3F737871; +sub.f32 f1774, f1772, f1773; +add.f32 f1775, f1637, f1652; +add.f32 f1776, f1632, f1775; +add.f32 f1777, f1642, f1647; +add.f32 f1778, f1662, f1677; +add.f32 f1779, f1657, f1778; +add.f32 f1780, f1667, f1672; +fma.rn.f32 f1781, f1775, 
0f3E9E377A, f1632; +mul.f32 f1782, f1777, 0f3F4F1BBD; +sub.f32 f1783, f1781, f1782; +sub.f32 f1784, f1662, f1677; +mul.f32 f1785, f1784, 0f3F737871; +sub.f32 f1786, f1667, f1672; +fma.rn.f32 f1787, f1786, 0f3F167918, f1785; +mul.f32 f1788, f1775, 0f3F4F1BBD; +sub.f32 f1789, f1632, f1788; +fma.rn.f32 f1790, f1777, 0f3E9E377A, f1789; +mul.f32 f1791, f1784, 0f3F167918; +mul.f32 f1792, f1786, 0f3F737871; +sub.f32 f1793, f1791, f1792; +fma.rn.f32 f1794, f1778, 0f3E9E377A, f1657; +mul.f32 f1795, f1780, 0f3F4F1BBD; +sub.f32 f1796, f1794, f1795; +sub.f32 f1797, f1637, f1652; +mul.f32 f1798, f1797, 0f3F737871; +sub.f32 f1799, f1642, f1647; +fma.rn.f32 f1800, f1799, 0f3F167918, f1798; +mul.f32 f1801, f1778, 0f3F4F1BBD; +sub.f32 f1802, f1657, f1801; +fma.rn.f32 f1803, f1780, 0f3E9E377A, f1802; +mul.f32 f1804, f1797, 0f3F167918; +mul.f32 f1805, f1799, 0f3F737871; +sub.f32 f1806, f1804, f1805; +add.f32 f1807, f1638, f1653; +add.f32 f1808, f1633, f1807; +add.f32 f1809, f1643, f1648; +add.f32 f1810, f1663, f1678; +add.f32 f1811, f1658, f1810; +add.f32 f1812, f1668, f1673; +fma.rn.f32 f1813, f1807, 0f3E9E377A, f1633; +mul.f32 f1814, f1809, 0f3F4F1BBD; +sub.f32 f1815, f1813, f1814; +sub.f32 f1816, f1663, f1678; +mul.f32 f1817, f1816, 0f3F737871; +sub.f32 f1818, f1668, f1673; +fma.rn.f32 f1819, f1818, 0f3F167918, f1817; +mul.f32 f1820, f1807, 0f3F4F1BBD; +sub.f32 f1821, f1633, f1820; +fma.rn.f32 f1822, f1809, 0f3E9E377A, f1821; +mul.f32 f1823, f1816, 0f3F167918; +mul.f32 f1824, f1818, 0f3F737871; +sub.f32 f1825, f1823, f1824; +fma.rn.f32 f1826, f1810, 0f3E9E377A, f1658; +mul.f32 f1827, f1812, 0f3F4F1BBD; +sub.f32 f1828, f1826, f1827; +sub.f32 f1829, f1638, f1653; +mul.f32 f1830, f1829, 0f3F737871; +sub.f32 f1831, f1643, f1648; +fma.rn.f32 f1832, f1831, 0f3F167918, f1830; +mul.f32 f1833, f1810, 0f3F4F1BBD; +sub.f32 f1834, f1658, f1833; +fma.rn.f32 f1835, f1812, 0f3E9E377A, f1834; +mul.f32 f1836, f1829, 0f3F167918; +mul.f32 f1837, f1831, 0f3F737871; +sub.f32 f1838, f1836, f1837; 
+add.f32 %0, f1681, f1680; +add.f32 %1, f1684, f1683; +add.f32 %2, f1713, f1712; +add.f32 %3, f1716, f1715; +add.f32 %4, f1745, f1744; +add.f32 %5, f1748, f1747; +add.f32 %6, f1777, f1776; +add.f32 %7, f1780, f1779; +add.f32 %8, f1809, f1808; +add.f32 %9, f1812, f1811; +add.f32 %11, f1704, f1700; +sub.f32 %10, f1687, f1691; +add.f32 %13, f1736, f1732; +sub.f32 %12, f1719, f1723; +add.f32 %15, f1768, f1764; +sub.f32 %14, f1751, f1755; +add.f32 %17, f1800, f1796; +sub.f32 %16, f1783, f1787; +add.f32 %19, f1832, f1828; +sub.f32 %18, f1815, f1819; +sub.f32 %20, f1694, f1697; +add.f32 %21, f1710, f1707; +sub.f32 %22, f1726, f1729; +add.f32 %23, f1742, f1739; +sub.f32 %24, f1758, f1761; +add.f32 %25, f1774, f1771; +sub.f32 %26, f1790, f1793; +add.f32 %27, f1806, f1803; +sub.f32 %28, f1822, f1825; +add.f32 %29, f1838, f1835; +add.f32 %30, f1697, f1694; +sub.f32 %31, f1707, f1710; +add.f32 %32, f1729, f1726; +sub.f32 %33, f1739, f1742; +add.f32 %34, f1761, f1758; +sub.f32 %35, f1771, f1774; +add.f32 %36, f1793, f1790; +sub.f32 %37, f1803, f1806; +add.f32 %38, f1825, f1822; +sub.f32 %39, f1835, f1838; +sub.f32 %41, f1700, f1704; +add.f32 %40, f1691, f1687; +sub.f32 %43, f1732, f1736; +add.f32 %42, f1723, f1719; +sub.f32 %45, f1764, f1768; +add.f32 %44, f1755, f1751; +sub.f32 %47, f1796, f1800; +add.f32 %46, f1787, f1783; +sub.f32 %49, f1828, f1832; +add.f32 %48, f1819, f1815; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), 
"=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_3125), "l"(lut_sp_25_125), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<370, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<467>; +.reg .b32 r<30>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 25000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %17, %25; +add.f32 f22, %15, f21; +add.f32 f23, %20, %23; +add.f32 f24, %19, %26; +add.f32 f25, %16, f24; +add.f32 f26, %22, %24; +fma.rn.f32 f27, f21, 0f3E9E377A, %15; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 
f30, %19, %26; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %22, %24; +fma.rn.f32 f33, f32, 0f3F167918, f31; +sub.f32 f34, f29, f33; +add.f32 f35, f33, f29; +mul.f32 f36, f21, 0f3F4F1BBD; +sub.f32 f37, %15, f36; +fma.rn.f32 f38, f23, 0f3E9E377A, f37; +mul.f32 f39, f30, 0f3F167918; +mul.f32 f40, f32, 0f3F737871; +sub.f32 f41, f39, f40; +sub.f32 f42, f38, f41; +add.f32 f43, f41, f38; +fma.rn.f32 f44, f24, 0f3E9E377A, %16; +mul.f32 f45, f26, 0f3F4F1BBD; +sub.f32 f46, f44, f45; +sub.f32 f47, %17, %25; +mul.f32 f48, f47, 0f3F737871; +sub.f32 f49, %20, %23; +fma.rn.f32 f50, f49, 0f3F167918, f48; +add.f32 f51, f50, f46; +sub.f32 f52, f46, f50; +mul.f32 f53, f24, 0f3F4F1BBD; +sub.f32 f54, %16, f53; +fma.rn.f32 f55, f26, 0f3E9E377A, f54; +mul.f32 f56, f47, 0f3F167918; +mul.f32 f57, f49, 0f3F737871; +sub.f32 f58, f56, f57; +add.f32 f59, f58, f55; +sub.f32 f60, f55, f58; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 25000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f61, f62}, [rd6]; +mul.f32 f65, f51, f62; +mul.f32 f66, f34, f62; +mul.f32 f67, f61, f51; +mul.f32 f68, f61, f61; +mul.f32 f69, f62, f62; +sub.f32 f70, f68, f69; +mul.f32 f71, f62, f61; +fma.rn.f32 f72, f62, f61, f71; +mul.f32 f73, f59, f72; +mul.f32 f74, f42, f72; +mul.f32 f75, f70, f59; +mul.f32 f76, f61, f70; +mul.f32 f77, f62, f72; +sub.f32 f78, f76, f77; +mul.f32 f79, f61, f72; +fma.rn.f32 f80, f62, f70, f79; +mul.f32 f81, f60, f80; +mul.f32 f82, f43, f80; +mul.f32 f83, f78, f60; +mul.f32 f84, f61, f78; +mul.f32 f85, f62, f80; +sub.f32 f86, f84, f85; +mul.f32 f87, f61, f80; +fma.rn.f32 f88, f62, f78, f87; +mul.f32 f89, f52, f88; +mul.f32 f90, f35, f88; +mul.f32 f91, f86, f52; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f92, f26, f25; +add.f32 f93, f23, f22; +st.shared.v2.f32 [r9], {f93, f92}; +fma.rn.f32 f94, f61, f34, f65; +sub.f32 f95, f67, f66; 
+st.shared.v2.f32 [r9+8], {f94, f95}; +fma.rn.f32 f96, f70, f42, f73; +sub.f32 f97, f75, f74; +st.shared.v2.f32 [r9+16], {f96, f97}; +sub.f32 f98, f83, f82; +fma.rn.f32 f99, f78, f43, f81; +st.shared.v2.f32 [r9+24], {f99, f98}; +fma.rn.f32 f100, f86, f35, f89; +sub.f32 f101, f91, f90; +st.shared.v2.f32 [r9+32], {f100, f101}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f102, f103}, [r11]; +ld.shared.v2.f32 {f106, f107}, [r11+5000]; +ld.shared.v2.f32 {f110, f111}, [r11+10000]; +ld.shared.v2.f32 {f114, f115}, [r11+15000]; +ld.shared.v2.f32 {f118, f119}, [r11+20000]; +add.f32 f122, f106, f118; +add.f32 f123, f102, f122; +add.f32 f124, f110, f114; +add.f32 f125, f107, f119; +add.f32 f126, f103, f125; +add.f32 f127, f111, f115; +fma.rn.f32 f128, f122, 0f3E9E377A, f102; +mul.f32 f129, f124, 0f3F4F1BBD; +sub.f32 f130, f128, f129; +sub.f32 f131, f107, f119; +mul.f32 f132, f131, 0f3F737871; +sub.f32 f133, f111, f115; +fma.rn.f32 f134, f133, 0f3F167918, f132; +sub.f32 f135, f130, f134; +add.f32 f136, f134, f130; +mul.f32 f137, f122, 0f3F4F1BBD; +sub.f32 f138, f102, f137; +fma.rn.f32 f139, f124, 0f3E9E377A, f138; +mul.f32 f140, f131, 0f3F167918; +mul.f32 f141, f133, 0f3F737871; +sub.f32 f142, f140, f141; +sub.f32 f143, f139, f142; +add.f32 f144, f142, f139; +fma.rn.f32 f145, f125, 0f3E9E377A, f103; +mul.f32 f146, f127, 0f3F4F1BBD; +sub.f32 f147, f145, f146; +sub.f32 f148, f106, f118; +mul.f32 f149, f148, 0f3F737871; +sub.f32 f150, f110, f114; +fma.rn.f32 f151, f150, 0f3F167918, f149; +add.f32 f152, f151, f147; +sub.f32 f153, f147, f151; +mul.f32 f154, f125, 0f3F4F1BBD; +sub.f32 f155, f103, f154; +fma.rn.f32 f156, f127, 0f3E9E377A, f155; +mul.f32 f157, f148, 0f3F167918; +mul.f32 f158, f150, 0f3F737871; +sub.f32 f159, f157, f158; +add.f32 f160, f159, f156; +sub.f32 f161, f156, f159; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; 
+mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f162, f163}, [rd11]; +mul.f32 f166, f152, f163; +mul.f32 f167, f135, f163; +mul.f32 f168, f162, f152; +mul.f32 f169, f162, f162; +mul.f32 f170, f163, f163; +sub.f32 f171, f169, f170; +mul.f32 f172, f163, f162; +fma.rn.f32 f173, f163, f162, f172; +mul.f32 f174, f160, f173; +mul.f32 f175, f143, f173; +mul.f32 f176, f171, f160; +mul.f32 f177, f162, f171; +mul.f32 f178, f163, f173; +sub.f32 f179, f177, f178; +mul.f32 f180, f162, f173; +fma.rn.f32 f181, f163, f171, f180; +mul.f32 f182, f161, f181; +mul.f32 f183, f144, f181; +mul.f32 f184, f179, f161; +mul.f32 f185, f162, f179; +mul.f32 f186, f163, f181; +sub.f32 f187, f185, f186; +mul.f32 f188, f162, f181; +fma.rn.f32 f189, f163, f179, f188; +mul.f32 f190, f153, f189; +mul.f32 f191, f136, f189; +mul.f32 f192, f187, f153; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 f193, f127, f126; +add.f32 f194, f124, f123; +st.shared.v2.f32 [r17], {f194, f193}; +fma.rn.f32 f195, f162, f135, f166; +sub.f32 f196, f168, f167; +st.shared.v2.f32 [r17+40], {f195, f196}; +fma.rn.f32 f197, f171, f143, f174; +sub.f32 f198, f176, f175; +st.shared.v2.f32 [r17+80], {f197, f198}; +fma.rn.f32 f199, f179, f144, f182; +sub.f32 f200, f184, f183; +st.shared.v2.f32 [r17+120], {f199, f200}; +fma.rn.f32 f201, f187, f136, f190; +sub.f32 f202, f192, f191; +st.shared.v2.f32 [r17+160], {f201, f202}; +barrier.sync 0; +ld.shared.v2.f32 {f203, f204}, [r11]; +ld.shared.v2.f32 {f207, f208}, [r11+5000]; +ld.shared.v2.f32 {f211, f212}, [r11+10000]; +ld.shared.v2.f32 {f215, f216}, [r11+15000]; +ld.shared.v2.f32 {f219, f220}, [r11+20000]; +add.f32 f223, f207, f219; +add.f32 f224, f203, f223; +add.f32 f225, f211, f215; +add.f32 f226, f208, f220; +add.f32 f227, f204, f226; +add.f32 f228, f212, f216; +fma.rn.f32 f229, f223, 0f3E9E377A, f203; +mul.f32 f230, f225, 0f3F4F1BBD; +sub.f32 f231, f229, f230; +sub.f32 f232, f208, f220; +mul.f32 f233, f232, 
0f3F737871; +sub.f32 f234, f212, f216; +fma.rn.f32 f235, f234, 0f3F167918, f233; +sub.f32 f236, f231, f235; +add.f32 f237, f235, f231; +mul.f32 f238, f223, 0f3F4F1BBD; +sub.f32 f239, f203, f238; +fma.rn.f32 f240, f225, 0f3E9E377A, f239; +mul.f32 f241, f232, 0f3F167918; +mul.f32 f242, f234, 0f3F737871; +sub.f32 f243, f241, f242; +sub.f32 f244, f240, f243; +add.f32 f245, f243, f240; +fma.rn.f32 f246, f226, 0f3E9E377A, f204; +mul.f32 f247, f228, 0f3F4F1BBD; +sub.f32 f248, f246, f247; +sub.f32 f249, f207, f219; +mul.f32 f250, f249, 0f3F737871; +sub.f32 f251, f211, f215; +fma.rn.f32 f252, f251, 0f3F167918, f250; +add.f32 f253, f252, f248; +sub.f32 f254, f248, f252; +mul.f32 f255, f226, 0f3F4F1BBD; +sub.f32 f256, f204, f255; +fma.rn.f32 f257, f228, 0f3E9E377A, f256; +mul.f32 f258, f249, 0f3F167918; +mul.f32 f259, f251, 0f3F737871; +sub.f32 f260, f258, f259; +add.f32 f261, f260, f257; +sub.f32 f262, f257, f260; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f263, f264}, [rd16]; +mul.f32 f267, f253, f264; +mul.f32 f268, f236, f264; +mul.f32 f269, f263, f253; +mul.f32 f270, f263, f263; +mul.f32 f271, f264, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f264, f263; +fma.rn.f32 f274, f264, f263, f273; +mul.f32 f275, f261, f274; +mul.f32 f276, f244, f274; +mul.f32 f277, f272, f261; +mul.f32 f278, f263, f272; +mul.f32 f279, f264, f274; +sub.f32 f280, f278, f279; +mul.f32 f281, f263, f274; +fma.rn.f32 f282, f264, f272, f281; +mul.f32 f283, f262, f282; +mul.f32 f284, f245, f282; +mul.f32 f285, f280, f262; +mul.f32 f286, f263, f280; +mul.f32 f287, f264, f282; +sub.f32 f288, f286, f287; +mul.f32 f289, f263, f282; +fma.rn.f32 f290, f264, f280, f289; +mul.f32 f291, f254, f290; +mul.f32 f292, f237, f290; +mul.f32 f293, f288, f254; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 
r23, r18, 1000, r22; +add.f32 f294, f228, f227; +add.f32 f295, f225, f224; +st.shared.v2.f32 [r23], {f295, f294}; +fma.rn.f32 f296, f263, f236, f267; +sub.f32 f297, f269, f268; +st.shared.v2.f32 [r23+200], {f296, f297}; +fma.rn.f32 f298, f272, f244, f275; +sub.f32 f299, f277, f276; +st.shared.v2.f32 [r23+400], {f298, f299}; +fma.rn.f32 f300, f280, f245, f283; +sub.f32 f301, f285, f284; +st.shared.v2.f32 [r23+600], {f300, f301}; +fma.rn.f32 f302, f288, f237, f291; +sub.f32 f303, f293, f292; +st.shared.v2.f32 [r23+800], {f302, f303}; +barrier.sync 0; +ld.shared.v2.f32 {f304, f305}, [r11]; +ld.shared.v2.f32 {f308, f309}, [r11+5000]; +ld.shared.v2.f32 {f312, f313}, [r11+10000]; +ld.shared.v2.f32 {f316, f317}, [r11+15000]; +ld.shared.v2.f32 {f320, f321}, [r11+20000]; +add.f32 f324, f308, f320; +add.f32 f325, f304, f324; +add.f32 f326, f312, f316; +add.f32 f327, f309, f321; +add.f32 f328, f305, f327; +add.f32 f329, f313, f317; +fma.rn.f32 f330, f324, 0f3E9E377A, f304; +mul.f32 f331, f326, 0f3F4F1BBD; +sub.f32 f332, f330, f331; +sub.f32 f333, f309, f321; +mul.f32 f334, f333, 0f3F737871; +sub.f32 f335, f313, f317; +fma.rn.f32 f336, f335, 0f3F167918, f334; +sub.f32 f337, f332, f336; +add.f32 f338, f336, f332; +mul.f32 f339, f324, 0f3F4F1BBD; +sub.f32 f340, f304, f339; +fma.rn.f32 f341, f326, 0f3E9E377A, f340; +mul.f32 f342, f333, 0f3F167918; +mul.f32 f343, f335, 0f3F737871; +sub.f32 f344, f342, f343; +sub.f32 f345, f341, f344; +add.f32 f346, f344, f341; +fma.rn.f32 f347, f327, 0f3E9E377A, f305; +mul.f32 f348, f329, 0f3F4F1BBD; +sub.f32 f349, f347, f348; +sub.f32 f350, f308, f320; +mul.f32 f351, f350, 0f3F737871; +sub.f32 f352, f312, f316; +fma.rn.f32 f353, f352, 0f3F167918, f351; +add.f32 f354, f353, f349; +sub.f32 f355, f349, f353; +mul.f32 f356, f327, 0f3F4F1BBD; +sub.f32 f357, f305, f356; +fma.rn.f32 f358, f329, 0f3E9E377A, f357; +mul.f32 f359, f350, 0f3F167918; +mul.f32 f360, f352, 0f3F737871; +sub.f32 f361, f359, f360; +add.f32 f362, f361, f358; +sub.f32 f363, f358, 
f361; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 8; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f364, f365}, [rd21]; +mul.f32 f368, f354, f365; +mul.f32 f369, f337, f365; +mul.f32 f370, f364, f354; +mul.f32 f371, f364, f364; +mul.f32 f372, f365, f365; +sub.f32 f373, f371, f372; +mul.f32 f374, f365, f364; +fma.rn.f32 f375, f365, f364, f374; +mul.f32 f376, f362, f375; +mul.f32 f377, f345, f375; +mul.f32 f378, f373, f362; +mul.f32 f379, f364, f373; +mul.f32 f380, f365, f375; +sub.f32 f381, f379, f380; +mul.f32 f382, f364, f375; +fma.rn.f32 f383, f365, f373, f382; +mul.f32 f384, f363, f383; +mul.f32 f385, f346, f383; +mul.f32 f386, f381, f363; +mul.f32 f387, f364, f381; +mul.f32 f388, f365, f383; +sub.f32 f389, f387, f388; +mul.f32 f390, f364, f383; +fma.rn.f32 f391, f365, f381, f390; +mul.f32 f392, f355, f391; +mul.f32 f393, f338, f391; +mul.f32 f394, f389, f355; +shl.b32 r27, r26, 3; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 5000, r28; +add.f32 f395, f329, f328; +add.f32 f396, f326, f325; +st.shared.v2.f32 [r29], {f396, f395}; +fma.rn.f32 f397, f364, f337, f368; +sub.f32 f398, f370, f369; +st.shared.v2.f32 [r29+1000], {f397, f398}; +fma.rn.f32 f399, f373, f345, f376; +sub.f32 f400, f378, f377; +st.shared.v2.f32 [r29+2000], {f399, f400}; +fma.rn.f32 f401, f381, f346, f384; +sub.f32 f402, f386, f385; +st.shared.v2.f32 [r29+3000], {f401, f402}; +fma.rn.f32 f403, f389, f338, f392; +sub.f32 f404, f394, f393; +st.shared.v2.f32 [r29+4000], {f403, f404}; +barrier.sync 0; +ld.shared.v2.f32 {f405, f406}, [r11]; +ld.shared.v2.f32 {f409, f410}, [r11+5000]; +ld.shared.v2.f32 {f413, f414}, [r11+10000]; +ld.shared.v2.f32 {f417, f418}, [r11+15000]; +ld.shared.v2.f32 {f421, f422}, [r11+20000]; +add.f32 f425, f409, f421; +add.f32 f426, f405, f425; +add.f32 f427, f413, f417; +add.f32 f428, f410, f422; +add.f32 f429, f406, f428; 
+add.f32 f430, f414, f418; +fma.rn.f32 f431, f425, 0f3E9E377A, f405; +mul.f32 f432, f427, 0f3F4F1BBD; +sub.f32 f433, f431, f432; +sub.f32 f434, f410, f422; +mul.f32 f435, f434, 0f3F737871; +sub.f32 f436, f414, f418; +fma.rn.f32 f437, f436, 0f3F167918, f435; +mul.f32 f438, f425, 0f3F4F1BBD; +sub.f32 f439, f405, f438; +fma.rn.f32 f440, f427, 0f3E9E377A, f439; +mul.f32 f441, f434, 0f3F167918; +mul.f32 f442, f436, 0f3F737871; +sub.f32 f443, f441, f442; +fma.rn.f32 f444, f428, 0f3E9E377A, f406; +mul.f32 f445, f430, 0f3F4F1BBD; +sub.f32 f446, f444, f445; +sub.f32 f447, f409, f421; +mul.f32 f448, f447, 0f3F737871; +sub.f32 f449, f413, f417; +fma.rn.f32 f450, f449, 0f3F167918, f448; +mul.f32 f451, f428, 0f3F4F1BBD; +sub.f32 f452, f406, f451; +fma.rn.f32 f453, f430, 0f3E9E377A, f452; +mul.f32 f454, f447, 0f3F167918; +mul.f32 f455, f449, 0f3F737871; +sub.f32 f456, f454, f455; +add.f32 %1, f430, f429; +add.f32 %0, f427, f426; +add.f32 %3, f450, f446; +sub.f32 %2, f433, f437; +add.f32 %5, f456, f453; +sub.f32 %4, f440, f443; +sub.f32 %7, f453, f456; +add.f32 %6, f443, f440; +sub.f32 %9, f446, f450; +add.f32 %8, f437, f433; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_3125), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<371, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<427>; +.reg .b32 r<30>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 12500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %17, %25; +add.f32 f22, %15, f21; +add.f32 f23, %20, %23; +add.f32 f24, f23, f22; 
+add.f32 f25, %19, %26; +add.f32 f26, %16, f25; +add.f32 f27, %22, %24; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %15; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %19, %26; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %22, %24; +fma.rn.f32 f35, f34, 0f3F167918, f33; +sub.f32 f36, f31, f35; +add.f32 f37, f35, f31; +mul.f32 f38, f21, 0f3F4F1BBD; +sub.f32 f39, %15, f38; +fma.rn.f32 f40, f23, 0f3E9E377A, f39; +mul.f32 f41, f32, 0f3F167918; +mul.f32 f42, f34, 0f3F737871; +sub.f32 f43, f41, f42; +sub.f32 f44, f40, f43; +add.f32 f45, f43, f40; +fma.rn.f32 f46, f25, 0f3E9E377A, %16; +mul.f32 f47, f27, 0f3F4F1BBD; +sub.f32 f48, f46, f47; +sub.f32 f49, %17, %25; +mul.f32 f50, f49, 0f3F737871; +sub.f32 f51, %20, %23; +fma.rn.f32 f52, f51, 0f3F167918, f50; +add.f32 f53, f52, f48; +sub.f32 f54, f48, f52; +mul.f32 f55, f25, 0f3F4F1BBD; +sub.f32 f56, %16, f55; +fma.rn.f32 f57, f27, 0f3E9E377A, f56; +mul.f32 f58, f49, 0f3F167918; +mul.f32 f59, f51, 0f3F737871; +sub.f32 f60, f58, f59; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f53, f64; +fma.rn.f32 f68, f63, f36, f67; +mul.f32 f69, f36, f64; +mul.f32 f70, f63, f53; +sub.f32 f71, f70, f69; +mul.f32 f72, f63, f63; +mul.f32 f73, f64, f64; +sub.f32 f74, f72, f73; +mul.f32 f75, f64, f63; +fma.rn.f32 f76, f64, f63, f75; +mul.f32 f77, f61, f76; +fma.rn.f32 f78, f74, f44, f77; +mul.f32 f79, f44, f76; +mul.f32 f80, f74, f61; +sub.f32 f81, f80, f79; +mul.f32 f82, f63, f74; +mul.f32 f83, f64, f76; +sub.f32 f84, f82, f83; +mul.f32 f85, f63, f76; +fma.rn.f32 f86, f64, f74, f85; +mul.f32 f87, f62, f86; +fma.rn.f32 f88, f84, f45, f87; +mul.f32 f89, f45, f86; +mul.f32 f90, f84, f62; +sub.f32 f91, f90, f89; +mul.f32 f92, f63, f84; +mul.f32 f93, f64, f86; 
+sub.f32 f94, f92, f93; +mul.f32 f95, f63, f86; +fma.rn.f32 f96, f64, f84, f95; +mul.f32 f97, f54, f96; +fma.rn.f32 f98, f94, f37, f97; +mul.f32 f99, f37, f96; +mul.f32 f100, f94, f54; +sub.f32 f101, f100, f99; +mad.lo.s32 r8, r5, 12500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f68; +st.shared.f32 [r9+8], f78; +st.shared.f32 [r9+12], f88; +st.shared.f32 [r9+16], f98; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f102, [r11]; +ld.shared.f32 f103, [r11+2500]; +ld.shared.f32 f104, [r11+5000]; +ld.shared.f32 f105, [r11+7500]; +ld.shared.f32 f106, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +ld.shared.f32 f107, [r11]; +ld.shared.f32 f108, [r11+2500]; +ld.shared.f32 f109, [r11+5000]; +ld.shared.f32 f110, [r11+7500]; +ld.shared.f32 f111, [r11+10000]; +add.f32 f112, f103, f106; +add.f32 f113, f102, f112; +add.f32 f114, f104, f105; +add.f32 f115, f114, f113; +add.f32 f116, f108, f111; +add.f32 f117, f107, f116; +add.f32 f118, f109, f110; +add.f32 f119, f118, f117; +fma.rn.f32 f120, f112, 0f3E9E377A, f102; +mul.f32 f121, f114, 0f3F4F1BBD; +sub.f32 f122, f120, f121; +sub.f32 f123, f108, f111; +mul.f32 f124, f123, 0f3F737871; +sub.f32 f125, f109, f110; +fma.rn.f32 f126, f125, 0f3F167918, f124; +sub.f32 f127, f122, f126; +add.f32 f128, f126, f122; +mul.f32 f129, f112, 0f3F4F1BBD; +sub.f32 f130, f102, f129; +fma.rn.f32 f131, f114, 0f3E9E377A, f130; +mul.f32 f132, f123, 0f3F167918; +mul.f32 f133, f125, 0f3F737871; +sub.f32 f134, f132, f133; +sub.f32 f135, f131, f134; +add.f32 f136, f134, f131; +fma.rn.f32 f137, f116, 0f3E9E377A, f107; +mul.f32 f138, f118, 0f3F4F1BBD; +sub.f32 f139, f137, f138; +sub.f32 f140, f103, f106; +mul.f32 f141, f140, 0f3F737871; +sub.f32 f142, f104, f105; +fma.rn.f32 f143, f142, 0f3F167918, f141; +add.f32 f144, f143, f139; 
+sub.f32 f145, f139, f143; +mul.f32 f146, f116, 0f3F4F1BBD; +sub.f32 f147, f107, f146; +fma.rn.f32 f148, f118, 0f3E9E377A, f147; +mul.f32 f149, f140, 0f3F167918; +mul.f32 f150, f142, 0f3F737871; +sub.f32 f151, f149, f150; +add.f32 f152, f151, f148; +sub.f32 f153, f148, f151; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f154, f155}, [rd11]; +mul.f32 f158, f144, f155; +fma.rn.f32 f159, f154, f127, f158; +mul.f32 f160, f127, f155; +mul.f32 f161, f154, f144; +sub.f32 f162, f161, f160; +mul.f32 f163, f154, f154; +mul.f32 f164, f155, f155; +sub.f32 f165, f163, f164; +mul.f32 f166, f155, f154; +fma.rn.f32 f167, f155, f154, f166; +mul.f32 f168, f152, f167; +fma.rn.f32 f169, f165, f135, f168; +mul.f32 f170, f135, f167; +mul.f32 f171, f165, f152; +sub.f32 f172, f171, f170; +mul.f32 f173, f154, f165; +mul.f32 f174, f155, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f154, f167; +fma.rn.f32 f177, f155, f165, f176; +mul.f32 f178, f153, f177; +fma.rn.f32 f179, f175, f136, f178; +mul.f32 f180, f136, f177; +mul.f32 f181, f175, f153; +sub.f32 f182, f181, f180; +mul.f32 f183, f154, f175; +mul.f32 f184, f155, f177; +sub.f32 f185, f183, f184; +mul.f32 f186, f154, f177; +fma.rn.f32 f187, f155, f175, f186; +mul.f32 f188, f145, f187; +fma.rn.f32 f189, f185, f128, f188; +mul.f32 f190, f128, f187; +mul.f32 f191, f185, f145; +sub.f32 f192, f191, f190; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f115; +st.shared.f32 [r17+20], f159; +st.shared.f32 [r17+40], f169; +st.shared.f32 [r17+60], f179; +st.shared.f32 [r17+80], f189; +barrier.sync 0; +ld.shared.f32 f193, [r11]; +ld.shared.f32 f194, [r11+2500]; +ld.shared.f32 f195, [r11+5000]; +ld.shared.f32 f196, [r11+7500]; +ld.shared.f32 f197, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r17], f119; 
+st.shared.f32 [r17+20], f162; +st.shared.f32 [r17+40], f172; +st.shared.f32 [r17+60], f182; +st.shared.f32 [r17+80], f192; +barrier.sync 0; +ld.shared.f32 f198, [r11]; +ld.shared.f32 f199, [r11+2500]; +ld.shared.f32 f200, [r11+5000]; +ld.shared.f32 f201, [r11+7500]; +ld.shared.f32 f202, [r11+10000]; +add.f32 f203, f194, f197; +add.f32 f204, f193, f203; +add.f32 f205, f195, f196; +add.f32 f206, f205, f204; +add.f32 f207, f199, f202; +add.f32 f208, f198, f207; +add.f32 f209, f200, f201; +add.f32 f210, f209, f208; +fma.rn.f32 f211, f203, 0f3E9E377A, f193; +mul.f32 f212, f205, 0f3F4F1BBD; +sub.f32 f213, f211, f212; +sub.f32 f214, f199, f202; +mul.f32 f215, f214, 0f3F737871; +sub.f32 f216, f200, f201; +fma.rn.f32 f217, f216, 0f3F167918, f215; +sub.f32 f218, f213, f217; +add.f32 f219, f217, f213; +mul.f32 f220, f203, 0f3F4F1BBD; +sub.f32 f221, f193, f220; +fma.rn.f32 f222, f205, 0f3E9E377A, f221; +mul.f32 f223, f214, 0f3F167918; +mul.f32 f224, f216, 0f3F737871; +sub.f32 f225, f223, f224; +sub.f32 f226, f222, f225; +add.f32 f227, f225, f222; +fma.rn.f32 f228, f207, 0f3E9E377A, f198; +mul.f32 f229, f209, 0f3F4F1BBD; +sub.f32 f230, f228, f229; +sub.f32 f231, f194, f197; +mul.f32 f232, f231, 0f3F737871; +sub.f32 f233, f195, f196; +fma.rn.f32 f234, f233, 0f3F167918, f232; +add.f32 f235, f234, f230; +sub.f32 f236, f230, f234; +mul.f32 f237, f207, 0f3F4F1BBD; +sub.f32 f238, f198, f237; +fma.rn.f32 f239, f209, 0f3E9E377A, f238; +mul.f32 f240, f231, 0f3F167918; +mul.f32 f241, f233, 0f3F737871; +sub.f32 f242, f240, f241; +add.f32 f243, f242, f239; +sub.f32 f244, f239, f242; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f245, f246}, [rd16]; +mul.f32 f249, f235, f246; +fma.rn.f32 f250, f245, f218, f249; +mul.f32 f251, f218, f246; +mul.f32 f252, f245, f235; +sub.f32 f253, f252, f251; +mul.f32 f254, f245, 
f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f243, f258; +fma.rn.f32 f260, f256, f226, f259; +mul.f32 f261, f226, f258; +mul.f32 f262, f256, f243; +sub.f32 f263, f262, f261; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f244, f268; +fma.rn.f32 f270, f266, f227, f269; +mul.f32 f271, f227, f268; +mul.f32 f272, f266, f244; +sub.f32 f273, f272, f271; +mul.f32 f274, f245, f266; +mul.f32 f275, f246, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f245, f268; +fma.rn.f32 f278, f246, f266, f277; +mul.f32 f279, f236, f278; +fma.rn.f32 f280, f276, f219, f279; +mul.f32 f281, f219, f278; +mul.f32 f282, f276, f236; +sub.f32 f283, f282, f281; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 500, r22; +st.shared.f32 [r23], f206; +st.shared.f32 [r23+100], f250; +st.shared.f32 [r23+200], f260; +st.shared.f32 [r23+300], f270; +st.shared.f32 [r23+400], f280; +barrier.sync 0; +ld.shared.f32 f284, [r11]; +ld.shared.f32 f285, [r11+2500]; +ld.shared.f32 f286, [r11+5000]; +ld.shared.f32 f287, [r11+7500]; +ld.shared.f32 f288, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r23], f210; +st.shared.f32 [r23+100], f253; +st.shared.f32 [r23+200], f263; +st.shared.f32 [r23+300], f273; +st.shared.f32 [r23+400], f283; +barrier.sync 0; +ld.shared.f32 f289, [r11]; +ld.shared.f32 f290, [r11+2500]; +ld.shared.f32 f291, [r11+5000]; +ld.shared.f32 f292, [r11+7500]; +ld.shared.f32 f293, [r11+10000]; +add.f32 f294, f285, f288; +add.f32 f295, f284, f294; +add.f32 f296, f286, f287; +add.f32 f297, f296, f295; +add.f32 f298, f290, f293; +add.f32 f299, f289, f298; +add.f32 f300, f291, f292; +add.f32 f301, f300, f299; +fma.rn.f32 f302, f294, 0f3E9E377A, f284; +mul.f32 f303, f296, 0f3F4F1BBD; +sub.f32 f304, f302, f303; +sub.f32 f305, f290, f293; +mul.f32 f306, f305, 0f3F737871; 
+sub.f32 f307, f291, f292; +fma.rn.f32 f308, f307, 0f3F167918, f306; +sub.f32 f309, f304, f308; +add.f32 f310, f308, f304; +mul.f32 f311, f294, 0f3F4F1BBD; +sub.f32 f312, f284, f311; +fma.rn.f32 f313, f296, 0f3E9E377A, f312; +mul.f32 f314, f305, 0f3F167918; +mul.f32 f315, f307, 0f3F737871; +sub.f32 f316, f314, f315; +sub.f32 f317, f313, f316; +add.f32 f318, f316, f313; +fma.rn.f32 f319, f298, 0f3E9E377A, f289; +mul.f32 f320, f300, 0f3F4F1BBD; +sub.f32 f321, f319, f320; +sub.f32 f322, f285, f288; +mul.f32 f323, f322, 0f3F737871; +sub.f32 f324, f286, f287; +fma.rn.f32 f325, f324, 0f3F167918, f323; +add.f32 f326, f325, f321; +sub.f32 f327, f321, f325; +mul.f32 f328, f298, 0f3F4F1BBD; +sub.f32 f329, f289, f328; +fma.rn.f32 f330, f300, 0f3E9E377A, f329; +mul.f32 f331, f322, 0f3F167918; +mul.f32 f332, f324, 0f3F737871; +sub.f32 f333, f331, f332; +add.f32 f334, f333, f330; +sub.f32 f335, f330, f333; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 8; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f336, f337}, [rd21]; +mul.f32 f340, f326, f337; +fma.rn.f32 f341, f336, f309, f340; +mul.f32 f342, f309, f337; +mul.f32 f343, f336, f326; +sub.f32 f344, f343, f342; +mul.f32 f345, f336, f336; +mul.f32 f346, f337, f337; +sub.f32 f347, f345, f346; +mul.f32 f348, f337, f336; +fma.rn.f32 f349, f337, f336, f348; +mul.f32 f350, f334, f349; +fma.rn.f32 f351, f347, f317, f350; +mul.f32 f352, f317, f349; +mul.f32 f353, f347, f334; +sub.f32 f354, f353, f352; +mul.f32 f355, f336, f347; +mul.f32 f356, f337, f349; +sub.f32 f357, f355, f356; +mul.f32 f358, f336, f349; +fma.rn.f32 f359, f337, f347, f358; +mul.f32 f360, f335, f359; +fma.rn.f32 f361, f357, f318, f360; +mul.f32 f362, f318, f359; +mul.f32 f363, f357, f335; +sub.f32 f364, f363, f362; +mul.f32 f365, f336, f357; +mul.f32 f366, f337, f359; +sub.f32 f367, f365, f366; +mul.f32 f368, f336, f359; +fma.rn.f32 
f369, f337, f357, f368; +mul.f32 f370, f327, f369; +fma.rn.f32 f371, f367, f310, f370; +mul.f32 f372, f310, f369; +mul.f32 f373, f367, f327; +sub.f32 f374, f373, f372; +shl.b32 r27, r26, 2; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 2500, r28; +st.shared.f32 [r29], f297; +st.shared.f32 [r29+500], f341; +st.shared.f32 [r29+1000], f351; +st.shared.f32 [r29+1500], f361; +st.shared.f32 [r29+2000], f371; +barrier.sync 0; +ld.shared.f32 f375, [r11]; +ld.shared.f32 f376, [r11+2500]; +ld.shared.f32 f377, [r11+5000]; +ld.shared.f32 f378, [r11+7500]; +ld.shared.f32 f379, [r11+10000]; +barrier.sync 0; +st.shared.f32 [r29], f301; +st.shared.f32 [r29+500], f344; +st.shared.f32 [r29+1000], f354; +st.shared.f32 [r29+1500], f364; +st.shared.f32 [r29+2000], f374; +barrier.sync 0; +ld.shared.f32 f380, [r11]; +ld.shared.f32 f381, [r11+2500]; +ld.shared.f32 f382, [r11+5000]; +ld.shared.f32 f383, [r11+7500]; +ld.shared.f32 f384, [r11+10000]; +add.f32 f385, f376, f379; +add.f32 f386, f375, f385; +add.f32 f387, f377, f378; +add.f32 f388, f381, f384; +add.f32 f389, f380, f388; +add.f32 f390, f382, f383; +fma.rn.f32 f391, f385, 0f3E9E377A, f375; +mul.f32 f392, f387, 0f3F4F1BBD; +sub.f32 f393, f391, f392; +sub.f32 f394, f381, f384; +mul.f32 f395, f394, 0f3F737871; +sub.f32 f396, f382, f383; +fma.rn.f32 f397, f396, 0f3F167918, f395; +mul.f32 f398, f385, 0f3F4F1BBD; +sub.f32 f399, f375, f398; +fma.rn.f32 f400, f387, 0f3E9E377A, f399; +mul.f32 f401, f394, 0f3F167918; +mul.f32 f402, f396, 0f3F737871; +sub.f32 f403, f401, f402; +fma.rn.f32 f404, f388, 0f3E9E377A, f380; +mul.f32 f405, f390, 0f3F4F1BBD; +sub.f32 f406, f404, f405; +sub.f32 f407, f376, f379; +mul.f32 f408, f407, 0f3F737871; +sub.f32 f409, f377, f378; +fma.rn.f32 f410, f409, 0f3F167918, f408; +mul.f32 f411, f388, 0f3F4F1BBD; +sub.f32 f412, f380, f411; +fma.rn.f32 f413, f390, 0f3E9E377A, f412; +mul.f32 f414, f407, 0f3F167918; +mul.f32 f415, f409, 0f3F737871; +sub.f32 f416, f414, f415; +add.f32 %0, f387, f386; 
+add.f32 %1, f390, f389; +add.f32 %3, f410, f406; +sub.f32 %2, f393, f397; +sub.f32 %4, f400, f403; +add.f32 %5, f416, f413; +add.f32 %6, f403, f400; +sub.f32 %7, f413, f416; +sub.f32 %9, f406, f410; +add.f32 %8, f397, f393; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_3125), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..f17e387e1c323 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_fwd.hpp.inc @@ -0,0 +1,968 @@ +#ifndef CUFFTDX_FFT_3125_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_3125_FP64_FWD_PTX_HPP
+/* cufftdx_private_function<545, double, 1>: machine-generated inline PTX from the cuFFTDx database (fft_3125_fp64_fwd); regenerate rather than hand-edit. Takes five complex doubles in rmem[0..4] (inputs %15-%26) and writes five back to the same slots (outputs %0-%9). */
+/* Body: five 5-point butterflies (constants ~0.309017, 0.809017, 0.951057, 0.587785 = cos/sin of 72 and 36 degrees). After each of the first four, values are multiplied by entries loaded from lut_dp_5_3125, _625, _125, _25 (%11-%14) and exchanged through shared memory (base address smem, %10; offsets derived from tid.x and tid.y) between barrier.sync 0 pairs, real parts in one pass and imaginary parts in the next. */
+/* rmem[1].y and rmem[2].y are each listed twice in the input constraints; the asm reads only %19 and %22, so %18 and %21 are unused placeholders - removing them would renumber every later operand. NOTE(review): barrier.sync 0 means this presumably must be called by all threads of the block together - confirm against the caller. */
+template<> __forceinline__ __device__ void cufftdx_private_function<545, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<30>; +.reg .f64 fd<433>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 25000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %17, %25; +add.f64 fd22, %15, fd21; +add.f64 fd23, %20, %23; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %19, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %22, %24; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %15; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %19, %26; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %22, %24; 
+mul.f64 fd35, fd34, 0dBFE2CF2304755A5E; +sub.f64 fd36, fd35, fd33; +sub.f64 fd37, fd31, fd36; +add.f64 fd38, fd36, fd31; +mul.f64 fd39, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd40, %15, fd39; +fma.rn.f64 fd41, fd23, 0d3FD3C6EF372FE950, fd40; +mul.f64 fd42, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd43, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd44, fd43, fd42; +sub.f64 fd45, fd41, fd44; +add.f64 fd46, fd44, fd41; +fma.rn.f64 fd47, fd25, 0d3FD3C6EF372FE950, %16; +mul.f64 fd48, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd49, fd47, fd48; +sub.f64 fd50, %17, %25; +mul.f64 fd51, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd52, %20, %23; +mul.f64 fd53, fd52, 0dBFE2CF2304755A5E; +sub.f64 fd54, fd53, fd51; +add.f64 fd55, fd54, fd49; +sub.f64 fd56, fd49, fd54; +mul.f64 fd57, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd58, %16, fd57; +fma.rn.f64 fd59, fd27, 0d3FD3C6EF372FE950, fd58; +mul.f64 fd60, fd50, 0d3FE2CF2304755A5E; +mul.f64 fd61, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd62, fd61, fd60; +add.f64 fd63, fd62, fd59; +sub.f64 fd64, fd59, fd62; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd65, fd66}, [rd6]; +mul.f64 fd69, fd65, fd37; +mul.f64 fd70, fd66, fd55; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd65, fd55; +fma.rn.f64 fd73, fd66, fd37, fd72; +mul.f64 fd74, fd65, fd65; +mul.f64 fd75, fd66, fd66; +sub.f64 fd76, fd74, fd75; +mul.f64 fd77, fd66, fd65; +fma.rn.f64 fd78, fd66, fd65, fd77; +mul.f64 fd79, fd76, fd45; +mul.f64 fd80, fd78, fd63; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd76, fd63; +fma.rn.f64 fd83, fd78, fd45, fd82; +ld.global.v2.f64 {fd84, fd85}, [rd6+10000]; +mul.f64 fd88, fd84, fd46; +mul.f64 fd89, fd85, fd64; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd84, fd64; +fma.rn.f64 fd92, fd85, fd46, fd91; +mul.f64 fd93, fd65, fd84; +mul.f64 fd94, fd66, fd85; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd65, fd85; +fma.rn.f64 fd97, fd66, 
fd84, fd96; +mul.f64 fd98, fd95, fd38; +mul.f64 fd99, fd97, fd56; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd95, fd56; +fma.rn.f64 fd102, fd97, fd38, fd101; +mad.lo.s32 r8, r5, 25000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd103, [r11]; +ld.shared.f64 fd104, [r11+5000]; +ld.shared.f64 fd105, [r11+10000]; +ld.shared.f64 fd106, [r11+15000]; +ld.shared.f64 fd107, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd73; +st.shared.f64 [r9+16], fd83; +st.shared.f64 [r9+24], fd92; +st.shared.f64 [r9+32], fd102; +barrier.sync 0; +ld.shared.f64 fd108, [r11]; +ld.shared.f64 fd109, [r11+5000]; +ld.shared.f64 fd110, [r11+10000]; +ld.shared.f64 fd111, [r11+15000]; +ld.shared.f64 fd112, [r11+20000]; +add.f64 fd113, fd104, fd107; +add.f64 fd114, fd103, fd113; +add.f64 fd115, fd105, fd106; +add.f64 fd116, fd115, fd114; +add.f64 fd117, fd109, fd112; +add.f64 fd118, fd108, fd117; +add.f64 fd119, fd110, fd111; +add.f64 fd120, fd119, fd118; +fma.rn.f64 fd121, fd113, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd122, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd109, fd112; +mul.f64 fd125, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd110, fd111; +mul.f64 fd127, fd126, 0dBFE2CF2304755A5E; +sub.f64 fd128, fd127, fd125; +sub.f64 fd129, fd123, fd128; +add.f64 fd130, fd128, fd123; +mul.f64 fd131, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd132, fd103, fd131; +fma.rn.f64 fd133, fd115, 0d3FD3C6EF372FE950, fd132; +mul.f64 fd134, fd124, 0d3FE2CF2304755A5E; +mul.f64 fd135, fd126, 0d3FEE6F0E134454FF; +sub.f64 fd136, fd135, fd134; +sub.f64 fd137, fd133, fd136; +add.f64 fd138, fd136, fd133; +fma.rn.f64 fd139, fd117, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd140, fd119, 0d3FE9E3779B97F4A8; +sub.f64 fd141, fd139, fd140; +sub.f64 
fd142, fd104, fd107; +mul.f64 fd143, fd142, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd105, fd106; +mul.f64 fd145, fd144, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd145, fd143; +add.f64 fd147, fd146, fd141; +sub.f64 fd148, fd141, fd146; +mul.f64 fd149, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd150, fd108, fd149; +fma.rn.f64 fd151, fd119, 0d3FD3C6EF372FE950, fd150; +mul.f64 fd152, fd142, 0d3FE2CF2304755A5E; +mul.f64 fd153, fd144, 0d3FEE6F0E134454FF; +sub.f64 fd154, fd153, fd152; +add.f64 fd155, fd154, fd151; +sub.f64 fd156, fd151, fd154; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd157, fd158}, [rd11]; +mul.f64 fd161, fd157, fd129; +mul.f64 fd162, fd158, fd147; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd157, fd147; +fma.rn.f64 fd165, fd158, fd129, fd164; +mul.f64 fd166, fd157, fd157; +mul.f64 fd167, fd158, fd158; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd158, fd157; +fma.rn.f64 fd170, fd158, fd157, fd169; +mul.f64 fd171, fd168, fd137; +mul.f64 fd172, fd170, fd155; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd168, fd155; +fma.rn.f64 fd175, fd170, fd137, fd174; +ld.global.v2.f64 {fd176, fd177}, [rd11+2000]; +mul.f64 fd180, fd176, fd138; +mul.f64 fd181, fd177, fd156; +sub.f64 fd182, fd180, fd181; +mul.f64 fd183, fd176, fd156; +fma.rn.f64 fd184, fd177, fd138, fd183; +mul.f64 fd185, fd157, fd176; +mul.f64 fd186, fd158, fd177; +sub.f64 fd187, fd185, fd186; +mul.f64 fd188, fd157, fd177; +fma.rn.f64 fd189, fd158, fd176, fd188; +mul.f64 fd190, fd187, fd130; +mul.f64 fd191, fd189, fd148; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd187, fd148; +fma.rn.f64 fd194, fd189, fd130, fd193; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd116; +st.shared.f64 [r17+40], fd163; +st.shared.f64 [r17+80], fd173; +st.shared.f64 [r17+120], 
fd182; +st.shared.f64 [r17+160], fd192; +barrier.sync 0; +ld.shared.f64 fd195, [r11]; +ld.shared.f64 fd196, [r11+5000]; +ld.shared.f64 fd197, [r11+10000]; +ld.shared.f64 fd198, [r11+15000]; +ld.shared.f64 fd199, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r17], fd120; +st.shared.f64 [r17+40], fd165; +st.shared.f64 [r17+80], fd175; +st.shared.f64 [r17+120], fd184; +st.shared.f64 [r17+160], fd194; +barrier.sync 0; +ld.shared.f64 fd200, [r11]; +ld.shared.f64 fd201, [r11+5000]; +ld.shared.f64 fd202, [r11+10000]; +ld.shared.f64 fd203, [r11+15000]; +ld.shared.f64 fd204, [r11+20000]; +add.f64 fd205, fd196, fd199; +add.f64 fd206, fd195, fd205; +add.f64 fd207, fd197, fd198; +add.f64 fd208, fd207, fd206; +add.f64 fd209, fd201, fd204; +add.f64 fd210, fd200, fd209; +add.f64 fd211, fd202, fd203; +add.f64 fd212, fd211, fd210; +fma.rn.f64 fd213, fd205, 0d3FD3C6EF372FE950, fd195; +mul.f64 fd214, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd215, fd213, fd214; +sub.f64 fd216, fd201, fd204; +mul.f64 fd217, fd216, 0d3FEE6F0E134454FF; +sub.f64 fd218, fd202, fd203; +mul.f64 fd219, fd218, 0dBFE2CF2304755A5E; +sub.f64 fd220, fd219, fd217; +sub.f64 fd221, fd215, fd220; +add.f64 fd222, fd220, fd215; +mul.f64 fd223, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd224, fd195, fd223; +fma.rn.f64 fd225, fd207, 0d3FD3C6EF372FE950, fd224; +mul.f64 fd226, fd216, 0d3FE2CF2304755A5E; +mul.f64 fd227, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd228, fd227, fd226; +sub.f64 fd229, fd225, fd228; +add.f64 fd230, fd228, fd225; +fma.rn.f64 fd231, fd209, 0d3FD3C6EF372FE950, fd200; +mul.f64 fd232, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd233, fd231, fd232; +sub.f64 fd234, fd196, fd199; +mul.f64 fd235, fd234, 0d3FEE6F0E134454FF; +sub.f64 fd236, fd197, fd198; +mul.f64 fd237, fd236, 0dBFE2CF2304755A5E; +sub.f64 fd238, fd237, fd235; +add.f64 fd239, fd238, fd233; +sub.f64 fd240, fd233, fd238; +mul.f64 fd241, fd209, 0d3FE9E3779B97F4A8; +sub.f64 fd242, fd200, fd241; +fma.rn.f64 fd243, fd211, 0d3FD3C6EF372FE950, fd242; +mul.f64 fd244, 
fd234, 0d3FE2CF2304755A5E; +mul.f64 fd245, fd236, 0d3FEE6F0E134454FF; +sub.f64 fd246, fd245, fd244; +add.f64 fd247, fd246, fd243; +sub.f64 fd248, fd243, fd246; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd249, fd250}, [rd16]; +mul.f64 fd253, fd249, fd221; +mul.f64 fd254, fd250, fd239; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd249, fd239; +fma.rn.f64 fd257, fd250, fd221, fd256; +mul.f64 fd258, fd249, fd249; +mul.f64 fd259, fd250, fd250; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd250, fd249; +fma.rn.f64 fd262, fd250, fd249, fd261; +mul.f64 fd263, fd260, fd229; +mul.f64 fd264, fd262, fd247; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd260, fd247; +fma.rn.f64 fd267, fd262, fd229, fd266; +ld.global.v2.f64 {fd268, fd269}, [rd16+400]; +mul.f64 fd272, fd268, fd230; +mul.f64 fd273, fd269, fd248; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd268, fd248; +fma.rn.f64 fd276, fd269, fd230, fd275; +mul.f64 fd277, fd249, fd268; +mul.f64 fd278, fd250, fd269; +sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd249, fd269; +fma.rn.f64 fd281, fd250, fd268, fd280; +mul.f64 fd282, fd279, fd222; +mul.f64 fd283, fd281, fd240; +sub.f64 fd284, fd282, fd283; +mul.f64 fd285, fd279, fd240; +fma.rn.f64 fd286, fd281, fd222, fd285; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +st.shared.f64 [r23], fd208; +st.shared.f64 [r23+200], fd255; +st.shared.f64 [r23+400], fd265; +st.shared.f64 [r23+600], fd274; +st.shared.f64 [r23+800], fd284; +barrier.sync 0; +ld.shared.f64 fd287, [r11]; +ld.shared.f64 fd288, [r11+5000]; +ld.shared.f64 fd289, [r11+10000]; +ld.shared.f64 fd290, [r11+15000]; +ld.shared.f64 fd291, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r23], fd212; +st.shared.f64 [r23+200], fd257; +st.shared.f64 [r23+400], fd267; +st.shared.f64 [r23+600], 
fd276; +st.shared.f64 [r23+800], fd286; +barrier.sync 0; +ld.shared.f64 fd292, [r11]; +ld.shared.f64 fd293, [r11+5000]; +ld.shared.f64 fd294, [r11+10000]; +ld.shared.f64 fd295, [r11+15000]; +ld.shared.f64 fd296, [r11+20000]; +add.f64 fd297, fd288, fd291; +add.f64 fd298, fd287, fd297; +add.f64 fd299, fd289, fd290; +add.f64 fd300, fd299, fd298; +add.f64 fd301, fd293, fd296; +add.f64 fd302, fd292, fd301; +add.f64 fd303, fd294, fd295; +add.f64 fd304, fd303, fd302; +fma.rn.f64 fd305, fd297, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd306, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd307, fd305, fd306; +sub.f64 fd308, fd293, fd296; +mul.f64 fd309, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd310, fd294, fd295; +mul.f64 fd311, fd310, 0dBFE2CF2304755A5E; +sub.f64 fd312, fd311, fd309; +sub.f64 fd313, fd307, fd312; +add.f64 fd314, fd312, fd307; +mul.f64 fd315, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd316, fd287, fd315; +fma.rn.f64 fd317, fd299, 0d3FD3C6EF372FE950, fd316; +mul.f64 fd318, fd308, 0d3FE2CF2304755A5E; +mul.f64 fd319, fd310, 0d3FEE6F0E134454FF; +sub.f64 fd320, fd319, fd318; +sub.f64 fd321, fd317, fd320; +add.f64 fd322, fd320, fd317; +fma.rn.f64 fd323, fd301, 0d3FD3C6EF372FE950, fd292; +mul.f64 fd324, fd303, 0d3FE9E3779B97F4A8; +sub.f64 fd325, fd323, fd324; +sub.f64 fd326, fd288, fd291; +mul.f64 fd327, fd326, 0d3FEE6F0E134454FF; +sub.f64 fd328, fd289, fd290; +mul.f64 fd329, fd328, 0dBFE2CF2304755A5E; +sub.f64 fd330, fd329, fd327; +add.f64 fd331, fd330, fd325; +sub.f64 fd332, fd325, fd330; +mul.f64 fd333, fd301, 0d3FE9E3779B97F4A8; +sub.f64 fd334, fd292, fd333; +fma.rn.f64 fd335, fd303, 0d3FD3C6EF372FE950, fd334; +mul.f64 fd336, fd326, 0d3FE2CF2304755A5E; +mul.f64 fd337, fd328, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd337, fd336; +add.f64 fd339, fd338, fd335; +sub.f64 fd340, fd335, fd338; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 16; +mov.u64 rd20, %14; +add.s64 rd21, rd20, 
rd19; +ld.global.v2.f64 {fd341, fd342}, [rd21]; +mul.f64 fd345, fd341, fd313; +mul.f64 fd346, fd342, fd331; +sub.f64 fd347, fd345, fd346; +mul.f64 fd348, fd341, fd331; +fma.rn.f64 fd349, fd342, fd313, fd348; +mul.f64 fd350, fd341, fd341; +mul.f64 fd351, fd342, fd342; +sub.f64 fd352, fd350, fd351; +mul.f64 fd353, fd342, fd341; +fma.rn.f64 fd354, fd342, fd341, fd353; +mul.f64 fd355, fd352, fd321; +mul.f64 fd356, fd354, fd339; +sub.f64 fd357, fd355, fd356; +mul.f64 fd358, fd352, fd339; +fma.rn.f64 fd359, fd354, fd321, fd358; +ld.global.v2.f64 {fd360, fd361}, [rd21+80]; +mul.f64 fd364, fd360, fd322; +mul.f64 fd365, fd361, fd340; +sub.f64 fd366, fd364, fd365; +mul.f64 fd367, fd360, fd340; +fma.rn.f64 fd368, fd361, fd322, fd367; +mul.f64 fd369, fd341, fd360; +mul.f64 fd370, fd342, fd361; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd341, fd361; +fma.rn.f64 fd373, fd342, fd360, fd372; +mul.f64 fd374, fd371, fd314; +mul.f64 fd375, fd373, fd332; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd371, fd332; +fma.rn.f64 fd378, fd373, fd314, fd377; +shl.b32 r27, r26, 3; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 5000, r28; +st.shared.f64 [r29], fd300; +st.shared.f64 [r29+1000], fd347; +st.shared.f64 [r29+2000], fd357; +st.shared.f64 [r29+3000], fd366; +st.shared.f64 [r29+4000], fd376; +barrier.sync 0; +ld.shared.f64 fd379, [r11]; +ld.shared.f64 fd380, [r11+5000]; +ld.shared.f64 fd381, [r11+10000]; +ld.shared.f64 fd382, [r11+15000]; +ld.shared.f64 fd383, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r29], fd304; +st.shared.f64 [r29+1000], fd349; +st.shared.f64 [r29+2000], fd359; +st.shared.f64 [r29+3000], fd368; +st.shared.f64 [r29+4000], fd378; +barrier.sync 0; +ld.shared.f64 fd384, [r11]; +ld.shared.f64 fd385, [r11+5000]; +ld.shared.f64 fd386, [r11+10000]; +ld.shared.f64 fd387, [r11+15000]; +ld.shared.f64 fd388, [r11+20000]; +add.f64 fd389, fd380, fd383; +add.f64 fd390, fd379, fd389; +add.f64 fd391, fd381, fd382; +add.f64 fd392, fd385, fd388; +add.f64 
fd393, fd384, fd392; +add.f64 fd394, fd386, fd387; +fma.rn.f64 fd395, fd389, 0d3FD3C6EF372FE950, fd379; +mul.f64 fd396, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd397, fd395, fd396; +sub.f64 fd398, fd385, fd388; +mul.f64 fd399, fd398, 0d3FEE6F0E134454FF; +sub.f64 fd400, fd386, fd387; +mul.f64 fd401, fd400, 0dBFE2CF2304755A5E; +sub.f64 fd402, fd401, fd399; +mul.f64 fd403, fd389, 0d3FE9E3779B97F4A8; +sub.f64 fd404, fd379, fd403; +fma.rn.f64 fd405, fd391, 0d3FD3C6EF372FE950, fd404; +mul.f64 fd406, fd398, 0d3FE2CF2304755A5E; +mul.f64 fd407, fd400, 0d3FEE6F0E134454FF; +sub.f64 fd408, fd407, fd406; +fma.rn.f64 fd409, fd392, 0d3FD3C6EF372FE950, fd384; +mul.f64 fd410, fd394, 0d3FE9E3779B97F4A8; +sub.f64 fd411, fd409, fd410; +sub.f64 fd412, fd380, fd383; +mul.f64 fd413, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd381, fd382; +mul.f64 fd415, fd414, 0dBFE2CF2304755A5E; +sub.f64 fd416, fd415, fd413; +mul.f64 fd417, fd392, 0d3FE9E3779B97F4A8; +sub.f64 fd418, fd384, fd417; +fma.rn.f64 fd419, fd394, 0d3FD3C6EF372FE950, fd418; +mul.f64 fd420, fd412, 0d3FE2CF2304755A5E; +mul.f64 fd421, fd414, 0d3FEE6F0E134454FF; +sub.f64 fd422, fd421, fd420; +add.f64 %0, fd391, fd390; +add.f64 %1, fd394, fd393; +add.f64 %3, fd416, fd411; +sub.f64 %2, fd397, fd402; +sub.f64 %4, fd405, fd408; +add.f64 %5, fd422, fd419; +add.f64 %6, fd408, fd405; +sub.f64 %7, fd419, fd422; +sub.f64 %9, fd411, fd416; +add.f64 %8, fd402, fd397; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_3125), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<544, double, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<30>; +.reg .f64 fd<473>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 50000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %17, %25; +add.f64 fd22, %15, fd21; +add.f64 fd23, %20, %23; +add.f64 fd24, %19, %26; +add.f64 fd25, %16, fd24; +add.f64 fd26, %22, %24; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %15; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %19, %26; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %22, %24; +mul.f64 fd33, fd32, 0dBFE2CF2304755A5E; +sub.f64 fd34, fd33, fd31; +sub.f64 fd35, fd29, fd34; +add.f64 fd36, fd34, fd29; +mul.f64 fd37, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd38, %15, fd37; +fma.rn.f64 fd39, fd23, 0d3FD3C6EF372FE950, fd38; +mul.f64 fd40, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd41, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd42, fd41, fd40; +sub.f64 fd43, fd39, fd42; +add.f64 fd44, fd42, fd39; +fma.rn.f64 fd45, fd24, 0d3FD3C6EF372FE950, %16; +mul.f64 fd46, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd47, fd45, fd46; +sub.f64 fd48, %17, %25; +mul.f64 fd49, fd48, 0d3FEE6F0E134454FF; +sub.f64 fd50, %20, %23; +mul.f64 fd51, fd50, 0dBFE2CF2304755A5E; +sub.f64 fd52, fd51, fd49; +add.f64 fd53, fd52, fd47; +sub.f64 fd54, fd47, fd52; +mul.f64 fd55, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %16, fd55; +fma.rn.f64 fd57, fd26, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd48, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd59, fd58; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 50000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd63, fd35; +mul.f64 fd68, fd64, fd53; +mul.f64 fd69, fd63, fd53; +mul.f64 fd70, fd63, fd63; +mul.f64 fd71, fd64, fd64; +sub.f64 fd72, fd70, fd71; +mul.f64 
fd73, fd64, fd63; +fma.rn.f64 fd74, fd64, fd63, fd73; +mul.f64 fd75, fd72, fd43; +mul.f64 fd76, fd74, fd61; +mul.f64 fd77, fd72, fd61; +ld.global.v2.f64 {fd78, fd79}, [rd6+10000]; +mul.f64 fd82, fd78, fd44; +mul.f64 fd83, fd79, fd62; +mul.f64 fd84, fd78, fd62; +mul.f64 fd85, fd63, fd78; +mul.f64 fd86, fd64, fd79; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd63, fd79; +fma.rn.f64 fd89, fd64, fd78, fd88; +mul.f64 fd90, fd87, fd36; +mul.f64 fd91, fd89, fd54; +mul.f64 fd92, fd87, fd54; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd93, fd26, fd25; +add.f64 fd94, fd23, fd22; +st.shared.v2.f64 [r9], {fd94, fd93}; +fma.rn.f64 fd95, fd64, fd35, fd69; +sub.f64 fd96, fd67, fd68; +st.shared.v2.f64 [r9+16], {fd96, fd95}; +fma.rn.f64 fd97, fd74, fd43, fd77; +sub.f64 fd98, fd75, fd76; +st.shared.v2.f64 [r9+32], {fd98, fd97}; +fma.rn.f64 fd99, fd79, fd44, fd84; +sub.f64 fd100, fd82, fd83; +st.shared.v2.f64 [r9+48], {fd100, fd99}; +fma.rn.f64 fd101, fd89, fd36, fd92; +sub.f64 fd102, fd90, fd91; +st.shared.v2.f64 [r9+64], {fd102, fd101}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd103, fd104}, [r11]; +ld.shared.v2.f64 {fd107, fd108}, [r11+10000]; +ld.shared.v2.f64 {fd111, fd112}, [r11+20000]; +ld.shared.v2.f64 {fd115, fd116}, [r11+30000]; +ld.shared.v2.f64 {fd119, fd120}, [r11+40000]; +add.f64 fd123, fd107, fd119; +add.f64 fd124, fd103, fd123; +add.f64 fd125, fd111, fd115; +add.f64 fd126, fd108, fd120; +add.f64 fd127, fd104, fd126; +add.f64 fd128, fd112, fd116; +fma.rn.f64 fd129, fd123, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd130, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, fd108, fd120; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, fd112, fd116; +mul.f64 fd135, fd134, 0dBFE2CF2304755A5E; +sub.f64 fd136, fd135, fd133; +sub.f64 fd137, fd131, fd136; +add.f64 fd138, fd136, fd131; +mul.f64 fd139, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd140, fd103, fd139; +fma.rn.f64 fd141, fd125, 0d3FD3C6EF372FE950, 
fd140; +mul.f64 fd142, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd143, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd143, fd142; +sub.f64 fd145, fd141, fd144; +add.f64 fd146, fd144, fd141; +fma.rn.f64 fd147, fd126, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd148, fd128, 0d3FE9E3779B97F4A8; +sub.f64 fd149, fd147, fd148; +sub.f64 fd150, fd107, fd119; +mul.f64 fd151, fd150, 0d3FEE6F0E134454FF; +sub.f64 fd152, fd111, fd115; +mul.f64 fd153, fd152, 0dBFE2CF2304755A5E; +sub.f64 fd154, fd153, fd151; +add.f64 fd155, fd154, fd149; +sub.f64 fd156, fd149, fd154; +mul.f64 fd157, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd158, fd104, fd157; +fma.rn.f64 fd159, fd128, 0d3FD3C6EF372FE950, fd158; +mul.f64 fd160, fd150, 0d3FE2CF2304755A5E; +mul.f64 fd161, fd152, 0d3FEE6F0E134454FF; +sub.f64 fd162, fd161, fd160; +add.f64 fd163, fd162, fd159; +sub.f64 fd164, fd159, fd162; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd165, fd166}, [rd11]; +mul.f64 fd169, fd165, fd137; +mul.f64 fd170, fd166, fd155; +mul.f64 fd171, fd165, fd155; +mul.f64 fd172, fd165, fd165; +mul.f64 fd173, fd166, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd166, fd165; +fma.rn.f64 fd176, fd166, fd165, fd175; +mul.f64 fd177, fd174, fd145; +mul.f64 fd178, fd176, fd163; +mul.f64 fd179, fd174, fd163; +ld.global.v2.f64 {fd180, fd181}, [rd11+2000]; +mul.f64 fd184, fd180, fd146; +mul.f64 fd185, fd181, fd164; +mul.f64 fd186, fd180, fd164; +mul.f64 fd187, fd165, fd180; +mul.f64 fd188, fd166, fd181; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd165, fd181; +fma.rn.f64 fd191, fd166, fd180, fd190; +mul.f64 fd192, fd189, fd138; +mul.f64 fd193, fd191, fd156; +mul.f64 fd194, fd189, fd156; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd195, fd128, fd127; +add.f64 fd196, fd125, fd124; +st.shared.v2.f64 [r17], 
{fd196, fd195}; +fma.rn.f64 fd197, fd166, fd137, fd171; +sub.f64 fd198, fd169, fd170; +st.shared.v2.f64 [r17+80], {fd198, fd197}; +fma.rn.f64 fd199, fd176, fd145, fd179; +sub.f64 fd200, fd177, fd178; +st.shared.v2.f64 [r17+160], {fd200, fd199}; +fma.rn.f64 fd201, fd181, fd146, fd186; +sub.f64 fd202, fd184, fd185; +st.shared.v2.f64 [r17+240], {fd202, fd201}; +fma.rn.f64 fd203, fd191, fd138, fd194; +sub.f64 fd204, fd192, fd193; +st.shared.v2.f64 [r17+320], {fd204, fd203}; +barrier.sync 0; +ld.shared.v2.f64 {fd205, fd206}, [r11]; +ld.shared.v2.f64 {fd209, fd210}, [r11+10000]; +ld.shared.v2.f64 {fd213, fd214}, [r11+20000]; +ld.shared.v2.f64 {fd217, fd218}, [r11+30000]; +ld.shared.v2.f64 {fd221, fd222}, [r11+40000]; +add.f64 fd225, fd209, fd221; +add.f64 fd226, fd205, fd225; +add.f64 fd227, fd213, fd217; +add.f64 fd228, fd210, fd222; +add.f64 fd229, fd206, fd228; +add.f64 fd230, fd214, fd218; +fma.rn.f64 fd231, fd225, 0d3FD3C6EF372FE950, fd205; +mul.f64 fd232, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd233, fd231, fd232; +sub.f64 fd234, fd210, fd222; +mul.f64 fd235, fd234, 0d3FEE6F0E134454FF; +sub.f64 fd236, fd214, fd218; +mul.f64 fd237, fd236, 0dBFE2CF2304755A5E; +sub.f64 fd238, fd237, fd235; +sub.f64 fd239, fd233, fd238; +add.f64 fd240, fd238, fd233; +mul.f64 fd241, fd225, 0d3FE9E3779B97F4A8; +sub.f64 fd242, fd205, fd241; +fma.rn.f64 fd243, fd227, 0d3FD3C6EF372FE950, fd242; +mul.f64 fd244, fd234, 0d3FE2CF2304755A5E; +mul.f64 fd245, fd236, 0d3FEE6F0E134454FF; +sub.f64 fd246, fd245, fd244; +sub.f64 fd247, fd243, fd246; +add.f64 fd248, fd246, fd243; +fma.rn.f64 fd249, fd228, 0d3FD3C6EF372FE950, fd206; +mul.f64 fd250, fd230, 0d3FE9E3779B97F4A8; +sub.f64 fd251, fd249, fd250; +sub.f64 fd252, fd209, fd221; +mul.f64 fd253, fd252, 0d3FEE6F0E134454FF; +sub.f64 fd254, fd213, fd217; +mul.f64 fd255, fd254, 0dBFE2CF2304755A5E; +sub.f64 fd256, fd255, fd253; +add.f64 fd257, fd256, fd251; +sub.f64 fd258, fd251, fd256; +mul.f64 fd259, fd228, 0d3FE9E3779B97F4A8; +sub.f64 fd260, fd206, fd259; 
+fma.rn.f64 fd261, fd230, 0d3FD3C6EF372FE950, fd260; +mul.f64 fd262, fd252, 0d3FE2CF2304755A5E; +mul.f64 fd263, fd254, 0d3FEE6F0E134454FF; +sub.f64 fd264, fd263, fd262; +add.f64 fd265, fd264, fd261; +sub.f64 fd266, fd261, fd264; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd267, fd268}, [rd16]; +mul.f64 fd271, fd267, fd239; +mul.f64 fd272, fd268, fd257; +mul.f64 fd273, fd267, fd257; +mul.f64 fd274, fd267, fd267; +mul.f64 fd275, fd268, fd268; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd268, fd267; +fma.rn.f64 fd278, fd268, fd267, fd277; +mul.f64 fd279, fd276, fd247; +mul.f64 fd280, fd278, fd265; +mul.f64 fd281, fd276, fd265; +ld.global.v2.f64 {fd282, fd283}, [rd16+400]; +mul.f64 fd286, fd282, fd248; +mul.f64 fd287, fd283, fd266; +mul.f64 fd288, fd282, fd266; +mul.f64 fd289, fd267, fd282; +mul.f64 fd290, fd268, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd267, fd283; +fma.rn.f64 fd293, fd268, fd282, fd292; +mul.f64 fd294, fd291, fd240; +mul.f64 fd295, fd293, fd258; +mul.f64 fd296, fd291, fd258; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2000, r22; +add.f64 fd297, fd230, fd229; +add.f64 fd298, fd227, fd226; +st.shared.v2.f64 [r23], {fd298, fd297}; +fma.rn.f64 fd299, fd268, fd239, fd273; +sub.f64 fd300, fd271, fd272; +st.shared.v2.f64 [r23+400], {fd300, fd299}; +fma.rn.f64 fd301, fd278, fd247, fd281; +sub.f64 fd302, fd279, fd280; +st.shared.v2.f64 [r23+800], {fd302, fd301}; +fma.rn.f64 fd303, fd283, fd248, fd288; +sub.f64 fd304, fd286, fd287; +st.shared.v2.f64 [r23+1200], {fd304, fd303}; +fma.rn.f64 fd305, fd293, fd240, fd296; +sub.f64 fd306, fd294, fd295; +st.shared.v2.f64 [r23+1600], {fd306, fd305}; +barrier.sync 0; +ld.shared.v2.f64 {fd307, fd308}, [r11]; +ld.shared.v2.f64 {fd311, fd312}, [r11+10000]; +ld.shared.v2.f64 {fd315, 
fd316}, [r11+20000]; +ld.shared.v2.f64 {fd319, fd320}, [r11+30000]; +ld.shared.v2.f64 {fd323, fd324}, [r11+40000]; +add.f64 fd327, fd311, fd323; +add.f64 fd328, fd307, fd327; +add.f64 fd329, fd315, fd319; +add.f64 fd330, fd312, fd324; +add.f64 fd331, fd308, fd330; +add.f64 fd332, fd316, fd320; +fma.rn.f64 fd333, fd327, 0d3FD3C6EF372FE950, fd307; +mul.f64 fd334, fd329, 0d3FE9E3779B97F4A8; +sub.f64 fd335, fd333, fd334; +sub.f64 fd336, fd312, fd324; +mul.f64 fd337, fd336, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd316, fd320; +mul.f64 fd339, fd338, 0dBFE2CF2304755A5E; +sub.f64 fd340, fd339, fd337; +sub.f64 fd341, fd335, fd340; +add.f64 fd342, fd340, fd335; +mul.f64 fd343, fd327, 0d3FE9E3779B97F4A8; +sub.f64 fd344, fd307, fd343; +fma.rn.f64 fd345, fd329, 0d3FD3C6EF372FE950, fd344; +mul.f64 fd346, fd336, 0d3FE2CF2304755A5E; +mul.f64 fd347, fd338, 0d3FEE6F0E134454FF; +sub.f64 fd348, fd347, fd346; +sub.f64 fd349, fd345, fd348; +add.f64 fd350, fd348, fd345; +fma.rn.f64 fd351, fd330, 0d3FD3C6EF372FE950, fd308; +mul.f64 fd352, fd332, 0d3FE9E3779B97F4A8; +sub.f64 fd353, fd351, fd352; +sub.f64 fd354, fd311, fd323; +mul.f64 fd355, fd354, 0d3FEE6F0E134454FF; +sub.f64 fd356, fd315, fd319; +mul.f64 fd357, fd356, 0dBFE2CF2304755A5E; +sub.f64 fd358, fd357, fd355; +add.f64 fd359, fd358, fd353; +sub.f64 fd360, fd353, fd358; +mul.f64 fd361, fd330, 0d3FE9E3779B97F4A8; +sub.f64 fd362, fd308, fd361; +fma.rn.f64 fd363, fd332, 0d3FD3C6EF372FE950, fd362; +mul.f64 fd364, fd354, 0d3FE2CF2304755A5E; +mul.f64 fd365, fd356, 0d3FEE6F0E134454FF; +sub.f64 fd366, fd365, fd364; +add.f64 fd367, fd366, fd363; +sub.f64 fd368, fd363, fd366; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 16; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd369, fd370}, [rd21]; +mul.f64 fd373, fd369, fd341; +mul.f64 fd374, fd370, fd359; +mul.f64 fd375, fd369, fd359; +mul.f64 fd376, fd369, fd369; 
+mul.f64 fd377, fd370, fd370; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd370, fd369; +fma.rn.f64 fd380, fd370, fd369, fd379; +mul.f64 fd381, fd378, fd349; +mul.f64 fd382, fd380, fd367; +mul.f64 fd383, fd378, fd367; +ld.global.v2.f64 {fd384, fd385}, [rd21+80]; +mul.f64 fd388, fd384, fd350; +mul.f64 fd389, fd385, fd368; +mul.f64 fd390, fd384, fd368; +mul.f64 fd391, fd369, fd384; +mul.f64 fd392, fd370, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd369, fd385; +fma.rn.f64 fd395, fd370, fd384, fd394; +mul.f64 fd396, fd393, fd342; +mul.f64 fd397, fd395, fd360; +mul.f64 fd398, fd393, fd360; +shl.b32 r27, r26, 4; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 10000, r28; +add.f64 fd399, fd332, fd331; +add.f64 fd400, fd329, fd328; +st.shared.v2.f64 [r29], {fd400, fd399}; +fma.rn.f64 fd401, fd370, fd341, fd375; +sub.f64 fd402, fd373, fd374; +st.shared.v2.f64 [r29+2000], {fd402, fd401}; +fma.rn.f64 fd403, fd380, fd349, fd383; +sub.f64 fd404, fd381, fd382; +st.shared.v2.f64 [r29+4000], {fd404, fd403}; +fma.rn.f64 fd405, fd385, fd350, fd390; +sub.f64 fd406, fd388, fd389; +st.shared.v2.f64 [r29+6000], {fd406, fd405}; +fma.rn.f64 fd407, fd395, fd342, fd398; +sub.f64 fd408, fd396, fd397; +st.shared.v2.f64 [r29+8000], {fd408, fd407}; +barrier.sync 0; +ld.shared.v2.f64 {fd409, fd410}, [r11]; +ld.shared.v2.f64 {fd413, fd414}, [r11+10000]; +ld.shared.v2.f64 {fd417, fd418}, [r11+20000]; +ld.shared.v2.f64 {fd421, fd422}, [r11+30000]; +ld.shared.v2.f64 {fd425, fd426}, [r11+40000]; +add.f64 fd429, fd413, fd425; +add.f64 fd430, fd409, fd429; +add.f64 fd431, fd417, fd421; +add.f64 fd432, fd414, fd426; +add.f64 fd433, fd410, fd432; +add.f64 fd434, fd418, fd422; +fma.rn.f64 fd435, fd429, 0d3FD3C6EF372FE950, fd409; +mul.f64 fd436, fd431, 0d3FE9E3779B97F4A8; +sub.f64 fd437, fd435, fd436; +sub.f64 fd438, fd414, fd426; +mul.f64 fd439, fd438, 0d3FEE6F0E134454FF; +sub.f64 fd440, fd418, fd422; +mul.f64 fd441, fd440, 0dBFE2CF2304755A5E; +sub.f64 fd442, fd441, fd439; 
+mul.f64 fd443, fd429, 0d3FE9E3779B97F4A8; +sub.f64 fd444, fd409, fd443; +fma.rn.f64 fd445, fd431, 0d3FD3C6EF372FE950, fd444; +mul.f64 fd446, fd438, 0d3FE2CF2304755A5E; +mul.f64 fd447, fd440, 0d3FEE6F0E134454FF; +sub.f64 fd448, fd447, fd446; +fma.rn.f64 fd449, fd432, 0d3FD3C6EF372FE950, fd410; +mul.f64 fd450, fd434, 0d3FE9E3779B97F4A8; +sub.f64 fd451, fd449, fd450; +sub.f64 fd452, fd413, fd425; +mul.f64 fd453, fd452, 0d3FEE6F0E134454FF; +sub.f64 fd454, fd417, fd421; +mul.f64 fd455, fd454, 0dBFE2CF2304755A5E; +sub.f64 fd456, fd455, fd453; +mul.f64 fd457, fd432, 0d3FE9E3779B97F4A8; +sub.f64 fd458, fd410, fd457; +fma.rn.f64 fd459, fd434, 0d3FD3C6EF372FE950, fd458; +mul.f64 fd460, fd452, 0d3FE2CF2304755A5E; +mul.f64 fd461, fd454, 0d3FEE6F0E134454FF; +sub.f64 fd462, fd461, fd460; +add.f64 %1, fd434, fd433; +add.f64 %0, fd431, fd430; +add.f64 %3, fd456, fd451; +sub.f64 %2, fd437, fd442; +add.f64 %5, fd462, fd459; +sub.f64 %4, fd445, fd448; +sub.f64 %7, fd459, fd462; +add.f64 %6, fd448, fd445; +sub.f64 %9, fd451, fd456; +add.f64 %8, fd442, fd437; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_3125), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..d9cc3ae111926 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3125_fp64_inv.hpp.inc @@ -0,0 +1,948 @@ +#ifndef CUFFTDX_FFT_3125_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_3125_FP64_INV_PTX_HPP + + + +/* Explicit specialization cufftdx_private_function<716, double, 1>: a single inline-PTX block that reads the ten doubles rmem[0..4].x/.y as inputs and overwrites the same ten values as outputs (operands %0-%9 are the outputs, %10 is smem, %11-%14 are the lookup tables lut_dp_5_3125, lut_dp_5_625, lut_dp_5_125, lut_dp_5_25, %15-%26 are the inputs). The PTX performs five 5-input combine steps; between consecutive steps each value is multiplied by complex factors fetched with ld.global from one of the four tables (indexed by tid.x mod 625 and by that value divided by 5, 25 and 125, all computed with multiply-and-shift constants) and then exchanged through st.shared/ld.shared at byte addresses derived from smem, tid.y and tid.x (a 25000-byte region per tid.y and per group of 625 tid.x values). The .x and .y components are exchanged in two separate passes over the same region, and every pass is fenced by an unconditional barrier.sync 0, so callers must reach this function from all threads that take part in barrier 0. Input operands %18 and %21 (the repeated rmem[1].y and rmem[2].y entries) are never referenced by the PTX; they only keep the operand numbering in place. NOTE(review): the file name and the literals 0d3FD3C6EF372FE950, 0d3FE9E3779B97F4A8, 0d3FEE6F0E134454FF, 0d3FE2CF2304755A5E (approximately 0.309, 0.809, 0.951, 0.588, i.e. what look like cos/sin of 2pi/5 and 4pi/5) suggest five radix-5 stages of a 3125-point inverse double-precision FFT with 5 elements per thread - confirm against the cuFFTDx generator. Machine-generated code: register numbering and statement order are significant, do not edit by hand. */template<> __forceinline__ __device__ void cufftdx_private_function<716, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<30>; +.reg .f64 fd<423>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 25000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %17, %25; +add.f64 fd22, %15, fd21; +add.f64 fd23, %20, %23; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %19, %26; +add.f64 fd26, %16, fd25; +add.f64 fd27, %22, %24; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %15; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %19, %26; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %22, %24; +fma.rn.f64 fd35, fd34, 0d3FE2CF2304755A5E, fd33; +sub.f64 fd36, fd31, fd35; +add.f64 fd37, fd35, fd31; +mul.f64 fd38, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd39, %15, fd38; +fma.rn.f64 fd40, fd23, 0d3FD3C6EF372FE950, fd39; +mul.f64 fd41, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd42, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, fd40, fd43; +add.f64 fd45, fd43, fd40; +fma.rn.f64 fd46, fd25, 0d3FD3C6EF372FE950, %16; +mul.f64 fd47, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd48, fd46, fd47; +sub.f64 fd49, %17, %25; +mul.f64 fd50, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd51, %20, %23; +fma.rn.f64 fd52, fd51, 0d3FE2CF2304755A5E, fd50; +add.f64 fd53, fd52, fd48; +sub.f64 fd54, fd48, fd52; +mul.f64 fd55, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %16, fd55; +fma.rn.f64 fd57, fd27, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd49, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd51, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd58, fd59; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; 
+cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd53, fd64; +fma.rn.f64 fd68, fd63, fd36, fd67; +mul.f64 fd69, fd36, fd64; +mul.f64 fd70, fd63, fd53; +sub.f64 fd71, fd70, fd69; +mul.f64 fd72, fd63, fd63; +mul.f64 fd73, fd64, fd64; +sub.f64 fd74, fd72, fd73; +mul.f64 fd75, fd64, fd63; +fma.rn.f64 fd76, fd64, fd63, fd75; +mul.f64 fd77, fd61, fd76; +fma.rn.f64 fd78, fd74, fd44, fd77; +mul.f64 fd79, fd44, fd76; +mul.f64 fd80, fd74, fd61; +sub.f64 fd81, fd80, fd79; +ld.global.v2.f64 {fd82, fd83}, [rd6+10000]; +mul.f64 fd86, fd62, fd83; +fma.rn.f64 fd87, fd82, fd45, fd86; +mul.f64 fd88, fd45, fd83; +mul.f64 fd89, fd82, fd62; +sub.f64 fd90, fd89, fd88; +mul.f64 fd91, fd63, fd82; +mul.f64 fd92, fd64, fd83; +sub.f64 fd93, fd91, fd92; +mul.f64 fd94, fd63, fd83; +fma.rn.f64 fd95, fd64, fd82, fd94; +mul.f64 fd96, fd54, fd95; +fma.rn.f64 fd97, fd93, fd37, fd96; +mul.f64 fd98, fd37, fd95; +mul.f64 fd99, fd93, fd54; +sub.f64 fd100, fd99, fd98; +mad.lo.s32 r8, r5, 25000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd68; +st.shared.f64 [r9+16], fd78; +st.shared.f64 [r9+24], fd87; +st.shared.f64 [r9+32], fd97; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd101, [r11]; +ld.shared.f64 fd102, [r11+5000]; +ld.shared.f64 fd103, [r11+10000]; +ld.shared.f64 fd104, [r11+15000]; +ld.shared.f64 fd105, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +ld.shared.f64 fd106, [r11]; +ld.shared.f64 fd107, [r11+5000]; +ld.shared.f64 fd108, [r11+10000]; +ld.shared.f64 fd109, [r11+15000]; +ld.shared.f64 fd110, [r11+20000]; +add.f64 fd111, fd102, fd105; +add.f64 fd112, fd101, fd111; +add.f64 fd113, fd103, fd104; +add.f64 fd114, fd113, 
fd112; +add.f64 fd115, fd107, fd110; +add.f64 fd116, fd106, fd115; +add.f64 fd117, fd108, fd109; +add.f64 fd118, fd117, fd116; +fma.rn.f64 fd119, fd111, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd120, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd121, fd119, fd120; +sub.f64 fd122, fd107, fd110; +mul.f64 fd123, fd122, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd108, fd109; +fma.rn.f64 fd125, fd124, 0d3FE2CF2304755A5E, fd123; +sub.f64 fd126, fd121, fd125; +add.f64 fd127, fd125, fd121; +mul.f64 fd128, fd111, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd101, fd128; +fma.rn.f64 fd130, fd113, 0d3FD3C6EF372FE950, fd129; +mul.f64 fd131, fd122, 0d3FE2CF2304755A5E; +mul.f64 fd132, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd133, fd131, fd132; +sub.f64 fd134, fd130, fd133; +add.f64 fd135, fd133, fd130; +fma.rn.f64 fd136, fd115, 0d3FD3C6EF372FE950, fd106; +mul.f64 fd137, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd138, fd136, fd137; +sub.f64 fd139, fd102, fd105; +mul.f64 fd140, fd139, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd103, fd104; +fma.rn.f64 fd142, fd141, 0d3FE2CF2304755A5E, fd140; +add.f64 fd143, fd142, fd138; +sub.f64 fd144, fd138, fd142; +mul.f64 fd145, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd106, fd145; +fma.rn.f64 fd147, fd117, 0d3FD3C6EF372FE950, fd146; +mul.f64 fd148, fd139, 0d3FE2CF2304755A5E; +mul.f64 fd149, fd141, 0d3FEE6F0E134454FF; +sub.f64 fd150, fd148, fd149; +add.f64 fd151, fd150, fd147; +sub.f64 fd152, fd147, fd150; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd153, fd154}, [rd11]; +mul.f64 fd157, fd143, fd154; +fma.rn.f64 fd158, fd153, fd126, fd157; +mul.f64 fd159, fd126, fd154; +mul.f64 fd160, fd153, fd143; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, 
fd151, fd166; +fma.rn.f64 fd168, fd164, fd134, fd167; +mul.f64 fd169, fd134, fd166; +mul.f64 fd170, fd164, fd151; +sub.f64 fd171, fd170, fd169; +ld.global.v2.f64 {fd172, fd173}, [rd11+2000]; +mul.f64 fd176, fd152, fd173; +fma.rn.f64 fd177, fd172, fd135, fd176; +mul.f64 fd178, fd135, fd173; +mul.f64 fd179, fd172, fd152; +sub.f64 fd180, fd179, fd178; +mul.f64 fd181, fd153, fd172; +mul.f64 fd182, fd154, fd173; +sub.f64 fd183, fd181, fd182; +mul.f64 fd184, fd153, fd173; +fma.rn.f64 fd185, fd154, fd172, fd184; +mul.f64 fd186, fd144, fd185; +fma.rn.f64 fd187, fd183, fd127, fd186; +mul.f64 fd188, fd127, fd185; +mul.f64 fd189, fd183, fd144; +sub.f64 fd190, fd189, fd188; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd114; +st.shared.f64 [r17+40], fd158; +st.shared.f64 [r17+80], fd168; +st.shared.f64 [r17+120], fd177; +st.shared.f64 [r17+160], fd187; +barrier.sync 0; +ld.shared.f64 fd191, [r11]; +ld.shared.f64 fd192, [r11+5000]; +ld.shared.f64 fd193, [r11+10000]; +ld.shared.f64 fd194, [r11+15000]; +ld.shared.f64 fd195, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r17], fd118; +st.shared.f64 [r17+40], fd161; +st.shared.f64 [r17+80], fd171; +st.shared.f64 [r17+120], fd180; +st.shared.f64 [r17+160], fd190; +barrier.sync 0; +ld.shared.f64 fd196, [r11]; +ld.shared.f64 fd197, [r11+5000]; +ld.shared.f64 fd198, [r11+10000]; +ld.shared.f64 fd199, [r11+15000]; +ld.shared.f64 fd200, [r11+20000]; +add.f64 fd201, fd192, fd195; +add.f64 fd202, fd191, fd201; +add.f64 fd203, fd193, fd194; +add.f64 fd204, fd203, fd202; +add.f64 fd205, fd197, fd200; +add.f64 fd206, fd196, fd205; +add.f64 fd207, fd198, fd199; +add.f64 fd208, fd207, fd206; +fma.rn.f64 fd209, fd201, 0d3FD3C6EF372FE950, fd191; +mul.f64 fd210, fd203, 0d3FE9E3779B97F4A8; +sub.f64 fd211, fd209, fd210; +sub.f64 fd212, fd197, fd200; +mul.f64 fd213, fd212, 0d3FEE6F0E134454FF; +sub.f64 fd214, fd198, fd199; +fma.rn.f64 fd215, fd214, 0d3FE2CF2304755A5E, fd213; 
+sub.f64 fd216, fd211, fd215; +add.f64 fd217, fd215, fd211; +mul.f64 fd218, fd201, 0d3FE9E3779B97F4A8; +sub.f64 fd219, fd191, fd218; +fma.rn.f64 fd220, fd203, 0d3FD3C6EF372FE950, fd219; +mul.f64 fd221, fd212, 0d3FE2CF2304755A5E; +mul.f64 fd222, fd214, 0d3FEE6F0E134454FF; +sub.f64 fd223, fd221, fd222; +sub.f64 fd224, fd220, fd223; +add.f64 fd225, fd223, fd220; +fma.rn.f64 fd226, fd205, 0d3FD3C6EF372FE950, fd196; +mul.f64 fd227, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd228, fd226, fd227; +sub.f64 fd229, fd192, fd195; +mul.f64 fd230, fd229, 0d3FEE6F0E134454FF; +sub.f64 fd231, fd193, fd194; +fma.rn.f64 fd232, fd231, 0d3FE2CF2304755A5E, fd230; +add.f64 fd233, fd232, fd228; +sub.f64 fd234, fd228, fd232; +mul.f64 fd235, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd236, fd196, fd235; +fma.rn.f64 fd237, fd207, 0d3FD3C6EF372FE950, fd236; +mul.f64 fd238, fd229, 0d3FE2CF2304755A5E; +mul.f64 fd239, fd231, 0d3FEE6F0E134454FF; +sub.f64 fd240, fd238, fd239; +add.f64 fd241, fd240, fd237; +sub.f64 fd242, fd237, fd240; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd243, fd244}, [rd16]; +mul.f64 fd247, fd233, fd244; +fma.rn.f64 fd248, fd243, fd216, fd247; +mul.f64 fd249, fd216, fd244; +mul.f64 fd250, fd243, fd233; +sub.f64 fd251, fd250, fd249; +mul.f64 fd252, fd243, fd243; +mul.f64 fd253, fd244, fd244; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd244, fd243; +fma.rn.f64 fd256, fd244, fd243, fd255; +mul.f64 fd257, fd241, fd256; +fma.rn.f64 fd258, fd254, fd224, fd257; +mul.f64 fd259, fd224, fd256; +mul.f64 fd260, fd254, fd241; +sub.f64 fd261, fd260, fd259; +ld.global.v2.f64 {fd262, fd263}, [rd16+400]; +mul.f64 fd266, fd242, fd263; +fma.rn.f64 fd267, fd262, fd225, fd266; +mul.f64 fd268, fd225, fd263; +mul.f64 fd269, fd262, fd242; +sub.f64 fd270, fd269, fd268; +mul.f64 fd271, fd243, fd262; +mul.f64 fd272, fd244, 
fd263; +sub.f64 fd273, fd271, fd272; +mul.f64 fd274, fd243, fd263; +fma.rn.f64 fd275, fd244, fd262, fd274; +mul.f64 fd276, fd234, fd275; +fma.rn.f64 fd277, fd273, fd217, fd276; +mul.f64 fd278, fd217, fd275; +mul.f64 fd279, fd273, fd234; +sub.f64 fd280, fd279, fd278; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +st.shared.f64 [r23], fd204; +st.shared.f64 [r23+200], fd248; +st.shared.f64 [r23+400], fd258; +st.shared.f64 [r23+600], fd267; +st.shared.f64 [r23+800], fd277; +barrier.sync 0; +ld.shared.f64 fd281, [r11]; +ld.shared.f64 fd282, [r11+5000]; +ld.shared.f64 fd283, [r11+10000]; +ld.shared.f64 fd284, [r11+15000]; +ld.shared.f64 fd285, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r23], fd208; +st.shared.f64 [r23+200], fd251; +st.shared.f64 [r23+400], fd261; +st.shared.f64 [r23+600], fd270; +st.shared.f64 [r23+800], fd280; +barrier.sync 0; +ld.shared.f64 fd286, [r11]; +ld.shared.f64 fd287, [r11+5000]; +ld.shared.f64 fd288, [r11+10000]; +ld.shared.f64 fd289, [r11+15000]; +ld.shared.f64 fd290, [r11+20000]; +add.f64 fd291, fd282, fd285; +add.f64 fd292, fd281, fd291; +add.f64 fd293, fd283, fd284; +add.f64 fd294, fd293, fd292; +add.f64 fd295, fd287, fd290; +add.f64 fd296, fd286, fd295; +add.f64 fd297, fd288, fd289; +add.f64 fd298, fd297, fd296; +fma.rn.f64 fd299, fd291, 0d3FD3C6EF372FE950, fd281; +mul.f64 fd300, fd293, 0d3FE9E3779B97F4A8; +sub.f64 fd301, fd299, fd300; +sub.f64 fd302, fd287, fd290; +mul.f64 fd303, fd302, 0d3FEE6F0E134454FF; +sub.f64 fd304, fd288, fd289; +fma.rn.f64 fd305, fd304, 0d3FE2CF2304755A5E, fd303; +sub.f64 fd306, fd301, fd305; +add.f64 fd307, fd305, fd301; +mul.f64 fd308, fd291, 0d3FE9E3779B97F4A8; +sub.f64 fd309, fd281, fd308; +fma.rn.f64 fd310, fd293, 0d3FD3C6EF372FE950, fd309; +mul.f64 fd311, fd302, 0d3FE2CF2304755A5E; +mul.f64 fd312, fd304, 0d3FEE6F0E134454FF; +sub.f64 fd313, fd311, fd312; +sub.f64 fd314, fd310, fd313; +add.f64 fd315, fd313, fd310; +fma.rn.f64 fd316, fd295, 
0d3FD3C6EF372FE950, fd286; +mul.f64 fd317, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd318, fd316, fd317; +sub.f64 fd319, fd282, fd285; +mul.f64 fd320, fd319, 0d3FEE6F0E134454FF; +sub.f64 fd321, fd283, fd284; +fma.rn.f64 fd322, fd321, 0d3FE2CF2304755A5E, fd320; +add.f64 fd323, fd322, fd318; +sub.f64 fd324, fd318, fd322; +mul.f64 fd325, fd295, 0d3FE9E3779B97F4A8; +sub.f64 fd326, fd286, fd325; +fma.rn.f64 fd327, fd297, 0d3FD3C6EF372FE950, fd326; +mul.f64 fd328, fd319, 0d3FE2CF2304755A5E; +mul.f64 fd329, fd321, 0d3FEE6F0E134454FF; +sub.f64 fd330, fd328, fd329; +add.f64 fd331, fd330, fd327; +sub.f64 fd332, fd327, fd330; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 16; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd333, fd334}, [rd21]; +mul.f64 fd337, fd323, fd334; +fma.rn.f64 fd338, fd333, fd306, fd337; +mul.f64 fd339, fd306, fd334; +mul.f64 fd340, fd333, fd323; +sub.f64 fd341, fd340, fd339; +mul.f64 fd342, fd333, fd333; +mul.f64 fd343, fd334, fd334; +sub.f64 fd344, fd342, fd343; +mul.f64 fd345, fd334, fd333; +fma.rn.f64 fd346, fd334, fd333, fd345; +mul.f64 fd347, fd331, fd346; +fma.rn.f64 fd348, fd344, fd314, fd347; +mul.f64 fd349, fd314, fd346; +mul.f64 fd350, fd344, fd331; +sub.f64 fd351, fd350, fd349; +ld.global.v2.f64 {fd352, fd353}, [rd21+80]; +mul.f64 fd356, fd332, fd353; +fma.rn.f64 fd357, fd352, fd315, fd356; +mul.f64 fd358, fd315, fd353; +mul.f64 fd359, fd352, fd332; +sub.f64 fd360, fd359, fd358; +mul.f64 fd361, fd333, fd352; +mul.f64 fd362, fd334, fd353; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd333, fd353; +fma.rn.f64 fd365, fd334, fd352, fd364; +mul.f64 fd366, fd324, fd365; +fma.rn.f64 fd367, fd363, fd307, fd366; +mul.f64 fd368, fd307, fd365; +mul.f64 fd369, fd363, fd324; +sub.f64 fd370, fd369, fd368; +shl.b32 r27, r26, 3; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 5000, r28; +st.shared.f64 [r29], fd294; 
+st.shared.f64 [r29+1000], fd338; +st.shared.f64 [r29+2000], fd348; +st.shared.f64 [r29+3000], fd357; +st.shared.f64 [r29+4000], fd367; +barrier.sync 0; +ld.shared.f64 fd371, [r11]; +ld.shared.f64 fd372, [r11+5000]; +ld.shared.f64 fd373, [r11+10000]; +ld.shared.f64 fd374, [r11+15000]; +ld.shared.f64 fd375, [r11+20000]; +barrier.sync 0; +st.shared.f64 [r29], fd298; +st.shared.f64 [r29+1000], fd341; +st.shared.f64 [r29+2000], fd351; +st.shared.f64 [r29+3000], fd360; +st.shared.f64 [r29+4000], fd370; +barrier.sync 0; +ld.shared.f64 fd376, [r11]; +ld.shared.f64 fd377, [r11+5000]; +ld.shared.f64 fd378, [r11+10000]; +ld.shared.f64 fd379, [r11+15000]; +ld.shared.f64 fd380, [r11+20000]; +add.f64 fd381, fd372, fd375; +add.f64 fd382, fd371, fd381; +add.f64 fd383, fd373, fd374; +add.f64 fd384, fd377, fd380; +add.f64 fd385, fd376, fd384; +add.f64 fd386, fd378, fd379; +fma.rn.f64 fd387, fd381, 0d3FD3C6EF372FE950, fd371; +mul.f64 fd388, fd383, 0d3FE9E3779B97F4A8; +sub.f64 fd389, fd387, fd388; +sub.f64 fd390, fd377, fd380; +mul.f64 fd391, fd390, 0d3FEE6F0E134454FF; +sub.f64 fd392, fd378, fd379; +fma.rn.f64 fd393, fd392, 0d3FE2CF2304755A5E, fd391; +mul.f64 fd394, fd381, 0d3FE9E3779B97F4A8; +sub.f64 fd395, fd371, fd394; +fma.rn.f64 fd396, fd383, 0d3FD3C6EF372FE950, fd395; +mul.f64 fd397, fd390, 0d3FE2CF2304755A5E; +mul.f64 fd398, fd392, 0d3FEE6F0E134454FF; +sub.f64 fd399, fd397, fd398; +fma.rn.f64 fd400, fd384, 0d3FD3C6EF372FE950, fd376; +mul.f64 fd401, fd386, 0d3FE9E3779B97F4A8; +sub.f64 fd402, fd400, fd401; +sub.f64 fd403, fd372, fd375; +mul.f64 fd404, fd403, 0d3FEE6F0E134454FF; +sub.f64 fd405, fd373, fd374; +fma.rn.f64 fd406, fd405, 0d3FE2CF2304755A5E, fd404; +mul.f64 fd407, fd384, 0d3FE9E3779B97F4A8; +sub.f64 fd408, fd376, fd407; +fma.rn.f64 fd409, fd386, 0d3FD3C6EF372FE950, fd408; +mul.f64 fd410, fd403, 0d3FE2CF2304755A5E; +mul.f64 fd411, fd405, 0d3FEE6F0E134454FF; +sub.f64 fd412, fd410, fd411; +add.f64 %0, fd383, fd382; +add.f64 %1, fd386, fd385; +add.f64 %3, fd406, fd402; 
+sub.f64 %2, fd389, fd393; +sub.f64 %4, fd396, fd399; +add.f64 %5, fd412, fd409; +add.f64 %6, fd399, fd396; +sub.f64 %7, fd409, fd412; +sub.f64 %9, fd402, fd406; +add.f64 %8, fd393, fd389; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_3125), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<715, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<30>; +.reg .f64 fd<463>; +.reg .b64 rd<22>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 50000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %17, %25; +add.f64 fd22, %15, fd21; +add.f64 fd23, %20, %23; +add.f64 fd24, %19, %26; +add.f64 fd25, %16, fd24; +add.f64 fd26, %22, %24; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %15; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %19, %26; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %22, %24; +fma.rn.f64 fd33, fd32, 0d3FE2CF2304755A5E, fd31; +sub.f64 fd34, fd29, fd33; +add.f64 fd35, fd33, fd29; +mul.f64 fd36, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd37, %15, fd36; +fma.rn.f64 fd38, fd23, 0d3FD3C6EF372FE950, fd37; +mul.f64 fd39, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd40, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd41, fd39, fd40; +sub.f64 fd42, fd38, fd41; +add.f64 fd43, fd41, fd38; +fma.rn.f64 fd44, fd24, 0d3FD3C6EF372FE950, %16; +mul.f64 fd45, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd46, fd44, fd45; +sub.f64 fd47, %17, %25; +mul.f64 fd48, fd47, 0d3FEE6F0E134454FF; +sub.f64 fd49, %20, %23; +fma.rn.f64 fd50, fd49, 0d3FE2CF2304755A5E, fd48; +add.f64 fd51, fd50, fd46; 
+sub.f64 fd52, fd46, fd50; +mul.f64 fd53, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd54, %16, fd53; +fma.rn.f64 fd55, fd26, 0d3FD3C6EF372FE950, fd54; +mul.f64 fd56, fd47, 0d3FE2CF2304755A5E; +mul.f64 fd57, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd58, fd56, fd57; +add.f64 fd59, fd58, fd55; +sub.f64 fd60, fd55, fd58; +mul.wide.u32 rd2, r4, -776530087; +shr.u64 rd3, rd2, 41; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 625; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 50000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd61, fd62}, [rd6]; +mul.f64 fd65, fd51, fd62; +mul.f64 fd66, fd34, fd62; +mul.f64 fd67, fd61, fd51; +mul.f64 fd68, fd61, fd61; +mul.f64 fd69, fd62, fd62; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd62, fd61; +fma.rn.f64 fd72, fd62, fd61, fd71; +mul.f64 fd73, fd59, fd72; +mul.f64 fd74, fd42, fd72; +mul.f64 fd75, fd70, fd59; +ld.global.v2.f64 {fd76, fd77}, [rd6+10000]; +mul.f64 fd80, fd60, fd77; +mul.f64 fd81, fd43, fd77; +mul.f64 fd82, fd76, fd60; +mul.f64 fd83, fd61, fd76; +mul.f64 fd84, fd62, fd77; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd61, fd77; +fma.rn.f64 fd87, fd62, fd76, fd86; +mul.f64 fd88, fd52, fd87; +mul.f64 fd89, fd35, fd87; +mul.f64 fd90, fd85, fd52; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd91, fd26, fd25; +add.f64 fd92, fd23, fd22; +st.shared.v2.f64 [r9], {fd92, fd91}; +fma.rn.f64 fd93, fd61, fd34, fd65; +sub.f64 fd94, fd67, fd66; +st.shared.v2.f64 [r9+16], {fd93, fd94}; +fma.rn.f64 fd95, fd70, fd42, fd73; +sub.f64 fd96, fd75, fd74; +st.shared.v2.f64 [r9+32], {fd95, fd96}; +fma.rn.f64 fd97, fd76, fd43, fd80; +sub.f64 fd98, fd82, fd81; +st.shared.v2.f64 [r9+48], {fd97, fd98}; +fma.rn.f64 fd99, fd85, fd35, fd88; +sub.f64 fd100, fd90, fd89; +st.shared.v2.f64 [r9+64], {fd99, fd100}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd101, fd102}, [r11]; +ld.shared.v2.f64 {fd105, fd106}, [r11+10000]; +ld.shared.v2.f64 {fd109, fd110}, [r11+20000]; +ld.shared.v2.f64 
{fd113, fd114}, [r11+30000]; +ld.shared.v2.f64 {fd117, fd118}, [r11+40000]; +add.f64 fd121, fd105, fd117; +add.f64 fd122, fd101, fd121; +add.f64 fd123, fd109, fd113; +add.f64 fd124, fd106, fd118; +add.f64 fd125, fd102, fd124; +add.f64 fd126, fd110, fd114; +fma.rn.f64 fd127, fd121, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd128, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, fd106, fd118; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd110, fd114; +fma.rn.f64 fd133, fd132, 0d3FE2CF2304755A5E, fd131; +sub.f64 fd134, fd129, fd133; +add.f64 fd135, fd133, fd129; +mul.f64 fd136, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd137, fd101, fd136; +fma.rn.f64 fd138, fd123, 0d3FD3C6EF372FE950, fd137; +mul.f64 fd139, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd140, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +sub.f64 fd142, fd138, fd141; +add.f64 fd143, fd141, fd138; +fma.rn.f64 fd144, fd124, 0d3FD3C6EF372FE950, fd102; +mul.f64 fd145, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd144, fd145; +sub.f64 fd147, fd105, fd117; +mul.f64 fd148, fd147, 0d3FEE6F0E134454FF; +sub.f64 fd149, fd109, fd113; +fma.rn.f64 fd150, fd149, 0d3FE2CF2304755A5E, fd148; +add.f64 fd151, fd150, fd146; +sub.f64 fd152, fd146, fd150; +mul.f64 fd153, fd124, 0d3FE9E3779B97F4A8; +sub.f64 fd154, fd102, fd153; +fma.rn.f64 fd155, fd126, 0d3FD3C6EF372FE950, fd154; +mul.f64 fd156, fd147, 0d3FE2CF2304755A5E; +mul.f64 fd157, fd149, 0d3FEE6F0E134454FF; +sub.f64 fd158, fd156, fd157; +add.f64 fd159, fd158, fd155; +sub.f64 fd160, fd155, fd158; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd161, fd162}, [rd11]; +mul.f64 fd165, fd151, fd162; +mul.f64 fd166, fd134, fd162; +mul.f64 fd167, fd161, fd151; +mul.f64 fd168, fd161, fd161; +mul.f64 fd169, fd162, fd162; +sub.f64 fd170, fd168, fd169; +mul.f64 fd171, fd162, fd161; 
+fma.rn.f64 fd172, fd162, fd161, fd171; +mul.f64 fd173, fd159, fd172; +mul.f64 fd174, fd142, fd172; +mul.f64 fd175, fd170, fd159; +ld.global.v2.f64 {fd176, fd177}, [rd11+2000]; +mul.f64 fd180, fd160, fd177; +mul.f64 fd181, fd143, fd177; +mul.f64 fd182, fd176, fd160; +mul.f64 fd183, fd161, fd176; +mul.f64 fd184, fd162, fd177; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd161, fd177; +fma.rn.f64 fd187, fd162, fd176, fd186; +mul.f64 fd188, fd152, fd187; +mul.f64 fd189, fd135, fd187; +mul.f64 fd190, fd185, fd152; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd191, fd126, fd125; +add.f64 fd192, fd123, fd122; +st.shared.v2.f64 [r17], {fd192, fd191}; +fma.rn.f64 fd193, fd161, fd134, fd165; +sub.f64 fd194, fd167, fd166; +st.shared.v2.f64 [r17+80], {fd193, fd194}; +fma.rn.f64 fd195, fd170, fd142, fd173; +sub.f64 fd196, fd175, fd174; +st.shared.v2.f64 [r17+160], {fd195, fd196}; +fma.rn.f64 fd197, fd176, fd143, fd180; +sub.f64 fd198, fd182, fd181; +st.shared.v2.f64 [r17+240], {fd197, fd198}; +fma.rn.f64 fd199, fd185, fd135, fd188; +sub.f64 fd200, fd190, fd189; +st.shared.v2.f64 [r17+320], {fd199, fd200}; +barrier.sync 0; +ld.shared.v2.f64 {fd201, fd202}, [r11]; +ld.shared.v2.f64 {fd205, fd206}, [r11+10000]; +ld.shared.v2.f64 {fd209, fd210}, [r11+20000]; +ld.shared.v2.f64 {fd213, fd214}, [r11+30000]; +ld.shared.v2.f64 {fd217, fd218}, [r11+40000]; +add.f64 fd221, fd205, fd217; +add.f64 fd222, fd201, fd221; +add.f64 fd223, fd209, fd213; +add.f64 fd224, fd206, fd218; +add.f64 fd225, fd202, fd224; +add.f64 fd226, fd210, fd214; +fma.rn.f64 fd227, fd221, 0d3FD3C6EF372FE950, fd201; +mul.f64 fd228, fd223, 0d3FE9E3779B97F4A8; +sub.f64 fd229, fd227, fd228; +sub.f64 fd230, fd206, fd218; +mul.f64 fd231, fd230, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd210, fd214; +fma.rn.f64 fd233, fd232, 0d3FE2CF2304755A5E, fd231; +sub.f64 fd234, fd229, fd233; +add.f64 fd235, fd233, fd229; +mul.f64 fd236, fd221, 0d3FE9E3779B97F4A8; +sub.f64 fd237, 
fd201, fd236; +fma.rn.f64 fd238, fd223, 0d3FD3C6EF372FE950, fd237; +mul.f64 fd239, fd230, 0d3FE2CF2304755A5E; +mul.f64 fd240, fd232, 0d3FEE6F0E134454FF; +sub.f64 fd241, fd239, fd240; +sub.f64 fd242, fd238, fd241; +add.f64 fd243, fd241, fd238; +fma.rn.f64 fd244, fd224, 0d3FD3C6EF372FE950, fd202; +mul.f64 fd245, fd226, 0d3FE9E3779B97F4A8; +sub.f64 fd246, fd244, fd245; +sub.f64 fd247, fd205, fd217; +mul.f64 fd248, fd247, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd209, fd213; +fma.rn.f64 fd250, fd249, 0d3FE2CF2304755A5E, fd248; +add.f64 fd251, fd250, fd246; +sub.f64 fd252, fd246, fd250; +mul.f64 fd253, fd224, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd202, fd253; +fma.rn.f64 fd255, fd226, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd256, fd247, 0d3FE2CF2304755A5E; +mul.f64 fd257, fd249, 0d3FEE6F0E134454FF; +sub.f64 fd258, fd256, fd257; +add.f64 fd259, fd258, fd255; +sub.f64 fd260, fd255, fd258; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd261, fd262}, [rd16]; +mul.f64 fd265, fd251, fd262; +mul.f64 fd266, fd234, fd262; +mul.f64 fd267, fd261, fd251; +mul.f64 fd268, fd261, fd261; +mul.f64 fd269, fd262, fd262; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd262, fd261; +fma.rn.f64 fd272, fd262, fd261, fd271; +mul.f64 fd273, fd259, fd272; +mul.f64 fd274, fd242, fd272; +mul.f64 fd275, fd270, fd259; +ld.global.v2.f64 {fd276, fd277}, [rd16+400]; +mul.f64 fd280, fd260, fd277; +mul.f64 fd281, fd243, fd277; +mul.f64 fd282, fd276, fd260; +mul.f64 fd283, fd261, fd276; +mul.f64 fd284, fd262, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd261, fd277; +fma.rn.f64 fd287, fd262, fd276, fd286; +mul.f64 fd288, fd252, fd287; +mul.f64 fd289, fd235, fd287; +mul.f64 fd290, fd285, fd252; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2000, r22; +add.f64 fd291, fd226, fd225; +add.f64 
fd292, fd223, fd222; +st.shared.v2.f64 [r23], {fd292, fd291}; +fma.rn.f64 fd293, fd261, fd234, fd265; +sub.f64 fd294, fd267, fd266; +st.shared.v2.f64 [r23+400], {fd293, fd294}; +fma.rn.f64 fd295, fd270, fd242, fd273; +sub.f64 fd296, fd275, fd274; +st.shared.v2.f64 [r23+800], {fd295, fd296}; +fma.rn.f64 fd297, fd276, fd243, fd280; +sub.f64 fd298, fd282, fd281; +st.shared.v2.f64 [r23+1200], {fd297, fd298}; +fma.rn.f64 fd299, fd285, fd235, fd288; +sub.f64 fd300, fd290, fd289; +st.shared.v2.f64 [r23+1600], {fd299, fd300}; +barrier.sync 0; +ld.shared.v2.f64 {fd301, fd302}, [r11]; +ld.shared.v2.f64 {fd305, fd306}, [r11+10000]; +ld.shared.v2.f64 {fd309, fd310}, [r11+20000]; +ld.shared.v2.f64 {fd313, fd314}, [r11+30000]; +ld.shared.v2.f64 {fd317, fd318}, [r11+40000]; +add.f64 fd321, fd305, fd317; +add.f64 fd322, fd301, fd321; +add.f64 fd323, fd309, fd313; +add.f64 fd324, fd306, fd318; +add.f64 fd325, fd302, fd324; +add.f64 fd326, fd310, fd314; +fma.rn.f64 fd327, fd321, 0d3FD3C6EF372FE950, fd301; +mul.f64 fd328, fd323, 0d3FE9E3779B97F4A8; +sub.f64 fd329, fd327, fd328; +sub.f64 fd330, fd306, fd318; +mul.f64 fd331, fd330, 0d3FEE6F0E134454FF; +sub.f64 fd332, fd310, fd314; +fma.rn.f64 fd333, fd332, 0d3FE2CF2304755A5E, fd331; +sub.f64 fd334, fd329, fd333; +add.f64 fd335, fd333, fd329; +mul.f64 fd336, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd337, fd301, fd336; +fma.rn.f64 fd338, fd323, 0d3FD3C6EF372FE950, fd337; +mul.f64 fd339, fd330, 0d3FE2CF2304755A5E; +mul.f64 fd340, fd332, 0d3FEE6F0E134454FF; +sub.f64 fd341, fd339, fd340; +sub.f64 fd342, fd338, fd341; +add.f64 fd343, fd341, fd338; +fma.rn.f64 fd344, fd324, 0d3FD3C6EF372FE950, fd302; +mul.f64 fd345, fd326, 0d3FE9E3779B97F4A8; +sub.f64 fd346, fd344, fd345; +sub.f64 fd347, fd305, fd317; +mul.f64 fd348, fd347, 0d3FEE6F0E134454FF; +sub.f64 fd349, fd309, fd313; +fma.rn.f64 fd350, fd349, 0d3FE2CF2304755A5E, fd348; +add.f64 fd351, fd350, fd346; +sub.f64 fd352, fd346, fd350; +mul.f64 fd353, fd324, 0d3FE9E3779B97F4A8; +sub.f64 fd354, 
fd302, fd353; +fma.rn.f64 fd355, fd326, 0d3FD3C6EF372FE950, fd354; +mul.f64 fd356, fd347, 0d3FE2CF2304755A5E; +mul.f64 fd357, fd349, 0d3FEE6F0E134454FF; +sub.f64 fd358, fd356, fd357; +add.f64 fd359, fd358, fd355; +sub.f64 fd360, fd355, fd358; +mul.wide.u32 rd17, r7, 274877907; +shr.u64 rd18, rd17, 35; +cvt.u32.u64 r24, rd18; +mul.lo.s32 r25, r24, 125; +sub.s32 r26, r7, r25; +mul.wide.u32 rd19, r24, 16; +mov.u64 rd20, %14; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd361, fd362}, [rd21]; +mul.f64 fd365, fd351, fd362; +mul.f64 fd366, fd334, fd362; +mul.f64 fd367, fd361, fd351; +mul.f64 fd368, fd361, fd361; +mul.f64 fd369, fd362, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd362, fd361; +fma.rn.f64 fd372, fd362, fd361, fd371; +mul.f64 fd373, fd359, fd372; +mul.f64 fd374, fd342, fd372; +mul.f64 fd375, fd370, fd359; +ld.global.v2.f64 {fd376, fd377}, [rd21+80]; +mul.f64 fd380, fd360, fd377; +mul.f64 fd381, fd343, fd377; +mul.f64 fd382, fd376, fd360; +mul.f64 fd383, fd361, fd376; +mul.f64 fd384, fd362, fd377; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd361, fd377; +fma.rn.f64 fd387, fd362, fd376, fd386; +mul.f64 fd388, fd352, fd387; +mul.f64 fd389, fd335, fd387; +mul.f64 fd390, fd385, fd352; +shl.b32 r27, r26, 4; +add.s32 r28, r8, r27; +barrier.sync 0; +mad.lo.s32 r29, r24, 10000, r28; +add.f64 fd391, fd326, fd325; +add.f64 fd392, fd323, fd322; +st.shared.v2.f64 [r29], {fd392, fd391}; +fma.rn.f64 fd393, fd361, fd334, fd365; +sub.f64 fd394, fd367, fd366; +st.shared.v2.f64 [r29+2000], {fd393, fd394}; +fma.rn.f64 fd395, fd370, fd342, fd373; +sub.f64 fd396, fd375, fd374; +st.shared.v2.f64 [r29+4000], {fd395, fd396}; +fma.rn.f64 fd397, fd376, fd343, fd380; +sub.f64 fd398, fd382, fd381; +st.shared.v2.f64 [r29+6000], {fd397, fd398}; +fma.rn.f64 fd399, fd385, fd335, fd388; +sub.f64 fd400, fd390, fd389; +st.shared.v2.f64 [r29+8000], {fd399, fd400}; +barrier.sync 0; +ld.shared.v2.f64 {fd401, fd402}, [r11]; +ld.shared.v2.f64 {fd405, fd406}, [r11+10000]; 
+ld.shared.v2.f64 {fd409, fd410}, [r11+20000]; +ld.shared.v2.f64 {fd413, fd414}, [r11+30000]; +ld.shared.v2.f64 {fd417, fd418}, [r11+40000]; +add.f64 fd421, fd405, fd417; +add.f64 fd422, fd401, fd421; +add.f64 fd423, fd409, fd413; +add.f64 fd424, fd406, fd418; +add.f64 fd425, fd402, fd424; +add.f64 fd426, fd410, fd414; +fma.rn.f64 fd427, fd421, 0d3FD3C6EF372FE950, fd401; +mul.f64 fd428, fd423, 0d3FE9E3779B97F4A8; +sub.f64 fd429, fd427, fd428; +sub.f64 fd430, fd406, fd418; +mul.f64 fd431, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd432, fd410, fd414; +fma.rn.f64 fd433, fd432, 0d3FE2CF2304755A5E, fd431; +mul.f64 fd434, fd421, 0d3FE9E3779B97F4A8; +sub.f64 fd435, fd401, fd434; +fma.rn.f64 fd436, fd423, 0d3FD3C6EF372FE950, fd435; +mul.f64 fd437, fd430, 0d3FE2CF2304755A5E; +mul.f64 fd438, fd432, 0d3FEE6F0E134454FF; +sub.f64 fd439, fd437, fd438; +fma.rn.f64 fd440, fd424, 0d3FD3C6EF372FE950, fd402; +mul.f64 fd441, fd426, 0d3FE9E3779B97F4A8; +sub.f64 fd442, fd440, fd441; +sub.f64 fd443, fd405, fd417; +mul.f64 fd444, fd443, 0d3FEE6F0E134454FF; +sub.f64 fd445, fd409, fd413; +fma.rn.f64 fd446, fd445, 0d3FE2CF2304755A5E, fd444; +mul.f64 fd447, fd424, 0d3FE9E3779B97F4A8; +sub.f64 fd448, fd402, fd447; +fma.rn.f64 fd449, fd426, 0d3FD3C6EF372FE950, fd448; +mul.f64 fd450, fd443, 0d3FE2CF2304755A5E; +mul.f64 fd451, fd445, 0d3FEE6F0E134454FF; +sub.f64 fd452, fd450, fd451; +add.f64 %1, fd426, fd425; +add.f64 %0, fd423, fd422; +add.f64 %3, fd446, fd442; +sub.f64 %2, fd429, fd433; +add.f64 %5, fd452, fd449; +sub.f64 %4, fd436, fd439; +sub.f64 %7, fd449, fd452; +add.f64 %6, fd439, fd436; +sub.f64 %9, fd442, fd446; +add.f64 %8, fd433, fd429; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_3125), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), 
"d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..33adf464e68e7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_fwd.hpp.inc @@ -0,0 +1,9574 @@ +#ifndef CUFFTDX_FFT_31_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_31_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<761, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<931>; +.reg .b32 r<5881>; +.reg .f64 fd<901>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %64, %122; +} +{ +add.f16x2 r4, %65, %123; +} +{ +sub.f16x2 r7, %64, %122; +} +{ +sub.f16x2 r10, %65, %123; +} +{ +add.f16x2 r13, %66, %120; +} +{ +add.f16x2 r16, %67, %121; +} +{ +sub.f16x2 r19, %66, %120; +} +{ +sub.f16x2 r22, %67, %121; +} +{ +add.f16x2 r25, %68, %118; +} +{ +add.f16x2 r28, %69, %119; +} +{ +sub.f16x2 r31, %68, %118; +} +{ +sub.f16x2 r34, %69, %119; +} +{ +add.f16x2 r37, %70, %116; +} +{ +add.f16x2 r40, %71, %117; +} +{ +sub.f16x2 r43, %70, %116; +} +{ +sub.f16x2 r46, %71, %117; +} +{ +add.f16x2 r49, %72, %114; +} +{ +add.f16x2 r52, %73, %115; +} +{ +sub.f16x2 r55, %72, %114; +} +{ +sub.f16x2 r58, %73, %115; +} +{ +add.f16x2 r61, %74, %112; +} +{ +add.f16x2 r64, %75, %113; +} +{ +sub.f16x2 r67, %74, %112; +} +{ +sub.f16x2 r70, %75, %113; +} +{ +add.f16x2 r73, %76, %110; +} +{ +add.f16x2 r76, %77, %111; +} +{ +sub.f16x2 r79, %76, %110; +} +{ +sub.f16x2 r82, %77, %111; +} +{ +add.f16x2 r85, %78, %108; +} +{ +add.f16x2 r88, %79, %109; +} +{ +sub.f16x2 r91, %78, %108; +} +{ +sub.f16x2 r94, %79, %109; +} 
+{ +add.f16x2 r97, %80, %106; +} +{ +add.f16x2 r100, %81, %107; +} +{ +sub.f16x2 r103, %80, %106; +} +{ +sub.f16x2 r106, %81, %107; +} +{ +add.f16x2 r109, %82, %104; +} +{ +add.f16x2 r112, %83, %105; +} +{ +sub.f16x2 r115, %82, %104; +} +{ +sub.f16x2 r118, %83, %105; +} +{ +add.f16x2 r121, %84, %102; +} +{ +add.f16x2 r124, %85, %103; +} +{ +sub.f16x2 r127, %84, %102; +} +{ +sub.f16x2 r130, %85, %103; +} +{ +add.f16x2 r133, %86, %100; +} +{ +add.f16x2 r136, %87, %101; +} +{ +sub.f16x2 r139, %86, %100; +} +{ +sub.f16x2 r142, %87, %101; +} +{ +add.f16x2 r145, %88, %98; +} +{ +add.f16x2 r148, %89, %99; +} +{ +sub.f16x2 r151, %88, %98; +} +{ +sub.f16x2 r154, %89, %99; +} +{ +add.f16x2 r157, %90, %96; +} +{ +add.f16x2 r160, %91, %97; +} +{ +sub.f16x2 r163, %90, %96; +} +{ +sub.f16x2 r166, %91, %97; +} +{ +add.f16x2 r169, %92, %94; +} +{ +add.f16x2 r172, %93, %95; +} +{ +sub.f16x2 r175, %92, %94; +} +{ +sub.f16x2 r178, %93, %95; +} +{ +add.f16x2 r181, %62, r1; +} +{ +add.f16x2 r184, %63, r4; +} +{ +add.f16x2 r187, r181, r13; +} +{ +add.f16x2 r190, r184, r16; +} +{ +add.f16x2 r193, r187, r25; +} +{ +add.f16x2 r196, r190, r28; +} +{ +add.f16x2 r199, r193, r37; +} +{ +add.f16x2 r202, r196, r40; +} +{ +add.f16x2 r205, r199, r49; +} +{ +add.f16x2 r208, r202, r52; +} +{ +add.f16x2 r211, r205, r61; +} +{ +add.f16x2 r214, r208, r64; +} +{ +add.f16x2 r217, r211, r73; +} +{ +add.f16x2 r220, r214, r76; +} +{ +add.f16x2 r223, r217, r85; +} +{ +add.f16x2 r226, r220, r88; +} +{ +add.f16x2 r229, r223, r97; +} +{ +add.f16x2 r232, r226, r100; +} +{ +add.f16x2 r235, r229, r109; +} +{ +add.f16x2 r238, r232, r112; +} +{ +add.f16x2 r241, r235, r121; +} +{ +add.f16x2 r244, r238, r124; +} +{ +add.f16x2 r247, r241, r133; +} +{ +add.f16x2 r250, r244, r136; +} +{ +add.f16x2 r253, r247, r145; +} +{ +add.f16x2 r256, r250, r148; +} +{ +add.f16x2 r259, r253, r157; +} +{ +add.f16x2 r262, r256, r160; +} +{ +add.f16x2 %0, r259, r169; +} +{ +add.f16x2 %1, r262, r172; +} +mov.u32 r5508, 0; +cvt.rn.f16.s32 
rs1, r5508; +mov.b32 r283, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r5508; +mov.b32 r295, {rs2, rs2}; +mov.f64 fd847, 0d3FEF584F2CE43B84; +{ +cvt.rn.f16.f64 rs3, fd847; +} +mov.b32 r275, {rs3, rs3}; +{ +mul.f16x2 r273, r1, r275; +} +{ +add.f16x2 r276, %62, r273; +} +mov.f64 fd768, 0dBFC9C4266041CA8F; +{ +cvt.rn.f16.f64 rs4, fd768; +} +mov.b32 r281, {rs4, rs4}; +{ +mul.f16x2 r279, r10, r281; +} +{ +add.f16x2 r282, r283, r279; +} +{ +cvt.rn.f16.f64 rs5, fd847; +} +mov.b32 r287, {rs5, rs5}; +{ +mul.f16x2 r285, r4, r287; +} +{ +add.f16x2 r288, %63, r285; +} +{ +cvt.rn.f16.f64 rs6, fd768; +} +mov.b32 r293, {rs6, rs6}; +{ +mul.f16x2 r291, r7, r293; +} +{ +add.f16x2 r294, r295, r291; +} +mov.f64 fd855, 0d3FED681A366A00FA; +{ +cvt.rn.f16.f64 rs7, fd855; +} +mov.b32 r299, {rs7, rs7}; +{ +mul.f16x2 r297, r13, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd816, 0dBFD93D20572CA90B; +{ +cvt.rn.f16.f64 rs8, fd816; +} +mov.b32 r305, {rs8, rs8}; +{ +mul.f16x2 r303, r22, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs9, fd855; +} +mov.b32 r311, {rs9, rs9}; +{ +mul.f16x2 r309, r16, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs10, fd816; +} +mov.b32 r317, {rs10, rs10}; +{ +mul.f16x2 r315, r19, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd863, 0d3FEA43B1B1379AFF; +{ +cvt.rn.f16.f64 rs11, fd863; +} +mov.b32 r323, {rs11, rs11}; +{ +mul.f16x2 r321, r25, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd740, 0dBFE247D447A27216; +{ +cvt.rn.f16.f64 rs12, fd740; +} +mov.b32 r329, {rs12, rs12}; +{ +mul.f16x2 r327, r34, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs13, fd863; +} +mov.b32 r335, {rs13, rs13}; +{ +mul.f16x2 r333, r28, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs14, fd740; +} +mov.b32 r341, {rs14, rs14}; +{ +mul.f16x2 r339, r31, r341; +} +{ +add.f16x2 r342, r318, r339; +} +mov.f64 fd871, 0d3FE60C045A2E9729; +{ +cvt.rn.f16.f64 rs15, fd871; +} +mov.b32 r347, {rs15, rs15}; +{ +mul.f16x2 r345, 
r37, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd624, 0dBFE73180A4B0D300; +{ +cvt.rn.f16.f64 rs16, fd624; +} +mov.b32 r353, {rs16, rs16}; +{ +mul.f16x2 r351, r46, r353; +} +{ +add.f16x2 r354, r330, r351; +} +{ +cvt.rn.f16.f64 rs17, fd871; +} +mov.b32 r359, {rs17, rs17}; +{ +mul.f16x2 r357, r40, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs18, fd624; +} +mov.b32 r365, {rs18, rs18}; +{ +mul.f16x2 r363, r43, r365; +} +{ +add.f16x2 r366, r342, r363; +} +mov.f64 fd879, 0d3FE0ED45EEA3B09F; +{ +cvt.rn.f16.f64 rs19, fd879; +} +mov.b32 r371, {rs19, rs19}; +{ +mul.f16x2 r369, r49, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd808, 0dBFEB2818007C19DF; +{ +cvt.rn.f16.f64 rs20, fd808; +} +mov.b32 r377, {rs20, rs20}; +{ +mul.f16x2 r375, r58, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +cvt.rn.f16.f64 rs21, fd879; +} +mov.b32 r383, {rs21, rs21}; +{ +mul.f16x2 r381, r52, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs22, fd808; +} +mov.b32 r389, {rs22, rs22}; +{ +mul.f16x2 r387, r55, r389; +} +{ +add.f16x2 r390, r366, r387; +} +mov.f64 fd887, 0d3FD63A3FCFACA412; +{ +cvt.rn.f16.f64 rs23, fd887; +} +mov.b32 r395, {rs23, rs23}; +{ +mul.f16x2 r393, r61, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd760, 0dBFEE0210C26A6E6F; +{ +cvt.rn.f16.f64 rs24, fd760; +} +mov.b32 r401, {rs24, rs24}; +{ +mul.f16x2 r399, r70, r401; +} +{ +add.f16x2 r402, r378, r399; +} +{ +cvt.rn.f16.f64 rs25, fd887; +} +mov.b32 r407, {rs25, rs25}; +{ +mul.f16x2 r405, r64, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs26, fd760; +} +mov.b32 r413, {rs26, rs26}; +{ +mul.f16x2 r411, r67, r413; +} +{ +add.f16x2 r414, r390, r411; +} +mov.f64 fd895, 0d3FC361FC440B478F; +{ +cvt.rn.f16.f64 rs27, fd895; +} +mov.b32 r419, {rs27, rs27}; +{ +mul.f16x2 r417, r73, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 fd580, 0dBFEFA18852C3E08A; +{ +cvt.rn.f16.f64 rs28, fd580; +} +mov.b32 r425, {rs28, rs28}; +{ +mul.f16x2 r423, r82, r425; +} +{ 
+add.f16x2 r426, r402, r423; +} +{ +cvt.rn.f16.f64 rs29, fd895; +} +mov.b32 r431, {rs29, rs29}; +{ +mul.f16x2 r429, r76, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs30, fd580; +} +mov.b32 r437, {rs30, rs30}; +{ +mul.f16x2 r435, r79, r437; +} +{ +add.f16x2 r438, r414, r435; +} +mov.f64 fd899, 0dBFA9EEB01776B57D; +{ +cvt.rn.f16.f64 rs31, fd899; +} +mov.b32 r443, {rs31, rs31}; +{ +mul.f16x2 r441, r85, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd900, 0dBFEFF57C5208CCF9; +{ +cvt.rn.f16.f64 rs32, fd900; +} +mov.b32 r449, {rs32, rs32}; +{ +mul.f16x2 r447, r94, r449; +} +{ +add.f16x2 r450, r426, r447; +} +{ +cvt.rn.f16.f64 rs33, fd899; +} +mov.b32 r455, {rs33, rs33}; +{ +mul.f16x2 r453, r88, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs34, fd900; +} +mov.b32 r461, {rs34, rs34}; +{ +mul.f16x2 r459, r91, r461; +} +{ +add.f16x2 r462, r438, r459; +} +mov.f64 fd891, 0dBFD00AB0EB2D7D94; +{ +cvt.rn.f16.f64 rs35, fd891; +} +mov.b32 r467, {rs35, rs35}; +{ +mul.f16x2 r465, r97, r467; +} +{ +add.f16x2 r468, r444, r465; +} +mov.f64 fd892, 0dBFEEFA7CDDB128FA; +{ +cvt.rn.f16.f64 rs36, fd892; +} +mov.b32 r473, {rs36, rs36}; +{ +mul.f16x2 r471, r106, r473; +} +{ +add.f16x2 r474, r450, r471; +} +{ +cvt.rn.f16.f64 rs37, fd891; +} +mov.b32 r479, {rs37, rs37}; +{ +mul.f16x2 r477, r100, r479; +} +{ +add.f16x2 r480, r456, r477; +} +{ +cvt.rn.f16.f64 rs38, fd892; +} +mov.b32 r485, {rs38, rs38}; +{ +mul.f16x2 r483, r103, r485; +} +{ +add.f16x2 r486, r462, r483; +} +mov.f64 fd883, 0dBFDC2F6AF3928A8E; +{ +cvt.rn.f16.f64 rs39, fd883; +} +mov.b32 r491, {rs39, rs39}; +{ +mul.f16x2 r489, r109, r491; +} +{ +add.f16x2 r492, r468, r489; +} +mov.f64 fd884, 0dBFECBAD095F50378; +{ +cvt.rn.f16.f64 rs40, fd884; +} +mov.b32 r497, {rs40, rs40}; +{ +mul.f16x2 r495, r118, r497; +} +{ +add.f16x2 r498, r474, r495; +} +{ +cvt.rn.f16.f64 rs41, fd883; +} +mov.b32 r503, {rs41, rs41}; +{ +mul.f16x2 r501, r112, r503; +} +{ +add.f16x2 r504, r480, r501; +} +{ 
+cvt.rn.f16.f64 rs42, fd884; +} +mov.b32 r509, {rs42, rs42}; +{ +mul.f16x2 r507, r115, r509; +} +{ +add.f16x2 r510, r486, r507; +} +mov.f64 fd875, 0dBFE3965F49174D13; +{ +cvt.rn.f16.f64 rs43, fd875; +} +mov.b32 r515, {rs43, rs43}; +{ +mul.f16x2 r513, r121, r515; +} +{ +add.f16x2 r516, r492, r513; +} +mov.f64 fd876, 0dBFE94E08EB13C451; +{ +cvt.rn.f16.f64 rs44, fd876; +} +mov.b32 r521, {rs44, rs44}; +{ +mul.f16x2 r519, r130, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs45, fd875; +} +mov.b32 r527, {rs45, rs45}; +{ +mul.f16x2 r525, r124, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs46, fd876; +} +mov.b32 r533, {rs46, rs46}; +{ +mul.f16x2 r531, r127, r533; +} +{ +add.f16x2 r534, r510, r531; +} +mov.f64 fd867, 0dBFE847BF1D5146CC; +{ +cvt.rn.f16.f64 rs47, fd867; +} +mov.b32 r539, {rs47, rs47}; +{ +mul.f16x2 r537, r133, r539; +} +{ +add.f16x2 r540, r516, r537; +} +mov.f64 fd868, 0dBFE4D80B1AD9CCF6; +{ +cvt.rn.f16.f64 rs48, fd868; +} +mov.b32 r545, {rs48, rs48}; +{ +mul.f16x2 r543, r142, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs49, fd867; +} +mov.b32 r551, {rs49, rs49}; +{ +mul.f16x2 r549, r136, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ +cvt.rn.f16.f64 rs50, fd868; +} +mov.b32 r557, {rs50, rs50}; +{ +mul.f16x2 r555, r139, r557; +} +{ +add.f16x2 r558, r534, r555; +} +mov.f64 fd859, 0dBFEBFAA5C136B224; +{ +cvt.rn.f16.f64 rs51, fd859; +} +mov.b32 r563, {rs51, rs51}; +{ +mul.f16x2 r561, r145, r563; +} +{ +add.f16x2 r564, r540, r561; +} +mov.f64 fd860, 0dBFDF0F2FF6705BEC; +{ +cvt.rn.f16.f64 rs52, fd860; +} +mov.b32 r569, {rs52, rs52}; +{ +mul.f16x2 r567, r154, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs53, fd859; +} +mov.b32 r575, {rs53, rs53}; +{ +mul.f16x2 r573, r148, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs54, fd860; +} +mov.b32 r581, {rs54, rs54}; +{ +mul.f16x2 r579, r151, r581; +} +{ +add.f16x2 r582, r558, r579; +} +mov.f64 fd851, 0dBFEE884F0CC22CCC; +{ 
+cvt.rn.f16.f64 rs55, fd851; +} +mov.b32 r587, {rs55, rs55}; +{ +mul.f16x2 r585, r157, r587; +} +{ +add.f16x2 r588, r564, r585; +} +mov.f64 fd852, 0dBFD328C3F1B322CB; +{ +cvt.rn.f16.f64 rs56, fd852; +} +mov.b32 r593, {rs56, rs56}; +{ +mul.f16x2 r591, r166, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ +cvt.rn.f16.f64 rs57, fd851; +} +mov.b32 r599, {rs57, rs57}; +{ +mul.f16x2 r597, r160, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs58, fd852; +} +mov.b32 r605, {rs58, rs58}; +{ +mul.f16x2 r603, r163, r605; +} +{ +add.f16x2 r606, r582, r603; +} +mov.f64 fd843, 0dBFEFD5F830F860F9; +{ +cvt.rn.f16.f64 rs59, fd843; +} +mov.b32 r611, {rs59, rs59}; +{ +mul.f16x2 r609, r169, r611; +} +{ +add.f16x2 r612, r588, r609; +} +mov.f64 fd844, 0dBFB9E62ACA53C49F; +{ +cvt.rn.f16.f64 rs60, fd844; +} +mov.b32 r617, {rs60, rs60}; +{ +mul.f16x2 r615, r178, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs61, fd843; +} +mov.b32 r623, {rs61, rs61}; +{ +mul.f16x2 r621, r172, r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ +cvt.rn.f16.f64 rs62, fd844; +} +mov.b32 r629, {rs62, rs62}; +{ +mul.f16x2 r627, r175, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +sub.f16x2 %2, r612, r618; +} +{ +add.f16x2 %3, r624, r630; +} +{ +add.f16x2 %60, r612, r618; +} +{ +sub.f16x2 %61, r624, r630; +} +cvt.rn.f16.s32 rs63, r5508; +mov.b32 r657, {rs63, rs63}; +cvt.rn.f16.s32 rs64, r5508; +mov.b32 r669, {rs64, rs64}; +{ +cvt.rn.f16.f64 rs65, fd855; +} +mov.b32 r649, {rs65, rs65}; +{ +mul.f16x2 r647, r1, r649; +} +{ +add.f16x2 r650, %62, r647; +} +{ +cvt.rn.f16.f64 rs66, fd816; +} +mov.b32 r655, {rs66, rs66}; +{ +mul.f16x2 r653, r10, r655; +} +{ +add.f16x2 r656, r657, r653; +} +{ +cvt.rn.f16.f64 rs67, fd855; +} +mov.b32 r661, {rs67, rs67}; +{ +mul.f16x2 r659, r4, r661; +} +{ +add.f16x2 r662, %63, r659; +} +{ +cvt.rn.f16.f64 rs68, fd816; +} +mov.b32 r667, {rs68, rs68}; +{ +mul.f16x2 r665, r7, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +cvt.rn.f16.f64 rs69, fd871; +} +mov.b32 
r673, {rs69, rs69}; +{ +mul.f16x2 r671, r13, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs70, fd624; +} +mov.b32 r679, {rs70, rs70}; +{ +mul.f16x2 r677, r22, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs71, fd871; +} +mov.b32 r685, {rs71, rs71}; +{ +mul.f16x2 r683, r16, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs72, fd624; +} +mov.b32 r691, {rs72, rs72}; +{ +mul.f16x2 r689, r19, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs73, fd887; +} +mov.b32 r697, {rs73, rs73}; +{ +mul.f16x2 r695, r25, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs74, fd760; +} +mov.b32 r703, {rs74, rs74}; +{ +mul.f16x2 r701, r34, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs75, fd887; +} +mov.b32 r709, {rs75, rs75}; +{ +mul.f16x2 r707, r28, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs76, fd760; +} +mov.b32 r715, {rs76, rs76}; +{ +mul.f16x2 r713, r31, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs77, fd899; +} +mov.b32 r721, {rs77, rs77}; +{ +mul.f16x2 r719, r37, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs78, fd900; +} +mov.b32 r727, {rs78, rs78}; +{ +mul.f16x2 r725, r46, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs79, fd899; +} +mov.b32 r733, {rs79, rs79}; +{ +mul.f16x2 r731, r40, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs80, fd900; +} +mov.b32 r739, {rs80, rs80}; +{ +mul.f16x2 r737, r43, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs81, fd883; +} +mov.b32 r745, {rs81, rs81}; +{ +mul.f16x2 r743, r49, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +cvt.rn.f16.f64 rs82, fd884; +} +mov.b32 r751, {rs82, rs82}; +{ +mul.f16x2 r749, r58, r751; +} +{ +add.f16x2 r752, r728, r749; +} +{ +cvt.rn.f16.f64 rs83, fd883; +} +mov.b32 r757, {rs83, rs83}; +{ +mul.f16x2 r755, r52, r757; +} +{ +add.f16x2 r758, r734, r755; +} +{ +cvt.rn.f16.f64 rs84, fd884; +} +mov.b32 
r763, {rs84, rs84}; +{ +mul.f16x2 r761, r55, r763; +} +{ +add.f16x2 r764, r740, r761; +} +{ +cvt.rn.f16.f64 rs85, fd867; +} +mov.b32 r769, {rs85, rs85}; +{ +mul.f16x2 r767, r61, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs86, fd868; +} +mov.b32 r775, {rs86, rs86}; +{ +mul.f16x2 r773, r70, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs87, fd867; +} +mov.b32 r781, {rs87, rs87}; +{ +mul.f16x2 r779, r64, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs88, fd868; +} +mov.b32 r787, {rs88, rs88}; +{ +mul.f16x2 r785, r67, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs89, fd851; +} +mov.b32 r793, {rs89, rs89}; +{ +mul.f16x2 r791, r73, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs90, fd852; +} +mov.b32 r799, {rs90, rs90}; +{ +mul.f16x2 r797, r82, r799; +} +{ +add.f16x2 r800, r776, r797; +} +{ +cvt.rn.f16.f64 rs91, fd851; +} +mov.b32 r805, {rs91, rs91}; +{ +mul.f16x2 r803, r76, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs92, fd852; +} +mov.b32 r811, {rs92, rs92}; +{ +mul.f16x2 r809, r79, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs93, fd843; +} +mov.b32 r817, {rs93, rs93}; +{ +mul.f16x2 r815, r85, r817; +} +{ +add.f16x2 r818, r794, r815; +} +mov.f64 fd820, 0d3FB9E62ACA53C49F; +{ +cvt.rn.f16.f64 rs94, fd820; +} +mov.b32 r823, {rs94, rs94}; +{ +mul.f16x2 r821, r94, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs95, fd843; +} +mov.b32 r829, {rs95, rs95}; +{ +mul.f16x2 r827, r88, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs96, fd820; +} +mov.b32 r835, {rs96, rs96}; +{ +mul.f16x2 r833, r91, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs97, fd859; +} +mov.b32 r841, {rs97, rs97}; +{ +mul.f16x2 r839, r97, r841; +} +{ +add.f16x2 r842, r818, r839; +} +mov.f64 fd572, 0d3FDF0F2FF6705BEC; +{ +cvt.rn.f16.f64 rs98, fd572; +} +mov.b32 r847, {rs98, rs98}; +{ +mul.f16x2 r845, r106, r847; +} +{ 
+add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs99, fd859; +} +mov.b32 r853, {rs99, rs99}; +{ +mul.f16x2 r851, r100, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs100, fd572; +} +mov.b32 r859, {rs100, rs100}; +{ +mul.f16x2 r857, r103, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs101, fd875; +} +mov.b32 r865, {rs101, rs101}; +{ +mul.f16x2 r863, r109, r865; +} +{ +add.f16x2 r866, r842, r863; +} +mov.f64 fd708, 0d3FE94E08EB13C451; +{ +cvt.rn.f16.f64 rs102, fd708; +} +mov.b32 r871, {rs102, rs102}; +{ +mul.f16x2 r869, r118, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs103, fd875; +} +mov.b32 r877, {rs103, rs103}; +{ +mul.f16x2 r875, r112, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs104, fd708; +} +mov.b32 r883, {rs104, rs104}; +{ +mul.f16x2 r881, r115, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +cvt.rn.f16.f64 rs105, fd891; +} +mov.b32 r889, {rs105, rs105}; +{ +mul.f16x2 r887, r121, r889; +} +{ +add.f16x2 r890, r866, r887; +} +mov.f64 fd804, 0d3FEEFA7CDDB128FA; +{ +cvt.rn.f16.f64 rs106, fd804; +} +mov.b32 r895, {rs106, rs106}; +{ +mul.f16x2 r893, r130, r895; +} +{ +add.f16x2 r896, r872, r893; +} +{ +cvt.rn.f16.f64 rs107, fd891; +} +mov.b32 r901, {rs107, rs107}; +{ +mul.f16x2 r899, r124, r901; +} +{ +add.f16x2 r902, r878, r899; +} +{ +cvt.rn.f16.f64 rs108, fd804; +} +mov.b32 r907, {rs108, rs108}; +{ +mul.f16x2 r905, r127, r907; +} +{ +add.f16x2 r908, r884, r905; +} +{ +cvt.rn.f16.f64 rs109, fd895; +} +mov.b32 r913, {rs109, rs109}; +{ +mul.f16x2 r911, r133, r913; +} +{ +add.f16x2 r914, r890, r911; +} +mov.f64 fd896, 0d3FEFA18852C3E08A; +{ +cvt.rn.f16.f64 rs110, fd896; +} +mov.b32 r919, {rs110, rs110}; +{ +mul.f16x2 r917, r142, r919; +} +{ +add.f16x2 r920, r896, r917; +} +{ +cvt.rn.f16.f64 rs111, fd895; +} +mov.b32 r925, {rs111, rs111}; +{ +mul.f16x2 r923, r136, r925; +} +{ +add.f16x2 r926, r902, r923; +} +{ +cvt.rn.f16.f64 rs112, fd896; +} +mov.b32 r931, {rs112, rs112}; +{ +mul.f16x2 
r929, r139, r931; +} +{ +add.f16x2 r932, r908, r929; +} +{ +cvt.rn.f16.f64 rs113, fd879; +} +mov.b32 r937, {rs113, rs113}; +{ +mul.f16x2 r935, r145, r937; +} +{ +add.f16x2 r938, r914, r935; +} +mov.f64 fd880, 0d3FEB2818007C19DF; +{ +cvt.rn.f16.f64 rs114, fd880; +} +mov.b32 r943, {rs114, rs114}; +{ +mul.f16x2 r941, r154, r943; +} +{ +add.f16x2 r944, r920, r941; +} +{ +cvt.rn.f16.f64 rs115, fd879; +} +mov.b32 r949, {rs115, rs115}; +{ +mul.f16x2 r947, r148, r949; +} +{ +add.f16x2 r950, r926, r947; +} +{ +cvt.rn.f16.f64 rs116, fd880; +} +mov.b32 r955, {rs116, rs116}; +{ +mul.f16x2 r953, r151, r955; +} +{ +add.f16x2 r956, r932, r953; +} +{ +cvt.rn.f16.f64 rs117, fd863; +} +mov.b32 r961, {rs117, rs117}; +{ +mul.f16x2 r959, r157, r961; +} +{ +add.f16x2 r962, r938, r959; +} +mov.f64 fd864, 0d3FE247D447A27216; +{ +cvt.rn.f16.f64 rs118, fd864; +} +mov.b32 r967, {rs118, rs118}; +{ +mul.f16x2 r965, r166, r967; +} +{ +add.f16x2 r968, r944, r965; +} +{ +cvt.rn.f16.f64 rs119, fd863; +} +mov.b32 r973, {rs119, rs119}; +{ +mul.f16x2 r971, r160, r973; +} +{ +add.f16x2 r974, r950, r971; +} +{ +cvt.rn.f16.f64 rs120, fd864; +} +mov.b32 r979, {rs120, rs120}; +{ +mul.f16x2 r977, r163, r979; +} +{ +add.f16x2 r980, r956, r977; +} +{ +cvt.rn.f16.f64 rs121, fd847; +} +mov.b32 r985, {rs121, rs121}; +{ +mul.f16x2 r983, r169, r985; +} +{ +add.f16x2 r986, r962, r983; +} +mov.f64 fd848, 0d3FC9C4266041CA8F; +{ +cvt.rn.f16.f64 rs122, fd848; +} +mov.b32 r991, {rs122, rs122}; +{ +mul.f16x2 r989, r178, r991; +} +{ +add.f16x2 r992, r968, r989; +} +{ +cvt.rn.f16.f64 rs123, fd847; +} +mov.b32 r997, {rs123, rs123}; +{ +mul.f16x2 r995, r172, r997; +} +{ +add.f16x2 r998, r974, r995; +} +{ +cvt.rn.f16.f64 rs124, fd848; +} +mov.b32 r1003, {rs124, rs124}; +{ +mul.f16x2 r1001, r175, r1003; +} +{ +add.f16x2 r1004, r980, r1001; +} +{ +sub.f16x2 %4, r986, r992; +} +{ +add.f16x2 %5, r998, r1004; +} +{ +add.f16x2 %58, r986, r992; +} +{ +sub.f16x2 %59, r998, r1004; +} +cvt.rn.f16.s32 rs125, r5508; +mov.b32 r1031, 
{rs125, rs125}; +cvt.rn.f16.s32 rs126, r5508; +mov.b32 r1043, {rs126, rs126}; +{ +cvt.rn.f16.f64 rs127, fd863; +} +mov.b32 r1023, {rs127, rs127}; +{ +mul.f16x2 r1021, r1, r1023; +} +{ +add.f16x2 r1024, %62, r1021; +} +{ +cvt.rn.f16.f64 rs128, fd740; +} +mov.b32 r1029, {rs128, rs128}; +{ +mul.f16x2 r1027, r10, r1029; +} +{ +add.f16x2 r1030, r1031, r1027; +} +{ +cvt.rn.f16.f64 rs129, fd863; +} +mov.b32 r1035, {rs129, rs129}; +{ +mul.f16x2 r1033, r4, r1035; +} +{ +add.f16x2 r1036, %63, r1033; +} +{ +cvt.rn.f16.f64 rs130, fd740; +} +mov.b32 r1041, {rs130, rs130}; +{ +mul.f16x2 r1039, r7, r1041; +} +{ +add.f16x2 r1042, r1043, r1039; +} +{ +cvt.rn.f16.f64 rs131, fd887; +} +mov.b32 r1047, {rs131, rs131}; +{ +mul.f16x2 r1045, r13, r1047; +} +{ +add.f16x2 r1048, r1024, r1045; +} +{ +cvt.rn.f16.f64 rs132, fd760; +} +mov.b32 r1053, {rs132, rs132}; +{ +mul.f16x2 r1051, r22, r1053; +} +{ +add.f16x2 r1054, r1030, r1051; +} +{ +cvt.rn.f16.f64 rs133, fd887; +} +mov.b32 r1059, {rs133, rs133}; +{ +mul.f16x2 r1057, r16, r1059; +} +{ +add.f16x2 r1060, r1036, r1057; +} +{ +cvt.rn.f16.f64 rs134, fd760; +} +mov.b32 r1065, {rs134, rs134}; +{ +mul.f16x2 r1063, r19, r1065; +} +{ +add.f16x2 r1066, r1042, r1063; +} +{ +cvt.rn.f16.f64 rs135, fd891; +} +mov.b32 r1071, {rs135, rs135}; +{ +mul.f16x2 r1069, r25, r1071; +} +{ +add.f16x2 r1072, r1048, r1069; +} +{ +cvt.rn.f16.f64 rs136, fd892; +} +mov.b32 r1077, {rs136, rs136}; +{ +mul.f16x2 r1075, r34, r1077; +} +{ +add.f16x2 r1078, r1054, r1075; +} +{ +cvt.rn.f16.f64 rs137, fd891; +} +mov.b32 r1083, {rs137, rs137}; +{ +mul.f16x2 r1081, r28, r1083; +} +{ +add.f16x2 r1084, r1060, r1081; +} +{ +cvt.rn.f16.f64 rs138, fd892; +} +mov.b32 r1089, {rs138, rs138}; +{ +mul.f16x2 r1087, r31, r1089; +} +{ +add.f16x2 r1090, r1066, r1087; +} +{ +cvt.rn.f16.f64 rs139, fd867; +} +mov.b32 r1095, {rs139, rs139}; +{ +mul.f16x2 r1093, r37, r1095; +} +{ +add.f16x2 r1096, r1072, r1093; +} +{ +cvt.rn.f16.f64 rs140, fd868; +} +mov.b32 r1101, {rs140, rs140}; +{ +mul.f16x2 
r1099, r46, r1101; +} +{ +add.f16x2 r1102, r1078, r1099; +} +{ +cvt.rn.f16.f64 rs141, fd867; +} +mov.b32 r1107, {rs141, rs141}; +{ +mul.f16x2 r1105, r40, r1107; +} +{ +add.f16x2 r1108, r1084, r1105; +} +{ +cvt.rn.f16.f64 rs142, fd868; +} +mov.b32 r1113, {rs142, rs142}; +{ +mul.f16x2 r1111, r43, r1113; +} +{ +add.f16x2 r1114, r1090, r1111; +} +{ +cvt.rn.f16.f64 rs143, fd843; +} +mov.b32 r1119, {rs143, rs143}; +{ +mul.f16x2 r1117, r49, r1119; +} +{ +add.f16x2 r1120, r1096, r1117; +} +{ +cvt.rn.f16.f64 rs144, fd844; +} +mov.b32 r1125, {rs144, rs144}; +{ +mul.f16x2 r1123, r58, r1125; +} +{ +add.f16x2 r1126, r1102, r1123; +} +{ +cvt.rn.f16.f64 rs145, fd843; +} +mov.b32 r1131, {rs145, rs145}; +{ +mul.f16x2 r1129, r52, r1131; +} +{ +add.f16x2 r1132, r1108, r1129; +} +{ +cvt.rn.f16.f64 rs146, fd844; +} +mov.b32 r1137, {rs146, rs146}; +{ +mul.f16x2 r1135, r55, r1137; +} +{ +add.f16x2 r1138, r1114, r1135; +} +{ +cvt.rn.f16.f64 rs147, fd859; +} +mov.b32 r1143, {rs147, rs147}; +{ +mul.f16x2 r1141, r61, r1143; +} +{ +add.f16x2 r1144, r1120, r1141; +} +{ +cvt.rn.f16.f64 rs148, fd572; +} +mov.b32 r1149, {rs148, rs148}; +{ +mul.f16x2 r1147, r70, r1149; +} +{ +add.f16x2 r1150, r1126, r1147; +} +{ +cvt.rn.f16.f64 rs149, fd859; +} +mov.b32 r1155, {rs149, rs149}; +{ +mul.f16x2 r1153, r64, r1155; +} +{ +add.f16x2 r1156, r1132, r1153; +} +{ +cvt.rn.f16.f64 rs150, fd572; +} +mov.b32 r1161, {rs150, rs150}; +{ +mul.f16x2 r1159, r67, r1161; +} +{ +add.f16x2 r1162, r1138, r1159; +} +{ +cvt.rn.f16.f64 rs151, fd883; +} +mov.b32 r1167, {rs151, rs151}; +{ +mul.f16x2 r1165, r73, r1167; +} +{ +add.f16x2 r1168, r1144, r1165; +} +mov.f64 fd736, 0d3FECBAD095F50378; +{ +cvt.rn.f16.f64 rs152, fd736; +} +mov.b32 r1173, {rs152, rs152}; +{ +mul.f16x2 r1171, r82, r1173; +} +{ +add.f16x2 r1174, r1150, r1171; +} +{ +cvt.rn.f16.f64 rs153, fd883; +} +mov.b32 r1179, {rs153, rs153}; +{ +mul.f16x2 r1177, r76, r1179; +} +{ +add.f16x2 r1180, r1156, r1177; +} +{ +cvt.rn.f16.f64 rs154, fd736; +} +mov.b32 r1185, 
{rs154, rs154}; +{ +mul.f16x2 r1183, r79, r1185; +} +{ +add.f16x2 r1186, r1162, r1183; +} +{ +cvt.rn.f16.f64 rs155, fd895; +} +mov.b32 r1191, {rs155, rs155}; +{ +mul.f16x2 r1189, r85, r1191; +} +{ +add.f16x2 r1192, r1168, r1189; +} +{ +cvt.rn.f16.f64 rs156, fd896; +} +mov.b32 r1197, {rs156, rs156}; +{ +mul.f16x2 r1195, r94, r1197; +} +{ +add.f16x2 r1198, r1174, r1195; +} +{ +cvt.rn.f16.f64 rs157, fd895; +} +mov.b32 r1203, {rs157, rs157}; +{ +mul.f16x2 r1201, r88, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs158, fd896; +} +mov.b32 r1209, {rs158, rs158}; +{ +mul.f16x2 r1207, r91, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs159, fd871; +} +mov.b32 r1215, {rs159, rs159}; +{ +mul.f16x2 r1213, r97, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +mov.f64 fd872, 0d3FE73180A4B0D300; +{ +cvt.rn.f16.f64 rs160, fd872; +} +mov.b32 r1221, {rs160, rs160}; +{ +mul.f16x2 r1219, r106, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} +{ +cvt.rn.f16.f64 rs161, fd871; +} +mov.b32 r1227, {rs161, rs161}; +{ +mul.f16x2 r1225, r100, r1227; +} +{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs162, fd872; +} +mov.b32 r1233, {rs162, rs162}; +{ +mul.f16x2 r1231, r103, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs163, fd847; +} +mov.b32 r1239, {rs163, rs163}; +{ +mul.f16x2 r1237, r109, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs164, fd848; +} +mov.b32 r1245, {rs164, rs164}; +{ +mul.f16x2 r1243, r118, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs165, fd847; +} +mov.b32 r1251, {rs165, rs165}; +{ +mul.f16x2 r1249, r112, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs166, fd848; +} +mov.b32 r1257, {rs166, rs166}; +{ +mul.f16x2 r1255, r115, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs167, fd855; +} +mov.b32 r1263, {rs167, rs167}; +{ +mul.f16x2 r1261, r121, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 
rs168, fd816; +} +mov.b32 r1269, {rs168, rs168}; +{ +mul.f16x2 r1267, r130, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs169, fd855; +} +mov.b32 r1275, {rs169, rs169}; +{ +mul.f16x2 r1273, r124, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs170, fd816; +} +mov.b32 r1281, {rs170, rs170}; +{ +mul.f16x2 r1279, r127, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs171, fd879; +} +mov.b32 r1287, {rs171, rs171}; +{ +mul.f16x2 r1285, r133, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs172, fd808; +} +mov.b32 r1293, {rs172, rs172}; +{ +mul.f16x2 r1291, r142, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs173, fd879; +} +mov.b32 r1299, {rs173, rs173}; +{ +mul.f16x2 r1297, r136, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs174, fd808; +} +mov.b32 r1305, {rs174, rs174}; +{ +mul.f16x2 r1303, r139, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs175, fd899; +} +mov.b32 r1311, {rs175, rs175}; +{ +mul.f16x2 r1309, r145, r1311; +} +{ +add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs176, fd900; +} +mov.b32 r1317, {rs176, rs176}; +{ +mul.f16x2 r1315, r154, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs177, fd899; +} +mov.b32 r1323, {rs177, rs177}; +{ +mul.f16x2 r1321, r148, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs178, fd900; +} +mov.b32 r1329, {rs178, rs178}; +{ +mul.f16x2 r1327, r151, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs179, fd875; +} +mov.b32 r1335, {rs179, rs179}; +{ +mul.f16x2 r1333, r157, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs180, fd876; +} +mov.b32 r1341, {rs180, rs180}; +{ +mul.f16x2 r1339, r166, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs181, fd875; +} +mov.b32 r1347, {rs181, rs181}; +{ +mul.f16x2 r1345, r160, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ 
+cvt.rn.f16.f64 rs182, fd876; +} +mov.b32 r1353, {rs182, rs182}; +{ +mul.f16x2 r1351, r163, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs183, fd851; +} +mov.b32 r1359, {rs183, rs183}; +{ +mul.f16x2 r1357, r169, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs184, fd852; +} +mov.b32 r1365, {rs184, rs184}; +{ +mul.f16x2 r1363, r178, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 rs185, fd851; +} +mov.b32 r1371, {rs185, rs185}; +{ +mul.f16x2 r1369, r172, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +cvt.rn.f16.f64 rs186, fd852; +} +mov.b32 r1377, {rs186, rs186}; +{ +mul.f16x2 r1375, r175, r1377; +} +{ +add.f16x2 r1378, r1354, r1375; +} +{ +sub.f16x2 %6, r1360, r1366; +} +{ +add.f16x2 %7, r1372, r1378; +} +{ +add.f16x2 %56, r1360, r1366; +} +{ +sub.f16x2 %57, r1372, r1378; +} +cvt.rn.f16.s32 rs187, r5508; +mov.b32 r1405, {rs187, rs187}; +cvt.rn.f16.s32 rs188, r5508; +mov.b32 r1417, {rs188, rs188}; +{ +cvt.rn.f16.f64 rs189, fd871; +} +mov.b32 r1397, {rs189, rs189}; +{ +mul.f16x2 r1395, r1, r1397; +} +{ +add.f16x2 r1398, %62, r1395; +} +{ +cvt.rn.f16.f64 rs190, fd624; +} +mov.b32 r1403, {rs190, rs190}; +{ +mul.f16x2 r1401, r10, r1403; +} +{ +add.f16x2 r1404, r1405, r1401; +} +{ +cvt.rn.f16.f64 rs191, fd871; +} +mov.b32 r1409, {rs191, rs191}; +{ +mul.f16x2 r1407, r4, r1409; +} +{ +add.f16x2 r1410, %63, r1407; +} +{ +cvt.rn.f16.f64 rs192, fd624; +} +mov.b32 r1415, {rs192, rs192}; +{ +mul.f16x2 r1413, r7, r1415; +} +{ +add.f16x2 r1416, r1417, r1413; +} +{ +cvt.rn.f16.f64 rs193, fd899; +} +mov.b32 r1421, {rs193, rs193}; +{ +mul.f16x2 r1419, r13, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs194, fd900; +} +mov.b32 r1427, {rs194, rs194}; +{ +mul.f16x2 r1425, r22, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs195, fd899; +} +mov.b32 r1433, {rs195, rs195}; +{ +mul.f16x2 r1431, r16, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs196, fd900; 
+} +mov.b32 r1439, {rs196, rs196}; +{ +mul.f16x2 r1437, r19, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs197, fd867; +} +mov.b32 r1445, {rs197, rs197}; +{ +mul.f16x2 r1443, r25, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs198, fd868; +} +mov.b32 r1451, {rs198, rs198}; +{ +mul.f16x2 r1449, r34, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs199, fd867; +} +mov.b32 r1457, {rs199, rs199}; +{ +mul.f16x2 r1455, r28, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs200, fd868; +} +mov.b32 r1463, {rs200, rs200}; +{ +mul.f16x2 r1461, r31, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs201, fd843; +} +mov.b32 r1469, {rs201, rs201}; +{ +mul.f16x2 r1467, r37, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs202, fd820; +} +mov.b32 r1475, {rs202, rs202}; +{ +mul.f16x2 r1473, r46, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs203, fd843; +} +mov.b32 r1481, {rs203, rs203}; +{ +mul.f16x2 r1479, r40, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs204, fd820; +} +mov.b32 r1487, {rs204, rs204}; +{ +mul.f16x2 r1485, r43, r1487; +} +{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs205, fd875; +} +mov.b32 r1493, {rs205, rs205}; +{ +mul.f16x2 r1491, r49, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs206, fd708; +} +mov.b32 r1499, {rs206, rs206}; +{ +mul.f16x2 r1497, r58, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs207, fd875; +} +mov.b32 r1505, {rs207, rs207}; +{ +mul.f16x2 r1503, r52, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs208, fd708; +} +mov.b32 r1511, {rs208, rs208}; +{ +mul.f16x2 r1509, r55, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs209, fd895; +} +mov.b32 r1517, {rs209, rs209}; +{ +mul.f16x2 r1515, r61, r1517; +} +{ +add.f16x2 r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs210, fd896; +} +mov.b32 
r1523, {rs210, rs210}; +{ +mul.f16x2 r1521, r70, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs211, fd895; +} +mov.b32 r1529, {rs211, rs211}; +{ +mul.f16x2 r1527, r64, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs212, fd896; +} +mov.b32 r1535, {rs212, rs212}; +{ +mul.f16x2 r1533, r67, r1535; +} +{ +add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs213, fd863; +} +mov.b32 r1541, {rs213, rs213}; +{ +mul.f16x2 r1539, r73, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs214, fd864; +} +mov.b32 r1547, {rs214, rs214}; +{ +mul.f16x2 r1545, r82, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs215, fd863; +} +mov.b32 r1553, {rs215, rs215}; +{ +mul.f16x2 r1551, r76, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs216, fd864; +} +mov.b32 r1559, {rs216, rs216}; +{ +mul.f16x2 r1557, r79, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs217, fd847; +} +mov.b32 r1565, {rs217, rs217}; +{ +mul.f16x2 r1563, r85, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ +cvt.rn.f16.f64 rs218, fd768; +} +mov.b32 r1571, {rs218, rs218}; +{ +mul.f16x2 r1569, r94, r1571; +} +{ +add.f16x2 r1572, r1548, r1569; +} +{ +cvt.rn.f16.f64 rs219, fd847; +} +mov.b32 r1577, {rs219, rs219}; +{ +mul.f16x2 r1575, r88, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs220, fd768; +} +mov.b32 r1583, {rs220, rs220}; +{ +mul.f16x2 r1581, r91, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs221, fd879; +} +mov.b32 r1589, {rs221, rs221}; +{ +mul.f16x2 r1587, r97, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs222, fd808; +} +mov.b32 r1595, {rs222, rs222}; +{ +mul.f16x2 r1593, r106, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +{ +cvt.rn.f16.f64 rs223, fd879; +} +mov.b32 r1601, {rs223, rs223}; +{ +mul.f16x2 r1599, r100, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs224, fd808; +} +mov.b32 r1607, 
{rs224, rs224}; +{ +mul.f16x2 r1605, r103, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs225, fd891; +} +mov.b32 r1613, {rs225, rs225}; +{ +mul.f16x2 r1611, r109, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs226, fd892; +} +mov.b32 r1619, {rs226, rs226}; +{ +mul.f16x2 r1617, r118, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs227, fd891; +} +mov.b32 r1625, {rs227, rs227}; +{ +mul.f16x2 r1623, r112, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs228, fd892; +} +mov.b32 r1631, {rs228, rs228}; +{ +mul.f16x2 r1629, r115, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs229, fd859; +} +mov.b32 r1637, {rs229, rs229}; +{ +mul.f16x2 r1635, r121, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +cvt.rn.f16.f64 rs230, fd860; +} +mov.b32 r1643, {rs230, rs230}; +{ +mul.f16x2 r1641, r130, r1643; +} +{ +add.f16x2 r1644, r1620, r1641; +} +{ +cvt.rn.f16.f64 rs231, fd859; +} +mov.b32 r1649, {rs231, rs231}; +{ +mul.f16x2 r1647, r124, r1649; +} +{ +add.f16x2 r1650, r1626, r1647; +} +{ +cvt.rn.f16.f64 rs232, fd860; +} +mov.b32 r1655, {rs232, rs232}; +{ +mul.f16x2 r1653, r127, r1655; +} +{ +add.f16x2 r1656, r1632, r1653; +} +{ +cvt.rn.f16.f64 rs233, fd851; +} +mov.b32 r1661, {rs233, rs233}; +{ +mul.f16x2 r1659, r133, r1661; +} +{ +add.f16x2 r1662, r1638, r1659; +} +mov.f64 fd676, 0d3FD328C3F1B322CB; +{ +cvt.rn.f16.f64 rs234, fd676; +} +mov.b32 r1667, {rs234, rs234}; +{ +mul.f16x2 r1665, r142, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs235, fd851; +} +mov.b32 r1673, {rs235, rs235}; +{ +mul.f16x2 r1671, r136, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs236, fd676; +} +mov.b32 r1679, {rs236, rs236}; +{ +mul.f16x2 r1677, r139, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs237, fd883; +} +mov.b32 r1685, {rs237, rs237}; +{ +mul.f16x2 r1683, r145, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ 
+cvt.rn.f16.f64 rs238, fd736; +} +mov.b32 r1691, {rs238, rs238}; +{ +mul.f16x2 r1689, r154, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs239, fd883; +} +mov.b32 r1697, {rs239, rs239}; +{ +mul.f16x2 r1695, r148, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs240, fd736; +} +mov.b32 r1703, {rs240, rs240}; +{ +mul.f16x2 r1701, r151, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs241, fd887; +} +mov.b32 r1709, {rs241, rs241}; +{ +mul.f16x2 r1707, r157, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +mov.f64 fd888, 0d3FEE0210C26A6E6F; +{ +cvt.rn.f16.f64 rs242, fd888; +} +mov.b32 r1715, {rs242, rs242}; +{ +mul.f16x2 r1713, r166, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs243, fd887; +} +mov.b32 r1721, {rs243, rs243}; +{ +mul.f16x2 r1719, r160, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs244, fd888; +} +mov.b32 r1727, {rs244, rs244}; +{ +mul.f16x2 r1725, r163, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs245, fd855; +} +mov.b32 r1733, {rs245, rs245}; +{ +mul.f16x2 r1731, r169, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} +mov.f64 fd856, 0d3FD93D20572CA90B; +{ +cvt.rn.f16.f64 rs246, fd856; +} +mov.b32 r1739, {rs246, rs246}; +{ +mul.f16x2 r1737, r178, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs247, fd855; +} +mov.b32 r1745, {rs247, rs247}; +{ +mul.f16x2 r1743, r172, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs248, fd856; +} +mov.b32 r1751, {rs248, rs248}; +{ +mul.f16x2 r1749, r175, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +sub.f16x2 %8, r1734, r1740; +} +{ +add.f16x2 %9, r1746, r1752; +} +{ +add.f16x2 %54, r1734, r1740; +} +{ +sub.f16x2 %55, r1746, r1752; +} +cvt.rn.f16.s32 rs249, r5508; +mov.b32 r1779, {rs249, rs249}; +cvt.rn.f16.s32 rs250, r5508; +mov.b32 r1791, {rs250, rs250}; +{ +cvt.rn.f16.f64 rs251, fd879; +} +mov.b32 r1771, {rs251, rs251}; +{ +mul.f16x2 r1769, 
r1, r1771; +} +{ +add.f16x2 r1772, %62, r1769; +} +{ +cvt.rn.f16.f64 rs252, fd808; +} +mov.b32 r1777, {rs252, rs252}; +{ +mul.f16x2 r1775, r10, r1777; +} +{ +add.f16x2 r1778, r1779, r1775; +} +{ +cvt.rn.f16.f64 rs253, fd879; +} +mov.b32 r1783, {rs253, rs253}; +{ +mul.f16x2 r1781, r4, r1783; +} +{ +add.f16x2 r1784, %63, r1781; +} +{ +cvt.rn.f16.f64 rs254, fd808; +} +mov.b32 r1789, {rs254, rs254}; +{ +mul.f16x2 r1787, r7, r1789; +} +{ +add.f16x2 r1790, r1791, r1787; +} +{ +cvt.rn.f16.f64 rs255, fd883; +} +mov.b32 r1795, {rs255, rs255}; +{ +mul.f16x2 r1793, r13, r1795; +} +{ +add.f16x2 r1796, r1772, r1793; +} +{ +cvt.rn.f16.f64 rs256, fd884; +} +mov.b32 r1801, {rs256, rs256}; +{ +mul.f16x2 r1799, r22, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs257, fd883; +} +mov.b32 r1807, {rs257, rs257}; +{ +mul.f16x2 r1805, r16, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs258, fd884; +} +mov.b32 r1813, {rs258, rs258}; +{ +mul.f16x2 r1811, r19, r1813; +} +{ +add.f16x2 r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs259, fd843; +} +mov.b32 r1819, {rs259, rs259}; +{ +mul.f16x2 r1817, r25, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ +cvt.rn.f16.f64 rs260, fd844; +} +mov.b32 r1825, {rs260, rs260}; +{ +mul.f16x2 r1823, r34, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs261, fd843; +} +mov.b32 r1831, {rs261, rs261}; +{ +mul.f16x2 r1829, r28, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs262, fd844; +} +mov.b32 r1837, {rs262, rs262}; +{ +mul.f16x2 r1835, r31, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs263, fd875; +} +mov.b32 r1843, {rs263, rs263}; +{ +mul.f16x2 r1841, r37, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs264, fd708; +} +mov.b32 r1849, {rs264, rs264}; +{ +mul.f16x2 r1847, r46, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs265, fd875; +} +mov.b32 r1855, {rs265, rs265}; +{ +mul.f16x2 r1853, r40, r1855; +} +{ 
+add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs266, fd708; +} +mov.b32 r1861, {rs266, rs266}; +{ +mul.f16x2 r1859, r43, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs267, fd887; +} +mov.b32 r1867, {rs267, rs267}; +{ +mul.f16x2 r1865, r49, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs268, fd888; +} +mov.b32 r1873, {rs268, rs268}; +{ +mul.f16x2 r1871, r58, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs269, fd887; +} +mov.b32 r1879, {rs269, rs269}; +{ +mul.f16x2 r1877, r52, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs270, fd888; +} +mov.b32 r1885, {rs270, rs270}; +{ +mul.f16x2 r1883, r55, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs271, fd847; +} +mov.b32 r1891, {rs271, rs271}; +{ +mul.f16x2 r1889, r61, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs272, fd848; +} +mov.b32 r1897, {rs272, rs272}; +{ +mul.f16x2 r1895, r70, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs273, fd847; +} +mov.b32 r1903, {rs273, rs273}; +{ +mul.f16x2 r1901, r64, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ +cvt.rn.f16.f64 rs274, fd848; +} +mov.b32 r1909, {rs274, rs274}; +{ +mul.f16x2 r1907, r67, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs275, fd871; +} +mov.b32 r1915, {rs275, rs275}; +{ +mul.f16x2 r1913, r73, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs276, fd624; +} +mov.b32 r1921, {rs276, rs276}; +{ +mul.f16x2 r1919, r82, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs277, fd871; +} +mov.b32 r1927, {rs277, rs277}; +{ +mul.f16x2 r1925, r76, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs278, fd624; +} +mov.b32 r1933, {rs278, rs278}; +{ +mul.f16x2 r1931, r79, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs279, fd891; +} +mov.b32 r1939, {rs279, rs279}; +{ +mul.f16x2 r1937, r85, r1939; +} +{ +add.f16x2 
r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs280, fd892; +} +mov.b32 r1945, {rs280, rs280}; +{ +mul.f16x2 r1943, r94, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs281, fd891; +} +mov.b32 r1951, {rs281, rs281}; +{ +mul.f16x2 r1949, r88, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs282, fd892; +} +mov.b32 r1957, {rs282, rs282}; +{ +mul.f16x2 r1955, r91, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs283, fd851; +} +mov.b32 r1963, {rs283, rs283}; +{ +mul.f16x2 r1961, r97, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs284, fd852; +} +mov.b32 r1969, {rs284, rs284}; +{ +mul.f16x2 r1967, r106, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs285, fd851; +} +mov.b32 r1975, {rs285, rs285}; +{ +mul.f16x2 r1973, r100, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs286, fd852; +} +mov.b32 r1981, {rs286, rs286}; +{ +mul.f16x2 r1979, r103, r1981; +} +{ +add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs287, fd867; +} +mov.b32 r1987, {rs287, rs287}; +{ +mul.f16x2 r1985, r109, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +mov.f64 fd812, 0d3FE4D80B1AD9CCF6; +{ +cvt.rn.f16.f64 rs288, fd812; +} +mov.b32 r1993, {rs288, rs288}; +{ +mul.f16x2 r1991, r118, r1993; +} +{ +add.f16x2 r1994, r1970, r1991; +} +{ +cvt.rn.f16.f64 rs289, fd867; +} +mov.b32 r1999, {rs289, rs289}; +{ +mul.f16x2 r1997, r112, r1999; +} +{ +add.f16x2 r2000, r1976, r1997; +} +{ +cvt.rn.f16.f64 rs290, fd812; +} +mov.b32 r2005, {rs290, rs290}; +{ +mul.f16x2 r2003, r115, r2005; +} +{ +add.f16x2 r2006, r1982, r2003; +} +{ +cvt.rn.f16.f64 rs291, fd895; +} +mov.b32 r2011, {rs291, rs291}; +{ +mul.f16x2 r2009, r121, r2011; +} +{ +add.f16x2 r2012, r1988, r2009; +} +{ +cvt.rn.f16.f64 rs292, fd896; +} +mov.b32 r2017, {rs292, rs292}; +{ +mul.f16x2 r2015, r130, r2017; +} +{ +add.f16x2 r2018, r1994, r2015; +} +{ +cvt.rn.f16.f64 rs293, fd895; +} +mov.b32 r2023, {rs293, rs293}; +{ +mul.f16x2 
r2021, r124, r2023; +} +{ +add.f16x2 r2024, r2000, r2021; +} +{ +cvt.rn.f16.f64 rs294, fd896; +} +mov.b32 r2029, {rs294, rs294}; +{ +mul.f16x2 r2027, r127, r2029; +} +{ +add.f16x2 r2030, r2006, r2027; +} +{ +cvt.rn.f16.f64 rs295, fd855; +} +mov.b32 r2035, {rs295, rs295}; +{ +mul.f16x2 r2033, r133, r2035; +} +{ +add.f16x2 r2036, r2012, r2033; +} +{ +cvt.rn.f16.f64 rs296, fd856; +} +mov.b32 r2041, {rs296, rs296}; +{ +mul.f16x2 r2039, r142, r2041; +} +{ +add.f16x2 r2042, r2018, r2039; +} +{ +cvt.rn.f16.f64 rs297, fd855; +} +mov.b32 r2047, {rs297, rs297}; +{ +mul.f16x2 r2045, r136, r2047; +} +{ +add.f16x2 r2048, r2024, r2045; +} +{ +cvt.rn.f16.f64 rs298, fd856; +} +mov.b32 r2053, {rs298, rs298}; +{ +mul.f16x2 r2051, r139, r2053; +} +{ +add.f16x2 r2054, r2030, r2051; +} +{ +cvt.rn.f16.f64 rs299, fd863; +} +mov.b32 r2059, {rs299, rs299}; +{ +mul.f16x2 r2057, r145, r2059; +} +{ +add.f16x2 r2060, r2036, r2057; +} +{ +cvt.rn.f16.f64 rs300, fd740; +} +mov.b32 r2065, {rs300, rs300}; +{ +mul.f16x2 r2063, r154, r2065; +} +{ +add.f16x2 r2066, r2042, r2063; +} +{ +cvt.rn.f16.f64 rs301, fd863; +} +mov.b32 r2071, {rs301, rs301}; +{ +mul.f16x2 r2069, r148, r2071; +} +{ +add.f16x2 r2072, r2048, r2069; +} +{ +cvt.rn.f16.f64 rs302, fd740; +} +mov.b32 r2077, {rs302, rs302}; +{ +mul.f16x2 r2075, r151, r2077; +} +{ +add.f16x2 r2078, r2054, r2075; +} +{ +cvt.rn.f16.f64 rs303, fd899; +} +mov.b32 r2083, {rs303, rs303}; +{ +mul.f16x2 r2081, r157, r2083; +} +{ +add.f16x2 r2084, r2060, r2081; +} +{ +cvt.rn.f16.f64 rs304, fd900; +} +mov.b32 r2089, {rs304, rs304}; +{ +mul.f16x2 r2087, r166, r2089; +} +{ +add.f16x2 r2090, r2066, r2087; +} +{ +cvt.rn.f16.f64 rs305, fd899; +} +mov.b32 r2095, {rs305, rs305}; +{ +mul.f16x2 r2093, r160, r2095; +} +{ +add.f16x2 r2096, r2072, r2093; +} +{ +cvt.rn.f16.f64 rs306, fd900; +} +mov.b32 r2101, {rs306, rs306}; +{ +mul.f16x2 r2099, r163, r2101; +} +{ +add.f16x2 r2102, r2078, r2099; +} +{ +cvt.rn.f16.f64 rs307, fd859; +} +mov.b32 r2107, {rs307, rs307}; +{ 
+mul.f16x2 r2105, r169, r2107; +} +{ +add.f16x2 r2108, r2084, r2105; +} +{ +cvt.rn.f16.f64 rs308, fd860; +} +mov.b32 r2113, {rs308, rs308}; +{ +mul.f16x2 r2111, r178, r2113; +} +{ +add.f16x2 r2114, r2090, r2111; +} +{ +cvt.rn.f16.f64 rs309, fd859; +} +mov.b32 r2119, {rs309, rs309}; +{ +mul.f16x2 r2117, r172, r2119; +} +{ +add.f16x2 r2120, r2096, r2117; +} +{ +cvt.rn.f16.f64 rs310, fd860; +} +mov.b32 r2125, {rs310, rs310}; +{ +mul.f16x2 r2123, r175, r2125; +} +{ +add.f16x2 r2126, r2102, r2123; +} +{ +sub.f16x2 %10, r2108, r2114; +} +{ +add.f16x2 %11, r2120, r2126; +} +{ +add.f16x2 %52, r2108, r2114; +} +{ +sub.f16x2 %53, r2120, r2126; +} +cvt.rn.f16.s32 rs311, r5508; +mov.b32 r2153, {rs311, rs311}; +cvt.rn.f16.s32 rs312, r5508; +mov.b32 r2165, {rs312, rs312}; +{ +cvt.rn.f16.f64 rs313, fd887; +} +mov.b32 r2145, {rs313, rs313}; +{ +mul.f16x2 r2143, r1, r2145; +} +{ +add.f16x2 r2146, %62, r2143; +} +{ +cvt.rn.f16.f64 rs314, fd760; +} +mov.b32 r2151, {rs314, rs314}; +{ +mul.f16x2 r2149, r10, r2151; +} +{ +add.f16x2 r2152, r2153, r2149; +} +{ +cvt.rn.f16.f64 rs315, fd887; +} +mov.b32 r2157, {rs315, rs315}; +{ +mul.f16x2 r2155, r4, r2157; +} +{ +add.f16x2 r2158, %63, r2155; +} +{ +cvt.rn.f16.f64 rs316, fd760; +} +mov.b32 r2163, {rs316, rs316}; +{ +mul.f16x2 r2161, r7, r2163; +} +{ +add.f16x2 r2164, r2165, r2161; +} +{ +cvt.rn.f16.f64 rs317, fd867; +} +mov.b32 r2169, {rs317, rs317}; +{ +mul.f16x2 r2167, r13, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs318, fd868; +} +mov.b32 r2175, {rs318, rs318}; +{ +mul.f16x2 r2173, r22, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs319, fd867; +} +mov.b32 r2181, {rs319, rs319}; +{ +mul.f16x2 r2179, r16, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs320, fd868; +} +mov.b32 r2187, {rs320, rs320}; +{ +mul.f16x2 r2185, r19, r2187; +} +{ +add.f16x2 r2188, r2164, r2185; +} +{ +cvt.rn.f16.f64 rs321, fd859; +} +mov.b32 r2193, {rs321, rs321}; +{ +mul.f16x2 r2191, r25, r2193; 
+} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs322, fd572; +} +mov.b32 r2199, {rs322, rs322}; +{ +mul.f16x2 r2197, r34, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs323, fd859; +} +mov.b32 r2205, {rs323, rs323}; +{ +mul.f16x2 r2203, r28, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs324, fd572; +} +mov.b32 r2211, {rs324, rs324}; +{ +mul.f16x2 r2209, r31, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs325, fd895; +} +mov.b32 r2217, {rs325, rs325}; +{ +mul.f16x2 r2215, r37, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +cvt.rn.f16.f64 rs326, fd896; +} +mov.b32 r2223, {rs326, rs326}; +{ +mul.f16x2 r2221, r46, r2223; +} +{ +add.f16x2 r2224, r2200, r2221; +} +{ +cvt.rn.f16.f64 rs327, fd895; +} +mov.b32 r2229, {rs327, rs327}; +{ +mul.f16x2 r2227, r40, r2229; +} +{ +add.f16x2 r2230, r2206, r2227; +} +{ +cvt.rn.f16.f64 rs328, fd896; +} +mov.b32 r2235, {rs328, rs328}; +{ +mul.f16x2 r2233, r43, r2235; +} +{ +add.f16x2 r2236, r2212, r2233; +} +{ +cvt.rn.f16.f64 rs329, fd847; +} +mov.b32 r2241, {rs329, rs329}; +{ +mul.f16x2 r2239, r49, r2241; +} +{ +add.f16x2 r2242, r2218, r2239; +} +{ +cvt.rn.f16.f64 rs330, fd848; +} +mov.b32 r2247, {rs330, rs330}; +{ +mul.f16x2 r2245, r58, r2247; +} +{ +add.f16x2 r2248, r2224, r2245; +} +{ +cvt.rn.f16.f64 rs331, fd847; +} +mov.b32 r2253, {rs331, rs331}; +{ +mul.f16x2 r2251, r52, r2253; +} +{ +add.f16x2 r2254, r2230, r2251; +} +{ +cvt.rn.f16.f64 rs332, fd848; +} +mov.b32 r2259, {rs332, rs332}; +{ +mul.f16x2 r2257, r55, r2259; +} +{ +add.f16x2 r2260, r2236, r2257; +} +{ +cvt.rn.f16.f64 rs333, fd879; +} +mov.b32 r2265, {rs333, rs333}; +{ +mul.f16x2 r2263, r61, r2265; +} +{ +add.f16x2 r2266, r2242, r2263; +} +{ +cvt.rn.f16.f64 rs334, fd808; +} +mov.b32 r2271, {rs334, rs334}; +{ +mul.f16x2 r2269, r70, r2271; +} +{ +add.f16x2 r2272, r2248, r2269; +} +{ +cvt.rn.f16.f64 rs335, fd879; +} +mov.b32 r2277, {rs335, rs335}; +{ +mul.f16x2 r2275, r64, r2277; +} +{ 
+add.f16x2 r2278, r2254, r2275; +} +{ +cvt.rn.f16.f64 rs336, fd808; +} +mov.b32 r2283, {rs336, rs336}; +{ +mul.f16x2 r2281, r67, r2283; +} +{ +add.f16x2 r2284, r2260, r2281; +} +{ +cvt.rn.f16.f64 rs337, fd875; +} +mov.b32 r2289, {rs337, rs337}; +{ +mul.f16x2 r2287, r73, r2289; +} +{ +add.f16x2 r2290, r2266, r2287; +} +{ +cvt.rn.f16.f64 rs338, fd876; +} +mov.b32 r2295, {rs338, rs338}; +{ +mul.f16x2 r2293, r82, r2295; +} +{ +add.f16x2 r2296, r2272, r2293; +} +{ +cvt.rn.f16.f64 rs339, fd875; +} +mov.b32 r2301, {rs339, rs339}; +{ +mul.f16x2 r2299, r76, r2301; +} +{ +add.f16x2 r2302, r2278, r2299; +} +{ +cvt.rn.f16.f64 rs340, fd876; +} +mov.b32 r2307, {rs340, rs340}; +{ +mul.f16x2 r2305, r79, r2307; +} +{ +add.f16x2 r2308, r2284, r2305; +} +{ +cvt.rn.f16.f64 rs341, fd851; +} +mov.b32 r2313, {rs341, rs341}; +{ +mul.f16x2 r2311, r85, r2313; +} +{ +add.f16x2 r2314, r2290, r2311; +} +{ +cvt.rn.f16.f64 rs342, fd676; +} +mov.b32 r2319, {rs342, rs342}; +{ +mul.f16x2 r2317, r94, r2319; +} +{ +add.f16x2 r2320, r2296, r2317; +} +{ +cvt.rn.f16.f64 rs343, fd851; +} +mov.b32 r2325, {rs343, rs343}; +{ +mul.f16x2 r2323, r88, r2325; +} +{ +add.f16x2 r2326, r2302, r2323; +} +{ +cvt.rn.f16.f64 rs344, fd676; +} +mov.b32 r2331, {rs344, rs344}; +{ +mul.f16x2 r2329, r91, r2331; +} +{ +add.f16x2 r2332, r2308, r2329; +} +{ +cvt.rn.f16.f64 rs345, fd899; +} +mov.b32 r2337, {rs345, rs345}; +{ +mul.f16x2 r2335, r97, r2337; +} +{ +add.f16x2 r2338, r2314, r2335; +} +mov.f64 fd504, 0d3FEFF57C5208CCF9; +{ +cvt.rn.f16.f64 rs346, fd504; +} +mov.b32 r2343, {rs346, rs346}; +{ +mul.f16x2 r2341, r106, r2343; +} +{ +add.f16x2 r2344, r2320, r2341; +} +{ +cvt.rn.f16.f64 rs347, fd899; +} +mov.b32 r2349, {rs347, rs347}; +{ +mul.f16x2 r2347, r100, r2349; +} +{ +add.f16x2 r2350, r2326, r2347; +} +{ +cvt.rn.f16.f64 rs348, fd504; +} +mov.b32 r2355, {rs348, rs348}; +{ +mul.f16x2 r2353, r103, r2355; +} +{ +add.f16x2 r2356, r2332, r2353; +} +{ +cvt.rn.f16.f64 rs349, fd855; +} +mov.b32 r2361, {rs349, rs349}; +{ 
+mul.f16x2 r2359, r109, r2361; +} +{ +add.f16x2 r2362, r2338, r2359; +} +{ +cvt.rn.f16.f64 rs350, fd856; +} +mov.b32 r2367, {rs350, rs350}; +{ +mul.f16x2 r2365, r118, r2367; +} +{ +add.f16x2 r2368, r2344, r2365; +} +{ +cvt.rn.f16.f64 rs351, fd855; +} +mov.b32 r2373, {rs351, rs351}; +{ +mul.f16x2 r2371, r112, r2373; +} +{ +add.f16x2 r2374, r2350, r2371; +} +{ +cvt.rn.f16.f64 rs352, fd856; +} +mov.b32 r2379, {rs352, rs352}; +{ +mul.f16x2 r2377, r115, r2379; +} +{ +add.f16x2 r2380, r2356, r2377; +} +{ +cvt.rn.f16.f64 rs353, fd871; +} +mov.b32 r2385, {rs353, rs353}; +{ +mul.f16x2 r2383, r121, r2385; +} +{ +add.f16x2 r2386, r2362, r2383; +} +{ +cvt.rn.f16.f64 rs354, fd624; +} +mov.b32 r2391, {rs354, rs354}; +{ +mul.f16x2 r2389, r130, r2391; +} +{ +add.f16x2 r2392, r2368, r2389; +} +{ +cvt.rn.f16.f64 rs355, fd871; +} +mov.b32 r2397, {rs355, rs355}; +{ +mul.f16x2 r2395, r124, r2397; +} +{ +add.f16x2 r2398, r2374, r2395; +} +{ +cvt.rn.f16.f64 rs356, fd624; +} +mov.b32 r2403, {rs356, rs356}; +{ +mul.f16x2 r2401, r127, r2403; +} +{ +add.f16x2 r2404, r2380, r2401; +} +{ +cvt.rn.f16.f64 rs357, fd883; +} +mov.b32 r2409, {rs357, rs357}; +{ +mul.f16x2 r2407, r133, r2409; +} +{ +add.f16x2 r2410, r2386, r2407; +} +{ +cvt.rn.f16.f64 rs358, fd884; +} +mov.b32 r2415, {rs358, rs358}; +{ +mul.f16x2 r2413, r142, r2415; +} +{ +add.f16x2 r2416, r2392, r2413; +} +{ +cvt.rn.f16.f64 rs359, fd883; +} +mov.b32 r2421, {rs359, rs359}; +{ +mul.f16x2 r2419, r136, r2421; +} +{ +add.f16x2 r2422, r2398, r2419; +} +{ +cvt.rn.f16.f64 rs360, fd884; +} +mov.b32 r2427, {rs360, rs360}; +{ +mul.f16x2 r2425, r139, r2427; +} +{ +add.f16x2 r2428, r2404, r2425; +} +{ +cvt.rn.f16.f64 rs361, fd843; +} +mov.b32 r2433, {rs361, rs361}; +{ +mul.f16x2 r2431, r145, r2433; +} +{ +add.f16x2 r2434, r2410, r2431; +} +{ +cvt.rn.f16.f64 rs362, fd820; +} +mov.b32 r2439, {rs362, rs362}; +{ +mul.f16x2 r2437, r154, r2439; +} +{ +add.f16x2 r2440, r2416, r2437; +} +{ +cvt.rn.f16.f64 rs363, fd843; +} +mov.b32 r2445, {rs363, rs363}; 
+{ +mul.f16x2 r2443, r148, r2445; +} +{ +add.f16x2 r2446, r2422, r2443; +} +{ +cvt.rn.f16.f64 rs364, fd820; +} +mov.b32 r2451, {rs364, rs364}; +{ +mul.f16x2 r2449, r151, r2451; +} +{ +add.f16x2 r2452, r2428, r2449; +} +{ +cvt.rn.f16.f64 rs365, fd891; +} +mov.b32 r2457, {rs365, rs365}; +{ +mul.f16x2 r2455, r157, r2457; +} +{ +add.f16x2 r2458, r2434, r2455; +} +{ +cvt.rn.f16.f64 rs366, fd804; +} +mov.b32 r2463, {rs366, rs366}; +{ +mul.f16x2 r2461, r166, r2463; +} +{ +add.f16x2 r2464, r2440, r2461; +} +{ +cvt.rn.f16.f64 rs367, fd891; +} +mov.b32 r2469, {rs367, rs367}; +{ +mul.f16x2 r2467, r160, r2469; +} +{ +add.f16x2 r2470, r2446, r2467; +} +{ +cvt.rn.f16.f64 rs368, fd804; +} +mov.b32 r2475, {rs368, rs368}; +{ +mul.f16x2 r2473, r163, r2475; +} +{ +add.f16x2 r2476, r2452, r2473; +} +{ +cvt.rn.f16.f64 rs369, fd863; +} +mov.b32 r2481, {rs369, rs369}; +{ +mul.f16x2 r2479, r169, r2481; +} +{ +add.f16x2 r2482, r2458, r2479; +} +{ +cvt.rn.f16.f64 rs370, fd864; +} +mov.b32 r2487, {rs370, rs370}; +{ +mul.f16x2 r2485, r178, r2487; +} +{ +add.f16x2 r2488, r2464, r2485; +} +{ +cvt.rn.f16.f64 rs371, fd863; +} +mov.b32 r2493, {rs371, rs371}; +{ +mul.f16x2 r2491, r172, r2493; +} +{ +add.f16x2 r2494, r2470, r2491; +} +{ +cvt.rn.f16.f64 rs372, fd864; +} +mov.b32 r2499, {rs372, rs372}; +{ +mul.f16x2 r2497, r175, r2499; +} +{ +add.f16x2 r2500, r2476, r2497; +} +{ +sub.f16x2 %12, r2482, r2488; +} +{ +add.f16x2 %13, r2494, r2500; +} +{ +add.f16x2 %50, r2482, r2488; +} +{ +sub.f16x2 %51, r2494, r2500; +} +cvt.rn.f16.s32 rs373, r5508; +mov.b32 r2527, {rs373, rs373}; +cvt.rn.f16.s32 rs374, r5508; +mov.b32 r2539, {rs374, rs374}; +{ +cvt.rn.f16.f64 rs375, fd895; +} +mov.b32 r2519, {rs375, rs375}; +{ +mul.f16x2 r2517, r1, r2519; +} +{ +add.f16x2 r2520, %62, r2517; +} +{ +cvt.rn.f16.f64 rs376, fd580; +} +mov.b32 r2525, {rs376, rs376}; +{ +mul.f16x2 r2523, r10, r2525; +} +{ +add.f16x2 r2526, r2527, r2523; +} +{ +cvt.rn.f16.f64 rs377, fd895; +} +mov.b32 r2531, {rs377, rs377}; +{ +mul.f16x2 r2529, 
r4, r2531; +} +{ +add.f16x2 r2532, %63, r2529; +} +{ +cvt.rn.f16.f64 rs378, fd580; +} +mov.b32 r2537, {rs378, rs378}; +{ +mul.f16x2 r2535, r7, r2537; +} +{ +add.f16x2 r2538, r2539, r2535; +} +{ +cvt.rn.f16.f64 rs379, fd851; +} +mov.b32 r2543, {rs379, rs379}; +{ +mul.f16x2 r2541, r13, r2543; +} +{ +add.f16x2 r2544, r2520, r2541; +} +{ +cvt.rn.f16.f64 rs380, fd852; +} +mov.b32 r2549, {rs380, rs380}; +{ +mul.f16x2 r2547, r22, r2549; +} +{ +add.f16x2 r2550, r2526, r2547; +} +{ +cvt.rn.f16.f64 rs381, fd851; +} +mov.b32 r2555, {rs381, rs381}; +{ +mul.f16x2 r2553, r16, r2555; +} +{ +add.f16x2 r2556, r2532, r2553; +} +{ +cvt.rn.f16.f64 rs382, fd852; +} +mov.b32 r2561, {rs382, rs382}; +{ +mul.f16x2 r2559, r19, r2561; +} +{ +add.f16x2 r2562, r2538, r2559; +} +{ +cvt.rn.f16.f64 rs383, fd883; +} +mov.b32 r2567, {rs383, rs383}; +{ +mul.f16x2 r2565, r25, r2567; +} +{ +add.f16x2 r2568, r2544, r2565; +} +{ +cvt.rn.f16.f64 rs384, fd736; +} +mov.b32 r2573, {rs384, rs384}; +{ +mul.f16x2 r2571, r34, r2573; +} +{ +add.f16x2 r2574, r2550, r2571; +} +{ +cvt.rn.f16.f64 rs385, fd883; +} +mov.b32 r2579, {rs385, rs385}; +{ +mul.f16x2 r2577, r28, r2579; +} +{ +add.f16x2 r2580, r2556, r2577; +} +{ +cvt.rn.f16.f64 rs386, fd736; +} +mov.b32 r2585, {rs386, rs386}; +{ +mul.f16x2 r2583, r31, r2585; +} +{ +add.f16x2 r2586, r2562, r2583; +} +{ +cvt.rn.f16.f64 rs387, fd863; +} +mov.b32 r2591, {rs387, rs387}; +{ +mul.f16x2 r2589, r37, r2591; +} +{ +add.f16x2 r2592, r2568, r2589; +} +{ +cvt.rn.f16.f64 rs388, fd864; +} +mov.b32 r2597, {rs388, rs388}; +{ +mul.f16x2 r2595, r46, r2597; +} +{ +add.f16x2 r2598, r2574, r2595; +} +{ +cvt.rn.f16.f64 rs389, fd863; +} +mov.b32 r2603, {rs389, rs389}; +{ +mul.f16x2 r2601, r40, r2603; +} +{ +add.f16x2 r2604, r2580, r2601; +} +{ +cvt.rn.f16.f64 rs390, fd864; +} +mov.b32 r2609, {rs390, rs390}; +{ +mul.f16x2 r2607, r43, r2609; +} +{ +add.f16x2 r2610, r2586, r2607; +} +{ +cvt.rn.f16.f64 rs391, fd871; +} +mov.b32 r2615, {rs391, rs391}; +{ +mul.f16x2 r2613, r49, r2615; +} 
+{ +add.f16x2 r2616, r2592, r2613; +} +{ +cvt.rn.f16.f64 rs392, fd624; +} +mov.b32 r2621, {rs392, rs392}; +{ +mul.f16x2 r2619, r58, r2621; +} +{ +add.f16x2 r2622, r2598, r2619; +} +{ +cvt.rn.f16.f64 rs393, fd871; +} +mov.b32 r2627, {rs393, rs393}; +{ +mul.f16x2 r2625, r52, r2627; +} +{ +add.f16x2 r2628, r2604, r2625; +} +{ +cvt.rn.f16.f64 rs394, fd624; +} +mov.b32 r2633, {rs394, rs394}; +{ +mul.f16x2 r2631, r55, r2633; +} +{ +add.f16x2 r2634, r2610, r2631; +} +{ +cvt.rn.f16.f64 rs395, fd875; +} +mov.b32 r2639, {rs395, rs395}; +{ +mul.f16x2 r2637, r61, r2639; +} +{ +add.f16x2 r2640, r2616, r2637; +} +{ +cvt.rn.f16.f64 rs396, fd876; +} +mov.b32 r2645, {rs396, rs396}; +{ +mul.f16x2 r2643, r70, r2645; +} +{ +add.f16x2 r2646, r2622, r2643; +} +{ +cvt.rn.f16.f64 rs397, fd875; +} +mov.b32 r2651, {rs397, rs397}; +{ +mul.f16x2 r2649, r64, r2651; +} +{ +add.f16x2 r2652, r2628, r2649; +} +{ +cvt.rn.f16.f64 rs398, fd876; +} +mov.b32 r2657, {rs398, rs398}; +{ +mul.f16x2 r2655, r67, r2657; +} +{ +add.f16x2 r2658, r2634, r2655; +} +{ +cvt.rn.f16.f64 rs399, fd859; +} +mov.b32 r2663, {rs399, rs399}; +{ +mul.f16x2 r2661, r73, r2663; +} +{ +add.f16x2 r2664, r2640, r2661; +} +{ +cvt.rn.f16.f64 rs400, fd572; +} +mov.b32 r2669, {rs400, rs400}; +{ +mul.f16x2 r2667, r82, r2669; +} +{ +add.f16x2 r2670, r2646, r2667; +} +{ +cvt.rn.f16.f64 rs401, fd859; +} +mov.b32 r2675, {rs401, rs401}; +{ +mul.f16x2 r2673, r76, r2675; +} +{ +add.f16x2 r2676, r2652, r2673; +} +{ +cvt.rn.f16.f64 rs402, fd572; +} +mov.b32 r2681, {rs402, rs402}; +{ +mul.f16x2 r2679, r79, r2681; +} +{ +add.f16x2 r2682, r2658, r2679; +} +{ +cvt.rn.f16.f64 rs403, fd887; +} +mov.b32 r2687, {rs403, rs403}; +{ +mul.f16x2 r2685, r85, r2687; +} +{ +add.f16x2 r2688, r2664, r2685; +} +{ +cvt.rn.f16.f64 rs404, fd888; +} +mov.b32 r2693, {rs404, rs404}; +{ +mul.f16x2 r2691, r94, r2693; +} +{ +add.f16x2 r2694, r2670, r2691; +} +{ +cvt.rn.f16.f64 rs405, fd887; +} +mov.b32 r2699, {rs405, rs405}; +{ +mul.f16x2 r2697, r88, r2699; +} +{ 
+add.f16x2 r2700, r2676, r2697; +} +{ +cvt.rn.f16.f64 rs406, fd888; +} +mov.b32 r2705, {rs406, rs406}; +{ +mul.f16x2 r2703, r91, r2705; +} +{ +add.f16x2 r2706, r2682, r2703; +} +{ +cvt.rn.f16.f64 rs407, fd847; +} +mov.b32 r2711, {rs407, rs407}; +{ +mul.f16x2 r2709, r97, r2711; +} +{ +add.f16x2 r2712, r2688, r2709; +} +{ +cvt.rn.f16.f64 rs408, fd768; +} +mov.b32 r2717, {rs408, rs408}; +{ +mul.f16x2 r2715, r106, r2717; +} +{ +add.f16x2 r2718, r2694, r2715; +} +{ +cvt.rn.f16.f64 rs409, fd847; +} +mov.b32 r2723, {rs409, rs409}; +{ +mul.f16x2 r2721, r100, r2723; +} +{ +add.f16x2 r2724, r2700, r2721; +} +{ +cvt.rn.f16.f64 rs410, fd768; +} +mov.b32 r2729, {rs410, rs410}; +{ +mul.f16x2 r2727, r103, r2729; +} +{ +add.f16x2 r2730, r2706, r2727; +} +{ +cvt.rn.f16.f64 rs411, fd899; +} +mov.b32 r2735, {rs411, rs411}; +{ +mul.f16x2 r2733, r109, r2735; +} +{ +add.f16x2 r2736, r2712, r2733; +} +{ +cvt.rn.f16.f64 rs412, fd900; +} +mov.b32 r2741, {rs412, rs412}; +{ +mul.f16x2 r2739, r118, r2741; +} +{ +add.f16x2 r2742, r2718, r2739; +} +{ +cvt.rn.f16.f64 rs413, fd899; +} +mov.b32 r2747, {rs413, rs413}; +{ +mul.f16x2 r2745, r112, r2747; +} +{ +add.f16x2 r2748, r2724, r2745; +} +{ +cvt.rn.f16.f64 rs414, fd900; +} +mov.b32 r2753, {rs414, rs414}; +{ +mul.f16x2 r2751, r115, r2753; +} +{ +add.f16x2 r2754, r2730, r2751; +} +{ +cvt.rn.f16.f64 rs415, fd843; +} +mov.b32 r2759, {rs415, rs415}; +{ +mul.f16x2 r2757, r121, r2759; +} +{ +add.f16x2 r2760, r2736, r2757; +} +{ +cvt.rn.f16.f64 rs416, fd844; +} +mov.b32 r2765, {rs416, rs416}; +{ +mul.f16x2 r2763, r130, r2765; +} +{ +add.f16x2 r2766, r2742, r2763; +} +{ +cvt.rn.f16.f64 rs417, fd843; +} +mov.b32 r2771, {rs417, rs417}; +{ +mul.f16x2 r2769, r124, r2771; +} +{ +add.f16x2 r2772, r2748, r2769; +} +{ +cvt.rn.f16.f64 rs418, fd844; +} +mov.b32 r2777, {rs418, rs418}; +{ +mul.f16x2 r2775, r127, r2777; +} +{ +add.f16x2 r2778, r2754, r2775; +} +{ +cvt.rn.f16.f64 rs419, fd891; +} +mov.b32 r2783, {rs419, rs419}; +{ +mul.f16x2 r2781, r133, r2783; +} +{ 
+add.f16x2 r2784, r2760, r2781; +} +{ +cvt.rn.f16.f64 rs420, fd804; +} +mov.b32 r2789, {rs420, rs420}; +{ +mul.f16x2 r2787, r142, r2789; +} +{ +add.f16x2 r2790, r2766, r2787; +} +{ +cvt.rn.f16.f64 rs421, fd891; +} +mov.b32 r2795, {rs421, rs421}; +{ +mul.f16x2 r2793, r136, r2795; +} +{ +add.f16x2 r2796, r2772, r2793; +} +{ +cvt.rn.f16.f64 rs422, fd804; +} +mov.b32 r2801, {rs422, rs422}; +{ +mul.f16x2 r2799, r139, r2801; +} +{ +add.f16x2 r2802, r2778, r2799; +} +{ +cvt.rn.f16.f64 rs423, fd855; +} +mov.b32 r2807, {rs423, rs423}; +{ +mul.f16x2 r2805, r145, r2807; +} +{ +add.f16x2 r2808, r2784, r2805; +} +{ +cvt.rn.f16.f64 rs424, fd856; +} +mov.b32 r2813, {rs424, rs424}; +{ +mul.f16x2 r2811, r154, r2813; +} +{ +add.f16x2 r2814, r2790, r2811; +} +{ +cvt.rn.f16.f64 rs425, fd855; +} +mov.b32 r2819, {rs425, rs425}; +{ +mul.f16x2 r2817, r148, r2819; +} +{ +add.f16x2 r2820, r2796, r2817; +} +{ +cvt.rn.f16.f64 rs426, fd856; +} +mov.b32 r2825, {rs426, rs426}; +{ +mul.f16x2 r2823, r151, r2825; +} +{ +add.f16x2 r2826, r2802, r2823; +} +{ +cvt.rn.f16.f64 rs427, fd879; +} +mov.b32 r2831, {rs427, rs427}; +{ +mul.f16x2 r2829, r157, r2831; +} +{ +add.f16x2 r2832, r2808, r2829; +} +{ +cvt.rn.f16.f64 rs428, fd808; +} +mov.b32 r2837, {rs428, rs428}; +{ +mul.f16x2 r2835, r166, r2837; +} +{ +add.f16x2 r2838, r2814, r2835; +} +{ +cvt.rn.f16.f64 rs429, fd879; +} +mov.b32 r2843, {rs429, rs429}; +{ +mul.f16x2 r2841, r160, r2843; +} +{ +add.f16x2 r2844, r2820, r2841; +} +{ +cvt.rn.f16.f64 rs430, fd808; +} +mov.b32 r2849, {rs430, rs430}; +{ +mul.f16x2 r2847, r163, r2849; +} +{ +add.f16x2 r2850, r2826, r2847; +} +{ +cvt.rn.f16.f64 rs431, fd867; +} +mov.b32 r2855, {rs431, rs431}; +{ +mul.f16x2 r2853, r169, r2855; +} +{ +add.f16x2 r2856, r2832, r2853; +} +{ +cvt.rn.f16.f64 rs432, fd868; +} +mov.b32 r2861, {rs432, rs432}; +{ +mul.f16x2 r2859, r178, r2861; +} +{ +add.f16x2 r2862, r2838, r2859; +} +{ +cvt.rn.f16.f64 rs433, fd867; +} +mov.b32 r2867, {rs433, rs433}; +{ +mul.f16x2 r2865, r172, r2867; +} 
+{ +add.f16x2 r2868, r2844, r2865; +} +{ +cvt.rn.f16.f64 rs434, fd868; +} +mov.b32 r2873, {rs434, rs434}; +{ +mul.f16x2 r2871, r175, r2873; +} +{ +add.f16x2 r2874, r2850, r2871; +} +{ +sub.f16x2 %14, r2856, r2862; +} +{ +add.f16x2 %15, r2868, r2874; +} +{ +add.f16x2 %48, r2856, r2862; +} +{ +sub.f16x2 %49, r2868, r2874; +} +cvt.rn.f16.s32 rs435, r5508; +mov.b32 r2901, {rs435, rs435}; +cvt.rn.f16.s32 rs436, r5508; +mov.b32 r2913, {rs436, rs436}; +{ +cvt.rn.f16.f64 rs437, fd899; +} +mov.b32 r2893, {rs437, rs437}; +{ +mul.f16x2 r2891, r1, r2893; +} +{ +add.f16x2 r2894, %62, r2891; +} +{ +cvt.rn.f16.f64 rs438, fd900; +} +mov.b32 r2899, {rs438, rs438}; +{ +mul.f16x2 r2897, r10, r2899; +} +{ +add.f16x2 r2900, r2901, r2897; +} +{ +cvt.rn.f16.f64 rs439, fd899; +} +mov.b32 r2905, {rs439, rs439}; +{ +mul.f16x2 r2903, r4, r2905; +} +{ +add.f16x2 r2906, %63, r2903; +} +{ +cvt.rn.f16.f64 rs440, fd900; +} +mov.b32 r2911, {rs440, rs440}; +{ +mul.f16x2 r2909, r7, r2911; +} +{ +add.f16x2 r2912, r2913, r2909; +} +{ +cvt.rn.f16.f64 rs441, fd843; +} +mov.b32 r2917, {rs441, rs441}; +{ +mul.f16x2 r2915, r13, r2917; +} +{ +add.f16x2 r2918, r2894, r2915; +} +{ +cvt.rn.f16.f64 rs442, fd820; +} +mov.b32 r2923, {rs442, rs442}; +{ +mul.f16x2 r2921, r22, r2923; +} +{ +add.f16x2 r2924, r2900, r2921; +} +{ +cvt.rn.f16.f64 rs443, fd843; +} +mov.b32 r2929, {rs443, rs443}; +{ +mul.f16x2 r2927, r16, r2929; +} +{ +add.f16x2 r2930, r2906, r2927; +} +{ +cvt.rn.f16.f64 rs444, fd820; +} +mov.b32 r2935, {rs444, rs444}; +{ +mul.f16x2 r2933, r19, r2935; +} +{ +add.f16x2 r2936, r2912, r2933; +} +{ +cvt.rn.f16.f64 rs445, fd895; +} +mov.b32 r2941, {rs445, rs445}; +{ +mul.f16x2 r2939, r25, r2941; +} +{ +add.f16x2 r2942, r2918, r2939; +} +{ +cvt.rn.f16.f64 rs446, fd896; +} +mov.b32 r2947, {rs446, rs446}; +{ +mul.f16x2 r2945, r34, r2947; +} +{ +add.f16x2 r2948, r2924, r2945; +} +{ +cvt.rn.f16.f64 rs447, fd895; +} +mov.b32 r2953, {rs447, rs447}; +{ +mul.f16x2 r2951, r28, r2953; +} +{ +add.f16x2 r2954, r2930, 
r2951; +} +{ +cvt.rn.f16.f64 rs448, fd896; +} +mov.b32 r2959, {rs448, rs448}; +{ +mul.f16x2 r2957, r31, r2959; +} +{ +add.f16x2 r2960, r2936, r2957; +} +{ +cvt.rn.f16.f64 rs449, fd847; +} +mov.b32 r2965, {rs449, rs449}; +{ +mul.f16x2 r2963, r37, r2965; +} +{ +add.f16x2 r2966, r2942, r2963; +} +{ +cvt.rn.f16.f64 rs450, fd768; +} +mov.b32 r2971, {rs450, rs450}; +{ +mul.f16x2 r2969, r46, r2971; +} +{ +add.f16x2 r2972, r2948, r2969; +} +{ +cvt.rn.f16.f64 rs451, fd847; +} +mov.b32 r2977, {rs451, rs451}; +{ +mul.f16x2 r2975, r40, r2977; +} +{ +add.f16x2 r2978, r2954, r2975; +} +{ +cvt.rn.f16.f64 rs452, fd768; +} +mov.b32 r2983, {rs452, rs452}; +{ +mul.f16x2 r2981, r43, r2983; +} +{ +add.f16x2 r2984, r2960, r2981; +} +{ +cvt.rn.f16.f64 rs453, fd891; +} +mov.b32 r2989, {rs453, rs453}; +{ +mul.f16x2 r2987, r49, r2989; +} +{ +add.f16x2 r2990, r2966, r2987; +} +{ +cvt.rn.f16.f64 rs454, fd892; +} +mov.b32 r2995, {rs454, rs454}; +{ +mul.f16x2 r2993, r58, r2995; +} +{ +add.f16x2 r2996, r2972, r2993; +} +{ +cvt.rn.f16.f64 rs455, fd891; +} +mov.b32 r3001, {rs455, rs455}; +{ +mul.f16x2 r2999, r52, r3001; +} +{ +add.f16x2 r3002, r2978, r2999; +} +{ +cvt.rn.f16.f64 rs456, fd892; +} +mov.b32 r3007, {rs456, rs456}; +{ +mul.f16x2 r3005, r55, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs457, fd851; +} +mov.b32 r3013, {rs457, rs457}; +{ +mul.f16x2 r3011, r61, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs458, fd676; +} +mov.b32 r3019, {rs458, rs458}; +{ +mul.f16x2 r3017, r70, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs459, fd851; +} +mov.b32 r3025, {rs459, rs459}; +{ +mul.f16x2 r3023, r64, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs460, fd676; +} +mov.b32 r3031, {rs460, rs460}; +{ +mul.f16x2 r3029, r67, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs461, fd887; +} +mov.b32 r3037, {rs461, rs461}; +{ +mul.f16x2 r3035, r73, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} 
+{ +cvt.rn.f16.f64 rs462, fd888; +} +mov.b32 r3043, {rs462, rs462}; +{ +mul.f16x2 r3041, r82, r3043; +} +{ +add.f16x2 r3044, r3020, r3041; +} +{ +cvt.rn.f16.f64 rs463, fd887; +} +mov.b32 r3049, {rs463, rs463}; +{ +mul.f16x2 r3047, r76, r3049; +} +{ +add.f16x2 r3050, r3026, r3047; +} +{ +cvt.rn.f16.f64 rs464, fd888; +} +mov.b32 r3055, {rs464, rs464}; +{ +mul.f16x2 r3053, r79, r3055; +} +{ +add.f16x2 r3056, r3032, r3053; +} +{ +cvt.rn.f16.f64 rs465, fd855; +} +mov.b32 r3061, {rs465, rs465}; +{ +mul.f16x2 r3059, r85, r3061; +} +{ +add.f16x2 r3062, r3038, r3059; +} +{ +cvt.rn.f16.f64 rs466, fd816; +} +mov.b32 r3067, {rs466, rs466}; +{ +mul.f16x2 r3065, r94, r3067; +} +{ +add.f16x2 r3068, r3044, r3065; +} +{ +cvt.rn.f16.f64 rs467, fd855; +} +mov.b32 r3073, {rs467, rs467}; +{ +mul.f16x2 r3071, r88, r3073; +} +{ +add.f16x2 r3074, r3050, r3071; +} +{ +cvt.rn.f16.f64 rs468, fd816; +} +mov.b32 r3079, {rs468, rs468}; +{ +mul.f16x2 r3077, r91, r3079; +} +{ +add.f16x2 r3080, r3056, r3077; +} +{ +cvt.rn.f16.f64 rs469, fd883; +} +mov.b32 r3085, {rs469, rs469}; +{ +mul.f16x2 r3083, r97, r3085; +} +{ +add.f16x2 r3086, r3062, r3083; +} +{ +cvt.rn.f16.f64 rs470, fd884; +} +mov.b32 r3091, {rs470, rs470}; +{ +mul.f16x2 r3089, r106, r3091; +} +{ +add.f16x2 r3092, r3068, r3089; +} +{ +cvt.rn.f16.f64 rs471, fd883; +} +mov.b32 r3097, {rs471, rs471}; +{ +mul.f16x2 r3095, r100, r3097; +} +{ +add.f16x2 r3098, r3074, r3095; +} +{ +cvt.rn.f16.f64 rs472, fd884; +} +mov.b32 r3103, {rs472, rs472}; +{ +mul.f16x2 r3101, r103, r3103; +} +{ +add.f16x2 r3104, r3080, r3101; +} +{ +cvt.rn.f16.f64 rs473, fd859; +} +mov.b32 r3109, {rs473, rs473}; +{ +mul.f16x2 r3107, r109, r3109; +} +{ +add.f16x2 r3110, r3086, r3107; +} +{ +cvt.rn.f16.f64 rs474, fd572; +} +mov.b32 r3115, {rs474, rs474}; +{ +mul.f16x2 r3113, r118, r3115; +} +{ +add.f16x2 r3116, r3092, r3113; +} +{ +cvt.rn.f16.f64 rs475, fd859; +} +mov.b32 r3121, {rs475, rs475}; +{ +mul.f16x2 r3119, r112, r3121; +} +{ +add.f16x2 r3122, r3098, r3119; +} +{ 
+cvt.rn.f16.f64 rs476, fd572; +} +mov.b32 r3127, {rs476, rs476}; +{ +mul.f16x2 r3125, r115, r3127; +} +{ +add.f16x2 r3128, r3104, r3125; +} +{ +cvt.rn.f16.f64 rs477, fd879; +} +mov.b32 r3133, {rs477, rs477}; +{ +mul.f16x2 r3131, r121, r3133; +} +{ +add.f16x2 r3134, r3110, r3131; +} +{ +cvt.rn.f16.f64 rs478, fd880; +} +mov.b32 r3139, {rs478, rs478}; +{ +mul.f16x2 r3137, r130, r3139; +} +{ +add.f16x2 r3140, r3116, r3137; +} +{ +cvt.rn.f16.f64 rs479, fd879; +} +mov.b32 r3145, {rs479, rs479}; +{ +mul.f16x2 r3143, r124, r3145; +} +{ +add.f16x2 r3146, r3122, r3143; +} +{ +cvt.rn.f16.f64 rs480, fd880; +} +mov.b32 r3151, {rs480, rs480}; +{ +mul.f16x2 r3149, r127, r3151; +} +{ +add.f16x2 r3152, r3128, r3149; +} +{ +cvt.rn.f16.f64 rs481, fd863; +} +mov.b32 r3157, {rs481, rs481}; +{ +mul.f16x2 r3155, r133, r3157; +} +{ +add.f16x2 r3158, r3134, r3155; +} +{ +cvt.rn.f16.f64 rs482, fd740; +} +mov.b32 r3163, {rs482, rs482}; +{ +mul.f16x2 r3161, r142, r3163; +} +{ +add.f16x2 r3164, r3140, r3161; +} +{ +cvt.rn.f16.f64 rs483, fd863; +} +mov.b32 r3169, {rs483, rs483}; +{ +mul.f16x2 r3167, r136, r3169; +} +{ +add.f16x2 r3170, r3146, r3167; +} +{ +cvt.rn.f16.f64 rs484, fd740; +} +mov.b32 r3175, {rs484, rs484}; +{ +mul.f16x2 r3173, r139, r3175; +} +{ +add.f16x2 r3176, r3152, r3173; +} +{ +cvt.rn.f16.f64 rs485, fd875; +} +mov.b32 r3181, {rs485, rs485}; +{ +mul.f16x2 r3179, r145, r3181; +} +{ +add.f16x2 r3182, r3158, r3179; +} +{ +cvt.rn.f16.f64 rs486, fd876; +} +mov.b32 r3187, {rs486, rs486}; +{ +mul.f16x2 r3185, r154, r3187; +} +{ +add.f16x2 r3188, r3164, r3185; +} +{ +cvt.rn.f16.f64 rs487, fd875; +} +mov.b32 r3193, {rs487, rs487}; +{ +mul.f16x2 r3191, r148, r3193; +} +{ +add.f16x2 r3194, r3170, r3191; +} +{ +cvt.rn.f16.f64 rs488, fd876; +} +mov.b32 r3199, {rs488, rs488}; +{ +mul.f16x2 r3197, r151, r3199; +} +{ +add.f16x2 r3200, r3176, r3197; +} +{ +cvt.rn.f16.f64 rs489, fd867; +} +mov.b32 r3205, {rs489, rs489}; +{ +mul.f16x2 r3203, r157, r3205; +} +{ +add.f16x2 r3206, r3182, r3203; +} 
+{ +cvt.rn.f16.f64 rs490, fd812; +} +mov.b32 r3211, {rs490, rs490}; +{ +mul.f16x2 r3209, r166, r3211; +} +{ +add.f16x2 r3212, r3188, r3209; +} +{ +cvt.rn.f16.f64 rs491, fd867; +} +mov.b32 r3217, {rs491, rs491}; +{ +mul.f16x2 r3215, r160, r3217; +} +{ +add.f16x2 r3218, r3194, r3215; +} +{ +cvt.rn.f16.f64 rs492, fd812; +} +mov.b32 r3223, {rs492, rs492}; +{ +mul.f16x2 r3221, r163, r3223; +} +{ +add.f16x2 r3224, r3200, r3221; +} +{ +cvt.rn.f16.f64 rs493, fd871; +} +mov.b32 r3229, {rs493, rs493}; +{ +mul.f16x2 r3227, r169, r3229; +} +{ +add.f16x2 r3230, r3206, r3227; +} +{ +cvt.rn.f16.f64 rs494, fd872; +} +mov.b32 r3235, {rs494, rs494}; +{ +mul.f16x2 r3233, r178, r3235; +} +{ +add.f16x2 r3236, r3212, r3233; +} +{ +cvt.rn.f16.f64 rs495, fd871; +} +mov.b32 r3241, {rs495, rs495}; +{ +mul.f16x2 r3239, r172, r3241; +} +{ +add.f16x2 r3242, r3218, r3239; +} +{ +cvt.rn.f16.f64 rs496, fd872; +} +mov.b32 r3247, {rs496, rs496}; +{ +mul.f16x2 r3245, r175, r3247; +} +{ +add.f16x2 r3248, r3224, r3245; +} +{ +sub.f16x2 %16, r3230, r3236; +} +{ +add.f16x2 %17, r3242, r3248; +} +{ +add.f16x2 %46, r3230, r3236; +} +{ +sub.f16x2 %47, r3242, r3248; +} +cvt.rn.f16.s32 rs497, r5508; +mov.b32 r3275, {rs497, rs497}; +cvt.rn.f16.s32 rs498, r5508; +mov.b32 r3287, {rs498, rs498}; +{ +cvt.rn.f16.f64 rs499, fd891; +} +mov.b32 r3267, {rs499, rs499}; +{ +mul.f16x2 r3265, r1, r3267; +} +{ +add.f16x2 r3268, %62, r3265; +} +{ +cvt.rn.f16.f64 rs500, fd892; +} +mov.b32 r3273, {rs500, rs500}; +{ +mul.f16x2 r3271, r10, r3273; +} +{ +add.f16x2 r3274, r3275, r3271; +} +{ +cvt.rn.f16.f64 rs501, fd891; +} +mov.b32 r3279, {rs501, rs501}; +{ +mul.f16x2 r3277, r4, r3279; +} +{ +add.f16x2 r3280, %63, r3277; +} +{ +cvt.rn.f16.f64 rs502, fd892; +} +mov.b32 r3285, {rs502, rs502}; +{ +mul.f16x2 r3283, r7, r3285; +} +{ +add.f16x2 r3286, r3287, r3283; +} +{ +cvt.rn.f16.f64 rs503, fd859; +} +mov.b32 r3291, {rs503, rs503}; +{ +mul.f16x2 r3289, r13, r3291; +} +{ +add.f16x2 r3292, r3268, r3289; +} +{ +cvt.rn.f16.f64 rs504, 
fd572; +} +mov.b32 r3297, {rs504, rs504}; +{ +mul.f16x2 r3295, r22, r3297; +} +{ +add.f16x2 r3298, r3274, r3295; +} +{ +cvt.rn.f16.f64 rs505, fd859; +} +mov.b32 r3303, {rs505, rs505}; +{ +mul.f16x2 r3301, r16, r3303; +} +{ +add.f16x2 r3304, r3280, r3301; +} +{ +cvt.rn.f16.f64 rs506, fd572; +} +mov.b32 r3309, {rs506, rs506}; +{ +mul.f16x2 r3307, r19, r3309; +} +{ +add.f16x2 r3310, r3286, r3307; +} +{ +cvt.rn.f16.f64 rs507, fd871; +} +mov.b32 r3315, {rs507, rs507}; +{ +mul.f16x2 r3313, r25, r3315; +} +{ +add.f16x2 r3316, r3292, r3313; +} +{ +cvt.rn.f16.f64 rs508, fd872; +} +mov.b32 r3321, {rs508, rs508}; +{ +mul.f16x2 r3319, r34, r3321; +} +{ +add.f16x2 r3322, r3298, r3319; +} +{ +cvt.rn.f16.f64 rs509, fd871; +} +mov.b32 r3327, {rs509, rs509}; +{ +mul.f16x2 r3325, r28, r3327; +} +{ +add.f16x2 r3328, r3304, r3325; +} +{ +cvt.rn.f16.f64 rs510, fd872; +} +mov.b32 r3333, {rs510, rs510}; +{ +mul.f16x2 r3331, r31, r3333; +} +{ +add.f16x2 r3334, r3310, r3331; +} +{ +cvt.rn.f16.f64 rs511, fd879; +} +mov.b32 r3339, {rs511, rs511}; +{ +mul.f16x2 r3337, r37, r3339; +} +{ +add.f16x2 r3340, r3316, r3337; +} +{ +cvt.rn.f16.f64 rs512, fd808; +} +mov.b32 r3345, {rs512, rs512}; +{ +mul.f16x2 r3343, r46, r3345; +} +{ +add.f16x2 r3346, r3322, r3343; +} +{ +cvt.rn.f16.f64 rs513, fd879; +} +mov.b32 r3351, {rs513, rs513}; +{ +mul.f16x2 r3349, r40, r3351; +} +{ +add.f16x2 r3352, r3328, r3349; +} +{ +cvt.rn.f16.f64 rs514, fd808; +} +mov.b32 r3357, {rs514, rs514}; +{ +mul.f16x2 r3355, r43, r3357; +} +{ +add.f16x2 r3358, r3334, r3355; +} +{ +cvt.rn.f16.f64 rs515, fd851; +} +mov.b32 r3363, {rs515, rs515}; +{ +mul.f16x2 r3361, r49, r3363; +} +{ +add.f16x2 r3364, r3340, r3361; +} +{ +cvt.rn.f16.f64 rs516, fd852; +} +mov.b32 r3369, {rs516, rs516}; +{ +mul.f16x2 r3367, r58, r3369; +} +{ +add.f16x2 r3370, r3346, r3367; +} +{ +cvt.rn.f16.f64 rs517, fd851; +} +mov.b32 r3375, {rs517, rs517}; +{ +mul.f16x2 r3373, r52, r3375; +} +{ +add.f16x2 r3376, r3352, r3373; +} +{ +cvt.rn.f16.f64 rs518, fd852; +} 
+mov.b32 r3381, {rs518, rs518}; +{ +mul.f16x2 r3379, r55, r3381; +} +{ +add.f16x2 r3382, r3358, r3379; +} +{ +cvt.rn.f16.f64 rs519, fd899; +} +mov.b32 r3387, {rs519, rs519}; +{ +mul.f16x2 r3385, r61, r3387; +} +{ +add.f16x2 r3388, r3364, r3385; +} +{ +cvt.rn.f16.f64 rs520, fd504; +} +mov.b32 r3393, {rs520, rs520}; +{ +mul.f16x2 r3391, r70, r3393; +} +{ +add.f16x2 r3394, r3370, r3391; +} +{ +cvt.rn.f16.f64 rs521, fd899; +} +mov.b32 r3399, {rs521, rs521}; +{ +mul.f16x2 r3397, r64, r3399; +} +{ +add.f16x2 r3400, r3376, r3397; +} +{ +cvt.rn.f16.f64 rs522, fd504; +} +mov.b32 r3405, {rs522, rs522}; +{ +mul.f16x2 r3403, r67, r3405; +} +{ +add.f16x2 r3406, r3382, r3403; +} +{ +cvt.rn.f16.f64 rs523, fd847; +} +mov.b32 r3411, {rs523, rs523}; +{ +mul.f16x2 r3409, r73, r3411; +} +{ +add.f16x2 r3412, r3388, r3409; +} +{ +cvt.rn.f16.f64 rs524, fd768; +} +mov.b32 r3417, {rs524, rs524}; +{ +mul.f16x2 r3415, r82, r3417; +} +{ +add.f16x2 r3418, r3394, r3415; +} +{ +cvt.rn.f16.f64 rs525, fd847; +} +mov.b32 r3423, {rs525, rs525}; +{ +mul.f16x2 r3421, r76, r3423; +} +{ +add.f16x2 r3424, r3400, r3421; +} +{ +cvt.rn.f16.f64 rs526, fd768; +} +mov.b32 r3429, {rs526, rs526}; +{ +mul.f16x2 r3427, r79, r3429; +} +{ +add.f16x2 r3430, r3406, r3427; +} +{ +cvt.rn.f16.f64 rs527, fd883; +} +mov.b32 r3435, {rs527, rs527}; +{ +mul.f16x2 r3433, r85, r3435; +} +{ +add.f16x2 r3436, r3412, r3433; +} +{ +cvt.rn.f16.f64 rs528, fd884; +} +mov.b32 r3441, {rs528, rs528}; +{ +mul.f16x2 r3439, r94, r3441; +} +{ +add.f16x2 r3442, r3418, r3439; +} +{ +cvt.rn.f16.f64 rs529, fd883; +} +mov.b32 r3447, {rs529, rs529}; +{ +mul.f16x2 r3445, r88, r3447; +} +{ +add.f16x2 r3448, r3424, r3445; +} +{ +cvt.rn.f16.f64 rs530, fd884; +} +mov.b32 r3453, {rs530, rs530}; +{ +mul.f16x2 r3451, r91, r3453; +} +{ +add.f16x2 r3454, r3430, r3451; +} +{ +cvt.rn.f16.f64 rs531, fd867; +} +mov.b32 r3459, {rs531, rs531}; +{ +mul.f16x2 r3457, r97, r3459; +} +{ +add.f16x2 r3460, r3436, r3457; +} +{ +cvt.rn.f16.f64 rs532, fd812; +} +mov.b32 
r3465, {rs532, rs532}; +{ +mul.f16x2 r3463, r106, r3465; +} +{ +add.f16x2 r3466, r3442, r3463; +} +{ +cvt.rn.f16.f64 rs533, fd867; +} +mov.b32 r3471, {rs533, rs533}; +{ +mul.f16x2 r3469, r100, r3471; +} +{ +add.f16x2 r3472, r3448, r3469; +} +{ +cvt.rn.f16.f64 rs534, fd812; +} +mov.b32 r3477, {rs534, rs534}; +{ +mul.f16x2 r3475, r103, r3477; +} +{ +add.f16x2 r3478, r3454, r3475; +} +{ +cvt.rn.f16.f64 rs535, fd863; +} +mov.b32 r3483, {rs535, rs535}; +{ +mul.f16x2 r3481, r109, r3483; +} +{ +add.f16x2 r3484, r3460, r3481; +} +{ +cvt.rn.f16.f64 rs536, fd864; +} +mov.b32 r3489, {rs536, rs536}; +{ +mul.f16x2 r3487, r118, r3489; +} +{ +add.f16x2 r3490, r3466, r3487; +} +{ +cvt.rn.f16.f64 rs537, fd863; +} +mov.b32 r3495, {rs537, rs537}; +{ +mul.f16x2 r3493, r112, r3495; +} +{ +add.f16x2 r3496, r3472, r3493; +} +{ +cvt.rn.f16.f64 rs538, fd864; +} +mov.b32 r3501, {rs538, rs538}; +{ +mul.f16x2 r3499, r115, r3501; +} +{ +add.f16x2 r3502, r3478, r3499; +} +{ +cvt.rn.f16.f64 rs539, fd887; +} +mov.b32 r3507, {rs539, rs539}; +{ +mul.f16x2 r3505, r121, r3507; +} +{ +add.f16x2 r3508, r3484, r3505; +} +{ +cvt.rn.f16.f64 rs540, fd760; +} +mov.b32 r3513, {rs540, rs540}; +{ +mul.f16x2 r3511, r130, r3513; +} +{ +add.f16x2 r3514, r3490, r3511; +} +{ +cvt.rn.f16.f64 rs541, fd887; +} +mov.b32 r3519, {rs541, rs541}; +{ +mul.f16x2 r3517, r124, r3519; +} +{ +add.f16x2 r3520, r3496, r3517; +} +{ +cvt.rn.f16.f64 rs542, fd760; +} +mov.b32 r3525, {rs542, rs542}; +{ +mul.f16x2 r3523, r127, r3525; +} +{ +add.f16x2 r3526, r3502, r3523; +} +{ +cvt.rn.f16.f64 rs543, fd843; +} +mov.b32 r3531, {rs543, rs543}; +{ +mul.f16x2 r3529, r133, r3531; +} +{ +add.f16x2 r3532, r3508, r3529; +} +{ +cvt.rn.f16.f64 rs544, fd844; +} +mov.b32 r3537, {rs544, rs544}; +{ +mul.f16x2 r3535, r142, r3537; +} +{ +add.f16x2 r3538, r3514, r3535; +} +{ +cvt.rn.f16.f64 rs545, fd843; +} +mov.b32 r3543, {rs545, rs545}; +{ +mul.f16x2 r3541, r136, r3543; +} +{ +add.f16x2 r3544, r3520, r3541; +} +{ +cvt.rn.f16.f64 rs546, fd844; +} 
+mov.b32 r3549, {rs546, rs546}; +{ +mul.f16x2 r3547, r139, r3549; +} +{ +add.f16x2 r3550, r3526, r3547; +} +{ +cvt.rn.f16.f64 rs547, fd895; +} +mov.b32 r3555, {rs547, rs547}; +{ +mul.f16x2 r3553, r145, r3555; +} +{ +add.f16x2 r3556, r3532, r3553; +} +{ +cvt.rn.f16.f64 rs548, fd896; +} +mov.b32 r3561, {rs548, rs548}; +{ +mul.f16x2 r3559, r154, r3561; +} +{ +add.f16x2 r3562, r3538, r3559; +} +{ +cvt.rn.f16.f64 rs549, fd895; +} +mov.b32 r3567, {rs549, rs549}; +{ +mul.f16x2 r3565, r148, r3567; +} +{ +add.f16x2 r3568, r3544, r3565; +} +{ +cvt.rn.f16.f64 rs550, fd896; +} +mov.b32 r3573, {rs550, rs550}; +{ +mul.f16x2 r3571, r151, r3573; +} +{ +add.f16x2 r3574, r3550, r3571; +} +{ +cvt.rn.f16.f64 rs551, fd855; +} +mov.b32 r3579, {rs551, rs551}; +{ +mul.f16x2 r3577, r157, r3579; +} +{ +add.f16x2 r3580, r3556, r3577; +} +{ +cvt.rn.f16.f64 rs552, fd816; +} +mov.b32 r3585, {rs552, rs552}; +{ +mul.f16x2 r3583, r166, r3585; +} +{ +add.f16x2 r3586, r3562, r3583; +} +{ +cvt.rn.f16.f64 rs553, fd855; +} +mov.b32 r3591, {rs553, rs553}; +{ +mul.f16x2 r3589, r160, r3591; +} +{ +add.f16x2 r3592, r3568, r3589; +} +{ +cvt.rn.f16.f64 rs554, fd816; +} +mov.b32 r3597, {rs554, rs554}; +{ +mul.f16x2 r3595, r163, r3597; +} +{ +add.f16x2 r3598, r3574, r3595; +} +{ +cvt.rn.f16.f64 rs555, fd875; +} +mov.b32 r3603, {rs555, rs555}; +{ +mul.f16x2 r3601, r169, r3603; +} +{ +add.f16x2 r3604, r3580, r3601; +} +{ +cvt.rn.f16.f64 rs556, fd876; +} +mov.b32 r3609, {rs556, rs556}; +{ +mul.f16x2 r3607, r178, r3609; +} +{ +add.f16x2 r3610, r3586, r3607; +} +{ +cvt.rn.f16.f64 rs557, fd875; +} +mov.b32 r3615, {rs557, rs557}; +{ +mul.f16x2 r3613, r172, r3615; +} +{ +add.f16x2 r3616, r3592, r3613; +} +{ +cvt.rn.f16.f64 rs558, fd876; +} +mov.b32 r3621, {rs558, rs558}; +{ +mul.f16x2 r3619, r175, r3621; +} +{ +add.f16x2 r3622, r3598, r3619; +} +{ +sub.f16x2 %18, r3604, r3610; +} +{ +add.f16x2 %19, r3616, r3622; +} +{ +add.f16x2 %44, r3604, r3610; +} +{ +sub.f16x2 %45, r3616, r3622; +} +cvt.rn.f16.s32 rs559, r5508; 
+mov.b32 r3649, {rs559, rs559}; +cvt.rn.f16.s32 rs560, r5508; +mov.b32 r3661, {rs560, rs560}; +{ +cvt.rn.f16.f64 rs561, fd883; +} +mov.b32 r3641, {rs561, rs561}; +{ +mul.f16x2 r3639, r1, r3641; +} +{ +add.f16x2 r3642, %62, r3639; +} +{ +cvt.rn.f16.f64 rs562, fd884; +} +mov.b32 r3647, {rs562, rs562}; +{ +mul.f16x2 r3645, r10, r3647; +} +{ +add.f16x2 r3648, r3649, r3645; +} +{ +cvt.rn.f16.f64 rs563, fd883; +} +mov.b32 r3653, {rs563, rs563}; +{ +mul.f16x2 r3651, r4, r3653; +} +{ +add.f16x2 r3654, %63, r3651; +} +{ +cvt.rn.f16.f64 rs564, fd884; +} +mov.b32 r3659, {rs564, rs564}; +{ +mul.f16x2 r3657, r7, r3659; +} +{ +add.f16x2 r3660, r3661, r3657; +} +{ +cvt.rn.f16.f64 rs565, fd875; +} +mov.b32 r3665, {rs565, rs565}; +{ +mul.f16x2 r3663, r13, r3665; +} +{ +add.f16x2 r3666, r3642, r3663; +} +{ +cvt.rn.f16.f64 rs566, fd708; +} +mov.b32 r3671, {rs566, rs566}; +{ +mul.f16x2 r3669, r22, r3671; +} +{ +add.f16x2 r3672, r3648, r3669; +} +{ +cvt.rn.f16.f64 rs567, fd875; +} +mov.b32 r3677, {rs567, rs567}; +{ +mul.f16x2 r3675, r16, r3677; +} +{ +add.f16x2 r3678, r3654, r3675; +} +{ +cvt.rn.f16.f64 rs568, fd708; +} +mov.b32 r3683, {rs568, rs568}; +{ +mul.f16x2 r3681, r19, r3683; +} +{ +add.f16x2 r3684, r3660, r3681; +} +{ +cvt.rn.f16.f64 rs569, fd847; +} +mov.b32 r3689, {rs569, rs569}; +{ +mul.f16x2 r3687, r25, r3689; +} +{ +add.f16x2 r3690, r3666, r3687; +} +{ +cvt.rn.f16.f64 rs570, fd848; +} +mov.b32 r3695, {rs570, rs570}; +{ +mul.f16x2 r3693, r34, r3695; +} +{ +add.f16x2 r3696, r3672, r3693; +} +{ +cvt.rn.f16.f64 rs571, fd847; +} +mov.b32 r3701, {rs571, rs571}; +{ +mul.f16x2 r3699, r28, r3701; +} +{ +add.f16x2 r3702, r3678, r3699; +} +{ +cvt.rn.f16.f64 rs572, fd848; +} +mov.b32 r3707, {rs572, rs572}; +{ +mul.f16x2 r3705, r31, r3707; +} +{ +add.f16x2 r3708, r3684, r3705; +} +{ +cvt.rn.f16.f64 rs573, fd891; +} +mov.b32 r3713, {rs573, rs573}; +{ +mul.f16x2 r3711, r37, r3713; +} +{ +add.f16x2 r3714, r3690, r3711; +} +{ +cvt.rn.f16.f64 rs574, fd892; +} +mov.b32 r3719, {rs574, 
rs574}; +{ +mul.f16x2 r3717, r46, r3719; +} +{ +add.f16x2 r3720, r3696, r3717; +} +{ +cvt.rn.f16.f64 rs575, fd891; +} +mov.b32 r3725, {rs575, rs575}; +{ +mul.f16x2 r3723, r40, r3725; +} +{ +add.f16x2 r3726, r3702, r3723; +} +{ +cvt.rn.f16.f64 rs576, fd892; +} +mov.b32 r3731, {rs576, rs576}; +{ +mul.f16x2 r3729, r43, r3731; +} +{ +add.f16x2 r3732, r3708, r3729; +} +{ +cvt.rn.f16.f64 rs577, fd867; +} +mov.b32 r3737, {rs577, rs577}; +{ +mul.f16x2 r3735, r49, r3737; +} +{ +add.f16x2 r3738, r3714, r3735; +} +{ +cvt.rn.f16.f64 rs578, fd812; +} +mov.b32 r3743, {rs578, rs578}; +{ +mul.f16x2 r3741, r58, r3743; +} +{ +add.f16x2 r3744, r3720, r3741; +} +{ +cvt.rn.f16.f64 rs579, fd867; +} +mov.b32 r3749, {rs579, rs579}; +{ +mul.f16x2 r3747, r52, r3749; +} +{ +add.f16x2 r3750, r3726, r3747; +} +{ +cvt.rn.f16.f64 rs580, fd812; +} +mov.b32 r3755, {rs580, rs580}; +{ +mul.f16x2 r3753, r55, r3755; +} +{ +add.f16x2 r3756, r3732, r3753; +} +{ +cvt.rn.f16.f64 rs581, fd855; +} +mov.b32 r3761, {rs581, rs581}; +{ +mul.f16x2 r3759, r61, r3761; +} +{ +add.f16x2 r3762, r3738, r3759; +} +{ +cvt.rn.f16.f64 rs582, fd856; +} +mov.b32 r3767, {rs582, rs582}; +{ +mul.f16x2 r3765, r70, r3767; +} +{ +add.f16x2 r3768, r3744, r3765; +} +{ +cvt.rn.f16.f64 rs583, fd855; +} +mov.b32 r3773, {rs583, rs583}; +{ +mul.f16x2 r3771, r64, r3773; +} +{ +add.f16x2 r3774, r3750, r3771; +} +{ +cvt.rn.f16.f64 rs584, fd856; +} +mov.b32 r3779, {rs584, rs584}; +{ +mul.f16x2 r3777, r67, r3779; +} +{ +add.f16x2 r3780, r3756, r3777; +} +{ +cvt.rn.f16.f64 rs585, fd899; +} +mov.b32 r3785, {rs585, rs585}; +{ +mul.f16x2 r3783, r73, r3785; +} +{ +add.f16x2 r3786, r3762, r3783; +} +{ +cvt.rn.f16.f64 rs586, fd900; +} +mov.b32 r3791, {rs586, rs586}; +{ +mul.f16x2 r3789, r82, r3791; +} +{ +add.f16x2 r3792, r3768, r3789; +} +{ +cvt.rn.f16.f64 rs587, fd899; +} +mov.b32 r3797, {rs587, rs587}; +{ +mul.f16x2 r3795, r76, r3797; +} +{ +add.f16x2 r3798, r3774, r3795; +} +{ +cvt.rn.f16.f64 rs588, fd900; +} +mov.b32 r3803, {rs588, rs588}; +{ 
+mul.f16x2 r3801, r79, r3803; +} +{ +add.f16x2 r3804, r3780, r3801; +} +{ +cvt.rn.f16.f64 rs589, fd859; +} +mov.b32 r3809, {rs589, rs589}; +{ +mul.f16x2 r3807, r85, r3809; +} +{ +add.f16x2 r3810, r3786, r3807; +} +{ +cvt.rn.f16.f64 rs590, fd572; +} +mov.b32 r3815, {rs590, rs590}; +{ +mul.f16x2 r3813, r94, r3815; +} +{ +add.f16x2 r3816, r3792, r3813; +} +{ +cvt.rn.f16.f64 rs591, fd859; +} +mov.b32 r3821, {rs591, rs591}; +{ +mul.f16x2 r3819, r88, r3821; +} +{ +add.f16x2 r3822, r3798, r3819; +} +{ +cvt.rn.f16.f64 rs592, fd572; +} +mov.b32 r3827, {rs592, rs592}; +{ +mul.f16x2 r3825, r91, r3827; +} +{ +add.f16x2 r3828, r3804, r3825; +} +{ +cvt.rn.f16.f64 rs593, fd863; +} +mov.b32 r3833, {rs593, rs593}; +{ +mul.f16x2 r3831, r97, r3833; +} +{ +add.f16x2 r3834, r3810, r3831; +} +{ +cvt.rn.f16.f64 rs594, fd864; +} +mov.b32 r3839, {rs594, rs594}; +{ +mul.f16x2 r3837, r106, r3839; +} +{ +add.f16x2 r3840, r3816, r3837; +} +{ +cvt.rn.f16.f64 rs595, fd863; +} +mov.b32 r3845, {rs595, rs595}; +{ +mul.f16x2 r3843, r100, r3845; +} +{ +add.f16x2 r3846, r3822, r3843; +} +{ +cvt.rn.f16.f64 rs596, fd864; +} +mov.b32 r3851, {rs596, rs596}; +{ +mul.f16x2 r3849, r103, r3851; +} +{ +add.f16x2 r3852, r3828, r3849; +} +{ +cvt.rn.f16.f64 rs597, fd895; +} +mov.b32 r3857, {rs597, rs597}; +{ +mul.f16x2 r3855, r109, r3857; +} +{ +add.f16x2 r3858, r3834, r3855; +} +{ +cvt.rn.f16.f64 rs598, fd580; +} +mov.b32 r3863, {rs598, rs598}; +{ +mul.f16x2 r3861, r118, r3863; +} +{ +add.f16x2 r3864, r3840, r3861; +} +{ +cvt.rn.f16.f64 rs599, fd895; +} +mov.b32 r3869, {rs599, rs599}; +{ +mul.f16x2 r3867, r112, r3869; +} +{ +add.f16x2 r3870, r3846, r3867; +} +{ +cvt.rn.f16.f64 rs600, fd580; +} +mov.b32 r3875, {rs600, rs600}; +{ +mul.f16x2 r3873, r115, r3875; +} +{ +add.f16x2 r3876, r3852, r3873; +} +{ +cvt.rn.f16.f64 rs601, fd851; +} +mov.b32 r3881, {rs601, rs601}; +{ +mul.f16x2 r3879, r121, r3881; +} +{ +add.f16x2 r3882, r3858, r3879; +} +{ +cvt.rn.f16.f64 rs602, fd676; +} +mov.b32 r3887, {rs602, rs602}; +{ 
+mul.f16x2 r3885, r130, r3887; +} +{ +add.f16x2 r3888, r3864, r3885; +} +{ +cvt.rn.f16.f64 rs603, fd851; +} +mov.b32 r3893, {rs603, rs603}; +{ +mul.f16x2 r3891, r124, r3893; +} +{ +add.f16x2 r3894, r3870, r3891; +} +{ +cvt.rn.f16.f64 rs604, fd676; +} +mov.b32 r3899, {rs604, rs604}; +{ +mul.f16x2 r3897, r127, r3899; +} +{ +add.f16x2 r3900, r3876, r3897; +} +{ +cvt.rn.f16.f64 rs605, fd871; +} +mov.b32 r3905, {rs605, rs605}; +{ +mul.f16x2 r3903, r133, r3905; +} +{ +add.f16x2 r3906, r3882, r3903; +} +{ +cvt.rn.f16.f64 rs606, fd872; +} +mov.b32 r3911, {rs606, rs606}; +{ +mul.f16x2 r3909, r142, r3911; +} +{ +add.f16x2 r3912, r3888, r3909; +} +{ +cvt.rn.f16.f64 rs607, fd871; +} +mov.b32 r3917, {rs607, rs607}; +{ +mul.f16x2 r3915, r136, r3917; +} +{ +add.f16x2 r3918, r3894, r3915; +} +{ +cvt.rn.f16.f64 rs608, fd872; +} +mov.b32 r3923, {rs608, rs608}; +{ +mul.f16x2 r3921, r139, r3923; +} +{ +add.f16x2 r3924, r3900, r3921; +} +{ +cvt.rn.f16.f64 rs609, fd887; +} +mov.b32 r3929, {rs609, rs609}; +{ +mul.f16x2 r3927, r145, r3929; +} +{ +add.f16x2 r3930, r3906, r3927; +} +{ +cvt.rn.f16.f64 rs610, fd760; +} +mov.b32 r3935, {rs610, rs610}; +{ +mul.f16x2 r3933, r154, r3935; +} +{ +add.f16x2 r3936, r3912, r3933; +} +{ +cvt.rn.f16.f64 rs611, fd887; +} +mov.b32 r3941, {rs611, rs611}; +{ +mul.f16x2 r3939, r148, r3941; +} +{ +add.f16x2 r3942, r3918, r3939; +} +{ +cvt.rn.f16.f64 rs612, fd760; +} +mov.b32 r3947, {rs612, rs612}; +{ +mul.f16x2 r3945, r151, r3947; +} +{ +add.f16x2 r3948, r3924, r3945; +} +{ +cvt.rn.f16.f64 rs613, fd843; +} +mov.b32 r3953, {rs613, rs613}; +{ +mul.f16x2 r3951, r157, r3953; +} +{ +add.f16x2 r3954, r3930, r3951; +} +{ +cvt.rn.f16.f64 rs614, fd820; +} +mov.b32 r3959, {rs614, rs614}; +{ +mul.f16x2 r3957, r166, r3959; +} +{ +add.f16x2 r3960, r3936, r3957; +} +{ +cvt.rn.f16.f64 rs615, fd843; +} +mov.b32 r3965, {rs615, rs615}; +{ +mul.f16x2 r3963, r160, r3965; +} +{ +add.f16x2 r3966, r3942, r3963; +} +{ +cvt.rn.f16.f64 rs616, fd820; +} +mov.b32 r3971, {rs616, rs616}; 
+{ +mul.f16x2 r3969, r163, r3971; +} +{ +add.f16x2 r3972, r3948, r3969; +} +{ +cvt.rn.f16.f64 rs617, fd879; +} +mov.b32 r3977, {rs617, rs617}; +{ +mul.f16x2 r3975, r169, r3977; +} +{ +add.f16x2 r3978, r3954, r3975; +} +{ +cvt.rn.f16.f64 rs618, fd880; +} +mov.b32 r3983, {rs618, rs618}; +{ +mul.f16x2 r3981, r178, r3983; +} +{ +add.f16x2 r3984, r3960, r3981; +} +{ +cvt.rn.f16.f64 rs619, fd879; +} +mov.b32 r3989, {rs619, rs619}; +{ +mul.f16x2 r3987, r172, r3989; +} +{ +add.f16x2 r3990, r3966, r3987; +} +{ +cvt.rn.f16.f64 rs620, fd880; +} +mov.b32 r3995, {rs620, rs620}; +{ +mul.f16x2 r3993, r175, r3995; +} +{ +add.f16x2 r3996, r3972, r3993; +} +{ +sub.f16x2 %20, r3978, r3984; +} +{ +add.f16x2 %21, r3990, r3996; +} +{ +add.f16x2 %42, r3978, r3984; +} +{ +sub.f16x2 %43, r3990, r3996; +} +cvt.rn.f16.s32 rs621, r5508; +mov.b32 r4023, {rs621, rs621}; +cvt.rn.f16.s32 rs622, r5508; +mov.b32 r4035, {rs622, rs622}; +{ +cvt.rn.f16.f64 rs623, fd875; +} +mov.b32 r4015, {rs623, rs623}; +{ +mul.f16x2 r4013, r1, r4015; +} +{ +add.f16x2 r4016, %62, r4013; +} +{ +cvt.rn.f16.f64 rs624, fd876; +} +mov.b32 r4021, {rs624, rs624}; +{ +mul.f16x2 r4019, r10, r4021; +} +{ +add.f16x2 r4022, r4023, r4019; +} +{ +cvt.rn.f16.f64 rs625, fd875; +} +mov.b32 r4027, {rs625, rs625}; +{ +mul.f16x2 r4025, r4, r4027; +} +{ +add.f16x2 r4028, %63, r4025; +} +{ +cvt.rn.f16.f64 rs626, fd876; +} +mov.b32 r4033, {rs626, rs626}; +{ +mul.f16x2 r4031, r7, r4033; +} +{ +add.f16x2 r4034, r4035, r4031; +} +{ +cvt.rn.f16.f64 rs627, fd891; +} +mov.b32 r4039, {rs627, rs627}; +{ +mul.f16x2 r4037, r13, r4039; +} +{ +add.f16x2 r4040, r4016, r4037; +} +{ +cvt.rn.f16.f64 rs628, fd804; +} +mov.b32 r4045, {rs628, rs628}; +{ +mul.f16x2 r4043, r22, r4045; +} +{ +add.f16x2 r4046, r4022, r4043; +} +{ +cvt.rn.f16.f64 rs629, fd891; +} +mov.b32 r4051, {rs629, rs629}; +{ +mul.f16x2 r4049, r16, r4051; +} +{ +add.f16x2 r4052, r4028, r4049; +} +{ +cvt.rn.f16.f64 rs630, fd804; +} +mov.b32 r4057, {rs630, rs630}; +{ +mul.f16x2 r4055, r19, 
r4057; +} +{ +add.f16x2 r4058, r4034, r4055; +} +{ +cvt.rn.f16.f64 rs631, fd855; +} +mov.b32 r4063, {rs631, rs631}; +{ +mul.f16x2 r4061, r25, r4063; +} +{ +add.f16x2 r4064, r4040, r4061; +} +{ +cvt.rn.f16.f64 rs632, fd816; +} +mov.b32 r4069, {rs632, rs632}; +{ +mul.f16x2 r4067, r34, r4069; +} +{ +add.f16x2 r4070, r4046, r4067; +} +{ +cvt.rn.f16.f64 rs633, fd855; +} +mov.b32 r4075, {rs633, rs633}; +{ +mul.f16x2 r4073, r28, r4075; +} +{ +add.f16x2 r4076, r4052, r4073; +} +{ +cvt.rn.f16.f64 rs634, fd816; +} +mov.b32 r4081, {rs634, rs634}; +{ +mul.f16x2 r4079, r31, r4081; +} +{ +add.f16x2 r4082, r4058, r4079; +} +{ +cvt.rn.f16.f64 rs635, fd859; +} +mov.b32 r4087, {rs635, rs635}; +{ +mul.f16x2 r4085, r37, r4087; +} +{ +add.f16x2 r4088, r4064, r4085; +} +{ +cvt.rn.f16.f64 rs636, fd860; +} +mov.b32 r4093, {rs636, rs636}; +{ +mul.f16x2 r4091, r46, r4093; +} +{ +add.f16x2 r4094, r4070, r4091; +} +{ +cvt.rn.f16.f64 rs637, fd859; +} +mov.b32 r4099, {rs637, rs637}; +{ +mul.f16x2 r4097, r40, r4099; +} +{ +add.f16x2 r4100, r4076, r4097; +} +{ +cvt.rn.f16.f64 rs638, fd860; +} +mov.b32 r4105, {rs638, rs638}; +{ +mul.f16x2 r4103, r43, r4105; +} +{ +add.f16x2 r4106, r4082, r4103; +} +{ +cvt.rn.f16.f64 rs639, fd895; +} +mov.b32 r4111, {rs639, rs639}; +{ +mul.f16x2 r4109, r49, r4111; +} +{ +add.f16x2 r4112, r4088, r4109; +} +{ +cvt.rn.f16.f64 rs640, fd896; +} +mov.b32 r4117, {rs640, rs640}; +{ +mul.f16x2 r4115, r58, r4117; +} +{ +add.f16x2 r4118, r4094, r4115; +} +{ +cvt.rn.f16.f64 rs641, fd895; +} +mov.b32 r4123, {rs641, rs641}; +{ +mul.f16x2 r4121, r52, r4123; +} +{ +add.f16x2 r4124, r4100, r4121; +} +{ +cvt.rn.f16.f64 rs642, fd896; +} +mov.b32 r4129, {rs642, rs642}; +{ +mul.f16x2 r4127, r55, r4129; +} +{ +add.f16x2 r4130, r4106, r4127; +} +{ +cvt.rn.f16.f64 rs643, fd871; +} +mov.b32 r4135, {rs643, rs643}; +{ +mul.f16x2 r4133, r61, r4135; +} +{ +add.f16x2 r4136, r4112, r4133; +} +{ +cvt.rn.f16.f64 rs644, fd624; +} +mov.b32 r4141, {rs644, rs644}; +{ +mul.f16x2 r4139, r70, r4141; +} 
+{ +add.f16x2 r4142, r4118, r4139; +} +{ +cvt.rn.f16.f64 rs645, fd871; +} +mov.b32 r4147, {rs645, rs645}; +{ +mul.f16x2 r4145, r64, r4147; +} +{ +add.f16x2 r4148, r4124, r4145; +} +{ +cvt.rn.f16.f64 rs646, fd624; +} +mov.b32 r4153, {rs646, rs646}; +{ +mul.f16x2 r4151, r67, r4153; +} +{ +add.f16x2 r4154, r4130, r4151; +} +{ +cvt.rn.f16.f64 rs647, fd843; +} +mov.b32 r4159, {rs647, rs647}; +{ +mul.f16x2 r4157, r73, r4159; +} +{ +add.f16x2 r4160, r4136, r4157; +} +{ +cvt.rn.f16.f64 rs648, fd844; +} +mov.b32 r4165, {rs648, rs648}; +{ +mul.f16x2 r4163, r82, r4165; +} +{ +add.f16x2 r4166, r4142, r4163; +} +{ +cvt.rn.f16.f64 rs649, fd843; +} +mov.b32 r4171, {rs649, rs649}; +{ +mul.f16x2 r4169, r76, r4171; +} +{ +add.f16x2 r4172, r4148, r4169; +} +{ +cvt.rn.f16.f64 rs650, fd844; +} +mov.b32 r4177, {rs650, rs650}; +{ +mul.f16x2 r4175, r79, r4177; +} +{ +add.f16x2 r4178, r4154, r4175; +} +{ +cvt.rn.f16.f64 rs651, fd879; +} +mov.b32 r4183, {rs651, rs651}; +{ +mul.f16x2 r4181, r85, r4183; +} +{ +add.f16x2 r4184, r4160, r4181; +} +{ +cvt.rn.f16.f64 rs652, fd880; +} +mov.b32 r4189, {rs652, rs652}; +{ +mul.f16x2 r4187, r94, r4189; +} +{ +add.f16x2 r4190, r4166, r4187; +} +{ +cvt.rn.f16.f64 rs653, fd879; +} +mov.b32 r4195, {rs653, rs653}; +{ +mul.f16x2 r4193, r88, r4195; +} +{ +add.f16x2 r4196, r4172, r4193; +} +{ +cvt.rn.f16.f64 rs654, fd880; +} +mov.b32 r4201, {rs654, rs654}; +{ +mul.f16x2 r4199, r91, r4201; +} +{ +add.f16x2 r4202, r4178, r4199; +} +{ +cvt.rn.f16.f64 rs655, fd887; +} +mov.b32 r4207, {rs655, rs655}; +{ +mul.f16x2 r4205, r97, r4207; +} +{ +add.f16x2 r4208, r4184, r4205; +} +{ +cvt.rn.f16.f64 rs656, fd760; +} +mov.b32 r4213, {rs656, rs656}; +{ +mul.f16x2 r4211, r106, r4213; +} +{ +add.f16x2 r4214, r4190, r4211; +} +{ +cvt.rn.f16.f64 rs657, fd887; +} +mov.b32 r4219, {rs657, rs657}; +{ +mul.f16x2 r4217, r100, r4219; +} +{ +add.f16x2 r4220, r4196, r4217; +} +{ +cvt.rn.f16.f64 rs658, fd760; +} +mov.b32 r4225, {rs658, rs658}; +{ +mul.f16x2 r4223, r103, r4225; +} +{ 
+add.f16x2 r4226, r4202, r4223; +} +{ +cvt.rn.f16.f64 rs659, fd851; +} +mov.b32 r4231, {rs659, rs659}; +{ +mul.f16x2 r4229, r109, r4231; +} +{ +add.f16x2 r4232, r4208, r4229; +} +{ +cvt.rn.f16.f64 rs660, fd676; +} +mov.b32 r4237, {rs660, rs660}; +{ +mul.f16x2 r4235, r118, r4237; +} +{ +add.f16x2 r4238, r4214, r4235; +} +{ +cvt.rn.f16.f64 rs661, fd851; +} +mov.b32 r4243, {rs661, rs661}; +{ +mul.f16x2 r4241, r112, r4243; +} +{ +add.f16x2 r4244, r4220, r4241; +} +{ +cvt.rn.f16.f64 rs662, fd676; +} +mov.b32 r4249, {rs662, rs662}; +{ +mul.f16x2 r4247, r115, r4249; +} +{ +add.f16x2 r4250, r4226, r4247; +} +{ +cvt.rn.f16.f64 rs663, fd863; +} +mov.b32 r4255, {rs663, rs663}; +{ +mul.f16x2 r4253, r121, r4255; +} +{ +add.f16x2 r4256, r4232, r4253; +} +{ +cvt.rn.f16.f64 rs664, fd864; +} +mov.b32 r4261, {rs664, rs664}; +{ +mul.f16x2 r4259, r130, r4261; +} +{ +add.f16x2 r4262, r4238, r4259; +} +{ +cvt.rn.f16.f64 rs665, fd863; +} +mov.b32 r4267, {rs665, rs665}; +{ +mul.f16x2 r4265, r124, r4267; +} +{ +add.f16x2 r4268, r4244, r4265; +} +{ +cvt.rn.f16.f64 rs666, fd864; +} +mov.b32 r4273, {rs666, rs666}; +{ +mul.f16x2 r4271, r127, r4273; +} +{ +add.f16x2 r4274, r4250, r4271; +} +{ +cvt.rn.f16.f64 rs667, fd899; +} +mov.b32 r4279, {rs667, rs667}; +{ +mul.f16x2 r4277, r133, r4279; +} +{ +add.f16x2 r4280, r4256, r4277; +} +{ +cvt.rn.f16.f64 rs668, fd900; +} +mov.b32 r4285, {rs668, rs668}; +{ +mul.f16x2 r4283, r142, r4285; +} +{ +add.f16x2 r4286, r4262, r4283; +} +{ +cvt.rn.f16.f64 rs669, fd899; +} +mov.b32 r4291, {rs669, rs669}; +{ +mul.f16x2 r4289, r136, r4291; +} +{ +add.f16x2 r4292, r4268, r4289; +} +{ +cvt.rn.f16.f64 rs670, fd900; +} +mov.b32 r4297, {rs670, rs670}; +{ +mul.f16x2 r4295, r139, r4297; +} +{ +add.f16x2 r4298, r4274, r4295; +} +{ +cvt.rn.f16.f64 rs671, fd867; +} +mov.b32 r4303, {rs671, rs671}; +{ +mul.f16x2 r4301, r145, r4303; +} +{ +add.f16x2 r4304, r4280, r4301; +} +{ +cvt.rn.f16.f64 rs672, fd812; +} +mov.b32 r4309, {rs672, rs672}; +{ +mul.f16x2 r4307, r154, r4309; +} 
+{ +add.f16x2 r4310, r4286, r4307; +} +{ +cvt.rn.f16.f64 rs673, fd867; +} +mov.b32 r4315, {rs673, rs673}; +{ +mul.f16x2 r4313, r148, r4315; +} +{ +add.f16x2 r4316, r4292, r4313; +} +{ +cvt.rn.f16.f64 rs674, fd812; +} +mov.b32 r4321, {rs674, rs674}; +{ +mul.f16x2 r4319, r151, r4321; +} +{ +add.f16x2 r4322, r4298, r4319; +} +{ +cvt.rn.f16.f64 rs675, fd847; +} +mov.b32 r4327, {rs675, rs675}; +{ +mul.f16x2 r4325, r157, r4327; +} +{ +add.f16x2 r4328, r4304, r4325; +} +{ +cvt.rn.f16.f64 rs676, fd848; +} +mov.b32 r4333, {rs676, rs676}; +{ +mul.f16x2 r4331, r166, r4333; +} +{ +add.f16x2 r4334, r4310, r4331; +} +{ +cvt.rn.f16.f64 rs677, fd847; +} +mov.b32 r4339, {rs677, rs677}; +{ +mul.f16x2 r4337, r160, r4339; +} +{ +add.f16x2 r4340, r4316, r4337; +} +{ +cvt.rn.f16.f64 rs678, fd848; +} +mov.b32 r4345, {rs678, rs678}; +{ +mul.f16x2 r4343, r163, r4345; +} +{ +add.f16x2 r4346, r4322, r4343; +} +{ +cvt.rn.f16.f64 rs679, fd883; +} +mov.b32 r4351, {rs679, rs679}; +{ +mul.f16x2 r4349, r169, r4351; +} +{ +add.f16x2 r4352, r4328, r4349; +} +{ +cvt.rn.f16.f64 rs680, fd884; +} +mov.b32 r4357, {rs680, rs680}; +{ +mul.f16x2 r4355, r178, r4357; +} +{ +add.f16x2 r4358, r4334, r4355; +} +{ +cvt.rn.f16.f64 rs681, fd883; +} +mov.b32 r4363, {rs681, rs681}; +{ +mul.f16x2 r4361, r172, r4363; +} +{ +add.f16x2 r4364, r4340, r4361; +} +{ +cvt.rn.f16.f64 rs682, fd884; +} +mov.b32 r4369, {rs682, rs682}; +{ +mul.f16x2 r4367, r175, r4369; +} +{ +add.f16x2 r4370, r4346, r4367; +} +{ +sub.f16x2 %22, r4352, r4358; +} +{ +add.f16x2 %23, r4364, r4370; +} +{ +add.f16x2 %40, r4352, r4358; +} +{ +sub.f16x2 %41, r4364, r4370; +} +cvt.rn.f16.s32 rs683, r5508; +mov.b32 r4397, {rs683, rs683}; +cvt.rn.f16.s32 rs684, r5508; +mov.b32 r4409, {rs684, rs684}; +{ +cvt.rn.f16.f64 rs685, fd867; +} +mov.b32 r4389, {rs685, rs685}; +{ +mul.f16x2 r4387, r1, r4389; +} +{ +add.f16x2 r4390, %62, r4387; +} +{ +cvt.rn.f16.f64 rs686, fd868; +} +mov.b32 r4395, {rs686, rs686}; +{ +mul.f16x2 r4393, r10, r4395; +} +{ +add.f16x2 r4396, 
r4397, r4393; +} +{ +cvt.rn.f16.f64 rs687, fd867; +} +mov.b32 r4401, {rs687, rs687}; +{ +mul.f16x2 r4399, r4, r4401; +} +{ +add.f16x2 r4402, %63, r4399; +} +{ +cvt.rn.f16.f64 rs688, fd868; +} +mov.b32 r4407, {rs688, rs688}; +{ +mul.f16x2 r4405, r7, r4407; +} +{ +add.f16x2 r4408, r4409, r4405; +} +{ +cvt.rn.f16.f64 rs689, fd895; +} +mov.b32 r4413, {rs689, rs689}; +{ +mul.f16x2 r4411, r13, r4413; +} +{ +add.f16x2 r4414, r4390, r4411; +} +{ +cvt.rn.f16.f64 rs690, fd896; +} +mov.b32 r4419, {rs690, rs690}; +{ +mul.f16x2 r4417, r22, r4419; +} +{ +add.f16x2 r4420, r4396, r4417; +} +{ +cvt.rn.f16.f64 rs691, fd895; +} +mov.b32 r4425, {rs691, rs691}; +{ +mul.f16x2 r4423, r16, r4425; +} +{ +add.f16x2 r4426, r4402, r4423; +} +{ +cvt.rn.f16.f64 rs692, fd896; +} +mov.b32 r4431, {rs692, rs692}; +{ +mul.f16x2 r4429, r19, r4431; +} +{ +add.f16x2 r4432, r4408, r4429; +} +{ +cvt.rn.f16.f64 rs693, fd879; +} +mov.b32 r4437, {rs693, rs693}; +{ +mul.f16x2 r4435, r25, r4437; +} +{ +add.f16x2 r4438, r4414, r4435; +} +{ +cvt.rn.f16.f64 rs694, fd808; +} +mov.b32 r4443, {rs694, rs694}; +{ +mul.f16x2 r4441, r34, r4443; +} +{ +add.f16x2 r4444, r4420, r4441; +} +{ +cvt.rn.f16.f64 rs695, fd879; +} +mov.b32 r4449, {rs695, rs695}; +{ +mul.f16x2 r4447, r28, r4449; +} +{ +add.f16x2 r4450, r4426, r4447; +} +{ +cvt.rn.f16.f64 rs696, fd808; +} +mov.b32 r4455, {rs696, rs696}; +{ +mul.f16x2 r4453, r31, r4455; +} +{ +add.f16x2 r4456, r4432, r4453; +} +{ +cvt.rn.f16.f64 rs697, fd851; +} +mov.b32 r4461, {rs697, rs697}; +{ +mul.f16x2 r4459, r37, r4461; +} +{ +add.f16x2 r4462, r4438, r4459; +} +{ +cvt.rn.f16.f64 rs698, fd676; +} +mov.b32 r4467, {rs698, rs698}; +{ +mul.f16x2 r4465, r46, r4467; +} +{ +add.f16x2 r4468, r4444, r4465; +} +{ +cvt.rn.f16.f64 rs699, fd851; +} +mov.b32 r4473, {rs699, rs699}; +{ +mul.f16x2 r4471, r40, r4473; +} +{ +add.f16x2 r4474, r4450, r4471; +} +{ +cvt.rn.f16.f64 rs700, fd676; +} +mov.b32 r4479, {rs700, rs700}; +{ +mul.f16x2 r4477, r43, r4479; +} +{ +add.f16x2 r4480, r4456, r4477; 
+} +{ +cvt.rn.f16.f64 rs701, fd855; +} +mov.b32 r4485, {rs701, rs701}; +{ +mul.f16x2 r4483, r49, r4485; +} +{ +add.f16x2 r4486, r4462, r4483; +} +{ +cvt.rn.f16.f64 rs702, fd856; +} +mov.b32 r4491, {rs702, rs702}; +{ +mul.f16x2 r4489, r58, r4491; +} +{ +add.f16x2 r4492, r4468, r4489; +} +{ +cvt.rn.f16.f64 rs703, fd855; +} +mov.b32 r4497, {rs703, rs703}; +{ +mul.f16x2 r4495, r52, r4497; +} +{ +add.f16x2 r4498, r4474, r4495; +} +{ +cvt.rn.f16.f64 rs704, fd856; +} +mov.b32 r4503, {rs704, rs704}; +{ +mul.f16x2 r4501, r55, r4503; +} +{ +add.f16x2 r4504, r4480, r4501; +} +{ +cvt.rn.f16.f64 rs705, fd883; +} +mov.b32 r4509, {rs705, rs705}; +{ +mul.f16x2 r4507, r61, r4509; +} +{ +add.f16x2 r4510, r4486, r4507; +} +{ +cvt.rn.f16.f64 rs706, fd884; +} +mov.b32 r4515, {rs706, rs706}; +{ +mul.f16x2 r4513, r70, r4515; +} +{ +add.f16x2 r4516, r4492, r4513; +} +{ +cvt.rn.f16.f64 rs707, fd883; +} +mov.b32 r4521, {rs707, rs707}; +{ +mul.f16x2 r4519, r64, r4521; +} +{ +add.f16x2 r4522, r4498, r4519; +} +{ +cvt.rn.f16.f64 rs708, fd884; +} +mov.b32 r4527, {rs708, rs708}; +{ +mul.f16x2 r4525, r67, r4527; +} +{ +add.f16x2 r4528, r4504, r4525; +} +{ +cvt.rn.f16.f64 rs709, fd891; +} +mov.b32 r4533, {rs709, rs709}; +{ +mul.f16x2 r4531, r73, r4533; +} +{ +add.f16x2 r4534, r4510, r4531; +} +{ +cvt.rn.f16.f64 rs710, fd804; +} +mov.b32 r4539, {rs710, rs710}; +{ +mul.f16x2 r4537, r82, r4539; +} +{ +add.f16x2 r4540, r4516, r4537; +} +{ +cvt.rn.f16.f64 rs711, fd891; +} +mov.b32 r4545, {rs711, rs711}; +{ +mul.f16x2 r4543, r76, r4545; +} +{ +add.f16x2 r4546, r4522, r4543; +} +{ +cvt.rn.f16.f64 rs712, fd804; +} +mov.b32 r4551, {rs712, rs712}; +{ +mul.f16x2 r4549, r79, r4551; +} +{ +add.f16x2 r4552, r4528, r4549; +} +{ +cvt.rn.f16.f64 rs713, fd863; +} +mov.b32 r4557, {rs713, rs713}; +{ +mul.f16x2 r4555, r85, r4557; +} +{ +add.f16x2 r4558, r4534, r4555; +} +{ +cvt.rn.f16.f64 rs714, fd740; +} +mov.b32 r4563, {rs714, rs714}; +{ +mul.f16x2 r4561, r94, r4563; +} +{ +add.f16x2 r4564, r4540, r4561; +} +{ 
+cvt.rn.f16.f64 rs715, fd863; +} +mov.b32 r4569, {rs715, rs715}; +{ +mul.f16x2 r4567, r88, r4569; +} +{ +add.f16x2 r4570, r4546, r4567; +} +{ +cvt.rn.f16.f64 rs716, fd740; +} +mov.b32 r4575, {rs716, rs716}; +{ +mul.f16x2 r4573, r91, r4575; +} +{ +add.f16x2 r4576, r4552, r4573; +} +{ +cvt.rn.f16.f64 rs717, fd843; +} +mov.b32 r4581, {rs717, rs717}; +{ +mul.f16x2 r4579, r97, r4581; +} +{ +add.f16x2 r4582, r4558, r4579; +} +{ +cvt.rn.f16.f64 rs718, fd844; +} +mov.b32 r4587, {rs718, rs718}; +{ +mul.f16x2 r4585, r106, r4587; +} +{ +add.f16x2 r4588, r4564, r4585; +} +{ +cvt.rn.f16.f64 rs719, fd843; +} +mov.b32 r4593, {rs719, rs719}; +{ +mul.f16x2 r4591, r100, r4593; +} +{ +add.f16x2 r4594, r4570, r4591; +} +{ +cvt.rn.f16.f64 rs720, fd844; +} +mov.b32 r4599, {rs720, rs720}; +{ +mul.f16x2 r4597, r103, r4599; +} +{ +add.f16x2 r4600, r4576, r4597; +} +{ +cvt.rn.f16.f64 rs721, fd871; +} +mov.b32 r4605, {rs721, rs721}; +{ +mul.f16x2 r4603, r109, r4605; +} +{ +add.f16x2 r4606, r4582, r4603; +} +{ +cvt.rn.f16.f64 rs722, fd872; +} +mov.b32 r4611, {rs722, rs722}; +{ +mul.f16x2 r4609, r118, r4611; +} +{ +add.f16x2 r4612, r4588, r4609; +} +{ +cvt.rn.f16.f64 rs723, fd871; +} +mov.b32 r4617, {rs723, rs723}; +{ +mul.f16x2 r4615, r112, r4617; +} +{ +add.f16x2 r4618, r4594, r4615; +} +{ +cvt.rn.f16.f64 rs724, fd872; +} +mov.b32 r4623, {rs724, rs724}; +{ +mul.f16x2 r4621, r115, r4623; +} +{ +add.f16x2 r4624, r4600, r4621; +} +{ +cvt.rn.f16.f64 rs725, fd899; +} +mov.b32 r4629, {rs725, rs725}; +{ +mul.f16x2 r4627, r121, r4629; +} +{ +add.f16x2 r4630, r4606, r4627; +} +{ +cvt.rn.f16.f64 rs726, fd900; +} +mov.b32 r4635, {rs726, rs726}; +{ +mul.f16x2 r4633, r130, r4635; +} +{ +add.f16x2 r4636, r4612, r4633; +} +{ +cvt.rn.f16.f64 rs727, fd899; +} +mov.b32 r4641, {rs727, rs727}; +{ +mul.f16x2 r4639, r124, r4641; +} +{ +add.f16x2 r4642, r4618, r4639; +} +{ +cvt.rn.f16.f64 rs728, fd900; +} +mov.b32 r4647, {rs728, rs728}; +{ +mul.f16x2 r4645, r127, r4647; +} +{ +add.f16x2 r4648, r4624, r4645; +} +{ 
+cvt.rn.f16.f64 rs729, fd875; +} +mov.b32 r4653, {rs729, rs729}; +{ +mul.f16x2 r4651, r133, r4653; +} +{ +add.f16x2 r4654, r4630, r4651; +} +{ +cvt.rn.f16.f64 rs730, fd708; +} +mov.b32 r4659, {rs730, rs730}; +{ +mul.f16x2 r4657, r142, r4659; +} +{ +add.f16x2 r4660, r4636, r4657; +} +{ +cvt.rn.f16.f64 rs731, fd875; +} +mov.b32 r4665, {rs731, rs731}; +{ +mul.f16x2 r4663, r136, r4665; +} +{ +add.f16x2 r4666, r4642, r4663; +} +{ +cvt.rn.f16.f64 rs732, fd708; +} +mov.b32 r4671, {rs732, rs732}; +{ +mul.f16x2 r4669, r139, r4671; +} +{ +add.f16x2 r4672, r4648, r4669; +} +{ +cvt.rn.f16.f64 rs733, fd847; +} +mov.b32 r4677, {rs733, rs733}; +{ +mul.f16x2 r4675, r145, r4677; +} +{ +add.f16x2 r4678, r4654, r4675; +} +{ +cvt.rn.f16.f64 rs734, fd768; +} +mov.b32 r4683, {rs734, rs734}; +{ +mul.f16x2 r4681, r154, r4683; +} +{ +add.f16x2 r4684, r4660, r4681; +} +{ +cvt.rn.f16.f64 rs735, fd847; +} +mov.b32 r4689, {rs735, rs735}; +{ +mul.f16x2 r4687, r148, r4689; +} +{ +add.f16x2 r4690, r4666, r4687; +} +{ +cvt.rn.f16.f64 rs736, fd768; +} +mov.b32 r4695, {rs736, rs736}; +{ +mul.f16x2 r4693, r151, r4695; +} +{ +add.f16x2 r4696, r4672, r4693; +} +{ +cvt.rn.f16.f64 rs737, fd859; +} +mov.b32 r4701, {rs737, rs737}; +{ +mul.f16x2 r4699, r157, r4701; +} +{ +add.f16x2 r4702, r4678, r4699; +} +{ +cvt.rn.f16.f64 rs738, fd860; +} +mov.b32 r4707, {rs738, rs738}; +{ +mul.f16x2 r4705, r166, r4707; +} +{ +add.f16x2 r4708, r4684, r4705; +} +{ +cvt.rn.f16.f64 rs739, fd859; +} +mov.b32 r4713, {rs739, rs739}; +{ +mul.f16x2 r4711, r160, r4713; +} +{ +add.f16x2 r4714, r4690, r4711; +} +{ +cvt.rn.f16.f64 rs740, fd860; +} +mov.b32 r4719, {rs740, rs740}; +{ +mul.f16x2 r4717, r163, r4719; +} +{ +add.f16x2 r4720, r4696, r4717; +} +{ +cvt.rn.f16.f64 rs741, fd887; +} +mov.b32 r4725, {rs741, rs741}; +{ +mul.f16x2 r4723, r169, r4725; +} +{ +add.f16x2 r4726, r4702, r4723; +} +{ +cvt.rn.f16.f64 rs742, fd888; +} +mov.b32 r4731, {rs742, rs742}; +{ +mul.f16x2 r4729, r178, r4731; +} +{ +add.f16x2 r4732, r4708, r4729; +} 
+{ +cvt.rn.f16.f64 rs743, fd887; +} +mov.b32 r4737, {rs743, rs743}; +{ +mul.f16x2 r4735, r172, r4737; +} +{ +add.f16x2 r4738, r4714, r4735; +} +{ +cvt.rn.f16.f64 rs744, fd888; +} +mov.b32 r4743, {rs744, rs744}; +{ +mul.f16x2 r4741, r175, r4743; +} +{ +add.f16x2 r4744, r4720, r4741; +} +{ +sub.f16x2 %24, r4726, r4732; +} +{ +add.f16x2 %25, r4738, r4744; +} +{ +add.f16x2 %38, r4726, r4732; +} +{ +sub.f16x2 %39, r4738, r4744; +} +cvt.rn.f16.s32 rs745, r5508; +mov.b32 r4771, {rs745, rs745}; +cvt.rn.f16.s32 rs746, r5508; +mov.b32 r4783, {rs746, rs746}; +{ +cvt.rn.f16.f64 rs747, fd859; +} +mov.b32 r4763, {rs747, rs747}; +{ +mul.f16x2 r4761, r1, r4763; +} +{ +add.f16x2 r4764, %62, r4761; +} +{ +cvt.rn.f16.f64 rs748, fd860; +} +mov.b32 r4769, {rs748, rs748}; +{ +mul.f16x2 r4767, r10, r4769; +} +{ +add.f16x2 r4770, r4771, r4767; +} +{ +cvt.rn.f16.f64 rs749, fd859; +} +mov.b32 r4775, {rs749, rs749}; +{ +mul.f16x2 r4773, r4, r4775; +} +{ +add.f16x2 r4776, %63, r4773; +} +{ +cvt.rn.f16.f64 rs750, fd860; +} +mov.b32 r4781, {rs750, rs750}; +{ +mul.f16x2 r4779, r7, r4781; +} +{ +add.f16x2 r4782, r4783, r4779; +} +{ +cvt.rn.f16.f64 rs751, fd879; +} +mov.b32 r4787, {rs751, rs751}; +{ +mul.f16x2 r4785, r13, r4787; +} +{ +add.f16x2 r4788, r4764, r4785; +} +{ +cvt.rn.f16.f64 rs752, fd880; +} +mov.b32 r4793, {rs752, rs752}; +{ +mul.f16x2 r4791, r22, r4793; +} +{ +add.f16x2 r4794, r4770, r4791; +} +{ +cvt.rn.f16.f64 rs753, fd879; +} +mov.b32 r4799, {rs753, rs753}; +{ +mul.f16x2 r4797, r16, r4799; +} +{ +add.f16x2 r4800, r4776, r4797; +} +{ +cvt.rn.f16.f64 rs754, fd880; +} +mov.b32 r4805, {rs754, rs754}; +{ +mul.f16x2 r4803, r19, r4805; +} +{ +add.f16x2 r4806, r4782, r4803; +} +{ +cvt.rn.f16.f64 rs755, fd899; +} +mov.b32 r4811, {rs755, rs755}; +{ +mul.f16x2 r4809, r25, r4811; +} +{ +add.f16x2 r4812, r4788, r4809; +} +{ +cvt.rn.f16.f64 rs756, fd900; +} +mov.b32 r4817, {rs756, rs756}; +{ +mul.f16x2 r4815, r34, r4817; +} +{ +add.f16x2 r4818, r4794, r4815; +} +{ +cvt.rn.f16.f64 rs757, fd899; 
+} +mov.b32 r4823, {rs757, rs757}; +{ +mul.f16x2 r4821, r28, r4823; +} +{ +add.f16x2 r4824, r4800, r4821; +} +{ +cvt.rn.f16.f64 rs758, fd900; +} +mov.b32 r4829, {rs758, rs758}; +{ +mul.f16x2 r4827, r31, r4829; +} +{ +add.f16x2 r4830, r4806, r4827; +} +{ +cvt.rn.f16.f64 rs759, fd883; +} +mov.b32 r4835, {rs759, rs759}; +{ +mul.f16x2 r4833, r37, r4835; +} +{ +add.f16x2 r4836, r4812, r4833; +} +{ +cvt.rn.f16.f64 rs760, fd736; +} +mov.b32 r4841, {rs760, rs760}; +{ +mul.f16x2 r4839, r46, r4841; +} +{ +add.f16x2 r4842, r4818, r4839; +} +{ +cvt.rn.f16.f64 rs761, fd883; +} +mov.b32 r4847, {rs761, rs761}; +{ +mul.f16x2 r4845, r40, r4847; +} +{ +add.f16x2 r4848, r4824, r4845; +} +{ +cvt.rn.f16.f64 rs762, fd736; +} +mov.b32 r4853, {rs762, rs762}; +{ +mul.f16x2 r4851, r43, r4853; +} +{ +add.f16x2 r4854, r4830, r4851; +} +{ +cvt.rn.f16.f64 rs763, fd863; +} +mov.b32 r4859, {rs763, rs763}; +{ +mul.f16x2 r4857, r49, r4859; +} +{ +add.f16x2 r4860, r4836, r4857; +} +{ +cvt.rn.f16.f64 rs764, fd740; +} +mov.b32 r4865, {rs764, rs764}; +{ +mul.f16x2 r4863, r58, r4865; +} +{ +add.f16x2 r4866, r4842, r4863; +} +{ +cvt.rn.f16.f64 rs765, fd863; +} +mov.b32 r4871, {rs765, rs765}; +{ +mul.f16x2 r4869, r52, r4871; +} +{ +add.f16x2 r4872, r4848, r4869; +} +{ +cvt.rn.f16.f64 rs766, fd740; +} +mov.b32 r4877, {rs766, rs766}; +{ +mul.f16x2 r4875, r55, r4877; +} +{ +add.f16x2 r4878, r4854, r4875; +} +{ +cvt.rn.f16.f64 rs767, fd843; +} +mov.b32 r4883, {rs767, rs767}; +{ +mul.f16x2 r4881, r61, r4883; +} +{ +add.f16x2 r4884, r4860, r4881; +} +{ +cvt.rn.f16.f64 rs768, fd820; +} +mov.b32 r4889, {rs768, rs768}; +{ +mul.f16x2 r4887, r70, r4889; +} +{ +add.f16x2 r4890, r4866, r4887; +} +{ +cvt.rn.f16.f64 rs769, fd843; +} +mov.b32 r4895, {rs769, rs769}; +{ +mul.f16x2 r4893, r64, r4895; +} +{ +add.f16x2 r4896, r4872, r4893; +} +{ +cvt.rn.f16.f64 rs770, fd820; +} +mov.b32 r4901, {rs770, rs770}; +{ +mul.f16x2 r4899, r67, r4901; +} +{ +add.f16x2 r4902, r4878, r4899; +} +{ +cvt.rn.f16.f64 rs771, fd855; +} +mov.b32 
r4907, {rs771, rs771}; +{ +mul.f16x2 r4905, r73, r4907; +} +{ +add.f16x2 r4908, r4884, r4905; +} +{ +cvt.rn.f16.f64 rs772, fd856; +} +mov.b32 r4913, {rs772, rs772}; +{ +mul.f16x2 r4911, r82, r4913; +} +{ +add.f16x2 r4914, r4890, r4911; +} +{ +cvt.rn.f16.f64 rs773, fd855; +} +mov.b32 r4919, {rs773, rs773}; +{ +mul.f16x2 r4917, r76, r4919; +} +{ +add.f16x2 r4920, r4896, r4917; +} +{ +cvt.rn.f16.f64 rs774, fd856; +} +mov.b32 r4925, {rs774, rs774}; +{ +mul.f16x2 r4923, r79, r4925; +} +{ +add.f16x2 r4926, r4902, r4923; +} +{ +cvt.rn.f16.f64 rs775, fd875; +} +mov.b32 r4931, {rs775, rs775}; +{ +mul.f16x2 r4929, r85, r4931; +} +{ +add.f16x2 r4932, r4908, r4929; +} +{ +cvt.rn.f16.f64 rs776, fd876; +} +mov.b32 r4937, {rs776, rs776}; +{ +mul.f16x2 r4935, r94, r4937; +} +{ +add.f16x2 r4938, r4914, r4935; +} +{ +cvt.rn.f16.f64 rs777, fd875; +} +mov.b32 r4943, {rs777, rs777}; +{ +mul.f16x2 r4941, r88, r4943; +} +{ +add.f16x2 r4944, r4920, r4941; +} +{ +cvt.rn.f16.f64 rs778, fd876; +} +mov.b32 r4949, {rs778, rs778}; +{ +mul.f16x2 r4947, r91, r4949; +} +{ +add.f16x2 r4950, r4926, r4947; +} +{ +cvt.rn.f16.f64 rs779, fd895; +} +mov.b32 r4955, {rs779, rs779}; +{ +mul.f16x2 r4953, r97, r4955; +} +{ +add.f16x2 r4956, r4932, r4953; +} +{ +cvt.rn.f16.f64 rs780, fd896; +} +mov.b32 r4961, {rs780, rs780}; +{ +mul.f16x2 r4959, r106, r4961; +} +{ +add.f16x2 r4962, r4938, r4959; +} +{ +cvt.rn.f16.f64 rs781, fd895; +} +mov.b32 r4967, {rs781, rs781}; +{ +mul.f16x2 r4965, r100, r4967; +} +{ +add.f16x2 r4968, r4944, r4965; +} +{ +cvt.rn.f16.f64 rs782, fd896; +} +mov.b32 r4973, {rs782, rs782}; +{ +mul.f16x2 r4971, r103, r4973; +} +{ +add.f16x2 r4974, r4950, r4971; +} +{ +cvt.rn.f16.f64 rs783, fd887; +} +mov.b32 r4979, {rs783, rs783}; +{ +mul.f16x2 r4977, r109, r4979; +} +{ +add.f16x2 r4980, r4956, r4977; +} +{ +cvt.rn.f16.f64 rs784, fd760; +} +mov.b32 r4985, {rs784, rs784}; +{ +mul.f16x2 r4983, r118, r4985; +} +{ +add.f16x2 r4986, r4962, r4983; +} +{ +cvt.rn.f16.f64 rs785, fd887; +} +mov.b32 r4991, 
{rs785, rs785}; +{ +mul.f16x2 r4989, r112, r4991; +} +{ +add.f16x2 r4992, r4968, r4989; +} +{ +cvt.rn.f16.f64 rs786, fd760; +} +mov.b32 r4997, {rs786, rs786}; +{ +mul.f16x2 r4995, r115, r4997; +} +{ +add.f16x2 r4998, r4974, r4995; +} +{ +cvt.rn.f16.f64 rs787, fd867; +} +mov.b32 r5003, {rs787, rs787}; +{ +mul.f16x2 r5001, r121, r5003; +} +{ +add.f16x2 r5004, r4980, r5001; +} +{ +cvt.rn.f16.f64 rs788, fd812; +} +mov.b32 r5009, {rs788, rs788}; +{ +mul.f16x2 r5007, r130, r5009; +} +{ +add.f16x2 r5010, r4986, r5007; +} +{ +cvt.rn.f16.f64 rs789, fd867; +} +mov.b32 r5015, {rs789, rs789}; +{ +mul.f16x2 r5013, r124, r5015; +} +{ +add.f16x2 r5016, r4992, r5013; +} +{ +cvt.rn.f16.f64 rs790, fd812; +} +mov.b32 r5021, {rs790, rs790}; +{ +mul.f16x2 r5019, r127, r5021; +} +{ +add.f16x2 r5022, r4998, r5019; +} +{ +cvt.rn.f16.f64 rs791, fd847; +} +mov.b32 r5027, {rs791, rs791}; +{ +mul.f16x2 r5025, r133, r5027; +} +{ +add.f16x2 r5028, r5004, r5025; +} +{ +cvt.rn.f16.f64 rs792, fd768; +} +mov.b32 r5033, {rs792, rs792}; +{ +mul.f16x2 r5031, r142, r5033; +} +{ +add.f16x2 r5034, r5010, r5031; +} +{ +cvt.rn.f16.f64 rs793, fd847; +} +mov.b32 r5039, {rs793, rs793}; +{ +mul.f16x2 r5037, r136, r5039; +} +{ +add.f16x2 r5040, r5016, r5037; +} +{ +cvt.rn.f16.f64 rs794, fd768; +} +mov.b32 r5045, {rs794, rs794}; +{ +mul.f16x2 r5043, r139, r5045; +} +{ +add.f16x2 r5046, r5022, r5043; +} +{ +cvt.rn.f16.f64 rs795, fd851; +} +mov.b32 r5051, {rs795, rs795}; +{ +mul.f16x2 r5049, r145, r5051; +} +{ +add.f16x2 r5052, r5028, r5049; +} +{ +cvt.rn.f16.f64 rs796, fd852; +} +mov.b32 r5057, {rs796, rs796}; +{ +mul.f16x2 r5055, r154, r5057; +} +{ +add.f16x2 r5058, r5034, r5055; +} +{ +cvt.rn.f16.f64 rs797, fd851; +} +mov.b32 r5063, {rs797, rs797}; +{ +mul.f16x2 r5061, r148, r5063; +} +{ +add.f16x2 r5064, r5040, r5061; +} +{ +cvt.rn.f16.f64 rs798, fd852; +} +mov.b32 r5069, {rs798, rs798}; +{ +mul.f16x2 r5067, r151, r5069; +} +{ +add.f16x2 r5070, r5046, r5067; +} +{ +cvt.rn.f16.f64 rs799, fd871; +} +mov.b32 
r5075, {rs799, rs799}; +{ +mul.f16x2 r5073, r157, r5075; +} +{ +add.f16x2 r5076, r5052, r5073; +} +{ +cvt.rn.f16.f64 rs800, fd872; +} +mov.b32 r5081, {rs800, rs800}; +{ +mul.f16x2 r5079, r166, r5081; +} +{ +add.f16x2 r5082, r5058, r5079; +} +{ +cvt.rn.f16.f64 rs801, fd871; +} +mov.b32 r5087, {rs801, rs801}; +{ +mul.f16x2 r5085, r160, r5087; +} +{ +add.f16x2 r5088, r5064, r5085; +} +{ +cvt.rn.f16.f64 rs802, fd872; +} +mov.b32 r5093, {rs802, rs802}; +{ +mul.f16x2 r5091, r163, r5093; +} +{ +add.f16x2 r5094, r5070, r5091; +} +{ +cvt.rn.f16.f64 rs803, fd891; +} +mov.b32 r5099, {rs803, rs803}; +{ +mul.f16x2 r5097, r169, r5099; +} +{ +add.f16x2 r5100, r5076, r5097; +} +{ +cvt.rn.f16.f64 rs804, fd892; +} +mov.b32 r5105, {rs804, rs804}; +{ +mul.f16x2 r5103, r178, r5105; +} +{ +add.f16x2 r5106, r5082, r5103; +} +{ +cvt.rn.f16.f64 rs805, fd891; +} +mov.b32 r5111, {rs805, rs805}; +{ +mul.f16x2 r5109, r172, r5111; +} +{ +add.f16x2 r5112, r5088, r5109; +} +{ +cvt.rn.f16.f64 rs806, fd892; +} +mov.b32 r5117, {rs806, rs806}; +{ +mul.f16x2 r5115, r175, r5117; +} +{ +add.f16x2 r5118, r5094, r5115; +} +{ +sub.f16x2 %26, r5100, r5106; +} +{ +add.f16x2 %27, r5112, r5118; +} +{ +add.f16x2 %36, r5100, r5106; +} +{ +sub.f16x2 %37, r5112, r5118; +} +cvt.rn.f16.s32 rs807, r5508; +mov.b32 r5145, {rs807, rs807}; +cvt.rn.f16.s32 rs808, r5508; +mov.b32 r5157, {rs808, rs808}; +{ +cvt.rn.f16.f64 rs809, fd851; +} +mov.b32 r5137, {rs809, rs809}; +{ +mul.f16x2 r5135, r1, r5137; +} +{ +add.f16x2 r5138, %62, r5135; +} +{ +cvt.rn.f16.f64 rs810, fd852; +} +mov.b32 r5143, {rs810, rs810}; +{ +mul.f16x2 r5141, r10, r5143; +} +{ +add.f16x2 r5144, r5145, r5141; +} +{ +cvt.rn.f16.f64 rs811, fd851; +} +mov.b32 r5149, {rs811, rs811}; +{ +mul.f16x2 r5147, r4, r5149; +} +{ +add.f16x2 r5150, %63, r5147; +} +{ +cvt.rn.f16.f64 rs812, fd852; +} +mov.b32 r5155, {rs812, rs812}; +{ +mul.f16x2 r5153, r7, r5155; +} +{ +add.f16x2 r5156, r5157, r5153; +} +{ +cvt.rn.f16.f64 rs813, fd863; +} +mov.b32 r5161, {rs813, rs813}; +{ 
+mul.f16x2 r5159, r13, r5161; +} +{ +add.f16x2 r5162, r5138, r5159; +} +{ +cvt.rn.f16.f64 rs814, fd864; +} +mov.b32 r5167, {rs814, rs814}; +{ +mul.f16x2 r5165, r22, r5167; +} +{ +add.f16x2 r5168, r5144, r5165; +} +{ +cvt.rn.f16.f64 rs815, fd863; +} +mov.b32 r5173, {rs815, rs815}; +{ +mul.f16x2 r5171, r16, r5173; +} +{ +add.f16x2 r5174, r5150, r5171; +} +{ +cvt.rn.f16.f64 rs816, fd864; +} +mov.b32 r5179, {rs816, rs816}; +{ +mul.f16x2 r5177, r19, r5179; +} +{ +add.f16x2 r5180, r5156, r5177; +} +{ +cvt.rn.f16.f64 rs817, fd875; +} +mov.b32 r5185, {rs817, rs817}; +{ +mul.f16x2 r5183, r25, r5185; +} +{ +add.f16x2 r5186, r5162, r5183; +} +{ +cvt.rn.f16.f64 rs818, fd876; +} +mov.b32 r5191, {rs818, rs818}; +{ +mul.f16x2 r5189, r34, r5191; +} +{ +add.f16x2 r5192, r5168, r5189; +} +{ +cvt.rn.f16.f64 rs819, fd875; +} +mov.b32 r5197, {rs819, rs819}; +{ +mul.f16x2 r5195, r28, r5197; +} +{ +add.f16x2 r5198, r5174, r5195; +} +{ +cvt.rn.f16.f64 rs820, fd876; +} +mov.b32 r5203, {rs820, rs820}; +{ +mul.f16x2 r5201, r31, r5203; +} +{ +add.f16x2 r5204, r5180, r5201; +} +{ +cvt.rn.f16.f64 rs821, fd887; +} +mov.b32 r5209, {rs821, rs821}; +{ +mul.f16x2 r5207, r37, r5209; +} +{ +add.f16x2 r5210, r5186, r5207; +} +{ +cvt.rn.f16.f64 rs822, fd888; +} +mov.b32 r5215, {rs822, rs822}; +{ +mul.f16x2 r5213, r46, r5215; +} +{ +add.f16x2 r5216, r5192, r5213; +} +{ +cvt.rn.f16.f64 rs823, fd887; +} +mov.b32 r5221, {rs823, rs823}; +{ +mul.f16x2 r5219, r40, r5221; +} +{ +add.f16x2 r5222, r5198, r5219; +} +{ +cvt.rn.f16.f64 rs824, fd888; +} +mov.b32 r5227, {rs824, rs824}; +{ +mul.f16x2 r5225, r43, r5227; +} +{ +add.f16x2 r5228, r5204, r5225; +} +{ +cvt.rn.f16.f64 rs825, fd899; +} +mov.b32 r5233, {rs825, rs825}; +{ +mul.f16x2 r5231, r49, r5233; +} +{ +add.f16x2 r5234, r5210, r5231; +} +{ +cvt.rn.f16.f64 rs826, fd900; +} +mov.b32 r5239, {rs826, rs826}; +{ +mul.f16x2 r5237, r58, r5239; +} +{ +add.f16x2 r5240, r5216, r5237; +} +{ +cvt.rn.f16.f64 rs827, fd899; +} +mov.b32 r5245, {rs827, rs827}; +{ +mul.f16x2 
r5243, r52, r5245; +} +{ +add.f16x2 r5246, r5222, r5243; +} +{ +cvt.rn.f16.f64 rs828, fd900; +} +mov.b32 r5251, {rs828, rs828}; +{ +mul.f16x2 r5249, r55, r5251; +} +{ +add.f16x2 r5252, r5228, r5249; +} +{ +cvt.rn.f16.f64 rs829, fd891; +} +mov.b32 r5257, {rs829, rs829}; +{ +mul.f16x2 r5255, r61, r5257; +} +{ +add.f16x2 r5258, r5234, r5255; +} +{ +cvt.rn.f16.f64 rs830, fd804; +} +mov.b32 r5263, {rs830, rs830}; +{ +mul.f16x2 r5261, r70, r5263; +} +{ +add.f16x2 r5264, r5240, r5261; +} +{ +cvt.rn.f16.f64 rs831, fd891; +} +mov.b32 r5269, {rs831, rs831}; +{ +mul.f16x2 r5267, r64, r5269; +} +{ +add.f16x2 r5270, r5246, r5267; +} +{ +cvt.rn.f16.f64 rs832, fd804; +} +mov.b32 r5275, {rs832, rs832}; +{ +mul.f16x2 r5273, r67, r5275; +} +{ +add.f16x2 r5276, r5252, r5273; +} +{ +cvt.rn.f16.f64 rs833, fd879; +} +mov.b32 r5281, {rs833, rs833}; +{ +mul.f16x2 r5279, r73, r5281; +} +{ +add.f16x2 r5282, r5258, r5279; +} +{ +cvt.rn.f16.f64 rs834, fd808; +} +mov.b32 r5287, {rs834, rs834}; +{ +mul.f16x2 r5285, r82, r5287; +} +{ +add.f16x2 r5288, r5264, r5285; +} +{ +cvt.rn.f16.f64 rs835, fd879; +} +mov.b32 r5293, {rs835, rs835}; +{ +mul.f16x2 r5291, r76, r5293; +} +{ +add.f16x2 r5294, r5270, r5291; +} +{ +cvt.rn.f16.f64 rs836, fd808; +} +mov.b32 r5299, {rs836, rs836}; +{ +mul.f16x2 r5297, r79, r5299; +} +{ +add.f16x2 r5300, r5276, r5297; +} +{ +cvt.rn.f16.f64 rs837, fd867; +} +mov.b32 r5305, {rs837, rs837}; +{ +mul.f16x2 r5303, r85, r5305; +} +{ +add.f16x2 r5306, r5282, r5303; +} +{ +cvt.rn.f16.f64 rs838, fd812; +} +mov.b32 r5311, {rs838, rs838}; +{ +mul.f16x2 r5309, r94, r5311; +} +{ +add.f16x2 r5312, r5288, r5309; +} +{ +cvt.rn.f16.f64 rs839, fd867; +} +mov.b32 r5317, {rs839, rs839}; +{ +mul.f16x2 r5315, r88, r5317; +} +{ +add.f16x2 r5318, r5294, r5315; +} +{ +cvt.rn.f16.f64 rs840, fd812; +} +mov.b32 r5323, {rs840, rs840}; +{ +mul.f16x2 r5321, r91, r5323; +} +{ +add.f16x2 r5324, r5300, r5321; +} +{ +cvt.rn.f16.f64 rs841, fd855; +} +mov.b32 r5329, {rs841, rs841}; +{ +mul.f16x2 r5327, r97, 
r5329; +} +{ +add.f16x2 r5330, r5306, r5327; +} +{ +cvt.rn.f16.f64 rs842, fd816; +} +mov.b32 r5335, {rs842, rs842}; +{ +mul.f16x2 r5333, r106, r5335; +} +{ +add.f16x2 r5336, r5312, r5333; +} +{ +cvt.rn.f16.f64 rs843, fd855; +} +mov.b32 r5341, {rs843, rs843}; +{ +mul.f16x2 r5339, r100, r5341; +} +{ +add.f16x2 r5342, r5318, r5339; +} +{ +cvt.rn.f16.f64 rs844, fd816; +} +mov.b32 r5347, {rs844, rs844}; +{ +mul.f16x2 r5345, r103, r5347; +} +{ +add.f16x2 r5348, r5324, r5345; +} +{ +cvt.rn.f16.f64 rs845, fd843; +} +mov.b32 r5353, {rs845, rs845}; +{ +mul.f16x2 r5351, r109, r5353; +} +{ +add.f16x2 r5354, r5330, r5351; +} +{ +cvt.rn.f16.f64 rs846, fd820; +} +mov.b32 r5359, {rs846, rs846}; +{ +mul.f16x2 r5357, r118, r5359; +} +{ +add.f16x2 r5360, r5336, r5357; +} +{ +cvt.rn.f16.f64 rs847, fd843; +} +mov.b32 r5365, {rs847, rs847}; +{ +mul.f16x2 r5363, r112, r5365; +} +{ +add.f16x2 r5366, r5342, r5363; +} +{ +cvt.rn.f16.f64 rs848, fd820; +} +mov.b32 r5371, {rs848, rs848}; +{ +mul.f16x2 r5369, r115, r5371; +} +{ +add.f16x2 r5372, r5348, r5369; +} +{ +cvt.rn.f16.f64 rs849, fd847; +} +mov.b32 r5377, {rs849, rs849}; +{ +mul.f16x2 r5375, r121, r5377; +} +{ +add.f16x2 r5378, r5354, r5375; +} +{ +cvt.rn.f16.f64 rs850, fd848; +} +mov.b32 r5383, {rs850, rs850}; +{ +mul.f16x2 r5381, r130, r5383; +} +{ +add.f16x2 r5384, r5360, r5381; +} +{ +cvt.rn.f16.f64 rs851, fd847; +} +mov.b32 r5389, {rs851, rs851}; +{ +mul.f16x2 r5387, r124, r5389; +} +{ +add.f16x2 r5390, r5366, r5387; +} +{ +cvt.rn.f16.f64 rs852, fd848; +} +mov.b32 r5395, {rs852, rs852}; +{ +mul.f16x2 r5393, r127, r5395; +} +{ +add.f16x2 r5396, r5372, r5393; +} +{ +cvt.rn.f16.f64 rs853, fd859; +} +mov.b32 r5401, {rs853, rs853}; +{ +mul.f16x2 r5399, r133, r5401; +} +{ +add.f16x2 r5402, r5378, r5399; +} +{ +cvt.rn.f16.f64 rs854, fd860; +} +mov.b32 r5407, {rs854, rs854}; +{ +mul.f16x2 r5405, r142, r5407; +} +{ +add.f16x2 r5408, r5384, r5405; +} +{ +cvt.rn.f16.f64 rs855, fd859; +} +mov.b32 r5413, {rs855, rs855}; +{ +mul.f16x2 r5411, 
r136, r5413; +} +{ +add.f16x2 r5414, r5390, r5411; +} +{ +cvt.rn.f16.f64 rs856, fd860; +} +mov.b32 r5419, {rs856, rs856}; +{ +mul.f16x2 r5417, r139, r5419; +} +{ +add.f16x2 r5420, r5396, r5417; +} +{ +cvt.rn.f16.f64 rs857, fd871; +} +mov.b32 r5425, {rs857, rs857}; +{ +mul.f16x2 r5423, r145, r5425; +} +{ +add.f16x2 r5426, r5402, r5423; +} +{ +cvt.rn.f16.f64 rs858, fd872; +} +mov.b32 r5431, {rs858, rs858}; +{ +mul.f16x2 r5429, r154, r5431; +} +{ +add.f16x2 r5432, r5408, r5429; +} +{ +cvt.rn.f16.f64 rs859, fd871; +} +mov.b32 r5437, {rs859, rs859}; +{ +mul.f16x2 r5435, r148, r5437; +} +{ +add.f16x2 r5438, r5414, r5435; +} +{ +cvt.rn.f16.f64 rs860, fd872; +} +mov.b32 r5443, {rs860, rs860}; +{ +mul.f16x2 r5441, r151, r5443; +} +{ +add.f16x2 r5444, r5420, r5441; +} +{ +cvt.rn.f16.f64 rs861, fd883; +} +mov.b32 r5449, {rs861, rs861}; +{ +mul.f16x2 r5447, r157, r5449; +} +{ +add.f16x2 r5450, r5426, r5447; +} +{ +cvt.rn.f16.f64 rs862, fd884; +} +mov.b32 r5455, {rs862, rs862}; +{ +mul.f16x2 r5453, r166, r5455; +} +{ +add.f16x2 r5456, r5432, r5453; +} +{ +cvt.rn.f16.f64 rs863, fd883; +} +mov.b32 r5461, {rs863, rs863}; +{ +mul.f16x2 r5459, r160, r5461; +} +{ +add.f16x2 r5462, r5438, r5459; +} +{ +cvt.rn.f16.f64 rs864, fd884; +} +mov.b32 r5467, {rs864, rs864}; +{ +mul.f16x2 r5465, r163, r5467; +} +{ +add.f16x2 r5468, r5444, r5465; +} +{ +cvt.rn.f16.f64 rs865, fd895; +} +mov.b32 r5473, {rs865, rs865}; +{ +mul.f16x2 r5471, r169, r5473; +} +{ +add.f16x2 r5474, r5450, r5471; +} +{ +cvt.rn.f16.f64 rs866, fd896; +} +mov.b32 r5479, {rs866, rs866}; +{ +mul.f16x2 r5477, r178, r5479; +} +{ +add.f16x2 r5480, r5456, r5477; +} +{ +cvt.rn.f16.f64 rs867, fd895; +} +mov.b32 r5485, {rs867, rs867}; +{ +mul.f16x2 r5483, r172, r5485; +} +{ +add.f16x2 r5486, r5462, r5483; +} +{ +cvt.rn.f16.f64 rs868, fd896; +} +mov.b32 r5491, {rs868, rs868}; +{ +mul.f16x2 r5489, r175, r5491; +} +{ +add.f16x2 r5492, r5468, r5489; +} +{ +sub.f16x2 %28, r5474, r5480; +} +{ +add.f16x2 %29, r5486, r5492; +} +{ +add.f16x2 
%34, r5474, r5480; +} +{ +sub.f16x2 %35, r5486, r5492; +} +cvt.rn.f16.s32 rs869, r5508; +mov.b32 r5519, {rs869, rs869}; +cvt.rn.f16.s32 rs870, r5508; +mov.b32 r5531, {rs870, rs870}; +{ +cvt.rn.f16.f64 rs871, fd843; +} +mov.b32 r5511, {rs871, rs871}; +{ +mul.f16x2 r5509, r1, r5511; +} +{ +add.f16x2 r5512, %62, r5509; +} +{ +cvt.rn.f16.f64 rs872, fd844; +} +mov.b32 r5517, {rs872, rs872}; +{ +mul.f16x2 r5515, r10, r5517; +} +{ +add.f16x2 r5518, r5519, r5515; +} +{ +cvt.rn.f16.f64 rs873, fd843; +} +mov.b32 r5523, {rs873, rs873}; +{ +mul.f16x2 r5521, r4, r5523; +} +{ +add.f16x2 r5524, %63, r5521; +} +{ +cvt.rn.f16.f64 rs874, fd844; +} +mov.b32 r5529, {rs874, rs874}; +{ +mul.f16x2 r5527, r7, r5529; +} +{ +add.f16x2 r5530, r5531, r5527; +} +{ +cvt.rn.f16.f64 rs875, fd847; +} +mov.b32 r5535, {rs875, rs875}; +{ +mul.f16x2 r5533, r13, r5535; +} +{ +add.f16x2 r5536, r5512, r5533; +} +{ +cvt.rn.f16.f64 rs876, fd848; +} +mov.b32 r5541, {rs876, rs876}; +{ +mul.f16x2 r5539, r22, r5541; +} +{ +add.f16x2 r5542, r5518, r5539; +} +{ +cvt.rn.f16.f64 rs877, fd847; +} +mov.b32 r5547, {rs877, rs877}; +{ +mul.f16x2 r5545, r16, r5547; +} +{ +add.f16x2 r5548, r5524, r5545; +} +{ +cvt.rn.f16.f64 rs878, fd848; +} +mov.b32 r5553, {rs878, rs878}; +{ +mul.f16x2 r5551, r19, r5553; +} +{ +add.f16x2 r5554, r5530, r5551; +} +{ +cvt.rn.f16.f64 rs879, fd851; +} +mov.b32 r5559, {rs879, rs879}; +{ +mul.f16x2 r5557, r25, r5559; +} +{ +add.f16x2 r5560, r5536, r5557; +} +{ +cvt.rn.f16.f64 rs880, fd852; +} +mov.b32 r5565, {rs880, rs880}; +{ +mul.f16x2 r5563, r34, r5565; +} +{ +add.f16x2 r5566, r5542, r5563; +} +{ +cvt.rn.f16.f64 rs881, fd851; +} +mov.b32 r5571, {rs881, rs881}; +{ +mul.f16x2 r5569, r28, r5571; +} +{ +add.f16x2 r5572, r5548, r5569; +} +{ +cvt.rn.f16.f64 rs882, fd852; +} +mov.b32 r5577, {rs882, rs882}; +{ +mul.f16x2 r5575, r31, r5577; +} +{ +add.f16x2 r5578, r5554, r5575; +} +{ +cvt.rn.f16.f64 rs883, fd855; +} +mov.b32 r5583, {rs883, rs883}; +{ +mul.f16x2 r5581, r37, r5583; +} +{ +add.f16x2 
r5584, r5560, r5581; +} +{ +cvt.rn.f16.f64 rs884, fd856; +} +mov.b32 r5589, {rs884, rs884}; +{ +mul.f16x2 r5587, r46, r5589; +} +{ +add.f16x2 r5590, r5566, r5587; +} +{ +cvt.rn.f16.f64 rs885, fd855; +} +mov.b32 r5595, {rs885, rs885}; +{ +mul.f16x2 r5593, r40, r5595; +} +{ +add.f16x2 r5596, r5572, r5593; +} +{ +cvt.rn.f16.f64 rs886, fd856; +} +mov.b32 r5601, {rs886, rs886}; +{ +mul.f16x2 r5599, r43, r5601; +} +{ +add.f16x2 r5602, r5578, r5599; +} +{ +cvt.rn.f16.f64 rs887, fd859; +} +mov.b32 r5607, {rs887, rs887}; +{ +mul.f16x2 r5605, r49, r5607; +} +{ +add.f16x2 r5608, r5584, r5605; +} +{ +cvt.rn.f16.f64 rs888, fd860; +} +mov.b32 r5613, {rs888, rs888}; +{ +mul.f16x2 r5611, r58, r5613; +} +{ +add.f16x2 r5614, r5590, r5611; +} +{ +cvt.rn.f16.f64 rs889, fd859; +} +mov.b32 r5619, {rs889, rs889}; +{ +mul.f16x2 r5617, r52, r5619; +} +{ +add.f16x2 r5620, r5596, r5617; +} +{ +cvt.rn.f16.f64 rs890, fd860; +} +mov.b32 r5625, {rs890, rs890}; +{ +mul.f16x2 r5623, r55, r5625; +} +{ +add.f16x2 r5626, r5602, r5623; +} +{ +cvt.rn.f16.f64 rs891, fd863; +} +mov.b32 r5631, {rs891, rs891}; +{ +mul.f16x2 r5629, r61, r5631; +} +{ +add.f16x2 r5632, r5608, r5629; +} +{ +cvt.rn.f16.f64 rs892, fd864; +} +mov.b32 r5637, {rs892, rs892}; +{ +mul.f16x2 r5635, r70, r5637; +} +{ +add.f16x2 r5638, r5614, r5635; +} +{ +cvt.rn.f16.f64 rs893, fd863; +} +mov.b32 r5643, {rs893, rs893}; +{ +mul.f16x2 r5641, r64, r5643; +} +{ +add.f16x2 r5644, r5620, r5641; +} +{ +cvt.rn.f16.f64 rs894, fd864; +} +mov.b32 r5649, {rs894, rs894}; +{ +mul.f16x2 r5647, r67, r5649; +} +{ +add.f16x2 r5650, r5626, r5647; +} +{ +cvt.rn.f16.f64 rs895, fd867; +} +mov.b32 r5655, {rs895, rs895}; +{ +mul.f16x2 r5653, r73, r5655; +} +{ +add.f16x2 r5656, r5632, r5653; +} +{ +cvt.rn.f16.f64 rs896, fd868; +} +mov.b32 r5661, {rs896, rs896}; +{ +mul.f16x2 r5659, r82, r5661; +} +{ +add.f16x2 r5662, r5638, r5659; +} +{ +cvt.rn.f16.f64 rs897, fd867; +} +mov.b32 r5667, {rs897, rs897}; +{ +mul.f16x2 r5665, r76, r5667; +} +{ +add.f16x2 r5668, 
r5644, r5665; +} +{ +cvt.rn.f16.f64 rs898, fd868; +} +mov.b32 r5673, {rs898, rs898}; +{ +mul.f16x2 r5671, r79, r5673; +} +{ +add.f16x2 r5674, r5650, r5671; +} +{ +cvt.rn.f16.f64 rs899, fd871; +} +mov.b32 r5679, {rs899, rs899}; +{ +mul.f16x2 r5677, r85, r5679; +} +{ +add.f16x2 r5680, r5656, r5677; +} +{ +cvt.rn.f16.f64 rs900, fd872; +} +mov.b32 r5685, {rs900, rs900}; +{ +mul.f16x2 r5683, r94, r5685; +} +{ +add.f16x2 r5686, r5662, r5683; +} +{ +cvt.rn.f16.f64 rs901, fd871; +} +mov.b32 r5691, {rs901, rs901}; +{ +mul.f16x2 r5689, r88, r5691; +} +{ +add.f16x2 r5692, r5668, r5689; +} +{ +cvt.rn.f16.f64 rs902, fd872; +} +mov.b32 r5697, {rs902, rs902}; +{ +mul.f16x2 r5695, r91, r5697; +} +{ +add.f16x2 r5698, r5674, r5695; +} +{ +cvt.rn.f16.f64 rs903, fd875; +} +mov.b32 r5703, {rs903, rs903}; +{ +mul.f16x2 r5701, r97, r5703; +} +{ +add.f16x2 r5704, r5680, r5701; +} +{ +cvt.rn.f16.f64 rs904, fd876; +} +mov.b32 r5709, {rs904, rs904}; +{ +mul.f16x2 r5707, r106, r5709; +} +{ +add.f16x2 r5710, r5686, r5707; +} +{ +cvt.rn.f16.f64 rs905, fd875; +} +mov.b32 r5715, {rs905, rs905}; +{ +mul.f16x2 r5713, r100, r5715; +} +{ +add.f16x2 r5716, r5692, r5713; +} +{ +cvt.rn.f16.f64 rs906, fd876; +} +mov.b32 r5721, {rs906, rs906}; +{ +mul.f16x2 r5719, r103, r5721; +} +{ +add.f16x2 r5722, r5698, r5719; +} +{ +cvt.rn.f16.f64 rs907, fd879; +} +mov.b32 r5727, {rs907, rs907}; +{ +mul.f16x2 r5725, r109, r5727; +} +{ +add.f16x2 r5728, r5704, r5725; +} +{ +cvt.rn.f16.f64 rs908, fd880; +} +mov.b32 r5733, {rs908, rs908}; +{ +mul.f16x2 r5731, r118, r5733; +} +{ +add.f16x2 r5734, r5710, r5731; +} +{ +cvt.rn.f16.f64 rs909, fd879; +} +mov.b32 r5739, {rs909, rs909}; +{ +mul.f16x2 r5737, r112, r5739; +} +{ +add.f16x2 r5740, r5716, r5737; +} +{ +cvt.rn.f16.f64 rs910, fd880; +} +mov.b32 r5745, {rs910, rs910}; +{ +mul.f16x2 r5743, r115, r5745; +} +{ +add.f16x2 r5746, r5722, r5743; +} +{ +cvt.rn.f16.f64 rs911, fd883; +} +mov.b32 r5751, {rs911, rs911}; +{ +mul.f16x2 r5749, r121, r5751; +} +{ +add.f16x2 r5752, 
r5728, r5749; +} +{ +cvt.rn.f16.f64 rs912, fd884; +} +mov.b32 r5757, {rs912, rs912}; +{ +mul.f16x2 r5755, r130, r5757; +} +{ +add.f16x2 r5758, r5734, r5755; +} +{ +cvt.rn.f16.f64 rs913, fd883; +} +mov.b32 r5763, {rs913, rs913}; +{ +mul.f16x2 r5761, r124, r5763; +} +{ +add.f16x2 r5764, r5740, r5761; +} +{ +cvt.rn.f16.f64 rs914, fd884; +} +mov.b32 r5769, {rs914, rs914}; +{ +mul.f16x2 r5767, r127, r5769; +} +{ +add.f16x2 r5770, r5746, r5767; +} +{ +cvt.rn.f16.f64 rs915, fd887; +} +mov.b32 r5775, {rs915, rs915}; +{ +mul.f16x2 r5773, r133, r5775; +} +{ +add.f16x2 r5776, r5752, r5773; +} +{ +cvt.rn.f16.f64 rs916, fd888; +} +mov.b32 r5781, {rs916, rs916}; +{ +mul.f16x2 r5779, r142, r5781; +} +{ +add.f16x2 r5782, r5758, r5779; +} +{ +cvt.rn.f16.f64 rs917, fd887; +} +mov.b32 r5787, {rs917, rs917}; +{ +mul.f16x2 r5785, r136, r5787; +} +{ +add.f16x2 r5788, r5764, r5785; +} +{ +cvt.rn.f16.f64 rs918, fd888; +} +mov.b32 r5793, {rs918, rs918}; +{ +mul.f16x2 r5791, r139, r5793; +} +{ +add.f16x2 r5794, r5770, r5791; +} +{ +cvt.rn.f16.f64 rs919, fd891; +} +mov.b32 r5799, {rs919, rs919}; +{ +mul.f16x2 r5797, r145, r5799; +} +{ +add.f16x2 r5800, r5776, r5797; +} +{ +cvt.rn.f16.f64 rs920, fd892; +} +mov.b32 r5805, {rs920, rs920}; +{ +mul.f16x2 r5803, r154, r5805; +} +{ +add.f16x2 r5806, r5782, r5803; +} +{ +cvt.rn.f16.f64 rs921, fd891; +} +mov.b32 r5811, {rs921, rs921}; +{ +mul.f16x2 r5809, r148, r5811; +} +{ +add.f16x2 r5812, r5788, r5809; +} +{ +cvt.rn.f16.f64 rs922, fd892; +} +mov.b32 r5817, {rs922, rs922}; +{ +mul.f16x2 r5815, r151, r5817; +} +{ +add.f16x2 r5818, r5794, r5815; +} +{ +cvt.rn.f16.f64 rs923, fd895; +} +mov.b32 r5823, {rs923, rs923}; +{ +mul.f16x2 r5821, r157, r5823; +} +{ +add.f16x2 r5824, r5800, r5821; +} +{ +cvt.rn.f16.f64 rs924, fd896; +} +mov.b32 r5829, {rs924, rs924}; +{ +mul.f16x2 r5827, r166, r5829; +} +{ +add.f16x2 r5830, r5806, r5827; +} +{ +cvt.rn.f16.f64 rs925, fd895; +} +mov.b32 r5835, {rs925, rs925}; +{ +mul.f16x2 r5833, r160, r5835; +} +{ +add.f16x2 
r5836, r5812, r5833; +} +{ +cvt.rn.f16.f64 rs926, fd896; +} +mov.b32 r5841, {rs926, rs926}; +{ +mul.f16x2 r5839, r163, r5841; +} +{ +add.f16x2 r5842, r5818, r5839; +} +{ +cvt.rn.f16.f64 rs927, fd899; +} +mov.b32 r5847, {rs927, rs927}; +{ +mul.f16x2 r5845, r169, r5847; +} +{ +add.f16x2 r5848, r5824, r5845; +} +{ +cvt.rn.f16.f64 rs928, fd900; +} +mov.b32 r5853, {rs928, rs928}; +{ +mul.f16x2 r5851, r178, r5853; +} +{ +add.f16x2 r5854, r5830, r5851; +} +{ +cvt.rn.f16.f64 rs929, fd899; +} +mov.b32 r5859, {rs929, rs929}; +{ +mul.f16x2 r5857, r172, r5859; +} +{ +add.f16x2 r5860, r5836, r5857; +} +{ +cvt.rn.f16.f64 rs930, fd900; +} +mov.b32 r5865, {rs930, rs930}; +{ +mul.f16x2 r5863, r175, r5865; +} +{ +add.f16x2 r5866, r5842, r5863; +} +{ +sub.f16x2 %30, r5848, r5854; +} +{ +add.f16x2 %31, r5860, r5866; +} +{ +add.f16x2 %32, r5848, r5854; +} +{ +sub.f16x2 %33, r5860, r5866; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), 
"=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), 
"r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[30].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..321002d332a72 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp16_inv.hpp.inc @@ -0,0 +1,9574 @@ +#ifndef CUFFTDX_FFT_31_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_31_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<963, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<931>; +.reg .b32 r<5881>; +.reg .f64 fd<901>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %64, %122; +} +{ +add.f16x2 r4, %65, %123; +} +{ +sub.f16x2 r7, %64, %122; +} +{ +sub.f16x2 r10, %65, %123; +} +{ +add.f16x2 r13, %66, %120; +} +{ +add.f16x2 r16, %67, %121; +} +{ +sub.f16x2 r19, %66, %120; +} +{ +sub.f16x2 r22, %67, %121; +} +{ 
+add.f16x2 r25, %68, %118; +} +{ +add.f16x2 r28, %69, %119; +} +{ +sub.f16x2 r31, %68, %118; +} +{ +sub.f16x2 r34, %69, %119; +} +{ +add.f16x2 r37, %70, %116; +} +{ +add.f16x2 r40, %71, %117; +} +{ +sub.f16x2 r43, %70, %116; +} +{ +sub.f16x2 r46, %71, %117; +} +{ +add.f16x2 r49, %72, %114; +} +{ +add.f16x2 r52, %73, %115; +} +{ +sub.f16x2 r55, %72, %114; +} +{ +sub.f16x2 r58, %73, %115; +} +{ +add.f16x2 r61, %74, %112; +} +{ +add.f16x2 r64, %75, %113; +} +{ +sub.f16x2 r67, %74, %112; +} +{ +sub.f16x2 r70, %75, %113; +} +{ +add.f16x2 r73, %76, %110; +} +{ +add.f16x2 r76, %77, %111; +} +{ +sub.f16x2 r79, %76, %110; +} +{ +sub.f16x2 r82, %77, %111; +} +{ +add.f16x2 r85, %78, %108; +} +{ +add.f16x2 r88, %79, %109; +} +{ +sub.f16x2 r91, %78, %108; +} +{ +sub.f16x2 r94, %79, %109; +} +{ +add.f16x2 r97, %80, %106; +} +{ +add.f16x2 r100, %81, %107; +} +{ +sub.f16x2 r103, %80, %106; +} +{ +sub.f16x2 r106, %81, %107; +} +{ +add.f16x2 r109, %82, %104; +} +{ +add.f16x2 r112, %83, %105; +} +{ +sub.f16x2 r115, %82, %104; +} +{ +sub.f16x2 r118, %83, %105; +} +{ +add.f16x2 r121, %84, %102; +} +{ +add.f16x2 r124, %85, %103; +} +{ +sub.f16x2 r127, %84, %102; +} +{ +sub.f16x2 r130, %85, %103; +} +{ +add.f16x2 r133, %86, %100; +} +{ +add.f16x2 r136, %87, %101; +} +{ +sub.f16x2 r139, %86, %100; +} +{ +sub.f16x2 r142, %87, %101; +} +{ +add.f16x2 r145, %88, %98; +} +{ +add.f16x2 r148, %89, %99; +} +{ +sub.f16x2 r151, %88, %98; +} +{ +sub.f16x2 r154, %89, %99; +} +{ +add.f16x2 r157, %90, %96; +} +{ +add.f16x2 r160, %91, %97; +} +{ +sub.f16x2 r163, %90, %96; +} +{ +sub.f16x2 r166, %91, %97; +} +{ +add.f16x2 r169, %92, %94; +} +{ +add.f16x2 r172, %93, %95; +} +{ +sub.f16x2 r175, %92, %94; +} +{ +sub.f16x2 r178, %93, %95; +} +{ +add.f16x2 r181, %62, r1; +} +{ +add.f16x2 r184, %63, r4; +} +{ +add.f16x2 r187, r181, r13; +} +{ +add.f16x2 r190, r184, r16; +} +{ +add.f16x2 r193, r187, r25; +} +{ +add.f16x2 r196, r190, r28; +} +{ +add.f16x2 r199, r193, r37; +} +{ +add.f16x2 r202, r196, r40; +} +{ 
+add.f16x2 r205, r199, r49; +} +{ +add.f16x2 r208, r202, r52; +} +{ +add.f16x2 r211, r205, r61; +} +{ +add.f16x2 r214, r208, r64; +} +{ +add.f16x2 r217, r211, r73; +} +{ +add.f16x2 r220, r214, r76; +} +{ +add.f16x2 r223, r217, r85; +} +{ +add.f16x2 r226, r220, r88; +} +{ +add.f16x2 r229, r223, r97; +} +{ +add.f16x2 r232, r226, r100; +} +{ +add.f16x2 r235, r229, r109; +} +{ +add.f16x2 r238, r232, r112; +} +{ +add.f16x2 r241, r235, r121; +} +{ +add.f16x2 r244, r238, r124; +} +{ +add.f16x2 r247, r241, r133; +} +{ +add.f16x2 r250, r244, r136; +} +{ +add.f16x2 r253, r247, r145; +} +{ +add.f16x2 r256, r250, r148; +} +{ +add.f16x2 r259, r253, r157; +} +{ +add.f16x2 r262, r256, r160; +} +{ +add.f16x2 %0, r259, r169; +} +{ +add.f16x2 %1, r262, r172; +} +mov.u32 r5508, 0; +cvt.rn.f16.s32 rs1, r5508; +mov.b32 r283, {rs1, rs1}; +cvt.rn.f16.s32 rs2, r5508; +mov.b32 r295, {rs2, rs2}; +mov.f64 fd847, 0d3FEF584F2CE43B84; +{ +cvt.rn.f16.f64 rs3, fd847; +} +mov.b32 r275, {rs3, rs3}; +{ +mul.f16x2 r273, r1, r275; +} +{ +add.f16x2 r276, %62, r273; +} +mov.f64 fd768, 0d3FC9C4266041CA8F; +{ +cvt.rn.f16.f64 rs4, fd768; +} +mov.b32 r281, {rs4, rs4}; +{ +mul.f16x2 r279, r10, r281; +} +{ +add.f16x2 r282, r283, r279; +} +{ +cvt.rn.f16.f64 rs5, fd847; +} +mov.b32 r287, {rs5, rs5}; +{ +mul.f16x2 r285, r4, r287; +} +{ +add.f16x2 r288, %63, r285; +} +{ +cvt.rn.f16.f64 rs6, fd768; +} +mov.b32 r293, {rs6, rs6}; +{ +mul.f16x2 r291, r7, r293; +} +{ +add.f16x2 r294, r295, r291; +} +mov.f64 fd855, 0d3FED681A366A00FA; +{ +cvt.rn.f16.f64 rs7, fd855; +} +mov.b32 r299, {rs7, rs7}; +{ +mul.f16x2 r297, r13, r299; +} +{ +add.f16x2 r300, r276, r297; +} +mov.f64 fd816, 0d3FD93D20572CA90B; +{ +cvt.rn.f16.f64 rs8, fd816; +} +mov.b32 r305, {rs8, rs8}; +{ +mul.f16x2 r303, r22, r305; +} +{ +add.f16x2 r306, r282, r303; +} +{ +cvt.rn.f16.f64 rs9, fd855; +} +mov.b32 r311, {rs9, rs9}; +{ +mul.f16x2 r309, r16, r311; +} +{ +add.f16x2 r312, r288, r309; +} +{ +cvt.rn.f16.f64 rs10, fd816; +} +mov.b32 r317, {rs10, rs10}; +{ 
+mul.f16x2 r315, r19, r317; +} +{ +add.f16x2 r318, r294, r315; +} +mov.f64 fd863, 0d3FEA43B1B1379AFF; +{ +cvt.rn.f16.f64 rs11, fd863; +} +mov.b32 r323, {rs11, rs11}; +{ +mul.f16x2 r321, r25, r323; +} +{ +add.f16x2 r324, r300, r321; +} +mov.f64 fd740, 0d3FE247D447A27216; +{ +cvt.rn.f16.f64 rs12, fd740; +} +mov.b32 r329, {rs12, rs12}; +{ +mul.f16x2 r327, r34, r329; +} +{ +add.f16x2 r330, r306, r327; +} +{ +cvt.rn.f16.f64 rs13, fd863; +} +mov.b32 r335, {rs13, rs13}; +{ +mul.f16x2 r333, r28, r335; +} +{ +add.f16x2 r336, r312, r333; +} +{ +cvt.rn.f16.f64 rs14, fd740; +} +mov.b32 r341, {rs14, rs14}; +{ +mul.f16x2 r339, r31, r341; +} +{ +add.f16x2 r342, r318, r339; +} +mov.f64 fd871, 0d3FE60C045A2E9729; +{ +cvt.rn.f16.f64 rs15, fd871; +} +mov.b32 r347, {rs15, rs15}; +{ +mul.f16x2 r345, r37, r347; +} +{ +add.f16x2 r348, r324, r345; +} +mov.f64 fd624, 0d3FE73180A4B0D300; +{ +cvt.rn.f16.f64 rs16, fd624; +} +mov.b32 r353, {rs16, rs16}; +{ +mul.f16x2 r351, r46, r353; +} +{ +add.f16x2 r354, r330, r351; +} +{ +cvt.rn.f16.f64 rs17, fd871; +} +mov.b32 r359, {rs17, rs17}; +{ +mul.f16x2 r357, r40, r359; +} +{ +add.f16x2 r360, r336, r357; +} +{ +cvt.rn.f16.f64 rs18, fd624; +} +mov.b32 r365, {rs18, rs18}; +{ +mul.f16x2 r363, r43, r365; +} +{ +add.f16x2 r366, r342, r363; +} +mov.f64 fd879, 0d3FE0ED45EEA3B09F; +{ +cvt.rn.f16.f64 rs19, fd879; +} +mov.b32 r371, {rs19, rs19}; +{ +mul.f16x2 r369, r49, r371; +} +{ +add.f16x2 r372, r348, r369; +} +mov.f64 fd808, 0d3FEB2818007C19DF; +{ +cvt.rn.f16.f64 rs20, fd808; +} +mov.b32 r377, {rs20, rs20}; +{ +mul.f16x2 r375, r58, r377; +} +{ +add.f16x2 r378, r354, r375; +} +{ +cvt.rn.f16.f64 rs21, fd879; +} +mov.b32 r383, {rs21, rs21}; +{ +mul.f16x2 r381, r52, r383; +} +{ +add.f16x2 r384, r360, r381; +} +{ +cvt.rn.f16.f64 rs22, fd808; +} +mov.b32 r389, {rs22, rs22}; +{ +mul.f16x2 r387, r55, r389; +} +{ +add.f16x2 r390, r366, r387; +} +mov.f64 fd887, 0d3FD63A3FCFACA412; +{ +cvt.rn.f16.f64 rs23, fd887; +} +mov.b32 r395, {rs23, rs23}; +{ +mul.f16x2 r393, 
r61, r395; +} +{ +add.f16x2 r396, r372, r393; +} +mov.f64 fd760, 0d3FEE0210C26A6E6F; +{ +cvt.rn.f16.f64 rs24, fd760; +} +mov.b32 r401, {rs24, rs24}; +{ +mul.f16x2 r399, r70, r401; +} +{ +add.f16x2 r402, r378, r399; +} +{ +cvt.rn.f16.f64 rs25, fd887; +} +mov.b32 r407, {rs25, rs25}; +{ +mul.f16x2 r405, r64, r407; +} +{ +add.f16x2 r408, r384, r405; +} +{ +cvt.rn.f16.f64 rs26, fd760; +} +mov.b32 r413, {rs26, rs26}; +{ +mul.f16x2 r411, r67, r413; +} +{ +add.f16x2 r414, r390, r411; +} +mov.f64 fd895, 0d3FC361FC440B478F; +{ +cvt.rn.f16.f64 rs27, fd895; +} +mov.b32 r419, {rs27, rs27}; +{ +mul.f16x2 r417, r73, r419; +} +{ +add.f16x2 r420, r396, r417; +} +mov.f64 fd580, 0d3FEFA18852C3E08A; +{ +cvt.rn.f16.f64 rs28, fd580; +} +mov.b32 r425, {rs28, rs28}; +{ +mul.f16x2 r423, r82, r425; +} +{ +add.f16x2 r426, r402, r423; +} +{ +cvt.rn.f16.f64 rs29, fd895; +} +mov.b32 r431, {rs29, rs29}; +{ +mul.f16x2 r429, r76, r431; +} +{ +add.f16x2 r432, r408, r429; +} +{ +cvt.rn.f16.f64 rs30, fd580; +} +mov.b32 r437, {rs30, rs30}; +{ +mul.f16x2 r435, r79, r437; +} +{ +add.f16x2 r438, r414, r435; +} +mov.f64 fd899, 0dBFA9EEB01776B57D; +{ +cvt.rn.f16.f64 rs31, fd899; +} +mov.b32 r443, {rs31, rs31}; +{ +mul.f16x2 r441, r85, r443; +} +{ +add.f16x2 r444, r420, r441; +} +mov.f64 fd900, 0d3FEFF57C5208CCF9; +{ +cvt.rn.f16.f64 rs32, fd900; +} +mov.b32 r449, {rs32, rs32}; +{ +mul.f16x2 r447, r94, r449; +} +{ +add.f16x2 r450, r426, r447; +} +{ +cvt.rn.f16.f64 rs33, fd899; +} +mov.b32 r455, {rs33, rs33}; +{ +mul.f16x2 r453, r88, r455; +} +{ +add.f16x2 r456, r432, r453; +} +{ +cvt.rn.f16.f64 rs34, fd900; +} +mov.b32 r461, {rs34, rs34}; +{ +mul.f16x2 r459, r91, r461; +} +{ +add.f16x2 r462, r438, r459; +} +mov.f64 fd891, 0dBFD00AB0EB2D7D94; +{ +cvt.rn.f16.f64 rs35, fd891; +} +mov.b32 r467, {rs35, rs35}; +{ +mul.f16x2 r465, r97, r467; +} +{ +add.f16x2 r468, r444, r465; +} +mov.f64 fd892, 0d3FEEFA7CDDB128FA; +{ +cvt.rn.f16.f64 rs36, fd892; +} +mov.b32 r473, {rs36, rs36}; +{ +mul.f16x2 r471, r106, r473; +} +{ 
+add.f16x2 r474, r450, r471; +} +{ +cvt.rn.f16.f64 rs37, fd891; +} +mov.b32 r479, {rs37, rs37}; +{ +mul.f16x2 r477, r100, r479; +} +{ +add.f16x2 r480, r456, r477; +} +{ +cvt.rn.f16.f64 rs38, fd892; +} +mov.b32 r485, {rs38, rs38}; +{ +mul.f16x2 r483, r103, r485; +} +{ +add.f16x2 r486, r462, r483; +} +mov.f64 fd883, 0dBFDC2F6AF3928A8E; +{ +cvt.rn.f16.f64 rs39, fd883; +} +mov.b32 r491, {rs39, rs39}; +{ +mul.f16x2 r489, r109, r491; +} +{ +add.f16x2 r492, r468, r489; +} +mov.f64 fd884, 0d3FECBAD095F50378; +{ +cvt.rn.f16.f64 rs40, fd884; +} +mov.b32 r497, {rs40, rs40}; +{ +mul.f16x2 r495, r118, r497; +} +{ +add.f16x2 r498, r474, r495; +} +{ +cvt.rn.f16.f64 rs41, fd883; +} +mov.b32 r503, {rs41, rs41}; +{ +mul.f16x2 r501, r112, r503; +} +{ +add.f16x2 r504, r480, r501; +} +{ +cvt.rn.f16.f64 rs42, fd884; +} +mov.b32 r509, {rs42, rs42}; +{ +mul.f16x2 r507, r115, r509; +} +{ +add.f16x2 r510, r486, r507; +} +mov.f64 fd875, 0dBFE3965F49174D13; +{ +cvt.rn.f16.f64 rs43, fd875; +} +mov.b32 r515, {rs43, rs43}; +{ +mul.f16x2 r513, r121, r515; +} +{ +add.f16x2 r516, r492, r513; +} +mov.f64 fd876, 0d3FE94E08EB13C451; +{ +cvt.rn.f16.f64 rs44, fd876; +} +mov.b32 r521, {rs44, rs44}; +{ +mul.f16x2 r519, r130, r521; +} +{ +add.f16x2 r522, r498, r519; +} +{ +cvt.rn.f16.f64 rs45, fd875; +} +mov.b32 r527, {rs45, rs45}; +{ +mul.f16x2 r525, r124, r527; +} +{ +add.f16x2 r528, r504, r525; +} +{ +cvt.rn.f16.f64 rs46, fd876; +} +mov.b32 r533, {rs46, rs46}; +{ +mul.f16x2 r531, r127, r533; +} +{ +add.f16x2 r534, r510, r531; +} +mov.f64 fd867, 0dBFE847BF1D5146CC; +{ +cvt.rn.f16.f64 rs47, fd867; +} +mov.b32 r539, {rs47, rs47}; +{ +mul.f16x2 r537, r133, r539; +} +{ +add.f16x2 r540, r516, r537; +} +mov.f64 fd868, 0d3FE4D80B1AD9CCF6; +{ +cvt.rn.f16.f64 rs48, fd868; +} +mov.b32 r545, {rs48, rs48}; +{ +mul.f16x2 r543, r142, r545; +} +{ +add.f16x2 r546, r522, r543; +} +{ +cvt.rn.f16.f64 rs49, fd867; +} +mov.b32 r551, {rs49, rs49}; +{ +mul.f16x2 r549, r136, r551; +} +{ +add.f16x2 r552, r528, r549; +} +{ 
+cvt.rn.f16.f64 rs50, fd868; +} +mov.b32 r557, {rs50, rs50}; +{ +mul.f16x2 r555, r139, r557; +} +{ +add.f16x2 r558, r534, r555; +} +mov.f64 fd859, 0dBFEBFAA5C136B224; +{ +cvt.rn.f16.f64 rs51, fd859; +} +mov.b32 r563, {rs51, rs51}; +{ +mul.f16x2 r561, r145, r563; +} +{ +add.f16x2 r564, r540, r561; +} +mov.f64 fd860, 0d3FDF0F2FF6705BEC; +{ +cvt.rn.f16.f64 rs52, fd860; +} +mov.b32 r569, {rs52, rs52}; +{ +mul.f16x2 r567, r154, r569; +} +{ +add.f16x2 r570, r546, r567; +} +{ +cvt.rn.f16.f64 rs53, fd859; +} +mov.b32 r575, {rs53, rs53}; +{ +mul.f16x2 r573, r148, r575; +} +{ +add.f16x2 r576, r552, r573; +} +{ +cvt.rn.f16.f64 rs54, fd860; +} +mov.b32 r581, {rs54, rs54}; +{ +mul.f16x2 r579, r151, r581; +} +{ +add.f16x2 r582, r558, r579; +} +mov.f64 fd851, 0dBFEE884F0CC22CCC; +{ +cvt.rn.f16.f64 rs55, fd851; +} +mov.b32 r587, {rs55, rs55}; +{ +mul.f16x2 r585, r157, r587; +} +{ +add.f16x2 r588, r564, r585; +} +mov.f64 fd852, 0d3FD328C3F1B322CB; +{ +cvt.rn.f16.f64 rs56, fd852; +} +mov.b32 r593, {rs56, rs56}; +{ +mul.f16x2 r591, r166, r593; +} +{ +add.f16x2 r594, r570, r591; +} +{ +cvt.rn.f16.f64 rs57, fd851; +} +mov.b32 r599, {rs57, rs57}; +{ +mul.f16x2 r597, r160, r599; +} +{ +add.f16x2 r600, r576, r597; +} +{ +cvt.rn.f16.f64 rs58, fd852; +} +mov.b32 r605, {rs58, rs58}; +{ +mul.f16x2 r603, r163, r605; +} +{ +add.f16x2 r606, r582, r603; +} +mov.f64 fd843, 0dBFEFD5F830F860F9; +{ +cvt.rn.f16.f64 rs59, fd843; +} +mov.b32 r611, {rs59, rs59}; +{ +mul.f16x2 r609, r169, r611; +} +{ +add.f16x2 r612, r588, r609; +} +mov.f64 fd844, 0d3FB9E62ACA53C49F; +{ +cvt.rn.f16.f64 rs60, fd844; +} +mov.b32 r617, {rs60, rs60}; +{ +mul.f16x2 r615, r178, r617; +} +{ +add.f16x2 r618, r594, r615; +} +{ +cvt.rn.f16.f64 rs61, fd843; +} +mov.b32 r623, {rs61, rs61}; +{ +mul.f16x2 r621, r172, r623; +} +{ +add.f16x2 r624, r600, r621; +} +{ +cvt.rn.f16.f64 rs62, fd844; +} +mov.b32 r629, {rs62, rs62}; +{ +mul.f16x2 r627, r175, r629; +} +{ +add.f16x2 r630, r606, r627; +} +{ +sub.f16x2 %2, r612, r618; +} +{ 
+add.f16x2 %3, r624, r630; +} +{ +add.f16x2 %60, r612, r618; +} +{ +sub.f16x2 %61, r624, r630; +} +cvt.rn.f16.s32 rs63, r5508; +mov.b32 r657, {rs63, rs63}; +cvt.rn.f16.s32 rs64, r5508; +mov.b32 r669, {rs64, rs64}; +{ +cvt.rn.f16.f64 rs65, fd855; +} +mov.b32 r649, {rs65, rs65}; +{ +mul.f16x2 r647, r1, r649; +} +{ +add.f16x2 r650, %62, r647; +} +{ +cvt.rn.f16.f64 rs66, fd816; +} +mov.b32 r655, {rs66, rs66}; +{ +mul.f16x2 r653, r10, r655; +} +{ +add.f16x2 r656, r657, r653; +} +{ +cvt.rn.f16.f64 rs67, fd855; +} +mov.b32 r661, {rs67, rs67}; +{ +mul.f16x2 r659, r4, r661; +} +{ +add.f16x2 r662, %63, r659; +} +{ +cvt.rn.f16.f64 rs68, fd816; +} +mov.b32 r667, {rs68, rs68}; +{ +mul.f16x2 r665, r7, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +cvt.rn.f16.f64 rs69, fd871; +} +mov.b32 r673, {rs69, rs69}; +{ +mul.f16x2 r671, r13, r673; +} +{ +add.f16x2 r674, r650, r671; +} +{ +cvt.rn.f16.f64 rs70, fd624; +} +mov.b32 r679, {rs70, rs70}; +{ +mul.f16x2 r677, r22, r679; +} +{ +add.f16x2 r680, r656, r677; +} +{ +cvt.rn.f16.f64 rs71, fd871; +} +mov.b32 r685, {rs71, rs71}; +{ +mul.f16x2 r683, r16, r685; +} +{ +add.f16x2 r686, r662, r683; +} +{ +cvt.rn.f16.f64 rs72, fd624; +} +mov.b32 r691, {rs72, rs72}; +{ +mul.f16x2 r689, r19, r691; +} +{ +add.f16x2 r692, r668, r689; +} +{ +cvt.rn.f16.f64 rs73, fd887; +} +mov.b32 r697, {rs73, rs73}; +{ +mul.f16x2 r695, r25, r697; +} +{ +add.f16x2 r698, r674, r695; +} +{ +cvt.rn.f16.f64 rs74, fd760; +} +mov.b32 r703, {rs74, rs74}; +{ +mul.f16x2 r701, r34, r703; +} +{ +add.f16x2 r704, r680, r701; +} +{ +cvt.rn.f16.f64 rs75, fd887; +} +mov.b32 r709, {rs75, rs75}; +{ +mul.f16x2 r707, r28, r709; +} +{ +add.f16x2 r710, r686, r707; +} +{ +cvt.rn.f16.f64 rs76, fd760; +} +mov.b32 r715, {rs76, rs76}; +{ +mul.f16x2 r713, r31, r715; +} +{ +add.f16x2 r716, r692, r713; +} +{ +cvt.rn.f16.f64 rs77, fd899; +} +mov.b32 r721, {rs77, rs77}; +{ +mul.f16x2 r719, r37, r721; +} +{ +add.f16x2 r722, r698, r719; +} +{ +cvt.rn.f16.f64 rs78, fd900; +} +mov.b32 r727, {rs78, 
rs78}; +{ +mul.f16x2 r725, r46, r727; +} +{ +add.f16x2 r728, r704, r725; +} +{ +cvt.rn.f16.f64 rs79, fd899; +} +mov.b32 r733, {rs79, rs79}; +{ +mul.f16x2 r731, r40, r733; +} +{ +add.f16x2 r734, r710, r731; +} +{ +cvt.rn.f16.f64 rs80, fd900; +} +mov.b32 r739, {rs80, rs80}; +{ +mul.f16x2 r737, r43, r739; +} +{ +add.f16x2 r740, r716, r737; +} +{ +cvt.rn.f16.f64 rs81, fd883; +} +mov.b32 r745, {rs81, rs81}; +{ +mul.f16x2 r743, r49, r745; +} +{ +add.f16x2 r746, r722, r743; +} +{ +cvt.rn.f16.f64 rs82, fd884; +} +mov.b32 r751, {rs82, rs82}; +{ +mul.f16x2 r749, r58, r751; +} +{ +add.f16x2 r752, r728, r749; +} +{ +cvt.rn.f16.f64 rs83, fd883; +} +mov.b32 r757, {rs83, rs83}; +{ +mul.f16x2 r755, r52, r757; +} +{ +add.f16x2 r758, r734, r755; +} +{ +cvt.rn.f16.f64 rs84, fd884; +} +mov.b32 r763, {rs84, rs84}; +{ +mul.f16x2 r761, r55, r763; +} +{ +add.f16x2 r764, r740, r761; +} +{ +cvt.rn.f16.f64 rs85, fd867; +} +mov.b32 r769, {rs85, rs85}; +{ +mul.f16x2 r767, r61, r769; +} +{ +add.f16x2 r770, r746, r767; +} +{ +cvt.rn.f16.f64 rs86, fd868; +} +mov.b32 r775, {rs86, rs86}; +{ +mul.f16x2 r773, r70, r775; +} +{ +add.f16x2 r776, r752, r773; +} +{ +cvt.rn.f16.f64 rs87, fd867; +} +mov.b32 r781, {rs87, rs87}; +{ +mul.f16x2 r779, r64, r781; +} +{ +add.f16x2 r782, r758, r779; +} +{ +cvt.rn.f16.f64 rs88, fd868; +} +mov.b32 r787, {rs88, rs88}; +{ +mul.f16x2 r785, r67, r787; +} +{ +add.f16x2 r788, r764, r785; +} +{ +cvt.rn.f16.f64 rs89, fd851; +} +mov.b32 r793, {rs89, rs89}; +{ +mul.f16x2 r791, r73, r793; +} +{ +add.f16x2 r794, r770, r791; +} +{ +cvt.rn.f16.f64 rs90, fd852; +} +mov.b32 r799, {rs90, rs90}; +{ +mul.f16x2 r797, r82, r799; +} +{ +add.f16x2 r800, r776, r797; +} +{ +cvt.rn.f16.f64 rs91, fd851; +} +mov.b32 r805, {rs91, rs91}; +{ +mul.f16x2 r803, r76, r805; +} +{ +add.f16x2 r806, r782, r803; +} +{ +cvt.rn.f16.f64 rs92, fd852; +} +mov.b32 r811, {rs92, rs92}; +{ +mul.f16x2 r809, r79, r811; +} +{ +add.f16x2 r812, r788, r809; +} +{ +cvt.rn.f16.f64 rs93, fd843; +} +mov.b32 r817, {rs93, 
rs93}; +{ +mul.f16x2 r815, r85, r817; +} +{ +add.f16x2 r818, r794, r815; +} +mov.f64 fd820, 0dBFB9E62ACA53C49F; +{ +cvt.rn.f16.f64 rs94, fd820; +} +mov.b32 r823, {rs94, rs94}; +{ +mul.f16x2 r821, r94, r823; +} +{ +add.f16x2 r824, r800, r821; +} +{ +cvt.rn.f16.f64 rs95, fd843; +} +mov.b32 r829, {rs95, rs95}; +{ +mul.f16x2 r827, r88, r829; +} +{ +add.f16x2 r830, r806, r827; +} +{ +cvt.rn.f16.f64 rs96, fd820; +} +mov.b32 r835, {rs96, rs96}; +{ +mul.f16x2 r833, r91, r835; +} +{ +add.f16x2 r836, r812, r833; +} +{ +cvt.rn.f16.f64 rs97, fd859; +} +mov.b32 r841, {rs97, rs97}; +{ +mul.f16x2 r839, r97, r841; +} +{ +add.f16x2 r842, r818, r839; +} +mov.f64 fd572, 0dBFDF0F2FF6705BEC; +{ +cvt.rn.f16.f64 rs98, fd572; +} +mov.b32 r847, {rs98, rs98}; +{ +mul.f16x2 r845, r106, r847; +} +{ +add.f16x2 r848, r824, r845; +} +{ +cvt.rn.f16.f64 rs99, fd859; +} +mov.b32 r853, {rs99, rs99}; +{ +mul.f16x2 r851, r100, r853; +} +{ +add.f16x2 r854, r830, r851; +} +{ +cvt.rn.f16.f64 rs100, fd572; +} +mov.b32 r859, {rs100, rs100}; +{ +mul.f16x2 r857, r103, r859; +} +{ +add.f16x2 r860, r836, r857; +} +{ +cvt.rn.f16.f64 rs101, fd875; +} +mov.b32 r865, {rs101, rs101}; +{ +mul.f16x2 r863, r109, r865; +} +{ +add.f16x2 r866, r842, r863; +} +mov.f64 fd708, 0dBFE94E08EB13C451; +{ +cvt.rn.f16.f64 rs102, fd708; +} +mov.b32 r871, {rs102, rs102}; +{ +mul.f16x2 r869, r118, r871; +} +{ +add.f16x2 r872, r848, r869; +} +{ +cvt.rn.f16.f64 rs103, fd875; +} +mov.b32 r877, {rs103, rs103}; +{ +mul.f16x2 r875, r112, r877; +} +{ +add.f16x2 r878, r854, r875; +} +{ +cvt.rn.f16.f64 rs104, fd708; +} +mov.b32 r883, {rs104, rs104}; +{ +mul.f16x2 r881, r115, r883; +} +{ +add.f16x2 r884, r860, r881; +} +{ +cvt.rn.f16.f64 rs105, fd891; +} +mov.b32 r889, {rs105, rs105}; +{ +mul.f16x2 r887, r121, r889; +} +{ +add.f16x2 r890, r866, r887; +} +mov.f64 fd804, 0dBFEEFA7CDDB128FA; +{ +cvt.rn.f16.f64 rs106, fd804; +} +mov.b32 r895, {rs106, rs106}; +{ +mul.f16x2 r893, r130, r895; +} +{ +add.f16x2 r896, r872, r893; +} +{ +cvt.rn.f16.f64 
rs107, fd891; +} +mov.b32 r901, {rs107, rs107}; +{ +mul.f16x2 r899, r124, r901; +} +{ +add.f16x2 r902, r878, r899; +} +{ +cvt.rn.f16.f64 rs108, fd804; +} +mov.b32 r907, {rs108, rs108}; +{ +mul.f16x2 r905, r127, r907; +} +{ +add.f16x2 r908, r884, r905; +} +{ +cvt.rn.f16.f64 rs109, fd895; +} +mov.b32 r913, {rs109, rs109}; +{ +mul.f16x2 r911, r133, r913; +} +{ +add.f16x2 r914, r890, r911; +} +mov.f64 fd896, 0dBFEFA18852C3E08A; +{ +cvt.rn.f16.f64 rs110, fd896; +} +mov.b32 r919, {rs110, rs110}; +{ +mul.f16x2 r917, r142, r919; +} +{ +add.f16x2 r920, r896, r917; +} +{ +cvt.rn.f16.f64 rs111, fd895; +} +mov.b32 r925, {rs111, rs111}; +{ +mul.f16x2 r923, r136, r925; +} +{ +add.f16x2 r926, r902, r923; +} +{ +cvt.rn.f16.f64 rs112, fd896; +} +mov.b32 r931, {rs112, rs112}; +{ +mul.f16x2 r929, r139, r931; +} +{ +add.f16x2 r932, r908, r929; +} +{ +cvt.rn.f16.f64 rs113, fd879; +} +mov.b32 r937, {rs113, rs113}; +{ +mul.f16x2 r935, r145, r937; +} +{ +add.f16x2 r938, r914, r935; +} +mov.f64 fd880, 0dBFEB2818007C19DF; +{ +cvt.rn.f16.f64 rs114, fd880; +} +mov.b32 r943, {rs114, rs114}; +{ +mul.f16x2 r941, r154, r943; +} +{ +add.f16x2 r944, r920, r941; +} +{ +cvt.rn.f16.f64 rs115, fd879; +} +mov.b32 r949, {rs115, rs115}; +{ +mul.f16x2 r947, r148, r949; +} +{ +add.f16x2 r950, r926, r947; +} +{ +cvt.rn.f16.f64 rs116, fd880; +} +mov.b32 r955, {rs116, rs116}; +{ +mul.f16x2 r953, r151, r955; +} +{ +add.f16x2 r956, r932, r953; +} +{ +cvt.rn.f16.f64 rs117, fd863; +} +mov.b32 r961, {rs117, rs117}; +{ +mul.f16x2 r959, r157, r961; +} +{ +add.f16x2 r962, r938, r959; +} +mov.f64 fd864, 0dBFE247D447A27216; +{ +cvt.rn.f16.f64 rs118, fd864; +} +mov.b32 r967, {rs118, rs118}; +{ +mul.f16x2 r965, r166, r967; +} +{ +add.f16x2 r968, r944, r965; +} +{ +cvt.rn.f16.f64 rs119, fd863; +} +mov.b32 r973, {rs119, rs119}; +{ +mul.f16x2 r971, r160, r973; +} +{ +add.f16x2 r974, r950, r971; +} +{ +cvt.rn.f16.f64 rs120, fd864; +} +mov.b32 r979, {rs120, rs120}; +{ +mul.f16x2 r977, r163, r979; +} +{ +add.f16x2 r980, r956, 
r977; +} +{ +cvt.rn.f16.f64 rs121, fd847; +} +mov.b32 r985, {rs121, rs121}; +{ +mul.f16x2 r983, r169, r985; +} +{ +add.f16x2 r986, r962, r983; +} +mov.f64 fd848, 0dBFC9C4266041CA8F; +{ +cvt.rn.f16.f64 rs122, fd848; +} +mov.b32 r991, {rs122, rs122}; +{ +mul.f16x2 r989, r178, r991; +} +{ +add.f16x2 r992, r968, r989; +} +{ +cvt.rn.f16.f64 rs123, fd847; +} +mov.b32 r997, {rs123, rs123}; +{ +mul.f16x2 r995, r172, r997; +} +{ +add.f16x2 r998, r974, r995; +} +{ +cvt.rn.f16.f64 rs124, fd848; +} +mov.b32 r1003, {rs124, rs124}; +{ +mul.f16x2 r1001, r175, r1003; +} +{ +add.f16x2 r1004, r980, r1001; +} +{ +sub.f16x2 %4, r986, r992; +} +{ +add.f16x2 %5, r998, r1004; +} +{ +add.f16x2 %58, r986, r992; +} +{ +sub.f16x2 %59, r998, r1004; +} +cvt.rn.f16.s32 rs125, r5508; +mov.b32 r1031, {rs125, rs125}; +cvt.rn.f16.s32 rs126, r5508; +mov.b32 r1043, {rs126, rs126}; +{ +cvt.rn.f16.f64 rs127, fd863; +} +mov.b32 r1023, {rs127, rs127}; +{ +mul.f16x2 r1021, r1, r1023; +} +{ +add.f16x2 r1024, %62, r1021; +} +{ +cvt.rn.f16.f64 rs128, fd740; +} +mov.b32 r1029, {rs128, rs128}; +{ +mul.f16x2 r1027, r10, r1029; +} +{ +add.f16x2 r1030, r1031, r1027; +} +{ +cvt.rn.f16.f64 rs129, fd863; +} +mov.b32 r1035, {rs129, rs129}; +{ +mul.f16x2 r1033, r4, r1035; +} +{ +add.f16x2 r1036, %63, r1033; +} +{ +cvt.rn.f16.f64 rs130, fd740; +} +mov.b32 r1041, {rs130, rs130}; +{ +mul.f16x2 r1039, r7, r1041; +} +{ +add.f16x2 r1042, r1043, r1039; +} +{ +cvt.rn.f16.f64 rs131, fd887; +} +mov.b32 r1047, {rs131, rs131}; +{ +mul.f16x2 r1045, r13, r1047; +} +{ +add.f16x2 r1048, r1024, r1045; +} +{ +cvt.rn.f16.f64 rs132, fd760; +} +mov.b32 r1053, {rs132, rs132}; +{ +mul.f16x2 r1051, r22, r1053; +} +{ +add.f16x2 r1054, r1030, r1051; +} +{ +cvt.rn.f16.f64 rs133, fd887; +} +mov.b32 r1059, {rs133, rs133}; +{ +mul.f16x2 r1057, r16, r1059; +} +{ +add.f16x2 r1060, r1036, r1057; +} +{ +cvt.rn.f16.f64 rs134, fd760; +} +mov.b32 r1065, {rs134, rs134}; +{ +mul.f16x2 r1063, r19, r1065; +} +{ +add.f16x2 r1066, r1042, r1063; +} +{ 
+cvt.rn.f16.f64 rs135, fd891; +} +mov.b32 r1071, {rs135, rs135}; +{ +mul.f16x2 r1069, r25, r1071; +} +{ +add.f16x2 r1072, r1048, r1069; +} +{ +cvt.rn.f16.f64 rs136, fd892; +} +mov.b32 r1077, {rs136, rs136}; +{ +mul.f16x2 r1075, r34, r1077; +} +{ +add.f16x2 r1078, r1054, r1075; +} +{ +cvt.rn.f16.f64 rs137, fd891; +} +mov.b32 r1083, {rs137, rs137}; +{ +mul.f16x2 r1081, r28, r1083; +} +{ +add.f16x2 r1084, r1060, r1081; +} +{ +cvt.rn.f16.f64 rs138, fd892; +} +mov.b32 r1089, {rs138, rs138}; +{ +mul.f16x2 r1087, r31, r1089; +} +{ +add.f16x2 r1090, r1066, r1087; +} +{ +cvt.rn.f16.f64 rs139, fd867; +} +mov.b32 r1095, {rs139, rs139}; +{ +mul.f16x2 r1093, r37, r1095; +} +{ +add.f16x2 r1096, r1072, r1093; +} +{ +cvt.rn.f16.f64 rs140, fd868; +} +mov.b32 r1101, {rs140, rs140}; +{ +mul.f16x2 r1099, r46, r1101; +} +{ +add.f16x2 r1102, r1078, r1099; +} +{ +cvt.rn.f16.f64 rs141, fd867; +} +mov.b32 r1107, {rs141, rs141}; +{ +mul.f16x2 r1105, r40, r1107; +} +{ +add.f16x2 r1108, r1084, r1105; +} +{ +cvt.rn.f16.f64 rs142, fd868; +} +mov.b32 r1113, {rs142, rs142}; +{ +mul.f16x2 r1111, r43, r1113; +} +{ +add.f16x2 r1114, r1090, r1111; +} +{ +cvt.rn.f16.f64 rs143, fd843; +} +mov.b32 r1119, {rs143, rs143}; +{ +mul.f16x2 r1117, r49, r1119; +} +{ +add.f16x2 r1120, r1096, r1117; +} +{ +cvt.rn.f16.f64 rs144, fd844; +} +mov.b32 r1125, {rs144, rs144}; +{ +mul.f16x2 r1123, r58, r1125; +} +{ +add.f16x2 r1126, r1102, r1123; +} +{ +cvt.rn.f16.f64 rs145, fd843; +} +mov.b32 r1131, {rs145, rs145}; +{ +mul.f16x2 r1129, r52, r1131; +} +{ +add.f16x2 r1132, r1108, r1129; +} +{ +cvt.rn.f16.f64 rs146, fd844; +} +mov.b32 r1137, {rs146, rs146}; +{ +mul.f16x2 r1135, r55, r1137; +} +{ +add.f16x2 r1138, r1114, r1135; +} +{ +cvt.rn.f16.f64 rs147, fd859; +} +mov.b32 r1143, {rs147, rs147}; +{ +mul.f16x2 r1141, r61, r1143; +} +{ +add.f16x2 r1144, r1120, r1141; +} +{ +cvt.rn.f16.f64 rs148, fd572; +} +mov.b32 r1149, {rs148, rs148}; +{ +mul.f16x2 r1147, r70, r1149; +} +{ +add.f16x2 r1150, r1126, r1147; +} +{ 
+cvt.rn.f16.f64 rs149, fd859; +} +mov.b32 r1155, {rs149, rs149}; +{ +mul.f16x2 r1153, r64, r1155; +} +{ +add.f16x2 r1156, r1132, r1153; +} +{ +cvt.rn.f16.f64 rs150, fd572; +} +mov.b32 r1161, {rs150, rs150}; +{ +mul.f16x2 r1159, r67, r1161; +} +{ +add.f16x2 r1162, r1138, r1159; +} +{ +cvt.rn.f16.f64 rs151, fd883; +} +mov.b32 r1167, {rs151, rs151}; +{ +mul.f16x2 r1165, r73, r1167; +} +{ +add.f16x2 r1168, r1144, r1165; +} +mov.f64 fd736, 0dBFECBAD095F50378; +{ +cvt.rn.f16.f64 rs152, fd736; +} +mov.b32 r1173, {rs152, rs152}; +{ +mul.f16x2 r1171, r82, r1173; +} +{ +add.f16x2 r1174, r1150, r1171; +} +{ +cvt.rn.f16.f64 rs153, fd883; +} +mov.b32 r1179, {rs153, rs153}; +{ +mul.f16x2 r1177, r76, r1179; +} +{ +add.f16x2 r1180, r1156, r1177; +} +{ +cvt.rn.f16.f64 rs154, fd736; +} +mov.b32 r1185, {rs154, rs154}; +{ +mul.f16x2 r1183, r79, r1185; +} +{ +add.f16x2 r1186, r1162, r1183; +} +{ +cvt.rn.f16.f64 rs155, fd895; +} +mov.b32 r1191, {rs155, rs155}; +{ +mul.f16x2 r1189, r85, r1191; +} +{ +add.f16x2 r1192, r1168, r1189; +} +{ +cvt.rn.f16.f64 rs156, fd896; +} +mov.b32 r1197, {rs156, rs156}; +{ +mul.f16x2 r1195, r94, r1197; +} +{ +add.f16x2 r1198, r1174, r1195; +} +{ +cvt.rn.f16.f64 rs157, fd895; +} +mov.b32 r1203, {rs157, rs157}; +{ +mul.f16x2 r1201, r88, r1203; +} +{ +add.f16x2 r1204, r1180, r1201; +} +{ +cvt.rn.f16.f64 rs158, fd896; +} +mov.b32 r1209, {rs158, rs158}; +{ +mul.f16x2 r1207, r91, r1209; +} +{ +add.f16x2 r1210, r1186, r1207; +} +{ +cvt.rn.f16.f64 rs159, fd871; +} +mov.b32 r1215, {rs159, rs159}; +{ +mul.f16x2 r1213, r97, r1215; +} +{ +add.f16x2 r1216, r1192, r1213; +} +mov.f64 fd872, 0dBFE73180A4B0D300; +{ +cvt.rn.f16.f64 rs160, fd872; +} +mov.b32 r1221, {rs160, rs160}; +{ +mul.f16x2 r1219, r106, r1221; +} +{ +add.f16x2 r1222, r1198, r1219; +} +{ +cvt.rn.f16.f64 rs161, fd871; +} +mov.b32 r1227, {rs161, rs161}; +{ +mul.f16x2 r1225, r100, r1227; +} +{ +add.f16x2 r1228, r1204, r1225; +} +{ +cvt.rn.f16.f64 rs162, fd872; +} +mov.b32 r1233, {rs162, rs162}; +{ +mul.f16x2 
r1231, r103, r1233; +} +{ +add.f16x2 r1234, r1210, r1231; +} +{ +cvt.rn.f16.f64 rs163, fd847; +} +mov.b32 r1239, {rs163, rs163}; +{ +mul.f16x2 r1237, r109, r1239; +} +{ +add.f16x2 r1240, r1216, r1237; +} +{ +cvt.rn.f16.f64 rs164, fd848; +} +mov.b32 r1245, {rs164, rs164}; +{ +mul.f16x2 r1243, r118, r1245; +} +{ +add.f16x2 r1246, r1222, r1243; +} +{ +cvt.rn.f16.f64 rs165, fd847; +} +mov.b32 r1251, {rs165, rs165}; +{ +mul.f16x2 r1249, r112, r1251; +} +{ +add.f16x2 r1252, r1228, r1249; +} +{ +cvt.rn.f16.f64 rs166, fd848; +} +mov.b32 r1257, {rs166, rs166}; +{ +mul.f16x2 r1255, r115, r1257; +} +{ +add.f16x2 r1258, r1234, r1255; +} +{ +cvt.rn.f16.f64 rs167, fd855; +} +mov.b32 r1263, {rs167, rs167}; +{ +mul.f16x2 r1261, r121, r1263; +} +{ +add.f16x2 r1264, r1240, r1261; +} +{ +cvt.rn.f16.f64 rs168, fd816; +} +mov.b32 r1269, {rs168, rs168}; +{ +mul.f16x2 r1267, r130, r1269; +} +{ +add.f16x2 r1270, r1246, r1267; +} +{ +cvt.rn.f16.f64 rs169, fd855; +} +mov.b32 r1275, {rs169, rs169}; +{ +mul.f16x2 r1273, r124, r1275; +} +{ +add.f16x2 r1276, r1252, r1273; +} +{ +cvt.rn.f16.f64 rs170, fd816; +} +mov.b32 r1281, {rs170, rs170}; +{ +mul.f16x2 r1279, r127, r1281; +} +{ +add.f16x2 r1282, r1258, r1279; +} +{ +cvt.rn.f16.f64 rs171, fd879; +} +mov.b32 r1287, {rs171, rs171}; +{ +mul.f16x2 r1285, r133, r1287; +} +{ +add.f16x2 r1288, r1264, r1285; +} +{ +cvt.rn.f16.f64 rs172, fd808; +} +mov.b32 r1293, {rs172, rs172}; +{ +mul.f16x2 r1291, r142, r1293; +} +{ +add.f16x2 r1294, r1270, r1291; +} +{ +cvt.rn.f16.f64 rs173, fd879; +} +mov.b32 r1299, {rs173, rs173}; +{ +mul.f16x2 r1297, r136, r1299; +} +{ +add.f16x2 r1300, r1276, r1297; +} +{ +cvt.rn.f16.f64 rs174, fd808; +} +mov.b32 r1305, {rs174, rs174}; +{ +mul.f16x2 r1303, r139, r1305; +} +{ +add.f16x2 r1306, r1282, r1303; +} +{ +cvt.rn.f16.f64 rs175, fd899; +} +mov.b32 r1311, {rs175, rs175}; +{ +mul.f16x2 r1309, r145, r1311; +} +{ +add.f16x2 r1312, r1288, r1309; +} +{ +cvt.rn.f16.f64 rs176, fd900; +} +mov.b32 r1317, {rs176, rs176}; +{ 
+mul.f16x2 r1315, r154, r1317; +} +{ +add.f16x2 r1318, r1294, r1315; +} +{ +cvt.rn.f16.f64 rs177, fd899; +} +mov.b32 r1323, {rs177, rs177}; +{ +mul.f16x2 r1321, r148, r1323; +} +{ +add.f16x2 r1324, r1300, r1321; +} +{ +cvt.rn.f16.f64 rs178, fd900; +} +mov.b32 r1329, {rs178, rs178}; +{ +mul.f16x2 r1327, r151, r1329; +} +{ +add.f16x2 r1330, r1306, r1327; +} +{ +cvt.rn.f16.f64 rs179, fd875; +} +mov.b32 r1335, {rs179, rs179}; +{ +mul.f16x2 r1333, r157, r1335; +} +{ +add.f16x2 r1336, r1312, r1333; +} +{ +cvt.rn.f16.f64 rs180, fd876; +} +mov.b32 r1341, {rs180, rs180}; +{ +mul.f16x2 r1339, r166, r1341; +} +{ +add.f16x2 r1342, r1318, r1339; +} +{ +cvt.rn.f16.f64 rs181, fd875; +} +mov.b32 r1347, {rs181, rs181}; +{ +mul.f16x2 r1345, r160, r1347; +} +{ +add.f16x2 r1348, r1324, r1345; +} +{ +cvt.rn.f16.f64 rs182, fd876; +} +mov.b32 r1353, {rs182, rs182}; +{ +mul.f16x2 r1351, r163, r1353; +} +{ +add.f16x2 r1354, r1330, r1351; +} +{ +cvt.rn.f16.f64 rs183, fd851; +} +mov.b32 r1359, {rs183, rs183}; +{ +mul.f16x2 r1357, r169, r1359; +} +{ +add.f16x2 r1360, r1336, r1357; +} +{ +cvt.rn.f16.f64 rs184, fd852; +} +mov.b32 r1365, {rs184, rs184}; +{ +mul.f16x2 r1363, r178, r1365; +} +{ +add.f16x2 r1366, r1342, r1363; +} +{ +cvt.rn.f16.f64 rs185, fd851; +} +mov.b32 r1371, {rs185, rs185}; +{ +mul.f16x2 r1369, r172, r1371; +} +{ +add.f16x2 r1372, r1348, r1369; +} +{ +cvt.rn.f16.f64 rs186, fd852; +} +mov.b32 r1377, {rs186, rs186}; +{ +mul.f16x2 r1375, r175, r1377; +} +{ +add.f16x2 r1378, r1354, r1375; +} +{ +sub.f16x2 %6, r1360, r1366; +} +{ +add.f16x2 %7, r1372, r1378; +} +{ +add.f16x2 %56, r1360, r1366; +} +{ +sub.f16x2 %57, r1372, r1378; +} +cvt.rn.f16.s32 rs187, r5508; +mov.b32 r1405, {rs187, rs187}; +cvt.rn.f16.s32 rs188, r5508; +mov.b32 r1417, {rs188, rs188}; +{ +cvt.rn.f16.f64 rs189, fd871; +} +mov.b32 r1397, {rs189, rs189}; +{ +mul.f16x2 r1395, r1, r1397; +} +{ +add.f16x2 r1398, %62, r1395; +} +{ +cvt.rn.f16.f64 rs190, fd624; +} +mov.b32 r1403, {rs190, rs190}; +{ +mul.f16x2 r1401, 
r10, r1403; +} +{ +add.f16x2 r1404, r1405, r1401; +} +{ +cvt.rn.f16.f64 rs191, fd871; +} +mov.b32 r1409, {rs191, rs191}; +{ +mul.f16x2 r1407, r4, r1409; +} +{ +add.f16x2 r1410, %63, r1407; +} +{ +cvt.rn.f16.f64 rs192, fd624; +} +mov.b32 r1415, {rs192, rs192}; +{ +mul.f16x2 r1413, r7, r1415; +} +{ +add.f16x2 r1416, r1417, r1413; +} +{ +cvt.rn.f16.f64 rs193, fd899; +} +mov.b32 r1421, {rs193, rs193}; +{ +mul.f16x2 r1419, r13, r1421; +} +{ +add.f16x2 r1422, r1398, r1419; +} +{ +cvt.rn.f16.f64 rs194, fd900; +} +mov.b32 r1427, {rs194, rs194}; +{ +mul.f16x2 r1425, r22, r1427; +} +{ +add.f16x2 r1428, r1404, r1425; +} +{ +cvt.rn.f16.f64 rs195, fd899; +} +mov.b32 r1433, {rs195, rs195}; +{ +mul.f16x2 r1431, r16, r1433; +} +{ +add.f16x2 r1434, r1410, r1431; +} +{ +cvt.rn.f16.f64 rs196, fd900; +} +mov.b32 r1439, {rs196, rs196}; +{ +mul.f16x2 r1437, r19, r1439; +} +{ +add.f16x2 r1440, r1416, r1437; +} +{ +cvt.rn.f16.f64 rs197, fd867; +} +mov.b32 r1445, {rs197, rs197}; +{ +mul.f16x2 r1443, r25, r1445; +} +{ +add.f16x2 r1446, r1422, r1443; +} +{ +cvt.rn.f16.f64 rs198, fd868; +} +mov.b32 r1451, {rs198, rs198}; +{ +mul.f16x2 r1449, r34, r1451; +} +{ +add.f16x2 r1452, r1428, r1449; +} +{ +cvt.rn.f16.f64 rs199, fd867; +} +mov.b32 r1457, {rs199, rs199}; +{ +mul.f16x2 r1455, r28, r1457; +} +{ +add.f16x2 r1458, r1434, r1455; +} +{ +cvt.rn.f16.f64 rs200, fd868; +} +mov.b32 r1463, {rs200, rs200}; +{ +mul.f16x2 r1461, r31, r1463; +} +{ +add.f16x2 r1464, r1440, r1461; +} +{ +cvt.rn.f16.f64 rs201, fd843; +} +mov.b32 r1469, {rs201, rs201}; +{ +mul.f16x2 r1467, r37, r1469; +} +{ +add.f16x2 r1470, r1446, r1467; +} +{ +cvt.rn.f16.f64 rs202, fd820; +} +mov.b32 r1475, {rs202, rs202}; +{ +mul.f16x2 r1473, r46, r1475; +} +{ +add.f16x2 r1476, r1452, r1473; +} +{ +cvt.rn.f16.f64 rs203, fd843; +} +mov.b32 r1481, {rs203, rs203}; +{ +mul.f16x2 r1479, r40, r1481; +} +{ +add.f16x2 r1482, r1458, r1479; +} +{ +cvt.rn.f16.f64 rs204, fd820; +} +mov.b32 r1487, {rs204, rs204}; +{ +mul.f16x2 r1485, r43, r1487; +} 
+{ +add.f16x2 r1488, r1464, r1485; +} +{ +cvt.rn.f16.f64 rs205, fd875; +} +mov.b32 r1493, {rs205, rs205}; +{ +mul.f16x2 r1491, r49, r1493; +} +{ +add.f16x2 r1494, r1470, r1491; +} +{ +cvt.rn.f16.f64 rs206, fd708; +} +mov.b32 r1499, {rs206, rs206}; +{ +mul.f16x2 r1497, r58, r1499; +} +{ +add.f16x2 r1500, r1476, r1497; +} +{ +cvt.rn.f16.f64 rs207, fd875; +} +mov.b32 r1505, {rs207, rs207}; +{ +mul.f16x2 r1503, r52, r1505; +} +{ +add.f16x2 r1506, r1482, r1503; +} +{ +cvt.rn.f16.f64 rs208, fd708; +} +mov.b32 r1511, {rs208, rs208}; +{ +mul.f16x2 r1509, r55, r1511; +} +{ +add.f16x2 r1512, r1488, r1509; +} +{ +cvt.rn.f16.f64 rs209, fd895; +} +mov.b32 r1517, {rs209, rs209}; +{ +mul.f16x2 r1515, r61, r1517; +} +{ +add.f16x2 r1518, r1494, r1515; +} +{ +cvt.rn.f16.f64 rs210, fd896; +} +mov.b32 r1523, {rs210, rs210}; +{ +mul.f16x2 r1521, r70, r1523; +} +{ +add.f16x2 r1524, r1500, r1521; +} +{ +cvt.rn.f16.f64 rs211, fd895; +} +mov.b32 r1529, {rs211, rs211}; +{ +mul.f16x2 r1527, r64, r1529; +} +{ +add.f16x2 r1530, r1506, r1527; +} +{ +cvt.rn.f16.f64 rs212, fd896; +} +mov.b32 r1535, {rs212, rs212}; +{ +mul.f16x2 r1533, r67, r1535; +} +{ +add.f16x2 r1536, r1512, r1533; +} +{ +cvt.rn.f16.f64 rs213, fd863; +} +mov.b32 r1541, {rs213, rs213}; +{ +mul.f16x2 r1539, r73, r1541; +} +{ +add.f16x2 r1542, r1518, r1539; +} +{ +cvt.rn.f16.f64 rs214, fd864; +} +mov.b32 r1547, {rs214, rs214}; +{ +mul.f16x2 r1545, r82, r1547; +} +{ +add.f16x2 r1548, r1524, r1545; +} +{ +cvt.rn.f16.f64 rs215, fd863; +} +mov.b32 r1553, {rs215, rs215}; +{ +mul.f16x2 r1551, r76, r1553; +} +{ +add.f16x2 r1554, r1530, r1551; +} +{ +cvt.rn.f16.f64 rs216, fd864; +} +mov.b32 r1559, {rs216, rs216}; +{ +mul.f16x2 r1557, r79, r1559; +} +{ +add.f16x2 r1560, r1536, r1557; +} +{ +cvt.rn.f16.f64 rs217, fd847; +} +mov.b32 r1565, {rs217, rs217}; +{ +mul.f16x2 r1563, r85, r1565; +} +{ +add.f16x2 r1566, r1542, r1563; +} +{ +cvt.rn.f16.f64 rs218, fd768; +} +mov.b32 r1571, {rs218, rs218}; +{ +mul.f16x2 r1569, r94, r1571; +} +{ 
+add.f16x2 r1572, r1548, r1569; +} +{ +cvt.rn.f16.f64 rs219, fd847; +} +mov.b32 r1577, {rs219, rs219}; +{ +mul.f16x2 r1575, r88, r1577; +} +{ +add.f16x2 r1578, r1554, r1575; +} +{ +cvt.rn.f16.f64 rs220, fd768; +} +mov.b32 r1583, {rs220, rs220}; +{ +mul.f16x2 r1581, r91, r1583; +} +{ +add.f16x2 r1584, r1560, r1581; +} +{ +cvt.rn.f16.f64 rs221, fd879; +} +mov.b32 r1589, {rs221, rs221}; +{ +mul.f16x2 r1587, r97, r1589; +} +{ +add.f16x2 r1590, r1566, r1587; +} +{ +cvt.rn.f16.f64 rs222, fd808; +} +mov.b32 r1595, {rs222, rs222}; +{ +mul.f16x2 r1593, r106, r1595; +} +{ +add.f16x2 r1596, r1572, r1593; +} +{ +cvt.rn.f16.f64 rs223, fd879; +} +mov.b32 r1601, {rs223, rs223}; +{ +mul.f16x2 r1599, r100, r1601; +} +{ +add.f16x2 r1602, r1578, r1599; +} +{ +cvt.rn.f16.f64 rs224, fd808; +} +mov.b32 r1607, {rs224, rs224}; +{ +mul.f16x2 r1605, r103, r1607; +} +{ +add.f16x2 r1608, r1584, r1605; +} +{ +cvt.rn.f16.f64 rs225, fd891; +} +mov.b32 r1613, {rs225, rs225}; +{ +mul.f16x2 r1611, r109, r1613; +} +{ +add.f16x2 r1614, r1590, r1611; +} +{ +cvt.rn.f16.f64 rs226, fd892; +} +mov.b32 r1619, {rs226, rs226}; +{ +mul.f16x2 r1617, r118, r1619; +} +{ +add.f16x2 r1620, r1596, r1617; +} +{ +cvt.rn.f16.f64 rs227, fd891; +} +mov.b32 r1625, {rs227, rs227}; +{ +mul.f16x2 r1623, r112, r1625; +} +{ +add.f16x2 r1626, r1602, r1623; +} +{ +cvt.rn.f16.f64 rs228, fd892; +} +mov.b32 r1631, {rs228, rs228}; +{ +mul.f16x2 r1629, r115, r1631; +} +{ +add.f16x2 r1632, r1608, r1629; +} +{ +cvt.rn.f16.f64 rs229, fd859; +} +mov.b32 r1637, {rs229, rs229}; +{ +mul.f16x2 r1635, r121, r1637; +} +{ +add.f16x2 r1638, r1614, r1635; +} +{ +cvt.rn.f16.f64 rs230, fd860; +} +mov.b32 r1643, {rs230, rs230}; +{ +mul.f16x2 r1641, r130, r1643; +} +{ +add.f16x2 r1644, r1620, r1641; +} +{ +cvt.rn.f16.f64 rs231, fd859; +} +mov.b32 r1649, {rs231, rs231}; +{ +mul.f16x2 r1647, r124, r1649; +} +{ +add.f16x2 r1650, r1626, r1647; +} +{ +cvt.rn.f16.f64 rs232, fd860; +} +mov.b32 r1655, {rs232, rs232}; +{ +mul.f16x2 r1653, r127, r1655; +} +{ 
+add.f16x2 r1656, r1632, r1653; +} +{ +cvt.rn.f16.f64 rs233, fd851; +} +mov.b32 r1661, {rs233, rs233}; +{ +mul.f16x2 r1659, r133, r1661; +} +{ +add.f16x2 r1662, r1638, r1659; +} +mov.f64 fd676, 0dBFD328C3F1B322CB; +{ +cvt.rn.f16.f64 rs234, fd676; +} +mov.b32 r1667, {rs234, rs234}; +{ +mul.f16x2 r1665, r142, r1667; +} +{ +add.f16x2 r1668, r1644, r1665; +} +{ +cvt.rn.f16.f64 rs235, fd851; +} +mov.b32 r1673, {rs235, rs235}; +{ +mul.f16x2 r1671, r136, r1673; +} +{ +add.f16x2 r1674, r1650, r1671; +} +{ +cvt.rn.f16.f64 rs236, fd676; +} +mov.b32 r1679, {rs236, rs236}; +{ +mul.f16x2 r1677, r139, r1679; +} +{ +add.f16x2 r1680, r1656, r1677; +} +{ +cvt.rn.f16.f64 rs237, fd883; +} +mov.b32 r1685, {rs237, rs237}; +{ +mul.f16x2 r1683, r145, r1685; +} +{ +add.f16x2 r1686, r1662, r1683; +} +{ +cvt.rn.f16.f64 rs238, fd736; +} +mov.b32 r1691, {rs238, rs238}; +{ +mul.f16x2 r1689, r154, r1691; +} +{ +add.f16x2 r1692, r1668, r1689; +} +{ +cvt.rn.f16.f64 rs239, fd883; +} +mov.b32 r1697, {rs239, rs239}; +{ +mul.f16x2 r1695, r148, r1697; +} +{ +add.f16x2 r1698, r1674, r1695; +} +{ +cvt.rn.f16.f64 rs240, fd736; +} +mov.b32 r1703, {rs240, rs240}; +{ +mul.f16x2 r1701, r151, r1703; +} +{ +add.f16x2 r1704, r1680, r1701; +} +{ +cvt.rn.f16.f64 rs241, fd887; +} +mov.b32 r1709, {rs241, rs241}; +{ +mul.f16x2 r1707, r157, r1709; +} +{ +add.f16x2 r1710, r1686, r1707; +} +mov.f64 fd888, 0dBFEE0210C26A6E6F; +{ +cvt.rn.f16.f64 rs242, fd888; +} +mov.b32 r1715, {rs242, rs242}; +{ +mul.f16x2 r1713, r166, r1715; +} +{ +add.f16x2 r1716, r1692, r1713; +} +{ +cvt.rn.f16.f64 rs243, fd887; +} +mov.b32 r1721, {rs243, rs243}; +{ +mul.f16x2 r1719, r160, r1721; +} +{ +add.f16x2 r1722, r1698, r1719; +} +{ +cvt.rn.f16.f64 rs244, fd888; +} +mov.b32 r1727, {rs244, rs244}; +{ +mul.f16x2 r1725, r163, r1727; +} +{ +add.f16x2 r1728, r1704, r1725; +} +{ +cvt.rn.f16.f64 rs245, fd855; +} +mov.b32 r1733, {rs245, rs245}; +{ +mul.f16x2 r1731, r169, r1733; +} +{ +add.f16x2 r1734, r1710, r1731; +} +mov.f64 fd856, 
0dBFD93D20572CA90B; +{ +cvt.rn.f16.f64 rs246, fd856; +} +mov.b32 r1739, {rs246, rs246}; +{ +mul.f16x2 r1737, r178, r1739; +} +{ +add.f16x2 r1740, r1716, r1737; +} +{ +cvt.rn.f16.f64 rs247, fd855; +} +mov.b32 r1745, {rs247, rs247}; +{ +mul.f16x2 r1743, r172, r1745; +} +{ +add.f16x2 r1746, r1722, r1743; +} +{ +cvt.rn.f16.f64 rs248, fd856; +} +mov.b32 r1751, {rs248, rs248}; +{ +mul.f16x2 r1749, r175, r1751; +} +{ +add.f16x2 r1752, r1728, r1749; +} +{ +sub.f16x2 %8, r1734, r1740; +} +{ +add.f16x2 %9, r1746, r1752; +} +{ +add.f16x2 %54, r1734, r1740; +} +{ +sub.f16x2 %55, r1746, r1752; +} +cvt.rn.f16.s32 rs249, r5508; +mov.b32 r1779, {rs249, rs249}; +cvt.rn.f16.s32 rs250, r5508; +mov.b32 r1791, {rs250, rs250}; +{ +cvt.rn.f16.f64 rs251, fd879; +} +mov.b32 r1771, {rs251, rs251}; +{ +mul.f16x2 r1769, r1, r1771; +} +{ +add.f16x2 r1772, %62, r1769; +} +{ +cvt.rn.f16.f64 rs252, fd808; +} +mov.b32 r1777, {rs252, rs252}; +{ +mul.f16x2 r1775, r10, r1777; +} +{ +add.f16x2 r1778, r1779, r1775; +} +{ +cvt.rn.f16.f64 rs253, fd879; +} +mov.b32 r1783, {rs253, rs253}; +{ +mul.f16x2 r1781, r4, r1783; +} +{ +add.f16x2 r1784, %63, r1781; +} +{ +cvt.rn.f16.f64 rs254, fd808; +} +mov.b32 r1789, {rs254, rs254}; +{ +mul.f16x2 r1787, r7, r1789; +} +{ +add.f16x2 r1790, r1791, r1787; +} +{ +cvt.rn.f16.f64 rs255, fd883; +} +mov.b32 r1795, {rs255, rs255}; +{ +mul.f16x2 r1793, r13, r1795; +} +{ +add.f16x2 r1796, r1772, r1793; +} +{ +cvt.rn.f16.f64 rs256, fd884; +} +mov.b32 r1801, {rs256, rs256}; +{ +mul.f16x2 r1799, r22, r1801; +} +{ +add.f16x2 r1802, r1778, r1799; +} +{ +cvt.rn.f16.f64 rs257, fd883; +} +mov.b32 r1807, {rs257, rs257}; +{ +mul.f16x2 r1805, r16, r1807; +} +{ +add.f16x2 r1808, r1784, r1805; +} +{ +cvt.rn.f16.f64 rs258, fd884; +} +mov.b32 r1813, {rs258, rs258}; +{ +mul.f16x2 r1811, r19, r1813; +} +{ +add.f16x2 r1814, r1790, r1811; +} +{ +cvt.rn.f16.f64 rs259, fd843; +} +mov.b32 r1819, {rs259, rs259}; +{ +mul.f16x2 r1817, r25, r1819; +} +{ +add.f16x2 r1820, r1796, r1817; +} +{ 
+cvt.rn.f16.f64 rs260, fd844; +} +mov.b32 r1825, {rs260, rs260}; +{ +mul.f16x2 r1823, r34, r1825; +} +{ +add.f16x2 r1826, r1802, r1823; +} +{ +cvt.rn.f16.f64 rs261, fd843; +} +mov.b32 r1831, {rs261, rs261}; +{ +mul.f16x2 r1829, r28, r1831; +} +{ +add.f16x2 r1832, r1808, r1829; +} +{ +cvt.rn.f16.f64 rs262, fd844; +} +mov.b32 r1837, {rs262, rs262}; +{ +mul.f16x2 r1835, r31, r1837; +} +{ +add.f16x2 r1838, r1814, r1835; +} +{ +cvt.rn.f16.f64 rs263, fd875; +} +mov.b32 r1843, {rs263, rs263}; +{ +mul.f16x2 r1841, r37, r1843; +} +{ +add.f16x2 r1844, r1820, r1841; +} +{ +cvt.rn.f16.f64 rs264, fd708; +} +mov.b32 r1849, {rs264, rs264}; +{ +mul.f16x2 r1847, r46, r1849; +} +{ +add.f16x2 r1850, r1826, r1847; +} +{ +cvt.rn.f16.f64 rs265, fd875; +} +mov.b32 r1855, {rs265, rs265}; +{ +mul.f16x2 r1853, r40, r1855; +} +{ +add.f16x2 r1856, r1832, r1853; +} +{ +cvt.rn.f16.f64 rs266, fd708; +} +mov.b32 r1861, {rs266, rs266}; +{ +mul.f16x2 r1859, r43, r1861; +} +{ +add.f16x2 r1862, r1838, r1859; +} +{ +cvt.rn.f16.f64 rs267, fd887; +} +mov.b32 r1867, {rs267, rs267}; +{ +mul.f16x2 r1865, r49, r1867; +} +{ +add.f16x2 r1868, r1844, r1865; +} +{ +cvt.rn.f16.f64 rs268, fd888; +} +mov.b32 r1873, {rs268, rs268}; +{ +mul.f16x2 r1871, r58, r1873; +} +{ +add.f16x2 r1874, r1850, r1871; +} +{ +cvt.rn.f16.f64 rs269, fd887; +} +mov.b32 r1879, {rs269, rs269}; +{ +mul.f16x2 r1877, r52, r1879; +} +{ +add.f16x2 r1880, r1856, r1877; +} +{ +cvt.rn.f16.f64 rs270, fd888; +} +mov.b32 r1885, {rs270, rs270}; +{ +mul.f16x2 r1883, r55, r1885; +} +{ +add.f16x2 r1886, r1862, r1883; +} +{ +cvt.rn.f16.f64 rs271, fd847; +} +mov.b32 r1891, {rs271, rs271}; +{ +mul.f16x2 r1889, r61, r1891; +} +{ +add.f16x2 r1892, r1868, r1889; +} +{ +cvt.rn.f16.f64 rs272, fd848; +} +mov.b32 r1897, {rs272, rs272}; +{ +mul.f16x2 r1895, r70, r1897; +} +{ +add.f16x2 r1898, r1874, r1895; +} +{ +cvt.rn.f16.f64 rs273, fd847; +} +mov.b32 r1903, {rs273, rs273}; +{ +mul.f16x2 r1901, r64, r1903; +} +{ +add.f16x2 r1904, r1880, r1901; +} +{ 
+cvt.rn.f16.f64 rs274, fd848; +} +mov.b32 r1909, {rs274, rs274}; +{ +mul.f16x2 r1907, r67, r1909; +} +{ +add.f16x2 r1910, r1886, r1907; +} +{ +cvt.rn.f16.f64 rs275, fd871; +} +mov.b32 r1915, {rs275, rs275}; +{ +mul.f16x2 r1913, r73, r1915; +} +{ +add.f16x2 r1916, r1892, r1913; +} +{ +cvt.rn.f16.f64 rs276, fd624; +} +mov.b32 r1921, {rs276, rs276}; +{ +mul.f16x2 r1919, r82, r1921; +} +{ +add.f16x2 r1922, r1898, r1919; +} +{ +cvt.rn.f16.f64 rs277, fd871; +} +mov.b32 r1927, {rs277, rs277}; +{ +mul.f16x2 r1925, r76, r1927; +} +{ +add.f16x2 r1928, r1904, r1925; +} +{ +cvt.rn.f16.f64 rs278, fd624; +} +mov.b32 r1933, {rs278, rs278}; +{ +mul.f16x2 r1931, r79, r1933; +} +{ +add.f16x2 r1934, r1910, r1931; +} +{ +cvt.rn.f16.f64 rs279, fd891; +} +mov.b32 r1939, {rs279, rs279}; +{ +mul.f16x2 r1937, r85, r1939; +} +{ +add.f16x2 r1940, r1916, r1937; +} +{ +cvt.rn.f16.f64 rs280, fd892; +} +mov.b32 r1945, {rs280, rs280}; +{ +mul.f16x2 r1943, r94, r1945; +} +{ +add.f16x2 r1946, r1922, r1943; +} +{ +cvt.rn.f16.f64 rs281, fd891; +} +mov.b32 r1951, {rs281, rs281}; +{ +mul.f16x2 r1949, r88, r1951; +} +{ +add.f16x2 r1952, r1928, r1949; +} +{ +cvt.rn.f16.f64 rs282, fd892; +} +mov.b32 r1957, {rs282, rs282}; +{ +mul.f16x2 r1955, r91, r1957; +} +{ +add.f16x2 r1958, r1934, r1955; +} +{ +cvt.rn.f16.f64 rs283, fd851; +} +mov.b32 r1963, {rs283, rs283}; +{ +mul.f16x2 r1961, r97, r1963; +} +{ +add.f16x2 r1964, r1940, r1961; +} +{ +cvt.rn.f16.f64 rs284, fd852; +} +mov.b32 r1969, {rs284, rs284}; +{ +mul.f16x2 r1967, r106, r1969; +} +{ +add.f16x2 r1970, r1946, r1967; +} +{ +cvt.rn.f16.f64 rs285, fd851; +} +mov.b32 r1975, {rs285, rs285}; +{ +mul.f16x2 r1973, r100, r1975; +} +{ +add.f16x2 r1976, r1952, r1973; +} +{ +cvt.rn.f16.f64 rs286, fd852; +} +mov.b32 r1981, {rs286, rs286}; +{ +mul.f16x2 r1979, r103, r1981; +} +{ +add.f16x2 r1982, r1958, r1979; +} +{ +cvt.rn.f16.f64 rs287, fd867; +} +mov.b32 r1987, {rs287, rs287}; +{ +mul.f16x2 r1985, r109, r1987; +} +{ +add.f16x2 r1988, r1964, r1985; +} +mov.f64 
fd812, 0dBFE4D80B1AD9CCF6; +{ +cvt.rn.f16.f64 rs288, fd812; +} +mov.b32 r1993, {rs288, rs288}; +{ +mul.f16x2 r1991, r118, r1993; +} +{ +add.f16x2 r1994, r1970, r1991; +} +{ +cvt.rn.f16.f64 rs289, fd867; +} +mov.b32 r1999, {rs289, rs289}; +{ +mul.f16x2 r1997, r112, r1999; +} +{ +add.f16x2 r2000, r1976, r1997; +} +{ +cvt.rn.f16.f64 rs290, fd812; +} +mov.b32 r2005, {rs290, rs290}; +{ +mul.f16x2 r2003, r115, r2005; +} +{ +add.f16x2 r2006, r1982, r2003; +} +{ +cvt.rn.f16.f64 rs291, fd895; +} +mov.b32 r2011, {rs291, rs291}; +{ +mul.f16x2 r2009, r121, r2011; +} +{ +add.f16x2 r2012, r1988, r2009; +} +{ +cvt.rn.f16.f64 rs292, fd896; +} +mov.b32 r2017, {rs292, rs292}; +{ +mul.f16x2 r2015, r130, r2017; +} +{ +add.f16x2 r2018, r1994, r2015; +} +{ +cvt.rn.f16.f64 rs293, fd895; +} +mov.b32 r2023, {rs293, rs293}; +{ +mul.f16x2 r2021, r124, r2023; +} +{ +add.f16x2 r2024, r2000, r2021; +} +{ +cvt.rn.f16.f64 rs294, fd896; +} +mov.b32 r2029, {rs294, rs294}; +{ +mul.f16x2 r2027, r127, r2029; +} +{ +add.f16x2 r2030, r2006, r2027; +} +{ +cvt.rn.f16.f64 rs295, fd855; +} +mov.b32 r2035, {rs295, rs295}; +{ +mul.f16x2 r2033, r133, r2035; +} +{ +add.f16x2 r2036, r2012, r2033; +} +{ +cvt.rn.f16.f64 rs296, fd856; +} +mov.b32 r2041, {rs296, rs296}; +{ +mul.f16x2 r2039, r142, r2041; +} +{ +add.f16x2 r2042, r2018, r2039; +} +{ +cvt.rn.f16.f64 rs297, fd855; +} +mov.b32 r2047, {rs297, rs297}; +{ +mul.f16x2 r2045, r136, r2047; +} +{ +add.f16x2 r2048, r2024, r2045; +} +{ +cvt.rn.f16.f64 rs298, fd856; +} +mov.b32 r2053, {rs298, rs298}; +{ +mul.f16x2 r2051, r139, r2053; +} +{ +add.f16x2 r2054, r2030, r2051; +} +{ +cvt.rn.f16.f64 rs299, fd863; +} +mov.b32 r2059, {rs299, rs299}; +{ +mul.f16x2 r2057, r145, r2059; +} +{ +add.f16x2 r2060, r2036, r2057; +} +{ +cvt.rn.f16.f64 rs300, fd740; +} +mov.b32 r2065, {rs300, rs300}; +{ +mul.f16x2 r2063, r154, r2065; +} +{ +add.f16x2 r2066, r2042, r2063; +} +{ +cvt.rn.f16.f64 rs301, fd863; +} +mov.b32 r2071, {rs301, rs301}; +{ +mul.f16x2 r2069, r148, r2071; +} +{ 
+add.f16x2 r2072, r2048, r2069; +} +{ +cvt.rn.f16.f64 rs302, fd740; +} +mov.b32 r2077, {rs302, rs302}; +{ +mul.f16x2 r2075, r151, r2077; +} +{ +add.f16x2 r2078, r2054, r2075; +} +{ +cvt.rn.f16.f64 rs303, fd899; +} +mov.b32 r2083, {rs303, rs303}; +{ +mul.f16x2 r2081, r157, r2083; +} +{ +add.f16x2 r2084, r2060, r2081; +} +{ +cvt.rn.f16.f64 rs304, fd900; +} +mov.b32 r2089, {rs304, rs304}; +{ +mul.f16x2 r2087, r166, r2089; +} +{ +add.f16x2 r2090, r2066, r2087; +} +{ +cvt.rn.f16.f64 rs305, fd899; +} +mov.b32 r2095, {rs305, rs305}; +{ +mul.f16x2 r2093, r160, r2095; +} +{ +add.f16x2 r2096, r2072, r2093; +} +{ +cvt.rn.f16.f64 rs306, fd900; +} +mov.b32 r2101, {rs306, rs306}; +{ +mul.f16x2 r2099, r163, r2101; +} +{ +add.f16x2 r2102, r2078, r2099; +} +{ +cvt.rn.f16.f64 rs307, fd859; +} +mov.b32 r2107, {rs307, rs307}; +{ +mul.f16x2 r2105, r169, r2107; +} +{ +add.f16x2 r2108, r2084, r2105; +} +{ +cvt.rn.f16.f64 rs308, fd860; +} +mov.b32 r2113, {rs308, rs308}; +{ +mul.f16x2 r2111, r178, r2113; +} +{ +add.f16x2 r2114, r2090, r2111; +} +{ +cvt.rn.f16.f64 rs309, fd859; +} +mov.b32 r2119, {rs309, rs309}; +{ +mul.f16x2 r2117, r172, r2119; +} +{ +add.f16x2 r2120, r2096, r2117; +} +{ +cvt.rn.f16.f64 rs310, fd860; +} +mov.b32 r2125, {rs310, rs310}; +{ +mul.f16x2 r2123, r175, r2125; +} +{ +add.f16x2 r2126, r2102, r2123; +} +{ +sub.f16x2 %10, r2108, r2114; +} +{ +add.f16x2 %11, r2120, r2126; +} +{ +add.f16x2 %52, r2108, r2114; +} +{ +sub.f16x2 %53, r2120, r2126; +} +cvt.rn.f16.s32 rs311, r5508; +mov.b32 r2153, {rs311, rs311}; +cvt.rn.f16.s32 rs312, r5508; +mov.b32 r2165, {rs312, rs312}; +{ +cvt.rn.f16.f64 rs313, fd887; +} +mov.b32 r2145, {rs313, rs313}; +{ +mul.f16x2 r2143, r1, r2145; +} +{ +add.f16x2 r2146, %62, r2143; +} +{ +cvt.rn.f16.f64 rs314, fd760; +} +mov.b32 r2151, {rs314, rs314}; +{ +mul.f16x2 r2149, r10, r2151; +} +{ +add.f16x2 r2152, r2153, r2149; +} +{ +cvt.rn.f16.f64 rs315, fd887; +} +mov.b32 r2157, {rs315, rs315}; +{ +mul.f16x2 r2155, r4, r2157; +} +{ +add.f16x2 r2158, %63, 
r2155; +} +{ +cvt.rn.f16.f64 rs316, fd760; +} +mov.b32 r2163, {rs316, rs316}; +{ +mul.f16x2 r2161, r7, r2163; +} +{ +add.f16x2 r2164, r2165, r2161; +} +{ +cvt.rn.f16.f64 rs317, fd867; +} +mov.b32 r2169, {rs317, rs317}; +{ +mul.f16x2 r2167, r13, r2169; +} +{ +add.f16x2 r2170, r2146, r2167; +} +{ +cvt.rn.f16.f64 rs318, fd868; +} +mov.b32 r2175, {rs318, rs318}; +{ +mul.f16x2 r2173, r22, r2175; +} +{ +add.f16x2 r2176, r2152, r2173; +} +{ +cvt.rn.f16.f64 rs319, fd867; +} +mov.b32 r2181, {rs319, rs319}; +{ +mul.f16x2 r2179, r16, r2181; +} +{ +add.f16x2 r2182, r2158, r2179; +} +{ +cvt.rn.f16.f64 rs320, fd868; +} +mov.b32 r2187, {rs320, rs320}; +{ +mul.f16x2 r2185, r19, r2187; +} +{ +add.f16x2 r2188, r2164, r2185; +} +{ +cvt.rn.f16.f64 rs321, fd859; +} +mov.b32 r2193, {rs321, rs321}; +{ +mul.f16x2 r2191, r25, r2193; +} +{ +add.f16x2 r2194, r2170, r2191; +} +{ +cvt.rn.f16.f64 rs322, fd572; +} +mov.b32 r2199, {rs322, rs322}; +{ +mul.f16x2 r2197, r34, r2199; +} +{ +add.f16x2 r2200, r2176, r2197; +} +{ +cvt.rn.f16.f64 rs323, fd859; +} +mov.b32 r2205, {rs323, rs323}; +{ +mul.f16x2 r2203, r28, r2205; +} +{ +add.f16x2 r2206, r2182, r2203; +} +{ +cvt.rn.f16.f64 rs324, fd572; +} +mov.b32 r2211, {rs324, rs324}; +{ +mul.f16x2 r2209, r31, r2211; +} +{ +add.f16x2 r2212, r2188, r2209; +} +{ +cvt.rn.f16.f64 rs325, fd895; +} +mov.b32 r2217, {rs325, rs325}; +{ +mul.f16x2 r2215, r37, r2217; +} +{ +add.f16x2 r2218, r2194, r2215; +} +{ +cvt.rn.f16.f64 rs326, fd896; +} +mov.b32 r2223, {rs326, rs326}; +{ +mul.f16x2 r2221, r46, r2223; +} +{ +add.f16x2 r2224, r2200, r2221; +} +{ +cvt.rn.f16.f64 rs327, fd895; +} +mov.b32 r2229, {rs327, rs327}; +{ +mul.f16x2 r2227, r40, r2229; +} +{ +add.f16x2 r2230, r2206, r2227; +} +{ +cvt.rn.f16.f64 rs328, fd896; +} +mov.b32 r2235, {rs328, rs328}; +{ +mul.f16x2 r2233, r43, r2235; +} +{ +add.f16x2 r2236, r2212, r2233; +} +{ +cvt.rn.f16.f64 rs329, fd847; +} +mov.b32 r2241, {rs329, rs329}; +{ +mul.f16x2 r2239, r49, r2241; +} +{ +add.f16x2 r2242, r2218, r2239; +} +{ 
+cvt.rn.f16.f64 rs330, fd848; +} +mov.b32 r2247, {rs330, rs330}; +{ +mul.f16x2 r2245, r58, r2247; +} +{ +add.f16x2 r2248, r2224, r2245; +} +{ +cvt.rn.f16.f64 rs331, fd847; +} +mov.b32 r2253, {rs331, rs331}; +{ +mul.f16x2 r2251, r52, r2253; +} +{ +add.f16x2 r2254, r2230, r2251; +} +{ +cvt.rn.f16.f64 rs332, fd848; +} +mov.b32 r2259, {rs332, rs332}; +{ +mul.f16x2 r2257, r55, r2259; +} +{ +add.f16x2 r2260, r2236, r2257; +} +{ +cvt.rn.f16.f64 rs333, fd879; +} +mov.b32 r2265, {rs333, rs333}; +{ +mul.f16x2 r2263, r61, r2265; +} +{ +add.f16x2 r2266, r2242, r2263; +} +{ +cvt.rn.f16.f64 rs334, fd808; +} +mov.b32 r2271, {rs334, rs334}; +{ +mul.f16x2 r2269, r70, r2271; +} +{ +add.f16x2 r2272, r2248, r2269; +} +{ +cvt.rn.f16.f64 rs335, fd879; +} +mov.b32 r2277, {rs335, rs335}; +{ +mul.f16x2 r2275, r64, r2277; +} +{ +add.f16x2 r2278, r2254, r2275; +} +{ +cvt.rn.f16.f64 rs336, fd808; +} +mov.b32 r2283, {rs336, rs336}; +{ +mul.f16x2 r2281, r67, r2283; +} +{ +add.f16x2 r2284, r2260, r2281; +} +{ +cvt.rn.f16.f64 rs337, fd875; +} +mov.b32 r2289, {rs337, rs337}; +{ +mul.f16x2 r2287, r73, r2289; +} +{ +add.f16x2 r2290, r2266, r2287; +} +{ +cvt.rn.f16.f64 rs338, fd876; +} +mov.b32 r2295, {rs338, rs338}; +{ +mul.f16x2 r2293, r82, r2295; +} +{ +add.f16x2 r2296, r2272, r2293; +} +{ +cvt.rn.f16.f64 rs339, fd875; +} +mov.b32 r2301, {rs339, rs339}; +{ +mul.f16x2 r2299, r76, r2301; +} +{ +add.f16x2 r2302, r2278, r2299; +} +{ +cvt.rn.f16.f64 rs340, fd876; +} +mov.b32 r2307, {rs340, rs340}; +{ +mul.f16x2 r2305, r79, r2307; +} +{ +add.f16x2 r2308, r2284, r2305; +} +{ +cvt.rn.f16.f64 rs341, fd851; +} +mov.b32 r2313, {rs341, rs341}; +{ +mul.f16x2 r2311, r85, r2313; +} +{ +add.f16x2 r2314, r2290, r2311; +} +{ +cvt.rn.f16.f64 rs342, fd676; +} +mov.b32 r2319, {rs342, rs342}; +{ +mul.f16x2 r2317, r94, r2319; +} +{ +add.f16x2 r2320, r2296, r2317; +} +{ +cvt.rn.f16.f64 rs343, fd851; +} +mov.b32 r2325, {rs343, rs343}; +{ +mul.f16x2 r2323, r88, r2325; +} +{ +add.f16x2 r2326, r2302, r2323; +} +{ 
+cvt.rn.f16.f64 rs344, fd676; +} +mov.b32 r2331, {rs344, rs344}; +{ +mul.f16x2 r2329, r91, r2331; +} +{ +add.f16x2 r2332, r2308, r2329; +} +{ +cvt.rn.f16.f64 rs345, fd899; +} +mov.b32 r2337, {rs345, rs345}; +{ +mul.f16x2 r2335, r97, r2337; +} +{ +add.f16x2 r2338, r2314, r2335; +} +mov.f64 fd504, 0dBFEFF57C5208CCF9; +{ +cvt.rn.f16.f64 rs346, fd504; +} +mov.b32 r2343, {rs346, rs346}; +{ +mul.f16x2 r2341, r106, r2343; +} +{ +add.f16x2 r2344, r2320, r2341; +} +{ +cvt.rn.f16.f64 rs347, fd899; +} +mov.b32 r2349, {rs347, rs347}; +{ +mul.f16x2 r2347, r100, r2349; +} +{ +add.f16x2 r2350, r2326, r2347; +} +{ +cvt.rn.f16.f64 rs348, fd504; +} +mov.b32 r2355, {rs348, rs348}; +{ +mul.f16x2 r2353, r103, r2355; +} +{ +add.f16x2 r2356, r2332, r2353; +} +{ +cvt.rn.f16.f64 rs349, fd855; +} +mov.b32 r2361, {rs349, rs349}; +{ +mul.f16x2 r2359, r109, r2361; +} +{ +add.f16x2 r2362, r2338, r2359; +} +{ +cvt.rn.f16.f64 rs350, fd856; +} +mov.b32 r2367, {rs350, rs350}; +{ +mul.f16x2 r2365, r118, r2367; +} +{ +add.f16x2 r2368, r2344, r2365; +} +{ +cvt.rn.f16.f64 rs351, fd855; +} +mov.b32 r2373, {rs351, rs351}; +{ +mul.f16x2 r2371, r112, r2373; +} +{ +add.f16x2 r2374, r2350, r2371; +} +{ +cvt.rn.f16.f64 rs352, fd856; +} +mov.b32 r2379, {rs352, rs352}; +{ +mul.f16x2 r2377, r115, r2379; +} +{ +add.f16x2 r2380, r2356, r2377; +} +{ +cvt.rn.f16.f64 rs353, fd871; +} +mov.b32 r2385, {rs353, rs353}; +{ +mul.f16x2 r2383, r121, r2385; +} +{ +add.f16x2 r2386, r2362, r2383; +} +{ +cvt.rn.f16.f64 rs354, fd624; +} +mov.b32 r2391, {rs354, rs354}; +{ +mul.f16x2 r2389, r130, r2391; +} +{ +add.f16x2 r2392, r2368, r2389; +} +{ +cvt.rn.f16.f64 rs355, fd871; +} +mov.b32 r2397, {rs355, rs355}; +{ +mul.f16x2 r2395, r124, r2397; +} +{ +add.f16x2 r2398, r2374, r2395; +} +{ +cvt.rn.f16.f64 rs356, fd624; +} +mov.b32 r2403, {rs356, rs356}; +{ +mul.f16x2 r2401, r127, r2403; +} +{ +add.f16x2 r2404, r2380, r2401; +} +{ +cvt.rn.f16.f64 rs357, fd883; +} +mov.b32 r2409, {rs357, rs357}; +{ +mul.f16x2 r2407, r133, r2409; +} +{ 
+add.f16x2 r2410, r2386, r2407; +} +{ +cvt.rn.f16.f64 rs358, fd884; +} +mov.b32 r2415, {rs358, rs358}; +{ +mul.f16x2 r2413, r142, r2415; +} +{ +add.f16x2 r2416, r2392, r2413; +} +{ +cvt.rn.f16.f64 rs359, fd883; +} +mov.b32 r2421, {rs359, rs359}; +{ +mul.f16x2 r2419, r136, r2421; +} +{ +add.f16x2 r2422, r2398, r2419; +} +{ +cvt.rn.f16.f64 rs360, fd884; +} +mov.b32 r2427, {rs360, rs360}; +{ +mul.f16x2 r2425, r139, r2427; +} +{ +add.f16x2 r2428, r2404, r2425; +} +{ +cvt.rn.f16.f64 rs361, fd843; +} +mov.b32 r2433, {rs361, rs361}; +{ +mul.f16x2 r2431, r145, r2433; +} +{ +add.f16x2 r2434, r2410, r2431; +} +{ +cvt.rn.f16.f64 rs362, fd820; +} +mov.b32 r2439, {rs362, rs362}; +{ +mul.f16x2 r2437, r154, r2439; +} +{ +add.f16x2 r2440, r2416, r2437; +} +{ +cvt.rn.f16.f64 rs363, fd843; +} +mov.b32 r2445, {rs363, rs363}; +{ +mul.f16x2 r2443, r148, r2445; +} +{ +add.f16x2 r2446, r2422, r2443; +} +{ +cvt.rn.f16.f64 rs364, fd820; +} +mov.b32 r2451, {rs364, rs364}; +{ +mul.f16x2 r2449, r151, r2451; +} +{ +add.f16x2 r2452, r2428, r2449; +} +{ +cvt.rn.f16.f64 rs365, fd891; +} +mov.b32 r2457, {rs365, rs365}; +{ +mul.f16x2 r2455, r157, r2457; +} +{ +add.f16x2 r2458, r2434, r2455; +} +{ +cvt.rn.f16.f64 rs366, fd804; +} +mov.b32 r2463, {rs366, rs366}; +{ +mul.f16x2 r2461, r166, r2463; +} +{ +add.f16x2 r2464, r2440, r2461; +} +{ +cvt.rn.f16.f64 rs367, fd891; +} +mov.b32 r2469, {rs367, rs367}; +{ +mul.f16x2 r2467, r160, r2469; +} +{ +add.f16x2 r2470, r2446, r2467; +} +{ +cvt.rn.f16.f64 rs368, fd804; +} +mov.b32 r2475, {rs368, rs368}; +{ +mul.f16x2 r2473, r163, r2475; +} +{ +add.f16x2 r2476, r2452, r2473; +} +{ +cvt.rn.f16.f64 rs369, fd863; +} +mov.b32 r2481, {rs369, rs369}; +{ +mul.f16x2 r2479, r169, r2481; +} +{ +add.f16x2 r2482, r2458, r2479; +} +{ +cvt.rn.f16.f64 rs370, fd864; +} +mov.b32 r2487, {rs370, rs370}; +{ +mul.f16x2 r2485, r178, r2487; +} +{ +add.f16x2 r2488, r2464, r2485; +} +{ +cvt.rn.f16.f64 rs371, fd863; +} +mov.b32 r2493, {rs371, rs371}; +{ +mul.f16x2 r2491, r172, r2493; +} 
+{ +add.f16x2 r2494, r2470, r2491; +} +{ +cvt.rn.f16.f64 rs372, fd864; +} +mov.b32 r2499, {rs372, rs372}; +{ +mul.f16x2 r2497, r175, r2499; +} +{ +add.f16x2 r2500, r2476, r2497; +} +{ +sub.f16x2 %12, r2482, r2488; +} +{ +add.f16x2 %13, r2494, r2500; +} +{ +add.f16x2 %50, r2482, r2488; +} +{ +sub.f16x2 %51, r2494, r2500; +} +cvt.rn.f16.s32 rs373, r5508; +mov.b32 r2527, {rs373, rs373}; +cvt.rn.f16.s32 rs374, r5508; +mov.b32 r2539, {rs374, rs374}; +{ +cvt.rn.f16.f64 rs375, fd895; +} +mov.b32 r2519, {rs375, rs375}; +{ +mul.f16x2 r2517, r1, r2519; +} +{ +add.f16x2 r2520, %62, r2517; +} +{ +cvt.rn.f16.f64 rs376, fd580; +} +mov.b32 r2525, {rs376, rs376}; +{ +mul.f16x2 r2523, r10, r2525; +} +{ +add.f16x2 r2526, r2527, r2523; +} +{ +cvt.rn.f16.f64 rs377, fd895; +} +mov.b32 r2531, {rs377, rs377}; +{ +mul.f16x2 r2529, r4, r2531; +} +{ +add.f16x2 r2532, %63, r2529; +} +{ +cvt.rn.f16.f64 rs378, fd580; +} +mov.b32 r2537, {rs378, rs378}; +{ +mul.f16x2 r2535, r7, r2537; +} +{ +add.f16x2 r2538, r2539, r2535; +} +{ +cvt.rn.f16.f64 rs379, fd851; +} +mov.b32 r2543, {rs379, rs379}; +{ +mul.f16x2 r2541, r13, r2543; +} +{ +add.f16x2 r2544, r2520, r2541; +} +{ +cvt.rn.f16.f64 rs380, fd852; +} +mov.b32 r2549, {rs380, rs380}; +{ +mul.f16x2 r2547, r22, r2549; +} +{ +add.f16x2 r2550, r2526, r2547; +} +{ +cvt.rn.f16.f64 rs381, fd851; +} +mov.b32 r2555, {rs381, rs381}; +{ +mul.f16x2 r2553, r16, r2555; +} +{ +add.f16x2 r2556, r2532, r2553; +} +{ +cvt.rn.f16.f64 rs382, fd852; +} +mov.b32 r2561, {rs382, rs382}; +{ +mul.f16x2 r2559, r19, r2561; +} +{ +add.f16x2 r2562, r2538, r2559; +} +{ +cvt.rn.f16.f64 rs383, fd883; +} +mov.b32 r2567, {rs383, rs383}; +{ +mul.f16x2 r2565, r25, r2567; +} +{ +add.f16x2 r2568, r2544, r2565; +} +{ +cvt.rn.f16.f64 rs384, fd736; +} +mov.b32 r2573, {rs384, rs384}; +{ +mul.f16x2 r2571, r34, r2573; +} +{ +add.f16x2 r2574, r2550, r2571; +} +{ +cvt.rn.f16.f64 rs385, fd883; +} +mov.b32 r2579, {rs385, rs385}; +{ +mul.f16x2 r2577, r28, r2579; +} +{ +add.f16x2 r2580, r2556, 
r2577; +} +{ +cvt.rn.f16.f64 rs386, fd736; +} +mov.b32 r2585, {rs386, rs386}; +{ +mul.f16x2 r2583, r31, r2585; +} +{ +add.f16x2 r2586, r2562, r2583; +} +{ +cvt.rn.f16.f64 rs387, fd863; +} +mov.b32 r2591, {rs387, rs387}; +{ +mul.f16x2 r2589, r37, r2591; +} +{ +add.f16x2 r2592, r2568, r2589; +} +{ +cvt.rn.f16.f64 rs388, fd864; +} +mov.b32 r2597, {rs388, rs388}; +{ +mul.f16x2 r2595, r46, r2597; +} +{ +add.f16x2 r2598, r2574, r2595; +} +{ +cvt.rn.f16.f64 rs389, fd863; +} +mov.b32 r2603, {rs389, rs389}; +{ +mul.f16x2 r2601, r40, r2603; +} +{ +add.f16x2 r2604, r2580, r2601; +} +{ +cvt.rn.f16.f64 rs390, fd864; +} +mov.b32 r2609, {rs390, rs390}; +{ +mul.f16x2 r2607, r43, r2609; +} +{ +add.f16x2 r2610, r2586, r2607; +} +{ +cvt.rn.f16.f64 rs391, fd871; +} +mov.b32 r2615, {rs391, rs391}; +{ +mul.f16x2 r2613, r49, r2615; +} +{ +add.f16x2 r2616, r2592, r2613; +} +{ +cvt.rn.f16.f64 rs392, fd624; +} +mov.b32 r2621, {rs392, rs392}; +{ +mul.f16x2 r2619, r58, r2621; +} +{ +add.f16x2 r2622, r2598, r2619; +} +{ +cvt.rn.f16.f64 rs393, fd871; +} +mov.b32 r2627, {rs393, rs393}; +{ +mul.f16x2 r2625, r52, r2627; +} +{ +add.f16x2 r2628, r2604, r2625; +} +{ +cvt.rn.f16.f64 rs394, fd624; +} +mov.b32 r2633, {rs394, rs394}; +{ +mul.f16x2 r2631, r55, r2633; +} +{ +add.f16x2 r2634, r2610, r2631; +} +{ +cvt.rn.f16.f64 rs395, fd875; +} +mov.b32 r2639, {rs395, rs395}; +{ +mul.f16x2 r2637, r61, r2639; +} +{ +add.f16x2 r2640, r2616, r2637; +} +{ +cvt.rn.f16.f64 rs396, fd876; +} +mov.b32 r2645, {rs396, rs396}; +{ +mul.f16x2 r2643, r70, r2645; +} +{ +add.f16x2 r2646, r2622, r2643; +} +{ +cvt.rn.f16.f64 rs397, fd875; +} +mov.b32 r2651, {rs397, rs397}; +{ +mul.f16x2 r2649, r64, r2651; +} +{ +add.f16x2 r2652, r2628, r2649; +} +{ +cvt.rn.f16.f64 rs398, fd876; +} +mov.b32 r2657, {rs398, rs398}; +{ +mul.f16x2 r2655, r67, r2657; +} +{ +add.f16x2 r2658, r2634, r2655; +} +{ +cvt.rn.f16.f64 rs399, fd859; +} +mov.b32 r2663, {rs399, rs399}; +{ +mul.f16x2 r2661, r73, r2663; +} +{ +add.f16x2 r2664, r2640, r2661; +} 
+{ +cvt.rn.f16.f64 rs400, fd572; +} +mov.b32 r2669, {rs400, rs400}; +{ +mul.f16x2 r2667, r82, r2669; +} +{ +add.f16x2 r2670, r2646, r2667; +} +{ +cvt.rn.f16.f64 rs401, fd859; +} +mov.b32 r2675, {rs401, rs401}; +{ +mul.f16x2 r2673, r76, r2675; +} +{ +add.f16x2 r2676, r2652, r2673; +} +{ +cvt.rn.f16.f64 rs402, fd572; +} +mov.b32 r2681, {rs402, rs402}; +{ +mul.f16x2 r2679, r79, r2681; +} +{ +add.f16x2 r2682, r2658, r2679; +} +{ +cvt.rn.f16.f64 rs403, fd887; +} +mov.b32 r2687, {rs403, rs403}; +{ +mul.f16x2 r2685, r85, r2687; +} +{ +add.f16x2 r2688, r2664, r2685; +} +{ +cvt.rn.f16.f64 rs404, fd888; +} +mov.b32 r2693, {rs404, rs404}; +{ +mul.f16x2 r2691, r94, r2693; +} +{ +add.f16x2 r2694, r2670, r2691; +} +{ +cvt.rn.f16.f64 rs405, fd887; +} +mov.b32 r2699, {rs405, rs405}; +{ +mul.f16x2 r2697, r88, r2699; +} +{ +add.f16x2 r2700, r2676, r2697; +} +{ +cvt.rn.f16.f64 rs406, fd888; +} +mov.b32 r2705, {rs406, rs406}; +{ +mul.f16x2 r2703, r91, r2705; +} +{ +add.f16x2 r2706, r2682, r2703; +} +{ +cvt.rn.f16.f64 rs407, fd847; +} +mov.b32 r2711, {rs407, rs407}; +{ +mul.f16x2 r2709, r97, r2711; +} +{ +add.f16x2 r2712, r2688, r2709; +} +{ +cvt.rn.f16.f64 rs408, fd768; +} +mov.b32 r2717, {rs408, rs408}; +{ +mul.f16x2 r2715, r106, r2717; +} +{ +add.f16x2 r2718, r2694, r2715; +} +{ +cvt.rn.f16.f64 rs409, fd847; +} +mov.b32 r2723, {rs409, rs409}; +{ +mul.f16x2 r2721, r100, r2723; +} +{ +add.f16x2 r2724, r2700, r2721; +} +{ +cvt.rn.f16.f64 rs410, fd768; +} +mov.b32 r2729, {rs410, rs410}; +{ +mul.f16x2 r2727, r103, r2729; +} +{ +add.f16x2 r2730, r2706, r2727; +} +{ +cvt.rn.f16.f64 rs411, fd899; +} +mov.b32 r2735, {rs411, rs411}; +{ +mul.f16x2 r2733, r109, r2735; +} +{ +add.f16x2 r2736, r2712, r2733; +} +{ +cvt.rn.f16.f64 rs412, fd900; +} +mov.b32 r2741, {rs412, rs412}; +{ +mul.f16x2 r2739, r118, r2741; +} +{ +add.f16x2 r2742, r2718, r2739; +} +{ +cvt.rn.f16.f64 rs413, fd899; +} +mov.b32 r2747, {rs413, rs413}; +{ +mul.f16x2 r2745, r112, r2747; +} +{ +add.f16x2 r2748, r2724, r2745; +} +{ 
+cvt.rn.f16.f64 rs414, fd900; +} +mov.b32 r2753, {rs414, rs414}; +{ +mul.f16x2 r2751, r115, r2753; +} +{ +add.f16x2 r2754, r2730, r2751; +} +{ +cvt.rn.f16.f64 rs415, fd843; +} +mov.b32 r2759, {rs415, rs415}; +{ +mul.f16x2 r2757, r121, r2759; +} +{ +add.f16x2 r2760, r2736, r2757; +} +{ +cvt.rn.f16.f64 rs416, fd844; +} +mov.b32 r2765, {rs416, rs416}; +{ +mul.f16x2 r2763, r130, r2765; +} +{ +add.f16x2 r2766, r2742, r2763; +} +{ +cvt.rn.f16.f64 rs417, fd843; +} +mov.b32 r2771, {rs417, rs417}; +{ +mul.f16x2 r2769, r124, r2771; +} +{ +add.f16x2 r2772, r2748, r2769; +} +{ +cvt.rn.f16.f64 rs418, fd844; +} +mov.b32 r2777, {rs418, rs418}; +{ +mul.f16x2 r2775, r127, r2777; +} +{ +add.f16x2 r2778, r2754, r2775; +} +{ +cvt.rn.f16.f64 rs419, fd891; +} +mov.b32 r2783, {rs419, rs419}; +{ +mul.f16x2 r2781, r133, r2783; +} +{ +add.f16x2 r2784, r2760, r2781; +} +{ +cvt.rn.f16.f64 rs420, fd804; +} +mov.b32 r2789, {rs420, rs420}; +{ +mul.f16x2 r2787, r142, r2789; +} +{ +add.f16x2 r2790, r2766, r2787; +} +{ +cvt.rn.f16.f64 rs421, fd891; +} +mov.b32 r2795, {rs421, rs421}; +{ +mul.f16x2 r2793, r136, r2795; +} +{ +add.f16x2 r2796, r2772, r2793; +} +{ +cvt.rn.f16.f64 rs422, fd804; +} +mov.b32 r2801, {rs422, rs422}; +{ +mul.f16x2 r2799, r139, r2801; +} +{ +add.f16x2 r2802, r2778, r2799; +} +{ +cvt.rn.f16.f64 rs423, fd855; +} +mov.b32 r2807, {rs423, rs423}; +{ +mul.f16x2 r2805, r145, r2807; +} +{ +add.f16x2 r2808, r2784, r2805; +} +{ +cvt.rn.f16.f64 rs424, fd856; +} +mov.b32 r2813, {rs424, rs424}; +{ +mul.f16x2 r2811, r154, r2813; +} +{ +add.f16x2 r2814, r2790, r2811; +} +{ +cvt.rn.f16.f64 rs425, fd855; +} +mov.b32 r2819, {rs425, rs425}; +{ +mul.f16x2 r2817, r148, r2819; +} +{ +add.f16x2 r2820, r2796, r2817; +} +{ +cvt.rn.f16.f64 rs426, fd856; +} +mov.b32 r2825, {rs426, rs426}; +{ +mul.f16x2 r2823, r151, r2825; +} +{ +add.f16x2 r2826, r2802, r2823; +} +{ +cvt.rn.f16.f64 rs427, fd879; +} +mov.b32 r2831, {rs427, rs427}; +{ +mul.f16x2 r2829, r157, r2831; +} +{ +add.f16x2 r2832, r2808, r2829; +} 
+{ +cvt.rn.f16.f64 rs428, fd808; +} +mov.b32 r2837, {rs428, rs428}; +{ +mul.f16x2 r2835, r166, r2837; +} +{ +add.f16x2 r2838, r2814, r2835; +} +{ +cvt.rn.f16.f64 rs429, fd879; +} +mov.b32 r2843, {rs429, rs429}; +{ +mul.f16x2 r2841, r160, r2843; +} +{ +add.f16x2 r2844, r2820, r2841; +} +{ +cvt.rn.f16.f64 rs430, fd808; +} +mov.b32 r2849, {rs430, rs430}; +{ +mul.f16x2 r2847, r163, r2849; +} +{ +add.f16x2 r2850, r2826, r2847; +} +{ +cvt.rn.f16.f64 rs431, fd867; +} +mov.b32 r2855, {rs431, rs431}; +{ +mul.f16x2 r2853, r169, r2855; +} +{ +add.f16x2 r2856, r2832, r2853; +} +{ +cvt.rn.f16.f64 rs432, fd868; +} +mov.b32 r2861, {rs432, rs432}; +{ +mul.f16x2 r2859, r178, r2861; +} +{ +add.f16x2 r2862, r2838, r2859; +} +{ +cvt.rn.f16.f64 rs433, fd867; +} +mov.b32 r2867, {rs433, rs433}; +{ +mul.f16x2 r2865, r172, r2867; +} +{ +add.f16x2 r2868, r2844, r2865; +} +{ +cvt.rn.f16.f64 rs434, fd868; +} +mov.b32 r2873, {rs434, rs434}; +{ +mul.f16x2 r2871, r175, r2873; +} +{ +add.f16x2 r2874, r2850, r2871; +} +{ +sub.f16x2 %14, r2856, r2862; +} +{ +add.f16x2 %15, r2868, r2874; +} +{ +add.f16x2 %48, r2856, r2862; +} +{ +sub.f16x2 %49, r2868, r2874; +} +cvt.rn.f16.s32 rs435, r5508; +mov.b32 r2901, {rs435, rs435}; +cvt.rn.f16.s32 rs436, r5508; +mov.b32 r2913, {rs436, rs436}; +{ +cvt.rn.f16.f64 rs437, fd899; +} +mov.b32 r2893, {rs437, rs437}; +{ +mul.f16x2 r2891, r1, r2893; +} +{ +add.f16x2 r2894, %62, r2891; +} +{ +cvt.rn.f16.f64 rs438, fd900; +} +mov.b32 r2899, {rs438, rs438}; +{ +mul.f16x2 r2897, r10, r2899; +} +{ +add.f16x2 r2900, r2901, r2897; +} +{ +cvt.rn.f16.f64 rs439, fd899; +} +mov.b32 r2905, {rs439, rs439}; +{ +mul.f16x2 r2903, r4, r2905; +} +{ +add.f16x2 r2906, %63, r2903; +} +{ +cvt.rn.f16.f64 rs440, fd900; +} +mov.b32 r2911, {rs440, rs440}; +{ +mul.f16x2 r2909, r7, r2911; +} +{ +add.f16x2 r2912, r2913, r2909; +} +{ +cvt.rn.f16.f64 rs441, fd843; +} +mov.b32 r2917, {rs441, rs441}; +{ +mul.f16x2 r2915, r13, r2917; +} +{ +add.f16x2 r2918, r2894, r2915; +} +{ +cvt.rn.f16.f64 rs442, 
fd820; +} +mov.b32 r2923, {rs442, rs442}; +{ +mul.f16x2 r2921, r22, r2923; +} +{ +add.f16x2 r2924, r2900, r2921; +} +{ +cvt.rn.f16.f64 rs443, fd843; +} +mov.b32 r2929, {rs443, rs443}; +{ +mul.f16x2 r2927, r16, r2929; +} +{ +add.f16x2 r2930, r2906, r2927; +} +{ +cvt.rn.f16.f64 rs444, fd820; +} +mov.b32 r2935, {rs444, rs444}; +{ +mul.f16x2 r2933, r19, r2935; +} +{ +add.f16x2 r2936, r2912, r2933; +} +{ +cvt.rn.f16.f64 rs445, fd895; +} +mov.b32 r2941, {rs445, rs445}; +{ +mul.f16x2 r2939, r25, r2941; +} +{ +add.f16x2 r2942, r2918, r2939; +} +{ +cvt.rn.f16.f64 rs446, fd896; +} +mov.b32 r2947, {rs446, rs446}; +{ +mul.f16x2 r2945, r34, r2947; +} +{ +add.f16x2 r2948, r2924, r2945; +} +{ +cvt.rn.f16.f64 rs447, fd895; +} +mov.b32 r2953, {rs447, rs447}; +{ +mul.f16x2 r2951, r28, r2953; +} +{ +add.f16x2 r2954, r2930, r2951; +} +{ +cvt.rn.f16.f64 rs448, fd896; +} +mov.b32 r2959, {rs448, rs448}; +{ +mul.f16x2 r2957, r31, r2959; +} +{ +add.f16x2 r2960, r2936, r2957; +} +{ +cvt.rn.f16.f64 rs449, fd847; +} +mov.b32 r2965, {rs449, rs449}; +{ +mul.f16x2 r2963, r37, r2965; +} +{ +add.f16x2 r2966, r2942, r2963; +} +{ +cvt.rn.f16.f64 rs450, fd768; +} +mov.b32 r2971, {rs450, rs450}; +{ +mul.f16x2 r2969, r46, r2971; +} +{ +add.f16x2 r2972, r2948, r2969; +} +{ +cvt.rn.f16.f64 rs451, fd847; +} +mov.b32 r2977, {rs451, rs451}; +{ +mul.f16x2 r2975, r40, r2977; +} +{ +add.f16x2 r2978, r2954, r2975; +} +{ +cvt.rn.f16.f64 rs452, fd768; +} +mov.b32 r2983, {rs452, rs452}; +{ +mul.f16x2 r2981, r43, r2983; +} +{ +add.f16x2 r2984, r2960, r2981; +} +{ +cvt.rn.f16.f64 rs453, fd891; +} +mov.b32 r2989, {rs453, rs453}; +{ +mul.f16x2 r2987, r49, r2989; +} +{ +add.f16x2 r2990, r2966, r2987; +} +{ +cvt.rn.f16.f64 rs454, fd892; +} +mov.b32 r2995, {rs454, rs454}; +{ +mul.f16x2 r2993, r58, r2995; +} +{ +add.f16x2 r2996, r2972, r2993; +} +{ +cvt.rn.f16.f64 rs455, fd891; +} +mov.b32 r3001, {rs455, rs455}; +{ +mul.f16x2 r2999, r52, r3001; +} +{ +add.f16x2 r3002, r2978, r2999; +} +{ +cvt.rn.f16.f64 rs456, fd892; +} 
+mov.b32 r3007, {rs456, rs456}; +{ +mul.f16x2 r3005, r55, r3007; +} +{ +add.f16x2 r3008, r2984, r3005; +} +{ +cvt.rn.f16.f64 rs457, fd851; +} +mov.b32 r3013, {rs457, rs457}; +{ +mul.f16x2 r3011, r61, r3013; +} +{ +add.f16x2 r3014, r2990, r3011; +} +{ +cvt.rn.f16.f64 rs458, fd676; +} +mov.b32 r3019, {rs458, rs458}; +{ +mul.f16x2 r3017, r70, r3019; +} +{ +add.f16x2 r3020, r2996, r3017; +} +{ +cvt.rn.f16.f64 rs459, fd851; +} +mov.b32 r3025, {rs459, rs459}; +{ +mul.f16x2 r3023, r64, r3025; +} +{ +add.f16x2 r3026, r3002, r3023; +} +{ +cvt.rn.f16.f64 rs460, fd676; +} +mov.b32 r3031, {rs460, rs460}; +{ +mul.f16x2 r3029, r67, r3031; +} +{ +add.f16x2 r3032, r3008, r3029; +} +{ +cvt.rn.f16.f64 rs461, fd887; +} +mov.b32 r3037, {rs461, rs461}; +{ +mul.f16x2 r3035, r73, r3037; +} +{ +add.f16x2 r3038, r3014, r3035; +} +{ +cvt.rn.f16.f64 rs462, fd888; +} +mov.b32 r3043, {rs462, rs462}; +{ +mul.f16x2 r3041, r82, r3043; +} +{ +add.f16x2 r3044, r3020, r3041; +} +{ +cvt.rn.f16.f64 rs463, fd887; +} +mov.b32 r3049, {rs463, rs463}; +{ +mul.f16x2 r3047, r76, r3049; +} +{ +add.f16x2 r3050, r3026, r3047; +} +{ +cvt.rn.f16.f64 rs464, fd888; +} +mov.b32 r3055, {rs464, rs464}; +{ +mul.f16x2 r3053, r79, r3055; +} +{ +add.f16x2 r3056, r3032, r3053; +} +{ +cvt.rn.f16.f64 rs465, fd855; +} +mov.b32 r3061, {rs465, rs465}; +{ +mul.f16x2 r3059, r85, r3061; +} +{ +add.f16x2 r3062, r3038, r3059; +} +{ +cvt.rn.f16.f64 rs466, fd816; +} +mov.b32 r3067, {rs466, rs466}; +{ +mul.f16x2 r3065, r94, r3067; +} +{ +add.f16x2 r3068, r3044, r3065; +} +{ +cvt.rn.f16.f64 rs467, fd855; +} +mov.b32 r3073, {rs467, rs467}; +{ +mul.f16x2 r3071, r88, r3073; +} +{ +add.f16x2 r3074, r3050, r3071; +} +{ +cvt.rn.f16.f64 rs468, fd816; +} +mov.b32 r3079, {rs468, rs468}; +{ +mul.f16x2 r3077, r91, r3079; +} +{ +add.f16x2 r3080, r3056, r3077; +} +{ +cvt.rn.f16.f64 rs469, fd883; +} +mov.b32 r3085, {rs469, rs469}; +{ +mul.f16x2 r3083, r97, r3085; +} +{ +add.f16x2 r3086, r3062, r3083; +} +{ +cvt.rn.f16.f64 rs470, fd884; +} +mov.b32 
r3091, {rs470, rs470}; +{ +mul.f16x2 r3089, r106, r3091; +} +{ +add.f16x2 r3092, r3068, r3089; +} +{ +cvt.rn.f16.f64 rs471, fd883; +} +mov.b32 r3097, {rs471, rs471}; +{ +mul.f16x2 r3095, r100, r3097; +} +{ +add.f16x2 r3098, r3074, r3095; +} +{ +cvt.rn.f16.f64 rs472, fd884; +} +mov.b32 r3103, {rs472, rs472}; +{ +mul.f16x2 r3101, r103, r3103; +} +{ +add.f16x2 r3104, r3080, r3101; +} +{ +cvt.rn.f16.f64 rs473, fd859; +} +mov.b32 r3109, {rs473, rs473}; +{ +mul.f16x2 r3107, r109, r3109; +} +{ +add.f16x2 r3110, r3086, r3107; +} +{ +cvt.rn.f16.f64 rs474, fd572; +} +mov.b32 r3115, {rs474, rs474}; +{ +mul.f16x2 r3113, r118, r3115; +} +{ +add.f16x2 r3116, r3092, r3113; +} +{ +cvt.rn.f16.f64 rs475, fd859; +} +mov.b32 r3121, {rs475, rs475}; +{ +mul.f16x2 r3119, r112, r3121; +} +{ +add.f16x2 r3122, r3098, r3119; +} +{ +cvt.rn.f16.f64 rs476, fd572; +} +mov.b32 r3127, {rs476, rs476}; +{ +mul.f16x2 r3125, r115, r3127; +} +{ +add.f16x2 r3128, r3104, r3125; +} +{ +cvt.rn.f16.f64 rs477, fd879; +} +mov.b32 r3133, {rs477, rs477}; +{ +mul.f16x2 r3131, r121, r3133; +} +{ +add.f16x2 r3134, r3110, r3131; +} +{ +cvt.rn.f16.f64 rs478, fd880; +} +mov.b32 r3139, {rs478, rs478}; +{ +mul.f16x2 r3137, r130, r3139; +} +{ +add.f16x2 r3140, r3116, r3137; +} +{ +cvt.rn.f16.f64 rs479, fd879; +} +mov.b32 r3145, {rs479, rs479}; +{ +mul.f16x2 r3143, r124, r3145; +} +{ +add.f16x2 r3146, r3122, r3143; +} +{ +cvt.rn.f16.f64 rs480, fd880; +} +mov.b32 r3151, {rs480, rs480}; +{ +mul.f16x2 r3149, r127, r3151; +} +{ +add.f16x2 r3152, r3128, r3149; +} +{ +cvt.rn.f16.f64 rs481, fd863; +} +mov.b32 r3157, {rs481, rs481}; +{ +mul.f16x2 r3155, r133, r3157; +} +{ +add.f16x2 r3158, r3134, r3155; +} +{ +cvt.rn.f16.f64 rs482, fd740; +} +mov.b32 r3163, {rs482, rs482}; +{ +mul.f16x2 r3161, r142, r3163; +} +{ +add.f16x2 r3164, r3140, r3161; +} +{ +cvt.rn.f16.f64 rs483, fd863; +} +mov.b32 r3169, {rs483, rs483}; +{ +mul.f16x2 r3167, r136, r3169; +} +{ +add.f16x2 r3170, r3146, r3167; +} +{ +cvt.rn.f16.f64 rs484, fd740; +} 
+mov.b32 r3175, {rs484, rs484}; +{ +mul.f16x2 r3173, r139, r3175; +} +{ +add.f16x2 r3176, r3152, r3173; +} +{ +cvt.rn.f16.f64 rs485, fd875; +} +mov.b32 r3181, {rs485, rs485}; +{ +mul.f16x2 r3179, r145, r3181; +} +{ +add.f16x2 r3182, r3158, r3179; +} +{ +cvt.rn.f16.f64 rs486, fd876; +} +mov.b32 r3187, {rs486, rs486}; +{ +mul.f16x2 r3185, r154, r3187; +} +{ +add.f16x2 r3188, r3164, r3185; +} +{ +cvt.rn.f16.f64 rs487, fd875; +} +mov.b32 r3193, {rs487, rs487}; +{ +mul.f16x2 r3191, r148, r3193; +} +{ +add.f16x2 r3194, r3170, r3191; +} +{ +cvt.rn.f16.f64 rs488, fd876; +} +mov.b32 r3199, {rs488, rs488}; +{ +mul.f16x2 r3197, r151, r3199; +} +{ +add.f16x2 r3200, r3176, r3197; +} +{ +cvt.rn.f16.f64 rs489, fd867; +} +mov.b32 r3205, {rs489, rs489}; +{ +mul.f16x2 r3203, r157, r3205; +} +{ +add.f16x2 r3206, r3182, r3203; +} +{ +cvt.rn.f16.f64 rs490, fd812; +} +mov.b32 r3211, {rs490, rs490}; +{ +mul.f16x2 r3209, r166, r3211; +} +{ +add.f16x2 r3212, r3188, r3209; +} +{ +cvt.rn.f16.f64 rs491, fd867; +} +mov.b32 r3217, {rs491, rs491}; +{ +mul.f16x2 r3215, r160, r3217; +} +{ +add.f16x2 r3218, r3194, r3215; +} +{ +cvt.rn.f16.f64 rs492, fd812; +} +mov.b32 r3223, {rs492, rs492}; +{ +mul.f16x2 r3221, r163, r3223; +} +{ +add.f16x2 r3224, r3200, r3221; +} +{ +cvt.rn.f16.f64 rs493, fd871; +} +mov.b32 r3229, {rs493, rs493}; +{ +mul.f16x2 r3227, r169, r3229; +} +{ +add.f16x2 r3230, r3206, r3227; +} +{ +cvt.rn.f16.f64 rs494, fd872; +} +mov.b32 r3235, {rs494, rs494}; +{ +mul.f16x2 r3233, r178, r3235; +} +{ +add.f16x2 r3236, r3212, r3233; +} +{ +cvt.rn.f16.f64 rs495, fd871; +} +mov.b32 r3241, {rs495, rs495}; +{ +mul.f16x2 r3239, r172, r3241; +} +{ +add.f16x2 r3242, r3218, r3239; +} +{ +cvt.rn.f16.f64 rs496, fd872; +} +mov.b32 r3247, {rs496, rs496}; +{ +mul.f16x2 r3245, r175, r3247; +} +{ +add.f16x2 r3248, r3224, r3245; +} +{ +sub.f16x2 %16, r3230, r3236; +} +{ +add.f16x2 %17, r3242, r3248; +} +{ +add.f16x2 %46, r3230, r3236; +} +{ +sub.f16x2 %47, r3242, r3248; +} +cvt.rn.f16.s32 rs497, r5508; 
+mov.b32 r3275, {rs497, rs497}; +cvt.rn.f16.s32 rs498, r5508; +mov.b32 r3287, {rs498, rs498}; +{ +cvt.rn.f16.f64 rs499, fd891; +} +mov.b32 r3267, {rs499, rs499}; +{ +mul.f16x2 r3265, r1, r3267; +} +{ +add.f16x2 r3268, %62, r3265; +} +{ +cvt.rn.f16.f64 rs500, fd892; +} +mov.b32 r3273, {rs500, rs500}; +{ +mul.f16x2 r3271, r10, r3273; +} +{ +add.f16x2 r3274, r3275, r3271; +} +{ +cvt.rn.f16.f64 rs501, fd891; +} +mov.b32 r3279, {rs501, rs501}; +{ +mul.f16x2 r3277, r4, r3279; +} +{ +add.f16x2 r3280, %63, r3277; +} +{ +cvt.rn.f16.f64 rs502, fd892; +} +mov.b32 r3285, {rs502, rs502}; +{ +mul.f16x2 r3283, r7, r3285; +} +{ +add.f16x2 r3286, r3287, r3283; +} +{ +cvt.rn.f16.f64 rs503, fd859; +} +mov.b32 r3291, {rs503, rs503}; +{ +mul.f16x2 r3289, r13, r3291; +} +{ +add.f16x2 r3292, r3268, r3289; +} +{ +cvt.rn.f16.f64 rs504, fd572; +} +mov.b32 r3297, {rs504, rs504}; +{ +mul.f16x2 r3295, r22, r3297; +} +{ +add.f16x2 r3298, r3274, r3295; +} +{ +cvt.rn.f16.f64 rs505, fd859; +} +mov.b32 r3303, {rs505, rs505}; +{ +mul.f16x2 r3301, r16, r3303; +} +{ +add.f16x2 r3304, r3280, r3301; +} +{ +cvt.rn.f16.f64 rs506, fd572; +} +mov.b32 r3309, {rs506, rs506}; +{ +mul.f16x2 r3307, r19, r3309; +} +{ +add.f16x2 r3310, r3286, r3307; +} +{ +cvt.rn.f16.f64 rs507, fd871; +} +mov.b32 r3315, {rs507, rs507}; +{ +mul.f16x2 r3313, r25, r3315; +} +{ +add.f16x2 r3316, r3292, r3313; +} +{ +cvt.rn.f16.f64 rs508, fd872; +} +mov.b32 r3321, {rs508, rs508}; +{ +mul.f16x2 r3319, r34, r3321; +} +{ +add.f16x2 r3322, r3298, r3319; +} +{ +cvt.rn.f16.f64 rs509, fd871; +} +mov.b32 r3327, {rs509, rs509}; +{ +mul.f16x2 r3325, r28, r3327; +} +{ +add.f16x2 r3328, r3304, r3325; +} +{ +cvt.rn.f16.f64 rs510, fd872; +} +mov.b32 r3333, {rs510, rs510}; +{ +mul.f16x2 r3331, r31, r3333; +} +{ +add.f16x2 r3334, r3310, r3331; +} +{ +cvt.rn.f16.f64 rs511, fd879; +} +mov.b32 r3339, {rs511, rs511}; +{ +mul.f16x2 r3337, r37, r3339; +} +{ +add.f16x2 r3340, r3316, r3337; +} +{ +cvt.rn.f16.f64 rs512, fd808; +} +mov.b32 r3345, {rs512, 
rs512}; +{ +mul.f16x2 r3343, r46, r3345; +} +{ +add.f16x2 r3346, r3322, r3343; +} +{ +cvt.rn.f16.f64 rs513, fd879; +} +mov.b32 r3351, {rs513, rs513}; +{ +mul.f16x2 r3349, r40, r3351; +} +{ +add.f16x2 r3352, r3328, r3349; +} +{ +cvt.rn.f16.f64 rs514, fd808; +} +mov.b32 r3357, {rs514, rs514}; +{ +mul.f16x2 r3355, r43, r3357; +} +{ +add.f16x2 r3358, r3334, r3355; +} +{ +cvt.rn.f16.f64 rs515, fd851; +} +mov.b32 r3363, {rs515, rs515}; +{ +mul.f16x2 r3361, r49, r3363; +} +{ +add.f16x2 r3364, r3340, r3361; +} +{ +cvt.rn.f16.f64 rs516, fd852; +} +mov.b32 r3369, {rs516, rs516}; +{ +mul.f16x2 r3367, r58, r3369; +} +{ +add.f16x2 r3370, r3346, r3367; +} +{ +cvt.rn.f16.f64 rs517, fd851; +} +mov.b32 r3375, {rs517, rs517}; +{ +mul.f16x2 r3373, r52, r3375; +} +{ +add.f16x2 r3376, r3352, r3373; +} +{ +cvt.rn.f16.f64 rs518, fd852; +} +mov.b32 r3381, {rs518, rs518}; +{ +mul.f16x2 r3379, r55, r3381; +} +{ +add.f16x2 r3382, r3358, r3379; +} +{ +cvt.rn.f16.f64 rs519, fd899; +} +mov.b32 r3387, {rs519, rs519}; +{ +mul.f16x2 r3385, r61, r3387; +} +{ +add.f16x2 r3388, r3364, r3385; +} +{ +cvt.rn.f16.f64 rs520, fd504; +} +mov.b32 r3393, {rs520, rs520}; +{ +mul.f16x2 r3391, r70, r3393; +} +{ +add.f16x2 r3394, r3370, r3391; +} +{ +cvt.rn.f16.f64 rs521, fd899; +} +mov.b32 r3399, {rs521, rs521}; +{ +mul.f16x2 r3397, r64, r3399; +} +{ +add.f16x2 r3400, r3376, r3397; +} +{ +cvt.rn.f16.f64 rs522, fd504; +} +mov.b32 r3405, {rs522, rs522}; +{ +mul.f16x2 r3403, r67, r3405; +} +{ +add.f16x2 r3406, r3382, r3403; +} +{ +cvt.rn.f16.f64 rs523, fd847; +} +mov.b32 r3411, {rs523, rs523}; +{ +mul.f16x2 r3409, r73, r3411; +} +{ +add.f16x2 r3412, r3388, r3409; +} +{ +cvt.rn.f16.f64 rs524, fd768; +} +mov.b32 r3417, {rs524, rs524}; +{ +mul.f16x2 r3415, r82, r3417; +} +{ +add.f16x2 r3418, r3394, r3415; +} +{ +cvt.rn.f16.f64 rs525, fd847; +} +mov.b32 r3423, {rs525, rs525}; +{ +mul.f16x2 r3421, r76, r3423; +} +{ +add.f16x2 r3424, r3400, r3421; +} +{ +cvt.rn.f16.f64 rs526, fd768; +} +mov.b32 r3429, {rs526, rs526}; +{ 
+mul.f16x2 r3427, r79, r3429; +} +{ +add.f16x2 r3430, r3406, r3427; +} +{ +cvt.rn.f16.f64 rs527, fd883; +} +mov.b32 r3435, {rs527, rs527}; +{ +mul.f16x2 r3433, r85, r3435; +} +{ +add.f16x2 r3436, r3412, r3433; +} +{ +cvt.rn.f16.f64 rs528, fd884; +} +mov.b32 r3441, {rs528, rs528}; +{ +mul.f16x2 r3439, r94, r3441; +} +{ +add.f16x2 r3442, r3418, r3439; +} +{ +cvt.rn.f16.f64 rs529, fd883; +} +mov.b32 r3447, {rs529, rs529}; +{ +mul.f16x2 r3445, r88, r3447; +} +{ +add.f16x2 r3448, r3424, r3445; +} +{ +cvt.rn.f16.f64 rs530, fd884; +} +mov.b32 r3453, {rs530, rs530}; +{ +mul.f16x2 r3451, r91, r3453; +} +{ +add.f16x2 r3454, r3430, r3451; +} +{ +cvt.rn.f16.f64 rs531, fd867; +} +mov.b32 r3459, {rs531, rs531}; +{ +mul.f16x2 r3457, r97, r3459; +} +{ +add.f16x2 r3460, r3436, r3457; +} +{ +cvt.rn.f16.f64 rs532, fd812; +} +mov.b32 r3465, {rs532, rs532}; +{ +mul.f16x2 r3463, r106, r3465; +} +{ +add.f16x2 r3466, r3442, r3463; +} +{ +cvt.rn.f16.f64 rs533, fd867; +} +mov.b32 r3471, {rs533, rs533}; +{ +mul.f16x2 r3469, r100, r3471; +} +{ +add.f16x2 r3472, r3448, r3469; +} +{ +cvt.rn.f16.f64 rs534, fd812; +} +mov.b32 r3477, {rs534, rs534}; +{ +mul.f16x2 r3475, r103, r3477; +} +{ +add.f16x2 r3478, r3454, r3475; +} +{ +cvt.rn.f16.f64 rs535, fd863; +} +mov.b32 r3483, {rs535, rs535}; +{ +mul.f16x2 r3481, r109, r3483; +} +{ +add.f16x2 r3484, r3460, r3481; +} +{ +cvt.rn.f16.f64 rs536, fd864; +} +mov.b32 r3489, {rs536, rs536}; +{ +mul.f16x2 r3487, r118, r3489; +} +{ +add.f16x2 r3490, r3466, r3487; +} +{ +cvt.rn.f16.f64 rs537, fd863; +} +mov.b32 r3495, {rs537, rs537}; +{ +mul.f16x2 r3493, r112, r3495; +} +{ +add.f16x2 r3496, r3472, r3493; +} +{ +cvt.rn.f16.f64 rs538, fd864; +} +mov.b32 r3501, {rs538, rs538}; +{ +mul.f16x2 r3499, r115, r3501; +} +{ +add.f16x2 r3502, r3478, r3499; +} +{ +cvt.rn.f16.f64 rs539, fd887; +} +mov.b32 r3507, {rs539, rs539}; +{ +mul.f16x2 r3505, r121, r3507; +} +{ +add.f16x2 r3508, r3484, r3505; +} +{ +cvt.rn.f16.f64 rs540, fd760; +} +mov.b32 r3513, {rs540, rs540}; +{ 
+mul.f16x2 r3511, r130, r3513; +} +{ +add.f16x2 r3514, r3490, r3511; +} +{ +cvt.rn.f16.f64 rs541, fd887; +} +mov.b32 r3519, {rs541, rs541}; +{ +mul.f16x2 r3517, r124, r3519; +} +{ +add.f16x2 r3520, r3496, r3517; +} +{ +cvt.rn.f16.f64 rs542, fd760; +} +mov.b32 r3525, {rs542, rs542}; +{ +mul.f16x2 r3523, r127, r3525; +} +{ +add.f16x2 r3526, r3502, r3523; +} +{ +cvt.rn.f16.f64 rs543, fd843; +} +mov.b32 r3531, {rs543, rs543}; +{ +mul.f16x2 r3529, r133, r3531; +} +{ +add.f16x2 r3532, r3508, r3529; +} +{ +cvt.rn.f16.f64 rs544, fd844; +} +mov.b32 r3537, {rs544, rs544}; +{ +mul.f16x2 r3535, r142, r3537; +} +{ +add.f16x2 r3538, r3514, r3535; +} +{ +cvt.rn.f16.f64 rs545, fd843; +} +mov.b32 r3543, {rs545, rs545}; +{ +mul.f16x2 r3541, r136, r3543; +} +{ +add.f16x2 r3544, r3520, r3541; +} +{ +cvt.rn.f16.f64 rs546, fd844; +} +mov.b32 r3549, {rs546, rs546}; +{ +mul.f16x2 r3547, r139, r3549; +} +{ +add.f16x2 r3550, r3526, r3547; +} +{ +cvt.rn.f16.f64 rs547, fd895; +} +mov.b32 r3555, {rs547, rs547}; +{ +mul.f16x2 r3553, r145, r3555; +} +{ +add.f16x2 r3556, r3532, r3553; +} +{ +cvt.rn.f16.f64 rs548, fd896; +} +mov.b32 r3561, {rs548, rs548}; +{ +mul.f16x2 r3559, r154, r3561; +} +{ +add.f16x2 r3562, r3538, r3559; +} +{ +cvt.rn.f16.f64 rs549, fd895; +} +mov.b32 r3567, {rs549, rs549}; +{ +mul.f16x2 r3565, r148, r3567; +} +{ +add.f16x2 r3568, r3544, r3565; +} +{ +cvt.rn.f16.f64 rs550, fd896; +} +mov.b32 r3573, {rs550, rs550}; +{ +mul.f16x2 r3571, r151, r3573; +} +{ +add.f16x2 r3574, r3550, r3571; +} +{ +cvt.rn.f16.f64 rs551, fd855; +} +mov.b32 r3579, {rs551, rs551}; +{ +mul.f16x2 r3577, r157, r3579; +} +{ +add.f16x2 r3580, r3556, r3577; +} +{ +cvt.rn.f16.f64 rs552, fd816; +} +mov.b32 r3585, {rs552, rs552}; +{ +mul.f16x2 r3583, r166, r3585; +} +{ +add.f16x2 r3586, r3562, r3583; +} +{ +cvt.rn.f16.f64 rs553, fd855; +} +mov.b32 r3591, {rs553, rs553}; +{ +mul.f16x2 r3589, r160, r3591; +} +{ +add.f16x2 r3592, r3568, r3589; +} +{ +cvt.rn.f16.f64 rs554, fd816; +} +mov.b32 r3597, {rs554, rs554}; 
+{ +mul.f16x2 r3595, r163, r3597; +} +{ +add.f16x2 r3598, r3574, r3595; +} +{ +cvt.rn.f16.f64 rs555, fd875; +} +mov.b32 r3603, {rs555, rs555}; +{ +mul.f16x2 r3601, r169, r3603; +} +{ +add.f16x2 r3604, r3580, r3601; +} +{ +cvt.rn.f16.f64 rs556, fd876; +} +mov.b32 r3609, {rs556, rs556}; +{ +mul.f16x2 r3607, r178, r3609; +} +{ +add.f16x2 r3610, r3586, r3607; +} +{ +cvt.rn.f16.f64 rs557, fd875; +} +mov.b32 r3615, {rs557, rs557}; +{ +mul.f16x2 r3613, r172, r3615; +} +{ +add.f16x2 r3616, r3592, r3613; +} +{ +cvt.rn.f16.f64 rs558, fd876; +} +mov.b32 r3621, {rs558, rs558}; +{ +mul.f16x2 r3619, r175, r3621; +} +{ +add.f16x2 r3622, r3598, r3619; +} +{ +sub.f16x2 %18, r3604, r3610; +} +{ +add.f16x2 %19, r3616, r3622; +} +{ +add.f16x2 %44, r3604, r3610; +} +{ +sub.f16x2 %45, r3616, r3622; +} +cvt.rn.f16.s32 rs559, r5508; +mov.b32 r3649, {rs559, rs559}; +cvt.rn.f16.s32 rs560, r5508; +mov.b32 r3661, {rs560, rs560}; +{ +cvt.rn.f16.f64 rs561, fd883; +} +mov.b32 r3641, {rs561, rs561}; +{ +mul.f16x2 r3639, r1, r3641; +} +{ +add.f16x2 r3642, %62, r3639; +} +{ +cvt.rn.f16.f64 rs562, fd884; +} +mov.b32 r3647, {rs562, rs562}; +{ +mul.f16x2 r3645, r10, r3647; +} +{ +add.f16x2 r3648, r3649, r3645; +} +{ +cvt.rn.f16.f64 rs563, fd883; +} +mov.b32 r3653, {rs563, rs563}; +{ +mul.f16x2 r3651, r4, r3653; +} +{ +add.f16x2 r3654, %63, r3651; +} +{ +cvt.rn.f16.f64 rs564, fd884; +} +mov.b32 r3659, {rs564, rs564}; +{ +mul.f16x2 r3657, r7, r3659; +} +{ +add.f16x2 r3660, r3661, r3657; +} +{ +cvt.rn.f16.f64 rs565, fd875; +} +mov.b32 r3665, {rs565, rs565}; +{ +mul.f16x2 r3663, r13, r3665; +} +{ +add.f16x2 r3666, r3642, r3663; +} +{ +cvt.rn.f16.f64 rs566, fd708; +} +mov.b32 r3671, {rs566, rs566}; +{ +mul.f16x2 r3669, r22, r3671; +} +{ +add.f16x2 r3672, r3648, r3669; +} +{ +cvt.rn.f16.f64 rs567, fd875; +} +mov.b32 r3677, {rs567, rs567}; +{ +mul.f16x2 r3675, r16, r3677; +} +{ +add.f16x2 r3678, r3654, r3675; +} +{ +cvt.rn.f16.f64 rs568, fd708; +} +mov.b32 r3683, {rs568, rs568}; +{ +mul.f16x2 r3681, r19, 
r3683; +} +{ +add.f16x2 r3684, r3660, r3681; +} +{ +cvt.rn.f16.f64 rs569, fd847; +} +mov.b32 r3689, {rs569, rs569}; +{ +mul.f16x2 r3687, r25, r3689; +} +{ +add.f16x2 r3690, r3666, r3687; +} +{ +cvt.rn.f16.f64 rs570, fd848; +} +mov.b32 r3695, {rs570, rs570}; +{ +mul.f16x2 r3693, r34, r3695; +} +{ +add.f16x2 r3696, r3672, r3693; +} +{ +cvt.rn.f16.f64 rs571, fd847; +} +mov.b32 r3701, {rs571, rs571}; +{ +mul.f16x2 r3699, r28, r3701; +} +{ +add.f16x2 r3702, r3678, r3699; +} +{ +cvt.rn.f16.f64 rs572, fd848; +} +mov.b32 r3707, {rs572, rs572}; +{ +mul.f16x2 r3705, r31, r3707; +} +{ +add.f16x2 r3708, r3684, r3705; +} +{ +cvt.rn.f16.f64 rs573, fd891; +} +mov.b32 r3713, {rs573, rs573}; +{ +mul.f16x2 r3711, r37, r3713; +} +{ +add.f16x2 r3714, r3690, r3711; +} +{ +cvt.rn.f16.f64 rs574, fd892; +} +mov.b32 r3719, {rs574, rs574}; +{ +mul.f16x2 r3717, r46, r3719; +} +{ +add.f16x2 r3720, r3696, r3717; +} +{ +cvt.rn.f16.f64 rs575, fd891; +} +mov.b32 r3725, {rs575, rs575}; +{ +mul.f16x2 r3723, r40, r3725; +} +{ +add.f16x2 r3726, r3702, r3723; +} +{ +cvt.rn.f16.f64 rs576, fd892; +} +mov.b32 r3731, {rs576, rs576}; +{ +mul.f16x2 r3729, r43, r3731; +} +{ +add.f16x2 r3732, r3708, r3729; +} +{ +cvt.rn.f16.f64 rs577, fd867; +} +mov.b32 r3737, {rs577, rs577}; +{ +mul.f16x2 r3735, r49, r3737; +} +{ +add.f16x2 r3738, r3714, r3735; +} +{ +cvt.rn.f16.f64 rs578, fd812; +} +mov.b32 r3743, {rs578, rs578}; +{ +mul.f16x2 r3741, r58, r3743; +} +{ +add.f16x2 r3744, r3720, r3741; +} +{ +cvt.rn.f16.f64 rs579, fd867; +} +mov.b32 r3749, {rs579, rs579}; +{ +mul.f16x2 r3747, r52, r3749; +} +{ +add.f16x2 r3750, r3726, r3747; +} +{ +cvt.rn.f16.f64 rs580, fd812; +} +mov.b32 r3755, {rs580, rs580}; +{ +mul.f16x2 r3753, r55, r3755; +} +{ +add.f16x2 r3756, r3732, r3753; +} +{ +cvt.rn.f16.f64 rs581, fd855; +} +mov.b32 r3761, {rs581, rs581}; +{ +mul.f16x2 r3759, r61, r3761; +} +{ +add.f16x2 r3762, r3738, r3759; +} +{ +cvt.rn.f16.f64 rs582, fd856; +} +mov.b32 r3767, {rs582, rs582}; +{ +mul.f16x2 r3765, r70, r3767; +} 
+{ +add.f16x2 r3768, r3744, r3765; +} +{ +cvt.rn.f16.f64 rs583, fd855; +} +mov.b32 r3773, {rs583, rs583}; +{ +mul.f16x2 r3771, r64, r3773; +} +{ +add.f16x2 r3774, r3750, r3771; +} +{ +cvt.rn.f16.f64 rs584, fd856; +} +mov.b32 r3779, {rs584, rs584}; +{ +mul.f16x2 r3777, r67, r3779; +} +{ +add.f16x2 r3780, r3756, r3777; +} +{ +cvt.rn.f16.f64 rs585, fd899; +} +mov.b32 r3785, {rs585, rs585}; +{ +mul.f16x2 r3783, r73, r3785; +} +{ +add.f16x2 r3786, r3762, r3783; +} +{ +cvt.rn.f16.f64 rs586, fd900; +} +mov.b32 r3791, {rs586, rs586}; +{ +mul.f16x2 r3789, r82, r3791; +} +{ +add.f16x2 r3792, r3768, r3789; +} +{ +cvt.rn.f16.f64 rs587, fd899; +} +mov.b32 r3797, {rs587, rs587}; +{ +mul.f16x2 r3795, r76, r3797; +} +{ +add.f16x2 r3798, r3774, r3795; +} +{ +cvt.rn.f16.f64 rs588, fd900; +} +mov.b32 r3803, {rs588, rs588}; +{ +mul.f16x2 r3801, r79, r3803; +} +{ +add.f16x2 r3804, r3780, r3801; +} +{ +cvt.rn.f16.f64 rs589, fd859; +} +mov.b32 r3809, {rs589, rs589}; +{ +mul.f16x2 r3807, r85, r3809; +} +{ +add.f16x2 r3810, r3786, r3807; +} +{ +cvt.rn.f16.f64 rs590, fd572; +} +mov.b32 r3815, {rs590, rs590}; +{ +mul.f16x2 r3813, r94, r3815; +} +{ +add.f16x2 r3816, r3792, r3813; +} +{ +cvt.rn.f16.f64 rs591, fd859; +} +mov.b32 r3821, {rs591, rs591}; +{ +mul.f16x2 r3819, r88, r3821; +} +{ +add.f16x2 r3822, r3798, r3819; +} +{ +cvt.rn.f16.f64 rs592, fd572; +} +mov.b32 r3827, {rs592, rs592}; +{ +mul.f16x2 r3825, r91, r3827; +} +{ +add.f16x2 r3828, r3804, r3825; +} +{ +cvt.rn.f16.f64 rs593, fd863; +} +mov.b32 r3833, {rs593, rs593}; +{ +mul.f16x2 r3831, r97, r3833; +} +{ +add.f16x2 r3834, r3810, r3831; +} +{ +cvt.rn.f16.f64 rs594, fd864; +} +mov.b32 r3839, {rs594, rs594}; +{ +mul.f16x2 r3837, r106, r3839; +} +{ +add.f16x2 r3840, r3816, r3837; +} +{ +cvt.rn.f16.f64 rs595, fd863; +} +mov.b32 r3845, {rs595, rs595}; +{ +mul.f16x2 r3843, r100, r3845; +} +{ +add.f16x2 r3846, r3822, r3843; +} +{ +cvt.rn.f16.f64 rs596, fd864; +} +mov.b32 r3851, {rs596, rs596}; +{ +mul.f16x2 r3849, r103, r3851; +} +{ 
+add.f16x2 r3852, r3828, r3849; +} +{ +cvt.rn.f16.f64 rs597, fd895; +} +mov.b32 r3857, {rs597, rs597}; +{ +mul.f16x2 r3855, r109, r3857; +} +{ +add.f16x2 r3858, r3834, r3855; +} +{ +cvt.rn.f16.f64 rs598, fd580; +} +mov.b32 r3863, {rs598, rs598}; +{ +mul.f16x2 r3861, r118, r3863; +} +{ +add.f16x2 r3864, r3840, r3861; +} +{ +cvt.rn.f16.f64 rs599, fd895; +} +mov.b32 r3869, {rs599, rs599}; +{ +mul.f16x2 r3867, r112, r3869; +} +{ +add.f16x2 r3870, r3846, r3867; +} +{ +cvt.rn.f16.f64 rs600, fd580; +} +mov.b32 r3875, {rs600, rs600}; +{ +mul.f16x2 r3873, r115, r3875; +} +{ +add.f16x2 r3876, r3852, r3873; +} +{ +cvt.rn.f16.f64 rs601, fd851; +} +mov.b32 r3881, {rs601, rs601}; +{ +mul.f16x2 r3879, r121, r3881; +} +{ +add.f16x2 r3882, r3858, r3879; +} +{ +cvt.rn.f16.f64 rs602, fd676; +} +mov.b32 r3887, {rs602, rs602}; +{ +mul.f16x2 r3885, r130, r3887; +} +{ +add.f16x2 r3888, r3864, r3885; +} +{ +cvt.rn.f16.f64 rs603, fd851; +} +mov.b32 r3893, {rs603, rs603}; +{ +mul.f16x2 r3891, r124, r3893; +} +{ +add.f16x2 r3894, r3870, r3891; +} +{ +cvt.rn.f16.f64 rs604, fd676; +} +mov.b32 r3899, {rs604, rs604}; +{ +mul.f16x2 r3897, r127, r3899; +} +{ +add.f16x2 r3900, r3876, r3897; +} +{ +cvt.rn.f16.f64 rs605, fd871; +} +mov.b32 r3905, {rs605, rs605}; +{ +mul.f16x2 r3903, r133, r3905; +} +{ +add.f16x2 r3906, r3882, r3903; +} +{ +cvt.rn.f16.f64 rs606, fd872; +} +mov.b32 r3911, {rs606, rs606}; +{ +mul.f16x2 r3909, r142, r3911; +} +{ +add.f16x2 r3912, r3888, r3909; +} +{ +cvt.rn.f16.f64 rs607, fd871; +} +mov.b32 r3917, {rs607, rs607}; +{ +mul.f16x2 r3915, r136, r3917; +} +{ +add.f16x2 r3918, r3894, r3915; +} +{ +cvt.rn.f16.f64 rs608, fd872; +} +mov.b32 r3923, {rs608, rs608}; +{ +mul.f16x2 r3921, r139, r3923; +} +{ +add.f16x2 r3924, r3900, r3921; +} +{ +cvt.rn.f16.f64 rs609, fd887; +} +mov.b32 r3929, {rs609, rs609}; +{ +mul.f16x2 r3927, r145, r3929; +} +{ +add.f16x2 r3930, r3906, r3927; +} +{ +cvt.rn.f16.f64 rs610, fd760; +} +mov.b32 r3935, {rs610, rs610}; +{ +mul.f16x2 r3933, r154, r3935; +} 
+{ +add.f16x2 r3936, r3912, r3933; +} +{ +cvt.rn.f16.f64 rs611, fd887; +} +mov.b32 r3941, {rs611, rs611}; +{ +mul.f16x2 r3939, r148, r3941; +} +{ +add.f16x2 r3942, r3918, r3939; +} +{ +cvt.rn.f16.f64 rs612, fd760; +} +mov.b32 r3947, {rs612, rs612}; +{ +mul.f16x2 r3945, r151, r3947; +} +{ +add.f16x2 r3948, r3924, r3945; +} +{ +cvt.rn.f16.f64 rs613, fd843; +} +mov.b32 r3953, {rs613, rs613}; +{ +mul.f16x2 r3951, r157, r3953; +} +{ +add.f16x2 r3954, r3930, r3951; +} +{ +cvt.rn.f16.f64 rs614, fd820; +} +mov.b32 r3959, {rs614, rs614}; +{ +mul.f16x2 r3957, r166, r3959; +} +{ +add.f16x2 r3960, r3936, r3957; +} +{ +cvt.rn.f16.f64 rs615, fd843; +} +mov.b32 r3965, {rs615, rs615}; +{ +mul.f16x2 r3963, r160, r3965; +} +{ +add.f16x2 r3966, r3942, r3963; +} +{ +cvt.rn.f16.f64 rs616, fd820; +} +mov.b32 r3971, {rs616, rs616}; +{ +mul.f16x2 r3969, r163, r3971; +} +{ +add.f16x2 r3972, r3948, r3969; +} +{ +cvt.rn.f16.f64 rs617, fd879; +} +mov.b32 r3977, {rs617, rs617}; +{ +mul.f16x2 r3975, r169, r3977; +} +{ +add.f16x2 r3978, r3954, r3975; +} +{ +cvt.rn.f16.f64 rs618, fd880; +} +mov.b32 r3983, {rs618, rs618}; +{ +mul.f16x2 r3981, r178, r3983; +} +{ +add.f16x2 r3984, r3960, r3981; +} +{ +cvt.rn.f16.f64 rs619, fd879; +} +mov.b32 r3989, {rs619, rs619}; +{ +mul.f16x2 r3987, r172, r3989; +} +{ +add.f16x2 r3990, r3966, r3987; +} +{ +cvt.rn.f16.f64 rs620, fd880; +} +mov.b32 r3995, {rs620, rs620}; +{ +mul.f16x2 r3993, r175, r3995; +} +{ +add.f16x2 r3996, r3972, r3993; +} +{ +sub.f16x2 %20, r3978, r3984; +} +{ +add.f16x2 %21, r3990, r3996; +} +{ +add.f16x2 %42, r3978, r3984; +} +{ +sub.f16x2 %43, r3990, r3996; +} +cvt.rn.f16.s32 rs621, r5508; +mov.b32 r4023, {rs621, rs621}; +cvt.rn.f16.s32 rs622, r5508; +mov.b32 r4035, {rs622, rs622}; +{ +cvt.rn.f16.f64 rs623, fd875; +} +mov.b32 r4015, {rs623, rs623}; +{ +mul.f16x2 r4013, r1, r4015; +} +{ +add.f16x2 r4016, %62, r4013; +} +{ +cvt.rn.f16.f64 rs624, fd876; +} +mov.b32 r4021, {rs624, rs624}; +{ +mul.f16x2 r4019, r10, r4021; +} +{ +add.f16x2 r4022, 
r4023, r4019; +} +{ +cvt.rn.f16.f64 rs625, fd875; +} +mov.b32 r4027, {rs625, rs625}; +{ +mul.f16x2 r4025, r4, r4027; +} +{ +add.f16x2 r4028, %63, r4025; +} +{ +cvt.rn.f16.f64 rs626, fd876; +} +mov.b32 r4033, {rs626, rs626}; +{ +mul.f16x2 r4031, r7, r4033; +} +{ +add.f16x2 r4034, r4035, r4031; +} +{ +cvt.rn.f16.f64 rs627, fd891; +} +mov.b32 r4039, {rs627, rs627}; +{ +mul.f16x2 r4037, r13, r4039; +} +{ +add.f16x2 r4040, r4016, r4037; +} +{ +cvt.rn.f16.f64 rs628, fd804; +} +mov.b32 r4045, {rs628, rs628}; +{ +mul.f16x2 r4043, r22, r4045; +} +{ +add.f16x2 r4046, r4022, r4043; +} +{ +cvt.rn.f16.f64 rs629, fd891; +} +mov.b32 r4051, {rs629, rs629}; +{ +mul.f16x2 r4049, r16, r4051; +} +{ +add.f16x2 r4052, r4028, r4049; +} +{ +cvt.rn.f16.f64 rs630, fd804; +} +mov.b32 r4057, {rs630, rs630}; +{ +mul.f16x2 r4055, r19, r4057; +} +{ +add.f16x2 r4058, r4034, r4055; +} +{ +cvt.rn.f16.f64 rs631, fd855; +} +mov.b32 r4063, {rs631, rs631}; +{ +mul.f16x2 r4061, r25, r4063; +} +{ +add.f16x2 r4064, r4040, r4061; +} +{ +cvt.rn.f16.f64 rs632, fd816; +} +mov.b32 r4069, {rs632, rs632}; +{ +mul.f16x2 r4067, r34, r4069; +} +{ +add.f16x2 r4070, r4046, r4067; +} +{ +cvt.rn.f16.f64 rs633, fd855; +} +mov.b32 r4075, {rs633, rs633}; +{ +mul.f16x2 r4073, r28, r4075; +} +{ +add.f16x2 r4076, r4052, r4073; +} +{ +cvt.rn.f16.f64 rs634, fd816; +} +mov.b32 r4081, {rs634, rs634}; +{ +mul.f16x2 r4079, r31, r4081; +} +{ +add.f16x2 r4082, r4058, r4079; +} +{ +cvt.rn.f16.f64 rs635, fd859; +} +mov.b32 r4087, {rs635, rs635}; +{ +mul.f16x2 r4085, r37, r4087; +} +{ +add.f16x2 r4088, r4064, r4085; +} +{ +cvt.rn.f16.f64 rs636, fd860; +} +mov.b32 r4093, {rs636, rs636}; +{ +mul.f16x2 r4091, r46, r4093; +} +{ +add.f16x2 r4094, r4070, r4091; +} +{ +cvt.rn.f16.f64 rs637, fd859; +} +mov.b32 r4099, {rs637, rs637}; +{ +mul.f16x2 r4097, r40, r4099; +} +{ +add.f16x2 r4100, r4076, r4097; +} +{ +cvt.rn.f16.f64 rs638, fd860; +} +mov.b32 r4105, {rs638, rs638}; +{ +mul.f16x2 r4103, r43, r4105; +} +{ +add.f16x2 r4106, r4082, r4103; 
+} +{ +cvt.rn.f16.f64 rs639, fd895; +} +mov.b32 r4111, {rs639, rs639}; +{ +mul.f16x2 r4109, r49, r4111; +} +{ +add.f16x2 r4112, r4088, r4109; +} +{ +cvt.rn.f16.f64 rs640, fd896; +} +mov.b32 r4117, {rs640, rs640}; +{ +mul.f16x2 r4115, r58, r4117; +} +{ +add.f16x2 r4118, r4094, r4115; +} +{ +cvt.rn.f16.f64 rs641, fd895; +} +mov.b32 r4123, {rs641, rs641}; +{ +mul.f16x2 r4121, r52, r4123; +} +{ +add.f16x2 r4124, r4100, r4121; +} +{ +cvt.rn.f16.f64 rs642, fd896; +} +mov.b32 r4129, {rs642, rs642}; +{ +mul.f16x2 r4127, r55, r4129; +} +{ +add.f16x2 r4130, r4106, r4127; +} +{ +cvt.rn.f16.f64 rs643, fd871; +} +mov.b32 r4135, {rs643, rs643}; +{ +mul.f16x2 r4133, r61, r4135; +} +{ +add.f16x2 r4136, r4112, r4133; +} +{ +cvt.rn.f16.f64 rs644, fd624; +} +mov.b32 r4141, {rs644, rs644}; +{ +mul.f16x2 r4139, r70, r4141; +} +{ +add.f16x2 r4142, r4118, r4139; +} +{ +cvt.rn.f16.f64 rs645, fd871; +} +mov.b32 r4147, {rs645, rs645}; +{ +mul.f16x2 r4145, r64, r4147; +} +{ +add.f16x2 r4148, r4124, r4145; +} +{ +cvt.rn.f16.f64 rs646, fd624; +} +mov.b32 r4153, {rs646, rs646}; +{ +mul.f16x2 r4151, r67, r4153; +} +{ +add.f16x2 r4154, r4130, r4151; +} +{ +cvt.rn.f16.f64 rs647, fd843; +} +mov.b32 r4159, {rs647, rs647}; +{ +mul.f16x2 r4157, r73, r4159; +} +{ +add.f16x2 r4160, r4136, r4157; +} +{ +cvt.rn.f16.f64 rs648, fd844; +} +mov.b32 r4165, {rs648, rs648}; +{ +mul.f16x2 r4163, r82, r4165; +} +{ +add.f16x2 r4166, r4142, r4163; +} +{ +cvt.rn.f16.f64 rs649, fd843; +} +mov.b32 r4171, {rs649, rs649}; +{ +mul.f16x2 r4169, r76, r4171; +} +{ +add.f16x2 r4172, r4148, r4169; +} +{ +cvt.rn.f16.f64 rs650, fd844; +} +mov.b32 r4177, {rs650, rs650}; +{ +mul.f16x2 r4175, r79, r4177; +} +{ +add.f16x2 r4178, r4154, r4175; +} +{ +cvt.rn.f16.f64 rs651, fd879; +} +mov.b32 r4183, {rs651, rs651}; +{ +mul.f16x2 r4181, r85, r4183; +} +{ +add.f16x2 r4184, r4160, r4181; +} +{ +cvt.rn.f16.f64 rs652, fd880; +} +mov.b32 r4189, {rs652, rs652}; +{ +mul.f16x2 r4187, r94, r4189; +} +{ +add.f16x2 r4190, r4166, r4187; +} +{ 
+cvt.rn.f16.f64 rs653, fd879; +} +mov.b32 r4195, {rs653, rs653}; +{ +mul.f16x2 r4193, r88, r4195; +} +{ +add.f16x2 r4196, r4172, r4193; +} +{ +cvt.rn.f16.f64 rs654, fd880; +} +mov.b32 r4201, {rs654, rs654}; +{ +mul.f16x2 r4199, r91, r4201; +} +{ +add.f16x2 r4202, r4178, r4199; +} +{ +cvt.rn.f16.f64 rs655, fd887; +} +mov.b32 r4207, {rs655, rs655}; +{ +mul.f16x2 r4205, r97, r4207; +} +{ +add.f16x2 r4208, r4184, r4205; +} +{ +cvt.rn.f16.f64 rs656, fd760; +} +mov.b32 r4213, {rs656, rs656}; +{ +mul.f16x2 r4211, r106, r4213; +} +{ +add.f16x2 r4214, r4190, r4211; +} +{ +cvt.rn.f16.f64 rs657, fd887; +} +mov.b32 r4219, {rs657, rs657}; +{ +mul.f16x2 r4217, r100, r4219; +} +{ +add.f16x2 r4220, r4196, r4217; +} +{ +cvt.rn.f16.f64 rs658, fd760; +} +mov.b32 r4225, {rs658, rs658}; +{ +mul.f16x2 r4223, r103, r4225; +} +{ +add.f16x2 r4226, r4202, r4223; +} +{ +cvt.rn.f16.f64 rs659, fd851; +} +mov.b32 r4231, {rs659, rs659}; +{ +mul.f16x2 r4229, r109, r4231; +} +{ +add.f16x2 r4232, r4208, r4229; +} +{ +cvt.rn.f16.f64 rs660, fd676; +} +mov.b32 r4237, {rs660, rs660}; +{ +mul.f16x2 r4235, r118, r4237; +} +{ +add.f16x2 r4238, r4214, r4235; +} +{ +cvt.rn.f16.f64 rs661, fd851; +} +mov.b32 r4243, {rs661, rs661}; +{ +mul.f16x2 r4241, r112, r4243; +} +{ +add.f16x2 r4244, r4220, r4241; +} +{ +cvt.rn.f16.f64 rs662, fd676; +} +mov.b32 r4249, {rs662, rs662}; +{ +mul.f16x2 r4247, r115, r4249; +} +{ +add.f16x2 r4250, r4226, r4247; +} +{ +cvt.rn.f16.f64 rs663, fd863; +} +mov.b32 r4255, {rs663, rs663}; +{ +mul.f16x2 r4253, r121, r4255; +} +{ +add.f16x2 r4256, r4232, r4253; +} +{ +cvt.rn.f16.f64 rs664, fd864; +} +mov.b32 r4261, {rs664, rs664}; +{ +mul.f16x2 r4259, r130, r4261; +} +{ +add.f16x2 r4262, r4238, r4259; +} +{ +cvt.rn.f16.f64 rs665, fd863; +} +mov.b32 r4267, {rs665, rs665}; +{ +mul.f16x2 r4265, r124, r4267; +} +{ +add.f16x2 r4268, r4244, r4265; +} +{ +cvt.rn.f16.f64 rs666, fd864; +} +mov.b32 r4273, {rs666, rs666}; +{ +mul.f16x2 r4271, r127, r4273; +} +{ +add.f16x2 r4274, r4250, r4271; +} +{ 
+cvt.rn.f16.f64 rs667, fd899; +} +mov.b32 r4279, {rs667, rs667}; +{ +mul.f16x2 r4277, r133, r4279; +} +{ +add.f16x2 r4280, r4256, r4277; +} +{ +cvt.rn.f16.f64 rs668, fd900; +} +mov.b32 r4285, {rs668, rs668}; +{ +mul.f16x2 r4283, r142, r4285; +} +{ +add.f16x2 r4286, r4262, r4283; +} +{ +cvt.rn.f16.f64 rs669, fd899; +} +mov.b32 r4291, {rs669, rs669}; +{ +mul.f16x2 r4289, r136, r4291; +} +{ +add.f16x2 r4292, r4268, r4289; +} +{ +cvt.rn.f16.f64 rs670, fd900; +} +mov.b32 r4297, {rs670, rs670}; +{ +mul.f16x2 r4295, r139, r4297; +} +{ +add.f16x2 r4298, r4274, r4295; +} +{ +cvt.rn.f16.f64 rs671, fd867; +} +mov.b32 r4303, {rs671, rs671}; +{ +mul.f16x2 r4301, r145, r4303; +} +{ +add.f16x2 r4304, r4280, r4301; +} +{ +cvt.rn.f16.f64 rs672, fd812; +} +mov.b32 r4309, {rs672, rs672}; +{ +mul.f16x2 r4307, r154, r4309; +} +{ +add.f16x2 r4310, r4286, r4307; +} +{ +cvt.rn.f16.f64 rs673, fd867; +} +mov.b32 r4315, {rs673, rs673}; +{ +mul.f16x2 r4313, r148, r4315; +} +{ +add.f16x2 r4316, r4292, r4313; +} +{ +cvt.rn.f16.f64 rs674, fd812; +} +mov.b32 r4321, {rs674, rs674}; +{ +mul.f16x2 r4319, r151, r4321; +} +{ +add.f16x2 r4322, r4298, r4319; +} +{ +cvt.rn.f16.f64 rs675, fd847; +} +mov.b32 r4327, {rs675, rs675}; +{ +mul.f16x2 r4325, r157, r4327; +} +{ +add.f16x2 r4328, r4304, r4325; +} +{ +cvt.rn.f16.f64 rs676, fd848; +} +mov.b32 r4333, {rs676, rs676}; +{ +mul.f16x2 r4331, r166, r4333; +} +{ +add.f16x2 r4334, r4310, r4331; +} +{ +cvt.rn.f16.f64 rs677, fd847; +} +mov.b32 r4339, {rs677, rs677}; +{ +mul.f16x2 r4337, r160, r4339; +} +{ +add.f16x2 r4340, r4316, r4337; +} +{ +cvt.rn.f16.f64 rs678, fd848; +} +mov.b32 r4345, {rs678, rs678}; +{ +mul.f16x2 r4343, r163, r4345; +} +{ +add.f16x2 r4346, r4322, r4343; +} +{ +cvt.rn.f16.f64 rs679, fd883; +} +mov.b32 r4351, {rs679, rs679}; +{ +mul.f16x2 r4349, r169, r4351; +} +{ +add.f16x2 r4352, r4328, r4349; +} +{ +cvt.rn.f16.f64 rs680, fd884; +} +mov.b32 r4357, {rs680, rs680}; +{ +mul.f16x2 r4355, r178, r4357; +} +{ +add.f16x2 r4358, r4334, r4355; +} 
+{ +cvt.rn.f16.f64 rs681, fd883; +} +mov.b32 r4363, {rs681, rs681}; +{ +mul.f16x2 r4361, r172, r4363; +} +{ +add.f16x2 r4364, r4340, r4361; +} +{ +cvt.rn.f16.f64 rs682, fd884; +} +mov.b32 r4369, {rs682, rs682}; +{ +mul.f16x2 r4367, r175, r4369; +} +{ +add.f16x2 r4370, r4346, r4367; +} +{ +sub.f16x2 %22, r4352, r4358; +} +{ +add.f16x2 %23, r4364, r4370; +} +{ +add.f16x2 %40, r4352, r4358; +} +{ +sub.f16x2 %41, r4364, r4370; +} +cvt.rn.f16.s32 rs683, r5508; +mov.b32 r4397, {rs683, rs683}; +cvt.rn.f16.s32 rs684, r5508; +mov.b32 r4409, {rs684, rs684}; +{ +cvt.rn.f16.f64 rs685, fd867; +} +mov.b32 r4389, {rs685, rs685}; +{ +mul.f16x2 r4387, r1, r4389; +} +{ +add.f16x2 r4390, %62, r4387; +} +{ +cvt.rn.f16.f64 rs686, fd868; +} +mov.b32 r4395, {rs686, rs686}; +{ +mul.f16x2 r4393, r10, r4395; +} +{ +add.f16x2 r4396, r4397, r4393; +} +{ +cvt.rn.f16.f64 rs687, fd867; +} +mov.b32 r4401, {rs687, rs687}; +{ +mul.f16x2 r4399, r4, r4401; +} +{ +add.f16x2 r4402, %63, r4399; +} +{ +cvt.rn.f16.f64 rs688, fd868; +} +mov.b32 r4407, {rs688, rs688}; +{ +mul.f16x2 r4405, r7, r4407; +} +{ +add.f16x2 r4408, r4409, r4405; +} +{ +cvt.rn.f16.f64 rs689, fd895; +} +mov.b32 r4413, {rs689, rs689}; +{ +mul.f16x2 r4411, r13, r4413; +} +{ +add.f16x2 r4414, r4390, r4411; +} +{ +cvt.rn.f16.f64 rs690, fd896; +} +mov.b32 r4419, {rs690, rs690}; +{ +mul.f16x2 r4417, r22, r4419; +} +{ +add.f16x2 r4420, r4396, r4417; +} +{ +cvt.rn.f16.f64 rs691, fd895; +} +mov.b32 r4425, {rs691, rs691}; +{ +mul.f16x2 r4423, r16, r4425; +} +{ +add.f16x2 r4426, r4402, r4423; +} +{ +cvt.rn.f16.f64 rs692, fd896; +} +mov.b32 r4431, {rs692, rs692}; +{ +mul.f16x2 r4429, r19, r4431; +} +{ +add.f16x2 r4432, r4408, r4429; +} +{ +cvt.rn.f16.f64 rs693, fd879; +} +mov.b32 r4437, {rs693, rs693}; +{ +mul.f16x2 r4435, r25, r4437; +} +{ +add.f16x2 r4438, r4414, r4435; +} +{ +cvt.rn.f16.f64 rs694, fd808; +} +mov.b32 r4443, {rs694, rs694}; +{ +mul.f16x2 r4441, r34, r4443; +} +{ +add.f16x2 r4444, r4420, r4441; +} +{ +cvt.rn.f16.f64 rs695, fd879; 
+} +mov.b32 r4449, {rs695, rs695}; +{ +mul.f16x2 r4447, r28, r4449; +} +{ +add.f16x2 r4450, r4426, r4447; +} +{ +cvt.rn.f16.f64 rs696, fd808; +} +mov.b32 r4455, {rs696, rs696}; +{ +mul.f16x2 r4453, r31, r4455; +} +{ +add.f16x2 r4456, r4432, r4453; +} +{ +cvt.rn.f16.f64 rs697, fd851; +} +mov.b32 r4461, {rs697, rs697}; +{ +mul.f16x2 r4459, r37, r4461; +} +{ +add.f16x2 r4462, r4438, r4459; +} +{ +cvt.rn.f16.f64 rs698, fd676; +} +mov.b32 r4467, {rs698, rs698}; +{ +mul.f16x2 r4465, r46, r4467; +} +{ +add.f16x2 r4468, r4444, r4465; +} +{ +cvt.rn.f16.f64 rs699, fd851; +} +mov.b32 r4473, {rs699, rs699}; +{ +mul.f16x2 r4471, r40, r4473; +} +{ +add.f16x2 r4474, r4450, r4471; +} +{ +cvt.rn.f16.f64 rs700, fd676; +} +mov.b32 r4479, {rs700, rs700}; +{ +mul.f16x2 r4477, r43, r4479; +} +{ +add.f16x2 r4480, r4456, r4477; +} +{ +cvt.rn.f16.f64 rs701, fd855; +} +mov.b32 r4485, {rs701, rs701}; +{ +mul.f16x2 r4483, r49, r4485; +} +{ +add.f16x2 r4486, r4462, r4483; +} +{ +cvt.rn.f16.f64 rs702, fd856; +} +mov.b32 r4491, {rs702, rs702}; +{ +mul.f16x2 r4489, r58, r4491; +} +{ +add.f16x2 r4492, r4468, r4489; +} +{ +cvt.rn.f16.f64 rs703, fd855; +} +mov.b32 r4497, {rs703, rs703}; +{ +mul.f16x2 r4495, r52, r4497; +} +{ +add.f16x2 r4498, r4474, r4495; +} +{ +cvt.rn.f16.f64 rs704, fd856; +} +mov.b32 r4503, {rs704, rs704}; +{ +mul.f16x2 r4501, r55, r4503; +} +{ +add.f16x2 r4504, r4480, r4501; +} +{ +cvt.rn.f16.f64 rs705, fd883; +} +mov.b32 r4509, {rs705, rs705}; +{ +mul.f16x2 r4507, r61, r4509; +} +{ +add.f16x2 r4510, r4486, r4507; +} +{ +cvt.rn.f16.f64 rs706, fd884; +} +mov.b32 r4515, {rs706, rs706}; +{ +mul.f16x2 r4513, r70, r4515; +} +{ +add.f16x2 r4516, r4492, r4513; +} +{ +cvt.rn.f16.f64 rs707, fd883; +} +mov.b32 r4521, {rs707, rs707}; +{ +mul.f16x2 r4519, r64, r4521; +} +{ +add.f16x2 r4522, r4498, r4519; +} +{ +cvt.rn.f16.f64 rs708, fd884; +} +mov.b32 r4527, {rs708, rs708}; +{ +mul.f16x2 r4525, r67, r4527; +} +{ +add.f16x2 r4528, r4504, r4525; +} +{ +cvt.rn.f16.f64 rs709, fd891; +} +mov.b32 
r4533, {rs709, rs709}; +{ +mul.f16x2 r4531, r73, r4533; +} +{ +add.f16x2 r4534, r4510, r4531; +} +{ +cvt.rn.f16.f64 rs710, fd804; +} +mov.b32 r4539, {rs710, rs710}; +{ +mul.f16x2 r4537, r82, r4539; +} +{ +add.f16x2 r4540, r4516, r4537; +} +{ +cvt.rn.f16.f64 rs711, fd891; +} +mov.b32 r4545, {rs711, rs711}; +{ +mul.f16x2 r4543, r76, r4545; +} +{ +add.f16x2 r4546, r4522, r4543; +} +{ +cvt.rn.f16.f64 rs712, fd804; +} +mov.b32 r4551, {rs712, rs712}; +{ +mul.f16x2 r4549, r79, r4551; +} +{ +add.f16x2 r4552, r4528, r4549; +} +{ +cvt.rn.f16.f64 rs713, fd863; +} +mov.b32 r4557, {rs713, rs713}; +{ +mul.f16x2 r4555, r85, r4557; +} +{ +add.f16x2 r4558, r4534, r4555; +} +{ +cvt.rn.f16.f64 rs714, fd740; +} +mov.b32 r4563, {rs714, rs714}; +{ +mul.f16x2 r4561, r94, r4563; +} +{ +add.f16x2 r4564, r4540, r4561; +} +{ +cvt.rn.f16.f64 rs715, fd863; +} +mov.b32 r4569, {rs715, rs715}; +{ +mul.f16x2 r4567, r88, r4569; +} +{ +add.f16x2 r4570, r4546, r4567; +} +{ +cvt.rn.f16.f64 rs716, fd740; +} +mov.b32 r4575, {rs716, rs716}; +{ +mul.f16x2 r4573, r91, r4575; +} +{ +add.f16x2 r4576, r4552, r4573; +} +{ +cvt.rn.f16.f64 rs717, fd843; +} +mov.b32 r4581, {rs717, rs717}; +{ +mul.f16x2 r4579, r97, r4581; +} +{ +add.f16x2 r4582, r4558, r4579; +} +{ +cvt.rn.f16.f64 rs718, fd844; +} +mov.b32 r4587, {rs718, rs718}; +{ +mul.f16x2 r4585, r106, r4587; +} +{ +add.f16x2 r4588, r4564, r4585; +} +{ +cvt.rn.f16.f64 rs719, fd843; +} +mov.b32 r4593, {rs719, rs719}; +{ +mul.f16x2 r4591, r100, r4593; +} +{ +add.f16x2 r4594, r4570, r4591; +} +{ +cvt.rn.f16.f64 rs720, fd844; +} +mov.b32 r4599, {rs720, rs720}; +{ +mul.f16x2 r4597, r103, r4599; +} +{ +add.f16x2 r4600, r4576, r4597; +} +{ +cvt.rn.f16.f64 rs721, fd871; +} +mov.b32 r4605, {rs721, rs721}; +{ +mul.f16x2 r4603, r109, r4605; +} +{ +add.f16x2 r4606, r4582, r4603; +} +{ +cvt.rn.f16.f64 rs722, fd872; +} +mov.b32 r4611, {rs722, rs722}; +{ +mul.f16x2 r4609, r118, r4611; +} +{ +add.f16x2 r4612, r4588, r4609; +} +{ +cvt.rn.f16.f64 rs723, fd871; +} +mov.b32 r4617, 
{rs723, rs723}; +{ +mul.f16x2 r4615, r112, r4617; +} +{ +add.f16x2 r4618, r4594, r4615; +} +{ +cvt.rn.f16.f64 rs724, fd872; +} +mov.b32 r4623, {rs724, rs724}; +{ +mul.f16x2 r4621, r115, r4623; +} +{ +add.f16x2 r4624, r4600, r4621; +} +{ +cvt.rn.f16.f64 rs725, fd899; +} +mov.b32 r4629, {rs725, rs725}; +{ +mul.f16x2 r4627, r121, r4629; +} +{ +add.f16x2 r4630, r4606, r4627; +} +{ +cvt.rn.f16.f64 rs726, fd900; +} +mov.b32 r4635, {rs726, rs726}; +{ +mul.f16x2 r4633, r130, r4635; +} +{ +add.f16x2 r4636, r4612, r4633; +} +{ +cvt.rn.f16.f64 rs727, fd899; +} +mov.b32 r4641, {rs727, rs727}; +{ +mul.f16x2 r4639, r124, r4641; +} +{ +add.f16x2 r4642, r4618, r4639; +} +{ +cvt.rn.f16.f64 rs728, fd900; +} +mov.b32 r4647, {rs728, rs728}; +{ +mul.f16x2 r4645, r127, r4647; +} +{ +add.f16x2 r4648, r4624, r4645; +} +{ +cvt.rn.f16.f64 rs729, fd875; +} +mov.b32 r4653, {rs729, rs729}; +{ +mul.f16x2 r4651, r133, r4653; +} +{ +add.f16x2 r4654, r4630, r4651; +} +{ +cvt.rn.f16.f64 rs730, fd708; +} +mov.b32 r4659, {rs730, rs730}; +{ +mul.f16x2 r4657, r142, r4659; +} +{ +add.f16x2 r4660, r4636, r4657; +} +{ +cvt.rn.f16.f64 rs731, fd875; +} +mov.b32 r4665, {rs731, rs731}; +{ +mul.f16x2 r4663, r136, r4665; +} +{ +add.f16x2 r4666, r4642, r4663; +} +{ +cvt.rn.f16.f64 rs732, fd708; +} +mov.b32 r4671, {rs732, rs732}; +{ +mul.f16x2 r4669, r139, r4671; +} +{ +add.f16x2 r4672, r4648, r4669; +} +{ +cvt.rn.f16.f64 rs733, fd847; +} +mov.b32 r4677, {rs733, rs733}; +{ +mul.f16x2 r4675, r145, r4677; +} +{ +add.f16x2 r4678, r4654, r4675; +} +{ +cvt.rn.f16.f64 rs734, fd768; +} +mov.b32 r4683, {rs734, rs734}; +{ +mul.f16x2 r4681, r154, r4683; +} +{ +add.f16x2 r4684, r4660, r4681; +} +{ +cvt.rn.f16.f64 rs735, fd847; +} +mov.b32 r4689, {rs735, rs735}; +{ +mul.f16x2 r4687, r148, r4689; +} +{ +add.f16x2 r4690, r4666, r4687; +} +{ +cvt.rn.f16.f64 rs736, fd768; +} +mov.b32 r4695, {rs736, rs736}; +{ +mul.f16x2 r4693, r151, r4695; +} +{ +add.f16x2 r4696, r4672, r4693; +} +{ +cvt.rn.f16.f64 rs737, fd859; +} +mov.b32 
r4701, {rs737, rs737}; +{ +mul.f16x2 r4699, r157, r4701; +} +{ +add.f16x2 r4702, r4678, r4699; +} +{ +cvt.rn.f16.f64 rs738, fd860; +} +mov.b32 r4707, {rs738, rs738}; +{ +mul.f16x2 r4705, r166, r4707; +} +{ +add.f16x2 r4708, r4684, r4705; +} +{ +cvt.rn.f16.f64 rs739, fd859; +} +mov.b32 r4713, {rs739, rs739}; +{ +mul.f16x2 r4711, r160, r4713; +} +{ +add.f16x2 r4714, r4690, r4711; +} +{ +cvt.rn.f16.f64 rs740, fd860; +} +mov.b32 r4719, {rs740, rs740}; +{ +mul.f16x2 r4717, r163, r4719; +} +{ +add.f16x2 r4720, r4696, r4717; +} +{ +cvt.rn.f16.f64 rs741, fd887; +} +mov.b32 r4725, {rs741, rs741}; +{ +mul.f16x2 r4723, r169, r4725; +} +{ +add.f16x2 r4726, r4702, r4723; +} +{ +cvt.rn.f16.f64 rs742, fd888; +} +mov.b32 r4731, {rs742, rs742}; +{ +mul.f16x2 r4729, r178, r4731; +} +{ +add.f16x2 r4732, r4708, r4729; +} +{ +cvt.rn.f16.f64 rs743, fd887; +} +mov.b32 r4737, {rs743, rs743}; +{ +mul.f16x2 r4735, r172, r4737; +} +{ +add.f16x2 r4738, r4714, r4735; +} +{ +cvt.rn.f16.f64 rs744, fd888; +} +mov.b32 r4743, {rs744, rs744}; +{ +mul.f16x2 r4741, r175, r4743; +} +{ +add.f16x2 r4744, r4720, r4741; +} +{ +sub.f16x2 %24, r4726, r4732; +} +{ +add.f16x2 %25, r4738, r4744; +} +{ +add.f16x2 %38, r4726, r4732; +} +{ +sub.f16x2 %39, r4738, r4744; +} +cvt.rn.f16.s32 rs745, r5508; +mov.b32 r4771, {rs745, rs745}; +cvt.rn.f16.s32 rs746, r5508; +mov.b32 r4783, {rs746, rs746}; +{ +cvt.rn.f16.f64 rs747, fd859; +} +mov.b32 r4763, {rs747, rs747}; +{ +mul.f16x2 r4761, r1, r4763; +} +{ +add.f16x2 r4764, %62, r4761; +} +{ +cvt.rn.f16.f64 rs748, fd860; +} +mov.b32 r4769, {rs748, rs748}; +{ +mul.f16x2 r4767, r10, r4769; +} +{ +add.f16x2 r4770, r4771, r4767; +} +{ +cvt.rn.f16.f64 rs749, fd859; +} +mov.b32 r4775, {rs749, rs749}; +{ +mul.f16x2 r4773, r4, r4775; +} +{ +add.f16x2 r4776, %63, r4773; +} +{ +cvt.rn.f16.f64 rs750, fd860; +} +mov.b32 r4781, {rs750, rs750}; +{ +mul.f16x2 r4779, r7, r4781; +} +{ +add.f16x2 r4782, r4783, r4779; +} +{ +cvt.rn.f16.f64 rs751, fd879; +} +mov.b32 r4787, {rs751, rs751}; +{ 
+mul.f16x2 r4785, r13, r4787; +} +{ +add.f16x2 r4788, r4764, r4785; +} +{ +cvt.rn.f16.f64 rs752, fd880; +} +mov.b32 r4793, {rs752, rs752}; +{ +mul.f16x2 r4791, r22, r4793; +} +{ +add.f16x2 r4794, r4770, r4791; +} +{ +cvt.rn.f16.f64 rs753, fd879; +} +mov.b32 r4799, {rs753, rs753}; +{ +mul.f16x2 r4797, r16, r4799; +} +{ +add.f16x2 r4800, r4776, r4797; +} +{ +cvt.rn.f16.f64 rs754, fd880; +} +mov.b32 r4805, {rs754, rs754}; +{ +mul.f16x2 r4803, r19, r4805; +} +{ +add.f16x2 r4806, r4782, r4803; +} +{ +cvt.rn.f16.f64 rs755, fd899; +} +mov.b32 r4811, {rs755, rs755}; +{ +mul.f16x2 r4809, r25, r4811; +} +{ +add.f16x2 r4812, r4788, r4809; +} +{ +cvt.rn.f16.f64 rs756, fd900; +} +mov.b32 r4817, {rs756, rs756}; +{ +mul.f16x2 r4815, r34, r4817; +} +{ +add.f16x2 r4818, r4794, r4815; +} +{ +cvt.rn.f16.f64 rs757, fd899; +} +mov.b32 r4823, {rs757, rs757}; +{ +mul.f16x2 r4821, r28, r4823; +} +{ +add.f16x2 r4824, r4800, r4821; +} +{ +cvt.rn.f16.f64 rs758, fd900; +} +mov.b32 r4829, {rs758, rs758}; +{ +mul.f16x2 r4827, r31, r4829; +} +{ +add.f16x2 r4830, r4806, r4827; +} +{ +cvt.rn.f16.f64 rs759, fd883; +} +mov.b32 r4835, {rs759, rs759}; +{ +mul.f16x2 r4833, r37, r4835; +} +{ +add.f16x2 r4836, r4812, r4833; +} +{ +cvt.rn.f16.f64 rs760, fd736; +} +mov.b32 r4841, {rs760, rs760}; +{ +mul.f16x2 r4839, r46, r4841; +} +{ +add.f16x2 r4842, r4818, r4839; +} +{ +cvt.rn.f16.f64 rs761, fd883; +} +mov.b32 r4847, {rs761, rs761}; +{ +mul.f16x2 r4845, r40, r4847; +} +{ +add.f16x2 r4848, r4824, r4845; +} +{ +cvt.rn.f16.f64 rs762, fd736; +} +mov.b32 r4853, {rs762, rs762}; +{ +mul.f16x2 r4851, r43, r4853; +} +{ +add.f16x2 r4854, r4830, r4851; +} +{ +cvt.rn.f16.f64 rs763, fd863; +} +mov.b32 r4859, {rs763, rs763}; +{ +mul.f16x2 r4857, r49, r4859; +} +{ +add.f16x2 r4860, r4836, r4857; +} +{ +cvt.rn.f16.f64 rs764, fd740; +} +mov.b32 r4865, {rs764, rs764}; +{ +mul.f16x2 r4863, r58, r4865; +} +{ +add.f16x2 r4866, r4842, r4863; +} +{ +cvt.rn.f16.f64 rs765, fd863; +} +mov.b32 r4871, {rs765, rs765}; +{ +mul.f16x2 
r4869, r52, r4871; +} +{ +add.f16x2 r4872, r4848, r4869; +} +{ +cvt.rn.f16.f64 rs766, fd740; +} +mov.b32 r4877, {rs766, rs766}; +{ +mul.f16x2 r4875, r55, r4877; +} +{ +add.f16x2 r4878, r4854, r4875; +} +{ +cvt.rn.f16.f64 rs767, fd843; +} +mov.b32 r4883, {rs767, rs767}; +{ +mul.f16x2 r4881, r61, r4883; +} +{ +add.f16x2 r4884, r4860, r4881; +} +{ +cvt.rn.f16.f64 rs768, fd820; +} +mov.b32 r4889, {rs768, rs768}; +{ +mul.f16x2 r4887, r70, r4889; +} +{ +add.f16x2 r4890, r4866, r4887; +} +{ +cvt.rn.f16.f64 rs769, fd843; +} +mov.b32 r4895, {rs769, rs769}; +{ +mul.f16x2 r4893, r64, r4895; +} +{ +add.f16x2 r4896, r4872, r4893; +} +{ +cvt.rn.f16.f64 rs770, fd820; +} +mov.b32 r4901, {rs770, rs770}; +{ +mul.f16x2 r4899, r67, r4901; +} +{ +add.f16x2 r4902, r4878, r4899; +} +{ +cvt.rn.f16.f64 rs771, fd855; +} +mov.b32 r4907, {rs771, rs771}; +{ +mul.f16x2 r4905, r73, r4907; +} +{ +add.f16x2 r4908, r4884, r4905; +} +{ +cvt.rn.f16.f64 rs772, fd856; +} +mov.b32 r4913, {rs772, rs772}; +{ +mul.f16x2 r4911, r82, r4913; +} +{ +add.f16x2 r4914, r4890, r4911; +} +{ +cvt.rn.f16.f64 rs773, fd855; +} +mov.b32 r4919, {rs773, rs773}; +{ +mul.f16x2 r4917, r76, r4919; +} +{ +add.f16x2 r4920, r4896, r4917; +} +{ +cvt.rn.f16.f64 rs774, fd856; +} +mov.b32 r4925, {rs774, rs774}; +{ +mul.f16x2 r4923, r79, r4925; +} +{ +add.f16x2 r4926, r4902, r4923; +} +{ +cvt.rn.f16.f64 rs775, fd875; +} +mov.b32 r4931, {rs775, rs775}; +{ +mul.f16x2 r4929, r85, r4931; +} +{ +add.f16x2 r4932, r4908, r4929; +} +{ +cvt.rn.f16.f64 rs776, fd876; +} +mov.b32 r4937, {rs776, rs776}; +{ +mul.f16x2 r4935, r94, r4937; +} +{ +add.f16x2 r4938, r4914, r4935; +} +{ +cvt.rn.f16.f64 rs777, fd875; +} +mov.b32 r4943, {rs777, rs777}; +{ +mul.f16x2 r4941, r88, r4943; +} +{ +add.f16x2 r4944, r4920, r4941; +} +{ +cvt.rn.f16.f64 rs778, fd876; +} +mov.b32 r4949, {rs778, rs778}; +{ +mul.f16x2 r4947, r91, r4949; +} +{ +add.f16x2 r4950, r4926, r4947; +} +{ +cvt.rn.f16.f64 rs779, fd895; +} +mov.b32 r4955, {rs779, rs779}; +{ +mul.f16x2 r4953, r97, 
r4955; +} +{ +add.f16x2 r4956, r4932, r4953; +} +{ +cvt.rn.f16.f64 rs780, fd896; +} +mov.b32 r4961, {rs780, rs780}; +{ +mul.f16x2 r4959, r106, r4961; +} +{ +add.f16x2 r4962, r4938, r4959; +} +{ +cvt.rn.f16.f64 rs781, fd895; +} +mov.b32 r4967, {rs781, rs781}; +{ +mul.f16x2 r4965, r100, r4967; +} +{ +add.f16x2 r4968, r4944, r4965; +} +{ +cvt.rn.f16.f64 rs782, fd896; +} +mov.b32 r4973, {rs782, rs782}; +{ +mul.f16x2 r4971, r103, r4973; +} +{ +add.f16x2 r4974, r4950, r4971; +} +{ +cvt.rn.f16.f64 rs783, fd887; +} +mov.b32 r4979, {rs783, rs783}; +{ +mul.f16x2 r4977, r109, r4979; +} +{ +add.f16x2 r4980, r4956, r4977; +} +{ +cvt.rn.f16.f64 rs784, fd760; +} +mov.b32 r4985, {rs784, rs784}; +{ +mul.f16x2 r4983, r118, r4985; +} +{ +add.f16x2 r4986, r4962, r4983; +} +{ +cvt.rn.f16.f64 rs785, fd887; +} +mov.b32 r4991, {rs785, rs785}; +{ +mul.f16x2 r4989, r112, r4991; +} +{ +add.f16x2 r4992, r4968, r4989; +} +{ +cvt.rn.f16.f64 rs786, fd760; +} +mov.b32 r4997, {rs786, rs786}; +{ +mul.f16x2 r4995, r115, r4997; +} +{ +add.f16x2 r4998, r4974, r4995; +} +{ +cvt.rn.f16.f64 rs787, fd867; +} +mov.b32 r5003, {rs787, rs787}; +{ +mul.f16x2 r5001, r121, r5003; +} +{ +add.f16x2 r5004, r4980, r5001; +} +{ +cvt.rn.f16.f64 rs788, fd812; +} +mov.b32 r5009, {rs788, rs788}; +{ +mul.f16x2 r5007, r130, r5009; +} +{ +add.f16x2 r5010, r4986, r5007; +} +{ +cvt.rn.f16.f64 rs789, fd867; +} +mov.b32 r5015, {rs789, rs789}; +{ +mul.f16x2 r5013, r124, r5015; +} +{ +add.f16x2 r5016, r4992, r5013; +} +{ +cvt.rn.f16.f64 rs790, fd812; +} +mov.b32 r5021, {rs790, rs790}; +{ +mul.f16x2 r5019, r127, r5021; +} +{ +add.f16x2 r5022, r4998, r5019; +} +{ +cvt.rn.f16.f64 rs791, fd847; +} +mov.b32 r5027, {rs791, rs791}; +{ +mul.f16x2 r5025, r133, r5027; +} +{ +add.f16x2 r5028, r5004, r5025; +} +{ +cvt.rn.f16.f64 rs792, fd768; +} +mov.b32 r5033, {rs792, rs792}; +{ +mul.f16x2 r5031, r142, r5033; +} +{ +add.f16x2 r5034, r5010, r5031; +} +{ +cvt.rn.f16.f64 rs793, fd847; +} +mov.b32 r5039, {rs793, rs793}; +{ +mul.f16x2 r5037, 
r136, r5039; +} +{ +add.f16x2 r5040, r5016, r5037; +} +{ +cvt.rn.f16.f64 rs794, fd768; +} +mov.b32 r5045, {rs794, rs794}; +{ +mul.f16x2 r5043, r139, r5045; +} +{ +add.f16x2 r5046, r5022, r5043; +} +{ +cvt.rn.f16.f64 rs795, fd851; +} +mov.b32 r5051, {rs795, rs795}; +{ +mul.f16x2 r5049, r145, r5051; +} +{ +add.f16x2 r5052, r5028, r5049; +} +{ +cvt.rn.f16.f64 rs796, fd852; +} +mov.b32 r5057, {rs796, rs796}; +{ +mul.f16x2 r5055, r154, r5057; +} +{ +add.f16x2 r5058, r5034, r5055; +} +{ +cvt.rn.f16.f64 rs797, fd851; +} +mov.b32 r5063, {rs797, rs797}; +{ +mul.f16x2 r5061, r148, r5063; +} +{ +add.f16x2 r5064, r5040, r5061; +} +{ +cvt.rn.f16.f64 rs798, fd852; +} +mov.b32 r5069, {rs798, rs798}; +{ +mul.f16x2 r5067, r151, r5069; +} +{ +add.f16x2 r5070, r5046, r5067; +} +{ +cvt.rn.f16.f64 rs799, fd871; +} +mov.b32 r5075, {rs799, rs799}; +{ +mul.f16x2 r5073, r157, r5075; +} +{ +add.f16x2 r5076, r5052, r5073; +} +{ +cvt.rn.f16.f64 rs800, fd872; +} +mov.b32 r5081, {rs800, rs800}; +{ +mul.f16x2 r5079, r166, r5081; +} +{ +add.f16x2 r5082, r5058, r5079; +} +{ +cvt.rn.f16.f64 rs801, fd871; +} +mov.b32 r5087, {rs801, rs801}; +{ +mul.f16x2 r5085, r160, r5087; +} +{ +add.f16x2 r5088, r5064, r5085; +} +{ +cvt.rn.f16.f64 rs802, fd872; +} +mov.b32 r5093, {rs802, rs802}; +{ +mul.f16x2 r5091, r163, r5093; +} +{ +add.f16x2 r5094, r5070, r5091; +} +{ +cvt.rn.f16.f64 rs803, fd891; +} +mov.b32 r5099, {rs803, rs803}; +{ +mul.f16x2 r5097, r169, r5099; +} +{ +add.f16x2 r5100, r5076, r5097; +} +{ +cvt.rn.f16.f64 rs804, fd892; +} +mov.b32 r5105, {rs804, rs804}; +{ +mul.f16x2 r5103, r178, r5105; +} +{ +add.f16x2 r5106, r5082, r5103; +} +{ +cvt.rn.f16.f64 rs805, fd891; +} +mov.b32 r5111, {rs805, rs805}; +{ +mul.f16x2 r5109, r172, r5111; +} +{ +add.f16x2 r5112, r5088, r5109; +} +{ +cvt.rn.f16.f64 rs806, fd892; +} +mov.b32 r5117, {rs806, rs806}; +{ +mul.f16x2 r5115, r175, r5117; +} +{ +add.f16x2 r5118, r5094, r5115; +} +{ +sub.f16x2 %26, r5100, r5106; +} +{ +add.f16x2 %27, r5112, r5118; +} +{ +add.f16x2 
%36, r5100, r5106; +} +{ +sub.f16x2 %37, r5112, r5118; +} +cvt.rn.f16.s32 rs807, r5508; +mov.b32 r5145, {rs807, rs807}; +cvt.rn.f16.s32 rs808, r5508; +mov.b32 r5157, {rs808, rs808}; +{ +cvt.rn.f16.f64 rs809, fd851; +} +mov.b32 r5137, {rs809, rs809}; +{ +mul.f16x2 r5135, r1, r5137; +} +{ +add.f16x2 r5138, %62, r5135; +} +{ +cvt.rn.f16.f64 rs810, fd852; +} +mov.b32 r5143, {rs810, rs810}; +{ +mul.f16x2 r5141, r10, r5143; +} +{ +add.f16x2 r5144, r5145, r5141; +} +{ +cvt.rn.f16.f64 rs811, fd851; +} +mov.b32 r5149, {rs811, rs811}; +{ +mul.f16x2 r5147, r4, r5149; +} +{ +add.f16x2 r5150, %63, r5147; +} +{ +cvt.rn.f16.f64 rs812, fd852; +} +mov.b32 r5155, {rs812, rs812}; +{ +mul.f16x2 r5153, r7, r5155; +} +{ +add.f16x2 r5156, r5157, r5153; +} +{ +cvt.rn.f16.f64 rs813, fd863; +} +mov.b32 r5161, {rs813, rs813}; +{ +mul.f16x2 r5159, r13, r5161; +} +{ +add.f16x2 r5162, r5138, r5159; +} +{ +cvt.rn.f16.f64 rs814, fd864; +} +mov.b32 r5167, {rs814, rs814}; +{ +mul.f16x2 r5165, r22, r5167; +} +{ +add.f16x2 r5168, r5144, r5165; +} +{ +cvt.rn.f16.f64 rs815, fd863; +} +mov.b32 r5173, {rs815, rs815}; +{ +mul.f16x2 r5171, r16, r5173; +} +{ +add.f16x2 r5174, r5150, r5171; +} +{ +cvt.rn.f16.f64 rs816, fd864; +} +mov.b32 r5179, {rs816, rs816}; +{ +mul.f16x2 r5177, r19, r5179; +} +{ +add.f16x2 r5180, r5156, r5177; +} +{ +cvt.rn.f16.f64 rs817, fd875; +} +mov.b32 r5185, {rs817, rs817}; +{ +mul.f16x2 r5183, r25, r5185; +} +{ +add.f16x2 r5186, r5162, r5183; +} +{ +cvt.rn.f16.f64 rs818, fd876; +} +mov.b32 r5191, {rs818, rs818}; +{ +mul.f16x2 r5189, r34, r5191; +} +{ +add.f16x2 r5192, r5168, r5189; +} +{ +cvt.rn.f16.f64 rs819, fd875; +} +mov.b32 r5197, {rs819, rs819}; +{ +mul.f16x2 r5195, r28, r5197; +} +{ +add.f16x2 r5198, r5174, r5195; +} +{ +cvt.rn.f16.f64 rs820, fd876; +} +mov.b32 r5203, {rs820, rs820}; +{ +mul.f16x2 r5201, r31, r5203; +} +{ +add.f16x2 r5204, r5180, r5201; +} +{ +cvt.rn.f16.f64 rs821, fd887; +} +mov.b32 r5209, {rs821, rs821}; +{ +mul.f16x2 r5207, r37, r5209; +} +{ +add.f16x2 
r5210, r5186, r5207; +} +{ +cvt.rn.f16.f64 rs822, fd888; +} +mov.b32 r5215, {rs822, rs822}; +{ +mul.f16x2 r5213, r46, r5215; +} +{ +add.f16x2 r5216, r5192, r5213; +} +{ +cvt.rn.f16.f64 rs823, fd887; +} +mov.b32 r5221, {rs823, rs823}; +{ +mul.f16x2 r5219, r40, r5221; +} +{ +add.f16x2 r5222, r5198, r5219; +} +{ +cvt.rn.f16.f64 rs824, fd888; +} +mov.b32 r5227, {rs824, rs824}; +{ +mul.f16x2 r5225, r43, r5227; +} +{ +add.f16x2 r5228, r5204, r5225; +} +{ +cvt.rn.f16.f64 rs825, fd899; +} +mov.b32 r5233, {rs825, rs825}; +{ +mul.f16x2 r5231, r49, r5233; +} +{ +add.f16x2 r5234, r5210, r5231; +} +{ +cvt.rn.f16.f64 rs826, fd900; +} +mov.b32 r5239, {rs826, rs826}; +{ +mul.f16x2 r5237, r58, r5239; +} +{ +add.f16x2 r5240, r5216, r5237; +} +{ +cvt.rn.f16.f64 rs827, fd899; +} +mov.b32 r5245, {rs827, rs827}; +{ +mul.f16x2 r5243, r52, r5245; +} +{ +add.f16x2 r5246, r5222, r5243; +} +{ +cvt.rn.f16.f64 rs828, fd900; +} +mov.b32 r5251, {rs828, rs828}; +{ +mul.f16x2 r5249, r55, r5251; +} +{ +add.f16x2 r5252, r5228, r5249; +} +{ +cvt.rn.f16.f64 rs829, fd891; +} +mov.b32 r5257, {rs829, rs829}; +{ +mul.f16x2 r5255, r61, r5257; +} +{ +add.f16x2 r5258, r5234, r5255; +} +{ +cvt.rn.f16.f64 rs830, fd804; +} +mov.b32 r5263, {rs830, rs830}; +{ +mul.f16x2 r5261, r70, r5263; +} +{ +add.f16x2 r5264, r5240, r5261; +} +{ +cvt.rn.f16.f64 rs831, fd891; +} +mov.b32 r5269, {rs831, rs831}; +{ +mul.f16x2 r5267, r64, r5269; +} +{ +add.f16x2 r5270, r5246, r5267; +} +{ +cvt.rn.f16.f64 rs832, fd804; +} +mov.b32 r5275, {rs832, rs832}; +{ +mul.f16x2 r5273, r67, r5275; +} +{ +add.f16x2 r5276, r5252, r5273; +} +{ +cvt.rn.f16.f64 rs833, fd879; +} +mov.b32 r5281, {rs833, rs833}; +{ +mul.f16x2 r5279, r73, r5281; +} +{ +add.f16x2 r5282, r5258, r5279; +} +{ +cvt.rn.f16.f64 rs834, fd808; +} +mov.b32 r5287, {rs834, rs834}; +{ +mul.f16x2 r5285, r82, r5287; +} +{ +add.f16x2 r5288, r5264, r5285; +} +{ +cvt.rn.f16.f64 rs835, fd879; +} +mov.b32 r5293, {rs835, rs835}; +{ +mul.f16x2 r5291, r76, r5293; +} +{ +add.f16x2 r5294, 
r5270, r5291; +} +{ +cvt.rn.f16.f64 rs836, fd808; +} +mov.b32 r5299, {rs836, rs836}; +{ +mul.f16x2 r5297, r79, r5299; +} +{ +add.f16x2 r5300, r5276, r5297; +} +{ +cvt.rn.f16.f64 rs837, fd867; +} +mov.b32 r5305, {rs837, rs837}; +{ +mul.f16x2 r5303, r85, r5305; +} +{ +add.f16x2 r5306, r5282, r5303; +} +{ +cvt.rn.f16.f64 rs838, fd812; +} +mov.b32 r5311, {rs838, rs838}; +{ +mul.f16x2 r5309, r94, r5311; +} +{ +add.f16x2 r5312, r5288, r5309; +} +{ +cvt.rn.f16.f64 rs839, fd867; +} +mov.b32 r5317, {rs839, rs839}; +{ +mul.f16x2 r5315, r88, r5317; +} +{ +add.f16x2 r5318, r5294, r5315; +} +{ +cvt.rn.f16.f64 rs840, fd812; +} +mov.b32 r5323, {rs840, rs840}; +{ +mul.f16x2 r5321, r91, r5323; +} +{ +add.f16x2 r5324, r5300, r5321; +} +{ +cvt.rn.f16.f64 rs841, fd855; +} +mov.b32 r5329, {rs841, rs841}; +{ +mul.f16x2 r5327, r97, r5329; +} +{ +add.f16x2 r5330, r5306, r5327; +} +{ +cvt.rn.f16.f64 rs842, fd816; +} +mov.b32 r5335, {rs842, rs842}; +{ +mul.f16x2 r5333, r106, r5335; +} +{ +add.f16x2 r5336, r5312, r5333; +} +{ +cvt.rn.f16.f64 rs843, fd855; +} +mov.b32 r5341, {rs843, rs843}; +{ +mul.f16x2 r5339, r100, r5341; +} +{ +add.f16x2 r5342, r5318, r5339; +} +{ +cvt.rn.f16.f64 rs844, fd816; +} +mov.b32 r5347, {rs844, rs844}; +{ +mul.f16x2 r5345, r103, r5347; +} +{ +add.f16x2 r5348, r5324, r5345; +} +{ +cvt.rn.f16.f64 rs845, fd843; +} +mov.b32 r5353, {rs845, rs845}; +{ +mul.f16x2 r5351, r109, r5353; +} +{ +add.f16x2 r5354, r5330, r5351; +} +{ +cvt.rn.f16.f64 rs846, fd820; +} +mov.b32 r5359, {rs846, rs846}; +{ +mul.f16x2 r5357, r118, r5359; +} +{ +add.f16x2 r5360, r5336, r5357; +} +{ +cvt.rn.f16.f64 rs847, fd843; +} +mov.b32 r5365, {rs847, rs847}; +{ +mul.f16x2 r5363, r112, r5365; +} +{ +add.f16x2 r5366, r5342, r5363; +} +{ +cvt.rn.f16.f64 rs848, fd820; +} +mov.b32 r5371, {rs848, rs848}; +{ +mul.f16x2 r5369, r115, r5371; +} +{ +add.f16x2 r5372, r5348, r5369; +} +{ +cvt.rn.f16.f64 rs849, fd847; +} +mov.b32 r5377, {rs849, rs849}; +{ +mul.f16x2 r5375, r121, r5377; +} +{ +add.f16x2 r5378, 
r5354, r5375; +} +{ +cvt.rn.f16.f64 rs850, fd848; +} +mov.b32 r5383, {rs850, rs850}; +{ +mul.f16x2 r5381, r130, r5383; +} +{ +add.f16x2 r5384, r5360, r5381; +} +{ +cvt.rn.f16.f64 rs851, fd847; +} +mov.b32 r5389, {rs851, rs851}; +{ +mul.f16x2 r5387, r124, r5389; +} +{ +add.f16x2 r5390, r5366, r5387; +} +{ +cvt.rn.f16.f64 rs852, fd848; +} +mov.b32 r5395, {rs852, rs852}; +{ +mul.f16x2 r5393, r127, r5395; +} +{ +add.f16x2 r5396, r5372, r5393; +} +{ +cvt.rn.f16.f64 rs853, fd859; +} +mov.b32 r5401, {rs853, rs853}; +{ +mul.f16x2 r5399, r133, r5401; +} +{ +add.f16x2 r5402, r5378, r5399; +} +{ +cvt.rn.f16.f64 rs854, fd860; +} +mov.b32 r5407, {rs854, rs854}; +{ +mul.f16x2 r5405, r142, r5407; +} +{ +add.f16x2 r5408, r5384, r5405; +} +{ +cvt.rn.f16.f64 rs855, fd859; +} +mov.b32 r5413, {rs855, rs855}; +{ +mul.f16x2 r5411, r136, r5413; +} +{ +add.f16x2 r5414, r5390, r5411; +} +{ +cvt.rn.f16.f64 rs856, fd860; +} +mov.b32 r5419, {rs856, rs856}; +{ +mul.f16x2 r5417, r139, r5419; +} +{ +add.f16x2 r5420, r5396, r5417; +} +{ +cvt.rn.f16.f64 rs857, fd871; +} +mov.b32 r5425, {rs857, rs857}; +{ +mul.f16x2 r5423, r145, r5425; +} +{ +add.f16x2 r5426, r5402, r5423; +} +{ +cvt.rn.f16.f64 rs858, fd872; +} +mov.b32 r5431, {rs858, rs858}; +{ +mul.f16x2 r5429, r154, r5431; +} +{ +add.f16x2 r5432, r5408, r5429; +} +{ +cvt.rn.f16.f64 rs859, fd871; +} +mov.b32 r5437, {rs859, rs859}; +{ +mul.f16x2 r5435, r148, r5437; +} +{ +add.f16x2 r5438, r5414, r5435; +} +{ +cvt.rn.f16.f64 rs860, fd872; +} +mov.b32 r5443, {rs860, rs860}; +{ +mul.f16x2 r5441, r151, r5443; +} +{ +add.f16x2 r5444, r5420, r5441; +} +{ +cvt.rn.f16.f64 rs861, fd883; +} +mov.b32 r5449, {rs861, rs861}; +{ +mul.f16x2 r5447, r157, r5449; +} +{ +add.f16x2 r5450, r5426, r5447; +} +{ +cvt.rn.f16.f64 rs862, fd884; +} +mov.b32 r5455, {rs862, rs862}; +{ +mul.f16x2 r5453, r166, r5455; +} +{ +add.f16x2 r5456, r5432, r5453; +} +{ +cvt.rn.f16.f64 rs863, fd883; +} +mov.b32 r5461, {rs863, rs863}; +{ +mul.f16x2 r5459, r160, r5461; +} +{ +add.f16x2 
r5462, r5438, r5459; +} +{ +cvt.rn.f16.f64 rs864, fd884; +} +mov.b32 r5467, {rs864, rs864}; +{ +mul.f16x2 r5465, r163, r5467; +} +{ +add.f16x2 r5468, r5444, r5465; +} +{ +cvt.rn.f16.f64 rs865, fd895; +} +mov.b32 r5473, {rs865, rs865}; +{ +mul.f16x2 r5471, r169, r5473; +} +{ +add.f16x2 r5474, r5450, r5471; +} +{ +cvt.rn.f16.f64 rs866, fd896; +} +mov.b32 r5479, {rs866, rs866}; +{ +mul.f16x2 r5477, r178, r5479; +} +{ +add.f16x2 r5480, r5456, r5477; +} +{ +cvt.rn.f16.f64 rs867, fd895; +} +mov.b32 r5485, {rs867, rs867}; +{ +mul.f16x2 r5483, r172, r5485; +} +{ +add.f16x2 r5486, r5462, r5483; +} +{ +cvt.rn.f16.f64 rs868, fd896; +} +mov.b32 r5491, {rs868, rs868}; +{ +mul.f16x2 r5489, r175, r5491; +} +{ +add.f16x2 r5492, r5468, r5489; +} +{ +sub.f16x2 %28, r5474, r5480; +} +{ +add.f16x2 %29, r5486, r5492; +} +{ +add.f16x2 %34, r5474, r5480; +} +{ +sub.f16x2 %35, r5486, r5492; +} +cvt.rn.f16.s32 rs869, r5508; +mov.b32 r5519, {rs869, rs869}; +cvt.rn.f16.s32 rs870, r5508; +mov.b32 r5531, {rs870, rs870}; +{ +cvt.rn.f16.f64 rs871, fd843; +} +mov.b32 r5511, {rs871, rs871}; +{ +mul.f16x2 r5509, r1, r5511; +} +{ +add.f16x2 r5512, %62, r5509; +} +{ +cvt.rn.f16.f64 rs872, fd844; +} +mov.b32 r5517, {rs872, rs872}; +{ +mul.f16x2 r5515, r10, r5517; +} +{ +add.f16x2 r5518, r5519, r5515; +} +{ +cvt.rn.f16.f64 rs873, fd843; +} +mov.b32 r5523, {rs873, rs873}; +{ +mul.f16x2 r5521, r4, r5523; +} +{ +add.f16x2 r5524, %63, r5521; +} +{ +cvt.rn.f16.f64 rs874, fd844; +} +mov.b32 r5529, {rs874, rs874}; +{ +mul.f16x2 r5527, r7, r5529; +} +{ +add.f16x2 r5530, r5531, r5527; +} +{ +cvt.rn.f16.f64 rs875, fd847; +} +mov.b32 r5535, {rs875, rs875}; +{ +mul.f16x2 r5533, r13, r5535; +} +{ +add.f16x2 r5536, r5512, r5533; +} +{ +cvt.rn.f16.f64 rs876, fd848; +} +mov.b32 r5541, {rs876, rs876}; +{ +mul.f16x2 r5539, r22, r5541; +} +{ +add.f16x2 r5542, r5518, r5539; +} +{ +cvt.rn.f16.f64 rs877, fd847; +} +mov.b32 r5547, {rs877, rs877}; +{ +mul.f16x2 r5545, r16, r5547; +} +{ +add.f16x2 r5548, r5524, r5545; +} +{ 
+cvt.rn.f16.f64 rs878, fd848; +} +mov.b32 r5553, {rs878, rs878}; +{ +mul.f16x2 r5551, r19, r5553; +} +{ +add.f16x2 r5554, r5530, r5551; +} +{ +cvt.rn.f16.f64 rs879, fd851; +} +mov.b32 r5559, {rs879, rs879}; +{ +mul.f16x2 r5557, r25, r5559; +} +{ +add.f16x2 r5560, r5536, r5557; +} +{ +cvt.rn.f16.f64 rs880, fd852; +} +mov.b32 r5565, {rs880, rs880}; +{ +mul.f16x2 r5563, r34, r5565; +} +{ +add.f16x2 r5566, r5542, r5563; +} +{ +cvt.rn.f16.f64 rs881, fd851; +} +mov.b32 r5571, {rs881, rs881}; +{ +mul.f16x2 r5569, r28, r5571; +} +{ +add.f16x2 r5572, r5548, r5569; +} +{ +cvt.rn.f16.f64 rs882, fd852; +} +mov.b32 r5577, {rs882, rs882}; +{ +mul.f16x2 r5575, r31, r5577; +} +{ +add.f16x2 r5578, r5554, r5575; +} +{ +cvt.rn.f16.f64 rs883, fd855; +} +mov.b32 r5583, {rs883, rs883}; +{ +mul.f16x2 r5581, r37, r5583; +} +{ +add.f16x2 r5584, r5560, r5581; +} +{ +cvt.rn.f16.f64 rs884, fd856; +} +mov.b32 r5589, {rs884, rs884}; +{ +mul.f16x2 r5587, r46, r5589; +} +{ +add.f16x2 r5590, r5566, r5587; +} +{ +cvt.rn.f16.f64 rs885, fd855; +} +mov.b32 r5595, {rs885, rs885}; +{ +mul.f16x2 r5593, r40, r5595; +} +{ +add.f16x2 r5596, r5572, r5593; +} +{ +cvt.rn.f16.f64 rs886, fd856; +} +mov.b32 r5601, {rs886, rs886}; +{ +mul.f16x2 r5599, r43, r5601; +} +{ +add.f16x2 r5602, r5578, r5599; +} +{ +cvt.rn.f16.f64 rs887, fd859; +} +mov.b32 r5607, {rs887, rs887}; +{ +mul.f16x2 r5605, r49, r5607; +} +{ +add.f16x2 r5608, r5584, r5605; +} +{ +cvt.rn.f16.f64 rs888, fd860; +} +mov.b32 r5613, {rs888, rs888}; +{ +mul.f16x2 r5611, r58, r5613; +} +{ +add.f16x2 r5614, r5590, r5611; +} +{ +cvt.rn.f16.f64 rs889, fd859; +} +mov.b32 r5619, {rs889, rs889}; +{ +mul.f16x2 r5617, r52, r5619; +} +{ +add.f16x2 r5620, r5596, r5617; +} +{ +cvt.rn.f16.f64 rs890, fd860; +} +mov.b32 r5625, {rs890, rs890}; +{ +mul.f16x2 r5623, r55, r5625; +} +{ +add.f16x2 r5626, r5602, r5623; +} +{ +cvt.rn.f16.f64 rs891, fd863; +} +mov.b32 r5631, {rs891, rs891}; +{ +mul.f16x2 r5629, r61, r5631; +} +{ +add.f16x2 r5632, r5608, r5629; +} +{ 
+cvt.rn.f16.f64 rs892, fd864; +} +mov.b32 r5637, {rs892, rs892}; +{ +mul.f16x2 r5635, r70, r5637; +} +{ +add.f16x2 r5638, r5614, r5635; +} +{ +cvt.rn.f16.f64 rs893, fd863; +} +mov.b32 r5643, {rs893, rs893}; +{ +mul.f16x2 r5641, r64, r5643; +} +{ +add.f16x2 r5644, r5620, r5641; +} +{ +cvt.rn.f16.f64 rs894, fd864; +} +mov.b32 r5649, {rs894, rs894}; +{ +mul.f16x2 r5647, r67, r5649; +} +{ +add.f16x2 r5650, r5626, r5647; +} +{ +cvt.rn.f16.f64 rs895, fd867; +} +mov.b32 r5655, {rs895, rs895}; +{ +mul.f16x2 r5653, r73, r5655; +} +{ +add.f16x2 r5656, r5632, r5653; +} +{ +cvt.rn.f16.f64 rs896, fd868; +} +mov.b32 r5661, {rs896, rs896}; +{ +mul.f16x2 r5659, r82, r5661; +} +{ +add.f16x2 r5662, r5638, r5659; +} +{ +cvt.rn.f16.f64 rs897, fd867; +} +mov.b32 r5667, {rs897, rs897}; +{ +mul.f16x2 r5665, r76, r5667; +} +{ +add.f16x2 r5668, r5644, r5665; +} +{ +cvt.rn.f16.f64 rs898, fd868; +} +mov.b32 r5673, {rs898, rs898}; +{ +mul.f16x2 r5671, r79, r5673; +} +{ +add.f16x2 r5674, r5650, r5671; +} +{ +cvt.rn.f16.f64 rs899, fd871; +} +mov.b32 r5679, {rs899, rs899}; +{ +mul.f16x2 r5677, r85, r5679; +} +{ +add.f16x2 r5680, r5656, r5677; +} +{ +cvt.rn.f16.f64 rs900, fd872; +} +mov.b32 r5685, {rs900, rs900}; +{ +mul.f16x2 r5683, r94, r5685; +} +{ +add.f16x2 r5686, r5662, r5683; +} +{ +cvt.rn.f16.f64 rs901, fd871; +} +mov.b32 r5691, {rs901, rs901}; +{ +mul.f16x2 r5689, r88, r5691; +} +{ +add.f16x2 r5692, r5668, r5689; +} +{ +cvt.rn.f16.f64 rs902, fd872; +} +mov.b32 r5697, {rs902, rs902}; +{ +mul.f16x2 r5695, r91, r5697; +} +{ +add.f16x2 r5698, r5674, r5695; +} +{ +cvt.rn.f16.f64 rs903, fd875; +} +mov.b32 r5703, {rs903, rs903}; +{ +mul.f16x2 r5701, r97, r5703; +} +{ +add.f16x2 r5704, r5680, r5701; +} +{ +cvt.rn.f16.f64 rs904, fd876; +} +mov.b32 r5709, {rs904, rs904}; +{ +mul.f16x2 r5707, r106, r5709; +} +{ +add.f16x2 r5710, r5686, r5707; +} +{ +cvt.rn.f16.f64 rs905, fd875; +} +mov.b32 r5715, {rs905, rs905}; +{ +mul.f16x2 r5713, r100, r5715; +} +{ +add.f16x2 r5716, r5692, r5713; +} +{ 
+cvt.rn.f16.f64 rs906, fd876; +} +mov.b32 r5721, {rs906, rs906}; +{ +mul.f16x2 r5719, r103, r5721; +} +{ +add.f16x2 r5722, r5698, r5719; +} +{ +cvt.rn.f16.f64 rs907, fd879; +} +mov.b32 r5727, {rs907, rs907}; +{ +mul.f16x2 r5725, r109, r5727; +} +{ +add.f16x2 r5728, r5704, r5725; +} +{ +cvt.rn.f16.f64 rs908, fd880; +} +mov.b32 r5733, {rs908, rs908}; +{ +mul.f16x2 r5731, r118, r5733; +} +{ +add.f16x2 r5734, r5710, r5731; +} +{ +cvt.rn.f16.f64 rs909, fd879; +} +mov.b32 r5739, {rs909, rs909}; +{ +mul.f16x2 r5737, r112, r5739; +} +{ +add.f16x2 r5740, r5716, r5737; +} +{ +cvt.rn.f16.f64 rs910, fd880; +} +mov.b32 r5745, {rs910, rs910}; +{ +mul.f16x2 r5743, r115, r5745; +} +{ +add.f16x2 r5746, r5722, r5743; +} +{ +cvt.rn.f16.f64 rs911, fd883; +} +mov.b32 r5751, {rs911, rs911}; +{ +mul.f16x2 r5749, r121, r5751; +} +{ +add.f16x2 r5752, r5728, r5749; +} +{ +cvt.rn.f16.f64 rs912, fd884; +} +mov.b32 r5757, {rs912, rs912}; +{ +mul.f16x2 r5755, r130, r5757; +} +{ +add.f16x2 r5758, r5734, r5755; +} +{ +cvt.rn.f16.f64 rs913, fd883; +} +mov.b32 r5763, {rs913, rs913}; +{ +mul.f16x2 r5761, r124, r5763; +} +{ +add.f16x2 r5764, r5740, r5761; +} +{ +cvt.rn.f16.f64 rs914, fd884; +} +mov.b32 r5769, {rs914, rs914}; +{ +mul.f16x2 r5767, r127, r5769; +} +{ +add.f16x2 r5770, r5746, r5767; +} +{ +cvt.rn.f16.f64 rs915, fd887; +} +mov.b32 r5775, {rs915, rs915}; +{ +mul.f16x2 r5773, r133, r5775; +} +{ +add.f16x2 r5776, r5752, r5773; +} +{ +cvt.rn.f16.f64 rs916, fd888; +} +mov.b32 r5781, {rs916, rs916}; +{ +mul.f16x2 r5779, r142, r5781; +} +{ +add.f16x2 r5782, r5758, r5779; +} +{ +cvt.rn.f16.f64 rs917, fd887; +} +mov.b32 r5787, {rs917, rs917}; +{ +mul.f16x2 r5785, r136, r5787; +} +{ +add.f16x2 r5788, r5764, r5785; +} +{ +cvt.rn.f16.f64 rs918, fd888; +} +mov.b32 r5793, {rs918, rs918}; +{ +mul.f16x2 r5791, r139, r5793; +} +{ +add.f16x2 r5794, r5770, r5791; +} +{ +cvt.rn.f16.f64 rs919, fd891; +} +mov.b32 r5799, {rs919, rs919}; +{ +mul.f16x2 r5797, r145, r5799; +} +{ +add.f16x2 r5800, r5776, r5797; +} 
+{ +cvt.rn.f16.f64 rs920, fd892; +} +mov.b32 r5805, {rs920, rs920}; +{ +mul.f16x2 r5803, r154, r5805; +} +{ +add.f16x2 r5806, r5782, r5803; +} +{ +cvt.rn.f16.f64 rs921, fd891; +} +mov.b32 r5811, {rs921, rs921}; +{ +mul.f16x2 r5809, r148, r5811; +} +{ +add.f16x2 r5812, r5788, r5809; +} +{ +cvt.rn.f16.f64 rs922, fd892; +} +mov.b32 r5817, {rs922, rs922}; +{ +mul.f16x2 r5815, r151, r5817; +} +{ +add.f16x2 r5818, r5794, r5815; +} +{ +cvt.rn.f16.f64 rs923, fd895; +} +mov.b32 r5823, {rs923, rs923}; +{ +mul.f16x2 r5821, r157, r5823; +} +{ +add.f16x2 r5824, r5800, r5821; +} +{ +cvt.rn.f16.f64 rs924, fd896; +} +mov.b32 r5829, {rs924, rs924}; +{ +mul.f16x2 r5827, r166, r5829; +} +{ +add.f16x2 r5830, r5806, r5827; +} +{ +cvt.rn.f16.f64 rs925, fd895; +} +mov.b32 r5835, {rs925, rs925}; +{ +mul.f16x2 r5833, r160, r5835; +} +{ +add.f16x2 r5836, r5812, r5833; +} +{ +cvt.rn.f16.f64 rs926, fd896; +} +mov.b32 r5841, {rs926, rs926}; +{ +mul.f16x2 r5839, r163, r5841; +} +{ +add.f16x2 r5842, r5818, r5839; +} +{ +cvt.rn.f16.f64 rs927, fd899; +} +mov.b32 r5847, {rs927, rs927}; +{ +mul.f16x2 r5845, r169, r5847; +} +{ +add.f16x2 r5848, r5824, r5845; +} +{ +cvt.rn.f16.f64 rs928, fd900; +} +mov.b32 r5853, {rs928, rs928}; +{ +mul.f16x2 r5851, r178, r5853; +} +{ +add.f16x2 r5854, r5830, r5851; +} +{ +cvt.rn.f16.f64 rs929, fd899; +} +mov.b32 r5859, {rs929, rs929}; +{ +mul.f16x2 r5857, r172, r5859; +} +{ +add.f16x2 r5860, r5836, r5857; +} +{ +cvt.rn.f16.f64 rs930, fd900; +} +mov.b32 r5865, {rs930, rs930}; +{ +mul.f16x2 r5863, r175, r5865; +} +{ +add.f16x2 r5866, r5842, r5863; +} +{ +sub.f16x2 %30, r5848, r5854; +} +{ +add.f16x2 %31, r5860, r5866; +} +{ +add.f16x2 %32, r5848, r5854; +} +{ +sub.f16x2 %33, r5860, r5866; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), 
"=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), 
"r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[30].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..289e50f7bc736 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_fwd.hpp.inc @@ -0,0 +1,1066 @@ +#ifndef CUFFTDX_FFT_31_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_31_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<15, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1273>; +.reg .b64 rd<4>; +add.f32 f125, %64, %123; +sub.f32 f127, %64, %123; +add.f32 f1272, %66, %124; +sub.f32 f128, %66, %124; +add.f32 f129, %67, %121; +sub.f32 f131, %67, %121; +add.f32 f1269, %125, %126; +sub.f32 f132, %125, %126; +add.f32 f133, %69, %119; +sub.f32 f135, %69, %119; +add.f32 f1267, %70, %127; +sub.f32 f136, %70, %127; +add.f32 f137, %71, %117; +sub.f32 f139, %71, %117; +add.f32 f1265, %128, %118; +sub.f32 f140, %128, %118; +add.f32 f141, %73, %115; +sub.f32 f143, %73, %115; +add.f32 f1262, %129, %130; +sub.f32 f144, %129, %130; +add.f32 f145, %75, %113; +sub.f32 f147, %75, %113; +add.f32 f1260, %76, %131; +sub.f32 f148, %76, %131; +add.f32 f149, %77, %111; +sub.f32 f151, %77, %111; +add.f32 f1258, %132, %112; +sub.f32 f152, %132, %112; +add.f32 f153, %79, %109; +sub.f32 f155, %79, %109; +add.f32 f1255, %133, %134; +sub.f32 f156, %133, %134; +add.f32 f157, %81, %107; +sub.f32 f159, %81, %107; +add.f32 f1253, %82, %135; +sub.f32 f160, %82, %135; +add.f32 f161, %83, %105; +sub.f32 f163, %83, %105; +add.f32 f1251, %136, %106; +sub.f32 f164, %136, %106; +add.f32 f165, %85, %103; +sub.f32 f167, %85, %103; +add.f32 f1248, %137, %138; +sub.f32 f168, %137, %138; +add.f32 f169, %87, %101; +sub.f32 f171, %87, %101; +add.f32 f1246, %88, %139; +sub.f32 f172, %88, %139; +add.f32 f173, %89, %99; +sub.f32 f175, %89, %99; +add.f32 f1244, %140, %100; +sub.f32 f176, %140, 
%100; +add.f32 f177, %91, %97; +sub.f32 f179, %91, %97; +add.f32 f1241, %141, %142; +sub.f32 f180, %141, %142; +add.f32 f181, %93, %95; +sub.f32 f183, %93, %95; +add.f32 f1239, %94, %143; +sub.f32 f184, %94, %143; +add.f32 f185, %62, f125; +add.f32 f187, f185, f129; +add.f32 f1238, %63, f1272; +add.f32 f188, f1238, f1269; +add.f32 f189, f187, f133; +add.f32 f190, f188, f1267; +add.f32 f191, f189, f137; +add.f32 f192, f190, f1265; +add.f32 f193, f191, f141; +add.f32 f194, f192, f1262; +add.f32 f195, f193, f145; +add.f32 f196, f194, f1260; +add.f32 f197, f195, f149; +add.f32 f198, f196, f1258; +add.f32 f199, f197, f153; +add.f32 f200, f198, f1255; +add.f32 f201, f199, f157; +add.f32 f202, f200, f1253; +add.f32 f203, f201, f161; +add.f32 f204, f202, f1251; +add.f32 f205, f203, f165; +add.f32 f206, f204, f1248; +add.f32 f207, f205, f169; +add.f32 f208, f206, f1246; +add.f32 f209, f207, f173; +add.f32 f210, f208, f1244; +add.f32 f211, f209, f177; +add.f32 f212, f210, f1241; +fma.rn.f32 f213, f125, 0f3F7AC279, %62; +fma.rn.f32 f217, f129, 0f3F6B40D2, f213; +fma.rn.f32 f1237, f128, 0fBE4E2133, 0f00000000; +fma.rn.f32 f218, f132, 0fBEC9E903, f1237; +fma.rn.f32 f1236, f1272, 0f3F7AC279, %63; +fma.rn.f32 f219, f1269, 0f3F6B40D2, f1236; +fma.rn.f32 f1235, f127, 0fBE4E2133, 0f00000000; +fma.rn.f32 f220, f131, 0fBEC9E903, f1235; +fma.rn.f32 f221, f133, 0f3F521D8E, f217; +fma.rn.f32 f222, f136, 0fBF123EA2, f218; +fma.rn.f32 f223, f1267, 0f3F521D8E, f219; +fma.rn.f32 f224, f135, 0fBF123EA2, f220; +fma.rn.f32 f225, f137, 0f3F306023, f221; +fma.rn.f32 f226, f140, 0fBF398C05, f222; +fma.rn.f32 f227, f1265, 0f3F306023, f223; +fma.rn.f32 f228, f139, 0fBF398C05, f224; +fma.rn.f32 f229, f141, 0f3F076A2F, f225; +fma.rn.f32 f230, f144, 0fBF5940C0, f226; +fma.rn.f32 f231, f1262, 0f3F076A2F, f227; +fma.rn.f32 f232, f143, 0fBF5940C0, f228; +fma.rn.f32 f233, f145, 0f3EB1D1FE, f229; +fma.rn.f32 f234, f148, 0fBF701086, f230; +fma.rn.f32 f235, f1260, 0f3EB1D1FE, f231; +fma.rn.f32 f236, f147, 
0fBF701086, f232; +fma.rn.f32 f237, f149, 0f3E1B0FE2, f233; +fma.rn.f32 f238, f152, 0fBF7D0C43, f234; +fma.rn.f32 f239, f1258, 0f3E1B0FE2, f235; +fma.rn.f32 f240, f151, 0fBF7D0C43, f236; +fma.rn.f32 f241, f153, 0fBD4F7581, f237; +fma.rn.f32 f242, f156, 0fBF7FABE3, f238; +fma.rn.f32 f243, f1255, 0fBD4F7581, f239; +fma.rn.f32 f244, f155, 0fBF7FABE3, f240; +fma.rn.f32 f245, f157, 0fBE805587, f241; +fma.rn.f32 f246, f160, 0fBF77D3E7, f242; +fma.rn.f32 f247, f1253, 0fBE805587, f243; +fma.rn.f32 f248, f159, 0fBF77D3E7, f244; +fma.rn.f32 f249, f161, 0fBEE17B58, f245; +fma.rn.f32 f250, f164, 0fBF65D685, f246; +fma.rn.f32 f251, f1251, 0fBEE17B58, f247; +fma.rn.f32 f252, f163, 0fBF65D685, f248; +fma.rn.f32 f253, f165, 0fBF1CB2FA, f249; +fma.rn.f32 f254, f168, 0fBF4A7047, f250; +fma.rn.f32 f255, f1248, 0fBF1CB2FA, f251; +fma.rn.f32 f256, f167, 0fBF4A7047, f252; +fma.rn.f32 f257, f169, 0fBF423DF9, f253; +fma.rn.f32 f258, f172, 0fBF26C059, f254; +fma.rn.f32 f259, f1246, 0fBF423DF9, f255; +fma.rn.f32 f260, f171, 0fBF26C059, f256; +fma.rn.f32 f261, f173, 0fBF5FD52E, f257; +fma.rn.f32 f262, f176, 0fBEF87980, f258; +fma.rn.f32 f263, f1244, 0fBF5FD52E, f259; +fma.rn.f32 f264, f175, 0fBEF87980, f260; +fma.rn.f32 f265, f177, 0fBF744278, f261; +fma.rn.f32 f266, f180, 0fBE994620, f262; +fma.rn.f32 f267, f1241, 0fBF744278, f263; +fma.rn.f32 f268, f179, 0fBE994620, f264; +fma.rn.f32 f269, f181, 0fBF7EAFC2, f265; +fma.rn.f32 f270, f184, 0fBDCF3156, f266; +fma.rn.f32 f271, f1239, 0fBF7EAFC2, f267; +fma.rn.f32 f272, f183, 0fBDCF3156, f268; +fma.rn.f32 f273, f125, 0f3F6B40D2, %62; +fma.rn.f32 f277, f129, 0f3F306023, f273; +fma.rn.f32 f1234, f128, 0fBEC9E903, 0f00000000; +fma.rn.f32 f278, f132, 0fBF398C05, f1234; +fma.rn.f32 f1233, f1272, 0f3F6B40D2, %63; +fma.rn.f32 f279, f1269, 0f3F306023, f1233; +fma.rn.f32 f1232, f127, 0fBEC9E903, 0f00000000; +fma.rn.f32 f280, f131, 0fBF398C05, f1232; +fma.rn.f32 f281, f133, 0f3EB1D1FE, f277; +fma.rn.f32 f282, f136, 0fBF701086, f278; +fma.rn.f32 f283, 
f1267, 0f3EB1D1FE, f279; +fma.rn.f32 f284, f135, 0fBF701086, f280; +fma.rn.f32 f285, f137, 0fBD4F7581, f281; +fma.rn.f32 f286, f140, 0fBF7FABE3, f282; +fma.rn.f32 f287, f1265, 0fBD4F7581, f283; +fma.rn.f32 f288, f139, 0fBF7FABE3, f284; +fma.rn.f32 f289, f141, 0fBEE17B58, f285; +fma.rn.f32 f290, f144, 0fBF65D685, f286; +fma.rn.f32 f291, f1262, 0fBEE17B58, f287; +fma.rn.f32 f292, f143, 0fBF65D685, f288; +fma.rn.f32 f293, f145, 0fBF423DF9, f289; +fma.rn.f32 f294, f148, 0fBF26C059, f290; +fma.rn.f32 f295, f1260, 0fBF423DF9, f291; +fma.rn.f32 f296, f147, 0fBF26C059, f292; +fma.rn.f32 f297, f149, 0fBF744278, f293; +fma.rn.f32 f298, f152, 0fBE994620, f294; +fma.rn.f32 f299, f1258, 0fBF744278, f295; +fma.rn.f32 f300, f151, 0fBE994620, f296; +fma.rn.f32 f301, f153, 0fBF7EAFC2, f297; +fma.rn.f32 f302, f156, 0f3DCF3156, f298; +fma.rn.f32 f303, f1255, 0fBF7EAFC2, f299; +fma.rn.f32 f304, f155, 0f3DCF3156, f300; +fma.rn.f32 f305, f157, 0fBF5FD52E, f301; +fma.rn.f32 f306, f160, 0f3EF87980, f302; +fma.rn.f32 f307, f1253, 0fBF5FD52E, f303; +fma.rn.f32 f308, f159, 0f3EF87980, f304; +fma.rn.f32 f309, f161, 0fBF1CB2FA, f305; +fma.rn.f32 f310, f164, 0f3F4A7047, f306; +fma.rn.f32 f311, f1251, 0fBF1CB2FA, f307; +fma.rn.f32 f312, f163, 0f3F4A7047, f308; +fma.rn.f32 f313, f165, 0fBE805587, f309; +fma.rn.f32 f314, f168, 0f3F77D3E7, f310; +fma.rn.f32 f315, f1248, 0fBE805587, f311; +fma.rn.f32 f316, f167, 0f3F77D3E7, f312; +fma.rn.f32 f317, f169, 0f3E1B0FE2, f313; +fma.rn.f32 f318, f172, 0f3F7D0C43, f314; +fma.rn.f32 f319, f1246, 0f3E1B0FE2, f315; +fma.rn.f32 f320, f171, 0f3F7D0C43, f316; +fma.rn.f32 f321, f173, 0f3F076A2F, f317; +fma.rn.f32 f322, f176, 0f3F5940C0, f318; +fma.rn.f32 f323, f1244, 0f3F076A2F, f319; +fma.rn.f32 f324, f175, 0f3F5940C0, f320; +fma.rn.f32 f325, f177, 0f3F521D8E, f321; +fma.rn.f32 f326, f180, 0f3F123EA2, f322; +fma.rn.f32 f327, f1241, 0f3F521D8E, f323; +fma.rn.f32 f328, f179, 0f3F123EA2, f324; +fma.rn.f32 f329, f181, 0f3F7AC279, f325; +fma.rn.f32 f330, f184, 
0f3E4E2133, f326; +fma.rn.f32 f331, f1239, 0f3F7AC279, f327; +fma.rn.f32 f332, f183, 0f3E4E2133, f328; +fma.rn.f32 f333, f125, 0f3F521D8E, %62; +fma.rn.f32 f337, f129, 0f3EB1D1FE, f333; +fma.rn.f32 f1231, f128, 0fBF123EA2, 0f00000000; +fma.rn.f32 f338, f132, 0fBF701086, f1231; +fma.rn.f32 f1230, f1272, 0f3F521D8E, %63; +fma.rn.f32 f339, f1269, 0f3EB1D1FE, f1230; +fma.rn.f32 f1229, f127, 0fBF123EA2, 0f00000000; +fma.rn.f32 f340, f131, 0fBF701086, f1229; +fma.rn.f32 f341, f133, 0fBE805587, f337; +fma.rn.f32 f342, f136, 0fBF77D3E7, f338; +fma.rn.f32 f343, f1267, 0fBE805587, f339; +fma.rn.f32 f344, f135, 0fBF77D3E7, f340; +fma.rn.f32 f345, f137, 0fBF423DF9, f341; +fma.rn.f32 f346, f140, 0fBF26C059, f342; +fma.rn.f32 f347, f1265, 0fBF423DF9, f343; +fma.rn.f32 f348, f139, 0fBF26C059, f344; +fma.rn.f32 f349, f141, 0fBF7EAFC2, f345; +fma.rn.f32 f350, f144, 0fBDCF3156, f346; +fma.rn.f32 f351, f1262, 0fBF7EAFC2, f347; +fma.rn.f32 f352, f143, 0fBDCF3156, f348; +fma.rn.f32 f353, f145, 0fBF5FD52E, f349; +fma.rn.f32 f354, f148, 0f3EF87980, f350; +fma.rn.f32 f355, f1260, 0fBF5FD52E, f351; +fma.rn.f32 f356, f147, 0f3EF87980, f352; +fma.rn.f32 f357, f149, 0fBEE17B58, f353; +fma.rn.f32 f358, f152, 0f3F65D685, f354; +fma.rn.f32 f359, f1258, 0fBEE17B58, f355; +fma.rn.f32 f360, f151, 0f3F65D685, f356; +fma.rn.f32 f361, f153, 0f3E1B0FE2, f357; +fma.rn.f32 f362, f156, 0f3F7D0C43, f358; +fma.rn.f32 f363, f1255, 0f3E1B0FE2, f359; +fma.rn.f32 f364, f155, 0f3F7D0C43, f360; +fma.rn.f32 f365, f157, 0f3F306023, f361; +fma.rn.f32 f366, f160, 0f3F398C05, f362; +fma.rn.f32 f367, f1253, 0f3F306023, f363; +fma.rn.f32 f368, f159, 0f3F398C05, f364; +fma.rn.f32 f369, f161, 0f3F7AC279, f365; +fma.rn.f32 f370, f164, 0f3E4E2133, f366; +fma.rn.f32 f371, f1251, 0f3F7AC279, f367; +fma.rn.f32 f372, f163, 0f3E4E2133, f368; +fma.rn.f32 f373, f165, 0f3F6B40D2, f369; +fma.rn.f32 f374, f168, 0fBEC9E903, f370; +fma.rn.f32 f375, f1248, 0f3F6B40D2, f371; +fma.rn.f32 f376, f167, 0fBEC9E903, f372; +fma.rn.f32 f377, 
f169, 0f3F076A2F, f373; +fma.rn.f32 f378, f172, 0fBF5940C0, f374; +fma.rn.f32 f379, f1246, 0f3F076A2F, f375; +fma.rn.f32 f380, f171, 0fBF5940C0, f376; +fma.rn.f32 f381, f173, 0fBD4F7581, f377; +fma.rn.f32 f382, f176, 0fBF7FABE3, f378; +fma.rn.f32 f383, f1244, 0fBD4F7581, f379; +fma.rn.f32 f384, f175, 0fBF7FABE3, f380; +fma.rn.f32 f385, f177, 0fBF1CB2FA, f381; +fma.rn.f32 f386, f180, 0fBF4A7047, f382; +fma.rn.f32 f387, f1241, 0fBF1CB2FA, f383; +fma.rn.f32 f388, f179, 0fBF4A7047, f384; +fma.rn.f32 f389, f181, 0fBF744278, f385; +fma.rn.f32 f390, f184, 0fBE994620, f386; +fma.rn.f32 f391, f1239, 0fBF744278, f387; +fma.rn.f32 f392, f183, 0fBE994620, f388; +fma.rn.f32 f393, f125, 0f3F306023, %62; +fma.rn.f32 f397, f129, 0fBD4F7581, f393; +fma.rn.f32 f1228, f128, 0fBF398C05, 0f00000000; +fma.rn.f32 f398, f132, 0fBF7FABE3, f1228; +fma.rn.f32 f1227, f1272, 0f3F306023, %63; +fma.rn.f32 f399, f1269, 0fBD4F7581, f1227; +fma.rn.f32 f1226, f127, 0fBF398C05, 0f00000000; +fma.rn.f32 f400, f131, 0fBF7FABE3, f1226; +fma.rn.f32 f401, f133, 0fBF423DF9, f397; +fma.rn.f32 f402, f136, 0fBF26C059, f398; +fma.rn.f32 f403, f1267, 0fBF423DF9, f399; +fma.rn.f32 f404, f135, 0fBF26C059, f400; +fma.rn.f32 f405, f137, 0fBF7EAFC2, f401; +fma.rn.f32 f406, f140, 0f3DCF3156, f402; +fma.rn.f32 f407, f1265, 0fBF7EAFC2, f403; +fma.rn.f32 f408, f139, 0f3DCF3156, f404; +fma.rn.f32 f409, f141, 0fBF1CB2FA, f405; +fma.rn.f32 f410, f144, 0f3F4A7047, f406; +fma.rn.f32 f411, f1262, 0fBF1CB2FA, f407; +fma.rn.f32 f412, f143, 0f3F4A7047, f408; +fma.rn.f32 f413, f145, 0f3E1B0FE2, f409; +fma.rn.f32 f414, f148, 0f3F7D0C43, f410; +fma.rn.f32 f415, f1260, 0f3E1B0FE2, f411; +fma.rn.f32 f416, f147, 0f3F7D0C43, f412; +fma.rn.f32 f417, f149, 0f3F521D8E, f413; +fma.rn.f32 f418, f152, 0f3F123EA2, f414; +fma.rn.f32 f419, f1258, 0f3F521D8E, f415; +fma.rn.f32 f420, f151, 0f3F123EA2, f416; +fma.rn.f32 f421, f153, 0f3F7AC279, f417; +fma.rn.f32 f422, f156, 0fBE4E2133, f418; +fma.rn.f32 f423, f1255, 0f3F7AC279, f419; +fma.rn.f32 
f424, f155, 0fBE4E2133, f420; +fma.rn.f32 f425, f157, 0f3F076A2F, f421; +fma.rn.f32 f426, f160, 0fBF5940C0, f422; +fma.rn.f32 f427, f1253, 0f3F076A2F, f423; +fma.rn.f32 f428, f159, 0fBF5940C0, f424; +fma.rn.f32 f429, f161, 0fBE805587, f425; +fma.rn.f32 f430, f164, 0fBF77D3E7, f426; +fma.rn.f32 f431, f1251, 0fBE805587, f427; +fma.rn.f32 f432, f163, 0fBF77D3E7, f428; +fma.rn.f32 f433, f165, 0fBF5FD52E, f429; +fma.rn.f32 f434, f168, 0fBEF87980, f430; +fma.rn.f32 f435, f1248, 0fBF5FD52E, f431; +fma.rn.f32 f436, f167, 0fBEF87980, f432; +fma.rn.f32 f437, f169, 0fBF744278, f433; +fma.rn.f32 f438, f172, 0f3E994620, f434; +fma.rn.f32 f439, f1246, 0fBF744278, f435; +fma.rn.f32 f440, f171, 0f3E994620, f436; +fma.rn.f32 f441, f173, 0fBEE17B58, f437; +fma.rn.f32 f442, f176, 0f3F65D685, f438; +fma.rn.f32 f443, f1244, 0fBEE17B58, f439; +fma.rn.f32 f444, f175, 0f3F65D685, f440; +fma.rn.f32 f445, f177, 0f3EB1D1FE, f441; +fma.rn.f32 f446, f180, 0f3F701086, f442; +fma.rn.f32 f447, f1241, 0f3EB1D1FE, f443; +fma.rn.f32 f448, f179, 0f3F701086, f444; +fma.rn.f32 f449, f181, 0f3F6B40D2, f445; +fma.rn.f32 f450, f184, 0f3EC9E903, f446; +fma.rn.f32 f451, f1239, 0f3F6B40D2, f447; +fma.rn.f32 f452, f183, 0f3EC9E903, f448; +fma.rn.f32 f453, f125, 0f3F076A2F, %62; +fma.rn.f32 f457, f129, 0fBEE17B58, f453; +fma.rn.f32 f1225, f128, 0fBF5940C0, 0f00000000; +fma.rn.f32 f458, f132, 0fBF65D685, f1225; +fma.rn.f32 f1224, f1272, 0f3F076A2F, %63; +fma.rn.f32 f459, f1269, 0fBEE17B58, f1224; +fma.rn.f32 f1223, f127, 0fBF5940C0, 0f00000000; +fma.rn.f32 f460, f131, 0fBF65D685, f1223; +fma.rn.f32 f461, f133, 0fBF7EAFC2, f457; +fma.rn.f32 f462, f136, 0fBDCF3156, f458; +fma.rn.f32 f463, f1267, 0fBF7EAFC2, f459; +fma.rn.f32 f464, f135, 0fBDCF3156, f460; +fma.rn.f32 f465, f137, 0fBF1CB2FA, f461; +fma.rn.f32 f466, f140, 0f3F4A7047, f462; +fma.rn.f32 f467, f1265, 0fBF1CB2FA, f463; +fma.rn.f32 f468, f139, 0f3F4A7047, f464; +fma.rn.f32 f469, f141, 0f3EB1D1FE, f465; +fma.rn.f32 f470, f144, 0f3F701086, f466; 
+fma.rn.f32 f471, f1262, 0f3EB1D1FE, f467; +fma.rn.f32 f472, f143, 0f3F701086, f468; +fma.rn.f32 f473, f145, 0f3F7AC279, f469; +fma.rn.f32 f474, f148, 0f3E4E2133, f470; +fma.rn.f32 f475, f1260, 0f3F7AC279, f471; +fma.rn.f32 f476, f147, 0f3E4E2133, f472; +fma.rn.f32 f477, f149, 0f3F306023, f473; +fma.rn.f32 f478, f152, 0fBF398C05, f474; +fma.rn.f32 f479, f1258, 0f3F306023, f475; +fma.rn.f32 f480, f151, 0fBF398C05, f476; +fma.rn.f32 f481, f153, 0fBE805587, f477; +fma.rn.f32 f482, f156, 0fBF77D3E7, f478; +fma.rn.f32 f483, f1255, 0fBE805587, f479; +fma.rn.f32 f484, f155, 0fBF77D3E7, f480; +fma.rn.f32 f485, f157, 0fBF744278, f481; +fma.rn.f32 f486, f160, 0fBE994620, f482; +fma.rn.f32 f487, f1253, 0fBF744278, f483; +fma.rn.f32 f488, f159, 0fBE994620, f484; +fma.rn.f32 f489, f161, 0fBF423DF9, f485; +fma.rn.f32 f490, f164, 0f3F26C059, f486; +fma.rn.f32 f491, f1251, 0fBF423DF9, f487; +fma.rn.f32 f492, f163, 0f3F26C059, f488; +fma.rn.f32 f493, f165, 0f3E1B0FE2, f489; +fma.rn.f32 f494, f168, 0f3F7D0C43, f490; +fma.rn.f32 f495, f1248, 0f3E1B0FE2, f491; +fma.rn.f32 f496, f167, 0f3F7D0C43, f492; +fma.rn.f32 f497, f169, 0f3F6B40D2, f493; +fma.rn.f32 f498, f172, 0f3EC9E903, f494; +fma.rn.f32 f499, f1246, 0f3F6B40D2, f495; +fma.rn.f32 f500, f171, 0f3EC9E903, f496; +fma.rn.f32 f501, f173, 0f3F521D8E, f497; +fma.rn.f32 f502, f176, 0fBF123EA2, f498; +fma.rn.f32 f503, f1244, 0f3F521D8E, f499; +fma.rn.f32 f504, f175, 0fBF123EA2, f500; +fma.rn.f32 f505, f177, 0fBD4F7581, f501; +fma.rn.f32 f506, f180, 0fBF7FABE3, f502; +fma.rn.f32 f507, f1241, 0fBD4F7581, f503; +fma.rn.f32 f508, f179, 0fBF7FABE3, f504; +fma.rn.f32 f509, f181, 0fBF5FD52E, f505; +fma.rn.f32 f510, f184, 0fBEF87980, f506; +fma.rn.f32 f511, f1239, 0fBF5FD52E, f507; +fma.rn.f32 f512, f183, 0fBEF87980, f508; +fma.rn.f32 f513, f125, 0f3EB1D1FE, %62; +fma.rn.f32 f517, f129, 0fBF423DF9, f513; +fma.rn.f32 f1222, f128, 0fBF701086, 0f00000000; +fma.rn.f32 f518, f132, 0fBF26C059, f1222; +fma.rn.f32 f1221, f1272, 0f3EB1D1FE, %63; 
+fma.rn.f32 f519, f1269, 0fBF423DF9, f1221; +fma.rn.f32 f1220, f127, 0fBF701086, 0f00000000; +fma.rn.f32 f520, f131, 0fBF26C059, f1220; +fma.rn.f32 f521, f133, 0fBF5FD52E, f517; +fma.rn.f32 f522, f136, 0f3EF87980, f518; +fma.rn.f32 f523, f1267, 0fBF5FD52E, f519; +fma.rn.f32 f524, f135, 0f3EF87980, f520; +fma.rn.f32 f525, f137, 0f3E1B0FE2, f521; +fma.rn.f32 f526, f140, 0f3F7D0C43, f522; +fma.rn.f32 f527, f1265, 0f3E1B0FE2, f523; +fma.rn.f32 f528, f139, 0f3F7D0C43, f524; +fma.rn.f32 f529, f141, 0f3F7AC279, f525; +fma.rn.f32 f530, f144, 0f3E4E2133, f526; +fma.rn.f32 f531, f1262, 0f3F7AC279, f527; +fma.rn.f32 f532, f143, 0f3E4E2133, f528; +fma.rn.f32 f533, f145, 0f3F076A2F, f529; +fma.rn.f32 f534, f148, 0fBF5940C0, f530; +fma.rn.f32 f535, f1260, 0f3F076A2F, f531; +fma.rn.f32 f536, f147, 0fBF5940C0, f532; +fma.rn.f32 f537, f149, 0fBF1CB2FA, f533; +fma.rn.f32 f538, f152, 0fBF4A7047, f534; +fma.rn.f32 f539, f1258, 0fBF1CB2FA, f535; +fma.rn.f32 f540, f151, 0fBF4A7047, f536; +fma.rn.f32 f541, f153, 0fBF744278, f537; +fma.rn.f32 f542, f156, 0f3E994620, f538; +fma.rn.f32 f543, f1255, 0fBF744278, f539; +fma.rn.f32 f544, f155, 0f3E994620, f540; +fma.rn.f32 f545, f157, 0fBD4F7581, f541; +fma.rn.f32 f546, f160, 0f3F7FABE3, f542; +fma.rn.f32 f547, f1253, 0fBD4F7581, f543; +fma.rn.f32 f548, f159, 0f3F7FABE3, f544; +fma.rn.f32 f549, f161, 0f3F6B40D2, f545; +fma.rn.f32 f550, f164, 0f3EC9E903, f546; +fma.rn.f32 f551, f1251, 0f3F6B40D2, f547; +fma.rn.f32 f552, f163, 0f3EC9E903, f548; +fma.rn.f32 f553, f165, 0f3F306023, f549; +fma.rn.f32 f554, f168, 0fBF398C05, f550; +fma.rn.f32 f555, f1248, 0f3F306023, f551; +fma.rn.f32 f556, f167, 0fBF398C05, f552; +fma.rn.f32 f557, f169, 0fBEE17B58, f553; +fma.rn.f32 f558, f172, 0fBF65D685, f554; +fma.rn.f32 f559, f1246, 0fBEE17B58, f555; +fma.rn.f32 f560, f171, 0fBF65D685, f556; +fma.rn.f32 f561, f173, 0fBF7EAFC2, f557; +fma.rn.f32 f562, f176, 0f3DCF3156, f558; +fma.rn.f32 f563, f1244, 0fBF7EAFC2, f559; +fma.rn.f32 f564, f175, 0f3DCF3156, f560; 
+fma.rn.f32 f565, f177, 0fBE805587, f561; +fma.rn.f32 f566, f180, 0f3F77D3E7, f562; +fma.rn.f32 f567, f1241, 0fBE805587, f563; +fma.rn.f32 f568, f179, 0f3F77D3E7, f564; +fma.rn.f32 f569, f181, 0f3F521D8E, f565; +fma.rn.f32 f570, f184, 0f3F123EA2, f566; +fma.rn.f32 f571, f1239, 0f3F521D8E, f567; +fma.rn.f32 f572, f183, 0f3F123EA2, f568; +fma.rn.f32 f573, f125, 0f3E1B0FE2, %62; +fma.rn.f32 f577, f129, 0fBF744278, f573; +fma.rn.f32 f1219, f128, 0fBF7D0C43, 0f00000000; +fma.rn.f32 f578, f132, 0fBE994620, f1219; +fma.rn.f32 f1218, f1272, 0f3E1B0FE2, %63; +fma.rn.f32 f579, f1269, 0fBF744278, f1218; +fma.rn.f32 f1217, f127, 0fBF7D0C43, 0f00000000; +fma.rn.f32 f580, f131, 0fBE994620, f1217; +fma.rn.f32 f581, f133, 0fBEE17B58, f577; +fma.rn.f32 f582, f136, 0f3F65D685, f578; +fma.rn.f32 f583, f1267, 0fBEE17B58, f579; +fma.rn.f32 f584, f135, 0f3F65D685, f580; +fma.rn.f32 f585, f137, 0f3F521D8E, f581; +fma.rn.f32 f586, f140, 0f3F123EA2, f582; +fma.rn.f32 f587, f1265, 0f3F521D8E, f583; +fma.rn.f32 f588, f139, 0f3F123EA2, f584; +fma.rn.f32 f589, f141, 0f3F306023, f585; +fma.rn.f32 f590, f144, 0fBF398C05, f586; +fma.rn.f32 f591, f1262, 0f3F306023, f587; +fma.rn.f32 f592, f143, 0fBF398C05, f588; +fma.rn.f32 f593, f145, 0fBF1CB2FA, f589; +fma.rn.f32 f594, f148, 0fBF4A7047, f590; +fma.rn.f32 f595, f1260, 0fBF1CB2FA, f591; +fma.rn.f32 f596, f147, 0fBF4A7047, f592; +fma.rn.f32 f597, f149, 0fBF5FD52E, f593; +fma.rn.f32 f598, f152, 0f3EF87980, f594; +fma.rn.f32 f599, f1258, 0fBF5FD52E, f595; +fma.rn.f32 f600, f151, 0f3EF87980, f596; +fma.rn.f32 f601, f153, 0f3EB1D1FE, f597; +fma.rn.f32 f602, f156, 0f3F701086, f598; +fma.rn.f32 f603, f1255, 0f3EB1D1FE, f599; +fma.rn.f32 f604, f155, 0f3F701086, f600; +fma.rn.f32 f605, f157, 0f3F7AC279, f601; +fma.rn.f32 f606, f160, 0fBE4E2133, f602; +fma.rn.f32 f607, f1253, 0f3F7AC279, f603; +fma.rn.f32 f608, f159, 0fBE4E2133, f604; +fma.rn.f32 f609, f161, 0fBD4F7581, f605; +fma.rn.f32 f610, f164, 0fBF7FABE3, f606; +fma.rn.f32 f611, f1251, 0fBD4F7581, 
f607; +fma.rn.f32 f612, f163, 0fBF7FABE3, f608; +fma.rn.f32 f613, f165, 0fBF7EAFC2, f609; +fma.rn.f32 f614, f168, 0fBDCF3156, f610; +fma.rn.f32 f615, f1248, 0fBF7EAFC2, f611; +fma.rn.f32 f616, f167, 0fBDCF3156, f612; +fma.rn.f32 f617, f169, 0fBE805587, f613; +fma.rn.f32 f618, f172, 0f3F77D3E7, f614; +fma.rn.f32 f619, f1246, 0fBE805587, f615; +fma.rn.f32 f620, f171, 0f3F77D3E7, f616; +fma.rn.f32 f621, f173, 0f3F6B40D2, f617; +fma.rn.f32 f622, f176, 0f3EC9E903, f618; +fma.rn.f32 f623, f1244, 0f3F6B40D2, f619; +fma.rn.f32 f624, f175, 0f3EC9E903, f620; +fma.rn.f32 f625, f177, 0f3F076A2F, f621; +fma.rn.f32 f626, f180, 0fBF5940C0, f622; +fma.rn.f32 f627, f1241, 0f3F076A2F, f623; +fma.rn.f32 f628, f179, 0fBF5940C0, f624; +fma.rn.f32 f629, f181, 0fBF423DF9, f625; +fma.rn.f32 f630, f184, 0fBF26C059, f626; +fma.rn.f32 f631, f1239, 0fBF423DF9, f627; +fma.rn.f32 f632, f183, 0fBF26C059, f628; +fma.rn.f32 f633, f125, 0fBD4F7581, %62; +fma.rn.f32 f637, f129, 0fBF7EAFC2, f633; +fma.rn.f32 f1216, f128, 0fBF7FABE3, 0f00000000; +fma.rn.f32 f638, f132, 0f3DCF3156, f1216; +fma.rn.f32 f1215, f1272, 0fBD4F7581, %63; +fma.rn.f32 f639, f1269, 0fBF7EAFC2, f1215; +fma.rn.f32 f1214, f127, 0fBF7FABE3, 0f00000000; +fma.rn.f32 f640, f131, 0f3DCF3156, f1214; +fma.rn.f32 f641, f133, 0f3E1B0FE2, f637; +fma.rn.f32 f642, f136, 0f3F7D0C43, f638; +fma.rn.f32 f643, f1267, 0f3E1B0FE2, f639; +fma.rn.f32 f644, f135, 0f3F7D0C43, f640; +fma.rn.f32 f645, f137, 0f3F7AC279, f641; +fma.rn.f32 f646, f140, 0fBE4E2133, f642; +fma.rn.f32 f647, f1265, 0f3F7AC279, f643; +fma.rn.f32 f648, f139, 0fBE4E2133, f644; +fma.rn.f32 f649, f141, 0fBE805587, f645; +fma.rn.f32 f650, f144, 0fBF77D3E7, f646; +fma.rn.f32 f651, f1262, 0fBE805587, f647; +fma.rn.f32 f652, f143, 0fBF77D3E7, f648; +fma.rn.f32 f653, f145, 0fBF744278, f649; +fma.rn.f32 f654, f148, 0f3E994620, f650; +fma.rn.f32 f655, f1260, 0fBF744278, f651; +fma.rn.f32 f656, f147, 0f3E994620, f652; +fma.rn.f32 f657, f149, 0f3EB1D1FE, f653; +fma.rn.f32 f658, f152, 
0f3F701086, f654; +fma.rn.f32 f659, f1258, 0f3EB1D1FE, f655; +fma.rn.f32 f660, f151, 0f3F701086, f656; +fma.rn.f32 f661, f153, 0f3F6B40D2, f657; +fma.rn.f32 f662, f156, 0fBEC9E903, f658; +fma.rn.f32 f663, f1255, 0f3F6B40D2, f659; +fma.rn.f32 f664, f155, 0fBEC9E903, f660; +fma.rn.f32 f665, f157, 0fBEE17B58, f661; +fma.rn.f32 f666, f160, 0fBF65D685, f662; +fma.rn.f32 f667, f1253, 0fBEE17B58, f663; +fma.rn.f32 f668, f159, 0fBF65D685, f664; +fma.rn.f32 f669, f161, 0fBF5FD52E, f665; +fma.rn.f32 f670, f164, 0f3EF87980, f666; +fma.rn.f32 f671, f1251, 0fBF5FD52E, f667; +fma.rn.f32 f672, f163, 0f3EF87980, f668; +fma.rn.f32 f673, f165, 0f3F076A2F, f669; +fma.rn.f32 f674, f168, 0f3F5940C0, f670; +fma.rn.f32 f675, f1248, 0f3F076A2F, f671; +fma.rn.f32 f676, f167, 0f3F5940C0, f672; +fma.rn.f32 f677, f169, 0f3F521D8E, f673; +fma.rn.f32 f678, f172, 0fBF123EA2, f674; +fma.rn.f32 f679, f1246, 0f3F521D8E, f675; +fma.rn.f32 f680, f171, 0fBF123EA2, f676; +fma.rn.f32 f681, f173, 0fBF1CB2FA, f677; +fma.rn.f32 f682, f176, 0fBF4A7047, f678; +fma.rn.f32 f683, f1244, 0fBF1CB2FA, f679; +fma.rn.f32 f684, f175, 0fBF4A7047, f680; +fma.rn.f32 f685, f177, 0fBF423DF9, f681; +fma.rn.f32 f686, f180, 0f3F26C059, f682; +fma.rn.f32 f687, f1241, 0fBF423DF9, f683; +fma.rn.f32 f688, f179, 0f3F26C059, f684; +fma.rn.f32 f689, f181, 0f3F306023, f685; +fma.rn.f32 f690, f184, 0f3F398C05, f686; +fma.rn.f32 f691, f1239, 0f3F306023, f687; +fma.rn.f32 f692, f183, 0f3F398C05, f688; +fma.rn.f32 f693, f125, 0fBE805587, %62; +fma.rn.f32 f697, f129, 0fBF5FD52E, f693; +fma.rn.f32 f1213, f128, 0fBF77D3E7, 0f00000000; +fma.rn.f32 f698, f132, 0f3EF87980, f1213; +fma.rn.f32 f1212, f1272, 0fBE805587, %63; +fma.rn.f32 f699, f1269, 0fBF5FD52E, f1212; +fma.rn.f32 f1211, f127, 0fBF77D3E7, 0f00000000; +fma.rn.f32 f700, f131, 0f3EF87980, f1211; +fma.rn.f32 f701, f133, 0f3F306023, f697; +fma.rn.f32 f702, f136, 0f3F398C05, f698; +fma.rn.f32 f703, f1267, 0f3F306023, f699; +fma.rn.f32 f704, f135, 0f3F398C05, f700; +fma.rn.f32 f705, 
f137, 0f3F076A2F, f701; +fma.rn.f32 f706, f140, 0fBF5940C0, f702; +fma.rn.f32 f707, f1265, 0f3F076A2F, f703; +fma.rn.f32 f708, f139, 0fBF5940C0, f704; +fma.rn.f32 f709, f141, 0fBF744278, f705; +fma.rn.f32 f710, f144, 0fBE994620, f706; +fma.rn.f32 f711, f1262, 0fBF744278, f707; +fma.rn.f32 f712, f143, 0fBE994620, f708; +fma.rn.f32 f713, f145, 0fBD4F7581, f709; +fma.rn.f32 f714, f148, 0f3F7FABE3, f710; +fma.rn.f32 f715, f1260, 0fBD4F7581, f711; +fma.rn.f32 f716, f147, 0f3F7FABE3, f712; +fma.rn.f32 f717, f149, 0f3F7AC279, f713; +fma.rn.f32 f718, f152, 0fBE4E2133, f714; +fma.rn.f32 f719, f1258, 0f3F7AC279, f715; +fma.rn.f32 f720, f151, 0fBE4E2133, f716; +fma.rn.f32 f721, f153, 0fBEE17B58, f717; +fma.rn.f32 f722, f156, 0fBF65D685, f718; +fma.rn.f32 f723, f1255, 0fBEE17B58, f719; +fma.rn.f32 f724, f155, 0fBF65D685, f720; +fma.rn.f32 f725, f157, 0fBF423DF9, f721; +fma.rn.f32 f726, f160, 0f3F26C059, f722; +fma.rn.f32 f727, f1253, 0fBF423DF9, f723; +fma.rn.f32 f728, f159, 0f3F26C059, f724; +fma.rn.f32 f729, f161, 0f3F521D8E, f725; +fma.rn.f32 f730, f164, 0f3F123EA2, f726; +fma.rn.f32 f731, f1251, 0f3F521D8E, f727; +fma.rn.f32 f732, f163, 0f3F123EA2, f728; +fma.rn.f32 f733, f165, 0f3EB1D1FE, f729; +fma.rn.f32 f734, f168, 0fBF701086, f730; +fma.rn.f32 f735, f1248, 0f3EB1D1FE, f731; +fma.rn.f32 f736, f167, 0fBF701086, f732; +fma.rn.f32 f737, f169, 0fBF7EAFC2, f733; +fma.rn.f32 f738, f172, 0fBDCF3156, f734; +fma.rn.f32 f739, f1246, 0fBF7EAFC2, f735; +fma.rn.f32 f740, f171, 0fBDCF3156, f736; +fma.rn.f32 f741, f173, 0f3E1B0FE2, f737; +fma.rn.f32 f742, f176, 0f3F7D0C43, f738; +fma.rn.f32 f743, f1244, 0f3E1B0FE2, f739; +fma.rn.f32 f744, f175, 0f3F7D0C43, f740; +fma.rn.f32 f745, f177, 0f3F6B40D2, f741; +fma.rn.f32 f746, f180, 0fBEC9E903, f742; +fma.rn.f32 f747, f1241, 0f3F6B40D2, f743; +fma.rn.f32 f748, f179, 0fBEC9E903, f744; +fma.rn.f32 f749, f181, 0fBF1CB2FA, f745; +fma.rn.f32 f750, f184, 0fBF4A7047, f746; +fma.rn.f32 f751, f1239, 0fBF1CB2FA, f747; +fma.rn.f32 f752, f183, 
0fBF4A7047, f748; +fma.rn.f32 f753, f125, 0fBEE17B58, %62; +fma.rn.f32 f757, f129, 0fBF1CB2FA, f753; +fma.rn.f32 f1210, f128, 0fBF65D685, 0f00000000; +fma.rn.f32 f758, f132, 0f3F4A7047, f1210; +fma.rn.f32 f1209, f1272, 0fBEE17B58, %63; +fma.rn.f32 f759, f1269, 0fBF1CB2FA, f1209; +fma.rn.f32 f1208, f127, 0fBF65D685, 0f00000000; +fma.rn.f32 f760, f131, 0f3F4A7047, f1208; +fma.rn.f32 f761, f133, 0f3F7AC279, f757; +fma.rn.f32 f762, f136, 0f3E4E2133, f758; +fma.rn.f32 f763, f1267, 0f3F7AC279, f759; +fma.rn.f32 f764, f135, 0f3E4E2133, f760; +fma.rn.f32 f765, f137, 0fBE805587, f761; +fma.rn.f32 f766, f140, 0fBF77D3E7, f762; +fma.rn.f32 f767, f1265, 0fBE805587, f763; +fma.rn.f32 f768, f139, 0fBF77D3E7, f764; +fma.rn.f32 f769, f141, 0fBF423DF9, f765; +fma.rn.f32 f770, f144, 0f3F26C059, f766; +fma.rn.f32 f771, f1262, 0fBF423DF9, f767; +fma.rn.f32 f772, f143, 0f3F26C059, f768; +fma.rn.f32 f773, f145, 0f3F6B40D2, f769; +fma.rn.f32 f774, f148, 0f3EC9E903, f770; +fma.rn.f32 f775, f1260, 0f3F6B40D2, f771; +fma.rn.f32 f776, f147, 0f3EC9E903, f772; +fma.rn.f32 f777, f149, 0fBD4F7581, f773; +fma.rn.f32 f778, f152, 0fBF7FABE3, f774; +fma.rn.f32 f779, f1258, 0fBD4F7581, f775; +fma.rn.f32 f780, f151, 0fBF7FABE3, f776; +fma.rn.f32 f781, f153, 0fBF5FD52E, f777; +fma.rn.f32 f782, f156, 0f3EF87980, f778; +fma.rn.f32 f783, f1255, 0fBF5FD52E, f779; +fma.rn.f32 f784, f155, 0f3EF87980, f780; +fma.rn.f32 f785, f157, 0f3F521D8E, f781; +fma.rn.f32 f786, f160, 0f3F123EA2, f782; +fma.rn.f32 f787, f1253, 0f3F521D8E, f783; +fma.rn.f32 f788, f159, 0f3F123EA2, f784; +fma.rn.f32 f789, f161, 0f3E1B0FE2, f785; +fma.rn.f32 f790, f164, 0fBF7D0C43, f786; +fma.rn.f32 f791, f1251, 0f3E1B0FE2, f787; +fma.rn.f32 f792, f163, 0fBF7D0C43, f788; +fma.rn.f32 f793, f165, 0fBF744278, f789; +fma.rn.f32 f794, f168, 0f3E994620, f790; +fma.rn.f32 f795, f1248, 0fBF744278, f791; +fma.rn.f32 f796, f167, 0f3E994620, f792; +fma.rn.f32 f797, f169, 0f3F306023, f793; +fma.rn.f32 f798, f172, 0f3F398C05, f794; +fma.rn.f32 f799, 
f1246, 0f3F306023, f795; +fma.rn.f32 f800, f171, 0f3F398C05, f796; +fma.rn.f32 f801, f173, 0f3EB1D1FE, f797; +fma.rn.f32 f802, f176, 0fBF701086, f798; +fma.rn.f32 f803, f1244, 0f3EB1D1FE, f799; +fma.rn.f32 f804, f175, 0fBF701086, f800; +fma.rn.f32 f805, f177, 0fBF7EAFC2, f801; +fma.rn.f32 f806, f180, 0f3DCF3156, f802; +fma.rn.f32 f807, f1241, 0fBF7EAFC2, f803; +fma.rn.f32 f808, f179, 0f3DCF3156, f804; +fma.rn.f32 f809, f181, 0f3F076A2F, f805; +fma.rn.f32 f810, f184, 0f3F5940C0, f806; +fma.rn.f32 f811, f1239, 0f3F076A2F, f807; +fma.rn.f32 f812, f183, 0f3F5940C0, f808; +fma.rn.f32 f813, f125, 0fBF1CB2FA, %62; +fma.rn.f32 f817, f129, 0fBE805587, f813; +fma.rn.f32 f1207, f128, 0fBF4A7047, 0f00000000; +fma.rn.f32 f818, f132, 0f3F77D3E7, f1207; +fma.rn.f32 f1206, f1272, 0fBF1CB2FA, %63; +fma.rn.f32 f819, f1269, 0fBE805587, f1206; +fma.rn.f32 f1205, f127, 0fBF4A7047, 0f00000000; +fma.rn.f32 f820, f131, 0f3F77D3E7, f1205; +fma.rn.f32 f821, f133, 0f3F6B40D2, f817; +fma.rn.f32 f822, f136, 0fBEC9E903, f818; +fma.rn.f32 f823, f1267, 0f3F6B40D2, f819; +fma.rn.f32 f824, f135, 0fBEC9E903, f820; +fma.rn.f32 f825, f137, 0fBF5FD52E, f821; +fma.rn.f32 f826, f140, 0fBEF87980, f822; +fma.rn.f32 f827, f1265, 0fBF5FD52E, f823; +fma.rn.f32 f828, f139, 0fBEF87980, f824; +fma.rn.f32 f829, f141, 0f3E1B0FE2, f825; +fma.rn.f32 f830, f144, 0f3F7D0C43, f826; +fma.rn.f32 f831, f1262, 0f3E1B0FE2, f827; +fma.rn.f32 f832, f143, 0f3F7D0C43, f828; +fma.rn.f32 f833, f145, 0f3F306023, f829; +fma.rn.f32 f834, f148, 0fBF398C05, f830; +fma.rn.f32 f835, f1260, 0f3F306023, f831; +fma.rn.f32 f836, f147, 0fBF398C05, f832; +fma.rn.f32 f837, f149, 0fBF7EAFC2, f833; +fma.rn.f32 f838, f152, 0fBDCF3156, f834; +fma.rn.f32 f839, f1258, 0fBF7EAFC2, f835; +fma.rn.f32 f840, f151, 0fBDCF3156, f836; +fma.rn.f32 f841, f153, 0f3F076A2F, f837; +fma.rn.f32 f842, f156, 0f3F5940C0, f838; +fma.rn.f32 f843, f1255, 0f3F076A2F, f839; +fma.rn.f32 f844, f155, 0f3F5940C0, f840; +fma.rn.f32 f845, f157, 0f3EB1D1FE, f841; +fma.rn.f32 
f846, f160, 0fBF701086, f842; +fma.rn.f32 f847, f1253, 0f3EB1D1FE, f843; +fma.rn.f32 f848, f159, 0fBF701086, f844; +fma.rn.f32 f849, f161, 0fBF744278, f845; +fma.rn.f32 f850, f164, 0f3E994620, f846; +fma.rn.f32 f851, f1251, 0fBF744278, f847; +fma.rn.f32 f852, f163, 0f3E994620, f848; +fma.rn.f32 f853, f165, 0f3F521D8E, f849; +fma.rn.f32 f854, f168, 0f3F123EA2, f850; +fma.rn.f32 f855, f1248, 0f3F521D8E, f851; +fma.rn.f32 f856, f167, 0f3F123EA2, f852; +fma.rn.f32 f857, f169, 0fBD4F7581, f853; +fma.rn.f32 f858, f172, 0fBF7FABE3, f854; +fma.rn.f32 f859, f1246, 0fBD4F7581, f855; +fma.rn.f32 f860, f171, 0fBF7FABE3, f856; +fma.rn.f32 f861, f173, 0fBF423DF9, f857; +fma.rn.f32 f862, f176, 0f3F26C059, f858; +fma.rn.f32 f863, f1244, 0fBF423DF9, f859; +fma.rn.f32 f864, f175, 0f3F26C059, f860; +fma.rn.f32 f865, f177, 0f3F7AC279, f861; +fma.rn.f32 f866, f180, 0f3E4E2133, f862; +fma.rn.f32 f867, f1241, 0f3F7AC279, f863; +fma.rn.f32 f868, f179, 0f3E4E2133, f864; +fma.rn.f32 f869, f181, 0fBEE17B58, f865; +fma.rn.f32 f870, f184, 0fBF65D685, f866; +fma.rn.f32 f871, f1239, 0fBEE17B58, f867; +fma.rn.f32 f872, f183, 0fBF65D685, f868; +fma.rn.f32 f873, f125, 0fBF423DF9, %62; +fma.rn.f32 f877, f129, 0f3E1B0FE2, f873; +fma.rn.f32 f1204, f128, 0fBF26C059, 0f00000000; +fma.rn.f32 f878, f132, 0f3F7D0C43, f1204; +fma.rn.f32 f1203, f1272, 0fBF423DF9, %63; +fma.rn.f32 f879, f1269, 0f3E1B0FE2, f1203; +fma.rn.f32 f1202, f127, 0fBF26C059, 0f00000000; +fma.rn.f32 f880, f131, 0f3F7D0C43, f1202; +fma.rn.f32 f881, f133, 0f3F076A2F, f877; +fma.rn.f32 f882, f136, 0fBF5940C0, f878; +fma.rn.f32 f883, f1267, 0f3F076A2F, f879; +fma.rn.f32 f884, f135, 0fBF5940C0, f880; +fma.rn.f32 f885, f137, 0fBF744278, f881; +fma.rn.f32 f886, f140, 0f3E994620, f882; +fma.rn.f32 f887, f1265, 0fBF744278, f883; +fma.rn.f32 f888, f139, 0f3E994620, f884; +fma.rn.f32 f889, f141, 0f3F6B40D2, f885; +fma.rn.f32 f890, f144, 0f3EC9E903, f886; +fma.rn.f32 f891, f1262, 0f3F6B40D2, f887; +fma.rn.f32 f892, f143, 0f3EC9E903, f888; 
+fma.rn.f32 f893, f145, 0fBEE17B58, f889; +fma.rn.f32 f894, f148, 0fBF65D685, f890; +fma.rn.f32 f895, f1260, 0fBEE17B58, f891; +fma.rn.f32 f896, f147, 0fBF65D685, f892; +fma.rn.f32 f897, f149, 0fBE805587, f893; +fma.rn.f32 f898, f152, 0f3F77D3E7, f894; +fma.rn.f32 f899, f1258, 0fBE805587, f895; +fma.rn.f32 f900, f151, 0f3F77D3E7, f896; +fma.rn.f32 f901, f153, 0f3F521D8E, f897; +fma.rn.f32 f902, f156, 0fBF123EA2, f898; +fma.rn.f32 f903, f1255, 0f3F521D8E, f899; +fma.rn.f32 f904, f155, 0fBF123EA2, f900; +fma.rn.f32 f905, f157, 0fBF7EAFC2, f901; +fma.rn.f32 f906, f160, 0fBDCF3156, f902; +fma.rn.f32 f907, f1253, 0fBF7EAFC2, f903; +fma.rn.f32 f908, f159, 0fBDCF3156, f904; +fma.rn.f32 f909, f161, 0f3F306023, f905; +fma.rn.f32 f910, f164, 0f3F398C05, f906; +fma.rn.f32 f911, f1251, 0f3F306023, f907; +fma.rn.f32 f912, f163, 0f3F398C05, f908; +fma.rn.f32 f913, f165, 0fBD4F7581, f909; +fma.rn.f32 f914, f168, 0fBF7FABE3, f910; +fma.rn.f32 f915, f1248, 0fBD4F7581, f911; +fma.rn.f32 f916, f167, 0fBF7FABE3, f912; +fma.rn.f32 f917, f169, 0fBF1CB2FA, f913; +fma.rn.f32 f918, f172, 0f3F4A7047, f914; +fma.rn.f32 f919, f1246, 0fBF1CB2FA, f915; +fma.rn.f32 f920, f171, 0f3F4A7047, f916; +fma.rn.f32 f921, f173, 0f3F7AC279, f917; +fma.rn.f32 f922, f176, 0fBE4E2133, f918; +fma.rn.f32 f923, f1244, 0f3F7AC279, f919; +fma.rn.f32 f924, f175, 0fBE4E2133, f920; +fma.rn.f32 f925, f177, 0fBF5FD52E, f921; +fma.rn.f32 f926, f180, 0fBEF87980, f922; +fma.rn.f32 f927, f1241, 0fBF5FD52E, f923; +fma.rn.f32 f928, f179, 0fBEF87980, f924; +fma.rn.f32 f929, f181, 0f3EB1D1FE, f925; +fma.rn.f32 f930, f184, 0f3F701086, f926; +fma.rn.f32 f931, f1239, 0f3EB1D1FE, f927; +fma.rn.f32 f932, f183, 0f3F701086, f928; +fma.rn.f32 f933, f125, 0fBF5FD52E, %62; +fma.rn.f32 f937, f129, 0f3F076A2F, f933; +fma.rn.f32 f1201, f128, 0fBEF87980, 0f00000000; +fma.rn.f32 f938, f132, 0f3F5940C0, f1201; +fma.rn.f32 f1200, f1272, 0fBF5FD52E, %63; +fma.rn.f32 f939, f1269, 0f3F076A2F, f1200; +fma.rn.f32 f1199, f127, 0fBEF87980, 
0f00000000; +fma.rn.f32 f940, f131, 0f3F5940C0, f1199; +fma.rn.f32 f941, f133, 0fBD4F7581, f937; +fma.rn.f32 f942, f136, 0fBF7FABE3, f938; +fma.rn.f32 f943, f1267, 0fBD4F7581, f939; +fma.rn.f32 f944, f135, 0fBF7FABE3, f940; +fma.rn.f32 f945, f137, 0fBEE17B58, f941; +fma.rn.f32 f946, f140, 0f3F65D685, f942; +fma.rn.f32 f947, f1265, 0fBEE17B58, f943; +fma.rn.f32 f948, f139, 0f3F65D685, f944; +fma.rn.f32 f949, f141, 0f3F521D8E, f945; +fma.rn.f32 f950, f144, 0fBF123EA2, f946; +fma.rn.f32 f951, f1262, 0f3F521D8E, f947; +fma.rn.f32 f952, f143, 0fBF123EA2, f948; +fma.rn.f32 f953, f145, 0fBF7EAFC2, f949; +fma.rn.f32 f954, f148, 0f3DCF3156, f950; +fma.rn.f32 f955, f1260, 0fBF7EAFC2, f951; +fma.rn.f32 f956, f147, 0f3DCF3156, f952; +fma.rn.f32 f957, f149, 0f3F6B40D2, f953; +fma.rn.f32 f958, f152, 0f3EC9E903, f954; +fma.rn.f32 f959, f1258, 0f3F6B40D2, f955; +fma.rn.f32 f960, f151, 0f3EC9E903, f956; +fma.rn.f32 f961, f153, 0fBF1CB2FA, f957; +fma.rn.f32 f962, f156, 0fBF4A7047, f958; +fma.rn.f32 f963, f1255, 0fBF1CB2FA, f959; +fma.rn.f32 f964, f155, 0fBF4A7047, f960; +fma.rn.f32 f965, f157, 0f3E1B0FE2, f961; +fma.rn.f32 f966, f160, 0f3F7D0C43, f962; +fma.rn.f32 f967, f1253, 0f3E1B0FE2, f963; +fma.rn.f32 f968, f159, 0f3F7D0C43, f964; +fma.rn.f32 f969, f161, 0f3EB1D1FE, f965; +fma.rn.f32 f970, f164, 0fBF701086, f966; +fma.rn.f32 f971, f1251, 0f3EB1D1FE, f967; +fma.rn.f32 f972, f163, 0fBF701086, f968; +fma.rn.f32 f973, f165, 0fBF423DF9, f969; +fma.rn.f32 f974, f168, 0f3F26C059, f970; +fma.rn.f32 f975, f1248, 0fBF423DF9, f971; +fma.rn.f32 f976, f167, 0f3F26C059, f972; +fma.rn.f32 f977, f169, 0f3F7AC279, f973; +fma.rn.f32 f978, f172, 0fBE4E2133, f974; +fma.rn.f32 f979, f1246, 0f3F7AC279, f975; +fma.rn.f32 f980, f171, 0fBE4E2133, f976; +fma.rn.f32 f981, f173, 0fBF744278, f977; +fma.rn.f32 f982, f176, 0fBE994620, f978; +fma.rn.f32 f983, f1244, 0fBF744278, f979; +fma.rn.f32 f984, f175, 0fBE994620, f980; +fma.rn.f32 f985, f177, 0f3F306023, f981; +fma.rn.f32 f986, f180, 0f3F398C05, f982; 
+fma.rn.f32 f987, f1241, 0f3F306023, f983; +fma.rn.f32 f988, f179, 0f3F398C05, f984; +fma.rn.f32 f989, f181, 0fBE805587, f985; +fma.rn.f32 f990, f184, 0fBF77D3E7, f986; +fma.rn.f32 f991, f1239, 0fBE805587, f987; +fma.rn.f32 f992, f183, 0fBF77D3E7, f988; +fma.rn.f32 f993, f125, 0fBF744278, %62; +fma.rn.f32 f997, f129, 0f3F521D8E, f993; +fma.rn.f32 f1198, f128, 0fBE994620, 0f00000000; +fma.rn.f32 f998, f132, 0f3F123EA2, f1198; +fma.rn.f32 f1197, f1272, 0fBF744278, %63; +fma.rn.f32 f999, f1269, 0f3F521D8E, f1197; +fma.rn.f32 f1196, f127, 0fBE994620, 0f00000000; +fma.rn.f32 f1000, f131, 0f3F123EA2, f1196; +fma.rn.f32 f1001, f133, 0fBF1CB2FA, f997; +fma.rn.f32 f1002, f136, 0fBF4A7047, f998; +fma.rn.f32 f1003, f1267, 0fBF1CB2FA, f999; +fma.rn.f32 f1004, f135, 0fBF4A7047, f1000; +fma.rn.f32 f1005, f137, 0f3EB1D1FE, f1001; +fma.rn.f32 f1006, f140, 0f3F701086, f1002; +fma.rn.f32 f1007, f1265, 0f3EB1D1FE, f1003; +fma.rn.f32 f1008, f139, 0f3F701086, f1004; +fma.rn.f32 f1009, f141, 0fBD4F7581, f1005; +fma.rn.f32 f1010, f144, 0fBF7FABE3, f1006; +fma.rn.f32 f1011, f1262, 0fBD4F7581, f1007; +fma.rn.f32 f1012, f143, 0fBF7FABE3, f1008; +fma.rn.f32 f1013, f145, 0fBE805587, f1009; +fma.rn.f32 f1014, f148, 0f3F77D3E7, f1010; +fma.rn.f32 f1015, f1260, 0fBE805587, f1011; +fma.rn.f32 f1016, f147, 0f3F77D3E7, f1012; +fma.rn.f32 f1017, f149, 0f3F076A2F, f1013; +fma.rn.f32 f1018, f152, 0fBF5940C0, f1014; +fma.rn.f32 f1019, f1258, 0f3F076A2F, f1015; +fma.rn.f32 f1020, f151, 0fBF5940C0, f1016; +fma.rn.f32 f1021, f153, 0fBF423DF9, f1017; +fma.rn.f32 f1022, f156, 0f3F26C059, f1018; +fma.rn.f32 f1023, f1255, 0fBF423DF9, f1019; +fma.rn.f32 f1024, f155, 0f3F26C059, f1020; +fma.rn.f32 f1025, f157, 0f3F6B40D2, f1021; +fma.rn.f32 f1026, f160, 0fBEC9E903, f1022; +fma.rn.f32 f1027, f1253, 0f3F6B40D2, f1023; +fma.rn.f32 f1028, f159, 0fBEC9E903, f1024; +fma.rn.f32 f1029, f161, 0fBF7EAFC2, f1025; +fma.rn.f32 f1030, f164, 0f3DCF3156, f1026; +fma.rn.f32 f1031, f1251, 0fBF7EAFC2, f1027; +fma.rn.f32 f1032, 
f163, 0f3DCF3156, f1028; +fma.rn.f32 f1033, f165, 0f3F7AC279, f1029; +fma.rn.f32 f1034, f168, 0f3E4E2133, f1030; +fma.rn.f32 f1035, f1248, 0f3F7AC279, f1031; +fma.rn.f32 f1036, f167, 0f3E4E2133, f1032; +fma.rn.f32 f1037, f169, 0fBF5FD52E, f1033; +fma.rn.f32 f1038, f172, 0fBEF87980, f1034; +fma.rn.f32 f1039, f1246, 0fBF5FD52E, f1035; +fma.rn.f32 f1040, f171, 0fBEF87980, f1036; +fma.rn.f32 f1041, f173, 0f3F306023, f1037; +fma.rn.f32 f1042, f176, 0f3F398C05, f1038; +fma.rn.f32 f1043, f1244, 0f3F306023, f1039; +fma.rn.f32 f1044, f175, 0f3F398C05, f1040; +fma.rn.f32 f1045, f177, 0fBEE17B58, f1041; +fma.rn.f32 f1046, f180, 0fBF65D685, f1042; +fma.rn.f32 f1047, f1241, 0fBEE17B58, f1043; +fma.rn.f32 f1048, f179, 0fBF65D685, f1044; +fma.rn.f32 f1049, f181, 0f3E1B0FE2, f1045; +fma.rn.f32 f1050, f184, 0f3F7D0C43, f1046; +fma.rn.f32 f1051, f1239, 0f3E1B0FE2, f1047; +fma.rn.f32 f1052, f183, 0f3F7D0C43, f1048; +fma.rn.f32 f1053, f125, 0fBF7EAFC2, %62; +fma.rn.f32 f1054, f128, 0fBDCF3156, 0f00000000; +fma.rn.f32 f1055, f1272, 0fBF7EAFC2, %63; +fma.rn.f32 f1056, f127, 0fBDCF3156, 0f00000000; +fma.rn.f32 f1057, f129, 0f3F7AC279, f1053; +fma.rn.f32 f1058, f132, 0f3E4E2133, f1054; +fma.rn.f32 f1059, f1269, 0f3F7AC279, f1055; +fma.rn.f32 f1060, f131, 0f3E4E2133, f1056; +fma.rn.f32 f1061, f133, 0fBF744278, f1057; +fma.rn.f32 f1062, f136, 0fBE994620, f1058; +fma.rn.f32 f1063, f1267, 0fBF744278, f1059; +fma.rn.f32 f1064, f135, 0fBE994620, f1060; +fma.rn.f32 f1065, f137, 0f3F6B40D2, f1061; +fma.rn.f32 f1066, f140, 0f3EC9E903, f1062; +fma.rn.f32 f1067, f1265, 0f3F6B40D2, f1063; +fma.rn.f32 f1068, f139, 0f3EC9E903, f1064; +fma.rn.f32 f1069, f141, 0fBF5FD52E, f1065; +fma.rn.f32 f1070, f144, 0fBEF87980, f1066; +fma.rn.f32 f1071, f1262, 0fBF5FD52E, f1067; +fma.rn.f32 f1072, f143, 0fBEF87980, f1068; +fma.rn.f32 f1073, f145, 0f3F521D8E, f1069; +fma.rn.f32 f1074, f148, 0f3F123EA2, f1070; +fma.rn.f32 f1075, f1260, 0f3F521D8E, f1071; +fma.rn.f32 f1076, f147, 0f3F123EA2, f1072; +fma.rn.f32 f1077, 
f149, 0fBF423DF9, f1073; +fma.rn.f32 f1078, f152, 0fBF26C059, f1074; +fma.rn.f32 f1079, f1258, 0fBF423DF9, f1075; +fma.rn.f32 f1080, f151, 0fBF26C059, f1076; +fma.rn.f32 f1081, f153, 0f3F306023, f1077; +fma.rn.f32 f1082, f156, 0f3F398C05, f1078; +fma.rn.f32 f1083, f1255, 0f3F306023, f1079; +fma.rn.f32 f1084, f155, 0f3F398C05, f1080; +fma.rn.f32 f1085, f157, 0fBF1CB2FA, f1081; +fma.rn.f32 f1086, f160, 0fBF4A7047, f1082; +fma.rn.f32 f1087, f1253, 0fBF1CB2FA, f1083; +fma.rn.f32 f1088, f159, 0fBF4A7047, f1084; +fma.rn.f32 f1089, f161, 0f3F076A2F, f1085; +fma.rn.f32 f1090, f164, 0f3F5940C0, f1086; +fma.rn.f32 f1091, f1251, 0f3F076A2F, f1087; +fma.rn.f32 f1092, f163, 0f3F5940C0, f1088; +fma.rn.f32 f1093, f165, 0fBEE17B58, f1089; +fma.rn.f32 f1094, f168, 0fBF65D685, f1090; +fma.rn.f32 f1095, f1248, 0fBEE17B58, f1091; +fma.rn.f32 f1096, f167, 0fBF65D685, f1092; +fma.rn.f32 f1097, f169, 0f3EB1D1FE, f1093; +fma.rn.f32 f1098, f172, 0f3F701086, f1094; +fma.rn.f32 f1099, f1246, 0f3EB1D1FE, f1095; +fma.rn.f32 f1100, f171, 0f3F701086, f1096; +fma.rn.f32 f1101, f173, 0fBE805587, f1097; +fma.rn.f32 f1102, f176, 0fBF77D3E7, f1098; +fma.rn.f32 f1103, f1244, 0fBE805587, f1099; +fma.rn.f32 f1104, f175, 0fBF77D3E7, f1100; +fma.rn.f32 f1105, f177, 0f3E1B0FE2, f1101; +fma.rn.f32 f1106, f180, 0f3F7D0C43, f1102; +fma.rn.f32 f1107, f1241, 0f3E1B0FE2, f1103; +fma.rn.f32 f1108, f179, 0f3F7D0C43, f1104; +fma.rn.f32 f1109, f181, 0fBD4F7581, f1105; +fma.rn.f32 f1110, f184, 0fBF7FABE3, f1106; +fma.rn.f32 f1111, f1239, 0fBD4F7581, f1107; +fma.rn.f32 f1112, f183, 0fBF7FABE3, f1108; +add.f32 %1, f212, f1239; +add.f32 %0, f211, f181; +sub.f32 %2, f269, f270; +add.f32 %3, f271, f272; +sub.f32 %4, f329, f330; +add.f32 %5, f331, f332; +sub.f32 %6, f389, f390; +add.f32 %7, f391, f392; +add.f32 %9, f451, f452; +sub.f32 %8, f449, f450; +add.f32 %11, f511, f512; +sub.f32 %10, f509, f510; +add.f32 %13, f571, f572; +sub.f32 %12, f569, f570; +sub.f32 %14, f629, f630; +add.f32 %15, f631, f632; +sub.f32 %16, 
f689, f690; +add.f32 %17, f691, f692; +sub.f32 %18, f749, f750; +add.f32 %19, f751, f752; +add.f32 %21, f811, f812; +sub.f32 %20, f809, f810; +add.f32 %23, f871, f872; +sub.f32 %22, f869, f870; +add.f32 %25, f931, f932; +sub.f32 %24, f929, f930; +sub.f32 %26, f989, f990; +add.f32 %27, f991, f992; +sub.f32 %28, f1049, f1050; +add.f32 %29, f1051, f1052; +sub.f32 %30, f1109, f1110; +add.f32 %31, f1111, f1112; +sub.f32 %33, f1111, f1112; +add.f32 %32, f1109, f1110; +sub.f32 %35, f1051, f1052; +add.f32 %34, f1049, f1050; +sub.f32 %37, f991, f992; +add.f32 %36, f989, f990; +sub.f32 %39, f931, f932; +add.f32 %38, f929, f930; +sub.f32 %41, f871, f872; +add.f32 %40, f869, f870; +sub.f32 %43, f811, f812; +add.f32 %42, f809, f810; +sub.f32 %45, f751, f752; +add.f32 %44, f749, f750; +sub.f32 %47, f691, f692; +add.f32 %46, f689, f690; +sub.f32 %49, f631, f632; +add.f32 %48, f629, f630; +sub.f32 %51, f571, f572; +add.f32 %50, f569, f570; +sub.f32 %53, f511, f512; +add.f32 %52, f509, f510; +sub.f32 %55, f451, f452; +add.f32 %54, f449, f450; +sub.f32 %57, f391, f392; +add.f32 %56, f389, f390; +sub.f32 %59, f331, f332; +add.f32 %58, f329, f330; +sub.f32 %61, f271, f272; +add.f32 %60, f269, f270; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), 
"=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[2].y), "f"(rmem[29].y), "f"(rmem[28].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[26].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[23].y), "f"(rmem[22].y), "f"(rmem[10].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[19].y), "f"(rmem[13].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_inv.hpp.inc new file mode 100644 index 
0000000000000..f14b833f9c59b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp32_inv.hpp.inc @@ -0,0 +1,1066 @@ +#ifndef CUFFTDX_FFT_31_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_31_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<217, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1273>; +.reg .b64 rd<4>; +add.f32 f125, %64, %123; +sub.f32 f127, %64, %123; +add.f32 f1272, %66, %124; +sub.f32 f128, %66, %124; +add.f32 f129, %67, %121; +sub.f32 f131, %67, %121; +add.f32 f1269, %125, %126; +sub.f32 f132, %125, %126; +add.f32 f133, %69, %119; +sub.f32 f135, %69, %119; +add.f32 f1267, %70, %127; +sub.f32 f136, %70, %127; +add.f32 f137, %71, %117; +sub.f32 f139, %71, %117; +add.f32 f1265, %128, %118; +sub.f32 f140, %128, %118; +add.f32 f141, %73, %115; +sub.f32 f143, %73, %115; +add.f32 f1262, %129, %130; +sub.f32 f144, %129, %130; +add.f32 f145, %75, %113; +sub.f32 f147, %75, %113; +add.f32 f1260, %76, %131; +sub.f32 f148, %76, %131; +add.f32 f149, %77, %111; +sub.f32 f151, %77, %111; +add.f32 f1258, %132, %112; +sub.f32 f152, %132, %112; +add.f32 f153, %79, %109; +sub.f32 f155, %79, %109; +add.f32 f1255, %133, %134; +sub.f32 f156, %133, %134; +add.f32 f157, %81, %107; +sub.f32 f159, %81, %107; +add.f32 f1253, %82, %135; +sub.f32 f160, %82, %135; +add.f32 f161, %83, %105; +sub.f32 f163, %83, %105; +add.f32 f1251, %136, %106; +sub.f32 f164, %136, %106; +add.f32 f165, %85, %103; +sub.f32 f167, %85, %103; +add.f32 f1248, %137, %138; +sub.f32 f168, %137, %138; +add.f32 f169, %87, %101; +sub.f32 f171, %87, %101; +add.f32 f1246, %88, %139; +sub.f32 f172, %88, %139; +add.f32 f173, %89, %99; +sub.f32 f175, %89, %99; +add.f32 f1244, %140, %100; +sub.f32 f176, %140, %100; +add.f32 f177, %91, %97; +sub.f32 f179, %91, %97; +add.f32 f1241, %141, %142; +sub.f32 f180, %141, %142; +add.f32 f181, %93, %95; +sub.f32 f183, 
%93, %95; +add.f32 f1239, %94, %143; +sub.f32 f184, %94, %143; +add.f32 f185, %62, f125; +add.f32 f187, f185, f129; +add.f32 f1238, %63, f1272; +add.f32 f188, f1238, f1269; +add.f32 f189, f187, f133; +add.f32 f190, f188, f1267; +add.f32 f191, f189, f137; +add.f32 f192, f190, f1265; +add.f32 f193, f191, f141; +add.f32 f194, f192, f1262; +add.f32 f195, f193, f145; +add.f32 f196, f194, f1260; +add.f32 f197, f195, f149; +add.f32 f198, f196, f1258; +add.f32 f199, f197, f153; +add.f32 f200, f198, f1255; +add.f32 f201, f199, f157; +add.f32 f202, f200, f1253; +add.f32 f203, f201, f161; +add.f32 f204, f202, f1251; +add.f32 f205, f203, f165; +add.f32 f206, f204, f1248; +add.f32 f207, f205, f169; +add.f32 f208, f206, f1246; +add.f32 f209, f207, f173; +add.f32 f210, f208, f1244; +add.f32 f211, f209, f177; +add.f32 f212, f210, f1241; +fma.rn.f32 f213, f125, 0f3F7AC279, %62; +fma.rn.f32 f217, f129, 0f3F6B40D2, f213; +fma.rn.f32 f1237, f128, 0f3E4E2133, 0f00000000; +fma.rn.f32 f218, f132, 0f3EC9E903, f1237; +fma.rn.f32 f1236, f1272, 0f3F7AC279, %63; +fma.rn.f32 f219, f1269, 0f3F6B40D2, f1236; +fma.rn.f32 f1235, f127, 0f3E4E2133, 0f00000000; +fma.rn.f32 f220, f131, 0f3EC9E903, f1235; +fma.rn.f32 f221, f133, 0f3F521D8E, f217; +fma.rn.f32 f222, f136, 0f3F123EA2, f218; +fma.rn.f32 f223, f1267, 0f3F521D8E, f219; +fma.rn.f32 f224, f135, 0f3F123EA2, f220; +fma.rn.f32 f225, f137, 0f3F306023, f221; +fma.rn.f32 f226, f140, 0f3F398C05, f222; +fma.rn.f32 f227, f1265, 0f3F306023, f223; +fma.rn.f32 f228, f139, 0f3F398C05, f224; +fma.rn.f32 f229, f141, 0f3F076A2F, f225; +fma.rn.f32 f230, f144, 0f3F5940C0, f226; +fma.rn.f32 f231, f1262, 0f3F076A2F, f227; +fma.rn.f32 f232, f143, 0f3F5940C0, f228; +fma.rn.f32 f233, f145, 0f3EB1D1FE, f229; +fma.rn.f32 f234, f148, 0f3F701086, f230; +fma.rn.f32 f235, f1260, 0f3EB1D1FE, f231; +fma.rn.f32 f236, f147, 0f3F701086, f232; +fma.rn.f32 f237, f149, 0f3E1B0FE2, f233; +fma.rn.f32 f238, f152, 0f3F7D0C43, f234; +fma.rn.f32 f239, f1258, 0f3E1B0FE2, f235; 
+fma.rn.f32 f240, f151, 0f3F7D0C43, f236; +fma.rn.f32 f241, f153, 0fBD4F7581, f237; +fma.rn.f32 f242, f156, 0f3F7FABE3, f238; +fma.rn.f32 f243, f1255, 0fBD4F7581, f239; +fma.rn.f32 f244, f155, 0f3F7FABE3, f240; +fma.rn.f32 f245, f157, 0fBE805587, f241; +fma.rn.f32 f246, f160, 0f3F77D3E7, f242; +fma.rn.f32 f247, f1253, 0fBE805587, f243; +fma.rn.f32 f248, f159, 0f3F77D3E7, f244; +fma.rn.f32 f249, f161, 0fBEE17B58, f245; +fma.rn.f32 f250, f164, 0f3F65D685, f246; +fma.rn.f32 f251, f1251, 0fBEE17B58, f247; +fma.rn.f32 f252, f163, 0f3F65D685, f248; +fma.rn.f32 f253, f165, 0fBF1CB2FA, f249; +fma.rn.f32 f254, f168, 0f3F4A7047, f250; +fma.rn.f32 f255, f1248, 0fBF1CB2FA, f251; +fma.rn.f32 f256, f167, 0f3F4A7047, f252; +fma.rn.f32 f257, f169, 0fBF423DF9, f253; +fma.rn.f32 f258, f172, 0f3F26C059, f254; +fma.rn.f32 f259, f1246, 0fBF423DF9, f255; +fma.rn.f32 f260, f171, 0f3F26C059, f256; +fma.rn.f32 f261, f173, 0fBF5FD52E, f257; +fma.rn.f32 f262, f176, 0f3EF87980, f258; +fma.rn.f32 f263, f1244, 0fBF5FD52E, f259; +fma.rn.f32 f264, f175, 0f3EF87980, f260; +fma.rn.f32 f265, f177, 0fBF744278, f261; +fma.rn.f32 f266, f180, 0f3E994620, f262; +fma.rn.f32 f267, f1241, 0fBF744278, f263; +fma.rn.f32 f268, f179, 0f3E994620, f264; +fma.rn.f32 f269, f181, 0fBF7EAFC2, f265; +fma.rn.f32 f270, f184, 0f3DCF3156, f266; +fma.rn.f32 f271, f1239, 0fBF7EAFC2, f267; +fma.rn.f32 f272, f183, 0f3DCF3156, f268; +fma.rn.f32 f273, f125, 0f3F6B40D2, %62; +fma.rn.f32 f277, f129, 0f3F306023, f273; +fma.rn.f32 f1234, f128, 0f3EC9E903, 0f00000000; +fma.rn.f32 f278, f132, 0f3F398C05, f1234; +fma.rn.f32 f1233, f1272, 0f3F6B40D2, %63; +fma.rn.f32 f279, f1269, 0f3F306023, f1233; +fma.rn.f32 f1232, f127, 0f3EC9E903, 0f00000000; +fma.rn.f32 f280, f131, 0f3F398C05, f1232; +fma.rn.f32 f281, f133, 0f3EB1D1FE, f277; +fma.rn.f32 f282, f136, 0f3F701086, f278; +fma.rn.f32 f283, f1267, 0f3EB1D1FE, f279; +fma.rn.f32 f284, f135, 0f3F701086, f280; +fma.rn.f32 f285, f137, 0fBD4F7581, f281; +fma.rn.f32 f286, f140, 0f3F7FABE3, 
f282; +fma.rn.f32 f287, f1265, 0fBD4F7581, f283; +fma.rn.f32 f288, f139, 0f3F7FABE3, f284; +fma.rn.f32 f289, f141, 0fBEE17B58, f285; +fma.rn.f32 f290, f144, 0f3F65D685, f286; +fma.rn.f32 f291, f1262, 0fBEE17B58, f287; +fma.rn.f32 f292, f143, 0f3F65D685, f288; +fma.rn.f32 f293, f145, 0fBF423DF9, f289; +fma.rn.f32 f294, f148, 0f3F26C059, f290; +fma.rn.f32 f295, f1260, 0fBF423DF9, f291; +fma.rn.f32 f296, f147, 0f3F26C059, f292; +fma.rn.f32 f297, f149, 0fBF744278, f293; +fma.rn.f32 f298, f152, 0f3E994620, f294; +fma.rn.f32 f299, f1258, 0fBF744278, f295; +fma.rn.f32 f300, f151, 0f3E994620, f296; +fma.rn.f32 f301, f153, 0fBF7EAFC2, f297; +fma.rn.f32 f302, f156, 0fBDCF3156, f298; +fma.rn.f32 f303, f1255, 0fBF7EAFC2, f299; +fma.rn.f32 f304, f155, 0fBDCF3156, f300; +fma.rn.f32 f305, f157, 0fBF5FD52E, f301; +fma.rn.f32 f306, f160, 0fBEF87980, f302; +fma.rn.f32 f307, f1253, 0fBF5FD52E, f303; +fma.rn.f32 f308, f159, 0fBEF87980, f304; +fma.rn.f32 f309, f161, 0fBF1CB2FA, f305; +fma.rn.f32 f310, f164, 0fBF4A7047, f306; +fma.rn.f32 f311, f1251, 0fBF1CB2FA, f307; +fma.rn.f32 f312, f163, 0fBF4A7047, f308; +fma.rn.f32 f313, f165, 0fBE805587, f309; +fma.rn.f32 f314, f168, 0fBF77D3E7, f310; +fma.rn.f32 f315, f1248, 0fBE805587, f311; +fma.rn.f32 f316, f167, 0fBF77D3E7, f312; +fma.rn.f32 f317, f169, 0f3E1B0FE2, f313; +fma.rn.f32 f318, f172, 0fBF7D0C43, f314; +fma.rn.f32 f319, f1246, 0f3E1B0FE2, f315; +fma.rn.f32 f320, f171, 0fBF7D0C43, f316; +fma.rn.f32 f321, f173, 0f3F076A2F, f317; +fma.rn.f32 f322, f176, 0fBF5940C0, f318; +fma.rn.f32 f323, f1244, 0f3F076A2F, f319; +fma.rn.f32 f324, f175, 0fBF5940C0, f320; +fma.rn.f32 f325, f177, 0f3F521D8E, f321; +fma.rn.f32 f326, f180, 0fBF123EA2, f322; +fma.rn.f32 f327, f1241, 0f3F521D8E, f323; +fma.rn.f32 f328, f179, 0fBF123EA2, f324; +fma.rn.f32 f329, f181, 0f3F7AC279, f325; +fma.rn.f32 f330, f184, 0fBE4E2133, f326; +fma.rn.f32 f331, f1239, 0f3F7AC279, f327; +fma.rn.f32 f332, f183, 0fBE4E2133, f328; +fma.rn.f32 f333, f125, 0f3F521D8E, %62; 
+fma.rn.f32 f337, f129, 0f3EB1D1FE, f333; +fma.rn.f32 f1231, f128, 0f3F123EA2, 0f00000000; +fma.rn.f32 f338, f132, 0f3F701086, f1231; +fma.rn.f32 f1230, f1272, 0f3F521D8E, %63; +fma.rn.f32 f339, f1269, 0f3EB1D1FE, f1230; +fma.rn.f32 f1229, f127, 0f3F123EA2, 0f00000000; +fma.rn.f32 f340, f131, 0f3F701086, f1229; +fma.rn.f32 f341, f133, 0fBE805587, f337; +fma.rn.f32 f342, f136, 0f3F77D3E7, f338; +fma.rn.f32 f343, f1267, 0fBE805587, f339; +fma.rn.f32 f344, f135, 0f3F77D3E7, f340; +fma.rn.f32 f345, f137, 0fBF423DF9, f341; +fma.rn.f32 f346, f140, 0f3F26C059, f342; +fma.rn.f32 f347, f1265, 0fBF423DF9, f343; +fma.rn.f32 f348, f139, 0f3F26C059, f344; +fma.rn.f32 f349, f141, 0fBF7EAFC2, f345; +fma.rn.f32 f350, f144, 0f3DCF3156, f346; +fma.rn.f32 f351, f1262, 0fBF7EAFC2, f347; +fma.rn.f32 f352, f143, 0f3DCF3156, f348; +fma.rn.f32 f353, f145, 0fBF5FD52E, f349; +fma.rn.f32 f354, f148, 0fBEF87980, f350; +fma.rn.f32 f355, f1260, 0fBF5FD52E, f351; +fma.rn.f32 f356, f147, 0fBEF87980, f352; +fma.rn.f32 f357, f149, 0fBEE17B58, f353; +fma.rn.f32 f358, f152, 0fBF65D685, f354; +fma.rn.f32 f359, f1258, 0fBEE17B58, f355; +fma.rn.f32 f360, f151, 0fBF65D685, f356; +fma.rn.f32 f361, f153, 0f3E1B0FE2, f357; +fma.rn.f32 f362, f156, 0fBF7D0C43, f358; +fma.rn.f32 f363, f1255, 0f3E1B0FE2, f359; +fma.rn.f32 f364, f155, 0fBF7D0C43, f360; +fma.rn.f32 f365, f157, 0f3F306023, f361; +fma.rn.f32 f366, f160, 0fBF398C05, f362; +fma.rn.f32 f367, f1253, 0f3F306023, f363; +fma.rn.f32 f368, f159, 0fBF398C05, f364; +fma.rn.f32 f369, f161, 0f3F7AC279, f365; +fma.rn.f32 f370, f164, 0fBE4E2133, f366; +fma.rn.f32 f371, f1251, 0f3F7AC279, f367; +fma.rn.f32 f372, f163, 0fBE4E2133, f368; +fma.rn.f32 f373, f165, 0f3F6B40D2, f369; +fma.rn.f32 f374, f168, 0f3EC9E903, f370; +fma.rn.f32 f375, f1248, 0f3F6B40D2, f371; +fma.rn.f32 f376, f167, 0f3EC9E903, f372; +fma.rn.f32 f377, f169, 0f3F076A2F, f373; +fma.rn.f32 f378, f172, 0f3F5940C0, f374; +fma.rn.f32 f379, f1246, 0f3F076A2F, f375; +fma.rn.f32 f380, f171, 0f3F5940C0, 
f376; +fma.rn.f32 f381, f173, 0fBD4F7581, f377; +fma.rn.f32 f382, f176, 0f3F7FABE3, f378; +fma.rn.f32 f383, f1244, 0fBD4F7581, f379; +fma.rn.f32 f384, f175, 0f3F7FABE3, f380; +fma.rn.f32 f385, f177, 0fBF1CB2FA, f381; +fma.rn.f32 f386, f180, 0f3F4A7047, f382; +fma.rn.f32 f387, f1241, 0fBF1CB2FA, f383; +fma.rn.f32 f388, f179, 0f3F4A7047, f384; +fma.rn.f32 f389, f181, 0fBF744278, f385; +fma.rn.f32 f390, f184, 0f3E994620, f386; +fma.rn.f32 f391, f1239, 0fBF744278, f387; +fma.rn.f32 f392, f183, 0f3E994620, f388; +fma.rn.f32 f393, f125, 0f3F306023, %62; +fma.rn.f32 f397, f129, 0fBD4F7581, f393; +fma.rn.f32 f1228, f128, 0f3F398C05, 0f00000000; +fma.rn.f32 f398, f132, 0f3F7FABE3, f1228; +fma.rn.f32 f1227, f1272, 0f3F306023, %63; +fma.rn.f32 f399, f1269, 0fBD4F7581, f1227; +fma.rn.f32 f1226, f127, 0f3F398C05, 0f00000000; +fma.rn.f32 f400, f131, 0f3F7FABE3, f1226; +fma.rn.f32 f401, f133, 0fBF423DF9, f397; +fma.rn.f32 f402, f136, 0f3F26C059, f398; +fma.rn.f32 f403, f1267, 0fBF423DF9, f399; +fma.rn.f32 f404, f135, 0f3F26C059, f400; +fma.rn.f32 f405, f137, 0fBF7EAFC2, f401; +fma.rn.f32 f406, f140, 0fBDCF3156, f402; +fma.rn.f32 f407, f1265, 0fBF7EAFC2, f403; +fma.rn.f32 f408, f139, 0fBDCF3156, f404; +fma.rn.f32 f409, f141, 0fBF1CB2FA, f405; +fma.rn.f32 f410, f144, 0fBF4A7047, f406; +fma.rn.f32 f411, f1262, 0fBF1CB2FA, f407; +fma.rn.f32 f412, f143, 0fBF4A7047, f408; +fma.rn.f32 f413, f145, 0f3E1B0FE2, f409; +fma.rn.f32 f414, f148, 0fBF7D0C43, f410; +fma.rn.f32 f415, f1260, 0f3E1B0FE2, f411; +fma.rn.f32 f416, f147, 0fBF7D0C43, f412; +fma.rn.f32 f417, f149, 0f3F521D8E, f413; +fma.rn.f32 f418, f152, 0fBF123EA2, f414; +fma.rn.f32 f419, f1258, 0f3F521D8E, f415; +fma.rn.f32 f420, f151, 0fBF123EA2, f416; +fma.rn.f32 f421, f153, 0f3F7AC279, f417; +fma.rn.f32 f422, f156, 0f3E4E2133, f418; +fma.rn.f32 f423, f1255, 0f3F7AC279, f419; +fma.rn.f32 f424, f155, 0f3E4E2133, f420; +fma.rn.f32 f425, f157, 0f3F076A2F, f421; +fma.rn.f32 f426, f160, 0f3F5940C0, f422; +fma.rn.f32 f427, f1253, 
0f3F076A2F, f423; +fma.rn.f32 f428, f159, 0f3F5940C0, f424; +fma.rn.f32 f429, f161, 0fBE805587, f425; +fma.rn.f32 f430, f164, 0f3F77D3E7, f426; +fma.rn.f32 f431, f1251, 0fBE805587, f427; +fma.rn.f32 f432, f163, 0f3F77D3E7, f428; +fma.rn.f32 f433, f165, 0fBF5FD52E, f429; +fma.rn.f32 f434, f168, 0f3EF87980, f430; +fma.rn.f32 f435, f1248, 0fBF5FD52E, f431; +fma.rn.f32 f436, f167, 0f3EF87980, f432; +fma.rn.f32 f437, f169, 0fBF744278, f433; +fma.rn.f32 f438, f172, 0fBE994620, f434; +fma.rn.f32 f439, f1246, 0fBF744278, f435; +fma.rn.f32 f440, f171, 0fBE994620, f436; +fma.rn.f32 f441, f173, 0fBEE17B58, f437; +fma.rn.f32 f442, f176, 0fBF65D685, f438; +fma.rn.f32 f443, f1244, 0fBEE17B58, f439; +fma.rn.f32 f444, f175, 0fBF65D685, f440; +fma.rn.f32 f445, f177, 0f3EB1D1FE, f441; +fma.rn.f32 f446, f180, 0fBF701086, f442; +fma.rn.f32 f447, f1241, 0f3EB1D1FE, f443; +fma.rn.f32 f448, f179, 0fBF701086, f444; +fma.rn.f32 f449, f181, 0f3F6B40D2, f445; +fma.rn.f32 f450, f184, 0fBEC9E903, f446; +fma.rn.f32 f451, f1239, 0f3F6B40D2, f447; +fma.rn.f32 f452, f183, 0fBEC9E903, f448; +fma.rn.f32 f453, f125, 0f3F076A2F, %62; +fma.rn.f32 f457, f129, 0fBEE17B58, f453; +fma.rn.f32 f1225, f128, 0f3F5940C0, 0f00000000; +fma.rn.f32 f458, f132, 0f3F65D685, f1225; +fma.rn.f32 f1224, f1272, 0f3F076A2F, %63; +fma.rn.f32 f459, f1269, 0fBEE17B58, f1224; +fma.rn.f32 f1223, f127, 0f3F5940C0, 0f00000000; +fma.rn.f32 f460, f131, 0f3F65D685, f1223; +fma.rn.f32 f461, f133, 0fBF7EAFC2, f457; +fma.rn.f32 f462, f136, 0f3DCF3156, f458; +fma.rn.f32 f463, f1267, 0fBF7EAFC2, f459; +fma.rn.f32 f464, f135, 0f3DCF3156, f460; +fma.rn.f32 f465, f137, 0fBF1CB2FA, f461; +fma.rn.f32 f466, f140, 0fBF4A7047, f462; +fma.rn.f32 f467, f1265, 0fBF1CB2FA, f463; +fma.rn.f32 f468, f139, 0fBF4A7047, f464; +fma.rn.f32 f469, f141, 0f3EB1D1FE, f465; +fma.rn.f32 f470, f144, 0fBF701086, f466; +fma.rn.f32 f471, f1262, 0f3EB1D1FE, f467; +fma.rn.f32 f472, f143, 0fBF701086, f468; +fma.rn.f32 f473, f145, 0f3F7AC279, f469; +fma.rn.f32 f474, 
f148, 0fBE4E2133, f470; +fma.rn.f32 f475, f1260, 0f3F7AC279, f471; +fma.rn.f32 f476, f147, 0fBE4E2133, f472; +fma.rn.f32 f477, f149, 0f3F306023, f473; +fma.rn.f32 f478, f152, 0f3F398C05, f474; +fma.rn.f32 f479, f1258, 0f3F306023, f475; +fma.rn.f32 f480, f151, 0f3F398C05, f476; +fma.rn.f32 f481, f153, 0fBE805587, f477; +fma.rn.f32 f482, f156, 0f3F77D3E7, f478; +fma.rn.f32 f483, f1255, 0fBE805587, f479; +fma.rn.f32 f484, f155, 0f3F77D3E7, f480; +fma.rn.f32 f485, f157, 0fBF744278, f481; +fma.rn.f32 f486, f160, 0f3E994620, f482; +fma.rn.f32 f487, f1253, 0fBF744278, f483; +fma.rn.f32 f488, f159, 0f3E994620, f484; +fma.rn.f32 f489, f161, 0fBF423DF9, f485; +fma.rn.f32 f490, f164, 0fBF26C059, f486; +fma.rn.f32 f491, f1251, 0fBF423DF9, f487; +fma.rn.f32 f492, f163, 0fBF26C059, f488; +fma.rn.f32 f493, f165, 0f3E1B0FE2, f489; +fma.rn.f32 f494, f168, 0fBF7D0C43, f490; +fma.rn.f32 f495, f1248, 0f3E1B0FE2, f491; +fma.rn.f32 f496, f167, 0fBF7D0C43, f492; +fma.rn.f32 f497, f169, 0f3F6B40D2, f493; +fma.rn.f32 f498, f172, 0fBEC9E903, f494; +fma.rn.f32 f499, f1246, 0f3F6B40D2, f495; +fma.rn.f32 f500, f171, 0fBEC9E903, f496; +fma.rn.f32 f501, f173, 0f3F521D8E, f497; +fma.rn.f32 f502, f176, 0f3F123EA2, f498; +fma.rn.f32 f503, f1244, 0f3F521D8E, f499; +fma.rn.f32 f504, f175, 0f3F123EA2, f500; +fma.rn.f32 f505, f177, 0fBD4F7581, f501; +fma.rn.f32 f506, f180, 0f3F7FABE3, f502; +fma.rn.f32 f507, f1241, 0fBD4F7581, f503; +fma.rn.f32 f508, f179, 0f3F7FABE3, f504; +fma.rn.f32 f509, f181, 0fBF5FD52E, f505; +fma.rn.f32 f510, f184, 0f3EF87980, f506; +fma.rn.f32 f511, f1239, 0fBF5FD52E, f507; +fma.rn.f32 f512, f183, 0f3EF87980, f508; +fma.rn.f32 f513, f125, 0f3EB1D1FE, %62; +fma.rn.f32 f517, f129, 0fBF423DF9, f513; +fma.rn.f32 f1222, f128, 0f3F701086, 0f00000000; +fma.rn.f32 f518, f132, 0f3F26C059, f1222; +fma.rn.f32 f1221, f1272, 0f3EB1D1FE, %63; +fma.rn.f32 f519, f1269, 0fBF423DF9, f1221; +fma.rn.f32 f1220, f127, 0f3F701086, 0f00000000; +fma.rn.f32 f520, f131, 0f3F26C059, f1220; +fma.rn.f32 
f521, f133, 0fBF5FD52E, f517; +fma.rn.f32 f522, f136, 0fBEF87980, f518; +fma.rn.f32 f523, f1267, 0fBF5FD52E, f519; +fma.rn.f32 f524, f135, 0fBEF87980, f520; +fma.rn.f32 f525, f137, 0f3E1B0FE2, f521; +fma.rn.f32 f526, f140, 0fBF7D0C43, f522; +fma.rn.f32 f527, f1265, 0f3E1B0FE2, f523; +fma.rn.f32 f528, f139, 0fBF7D0C43, f524; +fma.rn.f32 f529, f141, 0f3F7AC279, f525; +fma.rn.f32 f530, f144, 0fBE4E2133, f526; +fma.rn.f32 f531, f1262, 0f3F7AC279, f527; +fma.rn.f32 f532, f143, 0fBE4E2133, f528; +fma.rn.f32 f533, f145, 0f3F076A2F, f529; +fma.rn.f32 f534, f148, 0f3F5940C0, f530; +fma.rn.f32 f535, f1260, 0f3F076A2F, f531; +fma.rn.f32 f536, f147, 0f3F5940C0, f532; +fma.rn.f32 f537, f149, 0fBF1CB2FA, f533; +fma.rn.f32 f538, f152, 0f3F4A7047, f534; +fma.rn.f32 f539, f1258, 0fBF1CB2FA, f535; +fma.rn.f32 f540, f151, 0f3F4A7047, f536; +fma.rn.f32 f541, f153, 0fBF744278, f537; +fma.rn.f32 f542, f156, 0fBE994620, f538; +fma.rn.f32 f543, f1255, 0fBF744278, f539; +fma.rn.f32 f544, f155, 0fBE994620, f540; +fma.rn.f32 f545, f157, 0fBD4F7581, f541; +fma.rn.f32 f546, f160, 0fBF7FABE3, f542; +fma.rn.f32 f547, f1253, 0fBD4F7581, f543; +fma.rn.f32 f548, f159, 0fBF7FABE3, f544; +fma.rn.f32 f549, f161, 0f3F6B40D2, f545; +fma.rn.f32 f550, f164, 0fBEC9E903, f546; +fma.rn.f32 f551, f1251, 0f3F6B40D2, f547; +fma.rn.f32 f552, f163, 0fBEC9E903, f548; +fma.rn.f32 f553, f165, 0f3F306023, f549; +fma.rn.f32 f554, f168, 0f3F398C05, f550; +fma.rn.f32 f555, f1248, 0f3F306023, f551; +fma.rn.f32 f556, f167, 0f3F398C05, f552; +fma.rn.f32 f557, f169, 0fBEE17B58, f553; +fma.rn.f32 f558, f172, 0f3F65D685, f554; +fma.rn.f32 f559, f1246, 0fBEE17B58, f555; +fma.rn.f32 f560, f171, 0f3F65D685, f556; +fma.rn.f32 f561, f173, 0fBF7EAFC2, f557; +fma.rn.f32 f562, f176, 0fBDCF3156, f558; +fma.rn.f32 f563, f1244, 0fBF7EAFC2, f559; +fma.rn.f32 f564, f175, 0fBDCF3156, f560; +fma.rn.f32 f565, f177, 0fBE805587, f561; +fma.rn.f32 f566, f180, 0fBF77D3E7, f562; +fma.rn.f32 f567, f1241, 0fBE805587, f563; +fma.rn.f32 f568, f179, 
0fBF77D3E7, f564; +fma.rn.f32 f569, f181, 0f3F521D8E, f565; +fma.rn.f32 f570, f184, 0fBF123EA2, f566; +fma.rn.f32 f571, f1239, 0f3F521D8E, f567; +fma.rn.f32 f572, f183, 0fBF123EA2, f568; +fma.rn.f32 f573, f125, 0f3E1B0FE2, %62; +fma.rn.f32 f577, f129, 0fBF744278, f573; +fma.rn.f32 f1219, f128, 0f3F7D0C43, 0f00000000; +fma.rn.f32 f578, f132, 0f3E994620, f1219; +fma.rn.f32 f1218, f1272, 0f3E1B0FE2, %63; +fma.rn.f32 f579, f1269, 0fBF744278, f1218; +fma.rn.f32 f1217, f127, 0f3F7D0C43, 0f00000000; +fma.rn.f32 f580, f131, 0f3E994620, f1217; +fma.rn.f32 f581, f133, 0fBEE17B58, f577; +fma.rn.f32 f582, f136, 0fBF65D685, f578; +fma.rn.f32 f583, f1267, 0fBEE17B58, f579; +fma.rn.f32 f584, f135, 0fBF65D685, f580; +fma.rn.f32 f585, f137, 0f3F521D8E, f581; +fma.rn.f32 f586, f140, 0fBF123EA2, f582; +fma.rn.f32 f587, f1265, 0f3F521D8E, f583; +fma.rn.f32 f588, f139, 0fBF123EA2, f584; +fma.rn.f32 f589, f141, 0f3F306023, f585; +fma.rn.f32 f590, f144, 0f3F398C05, f586; +fma.rn.f32 f591, f1262, 0f3F306023, f587; +fma.rn.f32 f592, f143, 0f3F398C05, f588; +fma.rn.f32 f593, f145, 0fBF1CB2FA, f589; +fma.rn.f32 f594, f148, 0f3F4A7047, f590; +fma.rn.f32 f595, f1260, 0fBF1CB2FA, f591; +fma.rn.f32 f596, f147, 0f3F4A7047, f592; +fma.rn.f32 f597, f149, 0fBF5FD52E, f593; +fma.rn.f32 f598, f152, 0fBEF87980, f594; +fma.rn.f32 f599, f1258, 0fBF5FD52E, f595; +fma.rn.f32 f600, f151, 0fBEF87980, f596; +fma.rn.f32 f601, f153, 0f3EB1D1FE, f597; +fma.rn.f32 f602, f156, 0fBF701086, f598; +fma.rn.f32 f603, f1255, 0f3EB1D1FE, f599; +fma.rn.f32 f604, f155, 0fBF701086, f600; +fma.rn.f32 f605, f157, 0f3F7AC279, f601; +fma.rn.f32 f606, f160, 0f3E4E2133, f602; +fma.rn.f32 f607, f1253, 0f3F7AC279, f603; +fma.rn.f32 f608, f159, 0f3E4E2133, f604; +fma.rn.f32 f609, f161, 0fBD4F7581, f605; +fma.rn.f32 f610, f164, 0f3F7FABE3, f606; +fma.rn.f32 f611, f1251, 0fBD4F7581, f607; +fma.rn.f32 f612, f163, 0f3F7FABE3, f608; +fma.rn.f32 f613, f165, 0fBF7EAFC2, f609; +fma.rn.f32 f614, f168, 0f3DCF3156, f610; +fma.rn.f32 f615, 
f1248, 0fBF7EAFC2, f611; +fma.rn.f32 f616, f167, 0f3DCF3156, f612; +fma.rn.f32 f617, f169, 0fBE805587, f613; +fma.rn.f32 f618, f172, 0fBF77D3E7, f614; +fma.rn.f32 f619, f1246, 0fBE805587, f615; +fma.rn.f32 f620, f171, 0fBF77D3E7, f616; +fma.rn.f32 f621, f173, 0f3F6B40D2, f617; +fma.rn.f32 f622, f176, 0fBEC9E903, f618; +fma.rn.f32 f623, f1244, 0f3F6B40D2, f619; +fma.rn.f32 f624, f175, 0fBEC9E903, f620; +fma.rn.f32 f625, f177, 0f3F076A2F, f621; +fma.rn.f32 f626, f180, 0f3F5940C0, f622; +fma.rn.f32 f627, f1241, 0f3F076A2F, f623; +fma.rn.f32 f628, f179, 0f3F5940C0, f624; +fma.rn.f32 f629, f181, 0fBF423DF9, f625; +fma.rn.f32 f630, f184, 0f3F26C059, f626; +fma.rn.f32 f631, f1239, 0fBF423DF9, f627; +fma.rn.f32 f632, f183, 0f3F26C059, f628; +fma.rn.f32 f633, f125, 0fBD4F7581, %62; +fma.rn.f32 f637, f129, 0fBF7EAFC2, f633; +fma.rn.f32 f1216, f128, 0f3F7FABE3, 0f00000000; +fma.rn.f32 f638, f132, 0fBDCF3156, f1216; +fma.rn.f32 f1215, f1272, 0fBD4F7581, %63; +fma.rn.f32 f639, f1269, 0fBF7EAFC2, f1215; +fma.rn.f32 f1214, f127, 0f3F7FABE3, 0f00000000; +fma.rn.f32 f640, f131, 0fBDCF3156, f1214; +fma.rn.f32 f641, f133, 0f3E1B0FE2, f637; +fma.rn.f32 f642, f136, 0fBF7D0C43, f638; +fma.rn.f32 f643, f1267, 0f3E1B0FE2, f639; +fma.rn.f32 f644, f135, 0fBF7D0C43, f640; +fma.rn.f32 f645, f137, 0f3F7AC279, f641; +fma.rn.f32 f646, f140, 0f3E4E2133, f642; +fma.rn.f32 f647, f1265, 0f3F7AC279, f643; +fma.rn.f32 f648, f139, 0f3E4E2133, f644; +fma.rn.f32 f649, f141, 0fBE805587, f645; +fma.rn.f32 f650, f144, 0f3F77D3E7, f646; +fma.rn.f32 f651, f1262, 0fBE805587, f647; +fma.rn.f32 f652, f143, 0f3F77D3E7, f648; +fma.rn.f32 f653, f145, 0fBF744278, f649; +fma.rn.f32 f654, f148, 0fBE994620, f650; +fma.rn.f32 f655, f1260, 0fBF744278, f651; +fma.rn.f32 f656, f147, 0fBE994620, f652; +fma.rn.f32 f657, f149, 0f3EB1D1FE, f653; +fma.rn.f32 f658, f152, 0fBF701086, f654; +fma.rn.f32 f659, f1258, 0f3EB1D1FE, f655; +fma.rn.f32 f660, f151, 0fBF701086, f656; +fma.rn.f32 f661, f153, 0f3F6B40D2, f657; +fma.rn.f32 
f662, f156, 0f3EC9E903, f658; +fma.rn.f32 f663, f1255, 0f3F6B40D2, f659; +fma.rn.f32 f664, f155, 0f3EC9E903, f660; +fma.rn.f32 f665, f157, 0fBEE17B58, f661; +fma.rn.f32 f666, f160, 0f3F65D685, f662; +fma.rn.f32 f667, f1253, 0fBEE17B58, f663; +fma.rn.f32 f668, f159, 0f3F65D685, f664; +fma.rn.f32 f669, f161, 0fBF5FD52E, f665; +fma.rn.f32 f670, f164, 0fBEF87980, f666; +fma.rn.f32 f671, f1251, 0fBF5FD52E, f667; +fma.rn.f32 f672, f163, 0fBEF87980, f668; +fma.rn.f32 f673, f165, 0f3F076A2F, f669; +fma.rn.f32 f674, f168, 0fBF5940C0, f670; +fma.rn.f32 f675, f1248, 0f3F076A2F, f671; +fma.rn.f32 f676, f167, 0fBF5940C0, f672; +fma.rn.f32 f677, f169, 0f3F521D8E, f673; +fma.rn.f32 f678, f172, 0f3F123EA2, f674; +fma.rn.f32 f679, f1246, 0f3F521D8E, f675; +fma.rn.f32 f680, f171, 0f3F123EA2, f676; +fma.rn.f32 f681, f173, 0fBF1CB2FA, f677; +fma.rn.f32 f682, f176, 0f3F4A7047, f678; +fma.rn.f32 f683, f1244, 0fBF1CB2FA, f679; +fma.rn.f32 f684, f175, 0f3F4A7047, f680; +fma.rn.f32 f685, f177, 0fBF423DF9, f681; +fma.rn.f32 f686, f180, 0fBF26C059, f682; +fma.rn.f32 f687, f1241, 0fBF423DF9, f683; +fma.rn.f32 f688, f179, 0fBF26C059, f684; +fma.rn.f32 f689, f181, 0f3F306023, f685; +fma.rn.f32 f690, f184, 0fBF398C05, f686; +fma.rn.f32 f691, f1239, 0f3F306023, f687; +fma.rn.f32 f692, f183, 0fBF398C05, f688; +fma.rn.f32 f693, f125, 0fBE805587, %62; +fma.rn.f32 f697, f129, 0fBF5FD52E, f693; +fma.rn.f32 f1213, f128, 0f3F77D3E7, 0f00000000; +fma.rn.f32 f698, f132, 0fBEF87980, f1213; +fma.rn.f32 f1212, f1272, 0fBE805587, %63; +fma.rn.f32 f699, f1269, 0fBF5FD52E, f1212; +fma.rn.f32 f1211, f127, 0f3F77D3E7, 0f00000000; +fma.rn.f32 f700, f131, 0fBEF87980, f1211; +fma.rn.f32 f701, f133, 0f3F306023, f697; +fma.rn.f32 f702, f136, 0fBF398C05, f698; +fma.rn.f32 f703, f1267, 0f3F306023, f699; +fma.rn.f32 f704, f135, 0fBF398C05, f700; +fma.rn.f32 f705, f137, 0f3F076A2F, f701; +fma.rn.f32 f706, f140, 0f3F5940C0, f702; +fma.rn.f32 f707, f1265, 0f3F076A2F, f703; +fma.rn.f32 f708, f139, 0f3F5940C0, f704; 
+fma.rn.f32 f709, f141, 0fBF744278, f705; +fma.rn.f32 f710, f144, 0f3E994620, f706; +fma.rn.f32 f711, f1262, 0fBF744278, f707; +fma.rn.f32 f712, f143, 0f3E994620, f708; +fma.rn.f32 f713, f145, 0fBD4F7581, f709; +fma.rn.f32 f714, f148, 0fBF7FABE3, f710; +fma.rn.f32 f715, f1260, 0fBD4F7581, f711; +fma.rn.f32 f716, f147, 0fBF7FABE3, f712; +fma.rn.f32 f717, f149, 0f3F7AC279, f713; +fma.rn.f32 f718, f152, 0f3E4E2133, f714; +fma.rn.f32 f719, f1258, 0f3F7AC279, f715; +fma.rn.f32 f720, f151, 0f3E4E2133, f716; +fma.rn.f32 f721, f153, 0fBEE17B58, f717; +fma.rn.f32 f722, f156, 0f3F65D685, f718; +fma.rn.f32 f723, f1255, 0fBEE17B58, f719; +fma.rn.f32 f724, f155, 0f3F65D685, f720; +fma.rn.f32 f725, f157, 0fBF423DF9, f721; +fma.rn.f32 f726, f160, 0fBF26C059, f722; +fma.rn.f32 f727, f1253, 0fBF423DF9, f723; +fma.rn.f32 f728, f159, 0fBF26C059, f724; +fma.rn.f32 f729, f161, 0f3F521D8E, f725; +fma.rn.f32 f730, f164, 0fBF123EA2, f726; +fma.rn.f32 f731, f1251, 0f3F521D8E, f727; +fma.rn.f32 f732, f163, 0fBF123EA2, f728; +fma.rn.f32 f733, f165, 0f3EB1D1FE, f729; +fma.rn.f32 f734, f168, 0f3F701086, f730; +fma.rn.f32 f735, f1248, 0f3EB1D1FE, f731; +fma.rn.f32 f736, f167, 0f3F701086, f732; +fma.rn.f32 f737, f169, 0fBF7EAFC2, f733; +fma.rn.f32 f738, f172, 0f3DCF3156, f734; +fma.rn.f32 f739, f1246, 0fBF7EAFC2, f735; +fma.rn.f32 f740, f171, 0f3DCF3156, f736; +fma.rn.f32 f741, f173, 0f3E1B0FE2, f737; +fma.rn.f32 f742, f176, 0fBF7D0C43, f738; +fma.rn.f32 f743, f1244, 0f3E1B0FE2, f739; +fma.rn.f32 f744, f175, 0fBF7D0C43, f740; +fma.rn.f32 f745, f177, 0f3F6B40D2, f741; +fma.rn.f32 f746, f180, 0f3EC9E903, f742; +fma.rn.f32 f747, f1241, 0f3F6B40D2, f743; +fma.rn.f32 f748, f179, 0f3EC9E903, f744; +fma.rn.f32 f749, f181, 0fBF1CB2FA, f745; +fma.rn.f32 f750, f184, 0f3F4A7047, f746; +fma.rn.f32 f751, f1239, 0fBF1CB2FA, f747; +fma.rn.f32 f752, f183, 0f3F4A7047, f748; +fma.rn.f32 f753, f125, 0fBEE17B58, %62; +fma.rn.f32 f757, f129, 0fBF1CB2FA, f753; +fma.rn.f32 f1210, f128, 0f3F65D685, 0f00000000; 
+fma.rn.f32 f758, f132, 0fBF4A7047, f1210; +fma.rn.f32 f1209, f1272, 0fBEE17B58, %63; +fma.rn.f32 f759, f1269, 0fBF1CB2FA, f1209; +fma.rn.f32 f1208, f127, 0f3F65D685, 0f00000000; +fma.rn.f32 f760, f131, 0fBF4A7047, f1208; +fma.rn.f32 f761, f133, 0f3F7AC279, f757; +fma.rn.f32 f762, f136, 0fBE4E2133, f758; +fma.rn.f32 f763, f1267, 0f3F7AC279, f759; +fma.rn.f32 f764, f135, 0fBE4E2133, f760; +fma.rn.f32 f765, f137, 0fBE805587, f761; +fma.rn.f32 f766, f140, 0f3F77D3E7, f762; +fma.rn.f32 f767, f1265, 0fBE805587, f763; +fma.rn.f32 f768, f139, 0f3F77D3E7, f764; +fma.rn.f32 f769, f141, 0fBF423DF9, f765; +fma.rn.f32 f770, f144, 0fBF26C059, f766; +fma.rn.f32 f771, f1262, 0fBF423DF9, f767; +fma.rn.f32 f772, f143, 0fBF26C059, f768; +fma.rn.f32 f773, f145, 0f3F6B40D2, f769; +fma.rn.f32 f774, f148, 0fBEC9E903, f770; +fma.rn.f32 f775, f1260, 0f3F6B40D2, f771; +fma.rn.f32 f776, f147, 0fBEC9E903, f772; +fma.rn.f32 f777, f149, 0fBD4F7581, f773; +fma.rn.f32 f778, f152, 0f3F7FABE3, f774; +fma.rn.f32 f779, f1258, 0fBD4F7581, f775; +fma.rn.f32 f780, f151, 0f3F7FABE3, f776; +fma.rn.f32 f781, f153, 0fBF5FD52E, f777; +fma.rn.f32 f782, f156, 0fBEF87980, f778; +fma.rn.f32 f783, f1255, 0fBF5FD52E, f779; +fma.rn.f32 f784, f155, 0fBEF87980, f780; +fma.rn.f32 f785, f157, 0f3F521D8E, f781; +fma.rn.f32 f786, f160, 0fBF123EA2, f782; +fma.rn.f32 f787, f1253, 0f3F521D8E, f783; +fma.rn.f32 f788, f159, 0fBF123EA2, f784; +fma.rn.f32 f789, f161, 0f3E1B0FE2, f785; +fma.rn.f32 f790, f164, 0f3F7D0C43, f786; +fma.rn.f32 f791, f1251, 0f3E1B0FE2, f787; +fma.rn.f32 f792, f163, 0f3F7D0C43, f788; +fma.rn.f32 f793, f165, 0fBF744278, f789; +fma.rn.f32 f794, f168, 0fBE994620, f790; +fma.rn.f32 f795, f1248, 0fBF744278, f791; +fma.rn.f32 f796, f167, 0fBE994620, f792; +fma.rn.f32 f797, f169, 0f3F306023, f793; +fma.rn.f32 f798, f172, 0fBF398C05, f794; +fma.rn.f32 f799, f1246, 0f3F306023, f795; +fma.rn.f32 f800, f171, 0fBF398C05, f796; +fma.rn.f32 f801, f173, 0f3EB1D1FE, f797; +fma.rn.f32 f802, f176, 0f3F701086, f798; 
+fma.rn.f32 f803, f1244, 0f3EB1D1FE, f799; +fma.rn.f32 f804, f175, 0f3F701086, f800; +fma.rn.f32 f805, f177, 0fBF7EAFC2, f801; +fma.rn.f32 f806, f180, 0fBDCF3156, f802; +fma.rn.f32 f807, f1241, 0fBF7EAFC2, f803; +fma.rn.f32 f808, f179, 0fBDCF3156, f804; +fma.rn.f32 f809, f181, 0f3F076A2F, f805; +fma.rn.f32 f810, f184, 0fBF5940C0, f806; +fma.rn.f32 f811, f1239, 0f3F076A2F, f807; +fma.rn.f32 f812, f183, 0fBF5940C0, f808; +fma.rn.f32 f813, f125, 0fBF1CB2FA, %62; +fma.rn.f32 f817, f129, 0fBE805587, f813; +fma.rn.f32 f1207, f128, 0f3F4A7047, 0f00000000; +fma.rn.f32 f818, f132, 0fBF77D3E7, f1207; +fma.rn.f32 f1206, f1272, 0fBF1CB2FA, %63; +fma.rn.f32 f819, f1269, 0fBE805587, f1206; +fma.rn.f32 f1205, f127, 0f3F4A7047, 0f00000000; +fma.rn.f32 f820, f131, 0fBF77D3E7, f1205; +fma.rn.f32 f821, f133, 0f3F6B40D2, f817; +fma.rn.f32 f822, f136, 0f3EC9E903, f818; +fma.rn.f32 f823, f1267, 0f3F6B40D2, f819; +fma.rn.f32 f824, f135, 0f3EC9E903, f820; +fma.rn.f32 f825, f137, 0fBF5FD52E, f821; +fma.rn.f32 f826, f140, 0f3EF87980, f822; +fma.rn.f32 f827, f1265, 0fBF5FD52E, f823; +fma.rn.f32 f828, f139, 0f3EF87980, f824; +fma.rn.f32 f829, f141, 0f3E1B0FE2, f825; +fma.rn.f32 f830, f144, 0fBF7D0C43, f826; +fma.rn.f32 f831, f1262, 0f3E1B0FE2, f827; +fma.rn.f32 f832, f143, 0fBF7D0C43, f828; +fma.rn.f32 f833, f145, 0f3F306023, f829; +fma.rn.f32 f834, f148, 0f3F398C05, f830; +fma.rn.f32 f835, f1260, 0f3F306023, f831; +fma.rn.f32 f836, f147, 0f3F398C05, f832; +fma.rn.f32 f837, f149, 0fBF7EAFC2, f833; +fma.rn.f32 f838, f152, 0f3DCF3156, f834; +fma.rn.f32 f839, f1258, 0fBF7EAFC2, f835; +fma.rn.f32 f840, f151, 0f3DCF3156, f836; +fma.rn.f32 f841, f153, 0f3F076A2F, f837; +fma.rn.f32 f842, f156, 0fBF5940C0, f838; +fma.rn.f32 f843, f1255, 0f3F076A2F, f839; +fma.rn.f32 f844, f155, 0fBF5940C0, f840; +fma.rn.f32 f845, f157, 0f3EB1D1FE, f841; +fma.rn.f32 f846, f160, 0f3F701086, f842; +fma.rn.f32 f847, f1253, 0f3EB1D1FE, f843; +fma.rn.f32 f848, f159, 0f3F701086, f844; +fma.rn.f32 f849, f161, 0fBF744278, 
f845; +fma.rn.f32 f850, f164, 0fBE994620, f846; +fma.rn.f32 f851, f1251, 0fBF744278, f847; +fma.rn.f32 f852, f163, 0fBE994620, f848; +fma.rn.f32 f853, f165, 0f3F521D8E, f849; +fma.rn.f32 f854, f168, 0fBF123EA2, f850; +fma.rn.f32 f855, f1248, 0f3F521D8E, f851; +fma.rn.f32 f856, f167, 0fBF123EA2, f852; +fma.rn.f32 f857, f169, 0fBD4F7581, f853; +fma.rn.f32 f858, f172, 0f3F7FABE3, f854; +fma.rn.f32 f859, f1246, 0fBD4F7581, f855; +fma.rn.f32 f860, f171, 0f3F7FABE3, f856; +fma.rn.f32 f861, f173, 0fBF423DF9, f857; +fma.rn.f32 f862, f176, 0fBF26C059, f858; +fma.rn.f32 f863, f1244, 0fBF423DF9, f859; +fma.rn.f32 f864, f175, 0fBF26C059, f860; +fma.rn.f32 f865, f177, 0f3F7AC279, f861; +fma.rn.f32 f866, f180, 0fBE4E2133, f862; +fma.rn.f32 f867, f1241, 0f3F7AC279, f863; +fma.rn.f32 f868, f179, 0fBE4E2133, f864; +fma.rn.f32 f869, f181, 0fBEE17B58, f865; +fma.rn.f32 f870, f184, 0f3F65D685, f866; +fma.rn.f32 f871, f1239, 0fBEE17B58, f867; +fma.rn.f32 f872, f183, 0f3F65D685, f868; +fma.rn.f32 f873, f125, 0fBF423DF9, %62; +fma.rn.f32 f877, f129, 0f3E1B0FE2, f873; +fma.rn.f32 f1204, f128, 0f3F26C059, 0f00000000; +fma.rn.f32 f878, f132, 0fBF7D0C43, f1204; +fma.rn.f32 f1203, f1272, 0fBF423DF9, %63; +fma.rn.f32 f879, f1269, 0f3E1B0FE2, f1203; +fma.rn.f32 f1202, f127, 0f3F26C059, 0f00000000; +fma.rn.f32 f880, f131, 0fBF7D0C43, f1202; +fma.rn.f32 f881, f133, 0f3F076A2F, f877; +fma.rn.f32 f882, f136, 0f3F5940C0, f878; +fma.rn.f32 f883, f1267, 0f3F076A2F, f879; +fma.rn.f32 f884, f135, 0f3F5940C0, f880; +fma.rn.f32 f885, f137, 0fBF744278, f881; +fma.rn.f32 f886, f140, 0fBE994620, f882; +fma.rn.f32 f887, f1265, 0fBF744278, f883; +fma.rn.f32 f888, f139, 0fBE994620, f884; +fma.rn.f32 f889, f141, 0f3F6B40D2, f885; +fma.rn.f32 f890, f144, 0fBEC9E903, f886; +fma.rn.f32 f891, f1262, 0f3F6B40D2, f887; +fma.rn.f32 f892, f143, 0fBEC9E903, f888; +fma.rn.f32 f893, f145, 0fBEE17B58, f889; +fma.rn.f32 f894, f148, 0f3F65D685, f890; +fma.rn.f32 f895, f1260, 0fBEE17B58, f891; +fma.rn.f32 f896, f147, 
0f3F65D685, f892; +fma.rn.f32 f897, f149, 0fBE805587, f893; +fma.rn.f32 f898, f152, 0fBF77D3E7, f894; +fma.rn.f32 f899, f1258, 0fBE805587, f895; +fma.rn.f32 f900, f151, 0fBF77D3E7, f896; +fma.rn.f32 f901, f153, 0f3F521D8E, f897; +fma.rn.f32 f902, f156, 0f3F123EA2, f898; +fma.rn.f32 f903, f1255, 0f3F521D8E, f899; +fma.rn.f32 f904, f155, 0f3F123EA2, f900; +fma.rn.f32 f905, f157, 0fBF7EAFC2, f901; +fma.rn.f32 f906, f160, 0f3DCF3156, f902; +fma.rn.f32 f907, f1253, 0fBF7EAFC2, f903; +fma.rn.f32 f908, f159, 0f3DCF3156, f904; +fma.rn.f32 f909, f161, 0f3F306023, f905; +fma.rn.f32 f910, f164, 0fBF398C05, f906; +fma.rn.f32 f911, f1251, 0f3F306023, f907; +fma.rn.f32 f912, f163, 0fBF398C05, f908; +fma.rn.f32 f913, f165, 0fBD4F7581, f909; +fma.rn.f32 f914, f168, 0f3F7FABE3, f910; +fma.rn.f32 f915, f1248, 0fBD4F7581, f911; +fma.rn.f32 f916, f167, 0f3F7FABE3, f912; +fma.rn.f32 f917, f169, 0fBF1CB2FA, f913; +fma.rn.f32 f918, f172, 0fBF4A7047, f914; +fma.rn.f32 f919, f1246, 0fBF1CB2FA, f915; +fma.rn.f32 f920, f171, 0fBF4A7047, f916; +fma.rn.f32 f921, f173, 0f3F7AC279, f917; +fma.rn.f32 f922, f176, 0f3E4E2133, f918; +fma.rn.f32 f923, f1244, 0f3F7AC279, f919; +fma.rn.f32 f924, f175, 0f3E4E2133, f920; +fma.rn.f32 f925, f177, 0fBF5FD52E, f921; +fma.rn.f32 f926, f180, 0f3EF87980, f922; +fma.rn.f32 f927, f1241, 0fBF5FD52E, f923; +fma.rn.f32 f928, f179, 0f3EF87980, f924; +fma.rn.f32 f929, f181, 0f3EB1D1FE, f925; +fma.rn.f32 f930, f184, 0fBF701086, f926; +fma.rn.f32 f931, f1239, 0f3EB1D1FE, f927; +fma.rn.f32 f932, f183, 0fBF701086, f928; +fma.rn.f32 f933, f125, 0fBF5FD52E, %62; +fma.rn.f32 f937, f129, 0f3F076A2F, f933; +fma.rn.f32 f1201, f128, 0f3EF87980, 0f00000000; +fma.rn.f32 f938, f132, 0fBF5940C0, f1201; +fma.rn.f32 f1200, f1272, 0fBF5FD52E, %63; +fma.rn.f32 f939, f1269, 0f3F076A2F, f1200; +fma.rn.f32 f1199, f127, 0f3EF87980, 0f00000000; +fma.rn.f32 f940, f131, 0fBF5940C0, f1199; +fma.rn.f32 f941, f133, 0fBD4F7581, f937; +fma.rn.f32 f942, f136, 0f3F7FABE3, f938; +fma.rn.f32 f943, 
f1267, 0fBD4F7581, f939; +fma.rn.f32 f944, f135, 0f3F7FABE3, f940; +fma.rn.f32 f945, f137, 0fBEE17B58, f941; +fma.rn.f32 f946, f140, 0fBF65D685, f942; +fma.rn.f32 f947, f1265, 0fBEE17B58, f943; +fma.rn.f32 f948, f139, 0fBF65D685, f944; +fma.rn.f32 f949, f141, 0f3F521D8E, f945; +fma.rn.f32 f950, f144, 0f3F123EA2, f946; +fma.rn.f32 f951, f1262, 0f3F521D8E, f947; +fma.rn.f32 f952, f143, 0f3F123EA2, f948; +fma.rn.f32 f953, f145, 0fBF7EAFC2, f949; +fma.rn.f32 f954, f148, 0fBDCF3156, f950; +fma.rn.f32 f955, f1260, 0fBF7EAFC2, f951; +fma.rn.f32 f956, f147, 0fBDCF3156, f952; +fma.rn.f32 f957, f149, 0f3F6B40D2, f953; +fma.rn.f32 f958, f152, 0fBEC9E903, f954; +fma.rn.f32 f959, f1258, 0f3F6B40D2, f955; +fma.rn.f32 f960, f151, 0fBEC9E903, f956; +fma.rn.f32 f961, f153, 0fBF1CB2FA, f957; +fma.rn.f32 f962, f156, 0f3F4A7047, f958; +fma.rn.f32 f963, f1255, 0fBF1CB2FA, f959; +fma.rn.f32 f964, f155, 0f3F4A7047, f960; +fma.rn.f32 f965, f157, 0f3E1B0FE2, f961; +fma.rn.f32 f966, f160, 0fBF7D0C43, f962; +fma.rn.f32 f967, f1253, 0f3E1B0FE2, f963; +fma.rn.f32 f968, f159, 0fBF7D0C43, f964; +fma.rn.f32 f969, f161, 0f3EB1D1FE, f965; +fma.rn.f32 f970, f164, 0f3F701086, f966; +fma.rn.f32 f971, f1251, 0f3EB1D1FE, f967; +fma.rn.f32 f972, f163, 0f3F701086, f968; +fma.rn.f32 f973, f165, 0fBF423DF9, f969; +fma.rn.f32 f974, f168, 0fBF26C059, f970; +fma.rn.f32 f975, f1248, 0fBF423DF9, f971; +fma.rn.f32 f976, f167, 0fBF26C059, f972; +fma.rn.f32 f977, f169, 0f3F7AC279, f973; +fma.rn.f32 f978, f172, 0f3E4E2133, f974; +fma.rn.f32 f979, f1246, 0f3F7AC279, f975; +fma.rn.f32 f980, f171, 0f3E4E2133, f976; +fma.rn.f32 f981, f173, 0fBF744278, f977; +fma.rn.f32 f982, f176, 0f3E994620, f978; +fma.rn.f32 f983, f1244, 0fBF744278, f979; +fma.rn.f32 f984, f175, 0f3E994620, f980; +fma.rn.f32 f985, f177, 0f3F306023, f981; +fma.rn.f32 f986, f180, 0fBF398C05, f982; +fma.rn.f32 f987, f1241, 0f3F306023, f983; +fma.rn.f32 f988, f179, 0fBF398C05, f984; +fma.rn.f32 f989, f181, 0fBE805587, f985; +fma.rn.f32 f990, f184, 
0f3F77D3E7, f986; +fma.rn.f32 f991, f1239, 0fBE805587, f987; +fma.rn.f32 f992, f183, 0f3F77D3E7, f988; +fma.rn.f32 f993, f125, 0fBF744278, %62; +fma.rn.f32 f997, f129, 0f3F521D8E, f993; +fma.rn.f32 f1198, f128, 0f3E994620, 0f00000000; +fma.rn.f32 f998, f132, 0fBF123EA2, f1198; +fma.rn.f32 f1197, f1272, 0fBF744278, %63; +fma.rn.f32 f999, f1269, 0f3F521D8E, f1197; +fma.rn.f32 f1196, f127, 0f3E994620, 0f00000000; +fma.rn.f32 f1000, f131, 0fBF123EA2, f1196; +fma.rn.f32 f1001, f133, 0fBF1CB2FA, f997; +fma.rn.f32 f1002, f136, 0f3F4A7047, f998; +fma.rn.f32 f1003, f1267, 0fBF1CB2FA, f999; +fma.rn.f32 f1004, f135, 0f3F4A7047, f1000; +fma.rn.f32 f1005, f137, 0f3EB1D1FE, f1001; +fma.rn.f32 f1006, f140, 0fBF701086, f1002; +fma.rn.f32 f1007, f1265, 0f3EB1D1FE, f1003; +fma.rn.f32 f1008, f139, 0fBF701086, f1004; +fma.rn.f32 f1009, f141, 0fBD4F7581, f1005; +fma.rn.f32 f1010, f144, 0f3F7FABE3, f1006; +fma.rn.f32 f1011, f1262, 0fBD4F7581, f1007; +fma.rn.f32 f1012, f143, 0f3F7FABE3, f1008; +fma.rn.f32 f1013, f145, 0fBE805587, f1009; +fma.rn.f32 f1014, f148, 0fBF77D3E7, f1010; +fma.rn.f32 f1015, f1260, 0fBE805587, f1011; +fma.rn.f32 f1016, f147, 0fBF77D3E7, f1012; +fma.rn.f32 f1017, f149, 0f3F076A2F, f1013; +fma.rn.f32 f1018, f152, 0f3F5940C0, f1014; +fma.rn.f32 f1019, f1258, 0f3F076A2F, f1015; +fma.rn.f32 f1020, f151, 0f3F5940C0, f1016; +fma.rn.f32 f1021, f153, 0fBF423DF9, f1017; +fma.rn.f32 f1022, f156, 0fBF26C059, f1018; +fma.rn.f32 f1023, f1255, 0fBF423DF9, f1019; +fma.rn.f32 f1024, f155, 0fBF26C059, f1020; +fma.rn.f32 f1025, f157, 0f3F6B40D2, f1021; +fma.rn.f32 f1026, f160, 0f3EC9E903, f1022; +fma.rn.f32 f1027, f1253, 0f3F6B40D2, f1023; +fma.rn.f32 f1028, f159, 0f3EC9E903, f1024; +fma.rn.f32 f1029, f161, 0fBF7EAFC2, f1025; +fma.rn.f32 f1030, f164, 0fBDCF3156, f1026; +fma.rn.f32 f1031, f1251, 0fBF7EAFC2, f1027; +fma.rn.f32 f1032, f163, 0fBDCF3156, f1028; +fma.rn.f32 f1033, f165, 0f3F7AC279, f1029; +fma.rn.f32 f1034, f168, 0fBE4E2133, f1030; +fma.rn.f32 f1035, f1248, 0f3F7AC279, 
f1031; +fma.rn.f32 f1036, f167, 0fBE4E2133, f1032; +fma.rn.f32 f1037, f169, 0fBF5FD52E, f1033; +fma.rn.f32 f1038, f172, 0f3EF87980, f1034; +fma.rn.f32 f1039, f1246, 0fBF5FD52E, f1035; +fma.rn.f32 f1040, f171, 0f3EF87980, f1036; +fma.rn.f32 f1041, f173, 0f3F306023, f1037; +fma.rn.f32 f1042, f176, 0fBF398C05, f1038; +fma.rn.f32 f1043, f1244, 0f3F306023, f1039; +fma.rn.f32 f1044, f175, 0fBF398C05, f1040; +fma.rn.f32 f1045, f177, 0fBEE17B58, f1041; +fma.rn.f32 f1046, f180, 0f3F65D685, f1042; +fma.rn.f32 f1047, f1241, 0fBEE17B58, f1043; +fma.rn.f32 f1048, f179, 0f3F65D685, f1044; +fma.rn.f32 f1049, f181, 0f3E1B0FE2, f1045; +fma.rn.f32 f1050, f184, 0fBF7D0C43, f1046; +fma.rn.f32 f1051, f1239, 0f3E1B0FE2, f1047; +fma.rn.f32 f1052, f183, 0fBF7D0C43, f1048; +fma.rn.f32 f1053, f125, 0fBF7EAFC2, %62; +fma.rn.f32 f1054, f128, 0f3DCF3156, 0f00000000; +fma.rn.f32 f1055, f1272, 0fBF7EAFC2, %63; +fma.rn.f32 f1056, f127, 0f3DCF3156, 0f00000000; +fma.rn.f32 f1057, f129, 0f3F7AC279, f1053; +fma.rn.f32 f1058, f132, 0fBE4E2133, f1054; +fma.rn.f32 f1059, f1269, 0f3F7AC279, f1055; +fma.rn.f32 f1060, f131, 0fBE4E2133, f1056; +fma.rn.f32 f1061, f133, 0fBF744278, f1057; +fma.rn.f32 f1062, f136, 0f3E994620, f1058; +fma.rn.f32 f1063, f1267, 0fBF744278, f1059; +fma.rn.f32 f1064, f135, 0f3E994620, f1060; +fma.rn.f32 f1065, f137, 0f3F6B40D2, f1061; +fma.rn.f32 f1066, f140, 0fBEC9E903, f1062; +fma.rn.f32 f1067, f1265, 0f3F6B40D2, f1063; +fma.rn.f32 f1068, f139, 0fBEC9E903, f1064; +fma.rn.f32 f1069, f141, 0fBF5FD52E, f1065; +fma.rn.f32 f1070, f144, 0f3EF87980, f1066; +fma.rn.f32 f1071, f1262, 0fBF5FD52E, f1067; +fma.rn.f32 f1072, f143, 0f3EF87980, f1068; +fma.rn.f32 f1073, f145, 0f3F521D8E, f1069; +fma.rn.f32 f1074, f148, 0fBF123EA2, f1070; +fma.rn.f32 f1075, f1260, 0f3F521D8E, f1071; +fma.rn.f32 f1076, f147, 0fBF123EA2, f1072; +fma.rn.f32 f1077, f149, 0fBF423DF9, f1073; +fma.rn.f32 f1078, f152, 0f3F26C059, f1074; +fma.rn.f32 f1079, f1258, 0fBF423DF9, f1075; +fma.rn.f32 f1080, f151, 0f3F26C059, 
f1076; +fma.rn.f32 f1081, f153, 0f3F306023, f1077; +fma.rn.f32 f1082, f156, 0fBF398C05, f1078; +fma.rn.f32 f1083, f1255, 0f3F306023, f1079; +fma.rn.f32 f1084, f155, 0fBF398C05, f1080; +fma.rn.f32 f1085, f157, 0fBF1CB2FA, f1081; +fma.rn.f32 f1086, f160, 0f3F4A7047, f1082; +fma.rn.f32 f1087, f1253, 0fBF1CB2FA, f1083; +fma.rn.f32 f1088, f159, 0f3F4A7047, f1084; +fma.rn.f32 f1089, f161, 0f3F076A2F, f1085; +fma.rn.f32 f1090, f164, 0fBF5940C0, f1086; +fma.rn.f32 f1091, f1251, 0f3F076A2F, f1087; +fma.rn.f32 f1092, f163, 0fBF5940C0, f1088; +fma.rn.f32 f1093, f165, 0fBEE17B58, f1089; +fma.rn.f32 f1094, f168, 0f3F65D685, f1090; +fma.rn.f32 f1095, f1248, 0fBEE17B58, f1091; +fma.rn.f32 f1096, f167, 0f3F65D685, f1092; +fma.rn.f32 f1097, f169, 0f3EB1D1FE, f1093; +fma.rn.f32 f1098, f172, 0fBF701086, f1094; +fma.rn.f32 f1099, f1246, 0f3EB1D1FE, f1095; +fma.rn.f32 f1100, f171, 0fBF701086, f1096; +fma.rn.f32 f1101, f173, 0fBE805587, f1097; +fma.rn.f32 f1102, f176, 0f3F77D3E7, f1098; +fma.rn.f32 f1103, f1244, 0fBE805587, f1099; +fma.rn.f32 f1104, f175, 0f3F77D3E7, f1100; +fma.rn.f32 f1105, f177, 0f3E1B0FE2, f1101; +fma.rn.f32 f1106, f180, 0fBF7D0C43, f1102; +fma.rn.f32 f1107, f1241, 0f3E1B0FE2, f1103; +fma.rn.f32 f1108, f179, 0fBF7D0C43, f1104; +fma.rn.f32 f1109, f181, 0fBD4F7581, f1105; +fma.rn.f32 f1110, f184, 0f3F7FABE3, f1106; +fma.rn.f32 f1111, f1239, 0fBD4F7581, f1107; +fma.rn.f32 f1112, f183, 0f3F7FABE3, f1108; +add.f32 %1, f212, f1239; +add.f32 %0, f211, f181; +sub.f32 %2, f269, f270; +add.f32 %3, f271, f272; +sub.f32 %4, f329, f330; +add.f32 %5, f331, f332; +sub.f32 %6, f389, f390; +add.f32 %7, f391, f392; +add.f32 %9, f451, f452; +sub.f32 %8, f449, f450; +add.f32 %11, f511, f512; +sub.f32 %10, f509, f510; +add.f32 %13, f571, f572; +sub.f32 %12, f569, f570; +sub.f32 %14, f629, f630; +add.f32 %15, f631, f632; +sub.f32 %16, f689, f690; +add.f32 %17, f691, f692; +sub.f32 %18, f749, f750; +add.f32 %19, f751, f752; +add.f32 %21, f811, f812; +sub.f32 %20, f809, f810; +add.f32 %23, 
f871, f872; +sub.f32 %22, f869, f870; +add.f32 %25, f931, f932; +sub.f32 %24, f929, f930; +sub.f32 %26, f989, f990; +add.f32 %27, f991, f992; +sub.f32 %28, f1049, f1050; +add.f32 %29, f1051, f1052; +sub.f32 %30, f1109, f1110; +add.f32 %31, f1111, f1112; +sub.f32 %33, f1111, f1112; +add.f32 %32, f1109, f1110; +sub.f32 %35, f1051, f1052; +add.f32 %34, f1049, f1050; +sub.f32 %37, f991, f992; +add.f32 %36, f989, f990; +sub.f32 %39, f931, f932; +add.f32 %38, f929, f930; +sub.f32 %41, f871, f872; +add.f32 %40, f869, f870; +sub.f32 %43, f811, f812; +add.f32 %42, f809, f810; +sub.f32 %45, f751, f752; +add.f32 %44, f749, f750; +sub.f32 %47, f691, f692; +add.f32 %46, f689, f690; +sub.f32 %49, f631, f632; +add.f32 %48, f629, f630; +sub.f32 %51, f571, f572; +add.f32 %50, f569, f570; +sub.f32 %53, f511, f512; +add.f32 %52, f509, f510; +sub.f32 %55, f451, f452; +add.f32 %54, f449, f450; +sub.f32 %57, f391, f392; +add.f32 %56, f389, f390; +sub.f32 %59, f331, f332; +add.f32 %58, f329, f330; +sub.f32 %61, f271, f272; +add.f32 %60, f269, f270; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), 
"=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[2].y), "f"(rmem[29].y), "f"(rmem[28].y), "f"(rmem[4].y), "f"(rmem[5].y), "f"(rmem[26].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[8].y), "f"(rmem[23].y), "f"(rmem[22].y), "f"(rmem[10].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[19].y), "f"(rmem[13].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..5bd04cda065d0 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_fwd.hpp.inc @@ -0,0 +1,1066 @@ +#ifndef CUFFTDX_FFT_31_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_31_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<419, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<1273>; +.reg .b64 rd<4>; +add.f64 fd125, %64, %123; +sub.f64 fd127, %64, %123; +add.f64 fd1272, %66, %124; +sub.f64 fd128, %66, %124; +add.f64 fd129, %67, %121; +sub.f64 fd131, %67, %121; +add.f64 fd1269, %125, %126; +sub.f64 fd132, %125, %126; +add.f64 fd133, %69, %119; +sub.f64 fd135, %69, %119; +add.f64 fd1267, %70, %127; +sub.f64 fd136, %70, %127; +add.f64 fd137, %71, %117; +sub.f64 fd139, %71, %117; +add.f64 fd1265, %128, %118; +sub.f64 fd140, %128, %118; +add.f64 fd141, %73, %115; +sub.f64 fd143, %73, %115; +add.f64 fd1262, %129, %130; +sub.f64 fd144, %129, %130; +add.f64 fd145, %75, %113; +sub.f64 fd147, %75, %113; +add.f64 fd1260, %76, %131; +sub.f64 fd148, %76, %131; +add.f64 fd149, %77, %111; +sub.f64 fd151, %77, %111; +add.f64 fd1258, %132, %112; +sub.f64 fd152, %132, %112; +add.f64 fd153, %79, %109; +sub.f64 fd155, %79, %109; +add.f64 fd1255, %133, %134; +sub.f64 fd156, %133, %134; +add.f64 fd157, %81, %107; +sub.f64 fd159, %81, %107; +add.f64 fd1253, %82, %135; +sub.f64 fd160, %82, %135; +add.f64 fd161, %83, %105; +sub.f64 fd163, %83, %105; +add.f64 fd1251, %136, %106; +sub.f64 fd164, %136, %106; +add.f64 fd165, %85, %103; +sub.f64 fd167, %85, %103; +add.f64 fd1248, %137, %138; +sub.f64 fd168, %137, %138; +add.f64 fd169, %87, %101; +sub.f64 fd171, %87, %101; +add.f64 fd1246, %88, %139; +sub.f64 fd172, %88, %139; +add.f64 fd173, %89, %99; +sub.f64 fd175, %89, %99; +add.f64 fd1244, %140, %100; +sub.f64 fd176, %140, %100; +add.f64 fd177, %91, %97; +sub.f64 fd179, %91, %97; +add.f64 fd1241, %141, %142; +sub.f64 fd180, %141, %142; +add.f64 fd181, %93, %95; 
+sub.f64 fd183, %93, %95; +add.f64 fd1239, %94, %143; +sub.f64 fd184, %94, %143; +add.f64 fd185, %62, fd125; +add.f64 fd187, fd185, fd129; +add.f64 fd1238, %63, fd1272; +add.f64 fd188, fd1238, fd1269; +add.f64 fd189, fd187, fd133; +add.f64 fd190, fd188, fd1267; +add.f64 fd191, fd189, fd137; +add.f64 fd192, fd190, fd1265; +add.f64 fd193, fd191, fd141; +add.f64 fd194, fd192, fd1262; +add.f64 fd195, fd193, fd145; +add.f64 fd196, fd194, fd1260; +add.f64 fd197, fd195, fd149; +add.f64 fd198, fd196, fd1258; +add.f64 fd199, fd197, fd153; +add.f64 fd200, fd198, fd1255; +add.f64 fd201, fd199, fd157; +add.f64 fd202, fd200, fd1253; +add.f64 fd203, fd201, fd161; +add.f64 fd204, fd202, fd1251; +add.f64 fd205, fd203, fd165; +add.f64 fd206, fd204, fd1248; +add.f64 fd207, fd205, fd169; +add.f64 fd208, fd206, fd1246; +add.f64 fd209, fd207, fd173; +add.f64 fd210, fd208, fd1244; +add.f64 fd211, fd209, fd177; +add.f64 fd212, fd210, fd1241; +fma.rn.f64 fd213, fd125, 0d3FEF584F2CE43B84, %62; +fma.rn.f64 fd217, fd129, 0d3FED681A366A00FA, fd213; +fma.rn.f64 fd1237, fd128, 0dBFC9C4266041CA8F, 0d0000000000000000; +fma.rn.f64 fd218, fd132, 0dBFD93D20572CA90B, fd1237; +fma.rn.f64 fd1236, fd1272, 0d3FEF584F2CE43B84, %63; +fma.rn.f64 fd219, fd1269, 0d3FED681A366A00FA, fd1236; +fma.rn.f64 fd1235, fd127, 0dBFC9C4266041CA8F, 0d0000000000000000; +fma.rn.f64 fd220, fd131, 0dBFD93D20572CA90B, fd1235; +fma.rn.f64 fd221, fd133, 0d3FEA43B1B1379AFF, fd217; +fma.rn.f64 fd222, fd136, 0dBFE247D447A27216, fd218; +fma.rn.f64 fd223, fd1267, 0d3FEA43B1B1379AFF, fd219; +fma.rn.f64 fd224, fd135, 0dBFE247D447A27216, fd220; +fma.rn.f64 fd225, fd137, 0d3FE60C045A2E9729, fd221; +fma.rn.f64 fd226, fd140, 0dBFE73180A4B0D300, fd222; +fma.rn.f64 fd227, fd1265, 0d3FE60C045A2E9729, fd223; +fma.rn.f64 fd228, fd139, 0dBFE73180A4B0D300, fd224; +fma.rn.f64 fd229, fd141, 0d3FE0ED45EEA3B09F, fd225; +fma.rn.f64 fd230, fd144, 0dBFEB2818007C19DF, fd226; +fma.rn.f64 fd231, fd1262, 0d3FE0ED45EEA3B09F, fd227; +fma.rn.f64 fd232, fd143, 
0dBFEB2818007C19DF, fd228; +fma.rn.f64 fd233, fd145, 0d3FD63A3FCFACA412, fd229; +fma.rn.f64 fd234, fd148, 0dBFEE0210C26A6E6F, fd230; +fma.rn.f64 fd235, fd1260, 0d3FD63A3FCFACA412, fd231; +fma.rn.f64 fd236, fd147, 0dBFEE0210C26A6E6F, fd232; +fma.rn.f64 fd237, fd149, 0d3FC361FC440B478F, fd233; +fma.rn.f64 fd238, fd152, 0dBFEFA18852C3E08A, fd234; +fma.rn.f64 fd239, fd1258, 0d3FC361FC440B478F, fd235; +fma.rn.f64 fd240, fd151, 0dBFEFA18852C3E08A, fd236; +fma.rn.f64 fd241, fd153, 0dBFA9EEB01776B57D, fd237; +fma.rn.f64 fd242, fd156, 0dBFEFF57C5208CCF9, fd238; +fma.rn.f64 fd243, fd1255, 0dBFA9EEB01776B57D, fd239; +fma.rn.f64 fd244, fd155, 0dBFEFF57C5208CCF9, fd240; +fma.rn.f64 fd245, fd157, 0dBFD00AB0EB2D7D94, fd241; +fma.rn.f64 fd246, fd160, 0dBFEEFA7CDDB128FA, fd242; +fma.rn.f64 fd247, fd1253, 0dBFD00AB0EB2D7D94, fd243; +fma.rn.f64 fd248, fd159, 0dBFEEFA7CDDB128FA, fd244; +fma.rn.f64 fd249, fd161, 0dBFDC2F6AF3928A8E, fd245; +fma.rn.f64 fd250, fd164, 0dBFECBAD095F50378, fd246; +fma.rn.f64 fd251, fd1251, 0dBFDC2F6AF3928A8E, fd247; +fma.rn.f64 fd252, fd163, 0dBFECBAD095F50378, fd248; +fma.rn.f64 fd253, fd165, 0dBFE3965F49174D13, fd249; +fma.rn.f64 fd254, fd168, 0dBFE94E08EB13C451, fd250; +fma.rn.f64 fd255, fd1248, 0dBFE3965F49174D13, fd251; +fma.rn.f64 fd256, fd167, 0dBFE94E08EB13C451, fd252; +fma.rn.f64 fd257, fd169, 0dBFE847BF1D5146CC, fd253; +fma.rn.f64 fd258, fd172, 0dBFE4D80B1AD9CCF6, fd254; +fma.rn.f64 fd259, fd1246, 0dBFE847BF1D5146CC, fd255; +fma.rn.f64 fd260, fd171, 0dBFE4D80B1AD9CCF6, fd256; +fma.rn.f64 fd261, fd173, 0dBFEBFAA5C136B224, fd257; +fma.rn.f64 fd262, fd176, 0dBFDF0F2FF6705BEC, fd258; +fma.rn.f64 fd263, fd1244, 0dBFEBFAA5C136B224, fd259; +fma.rn.f64 fd264, fd175, 0dBFDF0F2FF6705BEC, fd260; +fma.rn.f64 fd265, fd177, 0dBFEE884F0CC22CCC, fd261; +fma.rn.f64 fd266, fd180, 0dBFD328C3F1B322CB, fd262; +fma.rn.f64 fd267, fd1241, 0dBFEE884F0CC22CCC, fd263; +fma.rn.f64 fd268, fd179, 0dBFD328C3F1B322CB, fd264; +fma.rn.f64 fd269, fd181, 0dBFEFD5F830F860F9, fd265; 
+fma.rn.f64 fd270, fd184, 0dBFB9E62ACA53C49F, fd266; +fma.rn.f64 fd271, fd1239, 0dBFEFD5F830F860F9, fd267; +fma.rn.f64 fd272, fd183, 0dBFB9E62ACA53C49F, fd268; +fma.rn.f64 fd273, fd125, 0d3FED681A366A00FA, %62; +fma.rn.f64 fd277, fd129, 0d3FE60C045A2E9729, fd273; +fma.rn.f64 fd1234, fd128, 0dBFD93D20572CA90B, 0d0000000000000000; +fma.rn.f64 fd278, fd132, 0dBFE73180A4B0D300, fd1234; +fma.rn.f64 fd1233, fd1272, 0d3FED681A366A00FA, %63; +fma.rn.f64 fd279, fd1269, 0d3FE60C045A2E9729, fd1233; +fma.rn.f64 fd1232, fd127, 0dBFD93D20572CA90B, 0d0000000000000000; +fma.rn.f64 fd280, fd131, 0dBFE73180A4B0D300, fd1232; +fma.rn.f64 fd281, fd133, 0d3FD63A3FCFACA412, fd277; +fma.rn.f64 fd282, fd136, 0dBFEE0210C26A6E6F, fd278; +fma.rn.f64 fd283, fd1267, 0d3FD63A3FCFACA412, fd279; +fma.rn.f64 fd284, fd135, 0dBFEE0210C26A6E6F, fd280; +fma.rn.f64 fd285, fd137, 0dBFA9EEB01776B57D, fd281; +fma.rn.f64 fd286, fd140, 0dBFEFF57C5208CCF9, fd282; +fma.rn.f64 fd287, fd1265, 0dBFA9EEB01776B57D, fd283; +fma.rn.f64 fd288, fd139, 0dBFEFF57C5208CCF9, fd284; +fma.rn.f64 fd289, fd141, 0dBFDC2F6AF3928A8E, fd285; +fma.rn.f64 fd290, fd144, 0dBFECBAD095F50378, fd286; +fma.rn.f64 fd291, fd1262, 0dBFDC2F6AF3928A8E, fd287; +fma.rn.f64 fd292, fd143, 0dBFECBAD095F50378, fd288; +fma.rn.f64 fd293, fd145, 0dBFE847BF1D5146CC, fd289; +fma.rn.f64 fd294, fd148, 0dBFE4D80B1AD9CCF6, fd290; +fma.rn.f64 fd295, fd1260, 0dBFE847BF1D5146CC, fd291; +fma.rn.f64 fd296, fd147, 0dBFE4D80B1AD9CCF6, fd292; +fma.rn.f64 fd297, fd149, 0dBFEE884F0CC22CCC, fd293; +fma.rn.f64 fd298, fd152, 0dBFD328C3F1B322CB, fd294; +fma.rn.f64 fd299, fd1258, 0dBFEE884F0CC22CCC, fd295; +fma.rn.f64 fd300, fd151, 0dBFD328C3F1B322CB, fd296; +fma.rn.f64 fd301, fd153, 0dBFEFD5F830F860F9, fd297; +fma.rn.f64 fd302, fd156, 0d3FB9E62ACA53C49F, fd298; +fma.rn.f64 fd303, fd1255, 0dBFEFD5F830F860F9, fd299; +fma.rn.f64 fd304, fd155, 0d3FB9E62ACA53C49F, fd300; +fma.rn.f64 fd305, fd157, 0dBFEBFAA5C136B224, fd301; +fma.rn.f64 fd306, fd160, 0d3FDF0F2FF6705BEC, fd302; 
+fma.rn.f64 fd307, fd1253, 0dBFEBFAA5C136B224, fd303; +fma.rn.f64 fd308, fd159, 0d3FDF0F2FF6705BEC, fd304; +fma.rn.f64 fd309, fd161, 0dBFE3965F49174D13, fd305; +fma.rn.f64 fd310, fd164, 0d3FE94E08EB13C451, fd306; +fma.rn.f64 fd311, fd1251, 0dBFE3965F49174D13, fd307; +fma.rn.f64 fd312, fd163, 0d3FE94E08EB13C451, fd308; +fma.rn.f64 fd313, fd165, 0dBFD00AB0EB2D7D94, fd309; +fma.rn.f64 fd314, fd168, 0d3FEEFA7CDDB128FA, fd310; +fma.rn.f64 fd315, fd1248, 0dBFD00AB0EB2D7D94, fd311; +fma.rn.f64 fd316, fd167, 0d3FEEFA7CDDB128FA, fd312; +fma.rn.f64 fd317, fd169, 0d3FC361FC440B478F, fd313; +fma.rn.f64 fd318, fd172, 0d3FEFA18852C3E08A, fd314; +fma.rn.f64 fd319, fd1246, 0d3FC361FC440B478F, fd315; +fma.rn.f64 fd320, fd171, 0d3FEFA18852C3E08A, fd316; +fma.rn.f64 fd321, fd173, 0d3FE0ED45EEA3B09F, fd317; +fma.rn.f64 fd322, fd176, 0d3FEB2818007C19DF, fd318; +fma.rn.f64 fd323, fd1244, 0d3FE0ED45EEA3B09F, fd319; +fma.rn.f64 fd324, fd175, 0d3FEB2818007C19DF, fd320; +fma.rn.f64 fd325, fd177, 0d3FEA43B1B1379AFF, fd321; +fma.rn.f64 fd326, fd180, 0d3FE247D447A27216, fd322; +fma.rn.f64 fd327, fd1241, 0d3FEA43B1B1379AFF, fd323; +fma.rn.f64 fd328, fd179, 0d3FE247D447A27216, fd324; +fma.rn.f64 fd329, fd181, 0d3FEF584F2CE43B84, fd325; +fma.rn.f64 fd330, fd184, 0d3FC9C4266041CA8F, fd326; +fma.rn.f64 fd331, fd1239, 0d3FEF584F2CE43B84, fd327; +fma.rn.f64 fd332, fd183, 0d3FC9C4266041CA8F, fd328; +fma.rn.f64 fd333, fd125, 0d3FEA43B1B1379AFF, %62; +fma.rn.f64 fd337, fd129, 0d3FD63A3FCFACA412, fd333; +fma.rn.f64 fd1231, fd128, 0dBFE247D447A27216, 0d0000000000000000; +fma.rn.f64 fd338, fd132, 0dBFEE0210C26A6E6F, fd1231; +fma.rn.f64 fd1230, fd1272, 0d3FEA43B1B1379AFF, %63; +fma.rn.f64 fd339, fd1269, 0d3FD63A3FCFACA412, fd1230; +fma.rn.f64 fd1229, fd127, 0dBFE247D447A27216, 0d0000000000000000; +fma.rn.f64 fd340, fd131, 0dBFEE0210C26A6E6F, fd1229; +fma.rn.f64 fd341, fd133, 0dBFD00AB0EB2D7D94, fd337; +fma.rn.f64 fd342, fd136, 0dBFEEFA7CDDB128FA, fd338; +fma.rn.f64 fd343, fd1267, 0dBFD00AB0EB2D7D94, fd339; 
+fma.rn.f64 fd344, fd135, 0dBFEEFA7CDDB128FA, fd340; +fma.rn.f64 fd345, fd137, 0dBFE847BF1D5146CC, fd341; +fma.rn.f64 fd346, fd140, 0dBFE4D80B1AD9CCF6, fd342; +fma.rn.f64 fd347, fd1265, 0dBFE847BF1D5146CC, fd343; +fma.rn.f64 fd348, fd139, 0dBFE4D80B1AD9CCF6, fd344; +fma.rn.f64 fd349, fd141, 0dBFEFD5F830F860F9, fd345; +fma.rn.f64 fd350, fd144, 0dBFB9E62ACA53C49F, fd346; +fma.rn.f64 fd351, fd1262, 0dBFEFD5F830F860F9, fd347; +fma.rn.f64 fd352, fd143, 0dBFB9E62ACA53C49F, fd348; +fma.rn.f64 fd353, fd145, 0dBFEBFAA5C136B224, fd349; +fma.rn.f64 fd354, fd148, 0d3FDF0F2FF6705BEC, fd350; +fma.rn.f64 fd355, fd1260, 0dBFEBFAA5C136B224, fd351; +fma.rn.f64 fd356, fd147, 0d3FDF0F2FF6705BEC, fd352; +fma.rn.f64 fd357, fd149, 0dBFDC2F6AF3928A8E, fd353; +fma.rn.f64 fd358, fd152, 0d3FECBAD095F50378, fd354; +fma.rn.f64 fd359, fd1258, 0dBFDC2F6AF3928A8E, fd355; +fma.rn.f64 fd360, fd151, 0d3FECBAD095F50378, fd356; +fma.rn.f64 fd361, fd153, 0d3FC361FC440B478F, fd357; +fma.rn.f64 fd362, fd156, 0d3FEFA18852C3E08A, fd358; +fma.rn.f64 fd363, fd1255, 0d3FC361FC440B478F, fd359; +fma.rn.f64 fd364, fd155, 0d3FEFA18852C3E08A, fd360; +fma.rn.f64 fd365, fd157, 0d3FE60C045A2E9729, fd361; +fma.rn.f64 fd366, fd160, 0d3FE73180A4B0D300, fd362; +fma.rn.f64 fd367, fd1253, 0d3FE60C045A2E9729, fd363; +fma.rn.f64 fd368, fd159, 0d3FE73180A4B0D300, fd364; +fma.rn.f64 fd369, fd161, 0d3FEF584F2CE43B84, fd365; +fma.rn.f64 fd370, fd164, 0d3FC9C4266041CA8F, fd366; +fma.rn.f64 fd371, fd1251, 0d3FEF584F2CE43B84, fd367; +fma.rn.f64 fd372, fd163, 0d3FC9C4266041CA8F, fd368; +fma.rn.f64 fd373, fd165, 0d3FED681A366A00FA, fd369; +fma.rn.f64 fd374, fd168, 0dBFD93D20572CA90B, fd370; +fma.rn.f64 fd375, fd1248, 0d3FED681A366A00FA, fd371; +fma.rn.f64 fd376, fd167, 0dBFD93D20572CA90B, fd372; +fma.rn.f64 fd377, fd169, 0d3FE0ED45EEA3B09F, fd373; +fma.rn.f64 fd378, fd172, 0dBFEB2818007C19DF, fd374; +fma.rn.f64 fd379, fd1246, 0d3FE0ED45EEA3B09F, fd375; +fma.rn.f64 fd380, fd171, 0dBFEB2818007C19DF, fd376; +fma.rn.f64 fd381, fd173, 
0dBFA9EEB01776B57D, fd377; +fma.rn.f64 fd382, fd176, 0dBFEFF57C5208CCF9, fd378; +fma.rn.f64 fd383, fd1244, 0dBFA9EEB01776B57D, fd379; +fma.rn.f64 fd384, fd175, 0dBFEFF57C5208CCF9, fd380; +fma.rn.f64 fd385, fd177, 0dBFE3965F49174D13, fd381; +fma.rn.f64 fd386, fd180, 0dBFE94E08EB13C451, fd382; +fma.rn.f64 fd387, fd1241, 0dBFE3965F49174D13, fd383; +fma.rn.f64 fd388, fd179, 0dBFE94E08EB13C451, fd384; +fma.rn.f64 fd389, fd181, 0dBFEE884F0CC22CCC, fd385; +fma.rn.f64 fd390, fd184, 0dBFD328C3F1B322CB, fd386; +fma.rn.f64 fd391, fd1239, 0dBFEE884F0CC22CCC, fd387; +fma.rn.f64 fd392, fd183, 0dBFD328C3F1B322CB, fd388; +fma.rn.f64 fd393, fd125, 0d3FE60C045A2E9729, %62; +fma.rn.f64 fd397, fd129, 0dBFA9EEB01776B57D, fd393; +fma.rn.f64 fd1228, fd128, 0dBFE73180A4B0D300, 0d0000000000000000; +fma.rn.f64 fd398, fd132, 0dBFEFF57C5208CCF9, fd1228; +fma.rn.f64 fd1227, fd1272, 0d3FE60C045A2E9729, %63; +fma.rn.f64 fd399, fd1269, 0dBFA9EEB01776B57D, fd1227; +fma.rn.f64 fd1226, fd127, 0dBFE73180A4B0D300, 0d0000000000000000; +fma.rn.f64 fd400, fd131, 0dBFEFF57C5208CCF9, fd1226; +fma.rn.f64 fd401, fd133, 0dBFE847BF1D5146CC, fd397; +fma.rn.f64 fd402, fd136, 0dBFE4D80B1AD9CCF6, fd398; +fma.rn.f64 fd403, fd1267, 0dBFE847BF1D5146CC, fd399; +fma.rn.f64 fd404, fd135, 0dBFE4D80B1AD9CCF6, fd400; +fma.rn.f64 fd405, fd137, 0dBFEFD5F830F860F9, fd401; +fma.rn.f64 fd406, fd140, 0d3FB9E62ACA53C49F, fd402; +fma.rn.f64 fd407, fd1265, 0dBFEFD5F830F860F9, fd403; +fma.rn.f64 fd408, fd139, 0d3FB9E62ACA53C49F, fd404; +fma.rn.f64 fd409, fd141, 0dBFE3965F49174D13, fd405; +fma.rn.f64 fd410, fd144, 0d3FE94E08EB13C451, fd406; +fma.rn.f64 fd411, fd1262, 0dBFE3965F49174D13, fd407; +fma.rn.f64 fd412, fd143, 0d3FE94E08EB13C451, fd408; +fma.rn.f64 fd413, fd145, 0d3FC361FC440B478F, fd409; +fma.rn.f64 fd414, fd148, 0d3FEFA18852C3E08A, fd410; +fma.rn.f64 fd415, fd1260, 0d3FC361FC440B478F, fd411; +fma.rn.f64 fd416, fd147, 0d3FEFA18852C3E08A, fd412; +fma.rn.f64 fd417, fd149, 0d3FEA43B1B1379AFF, fd413; +fma.rn.f64 fd418, fd152, 
0d3FE247D447A27216, fd414; +fma.rn.f64 fd419, fd1258, 0d3FEA43B1B1379AFF, fd415; +fma.rn.f64 fd420, fd151, 0d3FE247D447A27216, fd416; +fma.rn.f64 fd421, fd153, 0d3FEF584F2CE43B84, fd417; +fma.rn.f64 fd422, fd156, 0dBFC9C4266041CA8F, fd418; +fma.rn.f64 fd423, fd1255, 0d3FEF584F2CE43B84, fd419; +fma.rn.f64 fd424, fd155, 0dBFC9C4266041CA8F, fd420; +fma.rn.f64 fd425, fd157, 0d3FE0ED45EEA3B09F, fd421; +fma.rn.f64 fd426, fd160, 0dBFEB2818007C19DF, fd422; +fma.rn.f64 fd427, fd1253, 0d3FE0ED45EEA3B09F, fd423; +fma.rn.f64 fd428, fd159, 0dBFEB2818007C19DF, fd424; +fma.rn.f64 fd429, fd161, 0dBFD00AB0EB2D7D94, fd425; +fma.rn.f64 fd430, fd164, 0dBFEEFA7CDDB128FA, fd426; +fma.rn.f64 fd431, fd1251, 0dBFD00AB0EB2D7D94, fd427; +fma.rn.f64 fd432, fd163, 0dBFEEFA7CDDB128FA, fd428; +fma.rn.f64 fd433, fd165, 0dBFEBFAA5C136B224, fd429; +fma.rn.f64 fd434, fd168, 0dBFDF0F2FF6705BEC, fd430; +fma.rn.f64 fd435, fd1248, 0dBFEBFAA5C136B224, fd431; +fma.rn.f64 fd436, fd167, 0dBFDF0F2FF6705BEC, fd432; +fma.rn.f64 fd437, fd169, 0dBFEE884F0CC22CCC, fd433; +fma.rn.f64 fd438, fd172, 0d3FD328C3F1B322CB, fd434; +fma.rn.f64 fd439, fd1246, 0dBFEE884F0CC22CCC, fd435; +fma.rn.f64 fd440, fd171, 0d3FD328C3F1B322CB, fd436; +fma.rn.f64 fd441, fd173, 0dBFDC2F6AF3928A8E, fd437; +fma.rn.f64 fd442, fd176, 0d3FECBAD095F50378, fd438; +fma.rn.f64 fd443, fd1244, 0dBFDC2F6AF3928A8E, fd439; +fma.rn.f64 fd444, fd175, 0d3FECBAD095F50378, fd440; +fma.rn.f64 fd445, fd177, 0d3FD63A3FCFACA412, fd441; +fma.rn.f64 fd446, fd180, 0d3FEE0210C26A6E6F, fd442; +fma.rn.f64 fd447, fd1241, 0d3FD63A3FCFACA412, fd443; +fma.rn.f64 fd448, fd179, 0d3FEE0210C26A6E6F, fd444; +fma.rn.f64 fd449, fd181, 0d3FED681A366A00FA, fd445; +fma.rn.f64 fd450, fd184, 0d3FD93D20572CA90B, fd446; +fma.rn.f64 fd451, fd1239, 0d3FED681A366A00FA, fd447; +fma.rn.f64 fd452, fd183, 0d3FD93D20572CA90B, fd448; +fma.rn.f64 fd453, fd125, 0d3FE0ED45EEA3B09F, %62; +fma.rn.f64 fd457, fd129, 0dBFDC2F6AF3928A8E, fd453; +fma.rn.f64 fd1225, fd128, 0dBFEB2818007C19DF, 
0d0000000000000000; +fma.rn.f64 fd458, fd132, 0dBFECBAD095F50378, fd1225; +fma.rn.f64 fd1224, fd1272, 0d3FE0ED45EEA3B09F, %63; +fma.rn.f64 fd459, fd1269, 0dBFDC2F6AF3928A8E, fd1224; +fma.rn.f64 fd1223, fd127, 0dBFEB2818007C19DF, 0d0000000000000000; +fma.rn.f64 fd460, fd131, 0dBFECBAD095F50378, fd1223; +fma.rn.f64 fd461, fd133, 0dBFEFD5F830F860F9, fd457; +fma.rn.f64 fd462, fd136, 0dBFB9E62ACA53C49F, fd458; +fma.rn.f64 fd463, fd1267, 0dBFEFD5F830F860F9, fd459; +fma.rn.f64 fd464, fd135, 0dBFB9E62ACA53C49F, fd460; +fma.rn.f64 fd465, fd137, 0dBFE3965F49174D13, fd461; +fma.rn.f64 fd466, fd140, 0d3FE94E08EB13C451, fd462; +fma.rn.f64 fd467, fd1265, 0dBFE3965F49174D13, fd463; +fma.rn.f64 fd468, fd139, 0d3FE94E08EB13C451, fd464; +fma.rn.f64 fd469, fd141, 0d3FD63A3FCFACA412, fd465; +fma.rn.f64 fd470, fd144, 0d3FEE0210C26A6E6F, fd466; +fma.rn.f64 fd471, fd1262, 0d3FD63A3FCFACA412, fd467; +fma.rn.f64 fd472, fd143, 0d3FEE0210C26A6E6F, fd468; +fma.rn.f64 fd473, fd145, 0d3FEF584F2CE43B84, fd469; +fma.rn.f64 fd474, fd148, 0d3FC9C4266041CA8F, fd470; +fma.rn.f64 fd475, fd1260, 0d3FEF584F2CE43B84, fd471; +fma.rn.f64 fd476, fd147, 0d3FC9C4266041CA8F, fd472; +fma.rn.f64 fd477, fd149, 0d3FE60C045A2E9729, fd473; +fma.rn.f64 fd478, fd152, 0dBFE73180A4B0D300, fd474; +fma.rn.f64 fd479, fd1258, 0d3FE60C045A2E9729, fd475; +fma.rn.f64 fd480, fd151, 0dBFE73180A4B0D300, fd476; +fma.rn.f64 fd481, fd153, 0dBFD00AB0EB2D7D94, fd477; +fma.rn.f64 fd482, fd156, 0dBFEEFA7CDDB128FA, fd478; +fma.rn.f64 fd483, fd1255, 0dBFD00AB0EB2D7D94, fd479; +fma.rn.f64 fd484, fd155, 0dBFEEFA7CDDB128FA, fd480; +fma.rn.f64 fd485, fd157, 0dBFEE884F0CC22CCC, fd481; +fma.rn.f64 fd486, fd160, 0dBFD328C3F1B322CB, fd482; +fma.rn.f64 fd487, fd1253, 0dBFEE884F0CC22CCC, fd483; +fma.rn.f64 fd488, fd159, 0dBFD328C3F1B322CB, fd484; +fma.rn.f64 fd489, fd161, 0dBFE847BF1D5146CC, fd485; +fma.rn.f64 fd490, fd164, 0d3FE4D80B1AD9CCF6, fd486; +fma.rn.f64 fd491, fd1251, 0dBFE847BF1D5146CC, fd487; +fma.rn.f64 fd492, fd163, 0d3FE4D80B1AD9CCF6, 
fd488; +fma.rn.f64 fd493, fd165, 0d3FC361FC440B478F, fd489; +fma.rn.f64 fd494, fd168, 0d3FEFA18852C3E08A, fd490; +fma.rn.f64 fd495, fd1248, 0d3FC361FC440B478F, fd491; +fma.rn.f64 fd496, fd167, 0d3FEFA18852C3E08A, fd492; +fma.rn.f64 fd497, fd169, 0d3FED681A366A00FA, fd493; +fma.rn.f64 fd498, fd172, 0d3FD93D20572CA90B, fd494; +fma.rn.f64 fd499, fd1246, 0d3FED681A366A00FA, fd495; +fma.rn.f64 fd500, fd171, 0d3FD93D20572CA90B, fd496; +fma.rn.f64 fd501, fd173, 0d3FEA43B1B1379AFF, fd497; +fma.rn.f64 fd502, fd176, 0dBFE247D447A27216, fd498; +fma.rn.f64 fd503, fd1244, 0d3FEA43B1B1379AFF, fd499; +fma.rn.f64 fd504, fd175, 0dBFE247D447A27216, fd500; +fma.rn.f64 fd505, fd177, 0dBFA9EEB01776B57D, fd501; +fma.rn.f64 fd506, fd180, 0dBFEFF57C5208CCF9, fd502; +fma.rn.f64 fd507, fd1241, 0dBFA9EEB01776B57D, fd503; +fma.rn.f64 fd508, fd179, 0dBFEFF57C5208CCF9, fd504; +fma.rn.f64 fd509, fd181, 0dBFEBFAA5C136B224, fd505; +fma.rn.f64 fd510, fd184, 0dBFDF0F2FF6705BEC, fd506; +fma.rn.f64 fd511, fd1239, 0dBFEBFAA5C136B224, fd507; +fma.rn.f64 fd512, fd183, 0dBFDF0F2FF6705BEC, fd508; +fma.rn.f64 fd513, fd125, 0d3FD63A3FCFACA412, %62; +fma.rn.f64 fd517, fd129, 0dBFE847BF1D5146CC, fd513; +fma.rn.f64 fd1222, fd128, 0dBFEE0210C26A6E6F, 0d0000000000000000; +fma.rn.f64 fd518, fd132, 0dBFE4D80B1AD9CCF6, fd1222; +fma.rn.f64 fd1221, fd1272, 0d3FD63A3FCFACA412, %63; +fma.rn.f64 fd519, fd1269, 0dBFE847BF1D5146CC, fd1221; +fma.rn.f64 fd1220, fd127, 0dBFEE0210C26A6E6F, 0d0000000000000000; +fma.rn.f64 fd520, fd131, 0dBFE4D80B1AD9CCF6, fd1220; +fma.rn.f64 fd521, fd133, 0dBFEBFAA5C136B224, fd517; +fma.rn.f64 fd522, fd136, 0d3FDF0F2FF6705BEC, fd518; +fma.rn.f64 fd523, fd1267, 0dBFEBFAA5C136B224, fd519; +fma.rn.f64 fd524, fd135, 0d3FDF0F2FF6705BEC, fd520; +fma.rn.f64 fd525, fd137, 0d3FC361FC440B478F, fd521; +fma.rn.f64 fd526, fd140, 0d3FEFA18852C3E08A, fd522; +fma.rn.f64 fd527, fd1265, 0d3FC361FC440B478F, fd523; +fma.rn.f64 fd528, fd139, 0d3FEFA18852C3E08A, fd524; +fma.rn.f64 fd529, fd141, 0d3FEF584F2CE43B84, 
fd525; +fma.rn.f64 fd530, fd144, 0d3FC9C4266041CA8F, fd526; +fma.rn.f64 fd531, fd1262, 0d3FEF584F2CE43B84, fd527; +fma.rn.f64 fd532, fd143, 0d3FC9C4266041CA8F, fd528; +fma.rn.f64 fd533, fd145, 0d3FE0ED45EEA3B09F, fd529; +fma.rn.f64 fd534, fd148, 0dBFEB2818007C19DF, fd530; +fma.rn.f64 fd535, fd1260, 0d3FE0ED45EEA3B09F, fd531; +fma.rn.f64 fd536, fd147, 0dBFEB2818007C19DF, fd532; +fma.rn.f64 fd537, fd149, 0dBFE3965F49174D13, fd533; +fma.rn.f64 fd538, fd152, 0dBFE94E08EB13C451, fd534; +fma.rn.f64 fd539, fd1258, 0dBFE3965F49174D13, fd535; +fma.rn.f64 fd540, fd151, 0dBFE94E08EB13C451, fd536; +fma.rn.f64 fd541, fd153, 0dBFEE884F0CC22CCC, fd537; +fma.rn.f64 fd542, fd156, 0d3FD328C3F1B322CB, fd538; +fma.rn.f64 fd543, fd1255, 0dBFEE884F0CC22CCC, fd539; +fma.rn.f64 fd544, fd155, 0d3FD328C3F1B322CB, fd540; +fma.rn.f64 fd545, fd157, 0dBFA9EEB01776B57D, fd541; +fma.rn.f64 fd546, fd160, 0d3FEFF57C5208CCF9, fd542; +fma.rn.f64 fd547, fd1253, 0dBFA9EEB01776B57D, fd543; +fma.rn.f64 fd548, fd159, 0d3FEFF57C5208CCF9, fd544; +fma.rn.f64 fd549, fd161, 0d3FED681A366A00FA, fd545; +fma.rn.f64 fd550, fd164, 0d3FD93D20572CA90B, fd546; +fma.rn.f64 fd551, fd1251, 0d3FED681A366A00FA, fd547; +fma.rn.f64 fd552, fd163, 0d3FD93D20572CA90B, fd548; +fma.rn.f64 fd553, fd165, 0d3FE60C045A2E9729, fd549; +fma.rn.f64 fd554, fd168, 0dBFE73180A4B0D300, fd550; +fma.rn.f64 fd555, fd1248, 0d3FE60C045A2E9729, fd551; +fma.rn.f64 fd556, fd167, 0dBFE73180A4B0D300, fd552; +fma.rn.f64 fd557, fd169, 0dBFDC2F6AF3928A8E, fd553; +fma.rn.f64 fd558, fd172, 0dBFECBAD095F50378, fd554; +fma.rn.f64 fd559, fd1246, 0dBFDC2F6AF3928A8E, fd555; +fma.rn.f64 fd560, fd171, 0dBFECBAD095F50378, fd556; +fma.rn.f64 fd561, fd173, 0dBFEFD5F830F860F9, fd557; +fma.rn.f64 fd562, fd176, 0d3FB9E62ACA53C49F, fd558; +fma.rn.f64 fd563, fd1244, 0dBFEFD5F830F860F9, fd559; +fma.rn.f64 fd564, fd175, 0d3FB9E62ACA53C49F, fd560; +fma.rn.f64 fd565, fd177, 0dBFD00AB0EB2D7D94, fd561; +fma.rn.f64 fd566, fd180, 0d3FEEFA7CDDB128FA, fd562; +fma.rn.f64 fd567, 
fd1241, 0dBFD00AB0EB2D7D94, fd563; +fma.rn.f64 fd568, fd179, 0d3FEEFA7CDDB128FA, fd564; +fma.rn.f64 fd569, fd181, 0d3FEA43B1B1379AFF, fd565; +fma.rn.f64 fd570, fd184, 0d3FE247D447A27216, fd566; +fma.rn.f64 fd571, fd1239, 0d3FEA43B1B1379AFF, fd567; +fma.rn.f64 fd572, fd183, 0d3FE247D447A27216, fd568; +fma.rn.f64 fd573, fd125, 0d3FC361FC440B478F, %62; +fma.rn.f64 fd577, fd129, 0dBFEE884F0CC22CCC, fd573; +fma.rn.f64 fd1219, fd128, 0dBFEFA18852C3E08A, 0d0000000000000000; +fma.rn.f64 fd578, fd132, 0dBFD328C3F1B322CB, fd1219; +fma.rn.f64 fd1218, fd1272, 0d3FC361FC440B478F, %63; +fma.rn.f64 fd579, fd1269, 0dBFEE884F0CC22CCC, fd1218; +fma.rn.f64 fd1217, fd127, 0dBFEFA18852C3E08A, 0d0000000000000000; +fma.rn.f64 fd580, fd131, 0dBFD328C3F1B322CB, fd1217; +fma.rn.f64 fd581, fd133, 0dBFDC2F6AF3928A8E, fd577; +fma.rn.f64 fd582, fd136, 0d3FECBAD095F50378, fd578; +fma.rn.f64 fd583, fd1267, 0dBFDC2F6AF3928A8E, fd579; +fma.rn.f64 fd584, fd135, 0d3FECBAD095F50378, fd580; +fma.rn.f64 fd585, fd137, 0d3FEA43B1B1379AFF, fd581; +fma.rn.f64 fd586, fd140, 0d3FE247D447A27216, fd582; +fma.rn.f64 fd587, fd1265, 0d3FEA43B1B1379AFF, fd583; +fma.rn.f64 fd588, fd139, 0d3FE247D447A27216, fd584; +fma.rn.f64 fd589, fd141, 0d3FE60C045A2E9729, fd585; +fma.rn.f64 fd590, fd144, 0dBFE73180A4B0D300, fd586; +fma.rn.f64 fd591, fd1262, 0d3FE60C045A2E9729, fd587; +fma.rn.f64 fd592, fd143, 0dBFE73180A4B0D300, fd588; +fma.rn.f64 fd593, fd145, 0dBFE3965F49174D13, fd589; +fma.rn.f64 fd594, fd148, 0dBFE94E08EB13C451, fd590; +fma.rn.f64 fd595, fd1260, 0dBFE3965F49174D13, fd591; +fma.rn.f64 fd596, fd147, 0dBFE94E08EB13C451, fd592; +fma.rn.f64 fd597, fd149, 0dBFEBFAA5C136B224, fd593; +fma.rn.f64 fd598, fd152, 0d3FDF0F2FF6705BEC, fd594; +fma.rn.f64 fd599, fd1258, 0dBFEBFAA5C136B224, fd595; +fma.rn.f64 fd600, fd151, 0d3FDF0F2FF6705BEC, fd596; +fma.rn.f64 fd601, fd153, 0d3FD63A3FCFACA412, fd597; +fma.rn.f64 fd602, fd156, 0d3FEE0210C26A6E6F, fd598; +fma.rn.f64 fd603, fd1255, 0d3FD63A3FCFACA412, fd599; +fma.rn.f64 fd604, 
fd155, 0d3FEE0210C26A6E6F, fd600; +fma.rn.f64 fd605, fd157, 0d3FEF584F2CE43B84, fd601; +fma.rn.f64 fd606, fd160, 0dBFC9C4266041CA8F, fd602; +fma.rn.f64 fd607, fd1253, 0d3FEF584F2CE43B84, fd603; +fma.rn.f64 fd608, fd159, 0dBFC9C4266041CA8F, fd604; +fma.rn.f64 fd609, fd161, 0dBFA9EEB01776B57D, fd605; +fma.rn.f64 fd610, fd164, 0dBFEFF57C5208CCF9, fd606; +fma.rn.f64 fd611, fd1251, 0dBFA9EEB01776B57D, fd607; +fma.rn.f64 fd612, fd163, 0dBFEFF57C5208CCF9, fd608; +fma.rn.f64 fd613, fd165, 0dBFEFD5F830F860F9, fd609; +fma.rn.f64 fd614, fd168, 0dBFB9E62ACA53C49F, fd610; +fma.rn.f64 fd615, fd1248, 0dBFEFD5F830F860F9, fd611; +fma.rn.f64 fd616, fd167, 0dBFB9E62ACA53C49F, fd612; +fma.rn.f64 fd617, fd169, 0dBFD00AB0EB2D7D94, fd613; +fma.rn.f64 fd618, fd172, 0d3FEEFA7CDDB128FA, fd614; +fma.rn.f64 fd619, fd1246, 0dBFD00AB0EB2D7D94, fd615; +fma.rn.f64 fd620, fd171, 0d3FEEFA7CDDB128FA, fd616; +fma.rn.f64 fd621, fd173, 0d3FED681A366A00FA, fd617; +fma.rn.f64 fd622, fd176, 0d3FD93D20572CA90B, fd618; +fma.rn.f64 fd623, fd1244, 0d3FED681A366A00FA, fd619; +fma.rn.f64 fd624, fd175, 0d3FD93D20572CA90B, fd620; +fma.rn.f64 fd625, fd177, 0d3FE0ED45EEA3B09F, fd621; +fma.rn.f64 fd626, fd180, 0dBFEB2818007C19DF, fd622; +fma.rn.f64 fd627, fd1241, 0d3FE0ED45EEA3B09F, fd623; +fma.rn.f64 fd628, fd179, 0dBFEB2818007C19DF, fd624; +fma.rn.f64 fd629, fd181, 0dBFE847BF1D5146CC, fd625; +fma.rn.f64 fd630, fd184, 0dBFE4D80B1AD9CCF6, fd626; +fma.rn.f64 fd631, fd1239, 0dBFE847BF1D5146CC, fd627; +fma.rn.f64 fd632, fd183, 0dBFE4D80B1AD9CCF6, fd628; +fma.rn.f64 fd633, fd125, 0dBFA9EEB01776B57D, %62; +fma.rn.f64 fd637, fd129, 0dBFEFD5F830F860F9, fd633; +fma.rn.f64 fd1216, fd128, 0dBFEFF57C5208CCF9, 0d0000000000000000; +fma.rn.f64 fd638, fd132, 0d3FB9E62ACA53C49F, fd1216; +fma.rn.f64 fd1215, fd1272, 0dBFA9EEB01776B57D, %63; +fma.rn.f64 fd639, fd1269, 0dBFEFD5F830F860F9, fd1215; +fma.rn.f64 fd1214, fd127, 0dBFEFF57C5208CCF9, 0d0000000000000000; +fma.rn.f64 fd640, fd131, 0d3FB9E62ACA53C49F, fd1214; +fma.rn.f64 fd641, 
fd133, 0d3FC361FC440B478F, fd637; +fma.rn.f64 fd642, fd136, 0d3FEFA18852C3E08A, fd638; +fma.rn.f64 fd643, fd1267, 0d3FC361FC440B478F, fd639; +fma.rn.f64 fd644, fd135, 0d3FEFA18852C3E08A, fd640; +fma.rn.f64 fd645, fd137, 0d3FEF584F2CE43B84, fd641; +fma.rn.f64 fd646, fd140, 0dBFC9C4266041CA8F, fd642; +fma.rn.f64 fd647, fd1265, 0d3FEF584F2CE43B84, fd643; +fma.rn.f64 fd648, fd139, 0dBFC9C4266041CA8F, fd644; +fma.rn.f64 fd649, fd141, 0dBFD00AB0EB2D7D94, fd645; +fma.rn.f64 fd650, fd144, 0dBFEEFA7CDDB128FA, fd646; +fma.rn.f64 fd651, fd1262, 0dBFD00AB0EB2D7D94, fd647; +fma.rn.f64 fd652, fd143, 0dBFEEFA7CDDB128FA, fd648; +fma.rn.f64 fd653, fd145, 0dBFEE884F0CC22CCC, fd649; +fma.rn.f64 fd654, fd148, 0d3FD328C3F1B322CB, fd650; +fma.rn.f64 fd655, fd1260, 0dBFEE884F0CC22CCC, fd651; +fma.rn.f64 fd656, fd147, 0d3FD328C3F1B322CB, fd652; +fma.rn.f64 fd657, fd149, 0d3FD63A3FCFACA412, fd653; +fma.rn.f64 fd658, fd152, 0d3FEE0210C26A6E6F, fd654; +fma.rn.f64 fd659, fd1258, 0d3FD63A3FCFACA412, fd655; +fma.rn.f64 fd660, fd151, 0d3FEE0210C26A6E6F, fd656; +fma.rn.f64 fd661, fd153, 0d3FED681A366A00FA, fd657; +fma.rn.f64 fd662, fd156, 0dBFD93D20572CA90B, fd658; +fma.rn.f64 fd663, fd1255, 0d3FED681A366A00FA, fd659; +fma.rn.f64 fd664, fd155, 0dBFD93D20572CA90B, fd660; +fma.rn.f64 fd665, fd157, 0dBFDC2F6AF3928A8E, fd661; +fma.rn.f64 fd666, fd160, 0dBFECBAD095F50378, fd662; +fma.rn.f64 fd667, fd1253, 0dBFDC2F6AF3928A8E, fd663; +fma.rn.f64 fd668, fd159, 0dBFECBAD095F50378, fd664; +fma.rn.f64 fd669, fd161, 0dBFEBFAA5C136B224, fd665; +fma.rn.f64 fd670, fd164, 0d3FDF0F2FF6705BEC, fd666; +fma.rn.f64 fd671, fd1251, 0dBFEBFAA5C136B224, fd667; +fma.rn.f64 fd672, fd163, 0d3FDF0F2FF6705BEC, fd668; +fma.rn.f64 fd673, fd165, 0d3FE0ED45EEA3B09F, fd669; +fma.rn.f64 fd674, fd168, 0d3FEB2818007C19DF, fd670; +fma.rn.f64 fd675, fd1248, 0d3FE0ED45EEA3B09F, fd671; +fma.rn.f64 fd676, fd167, 0d3FEB2818007C19DF, fd672; +fma.rn.f64 fd677, fd169, 0d3FEA43B1B1379AFF, fd673; +fma.rn.f64 fd678, fd172, 0dBFE247D447A27216, 
fd674; +fma.rn.f64 fd679, fd1246, 0d3FEA43B1B1379AFF, fd675; +fma.rn.f64 fd680, fd171, 0dBFE247D447A27216, fd676; +fma.rn.f64 fd681, fd173, 0dBFE3965F49174D13, fd677; +fma.rn.f64 fd682, fd176, 0dBFE94E08EB13C451, fd678; +fma.rn.f64 fd683, fd1244, 0dBFE3965F49174D13, fd679; +fma.rn.f64 fd684, fd175, 0dBFE94E08EB13C451, fd680; +fma.rn.f64 fd685, fd177, 0dBFE847BF1D5146CC, fd681; +fma.rn.f64 fd686, fd180, 0d3FE4D80B1AD9CCF6, fd682; +fma.rn.f64 fd687, fd1241, 0dBFE847BF1D5146CC, fd683; +fma.rn.f64 fd688, fd179, 0d3FE4D80B1AD9CCF6, fd684; +fma.rn.f64 fd689, fd181, 0d3FE60C045A2E9729, fd685; +fma.rn.f64 fd690, fd184, 0d3FE73180A4B0D300, fd686; +fma.rn.f64 fd691, fd1239, 0d3FE60C045A2E9729, fd687; +fma.rn.f64 fd692, fd183, 0d3FE73180A4B0D300, fd688; +fma.rn.f64 fd693, fd125, 0dBFD00AB0EB2D7D94, %62; +fma.rn.f64 fd697, fd129, 0dBFEBFAA5C136B224, fd693; +fma.rn.f64 fd1213, fd128, 0dBFEEFA7CDDB128FA, 0d0000000000000000; +fma.rn.f64 fd698, fd132, 0d3FDF0F2FF6705BEC, fd1213; +fma.rn.f64 fd1212, fd1272, 0dBFD00AB0EB2D7D94, %63; +fma.rn.f64 fd699, fd1269, 0dBFEBFAA5C136B224, fd1212; +fma.rn.f64 fd1211, fd127, 0dBFEEFA7CDDB128FA, 0d0000000000000000; +fma.rn.f64 fd700, fd131, 0d3FDF0F2FF6705BEC, fd1211; +fma.rn.f64 fd701, fd133, 0d3FE60C045A2E9729, fd697; +fma.rn.f64 fd702, fd136, 0d3FE73180A4B0D300, fd698; +fma.rn.f64 fd703, fd1267, 0d3FE60C045A2E9729, fd699; +fma.rn.f64 fd704, fd135, 0d3FE73180A4B0D300, fd700; +fma.rn.f64 fd705, fd137, 0d3FE0ED45EEA3B09F, fd701; +fma.rn.f64 fd706, fd140, 0dBFEB2818007C19DF, fd702; +fma.rn.f64 fd707, fd1265, 0d3FE0ED45EEA3B09F, fd703; +fma.rn.f64 fd708, fd139, 0dBFEB2818007C19DF, fd704; +fma.rn.f64 fd709, fd141, 0dBFEE884F0CC22CCC, fd705; +fma.rn.f64 fd710, fd144, 0dBFD328C3F1B322CB, fd706; +fma.rn.f64 fd711, fd1262, 0dBFEE884F0CC22CCC, fd707; +fma.rn.f64 fd712, fd143, 0dBFD328C3F1B322CB, fd708; +fma.rn.f64 fd713, fd145, 0dBFA9EEB01776B57D, fd709; +fma.rn.f64 fd714, fd148, 0d3FEFF57C5208CCF9, fd710; +fma.rn.f64 fd715, fd1260, 0dBFA9EEB01776B57D, 
fd711; +fma.rn.f64 fd716, fd147, 0d3FEFF57C5208CCF9, fd712; +fma.rn.f64 fd717, fd149, 0d3FEF584F2CE43B84, fd713; +fma.rn.f64 fd718, fd152, 0dBFC9C4266041CA8F, fd714; +fma.rn.f64 fd719, fd1258, 0d3FEF584F2CE43B84, fd715; +fma.rn.f64 fd720, fd151, 0dBFC9C4266041CA8F, fd716; +fma.rn.f64 fd721, fd153, 0dBFDC2F6AF3928A8E, fd717; +fma.rn.f64 fd722, fd156, 0dBFECBAD095F50378, fd718; +fma.rn.f64 fd723, fd1255, 0dBFDC2F6AF3928A8E, fd719; +fma.rn.f64 fd724, fd155, 0dBFECBAD095F50378, fd720; +fma.rn.f64 fd725, fd157, 0dBFE847BF1D5146CC, fd721; +fma.rn.f64 fd726, fd160, 0d3FE4D80B1AD9CCF6, fd722; +fma.rn.f64 fd727, fd1253, 0dBFE847BF1D5146CC, fd723; +fma.rn.f64 fd728, fd159, 0d3FE4D80B1AD9CCF6, fd724; +fma.rn.f64 fd729, fd161, 0d3FEA43B1B1379AFF, fd725; +fma.rn.f64 fd730, fd164, 0d3FE247D447A27216, fd726; +fma.rn.f64 fd731, fd1251, 0d3FEA43B1B1379AFF, fd727; +fma.rn.f64 fd732, fd163, 0d3FE247D447A27216, fd728; +fma.rn.f64 fd733, fd165, 0d3FD63A3FCFACA412, fd729; +fma.rn.f64 fd734, fd168, 0dBFEE0210C26A6E6F, fd730; +fma.rn.f64 fd735, fd1248, 0d3FD63A3FCFACA412, fd731; +fma.rn.f64 fd736, fd167, 0dBFEE0210C26A6E6F, fd732; +fma.rn.f64 fd737, fd169, 0dBFEFD5F830F860F9, fd733; +fma.rn.f64 fd738, fd172, 0dBFB9E62ACA53C49F, fd734; +fma.rn.f64 fd739, fd1246, 0dBFEFD5F830F860F9, fd735; +fma.rn.f64 fd740, fd171, 0dBFB9E62ACA53C49F, fd736; +fma.rn.f64 fd741, fd173, 0d3FC361FC440B478F, fd737; +fma.rn.f64 fd742, fd176, 0d3FEFA18852C3E08A, fd738; +fma.rn.f64 fd743, fd1244, 0d3FC361FC440B478F, fd739; +fma.rn.f64 fd744, fd175, 0d3FEFA18852C3E08A, fd740; +fma.rn.f64 fd745, fd177, 0d3FED681A366A00FA, fd741; +fma.rn.f64 fd746, fd180, 0dBFD93D20572CA90B, fd742; +fma.rn.f64 fd747, fd1241, 0d3FED681A366A00FA, fd743; +fma.rn.f64 fd748, fd179, 0dBFD93D20572CA90B, fd744; +fma.rn.f64 fd749, fd181, 0dBFE3965F49174D13, fd745; +fma.rn.f64 fd750, fd184, 0dBFE94E08EB13C451, fd746; +fma.rn.f64 fd751, fd1239, 0dBFE3965F49174D13, fd747; +fma.rn.f64 fd752, fd183, 0dBFE94E08EB13C451, fd748; +fma.rn.f64 fd753, 
fd125, 0dBFDC2F6AF3928A8E, %62; +fma.rn.f64 fd757, fd129, 0dBFE3965F49174D13, fd753; +fma.rn.f64 fd1210, fd128, 0dBFECBAD095F50378, 0d0000000000000000; +fma.rn.f64 fd758, fd132, 0d3FE94E08EB13C451, fd1210; +fma.rn.f64 fd1209, fd1272, 0dBFDC2F6AF3928A8E, %63; +fma.rn.f64 fd759, fd1269, 0dBFE3965F49174D13, fd1209; +fma.rn.f64 fd1208, fd127, 0dBFECBAD095F50378, 0d0000000000000000; +fma.rn.f64 fd760, fd131, 0d3FE94E08EB13C451, fd1208; +fma.rn.f64 fd761, fd133, 0d3FEF584F2CE43B84, fd757; +fma.rn.f64 fd762, fd136, 0d3FC9C4266041CA8F, fd758; +fma.rn.f64 fd763, fd1267, 0d3FEF584F2CE43B84, fd759; +fma.rn.f64 fd764, fd135, 0d3FC9C4266041CA8F, fd760; +fma.rn.f64 fd765, fd137, 0dBFD00AB0EB2D7D94, fd761; +fma.rn.f64 fd766, fd140, 0dBFEEFA7CDDB128FA, fd762; +fma.rn.f64 fd767, fd1265, 0dBFD00AB0EB2D7D94, fd763; +fma.rn.f64 fd768, fd139, 0dBFEEFA7CDDB128FA, fd764; +fma.rn.f64 fd769, fd141, 0dBFE847BF1D5146CC, fd765; +fma.rn.f64 fd770, fd144, 0d3FE4D80B1AD9CCF6, fd766; +fma.rn.f64 fd771, fd1262, 0dBFE847BF1D5146CC, fd767; +fma.rn.f64 fd772, fd143, 0d3FE4D80B1AD9CCF6, fd768; +fma.rn.f64 fd773, fd145, 0d3FED681A366A00FA, fd769; +fma.rn.f64 fd774, fd148, 0d3FD93D20572CA90B, fd770; +fma.rn.f64 fd775, fd1260, 0d3FED681A366A00FA, fd771; +fma.rn.f64 fd776, fd147, 0d3FD93D20572CA90B, fd772; +fma.rn.f64 fd777, fd149, 0dBFA9EEB01776B57D, fd773; +fma.rn.f64 fd778, fd152, 0dBFEFF57C5208CCF9, fd774; +fma.rn.f64 fd779, fd1258, 0dBFA9EEB01776B57D, fd775; +fma.rn.f64 fd780, fd151, 0dBFEFF57C5208CCF9, fd776; +fma.rn.f64 fd781, fd153, 0dBFEBFAA5C136B224, fd777; +fma.rn.f64 fd782, fd156, 0d3FDF0F2FF6705BEC, fd778; +fma.rn.f64 fd783, fd1255, 0dBFEBFAA5C136B224, fd779; +fma.rn.f64 fd784, fd155, 0d3FDF0F2FF6705BEC, fd780; +fma.rn.f64 fd785, fd157, 0d3FEA43B1B1379AFF, fd781; +fma.rn.f64 fd786, fd160, 0d3FE247D447A27216, fd782; +fma.rn.f64 fd787, fd1253, 0d3FEA43B1B1379AFF, fd783; +fma.rn.f64 fd788, fd159, 0d3FE247D447A27216, fd784; +fma.rn.f64 fd789, fd161, 0d3FC361FC440B478F, fd785; +fma.rn.f64 fd790, 
fd164, 0dBFEFA18852C3E08A, fd786; +fma.rn.f64 fd791, fd1251, 0d3FC361FC440B478F, fd787; +fma.rn.f64 fd792, fd163, 0dBFEFA18852C3E08A, fd788; +fma.rn.f64 fd793, fd165, 0dBFEE884F0CC22CCC, fd789; +fma.rn.f64 fd794, fd168, 0d3FD328C3F1B322CB, fd790; +fma.rn.f64 fd795, fd1248, 0dBFEE884F0CC22CCC, fd791; +fma.rn.f64 fd796, fd167, 0d3FD328C3F1B322CB, fd792; +fma.rn.f64 fd797, fd169, 0d3FE60C045A2E9729, fd793; +fma.rn.f64 fd798, fd172, 0d3FE73180A4B0D300, fd794; +fma.rn.f64 fd799, fd1246, 0d3FE60C045A2E9729, fd795; +fma.rn.f64 fd800, fd171, 0d3FE73180A4B0D300, fd796; +fma.rn.f64 fd801, fd173, 0d3FD63A3FCFACA412, fd797; +fma.rn.f64 fd802, fd176, 0dBFEE0210C26A6E6F, fd798; +fma.rn.f64 fd803, fd1244, 0d3FD63A3FCFACA412, fd799; +fma.rn.f64 fd804, fd175, 0dBFEE0210C26A6E6F, fd800; +fma.rn.f64 fd805, fd177, 0dBFEFD5F830F860F9, fd801; +fma.rn.f64 fd806, fd180, 0d3FB9E62ACA53C49F, fd802; +fma.rn.f64 fd807, fd1241, 0dBFEFD5F830F860F9, fd803; +fma.rn.f64 fd808, fd179, 0d3FB9E62ACA53C49F, fd804; +fma.rn.f64 fd809, fd181, 0d3FE0ED45EEA3B09F, fd805; +fma.rn.f64 fd810, fd184, 0d3FEB2818007C19DF, fd806; +fma.rn.f64 fd811, fd1239, 0d3FE0ED45EEA3B09F, fd807; +fma.rn.f64 fd812, fd183, 0d3FEB2818007C19DF, fd808; +fma.rn.f64 fd813, fd125, 0dBFE3965F49174D13, %62; +fma.rn.f64 fd817, fd129, 0dBFD00AB0EB2D7D94, fd813; +fma.rn.f64 fd1207, fd128, 0dBFE94E08EB13C451, 0d0000000000000000; +fma.rn.f64 fd818, fd132, 0d3FEEFA7CDDB128FA, fd1207; +fma.rn.f64 fd1206, fd1272, 0dBFE3965F49174D13, %63; +fma.rn.f64 fd819, fd1269, 0dBFD00AB0EB2D7D94, fd1206; +fma.rn.f64 fd1205, fd127, 0dBFE94E08EB13C451, 0d0000000000000000; +fma.rn.f64 fd820, fd131, 0d3FEEFA7CDDB128FA, fd1205; +fma.rn.f64 fd821, fd133, 0d3FED681A366A00FA, fd817; +fma.rn.f64 fd822, fd136, 0dBFD93D20572CA90B, fd818; +fma.rn.f64 fd823, fd1267, 0d3FED681A366A00FA, fd819; +fma.rn.f64 fd824, fd135, 0dBFD93D20572CA90B, fd820; +fma.rn.f64 fd825, fd137, 0dBFEBFAA5C136B224, fd821; +fma.rn.f64 fd826, fd140, 0dBFDF0F2FF6705BEC, fd822; +fma.rn.f64 fd827, 
fd1265, 0dBFEBFAA5C136B224, fd823; +fma.rn.f64 fd828, fd139, 0dBFDF0F2FF6705BEC, fd824; +fma.rn.f64 fd829, fd141, 0d3FC361FC440B478F, fd825; +fma.rn.f64 fd830, fd144, 0d3FEFA18852C3E08A, fd826; +fma.rn.f64 fd831, fd1262, 0d3FC361FC440B478F, fd827; +fma.rn.f64 fd832, fd143, 0d3FEFA18852C3E08A, fd828; +fma.rn.f64 fd833, fd145, 0d3FE60C045A2E9729, fd829; +fma.rn.f64 fd834, fd148, 0dBFE73180A4B0D300, fd830; +fma.rn.f64 fd835, fd1260, 0d3FE60C045A2E9729, fd831; +fma.rn.f64 fd836, fd147, 0dBFE73180A4B0D300, fd832; +fma.rn.f64 fd837, fd149, 0dBFEFD5F830F860F9, fd833; +fma.rn.f64 fd838, fd152, 0dBFB9E62ACA53C49F, fd834; +fma.rn.f64 fd839, fd1258, 0dBFEFD5F830F860F9, fd835; +fma.rn.f64 fd840, fd151, 0dBFB9E62ACA53C49F, fd836; +fma.rn.f64 fd841, fd153, 0d3FE0ED45EEA3B09F, fd837; +fma.rn.f64 fd842, fd156, 0d3FEB2818007C19DF, fd838; +fma.rn.f64 fd843, fd1255, 0d3FE0ED45EEA3B09F, fd839; +fma.rn.f64 fd844, fd155, 0d3FEB2818007C19DF, fd840; +fma.rn.f64 fd845, fd157, 0d3FD63A3FCFACA412, fd841; +fma.rn.f64 fd846, fd160, 0dBFEE0210C26A6E6F, fd842; +fma.rn.f64 fd847, fd1253, 0d3FD63A3FCFACA412, fd843; +fma.rn.f64 fd848, fd159, 0dBFEE0210C26A6E6F, fd844; +fma.rn.f64 fd849, fd161, 0dBFEE884F0CC22CCC, fd845; +fma.rn.f64 fd850, fd164, 0d3FD328C3F1B322CB, fd846; +fma.rn.f64 fd851, fd1251, 0dBFEE884F0CC22CCC, fd847; +fma.rn.f64 fd852, fd163, 0d3FD328C3F1B322CB, fd848; +fma.rn.f64 fd853, fd165, 0d3FEA43B1B1379AFF, fd849; +fma.rn.f64 fd854, fd168, 0d3FE247D447A27216, fd850; +fma.rn.f64 fd855, fd1248, 0d3FEA43B1B1379AFF, fd851; +fma.rn.f64 fd856, fd167, 0d3FE247D447A27216, fd852; +fma.rn.f64 fd857, fd169, 0dBFA9EEB01776B57D, fd853; +fma.rn.f64 fd858, fd172, 0dBFEFF57C5208CCF9, fd854; +fma.rn.f64 fd859, fd1246, 0dBFA9EEB01776B57D, fd855; +fma.rn.f64 fd860, fd171, 0dBFEFF57C5208CCF9, fd856; +fma.rn.f64 fd861, fd173, 0dBFE847BF1D5146CC, fd857; +fma.rn.f64 fd862, fd176, 0d3FE4D80B1AD9CCF6, fd858; +fma.rn.f64 fd863, fd1244, 0dBFE847BF1D5146CC, fd859; +fma.rn.f64 fd864, fd175, 0d3FE4D80B1AD9CCF6, 
fd860; +fma.rn.f64 fd865, fd177, 0d3FEF584F2CE43B84, fd861; +fma.rn.f64 fd866, fd180, 0d3FC9C4266041CA8F, fd862; +fma.rn.f64 fd867, fd1241, 0d3FEF584F2CE43B84, fd863; +fma.rn.f64 fd868, fd179, 0d3FC9C4266041CA8F, fd864; +fma.rn.f64 fd869, fd181, 0dBFDC2F6AF3928A8E, fd865; +fma.rn.f64 fd870, fd184, 0dBFECBAD095F50378, fd866; +fma.rn.f64 fd871, fd1239, 0dBFDC2F6AF3928A8E, fd867; +fma.rn.f64 fd872, fd183, 0dBFECBAD095F50378, fd868; +fma.rn.f64 fd873, fd125, 0dBFE847BF1D5146CC, %62; +fma.rn.f64 fd877, fd129, 0d3FC361FC440B478F, fd873; +fma.rn.f64 fd1204, fd128, 0dBFE4D80B1AD9CCF6, 0d0000000000000000; +fma.rn.f64 fd878, fd132, 0d3FEFA18852C3E08A, fd1204; +fma.rn.f64 fd1203, fd1272, 0dBFE847BF1D5146CC, %63; +fma.rn.f64 fd879, fd1269, 0d3FC361FC440B478F, fd1203; +fma.rn.f64 fd1202, fd127, 0dBFE4D80B1AD9CCF6, 0d0000000000000000; +fma.rn.f64 fd880, fd131, 0d3FEFA18852C3E08A, fd1202; +fma.rn.f64 fd881, fd133, 0d3FE0ED45EEA3B09F, fd877; +fma.rn.f64 fd882, fd136, 0dBFEB2818007C19DF, fd878; +fma.rn.f64 fd883, fd1267, 0d3FE0ED45EEA3B09F, fd879; +fma.rn.f64 fd884, fd135, 0dBFEB2818007C19DF, fd880; +fma.rn.f64 fd885, fd137, 0dBFEE884F0CC22CCC, fd881; +fma.rn.f64 fd886, fd140, 0d3FD328C3F1B322CB, fd882; +fma.rn.f64 fd887, fd1265, 0dBFEE884F0CC22CCC, fd883; +fma.rn.f64 fd888, fd139, 0d3FD328C3F1B322CB, fd884; +fma.rn.f64 fd889, fd141, 0d3FED681A366A00FA, fd885; +fma.rn.f64 fd890, fd144, 0d3FD93D20572CA90B, fd886; +fma.rn.f64 fd891, fd1262, 0d3FED681A366A00FA, fd887; +fma.rn.f64 fd892, fd143, 0d3FD93D20572CA90B, fd888; +fma.rn.f64 fd893, fd145, 0dBFDC2F6AF3928A8E, fd889; +fma.rn.f64 fd894, fd148, 0dBFECBAD095F50378, fd890; +fma.rn.f64 fd895, fd1260, 0dBFDC2F6AF3928A8E, fd891; +fma.rn.f64 fd896, fd147, 0dBFECBAD095F50378, fd892; +fma.rn.f64 fd897, fd149, 0dBFD00AB0EB2D7D94, fd893; +fma.rn.f64 fd898, fd152, 0d3FEEFA7CDDB128FA, fd894; +fma.rn.f64 fd899, fd1258, 0dBFD00AB0EB2D7D94, fd895; +fma.rn.f64 fd900, fd151, 0d3FEEFA7CDDB128FA, fd896; +fma.rn.f64 fd901, fd153, 0d3FEA43B1B1379AFF, 
fd897; +fma.rn.f64 fd902, fd156, 0dBFE247D447A27216, fd898; +fma.rn.f64 fd903, fd1255, 0d3FEA43B1B1379AFF, fd899; +fma.rn.f64 fd904, fd155, 0dBFE247D447A27216, fd900; +fma.rn.f64 fd905, fd157, 0dBFEFD5F830F860F9, fd901; +fma.rn.f64 fd906, fd160, 0dBFB9E62ACA53C49F, fd902; +fma.rn.f64 fd907, fd1253, 0dBFEFD5F830F860F9, fd903; +fma.rn.f64 fd908, fd159, 0dBFB9E62ACA53C49F, fd904; +fma.rn.f64 fd909, fd161, 0d3FE60C045A2E9729, fd905; +fma.rn.f64 fd910, fd164, 0d3FE73180A4B0D300, fd906; +fma.rn.f64 fd911, fd1251, 0d3FE60C045A2E9729, fd907; +fma.rn.f64 fd912, fd163, 0d3FE73180A4B0D300, fd908; +fma.rn.f64 fd913, fd165, 0dBFA9EEB01776B57D, fd909; +fma.rn.f64 fd914, fd168, 0dBFEFF57C5208CCF9, fd910; +fma.rn.f64 fd915, fd1248, 0dBFA9EEB01776B57D, fd911; +fma.rn.f64 fd916, fd167, 0dBFEFF57C5208CCF9, fd912; +fma.rn.f64 fd917, fd169, 0dBFE3965F49174D13, fd913; +fma.rn.f64 fd918, fd172, 0d3FE94E08EB13C451, fd914; +fma.rn.f64 fd919, fd1246, 0dBFE3965F49174D13, fd915; +fma.rn.f64 fd920, fd171, 0d3FE94E08EB13C451, fd916; +fma.rn.f64 fd921, fd173, 0d3FEF584F2CE43B84, fd917; +fma.rn.f64 fd922, fd176, 0dBFC9C4266041CA8F, fd918; +fma.rn.f64 fd923, fd1244, 0d3FEF584F2CE43B84, fd919; +fma.rn.f64 fd924, fd175, 0dBFC9C4266041CA8F, fd920; +fma.rn.f64 fd925, fd177, 0dBFEBFAA5C136B224, fd921; +fma.rn.f64 fd926, fd180, 0dBFDF0F2FF6705BEC, fd922; +fma.rn.f64 fd927, fd1241, 0dBFEBFAA5C136B224, fd923; +fma.rn.f64 fd928, fd179, 0dBFDF0F2FF6705BEC, fd924; +fma.rn.f64 fd929, fd181, 0d3FD63A3FCFACA412, fd925; +fma.rn.f64 fd930, fd184, 0d3FEE0210C26A6E6F, fd926; +fma.rn.f64 fd931, fd1239, 0d3FD63A3FCFACA412, fd927; +fma.rn.f64 fd932, fd183, 0d3FEE0210C26A6E6F, fd928; +fma.rn.f64 fd933, fd125, 0dBFEBFAA5C136B224, %62; +fma.rn.f64 fd937, fd129, 0d3FE0ED45EEA3B09F, fd933; +fma.rn.f64 fd1201, fd128, 0dBFDF0F2FF6705BEC, 0d0000000000000000; +fma.rn.f64 fd938, fd132, 0d3FEB2818007C19DF, fd1201; +fma.rn.f64 fd1200, fd1272, 0dBFEBFAA5C136B224, %63; +fma.rn.f64 fd939, fd1269, 0d3FE0ED45EEA3B09F, fd1200; 
+fma.rn.f64 fd1199, fd127, 0dBFDF0F2FF6705BEC, 0d0000000000000000; +fma.rn.f64 fd940, fd131, 0d3FEB2818007C19DF, fd1199; +fma.rn.f64 fd941, fd133, 0dBFA9EEB01776B57D, fd937; +fma.rn.f64 fd942, fd136, 0dBFEFF57C5208CCF9, fd938; +fma.rn.f64 fd943, fd1267, 0dBFA9EEB01776B57D, fd939; +fma.rn.f64 fd944, fd135, 0dBFEFF57C5208CCF9, fd940; +fma.rn.f64 fd945, fd137, 0dBFDC2F6AF3928A8E, fd941; +fma.rn.f64 fd946, fd140, 0d3FECBAD095F50378, fd942; +fma.rn.f64 fd947, fd1265, 0dBFDC2F6AF3928A8E, fd943; +fma.rn.f64 fd948, fd139, 0d3FECBAD095F50378, fd944; +fma.rn.f64 fd949, fd141, 0d3FEA43B1B1379AFF, fd945; +fma.rn.f64 fd950, fd144, 0dBFE247D447A27216, fd946; +fma.rn.f64 fd951, fd1262, 0d3FEA43B1B1379AFF, fd947; +fma.rn.f64 fd952, fd143, 0dBFE247D447A27216, fd948; +fma.rn.f64 fd953, fd145, 0dBFEFD5F830F860F9, fd949; +fma.rn.f64 fd954, fd148, 0d3FB9E62ACA53C49F, fd950; +fma.rn.f64 fd955, fd1260, 0dBFEFD5F830F860F9, fd951; +fma.rn.f64 fd956, fd147, 0d3FB9E62ACA53C49F, fd952; +fma.rn.f64 fd957, fd149, 0d3FED681A366A00FA, fd953; +fma.rn.f64 fd958, fd152, 0d3FD93D20572CA90B, fd954; +fma.rn.f64 fd959, fd1258, 0d3FED681A366A00FA, fd955; +fma.rn.f64 fd960, fd151, 0d3FD93D20572CA90B, fd956; +fma.rn.f64 fd961, fd153, 0dBFE3965F49174D13, fd957; +fma.rn.f64 fd962, fd156, 0dBFE94E08EB13C451, fd958; +fma.rn.f64 fd963, fd1255, 0dBFE3965F49174D13, fd959; +fma.rn.f64 fd964, fd155, 0dBFE94E08EB13C451, fd960; +fma.rn.f64 fd965, fd157, 0d3FC361FC440B478F, fd961; +fma.rn.f64 fd966, fd160, 0d3FEFA18852C3E08A, fd962; +fma.rn.f64 fd967, fd1253, 0d3FC361FC440B478F, fd963; +fma.rn.f64 fd968, fd159, 0d3FEFA18852C3E08A, fd964; +fma.rn.f64 fd969, fd161, 0d3FD63A3FCFACA412, fd965; +fma.rn.f64 fd970, fd164, 0dBFEE0210C26A6E6F, fd966; +fma.rn.f64 fd971, fd1251, 0d3FD63A3FCFACA412, fd967; +fma.rn.f64 fd972, fd163, 0dBFEE0210C26A6E6F, fd968; +fma.rn.f64 fd973, fd165, 0dBFE847BF1D5146CC, fd969; +fma.rn.f64 fd974, fd168, 0d3FE4D80B1AD9CCF6, fd970; +fma.rn.f64 fd975, fd1248, 0dBFE847BF1D5146CC, fd971; +fma.rn.f64 
fd976, fd167, 0d3FE4D80B1AD9CCF6, fd972; +fma.rn.f64 fd977, fd169, 0d3FEF584F2CE43B84, fd973; +fma.rn.f64 fd978, fd172, 0dBFC9C4266041CA8F, fd974; +fma.rn.f64 fd979, fd1246, 0d3FEF584F2CE43B84, fd975; +fma.rn.f64 fd980, fd171, 0dBFC9C4266041CA8F, fd976; +fma.rn.f64 fd981, fd173, 0dBFEE884F0CC22CCC, fd977; +fma.rn.f64 fd982, fd176, 0dBFD328C3F1B322CB, fd978; +fma.rn.f64 fd983, fd1244, 0dBFEE884F0CC22CCC, fd979; +fma.rn.f64 fd984, fd175, 0dBFD328C3F1B322CB, fd980; +fma.rn.f64 fd985, fd177, 0d3FE60C045A2E9729, fd981; +fma.rn.f64 fd986, fd180, 0d3FE73180A4B0D300, fd982; +fma.rn.f64 fd987, fd1241, 0d3FE60C045A2E9729, fd983; +fma.rn.f64 fd988, fd179, 0d3FE73180A4B0D300, fd984; +fma.rn.f64 fd989, fd181, 0dBFD00AB0EB2D7D94, fd985; +fma.rn.f64 fd990, fd184, 0dBFEEFA7CDDB128FA, fd986; +fma.rn.f64 fd991, fd1239, 0dBFD00AB0EB2D7D94, fd987; +fma.rn.f64 fd992, fd183, 0dBFEEFA7CDDB128FA, fd988; +fma.rn.f64 fd993, fd125, 0dBFEE884F0CC22CCC, %62; +fma.rn.f64 fd997, fd129, 0d3FEA43B1B1379AFF, fd993; +fma.rn.f64 fd1198, fd128, 0dBFD328C3F1B322CB, 0d0000000000000000; +fma.rn.f64 fd998, fd132, 0d3FE247D447A27216, fd1198; +fma.rn.f64 fd1197, fd1272, 0dBFEE884F0CC22CCC, %63; +fma.rn.f64 fd999, fd1269, 0d3FEA43B1B1379AFF, fd1197; +fma.rn.f64 fd1196, fd127, 0dBFD328C3F1B322CB, 0d0000000000000000; +fma.rn.f64 fd1000, fd131, 0d3FE247D447A27216, fd1196; +fma.rn.f64 fd1001, fd133, 0dBFE3965F49174D13, fd997; +fma.rn.f64 fd1002, fd136, 0dBFE94E08EB13C451, fd998; +fma.rn.f64 fd1003, fd1267, 0dBFE3965F49174D13, fd999; +fma.rn.f64 fd1004, fd135, 0dBFE94E08EB13C451, fd1000; +fma.rn.f64 fd1005, fd137, 0d3FD63A3FCFACA412, fd1001; +fma.rn.f64 fd1006, fd140, 0d3FEE0210C26A6E6F, fd1002; +fma.rn.f64 fd1007, fd1265, 0d3FD63A3FCFACA412, fd1003; +fma.rn.f64 fd1008, fd139, 0d3FEE0210C26A6E6F, fd1004; +fma.rn.f64 fd1009, fd141, 0dBFA9EEB01776B57D, fd1005; +fma.rn.f64 fd1010, fd144, 0dBFEFF57C5208CCF9, fd1006; +fma.rn.f64 fd1011, fd1262, 0dBFA9EEB01776B57D, fd1007; +fma.rn.f64 fd1012, fd143, 0dBFEFF57C5208CCF9, 
fd1008; +fma.rn.f64 fd1013, fd145, 0dBFD00AB0EB2D7D94, fd1009; +fma.rn.f64 fd1014, fd148, 0d3FEEFA7CDDB128FA, fd1010; +fma.rn.f64 fd1015, fd1260, 0dBFD00AB0EB2D7D94, fd1011; +fma.rn.f64 fd1016, fd147, 0d3FEEFA7CDDB128FA, fd1012; +fma.rn.f64 fd1017, fd149, 0d3FE0ED45EEA3B09F, fd1013; +fma.rn.f64 fd1018, fd152, 0dBFEB2818007C19DF, fd1014; +fma.rn.f64 fd1019, fd1258, 0d3FE0ED45EEA3B09F, fd1015; +fma.rn.f64 fd1020, fd151, 0dBFEB2818007C19DF, fd1016; +fma.rn.f64 fd1021, fd153, 0dBFE847BF1D5146CC, fd1017; +fma.rn.f64 fd1022, fd156, 0d3FE4D80B1AD9CCF6, fd1018; +fma.rn.f64 fd1023, fd1255, 0dBFE847BF1D5146CC, fd1019; +fma.rn.f64 fd1024, fd155, 0d3FE4D80B1AD9CCF6, fd1020; +fma.rn.f64 fd1025, fd157, 0d3FED681A366A00FA, fd1021; +fma.rn.f64 fd1026, fd160, 0dBFD93D20572CA90B, fd1022; +fma.rn.f64 fd1027, fd1253, 0d3FED681A366A00FA, fd1023; +fma.rn.f64 fd1028, fd159, 0dBFD93D20572CA90B, fd1024; +fma.rn.f64 fd1029, fd161, 0dBFEFD5F830F860F9, fd1025; +fma.rn.f64 fd1030, fd164, 0d3FB9E62ACA53C49F, fd1026; +fma.rn.f64 fd1031, fd1251, 0dBFEFD5F830F860F9, fd1027; +fma.rn.f64 fd1032, fd163, 0d3FB9E62ACA53C49F, fd1028; +fma.rn.f64 fd1033, fd165, 0d3FEF584F2CE43B84, fd1029; +fma.rn.f64 fd1034, fd168, 0d3FC9C4266041CA8F, fd1030; +fma.rn.f64 fd1035, fd1248, 0d3FEF584F2CE43B84, fd1031; +fma.rn.f64 fd1036, fd167, 0d3FC9C4266041CA8F, fd1032; +fma.rn.f64 fd1037, fd169, 0dBFEBFAA5C136B224, fd1033; +fma.rn.f64 fd1038, fd172, 0dBFDF0F2FF6705BEC, fd1034; +fma.rn.f64 fd1039, fd1246, 0dBFEBFAA5C136B224, fd1035; +fma.rn.f64 fd1040, fd171, 0dBFDF0F2FF6705BEC, fd1036; +fma.rn.f64 fd1041, fd173, 0d3FE60C045A2E9729, fd1037; +fma.rn.f64 fd1042, fd176, 0d3FE73180A4B0D300, fd1038; +fma.rn.f64 fd1043, fd1244, 0d3FE60C045A2E9729, fd1039; +fma.rn.f64 fd1044, fd175, 0d3FE73180A4B0D300, fd1040; +fma.rn.f64 fd1045, fd177, 0dBFDC2F6AF3928A8E, fd1041; +fma.rn.f64 fd1046, fd180, 0dBFECBAD095F50378, fd1042; +fma.rn.f64 fd1047, fd1241, 0dBFDC2F6AF3928A8E, fd1043; +fma.rn.f64 fd1048, fd179, 0dBFECBAD095F50378, fd1044; 
+fma.rn.f64 fd1049, fd181, 0d3FC361FC440B478F, fd1045; +fma.rn.f64 fd1050, fd184, 0d3FEFA18852C3E08A, fd1046; +fma.rn.f64 fd1051, fd1239, 0d3FC361FC440B478F, fd1047; +fma.rn.f64 fd1052, fd183, 0d3FEFA18852C3E08A, fd1048; +fma.rn.f64 fd1053, fd125, 0dBFEFD5F830F860F9, %62; +fma.rn.f64 fd1054, fd128, 0dBFB9E62ACA53C49F, 0d0000000000000000; +fma.rn.f64 fd1055, fd1272, 0dBFEFD5F830F860F9, %63; +fma.rn.f64 fd1056, fd127, 0dBFB9E62ACA53C49F, 0d0000000000000000; +fma.rn.f64 fd1057, fd129, 0d3FEF584F2CE43B84, fd1053; +fma.rn.f64 fd1058, fd132, 0d3FC9C4266041CA8F, fd1054; +fma.rn.f64 fd1059, fd1269, 0d3FEF584F2CE43B84, fd1055; +fma.rn.f64 fd1060, fd131, 0d3FC9C4266041CA8F, fd1056; +fma.rn.f64 fd1061, fd133, 0dBFEE884F0CC22CCC, fd1057; +fma.rn.f64 fd1062, fd136, 0dBFD328C3F1B322CB, fd1058; +fma.rn.f64 fd1063, fd1267, 0dBFEE884F0CC22CCC, fd1059; +fma.rn.f64 fd1064, fd135, 0dBFD328C3F1B322CB, fd1060; +fma.rn.f64 fd1065, fd137, 0d3FED681A366A00FA, fd1061; +fma.rn.f64 fd1066, fd140, 0d3FD93D20572CA90B, fd1062; +fma.rn.f64 fd1067, fd1265, 0d3FED681A366A00FA, fd1063; +fma.rn.f64 fd1068, fd139, 0d3FD93D20572CA90B, fd1064; +fma.rn.f64 fd1069, fd141, 0dBFEBFAA5C136B224, fd1065; +fma.rn.f64 fd1070, fd144, 0dBFDF0F2FF6705BEC, fd1066; +fma.rn.f64 fd1071, fd1262, 0dBFEBFAA5C136B224, fd1067; +fma.rn.f64 fd1072, fd143, 0dBFDF0F2FF6705BEC, fd1068; +fma.rn.f64 fd1073, fd145, 0d3FEA43B1B1379AFF, fd1069; +fma.rn.f64 fd1074, fd148, 0d3FE247D447A27216, fd1070; +fma.rn.f64 fd1075, fd1260, 0d3FEA43B1B1379AFF, fd1071; +fma.rn.f64 fd1076, fd147, 0d3FE247D447A27216, fd1072; +fma.rn.f64 fd1077, fd149, 0dBFE847BF1D5146CC, fd1073; +fma.rn.f64 fd1078, fd152, 0dBFE4D80B1AD9CCF6, fd1074; +fma.rn.f64 fd1079, fd1258, 0dBFE847BF1D5146CC, fd1075; +fma.rn.f64 fd1080, fd151, 0dBFE4D80B1AD9CCF6, fd1076; +fma.rn.f64 fd1081, fd153, 0d3FE60C045A2E9729, fd1077; +fma.rn.f64 fd1082, fd156, 0d3FE73180A4B0D300, fd1078; +fma.rn.f64 fd1083, fd1255, 0d3FE60C045A2E9729, fd1079; +fma.rn.f64 fd1084, fd155, 0d3FE73180A4B0D300, 
fd1080; +fma.rn.f64 fd1085, fd157, 0dBFE3965F49174D13, fd1081; +fma.rn.f64 fd1086, fd160, 0dBFE94E08EB13C451, fd1082; +fma.rn.f64 fd1087, fd1253, 0dBFE3965F49174D13, fd1083; +fma.rn.f64 fd1088, fd159, 0dBFE94E08EB13C451, fd1084; +fma.rn.f64 fd1089, fd161, 0d3FE0ED45EEA3B09F, fd1085; +fma.rn.f64 fd1090, fd164, 0d3FEB2818007C19DF, fd1086; +fma.rn.f64 fd1091, fd1251, 0d3FE0ED45EEA3B09F, fd1087; +fma.rn.f64 fd1092, fd163, 0d3FEB2818007C19DF, fd1088; +fma.rn.f64 fd1093, fd165, 0dBFDC2F6AF3928A8E, fd1089; +fma.rn.f64 fd1094, fd168, 0dBFECBAD095F50378, fd1090; +fma.rn.f64 fd1095, fd1248, 0dBFDC2F6AF3928A8E, fd1091; +fma.rn.f64 fd1096, fd167, 0dBFECBAD095F50378, fd1092; +fma.rn.f64 fd1097, fd169, 0d3FD63A3FCFACA412, fd1093; +fma.rn.f64 fd1098, fd172, 0d3FEE0210C26A6E6F, fd1094; +fma.rn.f64 fd1099, fd1246, 0d3FD63A3FCFACA412, fd1095; +fma.rn.f64 fd1100, fd171, 0d3FEE0210C26A6E6F, fd1096; +fma.rn.f64 fd1101, fd173, 0dBFD00AB0EB2D7D94, fd1097; +fma.rn.f64 fd1102, fd176, 0dBFEEFA7CDDB128FA, fd1098; +fma.rn.f64 fd1103, fd1244, 0dBFD00AB0EB2D7D94, fd1099; +fma.rn.f64 fd1104, fd175, 0dBFEEFA7CDDB128FA, fd1100; +fma.rn.f64 fd1105, fd177, 0d3FC361FC440B478F, fd1101; +fma.rn.f64 fd1106, fd180, 0d3FEFA18852C3E08A, fd1102; +fma.rn.f64 fd1107, fd1241, 0d3FC361FC440B478F, fd1103; +fma.rn.f64 fd1108, fd179, 0d3FEFA18852C3E08A, fd1104; +fma.rn.f64 fd1109, fd181, 0dBFA9EEB01776B57D, fd1105; +fma.rn.f64 fd1110, fd184, 0dBFEFF57C5208CCF9, fd1106; +fma.rn.f64 fd1111, fd1239, 0dBFA9EEB01776B57D, fd1107; +fma.rn.f64 fd1112, fd183, 0dBFEFF57C5208CCF9, fd1108; +add.f64 %1, fd212, fd1239; +add.f64 %0, fd211, fd181; +sub.f64 %2, fd269, fd270; +add.f64 %3, fd271, fd272; +sub.f64 %4, fd329, fd330; +add.f64 %5, fd331, fd332; +sub.f64 %6, fd389, fd390; +add.f64 %7, fd391, fd392; +add.f64 %9, fd451, fd452; +sub.f64 %8, fd449, fd450; +add.f64 %11, fd511, fd512; +sub.f64 %10, fd509, fd510; +add.f64 %13, fd571, fd572; +sub.f64 %12, fd569, fd570; +sub.f64 %14, fd629, fd630; +add.f64 %15, fd631, fd632; 
+sub.f64 %16, fd689, fd690; +add.f64 %17, fd691, fd692; +sub.f64 %18, fd749, fd750; +add.f64 %19, fd751, fd752; +add.f64 %21, fd811, fd812; +sub.f64 %20, fd809, fd810; +add.f64 %23, fd871, fd872; +sub.f64 %22, fd869, fd870; +add.f64 %25, fd931, fd932; +sub.f64 %24, fd929, fd930; +sub.f64 %26, fd989, fd990; +add.f64 %27, fd991, fd992; +sub.f64 %28, fd1049, fd1050; +add.f64 %29, fd1051, fd1052; +sub.f64 %30, fd1109, fd1110; +add.f64 %31, fd1111, fd1112; +sub.f64 %33, fd1111, fd1112; +add.f64 %32, fd1109, fd1110; +sub.f64 %35, fd1051, fd1052; +add.f64 %34, fd1049, fd1050; +sub.f64 %37, fd991, fd992; +add.f64 %36, fd989, fd990; +sub.f64 %39, fd931, fd932; +add.f64 %38, fd929, fd930; +sub.f64 %41, fd871, fd872; +add.f64 %40, fd869, fd870; +sub.f64 %43, fd811, fd812; +add.f64 %42, fd809, fd810; +sub.f64 %45, fd751, fd752; +add.f64 %44, fd749, fd750; +sub.f64 %47, fd691, fd692; +add.f64 %46, fd689, fd690; +sub.f64 %49, fd631, fd632; +add.f64 %48, fd629, fd630; +sub.f64 %51, fd571, fd572; +add.f64 %50, fd569, fd570; +sub.f64 %53, fd511, fd512; +add.f64 %52, fd509, fd510; +sub.f64 %55, fd451, fd452; +add.f64 %54, fd449, fd450; +sub.f64 %57, fd391, fd392; +add.f64 %56, fd389, fd390; +sub.f64 %59, fd331, fd332; +add.f64 %58, fd329, fd330; +sub.f64 %61, fd271, fd272; +add.f64 %60, fd269, fd270; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), 
"=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), "=d"(rmem[28].y), "=d"(rmem[29].x), "=d"(rmem[29].y), "=d"(rmem[30].x), "=d"(rmem[30].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[29].x), "d"(rmem[29].y), "d"(rmem[30].x), "d"(rmem[30].y), "d"(rmem[2].y), "d"(rmem[29].y), "d"(rmem[28].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[26].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[23].y), "d"(rmem[22].y), "d"(rmem[10].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[19].y), "d"(rmem[13].y), "d"(rmem[14].y), "d"(rmem[17].y), "d"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..edc56a8b15035 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_31_fp64_inv.hpp.inc @@ -0,0 +1,1066 @@ +#ifndef CUFFTDX_FFT_31_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_31_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<590, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<1273>; +.reg .b64 rd<4>; +add.f64 fd125, %64, %123; +sub.f64 fd127, %64, %123; +add.f64 fd1272, %66, %124; +sub.f64 fd128, %66, %124; +add.f64 fd129, %67, %121; +sub.f64 fd131, %67, %121; +add.f64 fd1269, %125, %126; +sub.f64 fd132, %125, %126; +add.f64 fd133, %69, %119; +sub.f64 fd135, %69, %119; +add.f64 fd1267, %70, %127; +sub.f64 fd136, %70, %127; +add.f64 fd137, %71, %117; +sub.f64 fd139, %71, %117; +add.f64 fd1265, %128, %118; +sub.f64 fd140, %128, %118; +add.f64 fd141, %73, %115; +sub.f64 fd143, %73, %115; +add.f64 fd1262, %129, %130; +sub.f64 fd144, %129, %130; +add.f64 fd145, %75, %113; +sub.f64 fd147, %75, %113; +add.f64 fd1260, %76, %131; +sub.f64 fd148, %76, %131; +add.f64 fd149, %77, %111; +sub.f64 fd151, %77, %111; +add.f64 fd1258, %132, %112; +sub.f64 fd152, %132, %112; +add.f64 fd153, %79, %109; +sub.f64 fd155, %79, %109; +add.f64 fd1255, %133, %134; +sub.f64 fd156, %133, %134; +add.f64 fd157, %81, %107; +sub.f64 fd159, %81, %107; +add.f64 fd1253, %82, %135; +sub.f64 fd160, %82, %135; +add.f64 fd161, %83, %105; +sub.f64 fd163, %83, %105; +add.f64 fd1251, %136, %106; +sub.f64 fd164, %136, %106; +add.f64 fd165, %85, %103; +sub.f64 fd167, %85, %103; +add.f64 fd1248, %137, %138; +sub.f64 fd168, %137, %138; +add.f64 fd169, %87, %101; +sub.f64 fd171, %87, %101; +add.f64 fd1246, %88, %139; +sub.f64 fd172, %88, %139; +add.f64 fd173, %89, %99; +sub.f64 fd175, %89, 
%99; +add.f64 fd1244, %140, %100; +sub.f64 fd176, %140, %100; +add.f64 fd177, %91, %97; +sub.f64 fd179, %91, %97; +add.f64 fd1241, %141, %142; +sub.f64 fd180, %141, %142; +add.f64 fd181, %93, %95; +sub.f64 fd183, %93, %95; +add.f64 fd1239, %94, %143; +sub.f64 fd184, %94, %143; +add.f64 fd185, %62, fd125; +add.f64 fd187, fd185, fd129; +add.f64 fd1238, %63, fd1272; +add.f64 fd188, fd1238, fd1269; +add.f64 fd189, fd187, fd133; +add.f64 fd190, fd188, fd1267; +add.f64 fd191, fd189, fd137; +add.f64 fd192, fd190, fd1265; +add.f64 fd193, fd191, fd141; +add.f64 fd194, fd192, fd1262; +add.f64 fd195, fd193, fd145; +add.f64 fd196, fd194, fd1260; +add.f64 fd197, fd195, fd149; +add.f64 fd198, fd196, fd1258; +add.f64 fd199, fd197, fd153; +add.f64 fd200, fd198, fd1255; +add.f64 fd201, fd199, fd157; +add.f64 fd202, fd200, fd1253; +add.f64 fd203, fd201, fd161; +add.f64 fd204, fd202, fd1251; +add.f64 fd205, fd203, fd165; +add.f64 fd206, fd204, fd1248; +add.f64 fd207, fd205, fd169; +add.f64 fd208, fd206, fd1246; +add.f64 fd209, fd207, fd173; +add.f64 fd210, fd208, fd1244; +add.f64 fd211, fd209, fd177; +add.f64 fd212, fd210, fd1241; +fma.rn.f64 fd213, fd125, 0d3FEF584F2CE43B84, %62; +fma.rn.f64 fd217, fd129, 0d3FED681A366A00FA, fd213; +fma.rn.f64 fd1237, fd128, 0d3FC9C4266041CA8F, 0d0000000000000000; +fma.rn.f64 fd218, fd132, 0d3FD93D20572CA90B, fd1237; +fma.rn.f64 fd1236, fd1272, 0d3FEF584F2CE43B84, %63; +fma.rn.f64 fd219, fd1269, 0d3FED681A366A00FA, fd1236; +fma.rn.f64 fd1235, fd127, 0d3FC9C4266041CA8F, 0d0000000000000000; +fma.rn.f64 fd220, fd131, 0d3FD93D20572CA90B, fd1235; +fma.rn.f64 fd221, fd133, 0d3FEA43B1B1379AFF, fd217; +fma.rn.f64 fd222, fd136, 0d3FE247D447A27216, fd218; +fma.rn.f64 fd223, fd1267, 0d3FEA43B1B1379AFF, fd219; +fma.rn.f64 fd224, fd135, 0d3FE247D447A27216, fd220; +fma.rn.f64 fd225, fd137, 0d3FE60C045A2E9729, fd221; +fma.rn.f64 fd226, fd140, 0d3FE73180A4B0D300, fd222; +fma.rn.f64 fd227, fd1265, 0d3FE60C045A2E9729, fd223; +fma.rn.f64 fd228, fd139, 
0d3FE73180A4B0D300, fd224; +fma.rn.f64 fd229, fd141, 0d3FE0ED45EEA3B09F, fd225; +fma.rn.f64 fd230, fd144, 0d3FEB2818007C19DF, fd226; +fma.rn.f64 fd231, fd1262, 0d3FE0ED45EEA3B09F, fd227; +fma.rn.f64 fd232, fd143, 0d3FEB2818007C19DF, fd228; +fma.rn.f64 fd233, fd145, 0d3FD63A3FCFACA412, fd229; +fma.rn.f64 fd234, fd148, 0d3FEE0210C26A6E6F, fd230; +fma.rn.f64 fd235, fd1260, 0d3FD63A3FCFACA412, fd231; +fma.rn.f64 fd236, fd147, 0d3FEE0210C26A6E6F, fd232; +fma.rn.f64 fd237, fd149, 0d3FC361FC440B478F, fd233; +fma.rn.f64 fd238, fd152, 0d3FEFA18852C3E08A, fd234; +fma.rn.f64 fd239, fd1258, 0d3FC361FC440B478F, fd235; +fma.rn.f64 fd240, fd151, 0d3FEFA18852C3E08A, fd236; +fma.rn.f64 fd241, fd153, 0dBFA9EEB01776B57D, fd237; +fma.rn.f64 fd242, fd156, 0d3FEFF57C5208CCF9, fd238; +fma.rn.f64 fd243, fd1255, 0dBFA9EEB01776B57D, fd239; +fma.rn.f64 fd244, fd155, 0d3FEFF57C5208CCF9, fd240; +fma.rn.f64 fd245, fd157, 0dBFD00AB0EB2D7D94, fd241; +fma.rn.f64 fd246, fd160, 0d3FEEFA7CDDB128FA, fd242; +fma.rn.f64 fd247, fd1253, 0dBFD00AB0EB2D7D94, fd243; +fma.rn.f64 fd248, fd159, 0d3FEEFA7CDDB128FA, fd244; +fma.rn.f64 fd249, fd161, 0dBFDC2F6AF3928A8E, fd245; +fma.rn.f64 fd250, fd164, 0d3FECBAD095F50378, fd246; +fma.rn.f64 fd251, fd1251, 0dBFDC2F6AF3928A8E, fd247; +fma.rn.f64 fd252, fd163, 0d3FECBAD095F50378, fd248; +fma.rn.f64 fd253, fd165, 0dBFE3965F49174D13, fd249; +fma.rn.f64 fd254, fd168, 0d3FE94E08EB13C451, fd250; +fma.rn.f64 fd255, fd1248, 0dBFE3965F49174D13, fd251; +fma.rn.f64 fd256, fd167, 0d3FE94E08EB13C451, fd252; +fma.rn.f64 fd257, fd169, 0dBFE847BF1D5146CC, fd253; +fma.rn.f64 fd258, fd172, 0d3FE4D80B1AD9CCF6, fd254; +fma.rn.f64 fd259, fd1246, 0dBFE847BF1D5146CC, fd255; +fma.rn.f64 fd260, fd171, 0d3FE4D80B1AD9CCF6, fd256; +fma.rn.f64 fd261, fd173, 0dBFEBFAA5C136B224, fd257; +fma.rn.f64 fd262, fd176, 0d3FDF0F2FF6705BEC, fd258; +fma.rn.f64 fd263, fd1244, 0dBFEBFAA5C136B224, fd259; +fma.rn.f64 fd264, fd175, 0d3FDF0F2FF6705BEC, fd260; +fma.rn.f64 fd265, fd177, 0dBFEE884F0CC22CCC, fd261; 
+fma.rn.f64 fd266, fd180, 0d3FD328C3F1B322CB, fd262; +fma.rn.f64 fd267, fd1241, 0dBFEE884F0CC22CCC, fd263; +fma.rn.f64 fd268, fd179, 0d3FD328C3F1B322CB, fd264; +fma.rn.f64 fd269, fd181, 0dBFEFD5F830F860F9, fd265; +fma.rn.f64 fd270, fd184, 0d3FB9E62ACA53C49F, fd266; +fma.rn.f64 fd271, fd1239, 0dBFEFD5F830F860F9, fd267; +fma.rn.f64 fd272, fd183, 0d3FB9E62ACA53C49F, fd268; +fma.rn.f64 fd273, fd125, 0d3FED681A366A00FA, %62; +fma.rn.f64 fd277, fd129, 0d3FE60C045A2E9729, fd273; +fma.rn.f64 fd1234, fd128, 0d3FD93D20572CA90B, 0d0000000000000000; +fma.rn.f64 fd278, fd132, 0d3FE73180A4B0D300, fd1234; +fma.rn.f64 fd1233, fd1272, 0d3FED681A366A00FA, %63; +fma.rn.f64 fd279, fd1269, 0d3FE60C045A2E9729, fd1233; +fma.rn.f64 fd1232, fd127, 0d3FD93D20572CA90B, 0d0000000000000000; +fma.rn.f64 fd280, fd131, 0d3FE73180A4B0D300, fd1232; +fma.rn.f64 fd281, fd133, 0d3FD63A3FCFACA412, fd277; +fma.rn.f64 fd282, fd136, 0d3FEE0210C26A6E6F, fd278; +fma.rn.f64 fd283, fd1267, 0d3FD63A3FCFACA412, fd279; +fma.rn.f64 fd284, fd135, 0d3FEE0210C26A6E6F, fd280; +fma.rn.f64 fd285, fd137, 0dBFA9EEB01776B57D, fd281; +fma.rn.f64 fd286, fd140, 0d3FEFF57C5208CCF9, fd282; +fma.rn.f64 fd287, fd1265, 0dBFA9EEB01776B57D, fd283; +fma.rn.f64 fd288, fd139, 0d3FEFF57C5208CCF9, fd284; +fma.rn.f64 fd289, fd141, 0dBFDC2F6AF3928A8E, fd285; +fma.rn.f64 fd290, fd144, 0d3FECBAD095F50378, fd286; +fma.rn.f64 fd291, fd1262, 0dBFDC2F6AF3928A8E, fd287; +fma.rn.f64 fd292, fd143, 0d3FECBAD095F50378, fd288; +fma.rn.f64 fd293, fd145, 0dBFE847BF1D5146CC, fd289; +fma.rn.f64 fd294, fd148, 0d3FE4D80B1AD9CCF6, fd290; +fma.rn.f64 fd295, fd1260, 0dBFE847BF1D5146CC, fd291; +fma.rn.f64 fd296, fd147, 0d3FE4D80B1AD9CCF6, fd292; +fma.rn.f64 fd297, fd149, 0dBFEE884F0CC22CCC, fd293; +fma.rn.f64 fd298, fd152, 0d3FD328C3F1B322CB, fd294; +fma.rn.f64 fd299, fd1258, 0dBFEE884F0CC22CCC, fd295; +fma.rn.f64 fd300, fd151, 0d3FD328C3F1B322CB, fd296; +fma.rn.f64 fd301, fd153, 0dBFEFD5F830F860F9, fd297; +fma.rn.f64 fd302, fd156, 0dBFB9E62ACA53C49F, fd298; 
+fma.rn.f64 fd303, fd1255, 0dBFEFD5F830F860F9, fd299; +fma.rn.f64 fd304, fd155, 0dBFB9E62ACA53C49F, fd300; +fma.rn.f64 fd305, fd157, 0dBFEBFAA5C136B224, fd301; +fma.rn.f64 fd306, fd160, 0dBFDF0F2FF6705BEC, fd302; +fma.rn.f64 fd307, fd1253, 0dBFEBFAA5C136B224, fd303; +fma.rn.f64 fd308, fd159, 0dBFDF0F2FF6705BEC, fd304; +fma.rn.f64 fd309, fd161, 0dBFE3965F49174D13, fd305; +fma.rn.f64 fd310, fd164, 0dBFE94E08EB13C451, fd306; +fma.rn.f64 fd311, fd1251, 0dBFE3965F49174D13, fd307; +fma.rn.f64 fd312, fd163, 0dBFE94E08EB13C451, fd308; +fma.rn.f64 fd313, fd165, 0dBFD00AB0EB2D7D94, fd309; +fma.rn.f64 fd314, fd168, 0dBFEEFA7CDDB128FA, fd310; +fma.rn.f64 fd315, fd1248, 0dBFD00AB0EB2D7D94, fd311; +fma.rn.f64 fd316, fd167, 0dBFEEFA7CDDB128FA, fd312; +fma.rn.f64 fd317, fd169, 0d3FC361FC440B478F, fd313; +fma.rn.f64 fd318, fd172, 0dBFEFA18852C3E08A, fd314; +fma.rn.f64 fd319, fd1246, 0d3FC361FC440B478F, fd315; +fma.rn.f64 fd320, fd171, 0dBFEFA18852C3E08A, fd316; +fma.rn.f64 fd321, fd173, 0d3FE0ED45EEA3B09F, fd317; +fma.rn.f64 fd322, fd176, 0dBFEB2818007C19DF, fd318; +fma.rn.f64 fd323, fd1244, 0d3FE0ED45EEA3B09F, fd319; +fma.rn.f64 fd324, fd175, 0dBFEB2818007C19DF, fd320; +fma.rn.f64 fd325, fd177, 0d3FEA43B1B1379AFF, fd321; +fma.rn.f64 fd326, fd180, 0dBFE247D447A27216, fd322; +fma.rn.f64 fd327, fd1241, 0d3FEA43B1B1379AFF, fd323; +fma.rn.f64 fd328, fd179, 0dBFE247D447A27216, fd324; +fma.rn.f64 fd329, fd181, 0d3FEF584F2CE43B84, fd325; +fma.rn.f64 fd330, fd184, 0dBFC9C4266041CA8F, fd326; +fma.rn.f64 fd331, fd1239, 0d3FEF584F2CE43B84, fd327; +fma.rn.f64 fd332, fd183, 0dBFC9C4266041CA8F, fd328; +fma.rn.f64 fd333, fd125, 0d3FEA43B1B1379AFF, %62; +fma.rn.f64 fd337, fd129, 0d3FD63A3FCFACA412, fd333; +fma.rn.f64 fd1231, fd128, 0d3FE247D447A27216, 0d0000000000000000; +fma.rn.f64 fd338, fd132, 0d3FEE0210C26A6E6F, fd1231; +fma.rn.f64 fd1230, fd1272, 0d3FEA43B1B1379AFF, %63; +fma.rn.f64 fd339, fd1269, 0d3FD63A3FCFACA412, fd1230; +fma.rn.f64 fd1229, fd127, 0d3FE247D447A27216, 0d0000000000000000; 
+fma.rn.f64 fd340, fd131, 0d3FEE0210C26A6E6F, fd1229; +fma.rn.f64 fd341, fd133, 0dBFD00AB0EB2D7D94, fd337; +fma.rn.f64 fd342, fd136, 0d3FEEFA7CDDB128FA, fd338; +fma.rn.f64 fd343, fd1267, 0dBFD00AB0EB2D7D94, fd339; +fma.rn.f64 fd344, fd135, 0d3FEEFA7CDDB128FA, fd340; +fma.rn.f64 fd345, fd137, 0dBFE847BF1D5146CC, fd341; +fma.rn.f64 fd346, fd140, 0d3FE4D80B1AD9CCF6, fd342; +fma.rn.f64 fd347, fd1265, 0dBFE847BF1D5146CC, fd343; +fma.rn.f64 fd348, fd139, 0d3FE4D80B1AD9CCF6, fd344; +fma.rn.f64 fd349, fd141, 0dBFEFD5F830F860F9, fd345; +fma.rn.f64 fd350, fd144, 0d3FB9E62ACA53C49F, fd346; +fma.rn.f64 fd351, fd1262, 0dBFEFD5F830F860F9, fd347; +fma.rn.f64 fd352, fd143, 0d3FB9E62ACA53C49F, fd348; +fma.rn.f64 fd353, fd145, 0dBFEBFAA5C136B224, fd349; +fma.rn.f64 fd354, fd148, 0dBFDF0F2FF6705BEC, fd350; +fma.rn.f64 fd355, fd1260, 0dBFEBFAA5C136B224, fd351; +fma.rn.f64 fd356, fd147, 0dBFDF0F2FF6705BEC, fd352; +fma.rn.f64 fd357, fd149, 0dBFDC2F6AF3928A8E, fd353; +fma.rn.f64 fd358, fd152, 0dBFECBAD095F50378, fd354; +fma.rn.f64 fd359, fd1258, 0dBFDC2F6AF3928A8E, fd355; +fma.rn.f64 fd360, fd151, 0dBFECBAD095F50378, fd356; +fma.rn.f64 fd361, fd153, 0d3FC361FC440B478F, fd357; +fma.rn.f64 fd362, fd156, 0dBFEFA18852C3E08A, fd358; +fma.rn.f64 fd363, fd1255, 0d3FC361FC440B478F, fd359; +fma.rn.f64 fd364, fd155, 0dBFEFA18852C3E08A, fd360; +fma.rn.f64 fd365, fd157, 0d3FE60C045A2E9729, fd361; +fma.rn.f64 fd366, fd160, 0dBFE73180A4B0D300, fd362; +fma.rn.f64 fd367, fd1253, 0d3FE60C045A2E9729, fd363; +fma.rn.f64 fd368, fd159, 0dBFE73180A4B0D300, fd364; +fma.rn.f64 fd369, fd161, 0d3FEF584F2CE43B84, fd365; +fma.rn.f64 fd370, fd164, 0dBFC9C4266041CA8F, fd366; +fma.rn.f64 fd371, fd1251, 0d3FEF584F2CE43B84, fd367; +fma.rn.f64 fd372, fd163, 0dBFC9C4266041CA8F, fd368; +fma.rn.f64 fd373, fd165, 0d3FED681A366A00FA, fd369; +fma.rn.f64 fd374, fd168, 0d3FD93D20572CA90B, fd370; +fma.rn.f64 fd375, fd1248, 0d3FED681A366A00FA, fd371; +fma.rn.f64 fd376, fd167, 0d3FD93D20572CA90B, fd372; +fma.rn.f64 fd377, fd169, 
0d3FE0ED45EEA3B09F, fd373; +fma.rn.f64 fd378, fd172, 0d3FEB2818007C19DF, fd374; +fma.rn.f64 fd379, fd1246, 0d3FE0ED45EEA3B09F, fd375; +fma.rn.f64 fd380, fd171, 0d3FEB2818007C19DF, fd376; +fma.rn.f64 fd381, fd173, 0dBFA9EEB01776B57D, fd377; +fma.rn.f64 fd382, fd176, 0d3FEFF57C5208CCF9, fd378; +fma.rn.f64 fd383, fd1244, 0dBFA9EEB01776B57D, fd379; +fma.rn.f64 fd384, fd175, 0d3FEFF57C5208CCF9, fd380; +fma.rn.f64 fd385, fd177, 0dBFE3965F49174D13, fd381; +fma.rn.f64 fd386, fd180, 0d3FE94E08EB13C451, fd382; +fma.rn.f64 fd387, fd1241, 0dBFE3965F49174D13, fd383; +fma.rn.f64 fd388, fd179, 0d3FE94E08EB13C451, fd384; +fma.rn.f64 fd389, fd181, 0dBFEE884F0CC22CCC, fd385; +fma.rn.f64 fd390, fd184, 0d3FD328C3F1B322CB, fd386; +fma.rn.f64 fd391, fd1239, 0dBFEE884F0CC22CCC, fd387; +fma.rn.f64 fd392, fd183, 0d3FD328C3F1B322CB, fd388; +fma.rn.f64 fd393, fd125, 0d3FE60C045A2E9729, %62; +fma.rn.f64 fd397, fd129, 0dBFA9EEB01776B57D, fd393; +fma.rn.f64 fd1228, fd128, 0d3FE73180A4B0D300, 0d0000000000000000; +fma.rn.f64 fd398, fd132, 0d3FEFF57C5208CCF9, fd1228; +fma.rn.f64 fd1227, fd1272, 0d3FE60C045A2E9729, %63; +fma.rn.f64 fd399, fd1269, 0dBFA9EEB01776B57D, fd1227; +fma.rn.f64 fd1226, fd127, 0d3FE73180A4B0D300, 0d0000000000000000; +fma.rn.f64 fd400, fd131, 0d3FEFF57C5208CCF9, fd1226; +fma.rn.f64 fd401, fd133, 0dBFE847BF1D5146CC, fd397; +fma.rn.f64 fd402, fd136, 0d3FE4D80B1AD9CCF6, fd398; +fma.rn.f64 fd403, fd1267, 0dBFE847BF1D5146CC, fd399; +fma.rn.f64 fd404, fd135, 0d3FE4D80B1AD9CCF6, fd400; +fma.rn.f64 fd405, fd137, 0dBFEFD5F830F860F9, fd401; +fma.rn.f64 fd406, fd140, 0dBFB9E62ACA53C49F, fd402; +fma.rn.f64 fd407, fd1265, 0dBFEFD5F830F860F9, fd403; +fma.rn.f64 fd408, fd139, 0dBFB9E62ACA53C49F, fd404; +fma.rn.f64 fd409, fd141, 0dBFE3965F49174D13, fd405; +fma.rn.f64 fd410, fd144, 0dBFE94E08EB13C451, fd406; +fma.rn.f64 fd411, fd1262, 0dBFE3965F49174D13, fd407; +fma.rn.f64 fd412, fd143, 0dBFE94E08EB13C451, fd408; +fma.rn.f64 fd413, fd145, 0d3FC361FC440B478F, fd409; +fma.rn.f64 fd414, fd148, 
0dBFEFA18852C3E08A, fd410; +fma.rn.f64 fd415, fd1260, 0d3FC361FC440B478F, fd411; +fma.rn.f64 fd416, fd147, 0dBFEFA18852C3E08A, fd412; +fma.rn.f64 fd417, fd149, 0d3FEA43B1B1379AFF, fd413; +fma.rn.f64 fd418, fd152, 0dBFE247D447A27216, fd414; +fma.rn.f64 fd419, fd1258, 0d3FEA43B1B1379AFF, fd415; +fma.rn.f64 fd420, fd151, 0dBFE247D447A27216, fd416; +fma.rn.f64 fd421, fd153, 0d3FEF584F2CE43B84, fd417; +fma.rn.f64 fd422, fd156, 0d3FC9C4266041CA8F, fd418; +fma.rn.f64 fd423, fd1255, 0d3FEF584F2CE43B84, fd419; +fma.rn.f64 fd424, fd155, 0d3FC9C4266041CA8F, fd420; +fma.rn.f64 fd425, fd157, 0d3FE0ED45EEA3B09F, fd421; +fma.rn.f64 fd426, fd160, 0d3FEB2818007C19DF, fd422; +fma.rn.f64 fd427, fd1253, 0d3FE0ED45EEA3B09F, fd423; +fma.rn.f64 fd428, fd159, 0d3FEB2818007C19DF, fd424; +fma.rn.f64 fd429, fd161, 0dBFD00AB0EB2D7D94, fd425; +fma.rn.f64 fd430, fd164, 0d3FEEFA7CDDB128FA, fd426; +fma.rn.f64 fd431, fd1251, 0dBFD00AB0EB2D7D94, fd427; +fma.rn.f64 fd432, fd163, 0d3FEEFA7CDDB128FA, fd428; +fma.rn.f64 fd433, fd165, 0dBFEBFAA5C136B224, fd429; +fma.rn.f64 fd434, fd168, 0d3FDF0F2FF6705BEC, fd430; +fma.rn.f64 fd435, fd1248, 0dBFEBFAA5C136B224, fd431; +fma.rn.f64 fd436, fd167, 0d3FDF0F2FF6705BEC, fd432; +fma.rn.f64 fd437, fd169, 0dBFEE884F0CC22CCC, fd433; +fma.rn.f64 fd438, fd172, 0dBFD328C3F1B322CB, fd434; +fma.rn.f64 fd439, fd1246, 0dBFEE884F0CC22CCC, fd435; +fma.rn.f64 fd440, fd171, 0dBFD328C3F1B322CB, fd436; +fma.rn.f64 fd441, fd173, 0dBFDC2F6AF3928A8E, fd437; +fma.rn.f64 fd442, fd176, 0dBFECBAD095F50378, fd438; +fma.rn.f64 fd443, fd1244, 0dBFDC2F6AF3928A8E, fd439; +fma.rn.f64 fd444, fd175, 0dBFECBAD095F50378, fd440; +fma.rn.f64 fd445, fd177, 0d3FD63A3FCFACA412, fd441; +fma.rn.f64 fd446, fd180, 0dBFEE0210C26A6E6F, fd442; +fma.rn.f64 fd447, fd1241, 0d3FD63A3FCFACA412, fd443; +fma.rn.f64 fd448, fd179, 0dBFEE0210C26A6E6F, fd444; +fma.rn.f64 fd449, fd181, 0d3FED681A366A00FA, fd445; +fma.rn.f64 fd450, fd184, 0dBFD93D20572CA90B, fd446; +fma.rn.f64 fd451, fd1239, 0d3FED681A366A00FA, fd447; 
+fma.rn.f64 fd452, fd183, 0dBFD93D20572CA90B, fd448; +fma.rn.f64 fd453, fd125, 0d3FE0ED45EEA3B09F, %62; +fma.rn.f64 fd457, fd129, 0dBFDC2F6AF3928A8E, fd453; +fma.rn.f64 fd1225, fd128, 0d3FEB2818007C19DF, 0d0000000000000000; +fma.rn.f64 fd458, fd132, 0d3FECBAD095F50378, fd1225; +fma.rn.f64 fd1224, fd1272, 0d3FE0ED45EEA3B09F, %63; +fma.rn.f64 fd459, fd1269, 0dBFDC2F6AF3928A8E, fd1224; +fma.rn.f64 fd1223, fd127, 0d3FEB2818007C19DF, 0d0000000000000000; +fma.rn.f64 fd460, fd131, 0d3FECBAD095F50378, fd1223; +fma.rn.f64 fd461, fd133, 0dBFEFD5F830F860F9, fd457; +fma.rn.f64 fd462, fd136, 0d3FB9E62ACA53C49F, fd458; +fma.rn.f64 fd463, fd1267, 0dBFEFD5F830F860F9, fd459; +fma.rn.f64 fd464, fd135, 0d3FB9E62ACA53C49F, fd460; +fma.rn.f64 fd465, fd137, 0dBFE3965F49174D13, fd461; +fma.rn.f64 fd466, fd140, 0dBFE94E08EB13C451, fd462; +fma.rn.f64 fd467, fd1265, 0dBFE3965F49174D13, fd463; +fma.rn.f64 fd468, fd139, 0dBFE94E08EB13C451, fd464; +fma.rn.f64 fd469, fd141, 0d3FD63A3FCFACA412, fd465; +fma.rn.f64 fd470, fd144, 0dBFEE0210C26A6E6F, fd466; +fma.rn.f64 fd471, fd1262, 0d3FD63A3FCFACA412, fd467; +fma.rn.f64 fd472, fd143, 0dBFEE0210C26A6E6F, fd468; +fma.rn.f64 fd473, fd145, 0d3FEF584F2CE43B84, fd469; +fma.rn.f64 fd474, fd148, 0dBFC9C4266041CA8F, fd470; +fma.rn.f64 fd475, fd1260, 0d3FEF584F2CE43B84, fd471; +fma.rn.f64 fd476, fd147, 0dBFC9C4266041CA8F, fd472; +fma.rn.f64 fd477, fd149, 0d3FE60C045A2E9729, fd473; +fma.rn.f64 fd478, fd152, 0d3FE73180A4B0D300, fd474; +fma.rn.f64 fd479, fd1258, 0d3FE60C045A2E9729, fd475; +fma.rn.f64 fd480, fd151, 0d3FE73180A4B0D300, fd476; +fma.rn.f64 fd481, fd153, 0dBFD00AB0EB2D7D94, fd477; +fma.rn.f64 fd482, fd156, 0d3FEEFA7CDDB128FA, fd478; +fma.rn.f64 fd483, fd1255, 0dBFD00AB0EB2D7D94, fd479; +fma.rn.f64 fd484, fd155, 0d3FEEFA7CDDB128FA, fd480; +fma.rn.f64 fd485, fd157, 0dBFEE884F0CC22CCC, fd481; +fma.rn.f64 fd486, fd160, 0d3FD328C3F1B322CB, fd482; +fma.rn.f64 fd487, fd1253, 0dBFEE884F0CC22CCC, fd483; +fma.rn.f64 fd488, fd159, 0d3FD328C3F1B322CB, fd484; 
+fma.rn.f64 fd489, fd161, 0dBFE847BF1D5146CC, fd485; +fma.rn.f64 fd490, fd164, 0dBFE4D80B1AD9CCF6, fd486; +fma.rn.f64 fd491, fd1251, 0dBFE847BF1D5146CC, fd487; +fma.rn.f64 fd492, fd163, 0dBFE4D80B1AD9CCF6, fd488; +fma.rn.f64 fd493, fd165, 0d3FC361FC440B478F, fd489; +fma.rn.f64 fd494, fd168, 0dBFEFA18852C3E08A, fd490; +fma.rn.f64 fd495, fd1248, 0d3FC361FC440B478F, fd491; +fma.rn.f64 fd496, fd167, 0dBFEFA18852C3E08A, fd492; +fma.rn.f64 fd497, fd169, 0d3FED681A366A00FA, fd493; +fma.rn.f64 fd498, fd172, 0dBFD93D20572CA90B, fd494; +fma.rn.f64 fd499, fd1246, 0d3FED681A366A00FA, fd495; +fma.rn.f64 fd500, fd171, 0dBFD93D20572CA90B, fd496; +fma.rn.f64 fd501, fd173, 0d3FEA43B1B1379AFF, fd497; +fma.rn.f64 fd502, fd176, 0d3FE247D447A27216, fd498; +fma.rn.f64 fd503, fd1244, 0d3FEA43B1B1379AFF, fd499; +fma.rn.f64 fd504, fd175, 0d3FE247D447A27216, fd500; +fma.rn.f64 fd505, fd177, 0dBFA9EEB01776B57D, fd501; +fma.rn.f64 fd506, fd180, 0d3FEFF57C5208CCF9, fd502; +fma.rn.f64 fd507, fd1241, 0dBFA9EEB01776B57D, fd503; +fma.rn.f64 fd508, fd179, 0d3FEFF57C5208CCF9, fd504; +fma.rn.f64 fd509, fd181, 0dBFEBFAA5C136B224, fd505; +fma.rn.f64 fd510, fd184, 0d3FDF0F2FF6705BEC, fd506; +fma.rn.f64 fd511, fd1239, 0dBFEBFAA5C136B224, fd507; +fma.rn.f64 fd512, fd183, 0d3FDF0F2FF6705BEC, fd508; +fma.rn.f64 fd513, fd125, 0d3FD63A3FCFACA412, %62; +fma.rn.f64 fd517, fd129, 0dBFE847BF1D5146CC, fd513; +fma.rn.f64 fd1222, fd128, 0d3FEE0210C26A6E6F, 0d0000000000000000; +fma.rn.f64 fd518, fd132, 0d3FE4D80B1AD9CCF6, fd1222; +fma.rn.f64 fd1221, fd1272, 0d3FD63A3FCFACA412, %63; +fma.rn.f64 fd519, fd1269, 0dBFE847BF1D5146CC, fd1221; +fma.rn.f64 fd1220, fd127, 0d3FEE0210C26A6E6F, 0d0000000000000000; +fma.rn.f64 fd520, fd131, 0d3FE4D80B1AD9CCF6, fd1220; +fma.rn.f64 fd521, fd133, 0dBFEBFAA5C136B224, fd517; +fma.rn.f64 fd522, fd136, 0dBFDF0F2FF6705BEC, fd518; +fma.rn.f64 fd523, fd1267, 0dBFEBFAA5C136B224, fd519; +fma.rn.f64 fd524, fd135, 0dBFDF0F2FF6705BEC, fd520; +fma.rn.f64 fd525, fd137, 0d3FC361FC440B478F, fd521; 
+fma.rn.f64 fd526, fd140, 0dBFEFA18852C3E08A, fd522; +fma.rn.f64 fd527, fd1265, 0d3FC361FC440B478F, fd523; +fma.rn.f64 fd528, fd139, 0dBFEFA18852C3E08A, fd524; +fma.rn.f64 fd529, fd141, 0d3FEF584F2CE43B84, fd525; +fma.rn.f64 fd530, fd144, 0dBFC9C4266041CA8F, fd526; +fma.rn.f64 fd531, fd1262, 0d3FEF584F2CE43B84, fd527; +fma.rn.f64 fd532, fd143, 0dBFC9C4266041CA8F, fd528; +fma.rn.f64 fd533, fd145, 0d3FE0ED45EEA3B09F, fd529; +fma.rn.f64 fd534, fd148, 0d3FEB2818007C19DF, fd530; +fma.rn.f64 fd535, fd1260, 0d3FE0ED45EEA3B09F, fd531; +fma.rn.f64 fd536, fd147, 0d3FEB2818007C19DF, fd532; +fma.rn.f64 fd537, fd149, 0dBFE3965F49174D13, fd533; +fma.rn.f64 fd538, fd152, 0d3FE94E08EB13C451, fd534; +fma.rn.f64 fd539, fd1258, 0dBFE3965F49174D13, fd535; +fma.rn.f64 fd540, fd151, 0d3FE94E08EB13C451, fd536; +fma.rn.f64 fd541, fd153, 0dBFEE884F0CC22CCC, fd537; +fma.rn.f64 fd542, fd156, 0dBFD328C3F1B322CB, fd538; +fma.rn.f64 fd543, fd1255, 0dBFEE884F0CC22CCC, fd539; +fma.rn.f64 fd544, fd155, 0dBFD328C3F1B322CB, fd540; +fma.rn.f64 fd545, fd157, 0dBFA9EEB01776B57D, fd541; +fma.rn.f64 fd546, fd160, 0dBFEFF57C5208CCF9, fd542; +fma.rn.f64 fd547, fd1253, 0dBFA9EEB01776B57D, fd543; +fma.rn.f64 fd548, fd159, 0dBFEFF57C5208CCF9, fd544; +fma.rn.f64 fd549, fd161, 0d3FED681A366A00FA, fd545; +fma.rn.f64 fd550, fd164, 0dBFD93D20572CA90B, fd546; +fma.rn.f64 fd551, fd1251, 0d3FED681A366A00FA, fd547; +fma.rn.f64 fd552, fd163, 0dBFD93D20572CA90B, fd548; +fma.rn.f64 fd553, fd165, 0d3FE60C045A2E9729, fd549; +fma.rn.f64 fd554, fd168, 0d3FE73180A4B0D300, fd550; +fma.rn.f64 fd555, fd1248, 0d3FE60C045A2E9729, fd551; +fma.rn.f64 fd556, fd167, 0d3FE73180A4B0D300, fd552; +fma.rn.f64 fd557, fd169, 0dBFDC2F6AF3928A8E, fd553; +fma.rn.f64 fd558, fd172, 0d3FECBAD095F50378, fd554; +fma.rn.f64 fd559, fd1246, 0dBFDC2F6AF3928A8E, fd555; +fma.rn.f64 fd560, fd171, 0d3FECBAD095F50378, fd556; +fma.rn.f64 fd561, fd173, 0dBFEFD5F830F860F9, fd557; +fma.rn.f64 fd562, fd176, 0dBFB9E62ACA53C49F, fd558; +fma.rn.f64 fd563, fd1244, 
0dBFEFD5F830F860F9, fd559; +fma.rn.f64 fd564, fd175, 0dBFB9E62ACA53C49F, fd560; +fma.rn.f64 fd565, fd177, 0dBFD00AB0EB2D7D94, fd561; +fma.rn.f64 fd566, fd180, 0dBFEEFA7CDDB128FA, fd562; +fma.rn.f64 fd567, fd1241, 0dBFD00AB0EB2D7D94, fd563; +fma.rn.f64 fd568, fd179, 0dBFEEFA7CDDB128FA, fd564; +fma.rn.f64 fd569, fd181, 0d3FEA43B1B1379AFF, fd565; +fma.rn.f64 fd570, fd184, 0dBFE247D447A27216, fd566; +fma.rn.f64 fd571, fd1239, 0d3FEA43B1B1379AFF, fd567; +fma.rn.f64 fd572, fd183, 0dBFE247D447A27216, fd568; +fma.rn.f64 fd573, fd125, 0d3FC361FC440B478F, %62; +fma.rn.f64 fd577, fd129, 0dBFEE884F0CC22CCC, fd573; +fma.rn.f64 fd1219, fd128, 0d3FEFA18852C3E08A, 0d0000000000000000; +fma.rn.f64 fd578, fd132, 0d3FD328C3F1B322CB, fd1219; +fma.rn.f64 fd1218, fd1272, 0d3FC361FC440B478F, %63; +fma.rn.f64 fd579, fd1269, 0dBFEE884F0CC22CCC, fd1218; +fma.rn.f64 fd1217, fd127, 0d3FEFA18852C3E08A, 0d0000000000000000; +fma.rn.f64 fd580, fd131, 0d3FD328C3F1B322CB, fd1217; +fma.rn.f64 fd581, fd133, 0dBFDC2F6AF3928A8E, fd577; +fma.rn.f64 fd582, fd136, 0dBFECBAD095F50378, fd578; +fma.rn.f64 fd583, fd1267, 0dBFDC2F6AF3928A8E, fd579; +fma.rn.f64 fd584, fd135, 0dBFECBAD095F50378, fd580; +fma.rn.f64 fd585, fd137, 0d3FEA43B1B1379AFF, fd581; +fma.rn.f64 fd586, fd140, 0dBFE247D447A27216, fd582; +fma.rn.f64 fd587, fd1265, 0d3FEA43B1B1379AFF, fd583; +fma.rn.f64 fd588, fd139, 0dBFE247D447A27216, fd584; +fma.rn.f64 fd589, fd141, 0d3FE60C045A2E9729, fd585; +fma.rn.f64 fd590, fd144, 0d3FE73180A4B0D300, fd586; +fma.rn.f64 fd591, fd1262, 0d3FE60C045A2E9729, fd587; +fma.rn.f64 fd592, fd143, 0d3FE73180A4B0D300, fd588; +fma.rn.f64 fd593, fd145, 0dBFE3965F49174D13, fd589; +fma.rn.f64 fd594, fd148, 0d3FE94E08EB13C451, fd590; +fma.rn.f64 fd595, fd1260, 0dBFE3965F49174D13, fd591; +fma.rn.f64 fd596, fd147, 0d3FE94E08EB13C451, fd592; +fma.rn.f64 fd597, fd149, 0dBFEBFAA5C136B224, fd593; +fma.rn.f64 fd598, fd152, 0dBFDF0F2FF6705BEC, fd594; +fma.rn.f64 fd599, fd1258, 0dBFEBFAA5C136B224, fd595; +fma.rn.f64 fd600, fd151, 
0dBFDF0F2FF6705BEC, fd596; +fma.rn.f64 fd601, fd153, 0d3FD63A3FCFACA412, fd597; +fma.rn.f64 fd602, fd156, 0dBFEE0210C26A6E6F, fd598; +fma.rn.f64 fd603, fd1255, 0d3FD63A3FCFACA412, fd599; +fma.rn.f64 fd604, fd155, 0dBFEE0210C26A6E6F, fd600; +fma.rn.f64 fd605, fd157, 0d3FEF584F2CE43B84, fd601; +fma.rn.f64 fd606, fd160, 0d3FC9C4266041CA8F, fd602; +fma.rn.f64 fd607, fd1253, 0d3FEF584F2CE43B84, fd603; +fma.rn.f64 fd608, fd159, 0d3FC9C4266041CA8F, fd604; +fma.rn.f64 fd609, fd161, 0dBFA9EEB01776B57D, fd605; +fma.rn.f64 fd610, fd164, 0d3FEFF57C5208CCF9, fd606; +fma.rn.f64 fd611, fd1251, 0dBFA9EEB01776B57D, fd607; +fma.rn.f64 fd612, fd163, 0d3FEFF57C5208CCF9, fd608; +fma.rn.f64 fd613, fd165, 0dBFEFD5F830F860F9, fd609; +fma.rn.f64 fd614, fd168, 0d3FB9E62ACA53C49F, fd610; +fma.rn.f64 fd615, fd1248, 0dBFEFD5F830F860F9, fd611; +fma.rn.f64 fd616, fd167, 0d3FB9E62ACA53C49F, fd612; +fma.rn.f64 fd617, fd169, 0dBFD00AB0EB2D7D94, fd613; +fma.rn.f64 fd618, fd172, 0dBFEEFA7CDDB128FA, fd614; +fma.rn.f64 fd619, fd1246, 0dBFD00AB0EB2D7D94, fd615; +fma.rn.f64 fd620, fd171, 0dBFEEFA7CDDB128FA, fd616; +fma.rn.f64 fd621, fd173, 0d3FED681A366A00FA, fd617; +fma.rn.f64 fd622, fd176, 0dBFD93D20572CA90B, fd618; +fma.rn.f64 fd623, fd1244, 0d3FED681A366A00FA, fd619; +fma.rn.f64 fd624, fd175, 0dBFD93D20572CA90B, fd620; +fma.rn.f64 fd625, fd177, 0d3FE0ED45EEA3B09F, fd621; +fma.rn.f64 fd626, fd180, 0d3FEB2818007C19DF, fd622; +fma.rn.f64 fd627, fd1241, 0d3FE0ED45EEA3B09F, fd623; +fma.rn.f64 fd628, fd179, 0d3FEB2818007C19DF, fd624; +fma.rn.f64 fd629, fd181, 0dBFE847BF1D5146CC, fd625; +fma.rn.f64 fd630, fd184, 0d3FE4D80B1AD9CCF6, fd626; +fma.rn.f64 fd631, fd1239, 0dBFE847BF1D5146CC, fd627; +fma.rn.f64 fd632, fd183, 0d3FE4D80B1AD9CCF6, fd628; +fma.rn.f64 fd633, fd125, 0dBFA9EEB01776B57D, %62; +fma.rn.f64 fd637, fd129, 0dBFEFD5F830F860F9, fd633; +fma.rn.f64 fd1216, fd128, 0d3FEFF57C5208CCF9, 0d0000000000000000; +fma.rn.f64 fd638, fd132, 0dBFB9E62ACA53C49F, fd1216; +fma.rn.f64 fd1215, fd1272, 
0dBFA9EEB01776B57D, %63; +fma.rn.f64 fd639, fd1269, 0dBFEFD5F830F860F9, fd1215; +fma.rn.f64 fd1214, fd127, 0d3FEFF57C5208CCF9, 0d0000000000000000; +fma.rn.f64 fd640, fd131, 0dBFB9E62ACA53C49F, fd1214; +fma.rn.f64 fd641, fd133, 0d3FC361FC440B478F, fd637; +fma.rn.f64 fd642, fd136, 0dBFEFA18852C3E08A, fd638; +fma.rn.f64 fd643, fd1267, 0d3FC361FC440B478F, fd639; +fma.rn.f64 fd644, fd135, 0dBFEFA18852C3E08A, fd640; +fma.rn.f64 fd645, fd137, 0d3FEF584F2CE43B84, fd641; +fma.rn.f64 fd646, fd140, 0d3FC9C4266041CA8F, fd642; +fma.rn.f64 fd647, fd1265, 0d3FEF584F2CE43B84, fd643; +fma.rn.f64 fd648, fd139, 0d3FC9C4266041CA8F, fd644; +fma.rn.f64 fd649, fd141, 0dBFD00AB0EB2D7D94, fd645; +fma.rn.f64 fd650, fd144, 0d3FEEFA7CDDB128FA, fd646; +fma.rn.f64 fd651, fd1262, 0dBFD00AB0EB2D7D94, fd647; +fma.rn.f64 fd652, fd143, 0d3FEEFA7CDDB128FA, fd648; +fma.rn.f64 fd653, fd145, 0dBFEE884F0CC22CCC, fd649; +fma.rn.f64 fd654, fd148, 0dBFD328C3F1B322CB, fd650; +fma.rn.f64 fd655, fd1260, 0dBFEE884F0CC22CCC, fd651; +fma.rn.f64 fd656, fd147, 0dBFD328C3F1B322CB, fd652; +fma.rn.f64 fd657, fd149, 0d3FD63A3FCFACA412, fd653; +fma.rn.f64 fd658, fd152, 0dBFEE0210C26A6E6F, fd654; +fma.rn.f64 fd659, fd1258, 0d3FD63A3FCFACA412, fd655; +fma.rn.f64 fd660, fd151, 0dBFEE0210C26A6E6F, fd656; +fma.rn.f64 fd661, fd153, 0d3FED681A366A00FA, fd657; +fma.rn.f64 fd662, fd156, 0d3FD93D20572CA90B, fd658; +fma.rn.f64 fd663, fd1255, 0d3FED681A366A00FA, fd659; +fma.rn.f64 fd664, fd155, 0d3FD93D20572CA90B, fd660; +fma.rn.f64 fd665, fd157, 0dBFDC2F6AF3928A8E, fd661; +fma.rn.f64 fd666, fd160, 0d3FECBAD095F50378, fd662; +fma.rn.f64 fd667, fd1253, 0dBFDC2F6AF3928A8E, fd663; +fma.rn.f64 fd668, fd159, 0d3FECBAD095F50378, fd664; +fma.rn.f64 fd669, fd161, 0dBFEBFAA5C136B224, fd665; +fma.rn.f64 fd670, fd164, 0dBFDF0F2FF6705BEC, fd666; +fma.rn.f64 fd671, fd1251, 0dBFEBFAA5C136B224, fd667; +fma.rn.f64 fd672, fd163, 0dBFDF0F2FF6705BEC, fd668; +fma.rn.f64 fd673, fd165, 0d3FE0ED45EEA3B09F, fd669; +fma.rn.f64 fd674, fd168, 
0dBFEB2818007C19DF, fd670; +fma.rn.f64 fd675, fd1248, 0d3FE0ED45EEA3B09F, fd671; +fma.rn.f64 fd676, fd167, 0dBFEB2818007C19DF, fd672; +fma.rn.f64 fd677, fd169, 0d3FEA43B1B1379AFF, fd673; +fma.rn.f64 fd678, fd172, 0d3FE247D447A27216, fd674; +fma.rn.f64 fd679, fd1246, 0d3FEA43B1B1379AFF, fd675; +fma.rn.f64 fd680, fd171, 0d3FE247D447A27216, fd676; +fma.rn.f64 fd681, fd173, 0dBFE3965F49174D13, fd677; +fma.rn.f64 fd682, fd176, 0d3FE94E08EB13C451, fd678; +fma.rn.f64 fd683, fd1244, 0dBFE3965F49174D13, fd679; +fma.rn.f64 fd684, fd175, 0d3FE94E08EB13C451, fd680; +fma.rn.f64 fd685, fd177, 0dBFE847BF1D5146CC, fd681; +fma.rn.f64 fd686, fd180, 0dBFE4D80B1AD9CCF6, fd682; +fma.rn.f64 fd687, fd1241, 0dBFE847BF1D5146CC, fd683; +fma.rn.f64 fd688, fd179, 0dBFE4D80B1AD9CCF6, fd684; +fma.rn.f64 fd689, fd181, 0d3FE60C045A2E9729, fd685; +fma.rn.f64 fd690, fd184, 0dBFE73180A4B0D300, fd686; +fma.rn.f64 fd691, fd1239, 0d3FE60C045A2E9729, fd687; +fma.rn.f64 fd692, fd183, 0dBFE73180A4B0D300, fd688; +fma.rn.f64 fd693, fd125, 0dBFD00AB0EB2D7D94, %62; +fma.rn.f64 fd697, fd129, 0dBFEBFAA5C136B224, fd693; +fma.rn.f64 fd1213, fd128, 0d3FEEFA7CDDB128FA, 0d0000000000000000; +fma.rn.f64 fd698, fd132, 0dBFDF0F2FF6705BEC, fd1213; +fma.rn.f64 fd1212, fd1272, 0dBFD00AB0EB2D7D94, %63; +fma.rn.f64 fd699, fd1269, 0dBFEBFAA5C136B224, fd1212; +fma.rn.f64 fd1211, fd127, 0d3FEEFA7CDDB128FA, 0d0000000000000000; +fma.rn.f64 fd700, fd131, 0dBFDF0F2FF6705BEC, fd1211; +fma.rn.f64 fd701, fd133, 0d3FE60C045A2E9729, fd697; +fma.rn.f64 fd702, fd136, 0dBFE73180A4B0D300, fd698; +fma.rn.f64 fd703, fd1267, 0d3FE60C045A2E9729, fd699; +fma.rn.f64 fd704, fd135, 0dBFE73180A4B0D300, fd700; +fma.rn.f64 fd705, fd137, 0d3FE0ED45EEA3B09F, fd701; +fma.rn.f64 fd706, fd140, 0d3FEB2818007C19DF, fd702; +fma.rn.f64 fd707, fd1265, 0d3FE0ED45EEA3B09F, fd703; +fma.rn.f64 fd708, fd139, 0d3FEB2818007C19DF, fd704; +fma.rn.f64 fd709, fd141, 0dBFEE884F0CC22CCC, fd705; +fma.rn.f64 fd710, fd144, 0d3FD328C3F1B322CB, fd706; +fma.rn.f64 fd711, fd1262, 
0dBFEE884F0CC22CCC, fd707; +fma.rn.f64 fd712, fd143, 0d3FD328C3F1B322CB, fd708; +fma.rn.f64 fd713, fd145, 0dBFA9EEB01776B57D, fd709; +fma.rn.f64 fd714, fd148, 0dBFEFF57C5208CCF9, fd710; +fma.rn.f64 fd715, fd1260, 0dBFA9EEB01776B57D, fd711; +fma.rn.f64 fd716, fd147, 0dBFEFF57C5208CCF9, fd712; +fma.rn.f64 fd717, fd149, 0d3FEF584F2CE43B84, fd713; +fma.rn.f64 fd718, fd152, 0d3FC9C4266041CA8F, fd714; +fma.rn.f64 fd719, fd1258, 0d3FEF584F2CE43B84, fd715; +fma.rn.f64 fd720, fd151, 0d3FC9C4266041CA8F, fd716; +fma.rn.f64 fd721, fd153, 0dBFDC2F6AF3928A8E, fd717; +fma.rn.f64 fd722, fd156, 0d3FECBAD095F50378, fd718; +fma.rn.f64 fd723, fd1255, 0dBFDC2F6AF3928A8E, fd719; +fma.rn.f64 fd724, fd155, 0d3FECBAD095F50378, fd720; +fma.rn.f64 fd725, fd157, 0dBFE847BF1D5146CC, fd721; +fma.rn.f64 fd726, fd160, 0dBFE4D80B1AD9CCF6, fd722; +fma.rn.f64 fd727, fd1253, 0dBFE847BF1D5146CC, fd723; +fma.rn.f64 fd728, fd159, 0dBFE4D80B1AD9CCF6, fd724; +fma.rn.f64 fd729, fd161, 0d3FEA43B1B1379AFF, fd725; +fma.rn.f64 fd730, fd164, 0dBFE247D447A27216, fd726; +fma.rn.f64 fd731, fd1251, 0d3FEA43B1B1379AFF, fd727; +fma.rn.f64 fd732, fd163, 0dBFE247D447A27216, fd728; +fma.rn.f64 fd733, fd165, 0d3FD63A3FCFACA412, fd729; +fma.rn.f64 fd734, fd168, 0d3FEE0210C26A6E6F, fd730; +fma.rn.f64 fd735, fd1248, 0d3FD63A3FCFACA412, fd731; +fma.rn.f64 fd736, fd167, 0d3FEE0210C26A6E6F, fd732; +fma.rn.f64 fd737, fd169, 0dBFEFD5F830F860F9, fd733; +fma.rn.f64 fd738, fd172, 0d3FB9E62ACA53C49F, fd734; +fma.rn.f64 fd739, fd1246, 0dBFEFD5F830F860F9, fd735; +fma.rn.f64 fd740, fd171, 0d3FB9E62ACA53C49F, fd736; +fma.rn.f64 fd741, fd173, 0d3FC361FC440B478F, fd737; +fma.rn.f64 fd742, fd176, 0dBFEFA18852C3E08A, fd738; +fma.rn.f64 fd743, fd1244, 0d3FC361FC440B478F, fd739; +fma.rn.f64 fd744, fd175, 0dBFEFA18852C3E08A, fd740; +fma.rn.f64 fd745, fd177, 0d3FED681A366A00FA, fd741; +fma.rn.f64 fd746, fd180, 0d3FD93D20572CA90B, fd742; +fma.rn.f64 fd747, fd1241, 0d3FED681A366A00FA, fd743; +fma.rn.f64 fd748, fd179, 0d3FD93D20572CA90B, fd744; 
+fma.rn.f64 fd749, fd181, 0dBFE3965F49174D13, fd745; +fma.rn.f64 fd750, fd184, 0d3FE94E08EB13C451, fd746; +fma.rn.f64 fd751, fd1239, 0dBFE3965F49174D13, fd747; +fma.rn.f64 fd752, fd183, 0d3FE94E08EB13C451, fd748; +fma.rn.f64 fd753, fd125, 0dBFDC2F6AF3928A8E, %62; +fma.rn.f64 fd757, fd129, 0dBFE3965F49174D13, fd753; +fma.rn.f64 fd1210, fd128, 0d3FECBAD095F50378, 0d0000000000000000; +fma.rn.f64 fd758, fd132, 0dBFE94E08EB13C451, fd1210; +fma.rn.f64 fd1209, fd1272, 0dBFDC2F6AF3928A8E, %63; +fma.rn.f64 fd759, fd1269, 0dBFE3965F49174D13, fd1209; +fma.rn.f64 fd1208, fd127, 0d3FECBAD095F50378, 0d0000000000000000; +fma.rn.f64 fd760, fd131, 0dBFE94E08EB13C451, fd1208; +fma.rn.f64 fd761, fd133, 0d3FEF584F2CE43B84, fd757; +fma.rn.f64 fd762, fd136, 0dBFC9C4266041CA8F, fd758; +fma.rn.f64 fd763, fd1267, 0d3FEF584F2CE43B84, fd759; +fma.rn.f64 fd764, fd135, 0dBFC9C4266041CA8F, fd760; +fma.rn.f64 fd765, fd137, 0dBFD00AB0EB2D7D94, fd761; +fma.rn.f64 fd766, fd140, 0d3FEEFA7CDDB128FA, fd762; +fma.rn.f64 fd767, fd1265, 0dBFD00AB0EB2D7D94, fd763; +fma.rn.f64 fd768, fd139, 0d3FEEFA7CDDB128FA, fd764; +fma.rn.f64 fd769, fd141, 0dBFE847BF1D5146CC, fd765; +fma.rn.f64 fd770, fd144, 0dBFE4D80B1AD9CCF6, fd766; +fma.rn.f64 fd771, fd1262, 0dBFE847BF1D5146CC, fd767; +fma.rn.f64 fd772, fd143, 0dBFE4D80B1AD9CCF6, fd768; +fma.rn.f64 fd773, fd145, 0d3FED681A366A00FA, fd769; +fma.rn.f64 fd774, fd148, 0dBFD93D20572CA90B, fd770; +fma.rn.f64 fd775, fd1260, 0d3FED681A366A00FA, fd771; +fma.rn.f64 fd776, fd147, 0dBFD93D20572CA90B, fd772; +fma.rn.f64 fd777, fd149, 0dBFA9EEB01776B57D, fd773; +fma.rn.f64 fd778, fd152, 0d3FEFF57C5208CCF9, fd774; +fma.rn.f64 fd779, fd1258, 0dBFA9EEB01776B57D, fd775; +fma.rn.f64 fd780, fd151, 0d3FEFF57C5208CCF9, fd776; +fma.rn.f64 fd781, fd153, 0dBFEBFAA5C136B224, fd777; +fma.rn.f64 fd782, fd156, 0dBFDF0F2FF6705BEC, fd778; +fma.rn.f64 fd783, fd1255, 0dBFEBFAA5C136B224, fd779; +fma.rn.f64 fd784, fd155, 0dBFDF0F2FF6705BEC, fd780; +fma.rn.f64 fd785, fd157, 0d3FEA43B1B1379AFF, fd781; 
+fma.rn.f64 fd786, fd160, 0dBFE247D447A27216, fd782; +fma.rn.f64 fd787, fd1253, 0d3FEA43B1B1379AFF, fd783; +fma.rn.f64 fd788, fd159, 0dBFE247D447A27216, fd784; +fma.rn.f64 fd789, fd161, 0d3FC361FC440B478F, fd785; +fma.rn.f64 fd790, fd164, 0d3FEFA18852C3E08A, fd786; +fma.rn.f64 fd791, fd1251, 0d3FC361FC440B478F, fd787; +fma.rn.f64 fd792, fd163, 0d3FEFA18852C3E08A, fd788; +fma.rn.f64 fd793, fd165, 0dBFEE884F0CC22CCC, fd789; +fma.rn.f64 fd794, fd168, 0dBFD328C3F1B322CB, fd790; +fma.rn.f64 fd795, fd1248, 0dBFEE884F0CC22CCC, fd791; +fma.rn.f64 fd796, fd167, 0dBFD328C3F1B322CB, fd792; +fma.rn.f64 fd797, fd169, 0d3FE60C045A2E9729, fd793; +fma.rn.f64 fd798, fd172, 0dBFE73180A4B0D300, fd794; +fma.rn.f64 fd799, fd1246, 0d3FE60C045A2E9729, fd795; +fma.rn.f64 fd800, fd171, 0dBFE73180A4B0D300, fd796; +fma.rn.f64 fd801, fd173, 0d3FD63A3FCFACA412, fd797; +fma.rn.f64 fd802, fd176, 0d3FEE0210C26A6E6F, fd798; +fma.rn.f64 fd803, fd1244, 0d3FD63A3FCFACA412, fd799; +fma.rn.f64 fd804, fd175, 0d3FEE0210C26A6E6F, fd800; +fma.rn.f64 fd805, fd177, 0dBFEFD5F830F860F9, fd801; +fma.rn.f64 fd806, fd180, 0dBFB9E62ACA53C49F, fd802; +fma.rn.f64 fd807, fd1241, 0dBFEFD5F830F860F9, fd803; +fma.rn.f64 fd808, fd179, 0dBFB9E62ACA53C49F, fd804; +fma.rn.f64 fd809, fd181, 0d3FE0ED45EEA3B09F, fd805; +fma.rn.f64 fd810, fd184, 0dBFEB2818007C19DF, fd806; +fma.rn.f64 fd811, fd1239, 0d3FE0ED45EEA3B09F, fd807; +fma.rn.f64 fd812, fd183, 0dBFEB2818007C19DF, fd808; +fma.rn.f64 fd813, fd125, 0dBFE3965F49174D13, %62; +fma.rn.f64 fd817, fd129, 0dBFD00AB0EB2D7D94, fd813; +fma.rn.f64 fd1207, fd128, 0d3FE94E08EB13C451, 0d0000000000000000; +fma.rn.f64 fd818, fd132, 0dBFEEFA7CDDB128FA, fd1207; +fma.rn.f64 fd1206, fd1272, 0dBFE3965F49174D13, %63; +fma.rn.f64 fd819, fd1269, 0dBFD00AB0EB2D7D94, fd1206; +fma.rn.f64 fd1205, fd127, 0d3FE94E08EB13C451, 0d0000000000000000; +fma.rn.f64 fd820, fd131, 0dBFEEFA7CDDB128FA, fd1205; +fma.rn.f64 fd821, fd133, 0d3FED681A366A00FA, fd817; +fma.rn.f64 fd822, fd136, 0d3FD93D20572CA90B, fd818; 
+fma.rn.f64 fd823, fd1267, 0d3FED681A366A00FA, fd819; +fma.rn.f64 fd824, fd135, 0d3FD93D20572CA90B, fd820; +fma.rn.f64 fd825, fd137, 0dBFEBFAA5C136B224, fd821; +fma.rn.f64 fd826, fd140, 0d3FDF0F2FF6705BEC, fd822; +fma.rn.f64 fd827, fd1265, 0dBFEBFAA5C136B224, fd823; +fma.rn.f64 fd828, fd139, 0d3FDF0F2FF6705BEC, fd824; +fma.rn.f64 fd829, fd141, 0d3FC361FC440B478F, fd825; +fma.rn.f64 fd830, fd144, 0dBFEFA18852C3E08A, fd826; +fma.rn.f64 fd831, fd1262, 0d3FC361FC440B478F, fd827; +fma.rn.f64 fd832, fd143, 0dBFEFA18852C3E08A, fd828; +fma.rn.f64 fd833, fd145, 0d3FE60C045A2E9729, fd829; +fma.rn.f64 fd834, fd148, 0d3FE73180A4B0D300, fd830; +fma.rn.f64 fd835, fd1260, 0d3FE60C045A2E9729, fd831; +fma.rn.f64 fd836, fd147, 0d3FE73180A4B0D300, fd832; +fma.rn.f64 fd837, fd149, 0dBFEFD5F830F860F9, fd833; +fma.rn.f64 fd838, fd152, 0d3FB9E62ACA53C49F, fd834; +fma.rn.f64 fd839, fd1258, 0dBFEFD5F830F860F9, fd835; +fma.rn.f64 fd840, fd151, 0d3FB9E62ACA53C49F, fd836; +fma.rn.f64 fd841, fd153, 0d3FE0ED45EEA3B09F, fd837; +fma.rn.f64 fd842, fd156, 0dBFEB2818007C19DF, fd838; +fma.rn.f64 fd843, fd1255, 0d3FE0ED45EEA3B09F, fd839; +fma.rn.f64 fd844, fd155, 0dBFEB2818007C19DF, fd840; +fma.rn.f64 fd845, fd157, 0d3FD63A3FCFACA412, fd841; +fma.rn.f64 fd846, fd160, 0d3FEE0210C26A6E6F, fd842; +fma.rn.f64 fd847, fd1253, 0d3FD63A3FCFACA412, fd843; +fma.rn.f64 fd848, fd159, 0d3FEE0210C26A6E6F, fd844; +fma.rn.f64 fd849, fd161, 0dBFEE884F0CC22CCC, fd845; +fma.rn.f64 fd850, fd164, 0dBFD328C3F1B322CB, fd846; +fma.rn.f64 fd851, fd1251, 0dBFEE884F0CC22CCC, fd847; +fma.rn.f64 fd852, fd163, 0dBFD328C3F1B322CB, fd848; +fma.rn.f64 fd853, fd165, 0d3FEA43B1B1379AFF, fd849; +fma.rn.f64 fd854, fd168, 0dBFE247D447A27216, fd850; +fma.rn.f64 fd855, fd1248, 0d3FEA43B1B1379AFF, fd851; +fma.rn.f64 fd856, fd167, 0dBFE247D447A27216, fd852; +fma.rn.f64 fd857, fd169, 0dBFA9EEB01776B57D, fd853; +fma.rn.f64 fd858, fd172, 0d3FEFF57C5208CCF9, fd854; +fma.rn.f64 fd859, fd1246, 0dBFA9EEB01776B57D, fd855; +fma.rn.f64 fd860, fd171, 
0d3FEFF57C5208CCF9, fd856; +fma.rn.f64 fd861, fd173, 0dBFE847BF1D5146CC, fd857; +fma.rn.f64 fd862, fd176, 0dBFE4D80B1AD9CCF6, fd858; +fma.rn.f64 fd863, fd1244, 0dBFE847BF1D5146CC, fd859; +fma.rn.f64 fd864, fd175, 0dBFE4D80B1AD9CCF6, fd860; +fma.rn.f64 fd865, fd177, 0d3FEF584F2CE43B84, fd861; +fma.rn.f64 fd866, fd180, 0dBFC9C4266041CA8F, fd862; +fma.rn.f64 fd867, fd1241, 0d3FEF584F2CE43B84, fd863; +fma.rn.f64 fd868, fd179, 0dBFC9C4266041CA8F, fd864; +fma.rn.f64 fd869, fd181, 0dBFDC2F6AF3928A8E, fd865; +fma.rn.f64 fd870, fd184, 0d3FECBAD095F50378, fd866; +fma.rn.f64 fd871, fd1239, 0dBFDC2F6AF3928A8E, fd867; +fma.rn.f64 fd872, fd183, 0d3FECBAD095F50378, fd868; +fma.rn.f64 fd873, fd125, 0dBFE847BF1D5146CC, %62; +fma.rn.f64 fd877, fd129, 0d3FC361FC440B478F, fd873; +fma.rn.f64 fd1204, fd128, 0d3FE4D80B1AD9CCF6, 0d0000000000000000; +fma.rn.f64 fd878, fd132, 0dBFEFA18852C3E08A, fd1204; +fma.rn.f64 fd1203, fd1272, 0dBFE847BF1D5146CC, %63; +fma.rn.f64 fd879, fd1269, 0d3FC361FC440B478F, fd1203; +fma.rn.f64 fd1202, fd127, 0d3FE4D80B1AD9CCF6, 0d0000000000000000; +fma.rn.f64 fd880, fd131, 0dBFEFA18852C3E08A, fd1202; +fma.rn.f64 fd881, fd133, 0d3FE0ED45EEA3B09F, fd877; +fma.rn.f64 fd882, fd136, 0d3FEB2818007C19DF, fd878; +fma.rn.f64 fd883, fd1267, 0d3FE0ED45EEA3B09F, fd879; +fma.rn.f64 fd884, fd135, 0d3FEB2818007C19DF, fd880; +fma.rn.f64 fd885, fd137, 0dBFEE884F0CC22CCC, fd881; +fma.rn.f64 fd886, fd140, 0dBFD328C3F1B322CB, fd882; +fma.rn.f64 fd887, fd1265, 0dBFEE884F0CC22CCC, fd883; +fma.rn.f64 fd888, fd139, 0dBFD328C3F1B322CB, fd884; +fma.rn.f64 fd889, fd141, 0d3FED681A366A00FA, fd885; +fma.rn.f64 fd890, fd144, 0dBFD93D20572CA90B, fd886; +fma.rn.f64 fd891, fd1262, 0d3FED681A366A00FA, fd887; +fma.rn.f64 fd892, fd143, 0dBFD93D20572CA90B, fd888; +fma.rn.f64 fd893, fd145, 0dBFDC2F6AF3928A8E, fd889; +fma.rn.f64 fd894, fd148, 0d3FECBAD095F50378, fd890; +fma.rn.f64 fd895, fd1260, 0dBFDC2F6AF3928A8E, fd891; +fma.rn.f64 fd896, fd147, 0d3FECBAD095F50378, fd892; +fma.rn.f64 fd897, fd149, 
0dBFD00AB0EB2D7D94, fd893; +fma.rn.f64 fd898, fd152, 0dBFEEFA7CDDB128FA, fd894; +fma.rn.f64 fd899, fd1258, 0dBFD00AB0EB2D7D94, fd895; +fma.rn.f64 fd900, fd151, 0dBFEEFA7CDDB128FA, fd896; +fma.rn.f64 fd901, fd153, 0d3FEA43B1B1379AFF, fd897; +fma.rn.f64 fd902, fd156, 0d3FE247D447A27216, fd898; +fma.rn.f64 fd903, fd1255, 0d3FEA43B1B1379AFF, fd899; +fma.rn.f64 fd904, fd155, 0d3FE247D447A27216, fd900; +fma.rn.f64 fd905, fd157, 0dBFEFD5F830F860F9, fd901; +fma.rn.f64 fd906, fd160, 0d3FB9E62ACA53C49F, fd902; +fma.rn.f64 fd907, fd1253, 0dBFEFD5F830F860F9, fd903; +fma.rn.f64 fd908, fd159, 0d3FB9E62ACA53C49F, fd904; +fma.rn.f64 fd909, fd161, 0d3FE60C045A2E9729, fd905; +fma.rn.f64 fd910, fd164, 0dBFE73180A4B0D300, fd906; +fma.rn.f64 fd911, fd1251, 0d3FE60C045A2E9729, fd907; +fma.rn.f64 fd912, fd163, 0dBFE73180A4B0D300, fd908; +fma.rn.f64 fd913, fd165, 0dBFA9EEB01776B57D, fd909; +fma.rn.f64 fd914, fd168, 0d3FEFF57C5208CCF9, fd910; +fma.rn.f64 fd915, fd1248, 0dBFA9EEB01776B57D, fd911; +fma.rn.f64 fd916, fd167, 0d3FEFF57C5208CCF9, fd912; +fma.rn.f64 fd917, fd169, 0dBFE3965F49174D13, fd913; +fma.rn.f64 fd918, fd172, 0dBFE94E08EB13C451, fd914; +fma.rn.f64 fd919, fd1246, 0dBFE3965F49174D13, fd915; +fma.rn.f64 fd920, fd171, 0dBFE94E08EB13C451, fd916; +fma.rn.f64 fd921, fd173, 0d3FEF584F2CE43B84, fd917; +fma.rn.f64 fd922, fd176, 0d3FC9C4266041CA8F, fd918; +fma.rn.f64 fd923, fd1244, 0d3FEF584F2CE43B84, fd919; +fma.rn.f64 fd924, fd175, 0d3FC9C4266041CA8F, fd920; +fma.rn.f64 fd925, fd177, 0dBFEBFAA5C136B224, fd921; +fma.rn.f64 fd926, fd180, 0d3FDF0F2FF6705BEC, fd922; +fma.rn.f64 fd927, fd1241, 0dBFEBFAA5C136B224, fd923; +fma.rn.f64 fd928, fd179, 0d3FDF0F2FF6705BEC, fd924; +fma.rn.f64 fd929, fd181, 0d3FD63A3FCFACA412, fd925; +fma.rn.f64 fd930, fd184, 0dBFEE0210C26A6E6F, fd926; +fma.rn.f64 fd931, fd1239, 0d3FD63A3FCFACA412, fd927; +fma.rn.f64 fd932, fd183, 0dBFEE0210C26A6E6F, fd928; +fma.rn.f64 fd933, fd125, 0dBFEBFAA5C136B224, %62; +fma.rn.f64 fd937, fd129, 0d3FE0ED45EEA3B09F, fd933; 
+fma.rn.f64 fd1201, fd128, 0d3FDF0F2FF6705BEC, 0d0000000000000000; +fma.rn.f64 fd938, fd132, 0dBFEB2818007C19DF, fd1201; +fma.rn.f64 fd1200, fd1272, 0dBFEBFAA5C136B224, %63; +fma.rn.f64 fd939, fd1269, 0d3FE0ED45EEA3B09F, fd1200; +fma.rn.f64 fd1199, fd127, 0d3FDF0F2FF6705BEC, 0d0000000000000000; +fma.rn.f64 fd940, fd131, 0dBFEB2818007C19DF, fd1199; +fma.rn.f64 fd941, fd133, 0dBFA9EEB01776B57D, fd937; +fma.rn.f64 fd942, fd136, 0d3FEFF57C5208CCF9, fd938; +fma.rn.f64 fd943, fd1267, 0dBFA9EEB01776B57D, fd939; +fma.rn.f64 fd944, fd135, 0d3FEFF57C5208CCF9, fd940; +fma.rn.f64 fd945, fd137, 0dBFDC2F6AF3928A8E, fd941; +fma.rn.f64 fd946, fd140, 0dBFECBAD095F50378, fd942; +fma.rn.f64 fd947, fd1265, 0dBFDC2F6AF3928A8E, fd943; +fma.rn.f64 fd948, fd139, 0dBFECBAD095F50378, fd944; +fma.rn.f64 fd949, fd141, 0d3FEA43B1B1379AFF, fd945; +fma.rn.f64 fd950, fd144, 0d3FE247D447A27216, fd946; +fma.rn.f64 fd951, fd1262, 0d3FEA43B1B1379AFF, fd947; +fma.rn.f64 fd952, fd143, 0d3FE247D447A27216, fd948; +fma.rn.f64 fd953, fd145, 0dBFEFD5F830F860F9, fd949; +fma.rn.f64 fd954, fd148, 0dBFB9E62ACA53C49F, fd950; +fma.rn.f64 fd955, fd1260, 0dBFEFD5F830F860F9, fd951; +fma.rn.f64 fd956, fd147, 0dBFB9E62ACA53C49F, fd952; +fma.rn.f64 fd957, fd149, 0d3FED681A366A00FA, fd953; +fma.rn.f64 fd958, fd152, 0dBFD93D20572CA90B, fd954; +fma.rn.f64 fd959, fd1258, 0d3FED681A366A00FA, fd955; +fma.rn.f64 fd960, fd151, 0dBFD93D20572CA90B, fd956; +fma.rn.f64 fd961, fd153, 0dBFE3965F49174D13, fd957; +fma.rn.f64 fd962, fd156, 0d3FE94E08EB13C451, fd958; +fma.rn.f64 fd963, fd1255, 0dBFE3965F49174D13, fd959; +fma.rn.f64 fd964, fd155, 0d3FE94E08EB13C451, fd960; +fma.rn.f64 fd965, fd157, 0d3FC361FC440B478F, fd961; +fma.rn.f64 fd966, fd160, 0dBFEFA18852C3E08A, fd962; +fma.rn.f64 fd967, fd1253, 0d3FC361FC440B478F, fd963; +fma.rn.f64 fd968, fd159, 0dBFEFA18852C3E08A, fd964; +fma.rn.f64 fd969, fd161, 0d3FD63A3FCFACA412, fd965; +fma.rn.f64 fd970, fd164, 0d3FEE0210C26A6E6F, fd966; +fma.rn.f64 fd971, fd1251, 0d3FD63A3FCFACA412, 
fd967; +fma.rn.f64 fd972, fd163, 0d3FEE0210C26A6E6F, fd968; +fma.rn.f64 fd973, fd165, 0dBFE847BF1D5146CC, fd969; +fma.rn.f64 fd974, fd168, 0dBFE4D80B1AD9CCF6, fd970; +fma.rn.f64 fd975, fd1248, 0dBFE847BF1D5146CC, fd971; +fma.rn.f64 fd976, fd167, 0dBFE4D80B1AD9CCF6, fd972; +fma.rn.f64 fd977, fd169, 0d3FEF584F2CE43B84, fd973; +fma.rn.f64 fd978, fd172, 0d3FC9C4266041CA8F, fd974; +fma.rn.f64 fd979, fd1246, 0d3FEF584F2CE43B84, fd975; +fma.rn.f64 fd980, fd171, 0d3FC9C4266041CA8F, fd976; +fma.rn.f64 fd981, fd173, 0dBFEE884F0CC22CCC, fd977; +fma.rn.f64 fd982, fd176, 0d3FD328C3F1B322CB, fd978; +fma.rn.f64 fd983, fd1244, 0dBFEE884F0CC22CCC, fd979; +fma.rn.f64 fd984, fd175, 0d3FD328C3F1B322CB, fd980; +fma.rn.f64 fd985, fd177, 0d3FE60C045A2E9729, fd981; +fma.rn.f64 fd986, fd180, 0dBFE73180A4B0D300, fd982; +fma.rn.f64 fd987, fd1241, 0d3FE60C045A2E9729, fd983; +fma.rn.f64 fd988, fd179, 0dBFE73180A4B0D300, fd984; +fma.rn.f64 fd989, fd181, 0dBFD00AB0EB2D7D94, fd985; +fma.rn.f64 fd990, fd184, 0d3FEEFA7CDDB128FA, fd986; +fma.rn.f64 fd991, fd1239, 0dBFD00AB0EB2D7D94, fd987; +fma.rn.f64 fd992, fd183, 0d3FEEFA7CDDB128FA, fd988; +fma.rn.f64 fd993, fd125, 0dBFEE884F0CC22CCC, %62; +fma.rn.f64 fd997, fd129, 0d3FEA43B1B1379AFF, fd993; +fma.rn.f64 fd1198, fd128, 0d3FD328C3F1B322CB, 0d0000000000000000; +fma.rn.f64 fd998, fd132, 0dBFE247D447A27216, fd1198; +fma.rn.f64 fd1197, fd1272, 0dBFEE884F0CC22CCC, %63; +fma.rn.f64 fd999, fd1269, 0d3FEA43B1B1379AFF, fd1197; +fma.rn.f64 fd1196, fd127, 0d3FD328C3F1B322CB, 0d0000000000000000; +fma.rn.f64 fd1000, fd131, 0dBFE247D447A27216, fd1196; +fma.rn.f64 fd1001, fd133, 0dBFE3965F49174D13, fd997; +fma.rn.f64 fd1002, fd136, 0d3FE94E08EB13C451, fd998; +fma.rn.f64 fd1003, fd1267, 0dBFE3965F49174D13, fd999; +fma.rn.f64 fd1004, fd135, 0d3FE94E08EB13C451, fd1000; +fma.rn.f64 fd1005, fd137, 0d3FD63A3FCFACA412, fd1001; +fma.rn.f64 fd1006, fd140, 0dBFEE0210C26A6E6F, fd1002; +fma.rn.f64 fd1007, fd1265, 0d3FD63A3FCFACA412, fd1003; +fma.rn.f64 fd1008, fd139, 
0dBFEE0210C26A6E6F, fd1004; +fma.rn.f64 fd1009, fd141, 0dBFA9EEB01776B57D, fd1005; +fma.rn.f64 fd1010, fd144, 0d3FEFF57C5208CCF9, fd1006; +fma.rn.f64 fd1011, fd1262, 0dBFA9EEB01776B57D, fd1007; +fma.rn.f64 fd1012, fd143, 0d3FEFF57C5208CCF9, fd1008; +fma.rn.f64 fd1013, fd145, 0dBFD00AB0EB2D7D94, fd1009; +fma.rn.f64 fd1014, fd148, 0dBFEEFA7CDDB128FA, fd1010; +fma.rn.f64 fd1015, fd1260, 0dBFD00AB0EB2D7D94, fd1011; +fma.rn.f64 fd1016, fd147, 0dBFEEFA7CDDB128FA, fd1012; +fma.rn.f64 fd1017, fd149, 0d3FE0ED45EEA3B09F, fd1013; +fma.rn.f64 fd1018, fd152, 0d3FEB2818007C19DF, fd1014; +fma.rn.f64 fd1019, fd1258, 0d3FE0ED45EEA3B09F, fd1015; +fma.rn.f64 fd1020, fd151, 0d3FEB2818007C19DF, fd1016; +fma.rn.f64 fd1021, fd153, 0dBFE847BF1D5146CC, fd1017; +fma.rn.f64 fd1022, fd156, 0dBFE4D80B1AD9CCF6, fd1018; +fma.rn.f64 fd1023, fd1255, 0dBFE847BF1D5146CC, fd1019; +fma.rn.f64 fd1024, fd155, 0dBFE4D80B1AD9CCF6, fd1020; +fma.rn.f64 fd1025, fd157, 0d3FED681A366A00FA, fd1021; +fma.rn.f64 fd1026, fd160, 0d3FD93D20572CA90B, fd1022; +fma.rn.f64 fd1027, fd1253, 0d3FED681A366A00FA, fd1023; +fma.rn.f64 fd1028, fd159, 0d3FD93D20572CA90B, fd1024; +fma.rn.f64 fd1029, fd161, 0dBFEFD5F830F860F9, fd1025; +fma.rn.f64 fd1030, fd164, 0dBFB9E62ACA53C49F, fd1026; +fma.rn.f64 fd1031, fd1251, 0dBFEFD5F830F860F9, fd1027; +fma.rn.f64 fd1032, fd163, 0dBFB9E62ACA53C49F, fd1028; +fma.rn.f64 fd1033, fd165, 0d3FEF584F2CE43B84, fd1029; +fma.rn.f64 fd1034, fd168, 0dBFC9C4266041CA8F, fd1030; +fma.rn.f64 fd1035, fd1248, 0d3FEF584F2CE43B84, fd1031; +fma.rn.f64 fd1036, fd167, 0dBFC9C4266041CA8F, fd1032; +fma.rn.f64 fd1037, fd169, 0dBFEBFAA5C136B224, fd1033; +fma.rn.f64 fd1038, fd172, 0d3FDF0F2FF6705BEC, fd1034; +fma.rn.f64 fd1039, fd1246, 0dBFEBFAA5C136B224, fd1035; +fma.rn.f64 fd1040, fd171, 0d3FDF0F2FF6705BEC, fd1036; +fma.rn.f64 fd1041, fd173, 0d3FE60C045A2E9729, fd1037; +fma.rn.f64 fd1042, fd176, 0dBFE73180A4B0D300, fd1038; +fma.rn.f64 fd1043, fd1244, 0d3FE60C045A2E9729, fd1039; +fma.rn.f64 fd1044, fd175, 
0dBFE73180A4B0D300, fd1040; +fma.rn.f64 fd1045, fd177, 0dBFDC2F6AF3928A8E, fd1041; +fma.rn.f64 fd1046, fd180, 0d3FECBAD095F50378, fd1042; +fma.rn.f64 fd1047, fd1241, 0dBFDC2F6AF3928A8E, fd1043; +fma.rn.f64 fd1048, fd179, 0d3FECBAD095F50378, fd1044; +fma.rn.f64 fd1049, fd181, 0d3FC361FC440B478F, fd1045; +fma.rn.f64 fd1050, fd184, 0dBFEFA18852C3E08A, fd1046; +fma.rn.f64 fd1051, fd1239, 0d3FC361FC440B478F, fd1047; +fma.rn.f64 fd1052, fd183, 0dBFEFA18852C3E08A, fd1048; +fma.rn.f64 fd1053, fd125, 0dBFEFD5F830F860F9, %62; +fma.rn.f64 fd1054, fd128, 0d3FB9E62ACA53C49F, 0d0000000000000000; +fma.rn.f64 fd1055, fd1272, 0dBFEFD5F830F860F9, %63; +fma.rn.f64 fd1056, fd127, 0d3FB9E62ACA53C49F, 0d0000000000000000; +fma.rn.f64 fd1057, fd129, 0d3FEF584F2CE43B84, fd1053; +fma.rn.f64 fd1058, fd132, 0dBFC9C4266041CA8F, fd1054; +fma.rn.f64 fd1059, fd1269, 0d3FEF584F2CE43B84, fd1055; +fma.rn.f64 fd1060, fd131, 0dBFC9C4266041CA8F, fd1056; +fma.rn.f64 fd1061, fd133, 0dBFEE884F0CC22CCC, fd1057; +fma.rn.f64 fd1062, fd136, 0d3FD328C3F1B322CB, fd1058; +fma.rn.f64 fd1063, fd1267, 0dBFEE884F0CC22CCC, fd1059; +fma.rn.f64 fd1064, fd135, 0d3FD328C3F1B322CB, fd1060; +fma.rn.f64 fd1065, fd137, 0d3FED681A366A00FA, fd1061; +fma.rn.f64 fd1066, fd140, 0dBFD93D20572CA90B, fd1062; +fma.rn.f64 fd1067, fd1265, 0d3FED681A366A00FA, fd1063; +fma.rn.f64 fd1068, fd139, 0dBFD93D20572CA90B, fd1064; +fma.rn.f64 fd1069, fd141, 0dBFEBFAA5C136B224, fd1065; +fma.rn.f64 fd1070, fd144, 0d3FDF0F2FF6705BEC, fd1066; +fma.rn.f64 fd1071, fd1262, 0dBFEBFAA5C136B224, fd1067; +fma.rn.f64 fd1072, fd143, 0d3FDF0F2FF6705BEC, fd1068; +fma.rn.f64 fd1073, fd145, 0d3FEA43B1B1379AFF, fd1069; +fma.rn.f64 fd1074, fd148, 0dBFE247D447A27216, fd1070; +fma.rn.f64 fd1075, fd1260, 0d3FEA43B1B1379AFF, fd1071; +fma.rn.f64 fd1076, fd147, 0dBFE247D447A27216, fd1072; +fma.rn.f64 fd1077, fd149, 0dBFE847BF1D5146CC, fd1073; +fma.rn.f64 fd1078, fd152, 0d3FE4D80B1AD9CCF6, fd1074; +fma.rn.f64 fd1079, fd1258, 0dBFE847BF1D5146CC, fd1075; +fma.rn.f64 fd1080, 
fd151, 0d3FE4D80B1AD9CCF6, fd1076; +fma.rn.f64 fd1081, fd153, 0d3FE60C045A2E9729, fd1077; +fma.rn.f64 fd1082, fd156, 0dBFE73180A4B0D300, fd1078; +fma.rn.f64 fd1083, fd1255, 0d3FE60C045A2E9729, fd1079; +fma.rn.f64 fd1084, fd155, 0dBFE73180A4B0D300, fd1080; +fma.rn.f64 fd1085, fd157, 0dBFE3965F49174D13, fd1081; +fma.rn.f64 fd1086, fd160, 0d3FE94E08EB13C451, fd1082; +fma.rn.f64 fd1087, fd1253, 0dBFE3965F49174D13, fd1083; +fma.rn.f64 fd1088, fd159, 0d3FE94E08EB13C451, fd1084; +fma.rn.f64 fd1089, fd161, 0d3FE0ED45EEA3B09F, fd1085; +fma.rn.f64 fd1090, fd164, 0dBFEB2818007C19DF, fd1086; +fma.rn.f64 fd1091, fd1251, 0d3FE0ED45EEA3B09F, fd1087; +fma.rn.f64 fd1092, fd163, 0dBFEB2818007C19DF, fd1088; +fma.rn.f64 fd1093, fd165, 0dBFDC2F6AF3928A8E, fd1089; +fma.rn.f64 fd1094, fd168, 0d3FECBAD095F50378, fd1090; +fma.rn.f64 fd1095, fd1248, 0dBFDC2F6AF3928A8E, fd1091; +fma.rn.f64 fd1096, fd167, 0d3FECBAD095F50378, fd1092; +fma.rn.f64 fd1097, fd169, 0d3FD63A3FCFACA412, fd1093; +fma.rn.f64 fd1098, fd172, 0dBFEE0210C26A6E6F, fd1094; +fma.rn.f64 fd1099, fd1246, 0d3FD63A3FCFACA412, fd1095; +fma.rn.f64 fd1100, fd171, 0dBFEE0210C26A6E6F, fd1096; +fma.rn.f64 fd1101, fd173, 0dBFD00AB0EB2D7D94, fd1097; +fma.rn.f64 fd1102, fd176, 0d3FEEFA7CDDB128FA, fd1098; +fma.rn.f64 fd1103, fd1244, 0dBFD00AB0EB2D7D94, fd1099; +fma.rn.f64 fd1104, fd175, 0d3FEEFA7CDDB128FA, fd1100; +fma.rn.f64 fd1105, fd177, 0d3FC361FC440B478F, fd1101; +fma.rn.f64 fd1106, fd180, 0dBFEFA18852C3E08A, fd1102; +fma.rn.f64 fd1107, fd1241, 0d3FC361FC440B478F, fd1103; +fma.rn.f64 fd1108, fd179, 0dBFEFA18852C3E08A, fd1104; +fma.rn.f64 fd1109, fd181, 0dBFA9EEB01776B57D, fd1105; +fma.rn.f64 fd1110, fd184, 0d3FEFF57C5208CCF9, fd1106; +fma.rn.f64 fd1111, fd1239, 0dBFA9EEB01776B57D, fd1107; +fma.rn.f64 fd1112, fd183, 0d3FEFF57C5208CCF9, fd1108; +add.f64 %1, fd212, fd1239; +add.f64 %0, fd211, fd181; +sub.f64 %2, fd269, fd270; +add.f64 %3, fd271, fd272; +sub.f64 %4, fd329, fd330; +add.f64 %5, fd331, fd332; +sub.f64 %6, fd389, fd390; 
+add.f64 %7, fd391, fd392; +add.f64 %9, fd451, fd452; +sub.f64 %8, fd449, fd450; +add.f64 %11, fd511, fd512; +sub.f64 %10, fd509, fd510; +add.f64 %13, fd571, fd572; +sub.f64 %12, fd569, fd570; +sub.f64 %14, fd629, fd630; +add.f64 %15, fd631, fd632; +sub.f64 %16, fd689, fd690; +add.f64 %17, fd691, fd692; +sub.f64 %18, fd749, fd750; +add.f64 %19, fd751, fd752; +add.f64 %21, fd811, fd812; +sub.f64 %20, fd809, fd810; +add.f64 %23, fd871, fd872; +sub.f64 %22, fd869, fd870; +add.f64 %25, fd931, fd932; +sub.f64 %24, fd929, fd930; +sub.f64 %26, fd989, fd990; +add.f64 %27, fd991, fd992; +sub.f64 %28, fd1049, fd1050; +add.f64 %29, fd1051, fd1052; +sub.f64 %30, fd1109, fd1110; +add.f64 %31, fd1111, fd1112; +sub.f64 %33, fd1111, fd1112; +add.f64 %32, fd1109, fd1110; +sub.f64 %35, fd1051, fd1052; +add.f64 %34, fd1049, fd1050; +sub.f64 %37, fd991, fd992; +add.f64 %36, fd989, fd990; +sub.f64 %39, fd931, fd932; +add.f64 %38, fd929, fd930; +sub.f64 %41, fd871, fd872; +add.f64 %40, fd869, fd870; +sub.f64 %43, fd811, fd812; +add.f64 %42, fd809, fd810; +sub.f64 %45, fd751, fd752; +add.f64 %44, fd749, fd750; +sub.f64 %47, fd691, fd692; +add.f64 %46, fd689, fd690; +sub.f64 %49, fd631, fd632; +add.f64 %48, fd629, fd630; +sub.f64 %51, fd571, fd572; +add.f64 %50, fd569, fd570; +sub.f64 %53, fd511, fd512; +add.f64 %52, fd509, fd510; +sub.f64 %55, fd451, fd452; +add.f64 %54, fd449, fd450; +sub.f64 %57, fd391, fd392; +add.f64 %56, fd389, fd390; +sub.f64 %59, fd331, fd332; +add.f64 %58, fd329, fd330; +sub.f64 %61, fd271, fd272; +add.f64 %60, fd269, fd270; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), 
"=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y), "=d"(rmem[27].x), "=d"(rmem[27].y), "=d"(rmem[28].x), "=d"(rmem[28].y), "=d"(rmem[29].x), "=d"(rmem[29].y), "=d"(rmem[30].x), "=d"(rmem[30].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[27].x), "d"(rmem[27].y), "d"(rmem[28].x), "d"(rmem[28].y), "d"(rmem[29].x), "d"(rmem[29].y), "d"(rmem[30].x), "d"(rmem[30].y), "d"(rmem[2].y), "d"(rmem[29].y), "d"(rmem[28].y), "d"(rmem[4].y), "d"(rmem[5].y), "d"(rmem[26].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[8].y), "d"(rmem[23].y), "d"(rmem[22].y), "d"(rmem[10].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[19].y), "d"(rmem[13].y), "d"(rmem[14].y), 
"d"(rmem[17].y), "d"(rmem[16].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..2db0d52d26d7d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_fwd.hpp.inc @@ -0,0 +1,9410 @@ +#ifndef CUFFTDX_FFT_32768_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_32768_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1176, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1209>; +.reg .b32 r<7522>; +.reg .b64 rd<3>; +mov.u32 r7437, %tid.y; +shl.b32 r7438, r7437, 17; +mov.u32 r7439, %64; +add.s32 r7440, r7439, r7438; +mov.u32 r7441, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, 
r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f1090, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r101, {low, high}; +} +mov.f32 f1124, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 
r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, 
r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1082, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r393, {low, high}; +} +mov.f32 f1132, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r396, {low, high}; +} +mov.f32 f1098, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r397, {low, high}; +} +mov.f32 f1130, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 
r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, 
%114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ 
+add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r918, {low, high}; +} +{ 
+mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 
r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f1078, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1078; +cvt.rn.f16.f32 high, f1078; +mov.b32 r1233, {low, high}; +} +mov.f32 f1136, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1236, {low, high}; +} +mov.f32 f1086, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1086; +cvt.rn.f16.f32 high, f1086; +mov.b32 r1237, {low, high}; +} +mov.f32 f1128, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1240, {low, high}; +} +mov.f32 f1094, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1094; +cvt.rn.f16.f32 high, f1094; +mov.b32 r1241, {low, high}; +} +mov.f32 f1126, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; 
+mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1244, {low, high}; +} +mov.f32 f1102, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1102; +cvt.rn.f16.f32 high, f1102; +mov.b32 r1245, {low, high}; +} +mov.f32 f1134, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; 
+mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, 
r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, 
r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r7443, r7441, 7; +and.b32 r7444, r7443, -131072; +add.s32 r7445, r7440, r7444; +and.b32 r7457, r7441, 1023; +cvt.rn.f32.u32 f1201, r7457; +mul.f32 f1202, f1201, 0f39490FDB; +cos.approx.f32 f357, f1202; +sin.approx.f32 f1203, f1202; +neg.f32 f358, f1203; +mov.f32 f1208, 0f3F800000; +mov.f32 f1207, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ 
+neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, 
r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, 
r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, 
f1208; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, 
r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 
r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, 
high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2739, {low, high}; +} +{ 
+mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r7446, r7443, 130944; +add.s32 r7447, r7445, r7446; +st.shared.v4.f32 [r7447], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r7447+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r7447+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r7447+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r7447+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r7447+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r7447+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r7447+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r7448, r7457, -124, r7447; +ld.shared.u32 r2864, [r7448]; +ld.shared.u32 r3480, [r7448+4096]; +ld.shared.u32 r3060, [r7448+8192]; +ld.shared.u32 r3676, [r7448+12288]; +ld.shared.u32 r2914, [r7448+16384]; +ld.shared.u32 r3530, [r7448+20480]; +ld.shared.u32 r3110, [r7448+24576]; +ld.shared.u32 r3726, [r7448+28672]; +ld.shared.u32 r2876, [r7448+32768]; +ld.shared.u32 r3492, [r7448+36864]; +ld.shared.u32 r3072, [r7448+40960]; +ld.shared.u32 r3688, [r7448+45056]; +ld.shared.u32 r2926, [r7448+49152]; +ld.shared.u32 r3542, [r7448+53248]; +ld.shared.u32 r3122, [r7448+57344]; +ld.shared.u32 r3738, [r7448+61440]; +ld.shared.u32 r2865, [r7448+65536]; +ld.shared.u32 r3481, [r7448+69632]; +ld.shared.u32 r3061, [r7448+73728]; +ld.shared.u32 r3677, [r7448+77824]; +ld.shared.u32 r2915, [r7448+81920]; +ld.shared.u32 r3531, [r7448+86016]; +ld.shared.u32 r3111, [r7448+90112]; +ld.shared.u32 r3727, [r7448+94208]; +ld.shared.u32 r2877, [r7448+98304]; +ld.shared.u32 r3493, [r7448+102400]; +ld.shared.u32 r3073, [r7448+106496]; +ld.shared.u32 r3689, [r7448+110592]; +ld.shared.u32 r2927, 
[r7448+114688]; +ld.shared.u32 r3543, [r7448+118784]; +ld.shared.u32 r3123, [r7448+122880]; +ld.shared.u32 r3739, [r7448+126976]; +barrier.sync 0; +st.shared.v4.f32 [r7447], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r7447+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r7447+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r7447+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r7447+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r7447+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r7447+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r7447+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r7448]; +ld.shared.u32 r3483, [r7448+4096]; +ld.shared.u32 r3063, [r7448+8192]; +ld.shared.u32 r3679, [r7448+12288]; +ld.shared.u32 r2917, [r7448+16384]; +ld.shared.u32 r3533, [r7448+20480]; +ld.shared.u32 r3113, [r7448+24576]; +ld.shared.u32 r3729, [r7448+28672]; +ld.shared.u32 r2879, [r7448+32768]; +ld.shared.u32 r3495, [r7448+36864]; +ld.shared.u32 r3075, [r7448+40960]; +ld.shared.u32 r3691, [r7448+45056]; +ld.shared.u32 r2929, [r7448+49152]; +ld.shared.u32 r3545, [r7448+53248]; +ld.shared.u32 r3125, [r7448+57344]; +ld.shared.u32 r3741, [r7448+61440]; +ld.shared.u32 r2868, [r7448+65536]; +ld.shared.u32 r3484, [r7448+69632]; +ld.shared.u32 r3064, [r7448+73728]; +ld.shared.u32 r3680, [r7448+77824]; +ld.shared.u32 r2918, [r7448+81920]; +ld.shared.u32 r3534, [r7448+86016]; +ld.shared.u32 r3114, [r7448+90112]; +ld.shared.u32 r3730, [r7448+94208]; +ld.shared.u32 r2880, [r7448+98304]; +ld.shared.u32 r3496, [r7448+102400]; +ld.shared.u32 r3076, [r7448+106496]; +ld.shared.u32 r3692, [r7448+110592]; +ld.shared.u32 r2930, [r7448+114688]; +ld.shared.u32 r3546, [r7448+118784]; +ld.shared.u32 r3126, [r7448+122880]; +ld.shared.u32 r3742, [r7448+126976]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, 
r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ 
+add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r3159, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 
r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ 
+add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, 
r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 
r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3883, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; 
+} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1078; +cvt.rn.f16.f32 high, f1078; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1086; +cvt.rn.f16.f32 high, f1086; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1094; +cvt.rn.f16.f32 high, f1094; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1102; 
+cvt.rn.f16.f32 high, f1102; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, 
r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; 
+} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, 
r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r7449, r7441, 992; +bfe.u32 r7450, r7441, 5, 5; +shl.b32 r7451, r7441, 2; +and.b32 r7452, r7451, 124; +add.s32 r7453, r7445, r7452; +cvt.rn.f32.u32 f1204, r7450; +mul.f32 f1205, f1204, 0f3BC90FDB; +cos.approx.f32 f779, f1205; +sin.approx.f32 f1206, f1205; +neg.f32 f780, f1206; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, 
low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, 
r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 
r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, 
r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; 
+cvt.rn.f16.f32 high, f1208; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, 
r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} 
+{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +barrier.sync 
0; +and.b32 r7454, r7443, 126976; +add.s32 r7455, r7453, r7454; +st.shared.u32 [r7455], r4383; +st.shared.u32 [r7455+128], r4587; +st.shared.u32 [r7455+256], r4624; +st.shared.u32 [r7455+384], r4661; +st.shared.u32 [r7455+512], r4698; +st.shared.u32 [r7455+640], r4735; +st.shared.u32 [r7455+768], r4772; +st.shared.u32 [r7455+896], r4809; +st.shared.u32 [r7455+1024], r4846; +st.shared.u32 [r7455+1152], r4883; +st.shared.u32 [r7455+1280], r4920; +st.shared.u32 [r7455+1408], r4957; +st.shared.u32 [r7455+1536], r4994; +st.shared.u32 [r7455+1664], r5031; +st.shared.u32 [r7455+1792], r5068; +st.shared.u32 [r7455+1920], r5105; +st.shared.u32 [r7455+2048], r5142; +st.shared.u32 [r7455+2176], r5179; +st.shared.u32 [r7455+2304], r5216; +st.shared.u32 [r7455+2432], r5253; +st.shared.u32 [r7455+2560], r5290; +st.shared.u32 [r7455+2688], r5327; +st.shared.u32 [r7455+2816], r5364; +st.shared.u32 [r7455+2944], r5401; +st.shared.u32 [r7455+3072], r5438; +st.shared.u32 [r7455+3200], r5475; +st.shared.u32 [r7455+3328], r5512; +st.shared.u32 [r7455+3456], r5549; +st.shared.u32 [r7455+3584], r5586; +st.shared.u32 [r7455+3712], r5623; +st.shared.u32 [r7455+3840], r5660; +st.shared.u32 [r7455+3968], r5697; +barrier.sync 0; +mad.lo.s32 r7456, r7449, -124, r7455; +ld.shared.u32 r5726, [r7456]; +ld.shared.u32 r6342, [r7456+4096]; +ld.shared.u32 r5922, [r7456+8192]; +ld.shared.u32 r6538, [r7456+12288]; +ld.shared.u32 r5776, [r7456+16384]; +ld.shared.u32 r6392, [r7456+20480]; +ld.shared.u32 r5972, [r7456+24576]; +ld.shared.u32 r6588, [r7456+28672]; +ld.shared.u32 r5738, [r7456+32768]; +ld.shared.u32 r6354, [r7456+36864]; +ld.shared.u32 r5934, [r7456+40960]; +ld.shared.u32 r6550, [r7456+45056]; +ld.shared.u32 r5788, [r7456+49152]; +ld.shared.u32 r6404, [r7456+53248]; +ld.shared.u32 r5984, [r7456+57344]; +ld.shared.u32 r6600, [r7456+61440]; +ld.shared.u32 r5727, [r7456+65536]; +ld.shared.u32 r6343, [r7456+69632]; +ld.shared.u32 r5923, [r7456+73728]; +ld.shared.u32 r6539, [r7456+77824]; 
+ld.shared.u32 r5777, [r7456+81920]; +ld.shared.u32 r6393, [r7456+86016]; +ld.shared.u32 r5973, [r7456+90112]; +ld.shared.u32 r6589, [r7456+94208]; +ld.shared.u32 r5739, [r7456+98304]; +ld.shared.u32 r6355, [r7456+102400]; +ld.shared.u32 r5935, [r7456+106496]; +ld.shared.u32 r6551, [r7456+110592]; +ld.shared.u32 r5789, [r7456+114688]; +ld.shared.u32 r6405, [r7456+118784]; +ld.shared.u32 r5985, [r7456+122880]; +ld.shared.u32 r6601, [r7456+126976]; +barrier.sync 0; +st.shared.u32 [r7455], r4386; +st.shared.u32 [r7455+128], r4594; +st.shared.u32 [r7455+256], r4631; +st.shared.u32 [r7455+384], r4668; +st.shared.u32 [r7455+512], r4705; +st.shared.u32 [r7455+640], r4742; +st.shared.u32 [r7455+768], r4779; +st.shared.u32 [r7455+896], r4816; +st.shared.u32 [r7455+1024], r4853; +st.shared.u32 [r7455+1152], r4890; +st.shared.u32 [r7455+1280], r4927; +st.shared.u32 [r7455+1408], r4964; +st.shared.u32 [r7455+1536], r5001; +st.shared.u32 [r7455+1664], r5038; +st.shared.u32 [r7455+1792], r5075; +st.shared.u32 [r7455+1920], r5112; +st.shared.u32 [r7455+2048], r5149; +st.shared.u32 [r7455+2176], r5186; +st.shared.u32 [r7455+2304], r5223; +st.shared.u32 [r7455+2432], r5260; +st.shared.u32 [r7455+2560], r5297; +st.shared.u32 [r7455+2688], r5334; +st.shared.u32 [r7455+2816], r5371; +st.shared.u32 [r7455+2944], r5408; +st.shared.u32 [r7455+3072], r5445; +st.shared.u32 [r7455+3200], r5482; +st.shared.u32 [r7455+3328], r5519; +st.shared.u32 [r7455+3456], r5556; +st.shared.u32 [r7455+3584], r5593; +st.shared.u32 [r7455+3712], r5630; +st.shared.u32 [r7455+3840], r5667; +st.shared.u32 [r7455+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r7456]; +ld.shared.u32 r6345, [r7456+4096]; +ld.shared.u32 r5925, [r7456+8192]; +ld.shared.u32 r6541, [r7456+12288]; +ld.shared.u32 r5779, [r7456+16384]; +ld.shared.u32 r6395, [r7456+20480]; +ld.shared.u32 r5975, [r7456+24576]; +ld.shared.u32 r6591, [r7456+28672]; +ld.shared.u32 r5741, [r7456+32768]; +ld.shared.u32 r6357, [r7456+36864]; 
+ld.shared.u32 r5937, [r7456+40960]; +ld.shared.u32 r6553, [r7456+45056]; +ld.shared.u32 r5791, [r7456+49152]; +ld.shared.u32 r6407, [r7456+53248]; +ld.shared.u32 r5987, [r7456+57344]; +ld.shared.u32 r6603, [r7456+61440]; +ld.shared.u32 r5730, [r7456+65536]; +ld.shared.u32 r6346, [r7456+69632]; +ld.shared.u32 r5926, [r7456+73728]; +ld.shared.u32 r6542, [r7456+77824]; +ld.shared.u32 r5780, [r7456+81920]; +ld.shared.u32 r6396, [r7456+86016]; +ld.shared.u32 r5976, [r7456+90112]; +ld.shared.u32 r6592, [r7456+94208]; +ld.shared.u32 r5742, [r7456+98304]; +ld.shared.u32 r6358, [r7456+102400]; +ld.shared.u32 r5938, [r7456+106496]; +ld.shared.u32 r6554, [r7456+110592]; +ld.shared.u32 r5792, [r7456+114688]; +ld.shared.u32 r6408, [r7456+118784]; +ld.shared.u32 r5988, [r7456+122880]; +ld.shared.u32 r6604, [r7456+126976]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5746; +} +{ +add.f16x2 r5766, r5734, r5749; +} +{ +sub.f16x2 r5769, r5731, r5746; +} +{ +sub.f16x2 r5772, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5796; +} +{ +add.f16x2 
r5816, r5784, r5799; +} +{ +sub.f16x2 r5819, r5781, r5796; +} +{ +sub.f16x2 r5822, r5784, r5799; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5807; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} +{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5810; +} +{ +add.f16x2 r5900, r5760, r5855; +} +{ +sub.f16x2 r5903, r5757, r5810; +} +{ +sub.f16x2 r5906, r5760, r5855; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5939; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, 
r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5942; +} +{ +add.f16x2 r5962, r5930, r5945; +} +{ +sub.f16x2 r5965, r5927, r5942; +} +{ +sub.f16x2 r5968, r5930, r5945; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5989; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5992; +} +{ +add.f16x2 r6012, r5980, r5995; +} +{ +sub.f16x2 r6015, r5977, r5992; +} +{ +sub.f16x2 r6018, r5980, r5995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6003; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ 
+sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6006; +} +{ +add.f16x2 r6096, r5956, r6051; +} +{ +sub.f16x2 r6099, r5953, r6006; +} +{ +sub.f16x2 r6102, r5956, r6051; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ +fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 
r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6075; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ +fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 r6245, r5873, r6069; +} +{ +add.f16x2 r6248, r5876, r6072; +} +{ +sub.f16x2 r6251, r5873, r6069; +} +{ +sub.f16x2 r6254, r5876, r6072; +} +{ +add.f16x2 r6257, r5885, r6153; +} +{ +add.f16x2 r6260, r5888, r6159; +} +{ +sub.f16x2 r6263, r5885, r6153; +} +{ +sub.f16x2 r6266, r5888, r6159; +} +{ +add.f16x2 r6269, r5897, r6169; +} +{ +add.f16x2 r6272, r5900, r6175; +} +{ +sub.f16x2 r6275, r5897, r6169; +} +{ +sub.f16x2 r6278, r5900, r6175; +} +{ +add.f16x2 r6281, r5909, r6185; +} +{ +add.f16x2 r6284, r5912, r6191; +} +{ +sub.f16x2 r6287, r5909, r6185; +} +{ +sub.f16x2 r6290, r5912, r6191; +} +{ +add.f16x2 r6293, r5879, r6078; +} +{ +add.f16x2 r6296, r5882, r6195; +} +{ +sub.f16x2 r6299, r5879, r6078; +} +{ +sub.f16x2 r6302, r5882, r6195; +} +{ +add.f16x2 r6305, r5891, r6203; +} +{ +add.f16x2 r6308, r5894, r6209; +} +{ +sub.f16x2 r6311, r5891, r6203; +} +{ +sub.f16x2 r6314, r5894, r6209; +} +{ +add.f16x2 r6317, r5903, r6219; +} +{ +add.f16x2 r6320, r5906, r6225; +} +{ +sub.f16x2 r6323, r5903, 
r6219; +} +{ +sub.f16x2 r6326, r5906, r6225; +} +{ +add.f16x2 r6329, r5915, r6235; +} +{ +add.f16x2 r6332, r5918, r6241; +} +{ +sub.f16x2 r6335, r5915, r6235; +} +{ +sub.f16x2 r6338, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6359; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; +} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, r6356; +} +{ +add.f16x2 r6379, r6347, r6362; +} +{ +add.f16x2 r6382, r6350, r6365; +} +{ +sub.f16x2 r6385, r6347, r6362; +} +{ +sub.f16x2 r6388, r6350, r6365; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6409; +} +{ +add.f16x2 r6417, r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6412; +} +{ +add.f16x2 r6432, r6400, r6415; +} +{ +sub.f16x2 r6435, r6397, r6412; +} +{ +sub.f16x2 r6438, r6400, r6415; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, 
r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6423; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ +add.f16x2 r6513, r6373, r6426; +} +{ +add.f16x2 r6516, r6376, r6471; +} +{ +sub.f16x2 r6519, r6373, r6426; +} +{ +sub.f16x2 r6522, r6376, r6471; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, r6553, r6554; +} +{ +neg.f16x2 r6561, r6555; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6558; +} +{ +add.f16x2 r6578, r6546, r6561; +} +{ +sub.f16x2 r6581, r6543, r6558; +} +{ +sub.f16x2 r6584, r6546, r6561; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6605; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 
r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6608; +} +{ +add.f16x2 r6628, r6596, r6611; +} +{ +sub.f16x2 r6631, r6593, r6608; +} +{ +sub.f16x2 r6634, r6596, r6611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6619; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ +sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6622; +} +{ +add.f16x2 r6712, r6572, r6667; +} +{ +sub.f16x2 r6715, r6569, r6622; +} +{ +sub.f16x2 r6718, r6572, r6667; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} +{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6691; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 
r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 r6861, r6489, r6685; +} +{ +add.f16x2 r6864, r6492, r6688; +} +{ +sub.f16x2 r6867, r6489, r6685; +} +{ +sub.f16x2 r6870, r6492, r6688; +} +{ +add.f16x2 r6873, r6501, r6769; +} +{ +add.f16x2 r6876, r6504, r6775; +} +{ +sub.f16x2 r6879, r6501, r6769; +} +{ +sub.f16x2 r6882, r6504, r6775; +} +{ +add.f16x2 r6885, r6513, r6785; +} +{ +add.f16x2 r6888, r6516, r6791; +} +{ +sub.f16x2 r6891, r6513, r6785; +} +{ +sub.f16x2 r6894, r6516, r6791; +} +{ +add.f16x2 r6897, r6525, r6801; +} +{ +add.f16x2 r6900, r6528, r6807; +} +{ +sub.f16x2 r6903, r6525, r6801; +} +{ +sub.f16x2 r6906, r6528, r6807; +} +{ +add.f16x2 r6909, r6495, r6694; +} +{ +add.f16x2 r6912, r6498, r6811; +} +{ +sub.f16x2 r6915, r6495, r6694; +} +{ +sub.f16x2 r6918, r6498, r6811; +} +{ +add.f16x2 r6921, r6507, r6819; +} +{ +add.f16x2 r6924, r6510, r6825; +} +{ +sub.f16x2 r6927, r6507, r6819; +} +{ +sub.f16x2 r6930, r6510, r6825; +} +{ +add.f16x2 r6933, r6519, r6835; +} +{ +add.f16x2 r6936, r6522, r6841; +} +{ +sub.f16x2 r6939, r6519, r6835; +} +{ +sub.f16x2 r6942, r6522, r6841; +} +{ +add.f16x2 r6945, r6531, r6851; +} +{ +add.f16x2 r6948, r6534, r6857; +} +{ +sub.f16x2 r6951, r6531, r6851; +} +{ +sub.f16x2 r6954, r6534, r6857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1078; +cvt.rn.f16.f32 high, f1078; +mov.b32 r6957, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6958, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1082; +cvt.rn.f16.f32 high, f1082; +mov.b32 r6959, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 
r6960, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1086; +cvt.rn.f16.f32 high, f1086; +mov.b32 r6961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1090; +cvt.rn.f16.f32 high, f1090; +mov.b32 r6963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1094; +cvt.rn.f16.f32 high, f1094; +mov.b32 r6965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r6966, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1098; +cvt.rn.f16.f32 high, f1098; +mov.b32 r6967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6968, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1102; +cvt.rn.f16.f32 high, f1102; +mov.b32 r6969, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r6970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6976, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6977, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6979, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6980, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r6981, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6983, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6984, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r6985, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6986, {low, high}; +} +{ +mul.f16x2 r7019, r6873, r6957; +} +{ +mul.f16x2 r7022, r6876, r6958; +} +{ +sub.f16x2 r7025, r7019, r7022; +} +{ +mul.f16x2 r7028, r6873, r6958; +} +{ +fma.rn.f16x2 r7031, r6876, r6957, r7028; +} +{ +mul.f16x2 r7035, r6885, r6959; +} +{ +mul.f16x2 r7038, r6888, r6960; +} +{ +sub.f16x2 r7041, r7035, r7038; +} +{ +mul.f16x2 r7044, r6885, r6960; +} +{ +fma.rn.f16x2 r7047, r6888, r6959, r7044; +} +{ +mul.f16x2 r7051, r6897, r6961; +} +{ +mul.f16x2 r7054, r6900, r6962; +} +{ +sub.f16x2 r7057, r7051, r7054; +} +{ +mul.f16x2 r7060, r6897, r6962; +} +{ +fma.rn.f16x2 r7063, r6900, r6961, r7060; +} +{ +mul.f16x2 r7067, r6909, r6963; +} +{ +mul.f16x2 r7070, r6912, r6964; +} +{ +sub.f16x2 r7073, r7067, r7070; +} +{ +mul.f16x2 r7076, r6909, r6964; +} +{ +fma.rn.f16x2 r7079, r6912, r6963, r7076; +} +{ +mul.f16x2 r7083, r6921, r6965; +} +{ +mul.f16x2 r7086, r6924, r6966; +} +{ +sub.f16x2 r7089, r7083, r7086; +} +{ +mul.f16x2 r7092, r6921, r6966; +} +{ +fma.rn.f16x2 r7095, r6924, r6965, r7092; +} +{ +mul.f16x2 r7099, r6933, r6967; +} +{ +mul.f16x2 r7102, r6936, r6968; +} +{ +sub.f16x2 r7105, r7099, r7102; +} +{ +mul.f16x2 r7108, r6933, r6968; +} +{ +fma.rn.f16x2 r7111, r6936, r6967, r7108; +} +{ +mul.f16x2 r7115, 
r6945, r6969; +} +{ +mul.f16x2 r7118, r6948, r6970; +} +{ +sub.f16x2 r7121, r7115, r7118; +} +{ +mul.f16x2 r7124, r6945, r6970; +} +{ +fma.rn.f16x2 r7127, r6948, r6969, r7124; +} +{ +neg.f16x2 r7131, r6867; +} +{ +mul.f16x2 r7133, r6879, r6973; +} +{ +mul.f16x2 r7136, r6882, r6974; +} +{ +sub.f16x2 r7139, r7133, r7136; +} +{ +mul.f16x2 r7142, r6879, r6974; +} +{ +fma.rn.f16x2 r7145, r6882, r6973, r7142; +} +{ +mul.f16x2 r7149, r6891, r6975; +} +{ +mul.f16x2 r7152, r6894, r6976; +} +{ +sub.f16x2 r7155, r7149, r7152; +} +{ +mul.f16x2 r7158, r6891, r6976; +} +{ +fma.rn.f16x2 r7161, r6894, r6975, r7158; +} +{ +mul.f16x2 r7165, r6903, r6977; +} +{ +mul.f16x2 r7168, r6906, r6978; +} +{ +sub.f16x2 r7171, r7165, r7168; +} +{ +mul.f16x2 r7174, r6903, r6978; +} +{ +fma.rn.f16x2 r7177, r6906, r6977, r7174; +} +{ +mul.f16x2 r7181, r6915, r6979; +} +{ +mul.f16x2 r7184, r6918, r6980; +} +{ +sub.f16x2 r7187, r7181, r7184; +} +{ +mul.f16x2 r7190, r6915, r6980; +} +{ +fma.rn.f16x2 r7193, r6918, r6979, r7190; +} +{ +mul.f16x2 r7197, r6927, r6981; +} +{ +mul.f16x2 r7200, r6930, r6982; +} +{ +sub.f16x2 r7203, r7197, r7200; +} +{ +mul.f16x2 r7206, r6927, r6982; +} +{ +fma.rn.f16x2 r7209, r6930, r6981, r7206; +} +{ +mul.f16x2 r7213, r6939, r6983; +} +{ +mul.f16x2 r7216, r6942, r6984; +} +{ +sub.f16x2 r7219, r7213, r7216; +} +{ +mul.f16x2 r7222, r6939, r6984; +} +{ +fma.rn.f16x2 r7225, r6942, r6983, r7222; +} +{ +mul.f16x2 r7229, r6951, r6985; +} +{ +mul.f16x2 r7232, r6954, r6986; +} +{ +sub.f16x2 r7235, r7229, r7232; +} +{ +mul.f16x2 r7238, r6951, r6986; +} +{ +fma.rn.f16x2 r7241, r6954, r6985, r7238; +} +{ +add.f16x2 %0, r6245, r6861; +} +{ +add.f16x2 %1, r6248, r6864; +} +{ +sub.f16x2 %32, r6245, r6861; +} +{ +sub.f16x2 %33, r6248, r6864; +} +{ +add.f16x2 %2, r6257, r7025; +} +{ +add.f16x2 %3, r6260, r7031; +} +{ +sub.f16x2 %34, r6257, r7025; +} +{ +sub.f16x2 %35, r6260, r7031; +} +{ +add.f16x2 %4, r6269, r7041; +} +{ +add.f16x2 %5, r6272, r7047; +} +{ +sub.f16x2 %36, r6269, r7041; +} 
+{ +sub.f16x2 %37, r6272, r7047; +} +{ +add.f16x2 %6, r6281, r7057; +} +{ +add.f16x2 %7, r6284, r7063; +} +{ +sub.f16x2 %38, r6281, r7057; +} +{ +sub.f16x2 %39, r6284, r7063; +} +{ +add.f16x2 %8, r6293, r7073; +} +{ +add.f16x2 %9, r6296, r7079; +} +{ +sub.f16x2 %40, r6293, r7073; +} +{ +sub.f16x2 %41, r6296, r7079; +} +{ +add.f16x2 %10, r6305, r7089; +} +{ +add.f16x2 %11, r6308, r7095; +} +{ +sub.f16x2 %42, r6305, r7089; +} +{ +sub.f16x2 %43, r6308, r7095; +} +{ +add.f16x2 %12, r6317, r7105; +} +{ +add.f16x2 %13, r6320, r7111; +} +{ +sub.f16x2 %44, r6317, r7105; +} +{ +sub.f16x2 %45, r6320, r7111; +} +{ +add.f16x2 %14, r6329, r7121; +} +{ +add.f16x2 %15, r6332, r7127; +} +{ +sub.f16x2 %46, r6329, r7121; +} +{ +sub.f16x2 %47, r6332, r7127; +} +{ +add.f16x2 %16, r6251, r6870; +} +{ +add.f16x2 %17, r6254, r7131; +} +{ +sub.f16x2 %48, r6251, r6870; +} +{ +sub.f16x2 %49, r6254, r7131; +} +{ +add.f16x2 %18, r6263, r7139; +} +{ +add.f16x2 %19, r6266, r7145; +} +{ +sub.f16x2 %50, r6263, r7139; +} +{ +sub.f16x2 %51, r6266, r7145; +} +{ +add.f16x2 %20, r6275, r7155; +} +{ +add.f16x2 %21, r6278, r7161; +} +{ +sub.f16x2 %52, r6275, r7155; +} +{ +sub.f16x2 %53, r6278, r7161; +} +{ +add.f16x2 %22, r6287, r7171; +} +{ +add.f16x2 %23, r6290, r7177; +} +{ +sub.f16x2 %54, r6287, r7171; +} +{ +sub.f16x2 %55, r6290, r7177; +} +{ +add.f16x2 %24, r6299, r7187; +} +{ +add.f16x2 %25, r6302, r7193; +} +{ +sub.f16x2 %56, r6299, r7187; +} +{ +sub.f16x2 %57, r6302, r7193; +} +{ +add.f16x2 %26, r6311, r7203; +} +{ +add.f16x2 %27, r6314, r7209; +} +{ +sub.f16x2 %58, r6311, r7203; +} +{ +sub.f16x2 %59, r6314, r7209; +} +{ +add.f16x2 %28, r6323, r7219; +} +{ +add.f16x2 %29, r6326, r7225; +} +{ +sub.f16x2 %60, r6323, r7219; +} +{ +sub.f16x2 %61, r6326, r7225; +} +{ +add.f16x2 %30, r6335, r7235; +} +{ +add.f16x2 %31, r6338, r7241; +} +{ +sub.f16x2 %62, r6335, r7235; +} +{ +sub.f16x2 %63, r6338, r7241; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), 
"=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), 
"r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..e31bf7942c8d6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp16_inv.hpp.inc @@ -0,0 +1,9410 @@ +#ifndef CUFFTDX_FFT_32768_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_32768_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1182, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1209>; +.reg .b32 r<7522>; +.reg .b64 rd<3>; +mov.u32 r7437, %tid.y; +shl.b32 r7438, r7437, 17; +mov.u32 r7439, %64; +add.s32 r7440, r7439, r7438; +mov.u32 r7441, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, 
r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f1124, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r102, {low, high}; +} +mov.f32 f1122, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, 
r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ 
+add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f1116, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r393, {low, high}; +} +mov.f32 f1132, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r398, {low, high}; +} +mov.f32 f1114, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r404, {low, high}; +} +mov.f32 f1130, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, 
r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ 
+add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, 
r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; 
+cvt.rn.f16.f32 high, f1124; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; 
+cvt.rn.f16.f32 high, f1122; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 
r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f1112, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r1233, {low, high}; +} +mov.f32 f1136, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1236, {low, high}; +} +mov.f32 f1120, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r1237, {low, high}; +} +mov.f32 f1128, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; 
+cvt.rn.f16.f32 high, f1120; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r1246, {low, high}; +} +mov.f32 f1110, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1110; +cvt.rn.f16.f32 high, f1110; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r1252, {low, high}; +} +mov.f32 f1118, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1118; +cvt.rn.f16.f32 high, f1118; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r1256, {low, high}; +} +mov.f32 f1126, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r1260, {low, high}; +} +mov.f32 f1134, 0fBF7B14BE; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, 
r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 
r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r7443, r7441, 7; +and.b32 r7444, r7443, -131072; +add.s32 r7445, r7440, r7444; +and.b32 r7457, r7441, 1023; +cvt.rn.f32.u32 f1201, r7457; +mul.f32 f1202, f1201, 0f39490FDB; +cos.approx.f32 f357, f1202; +sin.approx.f32 f1203, f1202; +neg.f32 f358, f1203; +mov.f32 f1208, 0f3F800000; +mov.f32 f1207, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1888, {low, 
high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, 
r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 
r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 
r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 
r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; 
+} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ 
+fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 
r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r7446, r7443, 130944; +add.s32 r7447, r7445, r7446; +st.shared.v4.f32 [r7447], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r7447+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r7447+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r7447+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r7447+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r7447+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r7447+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r7447+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r7448, r7457, -124, r7447; +ld.shared.u32 r2864, [r7448]; +ld.shared.u32 r3480, [r7448+4096]; +ld.shared.u32 r3060, [r7448+8192]; +ld.shared.u32 r3676, [r7448+12288]; +ld.shared.u32 r2914, [r7448+16384]; +ld.shared.u32 r3530, [r7448+20480]; +ld.shared.u32 r3110, [r7448+24576]; +ld.shared.u32 r3726, [r7448+28672]; +ld.shared.u32 r2876, [r7448+32768]; +ld.shared.u32 r3492, [r7448+36864]; +ld.shared.u32 r3072, [r7448+40960]; +ld.shared.u32 r3688, [r7448+45056]; +ld.shared.u32 r2926, [r7448+49152]; +ld.shared.u32 r3542, [r7448+53248]; +ld.shared.u32 r3122, [r7448+57344]; +ld.shared.u32 r3738, [r7448+61440]; +ld.shared.u32 r2865, [r7448+65536]; +ld.shared.u32 r3481, [r7448+69632]; +ld.shared.u32 r3061, [r7448+73728]; +ld.shared.u32 r3677, [r7448+77824]; +ld.shared.u32 r2915, [r7448+81920]; +ld.shared.u32 r3531, [r7448+86016]; +ld.shared.u32 r3111, [r7448+90112]; +ld.shared.u32 r3727, [r7448+94208]; +ld.shared.u32 r2877, [r7448+98304]; +ld.shared.u32 r3493, [r7448+102400]; +ld.shared.u32 r3073, 
[r7448+106496]; +ld.shared.u32 r3689, [r7448+110592]; +ld.shared.u32 r2927, [r7448+114688]; +ld.shared.u32 r3543, [r7448+118784]; +ld.shared.u32 r3123, [r7448+122880]; +ld.shared.u32 r3739, [r7448+126976]; +barrier.sync 0; +st.shared.v4.f32 [r7447], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r7447+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r7447+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r7447+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r7447+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r7447+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r7447+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r7447+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r7448]; +ld.shared.u32 r3483, [r7448+4096]; +ld.shared.u32 r3063, [r7448+8192]; +ld.shared.u32 r3679, [r7448+12288]; +ld.shared.u32 r2917, [r7448+16384]; +ld.shared.u32 r3533, [r7448+20480]; +ld.shared.u32 r3113, [r7448+24576]; +ld.shared.u32 r3729, [r7448+28672]; +ld.shared.u32 r2879, [r7448+32768]; +ld.shared.u32 r3495, [r7448+36864]; +ld.shared.u32 r3075, [r7448+40960]; +ld.shared.u32 r3691, [r7448+45056]; +ld.shared.u32 r2929, [r7448+49152]; +ld.shared.u32 r3545, [r7448+53248]; +ld.shared.u32 r3125, [r7448+57344]; +ld.shared.u32 r3741, [r7448+61440]; +ld.shared.u32 r2868, [r7448+65536]; +ld.shared.u32 r3484, [r7448+69632]; +ld.shared.u32 r3064, [r7448+73728]; +ld.shared.u32 r3680, [r7448+77824]; +ld.shared.u32 r2918, [r7448+81920]; +ld.shared.u32 r3534, [r7448+86016]; +ld.shared.u32 r3114, [r7448+90112]; +ld.shared.u32 r3730, [r7448+94208]; +ld.shared.u32 r2880, [r7448+98304]; +ld.shared.u32 r3496, [r7448+102400]; +ld.shared.u32 r3076, [r7448+106496]; +ld.shared.u32 r3692, [r7448+110592]; +ld.shared.u32 r2930, [r7448+114688]; +ld.shared.u32 r3546, [r7448+118784]; +ld.shared.u32 r3126, [r7448+122880]; +ld.shared.u32 r3742, [r7448+126976]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, 
r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 
r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; 
+mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, 
r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ 
+sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, 
r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ 
+neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, 
r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; 
+mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1110; +cvt.rn.f16.f32 high, f1110; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1118; +cvt.rn.f16.f32 high, f1118; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, 
r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; 
+} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 
r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r7449, r7441, 992; +bfe.u32 r7450, r7441, 5, 5; +shl.b32 r7451, r7441, 2; +and.b32 r7452, r7451, 124; +add.s32 r7453, r7445, r7452; +cvt.rn.f32.u32 f1204, r7450; +mul.f32 f1205, f1204, 0f3BC90FDB; +cos.approx.f32 f779, f1205; +sin.approx.f32 f1206, f1205; +neg.f32 f780, f1206; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, 
r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, 
r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, 
f1208; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ 
+mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, 
{high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5527, {low, high}; +} +{ 
+mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1207; +cvt.rn.f16.f32 high, f1208; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ 
+neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +barrier.sync 0; +and.b32 r7454, r7443, 126976; +add.s32 r7455, r7453, r7454; +st.shared.u32 [r7455], r4383; +st.shared.u32 [r7455+128], r4585; +st.shared.u32 [r7455+256], r4622; +st.shared.u32 [r7455+384], r4659; +st.shared.u32 [r7455+512], r4696; +st.shared.u32 [r7455+640], r4733; +st.shared.u32 [r7455+768], r4770; +st.shared.u32 [r7455+896], r4807; +st.shared.u32 [r7455+1024], r4844; +st.shared.u32 [r7455+1152], r4881; +st.shared.u32 [r7455+1280], r4918; +st.shared.u32 [r7455+1408], r4955; +st.shared.u32 [r7455+1536], r4992; +st.shared.u32 [r7455+1664], r5029; +st.shared.u32 [r7455+1792], r5066; +st.shared.u32 [r7455+1920], r5103; +st.shared.u32 [r7455+2048], r5140; +st.shared.u32 [r7455+2176], r5177; +st.shared.u32 [r7455+2304], r5214; +st.shared.u32 [r7455+2432], r5251; +st.shared.u32 [r7455+2560], r5288; +st.shared.u32 [r7455+2688], r5325; +st.shared.u32 [r7455+2816], r5362; +st.shared.u32 [r7455+2944], r5399; +st.shared.u32 [r7455+3072], r5436; +st.shared.u32 [r7455+3200], r5473; +st.shared.u32 [r7455+3328], r5510; +st.shared.u32 [r7455+3456], r5547; +st.shared.u32 [r7455+3584], r5584; +st.shared.u32 [r7455+3712], r5621; +st.shared.u32 [r7455+3840], r5658; +st.shared.u32 [r7455+3968], r5695; +barrier.sync 0; +mad.lo.s32 r7456, r7449, -124, r7455; +ld.shared.u32 r5726, [r7456]; +ld.shared.u32 r6342, [r7456+4096]; +ld.shared.u32 r5922, [r7456+8192]; +ld.shared.u32 r6538, [r7456+12288]; +ld.shared.u32 r5776, [r7456+16384]; +ld.shared.u32 r6392, [r7456+20480]; +ld.shared.u32 r5972, [r7456+24576]; +ld.shared.u32 r6588, [r7456+28672]; +ld.shared.u32 r5738, [r7456+32768]; +ld.shared.u32 r6354, [r7456+36864]; +ld.shared.u32 r5934, [r7456+40960]; +ld.shared.u32 r6550, [r7456+45056]; +ld.shared.u32 r5788, [r7456+49152]; +ld.shared.u32 r6404, [r7456+53248]; +ld.shared.u32 r5984, [r7456+57344]; +ld.shared.u32 r6600, [r7456+61440]; +ld.shared.u32 r5727, [r7456+65536]; +ld.shared.u32 r6343, 
[r7456+69632]; +ld.shared.u32 r5923, [r7456+73728]; +ld.shared.u32 r6539, [r7456+77824]; +ld.shared.u32 r5777, [r7456+81920]; +ld.shared.u32 r6393, [r7456+86016]; +ld.shared.u32 r5973, [r7456+90112]; +ld.shared.u32 r6589, [r7456+94208]; +ld.shared.u32 r5739, [r7456+98304]; +ld.shared.u32 r6355, [r7456+102400]; +ld.shared.u32 r5935, [r7456+106496]; +ld.shared.u32 r6551, [r7456+110592]; +ld.shared.u32 r5789, [r7456+114688]; +ld.shared.u32 r6405, [r7456+118784]; +ld.shared.u32 r5985, [r7456+122880]; +ld.shared.u32 r6601, [r7456+126976]; +barrier.sync 0; +st.shared.u32 [r7455], r4386; +st.shared.u32 [r7455+128], r4594; +st.shared.u32 [r7455+256], r4631; +st.shared.u32 [r7455+384], r4668; +st.shared.u32 [r7455+512], r4705; +st.shared.u32 [r7455+640], r4742; +st.shared.u32 [r7455+768], r4779; +st.shared.u32 [r7455+896], r4816; +st.shared.u32 [r7455+1024], r4853; +st.shared.u32 [r7455+1152], r4890; +st.shared.u32 [r7455+1280], r4927; +st.shared.u32 [r7455+1408], r4964; +st.shared.u32 [r7455+1536], r5001; +st.shared.u32 [r7455+1664], r5038; +st.shared.u32 [r7455+1792], r5075; +st.shared.u32 [r7455+1920], r5112; +st.shared.u32 [r7455+2048], r5149; +st.shared.u32 [r7455+2176], r5186; +st.shared.u32 [r7455+2304], r5223; +st.shared.u32 [r7455+2432], r5260; +st.shared.u32 [r7455+2560], r5297; +st.shared.u32 [r7455+2688], r5334; +st.shared.u32 [r7455+2816], r5371; +st.shared.u32 [r7455+2944], r5408; +st.shared.u32 [r7455+3072], r5445; +st.shared.u32 [r7455+3200], r5482; +st.shared.u32 [r7455+3328], r5519; +st.shared.u32 [r7455+3456], r5556; +st.shared.u32 [r7455+3584], r5593; +st.shared.u32 [r7455+3712], r5630; +st.shared.u32 [r7455+3840], r5667; +st.shared.u32 [r7455+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r7456]; +ld.shared.u32 r6345, [r7456+4096]; +ld.shared.u32 r5925, [r7456+8192]; +ld.shared.u32 r6541, [r7456+12288]; +ld.shared.u32 r5779, [r7456+16384]; +ld.shared.u32 r6395, [r7456+20480]; +ld.shared.u32 r5975, [r7456+24576]; +ld.shared.u32 r6591, 
[r7456+28672]; +ld.shared.u32 r5741, [r7456+32768]; +ld.shared.u32 r6357, [r7456+36864]; +ld.shared.u32 r5937, [r7456+40960]; +ld.shared.u32 r6553, [r7456+45056]; +ld.shared.u32 r5791, [r7456+49152]; +ld.shared.u32 r6407, [r7456+53248]; +ld.shared.u32 r5987, [r7456+57344]; +ld.shared.u32 r6603, [r7456+61440]; +ld.shared.u32 r5730, [r7456+65536]; +ld.shared.u32 r6346, [r7456+69632]; +ld.shared.u32 r5926, [r7456+73728]; +ld.shared.u32 r6542, [r7456+77824]; +ld.shared.u32 r5780, [r7456+81920]; +ld.shared.u32 r6396, [r7456+86016]; +ld.shared.u32 r5976, [r7456+90112]; +ld.shared.u32 r6592, [r7456+94208]; +ld.shared.u32 r5742, [r7456+98304]; +ld.shared.u32 r6358, [r7456+102400]; +ld.shared.u32 r5938, [r7456+106496]; +ld.shared.u32 r6554, [r7456+110592]; +ld.shared.u32 r5792, [r7456+114688]; +ld.shared.u32 r6408, [r7456+118784]; +ld.shared.u32 r5988, [r7456+122880]; +ld.shared.u32 r6604, [r7456+126976]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5749; +} +{ +add.f16x2 r5766, r5734, r5743; +} +{ +sub.f16x2 r5769, r5731, r5749; +} +{ +sub.f16x2 r5772, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} 
+{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5799; +} +{ +add.f16x2 r5816, r5784, r5793; +} +{ +sub.f16x2 r5819, r5781, r5799; +} +{ +sub.f16x2 r5822, r5784, r5793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5810; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 r5873, r5751, r5801; +} +{ +add.f16x2 r5876, r5754, r5804; +} +{ +sub.f16x2 r5879, r5751, r5801; +} +{ +sub.f16x2 r5882, r5754, r5804; +} +{ +add.f16x2 r5885, r5763, r5845; +} +{ +add.f16x2 r5888, r5766, r5851; +} +{ +sub.f16x2 r5891, r5763, r5845; +} +{ +sub.f16x2 r5894, r5766, r5851; +} +{ +add.f16x2 r5897, r5757, r5855; +} +{ +add.f16x2 r5900, r5760, r5807; +} +{ +sub.f16x2 r5903, r5757, r5855; +} +{ +sub.f16x2 r5906, r5760, r5807; +} +{ +add.f16x2 r5909, r5769, r5863; +} +{ +add.f16x2 r5912, r5772, r5869; +} +{ +sub.f16x2 r5915, r5769, r5863; +} +{ +sub.f16x2 r5918, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ 
+neg.f16x2 r5945, r5942; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5945; +} +{ +add.f16x2 r5962, r5930, r5939; +} +{ +sub.f16x2 r5965, r5927, r5945; +} +{ +sub.f16x2 r5968, r5930, r5939; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5992; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5995; +} +{ +add.f16x2 r6012, r5980, r5989; +} +{ +sub.f16x2 r6015, r5977, r5995; +} +{ +sub.f16x2 r6018, r5980, r5989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6006; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 r6069, r5947, r5997; +} +{ +add.f16x2 r6072, r5950, r6000; +} +{ +sub.f16x2 r6075, r5947, r5997; +} +{ +sub.f16x2 r6078, r5950, 
r6000; +} +{ +add.f16x2 r6081, r5959, r6041; +} +{ +add.f16x2 r6084, r5962, r6047; +} +{ +sub.f16x2 r6087, r5959, r6041; +} +{ +sub.f16x2 r6090, r5962, r6047; +} +{ +add.f16x2 r6093, r5953, r6051; +} +{ +add.f16x2 r6096, r5956, r6003; +} +{ +sub.f16x2 r6099, r5953, r6051; +} +{ +sub.f16x2 r6102, r5956, r6003; +} +{ +add.f16x2 r6105, r5965, r6059; +} +{ +add.f16x2 r6108, r5968, r6065; +} +{ +sub.f16x2 r6111, r5965, r6059; +} +{ +sub.f16x2 r6114, r5968, r6065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r6125, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6126, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6128, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6130, {low, high}; +} +{ +mul.f16x2 r6147, r6081, r6117; +} +{ +mul.f16x2 r6150, r6084, r6118; +} +{ +sub.f16x2 r6153, r6147, r6150; +} +{ +mul.f16x2 r6156, r6081, r6118; +} +{ 
+fma.rn.f16x2 r6159, r6084, r6117, r6156; +} +{ +mul.f16x2 r6163, r6093, r6119; +} +{ +mul.f16x2 r6166, r6096, r6120; +} +{ +sub.f16x2 r6169, r6163, r6166; +} +{ +mul.f16x2 r6172, r6093, r6120; +} +{ +fma.rn.f16x2 r6175, r6096, r6119, r6172; +} +{ +mul.f16x2 r6179, r6105, r6121; +} +{ +mul.f16x2 r6182, r6108, r6122; +} +{ +sub.f16x2 r6185, r6179, r6182; +} +{ +mul.f16x2 r6188, r6105, r6122; +} +{ +fma.rn.f16x2 r6191, r6108, r6121, r6188; +} +{ +neg.f16x2 r6195, r6078; +} +{ +mul.f16x2 r6197, r6087, r6125; +} +{ +mul.f16x2 r6200, r6090, r6126; +} +{ +sub.f16x2 r6203, r6197, r6200; +} +{ +mul.f16x2 r6206, r6087, r6126; +} +{ +fma.rn.f16x2 r6209, r6090, r6125, r6206; +} +{ +mul.f16x2 r6213, r6099, r6127; +} +{ +mul.f16x2 r6216, r6102, r6128; +} +{ +sub.f16x2 r6219, r6213, r6216; +} +{ +mul.f16x2 r6222, r6099, r6128; +} +{ +fma.rn.f16x2 r6225, r6102, r6127, r6222; +} +{ +mul.f16x2 r6229, r6111, r6129; +} +{ +mul.f16x2 r6232, r6114, r6130; +} +{ +sub.f16x2 r6235, r6229, r6232; +} +{ +mul.f16x2 r6238, r6111, r6130; +} +{ +fma.rn.f16x2 r6241, r6114, r6129, r6238; +} +{ +add.f16x2 r6245, r5873, r6069; +} +{ +add.f16x2 r6248, r5876, r6072; +} +{ +sub.f16x2 r6251, r5873, r6069; +} +{ +sub.f16x2 r6254, r5876, r6072; +} +{ +add.f16x2 r6257, r5885, r6153; +} +{ +add.f16x2 r6260, r5888, r6159; +} +{ +sub.f16x2 r6263, r5885, r6153; +} +{ +sub.f16x2 r6266, r5888, r6159; +} +{ +add.f16x2 r6269, r5897, r6169; +} +{ +add.f16x2 r6272, r5900, r6175; +} +{ +sub.f16x2 r6275, r5897, r6169; +} +{ +sub.f16x2 r6278, r5900, r6175; +} +{ +add.f16x2 r6281, r5909, r6185; +} +{ +add.f16x2 r6284, r5912, r6191; +} +{ +sub.f16x2 r6287, r5909, r6185; +} +{ +sub.f16x2 r6290, r5912, r6191; +} +{ +add.f16x2 r6293, r5879, r6195; +} +{ +add.f16x2 r6296, r5882, r6075; +} +{ +sub.f16x2 r6299, r5879, r6195; +} +{ +sub.f16x2 r6302, r5882, r6075; +} +{ +add.f16x2 r6305, r5891, r6203; +} +{ +add.f16x2 r6308, r5894, r6209; +} +{ +sub.f16x2 r6311, r5891, r6203; +} +{ +sub.f16x2 r6314, r5894, r6209; +} +{ 
+add.f16x2 r6317, r5903, r6219; +} +{ +add.f16x2 r6320, r5906, r6225; +} +{ +sub.f16x2 r6323, r5903, r6219; +} +{ +sub.f16x2 r6326, r5906, r6225; +} +{ +add.f16x2 r6329, r5915, r6235; +} +{ +add.f16x2 r6332, r5918, r6241; +} +{ +sub.f16x2 r6335, r5915, r6235; +} +{ +sub.f16x2 r6338, r5918, r6241; +} +{ +add.f16x2 r6341, r6342, r6343; +} +{ +add.f16x2 r6344, r6345, r6346; +} +{ +sub.f16x2 r6347, r6342, r6343; +} +{ +sub.f16x2 r6350, r6345, r6346; +} +{ +add.f16x2 r6353, r6354, r6355; +} +{ +add.f16x2 r6356, r6357, r6358; +} +{ +sub.f16x2 r6359, r6354, r6355; +} +{ +sub.f16x2 r6362, r6357, r6358; +} +{ +neg.f16x2 r6365, r6362; +} +{ +add.f16x2 r6367, r6341, r6353; +} +{ +add.f16x2 r6370, r6344, r6356; +} +{ +sub.f16x2 r6373, r6341, r6353; +} +{ +sub.f16x2 r6376, r6344, r6356; +} +{ +add.f16x2 r6379, r6347, r6365; +} +{ +add.f16x2 r6382, r6350, r6359; +} +{ +sub.f16x2 r6385, r6347, r6365; +} +{ +sub.f16x2 r6388, r6350, r6359; +} +{ +add.f16x2 r6391, r6392, r6393; +} +{ +add.f16x2 r6394, r6395, r6396; +} +{ +sub.f16x2 r6397, r6392, r6393; +} +{ +sub.f16x2 r6400, r6395, r6396; +} +{ +add.f16x2 r6403, r6404, r6405; +} +{ +add.f16x2 r6406, r6407, r6408; +} +{ +sub.f16x2 r6409, r6404, r6405; +} +{ +sub.f16x2 r6412, r6407, r6408; +} +{ +neg.f16x2 r6415, r6412; +} +{ +add.f16x2 r6417, r6391, r6403; +} +{ +add.f16x2 r6420, r6394, r6406; +} +{ +sub.f16x2 r6423, r6391, r6403; +} +{ +sub.f16x2 r6426, r6394, r6406; +} +{ +add.f16x2 r6429, r6397, r6415; +} +{ +add.f16x2 r6432, r6400, r6409; +} +{ +sub.f16x2 r6435, r6397, r6415; +} +{ +sub.f16x2 r6438, r6400, r6409; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6442, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6445, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, 
f1124; +mov.b32 r6446, {low, high}; +} +{ +mul.f16x2 r6455, r6429, r6441; +} +{ +mul.f16x2 r6458, r6432, r6442; +} +{ +sub.f16x2 r6461, r6455, r6458; +} +{ +mul.f16x2 r6464, r6429, r6442; +} +{ +fma.rn.f16x2 r6467, r6432, r6441, r6464; +} +{ +neg.f16x2 r6471, r6426; +} +{ +mul.f16x2 r6473, r6435, r6445; +} +{ +mul.f16x2 r6476, r6438, r6446; +} +{ +sub.f16x2 r6479, r6473, r6476; +} +{ +mul.f16x2 r6482, r6435, r6446; +} +{ +fma.rn.f16x2 r6485, r6438, r6445, r6482; +} +{ +add.f16x2 r6489, r6367, r6417; +} +{ +add.f16x2 r6492, r6370, r6420; +} +{ +sub.f16x2 r6495, r6367, r6417; +} +{ +sub.f16x2 r6498, r6370, r6420; +} +{ +add.f16x2 r6501, r6379, r6461; +} +{ +add.f16x2 r6504, r6382, r6467; +} +{ +sub.f16x2 r6507, r6379, r6461; +} +{ +sub.f16x2 r6510, r6382, r6467; +} +{ +add.f16x2 r6513, r6373, r6471; +} +{ +add.f16x2 r6516, r6376, r6423; +} +{ +sub.f16x2 r6519, r6373, r6471; +} +{ +sub.f16x2 r6522, r6376, r6423; +} +{ +add.f16x2 r6525, r6385, r6479; +} +{ +add.f16x2 r6528, r6388, r6485; +} +{ +sub.f16x2 r6531, r6385, r6479; +} +{ +sub.f16x2 r6534, r6388, r6485; +} +{ +add.f16x2 r6537, r6538, r6539; +} +{ +add.f16x2 r6540, r6541, r6542; +} +{ +sub.f16x2 r6543, r6538, r6539; +} +{ +sub.f16x2 r6546, r6541, r6542; +} +{ +add.f16x2 r6549, r6550, r6551; +} +{ +add.f16x2 r6552, r6553, r6554; +} +{ +sub.f16x2 r6555, r6550, r6551; +} +{ +sub.f16x2 r6558, r6553, r6554; +} +{ +neg.f16x2 r6561, r6558; +} +{ +add.f16x2 r6563, r6537, r6549; +} +{ +add.f16x2 r6566, r6540, r6552; +} +{ +sub.f16x2 r6569, r6537, r6549; +} +{ +sub.f16x2 r6572, r6540, r6552; +} +{ +add.f16x2 r6575, r6543, r6561; +} +{ +add.f16x2 r6578, r6546, r6555; +} +{ +sub.f16x2 r6581, r6543, r6561; +} +{ +sub.f16x2 r6584, r6546, r6555; +} +{ +add.f16x2 r6587, r6588, r6589; +} +{ +add.f16x2 r6590, r6591, r6592; +} +{ +sub.f16x2 r6593, r6588, r6589; +} +{ +sub.f16x2 r6596, r6591, r6592; +} +{ +add.f16x2 r6599, r6600, r6601; +} +{ +add.f16x2 r6602, r6603, r6604; +} +{ +sub.f16x2 r6605, r6600, r6601; +} +{ +sub.f16x2 
r6608, r6603, r6604; +} +{ +neg.f16x2 r6611, r6608; +} +{ +add.f16x2 r6613, r6587, r6599; +} +{ +add.f16x2 r6616, r6590, r6602; +} +{ +sub.f16x2 r6619, r6587, r6599; +} +{ +sub.f16x2 r6622, r6590, r6602; +} +{ +add.f16x2 r6625, r6593, r6611; +} +{ +add.f16x2 r6628, r6596, r6605; +} +{ +sub.f16x2 r6631, r6593, r6611; +} +{ +sub.f16x2 r6634, r6596, r6605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6642, {low, high}; +} +{ +mul.f16x2 r6651, r6625, r6637; +} +{ +mul.f16x2 r6654, r6628, r6638; +} +{ +sub.f16x2 r6657, r6651, r6654; +} +{ +mul.f16x2 r6660, r6625, r6638; +} +{ +fma.rn.f16x2 r6663, r6628, r6637, r6660; +} +{ +neg.f16x2 r6667, r6622; +} +{ +mul.f16x2 r6669, r6631, r6641; +} +{ +mul.f16x2 r6672, r6634, r6642; +} +{ +sub.f16x2 r6675, r6669, r6672; +} +{ +mul.f16x2 r6678, r6631, r6642; +} +{ +fma.rn.f16x2 r6681, r6634, r6641, r6678; +} +{ +add.f16x2 r6685, r6563, r6613; +} +{ +add.f16x2 r6688, r6566, r6616; +} +{ +sub.f16x2 r6691, r6563, r6613; +} +{ +sub.f16x2 r6694, r6566, r6616; +} +{ +add.f16x2 r6697, r6575, r6657; +} +{ +add.f16x2 r6700, r6578, r6663; +} +{ +sub.f16x2 r6703, r6575, r6657; +} +{ +sub.f16x2 r6706, r6578, r6663; +} +{ +add.f16x2 r6709, r6569, r6667; +} +{ +add.f16x2 r6712, r6572, r6619; +} +{ +sub.f16x2 r6715, r6569, r6667; +} +{ +sub.f16x2 r6718, r6572, r6619; +} +{ +add.f16x2 r6721, r6581, r6675; +} +{ +add.f16x2 r6724, r6584, r6681; +} +{ +sub.f16x2 r6727, r6581, r6675; +} +{ +sub.f16x2 r6730, r6584, r6681; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6733, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6734, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6735, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r6741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6746, {low, high}; +} +{ +mul.f16x2 r6763, r6697, r6733; +} +{ +mul.f16x2 r6766, r6700, r6734; +} +{ +sub.f16x2 r6769, r6763, r6766; +} +{ +mul.f16x2 r6772, r6697, r6734; +} +{ +fma.rn.f16x2 r6775, r6700, r6733, r6772; +} +{ +mul.f16x2 r6779, r6709, r6735; +} +{ +mul.f16x2 r6782, r6712, r6736; +} +{ +sub.f16x2 r6785, r6779, r6782; +} +{ +mul.f16x2 r6788, r6709, r6736; +} +{ +fma.rn.f16x2 r6791, r6712, r6735, r6788; +} +{ +mul.f16x2 r6795, r6721, r6737; +} +{ +mul.f16x2 r6798, r6724, r6738; +} +{ +sub.f16x2 r6801, r6795, r6798; +} +{ +mul.f16x2 r6804, r6721, r6738; +} +{ +fma.rn.f16x2 r6807, r6724, r6737, r6804; +} +{ +neg.f16x2 r6811, r6694; +} +{ +mul.f16x2 r6813, r6703, r6741; +} +{ +mul.f16x2 r6816, r6706, r6742; +} +{ +sub.f16x2 r6819, r6813, r6816; +} +{ +mul.f16x2 r6822, 
r6703, r6742; +} +{ +fma.rn.f16x2 r6825, r6706, r6741, r6822; +} +{ +mul.f16x2 r6829, r6715, r6743; +} +{ +mul.f16x2 r6832, r6718, r6744; +} +{ +sub.f16x2 r6835, r6829, r6832; +} +{ +mul.f16x2 r6838, r6715, r6744; +} +{ +fma.rn.f16x2 r6841, r6718, r6743, r6838; +} +{ +mul.f16x2 r6845, r6727, r6745; +} +{ +mul.f16x2 r6848, r6730, r6746; +} +{ +sub.f16x2 r6851, r6845, r6848; +} +{ +mul.f16x2 r6854, r6727, r6746; +} +{ +fma.rn.f16x2 r6857, r6730, r6745, r6854; +} +{ +add.f16x2 r6861, r6489, r6685; +} +{ +add.f16x2 r6864, r6492, r6688; +} +{ +sub.f16x2 r6867, r6489, r6685; +} +{ +sub.f16x2 r6870, r6492, r6688; +} +{ +add.f16x2 r6873, r6501, r6769; +} +{ +add.f16x2 r6876, r6504, r6775; +} +{ +sub.f16x2 r6879, r6501, r6769; +} +{ +sub.f16x2 r6882, r6504, r6775; +} +{ +add.f16x2 r6885, r6513, r6785; +} +{ +add.f16x2 r6888, r6516, r6791; +} +{ +sub.f16x2 r6891, r6513, r6785; +} +{ +sub.f16x2 r6894, r6516, r6791; +} +{ +add.f16x2 r6897, r6525, r6801; +} +{ +add.f16x2 r6900, r6528, r6807; +} +{ +sub.f16x2 r6903, r6525, r6801; +} +{ +sub.f16x2 r6906, r6528, r6807; +} +{ +add.f16x2 r6909, r6495, r6811; +} +{ +add.f16x2 r6912, r6498, r6691; +} +{ +sub.f16x2 r6915, r6495, r6811; +} +{ +sub.f16x2 r6918, r6498, r6691; +} +{ +add.f16x2 r6921, r6507, r6819; +} +{ +add.f16x2 r6924, r6510, r6825; +} +{ +sub.f16x2 r6927, r6507, r6819; +} +{ +sub.f16x2 r6930, r6510, r6825; +} +{ +add.f16x2 r6933, r6519, r6835; +} +{ +add.f16x2 r6936, r6522, r6841; +} +{ +sub.f16x2 r6939, r6519, r6835; +} +{ +sub.f16x2 r6942, r6522, r6841; +} +{ +add.f16x2 r6945, r6531, r6851; +} +{ +add.f16x2 r6948, r6534, r6857; +} +{ +sub.f16x2 r6951, r6531, r6851; +} +{ +sub.f16x2 r6954, r6534, r6857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r6957, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6958, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 
r6959, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6960, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r6961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r6966, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6968, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6969, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r6970, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1110; +cvt.rn.f16.f32 high, f1110; +mov.b32 r6973, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1112; +cvt.rn.f16.f32 high, f1112; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1114; +cvt.rn.f16.f32 high, f1114; +mov.b32 r6975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1116; +cvt.rn.f16.f32 high, f1116; +mov.b32 r6976, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1118; +cvt.rn.f16.f32 high, f1118; +mov.b32 r6977, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1120; +cvt.rn.f16.f32 high, f1120; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f1122; +cvt.rn.f16.f32 high, f1122; +mov.b32 r6979, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1124; +cvt.rn.f16.f32 high, f1124; +mov.b32 r6980, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1126; +cvt.rn.f16.f32 high, f1126; +mov.b32 r6981, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1128; +cvt.rn.f16.f32 high, f1128; +mov.b32 r6982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1130; +cvt.rn.f16.f32 high, f1130; +mov.b32 r6983, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1132; +cvt.rn.f16.f32 high, f1132; +mov.b32 r6984, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1134; +cvt.rn.f16.f32 high, f1134; +mov.b32 r6985, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1136; +cvt.rn.f16.f32 high, f1136; +mov.b32 r6986, {low, high}; +} +{ +mul.f16x2 r7019, r6873, r6957; +} +{ +mul.f16x2 r7022, r6876, r6958; +} +{ +sub.f16x2 r7025, r7019, r7022; +} +{ +mul.f16x2 r7028, r6873, r6958; +} +{ +fma.rn.f16x2 r7031, r6876, r6957, r7028; +} +{ +mul.f16x2 r7035, r6885, r6959; +} +{ +mul.f16x2 r7038, r6888, r6960; +} +{ +sub.f16x2 r7041, r7035, r7038; +} +{ +mul.f16x2 r7044, r6885, r6960; +} +{ +fma.rn.f16x2 r7047, r6888, r6959, r7044; +} +{ +mul.f16x2 r7051, r6897, r6961; +} +{ +mul.f16x2 r7054, r6900, r6962; +} +{ +sub.f16x2 r7057, r7051, r7054; +} +{ +mul.f16x2 r7060, r6897, r6962; +} +{ +fma.rn.f16x2 r7063, r6900, r6961, r7060; +} +{ +mul.f16x2 r7067, r6909, r6963; +} +{ +mul.f16x2 r7070, r6912, r6964; +} +{ +sub.f16x2 r7073, r7067, r7070; +} +{ +mul.f16x2 r7076, r6909, r6964; +} +{ +fma.rn.f16x2 r7079, r6912, r6963, r7076; +} +{ +mul.f16x2 r7083, r6921, r6965; +} +{ +mul.f16x2 r7086, r6924, r6966; +} +{ +sub.f16x2 r7089, r7083, r7086; +} +{ +mul.f16x2 r7092, r6921, r6966; +} +{ +fma.rn.f16x2 r7095, r6924, r6965, r7092; +} +{ +mul.f16x2 r7099, r6933, r6967; +} +{ +mul.f16x2 r7102, r6936, r6968; +} +{ +sub.f16x2 r7105, r7099, r7102; 
+} +{ +mul.f16x2 r7108, r6933, r6968; +} +{ +fma.rn.f16x2 r7111, r6936, r6967, r7108; +} +{ +mul.f16x2 r7115, r6945, r6969; +} +{ +mul.f16x2 r7118, r6948, r6970; +} +{ +sub.f16x2 r7121, r7115, r7118; +} +{ +mul.f16x2 r7124, r6945, r6970; +} +{ +fma.rn.f16x2 r7127, r6948, r6969, r7124; +} +{ +neg.f16x2 r7131, r6870; +} +{ +mul.f16x2 r7133, r6879, r6973; +} +{ +mul.f16x2 r7136, r6882, r6974; +} +{ +sub.f16x2 r7139, r7133, r7136; +} +{ +mul.f16x2 r7142, r6879, r6974; +} +{ +fma.rn.f16x2 r7145, r6882, r6973, r7142; +} +{ +mul.f16x2 r7149, r6891, r6975; +} +{ +mul.f16x2 r7152, r6894, r6976; +} +{ +sub.f16x2 r7155, r7149, r7152; +} +{ +mul.f16x2 r7158, r6891, r6976; +} +{ +fma.rn.f16x2 r7161, r6894, r6975, r7158; +} +{ +mul.f16x2 r7165, r6903, r6977; +} +{ +mul.f16x2 r7168, r6906, r6978; +} +{ +sub.f16x2 r7171, r7165, r7168; +} +{ +mul.f16x2 r7174, r6903, r6978; +} +{ +fma.rn.f16x2 r7177, r6906, r6977, r7174; +} +{ +mul.f16x2 r7181, r6915, r6979; +} +{ +mul.f16x2 r7184, r6918, r6980; +} +{ +sub.f16x2 r7187, r7181, r7184; +} +{ +mul.f16x2 r7190, r6915, r6980; +} +{ +fma.rn.f16x2 r7193, r6918, r6979, r7190; +} +{ +mul.f16x2 r7197, r6927, r6981; +} +{ +mul.f16x2 r7200, r6930, r6982; +} +{ +sub.f16x2 r7203, r7197, r7200; +} +{ +mul.f16x2 r7206, r6927, r6982; +} +{ +fma.rn.f16x2 r7209, r6930, r6981, r7206; +} +{ +mul.f16x2 r7213, r6939, r6983; +} +{ +mul.f16x2 r7216, r6942, r6984; +} +{ +sub.f16x2 r7219, r7213, r7216; +} +{ +mul.f16x2 r7222, r6939, r6984; +} +{ +fma.rn.f16x2 r7225, r6942, r6983, r7222; +} +{ +mul.f16x2 r7229, r6951, r6985; +} +{ +mul.f16x2 r7232, r6954, r6986; +} +{ +sub.f16x2 r7235, r7229, r7232; +} +{ +mul.f16x2 r7238, r6951, r6986; +} +{ +fma.rn.f16x2 r7241, r6954, r6985, r7238; +} +{ +add.f16x2 %0, r6245, r6861; +} +{ +add.f16x2 %1, r6248, r6864; +} +{ +sub.f16x2 %32, r6245, r6861; +} +{ +sub.f16x2 %33, r6248, r6864; +} +{ +add.f16x2 %2, r6257, r7025; +} +{ +add.f16x2 %3, r6260, r7031; +} +{ +sub.f16x2 %34, r6257, r7025; +} +{ +sub.f16x2 %35, r6260, 
r7031; +} +{ +add.f16x2 %4, r6269, r7041; +} +{ +add.f16x2 %5, r6272, r7047; +} +{ +sub.f16x2 %36, r6269, r7041; +} +{ +sub.f16x2 %37, r6272, r7047; +} +{ +add.f16x2 %6, r6281, r7057; +} +{ +add.f16x2 %7, r6284, r7063; +} +{ +sub.f16x2 %38, r6281, r7057; +} +{ +sub.f16x2 %39, r6284, r7063; +} +{ +add.f16x2 %8, r6293, r7073; +} +{ +add.f16x2 %9, r6296, r7079; +} +{ +sub.f16x2 %40, r6293, r7073; +} +{ +sub.f16x2 %41, r6296, r7079; +} +{ +add.f16x2 %10, r6305, r7089; +} +{ +add.f16x2 %11, r6308, r7095; +} +{ +sub.f16x2 %42, r6305, r7089; +} +{ +sub.f16x2 %43, r6308, r7095; +} +{ +add.f16x2 %12, r6317, r7105; +} +{ +add.f16x2 %13, r6320, r7111; +} +{ +sub.f16x2 %44, r6317, r7105; +} +{ +sub.f16x2 %45, r6320, r7111; +} +{ +add.f16x2 %14, r6329, r7121; +} +{ +add.f16x2 %15, r6332, r7127; +} +{ +sub.f16x2 %46, r6329, r7121; +} +{ +sub.f16x2 %47, r6332, r7127; +} +{ +add.f16x2 %16, r6251, r7131; +} +{ +add.f16x2 %17, r6254, r6867; +} +{ +sub.f16x2 %48, r6251, r7131; +} +{ +sub.f16x2 %49, r6254, r6867; +} +{ +add.f16x2 %18, r6263, r7139; +} +{ +add.f16x2 %19, r6266, r7145; +} +{ +sub.f16x2 %50, r6263, r7139; +} +{ +sub.f16x2 %51, r6266, r7145; +} +{ +add.f16x2 %20, r6275, r7155; +} +{ +add.f16x2 %21, r6278, r7161; +} +{ +sub.f16x2 %52, r6275, r7155; +} +{ +sub.f16x2 %53, r6278, r7161; +} +{ +add.f16x2 %22, r6287, r7171; +} +{ +add.f16x2 %23, r6290, r7177; +} +{ +sub.f16x2 %54, r6287, r7171; +} +{ +sub.f16x2 %55, r6290, r7177; +} +{ +add.f16x2 %24, r6299, r7187; +} +{ +add.f16x2 %25, r6302, r7193; +} +{ +sub.f16x2 %56, r6299, r7187; +} +{ +sub.f16x2 %57, r6302, r7193; +} +{ +add.f16x2 %26, r6311, r7203; +} +{ +add.f16x2 %27, r6314, r7209; +} +{ +sub.f16x2 %58, r6311, r7203; +} +{ +sub.f16x2 %59, r6314, r7209; +} +{ +add.f16x2 %28, r6323, r7219; +} +{ +add.f16x2 %29, r6326, r7225; +} +{ +sub.f16x2 %60, r6323, r7219; +} +{ +sub.f16x2 %61, r6326, r7225; +} +{ +add.f16x2 %30, r6335, r7235; +} +{ +add.f16x2 %31, r6338, r7241; +} +{ +sub.f16x2 %62, r6335, r7235; +} +{ +sub.f16x2 
%63, r6338, r7241; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), 
"=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), 
"r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..3f5891eceb359 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_fwd.hpp.inc @@ -0,0 +1,2328 @@ +#ifndef CUFFTDX_FFT_32768_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_32768_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1152, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2886>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2884, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2882, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2881, f2884, f2882; +sub.f32 f140, f2884, f2882; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2880, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2877, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2875, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2874, f2877, f2875; +sub.f32 f156, f2877, f2875; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2873, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2873, 0fBF3504F3; +mul.f32 
f2872, f157, 0f3F3504F3; +sub.f32 f163, f2872, f162; +mul.f32 f164, f2873, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2871, f2881, f2874; +sub.f32 f173, f2881, f2874; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2870, f2880, f165; +sub.f32 f177, f2880, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2869, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2868, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2866, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2863, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2862, f2866, f2863; +sub.f32 f197, f2866, f2863; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2861, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2859, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2857, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2856, f2859, f2857; +sub.f32 f213, f2859, f2857; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2855, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2855, 0fBF3504F3; +mul.f32 f2854, f214, 0f3F3504F3; +sub.f32 f220, f2854, f219; +mul.f32 f221, f2855, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2853, f2862, f2856; +sub.f32 f230, f2862, f2856; +add.f32 f231, f198, f220; 
+sub.f32 f233, f198, f220; +add.f32 f2852, f2861, f222; +sub.f32 f234, f2861, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2851, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2850, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2848, f231, 0f3F6C835E; +mul.f32 f2849, f2852, 0fBEC3EF15; +sub.f32 f245, f2848, f2849; +mul.f32 f246, f2852, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2846, f235, 0f3F3504F3; +mul.f32 f2847, f2851, 0fBF3504F3; +sub.f32 f250, f2846, f2847; +mul.f32 f251, f2851, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2844, f239, 0f3EC3EF15; +mul.f32 f2845, f2850, 0fBF6C835E; +sub.f32 f255, f2844, f2845; +mul.f32 f256, f2850, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2842, f233, 0fBEC3EF15; +mul.f32 f2843, f234, 0fBF6C835E; +sub.f32 f260, f2842, f2843; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2840, f241, 0fBF6C835E; +mul.f32 f2841, f242, 0fBEC3EF15; +sub.f32 f269, f2840, f2841; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2839, f2871, f2853; +sub.f32 f275, f2871, f2853; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2838, f2870, f247; +sub.f32 f279, f2870, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2837, f2869, f252; +sub.f32 f283, f2869, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2836, f2868, f257; +sub.f32 f287, f2868, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2835, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2834, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, 
f265; +add.f32 f2833, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2832, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2829, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2827, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2826, f2829, f2827; +sub.f32 f315, f2829, f2827; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2825, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2823, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2820, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2819, f2823, f2820; +sub.f32 f331, f2823, f2820; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2818, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2816, f332, 0f3F3504F3; +mul.f32 f2817, f2818, 0fBF3504F3; +sub.f32 f338, f2816, f2817; +mul.f32 f339, f2818, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2815, f2826, f2819; +sub.f32 f348, f2826, f2819; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2814, f2825, f340; +sub.f32 f352, f2825, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2813, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2812, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2810, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2808, %148, %122; +sub.f32 f368, %148, %122; +add.f32 
f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2807, f2810, f2808; +sub.f32 f372, f2810, f2808; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2806, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2803, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2802, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2801, f2803, f2802; +sub.f32 f388, f2803, f2802; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2800, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2798, f389, 0f3F3504F3; +mul.f32 f2799, f2800, 0fBF3504F3; +sub.f32 f395, f2798, f2799; +mul.f32 f396, f2800, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2797, f2807, f2801; +sub.f32 f405, f2807, f2801; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2796, f2806, f397; +sub.f32 f409, f2806, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2795, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2794, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2796, 0fBEC3EF15; +mul.f32 f2793, f406, 0f3F6C835E; +sub.f32 f420, f2793, f419; +mul.f32 f421, f2796, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2795, 0fBF3504F3; +mul.f32 f2792, f410, 0f3F3504F3; +sub.f32 f425, f2792, f424; +mul.f32 f426, f2795, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2790, f414, 0f3EC3EF15; +mul.f32 f2791, f2794, 0fBF6C835E; +sub.f32 f430, f2790, f2791; +mul.f32 f431, f2794, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2788, f408, 0fBEC3EF15; +mul.f32 f2789, f409, 0fBF6C835E; +sub.f32 f435, f2788, f2789; 
+mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2787, f416, 0fBF6C835E; +sub.f32 f444, f2787, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2786, f2815, f2797; +sub.f32 f450, f2815, f2797; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2785, f2814, f422; +sub.f32 f454, f2814, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2784, f2813, f427; +sub.f32 f458, f2813, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2783, f2812, f432; +sub.f32 f462, f2812, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2782, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2781, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2780, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2779, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2785, 0fBE47C5C2; +mul.f32 f2778, f451, 0f3F7B14BE; +sub.f32 f481, f2778, f480; +mul.f32 f482, f2785, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2784, 0fBEC3EF15; +mul.f32 f2777, f455, 0f3F6C835E; +sub.f32 f486, f2777, f485; +mul.f32 f487, f2784, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2783, 0fBF0E39DA; +mul.f32 f2776, f459, 0f3F54DB31; +sub.f32 f491, f2776, f490; +mul.f32 f492, f2783, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2782, 0fBF3504F3; +mul.f32 f2775, f463, 0f3F3504F3; +sub.f32 f496, f2775, f495; +mul.f32 f497, f2782, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2773, f467, 0f3F0E39DA; +mul.f32 f2774, f2781, 
0fBF54DB31; +sub.f32 f501, f2773, f2774; +mul.f32 f502, f2781, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2771, f471, 0f3EC3EF15; +mul.f32 f2772, f2780, 0fBF6C835E; +sub.f32 f506, f2771, f2772; +mul.f32 f507, f2780, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2769, f475, 0f3E47C5C2; +mul.f32 f2770, f2779, 0fBF7B14BE; +sub.f32 f511, f2769, f2770; +mul.f32 f512, f2779, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2767, f453, 0fBE47C5C2; +mul.f32 f2768, f454, 0fBF7B14BE; +sub.f32 f516, f2767, f2768; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2766, f457, 0fBEC3EF15; +sub.f32 f521, f2766, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2765, f461, 0fBF0E39DA; +sub.f32 f526, f2765, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2763, f469, 0fBF54DB31; +mul.f32 f2764, f470, 0fBF0E39DA; +sub.f32 f535, f2763, f2764; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2762, f473, 0fBF6C835E; +sub.f32 f540, f2762, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2761, f477, 0fBF7B14BE; +sub.f32 f545, f2761, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2760, f2839, f2786; +sub.f32 f551, f2839, f2786; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2759, f2838, f483; +sub.f32 f555, f2838, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2758, f2837, f488; +sub.f32 f559, f2837, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; 
+add.f32 f2757, f2836, f493; +sub.f32 f563, f2836, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2756, f2835, f498; +sub.f32 f567, f2835, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f2755, f2834, f503; +sub.f32 f571, f2834, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f2754, f2833, f508; +sub.f32 f575, f2833, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f2753, f2832, f513; +sub.f32 f579, f2832, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f2752, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f2751, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f2750, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f2749, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f2748, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2747, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2746, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2745, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f2759; +mul.f32 f2744, f612, f552; +sub.f32 f618, f2744, f617; +mul.f32 f619, f612, f2759; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f2742, f612, f612; +mul.f32 f2743, f613, f613; +sub.f32 f623, f2742, f2743; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f2740, f623, f556; +mul.f32 f2741, f625, f2758; +sub.f32 f628, f2740, f2741; +mul.f32 f629, f623, f2758; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 
f2738, f612, f623; +mul.f32 f2739, f613, f625; +sub.f32 f633, f2738, f2739; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f2736, f633, f560; +mul.f32 f2737, f635, f2757; +sub.f32 f638, f2736, f2737; +mul.f32 f639, f633, f2757; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f2735, f612, f633; +sub.f32 f643, f2735, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f2756; +mul.f32 f2734, f643, f564; +sub.f32 f648, f2734, f647; +mul.f32 f649, f643, f2756; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f2733, f612, f643; +sub.f32 f653, f2733, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f2755; +mul.f32 f2732, f653, f568; +sub.f32 f658, f2732, f657; +mul.f32 f659, f653, f2755; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f2731, f612, f653; +sub.f32 f663, f2731, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f2729, f663, f572; +mul.f32 f2730, f665, f2754; +sub.f32 f668, f2729, f2730; +mul.f32 f669, f663, f2754; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f2727, f612, f663; +mul.f32 f2728, f613, f665; +sub.f32 f673, f2727, f2728; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f2725, f673, f576; +mul.f32 f2726, f675, f2753; +sub.f32 f678, f2725, f2726; +mul.f32 f679, f673, f2753; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f2723, f612, f673; +mul.f32 f2724, f613, f675; +sub.f32 f683, f2723, f2724; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f2752; +mul.f32 f2722, f683, f580; +sub.f32 f688, f2722, f687; +mul.f32 f689, f683, f2752; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f2721, f612, f683; +sub.f32 f693, f2721, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f2751; +mul.f32 f2720, f693, f584; +sub.f32 f698, f2720, f697; 
+mul.f32 f699, f693, f2751; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f2719, f612, f693; +sub.f32 f703, f2719, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f2750; +mul.f32 f2718, f703, f588; +sub.f32 f708, f2718, f707; +mul.f32 f709, f703, f2750; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f2716, f612, f703; +mul.f32 f2717, f613, f705; +sub.f32 f713, f2716, f2717; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f2714, f713, f592; +mul.f32 f2715, f715, f2749; +sub.f32 f718, f2714, f2715; +mul.f32 f719, f713, f2749; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f2712, f612, f713; +mul.f32 f2713, f613, f715; +sub.f32 f723, f2712, f2713; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f2710, f723, f596; +mul.f32 f2711, f725, f2748; +sub.f32 f728, f2710, f2711; +mul.f32 f729, f723, f2748; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f2709, f612, f723; +sub.f32 f733, f2709, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f2747; +mul.f32 f2708, f733, f600; +sub.f32 f738, f2708, f737; +mul.f32 f739, f733, f2747; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f2707, f612, f733; +sub.f32 f743, f2707, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f2746; +mul.f32 f2706, f743, f604; +sub.f32 f748, f2706, f747; +mul.f32 f749, f743, f2746; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f2705, f612, f743; +sub.f32 f753, f2705, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f2745; +mul.f32 f2704, f753, f608; +sub.f32 f758, f2704, f757; +mul.f32 f759, f753, f2745; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f2702, f612, f753; +mul.f32 f2703, f613, f755; +sub.f32 f763, f2702, f2703; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f2700, 
f763, f550; +mul.f32 f2701, f765, f551; +sub.f32 f768, f2700, f2701; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f2698, f612, f763; +mul.f32 f2699, f613, f765; +sub.f32 f773, f2698, f2699; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f2697, f773, f554; +sub.f32 f778, f2697, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f2696, f612, f773; +sub.f32 f783, f2696, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f2695, f783, f558; +sub.f32 f788, f2695, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f2694, f612, f783; +sub.f32 f793, f2694, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f2693, f793, f562; +sub.f32 f798, f2693, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f2692, f612, f793; +sub.f32 f803, f2692, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f2690, f803, f566; +mul.f32 f2691, f805, f567; +sub.f32 f808, f2690, f2691; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f2688, f612, f803; +mul.f32 f2689, f613, f805; +sub.f32 f813, f2688, f2689; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f2686, f813, f570; +mul.f32 f2687, f815, f571; +sub.f32 f818, f2686, f2687; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f2684, f612, f813; +mul.f32 f2685, f613, f815; +sub.f32 f823, f2684, f2685; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f2683, f823, f574; +sub.f32 f828, f2683, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f2682, f612, f823; +sub.f32 f833, f2682, f832; +mul.f32 f834, f612, 
f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f2681, f833, f578; +sub.f32 f838, f2681, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f2680, f612, f833; +sub.f32 f843, f2680, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f2679, f843, f582; +sub.f32 f848, f2679, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f2677, f612, f843; +mul.f32 f2678, f613, f845; +sub.f32 f853, f2677, f2678; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f2675, f853, f586; +mul.f32 f2676, f855, f587; +sub.f32 f858, f2675, f2676; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f2673, f612, f853; +mul.f32 f2674, f613, f855; +sub.f32 f863, f2673, f2674; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f2671, f863, f590; +mul.f32 f2672, f865, f591; +sub.f32 f868, f2671, f2672; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f2670, f612, f863; +sub.f32 f873, f2670, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f2669, f873, f594; +sub.f32 f878, f2669, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f2668, f612, f873; +sub.f32 f883, f2668, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f2667, f883, f598; +sub.f32 f888, f2667, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f2666, f612, f883; +sub.f32 f893, f2666, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f2664, f893, f602; +mul.f32 f2665, f895, f603; +sub.f32 f898, f2664, f2665; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f2662, f612, f893; +mul.f32 f2663, f613, 
f895; +sub.f32 f903, f2662, f2663; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f2660, f903, f606; +mul.f32 f2661, f905, f607; +sub.f32 f908, f2660, f2661; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f2658, f612, f903; +mul.f32 f2659, f613, f905; +sub.f32 f913, f2658, f2659; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f2657, f913, f610; +sub.f32 f918, f2657, f917; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -131072; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 130944; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +and.b32 r23, r32, 1023; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+4096]; +ld.shared.f32 f923, [r13+8192]; +ld.shared.f32 f924, [r13+12288]; +ld.shared.f32 f925, [r13+16384]; +ld.shared.f32 f926, [r13+20480]; +ld.shared.f32 f927, [r13+24576]; +ld.shared.f32 f928, [r13+28672]; +ld.shared.f32 f929, [r13+32768]; +ld.shared.f32 f930, [r13+36864]; +ld.shared.f32 f931, [r13+40960]; +ld.shared.f32 f932, [r13+45056]; +ld.shared.f32 f933, [r13+49152]; +ld.shared.f32 f934, [r13+53248]; +ld.shared.f32 f935, [r13+57344]; +ld.shared.f32 f936, [r13+61440]; +ld.shared.f32 f937, [r13+65536]; +ld.shared.f32 f938, [r13+69632]; +ld.shared.f32 f939, [r13+73728]; +ld.shared.f32 f940, [r13+77824]; +ld.shared.f32 f941, [r13+81920]; +ld.shared.f32 f942, [r13+86016]; +ld.shared.f32 f943, [r13+90112]; +ld.shared.f32 f944, 
[r13+94208]; +ld.shared.f32 f945, [r13+98304]; +ld.shared.f32 f946, [r13+102400]; +ld.shared.f32 f947, [r13+106496]; +ld.shared.f32 f948, [r13+110592]; +ld.shared.f32 f949, [r13+114688]; +ld.shared.f32 f950, [r13+118784]; +ld.shared.f32 f951, [r13+122880]; +ld.shared.f32 f952, [r13+126976]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2760, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+4096]; +ld.shared.f32 f955, [r13+8192]; +ld.shared.f32 f956, [r13+12288]; +ld.shared.f32 f957, [r13+16384]; +ld.shared.f32 f958, [r13+20480]; +ld.shared.f32 f959, [r13+24576]; +ld.shared.f32 f960, [r13+28672]; +ld.shared.f32 f961, [r13+32768]; +ld.shared.f32 f962, [r13+36864]; +ld.shared.f32 f963, [r13+40960]; +ld.shared.f32 f964, [r13+45056]; +ld.shared.f32 f965, [r13+49152]; +ld.shared.f32 f966, [r13+53248]; +ld.shared.f32 f967, [r13+57344]; +ld.shared.f32 f968, [r13+61440]; +ld.shared.f32 f969, [r13+65536]; +ld.shared.f32 f970, [r13+69632]; +ld.shared.f32 f971, [r13+73728]; +ld.shared.f32 f972, [r13+77824]; +ld.shared.f32 f973, [r13+81920]; +ld.shared.f32 f974, [r13+86016]; +ld.shared.f32 f975, [r13+90112]; +ld.shared.f32 f976, [r13+94208]; +ld.shared.f32 f977, [r13+98304]; +ld.shared.f32 f978, [r13+102400]; +ld.shared.f32 f979, [r13+106496]; +ld.shared.f32 f980, [r13+110592]; +ld.shared.f32 f981, [r13+114688]; +ld.shared.f32 f982, [r13+118784]; +ld.shared.f32 f983, [r13+122880]; +ld.shared.f32 f984, [r13+126976]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2656, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, 
f929, f945; +add.f32 f2655, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2654, f2656, f2655; +sub.f32 f996, f2656, f2655; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f2653, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2652, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2651, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2650, f2652, f2651; +sub.f32 f1012, f2652, f2651; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f2649, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f2649, 0fBF3504F3; +mul.f32 f2648, f1013, 0f3F3504F3; +sub.f32 f1019, f2648, f1018; +mul.f32 f1020, f2649, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2647, f2654, f2650; +sub.f32 f1029, f2654, f2650; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2646, f2653, f1021; +sub.f32 f1033, f2653, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f2645, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f2644, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2643, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2642, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2641, f2643, f2642; +sub.f32 f1053, f2643, f2642; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f2640, f1045, f1048; +add.f32 f1057, f1045, f1048; 
+add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2639, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2638, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2637, f2639, f2638; +sub.f32 f1069, f2639, f2638; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f2636, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f2636, 0fBF3504F3; +mul.f32 f2635, f1070, 0f3F3504F3; +sub.f32 f1076, f2635, f1075; +mul.f32 f1077, f2636, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2634, f2641, f2637; +sub.f32 f1086, f2641, f2637; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2633, f2640, f1078; +sub.f32 f1090, f2640, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f2632, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f2631, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2629, f1087, 0f3F6C835E; +mul.f32 f2630, f2633, 0fBEC3EF15; +sub.f32 f1101, f2629, f2630; +mul.f32 f1102, f2633, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f2627, f1091, 0f3F3504F3; +mul.f32 f2628, f2632, 0fBF3504F3; +sub.f32 f1106, f2627, f2628; +mul.f32 f1107, f2632, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f2631, 0fBF6C835E; +mul.f32 f2626, f1095, 0f3EC3EF15; +sub.f32 f1111, f2626, f1110; +mul.f32 f1112, f2631, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f2625, f1089, 0fBEC3EF15; +sub.f32 f1116, f2625, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 
0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f2623, f1097, 0fBF6C835E; +mul.f32 f2624, f1098, 0fBEC3EF15; +sub.f32 f1125, f2623, f2624; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2622, f2647, f2634; +sub.f32 f1131, f2647, f2634; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2621, f2646, f1103; +sub.f32 f1135, f2646, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2620, f2645, f1108; +sub.f32 f1139, f2645, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f2619, f2644, f1113; +sub.f32 f1143, f2644, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f2618, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f2617, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f2616, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2615, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2614, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2613, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2612, f2614, f2613; +sub.f32 f1171, f2614, f2613; +add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f2611, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2610, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2609, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; 
+add.f32 f2608, f2610, f2609; +sub.f32 f1187, f2610, f2609; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f2607, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f2607, 0fBF3504F3; +mul.f32 f2606, f1188, 0f3F3504F3; +sub.f32 f1194, f2606, f1193; +mul.f32 f1195, f2607, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 f1198, f1191, 0fBF3504F3; +sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2605, f2612, f2608; +sub.f32 f1204, f2612, f2608; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2604, f2611, f1196; +sub.f32 f1208, f2611, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f2603, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f2602, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2601, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2600, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2599, f2601, f2600; +sub.f32 f1228, f2601, f2600; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f2598, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2597, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2596, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2595, f2597, f2596; +sub.f32 f1244, f2597, f2596; +add.f32 f1245, f1235, f1240; +sub.f32 f1247, f1235, f1240; +sub.f32 f2594, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f2594, 0fBF3504F3; +mul.f32 f2593, f1245, 0f3F3504F3; +sub.f32 f1251, f2593, f1250; 
+mul.f32 f1252, f2594, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2592, f2599, f2595; +sub.f32 f1261, f2599, f2595; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2591, f2598, f1253; +sub.f32 f1265, f2598, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f2590, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f2589, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2587, f1262, 0f3F6C835E; +mul.f32 f2588, f2591, 0fBEC3EF15; +sub.f32 f1276, f2587, f2588; +mul.f32 f1277, f2591, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f2585, f1266, 0f3F3504F3; +mul.f32 f2586, f2590, 0fBF3504F3; +sub.f32 f1281, f2585, f2586; +mul.f32 f1282, f2590, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f2583, f1270, 0f3EC3EF15; +mul.f32 f2584, f2589, 0fBF6C835E; +sub.f32 f1286, f2583, f2584; +mul.f32 f1287, f2589, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f2581, f1264, 0fBEC3EF15; +mul.f32 f2582, f1265, 0fBF6C835E; +sub.f32 f1291, f2581, f2582; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f2579, f1272, 0fBF6C835E; +mul.f32 f2580, f1273, 0fBEC3EF15; +sub.f32 f1300, f2579, f2580; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2578, f2605, f2592; +sub.f32 f1306, f2605, f2592; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2577, f2604, f1278; +sub.f32 f1310, f2604, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 
f1313, f1209, f1281; +add.f32 f2576, f2603, f1283; +sub.f32 f1314, f2603, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f2575, f2602, f1288; +sub.f32 f1318, f2602, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f2574, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f2573, f1208, f1293; +sub.f32 f1326, f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f2572, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2571, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2577, 0fBE47C5C2; +mul.f32 f2570, f1307, 0f3F7B14BE; +sub.f32 f1337, f2570, f1336; +mul.f32 f1338, f2577, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f2576, 0fBEC3EF15; +mul.f32 f2569, f1311, 0f3F6C835E; +sub.f32 f1342, f2569, f1341; +mul.f32 f1343, f2576, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f2567, f1315, 0f3F54DB31; +mul.f32 f2568, f2575, 0fBF0E39DA; +sub.f32 f1347, f2567, f2568; +mul.f32 f1348, f2575, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f2565, f1319, 0f3F3504F3; +mul.f32 f2566, f2574, 0fBF3504F3; +sub.f32 f1352, f2565, f2566; +mul.f32 f1353, f2574, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f2563, f1323, 0f3F0E39DA; +mul.f32 f2564, f2573, 0fBF54DB31; +sub.f32 f1357, f2563, f2564; +mul.f32 f1358, f2573, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f2561, f1327, 0f3EC3EF15; +mul.f32 f2562, f2572, 0fBF6C835E; +sub.f32 f1362, f2561, f2562; +mul.f32 f1363, f2572, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f2571, 0fBF7B14BE; +mul.f32 f2560, f1331, 0f3E47C5C2; +sub.f32 f1367, f2560, f1366; +mul.f32 f1368, f2571, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f2559, f1309, 
0fBE47C5C2; +sub.f32 f1372, f2559, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f2558, f1313, 0fBEC3EF15; +sub.f32 f1377, f2558, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f2557, f1317, 0fBF0E39DA; +sub.f32 f1382, f2557, f1381; +mul.f32 f1383, f1318, 0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f2556, f1325, 0fBF54DB31; +sub.f32 f1391, f2556, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f2555, f1329, 0fBF6C835E; +sub.f32 f1396, f2555, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f2554, f1333, 0fBF7B14BE; +sub.f32 f1401, f2554, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2553, f2622, f2578; +sub.f32 f1407, f2622, f2578; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2552, f2621, f1339; +sub.f32 f1411, f2621, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2551, f2620, f1344; +sub.f32 f1415, f2620, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2550, f2619, f1349; +sub.f32 f1419, f2619, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2549, f2618, f1354; +sub.f32 f1423, f2618, f1354; +add.f32 f1424, f1148, f1357; +sub.f32 f1426, f1148, f1357; +add.f32 f2548, f2617, f1359; +sub.f32 f1427, f2617, f1359; +add.f32 f1428, f1152, f1362; +sub.f32 f1430, f1152, f1362; +add.f32 f2547, f2616, f1364; +sub.f32 f1431, f2616, f1364; 
+add.f32 f1432, f1156, f1367; +sub.f32 f1434, f1156, f1367; +add.f32 f2546, f2615, f1369; +sub.f32 f1435, f2615, f1369; +add.f32 f1436, f1130, f1306; +sub.f32 f1438, f1130, f1306; +sub.f32 f2545, f1131, f1305; +add.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1372; +sub.f32 f1442, f1134, f1372; +add.f32 f2544, f1135, f1374; +sub.f32 f1443, f1135, f1374; +add.f32 f1444, f1138, f1377; +sub.f32 f1446, f1138, f1377; +add.f32 f2543, f1139, f1379; +sub.f32 f1447, f1139, f1379; +add.f32 f1448, f1142, f1382; +sub.f32 f1450, f1142, f1382; +add.f32 f2542, f1143, f1384; +sub.f32 f1451, f1143, f1384; +add.f32 f1452, f1146, f1387; +sub.f32 f1454, f1146, f1387; +add.f32 f2541, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2540, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2539, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2538, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1473, f1469, f2552; +mul.f32 f2537, f1468, f1408; +sub.f32 f1474, f2537, f1473; +mul.f32 f1475, f1468, f2552; +fma.rn.f32 f1476, f1469, f1408, f1475; +mul.f32 f1478, f1469, f1469; +mul.f32 f2536, f1468, f1468; +sub.f32 f1479, f2536, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1483, f1481, f2551; +mul.f32 f2535, f1479, f1412; +sub.f32 f1484, f2535, f1483; +mul.f32 f1485, f1479, f2551; +fma.rn.f32 f1486, f1481, f1412, f1485; +mul.f32 f2533, f1468, f1479; +mul.f32 f2534, f1469, f1481; +sub.f32 f1489, f2533, f2534; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f2531, f1489, f1416; +mul.f32 f2532, f1491, f2550; +sub.f32 f1494, f2531, f2532; +mul.f32 f1495, f1489, f2550; +fma.rn.f32 f1496, 
f1491, f1416, f1495; +mul.f32 f2529, f1468, f1489; +mul.f32 f2530, f1469, f1491; +sub.f32 f1499, f2529, f2530; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f2527, f1499, f1420; +mul.f32 f2528, f1501, f2549; +sub.f32 f1504, f2527, f2528; +mul.f32 f1505, f1499, f2549; +fma.rn.f32 f1506, f1501, f1420, f1505; +mul.f32 f1508, f1469, f1501; +mul.f32 f2526, f1468, f1499; +sub.f32 f1509, f2526, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1513, f1511, f2548; +mul.f32 f2525, f1509, f1424; +sub.f32 f1514, f2525, f1513; +mul.f32 f1515, f1509, f2548; +fma.rn.f32 f1516, f1511, f1424, f1515; +mul.f32 f1518, f1469, f1511; +mul.f32 f2524, f1468, f1509; +sub.f32 f1519, f2524, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1523, f1521, f2547; +mul.f32 f2523, f1519, f1428; +sub.f32 f1524, f2523, f1523; +mul.f32 f1525, f1519, f2547; +fma.rn.f32 f1526, f1521, f1428, f1525; +mul.f32 f1528, f1469, f1521; +mul.f32 f2522, f1468, f1519; +sub.f32 f1529, f2522, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f2520, f1529, f1432; +mul.f32 f2521, f1531, f2546; +sub.f32 f1534, f2520, f2521; +mul.f32 f1535, f1529, f2546; +fma.rn.f32 f1536, f1531, f1432, f1535; +mul.f32 f2518, f1468, f1529; +mul.f32 f2519, f1469, f1531; +sub.f32 f1539, f2518, f2519; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f2516, f1539, f1436; +mul.f32 f2517, f1541, f2545; +sub.f32 f1544, f2516, f2517; +mul.f32 f1545, f1539, f2545; +fma.rn.f32 f1546, f1541, f1436, f1545; +mul.f32 f2514, f1468, f1539; +mul.f32 f2515, f1469, f1541; +sub.f32 f1549, f2514, f2515; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1553, f1551, f2544; +mul.f32 f2513, f1549, f1440; +sub.f32 f1554, f2513, f1553; +mul.f32 f1555, f1549, f2544; +fma.rn.f32 f1556, f1551, f1440, f1555; +mul.f32 f1558, f1469, f1551; +mul.f32 f2512, f1468, 
f1549; +sub.f32 f1559, f2512, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1563, f1561, f2543; +mul.f32 f2511, f1559, f1444; +sub.f32 f1564, f2511, f1563; +mul.f32 f1565, f1559, f2543; +fma.rn.f32 f1566, f1561, f1444, f1565; +mul.f32 f1568, f1469, f1561; +mul.f32 f2510, f1468, f1559; +sub.f32 f1569, f2510, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1573, f1571, f2542; +mul.f32 f2509, f1569, f1448; +sub.f32 f1574, f2509, f1573; +mul.f32 f1575, f1569, f2542; +fma.rn.f32 f1576, f1571, f1448, f1575; +mul.f32 f1578, f1469, f1571; +mul.f32 f2508, f1468, f1569; +sub.f32 f1579, f2508, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f2506, f1579, f1452; +mul.f32 f2507, f1581, f2541; +sub.f32 f1584, f2506, f2507; +mul.f32 f1585, f1579, f2541; +fma.rn.f32 f1586, f1581, f1452, f1585; +mul.f32 f2504, f1468, f1579; +mul.f32 f2505, f1469, f1581; +sub.f32 f1589, f2504, f2505; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f2502, f1589, f1456; +mul.f32 f2503, f1591, f2540; +sub.f32 f1594, f2502, f2503; +mul.f32 f1595, f1589, f2540; +fma.rn.f32 f1596, f1591, f1456, f1595; +mul.f32 f1598, f1469, f1591; +mul.f32 f2501, f1468, f1589; +sub.f32 f1599, f2501, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1603, f1601, f2539; +mul.f32 f2500, f1599, f1460; +sub.f32 f1604, f2500, f1603; +mul.f32 f1605, f1599, f2539; +fma.rn.f32 f1606, f1601, f1460, f1605; +mul.f32 f1608, f1469, f1601; +mul.f32 f2499, f1468, f1599; +sub.f32 f1609, f2499, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1613, f1611, f2538; +mul.f32 f2498, f1609, f1464; +sub.f32 f1614, f2498, f1613; +mul.f32 f1615, f1609, f2538; +fma.rn.f32 f1616, f1611, f1464, f1615; +mul.f32 f1618, f1469, f1611; +mul.f32 f2497, f1468, f1609; +sub.f32 f1619, f2497, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 
f1621, f1469, f1609, f1620; +mul.f32 f1623, f1621, f1407; +mul.f32 f2496, f1619, f1406; +sub.f32 f1624, f2496, f1623; +mul.f32 f1625, f1619, f1407; +fma.rn.f32 f1626, f1621, f1406, f1625; +mul.f32 f2494, f1468, f1619; +mul.f32 f2495, f1469, f1621; +sub.f32 f1629, f2494, f2495; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f2492, f1629, f1410; +mul.f32 f2493, f1631, f1411; +sub.f32 f1634, f2492, f2493; +mul.f32 f1635, f1629, f1411; +fma.rn.f32 f1636, f1631, f1410, f1635; +mul.f32 f2490, f1468, f1629; +mul.f32 f2491, f1469, f1631; +sub.f32 f1639, f2490, f2491; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f2488, f1639, f1414; +mul.f32 f2489, f1641, f1415; +sub.f32 f1644, f2488, f2489; +mul.f32 f1645, f1639, f1415; +fma.rn.f32 f1646, f1641, f1414, f1645; +mul.f32 f1648, f1469, f1641; +mul.f32 f2487, f1468, f1639; +sub.f32 f1649, f2487, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1653, f1651, f1419; +mul.f32 f2486, f1649, f1418; +sub.f32 f1654, f2486, f1653; +mul.f32 f1655, f1649, f1419; +fma.rn.f32 f1656, f1651, f1418, f1655; +mul.f32 f1658, f1469, f1651; +mul.f32 f2485, f1468, f1649; +sub.f32 f1659, f2485, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1663, f1661, f1423; +mul.f32 f2484, f1659, f1422; +sub.f32 f1664, f2484, f1663; +mul.f32 f1665, f1659, f1423; +fma.rn.f32 f1666, f1661, f1422, f1665; +mul.f32 f1668, f1469, f1661; +mul.f32 f2483, f1468, f1659; +sub.f32 f1669, f2483, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f2481, f1669, f1426; +mul.f32 f2482, f1671, f1427; +sub.f32 f1674, f2481, f2482; +mul.f32 f1675, f1669, f1427; +fma.rn.f32 f1676, f1671, f1426, f1675; +mul.f32 f2479, f1468, f1669; +mul.f32 f2480, f1469, f1671; +sub.f32 f1679, f2479, f2480; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f2477, f1679, f1430; +mul.f32 f2478, 
f1681, f1431; +sub.f32 f1684, f2477, f2478; +mul.f32 f1685, f1679, f1431; +fma.rn.f32 f1686, f1681, f1430, f1685; +mul.f32 f2475, f1468, f1679; +mul.f32 f2476, f1469, f1681; +sub.f32 f1689, f2475, f2476; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1693, f1691, f1435; +mul.f32 f2474, f1689, f1434; +sub.f32 f1694, f2474, f1693; +mul.f32 f1695, f1689, f1435; +fma.rn.f32 f1696, f1691, f1434, f1695; +mul.f32 f1698, f1469, f1691; +mul.f32 f2473, f1468, f1689; +sub.f32 f1699, f2473, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1703, f1701, f1439; +mul.f32 f2472, f1699, f1438; +sub.f32 f1704, f2472, f1703; +mul.f32 f1705, f1699, f1439; +fma.rn.f32 f1706, f1701, f1438, f1705; +mul.f32 f1708, f1469, f1701; +mul.f32 f2471, f1468, f1699; +sub.f32 f1709, f2471, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1713, f1711, f1443; +mul.f32 f2470, f1709, f1442; +sub.f32 f1714, f2470, f1713; +mul.f32 f1715, f1709, f1443; +fma.rn.f32 f1716, f1711, f1442, f1715; +mul.f32 f2468, f1468, f1709; +mul.f32 f2469, f1469, f1711; +sub.f32 f1719, f2468, f2469; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f2466, f1719, f1446; +mul.f32 f2467, f1721, f1447; +sub.f32 f1724, f2466, f2467; +mul.f32 f1725, f1719, f1447; +fma.rn.f32 f1726, f1721, f1446, f1725; +mul.f32 f2464, f1468, f1719; +mul.f32 f2465, f1469, f1721; +sub.f32 f1729, f2464, f2465; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f2462, f1729, f1450; +mul.f32 f2463, f1731, f1451; +sub.f32 f1734, f2462, f2463; +mul.f32 f1735, f1729, f1451; +fma.rn.f32 f1736, f1731, f1450, f1735; +mul.f32 f1738, f1469, f1731; +mul.f32 f2461, f1468, f1729; +sub.f32 f1739, f2461, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1743, f1741, f1455; +mul.f32 f2460, f1739, f1454; +sub.f32 f1744, f2460, f1743; +mul.f32 f1745, f1739, f1455; 
+fma.rn.f32 f1746, f1741, f1454, f1745; +mul.f32 f1748, f1469, f1741; +mul.f32 f2459, f1468, f1739; +sub.f32 f1749, f2459, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1753, f1751, f1459; +mul.f32 f2458, f1749, f1458; +sub.f32 f1754, f2458, f1753; +mul.f32 f1755, f1749, f1459; +fma.rn.f32 f1756, f1751, f1458, f1755; +mul.f32 f1758, f1469, f1751; +mul.f32 f2457, f1468, f1749; +sub.f32 f1759, f2457, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f2455, f1759, f1462; +mul.f32 f2456, f1761, f1463; +sub.f32 f1764, f2455, f2456; +mul.f32 f1765, f1759, f1463; +fma.rn.f32 f1766, f1761, f1462, f1765; +mul.f32 f2453, f1468, f1759; +mul.f32 f2454, f1469, f1761; +sub.f32 f1769, f2453, f2454; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f2451, f1769, f1466; +mul.f32 f2452, f1771, f1467; +sub.f32 f1774, f2451, f2452; +mul.f32 f1775, f1769, f1467; +mov.u32 r33, %tid.x; +fma.rn.f32 f1776, f1771, f1466, f1775; +and.b32 r22, r33, 992; +shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 126976; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1474; +st.shared.f32 [r20+256], f1484; +st.shared.f32 [r20+384], f1494; +st.shared.f32 [r20+512], f1504; +st.shared.f32 [r20+640], f1514; +st.shared.f32 [r20+768], f1524; +st.shared.f32 [r20+896], f1534; +st.shared.f32 [r20+1024], f1544; +st.shared.f32 [r20+1152], f1554; +st.shared.f32 [r20+1280], f1564; +st.shared.f32 [r20+1408], f1574; +st.shared.f32 [r20+1536], f1584; +st.shared.f32 [r20+1664], f1594; +st.shared.f32 [r20+1792], f1604; +st.shared.f32 [r20+1920], f1614; +st.shared.f32 [r20+2048], f1624; +st.shared.f32 [r20+2176], f1634; +st.shared.f32 [r20+2304], f1644; +st.shared.f32 [r20+2432], f1654; +st.shared.f32 [r20+2560], f1664; +st.shared.f32 
[r20+2688], f1674; +st.shared.f32 [r20+2816], f1684; +st.shared.f32 [r20+2944], f1694; +st.shared.f32 [r20+3072], f1704; +st.shared.f32 [r20+3200], f1714; +st.shared.f32 [r20+3328], f1724; +st.shared.f32 [r20+3456], f1734; +st.shared.f32 [r20+3584], f1744; +st.shared.f32 [r20+3712], f1754; +st.shared.f32 [r20+3840], f1764; +st.shared.f32 [r20+3968], f1774; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+4096]; +ld.shared.f32 f1779, [r21+8192]; +ld.shared.f32 f1780, [r21+12288]; +ld.shared.f32 f1781, [r21+16384]; +ld.shared.f32 f1782, [r21+20480]; +ld.shared.f32 f1783, [r21+24576]; +ld.shared.f32 f1784, [r21+28672]; +ld.shared.f32 f1785, [r21+32768]; +ld.shared.f32 f1786, [r21+36864]; +ld.shared.f32 f1787, [r21+40960]; +ld.shared.f32 f1788, [r21+45056]; +ld.shared.f32 f1789, [r21+49152]; +ld.shared.f32 f1790, [r21+53248]; +ld.shared.f32 f1791, [r21+57344]; +ld.shared.f32 f1792, [r21+61440]; +ld.shared.f32 f1793, [r21+65536]; +ld.shared.f32 f1794, [r21+69632]; +ld.shared.f32 f1795, [r21+73728]; +ld.shared.f32 f1796, [r21+77824]; +ld.shared.f32 f1797, [r21+81920]; +ld.shared.f32 f1798, [r21+86016]; +ld.shared.f32 f1799, [r21+90112]; +ld.shared.f32 f1800, [r21+94208]; +ld.shared.f32 f1801, [r21+98304]; +ld.shared.f32 f1802, [r21+102400]; +ld.shared.f32 f1803, [r21+106496]; +ld.shared.f32 f1804, [r21+110592]; +ld.shared.f32 f1805, [r21+114688]; +ld.shared.f32 f1806, [r21+118784]; +ld.shared.f32 f1807, [r21+122880]; +ld.shared.f32 f1808, [r21+126976]; +barrier.sync 0; +st.shared.f32 [r20], f2553; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 
[r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+4096]; +ld.shared.f32 f1811, [r21+8192]; +ld.shared.f32 f1812, [r21+12288]; +ld.shared.f32 f1813, [r21+16384]; +ld.shared.f32 f1814, [r21+20480]; +ld.shared.f32 f1815, [r21+24576]; +ld.shared.f32 f1816, [r21+28672]; +ld.shared.f32 f1817, [r21+32768]; +ld.shared.f32 f1818, [r21+36864]; +ld.shared.f32 f1819, [r21+40960]; +ld.shared.f32 f1820, [r21+45056]; +ld.shared.f32 f1821, [r21+49152]; +ld.shared.f32 f1822, [r21+53248]; +ld.shared.f32 f1823, [r21+57344]; +ld.shared.f32 f1824, [r21+61440]; +ld.shared.f32 f1825, [r21+65536]; +ld.shared.f32 f1826, [r21+69632]; +ld.shared.f32 f1827, [r21+73728]; +ld.shared.f32 f1828, [r21+77824]; +ld.shared.f32 f1829, [r21+81920]; +ld.shared.f32 f1830, [r21+86016]; +ld.shared.f32 f1831, [r21+90112]; +ld.shared.f32 f1832, [r21+94208]; +ld.shared.f32 f1833, [r21+98304]; +ld.shared.f32 f1834, [r21+102400]; +ld.shared.f32 f1835, [r21+106496]; +ld.shared.f32 f1836, [r21+110592]; +ld.shared.f32 f1837, [r21+114688]; +ld.shared.f32 f1838, [r21+118784]; +ld.shared.f32 f1839, [r21+122880]; +ld.shared.f32 f1840, [r21+126976]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2450, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2449, f1817, f1833; +sub.f32 f1848, f1817, 
f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2448, f2450, f2449; +sub.f32 f1852, f2450, f2449; +add.f32 f1853, f1843, f1848; +sub.f32 f1855, f1843, f1848; +sub.f32 f2447, f1844, f1847; +add.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2446, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2445, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2444, f2446, f2445; +sub.f32 f1868, f2446, f2445; +add.f32 f1869, f1859, f1864; +sub.f32 f1871, f1859, f1864; +sub.f32 f2443, f1860, f1863; +add.f32 f1872, f1860, f1863; +mul.f32 f2441, f1869, 0f3F3504F3; +mul.f32 f2442, f2443, 0fBF3504F3; +sub.f32 f1875, f2441, f2442; +mul.f32 f1876, f2443, 0f3F3504F3; +fma.rn.f32 f1877, f1869, 0fBF3504F3, f1876; +mul.f32 f1878, f1871, 0fBF3504F3; +mul.f32 f1879, f1872, 0fBF3504F3; +sub.f32 f1880, f1878, f1879; +add.f32 f1881, f1878, f1879; +add.f32 f1882, f1849, f1865; +sub.f32 f1884, f1849, f1865; +add.f32 f2440, f2448, f2444; +sub.f32 f1885, f2448, f2444; +add.f32 f1886, f1853, f1875; +sub.f32 f1888, f1853, f1875; +add.f32 f2439, f2447, f1877; +sub.f32 f1889, f2447, f1877; +add.f32 f1890, f1851, f1868; +sub.f32 f1892, f1851, f1868; +sub.f32 f2438, f1852, f1867; +add.f32 f1893, f1852, f1867; +add.f32 f1894, f1855, f1880; +sub.f32 f1896, f1855, f1880; +add.f32 f2437, f1856, f1881; +sub.f32 f1897, f1856, f1881; +add.f32 f1898, f1779, f1795; +sub.f32 f1900, f1779, f1795; +add.f32 f2436, f1811, f1827; +sub.f32 f1901, f1811, f1827; +add.f32 f1902, f1787, f1803; +sub.f32 f1904, f1787, f1803; +add.f32 f2435, f1819, f1835; +sub.f32 f1905, f1819, f1835; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2434, f2436, f2435; +sub.f32 f1909, f2436, f2435; +add.f32 f1910, f1900, f1905; +sub.f32 f1912, f1900, f1905; +sub.f32 f2433, f1901, f1904; +add.f32 f1913, f1901, f1904; 
+add.f32 f1914, f1783, f1799; +sub.f32 f1916, f1783, f1799; +add.f32 f2432, f1815, f1831; +sub.f32 f1917, f1815, f1831; +add.f32 f1918, f1791, f1807; +sub.f32 f1920, f1791, f1807; +add.f32 f2431, f1823, f1839; +sub.f32 f1921, f1823, f1839; +add.f32 f1922, f1914, f1918; +sub.f32 f1924, f1914, f1918; +add.f32 f2430, f2432, f2431; +sub.f32 f1925, f2432, f2431; +add.f32 f1926, f1916, f1921; +sub.f32 f1928, f1916, f1921; +sub.f32 f2429, f1917, f1920; +add.f32 f1929, f1917, f1920; +mul.f32 f1931, f2429, 0fBF3504F3; +mul.f32 f2428, f1926, 0f3F3504F3; +sub.f32 f1932, f2428, f1931; +mul.f32 f1933, f2429, 0f3F3504F3; +fma.rn.f32 f1934, f1926, 0fBF3504F3, f1933; +mul.f32 f1935, f1928, 0fBF3504F3; +mul.f32 f1936, f1929, 0fBF3504F3; +sub.f32 f1937, f1935, f1936; +add.f32 f1938, f1935, f1936; +add.f32 f1939, f1906, f1922; +sub.f32 f1941, f1906, f1922; +add.f32 f2427, f2434, f2430; +sub.f32 f1942, f2434, f2430; +add.f32 f1943, f1910, f1932; +sub.f32 f1945, f1910, f1932; +add.f32 f2426, f2433, f1934; +sub.f32 f1946, f2433, f1934; +add.f32 f1947, f1908, f1925; +sub.f32 f1949, f1908, f1925; +sub.f32 f2425, f1909, f1924; +add.f32 f1950, f1909, f1924; +add.f32 f1951, f1912, f1937; +sub.f32 f1953, f1912, f1937; +add.f32 f2424, f1913, f1938; +sub.f32 f1954, f1913, f1938; +mul.f32 f1956, f2426, 0fBEC3EF15; +mul.f32 f2423, f1943, 0f3F6C835E; +sub.f32 f1957, f2423, f1956; +mul.f32 f1958, f2426, 0f3F6C835E; +fma.rn.f32 f1959, f1943, 0fBEC3EF15, f1958; +mul.f32 f2421, f1947, 0f3F3504F3; +mul.f32 f2422, f2425, 0fBF3504F3; +sub.f32 f1962, f2421, f2422; +mul.f32 f1963, f2425, 0f3F3504F3; +fma.rn.f32 f1964, f1947, 0fBF3504F3, f1963; +mul.f32 f2419, f1951, 0f3EC3EF15; +mul.f32 f2420, f2424, 0fBF6C835E; +sub.f32 f1967, f2419, f2420; +mul.f32 f1968, f2424, 0f3EC3EF15; +fma.rn.f32 f1969, f1951, 0fBF6C835E, f1968; +mul.f32 f2417, f1945, 0fBEC3EF15; +mul.f32 f2418, f1946, 0fBF6C835E; +sub.f32 f1972, f2417, f2418; +mul.f32 f1973, f1946, 0fBEC3EF15; +fma.rn.f32 f1974, f1945, 0fBF6C835E, f1973; +mul.f32 
f1975, f1949, 0fBF3504F3; +mul.f32 f1976, f1950, 0fBF3504F3; +sub.f32 f1977, f1975, f1976; +add.f32 f1978, f1975, f1976; +mul.f32 f2415, f1953, 0fBF6C835E; +mul.f32 f2416, f1954, 0fBEC3EF15; +sub.f32 f1981, f2415, f2416; +mul.f32 f1982, f1954, 0fBF6C835E; +fma.rn.f32 f1983, f1953, 0fBEC3EF15, f1982; +add.f32 f1984, f1882, f1939; +sub.f32 f1986, f1882, f1939; +add.f32 f2414, f2440, f2427; +sub.f32 f1987, f2440, f2427; +add.f32 f1988, f1886, f1957; +sub.f32 f1990, f1886, f1957; +add.f32 f2413, f2439, f1959; +sub.f32 f1991, f2439, f1959; +add.f32 f1992, f1890, f1962; +sub.f32 f1994, f1890, f1962; +add.f32 f2412, f2438, f1964; +sub.f32 f1995, f2438, f1964; +add.f32 f1996, f1894, f1967; +sub.f32 f1998, f1894, f1967; +add.f32 f2411, f2437, f1969; +sub.f32 f1999, f2437, f1969; +add.f32 f2000, f1884, f1942; +sub.f32 f2002, f1884, f1942; +sub.f32 f2410, f1885, f1941; +add.f32 f2003, f1885, f1941; +add.f32 f2004, f1888, f1972; +sub.f32 f2006, f1888, f1972; +add.f32 f2409, f1889, f1974; +sub.f32 f2007, f1889, f1974; +add.f32 f2008, f1892, f1977; +sub.f32 f2010, f1892, f1977; +add.f32 f2408, f1893, f1978; +sub.f32 f2011, f1893, f1978; +add.f32 f2012, f1896, f1981; +sub.f32 f2014, f1896, f1981; +add.f32 f2407, f1897, f1983; +sub.f32 f2015, f1897, f1983; +add.f32 f2016, f1778, f1794; +sub.f32 f2018, f1778, f1794; +add.f32 f2406, f1810, f1826; +sub.f32 f2019, f1810, f1826; +add.f32 f2020, f1786, f1802; +sub.f32 f2022, f1786, f1802; +add.f32 f2405, f1818, f1834; +sub.f32 f2023, f1818, f1834; +add.f32 f2024, f2016, f2020; +sub.f32 f2026, f2016, f2020; +add.f32 f2404, f2406, f2405; +sub.f32 f2027, f2406, f2405; +add.f32 f2028, f2018, f2023; +sub.f32 f2030, f2018, f2023; +sub.f32 f2403, f2019, f2022; +add.f32 f2031, f2019, f2022; +add.f32 f2032, f1782, f1798; +sub.f32 f2034, f1782, f1798; +add.f32 f2402, f1814, f1830; +sub.f32 f2035, f1814, f1830; +add.f32 f2036, f1790, f1806; +sub.f32 f2038, f1790, f1806; +add.f32 f2401, f1822, f1838; +sub.f32 f2039, f1822, f1838; +add.f32 f2040, 
f2032, f2036; +sub.f32 f2042, f2032, f2036; +add.f32 f2400, f2402, f2401; +sub.f32 f2043, f2402, f2401; +add.f32 f2044, f2034, f2039; +sub.f32 f2046, f2034, f2039; +sub.f32 f2399, f2035, f2038; +add.f32 f2047, f2035, f2038; +mul.f32 f2397, f2044, 0f3F3504F3; +mul.f32 f2398, f2399, 0fBF3504F3; +sub.f32 f2050, f2397, f2398; +mul.f32 f2051, f2399, 0f3F3504F3; +fma.rn.f32 f2052, f2044, 0fBF3504F3, f2051; +mul.f32 f2053, f2046, 0fBF3504F3; +mul.f32 f2054, f2047, 0fBF3504F3; +sub.f32 f2055, f2053, f2054; +add.f32 f2056, f2053, f2054; +add.f32 f2057, f2024, f2040; +sub.f32 f2059, f2024, f2040; +add.f32 f2396, f2404, f2400; +sub.f32 f2060, f2404, f2400; +add.f32 f2061, f2028, f2050; +sub.f32 f2063, f2028, f2050; +add.f32 f2395, f2403, f2052; +sub.f32 f2064, f2403, f2052; +add.f32 f2065, f2026, f2043; +sub.f32 f2067, f2026, f2043; +sub.f32 f2394, f2027, f2042; +add.f32 f2068, f2027, f2042; +add.f32 f2069, f2030, f2055; +sub.f32 f2071, f2030, f2055; +add.f32 f2393, f2031, f2056; +sub.f32 f2072, f2031, f2056; +add.f32 f2073, f1780, f1796; +sub.f32 f2075, f1780, f1796; +add.f32 f2392, f1812, f1828; +sub.f32 f2076, f1812, f1828; +add.f32 f2077, f1788, f1804; +sub.f32 f2079, f1788, f1804; +add.f32 f2391, f1820, f1836; +sub.f32 f2080, f1820, f1836; +add.f32 f2081, f2073, f2077; +sub.f32 f2083, f2073, f2077; +add.f32 f2390, f2392, f2391; +sub.f32 f2084, f2392, f2391; +add.f32 f2085, f2075, f2080; +sub.f32 f2087, f2075, f2080; +sub.f32 f2389, f2076, f2079; +add.f32 f2088, f2076, f2079; +add.f32 f2089, f1784, f1800; +sub.f32 f2091, f1784, f1800; +add.f32 f2388, f1816, f1832; +sub.f32 f2092, f1816, f1832; +add.f32 f2093, f1792, f1808; +sub.f32 f2095, f1792, f1808; +add.f32 f2387, f1824, f1840; +sub.f32 f2096, f1824, f1840; +add.f32 f2097, f2089, f2093; +sub.f32 f2099, f2089, f2093; +add.f32 f2386, f2388, f2387; +sub.f32 f2100, f2388, f2387; +add.f32 f2101, f2091, f2096; +sub.f32 f2103, f2091, f2096; +sub.f32 f2385, f2092, f2095; +add.f32 f2104, f2092, f2095; +mul.f32 f2383, f2101, 
0f3F3504F3; +mul.f32 f2384, f2385, 0fBF3504F3; +sub.f32 f2107, f2383, f2384; +mul.f32 f2108, f2385, 0f3F3504F3; +fma.rn.f32 f2109, f2101, 0fBF3504F3, f2108; +mul.f32 f2110, f2103, 0fBF3504F3; +mul.f32 f2111, f2104, 0fBF3504F3; +sub.f32 f2112, f2110, f2111; +add.f32 f2113, f2110, f2111; +add.f32 f2114, f2081, f2097; +sub.f32 f2116, f2081, f2097; +add.f32 f2382, f2390, f2386; +sub.f32 f2117, f2390, f2386; +add.f32 f2118, f2085, f2107; +sub.f32 f2120, f2085, f2107; +add.f32 f2381, f2389, f2109; +sub.f32 f2121, f2389, f2109; +add.f32 f2122, f2083, f2100; +sub.f32 f2124, f2083, f2100; +sub.f32 f2380, f2084, f2099; +add.f32 f2125, f2084, f2099; +add.f32 f2126, f2087, f2112; +sub.f32 f2128, f2087, f2112; +add.f32 f2379, f2088, f2113; +sub.f32 f2129, f2088, f2113; +mul.f32 f2131, f2381, 0fBEC3EF15; +mul.f32 f2378, f2118, 0f3F6C835E; +sub.f32 f2132, f2378, f2131; +mul.f32 f2133, f2381, 0f3F6C835E; +fma.rn.f32 f2134, f2118, 0fBEC3EF15, f2133; +mul.f32 f2136, f2380, 0fBF3504F3; +mul.f32 f2377, f2122, 0f3F3504F3; +sub.f32 f2137, f2377, f2136; +mul.f32 f2138, f2380, 0f3F3504F3; +fma.rn.f32 f2139, f2122, 0fBF3504F3, f2138; +mul.f32 f2375, f2126, 0f3EC3EF15; +mul.f32 f2376, f2379, 0fBF6C835E; +sub.f32 f2142, f2375, f2376; +mul.f32 f2143, f2379, 0f3EC3EF15; +fma.rn.f32 f2144, f2126, 0fBF6C835E, f2143; +mul.f32 f2373, f2120, 0fBEC3EF15; +mul.f32 f2374, f2121, 0fBF6C835E; +sub.f32 f2147, f2373, f2374; +mul.f32 f2148, f2121, 0fBEC3EF15; +fma.rn.f32 f2149, f2120, 0fBF6C835E, f2148; +mul.f32 f2150, f2124, 0fBF3504F3; +mul.f32 f2151, f2125, 0fBF3504F3; +sub.f32 f2152, f2150, f2151; +add.f32 f2153, f2150, f2151; +mul.f32 f2155, f2129, 0fBEC3EF15; +mul.f32 f2372, f2128, 0fBF6C835E; +sub.f32 f2156, f2372, f2155; +mul.f32 f2157, f2129, 0fBF6C835E; +fma.rn.f32 f2158, f2128, 0fBEC3EF15, f2157; +add.f32 f2159, f2057, f2114; +sub.f32 f2161, f2057, f2114; +add.f32 f2371, f2396, f2382; +sub.f32 f2162, f2396, f2382; +add.f32 f2163, f2061, f2132; +sub.f32 f2165, f2061, f2132; +add.f32 f2370, f2395, 
f2134; +sub.f32 f2166, f2395, f2134; +add.f32 f2167, f2065, f2137; +sub.f32 f2169, f2065, f2137; +add.f32 f2369, f2394, f2139; +sub.f32 f2170, f2394, f2139; +add.f32 f2171, f2069, f2142; +sub.f32 f2173, f2069, f2142; +add.f32 f2368, f2393, f2144; +sub.f32 f2174, f2393, f2144; +add.f32 f2175, f2059, f2117; +sub.f32 f2177, f2059, f2117; +sub.f32 f2367, f2060, f2116; +add.f32 f2178, f2060, f2116; +add.f32 f2179, f2063, f2147; +sub.f32 f2181, f2063, f2147; +add.f32 f2366, f2064, f2149; +sub.f32 f2182, f2064, f2149; +add.f32 f2183, f2067, f2152; +sub.f32 f2185, f2067, f2152; +add.f32 f2365, f2068, f2153; +sub.f32 f2186, f2068, f2153; +add.f32 f2187, f2071, f2156; +sub.f32 f2189, f2071, f2156; +add.f32 f2364, f2072, f2158; +sub.f32 f2190, f2072, f2158; +mul.f32 f2192, f2370, 0fBE47C5C2; +mul.f32 f2363, f2163, 0f3F7B14BE; +sub.f32 f2193, f2363, f2192; +mul.f32 f2194, f2370, 0f3F7B14BE; +fma.rn.f32 f2195, f2163, 0fBE47C5C2, f2194; +mul.f32 f2197, f2369, 0fBEC3EF15; +mul.f32 f2362, f2167, 0f3F6C835E; +sub.f32 f2198, f2362, f2197; +mul.f32 f2199, f2369, 0f3F6C835E; +fma.rn.f32 f2200, f2167, 0fBEC3EF15, f2199; +mul.f32 f2202, f2368, 0fBF0E39DA; +mul.f32 f2361, f2171, 0f3F54DB31; +sub.f32 f2203, f2361, f2202; +mul.f32 f2204, f2368, 0f3F54DB31; +fma.rn.f32 f2205, f2171, 0fBF0E39DA, f2204; +mul.f32 f2207, f2367, 0fBF3504F3; +mul.f32 f2360, f2175, 0f3F3504F3; +sub.f32 f2208, f2360, f2207; +mul.f32 f2209, f2367, 0f3F3504F3; +fma.rn.f32 f2210, f2175, 0fBF3504F3, f2209; +mul.f32 f2212, f2366, 0fBF54DB31; +mul.f32 f2359, f2179, 0f3F0E39DA; +sub.f32 f2213, f2359, f2212; +mul.f32 f2214, f2366, 0f3F0E39DA; +fma.rn.f32 f2215, f2179, 0fBF54DB31, f2214; +mul.f32 f2357, f2183, 0f3EC3EF15; +mul.f32 f2358, f2365, 0fBF6C835E; +sub.f32 f2218, f2357, f2358; +mul.f32 f2219, f2365, 0f3EC3EF15; +fma.rn.f32 f2220, f2183, 0fBF6C835E, f2219; +mul.f32 f2355, f2187, 0f3E47C5C2; +mul.f32 f2356, f2364, 0fBF7B14BE; +sub.f32 f2223, f2355, f2356; +mul.f32 f2224, f2364, 0f3E47C5C2; +fma.rn.f32 f2225, f2187, 
0fBF7B14BE, f2224; +mul.f32 f2353, f2165, 0fBE47C5C2; +mul.f32 f2354, f2166, 0fBF7B14BE; +sub.f32 f2228, f2353, f2354; +mul.f32 f2229, f2166, 0fBE47C5C2; +fma.rn.f32 f2230, f2165, 0fBF7B14BE, f2229; +mul.f32 f2232, f2170, 0fBF6C835E; +mul.f32 f2352, f2169, 0fBEC3EF15; +sub.f32 f2233, f2352, f2232; +mul.f32 f2234, f2170, 0fBEC3EF15; +fma.rn.f32 f2235, f2169, 0fBF6C835E, f2234; +mul.f32 f2237, f2174, 0fBF54DB31; +mul.f32 f2351, f2173, 0fBF0E39DA; +sub.f32 f2238, f2351, f2237; +mul.f32 f2239, f2174, 0fBF0E39DA; +fma.rn.f32 f2240, f2173, 0fBF54DB31, f2239; +mul.f32 f2241, f2177, 0fBF3504F3; +mul.f32 f2242, f2178, 0fBF3504F3; +sub.f32 f2243, f2241, f2242; +add.f32 f2244, f2241, f2242; +mul.f32 f2349, f2181, 0fBF54DB31; +mul.f32 f2350, f2182, 0fBF0E39DA; +sub.f32 f2247, f2349, f2350; +mul.f32 f2248, f2182, 0fBF54DB31; +fma.rn.f32 f2249, f2181, 0fBF0E39DA, f2248; +mul.f32 f2251, f2186, 0fBEC3EF15; +mul.f32 f2348, f2185, 0fBF6C835E; +sub.f32 f2252, f2348, f2251; +mul.f32 f2253, f2186, 0fBF6C835E; +fma.rn.f32 f2254, f2185, 0fBEC3EF15, f2253; +mul.f32 f2256, f2190, 0fBE47C5C2; +mul.f32 f2347, f2189, 0fBF7B14BE; +sub.f32 f2257, f2347, f2256; +mul.f32 f2258, f2190, 0fBF7B14BE; +fma.rn.f32 f2259, f2189, 0fBE47C5C2, f2258; +add.f32 %1, f2414, f2371; +add.f32 %0, f1984, f2159; +add.f32 %2, f1988, f2193; +add.f32 %3, f2413, f2195; +add.f32 %5, f2412, f2200; +add.f32 %4, f1992, f2198; +add.f32 %7, f2411, f2205; +add.f32 %6, f1996, f2203; +add.f32 %9, f2410, f2210; +add.f32 %8, f2000, f2208; +add.f32 %10, f2004, f2213; +add.f32 %11, f2409, f2215; +add.f32 %12, f2008, f2218; +add.f32 %13, f2408, f2220; +add.f32 %14, f2012, f2223; +add.f32 %15, f2407, f2225; +sub.f32 %17, f1987, f2161; +add.f32 %16, f1986, f2162; +add.f32 %19, f1991, f2230; +add.f32 %18, f1990, f2228; +add.f32 %21, f1995, f2235; +add.f32 %20, f1994, f2233; +add.f32 %22, f1998, f2238; +add.f32 %23, f1999, f2240; +add.f32 %24, f2002, f2243; +add.f32 %25, f2003, f2244; +add.f32 %26, f2006, f2247; +add.f32 %27, f2007, 
f2249; +add.f32 %28, f2010, f2252; +add.f32 %29, f2011, f2254; +add.f32 %31, f2015, f2259; +add.f32 %30, f2014, f2257; +sub.f32 %32, f1984, f2159; +sub.f32 %33, f2414, f2371; +sub.f32 %35, f2413, f2195; +sub.f32 %34, f1988, f2193; +sub.f32 %37, f2412, f2200; +sub.f32 %36, f1992, f2198; +sub.f32 %39, f2411, f2205; +sub.f32 %38, f1996, f2203; +sub.f32 %41, f2410, f2210; +sub.f32 %40, f2000, f2208; +sub.f32 %43, f2409, f2215; +sub.f32 %42, f2004, f2213; +sub.f32 %45, f2408, f2220; +sub.f32 %44, f2008, f2218; +sub.f32 %47, f2407, f2225; +sub.f32 %46, f2012, f2223; +add.f32 %49, f1987, f2161; +sub.f32 %48, f1986, f2162; +sub.f32 %51, f1991, f2230; +sub.f32 %50, f1990, f2228; +sub.f32 %53, f1995, f2235; +sub.f32 %52, f1994, f2233; +sub.f32 %55, f1999, f2240; +sub.f32 %54, f1998, f2238; +sub.f32 %57, f2003, f2244; +sub.f32 %56, f2002, f2243; +sub.f32 %59, f2007, f2249; +sub.f32 %58, f2006, f2247; +sub.f32 %61, f2011, f2254; +sub.f32 %60, f2010, f2252; +sub.f32 %63, f2015, f2259; +sub.f32 %62, f2014, f2257; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), 
"=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_32768), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..0f0ec725774ba --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32768_fp32_inv.hpp.inc @@ -0,0 +1,2324 @@ +#ifndef CUFFTDX_FFT_32768_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_32768_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1158, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2793>; +.reg .b32 r<30>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2791, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2789, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2788, f2791, f2789; +sub.f32 f140, f2791, f2789; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2787, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2784, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2782, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2781, f2784, f2782; +sub.f32 f156, f2784, f2782; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2780, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2780, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2778, f159, 0fBF3504F3; +mul.f32 f2779, f160, 0f3F3504F3; +sub.f32 f167, f2778, f2779; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2777, f2788, f2781; +sub.f32 f173, f2788, f2781; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2776, f2787, f164; +sub.f32 f177, f2787, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2775, f140, 
f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2774, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2772, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2769, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2768, f2772, f2769; +sub.f32 f197, f2772, f2769; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2767, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2765, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2763, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2762, f2765, f2763; +sub.f32 f213, f2765, f2763; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2761, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2761, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2759, f216, 0fBF3504F3; +mul.f32 f2760, f217, 0f3F3504F3; +sub.f32 f224, f2759, f2760; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2758, f2768, f2762; +sub.f32 f230, f2768, f2762; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2757, f2767, f221; +sub.f32 f234, f2767, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2756, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2755, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2753, f231, 0f3F6C835E; +mul.f32 f2754, f2757, 0f3EC3EF15; +sub.f32 f245, f2753, f2754; +mul.f32 f246, f2757, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2756, 0f3F3504F3; +sub.f32 
f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2755, 0f3F6C835E; +mul.f32 f2752, f239, 0f3EC3EF15; +sub.f32 f254, f2752, f253; +mul.f32 f255, f2755, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2751, f233, 0fBEC3EF15; +sub.f32 f259, f2751, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2749, f237, 0fBF3504F3; +mul.f32 f2750, f238, 0f3F3504F3; +sub.f32 f264, f2749, f2750; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2747, f241, 0fBF6C835E; +mul.f32 f2748, f242, 0f3EC3EF15; +sub.f32 f269, f2747, f2748; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2746, f2777, f2758; +sub.f32 f275, f2777, f2758; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2745, f2776, f247; +sub.f32 f279, f2776, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2744, f2775, f251; +sub.f32 f283, f2775, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2743, f2774, f256; +sub.f32 f287, f2774, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2742, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2741, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2740, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2739, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2736, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2734, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2733, f2736, f2734; +sub.f32 f315, f2736, f2734; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; 
+add.f32 f2732, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2730, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2727, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2726, f2730, f2727; +sub.f32 f331, f2730, f2727; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2725, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2725, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2724, f334, 0fBF3504F3; +sub.f32 f342, f2724, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2723, f2733, f2726; +sub.f32 f348, f2733, f2726; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2722, f2732, f339; +sub.f32 f352, f2732, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2721, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2720, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2718, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2716, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2715, f2718, f2716; +sub.f32 f372, f2718, f2716; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2714, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2711, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2710, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2709, f2711, f2710; +sub.f32 f388, f2711, f2710; +sub.f32 f389, 
f379, f384; +add.f32 f391, f379, f384; +add.f32 f2708, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2708, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2706, f391, 0fBF3504F3; +mul.f32 f2707, f392, 0f3F3504F3; +sub.f32 f399, f2706, f2707; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2705, f2715, f2709; +sub.f32 f405, f2715, f2709; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2704, f2714, f396; +sub.f32 f409, f2714, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2703, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2702, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2704, 0f3EC3EF15; +mul.f32 f2701, f406, 0f3F6C835E; +sub.f32 f420, f2701, f419; +mul.f32 f421, f2704, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2703, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2702, 0f3F6C835E; +mul.f32 f2700, f414, 0f3EC3EF15; +sub.f32 f429, f2700, f428; +mul.f32 f430, f2702, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2699, f408, 0fBEC3EF15; +sub.f32 f434, f2699, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2698, f412, 0fBF3504F3; +sub.f32 f439, f2698, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2697, f416, 0fBF6C835E; +sub.f32 f444, f2697, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2696, f2723, f2705; +sub.f32 f450, f2723, f2705; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2695, f2722, f422; 
+sub.f32 f454, f2722, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2694, f2721, f426; +sub.f32 f458, f2721, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2693, f2720, f431; +sub.f32 f462, f2720, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2692, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2691, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2690, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2689, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2695, 0f3E47C5C2; +mul.f32 f2688, f451, 0f3F7B14BE; +sub.f32 f481, f2688, f480; +mul.f32 f482, f2695, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2694, 0f3EC3EF15; +mul.f32 f2687, f455, 0f3F6C835E; +sub.f32 f486, f2687, f485; +mul.f32 f487, f2694, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2693, 0f3F0E39DA; +mul.f32 f2686, f459, 0f3F54DB31; +sub.f32 f491, f2686, f490; +mul.f32 f492, f2693, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2692, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2691, 0f3F54DB31; +mul.f32 f2685, f467, 0f3F0E39DA; +sub.f32 f500, f2685, f499; +mul.f32 f501, f2691, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2690, 0f3F6C835E; +mul.f32 f2684, f471, 0f3EC3EF15; +sub.f32 f505, f2684, f504; +mul.f32 f506, f2690, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2689, 0f3F7B14BE; +mul.f32 f2683, f475, 0f3E47C5C2; +sub.f32 f510, f2683, f509; +mul.f32 f511, f2689, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2682, f453, 0fBE47C5C2; +sub.f32 f515, f2682, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 
0f3F7B14BE, f516; +mul.f32 f2680, f457, 0fBEC3EF15; +mul.f32 f2681, f458, 0f3F6C835E; +sub.f32 f520, f2680, f2681; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2678, f461, 0fBF0E39DA; +mul.f32 f2679, f462, 0f3F54DB31; +sub.f32 f525, f2678, f2679; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2676, f465, 0fBF3504F3; +mul.f32 f2677, f466, 0f3F3504F3; +sub.f32 f530, f2676, f2677; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2674, f469, 0fBF54DB31; +mul.f32 f2675, f470, 0f3F0E39DA; +sub.f32 f535, f2674, f2675; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2673, f473, 0fBF6C835E; +sub.f32 f540, f2673, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2672, f477, 0fBF7B14BE; +sub.f32 f545, f2672, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2671, f2746, f2696; +sub.f32 f551, f2746, f2696; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2670, f2745, f483; +sub.f32 f555, f2745, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2669, f2744, f488; +sub.f32 f559, f2744, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2668, f2743, f493; +sub.f32 f563, f2743, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2667, f2742, f497; +sub.f32 f567, f2742, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f2666, f2741, f502; +sub.f32 f571, f2741, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f2665, f2740, f507; +sub.f32 f575, f2740, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f2664, f2739, f512; +sub.f32 f579, f2739, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 
f2663, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f2662, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f2661, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f2660, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f2659, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2658, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2657, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2656, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f2670, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f2670; +sub.f32 f620, f619, f618; +mul.f32 f2654, f612, f612; +mul.f32 f2655, f613, f613; +sub.f32 f623, f2654, f2655; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f2669, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f2669; +sub.f32 f630, f629, f628; +mul.f32 f2652, f612, f623; +mul.f32 f2653, f613, f625; +sub.f32 f633, f2652, f2653; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f2668, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f2668; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f2651, f612, f633; +sub.f32 f643, f2651, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f2667, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f2667; +sub.f32 f650, 
f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f2650, f612, f643; +sub.f32 f653, f2650, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f2666, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f2666; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f2649, f612, f653; +sub.f32 f663, f2649, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f2665, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f2665; +sub.f32 f670, f669, f668; +mul.f32 f2647, f612, f663; +mul.f32 f2648, f613, f665; +sub.f32 f673, f2647, f2648; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f2664, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f2664; +sub.f32 f680, f679, f678; +mul.f32 f2645, f612, f673; +mul.f32 f2646, f613, f675; +sub.f32 f683, f2645, f2646; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f2663, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f2663; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f2644, f612, f683; +sub.f32 f693, f2644, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f2662, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f2662; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f2643, f612, f693; +sub.f32 f703, f2643, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f2661, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f2661; +sub.f32 f710, f709, f708; +mul.f32 f2641, f612, f703; +mul.f32 f2642, f613, f705; +sub.f32 f713, f2641, f2642; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f2660, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, 
f592, f715; +mul.f32 f719, f713, f2660; +sub.f32 f720, f719, f718; +mul.f32 f2639, f612, f713; +mul.f32 f2640, f613, f715; +sub.f32 f723, f2639, f2640; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f2659, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f2659; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f2638, f612, f723; +sub.f32 f733, f2638, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f2658, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f2658; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f2637, f612, f733; +sub.f32 f743, f2637, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f2657, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f2657; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f2636, f612, f743; +sub.f32 f753, f2636, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f2656, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f2656; +sub.f32 f760, f759, f758; +mul.f32 f2634, f612, f753; +mul.f32 f2635, f613, f755; +sub.f32 f763, f2634, f2635; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f2632, f612, f763; +mul.f32 f2633, f613, f765; +sub.f32 f773, f2632, f2633; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f2631, f612, f773; +sub.f32 f783, f2631, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; 
+fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f2630, f612, f783; +sub.f32 f793, f2630, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f2629, f612, f793; +sub.f32 f803, f2629, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f2627, f612, f803; +mul.f32 f2628, f613, f805; +sub.f32 f813, f2627, f2628; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f2625, f612, f813; +mul.f32 f2626, f613, f815; +sub.f32 f823, f2625, f2626; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f2624, f612, f823; +sub.f32 f833, f2624, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f2623, f612, f833; +sub.f32 f843, f2623, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f2621, f612, f843; +mul.f32 f2622, f613, f845; +sub.f32 f853, f2621, f2622; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, 
f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f2619, f612, f853; +mul.f32 f2620, f613, f855; +sub.f32 f863, f2619, f2620; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f2618, f612, f863; +sub.f32 f873, f2618, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f2617, f612, f873; +sub.f32 f883, f2617, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f2616, f612, f883; +sub.f32 f893, f2616, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f2614, f612, f893; +mul.f32 f2615, f613, f895; +sub.f32 f903, f2614, f2615; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f2612, f612, f903; +mul.f32 f2613, f613, f905; +sub.f32 f913, f2612, f2613; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +shl.b32 r8, r24, 7; +and.b32 r9, r8, -131072; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 
r11, r8, 130944; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +and.b32 r23, r24, 1023; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+4096]; +ld.shared.f32 f923, [r13+8192]; +ld.shared.f32 f924, [r13+12288]; +ld.shared.f32 f925, [r13+16384]; +ld.shared.f32 f926, [r13+20480]; +ld.shared.f32 f927, [r13+24576]; +ld.shared.f32 f928, [r13+28672]; +ld.shared.f32 f929, [r13+32768]; +ld.shared.f32 f930, [r13+36864]; +ld.shared.f32 f931, [r13+40960]; +ld.shared.f32 f932, [r13+45056]; +ld.shared.f32 f933, [r13+49152]; +ld.shared.f32 f934, [r13+53248]; +ld.shared.f32 f935, [r13+57344]; +ld.shared.f32 f936, [r13+61440]; +ld.shared.f32 f937, [r13+65536]; +ld.shared.f32 f938, [r13+69632]; +ld.shared.f32 f939, [r13+73728]; +ld.shared.f32 f940, [r13+77824]; +ld.shared.f32 f941, [r13+81920]; +ld.shared.f32 f942, [r13+86016]; +ld.shared.f32 f943, [r13+90112]; +ld.shared.f32 f944, [r13+94208]; +ld.shared.f32 f945, [r13+98304]; +ld.shared.f32 f946, [r13+102400]; +ld.shared.f32 f947, [r13+106496]; +ld.shared.f32 f948, [r13+110592]; +ld.shared.f32 f949, [r13+114688]; +ld.shared.f32 f950, [r13+118784]; +ld.shared.f32 f951, [r13+122880]; +ld.shared.f32 f952, [r13+126976]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2671, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], 
{f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+4096]; +ld.shared.f32 f955, [r13+8192]; +ld.shared.f32 f956, [r13+12288]; +ld.shared.f32 f957, [r13+16384]; +ld.shared.f32 f958, [r13+20480]; +ld.shared.f32 f959, [r13+24576]; +ld.shared.f32 f960, [r13+28672]; +ld.shared.f32 f961, [r13+32768]; +ld.shared.f32 f962, [r13+36864]; +ld.shared.f32 f963, [r13+40960]; +ld.shared.f32 f964, [r13+45056]; +ld.shared.f32 f965, [r13+49152]; +ld.shared.f32 f966, [r13+53248]; +ld.shared.f32 f967, [r13+57344]; +ld.shared.f32 f968, [r13+61440]; +ld.shared.f32 f969, [r13+65536]; +ld.shared.f32 f970, [r13+69632]; +ld.shared.f32 f971, [r13+73728]; +ld.shared.f32 f972, [r13+77824]; +ld.shared.f32 f973, [r13+81920]; +ld.shared.f32 f974, [r13+86016]; +ld.shared.f32 f975, [r13+90112]; +ld.shared.f32 f976, [r13+94208]; +ld.shared.f32 f977, [r13+98304]; +ld.shared.f32 f978, [r13+102400]; +ld.shared.f32 f979, [r13+106496]; +ld.shared.f32 f980, [r13+110592]; +ld.shared.f32 f981, [r13+114688]; +ld.shared.f32 f982, [r13+118784]; +ld.shared.f32 f983, [r13+122880]; +ld.shared.f32 f984, [r13+126976]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2611, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2610, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2609, f2611, f2610; +sub.f32 f996, f2611, f2610; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f2608, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2607, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2606, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2605, f2607, f2606; +sub.f32 f1012, f2607, f2606; +sub.f32 f1013, f1003, f1008; 
+add.f32 f1015, f1003, f1008; +add.f32 f2604, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f2604, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f2602, f1015, 0fBF3504F3; +mul.f32 f2603, f1016, 0f3F3504F3; +sub.f32 f1023, f2602, f2603; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2601, f2609, f2605; +sub.f32 f1029, f2609, f2605; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2600, f2608, f1020; +sub.f32 f1033, f2608, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f2599, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f2598, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2597, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2596, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2595, f2597, f2596; +sub.f32 f1053, f2597, f2596; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f2594, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2593, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2592, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2591, f2593, f2592; +sub.f32 f1069, f2593, f2592; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f2590, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f2590, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f2588, f1072, 0fBF3504F3; +mul.f32 f2589, f1073, 0f3F3504F3; 
+sub.f32 f1080, f2588, f2589; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2587, f2595, f2591; +sub.f32 f1086, f2595, f2591; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2586, f2594, f1077; +sub.f32 f1090, f2594, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f2585, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f2584, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2582, f1087, 0f3F6C835E; +mul.f32 f2583, f2586, 0f3EC3EF15; +sub.f32 f1101, f2582, f2583; +mul.f32 f1102, f2586, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f2585, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f2580, f1095, 0f3EC3EF15; +mul.f32 f2581, f2584, 0f3F6C835E; +sub.f32 f1110, f2580, f2581; +mul.f32 f1111, f2584, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f2578, f1089, 0fBEC3EF15; +mul.f32 f2579, f1090, 0f3F6C835E; +sub.f32 f1115, f2578, f2579; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f2576, f1093, 0fBF3504F3; +mul.f32 f2577, f1094, 0f3F3504F3; +sub.f32 f1120, f2576, f2577; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f2574, f1097, 0fBF6C835E; +mul.f32 f2575, f1098, 0f3EC3EF15; +sub.f32 f1125, f2574, f2575; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2573, f2601, f2587; +sub.f32 f1131, f2601, f2587; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2572, f2600, f1103; +sub.f32 f1135, f2600, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2571, f2599, f1107; +sub.f32 f1139, f2599, f1107; +add.f32 f1140, 
f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f2570, f2598, f1112; +sub.f32 f1143, f2598, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f2569, f1029, f1085; +sub.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f2568, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 f1154, f1036, f1120; +add.f32 f2567, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2566, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2565, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2564, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2563, f2565, f2564; +sub.f32 f1171, f2565, f2564; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f2562, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2561, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2560, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2559, f2561, f2560; +sub.f32 f1187, f2561, f2560; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f2558, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f2558, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f2556, f1190, 0fBF3504F3; +mul.f32 f2557, f1191, 0f3F3504F3; +sub.f32 f1198, f2556, f2557; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2555, f2563, f2559; +sub.f32 f1204, f2563, f2559; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, 
f1194; +add.f32 f2554, f2562, f1195; +sub.f32 f1208, f2562, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f2553, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; +sub.f32 f1215, f1174, f1198; +add.f32 f2552, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2551, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2550, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2549, f2551, f2550; +sub.f32 f1228, f2551, f2550; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f2548, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2547, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2546, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2545, f2547, f2546; +sub.f32 f1244, f2547, f2546; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f2544, f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f2544, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f2542, f1247, 0fBF3504F3; +mul.f32 f2543, f1248, 0f3F3504F3; +sub.f32 f1255, f2542, f2543; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2541, f2549, f2545; +sub.f32 f1261, f2549, f2545; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2540, f2548, f1252; +sub.f32 f1265, f2548, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f2539, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f2538, f1232, f1257; 
+sub.f32 f1273, f1232, f1257; +mul.f32 f2536, f1262, 0f3F6C835E; +mul.f32 f2537, f2540, 0f3EC3EF15; +sub.f32 f1276, f2536, f2537; +mul.f32 f1277, f2540, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f2539, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f2538, 0f3F6C835E; +mul.f32 f2535, f1270, 0f3EC3EF15; +sub.f32 f1285, f2535, f1284; +mul.f32 f1286, f2538, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f2534, f1264, 0fBEC3EF15; +sub.f32 f1290, f2534, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f2532, f1268, 0fBF3504F3; +mul.f32 f2533, f1269, 0f3F3504F3; +sub.f32 f1295, f2532, f2533; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296; +mul.f32 f2530, f1272, 0fBF6C835E; +mul.f32 f2531, f1273, 0f3EC3EF15; +sub.f32 f1300, f2530, f2531; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2529, f2555, f2541; +sub.f32 f1306, f2555, f2541; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2528, f2554, f1278; +sub.f32 f1310, f2554, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2527, f2553, f1282; +sub.f32 f1314, f2553, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f2526, f2552, f1287; +sub.f32 f1318, f2552, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f2525, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f2524, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f2523, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2522, f1216, f1302; +sub.f32 
f1334, f1216, f1302; +mul.f32 f1336, f2528, 0f3E47C5C2; +mul.f32 f2521, f1307, 0f3F7B14BE; +sub.f32 f1337, f2521, f1336; +mul.f32 f1338, f2528, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, f2527, 0f3EC3EF15; +mul.f32 f2520, f1311, 0f3F6C835E; +sub.f32 f1342, f2520, f1341; +mul.f32 f1343, f2527, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; +mul.f32 f2518, f1315, 0f3F54DB31; +mul.f32 f2519, f2526, 0f3F0E39DA; +sub.f32 f1347, f2518, f2519; +mul.f32 f1348, f2526, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, f1319, 0f3F3504F3; +mul.f32 f1351, f2525, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f2524, 0f3F54DB31; +mul.f32 f2517, f1323, 0f3F0E39DA; +sub.f32 f1356, f2517, f1355; +mul.f32 f1357, f2524, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357; +mul.f32 f1360, f2523, 0f3F6C835E; +mul.f32 f2516, f1327, 0f3EC3EF15; +sub.f32 f1361, f2516, f1360; +mul.f32 f1362, f2523, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f2514, f1331, 0f3E47C5C2; +mul.f32 f2515, f2522, 0f3F7B14BE; +sub.f32 f1366, f2514, f2515; +mul.f32 f1367, f2522, 0f3E47C5C2; +fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f2512, f1309, 0fBE47C5C2; +mul.f32 f2513, f1310, 0f3F7B14BE; +sub.f32 f1371, f2512, f2513; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f2510, f1313, 0fBEC3EF15; +mul.f32 f2511, f1314, 0f3F6C835E; +sub.f32 f1376, f2510, f2511; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f2508, f1317, 0fBF0E39DA; +mul.f32 f2509, f1318, 0f3F54DB31; +sub.f32 f1381, f2508, f2509; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f2507, f1321, 0fBF3504F3; +sub.f32 f1386, f2507, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 
f2506, f1325, 0fBF54DB31; +sub.f32 f1391, f2506, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f2505, f1329, 0fBF6C835E; +sub.f32 f1396, f2505, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f2504, f1333, 0fBF7B14BE; +sub.f32 f1401, f2504, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2503, f2573, f2529; +sub.f32 f1407, f2573, f2529; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2502, f2572, f1339; +sub.f32 f1411, f2572, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2501, f2571, f1344; +sub.f32 f1415, f2571, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2500, f2570, f1349; +sub.f32 f1419, f2570, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2499, f2569, f1353; +sub.f32 f1423, f2569, f1353; +add.f32 f1424, f1148, f1356; +sub.f32 f1426, f1148, f1356; +add.f32 f2498, f2568, f1358; +sub.f32 f1427, f2568, f1358; +add.f32 f1428, f1152, f1361; +sub.f32 f1430, f1152, f1361; +add.f32 f2497, f2567, f1363; +sub.f32 f1431, f2567, f1363; +add.f32 f1432, f1156, f1366; +sub.f32 f1434, f1156, f1366; +add.f32 f2496, f2566, f1368; +sub.f32 f1435, f2566, f1368; +sub.f32 f1436, f1130, f1306; +add.f32 f1438, f1130, f1306; +add.f32 f2495, f1131, f1305; +sub.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1371; +sub.f32 f1442, f1134, f1371; +add.f32 f2494, f1135, f1373; +sub.f32 f1443, f1135, f1373; +add.f32 f1444, f1138, f1376; +sub.f32 f1446, f1138, f1376; +add.f32 f2493, f1139, f1378; +sub.f32 f1447, f1139, f1378; +add.f32 f1448, f1142, f1381; +sub.f32 f1450, f1142, f1381; +add.f32 f2492, f1143, f1383; +sub.f32 f1451, f1143, f1383; +add.f32 f1452, f1146, f1386; +sub.f32 f1454, f1146, f1386; 
+add.f32 f2491, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2490, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2489, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2488, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r24, 5, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1472, f2502, f1469; +fma.rn.f32 f1473, f1468, f1408, f1472; +mul.f32 f1474, f1408, f1469; +mul.f32 f1475, f1468, f2502; +sub.f32 f1476, f1475, f1474; +mul.f32 f1478, f1469, f1469; +mul.f32 f2487, f1468, f1468; +sub.f32 f1479, f2487, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1482, f2501, f1481; +fma.rn.f32 f1483, f1479, f1412, f1482; +mul.f32 f1484, f1412, f1481; +mul.f32 f1485, f1479, f2501; +sub.f32 f1486, f1485, f1484; +mul.f32 f2485, f1468, f1479; +mul.f32 f2486, f1469, f1481; +sub.f32 f1489, f2485, f2486; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f1492, f2500, f1491; +fma.rn.f32 f1493, f1489, f1416, f1492; +mul.f32 f1494, f1416, f1491; +mul.f32 f1495, f1489, f2500; +sub.f32 f1496, f1495, f1494; +mul.f32 f2483, f1468, f1489; +mul.f32 f2484, f1469, f1491; +sub.f32 f1499, f2483, f2484; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f1502, f2499, f1501; +fma.rn.f32 f1503, f1499, f1420, f1502; +mul.f32 f1504, f1420, f1501; +mul.f32 f1505, f1499, f2499; +sub.f32 f1506, f1505, f1504; +mul.f32 f1508, f1469, f1501; +mul.f32 f2482, f1468, f1499; +sub.f32 f1509, f2482, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1512, f2498, f1511; +fma.rn.f32 f1513, f1509, f1424, f1512; +mul.f32 f1514, f1424, f1511; +mul.f32 f1515, f1509, f2498; +sub.f32 f1516, f1515, f1514; 
+mul.f32 f1518, f1469, f1511; +mul.f32 f2481, f1468, f1509; +sub.f32 f1519, f2481, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1522, f2497, f1521; +fma.rn.f32 f1523, f1519, f1428, f1522; +mul.f32 f1524, f1428, f1521; +mul.f32 f1525, f1519, f2497; +sub.f32 f1526, f1525, f1524; +mul.f32 f1528, f1469, f1521; +mul.f32 f2480, f1468, f1519; +sub.f32 f1529, f2480, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1532, f2496, f1531; +fma.rn.f32 f1533, f1529, f1432, f1532; +mul.f32 f1534, f1432, f1531; +mul.f32 f1535, f1529, f2496; +sub.f32 f1536, f1535, f1534; +mul.f32 f2478, f1468, f1529; +mul.f32 f2479, f1469, f1531; +sub.f32 f1539, f2478, f2479; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1542, f2495, f1541; +fma.rn.f32 f1543, f1539, f1436, f1542; +mul.f32 f1544, f1436, f1541; +mul.f32 f1545, f1539, f2495; +sub.f32 f1546, f1545, f1544; +mul.f32 f2476, f1468, f1539; +mul.f32 f2477, f1469, f1541; +sub.f32 f1549, f2476, f2477; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1552, f2494, f1551; +fma.rn.f32 f1553, f1549, f1440, f1552; +mul.f32 f1554, f1440, f1551; +mul.f32 f1555, f1549, f2494; +sub.f32 f1556, f1555, f1554; +mul.f32 f1558, f1469, f1551; +mul.f32 f2475, f1468, f1549; +sub.f32 f1559, f2475, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1562, f2493, f1561; +fma.rn.f32 f1563, f1559, f1444, f1562; +mul.f32 f1564, f1444, f1561; +mul.f32 f1565, f1559, f2493; +sub.f32 f1566, f1565, f1564; +mul.f32 f1568, f1469, f1561; +mul.f32 f2474, f1468, f1559; +sub.f32 f1569, f2474, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1572, f2492, f1571; +fma.rn.f32 f1573, f1569, f1448, f1572; +mul.f32 f1574, f1448, f1571; +mul.f32 f1575, f1569, f2492; +sub.f32 f1576, f1575, f1574; +mul.f32 f1578, f1469, f1571; +mul.f32 f2473, f1468, f1569; +sub.f32 f1579, 
f2473, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1582, f2491, f1581; +fma.rn.f32 f1583, f1579, f1452, f1582; +mul.f32 f1584, f1452, f1581; +mul.f32 f1585, f1579, f2491; +sub.f32 f1586, f1585, f1584; +mul.f32 f2471, f1468, f1579; +mul.f32 f2472, f1469, f1581; +sub.f32 f1589, f2471, f2472; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f1592, f2490, f1591; +fma.rn.f32 f1593, f1589, f1456, f1592; +mul.f32 f1594, f1456, f1591; +mul.f32 f1595, f1589, f2490; +sub.f32 f1596, f1595, f1594; +mul.f32 f1598, f1469, f1591; +mul.f32 f2470, f1468, f1589; +sub.f32 f1599, f2470, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1602, f2489, f1601; +fma.rn.f32 f1603, f1599, f1460, f1602; +mul.f32 f1604, f1460, f1601; +mul.f32 f1605, f1599, f2489; +sub.f32 f1606, f1605, f1604; +mul.f32 f1608, f1469, f1601; +mul.f32 f2469, f1468, f1599; +sub.f32 f1609, f2469, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1612, f2488, f1611; +fma.rn.f32 f1613, f1609, f1464, f1612; +mul.f32 f1614, f1464, f1611; +mul.f32 f1615, f1609, f2488; +sub.f32 f1616, f1615, f1614; +mul.f32 f1618, f1469, f1611; +mul.f32 f2468, f1468, f1609; +sub.f32 f1619, f2468, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1622, f1407, f1621; +fma.rn.f32 f1623, f1619, f1406, f1622; +mul.f32 f1624, f1406, f1621; +mul.f32 f1625, f1619, f1407; +sub.f32 f1626, f1625, f1624; +mul.f32 f2466, f1468, f1619; +mul.f32 f2467, f1469, f1621; +sub.f32 f1629, f2466, f2467; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1632, f1411, f1631; +fma.rn.f32 f1633, f1629, f1410, f1632; +mul.f32 f1634, f1410, f1631; +mul.f32 f1635, f1629, f1411; +sub.f32 f1636, f1635, f1634; +mul.f32 f2464, f1468, f1629; +mul.f32 f2465, f1469, f1631; +sub.f32 f1639, f2464, f2465; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, 
f1640; +mul.f32 f1642, f1415, f1641; +fma.rn.f32 f1643, f1639, f1414, f1642; +mul.f32 f1644, f1414, f1641; +mul.f32 f1645, f1639, f1415; +sub.f32 f1646, f1645, f1644; +mul.f32 f1648, f1469, f1641; +mul.f32 f2463, f1468, f1639; +sub.f32 f1649, f2463, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1652, f1419, f1651; +fma.rn.f32 f1653, f1649, f1418, f1652; +mul.f32 f1654, f1418, f1651; +mul.f32 f1655, f1649, f1419; +sub.f32 f1656, f1655, f1654; +mul.f32 f1658, f1469, f1651; +mul.f32 f2462, f1468, f1649; +sub.f32 f1659, f2462, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1662, f1423, f1661; +fma.rn.f32 f1663, f1659, f1422, f1662; +mul.f32 f1664, f1422, f1661; +mul.f32 f1665, f1659, f1423; +sub.f32 f1666, f1665, f1664; +mul.f32 f1668, f1469, f1661; +mul.f32 f2461, f1468, f1659; +sub.f32 f1669, f2461, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1672, f1427, f1671; +fma.rn.f32 f1673, f1669, f1426, f1672; +mul.f32 f1674, f1426, f1671; +mul.f32 f1675, f1669, f1427; +sub.f32 f1676, f1675, f1674; +mul.f32 f2459, f1468, f1669; +mul.f32 f2460, f1469, f1671; +sub.f32 f1679, f2459, f2460; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1682, f1431, f1681; +fma.rn.f32 f1683, f1679, f1430, f1682; +mul.f32 f1684, f1430, f1681; +mul.f32 f1685, f1679, f1431; +sub.f32 f1686, f1685, f1684; +mul.f32 f2457, f1468, f1679; +mul.f32 f2458, f1469, f1681; +sub.f32 f1689, f2457, f2458; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1692, f1435, f1691; +fma.rn.f32 f1693, f1689, f1434, f1692; +mul.f32 f1694, f1434, f1691; +mul.f32 f1695, f1689, f1435; +sub.f32 f1696, f1695, f1694; +mul.f32 f1698, f1469, f1691; +mul.f32 f2456, f1468, f1689; +sub.f32 f1699, f2456, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1702, f1439, f1701; +fma.rn.f32 f1703, f1699, f1438, f1702; 
+mul.f32 f1704, f1438, f1701; +mul.f32 f1705, f1699, f1439; +sub.f32 f1706, f1705, f1704; +mul.f32 f1708, f1469, f1701; +mul.f32 f2455, f1468, f1699; +sub.f32 f1709, f2455, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1712, f1443, f1711; +fma.rn.f32 f1713, f1709, f1442, f1712; +mul.f32 f1714, f1442, f1711; +mul.f32 f1715, f1709, f1443; +sub.f32 f1716, f1715, f1714; +mul.f32 f2453, f1468, f1709; +mul.f32 f2454, f1469, f1711; +sub.f32 f1719, f2453, f2454; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f1722, f1447, f1721; +fma.rn.f32 f1723, f1719, f1446, f1722; +mul.f32 f1724, f1446, f1721; +mul.f32 f1725, f1719, f1447; +sub.f32 f1726, f1725, f1724; +mul.f32 f2451, f1468, f1719; +mul.f32 f2452, f1469, f1721; +sub.f32 f1729, f2451, f2452; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1732, f1451, f1731; +fma.rn.f32 f1733, f1729, f1450, f1732; +mul.f32 f1734, f1450, f1731; +mul.f32 f1735, f1729, f1451; +sub.f32 f1736, f1735, f1734; +mul.f32 f1738, f1469, f1731; +mul.f32 f2450, f1468, f1729; +sub.f32 f1739, f2450, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1742, f1455, f1741; +fma.rn.f32 f1743, f1739, f1454, f1742; +mul.f32 f1744, f1454, f1741; +mul.f32 f1745, f1739, f1455; +sub.f32 f1746, f1745, f1744; +mul.f32 f1748, f1469, f1741; +mul.f32 f2449, f1468, f1739; +sub.f32 f1749, f2449, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1752, f1459, f1751; +fma.rn.f32 f1753, f1749, f1458, f1752; +mul.f32 f1754, f1458, f1751; +mul.f32 f1755, f1749, f1459; +sub.f32 f1756, f1755, f1754; +mul.f32 f1758, f1469, f1751; +mul.f32 f2448, f1468, f1749; +sub.f32 f1759, f2448, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1762, f1463, f1761; +fma.rn.f32 f1763, f1759, f1462, f1762; +mul.f32 f1764, f1462, f1761; +mul.f32 f1765, f1759, f1463; +sub.f32 f1766, 
f1765, f1764; +mul.f32 f2446, f1468, f1759; +mul.f32 f2447, f1469, f1761; +sub.f32 f1769, f2446, f2447; +mul.f32 f1770, f1468, f1761; +fma.rn.f32 f1771, f1469, f1759, f1770; +mov.u32 r29, %tid.x; +shl.b32 r28, r29, 7; +mul.f32 f1772, f1467, f1771; +fma.rn.f32 f1773, f1769, f1466, f1772; +mul.f32 f1774, f1466, f1771; +mul.f32 f1775, f1769, f1467; +sub.f32 f1776, f1775, f1774; +and.b32 r22, r29, 992; +shl.b32 r16, r29, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r28, 126976; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1473; +st.shared.f32 [r20+256], f1483; +st.shared.f32 [r20+384], f1493; +st.shared.f32 [r20+512], f1503; +st.shared.f32 [r20+640], f1513; +st.shared.f32 [r20+768], f1523; +st.shared.f32 [r20+896], f1533; +st.shared.f32 [r20+1024], f1543; +st.shared.f32 [r20+1152], f1553; +st.shared.f32 [r20+1280], f1563; +st.shared.f32 [r20+1408], f1573; +st.shared.f32 [r20+1536], f1583; +st.shared.f32 [r20+1664], f1593; +st.shared.f32 [r20+1792], f1603; +st.shared.f32 [r20+1920], f1613; +st.shared.f32 [r20+2048], f1623; +st.shared.f32 [r20+2176], f1633; +st.shared.f32 [r20+2304], f1643; +st.shared.f32 [r20+2432], f1653; +st.shared.f32 [r20+2560], f1663; +st.shared.f32 [r20+2688], f1673; +st.shared.f32 [r20+2816], f1683; +st.shared.f32 [r20+2944], f1693; +st.shared.f32 [r20+3072], f1703; +st.shared.f32 [r20+3200], f1713; +st.shared.f32 [r20+3328], f1723; +st.shared.f32 [r20+3456], f1733; +st.shared.f32 [r20+3584], f1743; +st.shared.f32 [r20+3712], f1753; +st.shared.f32 [r20+3840], f1763; +st.shared.f32 [r20+3968], f1773; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+4096]; +ld.shared.f32 f1779, [r21+8192]; +ld.shared.f32 f1780, [r21+12288]; +ld.shared.f32 f1781, [r21+16384]; +ld.shared.f32 f1782, [r21+20480]; +ld.shared.f32 f1783, [r21+24576]; +ld.shared.f32 f1784, [r21+28672]; +ld.shared.f32 f1785, [r21+32768]; +ld.shared.f32 f1786, 
[r21+36864]; +ld.shared.f32 f1787, [r21+40960]; +ld.shared.f32 f1788, [r21+45056]; +ld.shared.f32 f1789, [r21+49152]; +ld.shared.f32 f1790, [r21+53248]; +ld.shared.f32 f1791, [r21+57344]; +ld.shared.f32 f1792, [r21+61440]; +ld.shared.f32 f1793, [r21+65536]; +ld.shared.f32 f1794, [r21+69632]; +ld.shared.f32 f1795, [r21+73728]; +ld.shared.f32 f1796, [r21+77824]; +ld.shared.f32 f1797, [r21+81920]; +ld.shared.f32 f1798, [r21+86016]; +ld.shared.f32 f1799, [r21+90112]; +ld.shared.f32 f1800, [r21+94208]; +ld.shared.f32 f1801, [r21+98304]; +ld.shared.f32 f1802, [r21+102400]; +ld.shared.f32 f1803, [r21+106496]; +ld.shared.f32 f1804, [r21+110592]; +ld.shared.f32 f1805, [r21+114688]; +ld.shared.f32 f1806, [r21+118784]; +ld.shared.f32 f1807, [r21+122880]; +ld.shared.f32 f1808, [r21+126976]; +barrier.sync 0; +st.shared.f32 [r20], f2503; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+4096]; +ld.shared.f32 f1811, [r21+8192]; 
+ld.shared.f32 f1812, [r21+12288]; +ld.shared.f32 f1813, [r21+16384]; +ld.shared.f32 f1814, [r21+20480]; +ld.shared.f32 f1815, [r21+24576]; +ld.shared.f32 f1816, [r21+28672]; +ld.shared.f32 f1817, [r21+32768]; +ld.shared.f32 f1818, [r21+36864]; +ld.shared.f32 f1819, [r21+40960]; +ld.shared.f32 f1820, [r21+45056]; +ld.shared.f32 f1821, [r21+49152]; +ld.shared.f32 f1822, [r21+53248]; +ld.shared.f32 f1823, [r21+57344]; +ld.shared.f32 f1824, [r21+61440]; +ld.shared.f32 f1825, [r21+65536]; +ld.shared.f32 f1826, [r21+69632]; +ld.shared.f32 f1827, [r21+73728]; +ld.shared.f32 f1828, [r21+77824]; +ld.shared.f32 f1829, [r21+81920]; +ld.shared.f32 f1830, [r21+86016]; +ld.shared.f32 f1831, [r21+90112]; +ld.shared.f32 f1832, [r21+94208]; +ld.shared.f32 f1833, [r21+98304]; +ld.shared.f32 f1834, [r21+102400]; +ld.shared.f32 f1835, [r21+106496]; +ld.shared.f32 f1836, [r21+110592]; +ld.shared.f32 f1837, [r21+114688]; +ld.shared.f32 f1838, [r21+118784]; +ld.shared.f32 f1839, [r21+122880]; +ld.shared.f32 f1840, [r21+126976]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2445, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2444, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2443, f2445, f2444; +sub.f32 f1852, f2445, f2444; +sub.f32 f1853, f1843, f1848; +add.f32 f1855, f1843, f1848; +add.f32 f2442, f1844, f1847; +sub.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2441, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2440, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2439, f2441, f2440; +sub.f32 f1868, f2441, f2440; +sub.f32 f1869, f1859, f1864; +add.f32 f1871, f1859, f1864; +add.f32 f2438, f1860, f1863; +sub.f32 f1872, f1860, f1863; +mul.f32 f1873, 
f1869, 0f3F3504F3; +mul.f32 f1874, f2438, 0f3F3504F3; +sub.f32 f1875, f1873, f1874; +add.f32 f1876, f1873, f1874; +mul.f32 f2436, f1871, 0fBF3504F3; +mul.f32 f2437, f1872, 0f3F3504F3; +sub.f32 f1879, f2436, f2437; +mul.f32 f1880, f1872, 0fBF3504F3; +fma.rn.f32 f1881, f1871, 0f3F3504F3, f1880; +add.f32 f1882, f1849, f1865; +sub.f32 f1884, f1849, f1865; +add.f32 f2435, f2443, f2439; +sub.f32 f1885, f2443, f2439; +add.f32 f1886, f1853, f1875; +sub.f32 f1888, f1853, f1875; +add.f32 f2434, f2442, f1876; +sub.f32 f1889, f2442, f1876; +sub.f32 f1890, f1851, f1868; +add.f32 f1892, f1851, f1868; +add.f32 f2433, f1852, f1867; +sub.f32 f1893, f1852, f1867; +add.f32 f1894, f1855, f1879; +sub.f32 f1896, f1855, f1879; +add.f32 f2432, f1856, f1881; +sub.f32 f1897, f1856, f1881; +add.f32 f1898, f1779, f1795; +sub.f32 f1900, f1779, f1795; +add.f32 f2431, f1811, f1827; +sub.f32 f1901, f1811, f1827; +add.f32 f1902, f1787, f1803; +sub.f32 f1904, f1787, f1803; +add.f32 f2430, f1819, f1835; +sub.f32 f1905, f1819, f1835; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2429, f2431, f2430; +sub.f32 f1909, f2431, f2430; +sub.f32 f1910, f1900, f1905; +add.f32 f1912, f1900, f1905; +add.f32 f2428, f1901, f1904; +sub.f32 f1913, f1901, f1904; +add.f32 f1914, f1783, f1799; +sub.f32 f1916, f1783, f1799; +add.f32 f2427, f1815, f1831; +sub.f32 f1917, f1815, f1831; +add.f32 f1918, f1791, f1807; +sub.f32 f1920, f1791, f1807; +add.f32 f2426, f1823, f1839; +sub.f32 f1921, f1823, f1839; +add.f32 f1922, f1914, f1918; +sub.f32 f1924, f1914, f1918; +add.f32 f2425, f2427, f2426; +sub.f32 f1925, f2427, f2426; +sub.f32 f1926, f1916, f1921; +add.f32 f1928, f1916, f1921; +add.f32 f2424, f1917, f1920; +sub.f32 f1929, f1917, f1920; +mul.f32 f1930, f1926, 0f3F3504F3; +mul.f32 f1931, f2424, 0f3F3504F3; +sub.f32 f1932, f1930, f1931; +add.f32 f1933, f1930, f1931; +mul.f32 f2422, f1928, 0fBF3504F3; +mul.f32 f2423, f1929, 0f3F3504F3; +sub.f32 f1936, f2422, f2423; +mul.f32 f1937, f1929, 0fBF3504F3; 
+fma.rn.f32 f1938, f1928, 0f3F3504F3, f1937; +add.f32 f1939, f1906, f1922; +sub.f32 f1941, f1906, f1922; +add.f32 f2421, f2429, f2425; +sub.f32 f1942, f2429, f2425; +add.f32 f1943, f1910, f1932; +sub.f32 f1945, f1910, f1932; +add.f32 f2420, f2428, f1933; +sub.f32 f1946, f2428, f1933; +sub.f32 f1947, f1908, f1925; +add.f32 f1949, f1908, f1925; +add.f32 f2419, f1909, f1924; +sub.f32 f1950, f1909, f1924; +add.f32 f1951, f1912, f1936; +sub.f32 f1953, f1912, f1936; +add.f32 f2418, f1913, f1938; +sub.f32 f1954, f1913, f1938; +mul.f32 f1956, f2420, 0f3EC3EF15; +mul.f32 f2417, f1943, 0f3F6C835E; +sub.f32 f1957, f2417, f1956; +mul.f32 f1958, f2420, 0f3F6C835E; +fma.rn.f32 f1959, f1943, 0f3EC3EF15, f1958; +mul.f32 f1960, f1947, 0f3F3504F3; +mul.f32 f1961, f2419, 0f3F3504F3; +sub.f32 f1962, f1960, f1961; +add.f32 f1963, f1960, f1961; +mul.f32 f1965, f2418, 0f3F6C835E; +mul.f32 f2416, f1951, 0f3EC3EF15; +sub.f32 f1966, f2416, f1965; +mul.f32 f1967, f2418, 0f3EC3EF15; +fma.rn.f32 f1968, f1951, 0f3F6C835E, f1967; +mul.f32 f1970, f1946, 0f3F6C835E; +mul.f32 f2415, f1945, 0fBEC3EF15; +sub.f32 f1971, f2415, f1970; +mul.f32 f1972, f1946, 0fBEC3EF15; +fma.rn.f32 f1973, f1945, 0f3F6C835E, f1972; +mul.f32 f2413, f1949, 0fBF3504F3; +mul.f32 f2414, f1950, 0f3F3504F3; +sub.f32 f1976, f2413, f2414; +mul.f32 f1977, f1950, 0fBF3504F3; +fma.rn.f32 f1978, f1949, 0f3F3504F3, f1977; +mul.f32 f2411, f1953, 0fBF6C835E; +mul.f32 f2412, f1954, 0f3EC3EF15; +sub.f32 f1981, f2411, f2412; +mul.f32 f1982, f1954, 0fBF6C835E; +fma.rn.f32 f1983, f1953, 0f3EC3EF15, f1982; +add.f32 f1984, f1882, f1939; +sub.f32 f1986, f1882, f1939; +add.f32 f2410, f2435, f2421; +sub.f32 f1987, f2435, f2421; +add.f32 f1988, f1886, f1957; +sub.f32 f1990, f1886, f1957; +add.f32 f2409, f2434, f1959; +sub.f32 f1991, f2434, f1959; +add.f32 f1992, f1890, f1962; +sub.f32 f1994, f1890, f1962; +add.f32 f2408, f2433, f1963; +sub.f32 f1995, f2433, f1963; +add.f32 f1996, f1894, f1966; +sub.f32 f1998, f1894, f1966; +add.f32 f2407, f2432, 
f1968; +sub.f32 f1999, f2432, f1968; +sub.f32 f2000, f1884, f1942; +add.f32 f2002, f1884, f1942; +add.f32 f2406, f1885, f1941; +sub.f32 f2003, f1885, f1941; +add.f32 f2004, f1888, f1971; +sub.f32 f2006, f1888, f1971; +add.f32 f2405, f1889, f1973; +sub.f32 f2007, f1889, f1973; +add.f32 f2008, f1892, f1976; +sub.f32 f2010, f1892, f1976; +add.f32 f2404, f1893, f1978; +sub.f32 f2011, f1893, f1978; +add.f32 f2012, f1896, f1981; +sub.f32 f2014, f1896, f1981; +add.f32 f2403, f1897, f1983; +sub.f32 f2015, f1897, f1983; +add.f32 f2016, f1778, f1794; +sub.f32 f2018, f1778, f1794; +add.f32 f2402, f1810, f1826; +sub.f32 f2019, f1810, f1826; +add.f32 f2020, f1786, f1802; +sub.f32 f2022, f1786, f1802; +add.f32 f2401, f1818, f1834; +sub.f32 f2023, f1818, f1834; +add.f32 f2024, f2016, f2020; +sub.f32 f2026, f2016, f2020; +add.f32 f2400, f2402, f2401; +sub.f32 f2027, f2402, f2401; +sub.f32 f2028, f2018, f2023; +add.f32 f2030, f2018, f2023; +add.f32 f2399, f2019, f2022; +sub.f32 f2031, f2019, f2022; +add.f32 f2032, f1782, f1798; +sub.f32 f2034, f1782, f1798; +add.f32 f2398, f1814, f1830; +sub.f32 f2035, f1814, f1830; +add.f32 f2036, f1790, f1806; +sub.f32 f2038, f1790, f1806; +add.f32 f2397, f1822, f1838; +sub.f32 f2039, f1822, f1838; +add.f32 f2040, f2032, f2036; +sub.f32 f2042, f2032, f2036; +add.f32 f2396, f2398, f2397; +sub.f32 f2043, f2398, f2397; +sub.f32 f2044, f2034, f2039; +add.f32 f2046, f2034, f2039; +add.f32 f2395, f2035, f2038; +sub.f32 f2047, f2035, f2038; +mul.f32 f2048, f2044, 0f3F3504F3; +mul.f32 f2049, f2395, 0f3F3504F3; +sub.f32 f2050, f2048, f2049; +add.f32 f2051, f2048, f2049; +mul.f32 f2053, f2047, 0f3F3504F3; +mul.f32 f2394, f2046, 0fBF3504F3; +sub.f32 f2054, f2394, f2053; +mul.f32 f2055, f2047, 0fBF3504F3; +fma.rn.f32 f2056, f2046, 0f3F3504F3, f2055; +add.f32 f2057, f2024, f2040; +sub.f32 f2059, f2024, f2040; +add.f32 f2393, f2400, f2396; +sub.f32 f2060, f2400, f2396; +add.f32 f2061, f2028, f2050; +sub.f32 f2063, f2028, f2050; +add.f32 f2392, f2399, f2051; 
+sub.f32 f2064, f2399, f2051; +sub.f32 f2065, f2026, f2043; +add.f32 f2067, f2026, f2043; +add.f32 f2391, f2027, f2042; +sub.f32 f2068, f2027, f2042; +add.f32 f2069, f2030, f2054; +sub.f32 f2071, f2030, f2054; +add.f32 f2390, f2031, f2056; +sub.f32 f2072, f2031, f2056; +add.f32 f2073, f1780, f1796; +sub.f32 f2075, f1780, f1796; +add.f32 f2389, f1812, f1828; +sub.f32 f2076, f1812, f1828; +add.f32 f2077, f1788, f1804; +sub.f32 f2079, f1788, f1804; +add.f32 f2388, f1820, f1836; +sub.f32 f2080, f1820, f1836; +add.f32 f2081, f2073, f2077; +sub.f32 f2083, f2073, f2077; +add.f32 f2387, f2389, f2388; +sub.f32 f2084, f2389, f2388; +sub.f32 f2085, f2075, f2080; +add.f32 f2087, f2075, f2080; +add.f32 f2386, f2076, f2079; +sub.f32 f2088, f2076, f2079; +add.f32 f2089, f1784, f1800; +sub.f32 f2091, f1784, f1800; +add.f32 f2385, f1816, f1832; +sub.f32 f2092, f1816, f1832; +add.f32 f2093, f1792, f1808; +sub.f32 f2095, f1792, f1808; +add.f32 f2384, f1824, f1840; +sub.f32 f2096, f1824, f1840; +add.f32 f2097, f2089, f2093; +sub.f32 f2099, f2089, f2093; +add.f32 f2383, f2385, f2384; +sub.f32 f2100, f2385, f2384; +sub.f32 f2101, f2091, f2096; +add.f32 f2103, f2091, f2096; +add.f32 f2382, f2092, f2095; +sub.f32 f2104, f2092, f2095; +mul.f32 f2105, f2101, 0f3F3504F3; +mul.f32 f2106, f2382, 0f3F3504F3; +sub.f32 f2107, f2105, f2106; +add.f32 f2108, f2105, f2106; +mul.f32 f2110, f2104, 0f3F3504F3; +mul.f32 f2381, f2103, 0fBF3504F3; +sub.f32 f2111, f2381, f2110; +mul.f32 f2112, f2104, 0fBF3504F3; +fma.rn.f32 f2113, f2103, 0f3F3504F3, f2112; +add.f32 f2114, f2081, f2097; +sub.f32 f2116, f2081, f2097; +add.f32 f2380, f2387, f2383; +sub.f32 f2117, f2387, f2383; +add.f32 f2118, f2085, f2107; +sub.f32 f2120, f2085, f2107; +add.f32 f2379, f2386, f2108; +sub.f32 f2121, f2386, f2108; +sub.f32 f2122, f2083, f2100; +add.f32 f2124, f2083, f2100; +add.f32 f2378, f2084, f2099; +sub.f32 f2125, f2084, f2099; +add.f32 f2126, f2087, f2111; +sub.f32 f2128, f2087, f2111; +add.f32 f2377, f2088, f2113; +sub.f32 
f2129, f2088, f2113; +mul.f32 f2131, f2379, 0f3EC3EF15; +mul.f32 f2376, f2118, 0f3F6C835E; +sub.f32 f2132, f2376, f2131; +mul.f32 f2133, f2379, 0f3F6C835E; +fma.rn.f32 f2134, f2118, 0f3EC3EF15, f2133; +mul.f32 f2135, f2122, 0f3F3504F3; +mul.f32 f2136, f2378, 0f3F3504F3; +sub.f32 f2137, f2135, f2136; +add.f32 f2138, f2135, f2136; +mul.f32 f2140, f2377, 0f3F6C835E; +mul.f32 f2375, f2126, 0f3EC3EF15; +sub.f32 f2141, f2375, f2140; +mul.f32 f2142, f2377, 0f3EC3EF15; +fma.rn.f32 f2143, f2126, 0f3F6C835E, f2142; +mul.f32 f2145, f2121, 0f3F6C835E; +mul.f32 f2374, f2120, 0fBEC3EF15; +sub.f32 f2146, f2374, f2145; +mul.f32 f2147, f2121, 0fBEC3EF15; +fma.rn.f32 f2148, f2120, 0f3F6C835E, f2147; +mul.f32 f2150, f2125, 0f3F3504F3; +mul.f32 f2373, f2124, 0fBF3504F3; +sub.f32 f2151, f2373, f2150; +mul.f32 f2152, f2125, 0fBF3504F3; +fma.rn.f32 f2153, f2124, 0f3F3504F3, f2152; +mul.f32 f2155, f2129, 0f3EC3EF15; +mul.f32 f2372, f2128, 0fBF6C835E; +sub.f32 f2156, f2372, f2155; +mul.f32 f2157, f2129, 0fBF6C835E; +fma.rn.f32 f2158, f2128, 0f3EC3EF15, f2157; +add.f32 f2159, f2057, f2114; +sub.f32 f2161, f2057, f2114; +add.f32 f2371, f2393, f2380; +sub.f32 f2162, f2393, f2380; +add.f32 f2163, f2061, f2132; +sub.f32 f2165, f2061, f2132; +add.f32 f2370, f2392, f2134; +sub.f32 f2166, f2392, f2134; +add.f32 f2167, f2065, f2137; +sub.f32 f2169, f2065, f2137; +add.f32 f2369, f2391, f2138; +sub.f32 f2170, f2391, f2138; +add.f32 f2171, f2069, f2141; +sub.f32 f2173, f2069, f2141; +add.f32 f2368, f2390, f2143; +sub.f32 f2174, f2390, f2143; +sub.f32 f2175, f2059, f2117; +add.f32 f2177, f2059, f2117; +add.f32 f2367, f2060, f2116; +sub.f32 f2178, f2060, f2116; +add.f32 f2179, f2063, f2146; +sub.f32 f2181, f2063, f2146; +add.f32 f2366, f2064, f2148; +sub.f32 f2182, f2064, f2148; +add.f32 f2183, f2067, f2151; +sub.f32 f2185, f2067, f2151; +add.f32 f2365, f2068, f2153; +sub.f32 f2186, f2068, f2153; +add.f32 f2187, f2071, f2156; +sub.f32 f2189, f2071, f2156; +add.f32 f2364, f2072, f2158; +sub.f32 f2190, 
f2072, f2158; +mul.f32 f2192, f2370, 0f3E47C5C2; +mul.f32 f2363, f2163, 0f3F7B14BE; +sub.f32 f2193, f2363, f2192; +mul.f32 f2194, f2370, 0f3F7B14BE; +fma.rn.f32 f2195, f2163, 0f3E47C5C2, f2194; +mul.f32 f2197, f2369, 0f3EC3EF15; +mul.f32 f2362, f2167, 0f3F6C835E; +sub.f32 f2198, f2362, f2197; +mul.f32 f2199, f2369, 0f3F6C835E; +fma.rn.f32 f2200, f2167, 0f3EC3EF15, f2199; +mul.f32 f2202, f2368, 0f3F0E39DA; +mul.f32 f2361, f2171, 0f3F54DB31; +sub.f32 f2203, f2361, f2202; +mul.f32 f2204, f2368, 0f3F54DB31; +fma.rn.f32 f2205, f2171, 0f3F0E39DA, f2204; +mul.f32 f2206, f2175, 0f3F3504F3; +mul.f32 f2207, f2367, 0f3F3504F3; +sub.f32 f2208, f2206, f2207; +add.f32 f2209, f2206, f2207; +mul.f32 f2211, f2366, 0f3F54DB31; +mul.f32 f2360, f2179, 0f3F0E39DA; +sub.f32 f2212, f2360, f2211; +mul.f32 f2213, f2366, 0f3F0E39DA; +fma.rn.f32 f2214, f2179, 0f3F54DB31, f2213; +mul.f32 f2216, f2365, 0f3F6C835E; +mul.f32 f2359, f2183, 0f3EC3EF15; +sub.f32 f2217, f2359, f2216; +mul.f32 f2218, f2365, 0f3EC3EF15; +fma.rn.f32 f2219, f2183, 0f3F6C835E, f2218; +mul.f32 f2221, f2364, 0f3F7B14BE; +mul.f32 f2358, f2187, 0f3E47C5C2; +sub.f32 f2222, f2358, f2221; +mul.f32 f2223, f2364, 0f3E47C5C2; +fma.rn.f32 f2224, f2187, 0f3F7B14BE, f2223; +mul.f32 f2226, f2166, 0f3F7B14BE; +mul.f32 f2357, f2165, 0fBE47C5C2; +sub.f32 f2227, f2357, f2226; +mul.f32 f2228, f2166, 0fBE47C5C2; +fma.rn.f32 f2229, f2165, 0f3F7B14BE, f2228; +mul.f32 f2355, f2169, 0fBEC3EF15; +mul.f32 f2356, f2170, 0f3F6C835E; +sub.f32 f2232, f2355, f2356; +mul.f32 f2233, f2170, 0fBEC3EF15; +fma.rn.f32 f2234, f2169, 0f3F6C835E, f2233; +mul.f32 f2353, f2173, 0fBF0E39DA; +mul.f32 f2354, f2174, 0f3F54DB31; +sub.f32 f2237, f2353, f2354; +mul.f32 f2238, f2174, 0fBF0E39DA; +fma.rn.f32 f2239, f2173, 0f3F54DB31, f2238; +mul.f32 f2351, f2177, 0fBF3504F3; +mul.f32 f2352, f2178, 0f3F3504F3; +sub.f32 f2242, f2351, f2352; +mul.f32 f2243, f2178, 0fBF3504F3; +fma.rn.f32 f2244, f2177, 0f3F3504F3, f2243; +mul.f32 f2349, f2181, 0fBF54DB31; +mul.f32 f2350, 
f2182, 0f3F0E39DA; +sub.f32 f2247, f2349, f2350; +mul.f32 f2248, f2182, 0fBF54DB31; +fma.rn.f32 f2249, f2181, 0f3F0E39DA, f2248; +mul.f32 f2251, f2186, 0f3EC3EF15; +mul.f32 f2348, f2185, 0fBF6C835E; +sub.f32 f2252, f2348, f2251; +mul.f32 f2253, f2186, 0fBF6C835E; +fma.rn.f32 f2254, f2185, 0f3EC3EF15, f2253; +mul.f32 f2256, f2190, 0f3E47C5C2; +mul.f32 f2347, f2189, 0fBF7B14BE; +sub.f32 f2257, f2347, f2256; +mul.f32 f2258, f2190, 0fBF7B14BE; +fma.rn.f32 f2259, f2189, 0f3E47C5C2, f2258; +add.f32 %1, f2410, f2371; +add.f32 %0, f1984, f2159; +add.f32 %2, f1988, f2193; +add.f32 %3, f2409, f2195; +add.f32 %5, f2408, f2200; +add.f32 %4, f1992, f2198; +add.f32 %7, f2407, f2205; +add.f32 %6, f1996, f2203; +add.f32 %9, f2406, f2209; +add.f32 %8, f2000, f2208; +add.f32 %10, f2004, f2212; +add.f32 %11, f2405, f2214; +add.f32 %12, f2008, f2217; +add.f32 %13, f2404, f2219; +add.f32 %14, f2012, f2222; +add.f32 %15, f2403, f2224; +add.f32 %17, f1987, f2161; +sub.f32 %16, f1986, f2162; +add.f32 %19, f1991, f2229; +add.f32 %18, f1990, f2227; +add.f32 %21, f1995, f2234; +add.f32 %20, f1994, f2232; +add.f32 %22, f1998, f2237; +add.f32 %23, f1999, f2239; +add.f32 %24, f2002, f2242; +add.f32 %25, f2003, f2244; +add.f32 %26, f2006, f2247; +add.f32 %27, f2007, f2249; +add.f32 %28, f2010, f2252; +add.f32 %29, f2011, f2254; +add.f32 %31, f2015, f2259; +add.f32 %30, f2014, f2257; +sub.f32 %32, f1984, f2159; +sub.f32 %33, f2410, f2371; +sub.f32 %35, f2409, f2195; +sub.f32 %34, f1988, f2193; +sub.f32 %37, f2408, f2200; +sub.f32 %36, f1992, f2198; +sub.f32 %39, f2407, f2205; +sub.f32 %38, f1996, f2203; +sub.f32 %41, f2406, f2209; +sub.f32 %40, f2000, f2208; +sub.f32 %43, f2405, f2214; +sub.f32 %42, f2004, f2212; +sub.f32 %45, f2404, f2219; +sub.f32 %44, f2008, f2217; +sub.f32 %47, f2403, f2224; +sub.f32 %46, f2012, f2222; +sub.f32 %49, f1987, f2161; +add.f32 %48, f1986, f2162; +sub.f32 %51, f1991, f2229; +sub.f32 %50, f1990, f2227; +sub.f32 %53, f1995, f2234; +sub.f32 %52, f1994, f2232; +sub.f32 
%55, f1999, f2239; +sub.f32 %54, f1998, f2237; +sub.f32 %57, f2003, f2244; +sub.f32 %56, f2002, f2242; +sub.f32 %59, f2007, f2249; +sub.f32 %58, f2006, f2247; +sub.f32 %61, f2011, f2254; +sub.f32 %60, f2010, f2252; +sub.f32 %63, f2015, f2259; +sub.f32 %62, f2014, f2257; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_32768), "l"(lut_sp_32_1024), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), 
"f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..fb3711a1c1022 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_fwd.hpp.inc @@ -0,0 +1,8137 @@ +#ifndef CUFFTDX_FFT_32_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_32_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<778, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<179>; +.reg .b32 r<1583>; +.reg .f64 fd<179>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %64, %72; +} +{ +add.f16x2 r4, %65, %73; +} +{ +sub.f16x2 r7, %64, %72; +} +{ +sub.f16x2 r10, %65, %73; +} +{ +add.f16x2 r13, %68, %76; +} +{ +add.f16x2 r16, %69, %77; +} +{ +sub.f16x2 r19, %68, %76; +} +{ +sub.f16x2 r22, %69, %77; +} +{ +neg.f16x2 r25, r19; +} +{ 
+add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %66, %74; +} +{ +add.f16x2 r54, %67, %75; +} +{ +sub.f16x2 r57, %66, %74; +} +{ +sub.f16x2 r60, %67, %75; +} +{ +add.f16x2 r63, %70, %78; +} +{ +add.f16x2 r66, %71, %79; +} +{ +sub.f16x2 r69, %70, %78; +} +{ +sub.f16x2 r72, %71, %79; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f64 fd123, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd123; +} +mov.f64 fd140, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs2, fd140; +} +{ +cvt.rn.f16.f64 rs5, fd140; +} +{ +cvt.rn.f16.f64 rs6, fd140; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r83; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 r135, r27, r77; +} +{ +add.f16x2 r138, r30, r80; +} +{ +sub.f16x2 r141, r27, r77; +} +{ +sub.f16x2 r144, r30, r80; +} +{ +add.f16x2 r147, r39, r107; +} +{ +add.f16x2 r150, r42, r113; +} +{ +sub.f16x2 r153, r39, r107; +} +{ +sub.f16x2 r156, r42, r113; +} +{ +add.f16x2 r159, r33, r86; +} +{ +add.f16x2 r162, r36, r117; +} +{ +sub.f16x2 r165, r33, r86; +} +{ +sub.f16x2 r168, r36, r117; +} +{ +add.f16x2 r171, r45, r125; +} +{ +add.f16x2 r174, r48, r131; +} +{ +sub.f16x2 r177, r45, r125; +} +{ 
+sub.f16x2 r180, r48, r131; +} +{ +add.f16x2 r183, %80, %92; +} +{ +add.f16x2 r186, %82, %94; +} +{ +sub.f16x2 r189, %80, %92; +} +{ +sub.f16x2 r192, %82, %94; +} +{ +add.f16x2 r195, %88, %84; +} +{ +add.f16x2 r198, %90, %86; +} +{ +sub.f16x2 r201, %88, %84; +} +{ +sub.f16x2 r204, %90, %86; +} +{ +neg.f16x2 r207, r201; +} +{ +add.f16x2 r209, r183, r195; +} +{ +add.f16x2 r212, r186, r198; +} +{ +sub.f16x2 r215, r183, r195; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +add.f16x2 r221, r189, r204; +} +{ +add.f16x2 r224, r192, r207; +} +{ +sub.f16x2 r227, r189, r204; +} +{ +sub.f16x2 r230, r192, r207; +} +{ +add.f16x2 r233, %89, %85; +} +{ +add.f16x2 r236, %91, %87; +} +{ +sub.f16x2 r239, %89, %85; +} +{ +sub.f16x2 r242, %91, %87; +} +{ +add.f16x2 r245, %81, %93; +} +{ +add.f16x2 r248, %83, %95; +} +{ +sub.f16x2 r251, %81, %93; +} +{ +sub.f16x2 r254, %83, %95; +} +{ +neg.f16x2 r257, r251; +} +{ +add.f16x2 r259, r233, r245; +} +{ +add.f16x2 r262, r236, r248; +} +{ +sub.f16x2 r265, r233, r245; +} +{ +sub.f16x2 r268, r236, r248; +} +{ +add.f16x2 r271, r239, r254; +} +{ +add.f16x2 r274, r242, r257; +} +{ +sub.f16x2 r277, r239, r254; +} +{ +sub.f16x2 r280, r242, r257; +} +{ +cvt.rn.f16.f64 rs15, fd123; +} +{ +cvt.rn.f16.f64 rs16, fd140; +} +{ +cvt.rn.f16.f64 rs19, fd140; +} +{ +cvt.rn.f16.f64 rs20, fd140; +} +mov.b32 r297, {rs15, rs15}; +{ +mul.f16x2 r283, r271, r297; +} +mov.b32 r294, {rs16, rs16}; +{ +mul.f16x2 r286, r274, r294; +} +{ +sub.f16x2 r289, r283, r286; +} +{ +mul.f16x2 r292, r271, r294; +} +{ +fma.rn.f16x2 r295, r274, r297, r292; +} +{ +neg.f16x2 r299, r265; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r277, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r280, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r277, r312; +} +{ +fma.rn.f16x2 r313, r280, r315, r310; +} +{ +add.f16x2 r317, r209, r259; +} +{ +add.f16x2 r320, r212, r262; +} +{ +sub.f16x2 r323, r209, r259; +} +{ +sub.f16x2 r326, r212, r262; +} +{ +add.f16x2 r329, r221, 
r289; +} +{ +add.f16x2 r332, r224, r295; +} +{ +sub.f16x2 r335, r221, r289; +} +{ +sub.f16x2 r338, r224, r295; +} +{ +add.f16x2 r341, r215, r268; +} +{ +add.f16x2 r344, r218, r299; +} +{ +sub.f16x2 r347, r215, r268; +} +{ +sub.f16x2 r350, r218, r299; +} +{ +add.f16x2 r353, r227, r307; +} +{ +add.f16x2 r356, r230, r313; +} +{ +sub.f16x2 r359, r227, r307; +} +{ +sub.f16x2 r362, r230, r313; +} +mov.f64 fd119, 0d3FED906BCF328D46; +{ +cvt.rn.f16.f64 rs29, fd119; +} +mov.f64 fd144, 0dBFD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs30, fd144; +} +{ +cvt.rn.f16.f64 rs31, fd123; +} +{ +cvt.rn.f16.f64 rs32, fd140; +} +mov.f64 fd127, 0d3FD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs33, fd127; +} +mov.f64 fd143, 0dBFED906BCF328D46; +{ +cvt.rn.f16.f64 rs34, fd143; +} +{ +cvt.rn.f16.f64 rs37, fd144; +} +{ +cvt.rn.f16.f64 rs38, fd143; +} +{ +cvt.rn.f16.f64 rs39, fd140; +} +{ +cvt.rn.f16.f64 rs40, fd140; +} +{ +cvt.rn.f16.f64 rs41, fd143; +} +{ +cvt.rn.f16.f64 rs42, fd144; +} +mov.b32 r379, {rs29, rs29}; +{ +mul.f16x2 r365, r329, r379; +} +mov.b32 r376, {rs30, rs30}; +{ +mul.f16x2 r368, r332, r376; +} +{ +sub.f16x2 r371, r365, r368; +} +{ +mul.f16x2 r374, r329, r376; +} +{ +fma.rn.f16x2 r377, r332, r379, r374; +} +mov.b32 r395, {rs31, rs31}; +{ +mul.f16x2 r381, r341, r395; +} +mov.b32 r392, {rs32, rs32}; +{ +mul.f16x2 r384, r344, r392; +} +{ +sub.f16x2 r387, r381, r384; +} +{ +mul.f16x2 r390, r341, r392; +} +{ +fma.rn.f16x2 r393, r344, r395, r390; +} +mov.b32 r411, {rs33, rs33}; +{ +mul.f16x2 r397, r353, r411; +} +mov.b32 r408, {rs34, rs34}; +{ +mul.f16x2 r400, r356, r408; +} +{ +sub.f16x2 r403, r397, r400; +} +{ +mul.f16x2 r406, r353, r408; +} +{ +fma.rn.f16x2 r409, r356, r411, r406; +} +{ +neg.f16x2 r413, r323; +} +mov.b32 r429, {rs37, rs37}; +{ +mul.f16x2 r415, r335, r429; +} +mov.b32 r426, {rs38, rs38}; +{ +mul.f16x2 r418, r338, r426; +} +{ +sub.f16x2 r421, r415, r418; +} +{ +mul.f16x2 r424, r335, r426; +} +{ +fma.rn.f16x2 r427, r338, r429, r424; +} +mov.b32 r445, {rs39, rs39}; +{ +mul.f16x2 
r431, r347, r445; +} +mov.b32 r442, {rs40, rs40}; +{ +mul.f16x2 r434, r350, r442; +} +{ +sub.f16x2 r437, r431, r434; +} +{ +mul.f16x2 r440, r347, r442; +} +{ +fma.rn.f16x2 r443, r350, r445, r440; +} +mov.b32 r461, {rs41, rs41}; +{ +mul.f16x2 r447, r359, r461; +} +mov.b32 r458, {rs42, rs42}; +{ +mul.f16x2 r450, r362, r458; +} +{ +sub.f16x2 r453, r447, r450; +} +{ +mul.f16x2 r456, r359, r458; +} +{ +fma.rn.f16x2 r459, r362, r461, r456; +} +{ +add.f16x2 r463, r135, r317; +} +{ +add.f16x2 r466, r138, r320; +} +{ +sub.f16x2 r469, r135, r317; +} +{ +sub.f16x2 r472, r138, r320; +} +{ +add.f16x2 r475, r147, r371; +} +{ +add.f16x2 r478, r150, r377; +} +{ +sub.f16x2 r481, r147, r371; +} +{ +sub.f16x2 r484, r150, r377; +} +{ +add.f16x2 r487, r159, r387; +} +{ +add.f16x2 r490, r162, r393; +} +{ +sub.f16x2 r493, r159, r387; +} +{ +sub.f16x2 r496, r162, r393; +} +{ +add.f16x2 r499, r171, r403; +} +{ +add.f16x2 r502, r174, r409; +} +{ +sub.f16x2 r505, r171, r403; +} +{ +sub.f16x2 r508, r174, r409; +} +{ +add.f16x2 r511, r141, r326; +} +{ +add.f16x2 r514, r144, r413; +} +{ +sub.f16x2 r517, r141, r326; +} +{ +sub.f16x2 r520, r144, r413; +} +{ +add.f16x2 r523, r153, r421; +} +{ +add.f16x2 r526, r156, r427; +} +{ +sub.f16x2 r529, r153, r421; +} +{ +sub.f16x2 r532, r156, r427; +} +{ +add.f16x2 r535, r165, r437; +} +{ +add.f16x2 r538, r168, r443; +} +{ +sub.f16x2 r541, r165, r437; +} +{ +sub.f16x2 r544, r168, r443; +} +{ +add.f16x2 r547, r177, r453; +} +{ +add.f16x2 r550, r180, r459; +} +{ +sub.f16x2 r553, r177, r453; +} +{ +sub.f16x2 r556, r180, r459; +} +{ +add.f16x2 r559, %110, %106; +} +{ +add.f16x2 r562, %96, %108; +} +{ +sub.f16x2 r565, %110, %106; +} +{ +sub.f16x2 r568, %96, %108; +} +{ +add.f16x2 r571, %102, %98; +} +{ +add.f16x2 r574, %104, %100; +} +{ +sub.f16x2 r577, %102, %98; +} +{ +sub.f16x2 r580, %104, %100; +} +{ +neg.f16x2 r583, r577; +} +{ +add.f16x2 r585, r559, r571; +} +{ +add.f16x2 r588, r562, r574; +} +{ +sub.f16x2 r591, r559, r571; +} +{ +sub.f16x2 r594, r562, 
r574; +} +{ +add.f16x2 r597, r565, r580; +} +{ +add.f16x2 r600, r568, r583; +} +{ +sub.f16x2 r603, r565, r580; +} +{ +sub.f16x2 r606, r568, r583; +} +{ +add.f16x2 r609, %103, %99; +} +{ +add.f16x2 r612, %105, %101; +} +{ +sub.f16x2 r615, %103, %99; +} +{ +sub.f16x2 r618, %105, %101; +} +{ +add.f16x2 r621, %111, %107; +} +{ +add.f16x2 r624, %97, %109; +} +{ +sub.f16x2 r627, %111, %107; +} +{ +sub.f16x2 r630, %97, %109; +} +{ +neg.f16x2 r633, r627; +} +{ +add.f16x2 r635, r609, r621; +} +{ +add.f16x2 r638, r612, r624; +} +{ +sub.f16x2 r641, r609, r621; +} +{ +sub.f16x2 r644, r612, r624; +} +{ +add.f16x2 r647, r615, r630; +} +{ +add.f16x2 r650, r618, r633; +} +{ +sub.f16x2 r653, r615, r630; +} +{ +sub.f16x2 r656, r618, r633; +} +{ +cvt.rn.f16.f64 rs59, fd123; +} +{ +cvt.rn.f16.f64 rs60, fd140; +} +{ +cvt.rn.f16.f64 rs63, fd140; +} +{ +cvt.rn.f16.f64 rs64, fd140; +} +mov.b32 r673, {rs59, rs59}; +{ +mul.f16x2 r659, r647, r673; +} +mov.b32 r670, {rs60, rs60}; +{ +mul.f16x2 r662, r650, r670; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r647, r670; +} +{ +fma.rn.f16x2 r671, r650, r673, r668; +} +{ +neg.f16x2 r675, r641; +} +mov.b32 r691, {rs63, rs63}; +{ +mul.f16x2 r677, r653, r691; +} +mov.b32 r688, {rs64, rs64}; +{ +mul.f16x2 r680, r656, r688; +} +{ +sub.f16x2 r683, r677, r680; +} +{ +mul.f16x2 r686, r653, r688; +} +{ +fma.rn.f16x2 r689, r656, r691, r686; +} +{ +add.f16x2 r693, r585, r635; +} +{ +add.f16x2 r696, r588, r638; +} +{ +sub.f16x2 r699, r585, r635; +} +{ +sub.f16x2 r702, r588, r638; +} +{ +add.f16x2 r705, r597, r665; +} +{ +add.f16x2 r708, r600, r671; +} +{ +sub.f16x2 r711, r597, r665; +} +{ +sub.f16x2 r714, r600, r671; +} +{ +add.f16x2 r717, r591, r644; +} +{ +add.f16x2 r720, r594, r675; +} +{ +sub.f16x2 r723, r591, r644; +} +{ +sub.f16x2 r726, r594, r675; +} +{ +add.f16x2 r729, r603, r683; +} +{ +add.f16x2 r732, r606, r689; +} +{ +sub.f16x2 r735, r603, r683; +} +{ +sub.f16x2 r738, r606, r689; +} +{ +add.f16x2 r741, %114, %126; +} +{ +add.f16x2 
r744, %116, %112; +} +{ +sub.f16x2 r747, %114, %126; +} +{ +sub.f16x2 r750, %116, %112; +} +{ +add.f16x2 r753, %122, %118; +} +{ +add.f16x2 r756, %124, %120; +} +{ +sub.f16x2 r759, %122, %118; +} +{ +sub.f16x2 r762, %124, %120; +} +{ +neg.f16x2 r765, r759; +} +{ +add.f16x2 r767, r741, r753; +} +{ +add.f16x2 r770, r744, r756; +} +{ +sub.f16x2 r773, r741, r753; +} +{ +sub.f16x2 r776, r744, r756; +} +{ +add.f16x2 r779, r747, r762; +} +{ +add.f16x2 r782, r750, r765; +} +{ +sub.f16x2 r785, r747, r762; +} +{ +sub.f16x2 r788, r750, r765; +} +{ +add.f16x2 r791, %123, %119; +} +{ +add.f16x2 r794, %125, %121; +} +{ +sub.f16x2 r797, %123, %119; +} +{ +sub.f16x2 r800, %125, %121; +} +{ +add.f16x2 r803, %115, %127; +} +{ +add.f16x2 r806, %117, %113; +} +{ +sub.f16x2 r809, %115, %127; +} +{ +sub.f16x2 r812, %117, %113; +} +{ +neg.f16x2 r815, r809; +} +{ +add.f16x2 r817, r791, r803; +} +{ +add.f16x2 r820, r794, r806; +} +{ +sub.f16x2 r823, r791, r803; +} +{ +sub.f16x2 r826, r794, r806; +} +{ +add.f16x2 r829, r797, r812; +} +{ +add.f16x2 r832, r800, r815; +} +{ +sub.f16x2 r835, r797, r812; +} +{ +sub.f16x2 r838, r800, r815; +} +{ +cvt.rn.f16.f64 rs73, fd123; +} +{ +cvt.rn.f16.f64 rs74, fd140; +} +{ +cvt.rn.f16.f64 rs77, fd140; +} +{ +cvt.rn.f16.f64 rs78, fd140; +} +mov.b32 r855, {rs73, rs73}; +{ +mul.f16x2 r841, r829, r855; +} +mov.b32 r852, {rs74, rs74}; +{ +mul.f16x2 r844, r832, r852; +} +{ +sub.f16x2 r847, r841, r844; +} +{ +mul.f16x2 r850, r829, r852; +} +{ +fma.rn.f16x2 r853, r832, r855, r850; +} +{ +neg.f16x2 r857, r823; +} +mov.b32 r873, {rs77, rs77}; +{ +mul.f16x2 r859, r835, r873; +} +mov.b32 r870, {rs78, rs78}; +{ +mul.f16x2 r862, r838, r870; +} +{ +sub.f16x2 r865, r859, r862; +} +{ +mul.f16x2 r868, r835, r870; +} +{ +fma.rn.f16x2 r871, r838, r873, r868; +} +{ +add.f16x2 r875, r767, r817; +} +{ +add.f16x2 r878, r770, r820; +} +{ +sub.f16x2 r881, r767, r817; +} +{ +sub.f16x2 r884, r770, r820; +} +{ +add.f16x2 r887, r779, r847; +} +{ +add.f16x2 r890, r782, r853; +} +{ 
+sub.f16x2 r893, r779, r847; +} +{ +sub.f16x2 r896, r782, r853; +} +{ +add.f16x2 r899, r773, r826; +} +{ +add.f16x2 r902, r776, r857; +} +{ +sub.f16x2 r905, r773, r826; +} +{ +sub.f16x2 r908, r776, r857; +} +{ +add.f16x2 r911, r785, r865; +} +{ +add.f16x2 r914, r788, r871; +} +{ +sub.f16x2 r917, r785, r865; +} +{ +sub.f16x2 r920, r788, r871; +} +{ +cvt.rn.f16.f64 rs87, fd119; +} +{ +cvt.rn.f16.f64 rs88, fd144; +} +{ +cvt.rn.f16.f64 rs89, fd123; +} +{ +cvt.rn.f16.f64 rs90, fd140; +} +{ +cvt.rn.f16.f64 rs91, fd127; +} +{ +cvt.rn.f16.f64 rs92, fd143; +} +{ +cvt.rn.f16.f64 rs95, fd144; +} +{ +cvt.rn.f16.f64 rs96, fd143; +} +{ +cvt.rn.f16.f64 rs97, fd140; +} +{ +cvt.rn.f16.f64 rs98, fd140; +} +{ +cvt.rn.f16.f64 rs99, fd143; +} +{ +cvt.rn.f16.f64 rs100, fd144; +} +mov.b32 r937, {rs87, rs87}; +{ +mul.f16x2 r923, r887, r937; +} +mov.b32 r934, {rs88, rs88}; +{ +mul.f16x2 r926, r890, r934; +} +{ +sub.f16x2 r929, r923, r926; +} +{ +mul.f16x2 r932, r887, r934; +} +{ +fma.rn.f16x2 r935, r890, r937, r932; +} +mov.b32 r953, {rs89, rs89}; +{ +mul.f16x2 r939, r899, r953; +} +mov.b32 r950, {rs90, rs90}; +{ +mul.f16x2 r942, r902, r950; +} +{ +sub.f16x2 r945, r939, r942; +} +{ +mul.f16x2 r948, r899, r950; +} +{ +fma.rn.f16x2 r951, r902, r953, r948; +} +mov.b32 r969, {rs91, rs91}; +{ +mul.f16x2 r955, r911, r969; +} +mov.b32 r966, {rs92, rs92}; +{ +mul.f16x2 r958, r914, r966; +} +{ +sub.f16x2 r961, r955, r958; +} +{ +mul.f16x2 r964, r911, r966; +} +{ +fma.rn.f16x2 r967, r914, r969, r964; +} +{ +neg.f16x2 r971, r881; +} +mov.b32 r987, {rs95, rs95}; +{ +mul.f16x2 r973, r893, r987; +} +mov.b32 r984, {rs96, rs96}; +{ +mul.f16x2 r976, r896, r984; +} +{ +sub.f16x2 r979, r973, r976; +} +{ +mul.f16x2 r982, r893, r984; +} +{ +fma.rn.f16x2 r985, r896, r987, r982; +} +mov.b32 r1003, {rs97, rs97}; +{ +mul.f16x2 r989, r905, r1003; +} +mov.b32 r1000, {rs98, rs98}; +{ +mul.f16x2 r992, r908, r1000; +} +{ +sub.f16x2 r995, r989, r992; +} +{ +mul.f16x2 r998, r905, r1000; +} +{ +fma.rn.f16x2 r1001, r908, 
r1003, r998; +} +mov.b32 r1019, {rs99, rs99}; +{ +mul.f16x2 r1005, r917, r1019; +} +mov.b32 r1016, {rs100, rs100}; +{ +mul.f16x2 r1008, r920, r1016; +} +{ +sub.f16x2 r1011, r1005, r1008; +} +{ +mul.f16x2 r1014, r917, r1016; +} +{ +fma.rn.f16x2 r1017, r920, r1019, r1014; +} +{ +add.f16x2 r1021, r693, r875; +} +{ +add.f16x2 r1024, r696, r878; +} +{ +sub.f16x2 r1027, r693, r875; +} +{ +sub.f16x2 r1030, r696, r878; +} +{ +add.f16x2 r1033, r705, r929; +} +{ +add.f16x2 r1036, r708, r935; +} +{ +sub.f16x2 r1039, r705, r929; +} +{ +sub.f16x2 r1042, r708, r935; +} +{ +add.f16x2 r1045, r717, r945; +} +{ +add.f16x2 r1048, r720, r951; +} +{ +sub.f16x2 r1051, r717, r945; +} +{ +sub.f16x2 r1054, r720, r951; +} +{ +add.f16x2 r1057, r729, r961; +} +{ +add.f16x2 r1060, r732, r967; +} +{ +sub.f16x2 r1063, r729, r961; +} +{ +sub.f16x2 r1066, r732, r967; +} +{ +add.f16x2 r1069, r699, r884; +} +{ +add.f16x2 r1072, r702, r971; +} +{ +sub.f16x2 r1075, r699, r884; +} +{ +sub.f16x2 r1078, r702, r971; +} +{ +add.f16x2 r1081, r711, r979; +} +{ +add.f16x2 r1084, r714, r985; +} +{ +sub.f16x2 r1087, r711, r979; +} +{ +sub.f16x2 r1090, r714, r985; +} +{ +add.f16x2 r1093, r723, r995; +} +{ +add.f16x2 r1096, r726, r1001; +} +{ +sub.f16x2 r1099, r723, r995; +} +{ +sub.f16x2 r1102, r726, r1001; +} +{ +add.f16x2 r1105, r735, r1011; +} +{ +add.f16x2 r1108, r738, r1017; +} +{ +sub.f16x2 r1111, r735, r1011; +} +{ +sub.f16x2 r1114, r738, r1017; +} +mov.f64 fd117, 0d3FEF6297CFF75CB0; +{ +cvt.rn.f16.f64 rs117, fd117; +} +mov.f64 fd146, 0dBFC8F8B83C69A60B; +{ +cvt.rn.f16.f64 rs118, fd146; +} +{ +cvt.rn.f16.f64 rs119, fd119; +} +{ +cvt.rn.f16.f64 rs120, fd144; +} +mov.f64 fd121, 0d3FEA9B66290EA1A3; +{ +cvt.rn.f16.f64 rs121, fd121; +} +mov.f64 fd142, 0dBFE1C73B39AE68C8; +{ +cvt.rn.f16.f64 rs122, fd142; +} +{ +cvt.rn.f16.f64 rs123, fd123; +} +{ +cvt.rn.f16.f64 rs124, fd140; +} +mov.f64 fd125, 0d3FE1C73B39AE68C8; +{ +cvt.rn.f16.f64 rs125, fd125; +} +mov.f64 fd141, 0dBFEA9B66290EA1A3; +{ +cvt.rn.f16.f64 rs126, 
fd141; +} +{ +cvt.rn.f16.f64 rs127, fd127; +} +{ +cvt.rn.f16.f64 rs128, fd143; +} +mov.f64 fd129, 0d3FC8F8B83C69A60B; +{ +cvt.rn.f16.f64 rs129, fd129; +} +mov.f64 fd145, 0dBFEF6297CFF75CB0; +{ +cvt.rn.f16.f64 rs130, fd145; +} +{ +cvt.rn.f16.f64 rs133, fd146; +} +{ +cvt.rn.f16.f64 rs134, fd145; +} +{ +cvt.rn.f16.f64 rs135, fd144; +} +{ +cvt.rn.f16.f64 rs136, fd143; +} +{ +cvt.rn.f16.f64 rs137, fd142; +} +{ +cvt.rn.f16.f64 rs138, fd141; +} +{ +cvt.rn.f16.f64 rs139, fd140; +} +{ +cvt.rn.f16.f64 rs140, fd140; +} +{ +cvt.rn.f16.f64 rs141, fd141; +} +{ +cvt.rn.f16.f64 rs142, fd142; +} +{ +cvt.rn.f16.f64 rs143, fd143; +} +{ +cvt.rn.f16.f64 rs144, fd144; +} +{ +cvt.rn.f16.f64 rs145, fd145; +} +{ +cvt.rn.f16.f64 rs146, fd146; +} +mov.b32 r1131, {rs117, rs117}; +{ +mul.f16x2 r1117, r1033, r1131; +} +mov.b32 r1128, {rs118, rs118}; +{ +mul.f16x2 r1120, r1036, r1128; +} +{ +sub.f16x2 r1123, r1117, r1120; +} +{ +mul.f16x2 r1126, r1033, r1128; +} +{ +fma.rn.f16x2 r1129, r1036, r1131, r1126; +} +mov.b32 r1147, {rs119, rs119}; +{ +mul.f16x2 r1133, r1045, r1147; +} +mov.b32 r1144, {rs120, rs120}; +{ +mul.f16x2 r1136, r1048, r1144; +} +{ +sub.f16x2 r1139, r1133, r1136; +} +{ +mul.f16x2 r1142, r1045, r1144; +} +{ +fma.rn.f16x2 r1145, r1048, r1147, r1142; +} +mov.b32 r1163, {rs121, rs121}; +{ +mul.f16x2 r1149, r1057, r1163; +} +mov.b32 r1160, {rs122, rs122}; +{ +mul.f16x2 r1152, r1060, r1160; +} +{ +sub.f16x2 r1155, r1149, r1152; +} +{ +mul.f16x2 r1158, r1057, r1160; +} +{ +fma.rn.f16x2 r1161, r1060, r1163, r1158; +} +mov.b32 r1179, {rs123, rs123}; +{ +mul.f16x2 r1165, r1069, r1179; +} +mov.b32 r1176, {rs124, rs124}; +{ +mul.f16x2 r1168, r1072, r1176; +} +{ +sub.f16x2 r1171, r1165, r1168; +} +{ +mul.f16x2 r1174, r1069, r1176; +} +{ +fma.rn.f16x2 r1177, r1072, r1179, r1174; +} +mov.b32 r1195, {rs125, rs125}; +{ +mul.f16x2 r1181, r1081, r1195; +} +mov.b32 r1192, {rs126, rs126}; +{ +mul.f16x2 r1184, r1084, r1192; +} +{ +sub.f16x2 r1187, r1181, r1184; +} +{ +mul.f16x2 r1190, r1081, r1192; 
+} +{ +fma.rn.f16x2 r1193, r1084, r1195, r1190; +} +mov.b32 r1211, {rs127, rs127}; +{ +mul.f16x2 r1197, r1093, r1211; +} +mov.b32 r1208, {rs128, rs128}; +{ +mul.f16x2 r1200, r1096, r1208; +} +{ +sub.f16x2 r1203, r1197, r1200; +} +{ +mul.f16x2 r1206, r1093, r1208; +} +{ +fma.rn.f16x2 r1209, r1096, r1211, r1206; +} +mov.b32 r1227, {rs129, rs129}; +{ +mul.f16x2 r1213, r1105, r1227; +} +mov.b32 r1224, {rs130, rs130}; +{ +mul.f16x2 r1216, r1108, r1224; +} +{ +sub.f16x2 r1219, r1213, r1216; +} +{ +mul.f16x2 r1222, r1105, r1224; +} +{ +fma.rn.f16x2 r1225, r1108, r1227, r1222; +} +{ +neg.f16x2 r1229, r1027; +} +mov.b32 r1245, {rs133, rs133}; +{ +mul.f16x2 r1231, r1039, r1245; +} +mov.b32 r1242, {rs134, rs134}; +{ +mul.f16x2 r1234, r1042, r1242; +} +{ +sub.f16x2 r1237, r1231, r1234; +} +{ +mul.f16x2 r1240, r1039, r1242; +} +{ +fma.rn.f16x2 r1243, r1042, r1245, r1240; +} +mov.b32 r1261, {rs135, rs135}; +{ +mul.f16x2 r1247, r1051, r1261; +} +mov.b32 r1258, {rs136, rs136}; +{ +mul.f16x2 r1250, r1054, r1258; +} +{ +sub.f16x2 r1253, r1247, r1250; +} +{ +mul.f16x2 r1256, r1051, r1258; +} +{ +fma.rn.f16x2 r1259, r1054, r1261, r1256; +} +mov.b32 r1277, {rs137, rs137}; +{ +mul.f16x2 r1263, r1063, r1277; +} +mov.b32 r1274, {rs138, rs138}; +{ +mul.f16x2 r1266, r1066, r1274; +} +{ +sub.f16x2 r1269, r1263, r1266; +} +{ +mul.f16x2 r1272, r1063, r1274; +} +{ +fma.rn.f16x2 r1275, r1066, r1277, r1272; +} +mov.b32 r1293, {rs139, rs139}; +{ +mul.f16x2 r1279, r1075, r1293; +} +mov.b32 r1290, {rs140, rs140}; +{ +mul.f16x2 r1282, r1078, r1290; +} +{ +sub.f16x2 r1285, r1279, r1282; +} +{ +mul.f16x2 r1288, r1075, r1290; +} +{ +fma.rn.f16x2 r1291, r1078, r1293, r1288; +} +mov.b32 r1309, {rs141, rs141}; +{ +mul.f16x2 r1295, r1087, r1309; +} +mov.b32 r1306, {rs142, rs142}; +{ +mul.f16x2 r1298, r1090, r1306; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1087, r1306; +} +{ +fma.rn.f16x2 r1307, r1090, r1309, r1304; +} +mov.b32 r1325, {rs143, rs143}; +{ +mul.f16x2 r1311, r1099, r1325; +} 
+mov.b32 r1322, {rs144, rs144}; +{ +mul.f16x2 r1314, r1102, r1322; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1099, r1322; +} +{ +fma.rn.f16x2 r1323, r1102, r1325, r1320; +} +mov.b32 r1341, {rs145, rs145}; +{ +mul.f16x2 r1327, r1111, r1341; +} +mov.b32 r1338, {rs146, rs146}; +{ +mul.f16x2 r1330, r1114, r1338; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1111, r1338; +} +{ +fma.rn.f16x2 r1339, r1114, r1341, r1336; +} +{ +add.f16x2 %0, r463, r1021; +} +{ +add.f16x2 %1, r466, r1024; +} +{ +sub.f16x2 %32, r463, r1021; +} +{ +sub.f16x2 %33, r466, r1024; +} +{ +add.f16x2 %2, r475, r1123; +} +{ +add.f16x2 %3, r478, r1129; +} +{ +sub.f16x2 %34, r475, r1123; +} +{ +sub.f16x2 %35, r478, r1129; +} +{ +add.f16x2 %4, r487, r1139; +} +{ +add.f16x2 %5, r490, r1145; +} +{ +sub.f16x2 %36, r487, r1139; +} +{ +sub.f16x2 %37, r490, r1145; +} +{ +add.f16x2 %6, r499, r1155; +} +{ +add.f16x2 %7, r502, r1161; +} +{ +sub.f16x2 %38, r499, r1155; +} +{ +sub.f16x2 %39, r502, r1161; +} +{ +add.f16x2 %8, r511, r1171; +} +{ +add.f16x2 %9, r514, r1177; +} +{ +sub.f16x2 %40, r511, r1171; +} +{ +sub.f16x2 %41, r514, r1177; +} +{ +add.f16x2 %10, r523, r1187; +} +{ +add.f16x2 %11, r526, r1193; +} +{ +sub.f16x2 %42, r523, r1187; +} +{ +sub.f16x2 %43, r526, r1193; +} +{ +add.f16x2 %12, r535, r1203; +} +{ +add.f16x2 %13, r538, r1209; +} +{ +sub.f16x2 %44, r535, r1203; +} +{ +sub.f16x2 %45, r538, r1209; +} +{ +add.f16x2 %14, r547, r1219; +} +{ +add.f16x2 %15, r550, r1225; +} +{ +sub.f16x2 %46, r547, r1219; +} +{ +sub.f16x2 %47, r550, r1225; +} +{ +add.f16x2 %16, r469, r1030; +} +{ +add.f16x2 %17, r472, r1229; +} +{ +sub.f16x2 %48, r469, r1030; +} +{ +sub.f16x2 %49, r472, r1229; +} +{ +add.f16x2 %18, r481, r1237; +} +{ +add.f16x2 %19, r484, r1243; +} +{ +sub.f16x2 %50, r481, r1237; +} +{ +sub.f16x2 %51, r484, r1243; +} +{ +add.f16x2 %20, r493, r1253; +} +{ +add.f16x2 %21, r496, r1259; +} +{ +sub.f16x2 %52, r493, r1253; +} +{ +sub.f16x2 %53, r496, r1259; +} +{ 
+add.f16x2 %22, r505, r1269; +} +{ +add.f16x2 %23, r508, r1275; +} +{ +sub.f16x2 %54, r505, r1269; +} +{ +sub.f16x2 %55, r508, r1275; +} +{ +add.f16x2 %24, r517, r1285; +} +{ +add.f16x2 %25, r520, r1291; +} +{ +sub.f16x2 %56, r517, r1285; +} +{ +sub.f16x2 %57, r520, r1291; +} +{ +add.f16x2 %26, r529, r1301; +} +{ +add.f16x2 %27, r532, r1307; +} +{ +sub.f16x2 %58, r529, r1301; +} +{ +sub.f16x2 %59, r532, r1307; +} +{ +add.f16x2 %28, r541, r1317; +} +{ +add.f16x2 %29, r544, r1323; +} +{ +sub.f16x2 %60, r541, r1317; +} +{ +sub.f16x2 %61, r544, r1323; +} +{ +add.f16x2 %30, r553, r1333; +} +{ +add.f16x2 %31, r556, r1339; +} +{ +sub.f16x2 %62, r553, r1333; +} +{ +sub.f16x2 %63, r556, r1339; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), 
"=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[25].y)), 
"r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[31].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<779, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<571>; +.reg .b64 rd<2>; +mov.u32 r559, %tid.y; +shl.b32 r560, r559, 8; +mov.u32 r561, %16; +add.s32 r562, r561, r560; +mov.u32 r563, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ 
+neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f2, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f2; +cvt.rn.f16.f32 high, f2; +mov.b32 r101, {low, high}; +} +mov.f32 f12, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r564, r563, 3; +shl.b32 r565, r563, 6; +and.b32 r566, r565, -256; +add.s32 r567, r562, r566; +cvt.rn.f32.u32 f47, r564; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, 
r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 
high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r568, r565, 192; +add.s32 r569, r567, r568; +st.shared.v4.f32 [r569], {r149, r152, r209, r216}; +st.shared.v4.f32 [r569+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r569+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r569+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r570, r564, -56, r569; +ld.shared.u32 r460, [r570]; +ld.shared.u32 r463, [r570+4]; +ld.shared.u32 r510, [r570+32]; +ld.shared.u32 r513, [r570+36]; +ld.shared.u32 r472, [r570+64]; +ld.shared.u32 r475, [r570+68]; +ld.shared.u32 r522, [r570+96]; +ld.shared.u32 
r525, [r570+100]; +ld.shared.u32 r461, [r570+128]; +ld.shared.u32 r464, [r570+132]; +ld.shared.u32 r511, [r570+160]; +ld.shared.u32 r514, [r570+164]; +ld.shared.u32 r473, [r570+192]; +ld.shared.u32 r476, [r570+196]; +ld.shared.u32 r523, [r570+224]; +ld.shared.u32 r526, [r570+228]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 %0, r459, r471; +} +{ +add.f16x2 %1, r462, r474; +} +{ +sub.f16x2 %8, r459, r471; +} +{ +sub.f16x2 %9, r462, r474; +} +{ +add.f16x2 %4, r465, r480; +} +{ +add.f16x2 %5, r468, r483; +} +{ +sub.f16x2 %12, r465, r480; +} +{ +sub.f16x2 %13, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 %2, r509, r521; +} +{ +add.f16x2 %3, r512, r524; +} +{ +sub.f16x2 %10, r509, r521; +} +{ +sub.f16x2 %11, r512, r524; +} +{ +add.f16x2 %6, r515, r530; +} +{ +add.f16x2 %7, r518, r533; +} +{ +sub.f16x2 %14, r515, r530; +} +{ +sub.f16x2 %15, r518, r533; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<780, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<571>; +.reg .b64 rd<2>; +mov.u32 r559, %tid.y; +shl.b32 r560, r559, 7; +mov.u32 r561, %16; +add.s32 r562, r561, r560; +mov.u32 r563, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f2, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f2; +cvt.rn.f16.f32 high, f2; +mov.b32 r101, 
{low, high}; +} +mov.f32 f12, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r564, r563, 3; +shl.b32 r565, r563, 5; +and.b32 r566, r565, -128; +add.s32 r567, r562, r566; +cvt.rn.f32.u32 f47, r564; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} 
+{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 
r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, 
{high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r568, r565, 96; +add.s32 r569, r567, r568; +st.shared.v4.f32 [r569], {r149, r209, r246, r283}; +st.shared.v4.f32 [r569+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r570, r564, -28, r569; +ld.shared.u32 r460, [r570]; +ld.shared.u32 r510, [r570+16]; +ld.shared.u32 r472, [r570+32]; +ld.shared.u32 r522, [r570+48]; +ld.shared.u32 r461, [r570+64]; +ld.shared.u32 r511, [r570+80]; +ld.shared.u32 r473, [r570+96]; +ld.shared.u32 r523, [r570+112]; +barrier.sync 0; +st.shared.v4.f32 [r569], {r152, r216, r253, r290}; +st.shared.v4.f32 [r569+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r570]; +ld.shared.u32 r513, [r570+16]; +ld.shared.u32 r475, [r570+32]; +ld.shared.u32 r525, [r570+48]; +ld.shared.u32 r464, [r570+64]; +ld.shared.u32 r514, [r570+80]; +ld.shared.u32 r476, [r570+96]; +ld.shared.u32 r526, [r570+112]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 
r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 %0, r459, r471; +} +{ +add.f16x2 %1, r462, r474; +} +{ +sub.f16x2 %8, r459, r471; +} +{ +sub.f16x2 %9, r462, r474; +} +{ +add.f16x2 %4, r465, r480; +} +{ +add.f16x2 %5, r468, r483; +} +{ +sub.f16x2 %12, r465, r480; +} +{ +sub.f16x2 %13, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 %2, r509, r521; +} +{ +add.f16x2 %3, r512, r524; +} +{ +sub.f16x2 %10, r509, r521; +} +{ +sub.f16x2 %11, r512, r524; +} +{ +add.f16x2 %6, r515, r530; +} +{ +add.f16x2 %7, r518, r533; +} +{ +sub.f16x2 %14, r515, r530; +} +{ +sub.f16x2 %15, r518, r533; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<781, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<373>; +.reg .b64 rd<2>; +mov.u32 r353, %tid.y; +shl.b32 r354, r353, 8; +mov.u32 r355, %8; +add.s32 r356, r355, r354; +mov.u32 r357, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r358, r357, 7; +shl.b32 r359, r357, 5; +and.b32 r360, r359, -256; +add.s32 r361, r356, r360; +cvt.rn.f32.u32 f21, r358; +mul.f32 f22, f21, 0f3E490FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ 
+mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r362, r359, 224; +add.s32 r363, r361, r362; +st.shared.v4.f32 [r363], {r27, r30, r63, r70}; +st.shared.v4.f32 [r363+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r364, r358, -24, r363; +ld.shared.u32 r166, [r364]; +ld.shared.u32 r169, [r364+4]; +ld.shared.u32 r178, [r364+64]; +ld.shared.u32 r181, [r364+68]; +ld.shared.u32 r167, [r364+128]; +ld.shared.u32 r170, [r364+132]; +ld.shared.u32 r179, [r364+192]; +ld.shared.u32 r182, [r364+196]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} 
+{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r365, r357, 4; +bfe.u32 r366, r357, 2, 1; +cvt.rn.f32.u32 f24, r366; +mul.f32 f25, f24, 0f3F490FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +shl.b32 r367, r357, 3; +and.b32 r368, r367, 24; +add.s32 r369, r361, r368; +barrier.sync 0; +and.b32 r370, r359, 128; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r191; +st.shared.u32 [r371+4], r194; +st.shared.u32 [r371+32], r227; +st.shared.u32 [r371+36], r234; +st.shared.u32 [r371+64], r264; +st.shared.u32 [r371+68], r271; +st.shared.u32 [r371+96], r301; +st.shared.u32 [r371+100], r308; +barrier.sync 0; +mad.lo.s32 r372, r365, -24, r371; +ld.shared.u32 r330, [r372]; +ld.shared.u32 r333, [r372+4]; +ld.shared.u32 r342, [r372+64]; +ld.shared.u32 r345, [r372+68]; +ld.shared.u32 r331, [r372+128]; +ld.shared.u32 r334, [r372+132]; +ld.shared.u32 r343, [r372+192]; +ld.shared.u32 r346, [r372+196]; +{ +add.f16x2 %0, r330, r331; +} +{ +add.f16x2 %1, r333, r334; +} +{ +sub.f16x2 %4, r330, r331; +} +{ +sub.f16x2 %5, r333, r334; +} +{ +add.f16x2 %2, r342, r343; +} +{ +add.f16x2 %3, r345, r346; +} +{ +sub.f16x2 %6, r342, r343; +} +{ +sub.f16x2 %7, r345, r346; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<782, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<373>; +.reg .b64 rd<2>; +mov.u32 r353, %tid.y; +shl.b32 r354, r353, 7; +mov.u32 r355, %8; +add.s32 r356, r355, r354; +mov.u32 r357, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r358, r357, 7; +shl.b32 r359, r357, 4; +and.b32 r360, r359, -128; +add.s32 r361, r356, r360; +cvt.rn.f32.u32 f21, r358; +mul.f32 f22, f21, 0f3E490FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r362, r359, 112; +add.s32 r363, r361, r362; +st.shared.v4.f32 [r363], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r364, r358, -12, r363; +ld.shared.u32 r166, [r364]; +ld.shared.u32 r178, [r364+32]; +ld.shared.u32 r167, [r364+64]; +ld.shared.u32 r179, [r364+96]; +barrier.sync 0; +st.shared.v4.f32 [r363], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r364]; +ld.shared.u32 r181, [r364+32]; +ld.shared.u32 r170, [r364+64]; +ld.shared.u32 r182, [r364+96]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, 
r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r365, r357, 4; +bfe.u32 r366, r357, 2, 1; +shl.b32 r367, r357, 2; +and.b32 r368, r367, 12; +add.s32 r369, r361, r368; +cvt.rn.f32.u32 f24, r366; +mul.f32 f25, f24, 0f3F490FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, 
r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +barrier.sync 0; +and.b32 r370, r359, 64; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r191; +st.shared.u32 [r371+16], r227; +st.shared.u32 [r371+32], r264; +st.shared.u32 [r371+48], r301; +barrier.sync 0; +mad.lo.s32 r372, r365, -12, r371; +ld.shared.u32 r330, [r372]; +ld.shared.u32 r342, [r372+32]; +ld.shared.u32 r331, [r372+64]; +ld.shared.u32 r343, [r372+96]; +barrier.sync 0; +st.shared.u32 [r371], r194; +st.shared.u32 [r371+16], r234; +st.shared.u32 [r371+32], r271; +st.shared.u32 [r371+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r372]; +ld.shared.u32 r345, [r372+32]; +ld.shared.u32 r334, [r372+64]; +ld.shared.u32 r346, [r372+96]; +{ +add.f16x2 %0, r330, r331; +} +{ +add.f16x2 %1, r333, r334; +} +{ +sub.f16x2 %4, r330, r331; +} +{ +sub.f16x2 %5, r333, r334; +} +{ +add.f16x2 %2, r342, r343; +} +{ +add.f16x2 %3, r345, r346; +} +{ +sub.f16x2 %6, r342, r343; +} +{ +sub.f16x2 %7, r345, r346; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), 
"=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<783, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<37>; +.reg .b32 r<256>; +.reg .b64 rd<2>; +mov.u32 r221, %tid.y; +shl.b32 r222, r221, 8; +mov.u32 r223, %4; +add.s32 r224, r223, r222; +mov.u32 r225, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r226, r225, 15; +shl.b32 r227, r225, 4; +and.b32 r228, r227, -256; +add.s32 r229, r224, r228; +cvt.rn.f32.u32 f25, r226; +mul.f32 f26, f25, 0f3E490FDB; +cos.approx.f32 f1, f26; +sin.approx.f32 f27, f26; +neg.f32 f2, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r230, r227, 240; +add.s32 r231, r229, r230; +st.shared.v2.f32 [r231], {r1, r4}; +st.shared.v2.f32 [r231+8], {r25, r32}; +barrier.sync 0; +shl.b32 r232, r225, 3; +and.b32 r233, r232, 120; +sub.s32 r234, r231, r233; +ld.shared.u32 r54, [r234]; +ld.shared.u32 r57, [r234+4]; +ld.shared.u32 r55, [r234+128]; +ld.shared.u32 r58, [r234+132]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, 
r57, r58; +} +bfe.u32 r235, r225, 1, 3; +cvt.rn.f32.u32 f28, r235; +mul.f32 f29, f28, 0f3EC90FDB; +cos.approx.f32 f7, f29; +sin.approx.f32 f30, f29; +neg.f32 f8, f30; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r236, r232, 8; +add.s32 r237, r229, r236; +barrier.sync 0; +and.b32 r238, r227, 224; +add.s32 r239, r237, r238; +st.shared.u32 [r239], r53; +st.shared.u32 [r239+4], r56; +st.shared.u32 [r239+16], r77; +st.shared.u32 [r239+20], r84; +barrier.sync 0; +and.b32 r240, r232, 112; +sub.s32 r241, r239, r240; +ld.shared.u32 r106, [r241]; +ld.shared.u32 r109, [r241+4]; +ld.shared.u32 r107, [r241+128]; +ld.shared.u32 r110, [r241+132]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r242, r225, 2, 2; +cvt.rn.f32.u32 f31, r242; +mul.f32 f32, f31, 0f3F490FDB; +cos.approx.f32 f13, f32; +sin.approx.f32 f33, f32; +neg.f32 f14, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r243, r232, 24; +add.s32 r244, r229, r243; +barrier.sync 0; +and.b32 r245, r227, 192; +add.s32 r246, r244, r245; +st.shared.u32 [r246], r105; +st.shared.u32 [r246+4], r108; 
+st.shared.u32 [r246+32], r129; +st.shared.u32 [r246+36], r136; +barrier.sync 0; +and.b32 r247, r232, 96; +sub.s32 r248, r246, r247; +ld.shared.u32 r158, [r248]; +ld.shared.u32 r161, [r248+4]; +ld.shared.u32 r159, [r248+128]; +ld.shared.u32 r162, [r248+132]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r249, r225, 3, 1; +cvt.rn.f32.u32 f34, r249; +mul.f32 f35, f34, 0f3FC90FDB; +cos.approx.f32 f19, f35; +sin.approx.f32 f36, f35; +neg.f32 f20, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +and.b32 r250, r232, 56; +add.s32 r251, r229, r250; +barrier.sync 0; +and.b32 r252, r227, 128; +add.s32 r253, r251, r252; +st.shared.u32 [r253], r157; +st.shared.u32 [r253+4], r160; +st.shared.u32 [r253+64], r181; +st.shared.u32 [r253+68], r188; +barrier.sync 0; +and.b32 r254, r232, 64; +sub.s32 r255, r253, r254; +ld.shared.u32 r210, [r255]; +ld.shared.u32 r213, [r255+4]; +ld.shared.u32 r211, [r255+128]; +ld.shared.u32 r214, [r255+132]; +{ +add.f16x2 %0, r210, r211; +} +{ +add.f16x2 %1, r213, r214; +} +{ +sub.f16x2 %2, r210, r211; +} +{ +sub.f16x2 %3, r213, r214; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<784, __half2, 1>(cufftdx::detail::complex<__half2> 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1283>; +.reg .b64 rd<2>; +mov.u32 r1271, %tid.y; +shl.b32 r1272, r1271, 8; +mov.u32 r1273, %32; +add.s32 r1274, r1273, r1272; +mov.u32 r1275, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f62, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r101, {low, high}; +} +mov.f32 f80, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, 
r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 
r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ 
+add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1276, r1275, 1; +shl.b32 r1277, r1275, 7; +and.b32 r1278, r1277, -256; +add.s32 r1279, r1274, r1278; +cvt.rn.f32.u32 f151, r1276; +mul.f32 f152, f151, 0f3E490FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, 
r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, 
high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; 
+mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, 
r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ 
+fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; 
+cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1280, r1277, 128; +add.s32 r1281, r1279, r1280; +st.shared.v4.f32 [r1281], {r521, r524, r629, r636}; +st.shared.v4.f32 [r1281+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r1281+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r1281+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r1281+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r1281+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r1281+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 
[r1281+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r1282, r1276, -120, r1281; +ld.shared.u32 r1176, [r1282]; +ld.shared.u32 r1179, [r1282+4]; +ld.shared.u32 r1188, [r1282+16]; +ld.shared.u32 r1191, [r1282+20]; +ld.shared.u32 r1200, [r1282+32]; +ld.shared.u32 r1203, [r1282+36]; +ld.shared.u32 r1212, [r1282+48]; +ld.shared.u32 r1215, [r1282+52]; +ld.shared.u32 r1224, [r1282+64]; +ld.shared.u32 r1227, [r1282+68]; +ld.shared.u32 r1236, [r1282+80]; +ld.shared.u32 r1239, [r1282+84]; +ld.shared.u32 r1248, [r1282+96]; +ld.shared.u32 r1251, [r1282+100]; +ld.shared.u32 r1260, [r1282+112]; +ld.shared.u32 r1263, [r1282+116]; +ld.shared.u32 r1177, [r1282+128]; +ld.shared.u32 r1180, [r1282+132]; +ld.shared.u32 r1189, [r1282+144]; +ld.shared.u32 r1192, [r1282+148]; +ld.shared.u32 r1201, [r1282+160]; +ld.shared.u32 r1204, [r1282+164]; +ld.shared.u32 r1213, [r1282+176]; +ld.shared.u32 r1216, [r1282+180]; +ld.shared.u32 r1225, [r1282+192]; +ld.shared.u32 r1228, [r1282+196]; +ld.shared.u32 r1237, [r1282+208]; +ld.shared.u32 r1240, [r1282+212]; +ld.shared.u32 r1249, [r1282+224]; +ld.shared.u32 r1252, [r1282+228]; +ld.shared.u32 r1261, [r1282+240]; +ld.shared.u32 r1264, [r1282+244]; +{ +add.f16x2 %0, r1176, r1177; +} +{ +add.f16x2 %1, r1179, r1180; +} +{ +sub.f16x2 %16, r1176, r1177; +} +{ +sub.f16x2 %17, r1179, r1180; +} +{ +add.f16x2 %2, r1188, r1189; +} +{ +add.f16x2 %3, r1191, r1192; +} +{ +sub.f16x2 %18, r1188, r1189; +} +{ +sub.f16x2 %19, r1191, r1192; +} +{ +add.f16x2 %4, r1200, r1201; +} +{ +add.f16x2 %5, r1203, r1204; +} +{ +sub.f16x2 %20, r1200, r1201; +} +{ +sub.f16x2 %21, r1203, r1204; +} +{ +add.f16x2 %6, r1212, r1213; +} +{ +add.f16x2 %7, r1215, r1216; +} +{ +sub.f16x2 %22, r1212, r1213; +} +{ +sub.f16x2 %23, r1215, r1216; +} +{ +add.f16x2 %8, r1224, r1225; +} +{ +add.f16x2 %9, r1227, r1228; +} +{ +sub.f16x2 %24, r1224, r1225; +} +{ +sub.f16x2 %25, r1227, r1228; +} +{ +add.f16x2 %10, r1236, r1237; +} +{ +add.f16x2 %11, r1239, r1240; +} +{ +sub.f16x2 
%26, r1236, r1237; +} +{ +sub.f16x2 %27, r1239, r1240; +} +{ +add.f16x2 %12, r1248, r1249; +} +{ +add.f16x2 %13, r1251, r1252; +} +{ +sub.f16x2 %28, r1248, r1249; +} +{ +sub.f16x2 %29, r1251, r1252; +} +{ +add.f16x2 %14, r1260, r1261; +} +{ +add.f16x2 %15, r1263, r1264; +} +{ +sub.f16x2 %30, r1260, r1261; +} +{ +sub.f16x2 %31, r1263, r1264; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<785, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<37>; +.reg .b32 r<256>; +.reg .b64 rd<2>; +mov.u32 r221, %tid.y; +shl.b32 r222, r221, 7; +mov.u32 r223, %4; +add.s32 r224, r223, r222; +mov.u32 r225, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r226, r225, 15; +shl.b32 r227, r225, 3; +and.b32 r228, r227, -128; +add.s32 r229, r224, r228; +cvt.rn.f32.u32 f25, r226; +mul.f32 f26, f25, 0f3E490FDB; +cos.approx.f32 f1, f26; +sin.approx.f32 f27, f26; +neg.f32 f2, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r230, r227, 120; +add.s32 r231, r229, r230; +st.shared.v2.f32 [r231], {r1, r25}; +barrier.sync 0; +shl.b32 r232, r225, 2; +and.b32 r233, r232, 60; +sub.s32 r234, r231, r233; +ld.shared.u32 r54, [r234]; +ld.shared.u32 r55, [r234+64]; +barrier.sync 0; +st.shared.v2.f32 [r231], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r234]; +ld.shared.u32 r58, [r234+64]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; 
+} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r235, r225, 1, 3; +and.b32 r236, r232, 4; +add.s32 r237, r229, r236; +cvt.rn.f32.u32 f28, r235; +mul.f32 f29, f28, 0f3EC90FDB; +cos.approx.f32 f7, f29; +sin.approx.f32 f30, f29; +neg.f32 f8, f30; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r238, r227, 112; +add.s32 r239, r237, r238; +st.shared.u32 [r239], r53; +st.shared.u32 [r239+8], r77; +barrier.sync 0; +and.b32 r240, r232, 56; +sub.s32 r241, r239, r240; +ld.shared.u32 r106, [r241]; +ld.shared.u32 r107, [r241+64]; +barrier.sync 0; +st.shared.u32 [r239], r56; +st.shared.u32 [r239+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r241]; +ld.shared.u32 r110, [r241+64]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r242, r225, 2, 2; +and.b32 r243, r232, 12; +add.s32 r244, r229, r243; +cvt.rn.f32.u32 f31, r242; +mul.f32 f32, f31, 0f3F490FDB; +cos.approx.f32 f13, f32; +sin.approx.f32 f33, f32; +neg.f32 f14, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r245, r227, 96; +add.s32 r246, r244, 
r245; +st.shared.u32 [r246], r105; +st.shared.u32 [r246+16], r129; +barrier.sync 0; +and.b32 r247, r232, 48; +sub.s32 r248, r246, r247; +ld.shared.u32 r158, [r248]; +ld.shared.u32 r159, [r248+64]; +barrier.sync 0; +st.shared.u32 [r246], r108; +st.shared.u32 [r246+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r248]; +ld.shared.u32 r162, [r248+64]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r249, r225, 3, 1; +and.b32 r250, r232, 28; +add.s32 r251, r229, r250; +cvt.rn.f32.u32 f34, r249; +mul.f32 f35, f34, 0f3FC90FDB; +cos.approx.f32 f19, f35; +sin.approx.f32 f36, f35; +neg.f32 f20, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +barrier.sync 0; +and.b32 r252, r227, 64; +add.s32 r253, r251, r252; +st.shared.u32 [r253], r157; +st.shared.u32 [r253+32], r181; +barrier.sync 0; +and.b32 r254, r232, 32; +sub.s32 r255, r253, r254; +ld.shared.u32 r210, [r255]; +ld.shared.u32 r211, [r255+64]; +barrier.sync 0; +st.shared.u32 [r253], r160; +st.shared.u32 [r253+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r255]; +ld.shared.u32 r214, [r255+64]; +{ +add.f16x2 %0, r210, r211; +} +{ +add.f16x2 %1, r213, r214; +} +{ +sub.f16x2 %2, r210, r211; +} +{ +sub.f16x2 %3, r213, r214; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<786, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1283>; +.reg .b64 rd<2>; +mov.u32 r1271, %tid.y; +shl.b32 r1272, r1271, 7; +mov.u32 r1273, %32; +add.s32 r1274, r1273, r1272; +mov.u32 r1275, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f62, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r101, {low, high}; +} +mov.f32 f80, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ 
+mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} 
+{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; 
+cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, 
r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1276, r1275, 1; +shl.b32 r1277, r1275, 6; +and.b32 r1278, r1277, -128; +add.s32 r1279, r1274, r1278; +cvt.rn.f32.u32 f151, r1276; +mul.f32 f152, f151, 0f3E490FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ 
+neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1280, r1277, 64; +add.s32 r1281, r1279, r1280; +st.shared.v4.f32 [r1281], {r521, r629, r666, r703}; +st.shared.v4.f32 [r1281+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r1281+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r1281+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r1282, r1276, -60, r1281; +ld.shared.u32 r1176, [r1282]; 
+ld.shared.u32 r1188, [r1282+8]; +ld.shared.u32 r1200, [r1282+16]; +ld.shared.u32 r1212, [r1282+24]; +ld.shared.u32 r1224, [r1282+32]; +ld.shared.u32 r1236, [r1282+40]; +ld.shared.u32 r1248, [r1282+48]; +ld.shared.u32 r1260, [r1282+56]; +ld.shared.u32 r1177, [r1282+64]; +ld.shared.u32 r1189, [r1282+72]; +ld.shared.u32 r1201, [r1282+80]; +ld.shared.u32 r1213, [r1282+88]; +ld.shared.u32 r1225, [r1282+96]; +ld.shared.u32 r1237, [r1282+104]; +ld.shared.u32 r1249, [r1282+112]; +ld.shared.u32 r1261, [r1282+120]; +barrier.sync 0; +st.shared.v4.f32 [r1281], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1281+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1281+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1281+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1282]; +ld.shared.u32 r1191, [r1282+8]; +ld.shared.u32 r1203, [r1282+16]; +ld.shared.u32 r1215, [r1282+24]; +ld.shared.u32 r1227, [r1282+32]; +ld.shared.u32 r1239, [r1282+40]; +ld.shared.u32 r1251, [r1282+48]; +ld.shared.u32 r1263, [r1282+56]; +ld.shared.u32 r1180, [r1282+64]; +ld.shared.u32 r1192, [r1282+72]; +ld.shared.u32 r1204, [r1282+80]; +ld.shared.u32 r1216, [r1282+88]; +ld.shared.u32 r1228, [r1282+96]; +ld.shared.u32 r1240, [r1282+104]; +ld.shared.u32 r1252, [r1282+112]; +ld.shared.u32 r1264, [r1282+120]; +{ +add.f16x2 %0, r1176, r1177; +} +{ +add.f16x2 %1, r1179, r1180; +} +{ +sub.f16x2 %16, r1176, r1177; +} +{ +sub.f16x2 %17, r1179, r1180; +} +{ +add.f16x2 %2, r1188, r1189; +} +{ +add.f16x2 %3, r1191, r1192; +} +{ +sub.f16x2 %18, r1188, r1189; +} +{ +sub.f16x2 %19, r1191, r1192; +} +{ +add.f16x2 %4, r1200, r1201; +} +{ +add.f16x2 %5, r1203, r1204; +} +{ +sub.f16x2 %20, r1200, r1201; +} +{ +sub.f16x2 %21, r1203, r1204; +} +{ +add.f16x2 %6, r1212, r1213; +} +{ +add.f16x2 %7, r1215, r1216; +} +{ +sub.f16x2 %22, r1212, r1213; +} +{ +sub.f16x2 %23, r1215, r1216; +} +{ +add.f16x2 %8, r1224, r1225; +} +{ +add.f16x2 %9, r1227, r1228; +} +{ +sub.f16x2 %24, r1224, r1225; +} +{ 
+sub.f16x2 %25, r1227, r1228; +} +{ +add.f16x2 %10, r1236, r1237; +} +{ +add.f16x2 %11, r1239, r1240; +} +{ +sub.f16x2 %26, r1236, r1237; +} +{ +sub.f16x2 %27, r1239, r1240; +} +{ +add.f16x2 %12, r1248, r1249; +} +{ +add.f16x2 %13, r1251, r1252; +} +{ +sub.f16x2 %28, r1248, r1249; +} +{ +sub.f16x2 %29, r1251, r1252; +} +{ +add.f16x2 %14, r1260, r1261; +} +{ +add.f16x2 %15, r1263, r1264; +} +{ +sub.f16x2 %30, r1260, r1261; +} +{ +sub.f16x2 %31, r1263, r1264; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..854310efbc1cd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp16_inv.hpp.inc @@ -0,0 +1,8137 @@ +#ifndef CUFFTDX_FFT_32_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_32_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<980, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<179>; +.reg .b32 r<1583>; +.reg .f64 fd<179>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %64, %72; +} +{ +add.f16x2 r4, %65, %73; +} +{ +sub.f16x2 r7, %64, %72; +} +{ +sub.f16x2 r10, %65, %73; +} +{ +add.f16x2 r13, %68, %76; +} +{ +add.f16x2 r16, %69, %77; +} +{ +sub.f16x2 r19, %68, %76; +} +{ +sub.f16x2 r22, %69, %77; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %66, %74; +} +{ +add.f16x2 r54, %67, %75; +} +{ +sub.f16x2 r57, %66, %74; +} 
+{ +sub.f16x2 r60, %67, %75; +} +{ +add.f16x2 r63, %70, %78; +} +{ +add.f16x2 r66, %71, %79; +} +{ +sub.f16x2 r69, %70, %78; +} +{ +sub.f16x2 r72, %71, %79; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f64 fd140, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd140; +} +{ +cvt.rn.f16.f64 rs2, fd140; +} +mov.f64 fd139, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs5, fd139; +} +{ +cvt.rn.f16.f64 rs6, fd140; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r86; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 r135, r27, r77; +} +{ +add.f16x2 r138, r30, r80; +} +{ +sub.f16x2 r141, r27, r77; +} +{ +sub.f16x2 r144, r30, r80; +} +{ +add.f16x2 r147, r39, r107; +} +{ +add.f16x2 r150, r42, r113; +} +{ +sub.f16x2 r153, r39, r107; +} +{ +sub.f16x2 r156, r42, r113; +} +{ +add.f16x2 r159, r33, r117; +} +{ +add.f16x2 r162, r36, r83; +} +{ +sub.f16x2 r165, r33, r117; +} +{ +sub.f16x2 r168, r36, r83; +} +{ +add.f16x2 r171, r45, r125; +} +{ +add.f16x2 r174, r48, r131; +} +{ +sub.f16x2 r177, r45, r125; +} +{ +sub.f16x2 r180, r48, r131; +} +{ +add.f16x2 r183, %80, %92; +} +{ +add.f16x2 r186, %82, %94; +} +{ +sub.f16x2 r189, %80, %92; +} +{ +sub.f16x2 r192, %82, %94; +} +{ +add.f16x2 r195, %88, %84; +} +{ +add.f16x2 r198, %90, %86; +} +{ +sub.f16x2 r201, %88, %84; +} +{ +sub.f16x2 r204, %90, %86; +} +{ +neg.f16x2 r207, r204; +} +{ +add.f16x2 r209, 
r183, r195; +} +{ +add.f16x2 r212, r186, r198; +} +{ +sub.f16x2 r215, r183, r195; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +add.f16x2 r221, r189, r207; +} +{ +add.f16x2 r224, r192, r201; +} +{ +sub.f16x2 r227, r189, r207; +} +{ +sub.f16x2 r230, r192, r201; +} +{ +add.f16x2 r233, %89, %85; +} +{ +add.f16x2 r236, %91, %87; +} +{ +sub.f16x2 r239, %89, %85; +} +{ +sub.f16x2 r242, %91, %87; +} +{ +add.f16x2 r245, %81, %93; +} +{ +add.f16x2 r248, %83, %95; +} +{ +sub.f16x2 r251, %81, %93; +} +{ +sub.f16x2 r254, %83, %95; +} +{ +neg.f16x2 r257, r254; +} +{ +add.f16x2 r259, r233, r245; +} +{ +add.f16x2 r262, r236, r248; +} +{ +sub.f16x2 r265, r233, r245; +} +{ +sub.f16x2 r268, r236, r248; +} +{ +add.f16x2 r271, r239, r257; +} +{ +add.f16x2 r274, r242, r251; +} +{ +sub.f16x2 r277, r239, r257; +} +{ +sub.f16x2 r280, r242, r251; +} +{ +cvt.rn.f16.f64 rs15, fd140; +} +{ +cvt.rn.f16.f64 rs16, fd140; +} +{ +cvt.rn.f16.f64 rs19, fd139; +} +{ +cvt.rn.f16.f64 rs20, fd140; +} +mov.b32 r297, {rs15, rs15}; +{ +mul.f16x2 r283, r271, r297; +} +mov.b32 r294, {rs16, rs16}; +{ +mul.f16x2 r286, r274, r294; +} +{ +sub.f16x2 r289, r283, r286; +} +{ +mul.f16x2 r292, r271, r294; +} +{ +fma.rn.f16x2 r295, r274, r297, r292; +} +{ +neg.f16x2 r299, r268; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r277, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r280, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r277, r312; +} +{ +fma.rn.f16x2 r313, r280, r315, r310; +} +{ +add.f16x2 r317, r209, r259; +} +{ +add.f16x2 r320, r212, r262; +} +{ +sub.f16x2 r323, r209, r259; +} +{ +sub.f16x2 r326, r212, r262; +} +{ +add.f16x2 r329, r221, r289; +} +{ +add.f16x2 r332, r224, r295; +} +{ +sub.f16x2 r335, r221, r289; +} +{ +sub.f16x2 r338, r224, r295; +} +{ +add.f16x2 r341, r215, r299; +} +{ +add.f16x2 r344, r218, r265; +} +{ +sub.f16x2 r347, r215, r299; +} +{ +sub.f16x2 r350, r218, r265; +} +{ +add.f16x2 r353, r227, r307; +} +{ +add.f16x2 r356, r230, r313; +} +{ +sub.f16x2 r359, 
r227, r307; +} +{ +sub.f16x2 r362, r230, r313; +} +mov.f64 fd136, 0d3FED906BCF328D46; +{ +cvt.rn.f16.f64 rs29, fd136; +} +mov.f64 fd144, 0d3FD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs30, fd144; +} +{ +cvt.rn.f16.f64 rs31, fd140; +} +{ +cvt.rn.f16.f64 rs32, fd140; +} +{ +cvt.rn.f16.f64 rs33, fd144; +} +{ +cvt.rn.f16.f64 rs34, fd136; +} +mov.f64 fd135, 0dBFD87DE2A6AEA963; +{ +cvt.rn.f16.f64 rs37, fd135; +} +{ +cvt.rn.f16.f64 rs38, fd136; +} +{ +cvt.rn.f16.f64 rs39, fd139; +} +{ +cvt.rn.f16.f64 rs40, fd140; +} +mov.f64 fd143, 0dBFED906BCF328D46; +{ +cvt.rn.f16.f64 rs41, fd143; +} +{ +cvt.rn.f16.f64 rs42, fd144; +} +mov.b32 r379, {rs29, rs29}; +{ +mul.f16x2 r365, r329, r379; +} +mov.b32 r376, {rs30, rs30}; +{ +mul.f16x2 r368, r332, r376; +} +{ +sub.f16x2 r371, r365, r368; +} +{ +mul.f16x2 r374, r329, r376; +} +{ +fma.rn.f16x2 r377, r332, r379, r374; +} +mov.b32 r395, {rs31, rs31}; +{ +mul.f16x2 r381, r341, r395; +} +mov.b32 r392, {rs32, rs32}; +{ +mul.f16x2 r384, r344, r392; +} +{ +sub.f16x2 r387, r381, r384; +} +{ +mul.f16x2 r390, r341, r392; +} +{ +fma.rn.f16x2 r393, r344, r395, r390; +} +mov.b32 r411, {rs33, rs33}; +{ +mul.f16x2 r397, r353, r411; +} +mov.b32 r408, {rs34, rs34}; +{ +mul.f16x2 r400, r356, r408; +} +{ +sub.f16x2 r403, r397, r400; +} +{ +mul.f16x2 r406, r353, r408; +} +{ +fma.rn.f16x2 r409, r356, r411, r406; +} +{ +neg.f16x2 r413, r326; +} +mov.b32 r429, {rs37, rs37}; +{ +mul.f16x2 r415, r335, r429; +} +mov.b32 r426, {rs38, rs38}; +{ +mul.f16x2 r418, r338, r426; +} +{ +sub.f16x2 r421, r415, r418; +} +{ +mul.f16x2 r424, r335, r426; +} +{ +fma.rn.f16x2 r427, r338, r429, r424; +} +mov.b32 r445, {rs39, rs39}; +{ +mul.f16x2 r431, r347, r445; +} +mov.b32 r442, {rs40, rs40}; +{ +mul.f16x2 r434, r350, r442; +} +{ +sub.f16x2 r437, r431, r434; +} +{ +mul.f16x2 r440, r347, r442; +} +{ +fma.rn.f16x2 r443, r350, r445, r440; +} +mov.b32 r461, {rs41, rs41}; +{ +mul.f16x2 r447, r359, r461; +} +mov.b32 r458, {rs42, rs42}; +{ +mul.f16x2 r450, r362, r458; +} +{ +sub.f16x2 
r453, r447, r450; +} +{ +mul.f16x2 r456, r359, r458; +} +{ +fma.rn.f16x2 r459, r362, r461, r456; +} +{ +add.f16x2 r463, r135, r317; +} +{ +add.f16x2 r466, r138, r320; +} +{ +sub.f16x2 r469, r135, r317; +} +{ +sub.f16x2 r472, r138, r320; +} +{ +add.f16x2 r475, r147, r371; +} +{ +add.f16x2 r478, r150, r377; +} +{ +sub.f16x2 r481, r147, r371; +} +{ +sub.f16x2 r484, r150, r377; +} +{ +add.f16x2 r487, r159, r387; +} +{ +add.f16x2 r490, r162, r393; +} +{ +sub.f16x2 r493, r159, r387; +} +{ +sub.f16x2 r496, r162, r393; +} +{ +add.f16x2 r499, r171, r403; +} +{ +add.f16x2 r502, r174, r409; +} +{ +sub.f16x2 r505, r171, r403; +} +{ +sub.f16x2 r508, r174, r409; +} +{ +add.f16x2 r511, r141, r413; +} +{ +add.f16x2 r514, r144, r323; +} +{ +sub.f16x2 r517, r141, r413; +} +{ +sub.f16x2 r520, r144, r323; +} +{ +add.f16x2 r523, r153, r421; +} +{ +add.f16x2 r526, r156, r427; +} +{ +sub.f16x2 r529, r153, r421; +} +{ +sub.f16x2 r532, r156, r427; +} +{ +add.f16x2 r535, r165, r437; +} +{ +add.f16x2 r538, r168, r443; +} +{ +sub.f16x2 r541, r165, r437; +} +{ +sub.f16x2 r544, r168, r443; +} +{ +add.f16x2 r547, r177, r453; +} +{ +add.f16x2 r550, r180, r459; +} +{ +sub.f16x2 r553, r177, r453; +} +{ +sub.f16x2 r556, r180, r459; +} +{ +add.f16x2 r559, %110, %106; +} +{ +add.f16x2 r562, %96, %108; +} +{ +sub.f16x2 r565, %110, %106; +} +{ +sub.f16x2 r568, %96, %108; +} +{ +add.f16x2 r571, %102, %98; +} +{ +add.f16x2 r574, %104, %100; +} +{ +sub.f16x2 r577, %102, %98; +} +{ +sub.f16x2 r580, %104, %100; +} +{ +neg.f16x2 r583, r580; +} +{ +add.f16x2 r585, r559, r571; +} +{ +add.f16x2 r588, r562, r574; +} +{ +sub.f16x2 r591, r559, r571; +} +{ +sub.f16x2 r594, r562, r574; +} +{ +add.f16x2 r597, r565, r583; +} +{ +add.f16x2 r600, r568, r577; +} +{ +sub.f16x2 r603, r565, r583; +} +{ +sub.f16x2 r606, r568, r577; +} +{ +add.f16x2 r609, %103, %99; +} +{ +add.f16x2 r612, %105, %101; +} +{ +sub.f16x2 r615, %103, %99; +} +{ +sub.f16x2 r618, %105, %101; +} +{ +add.f16x2 r621, %111, %107; +} +{ +add.f16x2 r624, 
%97, %109; +} +{ +sub.f16x2 r627, %111, %107; +} +{ +sub.f16x2 r630, %97, %109; +} +{ +neg.f16x2 r633, r630; +} +{ +add.f16x2 r635, r609, r621; +} +{ +add.f16x2 r638, r612, r624; +} +{ +sub.f16x2 r641, r609, r621; +} +{ +sub.f16x2 r644, r612, r624; +} +{ +add.f16x2 r647, r615, r633; +} +{ +add.f16x2 r650, r618, r627; +} +{ +sub.f16x2 r653, r615, r633; +} +{ +sub.f16x2 r656, r618, r627; +} +{ +cvt.rn.f16.f64 rs59, fd140; +} +{ +cvt.rn.f16.f64 rs60, fd140; +} +{ +cvt.rn.f16.f64 rs63, fd139; +} +{ +cvt.rn.f16.f64 rs64, fd140; +} +mov.b32 r673, {rs59, rs59}; +{ +mul.f16x2 r659, r647, r673; +} +mov.b32 r670, {rs60, rs60}; +{ +mul.f16x2 r662, r650, r670; +} +{ +sub.f16x2 r665, r659, r662; +} +{ +mul.f16x2 r668, r647, r670; +} +{ +fma.rn.f16x2 r671, r650, r673, r668; +} +{ +neg.f16x2 r675, r644; +} +mov.b32 r691, {rs63, rs63}; +{ +mul.f16x2 r677, r653, r691; +} +mov.b32 r688, {rs64, rs64}; +{ +mul.f16x2 r680, r656, r688; +} +{ +sub.f16x2 r683, r677, r680; +} +{ +mul.f16x2 r686, r653, r688; +} +{ +fma.rn.f16x2 r689, r656, r691, r686; +} +{ +add.f16x2 r693, r585, r635; +} +{ +add.f16x2 r696, r588, r638; +} +{ +sub.f16x2 r699, r585, r635; +} +{ +sub.f16x2 r702, r588, r638; +} +{ +add.f16x2 r705, r597, r665; +} +{ +add.f16x2 r708, r600, r671; +} +{ +sub.f16x2 r711, r597, r665; +} +{ +sub.f16x2 r714, r600, r671; +} +{ +add.f16x2 r717, r591, r675; +} +{ +add.f16x2 r720, r594, r641; +} +{ +sub.f16x2 r723, r591, r675; +} +{ +sub.f16x2 r726, r594, r641; +} +{ +add.f16x2 r729, r603, r683; +} +{ +add.f16x2 r732, r606, r689; +} +{ +sub.f16x2 r735, r603, r683; +} +{ +sub.f16x2 r738, r606, r689; +} +{ +add.f16x2 r741, %114, %126; +} +{ +add.f16x2 r744, %116, %112; +} +{ +sub.f16x2 r747, %114, %126; +} +{ +sub.f16x2 r750, %116, %112; +} +{ +add.f16x2 r753, %122, %118; +} +{ +add.f16x2 r756, %124, %120; +} +{ +sub.f16x2 r759, %122, %118; +} +{ +sub.f16x2 r762, %124, %120; +} +{ +neg.f16x2 r765, r762; +} +{ +add.f16x2 r767, r741, r753; +} +{ +add.f16x2 r770, r744, r756; +} +{ +sub.f16x2 
r773, r741, r753; +} +{ +sub.f16x2 r776, r744, r756; +} +{ +add.f16x2 r779, r747, r765; +} +{ +add.f16x2 r782, r750, r759; +} +{ +sub.f16x2 r785, r747, r765; +} +{ +sub.f16x2 r788, r750, r759; +} +{ +add.f16x2 r791, %123, %119; +} +{ +add.f16x2 r794, %125, %121; +} +{ +sub.f16x2 r797, %123, %119; +} +{ +sub.f16x2 r800, %125, %121; +} +{ +add.f16x2 r803, %115, %127; +} +{ +add.f16x2 r806, %117, %113; +} +{ +sub.f16x2 r809, %115, %127; +} +{ +sub.f16x2 r812, %117, %113; +} +{ +neg.f16x2 r815, r812; +} +{ +add.f16x2 r817, r791, r803; +} +{ +add.f16x2 r820, r794, r806; +} +{ +sub.f16x2 r823, r791, r803; +} +{ +sub.f16x2 r826, r794, r806; +} +{ +add.f16x2 r829, r797, r815; +} +{ +add.f16x2 r832, r800, r809; +} +{ +sub.f16x2 r835, r797, r815; +} +{ +sub.f16x2 r838, r800, r809; +} +{ +cvt.rn.f16.f64 rs73, fd140; +} +{ +cvt.rn.f16.f64 rs74, fd140; +} +{ +cvt.rn.f16.f64 rs77, fd139; +} +{ +cvt.rn.f16.f64 rs78, fd140; +} +mov.b32 r855, {rs73, rs73}; +{ +mul.f16x2 r841, r829, r855; +} +mov.b32 r852, {rs74, rs74}; +{ +mul.f16x2 r844, r832, r852; +} +{ +sub.f16x2 r847, r841, r844; +} +{ +mul.f16x2 r850, r829, r852; +} +{ +fma.rn.f16x2 r853, r832, r855, r850; +} +{ +neg.f16x2 r857, r826; +} +mov.b32 r873, {rs77, rs77}; +{ +mul.f16x2 r859, r835, r873; +} +mov.b32 r870, {rs78, rs78}; +{ +mul.f16x2 r862, r838, r870; +} +{ +sub.f16x2 r865, r859, r862; +} +{ +mul.f16x2 r868, r835, r870; +} +{ +fma.rn.f16x2 r871, r838, r873, r868; +} +{ +add.f16x2 r875, r767, r817; +} +{ +add.f16x2 r878, r770, r820; +} +{ +sub.f16x2 r881, r767, r817; +} +{ +sub.f16x2 r884, r770, r820; +} +{ +add.f16x2 r887, r779, r847; +} +{ +add.f16x2 r890, r782, r853; +} +{ +sub.f16x2 r893, r779, r847; +} +{ +sub.f16x2 r896, r782, r853; +} +{ +add.f16x2 r899, r773, r857; +} +{ +add.f16x2 r902, r776, r823; +} +{ +sub.f16x2 r905, r773, r857; +} +{ +sub.f16x2 r908, r776, r823; +} +{ +add.f16x2 r911, r785, r865; +} +{ +add.f16x2 r914, r788, r871; +} +{ +sub.f16x2 r917, r785, r865; +} +{ +sub.f16x2 r920, r788, r871; +} 
+{ +cvt.rn.f16.f64 rs87, fd136; +} +{ +cvt.rn.f16.f64 rs88, fd144; +} +{ +cvt.rn.f16.f64 rs89, fd140; +} +{ +cvt.rn.f16.f64 rs90, fd140; +} +{ +cvt.rn.f16.f64 rs91, fd144; +} +{ +cvt.rn.f16.f64 rs92, fd136; +} +{ +cvt.rn.f16.f64 rs95, fd135; +} +{ +cvt.rn.f16.f64 rs96, fd136; +} +{ +cvt.rn.f16.f64 rs97, fd139; +} +{ +cvt.rn.f16.f64 rs98, fd140; +} +{ +cvt.rn.f16.f64 rs99, fd143; +} +{ +cvt.rn.f16.f64 rs100, fd144; +} +mov.b32 r937, {rs87, rs87}; +{ +mul.f16x2 r923, r887, r937; +} +mov.b32 r934, {rs88, rs88}; +{ +mul.f16x2 r926, r890, r934; +} +{ +sub.f16x2 r929, r923, r926; +} +{ +mul.f16x2 r932, r887, r934; +} +{ +fma.rn.f16x2 r935, r890, r937, r932; +} +mov.b32 r953, {rs89, rs89}; +{ +mul.f16x2 r939, r899, r953; +} +mov.b32 r950, {rs90, rs90}; +{ +mul.f16x2 r942, r902, r950; +} +{ +sub.f16x2 r945, r939, r942; +} +{ +mul.f16x2 r948, r899, r950; +} +{ +fma.rn.f16x2 r951, r902, r953, r948; +} +mov.b32 r969, {rs91, rs91}; +{ +mul.f16x2 r955, r911, r969; +} +mov.b32 r966, {rs92, rs92}; +{ +mul.f16x2 r958, r914, r966; +} +{ +sub.f16x2 r961, r955, r958; +} +{ +mul.f16x2 r964, r911, r966; +} +{ +fma.rn.f16x2 r967, r914, r969, r964; +} +{ +neg.f16x2 r971, r884; +} +mov.b32 r987, {rs95, rs95}; +{ +mul.f16x2 r973, r893, r987; +} +mov.b32 r984, {rs96, rs96}; +{ +mul.f16x2 r976, r896, r984; +} +{ +sub.f16x2 r979, r973, r976; +} +{ +mul.f16x2 r982, r893, r984; +} +{ +fma.rn.f16x2 r985, r896, r987, r982; +} +mov.b32 r1003, {rs97, rs97}; +{ +mul.f16x2 r989, r905, r1003; +} +mov.b32 r1000, {rs98, rs98}; +{ +mul.f16x2 r992, r908, r1000; +} +{ +sub.f16x2 r995, r989, r992; +} +{ +mul.f16x2 r998, r905, r1000; +} +{ +fma.rn.f16x2 r1001, r908, r1003, r998; +} +mov.b32 r1019, {rs99, rs99}; +{ +mul.f16x2 r1005, r917, r1019; +} +mov.b32 r1016, {rs100, rs100}; +{ +mul.f16x2 r1008, r920, r1016; +} +{ +sub.f16x2 r1011, r1005, r1008; +} +{ +mul.f16x2 r1014, r917, r1016; +} +{ +fma.rn.f16x2 r1017, r920, r1019, r1014; +} +{ +add.f16x2 r1021, r693, r875; +} +{ +add.f16x2 r1024, r696, r878; +} +{ 
+sub.f16x2 r1027, r693, r875; +} +{ +sub.f16x2 r1030, r696, r878; +} +{ +add.f16x2 r1033, r705, r929; +} +{ +add.f16x2 r1036, r708, r935; +} +{ +sub.f16x2 r1039, r705, r929; +} +{ +sub.f16x2 r1042, r708, r935; +} +{ +add.f16x2 r1045, r717, r945; +} +{ +add.f16x2 r1048, r720, r951; +} +{ +sub.f16x2 r1051, r717, r945; +} +{ +sub.f16x2 r1054, r720, r951; +} +{ +add.f16x2 r1057, r729, r961; +} +{ +add.f16x2 r1060, r732, r967; +} +{ +sub.f16x2 r1063, r729, r961; +} +{ +sub.f16x2 r1066, r732, r967; +} +{ +add.f16x2 r1069, r699, r971; +} +{ +add.f16x2 r1072, r702, r881; +} +{ +sub.f16x2 r1075, r699, r971; +} +{ +sub.f16x2 r1078, r702, r881; +} +{ +add.f16x2 r1081, r711, r979; +} +{ +add.f16x2 r1084, r714, r985; +} +{ +sub.f16x2 r1087, r711, r979; +} +{ +sub.f16x2 r1090, r714, r985; +} +{ +add.f16x2 r1093, r723, r995; +} +{ +add.f16x2 r1096, r726, r1001; +} +{ +sub.f16x2 r1099, r723, r995; +} +{ +sub.f16x2 r1102, r726, r1001; +} +{ +add.f16x2 r1105, r735, r1011; +} +{ +add.f16x2 r1108, r738, r1017; +} +{ +sub.f16x2 r1111, r735, r1011; +} +{ +sub.f16x2 r1114, r738, r1017; +} +mov.f64 fd134, 0d3FEF6297CFF75CB0; +{ +cvt.rn.f16.f64 rs117, fd134; +} +mov.f64 fd146, 0d3FC8F8B83C69A60B; +{ +cvt.rn.f16.f64 rs118, fd146; +} +{ +cvt.rn.f16.f64 rs119, fd136; +} +{ +cvt.rn.f16.f64 rs120, fd144; +} +mov.f64 fd138, 0d3FEA9B66290EA1A3; +{ +cvt.rn.f16.f64 rs121, fd138; +} +mov.f64 fd142, 0d3FE1C73B39AE68C8; +{ +cvt.rn.f16.f64 rs122, fd142; +} +{ +cvt.rn.f16.f64 rs123, fd140; +} +{ +cvt.rn.f16.f64 rs124, fd140; +} +{ +cvt.rn.f16.f64 rs125, fd142; +} +{ +cvt.rn.f16.f64 rs126, fd138; +} +{ +cvt.rn.f16.f64 rs127, fd144; +} +{ +cvt.rn.f16.f64 rs128, fd136; +} +{ +cvt.rn.f16.f64 rs129, fd146; +} +{ +cvt.rn.f16.f64 rs130, fd134; +} +mov.f64 fd133, 0dBFC8F8B83C69A60B; +{ +cvt.rn.f16.f64 rs133, fd133; +} +{ +cvt.rn.f16.f64 rs134, fd134; +} +{ +cvt.rn.f16.f64 rs135, fd135; +} +{ +cvt.rn.f16.f64 rs136, fd136; +} +mov.f64 fd137, 0dBFE1C73B39AE68C8; +{ +cvt.rn.f16.f64 rs137, fd137; +} +{ 
+cvt.rn.f16.f64 rs138, fd138; +} +{ +cvt.rn.f16.f64 rs139, fd139; +} +{ +cvt.rn.f16.f64 rs140, fd140; +} +mov.f64 fd141, 0dBFEA9B66290EA1A3; +{ +cvt.rn.f16.f64 rs141, fd141; +} +{ +cvt.rn.f16.f64 rs142, fd142; +} +{ +cvt.rn.f16.f64 rs143, fd143; +} +{ +cvt.rn.f16.f64 rs144, fd144; +} +mov.f64 fd145, 0dBFEF6297CFF75CB0; +{ +cvt.rn.f16.f64 rs145, fd145; +} +{ +cvt.rn.f16.f64 rs146, fd146; +} +mov.b32 r1131, {rs117, rs117}; +{ +mul.f16x2 r1117, r1033, r1131; +} +mov.b32 r1128, {rs118, rs118}; +{ +mul.f16x2 r1120, r1036, r1128; +} +{ +sub.f16x2 r1123, r1117, r1120; +} +{ +mul.f16x2 r1126, r1033, r1128; +} +{ +fma.rn.f16x2 r1129, r1036, r1131, r1126; +} +mov.b32 r1147, {rs119, rs119}; +{ +mul.f16x2 r1133, r1045, r1147; +} +mov.b32 r1144, {rs120, rs120}; +{ +mul.f16x2 r1136, r1048, r1144; +} +{ +sub.f16x2 r1139, r1133, r1136; +} +{ +mul.f16x2 r1142, r1045, r1144; +} +{ +fma.rn.f16x2 r1145, r1048, r1147, r1142; +} +mov.b32 r1163, {rs121, rs121}; +{ +mul.f16x2 r1149, r1057, r1163; +} +mov.b32 r1160, {rs122, rs122}; +{ +mul.f16x2 r1152, r1060, r1160; +} +{ +sub.f16x2 r1155, r1149, r1152; +} +{ +mul.f16x2 r1158, r1057, r1160; +} +{ +fma.rn.f16x2 r1161, r1060, r1163, r1158; +} +mov.b32 r1179, {rs123, rs123}; +{ +mul.f16x2 r1165, r1069, r1179; +} +mov.b32 r1176, {rs124, rs124}; +{ +mul.f16x2 r1168, r1072, r1176; +} +{ +sub.f16x2 r1171, r1165, r1168; +} +{ +mul.f16x2 r1174, r1069, r1176; +} +{ +fma.rn.f16x2 r1177, r1072, r1179, r1174; +} +mov.b32 r1195, {rs125, rs125}; +{ +mul.f16x2 r1181, r1081, r1195; +} +mov.b32 r1192, {rs126, rs126}; +{ +mul.f16x2 r1184, r1084, r1192; +} +{ +sub.f16x2 r1187, r1181, r1184; +} +{ +mul.f16x2 r1190, r1081, r1192; +} +{ +fma.rn.f16x2 r1193, r1084, r1195, r1190; +} +mov.b32 r1211, {rs127, rs127}; +{ +mul.f16x2 r1197, r1093, r1211; +} +mov.b32 r1208, {rs128, rs128}; +{ +mul.f16x2 r1200, r1096, r1208; +} +{ +sub.f16x2 r1203, r1197, r1200; +} +{ +mul.f16x2 r1206, r1093, r1208; +} +{ +fma.rn.f16x2 r1209, r1096, r1211, r1206; +} +mov.b32 r1227, 
{rs129, rs129}; +{ +mul.f16x2 r1213, r1105, r1227; +} +mov.b32 r1224, {rs130, rs130}; +{ +mul.f16x2 r1216, r1108, r1224; +} +{ +sub.f16x2 r1219, r1213, r1216; +} +{ +mul.f16x2 r1222, r1105, r1224; +} +{ +fma.rn.f16x2 r1225, r1108, r1227, r1222; +} +{ +neg.f16x2 r1229, r1030; +} +mov.b32 r1245, {rs133, rs133}; +{ +mul.f16x2 r1231, r1039, r1245; +} +mov.b32 r1242, {rs134, rs134}; +{ +mul.f16x2 r1234, r1042, r1242; +} +{ +sub.f16x2 r1237, r1231, r1234; +} +{ +mul.f16x2 r1240, r1039, r1242; +} +{ +fma.rn.f16x2 r1243, r1042, r1245, r1240; +} +mov.b32 r1261, {rs135, rs135}; +{ +mul.f16x2 r1247, r1051, r1261; +} +mov.b32 r1258, {rs136, rs136}; +{ +mul.f16x2 r1250, r1054, r1258; +} +{ +sub.f16x2 r1253, r1247, r1250; +} +{ +mul.f16x2 r1256, r1051, r1258; +} +{ +fma.rn.f16x2 r1259, r1054, r1261, r1256; +} +mov.b32 r1277, {rs137, rs137}; +{ +mul.f16x2 r1263, r1063, r1277; +} +mov.b32 r1274, {rs138, rs138}; +{ +mul.f16x2 r1266, r1066, r1274; +} +{ +sub.f16x2 r1269, r1263, r1266; +} +{ +mul.f16x2 r1272, r1063, r1274; +} +{ +fma.rn.f16x2 r1275, r1066, r1277, r1272; +} +mov.b32 r1293, {rs139, rs139}; +{ +mul.f16x2 r1279, r1075, r1293; +} +mov.b32 r1290, {rs140, rs140}; +{ +mul.f16x2 r1282, r1078, r1290; +} +{ +sub.f16x2 r1285, r1279, r1282; +} +{ +mul.f16x2 r1288, r1075, r1290; +} +{ +fma.rn.f16x2 r1291, r1078, r1293, r1288; +} +mov.b32 r1309, {rs141, rs141}; +{ +mul.f16x2 r1295, r1087, r1309; +} +mov.b32 r1306, {rs142, rs142}; +{ +mul.f16x2 r1298, r1090, r1306; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1087, r1306; +} +{ +fma.rn.f16x2 r1307, r1090, r1309, r1304; +} +mov.b32 r1325, {rs143, rs143}; +{ +mul.f16x2 r1311, r1099, r1325; +} +mov.b32 r1322, {rs144, rs144}; +{ +mul.f16x2 r1314, r1102, r1322; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1099, r1322; +} +{ +fma.rn.f16x2 r1323, r1102, r1325, r1320; +} +mov.b32 r1341, {rs145, rs145}; +{ +mul.f16x2 r1327, r1111, r1341; +} +mov.b32 r1338, {rs146, rs146}; +{ +mul.f16x2 r1330, r1114, r1338; 
+} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1111, r1338; +} +{ +fma.rn.f16x2 r1339, r1114, r1341, r1336; +} +{ +add.f16x2 %0, r463, r1021; +} +{ +add.f16x2 %1, r466, r1024; +} +{ +sub.f16x2 %32, r463, r1021; +} +{ +sub.f16x2 %33, r466, r1024; +} +{ +add.f16x2 %2, r475, r1123; +} +{ +add.f16x2 %3, r478, r1129; +} +{ +sub.f16x2 %34, r475, r1123; +} +{ +sub.f16x2 %35, r478, r1129; +} +{ +add.f16x2 %4, r487, r1139; +} +{ +add.f16x2 %5, r490, r1145; +} +{ +sub.f16x2 %36, r487, r1139; +} +{ +sub.f16x2 %37, r490, r1145; +} +{ +add.f16x2 %6, r499, r1155; +} +{ +add.f16x2 %7, r502, r1161; +} +{ +sub.f16x2 %38, r499, r1155; +} +{ +sub.f16x2 %39, r502, r1161; +} +{ +add.f16x2 %8, r511, r1171; +} +{ +add.f16x2 %9, r514, r1177; +} +{ +sub.f16x2 %40, r511, r1171; +} +{ +sub.f16x2 %41, r514, r1177; +} +{ +add.f16x2 %10, r523, r1187; +} +{ +add.f16x2 %11, r526, r1193; +} +{ +sub.f16x2 %42, r523, r1187; +} +{ +sub.f16x2 %43, r526, r1193; +} +{ +add.f16x2 %12, r535, r1203; +} +{ +add.f16x2 %13, r538, r1209; +} +{ +sub.f16x2 %44, r535, r1203; +} +{ +sub.f16x2 %45, r538, r1209; +} +{ +add.f16x2 %14, r547, r1219; +} +{ +add.f16x2 %15, r550, r1225; +} +{ +sub.f16x2 %46, r547, r1219; +} +{ +sub.f16x2 %47, r550, r1225; +} +{ +add.f16x2 %16, r469, r1229; +} +{ +add.f16x2 %17, r472, r1027; +} +{ +sub.f16x2 %48, r469, r1229; +} +{ +sub.f16x2 %49, r472, r1027; +} +{ +add.f16x2 %18, r481, r1237; +} +{ +add.f16x2 %19, r484, r1243; +} +{ +sub.f16x2 %50, r481, r1237; +} +{ +sub.f16x2 %51, r484, r1243; +} +{ +add.f16x2 %20, r493, r1253; +} +{ +add.f16x2 %21, r496, r1259; +} +{ +sub.f16x2 %52, r493, r1253; +} +{ +sub.f16x2 %53, r496, r1259; +} +{ +add.f16x2 %22, r505, r1269; +} +{ +add.f16x2 %23, r508, r1275; +} +{ +sub.f16x2 %54, r505, r1269; +} +{ +sub.f16x2 %55, r508, r1275; +} +{ +add.f16x2 %24, r517, r1285; +} +{ +add.f16x2 %25, r520, r1291; +} +{ +sub.f16x2 %56, r517, r1285; +} +{ +sub.f16x2 %57, r520, r1291; +} +{ +add.f16x2 %26, r529, r1301; +} +{ +add.f16x2 %27, r532, 
r1307; +} +{ +sub.f16x2 %58, r529, r1301; +} +{ +sub.f16x2 %59, r532, r1307; +} +{ +add.f16x2 %28, r541, r1317; +} +{ +add.f16x2 %29, r544, r1323; +} +{ +sub.f16x2 %60, r541, r1317; +} +{ +sub.f16x2 %61, r544, r1323; +} +{ +add.f16x2 %30, r553, r1333; +} +{ +add.f16x2 %31, r556, r1339; +} +{ +sub.f16x2 %62, r553, r1333; +} +{ +sub.f16x2 %63, r556, r1339; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), 
"=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[13].x)), 
"r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[31].x))); +}; + + + + +/* cufftdx_private_function<981, __half2, 1>: machine-generated inline-PTX FFT stage operating in place on rmem[0..7] (each .x/.y is one packed f16x2 register; asm outputs %0-%15, inputs %17-%32) with smem (%16) as the shared-memory base address. Flow visible in the PTX: (1) add/sub/neg butterflies across the eight inputs using the constant 0f3F3504F3 (about 0.7071) and its negation; (2) a rotation factor built from cos.approx/sin.approx of (tid.x & 3) * 0f3E490FDB (about pi/16), with further factors derived from it by repeated complex multiplication, applied to seven of the eight intermediate values; (3) barrier.sync 0, four st.shared.v4 stores (64 bytes per thread at smem + tid.y*256 + (tid.x>>2)*256 + (tid.x&3)*64), barrier.sync 0, then sixteen ld.shared.u32 reads starting at that address minus (tid.x&3)*56; (4) two final four-input butterflies written back to rmem[0..7]. NOTE(review): looks like a 32-point transform split 8x4 across four cooperating threads, and because barrier.sync 0 executes twice it presumably must be called by all threads of the block under convergent control flow - confirm against the cuFFTDx database that generated it; do not hand-edit register numbers. */template<> __forceinline__ __device__ void cufftdx_private_function<981, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<571>; +.reg .b64 rd<2>; +mov.u32 r559, %tid.y; +shl.b32 r560, r559, 8; +mov.u32 r561, %16; +add.s32 r562, r561, r560; +mov.u32 r563, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f12, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f10, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f10; +cvt.rn.f16.f32 high, f10; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r564, r563, 3; +shl.b32 r565, r563, 6; +and.b32 r566, r565, -256; +add.s32 r567, r562, r566; +cvt.rn.f32.u32 f47, r564; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 
r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; 
+mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r568, r565, 192; +add.s32 r569, r567, r568; +st.shared.v4.f32 [r569], {r149, r152, r207, r216}; +st.shared.v4.f32 [r569+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r569+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r569+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r570, r564, -56, r569; +ld.shared.u32 r460, [r570]; +ld.shared.u32 r463, [r570+4]; +ld.shared.u32 r510, [r570+32]; +ld.shared.u32 r513, [r570+36]; +ld.shared.u32 r472, [r570+64]; +ld.shared.u32 r475, [r570+68]; +ld.shared.u32 r522, [r570+96]; +ld.shared.u32 r525, [r570+100]; +ld.shared.u32 r461, [r570+128]; +ld.shared.u32 r464, [r570+132]; +ld.shared.u32 r511, [r570+160]; +ld.shared.u32 r514, [r570+164]; +ld.shared.u32 r473, [r570+192]; +ld.shared.u32 r476, [r570+196]; +ld.shared.u32 r523, [r570+224]; +ld.shared.u32 r526, [r570+228]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; 
+} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 %0, r459, r471; +} +{ +add.f16x2 %1, r462, r474; +} +{ +sub.f16x2 %8, r459, r471; +} +{ +sub.f16x2 %9, r462, r474; +} +{ +add.f16x2 %4, r465, r483; +} +{ +add.f16x2 %5, r468, r477; +} +{ +sub.f16x2 %12, r465, r483; +} +{ +sub.f16x2 %13, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 %2, r509, r521; +} +{ +add.f16x2 %3, r512, r524; +} +{ +sub.f16x2 %10, r509, r521; +} +{ +sub.f16x2 %11, r512, r524; +} +{ +add.f16x2 %6, r515, r533; +} +{ +add.f16x2 %7, r518, r527; +} +{ +sub.f16x2 %14, r515, r533; +} +{ +sub.f16x2 %15, r518, r527; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +/* cufftdx_private_function<982, __half2, 1>: machine-generated inline-PTX FFT stage operating in place on rmem[0..7] (each .x/.y is one packed f16x2 register; asm outputs %0-%15, inputs %17-%32) with smem (%16) as the shared-memory base address. The butterfly and rotation-factor arithmetic appears to match the id-981 specialization defined just before it (same constants 0f3F3504F3 and 0f3E490FDB, same (tid.x & 3) index); the shared-memory exchange differs: the per-thread slot is 32 bytes at smem + tid.y*128 + (tid.x>>2)*128 + (tid.x&3)*32, and the exchange runs in two passes - barrier.sync 0, two st.shared.v4 stores holding the first 32-bit word of each intermediate pair, barrier.sync 0, eight ld.shared.u32 reads starting at that address minus (tid.x&3)*28, then the same sequence again for the second word of each pair - so barrier.sync 0 executes four times per call. NOTE(review): presumably this is a reduced-shared-memory variant of the same transform and must be called by all threads of the block under convergent control flow because of the barriers - confirm against the cuFFTDx database that generated it; do not hand-edit register numbers. */template<> __forceinline__ __device__ void cufftdx_private_function<982, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<571>; +.reg .b64 rd<2>; +mov.u32 r559, %tid.y; +shl.b32 r560, r559, 7; +mov.u32 r561, %16; +add.s32 r562, r561, r560; +mov.u32 r563, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f12, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; +cvt.rn.f16.f32 high, f12; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f10, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f10; +cvt.rn.f16.f32 high, f10; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f12; 
+cvt.rn.f16.f32 high, f12; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r564, r563, 3; +shl.b32 r565, r563, 5; +and.b32 r566, r565, -128; +add.s32 r567, r562, r566; +cvt.rn.f32.u32 f47, r564; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 
r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, 
{high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; 
+mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r568, r565, 96; +add.s32 r569, r567, r568; +st.shared.v4.f32 [r569], {r149, r207, r244, r281}; +st.shared.v4.f32 [r569+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r570, r564, -28, r569; +ld.shared.u32 r460, [r570]; +ld.shared.u32 r510, [r570+16]; +ld.shared.u32 r472, [r570+32]; +ld.shared.u32 r522, [r570+48]; +ld.shared.u32 r461, [r570+64]; +ld.shared.u32 r511, [r570+80]; +ld.shared.u32 r473, [r570+96]; +ld.shared.u32 r523, [r570+112]; +barrier.sync 0; +st.shared.v4.f32 [r569], {r152, r216, r253, r290}; +st.shared.v4.f32 [r569+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r570]; +ld.shared.u32 r513, [r570+16]; +ld.shared.u32 r475, [r570+32]; +ld.shared.u32 r525, [r570+48]; +ld.shared.u32 r464, [r570+64]; +ld.shared.u32 r514, [r570+80]; +ld.shared.u32 r476, [r570+96]; +ld.shared.u32 r526, [r570+112]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 %0, r459, r471; +} +{ +add.f16x2 %1, r462, r474; +} +{ +sub.f16x2 %8, r459, r471; +} +{ +sub.f16x2 %9, 
r462, r474; +} +{ +add.f16x2 %4, r465, r483; +} +{ +add.f16x2 %5, r468, r477; +} +{ +sub.f16x2 %12, r465, r483; +} +{ +sub.f16x2 %13, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 %2, r509, r521; +} +{ +add.f16x2 %3, r512, r524; +} +{ +sub.f16x2 %10, r509, r521; +} +{ +sub.f16x2 %11, r512, r524; +} +{ +add.f16x2 %6, r515, r533; +} +{ +add.f16x2 %7, r518, r527; +} +{ +sub.f16x2 %14, r515, r533; +} +{ +sub.f16x2 %15, r518, r527; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<983, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<373>; +.reg .b64 rd<2>; +mov.u32 
r353, %tid.y; +shl.b32 r354, r353, 8; +mov.u32 r355, %8; +add.s32 r356, r355, r354; +mov.u32 r357, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r358, r357, 7; +shl.b32 r359, r357, 5; +and.b32 r360, r359, -256; +add.s32 r361, r356, r360; +cvt.rn.f32.u32 f21, r358; +mul.f32 f22, f21, 0f3E490FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, 
r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r362, r359, 224; +add.s32 r363, r361, r362; +st.shared.v4.f32 [r363], {r27, r30, r61, r70}; +st.shared.v4.f32 [r363+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r364, r358, -24, r363; +ld.shared.u32 r166, [r364]; +ld.shared.u32 r169, [r364+4]; +ld.shared.u32 r178, [r364+64]; +ld.shared.u32 r181, [r364+68]; +ld.shared.u32 r167, [r364+128]; +ld.shared.u32 r170, [r364+132]; +ld.shared.u32 r179, [r364+192]; +ld.shared.u32 r182, [r364+196]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, 
r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r365, r357, 4; +bfe.u32 r366, r357, 2, 1; +cvt.rn.f32.u32 f24, r366; +mul.f32 f25, f24, 0f3F490FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +shl.b32 r367, r357, 3; +and.b32 r368, r367, 24; +add.s32 r369, r361, r368; +barrier.sync 0; +and.b32 r370, r359, 128; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r191; +st.shared.u32 [r371+4], r194; +st.shared.u32 [r371+32], r225; +st.shared.u32 [r371+36], r234; +st.shared.u32 [r371+64], r262; +st.shared.u32 [r371+68], r271; +st.shared.u32 [r371+96], r299; +st.shared.u32 [r371+100], r308; +barrier.sync 0; +mad.lo.s32 r372, r365, -24, r371; +ld.shared.u32 r330, [r372]; +ld.shared.u32 r333, [r372+4]; +ld.shared.u32 r342, [r372+64]; +ld.shared.u32 r345, [r372+68]; +ld.shared.u32 r331, [r372+128]; +ld.shared.u32 r334, [r372+132]; +ld.shared.u32 r343, [r372+192]; +ld.shared.u32 r346, [r372+196]; +{ +add.f16x2 %0, r330, r331; +} +{ +add.f16x2 %1, r333, r334; +} +{ +sub.f16x2 %4, r330, r331; +} +{ +sub.f16x2 %5, r333, r334; +} +{ +add.f16x2 %2, r342, r343; +} +{ +add.f16x2 %3, r345, r346; +} +{ +sub.f16x2 %6, r342, r343; +} +{ +sub.f16x2 %7, r345, r346; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<984, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned 
smem){ +/* Specialization 984 (__half2): machine-generated inline-PTX FFT routine that transforms rmem[0..3] in place. Each .x and .y member is passed as one 32-bit register holding a packed f16x2 pair (.x/.y are presumably the real/imaginary parts - confirm against cufftdx::detail::complex). Operand map: %0-%7 are the outputs rmem[0].x through rmem[3].y, %8 is smem (used as a shared-memory byte address), %9-%16 are the same eight rmem words read as inputs. Flow: (1) a 4-point add/sub butterfly over the four inputs; (2) three of the four results are rotated by twiddles built from a base pair (cos a, -sin a) and its 2nd and 3rd complex powers, with a = (tid.x AND 7) * 0f3E490FDB (about pi/16), computed with cos.approx/sin.approx and rounded to f16; (3) an exchange through shared memory in the region based at smem + tid.y*128 + ((tid.x*16) AND -128), done as two separate passes (first the four first-component words, then the four second-component words), every store bracketed by barrier.sync 0; (4) a second 4-point butterfly with a = (bit 2 of tid.x) * 0f3F490FDB (about pi/4); (5) a second two-pass shared-memory exchange; (6) final add/sub butterflies that write %0-%7. The input operands are all consumed in step (1), and the outputs are written only in step (6). NOTE(review): the 4 x 4 x 2 stage structure with four elements per thread suggests a 32-point transform spread over 8 threads - confirm against the cuFFTDx database entry that references id 984. */ +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<373>; +.reg .b64 rd<2>; +mov.u32 r353, %tid.y; +shl.b32 r354, r353, 7; +mov.u32 r355, %8; +add.s32 r356, r355, r354; +mov.u32 r357, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r358, r357, 7; +shl.b32 r359, r357, 4; +and.b32 r360, r359, -128; +add.s32 r361, r356, r360; +cvt.rn.f32.u32 f21, r358; +mul.f32 f22, f21, 0f3E490FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg
.f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r362, r359, 112; +add.s32 r363, r361, r362; +st.shared.v4.f32 [r363], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r364, r358, -12, r363; +ld.shared.u32 r166, [r364]; +ld.shared.u32 r178, [r364+32]; +ld.shared.u32 r167, [r364+64]; +ld.shared.u32 r179, [r364+96]; +barrier.sync 0; +st.shared.v4.f32 [r363], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r364]; +ld.shared.u32 r181, [r364+32]; +ld.shared.u32 r170, [r364+64]; +ld.shared.u32 r182, [r364+96]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{
+sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r365, r357, 4; +bfe.u32 r366, r357, 2, 1; +shl.b32 r367, r357, 2; +and.b32 r368, r367, 12; +add.s32 r369, r361, r368; +cvt.rn.f32.u32 f24, r366; +mul.f32 f25, f24, 0f3F490FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +}
+{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +barrier.sync 0; +and.b32 r370, r359, 64; +add.s32 r371, r369, r370; +st.shared.u32 [r371], r191; +st.shared.u32 [r371+16], r225; +st.shared.u32 [r371+32], r262; +st.shared.u32 [r371+48], r299; +barrier.sync 0; +mad.lo.s32 r372, r365, -12, r371; +ld.shared.u32 r330, [r372]; +ld.shared.u32 r342, [r372+32]; +ld.shared.u32 r331, [r372+64]; +ld.shared.u32 r343, [r372+96]; +barrier.sync 0; +st.shared.u32 [r371], r194; +st.shared.u32 [r371+16], r234; +st.shared.u32 [r371+32], r271; +st.shared.u32 [r371+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r372]; +ld.shared.u32 r345, [r372+32]; +ld.shared.u32 r334, [r372+64]; +ld.shared.u32 r346, [r372+96]; +{ +add.f16x2 %0, r330, r331; +} +{ +add.f16x2 %1, r333, r334; +} +{ +sub.f16x2 %4, r330, r331; +} +{ +sub.f16x2 %5, r333, r334; +} +{ +add.f16x2 %2, r342, r343; +} +{ +add.f16x2 %3, r345, r346; +} +{ +sub.f16x2 %6, r342, r343; +} +{ +sub.f16x2 %7, r345, r346; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)));
+}; + + + +/* Specialization 985 (__half2): machine-generated inline-PTX FFT routine that transforms rmem[0..1] in place. Each .x and .y member is passed as one 32-bit register holding a packed f16x2 pair (.x/.y are presumably the real/imaginary parts - confirm against cufftdx::detail::complex). Operand map: %0-%3 are the outputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y; %4 is smem (used as a shared-memory byte address); %5-%8 are the same four rmem words read as inputs. Flow: five add/sub butterfly stages. After each of the first four stages the difference term is rotated by a twiddle built from (cos a, -sin a), computed with cos.approx/sin.approx and rounded to f16, and the sum/rotated pair is exchanged through shared memory in the region based at smem + tid.y*256 + ((tid.x*16) AND -256); every group of stores is bracketed by barrier.sync 0. Angles a per stage: (tid.x AND 15) * 0f3E490FDB (about pi/16), then tid.x bits 1-3 * 0f3EC90FDB (about pi/8), then bits 2-3 * 0f3F490FDB (about pi/4), then bit 3 * 0f3FC90FDB (about pi/2). The fifth butterfly writes %0-%3. NOTE(review): five radix-2 stages with two elements per thread suggest a 32-point transform spread over 16 threads - confirm against the cuFFTDx database entry that references id 985. */ +template<> __forceinline__ __device__ void cufftdx_private_function<985, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<37>; +.reg .b32 r<256>; +.reg .b64 rd<2>; +mov.u32 r221, %tid.y; +shl.b32 r222, r221, 8; +mov.u32 r223, %4; +add.s32 r224, r223, r222; +mov.u32 r225, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r226, r225, 15; +shl.b32 r227, r225, 4; +and.b32 r228, r227, -256; +add.s32 r229, r224, r228; +cvt.rn.f32.u32 f25, r226; +mul.f32 f26, f25, 0f3E490FDB; +cos.approx.f32 f1, f26; +sin.approx.f32 f27, f26; +neg.f32 f2, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r230, r227, 240; +add.s32 r231, r229, r230; +st.shared.v2.f32 [r231], {r1, r4}; +st.shared.v2.f32 [r231+8], {r23, r32}; +barrier.sync 0; +shl.b32 r232, r225, 3; +and.b32 r233, r232, 120; +sub.s32 r234, r231, r233; +ld.shared.u32 r54, [r234]; +ld.shared.u32 r57, [r234+4]; +ld.shared.u32 r55, [r234+128]; +ld.shared.u32 r58, [r234+132]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r235, r225, 1, 3; +cvt.rn.f32.u32 f28, r235; +mul.f32 f29, f28, 0f3EC90FDB; +cos.approx.f32 f7, f29; +sin.approx.f32 f30, f29; +neg.f32 f8, f30; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32
{low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r236, r232, 8; +add.s32 r237, r229, r236; +barrier.sync 0; +and.b32 r238, r227, 224; +add.s32 r239, r237, r238; +st.shared.u32 [r239], r53; +st.shared.u32 [r239+4], r56; +st.shared.u32 [r239+16], r75; +st.shared.u32 [r239+20], r84; +barrier.sync 0; +and.b32 r240, r232, 112; +sub.s32 r241, r239, r240; +ld.shared.u32 r106, [r241]; +ld.shared.u32 r109, [r241+4]; +ld.shared.u32 r107, [r241+128]; +ld.shared.u32 r110, [r241+132]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r242, r225, 2, 2; +cvt.rn.f32.u32 f31, r242; +mul.f32 f32, f31, 0f3F490FDB; +cos.approx.f32 f13, f32; +sin.approx.f32 f33, f32; +neg.f32 f14, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r243, r232, 24; +add.s32 r244, r229, r243; +barrier.sync 0; +and.b32 r245, r227, 192; +add.s32 r246, r244, r245; +st.shared.u32 [r246], r105; +st.shared.u32 [r246+4], r108; +st.shared.u32 [r246+32], r127; +st.shared.u32 [r246+36], r136; +barrier.sync 0; +and.b32 r247, r232, 96; +sub.s32 r248, r246, r247; +ld.shared.u32 r158, [r248]; +ld.shared.u32 r161, [r248+4]; +ld.shared.u32 r159, [r248+128]; +ld.shared.u32 r162, [r248+132]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +}
+bfe.u32 r249, r225, 3, 1; +cvt.rn.f32.u32 f34, r249; +mul.f32 f35, f34, 0f3FC90FDB; +cos.approx.f32 f19, f35; +sin.approx.f32 f36, f35; +neg.f32 f20, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +and.b32 r250, r232, 56; +add.s32 r251, r229, r250; +barrier.sync 0; +and.b32 r252, r227, 128; +add.s32 r253, r251, r252; +st.shared.u32 [r253], r157; +st.shared.u32 [r253+4], r160; +st.shared.u32 [r253+64], r179; +st.shared.u32 [r253+68], r188; +barrier.sync 0; +and.b32 r254, r232, 64; +sub.s32 r255, r253, r254; +ld.shared.u32 r210, [r255]; +ld.shared.u32 r213, [r255+4]; +ld.shared.u32 r211, [r255+128]; +ld.shared.u32 r214, [r255+132]; +{ +add.f16x2 %0, r210, r211; +} +{ +add.f16x2 %1, r213, r214; +} +{ +sub.f16x2 %2, r210, r211; +} +{ +sub.f16x2 %3, r213, r214; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<986, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1283>; +.reg .b64 rd<2>; +mov.u32 r1271, %tid.y; +shl.b32 r1272, r1271, 8; +mov.u32 r1273, %32; +add.s32 r1274, r1273, r1272; +mov.u32 r1275, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2
r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f80, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f78, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ 
+add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 
r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 
r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1276, r1275, 1; +shl.b32 r1277, r1275, 7; +and.b32 r1278, r1277, -256; +add.s32 r1279, r1274, r1278; +cvt.rn.f32.u32 f151, r1276; +mul.f32 f152, f151, 0f3E490FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, 
r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1280, r1277, 128; +add.s32 r1281, r1279, r1280; +st.shared.v4.f32 [r1281], {r521, r524, r627, r636}; +st.shared.v4.f32 [r1281+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r1281+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r1281+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r1281+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r1281+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r1281+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r1281+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r1282, r1276, -120, r1281; +ld.shared.u32 r1176, [r1282]; +ld.shared.u32 r1179, [r1282+4]; +ld.shared.u32 r1188, [r1282+16]; +ld.shared.u32 r1191, [r1282+20]; +ld.shared.u32 r1200, [r1282+32]; +ld.shared.u32 r1203, [r1282+36]; +ld.shared.u32 r1212, [r1282+48]; +ld.shared.u32 r1215, [r1282+52]; +ld.shared.u32 r1224, [r1282+64]; 
+ld.shared.u32 r1227, [r1282+68]; +ld.shared.u32 r1236, [r1282+80]; +ld.shared.u32 r1239, [r1282+84]; +ld.shared.u32 r1248, [r1282+96]; +ld.shared.u32 r1251, [r1282+100]; +ld.shared.u32 r1260, [r1282+112]; +ld.shared.u32 r1263, [r1282+116]; +ld.shared.u32 r1177, [r1282+128]; +ld.shared.u32 r1180, [r1282+132]; +ld.shared.u32 r1189, [r1282+144]; +ld.shared.u32 r1192, [r1282+148]; +ld.shared.u32 r1201, [r1282+160]; +ld.shared.u32 r1204, [r1282+164]; +ld.shared.u32 r1213, [r1282+176]; +ld.shared.u32 r1216, [r1282+180]; +ld.shared.u32 r1225, [r1282+192]; +ld.shared.u32 r1228, [r1282+196]; +ld.shared.u32 r1237, [r1282+208]; +ld.shared.u32 r1240, [r1282+212]; +ld.shared.u32 r1249, [r1282+224]; +ld.shared.u32 r1252, [r1282+228]; +ld.shared.u32 r1261, [r1282+240]; +ld.shared.u32 r1264, [r1282+244]; +{ +add.f16x2 %0, r1176, r1177; +} +{ +add.f16x2 %1, r1179, r1180; +} +{ +sub.f16x2 %16, r1176, r1177; +} +{ +sub.f16x2 %17, r1179, r1180; +} +{ +add.f16x2 %2, r1188, r1189; +} +{ +add.f16x2 %3, r1191, r1192; +} +{ +sub.f16x2 %18, r1188, r1189; +} +{ +sub.f16x2 %19, r1191, r1192; +} +{ +add.f16x2 %4, r1200, r1201; +} +{ +add.f16x2 %5, r1203, r1204; +} +{ +sub.f16x2 %20, r1200, r1201; +} +{ +sub.f16x2 %21, r1203, r1204; +} +{ +add.f16x2 %6, r1212, r1213; +} +{ +add.f16x2 %7, r1215, r1216; +} +{ +sub.f16x2 %22, r1212, r1213; +} +{ +sub.f16x2 %23, r1215, r1216; +} +{ +add.f16x2 %8, r1224, r1225; +} +{ +add.f16x2 %9, r1227, r1228; +} +{ +sub.f16x2 %24, r1224, r1225; +} +{ +sub.f16x2 %25, r1227, r1228; +} +{ +add.f16x2 %10, r1236, r1237; +} +{ +add.f16x2 %11, r1239, r1240; +} +{ +sub.f16x2 %26, r1236, r1237; +} +{ +sub.f16x2 %27, r1239, r1240; +} +{ +add.f16x2 %12, r1248, r1249; +} +{ +add.f16x2 %13, r1251, r1252; +} +{ +sub.f16x2 %28, r1248, r1249; +} +{ +sub.f16x2 %29, r1251, r1252; +} +{ +add.f16x2 %14, r1260, r1261; +} +{ +add.f16x2 %15, r1263, r1264; +} +{ +sub.f16x2 %30, r1260, r1261; +} +{ +sub.f16x2 %31, r1263, r1264; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), 
"r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + +/* cufftdx_private_function<987, __half2, 1>: generated inline-PTX routine that works in place on rmem[0] and rmem[1] (four packed f16x2 registers: .x and .y of each element). Operand map: %0-%3 write rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y; %4 is smem; %5-%8 read those same four registers. The body runs five add/sub butterfly rounds; after each of the first four rounds the difference terms are multiplied by cos.approx/sin.approx twiddles whose angle is a bit-field of %tid.x times 0f3E490FDB, 0f3EC90FDB, 0f3F490FDB or 0f3FC90FDB (about pi/16, pi/8, pi/4, pi/2), and the results are exchanged between threads through shared memory in the region based at smem + 128*%tid.y + ((8*%tid.x) & -128). Each exchange issues four barrier.sync 0 instructions (16 in total), so the call must be reached by all threads that synchronize on barrier 0. NOTE(review): presumably a 32-point radix-2 transform shared by 16 consecutive %tid.x lanes - confirm against the cuFFTDx generator. NOTE(review): the asm statement stores to and loads from shared memory but declares no memory clobber - confirm that callers do not depend on the compiler ordering other smem accesses around it. */ +template<> __forceinline__ __device__ void cufftdx_private_function<987, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<37>; +.reg .b32 r<256>; +.reg .b64 rd<2>; +mov.u32 r221, %tid.y; +shl.b32 r222, r221, 7; +mov.u32 r223, %4; +add.s32 r224, r223, r222; +mov.u32 r225, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r226, r225, 15; +shl.b32 r227, r225, 3; +and.b32 r228, r227, -128; +add.s32 r229, r224, r228; +cvt.rn.f32.u32 f25, r226; +mul.f32 f26, f25, 0f3E490FDB; +cos.approx.f32 f1, f26; +sin.approx.f32 f27, f26; +neg.f32 f2, f27; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r230, r227, 120; +add.s32 r231, r229, r230; +st.shared.v2.f32 [r231], {r1, r23}; +barrier.sync 0; +shl.b32 r232, r225, 2; +and.b32 r233, r232, 60; +sub.s32 r234, r231, r233; +ld.shared.u32 r54, [r234]; +ld.shared.u32 r55, [r234+64]; +barrier.sync 0; +st.shared.v2.f32 [r231], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r234]; +ld.shared.u32 r58, [r234+64]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r235, r225, 1, 3; +and.b32 r236, r232, 4; +add.s32 r237, r229, r236; +cvt.rn.f32.u32 f28, r235; +mul.f32 f29, f28, 0f3EC90FDB; +cos.approx.f32 f7, f29; +sin.approx.f32 f30, f29; +neg.f32 f8, f30; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{
+.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r238, r227, 112; +add.s32 r239, r237, r238; +st.shared.u32 [r239], r53; +st.shared.u32 [r239+8], r75; +barrier.sync 0; +and.b32 r240, r232, 56; +sub.s32 r241, r239, r240; +ld.shared.u32 r106, [r241]; +ld.shared.u32 r107, [r241+64]; +barrier.sync 0; +st.shared.u32 [r239], r56; +st.shared.u32 [r239+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r241]; +ld.shared.u32 r110, [r241+64]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r242, r225, 2, 2; +and.b32 r243, r232, 12; +add.s32 r244, r229, r243; +cvt.rn.f32.u32 f31, r242; +mul.f32 f32, f31, 0f3F490FDB; +cos.approx.f32 f13, f32; +sin.approx.f32 f33, f32; +neg.f32 f14, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r245, r227, 96; +add.s32 r246, r244, r245; +st.shared.u32 [r246], r105; +st.shared.u32 [r246+16], r127; +barrier.sync 0; +and.b32 r247, r232, 48; +sub.s32 r248, r246, r247; +ld.shared.u32 r158, [r248]; +ld.shared.u32 r159, [r248+64]; +barrier.sync 0; +st.shared.u32 [r246], r108; +st.shared.u32 [r246+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r248]; +ld.shared.u32 r162, [r248+64]; +{ +add.f16x2 r157, r158,
r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r249, r225, 3, 1; +and.b32 r250, r232, 28; +add.s32 r251, r229, r250; +cvt.rn.f32.u32 f34, r249; +mul.f32 f35, f34, 0f3FC90FDB; +cos.approx.f32 f19, f35; +sin.approx.f32 f36, f35; +neg.f32 f20, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +barrier.sync 0; +and.b32 r252, r227, 64; +add.s32 r253, r251, r252; +st.shared.u32 [r253], r157; +st.shared.u32 [r253+32], r179; +barrier.sync 0; +and.b32 r254, r232, 32; +sub.s32 r255, r253, r254; +ld.shared.u32 r210, [r255]; +ld.shared.u32 r211, [r255+64]; +barrier.sync 0; +st.shared.u32 [r253], r160; +st.shared.u32 [r253+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r255]; +ld.shared.u32 r214, [r255+64]; +{ +add.f16x2 %0, r210, r211; +} +{ +add.f16x2 %1, r213, r214; +} +{ +sub.f16x2 %2, r210, r211; +} +{ +sub.f16x2 %3, r213, r214; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<988, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1283>; +.reg .b64 rd<2>; +mov.u32 r1271, %tid.y; +shl.b32 r1272, r1271, 7; +mov.u32 r1273, %32; +add.s32 r1274, r1273, r1272; +mov.u32 r1275, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{
+add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f80, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f78, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} 
+{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, 
r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1276, r1275, 1; +shl.b32 r1277, r1275, 6; +and.b32 r1278, r1277, -128; +add.s32 r1279, r1274, r1278; +cvt.rn.f32.u32 f151, r1276; +mul.f32 f152, f151, 0f3E490FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; 
+mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 
r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; 
+cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1280, r1277, 64; +add.s32 r1281, r1279, r1280; +st.shared.v4.f32 [r1281], {r521, r627, r664, r701}; +st.shared.v4.f32 [r1281+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r1281+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r1281+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r1282, r1276, -60, r1281; +ld.shared.u32 r1176, [r1282]; +ld.shared.u32 r1188, [r1282+8]; +ld.shared.u32 r1200, [r1282+16]; +ld.shared.u32 r1212, [r1282+24]; +ld.shared.u32 r1224, [r1282+32]; +ld.shared.u32 r1236, [r1282+40]; +ld.shared.u32 r1248, [r1282+48]; +ld.shared.u32 r1260, [r1282+56]; +ld.shared.u32 r1177, [r1282+64]; +ld.shared.u32 r1189, [r1282+72]; +ld.shared.u32 r1201, [r1282+80]; +ld.shared.u32 r1213, [r1282+88]; 
+ld.shared.u32 r1225, [r1282+96]; +ld.shared.u32 r1237, [r1282+104]; +ld.shared.u32 r1249, [r1282+112]; +ld.shared.u32 r1261, [r1282+120]; +barrier.sync 0; +st.shared.v4.f32 [r1281], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1281+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1281+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1281+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1282]; +ld.shared.u32 r1191, [r1282+8]; +ld.shared.u32 r1203, [r1282+16]; +ld.shared.u32 r1215, [r1282+24]; +ld.shared.u32 r1227, [r1282+32]; +ld.shared.u32 r1239, [r1282+40]; +ld.shared.u32 r1251, [r1282+48]; +ld.shared.u32 r1263, [r1282+56]; +ld.shared.u32 r1180, [r1282+64]; +ld.shared.u32 r1192, [r1282+72]; +ld.shared.u32 r1204, [r1282+80]; +ld.shared.u32 r1216, [r1282+88]; +ld.shared.u32 r1228, [r1282+96]; +ld.shared.u32 r1240, [r1282+104]; +ld.shared.u32 r1252, [r1282+112]; +ld.shared.u32 r1264, [r1282+120]; +{ +add.f16x2 %0, r1176, r1177; +} +{ +add.f16x2 %1, r1179, r1180; +} +{ +sub.f16x2 %16, r1176, r1177; +} +{ +sub.f16x2 %17, r1179, r1180; +} +{ +add.f16x2 %2, r1188, r1189; +} +{ +add.f16x2 %3, r1191, r1192; +} +{ +sub.f16x2 %18, r1188, r1189; +} +{ +sub.f16x2 %19, r1191, r1192; +} +{ +add.f16x2 %4, r1200, r1201; +} +{ +add.f16x2 %5, r1203, r1204; +} +{ +sub.f16x2 %20, r1200, r1201; +} +{ +sub.f16x2 %21, r1203, r1204; +} +{ +add.f16x2 %6, r1212, r1213; +} +{ +add.f16x2 %7, r1215, r1216; +} +{ +sub.f16x2 %22, r1212, r1213; +} +{ +sub.f16x2 %23, r1215, r1216; +} +{ +add.f16x2 %8, r1224, r1225; +} +{ +add.f16x2 %9, r1227, r1228; +} +{ +sub.f16x2 %24, r1224, r1225; +} +{ +sub.f16x2 %25, r1227, r1228; +} +{ +add.f16x2 %10, r1236, r1237; +} +{ +add.f16x2 %11, r1239, r1240; +} +{ +sub.f16x2 %26, r1236, r1237; +} +{ +sub.f16x2 %27, r1239, r1240; +} +{ +add.f16x2 %12, r1248, r1249; +} +{ +add.f16x2 %13, r1251, r1252; +} +{ +sub.f16x2 %28, r1248, r1249; +} +{ +sub.f16x2 %29, r1251, r1252; +} +{ +add.f16x2 %14, r1260, r1261; +} +{ +add.f16x2 %15, 
r1263, r1264; +} +{ +sub.f16x2 %30, r1260, r1261; +} +{ +sub.f16x2 %31, r1263, r1264; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), 
"r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..70c174cab04d9 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_fwd.hpp.inc @@ -0,0 +1,2353 @@ +#ifndef CUFFTDX_FFT_32_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_32_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<32, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<612>; +.reg .b64 rd<2>; +add.f32 f129, %64, %106; +add.f32 f130, %65, %108; +sub.f32 f131, %64, %106; +sub.f32 f132, %65, %108; +add.f32 f133, %85, %128; +add.f32 f134, %87, %129; +sub.f32 f135, %85, %128; +sub.f32 f136, %87, %129; +add.f32 f137, f129, f133; +add.f32 f138, f130, f134; +sub.f32 f139, f129, f133; +sub.f32 f140, f130, f134; +add.f32 f141, f131, f136; +sub.f32 f142, f132, f135; +sub.f32 f143, f131, f136; +add.f32 f144, f132, f135; +add.f32 f145, %74, %117; +add.f32 f146, %76, %119; +sub.f32 f147, %74, %117; +sub.f32 f148, %76, %119; +add.f32 f149, %96, %138; +add.f32 f150, %97, %140; +sub.f32 f151, %96, %138; +sub.f32 f152, %97, %140; +add.f32 f153, f145, f149; +add.f32 f154, f146, f150; +sub.f32 f155, f145, f149; +sub.f32 f156, f146, f150; +add.f32 f157, f147, f152; +sub.f32 f158, f148, f151; +sub.f32 f159, f147, f152; +add.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f158, 0fBF3504F3; +sub.f32 f163, f161, f162; +mul.f32 f164, f158, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 
f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +add.f32 f171, f138, f154; +sub.f32 f172, f137, f153; +sub.f32 f173, f138, f154; +add.f32 f174, f141, f163; +add.f32 f175, f142, f165; +sub.f32 f176, f141, f163; +sub.f32 f177, f142, f165; +add.f32 f178, f139, f156; +sub.f32 f179, f140, f155; +sub.f32 f180, f139, f156; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +add.f32 f183, f144, f169; +sub.f32 f184, f143, f168; +sub.f32 f185, f144, f169; +add.f32 f186, %69, %112; +add.f32 f187, %71, %113; +sub.f32 f188, %69, %112; +sub.f32 f189, %71, %113; +add.f32 f190, %90, %133; +add.f32 f191, %92, %135; +sub.f32 f192, %90, %133; +sub.f32 f193, %92, %135; +add.f32 f194, f186, f190; +add.f32 f195, f187, f191; +sub.f32 f196, f186, f190; +sub.f32 f197, f187, f191; +add.f32 f198, f188, f193; +sub.f32 f199, f189, f192; +sub.f32 f200, f188, f193; +add.f32 f201, f189, f192; +add.f32 f202, %80, %122; +add.f32 f203, %81, %124; +sub.f32 f204, %80, %122; +sub.f32 f205, %81, %124; +add.f32 f206, %101, %144; +add.f32 f207, %103, %145; +sub.f32 f208, %101, %144; +sub.f32 f209, %103, %145; +add.f32 f210, f202, f206; +add.f32 f211, f203, f207; +sub.f32 f212, f202, f206; +sub.f32 f213, f203, f207; +add.f32 f214, f204, f209; +sub.f32 f215, f205, f208; +sub.f32 f216, f204, f209; +add.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f215, 0fBF3504F3; +sub.f32 f220, f218, f219; +mul.f32 f221, f215, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +add.f32 f228, f195, f211; +sub.f32 f229, f194, f210; +sub.f32 f230, f195, f211; +add.f32 f231, f198, f220; +add.f32 f232, f199, f222; +sub.f32 f233, f198, f220; +sub.f32 f234, f199, f222; +add.f32 f235, f196, f213; +sub.f32 f236, f197, f212; +sub.f32 f237, f196, f213; +add.f32 f238, f197, f212; +add.f32 f239, f200, 
f225; +add.f32 f240, f201, f226; +sub.f32 f241, f200, f225; +sub.f32 f242, f201, f226; +mul.f32 f243, f231, 0f3F6C835E; +mul.f32 f244, f232, 0fBEC3EF15; +sub.f32 f245, f243, f244; +mul.f32 f246, f232, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f236, 0fBF3504F3; +sub.f32 f250, f248, f249; +mul.f32 f251, f236, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f253, f239, 0f3EC3EF15; +mul.f32 f254, f240, 0fBF6C835E; +sub.f32 f255, f253, f254; +mul.f32 f256, f240, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f258, f233, 0fBEC3EF15; +mul.f32 f259, f234, 0fBF6C835E; +sub.f32 f260, f258, f259; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f267, f241, 0fBF6C835E; +mul.f32 f268, f242, 0fBEC3EF15; +sub.f32 f269, f267, f268; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +add.f32 f273, f171, f228; +sub.f32 f274, f170, f227; +sub.f32 f275, f171, f228; +add.f32 f276, f174, f245; +add.f32 f277, f175, f247; +sub.f32 f278, f174, f245; +sub.f32 f279, f175, f247; +add.f32 f280, f178, f250; +add.f32 f281, f179, f252; +sub.f32 f282, f178, f250; +sub.f32 f283, f179, f252; +add.f32 f284, f182, f255; +add.f32 f285, f183, f257; +sub.f32 f286, f182, f255; +sub.f32 f287, f183, f257; +add.f32 f288, f172, f230; +sub.f32 f289, f173, f229; +sub.f32 f290, f172, f230; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +add.f32 f293, f177, f262; +sub.f32 f294, f176, f260; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +add.f32 f297, f181, f266; +sub.f32 f298, f180, f265; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +add.f32 f301, f185, f271; +sub.f32 f302, f184, f269; +sub.f32 f303, f185, f271; +add.f32 f304, %66, %109; +add.f32 f305, %68, %111; +sub.f32 f306, %66, %109; +sub.f32 
f307, %68, %111; +add.f32 f308, %88, %130; +add.f32 f309, %89, %132; +sub.f32 f310, %88, %130; +sub.f32 f311, %89, %132; +add.f32 f312, f304, f308; +add.f32 f313, f305, f309; +sub.f32 f314, f304, f308; +sub.f32 f315, f305, f309; +add.f32 f316, f306, f311; +sub.f32 f317, f307, f310; +sub.f32 f318, f306, f311; +add.f32 f319, f307, f310; +add.f32 f320, %77, %120; +add.f32 f321, %79, %121; +sub.f32 f322, %77, %120; +sub.f32 f323, %79, %121; +add.f32 f324, %98, %141; +add.f32 f325, %100, %143; +sub.f32 f326, %98, %141; +sub.f32 f327, %100, %143; +add.f32 f328, f320, f324; +add.f32 f329, f321, f325; +sub.f32 f330, f320, f324; +sub.f32 f331, f321, f325; +add.f32 f332, f322, f327; +sub.f32 f333, f323, f326; +sub.f32 f334, f322, f327; +add.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f333, 0fBF3504F3; +sub.f32 f338, f336, f337; +mul.f32 f339, f333, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +add.f32 f346, f313, f329; +sub.f32 f347, f312, f328; +sub.f32 f348, f313, f329; +add.f32 f349, f316, f338; +add.f32 f350, f317, f340; +sub.f32 f351, f316, f338; +sub.f32 f352, f317, f340; +add.f32 f353, f314, f331; +sub.f32 f354, f315, f330; +sub.f32 f355, f314, f331; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +add.f32 f358, f319, f344; +sub.f32 f359, f318, f343; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %114; +add.f32 f362, %73, %116; +sub.f32 f363, %72, %114; +sub.f32 f364, %73, %116; +add.f32 f365, %93, %136; +add.f32 f366, %95, %137; +sub.f32 f367, %93, %136; +sub.f32 f368, %95, %137; +add.f32 f369, f361, f365; +add.f32 f370, f362, f366; +sub.f32 f371, f361, f365; +sub.f32 f372, f362, f366; +add.f32 f373, f363, f368; +sub.f32 f374, f364, f367; +sub.f32 f375, f363, f368; +add.f32 f376, f364, f367; +add.f32 f377, %82, %125; +add.f32 f378, %84, %127; +sub.f32 f379, %82, %125; +sub.f32 f380, 
%84, %127; +add.f32 f381, %104, %146; +add.f32 f382, %105, %147; +sub.f32 f383, %104, %146; +sub.f32 f384, %105, %147; +add.f32 f385, f377, f381; +add.f32 f386, f378, f382; +sub.f32 f387, f377, f381; +sub.f32 f388, f378, f382; +add.f32 f389, f379, f384; +sub.f32 f390, f380, f383; +sub.f32 f391, f379, f384; +add.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f390, 0fBF3504F3; +sub.f32 f395, f393, f394; +mul.f32 f396, f390, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +add.f32 f403, f370, f386; +sub.f32 f404, f369, f385; +sub.f32 f405, f370, f386; +add.f32 f406, f373, f395; +add.f32 f407, f374, f397; +sub.f32 f408, f373, f395; +sub.f32 f409, f374, f397; +add.f32 f410, f371, f388; +sub.f32 f411, f372, f387; +sub.f32 f412, f371, f388; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +add.f32 f415, f376, f401; +sub.f32 f416, f375, f400; +sub.f32 f417, f376, f401; +mul.f32 f418, f406, 0f3F6C835E; +mul.f32 f419, f407, 0fBEC3EF15; +sub.f32 f420, f418, f419; +mul.f32 f421, f407, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f411, 0fBF3504F3; +sub.f32 f425, f423, f424; +mul.f32 f426, f411, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f428, f414, 0f3EC3EF15; +mul.f32 f429, f415, 0fBF6C835E; +sub.f32 f430, f428, f429; +mul.f32 f431, f415, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f433, f408, 0fBEC3EF15; +mul.f32 f434, f409, 0fBF6C835E; +sub.f32 f435, f433, f434; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f442, f416, 0fBF6C835E; +mul.f32 f443, f417, 0fBEC3EF15; +sub.f32 f444, f442, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 
0fBEC3EF15, f445; +add.f32 f447, f345, f402; +add.f32 f448, f346, f403; +sub.f32 f449, f345, f402; +sub.f32 f450, f346, f403; +add.f32 f451, f349, f420; +add.f32 f452, f350, f422; +sub.f32 f453, f349, f420; +sub.f32 f454, f350, f422; +add.f32 f455, f353, f425; +add.f32 f456, f354, f427; +sub.f32 f457, f353, f425; +sub.f32 f458, f354, f427; +add.f32 f459, f357, f430; +add.f32 f460, f358, f432; +sub.f32 f461, f357, f430; +sub.f32 f462, f358, f432; +add.f32 f463, f347, f405; +sub.f32 f464, f348, f404; +sub.f32 f465, f347, f405; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +add.f32 f468, f352, f437; +sub.f32 f469, f351, f435; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +add.f32 f472, f356, f441; +sub.f32 f473, f355, f440; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +add.f32 f476, f360, f446; +sub.f32 f477, f359, f444; +sub.f32 f478, f360, f446; +mul.f32 f479, f451, 0f3F7B14BE; +mul.f32 f480, f452, 0fBE47C5C2; +sub.f32 f481, f479, f480; +mul.f32 f482, f452, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f484, f455, 0f3F6C835E; +mul.f32 f485, f456, 0fBEC3EF15; +sub.f32 f486, f484, f485; +mul.f32 f487, f456, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f489, f459, 0f3F54DB31; +mul.f32 f490, f460, 0fBF0E39DA; +sub.f32 f491, f489, f490; +mul.f32 f492, f460, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f464, 0fBF3504F3; +sub.f32 f496, f494, f495; +mul.f32 f497, f464, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f499, f467, 0f3F0E39DA; +mul.f32 f500, f468, 0fBF54DB31; +sub.f32 f501, f499, f500; +mul.f32 f502, f468, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f504, f471, 0f3EC3EF15; +mul.f32 f505, f472, 0fBF6C835E; +sub.f32 f506, f504, f505; +mul.f32 f507, f472, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f509, f475, 0f3E47C5C2; +mul.f32 f510, f476, 0fBF7B14BE; +sub.f32 f511, f509, f510; +mul.f32 f512, 
f476, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f514, f453, 0fBE47C5C2; +mul.f32 f515, f454, 0fBF7B14BE; +sub.f32 f516, f514, f515; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f519, f457, 0fBEC3EF15; +mul.f32 f520, f458, 0fBF6C835E; +sub.f32 f521, f519, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f524, f461, 0fBF0E39DA; +mul.f32 f525, f462, 0fBF54DB31; +sub.f32 f526, f524, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f533, f469, 0fBF54DB31; +mul.f32 f534, f470, 0fBF0E39DA; +sub.f32 f535, f533, f534; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f538, f473, 0fBF6C835E; +mul.f32 f539, f474, 0fBEC3EF15; +sub.f32 f540, f538, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f543, f477, 0fBF7B14BE; +mul.f32 f544, f478, 0fBE47C5C2; +sub.f32 f545, f543, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 %1, f273, f448; +add.f32 %0, f272, f447; +add.f32 %3, f277, f483; +add.f32 %2, f276, f481; +add.f32 %5, f281, f488; +add.f32 %4, f280, f486; +add.f32 %7, f285, f493; +add.f32 %6, f284, f491; +add.f32 %9, f289, f498; +add.f32 %8, f288, f496; +add.f32 %11, f293, f503; +add.f32 %10, f292, f501; +add.f32 %13, f297, f508; +add.f32 %12, f296, f506; +add.f32 %15, f301, f513; +add.f32 %14, f300, f511; +sub.f32 %17, f275, f449; +add.f32 %16, f274, f450; +add.f32 %19, f279, f518; +add.f32 %18, f278, f516; +add.f32 %21, f283, f523; +add.f32 %20, f282, f521; +add.f32 %23, f287, f528; +add.f32 %22, f286, f526; +add.f32 %25, f291, f532; +add.f32 %24, f290, f531; +add.f32 %27, f295, f537; +add.f32 %26, f294, f535; +add.f32 %29, f299, f542; +add.f32 %28, f298, f540; +add.f32 %31, f303, f547; +add.f32 %30, 
f302, f545; +sub.f32 %33, f273, f448; +sub.f32 %32, f272, f447; +sub.f32 %35, f277, f483; +sub.f32 %34, f276, f481; +sub.f32 %37, f281, f488; +sub.f32 %36, f280, f486; +sub.f32 %39, f285, f493; +sub.f32 %38, f284, f491; +sub.f32 %41, f289, f498; +sub.f32 %40, f288, f496; +sub.f32 %43, f293, f503; +sub.f32 %42, f292, f501; +sub.f32 %45, f297, f508; +sub.f32 %44, f296, f506; +sub.f32 %47, f301, f513; +sub.f32 %46, f300, f511; +add.f32 %49, f275, f449; +sub.f32 %48, f274, f450; +sub.f32 %51, f279, f518; +sub.f32 %50, f278, f516; +sub.f32 %53, f283, f523; +sub.f32 %52, f282, f521; +sub.f32 %55, f287, f528; +sub.f32 %54, f286, f526; +sub.f32 %57, f291, f532; +sub.f32 %56, f290, f531; +sub.f32 %59, f295, f537; +sub.f32 %58, f294, f535; +sub.f32 %61, f299, f542; +sub.f32 %60, f298, f540; +sub.f32 %63, f303, f547; +sub.f32 %62, f302, f545; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): 
"f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<33, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<223>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 
f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, 
f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 192; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+32]; +ld.shared.v2.f32 {f167, f168}, [r13+64]; +ld.shared.v2.f32 {f171, f172}, [r13+96]; +ld.shared.v2.f32 {f175, f176}, [r13+128]; +ld.shared.v2.f32 {f179, f180}, [r13+160]; +ld.shared.v2.f32 {f183, f184}, [r13+192]; +ld.shared.v2.f32 {f187, f188}, [r13+224]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; 
+sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f163, f179; +add.f32 f200, f164, f180; +sub.f32 f201, f163, f179; +sub.f32 f202, f164, f180; +add.f32 f203, f171, f187; +add.f32 f204, f172, f188; +sub.f32 f205, f171, f187; +sub.f32 f206, f172, f188; +add.f32 %1, f192, f196; +add.f32 %0, f191, f195; +add.f32 %3, f200, f204; +add.f32 %2, f199, f203; +sub.f32 %5, f194, f197; +add.f32 %4, f193, f198; +sub.f32 %7, f202, f205; +add.f32 %6, f201, f206; +sub.f32 %9, f192, f196; +sub.f32 %8, f191, f195; +sub.f32 %11, f200, f204; +sub.f32 %10, f199, f203; +add.f32 %13, f194, f197; +sub.f32 %12, f193, f198; +add.f32 %15, f202, f205; +sub.f32 %14, f201, f206; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<34, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<207>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 
f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, 
f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 96; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+16]; +ld.shared.f32 f161, [r13+32]; +ld.shared.f32 f162, [r13+48]; +ld.shared.f32 f163, [r13+64]; +ld.shared.f32 f164, [r13+80]; +ld.shared.f32 f165, [r13+96]; +ld.shared.f32 f166, [r13+112]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+16]; +ld.shared.f32 f169, [r13+32]; +ld.shared.f32 f170, [r13+48]; +ld.shared.f32 f171, [r13+64]; +ld.shared.f32 f172, [r13+80]; +ld.shared.f32 f173, [r13+96]; +ld.shared.f32 f174, [r13+112]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, 
f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f160, f164; +add.f32 f184, f168, f172; +sub.f32 f185, f160, f164; +sub.f32 f186, f168, f172; +add.f32 f187, f162, f166; +add.f32 f188, f170, f174; +sub.f32 f189, f162, f166; +sub.f32 f190, f170, f174; +add.f32 %0, f175, f179; +add.f32 %1, f176, f180; +add.f32 %2, f183, f187; +add.f32 %3, f184, f188; +sub.f32 %5, f178, f181; +add.f32 %4, f177, f182; +sub.f32 %7, f186, f189; +add.f32 %6, f185, f190; +sub.f32 %8, f175, f179; +sub.f32 %9, f176, f180; +sub.f32 %10, f183, f187; +sub.f32 %11, f184, f188; +add.f32 %13, f178, f181; +sub.f32 %12, f177, f182; +add.f32 %15, f186, f189; +sub.f32 %14, f185, f190; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<35, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<147>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, 
f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 224; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+64]; +ld.shared.v2.f32 {f70, f71}, [r13+128]; +ld.shared.v2.f32 {f74, f75}, [r13+192]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, 
f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 128; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+64]; +ld.shared.v2.f32 {f131, f132}, [r20+128]; +ld.shared.v2.f32 {f135, f136}, [r20+192]; +add.f32 %1, f124, f132; +add.f32 %0, f123, f131; +add.f32 %3, f128, f136; +add.f32 %2, f127, f135; +sub.f32 %5, f124, f132; +sub.f32 %4, f123, f131; +sub.f32 %7, f128, f136; +sub.f32 %6, f127, f135; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<36, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<131>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, 
%16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 112; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+32]; +ld.shared.f32 f64, [r13+64]; +ld.shared.f32 f65, [r13+96]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+32]; +ld.shared.f32 f68, [r13+64]; +ld.shared.f32 f69, [r13+96]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; 
+sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 64; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+32]; +ld.shared.f32 f117, [r21+64]; +ld.shared.f32 f118, [r21+96]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+32]; +ld.shared.f32 f121, [r21+64]; +ld.shared.f32 f122, [r21+96]; +add.f32 %0, f115, f117; +add.f32 %1, f119, f121; +add.f32 %2, f116, f118; +add.f32 %3, f120, f122; +sub.f32 %4, f115, f117; +sub.f32 %5, f119, f121; +sub.f32 %6, f116, f118; +sub.f32 %7, f120, f122; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), 
"f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<37, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<97>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %9, %11; +sub.f32 f10, %10, %12; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 240; +add.s32 r11, r8, r10; +add.f32 f18, %10, %12; +add.f32 f19, %9, %11; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 120; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+128]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 224; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 112; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+128]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; 
+ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 192; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 96; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+128]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 8; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 128; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 64; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+128]; +add.f32 %1, f86, f90; +add.f32 %0, f85, f89; +sub.f32 %3, f86, f90; +sub.f32 %2, f85, f89; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<38, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<589>; +.reg .b32 r<26>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f582, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f580, 
%67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f579, f582, f580; +sub.f32 f76, f582, f580; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f578, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f575, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f573, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f572, f575, f573; +sub.f32 f92, f575, f573; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f571, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f571, 0fBF3504F3; +mul.f32 f570, f93, 0f3F3504F3; +sub.f32 f99, f570, f98; +mul.f32 f100, f571, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f569, f579, f572; +sub.f32 f109, f579, f572; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f568, f578, f101; +sub.f32 f113, f578, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f567, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f566, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f564, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f561, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f560, f564, f561; +sub.f32 f133, f564, f561; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f559, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f557, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f555, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 
f554, f557, f555; +sub.f32 f149, f557, f555; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f553, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f553, 0fBF3504F3; +mul.f32 f552, f150, 0f3F3504F3; +sub.f32 f156, f552, f155; +mul.f32 f157, f553, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f551, f560, f554; +sub.f32 f166, f560, f554; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f550, f559, f158; +sub.f32 f170, f559, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f549, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f548, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f546, f167, 0f3F6C835E; +mul.f32 f547, f550, 0fBEC3EF15; +sub.f32 f181, f546, f547; +mul.f32 f182, f550, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f544, f171, 0f3F3504F3; +mul.f32 f545, f549, 0fBF3504F3; +sub.f32 f186, f544, f545; +mul.f32 f187, f549, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f542, f175, 0f3EC3EF15; +mul.f32 f543, f548, 0fBF6C835E; +sub.f32 f191, f542, f543; +mul.f32 f192, f548, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f540, f169, 0fBEC3EF15; +mul.f32 f541, f170, 0fBF6C835E; +sub.f32 f196, f540, f541; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f538, f177, 0fBF6C835E; +mul.f32 f539, f178, 0fBEC3EF15; +sub.f32 f205, f538, f539; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f537, f568, f183; +sub.f32 f213, f568, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; 
+add.f32 f536, f567, f188; +sub.f32 f217, f567, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f535, f566, f193; +sub.f32 f221, f566, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f534, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f533, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f532, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f531, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f537; +mul.f32 f244, f238, f537; +mul.f32 f246, f239, f239; +mul.f32 f530, f238, f238; +sub.f32 f247, f530, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f536; +mul.f32 f252, f247, f536; +mul.f32 f528, f238, f247; +mul.f32 f529, f239, f249; +sub.f32 f255, f528, f529; +mul.f32 f527, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f535; +mul.f32 f260, f255, f535; +mul.f32 f262, f239, f257; +mul.f32 f526, f238, f255; +sub.f32 f263, f526, f262; +mul.f32 f525, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f534; +mul.f32 f268, f263, f534; +mul.f32 f270, f239, f265; +mul.f32 f524, f238, f263; +sub.f32 f271, f524, f270; +mul.f32 f523, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f533; +mul.f32 f276, f271, f533; +mul.f32 f521, f238, f271; +mul.f32 f522, f239, f273; +sub.f32 f279, f521, f522; +mul.f32 f520, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f532; +mul.f32 f284, f279, 
f532; +mul.f32 f286, f239, f281; +mul.f32 f519, f238, f279; +sub.f32 f287, f519, f286; +mul.f32 f518, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f531; +mul.f32 f292, f287, f531; +mul.f32 f294, f239, f289; +mul.f32 f517, f238, f287; +sub.f32 f295, f517, f294; +mul.f32 f516, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f515, f569, f551; +mul.f32 f299, f297, f515; +mul.f32 f300, f295, f515; +mul.f32 f513, f238, f295; +mul.f32 f514, f239, f297; +sub.f32 f303, f513, f514; +sub.f32 f512, f106, f163; +mul.f32 f511, f295, f512; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f510, f238, f303; +sub.f32 f311, f510, f310; +mul.f32 f509, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f507, f238, f311; +mul.f32 f508, f239, f313; +sub.f32 f319, f507, f508; +mul.f32 f506, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f505, f238, f319; +sub.f32 f327, f505, f326; +mul.f32 f504, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f503, f238, f327; +sub.f32 f335, f503, f334; +mul.f32 f502, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f500, f238, f335; +mul.f32 f501, f239, f337; +sub.f32 f343, f500, f501; +mul.f32 f499, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f498, f238, f343; +sub.f32 f351, f498, f350; +mul.f32 f497, f238, f210; +mul.f32 f352, f238, f345; 
+mul.f32 f496, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 1; +sub.f32 f585, f569, f551; +mul.f32 f584, f297, f585; +mov.u32 r23, %tid.x; +shl.b32 r22, r23, 7; +barrier.sync 0; +and.b32 r11, r22, 128; +add.s32 r12, r9, r11; +sub.f32 f587, f569, f551; +mul.f32 f586, f297, f587; +add.f32 f357, f569, f551; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +sub.f32 f588, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +mov.u32 r25, %tid.x; +and.b32 r24, r25, 1; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f497, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f527, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f525, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f523, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f520, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f518, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f516, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f588, f300; +sub.f32 f374, f511, f586; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f509, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f506, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f504, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f502, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f499, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f496, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; 
+barrier.sync 0; +mad.lo.s32 r13, r24, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+16]; +ld.shared.v2.f32 {f397, f398}, [r13+32]; +ld.shared.v2.f32 {f401, f402}, [r13+48]; +ld.shared.v2.f32 {f405, f406}, [r13+64]; +ld.shared.v2.f32 {f409, f410}, [r13+80]; +ld.shared.v2.f32 {f413, f414}, [r13+96]; +ld.shared.v2.f32 {f417, f418}, [r13+112]; +ld.shared.v2.f32 {f421, f422}, [r13+128]; +ld.shared.v2.f32 {f425, f426}, [r13+144]; +ld.shared.v2.f32 {f429, f430}, [r13+160]; +ld.shared.v2.f32 {f433, f434}, [r13+176]; +ld.shared.v2.f32 {f437, f438}, [r13+192]; +ld.shared.v2.f32 {f441, f442}, [r13+208]; +ld.shared.v2.f32 {f445, f446}, [r13+224]; +ld.shared.v2.f32 {f449, f450}, [r13+240]; +add.f32 %0, f389, f421; +add.f32 %1, f390, f422; +add.f32 %2, f393, f425; +add.f32 %3, f394, f426; +add.f32 %5, f398, f430; +add.f32 %4, f397, f429; +add.f32 %7, f402, f434; +add.f32 %6, f401, f433; +add.f32 %9, f406, f438; +add.f32 %8, f405, f437; +add.f32 %10, f409, f441; +add.f32 %11, f410, f442; +add.f32 %12, f413, f445; +add.f32 %13, f414, f446; +add.f32 %14, f417, f449; +add.f32 %15, f418, f450; +sub.f32 %17, f390, f422; +sub.f32 %16, f389, f421; +sub.f32 %19, f394, f426; +sub.f32 %18, f393, f425; +sub.f32 %21, f398, f430; +sub.f32 %20, f397, f429; +sub.f32 %23, f402, f434; +sub.f32 %22, f401, f433; +sub.f32 %25, f406, f438; +sub.f32 %24, f405, f437; +sub.f32 %27, f410, f442; +sub.f32 %26, f409, f441; +sub.f32 %29, f414, f446; +sub.f32 %28, f413, f445; +sub.f32 %31, f418, f450; +sub.f32 %30, f417, f449; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), 
"=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<39, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<81>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %9, %11; +add.f32 f10, %10, %12; +sub.f32 f11, %9, %11; +sub.f32 f12, %10, %12; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 120; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 60; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+64]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+64]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 
f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 112; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 56; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+64]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+64]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 96; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 48; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+64]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+64]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 8; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 64; +add.s32 r32, r30, r31; +st.shared.f32 
[r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 32; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+64]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+64]; +add.f32 %0, f73, f74; +add.f32 %1, f75, f76; +sub.f32 %2, f73, f74; +sub.f32 %3, f75, f76; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<40, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<453>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; 
+add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 
f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, 
f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; 
+mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 
f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 64; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+8]; +ld.shared.f32 f391, [r13+16]; +ld.shared.f32 f392, [r13+24]; +ld.shared.f32 f393, [r13+32]; +ld.shared.f32 f394, [r13+40]; +ld.shared.f32 f395, [r13+48]; +ld.shared.f32 f396, [r13+56]; +ld.shared.f32 f397, [r13+64]; +ld.shared.f32 f398, [r13+72]; +ld.shared.f32 f399, [r13+80]; +ld.shared.f32 f400, [r13+88]; +ld.shared.f32 f401, [r13+96]; +ld.shared.f32 f402, [r13+104]; +ld.shared.f32 f403, [r13+112]; +ld.shared.f32 f404, [r13+120]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+8]; +ld.shared.f32 f407, [r13+16]; +ld.shared.f32 f408, [r13+24]; +ld.shared.f32 f409, [r13+32]; +ld.shared.f32 f410, [r13+40]; +ld.shared.f32 f411, [r13+48]; +ld.shared.f32 f412, [r13+56]; +ld.shared.f32 f413, [r13+64]; +ld.shared.f32 f414, [r13+72]; +ld.shared.f32 f415, [r13+80]; +ld.shared.f32 f416, [r13+88]; +ld.shared.f32 f417, [r13+96]; +ld.shared.f32 f418, [r13+104]; +ld.shared.f32 f419, [r13+112]; +ld.shared.f32 f420, [r13+120]; +add.f32 %0, f389, f397; +add.f32 %1, f405, f413; +add.f32 %2, f390, f398; +add.f32 %3, f406, f414; +add.f32 %4, f391, f399; +add.f32 %5, f407, f415; +add.f32 %6, f392, f400; +add.f32 %7, f408, f416; +add.f32 %8, f393, f401; +add.f32 %9, 
f409, f417; +add.f32 %10, f394, f402; +add.f32 %11, f410, f418; +add.f32 %12, f395, f403; +add.f32 %13, f411, f419; +add.f32 %14, f396, f404; +add.f32 %15, f412, f420; +sub.f32 %16, f389, f397; +sub.f32 %17, f405, f413; +sub.f32 %18, f390, f398; +sub.f32 %19, f406, f414; +sub.f32 %20, f391, f399; +sub.f32 %21, f407, f415; +sub.f32 %22, f392, f400; +sub.f32 %23, f408, f416; +sub.f32 %24, f393, f401; +sub.f32 %25, f409, f417; +sub.f32 %26, f394, f402; +sub.f32 %27, f410, f418; +sub.f32 %28, f395, f403; +sub.f32 %29, f411, f419; +sub.f32 %30, f396, f404; +sub.f32 %31, f412, f420; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..b22efd807c47e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp32_inv.hpp.inc @@ -0,0 +1,2349 @@ +#ifndef CUFFTDX_FFT_32_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_32_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<234, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<612>; +.reg .b64 rd<2>; +add.f32 f129, %64, %106; +add.f32 f130, %65, %108; +sub.f32 f131, %64, %106; +sub.f32 f132, %65, %108; +add.f32 f133, %85, %128; +add.f32 f134, %87, %129; +sub.f32 f135, %85, %128; +sub.f32 f136, %87, %129; +add.f32 f137, f129, f133; +add.f32 f138, f130, f134; +sub.f32 f139, f129, f133; +sub.f32 f140, f130, f134; +sub.f32 f141, f131, f136; +add.f32 f142, f132, f135; +add.f32 f143, f131, f136; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %117; +add.f32 f146, %76, %119; +sub.f32 f147, %74, %117; +sub.f32 f148, %76, %119; +add.f32 f149, %96, %138; +add.f32 f150, %97, %140; +sub.f32 f151, %96, %138; +sub.f32 f152, %97, %140; +add.f32 f153, f145, f149; +add.f32 f154, f146, f150; +sub.f32 f155, f145, f149; +sub.f32 f156, f146, f150; +sub.f32 f157, f147, f152; +add.f32 f158, f148, f151; +add.f32 f159, f147, f152; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f158, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f165, f159, 0fBF3504F3; +mul.f32 f166, f160, 0f3F3504F3; +sub.f32 f167, f165, f166; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +add.f32 f171, f138, f154; +sub.f32 f172, f137, f153; +sub.f32 f173, f138, f154; 
+add.f32 f174, f141, f163; +add.f32 f175, f142, f164; +sub.f32 f176, f141, f163; +sub.f32 f177, f142, f164; +sub.f32 f178, f139, f156; +add.f32 f179, f140, f155; +add.f32 f180, f139, f156; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +add.f32 f183, f144, f169; +sub.f32 f184, f143, f167; +sub.f32 f185, f144, f169; +add.f32 f186, %69, %112; +add.f32 f187, %71, %113; +sub.f32 f188, %69, %112; +sub.f32 f189, %71, %113; +add.f32 f190, %90, %133; +add.f32 f191, %92, %135; +sub.f32 f192, %90, %133; +sub.f32 f193, %92, %135; +add.f32 f194, f186, f190; +add.f32 f195, f187, f191; +sub.f32 f196, f186, f190; +sub.f32 f197, f187, f191; +sub.f32 f198, f188, f193; +add.f32 f199, f189, f192; +add.f32 f200, f188, f193; +sub.f32 f201, f189, f192; +add.f32 f202, %80, %122; +add.f32 f203, %81, %124; +sub.f32 f204, %80, %122; +sub.f32 f205, %81, %124; +add.f32 f206, %101, %144; +add.f32 f207, %103, %145; +sub.f32 f208, %101, %144; +sub.f32 f209, %103, %145; +add.f32 f210, f202, f206; +add.f32 f211, f203, f207; +sub.f32 f212, f202, f206; +sub.f32 f213, f203, f207; +sub.f32 f214, f204, f209; +add.f32 f215, f205, f208; +add.f32 f216, f204, f209; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f215, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f222, f216, 0fBF3504F3; +mul.f32 f223, f217, 0f3F3504F3; +sub.f32 f224, f222, f223; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +add.f32 f228, f195, f211; +sub.f32 f229, f194, f210; +sub.f32 f230, f195, f211; +add.f32 f231, f198, f220; +add.f32 f232, f199, f221; +sub.f32 f233, f198, f220; +sub.f32 f234, f199, f221; +sub.f32 f235, f196, f213; +add.f32 f236, f197, f212; +add.f32 f237, f196, f213; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +add.f32 f240, f201, f226; +sub.f32 f241, f200, f224; +sub.f32 f242, f201, f226; +mul.f32 f243, f231, 0f3F6C835E; +mul.f32 f244, f232, 0f3EC3EF15; +sub.f32 f245, f243, f244; +mul.f32 
f246, f232, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f236, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f252, f239, 0f3EC3EF15; +mul.f32 f253, f240, 0f3F6C835E; +sub.f32 f254, f252, f253; +mul.f32 f255, f240, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f257, f233, 0fBEC3EF15; +mul.f32 f258, f234, 0f3F6C835E; +sub.f32 f259, f257, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f262, f237, 0fBF3504F3; +mul.f32 f263, f238, 0f3F3504F3; +sub.f32 f264, f262, f263; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f267, f241, 0fBF6C835E; +mul.f32 f268, f242, 0f3EC3EF15; +sub.f32 f269, f267, f268; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +add.f32 f273, f171, f228; +sub.f32 f274, f170, f227; +sub.f32 f275, f171, f228; +add.f32 f276, f174, f245; +add.f32 f277, f175, f247; +sub.f32 f278, f174, f245; +sub.f32 f279, f175, f247; +add.f32 f280, f178, f250; +add.f32 f281, f179, f251; +sub.f32 f282, f178, f250; +sub.f32 f283, f179, f251; +add.f32 f284, f182, f254; +add.f32 f285, f183, f256; +sub.f32 f286, f182, f254; +sub.f32 f287, f183, f256; +sub.f32 f288, f172, f230; +add.f32 f289, f173, f229; +add.f32 f290, f172, f230; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +add.f32 f293, f177, f261; +sub.f32 f294, f176, f259; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +add.f32 f297, f181, f266; +sub.f32 f298, f180, f264; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +add.f32 f301, f185, f271; +sub.f32 f302, f184, f269; +sub.f32 f303, f185, f271; +add.f32 f304, %66, %109; +add.f32 f305, %68, %111; +sub.f32 f306, %66, %109; +sub.f32 f307, %68, %111; +add.f32 f308, %88, %130; +add.f32 f309, %89, %132; +sub.f32 f310, %88, %130; +sub.f32 f311, %89, %132; +add.f32 f312, f304, f308; +add.f32 f313, f305, f309; +sub.f32 f314, 
f304, f308; +sub.f32 f315, f305, f309; +sub.f32 f316, f306, f311; +add.f32 f317, f307, f310; +add.f32 f318, f306, f311; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %120; +add.f32 f321, %79, %121; +sub.f32 f322, %77, %120; +sub.f32 f323, %79, %121; +add.f32 f324, %98, %141; +add.f32 f325, %100, %143; +sub.f32 f326, %98, %141; +sub.f32 f327, %100, %143; +add.f32 f328, f320, f324; +add.f32 f329, f321, f325; +sub.f32 f330, f320, f324; +sub.f32 f331, f321, f325; +sub.f32 f332, f322, f327; +add.f32 f333, f323, f326; +add.f32 f334, f322, f327; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f333, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f340, f334, 0fBF3504F3; +mul.f32 f341, f335, 0f3F3504F3; +sub.f32 f342, f340, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +add.f32 f346, f313, f329; +sub.f32 f347, f312, f328; +sub.f32 f348, f313, f329; +add.f32 f349, f316, f338; +add.f32 f350, f317, f339; +sub.f32 f351, f316, f338; +sub.f32 f352, f317, f339; +sub.f32 f353, f314, f331; +add.f32 f354, f315, f330; +add.f32 f355, f314, f331; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +add.f32 f358, f319, f344; +sub.f32 f359, f318, f342; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %114; +add.f32 f362, %73, %116; +sub.f32 f363, %72, %114; +sub.f32 f364, %73, %116; +add.f32 f365, %93, %136; +add.f32 f366, %95, %137; +sub.f32 f367, %93, %136; +sub.f32 f368, %95, %137; +add.f32 f369, f361, f365; +add.f32 f370, f362, f366; +sub.f32 f371, f361, f365; +sub.f32 f372, f362, f366; +sub.f32 f373, f363, f368; +add.f32 f374, f364, f367; +add.f32 f375, f363, f368; +sub.f32 f376, f364, f367; +add.f32 f377, %82, %125; +add.f32 f378, %84, %127; +sub.f32 f379, %82, %125; +sub.f32 f380, %84, %127; +add.f32 f381, %104, %146; +add.f32 f382, %105, %147; +sub.f32 f383, %104, %146; +sub.f32 f384, %105, %147; +add.f32 f385, f377, f381; +add.f32 f386, f378, f382; +sub.f32 f387, 
f377, f381; +sub.f32 f388, f378, f382; +sub.f32 f389, f379, f384; +add.f32 f390, f380, f383; +add.f32 f391, f379, f384; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f390, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f397, f391, 0fBF3504F3; +mul.f32 f398, f392, 0f3F3504F3; +sub.f32 f399, f397, f398; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +add.f32 f403, f370, f386; +sub.f32 f404, f369, f385; +sub.f32 f405, f370, f386; +add.f32 f406, f373, f395; +add.f32 f407, f374, f396; +sub.f32 f408, f373, f395; +sub.f32 f409, f374, f396; +sub.f32 f410, f371, f388; +add.f32 f411, f372, f387; +add.f32 f412, f371, f388; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +add.f32 f415, f376, f401; +sub.f32 f416, f375, f399; +sub.f32 f417, f376, f401; +mul.f32 f418, f406, 0f3F6C835E; +mul.f32 f419, f407, 0f3EC3EF15; +sub.f32 f420, f418, f419; +mul.f32 f421, f407, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f411, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f427, f414, 0f3EC3EF15; +mul.f32 f428, f415, 0f3F6C835E; +sub.f32 f429, f427, f428; +mul.f32 f430, f415, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f432, f408, 0fBEC3EF15; +mul.f32 f433, f409, 0f3F6C835E; +sub.f32 f434, f432, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f437, f412, 0fBF3504F3; +mul.f32 f438, f413, 0f3F3504F3; +sub.f32 f439, f437, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f442, f416, 0fBF6C835E; +mul.f32 f443, f417, 0f3EC3EF15; +sub.f32 f444, f442, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +add.f32 f448, f346, f403; +sub.f32 f449, f345, f402; +sub.f32 f450, f346, f403; +add.f32 f451, f349, f420; +add.f32 f452, f350, f422; +sub.f32 
f453, f349, f420; +sub.f32 f454, f350, f422; +add.f32 f455, f353, f425; +add.f32 f456, f354, f426; +sub.f32 f457, f353, f425; +sub.f32 f458, f354, f426; +add.f32 f459, f357, f429; +add.f32 f460, f358, f431; +sub.f32 f461, f357, f429; +sub.f32 f462, f358, f431; +sub.f32 f463, f347, f405; +add.f32 f464, f348, f404; +add.f32 f465, f347, f405; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +add.f32 f468, f352, f436; +sub.f32 f469, f351, f434; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +add.f32 f472, f356, f441; +sub.f32 f473, f355, f439; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +add.f32 f476, f360, f446; +sub.f32 f477, f359, f444; +sub.f32 f478, f360, f446; +mul.f32 f479, f451, 0f3F7B14BE; +mul.f32 f480, f452, 0f3E47C5C2; +sub.f32 f481, f479, f480; +mul.f32 f482, f452, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f484, f455, 0f3F6C835E; +mul.f32 f485, f456, 0f3EC3EF15; +sub.f32 f486, f484, f485; +mul.f32 f487, f456, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f489, f459, 0f3F54DB31; +mul.f32 f490, f460, 0f3F0E39DA; +sub.f32 f491, f489, f490; +mul.f32 f492, f460, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f464, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f498, f467, 0f3F0E39DA; +mul.f32 f499, f468, 0f3F54DB31; +sub.f32 f500, f498, f499; +mul.f32 f501, f468, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f503, f471, 0f3EC3EF15; +mul.f32 f504, f472, 0f3F6C835E; +sub.f32 f505, f503, f504; +mul.f32 f506, f472, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f508, f475, 0f3E47C5C2; +mul.f32 f509, f476, 0f3F7B14BE; +sub.f32 f510, f508, f509; +mul.f32 f511, f476, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f513, f453, 0fBE47C5C2; +mul.f32 f514, f454, 0f3F7B14BE; +sub.f32 f515, f513, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 
f518, f457, 0fBEC3EF15; +mul.f32 f519, f458, 0f3F6C835E; +sub.f32 f520, f518, f519; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f523, f461, 0fBF0E39DA; +mul.f32 f524, f462, 0f3F54DB31; +sub.f32 f525, f523, f524; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f528, f465, 0fBF3504F3; +mul.f32 f529, f466, 0f3F3504F3; +sub.f32 f530, f528, f529; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f533, f469, 0fBF54DB31; +mul.f32 f534, f470, 0f3F0E39DA; +sub.f32 f535, f533, f534; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f538, f473, 0fBF6C835E; +mul.f32 f539, f474, 0f3EC3EF15; +sub.f32 f540, f538, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f543, f477, 0fBF7B14BE; +mul.f32 f544, f478, 0f3E47C5C2; +sub.f32 f545, f543, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 %1, f273, f448; +add.f32 %0, f272, f447; +add.f32 %3, f277, f483; +add.f32 %2, f276, f481; +add.f32 %5, f281, f488; +add.f32 %4, f280, f486; +add.f32 %7, f285, f493; +add.f32 %6, f284, f491; +add.f32 %9, f289, f497; +add.f32 %8, f288, f496; +add.f32 %11, f293, f502; +add.f32 %10, f292, f500; +add.f32 %13, f297, f507; +add.f32 %12, f296, f505; +add.f32 %15, f301, f512; +add.f32 %14, f300, f510; +add.f32 %17, f275, f449; +sub.f32 %16, f274, f450; +add.f32 %19, f279, f517; +add.f32 %18, f278, f515; +add.f32 %21, f283, f522; +add.f32 %20, f282, f520; +add.f32 %23, f287, f527; +add.f32 %22, f286, f525; +add.f32 %25, f291, f532; +add.f32 %24, f290, f530; +add.f32 %27, f295, f537; +add.f32 %26, f294, f535; +add.f32 %29, f299, f542; +add.f32 %28, f298, f540; +add.f32 %31, f303, f547; +add.f32 %30, f302, f545; +sub.f32 %33, f273, f448; +sub.f32 %32, f272, f447; +sub.f32 %35, f277, f483; +sub.f32 %34, f276, f481; +sub.f32 %37, f281, f488; +sub.f32 %36, f280, f486; +sub.f32 %39, f285, 
f493; +sub.f32 %38, f284, f491; +sub.f32 %41, f289, f497; +sub.f32 %40, f288, f496; +sub.f32 %43, f293, f502; +sub.f32 %42, f292, f500; +sub.f32 %45, f297, f507; +sub.f32 %44, f296, f505; +sub.f32 %47, f301, f512; +sub.f32 %46, f300, f510; +sub.f32 %49, f275, f449; +add.f32 %48, f274, f450; +sub.f32 %51, f279, f517; +sub.f32 %50, f278, f515; +sub.f32 %53, f283, f522; +sub.f32 %52, f282, f520; +sub.f32 %55, f287, f527; +sub.f32 %54, f286, f525; +sub.f32 %57, f291, f532; +sub.f32 %56, f290, f530; +sub.f32 %59, f295, f537; +sub.f32 %58, f294, f535; +sub.f32 %61, f299, f542; +sub.f32 %60, f298, f540; +sub.f32 %63, f303, f547; +sub.f32 %62, f302, f545; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), 
"f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<235, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<223>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 
f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, 
f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 192; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+32]; +ld.shared.v2.f32 {f167, f168}, [r13+64]; +ld.shared.v2.f32 {f171, f172}, [r13+96]; +ld.shared.v2.f32 {f175, f176}, [r13+128]; +ld.shared.v2.f32 {f179, f180}, [r13+160]; +ld.shared.v2.f32 {f183, f184}, [r13+192]; +ld.shared.v2.f32 {f187, f188}, [r13+224]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f163, f179; +add.f32 f200, f164, f180; +sub.f32 f201, f163, f179; +sub.f32 f202, f164, f180; +add.f32 f203, f171, f187; +add.f32 
f204, f172, f188; +sub.f32 f205, f171, f187; +sub.f32 f206, f172, f188; +add.f32 %1, f192, f196; +add.f32 %0, f191, f195; +add.f32 %3, f200, f204; +add.f32 %2, f199, f203; +add.f32 %5, f194, f197; +sub.f32 %4, f193, f198; +add.f32 %7, f202, f205; +sub.f32 %6, f201, f206; +sub.f32 %9, f192, f196; +sub.f32 %8, f191, f195; +sub.f32 %11, f200, f204; +sub.f32 %10, f199, f203; +sub.f32 %13, f194, f197; +add.f32 %12, f193, f198; +sub.f32 %15, f202, f205; +add.f32 %14, f201, f206; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<236, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<207>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, 
%27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, 
f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 96; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+16]; +ld.shared.f32 f161, [r13+32]; +ld.shared.f32 f162, [r13+48]; +ld.shared.f32 f163, [r13+64]; +ld.shared.f32 f164, [r13+80]; +ld.shared.f32 f165, [r13+96]; +ld.shared.f32 f166, [r13+112]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+16]; +ld.shared.f32 f169, [r13+32]; +ld.shared.f32 f170, [r13+48]; +ld.shared.f32 f171, [r13+64]; +ld.shared.f32 f172, [r13+80]; +ld.shared.f32 f173, [r13+96]; +ld.shared.f32 f174, [r13+112]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f160, f164; +add.f32 f184, f168, f172; 
+sub.f32 f185, f160, f164; +sub.f32 f186, f168, f172; +add.f32 f187, f162, f166; +add.f32 f188, f170, f174; +sub.f32 f189, f162, f166; +sub.f32 f190, f170, f174; +add.f32 %0, f175, f179; +add.f32 %1, f176, f180; +add.f32 %2, f183, f187; +add.f32 %3, f184, f188; +add.f32 %5, f178, f181; +sub.f32 %4, f177, f182; +add.f32 %7, f186, f189; +sub.f32 %6, f185, f190; +sub.f32 %8, f175, f179; +sub.f32 %9, f176, f180; +sub.f32 %10, f183, f187; +sub.f32 %11, f184, f188; +sub.f32 %13, f178, f181; +add.f32 %12, f177, f182; +sub.f32 %15, f186, f189; +add.f32 %14, f185, f190; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<237, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<147>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, 
rd2, 56; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 224; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+64]; +ld.shared.v2.f32 {f70, f71}, [r13+128]; +ld.shared.v2.f32 {f74, f75}, [r13+192]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; 
+mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 128; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+64]; +ld.shared.v2.f32 {f131, f132}, [r20+128]; +ld.shared.v2.f32 {f135, f136}, [r20+192]; +add.f32 %1, f124, f132; +add.f32 %0, f123, f131; +add.f32 %3, f128, f136; +add.f32 %2, f127, f135; +sub.f32 %5, f124, f132; +sub.f32 %4, f123, f131; +sub.f32 %7, f128, f136; +sub.f32 %6, f127, f135; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<238, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<131>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; 
+sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 112; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+32]; +ld.shared.f32 f64, [r13+64]; +ld.shared.f32 f65, [r13+96]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+32]; +ld.shared.f32 f68, [r13+64]; +ld.shared.f32 f69, [r13+96]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; 
+mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 64; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+32]; +ld.shared.f32 f117, [r21+64]; +ld.shared.f32 f118, [r21+96]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+32]; +ld.shared.f32 f121, [r21+64]; +ld.shared.f32 f122, [r21+96]; +add.f32 %0, f115, f117; +add.f32 %1, f119, f121; +add.f32 %2, f116, f118; +add.f32 %3, f120, f122; +sub.f32 %4, f115, f117; +sub.f32 %5, f119, f121; +sub.f32 %6, f116, f118; +sub.f32 %7, f120, f122; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<239, 
float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<97>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %9, %11; +sub.f32 f10, %10, %12; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 240; +add.s32 r11, r8, r10; +add.f32 f18, %10, %12; +add.f32 f19, %9, %11; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 120; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+128]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 224; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 112; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+128]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 192; +add.s32 r25, r23, r24; +add.f32 
f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 96; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+128]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 8; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 128; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 64; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+128]; +add.f32 %1, f86, f90; +add.f32 %0, f85, f89; +sub.f32 %3, f86, f90; +sub.f32 %2, f85, f89; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<240, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<587>; +.reg .b32 r<24>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f582, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f580, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f579, f582, f580; +sub.f32 f76, f582, f580; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f578, f68, f71; +sub.f32 f80, f68, 
f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f575, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f573, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f572, f575, f573; +sub.f32 f92, f575, f573; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f571, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f571, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f569, f95, 0fBF3504F3; +mul.f32 f570, f96, 0f3F3504F3; +sub.f32 f103, f569, f570; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f568, f579, f572; +sub.f32 f109, f579, f572; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f567, f578, f100; +sub.f32 f113, f578, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f566, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f565, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f563, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f560, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f559, f563, f560; +sub.f32 f133, f563, f560; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f558, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f556, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f554, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f553, f556, f554; +sub.f32 f149, f556, f554; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f552, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f552, 0f3F3504F3; +sub.f32 f156, 
f154, f155; +add.f32 f157, f154, f155; +mul.f32 f550, f152, 0fBF3504F3; +mul.f32 f551, f153, 0f3F3504F3; +sub.f32 f160, f550, f551; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f549, f559, f553; +sub.f32 f166, f559, f553; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f548, f558, f157; +sub.f32 f170, f558, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f547, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f546, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f544, f167, 0f3F6C835E; +mul.f32 f545, f548, 0f3EC3EF15; +sub.f32 f181, f544, f545; +mul.f32 f182, f548, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f547, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f546, 0f3F6C835E; +mul.f32 f543, f175, 0f3EC3EF15; +sub.f32 f190, f543, f189; +mul.f32 f191, f546, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f542, f169, 0fBEC3EF15; +sub.f32 f195, f542, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f540, f173, 0fBF3504F3; +mul.f32 f541, f174, 0f3F3504F3; +sub.f32 f200, f540, f541; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f538, f177, 0fBF6C835E; +mul.f32 f539, f178, 0f3EC3EF15; +sub.f32 f205, f538, f539; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f537, f567, f183; +sub.f32 f213, f567, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f536, f566, f187; +sub.f32 f217, f566, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f535, f565, f192; +sub.f32 f221, f565, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f534, f109, 
f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f533, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f532, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f531, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -256; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f537, f239; +mul.f32 f244, f238, f537; +mul.f32 f246, f239, f239; +mul.f32 f530, f238, f238; +sub.f32 f247, f530, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f536, f249; +mul.f32 f252, f247, f536; +mul.f32 f528, f238, f247; +mul.f32 f529, f239, f249; +sub.f32 f255, f528, f529; +mul.f32 f527, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f535, f257; +mul.f32 f260, f255, f535; +mul.f32 f262, f239, f257; +mul.f32 f526, f238, f255; +sub.f32 f263, f526, f262; +mul.f32 f525, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f534, f265; +mul.f32 f268, f263, f534; +mul.f32 f270, f239, f265; +mul.f32 f524, f238, f263; +sub.f32 f271, f524, f270; +mul.f32 f523, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f533, f273; +mul.f32 f276, f271, f533; +mul.f32 f521, f238, f271; +mul.f32 f522, f239, f273; +sub.f32 f279, f521, f522; +mul.f32 f520, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f532, f281; +mul.f32 f284, f279, f532; +mul.f32 f286, f239, f281; +mul.f32 f519, f238, f279; +sub.f32 f287, f519, f286; +mul.f32 f518, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f531, f289; +mul.f32 f292, f287, f531; 
+mul.f32 f294, f239, f289; +mul.f32 f517, f238, f287; +sub.f32 f295, f517, f294; +mul.f32 f516, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f515, f568, f549; +mul.f32 f298, f515, f297; +mul.f32 f300, f295, f515; +mul.f32 f513, f238, f295; +mul.f32 f514, f239, f297; +sub.f32 f303, f513, f514; +sub.f32 f512, f106, f163; +mul.f32 f511, f512, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f510, f238, f303; +sub.f32 f311, f510, f310; +mul.f32 f509, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f507, f238, f311; +mul.f32 f508, f239, f313; +sub.f32 f319, f507, f508; +mul.f32 f506, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f505, f238, f319; +sub.f32 f327, f505, f326; +mul.f32 f504, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f503, f238, f327; +sub.f32 f335, f503, f334; +mul.f32 f502, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f500, f238, f335; +mul.f32 f501, f239, f337; +sub.f32 f343, f500, f501; +mul.f32 f499, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f498, f238, f343; +sub.f32 f351, f498, f350; +mul.f32 f497, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f496, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 128; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; 
+and.b32 r16, r17, 1; +sub.f32 f585, f568, f549; +mul.f32 f584, f295, f585; +add.f32 f357, f568, f549; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +sub.f32 f586, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 1; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f496; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f527; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f525; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f523; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f520; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f518; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f516; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f586, f298; +sub.f32 f374, f584, f511; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f509; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f506; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f504; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f502; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f499; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f497; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +mad.lo.s32 r13, r22, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+16]; +ld.shared.v2.f32 {f397, f398}, [r13+32]; +ld.shared.v2.f32 {f401, f402}, [r13+48]; +ld.shared.v2.f32 {f405, f406}, [r13+64]; +ld.shared.v2.f32 {f409, f410}, [r13+80]; +ld.shared.v2.f32 {f413, f414}, 
[r13+96]; +ld.shared.v2.f32 {f417, f418}, [r13+112]; +ld.shared.v2.f32 {f421, f422}, [r13+128]; +ld.shared.v2.f32 {f425, f426}, [r13+144]; +ld.shared.v2.f32 {f429, f430}, [r13+160]; +ld.shared.v2.f32 {f433, f434}, [r13+176]; +ld.shared.v2.f32 {f437, f438}, [r13+192]; +ld.shared.v2.f32 {f441, f442}, [r13+208]; +ld.shared.v2.f32 {f445, f446}, [r13+224]; +ld.shared.v2.f32 {f449, f450}, [r13+240]; +add.f32 %0, f389, f421; +add.f32 %1, f390, f422; +add.f32 %2, f393, f425; +add.f32 %3, f394, f426; +add.f32 %5, f398, f430; +add.f32 %4, f397, f429; +add.f32 %7, f402, f434; +add.f32 %6, f401, f433; +add.f32 %9, f406, f438; +add.f32 %8, f405, f437; +add.f32 %10, f409, f441; +add.f32 %11, f410, f442; +add.f32 %12, f413, f445; +add.f32 %13, f414, f446; +add.f32 %14, f417, f449; +add.f32 %15, f418, f450; +sub.f32 %17, f390, f422; +sub.f32 %16, f389, f421; +sub.f32 %19, f394, f426; +sub.f32 %18, f393, f425; +sub.f32 %21, f398, f430; +sub.f32 %20, f397, f429; +sub.f32 %23, f402, f434; +sub.f32 %22, f401, f433; +sub.f32 %25, f406, f438; +sub.f32 %24, f405, f437; +sub.f32 %27, f410, f442; +sub.f32 %26, f409, f441; +sub.f32 %29, f414, f446; +sub.f32 %28, f413, f445; +sub.f32 %31, f418, f450; +sub.f32 %30, f417, f449; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), 
"f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<241, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<81>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %9, %11; +add.f32 f10, %10, %12; +sub.f32 f11, %9, %11; +sub.f32 f12, %10, %12; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 120; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 60; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+64]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+64]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 
r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 112; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 56; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+64]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+64]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 96; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 48; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+64]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+64]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 8; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 64; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 32; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+64]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+64]; +add.f32 %0, 
f73, f74; +add.f32 %1, f75, f76; +sub.f32 %2, f73, f74; +sub.f32 %3, f75, f76; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<242, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<453>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; 
+add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, 
f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; 
+mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 
f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -128; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 64; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 
[r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+8]; +ld.shared.f32 f391, [r13+16]; +ld.shared.f32 f392, [r13+24]; +ld.shared.f32 f393, [r13+32]; +ld.shared.f32 f394, [r13+40]; +ld.shared.f32 f395, [r13+48]; +ld.shared.f32 f396, [r13+56]; +ld.shared.f32 f397, [r13+64]; +ld.shared.f32 f398, [r13+72]; +ld.shared.f32 f399, [r13+80]; +ld.shared.f32 f400, [r13+88]; +ld.shared.f32 f401, [r13+96]; +ld.shared.f32 f402, [r13+104]; +ld.shared.f32 f403, [r13+112]; +ld.shared.f32 f404, [r13+120]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+8]; +ld.shared.f32 f407, [r13+16]; +ld.shared.f32 f408, [r13+24]; +ld.shared.f32 f409, [r13+32]; +ld.shared.f32 f410, [r13+40]; +ld.shared.f32 f411, [r13+48]; +ld.shared.f32 f412, [r13+56]; +ld.shared.f32 f413, [r13+64]; +ld.shared.f32 f414, [r13+72]; +ld.shared.f32 f415, [r13+80]; +ld.shared.f32 f416, [r13+88]; +ld.shared.f32 f417, [r13+96]; +ld.shared.f32 f418, [r13+104]; +ld.shared.f32 f419, [r13+112]; +ld.shared.f32 f420, [r13+120]; +add.f32 %0, f389, f397; +add.f32 %1, f405, f413; +add.f32 %2, f390, f398; +add.f32 %3, f406, f414; +add.f32 %4, f391, f399; +add.f32 %5, f407, f415; +add.f32 %6, f392, f400; +add.f32 %7, f408, f416; +add.f32 %8, f393, f401; +add.f32 %9, f409, f417; +add.f32 %10, f394, f402; +add.f32 %11, f410, f418; +add.f32 %12, f395, f403; +add.f32 %13, f411, f419; +add.f32 %14, f396, f404; +add.f32 %15, f412, f420; +sub.f32 %16, f389, f397; +sub.f32 %17, f405, f413; +sub.f32 %18, f390, f398; +sub.f32 %19, f406, f414; +sub.f32 %20, f391, f399; +sub.f32 %21, f407, f415; +sub.f32 
%22, f392, f400; +sub.f32 %23, f408, f416; +sub.f32 %24, f393, f401; +sub.f32 %25, f409, f417; +sub.f32 %26, f394, f402; +sub.f32 %27, f410, f418; +sub.f32 %28, f395, f403; +sub.f32 %29, f411, f419; +sub.f32 %30, f396, f404; +sub.f32 %31, f412, f420; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..5e2aa4defea0d --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_fwd.hpp.inc @@ -0,0 +1,1852 @@ +#ifndef CUFFTDX_FFT_32_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_32_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<436, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<206>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, 
fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+64]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 
fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 192; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+32]; +ld.shared.f64 fd160, [r13+64]; +ld.shared.f64 fd161, [r13+96]; +ld.shared.f64 fd162, [r13+128]; +ld.shared.f64 fd163, [r13+160]; +ld.shared.f64 fd164, [r13+192]; +ld.shared.f64 fd165, [r13+224]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+32]; +ld.shared.f64 fd168, [r13+64]; +ld.shared.f64 fd169, [r13+96]; +ld.shared.f64 fd170, [r13+128]; +ld.shared.f64 fd171, [r13+160]; +ld.shared.f64 fd172, [r13+192]; +ld.shared.f64 fd173, [r13+224]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd159, fd163; +add.f64 fd183, fd167, fd171; +sub.f64 fd184, fd159, fd163; +sub.f64 fd185, fd167, fd171; +add.f64 fd186, fd161, fd165; +add.f64 fd187, fd169, fd173; +sub.f64 fd188, fd161, fd165; +sub.f64 fd189, fd169, fd173; +add.f64 %0, fd174, fd178; +add.f64 %1, fd175, fd179; +add.f64 %2, fd182, fd186; +add.f64 %3, fd183, fd187; +sub.f64 %5, fd177, fd180; +add.f64 %4, fd176, fd181; +sub.f64 %7, fd185, fd188; +add.f64 %6, fd184, fd189; +sub.f64 %8, fd174, 
fd178; +sub.f64 %9, fd175, fd179; +sub.f64 %10, fd182, fd186; +sub.f64 %11, fd183, fd187; +add.f64 %13, fd177, fd180; +sub.f64 %12, fd176, fd181; +add.f64 %15, fd185, fd188; +sub.f64 %14, fd184, fd189; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<437, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<145>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, 
fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+128]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 448; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+128]; +ld.shared.v2.f64 {fd69, fd70}, [r13+256]; +ld.shared.v2.f64 {fd73, fd74}, [r13+384]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+32]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 256; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 
fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+128]; +ld.shared.v2.f64 {fd129, fd130}, [r20+256]; +ld.shared.v2.f64 {fd133, fd134}, [r20+384]; +add.f64 %1, fd122, fd130; +add.f64 %0, fd121, fd129; +add.f64 %3, fd126, fd134; +add.f64 %2, fd125, fd133; +sub.f64 %5, fd122, fd130; +sub.f64 %4, fd121, fd129; +sub.f64 %7, fd126, fd134; +sub.f64 %6, fd125, fd133; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<438, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<222>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, 
%31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; 
+mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+64]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 384; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+64]; +ld.shared.v2.f64 {fd166, fd167}, [r13+128]; +ld.shared.v2.f64 {fd170, fd171}, [r13+192]; +ld.shared.v2.f64 {fd174, fd175}, [r13+256]; +ld.shared.v2.f64 {fd178, fd179}, [r13+320]; +ld.shared.v2.f64 {fd182, fd183}, 
[r13+384]; +ld.shared.v2.f64 {fd186, fd187}, [r13+448]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd162, fd178; +add.f64 fd199, fd163, fd179; +sub.f64 fd200, fd162, fd178; +sub.f64 fd201, fd163, fd179; +add.f64 fd202, fd170, fd186; +add.f64 fd203, fd171, fd187; +sub.f64 fd204, fd170, fd186; +sub.f64 fd205, fd171, fd187; +add.f64 %1, fd191, fd195; +add.f64 %0, fd190, fd194; +add.f64 %3, fd199, fd203; +add.f64 %2, fd198, fd202; +sub.f64 %5, fd193, fd196; +add.f64 %4, fd192, fd197; +sub.f64 %7, fd201, fd204; +add.f64 %6, fd200, fd205; +sub.f64 %9, fd191, fd195; +sub.f64 %8, fd190, fd194; +sub.f64 %11, fd199, fd203; +sub.f64 %10, fd198, fd202; +add.f64 %13, fd193, fd196; +sub.f64 %12, fd192, fd197; +add.f64 %15, fd201, fd204; +sub.f64 %14, fd200, fd205; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +/* cufftdx_private_function<439, double, 1>: machine-generated inline PTX that updates the four double-precision complex values rmem[0..3] in place (outputs %0-%7). It forms add/sub combinations of the inputs, multiplies them by coefficient pairs read with ld.global from lut_dp_4_32 (%9) and lut_dp_4_8 (%10), and exchanges intermediates through shared memory addressed from smem (%8), %tid.y << 8 and bit fields of %tid.x. The .x and .y components are exchanged in separate passes, and every st.shared / ld.shared phase is fenced by barrier.sync 0. The input list binds rmem[1].y and rmem[2].y twice and the PTX reads only the second copy (%15, %18), so the %N numbering depends on this exact operand order - do not reorder or de-duplicate the operands. NOTE(review): presumably one 32-point FFT step emitted by the cuFFTDx generator; the meaning of the template arguments is not visible in this chunk - confirm before editing. */ template<> __forceinline__ __device__ void cufftdx_private_function<439, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<129>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11,
%16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+128]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 224; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+64]; +ld.shared.f64 fd63, [r13+128]; +ld.shared.f64 fd64, [r13+192]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+64]; +ld.shared.f64 fd67, [r13+128]; +ld.shared.f64 fd68, [r13+192]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73;
+add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+32]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 128; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+64]; +ld.shared.f64 fd115, [r21+128]; +ld.shared.f64 fd116, [r21+192]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+64]; +ld.shared.f64 fd119, [r21+128]; +ld.shared.f64 fd120, [r21+192]; +add.f64 %0, fd113, fd115; +add.f64 %1, fd117, fd119; +add.f64 %2, fd114, fd116; +add.f64 %3, fd118, fd120; +sub.f64 %4, fd113, fd115; +sub.f64 %5, fd117, fd119; +sub.f64 %6, fd114, fd116; +sub.f64 %7, fd118, fd120; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y),
"=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +/* cufftdx_private_function<440, double, 1>: machine-generated inline PTX that updates the sixteen double-precision complex values rmem[0..15] in place (outputs %0-%31). The register-only first part is a cascade of add/sub combinations scaled by the immediates 0d3FE6A09E667F3BCD (~0.7071), 0d3FED906BCF328D46 (~0.9239) and 0d3FD87DE2A6AEA963 (~0.3827). Two coefficient pairs are then read with ld.global from lut_dp_16_32 (%33, at rd5 and rd5+32) and the remaining multipliers are derived from them by repeated complex multiplication. The products are written with st.shared.v2.f64 to shared memory addressed from smem (%32), %tid.y << 9 and bit fields of %tid.x, fenced by barrier.sync 0, read back at a 32-byte stride and combined by a final add/sub into the outputs. Inputs %66-%75 are additional bindings of .y components that the PTX references directly, so the operand order must not change. NOTE(review): presumably a radix-16 step of a 32-point cuFFTDx FFT - confirm against the generator. */ template<> __forceinline__ __device__ void cufftdx_private_function<440, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<588>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd581, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd579, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd578, fd581, fd579; +sub.f64 fd76, fd581, fd579; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd577, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd574, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd572, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd571, fd574, fd572; +sub.f64 fd92, fd574, fd572; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd570, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd570, 0dBFE6A09E667F3BCD; +mul.f64 fd569, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd569, fd98; +mul.f64 fd100, fd570, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd568, fd578, fd571; +sub.f64 fd109, fd578, fd571; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd567, fd577, fd101; +sub.f64 fd113,
fd577, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd566, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd565, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd563, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd560, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd559, fd563, fd560; +sub.f64 fd133, fd563, fd560; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd558, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd556, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd554, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd553, fd556, fd554; +sub.f64 fd149, fd556, fd554; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd552, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd552, 0dBFE6A09E667F3BCD; +mul.f64 fd551, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd551, fd155; +mul.f64 fd157, fd552, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd550, fd559, fd553; +sub.f64 fd166, fd559, fd553; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd549, fd558, fd158; +sub.f64 fd170, fd558, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd548, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd547, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd545, fd167, 
0d3FED906BCF328D46; +mul.f64 fd546, fd549, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd545, fd546; +mul.f64 fd182, fd549, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd543, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd544, fd548, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd543, fd544; +mul.f64 fd187, fd548, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd541, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd542, fd547, 0dBFED906BCF328D46; +sub.f64 fd191, fd541, fd542; +mul.f64 fd192, fd547, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd539, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd540, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd539, fd540; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd537, fd177, 0dBFED906BCF328D46; +mul.f64 fd538, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd537, fd538; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd536, fd567, fd183; +sub.f64 fd213, fd567, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd535, fd566, fd188; +sub.f64 fd217, fd566, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd534, fd565, fd193; +sub.f64 fd221, fd565, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd533, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd532, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd531, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd530, fd121, fd207; 
+sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd536; +mul.f64 fd244, fd238, fd536; +mul.f64 fd246, fd239, fd239; +mul.f64 fd529, fd238, fd238; +sub.f64 fd247, fd529, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd535; +mul.f64 fd252, fd247, fd535; +mul.f64 fd527, fd238, fd247; +mul.f64 fd528, fd239, fd249; +sub.f64 fd255, fd527, fd528; +mul.f64 fd526, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd534; +mul.f64 fd260, fd255, fd534; +mul.f64 fd262, fd239, fd257; +mul.f64 fd525, fd238, fd255; +sub.f64 fd263, fd525, fd262; +mul.f64 fd524, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd533; +mul.f64 fd268, fd263, fd533; +mul.f64 fd270, fd239, fd265; +mul.f64 fd523, fd238, fd263; +sub.f64 fd271, fd523, fd270; +mul.f64 fd522, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd532; +mul.f64 fd276, fd271, fd532; +mul.f64 fd520, fd238, fd271; +mul.f64 fd521, fd239, fd273; +sub.f64 fd279, fd520, fd521; +mul.f64 fd519, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd531; +mul.f64 fd284, fd279, fd531; +mul.f64 fd286, fd239, fd281; +mul.f64 fd518, fd238, fd279; +sub.f64 fd287, fd518, fd286; +mul.f64 fd517, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd530; +mul.f64 fd292, fd287, fd530; +mul.f64 fd294, fd239, fd289; +mul.f64 fd516, fd238, fd287; +sub.f64 fd295, fd516, fd294; +mul.f64 fd515, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, 
fd287, fd296; +sub.f64 fd514, fd568, fd550; +sub.f64 fd513, fd106, fd163; +mul.f64 fd298, fd295, fd513; +mul.f64 fd299, fd297, fd514; +mul.f64 fd300, fd295, fd514; +ld.global.v2.f64 {fd301, fd302}, [rd5+32]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd511, fd238, fd301; +mul.f64 fd512, fd239, fd302; +sub.f64 fd310, fd511, fd512; +mul.f64 fd510, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd509, fd238, fd310; +sub.f64 fd318, fd509, fd317; +mul.f64 fd508, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd506, fd238, fd318; +mul.f64 fd507, fd239, fd320; +sub.f64 fd326, fd506, fd507; +mul.f64 fd505, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd503, fd238, fd326; +mul.f64 fd504, fd239, fd328; +sub.f64 fd334, fd503, fd504; +mul.f64 fd502, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd501, fd238, fd334; +sub.f64 fd342, fd501, fd341; +mul.f64 fd500, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd498, fd238, fd342; +mul.f64 fd499, fd239, fd344; +sub.f64 fd350, fd498, fd499; +mul.f64 fd497, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd496, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 1; +sub.f64 fd585, fd568, fd550; +mul.f64 fd584, fd297, fd585; +mov.u32 r23, %tid.x; +shl.b32 r22, r23, 8; +barrier.sync 0; 
+and.b32 r11, r22, 256; +add.s32 r12, r9, r11; +sub.f64 fd587, fd568, fd550; +mul.f64 fd586, fd297, fd587; +add.f64 fd356, fd568, fd550; +sub.f64 fd583, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd497, fd243; +st.shared.v2.f64 [r12+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd526, fd251; +st.shared.v2.f64 [r12+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd524, fd259; +st.shared.v2.f64 [r12+48], {fd363, fd362}; +sub.f64 fd364, fd522, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r12+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd519, fd275; +st.shared.v2.f64 [r12+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd517, fd283; +st.shared.v2.f64 [r12+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd515, fd291; +st.shared.v2.f64 [r12+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd583, fd300; +sub.f64 fd373, fd298, fd586; +st.shared.v2.f64 [r12+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd510, fd306; +st.shared.v2.f64 [r12+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd508, fd314; +st.shared.v2.f64 [r12+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd505, fd322; +st.shared.v2.f64 [r12+176], {fd379, fd378}; +sub.f64 fd380, fd502, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r12+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd500, fd338; +st.shared.v2.f64 [r12+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd496, fd346; +st.shared.v2.f64 [r12+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; 
+st.shared.v2.f64 [r12+240], {fd387, fd386}; +barrier.sync 0; +mad.lo.s32 r13, r20, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+32]; +ld.shared.v2.f64 {fd396, fd397}, [r13+64]; +ld.shared.v2.f64 {fd400, fd401}, [r13+96]; +ld.shared.v2.f64 {fd404, fd405}, [r13+128]; +ld.shared.v2.f64 {fd408, fd409}, [r13+160]; +ld.shared.v2.f64 {fd412, fd413}, [r13+192]; +ld.shared.v2.f64 {fd416, fd417}, [r13+224]; +ld.shared.v2.f64 {fd420, fd421}, [r13+256]; +ld.shared.v2.f64 {fd424, fd425}, [r13+288]; +ld.shared.v2.f64 {fd428, fd429}, [r13+320]; +ld.shared.v2.f64 {fd432, fd433}, [r13+352]; +ld.shared.v2.f64 {fd436, fd437}, [r13+384]; +ld.shared.v2.f64 {fd440, fd441}, [r13+416]; +ld.shared.v2.f64 {fd444, fd445}, [r13+448]; +ld.shared.v2.f64 {fd448, fd449}, [r13+480]; +add.f64 %1, fd389, fd421; +add.f64 %0, fd388, fd420; +add.f64 %3, fd393, fd425; +add.f64 %2, fd392, fd424; +add.f64 %4, fd396, fd428; +add.f64 %5, fd397, fd429; +add.f64 %6, fd400, fd432; +add.f64 %7, fd401, fd433; +add.f64 %8, fd404, fd436; +add.f64 %9, fd405, fd437; +add.f64 %10, fd408, fd440; +add.f64 %11, fd409, fd441; +add.f64 %13, fd413, fd445; +add.f64 %12, fd412, fd444; +add.f64 %15, fd417, fd449; +add.f64 %14, fd416, fd448; +sub.f64 %17, fd389, fd421; +sub.f64 %16, fd388, fd420; +sub.f64 %19, fd393, fd425; +sub.f64 %18, fd392, fd424; +sub.f64 %21, fd397, fd429; +sub.f64 %20, fd396, fd428; +sub.f64 %23, fd401, fd433; +sub.f64 %22, fd400, fd432; +sub.f64 %25, fd405, fd437; +sub.f64 %24, fd404, fd436; +sub.f64 %27, fd409, fd441; +sub.f64 %26, fd408, fd440; +sub.f64 %29, fd413, fd445; +sub.f64 %28, fd412, fd444; +sub.f64 %31, fd417, fd449; +sub.f64 %30, fd416, fd448; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), 
"=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +/* cufftdx_private_function<441, double, 1>: machine-generated inline PTX that updates the two double-precision complex values rmem[0..1] in place (outputs %0-%3, inputs %9-%12). It runs four exchange rounds, one per lookup table (lut_dp_2_32, lut_dp_2_16, lut_dp_2_8, lut_dp_2_4 = %5-%8): each round stores the sum and the coefficient-scaled difference of the current pair with st.shared.v2.f64, with barrier.sync 0 before and after the stores, and then reloads two complex values that lie 256 bytes apart. A last add/sub of the reloaded pair produces the outputs. Shared memory is addressed from smem (%4), %tid.y << 9 and bit fields of %tid.x. NOTE(review): presumably a radix-2 32-point cuFFTDx FFT; the meaning of the template arguments is not visible in this chunk - confirm before editing. */ template<> __forceinline__ __device__ void cufftdx_private_function<441, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<97>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %9, %11; +sub.f64 fd10, %10, %12; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 480; +add.s32 r11, r8, r10; +add.f64 fd18, %10, %12; +add.f64 fd19, %9, %11; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 240; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23},
[r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+256]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 448; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 224; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+256]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 384; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 192; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+256]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 256; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75,
fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 128; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+256]; +add.f64 %1, fd86, fd90; +add.f64 %0, fd85, fd89; +sub.f64 %3, fd86, fd90; +sub.f64 %2, fd85, fd89; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +/* cufftdx_private_function<442, double, 1>: machine-generated inline PTX that updates the two double-precision complex values rmem[0..1] in place (outputs %0-%3, inputs %9-%12), using coefficient pairs read with ld.global from lut_dp_2_32, lut_dp_2_16, lut_dp_2_8 and lut_dp_2_4 (%5-%8), one table per exchange round. Shared memory is addressed from smem (%4), %tid.y << 8 and bit fields of %tid.x. In every round the .x and .y components are exchanged in two separate passes, each fenced by barrier.sync 0, and are read back as scalar ld.shared.f64 values 128 bytes apart; a final add/sub of the reloaded values produces the outputs. NOTE(review): presumably a radix-2 32-point cuFFTDx FFT; the meaning of the template arguments is not visible in this chunk - confirm before editing. */ template<> __forceinline__ __device__ void cufftdx_private_function<442, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<81>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %9, %11; +add.f64 fd10, %10, %12; +sub.f64 fd11, %9, %11; +sub.f64 fd12, %10, %12; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 240; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 120; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+128]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+128]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34,
fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 224; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 112; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+128]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+128]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 192; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 96; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+128]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+128]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 128; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 64; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, 
[r34]; +ld.shared.f64 fd74, [r34+128]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+128]; +add.f64 %0, fd73, fd74; +add.f64 %1, fd75, fd76; +sub.f64 %2, fd73, fd74; +sub.f64 %3, fd75, fd76; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<443, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<452>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 
fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; 
+sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 
fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 
fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+32]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 
fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 128; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+16]; +ld.shared.f64 fd390, [r13+32]; +ld.shared.f64 fd391, [r13+48]; +ld.shared.f64 fd392, [r13+64]; +ld.shared.f64 fd393, [r13+80]; +ld.shared.f64 fd394, [r13+96]; +ld.shared.f64 fd395, [r13+112]; +ld.shared.f64 fd396, [r13+128]; +ld.shared.f64 fd397, [r13+144]; +ld.shared.f64 fd398, [r13+160]; +ld.shared.f64 fd399, [r13+176]; 
+ld.shared.f64 fd400, [r13+192]; +ld.shared.f64 fd401, [r13+208]; +ld.shared.f64 fd402, [r13+224]; +ld.shared.f64 fd403, [r13+240]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+16]; +ld.shared.f64 fd406, [r13+32]; +ld.shared.f64 fd407, [r13+48]; +ld.shared.f64 fd408, [r13+64]; +ld.shared.f64 fd409, [r13+80]; +ld.shared.f64 fd410, [r13+96]; +ld.shared.f64 fd411, [r13+112]; +ld.shared.f64 fd412, [r13+128]; +ld.shared.f64 fd413, [r13+144]; +ld.shared.f64 fd414, [r13+160]; +ld.shared.f64 fd415, [r13+176]; +ld.shared.f64 fd416, [r13+192]; +ld.shared.f64 fd417, [r13+208]; +ld.shared.f64 fd418, [r13+224]; +ld.shared.f64 fd419, [r13+240]; +add.f64 %0, fd388, fd396; +add.f64 %1, fd404, fd412; +add.f64 %2, fd389, fd397; +add.f64 %3, fd405, fd413; +add.f64 %4, fd390, fd398; +add.f64 %5, fd406, fd414; +add.f64 %6, fd391, fd399; +add.f64 %7, fd407, fd415; +add.f64 %8, fd392, fd400; +add.f64 %9, fd408, fd416; +add.f64 %10, fd393, fd401; +add.f64 %11, fd409, fd417; +add.f64 %12, fd394, fd402; +add.f64 %13, fd410, fd418; +add.f64 %14, fd395, fd403; +add.f64 %15, fd411, fd419; +sub.f64 %16, fd388, fd396; +sub.f64 %17, fd404, fd412; +sub.f64 %18, fd389, fd397; +sub.f64 %19, fd405, fd413; +sub.f64 %20, fd390, fd398; +sub.f64 %21, fd406, fd414; +sub.f64 %22, fd391, fd399; +sub.f64 %23, fd407, fd415; +sub.f64 %24, fd392, fd400; +sub.f64 %25, fd408, fd416; +sub.f64 %26, fd393, fd401; +sub.f64 %27, fd409, fd417; +sub.f64 %28, fd394, fd402; +sub.f64 %29, fd410, fd418; +sub.f64 %30, fd395, fd403; +sub.f64 %31, fd411, fd419; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), 
"=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..341bfcd73ad18 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_32_fp64_inv.hpp.inc @@ -0,0 +1,1850 @@ +#ifndef CUFFTDX_FFT_32_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_32_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<607, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<206>; +.reg .b64 
rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; 
+sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+64]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 192; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; 
+st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+32]; +ld.shared.f64 fd160, [r13+64]; +ld.shared.f64 fd161, [r13+96]; +ld.shared.f64 fd162, [r13+128]; +ld.shared.f64 fd163, [r13+160]; +ld.shared.f64 fd164, [r13+192]; +ld.shared.f64 fd165, [r13+224]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+32]; +ld.shared.f64 fd168, [r13+64]; +ld.shared.f64 fd169, [r13+96]; +ld.shared.f64 fd170, [r13+128]; +ld.shared.f64 fd171, [r13+160]; +ld.shared.f64 fd172, [r13+192]; +ld.shared.f64 fd173, [r13+224]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd159, fd163; +add.f64 fd183, fd167, fd171; +sub.f64 fd184, fd159, fd163; +sub.f64 fd185, fd167, fd171; +add.f64 fd186, fd161, fd165; +add.f64 fd187, fd169, fd173; +sub.f64 fd188, fd161, fd165; +sub.f64 fd189, fd169, fd173; +add.f64 %0, fd174, fd178; +add.f64 %1, fd175, fd179; +add.f64 %2, fd182, fd186; +add.f64 %3, fd183, fd187; +add.f64 %5, fd177, fd180; +sub.f64 %4, fd176, fd181; +add.f64 %7, fd185, fd188; +sub.f64 %6, fd184, fd189; +sub.f64 %8, fd174, fd178; +sub.f64 %9, fd175, fd179; +sub.f64 %10, fd182, fd186; +sub.f64 %11, fd183, fd187; +sub.f64 %13, fd177, fd180; +add.f64 %12, fd176, fd181; +sub.f64 %15, fd185, fd188; +add.f64 %14, fd184, fd189; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), 
"=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<608, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<145>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+128]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 448; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; 
+st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+128]; +ld.shared.v2.f64 {fd69, fd70}, [r13+256]; +ld.shared.v2.f64 {fd73, fd74}, [r13+384]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+32]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 256; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; 
+barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+128]; +ld.shared.v2.f64 {fd129, fd130}, [r20+256]; +ld.shared.v2.f64 {fd133, fd134}, [r20+384]; +add.f64 %1, fd122, fd130; +add.f64 %0, fd121, fd129; +add.f64 %3, fd126, fd134; +add.f64 %2, fd125, fd133; +sub.f64 %5, fd122, fd130; +sub.f64 %4, fd121, fd129; +sub.f64 %7, fd126, fd134; +sub.f64 %6, fd125, fd133; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<609, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<222>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 
fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+64]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, 
fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 384; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+64]; +ld.shared.v2.f64 {fd166, fd167}, [r13+128]; +ld.shared.v2.f64 {fd170, fd171}, [r13+192]; +ld.shared.v2.f64 {fd174, fd175}, [r13+256]; +ld.shared.v2.f64 {fd178, fd179}, [r13+320]; +ld.shared.v2.f64 {fd182, fd183}, [r13+384]; +ld.shared.v2.f64 {fd186, fd187}, [r13+448]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd162, fd178; +add.f64 fd199, fd163, fd179; +sub.f64 fd200, fd162, fd178; +sub.f64 
fd201, fd163, fd179; +add.f64 fd202, fd170, fd186; +add.f64 fd203, fd171, fd187; +sub.f64 fd204, fd170, fd186; +sub.f64 fd205, fd171, fd187; +add.f64 %1, fd191, fd195; +add.f64 %0, fd190, fd194; +add.f64 %3, fd199, fd203; +add.f64 %2, fd198, fd202; +add.f64 %5, fd193, fd196; +sub.f64 %4, fd192, fd197; +add.f64 %7, fd201, fd204; +sub.f64 %6, fd200, fd205; +sub.f64 %9, fd191, fd195; +sub.f64 %8, fd190, fd194; +sub.f64 %11, fd199, fd203; +sub.f64 %10, fd198, fd202; +sub.f64 %13, fd193, fd196; +add.f64 %12, fd192, fd197; +sub.f64 %15, fd201, fd204; +add.f64 %14, fd200, fd205; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<610, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<129>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; 
+cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+128]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 224; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+64]; +ld.shared.f64 fd63, [r13+128]; +ld.shared.f64 fd64, [r13+192]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+64]; +ld.shared.f64 fd67, [r13+128]; +ld.shared.f64 fd68, [r13+192]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 4; +bfe.u32 r15, r5, 2, 1; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, 
fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+32]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 128; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+64]; +ld.shared.f64 fd115, [r21+128]; +ld.shared.f64 fd116, [r21+192]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+64]; +ld.shared.f64 fd119, [r21+128]; +ld.shared.f64 fd120, [r21+192]; +add.f64 %0, fd113, fd115; +add.f64 %1, fd117, fd119; +add.f64 %2, fd114, fd116; +add.f64 %3, fd118, fd120; +sub.f64 %4, fd113, fd115; +sub.f64 %5, fd117, fd119; +sub.f64 %6, fd114, fd116; +sub.f64 %7, fd118, fd120; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<611, double, 1>(cufftdx::detail::complex 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<587>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd581, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd579, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd578, fd581, fd579; +sub.f64 fd76, fd581, fd579; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd577, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd574, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd572, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd571, fd574, fd572; +sub.f64 fd92, fd574, fd572; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd570, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd570, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd568, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd569, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd568, fd569; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd567, fd578, fd571; +sub.f64 fd109, fd578, fd571; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd566, fd577, fd100; +sub.f64 fd113, fd577, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd565, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd564, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd562, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 
fd559, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd558, fd562, fd559; +sub.f64 fd133, fd562, fd559; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd557, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd555, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd553, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd552, fd555, fd553; +sub.f64 fd149, fd555, fd553; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd551, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd551, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd549, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd550, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd549, fd550; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd548, fd558, fd552; +sub.f64 fd166, fd558, fd552; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd547, fd557, fd157; +sub.f64 fd170, fd557, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd546, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd545, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd543, fd167, 0d3FED906BCF328D46; +mul.f64 fd544, fd547, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd543, fd544; +mul.f64 fd182, fd547, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd546, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd545, 0d3FED906BCF328D46; +mul.f64 fd542, fd175, 
0d3FD87DE2A6AEA963; +sub.f64 fd190, fd542, fd189; +mul.f64 fd191, fd545, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd541, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd541, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd539, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd540, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd539, fd540; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd537, fd177, 0dBFED906BCF328D46; +mul.f64 fd538, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd537, fd538; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd536, fd566, fd183; +sub.f64 fd213, fd566, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd535, fd565, fd187; +sub.f64 fd217, fd565, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd534, fd564, fd192; +sub.f64 fd221, fd564, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd533, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd532, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd531, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd530, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd536, fd239; +mul.f64 fd244, fd238, fd536; +mul.f64 fd246, fd239, fd239; +mul.f64 fd529, fd238, 
fd238; +sub.f64 fd247, fd529, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd535, fd249; +mul.f64 fd252, fd247, fd535; +mul.f64 fd527, fd238, fd247; +mul.f64 fd528, fd239, fd249; +sub.f64 fd255, fd527, fd528; +mul.f64 fd526, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd534, fd257; +mul.f64 fd260, fd255, fd534; +mul.f64 fd262, fd239, fd257; +mul.f64 fd525, fd238, fd255; +sub.f64 fd263, fd525, fd262; +mul.f64 fd524, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd533, fd265; +mul.f64 fd268, fd263, fd533; +mul.f64 fd270, fd239, fd265; +mul.f64 fd523, fd238, fd263; +sub.f64 fd271, fd523, fd270; +mul.f64 fd522, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd532, fd273; +mul.f64 fd276, fd271, fd532; +mul.f64 fd520, fd238, fd271; +mul.f64 fd521, fd239, fd273; +sub.f64 fd279, fd520, fd521; +mul.f64 fd519, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd531, fd281; +mul.f64 fd284, fd279, fd531; +mul.f64 fd286, fd239, fd281; +mul.f64 fd518, fd238, fd279; +sub.f64 fd287, fd518, fd286; +mul.f64 fd517, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd530, fd289; +mul.f64 fd292, fd287, fd530; +mul.f64 fd294, fd239, fd289; +mul.f64 fd516, fd238, fd287; +sub.f64 fd295, fd516, fd294; +mul.f64 fd515, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd514, fd567, fd548; +mul.f64 fd298, fd514, fd297; +sub.f64 fd513, fd106, fd163; +mul.f64 fd299, fd513, fd297; +mul.f64 fd300, fd295, fd514; +ld.global.v2.f64 {fd301, fd302}, [rd5+32]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd511, fd238, fd301; +mul.f64 fd512, fd239, fd302; +sub.f64 fd310, fd511, fd512; +mul.f64 fd510, fd212, fd302; +mul.f64 fd311, 
fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd509, fd238, fd310; +sub.f64 fd318, fd509, fd317; +mul.f64 fd508, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd506, fd238, fd318; +mul.f64 fd507, fd239, fd320; +sub.f64 fd326, fd506, fd507; +mul.f64 fd505, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd503, fd238, fd326; +mul.f64 fd504, fd239, fd328; +sub.f64 fd334, fd503, fd504; +mul.f64 fd502, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd501, fd238, fd334; +sub.f64 fd342, fd501, fd341; +mul.f64 fd500, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd498, fd238, fd342; +mul.f64 fd499, fd239, fd344; +sub.f64 fd350, fd498, fd499; +mul.f64 fd497, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd496, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 256; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 1; +sub.f64 fd586, fd567, fd548; +mul.f64 fd585, fd295, fd586; +add.f64 fd356, fd567, fd548; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +sub.f64 fd584, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 1; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd496; +st.shared.v2.f64 [r12+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, 
fd214, fd250; +sub.f64 fd361, fd252, fd526; +st.shared.v2.f64 [r12+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd524; +st.shared.v2.f64 [r12+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd522; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r12+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd519; +st.shared.v2.f64 [r12+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd517; +st.shared.v2.f64 [r12+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd515; +st.shared.v2.f64 [r12+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd584, fd298; +sub.f64 fd373, fd585, fd299; +st.shared.v2.f64 [r12+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd510; +st.shared.v2.f64 [r12+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd508; +st.shared.v2.f64 [r12+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd505; +st.shared.v2.f64 [r12+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd502; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r12+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd500; +st.shared.v2.f64 [r12+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd497; +st.shared.v2.f64 [r12+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r12+240], {fd386, fd387}; +barrier.sync 0; +mad.lo.s32 r13, r22, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+32]; +ld.shared.v2.f64 {fd396, fd397}, [r13+64]; +ld.shared.v2.f64 {fd400, fd401}, [r13+96]; +ld.shared.v2.f64 {fd404, fd405}, [r13+128]; +ld.shared.v2.f64 {fd408, fd409}, [r13+160]; +ld.shared.v2.f64 {fd412, fd413}, [r13+192]; +ld.shared.v2.f64 {fd416, fd417}, [r13+224]; +ld.shared.v2.f64 
{fd420, fd421}, [r13+256]; +ld.shared.v2.f64 {fd424, fd425}, [r13+288]; +ld.shared.v2.f64 {fd428, fd429}, [r13+320]; +ld.shared.v2.f64 {fd432, fd433}, [r13+352]; +ld.shared.v2.f64 {fd436, fd437}, [r13+384]; +ld.shared.v2.f64 {fd440, fd441}, [r13+416]; +ld.shared.v2.f64 {fd444, fd445}, [r13+448]; +ld.shared.v2.f64 {fd448, fd449}, [r13+480]; +add.f64 %1, fd389, fd421; +add.f64 %0, fd388, fd420; +add.f64 %3, fd393, fd425; +add.f64 %2, fd392, fd424; +add.f64 %4, fd396, fd428; +add.f64 %5, fd397, fd429; +add.f64 %6, fd400, fd432; +add.f64 %7, fd401, fd433; +add.f64 %8, fd404, fd436; +add.f64 %9, fd405, fd437; +add.f64 %10, fd408, fd440; +add.f64 %11, fd409, fd441; +add.f64 %13, fd413, fd445; +add.f64 %12, fd412, fd444; +add.f64 %15, fd417, fd449; +add.f64 %14, fd416, fd448; +sub.f64 %17, fd389, fd421; +sub.f64 %16, fd388, fd420; +sub.f64 %19, fd393, fd425; +sub.f64 %18, fd392, fd424; +sub.f64 %21, fd397, fd429; +sub.f64 %20, fd396, fd428; +sub.f64 %23, fd401, fd433; +sub.f64 %22, fd400, fd432; +sub.f64 %25, fd405, fd437; +sub.f64 %24, fd404, fd436; +sub.f64 %27, fd409, fd441; +sub.f64 %26, fd408, fd440; +sub.f64 %29, fd413, fd445; +sub.f64 %28, fd412, fd444; +sub.f64 %31, fd417, fd449; +sub.f64 %30, fd416, fd448; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), 
"d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<612, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<97>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %9, %11; +sub.f64 fd10, %10, %12; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 480; +add.s32 r11, r8, r10; +add.f64 fd18, %10, %12; +add.f64 fd19, %9, %11; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 240; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+256]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 448; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, 
fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 224; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+256]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 384; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 192; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+256]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 256; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 128; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+256]; +add.f64 %1, fd86, fd90; +add.f64 %0, fd85, fd89; +sub.f64 %3, fd86, fd90; +sub.f64 %2, fd85, fd89; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), 
"l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<613, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<81>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %9, %11; +add.f64 fd10, %10, %12; +sub.f64 fd11, %9, %11; +sub.f64 fd12, %10, %12; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 240; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 120; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+128]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+128]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 3; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 224; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 112; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+128]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; 
+ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+128]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 2; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 192; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 96; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+128]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+128]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 128; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 64; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+128]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+128]; +add.f64 %0, fd73, fd74; +add.f64 %1, fd75, fd76; +sub.f64 %2, fd73, fd74; +sub.f64 %3, fd75, fd76; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), 
"d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<614, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<452>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 
fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 
0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; 
+and.b64 rd3, rd2, 16; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 
fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+32]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; 
+mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 128; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+16]; +ld.shared.f64 fd390, [r13+32]; +ld.shared.f64 fd391, [r13+48]; +ld.shared.f64 fd392, [r13+64]; +ld.shared.f64 fd393, [r13+80]; +ld.shared.f64 fd394, [r13+96]; +ld.shared.f64 fd395, [r13+112]; +ld.shared.f64 fd396, [r13+128]; +ld.shared.f64 fd397, [r13+144]; +ld.shared.f64 fd398, [r13+160]; +ld.shared.f64 fd399, [r13+176]; +ld.shared.f64 fd400, [r13+192]; +ld.shared.f64 fd401, [r13+208]; +ld.shared.f64 fd402, [r13+224]; +ld.shared.f64 fd403, [r13+240]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, 
fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+16]; +ld.shared.f64 fd406, [r13+32]; +ld.shared.f64 fd407, [r13+48]; +ld.shared.f64 fd408, [r13+64]; +ld.shared.f64 fd409, [r13+80]; +ld.shared.f64 fd410, [r13+96]; +ld.shared.f64 fd411, [r13+112]; +ld.shared.f64 fd412, [r13+128]; +ld.shared.f64 fd413, [r13+144]; +ld.shared.f64 fd414, [r13+160]; +ld.shared.f64 fd415, [r13+176]; +ld.shared.f64 fd416, [r13+192]; +ld.shared.f64 fd417, [r13+208]; +ld.shared.f64 fd418, [r13+224]; +ld.shared.f64 fd419, [r13+240]; +add.f64 %0, fd388, fd396; +add.f64 %1, fd404, fd412; +add.f64 %2, fd389, fd397; +add.f64 %3, fd405, fd413; +add.f64 %4, fd390, fd398; +add.f64 %5, fd406, fd414; +add.f64 %6, fd391, fd399; +add.f64 %7, fd407, fd415; +add.f64 %8, fd392, fd400; +add.f64 %9, fd408, fd416; +add.f64 %10, fd393, fd401; +add.f64 %11, fd409, fd417; +add.f64 %12, fd394, fd402; +add.f64 %13, fd410, fd418; +add.f64 %14, fd395, fd403; +add.f64 %15, fd411, fd419; +sub.f64 %16, fd388, fd396; +sub.f64 %17, fd404, fd412; +sub.f64 %18, fd389, fd397; +sub.f64 %19, fd405, fd413; +sub.f64 %20, fd390, fd398; +sub.f64 %21, fd406, fd414; +sub.f64 %22, fd391, fd399; +sub.f64 %23, fd407, fd415; +sub.f64 %24, fd392, fd400; +sub.f64 %25, fd408, fd416; +sub.f64 %26, fd393, fd401; +sub.f64 %27, fd409, fd417; +sub.f64 %28, fd394, fd402; +sub.f64 %29, fd410, fd418; +sub.f64 %30, fd395, fd403; +sub.f64 %31, fd411, fd419; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), 
"=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..43f79bdb14d66 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_fwd.hpp.inc @@ -0,0 +1,5955 @@ +#ifndef CUFFTDX_FFT_343_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_343_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<920, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<2565>; +.reg .b64 rd<6>; +mov.u32 r2545, %tid.y; +mov.u32 r2546, %14; +mad.lo.s32 r2547, r2545, 2744, r2546; +mov.u32 r2548, %tid.x; +mov.f32 f90, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} +mov.f32 f78, 0fBE63DC87; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r3, {low, high}; +} +mov.f32 f80, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r4, {low, high}; +} +mov.f32 f86, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r5, {low, high}; +} +mov.f32 f88, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; +} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} 
+{ +add.f16x2 r111, %15, r108; +} +{ +add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ +mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ 
+mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, %26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 
r465, r462, r4; +} +{ +add.f16x2 r468, r459, r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; +} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ +sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, 
r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} +{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} +{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r2548, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r2549, rd3; +mul.lo.s32 r2550, r2549, 49; +sub.s32 r2551, r2548, r2550; +cvt.rn.f32.u32 f93, r2551; +mul.f32 f94, f93, 0f3C961050; +cos.approx.f32 f21, f94; +sin.approx.f32 f95, f94; +neg.f32 f22, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +mad.lo.s32 r2552, r2549, 2744, r2547; +barrier.sync 0; +mad.lo.s32 r2553, r2551, 56, r2552; +st.shared.v2.f32 [r2553], {r30, r48}; +st.shared.v2.f32 [r2553+8], {r711, r718}; +st.shared.v2.f32 [r2553+16], {r748, r755}; +st.shared.v2.f32 [r2553+24], {r785, r792}; +st.shared.v2.f32 [r2553+32], {r822, r829}; +st.shared.v2.f32 [r2553+40], {r859, r866}; +st.shared.v2.f32 [r2553+48], {r896, r903}; +barrier.sync 0; +mad.lo.s32 r2554, 
r2551, -48, r2553; +ld.shared.u32 r942, [r2554]; +ld.shared.u32 r960, [r2554+4]; +ld.shared.u32 r939, [r2554+392]; +ld.shared.u32 r957, [r2554+396]; +ld.shared.u32 r945, [r2554+784]; +ld.shared.u32 r963, [r2554+788]; +ld.shared.u32 r951, [r2554+1176]; +ld.shared.u32 r969, [r2554+1180]; +ld.shared.u32 r952, [r2554+1568]; +ld.shared.u32 r970, [r2554+1572]; +ld.shared.u32 r946, [r2554+1960]; +ld.shared.u32 r964, [r2554+1964]; +ld.shared.u32 r940, [r2554+2352]; +ld.shared.u32 r958, [r2554+2356]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 r953, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ 
+add.f16x2 r968, r969, r970; +} +{ +add.f16x2 r971, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ +mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 r1025, r998, r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 r1079, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, 
r1127; +} +{ +sub.f16x2 r1133, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; +} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 r1187, r1160, r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ +sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, 
r1289; +} +{ +add.f16x2 r1295, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} +{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 r1349, r1322, r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, 
r1451; +} +{ +add.f16x2 r1457, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ +mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, r1484, r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 r1565, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ +mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, 
r1613; +} +{ +sub.f16x2 r1619, r1592, r1616; +} +mul.wide.u32 rd4, r2551, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r2555, rd5; +sub.s32 r2556, r2551, r2555; +shr.u32 r2557, r2556, 1; +add.s32 r2558, r2557, r2555; +shr.u32 r2559, r2558, 2; +cvt.rn.f32.u32 f96, r2559; +mul.f32 f97, f96, 0f3E034E46; +cos.approx.f32 f57, f97; +sin.approx.f32 f98, f97; +neg.f32 f58, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1622, {low, high}; +} +mul.lo.s32 r2560, r2559, 7; +sub.s32 r2561, r2551, r2560; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1625, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1627, {high, high}; +} +{ +mul.f16x2 r1629, r1349, r1627; +} +{ +neg.f16x2 r1632, r1629; +} +{ +fma.rn.f16x2 r1634, r1025, r1625, r1632; +} +{ +mul.f16x2 r1638, r1025, r1627; +} +{ +fma.rn.f16x2 r1641, r1349, r1625, r1638; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1647, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1649, {low, high}; +} +{ +mul.f16x2 r1650, r1647, r1649; +} +{ +mul.f16x2 r1653, r1622, r1645; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1656, {high, low}; +} +{ +fma.rn.f16x2 r1658, r1650, r1656, r1653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1662, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1664, {high, high}; +} +{ +mul.f16x2 r1666, r1457, r1664; +} +{ +neg.f16x2 r1669, r1666; +} +{ +fma.rn.f16x2 r1671, r1133, r1662, r1669; +} +{ +mul.f16x2 r1675, r1133, r1664; +} +{ +fma.rn.f16x2 r1678, r1457, r1662, r1675; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1684, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1686, {low, high}; +} +{ +mul.f16x2 r1687, r1684, r1686; +} +{ +mul.f16x2 r1690, r1658, r1682; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1693, {high, low}; +} +{ +fma.rn.f16x2 r1695, r1687, r1693, r1690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1699, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1701, {high, high}; +} +{ +mul.f16x2 r1703, r1565, r1701; +} +{ +neg.f16x2 r1706, r1703; +} +{ +fma.rn.f16x2 r1708, r1241, r1699, r1706; +} +{ +mul.f16x2 r1712, r1241, r1701; +} +{ +fma.rn.f16x2 r1715, r1565, r1699, r1712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1721, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1723, {low, high}; +} +{ +mul.f16x2 r1724, r1721, r1723; +} +{ +mul.f16x2 r1727, r1695, r1719; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1730, {high, low}; +} +{ +fma.rn.f16x2 r1732, r1724, r1730, r1727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1738, {high, high}; +} +{ +mul.f16x2 r1740, r1619, r1738; +} +{ +neg.f16x2 r1743, r1740; +} +{ +fma.rn.f16x2 r1745, r1295, r1736, r1743; +} +{ +mul.f16x2 r1749, r1295, r1738; +} +{ +fma.rn.f16x2 r1752, r1619, r1736, r1749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1758, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1758, r1760; +} +{ +mul.f16x2 r1764, r1732, r1756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1767, {high, low}; +} +{ +fma.rn.f16x2 r1769, r1761, 
r1767, r1764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1775, {high, high}; +} +{ +mul.f16x2 r1777, r1511, r1775; +} +{ +neg.f16x2 r1780, r1777; +} +{ +fma.rn.f16x2 r1782, r1187, r1773, r1780; +} +{ +mul.f16x2 r1786, r1187, r1775; +} +{ +fma.rn.f16x2 r1789, r1511, r1773, r1786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1795, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1795, r1797; +} +{ +mul.f16x2 r1801, r1769, r1793; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1804, {high, low}; +} +{ +fma.rn.f16x2 r1806, r1798, r1804, r1801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1812, {high, high}; +} +{ +mul.f16x2 r1814, r1403, r1812; +} +{ +neg.f16x2 r1817, r1814; +} +{ +fma.rn.f16x2 r1819, r1079, r1810, r1817; +} +{ +mul.f16x2 r1823, r1079, r1812; +} +{ +fma.rn.f16x2 r1826, r1403, r1810, r1823; +} +shl.b32 r2562, r2561, 3; +add.s32 r2563, r2552, r2562; +barrier.sync 0; +mad.lo.s32 r2564, r2559, 392, r2563; +st.shared.u32 [r2564], r953; +st.shared.u32 [r2564+4], r971; +st.shared.u32 [r2564+56], r1634; +st.shared.u32 [r2564+60], r1641; +st.shared.u32 [r2564+112], r1671; +st.shared.u32 [r2564+116], r1678; +st.shared.u32 [r2564+168], r1708; +st.shared.u32 [r2564+172], r1715; +st.shared.u32 [r2564+224], r1745; +st.shared.u32 [r2564+228], r1752; +st.shared.u32 [r2564+280], r1782; +st.shared.u32 [r2564+284], r1789; +st.shared.u32 [r2564+336], r1819; +st.shared.u32 [r2564+340], r1826; +barrier.sync 0; +ld.shared.u32 r1865, [r2554]; +ld.shared.u32 r1883, [r2554+4]; +ld.shared.u32 r1862, [r2554+392]; +ld.shared.u32 r1880, 
[r2554+396]; +ld.shared.u32 r1868, [r2554+784]; +ld.shared.u32 r1886, [r2554+788]; +ld.shared.u32 r1874, [r2554+1176]; +ld.shared.u32 r1892, [r2554+1180]; +ld.shared.u32 r1875, [r2554+1568]; +ld.shared.u32 r1893, [r2554+1572]; +ld.shared.u32 r1869, [r2554+1960]; +ld.shared.u32 r1887, [r2554+1964]; +ld.shared.u32 r1863, [r2554+2352]; +ld.shared.u32 r1881, [r2554+2356]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1848, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1849, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1854, {low, high}; +} +{ +neg.f16x2 r1855, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1858, {low, high}; +} +{ +neg.f16x2 r1859, r1858; +} +{ +add.f16x2 r1861, r1862, r1863; +} +{ +add.f16x2 r1864, r1865, r1861; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1864, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 %0, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1882, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 %1, r1888, r1891; +} +{ +add.f16x2 
r1897, r1862, r1863; +} +{ +mul.f16x2 r1900, r1897, r1847; +} +{ +add.f16x2 r1903, r1865, r1900; +} +{ +add.f16x2 r1906, r1868, r1869; +} +{ +mul.f16x2 r1909, r1906, r1849; +} +{ +add.f16x2 r1912, r1903, r1909; +} +{ +add.f16x2 r1915, r1874, r1875; +} +{ +mul.f16x2 r1918, r1915, r1851; +} +{ +add.f16x2 r1921, r1912, r1918; +} +{ +sub.f16x2 r1924, r1880, r1881; +} +{ +mul.f16x2 r1927, r1924, r1848; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1850; +} +{ +add.f16x2 r1936, r1927, r1933; +} +{ +sub.f16x2 r1939, r1892, r1893; +} +{ +mul.f16x2 r1942, r1939, r1852; +} +{ +add.f16x2 r1945, r1936, r1942; +} +{ +sub.f16x2 %2, r1921, r1945; +} +{ +add.f16x2 r1951, r1862, r1863; +} +{ +mul.f16x2 r1954, r1951, r1847; +} +{ +add.f16x2 r1957, r1865, r1954; +} +{ +add.f16x2 r1960, r1868, r1869; +} +{ +mul.f16x2 r1963, r1960, r1849; +} +{ +add.f16x2 r1966, r1957, r1963; +} +{ +add.f16x2 r1969, r1874, r1875; +} +{ +mul.f16x2 r1972, r1969, r1851; +} +{ +add.f16x2 r1975, r1966, r1972; +} +{ +sub.f16x2 r1978, r1880, r1881; +} +{ +mul.f16x2 r1981, r1978, r1848; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1850; +} +{ +add.f16x2 r1990, r1981, r1987; +} +{ +sub.f16x2 r1993, r1892, r1893; +} +{ +mul.f16x2 r1996, r1993, r1852; +} +{ +add.f16x2 r1999, r1990, r1996; +} +{ +add.f16x2 %12, r1975, r1999; +} +{ +add.f16x2 r2005, r1862, r1863; +} +{ +mul.f16x2 r2008, r2005, r1849; +} +{ +add.f16x2 r2011, r1865, r2008; +} +{ +add.f16x2 r2014, r1868, r1869; +} +{ +mul.f16x2 r2017, r2014, r1853; +} +{ +add.f16x2 r2020, r2011, r2017; +} +{ +add.f16x2 r2023, r1874, r1875; +} +{ +mul.f16x2 r2026, r2023, r1857; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 r2032, r1880, r1881; +} +{ +mul.f16x2 r2035, r2032, r1850; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1855; +} +{ +add.f16x2 r2044, r2035, r2041; +} +{ +sub.f16x2 r2047, r1892, r1893; +} +{ +mul.f16x2 r2050, r2047, r1859; +} +{ +add.f16x2 r2053, r2044, r2050; +} +{ 
+sub.f16x2 %4, r2029, r2053; +} +{ +add.f16x2 r2059, r1862, r1863; +} +{ +mul.f16x2 r2062, r2059, r1849; +} +{ +add.f16x2 r2065, r1865, r2062; +} +{ +add.f16x2 r2068, r1868, r1869; +} +{ +mul.f16x2 r2071, r2068, r1853; +} +{ +add.f16x2 r2074, r2065, r2071; +} +{ +add.f16x2 r2077, r1874, r1875; +} +{ +mul.f16x2 r2080, r2077, r1857; +} +{ +add.f16x2 r2083, r2074, r2080; +} +{ +sub.f16x2 r2086, r1880, r1881; +} +{ +mul.f16x2 r2089, r2086, r1850; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1855; +} +{ +add.f16x2 r2098, r2089, r2095; +} +{ +sub.f16x2 r2101, r1892, r1893; +} +{ +mul.f16x2 r2104, r2101, r1859; +} +{ +add.f16x2 r2107, r2098, r2104; +} +{ +add.f16x2 %10, r2083, r2107; +} +{ +add.f16x2 r2113, r1862, r1863; +} +{ +mul.f16x2 r2116, r2113, r1851; +} +{ +add.f16x2 r2119, r1865, r2116; +} +{ +add.f16x2 r2122, r1868, r1869; +} +{ +mul.f16x2 r2125, r2122, r1857; +} +{ +add.f16x2 r2128, r2119, r2125; +} +{ +add.f16x2 r2131, r1874, r1875; +} +{ +mul.f16x2 r2134, r2131, r1849; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 r2140, r1880, r1881; +} +{ +mul.f16x2 r2143, r2140, r1852; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1859; +} +{ +add.f16x2 r2152, r2143, r2149; +} +{ +sub.f16x2 r2155, r1892, r1893; +} +{ +mul.f16x2 r2158, r2155, r1850; +} +{ +add.f16x2 r2161, r2152, r2158; +} +{ +sub.f16x2 %6, r2137, r2161; +} +{ +add.f16x2 r2167, r1862, r1863; +} +{ +mul.f16x2 r2170, r2167, r1851; +} +{ +add.f16x2 r2173, r1865, r2170; +} +{ +add.f16x2 r2176, r1868, r1869; +} +{ +mul.f16x2 r2179, r2176, r1857; +} +{ +add.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r1874, r1875; +} +{ +mul.f16x2 r2188, r2185, r1849; +} +{ +add.f16x2 r2191, r2182, r2188; +} +{ +sub.f16x2 r2194, r1880, r1881; +} +{ +mul.f16x2 r2197, r2194, r1852; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1859; +} +{ +add.f16x2 r2206, r2197, r2203; +} +{ +sub.f16x2 r2209, r1892, r1893; +} +{ +mul.f16x2 r2212, r2209, r1850; 
+} +{ +add.f16x2 r2215, r2206, r2212; +} +{ +add.f16x2 %8, r2191, r2215; +} +{ +add.f16x2 r2221, r1880, r1881; +} +{ +mul.f16x2 r2224, r2221, r1847; +} +{ +add.f16x2 r2227, r1883, r2224; +} +{ +add.f16x2 r2230, r1886, r1887; +} +{ +mul.f16x2 r2233, r2230, r1849; +} +{ +add.f16x2 r2236, r2227, r2233; +} +{ +add.f16x2 r2239, r1892, r1893; +} +{ +mul.f16x2 r2242, r2239, r1851; +} +{ +add.f16x2 r2245, r2236, r2242; +} +{ +sub.f16x2 r2248, r1862, r1863; +} +{ +mul.f16x2 r2251, r2248, r1848; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1850; +} +{ +add.f16x2 r2260, r2251, r2257; +} +{ +sub.f16x2 r2263, r1874, r1875; +} +{ +mul.f16x2 r2266, r2263, r1852; +} +{ +add.f16x2 r2269, r2260, r2266; +} +{ +add.f16x2 %3, r2245, r2269; +} +{ +add.f16x2 r2275, r1880, r1881; +} +{ +mul.f16x2 r2278, r2275, r1847; +} +{ +add.f16x2 r2281, r1883, r2278; +} +{ +add.f16x2 r2284, r1886, r1887; +} +{ +mul.f16x2 r2287, r2284, r1849; +} +{ +add.f16x2 r2290, r2281, r2287; +} +{ +add.f16x2 r2293, r1892, r1893; +} +{ +mul.f16x2 r2296, r2293, r1851; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r1862, r1863; +} +{ +mul.f16x2 r2305, r2302, r1848; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1850; +} +{ +add.f16x2 r2314, r2305, r2311; +} +{ +sub.f16x2 r2317, r1874, r1875; +} +{ +mul.f16x2 r2320, r2317, r1852; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 %13, r2299, r2323; +} +{ +add.f16x2 r2329, r1880, r1881; +} +{ +mul.f16x2 r2332, r2329, r1849; +} +{ +add.f16x2 r2335, r1883, r2332; +} +{ +add.f16x2 r2338, r1886, r1887; +} +{ +mul.f16x2 r2341, r2338, r1853; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +add.f16x2 r2347, r1892, r1893; +} +{ +mul.f16x2 r2350, r2347, r1857; +} +{ +add.f16x2 r2353, r2344, r2350; +} +{ +sub.f16x2 r2356, r1862, r1863; +} +{ +mul.f16x2 r2359, r2356, r1850; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1855; +} +{ +add.f16x2 r2368, r2359, r2365; +} +{ +sub.f16x2 r2371, r1874, 
r1875; +} +{ +mul.f16x2 r2374, r2371, r1859; +} +{ +add.f16x2 r2377, r2368, r2374; +} +{ +add.f16x2 %5, r2353, r2377; +} +{ +add.f16x2 r2383, r1880, r1881; +} +{ +mul.f16x2 r2386, r2383, r1849; +} +{ +add.f16x2 r2389, r1883, r2386; +} +{ +add.f16x2 r2392, r1886, r1887; +} +{ +mul.f16x2 r2395, r2392, r1853; +} +{ +add.f16x2 r2398, r2389, r2395; +} +{ +add.f16x2 r2401, r1892, r1893; +} +{ +mul.f16x2 r2404, r2401, r1857; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +sub.f16x2 r2410, r1862, r1863; +} +{ +mul.f16x2 r2413, r2410, r1850; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1855; +} +{ +add.f16x2 r2422, r2413, r2419; +} +{ +sub.f16x2 r2425, r1874, r1875; +} +{ +mul.f16x2 r2428, r2425, r1859; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +sub.f16x2 %11, r2407, r2431; +} +{ +add.f16x2 r2437, r1880, r1881; +} +{ +mul.f16x2 r2440, r2437, r1851; +} +{ +add.f16x2 r2443, r1883, r2440; +} +{ +add.f16x2 r2446, r1886, r1887; +} +{ +mul.f16x2 r2449, r2446, r1857; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +add.f16x2 r2455, r1892, r1893; +} +{ +mul.f16x2 r2458, r2455, r1849; +} +{ +add.f16x2 r2461, r2452, r2458; +} +{ +sub.f16x2 r2464, r1862, r1863; +} +{ +mul.f16x2 r2467, r2464, r1852; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1859; +} +{ +add.f16x2 r2476, r2467, r2473; +} +{ +sub.f16x2 r2479, r1874, r1875; +} +{ +mul.f16x2 r2482, r2479, r1850; +} +{ +add.f16x2 r2485, r2476, r2482; +} +{ +add.f16x2 %7, r2461, r2485; +} +{ +add.f16x2 r2491, r1880, r1881; +} +{ +mul.f16x2 r2494, r2491, r1851; +} +{ +add.f16x2 r2497, r1883, r2494; +} +{ +add.f16x2 r2500, r1886, r1887; +} +{ +mul.f16x2 r2503, r2500, r1857; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1892, r1893; +} +{ +mul.f16x2 r2512, r2509, r1849; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +sub.f16x2 r2518, r1862, r1863; +} +{ +mul.f16x2 r2521, r2518, r1852; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1859; +} +{ +add.f16x2 r2530, 
r2521, r2527; +} +{ +sub.f16x2 r2533, r1874, r1875; +} +{ +mul.f16x2 r2536, r2533, r1850; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 %9, r2515, r2539; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<921, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<2565>; +.reg .b64 rd<6>; +mov.u32 r2545, %tid.y; +mov.u32 r2546, %14; +mad.lo.s32 r2547, r2545, 1372, r2546; +mov.u32 r2548, %tid.x; +mov.f32 f90, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} +mov.f32 f78, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r3, {low, high}; +} +mov.f32 f80, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r4, {low, high}; +} +mov.f32 f86, 0fBF66A5E5; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r5, {low, high}; +} +mov.f32 f88, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; +} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} +{ +add.f16x2 r111, %15, r108; +} +{ +add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ 
+mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 
r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, %26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 r465, r462, r4; +} +{ +add.f16x2 r468, r459, r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, 
r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; +} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ +sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} +{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} 
+{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r2548, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r2549, rd3; +mul.lo.s32 r2550, r2549, 49; +sub.s32 r2551, r2548, r2550; +cvt.rn.f32.u32 f93, r2551; +mul.f32 f94, f93, 0f3C961050; +cos.approx.f32 f21, f94; +sin.approx.f32 f95, f94; +neg.f32 f22, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +mad.lo.s32 r2552, r2549, 1372, r2547; +barrier.sync 0; +mad.lo.s32 r2553, r2551, 28, r2552; +st.shared.u32 [r2553], r30; +st.shared.u32 [r2553+4], r711; +st.shared.u32 [r2553+8], r748; +st.shared.u32 [r2553+12], r785; +st.shared.u32 [r2553+16], r822; +st.shared.u32 [r2553+20], r859; +st.shared.u32 [r2553+24], r896; +barrier.sync 0; +mad.lo.s32 r2554, r2551, -24, r2553; +ld.shared.u32 r942, [r2554]; +ld.shared.u32 r939, [r2554+196]; +ld.shared.u32 r945, [r2554+392]; +ld.shared.u32 r951, [r2554+588]; +ld.shared.u32 r952, [r2554+784]; +ld.shared.u32 r946, [r2554+980]; +ld.shared.u32 r940, [r2554+1176]; +barrier.sync 0; +st.shared.u32 [r2553], r48; +st.shared.u32 [r2553+4], r718; +st.shared.u32 [r2553+8], 
r755; +st.shared.u32 [r2553+12], r792; +st.shared.u32 [r2553+16], r829; +st.shared.u32 [r2553+20], r866; +st.shared.u32 [r2553+24], r903; +barrier.sync 0; +ld.shared.u32 r960, [r2554]; +ld.shared.u32 r957, [r2554+196]; +ld.shared.u32 r963, [r2554+392]; +ld.shared.u32 r969, [r2554+588]; +ld.shared.u32 r970, [r2554+784]; +ld.shared.u32 r964, [r2554+980]; +ld.shared.u32 r958, [r2554+1176]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 r953, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ +add.f16x2 r968, r969, r970; +} +{ +add.f16x2 r971, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ 
+mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 r1025, r998, r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 r1079, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, r1127; +} +{ +sub.f16x2 r1133, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, 
r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; +} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 r1187, r1160, r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 r1241, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ +sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, r1289; +} +{ +add.f16x2 r1295, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, 
r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} +{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 r1349, r1322, r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 r1457, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, 
r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ +mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, r1484, r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 r1565, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ +mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, r1613; +} +{ +sub.f16x2 r1619, r1592, r1616; +} +mul.wide.u32 rd4, r2551, 613566757; +shr.u64 rd5, rd4, 32; 
+cvt.u32.u64 r2555, rd5; +sub.s32 r2556, r2551, r2555; +shr.u32 r2557, r2556, 1; +add.s32 r2558, r2557, r2555; +shr.u32 r2559, r2558, 2; +cvt.rn.f32.u32 f96, r2559; +mul.f32 f97, f96, 0f3E034E46; +cos.approx.f32 f57, f97; +sin.approx.f32 f98, f97; +neg.f32 f58, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1622, {low, high}; +} +mul.lo.s32 r2560, r2559, 7; +sub.s32 r2561, r2551, r2560; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1625, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1627, {high, high}; +} +{ +mul.f16x2 r1629, r1349, r1627; +} +{ +neg.f16x2 r1632, r1629; +} +{ +fma.rn.f16x2 r1634, r1025, r1625, r1632; +} +{ +mul.f16x2 r1638, r1025, r1627; +} +{ +fma.rn.f16x2 r1641, r1349, r1625, r1638; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1645, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1647, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1649, {low, high}; +} +{ +mul.f16x2 r1650, r1647, r1649; +} +{ +mul.f16x2 r1653, r1622, r1645; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1656, {high, low}; +} +{ +fma.rn.f16x2 r1658, r1650, r1656, r1653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1662, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1664, {high, high}; +} +{ +mul.f16x2 r1666, r1457, r1664; +} +{ +neg.f16x2 r1669, r1666; +} +{ +fma.rn.f16x2 r1671, r1133, r1662, r1669; +} +{ +mul.f16x2 r1675, r1133, r1664; +} +{ +fma.rn.f16x2 r1678, r1457, r1662, r1675; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1682, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1684, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1686, {low, high}; +} +{ +mul.f16x2 r1687, r1684, r1686; +} 
+{ +mul.f16x2 r1690, r1658, r1682; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1658; +mov.b32 r1693, {high, low}; +} +{ +fma.rn.f16x2 r1695, r1687, r1693, r1690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1699, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1701, {high, high}; +} +{ +mul.f16x2 r1703, r1565, r1701; +} +{ +neg.f16x2 r1706, r1703; +} +{ +fma.rn.f16x2 r1708, r1241, r1699, r1706; +} +{ +mul.f16x2 r1712, r1241, r1701; +} +{ +fma.rn.f16x2 r1715, r1565, r1699, r1712; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1719, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1721, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1723, {low, high}; +} +{ +mul.f16x2 r1724, r1721, r1723; +} +{ +mul.f16x2 r1727, r1695, r1719; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1695; +mov.b32 r1730, {high, low}; +} +{ +fma.rn.f16x2 r1732, r1724, r1730, r1727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1738, {high, high}; +} +{ +mul.f16x2 r1740, r1619, r1738; +} +{ +neg.f16x2 r1743, r1740; +} +{ +fma.rn.f16x2 r1745, r1295, r1736, r1743; +} +{ +mul.f16x2 r1749, r1295, r1738; +} +{ +fma.rn.f16x2 r1752, r1619, r1736, r1749; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1756, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1758, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1760, {low, high}; +} +{ +mul.f16x2 r1761, r1758, r1760; +} +{ +mul.f16x2 r1764, r1732, r1756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1732; +mov.b32 r1767, {high, low}; +} +{ +fma.rn.f16x2 r1769, r1761, r1767, r1764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1773, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1775, {high, high}; +} +{ +mul.f16x2 r1777, r1511, r1775; +} +{ +neg.f16x2 r1780, r1777; +} +{ +fma.rn.f16x2 r1782, r1187, r1773, r1780; +} +{ +mul.f16x2 r1786, r1187, r1775; +} +{ +fma.rn.f16x2 r1789, r1511, r1773, r1786; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1793, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1622; +mov.b32 r1795, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1797, {low, high}; +} +{ +mul.f16x2 r1798, r1795, r1797; +} +{ +mul.f16x2 r1801, r1769, r1793; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1769; +mov.b32 r1804, {high, low}; +} +{ +fma.rn.f16x2 r1806, r1798, r1804, r1801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1806; +mov.b32 r1812, {high, high}; +} +{ +mul.f16x2 r1814, r1403, r1812; +} +{ +neg.f16x2 r1817, r1814; +} +{ +fma.rn.f16x2 r1819, r1079, r1810, r1817; +} +{ +mul.f16x2 r1823, r1079, r1812; +} +{ +fma.rn.f16x2 r1826, r1403, r1810, r1823; +} +shl.b32 r2562, r2561, 2; +add.s32 r2563, r2552, r2562; +barrier.sync 0; +mad.lo.s32 r2564, r2559, 196, r2563; +st.shared.u32 [r2564], r953; +st.shared.u32 [r2564+28], r1634; +st.shared.u32 [r2564+56], r1671; +st.shared.u32 [r2564+84], r1708; +st.shared.u32 [r2564+112], r1745; +st.shared.u32 [r2564+140], r1782; +st.shared.u32 [r2564+168], r1819; +barrier.sync 0; +ld.shared.u32 r1865, [r2554]; +ld.shared.u32 r1862, [r2554+196]; +ld.shared.u32 r1868, [r2554+392]; +ld.shared.u32 r1874, [r2554+588]; +ld.shared.u32 r1875, [r2554+784]; +ld.shared.u32 r1869, [r2554+980]; +ld.shared.u32 r1863, [r2554+1176]; +barrier.sync 0; +st.shared.u32 [r2564], r971; +st.shared.u32 [r2564+28], r1641; +st.shared.u32 [r2564+56], r1678; +st.shared.u32 [r2564+84], r1715; +st.shared.u32 [r2564+112], r1752; +st.shared.u32 [r2564+140], r1789; +st.shared.u32 
[r2564+168], r1826; +barrier.sync 0; +ld.shared.u32 r1883, [r2554]; +ld.shared.u32 r1880, [r2554+196]; +ld.shared.u32 r1886, [r2554+392]; +ld.shared.u32 r1892, [r2554+588]; +ld.shared.u32 r1893, [r2554+784]; +ld.shared.u32 r1887, [r2554+980]; +ld.shared.u32 r1881, [r2554+1176]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1848, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1849, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1850, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1853, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1854, {low, high}; +} +{ +neg.f16x2 r1855, r1854; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1858, {low, high}; +} +{ +neg.f16x2 r1859, r1858; +} +{ +add.f16x2 r1861, r1862, r1863; +} +{ +add.f16x2 r1864, r1865, r1861; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1864, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 %0, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1882, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 %1, r1888, r1891; +} +{ +add.f16x2 r1897, r1862, r1863; +} +{ +mul.f16x2 r1900, r1897, r1847; +} +{ +add.f16x2 r1903, r1865, 
r1900; +} +{ +add.f16x2 r1906, r1868, r1869; +} +{ +mul.f16x2 r1909, r1906, r1849; +} +{ +add.f16x2 r1912, r1903, r1909; +} +{ +add.f16x2 r1915, r1874, r1875; +} +{ +mul.f16x2 r1918, r1915, r1851; +} +{ +add.f16x2 r1921, r1912, r1918; +} +{ +sub.f16x2 r1924, r1880, r1881; +} +{ +mul.f16x2 r1927, r1924, r1848; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1850; +} +{ +add.f16x2 r1936, r1927, r1933; +} +{ +sub.f16x2 r1939, r1892, r1893; +} +{ +mul.f16x2 r1942, r1939, r1852; +} +{ +add.f16x2 r1945, r1936, r1942; +} +{ +sub.f16x2 %2, r1921, r1945; +} +{ +add.f16x2 r1951, r1862, r1863; +} +{ +mul.f16x2 r1954, r1951, r1847; +} +{ +add.f16x2 r1957, r1865, r1954; +} +{ +add.f16x2 r1960, r1868, r1869; +} +{ +mul.f16x2 r1963, r1960, r1849; +} +{ +add.f16x2 r1966, r1957, r1963; +} +{ +add.f16x2 r1969, r1874, r1875; +} +{ +mul.f16x2 r1972, r1969, r1851; +} +{ +add.f16x2 r1975, r1966, r1972; +} +{ +sub.f16x2 r1978, r1880, r1881; +} +{ +mul.f16x2 r1981, r1978, r1848; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1850; +} +{ +add.f16x2 r1990, r1981, r1987; +} +{ +sub.f16x2 r1993, r1892, r1893; +} +{ +mul.f16x2 r1996, r1993, r1852; +} +{ +add.f16x2 r1999, r1990, r1996; +} +{ +add.f16x2 %12, r1975, r1999; +} +{ +add.f16x2 r2005, r1862, r1863; +} +{ +mul.f16x2 r2008, r2005, r1849; +} +{ +add.f16x2 r2011, r1865, r2008; +} +{ +add.f16x2 r2014, r1868, r1869; +} +{ +mul.f16x2 r2017, r2014, r1853; +} +{ +add.f16x2 r2020, r2011, r2017; +} +{ +add.f16x2 r2023, r1874, r1875; +} +{ +mul.f16x2 r2026, r2023, r1857; +} +{ +add.f16x2 r2029, r2020, r2026; +} +{ +sub.f16x2 r2032, r1880, r1881; +} +{ +mul.f16x2 r2035, r2032, r1850; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1855; +} +{ +add.f16x2 r2044, r2035, r2041; +} +{ +sub.f16x2 r2047, r1892, r1893; +} +{ +mul.f16x2 r2050, r2047, r1859; +} +{ +add.f16x2 r2053, r2044, r2050; +} +{ +sub.f16x2 %4, r2029, r2053; +} +{ +add.f16x2 r2059, r1862, r1863; +} +{ +mul.f16x2 r2062, 
r2059, r1849; +} +{ +add.f16x2 r2065, r1865, r2062; +} +{ +add.f16x2 r2068, r1868, r1869; +} +{ +mul.f16x2 r2071, r2068, r1853; +} +{ +add.f16x2 r2074, r2065, r2071; +} +{ +add.f16x2 r2077, r1874, r1875; +} +{ +mul.f16x2 r2080, r2077, r1857; +} +{ +add.f16x2 r2083, r2074, r2080; +} +{ +sub.f16x2 r2086, r1880, r1881; +} +{ +mul.f16x2 r2089, r2086, r1850; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1855; +} +{ +add.f16x2 r2098, r2089, r2095; +} +{ +sub.f16x2 r2101, r1892, r1893; +} +{ +mul.f16x2 r2104, r2101, r1859; +} +{ +add.f16x2 r2107, r2098, r2104; +} +{ +add.f16x2 %10, r2083, r2107; +} +{ +add.f16x2 r2113, r1862, r1863; +} +{ +mul.f16x2 r2116, r2113, r1851; +} +{ +add.f16x2 r2119, r1865, r2116; +} +{ +add.f16x2 r2122, r1868, r1869; +} +{ +mul.f16x2 r2125, r2122, r1857; +} +{ +add.f16x2 r2128, r2119, r2125; +} +{ +add.f16x2 r2131, r1874, r1875; +} +{ +mul.f16x2 r2134, r2131, r1849; +} +{ +add.f16x2 r2137, r2128, r2134; +} +{ +sub.f16x2 r2140, r1880, r1881; +} +{ +mul.f16x2 r2143, r2140, r1852; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1859; +} +{ +add.f16x2 r2152, r2143, r2149; +} +{ +sub.f16x2 r2155, r1892, r1893; +} +{ +mul.f16x2 r2158, r2155, r1850; +} +{ +add.f16x2 r2161, r2152, r2158; +} +{ +sub.f16x2 %6, r2137, r2161; +} +{ +add.f16x2 r2167, r1862, r1863; +} +{ +mul.f16x2 r2170, r2167, r1851; +} +{ +add.f16x2 r2173, r1865, r2170; +} +{ +add.f16x2 r2176, r1868, r1869; +} +{ +mul.f16x2 r2179, r2176, r1857; +} +{ +add.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r1874, r1875; +} +{ +mul.f16x2 r2188, r2185, r1849; +} +{ +add.f16x2 r2191, r2182, r2188; +} +{ +sub.f16x2 r2194, r1880, r1881; +} +{ +mul.f16x2 r2197, r2194, r1852; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1859; +} +{ +add.f16x2 r2206, r2197, r2203; +} +{ +sub.f16x2 r2209, r1892, r1893; +} +{ +mul.f16x2 r2212, r2209, r1850; +} +{ +add.f16x2 r2215, r2206, r2212; +} +{ +add.f16x2 %8, r2191, r2215; +} +{ +add.f16x2 
r2221, r1880, r1881; +} +{ +mul.f16x2 r2224, r2221, r1847; +} +{ +add.f16x2 r2227, r1883, r2224; +} +{ +add.f16x2 r2230, r1886, r1887; +} +{ +mul.f16x2 r2233, r2230, r1849; +} +{ +add.f16x2 r2236, r2227, r2233; +} +{ +add.f16x2 r2239, r1892, r1893; +} +{ +mul.f16x2 r2242, r2239, r1851; +} +{ +add.f16x2 r2245, r2236, r2242; +} +{ +sub.f16x2 r2248, r1862, r1863; +} +{ +mul.f16x2 r2251, r2248, r1848; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1850; +} +{ +add.f16x2 r2260, r2251, r2257; +} +{ +sub.f16x2 r2263, r1874, r1875; +} +{ +mul.f16x2 r2266, r2263, r1852; +} +{ +add.f16x2 r2269, r2260, r2266; +} +{ +add.f16x2 %3, r2245, r2269; +} +{ +add.f16x2 r2275, r1880, r1881; +} +{ +mul.f16x2 r2278, r2275, r1847; +} +{ +add.f16x2 r2281, r1883, r2278; +} +{ +add.f16x2 r2284, r1886, r1887; +} +{ +mul.f16x2 r2287, r2284, r1849; +} +{ +add.f16x2 r2290, r2281, r2287; +} +{ +add.f16x2 r2293, r1892, r1893; +} +{ +mul.f16x2 r2296, r2293, r1851; +} +{ +add.f16x2 r2299, r2290, r2296; +} +{ +sub.f16x2 r2302, r1862, r1863; +} +{ +mul.f16x2 r2305, r2302, r1848; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1850; +} +{ +add.f16x2 r2314, r2305, r2311; +} +{ +sub.f16x2 r2317, r1874, r1875; +} +{ +mul.f16x2 r2320, r2317, r1852; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +sub.f16x2 %13, r2299, r2323; +} +{ +add.f16x2 r2329, r1880, r1881; +} +{ +mul.f16x2 r2332, r2329, r1849; +} +{ +add.f16x2 r2335, r1883, r2332; +} +{ +add.f16x2 r2338, r1886, r1887; +} +{ +mul.f16x2 r2341, r2338, r1853; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +add.f16x2 r2347, r1892, r1893; +} +{ +mul.f16x2 r2350, r2347, r1857; +} +{ +add.f16x2 r2353, r2344, r2350; +} +{ +sub.f16x2 r2356, r1862, r1863; +} +{ +mul.f16x2 r2359, r2356, r1850; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1855; +} +{ +add.f16x2 r2368, r2359, r2365; +} +{ +sub.f16x2 r2371, r1874, r1875; +} +{ +mul.f16x2 r2374, r2371, r1859; +} +{ +add.f16x2 r2377, r2368, r2374; +} +{ 
+add.f16x2 %5, r2353, r2377; +} +{ +add.f16x2 r2383, r1880, r1881; +} +{ +mul.f16x2 r2386, r2383, r1849; +} +{ +add.f16x2 r2389, r1883, r2386; +} +{ +add.f16x2 r2392, r1886, r1887; +} +{ +mul.f16x2 r2395, r2392, r1853; +} +{ +add.f16x2 r2398, r2389, r2395; +} +{ +add.f16x2 r2401, r1892, r1893; +} +{ +mul.f16x2 r2404, r2401, r1857; +} +{ +add.f16x2 r2407, r2398, r2404; +} +{ +sub.f16x2 r2410, r1862, r1863; +} +{ +mul.f16x2 r2413, r2410, r1850; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1855; +} +{ +add.f16x2 r2422, r2413, r2419; +} +{ +sub.f16x2 r2425, r1874, r1875; +} +{ +mul.f16x2 r2428, r2425, r1859; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +sub.f16x2 %11, r2407, r2431; +} +{ +add.f16x2 r2437, r1880, r1881; +} +{ +mul.f16x2 r2440, r2437, r1851; +} +{ +add.f16x2 r2443, r1883, r2440; +} +{ +add.f16x2 r2446, r1886, r1887; +} +{ +mul.f16x2 r2449, r2446, r1857; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +add.f16x2 r2455, r1892, r1893; +} +{ +mul.f16x2 r2458, r2455, r1849; +} +{ +add.f16x2 r2461, r2452, r2458; +} +{ +sub.f16x2 r2464, r1862, r1863; +} +{ +mul.f16x2 r2467, r2464, r1852; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1859; +} +{ +add.f16x2 r2476, r2467, r2473; +} +{ +sub.f16x2 r2479, r1874, r1875; +} +{ +mul.f16x2 r2482, r2479, r1850; +} +{ +add.f16x2 r2485, r2476, r2482; +} +{ +add.f16x2 %7, r2461, r2485; +} +{ +add.f16x2 r2491, r1880, r1881; +} +{ +mul.f16x2 r2494, r2491, r1851; +} +{ +add.f16x2 r2497, r1883, r2494; +} +{ +add.f16x2 r2500, r1886, r1887; +} +{ +mul.f16x2 r2503, r2500, r1857; +} +{ +add.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1892, r1893; +} +{ +mul.f16x2 r2512, r2509, r1849; +} +{ +add.f16x2 r2515, r2506, r2512; +} +{ +sub.f16x2 r2518, r1862, r1863; +} +{ +mul.f16x2 r2521, r2518, r1852; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1859; +} +{ +add.f16x2 r2530, r2521, r2527; +} +{ +sub.f16x2 r2533, r1874, r1875; +} +{ +mul.f16x2 r2536, r2533, r1850; 
+} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 %9, r2515, r2539; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..2691114a1c1e0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp16_inv.hpp.inc @@ -0,0 +1,5973 @@ +#ifndef CUFFTDX_FFT_343_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_343_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1122, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<2571>; +.reg .b64 rd<6>; +mov.u32 r2551, %tid.y; +mov.u32 r2552, %14; +mad.lo.s32 r2553, r2551, 2744, r2552; +mov.u32 r2554, %tid.x; +mov.f32 f90, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 
high, f90; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f78, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r5, {low, high}; +} +mov.f32 f80, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f86, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r9, {low, high}; +} +mov.f32 f88, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, 
r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ +add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} +{ +add.f16x2 r107, %17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ +add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ +add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} 
+{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ +add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ +add.f16x2 r284, r275, r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ +add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ 
+add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ +mul.f16x2 r461, r458, r3; +} +{ +sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ +add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ 
+mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ +mul.f16x2 r638, r635, r7; +} +{ +add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r2554, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r2555, rd3; +mul.lo.s32 r2556, r2555, 49; +sub.s32 r2557, r2554, r2556; +cvt.rn.f32.u32 f93, r2557; +mul.f32 f94, f93, 0f3C961050; +cos.approx.f32 f21, f94; +sin.approx.f32 f95, f94; +neg.f32 f22, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, r726, r728; +} +{ +mul.f16x2 
r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, 
r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +mad.lo.s32 r2558, r2555, 2744, r2553; +barrier.sync 0; +mad.lo.s32 r2559, r2557, 56, r2558; +st.shared.v2.f32 [r2559], {r32, r50}; +st.shared.v2.f32 [r2559+8], {r711, 
r720}; +st.shared.v2.f32 [r2559+16], {r748, r757}; +st.shared.v2.f32 [r2559+24], {r785, r794}; +st.shared.v2.f32 [r2559+32], {r822, r831}; +st.shared.v2.f32 [r2559+40], {r859, r868}; +st.shared.v2.f32 [r2559+48], {r896, r905}; +barrier.sync 0; +mad.lo.s32 r2560, r2557, -48, r2559; +ld.shared.u32 r946, [r2560]; +ld.shared.u32 r964, [r2560+4]; +ld.shared.u32 r943, [r2560+392]; +ld.shared.u32 r961, [r2560+396]; +ld.shared.u32 r949, [r2560+784]; +ld.shared.u32 r967, [r2560+788]; +ld.shared.u32 r955, [r2560+1176]; +ld.shared.u32 r973, [r2560+1180]; +ld.shared.u32 r956, [r2560+1568]; +ld.shared.u32 r974, [r2560+1572]; +ld.shared.u32 r950, [r2560+1960]; +ld.shared.u32 r968, [r2560+1964]; +ld.shared.u32 r944, [r2560+2352]; +ld.shared.u32 r962, [r2560+2356]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ +add.f16x2 r945, r946, 
r942; +} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 r957, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 r975, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ +add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; +} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ +add.f16x2 r1110, r1101, 
r1107; +} +{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ +mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 r1191, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ +add.f16x2 r1272, r1263, 
r1269; +} +{ +sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ +mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 r1407, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ +add.f16x2 r1434, r1425, 
r1431; +} +{ +sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ +mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 r1461, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 r1515, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 r1569, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ +add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ +add.f16x2 r1596, r1587, 
r1593; +} +{ +sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ +mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 r1623, r1596, r1620; +} +mul.wide.u32 rd4, r2557, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r2561, rd5; +sub.s32 r2562, r2557, r2561; +shr.u32 r2563, r2562, 1; +add.s32 r2564, r2563, r2561; +shr.u32 r2565, r2564, 2; +cvt.rn.f32.u32 f96, r2565; +mul.f32 f97, f96, 0f3E034E46; +cos.approx.f32 f57, f97; +sin.approx.f32 f98, f97; +neg.f32 f58, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1626, {low, high}; +} +mul.lo.s32 r2566, r2565, 7; +sub.s32 r2567, r2557, r2566; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1631, {high, high}; +} +{ +mul.f16x2 r1633, r1353, r1631; +} +{ +fma.rn.f16x2 r1636, r1029, r1629, r1633; +} +{ +mul.f16x2 r1640, r1029, r1631; +} +{ +neg.f16x2 r1643, r1640; +} +{ +fma.rn.f16x2 r1645, r1353, r1629, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1651, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1653, {low, high}; +} +{ +mul.f16x2 r1654, r1651, r1653; +} +{ +mul.f16x2 r1657, r1626, r1649; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1660, {high, low}; +} +{ +fma.rn.f16x2 r1662, r1654, r1660, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1668, {high, high}; +} +{ +mul.f16x2 r1670, r1461, r1668; +} +{ +fma.rn.f16x2 r1673, r1137, r1666, r1670; +} +{ +mul.f16x2 r1677, r1137, r1668; 
+} +{ +neg.f16x2 r1680, r1677; +} +{ +fma.rn.f16x2 r1682, r1461, r1666, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1688, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1688, r1690; +} +{ +mul.f16x2 r1694, r1662, r1686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1697, {high, low}; +} +{ +fma.rn.f16x2 r1699, r1691, r1697, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1705, {high, high}; +} +{ +mul.f16x2 r1707, r1569, r1705; +} +{ +fma.rn.f16x2 r1710, r1245, r1703, r1707; +} +{ +mul.f16x2 r1714, r1245, r1705; +} +{ +neg.f16x2 r1717, r1714; +} +{ +fma.rn.f16x2 r1719, r1569, r1703, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1725, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1725, r1727; +} +{ +mul.f16x2 r1731, r1699, r1723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1734, {high, low}; +} +{ +fma.rn.f16x2 r1736, r1728, r1734, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1742, {high, high}; +} +{ +mul.f16x2 r1744, r1623, r1742; +} +{ +fma.rn.f16x2 r1747, r1299, r1740, r1744; +} +{ +mul.f16x2 r1751, r1299, r1742; +} +{ +neg.f16x2 r1754, r1751; +} +{ +fma.rn.f16x2 r1756, r1623, r1740, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1762, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1764, {low, high}; +} +{ +mul.f16x2 r1765, r1762, r1764; +} +{ +mul.f16x2 r1768, r1736, r1760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1771, {high, low}; +} +{ +fma.rn.f16x2 r1773, r1765, r1771, r1768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1779, {high, high}; +} +{ +mul.f16x2 r1781, r1515, r1779; +} +{ +fma.rn.f16x2 r1784, r1191, r1777, r1781; +} +{ +mul.f16x2 r1788, r1191, r1779; +} +{ +neg.f16x2 r1791, r1788; +} +{ +fma.rn.f16x2 r1793, r1515, r1777, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1801, {low, high}; +} +{ +mul.f16x2 r1802, r1799, r1801; +} +{ +mul.f16x2 r1805, r1773, r1797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1808, {high, low}; +} +{ +fma.rn.f16x2 r1810, r1802, r1808, r1805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1816, {high, high}; +} +{ +mul.f16x2 r1818, r1407, r1816; +} +{ +fma.rn.f16x2 r1821, r1083, r1814, r1818; +} +{ +mul.f16x2 r1825, r1083, r1816; +} +{ +neg.f16x2 r1828, r1825; +} +{ +fma.rn.f16x2 r1830, r1407, r1814, r1828; +} +shl.b32 r2568, r2567, 3; +add.s32 r2569, r2558, r2568; +barrier.sync 0; +mad.lo.s32 r2570, r2565, 392, r2569; +st.shared.u32 [r2570], r957; +st.shared.u32 [r2570+4], r975; +st.shared.u32 [r2570+56], r1636; +st.shared.u32 [r2570+60], r1645; +st.shared.u32 [r2570+112], r1673; +st.shared.u32 [r2570+116], r1682; +st.shared.u32 [r2570+168], r1710; +st.shared.u32 [r2570+172], r1719; +st.shared.u32 [r2570+224], r1747; +st.shared.u32 [r2570+228], 
r1756; +st.shared.u32 [r2570+280], r1784; +st.shared.u32 [r2570+284], r1793; +st.shared.u32 [r2570+336], r1821; +st.shared.u32 [r2570+340], r1830; +barrier.sync 0; +ld.shared.u32 r1871, [r2560]; +ld.shared.u32 r1889, [r2560+4]; +ld.shared.u32 r1868, [r2560+392]; +ld.shared.u32 r1886, [r2560+396]; +ld.shared.u32 r1874, [r2560+784]; +ld.shared.u32 r1892, [r2560+788]; +ld.shared.u32 r1880, [r2560+1176]; +ld.shared.u32 r1898, [r2560+1180]; +ld.shared.u32 r1881, [r2560+1568]; +ld.shared.u32 r1899, [r2560+1572]; +ld.shared.u32 r1875, [r2560+1960]; +ld.shared.u32 r1893, [r2560+1964]; +ld.shared.u32 r1869, [r2560+2352]; +ld.shared.u32 r1887, [r2560+2356]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1852, {low, high}; +} +{ +neg.f16x2 r1853, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1856, {low, high}; +} +{ +neg.f16x2 r1857, r1856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1859, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1860, {low, high}; +} +{ +neg.f16x2 r1861, r1860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1864, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1866, {low, high}; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1871, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 r1876, r1870, r1873; 
+} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 %0, r1876, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1898, r1899; +} +{ +add.f16x2 %1, r1894, r1897; +} +{ +add.f16x2 r1903, r1868, r1869; +} +{ +mul.f16x2 r1906, r1903, r1851; +} +{ +add.f16x2 r1909, r1871, r1906; +} +{ +add.f16x2 r1912, r1874, r1875; +} +{ +mul.f16x2 r1915, r1912, r1855; +} +{ +add.f16x2 r1918, r1909, r1915; +} +{ +add.f16x2 r1921, r1880, r1881; +} +{ +mul.f16x2 r1924, r1921, r1859; +} +{ +add.f16x2 r1927, r1918, r1924; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1853; +} +{ +sub.f16x2 r1936, r1892, r1893; +} +{ +mul.f16x2 r1939, r1936, r1857; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1898, r1899; +} +{ +mul.f16x2 r1948, r1945, r1861; +} +{ +add.f16x2 r1951, r1942, r1948; +} +{ +sub.f16x2 %2, r1927, r1951; +} +{ +add.f16x2 r1957, r1868, r1869; +} +{ +mul.f16x2 r1960, r1957, r1851; +} +{ +add.f16x2 r1963, r1871, r1960; +} +{ +add.f16x2 r1966, r1874, r1875; +} +{ +mul.f16x2 r1969, r1966, r1855; +} +{ +add.f16x2 r1972, r1963, r1969; +} +{ +add.f16x2 r1975, r1880, r1881; +} +{ +mul.f16x2 r1978, r1975, r1859; +} +{ +add.f16x2 r1981, r1972, r1978; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1853; +} +{ +sub.f16x2 r1990, r1892, r1893; +} +{ +mul.f16x2 r1993, r1990, r1857; +} +{ +add.f16x2 r1996, r1987, r1993; +} +{ +sub.f16x2 r1999, r1898, r1899; +} +{ +mul.f16x2 r2002, r1999, r1861; +} +{ +add.f16x2 r2005, r1996, r2002; +} +{ +add.f16x2 %12, r1981, r2005; +} +{ +add.f16x2 r2011, r1868, r1869; +} +{ +mul.f16x2 r2014, r2011, r1855; +} +{ +add.f16x2 r2017, r1871, r2014; +} +{ +add.f16x2 r2020, r1874, r1875; +} +{ +mul.f16x2 r2023, r2020, r1863; +} +{ +add.f16x2 r2026, r2017, r2023; +} +{ +add.f16x2 r2029, r1880, r1881; +} +{ +mul.f16x2 r2032, r2029, r1865; +} +{ +add.f16x2 r2035, r2026, 
r2032; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1857; +} +{ +sub.f16x2 r2044, r1892, r1893; +} +{ +mul.f16x2 r2047, r2044, r1864; +} +{ +add.f16x2 r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1898, r1899; +} +{ +mul.f16x2 r2056, r2053, r1866; +} +{ +add.f16x2 r2059, r2050, r2056; +} +{ +sub.f16x2 %4, r2035, r2059; +} +{ +add.f16x2 r2065, r1868, r1869; +} +{ +mul.f16x2 r2068, r2065, r1855; +} +{ +add.f16x2 r2071, r1871, r2068; +} +{ +add.f16x2 r2074, r1874, r1875; +} +{ +mul.f16x2 r2077, r2074, r1863; +} +{ +add.f16x2 r2080, r2071, r2077; +} +{ +add.f16x2 r2083, r1880, r1881; +} +{ +mul.f16x2 r2086, r2083, r1865; +} +{ +add.f16x2 r2089, r2080, r2086; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1857; +} +{ +sub.f16x2 r2098, r1892, r1893; +} +{ +mul.f16x2 r2101, r2098, r1864; +} +{ +add.f16x2 r2104, r2095, r2101; +} +{ +sub.f16x2 r2107, r1898, r1899; +} +{ +mul.f16x2 r2110, r2107, r1866; +} +{ +add.f16x2 r2113, r2104, r2110; +} +{ +add.f16x2 %10, r2089, r2113; +} +{ +add.f16x2 r2119, r1868, r1869; +} +{ +mul.f16x2 r2122, r2119, r1859; +} +{ +add.f16x2 r2125, r1871, r2122; +} +{ +add.f16x2 r2128, r1874, r1875; +} +{ +mul.f16x2 r2131, r2128, r1865; +} +{ +add.f16x2 r2134, r2125, r2131; +} +{ +add.f16x2 r2137, r1880, r1881; +} +{ +mul.f16x2 r2140, r2137, r1855; +} +{ +add.f16x2 r2143, r2134, r2140; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1861; +} +{ +sub.f16x2 r2152, r1892, r1893; +} +{ +mul.f16x2 r2155, r2152, r1866; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1898, r1899; +} +{ +mul.f16x2 r2164, r2161, r1857; +} +{ +add.f16x2 r2167, r2158, r2164; +} +{ +sub.f16x2 %6, r2143, r2167; +} +{ +add.f16x2 r2173, r1868, r1869; +} +{ +mul.f16x2 r2176, r2173, r1859; +} +{ +add.f16x2 r2179, r1871, r2176; +} +{ +add.f16x2 r2182, r1874, r1875; +} +{ +mul.f16x2 r2185, r2182, r1865; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r1880, r1881; +} +{ +mul.f16x2 r2194, 
r2191, r1855; +} +{ +add.f16x2 r2197, r2188, r2194; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1861; +} +{ +sub.f16x2 r2206, r1892, r1893; +} +{ +mul.f16x2 r2209, r2206, r1866; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +sub.f16x2 r2215, r1898, r1899; +} +{ +mul.f16x2 r2218, r2215, r1857; +} +{ +add.f16x2 r2221, r2212, r2218; +} +{ +add.f16x2 %8, r2197, r2221; +} +{ +add.f16x2 r2227, r1886, r1887; +} +{ +mul.f16x2 r2230, r2227, r1851; +} +{ +add.f16x2 r2233, r1889, r2230; +} +{ +add.f16x2 r2236, r1892, r1893; +} +{ +mul.f16x2 r2239, r2236, r1855; +} +{ +add.f16x2 r2242, r2233, r2239; +} +{ +add.f16x2 r2245, r1898, r1899; +} +{ +mul.f16x2 r2248, r2245, r1859; +} +{ +add.f16x2 r2251, r2242, r2248; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1853; +} +{ +sub.f16x2 r2260, r1874, r1875; +} +{ +mul.f16x2 r2263, r2260, r1857; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +sub.f16x2 r2269, r1880, r1881; +} +{ +mul.f16x2 r2272, r2269, r1861; +} +{ +add.f16x2 r2275, r2266, r2272; +} +{ +add.f16x2 %3, r2251, r2275; +} +{ +add.f16x2 r2281, r1886, r1887; +} +{ +mul.f16x2 r2284, r2281, r1851; +} +{ +add.f16x2 r2287, r1889, r2284; +} +{ +add.f16x2 r2290, r1892, r1893; +} +{ +mul.f16x2 r2293, r2290, r1855; +} +{ +add.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r1898, r1899; +} +{ +mul.f16x2 r2302, r2299, r1859; +} +{ +add.f16x2 r2305, r2296, r2302; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1853; +} +{ +sub.f16x2 r2314, r1874, r1875; +} +{ +mul.f16x2 r2317, r2314, r1857; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r1880, r1881; +} +{ +mul.f16x2 r2326, r2323, r1861; +} +{ +add.f16x2 r2329, r2320, r2326; +} +{ +sub.f16x2 %13, r2305, r2329; +} +{ +add.f16x2 r2335, r1886, r1887; +} +{ +mul.f16x2 r2338, r2335, r1855; +} +{ +add.f16x2 r2341, r1889, r2338; +} +{ +add.f16x2 r2344, r1892, r1893; +} +{ +mul.f16x2 r2347, r2344, r1863; +} +{ +add.f16x2 r2350, r2341, r2347; +} +{ +add.f16x2 
r2353, r1898, r1899; +} +{ +mul.f16x2 r2356, r2353, r1865; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1857; +} +{ +sub.f16x2 r2368, r1874, r1875; +} +{ +mul.f16x2 r2371, r2368, r1864; +} +{ +add.f16x2 r2374, r2365, r2371; +} +{ +sub.f16x2 r2377, r1880, r1881; +} +{ +mul.f16x2 r2380, r2377, r1866; +} +{ +add.f16x2 r2383, r2374, r2380; +} +{ +add.f16x2 %5, r2359, r2383; +} +{ +add.f16x2 r2389, r1886, r1887; +} +{ +mul.f16x2 r2392, r2389, r1855; +} +{ +add.f16x2 r2395, r1889, r2392; +} +{ +add.f16x2 r2398, r1892, r1893; +} +{ +mul.f16x2 r2401, r2398, r1863; +} +{ +add.f16x2 r2404, r2395, r2401; +} +{ +add.f16x2 r2407, r1898, r1899; +} +{ +mul.f16x2 r2410, r2407, r1865; +} +{ +add.f16x2 r2413, r2404, r2410; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1857; +} +{ +sub.f16x2 r2422, r1874, r1875; +} +{ +mul.f16x2 r2425, r2422, r1864; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r1880, r1881; +} +{ +mul.f16x2 r2434, r2431, r1866; +} +{ +add.f16x2 r2437, r2428, r2434; +} +{ +sub.f16x2 %11, r2413, r2437; +} +{ +add.f16x2 r2443, r1886, r1887; +} +{ +mul.f16x2 r2446, r2443, r1859; +} +{ +add.f16x2 r2449, r1889, r2446; +} +{ +add.f16x2 r2452, r1892, r1893; +} +{ +mul.f16x2 r2455, r2452, r1865; +} +{ +add.f16x2 r2458, r2449, r2455; +} +{ +add.f16x2 r2461, r1898, r1899; +} +{ +mul.f16x2 r2464, r2461, r1855; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1861; +} +{ +sub.f16x2 r2476, r1874, r1875; +} +{ +mul.f16x2 r2479, r2476, r1866; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +sub.f16x2 r2485, r1880, r1881; +} +{ +mul.f16x2 r2488, r2485, r1857; +} +{ +add.f16x2 r2491, r2482, r2488; +} +{ +add.f16x2 %7, r2467, r2491; +} +{ +add.f16x2 r2497, r1886, r1887; +} +{ +mul.f16x2 r2500, r2497, r1859; +} +{ +add.f16x2 r2503, r1889, r2500; +} +{ +add.f16x2 r2506, r1892, r1893; +} +{ +mul.f16x2 r2509, r2506, r1865; +} +{ 
+add.f16x2 r2512, r2503, r2509; +} +{ +add.f16x2 r2515, r1898, r1899; +} +{ +mul.f16x2 r2518, r2515, r1855; +} +{ +add.f16x2 r2521, r2512, r2518; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1861; +} +{ +sub.f16x2 r2530, r1874, r1875; +} +{ +mul.f16x2 r2533, r2530, r1866; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r1880, r1881; +} +{ +mul.f16x2 r2542, r2539, r1857; +} +{ +add.f16x2 r2545, r2536, r2542; +} +{ +sub.f16x2 %9, r2521, r2545; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1123, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<99>; +.reg .b32 r<2571>; +.reg .b64 rd<6>; +mov.u32 r2551, %tid.y; +mov.u32 r2552, %14; +mad.lo.s32 r2553, r2551, 1372, r2552; +mov.u32 r2554, %tid.x; +mov.f32 f90, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1, {low, high}; +} +mov.f32 f92, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r2, {low, high}; +} 
+{ +neg.f16x2 r3, r2; +} +mov.f32 f78, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r5, {low, high}; +} +mov.f32 f80, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f86, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r9, {low, high}; +} +mov.f32 f88, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ +add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} 
+{ +add.f16x2 r107, %17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ +add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ +add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} +{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ +add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ 
+add.f16x2 r284, r275, r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ +add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ +add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ 
+mul.f16x2 r461, r458, r3; +} +{ +sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ +add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ +mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ 
+mul.f16x2 r638, r635, r7; +} +{ +add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r2554, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r2555, rd3; +mul.lo.s32 r2556, r2555, 49; +sub.s32 r2557, r2554, r2556; +cvt.rn.f32.u32 f93, r2557; +mul.f32 f94, f93, 0f3C961050; +cos.approx.f32 f21, f94; +sin.approx.f32 f95, f94; +neg.f32 f22, f95; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, r726, r728; +} +{ +mul.f16x2 r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +mad.lo.s32 r2558, r2555, 1372, r2553; +barrier.sync 0; +mad.lo.s32 r2559, r2557, 28, r2558; +st.shared.u32 [r2559], r32; +st.shared.u32 [r2559+4], r711; +st.shared.u32 [r2559+8], r748; +st.shared.u32 [r2559+12], r785; +st.shared.u32 [r2559+16], r822; +st.shared.u32 [r2559+20], r859; +st.shared.u32 [r2559+24], r896; +barrier.sync 0; +mad.lo.s32 
r2560, r2557, -24, r2559; +ld.shared.u32 r946, [r2560]; +ld.shared.u32 r943, [r2560+196]; +ld.shared.u32 r949, [r2560+392]; +ld.shared.u32 r955, [r2560+588]; +ld.shared.u32 r956, [r2560+784]; +ld.shared.u32 r950, [r2560+980]; +ld.shared.u32 r944, [r2560+1176]; +barrier.sync 0; +st.shared.u32 [r2559], r50; +st.shared.u32 [r2559+4], r720; +st.shared.u32 [r2559+8], r757; +st.shared.u32 [r2559+12], r794; +st.shared.u32 [r2559+16], r831; +st.shared.u32 [r2559+20], r868; +st.shared.u32 [r2559+24], r905; +barrier.sync 0; +ld.shared.u32 r964, [r2560]; +ld.shared.u32 r961, [r2560+196]; +ld.shared.u32 r967, [r2560+392]; +ld.shared.u32 r973, [r2560+588]; +ld.shared.u32 r974, [r2560+784]; +ld.shared.u32 r968, [r2560+980]; +ld.shared.u32 r962, [r2560+1176]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ +add.f16x2 r945, r946, r942; 
+} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 r957, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 r975, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 r1083, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ +add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; +} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ +add.f16x2 r1110, r1101, r1107; +} 
+{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ +mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 r1191, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ +add.f16x2 r1272, r1263, r1269; +} +{ 
+sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ +mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 r1299, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 r1353, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 r1407, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ +add.f16x2 r1434, r1425, r1431; +} +{ 
+sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ +mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 r1461, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 r1515, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 r1569, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ +add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ 
+sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ +mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 r1623, r1596, r1620; +} +mul.wide.u32 rd4, r2557, 613566757; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r2561, rd5; +sub.s32 r2562, r2557, r2561; +shr.u32 r2563, r2562, 1; +add.s32 r2564, r2563, r2561; +shr.u32 r2565, r2564, 2; +cvt.rn.f32.u32 f96, r2565; +mul.f32 f97, f96, 0f3E034E46; +cos.approx.f32 f57, f97; +sin.approx.f32 f98, f97; +neg.f32 f58, f98; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r1626, {low, high}; +} +mul.lo.s32 r2566, r2565, 7; +sub.s32 r2567, r2557, r2566; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1629, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1631, {high, high}; +} +{ +mul.f16x2 r1633, r1353, r1631; +} +{ +fma.rn.f16x2 r1636, r1029, r1629, r1633; +} +{ +mul.f16x2 r1640, r1029, r1631; +} +{ +neg.f16x2 r1643, r1640; +} +{ +fma.rn.f16x2 r1645, r1353, r1629, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1651, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1653, {low, high}; +} +{ +mul.f16x2 r1654, r1651, r1653; +} +{ +mul.f16x2 r1657, r1626, r1649; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1660, {high, low}; +} +{ +fma.rn.f16x2 r1662, r1654, r1660, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1666, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1668, {high, high}; +} +{ +mul.f16x2 r1670, r1461, r1668; +} +{ +fma.rn.f16x2 r1673, r1137, r1666, r1670; +} +{ +mul.f16x2 r1677, r1137, r1668; +} +{ 
+neg.f16x2 r1680, r1677; +} +{ +fma.rn.f16x2 r1682, r1461, r1666, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1688, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1690, {low, high}; +} +{ +mul.f16x2 r1691, r1688, r1690; +} +{ +mul.f16x2 r1694, r1662, r1686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1662; +mov.b32 r1697, {high, low}; +} +{ +fma.rn.f16x2 r1699, r1691, r1697, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1703, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1705, {high, high}; +} +{ +mul.f16x2 r1707, r1569, r1705; +} +{ +fma.rn.f16x2 r1710, r1245, r1703, r1707; +} +{ +mul.f16x2 r1714, r1245, r1705; +} +{ +neg.f16x2 r1717, r1714; +} +{ +fma.rn.f16x2 r1719, r1569, r1703, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1725, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1727, {low, high}; +} +{ +mul.f16x2 r1728, r1725, r1727; +} +{ +mul.f16x2 r1731, r1699, r1723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1699; +mov.b32 r1734, {high, low}; +} +{ +fma.rn.f16x2 r1736, r1728, r1734, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1742, {high, high}; +} +{ +mul.f16x2 r1744, r1623, r1742; +} +{ +fma.rn.f16x2 r1747, r1299, r1740, r1744; +} +{ +mul.f16x2 r1751, r1299, r1742; +} +{ +neg.f16x2 r1754, r1751; +} +{ +fma.rn.f16x2 r1756, r1623, r1740, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1762, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1764, {low, high}; +} +{ +mul.f16x2 r1765, r1762, r1764; +} +{ +mul.f16x2 r1768, r1736, r1760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1736; +mov.b32 r1771, {high, low}; +} +{ +fma.rn.f16x2 r1773, r1765, r1771, r1768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1779, {high, high}; +} +{ +mul.f16x2 r1781, r1515, r1779; +} +{ +fma.rn.f16x2 r1784, r1191, r1777, r1781; +} +{ +mul.f16x2 r1788, r1191, r1779; +} +{ +neg.f16x2 r1791, r1788; +} +{ +fma.rn.f16x2 r1793, r1515, r1777, r1791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1801, {low, high}; +} +{ +mul.f16x2 r1802, r1799, r1801; +} +{ +mul.f16x2 r1805, r1773, r1797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1773; +mov.b32 r1808, {high, low}; +} +{ +fma.rn.f16x2 r1810, r1802, r1808, r1805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1810; +mov.b32 r1816, {high, high}; +} +{ +mul.f16x2 r1818, r1407, r1816; +} +{ +fma.rn.f16x2 r1821, r1083, r1814, r1818; +} +{ +mul.f16x2 r1825, r1083, r1816; +} +{ +neg.f16x2 r1828, r1825; +} +{ +fma.rn.f16x2 r1830, r1407, r1814, r1828; +} +shl.b32 r2568, r2567, 2; +add.s32 r2569, r2558, r2568; +barrier.sync 0; +mad.lo.s32 r2570, r2565, 196, r2569; +st.shared.u32 [r2570], r957; +st.shared.u32 [r2570+28], r1636; +st.shared.u32 [r2570+56], r1673; +st.shared.u32 [r2570+84], r1710; +st.shared.u32 [r2570+112], r1747; +st.shared.u32 [r2570+140], r1784; +st.shared.u32 [r2570+168], r1821; +barrier.sync 0; +ld.shared.u32 r1871, [r2560]; +ld.shared.u32 r1868, [r2560+196]; +ld.shared.u32 
r1874, [r2560+392]; +ld.shared.u32 r1880, [r2560+588]; +ld.shared.u32 r1881, [r2560+784]; +ld.shared.u32 r1875, [r2560+980]; +ld.shared.u32 r1869, [r2560+1176]; +barrier.sync 0; +st.shared.u32 [r2570], r975; +st.shared.u32 [r2570+28], r1645; +st.shared.u32 [r2570+56], r1682; +st.shared.u32 [r2570+84], r1719; +st.shared.u32 [r2570+112], r1756; +st.shared.u32 [r2570+140], r1793; +st.shared.u32 [r2570+168], r1830; +barrier.sync 0; +ld.shared.u32 r1889, [r2560]; +ld.shared.u32 r1886, [r2560+196]; +ld.shared.u32 r1892, [r2560+392]; +ld.shared.u32 r1898, [r2560+588]; +ld.shared.u32 r1899, [r2560+784]; +ld.shared.u32 r1893, [r2560+980]; +ld.shared.u32 r1887, [r2560+1176]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1851, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1852, {low, high}; +} +{ +neg.f16x2 r1853, r1852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1856, {low, high}; +} +{ +neg.f16x2 r1857, r1856; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1859, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1860, {low, high}; +} +{ +neg.f16x2 r1861, r1860; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f86; +cvt.rn.f16.f32 high, f86; +mov.b32 r1863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f88; +cvt.rn.f16.f32 high, f88; +mov.b32 r1864, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1866, {low, high}; +} +{ +add.f16x2 r1867, r1868, r1869; +} +{ +add.f16x2 r1870, r1871, r1867; +} +{ +add.f16x2 r1873, r1874, r1875; +} +{ +add.f16x2 
r1876, r1870, r1873; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 %0, r1876, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1892, r1893; +} +{ +add.f16x2 r1894, r1888, r1891; +} +{ +add.f16x2 r1897, r1898, r1899; +} +{ +add.f16x2 %1, r1894, r1897; +} +{ +add.f16x2 r1903, r1868, r1869; +} +{ +mul.f16x2 r1906, r1903, r1851; +} +{ +add.f16x2 r1909, r1871, r1906; +} +{ +add.f16x2 r1912, r1874, r1875; +} +{ +mul.f16x2 r1915, r1912, r1855; +} +{ +add.f16x2 r1918, r1909, r1915; +} +{ +add.f16x2 r1921, r1880, r1881; +} +{ +mul.f16x2 r1924, r1921, r1859; +} +{ +add.f16x2 r1927, r1918, r1924; +} +{ +sub.f16x2 r1930, r1886, r1887; +} +{ +mul.f16x2 r1933, r1930, r1853; +} +{ +sub.f16x2 r1936, r1892, r1893; +} +{ +mul.f16x2 r1939, r1936, r1857; +} +{ +add.f16x2 r1942, r1933, r1939; +} +{ +sub.f16x2 r1945, r1898, r1899; +} +{ +mul.f16x2 r1948, r1945, r1861; +} +{ +add.f16x2 r1951, r1942, r1948; +} +{ +sub.f16x2 %2, r1927, r1951; +} +{ +add.f16x2 r1957, r1868, r1869; +} +{ +mul.f16x2 r1960, r1957, r1851; +} +{ +add.f16x2 r1963, r1871, r1960; +} +{ +add.f16x2 r1966, r1874, r1875; +} +{ +mul.f16x2 r1969, r1966, r1855; +} +{ +add.f16x2 r1972, r1963, r1969; +} +{ +add.f16x2 r1975, r1880, r1881; +} +{ +mul.f16x2 r1978, r1975, r1859; +} +{ +add.f16x2 r1981, r1972, r1978; +} +{ +sub.f16x2 r1984, r1886, r1887; +} +{ +mul.f16x2 r1987, r1984, r1853; +} +{ +sub.f16x2 r1990, r1892, r1893; +} +{ +mul.f16x2 r1993, r1990, r1857; +} +{ +add.f16x2 r1996, r1987, r1993; +} +{ +sub.f16x2 r1999, r1898, r1899; +} +{ +mul.f16x2 r2002, r1999, r1861; +} +{ +add.f16x2 r2005, r1996, r2002; +} +{ +add.f16x2 %12, r1981, r2005; +} +{ +add.f16x2 r2011, r1868, r1869; +} +{ +mul.f16x2 r2014, r2011, r1855; +} +{ +add.f16x2 r2017, r1871, r2014; +} +{ +add.f16x2 r2020, r1874, r1875; +} +{ +mul.f16x2 r2023, r2020, r1863; +} +{ +add.f16x2 r2026, r2017, r2023; +} +{ +add.f16x2 r2029, r1880, r1881; +} +{ +mul.f16x2 r2032, r2029, r1865; +} +{ 
+add.f16x2 r2035, r2026, r2032; +} +{ +sub.f16x2 r2038, r1886, r1887; +} +{ +mul.f16x2 r2041, r2038, r1857; +} +{ +sub.f16x2 r2044, r1892, r1893; +} +{ +mul.f16x2 r2047, r2044, r1864; +} +{ +add.f16x2 r2050, r2041, r2047; +} +{ +sub.f16x2 r2053, r1898, r1899; +} +{ +mul.f16x2 r2056, r2053, r1866; +} +{ +add.f16x2 r2059, r2050, r2056; +} +{ +sub.f16x2 %4, r2035, r2059; +} +{ +add.f16x2 r2065, r1868, r1869; +} +{ +mul.f16x2 r2068, r2065, r1855; +} +{ +add.f16x2 r2071, r1871, r2068; +} +{ +add.f16x2 r2074, r1874, r1875; +} +{ +mul.f16x2 r2077, r2074, r1863; +} +{ +add.f16x2 r2080, r2071, r2077; +} +{ +add.f16x2 r2083, r1880, r1881; +} +{ +mul.f16x2 r2086, r2083, r1865; +} +{ +add.f16x2 r2089, r2080, r2086; +} +{ +sub.f16x2 r2092, r1886, r1887; +} +{ +mul.f16x2 r2095, r2092, r1857; +} +{ +sub.f16x2 r2098, r1892, r1893; +} +{ +mul.f16x2 r2101, r2098, r1864; +} +{ +add.f16x2 r2104, r2095, r2101; +} +{ +sub.f16x2 r2107, r1898, r1899; +} +{ +mul.f16x2 r2110, r2107, r1866; +} +{ +add.f16x2 r2113, r2104, r2110; +} +{ +add.f16x2 %10, r2089, r2113; +} +{ +add.f16x2 r2119, r1868, r1869; +} +{ +mul.f16x2 r2122, r2119, r1859; +} +{ +add.f16x2 r2125, r1871, r2122; +} +{ +add.f16x2 r2128, r1874, r1875; +} +{ +mul.f16x2 r2131, r2128, r1865; +} +{ +add.f16x2 r2134, r2125, r2131; +} +{ +add.f16x2 r2137, r1880, r1881; +} +{ +mul.f16x2 r2140, r2137, r1855; +} +{ +add.f16x2 r2143, r2134, r2140; +} +{ +sub.f16x2 r2146, r1886, r1887; +} +{ +mul.f16x2 r2149, r2146, r1861; +} +{ +sub.f16x2 r2152, r1892, r1893; +} +{ +mul.f16x2 r2155, r2152, r1866; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +sub.f16x2 r2161, r1898, r1899; +} +{ +mul.f16x2 r2164, r2161, r1857; +} +{ +add.f16x2 r2167, r2158, r2164; +} +{ +sub.f16x2 %6, r2143, r2167; +} +{ +add.f16x2 r2173, r1868, r1869; +} +{ +mul.f16x2 r2176, r2173, r1859; +} +{ +add.f16x2 r2179, r1871, r2176; +} +{ +add.f16x2 r2182, r1874, r1875; +} +{ +mul.f16x2 r2185, r2182, r1865; +} +{ +add.f16x2 r2188, r2179, r2185; +} +{ +add.f16x2 r2191, r1880, r1881; 
+} +{ +mul.f16x2 r2194, r2191, r1855; +} +{ +add.f16x2 r2197, r2188, r2194; +} +{ +sub.f16x2 r2200, r1886, r1887; +} +{ +mul.f16x2 r2203, r2200, r1861; +} +{ +sub.f16x2 r2206, r1892, r1893; +} +{ +mul.f16x2 r2209, r2206, r1866; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +sub.f16x2 r2215, r1898, r1899; +} +{ +mul.f16x2 r2218, r2215, r1857; +} +{ +add.f16x2 r2221, r2212, r2218; +} +{ +add.f16x2 %8, r2197, r2221; +} +{ +add.f16x2 r2227, r1886, r1887; +} +{ +mul.f16x2 r2230, r2227, r1851; +} +{ +add.f16x2 r2233, r1889, r2230; +} +{ +add.f16x2 r2236, r1892, r1893; +} +{ +mul.f16x2 r2239, r2236, r1855; +} +{ +add.f16x2 r2242, r2233, r2239; +} +{ +add.f16x2 r2245, r1898, r1899; +} +{ +mul.f16x2 r2248, r2245, r1859; +} +{ +add.f16x2 r2251, r2242, r2248; +} +{ +sub.f16x2 r2254, r1868, r1869; +} +{ +mul.f16x2 r2257, r2254, r1853; +} +{ +sub.f16x2 r2260, r1874, r1875; +} +{ +mul.f16x2 r2263, r2260, r1857; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +sub.f16x2 r2269, r1880, r1881; +} +{ +mul.f16x2 r2272, r2269, r1861; +} +{ +add.f16x2 r2275, r2266, r2272; +} +{ +add.f16x2 %3, r2251, r2275; +} +{ +add.f16x2 r2281, r1886, r1887; +} +{ +mul.f16x2 r2284, r2281, r1851; +} +{ +add.f16x2 r2287, r1889, r2284; +} +{ +add.f16x2 r2290, r1892, r1893; +} +{ +mul.f16x2 r2293, r2290, r1855; +} +{ +add.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r1898, r1899; +} +{ +mul.f16x2 r2302, r2299, r1859; +} +{ +add.f16x2 r2305, r2296, r2302; +} +{ +sub.f16x2 r2308, r1868, r1869; +} +{ +mul.f16x2 r2311, r2308, r1853; +} +{ +sub.f16x2 r2314, r1874, r1875; +} +{ +mul.f16x2 r2317, r2314, r1857; +} +{ +add.f16x2 r2320, r2311, r2317; +} +{ +sub.f16x2 r2323, r1880, r1881; +} +{ +mul.f16x2 r2326, r2323, r1861; +} +{ +add.f16x2 r2329, r2320, r2326; +} +{ +sub.f16x2 %13, r2305, r2329; +} +{ +add.f16x2 r2335, r1886, r1887; +} +{ +mul.f16x2 r2338, r2335, r1855; +} +{ +add.f16x2 r2341, r1889, r2338; +} +{ +add.f16x2 r2344, r1892, r1893; +} +{ +mul.f16x2 r2347, r2344, r1863; +} +{ +add.f16x2 r2350, r2341, 
r2347; +} +{ +add.f16x2 r2353, r1898, r1899; +} +{ +mul.f16x2 r2356, r2353, r1865; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r1868, r1869; +} +{ +mul.f16x2 r2365, r2362, r1857; +} +{ +sub.f16x2 r2368, r1874, r1875; +} +{ +mul.f16x2 r2371, r2368, r1864; +} +{ +add.f16x2 r2374, r2365, r2371; +} +{ +sub.f16x2 r2377, r1880, r1881; +} +{ +mul.f16x2 r2380, r2377, r1866; +} +{ +add.f16x2 r2383, r2374, r2380; +} +{ +add.f16x2 %5, r2359, r2383; +} +{ +add.f16x2 r2389, r1886, r1887; +} +{ +mul.f16x2 r2392, r2389, r1855; +} +{ +add.f16x2 r2395, r1889, r2392; +} +{ +add.f16x2 r2398, r1892, r1893; +} +{ +mul.f16x2 r2401, r2398, r1863; +} +{ +add.f16x2 r2404, r2395, r2401; +} +{ +add.f16x2 r2407, r1898, r1899; +} +{ +mul.f16x2 r2410, r2407, r1865; +} +{ +add.f16x2 r2413, r2404, r2410; +} +{ +sub.f16x2 r2416, r1868, r1869; +} +{ +mul.f16x2 r2419, r2416, r1857; +} +{ +sub.f16x2 r2422, r1874, r1875; +} +{ +mul.f16x2 r2425, r2422, r1864; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +sub.f16x2 r2431, r1880, r1881; +} +{ +mul.f16x2 r2434, r2431, r1866; +} +{ +add.f16x2 r2437, r2428, r2434; +} +{ +sub.f16x2 %11, r2413, r2437; +} +{ +add.f16x2 r2443, r1886, r1887; +} +{ +mul.f16x2 r2446, r2443, r1859; +} +{ +add.f16x2 r2449, r1889, r2446; +} +{ +add.f16x2 r2452, r1892, r1893; +} +{ +mul.f16x2 r2455, r2452, r1865; +} +{ +add.f16x2 r2458, r2449, r2455; +} +{ +add.f16x2 r2461, r1898, r1899; +} +{ +mul.f16x2 r2464, r2461, r1855; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r1868, r1869; +} +{ +mul.f16x2 r2473, r2470, r1861; +} +{ +sub.f16x2 r2476, r1874, r1875; +} +{ +mul.f16x2 r2479, r2476, r1866; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +sub.f16x2 r2485, r1880, r1881; +} +{ +mul.f16x2 r2488, r2485, r1857; +} +{ +add.f16x2 r2491, r2482, r2488; +} +{ +add.f16x2 %7, r2467, r2491; +} +{ +add.f16x2 r2497, r1886, r1887; +} +{ +mul.f16x2 r2500, r2497, r1859; +} +{ +add.f16x2 r2503, r1889, r2500; +} +{ +add.f16x2 r2506, r1892, r1893; +} +{ +mul.f16x2 r2509, 
r2506, r1865; +} +{ +add.f16x2 r2512, r2503, r2509; +} +{ +add.f16x2 r2515, r1898, r1899; +} +{ +mul.f16x2 r2518, r2515, r1855; +} +{ +add.f16x2 r2521, r2512, r2518; +} +{ +sub.f16x2 r2524, r1868, r1869; +} +{ +mul.f16x2 r2527, r2524, r1861; +} +{ +sub.f16x2 r2530, r1874, r1875; +} +{ +mul.f16x2 r2533, r2530, r1866; +} +{ +add.f16x2 r2536, r2527, r2533; +} +{ +sub.f16x2 r2539, r1880, r1881; +} +{ +mul.f16x2 r2542, r2539, r1857; +} +{ +add.f16x2 r2545, r2536, r2542; +} +{ +sub.f16x2 %9, r2521, r2545; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..bd43f6bececb3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_fwd.hpp.inc @@ -0,0 +1,938 @@ +#ifndef CUFFTDX_FFT_343_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_343_FP32_FWD_PTX_HPP + 
+ + +template<> __forceinline__ __device__ void cufftdx_private_function<174, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<467>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 2744, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %19, %33; +add.f32 f30, %17, f29; +add.f32 f31, %22, %30; +add.f32 f32, f31, f30; +add.f32 f33, %25, %27; +add.f32 f34, %21, %34; +add.f32 f35, %18, f34; +add.f32 f36, %24, %32; +add.f32 f37, f36, f35; +add.f32 f38, %26, %29; +fma.rn.f32 f39, f29, 0f3F1F9D07, %17; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %21, %34; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %24, %32; +mul.f32 f47, f46, 0fBF7994E0; +sub.f32 f48, f47, f45; +sub.f32 f49, %26, %29; +mul.f32 f50, f49, 0f3EDE2602; +sub.f32 f51, f48, f50; +sub.f32 f52, f43, f51; +add.f32 f53, f51, f43; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %17, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f44, 0f3F7994E0; +mul.f32 f60, f46, 0f3EDE2602; +sub.f32 f61, f60, f59; +fma.rn.f32 f62, f49, 0f3F48261C, f61; +sub.f32 f63, f58, f62; +add.f32 f64, f62, f58; +mul.f32 f65, f29, 0f3F66A5E5; +sub.f32 f66, %17, f65; +fma.rn.f32 f67, f31, 0f3F1F9D07, f66; +mul.f32 f68, f33, 0f3E63DC87; +sub.f32 f69, f67, f68; +mul.f32 f70, f44, 0f3EDE2602; +mul.f32 f71, f46, 0f3F48261C; +sub.f32 f72, f71, f70; +mul.f32 f73, f49, 0f3F7994E0; +sub.f32 f74, f72, f73; +sub.f32 f75, f69, f74; +add.f32 f76, f74, f69; +fma.rn.f32 f77, f34, 0f3F1F9D07, %18; +mul.f32 f78, f36, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f38, 0f3F66A5E5; +sub.f32 f81, f79, f80; +sub.f32 f82, %19, %33; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %22, %30; +mul.f32 f85, f84, 0fBF7994E0; +sub.f32 f86, f85, f83; +sub.f32 f87, %25, %27; +mul.f32 f88, f87, 0f3EDE2602; +sub.f32 f89, f86, f88; +add.f32 f90, f89, f81; 
+sub.f32 f91, f81, f89; +mul.f32 f92, f34, 0f3E63DC87; +sub.f32 f93, %18, f92; +mul.f32 f94, f36, 0f3F66A5E5; +sub.f32 f95, f93, f94; +fma.rn.f32 f96, f38, 0f3F1F9D07, f95; +mul.f32 f97, f82, 0f3F7994E0; +mul.f32 f98, f84, 0f3EDE2602; +sub.f32 f99, f98, f97; +fma.rn.f32 f100, f87, 0f3F48261C, f99; +add.f32 f101, f100, f96; +sub.f32 f102, f96, f100; +mul.f32 f103, f34, 0f3F66A5E5; +sub.f32 f104, %18, f103; +fma.rn.f32 f105, f36, 0f3F1F9D07, f104; +mul.f32 f106, f38, 0f3E63DC87; +sub.f32 f107, f105, f106; +mul.f32 f108, f82, 0f3EDE2602; +mul.f32 f109, f84, 0f3F48261C; +sub.f32 f110, f109, f108; +mul.f32 f111, f87, 0f3F7994E0; +sub.f32 f112, f110, f111; +add.f32 f113, f112, f107; +sub.f32 f114, f107, f112; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2744, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f115, f116}, [rd6]; +mul.f32 f119, f115, f52; +mul.f32 f120, f116, f90; +mul.f32 f121, f115, f90; +mul.f32 f122, f115, f115; +mul.f32 f123, f116, f116; +sub.f32 f124, f122, f123; +mul.f32 f125, f116, f115; +fma.rn.f32 f126, f116, f115, f125; +mul.f32 f127, f124, f63; +mul.f32 f128, f126, f101; +mul.f32 f129, f124, f101; +mul.f32 f130, f115, f124; +mul.f32 f131, f116, f126; +sub.f32 f132, f130, f131; +mul.f32 f133, f115, f126; +fma.rn.f32 f134, f116, f124, f133; +mul.f32 f135, f132, f75; +mul.f32 f136, f134, f113; +mul.f32 f137, f132, f113; +mul.f32 f138, f115, f132; +mul.f32 f139, f116, f134; +sub.f32 f140, f138, f139; +mul.f32 f141, f115, f134; +fma.rn.f32 f142, f116, f132, f141; +mul.f32 f143, f140, f76; +mul.f32 f144, f142, f114; +mul.f32 f145, f140, f114; +mul.f32 f146, f115, f140; +mul.f32 f147, f116, f142; +sub.f32 f148, f146, f147; +mul.f32 f149, f115, f142; +fma.rn.f32 f150, f116, f140, f149; +mul.f32 f151, f148, f64; +mul.f32 f152, f150, f102; +mul.f32 f153, f148, f102; +mul.f32 f154, f115, f148; +mul.f32 f155, f116, 
f150; +sub.f32 f156, f154, f155; +mul.f32 f157, f115, f150; +fma.rn.f32 f158, f116, f148, f157; +mul.f32 f159, f156, f53; +mul.f32 f160, f158, f91; +mul.f32 f161, f156, f91; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +add.f32 f162, f38, f37; +add.f32 f163, f33, f32; +st.shared.v2.f32 [r9], {f163, f162}; +fma.rn.f32 f164, f116, f52, f121; +sub.f32 f165, f119, f120; +st.shared.v2.f32 [r9+8], {f165, f164}; +fma.rn.f32 f166, f126, f63, f129; +sub.f32 f167, f127, f128; +st.shared.v2.f32 [r9+16], {f167, f166}; +sub.f32 f168, f135, f136; +fma.rn.f32 f169, f134, f75, f137; +st.shared.v2.f32 [r9+24], {f168, f169}; +fma.rn.f32 f170, f142, f76, f145; +sub.f32 f171, f143, f144; +st.shared.v2.f32 [r9+32], {f171, f170}; +fma.rn.f32 f172, f150, f64, f153; +sub.f32 f173, f151, f152; +st.shared.v2.f32 [r9+40], {f173, f172}; +fma.rn.f32 f174, f158, f53, f161; +sub.f32 f175, f159, f160; +st.shared.v2.f32 [r9+48], {f175, f174}; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.v2.f32 {f176, f177}, [r10]; +ld.shared.v2.f32 {f180, f181}, [r10+392]; +ld.shared.v2.f32 {f184, f185}, [r10+784]; +ld.shared.v2.f32 {f188, f189}, [r10+1176]; +ld.shared.v2.f32 {f192, f193}, [r10+1568]; +ld.shared.v2.f32 {f196, f197}, [r10+1960]; +ld.shared.v2.f32 {f200, f201}, [r10+2352]; +add.f32 f204, f180, f200; +add.f32 f205, f176, f204; +add.f32 f206, f184, f196; +add.f32 f207, f206, f205; +add.f32 f208, f188, f192; +add.f32 f209, f181, f201; +add.f32 f210, f177, f209; +add.f32 f211, f185, f197; +add.f32 f212, f211, f210; +add.f32 f213, f189, f193; +fma.rn.f32 f214, f204, 0f3F1F9D07, f176; +mul.f32 f215, f206, 0f3E63DC87; +sub.f32 f216, f214, f215; +mul.f32 f217, f208, 0f3F66A5E5; +sub.f32 f218, f216, f217; +sub.f32 f219, f181, f201; +mul.f32 f220, f219, 0f3F48261C; +sub.f32 f221, f185, f197; +mul.f32 f222, f221, 0fBF7994E0; +sub.f32 f223, f222, f220; +sub.f32 f224, f189, f193; +mul.f32 f225, f224, 0f3EDE2602; +sub.f32 f226, f223, f225; +sub.f32 f227, f218, f226; +add.f32 f228, f226, f218; 
+mul.f32 f229, f204, 0f3E63DC87; +sub.f32 f230, f176, f229; +mul.f32 f231, f206, 0f3F66A5E5; +sub.f32 f232, f230, f231; +fma.rn.f32 f233, f208, 0f3F1F9D07, f232; +mul.f32 f234, f219, 0f3F7994E0; +mul.f32 f235, f221, 0f3EDE2602; +sub.f32 f236, f235, f234; +fma.rn.f32 f237, f224, 0f3F48261C, f236; +sub.f32 f238, f233, f237; +add.f32 f239, f237, f233; +mul.f32 f240, f204, 0f3F66A5E5; +sub.f32 f241, f176, f240; +fma.rn.f32 f242, f206, 0f3F1F9D07, f241; +mul.f32 f243, f208, 0f3E63DC87; +sub.f32 f244, f242, f243; +mul.f32 f245, f219, 0f3EDE2602; +mul.f32 f246, f221, 0f3F48261C; +sub.f32 f247, f246, f245; +mul.f32 f248, f224, 0f3F7994E0; +sub.f32 f249, f247, f248; +sub.f32 f250, f244, f249; +add.f32 f251, f249, f244; +fma.rn.f32 f252, f209, 0f3F1F9D07, f177; +mul.f32 f253, f211, 0f3E63DC87; +sub.f32 f254, f252, f253; +mul.f32 f255, f213, 0f3F66A5E5; +sub.f32 f256, f254, f255; +sub.f32 f257, f180, f200; +mul.f32 f258, f257, 0f3F48261C; +sub.f32 f259, f184, f196; +mul.f32 f260, f259, 0fBF7994E0; +sub.f32 f261, f260, f258; +sub.f32 f262, f188, f192; +mul.f32 f263, f262, 0f3EDE2602; +sub.f32 f264, f261, f263; +add.f32 f265, f264, f256; +sub.f32 f266, f256, f264; +mul.f32 f267, f209, 0f3E63DC87; +sub.f32 f268, f177, f267; +mul.f32 f269, f211, 0f3F66A5E5; +sub.f32 f270, f268, f269; +fma.rn.f32 f271, f213, 0f3F1F9D07, f270; +mul.f32 f272, f257, 0f3F7994E0; +mul.f32 f273, f259, 0f3EDE2602; +sub.f32 f274, f273, f272; +fma.rn.f32 f275, f262, 0f3F48261C, f274; +add.f32 f276, f275, f271; +sub.f32 f277, f271, f275; +mul.f32 f278, f209, 0f3F66A5E5; +sub.f32 f279, f177, f278; +fma.rn.f32 f280, f211, 0f3F1F9D07, f279; +mul.f32 f281, f213, 0f3E63DC87; +sub.f32 f282, f280, f281; +mul.f32 f283, f257, 0f3EDE2602; +mul.f32 f284, f259, 0f3F48261C; +sub.f32 f285, f284, f283; +mul.f32 f286, f262, 0f3F7994E0; +sub.f32 f287, f285, f286; +add.f32 f288, f287, f282; +sub.f32 f289, f282, f287; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; 
+shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f290, f291}, [rd11]; +mul.f32 f294, f290, f227; +mul.f32 f295, f291, f265; +mul.f32 f296, f290, f265; +mul.f32 f297, f290, f290; +mul.f32 f298, f291, f291; +sub.f32 f299, f297, f298; +mul.f32 f300, f291, f290; +fma.rn.f32 f301, f291, f290, f300; +mul.f32 f302, f299, f238; +mul.f32 f303, f301, f276; +mul.f32 f304, f299, f276; +mul.f32 f305, f290, f299; +mul.f32 f306, f291, f301; +sub.f32 f307, f305, f306; +mul.f32 f308, f290, f301; +fma.rn.f32 f309, f291, f299, f308; +mul.f32 f310, f307, f250; +mul.f32 f311, f309, f288; +mul.f32 f312, f307, f288; +mul.f32 f313, f290, f307; +mul.f32 f314, f291, f309; +sub.f32 f315, f313, f314; +mul.f32 f316, f290, f309; +fma.rn.f32 f317, f291, f307, f316; +mul.f32 f318, f315, f251; +mul.f32 f319, f317, f289; +mul.f32 f320, f315, f289; +mul.f32 f321, f290, f315; +mul.f32 f322, f291, f317; +sub.f32 f323, f321, f322; +mul.f32 f324, f290, f317; +fma.rn.f32 f325, f291, f315, f324; +mul.f32 f326, f323, f239; +mul.f32 f327, f325, f277; +mul.f32 f328, f323, f277; +mul.f32 f329, f290, f323; +mul.f32 f330, f291, f325; +sub.f32 f331, f329, f330; +mul.f32 f332, f290, f325; +fma.rn.f32 f333, f291, f323, f332; +mul.f32 f334, f331, f228; +mul.f32 f335, f333, f266; +mul.f32 f336, f331, f266; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +add.f32 f337, f213, f212; +add.f32 f338, f208, f207; +st.shared.v2.f32 [r20], {f338, f337}; +fma.rn.f32 f339, f291, f227, f296; +sub.f32 f340, f294, f295; +st.shared.v2.f32 [r20+56], {f340, f339}; +fma.rn.f32 f341, f301, f238, f304; +sub.f32 f342, f302, f303; +st.shared.v2.f32 [r20+112], {f342, f341}; +fma.rn.f32 f343, f309, f250, f312; +sub.f32 f344, f310, f311; +st.shared.v2.f32 [r20+168], {f344, f343}; +sub.f32 f345, f318, f319; +fma.rn.f32 f346, f317, f251, 
f320; +st.shared.v2.f32 [r20+224], {f345, f346}; +fma.rn.f32 f347, f325, f239, f328; +sub.f32 f348, f326, f327; +st.shared.v2.f32 [r20+280], {f348, f347}; +fma.rn.f32 f349, f333, f228, f336; +sub.f32 f350, f334, f335; +st.shared.v2.f32 [r20+336], {f350, f349}; +barrier.sync 0; +ld.shared.v2.f32 {f351, f352}, [r10]; +ld.shared.v2.f32 {f355, f356}, [r10+392]; +ld.shared.v2.f32 {f359, f360}, [r10+784]; +ld.shared.v2.f32 {f363, f364}, [r10+1176]; +ld.shared.v2.f32 {f367, f368}, [r10+1568]; +ld.shared.v2.f32 {f371, f372}, [r10+1960]; +ld.shared.v2.f32 {f375, f376}, [r10+2352]; +add.f32 f379, f355, f375; +add.f32 f380, f351, f379; +add.f32 f381, f359, f371; +add.f32 f382, f381, f380; +add.f32 f383, f363, f367; +add.f32 f384, f356, f376; +add.f32 f385, f352, f384; +add.f32 f386, f360, f372; +add.f32 f387, f386, f385; +add.f32 f388, f364, f368; +fma.rn.f32 f389, f379, 0f3F1F9D07, f351; +mul.f32 f390, f381, 0f3E63DC87; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, 0f3F66A5E5; +sub.f32 f393, f391, f392; +sub.f32 f394, f356, f376; +mul.f32 f395, f394, 0f3F48261C; +sub.f32 f396, f360, f372; +mul.f32 f397, f396, 0fBF7994E0; +sub.f32 f398, f397, f395; +sub.f32 f399, f364, f368; +mul.f32 f400, f399, 0f3EDE2602; +sub.f32 f401, f398, f400; +mul.f32 f402, f379, 0f3E63DC87; +sub.f32 f403, f351, f402; +mul.f32 f404, f381, 0f3F66A5E5; +sub.f32 f405, f403, f404; +fma.rn.f32 f406, f383, 0f3F1F9D07, f405; +mul.f32 f407, f394, 0f3F7994E0; +mul.f32 f408, f396, 0f3EDE2602; +sub.f32 f409, f408, f407; +fma.rn.f32 f410, f399, 0f3F48261C, f409; +mul.f32 f411, f379, 0f3F66A5E5; +sub.f32 f412, f351, f411; +fma.rn.f32 f413, f381, 0f3F1F9D07, f412; +mul.f32 f414, f383, 0f3E63DC87; +sub.f32 f415, f413, f414; +mul.f32 f416, f394, 0f3EDE2602; +mul.f32 f417, f396, 0f3F48261C; +sub.f32 f418, f417, f416; +mul.f32 f419, f399, 0f3F7994E0; +sub.f32 f420, f418, f419; +fma.rn.f32 f421, f384, 0f3F1F9D07, f352; +mul.f32 f422, f386, 0f3E63DC87; +sub.f32 f423, f421, f422; +mul.f32 f424, f388, 0f3F66A5E5; 
+sub.f32 f425, f423, f424; +sub.f32 f426, f355, f375; +mul.f32 f427, f426, 0f3F48261C; +sub.f32 f428, f359, f371; +mul.f32 f429, f428, 0fBF7994E0; +sub.f32 f430, f429, f427; +sub.f32 f431, f363, f367; +mul.f32 f432, f431, 0f3EDE2602; +sub.f32 f433, f430, f432; +mul.f32 f434, f384, 0f3E63DC87; +sub.f32 f435, f352, f434; +mul.f32 f436, f386, 0f3F66A5E5; +sub.f32 f437, f435, f436; +fma.rn.f32 f438, f388, 0f3F1F9D07, f437; +mul.f32 f439, f426, 0f3F7994E0; +mul.f32 f440, f428, 0f3EDE2602; +sub.f32 f441, f440, f439; +fma.rn.f32 f442, f431, 0f3F48261C, f441; +mul.f32 f443, f384, 0f3F66A5E5; +sub.f32 f444, f352, f443; +fma.rn.f32 f445, f386, 0f3F1F9D07, f444; +mul.f32 f446, f388, 0f3E63DC87; +sub.f32 f447, f445, f446; +mul.f32 f448, f426, 0f3EDE2602; +mul.f32 f449, f428, 0f3F48261C; +sub.f32 f450, f449, f448; +mul.f32 f451, f431, 0f3F7994E0; +sub.f32 f452, f450, f451; +add.f32 %1, f388, f387; +add.f32 %0, f383, f382; +add.f32 %3, f433, f425; +sub.f32 %2, f393, f401; +add.f32 %5, f442, f438; +sub.f32 %4, f406, f410; +add.f32 %7, f452, f447; +sub.f32 %6, f415, f420; +sub.f32 %9, f447, f452; +add.f32 %8, f420, f415; +sub.f32 %11, f438, f442; +add.f32 %10, f410, f406; +sub.f32 %13, f425, f433; +add.f32 %12, f401, f393; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<175, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<439>; 
+.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 1372, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %19, %33; +add.f32 f30, %17, f29; +add.f32 f31, %22, %30; +add.f32 f32, f31, f30; +add.f32 f33, %25, %27; +add.f32 f34, f33, f32; +add.f32 f35, %21, %34; +add.f32 f36, %18, f35; +add.f32 f37, %24, %32; +add.f32 f38, f37, f36; +add.f32 f39, %26, %29; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %17; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %21, %34; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %24, %32; +mul.f32 f49, f48, 0fBF7994E0; +sub.f32 f50, f49, f47; +sub.f32 f51, %26, %29; +mul.f32 f52, f51, 0f3EDE2602; +sub.f32 f53, f50, f52; +sub.f32 f54, f45, f53; +add.f32 f55, f53, f45; +mul.f32 f56, f29, 0f3E63DC87; +sub.f32 f57, %17, f56; +mul.f32 f58, f31, 0f3F66A5E5; +sub.f32 f59, f57, f58; +fma.rn.f32 f60, f33, 0f3F1F9D07, f59; +mul.f32 f61, f46, 0f3F7994E0; +mul.f32 f62, f48, 0f3EDE2602; +sub.f32 f63, f62, f61; +fma.rn.f32 f64, f51, 0f3F48261C, f63; +sub.f32 f65, f60, f64; +add.f32 f66, f64, f60; +mul.f32 f67, f29, 0f3F66A5E5; +sub.f32 f68, %17, f67; +fma.rn.f32 f69, f31, 0f3F1F9D07, f68; +mul.f32 f70, f33, 0f3E63DC87; +sub.f32 f71, f69, f70; +mul.f32 f72, f46, 0f3EDE2602; +mul.f32 f73, f48, 0f3F48261C; +sub.f32 f74, f73, f72; +mul.f32 f75, f51, 0f3F7994E0; +sub.f32 f76, f74, f75; +sub.f32 f77, f71, f76; +add.f32 f78, f76, f71; +fma.rn.f32 f79, f35, 0f3F1F9D07, %18; +mul.f32 f80, f37, 0f3E63DC87; +sub.f32 f81, f79, f80; +mul.f32 f82, f39, 0f3F66A5E5; +sub.f32 f83, f81, f82; +sub.f32 f84, %19, %33; +mul.f32 f85, f84, 0f3F48261C; +sub.f32 f86, %22, %30; +mul.f32 f87, f86, 0fBF7994E0; +sub.f32 f88, f87, f85; +sub.f32 f89, %25, %27; +mul.f32 f90, f89, 0f3EDE2602; +sub.f32 f91, f88, f90; +add.f32 f92, f91, f83; +sub.f32 f93, f83, f91; +mul.f32 f94, f35, 0f3E63DC87; +sub.f32 f95, %18, f94; +mul.f32 f96, f37, 0f3F66A5E5; +sub.f32 f97, f95, 
f96; +fma.rn.f32 f98, f39, 0f3F1F9D07, f97; +mul.f32 f99, f84, 0f3F7994E0; +mul.f32 f100, f86, 0f3EDE2602; +sub.f32 f101, f100, f99; +fma.rn.f32 f102, f89, 0f3F48261C, f101; +add.f32 f103, f102, f98; +sub.f32 f104, f98, f102; +mul.f32 f105, f35, 0f3F66A5E5; +sub.f32 f106, %18, f105; +fma.rn.f32 f107, f37, 0f3F1F9D07, f106; +mul.f32 f108, f39, 0f3E63DC87; +sub.f32 f109, f107, f108; +mul.f32 f110, f84, 0f3EDE2602; +mul.f32 f111, f86, 0f3F48261C; +sub.f32 f112, f111, f110; +mul.f32 f113, f89, 0f3F7994E0; +sub.f32 f114, f112, f113; +add.f32 f115, f114, f109; +sub.f32 f116, f109, f114; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f117, f118}, [rd6]; +mul.f32 f121, f117, f54; +mul.f32 f122, f118, f92; +sub.f32 f123, f121, f122; +mul.f32 f124, f117, f92; +fma.rn.f32 f125, f118, f54, f124; +mul.f32 f126, f117, f117; +mul.f32 f127, f118, f118; +sub.f32 f128, f126, f127; +mul.f32 f129, f118, f117; +fma.rn.f32 f130, f118, f117, f129; +mul.f32 f131, f128, f65; +mul.f32 f132, f130, f103; +sub.f32 f133, f131, f132; +mul.f32 f134, f128, f103; +fma.rn.f32 f135, f130, f65, f134; +mul.f32 f136, f117, f128; +mul.f32 f137, f118, f130; +sub.f32 f138, f136, f137; +mul.f32 f139, f117, f130; +fma.rn.f32 f140, f118, f128, f139; +mul.f32 f141, f138, f77; +mul.f32 f142, f140, f115; +sub.f32 f143, f141, f142; +mul.f32 f144, f138, f115; +fma.rn.f32 f145, f140, f77, f144; +mul.f32 f146, f117, f138; +mul.f32 f147, f118, f140; +sub.f32 f148, f146, f147; +mul.f32 f149, f117, f140; +fma.rn.f32 f150, f118, f138, f149; +mul.f32 f151, f148, f78; +mul.f32 f152, f150, f116; +sub.f32 f153, f151, f152; +mul.f32 f154, f148, f116; +fma.rn.f32 f155, f150, f78, f154; +mul.f32 f156, f117, f148; +mul.f32 f157, f118, f150; +sub.f32 f158, f156, f157; +mul.f32 f159, f117, f150; +fma.rn.f32 f160, f118, f148, f159; +mul.f32 f161, f158, f66; +mul.f32 
f162, f160, f104; +sub.f32 f163, f161, f162; +mul.f32 f164, f158, f104; +fma.rn.f32 f165, f160, f66, f164; +mul.f32 f166, f117, f158; +mul.f32 f167, f118, f160; +sub.f32 f168, f166, f167; +mul.f32 f169, f117, f160; +fma.rn.f32 f170, f118, f158, f169; +mul.f32 f171, f168, f55; +mul.f32 f172, f170, f93; +sub.f32 f173, f171, f172; +mul.f32 f174, f168, f93; +fma.rn.f32 f175, f170, f55, f174; +mad.lo.s32 r8, r5, 1372, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 28, r8; +st.shared.f32 [r9], f34; +st.shared.f32 [r9+4], f123; +st.shared.f32 [r9+8], f133; +st.shared.f32 [r9+12], f143; +st.shared.f32 [r9+16], f153; +st.shared.f32 [r9+20], f163; +st.shared.f32 [r9+24], f173; +barrier.sync 0; +mad.lo.s32 r10, r7, -24, r9; +ld.shared.f32 f176, [r10]; +ld.shared.f32 f177, [r10+196]; +ld.shared.f32 f178, [r10+392]; +ld.shared.f32 f179, [r10+588]; +ld.shared.f32 f180, [r10+784]; +ld.shared.f32 f181, [r10+980]; +ld.shared.f32 f182, [r10+1176]; +barrier.sync 0; +st.shared.f32 [r9], f40; +st.shared.f32 [r9+4], f125; +st.shared.f32 [r9+8], f135; +st.shared.f32 [r9+12], f145; +st.shared.f32 [r9+16], f155; +st.shared.f32 [r9+20], f165; +st.shared.f32 [r9+24], f175; +barrier.sync 0; +ld.shared.f32 f183, [r10]; +ld.shared.f32 f184, [r10+196]; +ld.shared.f32 f185, [r10+392]; +ld.shared.f32 f186, [r10+588]; +ld.shared.f32 f187, [r10+784]; +ld.shared.f32 f188, [r10+980]; +ld.shared.f32 f189, [r10+1176]; +add.f32 f190, f177, f182; +add.f32 f191, f176, f190; +add.f32 f192, f178, f181; +add.f32 f193, f192, f191; +add.f32 f194, f179, f180; +add.f32 f195, f194, f193; +add.f32 f196, f184, f189; +add.f32 f197, f183, f196; +add.f32 f198, f185, f188; +add.f32 f199, f198, f197; +add.f32 f200, f186, f187; +add.f32 f201, f200, f199; +fma.rn.f32 f202, f190, 0f3F1F9D07, f176; +mul.f32 f203, f192, 0f3E63DC87; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, 0f3F66A5E5; +sub.f32 f206, f204, f205; +sub.f32 f207, f184, f189; +mul.f32 f208, f207, 0f3F48261C; +sub.f32 f209, f185, f188; +mul.f32 f210, f209, 
0fBF7994E0; +sub.f32 f211, f210, f208; +sub.f32 f212, f186, f187; +mul.f32 f213, f212, 0f3EDE2602; +sub.f32 f214, f211, f213; +sub.f32 f215, f206, f214; +add.f32 f216, f214, f206; +mul.f32 f217, f190, 0f3E63DC87; +sub.f32 f218, f176, f217; +mul.f32 f219, f192, 0f3F66A5E5; +sub.f32 f220, f218, f219; +fma.rn.f32 f221, f194, 0f3F1F9D07, f220; +mul.f32 f222, f207, 0f3F7994E0; +mul.f32 f223, f209, 0f3EDE2602; +sub.f32 f224, f223, f222; +fma.rn.f32 f225, f212, 0f3F48261C, f224; +sub.f32 f226, f221, f225; +add.f32 f227, f225, f221; +mul.f32 f228, f190, 0f3F66A5E5; +sub.f32 f229, f176, f228; +fma.rn.f32 f230, f192, 0f3F1F9D07, f229; +mul.f32 f231, f194, 0f3E63DC87; +sub.f32 f232, f230, f231; +mul.f32 f233, f207, 0f3EDE2602; +mul.f32 f234, f209, 0f3F48261C; +sub.f32 f235, f234, f233; +mul.f32 f236, f212, 0f3F7994E0; +sub.f32 f237, f235, f236; +sub.f32 f238, f232, f237; +add.f32 f239, f237, f232; +fma.rn.f32 f240, f196, 0f3F1F9D07, f183; +mul.f32 f241, f198, 0f3E63DC87; +sub.f32 f242, f240, f241; +mul.f32 f243, f200, 0f3F66A5E5; +sub.f32 f244, f242, f243; +sub.f32 f245, f177, f182; +mul.f32 f246, f245, 0f3F48261C; +sub.f32 f247, f178, f181; +mul.f32 f248, f247, 0fBF7994E0; +sub.f32 f249, f248, f246; +sub.f32 f250, f179, f180; +mul.f32 f251, f250, 0f3EDE2602; +sub.f32 f252, f249, f251; +add.f32 f253, f252, f244; +sub.f32 f254, f244, f252; +mul.f32 f255, f196, 0f3E63DC87; +sub.f32 f256, f183, f255; +mul.f32 f257, f198, 0f3F66A5E5; +sub.f32 f258, f256, f257; +fma.rn.f32 f259, f200, 0f3F1F9D07, f258; +mul.f32 f260, f245, 0f3F7994E0; +mul.f32 f261, f247, 0f3EDE2602; +sub.f32 f262, f261, f260; +fma.rn.f32 f263, f250, 0f3F48261C, f262; +add.f32 f264, f263, f259; +sub.f32 f265, f259, f263; +mul.f32 f266, f196, 0f3F66A5E5; +sub.f32 f267, f183, f266; +fma.rn.f32 f268, f198, 0f3F1F9D07, f267; +mul.f32 f269, f200, 0f3E63DC87; +sub.f32 f270, f268, f269; +mul.f32 f271, f245, 0f3EDE2602; +mul.f32 f272, f247, 0f3F48261C; +sub.f32 f273, f272, f271; +mul.f32 f274, f250, 0f3F7994E0; +sub.f32 
f275, f273, f274; +add.f32 f276, f275, f270; +sub.f32 f277, f270, f275; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f278, f279}, [rd11]; +mul.f32 f282, f278, f215; +mul.f32 f283, f279, f253; +sub.f32 f284, f282, f283; +mul.f32 f285, f278, f253; +fma.rn.f32 f286, f279, f215, f285; +mul.f32 f287, f278, f278; +mul.f32 f288, f279, f279; +sub.f32 f289, f287, f288; +mul.f32 f290, f279, f278; +fma.rn.f32 f291, f279, f278, f290; +mul.f32 f292, f289, f226; +mul.f32 f293, f291, f264; +sub.f32 f294, f292, f293; +mul.f32 f295, f289, f264; +fma.rn.f32 f296, f291, f226, f295; +mul.f32 f297, f278, f289; +mul.f32 f298, f279, f291; +sub.f32 f299, f297, f298; +mul.f32 f300, f278, f291; +fma.rn.f32 f301, f279, f289, f300; +mul.f32 f302, f299, f238; +mul.f32 f303, f301, f276; +sub.f32 f304, f302, f303; +mul.f32 f305, f299, f276; +fma.rn.f32 f306, f301, f238, f305; +mul.f32 f307, f278, f299; +mul.f32 f308, f279, f301; +sub.f32 f309, f307, f308; +mul.f32 f310, f278, f301; +fma.rn.f32 f311, f279, f299, f310; +mul.f32 f312, f309, f239; +mul.f32 f313, f311, f277; +sub.f32 f314, f312, f313; +mul.f32 f315, f309, f277; +fma.rn.f32 f316, f311, f239, f315; +mul.f32 f317, f278, f309; +mul.f32 f318, f279, f311; +sub.f32 f319, f317, f318; +mul.f32 f320, f278, f311; +fma.rn.f32 f321, f279, f309, f320; +mul.f32 f322, f319, f227; +mul.f32 f323, f321, f265; +sub.f32 f324, f322, f323; +mul.f32 f325, f319, f265; +fma.rn.f32 f326, f321, f227, f325; +mul.f32 f327, f278, f319; +mul.f32 f328, f279, f321; +sub.f32 f329, f327, f328; +mul.f32 f330, f278, f321; +fma.rn.f32 f331, f279, f319, f330; +mul.f32 f332, f329, f216; +mul.f32 f333, f331, f254; +sub.f32 f334, f332, f333; +mul.f32 f335, f329, f254; +fma.rn.f32 f336, f331, f216, f335; +shl.b32 
r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 196, r19; +st.shared.f32 [r20], f195; +st.shared.f32 [r20+28], f284; +st.shared.f32 [r20+56], f294; +st.shared.f32 [r20+84], f304; +st.shared.f32 [r20+112], f314; +st.shared.f32 [r20+140], f324; +st.shared.f32 [r20+168], f334; +barrier.sync 0; +ld.shared.f32 f337, [r10]; +ld.shared.f32 f338, [r10+196]; +ld.shared.f32 f339, [r10+392]; +ld.shared.f32 f340, [r10+588]; +ld.shared.f32 f341, [r10+784]; +ld.shared.f32 f342, [r10+980]; +ld.shared.f32 f343, [r10+1176]; +barrier.sync 0; +st.shared.f32 [r20], f201; +st.shared.f32 [r20+28], f286; +st.shared.f32 [r20+56], f296; +st.shared.f32 [r20+84], f306; +st.shared.f32 [r20+112], f316; +st.shared.f32 [r20+140], f326; +st.shared.f32 [r20+168], f336; +barrier.sync 0; +ld.shared.f32 f344, [r10]; +ld.shared.f32 f345, [r10+196]; +ld.shared.f32 f346, [r10+392]; +ld.shared.f32 f347, [r10+588]; +ld.shared.f32 f348, [r10+784]; +ld.shared.f32 f349, [r10+980]; +ld.shared.f32 f350, [r10+1176]; +add.f32 f351, f338, f343; +add.f32 f352, f337, f351; +add.f32 f353, f339, f342; +add.f32 f354, f353, f352; +add.f32 f355, f340, f341; +add.f32 f356, f345, f350; +add.f32 f357, f344, f356; +add.f32 f358, f346, f349; +add.f32 f359, f358, f357; +add.f32 f360, f347, f348; +fma.rn.f32 f361, f351, 0f3F1F9D07, f337; +mul.f32 f362, f353, 0f3E63DC87; +sub.f32 f363, f361, f362; +mul.f32 f364, f355, 0f3F66A5E5; +sub.f32 f365, f363, f364; +sub.f32 f366, f345, f350; +mul.f32 f367, f366, 0f3F48261C; +sub.f32 f368, f346, f349; +mul.f32 f369, f368, 0fBF7994E0; +sub.f32 f370, f369, f367; +sub.f32 f371, f347, f348; +mul.f32 f372, f371, 0f3EDE2602; +sub.f32 f373, f370, f372; +mul.f32 f374, f351, 0f3E63DC87; +sub.f32 f375, f337, f374; +mul.f32 f376, f353, 0f3F66A5E5; +sub.f32 f377, f375, f376; +fma.rn.f32 f378, f355, 0f3F1F9D07, f377; +mul.f32 f379, f366, 0f3F7994E0; +mul.f32 f380, f368, 0f3EDE2602; +sub.f32 f381, f380, f379; +fma.rn.f32 f382, f371, 0f3F48261C, f381; +mul.f32 f383, f351, 
0f3F66A5E5; +sub.f32 f384, f337, f383; +fma.rn.f32 f385, f353, 0f3F1F9D07, f384; +mul.f32 f386, f355, 0f3E63DC87; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, 0f3EDE2602; +mul.f32 f389, f368, 0f3F48261C; +sub.f32 f390, f389, f388; +mul.f32 f391, f371, 0f3F7994E0; +sub.f32 f392, f390, f391; +fma.rn.f32 f393, f356, 0f3F1F9D07, f344; +mul.f32 f394, f358, 0f3E63DC87; +sub.f32 f395, f393, f394; +mul.f32 f396, f360, 0f3F66A5E5; +sub.f32 f397, f395, f396; +sub.f32 f398, f338, f343; +mul.f32 f399, f398, 0f3F48261C; +sub.f32 f400, f339, f342; +mul.f32 f401, f400, 0fBF7994E0; +sub.f32 f402, f401, f399; +sub.f32 f403, f340, f341; +mul.f32 f404, f403, 0f3EDE2602; +sub.f32 f405, f402, f404; +mul.f32 f406, f356, 0f3E63DC87; +sub.f32 f407, f344, f406; +mul.f32 f408, f358, 0f3F66A5E5; +sub.f32 f409, f407, f408; +fma.rn.f32 f410, f360, 0f3F1F9D07, f409; +mul.f32 f411, f398, 0f3F7994E0; +mul.f32 f412, f400, 0f3EDE2602; +sub.f32 f413, f412, f411; +fma.rn.f32 f414, f403, 0f3F48261C, f413; +mul.f32 f415, f356, 0f3F66A5E5; +sub.f32 f416, f344, f415; +fma.rn.f32 f417, f358, 0f3F1F9D07, f416; +mul.f32 f418, f360, 0f3E63DC87; +sub.f32 f419, f417, f418; +mul.f32 f420, f398, 0f3EDE2602; +mul.f32 f421, f400, 0f3F48261C; +sub.f32 f422, f421, f420; +mul.f32 f423, f403, 0f3F7994E0; +sub.f32 f424, f422, f423; +add.f32 %0, f355, f354; +add.f32 %1, f360, f359; +add.f32 %3, f405, f397; +sub.f32 %2, f365, f373; +sub.f32 %4, f378, f382; +add.f32 %5, f414, f410; +sub.f32 %6, f387, f392; +add.f32 %7, f424, f419; +add.f32 %8, f392, f387; +sub.f32 %9, f419, f424; +add.f32 %10, f382, f378; +sub.f32 %11, f410, f414; +sub.f32 %13, f397, f405; +add.f32 %12, f373, f365; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), 
"f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..ad6171b5fbb68 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp32_inv.hpp.inc @@ -0,0 +1,914 @@ +#ifndef CUFFTDX_FFT_343_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_343_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<376, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<455>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 2744, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %19, %33; +add.f32 f30, %17, f29; +add.f32 f31, %22, %30; +add.f32 f32, f31, f30; +add.f32 f33, %25, %27; +add.f32 f34, %21, %34; +add.f32 f35, %18, f34; +add.f32 f36, %24, %32; +add.f32 f37, f36, f35; +add.f32 f38, %26, %29; +fma.rn.f32 f39, f29, 0f3F1F9D07, %17; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %21, %34; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %24, %32; +fma.rn.f32 f47, f46, 0f3F7994E0, f45; +sub.f32 f48, %26, %29; +fma.rn.f32 f49, f48, 0f3EDE2602, f47; +sub.f32 f50, f43, f49; +add.f32 f51, f49, f43; +mul.f32 f52, f29, 0f3E63DC87; +sub.f32 f53, %17, f52; +mul.f32 f54, f31, 0f3F66A5E5; +sub.f32 f55, f53, f54; +fma.rn.f32 f56, f33, 0f3F1F9D07, f55; +mul.f32 f57, f44, 0f3F7994E0; +mul.f32 f58, f46, 0f3EDE2602; +sub.f32 f59, f57, f58; +mul.f32 
f60, f48, 0f3F48261C; +sub.f32 f61, f59, f60; +sub.f32 f62, f56, f61; +add.f32 f63, f61, f56; +mul.f32 f64, f29, 0f3F66A5E5; +sub.f32 f65, %17, f64; +fma.rn.f32 f66, f31, 0f3F1F9D07, f65; +mul.f32 f67, f33, 0f3E63DC87; +sub.f32 f68, f66, f67; +mul.f32 f69, f44, 0f3EDE2602; +mul.f32 f70, f46, 0f3F48261C; +sub.f32 f71, f69, f70; +fma.rn.f32 f72, f48, 0f3F7994E0, f71; +sub.f32 f73, f68, f72; +add.f32 f74, f72, f68; +fma.rn.f32 f75, f34, 0f3F1F9D07, %18; +mul.f32 f76, f36, 0f3E63DC87; +sub.f32 f77, f75, f76; +mul.f32 f78, f38, 0f3F66A5E5; +sub.f32 f79, f77, f78; +sub.f32 f80, %19, %33; +mul.f32 f81, f80, 0f3F48261C; +sub.f32 f82, %22, %30; +fma.rn.f32 f83, f82, 0f3F7994E0, f81; +sub.f32 f84, %25, %27; +fma.rn.f32 f85, f84, 0f3EDE2602, f83; +add.f32 f86, f85, f79; +sub.f32 f87, f79, f85; +mul.f32 f88, f34, 0f3E63DC87; +sub.f32 f89, %18, f88; +mul.f32 f90, f36, 0f3F66A5E5; +sub.f32 f91, f89, f90; +fma.rn.f32 f92, f38, 0f3F1F9D07, f91; +mul.f32 f93, f80, 0f3F7994E0; +mul.f32 f94, f82, 0f3EDE2602; +sub.f32 f95, f93, f94; +mul.f32 f96, f84, 0f3F48261C; +sub.f32 f97, f95, f96; +add.f32 f98, f97, f92; +sub.f32 f99, f92, f97; +mul.f32 f100, f34, 0f3F66A5E5; +sub.f32 f101, %18, f100; +fma.rn.f32 f102, f36, 0f3F1F9D07, f101; +mul.f32 f103, f38, 0f3E63DC87; +sub.f32 f104, f102, f103; +mul.f32 f105, f80, 0f3EDE2602; +mul.f32 f106, f82, 0f3F48261C; +sub.f32 f107, f105, f106; +fma.rn.f32 f108, f84, 0f3F7994E0, f107; +add.f32 f109, f108, f104; +sub.f32 f110, f104, f108; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2744, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f111, f112}, [rd6]; +mul.f32 f115, f86, f112; +mul.f32 f116, f50, f112; +mul.f32 f117, f111, f86; +mul.f32 f118, f111, f111; +mul.f32 f119, f112, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f112, f111; +fma.rn.f32 f122, f112, f111, f121; +mul.f32 f123, f98, f122; +mul.f32 
f124, f62, f122; +mul.f32 f125, f120, f98; +mul.f32 f126, f111, f120; +mul.f32 f127, f112, f122; +sub.f32 f128, f126, f127; +mul.f32 f129, f111, f122; +fma.rn.f32 f130, f112, f120, f129; +mul.f32 f131, f109, f130; +mul.f32 f132, f73, f130; +mul.f32 f133, f128, f109; +mul.f32 f134, f111, f128; +mul.f32 f135, f112, f130; +sub.f32 f136, f134, f135; +mul.f32 f137, f111, f130; +fma.rn.f32 f138, f112, f128, f137; +mul.f32 f139, f110, f138; +mul.f32 f140, f74, f138; +mul.f32 f141, f136, f110; +mul.f32 f142, f111, f136; +mul.f32 f143, f112, f138; +sub.f32 f144, f142, f143; +mul.f32 f145, f111, f138; +fma.rn.f32 f146, f112, f136, f145; +mul.f32 f147, f99, f146; +mul.f32 f148, f63, f146; +mul.f32 f149, f144, f99; +mul.f32 f150, f111, f144; +mul.f32 f151, f112, f146; +sub.f32 f152, f150, f151; +mul.f32 f153, f111, f146; +fma.rn.f32 f154, f112, f144, f153; +mul.f32 f155, f87, f154; +mul.f32 f156, f51, f154; +mul.f32 f157, f152, f87; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +add.f32 f158, f38, f37; +add.f32 f159, f33, f32; +st.shared.v2.f32 [r9], {f159, f158}; +fma.rn.f32 f160, f111, f50, f115; +sub.f32 f161, f117, f116; +st.shared.v2.f32 [r9+8], {f160, f161}; +fma.rn.f32 f162, f120, f62, f123; +sub.f32 f163, f125, f124; +st.shared.v2.f32 [r9+16], {f162, f163}; +sub.f32 f164, f133, f132; +fma.rn.f32 f165, f128, f73, f131; +st.shared.v2.f32 [r9+24], {f165, f164}; +fma.rn.f32 f166, f136, f74, f139; +sub.f32 f167, f141, f140; +st.shared.v2.f32 [r9+32], {f166, f167}; +fma.rn.f32 f168, f144, f63, f147; +sub.f32 f169, f149, f148; +st.shared.v2.f32 [r9+40], {f168, f169}; +fma.rn.f32 f170, f152, f51, f155; +sub.f32 f171, f157, f156; +st.shared.v2.f32 [r9+48], {f170, f171}; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.v2.f32 {f172, f173}, [r10]; +ld.shared.v2.f32 {f176, f177}, [r10+392]; +ld.shared.v2.f32 {f180, f181}, [r10+784]; +ld.shared.v2.f32 {f184, f185}, [r10+1176]; +ld.shared.v2.f32 {f188, f189}, [r10+1568]; +ld.shared.v2.f32 {f192, f193}, [r10+1960]; 
+ld.shared.v2.f32 {f196, f197}, [r10+2352]; +add.f32 f200, f176, f196; +add.f32 f201, f172, f200; +add.f32 f202, f180, f192; +add.f32 f203, f202, f201; +add.f32 f204, f184, f188; +add.f32 f205, f177, f197; +add.f32 f206, f173, f205; +add.f32 f207, f181, f193; +add.f32 f208, f207, f206; +add.f32 f209, f185, f189; +fma.rn.f32 f210, f200, 0f3F1F9D07, f172; +mul.f32 f211, f202, 0f3E63DC87; +sub.f32 f212, f210, f211; +mul.f32 f213, f204, 0f3F66A5E5; +sub.f32 f214, f212, f213; +sub.f32 f215, f177, f197; +mul.f32 f216, f215, 0f3F48261C; +sub.f32 f217, f181, f193; +fma.rn.f32 f218, f217, 0f3F7994E0, f216; +sub.f32 f219, f185, f189; +fma.rn.f32 f220, f219, 0f3EDE2602, f218; +sub.f32 f221, f214, f220; +add.f32 f222, f220, f214; +mul.f32 f223, f200, 0f3E63DC87; +sub.f32 f224, f172, f223; +mul.f32 f225, f202, 0f3F66A5E5; +sub.f32 f226, f224, f225; +fma.rn.f32 f227, f204, 0f3F1F9D07, f226; +mul.f32 f228, f215, 0f3F7994E0; +mul.f32 f229, f217, 0f3EDE2602; +sub.f32 f230, f228, f229; +mul.f32 f231, f219, 0f3F48261C; +sub.f32 f232, f230, f231; +sub.f32 f233, f227, f232; +add.f32 f234, f232, f227; +mul.f32 f235, f200, 0f3F66A5E5; +sub.f32 f236, f172, f235; +fma.rn.f32 f237, f202, 0f3F1F9D07, f236; +mul.f32 f238, f204, 0f3E63DC87; +sub.f32 f239, f237, f238; +mul.f32 f240, f215, 0f3EDE2602; +mul.f32 f241, f217, 0f3F48261C; +sub.f32 f242, f240, f241; +fma.rn.f32 f243, f219, 0f3F7994E0, f242; +sub.f32 f244, f239, f243; +add.f32 f245, f243, f239; +fma.rn.f32 f246, f205, 0f3F1F9D07, f173; +mul.f32 f247, f207, 0f3E63DC87; +sub.f32 f248, f246, f247; +mul.f32 f249, f209, 0f3F66A5E5; +sub.f32 f250, f248, f249; +sub.f32 f251, f176, f196; +mul.f32 f252, f251, 0f3F48261C; +sub.f32 f253, f180, f192; +fma.rn.f32 f254, f253, 0f3F7994E0, f252; +sub.f32 f255, f184, f188; +fma.rn.f32 f256, f255, 0f3EDE2602, f254; +add.f32 f257, f256, f250; +sub.f32 f258, f250, f256; +mul.f32 f259, f205, 0f3E63DC87; +sub.f32 f260, f173, f259; +mul.f32 f261, f207, 0f3F66A5E5; +sub.f32 f262, f260, f261; +fma.rn.f32 f263, 
f209, 0f3F1F9D07, f262; +mul.f32 f264, f251, 0f3F7994E0; +mul.f32 f265, f253, 0f3EDE2602; +sub.f32 f266, f264, f265; +mul.f32 f267, f255, 0f3F48261C; +sub.f32 f268, f266, f267; +add.f32 f269, f268, f263; +sub.f32 f270, f263, f268; +mul.f32 f271, f205, 0f3F66A5E5; +sub.f32 f272, f173, f271; +fma.rn.f32 f273, f207, 0f3F1F9D07, f272; +mul.f32 f274, f209, 0f3E63DC87; +sub.f32 f275, f273, f274; +mul.f32 f276, f251, 0f3EDE2602; +mul.f32 f277, f253, 0f3F48261C; +sub.f32 f278, f276, f277; +fma.rn.f32 f279, f255, 0f3F7994E0, f278; +add.f32 f280, f279, f275; +sub.f32 f281, f275, f279; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f282, f283}, [rd11]; +mul.f32 f286, f257, f283; +mul.f32 f287, f221, f283; +mul.f32 f288, f282, f257; +mul.f32 f289, f282, f282; +mul.f32 f290, f283, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f283, f282; +fma.rn.f32 f293, f283, f282, f292; +mul.f32 f294, f269, f293; +mul.f32 f295, f233, f293; +mul.f32 f296, f291, f269; +mul.f32 f297, f282, f291; +mul.f32 f298, f283, f293; +sub.f32 f299, f297, f298; +mul.f32 f300, f282, f293; +fma.rn.f32 f301, f283, f291, f300; +mul.f32 f302, f280, f301; +mul.f32 f303, f244, f301; +mul.f32 f304, f299, f280; +mul.f32 f305, f282, f299; +mul.f32 f306, f283, f301; +sub.f32 f307, f305, f306; +mul.f32 f308, f282, f301; +fma.rn.f32 f309, f283, f299, f308; +mul.f32 f310, f281, f309; +mul.f32 f311, f245, f309; +mul.f32 f312, f307, f281; +mul.f32 f313, f282, f307; +mul.f32 f314, f283, f309; +sub.f32 f315, f313, f314; +mul.f32 f316, f282, f309; +fma.rn.f32 f317, f283, f307, f316; +mul.f32 f318, f270, f317; +mul.f32 f319, f234, f317; +mul.f32 f320, f315, f270; +mul.f32 f321, f282, f315; +mul.f32 f322, f283, f317; +sub.f32 f323, f321, f322; +mul.f32 f324, f282, 
f317; +fma.rn.f32 f325, f283, f315, f324; +mul.f32 f326, f258, f325; +mul.f32 f327, f222, f325; +mul.f32 f328, f323, f258; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +add.f32 f329, f209, f208; +add.f32 f330, f204, f203; +st.shared.v2.f32 [r20], {f330, f329}; +fma.rn.f32 f331, f282, f221, f286; +sub.f32 f332, f288, f287; +st.shared.v2.f32 [r20+56], {f331, f332}; +fma.rn.f32 f333, f291, f233, f294; +sub.f32 f334, f296, f295; +st.shared.v2.f32 [r20+112], {f333, f334}; +fma.rn.f32 f335, f299, f244, f302; +sub.f32 f336, f304, f303; +st.shared.v2.f32 [r20+168], {f335, f336}; +sub.f32 f337, f312, f311; +fma.rn.f32 f338, f307, f245, f310; +st.shared.v2.f32 [r20+224], {f338, f337}; +fma.rn.f32 f339, f315, f234, f318; +sub.f32 f340, f320, f319; +st.shared.v2.f32 [r20+280], {f339, f340}; +fma.rn.f32 f341, f323, f222, f326; +sub.f32 f342, f328, f327; +st.shared.v2.f32 [r20+336], {f341, f342}; +barrier.sync 0; +ld.shared.v2.f32 {f343, f344}, [r10]; +ld.shared.v2.f32 {f347, f348}, [r10+392]; +ld.shared.v2.f32 {f351, f352}, [r10+784]; +ld.shared.v2.f32 {f355, f356}, [r10+1176]; +ld.shared.v2.f32 {f359, f360}, [r10+1568]; +ld.shared.v2.f32 {f363, f364}, [r10+1960]; +ld.shared.v2.f32 {f367, f368}, [r10+2352]; +add.f32 f371, f347, f367; +add.f32 f372, f343, f371; +add.f32 f373, f351, f363; +add.f32 f374, f373, f372; +add.f32 f375, f355, f359; +add.f32 f376, f348, f368; +add.f32 f377, f344, f376; +add.f32 f378, f352, f364; +add.f32 f379, f378, f377; +add.f32 f380, f356, f360; +fma.rn.f32 f381, f371, 0f3F1F9D07, f343; +mul.f32 f382, f373, 0f3E63DC87; +sub.f32 f383, f381, f382; +mul.f32 f384, f375, 0f3F66A5E5; +sub.f32 f385, f383, f384; +sub.f32 f386, f348, f368; +mul.f32 f387, f386, 0f3F48261C; +sub.f32 f388, f352, f364; +fma.rn.f32 f389, f388, 0f3F7994E0, f387; +sub.f32 f390, f356, f360; +fma.rn.f32 f391, f390, 0f3EDE2602, f389; +mul.f32 f392, f371, 0f3E63DC87; +sub.f32 f393, f343, f392; +mul.f32 f394, f373, 0f3F66A5E5; +sub.f32 
f395, f393, f394; +fma.rn.f32 f396, f375, 0f3F1F9D07, f395; +mul.f32 f397, f386, 0f3F7994E0; +mul.f32 f398, f388, 0f3EDE2602; +sub.f32 f399, f397, f398; +mul.f32 f400, f390, 0f3F48261C; +sub.f32 f401, f399, f400; +mul.f32 f402, f371, 0f3F66A5E5; +sub.f32 f403, f343, f402; +fma.rn.f32 f404, f373, 0f3F1F9D07, f403; +mul.f32 f405, f375, 0f3E63DC87; +sub.f32 f406, f404, f405; +mul.f32 f407, f386, 0f3EDE2602; +mul.f32 f408, f388, 0f3F48261C; +sub.f32 f409, f407, f408; +fma.rn.f32 f410, f390, 0f3F7994E0, f409; +fma.rn.f32 f411, f376, 0f3F1F9D07, f344; +mul.f32 f412, f378, 0f3E63DC87; +sub.f32 f413, f411, f412; +mul.f32 f414, f380, 0f3F66A5E5; +sub.f32 f415, f413, f414; +sub.f32 f416, f347, f367; +mul.f32 f417, f416, 0f3F48261C; +sub.f32 f418, f351, f363; +fma.rn.f32 f419, f418, 0f3F7994E0, f417; +sub.f32 f420, f355, f359; +fma.rn.f32 f421, f420, 0f3EDE2602, f419; +mul.f32 f422, f376, 0f3E63DC87; +sub.f32 f423, f344, f422; +mul.f32 f424, f378, 0f3F66A5E5; +sub.f32 f425, f423, f424; +fma.rn.f32 f426, f380, 0f3F1F9D07, f425; +mul.f32 f427, f416, 0f3F7994E0; +mul.f32 f428, f418, 0f3EDE2602; +sub.f32 f429, f427, f428; +mul.f32 f430, f420, 0f3F48261C; +sub.f32 f431, f429, f430; +mul.f32 f432, f376, 0f3F66A5E5; +sub.f32 f433, f344, f432; +fma.rn.f32 f434, f378, 0f3F1F9D07, f433; +mul.f32 f435, f380, 0f3E63DC87; +sub.f32 f436, f434, f435; +mul.f32 f437, f416, 0f3EDE2602; +mul.f32 f438, f418, 0f3F48261C; +sub.f32 f439, f437, f438; +fma.rn.f32 f440, f420, 0f3F7994E0, f439; +add.f32 %1, f380, f379; +add.f32 %0, f375, f374; +add.f32 %3, f421, f415; +sub.f32 %2, f385, f391; +add.f32 %5, f431, f426; +sub.f32 %4, f396, f401; +add.f32 %7, f440, f436; +sub.f32 %6, f406, f410; +sub.f32 %9, f436, f440; +add.f32 %8, f410, f406; +sub.f32 %11, f426, f431; +add.f32 %10, f401, f396; +sub.f32 %13, f415, f421; +add.f32 %12, f391, f385; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<377, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<427>; +.reg .b32 r<21>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 1372, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %19, %33; +add.f32 f30, %17, f29; +add.f32 f31, %22, %30; +add.f32 f32, f31, f30; +add.f32 f33, %25, %27; +add.f32 f34, f33, f32; +add.f32 f35, %21, %34; +add.f32 f36, %18, f35; +add.f32 f37, %24, %32; +add.f32 f38, f37, f36; +add.f32 f39, %26, %29; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %17; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %21, %34; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %24, %32; +fma.rn.f32 f49, f48, 0f3F7994E0, f47; +sub.f32 f50, %26, %29; +fma.rn.f32 f51, f50, 0f3EDE2602, f49; +sub.f32 f52, f45, f51; +add.f32 f53, f51, f45; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %17, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f46, 0f3F7994E0; +mul.f32 f60, f48, 0f3EDE2602; +sub.f32 f61, f59, f60; +mul.f32 f62, f50, 0f3F48261C; +sub.f32 f63, f61, f62; +sub.f32 f64, f58, f63; +add.f32 f65, f63, f58; +mul.f32 f66, f29, 0f3F66A5E5; +sub.f32 f67, %17, f66; +fma.rn.f32 f68, f31, 0f3F1F9D07, f67; +mul.f32 f69, f33, 0f3E63DC87; +sub.f32 f70, f68, f69; +mul.f32 f71, f46, 0f3EDE2602; +mul.f32 f72, f48, 0f3F48261C; +sub.f32 f73, f71, f72; +fma.rn.f32 f74, f50, 
0f3F7994E0, f73; +sub.f32 f75, f70, f74; +add.f32 f76, f74, f70; +fma.rn.f32 f77, f35, 0f3F1F9D07, %18; +mul.f32 f78, f37, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f39, 0f3F66A5E5; +sub.f32 f81, f79, f80; +sub.f32 f82, %19, %33; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %22, %30; +fma.rn.f32 f85, f84, 0f3F7994E0, f83; +sub.f32 f86, %25, %27; +fma.rn.f32 f87, f86, 0f3EDE2602, f85; +add.f32 f88, f87, f81; +sub.f32 f89, f81, f87; +mul.f32 f90, f35, 0f3E63DC87; +sub.f32 f91, %18, f90; +mul.f32 f92, f37, 0f3F66A5E5; +sub.f32 f93, f91, f92; +fma.rn.f32 f94, f39, 0f3F1F9D07, f93; +mul.f32 f95, f82, 0f3F7994E0; +mul.f32 f96, f84, 0f3EDE2602; +sub.f32 f97, f95, f96; +mul.f32 f98, f86, 0f3F48261C; +sub.f32 f99, f97, f98; +add.f32 f100, f99, f94; +sub.f32 f101, f94, f99; +mul.f32 f102, f35, 0f3F66A5E5; +sub.f32 f103, %18, f102; +fma.rn.f32 f104, f37, 0f3F1F9D07, f103; +mul.f32 f105, f39, 0f3E63DC87; +sub.f32 f106, f104, f105; +mul.f32 f107, f82, 0f3EDE2602; +mul.f32 f108, f84, 0f3F48261C; +sub.f32 f109, f107, f108; +fma.rn.f32 f110, f86, 0f3F7994E0, f109; +add.f32 f111, f110, f106; +sub.f32 f112, f106, f110; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f113, f114}, [rd6]; +mul.f32 f117, f88, f114; +fma.rn.f32 f118, f113, f52, f117; +mul.f32 f119, f52, f114; +mul.f32 f120, f113, f88; +sub.f32 f121, f120, f119; +mul.f32 f122, f113, f113; +mul.f32 f123, f114, f114; +sub.f32 f124, f122, f123; +mul.f32 f125, f114, f113; +fma.rn.f32 f126, f114, f113, f125; +mul.f32 f127, f100, f126; +fma.rn.f32 f128, f124, f64, f127; +mul.f32 f129, f64, f126; +mul.f32 f130, f124, f100; +sub.f32 f131, f130, f129; +mul.f32 f132, f113, f124; +mul.f32 f133, f114, f126; +sub.f32 f134, f132, f133; +mul.f32 f135, f113, f126; +fma.rn.f32 f136, f114, f124, f135; +mul.f32 f137, f111, f136; +fma.rn.f32 f138, f134, f75, f137; 
+mul.f32 f139, f75, f136; +mul.f32 f140, f134, f111; +sub.f32 f141, f140, f139; +mul.f32 f142, f113, f134; +mul.f32 f143, f114, f136; +sub.f32 f144, f142, f143; +mul.f32 f145, f113, f136; +fma.rn.f32 f146, f114, f134, f145; +mul.f32 f147, f112, f146; +fma.rn.f32 f148, f144, f76, f147; +mul.f32 f149, f76, f146; +mul.f32 f150, f144, f112; +sub.f32 f151, f150, f149; +mul.f32 f152, f113, f144; +mul.f32 f153, f114, f146; +sub.f32 f154, f152, f153; +mul.f32 f155, f113, f146; +fma.rn.f32 f156, f114, f144, f155; +mul.f32 f157, f101, f156; +fma.rn.f32 f158, f154, f65, f157; +mul.f32 f159, f65, f156; +mul.f32 f160, f154, f101; +sub.f32 f161, f160, f159; +mul.f32 f162, f113, f154; +mul.f32 f163, f114, f156; +sub.f32 f164, f162, f163; +mul.f32 f165, f113, f156; +fma.rn.f32 f166, f114, f154, f165; +mul.f32 f167, f89, f166; +fma.rn.f32 f168, f164, f53, f167; +mul.f32 f169, f53, f166; +mul.f32 f170, f164, f89; +sub.f32 f171, f170, f169; +mad.lo.s32 r8, r5, 1372, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 28, r8; +st.shared.f32 [r9], f34; +st.shared.f32 [r9+4], f118; +st.shared.f32 [r9+8], f128; +st.shared.f32 [r9+12], f138; +st.shared.f32 [r9+16], f148; +st.shared.f32 [r9+20], f158; +st.shared.f32 [r9+24], f168; +barrier.sync 0; +mad.lo.s32 r10, r7, -24, r9; +ld.shared.f32 f172, [r10]; +ld.shared.f32 f173, [r10+196]; +ld.shared.f32 f174, [r10+392]; +ld.shared.f32 f175, [r10+588]; +ld.shared.f32 f176, [r10+784]; +ld.shared.f32 f177, [r10+980]; +ld.shared.f32 f178, [r10+1176]; +barrier.sync 0; +st.shared.f32 [r9], f40; +st.shared.f32 [r9+4], f121; +st.shared.f32 [r9+8], f131; +st.shared.f32 [r9+12], f141; +st.shared.f32 [r9+16], f151; +st.shared.f32 [r9+20], f161; +st.shared.f32 [r9+24], f171; +barrier.sync 0; +ld.shared.f32 f179, [r10]; +ld.shared.f32 f180, [r10+196]; +ld.shared.f32 f181, [r10+392]; +ld.shared.f32 f182, [r10+588]; +ld.shared.f32 f183, [r10+784]; +ld.shared.f32 f184, [r10+980]; +ld.shared.f32 f185, [r10+1176]; +add.f32 f186, f173, f178; +add.f32 f187, f172, f186; 
+add.f32 f188, f174, f177; +add.f32 f189, f188, f187; +add.f32 f190, f175, f176; +add.f32 f191, f190, f189; +add.f32 f192, f180, f185; +add.f32 f193, f179, f192; +add.f32 f194, f181, f184; +add.f32 f195, f194, f193; +add.f32 f196, f182, f183; +add.f32 f197, f196, f195; +fma.rn.f32 f198, f186, 0f3F1F9D07, f172; +mul.f32 f199, f188, 0f3E63DC87; +sub.f32 f200, f198, f199; +mul.f32 f201, f190, 0f3F66A5E5; +sub.f32 f202, f200, f201; +sub.f32 f203, f180, f185; +mul.f32 f204, f203, 0f3F48261C; +sub.f32 f205, f181, f184; +fma.rn.f32 f206, f205, 0f3F7994E0, f204; +sub.f32 f207, f182, f183; +fma.rn.f32 f208, f207, 0f3EDE2602, f206; +sub.f32 f209, f202, f208; +add.f32 f210, f208, f202; +mul.f32 f211, f186, 0f3E63DC87; +sub.f32 f212, f172, f211; +mul.f32 f213, f188, 0f3F66A5E5; +sub.f32 f214, f212, f213; +fma.rn.f32 f215, f190, 0f3F1F9D07, f214; +mul.f32 f216, f203, 0f3F7994E0; +mul.f32 f217, f205, 0f3EDE2602; +sub.f32 f218, f216, f217; +mul.f32 f219, f207, 0f3F48261C; +sub.f32 f220, f218, f219; +sub.f32 f221, f215, f220; +add.f32 f222, f220, f215; +mul.f32 f223, f186, 0f3F66A5E5; +sub.f32 f224, f172, f223; +fma.rn.f32 f225, f188, 0f3F1F9D07, f224; +mul.f32 f226, f190, 0f3E63DC87; +sub.f32 f227, f225, f226; +mul.f32 f228, f203, 0f3EDE2602; +mul.f32 f229, f205, 0f3F48261C; +sub.f32 f230, f228, f229; +fma.rn.f32 f231, f207, 0f3F7994E0, f230; +sub.f32 f232, f227, f231; +add.f32 f233, f231, f227; +fma.rn.f32 f234, f192, 0f3F1F9D07, f179; +mul.f32 f235, f194, 0f3E63DC87; +sub.f32 f236, f234, f235; +mul.f32 f237, f196, 0f3F66A5E5; +sub.f32 f238, f236, f237; +sub.f32 f239, f173, f178; +mul.f32 f240, f239, 0f3F48261C; +sub.f32 f241, f174, f177; +fma.rn.f32 f242, f241, 0f3F7994E0, f240; +sub.f32 f243, f175, f176; +fma.rn.f32 f244, f243, 0f3EDE2602, f242; +add.f32 f245, f244, f238; +sub.f32 f246, f238, f244; +mul.f32 f247, f192, 0f3E63DC87; +sub.f32 f248, f179, f247; +mul.f32 f249, f194, 0f3F66A5E5; +sub.f32 f250, f248, f249; +fma.rn.f32 f251, f196, 0f3F1F9D07, f250; +mul.f32 f252, 
f239, 0f3F7994E0; +mul.f32 f253, f241, 0f3EDE2602; +sub.f32 f254, f252, f253; +mul.f32 f255, f243, 0f3F48261C; +sub.f32 f256, f254, f255; +add.f32 f257, f256, f251; +sub.f32 f258, f251, f256; +mul.f32 f259, f192, 0f3F66A5E5; +sub.f32 f260, f179, f259; +fma.rn.f32 f261, f194, 0f3F1F9D07, f260; +mul.f32 f262, f196, 0f3E63DC87; +sub.f32 f263, f261, f262; +mul.f32 f264, f239, 0f3EDE2602; +mul.f32 f265, f241, 0f3F48261C; +sub.f32 f266, f264, f265; +fma.rn.f32 f267, f243, 0f3F7994E0, f266; +add.f32 f268, f267, f263; +sub.f32 f269, f263, f267; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 8; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f270, f271}, [rd11]; +mul.f32 f274, f245, f271; +fma.rn.f32 f275, f270, f209, f274; +mul.f32 f276, f209, f271; +mul.f32 f277, f270, f245; +sub.f32 f278, f277, f276; +mul.f32 f279, f270, f270; +mul.f32 f280, f271, f271; +sub.f32 f281, f279, f280; +mul.f32 f282, f271, f270; +fma.rn.f32 f283, f271, f270, f282; +mul.f32 f284, f257, f283; +fma.rn.f32 f285, f281, f221, f284; +mul.f32 f286, f221, f283; +mul.f32 f287, f281, f257; +sub.f32 f288, f287, f286; +mul.f32 f289, f270, f281; +mul.f32 f290, f271, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f270, f283; +fma.rn.f32 f293, f271, f281, f292; +mul.f32 f294, f268, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f268; +sub.f32 f298, f297, f296; +mul.f32 f299, f270, f291; +mul.f32 f300, f271, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f270, f293; +fma.rn.f32 f303, f271, f291, f302; +mul.f32 f304, f269, f303; +fma.rn.f32 f305, f301, f233, f304; +mul.f32 f306, f233, f303; +mul.f32 f307, f301, f269; +sub.f32 f308, f307, f306; +mul.f32 f309, f270, f301; +mul.f32 f310, f271, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f270, f303; 
+fma.rn.f32 f313, f271, f301, f312; +mul.f32 f314, f258, f313; +fma.rn.f32 f315, f311, f222, f314; +mul.f32 f316, f222, f313; +mul.f32 f317, f311, f258; +sub.f32 f318, f317, f316; +mul.f32 f319, f270, f311; +mul.f32 f320, f271, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f270, f313; +fma.rn.f32 f323, f271, f311, f322; +mul.f32 f324, f246, f323; +fma.rn.f32 f325, f321, f210, f324; +mul.f32 f326, f210, f323; +mul.f32 f327, f321, f246; +sub.f32 f328, f327, f326; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 196, r19; +st.shared.f32 [r20], f191; +st.shared.f32 [r20+28], f275; +st.shared.f32 [r20+56], f285; +st.shared.f32 [r20+84], f295; +st.shared.f32 [r20+112], f305; +st.shared.f32 [r20+140], f315; +st.shared.f32 [r20+168], f325; +barrier.sync 0; +ld.shared.f32 f329, [r10]; +ld.shared.f32 f330, [r10+196]; +ld.shared.f32 f331, [r10+392]; +ld.shared.f32 f332, [r10+588]; +ld.shared.f32 f333, [r10+784]; +ld.shared.f32 f334, [r10+980]; +ld.shared.f32 f335, [r10+1176]; +barrier.sync 0; +st.shared.f32 [r20], f197; +st.shared.f32 [r20+28], f278; +st.shared.f32 [r20+56], f288; +st.shared.f32 [r20+84], f298; +st.shared.f32 [r20+112], f308; +st.shared.f32 [r20+140], f318; +st.shared.f32 [r20+168], f328; +barrier.sync 0; +ld.shared.f32 f336, [r10]; +ld.shared.f32 f337, [r10+196]; +ld.shared.f32 f338, [r10+392]; +ld.shared.f32 f339, [r10+588]; +ld.shared.f32 f340, [r10+784]; +ld.shared.f32 f341, [r10+980]; +ld.shared.f32 f342, [r10+1176]; +add.f32 f343, f330, f335; +add.f32 f344, f329, f343; +add.f32 f345, f331, f334; +add.f32 f346, f345, f344; +add.f32 f347, f332, f333; +add.f32 f348, f337, f342; +add.f32 f349, f336, f348; +add.f32 f350, f338, f341; +add.f32 f351, f350, f349; +add.f32 f352, f339, f340; +fma.rn.f32 f353, f343, 0f3F1F9D07, f329; +mul.f32 f354, f345, 0f3E63DC87; +sub.f32 f355, f353, f354; +mul.f32 f356, f347, 0f3F66A5E5; +sub.f32 f357, f355, f356; +sub.f32 f358, f337, f342; +mul.f32 f359, f358, 0f3F48261C; +sub.f32 f360, 
f338, f341; +fma.rn.f32 f361, f360, 0f3F7994E0, f359; +sub.f32 f362, f339, f340; +fma.rn.f32 f363, f362, 0f3EDE2602, f361; +mul.f32 f364, f343, 0f3E63DC87; +sub.f32 f365, f329, f364; +mul.f32 f366, f345, 0f3F66A5E5; +sub.f32 f367, f365, f366; +fma.rn.f32 f368, f347, 0f3F1F9D07, f367; +mul.f32 f369, f358, 0f3F7994E0; +mul.f32 f370, f360, 0f3EDE2602; +sub.f32 f371, f369, f370; +mul.f32 f372, f362, 0f3F48261C; +sub.f32 f373, f371, f372; +mul.f32 f374, f343, 0f3F66A5E5; +sub.f32 f375, f329, f374; +fma.rn.f32 f376, f345, 0f3F1F9D07, f375; +mul.f32 f377, f347, 0f3E63DC87; +sub.f32 f378, f376, f377; +mul.f32 f379, f358, 0f3EDE2602; +mul.f32 f380, f360, 0f3F48261C; +sub.f32 f381, f379, f380; +fma.rn.f32 f382, f362, 0f3F7994E0, f381; +fma.rn.f32 f383, f348, 0f3F1F9D07, f336; +mul.f32 f384, f350, 0f3E63DC87; +sub.f32 f385, f383, f384; +mul.f32 f386, f352, 0f3F66A5E5; +sub.f32 f387, f385, f386; +sub.f32 f388, f330, f335; +mul.f32 f389, f388, 0f3F48261C; +sub.f32 f390, f331, f334; +fma.rn.f32 f391, f390, 0f3F7994E0, f389; +sub.f32 f392, f332, f333; +fma.rn.f32 f393, f392, 0f3EDE2602, f391; +mul.f32 f394, f348, 0f3E63DC87; +sub.f32 f395, f336, f394; +mul.f32 f396, f350, 0f3F66A5E5; +sub.f32 f397, f395, f396; +fma.rn.f32 f398, f352, 0f3F1F9D07, f397; +mul.f32 f399, f388, 0f3F7994E0; +mul.f32 f400, f390, 0f3EDE2602; +sub.f32 f401, f399, f400; +mul.f32 f402, f392, 0f3F48261C; +sub.f32 f403, f401, f402; +mul.f32 f404, f348, 0f3F66A5E5; +sub.f32 f405, f336, f404; +fma.rn.f32 f406, f350, 0f3F1F9D07, f405; +mul.f32 f407, f352, 0f3E63DC87; +sub.f32 f408, f406, f407; +mul.f32 f409, f388, 0f3EDE2602; +mul.f32 f410, f390, 0f3F48261C; +sub.f32 f411, f409, f410; +fma.rn.f32 f412, f392, 0f3F7994E0, f411; +add.f32 %0, f347, f346; +add.f32 %1, f352, f351; +add.f32 %3, f393, f387; +sub.f32 %2, f357, f363; +sub.f32 %4, f368, f373; +add.f32 %5, f403, f398; +sub.f32 %6, f378, f382; +add.f32 %7, f412, f408; +add.f32 %8, f382, f378; +sub.f32 %9, f408, f412; +add.f32 %10, f373, f368; +sub.f32 %11, 
f398, f403; +sub.f32 %13, f387, f393; +add.f32 %12, f363, f357; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_343), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..eeb0de0754195 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_fwd.hpp.inc @@ -0,0 +1,922 @@ +#ifndef CUFFTDX_FFT_343_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_343_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<549, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<465>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 5488, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %19, %33; +add.f64 fd30, %17, fd29; +add.f64 fd31, %22, %30; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %25, %27; +add.f64 fd34, %21, %34; +add.f64 fd35, %18, fd34; +add.f64 fd36, %24, %32; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %26, %29; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, 
%21, %34; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %24, %32; +mul.f64 fd47, fd46, 0dBFEF329C0558E969; +sub.f64 fd48, fd47, fd45; +sub.f64 fd49, %26, %29; +mul.f64 fd50, fd49, 0d3FDBC4C04D71ABC1; +sub.f64 fd51, fd48, fd50; +sub.f64 fd52, fd43, fd51; +add.f64 fd53, fd51, fd43; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %17, fd54; +mul.f64 fd56, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd44, 0d3FEF329C0558E969; +mul.f64 fd60, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd60, fd59; +fma.rn.f64 fd62, fd49, 0d3FE904C37505DE4B, fd61; +sub.f64 fd63, fd58, fd62; +add.f64 fd64, fd62, fd58; +mul.f64 fd65, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd66, %17, fd65; +fma.rn.f64 fd67, fd31, 0d3FE3F3A0E28BEDD1, fd66; +mul.f64 fd68, fd33, 0d3FCC7B90E3024582; +sub.f64 fd69, fd67, fd68; +mul.f64 fd70, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd71, fd46, 0d3FE904C37505DE4B; +sub.f64 fd72, fd71, fd70; +mul.f64 fd73, fd49, 0d3FEF329C0558E969; +sub.f64 fd74, fd72, fd73; +sub.f64 fd75, fd69, fd74; +add.f64 fd76, fd74, fd69; +fma.rn.f64 fd77, fd34, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd78, fd36, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %19, %33; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %22, %30; +mul.f64 fd85, fd84, 0dBFEF329C0558E969; +sub.f64 fd86, fd85, fd83; +sub.f64 fd87, %25, %27; +mul.f64 fd88, fd87, 0d3FDBC4C04D71ABC1; +sub.f64 fd89, fd86, fd88; +add.f64 fd90, fd89, fd81; +sub.f64 fd91, fd81, fd89; +mul.f64 fd92, fd34, 0d3FCC7B90E3024582; +sub.f64 fd93, %18, fd92; +mul.f64 fd94, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd95, fd93, fd94; +fma.rn.f64 fd96, fd38, 0d3FE3F3A0E28BEDD1, fd95; +mul.f64 fd97, fd82, 0d3FEF329C0558E969; +mul.f64 fd98, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd99, fd98, fd97; +fma.rn.f64 fd100, fd87, 0d3FE904C37505DE4B, fd99; +add.f64 fd101, fd100, fd96; +sub.f64 fd102, fd96, fd100; 
+mul.f64 fd103, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd104, %18, fd103; +fma.rn.f64 fd105, fd36, 0d3FE3F3A0E28BEDD1, fd104; +mul.f64 fd106, fd38, 0d3FCC7B90E3024582; +sub.f64 fd107, fd105, fd106; +mul.f64 fd108, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd109, fd84, 0d3FE904C37505DE4B; +sub.f64 fd110, fd109, fd108; +mul.f64 fd111, fd87, 0d3FEF329C0558E969; +sub.f64 fd112, fd110, fd111; +add.f64 fd113, fd112, fd107; +sub.f64 fd114, fd107, fd112; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5488, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd115, fd116}, [rd6]; +mul.f64 fd119, fd115, fd52; +mul.f64 fd120, fd116, fd90; +mul.f64 fd121, fd115, fd90; +mul.f64 fd122, fd115, fd115; +mul.f64 fd123, fd116, fd116; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd116, fd115; +fma.rn.f64 fd126, fd116, fd115, fd125; +mul.f64 fd127, fd124, fd63; +mul.f64 fd128, fd126, fd101; +mul.f64 fd129, fd124, fd101; +mul.f64 fd130, fd115, fd124; +mul.f64 fd131, fd116, fd126; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd115, fd126; +fma.rn.f64 fd134, fd116, fd124, fd133; +mul.f64 fd135, fd132, fd75; +mul.f64 fd136, fd134, fd113; +mul.f64 fd137, fd132, fd113; +ld.global.v2.f64 {fd138, fd139}, [rd6+784]; +mul.f64 fd142, fd138, fd76; +mul.f64 fd143, fd139, fd114; +mul.f64 fd144, fd138, fd114; +mul.f64 fd145, fd115, fd138; +mul.f64 fd146, fd116, fd139; +sub.f64 fd147, fd145, fd146; +mul.f64 fd148, fd115, fd139; +fma.rn.f64 fd149, fd116, fd138, fd148; +mul.f64 fd150, fd147, fd64; +mul.f64 fd151, fd149, fd102; +mul.f64 fd152, fd147, fd102; +mul.f64 fd153, fd115, fd147; +mul.f64 fd154, fd116, fd149; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd115, fd149; +fma.rn.f64 fd157, fd116, fd147, fd156; +mul.f64 fd158, fd155, fd53; +mul.f64 fd159, fd157, fd91; +mul.f64 fd160, fd155, fd91; +barrier.sync 0; +mad.lo.s32 r9, r7, 112, r8; +add.f64 fd161, fd38, fd37; 
+add.f64 fd162, fd33, fd32; +st.shared.v2.f64 [r9], {fd162, fd161}; +fma.rn.f64 fd163, fd116, fd52, fd121; +sub.f64 fd164, fd119, fd120; +st.shared.v2.f64 [r9+16], {fd164, fd163}; +fma.rn.f64 fd165, fd126, fd63, fd129; +sub.f64 fd166, fd127, fd128; +st.shared.v2.f64 [r9+32], {fd166, fd165}; +sub.f64 fd167, fd135, fd136; +fma.rn.f64 fd168, fd134, fd75, fd137; +st.shared.v2.f64 [r9+48], {fd167, fd168}; +fma.rn.f64 fd169, fd139, fd76, fd144; +sub.f64 fd170, fd142, fd143; +st.shared.v2.f64 [r9+64], {fd170, fd169}; +fma.rn.f64 fd171, fd149, fd64, fd152; +sub.f64 fd172, fd150, fd151; +st.shared.v2.f64 [r9+80], {fd172, fd171}; +sub.f64 fd173, fd158, fd159; +fma.rn.f64 fd174, fd157, fd53, fd160; +st.shared.v2.f64 [r9+96], {fd173, fd174}; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.v2.f64 {fd175, fd176}, [r10]; +ld.shared.v2.f64 {fd179, fd180}, [r10+784]; +ld.shared.v2.f64 {fd183, fd184}, [r10+1568]; +ld.shared.v2.f64 {fd187, fd188}, [r10+2352]; +ld.shared.v2.f64 {fd191, fd192}, [r10+3136]; +ld.shared.v2.f64 {fd195, fd196}, [r10+3920]; +ld.shared.v2.f64 {fd199, fd200}, [r10+4704]; +add.f64 fd203, fd179, fd199; +add.f64 fd204, fd175, fd203; +add.f64 fd205, fd183, fd195; +add.f64 fd206, fd205, fd204; +add.f64 fd207, fd187, fd191; +add.f64 fd208, fd180, fd200; +add.f64 fd209, fd176, fd208; +add.f64 fd210, fd184, fd196; +add.f64 fd211, fd210, fd209; +add.f64 fd212, fd188, fd192; +fma.rn.f64 fd213, fd203, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd214, fd205, 0d3FCC7B90E3024582; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd207, 0d3FECD4BCA9CB5C71; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, fd180, fd200; +mul.f64 fd219, fd218, 0d3FE904C37505DE4B; +sub.f64 fd220, fd184, fd196; +mul.f64 fd221, fd220, 0dBFEF329C0558E969; +sub.f64 fd222, fd221, fd219; +sub.f64 fd223, fd188, fd192; +mul.f64 fd224, fd223, 0d3FDBC4C04D71ABC1; +sub.f64 fd225, fd222, fd224; +sub.f64 fd226, fd217, fd225; +add.f64 fd227, fd225, fd217; +mul.f64 fd228, fd203, 0d3FCC7B90E3024582; +sub.f64 fd229, 
fd175, fd228; +mul.f64 fd230, fd205, 0d3FECD4BCA9CB5C71; +sub.f64 fd231, fd229, fd230; +fma.rn.f64 fd232, fd207, 0d3FE3F3A0E28BEDD1, fd231; +mul.f64 fd233, fd218, 0d3FEF329C0558E969; +mul.f64 fd234, fd220, 0d3FDBC4C04D71ABC1; +sub.f64 fd235, fd234, fd233; +fma.rn.f64 fd236, fd223, 0d3FE904C37505DE4B, fd235; +sub.f64 fd237, fd232, fd236; +add.f64 fd238, fd236, fd232; +mul.f64 fd239, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd240, fd175, fd239; +fma.rn.f64 fd241, fd205, 0d3FE3F3A0E28BEDD1, fd240; +mul.f64 fd242, fd207, 0d3FCC7B90E3024582; +sub.f64 fd243, fd241, fd242; +mul.f64 fd244, fd218, 0d3FDBC4C04D71ABC1; +mul.f64 fd245, fd220, 0d3FE904C37505DE4B; +sub.f64 fd246, fd245, fd244; +mul.f64 fd247, fd223, 0d3FEF329C0558E969; +sub.f64 fd248, fd246, fd247; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +fma.rn.f64 fd251, fd208, 0d3FE3F3A0E28BEDD1, fd176; +mul.f64 fd252, fd210, 0d3FCC7B90E3024582; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd212, 0d3FECD4BCA9CB5C71; +sub.f64 fd255, fd253, fd254; +sub.f64 fd256, fd179, fd199; +mul.f64 fd257, fd256, 0d3FE904C37505DE4B; +sub.f64 fd258, fd183, fd195; +mul.f64 fd259, fd258, 0dBFEF329C0558E969; +sub.f64 fd260, fd259, fd257; +sub.f64 fd261, fd187, fd191; +mul.f64 fd262, fd261, 0d3FDBC4C04D71ABC1; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd263, fd255; +sub.f64 fd265, fd255, fd263; +mul.f64 fd266, fd208, 0d3FCC7B90E3024582; +sub.f64 fd267, fd176, fd266; +mul.f64 fd268, fd210, 0d3FECD4BCA9CB5C71; +sub.f64 fd269, fd267, fd268; +fma.rn.f64 fd270, fd212, 0d3FE3F3A0E28BEDD1, fd269; +mul.f64 fd271, fd256, 0d3FEF329C0558E969; +mul.f64 fd272, fd258, 0d3FDBC4C04D71ABC1; +sub.f64 fd273, fd272, fd271; +fma.rn.f64 fd274, fd261, 0d3FE904C37505DE4B, fd273; +add.f64 fd275, fd274, fd270; +sub.f64 fd276, fd270, fd274; +mul.f64 fd277, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd278, fd176, fd277; +fma.rn.f64 fd279, fd210, 0d3FE3F3A0E28BEDD1, fd278; +mul.f64 fd280, fd212, 0d3FCC7B90E3024582; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, 
fd256, 0d3FDBC4C04D71ABC1; +mul.f64 fd283, fd258, 0d3FE904C37505DE4B; +sub.f64 fd284, fd283, fd282; +mul.f64 fd285, fd261, 0d3FEF329C0558E969; +sub.f64 fd286, fd284, fd285; +add.f64 fd287, fd286, fd281; +sub.f64 fd288, fd281, fd286; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd289, fd290}, [rd11]; +mul.f64 fd293, fd289, fd226; +mul.f64 fd294, fd290, fd264; +mul.f64 fd295, fd289, fd264; +mul.f64 fd296, fd289, fd289; +mul.f64 fd297, fd290, fd290; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd290, fd289; +fma.rn.f64 fd300, fd290, fd289, fd299; +mul.f64 fd301, fd298, fd237; +mul.f64 fd302, fd300, fd275; +mul.f64 fd303, fd298, fd275; +mul.f64 fd304, fd289, fd298; +mul.f64 fd305, fd290, fd300; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd289, fd300; +fma.rn.f64 fd308, fd290, fd298, fd307; +mul.f64 fd309, fd306, fd249; +mul.f64 fd310, fd308, fd287; +mul.f64 fd311, fd306, fd287; +ld.global.v2.f64 {fd312, fd313}, [rd11+112]; +mul.f64 fd316, fd312, fd250; +mul.f64 fd317, fd313, fd288; +mul.f64 fd318, fd312, fd288; +mul.f64 fd319, fd289, fd312; +mul.f64 fd320, fd290, fd313; +sub.f64 fd321, fd319, fd320; +mul.f64 fd322, fd289, fd313; +fma.rn.f64 fd323, fd290, fd312, fd322; +mul.f64 fd324, fd321, fd238; +mul.f64 fd325, fd323, fd276; +mul.f64 fd326, fd321, fd276; +mul.f64 fd327, fd289, fd321; +mul.f64 fd328, fd290, fd323; +sub.f64 fd329, fd327, fd328; +mul.f64 fd330, fd289, fd323; +fma.rn.f64 fd331, fd290, fd321, fd330; +mul.f64 fd332, fd329, fd227; +mul.f64 fd333, fd331, fd265; +mul.f64 fd334, fd329, fd265; +shl.b32 r18, r17, 4; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 784, r19; +add.f64 fd335, fd212, fd211; +add.f64 fd336, fd207, fd206; +st.shared.v2.f64 [r20], {fd336, fd335}; 
+fma.rn.f64 fd337, fd290, fd226, fd295; +sub.f64 fd338, fd293, fd294; +st.shared.v2.f64 [r20+112], {fd338, fd337}; +fma.rn.f64 fd339, fd300, fd237, fd303; +sub.f64 fd340, fd301, fd302; +st.shared.v2.f64 [r20+224], {fd340, fd339}; +fma.rn.f64 fd341, fd308, fd249, fd311; +sub.f64 fd342, fd309, fd310; +st.shared.v2.f64 [r20+336], {fd342, fd341}; +fma.rn.f64 fd343, fd313, fd250, fd318; +sub.f64 fd344, fd316, fd317; +st.shared.v2.f64 [r20+448], {fd344, fd343}; +fma.rn.f64 fd345, fd323, fd238, fd326; +sub.f64 fd346, fd324, fd325; +st.shared.v2.f64 [r20+560], {fd346, fd345}; +fma.rn.f64 fd347, fd331, fd227, fd334; +sub.f64 fd348, fd332, fd333; +st.shared.v2.f64 [r20+672], {fd348, fd347}; +barrier.sync 0; +ld.shared.v2.f64 {fd349, fd350}, [r10]; +ld.shared.v2.f64 {fd353, fd354}, [r10+784]; +ld.shared.v2.f64 {fd357, fd358}, [r10+1568]; +ld.shared.v2.f64 {fd361, fd362}, [r10+2352]; +ld.shared.v2.f64 {fd365, fd366}, [r10+3136]; +ld.shared.v2.f64 {fd369, fd370}, [r10+3920]; +ld.shared.v2.f64 {fd373, fd374}, [r10+4704]; +add.f64 fd377, fd353, fd373; +add.f64 fd378, fd349, fd377; +add.f64 fd379, fd357, fd369; +add.f64 fd380, fd379, fd378; +add.f64 fd381, fd361, fd365; +add.f64 fd382, fd354, fd374; +add.f64 fd383, fd350, fd382; +add.f64 fd384, fd358, fd370; +add.f64 fd385, fd384, fd383; +add.f64 fd386, fd362, fd366; +fma.rn.f64 fd387, fd377, 0d3FE3F3A0E28BEDD1, fd349; +mul.f64 fd388, fd379, 0d3FCC7B90E3024582; +sub.f64 fd389, fd387, fd388; +mul.f64 fd390, fd381, 0d3FECD4BCA9CB5C71; +sub.f64 fd391, fd389, fd390; +sub.f64 fd392, fd354, fd374; +mul.f64 fd393, fd392, 0d3FE904C37505DE4B; +sub.f64 fd394, fd358, fd370; +mul.f64 fd395, fd394, 0dBFEF329C0558E969; +sub.f64 fd396, fd395, fd393; +sub.f64 fd397, fd362, fd366; +mul.f64 fd398, fd397, 0d3FDBC4C04D71ABC1; +sub.f64 fd399, fd396, fd398; +mul.f64 fd400, fd377, 0d3FCC7B90E3024582; +sub.f64 fd401, fd349, fd400; +mul.f64 fd402, fd379, 0d3FECD4BCA9CB5C71; +sub.f64 fd403, fd401, fd402; +fma.rn.f64 fd404, fd381, 0d3FE3F3A0E28BEDD1, fd403; 
+mul.f64 fd405, fd392, 0d3FEF329C0558E969; +mul.f64 fd406, fd394, 0d3FDBC4C04D71ABC1; +sub.f64 fd407, fd406, fd405; +fma.rn.f64 fd408, fd397, 0d3FE904C37505DE4B, fd407; +mul.f64 fd409, fd377, 0d3FECD4BCA9CB5C71; +sub.f64 fd410, fd349, fd409; +fma.rn.f64 fd411, fd379, 0d3FE3F3A0E28BEDD1, fd410; +mul.f64 fd412, fd381, 0d3FCC7B90E3024582; +sub.f64 fd413, fd411, fd412; +mul.f64 fd414, fd392, 0d3FDBC4C04D71ABC1; +mul.f64 fd415, fd394, 0d3FE904C37505DE4B; +sub.f64 fd416, fd415, fd414; +mul.f64 fd417, fd397, 0d3FEF329C0558E969; +sub.f64 fd418, fd416, fd417; +fma.rn.f64 fd419, fd382, 0d3FE3F3A0E28BEDD1, fd350; +mul.f64 fd420, fd384, 0d3FCC7B90E3024582; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd386, 0d3FECD4BCA9CB5C71; +sub.f64 fd423, fd421, fd422; +sub.f64 fd424, fd353, fd373; +mul.f64 fd425, fd424, 0d3FE904C37505DE4B; +sub.f64 fd426, fd357, fd369; +mul.f64 fd427, fd426, 0dBFEF329C0558E969; +sub.f64 fd428, fd427, fd425; +sub.f64 fd429, fd361, fd365; +mul.f64 fd430, fd429, 0d3FDBC4C04D71ABC1; +sub.f64 fd431, fd428, fd430; +mul.f64 fd432, fd382, 0d3FCC7B90E3024582; +sub.f64 fd433, fd350, fd432; +mul.f64 fd434, fd384, 0d3FECD4BCA9CB5C71; +sub.f64 fd435, fd433, fd434; +fma.rn.f64 fd436, fd386, 0d3FE3F3A0E28BEDD1, fd435; +mul.f64 fd437, fd424, 0d3FEF329C0558E969; +mul.f64 fd438, fd426, 0d3FDBC4C04D71ABC1; +sub.f64 fd439, fd438, fd437; +fma.rn.f64 fd440, fd429, 0d3FE904C37505DE4B, fd439; +mul.f64 fd441, fd382, 0d3FECD4BCA9CB5C71; +sub.f64 fd442, fd350, fd441; +fma.rn.f64 fd443, fd384, 0d3FE3F3A0E28BEDD1, fd442; +mul.f64 fd444, fd386, 0d3FCC7B90E3024582; +sub.f64 fd445, fd443, fd444; +mul.f64 fd446, fd424, 0d3FDBC4C04D71ABC1; +mul.f64 fd447, fd426, 0d3FE904C37505DE4B; +sub.f64 fd448, fd447, fd446; +mul.f64 fd449, fd429, 0d3FEF329C0558E969; +sub.f64 fd450, fd448, fd449; +add.f64 %1, fd386, fd385; +add.f64 %0, fd381, fd380; +add.f64 %3, fd431, fd423; +sub.f64 %2, fd391, fd399; +add.f64 %5, fd440, fd436; +sub.f64 %4, fd404, fd408; +add.f64 %7, fd450, fd445; +sub.f64 %6, fd413, 
fd418; +sub.f64 %9, fd445, fd450; +add.f64 %8, fd418, fd413; +sub.f64 %11, fd436, fd440; +add.f64 %10, fd408, fd404; +sub.f64 %13, fd423, fd431; +add.f64 %12, fd399, fd391; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<550, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<437>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 2744, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %19, %33; +add.f64 fd30, %17, fd29; +add.f64 fd31, %22, %30; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %25, %27; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %21, %34; +add.f64 fd36, %18, fd35; +add.f64 fd37, %24, %32; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %26, %29; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %21, %34; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %24, %32; +mul.f64 fd49, fd48, 0dBFEF329C0558E969; +sub.f64 fd50, fd49, fd47; +sub.f64 fd51, %26, %29; +mul.f64 fd52, fd51, 0d3FDBC4C04D71ABC1; +sub.f64 fd53, fd50, fd52; +sub.f64 fd54, fd45, fd53; +add.f64 fd55, fd53, fd45; +mul.f64 fd56, fd29, 0d3FCC7B90E3024582; +sub.f64 fd57, %17, fd56; +mul.f64 fd58, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd59, 
fd57, fd58; +fma.rn.f64 fd60, fd33, 0d3FE3F3A0E28BEDD1, fd59; +mul.f64 fd61, fd46, 0d3FEF329C0558E969; +mul.f64 fd62, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd63, fd62, fd61; +fma.rn.f64 fd64, fd51, 0d3FE904C37505DE4B, fd63; +sub.f64 fd65, fd60, fd64; +add.f64 fd66, fd64, fd60; +mul.f64 fd67, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd68, %17, fd67; +fma.rn.f64 fd69, fd31, 0d3FE3F3A0E28BEDD1, fd68; +mul.f64 fd70, fd33, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd73, fd48, 0d3FE904C37505DE4B; +sub.f64 fd74, fd73, fd72; +mul.f64 fd75, fd51, 0d3FEF329C0558E969; +sub.f64 fd76, fd74, fd75; +sub.f64 fd77, fd71, fd76; +add.f64 fd78, fd76, fd71; +fma.rn.f64 fd79, fd35, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd80, fd37, 0d3FCC7B90E3024582; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd83, fd81, fd82; +sub.f64 fd84, %19, %33; +mul.f64 fd85, fd84, 0d3FE904C37505DE4B; +sub.f64 fd86, %22, %30; +mul.f64 fd87, fd86, 0dBFEF329C0558E969; +sub.f64 fd88, fd87, fd85; +sub.f64 fd89, %25, %27; +mul.f64 fd90, fd89, 0d3FDBC4C04D71ABC1; +sub.f64 fd91, fd88, fd90; +add.f64 fd92, fd91, fd83; +sub.f64 fd93, fd83, fd91; +mul.f64 fd94, fd35, 0d3FCC7B90E3024582; +sub.f64 fd95, %18, fd94; +mul.f64 fd96, fd37, 0d3FECD4BCA9CB5C71; +sub.f64 fd97, fd95, fd96; +fma.rn.f64 fd98, fd39, 0d3FE3F3A0E28BEDD1, fd97; +mul.f64 fd99, fd84, 0d3FEF329C0558E969; +mul.f64 fd100, fd86, 0d3FDBC4C04D71ABC1; +sub.f64 fd101, fd100, fd99; +fma.rn.f64 fd102, fd89, 0d3FE904C37505DE4B, fd101; +add.f64 fd103, fd102, fd98; +sub.f64 fd104, fd98, fd102; +mul.f64 fd105, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd106, %18, fd105; +fma.rn.f64 fd107, fd37, 0d3FE3F3A0E28BEDD1, fd106; +mul.f64 fd108, fd39, 0d3FCC7B90E3024582; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd84, 0d3FDBC4C04D71ABC1; +mul.f64 fd111, fd86, 0d3FE904C37505DE4B; +sub.f64 fd112, fd111, fd110; +mul.f64 fd113, fd89, 0d3FEF329C0558E969; +sub.f64 fd114, fd112, fd113; +add.f64 fd115, fd114, fd109; 
+sub.f64 fd116, fd109, fd114; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd117, fd118}, [rd6]; +mul.f64 fd121, fd117, fd54; +mul.f64 fd122, fd118, fd92; +sub.f64 fd123, fd121, fd122; +mul.f64 fd124, fd117, fd92; +fma.rn.f64 fd125, fd118, fd54, fd124; +mul.f64 fd126, fd117, fd117; +mul.f64 fd127, fd118, fd118; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd118, fd117; +fma.rn.f64 fd130, fd118, fd117, fd129; +mul.f64 fd131, fd128, fd65; +mul.f64 fd132, fd130, fd103; +sub.f64 fd133, fd131, fd132; +mul.f64 fd134, fd128, fd103; +fma.rn.f64 fd135, fd130, fd65, fd134; +mul.f64 fd136, fd117, fd128; +mul.f64 fd137, fd118, fd130; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd117, fd130; +fma.rn.f64 fd140, fd118, fd128, fd139; +mul.f64 fd141, fd138, fd77; +mul.f64 fd142, fd140, fd115; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd138, fd115; +fma.rn.f64 fd145, fd140, fd77, fd144; +ld.global.v2.f64 {fd146, fd147}, [rd6+784]; +mul.f64 fd150, fd146, fd78; +mul.f64 fd151, fd147, fd116; +sub.f64 fd152, fd150, fd151; +mul.f64 fd153, fd146, fd116; +fma.rn.f64 fd154, fd147, fd78, fd153; +mul.f64 fd155, fd117, fd146; +mul.f64 fd156, fd118, fd147; +sub.f64 fd157, fd155, fd156; +mul.f64 fd158, fd117, fd147; +fma.rn.f64 fd159, fd118, fd146, fd158; +mul.f64 fd160, fd157, fd66; +mul.f64 fd161, fd159, fd104; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd157, fd104; +fma.rn.f64 fd164, fd159, fd66, fd163; +mul.f64 fd165, fd117, fd157; +mul.f64 fd166, fd118, fd159; +sub.f64 fd167, fd165, fd166; +mul.f64 fd168, fd117, fd159; +fma.rn.f64 fd169, fd118, fd157, fd168; +mul.f64 fd170, fd167, fd55; +mul.f64 fd171, fd169, fd93; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd167, fd93; +fma.rn.f64 fd174, fd169, fd55, fd173; +mad.lo.s32 r8, r5, 2744, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +st.shared.f64 [r9], fd34; 
+st.shared.f64 [r9+8], fd123; +st.shared.f64 [r9+16], fd133; +st.shared.f64 [r9+24], fd143; +st.shared.f64 [r9+32], fd152; +st.shared.f64 [r9+40], fd162; +st.shared.f64 [r9+48], fd172; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.f64 fd175, [r10]; +ld.shared.f64 fd176, [r10+392]; +ld.shared.f64 fd177, [r10+784]; +ld.shared.f64 fd178, [r10+1176]; +ld.shared.f64 fd179, [r10+1568]; +ld.shared.f64 fd180, [r10+1960]; +ld.shared.f64 fd181, [r10+2352]; +barrier.sync 0; +st.shared.f64 [r9], fd40; +st.shared.f64 [r9+8], fd125; +st.shared.f64 [r9+16], fd135; +st.shared.f64 [r9+24], fd145; +st.shared.f64 [r9+32], fd154; +st.shared.f64 [r9+40], fd164; +st.shared.f64 [r9+48], fd174; +barrier.sync 0; +ld.shared.f64 fd182, [r10]; +ld.shared.f64 fd183, [r10+392]; +ld.shared.f64 fd184, [r10+784]; +ld.shared.f64 fd185, [r10+1176]; +ld.shared.f64 fd186, [r10+1568]; +ld.shared.f64 fd187, [r10+1960]; +ld.shared.f64 fd188, [r10+2352]; +add.f64 fd189, fd176, fd181; +add.f64 fd190, fd175, fd189; +add.f64 fd191, fd177, fd180; +add.f64 fd192, fd191, fd190; +add.f64 fd193, fd178, fd179; +add.f64 fd194, fd193, fd192; +add.f64 fd195, fd183, fd188; +add.f64 fd196, fd182, fd195; +add.f64 fd197, fd184, fd187; +add.f64 fd198, fd197, fd196; +add.f64 fd199, fd185, fd186; +add.f64 fd200, fd199, fd198; +fma.rn.f64 fd201, fd189, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd202, fd191, 0d3FCC7B90E3024582; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd193, 0d3FECD4BCA9CB5C71; +sub.f64 fd205, fd203, fd204; +sub.f64 fd206, fd183, fd188; +mul.f64 fd207, fd206, 0d3FE904C37505DE4B; +sub.f64 fd208, fd184, fd187; +mul.f64 fd209, fd208, 0dBFEF329C0558E969; +sub.f64 fd210, fd209, fd207; +sub.f64 fd211, fd185, fd186; +mul.f64 fd212, fd211, 0d3FDBC4C04D71ABC1; +sub.f64 fd213, fd210, fd212; +sub.f64 fd214, fd205, fd213; +add.f64 fd215, fd213, fd205; +mul.f64 fd216, fd189, 0d3FCC7B90E3024582; +sub.f64 fd217, fd175, fd216; +mul.f64 fd218, fd191, 0d3FECD4BCA9CB5C71; +sub.f64 fd219, fd217, fd218; +fma.rn.f64 
fd220, fd193, 0d3FE3F3A0E28BEDD1, fd219; +mul.f64 fd221, fd206, 0d3FEF329C0558E969; +mul.f64 fd222, fd208, 0d3FDBC4C04D71ABC1; +sub.f64 fd223, fd222, fd221; +fma.rn.f64 fd224, fd211, 0d3FE904C37505DE4B, fd223; +sub.f64 fd225, fd220, fd224; +add.f64 fd226, fd224, fd220; +mul.f64 fd227, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd228, fd175, fd227; +fma.rn.f64 fd229, fd191, 0d3FE3F3A0E28BEDD1, fd228; +mul.f64 fd230, fd193, 0d3FCC7B90E3024582; +sub.f64 fd231, fd229, fd230; +mul.f64 fd232, fd206, 0d3FDBC4C04D71ABC1; +mul.f64 fd233, fd208, 0d3FE904C37505DE4B; +sub.f64 fd234, fd233, fd232; +mul.f64 fd235, fd211, 0d3FEF329C0558E969; +sub.f64 fd236, fd234, fd235; +sub.f64 fd237, fd231, fd236; +add.f64 fd238, fd236, fd231; +fma.rn.f64 fd239, fd195, 0d3FE3F3A0E28BEDD1, fd182; +mul.f64 fd240, fd197, 0d3FCC7B90E3024582; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, fd176, fd181; +mul.f64 fd245, fd244, 0d3FE904C37505DE4B; +sub.f64 fd246, fd177, fd180; +mul.f64 fd247, fd246, 0dBFEF329C0558E969; +sub.f64 fd248, fd247, fd245; +sub.f64 fd249, fd178, fd179; +mul.f64 fd250, fd249, 0d3FDBC4C04D71ABC1; +sub.f64 fd251, fd248, fd250; +add.f64 fd252, fd251, fd243; +sub.f64 fd253, fd243, fd251; +mul.f64 fd254, fd195, 0d3FCC7B90E3024582; +sub.f64 fd255, fd182, fd254; +mul.f64 fd256, fd197, 0d3FECD4BCA9CB5C71; +sub.f64 fd257, fd255, fd256; +fma.rn.f64 fd258, fd199, 0d3FE3F3A0E28BEDD1, fd257; +mul.f64 fd259, fd244, 0d3FEF329C0558E969; +mul.f64 fd260, fd246, 0d3FDBC4C04D71ABC1; +sub.f64 fd261, fd260, fd259; +fma.rn.f64 fd262, fd249, 0d3FE904C37505DE4B, fd261; +add.f64 fd263, fd262, fd258; +sub.f64 fd264, fd258, fd262; +mul.f64 fd265, fd195, 0d3FECD4BCA9CB5C71; +sub.f64 fd266, fd182, fd265; +fma.rn.f64 fd267, fd197, 0d3FE3F3A0E28BEDD1, fd266; +mul.f64 fd268, fd199, 0d3FCC7B90E3024582; +sub.f64 fd269, fd267, fd268; +mul.f64 fd270, fd244, 0d3FDBC4C04D71ABC1; +mul.f64 fd271, fd246, 0d3FE904C37505DE4B; +sub.f64 fd272, fd271, fd270; 
+mul.f64 fd273, fd249, 0d3FEF329C0558E969; +sub.f64 fd274, fd272, fd273; +add.f64 fd275, fd274, fd269; +sub.f64 fd276, fd269, fd274; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd277, fd278}, [rd11]; +mul.f64 fd281, fd277, fd214; +mul.f64 fd282, fd278, fd252; +sub.f64 fd283, fd281, fd282; +mul.f64 fd284, fd277, fd252; +fma.rn.f64 fd285, fd278, fd214, fd284; +mul.f64 fd286, fd277, fd277; +mul.f64 fd287, fd278, fd278; +sub.f64 fd288, fd286, fd287; +mul.f64 fd289, fd278, fd277; +fma.rn.f64 fd290, fd278, fd277, fd289; +mul.f64 fd291, fd288, fd225; +mul.f64 fd292, fd290, fd263; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd288, fd263; +fma.rn.f64 fd295, fd290, fd225, fd294; +mul.f64 fd296, fd277, fd288; +mul.f64 fd297, fd278, fd290; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd277, fd290; +fma.rn.f64 fd300, fd278, fd288, fd299; +mul.f64 fd301, fd298, fd237; +mul.f64 fd302, fd300, fd275; +sub.f64 fd303, fd301, fd302; +mul.f64 fd304, fd298, fd275; +fma.rn.f64 fd305, fd300, fd237, fd304; +ld.global.v2.f64 {fd306, fd307}, [rd11+112]; +mul.f64 fd310, fd306, fd238; +mul.f64 fd311, fd307, fd276; +sub.f64 fd312, fd310, fd311; +mul.f64 fd313, fd306, fd276; +fma.rn.f64 fd314, fd307, fd238, fd313; +mul.f64 fd315, fd277, fd306; +mul.f64 fd316, fd278, fd307; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd277, fd307; +fma.rn.f64 fd319, fd278, fd306, fd318; +mul.f64 fd320, fd317, fd226; +mul.f64 fd321, fd319, fd264; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd317, fd264; +fma.rn.f64 fd324, fd319, fd226, fd323; +mul.f64 fd325, fd277, fd317; +mul.f64 fd326, fd278, fd319; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd277, fd319; +fma.rn.f64 fd329, fd278, fd317, fd328; +mul.f64 fd330, fd327, fd215; +mul.f64 fd331, 
fd329, fd253; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd327, fd253; +fma.rn.f64 fd334, fd329, fd215, fd333; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +st.shared.f64 [r20], fd194; +st.shared.f64 [r20+56], fd283; +st.shared.f64 [r20+112], fd293; +st.shared.f64 [r20+168], fd303; +st.shared.f64 [r20+224], fd312; +st.shared.f64 [r20+280], fd322; +st.shared.f64 [r20+336], fd332; +barrier.sync 0; +ld.shared.f64 fd335, [r10]; +ld.shared.f64 fd336, [r10+392]; +ld.shared.f64 fd337, [r10+784]; +ld.shared.f64 fd338, [r10+1176]; +ld.shared.f64 fd339, [r10+1568]; +ld.shared.f64 fd340, [r10+1960]; +ld.shared.f64 fd341, [r10+2352]; +barrier.sync 0; +st.shared.f64 [r20], fd200; +st.shared.f64 [r20+56], fd285; +st.shared.f64 [r20+112], fd295; +st.shared.f64 [r20+168], fd305; +st.shared.f64 [r20+224], fd314; +st.shared.f64 [r20+280], fd324; +st.shared.f64 [r20+336], fd334; +barrier.sync 0; +ld.shared.f64 fd342, [r10]; +ld.shared.f64 fd343, [r10+392]; +ld.shared.f64 fd344, [r10+784]; +ld.shared.f64 fd345, [r10+1176]; +ld.shared.f64 fd346, [r10+1568]; +ld.shared.f64 fd347, [r10+1960]; +ld.shared.f64 fd348, [r10+2352]; +add.f64 fd349, fd336, fd341; +add.f64 fd350, fd335, fd349; +add.f64 fd351, fd337, fd340; +add.f64 fd352, fd351, fd350; +add.f64 fd353, fd338, fd339; +add.f64 fd354, fd343, fd348; +add.f64 fd355, fd342, fd354; +add.f64 fd356, fd344, fd347; +add.f64 fd357, fd356, fd355; +add.f64 fd358, fd345, fd346; +fma.rn.f64 fd359, fd349, 0d3FE3F3A0E28BEDD1, fd335; +mul.f64 fd360, fd351, 0d3FCC7B90E3024582; +sub.f64 fd361, fd359, fd360; +mul.f64 fd362, fd353, 0d3FECD4BCA9CB5C71; +sub.f64 fd363, fd361, fd362; +sub.f64 fd364, fd343, fd348; +mul.f64 fd365, fd364, 0d3FE904C37505DE4B; +sub.f64 fd366, fd344, fd347; +mul.f64 fd367, fd366, 0dBFEF329C0558E969; +sub.f64 fd368, fd367, fd365; +sub.f64 fd369, fd345, fd346; +mul.f64 fd370, fd369, 0d3FDBC4C04D71ABC1; +sub.f64 fd371, fd368, fd370; +mul.f64 fd372, fd349, 0d3FCC7B90E3024582; 
+sub.f64 fd373, fd335, fd372; +mul.f64 fd374, fd351, 0d3FECD4BCA9CB5C71; +sub.f64 fd375, fd373, fd374; +fma.rn.f64 fd376, fd353, 0d3FE3F3A0E28BEDD1, fd375; +mul.f64 fd377, fd364, 0d3FEF329C0558E969; +mul.f64 fd378, fd366, 0d3FDBC4C04D71ABC1; +sub.f64 fd379, fd378, fd377; +fma.rn.f64 fd380, fd369, 0d3FE904C37505DE4B, fd379; +mul.f64 fd381, fd349, 0d3FECD4BCA9CB5C71; +sub.f64 fd382, fd335, fd381; +fma.rn.f64 fd383, fd351, 0d3FE3F3A0E28BEDD1, fd382; +mul.f64 fd384, fd353, 0d3FCC7B90E3024582; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd364, 0d3FDBC4C04D71ABC1; +mul.f64 fd387, fd366, 0d3FE904C37505DE4B; +sub.f64 fd388, fd387, fd386; +mul.f64 fd389, fd369, 0d3FEF329C0558E969; +sub.f64 fd390, fd388, fd389; +fma.rn.f64 fd391, fd354, 0d3FE3F3A0E28BEDD1, fd342; +mul.f64 fd392, fd356, 0d3FCC7B90E3024582; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd358, 0d3FECD4BCA9CB5C71; +sub.f64 fd395, fd393, fd394; +sub.f64 fd396, fd336, fd341; +mul.f64 fd397, fd396, 0d3FE904C37505DE4B; +sub.f64 fd398, fd337, fd340; +mul.f64 fd399, fd398, 0dBFEF329C0558E969; +sub.f64 fd400, fd399, fd397; +sub.f64 fd401, fd338, fd339; +mul.f64 fd402, fd401, 0d3FDBC4C04D71ABC1; +sub.f64 fd403, fd400, fd402; +mul.f64 fd404, fd354, 0d3FCC7B90E3024582; +sub.f64 fd405, fd342, fd404; +mul.f64 fd406, fd356, 0d3FECD4BCA9CB5C71; +sub.f64 fd407, fd405, fd406; +fma.rn.f64 fd408, fd358, 0d3FE3F3A0E28BEDD1, fd407; +mul.f64 fd409, fd396, 0d3FEF329C0558E969; +mul.f64 fd410, fd398, 0d3FDBC4C04D71ABC1; +sub.f64 fd411, fd410, fd409; +fma.rn.f64 fd412, fd401, 0d3FE904C37505DE4B, fd411; +mul.f64 fd413, fd354, 0d3FECD4BCA9CB5C71; +sub.f64 fd414, fd342, fd413; +fma.rn.f64 fd415, fd356, 0d3FE3F3A0E28BEDD1, fd414; +mul.f64 fd416, fd358, 0d3FCC7B90E3024582; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd396, 0d3FDBC4C04D71ABC1; +mul.f64 fd419, fd398, 0d3FE904C37505DE4B; +sub.f64 fd420, fd419, fd418; +mul.f64 fd421, fd401, 0d3FEF329C0558E969; +sub.f64 fd422, fd420, fd421; +add.f64 %0, fd353, fd352; +add.f64 %1, fd358, 
fd357; +add.f64 %3, fd403, fd395; +sub.f64 %2, fd363, fd371; +sub.f64 %4, fd376, fd380; +add.f64 %5, fd412, fd408; +sub.f64 %6, fd385, fd390; +add.f64 %7, fd422, fd417; +add.f64 %8, fd390, fd385; +sub.f64 %9, fd417, fd422; +add.f64 %10, fd380, fd376; +sub.f64 %11, fd408, fd412; +sub.f64 %13, fd395, fd403; +add.f64 %12, fd371, fd363; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..b6071da2e2923 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_343_fp64_inv.hpp.inc @@ -0,0 +1,898 @@ +#ifndef CUFFTDX_FFT_343_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_343_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<720, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<453>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 5488, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %19, %33; +add.f64 fd30, %17, fd29; +add.f64 fd31, %22, %30; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %25, %27; +add.f64 fd34, %21, %34; +add.f64 fd35, %18, fd34; 
+add.f64 fd36, %24, %32; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %26, %29; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %21, %34; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %24, %32; +fma.rn.f64 fd47, fd46, 0d3FEF329C0558E969, fd45; +sub.f64 fd48, %26, %29; +fma.rn.f64 fd49, fd48, 0d3FDBC4C04D71ABC1, fd47; +sub.f64 fd50, fd43, fd49; +add.f64 fd51, fd49, fd43; +mul.f64 fd52, fd29, 0d3FCC7B90E3024582; +sub.f64 fd53, %17, fd52; +mul.f64 fd54, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd55, fd53, fd54; +fma.rn.f64 fd56, fd33, 0d3FE3F3A0E28BEDD1, fd55; +mul.f64 fd57, fd44, 0d3FEF329C0558E969; +mul.f64 fd58, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd48, 0d3FE904C37505DE4B; +sub.f64 fd61, fd59, fd60; +sub.f64 fd62, fd56, fd61; +add.f64 fd63, fd61, fd56; +mul.f64 fd64, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd65, %17, fd64; +fma.rn.f64 fd66, fd31, 0d3FE3F3A0E28BEDD1, fd65; +mul.f64 fd67, fd33, 0d3FCC7B90E3024582; +sub.f64 fd68, fd66, fd67; +mul.f64 fd69, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd70, fd46, 0d3FE904C37505DE4B; +sub.f64 fd71, fd69, fd70; +fma.rn.f64 fd72, fd48, 0d3FEF329C0558E969, fd71; +sub.f64 fd73, fd68, fd72; +add.f64 fd74, fd72, fd68; +fma.rn.f64 fd75, fd34, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd76, fd36, 0d3FCC7B90E3024582; +sub.f64 fd77, fd75, fd76; +mul.f64 fd78, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd79, fd77, fd78; +sub.f64 fd80, %19, %33; +mul.f64 fd81, fd80, 0d3FE904C37505DE4B; +sub.f64 fd82, %22, %30; +fma.rn.f64 fd83, fd82, 0d3FEF329C0558E969, fd81; +sub.f64 fd84, %25, %27; +fma.rn.f64 fd85, fd84, 0d3FDBC4C04D71ABC1, fd83; +add.f64 fd86, fd85, fd79; +sub.f64 fd87, fd79, fd85; +mul.f64 fd88, fd34, 0d3FCC7B90E3024582; +sub.f64 fd89, %18, fd88; +mul.f64 fd90, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd91, fd89, fd90; +fma.rn.f64 fd92, fd38, 0d3FE3F3A0E28BEDD1, fd91; +mul.f64 
fd93, fd80, 0d3FEF329C0558E969; +mul.f64 fd94, fd82, 0d3FDBC4C04D71ABC1; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd84, 0d3FE904C37505DE4B; +sub.f64 fd97, fd95, fd96; +add.f64 fd98, fd97, fd92; +sub.f64 fd99, fd92, fd97; +mul.f64 fd100, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd101, %18, fd100; +fma.rn.f64 fd102, fd36, 0d3FE3F3A0E28BEDD1, fd101; +mul.f64 fd103, fd38, 0d3FCC7B90E3024582; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd80, 0d3FDBC4C04D71ABC1; +mul.f64 fd106, fd82, 0d3FE904C37505DE4B; +sub.f64 fd107, fd105, fd106; +fma.rn.f64 fd108, fd84, 0d3FEF329C0558E969, fd107; +add.f64 fd109, fd108, fd104; +sub.f64 fd110, fd104, fd108; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5488, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd111, fd112}, [rd6]; +mul.f64 fd115, fd86, fd112; +mul.f64 fd116, fd50, fd112; +mul.f64 fd117, fd111, fd86; +mul.f64 fd118, fd111, fd111; +mul.f64 fd119, fd112, fd112; +sub.f64 fd120, fd118, fd119; +mul.f64 fd121, fd112, fd111; +fma.rn.f64 fd122, fd112, fd111, fd121; +mul.f64 fd123, fd98, fd122; +mul.f64 fd124, fd62, fd122; +mul.f64 fd125, fd120, fd98; +mul.f64 fd126, fd111, fd120; +mul.f64 fd127, fd112, fd122; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd111, fd122; +fma.rn.f64 fd130, fd112, fd120, fd129; +mul.f64 fd131, fd109, fd130; +mul.f64 fd132, fd73, fd130; +mul.f64 fd133, fd128, fd109; +ld.global.v2.f64 {fd134, fd135}, [rd6+784]; +mul.f64 fd138, fd110, fd135; +mul.f64 fd139, fd74, fd135; +mul.f64 fd140, fd134, fd110; +mul.f64 fd141, fd111, fd134; +mul.f64 fd142, fd112, fd135; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd111, fd135; +fma.rn.f64 fd145, fd112, fd134, fd144; +mul.f64 fd146, fd99, fd145; +mul.f64 fd147, fd63, fd145; +mul.f64 fd148, fd143, fd99; +mul.f64 fd149, fd111, fd143; +mul.f64 fd150, fd112, fd145; +sub.f64 fd151, fd149, fd150; +mul.f64 fd152, fd111, fd145; 
+fma.rn.f64 fd153, fd112, fd143, fd152; +mul.f64 fd154, fd87, fd153; +mul.f64 fd155, fd51, fd153; +mul.f64 fd156, fd151, fd87; +barrier.sync 0; +mad.lo.s32 r9, r7, 112, r8; +add.f64 fd157, fd38, fd37; +add.f64 fd158, fd33, fd32; +st.shared.v2.f64 [r9], {fd158, fd157}; +fma.rn.f64 fd159, fd111, fd50, fd115; +sub.f64 fd160, fd117, fd116; +st.shared.v2.f64 [r9+16], {fd159, fd160}; +fma.rn.f64 fd161, fd120, fd62, fd123; +sub.f64 fd162, fd125, fd124; +st.shared.v2.f64 [r9+32], {fd161, fd162}; +sub.f64 fd163, fd133, fd132; +fma.rn.f64 fd164, fd128, fd73, fd131; +st.shared.v2.f64 [r9+48], {fd164, fd163}; +fma.rn.f64 fd165, fd134, fd74, fd138; +sub.f64 fd166, fd140, fd139; +st.shared.v2.f64 [r9+64], {fd165, fd166}; +fma.rn.f64 fd167, fd143, fd63, fd146; +sub.f64 fd168, fd148, fd147; +st.shared.v2.f64 [r9+80], {fd167, fd168}; +sub.f64 fd169, fd156, fd155; +fma.rn.f64 fd170, fd151, fd51, fd154; +st.shared.v2.f64 [r9+96], {fd170, fd169}; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.v2.f64 {fd171, fd172}, [r10]; +ld.shared.v2.f64 {fd175, fd176}, [r10+784]; +ld.shared.v2.f64 {fd179, fd180}, [r10+1568]; +ld.shared.v2.f64 {fd183, fd184}, [r10+2352]; +ld.shared.v2.f64 {fd187, fd188}, [r10+3136]; +ld.shared.v2.f64 {fd191, fd192}, [r10+3920]; +ld.shared.v2.f64 {fd195, fd196}, [r10+4704]; +add.f64 fd199, fd175, fd195; +add.f64 fd200, fd171, fd199; +add.f64 fd201, fd179, fd191; +add.f64 fd202, fd201, fd200; +add.f64 fd203, fd183, fd187; +add.f64 fd204, fd176, fd196; +add.f64 fd205, fd172, fd204; +add.f64 fd206, fd180, fd192; +add.f64 fd207, fd206, fd205; +add.f64 fd208, fd184, fd188; +fma.rn.f64 fd209, fd199, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd210, fd201, 0d3FCC7B90E3024582; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +sub.f64 fd214, fd176, fd196; +mul.f64 fd215, fd214, 0d3FE904C37505DE4B; +sub.f64 fd216, fd180, fd192; +fma.rn.f64 fd217, fd216, 0d3FEF329C0558E969, fd215; +sub.f64 fd218, fd184, fd188; 
+fma.rn.f64 fd219, fd218, 0d3FDBC4C04D71ABC1, fd217; +sub.f64 fd220, fd213, fd219; +add.f64 fd221, fd219, fd213; +mul.f64 fd222, fd199, 0d3FCC7B90E3024582; +sub.f64 fd223, fd171, fd222; +mul.f64 fd224, fd201, 0d3FECD4BCA9CB5C71; +sub.f64 fd225, fd223, fd224; +fma.rn.f64 fd226, fd203, 0d3FE3F3A0E28BEDD1, fd225; +mul.f64 fd227, fd214, 0d3FEF329C0558E969; +mul.f64 fd228, fd216, 0d3FDBC4C04D71ABC1; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd218, 0d3FE904C37505DE4B; +sub.f64 fd231, fd229, fd230; +sub.f64 fd232, fd226, fd231; +add.f64 fd233, fd231, fd226; +mul.f64 fd234, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd235, fd171, fd234; +fma.rn.f64 fd236, fd201, 0d3FE3F3A0E28BEDD1, fd235; +mul.f64 fd237, fd203, 0d3FCC7B90E3024582; +sub.f64 fd238, fd236, fd237; +mul.f64 fd239, fd214, 0d3FDBC4C04D71ABC1; +mul.f64 fd240, fd216, 0d3FE904C37505DE4B; +sub.f64 fd241, fd239, fd240; +fma.rn.f64 fd242, fd218, 0d3FEF329C0558E969, fd241; +sub.f64 fd243, fd238, fd242; +add.f64 fd244, fd242, fd238; +fma.rn.f64 fd245, fd204, 0d3FE3F3A0E28BEDD1, fd172; +mul.f64 fd246, fd206, 0d3FCC7B90E3024582; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd175, fd195; +mul.f64 fd251, fd250, 0d3FE904C37505DE4B; +sub.f64 fd252, fd179, fd191; +fma.rn.f64 fd253, fd252, 0d3FEF329C0558E969, fd251; +sub.f64 fd254, fd183, fd187; +fma.rn.f64 fd255, fd254, 0d3FDBC4C04D71ABC1, fd253; +add.f64 fd256, fd255, fd249; +sub.f64 fd257, fd249, fd255; +mul.f64 fd258, fd204, 0d3FCC7B90E3024582; +sub.f64 fd259, fd172, fd258; +mul.f64 fd260, fd206, 0d3FECD4BCA9CB5C71; +sub.f64 fd261, fd259, fd260; +fma.rn.f64 fd262, fd208, 0d3FE3F3A0E28BEDD1, fd261; +mul.f64 fd263, fd250, 0d3FEF329C0558E969; +mul.f64 fd264, fd252, 0d3FDBC4C04D71ABC1; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd254, 0d3FE904C37505DE4B; +sub.f64 fd267, fd265, fd266; +add.f64 fd268, fd267, fd262; +sub.f64 fd269, fd262, fd267; +mul.f64 fd270, fd204, 0d3FECD4BCA9CB5C71; +sub.f64 fd271, 
fd172, fd270; +fma.rn.f64 fd272, fd206, 0d3FE3F3A0E28BEDD1, fd271; +mul.f64 fd273, fd208, 0d3FCC7B90E3024582; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd250, 0d3FDBC4C04D71ABC1; +mul.f64 fd276, fd252, 0d3FE904C37505DE4B; +sub.f64 fd277, fd275, fd276; +fma.rn.f64 fd278, fd254, 0d3FEF329C0558E969, fd277; +add.f64 fd279, fd278, fd274; +sub.f64 fd280, fd274, fd278; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd281, fd282}, [rd11]; +mul.f64 fd285, fd256, fd282; +mul.f64 fd286, fd220, fd282; +mul.f64 fd287, fd281, fd256; +mul.f64 fd288, fd281, fd281; +mul.f64 fd289, fd282, fd282; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd282, fd281; +fma.rn.f64 fd292, fd282, fd281, fd291; +mul.f64 fd293, fd268, fd292; +mul.f64 fd294, fd232, fd292; +mul.f64 fd295, fd290, fd268; +mul.f64 fd296, fd281, fd290; +mul.f64 fd297, fd282, fd292; +sub.f64 fd298, fd296, fd297; +mul.f64 fd299, fd281, fd292; +fma.rn.f64 fd300, fd282, fd290, fd299; +mul.f64 fd301, fd279, fd300; +mul.f64 fd302, fd243, fd300; +mul.f64 fd303, fd298, fd279; +ld.global.v2.f64 {fd304, fd305}, [rd11+112]; +mul.f64 fd308, fd280, fd305; +mul.f64 fd309, fd244, fd305; +mul.f64 fd310, fd304, fd280; +mul.f64 fd311, fd281, fd304; +mul.f64 fd312, fd282, fd305; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd281, fd305; +fma.rn.f64 fd315, fd282, fd304, fd314; +mul.f64 fd316, fd269, fd315; +mul.f64 fd317, fd233, fd315; +mul.f64 fd318, fd313, fd269; +mul.f64 fd319, fd281, fd313; +mul.f64 fd320, fd282, fd315; +sub.f64 fd321, fd319, fd320; +mul.f64 fd322, fd281, fd315; +fma.rn.f64 fd323, fd282, fd313, fd322; +mul.f64 fd324, fd257, fd323; +mul.f64 fd325, fd221, fd323; +mul.f64 fd326, fd321, fd257; +shl.b32 r18, r17, 4; +add.s32 r19, r8, r18; +barrier.sync 0; 
+mad.lo.s32 r20, r15, 784, r19; +add.f64 fd327, fd208, fd207; +add.f64 fd328, fd203, fd202; +st.shared.v2.f64 [r20], {fd328, fd327}; +fma.rn.f64 fd329, fd281, fd220, fd285; +sub.f64 fd330, fd287, fd286; +st.shared.v2.f64 [r20+112], {fd329, fd330}; +fma.rn.f64 fd331, fd290, fd232, fd293; +sub.f64 fd332, fd295, fd294; +st.shared.v2.f64 [r20+224], {fd331, fd332}; +fma.rn.f64 fd333, fd298, fd243, fd301; +sub.f64 fd334, fd303, fd302; +st.shared.v2.f64 [r20+336], {fd333, fd334}; +fma.rn.f64 fd335, fd304, fd244, fd308; +sub.f64 fd336, fd310, fd309; +st.shared.v2.f64 [r20+448], {fd335, fd336}; +fma.rn.f64 fd337, fd313, fd233, fd316; +sub.f64 fd338, fd318, fd317; +st.shared.v2.f64 [r20+560], {fd337, fd338}; +fma.rn.f64 fd339, fd321, fd221, fd324; +sub.f64 fd340, fd326, fd325; +st.shared.v2.f64 [r20+672], {fd339, fd340}; +barrier.sync 0; +ld.shared.v2.f64 {fd341, fd342}, [r10]; +ld.shared.v2.f64 {fd345, fd346}, [r10+784]; +ld.shared.v2.f64 {fd349, fd350}, [r10+1568]; +ld.shared.v2.f64 {fd353, fd354}, [r10+2352]; +ld.shared.v2.f64 {fd357, fd358}, [r10+3136]; +ld.shared.v2.f64 {fd361, fd362}, [r10+3920]; +ld.shared.v2.f64 {fd365, fd366}, [r10+4704]; +add.f64 fd369, fd345, fd365; +add.f64 fd370, fd341, fd369; +add.f64 fd371, fd349, fd361; +add.f64 fd372, fd371, fd370; +add.f64 fd373, fd353, fd357; +add.f64 fd374, fd346, fd366; +add.f64 fd375, fd342, fd374; +add.f64 fd376, fd350, fd362; +add.f64 fd377, fd376, fd375; +add.f64 fd378, fd354, fd358; +fma.rn.f64 fd379, fd369, 0d3FE3F3A0E28BEDD1, fd341; +mul.f64 fd380, fd371, 0d3FCC7B90E3024582; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd373, 0d3FECD4BCA9CB5C71; +sub.f64 fd383, fd381, fd382; +sub.f64 fd384, fd346, fd366; +mul.f64 fd385, fd384, 0d3FE904C37505DE4B; +sub.f64 fd386, fd350, fd362; +fma.rn.f64 fd387, fd386, 0d3FEF329C0558E969, fd385; +sub.f64 fd388, fd354, fd358; +fma.rn.f64 fd389, fd388, 0d3FDBC4C04D71ABC1, fd387; +mul.f64 fd390, fd369, 0d3FCC7B90E3024582; +sub.f64 fd391, fd341, fd390; +mul.f64 fd392, fd371, 
0d3FECD4BCA9CB5C71; +sub.f64 fd393, fd391, fd392; +fma.rn.f64 fd394, fd373, 0d3FE3F3A0E28BEDD1, fd393; +mul.f64 fd395, fd384, 0d3FEF329C0558E969; +mul.f64 fd396, fd386, 0d3FDBC4C04D71ABC1; +sub.f64 fd397, fd395, fd396; +mul.f64 fd398, fd388, 0d3FE904C37505DE4B; +sub.f64 fd399, fd397, fd398; +mul.f64 fd400, fd369, 0d3FECD4BCA9CB5C71; +sub.f64 fd401, fd341, fd400; +fma.rn.f64 fd402, fd371, 0d3FE3F3A0E28BEDD1, fd401; +mul.f64 fd403, fd373, 0d3FCC7B90E3024582; +sub.f64 fd404, fd402, fd403; +mul.f64 fd405, fd384, 0d3FDBC4C04D71ABC1; +mul.f64 fd406, fd386, 0d3FE904C37505DE4B; +sub.f64 fd407, fd405, fd406; +fma.rn.f64 fd408, fd388, 0d3FEF329C0558E969, fd407; +fma.rn.f64 fd409, fd374, 0d3FE3F3A0E28BEDD1, fd342; +mul.f64 fd410, fd376, 0d3FCC7B90E3024582; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd378, 0d3FECD4BCA9CB5C71; +sub.f64 fd413, fd411, fd412; +sub.f64 fd414, fd345, fd365; +mul.f64 fd415, fd414, 0d3FE904C37505DE4B; +sub.f64 fd416, fd349, fd361; +fma.rn.f64 fd417, fd416, 0d3FEF329C0558E969, fd415; +sub.f64 fd418, fd353, fd357; +fma.rn.f64 fd419, fd418, 0d3FDBC4C04D71ABC1, fd417; +mul.f64 fd420, fd374, 0d3FCC7B90E3024582; +sub.f64 fd421, fd342, fd420; +mul.f64 fd422, fd376, 0d3FECD4BCA9CB5C71; +sub.f64 fd423, fd421, fd422; +fma.rn.f64 fd424, fd378, 0d3FE3F3A0E28BEDD1, fd423; +mul.f64 fd425, fd414, 0d3FEF329C0558E969; +mul.f64 fd426, fd416, 0d3FDBC4C04D71ABC1; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd418, 0d3FE904C37505DE4B; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd374, 0d3FECD4BCA9CB5C71; +sub.f64 fd431, fd342, fd430; +fma.rn.f64 fd432, fd376, 0d3FE3F3A0E28BEDD1, fd431; +mul.f64 fd433, fd378, 0d3FCC7B90E3024582; +sub.f64 fd434, fd432, fd433; +mul.f64 fd435, fd414, 0d3FDBC4C04D71ABC1; +mul.f64 fd436, fd416, 0d3FE904C37505DE4B; +sub.f64 fd437, fd435, fd436; +fma.rn.f64 fd438, fd418, 0d3FEF329C0558E969, fd437; +add.f64 %1, fd378, fd377; +add.f64 %0, fd373, fd372; +add.f64 %3, fd419, fd413; +sub.f64 %2, fd383, fd389; +add.f64 %5, fd429, fd424; +sub.f64 
%4, fd394, fd399; +add.f64 %7, fd438, fd434; +sub.f64 %6, fd404, fd408; +sub.f64 %9, fd434, fd438; +add.f64 %8, fd408, fd404; +sub.f64 %11, fd424, fd429; +add.f64 %10, fd399, fd394; +sub.f64 %13, fd413, fd419; +add.f64 %12, fd389, fd383; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<721, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<425>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 2744, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %19, %33; +add.f64 fd30, %17, fd29; +add.f64 fd31, %22, %30; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %25, %27; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %21, %34; +add.f64 fd36, %18, fd35; +add.f64 fd37, %24, %32; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %26, %29; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %21, %34; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %24, %32; +fma.rn.f64 fd49, fd48, 0d3FEF329C0558E969, fd47; +sub.f64 fd50, %26, %29; +fma.rn.f64 fd51, fd50, 0d3FDBC4C04D71ABC1, fd49; +sub.f64 fd52, fd45, fd51; +add.f64 fd53, fd51, fd45; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %17, fd54; +mul.f64 fd56, fd31, 
0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd46, 0d3FEF329C0558E969; +mul.f64 fd60, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd59, fd60; +mul.f64 fd62, fd50, 0d3FE904C37505DE4B; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd58, fd63; +add.f64 fd65, fd63, fd58; +mul.f64 fd66, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd67, %17, fd66; +fma.rn.f64 fd68, fd31, 0d3FE3F3A0E28BEDD1, fd67; +mul.f64 fd69, fd33, 0d3FCC7B90E3024582; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd72, fd48, 0d3FE904C37505DE4B; +sub.f64 fd73, fd71, fd72; +fma.rn.f64 fd74, fd50, 0d3FEF329C0558E969, fd73; +sub.f64 fd75, fd70, fd74; +add.f64 fd76, fd74, fd70; +fma.rn.f64 fd77, fd35, 0d3FE3F3A0E28BEDD1, %18; +mul.f64 fd78, fd37, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %19, %33; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %22, %30; +fma.rn.f64 fd85, fd84, 0d3FEF329C0558E969, fd83; +sub.f64 fd86, %25, %27; +fma.rn.f64 fd87, fd86, 0d3FDBC4C04D71ABC1, fd85; +add.f64 fd88, fd87, fd81; +sub.f64 fd89, fd81, fd87; +mul.f64 fd90, fd35, 0d3FCC7B90E3024582; +sub.f64 fd91, %18, fd90; +mul.f64 fd92, fd37, 0d3FECD4BCA9CB5C71; +sub.f64 fd93, fd91, fd92; +fma.rn.f64 fd94, fd39, 0d3FE3F3A0E28BEDD1, fd93; +mul.f64 fd95, fd82, 0d3FEF329C0558E969; +mul.f64 fd96, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd86, 0d3FE904C37505DE4B; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd99, fd94; +sub.f64 fd101, fd94, fd99; +mul.f64 fd102, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd103, %18, fd102; +fma.rn.f64 fd104, fd37, 0d3FE3F3A0E28BEDD1, fd103; +mul.f64 fd105, fd39, 0d3FCC7B90E3024582; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd108, fd84, 0d3FE904C37505DE4B; +sub.f64 fd109, fd107, fd108; +fma.rn.f64 fd110, fd86, 0d3FEF329C0558E969, fd109; +add.f64 fd111, fd110, fd106; +sub.f64 
fd112, fd106, fd110; +mul.wide.u32 rd2, r4, 1402438301; +shr.u64 rd3, rd2, 36; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 49; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd113, fd114}, [rd6]; +mul.f64 fd117, fd88, fd114; +fma.rn.f64 fd118, fd113, fd52, fd117; +mul.f64 fd119, fd52, fd114; +mul.f64 fd120, fd113, fd88; +sub.f64 fd121, fd120, fd119; +mul.f64 fd122, fd113, fd113; +mul.f64 fd123, fd114, fd114; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd114, fd113; +fma.rn.f64 fd126, fd114, fd113, fd125; +mul.f64 fd127, fd100, fd126; +fma.rn.f64 fd128, fd124, fd64, fd127; +mul.f64 fd129, fd64, fd126; +mul.f64 fd130, fd124, fd100; +sub.f64 fd131, fd130, fd129; +mul.f64 fd132, fd113, fd124; +mul.f64 fd133, fd114, fd126; +sub.f64 fd134, fd132, fd133; +mul.f64 fd135, fd113, fd126; +fma.rn.f64 fd136, fd114, fd124, fd135; +mul.f64 fd137, fd111, fd136; +fma.rn.f64 fd138, fd134, fd75, fd137; +mul.f64 fd139, fd75, fd136; +mul.f64 fd140, fd134, fd111; +sub.f64 fd141, fd140, fd139; +ld.global.v2.f64 {fd142, fd143}, [rd6+784]; +mul.f64 fd146, fd112, fd143; +fma.rn.f64 fd147, fd142, fd76, fd146; +mul.f64 fd148, fd76, fd143; +mul.f64 fd149, fd142, fd112; +sub.f64 fd150, fd149, fd148; +mul.f64 fd151, fd113, fd142; +mul.f64 fd152, fd114, fd143; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd113, fd143; +fma.rn.f64 fd155, fd114, fd142, fd154; +mul.f64 fd156, fd101, fd155; +fma.rn.f64 fd157, fd153, fd65, fd156; +mul.f64 fd158, fd65, fd155; +mul.f64 fd159, fd153, fd101; +sub.f64 fd160, fd159, fd158; +mul.f64 fd161, fd113, fd153; +mul.f64 fd162, fd114, fd155; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd113, fd155; +fma.rn.f64 fd165, fd114, fd153, fd164; +mul.f64 fd166, fd89, fd165; +fma.rn.f64 fd167, fd163, fd53, fd166; +mul.f64 fd168, fd53, fd165; +mul.f64 fd169, fd163, fd89; +sub.f64 fd170, fd169, fd168; +mad.lo.s32 r8, r5, 2744, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 56, r8; +st.shared.f64 [r9], fd34; 
+st.shared.f64 [r9+8], fd118; +st.shared.f64 [r9+16], fd128; +st.shared.f64 [r9+24], fd138; +st.shared.f64 [r9+32], fd147; +st.shared.f64 [r9+40], fd157; +st.shared.f64 [r9+48], fd167; +barrier.sync 0; +mad.lo.s32 r10, r7, -48, r9; +ld.shared.f64 fd171, [r10]; +ld.shared.f64 fd172, [r10+392]; +ld.shared.f64 fd173, [r10+784]; +ld.shared.f64 fd174, [r10+1176]; +ld.shared.f64 fd175, [r10+1568]; +ld.shared.f64 fd176, [r10+1960]; +ld.shared.f64 fd177, [r10+2352]; +barrier.sync 0; +st.shared.f64 [r9], fd40; +st.shared.f64 [r9+8], fd121; +st.shared.f64 [r9+16], fd131; +st.shared.f64 [r9+24], fd141; +st.shared.f64 [r9+32], fd150; +st.shared.f64 [r9+40], fd160; +st.shared.f64 [r9+48], fd170; +barrier.sync 0; +ld.shared.f64 fd178, [r10]; +ld.shared.f64 fd179, [r10+392]; +ld.shared.f64 fd180, [r10+784]; +ld.shared.f64 fd181, [r10+1176]; +ld.shared.f64 fd182, [r10+1568]; +ld.shared.f64 fd183, [r10+1960]; +ld.shared.f64 fd184, [r10+2352]; +add.f64 fd185, fd172, fd177; +add.f64 fd186, fd171, fd185; +add.f64 fd187, fd173, fd176; +add.f64 fd188, fd187, fd186; +add.f64 fd189, fd174, fd175; +add.f64 fd190, fd189, fd188; +add.f64 fd191, fd179, fd184; +add.f64 fd192, fd178, fd191; +add.f64 fd193, fd180, fd183; +add.f64 fd194, fd193, fd192; +add.f64 fd195, fd181, fd182; +add.f64 fd196, fd195, fd194; +fma.rn.f64 fd197, fd185, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd198, fd187, 0d3FCC7B90E3024582; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd201, fd199, fd200; +sub.f64 fd202, fd179, fd184; +mul.f64 fd203, fd202, 0d3FE904C37505DE4B; +sub.f64 fd204, fd180, fd183; +fma.rn.f64 fd205, fd204, 0d3FEF329C0558E969, fd203; +sub.f64 fd206, fd181, fd182; +fma.rn.f64 fd207, fd206, 0d3FDBC4C04D71ABC1, fd205; +sub.f64 fd208, fd201, fd207; +add.f64 fd209, fd207, fd201; +mul.f64 fd210, fd185, 0d3FCC7B90E3024582; +sub.f64 fd211, fd171, fd210; +mul.f64 fd212, fd187, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +fma.rn.f64 fd214, fd189, 0d3FE3F3A0E28BEDD1, fd213; 
+mul.f64 fd215, fd202, 0d3FEF329C0558E969; +mul.f64 fd216, fd204, 0d3FDBC4C04D71ABC1; +sub.f64 fd217, fd215, fd216; +mul.f64 fd218, fd206, 0d3FE904C37505DE4B; +sub.f64 fd219, fd217, fd218; +sub.f64 fd220, fd214, fd219; +add.f64 fd221, fd219, fd214; +mul.f64 fd222, fd185, 0d3FECD4BCA9CB5C71; +sub.f64 fd223, fd171, fd222; +fma.rn.f64 fd224, fd187, 0d3FE3F3A0E28BEDD1, fd223; +mul.f64 fd225, fd189, 0d3FCC7B90E3024582; +sub.f64 fd226, fd224, fd225; +mul.f64 fd227, fd202, 0d3FDBC4C04D71ABC1; +mul.f64 fd228, fd204, 0d3FE904C37505DE4B; +sub.f64 fd229, fd227, fd228; +fma.rn.f64 fd230, fd206, 0d3FEF329C0558E969, fd229; +sub.f64 fd231, fd226, fd230; +add.f64 fd232, fd230, fd226; +fma.rn.f64 fd233, fd191, 0d3FE3F3A0E28BEDD1, fd178; +mul.f64 fd234, fd193, 0d3FCC7B90E3024582; +sub.f64 fd235, fd233, fd234; +mul.f64 fd236, fd195, 0d3FECD4BCA9CB5C71; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, fd172, fd177; +mul.f64 fd239, fd238, 0d3FE904C37505DE4B; +sub.f64 fd240, fd173, fd176; +fma.rn.f64 fd241, fd240, 0d3FEF329C0558E969, fd239; +sub.f64 fd242, fd174, fd175; +fma.rn.f64 fd243, fd242, 0d3FDBC4C04D71ABC1, fd241; +add.f64 fd244, fd243, fd237; +sub.f64 fd245, fd237, fd243; +mul.f64 fd246, fd191, 0d3FCC7B90E3024582; +sub.f64 fd247, fd178, fd246; +mul.f64 fd248, fd193, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +fma.rn.f64 fd250, fd195, 0d3FE3F3A0E28BEDD1, fd249; +mul.f64 fd251, fd238, 0d3FEF329C0558E969; +mul.f64 fd252, fd240, 0d3FDBC4C04D71ABC1; +sub.f64 fd253, fd251, fd252; +mul.f64 fd254, fd242, 0d3FE904C37505DE4B; +sub.f64 fd255, fd253, fd254; +add.f64 fd256, fd255, fd250; +sub.f64 fd257, fd250, fd255; +mul.f64 fd258, fd191, 0d3FECD4BCA9CB5C71; +sub.f64 fd259, fd178, fd258; +fma.rn.f64 fd260, fd193, 0d3FE3F3A0E28BEDD1, fd259; +mul.f64 fd261, fd195, 0d3FCC7B90E3024582; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd238, 0d3FDBC4C04D71ABC1; +mul.f64 fd264, fd240, 0d3FE904C37505DE4B; +sub.f64 fd265, fd263, fd264; +fma.rn.f64 fd266, fd242, 0d3FEF329C0558E969, fd265; +add.f64 
fd267, fd266, fd262; +sub.f64 fd268, fd262, fd266; +mul.wide.u32 rd7, r7, 613566757; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 2; +mul.lo.s32 r16, r15, 7; +sub.s32 r17, r7, r16; +mul.wide.u32 rd9, r15, 16; +mov.u64 rd10, %16; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd269, fd270}, [rd11]; +mul.f64 fd273, fd244, fd270; +fma.rn.f64 fd274, fd269, fd208, fd273; +mul.f64 fd275, fd208, fd270; +mul.f64 fd276, fd269, fd244; +sub.f64 fd277, fd276, fd275; +mul.f64 fd278, fd269, fd269; +mul.f64 fd279, fd270, fd270; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd270, fd269; +fma.rn.f64 fd282, fd270, fd269, fd281; +mul.f64 fd283, fd256, fd282; +fma.rn.f64 fd284, fd280, fd220, fd283; +mul.f64 fd285, fd220, fd282; +mul.f64 fd286, fd280, fd256; +sub.f64 fd287, fd286, fd285; +mul.f64 fd288, fd269, fd280; +mul.f64 fd289, fd270, fd282; +sub.f64 fd290, fd288, fd289; +mul.f64 fd291, fd269, fd282; +fma.rn.f64 fd292, fd270, fd280, fd291; +mul.f64 fd293, fd267, fd292; +fma.rn.f64 fd294, fd290, fd231, fd293; +mul.f64 fd295, fd231, fd292; +mul.f64 fd296, fd290, fd267; +sub.f64 fd297, fd296, fd295; +ld.global.v2.f64 {fd298, fd299}, [rd11+112]; +mul.f64 fd302, fd268, fd299; +fma.rn.f64 fd303, fd298, fd232, fd302; +mul.f64 fd304, fd232, fd299; +mul.f64 fd305, fd298, fd268; +sub.f64 fd306, fd305, fd304; +mul.f64 fd307, fd269, fd298; +mul.f64 fd308, fd270, fd299; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd269, fd299; +fma.rn.f64 fd311, fd270, fd298, fd310; +mul.f64 fd312, fd257, fd311; +fma.rn.f64 fd313, fd309, fd221, fd312; +mul.f64 fd314, fd221, fd311; +mul.f64 fd315, fd309, fd257; +sub.f64 fd316, fd315, fd314; +mul.f64 fd317, fd269, fd309; +mul.f64 fd318, fd270, fd311; +sub.f64 fd319, fd317, fd318; +mul.f64 fd320, fd269, fd311; +fma.rn.f64 fd321, fd270, fd309, fd320; +mul.f64 fd322, fd245, fd321; +fma.rn.f64 fd323, fd319, fd209, fd322; +mul.f64 fd324, fd209, fd321; +mul.f64 fd325, fd319, 
fd245; +sub.f64 fd326, fd325, fd324; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 392, r19; +st.shared.f64 [r20], fd190; +st.shared.f64 [r20+56], fd274; +st.shared.f64 [r20+112], fd284; +st.shared.f64 [r20+168], fd294; +st.shared.f64 [r20+224], fd303; +st.shared.f64 [r20+280], fd313; +st.shared.f64 [r20+336], fd323; +barrier.sync 0; +ld.shared.f64 fd327, [r10]; +ld.shared.f64 fd328, [r10+392]; +ld.shared.f64 fd329, [r10+784]; +ld.shared.f64 fd330, [r10+1176]; +ld.shared.f64 fd331, [r10+1568]; +ld.shared.f64 fd332, [r10+1960]; +ld.shared.f64 fd333, [r10+2352]; +barrier.sync 0; +st.shared.f64 [r20], fd196; +st.shared.f64 [r20+56], fd277; +st.shared.f64 [r20+112], fd287; +st.shared.f64 [r20+168], fd297; +st.shared.f64 [r20+224], fd306; +st.shared.f64 [r20+280], fd316; +st.shared.f64 [r20+336], fd326; +barrier.sync 0; +ld.shared.f64 fd334, [r10]; +ld.shared.f64 fd335, [r10+392]; +ld.shared.f64 fd336, [r10+784]; +ld.shared.f64 fd337, [r10+1176]; +ld.shared.f64 fd338, [r10+1568]; +ld.shared.f64 fd339, [r10+1960]; +ld.shared.f64 fd340, [r10+2352]; +add.f64 fd341, fd328, fd333; +add.f64 fd342, fd327, fd341; +add.f64 fd343, fd329, fd332; +add.f64 fd344, fd343, fd342; +add.f64 fd345, fd330, fd331; +add.f64 fd346, fd335, fd340; +add.f64 fd347, fd334, fd346; +add.f64 fd348, fd336, fd339; +add.f64 fd349, fd348, fd347; +add.f64 fd350, fd337, fd338; +fma.rn.f64 fd351, fd341, 0d3FE3F3A0E28BEDD1, fd327; +mul.f64 fd352, fd343, 0d3FCC7B90E3024582; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd345, 0d3FECD4BCA9CB5C71; +sub.f64 fd355, fd353, fd354; +sub.f64 fd356, fd335, fd340; +mul.f64 fd357, fd356, 0d3FE904C37505DE4B; +sub.f64 fd358, fd336, fd339; +fma.rn.f64 fd359, fd358, 0d3FEF329C0558E969, fd357; +sub.f64 fd360, fd337, fd338; +fma.rn.f64 fd361, fd360, 0d3FDBC4C04D71ABC1, fd359; +mul.f64 fd362, fd341, 0d3FCC7B90E3024582; +sub.f64 fd363, fd327, fd362; +mul.f64 fd364, fd343, 0d3FECD4BCA9CB5C71; +sub.f64 fd365, fd363, fd364; +fma.rn.f64 fd366, 
fd345, 0d3FE3F3A0E28BEDD1, fd365; +mul.f64 fd367, fd356, 0d3FEF329C0558E969; +mul.f64 fd368, fd358, 0d3FDBC4C04D71ABC1; +sub.f64 fd369, fd367, fd368; +mul.f64 fd370, fd360, 0d3FE904C37505DE4B; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd341, 0d3FECD4BCA9CB5C71; +sub.f64 fd373, fd327, fd372; +fma.rn.f64 fd374, fd343, 0d3FE3F3A0E28BEDD1, fd373; +mul.f64 fd375, fd345, 0d3FCC7B90E3024582; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd356, 0d3FDBC4C04D71ABC1; +mul.f64 fd378, fd358, 0d3FE904C37505DE4B; +sub.f64 fd379, fd377, fd378; +fma.rn.f64 fd380, fd360, 0d3FEF329C0558E969, fd379; +fma.rn.f64 fd381, fd346, 0d3FE3F3A0E28BEDD1, fd334; +mul.f64 fd382, fd348, 0d3FCC7B90E3024582; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd350, 0d3FECD4BCA9CB5C71; +sub.f64 fd385, fd383, fd384; +sub.f64 fd386, fd328, fd333; +mul.f64 fd387, fd386, 0d3FE904C37505DE4B; +sub.f64 fd388, fd329, fd332; +fma.rn.f64 fd389, fd388, 0d3FEF329C0558E969, fd387; +sub.f64 fd390, fd330, fd331; +fma.rn.f64 fd391, fd390, 0d3FDBC4C04D71ABC1, fd389; +mul.f64 fd392, fd346, 0d3FCC7B90E3024582; +sub.f64 fd393, fd334, fd392; +mul.f64 fd394, fd348, 0d3FECD4BCA9CB5C71; +sub.f64 fd395, fd393, fd394; +fma.rn.f64 fd396, fd350, 0d3FE3F3A0E28BEDD1, fd395; +mul.f64 fd397, fd386, 0d3FEF329C0558E969; +mul.f64 fd398, fd388, 0d3FDBC4C04D71ABC1; +sub.f64 fd399, fd397, fd398; +mul.f64 fd400, fd390, 0d3FE904C37505DE4B; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd346, 0d3FECD4BCA9CB5C71; +sub.f64 fd403, fd334, fd402; +fma.rn.f64 fd404, fd348, 0d3FE3F3A0E28BEDD1, fd403; +mul.f64 fd405, fd350, 0d3FCC7B90E3024582; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd386, 0d3FDBC4C04D71ABC1; +mul.f64 fd408, fd388, 0d3FE904C37505DE4B; +sub.f64 fd409, fd407, fd408; +fma.rn.f64 fd410, fd390, 0d3FEF329C0558E969, fd409; +add.f64 %0, fd345, fd344; +add.f64 %1, fd350, fd349; +add.f64 %3, fd391, fd385; +sub.f64 %2, fd355, fd361; +sub.f64 %4, fd366, fd371; +add.f64 %5, fd401, fd396; +sub.f64 %6, fd376, fd380; +add.f64 %7, fd410, 
fd406; +add.f64 %8, fd380, fd376; +sub.f64 %9, fd406, fd410; +add.f64 %10, fd371, fd366; +sub.f64 %11, fd396, fd401; +sub.f64 %13, fd385, fd391; +add.f64 %12, fd361, fd355; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_343), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..7c8a3cd42e791 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_fwd.hpp.inc @@ -0,0 +1,1774 @@ +#ifndef CUFFTDX_FFT_36_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_36_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<931, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<74>; +.reg .b32 r<710>; +.reg .b64 rd<5>; +mov.u32 r697, %tid.y; +shl.b32 r698, r697, 1; +mov.u32 r699, %12; +mad.lo.s32 r700, r698, 144, r699; +mov.u32 r701, %tid.x; +mov.f32 f56, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r1, {low, high}; +} +mov.f32 f58, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, 
%17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; +} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 
r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f52, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r180, {low, high}; +} +mov.f32 f39, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ +mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +mul.wide.u32 rd2, r701, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r702, rd3; +mul.lo.s32 r703, r702, 6; +sub.s32 r704, r701, r703; +shr.u64 rd4, rd2, 33; +cvt.u32.u64 r705, rd4; +and.b32 r706, r705, 2147483646; +mad.lo.s32 r707, r706, 144, r700; +cvt.rn.f32.u32 f71, r704; +mul.f32 f72, f71, 0f3E32B8C2; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, 
r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ +mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; +} +mov.f32 f40, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 
r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; +} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r708, r704, 48, r707; +st.shared.v2.f32 [r708], {r219, r222}; +st.shared.v2.f32 [r708+8], {r267, r274}; +st.shared.v2.f32 [r708+16], {r304, r311}; +st.shared.v2.f32 [r708+24], {r341, r348}; +st.shared.v2.f32 [r708+32], {r378, r385}; +st.shared.v2.f32 [r708+40], {r415, r422}; +barrier.sync 0; +mad.lo.s32 r709, r704, -40, r708; +ld.shared.u32 r451, [r709]; +ld.shared.u32 r457, [r709+4]; +ld.shared.u32 r539, [r709+48]; +ld.shared.u32 r545, [r709+52]; 
+ld.shared.u32 r448, [r709+96]; +ld.shared.u32 r454, [r709+100]; +ld.shared.u32 r536, [r709+144]; +ld.shared.u32 r542, [r709+148]; +ld.shared.u32 r449, [r709+192]; +ld.shared.u32 r455, [r709+196]; +ld.shared.u32 r537, [r709+240]; +ld.shared.u32 r543, [r709+244]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; +} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ +mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ +sub.f16x2 
r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ +sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 %0, r450, r538; +} +{ +add.f16x2 %1, r456, r544; +} +{ +sub.f16x2 %6, r450, r538; +} +{ +sub.f16x2 %7, r456, r544; +} +{ +add.f16x2 %2, r474, r635; +} +{ +add.f16x2 %3, r510, r641; +} +{ +sub.f16x2 %8, r474, r635; +} +{ +sub.f16x2 %9, r510, r641; +} +{ +add.f16x2 %4, r492, r651; +} +{ +add.f16x2 %5, r528, r657; +} +{ +sub.f16x2 %10, r492, r651; +} +{ +sub.f16x2 %11, r528, r657; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<930, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<74>; +.reg .b32 r<707>; +.reg .b64 rd<4>; +mov.u32 r697, %tid.y; +mov.u32 r698, %12; +mad.lo.s32 r699, r697, 144, r698; +mov.u32 r700, %tid.x; +mov.f32 f56, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r1, {low, high}; +} +mov.f32 f58, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %17, %21; +} +{ +add.f16x2 r8, %13, r5; +} +{ +add.f16x2 r11, %18, %22; +} +{ +add.f16x2 r14, %14, r11; +} +{ +add.f16x2 r17, %17, %21; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %13, r20; +} +{ +sub.f16x2 r26, %18, %22; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %17, %21; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %13, r38; +} +{ +sub.f16x2 r44, %18, %22; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %18, %22; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %14, r56; +} +{ +sub.f16x2 r62, %17, %21; +} +{ +mul.f16x2 r65, r62, r3; +} +{ 
+sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %18, %22; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %14, r74; +} +{ +sub.f16x2 r80, %17, %21; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %19, %23; +} +{ +add.f16x2 r96, %15, r93; +} +{ +add.f16x2 r99, %20, %24; +} +{ +add.f16x2 r102, %16, r99; +} +{ +add.f16x2 r105, %19, %23; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %15, r108; +} +{ +sub.f16x2 r114, %20, %24; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %19, %23; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %15, r126; +} +{ +sub.f16x2 r132, %20, %24; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %20, %24; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %16, r144; +} +{ +sub.f16x2 r150, %19, %23; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %20, %24; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %16, r162; +} +{ +sub.f16x2 r168, %19, %23; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +mov.f32 f52, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r179, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r180, {low, high}; +} +mov.f32 f39, 0fBF800000; +{ +mul.f16x2 r187, r120, r177; +} +{ +mul.f16x2 r190, r156, r178; +} +{ +sub.f16x2 r193, r187, r190; +} +{ 
+mul.f16x2 r196, r120, r178; +} +{ +fma.rn.f16x2 r199, r156, r177, r196; +} +{ +mul.f16x2 r203, r138, r179; +} +{ +mul.f16x2 r206, r174, r180; +} +{ +sub.f16x2 r209, r203, r206; +} +{ +mul.f16x2 r212, r138, r180; +} +{ +fma.rn.f16x2 r215, r174, r179, r212; +} +{ +add.f16x2 r219, r8, r96; +} +{ +add.f16x2 r222, r14, r102; +} +{ +sub.f16x2 r225, r8, r96; +} +{ +sub.f16x2 r228, r14, r102; +} +{ +add.f16x2 r231, r32, r193; +} +{ +add.f16x2 r234, r68, r199; +} +{ +sub.f16x2 r237, r32, r193; +} +{ +sub.f16x2 r240, r68, r199; +} +{ +add.f16x2 r243, r50, r209; +} +{ +add.f16x2 r246, r86, r215; +} +{ +sub.f16x2 r249, r50, r209; +} +{ +sub.f16x2 r252, r86, r215; +} +mul.wide.u32 rd2, r700, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r701, rd3; +mul.lo.s32 r702, r701, 6; +sub.s32 r703, r700, r702; +mad.lo.s32 r704, r701, 144, r699; +cvt.rn.f32.u32 f71, r703; +mul.f32 f72, f71, 0f3E32B8C2; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r255, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r260, {high, high}; +} +{ +mul.f16x2 r262, r234, r260; +} +{ +neg.f16x2 r265, r262; +} +{ +fma.rn.f16x2 r267, r231, r258, r265; +} +{ +mul.f16x2 r271, r231, r260; +} +{ +fma.rn.f16x2 r274, r234, r258, r271; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r278, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r280, {high, high}; +} +mov.f32 f40, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r282, {low, high}; +} +{ +mul.f16x2 r283, r280, r282; +} +{ +mul.f16x2 r286, r255, r278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r289, {high, low}; +} +{ +fma.rn.f16x2 r291, r283, r289, r286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r295, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r297, {high, high}; +} +{ +mul.f16x2 r299, r246, r297; +} +{ +neg.f16x2 r302, r299; +} +{ +fma.rn.f16x2 r304, r243, r295, r302; +} +{ +mul.f16x2 r308, r243, r297; +} +{ +fma.rn.f16x2 r311, r246, r295, r308; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r315, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r317, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r319, {low, high}; +} +{ +mul.f16x2 r320, r317, r319; +} +{ +mul.f16x2 r323, r291, r315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r291; +mov.b32 r326, {high, low}; +} +{ +fma.rn.f16x2 r328, r320, r326, r323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r332, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r334, {high, high}; +} +{ +mul.f16x2 r336, r228, r334; +} +{ +neg.f16x2 r339, r336; +} +{ +fma.rn.f16x2 r341, r225, r332, r339; +} +{ +mul.f16x2 r345, r225, r334; +} +{ +fma.rn.f16x2 r348, r228, r332, r345; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r352, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r354, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r356, {low, high}; +} +{ +mul.f16x2 r357, r354, r356; +} +{ +mul.f16x2 r360, r328, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r328; +mov.b32 r363, {high, low}; +} +{ +fma.rn.f16x2 r365, r357, r363, r360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r369, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r371, {high, high}; +} +{ +mul.f16x2 r373, r240, r371; +} +{ +neg.f16x2 r376, r373; +} +{ +fma.rn.f16x2 r378, r237, r369, r376; +} +{ +mul.f16x2 r382, r237, r371; +} +{ +fma.rn.f16x2 r385, r240, r369, r382; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; 
+mov.b32 r389, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r255; +mov.b32 r391, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r393, {low, high}; +} +{ +mul.f16x2 r394, r391, r393; +} +{ +mul.f16x2 r397, r365, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r365; +mov.b32 r400, {high, low}; +} +{ +fma.rn.f16x2 r402, r394, r400, r397; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r406, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r402; +mov.b32 r408, {high, high}; +} +{ +mul.f16x2 r410, r252, r408; +} +{ +neg.f16x2 r413, r410; +} +{ +fma.rn.f16x2 r415, r249, r406, r413; +} +{ +mul.f16x2 r419, r249, r408; +} +{ +fma.rn.f16x2 r422, r252, r406, r419; +} +barrier.sync 0; +mad.lo.s32 r705, r703, 24, r704; +st.shared.v2.f32 [r705], {r219, r267}; +st.shared.v2.f32 [r705+8], {r304, r341}; +st.shared.v2.f32 [r705+16], {r378, r415}; +barrier.sync 0; +mad.lo.s32 r706, r703, -20, r705; +ld.shared.u32 r451, [r706]; +ld.shared.u32 r539, [r706+24]; +ld.shared.u32 r448, [r706+48]; +ld.shared.u32 r536, [r706+72]; +ld.shared.u32 r449, [r706+96]; +ld.shared.u32 r537, [r706+120]; +barrier.sync 0; +st.shared.v2.f32 [r705], {r222, r274}; +st.shared.v2.f32 [r705+8], {r311, r348}; +st.shared.v2.f32 [r705+16], {r385, r422}; +barrier.sync 0; +ld.shared.u32 r457, [r706]; +ld.shared.u32 r545, [r706+24]; +ld.shared.u32 r454, [r706+48]; +ld.shared.u32 r542, [r706+72]; +ld.shared.u32 r455, [r706+96]; +ld.shared.u32 r543, [r706+120]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r443, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r444, {low, high}; +} +{ +neg.f16x2 r445, r444; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r454, r455; +} +{ +add.f16x2 r456, r457, r453; +} +{ +add.f16x2 r459, r448, r449; +} +{ +mul.f16x2 r462, r459, r443; 
+} +{ +add.f16x2 r465, r451, r462; +} +{ +sub.f16x2 r468, r454, r455; +} +{ +mul.f16x2 r471, r468, r445; +} +{ +add.f16x2 r474, r465, r471; +} +{ +add.f16x2 r477, r448, r449; +} +{ +mul.f16x2 r480, r477, r443; +} +{ +add.f16x2 r483, r451, r480; +} +{ +sub.f16x2 r486, r454, r455; +} +{ +mul.f16x2 r489, r486, r445; +} +{ +sub.f16x2 r492, r483, r489; +} +{ +add.f16x2 r495, r454, r455; +} +{ +mul.f16x2 r498, r495, r443; +} +{ +add.f16x2 r501, r457, r498; +} +{ +sub.f16x2 r504, r448, r449; +} +{ +mul.f16x2 r507, r504, r445; +} +{ +sub.f16x2 r510, r501, r507; +} +{ +add.f16x2 r513, r454, r455; +} +{ +mul.f16x2 r516, r513, r443; +} +{ +add.f16x2 r519, r457, r516; +} +{ +sub.f16x2 r522, r448, r449; +} +{ +mul.f16x2 r525, r522, r445; +} +{ +add.f16x2 r528, r519, r525; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r532, {low, high}; +} +{ +neg.f16x2 r533, r532; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r535; +} +{ +add.f16x2 r541, r542, r543; +} +{ +add.f16x2 r544, r545, r541; +} +{ +add.f16x2 r547, r536, r537; +} +{ +mul.f16x2 r550, r547, r531; +} +{ +add.f16x2 r553, r539, r550; +} +{ +sub.f16x2 r556, r542, r543; +} +{ +mul.f16x2 r559, r556, r533; +} +{ +add.f16x2 r562, r553, r559; +} +{ +add.f16x2 r565, r536, r537; +} +{ +mul.f16x2 r568, r565, r531; +} +{ +add.f16x2 r571, r539, r568; +} +{ +sub.f16x2 r574, r542, r543; +} +{ +mul.f16x2 r577, r574, r533; +} +{ +sub.f16x2 r580, r571, r577; +} +{ +add.f16x2 r583, r542, r543; +} +{ +mul.f16x2 r586, r583, r531; +} +{ +add.f16x2 r589, r545, r586; +} +{ +sub.f16x2 r592, r536, r537; +} +{ +mul.f16x2 r595, r592, r533; +} +{ +sub.f16x2 r598, r589, r595; +} +{ +add.f16x2 r601, r542, r543; +} +{ +mul.f16x2 r604, r601, r531; +} +{ +add.f16x2 r607, r545, r604; +} +{ +sub.f16x2 r610, r536, r537; +} +{ +mul.f16x2 r613, r610, r533; +} +{ +add.f16x2 r616, r607, r613; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r619, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r622, {low, high}; +} +{ +mul.f16x2 r629, r562, r619; +} +{ +mul.f16x2 r632, r598, r620; +} +{ +sub.f16x2 r635, r629, r632; +} +{ +mul.f16x2 r638, r562, r620; +} +{ +fma.rn.f16x2 r641, r598, r619, r638; +} +{ +mul.f16x2 r645, r580, r621; +} +{ +mul.f16x2 r648, r616, r622; +} +{ +sub.f16x2 r651, r645, r648; +} +{ +mul.f16x2 r654, r580, r622; +} +{ +fma.rn.f16x2 r657, r616, r621, r654; +} +{ +add.f16x2 %0, r450, r538; +} +{ +add.f16x2 %1, r456, r544; +} +{ +sub.f16x2 %6, r450, r538; +} +{ +sub.f16x2 %7, r456, r544; +} +{ +add.f16x2 %2, r474, r635; +} +{ +add.f16x2 %3, r510, r641; +} +{ +sub.f16x2 %8, r474, r635; +} +{ +sub.f16x2 %9, r510, r641; +} +{ +add.f16x2 %4, r492, r651; +} +{ +add.f16x2 %5, r528, r657; +} +{ +sub.f16x2 %10, r492, r651; +} +{ +sub.f16x2 %11, r528, r657; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff 
--git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..bbc35d3798b5a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp16_inv.hpp.inc @@ -0,0 +1,1752 @@ +#ifndef CUFFTDX_FFT_36_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_36_FP16_INV_PTX_HPP + + + +/* cufftdx_private_function<1133, __half2, 1> (from fft_36_fp16_inv.hpp.inc): generated inline-PTX kernel stage working on six complex<__half2> values per thread. Asm operands: %0-%11 are the outputs rmem[0..5].x/.y, %12 is smem, %13-%24 are the same rmem[0..5].x/.y read as inputs. Steps: (1) combine the six inputs with f16x2 add/sub/mul/fma using the constants -0.5 (0fBF000000), 0.5 (0f3F000000), -0.8660254 (0fBF5DB3D7) and 0.8660254 (0f3F5DB3D7); (2) with t = (tid.x % 6) * 0f3E32B8C2 (about 0.1745329 = 2*pi/36), form w = (cos t, -sin t) via cos/sin.approx and multiply five of the six intermediate values by the complex conjugates of w^1..w^5; (3) exchange data through shared memory in a 288-byte region at smem + 288*tid.y + 288*(tid.x / 6): each thread stores its twelve 32-bit words (x and y interleaved) at byte offset 48*(tid.x % 6) and loads six x/y pairs from byte offsets 8*(tid.x % 6) + 48*j, j = 0..5; (4) repeat the step-(1) arithmetic on the loaded words and write the results to %0-%11. tid.x / 6 is computed as the 64-bit product tid.x * 0xAAAAAAAB shifted right by 34. The asm executes barrier.sync 0 twice, so every thread taking part in barrier 0 has to reach this call. */ template<> __forceinline__ __device__ void cufftdx_private_function<1133, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<74>; +.reg .b32 r<702>; +.reg .b64 rd<5>; +mov.u32 r689, %tid.y; +shl.b32 r690, r689, 1; +mov.u32 r691, %12; +mad.lo.s32 r692, r690, 144, r691; +mov.u32 r693, %tid.x; +mov.f32 f56, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r1, {low, high}; +} +mov.f32 f50, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %14, r72; +} +{ +sub.f16x2 r78, %17, %21; +}
+{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 r128, %20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f52, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r173, {low, high}; +} +mov.f32 f58, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r176, {low, high}; +} +mov.f32 f39, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{
+sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +mul.wide.u32 rd2, r693, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r694, rd3; +mul.lo.s32 r695, r694, 6; +sub.s32 r696, r693, r695; +shr.u64 rd4, rd2, 33; +cvt.u32.u64 r697, rd4; +and.b32 r698, r697, 2147483646; +mad.lo.s32 r699, r698, 144, r692; +cvt.rn.f32.u32 f71, r696; +mul.f32 f72, f71, 0f3E32B8C2; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ +mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f40, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287;
+mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32
{low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; +} +barrier.sync 0; +mad.lo.s32 r700, r696, 48, r699; +st.shared.v2.f32 [r700], {r215, r218}; +st.shared.v2.f32 [r700+8], {r261, r270}; +st.shared.v2.f32 [r700+16], {r298, r307}; +st.shared.v2.f32 [r700+24], {r335, r344}; +st.shared.v2.f32 [r700+32], {r372, r381}; +st.shared.v2.f32 [r700+40], {r409, r418}; +barrier.sync 0; +mad.lo.s32 r701, r696, -40, r700; +ld.shared.u32 r445, [r701]; +ld.shared.u32 r451, [r701+4]; +ld.shared.u32 r531, [r701+48]; +ld.shared.u32 r537, [r701+52]; +ld.shared.u32 r442, [r701+96]; +ld.shared.u32 r448, [r701+100]; +ld.shared.u32 r528, [r701+144]; +ld.shared.u32 r534, [r701+148]; +ld.shared.u32 r443, [r701+192]; +ld.shared.u32 r449, [r701+196]; +ld.shared.u32 r529, [r701+240]; +ld.shared.u32 r535, [r701+244]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{
+add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} +{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high;
+cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; +} +{ +add.f16x2 %0, r444, r530; +} +{ +add.f16x2 %1, r450, r536; +} +{ +sub.f16x2 %6, r444, r530; +} +{ +sub.f16x2 %7, r450, r536; +} +{ +add.f16x2 %2, r468, r627; +} +{ +add.f16x2 %3, r504, r633; +} +{ +sub.f16x2 %8, r468, r627; +} +{ +sub.f16x2 %9, r504, r633; +} +{ +add.f16x2 %4, r486, r643; +} +{ +add.f16x2 %5, r522, r649; +} +{ +sub.f16x2 %10, r486, r643; +} +{ +sub.f16x2 %11, r522, r649; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1132, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned
smem){ + +asm volatile (R"({ +.reg .f32 f<74>; +.reg .b32 r<699>; +.reg .b64 rd<4>; +mov.u32 r689, %tid.y; +mov.u32 r690, %12; +mad.lo.s32 r691, r689, 144, r690; +mov.u32 r692, %tid.x; +mov.f32 f56, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r1, {low, high}; +} +mov.f32 f50, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %17, %21; +} +{ +add.f16x2 r6, %13, r3; +} +{ +add.f16x2 r9, %18, %22; +} +{ +add.f16x2 r12, %14, r9; +} +{ +add.f16x2 r15, %17, %21; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %13, r18; +} +{ +sub.f16x2 r24, %18, %22; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %17, %21; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %13, r36; +} +{ +sub.f16x2 r42, %18, %22; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %18, %22; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %14, r54; +} +{ +sub.f16x2 r60, %17, %21; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %18, %22; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %14, r72; +} +{ +sub.f16x2 r78, %17, %21; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %19, %23; +} +{ +add.f16x2 r92, %15, r89; +} +{ +add.f16x2 r95, %20, %24; +} +{ +add.f16x2 r98, %16, r95; +} +{ +add.f16x2 r101, %19, %23; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %15, r104; +} +{ +sub.f16x2 r110, %20, %24; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %19, %23; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %15, r122; +} +{ +sub.f16x2 
r128, %20, %24; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %20, %24; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %16, r140; +} +{ +sub.f16x2 r146, %19, %23; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %20, %24; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %16, r158; +} +{ +sub.f16x2 r164, %19, %23; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +mov.f32 f52, 0f3F000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r173, {low, high}; +} +mov.f32 f58, 0f3F5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r176, {low, high}; +} +mov.f32 f39, 0fBF800000; +{ +mul.f16x2 r183, r116, r173; +} +{ +mul.f16x2 r186, r152, r174; +} +{ +sub.f16x2 r189, r183, r186; +} +{ +mul.f16x2 r192, r116, r174; +} +{ +fma.rn.f16x2 r195, r152, r173, r192; +} +{ +mul.f16x2 r199, r134, r175; +} +{ +mul.f16x2 r202, r170, r176; +} +{ +sub.f16x2 r205, r199, r202; +} +{ +mul.f16x2 r208, r134, r176; +} +{ +fma.rn.f16x2 r211, r170, r175, r208; +} +{ +add.f16x2 r215, r6, r92; +} +{ +add.f16x2 r218, r12, r98; +} +{ +sub.f16x2 r221, r6, r92; +} +{ +sub.f16x2 r224, r12, r98; +} +{ +add.f16x2 r227, r30, r189; +} +{ +add.f16x2 r230, r66, r195; +} +{ +sub.f16x2 r233, r30, r189; +} +{ +sub.f16x2 r236, r66, r195; +} +{ +add.f16x2 r239, r48, r205; +} +{ +add.f16x2 r242, r84, r211; +} +{ +sub.f16x2 r245, r48, r205; +} +{ +sub.f16x2 r248, r84, r211; +} +mul.wide.u32 rd2, r692, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r693, rd3; +mul.lo.s32 r694, r693, 6; +sub.s32 r695, r692, r694; +mad.lo.s32 r696, r693, 144, r691; +cvt.rn.f32.u32 f71, r695; +mul.f32 f72, f71, 
0f3E32B8C2; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r251, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r256, {high, high}; +} +{ +mul.f16x2 r258, r230, r256; +} +{ +fma.rn.f16x2 r261, r227, r254, r258; +} +{ +mul.f16x2 r265, r227, r256; +} +{ +neg.f16x2 r268, r265; +} +{ +fma.rn.f16x2 r270, r230, r254, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r276, {high, high}; +} +mov.f32 f40, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r278, {low, high}; +} +{ +mul.f16x2 r279, r276, r278; +} +{ +mul.f16x2 r282, r251, r274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r285, {high, low}; +} +{ +fma.rn.f16x2 r287, r279, r285, r282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r293, {high, high}; +} +{ +mul.f16x2 r295, r242, r293; +} +{ +fma.rn.f16x2 r298, r239, r291, r295; +} +{ +mul.f16x2 r302, r239, r293; +} +{ +neg.f16x2 r305, r302; +} +{ +fma.rn.f16x2 r307, r242, r291, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r313, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r315, {low, high}; +} +{ +mul.f16x2 r316, r313, r315; +} +{ +mul.f16x2 r319, r287, r311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r287; +mov.b32 r322, {high, low}; +} +{ +fma.rn.f16x2 r324, r316, r322, r319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r324; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r224, r330; +} +{ +fma.rn.f16x2 r335, r221, r328, r332; +} +{ +mul.f16x2 r339, r221, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r224, r328, r342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r350, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r352, {low, high}; +} +{ +mul.f16x2 r353, r350, r352; +} +{ +mul.f16x2 r356, r324, r348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r324; +mov.b32 r359, {high, low}; +} +{ +fma.rn.f16x2 r361, r353, r359, r356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r367, {high, high}; +} +{ +mul.f16x2 r369, r236, r367; +} +{ +fma.rn.f16x2 r372, r233, r365, r369; +} +{ +mul.f16x2 r376, r233, r367; +} +{ +neg.f16x2 r379, r376; +} +{ +fma.rn.f16x2 r381, r236, r365, r379; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r387, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f39; +cvt.rn.f16.f32 high, f40; +mov.b32 r389, {low, high}; +} +{ +mul.f16x2 r390, r387, r389; +} +{ +mul.f16x2 r393, r361, r385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r361; +mov.b32 r396, {high, low}; +} +{ +fma.rn.f16x2 r398, r390, r396, r393; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r398; +mov.b32 r404, {high, high}; +} +{ +mul.f16x2 r406, r248, r404; +} +{ +fma.rn.f16x2 r409, r245, r402, r406; +} +{ +mul.f16x2 r413, r245, r404; +} +{ +neg.f16x2 r416, r413; +} +{ +fma.rn.f16x2 r418, r248, r402, r416; +} +barrier.sync 0; +mad.lo.s32 r697, r695, 24, r696; +st.shared.v2.f32 [r697], {r215, r261}; 
+st.shared.v2.f32 [r697+8], {r298, r335}; +st.shared.v2.f32 [r697+16], {r372, r409}; +barrier.sync 0; +mad.lo.s32 r698, r695, -20, r697; +ld.shared.u32 r445, [r698]; +ld.shared.u32 r531, [r698+24]; +ld.shared.u32 r442, [r698+48]; +ld.shared.u32 r528, [r698+72]; +ld.shared.u32 r443, [r698+96]; +ld.shared.u32 r529, [r698+120]; +barrier.sync 0; +st.shared.v2.f32 [r697], {r218, r270}; +st.shared.v2.f32 [r697+8], {r307, r344}; +st.shared.v2.f32 [r697+16], {r381, r418}; +barrier.sync 0; +ld.shared.u32 r451, [r698]; +ld.shared.u32 r537, [r698+24]; +ld.shared.u32 r448, [r698+48]; +ld.shared.u32 r534, [r698+72]; +ld.shared.u32 r449, [r698+96]; +ld.shared.u32 r535, [r698+120]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r440, {low, high}; +} +{ +add.f16x2 r441, r442, r443; +} +{ +add.f16x2 r444, r445, r441; +} +{ +add.f16x2 r447, r448, r449; +} +{ +add.f16x2 r450, r451, r447; +} +{ +add.f16x2 r453, r442, r443; +} +{ +mul.f16x2 r456, r453, r439; +} +{ +add.f16x2 r459, r445, r456; +} +{ +sub.f16x2 r462, r448, r449; +} +{ +mul.f16x2 r465, r462, r440; +} +{ +add.f16x2 r468, r459, r465; +} +{ +add.f16x2 r471, r442, r443; +} +{ +mul.f16x2 r474, r471, r439; +} +{ +add.f16x2 r477, r445, r474; +} +{ +sub.f16x2 r480, r448, r449; +} +{ +mul.f16x2 r483, r480, r440; +} +{ +sub.f16x2 r486, r477, r483; +} +{ +add.f16x2 r489, r448, r449; +} +{ +mul.f16x2 r492, r489, r439; +} +{ +add.f16x2 r495, r451, r492; +} +{ +sub.f16x2 r498, r442, r443; +} +{ +mul.f16x2 r501, r498, r440; +} +{ +sub.f16x2 r504, r495, r501; +} +{ +add.f16x2 r507, r448, r449; +} +{ +mul.f16x2 r510, r507, r439; +} +{ +add.f16x2 r513, r451, r510; +} +{ +sub.f16x2 r516, r442, r443; +} +{ +mul.f16x2 r519, r516, r440; +} +{ +add.f16x2 r522, r513, r519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r525, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r526, {low, high}; +} +{ +add.f16x2 r527, r528, r529; +} +{ +add.f16x2 r530, r531, r527; +} +{ +add.f16x2 r533, r534, r535; +} +{ +add.f16x2 r536, r537, r533; +} +{ +add.f16x2 r539, r528, r529; +} +{ +mul.f16x2 r542, r539, r525; +} +{ +add.f16x2 r545, r531, r542; +} +{ +sub.f16x2 r548, r534, r535; +} +{ +mul.f16x2 r551, r548, r526; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, r528, r529; +} +{ +mul.f16x2 r560, r557, r525; +} +{ +add.f16x2 r563, r531, r560; +} +{ +sub.f16x2 r566, r534, r535; +} +{ +mul.f16x2 r569, r566, r526; +} +{ +sub.f16x2 r572, r563, r569; +} +{ +add.f16x2 r575, r534, r535; +} +{ +mul.f16x2 r578, r575, r525; +} +{ +add.f16x2 r581, r537, r578; +} +{ +sub.f16x2 r584, r528, r529; +} +{ +mul.f16x2 r587, r584, r526; +} +{ +sub.f16x2 r590, r581, r587; +} +{ +add.f16x2 r593, r534, r535; +} +{ +mul.f16x2 r596, r593, r525; +} +{ +add.f16x2 r599, r537, r596; +} +{ +sub.f16x2 r602, r528, r529; +} +{ +mul.f16x2 r605, r602, r526; +} +{ +add.f16x2 r608, r599, r605; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r611, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r612, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r614, {low, high}; +} +{ +mul.f16x2 r621, r554, r611; +} +{ +mul.f16x2 r624, r590, r612; +} +{ +sub.f16x2 r627, r621, r624; +} +{ +mul.f16x2 r630, r554, r612; +} +{ +fma.rn.f16x2 r633, r590, r611, r630; +} +{ +mul.f16x2 r637, r572, r613; +} +{ +mul.f16x2 r640, r608, r614; +} +{ +sub.f16x2 r643, r637, r640; +} +{ +mul.f16x2 r646, r572, r614; +} +{ +fma.rn.f16x2 r649, r608, r613, r646; +} +{ +add.f16x2 %0, r444, r530; +} +{ +add.f16x2 %1, r450, r536; +} +{ +sub.f16x2 %6, r444, r530; +} +{ +sub.f16x2 %7, 
r450, r536; +} +{ +add.f16x2 %2, r468, r627; +} +{ +add.f16x2 %3, r504, r633; +} +{ +sub.f16x2 %8, r468, r627; +} +{ +sub.f16x2 %9, r504, r633; +} +{ +add.f16x2 %4, r486, r643; +} +{ +add.f16x2 %5, r522, r649; +} +{ +sub.f16x2 %10, r486, r643; +} +{ +sub.f16x2 %11, r522, r649; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..e1ed846466449 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_fwd.hpp.inc @@ -0,0 +1,404 @@ +#ifndef CUFFTDX_FFT_36_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_36_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<185, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<206>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 288, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %19, %24; +add.f32 f26, %14, f25; 
+add.f32 f27, %21, %26; +add.f32 f28, %15, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %14, f29; +sub.f32 f31, %21, %26; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %15, f35; +sub.f32 f37, %19, %24; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %22, %27; +add.f32 f42, %16, f41; +add.f32 f43, %23, %28; +add.f32 f44, %18, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %16, f45; +sub.f32 f47, %23, %28; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %18, f51; +sub.f32 f53, %22, %27; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; +sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 288, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f77, f69; +mul.f32 f82, f78, f70; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f86, f73; +mul.f32 f90, f88, f74; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 
f97, f94, f67; +mul.f32 f98, f96, f68; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f102, f71; +mul.f32 f106, f104, f72; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f110, f75; +mul.f32 f114, f112, f76; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r9], {f117, f116}; +fma.rn.f32 f118, f78, f69, f83; +sub.f32 f119, f81, f82; +st.shared.v2.f32 [r9+8], {f119, f118}; +fma.rn.f32 f120, f88, f73, f91; +sub.f32 f121, f89, f90; +st.shared.v2.f32 [r9+16], {f121, f120}; +fma.rn.f32 f122, f96, f67, f99; +sub.f32 f123, f97, f98; +st.shared.v2.f32 [r9+24], {f123, f122}; +fma.rn.f32 f124, f104, f71, f107; +sub.f32 f125, f105, f106; +st.shared.v2.f32 [r9+32], {f125, f124}; +fma.rn.f32 f126, f112, f75, f115; +sub.f32 f127, f113, f114; +st.shared.v2.f32 [r9+40], {f127, f126}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.v2.f32 {f128, f129}, [r10]; +ld.shared.v2.f32 {f132, f133}, [r10+48]; +ld.shared.v2.f32 {f136, f137}, [r10+96]; +ld.shared.v2.f32 {f140, f141}, [r10+144]; +ld.shared.v2.f32 {f144, f145}, [r10+192]; +ld.shared.v2.f32 {f148, f149}, [r10+240]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, f137, f145; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; 
+mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0f3F5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0f3F5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0fBF5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0fBF5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0fBF5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0fBF5DB3D7, f192; +add.f32 %1, f155, f171; +add.f32 %0, f153, f169; +add.f32 %3, f166, f188; +add.f32 %2, f160, f186; +add.f32 %5, f167, f193; +add.f32 %4, f161, f191; +sub.f32 %7, f155, f171; +sub.f32 %6, f153, f169; +sub.f32 %9, f166, f188; +sub.f32 %8, f160, f186; +sub.f32 %11, f167, f193; +sub.f32 %10, f161, f191; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<184, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<194>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 144, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %19, %24; +add.f32 f26, %14, f25; +add.f32 f27, %21, %26; +add.f32 f28, %15, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %14, f29; +sub.f32 f31, %21, %26; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, 
f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %15, f35; +sub.f32 f37, %19, %24; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %22, %27; +add.f32 f42, %16, f41; +add.f32 f43, %23, %28; +add.f32 f44, %18, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %16, f45; +sub.f32 f47, %23, %28; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %18, f51; +sub.f32 f53, %22, %27; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f79, f71; +mul.f32 f84, f80, f72; +sub.f32 f85, f83, f84; +mul.f32 f86, f79, f72; +fma.rn.f32 f87, f80, f71, f86; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; +sub.f32 f90, f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f90, f75; +mul.f32 f94, f92, f76; +sub.f32 f95, f93, f94; +mul.f32 f96, f90, f76; +fma.rn.f32 f97, f92, f75, f96; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f100, f69; +mul.f32 f104, f102, 
f70; +sub.f32 f105, f103, f104; +mul.f32 f106, f100, f70; +fma.rn.f32 f107, f102, f69, f106; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f110, f73; +mul.f32 f114, f112, f74; +sub.f32 f115, f113, f114; +mul.f32 f116, f110, f74; +fma.rn.f32 f117, f112, f73, f116; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f120, f77; +mul.f32 f124, f122, f78; +sub.f32 f125, f123, f124; +mul.f32 f126, f120, f78; +fma.rn.f32 f127, f122, f77, f126; +mad.lo.s32 r8, r5, 144, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.v2.f32 [r9], {f67, f85}; +st.shared.v2.f32 [r9+8], {f95, f105}; +st.shared.v2.f32 [r9+16], {f115, f125}; +barrier.sync 0; +mad.lo.s32 r10, r7, -20, r9; +ld.shared.f32 f128, [r10]; +ld.shared.f32 f129, [r10+24]; +ld.shared.f32 f130, [r10+48]; +ld.shared.f32 f131, [r10+72]; +ld.shared.f32 f132, [r10+96]; +ld.shared.f32 f133, [r10+120]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f68, f87}; +st.shared.v2.f32 [r9+8], {f97, f107}; +st.shared.v2.f32 [r9+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r10]; +ld.shared.f32 f135, [r10+24]; +ld.shared.f32 f136, [r10+48]; +ld.shared.f32 f137, [r10+72]; +ld.shared.f32 f138, [r10+96]; +ld.shared.f32 f139, [r10+120]; +add.f32 f140, f130, f132; +add.f32 f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, f138; +mul.f32 f147, f146, 0f3F5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0f3F5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 
0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0f3F5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0f3F5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0fBF5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0fBF5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0fBF5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0fBF5DB3D7, f180; +add.f32 %0, f141, f157; +add.f32 %1, f143, f159; +add.f32 %3, f154, f176; +add.f32 %2, f148, f174; +add.f32 %5, f155, f181; +add.f32 %4, f149, f179; +sub.f32 %6, f141, f157; +sub.f32 %7, f143, f159; +sub.f32 %9, f154, f176; +sub.f32 %8, f148, f174; +sub.f32 %11, f155, f181; +sub.f32 %10, f149, f179; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..4a1272e388000 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp32_inv.hpp.inc @@ -0,0 +1,404 @@ +#ifndef CUFFTDX_FFT_36_FP32_INV_PTX_HPP +#define 
CUFFTDX_FFT_36_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<387, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<206>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 288, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %19, %24; +add.f32 f26, %14, f25; +add.f32 f27, %21, %26; +add.f32 f28, %15, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %14, f29; +sub.f32 f31, %21, %26; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %15, f35; +sub.f32 f37, %19, %24; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %22, %27; +add.f32 f42, %16, f41; +add.f32 f43, %23, %28; +add.f32 f44, %18, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %16, f45; +sub.f32 f47, %23, %28; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %18, f51; +sub.f32 f53, %22, %27; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +sub.f32 f67, f26, f42; +sub.f32 f68, f28, f44; +add.f32 f69, f33, f59; +add.f32 f70, f39, f61; +sub.f32 f71, f33, f59; +sub.f32 f72, f39, f61; +add.f32 f73, f34, f64; +add.f32 f74, f40, f66; +sub.f32 f75, f34, f64; +sub.f32 f76, f40, f66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 288, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f77, f78}, [rd6]; +mul.f32 f81, f70, f78; 
+mul.f32 f82, f69, f78; +mul.f32 f83, f77, f70; +mul.f32 f84, f77, f77; +mul.f32 f85, f78, f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f78, f77; +fma.rn.f32 f88, f78, f77, f87; +mul.f32 f89, f74, f88; +mul.f32 f90, f73, f88; +mul.f32 f91, f86, f74; +mul.f32 f92, f77, f86; +mul.f32 f93, f78, f88; +sub.f32 f94, f92, f93; +mul.f32 f95, f77, f88; +fma.rn.f32 f96, f78, f86, f95; +mul.f32 f97, f68, f96; +mul.f32 f98, f67, f96; +mul.f32 f99, f94, f68; +mul.f32 f100, f77, f94; +mul.f32 f101, f78, f96; +sub.f32 f102, f100, f101; +mul.f32 f103, f77, f96; +fma.rn.f32 f104, f78, f94, f103; +mul.f32 f105, f72, f104; +mul.f32 f106, f71, f104; +mul.f32 f107, f102, f72; +mul.f32 f108, f77, f102; +mul.f32 f109, f78, f104; +sub.f32 f110, f108, f109; +mul.f32 f111, f77, f104; +fma.rn.f32 f112, f78, f102, f111; +mul.f32 f113, f76, f112; +mul.f32 f114, f75, f112; +mul.f32 f115, f110, f76; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f32 f116, f28, f44; +add.f32 f117, f26, f42; +st.shared.v2.f32 [r9], {f117, f116}; +fma.rn.f32 f118, f77, f69, f81; +sub.f32 f119, f83, f82; +st.shared.v2.f32 [r9+8], {f118, f119}; +fma.rn.f32 f120, f86, f73, f89; +sub.f32 f121, f91, f90; +st.shared.v2.f32 [r9+16], {f120, f121}; +fma.rn.f32 f122, f94, f67, f97; +sub.f32 f123, f99, f98; +st.shared.v2.f32 [r9+24], {f122, f123}; +fma.rn.f32 f124, f102, f71, f105; +sub.f32 f125, f107, f106; +st.shared.v2.f32 [r9+32], {f124, f125}; +fma.rn.f32 f126, f110, f75, f113; +sub.f32 f127, f115, f114; +st.shared.v2.f32 [r9+40], {f126, f127}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.v2.f32 {f128, f129}, [r10]; +ld.shared.v2.f32 {f132, f133}, [r10+48]; +ld.shared.v2.f32 {f136, f137}, [r10+96]; +ld.shared.v2.f32 {f140, f141}, [r10+144]; +ld.shared.v2.f32 {f144, f145}, [r10+192]; +ld.shared.v2.f32 {f148, f149}, [r10+240]; +add.f32 f152, f136, f144; +add.f32 f153, f128, f152; +add.f32 f154, f137, f145; +add.f32 f155, f129, f154; +mul.f32 f156, f152, 0f3F000000; +sub.f32 f157, f128, f156; +sub.f32 f158, 
f137, f145; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f154, 0f3F000000; +sub.f32 f163, f129, f162; +sub.f32 f164, f136, f144; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +add.f32 f168, f140, f148; +add.f32 f169, f132, f168; +add.f32 f170, f141, f149; +add.f32 f171, f133, f170; +mul.f32 f172, f168, 0f3F000000; +sub.f32 f173, f132, f172; +sub.f32 f174, f141, f149; +mul.f32 f175, f174, 0fBF5DB3D7; +add.f32 f176, f175, f173; +sub.f32 f177, f173, f175; +mul.f32 f178, f170, 0f3F000000; +sub.f32 f179, f133, f178; +sub.f32 f180, f140, f148; +mul.f32 f181, f180, 0fBF5DB3D7; +sub.f32 f182, f179, f181; +add.f32 f183, f181, f179; +mul.f32 f184, f176, 0f3F000000; +mul.f32 f185, f182, 0f3F5DB3D7; +sub.f32 f186, f184, f185; +mul.f32 f187, f182, 0f3F000000; +fma.rn.f32 f188, f176, 0f3F5DB3D7, f187; +mul.f32 f189, f177, 0fBF000000; +mul.f32 f190, f183, 0f3F5DB3D7; +sub.f32 f191, f189, f190; +mul.f32 f192, f183, 0fBF000000; +fma.rn.f32 f193, f177, 0f3F5DB3D7, f192; +add.f32 %1, f155, f171; +add.f32 %0, f153, f169; +add.f32 %3, f166, f188; +add.f32 %2, f160, f186; +add.f32 %5, f167, f193; +add.f32 %4, f161, f191; +sub.f32 %7, f155, f171; +sub.f32 %6, f153, f169; +sub.f32 %9, f166, f188; +sub.f32 %8, f160, f186; +sub.f32 %11, f167, f193; +sub.f32 %10, f161, f191; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<386, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + 
+asm volatile (R"({ +.reg .f32 f<194>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 144, r2; +mov.u32 r4, %tid.x; +add.f32 f25, %19, %24; +add.f32 f26, %14, f25; +add.f32 f27, %21, %26; +add.f32 f28, %15, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %14, f29; +sub.f32 f31, %21, %26; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %15, f35; +sub.f32 f37, %19, %24; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %22, %27; +add.f32 f42, %16, f41; +add.f32 f43, %23, %28; +add.f32 f44, %18, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %16, f45; +sub.f32 f47, %23, %28; +mul.f32 f48, f47, 0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %18, f51; +sub.f32 f53, %22, %27; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +add.f32 f67, f26, f42; +add.f32 f68, f28, f44; +sub.f32 f69, f26, f42; +sub.f32 f70, f28, f44; +add.f32 f71, f33, f59; +add.f32 f72, f39, f61; +sub.f32 f73, f33, f59; +sub.f32 f74, f39, f61; +add.f32 f75, f34, f64; +add.f32 f76, f40, f66; +sub.f32 f77, f34, f64; +sub.f32 f78, f40, f66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f79, f80}, [rd6]; +mul.f32 f83, f72, f80; +fma.rn.f32 f84, f79, f71, f83; +mul.f32 f85, f71, f80; +mul.f32 f86, f79, f72; +sub.f32 f87, f86, f85; +mul.f32 f88, f79, f79; +mul.f32 f89, f80, f80; 
+sub.f32 f90, f88, f89; +mul.f32 f91, f80, f79; +fma.rn.f32 f92, f80, f79, f91; +mul.f32 f93, f76, f92; +fma.rn.f32 f94, f90, f75, f93; +mul.f32 f95, f75, f92; +mul.f32 f96, f90, f76; +sub.f32 f97, f96, f95; +mul.f32 f98, f79, f90; +mul.f32 f99, f80, f92; +sub.f32 f100, f98, f99; +mul.f32 f101, f79, f92; +fma.rn.f32 f102, f80, f90, f101; +mul.f32 f103, f70, f102; +fma.rn.f32 f104, f100, f69, f103; +mul.f32 f105, f69, f102; +mul.f32 f106, f100, f70; +sub.f32 f107, f106, f105; +mul.f32 f108, f79, f100; +mul.f32 f109, f80, f102; +sub.f32 f110, f108, f109; +mul.f32 f111, f79, f102; +fma.rn.f32 f112, f80, f100, f111; +mul.f32 f113, f74, f112; +fma.rn.f32 f114, f110, f73, f113; +mul.f32 f115, f73, f112; +mul.f32 f116, f110, f74; +sub.f32 f117, f116, f115; +mul.f32 f118, f79, f110; +mul.f32 f119, f80, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f79, f112; +fma.rn.f32 f122, f80, f110, f121; +mul.f32 f123, f78, f122; +fma.rn.f32 f124, f120, f77, f123; +mul.f32 f125, f77, f122; +mul.f32 f126, f120, f78; +sub.f32 f127, f126, f125; +mad.lo.s32 r8, r5, 144, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.v2.f32 [r9], {f67, f84}; +st.shared.v2.f32 [r9+8], {f94, f104}; +st.shared.v2.f32 [r9+16], {f114, f124}; +barrier.sync 0; +mad.lo.s32 r10, r7, -20, r9; +ld.shared.f32 f128, [r10]; +ld.shared.f32 f129, [r10+24]; +ld.shared.f32 f130, [r10+48]; +ld.shared.f32 f131, [r10+72]; +ld.shared.f32 f132, [r10+96]; +ld.shared.f32 f133, [r10+120]; +barrier.sync 0; +st.shared.v2.f32 [r9], {f68, f87}; +st.shared.v2.f32 [r9+8], {f97, f107}; +st.shared.v2.f32 [r9+16], {f117, f127}; +barrier.sync 0; +ld.shared.f32 f134, [r10]; +ld.shared.f32 f135, [r10+24]; +ld.shared.f32 f136, [r10+48]; +ld.shared.f32 f137, [r10+72]; +ld.shared.f32 f138, [r10+96]; +ld.shared.f32 f139, [r10+120]; +add.f32 f140, f130, f132; +add.f32 f141, f128, f140; +add.f32 f142, f136, f138; +add.f32 f143, f134, f142; +mul.f32 f144, f140, 0f3F000000; +sub.f32 f145, f128, f144; +sub.f32 f146, f136, f138; +mul.f32 
f147, f146, 0fBF5DB3D7; +add.f32 f148, f147, f145; +sub.f32 f149, f145, f147; +mul.f32 f150, f142, 0f3F000000; +sub.f32 f151, f134, f150; +sub.f32 f152, f130, f132; +mul.f32 f153, f152, 0fBF5DB3D7; +sub.f32 f154, f151, f153; +add.f32 f155, f153, f151; +add.f32 f156, f131, f133; +add.f32 f157, f129, f156; +add.f32 f158, f137, f139; +add.f32 f159, f135, f158; +mul.f32 f160, f156, 0f3F000000; +sub.f32 f161, f129, f160; +sub.f32 f162, f137, f139; +mul.f32 f163, f162, 0fBF5DB3D7; +add.f32 f164, f163, f161; +sub.f32 f165, f161, f163; +mul.f32 f166, f158, 0f3F000000; +sub.f32 f167, f135, f166; +sub.f32 f168, f131, f133; +mul.f32 f169, f168, 0fBF5DB3D7; +sub.f32 f170, f167, f169; +add.f32 f171, f169, f167; +mul.f32 f172, f164, 0f3F000000; +mul.f32 f173, f170, 0f3F5DB3D7; +sub.f32 f174, f172, f173; +mul.f32 f175, f170, 0f3F000000; +fma.rn.f32 f176, f164, 0f3F5DB3D7, f175; +mul.f32 f177, f165, 0fBF000000; +mul.f32 f178, f171, 0f3F5DB3D7; +sub.f32 f179, f177, f178; +mul.f32 f180, f171, 0fBF000000; +fma.rn.f32 f181, f165, 0f3F5DB3D7, f180; +add.f32 %0, f141, f157; +add.f32 %1, f143, f159; +add.f32 %3, f154, f176; +add.f32 %2, f148, f174; +add.f32 %5, f155, f181; +add.f32 %4, f149, f179; +sub.f32 %6, f141, f157; +sub.f32 %7, f143, f159; +sub.f32 %9, f154, f176; +sub.f32 %8, f148, f174; +sub.f32 %11, f155, f181; +sub.f32 %10, f149, f179; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "r"(smem), "l"(lut_sp_6_36), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..adbc51f4c27c5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_fwd.hpp.inc @@ -0,0 +1,396 @@ +#ifndef CUFFTDX_FFT_36_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_36_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<559, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<205>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 576, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %19, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %21, %26; +add.f64 fd28, %15, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %14, fd29; +sub.f64 fd31, %21, %26; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %15, fd35; +sub.f64 fd37, %19, %24; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %22, %27; +add.f64 fd42, %16, fd41; +add.f64 fd43, %23, %28; +add.f64 fd44, %18, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %16, fd45; +sub.f64 fd47, %23, %28; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %18, fd51; +sub.f64 fd53, %22, %27; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; 
+mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 576, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd77, fd69; +mul.f64 fd82, fd78, fd70; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd86, fd73; +mul.f64 fd90, fd88, fd74; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, fd94, fd67; +mul.f64 fd98, fd96, fd68; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+96]; +mul.f64 fd104, fd100, fd71; +mul.f64 fd105, fd101, fd72; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd109, fd75; +mul.f64 fd113, fd111, fd76; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r9], {fd116, fd115}; +fma.rn.f64 fd117, fd78, fd69, fd83; +sub.f64 fd118, fd81, fd82; +st.shared.v2.f64 [r9+16], {fd118, fd117}; +fma.rn.f64 fd119, fd88, fd73, fd91; +sub.f64 fd120, fd89, fd90; +st.shared.v2.f64 [r9+32], {fd120, fd119}; +fma.rn.f64 fd121, fd96, fd67, fd99; +sub.f64 fd122, fd97, fd98; +st.shared.v2.f64 [r9+48], {fd122, fd121}; 
+fma.rn.f64 fd123, fd101, fd71, fd106; +sub.f64 fd124, fd104, fd105; +st.shared.v2.f64 [r9+64], {fd124, fd123}; +fma.rn.f64 fd125, fd111, fd75, fd114; +sub.f64 fd126, fd112, fd113; +st.shared.v2.f64 [r9+80], {fd126, fd125}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f64 {fd127, fd128}, [r10]; +ld.shared.v2.f64 {fd131, fd132}, [r10+96]; +ld.shared.v2.f64 {fd135, fd136}, [r10+192]; +ld.shared.v2.f64 {fd139, fd140}, [r10+288]; +ld.shared.v2.f64 {fd143, fd144}, [r10+384]; +ld.shared.v2.f64 {fd147, fd148}, [r10+480]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0d3FEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0d3FEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0d3FEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0dBFEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0dBFEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0dBFEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 
0dBFEBB67AE8584CAA, fd191; +add.f64 %1, fd154, fd170; +add.f64 %0, fd152, fd168; +add.f64 %3, fd165, fd187; +add.f64 %2, fd159, fd185; +add.f64 %5, fd166, fd192; +add.f64 %4, fd160, fd190; +sub.f64 %7, fd154, fd170; +sub.f64 %6, fd152, fd168; +sub.f64 %9, fd165, fd187; +sub.f64 %8, fd159, fd185; +sub.f64 %11, fd166, fd192; +sub.f64 %10, fd160, fd190; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<560, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<193>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 288, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %19, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %21, %26; +add.f64 fd28, %15, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %14, fd29; +sub.f64 fd31, %21, %26; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %15, fd35; +sub.f64 fd37, %19, %24; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %22, %27; +add.f64 fd42, %16, fd41; +add.f64 fd43, %23, %28; +add.f64 fd44, %18, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %16, fd45; +sub.f64 fd47, %23, %28; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %18, fd51; +sub.f64 fd53, %22, 
%27; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd79, fd71; +mul.f64 fd84, fd80, fd72; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd79, fd72; +fma.rn.f64 fd87, fd80, fd71, fd86; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd90, fd75; +mul.f64 fd94, fd92, fd76; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd90, fd76; +fma.rn.f64 fd97, fd92, fd75, fd96; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd100, fd69; +mul.f64 fd104, fd102, fd70; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd100, fd70; +fma.rn.f64 fd107, fd102, fd69, fd106; +ld.global.v2.f64 {fd108, fd109}, [rd6+96]; +mul.f64 fd112, fd108, fd73; +mul.f64 fd113, fd109, fd74; +sub.f64 fd114, fd112, fd113; +mul.f64 fd115, fd108, fd74; +fma.rn.f64 fd116, fd109, fd73, fd115; 
+mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd119, fd77; +mul.f64 fd123, fd121, fd78; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd119, fd78; +fma.rn.f64 fd126, fd121, fd77, fd125; +mad.lo.s32 r8, r5, 288, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v2.f64 [r9], {fd67, fd85}; +st.shared.v2.f64 [r9+16], {fd95, fd105}; +st.shared.v2.f64 [r9+32], {fd114, fd124}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f64 fd127, [r10]; +ld.shared.f64 fd128, [r10+48]; +ld.shared.f64 fd129, [r10+96]; +ld.shared.f64 fd130, [r10+144]; +ld.shared.f64 fd131, [r10+192]; +ld.shared.f64 fd132, [r10+240]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd68, fd87}; +st.shared.v2.f64 [r9+16], {fd97, fd107}; +st.shared.v2.f64 [r9+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r10]; +ld.shared.f64 fd134, [r10+48]; +ld.shared.f64 fd135, [r10+96]; +ld.shared.f64 fd136, [r10+144]; +ld.shared.f64 fd137, [r10+192]; +ld.shared.f64 fd138, [r10+240]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0d3FEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; +sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; 
+sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0d3FEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0dBFEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0dBFEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; +mul.f64 fd177, fd170, 0dBFEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0dBFEBB67AE8584CAA, fd179; +add.f64 %0, fd140, fd156; +add.f64 %1, fd142, fd158; +add.f64 %3, fd153, fd175; +add.f64 %2, fd147, fd173; +add.f64 %5, fd154, fd180; +add.f64 %4, fd148, fd178; +sub.f64 %6, fd140, fd156; +sub.f64 %7, fd142, fd158; +sub.f64 %9, fd153, fd175; +sub.f64 %8, fd147, fd173; +sub.f64 %11, fd154, fd180; +sub.f64 %10, fd148, fd178; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e92fe274a82c6 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_36_fp64_inv.hpp.inc @@ -0,0 +1,396 @@ +#ifndef CUFFTDX_FFT_36_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_36_FP64_INV_PTX_HPP + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<730, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<205>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 576, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %19, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %21, %26; +add.f64 fd28, %15, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %14, fd29; +sub.f64 fd31, %21, %26; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %15, fd35; +sub.f64 fd37, %19, %24; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %22, %27; +add.f64 fd42, %16, fd41; +add.f64 fd43, %23, %28; +add.f64 fd44, %18, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %16, fd45; +sub.f64 fd47, %23, %28; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %18, fd51; +sub.f64 fd53, %22, %27; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +sub.f64 fd67, fd26, fd42; +sub.f64 fd68, fd28, fd44; +add.f64 fd69, fd33, fd59; +add.f64 fd70, fd39, fd61; +sub.f64 fd71, fd33, fd59; +sub.f64 fd72, fd39, fd61; +add.f64 fd73, fd34, fd64; +add.f64 fd74, fd40, fd66; +sub.f64 fd75, fd34, fd64; +sub.f64 fd76, fd40, fd66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; 
+mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 576, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd77, fd78}, [rd6]; +mul.f64 fd81, fd70, fd78; +mul.f64 fd82, fd69, fd78; +mul.f64 fd83, fd77, fd70; +mul.f64 fd84, fd77, fd77; +mul.f64 fd85, fd78, fd78; +sub.f64 fd86, fd84, fd85; +mul.f64 fd87, fd78, fd77; +fma.rn.f64 fd88, fd78, fd77, fd87; +mul.f64 fd89, fd74, fd88; +mul.f64 fd90, fd73, fd88; +mul.f64 fd91, fd86, fd74; +mul.f64 fd92, fd77, fd86; +mul.f64 fd93, fd78, fd88; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd77, fd88; +fma.rn.f64 fd96, fd78, fd86, fd95; +mul.f64 fd97, fd68, fd96; +mul.f64 fd98, fd67, fd96; +mul.f64 fd99, fd94, fd68; +ld.global.v2.f64 {fd100, fd101}, [rd6+96]; +mul.f64 fd104, fd72, fd101; +mul.f64 fd105, fd71, fd101; +mul.f64 fd106, fd100, fd72; +mul.f64 fd107, fd77, fd100; +mul.f64 fd108, fd78, fd101; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd77, fd101; +fma.rn.f64 fd111, fd78, fd100, fd110; +mul.f64 fd112, fd76, fd111; +mul.f64 fd113, fd75, fd111; +mul.f64 fd114, fd109, fd76; +barrier.sync 0; +mad.lo.s32 r9, r7, 96, r8; +add.f64 fd115, fd28, fd44; +add.f64 fd116, fd26, fd42; +st.shared.v2.f64 [r9], {fd116, fd115}; +fma.rn.f64 fd117, fd77, fd69, fd81; +sub.f64 fd118, fd83, fd82; +st.shared.v2.f64 [r9+16], {fd117, fd118}; +fma.rn.f64 fd119, fd86, fd73, fd89; +sub.f64 fd120, fd91, fd90; +st.shared.v2.f64 [r9+32], {fd119, fd120}; +fma.rn.f64 fd121, fd94, fd67, fd97; +sub.f64 fd122, fd99, fd98; +st.shared.v2.f64 [r9+48], {fd121, fd122}; +fma.rn.f64 fd123, fd100, fd71, fd104; +sub.f64 fd124, fd106, fd105; +st.shared.v2.f64 [r9+64], {fd123, fd124}; +fma.rn.f64 fd125, fd109, fd75, fd112; +sub.f64 fd126, fd114, fd113; +st.shared.v2.f64 [r9+80], {fd125, fd126}; +barrier.sync 0; +mad.lo.s32 r10, r7, -80, r9; +ld.shared.v2.f64 {fd127, fd128}, [r10]; +ld.shared.v2.f64 {fd131, fd132}, [r10+96]; +ld.shared.v2.f64 {fd135, fd136}, [r10+192]; +ld.shared.v2.f64 {fd139, fd140}, [r10+288]; 
+ld.shared.v2.f64 {fd143, fd144}, [r10+384]; +ld.shared.v2.f64 {fd147, fd148}, [r10+480]; +add.f64 fd151, fd135, fd143; +add.f64 fd152, fd127, fd151; +add.f64 fd153, fd136, fd144; +add.f64 fd154, fd128, fd153; +mul.f64 fd155, fd151, 0d3FE0000000000000; +sub.f64 fd156, fd127, fd155; +sub.f64 fd157, fd136, fd144; +mul.f64 fd158, fd157, 0dBFEBB67AE8584CAA; +add.f64 fd159, fd158, fd156; +sub.f64 fd160, fd156, fd158; +mul.f64 fd161, fd153, 0d3FE0000000000000; +sub.f64 fd162, fd128, fd161; +sub.f64 fd163, fd135, fd143; +mul.f64 fd164, fd163, 0dBFEBB67AE8584CAA; +sub.f64 fd165, fd162, fd164; +add.f64 fd166, fd164, fd162; +add.f64 fd167, fd139, fd147; +add.f64 fd168, fd131, fd167; +add.f64 fd169, fd140, fd148; +add.f64 fd170, fd132, fd169; +mul.f64 fd171, fd167, 0d3FE0000000000000; +sub.f64 fd172, fd131, fd171; +sub.f64 fd173, fd140, fd148; +mul.f64 fd174, fd173, 0dBFEBB67AE8584CAA; +add.f64 fd175, fd174, fd172; +sub.f64 fd176, fd172, fd174; +mul.f64 fd177, fd169, 0d3FE0000000000000; +sub.f64 fd178, fd132, fd177; +sub.f64 fd179, fd139, fd147; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +sub.f64 fd181, fd178, fd180; +add.f64 fd182, fd180, fd178; +mul.f64 fd183, fd175, 0d3FE0000000000000; +mul.f64 fd184, fd181, 0d3FEBB67AE8584CAA; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd181, 0d3FE0000000000000; +fma.rn.f64 fd187, fd175, 0d3FEBB67AE8584CAA, fd186; +mul.f64 fd188, fd176, 0dBFE0000000000000; +mul.f64 fd189, fd182, 0d3FEBB67AE8584CAA; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd182, 0dBFE0000000000000; +fma.rn.f64 fd192, fd176, 0d3FEBB67AE8584CAA, fd191; +add.f64 %1, fd154, fd170; +add.f64 %0, fd152, fd168; +add.f64 %3, fd165, fd187; +add.f64 %2, fd159, fd185; +add.f64 %5, fd166, fd192; +add.f64 %4, fd160, fd190; +sub.f64 %7, fd154, fd170; +sub.f64 %6, fd152, fd168; +sub.f64 %9, fd165, fd187; +sub.f64 %8, fd159, fd185; +sub.f64 %11, fd166, fd192; +sub.f64 %10, fd160, fd190; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), 
"=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<731, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<193>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %12; +mad.lo.s32 r3, r1, 288, r2; +mov.u32 r4, %tid.x; +add.f64 fd25, %19, %24; +add.f64 fd26, %14, fd25; +add.f64 fd27, %21, %26; +add.f64 fd28, %15, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %14, fd29; +sub.f64 fd31, %21, %26; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %15, fd35; +sub.f64 fd37, %19, %24; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %22, %27; +add.f64 fd42, %16, fd41; +add.f64 fd43, %23, %28; +add.f64 fd44, %18, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %16, fd45; +sub.f64 fd47, %23, %28; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %18, fd51; +sub.f64 fd53, %22, %27; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 
0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +add.f64 fd67, fd26, fd42; +add.f64 fd68, fd28, fd44; +sub.f64 fd69, fd26, fd42; +sub.f64 fd70, fd28, fd44; +add.f64 fd71, fd33, fd59; +add.f64 fd72, fd39, fd61; +sub.f64 fd73, fd33, fd59; +sub.f64 fd74, fd39, fd61; +add.f64 fd75, fd34, fd64; +add.f64 fd76, fd40, fd66; +sub.f64 fd77, fd34, fd64; +sub.f64 fd78, fd40, fd66; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 34; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 6; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %13; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd79, fd80}, [rd6]; +mul.f64 fd83, fd72, fd80; +fma.rn.f64 fd84, fd79, fd71, fd83; +mul.f64 fd85, fd71, fd80; +mul.f64 fd86, fd79, fd72; +sub.f64 fd87, fd86, fd85; +mul.f64 fd88, fd79, fd79; +mul.f64 fd89, fd80, fd80; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd80, fd79; +fma.rn.f64 fd92, fd80, fd79, fd91; +mul.f64 fd93, fd76, fd92; +fma.rn.f64 fd94, fd90, fd75, fd93; +mul.f64 fd95, fd75, fd92; +mul.f64 fd96, fd90, fd76; +sub.f64 fd97, fd96, fd95; +mul.f64 fd98, fd79, fd90; +mul.f64 fd99, fd80, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd79, fd92; +fma.rn.f64 fd102, fd80, fd90, fd101; +mul.f64 fd103, fd70, fd102; +fma.rn.f64 fd104, fd100, fd69, fd103; +mul.f64 fd105, fd69, fd102; +mul.f64 fd106, fd100, fd70; +sub.f64 fd107, fd106, fd105; +ld.global.v2.f64 {fd108, fd109}, [rd6+96]; +mul.f64 fd112, fd74, fd109; +fma.rn.f64 fd113, fd108, fd73, fd112; +mul.f64 fd114, fd73, fd109; +mul.f64 fd115, fd108, fd74; +sub.f64 fd116, fd115, fd114; +mul.f64 fd117, fd79, fd108; +mul.f64 fd118, fd80, fd109; +sub.f64 fd119, fd117, fd118; +mul.f64 fd120, fd79, fd109; +fma.rn.f64 fd121, fd80, fd108, fd120; +mul.f64 fd122, fd78, fd121; +fma.rn.f64 fd123, fd119, fd77, fd122; +mul.f64 fd124, fd77, fd121; +mul.f64 fd125, fd119, fd78; +sub.f64 fd126, fd125, fd124; +mad.lo.s32 r8, r5, 288, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +st.shared.v2.f64 [r9], {fd67, fd84}; 
+st.shared.v2.f64 [r9+16], {fd94, fd104}; +st.shared.v2.f64 [r9+32], {fd113, fd123}; +barrier.sync 0; +mad.lo.s32 r10, r7, -40, r9; +ld.shared.f64 fd127, [r10]; +ld.shared.f64 fd128, [r10+48]; +ld.shared.f64 fd129, [r10+96]; +ld.shared.f64 fd130, [r10+144]; +ld.shared.f64 fd131, [r10+192]; +ld.shared.f64 fd132, [r10+240]; +barrier.sync 0; +st.shared.v2.f64 [r9], {fd68, fd87}; +st.shared.v2.f64 [r9+16], {fd97, fd107}; +st.shared.v2.f64 [r9+32], {fd116, fd126}; +barrier.sync 0; +ld.shared.f64 fd133, [r10]; +ld.shared.f64 fd134, [r10+48]; +ld.shared.f64 fd135, [r10+96]; +ld.shared.f64 fd136, [r10+144]; +ld.shared.f64 fd137, [r10+192]; +ld.shared.f64 fd138, [r10+240]; +add.f64 fd139, fd129, fd131; +add.f64 fd140, fd127, fd139; +add.f64 fd141, fd135, fd137; +add.f64 fd142, fd133, fd141; +mul.f64 fd143, fd139, 0d3FE0000000000000; +sub.f64 fd144, fd127, fd143; +sub.f64 fd145, fd135, fd137; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +add.f64 fd147, fd146, fd144; +sub.f64 fd148, fd144, fd146; +mul.f64 fd149, fd141, 0d3FE0000000000000; +sub.f64 fd150, fd133, fd149; +sub.f64 fd151, fd129, fd131; +mul.f64 fd152, fd151, 0dBFEBB67AE8584CAA; +sub.f64 fd153, fd150, fd152; +add.f64 fd154, fd152, fd150; +add.f64 fd155, fd130, fd132; +add.f64 fd156, fd128, fd155; +add.f64 fd157, fd136, fd138; +add.f64 fd158, fd134, fd157; +mul.f64 fd159, fd155, 0d3FE0000000000000; +sub.f64 fd160, fd128, fd159; +sub.f64 fd161, fd136, fd138; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +add.f64 fd163, fd162, fd160; +sub.f64 fd164, fd160, fd162; +mul.f64 fd165, fd157, 0d3FE0000000000000; +sub.f64 fd166, fd134, fd165; +sub.f64 fd167, fd130, fd132; +mul.f64 fd168, fd167, 0dBFEBB67AE8584CAA; +sub.f64 fd169, fd166, fd168; +add.f64 fd170, fd168, fd166; +mul.f64 fd171, fd163, 0d3FE0000000000000; +mul.f64 fd172, fd169, 0d3FEBB67AE8584CAA; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd169, 0d3FE0000000000000; +fma.rn.f64 fd175, fd163, 0d3FEBB67AE8584CAA, fd174; +mul.f64 fd176, fd164, 0dBFE0000000000000; 
+mul.f64 fd177, fd170, 0d3FEBB67AE8584CAA; +sub.f64 fd178, fd176, fd177; +mul.f64 fd179, fd170, 0dBFE0000000000000; +fma.rn.f64 fd180, fd164, 0d3FEBB67AE8584CAA, fd179; +add.f64 %0, fd140, fd156; +add.f64 %1, fd142, fd158; +add.f64 %3, fd153, fd175; +add.f64 %2, fd147, fd173; +add.f64 %5, fd154, fd180; +add.f64 %4, fd148, fd178; +sub.f64 %6, fd140, fd156; +sub.f64 %7, fd142, fd158; +sub.f64 %9, fd153, fd175; +sub.f64 %8, fd147, fd173; +sub.f64 %11, fd154, fd180; +sub.f64 %10, fd148, fd178; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "r"(smem), "l"(lut_dp_6_36), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..898c2e6998c11 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_fwd.hpp.inc @@ -0,0 +1,115 @@ +#ifndef CUFFTDX_FFT_3_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_3_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<863, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<5>; +.reg .b32 r<85>; +.reg .f64 fd<3>; +.reg .b64 rd<2>; +mov.f64 fd1, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd1; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd2, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd2; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, 
{rs3, rs3}; +{ +add.f16x2 r1, %6, %7; +} +{ +add.f16x2 %0, %8, r1; +} +{ +add.f16x2 r7, %9, %10; +} +{ +add.f16x2 %1, %11, r7; +} +{ +add.f16x2 r13, %6, %7; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %8, r16; +} +{ +sub.f16x2 r22, %9, %10; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 %2, r19, r25; +} +{ +add.f16x2 r31, %6, %7; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %8, r34; +} +{ +sub.f16x2 r40, %9, %10; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 %4, r37, r43; +} +{ +add.f16x2 r49, %9, %10; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %11, r52; +} +{ +sub.f16x2 r58, %6, %7; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 %3, r55, r61; +} +{ +add.f16x2 r67, %9, %10; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %11, r70; +} +{ +sub.f16x2 r76, %6, %7; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 %5, r73, r79; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[0].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..a2041121ca56b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp16_inv.hpp.inc @@ -0,0 +1,112 @@ +#ifndef CUFFTDX_FFT_3_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_3_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1065, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + 
+asm volatile (R"({ +.reg .b16 rs<3>; +.reg .b32 r<85>; +.reg .f64 fd<3>; +.reg .b64 rd<2>; +mov.f64 fd1, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd1; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd2, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd2; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %6, %7; +} +{ +add.f16x2 %0, %8, r1; +} +{ +add.f16x2 r7, %9, %10; +} +{ +add.f16x2 %1, %11, r7; +} +{ +add.f16x2 r13, %6, %7; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %8, r16; +} +{ +sub.f16x2 r22, %9, %10; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 %2, r19, r25; +} +{ +add.f16x2 r31, %6, %7; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %8, r34; +} +{ +sub.f16x2 r40, %9, %10; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 %4, r37, r43; +} +{ +add.f16x2 r49, %9, %10; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %11, r52; +} +{ +sub.f16x2 r58, %6, %7; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 %3, r55, r61; +} +{ +add.f16x2 r67, %9, %10; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %11, r70; +} +{ +sub.f16x2 r76, %6, %7; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 %5, r73, r79; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[0].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..95a1b426a79d5 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_fwd.hpp.inc @@ -0,0 +1,32 @@ +#ifndef CUFFTDX_FFT_3_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_3_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<117, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<29>; +.reg .b64 rd<2>; +add.f32 f13, %8, %11; +add.f32 f14, %10, %12; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %6, f15; +sub.f32 f17, %10, %12; +mul.f32 f18, f17, 0f3F5DB3D7; +mul.f32 f19, f14, 0f3F000000; +sub.f32 f20, %7, f19; +sub.f32 f21, %8, %11; +mul.f32 f22, f21, 0f3F5DB3D7; +add.f32 %1, %7, f14; +add.f32 %0, %6, f13; +sub.f32 %3, f20, f22; +add.f32 %2, f18, f16; +add.f32 %5, f22, f20; +sub.f32 %4, f16, f18; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..2f93ca0c2d843 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp32_inv.hpp.inc @@ -0,0 +1,32 @@ +#ifndef CUFFTDX_FFT_3_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_3_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<319, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<29>; +.reg .b64 rd<2>; +add.f32 f13, %8, %11; +add.f32 f14, %10, %12; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %6, f15; +sub.f32 f17, %10, %12; +mul.f32 f18, f17, 0fBF5DB3D7; +mul.f32 f19, f14, 
0f3F000000; +sub.f32 f20, %7, f19; +sub.f32 f21, %8, %11; +mul.f32 f22, f21, 0fBF5DB3D7; +add.f32 %1, %7, f14; +add.f32 %0, %6, f13; +sub.f32 %3, f20, f22; +add.f32 %2, f18, f16; +add.f32 %5, f22, f20; +sub.f32 %4, f16, f18; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..92b4c2f9d036d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_fwd.hpp.inc @@ -0,0 +1,32 @@ +#ifndef CUFFTDX_FFT_3_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_3_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<500, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<29>; +.reg .b64 rd<2>; +add.f64 fd13, %8, %11; +add.f64 fd14, %10, %12; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %6, fd15; +sub.f64 fd17, %10, %12; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +mul.f64 fd19, fd14, 0d3FE0000000000000; +sub.f64 fd20, %7, fd19; +sub.f64 fd21, %8, %11; +mul.f64 fd22, fd21, 0d3FEBB67AE8584CAA; +add.f64 %1, %7, fd14; +add.f64 %0, %6, fd13; +sub.f64 %3, fd20, fd22; +add.f64 %2, fd18, fd16; +add.f64 %5, fd22, fd20; +sub.f64 %4, fd16, fd18; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..7386c5fd06d6b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_3_fp64_inv.hpp.inc @@ -0,0 +1,32 @@ +#ifndef CUFFTDX_FFT_3_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_3_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<671, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<29>; +.reg .b64 rd<2>; +add.f64 fd13, %8, %11; +add.f64 fd14, %10, %12; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %6, fd15; +sub.f64 fd17, %10, %12; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +mul.f64 fd19, fd14, 0d3FE0000000000000; +sub.f64 fd20, %7, fd19; +sub.f64 fd21, %8, %11; +mul.f64 fd22, fd21, 0dBFEBB67AE8584CAA; +add.f64 %1, %7, fd14; +add.f64 %0, %6, fd13; +sub.f64 %3, fd20, fd22; +add.f64 %2, fd18, fd16; +add.f64 %5, fd22, fd20; +sub.f64 %4, fd16, fd18; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..3b53ff6138802 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_fwd.hpp.inc @@ -0,0 +1,29864 @@ +#ifndef CUFFTDX_FFT_4096_FP16_FWD_PTX_HPP +#define 
CUFFTDX_FFT_4096_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<847, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<423>; +.reg .b32 r<2985>; +.reg .b64 rd<2>; +mov.u32 r2965, %tid.y; +shl.b32 r2966, r2965, 14; +mov.u32 r2967, %32; +add.s32 r2968, r2967, r2966; +mov.u32 r2969, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 
0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, 
r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 
0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 
r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2970, r2969, 255; +shl.b32 r2971, r2969, 6; +and.b32 r2972, r2971, -16384; +add.s32 r2973, r2968, r2972; +cvt.rn.f32.u32 f417, r2970; +mul.f32 f418, f417, 0f3AC90FDB; +cos.approx.f32 f117, f418; +sin.approx.f32 f419, f418; +neg.f32 f118, f419; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, 
low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ 
+mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 
r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2974, r2971, 16320; +add.s32 r2975, r2973, r2974; +st.shared.v4.f32 [r2975], {r521, r629, r666, r703}; +st.shared.v4.f32 [r2975+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r2975+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r2975+48], {r1036, r1073, r1110, 
r1147}; +barrier.sync 0; +mad.lo.s32 r2976, r2970, -60, r2975; +ld.shared.u32 r1176, [r2976]; +ld.shared.u32 r1372, [r2976+1024]; +ld.shared.u32 r1226, [r2976+2048]; +ld.shared.u32 r1422, [r2976+3072]; +ld.shared.u32 r1188, [r2976+4096]; +ld.shared.u32 r1384, [r2976+5120]; +ld.shared.u32 r1238, [r2976+6144]; +ld.shared.u32 r1434, [r2976+7168]; +ld.shared.u32 r1177, [r2976+8192]; +ld.shared.u32 r1373, [r2976+9216]; +ld.shared.u32 r1227, [r2976+10240]; +ld.shared.u32 r1423, [r2976+11264]; +ld.shared.u32 r1189, [r2976+12288]; +ld.shared.u32 r1385, [r2976+13312]; +ld.shared.u32 r1239, [r2976+14336]; +ld.shared.u32 r1435, [r2976+15360]; +barrier.sync 0; +st.shared.v4.f32 [r2975], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2975+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2975+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2975+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2976]; +ld.shared.u32 r1375, [r2976+1024]; +ld.shared.u32 r1229, [r2976+2048]; +ld.shared.u32 r1425, [r2976+3072]; +ld.shared.u32 r1191, [r2976+4096]; +ld.shared.u32 r1387, [r2976+5120]; +ld.shared.u32 r1241, [r2976+6144]; +ld.shared.u32 r1437, [r2976+7168]; +ld.shared.u32 r1180, [r2976+8192]; +ld.shared.u32 r1376, [r2976+9216]; +ld.shared.u32 r1230, [r2976+10240]; +ld.shared.u32 r1426, [r2976+11264]; +ld.shared.u32 r1192, [r2976+12288]; +ld.shared.u32 r1388, [r2976+13312]; +ld.shared.u32 r1242, [r2976+14336]; +ld.shared.u32 r1438, [r2976+15360]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; 
+} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, 
r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, 
r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 
r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, 
r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2977, r2969, 240; +bfe.u32 r2978, r2969, 4, 4; +shl.b32 r2979, r2969, 2; +and.b32 r2980, r2979, 60; +add.s32 r2981, r2973, r2980; +cvt.rn.f32.u32 f420, r2978; +mul.f32 f421, f420, 0f3CC90FDB; +cos.approx.f32 f267, f421; +sin.approx.f32 f422, f421; +neg.f32 f268, f422; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 
r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 
r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ 
+mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 
r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r2982, r2971, 15360; +add.s32 r2983, r2981, r2982; +st.shared.u32 [r2983], r1695; +st.shared.u32 [r2983+64], r1803; +st.shared.u32 [r2983+128], r1840; +st.shared.u32 [r2983+192], r1877; +st.shared.u32 
[r2983+256], r1914; +st.shared.u32 [r2983+320], r1951; +st.shared.u32 [r2983+384], r1988; +st.shared.u32 [r2983+448], r2025; +st.shared.u32 [r2983+512], r2062; +st.shared.u32 [r2983+576], r2099; +st.shared.u32 [r2983+640], r2136; +st.shared.u32 [r2983+704], r2173; +st.shared.u32 [r2983+768], r2210; +st.shared.u32 [r2983+832], r2247; +st.shared.u32 [r2983+896], r2284; +st.shared.u32 [r2983+960], r2321; +barrier.sync 0; +mad.lo.s32 r2984, r2977, -60, r2983; +ld.shared.u32 r2350, [r2984]; +ld.shared.u32 r2546, [r2984+1024]; +ld.shared.u32 r2400, [r2984+2048]; +ld.shared.u32 r2596, [r2984+3072]; +ld.shared.u32 r2362, [r2984+4096]; +ld.shared.u32 r2558, [r2984+5120]; +ld.shared.u32 r2412, [r2984+6144]; +ld.shared.u32 r2608, [r2984+7168]; +ld.shared.u32 r2351, [r2984+8192]; +ld.shared.u32 r2547, [r2984+9216]; +ld.shared.u32 r2401, [r2984+10240]; +ld.shared.u32 r2597, [r2984+11264]; +ld.shared.u32 r2363, [r2984+12288]; +ld.shared.u32 r2559, [r2984+13312]; +ld.shared.u32 r2413, [r2984+14336]; +ld.shared.u32 r2609, [r2984+15360]; +barrier.sync 0; +st.shared.u32 [r2983], r1698; +st.shared.u32 [r2983+64], r1810; +st.shared.u32 [r2983+128], r1847; +st.shared.u32 [r2983+192], r1884; +st.shared.u32 [r2983+256], r1921; +st.shared.u32 [r2983+320], r1958; +st.shared.u32 [r2983+384], r1995; +st.shared.u32 [r2983+448], r2032; +st.shared.u32 [r2983+512], r2069; +st.shared.u32 [r2983+576], r2106; +st.shared.u32 [r2983+640], r2143; +st.shared.u32 [r2983+704], r2180; +st.shared.u32 [r2983+768], r2217; +st.shared.u32 [r2983+832], r2254; +st.shared.u32 [r2983+896], r2291; +st.shared.u32 [r2983+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2984]; +ld.shared.u32 r2549, [r2984+1024]; +ld.shared.u32 r2403, [r2984+2048]; +ld.shared.u32 r2599, [r2984+3072]; +ld.shared.u32 r2365, [r2984+4096]; +ld.shared.u32 r2561, [r2984+5120]; +ld.shared.u32 r2415, [r2984+6144]; +ld.shared.u32 r2611, [r2984+7168]; +ld.shared.u32 r2354, [r2984+8192]; +ld.shared.u32 r2550, [r2984+9216]; +ld.shared.u32 
r2404, [r2984+10240]; +ld.shared.u32 r2600, [r2984+11264]; +ld.shared.u32 r2366, [r2984+12288]; +ld.shared.u32 r2562, [r2984+13312]; +ld.shared.u32 r2416, [r2984+14336]; +ld.shared.u32 r2612, [r2984+15360]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, 
r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 
r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; 
+} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 %0, r2497, r2693; +} +{ +add.f16x2 %1, r2500, r2696; +} +{ +sub.f16x2 %16, r2497, r2693; +} +{ +sub.f16x2 %17, r2500, r2696; +} +{ +add.f16x2 %2, r2509, r2777; +} +{ +add.f16x2 %3, r2512, r2783; +} +{ +sub.f16x2 %18, r2509, r2777; +} +{ +sub.f16x2 %19, r2512, r2783; +} +{ +add.f16x2 %4, r2521, r2793; +} +{ +add.f16x2 %5, r2524, r2799; +} +{ +sub.f16x2 %20, r2521, r2793; +} +{ +sub.f16x2 %21, r2524, r2799; +} +{ +add.f16x2 %6, r2533, r2809; +} +{ +add.f16x2 %7, r2536, r2815; +} +{ +sub.f16x2 %22, r2533, r2809; +} +{ +sub.f16x2 %23, r2536, r2815; +} +{ +add.f16x2 %8, r2503, r2702; +} +{ +add.f16x2 %9, r2506, r2819; +} +{ +sub.f16x2 %24, r2503, r2702; +} +{ +sub.f16x2 %25, r2506, r2819; +} +{ +add.f16x2 %10, r2515, r2827; +} +{ +add.f16x2 %11, r2518, r2833; +} +{ +sub.f16x2 %26, r2515, r2827; +} +{ +sub.f16x2 %27, r2518, r2833; +} +{ +add.f16x2 %12, r2527, r2843; +} +{ +add.f16x2 %13, r2530, r2849; +} +{ +sub.f16x2 %28, r2527, r2843; +} +{ +sub.f16x2 %29, r2530, r2849; +} +{ +add.f16x2 %14, r2539, r2859; +} +{ +add.f16x2 %15, r2542, r2865; +} +{ +sub.f16x2 %30, r2539, r2859; +} +{ +sub.f16x2 %31, r2542, r2865; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), 
"=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<849, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<176>; +.reg .b32 r<1598>; +.reg .b64 rd<2>; +mov.u32 r1571, %tid.y; +shl.b32 r1572, r1571, 14; +mov.u32 r1573, %16; +add.s32 r1574, r1573, r1572; +mov.u32 r1575, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ 
+sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f140, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r101, {low, high}; +} +mov.f32 f150, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, 
r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1576, r1575, 511; +shl.b32 r1577, r1575, 5; +and.b32 r1578, r1577, -16384; +add.s32 r1579, r1574, r1578; +cvt.rn.f32.u32 f167, r1576; +mul.f32 f168, f167, 0f3AC90FDB; +cos.approx.f32 f29, f168; +sin.approx.f32 f169, f168; +neg.f32 f30, f169; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, 
r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, 
r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1580, r1577, 16352; +add.s32 r1581, r1579, r1580; +st.shared.v4.f32 [r1581], {r149, r209, r246, r283}; +st.shared.v4.f32 [r1581+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1582, r1576, -28, r1581; +ld.shared.u32 r460, [r1582]; +ld.shared.u32 r510, [r1582+2048]; +ld.shared.u32 r472, [r1582+4096]; +ld.shared.u32 r522, [r1582+6144]; +ld.shared.u32 r461, [r1582+8192]; +ld.shared.u32 r511, [r1582+10240]; +ld.shared.u32 r473, [r1582+12288]; +ld.shared.u32 r523, [r1582+14336]; +barrier.sync 0; +st.shared.v4.f32 [r1581], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1581+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1582]; +ld.shared.u32 r513, [r1582+2048]; +ld.shared.u32 r475, [r1582+4096]; +ld.shared.u32 r525, [r1582+6144]; +ld.shared.u32 r464, [r1582+8192]; +ld.shared.u32 r514, [r1582+10240]; +ld.shared.u32 r476, [r1582+12288]; +ld.shared.u32 r526, [r1582+14336]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, 
r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1583, r1575, 504; +bfe.u32 r1584, r1575, 3, 6; +shl.b32 r1585, r1575, 2; +and.b32 r1586, r1585, 28; +add.s32 r1587, r1579, r1586; +cvt.rn.f32.u32 f170, r1584; +mul.f32 f171, f170, 0f3C490FDB; +cos.approx.f32 f75, f171; +sin.approx.f32 f172, f171; +neg.f32 f76, f172; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 
r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; 
+cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1588, r1577, 16128; +add.s32 r1589, r1587, r1588; +st.shared.u32 [r1589], r607; +st.shared.u32 [r1589+32], r667; +st.shared.u32 [r1589+64], r704; +st.shared.u32 [r1589+96], r741; +st.shared.u32 [r1589+128], r778; +st.shared.u32 [r1589+160], r815; +st.shared.u32 [r1589+192], r852; +st.shared.u32 [r1589+224], r889; +barrier.sync 0; +mad.lo.s32 r1590, r1583, -28, r1589; +ld.shared.u32 r918, [r1590]; +ld.shared.u32 r968, [r1590+2048]; +ld.shared.u32 r930, [r1590+4096]; +ld.shared.u32 r980, [r1590+6144]; +ld.shared.u32 r919, 
[r1590+8192]; +ld.shared.u32 r969, [r1590+10240]; +ld.shared.u32 r931, [r1590+12288]; +ld.shared.u32 r981, [r1590+14336]; +barrier.sync 0; +st.shared.u32 [r1589], r610; +st.shared.u32 [r1589+32], r674; +st.shared.u32 [r1589+64], r711; +st.shared.u32 [r1589+96], r748; +st.shared.u32 [r1589+128], r785; +st.shared.u32 [r1589+160], r822; +st.shared.u32 [r1589+192], r859; +st.shared.u32 [r1589+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1590]; +ld.shared.u32 r971, [r1590+2048]; +ld.shared.u32 r933, [r1590+4096]; +ld.shared.u32 r983, [r1590+6144]; +ld.shared.u32 r922, [r1590+8192]; +ld.shared.u32 r972, [r1590+10240]; +ld.shared.u32 r934, [r1590+12288]; +ld.shared.u32 r984, [r1590+14336]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1017, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1591, r1575, 448; +bfe.u32 r1592, r1575, 6, 3; +and.b32 r1593, r1585, 252; +add.s32 r1594, r1579, r1593; +cvt.rn.f32.u32 f173, r1592; +mul.f32 f174, f173, 0f3DC90FDB; +cos.approx.f32 f121, f174; +sin.approx.f32 f175, f174; +neg.f32 f122, f175; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, 
r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 
r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +barrier.sync 0; +and.b32 r1595, r1577, 14336; +add.s32 r1596, r1594, r1595; +st.shared.u32 [r1596], r1065; +st.shared.u32 [r1596+256], r1125; +st.shared.u32 [r1596+512], r1162; +st.shared.u32 [r1596+768], r1199; +st.shared.u32 [r1596+1024], r1236; +st.shared.u32 [r1596+1280], r1273; +st.shared.u32 [r1596+1536], r1310; +st.shared.u32 [r1596+1792], r1347; +barrier.sync 0; +mad.lo.s32 r1597, r1591, -28, r1596; +ld.shared.u32 r1376, [r1597]; +ld.shared.u32 r1426, [r1597+2048]; +ld.shared.u32 r1388, [r1597+4096]; +ld.shared.u32 r1438, [r1597+6144]; +ld.shared.u32 r1377, [r1597+8192]; +ld.shared.u32 r1427, [r1597+10240]; +ld.shared.u32 r1389, [r1597+12288]; +ld.shared.u32 r1439, 
[r1597+14336]; +barrier.sync 0; +st.shared.u32 [r1596], r1068; +st.shared.u32 [r1596+256], r1132; +st.shared.u32 [r1596+512], r1169; +st.shared.u32 [r1596+768], r1206; +st.shared.u32 [r1596+1024], r1243; +st.shared.u32 [r1596+1280], r1280; +st.shared.u32 [r1596+1536], r1317; +st.shared.u32 [r1596+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1597]; +ld.shared.u32 r1429, [r1597+2048]; +ld.shared.u32 r1391, [r1597+4096]; +ld.shared.u32 r1441, [r1597+6144]; +ld.shared.u32 r1380, [r1597+8192]; +ld.shared.u32 r1430, [r1597+10240]; +ld.shared.u32 r1392, [r1597+12288]; +ld.shared.u32 r1442, [r1597+14336]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1393; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1396; +} +{ +add.f16x2 r1416, r1384, r1399; +} +{ +sub.f16x2 r1419, r1381, r1396; +} +{ +sub.f16x2 r1422, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1446; +} +{ +add.f16x2 r1466, r1434, r1449; +} +{ +sub.f16x2 r1469, r1431, r1446; +} +{ +sub.f16x2 r1472, r1434, r1449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1475, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1457; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 %0, r1401, r1451; +} +{ +add.f16x2 %1, r1404, r1454; +} +{ +sub.f16x2 %8, r1401, r1451; +} +{ +sub.f16x2 %9, r1404, r1454; +} +{ +add.f16x2 %2, r1413, r1495; +} +{ +add.f16x2 %3, r1416, r1501; +} +{ +sub.f16x2 %10, r1413, r1495; +} +{ +sub.f16x2 %11, r1416, r1501; +} +{ +add.f16x2 %4, r1407, r1460; +} +{ +add.f16x2 %5, r1410, r1505; +} +{ +sub.f16x2 %12, r1407, r1460; +} +{ +sub.f16x2 %13, r1410, r1505; +} +{ +add.f16x2 %6, r1419, r1513; +} +{ +add.f16x2 %7, r1422, r1519; +} +{ +sub.f16x2 %14, r1419, r1513; +} +{ +sub.f16x2 %15, r1422, r1519; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), 
"r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<850, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<423>; +.reg .b32 r<2985>; +.reg .b64 rd<2>; +mov.u32 r2965, %tid.y; +shl.b32 r2966, r2965, 15; +mov.u32 r2967, %32; +add.s32 r2968, r2967, r2966; +mov.u32 r2969, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ 
+add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 
0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, 
r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2970, r2969, 255; +shl.b32 r2971, r2969, 7; +and.b32 r2972, r2971, -32768; +add.s32 r2973, r2968, r2972; +cvt.rn.f32.u32 f417, r2970; +mul.f32 f418, f417, 0f3AC90FDB; +cos.approx.f32 f117, f418; +sin.approx.f32 f419, f418; +neg.f32 f118, f419; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; 
+mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, 
r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, 
f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; 
+mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ 
+fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2974, r2971, 32640; +add.s32 r2975, r2973, r2974; +st.shared.v4.f32 [r2975], {r521, r524, r629, r636}; +st.shared.v4.f32 [r2975+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r2975+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r2975+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r2975+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r2975+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r2975+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r2975+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r2976, r2970, -120, r2975; +ld.shared.u32 r1176, [r2976]; +ld.shared.u32 r1179, [r2976+4]; +ld.shared.u32 r1372, [r2976+2048]; +ld.shared.u32 r1375, [r2976+2052]; +ld.shared.u32 r1226, [r2976+4096]; +ld.shared.u32 r1229, [r2976+4100]; +ld.shared.u32 r1422, [r2976+6144]; +ld.shared.u32 r1425, [r2976+6148]; +ld.shared.u32 r1188, [r2976+8192]; +ld.shared.u32 r1191, [r2976+8196]; +ld.shared.u32 r1384, [r2976+10240]; +ld.shared.u32 r1387, [r2976+10244]; +ld.shared.u32 r1238, [r2976+12288]; +ld.shared.u32 r1241, [r2976+12292]; +ld.shared.u32 r1434, [r2976+14336]; +ld.shared.u32 r1437, [r2976+14340]; +ld.shared.u32 r1177, [r2976+16384]; +ld.shared.u32 r1180, [r2976+16388]; +ld.shared.u32 r1373, [r2976+18432]; +ld.shared.u32 r1376, [r2976+18436]; +ld.shared.u32 r1227, [r2976+20480]; +ld.shared.u32 r1230, [r2976+20484]; +ld.shared.u32 r1423, [r2976+22528]; +ld.shared.u32 r1426, [r2976+22532]; +ld.shared.u32 r1189, [r2976+24576]; +ld.shared.u32 r1192, [r2976+24580]; +ld.shared.u32 r1385, [r2976+26624]; +ld.shared.u32 r1388, [r2976+26628]; +ld.shared.u32 r1239, [r2976+28672]; +ld.shared.u32 r1242, [r2976+28676]; +ld.shared.u32 r1435, [r2976+30720]; +ld.shared.u32 r1438, [r2976+30724]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ 
+add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; 
+} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 
r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; 
+} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2977, r2969, 240; +bfe.u32 r2978, r2969, 4, 4; +cvt.rn.f32.u32 f420, r2978; +mul.f32 f421, f420, 0f3CC90FDB; +cos.approx.f32 f267, f421; +sin.approx.f32 f422, f421; +neg.f32 f268, f422; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, 
{high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, 
r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 
r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, 
r2325; +} +shl.b32 r2979, r2969, 3; +and.b32 r2980, r2979, 120; +add.s32 r2981, r2973, r2980; +barrier.sync 0; +and.b32 r2982, r2971, 30720; +add.s32 r2983, r2981, r2982; +st.shared.u32 [r2983], r1695; +st.shared.u32 [r2983+4], r1698; +st.shared.u32 [r2983+128], r1803; +st.shared.u32 [r2983+132], r1810; +st.shared.u32 [r2983+256], r1840; +st.shared.u32 [r2983+260], r1847; +st.shared.u32 [r2983+384], r1877; +st.shared.u32 [r2983+388], r1884; +st.shared.u32 [r2983+512], r1914; +st.shared.u32 [r2983+516], r1921; +st.shared.u32 [r2983+640], r1951; +st.shared.u32 [r2983+644], r1958; +st.shared.u32 [r2983+768], r1988; +st.shared.u32 [r2983+772], r1995; +st.shared.u32 [r2983+896], r2025; +st.shared.u32 [r2983+900], r2032; +st.shared.u32 [r2983+1024], r2062; +st.shared.u32 [r2983+1028], r2069; +st.shared.u32 [r2983+1152], r2099; +st.shared.u32 [r2983+1156], r2106; +st.shared.u32 [r2983+1280], r2136; +st.shared.u32 [r2983+1284], r2143; +st.shared.u32 [r2983+1408], r2173; +st.shared.u32 [r2983+1412], r2180; +st.shared.u32 [r2983+1536], r2210; +st.shared.u32 [r2983+1540], r2217; +st.shared.u32 [r2983+1664], r2247; +st.shared.u32 [r2983+1668], r2254; +st.shared.u32 [r2983+1792], r2284; +st.shared.u32 [r2983+1796], r2291; +st.shared.u32 [r2983+1920], r2321; +st.shared.u32 [r2983+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2984, r2977, -120, r2983; +ld.shared.u32 r2350, [r2984]; +ld.shared.u32 r2353, [r2984+4]; +ld.shared.u32 r2546, [r2984+2048]; +ld.shared.u32 r2549, [r2984+2052]; +ld.shared.u32 r2400, [r2984+4096]; +ld.shared.u32 r2403, [r2984+4100]; +ld.shared.u32 r2596, [r2984+6144]; +ld.shared.u32 r2599, [r2984+6148]; +ld.shared.u32 r2362, [r2984+8192]; +ld.shared.u32 r2365, [r2984+8196]; +ld.shared.u32 r2558, [r2984+10240]; +ld.shared.u32 r2561, [r2984+10244]; +ld.shared.u32 r2412, [r2984+12288]; +ld.shared.u32 r2415, [r2984+12292]; +ld.shared.u32 r2608, [r2984+14336]; +ld.shared.u32 r2611, [r2984+14340]; +ld.shared.u32 r2351, [r2984+16384]; +ld.shared.u32 r2354, 
[r2984+16388]; +ld.shared.u32 r2547, [r2984+18432]; +ld.shared.u32 r2550, [r2984+18436]; +ld.shared.u32 r2401, [r2984+20480]; +ld.shared.u32 r2404, [r2984+20484]; +ld.shared.u32 r2597, [r2984+22528]; +ld.shared.u32 r2600, [r2984+22532]; +ld.shared.u32 r2363, [r2984+24576]; +ld.shared.u32 r2366, [r2984+24580]; +ld.shared.u32 r2559, [r2984+26624]; +ld.shared.u32 r2562, [r2984+26628]; +ld.shared.u32 r2413, [r2984+28672]; +ld.shared.u32 r2416, [r2984+28676]; +ld.shared.u32 r2609, [r2984+30720]; +ld.shared.u32 r2612, [r2984+30724]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, 
f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, 
r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ 
+sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, 
r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 %0, r2497, r2693; +} +{ +add.f16x2 %1, r2500, r2696; +} +{ +sub.f16x2 %16, r2497, r2693; +} +{ +sub.f16x2 %17, r2500, r2696; +} +{ +add.f16x2 %2, r2509, r2777; +} +{ +add.f16x2 %3, r2512, r2783; +} +{ +sub.f16x2 %18, r2509, r2777; +} +{ +sub.f16x2 %19, r2512, r2783; +} +{ +add.f16x2 %4, r2521, r2793; +} +{ +add.f16x2 %5, r2524, r2799; +} +{ +sub.f16x2 %20, r2521, r2793; +} +{ +sub.f16x2 %21, r2524, r2799; +} +{ +add.f16x2 %6, r2533, r2809; +} +{ +add.f16x2 %7, r2536, r2815; +} +{ +sub.f16x2 %22, r2533, r2809; +} +{ +sub.f16x2 %23, r2536, r2815; +} +{ +add.f16x2 %8, r2503, r2702; +} +{ +add.f16x2 %9, r2506, r2819; +} +{ +sub.f16x2 %24, r2503, r2702; +} +{ +sub.f16x2 %25, r2506, r2819; +} +{ +add.f16x2 %10, r2515, r2827; +} +{ +add.f16x2 %11, r2518, r2833; +} +{ +sub.f16x2 %26, r2515, r2827; +} +{ +sub.f16x2 %27, r2518, r2833; +} +{ +add.f16x2 %12, r2527, r2843; +} +{ +add.f16x2 %13, r2530, r2849; +} +{ +sub.f16x2 %28, r2527, r2843; +} +{ +sub.f16x2 %29, r2530, r2849; +} +{ +add.f16x2 %14, r2539, r2859; +} +{ +add.f16x2 %15, r2542, r2865; +} +{ +sub.f16x2 %30, r2539, r2859; +} +{ +sub.f16x2 %31, r2542, r2865; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), 
"=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<851, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6211>; +.reg .b64 rd<3>; +mov.u32 r6125, %tid.y; +shl.b32 r6126, r6125, 15; +mov.u32 r6127, %64; +add.s32 r6128, r6127, r6126; +mov.u32 r6129, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %121, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %121, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, {low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 
r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, 
r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %122, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %122, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, 
r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ 
+add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, 
r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, 
r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} 
+mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, 
r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} 
+{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, 
r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6131, r6129, 8; +and.b32 r6132, r6131, -32768; +add.s32 r6133, r6128, r6132; +and.b32 r6146, r6129, 127; +cvt.rn.f32.u32 f845, r6146; +mul.f32 f846, f845, 0f3AC90FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ 
+mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, 
low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, 
r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, 
r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; 
+mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, 
r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 
r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6134, r6131, 32512; +add.s32 r6135, r6133, r6134; +st.shared.v4.f32 [r6135], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r6135+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r6135+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r6135+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r6135+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r6135+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r6135+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r6135+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r6135+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r6135+144], {r2354, r2361, r2391, r2398}; 
+st.shared.v4.f32 [r6135+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r6135+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r6135+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r6135+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r6135+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r6135+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r6136, r6146, -248, r6135; +ld.shared.u32 r2864, [r6136]; +ld.shared.u32 r2867, [r6136+4]; +ld.shared.u32 r3480, [r6136+1024]; +ld.shared.u32 r3483, [r6136+1028]; +ld.shared.u32 r3060, [r6136+2048]; +ld.shared.u32 r3063, [r6136+2052]; +ld.shared.u32 r3676, [r6136+3072]; +ld.shared.u32 r3679, [r6136+3076]; +ld.shared.u32 r2914, [r6136+4096]; +ld.shared.u32 r2917, [r6136+4100]; +ld.shared.u32 r3530, [r6136+5120]; +ld.shared.u32 r3533, [r6136+5124]; +ld.shared.u32 r3110, [r6136+6144]; +ld.shared.u32 r3113, [r6136+6148]; +ld.shared.u32 r3726, [r6136+7168]; +ld.shared.u32 r3729, [r6136+7172]; +ld.shared.u32 r2876, [r6136+8192]; +ld.shared.u32 r2879, [r6136+8196]; +ld.shared.u32 r3492, [r6136+9216]; +ld.shared.u32 r3495, [r6136+9220]; +ld.shared.u32 r3072, [r6136+10240]; +ld.shared.u32 r3075, [r6136+10244]; +ld.shared.u32 r3688, [r6136+11264]; +ld.shared.u32 r3691, [r6136+11268]; +ld.shared.u32 r2926, [r6136+12288]; +ld.shared.u32 r2929, [r6136+12292]; +ld.shared.u32 r3542, [r6136+13312]; +ld.shared.u32 r3545, [r6136+13316]; +ld.shared.u32 r3122, [r6136+14336]; +ld.shared.u32 r3125, [r6136+14340]; +ld.shared.u32 r3738, [r6136+15360]; +ld.shared.u32 r3741, [r6136+15364]; +ld.shared.u32 r2865, [r6136+16384]; +ld.shared.u32 r2868, [r6136+16388]; +ld.shared.u32 r3481, [r6136+17408]; +ld.shared.u32 r3484, [r6136+17412]; +ld.shared.u32 r3061, [r6136+18432]; +ld.shared.u32 r3064, [r6136+18436]; +ld.shared.u32 r3677, [r6136+19456]; +ld.shared.u32 r3680, [r6136+19460]; +ld.shared.u32 r2915, [r6136+20480]; +ld.shared.u32 r2918, [r6136+20484]; +ld.shared.u32 r3531, [r6136+21504]; +ld.shared.u32 
r3534, [r6136+21508]; +ld.shared.u32 r3111, [r6136+22528]; +ld.shared.u32 r3114, [r6136+22532]; +ld.shared.u32 r3727, [r6136+23552]; +ld.shared.u32 r3730, [r6136+23556]; +ld.shared.u32 r2877, [r6136+24576]; +ld.shared.u32 r2880, [r6136+24580]; +ld.shared.u32 r3493, [r6136+25600]; +ld.shared.u32 r3496, [r6136+25604]; +ld.shared.u32 r3073, [r6136+26624]; +ld.shared.u32 r3076, [r6136+26628]; +ld.shared.u32 r3689, [r6136+27648]; +ld.shared.u32 r3692, [r6136+27652]; +ld.shared.u32 r2927, [r6136+28672]; +ld.shared.u32 r2930, [r6136+28676]; +ld.shared.u32 r3543, [r6136+29696]; +ld.shared.u32 r3546, [r6136+29700]; +ld.shared.u32 r3123, [r6136+30720]; +ld.shared.u32 r3126, [r6136+30724]; +ld.shared.u32 r3739, [r6136+31744]; +ld.shared.u32 r3742, [r6136+31748]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; 
+} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, 
r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ 
+add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ 
+fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, 
r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, 
r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ 
+add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, 
r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, 
r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 
r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6138, r6129, 5, 2; +cvt.rn.f32.u32 f848, r6138; +mul.f32 f849, f848, 0f3D490FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; 
+neg.f32 f780, f850; +and.b32 r6145, r6129, 96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, 
r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 
high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, 
r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 
r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ 
+mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, 
r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ 
+fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +shl.b32 r6139, r6129, 3; +and.b32 r6140, r6139, 248; +add.s32 r6141, r6133, r6140; +barrier.sync 0; +and.b32 r6142, r6131, 24576; +add.s32 r6143, r6141, r6142; +st.shared.u32 [r6143], r4383; +st.shared.u32 [r6143+4], r4386; +st.shared.u32 [r6143+256], r4587; +st.shared.u32 [r6143+260], r4594; +st.shared.u32 [r6143+512], r4624; +st.shared.u32 [r6143+516], r4631; +st.shared.u32 [r6143+768], r4661; +st.shared.u32 [r6143+772], r4668; +st.shared.u32 [r6143+1024], r4698; +st.shared.u32 [r6143+1028], r4705; +st.shared.u32 [r6143+1280], r4735; +st.shared.u32 [r6143+1284], r4742; +st.shared.u32 [r6143+1536], r4772; +st.shared.u32 [r6143+1540], r4779; +st.shared.u32 [r6143+1792], r4809; +st.shared.u32 [r6143+1796], r4816; +st.shared.u32 [r6143+2048], r4846; +st.shared.u32 [r6143+2052], r4853; +st.shared.u32 [r6143+2304], r4883; +st.shared.u32 [r6143+2308], r4890; +st.shared.u32 [r6143+2560], r4920; +st.shared.u32 [r6143+2564], r4927; +st.shared.u32 [r6143+2816], r4957; +st.shared.u32 [r6143+2820], r4964; +st.shared.u32 [r6143+3072], r4994; +st.shared.u32 [r6143+3076], r5001; +st.shared.u32 [r6143+3328], r5031; +st.shared.u32 [r6143+3332], r5038; +st.shared.u32 [r6143+3584], r5068; +st.shared.u32 [r6143+3588], r5075; +st.shared.u32 [r6143+3840], r5105; +st.shared.u32 [r6143+3844], r5112; +st.shared.u32 [r6143+4096], r5142; 
+st.shared.u32 [r6143+4100], r5149; +st.shared.u32 [r6143+4352], r5179; +st.shared.u32 [r6143+4356], r5186; +st.shared.u32 [r6143+4608], r5216; +st.shared.u32 [r6143+4612], r5223; +st.shared.u32 [r6143+4864], r5253; +st.shared.u32 [r6143+4868], r5260; +st.shared.u32 [r6143+5120], r5290; +st.shared.u32 [r6143+5124], r5297; +st.shared.u32 [r6143+5376], r5327; +st.shared.u32 [r6143+5380], r5334; +st.shared.u32 [r6143+5632], r5364; +st.shared.u32 [r6143+5636], r5371; +st.shared.u32 [r6143+5888], r5401; +st.shared.u32 [r6143+5892], r5408; +st.shared.u32 [r6143+6144], r5438; +st.shared.u32 [r6143+6148], r5445; +st.shared.u32 [r6143+6400], r5475; +st.shared.u32 [r6143+6404], r5482; +st.shared.u32 [r6143+6656], r5512; +st.shared.u32 [r6143+6660], r5519; +st.shared.u32 [r6143+6912], r5549; +st.shared.u32 [r6143+6916], r5556; +st.shared.u32 [r6143+7168], r5586; +st.shared.u32 [r6143+7172], r5593; +st.shared.u32 [r6143+7424], r5623; +st.shared.u32 [r6143+7428], r5630; +st.shared.u32 [r6143+7680], r5660; +st.shared.u32 [r6143+7684], r5667; +st.shared.u32 [r6143+7936], r5697; +st.shared.u32 [r6143+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6144, r6145, -248, r6143; +ld.shared.u32 r5726, [r6144]; +ld.shared.u32 r5729, [r6144+4]; +ld.shared.u32 r5776, [r6144+1024]; +ld.shared.u32 r5779, [r6144+1028]; +ld.shared.u32 r5826, [r6144+2048]; +ld.shared.u32 r5829, [r6144+2052]; +ld.shared.u32 r5876, [r6144+3072]; +ld.shared.u32 r5879, [r6144+3076]; +ld.shared.u32 r5926, [r6144+4096]; +ld.shared.u32 r5929, [r6144+4100]; +ld.shared.u32 r5976, [r6144+5120]; +ld.shared.u32 r5979, [r6144+5124]; +ld.shared.u32 r6026, [r6144+6144]; +ld.shared.u32 r6029, [r6144+6148]; +ld.shared.u32 r6076, [r6144+7168]; +ld.shared.u32 r6079, [r6144+7172]; +ld.shared.u32 r5738, [r6144+8192]; +ld.shared.u32 r5741, [r6144+8196]; +ld.shared.u32 r5788, [r6144+9216]; +ld.shared.u32 r5791, [r6144+9220]; +ld.shared.u32 r5838, [r6144+10240]; +ld.shared.u32 r5841, [r6144+10244]; +ld.shared.u32 r5888, [r6144+11264]; 
+ld.shared.u32 r5891, [r6144+11268]; +ld.shared.u32 r5938, [r6144+12288]; +ld.shared.u32 r5941, [r6144+12292]; +ld.shared.u32 r5988, [r6144+13312]; +ld.shared.u32 r5991, [r6144+13316]; +ld.shared.u32 r6038, [r6144+14336]; +ld.shared.u32 r6041, [r6144+14340]; +ld.shared.u32 r6088, [r6144+15360]; +ld.shared.u32 r6091, [r6144+15364]; +ld.shared.u32 r5727, [r6144+16384]; +ld.shared.u32 r5730, [r6144+16388]; +ld.shared.u32 r5777, [r6144+17408]; +ld.shared.u32 r5780, [r6144+17412]; +ld.shared.u32 r5827, [r6144+18432]; +ld.shared.u32 r5830, [r6144+18436]; +ld.shared.u32 r5877, [r6144+19456]; +ld.shared.u32 r5880, [r6144+19460]; +ld.shared.u32 r5927, [r6144+20480]; +ld.shared.u32 r5930, [r6144+20484]; +ld.shared.u32 r5977, [r6144+21504]; +ld.shared.u32 r5980, [r6144+21508]; +ld.shared.u32 r6027, [r6144+22528]; +ld.shared.u32 r6030, [r6144+22532]; +ld.shared.u32 r6077, [r6144+23552]; +ld.shared.u32 r6080, [r6144+23556]; +ld.shared.u32 r5739, [r6144+24576]; +ld.shared.u32 r5742, [r6144+24580]; +ld.shared.u32 r5789, [r6144+25600]; +ld.shared.u32 r5792, [r6144+25604]; +ld.shared.u32 r5839, [r6144+26624]; +ld.shared.u32 r5842, [r6144+26628]; +ld.shared.u32 r5889, [r6144+27648]; +ld.shared.u32 r5892, [r6144+27652]; +ld.shared.u32 r5939, [r6144+28672]; +ld.shared.u32 r5942, [r6144+28676]; +ld.shared.u32 r5989, [r6144+29696]; +ld.shared.u32 r5992, [r6144+29700]; +ld.shared.u32 r6039, [r6144+30720]; +ld.shared.u32 r6042, [r6144+30724]; +ld.shared.u32 r6089, [r6144+31744]; +ld.shared.u32 r6092, [r6144+31748]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 %0, r5725, r5737; +} +{ +add.f16x2 %1, r5728, r5740; +} +{ +sub.f16x2 %32, r5725, r5737; +} +{ +sub.f16x2 %33, r5728, r5740; +} +{ 
+add.f16x2 %16, r5731, r5746; +} +{ +add.f16x2 %17, r5734, r5749; +} +{ +sub.f16x2 %48, r5731, r5746; +} +{ +sub.f16x2 %49, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 %2, r5775, r5787; +} +{ +add.f16x2 %3, r5778, r5790; +} +{ +sub.f16x2 %34, r5775, r5787; +} +{ +sub.f16x2 %35, r5778, r5790; +} +{ +add.f16x2 %18, r5781, r5796; +} +{ +add.f16x2 %19, r5784, r5799; +} +{ +sub.f16x2 %50, r5781, r5796; +} +{ +sub.f16x2 %51, r5784, r5799; +} +{ +add.f16x2 r5825, r5826, r5827; +} +{ +add.f16x2 r5828, r5829, r5830; +} +{ +sub.f16x2 r5831, r5826, r5827; +} +{ +sub.f16x2 r5834, r5829, r5830; +} +{ +add.f16x2 r5837, r5838, r5839; +} +{ +add.f16x2 r5840, r5841, r5842; +} +{ +sub.f16x2 r5843, r5838, r5839; +} +{ +sub.f16x2 r5846, r5841, r5842; +} +{ +neg.f16x2 r5849, r5843; +} +{ +add.f16x2 %4, r5825, r5837; +} +{ +add.f16x2 %5, r5828, r5840; +} +{ +sub.f16x2 %36, r5825, r5837; +} +{ +sub.f16x2 %37, r5828, r5840; +} +{ +add.f16x2 %20, r5831, r5846; +} +{ +add.f16x2 %21, r5834, r5849; +} +{ +sub.f16x2 %52, r5831, r5846; +} +{ +sub.f16x2 %53, r5834, r5849; +} +{ +add.f16x2 r5875, r5876, r5877; +} +{ +add.f16x2 r5878, r5879, r5880; +} +{ +sub.f16x2 r5881, r5876, r5877; +} +{ +sub.f16x2 r5884, r5879, r5880; +} +{ +add.f16x2 r5887, r5888, r5889; +} +{ +add.f16x2 r5890, r5891, r5892; +} +{ +sub.f16x2 r5893, r5888, r5889; +} +{ +sub.f16x2 r5896, r5891, r5892; +} +{ +neg.f16x2 r5899, r5893; +} +{ +add.f16x2 %6, r5875, r5887; +} +{ +add.f16x2 %7, r5878, r5890; +} +{ +sub.f16x2 %38, r5875, r5887; +} +{ +sub.f16x2 %39, r5878, r5890; +} +{ +add.f16x2 %22, r5881, r5896; +} +{ +add.f16x2 %23, r5884, r5899; +} +{ +sub.f16x2 %54, r5881, r5896; +} +{ +sub.f16x2 %55, r5884, 
r5899; +} +{ +add.f16x2 r5925, r5926, r5927; +} +{ +add.f16x2 r5928, r5929, r5930; +} +{ +sub.f16x2 r5931, r5926, r5927; +} +{ +sub.f16x2 r5934, r5929, r5930; +} +{ +add.f16x2 r5937, r5938, r5939; +} +{ +add.f16x2 r5940, r5941, r5942; +} +{ +sub.f16x2 r5943, r5938, r5939; +} +{ +sub.f16x2 r5946, r5941, r5942; +} +{ +neg.f16x2 r5949, r5943; +} +{ +add.f16x2 %8, r5925, r5937; +} +{ +add.f16x2 %9, r5928, r5940; +} +{ +sub.f16x2 %40, r5925, r5937; +} +{ +sub.f16x2 %41, r5928, r5940; +} +{ +add.f16x2 %24, r5931, r5946; +} +{ +add.f16x2 %25, r5934, r5949; +} +{ +sub.f16x2 %56, r5931, r5946; +} +{ +sub.f16x2 %57, r5934, r5949; +} +{ +add.f16x2 r5975, r5976, r5977; +} +{ +add.f16x2 r5978, r5979, r5980; +} +{ +sub.f16x2 r5981, r5976, r5977; +} +{ +sub.f16x2 r5984, r5979, r5980; +} +{ +add.f16x2 r5987, r5988, r5989; +} +{ +add.f16x2 r5990, r5991, r5992; +} +{ +sub.f16x2 r5993, r5988, r5989; +} +{ +sub.f16x2 r5996, r5991, r5992; +} +{ +neg.f16x2 r5999, r5993; +} +{ +add.f16x2 %10, r5975, r5987; +} +{ +add.f16x2 %11, r5978, r5990; +} +{ +sub.f16x2 %42, r5975, r5987; +} +{ +sub.f16x2 %43, r5978, r5990; +} +{ +add.f16x2 %26, r5981, r5996; +} +{ +add.f16x2 %27, r5984, r5999; +} +{ +sub.f16x2 %58, r5981, r5996; +} +{ +sub.f16x2 %59, r5984, r5999; +} +{ +add.f16x2 r6025, r6026, r6027; +} +{ +add.f16x2 r6028, r6029, r6030; +} +{ +sub.f16x2 r6031, r6026, r6027; +} +{ +sub.f16x2 r6034, r6029, r6030; +} +{ +add.f16x2 r6037, r6038, r6039; +} +{ +add.f16x2 r6040, r6041, r6042; +} +{ +sub.f16x2 r6043, r6038, r6039; +} +{ +sub.f16x2 r6046, r6041, r6042; +} +{ +neg.f16x2 r6049, r6043; +} +{ +add.f16x2 %12, r6025, r6037; +} +{ +add.f16x2 %13, r6028, r6040; +} +{ +sub.f16x2 %44, r6025, r6037; +} +{ +sub.f16x2 %45, r6028, r6040; +} +{ +add.f16x2 %28, r6031, r6046; +} +{ +add.f16x2 %29, r6034, r6049; +} +{ +sub.f16x2 %60, r6031, r6046; +} +{ +sub.f16x2 %61, r6034, r6049; +} +{ +add.f16x2 r6075, r6076, r6077; +} +{ +add.f16x2 r6078, r6079, r6080; +} +{ +sub.f16x2 r6081, r6076, r6077; +} +{ 
+sub.f16x2 r6084, r6079, r6080; +} +{ +add.f16x2 r6087, r6088, r6089; +} +{ +add.f16x2 r6090, r6091, r6092; +} +{ +sub.f16x2 r6093, r6088, r6089; +} +{ +sub.f16x2 r6096, r6091, r6092; +} +{ +neg.f16x2 r6099, r6093; +} +{ +add.f16x2 %14, r6075, r6087; +} +{ +add.f16x2 %15, r6078, r6090; +} +{ +sub.f16x2 %46, r6075, r6087; +} +{ +sub.f16x2 %47, r6078, r6090; +} +{ +add.f16x2 %30, r6081, r6096; +} +{ +add.f16x2 %31, r6084, r6099; +} +{ +sub.f16x2 %62, r6081, r6096; +} +{ +sub.f16x2 %63, r6084, r6099; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), 
"=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), 
"r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; +/* cufftdx_private_function<848, __half2, 1>: explicit specialization whose entire body is one inline-PTX asm volatile block operating on packed f16x2 values. */ +/* Operands: %0-%15 are outputs and %17-%32 are inputs, both bound to rmem[0..7].x/.y in order; %16 is smem, used as the base of every st.shared/ld.shared address (offset by %tid.y << 15 plus terms derived from %tid.x). */ +/* The PTX issues six barrier.sync 0 instructions around three shared-memory exchanges; presumably every thread of the block must reach this call together - confirm with callers. */ +/* NOTE(review): the twiddle scale literals decode to 2pi/4096, 2pi/512 and 2pi/64 and tid.x is masked with 511, which looks like a 4096-point radix-8 transform over 512 threads - inferred, confirm against the generator. */ +template<> __forceinline__ __device__ void cufftdx_private_function<848, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<176>; +.reg .b32 r<1598>; +.reg .b64 rd<2>; +mov.u32 r1571, %tid.y; +shl.b32 r1572, r1571, 15; +mov.u32 r1573, %16; +add.s32 r1574, r1573, r1572; +mov.u32 r1575, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72;
+} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f140, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r101, {low, high}; +} +mov.f32 f150, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f136, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1576, r1575, 511; +shl.b32 r1577, r1575, 6; +and.b32 r1578, r1577, -32768; +add.s32 r1579, r1574, r1578; +cvt.rn.f32.u32 f167, r1576; +mul.f32 f168, f167, 0f3AC90FDB; +cos.approx.f32 f29, f168; +sin.approx.f32 f169, f168; +neg.f32 f30, f169; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32
r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low,
high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high;
+mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1580, r1577, 32704; +add.s32 r1581, r1579, r1580; +st.shared.v4.f32 [r1581], {r149, r152, r209, r216}; +st.shared.v4.f32 [r1581+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1581+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1581+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1582, r1576, -56, r1581; +ld.shared.u32 r460, [r1582]; +ld.shared.u32 r463, [r1582+4]; +ld.shared.u32 r510, [r1582+4096]; +ld.shared.u32 r513, [r1582+4100]; +ld.shared.u32 r472, [r1582+8192]; +ld.shared.u32 r475, [r1582+8196]; +ld.shared.u32 r522, [r1582+12288]; +ld.shared.u32 r525, [r1582+12292]; +ld.shared.u32 r461, [r1582+16384]; +ld.shared.u32 r464, [r1582+16388];
+ld.shared.u32 r511, [r1582+20480]; +ld.shared.u32 r514, [r1582+20484]; +ld.shared.u32 r473, [r1582+24576]; +ld.shared.u32 r476, [r1582+24580]; +ld.shared.u32 r523, [r1582+28672]; +ld.shared.u32 r526, [r1582+28676]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2
r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1583, r1575, 504; +bfe.u32 r1584, r1575, 3, 6; +cvt.rn.f32.u32 f170, r1584; +mul.f32 f171, f170, 0f3C490FDB; +cos.approx.f32 f75, f171; +sin.approx.f32 f172, f171; +neg.f32 f76, f172; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32
r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low,
high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high;
+mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1585, r1575, 3; +and.b32 r1586, r1585, 56; +add.s32 r1587, r1579, r1586; +barrier.sync 0; +and.b32 r1588, r1577, 32256; +add.s32 r1589, r1587, r1588; +st.shared.u32 [r1589], r607; +st.shared.u32 [r1589+4], r610; +st.shared.u32 [r1589+64], r667; +st.shared.u32 [r1589+68], r674; +st.shared.u32 [r1589+128], r704; +st.shared.u32 [r1589+132], r711; +st.shared.u32 [r1589+192], r741; +st.shared.u32 [r1589+196], r748; +st.shared.u32 [r1589+256], r778; +st.shared.u32 [r1589+260], r785; +st.shared.u32 [r1589+320], r815; +st.shared.u32 [r1589+324], r822; +st.shared.u32 [r1589+384], r852; +st.shared.u32 [r1589+388], r859; +st.shared.u32 [r1589+448], r889; +st.shared.u32 [r1589+452], r896; +barrier.sync 0; +mad.lo.s32 r1590, r1583, -56, r1589; +ld.shared.u32 r918, [r1590]; +ld.shared.u32 r921, [r1590+4]; +ld.shared.u32 r968, [r1590+4096]; +ld.shared.u32 r971, [r1590+4100]; +ld.shared.u32 r930, [r1590+8192]; +ld.shared.u32 r933, [r1590+8196]; +ld.shared.u32 r980, [r1590+12288]; +ld.shared.u32 r983, [r1590+12292]; +ld.shared.u32 r919, [r1590+16384]; +ld.shared.u32 r922, [r1590+16388]; +ld.shared.u32 r969, [r1590+20480]; +ld.shared.u32 r972, [r1590+20484]; +ld.shared.u32 r931, [r1590+24576]; +ld.shared.u32 r934, [r1590+24580]; +ld.shared.u32 r981, [r1590+28672]; +ld.shared.u32 r984, [r1590+28676]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2
r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2
r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1591, r1575, 448; +bfe.u32 r1592, r1575, 6, 3; +cvt.rn.f32.u32 f173, r1592; +mul.f32 f174, f173, 0f3DC90FDB; +cos.approx.f32 f121, f174; +sin.approx.f32 f175, f174; +neg.f32 f122, f175; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16
low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low,
high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low,
high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +and.b32 r1593, r1585, 504; +add.s32 r1594, r1579, r1593; +barrier.sync 0; +and.b32 r1595, r1577, 28672; +add.s32 r1596, r1594, r1595; +st.shared.u32 [r1596], r1065; +st.shared.u32 [r1596+4], r1068; +st.shared.u32 [r1596+512], r1125; +st.shared.u32 [r1596+516], r1132; +st.shared.u32 [r1596+1024], r1162; +st.shared.u32 [r1596+1028], r1169; +st.shared.u32 [r1596+1536], r1199; +st.shared.u32 [r1596+1540], r1206; +st.shared.u32 [r1596+2048], r1236; +st.shared.u32 [r1596+2052], r1243; +st.shared.u32 [r1596+2560], r1273; +st.shared.u32 [r1596+2564], r1280; +st.shared.u32 [r1596+3072], r1310; +st.shared.u32 [r1596+3076], r1317; +st.shared.u32 [r1596+3584], r1347; +st.shared.u32 [r1596+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1597, r1591, -56, r1596; +ld.shared.u32 r1376, [r1597]; +ld.shared.u32 r1379, [r1597+4]; +ld.shared.u32 r1426, [r1597+4096]; +ld.shared.u32 r1429, [r1597+4100]; +ld.shared.u32 r1388, [r1597+8192]; +ld.shared.u32 r1391, [r1597+8196]; +ld.shared.u32 r1438, [r1597+12288]; +ld.shared.u32 r1441, [r1597+12292]; +ld.shared.u32 r1377, [r1597+16384]; +ld.shared.u32 r1380, [r1597+16388]; +ld.shared.u32 r1427, [r1597+20480]; +ld.shared.u32 r1430, [r1597+20484]; +ld.shared.u32 r1389, [r1597+24576]; +ld.shared.u32 r1392, [r1597+24580]; +ld.shared.u32 r1439, [r1597+28672]; +ld.shared.u32 r1442, [r1597+28676]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399,
r1393; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1396; +} +{ +add.f16x2 r1416, r1384, r1399; +} +{ +sub.f16x2 r1419, r1381, r1396; +} +{ +sub.f16x2 r1422, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1446; +} +{ +add.f16x2 r1466, r1434, r1449; +} +{ +sub.f16x2 r1469, r1431, r1446; +} +{ +sub.f16x2 r1472, r1434, r1449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1457; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 %0, r1401, r1451; +} +{ +add.f16x2 %1, r1404, r1454; +} +{ +sub.f16x2 %8, r1401, r1451; +} +{ +sub.f16x2 %9, r1404, r1454; +} +{ +add.f16x2 %2, r1413,
r1495; +} +{ +add.f16x2 %3, r1416, r1501; +} +{ +sub.f16x2 %10, r1413, r1495; +} +{ +sub.f16x2 %11, r1416, r1501; +} +{ +add.f16x2 %4, r1407, r1460; +} +{ +add.f16x2 %5, r1410, r1505; +} +{ +sub.f16x2 %12, r1407, r1460; +} +{ +sub.f16x2 %13, r1410, r1505; +} +{ +add.f16x2 %6, r1419, r1513; +} +{ +add.f16x2 %7, r1422, r1519; +} +{ +sub.f16x2 %14, r1419, r1513; +} +{ +sub.f16x2 %15, r1422, r1519; +} +})" +/* outputs %0-%15: rmem[0..7].x/.y, written by the last butterfly stage */ : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y))/* inputs: %16 = smem, %17-%32 = rmem[0..7].x/.y */: "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<852, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6210>; +.reg .b64 rd<3>; +mov.u32 r6125, %tid.y; +shl.b32 r6126, r6125, 14; +mov.u32 r6127, %64; +add.s32 r6128, r6127, r6126; +mov.u32 r6129, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2
r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %121, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %121, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f668, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r101, {low, high}; +} +mov.f32 f702, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ 
+sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, 
r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; 
+} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %122, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %122, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; 
+cvt.rn.f16.f32 high, f702; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, 
r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; 
+mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, 
r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1255, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} 
+{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; 
+} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; 
+} +shl.b32 r6131, r6129, 7; +and.b32 r6132, r6131, -16384; +add.s32 r6133, r6128, r6132; +and.b32 r6145, r6129, 127; +cvt.rn.f32.u32 f845, r6145; +mul.f32 f846, f845, 0f3AC90FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ 
+mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 
r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; 
+cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, 
r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ 
+fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 
r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ 
+mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; 
+mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, 
r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6134, r6131, 16256; +add.s32 r6135, r6133, r6134; +st.shared.v4.f32 [r6135], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r6135+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r6135+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r6135+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r6135+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r6135+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r6135+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r6135+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r6136, r6145, -124, r6135; +ld.shared.u32 r2864, [r6136]; +ld.shared.u32 r3480, [r6136+512]; +ld.shared.u32 r3060, [r6136+1024]; +ld.shared.u32 r3676, [r6136+1536]; +ld.shared.u32 r2914, [r6136+2048]; +ld.shared.u32 r3530, [r6136+2560]; +ld.shared.u32 r3110, [r6136+3072]; +ld.shared.u32 r3726, [r6136+3584]; +ld.shared.u32 r2876, [r6136+4096]; +ld.shared.u32 r3492, [r6136+4608]; +ld.shared.u32 r3072, [r6136+5120]; +ld.shared.u32 r3688, [r6136+5632]; +ld.shared.u32 r2926, 
[r6136+6144]; +ld.shared.u32 r3542, [r6136+6656]; +ld.shared.u32 r3122, [r6136+7168]; +ld.shared.u32 r3738, [r6136+7680]; +ld.shared.u32 r2865, [r6136+8192]; +ld.shared.u32 r3481, [r6136+8704]; +ld.shared.u32 r3061, [r6136+9216]; +ld.shared.u32 r3677, [r6136+9728]; +ld.shared.u32 r2915, [r6136+10240]; +ld.shared.u32 r3531, [r6136+10752]; +ld.shared.u32 r3111, [r6136+11264]; +ld.shared.u32 r3727, [r6136+11776]; +ld.shared.u32 r2877, [r6136+12288]; +ld.shared.u32 r3493, [r6136+12800]; +ld.shared.u32 r3073, [r6136+13312]; +ld.shared.u32 r3689, [r6136+13824]; +ld.shared.u32 r2927, [r6136+14336]; +ld.shared.u32 r3543, [r6136+14848]; +ld.shared.u32 r3123, [r6136+15360]; +ld.shared.u32 r3739, [r6136+15872]; +barrier.sync 0; +st.shared.v4.f32 [r6135], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6135+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6135+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6135+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6135+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6135+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r6135+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6135+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6136]; +ld.shared.u32 r3483, [r6136+512]; +ld.shared.u32 r3063, [r6136+1024]; +ld.shared.u32 r3679, [r6136+1536]; +ld.shared.u32 r2917, [r6136+2048]; +ld.shared.u32 r3533, [r6136+2560]; +ld.shared.u32 r3113, [r6136+3072]; +ld.shared.u32 r3729, [r6136+3584]; +ld.shared.u32 r2879, [r6136+4096]; +ld.shared.u32 r3495, [r6136+4608]; +ld.shared.u32 r3075, [r6136+5120]; +ld.shared.u32 r3691, [r6136+5632]; +ld.shared.u32 r2929, [r6136+6144]; +ld.shared.u32 r3545, [r6136+6656]; +ld.shared.u32 r3125, [r6136+7168]; +ld.shared.u32 r3741, [r6136+7680]; +ld.shared.u32 r2868, [r6136+8192]; +ld.shared.u32 r3484, [r6136+8704]; +ld.shared.u32 r3064, [r6136+9216]; +ld.shared.u32 r3680, [r6136+9728]; +ld.shared.u32 r2918, [r6136+10240]; +ld.shared.u32 r3534, 
[r6136+10752]; +ld.shared.u32 r3114, [r6136+11264]; +ld.shared.u32 r3730, [r6136+11776]; +ld.shared.u32 r2880, [r6136+12288]; +ld.shared.u32 r3496, [r6136+12800]; +ld.shared.u32 r3076, [r6136+13312]; +ld.shared.u32 r3692, [r6136+13824]; +ld.shared.u32 r2930, [r6136+14336]; +ld.shared.u32 r3546, [r6136+14848]; +ld.shared.u32 r3126, [r6136+15360]; +ld.shared.u32 r3742, [r6136+15872]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2967, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 
r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, 
f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, 
r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 
r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ 
+sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 
high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ 
+add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f668; +cvt.rn.f16.f32 high, f668; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 
r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 
r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6137, r6129, 96; +bfe.u32 r6138, r6129, 5, 2; +shl.b32 r6139, r6129, 2; +and.b32 r6140, r6139, 124; +add.s32 r6141, r6133, r6140; +cvt.rn.f32.u32 f848, r6138; +mul.f32 f849, f848, 0f3D490FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ 
+mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; 
+mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, 
r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ 
+fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ 
+neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, 
{low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +barrier.sync 0; +and.b32 r6142, r6131, 12288; +add.s32 r6143, r6141, r6142; +st.shared.u32 [r6143], r4383; +st.shared.u32 [r6143+128], r4587; +st.shared.u32 [r6143+256], r4624; +st.shared.u32 [r6143+384], r4661; +st.shared.u32 [r6143+512], r4698; +st.shared.u32 [r6143+640], r4735; +st.shared.u32 [r6143+768], r4772; +st.shared.u32 [r6143+896], r4809; +st.shared.u32 [r6143+1024], r4846; +st.shared.u32 [r6143+1152], r4883; +st.shared.u32 [r6143+1280], r4920; +st.shared.u32 [r6143+1408], r4957; +st.shared.u32 [r6143+1536], r4994; +st.shared.u32 [r6143+1664], r5031; +st.shared.u32 [r6143+1792], r5068; +st.shared.u32 [r6143+1920], r5105; +st.shared.u32 [r6143+2048], r5142; +st.shared.u32 [r6143+2176], r5179; +st.shared.u32 [r6143+2304], r5216; +st.shared.u32 [r6143+2432], r5253; +st.shared.u32 [r6143+2560], r5290; +st.shared.u32 [r6143+2688], r5327; +st.shared.u32 [r6143+2816], r5364; +st.shared.u32 [r6143+2944], r5401; +st.shared.u32 [r6143+3072], r5438; +st.shared.u32 [r6143+3200], r5475; +st.shared.u32 [r6143+3328], r5512; +st.shared.u32 [r6143+3456], r5549; +st.shared.u32 [r6143+3584], r5586; +st.shared.u32 [r6143+3712], r5623; +st.shared.u32 [r6143+3840], r5660; +st.shared.u32 [r6143+3968], r5697; +barrier.sync 0; +mad.lo.s32 r6144, r6137, -124, r6143; +ld.shared.u32 r5726, [r6144]; +ld.shared.u32 r5776, [r6144+512]; +ld.shared.u32 r5826, [r6144+1024]; +ld.shared.u32 r5876, [r6144+1536]; +ld.shared.u32 r5926, [r6144+2048]; +ld.shared.u32 r5976, [r6144+2560]; +ld.shared.u32 r6026, [r6144+3072]; +ld.shared.u32 r6076, [r6144+3584]; +ld.shared.u32 r5738, [r6144+4096]; +ld.shared.u32 r5788, [r6144+4608]; +ld.shared.u32 r5838, 
[r6144+5120]; +ld.shared.u32 r5888, [r6144+5632]; +ld.shared.u32 r5938, [r6144+6144]; +ld.shared.u32 r5988, [r6144+6656]; +ld.shared.u32 r6038, [r6144+7168]; +ld.shared.u32 r6088, [r6144+7680]; +ld.shared.u32 r5727, [r6144+8192]; +ld.shared.u32 r5777, [r6144+8704]; +ld.shared.u32 r5827, [r6144+9216]; +ld.shared.u32 r5877, [r6144+9728]; +ld.shared.u32 r5927, [r6144+10240]; +ld.shared.u32 r5977, [r6144+10752]; +ld.shared.u32 r6027, [r6144+11264]; +ld.shared.u32 r6077, [r6144+11776]; +ld.shared.u32 r5739, [r6144+12288]; +ld.shared.u32 r5789, [r6144+12800]; +ld.shared.u32 r5839, [r6144+13312]; +ld.shared.u32 r5889, [r6144+13824]; +ld.shared.u32 r5939, [r6144+14336]; +ld.shared.u32 r5989, [r6144+14848]; +ld.shared.u32 r6039, [r6144+15360]; +ld.shared.u32 r6089, [r6144+15872]; +barrier.sync 0; +st.shared.u32 [r6143], r4386; +st.shared.u32 [r6143+128], r4594; +st.shared.u32 [r6143+256], r4631; +st.shared.u32 [r6143+384], r4668; +st.shared.u32 [r6143+512], r4705; +st.shared.u32 [r6143+640], r4742; +st.shared.u32 [r6143+768], r4779; +st.shared.u32 [r6143+896], r4816; +st.shared.u32 [r6143+1024], r4853; +st.shared.u32 [r6143+1152], r4890; +st.shared.u32 [r6143+1280], r4927; +st.shared.u32 [r6143+1408], r4964; +st.shared.u32 [r6143+1536], r5001; +st.shared.u32 [r6143+1664], r5038; +st.shared.u32 [r6143+1792], r5075; +st.shared.u32 [r6143+1920], r5112; +st.shared.u32 [r6143+2048], r5149; +st.shared.u32 [r6143+2176], r5186; +st.shared.u32 [r6143+2304], r5223; +st.shared.u32 [r6143+2432], r5260; +st.shared.u32 [r6143+2560], r5297; +st.shared.u32 [r6143+2688], r5334; +st.shared.u32 [r6143+2816], r5371; +st.shared.u32 [r6143+2944], r5408; +st.shared.u32 [r6143+3072], r5445; +st.shared.u32 [r6143+3200], r5482; +st.shared.u32 [r6143+3328], r5519; +st.shared.u32 [r6143+3456], r5556; +st.shared.u32 [r6143+3584], r5593; +st.shared.u32 [r6143+3712], r5630; +st.shared.u32 [r6143+3840], r5667; +st.shared.u32 [r6143+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6144]; 
+ld.shared.u32 r5779, [r6144+512]; +ld.shared.u32 r5829, [r6144+1024]; +ld.shared.u32 r5879, [r6144+1536]; +ld.shared.u32 r5929, [r6144+2048]; +ld.shared.u32 r5979, [r6144+2560]; +ld.shared.u32 r6029, [r6144+3072]; +ld.shared.u32 r6079, [r6144+3584]; +ld.shared.u32 r5741, [r6144+4096]; +ld.shared.u32 r5791, [r6144+4608]; +ld.shared.u32 r5841, [r6144+5120]; +ld.shared.u32 r5891, [r6144+5632]; +ld.shared.u32 r5941, [r6144+6144]; +ld.shared.u32 r5991, [r6144+6656]; +ld.shared.u32 r6041, [r6144+7168]; +ld.shared.u32 r6091, [r6144+7680]; +ld.shared.u32 r5730, [r6144+8192]; +ld.shared.u32 r5780, [r6144+8704]; +ld.shared.u32 r5830, [r6144+9216]; +ld.shared.u32 r5880, [r6144+9728]; +ld.shared.u32 r5930, [r6144+10240]; +ld.shared.u32 r5980, [r6144+10752]; +ld.shared.u32 r6030, [r6144+11264]; +ld.shared.u32 r6080, [r6144+11776]; +ld.shared.u32 r5742, [r6144+12288]; +ld.shared.u32 r5792, [r6144+12800]; +ld.shared.u32 r5842, [r6144+13312]; +ld.shared.u32 r5892, [r6144+13824]; +ld.shared.u32 r5942, [r6144+14336]; +ld.shared.u32 r5992, [r6144+14848]; +ld.shared.u32 r6042, [r6144+15360]; +ld.shared.u32 r6092, [r6144+15872]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 %0, r5725, r5737; +} +{ +add.f16x2 %1, r5728, r5740; +} +{ +sub.f16x2 %32, r5725, r5737; +} +{ +sub.f16x2 %33, r5728, r5740; +} +{ +add.f16x2 %16, r5731, r5746; +} +{ +add.f16x2 %17, r5734, r5749; +} +{ +sub.f16x2 %48, r5731, r5746; +} +{ +sub.f16x2 %49, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, 
r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 %2, r5775, r5787; +} +{ +add.f16x2 %3, r5778, r5790; +} +{ +sub.f16x2 %34, r5775, r5787; +} +{ +sub.f16x2 %35, r5778, r5790; +} +{ +add.f16x2 %18, r5781, r5796; +} +{ +add.f16x2 %19, r5784, r5799; +} +{ +sub.f16x2 %50, r5781, r5796; +} +{ +sub.f16x2 %51, r5784, r5799; +} +{ +add.f16x2 r5825, r5826, r5827; +} +{ +add.f16x2 r5828, r5829, r5830; +} +{ +sub.f16x2 r5831, r5826, r5827; +} +{ +sub.f16x2 r5834, r5829, r5830; +} +{ +add.f16x2 r5837, r5838, r5839; +} +{ +add.f16x2 r5840, r5841, r5842; +} +{ +sub.f16x2 r5843, r5838, r5839; +} +{ +sub.f16x2 r5846, r5841, r5842; +} +{ +neg.f16x2 r5849, r5843; +} +{ +add.f16x2 %4, r5825, r5837; +} +{ +add.f16x2 %5, r5828, r5840; +} +{ +sub.f16x2 %36, r5825, r5837; +} +{ +sub.f16x2 %37, r5828, r5840; +} +{ +add.f16x2 %20, r5831, r5846; +} +{ +add.f16x2 %21, r5834, r5849; +} +{ +sub.f16x2 %52, r5831, r5846; +} +{ +sub.f16x2 %53, r5834, r5849; +} +{ +add.f16x2 r5875, r5876, r5877; +} +{ +add.f16x2 r5878, r5879, r5880; +} +{ +sub.f16x2 r5881, r5876, r5877; +} +{ +sub.f16x2 r5884, r5879, r5880; +} +{ +add.f16x2 r5887, r5888, r5889; +} +{ +add.f16x2 r5890, r5891, r5892; +} +{ +sub.f16x2 r5893, r5888, r5889; +} +{ +sub.f16x2 r5896, r5891, r5892; +} +{ +neg.f16x2 r5899, r5893; +} +{ +add.f16x2 %6, r5875, r5887; +} +{ +add.f16x2 %7, r5878, r5890; +} +{ +sub.f16x2 %38, r5875, r5887; +} +{ +sub.f16x2 %39, r5878, r5890; +} +{ +add.f16x2 %22, r5881, r5896; +} +{ +add.f16x2 %23, r5884, r5899; +} +{ +sub.f16x2 %54, r5881, r5896; +} +{ +sub.f16x2 %55, r5884, r5899; +} +{ +add.f16x2 r5925, r5926, r5927; +} +{ +add.f16x2 r5928, r5929, r5930; +} +{ +sub.f16x2 r5931, r5926, r5927; +} +{ +sub.f16x2 r5934, r5929, r5930; +} +{ +add.f16x2 r5937, r5938, r5939; +} +{ +add.f16x2 r5940, r5941, r5942; +} +{ +sub.f16x2 r5943, r5938, r5939; +} +{ +sub.f16x2 r5946, r5941, r5942; +} +{ +neg.f16x2 r5949, r5943; +} +{ +add.f16x2 %8, r5925, r5937; +} +{ +add.f16x2 
%9, r5928, r5940; +} +{ +sub.f16x2 %40, r5925, r5937; +} +{ +sub.f16x2 %41, r5928, r5940; +} +{ +add.f16x2 %24, r5931, r5946; +} +{ +add.f16x2 %25, r5934, r5949; +} +{ +sub.f16x2 %56, r5931, r5946; +} +{ +sub.f16x2 %57, r5934, r5949; +} +{ +add.f16x2 r5975, r5976, r5977; +} +{ +add.f16x2 r5978, r5979, r5980; +} +{ +sub.f16x2 r5981, r5976, r5977; +} +{ +sub.f16x2 r5984, r5979, r5980; +} +{ +add.f16x2 r5987, r5988, r5989; +} +{ +add.f16x2 r5990, r5991, r5992; +} +{ +sub.f16x2 r5993, r5988, r5989; +} +{ +sub.f16x2 r5996, r5991, r5992; +} +{ +neg.f16x2 r5999, r5993; +} +{ +add.f16x2 %10, r5975, r5987; +} +{ +add.f16x2 %11, r5978, r5990; +} +{ +sub.f16x2 %42, r5975, r5987; +} +{ +sub.f16x2 %43, r5978, r5990; +} +{ +add.f16x2 %26, r5981, r5996; +} +{ +add.f16x2 %27, r5984, r5999; +} +{ +sub.f16x2 %58, r5981, r5996; +} +{ +sub.f16x2 %59, r5984, r5999; +} +{ +add.f16x2 r6025, r6026, r6027; +} +{ +add.f16x2 r6028, r6029, r6030; +} +{ +sub.f16x2 r6031, r6026, r6027; +} +{ +sub.f16x2 r6034, r6029, r6030; +} +{ +add.f16x2 r6037, r6038, r6039; +} +{ +add.f16x2 r6040, r6041, r6042; +} +{ +sub.f16x2 r6043, r6038, r6039; +} +{ +sub.f16x2 r6046, r6041, r6042; +} +{ +neg.f16x2 r6049, r6043; +} +{ +add.f16x2 %12, r6025, r6037; +} +{ +add.f16x2 %13, r6028, r6040; +} +{ +sub.f16x2 %44, r6025, r6037; +} +{ +sub.f16x2 %45, r6028, r6040; +} +{ +add.f16x2 %28, r6031, r6046; +} +{ +add.f16x2 %29, r6034, r6049; +} +{ +sub.f16x2 %60, r6031, r6046; +} +{ +sub.f16x2 %61, r6034, r6049; +} +{ +add.f16x2 r6075, r6076, r6077; +} +{ +add.f16x2 r6078, r6079, r6080; +} +{ +sub.f16x2 r6081, r6076, r6077; +} +{ +sub.f16x2 r6084, r6079, r6080; +} +{ +add.f16x2 r6087, r6088, r6089; +} +{ +add.f16x2 r6090, r6091, r6092; +} +{ +sub.f16x2 r6093, r6088, r6089; +} +{ +sub.f16x2 r6096, r6091, r6092; +} +{ +neg.f16x2 r6099, r6093; +} +{ +add.f16x2 %14, r6075, r6087; +} +{ +add.f16x2 %15, r6078, r6090; +} +{ +sub.f16x2 %46, r6075, r6087; +} +{ +sub.f16x2 %47, r6078, r6090; +} +{ +add.f16x2 %30, r6081, r6096; +} 
+{ +add.f16x2 %31, r6084, r6099; +} +{ +sub.f16x2 %62, r6081, r6096; +} +{ +sub.f16x2 %63, r6084, r6099; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), 
"=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<853, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .f32 f<74>; +.reg .b32 r<930>; +.reg .b64 rd<2>; +{ +add.f16x2 r22, %9, %13; +} +{ +add.f16x2 r25, %10, %14; +} +{ +sub.f16x2 r28, %9, %13; +} +{ +sub.f16x2 r31, %10, %14; +} +{ +add.f16x2 r34, %11, %15; +} +{ +add.f16x2 r37, %12, %16; +} +{ +sub.f16x2 r40, %11, %15; +} +{ +sub.f16x2 r43, %12, %16; +} +{ +neg.f16x2 r46, r40; +} +{ +add.f16x2 r48, r22, r34; +} +{ +add.f16x2 r51, r25, r37; +} +{ +sub.f16x2 r54, r22, r34; +} +{ +sub.f16x2 r57, r25, r37; +} +{ +add.f16x2 r60, r28, r43; +} +{ +add.f16x2 r63, r31, r46; +} +{ +sub.f16x2 r66, r28, r43; +} +{ +sub.f16x2 r69, r31, r46; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 1023; +cvt.rn.f32.u32 f8, r10; +mul.f32 f1, f8, 0f3AC90FDB; +setp.eq.s32 p1, r10, 1020; +mov.f32 f73, 0f3BC90F88; +mov.f32 f72, f73; +@p1 bra LBB6_2; +cos.approx.f32 f72, f1; +LBB6_2: +mov.u32 r236, %tid.y; +shl.b32 r237, r236, 15; +mov.u32 r238, %8; +add.s32 r239, r238, r237; +shl.b32 r240, r9, 5; +and.b32 r241, r240, -32768; +add.s32 r12, r239, r241; +sin.approx.f32 f20, f1; +neg.f32 f10, f20; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f10; +mov.b32 r72, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r75, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r77, {high, high}; +} +{ +mul.f16x2 r79, r63, r77; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r60, r75, r82; +} +{ +mul.f16x2 r88, r60, r77; +} +{ +fma.rn.f16x2 r91, r63, r75, r88; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r72; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r97, {high, high}; +} +mov.f32 f15, 0fBF800000; +mov.f32 f16, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r99, {low, high}; +} +{ +mul.f16x2 r100, r97, r99; +} +{ +mul.f16x2 r103, r72, r95; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r106, {high, low}; +} +{ +fma.rn.f16x2 r108, r100, r106, r103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r114, {high, high}; +} +{ +mul.f16x2 r116, r57, r114; +} +{ +neg.f16x2 r119, r116; +} +{ +fma.rn.f16x2 r121, r54, r112, r119; +} +{ +mul.f16x2 r125, r54, r114; +} +{ +fma.rn.f16x2 r128, r57, r112, r125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r132, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r134, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r136, {low, high}; +} +{ +mul.f16x2 r137, r134, r136; +} +{ +mul.f16x2 r140, r108, r132; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r143, {high, low}; +} +{ +fma.rn.f16x2 r145, r137, r143, r140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r145; +mov.b32 r149, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r145; +mov.b32 r151, {high, high}; +} +{ +mul.f16x2 r153, r69, r151; +} +{ +neg.f16x2 r156, r153; +} +{ +fma.rn.f16x2 r158, r66, r149, r156; +} +{ +mul.f16x2 r162, r66, r151; +} +{ +fma.rn.f16x2 r165, r69, r149, r162; +} +barrier.sync 0; +shl.b32 r242, r10, 5; +add.s32 r243, r12, r242; +st.shared.v4.f32 [r243], {r48, r51, r84, r91}; +st.shared.v4.f32 [r243+16], {r121, r128, r158, r165}; +barrier.sync 0; +mad.lo.s32 r244, r10, -24, r243; +ld.shared.u32 r187, [r244]; +ld.shared.u32 r190, [r244+4]; +ld.shared.u32 r199, [r244+8192]; +ld.shared.u32 r202, [r244+8196]; 
+ld.shared.u32 r188, [r244+16384]; +ld.shared.u32 r191, [r244+16388]; +ld.shared.u32 r200, [r244+24576]; +ld.shared.u32 r203, [r244+24580]; +{ +add.f16x2 r186, r187, r188; +} +{ +add.f16x2 r189, r190, r191; +} +{ +sub.f16x2 r192, r187, r188; +} +{ +sub.f16x2 r195, r190, r191; +} +{ +add.f16x2 r198, r199, r200; +} +{ +add.f16x2 r201, r202, r203; +} +{ +sub.f16x2 r204, r199, r200; +} +{ +sub.f16x2 r207, r202, r203; +} +{ +neg.f16x2 r210, r204; +} +{ +add.f16x2 r212, r186, r198; +} +{ +add.f16x2 r215, r189, r201; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +sub.f16x2 r221, r189, r201; +} +{ +add.f16x2 r224, r192, r207; +} +{ +add.f16x2 r227, r195, r210; +} +{ +sub.f16x2 r230, r192, r207; +} +{ +sub.f16x2 r233, r195, r210; +} +and.b32 r21, r9, 1020; +bfe.u32 r245, r9, 2, 8; +cvt.rn.f32.u32 f21, r245; +mul.f32 f4, f21, 0f3BC90FDB; +setp.eq.s32 p2, r21, 1020; +@p2 bra LBB6_4; +cos.approx.f32 f73, f4; +LBB6_4: +sin.approx.f32 f62, f4; +neg.f32 f23, f62; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f23; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r249, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r251, {high, high}; +} +{ +mul.f16x2 r253, r227, r251; +} +{ +neg.f16x2 r256, r253; +} +{ +fma.rn.f16x2 r258, r224, r249, r256; +} +{ +mul.f16x2 r262, r224, r251; +} +{ +fma.rn.f16x2 r265, r227, r249, r262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r269, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r271, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r273, {low, high}; +} +{ +mul.f16x2 r274, r271, r273; +} +{ +mul.f16x2 r277, r246, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r280, {high, low}; +} +{ +fma.rn.f16x2 r282, r274, r280, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r282; +mov.b32 r286, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r282; +mov.b32 r288, {high, high}; +} +{ +mul.f16x2 r290, r221, r288; +} +{ +neg.f16x2 r293, r290; +} +{ +fma.rn.f16x2 r295, r218, r286, r293; +} +{ +mul.f16x2 r299, r218, r288; +} +{ +fma.rn.f16x2 r302, r221, r286, r299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r306, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r308, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r308, r310; +} +{ +mul.f16x2 r314, r282, r306; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r282; +mov.b32 r317, {high, low}; +} +{ +fma.rn.f16x2 r319, r311, r317, r314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r319; +mov.b32 r323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r319; +mov.b32 r325, {high, high}; +} +{ +mul.f16x2 r327, r233, r325; +} +{ +neg.f16x2 r330, r327; +} +{ +fma.rn.f16x2 r332, r230, r323, r330; +} +{ +mul.f16x2 r336, r230, r325; +} +{ +fma.rn.f16x2 r339, r233, r323, r336; +} +barrier.sync 0; +shl.b32 r902, r9, 3; +and.b32 r903, r902, 24; +add.s32 r904, r12, r903; +shl.b32 r905, r21, 5; +add.s32 r906, r904, r905; +st.shared.u32 [r906], r212; +st.shared.u32 [r906+4], r215; +st.shared.u32 [r906+32], r258; +st.shared.u32 [r906+36], r265; +st.shared.u32 [r906+64], r295; +st.shared.u32 [r906+68], r302; +st.shared.u32 [r906+96], r332; +st.shared.u32 [r906+100], r339; +barrier.sync 0; +mad.lo.s32 r907, r21, -24, r906; +ld.shared.u32 r361, [r907]; +ld.shared.u32 r364, [r907+4]; +ld.shared.u32 r373, [r907+8192]; +ld.shared.u32 r376, [r907+8196]; +ld.shared.u32 r362, [r907+16384]; +ld.shared.u32 r365, [r907+16388]; +ld.shared.u32 r374, [r907+24576]; +ld.shared.u32 r377, [r907+24580]; +{ +add.f16x2 r360, r361, r362; +} +{ +add.f16x2 r363, r364, r365; +} +{ +sub.f16x2 r366, r361, r362; +} +{ +sub.f16x2 r369, r364, r365; +} +{ +add.f16x2 r372, r373, r374; +} +{ +add.f16x2 r375, r376, 
r377; +} +{ +sub.f16x2 r378, r373, r374; +} +{ +sub.f16x2 r381, r376, r377; +} +{ +neg.f16x2 r384, r378; +} +{ +add.f16x2 r386, r360, r372; +} +{ +add.f16x2 r389, r363, r375; +} +{ +sub.f16x2 r392, r360, r372; +} +{ +sub.f16x2 r395, r363, r375; +} +{ +add.f16x2 r398, r366, r381; +} +{ +add.f16x2 r401, r369, r384; +} +{ +sub.f16x2 r404, r366, r381; +} +{ +sub.f16x2 r407, r369, r384; +} +and.b32 r908, r9, 1008; +bfe.u32 r909, r9, 4, 6; +cvt.rn.f32.u32 f63, r909; +mul.f32 f64, f63, 0f3CC90FDB; +cos.approx.f32 f32, f64; +sin.approx.f32 f65, f64; +neg.f32 f33, f65; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f33; +mov.b32 r410, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r413, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r415, {high, high}; +} +{ +mul.f16x2 r417, r401, r415; +} +{ +neg.f16x2 r420, r417; +} +{ +fma.rn.f16x2 r422, r398, r413, r420; +} +{ +mul.f16x2 r426, r398, r415; +} +{ +fma.rn.f16x2 r429, r401, r413, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r433, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r435, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r437, {low, high}; +} +{ +mul.f16x2 r438, r435, r437; +} +{ +mul.f16x2 r441, r410, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r444, {high, low}; +} +{ +fma.rn.f16x2 r446, r438, r444, r441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r450, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r452, {high, high}; +} +{ +mul.f16x2 r454, r395, r452; +} +{ +neg.f16x2 r457, r454; +} +{ +fma.rn.f16x2 r459, r392, r450, r457; +} +{ +mul.f16x2 r463, r392, r452; +} +{ +fma.rn.f16x2 r466, r395, r450, r463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r470, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; 
+mov.b32 r472, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r474, {low, high}; +} +{ +mul.f16x2 r475, r472, r474; +} +{ +mul.f16x2 r478, r446, r470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r481, {high, low}; +} +{ +fma.rn.f16x2 r483, r475, r481, r478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r483; +mov.b32 r487, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r483; +mov.b32 r489, {high, high}; +} +{ +mul.f16x2 r491, r407, r489; +} +{ +neg.f16x2 r494, r491; +} +{ +fma.rn.f16x2 r496, r404, r487, r494; +} +{ +mul.f16x2 r500, r404, r489; +} +{ +fma.rn.f16x2 r503, r407, r487, r500; +} +and.b32 r910, r902, 120; +add.s32 r911, r12, r910; +barrier.sync 0; +and.b32 r913, r240, 32256; +add.s32 r914, r911, r913; +st.shared.u32 [r914], r386; +st.shared.u32 [r914+4], r389; +st.shared.u32 [r914+128], r422; +st.shared.u32 [r914+132], r429; +st.shared.u32 [r914+256], r459; +st.shared.u32 [r914+260], r466; +st.shared.u32 [r914+384], r496; +st.shared.u32 [r914+388], r503; +barrier.sync 0; +mad.lo.s32 r915, r908, -24, r914; +ld.shared.u32 r525, [r915]; +ld.shared.u32 r528, [r915+4]; +ld.shared.u32 r537, [r915+8192]; +ld.shared.u32 r540, [r915+8196]; +ld.shared.u32 r526, [r915+16384]; +ld.shared.u32 r529, [r915+16388]; +ld.shared.u32 r538, [r915+24576]; +ld.shared.u32 r541, [r915+24580]; +{ +add.f16x2 r524, r525, r526; +} +{ +add.f16x2 r527, r528, r529; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +sub.f16x2 r533, r528, r529; +} +{ +add.f16x2 r536, r537, r538; +} +{ +add.f16x2 r539, r540, r541; +} +{ +sub.f16x2 r542, r537, r538; +} +{ +sub.f16x2 r545, r540, r541; +} +{ +neg.f16x2 r548, r542; +} +{ +add.f16x2 r550, r524, r536; +} +{ +add.f16x2 r553, r527, r539; +} +{ +sub.f16x2 r556, r524, r536; +} +{ +sub.f16x2 r559, r527, r539; +} +{ +add.f16x2 r562, r530, r545; +} +{ +add.f16x2 r565, r533, r548; +} +{ +sub.f16x2 r568, r530, r545; +} +{ +sub.f16x2 r571, r533, r548; +} +and.b32 r916, 
r9, 960; +bfe.u32 r917, r9, 6, 4; +cvt.rn.f32.u32 f66, r917; +mul.f32 f67, f66, 0f3DC90FDB; +cos.approx.f32 f42, f67; +sin.approx.f32 f68, f67; +neg.f32 f43, f68; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r574, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r579, {high, high}; +} +{ +mul.f16x2 r581, r565, r579; +} +{ +neg.f16x2 r584, r581; +} +{ +fma.rn.f16x2 r586, r562, r577, r584; +} +{ +mul.f16x2 r590, r562, r579; +} +{ +fma.rn.f16x2 r593, r565, r577, r590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r601, {low, high}; +} +{ +mul.f16x2 r602, r599, r601; +} +{ +mul.f16x2 r605, r574, r597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r608, {high, low}; +} +{ +fma.rn.f16x2 r610, r602, r608, r605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r616, {high, high}; +} +{ +mul.f16x2 r618, r559, r616; +} +{ +neg.f16x2 r621, r618; +} +{ +fma.rn.f16x2 r623, r556, r614, r621; +} +{ +mul.f16x2 r627, r556, r616; +} +{ +fma.rn.f16x2 r630, r559, r614, r627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r638, {low, high}; +} +{ +mul.f16x2 r639, r636, r638; +} +{ +mul.f16x2 r642, r610, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r645, {high, low}; +} +{ +fma.rn.f16x2 r647, r639, r645, r642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r647; +mov.b32 r651, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r647; +mov.b32 r653, {high, high}; +} +{ +mul.f16x2 r655, r571, r653; +} +{ +neg.f16x2 r658, r655; +} +{ +fma.rn.f16x2 r660, r568, r651, r658; +} +{ +mul.f16x2 r664, r568, r653; +} +{ +fma.rn.f16x2 r667, r571, r651, r664; +} +and.b32 r918, r902, 504; +add.s32 r919, r12, r918; +barrier.sync 0; +and.b32 r920, r240, 30720; +add.s32 r921, r919, r920; +st.shared.u32 [r921], r550; +st.shared.u32 [r921+4], r553; +st.shared.u32 [r921+512], r586; +st.shared.u32 [r921+516], r593; +st.shared.u32 [r921+1024], r623; +st.shared.u32 [r921+1028], r630; +st.shared.u32 [r921+1536], r660; +st.shared.u32 [r921+1540], r667; +barrier.sync 0; +mad.lo.s32 r922, r916, -24, r921; +ld.shared.u32 r689, [r922]; +ld.shared.u32 r692, [r922+4]; +ld.shared.u32 r701, [r922+8192]; +ld.shared.u32 r704, [r922+8196]; +ld.shared.u32 r690, [r922+16384]; +ld.shared.u32 r693, [r922+16388]; +ld.shared.u32 r702, [r922+24576]; +ld.shared.u32 r705, [r922+24580]; +{ +add.f16x2 r688, r689, r690; +} +{ +add.f16x2 r691, r692, r693; +} +{ +sub.f16x2 r694, r689, r690; +} +{ +sub.f16x2 r697, r692, r693; +} +{ +add.f16x2 r700, r701, r702; +} +{ +add.f16x2 r703, r704, r705; +} +{ +sub.f16x2 r706, r701, r702; +} +{ +sub.f16x2 r709, r704, r705; +} +{ +neg.f16x2 r712, r706; +} +{ +add.f16x2 r714, r688, r700; +} +{ +add.f16x2 r717, r691, r703; +} +{ +sub.f16x2 r720, r688, r700; +} +{ +sub.f16x2 r723, r691, r703; +} +{ +add.f16x2 r726, r694, r709; +} +{ +add.f16x2 r729, r697, r712; +} +{ +sub.f16x2 r732, r694, r709; +} +{ +sub.f16x2 r735, r697, r712; +} +and.b32 r923, r9, 768; +bfe.u32 r924, r9, 8, 2; +cvt.rn.f32.u32 f69, r924; +mul.f32 f70, f69, 0f3EC90FDB; +cos.approx.f32 f52, f70; +sin.approx.f32 f71, f70; +neg.f32 f53, f71; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f53; +mov.b32 r738, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r738; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r729, r743; +} +{ +neg.f16x2 r748, r745; +} +{ +fma.rn.f16x2 r750, r726, r741, r748; +} +{ +mul.f16x2 r754, r726, r743; +} +{ +fma.rn.f16x2 r757, r729, r741, r754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r738, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r723, r780; +} +{ +neg.f16x2 r785, r782; +} +{ +fma.rn.f16x2 r787, r720, r778, r785; +} +{ +mul.f16x2 r791, r720, r780; +} +{ +fma.rn.f16x2 r794, r723, r778, r791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r735, r817; +} +{ +neg.f16x2 r822, r819; +} +{ +fma.rn.f16x2 r824, r732, r815, r822; +} +{ +mul.f16x2 r828, r732, r817; +} +{ +fma.rn.f16x2 r831, r735, r815, r828; +} +and.b32 r925, r902, 2040; +add.s32 r926, r12, r925; +barrier.sync 0; +and.b32 r927, r240, 24576; +add.s32 r928, 
r926, r927; +st.shared.u32 [r928], r714; +st.shared.u32 [r928+4], r717; +st.shared.u32 [r928+2048], r750; +st.shared.u32 [r928+2052], r757; +st.shared.u32 [r928+4096], r787; +st.shared.u32 [r928+4100], r794; +st.shared.u32 [r928+6144], r824; +st.shared.u32 [r928+6148], r831; +barrier.sync 0; +mad.lo.s32 r929, r923, -24, r928; +ld.shared.u32 r853, [r929]; +ld.shared.u32 r856, [r929+4]; +ld.shared.u32 r865, [r929+8192]; +ld.shared.u32 r868, [r929+8196]; +ld.shared.u32 r854, [r929+16384]; +ld.shared.u32 r857, [r929+16388]; +ld.shared.u32 r866, [r929+24576]; +ld.shared.u32 r869, [r929+24580]; +{ +add.f16x2 r852, r853, r854; +} +{ +add.f16x2 r855, r856, r857; +} +{ +sub.f16x2 r858, r853, r854; +} +{ +sub.f16x2 r861, r856, r857; +} +{ +add.f16x2 r864, r865, r866; +} +{ +add.f16x2 r867, r868, r869; +} +{ +sub.f16x2 r870, r865, r866; +} +{ +sub.f16x2 r873, r868, r869; +} +{ +neg.f16x2 r876, r870; +} +{ +add.f16x2 %0, r852, r864; +} +{ +add.f16x2 %1, r855, r867; +} +{ +sub.f16x2 %4, r852, r864; +} +{ +sub.f16x2 %5, r855, r867; +} +{ +add.f16x2 %2, r858, r873; +} +{ +add.f16x2 %3, r861, r876; +} +{ +sub.f16x2 %6, r858, r873; +} +{ +sub.f16x2 %7, r861, r876; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<854, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .f32 f<74>; +.reg .b32 r<929>; +.reg .b64 rd<2>; +{ +add.f16x2 r21, %9, %13; +} +{ +add.f16x2 r24, %10, 
%14; +} +{ +sub.f16x2 r27, %9, %13; +} +{ +sub.f16x2 r30, %10, %14; +} +{ +add.f16x2 r33, %11, %15; +} +{ +add.f16x2 r36, %12, %16; +} +{ +sub.f16x2 r39, %11, %15; +} +{ +sub.f16x2 r42, %12, %16; +} +{ +neg.f16x2 r45, r39; +} +{ +add.f16x2 r47, r21, r33; +} +{ +add.f16x2 r50, r24, r36; +} +{ +sub.f16x2 r53, r21, r33; +} +{ +sub.f16x2 r56, r24, r36; +} +{ +add.f16x2 r59, r27, r42; +} +{ +add.f16x2 r62, r30, r45; +} +{ +sub.f16x2 r65, r27, r42; +} +{ +sub.f16x2 r68, r30, r45; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 1023; +cvt.rn.f32.u32 f8, r10; +mul.f32 f1, f8, 0f3AC90FDB; +setp.eq.s32 p1, r10, 1020; +mov.f32 f73, 0f3BC90F88; +mov.f32 f72, f73; +@p1 bra LBB7_2; +cos.approx.f32 f72, f1; +LBB7_2: +mov.u32 r235, %tid.y; +shl.b32 r236, r235, 14; +mov.u32 r237, %8; +add.s32 r238, r237, r236; +shl.b32 r239, r9, 4; +and.b32 r240, r239, -16384; +add.s32 r11, r238, r240; +sin.approx.f32 f20, f1; +neg.f32 f10, f20; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f10; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r62, r76; +} +{ +neg.f16x2 r81, r78; +} +{ +fma.rn.f16x2 r83, r59, r74, r81; +} +{ +mul.f16x2 r87, r59, r76; +} +{ +fma.rn.f16x2 r90, r62, r74, r87; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r94, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r96, {high, high}; +} +mov.f32 f15, 0fBF800000; +mov.f32 f16, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r98, {low, high}; +} +{ +mul.f16x2 r99, r96, r98; +} +{ +mul.f16x2 r102, r71, r94; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r105, {high, low}; +} +{ +fma.rn.f16x2 r107, r99, r105, r102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r107; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r107; +mov.b32 r113, {high, high}; +} +{ +mul.f16x2 r115, r56, r113; +} +{ +neg.f16x2 r118, r115; +} +{ +fma.rn.f16x2 r120, r53, r111, r118; +} +{ +mul.f16x2 r124, r53, r113; +} +{ +fma.rn.f16x2 r127, r56, r111, r124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r131, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r133, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r133, r135; +} +{ +mul.f16x2 r139, r107, r131; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r107; +mov.b32 r142, {high, low}; +} +{ +fma.rn.f16x2 r144, r136, r142, r139; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r148, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r150, {high, high}; +} +{ +mul.f16x2 r152, r68, r150; +} +{ +neg.f16x2 r155, r152; +} +{ +fma.rn.f16x2 r157, r65, r148, r155; +} +{ +mul.f16x2 r161, r65, r150; +} +{ +fma.rn.f16x2 r164, r68, r148, r161; +} +barrier.sync 0; +shl.b32 r241, r10, 4; +add.s32 r242, r11, r241; +st.shared.v4.f32 [r242], {r47, r83, r120, r157}; +barrier.sync 0; +mad.lo.s32 r243, r10, -12, r242; +ld.shared.u32 r186, [r243]; +ld.shared.u32 r198, [r243+4096]; +ld.shared.u32 r187, [r243+8192]; +ld.shared.u32 r199, [r243+12288]; +barrier.sync 0; +st.shared.v4.f32 [r242], {r50, r90, r127, r164}; +barrier.sync 0; +ld.shared.u32 r189, [r243]; +ld.shared.u32 r201, [r243+4096]; +ld.shared.u32 r190, [r243+8192]; +ld.shared.u32 r202, [r243+12288]; +{ +add.f16x2 r185, r186, r187; +} +{ +add.f16x2 r188, r189, r190; +} +{ +sub.f16x2 r191, r186, r187; +} +{ +sub.f16x2 r194, r189, r190; +} +{ +add.f16x2 r197, r198, r199; +} +{ +add.f16x2 r200, r201, r202; +} +{ +sub.f16x2 r203, r198, r199; +} +{ +sub.f16x2 r206, r201, r202; +} +{ +neg.f16x2 r209, r203; +} +{ +add.f16x2 r211, r185, r197; +} +{ +add.f16x2 r214, r188, r200; +} +{ +sub.f16x2 r217, r185, r197; +} +{ 
+sub.f16x2 r220, r188, r200; +} +{ +add.f16x2 r223, r191, r206; +} +{ +add.f16x2 r226, r194, r209; +} +{ +sub.f16x2 r229, r191, r206; +} +{ +sub.f16x2 r232, r194, r209; +} +and.b32 r20, r9, 1020; +bfe.u32 r244, r9, 2, 8; +cvt.rn.f32.u32 f21, r244; +mul.f32 f4, f21, 0f3BC90FDB; +setp.eq.s32 p2, r20, 1020; +@p2 bra LBB7_4; +cos.approx.f32 f73, f4; +LBB7_4: +sin.approx.f32 f62, f4; +neg.f32 f23, f62; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f23; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r250, {high, high}; +} +{ +mul.f16x2 r252, r226, r250; +} +{ +neg.f16x2 r255, r252; +} +{ +fma.rn.f16x2 r257, r223, r248, r255; +} +{ +mul.f16x2 r261, r223, r250; +} +{ +fma.rn.f16x2 r264, r226, r248, r261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r273, r270, r272; +} +{ +mul.f16x2 r276, r245, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r279, {high, low}; +} +{ +fma.rn.f16x2 r281, r273, r279, r276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r287, {high, high}; +} +{ +mul.f16x2 r289, r220, r287; +} +{ +neg.f16x2 r292, r289; +} +{ +fma.rn.f16x2 r294, r217, r285, r292; +} +{ +mul.f16x2 r298, r217, r287; +} +{ +fma.rn.f16x2 r301, r220, r285, r298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r309, {low, high}; +} +{ +mul.f16x2 r310, r307, 
r309; +} +{ +mul.f16x2 r313, r281, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r316, {high, low}; +} +{ +fma.rn.f16x2 r318, r310, r316, r313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r318; +mov.b32 r322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r318; +mov.b32 r324, {high, high}; +} +{ +mul.f16x2 r326, r232, r324; +} +{ +neg.f16x2 r329, r326; +} +{ +fma.rn.f16x2 r331, r229, r322, r329; +} +{ +mul.f16x2 r335, r229, r324; +} +{ +fma.rn.f16x2 r338, r232, r322, r335; +} +barrier.sync 0; +shl.b32 r901, r9, 2; +and.b32 r902, r901, 12; +add.s32 r903, r11, r902; +shl.b32 r904, r20, 4; +add.s32 r905, r903, r904; +st.shared.u32 [r905], r211; +st.shared.u32 [r905+16], r257; +st.shared.u32 [r905+32], r294; +st.shared.u32 [r905+48], r331; +barrier.sync 0; +mad.lo.s32 r906, r20, -12, r905; +ld.shared.u32 r360, [r906]; +ld.shared.u32 r372, [r906+4096]; +ld.shared.u32 r361, [r906+8192]; +ld.shared.u32 r373, [r906+12288]; +barrier.sync 0; +st.shared.u32 [r905], r214; +st.shared.u32 [r905+16], r264; +st.shared.u32 [r905+32], r301; +st.shared.u32 [r905+48], r338; +barrier.sync 0; +ld.shared.u32 r363, [r906]; +ld.shared.u32 r375, [r906+4096]; +ld.shared.u32 r364, [r906+8192]; +ld.shared.u32 r376, [r906+12288]; +{ +add.f16x2 r359, r360, r361; +} +{ +add.f16x2 r362, r363, r364; +} +{ +sub.f16x2 r365, r360, r361; +} +{ +sub.f16x2 r368, r363, r364; +} +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +{ +neg.f16x2 r383, r377; +} +{ +add.f16x2 r385, r359, r371; +} +{ +add.f16x2 r388, r362, r374; +} +{ +sub.f16x2 r391, r359, r371; +} +{ +sub.f16x2 r394, r362, r374; +} +{ +add.f16x2 r397, r365, r380; +} +{ +add.f16x2 r400, r368, r383; +} +{ +sub.f16x2 r403, r365, r380; +} +{ +sub.f16x2 r406, r368, r383; +} +and.b32 r907, r9, 1008; +bfe.u32 r908, r9, 4, 6; +and.b32 r909, r901, 60; +add.s32 r910, r11, r909; +cvt.rn.f32.u32 f63, r908; +mul.f32 f64, 
f63, 0f3CC90FDB; +cos.approx.f32 f32, f64; +sin.approx.f32 f65, f64; +neg.f32 f33, f65; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f33; +mov.b32 r409, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r414, {high, high}; +} +{ +mul.f16x2 r416, r400, r414; +} +{ +neg.f16x2 r419, r416; +} +{ +fma.rn.f16x2 r421, r397, r412, r419; +} +{ +mul.f16x2 r425, r397, r414; +} +{ +fma.rn.f16x2 r428, r400, r412, r425; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r434, r436; +} +{ +mul.f16x2 r440, r409, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r443, {high, low}; +} +{ +fma.rn.f16x2 r445, r437, r443, r440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r451, {high, high}; +} +{ +mul.f16x2 r453, r394, r451; +} +{ +neg.f16x2 r456, r453; +} +{ +fma.rn.f16x2 r458, r391, r449, r456; +} +{ +mul.f16x2 r462, r391, r451; +} +{ +fma.rn.f16x2 r465, r394, r449, r462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r473, {low, high}; +} +{ +mul.f16x2 r474, r471, r473; +} +{ +mul.f16x2 r477, r445, r469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r480, {high, low}; +} +{ +fma.rn.f16x2 r482, r474, r480, r477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r482; +mov.b32 r486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r482; 
+mov.b32 r488, {high, high}; +} +{ +mul.f16x2 r490, r406, r488; +} +{ +neg.f16x2 r493, r490; +} +{ +fma.rn.f16x2 r495, r403, r486, r493; +} +{ +mul.f16x2 r499, r403, r488; +} +{ +fma.rn.f16x2 r502, r406, r486, r499; +} +barrier.sync 0; +and.b32 r912, r239, 16128; +add.s32 r913, r910, r912; +st.shared.u32 [r913], r385; +st.shared.u32 [r913+64], r421; +st.shared.u32 [r913+128], r458; +st.shared.u32 [r913+192], r495; +barrier.sync 0; +mad.lo.s32 r914, r907, -12, r913; +ld.shared.u32 r524, [r914]; +ld.shared.u32 r536, [r914+4096]; +ld.shared.u32 r525, [r914+8192]; +ld.shared.u32 r537, [r914+12288]; +barrier.sync 0; +st.shared.u32 [r913], r388; +st.shared.u32 [r913+64], r428; +st.shared.u32 [r913+128], r465; +st.shared.u32 [r913+192], r502; +barrier.sync 0; +ld.shared.u32 r527, [r914]; +ld.shared.u32 r539, [r914+4096]; +ld.shared.u32 r528, [r914+8192]; +ld.shared.u32 r540, [r914+12288]; +{ +add.f16x2 r523, r524, r525; +} +{ +add.f16x2 r526, r527, r528; +} +{ +sub.f16x2 r529, r524, r525; +} +{ +sub.f16x2 r532, r527, r528; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r540; +} +{ +sub.f16x2 r541, r536, r537; +} +{ +sub.f16x2 r544, r539, r540; +} +{ +neg.f16x2 r547, r541; +} +{ +add.f16x2 r549, r523, r535; +} +{ +add.f16x2 r552, r526, r538; +} +{ +sub.f16x2 r555, r523, r535; +} +{ +sub.f16x2 r558, r526, r538; +} +{ +add.f16x2 r561, r529, r544; +} +{ +add.f16x2 r564, r532, r547; +} +{ +sub.f16x2 r567, r529, r544; +} +{ +sub.f16x2 r570, r532, r547; +} +and.b32 r915, r9, 960; +bfe.u32 r916, r9, 6, 4; +and.b32 r917, r901, 252; +add.s32 r918, r11, r917; +cvt.rn.f32.u32 f66, r916; +mul.f32 f67, f66, 0f3DC90FDB; +cos.approx.f32 f42, f67; +sin.approx.f32 f68, f67; +neg.f32 f43, f68; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r573, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r578, {high, high}; +} +{ 
+mul.f16x2 r580, r564, r578; +} +{ +neg.f16x2 r583, r580; +} +{ +fma.rn.f16x2 r585, r561, r576, r583; +} +{ +mul.f16x2 r589, r561, r578; +} +{ +fma.rn.f16x2 r592, r564, r576, r589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r596, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r598, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r598, r600; +} +{ +mul.f16x2 r604, r573, r596; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r607, {high, low}; +} +{ +fma.rn.f16x2 r609, r601, r607, r604; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r615, {high, high}; +} +{ +mul.f16x2 r617, r558, r615; +} +{ +neg.f16x2 r620, r617; +} +{ +fma.rn.f16x2 r622, r555, r613, r620; +} +{ +mul.f16x2 r626, r555, r615; +} +{ +fma.rn.f16x2 r629, r558, r613, r626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r633, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r635, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r637, {low, high}; +} +{ +mul.f16x2 r638, r635, r637; +} +{ +mul.f16x2 r641, r609, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r644, {high, low}; +} +{ +fma.rn.f16x2 r646, r638, r644, r641; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r646; +mov.b32 r650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r646; +mov.b32 r652, {high, high}; +} +{ +mul.f16x2 r654, r570, r652; +} +{ +neg.f16x2 r657, r654; +} +{ +fma.rn.f16x2 r659, r567, r650, r657; +} +{ +mul.f16x2 r663, r567, r652; +} +{ +fma.rn.f16x2 r666, r570, r650, r663; +} +barrier.sync 0; +and.b32 r919, r239, 15360; +add.s32 r920, r918, r919; +st.shared.u32 [r920], r549; +st.shared.u32 [r920+256], r585; +st.shared.u32 
[r920+512], r622; +st.shared.u32 [r920+768], r659; +barrier.sync 0; +mad.lo.s32 r921, r915, -12, r920; +ld.shared.u32 r688, [r921]; +ld.shared.u32 r700, [r921+4096]; +ld.shared.u32 r689, [r921+8192]; +ld.shared.u32 r701, [r921+12288]; +barrier.sync 0; +st.shared.u32 [r920], r552; +st.shared.u32 [r920+256], r592; +st.shared.u32 [r920+512], r629; +st.shared.u32 [r920+768], r666; +barrier.sync 0; +ld.shared.u32 r691, [r921]; +ld.shared.u32 r703, [r921+4096]; +ld.shared.u32 r692, [r921+8192]; +ld.shared.u32 r704, [r921+12288]; +{ +add.f16x2 r687, r688, r689; +} +{ +add.f16x2 r690, r691, r692; +} +{ +sub.f16x2 r693, r688, r689; +} +{ +sub.f16x2 r696, r691, r692; +} +{ +add.f16x2 r699, r700, r701; +} +{ +add.f16x2 r702, r703, r704; +} +{ +sub.f16x2 r705, r700, r701; +} +{ +sub.f16x2 r708, r703, r704; +} +{ +neg.f16x2 r711, r705; +} +{ +add.f16x2 r713, r687, r699; +} +{ +add.f16x2 r716, r690, r702; +} +{ +sub.f16x2 r719, r687, r699; +} +{ +sub.f16x2 r722, r690, r702; +} +{ +add.f16x2 r725, r693, r708; +} +{ +add.f16x2 r728, r696, r711; +} +{ +sub.f16x2 r731, r693, r708; +} +{ +sub.f16x2 r734, r696, r711; +} +and.b32 r922, r9, 768; +bfe.u32 r923, r9, 8, 2; +and.b32 r924, r901, 1020; +add.s32 r925, r11, r924; +cvt.rn.f32.u32 f69, r923; +mul.f32 f70, f69, 0f3EC90FDB; +cos.approx.f32 f52, f70; +sin.approx.f32 f71, f70; +neg.f32 f53, f71; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f53; +mov.b32 r737, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r742, {high, high}; +} +{ +mul.f16x2 r744, r728, r742; +} +{ +neg.f16x2 r747, r744; +} +{ +fma.rn.f16x2 r749, r725, r740, r747; +} +{ +mul.f16x2 r753, r725, r742; +} +{ +fma.rn.f16x2 r756, r728, r740, r753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r762, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r764, {low, high}; +} +{ +mul.f16x2 r765, r762, r764; +} +{ +mul.f16x2 r768, r737, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r771, {high, low}; +} +{ +fma.rn.f16x2 r773, r765, r771, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r779, {high, high}; +} +{ +mul.f16x2 r781, r722, r779; +} +{ +neg.f16x2 r784, r781; +} +{ +fma.rn.f16x2 r786, r719, r777, r784; +} +{ +mul.f16x2 r790, r719, r779; +} +{ +fma.rn.f16x2 r793, r722, r777, r790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r801, {low, high}; +} +{ +mul.f16x2 r802, r799, r801; +} +{ +mul.f16x2 r805, r773, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r808, {high, low}; +} +{ +fma.rn.f16x2 r810, r802, r808, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r810; +mov.b32 r814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r810; +mov.b32 r816, {high, high}; +} +{ +mul.f16x2 r818, r734, r816; +} +{ +neg.f16x2 r821, r818; +} +{ +fma.rn.f16x2 r823, r731, r814, r821; +} +{ +mul.f16x2 r827, r731, r816; +} +{ +fma.rn.f16x2 r830, r734, r814, r827; +} +barrier.sync 0; +and.b32 r926, r239, 12288; +add.s32 r927, r925, r926; +st.shared.u32 [r927], r713; +st.shared.u32 [r927+1024], r749; +st.shared.u32 [r927+2048], r786; +st.shared.u32 [r927+3072], r823; +barrier.sync 0; +mad.lo.s32 r928, r922, -12, r927; +ld.shared.u32 r852, [r928]; +ld.shared.u32 r864, [r928+4096]; +ld.shared.u32 r853, [r928+8192]; +ld.shared.u32 r865, [r928+12288]; +barrier.sync 0; +st.shared.u32 [r927], r716; +st.shared.u32 [r927+1024], r756; +st.shared.u32 [r927+2048], r793; +st.shared.u32 
[r927+3072], r830; +barrier.sync 0; +ld.shared.u32 r855, [r928]; +ld.shared.u32 r867, [r928+4096]; +ld.shared.u32 r856, [r928+8192]; +ld.shared.u32 r868, [r928+12288]; +{ +add.f16x2 r851, r852, r853; +} +{ +add.f16x2 r854, r855, r856; +} +{ +sub.f16x2 r857, r852, r853; +} +{ +sub.f16x2 r860, r855, r856; +} +{ +add.f16x2 r863, r864, r865; +} +{ +add.f16x2 r866, r867, r868; +} +{ +sub.f16x2 r869, r864, r865; +} +{ +sub.f16x2 r872, r867, r868; +} +{ +neg.f16x2 r875, r869; +} +{ +add.f16x2 %0, r851, r863; +} +{ +add.f16x2 %1, r854, r866; +} +{ +sub.f16x2 %4, r851, r863; +} +{ +sub.f16x2 %5, r854, r866; +} +{ +add.f16x2 %2, r857, r872; +} +{ +add.f16x2 %3, r860, r875; +} +{ +sub.f16x2 %6, r857, r872; +} +{ +sub.f16x2 %7, r860, r875; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..6b00d9d13c0cf --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp16_inv.hpp.inc @@ -0,0 +1,29864 @@ +#ifndef CUFFTDX_FFT_4096_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_4096_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1049, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<423>; +.reg .b32 r<2985>; +.reg .b64 rd<2>; +mov.u32 r2965, %tid.y; +shl.b32 r2966, r2965, 14; +mov.u32 r2967, %32; +add.s32 r2968, r2967, r2966; +mov.u32 r2969, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 
r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; 
+mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 
high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; 
+} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2970, r2969, 255; +shl.b32 r2971, r2969, 6; +and.b32 r2972, r2971, -16384; +add.s32 r2973, r2968, r2972; +cvt.rn.f32.u32 f417, r2970; +mul.f32 f418, f417, 0f3AC90FDB; +cos.approx.f32 f117, f418; +sin.approx.f32 f419, f418; +neg.f32 f118, f419; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 
r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, 
{high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, 
r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2974, r2971, 16320; +add.s32 r2975, r2973, r2974; +st.shared.v4.f32 [r2975], {r521, r627, r664, r701}; +st.shared.v4.f32 [r2975+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r2975+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r2975+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r2976, r2970, -60, r2975; +ld.shared.u32 r1176, [r2976]; +ld.shared.u32 r1372, [r2976+1024]; +ld.shared.u32 r1226, [r2976+2048]; 
+ld.shared.u32 r1422, [r2976+3072]; +ld.shared.u32 r1188, [r2976+4096]; +ld.shared.u32 r1384, [r2976+5120]; +ld.shared.u32 r1238, [r2976+6144]; +ld.shared.u32 r1434, [r2976+7168]; +ld.shared.u32 r1177, [r2976+8192]; +ld.shared.u32 r1373, [r2976+9216]; +ld.shared.u32 r1227, [r2976+10240]; +ld.shared.u32 r1423, [r2976+11264]; +ld.shared.u32 r1189, [r2976+12288]; +ld.shared.u32 r1385, [r2976+13312]; +ld.shared.u32 r1239, [r2976+14336]; +ld.shared.u32 r1435, [r2976+15360]; +barrier.sync 0; +st.shared.v4.f32 [r2975], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2975+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2975+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2975+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2976]; +ld.shared.u32 r1375, [r2976+1024]; +ld.shared.u32 r1229, [r2976+2048]; +ld.shared.u32 r1425, [r2976+3072]; +ld.shared.u32 r1191, [r2976+4096]; +ld.shared.u32 r1387, [r2976+5120]; +ld.shared.u32 r1241, [r2976+6144]; +ld.shared.u32 r1437, [r2976+7168]; +ld.shared.u32 r1180, [r2976+8192]; +ld.shared.u32 r1376, [r2976+9216]; +ld.shared.u32 r1230, [r2976+10240]; +ld.shared.u32 r1426, [r2976+11264]; +ld.shared.u32 r1192, [r2976+12288]; +ld.shared.u32 r1388, [r2976+13312]; +ld.shared.u32 r1242, [r2976+14336]; +ld.shared.u32 r1438, [r2976+15360]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ 
+add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, 
r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, 
r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, 
r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2977, r2969, 240; +bfe.u32 r2978, r2969, 4, 4; +shl.b32 r2979, r2969, 2; +and.b32 r2980, r2979, 60; +add.s32 r2981, r2973, r2980; +cvt.rn.f32.u32 f420, r2978; +mul.f32 f421, f420, 0f3CC90FDB; +cos.approx.f32 f267, f421; +sin.approx.f32 f422, f421; +neg.f32 f268, f422; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 
r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; 
+cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, 
r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, 
r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +barrier.sync 0; +and.b32 r2982, r2971, 15360; +add.s32 r2983, r2981, r2982; +st.shared.u32 [r2983], r1695; +st.shared.u32 [r2983+64], r1801; +st.shared.u32 [r2983+128], r1838; +st.shared.u32 [r2983+192], r1875; +st.shared.u32 [r2983+256], r1912; +st.shared.u32 [r2983+320], r1949; +st.shared.u32 [r2983+384], r1986; +st.shared.u32 [r2983+448], r2023; +st.shared.u32 [r2983+512], r2060; 
+st.shared.u32 [r2983+576], r2097; +st.shared.u32 [r2983+640], r2134; +st.shared.u32 [r2983+704], r2171; +st.shared.u32 [r2983+768], r2208; +st.shared.u32 [r2983+832], r2245; +st.shared.u32 [r2983+896], r2282; +st.shared.u32 [r2983+960], r2319; +barrier.sync 0; +mad.lo.s32 r2984, r2977, -60, r2983; +ld.shared.u32 r2350, [r2984]; +ld.shared.u32 r2546, [r2984+1024]; +ld.shared.u32 r2400, [r2984+2048]; +ld.shared.u32 r2596, [r2984+3072]; +ld.shared.u32 r2362, [r2984+4096]; +ld.shared.u32 r2558, [r2984+5120]; +ld.shared.u32 r2412, [r2984+6144]; +ld.shared.u32 r2608, [r2984+7168]; +ld.shared.u32 r2351, [r2984+8192]; +ld.shared.u32 r2547, [r2984+9216]; +ld.shared.u32 r2401, [r2984+10240]; +ld.shared.u32 r2597, [r2984+11264]; +ld.shared.u32 r2363, [r2984+12288]; +ld.shared.u32 r2559, [r2984+13312]; +ld.shared.u32 r2413, [r2984+14336]; +ld.shared.u32 r2609, [r2984+15360]; +barrier.sync 0; +st.shared.u32 [r2983], r1698; +st.shared.u32 [r2983+64], r1810; +st.shared.u32 [r2983+128], r1847; +st.shared.u32 [r2983+192], r1884; +st.shared.u32 [r2983+256], r1921; +st.shared.u32 [r2983+320], r1958; +st.shared.u32 [r2983+384], r1995; +st.shared.u32 [r2983+448], r2032; +st.shared.u32 [r2983+512], r2069; +st.shared.u32 [r2983+576], r2106; +st.shared.u32 [r2983+640], r2143; +st.shared.u32 [r2983+704], r2180; +st.shared.u32 [r2983+768], r2217; +st.shared.u32 [r2983+832], r2254; +st.shared.u32 [r2983+896], r2291; +st.shared.u32 [r2983+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2984]; +ld.shared.u32 r2549, [r2984+1024]; +ld.shared.u32 r2403, [r2984+2048]; +ld.shared.u32 r2599, [r2984+3072]; +ld.shared.u32 r2365, [r2984+4096]; +ld.shared.u32 r2561, [r2984+5120]; +ld.shared.u32 r2415, [r2984+6144]; +ld.shared.u32 r2611, [r2984+7168]; +ld.shared.u32 r2354, [r2984+8192]; +ld.shared.u32 r2550, [r2984+9216]; +ld.shared.u32 r2404, [r2984+10240]; +ld.shared.u32 r2600, [r2984+11264]; +ld.shared.u32 r2366, [r2984+12288]; +ld.shared.u32 r2562, [r2984+13312]; +ld.shared.u32 r2416, 
[r2984+14336]; +ld.shared.u32 r2612, [r2984+15360]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, 
r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 
r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, 
f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; 
+} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 %0, r2497, r2693; +} +{ +add.f16x2 %1, r2500, r2696; +} +{ +sub.f16x2 %16, r2497, r2693; +} +{ +sub.f16x2 %17, r2500, r2696; +} +{ +add.f16x2 %2, r2509, r2777; +} +{ +add.f16x2 %3, r2512, r2783; +} +{ +sub.f16x2 %18, r2509, r2777; +} +{ +sub.f16x2 %19, r2512, r2783; +} +{ +add.f16x2 %4, r2521, r2793; +} +{ +add.f16x2 %5, r2524, r2799; +} +{ +sub.f16x2 %20, r2521, r2793; +} +{ +sub.f16x2 %21, r2524, r2799; +} +{ +add.f16x2 %6, r2533, r2809; +} +{ +add.f16x2 %7, r2536, r2815; +} +{ +sub.f16x2 %22, r2533, r2809; +} +{ +sub.f16x2 %23, r2536, r2815; +} +{ +add.f16x2 %8, r2503, r2819; +} +{ +add.f16x2 %9, r2506, r2699; +} +{ +sub.f16x2 %24, r2503, r2819; +} +{ +sub.f16x2 %25, r2506, r2699; +} +{ +add.f16x2 %10, r2515, r2827; +} +{ +add.f16x2 %11, r2518, r2833; +} +{ +sub.f16x2 %26, r2515, r2827; +} +{ +sub.f16x2 %27, r2518, r2833; +} +{ +add.f16x2 %12, r2527, r2843; +} +{ +add.f16x2 %13, r2530, r2849; +} +{ +sub.f16x2 %28, r2527, r2843; +} +{ +sub.f16x2 %29, r2530, r2849; +} +{ +add.f16x2 %14, r2539, r2859; +} +{ +add.f16x2 %15, r2542, r2865; +} +{ +sub.f16x2 %30, r2539, r2859; +} +{ +sub.f16x2 %31, r2542, r2865; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), 
"=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1051, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<176>; +.reg .b32 r<1598>; +.reg .b64 rd<2>; +mov.u32 r1571, %tid.y; +shl.b32 r1572, r1571, 14; +mov.u32 r1573, %16; +add.s32 r1574, r1573, r1572; +mov.u32 r1575, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ 
+neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f150, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f148, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ 
+sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1576, r1575, 511; +shl.b32 r1577, r1575, 5; +and.b32 r1578, r1577, -16384; +add.s32 r1579, r1574, r1578; +cvt.rn.f32.u32 f167, r1576; +mul.f32 f168, f167, 0f3AC90FDB; +cos.approx.f32 f29, f168; +sin.approx.f32 f169, f168; +neg.f32 f30, f169; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, 
r422, r436; +} +barrier.sync 0; +and.b32 r1580, r1577, 16352; +add.s32 r1581, r1579, r1580; +st.shared.v4.f32 [r1581], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1581+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r1582, r1576, -28, r1581; +ld.shared.u32 r460, [r1582]; +ld.shared.u32 r510, [r1582+2048]; +ld.shared.u32 r472, [r1582+4096]; +ld.shared.u32 r522, [r1582+6144]; +ld.shared.u32 r461, [r1582+8192]; +ld.shared.u32 r511, [r1582+10240]; +ld.shared.u32 r473, [r1582+12288]; +ld.shared.u32 r523, [r1582+14336]; +barrier.sync 0; +st.shared.v4.f32 [r1581], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1581+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1582]; +ld.shared.u32 r513, [r1582+2048]; +ld.shared.u32 r475, [r1582+4096]; +ld.shared.u32 r525, [r1582+6144]; +ld.shared.u32 r464, [r1582+8192]; +ld.shared.u32 r514, [r1582+10240]; +ld.shared.u32 r476, [r1582+12288]; +ld.shared.u32 r526, [r1582+14336]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ 
+add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1583, r1575, 504; +bfe.u32 r1584, r1575, 3, 6; +shl.b32 r1585, r1575, 2; +and.b32 r1586, r1585, 28; +add.s32 r1587, r1579, r1586; +cvt.rn.f32.u32 f170, r1584; +mul.f32 f171, f170, 0f3C490FDB; +cos.approx.f32 f75, f171; +sin.approx.f32 f172, f171; +neg.f32 f76, f172; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; 
+mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1588, r1577, 16128; +add.s32 r1589, r1587, r1588; +st.shared.u32 [r1589], r607; +st.shared.u32 [r1589+32], r665; +st.shared.u32 [r1589+64], r702; +st.shared.u32 [r1589+96], r739; +st.shared.u32 [r1589+128], r776; +st.shared.u32 [r1589+160], r813; +st.shared.u32 [r1589+192], r850; +st.shared.u32 [r1589+224], r887; +barrier.sync 0; +mad.lo.s32 r1590, r1583, -28, r1589; +ld.shared.u32 r918, [r1590]; +ld.shared.u32 r968, [r1590+2048]; +ld.shared.u32 r930, [r1590+4096]; +ld.shared.u32 r980, [r1590+6144]; +ld.shared.u32 r919, [r1590+8192]; +ld.shared.u32 r969, [r1590+10240]; +ld.shared.u32 r931, [r1590+12288]; +ld.shared.u32 r981, [r1590+14336]; +barrier.sync 0; +st.shared.u32 [r1589], r610; 
+st.shared.u32 [r1589+32], r674; +st.shared.u32 [r1589+64], r711; +st.shared.u32 [r1589+96], r748; +st.shared.u32 [r1589+128], r785; +st.shared.u32 [r1589+160], r822; +st.shared.u32 [r1589+192], r859; +st.shared.u32 [r1589+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1590]; +ld.shared.u32 r971, [r1590+2048]; +ld.shared.u32 r933, [r1590+4096]; +ld.shared.u32 r983, [r1590+6144]; +ld.shared.u32 r922, [r1590+8192]; +ld.shared.u32 r972, [r1590+10240]; +ld.shared.u32 r934, [r1590+12288]; +ld.shared.u32 r984, [r1590+14336]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; 
+cvt.rn.f16.f32 high, f148; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1591, r1575, 448; +bfe.u32 r1592, r1575, 6, 3; +and.b32 r1593, r1585, 252; +add.s32 r1594, r1579, r1593; +cvt.rn.f32.u32 f173, r1592; +mul.f32 f174, f173, 0f3DC90FDB; +cos.approx.f32 f121, f174; +sin.approx.f32 f175, f174; +neg.f32 f122, f175; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ 
+mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +barrier.sync 0; +and.b32 r1595, r1577, 14336; +add.s32 r1596, r1594, r1595; +st.shared.u32 [r1596], r1065; +st.shared.u32 [r1596+256], r1123; +st.shared.u32 [r1596+512], r1160; +st.shared.u32 [r1596+768], r1197; +st.shared.u32 [r1596+1024], r1234; +st.shared.u32 [r1596+1280], r1271; +st.shared.u32 [r1596+1536], r1308; +st.shared.u32 [r1596+1792], r1345; +barrier.sync 0; +mad.lo.s32 r1597, r1591, -28, r1596; +ld.shared.u32 r1376, [r1597]; +ld.shared.u32 r1426, [r1597+2048]; +ld.shared.u32 r1388, [r1597+4096]; +ld.shared.u32 r1438, [r1597+6144]; +ld.shared.u32 r1377, [r1597+8192]; +ld.shared.u32 r1427, [r1597+10240]; +ld.shared.u32 r1389, [r1597+12288]; +ld.shared.u32 r1439, [r1597+14336]; +barrier.sync 0; +st.shared.u32 [r1596], r1068; +st.shared.u32 [r1596+256], r1132; +st.shared.u32 [r1596+512], r1169; +st.shared.u32 [r1596+768], 
r1206; +st.shared.u32 [r1596+1024], r1243; +st.shared.u32 [r1596+1280], r1280; +st.shared.u32 [r1596+1536], r1317; +st.shared.u32 [r1596+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1597]; +ld.shared.u32 r1429, [r1597+2048]; +ld.shared.u32 r1391, [r1597+4096]; +ld.shared.u32 r1441, [r1597+6144]; +ld.shared.u32 r1380, [r1597+8192]; +ld.shared.u32 r1430, [r1597+10240]; +ld.shared.u32 r1392, [r1597+12288]; +ld.shared.u32 r1442, [r1597+14336]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1399; +} +{ +add.f16x2 r1416, r1384, r1393; +} +{ +sub.f16x2 r1419, r1381, r1399; +} +{ +sub.f16x2 r1422, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1449; +} +{ +add.f16x2 r1466, r1434, r1443; +} +{ +sub.f16x2 r1469, r1431, r1449; +} +{ +sub.f16x2 r1472, r1434, r1443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1460; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 %0, r1401, r1451; +} +{ +add.f16x2 %1, r1404, r1454; +} +{ +sub.f16x2 %8, r1401, r1451; +} +{ +sub.f16x2 %9, r1404, r1454; +} +{ +add.f16x2 %2, r1413, r1495; +} +{ +add.f16x2 %3, r1416, r1501; +} +{ +sub.f16x2 %10, r1413, r1495; +} +{ +sub.f16x2 %11, r1416, r1501; +} +{ +add.f16x2 %4, r1407, r1505; +} +{ +add.f16x2 %5, r1410, r1457; +} +{ +sub.f16x2 %12, r1407, r1505; +} +{ +sub.f16x2 %13, r1410, r1457; +} +{ +add.f16x2 %6, r1419, r1513; +} +{ +add.f16x2 %7, r1422, r1519; +} +{ +sub.f16x2 %14, r1419, r1513; +} +{ +sub.f16x2 %15, r1422, r1519; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), 
"r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1052, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<423>; +.reg .b32 r<2985>; +.reg .b64 rd<2>; +mov.u32 r2965, %tid.y; +shl.b32 r2966, r2965, 15; +mov.u32 r2967, %32; +add.s32 r2968, r2967, r2966; +mov.u32 r2969, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ 
+add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ 
+mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2970, r2969, 255; +shl.b32 r2971, r2969, 7; +and.b32 r2972, r2971, -32768; +add.s32 r2973, r2968, r2972; +cvt.rn.f32.u32 f417, r2970; +mul.f32 f418, f417, 0f3AC90FDB; +cos.approx.f32 f117, f418; +sin.approx.f32 f419, f418; +neg.f32 f118, f419; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ 
+mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; 
+cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; 
+mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, 
r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2974, r2971, 32640; +add.s32 r2975, r2973, r2974; +st.shared.v4.f32 
[r2975], {r521, r524, r627, r636}; +st.shared.v4.f32 [r2975+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r2975+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r2975+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r2975+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r2975+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r2975+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r2975+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r2976, r2970, -120, r2975; +ld.shared.u32 r1176, [r2976]; +ld.shared.u32 r1179, [r2976+4]; +ld.shared.u32 r1372, [r2976+2048]; +ld.shared.u32 r1375, [r2976+2052]; +ld.shared.u32 r1226, [r2976+4096]; +ld.shared.u32 r1229, [r2976+4100]; +ld.shared.u32 r1422, [r2976+6144]; +ld.shared.u32 r1425, [r2976+6148]; +ld.shared.u32 r1188, [r2976+8192]; +ld.shared.u32 r1191, [r2976+8196]; +ld.shared.u32 r1384, [r2976+10240]; +ld.shared.u32 r1387, [r2976+10244]; +ld.shared.u32 r1238, [r2976+12288]; +ld.shared.u32 r1241, [r2976+12292]; +ld.shared.u32 r1434, [r2976+14336]; +ld.shared.u32 r1437, [r2976+14340]; +ld.shared.u32 r1177, [r2976+16384]; +ld.shared.u32 r1180, [r2976+16388]; +ld.shared.u32 r1373, [r2976+18432]; +ld.shared.u32 r1376, [r2976+18436]; +ld.shared.u32 r1227, [r2976+20480]; +ld.shared.u32 r1230, [r2976+20484]; +ld.shared.u32 r1423, [r2976+22528]; +ld.shared.u32 r1426, [r2976+22532]; +ld.shared.u32 r1189, [r2976+24576]; +ld.shared.u32 r1192, [r2976+24580]; +ld.shared.u32 r1385, [r2976+26624]; +ld.shared.u32 r1388, [r2976+26628]; +ld.shared.u32 r1239, [r2976+28672]; +ld.shared.u32 r1242, [r2976+28676]; +ld.shared.u32 r1435, [r2976+30720]; +ld.shared.u32 r1438, [r2976+30724]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ 
+add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, 
r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 
high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} 
+{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2977, r2969, 240; +bfe.u32 r2978, r2969, 4, 4; +cvt.rn.f32.u32 f420, r2978; +mul.f32 f421, f420, 0f3CC90FDB; +cos.approx.f32 f267, f421; +sin.approx.f32 f422, f421; +neg.f32 f268, f422; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ 
+mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 
r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; 
+cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, 
r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, 
r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r2979, r2969, 3; +and.b32 r2980, r2979, 120; +add.s32 r2981, r2973, r2980; +barrier.sync 0; +and.b32 r2982, r2971, 30720; 
+add.s32 r2983, r2981, r2982; +st.shared.u32 [r2983], r1695; +st.shared.u32 [r2983+4], r1698; +st.shared.u32 [r2983+128], r1801; +st.shared.u32 [r2983+132], r1810; +st.shared.u32 [r2983+256], r1838; +st.shared.u32 [r2983+260], r1847; +st.shared.u32 [r2983+384], r1875; +st.shared.u32 [r2983+388], r1884; +st.shared.u32 [r2983+512], r1912; +st.shared.u32 [r2983+516], r1921; +st.shared.u32 [r2983+640], r1949; +st.shared.u32 [r2983+644], r1958; +st.shared.u32 [r2983+768], r1986; +st.shared.u32 [r2983+772], r1995; +st.shared.u32 [r2983+896], r2023; +st.shared.u32 [r2983+900], r2032; +st.shared.u32 [r2983+1024], r2060; +st.shared.u32 [r2983+1028], r2069; +st.shared.u32 [r2983+1152], r2097; +st.shared.u32 [r2983+1156], r2106; +st.shared.u32 [r2983+1280], r2134; +st.shared.u32 [r2983+1284], r2143; +st.shared.u32 [r2983+1408], r2171; +st.shared.u32 [r2983+1412], r2180; +st.shared.u32 [r2983+1536], r2208; +st.shared.u32 [r2983+1540], r2217; +st.shared.u32 [r2983+1664], r2245; +st.shared.u32 [r2983+1668], r2254; +st.shared.u32 [r2983+1792], r2282; +st.shared.u32 [r2983+1796], r2291; +st.shared.u32 [r2983+1920], r2319; +st.shared.u32 [r2983+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2984, r2977, -120, r2983; +ld.shared.u32 r2350, [r2984]; +ld.shared.u32 r2353, [r2984+4]; +ld.shared.u32 r2546, [r2984+2048]; +ld.shared.u32 r2549, [r2984+2052]; +ld.shared.u32 r2400, [r2984+4096]; +ld.shared.u32 r2403, [r2984+4100]; +ld.shared.u32 r2596, [r2984+6144]; +ld.shared.u32 r2599, [r2984+6148]; +ld.shared.u32 r2362, [r2984+8192]; +ld.shared.u32 r2365, [r2984+8196]; +ld.shared.u32 r2558, [r2984+10240]; +ld.shared.u32 r2561, [r2984+10244]; +ld.shared.u32 r2412, [r2984+12288]; +ld.shared.u32 r2415, [r2984+12292]; +ld.shared.u32 r2608, [r2984+14336]; +ld.shared.u32 r2611, [r2984+14340]; +ld.shared.u32 r2351, [r2984+16384]; +ld.shared.u32 r2354, [r2984+16388]; +ld.shared.u32 r2547, [r2984+18432]; +ld.shared.u32 r2550, [r2984+18436]; +ld.shared.u32 r2401, [r2984+20480]; +ld.shared.u32 r2404, 
[r2984+20484]; +ld.shared.u32 r2597, [r2984+22528]; +ld.shared.u32 r2600, [r2984+22532]; +ld.shared.u32 r2363, [r2984+24576]; +ld.shared.u32 r2366, [r2984+24580]; +ld.shared.u32 r2559, [r2984+26624]; +ld.shared.u32 r2562, [r2984+26628]; +ld.shared.u32 r2413, [r2984+28672]; +ld.shared.u32 r2416, [r2984+28676]; +ld.shared.u32 r2609, [r2984+30720]; +ld.shared.u32 r2612, [r2984+30724]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 
r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, 
f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, 
r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 %0, r2497, r2693; +} +{ +add.f16x2 %1, r2500, r2696; +} +{ +sub.f16x2 %16, r2497, r2693; +} +{ +sub.f16x2 %17, r2500, r2696; +} +{ +add.f16x2 %2, r2509, r2777; +} +{ +add.f16x2 %3, r2512, r2783; +} +{ +sub.f16x2 %18, r2509, r2777; +} +{ +sub.f16x2 %19, r2512, r2783; +} +{ +add.f16x2 %4, r2521, r2793; +} +{ +add.f16x2 %5, r2524, r2799; +} +{ +sub.f16x2 %20, r2521, r2793; +} +{ +sub.f16x2 %21, r2524, r2799; +} +{ +add.f16x2 %6, r2533, r2809; +} +{ +add.f16x2 %7, r2536, r2815; +} +{ +sub.f16x2 %22, r2533, r2809; +} +{ +sub.f16x2 %23, r2536, r2815; +} +{ +add.f16x2 %8, r2503, r2819; +} +{ +add.f16x2 %9, r2506, r2699; +} +{ +sub.f16x2 %24, r2503, r2819; +} +{ +sub.f16x2 %25, r2506, r2699; +} +{ +add.f16x2 %10, r2515, r2827; +} +{ +add.f16x2 %11, r2518, r2833; +} +{ +sub.f16x2 %26, r2515, r2827; +} +{ +sub.f16x2 %27, r2518, r2833; +} +{ +add.f16x2 %12, r2527, r2843; +} +{ +add.f16x2 %13, r2530, r2849; +} +{ +sub.f16x2 %28, r2527, r2843; +} +{ +sub.f16x2 %29, r2530, r2849; +} +{ +add.f16x2 %14, r2539, r2859; +} +{ +add.f16x2 %15, r2542, r2865; +} +{ +sub.f16x2 %30, r2539, r2859; +} +{ +sub.f16x2 %31, r2542, r2865; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), 
"=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1053, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6211>; +.reg .b64 rd<3>; +mov.u32 r6125, %tid.y; +shl.b32 
r6126, r6125, 15; +mov.u32 r6127, %64; +add.s32 r6128, r6127, r6126; +mov.u32 r6129, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %121, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %121, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ 
+mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, 
f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ 
+sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %122, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %122, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} 
+{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 
r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, 
r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, 
r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 
r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, 
f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, 
r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, 
r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, 
r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6131, r6129, 8; +and.b32 r6132, r6131, -32768; +add.s32 r6133, r6128, r6132; +and.b32 r6146, r6129, 127; +cvt.rn.f32.u32 f845, r6146; +mul.f32 f846, f845, 0f3AC90FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ 
+mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; 
+mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 
r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ 
+fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ 
+fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, 
{low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6134, r6131, 32512; +add.s32 r6135, r6133, r6134; +st.shared.v4.f32 [r6135], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r6135+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r6135+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r6135+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r6135+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r6135+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r6135+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r6135+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r6135+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r6135+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r6135+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r6135+176], {r2500, r2509, 
r2537, r2546}; +st.shared.v4.f32 [r6135+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r6135+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r6135+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r6135+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r6136, r6146, -248, r6135; +ld.shared.u32 r2864, [r6136]; +ld.shared.u32 r2867, [r6136+4]; +ld.shared.u32 r3480, [r6136+1024]; +ld.shared.u32 r3483, [r6136+1028]; +ld.shared.u32 r3060, [r6136+2048]; +ld.shared.u32 r3063, [r6136+2052]; +ld.shared.u32 r3676, [r6136+3072]; +ld.shared.u32 r3679, [r6136+3076]; +ld.shared.u32 r2914, [r6136+4096]; +ld.shared.u32 r2917, [r6136+4100]; +ld.shared.u32 r3530, [r6136+5120]; +ld.shared.u32 r3533, [r6136+5124]; +ld.shared.u32 r3110, [r6136+6144]; +ld.shared.u32 r3113, [r6136+6148]; +ld.shared.u32 r3726, [r6136+7168]; +ld.shared.u32 r3729, [r6136+7172]; +ld.shared.u32 r2876, [r6136+8192]; +ld.shared.u32 r2879, [r6136+8196]; +ld.shared.u32 r3492, [r6136+9216]; +ld.shared.u32 r3495, [r6136+9220]; +ld.shared.u32 r3072, [r6136+10240]; +ld.shared.u32 r3075, [r6136+10244]; +ld.shared.u32 r3688, [r6136+11264]; +ld.shared.u32 r3691, [r6136+11268]; +ld.shared.u32 r2926, [r6136+12288]; +ld.shared.u32 r2929, [r6136+12292]; +ld.shared.u32 r3542, [r6136+13312]; +ld.shared.u32 r3545, [r6136+13316]; +ld.shared.u32 r3122, [r6136+14336]; +ld.shared.u32 r3125, [r6136+14340]; +ld.shared.u32 r3738, [r6136+15360]; +ld.shared.u32 r3741, [r6136+15364]; +ld.shared.u32 r2865, [r6136+16384]; +ld.shared.u32 r2868, [r6136+16388]; +ld.shared.u32 r3481, [r6136+17408]; +ld.shared.u32 r3484, [r6136+17412]; +ld.shared.u32 r3061, [r6136+18432]; +ld.shared.u32 r3064, [r6136+18436]; +ld.shared.u32 r3677, [r6136+19456]; +ld.shared.u32 r3680, [r6136+19460]; +ld.shared.u32 r2915, [r6136+20480]; +ld.shared.u32 r2918, [r6136+20484]; +ld.shared.u32 r3531, [r6136+21504]; +ld.shared.u32 r3534, [r6136+21508]; +ld.shared.u32 r3111, [r6136+22528]; +ld.shared.u32 r3114, [r6136+22532]; 
+ld.shared.u32 r3727, [r6136+23552]; +ld.shared.u32 r3730, [r6136+23556]; +ld.shared.u32 r2877, [r6136+24576]; +ld.shared.u32 r2880, [r6136+24580]; +ld.shared.u32 r3493, [r6136+25600]; +ld.shared.u32 r3496, [r6136+25604]; +ld.shared.u32 r3073, [r6136+26624]; +ld.shared.u32 r3076, [r6136+26628]; +ld.shared.u32 r3689, [r6136+27648]; +ld.shared.u32 r3692, [r6136+27652]; +ld.shared.u32 r2927, [r6136+28672]; +ld.shared.u32 r2930, [r6136+28676]; +ld.shared.u32 r3543, [r6136+29696]; +ld.shared.u32 r3546, [r6136+29700]; +ld.shared.u32 r3123, [r6136+30720]; +ld.shared.u32 r3126, [r6136+30724]; +ld.shared.u32 r3739, [r6136+31744]; +ld.shared.u32 r3742, [r6136+31748]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; 
+cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} 
+{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, 
r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ 
+mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, 
r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, 
r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 
r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} 
+{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; 
+mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ 
+sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, 
r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6138, r6129, 5, 2; +cvt.rn.f32.u32 f848, r6138; +mul.f32 f849, f848, 0f3D490FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +and.b32 r6145, r6129, 96; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, 
r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 
r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, 
r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; 
+mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, 
r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 
r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 
r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ 
+mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +shl.b32 r6139, r6129, 3; +and.b32 r6140, r6139, 248; +add.s32 r6141, r6133, r6140; +barrier.sync 0; +and.b32 r6142, r6131, 24576; +add.s32 r6143, r6141, r6142; +st.shared.u32 [r6143], r4383; +st.shared.u32 [r6143+4], r4386; +st.shared.u32 [r6143+256], r4585; +st.shared.u32 [r6143+260], r4594; +st.shared.u32 [r6143+512], r4622; +st.shared.u32 [r6143+516], r4631; +st.shared.u32 [r6143+768], r4659; +st.shared.u32 [r6143+772], r4668; +st.shared.u32 [r6143+1024], r4696; +st.shared.u32 [r6143+1028], r4705; +st.shared.u32 [r6143+1280], r4733; +st.shared.u32 [r6143+1284], r4742; +st.shared.u32 [r6143+1536], r4770; +st.shared.u32 [r6143+1540], r4779; +st.shared.u32 [r6143+1792], r4807; +st.shared.u32 [r6143+1796], r4816; +st.shared.u32 [r6143+2048], r4844; +st.shared.u32 [r6143+2052], r4853; +st.shared.u32 [r6143+2304], r4881; +st.shared.u32 [r6143+2308], r4890; +st.shared.u32 [r6143+2560], r4918; +st.shared.u32 [r6143+2564], r4927; +st.shared.u32 [r6143+2816], r4955; +st.shared.u32 [r6143+2820], r4964; +st.shared.u32 [r6143+3072], r4992; +st.shared.u32 [r6143+3076], r5001; +st.shared.u32 [r6143+3328], r5029; +st.shared.u32 [r6143+3332], r5038; +st.shared.u32 [r6143+3584], r5066; +st.shared.u32 [r6143+3588], r5075; +st.shared.u32 [r6143+3840], r5103; +st.shared.u32 [r6143+3844], r5112; +st.shared.u32 [r6143+4096], r5140; +st.shared.u32 [r6143+4100], r5149; +st.shared.u32 [r6143+4352], r5177; +st.shared.u32 [r6143+4356], r5186; 
+st.shared.u32 [r6143+4608], r5214; +st.shared.u32 [r6143+4612], r5223; +st.shared.u32 [r6143+4864], r5251; +st.shared.u32 [r6143+4868], r5260; +st.shared.u32 [r6143+5120], r5288; +st.shared.u32 [r6143+5124], r5297; +st.shared.u32 [r6143+5376], r5325; +st.shared.u32 [r6143+5380], r5334; +st.shared.u32 [r6143+5632], r5362; +st.shared.u32 [r6143+5636], r5371; +st.shared.u32 [r6143+5888], r5399; +st.shared.u32 [r6143+5892], r5408; +st.shared.u32 [r6143+6144], r5436; +st.shared.u32 [r6143+6148], r5445; +st.shared.u32 [r6143+6400], r5473; +st.shared.u32 [r6143+6404], r5482; +st.shared.u32 [r6143+6656], r5510; +st.shared.u32 [r6143+6660], r5519; +st.shared.u32 [r6143+6912], r5547; +st.shared.u32 [r6143+6916], r5556; +st.shared.u32 [r6143+7168], r5584; +st.shared.u32 [r6143+7172], r5593; +st.shared.u32 [r6143+7424], r5621; +st.shared.u32 [r6143+7428], r5630; +st.shared.u32 [r6143+7680], r5658; +st.shared.u32 [r6143+7684], r5667; +st.shared.u32 [r6143+7936], r5695; +st.shared.u32 [r6143+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6144, r6145, -248, r6143; +ld.shared.u32 r5726, [r6144]; +ld.shared.u32 r5729, [r6144+4]; +ld.shared.u32 r5776, [r6144+1024]; +ld.shared.u32 r5779, [r6144+1028]; +ld.shared.u32 r5826, [r6144+2048]; +ld.shared.u32 r5829, [r6144+2052]; +ld.shared.u32 r5876, [r6144+3072]; +ld.shared.u32 r5879, [r6144+3076]; +ld.shared.u32 r5926, [r6144+4096]; +ld.shared.u32 r5929, [r6144+4100]; +ld.shared.u32 r5976, [r6144+5120]; +ld.shared.u32 r5979, [r6144+5124]; +ld.shared.u32 r6026, [r6144+6144]; +ld.shared.u32 r6029, [r6144+6148]; +ld.shared.u32 r6076, [r6144+7168]; +ld.shared.u32 r6079, [r6144+7172]; +ld.shared.u32 r5738, [r6144+8192]; +ld.shared.u32 r5741, [r6144+8196]; +ld.shared.u32 r5788, [r6144+9216]; +ld.shared.u32 r5791, [r6144+9220]; +ld.shared.u32 r5838, [r6144+10240]; +ld.shared.u32 r5841, [r6144+10244]; +ld.shared.u32 r5888, [r6144+11264]; +ld.shared.u32 r5891, [r6144+11268]; +ld.shared.u32 r5938, [r6144+12288]; +ld.shared.u32 r5941, [r6144+12292]; 
+ld.shared.u32 r5988, [r6144+13312]; +ld.shared.u32 r5991, [r6144+13316]; +ld.shared.u32 r6038, [r6144+14336]; +ld.shared.u32 r6041, [r6144+14340]; +ld.shared.u32 r6088, [r6144+15360]; +ld.shared.u32 r6091, [r6144+15364]; +ld.shared.u32 r5727, [r6144+16384]; +ld.shared.u32 r5730, [r6144+16388]; +ld.shared.u32 r5777, [r6144+17408]; +ld.shared.u32 r5780, [r6144+17412]; +ld.shared.u32 r5827, [r6144+18432]; +ld.shared.u32 r5830, [r6144+18436]; +ld.shared.u32 r5877, [r6144+19456]; +ld.shared.u32 r5880, [r6144+19460]; +ld.shared.u32 r5927, [r6144+20480]; +ld.shared.u32 r5930, [r6144+20484]; +ld.shared.u32 r5977, [r6144+21504]; +ld.shared.u32 r5980, [r6144+21508]; +ld.shared.u32 r6027, [r6144+22528]; +ld.shared.u32 r6030, [r6144+22532]; +ld.shared.u32 r6077, [r6144+23552]; +ld.shared.u32 r6080, [r6144+23556]; +ld.shared.u32 r5739, [r6144+24576]; +ld.shared.u32 r5742, [r6144+24580]; +ld.shared.u32 r5789, [r6144+25600]; +ld.shared.u32 r5792, [r6144+25604]; +ld.shared.u32 r5839, [r6144+26624]; +ld.shared.u32 r5842, [r6144+26628]; +ld.shared.u32 r5889, [r6144+27648]; +ld.shared.u32 r5892, [r6144+27652]; +ld.shared.u32 r5939, [r6144+28672]; +ld.shared.u32 r5942, [r6144+28676]; +ld.shared.u32 r5989, [r6144+29696]; +ld.shared.u32 r5992, [r6144+29700]; +ld.shared.u32 r6039, [r6144+30720]; +ld.shared.u32 r6042, [r6144+30724]; +ld.shared.u32 r6089, [r6144+31744]; +ld.shared.u32 r6092, [r6144+31748]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 %0, r5725, r5737; +} +{ +add.f16x2 %1, r5728, r5740; +} +{ +sub.f16x2 %32, r5725, r5737; +} +{ +sub.f16x2 %33, r5728, r5740; +} +{ +add.f16x2 %16, r5731, r5749; +} +{ +add.f16x2 %17, r5734, r5743; +} +{ +sub.f16x2 %48, r5731, r5749; +} +{ 
+sub.f16x2 %49, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 %2, r5775, r5787; +} +{ +add.f16x2 %3, r5778, r5790; +} +{ +sub.f16x2 %34, r5775, r5787; +} +{ +sub.f16x2 %35, r5778, r5790; +} +{ +add.f16x2 %18, r5781, r5799; +} +{ +add.f16x2 %19, r5784, r5793; +} +{ +sub.f16x2 %50, r5781, r5799; +} +{ +sub.f16x2 %51, r5784, r5793; +} +{ +add.f16x2 r5825, r5826, r5827; +} +{ +add.f16x2 r5828, r5829, r5830; +} +{ +sub.f16x2 r5831, r5826, r5827; +} +{ +sub.f16x2 r5834, r5829, r5830; +} +{ +add.f16x2 r5837, r5838, r5839; +} +{ +add.f16x2 r5840, r5841, r5842; +} +{ +sub.f16x2 r5843, r5838, r5839; +} +{ +sub.f16x2 r5846, r5841, r5842; +} +{ +neg.f16x2 r5849, r5846; +} +{ +add.f16x2 %4, r5825, r5837; +} +{ +add.f16x2 %5, r5828, r5840; +} +{ +sub.f16x2 %36, r5825, r5837; +} +{ +sub.f16x2 %37, r5828, r5840; +} +{ +add.f16x2 %20, r5831, r5849; +} +{ +add.f16x2 %21, r5834, r5843; +} +{ +sub.f16x2 %52, r5831, r5849; +} +{ +sub.f16x2 %53, r5834, r5843; +} +{ +add.f16x2 r5875, r5876, r5877; +} +{ +add.f16x2 r5878, r5879, r5880; +} +{ +sub.f16x2 r5881, r5876, r5877; +} +{ +sub.f16x2 r5884, r5879, r5880; +} +{ +add.f16x2 r5887, r5888, r5889; +} +{ +add.f16x2 r5890, r5891, r5892; +} +{ +sub.f16x2 r5893, r5888, r5889; +} +{ +sub.f16x2 r5896, r5891, r5892; +} +{ +neg.f16x2 r5899, r5896; +} +{ +add.f16x2 %6, r5875, r5887; +} +{ +add.f16x2 %7, r5878, r5890; +} +{ +sub.f16x2 %38, r5875, r5887; +} +{ +sub.f16x2 %39, r5878, r5890; +} +{ +add.f16x2 %22, r5881, r5899; +} +{ +add.f16x2 %23, r5884, r5893; +} +{ +sub.f16x2 %54, r5881, r5899; +} +{ +sub.f16x2 %55, r5884, r5893; +} +{ +add.f16x2 r5925, r5926, r5927; +} +{ +add.f16x2 r5928, r5929, r5930; +} +{ +sub.f16x2 r5931, r5926, 
r5927; +} +{ +sub.f16x2 r5934, r5929, r5930; +} +{ +add.f16x2 r5937, r5938, r5939; +} +{ +add.f16x2 r5940, r5941, r5942; +} +{ +sub.f16x2 r5943, r5938, r5939; +} +{ +sub.f16x2 r5946, r5941, r5942; +} +{ +neg.f16x2 r5949, r5946; +} +{ +add.f16x2 %8, r5925, r5937; +} +{ +add.f16x2 %9, r5928, r5940; +} +{ +sub.f16x2 %40, r5925, r5937; +} +{ +sub.f16x2 %41, r5928, r5940; +} +{ +add.f16x2 %24, r5931, r5949; +} +{ +add.f16x2 %25, r5934, r5943; +} +{ +sub.f16x2 %56, r5931, r5949; +} +{ +sub.f16x2 %57, r5934, r5943; +} +{ +add.f16x2 r5975, r5976, r5977; +} +{ +add.f16x2 r5978, r5979, r5980; +} +{ +sub.f16x2 r5981, r5976, r5977; +} +{ +sub.f16x2 r5984, r5979, r5980; +} +{ +add.f16x2 r5987, r5988, r5989; +} +{ +add.f16x2 r5990, r5991, r5992; +} +{ +sub.f16x2 r5993, r5988, r5989; +} +{ +sub.f16x2 r5996, r5991, r5992; +} +{ +neg.f16x2 r5999, r5996; +} +{ +add.f16x2 %10, r5975, r5987; +} +{ +add.f16x2 %11, r5978, r5990; +} +{ +sub.f16x2 %42, r5975, r5987; +} +{ +sub.f16x2 %43, r5978, r5990; +} +{ +add.f16x2 %26, r5981, r5999; +} +{ +add.f16x2 %27, r5984, r5993; +} +{ +sub.f16x2 %58, r5981, r5999; +} +{ +sub.f16x2 %59, r5984, r5993; +} +{ +add.f16x2 r6025, r6026, r6027; +} +{ +add.f16x2 r6028, r6029, r6030; +} +{ +sub.f16x2 r6031, r6026, r6027; +} +{ +sub.f16x2 r6034, r6029, r6030; +} +{ +add.f16x2 r6037, r6038, r6039; +} +{ +add.f16x2 r6040, r6041, r6042; +} +{ +sub.f16x2 r6043, r6038, r6039; +} +{ +sub.f16x2 r6046, r6041, r6042; +} +{ +neg.f16x2 r6049, r6046; +} +{ +add.f16x2 %12, r6025, r6037; +} +{ +add.f16x2 %13, r6028, r6040; +} +{ +sub.f16x2 %44, r6025, r6037; +} +{ +sub.f16x2 %45, r6028, r6040; +} +{ +add.f16x2 %28, r6031, r6049; +} +{ +add.f16x2 %29, r6034, r6043; +} +{ +sub.f16x2 %60, r6031, r6049; +} +{ +sub.f16x2 %61, r6034, r6043; +} +{ +add.f16x2 r6075, r6076, r6077; +} +{ +add.f16x2 r6078, r6079, r6080; +} +{ +sub.f16x2 r6081, r6076, r6077; +} +{ +sub.f16x2 r6084, r6079, r6080; +} +{ +add.f16x2 r6087, r6088, r6089; +} +{ +add.f16x2 r6090, r6091, r6092; +} +{ 
+sub.f16x2 r6093, r6088, r6089; +} +{ +sub.f16x2 r6096, r6091, r6092; +} +{ +neg.f16x2 r6099, r6096; +} +{ +add.f16x2 %14, r6075, r6087; +} +{ +add.f16x2 %15, r6078, r6090; +} +{ +sub.f16x2 %46, r6075, r6087; +} +{ +sub.f16x2 %47, r6078, r6090; +} +{ +add.f16x2 %30, r6081, r6099; +} +{ +add.f16x2 %31, r6084, r6093; +} +{ +sub.f16x2 %62, r6081, r6099; +} +{ +sub.f16x2 %63, r6084, r6093; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), 
"=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), 
"r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1050, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<176>; +.reg .b32 r<1598>; +.reg .b64 rd<2>; +mov.u32 r1571, %tid.y; +shl.b32 r1572, r1571, 15; +mov.u32 r1573, %16; +add.s32 r1574, r1573, r1572; +mov.u32 r1575, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f150, 0f3F3504F3; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f136, 0f3F800000; +mov.f32 f148, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f135, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1576, r1575, 511; +shl.b32 r1577, r1575, 6; +and.b32 r1578, r1577, -32768; +add.s32 r1579, r1574, r1578; +cvt.rn.f32.u32 f167, r1576; +mul.f32 f168, f167, 0f3AC90FDB; +cos.approx.f32 f29, f168; +sin.approx.f32 f169, f168; +neg.f32 f30, f169; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, 
r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1580, r1577, 32704; +add.s32 r1581, r1579, r1580; +st.shared.v4.f32 [r1581], {r149, r152, r207, r216}; +st.shared.v4.f32 [r1581+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1581+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1581+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1582, r1576, -56, r1581; +ld.shared.u32 r460, [r1582]; +ld.shared.u32 r463, [r1582+4]; +ld.shared.u32 r510, [r1582+4096]; +ld.shared.u32 r513, [r1582+4100]; +ld.shared.u32 r472, [r1582+8192]; +ld.shared.u32 r475, [r1582+8196]; +ld.shared.u32 r522, [r1582+12288]; +ld.shared.u32 r525, [r1582+12292]; +ld.shared.u32 r461, [r1582+16384]; +ld.shared.u32 r464, [r1582+16388]; +ld.shared.u32 r511, [r1582+20480]; +ld.shared.u32 r514, [r1582+20484]; +ld.shared.u32 r473, [r1582+24576]; +ld.shared.u32 r476, 
[r1582+24580]; +ld.shared.u32 r523, [r1582+28672]; +ld.shared.u32 r526, [r1582+28676]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ 
+sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1583, r1575, 504; +bfe.u32 r1584, r1575, 3, 6; +cvt.rn.f32.u32 f170, r1584; +mul.f32 f171, f170, 0f3C490FDB; +cos.approx.f32 f75, f171; +sin.approx.f32 f172, f171; +neg.f32 f76, f172; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, 
r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1585, r1575, 3; +and.b32 r1586, r1585, 56; +add.s32 r1587, r1579, r1586; +barrier.sync 0; +and.b32 r1588, r1577, 32256; +add.s32 r1589, r1587, r1588; +st.shared.u32 [r1589], r607; +st.shared.u32 [r1589+4], r610; +st.shared.u32 [r1589+64], r665; +st.shared.u32 [r1589+68], r674; +st.shared.u32 [r1589+128], r702; +st.shared.u32 [r1589+132], r711; +st.shared.u32 [r1589+192], r739; +st.shared.u32 [r1589+196], r748; +st.shared.u32 [r1589+256], r776; +st.shared.u32 [r1589+260], r785; +st.shared.u32 [r1589+320], r813; +st.shared.u32 [r1589+324], r822; +st.shared.u32 [r1589+384], r850; +st.shared.u32 [r1589+388], r859; +st.shared.u32 [r1589+448], r887; +st.shared.u32 [r1589+452], r896; +barrier.sync 0; +mad.lo.s32 r1590, r1583, -56, r1589; +ld.shared.u32 r918, [r1590]; +ld.shared.u32 r921, [r1590+4]; +ld.shared.u32 r968, [r1590+4096]; +ld.shared.u32 r971, [r1590+4100]; +ld.shared.u32 r930, [r1590+8192]; +ld.shared.u32 r933, [r1590+8196]; +ld.shared.u32 r980, [r1590+12288]; +ld.shared.u32 r983, [r1590+12292]; +ld.shared.u32 r919, [r1590+16384]; +ld.shared.u32 r922, [r1590+16388]; +ld.shared.u32 r969, [r1590+20480]; +ld.shared.u32 r972, [r1590+20484]; +ld.shared.u32 r931, [r1590+24576]; +ld.shared.u32 r934, [r1590+24580]; +ld.shared.u32 r981, [r1590+28672]; +ld.shared.u32 r984, [r1590+28676]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ 
+add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, 
r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1591, r1575, 448; +bfe.u32 r1592, r1575, 6, 3; +cvt.rn.f32.u32 f173, r1592; +mul.f32 f174, f173, 0f3DC90FDB; +cos.approx.f32 f121, f174; +sin.approx.f32 f175, f174; +neg.f32 f122, f175; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; 
+mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f135; +cvt.rn.f16.f32 high, f136; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, 
high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +and.b32 r1593, r1585, 504; +add.s32 r1594, r1579, r1593; +barrier.sync 0; +and.b32 r1595, r1577, 28672; +add.s32 r1596, r1594, r1595; +st.shared.u32 [r1596], r1065; +st.shared.u32 [r1596+4], r1068; +st.shared.u32 [r1596+512], r1123; +st.shared.u32 [r1596+516], r1132; +st.shared.u32 [r1596+1024], r1160; +st.shared.u32 [r1596+1028], r1169; +st.shared.u32 [r1596+1536], r1197; +st.shared.u32 [r1596+1540], r1206; +st.shared.u32 [r1596+2048], r1234; +st.shared.u32 [r1596+2052], r1243; +st.shared.u32 [r1596+2560], r1271; +st.shared.u32 [r1596+2564], r1280; +st.shared.u32 [r1596+3072], r1308; +st.shared.u32 [r1596+3076], r1317; +st.shared.u32 [r1596+3584], r1345; +st.shared.u32 [r1596+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1597, r1591, -56, r1596; +ld.shared.u32 r1376, [r1597]; +ld.shared.u32 r1379, [r1597+4]; +ld.shared.u32 r1426, [r1597+4096]; +ld.shared.u32 r1429, [r1597+4100]; +ld.shared.u32 r1388, [r1597+8192]; +ld.shared.u32 r1391, [r1597+8196]; +ld.shared.u32 r1438, [r1597+12288]; +ld.shared.u32 r1441, [r1597+12292]; +ld.shared.u32 r1377, [r1597+16384]; +ld.shared.u32 r1380, [r1597+16388]; +ld.shared.u32 r1427, [r1597+20480]; +ld.shared.u32 r1430, [r1597+20484]; +ld.shared.u32 r1389, [r1597+24576]; +ld.shared.u32 r1392, [r1597+24580]; +ld.shared.u32 r1439, [r1597+28672]; +ld.shared.u32 r1442, [r1597+28676]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ 
+sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1399; +} +{ +add.f16x2 r1416, r1384, r1393; +} +{ +sub.f16x2 r1419, r1381, r1399; +} +{ +sub.f16x2 r1422, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1449; +} +{ +add.f16x2 r1466, r1434, r1443; +} +{ +sub.f16x2 r1469, r1431, r1449; +} +{ +sub.f16x2 r1472, r1434, r1443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1460; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 %0, r1401, r1451; +} +{ +add.f16x2 %1, r1404, r1454; +} +{ +sub.f16x2 %8, r1401, r1451; +} +{ +sub.f16x2 %9, r1404, r1454; +} +{ +add.f16x2 %2, r1413, r1495; +} +{ +add.f16x2 %3, r1416, r1501; +} +{ +sub.f16x2 %10, r1413, r1495; +} +{ +sub.f16x2 %11, r1416, r1501; +} +{ 
+add.f16x2 %4, r1407, r1505; +} +{ +add.f16x2 %5, r1410, r1457; +} +{ +sub.f16x2 %12, r1407, r1505; +} +{ +sub.f16x2 %13, r1410, r1457; +} +{ +add.f16x2 %6, r1419, r1513; +} +{ +add.f16x2 %7, r1422, r1519; +} +{ +sub.f16x2 %14, r1419, r1513; +} +{ +sub.f16x2 %15, r1422, r1519; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1054, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<853>; +.reg .b32 r<6210>; +.reg .b64 rd<3>; +mov.u32 r6125, %tid.y; +shl.b32 r6126, r6125, 14; +mov.u32 r6127, %64; +add.s32 r6128, r6127, r6126; +mov.u32 r6129, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ 
+add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %121, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %121, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f702, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r102, {low, high}; +} +mov.f32 f700, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 
r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, 
r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ 
+sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %122, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %122, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; 
+cvt.rn.f16.f32 high, f702; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; 
+} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; 
+mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, 
r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ 
+mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ 
+add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6131, r6129, 7; +and.b32 r6132, r6131, -16384; +add.s32 r6133, r6128, r6132; +and.b32 r6145, r6129, 127; 
+cvt.rn.f32.u32 f845, r6145; +mul.f32 f846, f845, 0f3AC90FDB; +cos.approx.f32 f357, f846; +sin.approx.f32 f847, f846; +neg.f32 f358, f847; +mov.f32 f852, 0f3F800000; +mov.f32 f851, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ 
+fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ 
+fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2295, 
{low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} 
+{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; 
+} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} 
+{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6134, r6131, 16256; +add.s32 r6135, r6133, r6134; +st.shared.v4.f32 [r6135], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r6135+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r6135+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r6135+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r6135+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r6135+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r6135+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r6135+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r6136, r6145, -124, r6135; +ld.shared.u32 r2864, [r6136]; +ld.shared.u32 r3480, [r6136+512]; +ld.shared.u32 r3060, [r6136+1024]; +ld.shared.u32 r3676, [r6136+1536]; +ld.shared.u32 r2914, [r6136+2048]; +ld.shared.u32 r3530, [r6136+2560]; +ld.shared.u32 r3110, [r6136+3072]; +ld.shared.u32 r3726, [r6136+3584]; +ld.shared.u32 r2876, [r6136+4096]; +ld.shared.u32 r3492, [r6136+4608]; +ld.shared.u32 r3072, [r6136+5120]; +ld.shared.u32 r3688, [r6136+5632]; +ld.shared.u32 r2926, [r6136+6144]; +ld.shared.u32 r3542, [r6136+6656]; +ld.shared.u32 r3122, [r6136+7168]; +ld.shared.u32 r3738, [r6136+7680]; +ld.shared.u32 r2865, [r6136+8192]; 
+ld.shared.u32 r3481, [r6136+8704]; +ld.shared.u32 r3061, [r6136+9216]; +ld.shared.u32 r3677, [r6136+9728]; +ld.shared.u32 r2915, [r6136+10240]; +ld.shared.u32 r3531, [r6136+10752]; +ld.shared.u32 r3111, [r6136+11264]; +ld.shared.u32 r3727, [r6136+11776]; +ld.shared.u32 r2877, [r6136+12288]; +ld.shared.u32 r3493, [r6136+12800]; +ld.shared.u32 r3073, [r6136+13312]; +ld.shared.u32 r3689, [r6136+13824]; +ld.shared.u32 r2927, [r6136+14336]; +ld.shared.u32 r3543, [r6136+14848]; +ld.shared.u32 r3123, [r6136+15360]; +ld.shared.u32 r3739, [r6136+15872]; +barrier.sync 0; +st.shared.v4.f32 [r6135], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6135+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6135+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6135+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6135+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6135+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r6135+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6135+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6136]; +ld.shared.u32 r3483, [r6136+512]; +ld.shared.u32 r3063, [r6136+1024]; +ld.shared.u32 r3679, [r6136+1536]; +ld.shared.u32 r2917, [r6136+2048]; +ld.shared.u32 r3533, [r6136+2560]; +ld.shared.u32 r3113, [r6136+3072]; +ld.shared.u32 r3729, [r6136+3584]; +ld.shared.u32 r2879, [r6136+4096]; +ld.shared.u32 r3495, [r6136+4608]; +ld.shared.u32 r3075, [r6136+5120]; +ld.shared.u32 r3691, [r6136+5632]; +ld.shared.u32 r2929, [r6136+6144]; +ld.shared.u32 r3545, [r6136+6656]; +ld.shared.u32 r3125, [r6136+7168]; +ld.shared.u32 r3741, [r6136+7680]; +ld.shared.u32 r2868, [r6136+8192]; +ld.shared.u32 r3484, [r6136+8704]; +ld.shared.u32 r3064, [r6136+9216]; +ld.shared.u32 r3680, [r6136+9728]; +ld.shared.u32 r2918, [r6136+10240]; +ld.shared.u32 r3534, [r6136+10752]; +ld.shared.u32 r3114, [r6136+11264]; +ld.shared.u32 r3730, [r6136+11776]; +ld.shared.u32 r2880, [r6136+12288]; +ld.shared.u32 r3496, [r6136+12800]; 
+ld.shared.u32 r3076, [r6136+13312]; +ld.shared.u32 r3692, [r6136+13824]; +ld.shared.u32 r2930, [r6136+14336]; +ld.shared.u32 r3546, [r6136+14848]; +ld.shared.u32 r3126, [r6136+15360]; +ld.shared.u32 r3742, [r6136+15872]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 
r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} 
+{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, 
r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} 
+{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, 
r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; 
+cvt.rn.f16.f32 high, f702; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 
r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 
r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f700; +cvt.rn.f16.f32 high, f700; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f702; +cvt.rn.f16.f32 high, f702; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 
high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ 
+mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 
r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6137, r6129, 96; +bfe.u32 r6138, r6129, 5, 2; +shl.b32 r6139, r6129, 2; +and.b32 r6140, r6139, 124; +add.s32 r6141, r6133, r6140; +cvt.rn.f32.u32 f848, r6138; +mul.f32 f849, f848, 0f3D490FDB; +cos.approx.f32 f779, f849; +sin.approx.f32 f850, f849; +neg.f32 f780, f850; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; 
+} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, 
low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, 
r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, 
r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; 
+mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, 
r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 
r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f851; +cvt.rn.f16.f32 high, f852; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 
r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +barrier.sync 0; +and.b32 r6142, r6131, 12288; +add.s32 r6143, r6141, r6142; +st.shared.u32 [r6143], r4383; +st.shared.u32 [r6143+128], r4585; +st.shared.u32 [r6143+256], r4622; +st.shared.u32 [r6143+384], r4659; +st.shared.u32 [r6143+512], r4696; +st.shared.u32 [r6143+640], r4733; +st.shared.u32 [r6143+768], r4770; +st.shared.u32 [r6143+896], r4807; +st.shared.u32 [r6143+1024], r4844; +st.shared.u32 [r6143+1152], r4881; +st.shared.u32 [r6143+1280], r4918; +st.shared.u32 [r6143+1408], r4955; +st.shared.u32 [r6143+1536], r4992; +st.shared.u32 [r6143+1664], r5029; +st.shared.u32 [r6143+1792], r5066; +st.shared.u32 [r6143+1920], r5103; +st.shared.u32 [r6143+2048], r5140; +st.shared.u32 [r6143+2176], r5177; +st.shared.u32 [r6143+2304], r5214; +st.shared.u32 [r6143+2432], r5251; +st.shared.u32 [r6143+2560], r5288; +st.shared.u32 [r6143+2688], r5325; +st.shared.u32 [r6143+2816], r5362; +st.shared.u32 [r6143+2944], r5399; +st.shared.u32 [r6143+3072], r5436; +st.shared.u32 [r6143+3200], r5473; +st.shared.u32 [r6143+3328], r5510; +st.shared.u32 [r6143+3456], r5547; +st.shared.u32 [r6143+3584], r5584; +st.shared.u32 [r6143+3712], r5621; +st.shared.u32 [r6143+3840], r5658; +st.shared.u32 [r6143+3968], r5695; +barrier.sync 0; +mad.lo.s32 r6144, r6137, -124, r6143; +ld.shared.u32 r5726, [r6144]; +ld.shared.u32 r5776, [r6144+512]; +ld.shared.u32 r5826, [r6144+1024]; +ld.shared.u32 r5876, [r6144+1536]; +ld.shared.u32 r5926, [r6144+2048]; +ld.shared.u32 r5976, [r6144+2560]; +ld.shared.u32 r6026, [r6144+3072]; +ld.shared.u32 r6076, [r6144+3584]; +ld.shared.u32 r5738, [r6144+4096]; +ld.shared.u32 r5788, [r6144+4608]; +ld.shared.u32 r5838, [r6144+5120]; +ld.shared.u32 r5888, [r6144+5632]; +ld.shared.u32 r5938, [r6144+6144]; +ld.shared.u32 r5988, [r6144+6656]; +ld.shared.u32 r6038, [r6144+7168]; 
+ld.shared.u32 r6088, [r6144+7680]; +ld.shared.u32 r5727, [r6144+8192]; +ld.shared.u32 r5777, [r6144+8704]; +ld.shared.u32 r5827, [r6144+9216]; +ld.shared.u32 r5877, [r6144+9728]; +ld.shared.u32 r5927, [r6144+10240]; +ld.shared.u32 r5977, [r6144+10752]; +ld.shared.u32 r6027, [r6144+11264]; +ld.shared.u32 r6077, [r6144+11776]; +ld.shared.u32 r5739, [r6144+12288]; +ld.shared.u32 r5789, [r6144+12800]; +ld.shared.u32 r5839, [r6144+13312]; +ld.shared.u32 r5889, [r6144+13824]; +ld.shared.u32 r5939, [r6144+14336]; +ld.shared.u32 r5989, [r6144+14848]; +ld.shared.u32 r6039, [r6144+15360]; +ld.shared.u32 r6089, [r6144+15872]; +barrier.sync 0; +st.shared.u32 [r6143], r4386; +st.shared.u32 [r6143+128], r4594; +st.shared.u32 [r6143+256], r4631; +st.shared.u32 [r6143+384], r4668; +st.shared.u32 [r6143+512], r4705; +st.shared.u32 [r6143+640], r4742; +st.shared.u32 [r6143+768], r4779; +st.shared.u32 [r6143+896], r4816; +st.shared.u32 [r6143+1024], r4853; +st.shared.u32 [r6143+1152], r4890; +st.shared.u32 [r6143+1280], r4927; +st.shared.u32 [r6143+1408], r4964; +st.shared.u32 [r6143+1536], r5001; +st.shared.u32 [r6143+1664], r5038; +st.shared.u32 [r6143+1792], r5075; +st.shared.u32 [r6143+1920], r5112; +st.shared.u32 [r6143+2048], r5149; +st.shared.u32 [r6143+2176], r5186; +st.shared.u32 [r6143+2304], r5223; +st.shared.u32 [r6143+2432], r5260; +st.shared.u32 [r6143+2560], r5297; +st.shared.u32 [r6143+2688], r5334; +st.shared.u32 [r6143+2816], r5371; +st.shared.u32 [r6143+2944], r5408; +st.shared.u32 [r6143+3072], r5445; +st.shared.u32 [r6143+3200], r5482; +st.shared.u32 [r6143+3328], r5519; +st.shared.u32 [r6143+3456], r5556; +st.shared.u32 [r6143+3584], r5593; +st.shared.u32 [r6143+3712], r5630; +st.shared.u32 [r6143+3840], r5667; +st.shared.u32 [r6143+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6144]; +ld.shared.u32 r5779, [r6144+512]; +ld.shared.u32 r5829, [r6144+1024]; +ld.shared.u32 r5879, [r6144+1536]; +ld.shared.u32 r5929, [r6144+2048]; +ld.shared.u32 r5979, 
[r6144+2560]; +ld.shared.u32 r6029, [r6144+3072]; +ld.shared.u32 r6079, [r6144+3584]; +ld.shared.u32 r5741, [r6144+4096]; +ld.shared.u32 r5791, [r6144+4608]; +ld.shared.u32 r5841, [r6144+5120]; +ld.shared.u32 r5891, [r6144+5632]; +ld.shared.u32 r5941, [r6144+6144]; +ld.shared.u32 r5991, [r6144+6656]; +ld.shared.u32 r6041, [r6144+7168]; +ld.shared.u32 r6091, [r6144+7680]; +ld.shared.u32 r5730, [r6144+8192]; +ld.shared.u32 r5780, [r6144+8704]; +ld.shared.u32 r5830, [r6144+9216]; +ld.shared.u32 r5880, [r6144+9728]; +ld.shared.u32 r5930, [r6144+10240]; +ld.shared.u32 r5980, [r6144+10752]; +ld.shared.u32 r6030, [r6144+11264]; +ld.shared.u32 r6080, [r6144+11776]; +ld.shared.u32 r5742, [r6144+12288]; +ld.shared.u32 r5792, [r6144+12800]; +ld.shared.u32 r5842, [r6144+13312]; +ld.shared.u32 r5892, [r6144+13824]; +ld.shared.u32 r5942, [r6144+14336]; +ld.shared.u32 r5992, [r6144+14848]; +ld.shared.u32 r6042, [r6144+15360]; +ld.shared.u32 r6092, [r6144+15872]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 %0, r5725, r5737; +} +{ +add.f16x2 %1, r5728, r5740; +} +{ +sub.f16x2 %32, r5725, r5737; +} +{ +sub.f16x2 %33, r5728, r5740; +} +{ +add.f16x2 %16, r5731, r5749; +} +{ +add.f16x2 %17, r5734, r5743; +} +{ +sub.f16x2 %48, r5731, r5749; +} +{ +sub.f16x2 %49, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 %2, r5775, r5787; +} +{ +add.f16x2 %3, r5778, r5790; +} +{ 
+sub.f16x2 %34, r5775, r5787; +} +{ +sub.f16x2 %35, r5778, r5790; +} +{ +add.f16x2 %18, r5781, r5799; +} +{ +add.f16x2 %19, r5784, r5793; +} +{ +sub.f16x2 %50, r5781, r5799; +} +{ +sub.f16x2 %51, r5784, r5793; +} +{ +add.f16x2 r5825, r5826, r5827; +} +{ +add.f16x2 r5828, r5829, r5830; +} +{ +sub.f16x2 r5831, r5826, r5827; +} +{ +sub.f16x2 r5834, r5829, r5830; +} +{ +add.f16x2 r5837, r5838, r5839; +} +{ +add.f16x2 r5840, r5841, r5842; +} +{ +sub.f16x2 r5843, r5838, r5839; +} +{ +sub.f16x2 r5846, r5841, r5842; +} +{ +neg.f16x2 r5849, r5846; +} +{ +add.f16x2 %4, r5825, r5837; +} +{ +add.f16x2 %5, r5828, r5840; +} +{ +sub.f16x2 %36, r5825, r5837; +} +{ +sub.f16x2 %37, r5828, r5840; +} +{ +add.f16x2 %20, r5831, r5849; +} +{ +add.f16x2 %21, r5834, r5843; +} +{ +sub.f16x2 %52, r5831, r5849; +} +{ +sub.f16x2 %53, r5834, r5843; +} +{ +add.f16x2 r5875, r5876, r5877; +} +{ +add.f16x2 r5878, r5879, r5880; +} +{ +sub.f16x2 r5881, r5876, r5877; +} +{ +sub.f16x2 r5884, r5879, r5880; +} +{ +add.f16x2 r5887, r5888, r5889; +} +{ +add.f16x2 r5890, r5891, r5892; +} +{ +sub.f16x2 r5893, r5888, r5889; +} +{ +sub.f16x2 r5896, r5891, r5892; +} +{ +neg.f16x2 r5899, r5896; +} +{ +add.f16x2 %6, r5875, r5887; +} +{ +add.f16x2 %7, r5878, r5890; +} +{ +sub.f16x2 %38, r5875, r5887; +} +{ +sub.f16x2 %39, r5878, r5890; +} +{ +add.f16x2 %22, r5881, r5899; +} +{ +add.f16x2 %23, r5884, r5893; +} +{ +sub.f16x2 %54, r5881, r5899; +} +{ +sub.f16x2 %55, r5884, r5893; +} +{ +add.f16x2 r5925, r5926, r5927; +} +{ +add.f16x2 r5928, r5929, r5930; +} +{ +sub.f16x2 r5931, r5926, r5927; +} +{ +sub.f16x2 r5934, r5929, r5930; +} +{ +add.f16x2 r5937, r5938, r5939; +} +{ +add.f16x2 r5940, r5941, r5942; +} +{ +sub.f16x2 r5943, r5938, r5939; +} +{ +sub.f16x2 r5946, r5941, r5942; +} +{ +neg.f16x2 r5949, r5946; +} +{ +add.f16x2 %8, r5925, r5937; +} +{ +add.f16x2 %9, r5928, r5940; +} +{ +sub.f16x2 %40, r5925, r5937; +} +{ +sub.f16x2 %41, r5928, r5940; +} +{ +add.f16x2 %24, r5931, r5949; +} +{ +add.f16x2 %25, r5934, 
r5943; +} +{ +sub.f16x2 %56, r5931, r5949; +} +{ +sub.f16x2 %57, r5934, r5943; +} +{ +add.f16x2 r5975, r5976, r5977; +} +{ +add.f16x2 r5978, r5979, r5980; +} +{ +sub.f16x2 r5981, r5976, r5977; +} +{ +sub.f16x2 r5984, r5979, r5980; +} +{ +add.f16x2 r5987, r5988, r5989; +} +{ +add.f16x2 r5990, r5991, r5992; +} +{ +sub.f16x2 r5993, r5988, r5989; +} +{ +sub.f16x2 r5996, r5991, r5992; +} +{ +neg.f16x2 r5999, r5996; +} +{ +add.f16x2 %10, r5975, r5987; +} +{ +add.f16x2 %11, r5978, r5990; +} +{ +sub.f16x2 %42, r5975, r5987; +} +{ +sub.f16x2 %43, r5978, r5990; +} +{ +add.f16x2 %26, r5981, r5999; +} +{ +add.f16x2 %27, r5984, r5993; +} +{ +sub.f16x2 %58, r5981, r5999; +} +{ +sub.f16x2 %59, r5984, r5993; +} +{ +add.f16x2 r6025, r6026, r6027; +} +{ +add.f16x2 r6028, r6029, r6030; +} +{ +sub.f16x2 r6031, r6026, r6027; +} +{ +sub.f16x2 r6034, r6029, r6030; +} +{ +add.f16x2 r6037, r6038, r6039; +} +{ +add.f16x2 r6040, r6041, r6042; +} +{ +sub.f16x2 r6043, r6038, r6039; +} +{ +sub.f16x2 r6046, r6041, r6042; +} +{ +neg.f16x2 r6049, r6046; +} +{ +add.f16x2 %12, r6025, r6037; +} +{ +add.f16x2 %13, r6028, r6040; +} +{ +sub.f16x2 %44, r6025, r6037; +} +{ +sub.f16x2 %45, r6028, r6040; +} +{ +add.f16x2 %28, r6031, r6049; +} +{ +add.f16x2 %29, r6034, r6043; +} +{ +sub.f16x2 %60, r6031, r6049; +} +{ +sub.f16x2 %61, r6034, r6043; +} +{ +add.f16x2 r6075, r6076, r6077; +} +{ +add.f16x2 r6078, r6079, r6080; +} +{ +sub.f16x2 r6081, r6076, r6077; +} +{ +sub.f16x2 r6084, r6079, r6080; +} +{ +add.f16x2 r6087, r6088, r6089; +} +{ +add.f16x2 r6090, r6091, r6092; +} +{ +sub.f16x2 r6093, r6088, r6089; +} +{ +sub.f16x2 r6096, r6091, r6092; +} +{ +neg.f16x2 r6099, r6096; +} +{ +add.f16x2 %14, r6075, r6087; +} +{ +add.f16x2 %15, r6078, r6090; +} +{ +sub.f16x2 %46, r6075, r6087; +} +{ +sub.f16x2 %47, r6078, r6090; +} +{ +add.f16x2 %30, r6081, r6099; +} +{ +add.f16x2 %31, r6084, r6093; +} +{ +sub.f16x2 %62, r6081, r6099; +} +{ +sub.f16x2 %63, r6084, r6093; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), 
"=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), 
"r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1055, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .f32 f<74>; +.reg .b32 r<930>; +.reg .b64 rd<2>; +{ +add.f16x2 r22, %9, %13; +} +{ +add.f16x2 r25, %10, %14; +} +{ +sub.f16x2 r28, %9, %13; +} +{ +sub.f16x2 r31, %10, %14; +} +{ +add.f16x2 r34, %11, %15; +} +{ +add.f16x2 r37, %12, %16; +} +{ +sub.f16x2 r40, %11, %15; +} +{ +sub.f16x2 r43, %12, %16; +} +{ +neg.f16x2 r46, r43; +} +{ +add.f16x2 r48, r22, r34; +} +{ +add.f16x2 r51, r25, r37; +} +{ +sub.f16x2 r54, r22, r34; +} +{ +sub.f16x2 r57, r25, r37; +} +{ +add.f16x2 r60, r28, r46; +} +{ +add.f16x2 r63, r31, r40; +} +{ +sub.f16x2 r66, r28, r46; +} +{ +sub.f16x2 r69, r31, r40; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 1023; +cvt.rn.f32.u32 f8, r10; +mul.f32 f1, f8, 0f3AC90FDB; +setp.eq.s32 p1, r10, 1020; +mov.f32 f73, 0f3BC90F88; +mov.f32 f72, f73; +@p1 bra LBB6_2; +cos.approx.f32 f72, f1; +LBB6_2: +mov.u32 r236, %tid.y; +shl.b32 r237, r236, 15; +mov.u32 r238, %8; +add.s32 r239, r238, r237; +shl.b32 r240, r9, 5; +and.b32 r241, r240, -32768; +add.s32 r12, r239, r241; +sin.approx.f32 f20, f1; +neg.f32 f10, f20; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f10; +mov.b32 r72, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r75, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r77, {high, high}; +} +{ +mul.f16x2 r79, r63, r77; +} +{ +fma.rn.f16x2 r82, r60, r75, r79; +} +{ +mul.f16x2 r86, r60, r77; +} +{ +neg.f16x2 r89, r86; +} +{ +fma.rn.f16x2 r91, r63, r75, r89; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r95, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r97, {high, high}; +} +mov.f32 f15, 0fBF800000; +mov.f32 
f16, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r99, {low, high}; +} +{ +mul.f16x2 r100, r97, r99; +} +{ +mul.f16x2 r103, r72, r95; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r106, {high, low}; +} +{ +fma.rn.f16x2 r108, r100, r106, r103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r114, {high, high}; +} +{ +mul.f16x2 r116, r57, r114; +} +{ +fma.rn.f16x2 r119, r54, r112, r116; +} +{ +mul.f16x2 r123, r54, r114; +} +{ +neg.f16x2 r126, r123; +} +{ +fma.rn.f16x2 r128, r57, r112, r126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r132, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r72; +mov.b32 r134, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r136, {low, high}; +} +{ +mul.f16x2 r137, r134, r136; +} +{ +mul.f16x2 r140, r108, r132; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r108; +mov.b32 r143, {high, low}; +} +{ +fma.rn.f16x2 r145, r137, r143, r140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r145; +mov.b32 r149, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r145; +mov.b32 r151, {high, high}; +} +{ +mul.f16x2 r153, r69, r151; +} +{ +fma.rn.f16x2 r156, r66, r149, r153; +} +{ +mul.f16x2 r160, r66, r151; +} +{ +neg.f16x2 r163, r160; +} +{ +fma.rn.f16x2 r165, r69, r149, r163; +} +barrier.sync 0; +shl.b32 r242, r10, 5; +add.s32 r243, r12, r242; +st.shared.v4.f32 [r243], {r48, r51, r82, r91}; +st.shared.v4.f32 [r243+16], {r119, r128, r156, r165}; +barrier.sync 0; +mad.lo.s32 r244, r10, -24, r243; +ld.shared.u32 r187, [r244]; +ld.shared.u32 r190, [r244+4]; +ld.shared.u32 r199, [r244+8192]; +ld.shared.u32 r202, [r244+8196]; +ld.shared.u32 r188, [r244+16384]; +ld.shared.u32 r191, [r244+16388]; +ld.shared.u32 r200, [r244+24576]; +ld.shared.u32 r203, [r244+24580]; +{ +add.f16x2 
r186, r187, r188; +} +{ +add.f16x2 r189, r190, r191; +} +{ +sub.f16x2 r192, r187, r188; +} +{ +sub.f16x2 r195, r190, r191; +} +{ +add.f16x2 r198, r199, r200; +} +{ +add.f16x2 r201, r202, r203; +} +{ +sub.f16x2 r204, r199, r200; +} +{ +sub.f16x2 r207, r202, r203; +} +{ +neg.f16x2 r210, r207; +} +{ +add.f16x2 r212, r186, r198; +} +{ +add.f16x2 r215, r189, r201; +} +{ +sub.f16x2 r218, r186, r198; +} +{ +sub.f16x2 r221, r189, r201; +} +{ +add.f16x2 r224, r192, r210; +} +{ +add.f16x2 r227, r195, r204; +} +{ +sub.f16x2 r230, r192, r210; +} +{ +sub.f16x2 r233, r195, r204; +} +and.b32 r21, r9, 1020; +bfe.u32 r245, r9, 2, 8; +cvt.rn.f32.u32 f21, r245; +mul.f32 f4, f21, 0f3BC90FDB; +setp.eq.s32 p2, r21, 1020; +@p2 bra LBB6_4; +cos.approx.f32 f73, f4; +LBB6_4: +sin.approx.f32 f62, f4; +neg.f32 f23, f62; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f23; +mov.b32 r246, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r249, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r251, {high, high}; +} +{ +mul.f16x2 r253, r227, r251; +} +{ +fma.rn.f16x2 r256, r224, r249, r253; +} +{ +mul.f16x2 r260, r224, r251; +} +{ +neg.f16x2 r263, r260; +} +{ +fma.rn.f16x2 r265, r227, r249, r263; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r269, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r271, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r273, {low, high}; +} +{ +mul.f16x2 r274, r271, r273; +} +{ +mul.f16x2 r277, r246, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r280, {high, low}; +} +{ +fma.rn.f16x2 r282, r274, r280, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r282; +mov.b32 r286, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r282; +mov.b32 r288, {high, high}; +} +{ +mul.f16x2 r290, r221, r288; +} +{ +fma.rn.f16x2 r293, r218, r286, r290; +} +{ 
+mul.f16x2 r297, r218, r288; +} +{ +neg.f16x2 r300, r297; +} +{ +fma.rn.f16x2 r302, r221, r286, r300; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r306, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r246; +mov.b32 r308, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r310, {low, high}; +} +{ +mul.f16x2 r311, r308, r310; +} +{ +mul.f16x2 r314, r282, r306; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r282; +mov.b32 r317, {high, low}; +} +{ +fma.rn.f16x2 r319, r311, r317, r314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r319; +mov.b32 r323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r319; +mov.b32 r325, {high, high}; +} +{ +mul.f16x2 r327, r233, r325; +} +{ +fma.rn.f16x2 r330, r230, r323, r327; +} +{ +mul.f16x2 r334, r230, r325; +} +{ +neg.f16x2 r337, r334; +} +{ +fma.rn.f16x2 r339, r233, r323, r337; +} +barrier.sync 0; +shl.b32 r902, r9, 3; +and.b32 r903, r902, 24; +add.s32 r904, r12, r903; +shl.b32 r905, r21, 5; +add.s32 r906, r904, r905; +st.shared.u32 [r906], r212; +st.shared.u32 [r906+4], r215; +st.shared.u32 [r906+32], r256; +st.shared.u32 [r906+36], r265; +st.shared.u32 [r906+64], r293; +st.shared.u32 [r906+68], r302; +st.shared.u32 [r906+96], r330; +st.shared.u32 [r906+100], r339; +barrier.sync 0; +mad.lo.s32 r907, r21, -24, r906; +ld.shared.u32 r361, [r907]; +ld.shared.u32 r364, [r907+4]; +ld.shared.u32 r373, [r907+8192]; +ld.shared.u32 r376, [r907+8196]; +ld.shared.u32 r362, [r907+16384]; +ld.shared.u32 r365, [r907+16388]; +ld.shared.u32 r374, [r907+24576]; +ld.shared.u32 r377, [r907+24580]; +{ +add.f16x2 r360, r361, r362; +} +{ +add.f16x2 r363, r364, r365; +} +{ +sub.f16x2 r366, r361, r362; +} +{ +sub.f16x2 r369, r364, r365; +} +{ +add.f16x2 r372, r373, r374; +} +{ +add.f16x2 r375, r376, r377; +} +{ +sub.f16x2 r378, r373, r374; +} +{ +sub.f16x2 r381, r376, r377; +} +{ +neg.f16x2 r384, r381; +} +{ +add.f16x2 r386, r360, r372; +} +{ 
+add.f16x2 r389, r363, r375; +} +{ +sub.f16x2 r392, r360, r372; +} +{ +sub.f16x2 r395, r363, r375; +} +{ +add.f16x2 r398, r366, r384; +} +{ +add.f16x2 r401, r369, r378; +} +{ +sub.f16x2 r404, r366, r384; +} +{ +sub.f16x2 r407, r369, r378; +} +and.b32 r908, r9, 1008; +bfe.u32 r909, r9, 4, 6; +cvt.rn.f32.u32 f63, r909; +mul.f32 f64, f63, 0f3CC90FDB; +cos.approx.f32 f32, f64; +sin.approx.f32 f65, f64; +neg.f32 f33, f65; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f32; +cvt.rn.f16.f32 high, f33; +mov.b32 r410, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r413, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r415, {high, high}; +} +{ +mul.f16x2 r417, r401, r415; +} +{ +fma.rn.f16x2 r420, r398, r413, r417; +} +{ +mul.f16x2 r424, r398, r415; +} +{ +neg.f16x2 r427, r424; +} +{ +fma.rn.f16x2 r429, r401, r413, r427; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r433, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r435, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r437, {low, high}; +} +{ +mul.f16x2 r438, r435, r437; +} +{ +mul.f16x2 r441, r410, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r444, {high, low}; +} +{ +fma.rn.f16x2 r446, r438, r444, r441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r450, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r452, {high, high}; +} +{ +mul.f16x2 r454, r395, r452; +} +{ +fma.rn.f16x2 r457, r392, r450, r454; +} +{ +mul.f16x2 r461, r392, r452; +} +{ +neg.f16x2 r464, r461; +} +{ +fma.rn.f16x2 r466, r395, r450, r464; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r470, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r410; +mov.b32 r472, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r474, {low, high}; +} +{ 
+mul.f16x2 r475, r472, r474; +} +{ +mul.f16x2 r478, r446, r470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r446; +mov.b32 r481, {high, low}; +} +{ +fma.rn.f16x2 r483, r475, r481, r478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r483; +mov.b32 r487, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r483; +mov.b32 r489, {high, high}; +} +{ +mul.f16x2 r491, r407, r489; +} +{ +fma.rn.f16x2 r494, r404, r487, r491; +} +{ +mul.f16x2 r498, r404, r489; +} +{ +neg.f16x2 r501, r498; +} +{ +fma.rn.f16x2 r503, r407, r487, r501; +} +and.b32 r910, r902, 120; +add.s32 r911, r12, r910; +barrier.sync 0; +and.b32 r913, r240, 32256; +add.s32 r914, r911, r913; +st.shared.u32 [r914], r386; +st.shared.u32 [r914+4], r389; +st.shared.u32 [r914+128], r420; +st.shared.u32 [r914+132], r429; +st.shared.u32 [r914+256], r457; +st.shared.u32 [r914+260], r466; +st.shared.u32 [r914+384], r494; +st.shared.u32 [r914+388], r503; +barrier.sync 0; +mad.lo.s32 r915, r908, -24, r914; +ld.shared.u32 r525, [r915]; +ld.shared.u32 r528, [r915+4]; +ld.shared.u32 r537, [r915+8192]; +ld.shared.u32 r540, [r915+8196]; +ld.shared.u32 r526, [r915+16384]; +ld.shared.u32 r529, [r915+16388]; +ld.shared.u32 r538, [r915+24576]; +ld.shared.u32 r541, [r915+24580]; +{ +add.f16x2 r524, r525, r526; +} +{ +add.f16x2 r527, r528, r529; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +sub.f16x2 r533, r528, r529; +} +{ +add.f16x2 r536, r537, r538; +} +{ +add.f16x2 r539, r540, r541; +} +{ +sub.f16x2 r542, r537, r538; +} +{ +sub.f16x2 r545, r540, r541; +} +{ +neg.f16x2 r548, r545; +} +{ +add.f16x2 r550, r524, r536; +} +{ +add.f16x2 r553, r527, r539; +} +{ +sub.f16x2 r556, r524, r536; +} +{ +sub.f16x2 r559, r527, r539; +} +{ +add.f16x2 r562, r530, r548; +} +{ +add.f16x2 r565, r533, r542; +} +{ +sub.f16x2 r568, r530, r548; +} +{ +sub.f16x2 r571, r533, r542; +} +and.b32 r916, r9, 960; +bfe.u32 r917, r9, 6, 4; +cvt.rn.f32.u32 f66, r917; +mul.f32 f67, f66, 0f3DC90FDB; +cos.approx.f32 f42, f67; +sin.approx.f32 f68, f67; 
+neg.f32 f43, f68; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r574, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r579, {high, high}; +} +{ +mul.f16x2 r581, r565, r579; +} +{ +fma.rn.f16x2 r584, r562, r577, r581; +} +{ +mul.f16x2 r588, r562, r579; +} +{ +neg.f16x2 r591, r588; +} +{ +fma.rn.f16x2 r593, r565, r577, r591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r601, {low, high}; +} +{ +mul.f16x2 r602, r599, r601; +} +{ +mul.f16x2 r605, r574, r597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r608, {high, low}; +} +{ +fma.rn.f16x2 r610, r602, r608, r605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r616, {high, high}; +} +{ +mul.f16x2 r618, r559, r616; +} +{ +fma.rn.f16x2 r621, r556, r614, r618; +} +{ +mul.f16x2 r625, r556, r616; +} +{ +neg.f16x2 r628, r625; +} +{ +fma.rn.f16x2 r630, r559, r614, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r574; +mov.b32 r636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r638, {low, high}; +} +{ +mul.f16x2 r639, r636, r638; +} +{ +mul.f16x2 r642, r610, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r610; +mov.b32 r645, {high, low}; +} +{ +fma.rn.f16x2 r647, r639, r645, r642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r647; +mov.b32 r651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r647; +mov.b32 r653, {high, high}; +} +{ +mul.f16x2 r655, r571, r653; +} +{ 
+fma.rn.f16x2 r658, r568, r651, r655; +} +{ +mul.f16x2 r662, r568, r653; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r571, r651, r665; +} +and.b32 r918, r902, 504; +add.s32 r919, r12, r918; +barrier.sync 0; +and.b32 r920, r240, 30720; +add.s32 r921, r919, r920; +st.shared.u32 [r921], r550; +st.shared.u32 [r921+4], r553; +st.shared.u32 [r921+512], r584; +st.shared.u32 [r921+516], r593; +st.shared.u32 [r921+1024], r621; +st.shared.u32 [r921+1028], r630; +st.shared.u32 [r921+1536], r658; +st.shared.u32 [r921+1540], r667; +barrier.sync 0; +mad.lo.s32 r922, r916, -24, r921; +ld.shared.u32 r689, [r922]; +ld.shared.u32 r692, [r922+4]; +ld.shared.u32 r701, [r922+8192]; +ld.shared.u32 r704, [r922+8196]; +ld.shared.u32 r690, [r922+16384]; +ld.shared.u32 r693, [r922+16388]; +ld.shared.u32 r702, [r922+24576]; +ld.shared.u32 r705, [r922+24580]; +{ +add.f16x2 r688, r689, r690; +} +{ +add.f16x2 r691, r692, r693; +} +{ +sub.f16x2 r694, r689, r690; +} +{ +sub.f16x2 r697, r692, r693; +} +{ +add.f16x2 r700, r701, r702; +} +{ +add.f16x2 r703, r704, r705; +} +{ +sub.f16x2 r706, r701, r702; +} +{ +sub.f16x2 r709, r704, r705; +} +{ +neg.f16x2 r712, r709; +} +{ +add.f16x2 r714, r688, r700; +} +{ +add.f16x2 r717, r691, r703; +} +{ +sub.f16x2 r720, r688, r700; +} +{ +sub.f16x2 r723, r691, r703; +} +{ +add.f16x2 r726, r694, r712; +} +{ +add.f16x2 r729, r697, r706; +} +{ +sub.f16x2 r732, r694, r712; +} +{ +sub.f16x2 r735, r697, r706; +} +and.b32 r923, r9, 768; +bfe.u32 r924, r9, 8, 2; +cvt.rn.f32.u32 f69, r924; +mul.f32 f70, f69, 0f3EC90FDB; +cos.approx.f32 f52, f70; +sin.approx.f32 f71, f70; +neg.f32 f53, f71; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f53; +mov.b32 r738, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r729, r743; +} +{ +fma.rn.f16x2 r748, r726, r741, r745; +} +{ +mul.f16x2 r752, 
r726, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r729, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r738, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r723, r780; +} +{ +fma.rn.f16x2 r785, r720, r778, r782; +} +{ +mul.f16x2 r789, r720, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r723, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r738; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r735, r817; +} +{ +fma.rn.f16x2 r822, r732, r815, r819; +} +{ +mul.f16x2 r826, r732, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r735, r815, r829; +} +and.b32 r925, r902, 2040; +add.s32 r926, r12, r925; +barrier.sync 0; +and.b32 r927, r240, 24576; +add.s32 r928, r926, r927; +st.shared.u32 [r928], r714; +st.shared.u32 [r928+4], r717; +st.shared.u32 [r928+2048], r748; +st.shared.u32 [r928+2052], 
r757; +st.shared.u32 [r928+4096], r785; +st.shared.u32 [r928+4100], r794; +st.shared.u32 [r928+6144], r822; +st.shared.u32 [r928+6148], r831; +barrier.sync 0; +mad.lo.s32 r929, r923, -24, r928; +ld.shared.u32 r853, [r929]; +ld.shared.u32 r856, [r929+4]; +ld.shared.u32 r865, [r929+8192]; +ld.shared.u32 r868, [r929+8196]; +ld.shared.u32 r854, [r929+16384]; +ld.shared.u32 r857, [r929+16388]; +ld.shared.u32 r866, [r929+24576]; +ld.shared.u32 r869, [r929+24580]; +{ +add.f16x2 r852, r853, r854; +} +{ +add.f16x2 r855, r856, r857; +} +{ +sub.f16x2 r858, r853, r854; +} +{ +sub.f16x2 r861, r856, r857; +} +{ +add.f16x2 r864, r865, r866; +} +{ +add.f16x2 r867, r868, r869; +} +{ +sub.f16x2 r870, r865, r866; +} +{ +sub.f16x2 r873, r868, r869; +} +{ +neg.f16x2 r876, r873; +} +{ +add.f16x2 %0, r852, r864; +} +{ +add.f16x2 %1, r855, r867; +} +{ +sub.f16x2 %4, r852, r864; +} +{ +sub.f16x2 %5, r855, r867; +} +{ +add.f16x2 %2, r858, r876; +} +{ +add.f16x2 %3, r861, r870; +} +{ +sub.f16x2 %6, r858, r876; +} +{ +sub.f16x2 %7, r861, r870; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1056, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .pred p<3>; +.reg .f32 f<74>; +.reg .b32 r<929>; +.reg .b64 rd<2>; +{ +add.f16x2 r21, %9, %13; +} +{ +add.f16x2 r24, %10, %14; +} +{ +sub.f16x2 r27, %9, %13; +} +{ +sub.f16x2 r30, %10, %14; +} +{ +add.f16x2 r33, %11, %15; +} +{ +add.f16x2 r36, %12, %16; 
+} +{ +sub.f16x2 r39, %11, %15; +} +{ +sub.f16x2 r42, %12, %16; +} +{ +neg.f16x2 r45, r42; +} +{ +add.f16x2 r47, r21, r33; +} +{ +add.f16x2 r50, r24, r36; +} +{ +sub.f16x2 r53, r21, r33; +} +{ +sub.f16x2 r56, r24, r36; +} +{ +add.f16x2 r59, r27, r45; +} +{ +add.f16x2 r62, r30, r39; +} +{ +sub.f16x2 r65, r27, r45; +} +{ +sub.f16x2 r68, r30, r39; +} +mov.u32 r9, %tid.x; +and.b32 r10, r9, 1023; +cvt.rn.f32.u32 f8, r10; +mul.f32 f1, f8, 0f3AC90FDB; +setp.eq.s32 p1, r10, 1020; +mov.f32 f73, 0f3BC90F88; +mov.f32 f72, f73; +@p1 bra LBB7_2; +cos.approx.f32 f72, f1; +LBB7_2: +mov.u32 r235, %tid.y; +shl.b32 r236, r235, 14; +mov.u32 r237, %8; +add.s32 r238, r237, r236; +shl.b32 r239, r9, 4; +and.b32 r240, r239, -16384; +add.s32 r11, r238, r240; +sin.approx.f32 f20, f1; +neg.f32 f10, f20; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f72; +cvt.rn.f16.f32 high, f10; +mov.b32 r71, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r76, {high, high}; +} +{ +mul.f16x2 r78, r62, r76; +} +{ +fma.rn.f16x2 r81, r59, r74, r78; +} +{ +mul.f16x2 r85, r59, r76; +} +{ +neg.f16x2 r88, r85; +} +{ +fma.rn.f16x2 r90, r62, r74, r88; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r94, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r96, {high, high}; +} +mov.f32 f15, 0fBF800000; +mov.f32 f16, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r98, {low, high}; +} +{ +mul.f16x2 r99, r96, r98; +} +{ +mul.f16x2 r102, r71, r94; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r105, {high, low}; +} +{ +fma.rn.f16x2 r107, r99, r105, r102; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r107; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r107; +mov.b32 r113, {high, high}; +} +{ +mul.f16x2 r115, r56, r113; +} +{ +fma.rn.f16x2 r118, r53, r111, r115; +} +{ 
+mul.f16x2 r122, r53, r113; +} +{ +neg.f16x2 r125, r122; +} +{ +fma.rn.f16x2 r127, r56, r111, r125; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r131, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r71; +mov.b32 r133, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r135, {low, high}; +} +{ +mul.f16x2 r136, r133, r135; +} +{ +mul.f16x2 r139, r107, r131; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r107; +mov.b32 r142, {high, low}; +} +{ +fma.rn.f16x2 r144, r136, r142, r139; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r148, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r144; +mov.b32 r150, {high, high}; +} +{ +mul.f16x2 r152, r68, r150; +} +{ +fma.rn.f16x2 r155, r65, r148, r152; +} +{ +mul.f16x2 r159, r65, r150; +} +{ +neg.f16x2 r162, r159; +} +{ +fma.rn.f16x2 r164, r68, r148, r162; +} +barrier.sync 0; +shl.b32 r241, r10, 4; +add.s32 r242, r11, r241; +st.shared.v4.f32 [r242], {r47, r81, r118, r155}; +barrier.sync 0; +mad.lo.s32 r243, r10, -12, r242; +ld.shared.u32 r186, [r243]; +ld.shared.u32 r198, [r243+4096]; +ld.shared.u32 r187, [r243+8192]; +ld.shared.u32 r199, [r243+12288]; +barrier.sync 0; +st.shared.v4.f32 [r242], {r50, r90, r127, r164}; +barrier.sync 0; +ld.shared.u32 r189, [r243]; +ld.shared.u32 r201, [r243+4096]; +ld.shared.u32 r190, [r243+8192]; +ld.shared.u32 r202, [r243+12288]; +{ +add.f16x2 r185, r186, r187; +} +{ +add.f16x2 r188, r189, r190; +} +{ +sub.f16x2 r191, r186, r187; +} +{ +sub.f16x2 r194, r189, r190; +} +{ +add.f16x2 r197, r198, r199; +} +{ +add.f16x2 r200, r201, r202; +} +{ +sub.f16x2 r203, r198, r199; +} +{ +sub.f16x2 r206, r201, r202; +} +{ +neg.f16x2 r209, r206; +} +{ +add.f16x2 r211, r185, r197; +} +{ +add.f16x2 r214, r188, r200; +} +{ +sub.f16x2 r217, r185, r197; +} +{ +sub.f16x2 r220, r188, r200; +} +{ +add.f16x2 r223, r191, r209; +} +{ +add.f16x2 r226, r194, r203; +} +{ +sub.f16x2 r229, r191, r209; +} +{ 
+sub.f16x2 r232, r194, r203; +} +and.b32 r20, r9, 1020; +bfe.u32 r244, r9, 2, 8; +cvt.rn.f32.u32 f21, r244; +mul.f32 f4, f21, 0f3BC90FDB; +setp.eq.s32 p2, r20, 1020; +@p2 bra LBB7_4; +cos.approx.f32 f73, f4; +LBB7_4: +sin.approx.f32 f62, f4; +neg.f32 f23, f62; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f23; +mov.b32 r245, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r250, {high, high}; +} +{ +mul.f16x2 r252, r226, r250; +} +{ +fma.rn.f16x2 r255, r223, r248, r252; +} +{ +mul.f16x2 r259, r223, r250; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r226, r248, r262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r273, r270, r272; +} +{ +mul.f16x2 r276, r245, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r279, {high, low}; +} +{ +fma.rn.f16x2 r281, r273, r279, r276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r287, {high, high}; +} +{ +mul.f16x2 r289, r220, r287; +} +{ +fma.rn.f16x2 r292, r217, r285, r289; +} +{ +mul.f16x2 r296, r217, r287; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r220, r285, r299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r245; +mov.b32 r307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r309, {low, high}; +} +{ +mul.f16x2 r310, r307, r309; +} +{ +mul.f16x2 r313, r281, r305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r281; +mov.b32 r316, {high, low}; +} +{ 
+fma.rn.f16x2 r318, r310, r316, r313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r318; +mov.b32 r322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r318; +mov.b32 r324, {high, high}; +} +{ +mul.f16x2 r326, r232, r324; +} +{ +fma.rn.f16x2 r329, r229, r322, r326; +} +{ +mul.f16x2 r333, r229, r324; +} +{ +neg.f16x2 r336, r333; +} +{ +fma.rn.f16x2 r338, r232, r322, r336; +} +barrier.sync 0; +shl.b32 r901, r9, 2; +and.b32 r902, r901, 12; +add.s32 r903, r11, r902; +shl.b32 r904, r20, 4; +add.s32 r905, r903, r904; +st.shared.u32 [r905], r211; +st.shared.u32 [r905+16], r255; +st.shared.u32 [r905+32], r292; +st.shared.u32 [r905+48], r329; +barrier.sync 0; +mad.lo.s32 r906, r20, -12, r905; +ld.shared.u32 r360, [r906]; +ld.shared.u32 r372, [r906+4096]; +ld.shared.u32 r361, [r906+8192]; +ld.shared.u32 r373, [r906+12288]; +barrier.sync 0; +st.shared.u32 [r905], r214; +st.shared.u32 [r905+16], r264; +st.shared.u32 [r905+32], r301; +st.shared.u32 [r905+48], r338; +barrier.sync 0; +ld.shared.u32 r363, [r906]; +ld.shared.u32 r375, [r906+4096]; +ld.shared.u32 r364, [r906+8192]; +ld.shared.u32 r376, [r906+12288]; +{ +add.f16x2 r359, r360, r361; +} +{ +add.f16x2 r362, r363, r364; +} +{ +sub.f16x2 r365, r360, r361; +} +{ +sub.f16x2 r368, r363, r364; +} +{ +add.f16x2 r371, r372, r373; +} +{ +add.f16x2 r374, r375, r376; +} +{ +sub.f16x2 r377, r372, r373; +} +{ +sub.f16x2 r380, r375, r376; +} +{ +neg.f16x2 r383, r380; +} +{ +add.f16x2 r385, r359, r371; +} +{ +add.f16x2 r388, r362, r374; +} +{ +sub.f16x2 r391, r359, r371; +} +{ +sub.f16x2 r394, r362, r374; +} +{ +add.f16x2 r397, r365, r383; +} +{ +add.f16x2 r400, r368, r377; +} +{ +sub.f16x2 r403, r365, r383; +} +{ +sub.f16x2 r406, r368, r377; +} +and.b32 r907, r9, 1008; +bfe.u32 r908, r9, 4, 6; +and.b32 r909, r901, 60; +add.s32 r910, r11, r909; +cvt.rn.f32.u32 f63, r908; +mul.f32 f64, f63, 0f3CC90FDB; +cos.approx.f32 f32, f64; +sin.approx.f32 f65, f64; +neg.f32 f33, f65; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f32; +cvt.rn.f16.f32 high, f33; +mov.b32 r409, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r414, {high, high}; +} +{ +mul.f16x2 r416, r400, r414; +} +{ +fma.rn.f16x2 r419, r397, r412, r416; +} +{ +mul.f16x2 r423, r397, r414; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r400, r412, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r436, {low, high}; +} +{ +mul.f16x2 r437, r434, r436; +} +{ +mul.f16x2 r440, r409, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r443, {high, low}; +} +{ +fma.rn.f16x2 r445, r437, r443, r440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r451, {high, high}; +} +{ +mul.f16x2 r453, r394, r451; +} +{ +fma.rn.f16x2 r456, r391, r449, r453; +} +{ +mul.f16x2 r460, r391, r451; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r394, r449, r463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r409; +mov.b32 r471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r473, {low, high}; +} +{ +mul.f16x2 r474, r471, r473; +} +{ +mul.f16x2 r477, r445, r469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r445; +mov.b32 r480, {high, low}; +} +{ +fma.rn.f16x2 r482, r474, r480, r477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r482; +mov.b32 r486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r482; +mov.b32 r488, {high, high}; +} +{ +mul.f16x2 r490, r406, r488; +} +{ +fma.rn.f16x2 r493, r403, r486, r490; +} +{ +mul.f16x2 r497, r403, 
r488; +} +{ +neg.f16x2 r500, r497; +} +{ +fma.rn.f16x2 r502, r406, r486, r500; +} +barrier.sync 0; +and.b32 r912, r239, 16128; +add.s32 r913, r910, r912; +st.shared.u32 [r913], r385; +st.shared.u32 [r913+64], r419; +st.shared.u32 [r913+128], r456; +st.shared.u32 [r913+192], r493; +barrier.sync 0; +mad.lo.s32 r914, r907, -12, r913; +ld.shared.u32 r524, [r914]; +ld.shared.u32 r536, [r914+4096]; +ld.shared.u32 r525, [r914+8192]; +ld.shared.u32 r537, [r914+12288]; +barrier.sync 0; +st.shared.u32 [r913], r388; +st.shared.u32 [r913+64], r428; +st.shared.u32 [r913+128], r465; +st.shared.u32 [r913+192], r502; +barrier.sync 0; +ld.shared.u32 r527, [r914]; +ld.shared.u32 r539, [r914+4096]; +ld.shared.u32 r528, [r914+8192]; +ld.shared.u32 r540, [r914+12288]; +{ +add.f16x2 r523, r524, r525; +} +{ +add.f16x2 r526, r527, r528; +} +{ +sub.f16x2 r529, r524, r525; +} +{ +sub.f16x2 r532, r527, r528; +} +{ +add.f16x2 r535, r536, r537; +} +{ +add.f16x2 r538, r539, r540; +} +{ +sub.f16x2 r541, r536, r537; +} +{ +sub.f16x2 r544, r539, r540; +} +{ +neg.f16x2 r547, r544; +} +{ +add.f16x2 r549, r523, r535; +} +{ +add.f16x2 r552, r526, r538; +} +{ +sub.f16x2 r555, r523, r535; +} +{ +sub.f16x2 r558, r526, r538; +} +{ +add.f16x2 r561, r529, r547; +} +{ +add.f16x2 r564, r532, r541; +} +{ +sub.f16x2 r567, r529, r547; +} +{ +sub.f16x2 r570, r532, r541; +} +and.b32 r915, r9, 960; +bfe.u32 r916, r9, 6, 4; +and.b32 r917, r901, 252; +add.s32 r918, r11, r917; +cvt.rn.f32.u32 f66, r916; +mul.f32 f67, f66, 0f3DC90FDB; +cos.approx.f32 f42, f67; +sin.approx.f32 f68, f67; +neg.f32 f43, f68; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f43; +mov.b32 r573, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r578, {high, high}; +} +{ +mul.f16x2 r580, r564, r578; +} +{ +fma.rn.f16x2 r583, r561, r576, r580; +} +{ +mul.f16x2 r587, r561, r578; +} +{ +neg.f16x2 r590, r587; +} +{ 
+fma.rn.f16x2 r592, r564, r576, r590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r596, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r598, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r600, {low, high}; +} +{ +mul.f16x2 r601, r598, r600; +} +{ +mul.f16x2 r604, r573, r596; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r607, {high, low}; +} +{ +fma.rn.f16x2 r609, r601, r607, r604; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r615, {high, high}; +} +{ +mul.f16x2 r617, r558, r615; +} +{ +fma.rn.f16x2 r620, r555, r613, r617; +} +{ +mul.f16x2 r624, r555, r615; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r558, r613, r627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r633, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r573; +mov.b32 r635, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r637, {low, high}; +} +{ +mul.f16x2 r638, r635, r637; +} +{ +mul.f16x2 r641, r609, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r644, {high, low}; +} +{ +fma.rn.f16x2 r646, r638, r644, r641; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r646; +mov.b32 r650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r646; +mov.b32 r652, {high, high}; +} +{ +mul.f16x2 r654, r570, r652; +} +{ +fma.rn.f16x2 r657, r567, r650, r654; +} +{ +mul.f16x2 r661, r567, r652; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r570, r650, r664; +} +barrier.sync 0; +and.b32 r919, r239, 15360; +add.s32 r920, r918, r919; +st.shared.u32 [r920], r549; +st.shared.u32 [r920+256], r583; +st.shared.u32 [r920+512], r620; +st.shared.u32 [r920+768], r657; +barrier.sync 0; +mad.lo.s32 r921, r915, -12, r920; +ld.shared.u32 r688, [r921]; +ld.shared.u32 
r700, [r921+4096]; +ld.shared.u32 r689, [r921+8192]; +ld.shared.u32 r701, [r921+12288]; +barrier.sync 0; +st.shared.u32 [r920], r552; +st.shared.u32 [r920+256], r592; +st.shared.u32 [r920+512], r629; +st.shared.u32 [r920+768], r666; +barrier.sync 0; +ld.shared.u32 r691, [r921]; +ld.shared.u32 r703, [r921+4096]; +ld.shared.u32 r692, [r921+8192]; +ld.shared.u32 r704, [r921+12288]; +{ +add.f16x2 r687, r688, r689; +} +{ +add.f16x2 r690, r691, r692; +} +{ +sub.f16x2 r693, r688, r689; +} +{ +sub.f16x2 r696, r691, r692; +} +{ +add.f16x2 r699, r700, r701; +} +{ +add.f16x2 r702, r703, r704; +} +{ +sub.f16x2 r705, r700, r701; +} +{ +sub.f16x2 r708, r703, r704; +} +{ +neg.f16x2 r711, r708; +} +{ +add.f16x2 r713, r687, r699; +} +{ +add.f16x2 r716, r690, r702; +} +{ +sub.f16x2 r719, r687, r699; +} +{ +sub.f16x2 r722, r690, r702; +} +{ +add.f16x2 r725, r693, r711; +} +{ +add.f16x2 r728, r696, r705; +} +{ +sub.f16x2 r731, r693, r711; +} +{ +sub.f16x2 r734, r696, r705; +} +and.b32 r922, r9, 768; +bfe.u32 r923, r9, 8, 2; +and.b32 r924, r901, 1020; +add.s32 r925, r11, r924; +cvt.rn.f32.u32 f69, r923; +mul.f32 f70, f69, 0f3EC90FDB; +cos.approx.f32 f52, f70; +sin.approx.f32 f71, f70; +neg.f32 f53, f71; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f53; +mov.b32 r737, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r740, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r742, {high, high}; +} +{ +mul.f16x2 r744, r728, r742; +} +{ +fma.rn.f16x2 r747, r725, r740, r744; +} +{ +mul.f16x2 r751, r725, r742; +} +{ +neg.f16x2 r754, r751; +} +{ +fma.rn.f16x2 r756, r728, r740, r754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r762, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r764, {low, high}; +} +{ +mul.f16x2 r765, r762, r764; +} +{ +mul.f16x2 r768, 
r737, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r771, {high, low}; +} +{ +fma.rn.f16x2 r773, r765, r771, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r777, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r779, {high, high}; +} +{ +mul.f16x2 r781, r722, r779; +} +{ +fma.rn.f16x2 r784, r719, r777, r781; +} +{ +mul.f16x2 r788, r719, r779; +} +{ +neg.f16x2 r791, r788; +} +{ +fma.rn.f16x2 r793, r722, r777, r791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r799, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f15; +cvt.rn.f16.f32 high, f16; +mov.b32 r801, {low, high}; +} +{ +mul.f16x2 r802, r799, r801; +} +{ +mul.f16x2 r805, r773, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r773; +mov.b32 r808, {high, low}; +} +{ +fma.rn.f16x2 r810, r802, r808, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r810; +mov.b32 r814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r810; +mov.b32 r816, {high, high}; +} +{ +mul.f16x2 r818, r734, r816; +} +{ +fma.rn.f16x2 r821, r731, r814, r818; +} +{ +mul.f16x2 r825, r731, r816; +} +{ +neg.f16x2 r828, r825; +} +{ +fma.rn.f16x2 r830, r734, r814, r828; +} +barrier.sync 0; +and.b32 r926, r239, 12288; +add.s32 r927, r925, r926; +st.shared.u32 [r927], r713; +st.shared.u32 [r927+1024], r747; +st.shared.u32 [r927+2048], r784; +st.shared.u32 [r927+3072], r821; +barrier.sync 0; +mad.lo.s32 r928, r922, -12, r927; +ld.shared.u32 r852, [r928]; +ld.shared.u32 r864, [r928+4096]; +ld.shared.u32 r853, [r928+8192]; +ld.shared.u32 r865, [r928+12288]; +barrier.sync 0; +st.shared.u32 [r927], r716; +st.shared.u32 [r927+1024], r756; +st.shared.u32 [r927+2048], r793; +st.shared.u32 [r927+3072], r830; +barrier.sync 0; +ld.shared.u32 r855, [r928]; +ld.shared.u32 r867, [r928+4096]; +ld.shared.u32 r856, [r928+8192]; +ld.shared.u32 r868, 
[r928+12288]; +{ +add.f16x2 r851, r852, r853; +} +{ +add.f16x2 r854, r855, r856; +} +{ +sub.f16x2 r857, r852, r853; +} +{ +sub.f16x2 r860, r855, r856; +} +{ +add.f16x2 r863, r864, r865; +} +{ +add.f16x2 r866, r867, r868; +} +{ +sub.f16x2 r869, r864, r865; +} +{ +sub.f16x2 r872, r867, r868; +} +{ +neg.f16x2 r875, r872; +} +{ +add.f16x2 %0, r851, r863; +} +{ +add.f16x2 %1, r854, r866; +} +{ +sub.f16x2 %4, r851, r863; +} +{ +sub.f16x2 %5, r854, r866; +} +{ +add.f16x2 %2, r857, r875; +} +{ +add.f16x2 %3, r860, r869; +} +{ +sub.f16x2 %6, r857, r875; +} +{ +sub.f16x2 %7, r860, r869; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..f96eff94f3499 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_fwd.hpp.inc @@ -0,0 +1,7582 @@ +#ifndef CUFFTDX_FFT_4096_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_4096_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<101, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<952>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, 
r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; 
+add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; 
+add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, 
f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, 
f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16320; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+1024]; +ld.shared.f32 f391, [r13+2048]; +ld.shared.f32 f392, [r13+3072]; +ld.shared.f32 f393, [r13+4096]; +ld.shared.f32 f394, [r13+5120]; +ld.shared.f32 f395, [r13+6144]; +ld.shared.f32 f396, [r13+7168]; +ld.shared.f32 f397, [r13+8192]; +ld.shared.f32 f398, [r13+9216]; +ld.shared.f32 f399, [r13+10240]; +ld.shared.f32 f400, 
[r13+11264]; +ld.shared.f32 f401, [r13+12288]; +ld.shared.f32 f402, [r13+13312]; +ld.shared.f32 f403, [r13+14336]; +ld.shared.f32 f404, [r13+15360]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+1024]; +ld.shared.f32 f407, [r13+2048]; +ld.shared.f32 f408, [r13+3072]; +ld.shared.f32 f409, [r13+4096]; +ld.shared.f32 f410, [r13+5120]; +ld.shared.f32 f411, [r13+6144]; +ld.shared.f32 f412, [r13+7168]; +ld.shared.f32 f413, [r13+8192]; +ld.shared.f32 f414, [r13+9216]; +ld.shared.f32 f415, [r13+10240]; +ld.shared.f32 f416, [r13+11264]; +ld.shared.f32 f417, [r13+12288]; +ld.shared.f32 f418, [r13+13312]; +ld.shared.f32 f419, [r13+14336]; +ld.shared.f32 f420, [r13+15360]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 
f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 
f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; +add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; +add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 240; +bfe.u32 r15, r5, 4, 4; +mul.wide.u32 rd6, r15, 8; 
+mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, 
f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; +mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; +mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, 
f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 15360; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+1024]; +ld.shared.f32 f747, [r21+2048]; +ld.shared.f32 f748, [r21+3072]; +ld.shared.f32 f749, [r21+4096]; +ld.shared.f32 f750, [r21+5120]; +ld.shared.f32 f751, [r21+6144]; +ld.shared.f32 f752, [r21+7168]; +ld.shared.f32 f753, [r21+8192]; +ld.shared.f32 f754, [r21+9216]; +ld.shared.f32 f755, [r21+10240]; +ld.shared.f32 f756, [r21+11264]; +ld.shared.f32 f757, [r21+12288]; +ld.shared.f32 f758, [r21+13312]; +ld.shared.f32 f759, [r21+14336]; +ld.shared.f32 f760, [r21+15360]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 
[r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+1024]; +ld.shared.f32 f763, [r21+2048]; +ld.shared.f32 f764, [r21+3072]; +ld.shared.f32 f765, [r21+4096]; +ld.shared.f32 f766, [r21+5120]; +ld.shared.f32 f767, [r21+6144]; +ld.shared.f32 f768, [r21+7168]; +ld.shared.f32 f769, [r21+8192]; +ld.shared.f32 f770, [r21+9216]; +ld.shared.f32 f771, [r21+10240]; +ld.shared.f32 f772, [r21+11264]; +ld.shared.f32 f773, [r21+12288]; +ld.shared.f32 f774, [r21+13312]; +ld.shared.f32 f775, [r21+14336]; +ld.shared.f32 f776, [r21+15360]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +add.f32 f789, f779, f784; +sub.f32 f790, f780, f783; +sub.f32 f791, f779, f784; +add.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +add.f32 f805, f795, f800; +sub.f32 f806, f796, f799; +sub.f32 f807, f795, f800; +add.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0fBF3504F3; +sub.f32 f811, f809, f810; +mul.f32 f812, f806, 0f3F3504F3; +fma.rn.f32 f813, f805, 0fBF3504F3, f812; +mul.f32 f814, f807, 0fBF3504F3; +mul.f32 f815, f808, 0fBF3504F3; +sub.f32 f816, f814, f815; +add.f32 f817, f814, f815; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; +add.f32 f823, f790, f813; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f813; +add.f32 f826, f787, f804; +sub.f32 f827, f788, f803; +sub.f32 f828, 
f787, f804; +add.f32 f829, f788, f803; +add.f32 f830, f791, f816; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f816; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; +add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +add.f32 f846, f836, f841; +sub.f32 f847, f837, f840; +sub.f32 f848, f836, f841; +add.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +add.f32 f862, f852, f857; +sub.f32 f863, f853, f856; +sub.f32 f864, f852, f857; +add.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0fBF3504F3; +sub.f32 f868, f866, f867; +mul.f32 f869, f863, 0f3F3504F3; +fma.rn.f32 f870, f862, 0fBF3504F3, f869; +mul.f32 f871, f864, 0fBF3504F3; +mul.f32 f872, f865, 0fBF3504F3; +sub.f32 f873, f871, f872; +add.f32 f874, f871, f872; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, f846, f868; +add.f32 f880, f847, f870; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f870; +add.f32 f883, f844, f861; +sub.f32 f884, f845, f860; +sub.f32 f885, f844, f861; +add.f32 f886, f845, f860; +add.f32 f887, f848, f873; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f873; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0fBEC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 0f3F6C835E; +fma.rn.f32 f895, f879, 0fBEC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0fBF3504F3; +sub.f32 f898, f896, f897; 
+mul.f32 f899, f884, 0f3F3504F3; +fma.rn.f32 f900, f883, 0fBF3504F3, f899; +mul.f32 f901, f887, 0f3EC3EF15; +mul.f32 f902, f888, 0fBF6C835E; +sub.f32 f903, f901, f902; +mul.f32 f904, f888, 0f3EC3EF15; +fma.rn.f32 f905, f887, 0fBF6C835E, f904; +mul.f32 f906, f881, 0fBEC3EF15; +mul.f32 f907, f882, 0fBF6C835E; +sub.f32 f908, f906, f907; +mul.f32 f909, f882, 0fBEC3EF15; +fma.rn.f32 f910, f881, 0fBF6C835E, f909; +mul.f32 f911, f885, 0fBF3504F3; +mul.f32 f912, f886, 0fBF3504F3; +sub.f32 f913, f911, f912; +add.f32 f914, f911, f912; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0fBEC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0fBEC3EF15, f918; +add.f32 %0, f818, f875; +add.f32 %1, f819, f876; +add.f32 %3, f823, f895; +add.f32 %2, f822, f893; +add.f32 %5, f827, f900; +add.f32 %4, f826, f898; +add.f32 %7, f831, f905; +add.f32 %6, f830, f903; +sub.f32 %9, f821, f877; +add.f32 %8, f820, f878; +add.f32 %11, f825, f910; +add.f32 %10, f824, f908; +add.f32 %13, f829, f914; +add.f32 %12, f828, f913; +add.f32 %15, f833, f919; +add.f32 %14, f832, f917; +sub.f32 %16, f818, f875; +sub.f32 %17, f819, f876; +sub.f32 %19, f823, f895; +sub.f32 %18, f822, f893; +sub.f32 %21, f827, f900; +sub.f32 %20, f826, f898; +sub.f32 %23, f831, f905; +sub.f32 %22, f830, f903; +add.f32 %25, f821, f877; +sub.f32 %24, f820, f878; +sub.f32 %27, f825, f910; +sub.f32 %26, f824, f908; +sub.f32 %29, f829, f914; +sub.f32 %28, f828, f913; +sub.f32 %31, f833, f919; +sub.f32 %30, f832, f917; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), 
"=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<103, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<516>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, 
f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, 
f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16352; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+2048]; +ld.shared.f32 f161, [r13+4096]; +ld.shared.f32 f162, [r13+6144]; +ld.shared.f32 f163, [r13+8192]; +ld.shared.f32 f164, [r13+10240]; +ld.shared.f32 f165, [r13+12288]; +ld.shared.f32 f166, [r13+14336]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+2048]; +ld.shared.f32 f169, [r13+4096]; +ld.shared.f32 f170, [r13+6144]; +ld.shared.f32 f171, [r13+8192]; +ld.shared.f32 f172, [r13+10240]; +ld.shared.f32 f173, [r13+12288]; +ld.shared.f32 f174, [r13+14336]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, 
f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 504; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, 
f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 16128; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+2048]; +ld.shared.f32 f303, [r20+4096]; +ld.shared.f32 f304, [r20+6144]; +ld.shared.f32 f305, [r20+8192]; +ld.shared.f32 f306, [r20+10240]; +ld.shared.f32 f307, [r20+12288]; +ld.shared.f32 f308, [r20+14336]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], 
f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+2048]; +ld.shared.f32 f311, [r20+4096]; +ld.shared.f32 f312, [r20+6144]; +ld.shared.f32 f313, [r20+8192]; +ld.shared.f32 f314, [r20+10240]; +ld.shared.f32 f315, [r20+12288]; +ld.shared.f32 f316, [r20+14336]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +add.f32 f329, f319, f324; +sub.f32 f330, f320, f323; +sub.f32 f331, f319, f324; +add.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +add.f32 f345, f335, f340; +sub.f32 f346, f336, f339; +sub.f32 f347, f335, f340; +add.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0fBF3504F3; +sub.f32 f351, f349, f350; +mul.f32 f352, f346, 0f3F3504F3; +fma.rn.f32 f353, f345, 0fBF3504F3, f352; +mul.f32 f354, f347, 0fBF3504F3; +mul.f32 f355, f348, 0fBF3504F3; +sub.f32 f356, f354, f355; +add.f32 f357, f354, f355; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f353; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f353; +add.f32 f366, f327, f344; +sub.f32 f367, f328, f343; +sub.f32 f368, f327, f344; +add.f32 f369, f328, f343; +add.f32 f370, f331, f356; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f356; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 448; +bfe.u32 r22, r5, 6, 3; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; 
+add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f374, f362; +mul.f32 f379, f375, f363; +sub.f32 f380, f378, f379; +mul.f32 f381, f374, f363; +fma.rn.f32 f382, f375, f362, f381; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f385, f366; +mul.f32 f389, f387, f367; +sub.f32 f390, f388, f389; +mul.f32 f391, f385, f367; +fma.rn.f32 f392, f387, f366, f391; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f395, f370; +mul.f32 f399, f397, f371; +sub.f32 f400, f398, f399; +mul.f32 f401, f395, f371; +fma.rn.f32 f402, f397, f370, f401; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f405, f360; +mul.f32 f409, f407, f361; +sub.f32 f410, f408, f409; +mul.f32 f411, f405, f361; +fma.rn.f32 f412, f407, f360, f411; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f415, f364; +mul.f32 f419, f417, f365; +sub.f32 f420, f418, f419; +mul.f32 f421, f415, f365; +fma.rn.f32 f422, f417, f364, f421; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f425, f368; +mul.f32 f429, f427, f369; +sub.f32 f430, f428, f429; +mul.f32 f431, f425, f369; +fma.rn.f32 f432, f427, f368, f431; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f435, f372; +mul.f32 f439, f437, f373; +sub.f32 f440, f438, f439; +mul.f32 f441, f435, f373; +fma.rn.f32 f442, f437, f372, f441; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 
0; +and.b32 r25, r8, 14336; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f380; +st.shared.f32 [r26+512], f390; +st.shared.f32 [r26+768], f400; +st.shared.f32 [r26+1024], f410; +st.shared.f32 [r26+1280], f420; +st.shared.f32 [r26+1536], f430; +st.shared.f32 [r26+1792], f440; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+2048]; +ld.shared.f32 f445, [r27+4096]; +ld.shared.f32 f446, [r27+6144]; +ld.shared.f32 f447, [r27+8192]; +ld.shared.f32 f448, [r27+10240]; +ld.shared.f32 f449, [r27+12288]; +ld.shared.f32 f450, [r27+14336]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+2048]; +ld.shared.f32 f453, [r27+4096]; +ld.shared.f32 f454, [r27+6144]; +ld.shared.f32 f455, [r27+8192]; +ld.shared.f32 f456, [r27+10240]; +ld.shared.f32 f457, [r27+12288]; +ld.shared.f32 f458, [r27+14336]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; +sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f459, f463; +add.f32 f468, f460, f464; +sub.f32 f469, f459, f463; +sub.f32 f470, f460, f464; +add.f32 f471, f461, f466; +sub.f32 f472, f462, f465; +sub.f32 f473, f461, f466; +add.f32 f474, f462, f465; +add.f32 f475, f444, f448; +add.f32 f476, f452, f456; +sub.f32 f477, f444, f448; +sub.f32 f478, f452, f456; +add.f32 f479, f446, f450; +add.f32 f480, f454, f458; +sub.f32 f481, f446, f450; +sub.f32 f482, f454, f458; +add.f32 f483, f475, f479; +add.f32 f484, f476, f480; +sub.f32 f485, f475, f479; +sub.f32 f486, f476, f480; +add.f32 f487, f477, f482; +sub.f32 f488, f478, f481; +sub.f32 f489, 
f477, f482; +add.f32 f490, f478, f481; +mul.f32 f491, f487, 0f3F3504F3; +mul.f32 f492, f488, 0fBF3504F3; +sub.f32 f493, f491, f492; +mul.f32 f494, f488, 0f3F3504F3; +fma.rn.f32 f495, f487, 0fBF3504F3, f494; +mul.f32 f496, f489, 0fBF3504F3; +mul.f32 f497, f490, 0fBF3504F3; +sub.f32 f498, f496, f497; +add.f32 f499, f496, f497; +add.f32 %0, f467, f483; +add.f32 %1, f468, f484; +add.f32 %3, f472, f495; +add.f32 %2, f471, f493; +sub.f32 %5, f470, f485; +add.f32 %4, f469, f486; +add.f32 %7, f474, f499; +add.f32 %6, f473, f498; +sub.f32 %8, f467, f483; +sub.f32 %9, f468, f484; +sub.f32 %11, f472, f495; +sub.f32 %10, f471, f493; +add.f32 %13, f470, f485; +sub.f32 %12, f469, f486; +sub.f32 %15, f474, f499; +sub.f32 %14, f473, f498; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<104, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1232>; +.reg .b32 r<35>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1223, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; +add.f32 f1221, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1220, f1223, f1221; +sub.f32 f76, 
f1223, f1221; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1219, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1216, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1214, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1213, f1216, f1214; +sub.f32 f92, f1216, f1214; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1212, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1212, 0fBF3504F3; +mul.f32 f1211, f93, 0f3F3504F3; +sub.f32 f99, f1211, f98; +mul.f32 f100, f1212, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1210, f1220, f1213; +sub.f32 f109, f1220, f1213; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1209, f1219, f101; +sub.f32 f113, f1219, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f1208, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f1207, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1205, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1202, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1201, f1205, f1202; +sub.f32 f133, f1205, f1202; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1200, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1198, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1196, %76, %66; +sub.f32 f145, %76, %66; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1195, f1198, f1196; +sub.f32 f149, f1198, f1196; +add.f32 f150, f140, f145; +sub.f32 f152, 
f140, f145; +sub.f32 f1194, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1194, 0fBF3504F3; +mul.f32 f1193, f150, 0f3F3504F3; +sub.f32 f156, f1193, f155; +mul.f32 f157, f1194, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1192, f1201, f1195; +sub.f32 f166, f1201, f1195; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1191, f1200, f158; +sub.f32 f170, f1200, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1190, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1189, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1187, f167, 0f3F6C835E; +mul.f32 f1188, f1191, 0fBEC3EF15; +sub.f32 f181, f1187, f1188; +mul.f32 f182, f1191, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1185, f171, 0f3F3504F3; +mul.f32 f1186, f1190, 0fBF3504F3; +sub.f32 f186, f1185, f1186; +mul.f32 f187, f1190, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1183, f175, 0f3EC3EF15; +mul.f32 f1184, f1189, 0fBF6C835E; +sub.f32 f191, f1183, f1184; +mul.f32 f192, f1189, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1181, f169, 0fBEC3EF15; +mul.f32 f1182, f170, 0fBF6C835E; +sub.f32 f196, f1181, f1182; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1179, f177, 0fBF6C835E; +mul.f32 f1180, f178, 0fBEC3EF15; +sub.f32 f205, f1179, f1180; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1178, f1209, f183; +sub.f32 f213, f1209, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1177, f1208, f188; +sub.f32 f217, 
f1208, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f1176, f1207, f193; +sub.f32 f221, f1207, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f1175, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f1174, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f1173, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1172, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1178; +mul.f32 f244, f238, f1178; +mul.f32 f246, f239, f239; +mul.f32 f1171, f238, f238; +sub.f32 f247, f1171, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1177; +mul.f32 f252, f247, f1177; +mul.f32 f1169, f238, f247; +mul.f32 f1170, f239, f249; +sub.f32 f255, f1169, f1170; +mul.f32 f1168, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f1176; +mul.f32 f260, f255, f1176; +mul.f32 f262, f239, f257; +mul.f32 f1167, f238, f255; +sub.f32 f263, f1167, f262; +mul.f32 f1166, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f1175; +mul.f32 f268, f263, f1175; +mul.f32 f270, f239, f265; +mul.f32 f1165, f238, f263; +sub.f32 f271, f1165, f270; +mul.f32 f1164, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f1174; +mul.f32 f276, f271, f1174; +mul.f32 f1162, f238, f271; +mul.f32 f1163, f239, f273; +sub.f32 f279, f1162, f1163; +mul.f32 f1161, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f1173; +mul.f32 f284, f279, f1173; +mul.f32 f286, 
f239, f281; +mul.f32 f1160, f238, f279; +sub.f32 f287, f1160, f286; +mul.f32 f1159, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f1172; +mul.f32 f292, f287, f1172; +mul.f32 f294, f239, f289; +mul.f32 f1158, f238, f287; +sub.f32 f295, f1158, f294; +mul.f32 f1157, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1156, f1210, f1192; +mul.f32 f299, f297, f1156; +mul.f32 f300, f295, f1156; +mul.f32 f1154, f238, f295; +mul.f32 f1155, f239, f297; +sub.f32 f303, f1154, f1155; +sub.f32 f1153, f106, f163; +mul.f32 f1152, f295, f1153; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1151, f238, f303; +sub.f32 f311, f1151, f310; +mul.f32 f1150, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f1148, f238, f311; +mul.f32 f1149, f239, f313; +sub.f32 f319, f1148, f1149; +mul.f32 f1147, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1146, f238, f319; +sub.f32 f327, f1146, f326; +mul.f32 f1145, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1144, f238, f327; +sub.f32 f335, f1144, f334; +mul.f32 f1143, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f1141, f238, f335; +mul.f32 f1142, f239, f337; +sub.f32 f343, f1141, f1142; +mul.f32 f1140, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1139, f238, f343; +sub.f32 f351, f1139, f350; +mul.f32 f1138, f238, f210; +mul.f32 
f352, f238, f345; +mul.f32 f1137, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1229, f1210, f1192; +mul.f32 f1228, f297, f1229; +barrier.sync 0; +and.b32 r11, r7, 32640; +add.s32 r12, r9, r11; +add.f32 f357, f1210, f1192; +sub.f32 f1226, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f1138, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f1168, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f1166, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f1164, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f1161, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f1159, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f1157, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1226, f300; +sub.f32 f374, f1152, f1228; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f1150, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f1147, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f1145, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f1143, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f1140, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f1137, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r21, r34, 255; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+2048]; +ld.shared.v2.f32 {f397, f398}, [r13+4096]; 
+ld.shared.v2.f32 {f401, f402}, [r13+6144]; +ld.shared.v2.f32 {f405, f406}, [r13+8192]; +ld.shared.v2.f32 {f409, f410}, [r13+10240]; +ld.shared.v2.f32 {f413, f414}, [r13+12288]; +ld.shared.v2.f32 {f417, f418}, [r13+14336]; +ld.shared.v2.f32 {f421, f422}, [r13+16384]; +ld.shared.v2.f32 {f425, f426}, [r13+18432]; +ld.shared.v2.f32 {f429, f430}, [r13+20480]; +ld.shared.v2.f32 {f433, f434}, [r13+22528]; +ld.shared.v2.f32 {f437, f438}, [r13+24576]; +ld.shared.v2.f32 {f441, f442}, [r13+26624]; +ld.shared.v2.f32 {f445, f446}, [r13+28672]; +ld.shared.v2.f32 {f449, f450}, [r13+30720]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1136, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1135, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1134, f1136, f1135; +sub.f32 f464, f1136, f1135; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f1133, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1132, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1131, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1130, f1132, f1131; +sub.f32 f480, f1132, f1131; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f1129, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f1127, f481, 0f3F3504F3; +mul.f32 f1128, f1129, 0fBF3504F3; +sub.f32 f487, f1127, f1128; +mul.f32 f488, f1129, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1126, f1134, f1130; +sub.f32 f497, f1134, f1130; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1125, f1133, f489; +sub.f32 f501, f1133, f489; +add.f32 f502, 
f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f1124, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f1123, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1122, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1121, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1120, f1122, f1121; +sub.f32 f521, f1122, f1121; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f1119, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1118, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1117, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1116, f1118, f1117; +sub.f32 f537, f1118, f1117; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f1115, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f1113, f538, 0f3F3504F3; +mul.f32 f1114, f1115, 0fBF3504F3; +sub.f32 f544, f1113, f1114; +mul.f32 f545, f1115, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1112, f1120, f1116; +sub.f32 f554, f1120, f1116; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1111, f1119, f546; +sub.f32 f558, f1119, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f1110, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f1109, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1111, 0fBEC3EF15; +mul.f32 f1108, f555, 0f3F6C835E; +sub.f32 f569, f1108, f568; +mul.f32 f570, f1111, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 
f573, f1110, 0fBF3504F3; +mul.f32 f1107, f559, 0f3F3504F3; +sub.f32 f574, f1107, f573; +mul.f32 f575, f1110, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f1105, f563, 0f3EC3EF15; +mul.f32 f1106, f1109, 0fBF6C835E; +sub.f32 f579, f1105, f1106; +mul.f32 f580, f1109, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f1103, f557, 0fBEC3EF15; +mul.f32 f1104, f558, 0fBF6C835E; +sub.f32 f584, f1103, f1104; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f1102, f565, 0fBF6C835E; +sub.f32 f593, f1102, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1101, f1125, f571; +sub.f32 f601, f1125, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1100, f1124, f576; +sub.f32 f605, f1124, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f1099, f1123, f581; +sub.f32 f609, f1123, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, f554; +sub.f32 f1098, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 f616, f500, f584; +add.f32 f1097, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f1096, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1095, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 240; +bfe.u32 r15, r34, 4, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f1101; +mul.f32 f632, f626, f1101; +mul.f32 f634, f627, f627; +mul.f32 f1094, f626, f626; +sub.f32 f635, f1094, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f639, f637, f1100; +mul.f32 f640, f635, f1100; +mul.f32 f1092, 
f626, f635; +mul.f32 f1093, f627, f637; +sub.f32 f643, f1092, f1093; +mul.f32 f1091, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f1099; +mul.f32 f648, f643, f1099; +mul.f32 f650, f627, f645; +mul.f32 f1090, f626, f643; +sub.f32 f651, f1090, f650; +mul.f32 f1089, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f1098; +mul.f32 f656, f651, f1098; +mul.f32 f658, f627, f653; +mul.f32 f1088, f626, f651; +sub.f32 f659, f1088, f658; +mul.f32 f1087, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f1097; +mul.f32 f664, f659, f1097; +mul.f32 f1085, f626, f659; +mul.f32 f1086, f627, f661; +sub.f32 f667, f1085, f1086; +mul.f32 f1084, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f1096; +mul.f32 f672, f667, f1096; +mul.f32 f674, f627, f669; +mul.f32 f1083, f626, f667; +sub.f32 f675, f1083, f674; +mul.f32 f1082, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f1095; +mul.f32 f680, f675, f1095; +mul.f32 f682, f627, f677; +mul.f32 f1081, f626, f675; +sub.f32 f683, f1081, f682; +mul.f32 f1080, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1079, f1126, f1112; +mul.f32 f687, f685, f1079; +mul.f32 f688, f683, f1079; +mul.f32 f1077, f626, f683; +mul.f32 f1078, f627, f685; +sub.f32 f691, f1077, f1078; +sub.f32 f1076, f494, f551; +mul.f32 f1075, f683, f1076; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1074, f626, f691; +sub.f32 f699, f1074, f698; +mul.f32 f1073, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f1071, f626, f699; +mul.f32 f1072, f627, f701; +sub.f32 f707, f1071, f1072; +mul.f32 f1070, f699, f604; 
+mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1069, f626, f707; +sub.f32 f715, f1069, f714; +mul.f32 f1068, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1067, f626, f715; +sub.f32 f723, f1067, f722; +mul.f32 f1066, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f1064, f626, f723; +mul.f32 f1065, f627, f725; +sub.f32 f731, f1064, f1065; +mul.f32 f1063, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1062, f626, f731; +sub.f32 f739, f1062, f738; +mul.f32 f1061, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f1060, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 3; +and.b32 r16, r23, 120; +add.s32 r17, r9, r16; +mov.u32 r26, %tid.x; +shl.b32 r25, r26, 7; +barrier.sync 0; +and.b32 r18, r25, 30720; +add.s32 r19, r17, r18; +mov.u32 r28, %tid.x; +and.b32 r27, r28, 240; +add.f32 f745, f1126, f1112; +sub.f32 f1227, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 240; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f1061, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f1091, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f1089, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f1087, f655; +st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f1084, f663; +fma.rn.f32 f756, f661, f614, f664; 
+st.shared.v2.f32 [r19+640], {f755, f756}; +fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f1082, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f1080, f679; +st.shared.v2.f32 [r19+896], {f760, f759}; +fma.rn.f32 f761, f685, f1227, f688; +sub.f32 f762, f1075, f687; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f1073, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f1070, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f1068, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f1066, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f1063, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f1060, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +mad.lo.s32 r20, r30, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+2048]; +ld.shared.v2.f32 {f785, f786}, [r20+4096]; +ld.shared.v2.f32 {f789, f790}, [r20+6144]; +ld.shared.v2.f32 {f793, f794}, [r20+8192]; +ld.shared.v2.f32 {f797, f798}, [r20+10240]; +ld.shared.v2.f32 {f801, f802}, [r20+12288]; +ld.shared.v2.f32 {f805, f806}, [r20+14336]; +ld.shared.v2.f32 {f809, f810}, [r20+16384]; +ld.shared.v2.f32 {f813, f814}, [r20+18432]; +ld.shared.v2.f32 {f817, f818}, [r20+20480]; +ld.shared.v2.f32 {f821, f822}, [r20+22528]; +ld.shared.v2.f32 {f825, f826}, [r20+24576]; +ld.shared.v2.f32 {f829, f830}, [r20+26624]; +ld.shared.v2.f32 {f833, f834}, [r20+28672]; +ld.shared.v2.f32 {f837, f838}, [r20+30720]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f1059, f778, f810; +sub.f32 f844, f778, f810; +add.f32 
f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1058, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1057, f1059, f1058; +sub.f32 f852, f1059, f1058; +add.f32 f853, f843, f848; +sub.f32 f855, f843, f848; +sub.f32 f1056, f844, f847; +add.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1055, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1054, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1053, f1055, f1054; +sub.f32 f868, f1055, f1054; +add.f32 f869, f859, f864; +sub.f32 f871, f859, f864; +sub.f32 f1052, f860, f863; +add.f32 f872, f860, f863; +mul.f32 f1050, f869, 0f3F3504F3; +mul.f32 f1051, f1052, 0fBF3504F3; +sub.f32 f875, f1050, f1051; +mul.f32 f876, f1052, 0f3F3504F3; +fma.rn.f32 f877, f869, 0fBF3504F3, f876; +mul.f32 f878, f871, 0fBF3504F3; +mul.f32 f879, f872, 0fBF3504F3; +sub.f32 f880, f878, f879; +add.f32 f881, f878, f879; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1049, f1057, f1053; +sub.f32 f885, f1057, f1053; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1048, f1056, f877; +sub.f32 f889, f1056, f877; +add.f32 f890, f851, f868; +sub.f32 f892, f851, f868; +sub.f32 f1047, f852, f867; +add.f32 f893, f852, f867; +add.f32 f894, f855, f880; +sub.f32 f896, f855, f880; +add.f32 f1046, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1045, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1044, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1043, f1045, f1044; +sub.f32 f909, f1045, f1044; +add.f32 f910, f900, f905; +sub.f32 f912, f900, f905; +sub.f32 f1042, f901, f904; +add.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; 
+add.f32 f1041, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1040, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; +add.f32 f1039, f1041, f1040; +sub.f32 f925, f1041, f1040; +add.f32 f926, f916, f921; +sub.f32 f928, f916, f921; +sub.f32 f1038, f917, f920; +add.f32 f929, f917, f920; +mul.f32 f1036, f926, 0f3F3504F3; +mul.f32 f1037, f1038, 0fBF3504F3; +sub.f32 f932, f1036, f1037; +mul.f32 f933, f1038, 0f3F3504F3; +fma.rn.f32 f934, f926, 0fBF3504F3, f933; +mul.f32 f935, f928, 0fBF3504F3; +mul.f32 f936, f929, 0fBF3504F3; +sub.f32 f937, f935, f936; +add.f32 f938, f935, f936; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1035, f1043, f1039; +sub.f32 f942, f1043, f1039; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1034, f1042, f934; +sub.f32 f946, f1042, f934; +add.f32 f947, f908, f925; +sub.f32 f949, f908, f925; +sub.f32 f1033, f909, f924; +add.f32 f950, f909, f924; +add.f32 f951, f912, f937; +sub.f32 f953, f912, f937; +add.f32 f1032, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1034, 0fBEC3EF15; +mul.f32 f1031, f943, 0f3F6C835E; +sub.f32 f957, f1031, f956; +mul.f32 f958, f1034, 0f3F6C835E; +fma.rn.f32 f959, f943, 0fBEC3EF15, f958; +mul.f32 f961, f1033, 0fBF3504F3; +mul.f32 f1030, f947, 0f3F3504F3; +sub.f32 f962, f1030, f961; +mul.f32 f963, f1033, 0f3F3504F3; +fma.rn.f32 f964, f947, 0fBF3504F3, f963; +mul.f32 f966, f1032, 0fBF6C835E; +mul.f32 f1029, f951, 0f3EC3EF15; +sub.f32 f967, f1029, f966; +mul.f32 f968, f1032, 0f3EC3EF15; +fma.rn.f32 f969, f951, 0fBF6C835E, f968; +mul.f32 f1027, f945, 0fBEC3EF15; +mul.f32 f1028, f946, 0fBF6C835E; +sub.f32 f972, f1027, f1028; +mul.f32 f973, f946, 0fBEC3EF15; +fma.rn.f32 f974, f945, 0fBF6C835E, f973; +mul.f32 f975, f949, 0fBF3504F3; +mul.f32 f976, f950, 0fBF3504F3; +sub.f32 f977, f975, f976; +add.f32 f978, f975, f976; +mul.f32 f980, f954, 0fBEC3EF15; +mul.f32 f1026, f953, 0fBF6C835E; 
+sub.f32 f981, f1026, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0fBEC3EF15, f982; +add.f32 %0, f882, f939; +add.f32 %1, f1049, f1035; +add.f32 %2, f886, f957; +add.f32 %3, f1048, f959; +add.f32 %4, f890, f962; +add.f32 %5, f1047, f964; +add.f32 %6, f894, f967; +add.f32 %7, f1046, f969; +sub.f32 %9, f885, f941; +add.f32 %8, f884, f942; +add.f32 %11, f889, f974; +add.f32 %10, f888, f972; +add.f32 %12, f892, f977; +add.f32 %13, f893, f978; +add.f32 %14, f896, f981; +add.f32 %15, f897, f983; +sub.f32 %17, f1049, f1035; +sub.f32 %16, f882, f939; +sub.f32 %19, f1048, f959; +sub.f32 %18, f886, f957; +sub.f32 %21, f1047, f964; +sub.f32 %20, f890, f962; +sub.f32 %23, f1046, f969; +sub.f32 %22, f894, f967; +add.f32 %25, f885, f941; +sub.f32 %24, f884, f942; +sub.f32 %27, f889, f974; +sub.f32 %26, f888, f972; +sub.f32 %29, f893, f978; +sub.f32 %28, f892, f977; +sub.f32 %31, f897, f983; +sub.f32 %30, f896, f981; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), 
"f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<105, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2548>; +.reg .b32 r<46>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2539, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2537, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2536, f2539, f2537; +sub.f32 f140, f2539, f2537; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2535, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2532, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2530, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2529, f2532, f2530; +sub.f32 f156, f2532, f2530; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2528, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2528, 0fBF3504F3; +mul.f32 f2527, f157, 0f3F3504F3; +sub.f32 f163, f2527, f162; +mul.f32 f164, f2528, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2526, f2536, f2529; +sub.f32 f173, f2536, f2529; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2525, f2535, f165; +sub.f32 f177, f2535, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 
f2524, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2523, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2521, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2518, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2517, f2521, f2518; +sub.f32 f197, f2521, f2518; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2516, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2514, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2512, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2511, f2514, f2512; +sub.f32 f213, f2514, f2512; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2510, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2510, 0fBF3504F3; +mul.f32 f2509, f214, 0f3F3504F3; +sub.f32 f220, f2509, f219; +mul.f32 f221, f2510, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2508, f2517, f2511; +sub.f32 f230, f2517, f2511; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2507, f2516, f222; +sub.f32 f234, f2516, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2506, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2505, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2503, f231, 0f3F6C835E; +mul.f32 f2504, f2507, 0fBEC3EF15; +sub.f32 f245, f2503, f2504; +mul.f32 f246, f2507, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2501, f235, 0f3F3504F3; +mul.f32 f2502, f2506, 
0fBF3504F3; +sub.f32 f250, f2501, f2502; +mul.f32 f251, f2506, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2499, f239, 0f3EC3EF15; +mul.f32 f2500, f2505, 0fBF6C835E; +sub.f32 f255, f2499, f2500; +mul.f32 f256, f2505, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2497, f233, 0fBEC3EF15; +mul.f32 f2498, f234, 0fBF6C835E; +sub.f32 f260, f2497, f2498; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2495, f241, 0fBF6C835E; +mul.f32 f2496, f242, 0fBEC3EF15; +sub.f32 f269, f2495, f2496; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2494, f2526, f2508; +sub.f32 f275, f2526, f2508; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2493, f2525, f247; +sub.f32 f279, f2525, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2492, f2524, f252; +sub.f32 f283, f2524, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2491, f2523, f257; +sub.f32 f287, f2523, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2490, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2489, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2488, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2487, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2484, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2482, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2481, f2484, f2482; +sub.f32 f315, f2484, f2482; +add.f32 f316, f306, f311; +sub.f32 
f318, f306, f311; +sub.f32 f2480, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2478, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2475, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2474, f2478, f2475; +sub.f32 f331, f2478, f2475; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2473, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2471, f332, 0f3F3504F3; +mul.f32 f2472, f2473, 0fBF3504F3; +sub.f32 f338, f2471, f2472; +mul.f32 f339, f2473, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2470, f2481, f2474; +sub.f32 f348, f2481, f2474; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2469, f2480, f340; +sub.f32 f352, f2480, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2468, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2467, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2465, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2463, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2462, f2465, f2463; +sub.f32 f372, f2465, f2463; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2461, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2458, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2457, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2456, f2458, f2457; +sub.f32 f388, f2458, 
f2457; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2455, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2453, f389, 0f3F3504F3; +mul.f32 f2454, f2455, 0fBF3504F3; +sub.f32 f395, f2453, f2454; +mul.f32 f396, f2455, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2452, f2462, f2456; +sub.f32 f405, f2462, f2456; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2451, f2461, f397; +sub.f32 f409, f2461, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2450, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2449, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2451, 0fBEC3EF15; +mul.f32 f2448, f406, 0f3F6C835E; +sub.f32 f420, f2448, f419; +mul.f32 f421, f2451, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2450, 0fBF3504F3; +mul.f32 f2447, f410, 0f3F3504F3; +sub.f32 f425, f2447, f424; +mul.f32 f426, f2450, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2445, f414, 0f3EC3EF15; +mul.f32 f2446, f2449, 0fBF6C835E; +sub.f32 f430, f2445, f2446; +mul.f32 f431, f2449, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2443, f408, 0fBEC3EF15; +mul.f32 f2444, f409, 0fBF6C835E; +sub.f32 f435, f2443, f2444; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2442, f416, 0fBF6C835E; +sub.f32 f444, f2442, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2441, f2470, f2452; +sub.f32 f450, f2470, f2452; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; 
+add.f32 f2440, f2469, f422; +sub.f32 f454, f2469, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2439, f2468, f427; +sub.f32 f458, f2468, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2438, f2467, f432; +sub.f32 f462, f2467, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2437, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2436, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2435, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2434, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2440, 0fBE47C5C2; +mul.f32 f2433, f451, 0f3F7B14BE; +sub.f32 f481, f2433, f480; +mul.f32 f482, f2440, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2439, 0fBEC3EF15; +mul.f32 f2432, f455, 0f3F6C835E; +sub.f32 f486, f2432, f485; +mul.f32 f487, f2439, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2438, 0fBF0E39DA; +mul.f32 f2431, f459, 0f3F54DB31; +sub.f32 f491, f2431, f490; +mul.f32 f492, f2438, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2437, 0fBF3504F3; +mul.f32 f2430, f463, 0f3F3504F3; +sub.f32 f496, f2430, f495; +mul.f32 f497, f2437, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2428, f467, 0f3F0E39DA; +mul.f32 f2429, f2436, 0fBF54DB31; +sub.f32 f501, f2428, f2429; +mul.f32 f502, f2436, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2426, f471, 0f3EC3EF15; +mul.f32 f2427, f2435, 0fBF6C835E; +sub.f32 f506, f2426, f2427; +mul.f32 f507, f2435, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2424, f475, 0f3E47C5C2; +mul.f32 f2425, f2434, 0fBF7B14BE; +sub.f32 f511, f2424, f2425; +mul.f32 f512, f2434, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2422, f453, 0fBE47C5C2; +mul.f32 f2423, f454, 0fBF7B14BE; 
+sub.f32 f516, f2422, f2423; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2421, f457, 0fBEC3EF15; +sub.f32 f521, f2421, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2420, f461, 0fBF0E39DA; +sub.f32 f526, f2420, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2418, f469, 0fBF54DB31; +mul.f32 f2419, f470, 0fBF0E39DA; +sub.f32 f535, f2418, f2419; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2417, f473, 0fBF6C835E; +sub.f32 f540, f2417, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2416, f477, 0fBF7B14BE; +sub.f32 f545, f2416, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2415, f2493, f483; +sub.f32 f553, f2493, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2414, f2492, f488; +sub.f32 f557, f2492, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2413, f2491, f493; +sub.f32 f561, f2491, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2412, f2490, f498; +sub.f32 f565, f2490, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f2411, f2489, f503; +sub.f32 f569, f2489, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f2410, f2488, f508; +sub.f32 f573, f2488, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f2409, f2487, f513; +sub.f32 f577, f2487, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f2408, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 
f584, f278, f516; +add.f32 f2407, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f2406, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f2405, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f2404, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2403, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2402, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2401, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f2415; +mul.f32 f616, f610, f2415; +mul.f32 f618, f611, f611; +mul.f32 f2400, f610, f610; +sub.f32 f619, f2400, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f2414; +mul.f32 f624, f619, f2414; +mul.f32 f626, f611, f621; +mul.f32 f2399, f610, f619; +sub.f32 f627, f2399, f626; +mul.f32 f2398, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f2413; +mul.f32 f632, f627, f2413; +mul.f32 f2396, f610, f627; +mul.f32 f2397, f611, f629; +sub.f32 f635, f2396, f2397; +mul.f32 f2395, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f2412; +mul.f32 f640, f635, f2412; +mul.f32 f642, f611, f637; +mul.f32 f2394, f610, f635; +sub.f32 f643, f2394, f642; +mul.f32 f2393, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f2411; +mul.f32 f648, f643, f2411; +mul.f32 f2391, f610, f643; +mul.f32 f2392, f611, f645; +sub.f32 f651, f2391, 
f2392; +mul.f32 f2390, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f2410; +mul.f32 f656, f651, f2410; +mul.f32 f658, f611, f653; +mul.f32 f2389, f610, f651; +sub.f32 f659, f2389, f658; +mul.f32 f2388, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f2409; +mul.f32 f664, f659, f2409; +mul.f32 f666, f611, f661; +mul.f32 f2387, f610, f659; +sub.f32 f667, f2387, f666; +mul.f32 f2386, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f2408; +mul.f32 f672, f667, f2408; +mul.f32 f2384, f610, f667; +mul.f32 f2385, f611, f669; +sub.f32 f675, f2384, f2385; +mul.f32 f2383, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f2407; +mul.f32 f680, f675, f2407; +mul.f32 f682, f611, f677; +mul.f32 f2382, f610, f675; +sub.f32 f683, f2382, f682; +mul.f32 f2381, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f2406; +mul.f32 f688, f683, f2406; +mul.f32 f690, f611, f685; +mul.f32 f2380, f610, f683; +sub.f32 f691, f2380, f690; +mul.f32 f2379, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f2405; +mul.f32 f696, f691, f2405; +mul.f32 f2377, f610, f691; +mul.f32 f2378, f611, f693; +sub.f32 f699, f2377, f2378; +mul.f32 f2376, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f2404; +mul.f32 f704, f699, f2404; +mul.f32 f706, f611, f701; +mul.f32 f2375, f610, f699; +sub.f32 f707, f2375, f706; +mul.f32 f2374, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f2403; +mul.f32 f712, f707, f2403; +mul.f32 f2372, f610, f707; +mul.f32 f2373, f611, f709; +sub.f32 f715, f2372, f2373; +mul.f32 f2371, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f2402; +mul.f32 f720, f715, f2402; 
+mul.f32 f722, f611, f717; +mul.f32 f2370, f610, f715; +sub.f32 f723, f2370, f722; +mul.f32 f2369, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f2401; +mul.f32 f728, f723, f2401; +mul.f32 f730, f611, f725; +mul.f32 f2368, f610, f723; +sub.f32 f731, f2368, f730; +mul.f32 f2367, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2366, f2494, f2441; +mul.f32 f735, f733, f2366; +mul.f32 f736, f731, f2366; +mul.f32 f2364, f610, f731; +mul.f32 f2365, f611, f733; +sub.f32 f739, f2364, f2365; +sub.f32 f2363, f272, f447; +mul.f32 f2362, f731, f2363; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2361, f610, f739; +sub.f32 f747, f2361, f746; +mul.f32 f2360, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2359, f610, f747; +sub.f32 f755, f2359, f754; +mul.f32 f2358, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f2356, f610, f755; +mul.f32 f2357, f611, f757; +sub.f32 f763, f2356, f2357; +mul.f32 f2355, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2354, f610, f763; +sub.f32 f771, f2354, f770; +mul.f32 f2353, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f2351, f610, f771; +mul.f32 f2352, f611, f773; +sub.f32 f779, f2351, f2352; +mul.f32 f2350, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2349, f610, f779; +sub.f32 f787, f2349, f786; +mul.f32 f2348, f779, 
f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2347, f610, f787; +sub.f32 f795, f2347, f794; +mul.f32 f2346, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f2344, f610, f795; +mul.f32 f2345, f611, f797; +sub.f32 f803, f2344, f2345; +mul.f32 f2343, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2342, f610, f803; +sub.f32 f811, f2342, f810; +mul.f32 f2341, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2340, f610, f811; +sub.f32 f819, f2340, f818; +mul.f32 f2339, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f2337, f610, f819; +mul.f32 f2338, f611, f821; +sub.f32 f827, f2337, f2338; +mul.f32 f2336, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2335, f610, f827; +sub.f32 f835, f2335, f834; +mul.f32 f2334, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f2332, f610, f835; +mul.f32 f2333, f611, f837; +sub.f32 f843, f2332, f2333; +mul.f32 f2331, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2330, f610, f843; +sub.f32 f851, f2330, f850; +mul.f32 f2329, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f2328, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, 
f851, f609; +mov.u32 r34, %tid.x; +shl.b32 r33, r34, 8; +barrier.sync 0; +and.b32 r11, r33, 32512; +add.s32 r12, r9, r11; +add.f32 f857, f2494, f2441; +mov.u32 r30, %tid.x; +shl.b32 r29, r30, 3; +sub.f32 f2544, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r39, %tid.x; +shl.b32 r38, r39, 3; +mov.u32 r37, %tid.x; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f2329, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f2398, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f2395, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f2393, f639; +sub.f32 f867, f2390, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f2388, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f2386, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f2383, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f2381, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f2379, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f2376, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f2374, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f2371, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f2369, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f2367, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f2544, f736; +sub.f32 f890, f2362, f735; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f2360, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f2358, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 
f896, f2355, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f2353, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f2350, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f2348, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f2346, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f2343, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f2341, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f2339, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f2336, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f2334, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f2331, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f2328, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +and.b32 r21, r37, 127; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+1024]; +ld.shared.v2.f32 {f929, f930}, [r13+2048]; +ld.shared.v2.f32 {f933, f934}, [r13+3072]; +ld.shared.v2.f32 {f937, f938}, [r13+4096]; +ld.shared.v2.f32 {f941, f942}, [r13+5120]; +ld.shared.v2.f32 {f945, f946}, [r13+6144]; +ld.shared.v2.f32 {f949, f950}, [r13+7168]; +ld.shared.v2.f32 {f953, f954}, [r13+8192]; +ld.shared.v2.f32 {f957, f958}, [r13+9216]; +ld.shared.v2.f32 {f961, f962}, [r13+10240]; +ld.shared.v2.f32 {f965, f966}, [r13+11264]; +ld.shared.v2.f32 {f969, f970}, [r13+12288]; +ld.shared.v2.f32 {f973, f974}, [r13+13312]; +ld.shared.v2.f32 {f977, f978}, [r13+14336]; +ld.shared.v2.f32 {f981, f982}, [r13+15360]; +ld.shared.v2.f32 {f985, f986}, [r13+16384]; 
+ld.shared.v2.f32 {f989, f990}, [r13+17408]; +ld.shared.v2.f32 {f993, f994}, [r13+18432]; +ld.shared.v2.f32 {f997, f998}, [r13+19456]; +ld.shared.v2.f32 {f1001, f1002}, [r13+20480]; +ld.shared.v2.f32 {f1005, f1006}, [r13+21504]; +ld.shared.v2.f32 {f1009, f1010}, [r13+22528]; +ld.shared.v2.f32 {f1013, f1014}, [r13+23552]; +ld.shared.v2.f32 {f1017, f1018}, [r13+24576]; +ld.shared.v2.f32 {f1021, f1022}, [r13+25600]; +ld.shared.v2.f32 {f1025, f1026}, [r13+26624]; +ld.shared.v2.f32 {f1029, f1030}, [r13+27648]; +ld.shared.v2.f32 {f1033, f1034}, [r13+28672]; +ld.shared.v2.f32 {f1037, f1038}, [r13+29696]; +ld.shared.v2.f32 {f1041, f1042}, [r13+30720]; +ld.shared.v2.f32 {f1045, f1046}, [r13+31744]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2327, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2326, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2325, f2327, f2326; +sub.f32 f1060, f2327, f2326; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f2324, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2323, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2322, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2321, f2323, f2322; +sub.f32 f1076, f2323, f2322; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f2320, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f2320, 0fBF3504F3; +mul.f32 f2319, f1077, 0f3F3504F3; +sub.f32 f1083, f2319, f1082; +mul.f32 f1084, f2320, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, 
f1057, f1073; +add.f32 f2318, f2325, f2321; +sub.f32 f1093, f2325, f2321; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2317, f2324, f1085; +sub.f32 f1097, f2324, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f2316, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f2315, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2314, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2313, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2312, f2314, f2313; +sub.f32 f1117, f2314, f2313; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f2311, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2310, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2309, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2308, f2310, f2309; +sub.f32 f1133, f2310, f2309; +add.f32 f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f2307, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f2307, 0fBF3504F3; +mul.f32 f2306, f1134, 0f3F3504F3; +sub.f32 f1140, f2306, f1139; +mul.f32 f1141, f2307, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2305, f2312, f2308; +sub.f32 f1150, f2312, f2308; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2304, f2311, f1142; +sub.f32 f1154, f2311, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f2303, 
f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f2302, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2300, f1151, 0f3F6C835E; +mul.f32 f2301, f2304, 0fBEC3EF15; +sub.f32 f1165, f2300, f2301; +mul.f32 f1166, f2304, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f2298, f1155, 0f3F3504F3; +mul.f32 f2299, f2303, 0fBF3504F3; +sub.f32 f1170, f2298, f2299; +mul.f32 f1171, f2303, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f2302, 0fBF6C835E; +mul.f32 f2297, f1159, 0f3EC3EF15; +sub.f32 f1175, f2297, f1174; +mul.f32 f1176, f2302, 0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f2296, f1153, 0fBEC3EF15; +sub.f32 f1180, f2296, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f2294, f1161, 0fBF6C835E; +mul.f32 f2295, f1162, 0fBEC3EF15; +sub.f32 f1189, f2294, f2295; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2293, f2318, f2305; +sub.f32 f1195, f2318, f2305; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2292, f2317, f1167; +sub.f32 f1199, f2317, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2291, f2316, f1172; +sub.f32 f1203, f2316, f1172; +add.f32 f1204, f1102, f1175; +sub.f32 f1206, f1102, f1175; +add.f32 f2290, f2315, f1177; +sub.f32 f1207, f2315, f1177; +add.f32 f1208, f1092, f1150; +sub.f32 f1210, f1092, f1150; +sub.f32 f2289, f1093, f1149; +add.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1180; +sub.f32 f1214, f1096, f1180; +add.f32 f2288, f1097, f1182; +sub.f32 f1215, f1097, f1182; +add.f32 f1216, f1100, f1185; +sub.f32 f1218, f1100, f1185; +add.f32 f2287, f1101, 
f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2286, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2285, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2284, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2283, f2285, f2284; +sub.f32 f1235, f2285, f2284; +add.f32 f1236, f1226, f1231; +sub.f32 f1238, f1226, f1231; +sub.f32 f2282, f1227, f1230; +add.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2281, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2280, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2279, f2281, f2280; +sub.f32 f1251, f2281, f2280; +add.f32 f1252, f1242, f1247; +sub.f32 f1254, f1242, f1247; +sub.f32 f2278, f1243, f1246; +add.f32 f1255, f1243, f1246; +mul.f32 f1257, f2278, 0fBF3504F3; +mul.f32 f2277, f1252, 0f3F3504F3; +sub.f32 f1258, f2277, f1257; +mul.f32 f1259, f2278, 0f3F3504F3; +fma.rn.f32 f1260, f1252, 0fBF3504F3, f1259; +mul.f32 f1261, f1254, 0fBF3504F3; +mul.f32 f1262, f1255, 0fBF3504F3; +sub.f32 f1263, f1261, f1262; +add.f32 f1264, f1261, f1262; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2276, f2283, f2279; +sub.f32 f1268, f2283, f2279; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2275, f2282, f1260; +sub.f32 f1272, f2282, f1260; +add.f32 f1273, f1234, f1251; +sub.f32 f1275, f1234, f1251; +sub.f32 f2274, f1235, f1250; +add.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1263; +sub.f32 f1279, f1238, f1263; +add.f32 f2273, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2272, f934, f998; +sub.f32 f1284, f934, f998; 
+add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2271, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2270, f2272, f2271; +sub.f32 f1292, f2272, f2271; +add.f32 f1293, f1283, f1288; +sub.f32 f1295, f1283, f1288; +sub.f32 f2269, f1284, f1287; +add.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2268, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2267, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2266, f2268, f2267; +sub.f32 f1308, f2268, f2267; +add.f32 f1309, f1299, f1304; +sub.f32 f1311, f1299, f1304; +sub.f32 f2265, f1300, f1303; +add.f32 f1312, f1300, f1303; +mul.f32 f1314, f2265, 0fBF3504F3; +mul.f32 f2264, f1309, 0f3F3504F3; +sub.f32 f1315, f2264, f1314; +mul.f32 f1316, f2265, 0f3F3504F3; +fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316; +mul.f32 f1318, f1311, 0fBF3504F3; +mul.f32 f1319, f1312, 0fBF3504F3; +sub.f32 f1320, f1318, f1319; +add.f32 f1321, f1318, f1319; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2263, f2270, f2266; +sub.f32 f1325, f2270, f2266; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2262, f2269, f1317; +sub.f32 f1329, f2269, f1317; +add.f32 f1330, f1291, f1308; +sub.f32 f1332, f1291, f1308; +sub.f32 f2261, f1292, f1307; +add.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1320; +sub.f32 f1336, f1295, f1320; +add.f32 f2260, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2258, f1326, 0f3F6C835E; +mul.f32 f2259, f2262, 0fBEC3EF15; +sub.f32 f1340, f2258, f2259; +mul.f32 f1341, f2262, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341; +mul.f32 f2256, f1330, 0f3F3504F3; +mul.f32 f2257, f2261, 0fBF3504F3; +sub.f32 f1345, f2256, f2257; +mul.f32 f1346, f2261, 0f3F3504F3; +fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346; +mul.f32 f2254, 
f1334, 0f3EC3EF15; +mul.f32 f2255, f2260, 0fBF6C835E; +sub.f32 f1350, f2254, f2255; +mul.f32 f1351, f2260, 0f3EC3EF15; +fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351; +mul.f32 f2252, f1328, 0fBEC3EF15; +mul.f32 f2253, f1329, 0fBF6C835E; +sub.f32 f1355, f2252, f2253; +mul.f32 f1356, f1329, 0fBEC3EF15; +fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356; +mul.f32 f1358, f1332, 0fBF3504F3; +mul.f32 f1359, f1333, 0fBF3504F3; +sub.f32 f1360, f1358, f1359; +add.f32 f1361, f1358, f1359; +mul.f32 f2250, f1336, 0fBF6C835E; +mul.f32 f2251, f1337, 0fBEC3EF15; +sub.f32 f1364, f2250, f2251; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2249, f2276, f2263; +sub.f32 f1370, f2276, f2263; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2248, f2275, f1342; +sub.f32 f1374, f2275, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2247, f2274, f1347; +sub.f32 f1378, f2274, f1347; +add.f32 f1379, f1277, f1350; +sub.f32 f1381, f1277, f1350; +add.f32 f2246, f2273, f1352; +sub.f32 f1382, f2273, f1352; +add.f32 f1383, f1267, f1325; +sub.f32 f1385, f1267, f1325; +sub.f32 f2245, f1268, f1324; +add.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1355; +sub.f32 f1389, f1271, f1355; +add.f32 f2244, f1272, f1357; +sub.f32 f1390, f1272, f1357; +add.f32 f1391, f1275, f1360; +sub.f32 f1393, f1275, f1360; +add.f32 f2243, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2242, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2248, 0fBE47C5C2; +mul.f32 f2241, f1371, 0f3F7B14BE; +sub.f32 f1401, f2241, f1400; +mul.f32 f1402, f2248, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402; +mul.f32 f1405, f2247, 0fBEC3EF15; +mul.f32 f2240, f1375, 0f3F6C835E; +sub.f32 f1406, f2240, f1405; +mul.f32 f1407, f2247, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407; +mul.f32 f2238, f1379, 
0f3F54DB31; +mul.f32 f2239, f2246, 0fBF0E39DA; +sub.f32 f1411, f2238, f2239; +mul.f32 f1412, f2246, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412; +mul.f32 f2236, f1383, 0f3F3504F3; +mul.f32 f2237, f2245, 0fBF3504F3; +sub.f32 f1416, f2236, f2237; +mul.f32 f1417, f2245, 0f3F3504F3; +fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417; +mul.f32 f2234, f1387, 0f3F0E39DA; +mul.f32 f2235, f2244, 0fBF54DB31; +sub.f32 f1421, f2234, f2235; +mul.f32 f1422, f2244, 0f3F0E39DA; +fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422; +mul.f32 f2232, f1391, 0f3EC3EF15; +mul.f32 f2233, f2243, 0fBF6C835E; +sub.f32 f1426, f2232, f2233; +mul.f32 f1427, f2243, 0f3EC3EF15; +fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427; +mul.f32 f1430, f2242, 0fBF7B14BE; +mul.f32 f2231, f1395, 0f3E47C5C2; +sub.f32 f1431, f2231, f1430; +mul.f32 f1432, f2242, 0f3E47C5C2; +fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432; +mul.f32 f1435, f1374, 0fBF7B14BE; +mul.f32 f2230, f1373, 0fBE47C5C2; +sub.f32 f1436, f2230, f1435; +mul.f32 f1437, f1374, 0fBE47C5C2; +fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437; +mul.f32 f1440, f1378, 0fBF6C835E; +mul.f32 f2229, f1377, 0fBEC3EF15; +sub.f32 f1441, f2229, f1440; +mul.f32 f1442, f1378, 0fBEC3EF15; +fma.rn.f32 f1443, f1377, 0fBF6C835E, f1442; +mul.f32 f1445, f1382, 0fBF54DB31; +mul.f32 f2228, f1381, 0fBF0E39DA; +sub.f32 f1446, f2228, f1445; +mul.f32 f1447, f1382, 0fBF0E39DA; +fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447; +mul.f32 f1449, f1385, 0fBF3504F3; +mul.f32 f1450, f1386, 0fBF3504F3; +sub.f32 f1451, f1449, f1450; +add.f32 f1452, f1449, f1450; +mul.f32 f1454, f1390, 0fBF0E39DA; +mul.f32 f2227, f1389, 0fBF54DB31; +sub.f32 f1455, f2227, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456; +mul.f32 f1459, f1394, 0fBEC3EF15; +mul.f32 f2226, f1393, 0fBF6C835E; +sub.f32 f1460, f2226, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461; +mul.f32 f1464, f1398, 0fBE47C5C2; +mul.f32 f2225, f1397, 0fBF7B14BE; +sub.f32 f1465, f2225, 
f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2224, f2292, f1403; +sub.f32 f1473, f2292, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2223, f2291, f1408; +sub.f32 f1477, f2291, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2222, f2290, f1413; +sub.f32 f1481, f2290, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2221, f2289, f1418; +sub.f32 f1485, f2289, f1418; +add.f32 f1486, f1212, f1421; +sub.f32 f1488, f1212, f1421; +add.f32 f2220, f2288, f1423; +sub.f32 f1489, f2288, f1423; +add.f32 f1490, f1216, f1426; +sub.f32 f1492, f1216, f1426; +add.f32 f2219, f2287, f1428; +sub.f32 f1493, f2287, f1428; +add.f32 f1494, f1220, f1431; +sub.f32 f1496, f1220, f1431; +add.f32 f2218, f2286, f1433; +sub.f32 f1497, f2286, f1433; +add.f32 f1498, f1194, f1370; +sub.f32 f1500, f1194, f1370; +sub.f32 f2217, f1195, f1369; +add.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1436; +sub.f32 f1504, f1198, f1436; +add.f32 f2216, f1199, f1438; +sub.f32 f1505, f1199, f1438; +add.f32 f1506, f1202, f1441; +sub.f32 f1508, f1202, f1441; +add.f32 f2215, f1203, f1443; +sub.f32 f1509, f1203, f1443; +add.f32 f1510, f1206, f1446; +sub.f32 f1512, f1206, f1446; +add.f32 f2214, f1207, f1448; +sub.f32 f1513, f1207, f1448; +add.f32 f1514, f1210, f1451; +sub.f32 f1516, f1210, f1451; +add.f32 f2213, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2212, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2211, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2210, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r37, 96; +bfe.u32 r15, r37, 5, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, 
rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1535, f1531, f2224; +mul.f32 f1536, f1530, f2224; +mul.f32 f2208, f1530, f1530; +mul.f32 f2209, f1531, f1531; +sub.f32 f1539, f2208, f2209; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1543, f1541, f2223; +mul.f32 f1544, f1539, f2223; +mul.f32 f1546, f1531, f1541; +mul.f32 f2207, f1530, f1539; +sub.f32 f1547, f2207, f1546; +mul.f32 f2206, f1539, f1474; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1551, f1549, f2222; +mul.f32 f1552, f1547, f2222; +mul.f32 f1554, f1531, f1549; +mul.f32 f2205, f1530, f1547; +sub.f32 f1555, f2205, f1554; +mul.f32 f2204, f1547, f1478; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1559, f1557, f2221; +mul.f32 f1560, f1555, f2221; +mul.f32 f2202, f1530, f1555; +mul.f32 f2203, f1531, f1557; +sub.f32 f1563, f2202, f2203; +mul.f32 f2201, f1555, f1482; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1567, f1565, f2220; +mul.f32 f1568, f1563, f2220; +mul.f32 f1570, f1531, f1565; +mul.f32 f2200, f1530, f1563; +sub.f32 f1571, f2200, f1570; +mul.f32 f2199, f1563, f1486; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1575, f1573, f2219; +mul.f32 f1576, f1571, f2219; +mul.f32 f1578, f1531, f1573; +mul.f32 f2198, f1530, f1571; +sub.f32 f1579, f2198, f1578; +mul.f32 f2197, f1571, f1490; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1583, f1581, f2218; +mul.f32 f1584, f1579, f2218; +mul.f32 f2195, f1530, f1579; +mul.f32 f2196, f1531, f1581; +sub.f32 f1587, f2195, f2196; +mul.f32 f2194, f1579, f1494; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1591, f1589, f2217; +mul.f32 f1592, f1587, f2217; +mul.f32 f1594, f1531, f1589; +mul.f32 f2193, f1530, f1587; +sub.f32 f1595, f2193, f1594; +mul.f32 f2192, f1587, f1498; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, 
f1531, f1587, f1596; +mul.f32 f1599, f1597, f2216; +mul.f32 f1600, f1595, f2216; +mul.f32 f2190, f1530, f1595; +mul.f32 f2191, f1531, f1597; +sub.f32 f1603, f2190, f2191; +mul.f32 f2189, f1595, f1502; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1607, f1605, f2215; +mul.f32 f1608, f1603, f2215; +mul.f32 f1610, f1531, f1605; +mul.f32 f2188, f1530, f1603; +sub.f32 f1611, f2188, f1610; +mul.f32 f2187, f1603, f1506; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1615, f1613, f2214; +mul.f32 f1616, f1611, f2214; +mul.f32 f1618, f1531, f1613; +mul.f32 f2186, f1530, f1611; +sub.f32 f1619, f2186, f1618; +mul.f32 f2185, f1611, f1510; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1623, f1621, f2213; +mul.f32 f1624, f1619, f2213; +mul.f32 f2183, f1530, f1619; +mul.f32 f2184, f1531, f1621; +sub.f32 f1627, f2183, f2184; +mul.f32 f2182, f1619, f1514; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1631, f1629, f2212; +mul.f32 f1632, f1627, f2212; +mul.f32 f1634, f1531, f1629; +mul.f32 f2181, f1530, f1627; +sub.f32 f1635, f2181, f1634; +mul.f32 f2180, f1627, f1518; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1639, f1637, f2211; +mul.f32 f1640, f1635, f2211; +mul.f32 f1642, f1531, f1637; +mul.f32 f2179, f1530, f1635; +sub.f32 f1643, f2179, f1642; +mul.f32 f2178, f1635, f1522; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1647, f1645, f2210; +mul.f32 f1648, f1643, f2210; +mul.f32 f2176, f1530, f1643; +mul.f32 f2177, f1531, f1645; +sub.f32 f1651, f2176, f2177; +mul.f32 f2175, f1643, f1526; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2174, f2293, f2249; +mul.f32 f1655, f1653, f2174; +mul.f32 f1656, f1651, f2174; +mul.f32 f1658, f1531, f1653; +mul.f32 f2173, f1530, f1651; +sub.f32 f1659, f2173, f1658; +sub.f32 f2172, f1192, f1367; +mul.f32 f2171, 
f1651, f2172; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1663, f1661, f1473; +mul.f32 f1664, f1659, f1473; +mul.f32 f2169, f1530, f1659; +mul.f32 f2170, f1531, f1661; +sub.f32 f1667, f2169, f2170; +mul.f32 f2168, f1659, f1472; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1671, f1669, f1477; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2167, f1530, f1667; +sub.f32 f1675, f2167, f1674; +mul.f32 f2166, f1667, f1476; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1679, f1677, f1481; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2165, f1530, f1675; +sub.f32 f1683, f2165, f1682; +mul.f32 f2164, f1675, f1480; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1687, f1685, f1485; +mul.f32 f1688, f1683, f1485; +mul.f32 f2162, f1530, f1683; +mul.f32 f2163, f1531, f1685; +sub.f32 f1691, f2162, f2163; +mul.f32 f2161, f1683, f1484; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1695, f1693, f1489; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2160, f1530, f1691; +sub.f32 f1699, f2160, f1698; +mul.f32 f2159, f1691, f1488; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1703, f1701, f1493; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2158, f1530, f1699; +sub.f32 f1707, f2158, f1706; +mul.f32 f2157, f1699, f1492; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1711, f1709, f1497; +mul.f32 f1712, f1707, f1497; +mul.f32 f2155, f1530, f1707; +mul.f32 f2156, f1531, f1709; +sub.f32 f1715, f2155, f2156; +mul.f32 f2154, f1707, f1496; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1719, f1717, f1501; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2153, f1530, f1715; +sub.f32 f1723, f2153, f1722; +mul.f32 f2152, 
f1715, f1500; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1727, f1725, f1505; +mul.f32 f1728, f1723, f1505; +mul.f32 f2150, f1530, f1723; +mul.f32 f2151, f1531, f1725; +sub.f32 f1731, f2150, f2151; +mul.f32 f2149, f1723, f1504; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1735, f1733, f1509; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2148, f1530, f1731; +sub.f32 f1739, f2148, f1738; +mul.f32 f2147, f1731, f1508; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1743, f1741, f1513; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2146, f1530, f1739; +sub.f32 f1747, f2146, f1746; +mul.f32 f2145, f1739, f1512; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1751, f1749, f1517; +mul.f32 f1752, f1747, f1517; +mul.f32 f2143, f1530, f1747; +mul.f32 f2144, f1531, f1749; +sub.f32 f1755, f2143, f2144; +mul.f32 f2142, f1747, f1516; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1759, f1757, f1521; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2141, f1530, f1755; +sub.f32 f1763, f2141, f1762; +mul.f32 f2140, f1755, f1520; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1767, f1765, f1525; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2139, f1530, f1763; +sub.f32 f1771, f2139, f1770; +mul.f32 f2138, f1530, f1470; +mul.f32 f1772, f1530, f1765; +mul.f32 f2137, f1763, f1524; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1771, f1528; +mul.f32 f1775, f1773, f1529; +mul.f32 f1776, f1771, f1529; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 8; +and.b32 r16, r38, 248; +add.s32 r17, r9, r16; +sub.f32 f2542, f2293, f2249; +mul.f32 f2541, f1653, f2542; +barrier.sync 0; +and.b32 r18, r23, 24576; +add.s32 r19, r17, r18; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 96; +add.f32 f1777, f2293, f2249; 
+mov.u32 r32, %tid.x; +and.b32 r31, r32, 96; +sub.f32 f2545, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r36, %tid.x; +and.b32 r35, r36, 96; +fma.rn.f32 f1779, f1531, f1470, f1536; +sub.f32 f1780, f2138, f1535; +st.shared.v2.f32 [r19+256], {f1780, f1779}; +fma.rn.f32 f1781, f1541, f1474, f1544; +sub.f32 f1782, f2206, f1543; +st.shared.v2.f32 [r19+512], {f1782, f1781}; +fma.rn.f32 f1783, f1549, f1478, f1552; +sub.f32 f1784, f2204, f1551; +st.shared.v2.f32 [r19+768], {f1784, f1783}; +fma.rn.f32 f1785, f1557, f1482, f1560; +sub.f32 f1786, f2201, f1559; +st.shared.v2.f32 [r19+1024], {f1786, f1785}; +fma.rn.f32 f1787, f1565, f1486, f1568; +sub.f32 f1788, f2199, f1567; +st.shared.v2.f32 [r19+1280], {f1788, f1787}; +sub.f32 f1789, f2197, f1575; +fma.rn.f32 f1790, f1573, f1490, f1576; +st.shared.v2.f32 [r19+1536], {f1789, f1790}; +fma.rn.f32 f1791, f1581, f1494, f1584; +sub.f32 f1792, f2194, f1583; +st.shared.v2.f32 [r19+1792], {f1792, f1791}; +fma.rn.f32 f1793, f1589, f1498, f1592; +sub.f32 f1794, f2192, f1591; +st.shared.v2.f32 [r19+2048], {f1794, f1793}; +fma.rn.f32 f1795, f1597, f1502, f1600; +sub.f32 f1796, f2189, f1599; +st.shared.v2.f32 [r19+2304], {f1796, f1795}; +fma.rn.f32 f1797, f1605, f1506, f1608; +sub.f32 f1798, f2187, f1607; +st.shared.v2.f32 [r19+2560], {f1798, f1797}; +fma.rn.f32 f1799, f1613, f1510, f1616; +sub.f32 f1800, f2185, f1615; +st.shared.v2.f32 [r19+2816], {f1800, f1799}; +fma.rn.f32 f1801, f1621, f1514, f1624; +sub.f32 f1802, f2182, f1623; +st.shared.v2.f32 [r19+3072], {f1802, f1801}; +fma.rn.f32 f1803, f1629, f1518, f1632; +sub.f32 f1804, f2180, f1631; +st.shared.v2.f32 [r19+3328], {f1804, f1803}; +fma.rn.f32 f1805, f1637, f1522, f1640; +sub.f32 f1806, f2178, f1639; +st.shared.v2.f32 [r19+3584], {f1806, f1805}; +fma.rn.f32 f1807, f1645, f1526, f1648; +sub.f32 f1808, f2175, f1647; +st.shared.v2.f32 [r19+3840], {f1808, f1807}; +fma.rn.f32 f1809, f1653, f2545, f1656; +sub.f32 f1810, f2171, f2541; 
+st.shared.v2.f32 [r19+4096], {f1810, f1809}; +fma.rn.f32 f1811, f1661, f1472, f1664; +sub.f32 f1812, f2168, f1663; +st.shared.v2.f32 [r19+4352], {f1812, f1811}; +fma.rn.f32 f1813, f1669, f1476, f1672; +sub.f32 f1814, f2166, f1671; +st.shared.v2.f32 [r19+4608], {f1814, f1813}; +fma.rn.f32 f1815, f1677, f1480, f1680; +sub.f32 f1816, f2164, f1679; +st.shared.v2.f32 [r19+4864], {f1816, f1815}; +fma.rn.f32 f1817, f1685, f1484, f1688; +sub.f32 f1818, f2161, f1687; +st.shared.v2.f32 [r19+5120], {f1818, f1817}; +fma.rn.f32 f1819, f1693, f1488, f1696; +sub.f32 f1820, f2159, f1695; +st.shared.v2.f32 [r19+5376], {f1820, f1819}; +fma.rn.f32 f1821, f1701, f1492, f1704; +sub.f32 f1822, f2157, f1703; +st.shared.v2.f32 [r19+5632], {f1822, f1821}; +fma.rn.f32 f1823, f1709, f1496, f1712; +sub.f32 f1824, f2154, f1711; +st.shared.v2.f32 [r19+5888], {f1824, f1823}; +fma.rn.f32 f1825, f1717, f1500, f1720; +sub.f32 f1826, f2152, f1719; +st.shared.v2.f32 [r19+6144], {f1826, f1825}; +fma.rn.f32 f1827, f1725, f1504, f1728; +sub.f32 f1828, f2149, f1727; +st.shared.v2.f32 [r19+6400], {f1828, f1827}; +fma.rn.f32 f1829, f1733, f1508, f1736; +sub.f32 f1830, f2147, f1735; +st.shared.v2.f32 [r19+6656], {f1830, f1829}; +fma.rn.f32 f1831, f1741, f1512, f1744; +sub.f32 f1832, f2145, f1743; +st.shared.v2.f32 [r19+6912], {f1832, f1831}; +fma.rn.f32 f1833, f1749, f1516, f1752; +sub.f32 f1834, f2142, f1751; +st.shared.v2.f32 [r19+7168], {f1834, f1833}; +fma.rn.f32 f1835, f1757, f1520, f1760; +sub.f32 f1836, f2140, f1759; +st.shared.v2.f32 [r19+7424], {f1836, f1835}; +fma.rn.f32 f1837, f1765, f1524, f1768; +sub.f32 f1838, f2137, f1767; +st.shared.v2.f32 [r19+7680], {f1838, f1837}; +fma.rn.f32 f1839, f1773, f1528, f1776; +sub.f32 f1840, f1774, f1775; +st.shared.v2.f32 [r19+7936], {f1840, f1839}; +barrier.sync 0; +mad.lo.s32 r20, r35, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+1024]; +ld.shared.v2.f32 {f1849, f1850}, [r20+2048]; +ld.shared.v2.f32 {f1853, 
f1854}, [r20+3072]; +ld.shared.v2.f32 {f1857, f1858}, [r20+4096]; +ld.shared.v2.f32 {f1861, f1862}, [r20+5120]; +ld.shared.v2.f32 {f1865, f1866}, [r20+6144]; +ld.shared.v2.f32 {f1869, f1870}, [r20+7168]; +ld.shared.v2.f32 {f1873, f1874}, [r20+8192]; +ld.shared.v2.f32 {f1877, f1878}, [r20+9216]; +ld.shared.v2.f32 {f1881, f1882}, [r20+10240]; +ld.shared.v2.f32 {f1885, f1886}, [r20+11264]; +ld.shared.v2.f32 {f1889, f1890}, [r20+12288]; +ld.shared.v2.f32 {f1893, f1894}, [r20+13312]; +ld.shared.v2.f32 {f1897, f1898}, [r20+14336]; +ld.shared.v2.f32 {f1901, f1902}, [r20+15360]; +ld.shared.v2.f32 {f1905, f1906}, [r20+16384]; +ld.shared.v2.f32 {f1909, f1910}, [r20+17408]; +ld.shared.v2.f32 {f1913, f1914}, [r20+18432]; +ld.shared.v2.f32 {f1917, f1918}, [r20+19456]; +ld.shared.v2.f32 {f1921, f1922}, [r20+20480]; +ld.shared.v2.f32 {f1925, f1926}, [r20+21504]; +ld.shared.v2.f32 {f1929, f1930}, [r20+22528]; +ld.shared.v2.f32 {f1933, f1934}, [r20+23552]; +ld.shared.v2.f32 {f1937, f1938}, [r20+24576]; +ld.shared.v2.f32 {f1941, f1942}, [r20+25600]; +ld.shared.v2.f32 {f1945, f1946}, [r20+26624]; +ld.shared.v2.f32 {f1949, f1950}, [r20+27648]; +ld.shared.v2.f32 {f1953, f1954}, [r20+28672]; +ld.shared.v2.f32 {f1957, f1958}, [r20+29696]; +ld.shared.v2.f32 {f1961, f1962}, [r20+30720]; +ld.shared.v2.f32 {f1965, f1966}, [r20+31744]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2136, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2135, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1845, f1909; +sub.f32 f1979, f1845, f1909; +add.f32 f2134, f1846, f1910; +sub.f32 f1980, f1846, f1910; +add.f32 f1981, f1877, f1941; +sub.f32 f1983, f1877, f1941; +add.f32 f2133, f1878, f1942; +sub.f32 f1984, f1878, f1942; +add.f32 f1985, f1849, f1913; +sub.f32 f1987, f1849, f1913; +add.f32 f2132, f1850, f1914; +sub.f32 f1988, f1850, f1914; +add.f32 f1989, f1881, f1945; +sub.f32 f1991, f1881, f1945; +add.f32 
f2131, f1882, f1946; +sub.f32 f1992, f1882, f1946; +add.f32 f1993, f1853, f1917; +sub.f32 f1995, f1853, f1917; +add.f32 f2130, f1854, f1918; +sub.f32 f1996, f1854, f1918; +add.f32 f1997, f1885, f1949; +sub.f32 f1999, f1885, f1949; +add.f32 f2129, f1886, f1950; +sub.f32 f2000, f1886, f1950; +add.f32 f2001, f1857, f1921; +sub.f32 f2003, f1857, f1921; +add.f32 f2128, f1858, f1922; +sub.f32 f2004, f1858, f1922; +add.f32 f2005, f1889, f1953; +sub.f32 f2007, f1889, f1953; +add.f32 f2127, f1890, f1954; +sub.f32 f2008, f1890, f1954; +add.f32 f2009, f1861, f1925; +sub.f32 f2011, f1861, f1925; +add.f32 f2126, f1862, f1926; +sub.f32 f2012, f1862, f1926; +add.f32 f2013, f1893, f1957; +sub.f32 f2015, f1893, f1957; +add.f32 f2125, f1894, f1958; +sub.f32 f2016, f1894, f1958; +add.f32 f2017, f1865, f1929; +sub.f32 f2019, f1865, f1929; +add.f32 f2124, f1866, f1930; +sub.f32 f2020, f1866, f1930; +add.f32 f2021, f1897, f1961; +sub.f32 f2023, f1897, f1961; +add.f32 f2123, f1898, f1962; +sub.f32 f2024, f1898, f1962; +add.f32 f2025, f1869, f1933; +sub.f32 f2027, f1869, f1933; +add.f32 f2122, f1870, f1934; +sub.f32 f2028, f1870, f1934; +add.f32 f2029, f1901, f1965; +sub.f32 f2031, f1901, f1965; +add.f32 f2121, f1902, f1966; +sub.f32 f2032, f1902, f1966; +add.f32 %0, f1969, f1973; +add.f32 %1, f2136, f2135; +add.f32 %3, f2134, f2133; +add.f32 %2, f1977, f1981; +add.f32 %5, f2132, f2131; +add.f32 %4, f1985, f1989; +add.f32 %7, f2130, f2129; +add.f32 %6, f1993, f1997; +add.f32 %8, f2001, f2005; +add.f32 %9, f2128, f2127; +add.f32 %10, f2009, f2013; +add.f32 %11, f2126, f2125; +add.f32 %12, f2017, f2021; +add.f32 %13, f2124, f2123; +add.f32 %15, f2122, f2121; +add.f32 %14, f2025, f2029; +sub.f32 %17, f1972, f1975; +add.f32 %16, f1971, f1976; +sub.f32 %19, f1980, f1983; +add.f32 %18, f1979, f1984; +add.f32 %20, f1987, f1992; +sub.f32 %21, f1988, f1991; +add.f32 %22, f1995, f2000; +sub.f32 %23, f1996, f1999; +add.f32 %24, f2003, f2008; +sub.f32 %25, f2004, f2007; +add.f32 %26, f2011, f2016; 
+sub.f32 %27, f2012, f2015; +sub.f32 %29, f2020, f2023; +add.f32 %28, f2019, f2024; +sub.f32 %31, f2028, f2031; +add.f32 %30, f2027, f2032; +sub.f32 %33, f2136, f2135; +sub.f32 %32, f1969, f1973; +sub.f32 %35, f2134, f2133; +sub.f32 %34, f1977, f1981; +sub.f32 %37, f2132, f2131; +sub.f32 %36, f1985, f1989; +sub.f32 %39, f2130, f2129; +sub.f32 %38, f1993, f1997; +sub.f32 %41, f2128, f2127; +sub.f32 %40, f2001, f2005; +sub.f32 %43, f2126, f2125; +sub.f32 %42, f2009, f2013; +sub.f32 %45, f2124, f2123; +sub.f32 %44, f2017, f2021; +sub.f32 %47, f2122, f2121; +sub.f32 %46, f2025, f2029; +add.f32 %49, f1972, f1975; +sub.f32 %48, f1971, f1976; +add.f32 %51, f1980, f1983; +sub.f32 %50, f1979, f1984; +add.f32 %53, f1988, f1991; +sub.f32 %52, f1987, f1992; +add.f32 %55, f1996, f1999; +sub.f32 %54, f1995, f2000; +add.f32 %57, f2004, f2007; +sub.f32 %56, f2003, f2008; +add.f32 %59, f2012, f2015; +sub.f32 %58, f2011, f2016; +add.f32 %61, f2020, f2023; +sub.f32 %60, f2019, f2024; +add.f32 %63, f2028, f2031; +sub.f32 %62, f2027, f2032; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), 
"=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +/* Specialization <102, float, 1>: an inline-PTX routine that transforms the eight complex values rmem[0..7] in place. Operands: %0-%15 are the outputs rmem[0..7].x/.y; %16 is smem; %17, %18 and %19 are the addresses of lut_sp_8_4096, lut_sp_8_512 and lut_sp_8_64; %20-%39 are the incoming rmem components (the .y of rmem[1], rmem[2], rmem[4] and rmem[5] is passed twice). The PTX performs four rounds of add/sub butterflies over eight complex values (0f3F3504F3 and 0fBF3504F3 are the float encodings of plus and minus 0.70710677). After each of the first three rounds it loads one complex factor from one of the three tables, multiplies seven of the eight results by successive powers of that factor, and exchanges the data through shared memory at addresses derived from smem, %tid.y (shifted left by 15) and %tid.x, issuing barrier.sync 0 before the stores and again before the loads of each exchange. NOTE(review): this looks like a generated 4096-point (8^4) FFT body with eight elements per thread - confirm against the generator. NOTE(review): the asm statement has no clobber list (no "memory" clobber) although it stores to and loads from shared memory - confirm this is safe with respect to surrounding shared-memory accesses. */ template<> __forceinline__ __device__ void cufftdx_private_function<102, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<564>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, 
%21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, 
f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 32704; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+4096]; +ld.shared.v2.f32 {f167, f168}, [r13+8192]; +ld.shared.v2.f32 {f171, f172}, [r13+12288]; +ld.shared.v2.f32 {f175, f176}, [r13+16384]; +ld.shared.v2.f32 {f179, f180}, 
[r13+20480]; +ld.shared.v2.f32 {f183, f184}, [r13+24576]; +ld.shared.v2.f32 {f187, f188}, [r13+28672]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 504; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 
f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 32256; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; 
+st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+4096]; +ld.shared.v2.f32 {f325, f326}, [r19+8192]; +ld.shared.v2.f32 {f329, f330}, [r19+12288]; +ld.shared.v2.f32 {f333, f334}, [r19+16384]; +ld.shared.v2.f32 {f337, f338}, [r19+20480]; +ld.shared.v2.f32 {f341, f342}, [r19+24576]; +ld.shared.v2.f32 {f345, f346}, [r19+28672]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +add.f32 f361, f351, f356; +sub.f32 f362, f352, f355; +sub.f32 f363, f351, f356; +add.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +add.f32 f377, f367, f372; +sub.f32 f378, f368, f371; +sub.f32 f379, f367, f372; +add.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0fBF3504F3; +sub.f32 f383, f381, f382; +mul.f32 f384, f378, 0f3F3504F3; +fma.rn.f32 f385, f377, 0fBF3504F3, f384; +mul.f32 f386, f379, 0fBF3504F3; +mul.f32 f387, f380, 0fBF3504F3; +sub.f32 f388, f386, f387; +add.f32 f389, f386, f387; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f385; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f385; +add.f32 f396, f359, f376; +sub.f32 f397, f360, f375; +sub.f32 f398, f359, f376; +add.f32 f399, f360, f375; +add.f32 f400, f363, f388; 
+add.f32 f401, f364, f389; +sub.f32 f402, f363, f388; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 448; +bfe.u32 r21, r5, 6, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f404, f392; +mul.f32 f409, f405, f393; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f413, f396; +mul.f32 f417, f415, f397; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f421, f400; +mul.f32 f425, f423, f401; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f429, f390; +mul.f32 f433, f431, f391; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f437, f394; +mul.f32 f441, f439, f395; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f445, f398; +mul.f32 f449, f447, f399; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f453, f402; +mul.f32 f457, f455, f403; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 28672; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f405, f392, f410; +sub.f32 f462, f408, f409; +st.shared.v2.f32 [r25+512], {f462, f461}; +fma.rn.f32 
f463, f415, f396, f418; +sub.f32 f464, f416, f417; +st.shared.v2.f32 [r25+1024], {f464, f463}; +fma.rn.f32 f465, f423, f400, f426; +sub.f32 f466, f424, f425; +st.shared.v2.f32 [r25+1536], {f466, f465}; +sub.f32 f467, f432, f433; +fma.rn.f32 f468, f431, f390, f434; +st.shared.v2.f32 [r25+2048], {f467, f468}; +fma.rn.f32 f469, f439, f394, f442; +sub.f32 f470, f440, f441; +st.shared.v2.f32 [r25+2560], {f470, f469}; +fma.rn.f32 f471, f447, f398, f450; +sub.f32 f472, f448, f449; +st.shared.v2.f32 [r25+3072], {f472, f471}; +fma.rn.f32 f473, f455, f402, f458; +sub.f32 f474, f456, f457; +st.shared.v2.f32 [r25+3584], {f474, f473}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+4096]; +ld.shared.v2.f32 {f483, f484}, [r26+8192]; +ld.shared.v2.f32 {f487, f488}, [r26+12288]; +ld.shared.v2.f32 {f491, f492}, [r26+16384]; +ld.shared.v2.f32 {f495, f496}, [r26+20480]; +ld.shared.v2.f32 {f499, f500}, [r26+24576]; +ld.shared.v2.f32 {f503, f504}, [r26+28672]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f507, f511; +add.f32 f516, f508, f512; +sub.f32 f517, f507, f511; +sub.f32 f518, f508, f512; +add.f32 f519, f509, f514; +sub.f32 f520, f510, f513; +sub.f32 f521, f509, f514; +add.f32 f522, f510, f513; +add.f32 f523, f479, f495; +add.f32 f524, f480, f496; +sub.f32 f525, f479, f495; +sub.f32 f526, f480, f496; +add.f32 f527, f487, f503; +add.f32 f528, f488, f504; +sub.f32 f529, f487, f503; +sub.f32 f530, f488, f504; +add.f32 f531, f523, f527; +add.f32 f532, f524, f528; +sub.f32 f533, f523, f527; +sub.f32 f534, f524, f528; +add.f32 f535, f525, f530; +sub.f32 f536, f526, f529; +sub.f32 f537, f525, f530; +add.f32 f538, f526, f529; +mul.f32 f539, f535, 0f3F3504F3; +mul.f32 f540, f536, 0fBF3504F3; +sub.f32 f541, f539, f540; +mul.f32 
f542, f536, 0f3F3504F3; +fma.rn.f32 f543, f535, 0fBF3504F3, f542; +mul.f32 f544, f537, 0fBF3504F3; +mul.f32 f545, f538, 0fBF3504F3; +sub.f32 f546, f544, f545; +add.f32 f547, f544, f545; +add.f32 %1, f516, f532; +add.f32 %0, f515, f531; +add.f32 %3, f520, f543; +add.f32 %2, f519, f541; +sub.f32 %5, f518, f533; +add.f32 %4, f517, f534; +add.f32 %7, f522, f547; +add.f32 %6, f521, f546; +sub.f32 %9, f516, f532; +sub.f32 %8, f515, f531; +sub.f32 %11, f520, f543; +sub.f32 %10, f519, f541; +add.f32 %13, f518, f533; +sub.f32 %12, f517, f534; +sub.f32 %15, f522, f547; +sub.f32 %14, f521, f546; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<106, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2441>; +.reg .b32 r<30>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2439, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2437, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2436, f2439, f2437; +sub.f32 f140, f2439, f2437; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2435, f132, f135; +add.f32 f144, f132, f135; 
+add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2432, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2430, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2429, f2432, f2430; +sub.f32 f156, f2432, f2430; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2428, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2428, 0fBF3504F3; +mul.f32 f2427, f157, 0f3F3504F3; +sub.f32 f163, f2427, f162; +mul.f32 f164, f2428, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2426, f2436, f2429; +sub.f32 f173, f2436, f2429; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2425, f2435, f165; +sub.f32 f177, f2435, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2424, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2423, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2421, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2418, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2417, f2421, f2418; +sub.f32 f197, f2421, f2418; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2416, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2414, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2412, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2411, f2414, f2412; +sub.f32 f213, f2414, f2412; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2410, 
f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2410, 0fBF3504F3; +mul.f32 f2409, f214, 0f3F3504F3; +sub.f32 f220, f2409, f219; +mul.f32 f221, f2410, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2408, f2417, f2411; +sub.f32 f230, f2417, f2411; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2407, f2416, f222; +sub.f32 f234, f2416, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2406, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2405, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2403, f231, 0f3F6C835E; +mul.f32 f2404, f2407, 0fBEC3EF15; +sub.f32 f245, f2403, f2404; +mul.f32 f246, f2407, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2401, f235, 0f3F3504F3; +mul.f32 f2402, f2406, 0fBF3504F3; +sub.f32 f250, f2401, f2402; +mul.f32 f251, f2406, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2399, f239, 0f3EC3EF15; +mul.f32 f2400, f2405, 0fBF6C835E; +sub.f32 f255, f2399, f2400; +mul.f32 f256, f2405, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2397, f233, 0fBEC3EF15; +mul.f32 f2398, f234, 0fBF6C835E; +sub.f32 f260, f2397, f2398; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2395, f241, 0fBF6C835E; +mul.f32 f2396, f242, 0fBEC3EF15; +sub.f32 f269, f2395, f2396; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2394, f2426, f2408; +sub.f32 f275, f2426, f2408; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2393, f2425, f247; +sub.f32 f279, f2425, f247; +add.f32 f280, 
f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2392, f2424, f252; +sub.f32 f283, f2424, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2391, f2423, f257; +sub.f32 f287, f2423, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2390, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2389, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2388, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2387, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2384, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2382, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2381, f2384, f2382; +sub.f32 f315, f2384, f2382; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2380, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2378, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2375, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2374, f2378, f2375; +sub.f32 f331, f2378, f2375; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2373, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2371, f332, 0f3F3504F3; +mul.f32 f2372, f2373, 0fBF3504F3; +sub.f32 f338, f2371, f2372; +mul.f32 f339, f2373, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2370, f2381, f2374; +sub.f32 f348, f2381, f2374; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2369, f2380, 
f340; +sub.f32 f352, f2380, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2368, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2367, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2365, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2363, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2362, f2365, f2363; +sub.f32 f372, f2365, f2363; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2361, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2358, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2357, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2356, f2358, f2357; +sub.f32 f388, f2358, f2357; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2355, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2353, f389, 0f3F3504F3; +mul.f32 f2354, f2355, 0fBF3504F3; +sub.f32 f395, f2353, f2354; +mul.f32 f396, f2355, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2352, f2362, f2356; +sub.f32 f405, f2362, f2356; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2351, f2361, f397; +sub.f32 f409, f2361, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2350, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2349, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2351, 0fBEC3EF15; +mul.f32 f2348, f406, 0f3F6C835E; +sub.f32 f420, f2348, f419; +mul.f32 f421, f2351, 0f3F6C835E; +fma.rn.f32 
f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2350, 0fBF3504F3; +mul.f32 f2347, f410, 0f3F3504F3; +sub.f32 f425, f2347, f424; +mul.f32 f426, f2350, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2345, f414, 0f3EC3EF15; +mul.f32 f2346, f2349, 0fBF6C835E; +sub.f32 f430, f2345, f2346; +mul.f32 f431, f2349, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2343, f408, 0fBEC3EF15; +mul.f32 f2344, f409, 0fBF6C835E; +sub.f32 f435, f2343, f2344; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2342, f416, 0fBF6C835E; +sub.f32 f444, f2342, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2341, f2370, f2352; +sub.f32 f450, f2370, f2352; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2340, f2369, f422; +sub.f32 f454, f2369, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2339, f2368, f427; +sub.f32 f458, f2368, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2338, f2367, f432; +sub.f32 f462, f2367, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2337, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2336, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2335, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2334, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2340, 0fBE47C5C2; +mul.f32 f2333, f451, 0f3F7B14BE; +sub.f32 f481, f2333, f480; +mul.f32 f482, f2340, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2339, 0fBEC3EF15; +mul.f32 f2332, f455, 0f3F6C835E; +sub.f32 f486, f2332, f485; +mul.f32 
f487, f2339, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2338, 0fBF0E39DA; +mul.f32 f2331, f459, 0f3F54DB31; +sub.f32 f491, f2331, f490; +mul.f32 f492, f2338, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2337, 0fBF3504F3; +mul.f32 f2330, f463, 0f3F3504F3; +sub.f32 f496, f2330, f495; +mul.f32 f497, f2337, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2328, f467, 0f3F0E39DA; +mul.f32 f2329, f2336, 0fBF54DB31; +sub.f32 f501, f2328, f2329; +mul.f32 f502, f2336, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2326, f471, 0f3EC3EF15; +mul.f32 f2327, f2335, 0fBF6C835E; +sub.f32 f506, f2326, f2327; +mul.f32 f507, f2335, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2324, f475, 0f3E47C5C2; +mul.f32 f2325, f2334, 0fBF7B14BE; +sub.f32 f511, f2324, f2325; +mul.f32 f512, f2334, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2322, f453, 0fBE47C5C2; +mul.f32 f2323, f454, 0fBF7B14BE; +sub.f32 f516, f2322, f2323; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2321, f457, 0fBEC3EF15; +sub.f32 f521, f2321, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2320, f461, 0fBF0E39DA; +sub.f32 f526, f2320, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2318, f469, 0fBF54DB31; +mul.f32 f2319, f470, 0fBF0E39DA; +sub.f32 f535, f2318, f2319; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2317, f473, 0fBF6C835E; +sub.f32 f540, f2317, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2316, f477, 0fBF7B14BE; +sub.f32 f545, f2316, 
f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2315, f2394, f2341; +sub.f32 f551, f2394, f2341; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2314, f2393, f483; +sub.f32 f555, f2393, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2313, f2392, f488; +sub.f32 f559, f2392, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2312, f2391, f493; +sub.f32 f563, f2391, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2311, f2390, f498; +sub.f32 f567, f2390, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f2310, f2389, f503; +sub.f32 f571, f2389, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f2309, f2388, f508; +sub.f32 f575, f2388, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f2308, f2387, f513; +sub.f32 f579, f2387, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f2307, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f2306, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f2305, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f2304, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f2303, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2302, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2301, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2300, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 
{f612, f613}, [rd5]; +mul.f32 f617, f613, f2314; +mul.f32 f2299, f612, f552; +sub.f32 f618, f2299, f617; +mul.f32 f619, f612, f2314; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f2297, f612, f612; +mul.f32 f2298, f613, f613; +sub.f32 f623, f2297, f2298; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f2295, f623, f556; +mul.f32 f2296, f625, f2313; +sub.f32 f628, f2295, f2296; +mul.f32 f629, f623, f2313; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f2293, f612, f623; +mul.f32 f2294, f613, f625; +sub.f32 f633, f2293, f2294; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f2291, f633, f560; +mul.f32 f2292, f635, f2312; +sub.f32 f638, f2291, f2292; +mul.f32 f639, f633, f2312; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f2290, f612, f633; +sub.f32 f643, f2290, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f2311; +mul.f32 f2289, f643, f564; +sub.f32 f648, f2289, f647; +mul.f32 f649, f643, f2311; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f2288, f612, f643; +sub.f32 f653, f2288, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f2310; +mul.f32 f2287, f653, f568; +sub.f32 f658, f2287, f657; +mul.f32 f659, f653, f2310; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f2286, f612, f653; +sub.f32 f663, f2286, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f2284, f663, f572; +mul.f32 f2285, f665, f2309; +sub.f32 f668, f2284, f2285; +mul.f32 f669, f663, f2309; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f2282, f612, f663; +mul.f32 f2283, f613, f665; +sub.f32 f673, f2282, f2283; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f2280, f673, f576; +mul.f32 f2281, f675, f2308; +sub.f32 f678, f2280, f2281; +mul.f32 f679, f673, f2308; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f2278, f612, f673; +mul.f32 f2279, f613, 
f675; +sub.f32 f683, f2278, f2279; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f2307; +mul.f32 f2277, f683, f580; +sub.f32 f688, f2277, f687; +mul.f32 f689, f683, f2307; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f2276, f612, f683; +sub.f32 f693, f2276, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f2306; +mul.f32 f2275, f693, f584; +sub.f32 f698, f2275, f697; +mul.f32 f699, f693, f2306; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f2274, f612, f693; +sub.f32 f703, f2274, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f2305; +mul.f32 f2273, f703, f588; +sub.f32 f708, f2273, f707; +mul.f32 f709, f703, f2305; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f2271, f612, f703; +mul.f32 f2272, f613, f705; +sub.f32 f713, f2271, f2272; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f2269, f713, f592; +mul.f32 f2270, f715, f2304; +sub.f32 f718, f2269, f2270; +mul.f32 f719, f713, f2304; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f2267, f612, f713; +mul.f32 f2268, f613, f715; +sub.f32 f723, f2267, f2268; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f2265, f723, f596; +mul.f32 f2266, f725, f2303; +sub.f32 f728, f2265, f2266; +mul.f32 f729, f723, f2303; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f2264, f612, f723; +sub.f32 f733, f2264, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f2302; +mul.f32 f2263, f733, f600; +sub.f32 f738, f2263, f737; +mul.f32 f739, f733, f2302; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f2262, f612, f733; +sub.f32 f743, f2262, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f2301; +mul.f32 f2261, f743, f604; +sub.f32 f748, f2261, f747; +mul.f32 f749, f743, f2301; +fma.rn.f32 f750, 
f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f2260, f612, f743; +sub.f32 f753, f2260, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f2300; +mul.f32 f2259, f753, f608; +sub.f32 f758, f2259, f757; +mul.f32 f759, f753, f2300; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f2257, f612, f753; +mul.f32 f2258, f613, f755; +sub.f32 f763, f2257, f2258; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f2255, f763, f550; +mul.f32 f2256, f765, f551; +sub.f32 f768, f2255, f2256; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f2253, f612, f763; +mul.f32 f2254, f613, f765; +sub.f32 f773, f2253, f2254; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f2252, f773, f554; +sub.f32 f778, f2252, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f2251, f612, f773; +sub.f32 f783, f2251, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f2250, f783, f558; +sub.f32 f788, f2250, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f2249, f612, f783; +sub.f32 f793, f2249, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f2248, f793, f562; +sub.f32 f798, f2248, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f2247, f612, f793; +sub.f32 f803, f2247, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f2245, f803, f566; +mul.f32 f2246, f805, f567; +sub.f32 f808, f2245, f2246; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f2243, f612, f803; +mul.f32 f2244, f613, f805; +sub.f32 f813, f2243, f2244; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f2241, f813, f570; +mul.f32 f2242, f815, f571; +sub.f32 f818, 
f2241, f2242; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f2239, f612, f813; +mul.f32 f2240, f613, f815; +sub.f32 f823, f2239, f2240; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f2238, f823, f574; +sub.f32 f828, f2238, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f2237, f612, f823; +sub.f32 f833, f2237, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f2236, f833, f578; +sub.f32 f838, f2236, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f2235, f612, f833; +sub.f32 f843, f2235, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f2234, f843, f582; +sub.f32 f848, f2234, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f2232, f612, f843; +mul.f32 f2233, f613, f845; +sub.f32 f853, f2232, f2233; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f2230, f853, f586; +mul.f32 f2231, f855, f587; +sub.f32 f858, f2230, f2231; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f2228, f612, f853; +mul.f32 f2229, f613, f855; +sub.f32 f863, f2228, f2229; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f2226, f863, f590; +mul.f32 f2227, f865, f591; +sub.f32 f868, f2226, f2227; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f2225, f612, f863; +sub.f32 f873, f2225, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f2224, f873, f594; +sub.f32 f878, f2224, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f2223, f612, f873; +sub.f32 f883, f2223, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, 
f885, f599; +mul.f32 f2222, f883, f598; +sub.f32 f888, f2222, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f2221, f612, f883; +sub.f32 f893, f2221, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f2219, f893, f602; +mul.f32 f2220, f895, f603; +sub.f32 f898, f2219, f2220; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f2217, f612, f893; +mul.f32 f2218, f613, f895; +sub.f32 f903, f2217, f2218; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f2215, f903, f606; +mul.f32 f2216, f905, f607; +sub.f32 f908, f2215, f2216; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f2213, f612, f903; +mul.f32 f2214, f613, f905; +sub.f32 f913, f2213, f2214; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f2212, f913, f610; +sub.f32 f918, f2212, f917; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +shl.b32 r8, r24, 7; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16256; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +and.b32 r23, r24, 127; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+512]; +ld.shared.f32 f923, [r13+1024]; +ld.shared.f32 f924, [r13+1536]; +ld.shared.f32 f925, [r13+2048]; +ld.shared.f32 f926, [r13+2560]; +ld.shared.f32 f927, [r13+3072]; +ld.shared.f32 f928, [r13+3584]; +ld.shared.f32 f929, [r13+4096]; +ld.shared.f32 f930, [r13+4608]; +ld.shared.f32 f931, 
[r13+5120]; +ld.shared.f32 f932, [r13+5632]; +ld.shared.f32 f933, [r13+6144]; +ld.shared.f32 f934, [r13+6656]; +ld.shared.f32 f935, [r13+7168]; +ld.shared.f32 f936, [r13+7680]; +ld.shared.f32 f937, [r13+8192]; +ld.shared.f32 f938, [r13+8704]; +ld.shared.f32 f939, [r13+9216]; +ld.shared.f32 f940, [r13+9728]; +ld.shared.f32 f941, [r13+10240]; +ld.shared.f32 f942, [r13+10752]; +ld.shared.f32 f943, [r13+11264]; +ld.shared.f32 f944, [r13+11776]; +ld.shared.f32 f945, [r13+12288]; +ld.shared.f32 f946, [r13+12800]; +ld.shared.f32 f947, [r13+13312]; +ld.shared.f32 f948, [r13+13824]; +ld.shared.f32 f949, [r13+14336]; +ld.shared.f32 f950, [r13+14848]; +ld.shared.f32 f951, [r13+15360]; +ld.shared.f32 f952, [r13+15872]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2315, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+512]; +ld.shared.f32 f955, [r13+1024]; +ld.shared.f32 f956, [r13+1536]; +ld.shared.f32 f957, [r13+2048]; +ld.shared.f32 f958, [r13+2560]; +ld.shared.f32 f959, [r13+3072]; +ld.shared.f32 f960, [r13+3584]; +ld.shared.f32 f961, [r13+4096]; +ld.shared.f32 f962, [r13+4608]; +ld.shared.f32 f963, [r13+5120]; +ld.shared.f32 f964, [r13+5632]; +ld.shared.f32 f965, [r13+6144]; +ld.shared.f32 f966, [r13+6656]; +ld.shared.f32 f967, [r13+7168]; +ld.shared.f32 f968, [r13+7680]; +ld.shared.f32 f969, [r13+8192]; +ld.shared.f32 f970, [r13+8704]; +ld.shared.f32 f971, [r13+9216]; +ld.shared.f32 f972, [r13+9728]; +ld.shared.f32 f973, [r13+10240]; +ld.shared.f32 f974, [r13+10752]; +ld.shared.f32 f975, [r13+11264]; +ld.shared.f32 f976, [r13+11776]; +ld.shared.f32 f977, 
[r13+12288]; +ld.shared.f32 f978, [r13+12800]; +ld.shared.f32 f979, [r13+13312]; +ld.shared.f32 f980, [r13+13824]; +ld.shared.f32 f981, [r13+14336]; +ld.shared.f32 f982, [r13+14848]; +ld.shared.f32 f983, [r13+15360]; +ld.shared.f32 f984, [r13+15872]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2211, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2210, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2209, f2211, f2210; +sub.f32 f996, f2211, f2210; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f2208, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2207, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2206, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2205, f2207, f2206; +sub.f32 f1012, f2207, f2206; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f2204, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f2204, 0fBF3504F3; +mul.f32 f2203, f1013, 0f3F3504F3; +sub.f32 f1019, f2203, f1018; +mul.f32 f1020, f2204, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2202, f2209, f2205; +sub.f32 f1029, f2209, f2205; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2201, f2208, f1021; +sub.f32 f1033, f2208, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f2200, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f2199, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; 
+add.f32 f2198, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2197, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2196, f2198, f2197; +sub.f32 f1053, f2198, f2197; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f2195, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2194, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2193, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2192, f2194, f2193; +sub.f32 f1069, f2194, f2193; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f2191, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f2191, 0fBF3504F3; +mul.f32 f2190, f1070, 0f3F3504F3; +sub.f32 f1076, f2190, f1075; +mul.f32 f1077, f2191, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2189, f2196, f2192; +sub.f32 f1086, f2196, f2192; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2188, f2195, f1078; +sub.f32 f1090, f2195, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f2187, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f2186, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2184, f1087, 0f3F6C835E; +mul.f32 f2185, f2188, 0fBEC3EF15; +sub.f32 f1101, f2184, f2185; +mul.f32 f1102, f2188, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f2182, f1091, 0f3F3504F3; +mul.f32 f2183, f2187, 0fBF3504F3; +sub.f32 f1106, f2182, f2183; +mul.f32 f1107, f2187, 0f3F3504F3; +fma.rn.f32 f1108, 
f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f2186, 0fBF6C835E; +mul.f32 f2181, f1095, 0f3EC3EF15; +sub.f32 f1111, f2181, f1110; +mul.f32 f1112, f2186, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f2180, f1089, 0fBEC3EF15; +sub.f32 f1116, f2180, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f2178, f1097, 0fBF6C835E; +mul.f32 f2179, f1098, 0fBEC3EF15; +sub.f32 f1125, f2178, f2179; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2177, f2202, f2189; +sub.f32 f1131, f2202, f2189; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2176, f2201, f1103; +sub.f32 f1135, f2201, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2175, f2200, f1108; +sub.f32 f1139, f2200, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f2174, f2199, f1113; +sub.f32 f1143, f2199, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f2173, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f2172, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f2171, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2170, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2169, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2168, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2167, f2169, f2168; +sub.f32 f1171, f2169, f2168; 
+add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f2166, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2165, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2164, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2163, f2165, f2164; +sub.f32 f1187, f2165, f2164; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f2162, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f2162, 0fBF3504F3; +mul.f32 f2161, f1188, 0f3F3504F3; +sub.f32 f1194, f2161, f1193; +mul.f32 f1195, f2162, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 f1198, f1191, 0fBF3504F3; +sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2160, f2167, f2163; +sub.f32 f1204, f2167, f2163; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2159, f2166, f1196; +sub.f32 f1208, f2166, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f2158, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f2157, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2156, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2155, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2154, f2156, f2155; +sub.f32 f1228, f2156, f2155; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f2153, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2152, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, 
f952; +add.f32 f2151, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2150, f2152, f2151; +sub.f32 f1244, f2152, f2151; +add.f32 f1245, f1235, f1240; +sub.f32 f1247, f1235, f1240; +sub.f32 f2149, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f2149, 0fBF3504F3; +mul.f32 f2148, f1245, 0f3F3504F3; +sub.f32 f1251, f2148, f1250; +mul.f32 f1252, f2149, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2147, f2154, f2150; +sub.f32 f1261, f2154, f2150; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2146, f2153, f1253; +sub.f32 f1265, f2153, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f2145, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f2144, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2142, f1262, 0f3F6C835E; +mul.f32 f2143, f2146, 0fBEC3EF15; +sub.f32 f1276, f2142, f2143; +mul.f32 f1277, f2146, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f2140, f1266, 0f3F3504F3; +mul.f32 f2141, f2145, 0fBF3504F3; +sub.f32 f1281, f2140, f2141; +mul.f32 f1282, f2145, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f2138, f1270, 0f3EC3EF15; +mul.f32 f2139, f2144, 0fBF6C835E; +sub.f32 f1286, f2138, f2139; +mul.f32 f1287, f2144, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f2136, f1264, 0fBEC3EF15; +mul.f32 f2137, f1265, 0fBF6C835E; +sub.f32 f1291, f2136, f2137; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f2134, f1272, 0fBF6C835E; +mul.f32 f2135, f1273, 
0fBEC3EF15; +sub.f32 f1300, f2134, f2135; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2133, f2160, f2147; +sub.f32 f1306, f2160, f2147; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2132, f2159, f1278; +sub.f32 f1310, f2159, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2131, f2158, f1283; +sub.f32 f1314, f2158, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f2130, f2157, f1288; +sub.f32 f1318, f2157, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f2129, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f2128, f1208, f1293; +sub.f32 f1326, f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f2127, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2126, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2132, 0fBE47C5C2; +mul.f32 f2125, f1307, 0f3F7B14BE; +sub.f32 f1337, f2125, f1336; +mul.f32 f1338, f2132, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f2131, 0fBEC3EF15; +mul.f32 f2124, f1311, 0f3F6C835E; +sub.f32 f1342, f2124, f1341; +mul.f32 f1343, f2131, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f2122, f1315, 0f3F54DB31; +mul.f32 f2123, f2130, 0fBF0E39DA; +sub.f32 f1347, f2122, f2123; +mul.f32 f1348, f2130, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f2120, f1319, 0f3F3504F3; +mul.f32 f2121, f2129, 0fBF3504F3; +sub.f32 f1352, f2120, f2121; +mul.f32 f1353, f2129, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f2118, f1323, 0f3F0E39DA; +mul.f32 f2119, f2128, 0fBF54DB31; +sub.f32 f1357, f2118, f2119; +mul.f32 f1358, f2128, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f2116, 
f1327, 0f3EC3EF15; +mul.f32 f2117, f2127, 0fBF6C835E; +sub.f32 f1362, f2116, f2117; +mul.f32 f1363, f2127, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f2126, 0fBF7B14BE; +mul.f32 f2115, f1331, 0f3E47C5C2; +sub.f32 f1367, f2115, f1366; +mul.f32 f1368, f2126, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f2114, f1309, 0fBE47C5C2; +sub.f32 f1372, f2114, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f2113, f1313, 0fBEC3EF15; +sub.f32 f1377, f2113, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f2112, f1317, 0fBF0E39DA; +sub.f32 f1382, f2112, f1381; +mul.f32 f1383, f1318, 0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f2111, f1325, 0fBF54DB31; +sub.f32 f1391, f2111, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f2110, f1329, 0fBF6C835E; +sub.f32 f1396, f2110, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f2109, f1333, 0fBF7B14BE; +sub.f32 f1401, f2109, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0fBE47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2108, f2177, f2133; +sub.f32 f1407, f2177, f2133; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2107, f2176, f1339; +sub.f32 f1411, f2176, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2106, f2175, f1344; +sub.f32 f1415, f2175, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2105, f2174, 
f1349; +sub.f32 f1419, f2174, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2104, f2173, f1354; +sub.f32 f1423, f2173, f1354; +add.f32 f1424, f1148, f1357; +sub.f32 f1426, f1148, f1357; +add.f32 f2103, f2172, f1359; +sub.f32 f1427, f2172, f1359; +add.f32 f1428, f1152, f1362; +sub.f32 f1430, f1152, f1362; +add.f32 f2102, f2171, f1364; +sub.f32 f1431, f2171, f1364; +add.f32 f1432, f1156, f1367; +sub.f32 f1434, f1156, f1367; +add.f32 f2101, f2170, f1369; +sub.f32 f1435, f2170, f1369; +add.f32 f1436, f1130, f1306; +sub.f32 f1438, f1130, f1306; +sub.f32 f2100, f1131, f1305; +add.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1372; +sub.f32 f1442, f1134, f1372; +add.f32 f2099, f1135, f1374; +sub.f32 f1443, f1135, f1374; +add.f32 f1444, f1138, f1377; +sub.f32 f1446, f1138, f1377; +add.f32 f2098, f1139, f1379; +sub.f32 f1447, f1139, f1379; +add.f32 f1448, f1142, f1382; +sub.f32 f1450, f1142, f1382; +add.f32 f2097, f1143, f1384; +sub.f32 f1451, f1143, f1384; +add.f32 f1452, f1146, f1387; +sub.f32 f1454, f1146, f1387; +add.f32 f2096, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2095, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2094, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2093, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r24, 5, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1473, f1469, f2107; +mul.f32 f2092, f1468, f1408; +sub.f32 f1474, f2092, f1473; +mul.f32 f1475, f1468, f2107; +fma.rn.f32 f1476, f1469, f1408, f1475; +mul.f32 f1478, f1469, f1469; +mul.f32 f2091, f1468, f1468; +sub.f32 f1479, f2091, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1483, f1481, f2106; +mul.f32 f2090, f1479, f1412; 
+sub.f32 f1484, f2090, f1483; +mul.f32 f1485, f1479, f2106; +fma.rn.f32 f1486, f1481, f1412, f1485; +mul.f32 f2088, f1468, f1479; +mul.f32 f2089, f1469, f1481; +sub.f32 f1489, f2088, f2089; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f2086, f1489, f1416; +mul.f32 f2087, f1491, f2105; +sub.f32 f1494, f2086, f2087; +mul.f32 f1495, f1489, f2105; +fma.rn.f32 f1496, f1491, f1416, f1495; +mul.f32 f2084, f1468, f1489; +mul.f32 f2085, f1469, f1491; +sub.f32 f1499, f2084, f2085; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f2082, f1499, f1420; +mul.f32 f2083, f1501, f2104; +sub.f32 f1504, f2082, f2083; +mul.f32 f1505, f1499, f2104; +fma.rn.f32 f1506, f1501, f1420, f1505; +mul.f32 f1508, f1469, f1501; +mul.f32 f2081, f1468, f1499; +sub.f32 f1509, f2081, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1513, f1511, f2103; +mul.f32 f2080, f1509, f1424; +sub.f32 f1514, f2080, f1513; +mul.f32 f1515, f1509, f2103; +fma.rn.f32 f1516, f1511, f1424, f1515; +mul.f32 f1518, f1469, f1511; +mul.f32 f2079, f1468, f1509; +sub.f32 f1519, f2079, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1523, f1521, f2102; +mul.f32 f2078, f1519, f1428; +sub.f32 f1524, f2078, f1523; +mul.f32 f1525, f1519, f2102; +fma.rn.f32 f1526, f1521, f1428, f1525; +mul.f32 f1528, f1469, f1521; +mul.f32 f2077, f1468, f1519; +sub.f32 f1529, f2077, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f2075, f1529, f1432; +mul.f32 f2076, f1531, f2101; +sub.f32 f1534, f2075, f2076; +mul.f32 f1535, f1529, f2101; +fma.rn.f32 f1536, f1531, f1432, f1535; +mul.f32 f2073, f1468, f1529; +mul.f32 f2074, f1469, f1531; +sub.f32 f1539, f2073, f2074; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f2071, f1539, f1436; +mul.f32 f2072, f1541, f2100; +sub.f32 f1544, f2071, f2072; +mul.f32 f1545, f1539, f2100; +fma.rn.f32 f1546, 
f1541, f1436, f1545; +mul.f32 f2069, f1468, f1539; +mul.f32 f2070, f1469, f1541; +sub.f32 f1549, f2069, f2070; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1553, f1551, f2099; +mul.f32 f2068, f1549, f1440; +sub.f32 f1554, f2068, f1553; +mul.f32 f1555, f1549, f2099; +fma.rn.f32 f1556, f1551, f1440, f1555; +mul.f32 f1558, f1469, f1551; +mul.f32 f2067, f1468, f1549; +sub.f32 f1559, f2067, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1563, f1561, f2098; +mul.f32 f2066, f1559, f1444; +sub.f32 f1564, f2066, f1563; +mul.f32 f1565, f1559, f2098; +fma.rn.f32 f1566, f1561, f1444, f1565; +mul.f32 f1568, f1469, f1561; +mul.f32 f2065, f1468, f1559; +sub.f32 f1569, f2065, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1573, f1571, f2097; +mul.f32 f2064, f1569, f1448; +sub.f32 f1574, f2064, f1573; +mul.f32 f1575, f1569, f2097; +fma.rn.f32 f1576, f1571, f1448, f1575; +mul.f32 f1578, f1469, f1571; +mul.f32 f2063, f1468, f1569; +sub.f32 f1579, f2063, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f2061, f1579, f1452; +mul.f32 f2062, f1581, f2096; +sub.f32 f1584, f2061, f2062; +mul.f32 f1585, f1579, f2096; +fma.rn.f32 f1586, f1581, f1452, f1585; +mul.f32 f2059, f1468, f1579; +mul.f32 f2060, f1469, f1581; +sub.f32 f1589, f2059, f2060; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f2057, f1589, f1456; +mul.f32 f2058, f1591, f2095; +sub.f32 f1594, f2057, f2058; +mul.f32 f1595, f1589, f2095; +fma.rn.f32 f1596, f1591, f1456, f1595; +mul.f32 f1598, f1469, f1591; +mul.f32 f2056, f1468, f1589; +sub.f32 f1599, f2056, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1603, f1601, f2094; +mul.f32 f2055, f1599, f1460; +sub.f32 f1604, f2055, f1603; +mul.f32 f1605, f1599, f2094; +fma.rn.f32 f1606, f1601, f1460, f1605; +mul.f32 f1608, f1469, f1601; +mul.f32 f2054, f1468, 
f1599; +sub.f32 f1609, f2054, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1613, f1611, f2093; +mul.f32 f2053, f1609, f1464; +sub.f32 f1614, f2053, f1613; +mul.f32 f1615, f1609, f2093; +fma.rn.f32 f1616, f1611, f1464, f1615; +mul.f32 f1618, f1469, f1611; +mul.f32 f2052, f1468, f1609; +sub.f32 f1619, f2052, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1623, f1621, f1407; +mul.f32 f2051, f1619, f1406; +sub.f32 f1624, f2051, f1623; +mul.f32 f1625, f1619, f1407; +fma.rn.f32 f1626, f1621, f1406, f1625; +mul.f32 f2049, f1468, f1619; +mul.f32 f2050, f1469, f1621; +sub.f32 f1629, f2049, f2050; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f2047, f1629, f1410; +mul.f32 f2048, f1631, f1411; +sub.f32 f1634, f2047, f2048; +mul.f32 f1635, f1629, f1411; +fma.rn.f32 f1636, f1631, f1410, f1635; +mul.f32 f2045, f1468, f1629; +mul.f32 f2046, f1469, f1631; +sub.f32 f1639, f2045, f2046; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f2043, f1639, f1414; +mul.f32 f2044, f1641, f1415; +sub.f32 f1644, f2043, f2044; +mul.f32 f1645, f1639, f1415; +fma.rn.f32 f1646, f1641, f1414, f1645; +mul.f32 f1648, f1469, f1641; +mul.f32 f2042, f1468, f1639; +sub.f32 f1649, f2042, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1653, f1651, f1419; +mul.f32 f2041, f1649, f1418; +sub.f32 f1654, f2041, f1653; +mul.f32 f1655, f1649, f1419; +fma.rn.f32 f1656, f1651, f1418, f1655; +mul.f32 f1658, f1469, f1651; +mul.f32 f2040, f1468, f1649; +sub.f32 f1659, f2040, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1663, f1661, f1423; +mul.f32 f2039, f1659, f1422; +sub.f32 f1664, f2039, f1663; +mul.f32 f1665, f1659, f1423; +fma.rn.f32 f1666, f1661, f1422, f1665; +mul.f32 f1668, f1469, f1661; +mul.f32 f2038, f1468, f1659; +sub.f32 f1669, f2038, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 
f1671, f1469, f1659, f1670; +mul.f32 f2036, f1669, f1426; +mul.f32 f2037, f1671, f1427; +sub.f32 f1674, f2036, f2037; +mul.f32 f1675, f1669, f1427; +fma.rn.f32 f1676, f1671, f1426, f1675; +mul.f32 f2034, f1468, f1669; +mul.f32 f2035, f1469, f1671; +sub.f32 f1679, f2034, f2035; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f2032, f1679, f1430; +mul.f32 f2033, f1681, f1431; +sub.f32 f1684, f2032, f2033; +mul.f32 f1685, f1679, f1431; +fma.rn.f32 f1686, f1681, f1430, f1685; +mul.f32 f2030, f1468, f1679; +mul.f32 f2031, f1469, f1681; +sub.f32 f1689, f2030, f2031; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1693, f1691, f1435; +mul.f32 f2029, f1689, f1434; +sub.f32 f1694, f2029, f1693; +mul.f32 f1695, f1689, f1435; +fma.rn.f32 f1696, f1691, f1434, f1695; +mul.f32 f1698, f1469, f1691; +mul.f32 f2028, f1468, f1689; +sub.f32 f1699, f2028, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1703, f1701, f1439; +mul.f32 f2027, f1699, f1438; +sub.f32 f1704, f2027, f1703; +mul.f32 f1705, f1699, f1439; +fma.rn.f32 f1706, f1701, f1438, f1705; +mul.f32 f1708, f1469, f1701; +mul.f32 f2026, f1468, f1699; +sub.f32 f1709, f2026, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1713, f1711, f1443; +mul.f32 f2025, f1709, f1442; +sub.f32 f1714, f2025, f1713; +mul.f32 f1715, f1709, f1443; +fma.rn.f32 f1716, f1711, f1442, f1715; +mul.f32 f2023, f1468, f1709; +mul.f32 f2024, f1469, f1711; +sub.f32 f1719, f2023, f2024; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f2021, f1719, f1446; +mul.f32 f2022, f1721, f1447; +sub.f32 f1724, f2021, f2022; +mul.f32 f1725, f1719, f1447; +fma.rn.f32 f1726, f1721, f1446, f1725; +mul.f32 f2019, f1468, f1719; +mul.f32 f2020, f1469, f1721; +sub.f32 f1729, f2019, f2020; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f2017, f1729, f1450; +mul.f32 f2018, 
f1731, f1451; +sub.f32 f1734, f2017, f2018; +mul.f32 f1735, f1729, f1451; +fma.rn.f32 f1736, f1731, f1450, f1735; +mul.f32 f1738, f1469, f1731; +mul.f32 f2016, f1468, f1729; +sub.f32 f1739, f2016, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1743, f1741, f1455; +mul.f32 f2015, f1739, f1454; +sub.f32 f1744, f2015, f1743; +mul.f32 f1745, f1739, f1455; +fma.rn.f32 f1746, f1741, f1454, f1745; +mul.f32 f1748, f1469, f1741; +mul.f32 f2014, f1468, f1739; +sub.f32 f1749, f2014, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1753, f1751, f1459; +mul.f32 f2013, f1749, f1458; +sub.f32 f1754, f2013, f1753; +mul.f32 f1755, f1749, f1459; +fma.rn.f32 f1756, f1751, f1458, f1755; +mul.f32 f1758, f1469, f1751; +mul.f32 f2012, f1468, f1749; +sub.f32 f1759, f2012, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f2010, f1759, f1462; +mul.f32 f2011, f1761, f1463; +sub.f32 f1764, f2010, f2011; +mul.f32 f1765, f1759, f1463; +fma.rn.f32 f1766, f1761, f1462, f1765; +mul.f32 f2008, f1468, f1759; +mul.f32 f2009, f1469, f1761; +sub.f32 f1769, f2008, f2009; +mul.f32 f1770, f1468, f1761; +mov.u32 r29, %tid.x; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f2006, f1769, f1466; +mul.f32 f2007, f1771, f1467; +sub.f32 f1774, f2006, f2007; +shl.b32 r28, r29, 7; +mul.f32 f1775, f1769, f1467; +fma.rn.f32 f1776, f1771, f1466, f1775; +and.b32 r22, r29, 96; +shl.b32 r16, r29, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r28, 12288; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1474; +st.shared.f32 [r20+256], f1484; +st.shared.f32 [r20+384], f1494; +st.shared.f32 [r20+512], f1504; +st.shared.f32 [r20+640], f1514; +st.shared.f32 [r20+768], f1524; +st.shared.f32 [r20+896], f1534; +st.shared.f32 [r20+1024], f1544; +st.shared.f32 [r20+1152], f1554; +st.shared.f32 [r20+1280], f1564; +st.shared.f32 [r20+1408], f1574; 
+st.shared.f32 [r20+1536], f1584; +st.shared.f32 [r20+1664], f1594; +st.shared.f32 [r20+1792], f1604; +st.shared.f32 [r20+1920], f1614; +st.shared.f32 [r20+2048], f1624; +st.shared.f32 [r20+2176], f1634; +st.shared.f32 [r20+2304], f1644; +st.shared.f32 [r20+2432], f1654; +st.shared.f32 [r20+2560], f1664; +st.shared.f32 [r20+2688], f1674; +st.shared.f32 [r20+2816], f1684; +st.shared.f32 [r20+2944], f1694; +st.shared.f32 [r20+3072], f1704; +st.shared.f32 [r20+3200], f1714; +st.shared.f32 [r20+3328], f1724; +st.shared.f32 [r20+3456], f1734; +st.shared.f32 [r20+3584], f1744; +st.shared.f32 [r20+3712], f1754; +st.shared.f32 [r20+3840], f1764; +st.shared.f32 [r20+3968], f1774; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+512]; +ld.shared.f32 f1779, [r21+1024]; +ld.shared.f32 f1780, [r21+1536]; +ld.shared.f32 f1781, [r21+2048]; +ld.shared.f32 f1782, [r21+2560]; +ld.shared.f32 f1783, [r21+3072]; +ld.shared.f32 f1784, [r21+3584]; +ld.shared.f32 f1785, [r21+4096]; +ld.shared.f32 f1786, [r21+4608]; +ld.shared.f32 f1787, [r21+5120]; +ld.shared.f32 f1788, [r21+5632]; +ld.shared.f32 f1789, [r21+6144]; +ld.shared.f32 f1790, [r21+6656]; +ld.shared.f32 f1791, [r21+7168]; +ld.shared.f32 f1792, [r21+7680]; +ld.shared.f32 f1793, [r21+8192]; +ld.shared.f32 f1794, [r21+8704]; +ld.shared.f32 f1795, [r21+9216]; +ld.shared.f32 f1796, [r21+9728]; +ld.shared.f32 f1797, [r21+10240]; +ld.shared.f32 f1798, [r21+10752]; +ld.shared.f32 f1799, [r21+11264]; +ld.shared.f32 f1800, [r21+11776]; +ld.shared.f32 f1801, [r21+12288]; +ld.shared.f32 f1802, [r21+12800]; +ld.shared.f32 f1803, [r21+13312]; +ld.shared.f32 f1804, [r21+13824]; +ld.shared.f32 f1805, [r21+14336]; +ld.shared.f32 f1806, [r21+14848]; +ld.shared.f32 f1807, [r21+15360]; +ld.shared.f32 f1808, [r21+15872]; +barrier.sync 0; +st.shared.f32 [r20], f2108; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], 
f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+512]; +ld.shared.f32 f1811, [r21+1024]; +ld.shared.f32 f1812, [r21+1536]; +ld.shared.f32 f1813, [r21+2048]; +ld.shared.f32 f1814, [r21+2560]; +ld.shared.f32 f1815, [r21+3072]; +ld.shared.f32 f1816, [r21+3584]; +ld.shared.f32 f1817, [r21+4096]; +ld.shared.f32 f1818, [r21+4608]; +ld.shared.f32 f1819, [r21+5120]; +ld.shared.f32 f1820, [r21+5632]; +ld.shared.f32 f1821, [r21+6144]; +ld.shared.f32 f1822, [r21+6656]; +ld.shared.f32 f1823, [r21+7168]; +ld.shared.f32 f1824, [r21+7680]; +ld.shared.f32 f1825, [r21+8192]; +ld.shared.f32 f1826, [r21+8704]; +ld.shared.f32 f1827, [r21+9216]; +ld.shared.f32 f1828, [r21+9728]; +ld.shared.f32 f1829, [r21+10240]; +ld.shared.f32 f1830, [r21+10752]; +ld.shared.f32 f1831, [r21+11264]; +ld.shared.f32 f1832, [r21+11776]; +ld.shared.f32 f1833, [r21+12288]; +ld.shared.f32 f1834, [r21+12800]; +ld.shared.f32 f1835, [r21+13312]; +ld.shared.f32 f1836, [r21+13824]; +ld.shared.f32 f1837, [r21+14336]; +ld.shared.f32 f1838, [r21+14848]; +ld.shared.f32 f1839, [r21+15360]; 
+ld.shared.f32 f1840, [r21+15872]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2005, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2004, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1778, f1794; +sub.f32 f1851, f1778, f1794; +add.f32 f2003, f1810, f1826; +sub.f32 f1852, f1810, f1826; +add.f32 f1853, f1786, f1802; +sub.f32 f1855, f1786, f1802; +add.f32 f2002, f1818, f1834; +sub.f32 f1856, f1818, f1834; +add.f32 f1857, f1779, f1795; +sub.f32 f1859, f1779, f1795; +add.f32 f2001, f1811, f1827; +sub.f32 f1860, f1811, f1827; +add.f32 f1861, f1787, f1803; +sub.f32 f1863, f1787, f1803; +add.f32 f2000, f1819, f1835; +sub.f32 f1864, f1819, f1835; +add.f32 f1865, f1780, f1796; +sub.f32 f1867, f1780, f1796; +add.f32 f1999, f1812, f1828; +sub.f32 f1868, f1812, f1828; +add.f32 f1869, f1788, f1804; +sub.f32 f1871, f1788, f1804; +add.f32 f1998, f1820, f1836; +sub.f32 f1872, f1820, f1836; +add.f32 f1873, f1781, f1797; +sub.f32 f1875, f1781, f1797; +add.f32 f1997, f1813, f1829; +sub.f32 f1876, f1813, f1829; +add.f32 f1877, f1789, f1805; +sub.f32 f1879, f1789, f1805; +add.f32 f1996, f1821, f1837; +sub.f32 f1880, f1821, f1837; +add.f32 f1881, f1782, f1798; +sub.f32 f1883, f1782, f1798; +add.f32 f1995, f1814, f1830; +sub.f32 f1884, f1814, f1830; +add.f32 f1885, f1790, f1806; +sub.f32 f1887, f1790, f1806; +add.f32 f1994, f1822, f1838; +sub.f32 f1888, f1822, f1838; +add.f32 f1889, f1783, f1799; +sub.f32 f1891, f1783, f1799; +add.f32 f1993, f1815, f1831; +sub.f32 f1892, f1815, f1831; +add.f32 f1893, f1791, f1807; +sub.f32 f1895, f1791, f1807; +add.f32 f1992, f1823, f1839; +sub.f32 f1896, f1823, f1839; +add.f32 f1897, f1784, f1800; +sub.f32 f1899, f1784, f1800; +add.f32 f1991, f1816, f1832; +sub.f32 f1900, f1816, f1832; +add.f32 f1901, f1792, f1808; +sub.f32 f1903, f1792, f1808; +add.f32 f1990, f1824, f1840; +sub.f32 f1904, f1824, f1840; +add.f32 %1, f2005, f2004; +add.f32 %0, 
f1841, f1845; +add.f32 %3, f2003, f2002; +add.f32 %2, f1849, f1853; +add.f32 %5, f2001, f2000; +add.f32 %4, f1857, f1861; +add.f32 %7, f1999, f1998; +add.f32 %6, f1865, f1869; +add.f32 %8, f1873, f1877; +add.f32 %9, f1997, f1996; +add.f32 %10, f1881, f1885; +add.f32 %11, f1995, f1994; +add.f32 %13, f1993, f1992; +add.f32 %12, f1889, f1893; +add.f32 %15, f1991, f1990; +add.f32 %14, f1897, f1901; +add.f32 %16, f1843, f1848; +sub.f32 %17, f1844, f1847; +add.f32 %18, f1851, f1856; +sub.f32 %19, f1852, f1855; +sub.f32 %21, f1860, f1863; +add.f32 %20, f1859, f1864; +sub.f32 %23, f1868, f1871; +add.f32 %22, f1867, f1872; +sub.f32 %25, f1876, f1879; +add.f32 %24, f1875, f1880; +add.f32 %26, f1883, f1888; +sub.f32 %27, f1884, f1887; +add.f32 %28, f1891, f1896; +sub.f32 %29, f1892, f1895; +add.f32 %30, f1899, f1904; +sub.f32 %31, f1900, f1903; +sub.f32 %32, f1841, f1845; +sub.f32 %33, f2005, f2004; +sub.f32 %34, f1849, f1853; +sub.f32 %35, f2003, f2002; +sub.f32 %36, f1857, f1861; +sub.f32 %37, f2001, f2000; +sub.f32 %38, f1865, f1869; +sub.f32 %39, f1999, f1998; +sub.f32 %40, f1873, f1877; +sub.f32 %41, f1997, f1996; +sub.f32 %42, f1881, f1885; +sub.f32 %43, f1995, f1994; +sub.f32 %44, f1889, f1893; +sub.f32 %45, f1993, f1992; +sub.f32 %46, f1897, f1901; +sub.f32 %47, f1991, f1990; +add.f32 %49, f1844, f1847; +sub.f32 %48, f1843, f1848; +add.f32 %51, f1852, f1855; +sub.f32 %50, f1851, f1856; +add.f32 %53, f1860, f1863; +sub.f32 %52, f1859, f1864; +add.f32 %55, f1868, f1871; +sub.f32 %54, f1867, f1872; +add.f32 %57, f1876, f1879; +sub.f32 %56, f1875, f1880; +add.f32 %59, f1884, f1887; +sub.f32 %58, f1883, f1888; +add.f32 %61, f1892, f1895; +sub.f32 %60, f1891, f1896; +add.f32 %63, f1900, f1903; +sub.f32 %62, f1899, f1904; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), 
"=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), 
"f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<107, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<338>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 32736; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; 
+fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+8192]; +ld.shared.v2.f32 {f70, f71}, [r13+16384]; +ld.shared.v2.f32 {f74, f75}, [r13+24576]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 32640; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+8192]; +ld.shared.v2.f32 {f131, f132}, [r20+16384]; 
+ld.shared.v2.f32 {f135, f136}, [r20+24576]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 1008; +bfe.u32 r22, r5, 4, 6; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 32256; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; +st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+8192]; +ld.shared.v2.f32 {f192, f193}, [r27+16384]; +ld.shared.v2.f32 {f196, f197}, [r27+24576]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; 
+add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +add.f32 f210, f202, f207; +sub.f32 f211, f203, f206; +sub.f32 f212, f202, f207; +add.f32 f213, f203, f206; +and.b32 r28, r5, 960; +bfe.u32 r29, r5, 6, 4; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f214, f210; +mul.f32 f219, f215, f211; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f223, f208; +mul.f32 f227, f225, f209; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f231, f212; +mul.f32 f235, f233, f213; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 30720; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f215, f210, f220; +sub.f32 f240, f218, f219; +st.shared.v2.f32 [r33+512], {f240, f239}; +fma.rn.f32 f241, f225, f208, f228; +sub.f32 f242, f226, f227; +st.shared.v2.f32 [r33+1024], {f242, f241}; +sub.f32 f243, f234, f235; +fma.rn.f32 f244, f233, f212, f236; +st.shared.v2.f32 [r33+1536], {f243, f244}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+8192]; +ld.shared.v2.f32 {f253, f254}, [r34+16384]; +ld.shared.v2.f32 {f257, f258}, [r34+24576]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; +sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +sub.f32 f269, f261, f265; +sub.f32 f270, f262, f266; +add.f32 f271, f263, f268; +sub.f32 f272, 
f264, f267; +sub.f32 f273, f263, f268; +add.f32 f274, f264, f267; +and.b32 r35, r5, 768; +bfe.u32 r36, r5, 8, 2; +mul.wide.u32 rd15, r36, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f275, f276}, [rd17]; +mul.f32 f279, f275, f271; +mul.f32 f280, f276, f272; +mul.f32 f281, f275, f272; +mul.f32 f282, f275, f275; +mul.f32 f283, f276, f276; +sub.f32 f284, f282, f283; +mul.f32 f285, f276, f275; +fma.rn.f32 f286, f276, f275, f285; +mul.f32 f287, f284, f269; +mul.f32 f288, f286, f270; +mul.f32 f289, f284, f270; +mul.f32 f290, f275, f284; +mul.f32 f291, f276, f286; +sub.f32 f292, f290, f291; +mul.f32 f293, f275, f286; +fma.rn.f32 f294, f276, f284, f293; +mul.f32 f295, f292, f273; +mul.f32 f296, f294, f274; +mul.f32 f297, f292, f274; +and.b32 r37, r10, 2040; +add.s32 r38, r9, r37; +barrier.sync 0; +and.b32 r39, r7, 24576; +add.s32 r40, r38, r39; +add.f32 f298, f262, f266; +add.f32 f299, f261, f265; +st.shared.v2.f32 [r40], {f299, f298}; +fma.rn.f32 f300, f276, f271, f281; +sub.f32 f301, f279, f280; +st.shared.v2.f32 [r40+2048], {f301, f300}; +fma.rn.f32 f302, f286, f269, f289; +sub.f32 f303, f287, f288; +st.shared.v2.f32 [r40+4096], {f303, f302}; +sub.f32 f304, f295, f296; +fma.rn.f32 f305, f294, f273, f297; +st.shared.v2.f32 [r40+6144], {f304, f305}; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.v2.f32 {f306, f307}, [r41]; +ld.shared.v2.f32 {f310, f311}, [r41+8192]; +ld.shared.v2.f32 {f314, f315}, [r41+16384]; +ld.shared.v2.f32 {f318, f319}, [r41+24576]; +add.f32 f322, f306, f314; +add.f32 f323, f307, f315; +sub.f32 f324, f306, f314; +sub.f32 f325, f307, f315; +add.f32 f326, f310, f318; +add.f32 f327, f311, f319; +sub.f32 f328, f310, f318; +sub.f32 f329, f311, f319; +add.f32 %1, f323, f327; +add.f32 %0, f322, f326; +sub.f32 %3, f325, f328; +add.f32 %2, f324, f329; +sub.f32 %5, f323, f327; +sub.f32 %4, f322, f326; +add.f32 %7, f325, f328; +sub.f32 %6, f324, f329; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), 
"=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<108, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<298>; +.reg .b32 r<43>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16368; +add.s32 r12, r10, 
r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+4096]; +ld.shared.f32 f64, [r13+8192]; +ld.shared.f32 f65, [r13+12288]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+4096]; +ld.shared.f32 f68, [r13+8192]; +ld.shared.f32 f69, [r13+12288]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 16320; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+4096]; 
+ld.shared.f32 f117, [r21+8192]; +ld.shared.f32 f118, [r21+12288]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+4096]; +ld.shared.f32 f121, [r21+8192]; +ld.shared.f32 f122, [r21+12288]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 1008; +bfe.u32 r23, r5, 4, 6; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 16128; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, 
[r28+4096]; +ld.shared.f32 f170, [r28+8192]; +ld.shared.f32 f171, [r28+12288]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+4096]; +ld.shared.f32 f174, [r28+8192]; +ld.shared.f32 f175, [r28+12288]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +add.f32 f188, f178, f183; +sub.f32 f189, f179, f182; +sub.f32 f190, f178, f183; +add.f32 f191, f179, f182; +and.b32 r29, r5, 960; +bfe.u32 r30, r5, 6, 4; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f192, f188; +mul.f32 f197, f193, f189; +sub.f32 f198, f196, f197; +mul.f32 f199, f192, f189; +fma.rn.f32 f200, f193, f188, f199; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f203, f186; +mul.f32 f207, f205, f187; +sub.f32 f208, f206, f207; +mul.f32 f209, f203, f187; +fma.rn.f32 f210, f205, f186, f209; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f213, f190; +mul.f32 f217, f215, f191; +sub.f32 f218, f216, f217; +mul.f32 f219, f213, f191; +fma.rn.f32 f220, f215, f190, f219; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 15360; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f198; +st.shared.f32 [r34+512], f208; +st.shared.f32 [r34+768], f218; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; 
+ld.shared.f32 f222, [r35+4096]; +ld.shared.f32 f223, [r35+8192]; +ld.shared.f32 f224, [r35+12288]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+4096]; +ld.shared.f32 f227, [r35+8192]; +ld.shared.f32 f228, [r35+12288]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 f237, f229, f233; +add.f32 f238, f230, f234; +sub.f32 f239, f229, f233; +sub.f32 f240, f230, f234; +add.f32 f241, f231, f236; +sub.f32 f242, f232, f235; +sub.f32 f243, f231, f236; +add.f32 f244, f232, f235; +and.b32 r36, r5, 768; +bfe.u32 r37, r5, 8, 2; +mul.wide.u32 rd15, r37, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f245, f246}, [rd17]; +mul.f32 f249, f245, f241; +mul.f32 f250, f246, f242; +sub.f32 f251, f249, f250; +mul.f32 f252, f245, f242; +fma.rn.f32 f253, f246, f241, f252; +mul.f32 f254, f245, f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f256, f239; +mul.f32 f260, f258, f240; +sub.f32 f261, f259, f260; +mul.f32 f262, f256, f240; +fma.rn.f32 f263, f258, f239, f262; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f266, f243; +mul.f32 f270, f268, f244; +sub.f32 f271, f269, f270; +mul.f32 f272, f266, f244; +fma.rn.f32 f273, f268, f243, f272; +and.b32 r38, r16, 1020; +add.s32 r39, r10, r38; +barrier.sync 0; +and.b32 r40, r8, 12288; +add.s32 r41, r39, r40; +st.shared.f32 [r41], f237; +st.shared.f32 [r41+1024], f251; +st.shared.f32 [r41+2048], f261; +st.shared.f32 [r41+3072], f271; +barrier.sync 0; +mad.lo.s32 r42, r36, -12, r41; 
+ld.shared.f32 f274, [r42]; +ld.shared.f32 f275, [r42+4096]; +ld.shared.f32 f276, [r42+8192]; +ld.shared.f32 f277, [r42+12288]; +barrier.sync 0; +st.shared.f32 [r41], f238; +st.shared.f32 [r41+1024], f253; +st.shared.f32 [r41+2048], f263; +st.shared.f32 [r41+3072], f273; +barrier.sync 0; +ld.shared.f32 f278, [r42]; +ld.shared.f32 f279, [r42+4096]; +ld.shared.f32 f280, [r42+8192]; +ld.shared.f32 f281, [r42+12288]; +add.f32 f282, f274, f276; +add.f32 f283, f278, f280; +sub.f32 f284, f274, f276; +sub.f32 f285, f278, f280; +add.f32 f286, f275, f277; +add.f32 f287, f279, f281; +sub.f32 f288, f275, f277; +sub.f32 f289, f279, f281; +add.f32 %0, f282, f286; +add.f32 %1, f283, f287; +sub.f32 %3, f285, f288; +add.f32 %2, f284, f289; +sub.f32 %4, f282, f286; +sub.f32 %5, f283, f287; +add.f32 %7, f285, f288; +sub.f32 %6, f284, f289; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..8e81d7a4a8345 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp32_inv.hpp.inc @@ -0,0 +1,7584 @@ +#ifndef CUFFTDX_FFT_4096_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_4096_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<303, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ 
+.reg .f32 f<952>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; 
+sub.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 
f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; 
+fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 
f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16320; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+1024]; +ld.shared.f32 f391, [r13+2048]; +ld.shared.f32 f392, [r13+3072]; +ld.shared.f32 f393, [r13+4096]; +ld.shared.f32 f394, [r13+5120]; +ld.shared.f32 f395, [r13+6144]; +ld.shared.f32 f396, 
[r13+7168]; +ld.shared.f32 f397, [r13+8192]; +ld.shared.f32 f398, [r13+9216]; +ld.shared.f32 f399, [r13+10240]; +ld.shared.f32 f400, [r13+11264]; +ld.shared.f32 f401, [r13+12288]; +ld.shared.f32 f402, [r13+13312]; +ld.shared.f32 f403, [r13+14336]; +ld.shared.f32 f404, [r13+15360]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+1024]; +ld.shared.f32 f407, [r13+2048]; +ld.shared.f32 f408, [r13+3072]; +ld.shared.f32 f409, [r13+4096]; +ld.shared.f32 f410, [r13+5120]; +ld.shared.f32 f411, [r13+6144]; +ld.shared.f32 f412, [r13+7168]; +ld.shared.f32 f413, [r13+8192]; +ld.shared.f32 f414, [r13+9216]; +ld.shared.f32 f415, [r13+10240]; +ld.shared.f32 f416, [r13+11264]; +ld.shared.f32 f417, [r13+12288]; +ld.shared.f32 f418, [r13+13312]; +ld.shared.f32 f419, [r13+14336]; +ld.shared.f32 f420, [r13+15360]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 
f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 
f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; +sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; +sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, 
f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 240; +bfe.u32 r15, r5, 4, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; +mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f593, f659; +fma.rn.f32 f661, f657, 
f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; +fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f591, f729; 
+fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 15360; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+1024]; +ld.shared.f32 f747, [r21+2048]; +ld.shared.f32 f748, [r21+3072]; +ld.shared.f32 f749, [r21+4096]; +ld.shared.f32 f750, [r21+5120]; +ld.shared.f32 f751, [r21+6144]; +ld.shared.f32 f752, [r21+7168]; +ld.shared.f32 f753, [r21+8192]; +ld.shared.f32 f754, [r21+9216]; +ld.shared.f32 f755, [r21+10240]; +ld.shared.f32 f756, [r21+11264]; +ld.shared.f32 f757, [r21+12288]; +ld.shared.f32 f758, [r21+13312]; +ld.shared.f32 f759, [r21+14336]; +ld.shared.f32 f760, [r21+15360]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; 
+st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+1024]; +ld.shared.f32 f763, [r21+2048]; +ld.shared.f32 f764, [r21+3072]; +ld.shared.f32 f765, [r21+4096]; +ld.shared.f32 f766, [r21+5120]; +ld.shared.f32 f767, [r21+6144]; +ld.shared.f32 f768, [r21+7168]; +ld.shared.f32 f769, [r21+8192]; +ld.shared.f32 f770, [r21+9216]; +ld.shared.f32 f771, [r21+10240]; +ld.shared.f32 f772, [r21+11264]; +ld.shared.f32 f773, [r21+12288]; +ld.shared.f32 f774, [r21+13312]; +ld.shared.f32 f775, [r21+14336]; +ld.shared.f32 f776, [r21+15360]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +sub.f32 f789, f779, f784; +add.f32 f790, f780, f783; +add.f32 f791, f779, f784; +sub.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +sub.f32 f805, f795, f800; +add.f32 f806, f796, f799; +add.f32 f807, f795, f800; +sub.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0f3F3504F3; +sub.f32 f811, f809, f810; +add.f32 f812, f809, f810; +mul.f32 f813, f807, 0fBF3504F3; +mul.f32 f814, f808, 0f3F3504F3; +sub.f32 f815, f813, f814; +mul.f32 f816, f808, 0fBF3504F3; +fma.rn.f32 f817, f807, 0f3F3504F3, f816; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; 
+add.f32 f823, f790, f812; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f812; +sub.f32 f826, f787, f804; +add.f32 f827, f788, f803; +add.f32 f828, f787, f804; +sub.f32 f829, f788, f803; +add.f32 f830, f791, f815; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f815; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; +add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +sub.f32 f846, f836, f841; +add.f32 f847, f837, f840; +add.f32 f848, f836, f841; +sub.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +sub.f32 f862, f852, f857; +add.f32 f863, f853, f856; +add.f32 f864, f852, f857; +sub.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0f3F3504F3; +sub.f32 f868, f866, f867; +add.f32 f869, f866, f867; +mul.f32 f870, f864, 0fBF3504F3; +mul.f32 f871, f865, 0f3F3504F3; +sub.f32 f872, f870, f871; +mul.f32 f873, f865, 0fBF3504F3; +fma.rn.f32 f874, f864, 0f3F3504F3, f873; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, f846, f868; +add.f32 f880, f847, f869; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f869; +sub.f32 f883, f844, f861; +add.f32 f884, f845, f860; +add.f32 f885, f844, f861; +sub.f32 f886, f845, f860; +add.f32 f887, f848, f872; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f872; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0f3EC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 
0f3F6C835E; +fma.rn.f32 f895, f879, 0f3EC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0f3F3504F3; +sub.f32 f898, f896, f897; +add.f32 f899, f896, f897; +mul.f32 f900, f887, 0f3EC3EF15; +mul.f32 f901, f888, 0f3F6C835E; +sub.f32 f902, f900, f901; +mul.f32 f903, f888, 0f3EC3EF15; +fma.rn.f32 f904, f887, 0f3F6C835E, f903; +mul.f32 f905, f881, 0fBEC3EF15; +mul.f32 f906, f882, 0f3F6C835E; +sub.f32 f907, f905, f906; +mul.f32 f908, f882, 0fBEC3EF15; +fma.rn.f32 f909, f881, 0f3F6C835E, f908; +mul.f32 f910, f885, 0fBF3504F3; +mul.f32 f911, f886, 0f3F3504F3; +sub.f32 f912, f910, f911; +mul.f32 f913, f886, 0fBF3504F3; +fma.rn.f32 f914, f885, 0f3F3504F3, f913; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0f3EC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0f3EC3EF15, f918; +add.f32 %0, f818, f875; +add.f32 %1, f819, f876; +add.f32 %3, f823, f895; +add.f32 %2, f822, f893; +add.f32 %5, f827, f899; +add.f32 %4, f826, f898; +add.f32 %7, f831, f904; +add.f32 %6, f830, f902; +add.f32 %9, f821, f877; +sub.f32 %8, f820, f878; +add.f32 %11, f825, f909; +add.f32 %10, f824, f907; +add.f32 %13, f829, f914; +add.f32 %12, f828, f912; +add.f32 %15, f833, f919; +add.f32 %14, f832, f917; +sub.f32 %16, f818, f875; +sub.f32 %17, f819, f876; +sub.f32 %19, f823, f895; +sub.f32 %18, f822, f893; +sub.f32 %21, f827, f899; +sub.f32 %20, f826, f898; +sub.f32 %23, f831, f904; +sub.f32 %22, f830, f902; +sub.f32 %25, f821, f877; +add.f32 %24, f820, f878; +sub.f32 %27, f825, f909; +sub.f32 %26, f824, f907; +sub.f32 %29, f829, f914; +sub.f32 %28, f828, f912; +sub.f32 %31, f833, f919; +sub.f32 %30, f832, f917; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), 
"=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<305, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<516>; +.reg .b32 r<28>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; 
+add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; 
+mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16352; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+2048]; +ld.shared.f32 f161, [r13+4096]; +ld.shared.f32 f162, [r13+6144]; +ld.shared.f32 f163, [r13+8192]; +ld.shared.f32 f164, [r13+10240]; +ld.shared.f32 f165, [r13+12288]; +ld.shared.f32 f166, [r13+14336]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+2048]; +ld.shared.f32 f169, [r13+4096]; +ld.shared.f32 f170, [r13+6144]; +ld.shared.f32 f171, [r13+8192]; +ld.shared.f32 f172, [r13+10240]; +ld.shared.f32 f173, [r13+12288]; +ld.shared.f32 f174, [r13+14336]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; 
+add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 504; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; 
+mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 16128; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+2048]; +ld.shared.f32 f303, [r20+4096]; +ld.shared.f32 f304, [r20+6144]; +ld.shared.f32 f305, [r20+8192]; +ld.shared.f32 f306, [r20+10240]; +ld.shared.f32 f307, [r20+12288]; +ld.shared.f32 f308, [r20+14336]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], 
f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+2048]; +ld.shared.f32 f311, [r20+4096]; +ld.shared.f32 f312, [r20+6144]; +ld.shared.f32 f313, [r20+8192]; +ld.shared.f32 f314, [r20+10240]; +ld.shared.f32 f315, [r20+12288]; +ld.shared.f32 f316, [r20+14336]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +sub.f32 f329, f319, f324; +add.f32 f330, f320, f323; +add.f32 f331, f319, f324; +sub.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +sub.f32 f345, f335, f340; +add.f32 f346, f336, f339; +add.f32 f347, f335, f340; +sub.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0f3F3504F3; +sub.f32 f351, f349, f350; +add.f32 f352, f349, f350; +mul.f32 f353, f347, 0fBF3504F3; +mul.f32 f354, f348, 0f3F3504F3; +sub.f32 f355, f353, f354; +mul.f32 f356, f348, 0fBF3504F3; +fma.rn.f32 f357, f347, 0f3F3504F3, f356; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f352; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f352; +sub.f32 f366, f327, f344; +add.f32 f367, f328, f343; +add.f32 f368, f327, f344; +sub.f32 f369, f328, f343; +add.f32 f370, f331, f355; +add.f32 f371, f332, 
f357; +sub.f32 f372, f331, f355; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 448; +bfe.u32 r22, r5, 6, 3; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f363, f375; +fma.rn.f32 f379, f374, f362, f378; +mul.f32 f380, f362, f375; +mul.f32 f381, f374, f363; +sub.f32 f382, f381, f380; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f367, f387; +fma.rn.f32 f389, f385, f366, f388; +mul.f32 f390, f366, f387; +mul.f32 f391, f385, f367; +sub.f32 f392, f391, f390; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f371, f397; +fma.rn.f32 f399, f395, f370, f398; +mul.f32 f400, f370, f397; +mul.f32 f401, f395, f371; +sub.f32 f402, f401, f400; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f361, f407; +fma.rn.f32 f409, f405, f360, f408; +mul.f32 f410, f360, f407; +mul.f32 f411, f405, f361; +sub.f32 f412, f411, f410; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f365, f417; +fma.rn.f32 f419, f415, f364, f418; +mul.f32 f420, f364, f417; +mul.f32 f421, f415, f365; +sub.f32 f422, f421, f420; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f369, f427; +fma.rn.f32 f429, f425, f368, f428; +mul.f32 f430, f368, f427; +mul.f32 f431, f425, f369; +sub.f32 f432, f431, f430; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f373, f437; +fma.rn.f32 f439, f435, 
f372, f438; +mul.f32 f440, f372, f437; +mul.f32 f441, f435, f373; +sub.f32 f442, f441, f440; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 14336; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f379; +st.shared.f32 [r26+512], f389; +st.shared.f32 [r26+768], f399; +st.shared.f32 [r26+1024], f409; +st.shared.f32 [r26+1280], f419; +st.shared.f32 [r26+1536], f429; +st.shared.f32 [r26+1792], f439; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+2048]; +ld.shared.f32 f445, [r27+4096]; +ld.shared.f32 f446, [r27+6144]; +ld.shared.f32 f447, [r27+8192]; +ld.shared.f32 f448, [r27+10240]; +ld.shared.f32 f449, [r27+12288]; +ld.shared.f32 f450, [r27+14336]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+2048]; +ld.shared.f32 f453, [r27+4096]; +ld.shared.f32 f454, [r27+6144]; +ld.shared.f32 f455, [r27+8192]; +ld.shared.f32 f456, [r27+10240]; +ld.shared.f32 f457, [r27+12288]; +ld.shared.f32 f458, [r27+14336]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; +sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f459, f463; +add.f32 f468, f460, f464; +sub.f32 f469, f459, f463; +sub.f32 f470, f460, f464; +sub.f32 f471, f461, f466; +add.f32 f472, f462, f465; +add.f32 f473, f461, f466; +sub.f32 f474, f462, f465; +add.f32 f475, f444, f448; +add.f32 f476, f452, f456; +sub.f32 f477, f444, f448; +sub.f32 f478, f452, f456; +add.f32 f479, f446, f450; +add.f32 f480, f454, f458; +sub.f32 f481, f446, f450; +sub.f32 f482, f454, f458; +add.f32 f483, f475, 
f479; +add.f32 f484, f476, f480; +sub.f32 f485, f475, f479; +sub.f32 f486, f476, f480; +sub.f32 f487, f477, f482; +add.f32 f488, f478, f481; +add.f32 f489, f477, f482; +sub.f32 f490, f478, f481; +mul.f32 f491, f487, 0f3F3504F3; +mul.f32 f492, f488, 0f3F3504F3; +sub.f32 f493, f491, f492; +add.f32 f494, f491, f492; +mul.f32 f495, f489, 0fBF3504F3; +mul.f32 f496, f490, 0f3F3504F3; +sub.f32 f497, f495, f496; +mul.f32 f498, f490, 0fBF3504F3; +fma.rn.f32 f499, f489, 0f3F3504F3, f498; +add.f32 %0, f467, f483; +add.f32 %1, f468, f484; +add.f32 %3, f472, f494; +add.f32 %2, f471, f493; +add.f32 %5, f470, f485; +sub.f32 %4, f469, f486; +add.f32 %7, f474, f499; +add.f32 %6, f473, f497; +sub.f32 %8, f467, f483; +sub.f32 %9, f468, f484; +sub.f32 %11, f472, f494; +sub.f32 %10, f471, f493; +sub.f32 %13, f470, f485; +add.f32 %12, f469, f486; +sub.f32 %15, f474, f499; +sub.f32 %14, f473, f497; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<306, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1224>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1216, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, 
%43, %59; +add.f32 f1214, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1213, f1216, f1214; +sub.f32 f76, f1216, f1214; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1212, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1209, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1207, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1206, f1209, f1207; +sub.f32 f92, f1209, f1207; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1205, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1205, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1203, f95, 0fBF3504F3; +mul.f32 f1204, f96, 0f3F3504F3; +sub.f32 f103, f1203, f1204; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1202, f1213, f1206; +sub.f32 f109, f1213, f1206; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1201, f1212, f100; +sub.f32 f113, f1212, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1200, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1199, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1197, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1194, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1193, f1197, f1194; +sub.f32 f133, f1197, f1194; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1192, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1190, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1188, %76, %66; +sub.f32 f145, %76, %66; 
+add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1187, f1190, f1188; +sub.f32 f149, f1190, f1188; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1186, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f1186, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1184, f152, 0fBF3504F3; +mul.f32 f1185, f153, 0f3F3504F3; +sub.f32 f160, f1184, f1185; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1183, f1193, f1187; +sub.f32 f166, f1193, f1187; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1182, f1192, f157; +sub.f32 f170, f1192, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1181, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1180, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1178, f167, 0f3F6C835E; +mul.f32 f1179, f1182, 0f3EC3EF15; +sub.f32 f181, f1178, f1179; +mul.f32 f182, f1182, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1181, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1180, 0f3F6C835E; +mul.f32 f1177, f175, 0f3EC3EF15; +sub.f32 f190, f1177, f189; +mul.f32 f191, f1180, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f1176, f169, 0fBEC3EF15; +sub.f32 f195, f1176, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1174, f173, 0fBF3504F3; +mul.f32 f1175, f174, 0f3F3504F3; +sub.f32 f200, f1174, f1175; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f1172, f177, 0fBF6C835E; +mul.f32 f1173, f178, 0f3EC3EF15; +sub.f32 f205, f1172, f1173; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; 
+add.f32 f1171, f1201, f183; +sub.f32 f213, f1201, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1170, f1200, f187; +sub.f32 f217, f1200, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f1169, f1199, f192; +sub.f32 f221, f1199, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f1168, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f1167, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f1166, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1165, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f1171, f239; +mul.f32 f244, f238, f1171; +mul.f32 f246, f239, f239; +mul.f32 f1164, f238, f238; +sub.f32 f247, f1164, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f1170, f249; +mul.f32 f252, f247, f1170; +mul.f32 f1162, f238, f247; +mul.f32 f1163, f239, f249; +sub.f32 f255, f1162, f1163; +mul.f32 f1161, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f1169, f257; +mul.f32 f260, f255, f1169; +mul.f32 f262, f239, f257; +mul.f32 f1160, f238, f255; +sub.f32 f263, f1160, f262; +mul.f32 f1159, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f1168, f265; +mul.f32 f268, f263, f1168; +mul.f32 f270, f239, f265; +mul.f32 f1158, f238, f263; +sub.f32 f271, f1158, f270; +mul.f32 f1157, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f1167, f273; +mul.f32 f276, f271, f1167; +mul.f32 f1155, f238, f271; +mul.f32 f1156, f239, f273; +sub.f32 f279, f1155, f1156; +mul.f32 
f1154, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f1166, f281; +mul.f32 f284, f279, f1166; +mul.f32 f286, f239, f281; +mul.f32 f1153, f238, f279; +sub.f32 f287, f1153, f286; +mul.f32 f1152, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f1165, f289; +mul.f32 f292, f287, f1165; +mul.f32 f294, f239, f289; +mul.f32 f1151, f238, f287; +sub.f32 f295, f1151, f294; +mul.f32 f1150, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1149, f1202, f1183; +mul.f32 f298, f1149, f297; +mul.f32 f300, f295, f1149; +mul.f32 f1147, f238, f295; +mul.f32 f1148, f239, f297; +sub.f32 f303, f1147, f1148; +sub.f32 f1146, f106, f163; +mul.f32 f1145, f1146, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1144, f238, f303; +sub.f32 f311, f1144, f310; +mul.f32 f1143, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f1141, f238, f311; +mul.f32 f1142, f239, f313; +sub.f32 f319, f1141, f1142; +mul.f32 f1140, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1139, f238, f319; +sub.f32 f327, f1139, f326; +mul.f32 f1138, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1137, f238, f327; +sub.f32 f335, f1137, f334; +mul.f32 f1136, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f1134, f238, f335; +mul.f32 f1135, f239, f337; +sub.f32 f343, f1134, f1135; +mul.f32 f1133, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, 
f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1132, f238, f343; +sub.f32 f351, f1132, f350; +mul.f32 f1131, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f1130, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 32640; +add.s32 r12, r9, r11; +add.f32 f357, f1202, f1183; +sub.f32 f1218, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r32, %tid.x; +shl.b32 r27, r32, 7; +shl.b32 r26, r32, 3; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f1130; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f1161; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f1159; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f1157; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f1154; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f1152; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f1150; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1218, f298; +sub.f32 f374, f300, f1145; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f1143; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f1140; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f1138; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f1136; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f1133; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f1131; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r21, r32, 255; +mad.lo.s32 r13, r21, -120, r12; 
+ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+2048]; +ld.shared.v2.f32 {f397, f398}, [r13+4096]; +ld.shared.v2.f32 {f401, f402}, [r13+6144]; +ld.shared.v2.f32 {f405, f406}, [r13+8192]; +ld.shared.v2.f32 {f409, f410}, [r13+10240]; +ld.shared.v2.f32 {f413, f414}, [r13+12288]; +ld.shared.v2.f32 {f417, f418}, [r13+14336]; +ld.shared.v2.f32 {f421, f422}, [r13+16384]; +ld.shared.v2.f32 {f425, f426}, [r13+18432]; +ld.shared.v2.f32 {f429, f430}, [r13+20480]; +ld.shared.v2.f32 {f433, f434}, [r13+22528]; +ld.shared.v2.f32 {f437, f438}, [r13+24576]; +ld.shared.v2.f32 {f441, f442}, [r13+26624]; +ld.shared.v2.f32 {f445, f446}, [r13+28672]; +ld.shared.v2.f32 {f449, f450}, [r13+30720]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1129, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1128, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1127, f1129, f1128; +sub.f32 f464, f1129, f1128; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f1126, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1125, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1124, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1123, f1125, f1124; +sub.f32 f480, f1125, f1124; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f1122, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f1122, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f1121, f483, 0fBF3504F3; +sub.f32 f491, f1121, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1120, f1127, f1123; +sub.f32 f497, f1127, f1123; 
+add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1119, f1126, f488; +sub.f32 f501, f1126, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f1118, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f1117, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1116, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1115, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1114, f1116, f1115; +sub.f32 f521, f1116, f1115; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f1113, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1112, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1111, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1110, f1112, f1111; +sub.f32 f537, f1112, f1111; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f1109, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f1109, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f1108, f540, 0fBF3504F3; +sub.f32 f548, f1108, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1107, f1114, f1110; +sub.f32 f554, f1114, f1110; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1106, f1113, f545; +sub.f32 f558, f1113, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f1105, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f1104, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1106, 0f3EC3EF15; +mul.f32 f1103, f555, 
0f3F6C835E; +sub.f32 f569, f1103, f568; +mul.f32 f570, f1106, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f1105, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f1104, 0f3F6C835E; +mul.f32 f1102, f563, 0f3EC3EF15; +sub.f32 f578, f1102, f577; +mul.f32 f579, f1104, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f1101, f557, 0fBEC3EF15; +sub.f32 f583, f1101, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f1100, f561, 0fBF3504F3; +sub.f32 f588, f1100, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f1099, f565, 0fBF6C835E; +sub.f32 f593, f1099, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1098, f1119, f571; +sub.f32 f601, f1119, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1097, f1118, f575; +sub.f32 f605, f1118, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f1096, f1117, f580; +sub.f32 f609, f1117, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f1095, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f1094, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 f1093, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1092, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r32, 240; +bfe.u32 r15, r32, 4, 4; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f1098, f627; +mul.f32 f632, f626, f1098; +mul.f32 f634, f627, f627; +mul.f32 f1091, f626, f626; +sub.f32 f635, f1091, f634; +mul.f32 f636, 
f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f638, f1097, f637; +mul.f32 f640, f635, f1097; +mul.f32 f1089, f626, f635; +mul.f32 f1090, f627, f637; +sub.f32 f643, f1089, f1090; +mul.f32 f1088, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f1096, f645; +mul.f32 f648, f643, f1096; +mul.f32 f650, f627, f645; +mul.f32 f1087, f626, f643; +sub.f32 f651, f1087, f650; +mul.f32 f1086, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f1095, f653; +mul.f32 f656, f651, f1095; +mul.f32 f658, f627, f653; +mul.f32 f1085, f626, f651; +sub.f32 f659, f1085, f658; +mul.f32 f1084, f610, f653; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f1094, f661; +mul.f32 f664, f659, f1094; +mul.f32 f1082, f626, f659; +mul.f32 f1083, f627, f661; +sub.f32 f667, f1082, f1083; +mul.f32 f1081, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f1093, f669; +mul.f32 f672, f667, f1093; +mul.f32 f674, f627, f669; +mul.f32 f1080, f626, f667; +sub.f32 f675, f1080, f674; +mul.f32 f1079, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f1092, f677; +mul.f32 f680, f675, f1092; +mul.f32 f682, f627, f677; +mul.f32 f1078, f626, f675; +sub.f32 f683, f1078, f682; +mul.f32 f1077, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1076, f1120, f1107; +mul.f32 f686, f1076, f685; +mul.f32 f688, f683, f1076; +mul.f32 f1074, f626, f683; +mul.f32 f1075, f627, f685; +sub.f32 f691, f1074, f1075; +sub.f32 f1073, f494, f551; +mul.f32 f1072, f1073, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1071, f626, f691; +sub.f32 f699, f1071, f698; +mul.f32 f1070, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, 
f605; +mul.f32 f1068, f626, f699; +mul.f32 f1069, f627, f701; +sub.f32 f707, f1068, f1069; +mul.f32 f1067, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1066, f626, f707; +sub.f32 f715, f1066, f714; +mul.f32 f1065, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1064, f626, f715; +sub.f32 f723, f1064, f722; +mul.f32 f1063, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f1061, f626, f723; +mul.f32 f1062, f627, f725; +sub.f32 f731, f1061, f1062; +mul.f32 f1060, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1059, f626, f731; +sub.f32 f739, f1059, f738; +mul.f32 f1058, f620, f733; +mul.f32 f740, f626, f733; +mul.f32 f1057, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r26, 120; +add.s32 r17, r9, r16; +sub.f32 f1220, f1120, f1107; +mul.f32 f1219, f683, f1220; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +barrier.sync 0; +and.b32 r18, r30, 30720; +add.s32 r19, r17, r18; +sub.f32 f1222, f1120, f1107; +mul.f32 f1221, f683, f1222; +add.f32 f745, f1120, f1107; +mov.u32 r25, %tid.x; +and.b32 r24, r25, 240; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r29, %tid.x; +and.b32 r28, r29, 240; +fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f1057; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f1088; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f1086; +st.shared.v2.f32 [r19+384], {f751, f752}; 
+fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f1084; +st.shared.v2.f32 [r19+512], {f753, f754}; +sub.f32 f755, f664, f1081; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f1079; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f1077; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1073, f686; +sub.f32 f762, f1221, f1072; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f1070; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f1067; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f1065; +st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f1063; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f1060; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, f1058; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +mad.lo.s32 r20, r28, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+2048]; +ld.shared.v2.f32 {f785, f786}, [r20+4096]; +ld.shared.v2.f32 {f789, f790}, [r20+6144]; +ld.shared.v2.f32 {f793, f794}, [r20+8192]; +ld.shared.v2.f32 {f797, f798}, [r20+10240]; +ld.shared.v2.f32 {f801, f802}, [r20+12288]; +ld.shared.v2.f32 {f805, f806}, [r20+14336]; +ld.shared.v2.f32 {f809, f810}, [r20+16384]; +ld.shared.v2.f32 {f813, f814}, [r20+18432]; +ld.shared.v2.f32 {f817, f818}, [r20+20480]; +ld.shared.v2.f32 {f821, f822}, [r20+22528]; +ld.shared.v2.f32 {f825, f826}, [r20+24576]; +ld.shared.v2.f32 {f829, f830}, [r20+26624]; +ld.shared.v2.f32 {f833, f834}, 
[r20+28672]; +ld.shared.v2.f32 {f837, f838}, [r20+30720]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f1056, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1055, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1054, f1056, f1055; +sub.f32 f852, f1056, f1055; +sub.f32 f853, f843, f848; +add.f32 f855, f843, f848; +add.f32 f1053, f844, f847; +sub.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1052, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1051, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1050, f1052, f1051; +sub.f32 f868, f1052, f1051; +sub.f32 f869, f859, f864; +add.f32 f871, f859, f864; +add.f32 f1049, f860, f863; +sub.f32 f872, f860, f863; +mul.f32 f873, f869, 0f3F3504F3; +mul.f32 f874, f1049, 0f3F3504F3; +sub.f32 f875, f873, f874; +add.f32 f876, f873, f874; +mul.f32 f878, f872, 0f3F3504F3; +mul.f32 f1048, f871, 0fBF3504F3; +sub.f32 f879, f1048, f878; +mul.f32 f880, f872, 0fBF3504F3; +fma.rn.f32 f881, f871, 0f3F3504F3, f880; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1047, f1054, f1050; +sub.f32 f885, f1054, f1050; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1046, f1053, f876; +sub.f32 f889, f1053, f876; +sub.f32 f890, f851, f868; +add.f32 f892, f851, f868; +add.f32 f1045, f852, f867; +sub.f32 f893, f852, f867; +add.f32 f894, f855, f879; +sub.f32 f896, f855, f879; +add.f32 f1044, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1043, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1042, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1041, f1043, f1042; +sub.f32 f909, f1043, 
f1042; +sub.f32 f910, f900, f905; +add.f32 f912, f900, f905; +add.f32 f1040, f901, f904; +sub.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; +add.f32 f1039, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1038, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; +add.f32 f1037, f1039, f1038; +sub.f32 f925, f1039, f1038; +sub.f32 f926, f916, f921; +add.f32 f928, f916, f921; +add.f32 f1036, f917, f920; +sub.f32 f929, f917, f920; +mul.f32 f930, f926, 0f3F3504F3; +mul.f32 f931, f1036, 0f3F3504F3; +sub.f32 f932, f930, f931; +add.f32 f933, f930, f931; +mul.f32 f935, f929, 0f3F3504F3; +mul.f32 f1035, f928, 0fBF3504F3; +sub.f32 f936, f1035, f935; +mul.f32 f937, f929, 0fBF3504F3; +fma.rn.f32 f938, f928, 0f3F3504F3, f937; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1034, f1041, f1037; +sub.f32 f942, f1041, f1037; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1033, f1040, f933; +sub.f32 f946, f1040, f933; +sub.f32 f947, f908, f925; +add.f32 f949, f908, f925; +add.f32 f1032, f909, f924; +sub.f32 f950, f909, f924; +add.f32 f951, f912, f936; +sub.f32 f953, f912, f936; +add.f32 f1031, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1033, 0f3EC3EF15; +mul.f32 f1030, f943, 0f3F6C835E; +sub.f32 f957, f1030, f956; +mul.f32 f958, f1033, 0f3F6C835E; +fma.rn.f32 f959, f943, 0f3EC3EF15, f958; +mul.f32 f960, f947, 0f3F3504F3; +mul.f32 f961, f1032, 0f3F3504F3; +sub.f32 f962, f960, f961; +add.f32 f963, f960, f961; +mul.f32 f965, f1031, 0f3F6C835E; +mul.f32 f1029, f951, 0f3EC3EF15; +sub.f32 f966, f1029, f965; +mul.f32 f967, f1031, 0f3EC3EF15; +fma.rn.f32 f968, f951, 0f3F6C835E, f967; +mul.f32 f970, f946, 0f3F6C835E; +mul.f32 f1028, f945, 0fBEC3EF15; +sub.f32 f971, f1028, f970; +mul.f32 f972, f946, 0fBEC3EF15; +fma.rn.f32 f973, f945, 0f3F6C835E, f972; +mul.f32 f975, f950, 0f3F3504F3; +mul.f32 f1027, f949, 0fBF3504F3; +sub.f32 
f976, f1027, f975; +mul.f32 f977, f950, 0fBF3504F3; +fma.rn.f32 f978, f949, 0f3F3504F3, f977; +mul.f32 f980, f954, 0f3EC3EF15; +mul.f32 f1026, f953, 0fBF6C835E; +sub.f32 f981, f1026, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0f3EC3EF15, f982; +add.f32 %0, f882, f939; +add.f32 %1, f1047, f1034; +add.f32 %2, f886, f957; +add.f32 %3, f1046, f959; +add.f32 %4, f890, f962; +add.f32 %5, f1045, f963; +add.f32 %6, f894, f966; +add.f32 %7, f1044, f968; +add.f32 %9, f885, f941; +sub.f32 %8, f884, f942; +add.f32 %11, f889, f973; +add.f32 %10, f888, f971; +add.f32 %12, f892, f976; +add.f32 %13, f893, f978; +add.f32 %14, f896, f981; +add.f32 %15, f897, f983; +sub.f32 %17, f1047, f1034; +sub.f32 %16, f882, f939; +sub.f32 %19, f1046, f959; +sub.f32 %18, f886, f957; +sub.f32 %21, f1045, f963; +sub.f32 %20, f890, f962; +sub.f32 %23, f1044, f968; +sub.f32 %22, f894, f966; +sub.f32 %25, f885, f941; +add.f32 %24, f884, f942; +sub.f32 %27, f889, f973; +sub.f32 %26, f888, f971; +sub.f32 %29, f893, f978; +sub.f32 %28, f892, f976; +sub.f32 %31, f897, f983; +sub.f32 %30, f896, f981; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_4096), "l"(lut_sp_16_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), 
"f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<307, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2548>; +.reg .b32 r<40>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2540, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2538, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2537, f2540, f2538; +sub.f32 f140, f2540, f2538; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2536, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2533, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2531, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2530, f2533, f2531; +sub.f32 f156, f2533, f2531; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2529, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2529, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2527, f159, 0fBF3504F3; +mul.f32 f2528, f160, 0f3F3504F3; +sub.f32 f167, f2527, f2528; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2526, f2537, f2530; +sub.f32 f173, f2537, f2530; 
+add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2525, f2536, f164; +sub.f32 f177, f2536, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2524, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2523, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2521, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2518, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2517, f2521, f2518; +sub.f32 f197, f2521, f2518; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2516, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2514, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2512, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2511, f2514, f2512; +sub.f32 f213, f2514, f2512; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2510, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2510, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2508, f216, 0fBF3504F3; +mul.f32 f2509, f217, 0f3F3504F3; +sub.f32 f224, f2508, f2509; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2507, f2517, f2511; +sub.f32 f230, f2517, f2511; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2506, f2516, f221; +sub.f32 f234, f2516, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2505, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2504, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2502, f231, 0f3F6C835E; +mul.f32 f2503, f2506, 
0f3EC3EF15; +sub.f32 f245, f2502, f2503; +mul.f32 f246, f2506, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2505, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2504, 0f3F6C835E; +mul.f32 f2501, f239, 0f3EC3EF15; +sub.f32 f254, f2501, f253; +mul.f32 f255, f2504, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2500, f233, 0fBEC3EF15; +sub.f32 f259, f2500, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2498, f237, 0fBF3504F3; +mul.f32 f2499, f238, 0f3F3504F3; +sub.f32 f264, f2498, f2499; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2496, f241, 0fBF6C835E; +mul.f32 f2497, f242, 0f3EC3EF15; +sub.f32 f269, f2496, f2497; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2495, f2526, f2507; +sub.f32 f275, f2526, f2507; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2494, f2525, f247; +sub.f32 f279, f2525, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2493, f2524, f251; +sub.f32 f283, f2524, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2492, f2523, f256; +sub.f32 f287, f2523, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2491, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2490, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2489, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2488, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2485, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2483, %86, %143; 
+sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2482, f2485, f2483; +sub.f32 f315, f2485, f2483; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2481, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2479, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2476, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2475, f2479, f2476; +sub.f32 f331, f2479, f2476; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2474, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2474, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2473, f334, 0fBF3504F3; +sub.f32 f342, f2473, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2472, f2482, f2475; +sub.f32 f348, f2482, f2475; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2471, f2481, f339; +sub.f32 f352, f2481, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2470, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2469, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2467, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2465, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2464, f2467, f2465; +sub.f32 f372, f2467, f2465; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2463, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2460, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, 
%97, %129; +add.f32 f2459, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2458, f2460, f2459; +sub.f32 f388, f2460, f2459; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2457, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2457, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2455, f391, 0fBF3504F3; +mul.f32 f2456, f392, 0f3F3504F3; +sub.f32 f399, f2455, f2456; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2454, f2464, f2458; +sub.f32 f405, f2464, f2458; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2453, f2463, f396; +sub.f32 f409, f2463, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2452, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2451, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2453, 0f3EC3EF15; +mul.f32 f2450, f406, 0f3F6C835E; +sub.f32 f420, f2450, f419; +mul.f32 f421, f2453, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2452, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2451, 0f3F6C835E; +mul.f32 f2449, f414, 0f3EC3EF15; +sub.f32 f429, f2449, f428; +mul.f32 f430, f2451, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2448, f408, 0fBEC3EF15; +sub.f32 f434, f2448, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2447, f412, 0fBF3504F3; +sub.f32 f439, f2447, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2446, f416, 0fBF6C835E; +sub.f32 f444, f2446, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; 
+add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2445, f2472, f2454; +sub.f32 f450, f2472, f2454; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2444, f2471, f422; +sub.f32 f454, f2471, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2443, f2470, f426; +sub.f32 f458, f2470, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2442, f2469, f431; +sub.f32 f462, f2469, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2441, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2440, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2439, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2438, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2444, 0f3E47C5C2; +mul.f32 f2437, f451, 0f3F7B14BE; +sub.f32 f481, f2437, f480; +mul.f32 f482, f2444, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2443, 0f3EC3EF15; +mul.f32 f2436, f455, 0f3F6C835E; +sub.f32 f486, f2436, f485; +mul.f32 f487, f2443, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2442, 0f3F0E39DA; +mul.f32 f2435, f459, 0f3F54DB31; +sub.f32 f491, f2435, f490; +mul.f32 f492, f2442, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2441, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2440, 0f3F54DB31; +mul.f32 f2434, f467, 0f3F0E39DA; +sub.f32 f500, f2434, f499; +mul.f32 f501, f2440, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2439, 0f3F6C835E; +mul.f32 f2433, f471, 0f3EC3EF15; +sub.f32 f505, f2433, f504; +mul.f32 f506, f2439, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2438, 0f3F7B14BE; +mul.f32 f2432, f475, 0f3E47C5C2; +sub.f32 f510, f2432, f509; +mul.f32 f511, f2438, 0f3E47C5C2; 
+fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2431, f453, 0fBE47C5C2; +sub.f32 f515, f2431, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2429, f457, 0fBEC3EF15; +mul.f32 f2430, f458, 0f3F6C835E; +sub.f32 f520, f2429, f2430; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2427, f461, 0fBF0E39DA; +mul.f32 f2428, f462, 0f3F54DB31; +sub.f32 f525, f2427, f2428; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2425, f465, 0fBF3504F3; +mul.f32 f2426, f466, 0f3F3504F3; +sub.f32 f530, f2425, f2426; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2423, f469, 0fBF54DB31; +mul.f32 f2424, f470, 0f3F0E39DA; +sub.f32 f535, f2423, f2424; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2422, f473, 0fBF6C835E; +sub.f32 f540, f2422, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2421, f477, 0fBF7B14BE; +sub.f32 f545, f2421, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2420, f2494, f483; +sub.f32 f553, f2494, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2419, f2493, f488; +sub.f32 f557, f2493, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2418, f2492, f493; +sub.f32 f561, f2492, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2417, f2491, f497; +sub.f32 f565, f2491, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f2416, f2490, f502; +sub.f32 f569, f2490, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f2415, f2489, f507; +sub.f32 f573, f2489, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f2414, f2488, f512; +sub.f32 
f577, f2488, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f2413, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f2412, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f2411, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f2410, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f2409, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2408, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2407, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2406, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f2420, f611; +mul.f32 f616, f610, f2420; +mul.f32 f618, f611, f611; +mul.f32 f2405, f610, f610; +sub.f32 f619, f2405, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f2419, f621; +mul.f32 f624, f619, f2419; +mul.f32 f626, f611, f621; +mul.f32 f2404, f610, f619; +sub.f32 f627, f2404, f626; +mul.f32 f2403, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f2418, f629; +mul.f32 f632, f627, f2418; +mul.f32 f2401, f610, f627; +mul.f32 f2402, f611, f629; +sub.f32 f635, f2401, f2402; +mul.f32 f2400, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f2417, f637; +mul.f32 f640, f635, f2417; +mul.f32 f642, f611, f637; +mul.f32 f2399, f610, f635; +sub.f32 f643, f2399, f642; +mul.f32 f2398, f562, f637; +mul.f32 f644, f610, f637; 
+fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f2416, f645; +mul.f32 f648, f643, f2416; +mul.f32 f2396, f610, f643; +mul.f32 f2397, f611, f645; +sub.f32 f651, f2396, f2397; +mul.f32 f2395, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f2415, f653; +mul.f32 f656, f651, f2415; +mul.f32 f658, f611, f653; +mul.f32 f2394, f610, f651; +sub.f32 f659, f2394, f658; +mul.f32 f2393, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f2414, f661; +mul.f32 f664, f659, f2414; +mul.f32 f666, f611, f661; +mul.f32 f2392, f610, f659; +sub.f32 f667, f2392, f666; +mul.f32 f2391, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f2413, f669; +mul.f32 f672, f667, f2413; +mul.f32 f2389, f610, f667; +mul.f32 f2390, f611, f669; +sub.f32 f675, f2389, f2390; +mul.f32 f2388, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f2412, f677; +mul.f32 f680, f675, f2412; +mul.f32 f682, f611, f677; +mul.f32 f2387, f610, f675; +sub.f32 f683, f2387, f682; +mul.f32 f2386, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f2411, f685; +mul.f32 f688, f683, f2411; +mul.f32 f690, f611, f685; +mul.f32 f2385, f610, f683; +sub.f32 f691, f2385, f690; +mul.f32 f2384, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f2410, f693; +mul.f32 f696, f691, f2410; +mul.f32 f2382, f610, f691; +mul.f32 f2383, f611, f693; +sub.f32 f699, f2382, f2383; +mul.f32 f2381, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f2409, f701; +mul.f32 f704, f699, f2409; +mul.f32 f706, f611, f701; +mul.f32 f2380, f610, f699; +sub.f32 f707, f2380, f706; +mul.f32 f2379, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f2408, f709; +mul.f32 f712, f707, f2408; +mul.f32 f2377, f610, f707; +mul.f32 f2378, f611, f709; 
+sub.f32 f715, f2377, f2378; +mul.f32 f2376, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f2407, f717; +mul.f32 f720, f715, f2407; +mul.f32 f722, f611, f717; +mul.f32 f2375, f610, f715; +sub.f32 f723, f2375, f722; +mul.f32 f2374, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f2406, f725; +mul.f32 f728, f723, f2406; +mul.f32 f730, f611, f725; +mul.f32 f2373, f610, f723; +sub.f32 f731, f2373, f730; +mul.f32 f2372, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2371, f2495, f2445; +mul.f32 f734, f2371, f733; +mul.f32 f736, f731, f2371; +mul.f32 f2369, f610, f731; +mul.f32 f2370, f611, f733; +sub.f32 f739, f2369, f2370; +sub.f32 f2368, f272, f447; +mul.f32 f2367, f2368, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2366, f610, f739; +sub.f32 f747, f2366, f746; +mul.f32 f2365, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2364, f610, f747; +sub.f32 f755, f2364, f754; +mul.f32 f2363, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f2361, f610, f755; +mul.f32 f2362, f611, f757; +sub.f32 f763, f2361, f2362; +mul.f32 f2360, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2359, f610, f763; +sub.f32 f771, f2359, f770; +mul.f32 f2358, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f2356, f610, f771; +mul.f32 f2357, f611, f773; +sub.f32 f779, f2356, f2357; +mul.f32 f2355, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, 
f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2354, f610, f779; +sub.f32 f787, f2354, f786; +mul.f32 f2353, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2352, f610, f787; +sub.f32 f795, f2352, f794; +mul.f32 f2351, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f2349, f610, f795; +mul.f32 f2350, f611, f797; +sub.f32 f803, f2349, f2350; +mul.f32 f2348, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2347, f610, f803; +sub.f32 f811, f2347, f810; +mul.f32 f2346, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2345, f610, f811; +sub.f32 f819, f2345, f818; +mul.f32 f2344, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f2342, f610, f819; +mul.f32 f2343, f611, f821; +sub.f32 f827, f2342, f2343; +mul.f32 f2341, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2340, f610, f827; +sub.f32 f835, f2340, f834; +mul.f32 f2339, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f2337, f610, f835; +mul.f32 f2338, f611, f837; +sub.f32 f843, f2337, f2338; +mul.f32 f2336, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2335, f610, f843; +sub.f32 f851, f2335, f850; +mul.f32 
f2334, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f2333, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 32512; +add.s32 r12, r9, r11; +add.f32 f857, f2495, f2445; +sub.f32 f2544, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r39, %tid.x; +shl.b32 r35, r39, 3; +shl.b32 r27, r39, 8; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f2333; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f2403; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f2400; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f2398; +sub.f32 f867, f648, f2395; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f2393; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f2391; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f2388; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f2386; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f2384; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f2381; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f2379; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f2376; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f2374; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f2372; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f2544, f734; +sub.f32 f890, f736, f2367; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f2365; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, 
f747, f556, f750; +sub.f32 f894, f752, f2363; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f2360; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f2358; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f2355; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f2353; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f2351; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f2348; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f2346; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f2344; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f2341; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f2339; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f2336; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f2334; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +and.b32 r21, r39, 127; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+1024]; +ld.shared.v2.f32 {f929, f930}, [r13+2048]; +ld.shared.v2.f32 {f933, f934}, [r13+3072]; +ld.shared.v2.f32 {f937, f938}, [r13+4096]; +ld.shared.v2.f32 {f941, f942}, [r13+5120]; +ld.shared.v2.f32 {f945, f946}, [r13+6144]; +ld.shared.v2.f32 {f949, f950}, [r13+7168]; +ld.shared.v2.f32 {f953, f954}, [r13+8192]; +ld.shared.v2.f32 {f957, f958}, [r13+9216]; +ld.shared.v2.f32 {f961, f962}, [r13+10240]; +ld.shared.v2.f32 {f965, f966}, [r13+11264]; +ld.shared.v2.f32 {f969, f970}, [r13+12288]; +ld.shared.v2.f32 {f973, f974}, [r13+13312]; +ld.shared.v2.f32 {f977, f978}, [r13+14336]; 
+ld.shared.v2.f32 {f981, f982}, [r13+15360]; +ld.shared.v2.f32 {f985, f986}, [r13+16384]; +ld.shared.v2.f32 {f989, f990}, [r13+17408]; +ld.shared.v2.f32 {f993, f994}, [r13+18432]; +ld.shared.v2.f32 {f997, f998}, [r13+19456]; +ld.shared.v2.f32 {f1001, f1002}, [r13+20480]; +ld.shared.v2.f32 {f1005, f1006}, [r13+21504]; +ld.shared.v2.f32 {f1009, f1010}, [r13+22528]; +ld.shared.v2.f32 {f1013, f1014}, [r13+23552]; +ld.shared.v2.f32 {f1017, f1018}, [r13+24576]; +ld.shared.v2.f32 {f1021, f1022}, [r13+25600]; +ld.shared.v2.f32 {f1025, f1026}, [r13+26624]; +ld.shared.v2.f32 {f1029, f1030}, [r13+27648]; +ld.shared.v2.f32 {f1033, f1034}, [r13+28672]; +ld.shared.v2.f32 {f1037, f1038}, [r13+29696]; +ld.shared.v2.f32 {f1041, f1042}, [r13+30720]; +ld.shared.v2.f32 {f1045, f1046}, [r13+31744]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2332, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2331, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2330, f2332, f2331; +sub.f32 f1060, f2332, f2331; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f2329, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2328, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2327, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2326, f2328, f2327; +sub.f32 f1076, f2328, f2327; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f2325, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f2325, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f2323, f1079, 0fBF3504F3; +mul.f32 f2324, f1080, 0f3F3504F3; +sub.f32 f1087, f2323, f2324; +mul.f32 f1088, f1080, 0fBF3504F3; 
+fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2322, f2330, f2326; +sub.f32 f1093, f2330, f2326; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2321, f2329, f1084; +sub.f32 f1097, f2329, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f2320, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, f1087; +add.f32 f2319, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2318, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2317, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2316, f2318, f2317; +sub.f32 f1117, f2318, f2317; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f2315, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2314, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2313, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2312, f2314, f2313; +sub.f32 f1133, f2314, f2313; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f2311, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f2311, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f2309, f1136, 0fBF3504F3; +mul.f32 f2310, f1137, 0f3F3504F3; +sub.f32 f1144, f2309, f2310; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2308, f2316, f2312; +sub.f32 f1150, f2316, f2312; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2307, f2315, f1141; +sub.f32 
f1154, f2315, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f2306, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f2305, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2303, f1151, 0f3F6C835E; +mul.f32 f2304, f2307, 0f3EC3EF15; +sub.f32 f1165, f2303, f2304; +mul.f32 f1166, f2307, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 0f3F3504F3; +mul.f32 f1169, f2306, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f2301, f1159, 0f3EC3EF15; +mul.f32 f2302, f2305, 0f3F6C835E; +sub.f32 f1174, f2301, f2302; +mul.f32 f1175, f2305, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f2299, f1153, 0fBEC3EF15; +mul.f32 f2300, f1154, 0f3F6C835E; +sub.f32 f1179, f2299, f2300; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f2297, f1157, 0fBF3504F3; +mul.f32 f2298, f1158, 0f3F3504F3; +sub.f32 f1184, f2297, f2298; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f2295, f1161, 0fBF6C835E; +mul.f32 f2296, f1162, 0f3EC3EF15; +sub.f32 f1189, f2295, f2296; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2294, f2322, f2308; +sub.f32 f1195, f2322, f2308; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2293, f2321, f1167; +sub.f32 f1199, f2321, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2292, f2320, f1171; +sub.f32 f1203, f2320, f1171; +add.f32 f1204, f1102, f1174; +sub.f32 f1206, f1102, f1174; +add.f32 f2291, f2319, f1176; +sub.f32 f1207, f2319, f1176; +sub.f32 f1208, f1092, f1150; +add.f32 f1210, f1092, f1150; +add.f32 f2290, f1093, f1149; +sub.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1179; +sub.f32 f1214, f1096, f1179; +add.f32 f2289, f1097, f1181; +sub.f32 f1215, 
f1097, f1181; +add.f32 f1216, f1100, f1184; +sub.f32 f1218, f1100, f1184; +add.f32 f2288, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2287, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2286, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2285, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2284, f2286, f2285; +sub.f32 f1235, f2286, f2285; +sub.f32 f1236, f1226, f1231; +add.f32 f1238, f1226, f1231; +add.f32 f2283, f1227, f1230; +sub.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2282, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2281, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2280, f2282, f2281; +sub.f32 f1251, f2282, f2281; +sub.f32 f1252, f1242, f1247; +add.f32 f1254, f1242, f1247; +add.f32 f2279, f1243, f1246; +sub.f32 f1255, f1243, f1246; +mul.f32 f1256, f1252, 0f3F3504F3; +mul.f32 f1257, f2279, 0f3F3504F3; +sub.f32 f1258, f1256, f1257; +add.f32 f1259, f1256, f1257; +mul.f32 f2277, f1254, 0fBF3504F3; +mul.f32 f2278, f1255, 0f3F3504F3; +sub.f32 f1262, f2277, f2278; +mul.f32 f1263, f1255, 0fBF3504F3; +fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2276, f2284, f2280; +sub.f32 f1268, f2284, f2280; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2275, f2283, f1259; +sub.f32 f1272, f2283, f1259; +sub.f32 f1273, f1234, f1251; +add.f32 f1275, f1234, f1251; +add.f32 f2274, f1235, f1250; +sub.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1262; +sub.f32 f1279, f1238, f1262; +add.f32 f2273, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, 
f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2272, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2271, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2270, f2272, f2271; +sub.f32 f1292, f2272, f2271; +sub.f32 f1293, f1283, f1288; +add.f32 f1295, f1283, f1288; +add.f32 f2269, f1284, f1287; +sub.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2268, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2267, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2266, f2268, f2267; +sub.f32 f1308, f2268, f2267; +sub.f32 f1309, f1299, f1304; +add.f32 f1311, f1299, f1304; +add.f32 f2265, f1300, f1303; +sub.f32 f1312, f1300, f1303; +mul.f32 f1313, f1309, 0f3F3504F3; +mul.f32 f1314, f2265, 0f3F3504F3; +sub.f32 f1315, f1313, f1314; +add.f32 f1316, f1313, f1314; +mul.f32 f2263, f1311, 0fBF3504F3; +mul.f32 f2264, f1312, 0f3F3504F3; +sub.f32 f1319, f2263, f2264; +mul.f32 f1320, f1312, 0fBF3504F3; +fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2262, f2270, f2266; +sub.f32 f1325, f2270, f2266; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2261, f2269, f1316; +sub.f32 f1329, f2269, f1316; +sub.f32 f1330, f1291, f1308; +add.f32 f1332, f1291, f1308; +add.f32 f2260, f1292, f1307; +sub.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1319; +sub.f32 f1336, f1295, f1319; +add.f32 f2259, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2257, f1326, 0f3F6C835E; +mul.f32 f2258, f2261, 0f3EC3EF15; +sub.f32 f1340, f2257, f2258; +mul.f32 f1341, f2261, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341; +mul.f32 f1343, f1330, 0f3F3504F3; +mul.f32 f1344, f2260, 0f3F3504F3; +sub.f32 f1345, f1343, f1344; 
+add.f32 f1346, f1343, f1344; +mul.f32 f1348, f2259, 0f3F6C835E; +mul.f32 f2256, f1334, 0f3EC3EF15; +sub.f32 f1349, f2256, f1348; +mul.f32 f1350, f2259, 0f3EC3EF15; +fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350; +mul.f32 f1353, f1329, 0f3F6C835E; +mul.f32 f2255, f1328, 0fBEC3EF15; +sub.f32 f1354, f2255, f1353; +mul.f32 f1355, f1329, 0fBEC3EF15; +fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355; +mul.f32 f2253, f1332, 0fBF3504F3; +mul.f32 f2254, f1333, 0f3F3504F3; +sub.f32 f1359, f2253, f2254; +mul.f32 f1360, f1333, 0fBF3504F3; +fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360; +mul.f32 f2251, f1336, 0fBF6C835E; +mul.f32 f2252, f1337, 0f3EC3EF15; +sub.f32 f1364, f2251, f2252; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2250, f2276, f2262; +sub.f32 f1370, f2276, f2262; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2249, f2275, f1342; +sub.f32 f1374, f2275, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2248, f2274, f1346; +sub.f32 f1378, f2274, f1346; +add.f32 f1379, f1277, f1349; +sub.f32 f1381, f1277, f1349; +add.f32 f2247, f2273, f1351; +sub.f32 f1382, f2273, f1351; +sub.f32 f1383, f1267, f1325; +add.f32 f1385, f1267, f1325; +add.f32 f2246, f1268, f1324; +sub.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1354; +sub.f32 f1389, f1271, f1354; +add.f32 f2245, f1272, f1356; +sub.f32 f1390, f1272, f1356; +add.f32 f1391, f1275, f1359; +sub.f32 f1393, f1275, f1359; +add.f32 f2244, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2243, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2249, 0f3E47C5C2; +mul.f32 f2242, f1371, 0f3F7B14BE; +sub.f32 f1401, f2242, f1400; +mul.f32 f1402, f2249, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402; +mul.f32 f1405, f2248, 0f3EC3EF15; +mul.f32 f2241, f1375, 0f3F6C835E; +sub.f32 f1406, f2241, f1405; +mul.f32 
f1407, f2248, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407; +mul.f32 f2239, f1379, 0f3F54DB31; +mul.f32 f2240, f2247, 0f3F0E39DA; +sub.f32 f1411, f2239, f2240; +mul.f32 f1412, f2247, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412; +mul.f32 f1414, f1383, 0f3F3504F3; +mul.f32 f1415, f2246, 0f3F3504F3; +sub.f32 f1416, f1414, f1415; +add.f32 f1417, f1414, f1415; +mul.f32 f1419, f2245, 0f3F54DB31; +mul.f32 f2238, f1387, 0f3F0E39DA; +sub.f32 f1420, f2238, f1419; +mul.f32 f1421, f2245, 0f3F0E39DA; +fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421; +mul.f32 f1424, f2244, 0f3F6C835E; +mul.f32 f2237, f1391, 0f3EC3EF15; +sub.f32 f1425, f2237, f1424; +mul.f32 f1426, f2244, 0f3EC3EF15; +fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426; +mul.f32 f2235, f1395, 0f3E47C5C2; +mul.f32 f2236, f2243, 0f3F7B14BE; +sub.f32 f1430, f2235, f2236; +mul.f32 f1431, f2243, 0f3E47C5C2; +fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431; +mul.f32 f2233, f1373, 0fBE47C5C2; +mul.f32 f2234, f1374, 0f3F7B14BE; +sub.f32 f1435, f2233, f2234; +mul.f32 f1436, f1374, 0fBE47C5C2; +fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436; +mul.f32 f2231, f1377, 0fBEC3EF15; +mul.f32 f2232, f1378, 0f3F6C835E; +sub.f32 f1440, f2231, f2232; +mul.f32 f1441, f1378, 0fBEC3EF15; +fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441; +mul.f32 f2229, f1381, 0fBF0E39DA; +mul.f32 f2230, f1382, 0f3F54DB31; +sub.f32 f1445, f2229, f2230; +mul.f32 f1446, f1382, 0fBF0E39DA; +fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446; +mul.f32 f1449, f1386, 0f3F3504F3; +mul.f32 f2228, f1385, 0fBF3504F3; +sub.f32 f1450, f2228, f1449; +mul.f32 f1451, f1386, 0fBF3504F3; +fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451; +mul.f32 f1454, f1390, 0f3F0E39DA; +mul.f32 f2227, f1389, 0fBF54DB31; +sub.f32 f1455, f2227, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456; +mul.f32 f1459, f1394, 0f3EC3EF15; +mul.f32 f2226, f1393, 0fBF6C835E; +sub.f32 f1460, f2226, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0f3EC3EF15, 
f1461; +mul.f32 f1464, f1398, 0f3E47C5C2; +mul.f32 f2225, f1397, 0fBF7B14BE; +sub.f32 f1465, f2225, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2224, f2293, f1403; +sub.f32 f1473, f2293, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2223, f2292, f1408; +sub.f32 f1477, f2292, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2222, f2291, f1413; +sub.f32 f1481, f2291, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2221, f2290, f1417; +sub.f32 f1485, f2290, f1417; +add.f32 f1486, f1212, f1420; +sub.f32 f1488, f1212, f1420; +add.f32 f2220, f2289, f1422; +sub.f32 f1489, f2289, f1422; +add.f32 f1490, f1216, f1425; +sub.f32 f1492, f1216, f1425; +add.f32 f2219, f2288, f1427; +sub.f32 f1493, f2288, f1427; +add.f32 f1494, f1220, f1430; +sub.f32 f1496, f1220, f1430; +add.f32 f2218, f2287, f1432; +sub.f32 f1497, f2287, f1432; +sub.f32 f1498, f1194, f1370; +add.f32 f1500, f1194, f1370; +add.f32 f2217, f1195, f1369; +sub.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1435; +sub.f32 f1504, f1198, f1435; +add.f32 f2216, f1199, f1437; +sub.f32 f1505, f1199, f1437; +add.f32 f1506, f1202, f1440; +sub.f32 f1508, f1202, f1440; +add.f32 f2215, f1203, f1442; +sub.f32 f1509, f1203, f1442; +add.f32 f1510, f1206, f1445; +sub.f32 f1512, f1206, f1445; +add.f32 f2214, f1207, f1447; +sub.f32 f1513, f1207, f1447; +add.f32 f1514, f1210, f1450; +sub.f32 f1516, f1210, f1450; +add.f32 f2213, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2212, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2211, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2210, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 
r14, r39, 96; +bfe.u32 r15, r39, 5, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1534, f2224, f1531; +mul.f32 f1536, f1530, f2224; +mul.f32 f2208, f1530, f1530; +mul.f32 f2209, f1531, f1531; +sub.f32 f1539, f2208, f2209; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1542, f2223, f1541; +mul.f32 f1544, f1539, f2223; +mul.f32 f1546, f1531, f1541; +mul.f32 f2207, f1530, f1539; +sub.f32 f1547, f2207, f1546; +mul.f32 f2206, f1474, f1541; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1550, f2222, f1549; +mul.f32 f1552, f1547, f2222; +mul.f32 f1554, f1531, f1549; +mul.f32 f2205, f1530, f1547; +sub.f32 f1555, f2205, f1554; +mul.f32 f2204, f1478, f1549; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1558, f2221, f1557; +mul.f32 f1560, f1555, f2221; +mul.f32 f2202, f1530, f1555; +mul.f32 f2203, f1531, f1557; +sub.f32 f1563, f2202, f2203; +mul.f32 f2201, f1482, f1557; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1566, f2220, f1565; +mul.f32 f1568, f1563, f2220; +mul.f32 f1570, f1531, f1565; +mul.f32 f2200, f1530, f1563; +sub.f32 f1571, f2200, f1570; +mul.f32 f2199, f1486, f1565; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1574, f2219, f1573; +mul.f32 f1576, f1571, f2219; +mul.f32 f1578, f1531, f1573; +mul.f32 f2198, f1530, f1571; +sub.f32 f1579, f2198, f1578; +mul.f32 f2197, f1490, f1573; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1582, f2218, f1581; +mul.f32 f1584, f1579, f2218; +mul.f32 f2195, f1530, f1579; +mul.f32 f2196, f1531, f1581; +sub.f32 f1587, f2195, f2196; +mul.f32 f2194, f1494, f1581; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1590, f2217, f1589; +mul.f32 f1592, f1587, f2217; +mul.f32 f1594, f1531, f1589; +mul.f32 f2193, f1530, f1587; +sub.f32 
f1595, f2193, f1594; +mul.f32 f2192, f1498, f1589; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1598, f2216, f1597; +mul.f32 f1600, f1595, f2216; +mul.f32 f2190, f1530, f1595; +mul.f32 f2191, f1531, f1597; +sub.f32 f1603, f2190, f2191; +mul.f32 f2189, f1502, f1597; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1606, f2215, f1605; +mul.f32 f1608, f1603, f2215; +mul.f32 f1610, f1531, f1605; +mul.f32 f2188, f1530, f1603; +sub.f32 f1611, f2188, f1610; +mul.f32 f2187, f1506, f1605; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1614, f2214, f1613; +mul.f32 f1616, f1611, f2214; +mul.f32 f1618, f1531, f1613; +mul.f32 f2186, f1530, f1611; +sub.f32 f1619, f2186, f1618; +mul.f32 f2185, f1510, f1613; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1622, f2213, f1621; +mul.f32 f1624, f1619, f2213; +mul.f32 f2183, f1530, f1619; +mul.f32 f2184, f1531, f1621; +sub.f32 f1627, f2183, f2184; +mul.f32 f2182, f1514, f1621; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1630, f2212, f1629; +mul.f32 f1632, f1627, f2212; +mul.f32 f1634, f1531, f1629; +mul.f32 f2181, f1530, f1627; +sub.f32 f1635, f2181, f1634; +mul.f32 f2180, f1518, f1629; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1638, f2211, f1637; +mul.f32 f1640, f1635, f2211; +mul.f32 f1642, f1531, f1637; +mul.f32 f2179, f1530, f1635; +sub.f32 f1643, f2179, f1642; +mul.f32 f2178, f1522, f1637; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1646, f2210, f1645; +mul.f32 f1648, f1643, f2210; +mul.f32 f2176, f1530, f1643; +mul.f32 f2177, f1531, f1645; +sub.f32 f1651, f2176, f2177; +mul.f32 f2175, f1526, f1645; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2174, f2294, f2250; +mul.f32 f1654, f2174, f1653; +mul.f32 f1656, f1651, f2174; +mul.f32 f1658, f1531, f1653; +mul.f32 
f2173, f1530, f1651; +sub.f32 f1659, f2173, f1658; +sub.f32 f2172, f1192, f1367; +mul.f32 f2171, f2172, f1653; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1662, f1473, f1661; +mul.f32 f1664, f1659, f1473; +mul.f32 f2169, f1530, f1659; +mul.f32 f2170, f1531, f1661; +sub.f32 f1667, f2169, f2170; +mul.f32 f2168, f1472, f1661; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1670, f1477, f1669; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2167, f1530, f1667; +sub.f32 f1675, f2167, f1674; +mul.f32 f2166, f1476, f1669; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1678, f1481, f1677; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2165, f1530, f1675; +sub.f32 f1683, f2165, f1682; +mul.f32 f2164, f1480, f1677; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1686, f1485, f1685; +mul.f32 f1688, f1683, f1485; +mul.f32 f2162, f1530, f1683; +mul.f32 f2163, f1531, f1685; +sub.f32 f1691, f2162, f2163; +mul.f32 f2161, f1484, f1685; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1694, f1489, f1693; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2160, f1530, f1691; +sub.f32 f1699, f2160, f1698; +mul.f32 f2159, f1488, f1693; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1702, f1493, f1701; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2158, f1530, f1699; +sub.f32 f1707, f2158, f1706; +mul.f32 f2157, f1492, f1701; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1710, f1497, f1709; +mul.f32 f1712, f1707, f1497; +mul.f32 f2155, f1530, f1707; +mul.f32 f2156, f1531, f1709; +sub.f32 f1715, f2155, f2156; +mul.f32 f2154, f1496, f1709; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1718, f1501, f1717; +mul.f32 f1720, f1715, f1501; +mul.f32 
f1722, f1531, f1717; +mul.f32 f2153, f1530, f1715; +sub.f32 f1723, f2153, f1722; +mul.f32 f2152, f1500, f1717; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1726, f1505, f1725; +mul.f32 f1728, f1723, f1505; +mul.f32 f2150, f1530, f1723; +mul.f32 f2151, f1531, f1725; +sub.f32 f1731, f2150, f2151; +mul.f32 f2149, f1504, f1725; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1734, f1509, f1733; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2148, f1530, f1731; +sub.f32 f1739, f2148, f1738; +mul.f32 f2147, f1508, f1733; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1742, f1513, f1741; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2146, f1530, f1739; +sub.f32 f1747, f2146, f1746; +mul.f32 f2145, f1512, f1741; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1750, f1517, f1749; +mul.f32 f1752, f1747, f1517; +mul.f32 f2143, f1530, f1747; +mul.f32 f2144, f1531, f1749; +sub.f32 f1755, f2143, f2144; +mul.f32 f2142, f1516, f1749; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1758, f1521, f1757; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2141, f1530, f1755; +sub.f32 f1763, f2141, f1762; +mul.f32 f2140, f1520, f1757; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1766, f1525, f1765; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2139, f1530, f1763; +sub.f32 f1771, f2139, f1770; +mul.f32 f2138, f1524, f1765; +mul.f32 f1772, f1530, f1765; +mul.f32 f2137, f1470, f1531; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1529, f1773; +mul.f32 f1775, f1528, f1773; +mul.f32 f1776, f1771, f1529; +and.b32 r16, r35, 248; +add.s32 r17, r9, r16; +sub.f32 f2543, f2294, f2250; +mul.f32 f2542, f1651, f2543; +mov.u32 r34, %tid.x; +shl.b32 r33, r34, 8; +barrier.sync 0; +and.b32 r18, r33, 24576; 
+add.s32 r19, r17, r18; +mov.u32 r26, %tid.x; +and.b32 r25, r26, 96; +sub.f32 f2546, f2294, f2250; +mul.f32 f2545, f1651, f2546; +add.f32 f1777, f2294, f2250; +mov.u32 r30, %tid.x; +and.b32 r29, r30, 96; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r32, %tid.x; +and.b32 r31, r32, 96; +fma.rn.f32 f1779, f1530, f1470, f1534; +sub.f32 f1780, f1536, f2137; +st.shared.v2.f32 [r19+256], {f1779, f1780}; +fma.rn.f32 f1781, f1539, f1474, f1542; +sub.f32 f1782, f1544, f2206; +st.shared.v2.f32 [r19+512], {f1781, f1782}; +fma.rn.f32 f1783, f1547, f1478, f1550; +sub.f32 f1784, f1552, f2204; +st.shared.v2.f32 [r19+768], {f1783, f1784}; +fma.rn.f32 f1785, f1555, f1482, f1558; +sub.f32 f1786, f1560, f2201; +st.shared.v2.f32 [r19+1024], {f1785, f1786}; +fma.rn.f32 f1787, f1563, f1486, f1566; +sub.f32 f1788, f1568, f2199; +st.shared.v2.f32 [r19+1280], {f1787, f1788}; +sub.f32 f1789, f1576, f2197; +fma.rn.f32 f1790, f1571, f1490, f1574; +st.shared.v2.f32 [r19+1536], {f1790, f1789}; +fma.rn.f32 f1791, f1579, f1494, f1582; +sub.f32 f1792, f1584, f2194; +st.shared.v2.f32 [r19+1792], {f1791, f1792}; +fma.rn.f32 f1793, f1587, f1498, f1590; +sub.f32 f1794, f1592, f2192; +st.shared.v2.f32 [r19+2048], {f1793, f1794}; +fma.rn.f32 f1795, f1595, f1502, f1598; +sub.f32 f1796, f1600, f2189; +st.shared.v2.f32 [r19+2304], {f1795, f1796}; +fma.rn.f32 f1797, f1603, f1506, f1606; +sub.f32 f1798, f1608, f2187; +st.shared.v2.f32 [r19+2560], {f1797, f1798}; +fma.rn.f32 f1799, f1611, f1510, f1614; +sub.f32 f1800, f1616, f2185; +st.shared.v2.f32 [r19+2816], {f1799, f1800}; +fma.rn.f32 f1801, f1619, f1514, f1622; +sub.f32 f1802, f1624, f2182; +st.shared.v2.f32 [r19+3072], {f1801, f1802}; +fma.rn.f32 f1803, f1627, f1518, f1630; +sub.f32 f1804, f1632, f2180; +st.shared.v2.f32 [r19+3328], {f1803, f1804}; +fma.rn.f32 f1805, f1635, f1522, f1638; +sub.f32 f1806, f1640, f2178; +st.shared.v2.f32 [r19+3584], {f1805, f1806}; +fma.rn.f32 f1807, f1643, f1526, f1646; +sub.f32 f1808, 
f1648, f2175; +st.shared.v2.f32 [r19+3840], {f1807, f1808}; +fma.rn.f32 f1809, f1651, f2172, f1654; +sub.f32 f1810, f2545, f2171; +st.shared.v2.f32 [r19+4096], {f1809, f1810}; +fma.rn.f32 f1811, f1659, f1472, f1662; +sub.f32 f1812, f1664, f2168; +st.shared.v2.f32 [r19+4352], {f1811, f1812}; +fma.rn.f32 f1813, f1667, f1476, f1670; +sub.f32 f1814, f1672, f2166; +st.shared.v2.f32 [r19+4608], {f1813, f1814}; +fma.rn.f32 f1815, f1675, f1480, f1678; +sub.f32 f1816, f1680, f2164; +st.shared.v2.f32 [r19+4864], {f1815, f1816}; +fma.rn.f32 f1817, f1683, f1484, f1686; +sub.f32 f1818, f1688, f2161; +st.shared.v2.f32 [r19+5120], {f1817, f1818}; +fma.rn.f32 f1819, f1691, f1488, f1694; +sub.f32 f1820, f1696, f2159; +st.shared.v2.f32 [r19+5376], {f1819, f1820}; +fma.rn.f32 f1821, f1699, f1492, f1702; +sub.f32 f1822, f1704, f2157; +st.shared.v2.f32 [r19+5632], {f1821, f1822}; +fma.rn.f32 f1823, f1707, f1496, f1710; +sub.f32 f1824, f1712, f2154; +st.shared.v2.f32 [r19+5888], {f1823, f1824}; +fma.rn.f32 f1825, f1715, f1500, f1718; +sub.f32 f1826, f1720, f2152; +st.shared.v2.f32 [r19+6144], {f1825, f1826}; +fma.rn.f32 f1827, f1723, f1504, f1726; +sub.f32 f1828, f1728, f2149; +st.shared.v2.f32 [r19+6400], {f1827, f1828}; +fma.rn.f32 f1829, f1731, f1508, f1734; +sub.f32 f1830, f1736, f2147; +st.shared.v2.f32 [r19+6656], {f1829, f1830}; +fma.rn.f32 f1831, f1739, f1512, f1742; +sub.f32 f1832, f1744, f2145; +st.shared.v2.f32 [r19+6912], {f1831, f1832}; +fma.rn.f32 f1833, f1747, f1516, f1750; +sub.f32 f1834, f1752, f2142; +st.shared.v2.f32 [r19+7168], {f1833, f1834}; +fma.rn.f32 f1835, f1755, f1520, f1758; +sub.f32 f1836, f1760, f2140; +st.shared.v2.f32 [r19+7424], {f1835, f1836}; +fma.rn.f32 f1837, f1763, f1524, f1766; +sub.f32 f1838, f1768, f2138; +st.shared.v2.f32 [r19+7680], {f1837, f1838}; +fma.rn.f32 f1839, f1771, f1528, f1774; +sub.f32 f1840, f1776, f1775; +st.shared.v2.f32 [r19+7936], {f1839, f1840}; +barrier.sync 0; +mad.lo.s32 r20, r31, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, 
[r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+1024]; +ld.shared.v2.f32 {f1849, f1850}, [r20+2048]; +ld.shared.v2.f32 {f1853, f1854}, [r20+3072]; +ld.shared.v2.f32 {f1857, f1858}, [r20+4096]; +ld.shared.v2.f32 {f1861, f1862}, [r20+5120]; +ld.shared.v2.f32 {f1865, f1866}, [r20+6144]; +ld.shared.v2.f32 {f1869, f1870}, [r20+7168]; +ld.shared.v2.f32 {f1873, f1874}, [r20+8192]; +ld.shared.v2.f32 {f1877, f1878}, [r20+9216]; +ld.shared.v2.f32 {f1881, f1882}, [r20+10240]; +ld.shared.v2.f32 {f1885, f1886}, [r20+11264]; +ld.shared.v2.f32 {f1889, f1890}, [r20+12288]; +ld.shared.v2.f32 {f1893, f1894}, [r20+13312]; +ld.shared.v2.f32 {f1897, f1898}, [r20+14336]; +ld.shared.v2.f32 {f1901, f1902}, [r20+15360]; +ld.shared.v2.f32 {f1905, f1906}, [r20+16384]; +ld.shared.v2.f32 {f1909, f1910}, [r20+17408]; +ld.shared.v2.f32 {f1913, f1914}, [r20+18432]; +ld.shared.v2.f32 {f1917, f1918}, [r20+19456]; +ld.shared.v2.f32 {f1921, f1922}, [r20+20480]; +ld.shared.v2.f32 {f1925, f1926}, [r20+21504]; +ld.shared.v2.f32 {f1929, f1930}, [r20+22528]; +ld.shared.v2.f32 {f1933, f1934}, [r20+23552]; +ld.shared.v2.f32 {f1937, f1938}, [r20+24576]; +ld.shared.v2.f32 {f1941, f1942}, [r20+25600]; +ld.shared.v2.f32 {f1945, f1946}, [r20+26624]; +ld.shared.v2.f32 {f1949, f1950}, [r20+27648]; +ld.shared.v2.f32 {f1953, f1954}, [r20+28672]; +ld.shared.v2.f32 {f1957, f1958}, [r20+29696]; +ld.shared.v2.f32 {f1961, f1962}, [r20+30720]; +ld.shared.v2.f32 {f1965, f1966}, [r20+31744]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2136, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2135, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1845, f1909; +sub.f32 f1979, f1845, f1909; +add.f32 f2134, f1846, f1910; +sub.f32 f1980, f1846, f1910; +add.f32 f1981, f1877, f1941; +sub.f32 f1983, f1877, f1941; +add.f32 f2133, f1878, f1942; +sub.f32 f1984, f1878, f1942; +add.f32 f1985, f1849, f1913; +sub.f32 f1987, f1849, f1913; 
+add.f32 f2132, f1850, f1914; +sub.f32 f1988, f1850, f1914; +add.f32 f1989, f1881, f1945; +sub.f32 f1991, f1881, f1945; +add.f32 f2131, f1882, f1946; +sub.f32 f1992, f1882, f1946; +add.f32 f1993, f1853, f1917; +sub.f32 f1995, f1853, f1917; +add.f32 f2130, f1854, f1918; +sub.f32 f1996, f1854, f1918; +add.f32 f1997, f1885, f1949; +sub.f32 f1999, f1885, f1949; +add.f32 f2129, f1886, f1950; +sub.f32 f2000, f1886, f1950; +add.f32 f2001, f1857, f1921; +sub.f32 f2003, f1857, f1921; +add.f32 f2128, f1858, f1922; +sub.f32 f2004, f1858, f1922; +add.f32 f2005, f1889, f1953; +sub.f32 f2007, f1889, f1953; +add.f32 f2127, f1890, f1954; +sub.f32 f2008, f1890, f1954; +add.f32 f2009, f1861, f1925; +sub.f32 f2011, f1861, f1925; +add.f32 f2126, f1862, f1926; +sub.f32 f2012, f1862, f1926; +add.f32 f2013, f1893, f1957; +sub.f32 f2015, f1893, f1957; +add.f32 f2125, f1894, f1958; +sub.f32 f2016, f1894, f1958; +add.f32 f2017, f1865, f1929; +sub.f32 f2019, f1865, f1929; +add.f32 f2124, f1866, f1930; +sub.f32 f2020, f1866, f1930; +add.f32 f2021, f1897, f1961; +sub.f32 f2023, f1897, f1961; +add.f32 f2123, f1898, f1962; +sub.f32 f2024, f1898, f1962; +add.f32 f2025, f1869, f1933; +sub.f32 f2027, f1869, f1933; +add.f32 f2122, f1870, f1934; +sub.f32 f2028, f1870, f1934; +add.f32 f2029, f1901, f1965; +sub.f32 f2031, f1901, f1965; +add.f32 f2121, f1902, f1966; +sub.f32 f2032, f1902, f1966; +add.f32 %0, f1969, f1973; +add.f32 %1, f2136, f2135; +add.f32 %3, f2134, f2133; +add.f32 %2, f1977, f1981; +add.f32 %5, f2132, f2131; +add.f32 %4, f1985, f1989; +add.f32 %7, f2130, f2129; +add.f32 %6, f1993, f1997; +add.f32 %8, f2001, f2005; +add.f32 %9, f2128, f2127; +add.f32 %10, f2009, f2013; +add.f32 %11, f2126, f2125; +add.f32 %12, f2017, f2021; +add.f32 %13, f2124, f2123; +add.f32 %15, f2122, f2121; +add.f32 %14, f2025, f2029; +add.f32 %17, f1972, f1975; +sub.f32 %16, f1971, f1976; +add.f32 %19, f1980, f1983; +sub.f32 %18, f1979, f1984; +sub.f32 %20, f1987, f1992; +add.f32 %21, f1988, f1991; +sub.f32 %22, 
f1995, f2000; +add.f32 %23, f1996, f1999; +sub.f32 %24, f2003, f2008; +add.f32 %25, f2004, f2007; +sub.f32 %26, f2011, f2016; +add.f32 %27, f2012, f2015; +add.f32 %29, f2020, f2023; +sub.f32 %28, f2019, f2024; +add.f32 %31, f2028, f2031; +sub.f32 %30, f2027, f2032; +sub.f32 %33, f2136, f2135; +sub.f32 %32, f1969, f1973; +sub.f32 %35, f2134, f2133; +sub.f32 %34, f1977, f1981; +sub.f32 %37, f2132, f2131; +sub.f32 %36, f1985, f1989; +sub.f32 %39, f2130, f2129; +sub.f32 %38, f1993, f1997; +sub.f32 %41, f2128, f2127; +sub.f32 %40, f2001, f2005; +sub.f32 %43, f2126, f2125; +sub.f32 %42, f2009, f2013; +sub.f32 %45, f2124, f2123; +sub.f32 %44, f2017, f2021; +sub.f32 %47, f2122, f2121; +sub.f32 %46, f2025, f2029; +sub.f32 %49, f1972, f1975; +add.f32 %48, f1971, f1976; +sub.f32 %51, f1980, f1983; +add.f32 %50, f1979, f1984; +sub.f32 %53, f1988, f1991; +add.f32 %52, f1987, f1992; +sub.f32 %55, f1996, f1999; +add.f32 %54, f1995, f2000; +sub.f32 %57, f2004, f2007; +add.f32 %56, f2003, f2008; +sub.f32 %59, f2012, f2015; +add.f32 %58, f2011, f2016; +sub.f32 %61, f2020, f2023; +add.f32 %60, f2019, f2024; +sub.f32 %63, f2028, f2031; +add.f32 %62, f2027, f2032; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), 
"=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<304, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<564>; +.reg .b32 r<27>; +.reg .b64 rd<12>; +mov.u32 r1, 
%tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %20, %30; +add.f32 f34, %21, %32; +sub.f32 f35, %20, %30; +sub.f32 f36, %21, %32; +add.f32 f37, %25, %36; +add.f32 f38, %27, %37; +sub.f32 f39, %25, %36; +sub.f32 f40, %27, %37; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %22, %33; +add.f32 f50, %24, %35; +sub.f32 f51, %22, %33; +sub.f32 f52, %24, %35; +add.f32 f53, %28, %38; +add.f32 f54, %29, %39; +sub.f32 f55, %28, %38; +sub.f32 f56, %29, %39; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; 
+mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 32704; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+4096]; +ld.shared.v2.f32 {f167, f168}, [r13+8192]; 
+ld.shared.v2.f32 {f171, f172}, [r13+12288]; +ld.shared.v2.f32 {f175, f176}, [r13+16384]; +ld.shared.v2.f32 {f179, f180}, [r13+20480]; +ld.shared.v2.f32 {f183, f184}, [r13+24576]; +ld.shared.v2.f32 {f187, f188}, [r13+28672]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 504; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 
f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 32256; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, 
f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+4096]; +ld.shared.v2.f32 {f325, f326}, [r19+8192]; +ld.shared.v2.f32 {f329, f330}, [r19+12288]; +ld.shared.v2.f32 {f333, f334}, [r19+16384]; +ld.shared.v2.f32 {f337, f338}, [r19+20480]; +ld.shared.v2.f32 {f341, f342}, [r19+24576]; +ld.shared.v2.f32 {f345, f346}, [r19+28672]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +sub.f32 f361, f351, f356; +add.f32 f362, f352, f355; +add.f32 f363, f351, f356; +sub.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +sub.f32 f377, f367, f372; +add.f32 f378, f368, f371; +add.f32 f379, f367, f372; +sub.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0f3F3504F3; +sub.f32 f383, f381, f382; +add.f32 f384, f381, f382; +mul.f32 f385, f379, 0fBF3504F3; +mul.f32 f386, f380, 0f3F3504F3; +sub.f32 f387, f385, f386; +mul.f32 f388, f380, 0fBF3504F3; +fma.rn.f32 f389, f379, 0f3F3504F3, f388; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f384; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f384; +sub.f32 f396, f359, 
f376; +add.f32 f397, f360, f375; +add.f32 f398, f359, f376; +sub.f32 f399, f360, f375; +add.f32 f400, f363, f387; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f387; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 448; +bfe.u32 r21, r5, 6, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f393, f405; +mul.f32 f409, f392, f405; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f397, f415; +mul.f32 f417, f396, f415; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f401, f423; +mul.f32 f425, f400, f423; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f391, f431; +mul.f32 f433, f390, f431; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f395, f439; +mul.f32 f441, f394, f439; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f399, f447; +mul.f32 f449, f398, f447; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f403, f455; +mul.f32 f457, f402, f455; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 28672; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; 
+fma.rn.f32 f461, f404, f392, f408; +sub.f32 f462, f410, f409; +st.shared.v2.f32 [r25+512], {f461, f462}; +fma.rn.f32 f463, f413, f396, f416; +sub.f32 f464, f418, f417; +st.shared.v2.f32 [r25+1024], {f463, f464}; +fma.rn.f32 f465, f421, f400, f424; +sub.f32 f466, f426, f425; +st.shared.v2.f32 [r25+1536], {f465, f466}; +sub.f32 f467, f434, f433; +fma.rn.f32 f468, f429, f390, f432; +st.shared.v2.f32 [r25+2048], {f468, f467}; +fma.rn.f32 f469, f437, f394, f440; +sub.f32 f470, f442, f441; +st.shared.v2.f32 [r25+2560], {f469, f470}; +fma.rn.f32 f471, f445, f398, f448; +sub.f32 f472, f450, f449; +st.shared.v2.f32 [r25+3072], {f471, f472}; +fma.rn.f32 f473, f453, f402, f456; +sub.f32 f474, f458, f457; +st.shared.v2.f32 [r25+3584], {f473, f474}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+4096]; +ld.shared.v2.f32 {f483, f484}, [r26+8192]; +ld.shared.v2.f32 {f487, f488}, [r26+12288]; +ld.shared.v2.f32 {f491, f492}, [r26+16384]; +ld.shared.v2.f32 {f495, f496}, [r26+20480]; +ld.shared.v2.f32 {f499, f500}, [r26+24576]; +ld.shared.v2.f32 {f503, f504}, [r26+28672]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f507, f511; +add.f32 f516, f508, f512; +sub.f32 f517, f507, f511; +sub.f32 f518, f508, f512; +sub.f32 f519, f509, f514; +add.f32 f520, f510, f513; +add.f32 f521, f509, f514; +sub.f32 f522, f510, f513; +add.f32 f523, f479, f495; +add.f32 f524, f480, f496; +sub.f32 f525, f479, f495; +sub.f32 f526, f480, f496; +add.f32 f527, f487, f503; +add.f32 f528, f488, f504; +sub.f32 f529, f487, f503; +sub.f32 f530, f488, f504; +add.f32 f531, f523, f527; +add.f32 f532, f524, f528; +sub.f32 f533, f523, f527; +sub.f32 f534, f524, f528; +sub.f32 f535, f525, f530; +add.f32 f536, f526, f529; +add.f32 f537, f525, f530; +sub.f32 
f538, f526, f529; +mul.f32 f539, f535, 0f3F3504F3; +mul.f32 f540, f536, 0f3F3504F3; +sub.f32 f541, f539, f540; +add.f32 f542, f539, f540; +mul.f32 f543, f537, 0fBF3504F3; +mul.f32 f544, f538, 0f3F3504F3; +sub.f32 f545, f543, f544; +mul.f32 f546, f538, 0fBF3504F3; +fma.rn.f32 f547, f537, 0f3F3504F3, f546; +add.f32 %1, f516, f532; +add.f32 %0, f515, f531; +add.f32 %3, f520, f542; +add.f32 %2, f519, f541; +add.f32 %5, f518, f533; +sub.f32 %4, f517, f534; +add.f32 %7, f522, f547; +add.f32 %6, f521, f545; +sub.f32 %9, f516, f532; +sub.f32 %8, f515, f531; +sub.f32 %11, f520, f542; +sub.f32 %10, f519, f541; +sub.f32 %13, f518, f533; +add.f32 %12, f517, f534; +sub.f32 %15, f522, f547; +sub.f32 %14, f521, f545; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_4096), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<308, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2353>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2351, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2349, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2348, f2351, f2349; +sub.f32 f140, 
f2351, f2349; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2347, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2344, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2342, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2341, f2344, f2342; +sub.f32 f156, f2344, f2342; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2340, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2340, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2338, f159, 0fBF3504F3; +mul.f32 f2339, f160, 0f3F3504F3; +sub.f32 f167, f2338, f2339; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2337, f2348, f2341; +sub.f32 f173, f2348, f2341; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2336, f2347, f164; +sub.f32 f177, f2347, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2335, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2334, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2332, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2329, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2328, f2332, f2329; +sub.f32 f197, f2332, f2329; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2327, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2325, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2323, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; 
+add.f32 f2322, f2325, f2323; +sub.f32 f213, f2325, f2323; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2321, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2321, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2319, f216, 0fBF3504F3; +mul.f32 f2320, f217, 0f3F3504F3; +sub.f32 f224, f2319, f2320; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2318, f2328, f2322; +sub.f32 f230, f2328, f2322; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2317, f2327, f221; +sub.f32 f234, f2327, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2316, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2315, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2313, f231, 0f3F6C835E; +mul.f32 f2314, f2317, 0f3EC3EF15; +sub.f32 f245, f2313, f2314; +mul.f32 f246, f2317, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2316, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2315, 0f3F6C835E; +mul.f32 f2312, f239, 0f3EC3EF15; +sub.f32 f254, f2312, f253; +mul.f32 f255, f2315, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2311, f233, 0fBEC3EF15; +sub.f32 f259, f2311, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2309, f237, 0fBF3504F3; +mul.f32 f2310, f238, 0f3F3504F3; +sub.f32 f264, f2309, f2310; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2307, f241, 0fBF6C835E; +mul.f32 f2308, f242, 0f3EC3EF15; +sub.f32 f269, f2307, f2308; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2306, f2337, f2318; +sub.f32 f275, f2337, f2318; 
+add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2305, f2336, f247; +sub.f32 f279, f2336, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2304, f2335, f251; +sub.f32 f283, f2335, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2303, f2334, f256; +sub.f32 f287, f2334, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2302, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2301, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2300, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2299, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2296, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2294, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2293, f2296, f2294; +sub.f32 f315, f2296, f2294; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2292, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2290, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2287, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2286, f2290, f2287; +sub.f32 f331, f2290, f2287; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2285, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2285, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2284, f334, 0fBF3504F3; +sub.f32 f342, f2284, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2283, 
f2293, f2286; +sub.f32 f348, f2293, f2286; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2282, f2292, f339; +sub.f32 f352, f2292, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2281, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2280, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2278, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2276, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2275, f2278, f2276; +sub.f32 f372, f2278, f2276; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2274, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2271, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2270, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2269, f2271, f2270; +sub.f32 f388, f2271, f2270; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2268, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2268, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2266, f391, 0fBF3504F3; +mul.f32 f2267, f392, 0f3F3504F3; +sub.f32 f399, f2266, f2267; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2265, f2275, f2269; +sub.f32 f405, f2275, f2269; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2264, f2274, f396; +sub.f32 f409, f2274, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2263, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2262, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2264, 
0f3EC3EF15; +mul.f32 f2261, f406, 0f3F6C835E; +sub.f32 f420, f2261, f419; +mul.f32 f421, f2264, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2263, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2262, 0f3F6C835E; +mul.f32 f2260, f414, 0f3EC3EF15; +sub.f32 f429, f2260, f428; +mul.f32 f430, f2262, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2259, f408, 0fBEC3EF15; +sub.f32 f434, f2259, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2258, f412, 0fBF3504F3; +sub.f32 f439, f2258, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2257, f416, 0fBF6C835E; +sub.f32 f444, f2257, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2256, f2283, f2265; +sub.f32 f450, f2283, f2265; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2255, f2282, f422; +sub.f32 f454, f2282, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2254, f2281, f426; +sub.f32 f458, f2281, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2253, f2280, f431; +sub.f32 f462, f2280, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2252, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2251, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2250, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2249, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2255, 0f3E47C5C2; +mul.f32 f2248, f451, 0f3F7B14BE; +sub.f32 f481, f2248, f480; +mul.f32 f482, f2255, 0f3F7B14BE; +fma.rn.f32 f483, f451, 
0f3E47C5C2, f482; +mul.f32 f485, f2254, 0f3EC3EF15; +mul.f32 f2247, f455, 0f3F6C835E; +sub.f32 f486, f2247, f485; +mul.f32 f487, f2254, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2253, 0f3F0E39DA; +mul.f32 f2246, f459, 0f3F54DB31; +sub.f32 f491, f2246, f490; +mul.f32 f492, f2253, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2252, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2251, 0f3F54DB31; +mul.f32 f2245, f467, 0f3F0E39DA; +sub.f32 f500, f2245, f499; +mul.f32 f501, f2251, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2250, 0f3F6C835E; +mul.f32 f2244, f471, 0f3EC3EF15; +sub.f32 f505, f2244, f504; +mul.f32 f506, f2250, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2249, 0f3F7B14BE; +mul.f32 f2243, f475, 0f3E47C5C2; +sub.f32 f510, f2243, f509; +mul.f32 f511, f2249, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2242, f453, 0fBE47C5C2; +sub.f32 f515, f2242, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2240, f457, 0fBEC3EF15; +mul.f32 f2241, f458, 0f3F6C835E; +sub.f32 f520, f2240, f2241; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2238, f461, 0fBF0E39DA; +mul.f32 f2239, f462, 0f3F54DB31; +sub.f32 f525, f2238, f2239; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2236, f465, 0fBF3504F3; +mul.f32 f2237, f466, 0f3F3504F3; +sub.f32 f530, f2236, f2237; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2234, f469, 0fBF54DB31; +mul.f32 f2235, f470, 0f3F0E39DA; +sub.f32 f535, f2234, f2235; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2233, f473, 0fBF6C835E; +sub.f32 f540, f2233, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 
f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2232, f477, 0fBF7B14BE; +sub.f32 f545, f2232, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2231, f2306, f2256; +sub.f32 f551, f2306, f2256; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2230, f2305, f483; +sub.f32 f555, f2305, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2229, f2304, f488; +sub.f32 f559, f2304, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2228, f2303, f493; +sub.f32 f563, f2303, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2227, f2302, f497; +sub.f32 f567, f2302, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f2226, f2301, f502; +sub.f32 f571, f2301, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f2225, f2300, f507; +sub.f32 f575, f2300, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f2224, f2299, f512; +sub.f32 f579, f2299, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f2223, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f2222, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f2221, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f2220, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f2219, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2218, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2217, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2216, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, 
r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f2230, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f2230; +sub.f32 f620, f619, f618; +mul.f32 f2214, f612, f612; +mul.f32 f2215, f613, f613; +sub.f32 f623, f2214, f2215; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f2229, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f2229; +sub.f32 f630, f629, f628; +mul.f32 f2212, f612, f623; +mul.f32 f2213, f613, f625; +sub.f32 f633, f2212, f2213; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f2228, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f2228; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f2211, f612, f633; +sub.f32 f643, f2211, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f2227, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f2227; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f2210, f612, f643; +sub.f32 f653, f2210, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f2226, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f2226; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f2209, f612, f653; +sub.f32 f663, f2209, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f2225, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f2225; +sub.f32 f670, f669, f668; +mul.f32 f2207, f612, f663; +mul.f32 f2208, f613, f665; +sub.f32 f673, f2207, f2208; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f2224, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 
f679, f673, f2224; +sub.f32 f680, f679, f678; +mul.f32 f2205, f612, f673; +mul.f32 f2206, f613, f675; +sub.f32 f683, f2205, f2206; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f2223, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f2223; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f2204, f612, f683; +sub.f32 f693, f2204, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f2222, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f2222; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f2203, f612, f693; +sub.f32 f703, f2203, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f2221, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f2221; +sub.f32 f710, f709, f708; +mul.f32 f2201, f612, f703; +mul.f32 f2202, f613, f705; +sub.f32 f713, f2201, f2202; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f2220, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f2220; +sub.f32 f720, f719, f718; +mul.f32 f2199, f612, f713; +mul.f32 f2200, f613, f715; +sub.f32 f723, f2199, f2200; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f2219, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f2219; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f2198, f612, f723; +sub.f32 f733, f2198, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f2218, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f2218; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f2197, f612, f733; +sub.f32 f743, f2197, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f2217, f745; +fma.rn.f32 f747, 
f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f2217; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f2196, f612, f743; +sub.f32 f753, f2196, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f2216, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f2216; +sub.f32 f760, f759, f758; +mul.f32 f2194, f612, f753; +mul.f32 f2195, f613, f755; +sub.f32 f763, f2194, f2195; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f2192, f612, f763; +mul.f32 f2193, f613, f765; +sub.f32 f773, f2192, f2193; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f2191, f612, f773; +sub.f32 f783, f2191, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f2190, f612, f783; +sub.f32 f793, f2190, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f2189, f612, f793; +sub.f32 f803, f2189, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f2187, f612, f803; +mul.f32 f2188, f613, f805; +sub.f32 f813, f2187, f2188; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; 
+mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f2185, f612, f813; +mul.f32 f2186, f613, f815; +sub.f32 f823, f2185, f2186; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f2184, f612, f823; +sub.f32 f833, f2184, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f2183, f612, f833; +sub.f32 f843, f2183, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f2181, f612, f843; +mul.f32 f2182, f613, f845; +sub.f32 f853, f2181, f2182; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f2179, f612, f853; +mul.f32 f2180, f613, f855; +sub.f32 f863, f2179, f2180; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f2178, f612, f863; +sub.f32 f873, f2178, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f2177, f612, f873; +sub.f32 f883, f2177, f882; +mul.f32 f884, f612, f875; 
+fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f2176, f612, f883; +sub.f32 f893, f2176, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f2174, f612, f893; +mul.f32 f2175, f613, f895; +sub.f32 f903, f2174, f2175; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f2172, f612, f903; +mul.f32 f2173, f613, f905; +sub.f32 f913, f2172, f2173; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16256; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +and.b32 r23, r32, 127; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+512]; +ld.shared.f32 f923, [r13+1024]; +ld.shared.f32 f924, [r13+1536]; +ld.shared.f32 f925, [r13+2048]; +ld.shared.f32 f926, [r13+2560]; +ld.shared.f32 f927, [r13+3072]; +ld.shared.f32 f928, [r13+3584]; +ld.shared.f32 f929, 
[r13+4096]; +ld.shared.f32 f930, [r13+4608]; +ld.shared.f32 f931, [r13+5120]; +ld.shared.f32 f932, [r13+5632]; +ld.shared.f32 f933, [r13+6144]; +ld.shared.f32 f934, [r13+6656]; +ld.shared.f32 f935, [r13+7168]; +ld.shared.f32 f936, [r13+7680]; +ld.shared.f32 f937, [r13+8192]; +ld.shared.f32 f938, [r13+8704]; +ld.shared.f32 f939, [r13+9216]; +ld.shared.f32 f940, [r13+9728]; +ld.shared.f32 f941, [r13+10240]; +ld.shared.f32 f942, [r13+10752]; +ld.shared.f32 f943, [r13+11264]; +ld.shared.f32 f944, [r13+11776]; +ld.shared.f32 f945, [r13+12288]; +ld.shared.f32 f946, [r13+12800]; +ld.shared.f32 f947, [r13+13312]; +ld.shared.f32 f948, [r13+13824]; +ld.shared.f32 f949, [r13+14336]; +ld.shared.f32 f950, [r13+14848]; +ld.shared.f32 f951, [r13+15360]; +ld.shared.f32 f952, [r13+15872]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2231, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+512]; +ld.shared.f32 f955, [r13+1024]; +ld.shared.f32 f956, [r13+1536]; +ld.shared.f32 f957, [r13+2048]; +ld.shared.f32 f958, [r13+2560]; +ld.shared.f32 f959, [r13+3072]; +ld.shared.f32 f960, [r13+3584]; +ld.shared.f32 f961, [r13+4096]; +ld.shared.f32 f962, [r13+4608]; +ld.shared.f32 f963, [r13+5120]; +ld.shared.f32 f964, [r13+5632]; +ld.shared.f32 f965, [r13+6144]; +ld.shared.f32 f966, [r13+6656]; +ld.shared.f32 f967, [r13+7168]; +ld.shared.f32 f968, [r13+7680]; +ld.shared.f32 f969, [r13+8192]; +ld.shared.f32 f970, [r13+8704]; +ld.shared.f32 f971, [r13+9216]; +ld.shared.f32 f972, [r13+9728]; +ld.shared.f32 f973, [r13+10240]; +ld.shared.f32 f974, [r13+10752]; +ld.shared.f32 f975, 
[r13+11264]; +ld.shared.f32 f976, [r13+11776]; +ld.shared.f32 f977, [r13+12288]; +ld.shared.f32 f978, [r13+12800]; +ld.shared.f32 f979, [r13+13312]; +ld.shared.f32 f980, [r13+13824]; +ld.shared.f32 f981, [r13+14336]; +ld.shared.f32 f982, [r13+14848]; +ld.shared.f32 f983, [r13+15360]; +ld.shared.f32 f984, [r13+15872]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2171, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2170, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2169, f2171, f2170; +sub.f32 f996, f2171, f2170; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f2168, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2167, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2166, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2165, f2167, f2166; +sub.f32 f1012, f2167, f2166; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f2164, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f2164, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f2162, f1015, 0fBF3504F3; +mul.f32 f2163, f1016, 0f3F3504F3; +sub.f32 f1023, f2162, f2163; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2161, f2169, f2165; +sub.f32 f1029, f2169, f2165; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2160, f2168, f1020; +sub.f32 f1033, f2168, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f2159, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f2158, f1000, f1025; +sub.f32 f1041, f1000, 
f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2157, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2156, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2155, f2157, f2156; +sub.f32 f1053, f2157, f2156; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f2154, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2153, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2152, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2151, f2153, f2152; +sub.f32 f1069, f2153, f2152; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f2150, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f2150, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f2148, f1072, 0fBF3504F3; +mul.f32 f2149, f1073, 0f3F3504F3; +sub.f32 f1080, f2148, f2149; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2147, f2155, f2151; +sub.f32 f1086, f2155, f2151; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2146, f2154, f1077; +sub.f32 f1090, f2154, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f2145, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f2144, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2142, f1087, 0f3F6C835E; +mul.f32 f2143, f2146, 0f3EC3EF15; +sub.f32 f1101, f2142, f2143; +mul.f32 f1102, f2146, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f2145, 0f3F3504F3; +sub.f32 f1106, f1104, 
f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f2140, f1095, 0f3EC3EF15; +mul.f32 f2141, f2144, 0f3F6C835E; +sub.f32 f1110, f2140, f2141; +mul.f32 f1111, f2144, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f2138, f1089, 0fBEC3EF15; +mul.f32 f2139, f1090, 0f3F6C835E; +sub.f32 f1115, f2138, f2139; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f2136, f1093, 0fBF3504F3; +mul.f32 f2137, f1094, 0f3F3504F3; +sub.f32 f1120, f2136, f2137; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f2134, f1097, 0fBF6C835E; +mul.f32 f2135, f1098, 0f3EC3EF15; +sub.f32 f1125, f2134, f2135; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2133, f2161, f2147; +sub.f32 f1131, f2161, f2147; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2132, f2160, f1103; +sub.f32 f1135, f2160, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2131, f2159, f1107; +sub.f32 f1139, f2159, f1107; +add.f32 f1140, f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f2130, f2158, f1112; +sub.f32 f1143, f2158, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f2129, f1029, f1085; +sub.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f2128, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 f1154, f1036, f1120; +add.f32 f2127, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2126, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2125, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2124, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, 
f1164; +add.f32 f2123, f2125, f2124; +sub.f32 f1171, f2125, f2124; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f2122, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2121, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2120, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2119, f2121, f2120; +sub.f32 f1187, f2121, f2120; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f2118, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f2118, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f2116, f1190, 0fBF3504F3; +mul.f32 f2117, f1191, 0f3F3504F3; +sub.f32 f1198, f2116, f2117; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2115, f2123, f2119; +sub.f32 f1204, f2123, f2119; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2114, f2122, f1195; +sub.f32 f1208, f2122, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f2113, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; +sub.f32 f1215, f1174, f1198; +add.f32 f2112, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2111, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2110, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2109, f2111, f2110; +sub.f32 f1228, f2111, f2110; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f2108, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2107, f960, f976; +sub.f32 
f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2106, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2105, f2107, f2106; +sub.f32 f1244, f2107, f2106; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f2104, f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f2104, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f2102, f1247, 0fBF3504F3; +mul.f32 f2103, f1248, 0f3F3504F3; +sub.f32 f1255, f2102, f2103; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2101, f2109, f2105; +sub.f32 f1261, f2109, f2105; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2100, f2108, f1252; +sub.f32 f1265, f2108, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f2099, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f2098, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2096, f1262, 0f3F6C835E; +mul.f32 f2097, f2100, 0f3EC3EF15; +sub.f32 f1276, f2096, f2097; +mul.f32 f1277, f2100, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f2099, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f2098, 0f3F6C835E; +mul.f32 f2095, f1270, 0f3EC3EF15; +sub.f32 f1285, f2095, f1284; +mul.f32 f1286, f2098, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f2094, f1264, 0fBEC3EF15; +sub.f32 f1290, f2094, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f2092, f1268, 0fBF3504F3; +mul.f32 f2093, f1269, 0f3F3504F3; +sub.f32 f1295, f2092, f2093; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 
0f3F3504F3, f1296; +mul.f32 f2090, f1272, 0fBF6C835E; +mul.f32 f2091, f1273, 0f3EC3EF15; +sub.f32 f1300, f2090, f2091; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2089, f2115, f2101; +sub.f32 f1306, f2115, f2101; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2088, f2114, f1278; +sub.f32 f1310, f2114, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2087, f2113, f1282; +sub.f32 f1314, f2113, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f2086, f2112, f1287; +sub.f32 f1318, f2112, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f2085, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f2084, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f2083, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2082, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2088, 0f3E47C5C2; +mul.f32 f2081, f1307, 0f3F7B14BE; +sub.f32 f1337, f2081, f1336; +mul.f32 f1338, f2088, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, f2087, 0f3EC3EF15; +mul.f32 f2080, f1311, 0f3F6C835E; +sub.f32 f1342, f2080, f1341; +mul.f32 f1343, f2087, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; +mul.f32 f2078, f1315, 0f3F54DB31; +mul.f32 f2079, f2086, 0f3F0E39DA; +sub.f32 f1347, f2078, f2079; +mul.f32 f1348, f2086, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, f1319, 0f3F3504F3; +mul.f32 f1351, f2085, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f2084, 0f3F54DB31; +mul.f32 f2077, f1323, 0f3F0E39DA; +sub.f32 f1356, f2077, f1355; +mul.f32 f1357, f2084, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 
0f3F54DB31, f1357; +mul.f32 f1360, f2083, 0f3F6C835E; +mul.f32 f2076, f1327, 0f3EC3EF15; +sub.f32 f1361, f2076, f1360; +mul.f32 f1362, f2083, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f2074, f1331, 0f3E47C5C2; +mul.f32 f2075, f2082, 0f3F7B14BE; +sub.f32 f1366, f2074, f2075; +mul.f32 f1367, f2082, 0f3E47C5C2; +fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f2072, f1309, 0fBE47C5C2; +mul.f32 f2073, f1310, 0f3F7B14BE; +sub.f32 f1371, f2072, f2073; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f2070, f1313, 0fBEC3EF15; +mul.f32 f2071, f1314, 0f3F6C835E; +sub.f32 f1376, f2070, f2071; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f2068, f1317, 0fBF0E39DA; +mul.f32 f2069, f1318, 0f3F54DB31; +sub.f32 f1381, f2068, f2069; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f2067, f1321, 0fBF3504F3; +sub.f32 f1386, f2067, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 f2066, f1325, 0fBF54DB31; +sub.f32 f1391, f2066, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f2065, f1329, 0fBF6C835E; +sub.f32 f1396, f2065, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f2064, f1333, 0fBF7B14BE; +sub.f32 f1401, f2064, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2063, f2133, f2089; +sub.f32 f1407, f2133, f2089; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2062, f2132, f1339; +sub.f32 f1411, f2132, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2061, f2131, f1344; +sub.f32 f1415, f2131, f1344; 
+add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2060, f2130, f1349; +sub.f32 f1419, f2130, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2059, f2129, f1353; +sub.f32 f1423, f2129, f1353; +add.f32 f1424, f1148, f1356; +sub.f32 f1426, f1148, f1356; +add.f32 f2058, f2128, f1358; +sub.f32 f1427, f2128, f1358; +add.f32 f1428, f1152, f1361; +sub.f32 f1430, f1152, f1361; +add.f32 f2057, f2127, f1363; +sub.f32 f1431, f2127, f1363; +add.f32 f1432, f1156, f1366; +sub.f32 f1434, f1156, f1366; +add.f32 f2056, f2126, f1368; +sub.f32 f1435, f2126, f1368; +sub.f32 f1436, f1130, f1306; +add.f32 f1438, f1130, f1306; +add.f32 f2055, f1131, f1305; +sub.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1371; +sub.f32 f1442, f1134, f1371; +add.f32 f2054, f1135, f1373; +sub.f32 f1443, f1135, f1373; +add.f32 f1444, f1138, f1376; +sub.f32 f1446, f1138, f1376; +add.f32 f2053, f1139, f1378; +sub.f32 f1447, f1139, f1378; +add.f32 f1448, f1142, f1381; +sub.f32 f1450, f1142, f1381; +add.f32 f2052, f1143, f1383; +sub.f32 f1451, f1143, f1383; +add.f32 f1452, f1146, f1386; +sub.f32 f1454, f1146, f1386; +add.f32 f2051, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2050, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2049, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2048, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1472, f2062, f1469; +fma.rn.f32 f1473, f1468, f1408, f1472; +mul.f32 f1474, f1408, f1469; +mul.f32 f1475, f1468, f2062; +sub.f32 f1476, f1475, f1474; +mul.f32 f1478, f1469, f1469; +mul.f32 f2047, f1468, f1468; +sub.f32 f1479, f2047, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, 
f1469, f1468, f1480; +mul.f32 f1482, f2061, f1481; +fma.rn.f32 f1483, f1479, f1412, f1482; +mul.f32 f1484, f1412, f1481; +mul.f32 f1485, f1479, f2061; +sub.f32 f1486, f1485, f1484; +mul.f32 f2045, f1468, f1479; +mul.f32 f2046, f1469, f1481; +sub.f32 f1489, f2045, f2046; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f1492, f2060, f1491; +fma.rn.f32 f1493, f1489, f1416, f1492; +mul.f32 f1494, f1416, f1491; +mul.f32 f1495, f1489, f2060; +sub.f32 f1496, f1495, f1494; +mul.f32 f2043, f1468, f1489; +mul.f32 f2044, f1469, f1491; +sub.f32 f1499, f2043, f2044; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f1502, f2059, f1501; +fma.rn.f32 f1503, f1499, f1420, f1502; +mul.f32 f1504, f1420, f1501; +mul.f32 f1505, f1499, f2059; +sub.f32 f1506, f1505, f1504; +mul.f32 f1508, f1469, f1501; +mul.f32 f2042, f1468, f1499; +sub.f32 f1509, f2042, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1512, f2058, f1511; +fma.rn.f32 f1513, f1509, f1424, f1512; +mul.f32 f1514, f1424, f1511; +mul.f32 f1515, f1509, f2058; +sub.f32 f1516, f1515, f1514; +mul.f32 f1518, f1469, f1511; +mul.f32 f2041, f1468, f1509; +sub.f32 f1519, f2041, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1522, f2057, f1521; +fma.rn.f32 f1523, f1519, f1428, f1522; +mul.f32 f1524, f1428, f1521; +mul.f32 f1525, f1519, f2057; +sub.f32 f1526, f1525, f1524; +mul.f32 f1528, f1469, f1521; +mul.f32 f2040, f1468, f1519; +sub.f32 f1529, f2040, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1532, f2056, f1531; +fma.rn.f32 f1533, f1529, f1432, f1532; +mul.f32 f1534, f1432, f1531; +mul.f32 f1535, f1529, f2056; +sub.f32 f1536, f1535, f1534; +mul.f32 f2038, f1468, f1529; +mul.f32 f2039, f1469, f1531; +sub.f32 f1539, f2038, f2039; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1542, f2055, f1541; +fma.rn.f32 f1543, f1539, 
f1436, f1542; +mul.f32 f1544, f1436, f1541; +mul.f32 f1545, f1539, f2055; +sub.f32 f1546, f1545, f1544; +mul.f32 f2036, f1468, f1539; +mul.f32 f2037, f1469, f1541; +sub.f32 f1549, f2036, f2037; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1552, f2054, f1551; +fma.rn.f32 f1553, f1549, f1440, f1552; +mul.f32 f1554, f1440, f1551; +mul.f32 f1555, f1549, f2054; +sub.f32 f1556, f1555, f1554; +mul.f32 f1558, f1469, f1551; +mul.f32 f2035, f1468, f1549; +sub.f32 f1559, f2035, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1562, f2053, f1561; +fma.rn.f32 f1563, f1559, f1444, f1562; +mul.f32 f1564, f1444, f1561; +mul.f32 f1565, f1559, f2053; +sub.f32 f1566, f1565, f1564; +mul.f32 f1568, f1469, f1561; +mul.f32 f2034, f1468, f1559; +sub.f32 f1569, f2034, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1572, f2052, f1571; +fma.rn.f32 f1573, f1569, f1448, f1572; +mul.f32 f1574, f1448, f1571; +mul.f32 f1575, f1569, f2052; +sub.f32 f1576, f1575, f1574; +mul.f32 f1578, f1469, f1571; +mul.f32 f2033, f1468, f1569; +sub.f32 f1579, f2033, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1582, f2051, f1581; +fma.rn.f32 f1583, f1579, f1452, f1582; +mul.f32 f1584, f1452, f1581; +mul.f32 f1585, f1579, f2051; +sub.f32 f1586, f1585, f1584; +mul.f32 f2031, f1468, f1579; +mul.f32 f2032, f1469, f1581; +sub.f32 f1589, f2031, f2032; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f1592, f2050, f1591; +fma.rn.f32 f1593, f1589, f1456, f1592; +mul.f32 f1594, f1456, f1591; +mul.f32 f1595, f1589, f2050; +sub.f32 f1596, f1595, f1594; +mul.f32 f1598, f1469, f1591; +mul.f32 f2030, f1468, f1589; +sub.f32 f1599, f2030, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1602, f2049, f1601; +fma.rn.f32 f1603, f1599, f1460, f1602; +mul.f32 f1604, f1460, f1601; +mul.f32 f1605, f1599, f2049; 
+sub.f32 f1606, f1605, f1604; +mul.f32 f1608, f1469, f1601; +mul.f32 f2029, f1468, f1599; +sub.f32 f1609, f2029, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1612, f2048, f1611; +fma.rn.f32 f1613, f1609, f1464, f1612; +mul.f32 f1614, f1464, f1611; +mul.f32 f1615, f1609, f2048; +sub.f32 f1616, f1615, f1614; +mul.f32 f1618, f1469, f1611; +mul.f32 f2028, f1468, f1609; +sub.f32 f1619, f2028, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1622, f1407, f1621; +fma.rn.f32 f1623, f1619, f1406, f1622; +mul.f32 f1624, f1406, f1621; +mul.f32 f1625, f1619, f1407; +sub.f32 f1626, f1625, f1624; +mul.f32 f2026, f1468, f1619; +mul.f32 f2027, f1469, f1621; +sub.f32 f1629, f2026, f2027; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1632, f1411, f1631; +fma.rn.f32 f1633, f1629, f1410, f1632; +mul.f32 f1634, f1410, f1631; +mul.f32 f1635, f1629, f1411; +sub.f32 f1636, f1635, f1634; +mul.f32 f2024, f1468, f1629; +mul.f32 f2025, f1469, f1631; +sub.f32 f1639, f2024, f2025; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f1642, f1415, f1641; +fma.rn.f32 f1643, f1639, f1414, f1642; +mul.f32 f1644, f1414, f1641; +mul.f32 f1645, f1639, f1415; +sub.f32 f1646, f1645, f1644; +mul.f32 f1648, f1469, f1641; +mul.f32 f2023, f1468, f1639; +sub.f32 f1649, f2023, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1652, f1419, f1651; +fma.rn.f32 f1653, f1649, f1418, f1652; +mul.f32 f1654, f1418, f1651; +mul.f32 f1655, f1649, f1419; +sub.f32 f1656, f1655, f1654; +mul.f32 f1658, f1469, f1651; +mul.f32 f2022, f1468, f1649; +sub.f32 f1659, f2022, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1662, f1423, f1661; +fma.rn.f32 f1663, f1659, f1422, f1662; +mul.f32 f1664, f1422, f1661; +mul.f32 f1665, f1659, f1423; +sub.f32 f1666, f1665, f1664; +mul.f32 f1668, f1469, f1661; +mul.f32 f2021, 
f1468, f1659; +sub.f32 f1669, f2021, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1672, f1427, f1671; +fma.rn.f32 f1673, f1669, f1426, f1672; +mul.f32 f1674, f1426, f1671; +mul.f32 f1675, f1669, f1427; +sub.f32 f1676, f1675, f1674; +mul.f32 f2019, f1468, f1669; +mul.f32 f2020, f1469, f1671; +sub.f32 f1679, f2019, f2020; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1682, f1431, f1681; +fma.rn.f32 f1683, f1679, f1430, f1682; +mul.f32 f1684, f1430, f1681; +mul.f32 f1685, f1679, f1431; +sub.f32 f1686, f1685, f1684; +mul.f32 f2017, f1468, f1679; +mul.f32 f2018, f1469, f1681; +sub.f32 f1689, f2017, f2018; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1692, f1435, f1691; +fma.rn.f32 f1693, f1689, f1434, f1692; +mul.f32 f1694, f1434, f1691; +mul.f32 f1695, f1689, f1435; +sub.f32 f1696, f1695, f1694; +mul.f32 f1698, f1469, f1691; +mul.f32 f2016, f1468, f1689; +sub.f32 f1699, f2016, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1702, f1439, f1701; +fma.rn.f32 f1703, f1699, f1438, f1702; +mul.f32 f1704, f1438, f1701; +mul.f32 f1705, f1699, f1439; +sub.f32 f1706, f1705, f1704; +mul.f32 f1708, f1469, f1701; +mul.f32 f2015, f1468, f1699; +sub.f32 f1709, f2015, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1712, f1443, f1711; +fma.rn.f32 f1713, f1709, f1442, f1712; +mul.f32 f1714, f1442, f1711; +mul.f32 f1715, f1709, f1443; +sub.f32 f1716, f1715, f1714; +mul.f32 f2013, f1468, f1709; +mul.f32 f2014, f1469, f1711; +sub.f32 f1719, f2013, f2014; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f1722, f1447, f1721; +fma.rn.f32 f1723, f1719, f1446, f1722; +mul.f32 f1724, f1446, f1721; +mul.f32 f1725, f1719, f1447; +sub.f32 f1726, f1725, f1724; +mul.f32 f2011, f1468, f1719; +mul.f32 f2012, f1469, f1721; +sub.f32 f1729, f2011, f2012; +mul.f32 f1730, f1468, f1721; 
+fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1732, f1451, f1731; +fma.rn.f32 f1733, f1729, f1450, f1732; +mul.f32 f1734, f1450, f1731; +mul.f32 f1735, f1729, f1451; +sub.f32 f1736, f1735, f1734; +mul.f32 f1738, f1469, f1731; +mul.f32 f2010, f1468, f1729; +sub.f32 f1739, f2010, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1742, f1455, f1741; +fma.rn.f32 f1743, f1739, f1454, f1742; +mul.f32 f1744, f1454, f1741; +mul.f32 f1745, f1739, f1455; +sub.f32 f1746, f1745, f1744; +mul.f32 f1748, f1469, f1741; +mul.f32 f2009, f1468, f1739; +sub.f32 f1749, f2009, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1752, f1459, f1751; +fma.rn.f32 f1753, f1749, f1458, f1752; +mul.f32 f1754, f1458, f1751; +mul.f32 f1755, f1749, f1459; +sub.f32 f1756, f1755, f1754; +mul.f32 f1758, f1469, f1751; +mul.f32 f2008, f1468, f1749; +sub.f32 f1759, f2008, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1762, f1463, f1761; +fma.rn.f32 f1763, f1759, f1462, f1762; +mul.f32 f1764, f1462, f1761; +mul.f32 f1765, f1759, f1463; +sub.f32 f1766, f1765, f1764; +mul.f32 f2006, f1468, f1759; +mul.f32 f2007, f1469, f1761; +sub.f32 f1769, f2006, f2007; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f1772, f1467, f1771; +fma.rn.f32 f1773, f1769, f1466, f1772; +mul.f32 f1774, f1466, f1771; +mov.u32 r33, %tid.x; +mul.f32 f1775, f1769, f1467; +sub.f32 f1776, f1775, f1774; +and.b32 r22, r33, 96; +shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 12288; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1473; +st.shared.f32 [r20+256], f1483; +st.shared.f32 [r20+384], f1493; +st.shared.f32 [r20+512], f1503; +st.shared.f32 [r20+640], f1513; +st.shared.f32 [r20+768], f1523; +st.shared.f32 
[r20+896], f1533; +st.shared.f32 [r20+1024], f1543; +st.shared.f32 [r20+1152], f1553; +st.shared.f32 [r20+1280], f1563; +st.shared.f32 [r20+1408], f1573; +st.shared.f32 [r20+1536], f1583; +st.shared.f32 [r20+1664], f1593; +st.shared.f32 [r20+1792], f1603; +st.shared.f32 [r20+1920], f1613; +st.shared.f32 [r20+2048], f1623; +st.shared.f32 [r20+2176], f1633; +st.shared.f32 [r20+2304], f1643; +st.shared.f32 [r20+2432], f1653; +st.shared.f32 [r20+2560], f1663; +st.shared.f32 [r20+2688], f1673; +st.shared.f32 [r20+2816], f1683; +st.shared.f32 [r20+2944], f1693; +st.shared.f32 [r20+3072], f1703; +st.shared.f32 [r20+3200], f1713; +st.shared.f32 [r20+3328], f1723; +st.shared.f32 [r20+3456], f1733; +st.shared.f32 [r20+3584], f1743; +st.shared.f32 [r20+3712], f1753; +st.shared.f32 [r20+3840], f1763; +st.shared.f32 [r20+3968], f1773; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+512]; +ld.shared.f32 f1779, [r21+1024]; +ld.shared.f32 f1780, [r21+1536]; +ld.shared.f32 f1781, [r21+2048]; +ld.shared.f32 f1782, [r21+2560]; +ld.shared.f32 f1783, [r21+3072]; +ld.shared.f32 f1784, [r21+3584]; +ld.shared.f32 f1785, [r21+4096]; +ld.shared.f32 f1786, [r21+4608]; +ld.shared.f32 f1787, [r21+5120]; +ld.shared.f32 f1788, [r21+5632]; +ld.shared.f32 f1789, [r21+6144]; +ld.shared.f32 f1790, [r21+6656]; +ld.shared.f32 f1791, [r21+7168]; +ld.shared.f32 f1792, [r21+7680]; +ld.shared.f32 f1793, [r21+8192]; +ld.shared.f32 f1794, [r21+8704]; +ld.shared.f32 f1795, [r21+9216]; +ld.shared.f32 f1796, [r21+9728]; +ld.shared.f32 f1797, [r21+10240]; +ld.shared.f32 f1798, [r21+10752]; +ld.shared.f32 f1799, [r21+11264]; +ld.shared.f32 f1800, [r21+11776]; +ld.shared.f32 f1801, [r21+12288]; +ld.shared.f32 f1802, [r21+12800]; +ld.shared.f32 f1803, [r21+13312]; +ld.shared.f32 f1804, [r21+13824]; +ld.shared.f32 f1805, [r21+14336]; +ld.shared.f32 f1806, [r21+14848]; +ld.shared.f32 f1807, [r21+15360]; +ld.shared.f32 f1808, [r21+15872]; +barrier.sync 0; 
+st.shared.f32 [r20], f2063; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+512]; +ld.shared.f32 f1811, [r21+1024]; +ld.shared.f32 f1812, [r21+1536]; +ld.shared.f32 f1813, [r21+2048]; +ld.shared.f32 f1814, [r21+2560]; +ld.shared.f32 f1815, [r21+3072]; +ld.shared.f32 f1816, [r21+3584]; +ld.shared.f32 f1817, [r21+4096]; +ld.shared.f32 f1818, [r21+4608]; +ld.shared.f32 f1819, [r21+5120]; +ld.shared.f32 f1820, [r21+5632]; +ld.shared.f32 f1821, [r21+6144]; +ld.shared.f32 f1822, [r21+6656]; +ld.shared.f32 f1823, [r21+7168]; +ld.shared.f32 f1824, [r21+7680]; +ld.shared.f32 f1825, [r21+8192]; +ld.shared.f32 f1826, [r21+8704]; +ld.shared.f32 f1827, [r21+9216]; +ld.shared.f32 f1828, [r21+9728]; +ld.shared.f32 f1829, [r21+10240]; +ld.shared.f32 f1830, [r21+10752]; +ld.shared.f32 f1831, [r21+11264]; +ld.shared.f32 f1832, [r21+11776]; +ld.shared.f32 f1833, [r21+12288]; +ld.shared.f32 f1834, [r21+12800]; +ld.shared.f32 f1835, 
[r21+13312]; +ld.shared.f32 f1836, [r21+13824]; +ld.shared.f32 f1837, [r21+14336]; +ld.shared.f32 f1838, [r21+14848]; +ld.shared.f32 f1839, [r21+15360]; +ld.shared.f32 f1840, [r21+15872]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2005, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2004, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1778, f1794; +sub.f32 f1851, f1778, f1794; +add.f32 f2003, f1810, f1826; +sub.f32 f1852, f1810, f1826; +add.f32 f1853, f1786, f1802; +sub.f32 f1855, f1786, f1802; +add.f32 f2002, f1818, f1834; +sub.f32 f1856, f1818, f1834; +add.f32 f1857, f1779, f1795; +sub.f32 f1859, f1779, f1795; +add.f32 f2001, f1811, f1827; +sub.f32 f1860, f1811, f1827; +add.f32 f1861, f1787, f1803; +sub.f32 f1863, f1787, f1803; +add.f32 f2000, f1819, f1835; +sub.f32 f1864, f1819, f1835; +add.f32 f1865, f1780, f1796; +sub.f32 f1867, f1780, f1796; +add.f32 f1999, f1812, f1828; +sub.f32 f1868, f1812, f1828; +add.f32 f1869, f1788, f1804; +sub.f32 f1871, f1788, f1804; +add.f32 f1998, f1820, f1836; +sub.f32 f1872, f1820, f1836; +add.f32 f1873, f1781, f1797; +sub.f32 f1875, f1781, f1797; +add.f32 f1997, f1813, f1829; +sub.f32 f1876, f1813, f1829; +add.f32 f1877, f1789, f1805; +sub.f32 f1879, f1789, f1805; +add.f32 f1996, f1821, f1837; +sub.f32 f1880, f1821, f1837; +add.f32 f1881, f1782, f1798; +sub.f32 f1883, f1782, f1798; +add.f32 f1995, f1814, f1830; +sub.f32 f1884, f1814, f1830; +add.f32 f1885, f1790, f1806; +sub.f32 f1887, f1790, f1806; +add.f32 f1994, f1822, f1838; +sub.f32 f1888, f1822, f1838; +add.f32 f1889, f1783, f1799; +sub.f32 f1891, f1783, f1799; +add.f32 f1993, f1815, f1831; +sub.f32 f1892, f1815, f1831; +add.f32 f1893, f1791, f1807; +sub.f32 f1895, f1791, f1807; +add.f32 f1992, f1823, f1839; +sub.f32 f1896, f1823, f1839; +add.f32 f1897, f1784, f1800; +sub.f32 f1899, f1784, f1800; +add.f32 f1991, f1816, f1832; +sub.f32 f1900, f1816, f1832; +add.f32 
f1901, f1792, f1808; +sub.f32 f1903, f1792, f1808; +add.f32 f1990, f1824, f1840; +sub.f32 f1904, f1824, f1840; +add.f32 %1, f2005, f2004; +add.f32 %0, f1841, f1845; +add.f32 %3, f2003, f2002; +add.f32 %2, f1849, f1853; +add.f32 %5, f2001, f2000; +add.f32 %4, f1857, f1861; +add.f32 %7, f1999, f1998; +add.f32 %6, f1865, f1869; +add.f32 %8, f1873, f1877; +add.f32 %9, f1997, f1996; +add.f32 %10, f1881, f1885; +add.f32 %11, f1995, f1994; +add.f32 %13, f1993, f1992; +add.f32 %12, f1889, f1893; +add.f32 %15, f1991, f1990; +add.f32 %14, f1897, f1901; +sub.f32 %16, f1843, f1848; +add.f32 %17, f1844, f1847; +sub.f32 %18, f1851, f1856; +add.f32 %19, f1852, f1855; +add.f32 %21, f1860, f1863; +sub.f32 %20, f1859, f1864; +add.f32 %23, f1868, f1871; +sub.f32 %22, f1867, f1872; +add.f32 %25, f1876, f1879; +sub.f32 %24, f1875, f1880; +sub.f32 %26, f1883, f1888; +add.f32 %27, f1884, f1887; +sub.f32 %28, f1891, f1896; +add.f32 %29, f1892, f1895; +sub.f32 %30, f1899, f1904; +add.f32 %31, f1900, f1903; +sub.f32 %32, f1841, f1845; +sub.f32 %33, f2005, f2004; +sub.f32 %34, f1849, f1853; +sub.f32 %35, f2003, f2002; +sub.f32 %36, f1857, f1861; +sub.f32 %37, f2001, f2000; +sub.f32 %38, f1865, f1869; +sub.f32 %39, f1999, f1998; +sub.f32 %40, f1873, f1877; +sub.f32 %41, f1997, f1996; +sub.f32 %42, f1881, f1885; +sub.f32 %43, f1995, f1994; +sub.f32 %44, f1889, f1893; +sub.f32 %45, f1993, f1992; +sub.f32 %46, f1897, f1901; +sub.f32 %47, f1991, f1990; +sub.f32 %49, f1844, f1847; +add.f32 %48, f1843, f1848; +sub.f32 %51, f1852, f1855; +add.f32 %50, f1851, f1856; +sub.f32 %53, f1860, f1863; +add.f32 %52, f1859, f1864; +sub.f32 %55, f1868, f1871; +add.f32 %54, f1867, f1872; +sub.f32 %57, f1876, f1879; +add.f32 %56, f1875, f1880; +sub.f32 %59, f1884, f1887; +add.f32 %58, f1883, f1888; +sub.f32 %61, f1892, f1895; +add.f32 %60, f1891, f1896; +sub.f32 %63, f1900, f1903; +add.f32 %62, f1899, f1904; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), 
"=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_4096), "l"(lut_sp_32_128), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), 
"f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<309, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<338>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -32768; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 32736; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 
f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+8192]; +ld.shared.v2.f32 {f70, f71}, [r13+16384]; +ld.shared.v2.f32 {f74, f75}, [r13+24576]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 32640; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 
r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+8192]; +ld.shared.v2.f32 {f131, f132}, [r20+16384]; +ld.shared.v2.f32 {f135, f136}, [r20+24576]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 1008; +bfe.u32 r22, r5, 4, 6; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 32256; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180, f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+8192]; +ld.shared.v2.f32 {f192, f193}, [r27+16384]; +ld.shared.v2.f32 {f196, f197}, 
[r27+24576]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +sub.f32 f210, f202, f207; +add.f32 f211, f203, f206; +add.f32 f212, f202, f207; +sub.f32 f213, f203, f206; +and.b32 r28, r5, 960; +bfe.u32 r29, r5, 6, 4; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f211, f215; +mul.f32 f219, f210, f215; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f209, f225; +mul.f32 f227, f208, f225; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f213, f233; +mul.f32 f235, f212, f233; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 30720; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f214, f210, f218; +sub.f32 f240, f220, f219; +st.shared.v2.f32 [r33+512], {f239, f240}; +fma.rn.f32 f241, f223, f208, f226; +sub.f32 f242, f228, f227; +st.shared.v2.f32 [r33+1024], {f241, f242}; +sub.f32 f243, f236, f235; +fma.rn.f32 f244, f231, f212, f234; +st.shared.v2.f32 [r33+1536], {f244, f243}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+8192]; +ld.shared.v2.f32 {f253, f254}, [r34+16384]; +ld.shared.v2.f32 {f257, f258}, [r34+24576]; +add.f32 f261, f245, f253; +add.f32 f262, f246, f254; +sub.f32 f263, f245, f253; +sub.f32 f264, f246, f254; +add.f32 f265, f249, f257; +add.f32 f266, f250, f258; 
+sub.f32 f267, f249, f257; +sub.f32 f268, f250, f258; +sub.f32 f269, f261, f265; +sub.f32 f270, f262, f266; +sub.f32 f271, f263, f268; +add.f32 f272, f264, f267; +add.f32 f273, f263, f268; +sub.f32 f274, f264, f267; +and.b32 r35, r5, 768; +bfe.u32 r36, r5, 8, 2; +mul.wide.u32 rd15, r36, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f275, f276}, [rd17]; +mul.f32 f279, f272, f276; +mul.f32 f280, f271, f276; +mul.f32 f281, f275, f272; +mul.f32 f282, f275, f275; +mul.f32 f283, f276, f276; +sub.f32 f284, f282, f283; +mul.f32 f285, f276, f275; +fma.rn.f32 f286, f276, f275, f285; +mul.f32 f287, f270, f286; +mul.f32 f288, f269, f286; +mul.f32 f289, f284, f270; +mul.f32 f290, f275, f284; +mul.f32 f291, f276, f286; +sub.f32 f292, f290, f291; +mul.f32 f293, f275, f286; +fma.rn.f32 f294, f276, f284, f293; +mul.f32 f295, f274, f294; +mul.f32 f296, f273, f294; +mul.f32 f297, f292, f274; +and.b32 r37, r10, 2040; +add.s32 r38, r9, r37; +barrier.sync 0; +and.b32 r39, r7, 24576; +add.s32 r40, r38, r39; +add.f32 f298, f262, f266; +add.f32 f299, f261, f265; +st.shared.v2.f32 [r40], {f299, f298}; +fma.rn.f32 f300, f275, f271, f279; +sub.f32 f301, f281, f280; +st.shared.v2.f32 [r40+2048], {f300, f301}; +fma.rn.f32 f302, f284, f269, f287; +sub.f32 f303, f289, f288; +st.shared.v2.f32 [r40+4096], {f302, f303}; +sub.f32 f304, f297, f296; +fma.rn.f32 f305, f292, f273, f295; +st.shared.v2.f32 [r40+6144], {f305, f304}; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.v2.f32 {f306, f307}, [r41]; +ld.shared.v2.f32 {f310, f311}, [r41+8192]; +ld.shared.v2.f32 {f314, f315}, [r41+16384]; +ld.shared.v2.f32 {f318, f319}, [r41+24576]; +add.f32 f322, f306, f314; +add.f32 f323, f307, f315; +sub.f32 f324, f306, f314; +sub.f32 f325, f307, f315; +add.f32 f326, f310, f318; +add.f32 f327, f311, f319; +sub.f32 f328, f310, f318; +sub.f32 f329, f311, f319; +add.f32 %1, f323, f327; +add.f32 %0, f322, f326; +add.f32 %3, f325, f328; +sub.f32 %2, f324, f329; +sub.f32 %5, f323, f327; 
+sub.f32 %4, f322, f326; +sub.f32 %7, f325, f328; +add.f32 %6, f324, f329; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<310, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<298>; +.reg .b32 r<43>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 14; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %14, %19; +add.f32 f18, %15, %21; +sub.f32 f19, %14, %19; +sub.f32 f20, %15, %21; +add.f32 f21, %16, %22; +add.f32 f22, %18, %23; +sub.f32 f23, %16, %22; +sub.f32 f24, %18, %23; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, 
f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -16384; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16368; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+4096]; +ld.shared.f32 f64, [r13+8192]; +ld.shared.f32 f65, [r13+12288]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+4096]; +ld.shared.f32 f68, [r13+8192]; +ld.shared.f32 f69, [r13+12288]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 16320; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; 
+st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+4096]; +ld.shared.f32 f117, [r21+8192]; +ld.shared.f32 f118, [r21+12288]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+4096]; +ld.shared.f32 f121, [r21+8192]; +ld.shared.f32 f122, [r21+12288]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126, f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 1008; +bfe.u32 r23, r5, 4, 6; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; +fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 16128; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], 
f154; +st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+4096]; +ld.shared.f32 f170, [r28+8192]; +ld.shared.f32 f171, [r28+12288]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+4096]; +ld.shared.f32 f174, [r28+8192]; +ld.shared.f32 f175, [r28+12288]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +sub.f32 f188, f178, f183; +add.f32 f189, f179, f182; +add.f32 f190, f178, f183; +sub.f32 f191, f179, f182; +and.b32 r29, r5, 960; +bfe.u32 r30, r5, 6, 4; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f189, f193; +fma.rn.f32 f197, f192, f188, f196; +mul.f32 f198, f188, f193; +mul.f32 f199, f192, f189; +sub.f32 f200, f199, f198; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f187, f205; +fma.rn.f32 f207, f203, f186, f206; +mul.f32 f208, f186, f205; +mul.f32 f209, f203, f187; +sub.f32 f210, f209, f208; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f191, f215; +fma.rn.f32 f217, f213, f190, f216; +mul.f32 f218, f190, f215; +mul.f32 f219, f213, f191; +sub.f32 f220, f219, f218; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 15360; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f197; 
+st.shared.f32 [r34+512], f207; +st.shared.f32 [r34+768], f217; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+4096]; +ld.shared.f32 f223, [r35+8192]; +ld.shared.f32 f224, [r35+12288]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+4096]; +ld.shared.f32 f227, [r35+8192]; +ld.shared.f32 f228, [r35+12288]; +add.f32 f229, f221, f223; +add.f32 f230, f225, f227; +sub.f32 f231, f221, f223; +sub.f32 f232, f225, f227; +add.f32 f233, f222, f224; +add.f32 f234, f226, f228; +sub.f32 f235, f222, f224; +sub.f32 f236, f226, f228; +add.f32 f237, f229, f233; +add.f32 f238, f230, f234; +sub.f32 f239, f229, f233; +sub.f32 f240, f230, f234; +sub.f32 f241, f231, f236; +add.f32 f242, f232, f235; +add.f32 f243, f231, f236; +sub.f32 f244, f232, f235; +and.b32 r36, r5, 768; +bfe.u32 r37, r5, 8, 2; +mul.wide.u32 rd15, r37, 8; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f245, f246}, [rd17]; +mul.f32 f249, f242, f246; +fma.rn.f32 f250, f245, f241, f249; +mul.f32 f251, f241, f246; +mul.f32 f252, f245, f242; +sub.f32 f253, f252, f251; +mul.f32 f254, f245, f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f240, f258; +fma.rn.f32 f260, f256, f239, f259; +mul.f32 f261, f239, f258; +mul.f32 f262, f256, f240; +sub.f32 f263, f262, f261; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f244, f268; +fma.rn.f32 f270, f266, f243, f269; +mul.f32 f271, f243, f268; +mul.f32 f272, f266, f244; +sub.f32 f273, f272, f271; +and.b32 r38, r16, 1020; +add.s32 r39, r10, r38; +barrier.sync 0; +and.b32 r40, r8, 12288; +add.s32 r41, r39, r40; +st.shared.f32 [r41], f237; +st.shared.f32 
[r41+1024], f250; +st.shared.f32 [r41+2048], f260; +st.shared.f32 [r41+3072], f270; +barrier.sync 0; +mad.lo.s32 r42, r36, -12, r41; +ld.shared.f32 f274, [r42]; +ld.shared.f32 f275, [r42+4096]; +ld.shared.f32 f276, [r42+8192]; +ld.shared.f32 f277, [r42+12288]; +barrier.sync 0; +st.shared.f32 [r41], f238; +st.shared.f32 [r41+1024], f253; +st.shared.f32 [r41+2048], f263; +st.shared.f32 [r41+3072], f273; +barrier.sync 0; +ld.shared.f32 f278, [r42]; +ld.shared.f32 f279, [r42+4096]; +ld.shared.f32 f280, [r42+8192]; +ld.shared.f32 f281, [r42+12288]; +add.f32 f282, f274, f276; +add.f32 f283, f278, f280; +sub.f32 f284, f274, f276; +sub.f32 f285, f278, f280; +add.f32 f286, f275, f277; +add.f32 f287, f279, f281; +sub.f32 f288, f275, f277; +sub.f32 f289, f279, f281; +add.f32 %0, f282, f286; +add.f32 %1, f283, f287; +add.f32 %3, f285, f288; +sub.f32 %2, f284, f289; +sub.f32 %4, f282, f286; +sub.f32 %5, f283, f287; +sub.f32 %7, f285, f288; +add.f32 %6, f284, f289; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_4096), "l"(lut_sp_4_1024), "l"(lut_sp_4_256), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..6a442147fa924 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_fwd.hpp.inc @@ -0,0 +1,3706 @@ +#ifndef CUFFTDX_FFT_4096_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_4096_FP64_FWD_PTX_HPP + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<492, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<513>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 
rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+8192]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; 
+and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32704; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+4096]; +ld.shared.f64 fd160, [r13+8192]; +ld.shared.f64 fd161, [r13+12288]; +ld.shared.f64 fd162, [r13+16384]; +ld.shared.f64 fd163, [r13+20480]; +ld.shared.f64 fd164, [r13+24576]; +ld.shared.f64 fd165, [r13+28672]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+4096]; +ld.shared.f64 fd168, [r13+8192]; +ld.shared.f64 fd169, [r13+12288]; +ld.shared.f64 fd170, [r13+16384]; +ld.shared.f64 fd171, [r13+20480]; +ld.shared.f64 fd172, [r13+24576]; +ld.shared.f64 fd173, [r13+28672]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, 
fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 504; +bfe.u32 r15, r5, 3, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, 
fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+1024]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 32256; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+4096]; +ld.shared.f64 fd301, [r21+8192]; +ld.shared.f64 fd302, [r21+12288]; +ld.shared.f64 fd303, [r21+16384]; +ld.shared.f64 fd304, [r21+20480]; +ld.shared.f64 fd305, [r21+24576]; +ld.shared.f64 fd306, [r21+28672]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], 
fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+4096]; +ld.shared.f64 fd309, [r21+8192]; +ld.shared.f64 fd310, [r21+12288]; +ld.shared.f64 fd311, [r21+16384]; +ld.shared.f64 fd312, [r21+20480]; +ld.shared.f64 fd313, [r21+24576]; +ld.shared.f64 fd314, [r21+28672]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +add.f64 fd327, fd317, fd322; +sub.f64 fd328, fd318, fd321; +sub.f64 fd329, fd317, fd322; +add.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +add.f64 fd343, fd333, fd338; +sub.f64 fd344, fd334, fd337; +sub.f64 fd345, fd333, fd338; +add.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0dBFE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd344, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd351, fd343, 0dBFE6A09E667F3BCD, fd350; +mul.f64 fd352, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd353, fd346, 0dBFE6A09E667F3BCD; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd352, fd353; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd351; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd351; +add.f64 fd364, fd325, fd342; +sub.f64 fd365, fd326, fd341; +sub.f64 fd366, fd325, fd342; +add.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, 
fd354; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd354; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 448; +bfe.u32 r23, r5, 6, 3; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd372, fd360; +mul.f64 fd377, fd373, fd361; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd372, fd361; +fma.rn.f64 fd380, fd373, fd360, fd379; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd383, fd364; +mul.f64 fd387, fd385, fd365; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd383, fd365; +fma.rn.f64 fd390, fd385, fd364, fd389; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd393, fd368; +mul.f64 fd397, fd395, fd369; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd393, fd369; +fma.rn.f64 fd400, fd395, fd368, fd399; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd403, fd358; +mul.f64 fd407, fd405, fd359; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd403, fd359; +fma.rn.f64 fd410, fd405, fd358, fd409; +ld.global.v2.f64 {fd411, fd412}, [rd11+128]; +mul.f64 fd415, fd411, fd362; +mul.f64 fd416, fd412, fd363; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, fd411, fd363; +fma.rn.f64 fd419, fd412, fd362, fd418; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd422, fd366; +mul.f64 fd426, fd424, fd367; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd422, fd367; +fma.rn.f64 fd429, fd424, fd366, fd428; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; 
+mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd432, fd370; +mul.f64 fd436, fd434, fd371; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd432, fd371; +fma.rn.f64 fd439, fd434, fd370, fd438; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 28672; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd378; +st.shared.f64 [r27+1024], fd388; +st.shared.f64 [r27+1536], fd398; +st.shared.f64 [r27+2048], fd408; +st.shared.f64 [r27+2560], fd417; +st.shared.f64 [r27+3072], fd427; +st.shared.f64 [r27+3584], fd437; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+4096]; +ld.shared.f64 fd442, [r28+8192]; +ld.shared.f64 fd443, [r28+12288]; +ld.shared.f64 fd444, [r28+16384]; +ld.shared.f64 fd445, [r28+20480]; +ld.shared.f64 fd446, [r28+24576]; +ld.shared.f64 fd447, [r28+28672]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+4096]; +ld.shared.f64 fd450, [r28+8192]; +ld.shared.f64 fd451, [r28+12288]; +ld.shared.f64 fd452, [r28+16384]; +ld.shared.f64 fd453, [r28+20480]; +ld.shared.f64 fd454, [r28+24576]; +ld.shared.f64 fd455, [r28+28672]; +add.f64 fd456, fd440, fd444; +add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd456, fd460; +add.f64 fd465, fd457, fd461; +sub.f64 fd466, fd456, fd460; +sub.f64 fd467, fd457, fd461; +add.f64 fd468, fd458, fd463; +sub.f64 fd469, fd459, fd462; +sub.f64 fd470, fd458, fd463; +add.f64 fd471, fd459, fd462; +add.f64 fd472, 
fd441, fd445; +add.f64 fd473, fd449, fd453; +sub.f64 fd474, fd441, fd445; +sub.f64 fd475, fd449, fd453; +add.f64 fd476, fd443, fd447; +add.f64 fd477, fd451, fd455; +sub.f64 fd478, fd443, fd447; +sub.f64 fd479, fd451, fd455; +add.f64 fd480, fd472, fd476; +add.f64 fd481, fd473, fd477; +sub.f64 fd482, fd472, fd476; +sub.f64 fd483, fd473, fd477; +add.f64 fd484, fd474, fd479; +sub.f64 fd485, fd475, fd478; +sub.f64 fd486, fd474, fd479; +add.f64 fd487, fd475, fd478; +mul.f64 fd488, fd484, 0d3FE6A09E667F3BCD; +mul.f64 fd489, fd485, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd488, fd489; +mul.f64 fd491, fd485, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd492, fd484, 0dBFE6A09E667F3BCD, fd491; +mul.f64 fd493, fd486, 0dBFE6A09E667F3BCD; +mul.f64 fd494, fd487, 0dBFE6A09E667F3BCD; +sub.f64 fd495, fd493, fd494; +add.f64 fd496, fd493, fd494; +add.f64 %0, fd464, fd480; +add.f64 %1, fd465, fd481; +add.f64 %3, fd469, fd492; +add.f64 %2, fd468, fd490; +sub.f64 %5, fd467, fd482; +add.f64 %4, fd466, fd483; +add.f64 %7, fd471, fd496; +add.f64 %6, fd470, fd495; +sub.f64 %8, fd464, fd480; +sub.f64 %9, fd465, fd481; +sub.f64 %11, fd469, fd492; +sub.f64 %10, fd468, fd490; +add.f64 %13, fd467, fd482; +sub.f64 %12, fd466, fd483; +sub.f64 %15, fd471, fd496; +sub.f64 %14, fd470, fd495; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_4096), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<494, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<49>; +.reg .f64 fd<1232>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1214, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1212, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1211, fd1214, fd1212; +sub.f64 fd76, fd1214, fd1212; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd1210, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1207, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1205, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1204, fd1207, fd1205; +sub.f64 fd92, fd1207, fd1205; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd1203, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd1203, 0dBFE6A09E667F3BCD; +mul.f64 fd1202, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd1202, fd98; +mul.f64 fd100, fd1203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1201, fd1211, fd1204; +sub.f64 fd109, fd1211, fd1204; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1200, fd1210, fd101; +sub.f64 fd113, fd1210, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd1199, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd1198, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 
fd1196, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1193, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1192, fd1196, fd1193; +sub.f64 fd133, fd1196, fd1193; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd1191, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1189, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1187, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1186, fd1189, fd1187; +sub.f64 fd149, fd1189, fd1187; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd1185, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd1185, 0dBFE6A09E667F3BCD; +mul.f64 fd1184, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd1184, fd155; +mul.f64 fd157, fd1185, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1183, fd1192, fd1186; +sub.f64 fd166, fd1192, fd1186; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1182, fd1191, fd158; +sub.f64 fd170, fd1191, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd1181, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd1180, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1178, fd167, 0d3FED906BCF328D46; +mul.f64 fd1179, fd1182, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd1178, fd1179; +mul.f64 fd182, fd1182, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd1176, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd1177, fd1181, 
0dBFE6A09E667F3BCD; +sub.f64 fd186, fd1176, fd1177; +mul.f64 fd187, fd1181, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd1174, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd1175, fd1180, 0dBFED906BCF328D46; +sub.f64 fd191, fd1174, fd1175; +mul.f64 fd192, fd1180, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd1172, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd1173, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd1172, fd1173; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd1170, fd177, 0dBFED906BCF328D46; +mul.f64 fd1171, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd1170, fd1171; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1169, fd1200, fd183; +sub.f64 fd213, fd1200, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1168, fd1199, fd188; +sub.f64 fd217, fd1199, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd1167, fd1198, fd193; +sub.f64 fd221, fd1198, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd1166, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd1165, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd1164, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1163, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; 
+ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd1169; +mul.f64 fd244, fd238, fd1169; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1162, fd238, fd238; +sub.f64 fd247, fd1162, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd1168; +mul.f64 fd252, fd247, fd1168; +mul.f64 fd1160, fd238, fd247; +mul.f64 fd1161, fd239, fd249; +sub.f64 fd255, fd1160, fd1161; +mul.f64 fd1159, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd1167; +mul.f64 fd260, fd255, fd1167; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1158, fd238, fd255; +sub.f64 fd263, fd1158, fd262; +mul.f64 fd1157, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd1166; +mul.f64 fd268, fd263, fd1166; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1156, fd238, fd263; +sub.f64 fd271, fd1156, fd270; +mul.f64 fd1155, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd1165; +mul.f64 fd276, fd271, fd1165; +mul.f64 fd1153, fd238, fd271; +mul.f64 fd1154, fd239, fd273; +sub.f64 fd279, fd1153, fd1154; +mul.f64 fd1152, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd1164; +mul.f64 fd284, fd279, fd1164; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1151, fd238, fd279; +sub.f64 fd287, fd1151, fd286; +mul.f64 fd1150, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd1163; +mul.f64 fd292, fd287, fd1163; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1149, fd238, fd287; +sub.f64 fd295, fd1149, fd294; +mul.f64 fd1148, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1147, fd1201, fd1183; +sub.f64 fd1146, fd106, fd163; +mul.f64 fd298, fd295, fd1146; +mul.f64 fd299, fd297, fd1147; +mul.f64 fd300, fd295, fd1147; +ld.global.v2.f64 {fd301, fd302}, 
[rd5+4096]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1144, fd238, fd301; +mul.f64 fd1145, fd239, fd302; +sub.f64 fd310, fd1144, fd1145; +mul.f64 fd1143, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1142, fd238, fd310; +sub.f64 fd318, fd1142, fd317; +mul.f64 fd1141, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1139, fd238, fd318; +mul.f64 fd1140, fd239, fd320; +sub.f64 fd326, fd1139, fd1140; +mul.f64 fd1138, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1136, fd238, fd326; +mul.f64 fd1137, fd239, fd328; +sub.f64 fd334, fd1136, fd1137; +mul.f64 fd1135, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1134, fd238, fd334; +sub.f64 fd342, fd1134, fd341; +mul.f64 fd1133, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1131, fd238, fd342; +mul.f64 fd1132, fd239, fd344; +sub.f64 fd350, fd1131, fd1132; +mul.f64 fd1130, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1129, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +sub.f64 fd1227, fd1201, fd1183; +mul.f64 fd1226, fd297, fd1227; +mov.u32 r48, %tid.x; +shl.b32 r47, r48, 8; +barrier.sync 0; +and.b32 r11, r47, 65280; +add.s32 r12, r9, r11; +mov.u32 r32, %tid.x; +shl.b32 r31, r32, 4; +sub.f64 fd1231, fd1201, fd1183; +mul.f64 fd1230, fd297, fd1231; +add.f64 fd356, fd1201, fd1183; +mov.u32 r40, %tid.x; 
+shl.b32 r39, r40, 8; +and.b32 r23, r39, 65280; +add.s32 r22, r9, r23; +sub.f64 fd1225, fd106, fd163; +add.f64 fd357, fd106, fd163; +mov.u32 r38, %tid.x; +shl.b32 r37, r38, 8; +and.b32 r30, r37, 65280; +add.s32 r29, r9, r30; +st.shared.v2.f64 [r29], {fd357, fd356}; +mov.u32 r28, %tid.x; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd1130, fd243; +st.shared.v2.f64 [r29+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd1159, fd251; +st.shared.v2.f64 [r29+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd1157, fd259; +st.shared.v2.f64 [r29+48], {fd363, fd362}; +sub.f64 fd364, fd1155, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r29+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd1152, fd275; +st.shared.v2.f64 [r29+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd1150, fd283; +st.shared.v2.f64 [r29+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd1148, fd291; +st.shared.v2.f64 [r29+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd1225, fd300; +sub.f64 fd373, fd298, fd1230; +st.shared.v2.f64 [r29+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd1143, fd306; +st.shared.v2.f64 [r29+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd1141, fd314; +st.shared.v2.f64 [r29+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd1138, fd322; +st.shared.v2.f64 [r29+176], {fd379, fd378}; +sub.f64 fd380, fd1135, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r29+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd1133, fd338; +st.shared.v2.f64 [r29+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd1129, fd346; +st.shared.v2.f64 [r29+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 
[r29+240], {fd387, fd386}; +barrier.sync 0; +and.b32 r20, r28, 255; +mad.lo.s32 r13, r20, -240, r29; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+4096]; +ld.shared.v2.f64 {fd396, fd397}, [r13+8192]; +ld.shared.v2.f64 {fd400, fd401}, [r13+12288]; +ld.shared.v2.f64 {fd404, fd405}, [r13+16384]; +ld.shared.v2.f64 {fd408, fd409}, [r13+20480]; +ld.shared.v2.f64 {fd412, fd413}, [r13+24576]; +ld.shared.v2.f64 {fd416, fd417}, [r13+28672]; +ld.shared.v2.f64 {fd420, fd421}, [r13+32768]; +ld.shared.v2.f64 {fd424, fd425}, [r13+36864]; +ld.shared.v2.f64 {fd428, fd429}, [r13+40960]; +ld.shared.v2.f64 {fd432, fd433}, [r13+45056]; +ld.shared.v2.f64 {fd436, fd437}, [r13+49152]; +ld.shared.v2.f64 {fd440, fd441}, [r13+53248]; +ld.shared.v2.f64 {fd444, fd445}, [r13+57344]; +ld.shared.v2.f64 {fd448, fd449}, [r13+61440]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1128, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1127, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd1126, fd1128, fd1127; +sub.f64 fd463, fd1128, fd1127; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd1125, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1124, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1123, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1122, fd1124, fd1123; +sub.f64 fd479, fd1124, fd1123; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd1121, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd1121, 0dBFE6A09E667F3BCD; +mul.f64 fd1120, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd1120, fd485; +mul.f64 fd487, fd1121, 0d3FE6A09E667F3BCD; +fma.rn.f64 
fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1119, fd1126, fd1122; +sub.f64 fd496, fd1126, fd1122; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1118, fd1125, fd488; +sub.f64 fd500, fd1125, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd1117, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd1116, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1115, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1114, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1113, fd1115, fd1114; +sub.f64 fd520, fd1115, fd1114; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd1112, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1111, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1110, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1109, fd1111, fd1110; +sub.f64 fd536, fd1111, fd1110; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd1108, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd1108, 0dBFE6A09E667F3BCD; +mul.f64 fd1107, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd1107, fd542; +mul.f64 fd544, fd1108, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; +sub.f64 fd548, fd546, fd547; +add.f64 
fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1106, fd1113, fd1109; +sub.f64 fd553, fd1113, fd1109; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1105, fd1112, fd545; +sub.f64 fd557, fd1112, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, fd519, fd536; +sub.f64 fd1104, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd1103, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1101, fd554, 0d3FED906BCF328D46; +mul.f64 fd1102, fd1105, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd1101, fd1102; +mul.f64 fd569, fd1105, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd1104, 0dBFE6A09E667F3BCD; +mul.f64 fd1100, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd1100, fd572; +mul.f64 fd574, fd1104, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd1103, 0dBFED906BCF328D46; +mul.f64 fd1099, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd1099, fd577; +mul.f64 fd579, fd1103, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd1098, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd1098, fd582; +mul.f64 fd584, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; +mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd1097, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1097, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1096, fd1118, fd570; +sub.f64 fd600, fd1118, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1095, fd1117, fd575; +sub.f64 fd604, fd1117, fd575; +add.f64 fd605, 
fd505, fd578; +sub.f64 fd607, fd505, fd578; +add.f64 fd1094, fd1116, fd580; +sub.f64 fd608, fd1116, fd580; +add.f64 fd609, fd495, fd553; +sub.f64 fd611, fd495, fd553; +sub.f64 fd1093, fd496, fd552; +add.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd583; +sub.f64 fd615, fd499, fd583; +add.f64 fd1092, fd500, fd585; +sub.f64 fd616, fd500, fd585; +add.f64 fd617, fd503, fd588; +sub.f64 fd619, fd503, fd588; +add.f64 fd1091, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1090, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r28, 240; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd630, fd626, fd1096; +mul.f64 fd631, fd625, fd1096; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1089, fd625, fd625; +sub.f64 fd634, fd1089, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd638, fd636, fd1095; +mul.f64 fd639, fd634, fd1095; +mul.f64 fd1087, fd625, fd634; +mul.f64 fd1088, fd626, fd636; +sub.f64 fd642, fd1087, fd1088; +mul.f64 fd1086, fd634, fd601; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd646, fd644, fd1094; +mul.f64 fd647, fd642, fd1094; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1085, fd625, fd642; +sub.f64 fd650, fd1085, fd649; +mul.f64 fd1084, fd642, fd605; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd654, fd652, fd1093; +mul.f64 fd655, fd650, fd1093; +mul.f64 fd1082, fd625, fd650; +mul.f64 fd1083, fd626, fd652; +sub.f64 fd658, fd1082, fd1083; +mul.f64 fd1081, fd650, fd609; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd662, fd660, fd1092; +mul.f64 fd663, fd658, fd1092; +mul.f64 fd1079, fd625, fd658; +mul.f64 fd1080, fd626, fd660; +sub.f64 fd666, fd1079, fd1080; +mul.f64 fd1078, fd658, fd613; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd670, fd668, 
fd1091; +mul.f64 fd671, fd666, fd1091; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1077, fd625, fd666; +sub.f64 fd674, fd1077, fd673; +mul.f64 fd1076, fd666, fd617; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd678, fd676, fd1090; +mul.f64 fd679, fd674, fd1090; +mul.f64 fd1074, fd625, fd674; +mul.f64 fd1075, fd626, fd676; +sub.f64 fd682, fd1074, fd1075; +mul.f64 fd1073, fd674, fd621; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd1072, fd1119, fd1106; +sub.f64 fd1071, fd493, fd550; +mul.f64 fd685, fd682, fd1071; +mul.f64 fd686, fd684, fd1072; +mul.f64 fd687, fd682, fd1072; +ld.global.v2.f64 {fd688, fd689}, [rd8+256]; +mul.f64 fd693, fd689, fd600; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd1070, fd625, fd688; +sub.f64 fd697, fd1070, fd696; +mul.f64 fd1069, fd688, fd599; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd701, fd699, fd604; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd1068, fd625, fd697; +sub.f64 fd705, fd1068, fd704; +mul.f64 fd1067, fd697, fd603; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd709, fd707, fd608; +mul.f64 fd710, fd705, fd608; +mul.f64 fd1065, fd625, fd705; +mul.f64 fd1066, fd626, fd707; +sub.f64 fd713, fd1065, fd1066; +mul.f64 fd1064, fd705, fd607; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd717, fd715, fd612; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd1063, fd625, fd713; +sub.f64 fd721, fd1063, fd720; +mul.f64 fd1062, fd713, fd611; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd725, fd723, fd616; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd1061, fd625, fd721; +sub.f64 fd729, fd1061, fd728; +mul.f64 fd1060, fd721, fd615; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd733, fd731, fd620; +mul.f64 
fd734, fd729, fd620; +mul.f64 fd1058, fd625, fd729; +mul.f64 fd1059, fd626, fd731; +sub.f64 fd737, fd1058, fd1059; +mul.f64 fd1057, fd625, fd597; +mul.f64 fd738, fd625, fd731; +mul.f64 fd1056, fd729, fd619; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd737, fd623; +mul.f64 fd741, fd739, fd624; +mul.f64 fd742, fd737, fd624; +sub.f64 fd1219, fd1119, fd1106; +mul.f64 fd1218, fd684, fd1219; +mov.u32 r36, %tid.x; +shl.b32 r35, r36, 8; +mov.u32 r46, %tid.x; +shl.b32 r45, r46, 4; +and.b32 r15, r45, 240; +add.s32 r16, r9, r15; +sub.f64 fd1223, fd1119, fd1106; +mul.f64 fd1222, fd684, fd1223; +mov.u32 r44, %tid.x; +shl.b32 r43, r44, 8; +barrier.sync 0; +and.b32 r17, r43, 61440; +add.s32 r18, r16, r17; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 240; +sub.f64 fd1229, fd1119, fd1106; +mul.f64 fd1228, fd684, fd1229; +add.f64 fd743, fd1119, fd1106; +sub.f64 fd1224, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r34, %tid.x; +and.b32 r33, r34, 240; +mov.u32 r42, %tid.x; +and.b32 r41, r42, 240; +fma.rn.f64 fd745, fd626, fd597, fd631; +sub.f64 fd746, fd1057, fd630; +st.shared.v2.f64 [r18+256], {fd746, fd745}; +fma.rn.f64 fd747, fd636, fd601, fd639; +sub.f64 fd748, fd1086, fd638; +st.shared.v2.f64 [r18+512], {fd748, fd747}; +fma.rn.f64 fd749, fd644, fd605, fd647; +sub.f64 fd750, fd1084, fd646; +st.shared.v2.f64 [r18+768], {fd750, fd749}; +fma.rn.f64 fd751, fd652, fd609, fd655; +sub.f64 fd752, fd1081, fd654; +st.shared.v2.f64 [r18+1024], {fd752, fd751}; +sub.f64 fd753, fd1078, fd662; +fma.rn.f64 fd754, fd660, fd613, fd663; +st.shared.v2.f64 [r18+1280], {fd753, fd754}; +fma.rn.f64 fd755, fd668, fd617, fd671; +sub.f64 fd756, fd1076, fd670; +st.shared.v2.f64 [r18+1536], {fd756, fd755}; +fma.rn.f64 fd757, fd676, fd621, fd679; +sub.f64 fd758, fd1073, fd678; +st.shared.v2.f64 [r18+1792], {fd758, fd757}; +fma.rn.f64 fd759, fd684, fd1224, fd687; +sub.f64 fd760, fd685, fd1228; +st.shared.v2.f64 [r18+2048], {fd760, fd759}; +fma.rn.f64 
fd761, fd689, fd599, fd694; +sub.f64 fd762, fd1069, fd693; +st.shared.v2.f64 [r18+2304], {fd762, fd761}; +fma.rn.f64 fd763, fd699, fd603, fd702; +sub.f64 fd764, fd1067, fd701; +st.shared.v2.f64 [r18+2560], {fd764, fd763}; +fma.rn.f64 fd765, fd707, fd607, fd710; +sub.f64 fd766, fd1064, fd709; +st.shared.v2.f64 [r18+2816], {fd766, fd765}; +fma.rn.f64 fd767, fd715, fd611, fd718; +sub.f64 fd768, fd1062, fd717; +st.shared.v2.f64 [r18+3072], {fd768, fd767}; +sub.f64 fd769, fd1060, fd725; +fma.rn.f64 fd770, fd723, fd615, fd726; +st.shared.v2.f64 [r18+3328], {fd769, fd770}; +fma.rn.f64 fd771, fd731, fd619, fd734; +sub.f64 fd772, fd1056, fd733; +st.shared.v2.f64 [r18+3584], {fd772, fd771}; +fma.rn.f64 fd773, fd739, fd623, fd742; +sub.f64 fd774, fd740, fd741; +st.shared.v2.f64 [r18+3840], {fd774, fd773}; +barrier.sync 0; +mad.lo.s32 r19, r41, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+4096]; +ld.shared.v2.f64 {fd783, fd784}, [r19+8192]; +ld.shared.v2.f64 {fd787, fd788}, [r19+12288]; +ld.shared.v2.f64 {fd791, fd792}, [r19+16384]; +ld.shared.v2.f64 {fd795, fd796}, [r19+20480]; +ld.shared.v2.f64 {fd799, fd800}, [r19+24576]; +ld.shared.v2.f64 {fd803, fd804}, [r19+28672]; +ld.shared.v2.f64 {fd807, fd808}, [r19+32768]; +ld.shared.v2.f64 {fd811, fd812}, [r19+36864]; +ld.shared.v2.f64 {fd815, fd816}, [r19+40960]; +ld.shared.v2.f64 {fd819, fd820}, [r19+45056]; +ld.shared.v2.f64 {fd823, fd824}, [r19+49152]; +ld.shared.v2.f64 {fd827, fd828}, [r19+53248]; +ld.shared.v2.f64 {fd831, fd832}, [r19+57344]; +ld.shared.v2.f64 {fd835, fd836}, [r19+61440]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd1055, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd1054, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd1053, fd1055, fd1054; +sub.f64 fd850, fd1055, fd1054; +add.f64 fd851, fd841, fd846; 
+sub.f64 fd853, fd841, fd846; +sub.f64 fd1052, fd842, fd845; +add.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd1051, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd1050, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd1049, fd1051, fd1050; +sub.f64 fd866, fd1051, fd1050; +add.f64 fd867, fd857, fd862; +sub.f64 fd869, fd857, fd862; +sub.f64 fd1048, fd858, fd861; +add.f64 fd870, fd858, fd861; +mul.f64 fd872, fd1048, 0dBFE6A09E667F3BCD; +mul.f64 fd1047, fd867, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd1047, fd872; +mul.f64 fd874, fd1048, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd875, fd867, 0dBFE6A09E667F3BCD, fd874; +mul.f64 fd876, fd869, 0dBFE6A09E667F3BCD; +mul.f64 fd877, fd870, 0dBFE6A09E667F3BCD; +sub.f64 fd878, fd876, fd877; +add.f64 fd879, fd876, fd877; +add.f64 fd880, fd847, fd863; +sub.f64 fd882, fd847, fd863; +add.f64 fd1046, fd1053, fd1049; +sub.f64 fd883, fd1053, fd1049; +add.f64 fd884, fd851, fd873; +sub.f64 fd886, fd851, fd873; +add.f64 fd1045, fd1052, fd875; +sub.f64 fd887, fd1052, fd875; +add.f64 fd888, fd849, fd866; +sub.f64 fd890, fd849, fd866; +sub.f64 fd1044, fd850, fd865; +add.f64 fd891, fd850, fd865; +add.f64 fd892, fd853, fd878; +sub.f64 fd894, fd853, fd878; +add.f64 fd1043, fd854, fd879; +sub.f64 fd895, fd854, fd879; +add.f64 fd896, fd779, fd811; +sub.f64 fd898, fd779, fd811; +add.f64 fd1042, fd780, fd812; +sub.f64 fd899, fd780, fd812; +add.f64 fd900, fd795, fd827; +sub.f64 fd902, fd795, fd827; +add.f64 fd1041, fd796, fd828; +sub.f64 fd903, fd796, fd828; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd1040, fd1042, fd1041; +sub.f64 fd907, fd1042, fd1041; +add.f64 fd908, fd898, fd903; +sub.f64 fd910, fd898, fd903; +sub.f64 fd1039, fd899, fd902; +add.f64 fd911, fd899, fd902; +add.f64 fd912, fd787, fd819; +sub.f64 fd914, fd787, fd819; +add.f64 fd1038, 
fd788, fd820; +sub.f64 fd915, fd788, fd820; +add.f64 fd916, fd803, fd835; +sub.f64 fd918, fd803, fd835; +add.f64 fd1037, fd804, fd836; +sub.f64 fd919, fd804, fd836; +add.f64 fd920, fd912, fd916; +sub.f64 fd922, fd912, fd916; +add.f64 fd1036, fd1038, fd1037; +sub.f64 fd923, fd1038, fd1037; +add.f64 fd924, fd914, fd919; +sub.f64 fd926, fd914, fd919; +sub.f64 fd1035, fd915, fd918; +add.f64 fd927, fd915, fd918; +mul.f64 fd929, fd1035, 0dBFE6A09E667F3BCD; +mul.f64 fd1034, fd924, 0d3FE6A09E667F3BCD; +sub.f64 fd930, fd1034, fd929; +mul.f64 fd931, fd1035, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd932, fd924, 0dBFE6A09E667F3BCD, fd931; +mul.f64 fd933, fd926, 0dBFE6A09E667F3BCD; +mul.f64 fd934, fd927, 0dBFE6A09E667F3BCD; +sub.f64 fd935, fd933, fd934; +add.f64 fd936, fd933, fd934; +add.f64 fd937, fd904, fd920; +sub.f64 fd939, fd904, fd920; +add.f64 fd1033, fd1040, fd1036; +sub.f64 fd940, fd1040, fd1036; +add.f64 fd941, fd908, fd930; +sub.f64 fd943, fd908, fd930; +add.f64 fd1032, fd1039, fd932; +sub.f64 fd944, fd1039, fd932; +add.f64 fd945, fd906, fd923; +sub.f64 fd947, fd906, fd923; +sub.f64 fd1031, fd907, fd922; +add.f64 fd948, fd907, fd922; +add.f64 fd949, fd910, fd935; +sub.f64 fd951, fd910, fd935; +add.f64 fd1030, fd911, fd936; +sub.f64 fd952, fd911, fd936; +mul.f64 fd954, fd1032, 0dBFD87DE2A6AEA963; +mul.f64 fd1029, fd941, 0d3FED906BCF328D46; +sub.f64 fd955, fd1029, fd954; +mul.f64 fd956, fd1032, 0d3FED906BCF328D46; +fma.rn.f64 fd957, fd941, 0dBFD87DE2A6AEA963, fd956; +mul.f64 fd959, fd1031, 0dBFE6A09E667F3BCD; +mul.f64 fd1028, fd945, 0d3FE6A09E667F3BCD; +sub.f64 fd960, fd1028, fd959; +mul.f64 fd961, fd1031, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd962, fd945, 0dBFE6A09E667F3BCD, fd961; +mul.f64 fd964, fd1030, 0dBFED906BCF328D46; +mul.f64 fd1027, fd949, 0d3FD87DE2A6AEA963; +sub.f64 fd965, fd1027, fd964; +mul.f64 fd966, fd1030, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd967, fd949, 0dBFED906BCF328D46, fd966; +mul.f64 fd969, fd944, 0dBFED906BCF328D46; +mul.f64 fd1026, fd943, 0dBFD87DE2A6AEA963; 
+sub.f64 fd970, fd1026, fd969; +mul.f64 fd971, fd944, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd972, fd943, 0dBFED906BCF328D46, fd971; +mul.f64 fd973, fd947, 0dBFE6A09E667F3BCD; +mul.f64 fd974, fd948, 0dBFE6A09E667F3BCD; +sub.f64 fd975, fd973, fd974; +add.f64 fd976, fd973, fd974; +mul.f64 fd978, fd952, 0dBFD87DE2A6AEA963; +mul.f64 fd1025, fd951, 0dBFED906BCF328D46; +sub.f64 fd979, fd1025, fd978; +mul.f64 fd980, fd952, 0dBFED906BCF328D46; +fma.rn.f64 fd981, fd951, 0dBFD87DE2A6AEA963, fd980; +add.f64 %0, fd880, fd937; +add.f64 %1, fd1046, fd1033; +add.f64 %2, fd884, fd955; +add.f64 %3, fd1045, fd957; +add.f64 %5, fd1044, fd962; +add.f64 %4, fd888, fd960; +add.f64 %7, fd1043, fd967; +add.f64 %6, fd892, fd965; +sub.f64 %9, fd883, fd939; +add.f64 %8, fd882, fd940; +add.f64 %10, fd886, fd970; +add.f64 %11, fd887, fd972; +add.f64 %12, fd890, fd975; +add.f64 %13, fd891, fd976; +add.f64 %14, fd894, fd979; +add.f64 %15, fd895, fd981; +sub.f64 %17, fd1046, fd1033; +sub.f64 %16, fd880, fd937; +sub.f64 %19, fd1045, fd957; +sub.f64 %18, fd884, fd955; +sub.f64 %21, fd1044, fd962; +sub.f64 %20, fd888, fd960; +sub.f64 %23, fd1043, fd967; +sub.f64 %22, fd892, fd965; +add.f64 %25, fd883, fd939; +sub.f64 %24, fd882, fd940; +sub.f64 %27, fd887, fd972; +sub.f64 %26, fd886, fd970; +sub.f64 %29, fd891, fd976; +sub.f64 %28, fd890, fd975; +sub.f64 %31, fd895, fd981; +sub.f64 %30, fd894, fd979; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_4096), "l"(lut_dp_16_256), 
"d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<495, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<561>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; 
+fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+8192]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; 
+mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 65408; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+8192]; +ld.shared.v2.f64 {fd166, fd167}, [r13+16384]; +ld.shared.v2.f64 {fd170, fd171}, [r13+24576]; +ld.shared.v2.f64 {fd174, fd175}, [r13+32768]; +ld.shared.v2.f64 {fd178, fd179}, [r13+40960]; +ld.shared.v2.f64 {fd182, fd183}, [r13+49152]; +ld.shared.v2.f64 {fd186, fd187}, [r13+57344]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, 
fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 504; +bfe.u32 r15, r5, 3, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, 
fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+1024]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 64512; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; 
+mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+8192]; +ld.shared.v2.f64 {fd323, fd324}, [r20+16384]; +ld.shared.v2.f64 {fd327, fd328}, [r20+24576]; +ld.shared.v2.f64 {fd331, fd332}, [r20+32768]; +ld.shared.v2.f64 {fd335, fd336}, [r20+40960]; +ld.shared.v2.f64 {fd339, fd340}, [r20+49152]; +ld.shared.v2.f64 {fd343, fd344}, [r20+57344]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +add.f64 fd359, fd349, fd354; +sub.f64 fd360, fd350, fd353; +sub.f64 fd361, fd349, fd354; +add.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +add.f64 fd375, fd365, fd370; +sub.f64 fd376, fd366, fd369; +sub.f64 fd377, fd365, fd370; +add.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0dBFE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd383, fd375, 0dBFE6A09E667F3BCD, fd382; +mul.f64 fd384, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd385, fd378, 0dBFE6A09E667F3BCD; +sub.f64 fd386, fd384, fd385; +add.f64 fd387, fd384, fd385; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd383; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd383; +add.f64 fd394, fd357, fd374; +sub.f64 fd395, fd358, fd373; +sub.f64 fd396, fd357, fd374; +add.f64 
fd397, fd358, fd373; +add.f64 fd398, fd361, fd386; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd386; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 448; +bfe.u32 r22, r5, 6, 3; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd402, fd390; +mul.f64 fd407, fd403, fd391; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd411, fd394; +mul.f64 fd415, fd413, fd395; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd419, fd398; +mul.f64 fd423, fd421, fd399; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd427, fd388; +mul.f64 fd431, fd429, fd389; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+128]; +mul.f64 fd437, fd433, fd392; +mul.f64 fd438, fd434, fd393; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd442, fd396; +mul.f64 fd446, fd444, fd397; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd450, fd400; +mul.f64 fd454, fd452, fd401; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 57344; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; 
+fma.rn.f64 fd458, fd403, fd390, fd408; +sub.f64 fd459, fd406, fd407; +st.shared.v2.f64 [r26+1024], {fd459, fd458}; +fma.rn.f64 fd460, fd413, fd394, fd416; +sub.f64 fd461, fd414, fd415; +st.shared.v2.f64 [r26+2048], {fd461, fd460}; +fma.rn.f64 fd462, fd421, fd398, fd424; +sub.f64 fd463, fd422, fd423; +st.shared.v2.f64 [r26+3072], {fd463, fd462}; +sub.f64 fd464, fd430, fd431; +fma.rn.f64 fd465, fd429, fd388, fd432; +st.shared.v2.f64 [r26+4096], {fd464, fd465}; +fma.rn.f64 fd466, fd434, fd392, fd439; +sub.f64 fd467, fd437, fd438; +st.shared.v2.f64 [r26+5120], {fd467, fd466}; +fma.rn.f64 fd468, fd444, fd396, fd447; +sub.f64 fd469, fd445, fd446; +st.shared.v2.f64 [r26+6144], {fd469, fd468}; +fma.rn.f64 fd470, fd452, fd400, fd455; +sub.f64 fd471, fd453, fd454; +st.shared.v2.f64 [r26+7168], {fd471, fd470}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+8192]; +ld.shared.v2.f64 {fd480, fd481}, [r27+16384]; +ld.shared.v2.f64 {fd484, fd485}, [r27+24576]; +ld.shared.v2.f64 {fd488, fd489}, [r27+32768]; +ld.shared.v2.f64 {fd492, fd493}, [r27+40960]; +ld.shared.v2.f64 {fd496, fd497}, [r27+49152]; +ld.shared.v2.f64 {fd500, fd501}, [r27+57344]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; +add.f64 fd512, fd504, fd508; +add.f64 fd513, fd505, fd509; +sub.f64 fd514, fd504, fd508; +sub.f64 fd515, fd505, fd509; +add.f64 fd516, fd506, fd511; +sub.f64 fd517, fd507, fd510; +sub.f64 fd518, fd506, fd511; +add.f64 fd519, fd507, fd510; +add.f64 fd520, fd476, fd492; +add.f64 fd521, fd477, fd493; +sub.f64 fd522, fd476, fd492; +sub.f64 fd523, fd477, fd493; +add.f64 fd524, fd484, fd500; +add.f64 fd525, fd485, fd501; +sub.f64 fd526, fd484, fd500; +sub.f64 fd527, fd485, fd501; +add.f64 fd528, fd520, fd524; +add.f64 fd529, 
fd521, fd525; +sub.f64 fd530, fd520, fd524; +sub.f64 fd531, fd521, fd525; +add.f64 fd532, fd522, fd527; +sub.f64 fd533, fd523, fd526; +sub.f64 fd534, fd522, fd527; +add.f64 fd535, fd523, fd526; +mul.f64 fd536, fd532, 0d3FE6A09E667F3BCD; +mul.f64 fd537, fd533, 0dBFE6A09E667F3BCD; +sub.f64 fd538, fd536, fd537; +mul.f64 fd539, fd533, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd540, fd532, 0dBFE6A09E667F3BCD, fd539; +mul.f64 fd541, fd534, 0dBFE6A09E667F3BCD; +mul.f64 fd542, fd535, 0dBFE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +add.f64 %1, fd513, fd529; +add.f64 %0, fd512, fd528; +add.f64 %3, fd517, fd540; +add.f64 %2, fd516, fd538; +sub.f64 %5, fd515, fd530; +add.f64 %4, fd514, fd531; +add.f64 %7, fd519, fd544; +add.f64 %6, fd518, fd543; +sub.f64 %9, fd513, fd529; +sub.f64 %8, fd512, fd528; +sub.f64 %11, fd517, fd540; +sub.f64 %10, fd516, fd538; +add.f64 %13, fd515, fd530; +sub.f64 %12, fd514, fd531; +sub.f64 %15, fd519, fd544; +sub.f64 %14, fd518, fd543; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_4096), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<493, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<950>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, 
%35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, 
fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 
fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, 
fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; 
+sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+4096]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; 
+mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32640; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+2048]; +ld.shared.f64 fd390, [r13+4096]; +ld.shared.f64 fd391, [r13+6144]; +ld.shared.f64 fd392, [r13+8192]; +ld.shared.f64 fd393, [r13+10240]; +ld.shared.f64 fd394, [r13+12288]; +ld.shared.f64 fd395, [r13+14336]; +ld.shared.f64 fd396, [r13+16384]; +ld.shared.f64 fd397, [r13+18432]; +ld.shared.f64 fd398, [r13+20480]; +ld.shared.f64 fd399, [r13+22528]; +ld.shared.f64 fd400, [r13+24576]; +ld.shared.f64 fd401, [r13+26624]; +ld.shared.f64 fd402, [r13+28672]; +ld.shared.f64 fd403, [r13+30720]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+2048]; +ld.shared.f64 fd406, [r13+4096]; +ld.shared.f64 fd407, [r13+6144]; +ld.shared.f64 fd408, [r13+8192]; +ld.shared.f64 fd409, [r13+10240]; +ld.shared.f64 fd410, [r13+12288]; +ld.shared.f64 fd411, [r13+14336]; +ld.shared.f64 fd412, 
[r13+16384]; +ld.shared.f64 fd413, [r13+18432]; +ld.shared.f64 fd414, [r13+20480]; +ld.shared.f64 fd415, [r13+22528]; +ld.shared.f64 fd416, [r13+24576]; +ld.shared.f64 fd417, [r13+26624]; +ld.shared.f64 fd418, [r13+28672]; +ld.shared.f64 fd419, [r13+30720]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; 
+sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 
0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; 
+sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 240; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, 
fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+256]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, 
fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 30720; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+2048]; +ld.shared.f64 fd745, [r20+4096]; +ld.shared.f64 fd746, [r20+6144]; +ld.shared.f64 fd747, [r20+8192]; +ld.shared.f64 fd748, [r20+10240]; +ld.shared.f64 fd749, [r20+12288]; +ld.shared.f64 fd750, [r20+14336]; +ld.shared.f64 fd751, [r20+16384]; +ld.shared.f64 fd752, [r20+18432]; +ld.shared.f64 fd753, [r20+20480]; +ld.shared.f64 fd754, [r20+22528]; +ld.shared.f64 fd755, [r20+24576]; +ld.shared.f64 fd756, [r20+26624]; +ld.shared.f64 fd757, [r20+28672]; +ld.shared.f64 fd758, [r20+30720]; +barrier.sync 0; +st.shared.f64 [r19], 
fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+2048]; +ld.shared.f64 fd761, [r20+4096]; +ld.shared.f64 fd762, [r20+6144]; +ld.shared.f64 fd763, [r20+8192]; +ld.shared.f64 fd764, [r20+10240]; +ld.shared.f64 fd765, [r20+12288]; +ld.shared.f64 fd766, [r20+14336]; +ld.shared.f64 fd767, [r20+16384]; +ld.shared.f64 fd768, [r20+18432]; +ld.shared.f64 fd769, [r20+20480]; +ld.shared.f64 fd770, [r20+22528]; +ld.shared.f64 fd771, [r20+24576]; +ld.shared.f64 fd772, [r20+26624]; +ld.shared.f64 fd773, [r20+28672]; +ld.shared.f64 fd774, [r20+30720]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +add.f64 fd787, fd777, fd782; +sub.f64 fd788, fd778, fd781; +sub.f64 fd789, fd777, fd782; +add.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +add.f64 fd803, fd793, fd798; +sub.f64 fd804, fd794, fd797; +sub.f64 fd805, fd793, 
fd798; +add.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0dBFE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +mul.f64 fd810, fd804, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd811, fd803, 0dBFE6A09E667F3BCD, fd810; +mul.f64 fd812, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd813, fd806, 0dBFE6A09E667F3BCD; +sub.f64 fd814, fd812, fd813; +add.f64 fd815, fd812, fd813; +add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd811; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd811; +add.f64 fd824, fd785, fd802; +sub.f64 fd825, fd786, fd801; +sub.f64 fd826, fd785, fd802; +add.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd814; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd814; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; +sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +add.f64 fd844, fd834, fd839; +sub.f64 fd845, fd835, fd838; +sub.f64 fd846, fd834, fd839; +add.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +add.f64 fd860, fd850, fd855; +sub.f64 fd861, fd851, fd854; +sub.f64 fd862, fd850, fd855; +add.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0dBFE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +mul.f64 fd867, fd861, 0d3FE6A09E667F3BCD; 
+fma.rn.f64 fd868, fd860, 0dBFE6A09E667F3BCD, fd867; +mul.f64 fd869, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd870, fd863, 0dBFE6A09E667F3BCD; +sub.f64 fd871, fd869, fd870; +add.f64 fd872, fd869, fd870; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, fd868; +sub.f64 fd879, fd844, fd866; +sub.f64 fd880, fd845, fd868; +add.f64 fd881, fd842, fd859; +sub.f64 fd882, fd843, fd858; +sub.f64 fd883, fd842, fd859; +add.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd871; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd871; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0dBFD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0dBFD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 0dBFE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +mul.f64 fd897, fd882, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd898, fd881, 0dBFE6A09E667F3BCD, fd897; +mul.f64 fd899, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd900, fd886, 0dBFED906BCF328D46; +sub.f64 fd901, fd899, fd900; +mul.f64 fd902, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd903, fd885, 0dBFED906BCF328D46, fd902; +mul.f64 fd904, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd905, fd880, 0dBFED906BCF328D46; +sub.f64 fd906, fd904, fd905; +mul.f64 fd907, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd908, fd879, 0dBFED906BCF328D46, fd907; +mul.f64 fd909, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd910, fd884, 0dBFE6A09E667F3BCD; +sub.f64 fd911, fd909, fd910; +add.f64 fd912, fd909, fd910; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0dBFD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0dBFD87DE2A6AEA963, fd916; +add.f64 %0, fd816, fd873; +add.f64 %1, fd817, fd874; +add.f64 %3, fd821, fd893; +add.f64 %2, fd820, fd891; 
+add.f64 %5, fd825, fd898; +add.f64 %4, fd824, fd896; +add.f64 %7, fd829, fd903; +add.f64 %6, fd828, fd901; +sub.f64 %9, fd819, fd875; +add.f64 %8, fd818, fd876; +add.f64 %11, fd823, fd908; +add.f64 %10, fd822, fd906; +add.f64 %13, fd827, fd912; +add.f64 %12, fd826, fd911; +add.f64 %15, fd831, fd917; +add.f64 %14, fd830, fd915; +sub.f64 %16, fd816, fd873; +sub.f64 %17, fd817, fd874; +sub.f64 %19, fd821, fd893; +sub.f64 %18, fd820, fd891; +sub.f64 %21, fd825, fd898; +sub.f64 %20, fd824, fd896; +sub.f64 %23, fd829, fd903; +sub.f64 %22, fd828, fd901; +add.f64 %25, fd819, fd875; +sub.f64 %24, fd818, fd876; +sub.f64 %27, fd823, fd908; +sub.f64 %26, fd822, fd906; +sub.f64 %29, fd827, fd912; +sub.f64 %28, fd826, fd911; +sub.f64 %31, fd831, fd917; +sub.f64 %30, fd830, fd915; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_4096), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), 
"d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<497, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<41>; +.reg .f64 fd<333>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+16384]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 65472; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, 
[r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+16384]; +ld.shared.v2.f64 {fd69, fd70}, [r13+32768]; +ld.shared.v2.f64 {fd73, fd74}, [r13+49152]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+4096]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 65280; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+16384]; +ld.shared.v2.f64 {fd129, fd130}, [r20+32768]; +ld.shared.v2.f64 {fd133, fd134}, [r20+49152]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; 
+sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 1008; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+1024]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 64512; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+16384]; +ld.shared.v2.f64 {fd189, fd190}, [r26+32768]; +ld.shared.v2.f64 {fd193, fd194}, [r26+49152]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, 
fd201; +sub.f64 fd206, fd198, fd202; +add.f64 fd207, fd199, fd204; +sub.f64 fd208, fd200, fd203; +sub.f64 fd209, fd199, fd204; +add.f64 fd210, fd200, fd203; +and.b32 r27, r5, 960; +bfe.u32 r28, r5, 6, 4; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd211, fd207; +mul.f64 fd216, fd212, fd208; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd220, fd205; +mul.f64 fd224, fd222, fd206; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+256]; +mul.f64 fd230, fd226, fd209; +mul.f64 fd231, fd227, fd210; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 61440; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd212, fd207, fd217; +sub.f64 fd236, fd215, fd216; +st.shared.v2.f64 [r32+1024], {fd236, fd235}; +fma.rn.f64 fd237, fd222, fd205, fd225; +sub.f64 fd238, fd223, fd224; +st.shared.v2.f64 [r32+2048], {fd238, fd237}; +fma.rn.f64 fd239, fd227, fd209, fd232; +sub.f64 fd240, fd230, fd231; +st.shared.v2.f64 [r32+3072], {fd240, fd239}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+16384]; +ld.shared.v2.f64 {fd249, fd250}, [r33+32768]; +ld.shared.v2.f64 {fd253, fd254}, [r33+49152]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +sub.f64 fd265, fd257, fd261; +sub.f64 fd266, fd258, fd262; +add.f64 fd267, fd259, fd264; +sub.f64 fd268, fd260, fd263; +sub.f64 fd269, fd259, fd264; +add.f64 fd270, 
fd260, fd263; +and.b32 r34, r5, 768; +bfe.u32 r35, r5, 8, 2; +mul.wide.u32 rd15, r35, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd271, fd272}, [rd17]; +mul.f64 fd275, fd271, fd267; +mul.f64 fd276, fd272, fd268; +mul.f64 fd277, fd271, fd268; +mul.f64 fd278, fd271, fd271; +mul.f64 fd279, fd272, fd272; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd272, fd271; +fma.rn.f64 fd282, fd272, fd271, fd281; +mul.f64 fd283, fd280, fd265; +mul.f64 fd284, fd282, fd266; +mul.f64 fd285, fd280, fd266; +ld.global.v2.f64 {fd286, fd287}, [rd17+64]; +mul.f64 fd290, fd286, fd269; +mul.f64 fd291, fd287, fd270; +mul.f64 fd292, fd286, fd270; +and.b32 r36, r10, 4080; +add.s32 r37, r9, r36; +barrier.sync 0; +and.b32 r38, r7, 49152; +add.s32 r39, r37, r38; +add.f64 fd293, fd258, fd262; +add.f64 fd294, fd257, fd261; +st.shared.v2.f64 [r39], {fd294, fd293}; +fma.rn.f64 fd295, fd272, fd267, fd277; +sub.f64 fd296, fd275, fd276; +st.shared.v2.f64 [r39+4096], {fd296, fd295}; +fma.rn.f64 fd297, fd282, fd265, fd285; +sub.f64 fd298, fd283, fd284; +st.shared.v2.f64 [r39+8192], {fd298, fd297}; +fma.rn.f64 fd299, fd287, fd269, fd292; +sub.f64 fd300, fd290, fd291; +st.shared.v2.f64 [r39+12288], {fd300, fd299}; +barrier.sync 0; +mad.lo.s32 r40, r34, -48, r39; +ld.shared.v2.f64 {fd301, fd302}, [r40]; +ld.shared.v2.f64 {fd305, fd306}, [r40+16384]; +ld.shared.v2.f64 {fd309, fd310}, [r40+32768]; +ld.shared.v2.f64 {fd313, fd314}, [r40+49152]; +add.f64 fd317, fd301, fd309; +add.f64 fd318, fd302, fd310; +sub.f64 fd319, fd301, fd309; +sub.f64 fd320, fd302, fd310; +add.f64 fd321, fd305, fd313; +add.f64 fd322, fd306, fd314; +sub.f64 fd323, fd305, fd313; +sub.f64 fd324, fd306, fd314; +add.f64 %1, fd318, fd322; +add.f64 %0, fd317, fd321; +sub.f64 %3, fd320, fd323; +add.f64 %2, fd319, fd324; +sub.f64 %5, fd318, fd322; +sub.f64 %4, fd317, fd321; +add.f64 %7, fd320, fd323; +sub.f64 %6, fd319, fd324; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), 
"=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_4096), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<496, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<293>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+16384]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32736; +add.s32 r12, r10, r11; 
+st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+8192]; +ld.shared.f64 fd63, [r13+16384]; +ld.shared.f64 fd64, [r13+24576]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+8192]; +ld.shared.f64 fd67, [r13+16384]; +ld.shared.f64 fd68, [r13+24576]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+4096]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 32640; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; 
+barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+8192]; +ld.shared.f64 fd115, [r21+16384]; +ld.shared.f64 fd116, [r21+24576]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+8192]; +ld.shared.f64 fd119, [r21+16384]; +ld.shared.f64 fd120, [r21+24576]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 1008; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+1024]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 32256; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 
[r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+8192]; +ld.shared.f64 fd167, [r27+16384]; +ld.shared.f64 fd168, [r27+24576]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+8192]; +ld.shared.f64 fd171, [r27+16384]; +ld.shared.f64 fd172, [r27+24576]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +add.f64 fd185, fd175, fd180; +sub.f64 fd186, fd176, fd179; +sub.f64 fd187, fd175, fd180; +add.f64 fd188, fd176, fd179; +and.b32 r28, r5, 960; +bfe.u32 r29, r5, 6, 4; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd189, fd185; +mul.f64 fd194, fd190, fd186; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd186; +fma.rn.f64 fd197, fd190, fd185, fd196; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd200, fd183; +mul.f64 fd204, fd202, fd184; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd184; +fma.rn.f64 fd207, fd202, fd183, fd206; +ld.global.v2.f64 {fd208, fd209}, [rd14+256]; +mul.f64 fd212, fd208, fd187; +mul.f64 fd213, fd209, fd188; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd208, fd188; +fma.rn.f64 fd216, fd209, fd187, fd215; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 30720; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], 
fd195; +st.shared.f64 [r33+1024], fd205; +st.shared.f64 [r33+1536], fd214; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+8192]; +ld.shared.f64 fd219, [r34+16384]; +ld.shared.f64 fd220, [r34+24576]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+8192]; +ld.shared.f64 fd223, [r34+16384]; +ld.shared.f64 fd224, [r34+24576]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; +add.f64 fd233, fd225, fd229; +add.f64 fd234, fd226, fd230; +sub.f64 fd235, fd225, fd229; +sub.f64 fd236, fd226, fd230; +add.f64 fd237, fd227, fd232; +sub.f64 fd238, fd228, fd231; +sub.f64 fd239, fd227, fd232; +add.f64 fd240, fd228, fd231; +and.b32 r35, r5, 768; +bfe.u32 r36, r5, 8, 2; +mul.wide.u32 rd15, r36, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd241, fd242}, [rd17]; +mul.f64 fd245, fd241, fd237; +mul.f64 fd246, fd242, fd238; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd241, fd238; +fma.rn.f64 fd249, fd242, fd237, fd248; +mul.f64 fd250, fd241, fd241; +mul.f64 fd251, fd242, fd242; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd242, fd241; +fma.rn.f64 fd254, fd242, fd241, fd253; +mul.f64 fd255, fd252, fd235; +mul.f64 fd256, fd254, fd236; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd236; +fma.rn.f64 fd259, fd254, fd235, fd258; +ld.global.v2.f64 {fd260, fd261}, [rd17+64]; +mul.f64 fd264, fd260, fd239; +mul.f64 fd265, fd261, fd240; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd260, fd240; +fma.rn.f64 fd268, fd261, fd239, fd267; +and.b32 r37, r16, 2040; +add.s32 r38, r10, r37; +barrier.sync 0; +and.b32 r39, r8, 24576; +add.s32 r40, r38, r39; 
+st.shared.f64 [r40], fd233; +st.shared.f64 [r40+2048], fd247; +st.shared.f64 [r40+4096], fd257; +st.shared.f64 [r40+6144], fd266; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.f64 fd269, [r41]; +ld.shared.f64 fd270, [r41+8192]; +ld.shared.f64 fd271, [r41+16384]; +ld.shared.f64 fd272, [r41+24576]; +barrier.sync 0; +st.shared.f64 [r40], fd234; +st.shared.f64 [r40+2048], fd249; +st.shared.f64 [r40+4096], fd259; +st.shared.f64 [r40+6144], fd268; +barrier.sync 0; +ld.shared.f64 fd273, [r41]; +ld.shared.f64 fd274, [r41+8192]; +ld.shared.f64 fd275, [r41+16384]; +ld.shared.f64 fd276, [r41+24576]; +add.f64 fd277, fd269, fd271; +add.f64 fd278, fd273, fd275; +sub.f64 fd279, fd269, fd271; +sub.f64 fd280, fd273, fd275; +add.f64 fd281, fd270, fd272; +add.f64 fd282, fd274, fd276; +sub.f64 fd283, fd270, fd272; +sub.f64 fd284, fd274, fd276; +add.f64 %0, fd277, fd281; +add.f64 %1, fd278, fd282; +sub.f64 %3, fd280, fd283; +add.f64 %2, fd279, fd284; +sub.f64 %4, fd277, fd281; +sub.f64 %5, fd278, fd282; +add.f64 %7, fd280, fd283; +sub.f64 %6, fd279, fd284; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_4096), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..34199094f81db --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4096_fp64_inv.hpp.inc @@ -0,0 +1,3687 @@ +#ifndef 
CUFFTDX_FFT_4096_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_4096_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<663, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<29>; +.reg .f64 fd<513>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; 
+and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+8192]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, 
fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32704; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+4096]; +ld.shared.f64 fd160, [r13+8192]; +ld.shared.f64 fd161, [r13+12288]; +ld.shared.f64 fd162, [r13+16384]; +ld.shared.f64 fd163, [r13+20480]; +ld.shared.f64 fd164, [r13+24576]; +ld.shared.f64 fd165, [r13+28672]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+4096]; +ld.shared.f64 fd168, [r13+8192]; +ld.shared.f64 fd169, [r13+12288]; +ld.shared.f64 fd170, [r13+16384]; +ld.shared.f64 fd171, [r13+20480]; +ld.shared.f64 fd172, [r13+24576]; +ld.shared.f64 fd173, [r13+28672]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, 
fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 504; +bfe.u32 r15, r5, 3, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 
fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+1024]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 32256; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+4096]; +ld.shared.f64 fd301, [r21+8192]; +ld.shared.f64 fd302, [r21+12288]; +ld.shared.f64 fd303, [r21+16384]; +ld.shared.f64 fd304, [r21+20480]; +ld.shared.f64 fd305, [r21+24576]; +ld.shared.f64 fd306, [r21+28672]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; 
+st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+4096]; +ld.shared.f64 fd309, [r21+8192]; +ld.shared.f64 fd310, [r21+12288]; +ld.shared.f64 fd311, [r21+16384]; +ld.shared.f64 fd312, [r21+20480]; +ld.shared.f64 fd313, [r21+24576]; +ld.shared.f64 fd314, [r21+28672]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +sub.f64 fd327, fd317, fd322; +add.f64 fd328, fd318, fd321; +add.f64 fd329, fd317, fd322; +sub.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +sub.f64 fd343, fd333, fd338; +add.f64 fd344, fd334, fd337; +add.f64 fd345, fd333, fd338; +sub.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0d3FE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +add.f64 fd350, fd347, fd348; +mul.f64 fd351, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd352, fd346, 0d3FE6A09E667F3BCD; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd346, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd355, fd345, 0d3FE6A09E667F3BCD, fd354; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd350; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd350; +sub.f64 fd364, fd325, fd342; +add.f64 fd365, fd326, 
fd341; +add.f64 fd366, fd325, fd342; +sub.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd353; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd353; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 448; +bfe.u32 r23, r5, 6, 3; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd361, fd373; +fma.rn.f64 fd377, fd372, fd360, fd376; +mul.f64 fd378, fd360, fd373; +mul.f64 fd379, fd372, fd361; +sub.f64 fd380, fd379, fd378; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd365, fd385; +fma.rn.f64 fd387, fd383, fd364, fd386; +mul.f64 fd388, fd364, fd385; +mul.f64 fd389, fd383, fd365; +sub.f64 fd390, fd389, fd388; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd369, fd395; +fma.rn.f64 fd397, fd393, fd368, fd396; +mul.f64 fd398, fd368, fd395; +mul.f64 fd399, fd393, fd369; +sub.f64 fd400, fd399, fd398; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd359, fd405; +fma.rn.f64 fd407, fd403, fd358, fd406; +mul.f64 fd408, fd358, fd405; +mul.f64 fd409, fd403, fd359; +sub.f64 fd410, fd409, fd408; +ld.global.v2.f64 {fd411, fd412}, [rd11+128]; +mul.f64 fd415, fd363, fd412; +fma.rn.f64 fd416, fd411, fd362, fd415; +mul.f64 fd417, fd362, fd412; +mul.f64 fd418, fd411, fd363; +sub.f64 fd419, fd418, fd417; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd367, fd424; +fma.rn.f64 fd426, fd422, fd366, fd425; +mul.f64 fd427, fd366, fd424; +mul.f64 fd428, fd422, fd367; +sub.f64 fd429, fd428, fd427; 
+mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd371, fd434; +fma.rn.f64 fd436, fd432, fd370, fd435; +mul.f64 fd437, fd370, fd434; +mul.f64 fd438, fd432, fd371; +sub.f64 fd439, fd438, fd437; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 28672; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd377; +st.shared.f64 [r27+1024], fd387; +st.shared.f64 [r27+1536], fd397; +st.shared.f64 [r27+2048], fd407; +st.shared.f64 [r27+2560], fd416; +st.shared.f64 [r27+3072], fd426; +st.shared.f64 [r27+3584], fd436; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+4096]; +ld.shared.f64 fd442, [r28+8192]; +ld.shared.f64 fd443, [r28+12288]; +ld.shared.f64 fd444, [r28+16384]; +ld.shared.f64 fd445, [r28+20480]; +ld.shared.f64 fd446, [r28+24576]; +ld.shared.f64 fd447, [r28+28672]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+4096]; +ld.shared.f64 fd450, [r28+8192]; +ld.shared.f64 fd451, [r28+12288]; +ld.shared.f64 fd452, [r28+16384]; +ld.shared.f64 fd453, [r28+20480]; +ld.shared.f64 fd454, [r28+24576]; +ld.shared.f64 fd455, [r28+28672]; +add.f64 fd456, fd440, fd444; +add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd456, fd460; +add.f64 fd465, fd457, fd461; +sub.f64 fd466, fd456, fd460; +sub.f64 fd467, fd457, fd461; +sub.f64 fd468, fd458, fd463; +add.f64 fd469, 
fd459, fd462; +add.f64 fd470, fd458, fd463; +sub.f64 fd471, fd459, fd462; +add.f64 fd472, fd441, fd445; +add.f64 fd473, fd449, fd453; +sub.f64 fd474, fd441, fd445; +sub.f64 fd475, fd449, fd453; +add.f64 fd476, fd443, fd447; +add.f64 fd477, fd451, fd455; +sub.f64 fd478, fd443, fd447; +sub.f64 fd479, fd451, fd455; +add.f64 fd480, fd472, fd476; +add.f64 fd481, fd473, fd477; +sub.f64 fd482, fd472, fd476; +sub.f64 fd483, fd473, fd477; +sub.f64 fd484, fd474, fd479; +add.f64 fd485, fd475, fd478; +add.f64 fd486, fd474, fd479; +sub.f64 fd487, fd475, fd478; +mul.f64 fd488, fd484, 0d3FE6A09E667F3BCD; +mul.f64 fd489, fd485, 0d3FE6A09E667F3BCD; +sub.f64 fd490, fd488, fd489; +add.f64 fd491, fd488, fd489; +mul.f64 fd492, fd486, 0dBFE6A09E667F3BCD; +mul.f64 fd493, fd487, 0d3FE6A09E667F3BCD; +sub.f64 fd494, fd492, fd493; +mul.f64 fd495, fd487, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd496, fd486, 0d3FE6A09E667F3BCD, fd495; +add.f64 %0, fd464, fd480; +add.f64 %1, fd465, fd481; +add.f64 %3, fd469, fd491; +add.f64 %2, fd468, fd490; +add.f64 %5, fd467, fd482; +sub.f64 %4, fd466, fd483; +add.f64 %7, fd471, fd496; +add.f64 %6, fd470, fd494; +sub.f64 %8, fd464, fd480; +sub.f64 %9, fd465, fd481; +sub.f64 %11, fd469, fd491; +sub.f64 %10, fd468, fd490; +sub.f64 %13, fd467, fd482; +add.f64 %12, fd466, fd483; +sub.f64 %15, fd471, fd496; +sub.f64 %14, fd470, fd494; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_4096), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), 
"d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +/* Explicit specialization cufftdx_private_function<665, double, 1>: machine-generated inline PTX (NOTE(review): presumably emitted by the cuFFTDx code generator - do not hand-edit). It updates the 16 complex doubles rmem[0..15] in place: every .x/.y component is both an input and an output operand of the asm statement. The body runs three passes of double-precision butterfly arithmetic on the 16 values; between passes the values are multiplied by factors derived from entries loaded from the global tables lut_dp_16_4096 (before the first exchange) and lut_dp_16_256 (before the second exchange) and are then written to and re-read from shared memory at different strides. smem is used as a 32-bit shared-memory base address, offset by (%tid.y << 16) and by bits of %tid.x. The asm executes barrier.sync 0 four times, so NOTE(review): it presumably has to be reached by every thread participating in barrier 0 - confirm with callers. The literals 0d3FE6A09E667F3BCD, 0d3FED906BCF328D46 and 0d3FD87DE2A6AEA963 look like sqrt(2)/2, cos(pi/8) and sin(pi/8), and the 16x16x16 structure suggests a 4096-point FFT step - TODO confirm. */ template<> __forceinline__ __device__ void cufftdx_private_function<665, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<44>; +.reg .f64 fd<1226>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1219, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1217, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1216, fd1219, fd1217; +sub.f64 fd76, fd1219, fd1217; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd1215, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1212, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1210, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1209, fd1212, fd1210; +sub.f64 fd92, fd1212, fd1210; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd1208, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd1208, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd1206, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd1207, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd1206, fd1207; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1205, fd1216, fd1209; +sub.f64 fd109, fd1216, fd1209; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1204, fd1215, fd100; +sub.f64 fd113, fd1215, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd1203, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd1202, fd80, fd105; 
+sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1200, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1197, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1196, fd1200, fd1197; +sub.f64 fd133, fd1200, fd1197; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd1195, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1193, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1191, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1190, fd1193, fd1191; +sub.f64 fd149, fd1193, fd1191; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd1189, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd1189, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd1187, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd1188, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd1187, fd1188; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1186, fd1196, fd1190; +sub.f64 fd166, fd1196, fd1190; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1185, fd1195, fd157; +sub.f64 fd170, fd1195, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd1184, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd1183, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1181, fd167, 0d3FED906BCF328D46; +mul.f64 fd1182, fd1185, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd1181, fd1182; +mul.f64 fd182, fd1185, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 
0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd1184, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd1183, 0d3FED906BCF328D46; +mul.f64 fd1180, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd1180, fd189; +mul.f64 fd191, fd1183, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd1179, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd1179, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd1177, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd1178, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd1177, fd1178; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd1175, fd177, 0dBFED906BCF328D46; +mul.f64 fd1176, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd1175, fd1176; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1174, fd1204, fd183; +sub.f64 fd213, fd1204, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1173, fd1203, fd187; +sub.f64 fd217, fd1203, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd1172, fd1202, fd192; +sub.f64 fd221, fd1202, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd1171, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd1170, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd1169, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1168, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; 
+cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd1174, fd239; +mul.f64 fd244, fd238, fd1174; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1167, fd238, fd238; +sub.f64 fd247, fd1167, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd1173, fd249; +mul.f64 fd252, fd247, fd1173; +mul.f64 fd1165, fd238, fd247; +mul.f64 fd1166, fd239, fd249; +sub.f64 fd255, fd1165, fd1166; +mul.f64 fd1164, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd1172, fd257; +mul.f64 fd260, fd255, fd1172; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1163, fd238, fd255; +sub.f64 fd263, fd1163, fd262; +mul.f64 fd1162, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd1171, fd265; +mul.f64 fd268, fd263, fd1171; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1161, fd238, fd263; +sub.f64 fd271, fd1161, fd270; +mul.f64 fd1160, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd1170, fd273; +mul.f64 fd276, fd271, fd1170; +mul.f64 fd1158, fd238, fd271; +mul.f64 fd1159, fd239, fd273; +sub.f64 fd279, fd1158, fd1159; +mul.f64 fd1157, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd1169, fd281; +mul.f64 fd284, fd279, fd1169; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1156, fd238, fd279; +sub.f64 fd287, fd1156, fd286; +mul.f64 fd1155, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd1168, fd289; +mul.f64 fd292, fd287, fd1168; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1154, fd238, fd287; +sub.f64 fd295, fd1154, fd294; +mul.f64 fd1153, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1152, fd1205, fd1186; +mul.f64 fd298, fd1152, fd297; +sub.f64 fd1151, fd106, fd163; +mul.f64 
fd299, fd1151, fd297; +mul.f64 fd300, fd295, fd1152; +ld.global.v2.f64 {fd301, fd302}, [rd5+4096]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1149, fd238, fd301; +mul.f64 fd1150, fd239, fd302; +sub.f64 fd310, fd1149, fd1150; +mul.f64 fd1148, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1147, fd238, fd310; +sub.f64 fd318, fd1147, fd317; +mul.f64 fd1146, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1144, fd238, fd318; +mul.f64 fd1145, fd239, fd320; +sub.f64 fd326, fd1144, fd1145; +mul.f64 fd1143, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1141, fd238, fd326; +mul.f64 fd1142, fd239, fd328; +sub.f64 fd334, fd1141, fd1142; +mul.f64 fd1140, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1139, fd238, fd334; +sub.f64 fd342, fd1139, fd341; +mul.f64 fd1138, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1136, fd238, fd342; +mul.f64 fd1137, fd239, fd344; +sub.f64 fd350, fd1136, fd1137; +mul.f64 fd1135, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1134, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 65280; +add.s32 r12, r9, r11; +sub.f64 fd1224, fd1205, fd1186; +mul.f64 fd1223, fd295, fd1224; +add.f64 fd356, fd1205, fd1186; +mov.u32 r40, %tid.x; +shl.b32 r32, r40, 8; +and.b32 r23, r32, 65280; +add.s32 r22, r9, 
r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r43, %tid.x; +shl.b32 r38, r43, 4; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd1134; +st.shared.v2.f64 [r22+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd1164; +st.shared.v2.f64 [r22+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd1162; +st.shared.v2.f64 [r22+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd1160; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r22+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd1157; +st.shared.v2.f64 [r22+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd1155; +st.shared.v2.f64 [r22+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd1153; +st.shared.v2.f64 [r22+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd1151, fd298; +sub.f64 fd373, fd1223, fd299; +st.shared.v2.f64 [r22+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd1148; +st.shared.v2.f64 [r22+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd1146; +st.shared.v2.f64 [r22+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd1143; +st.shared.v2.f64 [r22+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd1140; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r22+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd1138; +st.shared.v2.f64 [r22+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd1135; +st.shared.v2.f64 [r22+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r22+240], {fd386, fd387}; +barrier.sync 0; +and.b32 r20, r43, 255; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, 
fd393}, [r13+4096]; +ld.shared.v2.f64 {fd396, fd397}, [r13+8192]; +ld.shared.v2.f64 {fd400, fd401}, [r13+12288]; +ld.shared.v2.f64 {fd404, fd405}, [r13+16384]; +ld.shared.v2.f64 {fd408, fd409}, [r13+20480]; +ld.shared.v2.f64 {fd412, fd413}, [r13+24576]; +ld.shared.v2.f64 {fd416, fd417}, [r13+28672]; +ld.shared.v2.f64 {fd420, fd421}, [r13+32768]; +ld.shared.v2.f64 {fd424, fd425}, [r13+36864]; +ld.shared.v2.f64 {fd428, fd429}, [r13+40960]; +ld.shared.v2.f64 {fd432, fd433}, [r13+45056]; +ld.shared.v2.f64 {fd436, fd437}, [r13+49152]; +ld.shared.v2.f64 {fd440, fd441}, [r13+53248]; +ld.shared.v2.f64 {fd444, fd445}, [r13+57344]; +ld.shared.v2.f64 {fd448, fd449}, [r13+61440]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1133, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1132, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd1131, fd1133, fd1132; +sub.f64 fd463, fd1133, fd1132; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd1130, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1129, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1128, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1127, fd1129, fd1128; +sub.f64 fd479, fd1129, fd1128; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd1126, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd1126, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd1125, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd1125, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 
0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1124, fd1131, fd1127; +sub.f64 fd496, fd1131, fd1127; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1123, fd1130, fd487; +sub.f64 fd500, fd1130, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd1122, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd1121, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1120, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1119, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1118, fd1120, fd1119; +sub.f64 fd520, fd1120, fd1119; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd1117, fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1116, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1115, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1114, fd1116, fd1115; +sub.f64 fd536, fd1116, fd1115; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd1113, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd1113, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd1112, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd1112, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1111, fd1118, fd1114; +sub.f64 fd553, fd1118, fd1114; +add.f64 fd554, 
fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1110, fd1117, fd544; +sub.f64 fd557, fd1117, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd1109, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd1108, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1106, fd554, 0d3FED906BCF328D46; +mul.f64 fd1107, fd1110, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd1106, fd1107; +mul.f64 fd569, fd1110, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; +mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd1109, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd1104, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd1105, fd1108, 0d3FED906BCF328D46; +sub.f64 fd577, fd1104, fd1105; +mul.f64 fd578, fd1108, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd1102, fd556, 0dBFD87DE2A6AEA963; +mul.f64 fd1103, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd1102, fd1103; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd1100, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd1101, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd1100, fd1101; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd1099, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1099, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1098, fd1123, fd570; +sub.f64 fd600, fd1123, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1097, fd1122, fd574; +sub.f64 fd604, fd1122, fd574; +add.f64 fd605, fd505, fd577; +sub.f64 fd607, fd505, fd577; +add.f64 fd1096, fd1121, fd579; +sub.f64 fd608, fd1121, fd579; +sub.f64 fd609, fd495, fd553; +add.f64 fd611, fd495, 
fd553; +add.f64 fd1095, fd496, fd552; +sub.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd582; +sub.f64 fd615, fd499, fd582; +add.f64 fd1094, fd500, fd584; +sub.f64 fd616, fd500, fd584; +add.f64 fd617, fd503, fd587; +sub.f64 fd619, fd503, fd587; +add.f64 fd1093, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1092, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r43, 240; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd629, fd1098, fd626; +mul.f64 fd631, fd625, fd1098; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1091, fd625, fd625; +sub.f64 fd634, fd1091, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd637, fd1097, fd636; +mul.f64 fd639, fd634, fd1097; +mul.f64 fd1089, fd625, fd634; +mul.f64 fd1090, fd626, fd636; +sub.f64 fd642, fd1089, fd1090; +mul.f64 fd1088, fd601, fd636; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd645, fd1096, fd644; +mul.f64 fd647, fd642, fd1096; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1087, fd625, fd642; +sub.f64 fd650, fd1087, fd649; +mul.f64 fd1086, fd605, fd644; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd653, fd1095, fd652; +mul.f64 fd655, fd650, fd1095; +mul.f64 fd1084, fd625, fd650; +mul.f64 fd1085, fd626, fd652; +sub.f64 fd658, fd1084, fd1085; +mul.f64 fd1083, fd609, fd652; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd661, fd1094, fd660; +mul.f64 fd663, fd658, fd1094; +mul.f64 fd1081, fd625, fd658; +mul.f64 fd1082, fd626, fd660; +sub.f64 fd666, fd1081, fd1082; +mul.f64 fd1080, fd613, fd660; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd669, fd1093, fd668; +mul.f64 fd671, fd666, fd1093; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1079, fd625, fd666; +sub.f64 fd674, fd1079, fd673; +mul.f64 fd1078, fd617, fd668; 
+mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd677, fd1092, fd676; +mul.f64 fd679, fd674, fd1092; +mul.f64 fd1076, fd625, fd674; +mul.f64 fd1077, fd626, fd676; +sub.f64 fd682, fd1076, fd1077; +mul.f64 fd1075, fd621, fd676; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd1074, fd1124, fd1111; +mul.f64 fd685, fd1074, fd684; +sub.f64 fd1073, fd493, fd550; +mul.f64 fd686, fd1073, fd684; +mul.f64 fd687, fd682, fd1074; +ld.global.v2.f64 {fd688, fd689}, [rd8+256]; +mul.f64 fd692, fd600, fd689; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd1072, fd625, fd688; +sub.f64 fd697, fd1072, fd696; +mul.f64 fd1071, fd599, fd689; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd700, fd604, fd699; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd1070, fd625, fd697; +sub.f64 fd705, fd1070, fd704; +mul.f64 fd1069, fd603, fd699; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd708, fd608, fd707; +mul.f64 fd710, fd705, fd608; +mul.f64 fd1067, fd625, fd705; +mul.f64 fd1068, fd626, fd707; +sub.f64 fd713, fd1067, fd1068; +mul.f64 fd1066, fd607, fd707; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd716, fd612, fd715; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd1065, fd625, fd713; +sub.f64 fd721, fd1065, fd720; +mul.f64 fd1064, fd611, fd715; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd724, fd616, fd723; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd1063, fd625, fd721; +sub.f64 fd729, fd1063, fd728; +mul.f64 fd1062, fd615, fd723; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd732, fd620, fd731; +mul.f64 fd734, fd729, fd620; +mul.f64 fd1060, fd625, fd729; +mul.f64 fd1061, fd626, fd731; +sub.f64 fd737, fd1060, fd1061; +mul.f64 fd1059, fd619, fd731; +mul.f64 fd738, 
fd625, fd731; +mul.f64 fd1058, fd597, fd626; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd624, fd739; +mul.f64 fd741, fd623, fd739; +mul.f64 fd742, fd737, fd624; +and.b32 r15, r38, 240; +add.s32 r16, r9, r15; +mov.u32 r26, %tid.x; +shl.b32 r25, r26, 8; +barrier.sync 0; +and.b32 r17, r25, 61440; +add.s32 r18, r16, r17; +mov.u32 r30, %tid.x; +and.b32 r29, r30, 240; +sub.f64 fd1222, fd1124, fd1111; +mul.f64 fd1221, fd682, fd1222; +add.f64 fd743, fd1124, fd1111; +mov.u32 r35, %tid.x; +and.b32 r34, r35, 240; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r37, %tid.x; +and.b32 r36, r37, 240; +fma.rn.f64 fd745, fd625, fd597, fd629; +sub.f64 fd746, fd631, fd1058; +st.shared.v2.f64 [r18+256], {fd745, fd746}; +fma.rn.f64 fd747, fd634, fd601, fd637; +sub.f64 fd748, fd639, fd1088; +st.shared.v2.f64 [r18+512], {fd747, fd748}; +fma.rn.f64 fd749, fd642, fd605, fd645; +sub.f64 fd750, fd647, fd1086; +st.shared.v2.f64 [r18+768], {fd749, fd750}; +fma.rn.f64 fd751, fd650, fd609, fd653; +sub.f64 fd752, fd655, fd1083; +st.shared.v2.f64 [r18+1024], {fd751, fd752}; +sub.f64 fd753, fd663, fd1080; +fma.rn.f64 fd754, fd658, fd613, fd661; +st.shared.v2.f64 [r18+1280], {fd754, fd753}; +fma.rn.f64 fd755, fd666, fd617, fd669; +sub.f64 fd756, fd671, fd1078; +st.shared.v2.f64 [r18+1536], {fd755, fd756}; +fma.rn.f64 fd757, fd674, fd621, fd677; +sub.f64 fd758, fd679, fd1075; +st.shared.v2.f64 [r18+1792], {fd757, fd758}; +fma.rn.f64 fd759, fd682, fd1073, fd685; +sub.f64 fd760, fd1221, fd686; +st.shared.v2.f64 [r18+2048], {fd759, fd760}; +fma.rn.f64 fd761, fd688, fd599, fd692; +sub.f64 fd762, fd694, fd1071; +st.shared.v2.f64 [r18+2304], {fd761, fd762}; +fma.rn.f64 fd763, fd697, fd603, fd700; +sub.f64 fd764, fd702, fd1069; +st.shared.v2.f64 [r18+2560], {fd763, fd764}; +fma.rn.f64 fd765, fd705, fd607, fd708; +sub.f64 fd766, fd710, fd1066; +st.shared.v2.f64 [r18+2816], {fd765, fd766}; +fma.rn.f64 fd767, fd713, fd611, fd716; +sub.f64 fd768, fd718, fd1064; 
+st.shared.v2.f64 [r18+3072], {fd767, fd768}; +sub.f64 fd769, fd726, fd1062; +fma.rn.f64 fd770, fd721, fd615, fd724; +st.shared.v2.f64 [r18+3328], {fd770, fd769}; +fma.rn.f64 fd771, fd729, fd619, fd732; +sub.f64 fd772, fd734, fd1059; +st.shared.v2.f64 [r18+3584], {fd771, fd772}; +fma.rn.f64 fd773, fd737, fd623, fd740; +sub.f64 fd774, fd742, fd741; +st.shared.v2.f64 [r18+3840], {fd773, fd774}; +barrier.sync 0; +mad.lo.s32 r19, r36, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+4096]; +ld.shared.v2.f64 {fd783, fd784}, [r19+8192]; +ld.shared.v2.f64 {fd787, fd788}, [r19+12288]; +ld.shared.v2.f64 {fd791, fd792}, [r19+16384]; +ld.shared.v2.f64 {fd795, fd796}, [r19+20480]; +ld.shared.v2.f64 {fd799, fd800}, [r19+24576]; +ld.shared.v2.f64 {fd803, fd804}, [r19+28672]; +ld.shared.v2.f64 {fd807, fd808}, [r19+32768]; +ld.shared.v2.f64 {fd811, fd812}, [r19+36864]; +ld.shared.v2.f64 {fd815, fd816}, [r19+40960]; +ld.shared.v2.f64 {fd819, fd820}, [r19+45056]; +ld.shared.v2.f64 {fd823, fd824}, [r19+49152]; +ld.shared.v2.f64 {fd827, fd828}, [r19+53248]; +ld.shared.v2.f64 {fd831, fd832}, [r19+57344]; +ld.shared.v2.f64 {fd835, fd836}, [r19+61440]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd1057, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd1056, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd1055, fd1057, fd1056; +sub.f64 fd850, fd1057, fd1056; +sub.f64 fd851, fd841, fd846; +add.f64 fd853, fd841, fd846; +add.f64 fd1054, fd842, fd845; +sub.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd1053, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd1052, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd1051, 
fd1053, fd1052; +sub.f64 fd866, fd1053, fd1052; +sub.f64 fd867, fd857, fd862; +add.f64 fd869, fd857, fd862; +add.f64 fd1050, fd858, fd861; +sub.f64 fd870, fd858, fd861; +mul.f64 fd871, fd867, 0d3FE6A09E667F3BCD; +mul.f64 fd872, fd1050, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd871, fd872; +add.f64 fd874, fd871, fd872; +mul.f64 fd876, fd870, 0d3FE6A09E667F3BCD; +mul.f64 fd1049, fd869, 0dBFE6A09E667F3BCD; +sub.f64 fd877, fd1049, fd876; +mul.f64 fd878, fd870, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd879, fd869, 0d3FE6A09E667F3BCD, fd878; +add.f64 fd880, fd847, fd863; +sub.f64 fd882, fd847, fd863; +add.f64 fd1048, fd1055, fd1051; +sub.f64 fd883, fd1055, fd1051; +add.f64 fd884, fd851, fd873; +sub.f64 fd886, fd851, fd873; +add.f64 fd1047, fd1054, fd874; +sub.f64 fd887, fd1054, fd874; +sub.f64 fd888, fd849, fd866; +add.f64 fd890, fd849, fd866; +add.f64 fd1046, fd850, fd865; +sub.f64 fd891, fd850, fd865; +add.f64 fd892, fd853, fd877; +sub.f64 fd894, fd853, fd877; +add.f64 fd1045, fd854, fd879; +sub.f64 fd895, fd854, fd879; +add.f64 fd896, fd779, fd811; +sub.f64 fd898, fd779, fd811; +add.f64 fd1044, fd780, fd812; +sub.f64 fd899, fd780, fd812; +add.f64 fd900, fd795, fd827; +sub.f64 fd902, fd795, fd827; +add.f64 fd1043, fd796, fd828; +sub.f64 fd903, fd796, fd828; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd1042, fd1044, fd1043; +sub.f64 fd907, fd1044, fd1043; +sub.f64 fd908, fd898, fd903; +add.f64 fd910, fd898, fd903; +add.f64 fd1041, fd899, fd902; +sub.f64 fd911, fd899, fd902; +add.f64 fd912, fd787, fd819; +sub.f64 fd914, fd787, fd819; +add.f64 fd1040, fd788, fd820; +sub.f64 fd915, fd788, fd820; +add.f64 fd916, fd803, fd835; +sub.f64 fd918, fd803, fd835; +add.f64 fd1039, fd804, fd836; +sub.f64 fd919, fd804, fd836; +add.f64 fd920, fd912, fd916; +sub.f64 fd922, fd912, fd916; +add.f64 fd1038, fd1040, fd1039; +sub.f64 fd923, fd1040, fd1039; +sub.f64 fd924, fd914, fd919; +add.f64 fd926, fd914, fd919; +add.f64 fd1037, fd915, fd918; +sub.f64 fd927, fd915, fd918; 
+mul.f64 fd928, fd924, 0d3FE6A09E667F3BCD; +mul.f64 fd929, fd1037, 0d3FE6A09E667F3BCD; +sub.f64 fd930, fd928, fd929; +add.f64 fd931, fd928, fd929; +mul.f64 fd933, fd927, 0d3FE6A09E667F3BCD; +mul.f64 fd1036, fd926, 0dBFE6A09E667F3BCD; +sub.f64 fd934, fd1036, fd933; +mul.f64 fd935, fd927, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd936, fd926, 0d3FE6A09E667F3BCD, fd935; +add.f64 fd937, fd904, fd920; +sub.f64 fd939, fd904, fd920; +add.f64 fd1035, fd1042, fd1038; +sub.f64 fd940, fd1042, fd1038; +add.f64 fd941, fd908, fd930; +sub.f64 fd943, fd908, fd930; +add.f64 fd1034, fd1041, fd931; +sub.f64 fd944, fd1041, fd931; +sub.f64 fd945, fd906, fd923; +add.f64 fd947, fd906, fd923; +add.f64 fd1033, fd907, fd922; +sub.f64 fd948, fd907, fd922; +add.f64 fd949, fd910, fd934; +sub.f64 fd951, fd910, fd934; +add.f64 fd1032, fd911, fd936; +sub.f64 fd952, fd911, fd936; +mul.f64 fd954, fd1034, 0d3FD87DE2A6AEA963; +mul.f64 fd1031, fd941, 0d3FED906BCF328D46; +sub.f64 fd955, fd1031, fd954; +mul.f64 fd956, fd1034, 0d3FED906BCF328D46; +fma.rn.f64 fd957, fd941, 0d3FD87DE2A6AEA963, fd956; +mul.f64 fd958, fd945, 0d3FE6A09E667F3BCD; +mul.f64 fd959, fd1033, 0d3FE6A09E667F3BCD; +sub.f64 fd960, fd958, fd959; +add.f64 fd961, fd958, fd959; +mul.f64 fd1029, fd949, 0d3FD87DE2A6AEA963; +mul.f64 fd1030, fd1032, 0d3FED906BCF328D46; +sub.f64 fd964, fd1029, fd1030; +mul.f64 fd965, fd1032, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd966, fd949, 0d3FED906BCF328D46, fd965; +mul.f64 fd1027, fd943, 0dBFD87DE2A6AEA963; +mul.f64 fd1028, fd944, 0d3FED906BCF328D46; +sub.f64 fd969, fd1027, fd1028; +mul.f64 fd970, fd944, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd971, fd943, 0d3FED906BCF328D46, fd970; +mul.f64 fd973, fd948, 0d3FE6A09E667F3BCD; +mul.f64 fd1026, fd947, 0dBFE6A09E667F3BCD; +sub.f64 fd974, fd1026, fd973; +mul.f64 fd975, fd948, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd976, fd947, 0d3FE6A09E667F3BCD, fd975; +mul.f64 fd978, fd952, 0d3FD87DE2A6AEA963; +mul.f64 fd1025, fd951, 0dBFED906BCF328D46; +sub.f64 fd979, fd1025, fd978; +mul.f64 fd980, 
fd952, 0dBFED906BCF328D46; +fma.rn.f64 fd981, fd951, 0d3FD87DE2A6AEA963, fd980; +add.f64 %0, fd880, fd937; +add.f64 %1, fd1048, fd1035; +add.f64 %2, fd884, fd955; +add.f64 %3, fd1047, fd957; +add.f64 %5, fd1046, fd961; +add.f64 %4, fd888, fd960; +add.f64 %7, fd1045, fd966; +add.f64 %6, fd892, fd964; +add.f64 %9, fd883, fd939; +sub.f64 %8, fd882, fd940; +add.f64 %10, fd886, fd969; +add.f64 %11, fd887, fd971; +add.f64 %12, fd890, fd974; +add.f64 %13, fd891, fd976; +add.f64 %14, fd894, fd979; +add.f64 %15, fd895, fd981; +sub.f64 %17, fd1048, fd1035; +sub.f64 %16, fd880, fd937; +sub.f64 %19, fd1047, fd957; +sub.f64 %18, fd884, fd955; +sub.f64 %21, fd1046, fd961; +sub.f64 %20, fd888, fd960; +sub.f64 %23, fd1045, fd966; +sub.f64 %22, fd892, fd964; +sub.f64 %25, fd883, fd939; +add.f64 %24, fd882, fd940; +sub.f64 %27, fd887, fd971; +sub.f64 %26, fd886, fd969; +sub.f64 %29, fd891, fd976; +sub.f64 %28, fd890, fd974; +sub.f64 %31, fd895, fd981; +sub.f64 %30, fd894, fd979; +})" + : /* outputs %0-%31: rmem[0..15] as (.x, .y) pairs */ "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): /* inputs: %32 = smem, %33 = lut_dp_16_4096, %34 = lut_dp_16_256, %35-%66 = rmem[0..15] (.x, .y), %67-%76 = selected .y components passed a second time */ "r"(smem), "l"(lut_dp_16_4096), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), 
"d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<666, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<561>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %20, %30; +add.f64 fd34, %21, %32; +sub.f64 fd35, %20, %30; +sub.f64 fd36, %21, %32; +add.f64 fd37, %25, %36; +add.f64 fd38, %27, %37; +sub.f64 fd39, %25, %36; +sub.f64 fd40, %27, %37; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %22, %33; +add.f64 fd50, %24, %35; +sub.f64 fd51, %22, %33; +sub.f64 fd52, %24, %35; +add.f64 fd53, %28, %38; +add.f64 fd54, %29, %39; +sub.f64 fd55, %28, %38; +sub.f64 fd56, %29, %39; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; 
+add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+8192]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 65408; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, 
fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+8192]; +ld.shared.v2.f64 {fd166, fd167}, [r13+16384]; +ld.shared.v2.f64 {fd170, fd171}, [r13+24576]; +ld.shared.v2.f64 {fd174, fd175}, [r13+32768]; +ld.shared.v2.f64 {fd178, fd179}, [r13+40960]; +ld.shared.v2.f64 {fd182, fd183}, [r13+49152]; +ld.shared.v2.f64 {fd186, fd187}, [r13+57344]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 
fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 504; +bfe.u32 r15, r5, 3, 6; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+1024]; +mul.f64 fd280, fd236, fd277; 
+mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 64512; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+8192]; +ld.shared.v2.f64 {fd323, fd324}, [r20+16384]; +ld.shared.v2.f64 {fd327, fd328}, [r20+24576]; +ld.shared.v2.f64 {fd331, fd332}, [r20+32768]; +ld.shared.v2.f64 {fd335, fd336}, [r20+40960]; +ld.shared.v2.f64 {fd339, fd340}, [r20+49152]; +ld.shared.v2.f64 {fd343, fd344}, [r20+57344]; 
+add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +sub.f64 fd359, fd349, fd354; +add.f64 fd360, fd350, fd353; +add.f64 fd361, fd349, fd354; +sub.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +sub.f64 fd375, fd365, fd370; +add.f64 fd376, fd366, fd369; +add.f64 fd377, fd365, fd370; +sub.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0d3FE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +add.f64 fd382, fd379, fd380; +mul.f64 fd383, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd384, fd378, 0d3FE6A09E667F3BCD; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd378, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd387, fd377, 0d3FE6A09E667F3BCD, fd386; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd382; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd382; +sub.f64 fd394, fd357, fd374; +add.f64 fd395, fd358, fd373; +add.f64 fd396, fd357, fd374; +sub.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd385; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd385; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 448; +bfe.u32 r22, r5, 6, 3; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd391, fd403; +mul.f64 fd407, fd390, fd403; +mul.f64 fd408, fd402, fd391; +mul.f64 
fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd395, fd413; +mul.f64 fd415, fd394, fd413; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd399, fd421; +mul.f64 fd423, fd398, fd421; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd389, fd429; +mul.f64 fd431, fd388, fd429; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+128]; +mul.f64 fd437, fd393, fd434; +mul.f64 fd438, fd392, fd434; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd397, fd444; +mul.f64 fd446, fd396, fd444; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd401, fd452; +mul.f64 fd454, fd400, fd452; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 57344; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd402, fd390, fd406; +sub.f64 fd459, fd408, fd407; +st.shared.v2.f64 [r26+1024], {fd458, fd459}; +fma.rn.f64 fd460, fd411, fd394, fd414; +sub.f64 fd461, fd416, fd415; +st.shared.v2.f64 [r26+2048], {fd460, fd461}; +fma.rn.f64 fd462, fd419, fd398, fd422; +sub.f64 fd463, fd424, fd423; +st.shared.v2.f64 [r26+3072], {fd462, fd463}; +sub.f64 fd464, fd432, fd431; +fma.rn.f64 fd465, fd427, 
fd388, fd430; +st.shared.v2.f64 [r26+4096], {fd465, fd464}; +fma.rn.f64 fd466, fd433, fd392, fd437; +sub.f64 fd467, fd439, fd438; +st.shared.v2.f64 [r26+5120], {fd466, fd467}; +fma.rn.f64 fd468, fd442, fd396, fd445; +sub.f64 fd469, fd447, fd446; +st.shared.v2.f64 [r26+6144], {fd468, fd469}; +fma.rn.f64 fd470, fd450, fd400, fd453; +sub.f64 fd471, fd455, fd454; +st.shared.v2.f64 [r26+7168], {fd470, fd471}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+8192]; +ld.shared.v2.f64 {fd480, fd481}, [r27+16384]; +ld.shared.v2.f64 {fd484, fd485}, [r27+24576]; +ld.shared.v2.f64 {fd488, fd489}, [r27+32768]; +ld.shared.v2.f64 {fd492, fd493}, [r27+40960]; +ld.shared.v2.f64 {fd496, fd497}, [r27+49152]; +ld.shared.v2.f64 {fd500, fd501}, [r27+57344]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; +add.f64 fd512, fd504, fd508; +add.f64 fd513, fd505, fd509; +sub.f64 fd514, fd504, fd508; +sub.f64 fd515, fd505, fd509; +sub.f64 fd516, fd506, fd511; +add.f64 fd517, fd507, fd510; +add.f64 fd518, fd506, fd511; +sub.f64 fd519, fd507, fd510; +add.f64 fd520, fd476, fd492; +add.f64 fd521, fd477, fd493; +sub.f64 fd522, fd476, fd492; +sub.f64 fd523, fd477, fd493; +add.f64 fd524, fd484, fd500; +add.f64 fd525, fd485, fd501; +sub.f64 fd526, fd484, fd500; +sub.f64 fd527, fd485, fd501; +add.f64 fd528, fd520, fd524; +add.f64 fd529, fd521, fd525; +sub.f64 fd530, fd520, fd524; +sub.f64 fd531, fd521, fd525; +sub.f64 fd532, fd522, fd527; +add.f64 fd533, fd523, fd526; +add.f64 fd534, fd522, fd527; +sub.f64 fd535, fd523, fd526; +mul.f64 fd536, fd532, 0d3FE6A09E667F3BCD; +mul.f64 fd537, fd533, 0d3FE6A09E667F3BCD; +sub.f64 fd538, fd536, fd537; +add.f64 fd539, fd536, fd537; +mul.f64 fd540, fd534, 0dBFE6A09E667F3BCD; +mul.f64 fd541, fd535, 
0d3FE6A09E667F3BCD; +sub.f64 fd542, fd540, fd541; +mul.f64 fd543, fd535, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd544, fd534, 0d3FE6A09E667F3BCD, fd543; +add.f64 %1, fd513, fd529; +add.f64 %0, fd512, fd528; +add.f64 %3, fd517, fd539; +add.f64 %2, fd516, fd538; +add.f64 %5, fd515, fd530; +sub.f64 %4, fd514, fd531; +add.f64 %7, fd519, fd544; +add.f64 %6, fd518, fd542; +sub.f64 %9, fd513, fd529; +sub.f64 %8, fd512, fd528; +sub.f64 %11, fd517, fd539; +sub.f64 %10, fd516, fd538; +sub.f64 %13, fd515, fd530; +add.f64 %12, fd514, fd531; +sub.f64 %15, fd519, fd544; +sub.f64 %14, fd518, fd542; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_4096), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<664, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<950>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; 
+add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 
fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 
fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 255; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, 
fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+4096]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; 
+mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32640; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, 
fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+2048]; +ld.shared.f64 fd390, [r13+4096]; +ld.shared.f64 fd391, [r13+6144]; +ld.shared.f64 fd392, [r13+8192]; +ld.shared.f64 fd393, [r13+10240]; +ld.shared.f64 fd394, [r13+12288]; +ld.shared.f64 fd395, [r13+14336]; +ld.shared.f64 fd396, [r13+16384]; +ld.shared.f64 fd397, [r13+18432]; +ld.shared.f64 fd398, [r13+20480]; +ld.shared.f64 fd399, [r13+22528]; +ld.shared.f64 fd400, [r13+24576]; +ld.shared.f64 fd401, [r13+26624]; +ld.shared.f64 fd402, [r13+28672]; +ld.shared.f64 fd403, [r13+30720]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+2048]; +ld.shared.f64 fd406, [r13+4096]; +ld.shared.f64 fd407, [r13+6144]; +ld.shared.f64 fd408, [r13+8192]; +ld.shared.f64 fd409, [r13+10240]; +ld.shared.f64 fd410, [r13+12288]; +ld.shared.f64 fd411, [r13+14336]; +ld.shared.f64 fd412, [r13+16384]; +ld.shared.f64 fd413, [r13+18432]; +ld.shared.f64 fd414, [r13+20480]; +ld.shared.f64 fd415, [r13+22528]; +ld.shared.f64 fd416, [r13+24576]; +ld.shared.f64 fd417, [r13+26624]; +ld.shared.f64 fd418, [r13+28672]; +ld.shared.f64 fd419, [r13+30720]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; 
+add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, 
fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 
0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 240; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, 
fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; 
+mul.f64 fd669, fd566, fd668; +fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+256]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; 
+mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 30720; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; +st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+2048]; +ld.shared.f64 fd745, [r20+4096]; +ld.shared.f64 fd746, [r20+6144]; +ld.shared.f64 fd747, [r20+8192]; +ld.shared.f64 fd748, [r20+10240]; +ld.shared.f64 fd749, [r20+12288]; +ld.shared.f64 fd750, [r20+14336]; +ld.shared.f64 fd751, [r20+16384]; +ld.shared.f64 fd752, [r20+18432]; +ld.shared.f64 fd753, [r20+20480]; +ld.shared.f64 fd754, [r20+22528]; +ld.shared.f64 fd755, [r20+24576]; +ld.shared.f64 fd756, [r20+26624]; +ld.shared.f64 fd757, [r20+28672]; +ld.shared.f64 fd758, [r20+30720]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], 
fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+2048]; +ld.shared.f64 fd761, [r20+4096]; +ld.shared.f64 fd762, [r20+6144]; +ld.shared.f64 fd763, [r20+8192]; +ld.shared.f64 fd764, [r20+10240]; +ld.shared.f64 fd765, [r20+12288]; +ld.shared.f64 fd766, [r20+14336]; +ld.shared.f64 fd767, [r20+16384]; +ld.shared.f64 fd768, [r20+18432]; +ld.shared.f64 fd769, [r20+20480]; +ld.shared.f64 fd770, [r20+22528]; +ld.shared.f64 fd771, [r20+24576]; +ld.shared.f64 fd772, [r20+26624]; +ld.shared.f64 fd773, [r20+28672]; +ld.shared.f64 fd774, [r20+30720]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +sub.f64 fd787, fd777, fd782; +add.f64 fd788, fd778, fd781; +add.f64 fd789, fd777, fd782; +sub.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +sub.f64 fd803, fd793, fd798; +add.f64 fd804, fd794, fd797; +add.f64 fd805, fd793, fd798; +sub.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0d3FE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +add.f64 fd810, fd807, fd808; +mul.f64 fd811, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd812, fd806, 0d3FE6A09E667F3BCD; +sub.f64 fd813, fd811, fd812; +mul.f64 fd814, fd806, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd815, fd805, 0d3FE6A09E667F3BCD, fd814; 
+add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd810; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd810; +sub.f64 fd824, fd785, fd802; +add.f64 fd825, fd786, fd801; +add.f64 fd826, fd785, fd802; +sub.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd813; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd813; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; +sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +sub.f64 fd844, fd834, fd839; +add.f64 fd845, fd835, fd838; +add.f64 fd846, fd834, fd839; +sub.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +sub.f64 fd860, fd850, fd855; +add.f64 fd861, fd851, fd854; +add.f64 fd862, fd850, fd855; +sub.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0d3FE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +add.f64 fd867, fd864, fd865; +mul.f64 fd868, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd869, fd863, 0d3FE6A09E667F3BCD; +sub.f64 fd870, fd868, fd869; +mul.f64 fd871, fd863, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd872, fd862, 0d3FE6A09E667F3BCD, fd871; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, fd867; +sub.f64 fd879, 
fd844, fd866; +sub.f64 fd880, fd845, fd867; +sub.f64 fd881, fd842, fd859; +add.f64 fd882, fd843, fd858; +add.f64 fd883, fd842, fd859; +sub.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd870; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd870; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0d3FD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0d3FD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 0d3FE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +add.f64 fd897, fd894, fd895; +mul.f64 fd898, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd899, fd886, 0d3FED906BCF328D46; +sub.f64 fd900, fd898, fd899; +mul.f64 fd901, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd902, fd885, 0d3FED906BCF328D46, fd901; +mul.f64 fd903, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd904, fd880, 0d3FED906BCF328D46; +sub.f64 fd905, fd903, fd904; +mul.f64 fd906, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd907, fd879, 0d3FED906BCF328D46, fd906; +mul.f64 fd908, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd909, fd884, 0d3FE6A09E667F3BCD; +sub.f64 fd910, fd908, fd909; +mul.f64 fd911, fd884, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd912, fd883, 0d3FE6A09E667F3BCD, fd911; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0d3FD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0d3FD87DE2A6AEA963, fd916; +add.f64 %0, fd816, fd873; +add.f64 %1, fd817, fd874; +add.f64 %3, fd821, fd893; +add.f64 %2, fd820, fd891; +add.f64 %5, fd825, fd897; +add.f64 %4, fd824, fd896; +add.f64 %7, fd829, fd902; +add.f64 %6, fd828, fd900; +add.f64 %9, fd819, fd875; +sub.f64 %8, fd818, fd876; +add.f64 %11, fd823, fd907; +add.f64 %10, fd822, fd905; +add.f64 %13, fd827, fd912; +add.f64 %12, fd826, fd910; +add.f64 %15, fd831, fd917; +add.f64 %14, fd830, fd915; +sub.f64 %16, fd816, fd873; +sub.f64 %17, fd817, fd874; +sub.f64 %19, 
fd821, fd893; +sub.f64 %18, fd820, fd891; +sub.f64 %21, fd825, fd897; +sub.f64 %20, fd824, fd896; +sub.f64 %23, fd829, fd902; +sub.f64 %22, fd828, fd900; +sub.f64 %25, fd819, fd875; +add.f64 %24, fd818, fd876; +sub.f64 %27, fd823, fd907; +sub.f64 %26, fd822, fd905; +sub.f64 %29, fd827, fd912; +sub.f64 %28, fd826, fd910; +sub.f64 %31, fd831, fd917; +sub.f64 %30, fd830, fd915; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_4096), "l"(lut_dp_16_256), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<668, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<41>; +.reg .f64 fd<333>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 
r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+16384]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 65472; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+16384]; +ld.shared.v2.f64 {fd69, fd70}, [r13+32768]; +ld.shared.v2.f64 {fd73, fd74}, [r13+49152]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, 
fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+4096]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 65280; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+16384]; +ld.shared.v2.f64 {fd129, fd130}, [r20+32768]; +ld.shared.v2.f64 {fd133, fd134}, [r20+49152]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 1008; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 
rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+1024]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 64512; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; +fma.rn.f64 fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+16384]; +ld.shared.v2.f64 {fd189, fd190}, [r26+32768]; +ld.shared.v2.f64 {fd193, fd194}, [r26+49152]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +sub.f64 fd207, fd199, fd204; +add.f64 fd208, fd200, fd203; +add.f64 fd209, fd199, fd204; +sub.f64 fd210, fd200, fd203; +and.b32 r27, r5, 960; +bfe.u32 r28, r5, 6, 4; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd208, fd212; +mul.f64 fd216, fd207, fd212; +mul.f64 fd217, fd211, 
fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd206, fd222; +mul.f64 fd224, fd205, fd222; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+256]; +mul.f64 fd230, fd210, fd227; +mul.f64 fd231, fd209, fd227; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 61440; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd211, fd207, fd215; +sub.f64 fd236, fd217, fd216; +st.shared.v2.f64 [r32+1024], {fd235, fd236}; +fma.rn.f64 fd237, fd220, fd205, fd223; +sub.f64 fd238, fd225, fd224; +st.shared.v2.f64 [r32+2048], {fd237, fd238}; +fma.rn.f64 fd239, fd226, fd209, fd230; +sub.f64 fd240, fd232, fd231; +st.shared.v2.f64 [r32+3072], {fd239, fd240}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+16384]; +ld.shared.v2.f64 {fd249, fd250}, [r33+32768]; +ld.shared.v2.f64 {fd253, fd254}, [r33+49152]; +add.f64 fd257, fd241, fd249; +add.f64 fd258, fd242, fd250; +sub.f64 fd259, fd241, fd249; +sub.f64 fd260, fd242, fd250; +add.f64 fd261, fd245, fd253; +add.f64 fd262, fd246, fd254; +sub.f64 fd263, fd245, fd253; +sub.f64 fd264, fd246, fd254; +sub.f64 fd265, fd257, fd261; +sub.f64 fd266, fd258, fd262; +sub.f64 fd267, fd259, fd264; +add.f64 fd268, fd260, fd263; +add.f64 fd269, fd259, fd264; +sub.f64 fd270, fd260, fd263; +and.b32 r34, r5, 768; +bfe.u32 r35, r5, 8, 2; +mul.wide.u32 rd15, r35, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd271, fd272}, [rd17]; +mul.f64 fd275, fd268, fd272; +mul.f64 fd276, fd267, fd272; +mul.f64 fd277, fd271, fd268; +mul.f64 fd278, fd271, fd271; +mul.f64 fd279, fd272, fd272; +sub.f64 fd280, fd278, fd279; +mul.f64 fd281, fd272, fd271; +fma.rn.f64 
fd282, fd272, fd271, fd281; +mul.f64 fd283, fd266, fd282; +mul.f64 fd284, fd265, fd282; +mul.f64 fd285, fd280, fd266; +ld.global.v2.f64 {fd286, fd287}, [rd17+64]; +mul.f64 fd290, fd270, fd287; +mul.f64 fd291, fd269, fd287; +mul.f64 fd292, fd286, fd270; +and.b32 r36, r10, 4080; +add.s32 r37, r9, r36; +barrier.sync 0; +and.b32 r38, r7, 49152; +add.s32 r39, r37, r38; +add.f64 fd293, fd258, fd262; +add.f64 fd294, fd257, fd261; +st.shared.v2.f64 [r39], {fd294, fd293}; +fma.rn.f64 fd295, fd271, fd267, fd275; +sub.f64 fd296, fd277, fd276; +st.shared.v2.f64 [r39+4096], {fd295, fd296}; +fma.rn.f64 fd297, fd280, fd265, fd283; +sub.f64 fd298, fd285, fd284; +st.shared.v2.f64 [r39+8192], {fd297, fd298}; +fma.rn.f64 fd299, fd286, fd269, fd290; +sub.f64 fd300, fd292, fd291; +st.shared.v2.f64 [r39+12288], {fd299, fd300}; +barrier.sync 0; +mad.lo.s32 r40, r34, -48, r39; +ld.shared.v2.f64 {fd301, fd302}, [r40]; +ld.shared.v2.f64 {fd305, fd306}, [r40+16384]; +ld.shared.v2.f64 {fd309, fd310}, [r40+32768]; +ld.shared.v2.f64 {fd313, fd314}, [r40+49152]; +add.f64 fd317, fd301, fd309; +add.f64 fd318, fd302, fd310; +sub.f64 fd319, fd301, fd309; +sub.f64 fd320, fd302, fd310; +add.f64 fd321, fd305, fd313; +add.f64 fd322, fd306, fd314; +sub.f64 fd323, fd305, fd313; +sub.f64 fd324, fd306, fd314; +add.f64 %1, fd318, fd322; +add.f64 %0, fd317, fd321; +add.f64 %3, fd320, fd323; +sub.f64 %2, fd319, fd324; +sub.f64 %5, fd318, fd322; +sub.f64 %4, fd317, fd321; +sub.f64 %7, fd320, fd323; +add.f64 %6, fd319, fd324; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_4096), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<667, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<293>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %14, %19; +add.f64 fd18, %15, %21; +sub.f64 fd19, %14, %19; +sub.f64 fd20, %15, %21; +add.f64 fd21, %16, %22; +add.f64 fd22, %18, %23; +sub.f64 fd23, %16, %22; +sub.f64 fd24, %18, %23; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+16384]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32736; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+8192]; +ld.shared.f64 fd63, [r13+16384]; +ld.shared.f64 fd64, [r13+24576]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 
fd65, [r13]; +ld.shared.f64 fd66, [r13+8192]; +ld.shared.f64 fd67, [r13+16384]; +ld.shared.f64 fd68, [r13+24576]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 1020; +bfe.u32 r15, r5, 2, 8; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+4096]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 32640; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+8192]; +ld.shared.f64 fd115, [r21+16384]; +ld.shared.f64 fd116, [r21+24576]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, 
[r21+8192]; +ld.shared.f64 fd119, [r21+16384]; +ld.shared.f64 fd120, [r21+24576]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 1008; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+1024]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 32256; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+8192]; +ld.shared.f64 fd167, [r27+16384]; +ld.shared.f64 fd168, [r27+24576]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; 
+ld.shared.f64 fd170, [r27+8192]; +ld.shared.f64 fd171, [r27+16384]; +ld.shared.f64 fd172, [r27+24576]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd180; +add.f64 fd186, fd176, fd179; +add.f64 fd187, fd175, fd180; +sub.f64 fd188, fd176, fd179; +and.b32 r28, r5, 960; +bfe.u32 r29, r5, 6, 4; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd186, fd190; +fma.rn.f64 fd194, fd189, fd185, fd193; +mul.f64 fd195, fd185, fd190; +mul.f64 fd196, fd189, fd186; +sub.f64 fd197, fd196, fd195; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd184, fd202; +fma.rn.f64 fd204, fd200, fd183, fd203; +mul.f64 fd205, fd183, fd202; +mul.f64 fd206, fd200, fd184; +sub.f64 fd207, fd206, fd205; +ld.global.v2.f64 {fd208, fd209}, [rd14+256]; +mul.f64 fd212, fd188, fd209; +fma.rn.f64 fd213, fd208, fd187, fd212; +mul.f64 fd214, fd187, fd209; +mul.f64 fd215, fd208, fd188; +sub.f64 fd216, fd215, fd214; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 30720; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd194; +st.shared.f64 [r33+1024], fd204; +st.shared.f64 [r33+1536], fd213; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+8192]; +ld.shared.f64 fd219, [r34+16384]; +ld.shared.f64 fd220, [r34+24576]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 
[r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+8192]; +ld.shared.f64 fd223, [r34+16384]; +ld.shared.f64 fd224, [r34+24576]; +add.f64 fd225, fd217, fd219; +add.f64 fd226, fd221, fd223; +sub.f64 fd227, fd217, fd219; +sub.f64 fd228, fd221, fd223; +add.f64 fd229, fd218, fd220; +add.f64 fd230, fd222, fd224; +sub.f64 fd231, fd218, fd220; +sub.f64 fd232, fd222, fd224; +add.f64 fd233, fd225, fd229; +add.f64 fd234, fd226, fd230; +sub.f64 fd235, fd225, fd229; +sub.f64 fd236, fd226, fd230; +sub.f64 fd237, fd227, fd232; +add.f64 fd238, fd228, fd231; +add.f64 fd239, fd227, fd232; +sub.f64 fd240, fd228, fd231; +and.b32 r35, r5, 768; +bfe.u32 r36, r5, 8, 2; +mul.wide.u32 rd15, r36, 16; +mov.u64 rd16, %13; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd241, fd242}, [rd17]; +mul.f64 fd245, fd238, fd242; +fma.rn.f64 fd246, fd241, fd237, fd245; +mul.f64 fd247, fd237, fd242; +mul.f64 fd248, fd241, fd238; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd241, fd241; +mul.f64 fd251, fd242, fd242; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd242, fd241; +fma.rn.f64 fd254, fd242, fd241, fd253; +mul.f64 fd255, fd236, fd254; +fma.rn.f64 fd256, fd252, fd235, fd255; +mul.f64 fd257, fd235, fd254; +mul.f64 fd258, fd252, fd236; +sub.f64 fd259, fd258, fd257; +ld.global.v2.f64 {fd260, fd261}, [rd17+64]; +mul.f64 fd264, fd240, fd261; +fma.rn.f64 fd265, fd260, fd239, fd264; +mul.f64 fd266, fd239, fd261; +mul.f64 fd267, fd260, fd240; +sub.f64 fd268, fd267, fd266; +and.b32 r37, r16, 2040; +add.s32 r38, r10, r37; +barrier.sync 0; +and.b32 r39, r8, 24576; +add.s32 r40, r38, r39; +st.shared.f64 [r40], fd233; +st.shared.f64 [r40+2048], fd246; +st.shared.f64 [r40+4096], fd256; +st.shared.f64 [r40+6144], fd265; +barrier.sync 0; +mad.lo.s32 r41, r35, -24, r40; +ld.shared.f64 fd269, [r41]; +ld.shared.f64 fd270, [r41+8192]; +ld.shared.f64 fd271, [r41+16384]; +ld.shared.f64 fd272, [r41+24576]; +barrier.sync 0; +st.shared.f64 [r40], fd234; +st.shared.f64 [r40+2048], 
fd249; +st.shared.f64 [r40+4096], fd259; +st.shared.f64 [r40+6144], fd268; +barrier.sync 0; +ld.shared.f64 fd273, [r41]; +ld.shared.f64 fd274, [r41+8192]; +ld.shared.f64 fd275, [r41+16384]; +ld.shared.f64 fd276, [r41+24576]; +add.f64 fd277, fd269, fd271; +add.f64 fd278, fd273, fd275; +sub.f64 fd279, fd269, fd271; +sub.f64 fd280, fd273, fd275; +add.f64 fd281, fd270, fd272; +add.f64 fd282, fd274, fd276; +sub.f64 fd283, fd270, fd272; +sub.f64 fd284, fd274, fd276; +add.f64 %0, fd277, fd281; +add.f64 %1, fd278, fd282; +add.f64 %3, fd280, fd283; +sub.f64 %2, fd279, fd284; +sub.f64 %4, fd277, fd281; +sub.f64 %5, fd278, fd282; +sub.f64 %7, fd280, fd283; +add.f64 %6, fd279, fd284; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_4096), "l"(lut_dp_4_1024), "l"(lut_dp_4_256), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..74804e82c051e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_fwd.hpp.inc @@ -0,0 +1,3755 @@ +#ifndef CUFFTDX_FFT_49_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_49_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<918, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<60>; +.reg .b32 r<1636>; +.reg .b64 rd<4>; +mov.u32 r1622, %tid.x; +mov.f32 f54, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r1, {low, high}; +} +mov.f32 f56, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r2, {low, high}; +} +mov.f32 f42, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r3, {low, high}; +} +mov.f32 f44, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r4, {low, high}; +} +mov.f32 f50, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r5, {low, high}; +} +mov.f32 f52, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; 
+} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} +{ +add.f16x2 r111, %15, r108; +} +{ +add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ +mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 
r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, %26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, 
r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 r465, r462, r4; +} +{ +add.f16x2 r468, r459, r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; +} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ 
+sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} +{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} +{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r1622, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1623, rd3; +sub.s32 r1624, r1622, r1623; +shr.u32 r1625, r1624, 1; +add.s32 r1626, r1625, r1623; +shr.u32 r1627, r1626, 2; +mul.lo.s32 r1628, r1627, 7; +sub.s32 r1629, r1622, r1628; +cvt.rn.f32.u32 f57, r1629; +mul.f32 f58, f57, 0f3E034E46; +cos.approx.f32 f21, f58; +sin.approx.f32 f59, f58; +neg.f32 f22, f59; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +mov.u32 r1630, %tid.y; +mov.u32 r1631, %14; +mad.lo.s32 r1632, r1630, 392, r1631; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +mad.lo.s32 r1633, 
r1627, 392, r1632; +barrier.sync 0; +mad.lo.s32 r1634, r1629, 56, r1633; +st.shared.v2.f32 [r1634], {r30, r48}; +st.shared.v2.f32 [r1634+8], {r711, r718}; +st.shared.v2.f32 [r1634+16], {r748, r755}; +st.shared.v2.f32 [r1634+24], {r785, r792}; +st.shared.v2.f32 [r1634+32], {r822, r829}; +st.shared.v2.f32 [r1634+40], {r859, r866}; +st.shared.v2.f32 [r1634+48], {r896, r903}; +barrier.sync 0; +mad.lo.s32 r1635, r1629, -48, r1634; +ld.shared.u32 r942, [r1635]; +ld.shared.u32 r960, [r1635+4]; +ld.shared.u32 r939, [r1635+56]; +ld.shared.u32 r957, [r1635+60]; +ld.shared.u32 r945, [r1635+112]; +ld.shared.u32 r963, [r1635+116]; +ld.shared.u32 r951, [r1635+168]; +ld.shared.u32 r969, [r1635+172]; +ld.shared.u32 r952, [r1635+224]; +ld.shared.u32 r970, [r1635+228]; +ld.shared.u32 r946, [r1635+280]; +ld.shared.u32 r964, [r1635+284]; +ld.shared.u32 r940, [r1635+336]; +ld.shared.u32 r958, [r1635+340]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 
r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 %0, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ +add.f16x2 r968, r969, r970; +} +{ +add.f16x2 %1, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ +mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 %2, r998, r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 %12, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ 
+add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, r1127; +} +{ +sub.f16x2 %4, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; +} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 %10, r1160, r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 %6, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 
r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ +sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, r1289; +} +{ +add.f16x2 %8, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} +{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 %3, r1322, r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 %13, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, 
r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 %5, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ +mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 %11, r1484, r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 %7, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ 
+mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, r1613; +} +{ +sub.f16x2 %9, r1592, r1616; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<919, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<60>; +.reg .b32 r<1636>; +.reg .b64 rd<4>; +mov.u32 r1622, %tid.y; +mov.u32 r1623, %14; +mad.lo.s32 r1624, r1622, 196, r1623; +mov.u32 r1625, %tid.x; +mov.f32 f54, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r1, {low, high}; +} +mov.f32 f56, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r2, {low, high}; +} +mov.f32 f42, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; 
+cvt.rn.f16.f32 high, f42; +mov.b32 r3, {low, high}; +} +mov.f32 f44, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r4, {low, high}; +} +mov.f32 f50, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r5, {low, high}; +} +mov.f32 f52, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r6, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r7, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r8, {low, high}; +} +{ +neg.f16x2 r9, r8; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r11, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r12, {low, high}; +} +{ +neg.f16x2 r13, r12; +} +{ +add.f16x2 r15, %17, %27; +} +{ +add.f16x2 r18, %15, r15; +} +{ +add.f16x2 r21, %19, %25; +} +{ +add.f16x2 r24, r18, r21; +} +{ +add.f16x2 r27, %21, %23; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %18, %28; +} +{ +add.f16x2 r36, %16, r33; +} +{ +add.f16x2 r39, %20, %26; +} +{ +add.f16x2 r42, r36, r39; +} +{ +add.f16x2 r45, %22, %24; +} +{ +add.f16x2 r48, r42, r45; +} +{ +add.f16x2 r51, %17, %27; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %15, r54; +} +{ +add.f16x2 r60, %19, %25; +} +{ +mul.f16x2 r63, r60, r3; +} +{ +add.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %21, %23; +} +{ +mul.f16x2 r72, r69, r5; +} +{ +add.f16x2 r75, r66, r72; +} +{ +sub.f16x2 r78, %18, %28; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +sub.f16x2 r84, %20, %26; +} +{ +mul.f16x2 r87, r84, r4; +} +{ +add.f16x2 r90, r81, r87; +} +{ +sub.f16x2 r93, %22, %24; +} +{ +mul.f16x2 r96, r93, r6; +} +{ +add.f16x2 r99, r90, r96; +} +{ +sub.f16x2 r102, r75, r99; +} +{ +add.f16x2 r105, %17, %27; +} +{ +mul.f16x2 r108, r105, r1; +} +{ +add.f16x2 r111, %15, r108; +} +{ 
+add.f16x2 r114, %19, %25; +} +{ +mul.f16x2 r117, r114, r3; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %21, %23; +} +{ +mul.f16x2 r126, r123, r5; +} +{ +add.f16x2 r129, r120, r126; +} +{ +sub.f16x2 r132, %18, %28; +} +{ +mul.f16x2 r135, r132, r2; +} +{ +sub.f16x2 r138, %20, %26; +} +{ +mul.f16x2 r141, r138, r4; +} +{ +add.f16x2 r144, r135, r141; +} +{ +sub.f16x2 r147, %22, %24; +} +{ +mul.f16x2 r150, r147, r6; +} +{ +add.f16x2 r153, r144, r150; +} +{ +add.f16x2 r156, r129, r153; +} +{ +add.f16x2 r159, %17, %27; +} +{ +mul.f16x2 r162, r159, r3; +} +{ +add.f16x2 r165, %15, r162; +} +{ +add.f16x2 r168, %19, %25; +} +{ +mul.f16x2 r171, r168, r7; +} +{ +add.f16x2 r174, r165, r171; +} +{ +add.f16x2 r177, %21, %23; +} +{ +mul.f16x2 r180, r177, r11; +} +{ +add.f16x2 r183, r174, r180; +} +{ +sub.f16x2 r186, %18, %28; +} +{ +mul.f16x2 r189, r186, r4; +} +{ +sub.f16x2 r192, %20, %26; +} +{ +mul.f16x2 r195, r192, r9; +} +{ +add.f16x2 r198, r189, r195; +} +{ +sub.f16x2 r201, %22, %24; +} +{ +mul.f16x2 r204, r201, r13; +} +{ +add.f16x2 r207, r198, r204; +} +{ +sub.f16x2 r210, r183, r207; +} +{ +add.f16x2 r213, %17, %27; +} +{ +mul.f16x2 r216, r213, r3; +} +{ +add.f16x2 r219, %15, r216; +} +{ +add.f16x2 r222, %19, %25; +} +{ +mul.f16x2 r225, r222, r7; +} +{ +add.f16x2 r228, r219, r225; +} +{ +add.f16x2 r231, %21, %23; +} +{ +mul.f16x2 r234, r231, r11; +} +{ +add.f16x2 r237, r228, r234; +} +{ +sub.f16x2 r240, %18, %28; +} +{ +mul.f16x2 r243, r240, r4; +} +{ +sub.f16x2 r246, %20, %26; +} +{ +mul.f16x2 r249, r246, r9; +} +{ +add.f16x2 r252, r243, r249; +} +{ +sub.f16x2 r255, %22, %24; +} +{ +mul.f16x2 r258, r255, r13; +} +{ +add.f16x2 r261, r252, r258; +} +{ +add.f16x2 r264, r237, r261; +} +{ +add.f16x2 r267, %17, %27; +} +{ +mul.f16x2 r270, r267, r5; +} +{ +add.f16x2 r273, %15, r270; +} +{ +add.f16x2 r276, %19, %25; +} +{ +mul.f16x2 r279, r276, r11; +} +{ +add.f16x2 r282, r273, r279; +} +{ +add.f16x2 r285, %21, %23; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 
r291, r282, r288; +} +{ +sub.f16x2 r294, %18, %28; +} +{ +mul.f16x2 r297, r294, r6; +} +{ +sub.f16x2 r300, %20, %26; +} +{ +mul.f16x2 r303, r300, r13; +} +{ +add.f16x2 r306, r297, r303; +} +{ +sub.f16x2 r309, %22, %24; +} +{ +mul.f16x2 r312, r309, r4; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r291, r315; +} +{ +add.f16x2 r321, %17, %27; +} +{ +mul.f16x2 r324, r321, r5; +} +{ +add.f16x2 r327, %15, r324; +} +{ +add.f16x2 r330, %19, %25; +} +{ +mul.f16x2 r333, r330, r11; +} +{ +add.f16x2 r336, r327, r333; +} +{ +add.f16x2 r339, %21, %23; +} +{ +mul.f16x2 r342, r339, r3; +} +{ +add.f16x2 r345, r336, r342; +} +{ +sub.f16x2 r348, %18, %28; +} +{ +mul.f16x2 r351, r348, r6; +} +{ +sub.f16x2 r354, %20, %26; +} +{ +mul.f16x2 r357, r354, r13; +} +{ +add.f16x2 r360, r351, r357; +} +{ +sub.f16x2 r363, %22, %24; +} +{ +mul.f16x2 r366, r363, r4; +} +{ +add.f16x2 r369, r360, r366; +} +{ +add.f16x2 r372, r345, r369; +} +{ +add.f16x2 r375, %18, %28; +} +{ +mul.f16x2 r378, r375, r1; +} +{ +add.f16x2 r381, %16, r378; +} +{ +add.f16x2 r384, %20, %26; +} +{ +mul.f16x2 r387, r384, r3; +} +{ +add.f16x2 r390, r381, r387; +} +{ +add.f16x2 r393, %22, %24; +} +{ +mul.f16x2 r396, r393, r5; +} +{ +add.f16x2 r399, r390, r396; +} +{ +sub.f16x2 r402, %17, %27; +} +{ +mul.f16x2 r405, r402, r2; +} +{ +sub.f16x2 r408, %19, %25; +} +{ +mul.f16x2 r411, r408, r4; +} +{ +add.f16x2 r414, r405, r411; +} +{ +sub.f16x2 r417, %21, %23; +} +{ +mul.f16x2 r420, r417, r6; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r399, r423; +} +{ +add.f16x2 r429, %18, %28; +} +{ +mul.f16x2 r432, r429, r1; +} +{ +add.f16x2 r435, %16, r432; +} +{ +add.f16x2 r438, %20, %26; +} +{ +mul.f16x2 r441, r438, r3; +} +{ +add.f16x2 r444, r435, r441; +} +{ +add.f16x2 r447, %22, %24; +} +{ +mul.f16x2 r450, r447, r5; +} +{ +add.f16x2 r453, r444, r450; +} +{ +sub.f16x2 r456, %17, %27; +} +{ +mul.f16x2 r459, r456, r2; +} +{ +sub.f16x2 r462, %19, %25; +} +{ +mul.f16x2 r465, r462, r4; +} +{ +add.f16x2 r468, r459, 
r465; +} +{ +sub.f16x2 r471, %21, %23; +} +{ +mul.f16x2 r474, r471, r6; +} +{ +add.f16x2 r477, r468, r474; +} +{ +sub.f16x2 r480, r453, r477; +} +{ +add.f16x2 r483, %18, %28; +} +{ +mul.f16x2 r486, r483, r3; +} +{ +add.f16x2 r489, %16, r486; +} +{ +add.f16x2 r492, %20, %26; +} +{ +mul.f16x2 r495, r492, r7; +} +{ +add.f16x2 r498, r489, r495; +} +{ +add.f16x2 r501, %22, %24; +} +{ +mul.f16x2 r504, r501, r11; +} +{ +add.f16x2 r507, r498, r504; +} +{ +sub.f16x2 r510, %17, %27; +} +{ +mul.f16x2 r513, r510, r4; +} +{ +sub.f16x2 r516, %19, %25; +} +{ +mul.f16x2 r519, r516, r9; +} +{ +add.f16x2 r522, r513, r519; +} +{ +sub.f16x2 r525, %21, %23; +} +{ +mul.f16x2 r528, r525, r13; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r507, r531; +} +{ +add.f16x2 r537, %18, %28; +} +{ +mul.f16x2 r540, r537, r3; +} +{ +add.f16x2 r543, %16, r540; +} +{ +add.f16x2 r546, %20, %26; +} +{ +mul.f16x2 r549, r546, r7; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, %22, %24; +} +{ +mul.f16x2 r558, r555, r11; +} +{ +add.f16x2 r561, r552, r558; +} +{ +sub.f16x2 r564, %17, %27; +} +{ +mul.f16x2 r567, r564, r4; +} +{ +sub.f16x2 r570, %19, %25; +} +{ +mul.f16x2 r573, r570, r9; +} +{ +add.f16x2 r576, r567, r573; +} +{ +sub.f16x2 r579, %21, %23; +} +{ +mul.f16x2 r582, r579, r13; +} +{ +add.f16x2 r585, r576, r582; +} +{ +sub.f16x2 r588, r561, r585; +} +{ +add.f16x2 r591, %18, %28; +} +{ +mul.f16x2 r594, r591, r5; +} +{ +add.f16x2 r597, %16, r594; +} +{ +add.f16x2 r600, %20, %26; +} +{ +mul.f16x2 r603, r600, r11; +} +{ +add.f16x2 r606, r597, r603; +} +{ +add.f16x2 r609, %22, %24; +} +{ +mul.f16x2 r612, r609, r3; +} +{ +add.f16x2 r615, r606, r612; +} +{ +sub.f16x2 r618, %17, %27; +} +{ +mul.f16x2 r621, r618, r6; +} +{ +sub.f16x2 r624, %19, %25; +} +{ +mul.f16x2 r627, r624, r13; +} +{ +add.f16x2 r630, r621, r627; +} +{ +sub.f16x2 r633, %21, %23; +} +{ +mul.f16x2 r636, r633, r4; +} +{ +add.f16x2 r639, r630, r636; +} +{ +add.f16x2 r642, r615, r639; +} +{ +add.f16x2 r645, %18, %28; +} 
+{ +mul.f16x2 r648, r645, r5; +} +{ +add.f16x2 r651, %16, r648; +} +{ +add.f16x2 r654, %20, %26; +} +{ +mul.f16x2 r657, r654, r11; +} +{ +add.f16x2 r660, r651, r657; +} +{ +add.f16x2 r663, %22, %24; +} +{ +mul.f16x2 r666, r663, r3; +} +{ +add.f16x2 r669, r660, r666; +} +{ +sub.f16x2 r672, %17, %27; +} +{ +mul.f16x2 r675, r672, r6; +} +{ +sub.f16x2 r678, %19, %25; +} +{ +mul.f16x2 r681, r678, r13; +} +{ +add.f16x2 r684, r675, r681; +} +{ +sub.f16x2 r687, %21, %23; +} +{ +mul.f16x2 r690, r687, r4; +} +{ +add.f16x2 r693, r684, r690; +} +{ +sub.f16x2 r696, r669, r693; +} +mul.wide.u32 rd2, r1625, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1626, rd3; +sub.s32 r1627, r1625, r1626; +shr.u32 r1628, r1627, 1; +add.s32 r1629, r1628, r1626; +shr.u32 r1630, r1629, 2; +mul.lo.s32 r1631, r1630, 7; +sub.s32 r1632, r1625, r1631; +mad.lo.s32 r1633, r1630, 196, r1624; +cvt.rn.f32.u32 f57, r1632; +mul.f32 f58, f57, 0f3E034E46; +cos.approx.f32 f21, f58; +sin.approx.f32 f59, f58; +neg.f32 f22, f59; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r699, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r702, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r704, {high, high}; +} +{ +mul.f16x2 r706, r426, r704; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r102, r702, r709; +} +{ +mul.f16x2 r715, r102, r704; +} +{ +fma.rn.f16x2 r718, r426, r702, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r724, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r726, {low, high}; +} +{ +mul.f16x2 r727, r724, r726; +} +{ +mul.f16x2 r730, r699, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r733, {high, low}; +} +{ +fma.rn.f16x2 r735, r727, r733, r730; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r739, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r741, {high, high}; +} +{ +mul.f16x2 r743, r534, r741; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r210, r739, r746; +} +{ +mul.f16x2 r752, r210, r741; +} +{ +fma.rn.f16x2 r755, r534, r739, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r761, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r763, {low, high}; +} +{ +mul.f16x2 r764, r761, r763; +} +{ +mul.f16x2 r767, r735, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r735; +mov.b32 r770, {high, low}; +} +{ +fma.rn.f16x2 r772, r764, r770, r767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r776, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r778, {high, high}; +} +{ +mul.f16x2 r780, r642, r778; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r318, r776, r783; +} +{ +mul.f16x2 r789, r318, r778; +} +{ +fma.rn.f16x2 r792, r642, r776, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r798, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r800, {low, high}; +} +{ +mul.f16x2 r801, r798, r800; +} +{ +mul.f16x2 r804, r772, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r772; +mov.b32 r807, {high, low}; +} +{ +fma.rn.f16x2 r809, r801, r807, r804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r813, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r815, {high, high}; +} +{ +mul.f16x2 r817, r696, r815; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r372, r813, r820; +} +{ +mul.f16x2 r826, r372, r815; +} +{ +fma.rn.f16x2 r829, r696, r813, 
r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r833, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r835, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r837, {low, high}; +} +{ +mul.f16x2 r838, r835, r837; +} +{ +mul.f16x2 r841, r809, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r809; +mov.b32 r844, {high, low}; +} +{ +fma.rn.f16x2 r846, r838, r844, r841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r850, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r852, {high, high}; +} +{ +mul.f16x2 r854, r588, r852; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r264, r850, r857; +} +{ +mul.f16x2 r863, r264, r852; +} +{ +fma.rn.f16x2 r866, r588, r850, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r870, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r699; +mov.b32 r872, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r874, {low, high}; +} +{ +mul.f16x2 r875, r872, r874; +} +{ +mul.f16x2 r878, r846, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r846; +mov.b32 r881, {high, low}; +} +{ +fma.rn.f16x2 r883, r875, r881, r878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r887, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r883; +mov.b32 r889, {high, high}; +} +{ +mul.f16x2 r891, r480, r889; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r156, r887, r894; +} +{ +mul.f16x2 r900, r156, r889; +} +{ +fma.rn.f16x2 r903, r480, r887, r900; +} +barrier.sync 0; +mad.lo.s32 r1634, r1632, 28, r1633; +st.shared.u32 [r1634], r30; +st.shared.u32 [r1634+4], r711; +st.shared.u32 [r1634+8], r748; +st.shared.u32 [r1634+12], r785; +st.shared.u32 [r1634+16], r822; +st.shared.u32 [r1634+20], r859; +st.shared.u32 [r1634+24], r896; +barrier.sync 0; +mad.lo.s32 r1635, r1632, -24, 
r1634; +ld.shared.u32 r942, [r1635]; +ld.shared.u32 r939, [r1635+28]; +ld.shared.u32 r945, [r1635+56]; +ld.shared.u32 r951, [r1635+84]; +ld.shared.u32 r952, [r1635+112]; +ld.shared.u32 r946, [r1635+140]; +ld.shared.u32 r940, [r1635+168]; +barrier.sync 0; +st.shared.u32 [r1634], r48; +st.shared.u32 [r1634+4], r718; +st.shared.u32 [r1634+8], r755; +st.shared.u32 [r1634+12], r792; +st.shared.u32 [r1634+16], r829; +st.shared.u32 [r1634+20], r866; +st.shared.u32 [r1634+24], r903; +barrier.sync 0; +ld.shared.u32 r960, [r1635]; +ld.shared.u32 r957, [r1635+28]; +ld.shared.u32 r963, [r1635+56]; +ld.shared.u32 r969, [r1635+84]; +ld.shared.u32 r970, [r1635+112]; +ld.shared.u32 r964, [r1635+140]; +ld.shared.u32 r958, [r1635+168]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r924, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r925, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r927, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r928, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +add.f16x2 r938, r939, r940; +} +{ +add.f16x2 r941, r942, r938; +} +{ +add.f16x2 r944, r945, r946; +} +{ +add.f16x2 
r947, r941, r944; +} +{ +add.f16x2 r950, r951, r952; +} +{ +add.f16x2 %0, r947, r950; +} +{ +add.f16x2 r956, r957, r958; +} +{ +add.f16x2 r959, r960, r956; +} +{ +add.f16x2 r962, r963, r964; +} +{ +add.f16x2 r965, r959, r962; +} +{ +add.f16x2 r968, r969, r970; +} +{ +add.f16x2 %1, r965, r968; +} +{ +add.f16x2 r974, r939, r940; +} +{ +mul.f16x2 r977, r974, r924; +} +{ +add.f16x2 r980, r942, r977; +} +{ +add.f16x2 r983, r945, r946; +} +{ +mul.f16x2 r986, r983, r926; +} +{ +add.f16x2 r989, r980, r986; +} +{ +add.f16x2 r992, r951, r952; +} +{ +mul.f16x2 r995, r992, r928; +} +{ +add.f16x2 r998, r989, r995; +} +{ +sub.f16x2 r1001, r957, r958; +} +{ +mul.f16x2 r1004, r1001, r925; +} +{ +sub.f16x2 r1007, r963, r964; +} +{ +mul.f16x2 r1010, r1007, r927; +} +{ +add.f16x2 r1013, r1004, r1010; +} +{ +sub.f16x2 r1016, r969, r970; +} +{ +mul.f16x2 r1019, r1016, r929; +} +{ +add.f16x2 r1022, r1013, r1019; +} +{ +sub.f16x2 %2, r998, r1022; +} +{ +add.f16x2 r1028, r939, r940; +} +{ +mul.f16x2 r1031, r1028, r924; +} +{ +add.f16x2 r1034, r942, r1031; +} +{ +add.f16x2 r1037, r945, r946; +} +{ +mul.f16x2 r1040, r1037, r926; +} +{ +add.f16x2 r1043, r1034, r1040; +} +{ +add.f16x2 r1046, r951, r952; +} +{ +mul.f16x2 r1049, r1046, r928; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, r957, r958; +} +{ +mul.f16x2 r1058, r1055, r925; +} +{ +sub.f16x2 r1061, r963, r964; +} +{ +mul.f16x2 r1064, r1061, r927; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +sub.f16x2 r1070, r969, r970; +} +{ +mul.f16x2 r1073, r1070, r929; +} +{ +add.f16x2 r1076, r1067, r1073; +} +{ +add.f16x2 %12, r1052, r1076; +} +{ +add.f16x2 r1082, r939, r940; +} +{ +mul.f16x2 r1085, r1082, r926; +} +{ +add.f16x2 r1088, r942, r1085; +} +{ +add.f16x2 r1091, r945, r946; +} +{ +mul.f16x2 r1094, r1091, r930; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r951, r952; +} +{ +mul.f16x2 r1103, r1100, r934; +} +{ +add.f16x2 r1106, r1097, r1103; +} +{ +sub.f16x2 r1109, r957, r958; +} +{ +mul.f16x2 r1112, r1109, 
r927; +} +{ +sub.f16x2 r1115, r963, r964; +} +{ +mul.f16x2 r1118, r1115, r932; +} +{ +add.f16x2 r1121, r1112, r1118; +} +{ +sub.f16x2 r1124, r969, r970; +} +{ +mul.f16x2 r1127, r1124, r936; +} +{ +add.f16x2 r1130, r1121, r1127; +} +{ +sub.f16x2 %4, r1106, r1130; +} +{ +add.f16x2 r1136, r939, r940; +} +{ +mul.f16x2 r1139, r1136, r926; +} +{ +add.f16x2 r1142, r942, r1139; +} +{ +add.f16x2 r1145, r945, r946; +} +{ +mul.f16x2 r1148, r1145, r930; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r951, r952; +} +{ +mul.f16x2 r1157, r1154, r934; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, r957, r958; +} +{ +mul.f16x2 r1166, r1163, r927; +} +{ +sub.f16x2 r1169, r963, r964; +} +{ +mul.f16x2 r1172, r1169, r932; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +sub.f16x2 r1178, r969, r970; +} +{ +mul.f16x2 r1181, r1178, r936; +} +{ +add.f16x2 r1184, r1175, r1181; +} +{ +add.f16x2 %10, r1160, r1184; +} +{ +add.f16x2 r1190, r939, r940; +} +{ +mul.f16x2 r1193, r1190, r928; +} +{ +add.f16x2 r1196, r942, r1193; +} +{ +add.f16x2 r1199, r945, r946; +} +{ +mul.f16x2 r1202, r1199, r934; +} +{ +add.f16x2 r1205, r1196, r1202; +} +{ +add.f16x2 r1208, r951, r952; +} +{ +mul.f16x2 r1211, r1208, r926; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +sub.f16x2 r1217, r957, r958; +} +{ +mul.f16x2 r1220, r1217, r929; +} +{ +sub.f16x2 r1223, r963, r964; +} +{ +mul.f16x2 r1226, r1223, r936; +} +{ +add.f16x2 r1229, r1220, r1226; +} +{ +sub.f16x2 r1232, r969, r970; +} +{ +mul.f16x2 r1235, r1232, r927; +} +{ +add.f16x2 r1238, r1229, r1235; +} +{ +sub.f16x2 %6, r1214, r1238; +} +{ +add.f16x2 r1244, r939, r940; +} +{ +mul.f16x2 r1247, r1244, r928; +} +{ +add.f16x2 r1250, r942, r1247; +} +{ +add.f16x2 r1253, r945, r946; +} +{ +mul.f16x2 r1256, r1253, r934; +} +{ +add.f16x2 r1259, r1250, r1256; +} +{ +add.f16x2 r1262, r951, r952; +} +{ +mul.f16x2 r1265, r1262, r926; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, r957, r958; +} +{ +mul.f16x2 r1274, r1271, r929; +} +{ 
+sub.f16x2 r1277, r963, r964; +} +{ +mul.f16x2 r1280, r1277, r936; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r969, r970; +} +{ +mul.f16x2 r1289, r1286, r927; +} +{ +add.f16x2 r1292, r1283, r1289; +} +{ +add.f16x2 %8, r1268, r1292; +} +{ +add.f16x2 r1298, r957, r958; +} +{ +mul.f16x2 r1301, r1298, r924; +} +{ +add.f16x2 r1304, r960, r1301; +} +{ +add.f16x2 r1307, r963, r964; +} +{ +mul.f16x2 r1310, r1307, r926; +} +{ +add.f16x2 r1313, r1304, r1310; +} +{ +add.f16x2 r1316, r969, r970; +} +{ +mul.f16x2 r1319, r1316, r928; +} +{ +add.f16x2 r1322, r1313, r1319; +} +{ +sub.f16x2 r1325, r939, r940; +} +{ +mul.f16x2 r1328, r1325, r925; +} +{ +sub.f16x2 r1331, r945, r946; +} +{ +mul.f16x2 r1334, r1331, r927; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +sub.f16x2 r1340, r951, r952; +} +{ +mul.f16x2 r1343, r1340, r929; +} +{ +add.f16x2 r1346, r1337, r1343; +} +{ +add.f16x2 %3, r1322, r1346; +} +{ +add.f16x2 r1352, r957, r958; +} +{ +mul.f16x2 r1355, r1352, r924; +} +{ +add.f16x2 r1358, r960, r1355; +} +{ +add.f16x2 r1361, r963, r964; +} +{ +mul.f16x2 r1364, r1361, r926; +} +{ +add.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r969, r970; +} +{ +mul.f16x2 r1373, r1370, r928; +} +{ +add.f16x2 r1376, r1367, r1373; +} +{ +sub.f16x2 r1379, r939, r940; +} +{ +mul.f16x2 r1382, r1379, r925; +} +{ +sub.f16x2 r1385, r945, r946; +} +{ +mul.f16x2 r1388, r1385, r927; +} +{ +add.f16x2 r1391, r1382, r1388; +} +{ +sub.f16x2 r1394, r951, r952; +} +{ +mul.f16x2 r1397, r1394, r929; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 %13, r1376, r1400; +} +{ +add.f16x2 r1406, r957, r958; +} +{ +mul.f16x2 r1409, r1406, r926; +} +{ +add.f16x2 r1412, r960, r1409; +} +{ +add.f16x2 r1415, r963, r964; +} +{ +mul.f16x2 r1418, r1415, r930; +} +{ +add.f16x2 r1421, r1412, r1418; +} +{ +add.f16x2 r1424, r969, r970; +} +{ +mul.f16x2 r1427, r1424, r934; +} +{ +add.f16x2 r1430, r1421, r1427; +} +{ +sub.f16x2 r1433, r939, r940; +} +{ +mul.f16x2 r1436, r1433, r927; +} +{ +sub.f16x2 
r1439, r945, r946; +} +{ +mul.f16x2 r1442, r1439, r932; +} +{ +add.f16x2 r1445, r1436, r1442; +} +{ +sub.f16x2 r1448, r951, r952; +} +{ +mul.f16x2 r1451, r1448, r936; +} +{ +add.f16x2 r1454, r1445, r1451; +} +{ +add.f16x2 %5, r1430, r1454; +} +{ +add.f16x2 r1460, r957, r958; +} +{ +mul.f16x2 r1463, r1460, r926; +} +{ +add.f16x2 r1466, r960, r1463; +} +{ +add.f16x2 r1469, r963, r964; +} +{ +mul.f16x2 r1472, r1469, r930; +} +{ +add.f16x2 r1475, r1466, r1472; +} +{ +add.f16x2 r1478, r969, r970; +} +{ +mul.f16x2 r1481, r1478, r934; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 r1487, r939, r940; +} +{ +mul.f16x2 r1490, r1487, r927; +} +{ +sub.f16x2 r1493, r945, r946; +} +{ +mul.f16x2 r1496, r1493, r932; +} +{ +add.f16x2 r1499, r1490, r1496; +} +{ +sub.f16x2 r1502, r951, r952; +} +{ +mul.f16x2 r1505, r1502, r936; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 %11, r1484, r1508; +} +{ +add.f16x2 r1514, r957, r958; +} +{ +mul.f16x2 r1517, r1514, r928; +} +{ +add.f16x2 r1520, r960, r1517; +} +{ +add.f16x2 r1523, r963, r964; +} +{ +mul.f16x2 r1526, r1523, r934; +} +{ +add.f16x2 r1529, r1520, r1526; +} +{ +add.f16x2 r1532, r969, r970; +} +{ +mul.f16x2 r1535, r1532, r926; +} +{ +add.f16x2 r1538, r1529, r1535; +} +{ +sub.f16x2 r1541, r939, r940; +} +{ +mul.f16x2 r1544, r1541, r929; +} +{ +sub.f16x2 r1547, r945, r946; +} +{ +mul.f16x2 r1550, r1547, r936; +} +{ +add.f16x2 r1553, r1544, r1550; +} +{ +sub.f16x2 r1556, r951, r952; +} +{ +mul.f16x2 r1559, r1556, r927; +} +{ +add.f16x2 r1562, r1553, r1559; +} +{ +add.f16x2 %7, r1538, r1562; +} +{ +add.f16x2 r1568, r957, r958; +} +{ +mul.f16x2 r1571, r1568, r928; +} +{ +add.f16x2 r1574, r960, r1571; +} +{ +add.f16x2 r1577, r963, r964; +} +{ +mul.f16x2 r1580, r1577, r934; +} +{ +add.f16x2 r1583, r1574, r1580; +} +{ +add.f16x2 r1586, r969, r970; +} +{ +mul.f16x2 r1589, r1586, r926; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +sub.f16x2 r1595, r939, r940; +} +{ +mul.f16x2 r1598, r1595, r929; +} +{ +sub.f16x2 r1601, r945, 
r946; +} +{ +mul.f16x2 r1604, r1601, r936; +} +{ +add.f16x2 r1607, r1598, r1604; +} +{ +sub.f16x2 r1610, r951, r952; +} +{ +mul.f16x2 r1613, r1610, r927; +} +{ +add.f16x2 r1616, r1607, r1613; +} +{ +sub.f16x2 %9, r1592, r1616; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..6feea69c13d4f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp16_inv.hpp.inc @@ -0,0 +1,3767 @@ +#ifndef CUFFTDX_FFT_49_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_49_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1120, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<60>; +.reg .b32 r<1640>; +.reg .b64 rd<4>; +mov.u32 r1626, %tid.x; +mov.f32 f54, 0f3F1F9D07; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r1, {low, high}; +} +mov.f32 f56, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f42, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r5, {low, high}; +} +mov.f32 f44, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f50, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r9, {low, high}; +} +mov.f32 f52, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, 
r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ +add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} +{ +add.f16x2 r107, %17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ +add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ 
+add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} +{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ +add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ +add.f16x2 r284, r275, r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ 
+add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ +add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ +mul.f16x2 r461, r458, r3; +} +{ +sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ 
+add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ +mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ +mul.f16x2 r638, r635, r7; +} +{ +add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r1626, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1627, rd3; +sub.s32 r1628, r1626, r1627; +shr.u32 r1629, r1628, 1; +add.s32 r1630, r1629, r1627; +shr.u32 r1631, r1630, 2; +mul.lo.s32 r1632, r1631, 7; +sub.s32 r1633, r1626, r1632; +cvt.rn.f32.u32 f57, r1633; +mul.f32 f58, f57, 0f3E034E46; +cos.approx.f32 f21, f58; +sin.approx.f32 f59, f58; +neg.f32 f22, f59; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +mov.u32 r1634, %tid.y; +mov.u32 r1635, %14; +mad.lo.s32 r1636, r1634, 392, r1635; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, r726, r728; +} +{ +mul.f16x2 r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; 
+mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, high}; +} +{ +mul.f16x2 r819, r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 
r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +mad.lo.s32 r1637, r1631, 392, r1636; +barrier.sync 0; +mad.lo.s32 r1638, r1633, 56, r1637; +st.shared.v2.f32 [r1638], {r32, r50}; +st.shared.v2.f32 [r1638+8], {r711, r720}; +st.shared.v2.f32 [r1638+16], {r748, r757}; +st.shared.v2.f32 [r1638+24], {r785, r794}; +st.shared.v2.f32 [r1638+32], {r822, r831}; +st.shared.v2.f32 [r1638+40], {r859, r868}; +st.shared.v2.f32 [r1638+48], {r896, r905}; +barrier.sync 0; +mad.lo.s32 r1639, r1633, -48, r1638; +ld.shared.u32 r946, [r1639]; +ld.shared.u32 r964, [r1639+4]; +ld.shared.u32 r943, [r1639+56]; +ld.shared.u32 r961, [r1639+60]; +ld.shared.u32 r949, [r1639+112]; +ld.shared.u32 r967, [r1639+116]; +ld.shared.u32 r955, [r1639+168]; +ld.shared.u32 r973, [r1639+172]; +ld.shared.u32 r956, [r1639+224]; +ld.shared.u32 r974, [r1639+228]; +ld.shared.u32 r950, [r1639+280]; +ld.shared.u32 r968, [r1639+284]; +ld.shared.u32 r944, [r1639+336]; +ld.shared.u32 r962, [r1639+340]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ +add.f16x2 r945, r946, r942; +} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 %0, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 %1, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 %2, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 %12, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ 
+add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; +} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ +mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 %4, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 %10, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 %6, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 
r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ +add.f16x2 r1272, r1263, r1269; +} +{ +sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ +mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 %8, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 %3, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 %13, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, 
r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ +add.f16x2 r1434, r1425, r1431; +} +{ +sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ +mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 %5, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 %11, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 %7, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ 
+add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ +mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 %9, r1596, r1620; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1121, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<60>; +.reg .b32 r<1640>; +.reg .b64 rd<4>; +mov.u32 r1626, %tid.y; +mov.u32 r1627, %14; +mad.lo.s32 r1628, r1626, 196, r1627; +mov.u32 r1629, %tid.x; +mov.f32 f54, 0f3F1F9D07; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r1, {low, high}; +} +mov.f32 f56, 0fBF48261C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f42, 0fBE63DC87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r5, {low, high}; +} +mov.f32 f44, 0fBF7994E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +mov.f32 f50, 0fBF66A5E5; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r9, {low, high}; +} +mov.f32 f52, 0fBEDE2602; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r10, {low, high}; +} +{ +neg.f16x2 r11, r10; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r14, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r15, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r16, {low, high}; +} +{ +add.f16x2 r17, %17, %27; +} +{ +add.f16x2 r20, %15, r17; +} +{ +add.f16x2 r23, %19, %25; +} +{ +add.f16x2 r26, r20, r23; +} +{ +add.f16x2 r29, %21, %23; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %18, %28; +} +{ +add.f16x2 r38, %16, r35; +} +{ +add.f16x2 r41, %20, %26; +} +{ +add.f16x2 r44, r38, r41; +} +{ +add.f16x2 r47, %22, %24; +} +{ +add.f16x2 r50, r44, r47; +} +{ +add.f16x2 r53, %17, %27; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %15, r56; +} +{ +add.f16x2 r62, %19, %25; +} +{ +mul.f16x2 r65, r62, r5; +} +{ +add.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %21, %23; +} +{ +mul.f16x2 r74, r71, r9; +} +{ +add.f16x2 r77, r68, r74; +} +{ +sub.f16x2 r80, %18, %28; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +sub.f16x2 r86, %20, %26; +} +{ +mul.f16x2 r89, r86, r7; +} +{ +add.f16x2 r92, r83, r89; +} +{ +sub.f16x2 r95, %22, %24; +} +{ +mul.f16x2 r98, r95, r11; +} +{ 
+add.f16x2 r101, r92, r98; +} +{ +sub.f16x2 r104, r77, r101; +} +{ +add.f16x2 r107, %17, %27; +} +{ +mul.f16x2 r110, r107, r1; +} +{ +add.f16x2 r113, %15, r110; +} +{ +add.f16x2 r116, %19, %25; +} +{ +mul.f16x2 r119, r116, r5; +} +{ +add.f16x2 r122, r113, r119; +} +{ +add.f16x2 r125, %21, %23; +} +{ +mul.f16x2 r128, r125, r9; +} +{ +add.f16x2 r131, r122, r128; +} +{ +sub.f16x2 r134, %18, %28; +} +{ +mul.f16x2 r137, r134, r3; +} +{ +sub.f16x2 r140, %20, %26; +} +{ +mul.f16x2 r143, r140, r7; +} +{ +add.f16x2 r146, r137, r143; +} +{ +sub.f16x2 r149, %22, %24; +} +{ +mul.f16x2 r152, r149, r11; +} +{ +add.f16x2 r155, r146, r152; +} +{ +add.f16x2 r158, r131, r155; +} +{ +add.f16x2 r161, %17, %27; +} +{ +mul.f16x2 r164, r161, r5; +} +{ +add.f16x2 r167, %15, r164; +} +{ +add.f16x2 r170, %19, %25; +} +{ +mul.f16x2 r173, r170, r13; +} +{ +add.f16x2 r176, r167, r173; +} +{ +add.f16x2 r179, %21, %23; +} +{ +mul.f16x2 r182, r179, r15; +} +{ +add.f16x2 r185, r176, r182; +} +{ +sub.f16x2 r188, %18, %28; +} +{ +mul.f16x2 r191, r188, r7; +} +{ +sub.f16x2 r194, %20, %26; +} +{ +mul.f16x2 r197, r194, r14; +} +{ +add.f16x2 r200, r191, r197; +} +{ +sub.f16x2 r203, %22, %24; +} +{ +mul.f16x2 r206, r203, r16; +} +{ +add.f16x2 r209, r200, r206; +} +{ +sub.f16x2 r212, r185, r209; +} +{ +add.f16x2 r215, %17, %27; +} +{ +mul.f16x2 r218, r215, r5; +} +{ +add.f16x2 r221, %15, r218; +} +{ +add.f16x2 r224, %19, %25; +} +{ +mul.f16x2 r227, r224, r13; +} +{ +add.f16x2 r230, r221, r227; +} +{ +add.f16x2 r233, %21, %23; +} +{ +mul.f16x2 r236, r233, r15; +} +{ +add.f16x2 r239, r230, r236; +} +{ +sub.f16x2 r242, %18, %28; +} +{ +mul.f16x2 r245, r242, r7; +} +{ +sub.f16x2 r248, %20, %26; +} +{ +mul.f16x2 r251, r248, r14; +} +{ +add.f16x2 r254, r245, r251; +} +{ +sub.f16x2 r257, %22, %24; +} +{ +mul.f16x2 r260, r257, r16; +} +{ +add.f16x2 r263, r254, r260; +} +{ +add.f16x2 r266, r239, r263; +} +{ +add.f16x2 r269, %17, %27; +} +{ +mul.f16x2 r272, r269, r9; +} +{ +add.f16x2 r275, %15, r272; +} +{ 
+add.f16x2 r278, %19, %25; +} +{ +mul.f16x2 r281, r278, r15; +} +{ +add.f16x2 r284, r275, r281; +} +{ +add.f16x2 r287, %21, %23; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, r284, r290; +} +{ +sub.f16x2 r296, %18, %28; +} +{ +mul.f16x2 r299, r296, r11; +} +{ +sub.f16x2 r302, %20, %26; +} +{ +mul.f16x2 r305, r302, r16; +} +{ +add.f16x2 r308, r299, r305; +} +{ +sub.f16x2 r311, %22, %24; +} +{ +mul.f16x2 r314, r311, r7; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r293, r317; +} +{ +add.f16x2 r323, %17, %27; +} +{ +mul.f16x2 r326, r323, r9; +} +{ +add.f16x2 r329, %15, r326; +} +{ +add.f16x2 r332, %19, %25; +} +{ +mul.f16x2 r335, r332, r15; +} +{ +add.f16x2 r338, r329, r335; +} +{ +add.f16x2 r341, %21, %23; +} +{ +mul.f16x2 r344, r341, r5; +} +{ +add.f16x2 r347, r338, r344; +} +{ +sub.f16x2 r350, %18, %28; +} +{ +mul.f16x2 r353, r350, r11; +} +{ +sub.f16x2 r356, %20, %26; +} +{ +mul.f16x2 r359, r356, r16; +} +{ +add.f16x2 r362, r353, r359; +} +{ +sub.f16x2 r365, %22, %24; +} +{ +mul.f16x2 r368, r365, r7; +} +{ +add.f16x2 r371, r362, r368; +} +{ +add.f16x2 r374, r347, r371; +} +{ +add.f16x2 r377, %18, %28; +} +{ +mul.f16x2 r380, r377, r1; +} +{ +add.f16x2 r383, %16, r380; +} +{ +add.f16x2 r386, %20, %26; +} +{ +mul.f16x2 r389, r386, r5; +} +{ +add.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, %22, %24; +} +{ +mul.f16x2 r398, r395, r9; +} +{ +add.f16x2 r401, r392, r398; +} +{ +sub.f16x2 r404, %17, %27; +} +{ +mul.f16x2 r407, r404, r3; +} +{ +sub.f16x2 r410, %19, %25; +} +{ +mul.f16x2 r413, r410, r7; +} +{ +add.f16x2 r416, r407, r413; +} +{ +sub.f16x2 r419, %21, %23; +} +{ +mul.f16x2 r422, r419, r11; +} +{ +add.f16x2 r425, r416, r422; +} +{ +add.f16x2 r428, r401, r425; +} +{ +add.f16x2 r431, %18, %28; +} +{ +mul.f16x2 r434, r431, r1; +} +{ +add.f16x2 r437, %16, r434; +} +{ +add.f16x2 r440, %20, %26; +} +{ +mul.f16x2 r443, r440, r5; +} +{ +add.f16x2 r446, r437, r443; +} +{ +add.f16x2 r449, %22, %24; +} +{ +mul.f16x2 r452, r449, r9; +} +{ +add.f16x2 
r455, r446, r452; +} +{ +sub.f16x2 r458, %17, %27; +} +{ +mul.f16x2 r461, r458, r3; +} +{ +sub.f16x2 r464, %19, %25; +} +{ +mul.f16x2 r467, r464, r7; +} +{ +add.f16x2 r470, r461, r467; +} +{ +sub.f16x2 r473, %21, %23; +} +{ +mul.f16x2 r476, r473, r11; +} +{ +add.f16x2 r479, r470, r476; +} +{ +sub.f16x2 r482, r455, r479; +} +{ +add.f16x2 r485, %18, %28; +} +{ +mul.f16x2 r488, r485, r5; +} +{ +add.f16x2 r491, %16, r488; +} +{ +add.f16x2 r494, %20, %26; +} +{ +mul.f16x2 r497, r494, r13; +} +{ +add.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, %22, %24; +} +{ +mul.f16x2 r506, r503, r15; +} +{ +add.f16x2 r509, r500, r506; +} +{ +sub.f16x2 r512, %17, %27; +} +{ +mul.f16x2 r515, r512, r7; +} +{ +sub.f16x2 r518, %19, %25; +} +{ +mul.f16x2 r521, r518, r14; +} +{ +add.f16x2 r524, r515, r521; +} +{ +sub.f16x2 r527, %21, %23; +} +{ +mul.f16x2 r530, r527, r16; +} +{ +add.f16x2 r533, r524, r530; +} +{ +add.f16x2 r536, r509, r533; +} +{ +add.f16x2 r539, %18, %28; +} +{ +mul.f16x2 r542, r539, r5; +} +{ +add.f16x2 r545, %16, r542; +} +{ +add.f16x2 r548, %20, %26; +} +{ +mul.f16x2 r551, r548, r13; +} +{ +add.f16x2 r554, r545, r551; +} +{ +add.f16x2 r557, %22, %24; +} +{ +mul.f16x2 r560, r557, r15; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, %17, %27; +} +{ +mul.f16x2 r569, r566, r7; +} +{ +sub.f16x2 r572, %19, %25; +} +{ +mul.f16x2 r575, r572, r14; +} +{ +add.f16x2 r578, r569, r575; +} +{ +sub.f16x2 r581, %21, %23; +} +{ +mul.f16x2 r584, r581, r16; +} +{ +add.f16x2 r587, r578, r584; +} +{ +sub.f16x2 r590, r563, r587; +} +{ +add.f16x2 r593, %18, %28; +} +{ +mul.f16x2 r596, r593, r9; +} +{ +add.f16x2 r599, %16, r596; +} +{ +add.f16x2 r602, %20, %26; +} +{ +mul.f16x2 r605, r602, r15; +} +{ +add.f16x2 r608, r599, r605; +} +{ +add.f16x2 r611, %22, %24; +} +{ +mul.f16x2 r614, r611, r5; +} +{ +add.f16x2 r617, r608, r614; +} +{ +sub.f16x2 r620, %17, %27; +} +{ +mul.f16x2 r623, r620, r11; +} +{ +sub.f16x2 r626, %19, %25; +} +{ +mul.f16x2 r629, r626, r16; +} +{ +add.f16x2 r632, 
r623, r629; +} +{ +sub.f16x2 r635, %21, %23; +} +{ +mul.f16x2 r638, r635, r7; +} +{ +add.f16x2 r641, r632, r638; +} +{ +add.f16x2 r644, r617, r641; +} +{ +add.f16x2 r647, %18, %28; +} +{ +mul.f16x2 r650, r647, r9; +} +{ +add.f16x2 r653, %16, r650; +} +{ +add.f16x2 r656, %20, %26; +} +{ +mul.f16x2 r659, r656, r15; +} +{ +add.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %22, %24; +} +{ +mul.f16x2 r668, r665, r5; +} +{ +add.f16x2 r671, r662, r668; +} +{ +sub.f16x2 r674, %17, %27; +} +{ +mul.f16x2 r677, r674, r11; +} +{ +sub.f16x2 r680, %19, %25; +} +{ +mul.f16x2 r683, r680, r16; +} +{ +add.f16x2 r686, r677, r683; +} +{ +sub.f16x2 r689, %21, %23; +} +{ +mul.f16x2 r692, r689, r7; +} +{ +add.f16x2 r695, r686, r692; +} +{ +sub.f16x2 r698, r671, r695; +} +mul.wide.u32 rd2, r1629, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r1630, rd3; +sub.s32 r1631, r1629, r1630; +shr.u32 r1632, r1631, 1; +add.s32 r1633, r1632, r1630; +shr.u32 r1634, r1633, 2; +mul.lo.s32 r1635, r1634, 7; +sub.s32 r1636, r1629, r1635; +mad.lo.s32 r1637, r1634, 196, r1628; +cvt.rn.f32.u32 f57, r1636; +mul.f32 f58, f57, 0f3E034E46; +cos.approx.f32 f21, f58; +sin.approx.f32 f59, f58; +neg.f32 f22, f59; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r701, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r706, {high, high}; +} +{ +mul.f16x2 r708, r428, r706; +} +{ +fma.rn.f16x2 r711, r104, r704, r708; +} +{ +mul.f16x2 r715, r104, r706; +} +{ +neg.f16x2 r718, r715; +} +{ +fma.rn.f16x2 r720, r428, r704, r718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r726, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r728, {low, high}; +} +{ +mul.f16x2 r729, 
r726, r728; +} +{ +mul.f16x2 r732, r701, r724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r735, {high, low}; +} +{ +fma.rn.f16x2 r737, r729, r735, r732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r743, {high, high}; +} +{ +mul.f16x2 r745, r536, r743; +} +{ +fma.rn.f16x2 r748, r212, r741, r745; +} +{ +mul.f16x2 r752, r212, r743; +} +{ +neg.f16x2 r755, r752; +} +{ +fma.rn.f16x2 r757, r536, r741, r755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r765, {low, high}; +} +{ +mul.f16x2 r766, r763, r765; +} +{ +mul.f16x2 r769, r737, r761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r737; +mov.b32 r772, {high, low}; +} +{ +fma.rn.f16x2 r774, r766, r772, r769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r780, {high, high}; +} +{ +mul.f16x2 r782, r644, r780; +} +{ +fma.rn.f16x2 r785, r320, r778, r782; +} +{ +mul.f16x2 r789, r320, r780; +} +{ +neg.f16x2 r792, r789; +} +{ +fma.rn.f16x2 r794, r644, r778, r792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r798, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r800, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r802, {low, high}; +} +{ +mul.f16x2 r803, r800, r802; +} +{ +mul.f16x2 r806, r774, r798; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r774; +mov.b32 r809, {high, low}; +} +{ +fma.rn.f16x2 r811, r803, r809, r806; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r817, {high, 
high}; +} +{ +mul.f16x2 r819, r698, r817; +} +{ +fma.rn.f16x2 r822, r374, r815, r819; +} +{ +mul.f16x2 r826, r374, r817; +} +{ +neg.f16x2 r829, r826; +} +{ +fma.rn.f16x2 r831, r698, r815, r829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r835, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r837, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r839, {low, high}; +} +{ +mul.f16x2 r840, r837, r839; +} +{ +mul.f16x2 r843, r811, r835; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r811; +mov.b32 r846, {high, low}; +} +{ +fma.rn.f16x2 r848, r840, r846, r843; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r854, {high, high}; +} +{ +mul.f16x2 r856, r590, r854; +} +{ +fma.rn.f16x2 r859, r266, r852, r856; +} +{ +mul.f16x2 r863, r266, r854; +} +{ +neg.f16x2 r866, r863; +} +{ +fma.rn.f16x2 r868, r590, r852, r866; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r872, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r701; +mov.b32 r874, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r876, {low, high}; +} +{ +mul.f16x2 r877, r874, r876; +} +{ +mul.f16x2 r880, r848, r872; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r848; +mov.b32 r883, {high, low}; +} +{ +fma.rn.f16x2 r885, r877, r883, r880; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r885; +mov.b32 r891, {high, high}; +} +{ +mul.f16x2 r893, r482, r891; +} +{ +fma.rn.f16x2 r896, r158, r889, r893; +} +{ +mul.f16x2 r900, r158, r891; +} +{ +neg.f16x2 r903, r900; +} +{ +fma.rn.f16x2 r905, r482, r889, r903; +} +barrier.sync 0; +mad.lo.s32 r1638, r1636, 28, r1637; +st.shared.u32 [r1638], r32; +st.shared.u32 [r1638+4], r711; +st.shared.u32 [r1638+8], 
r748; +st.shared.u32 [r1638+12], r785; +st.shared.u32 [r1638+16], r822; +st.shared.u32 [r1638+20], r859; +st.shared.u32 [r1638+24], r896; +barrier.sync 0; +mad.lo.s32 r1639, r1636, -24, r1638; +ld.shared.u32 r946, [r1639]; +ld.shared.u32 r943, [r1639+28]; +ld.shared.u32 r949, [r1639+56]; +ld.shared.u32 r955, [r1639+84]; +ld.shared.u32 r956, [r1639+112]; +ld.shared.u32 r950, [r1639+140]; +ld.shared.u32 r944, [r1639+168]; +barrier.sync 0; +st.shared.u32 [r1638], r50; +st.shared.u32 [r1638+4], r720; +st.shared.u32 [r1638+8], r757; +st.shared.u32 [r1638+12], r794; +st.shared.u32 [r1638+16], r831; +st.shared.u32 [r1638+20], r868; +st.shared.u32 [r1638+24], r905; +barrier.sync 0; +ld.shared.u32 r964, [r1639]; +ld.shared.u32 r961, [r1639+28]; +ld.shared.u32 r967, [r1639+56]; +ld.shared.u32 r973, [r1639+84]; +ld.shared.u32 r974, [r1639+112]; +ld.shared.u32 r968, [r1639+140]; +ld.shared.u32 r962, [r1639+168]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r926, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r927, {low, high}; +} +{ +neg.f16x2 r928, r927; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f42; +cvt.rn.f16.f32 high, f42; +mov.b32 r930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f44; +cvt.rn.f16.f32 high, f44; +mov.b32 r931, {low, high}; +} +{ +neg.f16x2 r932, r931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r934, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r935, {low, high}; +} +{ +neg.f16x2 r936, r935; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f50; +cvt.rn.f16.f32 high, f50; +mov.b32 r938, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f52; +cvt.rn.f16.f32 high, f52; +mov.b32 r939, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f54; +cvt.rn.f16.f32 high, f54; +mov.b32 r940, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r941, {low, high}; +} +{ +add.f16x2 r942, r943, r944; +} +{ +add.f16x2 r945, r946, r942; +} +{ +add.f16x2 r948, r949, r950; +} +{ +add.f16x2 r951, r945, r948; +} +{ +add.f16x2 r954, r955, r956; +} +{ +add.f16x2 %0, r951, r954; +} +{ +add.f16x2 r960, r961, r962; +} +{ +add.f16x2 r963, r964, r960; +} +{ +add.f16x2 r966, r967, r968; +} +{ +add.f16x2 r969, r963, r966; +} +{ +add.f16x2 r972, r973, r974; +} +{ +add.f16x2 %1, r969, r972; +} +{ +add.f16x2 r978, r943, r944; +} +{ +mul.f16x2 r981, r978, r926; +} +{ +add.f16x2 r984, r946, r981; +} +{ +add.f16x2 r987, r949, r950; +} +{ +mul.f16x2 r990, r987, r930; +} +{ +add.f16x2 r993, r984, r990; +} +{ +add.f16x2 r996, r955, r956; +} +{ +mul.f16x2 r999, r996, r934; +} +{ +add.f16x2 r1002, r993, r999; +} +{ +sub.f16x2 r1005, r961, r962; +} +{ +mul.f16x2 r1008, r1005, r928; +} +{ +sub.f16x2 r1011, r967, r968; +} +{ +mul.f16x2 r1014, r1011, r932; +} +{ +add.f16x2 r1017, r1008, r1014; +} +{ +sub.f16x2 r1020, r973, r974; +} +{ +mul.f16x2 r1023, r1020, r936; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 %2, r1002, r1026; +} +{ +add.f16x2 r1032, r943, r944; +} +{ +mul.f16x2 r1035, r1032, r926; +} +{ +add.f16x2 r1038, r946, r1035; +} +{ +add.f16x2 r1041, r949, r950; +} +{ +mul.f16x2 r1044, r1041, r930; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r955, r956; +} +{ +mul.f16x2 r1053, r1050, r934; +} +{ +add.f16x2 r1056, r1047, r1053; +} +{ +sub.f16x2 r1059, r961, r962; +} +{ +mul.f16x2 r1062, r1059, r928; +} +{ +sub.f16x2 r1065, r967, r968; +} +{ +mul.f16x2 r1068, r1065, r932; +} +{ +add.f16x2 r1071, r1062, r1068; +} +{ +sub.f16x2 r1074, r973, r974; +} +{ +mul.f16x2 r1077, r1074, r936; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +add.f16x2 %12, r1056, r1080; +} +{ +add.f16x2 r1086, r943, r944; +} +{ +mul.f16x2 r1089, r1086, r930; +} +{ +add.f16x2 r1092, r946, r1089; +} +{ +add.f16x2 r1095, r949, r950; +} +{ +mul.f16x2 r1098, r1095, r938; 
+} +{ +add.f16x2 r1101, r1092, r1098; +} +{ +add.f16x2 r1104, r955, r956; +} +{ +mul.f16x2 r1107, r1104, r940; +} +{ +add.f16x2 r1110, r1101, r1107; +} +{ +sub.f16x2 r1113, r961, r962; +} +{ +mul.f16x2 r1116, r1113, r932; +} +{ +sub.f16x2 r1119, r967, r968; +} +{ +mul.f16x2 r1122, r1119, r939; +} +{ +add.f16x2 r1125, r1116, r1122; +} +{ +sub.f16x2 r1128, r973, r974; +} +{ +mul.f16x2 r1131, r1128, r941; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 %4, r1110, r1134; +} +{ +add.f16x2 r1140, r943, r944; +} +{ +mul.f16x2 r1143, r1140, r930; +} +{ +add.f16x2 r1146, r946, r1143; +} +{ +add.f16x2 r1149, r949, r950; +} +{ +mul.f16x2 r1152, r1149, r938; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r955, r956; +} +{ +mul.f16x2 r1161, r1158, r940; +} +{ +add.f16x2 r1164, r1155, r1161; +} +{ +sub.f16x2 r1167, r961, r962; +} +{ +mul.f16x2 r1170, r1167, r932; +} +{ +sub.f16x2 r1173, r967, r968; +} +{ +mul.f16x2 r1176, r1173, r939; +} +{ +add.f16x2 r1179, r1170, r1176; +} +{ +sub.f16x2 r1182, r973, r974; +} +{ +mul.f16x2 r1185, r1182, r941; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +add.f16x2 %10, r1164, r1188; +} +{ +add.f16x2 r1194, r943, r944; +} +{ +mul.f16x2 r1197, r1194, r934; +} +{ +add.f16x2 r1200, r946, r1197; +} +{ +add.f16x2 r1203, r949, r950; +} +{ +mul.f16x2 r1206, r1203, r940; +} +{ +add.f16x2 r1209, r1200, r1206; +} +{ +add.f16x2 r1212, r955, r956; +} +{ +mul.f16x2 r1215, r1212, r930; +} +{ +add.f16x2 r1218, r1209, r1215; +} +{ +sub.f16x2 r1221, r961, r962; +} +{ +mul.f16x2 r1224, r1221, r936; +} +{ +sub.f16x2 r1227, r967, r968; +} +{ +mul.f16x2 r1230, r1227, r941; +} +{ +add.f16x2 r1233, r1224, r1230; +} +{ +sub.f16x2 r1236, r973, r974; +} +{ +mul.f16x2 r1239, r1236, r932; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 %6, r1218, r1242; +} +{ +add.f16x2 r1248, r943, r944; +} +{ +mul.f16x2 r1251, r1248, r934; +} +{ +add.f16x2 r1254, r946, r1251; +} +{ +add.f16x2 r1257, r949, r950; +} +{ +mul.f16x2 r1260, r1257, r940; +} +{ 
+add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r955, r956; +} +{ +mul.f16x2 r1269, r1266, r930; +} +{ +add.f16x2 r1272, r1263, r1269; +} +{ +sub.f16x2 r1275, r961, r962; +} +{ +mul.f16x2 r1278, r1275, r936; +} +{ +sub.f16x2 r1281, r967, r968; +} +{ +mul.f16x2 r1284, r1281, r941; +} +{ +add.f16x2 r1287, r1278, r1284; +} +{ +sub.f16x2 r1290, r973, r974; +} +{ +mul.f16x2 r1293, r1290, r932; +} +{ +add.f16x2 r1296, r1287, r1293; +} +{ +add.f16x2 %8, r1272, r1296; +} +{ +add.f16x2 r1302, r961, r962; +} +{ +mul.f16x2 r1305, r1302, r926; +} +{ +add.f16x2 r1308, r964, r1305; +} +{ +add.f16x2 r1311, r967, r968; +} +{ +mul.f16x2 r1314, r1311, r930; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +add.f16x2 r1320, r973, r974; +} +{ +mul.f16x2 r1323, r1320, r934; +} +{ +add.f16x2 r1326, r1317, r1323; +} +{ +sub.f16x2 r1329, r943, r944; +} +{ +mul.f16x2 r1332, r1329, r928; +} +{ +sub.f16x2 r1335, r949, r950; +} +{ +mul.f16x2 r1338, r1335, r932; +} +{ +add.f16x2 r1341, r1332, r1338; +} +{ +sub.f16x2 r1344, r955, r956; +} +{ +mul.f16x2 r1347, r1344, r936; +} +{ +add.f16x2 r1350, r1341, r1347; +} +{ +add.f16x2 %3, r1326, r1350; +} +{ +add.f16x2 r1356, r961, r962; +} +{ +mul.f16x2 r1359, r1356, r926; +} +{ +add.f16x2 r1362, r964, r1359; +} +{ +add.f16x2 r1365, r967, r968; +} +{ +mul.f16x2 r1368, r1365, r930; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r973, r974; +} +{ +mul.f16x2 r1377, r1374, r934; +} +{ +add.f16x2 r1380, r1371, r1377; +} +{ +sub.f16x2 r1383, r943, r944; +} +{ +mul.f16x2 r1386, r1383, r928; +} +{ +sub.f16x2 r1389, r949, r950; +} +{ +mul.f16x2 r1392, r1389, r932; +} +{ +add.f16x2 r1395, r1386, r1392; +} +{ +sub.f16x2 r1398, r955, r956; +} +{ +mul.f16x2 r1401, r1398, r936; +} +{ +add.f16x2 r1404, r1395, r1401; +} +{ +sub.f16x2 %13, r1380, r1404; +} +{ +add.f16x2 r1410, r961, r962; +} +{ +mul.f16x2 r1413, r1410, r930; +} +{ +add.f16x2 r1416, r964, r1413; +} +{ +add.f16x2 r1419, r967, r968; +} +{ +mul.f16x2 r1422, r1419, r938; +} +{ +add.f16x2 
r1425, r1416, r1422; +} +{ +add.f16x2 r1428, r973, r974; +} +{ +mul.f16x2 r1431, r1428, r940; +} +{ +add.f16x2 r1434, r1425, r1431; +} +{ +sub.f16x2 r1437, r943, r944; +} +{ +mul.f16x2 r1440, r1437, r932; +} +{ +sub.f16x2 r1443, r949, r950; +} +{ +mul.f16x2 r1446, r1443, r939; +} +{ +add.f16x2 r1449, r1440, r1446; +} +{ +sub.f16x2 r1452, r955, r956; +} +{ +mul.f16x2 r1455, r1452, r941; +} +{ +add.f16x2 r1458, r1449, r1455; +} +{ +add.f16x2 %5, r1434, r1458; +} +{ +add.f16x2 r1464, r961, r962; +} +{ +mul.f16x2 r1467, r1464, r930; +} +{ +add.f16x2 r1470, r964, r1467; +} +{ +add.f16x2 r1473, r967, r968; +} +{ +mul.f16x2 r1476, r1473, r938; +} +{ +add.f16x2 r1479, r1470, r1476; +} +{ +add.f16x2 r1482, r973, r974; +} +{ +mul.f16x2 r1485, r1482, r940; +} +{ +add.f16x2 r1488, r1479, r1485; +} +{ +sub.f16x2 r1491, r943, r944; +} +{ +mul.f16x2 r1494, r1491, r932; +} +{ +sub.f16x2 r1497, r949, r950; +} +{ +mul.f16x2 r1500, r1497, r939; +} +{ +add.f16x2 r1503, r1494, r1500; +} +{ +sub.f16x2 r1506, r955, r956; +} +{ +mul.f16x2 r1509, r1506, r941; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +sub.f16x2 %11, r1488, r1512; +} +{ +add.f16x2 r1518, r961, r962; +} +{ +mul.f16x2 r1521, r1518, r934; +} +{ +add.f16x2 r1524, r964, r1521; +} +{ +add.f16x2 r1527, r967, r968; +} +{ +mul.f16x2 r1530, r1527, r940; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +add.f16x2 r1536, r973, r974; +} +{ +mul.f16x2 r1539, r1536, r930; +} +{ +add.f16x2 r1542, r1533, r1539; +} +{ +sub.f16x2 r1545, r943, r944; +} +{ +mul.f16x2 r1548, r1545, r936; +} +{ +sub.f16x2 r1551, r949, r950; +} +{ +mul.f16x2 r1554, r1551, r941; +} +{ +add.f16x2 r1557, r1548, r1554; +} +{ +sub.f16x2 r1560, r955, r956; +} +{ +mul.f16x2 r1563, r1560, r932; +} +{ +add.f16x2 r1566, r1557, r1563; +} +{ +add.f16x2 %7, r1542, r1566; +} +{ +add.f16x2 r1572, r961, r962; +} +{ +mul.f16x2 r1575, r1572, r934; +} +{ +add.f16x2 r1578, r964, r1575; +} +{ +add.f16x2 r1581, r967, r968; +} +{ +mul.f16x2 r1584, r1581, r940; +} +{ +add.f16x2 r1587, 
r1578, r1584; +} +{ +add.f16x2 r1590, r973, r974; +} +{ +mul.f16x2 r1593, r1590, r930; +} +{ +add.f16x2 r1596, r1587, r1593; +} +{ +sub.f16x2 r1599, r943, r944; +} +{ +mul.f16x2 r1602, r1599, r936; +} +{ +sub.f16x2 r1605, r949, r950; +} +{ +mul.f16x2 r1608, r1605, r941; +} +{ +add.f16x2 r1611, r1602, r1608; +} +{ +sub.f16x2 r1614, r955, r956; +} +{ +mul.f16x2 r1617, r1614, r932; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +sub.f16x2 %9, r1596, r1620; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..12817d4a43627 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_fwd.hpp.inc @@ -0,0 +1,580 @@ +#ifndef CUFFTDX_FFT_49_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_49_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<172, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<292>; +.reg .b32 r<15>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 392, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %18, %32; +add.f32 f30, %16, f29; +add.f32 f31, %21, %29; +add.f32 f32, f31, f30; +add.f32 f33, %24, %26; +add.f32 f34, %20, %33; +add.f32 f35, %17, f34; +add.f32 f36, %23, %31; +add.f32 f37, f36, f35; +add.f32 f38, %25, %28; +fma.rn.f32 f39, f29, 0f3F1F9D07, %16; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %20, %33; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %23, %31; +mul.f32 f47, f46, 0fBF7994E0; +sub.f32 f48, f47, f45; +sub.f32 f49, %25, %28; +mul.f32 f50, f49, 0f3EDE2602; +sub.f32 f51, f48, f50; +sub.f32 f52, f43, f51; +add.f32 f53, f51, f43; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %16, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f44, 0f3F7994E0; +mul.f32 f60, f46, 0f3EDE2602; +sub.f32 f61, f60, f59; +fma.rn.f32 f62, f49, 0f3F48261C, f61; +sub.f32 f63, f58, f62; +add.f32 f64, f62, f58; +mul.f32 f65, f29, 0f3F66A5E5; +sub.f32 f66, %16, f65; +fma.rn.f32 f67, f31, 0f3F1F9D07, f66; +mul.f32 f68, f33, 0f3E63DC87; +sub.f32 f69, f67, f68; +mul.f32 f70, f44, 0f3EDE2602; +mul.f32 f71, f46, 0f3F48261C; +sub.f32 f72, f71, f70; +mul.f32 f73, f49, 0f3F7994E0; +sub.f32 f74, f72, f73; +sub.f32 f75, f69, f74; +add.f32 f76, f74, f69; +fma.rn.f32 f77, f34, 0f3F1F9D07, %17; +mul.f32 f78, f36, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f38, 0f3F66A5E5; +sub.f32 f81, f79, f80; +sub.f32 f82, %18, %32; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %21, %29; +mul.f32 f85, f84, 0fBF7994E0; +sub.f32 f86, f85, f83; +sub.f32 f87, %24, %26; +mul.f32 f88, f87, 0f3EDE2602; +sub.f32 f89, f86, f88; +add.f32 f90, f89, f81; +sub.f32 f91, f81, f89; +mul.f32 f92, f34, 
0f3E63DC87; +sub.f32 f93, %17, f92; +mul.f32 f94, f36, 0f3F66A5E5; +sub.f32 f95, f93, f94; +fma.rn.f32 f96, f38, 0f3F1F9D07, f95; +mul.f32 f97, f82, 0f3F7994E0; +mul.f32 f98, f84, 0f3EDE2602; +sub.f32 f99, f98, f97; +fma.rn.f32 f100, f87, 0f3F48261C, f99; +add.f32 f101, f100, f96; +sub.f32 f102, f96, f100; +mul.f32 f103, f34, 0f3F66A5E5; +sub.f32 f104, %17, f103; +fma.rn.f32 f105, f36, 0f3F1F9D07, f104; +mul.f32 f106, f38, 0f3E63DC87; +sub.f32 f107, f105, f106; +mul.f32 f108, f82, 0f3EDE2602; +mul.f32 f109, f84, 0f3F48261C; +sub.f32 f110, f109, f108; +mul.f32 f111, f87, 0f3F7994E0; +sub.f32 f112, f110, f111; +add.f32 f113, f112, f107; +sub.f32 f114, f107, f112; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 392, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f115, f116}, [rd6]; +mul.f32 f119, f115, f52; +mul.f32 f120, f116, f90; +mul.f32 f121, f115, f90; +mul.f32 f122, f115, f115; +mul.f32 f123, f116, f116; +sub.f32 f124, f122, f123; +mul.f32 f125, f116, f115; +fma.rn.f32 f126, f116, f115, f125; +mul.f32 f127, f124, f63; +mul.f32 f128, f126, f101; +mul.f32 f129, f124, f101; +mul.f32 f130, f115, f124; +mul.f32 f131, f116, f126; +sub.f32 f132, f130, f131; +mul.f32 f133, f115, f126; +fma.rn.f32 f134, f116, f124, f133; +mul.f32 f135, f132, f75; +mul.f32 f136, f134, f113; +mul.f32 f137, f132, f113; +mul.f32 f138, f115, f132; +mul.f32 f139, f116, f134; +sub.f32 f140, f138, f139; +mul.f32 f141, f115, f134; +fma.rn.f32 f142, f116, f132, f141; +mul.f32 f143, f140, f76; +mul.f32 f144, f142, f114; +mul.f32 f145, f140, f114; +mul.f32 f146, f115, f140; +mul.f32 f147, f116, f142; +sub.f32 f148, f146, f147; +mul.f32 f149, f115, f142; +fma.rn.f32 f150, f116, f140, f149; +mul.f32 f151, f148, f64; +mul.f32 f152, f150, f102; +mul.f32 f153, f148, f102; +mul.f32 
f154, f115, f148; +mul.f32 f155, f116, f150; +sub.f32 f156, f154, f155; +mul.f32 f157, f115, f150; +fma.rn.f32 f158, f116, f148, f157; +mul.f32 f159, f156, f53; +mul.f32 f160, f158, f91; +mul.f32 f161, f156, f91; +barrier.sync 0; +mad.lo.s32 r13, r11, 56, r12; +add.f32 f162, f38, f37; +add.f32 f163, f33, f32; +st.shared.v2.f32 [r13], {f163, f162}; +fma.rn.f32 f164, f116, f52, f121; +sub.f32 f165, f119, f120; +st.shared.v2.f32 [r13+8], {f165, f164}; +fma.rn.f32 f166, f126, f63, f129; +sub.f32 f167, f127, f128; +st.shared.v2.f32 [r13+16], {f167, f166}; +sub.f32 f168, f135, f136; +fma.rn.f32 f169, f134, f75, f137; +st.shared.v2.f32 [r13+24], {f168, f169}; +fma.rn.f32 f170, f142, f76, f145; +sub.f32 f171, f143, f144; +st.shared.v2.f32 [r13+32], {f171, f170}; +fma.rn.f32 f172, f150, f64, f153; +sub.f32 f173, f151, f152; +st.shared.v2.f32 [r13+40], {f173, f172}; +fma.rn.f32 f174, f158, f53, f161; +sub.f32 f175, f159, f160; +st.shared.v2.f32 [r13+48], {f175, f174}; +barrier.sync 0; +mad.lo.s32 r14, r11, -48, r13; +ld.shared.v2.f32 {f176, f177}, [r14]; +ld.shared.v2.f32 {f180, f181}, [r14+56]; +ld.shared.v2.f32 {f184, f185}, [r14+112]; +ld.shared.v2.f32 {f188, f189}, [r14+168]; +ld.shared.v2.f32 {f192, f193}, [r14+224]; +ld.shared.v2.f32 {f196, f197}, [r14+280]; +ld.shared.v2.f32 {f200, f201}, [r14+336]; +add.f32 f204, f180, f200; +add.f32 f205, f176, f204; +add.f32 f206, f184, f196; +add.f32 f207, f206, f205; +add.f32 f208, f188, f192; +add.f32 f209, f181, f201; +add.f32 f210, f177, f209; +add.f32 f211, f185, f197; +add.f32 f212, f211, f210; +add.f32 f213, f189, f193; +fma.rn.f32 f214, f204, 0f3F1F9D07, f176; +mul.f32 f215, f206, 0f3E63DC87; +sub.f32 f216, f214, f215; +mul.f32 f217, f208, 0f3F66A5E5; +sub.f32 f218, f216, f217; +sub.f32 f219, f181, f201; +mul.f32 f220, f219, 0f3F48261C; +sub.f32 f221, f185, f197; +mul.f32 f222, f221, 0fBF7994E0; +sub.f32 f223, f222, f220; +sub.f32 f224, f189, f193; +mul.f32 f225, f224, 0f3EDE2602; +sub.f32 f226, f223, f225; +mul.f32 f227, 
f204, 0f3E63DC87; +sub.f32 f228, f176, f227; +mul.f32 f229, f206, 0f3F66A5E5; +sub.f32 f230, f228, f229; +fma.rn.f32 f231, f208, 0f3F1F9D07, f230; +mul.f32 f232, f219, 0f3F7994E0; +mul.f32 f233, f221, 0f3EDE2602; +sub.f32 f234, f233, f232; +fma.rn.f32 f235, f224, 0f3F48261C, f234; +mul.f32 f236, f204, 0f3F66A5E5; +sub.f32 f237, f176, f236; +fma.rn.f32 f238, f206, 0f3F1F9D07, f237; +mul.f32 f239, f208, 0f3E63DC87; +sub.f32 f240, f238, f239; +mul.f32 f241, f219, 0f3EDE2602; +mul.f32 f242, f221, 0f3F48261C; +sub.f32 f243, f242, f241; +mul.f32 f244, f224, 0f3F7994E0; +sub.f32 f245, f243, f244; +fma.rn.f32 f246, f209, 0f3F1F9D07, f177; +mul.f32 f247, f211, 0f3E63DC87; +sub.f32 f248, f246, f247; +mul.f32 f249, f213, 0f3F66A5E5; +sub.f32 f250, f248, f249; +sub.f32 f251, f180, f200; +mul.f32 f252, f251, 0f3F48261C; +sub.f32 f253, f184, f196; +mul.f32 f254, f253, 0fBF7994E0; +sub.f32 f255, f254, f252; +sub.f32 f256, f188, f192; +mul.f32 f257, f256, 0f3EDE2602; +sub.f32 f258, f255, f257; +mul.f32 f259, f209, 0f3E63DC87; +sub.f32 f260, f177, f259; +mul.f32 f261, f211, 0f3F66A5E5; +sub.f32 f262, f260, f261; +fma.rn.f32 f263, f213, 0f3F1F9D07, f262; +mul.f32 f264, f251, 0f3F7994E0; +mul.f32 f265, f253, 0f3EDE2602; +sub.f32 f266, f265, f264; +fma.rn.f32 f267, f256, 0f3F48261C, f266; +mul.f32 f268, f209, 0f3F66A5E5; +sub.f32 f269, f177, f268; +fma.rn.f32 f270, f211, 0f3F1F9D07, f269; +mul.f32 f271, f213, 0f3E63DC87; +sub.f32 f272, f270, f271; +mul.f32 f273, f251, 0f3EDE2602; +mul.f32 f274, f253, 0f3F48261C; +sub.f32 f275, f274, f273; +mul.f32 f276, f256, 0f3F7994E0; +sub.f32 f277, f275, f276; +add.f32 %1, f213, f212; +add.f32 %0, f208, f207; +add.f32 %3, f258, f250; +sub.f32 %2, f218, f226; +add.f32 %5, f267, f263; +sub.f32 %4, f231, f235; +add.f32 %7, f277, f272; +sub.f32 %6, f240, f245; +sub.f32 %9, f272, f277; +add.f32 %8, f245, f240; +sub.f32 %11, f263, f267; +add.f32 %10, f235, f231; +sub.f32 %13, f250, f258; +add.f32 %12, f226, f218; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<173, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<278>; +.reg .b32 r<15>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 196, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %18, %32; +add.f32 f30, %16, f29; +add.f32 f31, %21, %29; +add.f32 f32, f31, f30; +add.f32 f33, %24, %26; +add.f32 f34, f33, f32; +add.f32 f35, %20, %33; +add.f32 f36, %17, f35; +add.f32 f37, %23, %31; +add.f32 f38, f37, f36; +add.f32 f39, %25, %28; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %16; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %20, %33; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %23, %31; +mul.f32 f49, f48, 0fBF7994E0; +sub.f32 f50, f49, f47; +sub.f32 f51, %25, %28; +mul.f32 f52, f51, 0f3EDE2602; +sub.f32 f53, f50, f52; +sub.f32 f54, f45, f53; +add.f32 f55, f53, f45; +mul.f32 f56, f29, 0f3E63DC87; +sub.f32 f57, %16, f56; +mul.f32 f58, f31, 0f3F66A5E5; +sub.f32 f59, f57, f58; +fma.rn.f32 f60, f33, 0f3F1F9D07, f59; +mul.f32 f61, f46, 0f3F7994E0; +mul.f32 f62, f48, 0f3EDE2602; +sub.f32 f63, f62, f61; +fma.rn.f32 f64, f51, 0f3F48261C, f63; +sub.f32 f65, f60, f64; +add.f32 f66, f64, f60; +mul.f32 f67, f29, 0f3F66A5E5; +sub.f32 f68, %16, f67; +fma.rn.f32 f69, f31, 0f3F1F9D07, f68; +mul.f32 f70, f33, 0f3E63DC87; 
+sub.f32 f71, f69, f70; +mul.f32 f72, f46, 0f3EDE2602; +mul.f32 f73, f48, 0f3F48261C; +sub.f32 f74, f73, f72; +mul.f32 f75, f51, 0f3F7994E0; +sub.f32 f76, f74, f75; +sub.f32 f77, f71, f76; +add.f32 f78, f76, f71; +fma.rn.f32 f79, f35, 0f3F1F9D07, %17; +mul.f32 f80, f37, 0f3E63DC87; +sub.f32 f81, f79, f80; +mul.f32 f82, f39, 0f3F66A5E5; +sub.f32 f83, f81, f82; +sub.f32 f84, %18, %32; +mul.f32 f85, f84, 0f3F48261C; +sub.f32 f86, %21, %29; +mul.f32 f87, f86, 0fBF7994E0; +sub.f32 f88, f87, f85; +sub.f32 f89, %24, %26; +mul.f32 f90, f89, 0f3EDE2602; +sub.f32 f91, f88, f90; +add.f32 f92, f91, f83; +sub.f32 f93, f83, f91; +mul.f32 f94, f35, 0f3E63DC87; +sub.f32 f95, %17, f94; +mul.f32 f96, f37, 0f3F66A5E5; +sub.f32 f97, f95, f96; +fma.rn.f32 f98, f39, 0f3F1F9D07, f97; +mul.f32 f99, f84, 0f3F7994E0; +mul.f32 f100, f86, 0f3EDE2602; +sub.f32 f101, f100, f99; +fma.rn.f32 f102, f89, 0f3F48261C, f101; +add.f32 f103, f102, f98; +sub.f32 f104, f98, f102; +mul.f32 f105, f35, 0f3F66A5E5; +sub.f32 f106, %17, f105; +fma.rn.f32 f107, f37, 0f3F1F9D07, f106; +mul.f32 f108, f39, 0f3E63DC87; +sub.f32 f109, f107, f108; +mul.f32 f110, f84, 0f3EDE2602; +mul.f32 f111, f86, 0f3F48261C; +sub.f32 f112, f111, f110; +mul.f32 f113, f89, 0f3F7994E0; +sub.f32 f114, f112, f113; +add.f32 f115, f114, f109; +sub.f32 f116, f109, f114; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f117, f118}, [rd6]; +mul.f32 f121, f117, f54; +mul.f32 f122, f118, f92; +sub.f32 f123, f121, f122; +mul.f32 f124, f117, f92; +fma.rn.f32 f125, f118, f54, f124; +mul.f32 f126, f117, f117; +mul.f32 f127, f118, f118; +sub.f32 f128, f126, f127; +mul.f32 f129, f118, f117; +fma.rn.f32 f130, f118, f117, f129; +mul.f32 f131, f128, f65; +mul.f32 f132, f130, f103; +sub.f32 f133, f131, f132; 
+mul.f32 f134, f128, f103; +fma.rn.f32 f135, f130, f65, f134; +mul.f32 f136, f117, f128; +mul.f32 f137, f118, f130; +sub.f32 f138, f136, f137; +mul.f32 f139, f117, f130; +fma.rn.f32 f140, f118, f128, f139; +mul.f32 f141, f138, f77; +mul.f32 f142, f140, f115; +sub.f32 f143, f141, f142; +mul.f32 f144, f138, f115; +fma.rn.f32 f145, f140, f77, f144; +mul.f32 f146, f117, f138; +mul.f32 f147, f118, f140; +sub.f32 f148, f146, f147; +mul.f32 f149, f117, f140; +fma.rn.f32 f150, f118, f138, f149; +mul.f32 f151, f148, f78; +mul.f32 f152, f150, f116; +sub.f32 f153, f151, f152; +mul.f32 f154, f148, f116; +fma.rn.f32 f155, f150, f78, f154; +mul.f32 f156, f117, f148; +mul.f32 f157, f118, f150; +sub.f32 f158, f156, f157; +mul.f32 f159, f117, f150; +fma.rn.f32 f160, f118, f148, f159; +mul.f32 f161, f158, f66; +mul.f32 f162, f160, f104; +sub.f32 f163, f161, f162; +mul.f32 f164, f158, f104; +fma.rn.f32 f165, f160, f66, f164; +mul.f32 f166, f117, f158; +mul.f32 f167, f118, f160; +sub.f32 f168, f166, f167; +mul.f32 f169, f117, f160; +fma.rn.f32 f170, f118, f158, f169; +mul.f32 f171, f168, f55; +mul.f32 f172, f170, f93; +sub.f32 f173, f171, f172; +mul.f32 f174, f168, f93; +fma.rn.f32 f175, f170, f55, f174; +mad.lo.s32 r12, r9, 196, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 28, r12; +st.shared.f32 [r13], f34; +st.shared.f32 [r13+4], f123; +st.shared.f32 [r13+8], f133; +st.shared.f32 [r13+12], f143; +st.shared.f32 [r13+16], f153; +st.shared.f32 [r13+20], f163; +st.shared.f32 [r13+24], f173; +barrier.sync 0; +mad.lo.s32 r14, r11, -24, r13; +ld.shared.f32 f176, [r14]; +ld.shared.f32 f177, [r14+28]; +ld.shared.f32 f178, [r14+56]; +ld.shared.f32 f179, [r14+84]; +ld.shared.f32 f180, [r14+112]; +ld.shared.f32 f181, [r14+140]; +ld.shared.f32 f182, [r14+168]; +barrier.sync 0; +st.shared.f32 [r13], f40; +st.shared.f32 [r13+4], f125; +st.shared.f32 [r13+8], f135; +st.shared.f32 [r13+12], f145; +st.shared.f32 [r13+16], f155; +st.shared.f32 [r13+20], f165; +st.shared.f32 [r13+24], f175; 
+barrier.sync 0; +ld.shared.f32 f183, [r14]; +ld.shared.f32 f184, [r14+28]; +ld.shared.f32 f185, [r14+56]; +ld.shared.f32 f186, [r14+84]; +ld.shared.f32 f187, [r14+112]; +ld.shared.f32 f188, [r14+140]; +ld.shared.f32 f189, [r14+168]; +add.f32 f190, f177, f182; +add.f32 f191, f176, f190; +add.f32 f192, f178, f181; +add.f32 f193, f192, f191; +add.f32 f194, f179, f180; +add.f32 f195, f184, f189; +add.f32 f196, f183, f195; +add.f32 f197, f185, f188; +add.f32 f198, f197, f196; +add.f32 f199, f186, f187; +fma.rn.f32 f200, f190, 0f3F1F9D07, f176; +mul.f32 f201, f192, 0f3E63DC87; +sub.f32 f202, f200, f201; +mul.f32 f203, f194, 0f3F66A5E5; +sub.f32 f204, f202, f203; +sub.f32 f205, f184, f189; +mul.f32 f206, f205, 0f3F48261C; +sub.f32 f207, f185, f188; +mul.f32 f208, f207, 0fBF7994E0; +sub.f32 f209, f208, f206; +sub.f32 f210, f186, f187; +mul.f32 f211, f210, 0f3EDE2602; +sub.f32 f212, f209, f211; +mul.f32 f213, f190, 0f3E63DC87; +sub.f32 f214, f176, f213; +mul.f32 f215, f192, 0f3F66A5E5; +sub.f32 f216, f214, f215; +fma.rn.f32 f217, f194, 0f3F1F9D07, f216; +mul.f32 f218, f205, 0f3F7994E0; +mul.f32 f219, f207, 0f3EDE2602; +sub.f32 f220, f219, f218; +fma.rn.f32 f221, f210, 0f3F48261C, f220; +mul.f32 f222, f190, 0f3F66A5E5; +sub.f32 f223, f176, f222; +fma.rn.f32 f224, f192, 0f3F1F9D07, f223; +mul.f32 f225, f194, 0f3E63DC87; +sub.f32 f226, f224, f225; +mul.f32 f227, f205, 0f3EDE2602; +mul.f32 f228, f207, 0f3F48261C; +sub.f32 f229, f228, f227; +mul.f32 f230, f210, 0f3F7994E0; +sub.f32 f231, f229, f230; +fma.rn.f32 f232, f195, 0f3F1F9D07, f183; +mul.f32 f233, f197, 0f3E63DC87; +sub.f32 f234, f232, f233; +mul.f32 f235, f199, 0f3F66A5E5; +sub.f32 f236, f234, f235; +sub.f32 f237, f177, f182; +mul.f32 f238, f237, 0f3F48261C; +sub.f32 f239, f178, f181; +mul.f32 f240, f239, 0fBF7994E0; +sub.f32 f241, f240, f238; +sub.f32 f242, f179, f180; +mul.f32 f243, f242, 0f3EDE2602; +sub.f32 f244, f241, f243; +mul.f32 f245, f195, 0f3E63DC87; +sub.f32 f246, f183, f245; +mul.f32 f247, f197, 
0f3F66A5E5; +sub.f32 f248, f246, f247; +fma.rn.f32 f249, f199, 0f3F1F9D07, f248; +mul.f32 f250, f237, 0f3F7994E0; +mul.f32 f251, f239, 0f3EDE2602; +sub.f32 f252, f251, f250; +fma.rn.f32 f253, f242, 0f3F48261C, f252; +mul.f32 f254, f195, 0f3F66A5E5; +sub.f32 f255, f183, f254; +fma.rn.f32 f256, f197, 0f3F1F9D07, f255; +mul.f32 f257, f199, 0f3E63DC87; +sub.f32 f258, f256, f257; +mul.f32 f259, f237, 0f3EDE2602; +mul.f32 f260, f239, 0f3F48261C; +sub.f32 f261, f260, f259; +mul.f32 f262, f242, 0f3F7994E0; +sub.f32 f263, f261, f262; +add.f32 %0, f194, f193; +add.f32 %1, f199, f198; +add.f32 %3, f244, f236; +sub.f32 %2, f204, f212; +sub.f32 %4, f217, f221; +add.f32 %5, f253, f249; +sub.f32 %6, f226, f231; +add.f32 %7, f263, f258; +add.f32 %8, f231, f226; +sub.f32 %9, f258, f263; +add.f32 %10, f221, f217; +sub.f32 %11, f249, f253; +sub.f32 %13, f236, f244; +add.f32 %12, f212, f204; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..56f9f3e89a622 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp32_inv.hpp.inc @@ -0,0 +1,564 @@ +#ifndef CUFFTDX_FFT_49_FP32_INV_PTX_HPP +#define 
CUFFTDX_FFT_49_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<374, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<284>; +.reg .b32 r<15>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 392, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %18, %32; +add.f32 f30, %16, f29; +add.f32 f31, %21, %29; +add.f32 f32, f31, f30; +add.f32 f33, %24, %26; +add.f32 f34, %20, %33; +add.f32 f35, %17, f34; +add.f32 f36, %23, %31; +add.f32 f37, f36, f35; +add.f32 f38, %25, %28; +fma.rn.f32 f39, f29, 0f3F1F9D07, %16; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %20, %33; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %23, %31; +fma.rn.f32 f47, f46, 0f3F7994E0, f45; +sub.f32 f48, %25, %28; +fma.rn.f32 f49, f48, 0f3EDE2602, f47; +sub.f32 f50, f43, f49; +add.f32 f51, f49, f43; +mul.f32 f52, f29, 0f3E63DC87; +sub.f32 f53, %16, f52; +mul.f32 f54, f31, 0f3F66A5E5; +sub.f32 f55, f53, f54; +fma.rn.f32 f56, f33, 0f3F1F9D07, f55; +mul.f32 f57, f44, 0f3F7994E0; +mul.f32 f58, f46, 0f3EDE2602; +sub.f32 f59, f57, f58; +mul.f32 f60, f48, 0f3F48261C; +sub.f32 f61, f59, f60; +sub.f32 f62, f56, f61; +add.f32 f63, f61, f56; +mul.f32 f64, f29, 0f3F66A5E5; +sub.f32 f65, %16, f64; +fma.rn.f32 f66, f31, 0f3F1F9D07, f65; +mul.f32 f67, f33, 0f3E63DC87; +sub.f32 f68, f66, f67; +mul.f32 f69, f44, 0f3EDE2602; +mul.f32 f70, f46, 0f3F48261C; +sub.f32 f71, f69, f70; +fma.rn.f32 f72, f48, 0f3F7994E0, f71; +sub.f32 f73, f68, f72; +add.f32 f74, f72, f68; +fma.rn.f32 f75, f34, 0f3F1F9D07, %17; +mul.f32 f76, f36, 0f3E63DC87; +sub.f32 f77, f75, f76; +mul.f32 f78, f38, 0f3F66A5E5; +sub.f32 f79, f77, f78; +sub.f32 f80, %18, %32; +mul.f32 f81, f80, 0f3F48261C; +sub.f32 f82, %21, %29; +fma.rn.f32 f83, f82, 0f3F7994E0, f81; +sub.f32 f84, %24, %26; +fma.rn.f32 f85, f84, 0f3EDE2602, f83; +add.f32 f86, f85, f79; +sub.f32 f87, f79, f85; 
+mul.f32 f88, f34, 0f3E63DC87; +sub.f32 f89, %17, f88; +mul.f32 f90, f36, 0f3F66A5E5; +sub.f32 f91, f89, f90; +fma.rn.f32 f92, f38, 0f3F1F9D07, f91; +mul.f32 f93, f80, 0f3F7994E0; +mul.f32 f94, f82, 0f3EDE2602; +sub.f32 f95, f93, f94; +mul.f32 f96, f84, 0f3F48261C; +sub.f32 f97, f95, f96; +add.f32 f98, f97, f92; +sub.f32 f99, f92, f97; +mul.f32 f100, f34, 0f3F66A5E5; +sub.f32 f101, %17, f100; +fma.rn.f32 f102, f36, 0f3F1F9D07, f101; +mul.f32 f103, f38, 0f3E63DC87; +sub.f32 f104, f102, f103; +mul.f32 f105, f80, 0f3EDE2602; +mul.f32 f106, f82, 0f3F48261C; +sub.f32 f107, f105, f106; +fma.rn.f32 f108, f84, 0f3F7994E0, f107; +add.f32 f109, f108, f104; +sub.f32 f110, f104, f108; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 392, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f111, f112}, [rd6]; +mul.f32 f115, f86, f112; +mul.f32 f116, f50, f112; +mul.f32 f117, f111, f86; +mul.f32 f118, f111, f111; +mul.f32 f119, f112, f112; +sub.f32 f120, f118, f119; +mul.f32 f121, f112, f111; +fma.rn.f32 f122, f112, f111, f121; +mul.f32 f123, f98, f122; +mul.f32 f124, f62, f122; +mul.f32 f125, f120, f98; +mul.f32 f126, f111, f120; +mul.f32 f127, f112, f122; +sub.f32 f128, f126, f127; +mul.f32 f129, f111, f122; +fma.rn.f32 f130, f112, f120, f129; +mul.f32 f131, f109, f130; +mul.f32 f132, f73, f130; +mul.f32 f133, f128, f109; +mul.f32 f134, f111, f128; +mul.f32 f135, f112, f130; +sub.f32 f136, f134, f135; +mul.f32 f137, f111, f130; +fma.rn.f32 f138, f112, f128, f137; +mul.f32 f139, f110, f138; +mul.f32 f140, f74, f138; +mul.f32 f141, f136, f110; +mul.f32 f142, f111, f136; +mul.f32 f143, f112, f138; +sub.f32 f144, f142, f143; +mul.f32 f145, f111, f138; +fma.rn.f32 f146, f112, f136, f145; +mul.f32 f147, f99, f146; +mul.f32 f148, f63, f146; +mul.f32 f149, f144, f99; 
+mul.f32 f150, f111, f144; +mul.f32 f151, f112, f146; +sub.f32 f152, f150, f151; +mul.f32 f153, f111, f146; +fma.rn.f32 f154, f112, f144, f153; +mul.f32 f155, f87, f154; +mul.f32 f156, f51, f154; +mul.f32 f157, f152, f87; +barrier.sync 0; +mad.lo.s32 r13, r11, 56, r12; +add.f32 f158, f38, f37; +add.f32 f159, f33, f32; +st.shared.v2.f32 [r13], {f159, f158}; +fma.rn.f32 f160, f111, f50, f115; +sub.f32 f161, f117, f116; +st.shared.v2.f32 [r13+8], {f160, f161}; +fma.rn.f32 f162, f120, f62, f123; +sub.f32 f163, f125, f124; +st.shared.v2.f32 [r13+16], {f162, f163}; +sub.f32 f164, f133, f132; +fma.rn.f32 f165, f128, f73, f131; +st.shared.v2.f32 [r13+24], {f165, f164}; +fma.rn.f32 f166, f136, f74, f139; +sub.f32 f167, f141, f140; +st.shared.v2.f32 [r13+32], {f166, f167}; +fma.rn.f32 f168, f144, f63, f147; +sub.f32 f169, f149, f148; +st.shared.v2.f32 [r13+40], {f168, f169}; +fma.rn.f32 f170, f152, f51, f155; +sub.f32 f171, f157, f156; +st.shared.v2.f32 [r13+48], {f170, f171}; +barrier.sync 0; +mad.lo.s32 r14, r11, -48, r13; +ld.shared.v2.f32 {f172, f173}, [r14]; +ld.shared.v2.f32 {f176, f177}, [r14+56]; +ld.shared.v2.f32 {f180, f181}, [r14+112]; +ld.shared.v2.f32 {f184, f185}, [r14+168]; +ld.shared.v2.f32 {f188, f189}, [r14+224]; +ld.shared.v2.f32 {f192, f193}, [r14+280]; +ld.shared.v2.f32 {f196, f197}, [r14+336]; +add.f32 f200, f176, f196; +add.f32 f201, f172, f200; +add.f32 f202, f180, f192; +add.f32 f203, f202, f201; +add.f32 f204, f184, f188; +add.f32 f205, f177, f197; +add.f32 f206, f173, f205; +add.f32 f207, f181, f193; +add.f32 f208, f207, f206; +add.f32 f209, f185, f189; +fma.rn.f32 f210, f200, 0f3F1F9D07, f172; +mul.f32 f211, f202, 0f3E63DC87; +sub.f32 f212, f210, f211; +mul.f32 f213, f204, 0f3F66A5E5; +sub.f32 f214, f212, f213; +sub.f32 f215, f177, f197; +mul.f32 f216, f215, 0f3F48261C; +sub.f32 f217, f181, f193; +fma.rn.f32 f218, f217, 0f3F7994E0, f216; +sub.f32 f219, f185, f189; +fma.rn.f32 f220, f219, 0f3EDE2602, f218; +mul.f32 f221, f200, 0f3E63DC87; +sub.f32 
f222, f172, f221; +mul.f32 f223, f202, 0f3F66A5E5; +sub.f32 f224, f222, f223; +fma.rn.f32 f225, f204, 0f3F1F9D07, f224; +mul.f32 f226, f215, 0f3F7994E0; +mul.f32 f227, f217, 0f3EDE2602; +sub.f32 f228, f226, f227; +mul.f32 f229, f219, 0f3F48261C; +sub.f32 f230, f228, f229; +mul.f32 f231, f200, 0f3F66A5E5; +sub.f32 f232, f172, f231; +fma.rn.f32 f233, f202, 0f3F1F9D07, f232; +mul.f32 f234, f204, 0f3E63DC87; +sub.f32 f235, f233, f234; +mul.f32 f236, f215, 0f3EDE2602; +mul.f32 f237, f217, 0f3F48261C; +sub.f32 f238, f236, f237; +fma.rn.f32 f239, f219, 0f3F7994E0, f238; +fma.rn.f32 f240, f205, 0f3F1F9D07, f173; +mul.f32 f241, f207, 0f3E63DC87; +sub.f32 f242, f240, f241; +mul.f32 f243, f209, 0f3F66A5E5; +sub.f32 f244, f242, f243; +sub.f32 f245, f176, f196; +mul.f32 f246, f245, 0f3F48261C; +sub.f32 f247, f180, f192; +fma.rn.f32 f248, f247, 0f3F7994E0, f246; +sub.f32 f249, f184, f188; +fma.rn.f32 f250, f249, 0f3EDE2602, f248; +mul.f32 f251, f205, 0f3E63DC87; +sub.f32 f252, f173, f251; +mul.f32 f253, f207, 0f3F66A5E5; +sub.f32 f254, f252, f253; +fma.rn.f32 f255, f209, 0f3F1F9D07, f254; +mul.f32 f256, f245, 0f3F7994E0; +mul.f32 f257, f247, 0f3EDE2602; +sub.f32 f258, f256, f257; +mul.f32 f259, f249, 0f3F48261C; +sub.f32 f260, f258, f259; +mul.f32 f261, f205, 0f3F66A5E5; +sub.f32 f262, f173, f261; +fma.rn.f32 f263, f207, 0f3F1F9D07, f262; +mul.f32 f264, f209, 0f3E63DC87; +sub.f32 f265, f263, f264; +mul.f32 f266, f245, 0f3EDE2602; +mul.f32 f267, f247, 0f3F48261C; +sub.f32 f268, f266, f267; +fma.rn.f32 f269, f249, 0f3F7994E0, f268; +add.f32 %1, f209, f208; +add.f32 %0, f204, f203; +add.f32 %3, f250, f244; +sub.f32 %2, f214, f220; +add.f32 %5, f260, f255; +sub.f32 %4, f225, f230; +add.f32 %7, f269, f265; +sub.f32 %6, f235, f239; +sub.f32 %9, f265, f269; +add.f32 %8, f239, f235; +sub.f32 %11, f255, f260; +add.f32 %10, f230, f225; +sub.f32 %13, f244, f250; +add.f32 %12, f220, f214; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), 
"=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<375, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<270>; +.reg .b32 r<15>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 196, r2; +mov.u32 r4, %tid.x; +add.f32 f29, %18, %32; +add.f32 f30, %16, f29; +add.f32 f31, %21, %29; +add.f32 f32, f31, f30; +add.f32 f33, %24, %26; +add.f32 f34, f33, f32; +add.f32 f35, %20, %33; +add.f32 f36, %17, f35; +add.f32 f37, %23, %31; +add.f32 f38, f37, f36; +add.f32 f39, %25, %28; +add.f32 f40, f39, f38; +fma.rn.f32 f41, f29, 0f3F1F9D07, %16; +mul.f32 f42, f31, 0f3E63DC87; +sub.f32 f43, f41, f42; +mul.f32 f44, f33, 0f3F66A5E5; +sub.f32 f45, f43, f44; +sub.f32 f46, %20, %33; +mul.f32 f47, f46, 0f3F48261C; +sub.f32 f48, %23, %31; +fma.rn.f32 f49, f48, 0f3F7994E0, f47; +sub.f32 f50, %25, %28; +fma.rn.f32 f51, f50, 0f3EDE2602, f49; +sub.f32 f52, f45, f51; +add.f32 f53, f51, f45; +mul.f32 f54, f29, 0f3E63DC87; +sub.f32 f55, %16, f54; +mul.f32 f56, f31, 0f3F66A5E5; +sub.f32 f57, f55, f56; +fma.rn.f32 f58, f33, 0f3F1F9D07, f57; +mul.f32 f59, f46, 0f3F7994E0; +mul.f32 f60, f48, 0f3EDE2602; +sub.f32 f61, f59, f60; +mul.f32 f62, f50, 0f3F48261C; +sub.f32 f63, f61, f62; +sub.f32 f64, f58, f63; +add.f32 f65, f63, f58; +mul.f32 f66, f29, 0f3F66A5E5; +sub.f32 f67, %16, f66; +fma.rn.f32 f68, f31, 0f3F1F9D07, f67; +mul.f32 f69, f33, 0f3E63DC87; +sub.f32 f70, f68, f69; +mul.f32 f71, f46, 0f3EDE2602; +mul.f32 f72, f48, 0f3F48261C; 
+sub.f32 f73, f71, f72; +fma.rn.f32 f74, f50, 0f3F7994E0, f73; +sub.f32 f75, f70, f74; +add.f32 f76, f74, f70; +fma.rn.f32 f77, f35, 0f3F1F9D07, %17; +mul.f32 f78, f37, 0f3E63DC87; +sub.f32 f79, f77, f78; +mul.f32 f80, f39, 0f3F66A5E5; +sub.f32 f81, f79, f80; +sub.f32 f82, %18, %32; +mul.f32 f83, f82, 0f3F48261C; +sub.f32 f84, %21, %29; +fma.rn.f32 f85, f84, 0f3F7994E0, f83; +sub.f32 f86, %24, %26; +fma.rn.f32 f87, f86, 0f3EDE2602, f85; +add.f32 f88, f87, f81; +sub.f32 f89, f81, f87; +mul.f32 f90, f35, 0f3E63DC87; +sub.f32 f91, %17, f90; +mul.f32 f92, f37, 0f3F66A5E5; +sub.f32 f93, f91, f92; +fma.rn.f32 f94, f39, 0f3F1F9D07, f93; +mul.f32 f95, f82, 0f3F7994E0; +mul.f32 f96, f84, 0f3EDE2602; +sub.f32 f97, f95, f96; +mul.f32 f98, f86, 0f3F48261C; +sub.f32 f99, f97, f98; +add.f32 f100, f99, f94; +sub.f32 f101, f94, f99; +mul.f32 f102, f35, 0f3F66A5E5; +sub.f32 f103, %17, f102; +fma.rn.f32 f104, f37, 0f3F1F9D07, f103; +mul.f32 f105, f39, 0f3E63DC87; +sub.f32 f106, f104, f105; +mul.f32 f107, f82, 0f3EDE2602; +mul.f32 f108, f84, 0f3F48261C; +sub.f32 f109, f107, f108; +fma.rn.f32 f110, f86, 0f3F7994E0, f109; +add.f32 f111, f110, f106; +sub.f32 f112, f106, f110; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f113, f114}, [rd6]; +mul.f32 f117, f88, f114; +fma.rn.f32 f118, f113, f52, f117; +mul.f32 f119, f52, f114; +mul.f32 f120, f113, f88; +sub.f32 f121, f120, f119; +mul.f32 f122, f113, f113; +mul.f32 f123, f114, f114; +sub.f32 f124, f122, f123; +mul.f32 f125, f114, f113; +fma.rn.f32 f126, f114, f113, f125; +mul.f32 f127, f100, f126; +fma.rn.f32 f128, f124, f64, f127; +mul.f32 f129, f64, f126; +mul.f32 f130, f124, f100; +sub.f32 f131, f130, f129; +mul.f32 f132, f113, f124; +mul.f32 f133, f114, f126; +sub.f32 f134, f132, f133; 
+mul.f32 f135, f113, f126; +fma.rn.f32 f136, f114, f124, f135; +mul.f32 f137, f111, f136; +fma.rn.f32 f138, f134, f75, f137; +mul.f32 f139, f75, f136; +mul.f32 f140, f134, f111; +sub.f32 f141, f140, f139; +mul.f32 f142, f113, f134; +mul.f32 f143, f114, f136; +sub.f32 f144, f142, f143; +mul.f32 f145, f113, f136; +fma.rn.f32 f146, f114, f134, f145; +mul.f32 f147, f112, f146; +fma.rn.f32 f148, f144, f76, f147; +mul.f32 f149, f76, f146; +mul.f32 f150, f144, f112; +sub.f32 f151, f150, f149; +mul.f32 f152, f113, f144; +mul.f32 f153, f114, f146; +sub.f32 f154, f152, f153; +mul.f32 f155, f113, f146; +fma.rn.f32 f156, f114, f144, f155; +mul.f32 f157, f101, f156; +fma.rn.f32 f158, f154, f65, f157; +mul.f32 f159, f65, f156; +mul.f32 f160, f154, f101; +sub.f32 f161, f160, f159; +mul.f32 f162, f113, f154; +mul.f32 f163, f114, f156; +sub.f32 f164, f162, f163; +mul.f32 f165, f113, f156; +fma.rn.f32 f166, f114, f154, f165; +mul.f32 f167, f89, f166; +fma.rn.f32 f168, f164, f53, f167; +mul.f32 f169, f53, f166; +mul.f32 f170, f164, f89; +sub.f32 f171, f170, f169; +mad.lo.s32 r12, r9, 196, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 28, r12; +st.shared.f32 [r13], f34; +st.shared.f32 [r13+4], f118; +st.shared.f32 [r13+8], f128; +st.shared.f32 [r13+12], f138; +st.shared.f32 [r13+16], f148; +st.shared.f32 [r13+20], f158; +st.shared.f32 [r13+24], f168; +barrier.sync 0; +mad.lo.s32 r14, r11, -24, r13; +ld.shared.f32 f172, [r14]; +ld.shared.f32 f173, [r14+28]; +ld.shared.f32 f174, [r14+56]; +ld.shared.f32 f175, [r14+84]; +ld.shared.f32 f176, [r14+112]; +ld.shared.f32 f177, [r14+140]; +ld.shared.f32 f178, [r14+168]; +barrier.sync 0; +st.shared.f32 [r13], f40; +st.shared.f32 [r13+4], f121; +st.shared.f32 [r13+8], f131; +st.shared.f32 [r13+12], f141; +st.shared.f32 [r13+16], f151; +st.shared.f32 [r13+20], f161; +st.shared.f32 [r13+24], f171; +barrier.sync 0; +ld.shared.f32 f179, [r14]; +ld.shared.f32 f180, [r14+28]; +ld.shared.f32 f181, [r14+56]; +ld.shared.f32 f182, [r14+84]; +ld.shared.f32 
f183, [r14+112]; +ld.shared.f32 f184, [r14+140]; +ld.shared.f32 f185, [r14+168]; +add.f32 f186, f173, f178; +add.f32 f187, f172, f186; +add.f32 f188, f174, f177; +add.f32 f189, f188, f187; +add.f32 f190, f175, f176; +add.f32 f191, f180, f185; +add.f32 f192, f179, f191; +add.f32 f193, f181, f184; +add.f32 f194, f193, f192; +add.f32 f195, f182, f183; +fma.rn.f32 f196, f186, 0f3F1F9D07, f172; +mul.f32 f197, f188, 0f3E63DC87; +sub.f32 f198, f196, f197; +mul.f32 f199, f190, 0f3F66A5E5; +sub.f32 f200, f198, f199; +sub.f32 f201, f180, f185; +mul.f32 f202, f201, 0f3F48261C; +sub.f32 f203, f181, f184; +fma.rn.f32 f204, f203, 0f3F7994E0, f202; +sub.f32 f205, f182, f183; +fma.rn.f32 f206, f205, 0f3EDE2602, f204; +mul.f32 f207, f186, 0f3E63DC87; +sub.f32 f208, f172, f207; +mul.f32 f209, f188, 0f3F66A5E5; +sub.f32 f210, f208, f209; +fma.rn.f32 f211, f190, 0f3F1F9D07, f210; +mul.f32 f212, f201, 0f3F7994E0; +mul.f32 f213, f203, 0f3EDE2602; +sub.f32 f214, f212, f213; +mul.f32 f215, f205, 0f3F48261C; +sub.f32 f216, f214, f215; +mul.f32 f217, f186, 0f3F66A5E5; +sub.f32 f218, f172, f217; +fma.rn.f32 f219, f188, 0f3F1F9D07, f218; +mul.f32 f220, f190, 0f3E63DC87; +sub.f32 f221, f219, f220; +mul.f32 f222, f201, 0f3EDE2602; +mul.f32 f223, f203, 0f3F48261C; +sub.f32 f224, f222, f223; +fma.rn.f32 f225, f205, 0f3F7994E0, f224; +fma.rn.f32 f226, f191, 0f3F1F9D07, f179; +mul.f32 f227, f193, 0f3E63DC87; +sub.f32 f228, f226, f227; +mul.f32 f229, f195, 0f3F66A5E5; +sub.f32 f230, f228, f229; +sub.f32 f231, f173, f178; +mul.f32 f232, f231, 0f3F48261C; +sub.f32 f233, f174, f177; +fma.rn.f32 f234, f233, 0f3F7994E0, f232; +sub.f32 f235, f175, f176; +fma.rn.f32 f236, f235, 0f3EDE2602, f234; +mul.f32 f237, f191, 0f3E63DC87; +sub.f32 f238, f179, f237; +mul.f32 f239, f193, 0f3F66A5E5; +sub.f32 f240, f238, f239; +fma.rn.f32 f241, f195, 0f3F1F9D07, f240; +mul.f32 f242, f231, 0f3F7994E0; +mul.f32 f243, f233, 0f3EDE2602; +sub.f32 f244, f242, f243; +mul.f32 f245, f235, 0f3F48261C; +sub.f32 f246, f244, f245; 
+mul.f32 f247, f191, 0f3F66A5E5; +sub.f32 f248, f179, f247; +fma.rn.f32 f249, f193, 0f3F1F9D07, f248; +mul.f32 f250, f195, 0f3E63DC87; +sub.f32 f251, f249, f250; +mul.f32 f252, f231, 0f3EDE2602; +mul.f32 f253, f233, 0f3F48261C; +sub.f32 f254, f252, f253; +fma.rn.f32 f255, f235, 0f3F7994E0, f254; +add.f32 %0, f190, f189; +add.f32 %1, f195, f194; +add.f32 %3, f236, f230; +sub.f32 %2, f200, f206; +sub.f32 %4, f211, f216; +add.f32 %5, f246, f241; +sub.f32 %6, f221, f225; +add.f32 %7, f255, f251; +add.f32 %8, f225, f221; +sub.f32 %9, f251, f255; +add.f32 %10, f216, f211; +sub.f32 %11, f241, f246; +sub.f32 %13, f230, f236; +add.f32 %12, f206, f200; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "r"(smem), "l"(lut_sp_7_49), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..669bf1c4efe27 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_fwd.hpp.inc @@ -0,0 +1,572 @@ +#ifndef CUFFTDX_FFT_49_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_49_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<548, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<15>; +.reg .f64 fd<291>; 
+.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 784, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %18, %32; +add.f64 fd30, %16, fd29; +add.f64 fd31, %21, %29; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %24, %26; +add.f64 fd34, %20, %33; +add.f64 fd35, %17, fd34; +add.f64 fd36, %23, %31; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %25, %28; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %16; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %20, %33; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %23, %31; +mul.f64 fd47, fd46, 0dBFEF329C0558E969; +sub.f64 fd48, fd47, fd45; +sub.f64 fd49, %25, %28; +mul.f64 fd50, fd49, 0d3FDBC4C04D71ABC1; +sub.f64 fd51, fd48, fd50; +sub.f64 fd52, fd43, fd51; +add.f64 fd53, fd51, fd43; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %16, fd54; +mul.f64 fd56, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd44, 0d3FEF329C0558E969; +mul.f64 fd60, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd60, fd59; +fma.rn.f64 fd62, fd49, 0d3FE904C37505DE4B, fd61; +sub.f64 fd63, fd58, fd62; +add.f64 fd64, fd62, fd58; +mul.f64 fd65, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd66, %16, fd65; +fma.rn.f64 fd67, fd31, 0d3FE3F3A0E28BEDD1, fd66; +mul.f64 fd68, fd33, 0d3FCC7B90E3024582; +sub.f64 fd69, fd67, fd68; +mul.f64 fd70, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd71, fd46, 0d3FE904C37505DE4B; +sub.f64 fd72, fd71, fd70; +mul.f64 fd73, fd49, 0d3FEF329C0558E969; +sub.f64 fd74, fd72, fd73; +sub.f64 fd75, fd69, fd74; +add.f64 fd76, fd74, fd69; +fma.rn.f64 fd77, fd34, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd78, fd36, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %18, %32; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %21, %29; +mul.f64 fd85, fd84, 0dBFEF329C0558E969; +sub.f64 fd86, 
fd85, fd83; +sub.f64 fd87, %24, %26; +mul.f64 fd88, fd87, 0d3FDBC4C04D71ABC1; +sub.f64 fd89, fd86, fd88; +add.f64 fd90, fd89, fd81; +sub.f64 fd91, fd81, fd89; +mul.f64 fd92, fd34, 0d3FCC7B90E3024582; +sub.f64 fd93, %17, fd92; +mul.f64 fd94, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd95, fd93, fd94; +fma.rn.f64 fd96, fd38, 0d3FE3F3A0E28BEDD1, fd95; +mul.f64 fd97, fd82, 0d3FEF329C0558E969; +mul.f64 fd98, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd99, fd98, fd97; +fma.rn.f64 fd100, fd87, 0d3FE904C37505DE4B, fd99; +add.f64 fd101, fd100, fd96; +sub.f64 fd102, fd96, fd100; +mul.f64 fd103, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd104, %17, fd103; +fma.rn.f64 fd105, fd36, 0d3FE3F3A0E28BEDD1, fd104; +mul.f64 fd106, fd38, 0d3FCC7B90E3024582; +sub.f64 fd107, fd105, fd106; +mul.f64 fd108, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd109, fd84, 0d3FE904C37505DE4B; +sub.f64 fd110, fd109, fd108; +mul.f64 fd111, fd87, 0d3FEF329C0558E969; +sub.f64 fd112, fd110, fd111; +add.f64 fd113, fd112, fd107; +sub.f64 fd114, fd107, fd112; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 784, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd115, fd116}, [rd6]; +mul.f64 fd119, fd115, fd52; +mul.f64 fd120, fd116, fd90; +mul.f64 fd121, fd115, fd90; +mul.f64 fd122, fd115, fd115; +mul.f64 fd123, fd116, fd116; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd116, fd115; +fma.rn.f64 fd126, fd116, fd115, fd125; +mul.f64 fd127, fd124, fd63; +mul.f64 fd128, fd126, fd101; +mul.f64 fd129, fd124, fd101; +mul.f64 fd130, fd115, fd124; +mul.f64 fd131, fd116, fd126; +sub.f64 fd132, fd130, fd131; +mul.f64 fd133, fd115, fd126; +fma.rn.f64 fd134, fd116, fd124, fd133; +mul.f64 fd135, fd132, fd75; +mul.f64 fd136, fd134, fd113; +mul.f64 fd137, fd132, fd113; +ld.global.v2.f64 {fd138, fd139}, [rd6+112]; +mul.f64 fd142, 
fd138, fd76; +mul.f64 fd143, fd139, fd114; +mul.f64 fd144, fd138, fd114; +mul.f64 fd145, fd115, fd138; +mul.f64 fd146, fd116, fd139; +sub.f64 fd147, fd145, fd146; +mul.f64 fd148, fd115, fd139; +fma.rn.f64 fd149, fd116, fd138, fd148; +mul.f64 fd150, fd147, fd64; +mul.f64 fd151, fd149, fd102; +mul.f64 fd152, fd147, fd102; +mul.f64 fd153, fd115, fd147; +mul.f64 fd154, fd116, fd149; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd115, fd149; +fma.rn.f64 fd157, fd116, fd147, fd156; +mul.f64 fd158, fd155, fd53; +mul.f64 fd159, fd157, fd91; +mul.f64 fd160, fd155, fd91; +barrier.sync 0; +mad.lo.s32 r13, r11, 112, r12; +add.f64 fd161, fd38, fd37; +add.f64 fd162, fd33, fd32; +st.shared.v2.f64 [r13], {fd162, fd161}; +fma.rn.f64 fd163, fd116, fd52, fd121; +sub.f64 fd164, fd119, fd120; +st.shared.v2.f64 [r13+16], {fd164, fd163}; +fma.rn.f64 fd165, fd126, fd63, fd129; +sub.f64 fd166, fd127, fd128; +st.shared.v2.f64 [r13+32], {fd166, fd165}; +sub.f64 fd167, fd135, fd136; +fma.rn.f64 fd168, fd134, fd75, fd137; +st.shared.v2.f64 [r13+48], {fd167, fd168}; +fma.rn.f64 fd169, fd139, fd76, fd144; +sub.f64 fd170, fd142, fd143; +st.shared.v2.f64 [r13+64], {fd170, fd169}; +fma.rn.f64 fd171, fd149, fd64, fd152; +sub.f64 fd172, fd150, fd151; +st.shared.v2.f64 [r13+80], {fd172, fd171}; +sub.f64 fd173, fd158, fd159; +fma.rn.f64 fd174, fd157, fd53, fd160; +st.shared.v2.f64 [r13+96], {fd173, fd174}; +barrier.sync 0; +mad.lo.s32 r14, r11, -96, r13; +ld.shared.v2.f64 {fd175, fd176}, [r14]; +ld.shared.v2.f64 {fd179, fd180}, [r14+112]; +ld.shared.v2.f64 {fd183, fd184}, [r14+224]; +ld.shared.v2.f64 {fd187, fd188}, [r14+336]; +ld.shared.v2.f64 {fd191, fd192}, [r14+448]; +ld.shared.v2.f64 {fd195, fd196}, [r14+560]; +ld.shared.v2.f64 {fd199, fd200}, [r14+672]; +add.f64 fd203, fd179, fd199; +add.f64 fd204, fd175, fd203; +add.f64 fd205, fd183, fd195; +add.f64 fd206, fd205, fd204; +add.f64 fd207, fd187, fd191; +add.f64 fd208, fd180, fd200; +add.f64 fd209, fd176, fd208; +add.f64 fd210, fd184, fd196; 
+add.f64 fd211, fd210, fd209; +add.f64 fd212, fd188, fd192; +fma.rn.f64 fd213, fd203, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd214, fd205, 0d3FCC7B90E3024582; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd207, 0d3FECD4BCA9CB5C71; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, fd180, fd200; +mul.f64 fd219, fd218, 0d3FE904C37505DE4B; +sub.f64 fd220, fd184, fd196; +mul.f64 fd221, fd220, 0dBFEF329C0558E969; +sub.f64 fd222, fd221, fd219; +sub.f64 fd223, fd188, fd192; +mul.f64 fd224, fd223, 0d3FDBC4C04D71ABC1; +sub.f64 fd225, fd222, fd224; +mul.f64 fd226, fd203, 0d3FCC7B90E3024582; +sub.f64 fd227, fd175, fd226; +mul.f64 fd228, fd205, 0d3FECD4BCA9CB5C71; +sub.f64 fd229, fd227, fd228; +fma.rn.f64 fd230, fd207, 0d3FE3F3A0E28BEDD1, fd229; +mul.f64 fd231, fd218, 0d3FEF329C0558E969; +mul.f64 fd232, fd220, 0d3FDBC4C04D71ABC1; +sub.f64 fd233, fd232, fd231; +fma.rn.f64 fd234, fd223, 0d3FE904C37505DE4B, fd233; +mul.f64 fd235, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd236, fd175, fd235; +fma.rn.f64 fd237, fd205, 0d3FE3F3A0E28BEDD1, fd236; +mul.f64 fd238, fd207, 0d3FCC7B90E3024582; +sub.f64 fd239, fd237, fd238; +mul.f64 fd240, fd218, 0d3FDBC4C04D71ABC1; +mul.f64 fd241, fd220, 0d3FE904C37505DE4B; +sub.f64 fd242, fd241, fd240; +mul.f64 fd243, fd223, 0d3FEF329C0558E969; +sub.f64 fd244, fd242, fd243; +fma.rn.f64 fd245, fd208, 0d3FE3F3A0E28BEDD1, fd176; +mul.f64 fd246, fd210, 0d3FCC7B90E3024582; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd212, 0d3FECD4BCA9CB5C71; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd179, fd199; +mul.f64 fd251, fd250, 0d3FE904C37505DE4B; +sub.f64 fd252, fd183, fd195; +mul.f64 fd253, fd252, 0dBFEF329C0558E969; +sub.f64 fd254, fd253, fd251; +sub.f64 fd255, fd187, fd191; +mul.f64 fd256, fd255, 0d3FDBC4C04D71ABC1; +sub.f64 fd257, fd254, fd256; +mul.f64 fd258, fd208, 0d3FCC7B90E3024582; +sub.f64 fd259, fd176, fd258; +mul.f64 fd260, fd210, 0d3FECD4BCA9CB5C71; +sub.f64 fd261, fd259, fd260; +fma.rn.f64 fd262, fd212, 0d3FE3F3A0E28BEDD1, fd261; +mul.f64 fd263, fd250, 
0d3FEF329C0558E969; +mul.f64 fd264, fd252, 0d3FDBC4C04D71ABC1; +sub.f64 fd265, fd264, fd263; +fma.rn.f64 fd266, fd255, 0d3FE904C37505DE4B, fd265; +mul.f64 fd267, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd268, fd176, fd267; +fma.rn.f64 fd269, fd210, 0d3FE3F3A0E28BEDD1, fd268; +mul.f64 fd270, fd212, 0d3FCC7B90E3024582; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd250, 0d3FDBC4C04D71ABC1; +mul.f64 fd273, fd252, 0d3FE904C37505DE4B; +sub.f64 fd274, fd273, fd272; +mul.f64 fd275, fd255, 0d3FEF329C0558E969; +sub.f64 fd276, fd274, fd275; +add.f64 %1, fd212, fd211; +add.f64 %0, fd207, fd206; +add.f64 %3, fd257, fd249; +sub.f64 %2, fd217, fd225; +add.f64 %5, fd266, fd262; +sub.f64 %4, fd230, fd234; +add.f64 %7, fd276, fd271; +sub.f64 %6, fd239, fd244; +sub.f64 %9, fd271, fd276; +add.f64 %8, fd244, fd239; +sub.f64 %11, fd262, fd266; +add.f64 %10, fd234, fd230; +sub.f64 %13, fd249, fd257; +add.f64 %12, fd225, fd217; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<547, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<15>; +.reg .f64 fd<277>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 392, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %18, %32; +add.f64 fd30, %16, fd29; +add.f64 fd31, %21, %29; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %24, %26; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %20, %33; +add.f64 fd36, %17, fd35; 
+add.f64 fd37, %23, %31; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %25, %28; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %16; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %20, %33; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %23, %31; +mul.f64 fd49, fd48, 0dBFEF329C0558E969; +sub.f64 fd50, fd49, fd47; +sub.f64 fd51, %25, %28; +mul.f64 fd52, fd51, 0d3FDBC4C04D71ABC1; +sub.f64 fd53, fd50, fd52; +sub.f64 fd54, fd45, fd53; +add.f64 fd55, fd53, fd45; +mul.f64 fd56, fd29, 0d3FCC7B90E3024582; +sub.f64 fd57, %16, fd56; +mul.f64 fd58, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd59, fd57, fd58; +fma.rn.f64 fd60, fd33, 0d3FE3F3A0E28BEDD1, fd59; +mul.f64 fd61, fd46, 0d3FEF329C0558E969; +mul.f64 fd62, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd63, fd62, fd61; +fma.rn.f64 fd64, fd51, 0d3FE904C37505DE4B, fd63; +sub.f64 fd65, fd60, fd64; +add.f64 fd66, fd64, fd60; +mul.f64 fd67, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd68, %16, fd67; +fma.rn.f64 fd69, fd31, 0d3FE3F3A0E28BEDD1, fd68; +mul.f64 fd70, fd33, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd73, fd48, 0d3FE904C37505DE4B; +sub.f64 fd74, fd73, fd72; +mul.f64 fd75, fd51, 0d3FEF329C0558E969; +sub.f64 fd76, fd74, fd75; +sub.f64 fd77, fd71, fd76; +add.f64 fd78, fd76, fd71; +fma.rn.f64 fd79, fd35, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd80, fd37, 0d3FCC7B90E3024582; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd83, fd81, fd82; +sub.f64 fd84, %18, %32; +mul.f64 fd85, fd84, 0d3FE904C37505DE4B; +sub.f64 fd86, %21, %29; +mul.f64 fd87, fd86, 0dBFEF329C0558E969; +sub.f64 fd88, fd87, fd85; +sub.f64 fd89, %24, %26; +mul.f64 fd90, fd89, 0d3FDBC4C04D71ABC1; +sub.f64 fd91, fd88, fd90; +add.f64 fd92, fd91, fd83; +sub.f64 fd93, fd83, fd91; +mul.f64 fd94, fd35, 0d3FCC7B90E3024582; +sub.f64 fd95, %17, fd94; +mul.f64 fd96, fd37, 
0d3FECD4BCA9CB5C71; +sub.f64 fd97, fd95, fd96; +fma.rn.f64 fd98, fd39, 0d3FE3F3A0E28BEDD1, fd97; +mul.f64 fd99, fd84, 0d3FEF329C0558E969; +mul.f64 fd100, fd86, 0d3FDBC4C04D71ABC1; +sub.f64 fd101, fd100, fd99; +fma.rn.f64 fd102, fd89, 0d3FE904C37505DE4B, fd101; +add.f64 fd103, fd102, fd98; +sub.f64 fd104, fd98, fd102; +mul.f64 fd105, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd106, %17, fd105; +fma.rn.f64 fd107, fd37, 0d3FE3F3A0E28BEDD1, fd106; +mul.f64 fd108, fd39, 0d3FCC7B90E3024582; +sub.f64 fd109, fd107, fd108; +mul.f64 fd110, fd84, 0d3FDBC4C04D71ABC1; +mul.f64 fd111, fd86, 0d3FE904C37505DE4B; +sub.f64 fd112, fd111, fd110; +mul.f64 fd113, fd89, 0d3FEF329C0558E969; +sub.f64 fd114, fd112, fd113; +add.f64 fd115, fd114, fd109; +sub.f64 fd116, fd109, fd114; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd117, fd118}, [rd6]; +mul.f64 fd121, fd117, fd54; +mul.f64 fd122, fd118, fd92; +sub.f64 fd123, fd121, fd122; +mul.f64 fd124, fd117, fd92; +fma.rn.f64 fd125, fd118, fd54, fd124; +mul.f64 fd126, fd117, fd117; +mul.f64 fd127, fd118, fd118; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd118, fd117; +fma.rn.f64 fd130, fd118, fd117, fd129; +mul.f64 fd131, fd128, fd65; +mul.f64 fd132, fd130, fd103; +sub.f64 fd133, fd131, fd132; +mul.f64 fd134, fd128, fd103; +fma.rn.f64 fd135, fd130, fd65, fd134; +mul.f64 fd136, fd117, fd128; +mul.f64 fd137, fd118, fd130; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, fd117, fd130; +fma.rn.f64 fd140, fd118, fd128, fd139; +mul.f64 fd141, fd138, fd77; +mul.f64 fd142, fd140, fd115; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd138, fd115; +fma.rn.f64 fd145, fd140, fd77, fd144; +ld.global.v2.f64 {fd146, fd147}, [rd6+112]; +mul.f64 fd150, fd146, fd78; +mul.f64 fd151, fd147, fd116; +sub.f64 fd152, fd150, 
fd151; +mul.f64 fd153, fd146, fd116; +fma.rn.f64 fd154, fd147, fd78, fd153; +mul.f64 fd155, fd117, fd146; +mul.f64 fd156, fd118, fd147; +sub.f64 fd157, fd155, fd156; +mul.f64 fd158, fd117, fd147; +fma.rn.f64 fd159, fd118, fd146, fd158; +mul.f64 fd160, fd157, fd66; +mul.f64 fd161, fd159, fd104; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd157, fd104; +fma.rn.f64 fd164, fd159, fd66, fd163; +mul.f64 fd165, fd117, fd157; +mul.f64 fd166, fd118, fd159; +sub.f64 fd167, fd165, fd166; +mul.f64 fd168, fd117, fd159; +fma.rn.f64 fd169, fd118, fd157, fd168; +mul.f64 fd170, fd167, fd55; +mul.f64 fd171, fd169, fd93; +sub.f64 fd172, fd170, fd171; +mul.f64 fd173, fd167, fd93; +fma.rn.f64 fd174, fd169, fd55, fd173; +mad.lo.s32 r12, r9, 392, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 56, r12; +st.shared.f64 [r13], fd34; +st.shared.f64 [r13+8], fd123; +st.shared.f64 [r13+16], fd133; +st.shared.f64 [r13+24], fd143; +st.shared.f64 [r13+32], fd152; +st.shared.f64 [r13+40], fd162; +st.shared.f64 [r13+48], fd172; +barrier.sync 0; +mad.lo.s32 r14, r11, -48, r13; +ld.shared.f64 fd175, [r14]; +ld.shared.f64 fd176, [r14+56]; +ld.shared.f64 fd177, [r14+112]; +ld.shared.f64 fd178, [r14+168]; +ld.shared.f64 fd179, [r14+224]; +ld.shared.f64 fd180, [r14+280]; +ld.shared.f64 fd181, [r14+336]; +barrier.sync 0; +st.shared.f64 [r13], fd40; +st.shared.f64 [r13+8], fd125; +st.shared.f64 [r13+16], fd135; +st.shared.f64 [r13+24], fd145; +st.shared.f64 [r13+32], fd154; +st.shared.f64 [r13+40], fd164; +st.shared.f64 [r13+48], fd174; +barrier.sync 0; +ld.shared.f64 fd182, [r14]; +ld.shared.f64 fd183, [r14+56]; +ld.shared.f64 fd184, [r14+112]; +ld.shared.f64 fd185, [r14+168]; +ld.shared.f64 fd186, [r14+224]; +ld.shared.f64 fd187, [r14+280]; +ld.shared.f64 fd188, [r14+336]; +add.f64 fd189, fd176, fd181; +add.f64 fd190, fd175, fd189; +add.f64 fd191, fd177, fd180; +add.f64 fd192, fd191, fd190; +add.f64 fd193, fd178, fd179; +add.f64 fd194, fd183, fd188; +add.f64 fd195, fd182, fd194; +add.f64 fd196, fd184, 
fd187; +add.f64 fd197, fd196, fd195; +add.f64 fd198, fd185, fd186; +fma.rn.f64 fd199, fd189, 0d3FE3F3A0E28BEDD1, fd175; +mul.f64 fd200, fd191, 0d3FCC7B90E3024582; +sub.f64 fd201, fd199, fd200; +mul.f64 fd202, fd193, 0d3FECD4BCA9CB5C71; +sub.f64 fd203, fd201, fd202; +sub.f64 fd204, fd183, fd188; +mul.f64 fd205, fd204, 0d3FE904C37505DE4B; +sub.f64 fd206, fd184, fd187; +mul.f64 fd207, fd206, 0dBFEF329C0558E969; +sub.f64 fd208, fd207, fd205; +sub.f64 fd209, fd185, fd186; +mul.f64 fd210, fd209, 0d3FDBC4C04D71ABC1; +sub.f64 fd211, fd208, fd210; +mul.f64 fd212, fd189, 0d3FCC7B90E3024582; +sub.f64 fd213, fd175, fd212; +mul.f64 fd214, fd191, 0d3FECD4BCA9CB5C71; +sub.f64 fd215, fd213, fd214; +fma.rn.f64 fd216, fd193, 0d3FE3F3A0E28BEDD1, fd215; +mul.f64 fd217, fd204, 0d3FEF329C0558E969; +mul.f64 fd218, fd206, 0d3FDBC4C04D71ABC1; +sub.f64 fd219, fd218, fd217; +fma.rn.f64 fd220, fd209, 0d3FE904C37505DE4B, fd219; +mul.f64 fd221, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd222, fd175, fd221; +fma.rn.f64 fd223, fd191, 0d3FE3F3A0E28BEDD1, fd222; +mul.f64 fd224, fd193, 0d3FCC7B90E3024582; +sub.f64 fd225, fd223, fd224; +mul.f64 fd226, fd204, 0d3FDBC4C04D71ABC1; +mul.f64 fd227, fd206, 0d3FE904C37505DE4B; +sub.f64 fd228, fd227, fd226; +mul.f64 fd229, fd209, 0d3FEF329C0558E969; +sub.f64 fd230, fd228, fd229; +fma.rn.f64 fd231, fd194, 0d3FE3F3A0E28BEDD1, fd182; +mul.f64 fd232, fd196, 0d3FCC7B90E3024582; +sub.f64 fd233, fd231, fd232; +mul.f64 fd234, fd198, 0d3FECD4BCA9CB5C71; +sub.f64 fd235, fd233, fd234; +sub.f64 fd236, fd176, fd181; +mul.f64 fd237, fd236, 0d3FE904C37505DE4B; +sub.f64 fd238, fd177, fd180; +mul.f64 fd239, fd238, 0dBFEF329C0558E969; +sub.f64 fd240, fd239, fd237; +sub.f64 fd241, fd178, fd179; +mul.f64 fd242, fd241, 0d3FDBC4C04D71ABC1; +sub.f64 fd243, fd240, fd242; +mul.f64 fd244, fd194, 0d3FCC7B90E3024582; +sub.f64 fd245, fd182, fd244; +mul.f64 fd246, fd196, 0d3FECD4BCA9CB5C71; +sub.f64 fd247, fd245, fd246; +fma.rn.f64 fd248, fd198, 0d3FE3F3A0E28BEDD1, fd247; +mul.f64 fd249, 
fd236, 0d3FEF329C0558E969; +mul.f64 fd250, fd238, 0d3FDBC4C04D71ABC1; +sub.f64 fd251, fd250, fd249; +fma.rn.f64 fd252, fd241, 0d3FE904C37505DE4B, fd251; +mul.f64 fd253, fd194, 0d3FECD4BCA9CB5C71; +sub.f64 fd254, fd182, fd253; +fma.rn.f64 fd255, fd196, 0d3FE3F3A0E28BEDD1, fd254; +mul.f64 fd256, fd198, 0d3FCC7B90E3024582; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd236, 0d3FDBC4C04D71ABC1; +mul.f64 fd259, fd238, 0d3FE904C37505DE4B; +sub.f64 fd260, fd259, fd258; +mul.f64 fd261, fd241, 0d3FEF329C0558E969; +sub.f64 fd262, fd260, fd261; +add.f64 %0, fd193, fd192; +add.f64 %1, fd198, fd197; +add.f64 %3, fd243, fd235; +sub.f64 %2, fd203, fd211; +sub.f64 %4, fd216, fd220; +add.f64 %5, fd252, fd248; +sub.f64 %6, fd225, fd230; +add.f64 %7, fd262, fd257; +add.f64 %8, fd230, fd225; +sub.f64 %9, fd257, fd262; +add.f64 %10, fd220, fd216; +sub.f64 %11, fd248, fd252; +sub.f64 %13, fd235, fd243; +add.f64 %12, fd211, fd203; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..5faba68c5177f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_49_fp64_inv.hpp.inc @@ -0,0 +1,556 @@ +#ifndef 
CUFFTDX_FFT_49_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_49_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<719, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<15>; +.reg .f64 fd<283>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 784, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %18, %32; +add.f64 fd30, %16, fd29; +add.f64 fd31, %21, %29; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %24, %26; +add.f64 fd34, %20, %33; +add.f64 fd35, %17, fd34; +add.f64 fd36, %23, %31; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %25, %28; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %16; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %20, %33; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %23, %31; +fma.rn.f64 fd47, fd46, 0d3FEF329C0558E969, fd45; +sub.f64 fd48, %25, %28; +fma.rn.f64 fd49, fd48, 0d3FDBC4C04D71ABC1, fd47; +sub.f64 fd50, fd43, fd49; +add.f64 fd51, fd49, fd43; +mul.f64 fd52, fd29, 0d3FCC7B90E3024582; +sub.f64 fd53, %16, fd52; +mul.f64 fd54, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd55, fd53, fd54; +fma.rn.f64 fd56, fd33, 0d3FE3F3A0E28BEDD1, fd55; +mul.f64 fd57, fd44, 0d3FEF329C0558E969; +mul.f64 fd58, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd48, 0d3FE904C37505DE4B; +sub.f64 fd61, fd59, fd60; +sub.f64 fd62, fd56, fd61; +add.f64 fd63, fd61, fd56; +mul.f64 fd64, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd65, %16, fd64; +fma.rn.f64 fd66, fd31, 0d3FE3F3A0E28BEDD1, fd65; +mul.f64 fd67, fd33, 0d3FCC7B90E3024582; +sub.f64 fd68, fd66, fd67; +mul.f64 fd69, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd70, fd46, 0d3FE904C37505DE4B; +sub.f64 fd71, fd69, fd70; +fma.rn.f64 fd72, fd48, 0d3FEF329C0558E969, fd71; +sub.f64 fd73, fd68, fd72; +add.f64 fd74, fd72, fd68; +fma.rn.f64 fd75, fd34, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd76, fd36, 0d3FCC7B90E3024582; 
+sub.f64 fd77, fd75, fd76; +mul.f64 fd78, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd79, fd77, fd78; +sub.f64 fd80, %18, %32; +mul.f64 fd81, fd80, 0d3FE904C37505DE4B; +sub.f64 fd82, %21, %29; +fma.rn.f64 fd83, fd82, 0d3FEF329C0558E969, fd81; +sub.f64 fd84, %24, %26; +fma.rn.f64 fd85, fd84, 0d3FDBC4C04D71ABC1, fd83; +add.f64 fd86, fd85, fd79; +sub.f64 fd87, fd79, fd85; +mul.f64 fd88, fd34, 0d3FCC7B90E3024582; +sub.f64 fd89, %17, fd88; +mul.f64 fd90, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd91, fd89, fd90; +fma.rn.f64 fd92, fd38, 0d3FE3F3A0E28BEDD1, fd91; +mul.f64 fd93, fd80, 0d3FEF329C0558E969; +mul.f64 fd94, fd82, 0d3FDBC4C04D71ABC1; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd84, 0d3FE904C37505DE4B; +sub.f64 fd97, fd95, fd96; +add.f64 fd98, fd97, fd92; +sub.f64 fd99, fd92, fd97; +mul.f64 fd100, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd101, %17, fd100; +fma.rn.f64 fd102, fd36, 0d3FE3F3A0E28BEDD1, fd101; +mul.f64 fd103, fd38, 0d3FCC7B90E3024582; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd80, 0d3FDBC4C04D71ABC1; +mul.f64 fd106, fd82, 0d3FE904C37505DE4B; +sub.f64 fd107, fd105, fd106; +fma.rn.f64 fd108, fd84, 0d3FEF329C0558E969, fd107; +add.f64 fd109, fd108, fd104; +sub.f64 fd110, fd104, fd108; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 784, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd111, fd112}, [rd6]; +mul.f64 fd115, fd86, fd112; +mul.f64 fd116, fd50, fd112; +mul.f64 fd117, fd111, fd86; +mul.f64 fd118, fd111, fd111; +mul.f64 fd119, fd112, fd112; +sub.f64 fd120, fd118, fd119; +mul.f64 fd121, fd112, fd111; +fma.rn.f64 fd122, fd112, fd111, fd121; +mul.f64 fd123, fd98, fd122; +mul.f64 fd124, fd62, fd122; +mul.f64 fd125, fd120, fd98; +mul.f64 fd126, fd111, fd120; +mul.f64 fd127, fd112, fd122; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd111, 
fd122; +fma.rn.f64 fd130, fd112, fd120, fd129; +mul.f64 fd131, fd109, fd130; +mul.f64 fd132, fd73, fd130; +mul.f64 fd133, fd128, fd109; +ld.global.v2.f64 {fd134, fd135}, [rd6+112]; +mul.f64 fd138, fd110, fd135; +mul.f64 fd139, fd74, fd135; +mul.f64 fd140, fd134, fd110; +mul.f64 fd141, fd111, fd134; +mul.f64 fd142, fd112, fd135; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd111, fd135; +fma.rn.f64 fd145, fd112, fd134, fd144; +mul.f64 fd146, fd99, fd145; +mul.f64 fd147, fd63, fd145; +mul.f64 fd148, fd143, fd99; +mul.f64 fd149, fd111, fd143; +mul.f64 fd150, fd112, fd145; +sub.f64 fd151, fd149, fd150; +mul.f64 fd152, fd111, fd145; +fma.rn.f64 fd153, fd112, fd143, fd152; +mul.f64 fd154, fd87, fd153; +mul.f64 fd155, fd51, fd153; +mul.f64 fd156, fd151, fd87; +barrier.sync 0; +mad.lo.s32 r13, r11, 112, r12; +add.f64 fd157, fd38, fd37; +add.f64 fd158, fd33, fd32; +st.shared.v2.f64 [r13], {fd158, fd157}; +fma.rn.f64 fd159, fd111, fd50, fd115; +sub.f64 fd160, fd117, fd116; +st.shared.v2.f64 [r13+16], {fd159, fd160}; +fma.rn.f64 fd161, fd120, fd62, fd123; +sub.f64 fd162, fd125, fd124; +st.shared.v2.f64 [r13+32], {fd161, fd162}; +sub.f64 fd163, fd133, fd132; +fma.rn.f64 fd164, fd128, fd73, fd131; +st.shared.v2.f64 [r13+48], {fd164, fd163}; +fma.rn.f64 fd165, fd134, fd74, fd138; +sub.f64 fd166, fd140, fd139; +st.shared.v2.f64 [r13+64], {fd165, fd166}; +fma.rn.f64 fd167, fd143, fd63, fd146; +sub.f64 fd168, fd148, fd147; +st.shared.v2.f64 [r13+80], {fd167, fd168}; +sub.f64 fd169, fd156, fd155; +fma.rn.f64 fd170, fd151, fd51, fd154; +st.shared.v2.f64 [r13+96], {fd170, fd169}; +barrier.sync 0; +mad.lo.s32 r14, r11, -96, r13; +ld.shared.v2.f64 {fd171, fd172}, [r14]; +ld.shared.v2.f64 {fd175, fd176}, [r14+112]; +ld.shared.v2.f64 {fd179, fd180}, [r14+224]; +ld.shared.v2.f64 {fd183, fd184}, [r14+336]; +ld.shared.v2.f64 {fd187, fd188}, [r14+448]; +ld.shared.v2.f64 {fd191, fd192}, [r14+560]; +ld.shared.v2.f64 {fd195, fd196}, [r14+672]; +add.f64 fd199, fd175, fd195; +add.f64 fd200, 
fd171, fd199; +add.f64 fd201, fd179, fd191; +add.f64 fd202, fd201, fd200; +add.f64 fd203, fd183, fd187; +add.f64 fd204, fd176, fd196; +add.f64 fd205, fd172, fd204; +add.f64 fd206, fd180, fd192; +add.f64 fd207, fd206, fd205; +add.f64 fd208, fd184, fd188; +fma.rn.f64 fd209, fd199, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd210, fd201, 0d3FCC7B90E3024582; +sub.f64 fd211, fd209, fd210; +mul.f64 fd212, fd203, 0d3FECD4BCA9CB5C71; +sub.f64 fd213, fd211, fd212; +sub.f64 fd214, fd176, fd196; +mul.f64 fd215, fd214, 0d3FE904C37505DE4B; +sub.f64 fd216, fd180, fd192; +fma.rn.f64 fd217, fd216, 0d3FEF329C0558E969, fd215; +sub.f64 fd218, fd184, fd188; +fma.rn.f64 fd219, fd218, 0d3FDBC4C04D71ABC1, fd217; +mul.f64 fd220, fd199, 0d3FCC7B90E3024582; +sub.f64 fd221, fd171, fd220; +mul.f64 fd222, fd201, 0d3FECD4BCA9CB5C71; +sub.f64 fd223, fd221, fd222; +fma.rn.f64 fd224, fd203, 0d3FE3F3A0E28BEDD1, fd223; +mul.f64 fd225, fd214, 0d3FEF329C0558E969; +mul.f64 fd226, fd216, 0d3FDBC4C04D71ABC1; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd218, 0d3FE904C37505DE4B; +sub.f64 fd229, fd227, fd228; +mul.f64 fd230, fd199, 0d3FECD4BCA9CB5C71; +sub.f64 fd231, fd171, fd230; +fma.rn.f64 fd232, fd201, 0d3FE3F3A0E28BEDD1, fd231; +mul.f64 fd233, fd203, 0d3FCC7B90E3024582; +sub.f64 fd234, fd232, fd233; +mul.f64 fd235, fd214, 0d3FDBC4C04D71ABC1; +mul.f64 fd236, fd216, 0d3FE904C37505DE4B; +sub.f64 fd237, fd235, fd236; +fma.rn.f64 fd238, fd218, 0d3FEF329C0558E969, fd237; +fma.rn.f64 fd239, fd204, 0d3FE3F3A0E28BEDD1, fd172; +mul.f64 fd240, fd206, 0d3FCC7B90E3024582; +sub.f64 fd241, fd239, fd240; +mul.f64 fd242, fd208, 0d3FECD4BCA9CB5C71; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, fd175, fd195; +mul.f64 fd245, fd244, 0d3FE904C37505DE4B; +sub.f64 fd246, fd179, fd191; +fma.rn.f64 fd247, fd246, 0d3FEF329C0558E969, fd245; +sub.f64 fd248, fd183, fd187; +fma.rn.f64 fd249, fd248, 0d3FDBC4C04D71ABC1, fd247; +mul.f64 fd250, fd204, 0d3FCC7B90E3024582; +sub.f64 fd251, fd172, fd250; +mul.f64 fd252, fd206, 
0d3FECD4BCA9CB5C71; +sub.f64 fd253, fd251, fd252; +fma.rn.f64 fd254, fd208, 0d3FE3F3A0E28BEDD1, fd253; +mul.f64 fd255, fd244, 0d3FEF329C0558E969; +mul.f64 fd256, fd246, 0d3FDBC4C04D71ABC1; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd248, 0d3FE904C37505DE4B; +sub.f64 fd259, fd257, fd258; +mul.f64 fd260, fd204, 0d3FECD4BCA9CB5C71; +sub.f64 fd261, fd172, fd260; +fma.rn.f64 fd262, fd206, 0d3FE3F3A0E28BEDD1, fd261; +mul.f64 fd263, fd208, 0d3FCC7B90E3024582; +sub.f64 fd264, fd262, fd263; +mul.f64 fd265, fd244, 0d3FDBC4C04D71ABC1; +mul.f64 fd266, fd246, 0d3FE904C37505DE4B; +sub.f64 fd267, fd265, fd266; +fma.rn.f64 fd268, fd248, 0d3FEF329C0558E969, fd267; +add.f64 %1, fd208, fd207; +add.f64 %0, fd203, fd202; +add.f64 %3, fd249, fd243; +sub.f64 %2, fd213, fd219; +add.f64 %5, fd259, fd254; +sub.f64 %4, fd224, fd229; +add.f64 %7, fd268, fd264; +sub.f64 %6, fd234, fd238; +sub.f64 %9, fd264, fd268; +add.f64 %8, fd238, fd234; +sub.f64 %11, fd254, fd259; +add.f64 %10, fd229, fd224; +sub.f64 %13, fd243, fd249; +add.f64 %12, fd219, fd213; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<718, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<15>; +.reg .f64 fd<269>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %14; +mad.lo.s32 r3, r1, 392, r2; +mov.u32 r4, %tid.x; +add.f64 fd29, %18, %32; +add.f64 fd30, %16, fd29; +add.f64 fd31, %21, %29; 
+add.f64 fd32, fd31, fd30; +add.f64 fd33, %24, %26; +add.f64 fd34, fd33, fd32; +add.f64 fd35, %20, %33; +add.f64 fd36, %17, fd35; +add.f64 fd37, %23, %31; +add.f64 fd38, fd37, fd36; +add.f64 fd39, %25, %28; +add.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd29, 0d3FE3F3A0E28BEDD1, %16; +mul.f64 fd42, fd31, 0d3FCC7B90E3024582; +sub.f64 fd43, fd41, fd42; +mul.f64 fd44, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd45, fd43, fd44; +sub.f64 fd46, %20, %33; +mul.f64 fd47, fd46, 0d3FE904C37505DE4B; +sub.f64 fd48, %23, %31; +fma.rn.f64 fd49, fd48, 0d3FEF329C0558E969, fd47; +sub.f64 fd50, %25, %28; +fma.rn.f64 fd51, fd50, 0d3FDBC4C04D71ABC1, fd49; +sub.f64 fd52, fd45, fd51; +add.f64 fd53, fd51, fd45; +mul.f64 fd54, fd29, 0d3FCC7B90E3024582; +sub.f64 fd55, %16, fd54; +mul.f64 fd56, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd57, fd55, fd56; +fma.rn.f64 fd58, fd33, 0d3FE3F3A0E28BEDD1, fd57; +mul.f64 fd59, fd46, 0d3FEF329C0558E969; +mul.f64 fd60, fd48, 0d3FDBC4C04D71ABC1; +sub.f64 fd61, fd59, fd60; +mul.f64 fd62, fd50, 0d3FE904C37505DE4B; +sub.f64 fd63, fd61, fd62; +sub.f64 fd64, fd58, fd63; +add.f64 fd65, fd63, fd58; +mul.f64 fd66, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd67, %16, fd66; +fma.rn.f64 fd68, fd31, 0d3FE3F3A0E28BEDD1, fd67; +mul.f64 fd69, fd33, 0d3FCC7B90E3024582; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd46, 0d3FDBC4C04D71ABC1; +mul.f64 fd72, fd48, 0d3FE904C37505DE4B; +sub.f64 fd73, fd71, fd72; +fma.rn.f64 fd74, fd50, 0d3FEF329C0558E969, fd73; +sub.f64 fd75, fd70, fd74; +add.f64 fd76, fd74, fd70; +fma.rn.f64 fd77, fd35, 0d3FE3F3A0E28BEDD1, %17; +mul.f64 fd78, fd37, 0d3FCC7B90E3024582; +sub.f64 fd79, fd77, fd78; +mul.f64 fd80, fd39, 0d3FECD4BCA9CB5C71; +sub.f64 fd81, fd79, fd80; +sub.f64 fd82, %18, %32; +mul.f64 fd83, fd82, 0d3FE904C37505DE4B; +sub.f64 fd84, %21, %29; +fma.rn.f64 fd85, fd84, 0d3FEF329C0558E969, fd83; +sub.f64 fd86, %24, %26; +fma.rn.f64 fd87, fd86, 0d3FDBC4C04D71ABC1, fd85; +add.f64 fd88, fd87, fd81; +sub.f64 fd89, fd81, fd87; +mul.f64 fd90, fd35, 0d3FCC7B90E3024582; 
+sub.f64 fd91, %17, fd90; +mul.f64 fd92, fd37, 0d3FECD4BCA9CB5C71; +sub.f64 fd93, fd91, fd92; +fma.rn.f64 fd94, fd39, 0d3FE3F3A0E28BEDD1, fd93; +mul.f64 fd95, fd82, 0d3FEF329C0558E969; +mul.f64 fd96, fd84, 0d3FDBC4C04D71ABC1; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd86, 0d3FE904C37505DE4B; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd99, fd94; +sub.f64 fd101, fd94, fd99; +mul.f64 fd102, fd35, 0d3FECD4BCA9CB5C71; +sub.f64 fd103, %17, fd102; +fma.rn.f64 fd104, fd37, 0d3FE3F3A0E28BEDD1, fd103; +mul.f64 fd105, fd39, 0d3FCC7B90E3024582; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd82, 0d3FDBC4C04D71ABC1; +mul.f64 fd108, fd84, 0d3FE904C37505DE4B; +sub.f64 fd109, fd107, fd108; +fma.rn.f64 fd110, fd86, 0d3FEF329C0558E969, fd109; +add.f64 fd111, fd110, fd106; +sub.f64 fd112, fd106, fd110; +mul.wide.u32 rd2, r4, 613566757; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 2; +mul.lo.s32 r10, r9, 7; +sub.s32 r11, r4, r10; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %15; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd113, fd114}, [rd6]; +mul.f64 fd117, fd88, fd114; +fma.rn.f64 fd118, fd113, fd52, fd117; +mul.f64 fd119, fd52, fd114; +mul.f64 fd120, fd113, fd88; +sub.f64 fd121, fd120, fd119; +mul.f64 fd122, fd113, fd113; +mul.f64 fd123, fd114, fd114; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd114, fd113; +fma.rn.f64 fd126, fd114, fd113, fd125; +mul.f64 fd127, fd100, fd126; +fma.rn.f64 fd128, fd124, fd64, fd127; +mul.f64 fd129, fd64, fd126; +mul.f64 fd130, fd124, fd100; +sub.f64 fd131, fd130, fd129; +mul.f64 fd132, fd113, fd124; +mul.f64 fd133, fd114, fd126; +sub.f64 fd134, fd132, fd133; +mul.f64 fd135, fd113, fd126; +fma.rn.f64 fd136, fd114, fd124, fd135; +mul.f64 fd137, fd111, fd136; +fma.rn.f64 fd138, fd134, fd75, fd137; +mul.f64 fd139, fd75, fd136; +mul.f64 fd140, fd134, fd111; +sub.f64 fd141, fd140, fd139; +ld.global.v2.f64 {fd142, fd143}, [rd6+112]; +mul.f64 fd146, fd112, fd143; +fma.rn.f64 
fd147, fd142, fd76, fd146; +mul.f64 fd148, fd76, fd143; +mul.f64 fd149, fd142, fd112; +sub.f64 fd150, fd149, fd148; +mul.f64 fd151, fd113, fd142; +mul.f64 fd152, fd114, fd143; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd113, fd143; +fma.rn.f64 fd155, fd114, fd142, fd154; +mul.f64 fd156, fd101, fd155; +fma.rn.f64 fd157, fd153, fd65, fd156; +mul.f64 fd158, fd65, fd155; +mul.f64 fd159, fd153, fd101; +sub.f64 fd160, fd159, fd158; +mul.f64 fd161, fd113, fd153; +mul.f64 fd162, fd114, fd155; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd113, fd155; +fma.rn.f64 fd165, fd114, fd153, fd164; +mul.f64 fd166, fd89, fd165; +fma.rn.f64 fd167, fd163, fd53, fd166; +mul.f64 fd168, fd53, fd165; +mul.f64 fd169, fd163, fd89; +sub.f64 fd170, fd169, fd168; +mad.lo.s32 r12, r9, 392, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 56, r12; +st.shared.f64 [r13], fd34; +st.shared.f64 [r13+8], fd118; +st.shared.f64 [r13+16], fd128; +st.shared.f64 [r13+24], fd138; +st.shared.f64 [r13+32], fd147; +st.shared.f64 [r13+40], fd157; +st.shared.f64 [r13+48], fd167; +barrier.sync 0; +mad.lo.s32 r14, r11, -48, r13; +ld.shared.f64 fd171, [r14]; +ld.shared.f64 fd172, [r14+56]; +ld.shared.f64 fd173, [r14+112]; +ld.shared.f64 fd174, [r14+168]; +ld.shared.f64 fd175, [r14+224]; +ld.shared.f64 fd176, [r14+280]; +ld.shared.f64 fd177, [r14+336]; +barrier.sync 0; +st.shared.f64 [r13], fd40; +st.shared.f64 [r13+8], fd121; +st.shared.f64 [r13+16], fd131; +st.shared.f64 [r13+24], fd141; +st.shared.f64 [r13+32], fd150; +st.shared.f64 [r13+40], fd160; +st.shared.f64 [r13+48], fd170; +barrier.sync 0; +ld.shared.f64 fd178, [r14]; +ld.shared.f64 fd179, [r14+56]; +ld.shared.f64 fd180, [r14+112]; +ld.shared.f64 fd181, [r14+168]; +ld.shared.f64 fd182, [r14+224]; +ld.shared.f64 fd183, [r14+280]; +ld.shared.f64 fd184, [r14+336]; +add.f64 fd185, fd172, fd177; +add.f64 fd186, fd171, fd185; +add.f64 fd187, fd173, fd176; +add.f64 fd188, fd187, fd186; +add.f64 fd189, fd174, fd175; +add.f64 fd190, fd179, fd184; +add.f64 fd191, 
fd178, fd190; +add.f64 fd192, fd180, fd183; +add.f64 fd193, fd192, fd191; +add.f64 fd194, fd181, fd182; +fma.rn.f64 fd195, fd185, 0d3FE3F3A0E28BEDD1, fd171; +mul.f64 fd196, fd187, 0d3FCC7B90E3024582; +sub.f64 fd197, fd195, fd196; +mul.f64 fd198, fd189, 0d3FECD4BCA9CB5C71; +sub.f64 fd199, fd197, fd198; +sub.f64 fd200, fd179, fd184; +mul.f64 fd201, fd200, 0d3FE904C37505DE4B; +sub.f64 fd202, fd180, fd183; +fma.rn.f64 fd203, fd202, 0d3FEF329C0558E969, fd201; +sub.f64 fd204, fd181, fd182; +fma.rn.f64 fd205, fd204, 0d3FDBC4C04D71ABC1, fd203; +mul.f64 fd206, fd185, 0d3FCC7B90E3024582; +sub.f64 fd207, fd171, fd206; +mul.f64 fd208, fd187, 0d3FECD4BCA9CB5C71; +sub.f64 fd209, fd207, fd208; +fma.rn.f64 fd210, fd189, 0d3FE3F3A0E28BEDD1, fd209; +mul.f64 fd211, fd200, 0d3FEF329C0558E969; +mul.f64 fd212, fd202, 0d3FDBC4C04D71ABC1; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd204, 0d3FE904C37505DE4B; +sub.f64 fd215, fd213, fd214; +mul.f64 fd216, fd185, 0d3FECD4BCA9CB5C71; +sub.f64 fd217, fd171, fd216; +fma.rn.f64 fd218, fd187, 0d3FE3F3A0E28BEDD1, fd217; +mul.f64 fd219, fd189, 0d3FCC7B90E3024582; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd200, 0d3FDBC4C04D71ABC1; +mul.f64 fd222, fd202, 0d3FE904C37505DE4B; +sub.f64 fd223, fd221, fd222; +fma.rn.f64 fd224, fd204, 0d3FEF329C0558E969, fd223; +fma.rn.f64 fd225, fd190, 0d3FE3F3A0E28BEDD1, fd178; +mul.f64 fd226, fd192, 0d3FCC7B90E3024582; +sub.f64 fd227, fd225, fd226; +mul.f64 fd228, fd194, 0d3FECD4BCA9CB5C71; +sub.f64 fd229, fd227, fd228; +sub.f64 fd230, fd172, fd177; +mul.f64 fd231, fd230, 0d3FE904C37505DE4B; +sub.f64 fd232, fd173, fd176; +fma.rn.f64 fd233, fd232, 0d3FEF329C0558E969, fd231; +sub.f64 fd234, fd174, fd175; +fma.rn.f64 fd235, fd234, 0d3FDBC4C04D71ABC1, fd233; +mul.f64 fd236, fd190, 0d3FCC7B90E3024582; +sub.f64 fd237, fd178, fd236; +mul.f64 fd238, fd192, 0d3FECD4BCA9CB5C71; +sub.f64 fd239, fd237, fd238; +fma.rn.f64 fd240, fd194, 0d3FE3F3A0E28BEDD1, fd239; +mul.f64 fd241, fd230, 0d3FEF329C0558E969; +mul.f64 fd242, 
fd232, 0d3FDBC4C04D71ABC1; +sub.f64 fd243, fd241, fd242; +mul.f64 fd244, fd234, 0d3FE904C37505DE4B; +sub.f64 fd245, fd243, fd244; +mul.f64 fd246, fd190, 0d3FECD4BCA9CB5C71; +sub.f64 fd247, fd178, fd246; +fma.rn.f64 fd248, fd192, 0d3FE3F3A0E28BEDD1, fd247; +mul.f64 fd249, fd194, 0d3FCC7B90E3024582; +sub.f64 fd250, fd248, fd249; +mul.f64 fd251, fd230, 0d3FDBC4C04D71ABC1; +mul.f64 fd252, fd232, 0d3FE904C37505DE4B; +sub.f64 fd253, fd251, fd252; +fma.rn.f64 fd254, fd234, 0d3FEF329C0558E969, fd253; +add.f64 %0, fd189, fd188; +add.f64 %1, fd194, fd193; +add.f64 %3, fd235, fd229; +sub.f64 %2, fd199, fd205; +sub.f64 %4, fd210, fd215; +add.f64 %5, fd245, fd240; +sub.f64 %6, fd220, fd224; +add.f64 %7, fd254, fd250; +add.f64 %8, fd224, fd220; +sub.f64 %9, fd250, fd254; +add.f64 %10, fd215, fd210; +sub.f64 %11, fd240, fd245; +sub.f64 %13, fd229, fd235; +add.f64 %12, fd205, fd199; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "r"(smem), "l"(lut_dp_7_49), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..493a16f54c005 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_fwd.hpp.inc @@ -0,0 +1,259 @@ +#ifndef CUFFTDX_FFT_4_FP16_FWD_PTX_HPP +#define 
CUFFTDX_FFT_4_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<763, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<51>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %8, %9; +} +{ +add.f16x2 r4, %10, %11; +} +{ +sub.f16x2 r7, %8, %9; +} +{ +sub.f16x2 r10, %10, %11; +} +{ +add.f16x2 r13, %12, %13; +} +{ +add.f16x2 r16, %14, %15; +} +{ +sub.f16x2 r19, %12, %13; +} +{ +sub.f16x2 r22, %14, %15; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 %0, r1, r13; +} +{ +add.f16x2 %1, r4, r16; +} +{ +sub.f16x2 %4, r1, r13; +} +{ +sub.f16x2 %5, r4, r16; +} +{ +add.f16x2 %2, r7, r22; +} +{ +add.f16x2 %3, r10, r25; +} +{ +sub.f16x2 %6, r7, r22; +} +{ +sub.f16x2 %7, r10, r25; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<764, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<10>; +.reg .b32 r<79>; +.reg .b64 rd<2>; +mov.u32 r65, %tid.y; +shl.b32 r66, r65, 4; +mov.u32 r67, %4; +add.s32 r68, r67, r66; +mov.u32 r69, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r70, r69, 1; +shl.b32 r71, r69, 3; +and.b32 r72, r71, -16; +add.s32 r73, r68, r72; +cvt.rn.f32.u32 f7, r70; +mul.f32 f8, f7, 0f3FC90FDB; +cos.approx.f32 f1, f8; +sin.approx.f32 f9, f8; +neg.f32 f2, f9; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; 
+cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r74, r71, 8; +add.s32 r75, r73, r74; +st.shared.v2.f32 [r75], {r1, r25}; +barrier.sync 0; +shl.b32 r76, r69, 2; +and.b32 r77, r76, 4; +sub.s32 r78, r75, r77; +ld.shared.u32 r54, [r78]; +ld.shared.u32 r55, [r78+8]; +barrier.sync 0; +st.shared.v2.f32 [r75], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r78]; +ld.shared.u32 r58, [r78+8]; +{ +add.f16x2 %0, r54, r55; +} +{ +add.f16x2 %1, r57, r58; +} +{ +sub.f16x2 %2, r54, r55; +} +{ +sub.f16x2 %3, r57, r58; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<765, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<10>; +.reg .b32 r<79>; +.reg .b64 rd<2>; +mov.u32 r65, %tid.y; +shl.b32 r66, r65, 5; +mov.u32 r67, %4; +add.s32 r68, r67, r66; +mov.u32 r69, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r70, r69, 1; +shl.b32 r71, r69, 4; +and.b32 r72, r71, -32; +add.s32 r73, r68, r72; +cvt.rn.f32.u32 f7, r70; +mul.f32 f8, f7, 0f3FC90FDB; +cos.approx.f32 f1, f8; +sin.approx.f32 f9, f8; +neg.f32 f2, f9; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r74, r71, 16; +add.s32 r75, r73, r74; +st.shared.v2.f32 [r75], {r1, r4}; +st.shared.v2.f32 [r75+8], {r25, r32}; +barrier.sync 0; +shl.b32 r76, r69, 3; +and.b32 r77, r76, 8; +sub.s32 r78, r75, r77; +ld.shared.u32 r54, [r78]; +ld.shared.u32 r57, [r78+4]; +ld.shared.u32 r55, [r78+16]; +ld.shared.u32 r58, [r78+20]; +{ +add.f16x2 %0, r54, r55; +} +{ +add.f16x2 %1, r57, r58; +} +{ +sub.f16x2 %2, r54, r55; +} +{ +sub.f16x2 %3, r57, r58; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..82bd3a2830f22 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp16_inv.hpp.inc @@ -0,0 +1,259 @@ +#ifndef CUFFTDX_FFT_4_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_4_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<965, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<51>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %8, %9; +} +{ +add.f16x2 r4, %10, %11; +} +{ +sub.f16x2 r7, %8, %9; +} +{ +sub.f16x2 r10, %10, %11; +} +{ +add.f16x2 r13, %12, %13; +} +{ +add.f16x2 r16, %14, %15; +} +{ +sub.f16x2 r19, %12, %13; +} 
+{ +sub.f16x2 r22, %14, %15; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 %0, r1, r13; +} +{ +add.f16x2 %1, r4, r16; +} +{ +sub.f16x2 %4, r1, r13; +} +{ +sub.f16x2 %5, r4, r16; +} +{ +add.f16x2 %2, r7, r25; +} +{ +add.f16x2 %3, r10, r19; +} +{ +sub.f16x2 %6, r7, r25; +} +{ +sub.f16x2 %7, r10, r19; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<966, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<10>; +.reg .b32 r<79>; +.reg .b64 rd<2>; +mov.u32 r65, %tid.y; +shl.b32 r66, r65, 4; +mov.u32 r67, %4; +add.s32 r68, r67, r66; +mov.u32 r69, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r70, r69, 1; +shl.b32 r71, r69, 3; +and.b32 r72, r71, -16; +add.s32 r73, r68, r72; +cvt.rn.f32.u32 f7, r70; +mul.f32 f8, f7, 0f3FC90FDB; +cos.approx.f32 f1, f8; +sin.approx.f32 f9, f8; +neg.f32 f2, f9; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r74, r71, 8; +add.s32 r75, r73, r74; 
+st.shared.v2.f32 [r75], {r1, r23}; +barrier.sync 0; +shl.b32 r76, r69, 2; +and.b32 r77, r76, 4; +sub.s32 r78, r75, r77; +ld.shared.u32 r54, [r78]; +ld.shared.u32 r55, [r78+8]; +barrier.sync 0; +st.shared.v2.f32 [r75], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r78]; +ld.shared.u32 r58, [r78+8]; +{ +add.f16x2 %0, r54, r55; +} +{ +add.f16x2 %1, r57, r58; +} +{ +sub.f16x2 %2, r54, r55; +} +{ +sub.f16x2 %3, r57, r58; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<967, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<10>; +.reg .b32 r<79>; +.reg .b64 rd<2>; +mov.u32 r65, %tid.y; +shl.b32 r66, r65, 5; +mov.u32 r67, %4; +add.s32 r68, r67, r66; +mov.u32 r69, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r70, r69, 1; +shl.b32 r71, r69, 4; +and.b32 r72, r71, -32; +add.s32 r73, r68, r72; +cvt.rn.f32.u32 f7, r70; +mul.f32 f8, f7, 0f3FC90FDB; +cos.approx.f32 f1, f8; +sin.approx.f32 f9, f8; +neg.f32 f2, f9; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r74, r71, 16; +add.s32 r75, r73, r74; +st.shared.v2.f32 [r75], {r1, r4}; +st.shared.v2.f32 [r75+8], {r23, r32}; +barrier.sync 0; +shl.b32 r76, r69, 3; +and.b32 r77, 
r76, 8; +sub.s32 r78, r75, r77; +ld.shared.u32 r54, [r78]; +ld.shared.u32 r57, [r78+4]; +ld.shared.u32 r55, [r78+16]; +ld.shared.u32 r58, [r78+20]; +{ +add.f16x2 %0, r54, r55; +} +{ +add.f16x2 %1, r57, r58; +} +{ +sub.f16x2 %2, r54, r55; +} +{ +sub.f16x2 %3, r57, r58; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..99a4589d69ee7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_fwd.hpp.inc @@ -0,0 +1,136 @@ +#ifndef CUFFTDX_FFT_4_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_4_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<17, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<33>; +.reg .b64 rd<2>; +add.f32 f17, %8, %13; +add.f32 f18, %9, %15; +sub.f32 f19, %8, %13; +sub.f32 f20, %9, %15; +add.f32 f21, %10, %16; +add.f32 f22, %12, %17; +sub.f32 f23, %10, %16; +sub.f32 f24, %12, %17; +add.f32 %1, f18, f22; +add.f32 %0, f17, f21; +sub.f32 %3, f20, f23; +add.f32 %2, f19, f24; +sub.f32 %5, f18, f22; +sub.f32 %4, f17, f21; +add.f32 %7, f20, f23; +sub.f32 %6, f19, f24; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; 
+ + + + +template<> __forceinline__ __device__ void cufftdx_private_function<18, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<30>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 4; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %6, %8; +add.f32 f10, %7, %9; +sub.f32 f11, %6, %8; +sub.f32 f12, %7, %9; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -16; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 4; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+8]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+8]; +add.f32 %0, f22, f23; +add.f32 %1, f24, f25; +sub.f32 %2, f22, f23; +sub.f32 %3, f24, f25; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<19, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<34>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %6, %8; +sub.f32 f10, %7, %9; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; 
+and.b32 r10, r6, 16; +add.s32 r11, r8, r10; +add.f32 f18, %7, %9; +add.f32 f19, %6, %8; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 8; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+16]; +add.f32 %1, f23, f27; +add.f32 %0, f22, f26; +sub.f32 %3, f23, f27; +sub.f32 %2, f22, f26; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..810b69f23e2ba --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp32_inv.hpp.inc @@ -0,0 +1,136 @@ +#ifndef CUFFTDX_FFT_4_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_4_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<219, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<33>; +.reg .b64 rd<2>; +add.f32 f17, %8, %13; +add.f32 f18, %9, %15; +sub.f32 f19, %8, %13; +sub.f32 f20, %9, %15; +add.f32 f21, %10, %16; +add.f32 f22, %12, %17; +sub.f32 f23, %10, %16; +sub.f32 f24, %12, %17; +add.f32 %1, f18, f22; +add.f32 %0, f17, f21; +add.f32 %3, f20, f23; +sub.f32 %2, f19, f24; +sub.f32 %5, f18, f22; +sub.f32 %4, f17, f21; +sub.f32 %7, f20, f23; +add.f32 %6, f19, f24; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), 
"f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<220, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<30>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 4; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %6, %8; +add.f32 f10, %7, %9; +sub.f32 f11, %6, %8; +sub.f32 f12, %7, %9; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -16; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 8; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 4; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+8]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+8]; +add.f32 %0, f22, f23; +add.f32 %1, f24, f25; +sub.f32 %2, f22, f23; +sub.f32 %3, f24, f25; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<221, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<34>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %6, %8; +sub.f32 f10, %7, %9; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; 
+mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 16; +add.s32 r11, r8, r10; +add.f32 f18, %7, %9; +add.f32 f19, %6, %8; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 8; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+16]; +add.f32 %1, f23, f27; +add.f32 %0, f22, f26; +sub.f32 %3, f23, f27; +sub.f32 %2, f22, f26; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..a3b032f2c3315 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_fwd.hpp.inc @@ -0,0 +1,136 @@ +#ifndef CUFFTDX_FFT_4_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_4_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<421, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<33>; +.reg .b64 rd<2>; +add.f64 fd17, %8, %13; +add.f64 fd18, %9, %15; +sub.f64 fd19, %8, %13; +sub.f64 fd20, %9, %15; +add.f64 fd21, %10, %16; +add.f64 fd22, %12, %17; +sub.f64 fd23, %10, %16; +sub.f64 fd24, %12, %17; +add.f64 %1, fd18, fd22; +add.f64 %0, fd17, fd21; +sub.f64 %3, fd20, fd23; +add.f64 %2, fd19, fd24; +sub.f64 %5, fd18, fd22; +sub.f64 %4, fd17, fd21; +add.f64 %7, fd20, fd23; +sub.f64 %6, fd19, fd24; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), 
"=d"(rmem[3].x), "=d"(rmem[3].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<422, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<30>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %6, %8; +add.f64 fd10, %7, %9; +sub.f64 fd11, %6, %8; +sub.f64 fd12, %7, %9; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 16; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 8; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+16]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+16]; +add.f64 %0, fd22, fd23; +add.f64 %1, fd24, fd25; +sub.f64 %2, fd22, fd23; +sub.f64 %3, fd24, fd25; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<423, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<34>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %6, %8; +sub.f64 fd10, %7, %9; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -64; 
+add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 32; +add.s32 r11, r8, r10; +add.f64 fd18, %7, %9; +add.f64 fd19, %6, %8; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 16; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+32]; +add.f64 %1, fd23, fd27; +add.f64 %0, fd22, fd26; +sub.f64 %3, fd23, fd27; +sub.f64 %2, fd22, fd26; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..d7050658b4df0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_4_fp64_inv.hpp.inc @@ -0,0 +1,136 @@ +#ifndef CUFFTDX_FFT_4_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_4_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<592, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<33>; +.reg .b64 rd<2>; +add.f64 fd17, %8, %13; +add.f64 fd18, %9, %15; +sub.f64 fd19, %8, %13; +sub.f64 fd20, %9, %15; +add.f64 fd21, %10, %16; +add.f64 fd22, %12, %17; +sub.f64 fd23, %10, %16; +sub.f64 fd24, %12, %17; +add.f64 %1, fd18, fd22; +add.f64 %0, fd17, fd21; +add.f64 %3, fd20, fd23; +sub.f64 %2, fd19, fd24; +sub.f64 %5, 
fd18, fd22; +sub.f64 %4, fd17, fd21; +sub.f64 %7, fd20, fd23; +add.f64 %6, fd19, fd24; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<593, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<30>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %6, %8; +add.f64 fd10, %7, %9; +sub.f64 fd11, %6, %8; +sub.f64 fd12, %7, %9; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 16; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 8; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+16]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+16]; +add.f64 %0, fd22, fd23; +add.f64 %1, fd24, fd25; +sub.f64 %2, fd22, fd23; +sub.f64 %3, fd24, fd25; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<594, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<34>; +.reg .b64 
rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %6, %8; +sub.f64 fd10, %7, %9; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -64; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 32; +add.s32 r11, r8, r10; +add.f64 fd18, %7, %9; +add.f64 fd19, %6, %8; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 16; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+32]; +add.f64 %1, fd23, fd27; +add.f64 %0, fd22, fd26; +sub.f64 %3, fd23, fd27; +sub.f64 %2, fd22, fd26; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..45bcb19603603 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_fwd.hpp.inc @@ -0,0 +1,22462 @@ +#ifndef CUFFTDX_FFT_512_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_512_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<817, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<127>; +.reg .b32 r<1133>; +.reg .b64 rd<2>; +mov.u32 r1113, %tid.y; +shl.b32 r1114, r1113, 12; +mov.u32 r1115, %16; 
+add.s32 r1116, r1115, r1114; +mov.u32 r1117, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 
r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1118, r1117, 63; +shl.b32 r1119, r1117, 6; +and.b32 r1120, r1119, -4096; +add.s32 r1121, r1116, r1120; +cvt.rn.f32.u32 f121, r1118; +mul.f32 f122, f121, 0f3C490FDB; +cos.approx.f32 f29, f122; +sin.approx.f32 f123, f122; +neg.f32 f30, f123; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ 
+neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1122, r1119, 4032; +add.s32 r1123, r1121, r1122; +st.shared.v4.f32 [r1123], {r149, r152, r209, r216}; +st.shared.v4.f32 [r1123+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1123+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1123+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1124, r1118, -56, r1123; +ld.shared.u32 r460, [r1124]; +ld.shared.u32 r463, [r1124+4]; +ld.shared.u32 r510, [r1124+512]; +ld.shared.u32 r513, [r1124+516]; +ld.shared.u32 r472, [r1124+1024]; +ld.shared.u32 r475, [r1124+1028]; +ld.shared.u32 r522, [r1124+1536]; +ld.shared.u32 r525, [r1124+1540]; +ld.shared.u32 r461, [r1124+2048]; +ld.shared.u32 r464, [r1124+2052]; +ld.shared.u32 r511, [r1124+2560]; +ld.shared.u32 r514, [r1124+2564]; +ld.shared.u32 r473, [r1124+3072]; +ld.shared.u32 r476, [r1124+3076]; +ld.shared.u32 r523, [r1124+3584]; +ld.shared.u32 r526, [r1124+3588]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, 
r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1125, r1117, 56; +bfe.u32 r1126, r1117, 3, 3; +cvt.rn.f32.u32 f124, r1126; +mul.f32 f125, f124, 0f3DC90FDB; +cos.approx.f32 f75, f125; +sin.approx.f32 f126, f125; +neg.f32 f76, f126; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 
r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; 
+cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1127, r1117, 3; +and.b32 r1128, r1127, 56; +add.s32 r1129, r1121, r1128; +barrier.sync 0; +and.b32 r1130, r1119, 3584; +add.s32 r1131, r1129, r1130; +st.shared.u32 [r1131], r607; +st.shared.u32 [r1131+4], r610; +st.shared.u32 [r1131+64], r667; +st.shared.u32 [r1131+68], r674; +st.shared.u32 [r1131+128], r704; +st.shared.u32 [r1131+132], r711; +st.shared.u32 [r1131+192], r741; +st.shared.u32 [r1131+196], r748; +st.shared.u32 [r1131+256], r778; +st.shared.u32 [r1131+260], r785; +st.shared.u32 [r1131+320], r815; +st.shared.u32 [r1131+324], r822; 
+st.shared.u32 [r1131+384], r852; +st.shared.u32 [r1131+388], r859; +st.shared.u32 [r1131+448], r889; +st.shared.u32 [r1131+452], r896; +barrier.sync 0; +mad.lo.s32 r1132, r1125, -56, r1131; +ld.shared.u32 r918, [r1132]; +ld.shared.u32 r921, [r1132+4]; +ld.shared.u32 r968, [r1132+512]; +ld.shared.u32 r971, [r1132+516]; +ld.shared.u32 r930, [r1132+1024]; +ld.shared.u32 r933, [r1132+1028]; +ld.shared.u32 r980, [r1132+1536]; +ld.shared.u32 r983, [r1132+1540]; +ld.shared.u32 r919, [r1132+2048]; +ld.shared.u32 r922, [r1132+2052]; +ld.shared.u32 r969, [r1132+2560]; +ld.shared.u32 r972, [r1132+2564]; +ld.shared.u32 r931, [r1132+3072]; +ld.shared.u32 r934, [r1132+3076]; +ld.shared.u32 r981, [r1132+3584]; +ld.shared.u32 r984, [r1132+3588]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, 
f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 %0, r943, r993; +} +{ +add.f16x2 %1, r946, r996; +} +{ +sub.f16x2 %8, r943, r993; +} +{ +sub.f16x2 %9, r946, r996; +} +{ +add.f16x2 %2, r955, r1037; +} +{ +add.f16x2 %3, r958, r1043; +} +{ +sub.f16x2 %10, r955, r1037; +} +{ +sub.f16x2 %11, r958, r1043; +} +{ +add.f16x2 %4, r949, r1002; +} +{ +add.f16x2 %5, r952, r1047; +} +{ +sub.f16x2 %12, r949, r1002; +} +{ +sub.f16x2 %13, r952, r1047; +} +{ +add.f16x2 %6, r961, r1055; +} +{ +add.f16x2 %7, r964, r1061; +} +{ +sub.f16x2 %14, r961, r1055; +} +{ +sub.f16x2 %15, r964, r1061; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), 
"r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<818, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<127>; +.reg .b32 r<1133>; +.reg .b64 rd<2>; +mov.u32 r1113, %tid.y; +shl.b32 r1114, r1113, 11; +mov.u32 r1115, %16; +add.s32 r1116, r1115, r1114; +mov.u32 r1117, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f94, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r101, {low, high}; +} +mov.f32 f104, 0fBF3504F3; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f90, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1118, r1117, 63; +shl.b32 r1119, r1117, 5; +and.b32 r1120, r1119, -2048; +add.s32 r1121, r1116, r1120; +cvt.rn.f32.u32 f121, r1118; +mul.f32 f122, f121, 0f3C490FDB; +cos.approx.f32 f29, f122; +sin.approx.f32 f123, f122; +neg.f32 f30, f123; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 
r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} 
+{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ 
+mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1122, r1119, 2016; +add.s32 r1123, r1121, r1122; +st.shared.v4.f32 [r1123], {r149, r209, r246, r283}; +st.shared.v4.f32 [r1123+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1124, r1118, -28, r1123; +ld.shared.u32 r460, [r1124]; +ld.shared.u32 r510, [r1124+256]; +ld.shared.u32 r472, [r1124+512]; +ld.shared.u32 r522, [r1124+768]; +ld.shared.u32 r461, [r1124+1024]; +ld.shared.u32 r511, [r1124+1280]; +ld.shared.u32 r473, [r1124+1536]; +ld.shared.u32 r523, [r1124+1792]; +barrier.sync 0; +st.shared.v4.f32 [r1123], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1123+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1124]; +ld.shared.u32 r513, [r1124+256]; +ld.shared.u32 r475, [r1124+512]; +ld.shared.u32 r525, [r1124+768]; +ld.shared.u32 r464, [r1124+1024]; +ld.shared.u32 r514, [r1124+1280]; +ld.shared.u32 r476, [r1124+1536]; +ld.shared.u32 r526, [r1124+1792]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, 
r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ 
+add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1125, r1117, 56; +bfe.u32 r1126, r1117, 3, 3; +shl.b32 r1127, r1117, 2; +and.b32 r1128, r1127, 28; +add.s32 r1129, r1121, r1128; +cvt.rn.f32.u32 f124, r1126; +mul.f32 f125, f124, 0f3DC90FDB; +cos.approx.f32 f75, f125; +sin.approx.f32 f126, f125; +neg.f32 f76, f126; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, 
r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; 
+mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1130, r1119, 1792; +add.s32 r1131, r1129, r1130; +st.shared.u32 [r1131], r607; +st.shared.u32 [r1131+32], r667; +st.shared.u32 [r1131+64], r704; +st.shared.u32 [r1131+96], r741; +st.shared.u32 [r1131+128], r778; +st.shared.u32 [r1131+160], r815; +st.shared.u32 [r1131+192], r852; +st.shared.u32 [r1131+224], r889; +barrier.sync 0; +mad.lo.s32 r1132, r1125, -28, r1131; +ld.shared.u32 r918, [r1132]; +ld.shared.u32 r968, [r1132+256]; +ld.shared.u32 r930, [r1132+512]; +ld.shared.u32 r980, [r1132+768]; +ld.shared.u32 r919, [r1132+1024]; +ld.shared.u32 r969, [r1132+1280]; +ld.shared.u32 r931, [r1132+1536]; +ld.shared.u32 r981, [r1132+1792]; +barrier.sync 0; +st.shared.u32 [r1131], r610; +st.shared.u32 [r1131+32], r674; +st.shared.u32 [r1131+64], r711; +st.shared.u32 [r1131+96], r748; +st.shared.u32 [r1131+128], r785; +st.shared.u32 [r1131+160], r822; +st.shared.u32 [r1131+192], r859; +st.shared.u32 [r1131+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1132]; +ld.shared.u32 r971, [r1132+256]; +ld.shared.u32 r933, [r1132+512]; +ld.shared.u32 r983, [r1132+768]; +ld.shared.u32 r922, [r1132+1024]; +ld.shared.u32 r972, [r1132+1280]; +ld.shared.u32 r934, [r1132+1536]; +ld.shared.u32 r984, [r1132+1792]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, 
r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 %0, r943, r993; +} +{ +add.f16x2 %1, r946, r996; +} +{ +sub.f16x2 %8, r943, r993; +} +{ +sub.f16x2 %9, r946, r996; +} +{ +add.f16x2 %2, r955, r1037; +} +{ +add.f16x2 %3, r958, r1043; +} +{ +sub.f16x2 %10, r955, r1037; +} +{ +sub.f16x2 %11, r958, r1043; +} +{ +add.f16x2 %4, r949, r1002; +} +{ +add.f16x2 %5, r952, r1047; +} +{ +sub.f16x2 %12, r949, r1002; +} +{ +sub.f16x2 %13, r952, r1047; +} +{ +add.f16x2 %6, r961, r1055; +} +{ 
+add.f16x2 %7, r964, r1061; +} +{ +sub.f16x2 %14, r961, r1055; +} +{ +sub.f16x2 %15, r964, r1061; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<820, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<660>; +.reg .b32 r<4171>; +.reg .b64 rd<3>; +mov.u32 r4095, %tid.y; +shl.b32 r4096, r4095, 12; +mov.u32 r4097, %64; +add.s32 r4098, r4097, r4096; +mov.u32 r4099, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} 
+{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f600, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r101, {low, high}; +} +mov.f32 f618, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; 
+} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; 
+} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f596, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r393, {low, high}; +} +mov.f32 f622, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r396, {low, high}; +} +mov.f32 f604, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r397, {low, high}; +} +mov.f32 f620, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r406, {low, high}; 
+} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, 
r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, 
r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 
r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 
r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, 
f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ 
+sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ 
+sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4100, r4099, 15; +shl.b32 r4101, r4099, 8; +and.b32 r4102, r4101, -4096; +add.s32 r4103, r4098, r4102; +cvt.rn.f32.u32 f655, r4100; +mul.f32 f656, f655, 0f3C490FDB; +cos.approx.f32 f357, f656; +sin.approx.f32 f657, f656; +neg.f32 f358, f657; +mov.f32 f659, 0fBF800000; +mov.f32 f658, 
0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1962, {low, high}; 
+} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 
r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} 
+{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ 
+neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2813, 
{low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r4104, r4101, 3840; +add.s32 r4105, r4103, r4104; +st.shared.v4.f32 [r4105], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r4105+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r4105+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r4105+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r4105+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r4105+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r4105+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r4105+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r4105+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r4105+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r4105+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r4105+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r4105+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r4105+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r4105+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r4105+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r4106, r4100, -248, r4105; +ld.shared.u32 r2864, [r4106]; +ld.shared.u32 r2867, [r4106+4]; +ld.shared.u32 r3480, [r4106+128]; +ld.shared.u32 r3483, [r4106+132]; +ld.shared.u32 r3060, [r4106+256]; +ld.shared.u32 r3063, [r4106+260]; +ld.shared.u32 r3676, [r4106+384]; +ld.shared.u32 r3679, [r4106+388]; +ld.shared.u32 r2914, 
[r4106+512]; +ld.shared.u32 r2917, [r4106+516]; +ld.shared.u32 r3530, [r4106+640]; +ld.shared.u32 r3533, [r4106+644]; +ld.shared.u32 r3110, [r4106+768]; +ld.shared.u32 r3113, [r4106+772]; +ld.shared.u32 r3726, [r4106+896]; +ld.shared.u32 r3729, [r4106+900]; +ld.shared.u32 r2876, [r4106+1024]; +ld.shared.u32 r2879, [r4106+1028]; +ld.shared.u32 r3492, [r4106+1152]; +ld.shared.u32 r3495, [r4106+1156]; +ld.shared.u32 r3072, [r4106+1280]; +ld.shared.u32 r3075, [r4106+1284]; +ld.shared.u32 r3688, [r4106+1408]; +ld.shared.u32 r3691, [r4106+1412]; +ld.shared.u32 r2926, [r4106+1536]; +ld.shared.u32 r2929, [r4106+1540]; +ld.shared.u32 r3542, [r4106+1664]; +ld.shared.u32 r3545, [r4106+1668]; +ld.shared.u32 r3122, [r4106+1792]; +ld.shared.u32 r3125, [r4106+1796]; +ld.shared.u32 r3738, [r4106+1920]; +ld.shared.u32 r3741, [r4106+1924]; +ld.shared.u32 r2865, [r4106+2048]; +ld.shared.u32 r2868, [r4106+2052]; +ld.shared.u32 r3481, [r4106+2176]; +ld.shared.u32 r3484, [r4106+2180]; +ld.shared.u32 r3061, [r4106+2304]; +ld.shared.u32 r3064, [r4106+2308]; +ld.shared.u32 r3677, [r4106+2432]; +ld.shared.u32 r3680, [r4106+2436]; +ld.shared.u32 r2915, [r4106+2560]; +ld.shared.u32 r2918, [r4106+2564]; +ld.shared.u32 r3531, [r4106+2688]; +ld.shared.u32 r3534, [r4106+2692]; +ld.shared.u32 r3111, [r4106+2816]; +ld.shared.u32 r3114, [r4106+2820]; +ld.shared.u32 r3727, [r4106+2944]; +ld.shared.u32 r3730, [r4106+2948]; +ld.shared.u32 r2877, [r4106+3072]; +ld.shared.u32 r2880, [r4106+3076]; +ld.shared.u32 r3493, [r4106+3200]; +ld.shared.u32 r3496, [r4106+3204]; +ld.shared.u32 r3073, [r4106+3328]; +ld.shared.u32 r3076, [r4106+3332]; +ld.shared.u32 r3689, [r4106+3456]; +ld.shared.u32 r3692, [r4106+3460]; +ld.shared.u32 r2927, [r4106+3584]; +ld.shared.u32 r2930, [r4106+3588]; +ld.shared.u32 r3543, [r4106+3712]; +ld.shared.u32 r3546, [r4106+3716]; +ld.shared.u32 r3123, [r4106+3840]; +ld.shared.u32 r3126, [r4106+3844]; +ld.shared.u32 r3739, [r4106+3968]; +ld.shared.u32 r3742, [r4106+3972]; +{ +add.f16x2 
r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 
r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ 
+sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, 
r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 %0, r3011, r3207; +} +{ +add.f16x2 %1, r3014, r3210; +} +{ +sub.f16x2 %32, r3011, r3207; +} +{ +sub.f16x2 %33, r3014, r3210; +} +{ +add.f16x2 %4, r3023, r3291; +} +{ +add.f16x2 %5, r3026, r3297; +} +{ +sub.f16x2 %36, r3023, r3291; +} +{ +sub.f16x2 %37, r3026, r3297; +} +{ +add.f16x2 %8, r3035, r3307; +} +{ +add.f16x2 %9, r3038, r3313; +} +{ +sub.f16x2 %40, r3035, r3307; +} +{ +sub.f16x2 %41, r3038, r3313; +} +{ +add.f16x2 %12, r3047, r3323; +} +{ +add.f16x2 %13, r3050, r3329; +} +{ +sub.f16x2 %44, r3047, r3323; +} +{ +sub.f16x2 %45, r3050, r3329; +} +{ +add.f16x2 %16, r3017, r3216; +} +{ +add.f16x2 %17, r3020, r3333; +} +{ +sub.f16x2 %48, r3017, r3216; +} +{ +sub.f16x2 %49, r3020, r3333; +} +{ +add.f16x2 %20, r3029, r3341; +} +{ +add.f16x2 %21, r3032, r3347; +} +{ +sub.f16x2 %52, r3029, r3341; +} +{ +sub.f16x2 %53, r3032, r3347; +} +{ +add.f16x2 %24, r3041, r3357; +} +{ +add.f16x2 %25, r3044, r3363; +} +{ +sub.f16x2 %56, r3041, r3357; +} +{ +sub.f16x2 %57, r3044, r3363; +} +{ +add.f16x2 %28, r3053, r3373; +} +{ +add.f16x2 %29, r3056, r3379; +} +{ +sub.f16x2 %60, r3053, r3373; +} +{ +sub.f16x2 %61, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; 
+} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, 
r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, 
r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 
r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 %2, r3627, r3823; +} +{ +add.f16x2 %3, r3630, r3826; +} +{ +sub.f16x2 %34, r3627, r3823; +} +{ +sub.f16x2 %35, r3630, r3826; +} +{ +add.f16x2 %6, r3639, r3907; +} +{ +add.f16x2 %7, r3642, r3913; +} +{ +sub.f16x2 %38, r3639, r3907; +} +{ +sub.f16x2 %39, r3642, r3913; +} +{ +add.f16x2 %10, r3651, r3923; +} +{ +add.f16x2 %11, r3654, r3929; +} +{ +sub.f16x2 %42, r3651, r3923; +} +{ +sub.f16x2 %43, r3654, r3929; +} +{ +add.f16x2 %14, r3663, r3939; +} +{ +add.f16x2 %15, r3666, r3945; +} +{ +sub.f16x2 %46, r3663, r3939; +} +{ +sub.f16x2 %47, r3666, r3945; +} +{ +add.f16x2 %18, r3633, r3832; +} +{ +add.f16x2 %19, r3636, 
r3949; +} +{ +sub.f16x2 %50, r3633, r3832; +} +{ +sub.f16x2 %51, r3636, r3949; +} +{ +add.f16x2 %22, r3645, r3957; +} +{ +add.f16x2 %23, r3648, r3963; +} +{ +sub.f16x2 %54, r3645, r3957; +} +{ +sub.f16x2 %55, r3648, r3963; +} +{ +add.f16x2 %26, r3657, r3973; +} +{ +add.f16x2 %27, r3660, r3979; +} +{ +sub.f16x2 %58, r3657, r3973; +} +{ +sub.f16x2 %59, r3660, r3979; +} +{ +add.f16x2 %30, r3669, r3989; +} +{ +add.f16x2 %31, r3672, r3995; +} +{ +sub.f16x2 %62, r3669, r3989; +} +{ +sub.f16x2 %63, r3672, r3995; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), 
"=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), 
"r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<819, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2465>; +.reg .b64 rd<2>; +mov.u32 r2445, %tid.y; +shl.b32 r2446, r2445, 11; +mov.u32 r2447, %32; +add.s32 r2448, r2447, r2446; +mov.u32 r2449, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; 
+} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, {low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, 
r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ 
+add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, 
r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2450, r2449, 31; +shl.b32 r2451, r2449, 6; +and.b32 r2452, r2451, -2048; +add.s32 r2453, r2448, r2452; +cvt.rn.f32.u32 f301, r2450; +mul.f32 f302, f301, 
0f3C490FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; 
+mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 
r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; 
+} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; 
+mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2454, r2451, 1984; +add.s32 r2455, r2453, r2454; +st.shared.v4.f32 [r2455], {r521, r629, r666, r703}; +st.shared.v4.f32 [r2455+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r2455+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r2455+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r2456, r2450, -60, r2455; +ld.shared.u32 r1176, [r2456]; +ld.shared.u32 r1372, [r2456+128]; +ld.shared.u32 r1226, [r2456+256]; +ld.shared.u32 r1422, [r2456+384]; +ld.shared.u32 r1188, [r2456+512]; +ld.shared.u32 r1384, [r2456+640]; +ld.shared.u32 r1238, [r2456+768]; +ld.shared.u32 r1434, [r2456+896]; +ld.shared.u32 r1177, [r2456+1024]; +ld.shared.u32 r1373, [r2456+1152]; +ld.shared.u32 r1227, [r2456+1280]; +ld.shared.u32 r1423, [r2456+1408]; +ld.shared.u32 r1189, [r2456+1536]; +ld.shared.u32 r1385, [r2456+1664]; +ld.shared.u32 r1239, [r2456+1792]; +ld.shared.u32 r1435, [r2456+1920]; +barrier.sync 0; +st.shared.v4.f32 [r2455], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2455+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2455+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2455+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2456]; +ld.shared.u32 r1375, [r2456+128]; +ld.shared.u32 r1229, [r2456+256]; +ld.shared.u32 r1425, [r2456+384]; +ld.shared.u32 r1191, [r2456+512]; +ld.shared.u32 r1387, [r2456+640]; +ld.shared.u32 r1241, [r2456+768]; +ld.shared.u32 r1437, [r2456+896]; +ld.shared.u32 r1180, [r2456+1024]; +ld.shared.u32 r1376, [r2456+1152]; +ld.shared.u32 r1230, [r2456+1280]; +ld.shared.u32 r1426, [r2456+1408]; +ld.shared.u32 r1192, [r2456+1536]; +ld.shared.u32 
r1388, [r2456+1664]; +ld.shared.u32 r1242, [r2456+1792]; +ld.shared.u32 r1438, [r2456+1920]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, 
r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ 
+add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, 
r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2457, r2449, 16; +bfe.u32 r2458, r2449, 4, 1; +shl.b32 r2459, r2449, 2; +and.b32 r2460, r2459, 60; +add.s32 r2461, r2453, r2460; +cvt.rn.f32.u32 f304, r2458; +mul.f32 f305, f304, 0f3E490FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ 
+mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; 
+mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, 
r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ 
+fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, 
r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r2462, r2451, 1024; +add.s32 r2463, r2461, r2462; +st.shared.u32 [r2463], r1695; +st.shared.u32 [r2463+64], r1803; +st.shared.u32 [r2463+128], r1840; +st.shared.u32 [r2463+192], r1877; +st.shared.u32 [r2463+256], r1914; +st.shared.u32 [r2463+320], r1951; +st.shared.u32 [r2463+384], r1988; +st.shared.u32 [r2463+448], r2025; +st.shared.u32 [r2463+512], r2062; +st.shared.u32 [r2463+576], r2099; +st.shared.u32 [r2463+640], r2136; +st.shared.u32 [r2463+704], r2173; +st.shared.u32 [r2463+768], r2210; +st.shared.u32 [r2463+832], r2247; +st.shared.u32 [r2463+896], r2284; +st.shared.u32 [r2463+960], r2321; +barrier.sync 0; +mad.lo.s32 r2464, r2457, -60, r2463; +ld.shared.u32 r2350, [r2464]; +ld.shared.u32 r2362, [r2464+128]; +ld.shared.u32 r2374, [r2464+256]; +ld.shared.u32 r2386, [r2464+384]; +ld.shared.u32 r2398, [r2464+512]; +ld.shared.u32 r2410, [r2464+640]; +ld.shared.u32 r2422, [r2464+768]; +ld.shared.u32 r2434, [r2464+896]; +ld.shared.u32 r2351, [r2464+1024]; +ld.shared.u32 r2363, [r2464+1152]; +ld.shared.u32 r2375, [r2464+1280]; +ld.shared.u32 r2387, [r2464+1408]; +ld.shared.u32 r2399, [r2464+1536]; +ld.shared.u32 r2411, [r2464+1664]; +ld.shared.u32 r2423, [r2464+1792]; +ld.shared.u32 r2435, [r2464+1920]; +barrier.sync 0; +st.shared.u32 [r2463], r1698; +st.shared.u32 [r2463+64], r1810; +st.shared.u32 [r2463+128], r1847; +st.shared.u32 [r2463+192], r1884; +st.shared.u32 [r2463+256], r1921; +st.shared.u32 [r2463+320], r1958; +st.shared.u32 [r2463+384], r1995; +st.shared.u32 [r2463+448], r2032; +st.shared.u32 [r2463+512], r2069; +st.shared.u32 
[r2463+576], r2106; +st.shared.u32 [r2463+640], r2143; +st.shared.u32 [r2463+704], r2180; +st.shared.u32 [r2463+768], r2217; +st.shared.u32 [r2463+832], r2254; +st.shared.u32 [r2463+896], r2291; +st.shared.u32 [r2463+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2464]; +ld.shared.u32 r2365, [r2464+128]; +ld.shared.u32 r2377, [r2464+256]; +ld.shared.u32 r2389, [r2464+384]; +ld.shared.u32 r2401, [r2464+512]; +ld.shared.u32 r2413, [r2464+640]; +ld.shared.u32 r2425, [r2464+768]; +ld.shared.u32 r2437, [r2464+896]; +ld.shared.u32 r2354, [r2464+1024]; +ld.shared.u32 r2366, [r2464+1152]; +ld.shared.u32 r2378, [r2464+1280]; +ld.shared.u32 r2390, [r2464+1408]; +ld.shared.u32 r2402, [r2464+1536]; +ld.shared.u32 r2414, [r2464+1664]; +ld.shared.u32 r2426, [r2464+1792]; +ld.shared.u32 r2438, [r2464+1920]; +{ +add.f16x2 %0, r2350, r2351; +} +{ +add.f16x2 %1, r2353, r2354; +} +{ +sub.f16x2 %16, r2350, r2351; +} +{ +sub.f16x2 %17, r2353, r2354; +} +{ +add.f16x2 %2, r2362, r2363; +} +{ +add.f16x2 %3, r2365, r2366; +} +{ +sub.f16x2 %18, r2362, r2363; +} +{ +sub.f16x2 %19, r2365, r2366; +} +{ +add.f16x2 %4, r2374, r2375; +} +{ +add.f16x2 %5, r2377, r2378; +} +{ +sub.f16x2 %20, r2374, r2375; +} +{ +sub.f16x2 %21, r2377, r2378; +} +{ +add.f16x2 %6, r2386, r2387; +} +{ +add.f16x2 %7, r2389, r2390; +} +{ +sub.f16x2 %22, r2386, r2387; +} +{ +sub.f16x2 %23, r2389, r2390; +} +{ +add.f16x2 %8, r2398, r2399; +} +{ +add.f16x2 %9, r2401, r2402; +} +{ +sub.f16x2 %24, r2398, r2399; +} +{ +sub.f16x2 %25, r2401, r2402; +} +{ +add.f16x2 %10, r2410, r2411; +} +{ +add.f16x2 %11, r2413, r2414; +} +{ +sub.f16x2 %26, r2410, r2411; +} +{ +sub.f16x2 %27, r2413, r2414; +} +{ +add.f16x2 %12, r2422, r2423; +} +{ +add.f16x2 %13, r2425, r2426; +} +{ +sub.f16x2 %28, r2422, r2423; +} +{ +sub.f16x2 %29, r2425, r2426; +} +{ +add.f16x2 %14, r2434, r2435; +} +{ +add.f16x2 %15, r2437, r2438; +} +{ +sub.f16x2 %30, r2434, r2435; +} +{ +sub.f16x2 %31, r2437, r2438; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), 
"r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<822, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2465>; +.reg .b64 rd<2>; +mov.u32 r2445, %tid.y; +shl.b32 r2446, r2445, 12; +mov.u32 r2447, %32; +add.s32 r2448, r2447, r2446; +mov.u32 r2449, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f212, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r101, {low, high}; +} +mov.f32 f230, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 
f298, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 
r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f208, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +mov.f32 
f216, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r397, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ 
+mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2450, r2449, 31; +shl.b32 r2451, r2449, 7; +and.b32 r2452, r2451, -4096; +add.s32 r2453, r2448, r2452; +cvt.rn.f32.u32 f301, r2450; +mul.f32 f302, f301, 0f3C490FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, 
{high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} 
+{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 
r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r2454, r2451, 3968; +add.s32 r2455, r2453, r2454; +st.shared.v4.f32 [r2455], {r521, r524, r629, r636}; +st.shared.v4.f32 [r2455+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r2455+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r2455+48], {r814, r821, r851, r858}; 
+st.shared.v4.f32 [r2455+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r2455+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r2455+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r2455+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r2456, r2450, -120, r2455; +ld.shared.u32 r1176, [r2456]; +ld.shared.u32 r1179, [r2456+4]; +ld.shared.u32 r1372, [r2456+256]; +ld.shared.u32 r1375, [r2456+260]; +ld.shared.u32 r1226, [r2456+512]; +ld.shared.u32 r1229, [r2456+516]; +ld.shared.u32 r1422, [r2456+768]; +ld.shared.u32 r1425, [r2456+772]; +ld.shared.u32 r1188, [r2456+1024]; +ld.shared.u32 r1191, [r2456+1028]; +ld.shared.u32 r1384, [r2456+1280]; +ld.shared.u32 r1387, [r2456+1284]; +ld.shared.u32 r1238, [r2456+1536]; +ld.shared.u32 r1241, [r2456+1540]; +ld.shared.u32 r1434, [r2456+1792]; +ld.shared.u32 r1437, [r2456+1796]; +ld.shared.u32 r1177, [r2456+2048]; +ld.shared.u32 r1180, [r2456+2052]; +ld.shared.u32 r1373, [r2456+2304]; +ld.shared.u32 r1376, [r2456+2308]; +ld.shared.u32 r1227, [r2456+2560]; +ld.shared.u32 r1230, [r2456+2564]; +ld.shared.u32 r1423, [r2456+2816]; +ld.shared.u32 r1426, [r2456+2820]; +ld.shared.u32 r1189, [r2456+3072]; +ld.shared.u32 r1192, [r2456+3076]; +ld.shared.u32 r1385, [r2456+3328]; +ld.shared.u32 r1388, [r2456+3332]; +ld.shared.u32 r1239, [r2456+3584]; +ld.shared.u32 r1242, [r2456+3588]; +ld.shared.u32 r1435, [r2456+3840]; +ld.shared.u32 r1438, [r2456+3844]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ 
+sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, 
r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; 
+} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f212; +cvt.rn.f16.f32 high, f212; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} 
+{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2457, r2449, 16; +bfe.u32 r2458, r2449, 4, 1; +cvt.rn.f32.u32 f304, r2458; +mul.f32 f305, f304, 0f3E490FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 
r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; 
+} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ 
+fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ 
+neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +shl.b32 r2459, r2449, 3; +and.b32 r2460, r2459, 120; +add.s32 r2461, r2453, r2460; +barrier.sync 0; +and.b32 r2462, r2451, 2048; +add.s32 r2463, r2461, r2462; +st.shared.u32 [r2463], r1695; +st.shared.u32 [r2463+4], r1698; +st.shared.u32 [r2463+128], r1803; +st.shared.u32 [r2463+132], r1810; +st.shared.u32 [r2463+256], r1840; +st.shared.u32 [r2463+260], 
r1847; +st.shared.u32 [r2463+384], r1877; +st.shared.u32 [r2463+388], r1884; +st.shared.u32 [r2463+512], r1914; +st.shared.u32 [r2463+516], r1921; +st.shared.u32 [r2463+640], r1951; +st.shared.u32 [r2463+644], r1958; +st.shared.u32 [r2463+768], r1988; +st.shared.u32 [r2463+772], r1995; +st.shared.u32 [r2463+896], r2025; +st.shared.u32 [r2463+900], r2032; +st.shared.u32 [r2463+1024], r2062; +st.shared.u32 [r2463+1028], r2069; +st.shared.u32 [r2463+1152], r2099; +st.shared.u32 [r2463+1156], r2106; +st.shared.u32 [r2463+1280], r2136; +st.shared.u32 [r2463+1284], r2143; +st.shared.u32 [r2463+1408], r2173; +st.shared.u32 [r2463+1412], r2180; +st.shared.u32 [r2463+1536], r2210; +st.shared.u32 [r2463+1540], r2217; +st.shared.u32 [r2463+1664], r2247; +st.shared.u32 [r2463+1668], r2254; +st.shared.u32 [r2463+1792], r2284; +st.shared.u32 [r2463+1796], r2291; +st.shared.u32 [r2463+1920], r2321; +st.shared.u32 [r2463+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2464, r2457, -120, r2463; +ld.shared.u32 r2350, [r2464]; +ld.shared.u32 r2353, [r2464+4]; +ld.shared.u32 r2362, [r2464+256]; +ld.shared.u32 r2365, [r2464+260]; +ld.shared.u32 r2374, [r2464+512]; +ld.shared.u32 r2377, [r2464+516]; +ld.shared.u32 r2386, [r2464+768]; +ld.shared.u32 r2389, [r2464+772]; +ld.shared.u32 r2398, [r2464+1024]; +ld.shared.u32 r2401, [r2464+1028]; +ld.shared.u32 r2410, [r2464+1280]; +ld.shared.u32 r2413, [r2464+1284]; +ld.shared.u32 r2422, [r2464+1536]; +ld.shared.u32 r2425, [r2464+1540]; +ld.shared.u32 r2434, [r2464+1792]; +ld.shared.u32 r2437, [r2464+1796]; +ld.shared.u32 r2351, [r2464+2048]; +ld.shared.u32 r2354, [r2464+2052]; +ld.shared.u32 r2363, [r2464+2304]; +ld.shared.u32 r2366, [r2464+2308]; +ld.shared.u32 r2375, [r2464+2560]; +ld.shared.u32 r2378, [r2464+2564]; +ld.shared.u32 r2387, [r2464+2816]; +ld.shared.u32 r2390, [r2464+2820]; +ld.shared.u32 r2399, [r2464+3072]; +ld.shared.u32 r2402, [r2464+3076]; +ld.shared.u32 r2411, [r2464+3328]; +ld.shared.u32 r2414, [r2464+3332]; +ld.shared.u32 
r2423, [r2464+3584]; +ld.shared.u32 r2426, [r2464+3588]; +ld.shared.u32 r2435, [r2464+3840]; +ld.shared.u32 r2438, [r2464+3844]; +{ +add.f16x2 %0, r2350, r2351; +} +{ +add.f16x2 %1, r2353, r2354; +} +{ +sub.f16x2 %16, r2350, r2351; +} +{ +sub.f16x2 %17, r2353, r2354; +} +{ +add.f16x2 %2, r2362, r2363; +} +{ +add.f16x2 %3, r2365, r2366; +} +{ +sub.f16x2 %18, r2362, r2363; +} +{ +sub.f16x2 %19, r2365, r2366; +} +{ +add.f16x2 %4, r2374, r2375; +} +{ +add.f16x2 %5, r2377, r2378; +} +{ +sub.f16x2 %20, r2374, r2375; +} +{ +sub.f16x2 %21, r2377, r2378; +} +{ +add.f16x2 %6, r2386, r2387; +} +{ +add.f16x2 %7, r2389, r2390; +} +{ +sub.f16x2 %22, r2386, r2387; +} +{ +sub.f16x2 %23, r2389, r2390; +} +{ +add.f16x2 %8, r2398, r2399; +} +{ +add.f16x2 %9, r2401, r2402; +} +{ +sub.f16x2 %24, r2398, r2399; +} +{ +sub.f16x2 %25, r2401, r2402; +} +{ +add.f16x2 %10, r2410, r2411; +} +{ +add.f16x2 %11, r2413, r2414; +} +{ +sub.f16x2 %26, r2410, r2411; +} +{ +sub.f16x2 %27, r2413, r2414; +} +{ +add.f16x2 %12, r2422, r2423; +} +{ +add.f16x2 %13, r2425, r2426; +} +{ +sub.f16x2 %28, r2422, r2423; +} +{ +sub.f16x2 %29, r2425, r2426; +} +{ +add.f16x2 %14, r2434, r2435; +} +{ +add.f16x2 %15, r2437, r2438; +} +{ +sub.f16x2 %30, r2434, r2435; +} +{ +sub.f16x2 %31, r2437, r2438; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), 
"=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<821, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<660>; +.reg .b32 r<4171>; +.reg .b64 rd<3>; +mov.u32 r4095, %tid.y; +shl.b32 r4096, r4095, 11; +mov.u32 r4097, %64; +add.s32 r4098, r4097, r4096; +mov.u32 r4099, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ 
+add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f600, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r101, {low, high}; +} +mov.f32 f618, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, 
r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; 
+} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f596, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r393, {low, high}; +} +mov.f32 f622, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r396, {low, high}; +} +mov.f32 f604, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r397, {low, high}; +} +mov.f32 f620, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r404, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 
r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ 
+sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; 
+cvt.rn.f16.f32 high, f618; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, 
r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 
f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; 
+mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 
r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 
r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4100, r4099, 15; +shl.b32 r4101, r4099, 7; +and.b32 r4102, r4101, -2048; 
+add.s32 r4103, r4098, r4102; +cvt.rn.f32.u32 f655, r4100; +mul.f32 f656, f655, 0f3C490FDB; +cos.approx.f32 f357, f656; +sin.approx.f32 f657, f656; +neg.f32 f358, f657; +mov.f32 f659, 0fBF800000; +mov.f32 f658, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; 
+mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ 
+mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; 
+mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, 
r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ 
+fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, 
r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; 
+} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 
r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r4104, r4101, 1920; +add.s32 r4105, r4103, r4104; +st.shared.v4.f32 [r4105], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r4105+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r4105+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r4105+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r4105+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r4105+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r4105+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r4105+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r4106, r4100, -124, r4105; +ld.shared.u32 r2864, [r4106]; +ld.shared.u32 r3480, [r4106+64]; +ld.shared.u32 r3060, [r4106+128]; +ld.shared.u32 r3676, [r4106+192]; +ld.shared.u32 r2914, [r4106+256]; +ld.shared.u32 r3530, [r4106+320]; +ld.shared.u32 r3110, [r4106+384]; +ld.shared.u32 r3726, [r4106+448]; +ld.shared.u32 r2876, [r4106+512]; +ld.shared.u32 r3492, [r4106+576]; +ld.shared.u32 r3072, [r4106+640]; +ld.shared.u32 r3688, [r4106+704]; +ld.shared.u32 r2926, [r4106+768]; +ld.shared.u32 r3542, [r4106+832]; +ld.shared.u32 r3122, [r4106+896]; +ld.shared.u32 r3738, [r4106+960]; 
+ld.shared.u32 r2865, [r4106+1024]; +ld.shared.u32 r3481, [r4106+1088]; +ld.shared.u32 r3061, [r4106+1152]; +ld.shared.u32 r3677, [r4106+1216]; +ld.shared.u32 r2915, [r4106+1280]; +ld.shared.u32 r3531, [r4106+1344]; +ld.shared.u32 r3111, [r4106+1408]; +ld.shared.u32 r3727, [r4106+1472]; +ld.shared.u32 r2877, [r4106+1536]; +ld.shared.u32 r3493, [r4106+1600]; +ld.shared.u32 r3073, [r4106+1664]; +ld.shared.u32 r3689, [r4106+1728]; +ld.shared.u32 r2927, [r4106+1792]; +ld.shared.u32 r3543, [r4106+1856]; +ld.shared.u32 r3123, [r4106+1920]; +ld.shared.u32 r3739, [r4106+1984]; +barrier.sync 0; +st.shared.v4.f32 [r4105], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r4105+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r4105+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r4105+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r4105+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r4105+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r4105+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r4105+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r4106]; +ld.shared.u32 r3483, [r4106+64]; +ld.shared.u32 r3063, [r4106+128]; +ld.shared.u32 r3679, [r4106+192]; +ld.shared.u32 r2917, [r4106+256]; +ld.shared.u32 r3533, [r4106+320]; +ld.shared.u32 r3113, [r4106+384]; +ld.shared.u32 r3729, [r4106+448]; +ld.shared.u32 r2879, [r4106+512]; +ld.shared.u32 r3495, [r4106+576]; +ld.shared.u32 r3075, [r4106+640]; +ld.shared.u32 r3691, [r4106+704]; +ld.shared.u32 r2929, [r4106+768]; +ld.shared.u32 r3545, [r4106+832]; +ld.shared.u32 r3125, [r4106+896]; +ld.shared.u32 r3741, [r4106+960]; +ld.shared.u32 r2868, [r4106+1024]; +ld.shared.u32 r3484, [r4106+1088]; +ld.shared.u32 r3064, [r4106+1152]; +ld.shared.u32 r3680, [r4106+1216]; +ld.shared.u32 r2918, [r4106+1280]; +ld.shared.u32 r3534, [r4106+1344]; +ld.shared.u32 r3114, [r4106+1408]; +ld.shared.u32 r3730, [r4106+1472]; +ld.shared.u32 r2880, [r4106+1536]; +ld.shared.u32 r3496, 
[r4106+1600]; +ld.shared.u32 r3076, [r4106+1664]; +ld.shared.u32 r3692, [r4106+1728]; +ld.shared.u32 r2930, [r4106+1792]; +ld.shared.u32 r3546, [r4106+1856]; +ld.shared.u32 r3126, [r4106+1920]; +ld.shared.u32 r3742, [r4106+1984]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ 
+mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, 
r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3256, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 
r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 %0, r3011, r3207; +} +{ +add.f16x2 %1, r3014, r3210; +} +{ +sub.f16x2 %32, r3011, r3207; +} +{ +sub.f16x2 %33, r3014, r3210; +} +{ +add.f16x2 %4, r3023, r3291; +} +{ +add.f16x2 %5, r3026, r3297; +} +{ +sub.f16x2 %36, r3023, r3291; +} +{ +sub.f16x2 %37, r3026, r3297; +} +{ +add.f16x2 %8, r3035, r3307; +} +{ +add.f16x2 %9, r3038, r3313; +} +{ +sub.f16x2 %40, r3035, r3307; +} +{ +sub.f16x2 %41, r3038, r3313; +} +{ +add.f16x2 %12, r3047, r3323; +} +{ +add.f16x2 %13, r3050, r3329; +} +{ +sub.f16x2 %44, r3047, r3323; +} +{ +sub.f16x2 %45, r3050, r3329; +} +{ +add.f16x2 %16, r3017, r3216; +} +{ +add.f16x2 %17, r3020, r3333; +} +{ +sub.f16x2 %48, r3017, r3216; +} +{ +sub.f16x2 %49, r3020, r3333; +} +{ +add.f16x2 %20, r3029, r3341; +} +{ +add.f16x2 %21, r3032, r3347; +} +{ +sub.f16x2 %52, r3029, r3341; +} +{ +sub.f16x2 %53, r3032, r3347; +} +{ +add.f16x2 %24, r3041, r3357; +} +{ +add.f16x2 %25, r3044, r3363; +} +{ +sub.f16x2 %56, r3041, r3357; +} +{ +sub.f16x2 %57, r3044, r3363; +} +{ +add.f16x2 %28, r3053, r3373; +} +{ +add.f16x2 %29, r3056, r3379; +} +{ +sub.f16x2 %60, r3053, r3373; +} +{ +sub.f16x2 %61, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 
r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} 
+{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3780, {low, high}; +} +{ 
+mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f600; +cvt.rn.f16.f32 high, f600; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f604; +cvt.rn.f16.f32 high, f604; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 
high, f618; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 %2, r3627, r3823; +} +{ +add.f16x2 %3, r3630, r3826; +} +{ +sub.f16x2 %34, r3627, r3823; +} +{ +sub.f16x2 %35, r3630, r3826; +} +{ +add.f16x2 %6, r3639, r3907; +} +{ +add.f16x2 %7, r3642, r3913; +} +{ +sub.f16x2 %38, r3639, r3907; +} +{ +sub.f16x2 %39, r3642, r3913; +} +{ +add.f16x2 %10, r3651, r3923; +} +{ +add.f16x2 %11, r3654, r3929; +} +{ +sub.f16x2 %42, r3651, r3923; 
+} +{ +sub.f16x2 %43, r3654, r3929; +} +{ +add.f16x2 %14, r3663, r3939; +} +{ +add.f16x2 %15, r3666, r3945; +} +{ +sub.f16x2 %46, r3663, r3939; +} +{ +sub.f16x2 %47, r3666, r3945; +} +{ +add.f16x2 %18, r3633, r3832; +} +{ +add.f16x2 %19, r3636, r3949; +} +{ +sub.f16x2 %50, r3633, r3832; +} +{ +sub.f16x2 %51, r3636, r3949; +} +{ +add.f16x2 %22, r3645, r3957; +} +{ +add.f16x2 %23, r3648, r3963; +} +{ +sub.f16x2 %54, r3645, r3957; +} +{ +sub.f16x2 %55, r3648, r3963; +} +{ +add.f16x2 %26, r3657, r3973; +} +{ +add.f16x2 %27, r3660, r3979; +} +{ +sub.f16x2 %58, r3657, r3973; +} +{ +sub.f16x2 %59, r3660, r3979; +} +{ +add.f16x2 %30, r3669, r3989; +} +{ +add.f16x2 %31, r3672, r3995; +} +{ +sub.f16x2 %62, r3669, r3989; +} +{ +sub.f16x2 %63, r3672, r3995; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), 
"=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), 
"r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<823, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<53>; +.reg .b32 r<715>; +.reg .b64 rd<2>; +mov.u32 r681, %tid.y; +shl.b32 r682, r681, 12; +mov.u32 r683, %8; +add.s32 r684, r683, r682; +mov.u32 r685, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r686, r685, 127; +shl.b32 r687, r685, 5; +and.b32 r688, r687, -4096; +add.s32 r689, r684, r688; +cvt.rn.f32.u32 f41, r686; +mul.f32 f42, f41, 0f3C490FDB; 
+cos.approx.f32 f1, f42; +sin.approx.f32 f43, f42; +neg.f32 f2, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f37, 0fBF800000; +mov.f32 f38, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} 
+{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r690, r687, 4064; +add.s32 r691, r689, r690; +st.shared.v4.f32 [r691], {r27, r30, r63, r70}; +st.shared.v4.f32 [r691+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r692, r686, -24, r691; +ld.shared.u32 r166, [r692]; +ld.shared.u32 r169, [r692+4]; +ld.shared.u32 r178, [r692+1024]; +ld.shared.u32 r181, [r692+1028]; +ld.shared.u32 r167, [r692+2048]; +ld.shared.u32 r170, [r692+2052]; +ld.shared.u32 r179, [r692+3072]; +ld.shared.u32 r182, [r692+3076]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r693, r685, 124; +bfe.u32 r694, r685, 2, 5; +cvt.rn.f32.u32 f44, r694; +mul.f32 f45, f44, 0f3D490FDB; +cos.approx.f32 f11, f45; +sin.approx.f32 f46, f45; +neg.f32 f12, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +shl.b32 r695, r685, 3; +and.b32 r696, r695, 24; +add.s32 r697, r689, r696; +barrier.sync 0; +and.b32 r698, r687, 3968; +add.s32 r699, r697, r698; +st.shared.u32 [r699], r191; +st.shared.u32 [r699+4], r194; +st.shared.u32 [r699+32], r227; +st.shared.u32 [r699+36], r234; +st.shared.u32 [r699+64], r264; +st.shared.u32 [r699+68], r271; +st.shared.u32 [r699+96], r301; +st.shared.u32 [r699+100], r308; +barrier.sync 0; +mad.lo.s32 
r700, r693, -24, r699; +ld.shared.u32 r330, [r700]; +ld.shared.u32 r333, [r700+4]; +ld.shared.u32 r342, [r700+1024]; +ld.shared.u32 r345, [r700+1028]; +ld.shared.u32 r331, [r700+2048]; +ld.shared.u32 r334, [r700+2052]; +ld.shared.u32 r343, [r700+3072]; +ld.shared.u32 r346, [r700+3076]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r701, r685, 112; +bfe.u32 r702, r685, 4, 3; +cvt.rn.f32.u32 f47, r702; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f21, f48; +sin.approx.f32 f49, f48; +neg.f32 f22, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} +and.b32 r703, r695, 120; +add.s32 r704, r689, r703; +barrier.sync 0; +and.b32 r705, r687, 3584; +add.s32 r706, r704, r705; +st.shared.u32 [r706], r355; +st.shared.u32 [r706+4], r358; +st.shared.u32 [r706+128], r391; +st.shared.u32 [r706+132], r398; +st.shared.u32 [r706+256], r428; +st.shared.u32 [r706+260], r435; +st.shared.u32 [r706+384], r465; +st.shared.u32 [r706+388], r472; +barrier.sync 0; +mad.lo.s32 r707, r701, -24, r706; +ld.shared.u32 r494, [r707]; +ld.shared.u32 r497, [r707+4]; +ld.shared.u32 r506, [r707+1024]; +ld.shared.u32 r509, [r707+1028]; +ld.shared.u32 r495, [r707+2048]; +ld.shared.u32 r498, [r707+2052]; +ld.shared.u32 r507, [r707+3072]; +ld.shared.u32 r510, [r707+3076]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 
r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r511; +} +{ +add.f16x2 r519, r493, r505; +} +{ +add.f16x2 r522, r496, r508; +} +{ +sub.f16x2 r525, r493, r505; +} +{ +sub.f16x2 r528, r496, r508; +} +{ +add.f16x2 r531, r499, r514; +} +{ +add.f16x2 r534, r502, r517; +} +{ +sub.f16x2 r537, r499, r514; +} +{ +sub.f16x2 r540, r502, r517; +} +and.b32 r708, r685, 64; +bfe.u32 r709, r685, 6, 1; +cvt.rn.f32.u32 f50, r709; +mul.f32 f51, f50, 0f3F490FDB; +cos.approx.f32 f31, f51; +sin.approx.f32 f52, f51; +neg.f32 f32, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r543, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r546, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r548, {high, high}; +} +{ +mul.f16x2 r550, r534, r548; +} +{ +neg.f16x2 r553, r550; +} +{ +fma.rn.f16x2 r555, r531, r546, r553; +} +{ +mul.f16x2 r559, r531, r548; +} +{ +fma.rn.f16x2 r562, r534, r546, r559; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r566, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r568, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r570, {low, high}; +} +{ +mul.f16x2 r571, r568, r570; +} +{ +mul.f16x2 r574, r543, r566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r577, {high, low}; +} +{ +fma.rn.f16x2 r579, r571, r577, r574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r583, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r585, {high, high}; +} +{ +mul.f16x2 r587, r528, r585; +} +{ +neg.f16x2 r590, r587; +} +{ +fma.rn.f16x2 r592, r525, r583, r590; +} +{ +mul.f16x2 r596, r525, r585; +} +{ +fma.rn.f16x2 r599, r528, r583, r596; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; 
+mov.b32 r603, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r605, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r605, r607; +} +{ +mul.f16x2 r611, r579, r603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r614, {high, low}; +} +{ +fma.rn.f16x2 r616, r608, r614, r611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r540, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r537, r620, r627; +} +{ +mul.f16x2 r633, r537, r622; +} +{ +fma.rn.f16x2 r636, r540, r620, r633; +} +and.b32 r710, r695, 504; +add.s32 r711, r689, r710; +barrier.sync 0; +and.b32 r712, r687, 2048; +add.s32 r713, r711, r712; +st.shared.u32 [r713], r519; +st.shared.u32 [r713+4], r522; +st.shared.u32 [r713+512], r555; +st.shared.u32 [r713+516], r562; +st.shared.u32 [r713+1024], r592; +st.shared.u32 [r713+1028], r599; +st.shared.u32 [r713+1536], r629; +st.shared.u32 [r713+1540], r636; +barrier.sync 0; +mad.lo.s32 r714, r708, -24, r713; +ld.shared.u32 r658, [r714]; +ld.shared.u32 r661, [r714+4]; +ld.shared.u32 r670, [r714+1024]; +ld.shared.u32 r673, [r714+1028]; +ld.shared.u32 r659, [r714+2048]; +ld.shared.u32 r662, [r714+2052]; +ld.shared.u32 r671, [r714+3072]; +ld.shared.u32 r674, [r714+3076]; +{ +add.f16x2 %0, r658, r659; +} +{ +add.f16x2 %1, r661, r662; +} +{ +sub.f16x2 %4, r658, r659; +} +{ +sub.f16x2 %5, r661, r662; +} +{ +add.f16x2 %2, r670, r671; +} +{ +add.f16x2 %3, r673, r674; +} +{ +sub.f16x2 %6, r670, r671; +} +{ +sub.f16x2 %7, r673, r674; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), 
"=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<824, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<53>; +.reg .b32 r<715>; +.reg .b64 rd<2>; +mov.u32 r681, %tid.y; +shl.b32 r682, r681, 11; +mov.u32 r683, %8; +add.s32 r684, r683, r682; +mov.u32 r685, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r686, r685, 127; +shl.b32 r687, r685, 4; +and.b32 r688, r687, -2048; +add.s32 r689, r684, r688; +cvt.rn.f32.u32 f41, r686; +mul.f32 f42, f41, 0f3C490FDB; +cos.approx.f32 f1, f42; +sin.approx.f32 f43, f42; +neg.f32 f2, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, 
high}; +} +mov.f32 f37, 0fBF800000; +mov.f32 f38, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r690, r687, 2032; +add.s32 r691, r689, r690; +st.shared.v4.f32 [r691], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r692, r686, -12, r691; +ld.shared.u32 r166, [r692]; +ld.shared.u32 r178, [r692+512]; +ld.shared.u32 r167, [r692+1024]; +ld.shared.u32 r179, [r692+1536]; +barrier.sync 0; +st.shared.v4.f32 [r691], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r692]; +ld.shared.u32 r181, [r692+512]; +ld.shared.u32 r170, [r692+1024]; 
+ld.shared.u32 r182, [r692+1536]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r693, r685, 124; +bfe.u32 r694, r685, 2, 5; +shl.b32 r695, r685, 2; +and.b32 r696, r695, 12; +add.s32 r697, r689, r696; +cvt.rn.f32.u32 f44, r694; +mul.f32 f45, f44, 0f3D490FDB; +cos.approx.f32 f11, f45; +sin.approx.f32 f46, f45; +neg.f32 f12, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ 
+mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +barrier.sync 0; +and.b32 r698, r687, 1984; +add.s32 r699, r697, r698; +st.shared.u32 [r699], r191; +st.shared.u32 [r699+16], r227; +st.shared.u32 [r699+32], r264; +st.shared.u32 [r699+48], r301; +barrier.sync 0; +mad.lo.s32 r700, r693, -12, r699; +ld.shared.u32 r330, [r700]; +ld.shared.u32 r342, [r700+512]; +ld.shared.u32 r331, [r700+1024]; +ld.shared.u32 r343, [r700+1536]; +barrier.sync 0; +st.shared.u32 [r699], r194; +st.shared.u32 [r699+16], r234; +st.shared.u32 [r699+32], r271; +st.shared.u32 [r699+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r700]; +ld.shared.u32 r345, [r700+512]; +ld.shared.u32 r334, [r700+1024]; +ld.shared.u32 r346, [r700+1536]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ 
+add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r350; +} +{ +add.f16x2 r370, r338, r353; +} +{ +sub.f16x2 r373, r335, r350; +} +{ +sub.f16x2 r376, r338, r353; +} +and.b32 r701, r685, 112; +bfe.u32 r702, r685, 4, 3; +and.b32 r703, r695, 60; +add.s32 r704, r689, r703; +cvt.rn.f32.u32 f47, r702; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f21, f48; +sin.approx.f32 f49, f48; +neg.f32 f22, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +neg.f16x2 r389, r386; +} +{ +fma.rn.f16x2 r391, r367, r382, r389; +} +{ +mul.f16x2 r395, r367, r384; +} +{ +fma.rn.f16x2 r398, r370, r382, r395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +neg.f16x2 r426, r423; +} +{ +fma.rn.f16x2 r428, r361, r419, r426; +} +{ +mul.f16x2 r432, r361, r421; +} +{ +fma.rn.f16x2 r435, r364, r419, r432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +neg.f16x2 r463, r460; +} +{ +fma.rn.f16x2 r465, r373, r456, r463; +} +{ +mul.f16x2 r469, r373, r458; +} +{ +fma.rn.f16x2 r472, r376, r456, r469; +} +barrier.sync 0; +and.b32 r705, r687, 1792; +add.s32 r706, r704, r705; +st.shared.u32 [r706], r355; +st.shared.u32 [r706+64], r391; +st.shared.u32 [r706+128], r428; +st.shared.u32 [r706+192], r465; +barrier.sync 0; +mad.lo.s32 r707, r701, -12, r706; +ld.shared.u32 r494, [r707]; +ld.shared.u32 r506, [r707+512]; +ld.shared.u32 r495, [r707+1024]; +ld.shared.u32 r507, [r707+1536]; +barrier.sync 0; +st.shared.u32 [r706], r358; +st.shared.u32 [r706+64], r398; +st.shared.u32 [r706+128], r435; +st.shared.u32 [r706+192], r472; +barrier.sync 0; +ld.shared.u32 r497, [r707]; +ld.shared.u32 r509, [r707+512]; +ld.shared.u32 r498, [r707+1024]; +ld.shared.u32 r510, [r707+1536]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r511; +} +{ +add.f16x2 r519, r493, r505; +} +{ +add.f16x2 r522, r496, r508; +} +{ +sub.f16x2 r525, r493, r505; +} +{ +sub.f16x2 r528, r496, r508; +} +{ +add.f16x2 r531, r499, r514; +} +{ +add.f16x2 r534, r502, r517; +} +{ +sub.f16x2 r537, r499, r514; +} +{ +sub.f16x2 r540, r502, r517; +} +and.b32 r708, r685, 64; +bfe.u32 r709, r685, 6, 1; +and.b32 r710, r695, 252; +add.s32 r711, r689, 
r710; +cvt.rn.f32.u32 f50, r709; +mul.f32 f51, f50, 0f3F490FDB; +cos.approx.f32 f31, f51; +sin.approx.f32 f52, f51; +neg.f32 f32, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r543, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r546, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r548, {high, high}; +} +{ +mul.f16x2 r550, r534, r548; +} +{ +neg.f16x2 r553, r550; +} +{ +fma.rn.f16x2 r555, r531, r546, r553; +} +{ +mul.f16x2 r559, r531, r548; +} +{ +fma.rn.f16x2 r562, r534, r546, r559; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r566, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r568, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r570, {low, high}; +} +{ +mul.f16x2 r571, r568, r570; +} +{ +mul.f16x2 r574, r543, r566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r577, {high, low}; +} +{ +fma.rn.f16x2 r579, r571, r577, r574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r583, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r585, {high, high}; +} +{ +mul.f16x2 r587, r528, r585; +} +{ +neg.f16x2 r590, r587; +} +{ +fma.rn.f16x2 r592, r525, r583, r590; +} +{ +mul.f16x2 r596, r525, r585; +} +{ +fma.rn.f16x2 r599, r528, r583, r596; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r603, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r605, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r605, r607; +} +{ +mul.f16x2 r611, r579, r603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r614, {high, low}; +} +{ +fma.rn.f16x2 r616, r608, r614, r611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r620, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r540, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r537, r620, r627; +} +{ +mul.f16x2 r633, r537, r622; +} +{ +fma.rn.f16x2 r636, r540, r620, r633; +} +barrier.sync 0; +and.b32 r712, r687, 1024; +add.s32 r713, r711, r712; +st.shared.u32 [r713], r519; +st.shared.u32 [r713+256], r555; +st.shared.u32 [r713+512], r592; +st.shared.u32 [r713+768], r629; +barrier.sync 0; +mad.lo.s32 r714, r708, -12, r713; +ld.shared.u32 r658, [r714]; +ld.shared.u32 r670, [r714+512]; +ld.shared.u32 r659, [r714+1024]; +ld.shared.u32 r671, [r714+1536]; +barrier.sync 0; +st.shared.u32 [r713], r522; +st.shared.u32 [r713+256], r562; +st.shared.u32 [r713+512], r599; +st.shared.u32 [r713+768], r636; +barrier.sync 0; +ld.shared.u32 r661, [r714]; +ld.shared.u32 r673, [r714+512]; +ld.shared.u32 r662, [r714+1024]; +ld.shared.u32 r674, [r714+1536]; +{ +add.f16x2 %0, r658, r659; +} +{ +add.f16x2 %1, r661, r662; +} +{ +sub.f16x2 %4, r658, r659; +} +{ +sub.f16x2 %5, r661, r662; +} +{ +add.f16x2 %2, r670, r671; +} +{ +add.f16x2 %3, r673, r674; +} +{ +sub.f16x2 %6, r670, r671; +} +{ +sub.f16x2 %7, r673, r674; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<825, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<73>; +.reg .b32 r<492>; +.reg .b64 rd<2>; +mov.u32 r429, %tid.y; +shl.b32 r430, 
r429, 12; +mov.u32 r431, %4; +add.s32 r432, r431, r430; +mov.u32 r433, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r434, r433, 255; +shl.b32 r435, r433, 4; +and.b32 r436, r435, -4096; +add.s32 r437, r432, r436; +cvt.rn.f32.u32 f49, r434; +mul.f32 f50, f49, 0f3C490FDB; +cos.approx.f32 f1, f50; +sin.approx.f32 f51, f50; +neg.f32 f2, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r438, r435, 4080; +add.s32 r439, r437, r438; +st.shared.v2.f32 [r439], {r1, r4}; +st.shared.v2.f32 [r439+8], {r25, r32}; +barrier.sync 0; +shl.b32 r440, r433, 3; +and.b32 r441, r440, 2040; +sub.s32 r442, r439, r441; +ld.shared.u32 r54, [r442]; +ld.shared.u32 r57, [r442+4]; +ld.shared.u32 r55, [r442+2048]; +ld.shared.u32 r58, [r442+2052]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r443, r433, 1, 7; +cvt.rn.f32.u32 f52, r443; +mul.f32 f53, f52, 0f3CC90FDB; +cos.approx.f32 f7, f53; +sin.approx.f32 f54, f53; +neg.f32 f8, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r444, r440, 8; +add.s32 r445, 
r437, r444; +barrier.sync 0; +and.b32 r446, r435, 4064; +add.s32 r447, r445, r446; +st.shared.u32 [r447], r53; +st.shared.u32 [r447+4], r56; +st.shared.u32 [r447+16], r77; +st.shared.u32 [r447+20], r84; +barrier.sync 0; +and.b32 r448, r440, 2032; +sub.s32 r449, r447, r448; +ld.shared.u32 r106, [r449]; +ld.shared.u32 r109, [r449+4]; +ld.shared.u32 r107, [r449+2048]; +ld.shared.u32 r110, [r449+2052]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r450, r433, 2, 6; +cvt.rn.f32.u32 f55, r450; +mul.f32 f56, f55, 0f3D490FDB; +cos.approx.f32 f13, f56; +sin.approx.f32 f57, f56; +neg.f32 f14, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r451, r440, 24; +add.s32 r452, r437, r451; +barrier.sync 0; +and.b32 r453, r435, 4032; +add.s32 r454, r452, r453; +st.shared.u32 [r454], r105; +st.shared.u32 [r454+4], r108; +st.shared.u32 [r454+32], r129; +st.shared.u32 [r454+36], r136; +barrier.sync 0; +and.b32 r455, r440, 2016; +sub.s32 r456, r454, r455; +ld.shared.u32 r158, [r456]; +ld.shared.u32 r161, [r456+4]; +ld.shared.u32 r159, [r456+2048]; +ld.shared.u32 r162, [r456+2052]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r457, r433, 3, 5; +cvt.rn.f32.u32 f58, r457; +mul.f32 f59, f58, 0f3DC90FDB; +cos.approx.f32 f19, f59; +sin.approx.f32 f60, f59; +neg.f32 f20, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, 
{low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +and.b32 r458, r440, 56; +add.s32 r459, r437, r458; +barrier.sync 0; +and.b32 r460, r435, 3968; +add.s32 r461, r459, r460; +st.shared.u32 [r461], r157; +st.shared.u32 [r461+4], r160; +st.shared.u32 [r461+64], r181; +st.shared.u32 [r461+68], r188; +barrier.sync 0; +and.b32 r462, r440, 1984; +sub.s32 r463, r461, r462; +ld.shared.u32 r210, [r463]; +ld.shared.u32 r213, [r463+4]; +ld.shared.u32 r211, [r463+2048]; +ld.shared.u32 r214, [r463+2052]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r464, r433, 4, 4; +cvt.rn.f32.u32 f61, r464; +mul.f32 f62, f61, 0f3E490FDB; +cos.approx.f32 f25, f62; +sin.approx.f32 f63, f62; +neg.f32 f26, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +and.b32 r465, r440, 120; +add.s32 r466, r437, r465; +barrier.sync 0; +and.b32 r467, r435, 3840; +add.s32 r468, r466, r467; +st.shared.u32 [r468], r209; +st.shared.u32 [r468+4], r212; +st.shared.u32 [r468+128], r233; +st.shared.u32 [r468+132], r240; +barrier.sync 0; +and.b32 r469, r440, 1920; +sub.s32 r470, r468, r469; +ld.shared.u32 r262, [r470]; +ld.shared.u32 r265, [r470+4]; +ld.shared.u32 r263, [r470+2048]; 
+ld.shared.u32 r266, [r470+2052]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r471, r433, 5, 3; +cvt.rn.f32.u32 f64, r471; +mul.f32 f65, f64, 0f3EC90FDB; +cos.approx.f32 f31, f65; +sin.approx.f32 f66, f65; +neg.f32 f32, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} +{ +mul.f16x2 r289, r267, r278; +} +{ +fma.rn.f16x2 r292, r270, r276, r289; +} +and.b32 r472, r440, 248; +add.s32 r473, r437, r472; +barrier.sync 0; +and.b32 r474, r435, 3584; +add.s32 r475, r473, r474; +st.shared.u32 [r475], r261; +st.shared.u32 [r475+4], r264; +st.shared.u32 [r475+256], r285; +st.shared.u32 [r475+260], r292; +barrier.sync 0; +and.b32 r476, r440, 1792; +sub.s32 r477, r475, r476; +ld.shared.u32 r314, [r477]; +ld.shared.u32 r317, [r477+4]; +ld.shared.u32 r315, [r477+2048]; +ld.shared.u32 r318, [r477+2052]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r478, r433, 6, 2; +cvt.rn.f32.u32 f67, r478; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f37, f68; +sin.approx.f32 f69, f68; +neg.f32 f38, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +neg.f16x2 r335, r332; +} +{ +fma.rn.f16x2 r337, r319, r328, r335; +} +{ +mul.f16x2 r341, r319, r330; +} +{ +fma.rn.f16x2 r344, r322, r328, r341; 
+} +and.b32 r479, r440, 504; +add.s32 r480, r437, r479; +barrier.sync 0; +and.b32 r481, r435, 3072; +add.s32 r482, r480, r481; +st.shared.u32 [r482], r313; +st.shared.u32 [r482+4], r316; +st.shared.u32 [r482+512], r337; +st.shared.u32 [r482+516], r344; +barrier.sync 0; +and.b32 r483, r440, 1536; +sub.s32 r484, r482, r483; +ld.shared.u32 r366, [r484]; +ld.shared.u32 r369, [r484+4]; +ld.shared.u32 r367, [r484+2048]; +ld.shared.u32 r370, [r484+2052]; +{ +add.f16x2 r365, r366, r367; +} +{ +add.f16x2 r368, r369, r370; +} +{ +sub.f16x2 r371, r366, r367; +} +{ +sub.f16x2 r374, r369, r370; +} +bfe.u32 r485, r433, 7, 1; +cvt.rn.f32.u32 f70, r485; +mul.f32 f71, f70, 0f3FC90FDB; +cos.approx.f32 f43, f71; +sin.approx.f32 f72, f71; +neg.f32 f44, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r377, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r380, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r382, {high, high}; +} +{ +mul.f16x2 r384, r374, r382; +} +{ +neg.f16x2 r387, r384; +} +{ +fma.rn.f16x2 r389, r371, r380, r387; +} +{ +mul.f16x2 r393, r371, r382; +} +{ +fma.rn.f16x2 r396, r374, r380, r393; +} +and.b32 r486, r440, 1016; +add.s32 r487, r437, r486; +barrier.sync 0; +and.b32 r488, r435, 2048; +add.s32 r489, r487, r488; +st.shared.u32 [r489], r365; +st.shared.u32 [r489+4], r368; +st.shared.u32 [r489+1024], r389; +st.shared.u32 [r489+1028], r396; +barrier.sync 0; +and.b32 r490, r440, 1024; +sub.s32 r491, r489, r490; +ld.shared.u32 r418, [r491]; +ld.shared.u32 r421, [r491+4]; +ld.shared.u32 r419, [r491+2048]; +ld.shared.u32 r422, [r491+2052]; +{ +add.f16x2 %0, r418, r419; +} +{ +add.f16x2 %1, r421, r422; +} +{ +sub.f16x2 %2, r418, r419; +} +{ +sub.f16x2 %3, r421, r422; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<826, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<73>; +.reg .b32 r<492>; +.reg .b64 rd<2>; +mov.u32 r429, %tid.y; +shl.b32 r430, r429, 11; +mov.u32 r431, %4; +add.s32 r432, r431, r430; +mov.u32 r433, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r434, r433, 255; +shl.b32 r435, r433, 3; +and.b32 r436, r435, -2048; +add.s32 r437, r432, r436; +cvt.rn.f32.u32 f49, r434; +mul.f32 f50, f49, 0f3C490FDB; +cos.approx.f32 f1, f50; +sin.approx.f32 f51, f50; +neg.f32 f2, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r438, r435, 2040; +add.s32 r439, r437, r438; +st.shared.v2.f32 [r439], {r1, r25}; +barrier.sync 0; +shl.b32 r440, r433, 2; +and.b32 r441, r440, 1020; +sub.s32 r442, r439, r441; +ld.shared.u32 r54, [r442]; +ld.shared.u32 r55, [r442+1024]; +barrier.sync 0; +st.shared.v2.f32 [r439], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r442]; +ld.shared.u32 r58, [r442+1024]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r443, r433, 1, 7; +and.b32 r444, r440, 4; +add.s32 r445, r437, r444; +cvt.rn.f32.u32 f52, r443; +mul.f32 f53, f52, 0f3CC90FDB; +cos.approx.f32 f7, f53; +sin.approx.f32 f54, f53; +neg.f32 f8, f54; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r446, r435, 2032; +add.s32 r447, r445, r446; +st.shared.u32 [r447], r53; +st.shared.u32 [r447+8], r77; +barrier.sync 0; +and.b32 r448, r440, 1016; +sub.s32 r449, r447, r448; +ld.shared.u32 r106, [r449]; +ld.shared.u32 r107, [r449+1024]; +barrier.sync 0; +st.shared.u32 [r447], r56; +st.shared.u32 [r447+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r449]; +ld.shared.u32 r110, [r449+1024]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r450, r433, 2, 6; +and.b32 r451, r440, 12; +add.s32 r452, r437, r451; +cvt.rn.f32.u32 f55, r450; +mul.f32 f56, f55, 0f3D490FDB; +cos.approx.f32 f13, f56; +sin.approx.f32 f57, f56; +neg.f32 f14, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r453, r435, 2016; +add.s32 r454, r452, r453; +st.shared.u32 [r454], r105; +st.shared.u32 [r454+16], r129; +barrier.sync 0; +and.b32 r455, r440, 1008; +sub.s32 r456, r454, r455; +ld.shared.u32 r158, [r456]; +ld.shared.u32 r159, [r456+1024]; +barrier.sync 0; +st.shared.u32 [r454], r108; +st.shared.u32 [r454+16], r136; 
+barrier.sync 0; +ld.shared.u32 r161, [r456]; +ld.shared.u32 r162, [r456+1024]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r457, r433, 3, 5; +and.b32 r458, r440, 28; +add.s32 r459, r437, r458; +cvt.rn.f32.u32 f58, r457; +mul.f32 f59, f58, 0f3DC90FDB; +cos.approx.f32 f19, f59; +sin.approx.f32 f60, f59; +neg.f32 f20, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +barrier.sync 0; +and.b32 r460, r435, 1984; +add.s32 r461, r459, r460; +st.shared.u32 [r461], r157; +st.shared.u32 [r461+32], r181; +barrier.sync 0; +and.b32 r462, r440, 992; +sub.s32 r463, r461, r462; +ld.shared.u32 r210, [r463]; +ld.shared.u32 r211, [r463+1024]; +barrier.sync 0; +st.shared.u32 [r461], r160; +st.shared.u32 [r461+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r463]; +ld.shared.u32 r214, [r463+1024]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r464, r433, 4, 4; +and.b32 r465, r440, 60; +add.s32 r466, r437, r465; +cvt.rn.f32.u32 f61, r464; +mul.f32 f62, f61, 0f3E490FDB; +cos.approx.f32 f25, f62; +sin.approx.f32 f63, f62; +neg.f32 f26, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, 
r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +barrier.sync 0; +and.b32 r467, r435, 1920; +add.s32 r468, r466, r467; +st.shared.u32 [r468], r209; +st.shared.u32 [r468+64], r233; +barrier.sync 0; +and.b32 r469, r440, 960; +sub.s32 r470, r468, r469; +ld.shared.u32 r262, [r470]; +ld.shared.u32 r263, [r470+1024]; +barrier.sync 0; +st.shared.u32 [r468], r212; +st.shared.u32 [r468+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r470]; +ld.shared.u32 r266, [r470+1024]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r471, r433, 5, 3; +and.b32 r472, r440, 124; +add.s32 r473, r437, r472; +cvt.rn.f32.u32 f64, r471; +mul.f32 f65, f64, 0f3EC90FDB; +cos.approx.f32 f31, f65; +sin.approx.f32 f66, f65; +neg.f32 f32, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +neg.f16x2 r283, r280; +} +{ +fma.rn.f16x2 r285, r267, r276, r283; +} +{ +mul.f16x2 r289, r267, r278; +} +{ +fma.rn.f16x2 r292, r270, r276, r289; +} +barrier.sync 0; +and.b32 r474, r435, 1792; +add.s32 r475, r473, r474; +st.shared.u32 [r475], r261; +st.shared.u32 [r475+128], r285; +barrier.sync 0; +and.b32 r476, r440, 896; +sub.s32 r477, r475, r476; +ld.shared.u32 r314, [r477]; +ld.shared.u32 r315, [r477+1024]; +barrier.sync 0; +st.shared.u32 [r475], r264; +st.shared.u32 [r475+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r477]; +ld.shared.u32 r318, [r477+1024]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r478, r433, 6, 2; +and.b32 r479, r440, 252; +add.s32 
r480, r437, r479; +cvt.rn.f32.u32 f67, r478; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f37, f68; +sin.approx.f32 f69, f68; +neg.f32 f38, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +neg.f16x2 r335, r332; +} +{ +fma.rn.f16x2 r337, r319, r328, r335; +} +{ +mul.f16x2 r341, r319, r330; +} +{ +fma.rn.f16x2 r344, r322, r328, r341; +} +barrier.sync 0; +and.b32 r481, r435, 1536; +add.s32 r482, r480, r481; +st.shared.u32 [r482], r313; +st.shared.u32 [r482+256], r337; +barrier.sync 0; +and.b32 r483, r440, 768; +sub.s32 r484, r482, r483; +ld.shared.u32 r366, [r484]; +ld.shared.u32 r367, [r484+1024]; +barrier.sync 0; +st.shared.u32 [r482], r316; +st.shared.u32 [r482+256], r344; +barrier.sync 0; +ld.shared.u32 r369, [r484]; +ld.shared.u32 r370, [r484+1024]; +{ +add.f16x2 r365, r366, r367; +} +{ +add.f16x2 r368, r369, r370; +} +{ +sub.f16x2 r371, r366, r367; +} +{ +sub.f16x2 r374, r369, r370; +} +bfe.u32 r485, r433, 7, 1; +and.b32 r486, r440, 508; +add.s32 r487, r437, r486; +cvt.rn.f32.u32 f70, r485; +mul.f32 f71, f70, 0f3FC90FDB; +cos.approx.f32 f43, f71; +sin.approx.f32 f72, f71; +neg.f32 f44, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r377, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r380, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r382, {high, high}; +} +{ +mul.f16x2 r384, r374, r382; +} +{ +neg.f16x2 r387, r384; +} +{ +fma.rn.f16x2 r389, r371, r380, r387; +} +{ +mul.f16x2 r393, r371, r382; +} +{ +fma.rn.f16x2 r396, r374, r380, r393; +} +barrier.sync 0; +and.b32 r488, r435, 1024; +add.s32 r489, r487, r488; +st.shared.u32 [r489], r365; +st.shared.u32 [r489+512], r389; +barrier.sync 0; 
+and.b32 r490, r440, 512; +sub.s32 r491, r489, r490; +ld.shared.u32 r418, [r491]; +ld.shared.u32 r419, [r491+1024]; +barrier.sync 0; +st.shared.u32 [r489], r368; +st.shared.u32 [r489+512], r396; +barrier.sync 0; +ld.shared.u32 r421, [r491]; +ld.shared.u32 r422, [r491+1024]; +{ +add.f16x2 %0, r418, r419; +} +{ +add.f16x2 %1, r421, r422; +} +{ +sub.f16x2 %2, r418, r419; +} +{ +sub.f16x2 %3, r421, r422; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..25ad66b2cbc88 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp16_inv.hpp.inc @@ -0,0 +1,22462 @@ +#ifndef CUFFTDX_FFT_512_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_512_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1019, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<127>; +.reg .b32 r<1133>; +.reg .b64 rd<2>; +mov.u32 r1113, %tid.y; +shl.b32 r1114, r1113, 12; +mov.u32 r1115, %16; +add.s32 r1116, r1115, r1114; +mov.u32 r1117, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ 
+sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} 
+{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1118, r1117, 63; +shl.b32 r1119, r1117, 6; +and.b32 r1120, r1119, -4096; +add.s32 r1121, r1116, r1120; +cvt.rn.f32.u32 f121, r1118; +mul.f32 f122, f121, 0f3C490FDB; +cos.approx.f32 f29, f122; +sin.approx.f32 f123, f122; +neg.f32 f30, f123; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ 
+mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 
r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1122, r1119, 4032; +add.s32 r1123, r1121, r1122; +st.shared.v4.f32 [r1123], {r149, r152, r207, r216}; 
+st.shared.v4.f32 [r1123+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1123+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1123+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1124, r1118, -56, r1123; +ld.shared.u32 r460, [r1124]; +ld.shared.u32 r463, [r1124+4]; +ld.shared.u32 r510, [r1124+512]; +ld.shared.u32 r513, [r1124+516]; +ld.shared.u32 r472, [r1124+1024]; +ld.shared.u32 r475, [r1124+1028]; +ld.shared.u32 r522, [r1124+1536]; +ld.shared.u32 r525, [r1124+1540]; +ld.shared.u32 r461, [r1124+2048]; +ld.shared.u32 r464, [r1124+2052]; +ld.shared.u32 r511, [r1124+2560]; +ld.shared.u32 r514, [r1124+2564]; +ld.shared.u32 r473, [r1124+3072]; +ld.shared.u32 r476, [r1124+3076]; +ld.shared.u32 r523, [r1124+3584]; +ld.shared.u32 r526, [r1124+3588]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1125, r1117, 56; +bfe.u32 r1126, r1117, 3, 3; +cvt.rn.f32.u32 f124, r1126; +mul.f32 f125, f124, 0f3DC90FDB; +cos.approx.f32 f75, f125; +sin.approx.f32 f126, f125; +neg.f32 f76, f126; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, 
r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ 
+mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ 
+mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1127, r1117, 3; +and.b32 r1128, r1127, 56; +add.s32 r1129, r1121, r1128; +barrier.sync 0; +and.b32 r1130, r1119, 3584; +add.s32 r1131, r1129, r1130; +st.shared.u32 [r1131], r607; +st.shared.u32 [r1131+4], r610; +st.shared.u32 [r1131+64], r665; +st.shared.u32 [r1131+68], r674; +st.shared.u32 [r1131+128], r702; +st.shared.u32 [r1131+132], r711; +st.shared.u32 [r1131+192], r739; +st.shared.u32 [r1131+196], r748; +st.shared.u32 [r1131+256], r776; +st.shared.u32 [r1131+260], r785; +st.shared.u32 [r1131+320], r813; +st.shared.u32 [r1131+324], r822; +st.shared.u32 [r1131+384], r850; +st.shared.u32 [r1131+388], r859; +st.shared.u32 [r1131+448], r887; +st.shared.u32 [r1131+452], r896; +barrier.sync 0; +mad.lo.s32 r1132, r1125, -56, r1131; +ld.shared.u32 r918, [r1132]; +ld.shared.u32 r921, [r1132+4]; +ld.shared.u32 r968, [r1132+512]; +ld.shared.u32 r971, [r1132+516]; +ld.shared.u32 r930, [r1132+1024]; +ld.shared.u32 r933, [r1132+1028]; +ld.shared.u32 r980, [r1132+1536]; 
+ld.shared.u32 r983, [r1132+1540]; +ld.shared.u32 r919, [r1132+2048]; +ld.shared.u32 r922, [r1132+2052]; +ld.shared.u32 r969, [r1132+2560]; +ld.shared.u32 r972, [r1132+2564]; +ld.shared.u32 r931, [r1132+3072]; +ld.shared.u32 r934, [r1132+3076]; +ld.shared.u32 r981, [r1132+3584]; +ld.shared.u32 r984, [r1132+3588]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 
r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 %0, r943, r993; +} +{ +add.f16x2 %1, r946, r996; +} +{ +sub.f16x2 %8, r943, r993; +} +{ +sub.f16x2 %9, r946, r996; +} +{ +add.f16x2 %2, r955, r1037; +} +{ +add.f16x2 %3, r958, r1043; +} +{ +sub.f16x2 %10, r955, r1037; +} +{ +sub.f16x2 %11, r958, r1043; +} +{ +add.f16x2 %4, r949, r1047; +} +{ +add.f16x2 %5, r952, r999; +} +{ +sub.f16x2 %12, r949, r1047; +} +{ +sub.f16x2 %13, r952, r999; +} +{ +add.f16x2 %6, r961, r1055; +} +{ +add.f16x2 %7, r964, r1061; +} +{ +sub.f16x2 %14, r961, r1055; +} +{ +sub.f16x2 %15, r964, r1061; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ 
__device__ void cufftdx_private_function<1020, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<127>; +.reg .b32 r<1133>; +.reg .b64 rd<2>; +mov.u32 r1113, %tid.y; +shl.b32 r1114, r1113, 11; +mov.u32 r1115, %16; +add.s32 r1116, r1115, r1114; +mov.u32 r1117, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f104, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r102, {low, high}; +} +mov.f32 f90, 0f3F800000; +mov.f32 f102, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r106, {low, high}; +} +mov.f32 f89, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, 
r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1118, r1117, 63; +shl.b32 r1119, r1117, 5; +and.b32 r1120, r1119, -2048; +add.s32 r1121, r1116, r1120; +cvt.rn.f32.u32 f121, r1118; +mul.f32 f122, f121, 0f3C490FDB; +cos.approx.f32 f29, f122; +sin.approx.f32 f123, f122; +neg.f32 f30, f123; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, 
{high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; 
+} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r409, 
{low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1122, r1119, 2016; +add.s32 r1123, r1121, r1122; +st.shared.v4.f32 [r1123], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1123+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r1124, r1118, -28, r1123; +ld.shared.u32 r460, [r1124]; +ld.shared.u32 r510, [r1124+256]; +ld.shared.u32 r472, [r1124+512]; +ld.shared.u32 r522, [r1124+768]; +ld.shared.u32 r461, [r1124+1024]; +ld.shared.u32 r511, [r1124+1280]; +ld.shared.u32 r473, [r1124+1536]; +ld.shared.u32 r523, [r1124+1792]; +barrier.sync 0; +st.shared.v4.f32 [r1123], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1123+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1124]; +ld.shared.u32 r513, [r1124+256]; +ld.shared.u32 r475, [r1124+512]; +ld.shared.u32 r525, [r1124+768]; +ld.shared.u32 r464, [r1124+1024]; +ld.shared.u32 r514, [r1124+1280]; +ld.shared.u32 r476, [r1124+1536]; +ld.shared.u32 r526, [r1124+1792]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, 
r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, 
r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1125, r1117, 56; +bfe.u32 r1126, r1117, 3, 3; +shl.b32 r1127, r1117, 2; +and.b32 r1128, r1127, 28; +add.s32 r1129, r1121, r1128; +cvt.rn.f32.u32 f124, r1126; +mul.f32 f125, f124, 0f3DC90FDB; +cos.approx.f32 f75, f125; +sin.approx.f32 f126, f125; +neg.f32 f76, f126; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ 
+fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f89; +cvt.rn.f16.f32 high, f90; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1130, r1119, 1792; +add.s32 r1131, r1129, r1130; +st.shared.u32 [r1131], r607; +st.shared.u32 [r1131+32], r665; +st.shared.u32 [r1131+64], r702; +st.shared.u32 
[r1131+96], r739; +st.shared.u32 [r1131+128], r776; +st.shared.u32 [r1131+160], r813; +st.shared.u32 [r1131+192], r850; +st.shared.u32 [r1131+224], r887; +barrier.sync 0; +mad.lo.s32 r1132, r1125, -28, r1131; +ld.shared.u32 r918, [r1132]; +ld.shared.u32 r968, [r1132+256]; +ld.shared.u32 r930, [r1132+512]; +ld.shared.u32 r980, [r1132+768]; +ld.shared.u32 r919, [r1132+1024]; +ld.shared.u32 r969, [r1132+1280]; +ld.shared.u32 r931, [r1132+1536]; +ld.shared.u32 r981, [r1132+1792]; +barrier.sync 0; +st.shared.u32 [r1131], r610; +st.shared.u32 [r1131+32], r674; +st.shared.u32 [r1131+64], r711; +st.shared.u32 [r1131+96], r748; +st.shared.u32 [r1131+128], r785; +st.shared.u32 [r1131+160], r822; +st.shared.u32 [r1131+192], r859; +st.shared.u32 [r1131+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1132]; +ld.shared.u32 r971, [r1132+256]; +ld.shared.u32 r933, [r1132+512]; +ld.shared.u32 r983, [r1132+768]; +ld.shared.u32 r922, [r1132+1024]; +ld.shared.u32 r972, [r1132+1280]; +ld.shared.u32 r934, [r1132+1536]; +ld.shared.u32 r984, [r1132+1792]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 
r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 %0, r943, r993; +} +{ +add.f16x2 %1, r946, r996; +} +{ +sub.f16x2 %8, r943, r993; +} +{ +sub.f16x2 %9, r946, r996; +} +{ +add.f16x2 %2, r955, r1037; +} +{ +add.f16x2 %3, r958, r1043; +} +{ +sub.f16x2 %10, r955, r1037; +} +{ +sub.f16x2 %11, r958, r1043; +} +{ +add.f16x2 %4, r949, r1047; +} +{ +add.f16x2 %5, r952, r999; +} +{ +sub.f16x2 %12, r949, r1047; +} +{ +sub.f16x2 %13, r952, r999; +} +{ +add.f16x2 %6, r961, r1055; +} +{ +add.f16x2 %7, r964, r1061; +} +{ +sub.f16x2 %14, r961, r1055; +} +{ +sub.f16x2 %15, r964, r1061; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), 
"=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1022, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<660>; +.reg .b32 r<4171>; +.reg .b64 rd<3>; +mov.u32 r4095, %tid.y; +shl.b32 r4096, r4095, 12; +mov.u32 r4097, %64; +add.s32 r4098, r4097, r4096; +mov.u32 r4099, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 
r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f618, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r102, {low, high}; +} +mov.f32 f616, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, 
r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ 
+add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f614, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r393, {low, high}; +} +mov.f32 f622, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r398, {low, high}; +} +mov.f32 f612, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r404, {low, high}; +} +mov.f32 f620, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, 
r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} 
+{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, 
r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r918, {low, high}; +} 
+{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} 
+{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 
high, f622; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; 
+cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, 
r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, 
r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4100, r4099, 15; +shl.b32 r4101, r4099, 8; +and.b32 r4102, r4101, -4096; +add.s32 r4103, r4098, r4102; +cvt.rn.f32.u32 f655, r4100; +mul.f32 f656, f655, 0f3C490FDB; +cos.approx.f32 f357, f656; +sin.approx.f32 f657, f656; +neg.f32 f358, f657; +mov.f32 f659, 0fBF800000; +mov.f32 f658, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 
r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ 
+fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ 
+fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2406, 
{low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} 
+{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; 
+} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} 
+{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r4104, r4101, 3840; +add.s32 r4105, r4103, r4104; +st.shared.v4.f32 [r4105], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r4105+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r4105+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r4105+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r4105+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r4105+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r4105+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r4105+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r4105+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r4105+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r4105+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r4105+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r4105+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r4105+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r4105+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r4105+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r4106, r4100, -248, r4105; +ld.shared.u32 r2864, [r4106]; +ld.shared.u32 r2867, [r4106+4]; +ld.shared.u32 r3480, [r4106+128]; +ld.shared.u32 r3483, [r4106+132]; +ld.shared.u32 r3060, [r4106+256]; +ld.shared.u32 r3063, [r4106+260]; +ld.shared.u32 r3676, [r4106+384]; +ld.shared.u32 r3679, [r4106+388]; +ld.shared.u32 r2914, [r4106+512]; +ld.shared.u32 r2917, [r4106+516]; +ld.shared.u32 r3530, [r4106+640]; +ld.shared.u32 r3533, [r4106+644]; +ld.shared.u32 r3110, [r4106+768]; +ld.shared.u32 r3113, [r4106+772]; +ld.shared.u32 r3726, [r4106+896]; +ld.shared.u32 r3729, [r4106+900]; +ld.shared.u32 r2876, [r4106+1024]; +ld.shared.u32 r2879, [r4106+1028]; +ld.shared.u32 r3492, [r4106+1152]; +ld.shared.u32 r3495, [r4106+1156]; +ld.shared.u32 r3072, [r4106+1280]; 
+ld.shared.u32 r3075, [r4106+1284]; +ld.shared.u32 r3688, [r4106+1408]; +ld.shared.u32 r3691, [r4106+1412]; +ld.shared.u32 r2926, [r4106+1536]; +ld.shared.u32 r2929, [r4106+1540]; +ld.shared.u32 r3542, [r4106+1664]; +ld.shared.u32 r3545, [r4106+1668]; +ld.shared.u32 r3122, [r4106+1792]; +ld.shared.u32 r3125, [r4106+1796]; +ld.shared.u32 r3738, [r4106+1920]; +ld.shared.u32 r3741, [r4106+1924]; +ld.shared.u32 r2865, [r4106+2048]; +ld.shared.u32 r2868, [r4106+2052]; +ld.shared.u32 r3481, [r4106+2176]; +ld.shared.u32 r3484, [r4106+2180]; +ld.shared.u32 r3061, [r4106+2304]; +ld.shared.u32 r3064, [r4106+2308]; +ld.shared.u32 r3677, [r4106+2432]; +ld.shared.u32 r3680, [r4106+2436]; +ld.shared.u32 r2915, [r4106+2560]; +ld.shared.u32 r2918, [r4106+2564]; +ld.shared.u32 r3531, [r4106+2688]; +ld.shared.u32 r3534, [r4106+2692]; +ld.shared.u32 r3111, [r4106+2816]; +ld.shared.u32 r3114, [r4106+2820]; +ld.shared.u32 r3727, [r4106+2944]; +ld.shared.u32 r3730, [r4106+2948]; +ld.shared.u32 r2877, [r4106+3072]; +ld.shared.u32 r2880, [r4106+3076]; +ld.shared.u32 r3493, [r4106+3200]; +ld.shared.u32 r3496, [r4106+3204]; +ld.shared.u32 r3073, [r4106+3328]; +ld.shared.u32 r3076, [r4106+3332]; +ld.shared.u32 r3689, [r4106+3456]; +ld.shared.u32 r3692, [r4106+3460]; +ld.shared.u32 r2927, [r4106+3584]; +ld.shared.u32 r2930, [r4106+3588]; +ld.shared.u32 r3543, [r4106+3712]; +ld.shared.u32 r3546, [r4106+3716]; +ld.shared.u32 r3123, [r4106+3840]; +ld.shared.u32 r3126, [r4106+3844]; +ld.shared.u32 r3739, [r4106+3968]; +ld.shared.u32 r3742, [r4106+3972]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ 
+sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, 
r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 
high, f618; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 %0, r3011, r3207; +} +{ +add.f16x2 %1, r3014, r3210; +} +{ +sub.f16x2 %32, r3011, r3207; +} +{ +sub.f16x2 %33, r3014, r3210; +} +{ +add.f16x2 %4, r3023, r3291; +} +{ +add.f16x2 %5, r3026, r3297; +} +{ +sub.f16x2 %36, r3023, r3291; +} +{ +sub.f16x2 %37, r3026, r3297; +} +{ +add.f16x2 %8, r3035, r3307; +} +{ +add.f16x2 %9, 
r3038, r3313; +} +{ +sub.f16x2 %40, r3035, r3307; +} +{ +sub.f16x2 %41, r3038, r3313; +} +{ +add.f16x2 %12, r3047, r3323; +} +{ +add.f16x2 %13, r3050, r3329; +} +{ +sub.f16x2 %44, r3047, r3323; +} +{ +sub.f16x2 %45, r3050, r3329; +} +{ +add.f16x2 %16, r3017, r3333; +} +{ +add.f16x2 %17, r3020, r3213; +} +{ +sub.f16x2 %48, r3017, r3333; +} +{ +sub.f16x2 %49, r3020, r3213; +} +{ +add.f16x2 %20, r3029, r3341; +} +{ +add.f16x2 %21, r3032, r3347; +} +{ +sub.f16x2 %52, r3029, r3341; +} +{ +sub.f16x2 %53, r3032, r3347; +} +{ +add.f16x2 %24, r3041, r3357; +} +{ +add.f16x2 %25, r3044, r3363; +} +{ +sub.f16x2 %56, r3041, r3357; +} +{ +sub.f16x2 %57, r3044, r3363; +} +{ +add.f16x2 %28, r3053, r3373; +} +{ +add.f16x2 %29, r3056, r3379; +} +{ +sub.f16x2 %60, r3053, r3373; +} +{ +sub.f16x2 %61, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, 
r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ 
+sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, 
r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, 
r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 %2, r3627, r3823; +} +{ +add.f16x2 %3, r3630, r3826; +} +{ +sub.f16x2 %34, r3627, r3823; +} +{ +sub.f16x2 %35, r3630, r3826; +} +{ +add.f16x2 %6, r3639, r3907; +} +{ +add.f16x2 %7, r3642, r3913; +} +{ +sub.f16x2 %38, r3639, r3907; +} +{ +sub.f16x2 %39, r3642, r3913; +} +{ +add.f16x2 %10, r3651, r3923; +} +{ +add.f16x2 %11, r3654, r3929; +} +{ +sub.f16x2 %42, r3651, r3923; +} +{ +sub.f16x2 %43, r3654, r3929; +} +{ +add.f16x2 %14, r3663, r3939; +} +{ +add.f16x2 %15, r3666, r3945; +} +{ +sub.f16x2 %46, r3663, r3939; +} +{ +sub.f16x2 %47, r3666, r3945; +} +{ +add.f16x2 %18, r3633, r3949; +} +{ +add.f16x2 %19, r3636, r3829; +} +{ +sub.f16x2 %50, r3633, r3949; +} +{ +sub.f16x2 %51, r3636, r3829; +} +{ +add.f16x2 %22, r3645, r3957; +} +{ +add.f16x2 %23, r3648, r3963; +} +{ +sub.f16x2 %54, r3645, r3957; +} +{ +sub.f16x2 %55, r3648, r3963; +} +{ +add.f16x2 %26, r3657, r3973; +} +{ +add.f16x2 %27, r3660, r3979; +} +{ +sub.f16x2 %58, r3657, r3973; +} +{ +sub.f16x2 %59, r3660, r3979; +} +{ +add.f16x2 %30, r3669, r3989; +} +{ +add.f16x2 %31, r3672, r3995; 
+} +{ +sub.f16x2 %62, r3669, r3989; +} +{ +sub.f16x2 %63, r3672, r3995; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), 
"=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), 
"r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1021, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2465>; +.reg .b64 rd<2>; +mov.u32 r2445, %tid.y; +shl.b32 r2446, r2445, 11; +mov.u32 r2447, %32; +add.s32 r2448, r2447, r2446; +mov.u32 r2449, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; 
+} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ 
+sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2450, r2449, 31; +shl.b32 r2451, r2449, 6; +and.b32 r2452, r2451, -2048; +add.s32 r2453, r2448, r2452; +cvt.rn.f32.u32 f301, r2450; +mul.f32 f302, f301, 0f3C490FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 
r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, 
high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2454, r2451, 1984; +add.s32 r2455, 
r2453, r2454; +st.shared.v4.f32 [r2455], {r521, r627, r664, r701}; +st.shared.v4.f32 [r2455+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r2455+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r2455+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r2456, r2450, -60, r2455; +ld.shared.u32 r1176, [r2456]; +ld.shared.u32 r1372, [r2456+128]; +ld.shared.u32 r1226, [r2456+256]; +ld.shared.u32 r1422, [r2456+384]; +ld.shared.u32 r1188, [r2456+512]; +ld.shared.u32 r1384, [r2456+640]; +ld.shared.u32 r1238, [r2456+768]; +ld.shared.u32 r1434, [r2456+896]; +ld.shared.u32 r1177, [r2456+1024]; +ld.shared.u32 r1373, [r2456+1152]; +ld.shared.u32 r1227, [r2456+1280]; +ld.shared.u32 r1423, [r2456+1408]; +ld.shared.u32 r1189, [r2456+1536]; +ld.shared.u32 r1385, [r2456+1664]; +ld.shared.u32 r1239, [r2456+1792]; +ld.shared.u32 r1435, [r2456+1920]; +barrier.sync 0; +st.shared.v4.f32 [r2455], {r524, r636, r673, r710}; +st.shared.v4.f32 [r2455+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r2455+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r2455+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r2456]; +ld.shared.u32 r1375, [r2456+128]; +ld.shared.u32 r1229, [r2456+256]; +ld.shared.u32 r1425, [r2456+384]; +ld.shared.u32 r1191, [r2456+512]; +ld.shared.u32 r1387, [r2456+640]; +ld.shared.u32 r1241, [r2456+768]; +ld.shared.u32 r1437, [r2456+896]; +ld.shared.u32 r1180, [r2456+1024]; +ld.shared.u32 r1376, [r2456+1152]; +ld.shared.u32 r1230, [r2456+1280]; +ld.shared.u32 r1426, [r2456+1408]; +ld.shared.u32 r1192, [r2456+1536]; +ld.shared.u32 r1388, [r2456+1664]; +ld.shared.u32 r1242, [r2456+1792]; +ld.shared.u32 r1438, [r2456+1920]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 
r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ 
+add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ 
+add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2457, r2449, 16; +bfe.u32 r2458, r2449, 4, 1; +shl.b32 r2459, r2449, 2; +and.b32 r2460, r2459, 60; +add.s32 r2461, r2453, r2460; +cvt.rn.f32.u32 f304, r2458; +mul.f32 f305, f304, 0f3E490FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ 
+fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ 
+fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} 
+barrier.sync 0; +and.b32 r2462, r2451, 1024; +add.s32 r2463, r2461, r2462; +st.shared.u32 [r2463], r1695; +st.shared.u32 [r2463+64], r1801; +st.shared.u32 [r2463+128], r1838; +st.shared.u32 [r2463+192], r1875; +st.shared.u32 [r2463+256], r1912; +st.shared.u32 [r2463+320], r1949; +st.shared.u32 [r2463+384], r1986; +st.shared.u32 [r2463+448], r2023; +st.shared.u32 [r2463+512], r2060; +st.shared.u32 [r2463+576], r2097; +st.shared.u32 [r2463+640], r2134; +st.shared.u32 [r2463+704], r2171; +st.shared.u32 [r2463+768], r2208; +st.shared.u32 [r2463+832], r2245; +st.shared.u32 [r2463+896], r2282; +st.shared.u32 [r2463+960], r2319; +barrier.sync 0; +mad.lo.s32 r2464, r2457, -60, r2463; +ld.shared.u32 r2350, [r2464]; +ld.shared.u32 r2362, [r2464+128]; +ld.shared.u32 r2374, [r2464+256]; +ld.shared.u32 r2386, [r2464+384]; +ld.shared.u32 r2398, [r2464+512]; +ld.shared.u32 r2410, [r2464+640]; +ld.shared.u32 r2422, [r2464+768]; +ld.shared.u32 r2434, [r2464+896]; +ld.shared.u32 r2351, [r2464+1024]; +ld.shared.u32 r2363, [r2464+1152]; +ld.shared.u32 r2375, [r2464+1280]; +ld.shared.u32 r2387, [r2464+1408]; +ld.shared.u32 r2399, [r2464+1536]; +ld.shared.u32 r2411, [r2464+1664]; +ld.shared.u32 r2423, [r2464+1792]; +ld.shared.u32 r2435, [r2464+1920]; +barrier.sync 0; +st.shared.u32 [r2463], r1698; +st.shared.u32 [r2463+64], r1810; +st.shared.u32 [r2463+128], r1847; +st.shared.u32 [r2463+192], r1884; +st.shared.u32 [r2463+256], r1921; +st.shared.u32 [r2463+320], r1958; +st.shared.u32 [r2463+384], r1995; +st.shared.u32 [r2463+448], r2032; +st.shared.u32 [r2463+512], r2069; +st.shared.u32 [r2463+576], r2106; +st.shared.u32 [r2463+640], r2143; +st.shared.u32 [r2463+704], r2180; +st.shared.u32 [r2463+768], r2217; +st.shared.u32 [r2463+832], r2254; +st.shared.u32 [r2463+896], r2291; +st.shared.u32 [r2463+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r2464]; +ld.shared.u32 r2365, [r2464+128]; +ld.shared.u32 r2377, [r2464+256]; +ld.shared.u32 r2389, [r2464+384]; +ld.shared.u32 r2401, 
[r2464+512]; +ld.shared.u32 r2413, [r2464+640]; +ld.shared.u32 r2425, [r2464+768]; +ld.shared.u32 r2437, [r2464+896]; +ld.shared.u32 r2354, [r2464+1024]; +ld.shared.u32 r2366, [r2464+1152]; +ld.shared.u32 r2378, [r2464+1280]; +ld.shared.u32 r2390, [r2464+1408]; +ld.shared.u32 r2402, [r2464+1536]; +ld.shared.u32 r2414, [r2464+1664]; +ld.shared.u32 r2426, [r2464+1792]; +ld.shared.u32 r2438, [r2464+1920]; +{ +add.f16x2 %0, r2350, r2351; +} +{ +add.f16x2 %1, r2353, r2354; +} +{ +sub.f16x2 %16, r2350, r2351; +} +{ +sub.f16x2 %17, r2353, r2354; +} +{ +add.f16x2 %2, r2362, r2363; +} +{ +add.f16x2 %3, r2365, r2366; +} +{ +sub.f16x2 %18, r2362, r2363; +} +{ +sub.f16x2 %19, r2365, r2366; +} +{ +add.f16x2 %4, r2374, r2375; +} +{ +add.f16x2 %5, r2377, r2378; +} +{ +sub.f16x2 %20, r2374, r2375; +} +{ +sub.f16x2 %21, r2377, r2378; +} +{ +add.f16x2 %6, r2386, r2387; +} +{ +add.f16x2 %7, r2389, r2390; +} +{ +sub.f16x2 %22, r2386, r2387; +} +{ +sub.f16x2 %23, r2389, r2390; +} +{ +add.f16x2 %8, r2398, r2399; +} +{ +add.f16x2 %9, r2401, r2402; +} +{ +sub.f16x2 %24, r2398, r2399; +} +{ +sub.f16x2 %25, r2401, r2402; +} +{ +add.f16x2 %10, r2410, r2411; +} +{ +add.f16x2 %11, r2413, r2414; +} +{ +sub.f16x2 %26, r2410, r2411; +} +{ +sub.f16x2 %27, r2413, r2414; +} +{ +add.f16x2 %12, r2422, r2423; +} +{ +add.f16x2 %13, r2425, r2426; +} +{ +sub.f16x2 %28, r2422, r2423; +} +{ +sub.f16x2 %29, r2425, r2426; +} +{ +add.f16x2 %14, r2434, r2435; +} +{ +add.f16x2 %15, r2437, r2438; +} +{ +sub.f16x2 %30, r2434, r2435; +} +{ +sub.f16x2 %31, r2437, r2438; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), 
"=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1024, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<307>; +.reg .b32 r<2465>; +.reg .b64 rd<2>; +mov.u32 r2445, %tid.y; +shl.b32 r2446, r2445, 12; +mov.u32 r2447, %32; +add.s32 r2448, r2447, r2446; +mov.u32 r2449, %tid.x; +{ 
+add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f230, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r102, {low, high}; +} +mov.f32 f298, 0f3F800000; +mov.f32 f228, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r106, {low, high}; +} +mov.f32 f297, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, 
r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; 
+mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f226, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r393, {low, high}; +} +mov.f32 f234, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r398, {low, high}; +} +mov.f32 f224, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r404, {low, high}; +} +mov.f32 f232, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ 
+add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r2450, r2449, 31; +shl.b32 r2451, r2449, 7; +and.b32 r2452, r2451, -4096; +add.s32 r2453, r2448, r2452; +cvt.rn.f32.u32 f301, r2450; +mul.f32 f302, f301, 0f3C490FDB; +cos.approx.f32 f117, f302; +sin.approx.f32 f303, f302; +neg.f32 f118, f303; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, 
r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} 
+{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ 
+mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 
r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 
r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r2454, r2451, 3968; +add.s32 r2455, r2453, r2454; +st.shared.v4.f32 [r2455], {r521, r524, r627, r636}; +st.shared.v4.f32 [r2455+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r2455+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r2455+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r2455+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r2455+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r2455+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r2455+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r2456, r2450, -120, r2455; +ld.shared.u32 r1176, [r2456]; +ld.shared.u32 r1179, [r2456+4]; +ld.shared.u32 r1372, [r2456+256]; 
+ld.shared.u32 r1375, [r2456+260]; +ld.shared.u32 r1226, [r2456+512]; +ld.shared.u32 r1229, [r2456+516]; +ld.shared.u32 r1422, [r2456+768]; +ld.shared.u32 r1425, [r2456+772]; +ld.shared.u32 r1188, [r2456+1024]; +ld.shared.u32 r1191, [r2456+1028]; +ld.shared.u32 r1384, [r2456+1280]; +ld.shared.u32 r1387, [r2456+1284]; +ld.shared.u32 r1238, [r2456+1536]; +ld.shared.u32 r1241, [r2456+1540]; +ld.shared.u32 r1434, [r2456+1792]; +ld.shared.u32 r1437, [r2456+1796]; +ld.shared.u32 r1177, [r2456+2048]; +ld.shared.u32 r1180, [r2456+2052]; +ld.shared.u32 r1373, [r2456+2304]; +ld.shared.u32 r1376, [r2456+2308]; +ld.shared.u32 r1227, [r2456+2560]; +ld.shared.u32 r1230, [r2456+2564]; +ld.shared.u32 r1423, [r2456+2816]; +ld.shared.u32 r1426, [r2456+2820]; +ld.shared.u32 r1189, [r2456+3072]; +ld.shared.u32 r1192, [r2456+3076]; +ld.shared.u32 r1385, [r2456+3328]; +ld.shared.u32 r1388, [r2456+3332]; +ld.shared.u32 r1239, [r2456+3584]; +ld.shared.u32 r1242, [r2456+3588]; +ld.shared.u32 r1435, [r2456+3840]; +ld.shared.u32 r1438, [r2456+3844]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 
r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} 
+{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, 
r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f226; +cvt.rn.f16.f32 high, f226; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f228; +cvt.rn.f16.f32 high, f228; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} 
+{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ 
+sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r2457, r2449, 16; +bfe.u32 r2458, r2449, 4, 1; +cvt.rn.f32.u32 f304, r2458; +mul.f32 f305, f304, 0f3E490FDB; +cos.approx.f32 f267, f305; +sin.approx.f32 f306, f305; +neg.f32 f268, f306; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ 
+mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 
r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; 
+cvt.rn.f16.f32 high, f298; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, 
r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f297; +cvt.rn.f16.f32 high, f298; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r2459, r2449, 3; +and.b32 r2460, r2459, 120; +add.s32 r2461, r2453, r2460; +barrier.sync 0; +and.b32 r2462, r2451, 2048; +add.s32 r2463, r2461, r2462; +st.shared.u32 [r2463], r1695; +st.shared.u32 [r2463+4], r1698; +st.shared.u32 [r2463+128], r1801; +st.shared.u32 [r2463+132], r1810; +st.shared.u32 [r2463+256], r1838; +st.shared.u32 [r2463+260], r1847; +st.shared.u32 [r2463+384], r1875; +st.shared.u32 [r2463+388], r1884; +st.shared.u32 [r2463+512], r1912; +st.shared.u32 [r2463+516], r1921; +st.shared.u32 [r2463+640], r1949; +st.shared.u32 [r2463+644], r1958; +st.shared.u32 [r2463+768], r1986; +st.shared.u32 [r2463+772], r1995; +st.shared.u32 [r2463+896], r2023; +st.shared.u32 [r2463+900], r2032; +st.shared.u32 [r2463+1024], 
r2060; +st.shared.u32 [r2463+1028], r2069; +st.shared.u32 [r2463+1152], r2097; +st.shared.u32 [r2463+1156], r2106; +st.shared.u32 [r2463+1280], r2134; +st.shared.u32 [r2463+1284], r2143; +st.shared.u32 [r2463+1408], r2171; +st.shared.u32 [r2463+1412], r2180; +st.shared.u32 [r2463+1536], r2208; +st.shared.u32 [r2463+1540], r2217; +st.shared.u32 [r2463+1664], r2245; +st.shared.u32 [r2463+1668], r2254; +st.shared.u32 [r2463+1792], r2282; +st.shared.u32 [r2463+1796], r2291; +st.shared.u32 [r2463+1920], r2319; +st.shared.u32 [r2463+1924], r2328; +barrier.sync 0; +mad.lo.s32 r2464, r2457, -120, r2463; +ld.shared.u32 r2350, [r2464]; +ld.shared.u32 r2353, [r2464+4]; +ld.shared.u32 r2362, [r2464+256]; +ld.shared.u32 r2365, [r2464+260]; +ld.shared.u32 r2374, [r2464+512]; +ld.shared.u32 r2377, [r2464+516]; +ld.shared.u32 r2386, [r2464+768]; +ld.shared.u32 r2389, [r2464+772]; +ld.shared.u32 r2398, [r2464+1024]; +ld.shared.u32 r2401, [r2464+1028]; +ld.shared.u32 r2410, [r2464+1280]; +ld.shared.u32 r2413, [r2464+1284]; +ld.shared.u32 r2422, [r2464+1536]; +ld.shared.u32 r2425, [r2464+1540]; +ld.shared.u32 r2434, [r2464+1792]; +ld.shared.u32 r2437, [r2464+1796]; +ld.shared.u32 r2351, [r2464+2048]; +ld.shared.u32 r2354, [r2464+2052]; +ld.shared.u32 r2363, [r2464+2304]; +ld.shared.u32 r2366, [r2464+2308]; +ld.shared.u32 r2375, [r2464+2560]; +ld.shared.u32 r2378, [r2464+2564]; +ld.shared.u32 r2387, [r2464+2816]; +ld.shared.u32 r2390, [r2464+2820]; +ld.shared.u32 r2399, [r2464+3072]; +ld.shared.u32 r2402, [r2464+3076]; +ld.shared.u32 r2411, [r2464+3328]; +ld.shared.u32 r2414, [r2464+3332]; +ld.shared.u32 r2423, [r2464+3584]; +ld.shared.u32 r2426, [r2464+3588]; +ld.shared.u32 r2435, [r2464+3840]; +ld.shared.u32 r2438, [r2464+3844]; +{ +add.f16x2 %0, r2350, r2351; +} +{ +add.f16x2 %1, r2353, r2354; +} +{ +sub.f16x2 %16, r2350, r2351; +} +{ +sub.f16x2 %17, r2353, r2354; +} +{ +add.f16x2 %2, r2362, r2363; +} +{ +add.f16x2 %3, r2365, r2366; +} +{ +sub.f16x2 %18, r2362, r2363; +} +{ 
+sub.f16x2 %19, r2365, r2366; +} +{ +add.f16x2 %4, r2374, r2375; +} +{ +add.f16x2 %5, r2377, r2378; +} +{ +sub.f16x2 %20, r2374, r2375; +} +{ +sub.f16x2 %21, r2377, r2378; +} +{ +add.f16x2 %6, r2386, r2387; +} +{ +add.f16x2 %7, r2389, r2390; +} +{ +sub.f16x2 %22, r2386, r2387; +} +{ +sub.f16x2 %23, r2389, r2390; +} +{ +add.f16x2 %8, r2398, r2399; +} +{ +add.f16x2 %9, r2401, r2402; +} +{ +sub.f16x2 %24, r2398, r2399; +} +{ +sub.f16x2 %25, r2401, r2402; +} +{ +add.f16x2 %10, r2410, r2411; +} +{ +add.f16x2 %11, r2413, r2414; +} +{ +sub.f16x2 %26, r2410, r2411; +} +{ +sub.f16x2 %27, r2413, r2414; +} +{ +add.f16x2 %12, r2422, r2423; +} +{ +add.f16x2 %13, r2425, r2426; +} +{ +sub.f16x2 %28, r2422, r2423; +} +{ +sub.f16x2 %29, r2425, r2426; +} +{ +add.f16x2 %14, r2434, r2435; +} +{ +add.f16x2 %15, r2437, r2438; +} +{ +sub.f16x2 %30, r2434, r2435; +} +{ +sub.f16x2 %31, r2437, r2438; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), 
"r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1023, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<660>; +.reg .b32 r<4171>; +.reg .b64 rd<3>; +mov.u32 r4095, %tid.y; +shl.b32 r4096, r4095, 11; +mov.u32 r4097, %64; +add.s32 r4098, r4097, r4096; +mov.u32 r4099, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, 
%67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f618, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r102, {low, high}; +} +mov.f32 f616, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 
r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ 
+add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f614, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r393, {low, high}; +} +mov.f32 f622, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r398, {low, high}; +} +mov.f32 f612, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r404, {low, high}; +} +mov.f32 f620, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ 
+fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, 
r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; 
+} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; 
+cvt.rn.f16.f32 high, f618; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; 
+mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 
r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1240, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, 
r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ 
+sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r4100, r4099, 15; +shl.b32 r4101, r4099, 7; +and.b32 r4102, r4101, -2048; +add.s32 r4103, r4098, r4102; +cvt.rn.f32.u32 f655, r4100; +mul.f32 f656, f655, 0f3C490FDB; +cos.approx.f32 f357, f656; +sin.approx.f32 f657, f656; +neg.f32 f358, f657; +mov.f32 f659, 0fBF800000; +mov.f32 f658, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, 
r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, 
{high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, 
r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 
r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, 
r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; 
+mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, 
r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f659; +cvt.rn.f16.f32 high, f658; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 
r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r4104, r4101, 1920; +add.s32 r4105, r4103, r4104; +st.shared.v4.f32 [r4105], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r4105+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r4105+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r4105+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r4105+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r4105+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r4105+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r4105+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r4106, r4100, -124, r4105; +ld.shared.u32 r2864, [r4106]; +ld.shared.u32 r3480, [r4106+64]; +ld.shared.u32 r3060, [r4106+128]; +ld.shared.u32 r3676, [r4106+192]; +ld.shared.u32 r2914, [r4106+256]; +ld.shared.u32 r3530, [r4106+320]; +ld.shared.u32 r3110, [r4106+384]; +ld.shared.u32 r3726, [r4106+448]; +ld.shared.u32 r2876, [r4106+512]; +ld.shared.u32 r3492, [r4106+576]; +ld.shared.u32 r3072, [r4106+640]; +ld.shared.u32 r3688, [r4106+704]; +ld.shared.u32 r2926, [r4106+768]; +ld.shared.u32 r3542, [r4106+832]; +ld.shared.u32 r3122, [r4106+896]; +ld.shared.u32 r3738, [r4106+960]; +ld.shared.u32 r2865, [r4106+1024]; +ld.shared.u32 r3481, [r4106+1088]; +ld.shared.u32 r3061, [r4106+1152]; +ld.shared.u32 r3677, [r4106+1216]; +ld.shared.u32 r2915, [r4106+1280]; +ld.shared.u32 r3531, [r4106+1344]; +ld.shared.u32 r3111, [r4106+1408]; +ld.shared.u32 r3727, [r4106+1472]; +ld.shared.u32 r2877, [r4106+1536]; +ld.shared.u32 r3493, [r4106+1600]; +ld.shared.u32 r3073, 
[r4106+1664]; +ld.shared.u32 r3689, [r4106+1728]; +ld.shared.u32 r2927, [r4106+1792]; +ld.shared.u32 r3543, [r4106+1856]; +ld.shared.u32 r3123, [r4106+1920]; +ld.shared.u32 r3739, [r4106+1984]; +barrier.sync 0; +st.shared.v4.f32 [r4105], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r4105+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r4105+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r4105+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r4105+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r4105+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r4105+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r4105+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r4106]; +ld.shared.u32 r3483, [r4106+64]; +ld.shared.u32 r3063, [r4106+128]; +ld.shared.u32 r3679, [r4106+192]; +ld.shared.u32 r2917, [r4106+256]; +ld.shared.u32 r3533, [r4106+320]; +ld.shared.u32 r3113, [r4106+384]; +ld.shared.u32 r3729, [r4106+448]; +ld.shared.u32 r2879, [r4106+512]; +ld.shared.u32 r3495, [r4106+576]; +ld.shared.u32 r3075, [r4106+640]; +ld.shared.u32 r3691, [r4106+704]; +ld.shared.u32 r2929, [r4106+768]; +ld.shared.u32 r3545, [r4106+832]; +ld.shared.u32 r3125, [r4106+896]; +ld.shared.u32 r3741, [r4106+960]; +ld.shared.u32 r2868, [r4106+1024]; +ld.shared.u32 r3484, [r4106+1088]; +ld.shared.u32 r3064, [r4106+1152]; +ld.shared.u32 r3680, [r4106+1216]; +ld.shared.u32 r2918, [r4106+1280]; +ld.shared.u32 r3534, [r4106+1344]; +ld.shared.u32 r3114, [r4106+1408]; +ld.shared.u32 r3730, [r4106+1472]; +ld.shared.u32 r2880, [r4106+1536]; +ld.shared.u32 r3496, [r4106+1600]; +ld.shared.u32 r3076, [r4106+1664]; +ld.shared.u32 r3692, [r4106+1728]; +ld.shared.u32 r2930, [r4106+1792]; +ld.shared.u32 r3546, [r4106+1856]; +ld.shared.u32 r3126, [r4106+1920]; +ld.shared.u32 r3742, [r4106+1984]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ 
+add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; 
+} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3159, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; 
+cvt.rn.f16.f32 high, f614; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 %0, r3011, r3207; +} 
+{ +add.f16x2 %1, r3014, r3210; +} +{ +sub.f16x2 %32, r3011, r3207; +} +{ +sub.f16x2 %33, r3014, r3210; +} +{ +add.f16x2 %4, r3023, r3291; +} +{ +add.f16x2 %5, r3026, r3297; +} +{ +sub.f16x2 %36, r3023, r3291; +} +{ +sub.f16x2 %37, r3026, r3297; +} +{ +add.f16x2 %8, r3035, r3307; +} +{ +add.f16x2 %9, r3038, r3313; +} +{ +sub.f16x2 %40, r3035, r3307; +} +{ +sub.f16x2 %41, r3038, r3313; +} +{ +add.f16x2 %12, r3047, r3323; +} +{ +add.f16x2 %13, r3050, r3329; +} +{ +sub.f16x2 %44, r3047, r3323; +} +{ +sub.f16x2 %45, r3050, r3329; +} +{ +add.f16x2 %16, r3017, r3333; +} +{ +add.f16x2 %17, r3020, r3213; +} +{ +sub.f16x2 %48, r3017, r3333; +} +{ +sub.f16x2 %49, r3020, r3213; +} +{ +add.f16x2 %20, r3029, r3341; +} +{ +add.f16x2 %21, r3032, r3347; +} +{ +sub.f16x2 %52, r3029, r3341; +} +{ +sub.f16x2 %53, r3032, r3347; +} +{ +add.f16x2 %24, r3041, r3357; +} +{ +add.f16x2 %25, r3044, r3363; +} +{ +sub.f16x2 %56, r3041, r3357; +} +{ +sub.f16x2 %57, r3044, r3363; +} +{ +add.f16x2 %28, r3053, r3373; +} +{ +add.f16x2 %29, r3056, r3379; +} +{ +sub.f16x2 %60, r3053, r3373; +} +{ +sub.f16x2 %61, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, 
r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ 
+sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ 
+fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f612; +cvt.rn.f16.f32 high, f612; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f614; +cvt.rn.f16.f32 high, f614; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f616; +cvt.rn.f16.f32 high, f616; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f618; +cvt.rn.f16.f32 high, f618; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f620; +cvt.rn.f16.f32 high, f620; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f622; +cvt.rn.f16.f32 high, f622; +mov.b32 r3884, {low, high}; +} +{ 
+mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 %2, r3627, r3823; +} +{ +add.f16x2 %3, r3630, r3826; +} +{ +sub.f16x2 %34, r3627, r3823; +} +{ +sub.f16x2 %35, r3630, r3826; +} +{ +add.f16x2 %6, r3639, r3907; +} +{ +add.f16x2 %7, r3642, r3913; +} +{ +sub.f16x2 %38, r3639, r3907; +} +{ +sub.f16x2 %39, r3642, r3913; +} +{ +add.f16x2 %10, r3651, r3923; +} +{ +add.f16x2 %11, r3654, r3929; +} +{ +sub.f16x2 %42, r3651, r3923; +} +{ +sub.f16x2 %43, r3654, r3929; +} +{ +add.f16x2 %14, r3663, r3939; +} +{ +add.f16x2 %15, r3666, r3945; +} +{ +sub.f16x2 %46, r3663, r3939; +} +{ +sub.f16x2 %47, r3666, r3945; +} +{ +add.f16x2 %18, r3633, r3949; +} +{ +add.f16x2 %19, r3636, r3829; +} +{ +sub.f16x2 %50, r3633, r3949; +} +{ +sub.f16x2 %51, r3636, r3829; +} +{ +add.f16x2 %22, r3645, r3957; +} +{ +add.f16x2 %23, 
r3648, r3963; +} +{ +sub.f16x2 %54, r3645, r3957; +} +{ +sub.f16x2 %55, r3648, r3963; +} +{ +add.f16x2 %26, r3657, r3973; +} +{ +add.f16x2 %27, r3660, r3979; +} +{ +sub.f16x2 %58, r3657, r3973; +} +{ +sub.f16x2 %59, r3660, r3979; +} +{ +add.f16x2 %30, r3669, r3989; +} +{ +add.f16x2 %31, r3672, r3995; +} +{ +sub.f16x2 %62, r3669, r3989; +} +{ +sub.f16x2 %63, r3672, r3995; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), 
"=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), 
"r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1025, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<53>; +.reg .b32 r<715>; +.reg .b64 rd<2>; +mov.u32 r681, %tid.y; +shl.b32 r682, r681, 12; +mov.u32 r683, %8; +add.s32 r684, r683, r682; +mov.u32 r685, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r686, r685, 127; +shl.b32 r687, r685, 5; +and.b32 r688, r687, -4096; +add.s32 r689, r684, r688; +cvt.rn.f32.u32 f41, r686; +mul.f32 f42, f41, 0f3C490FDB; +cos.approx.f32 f1, f42; +sin.approx.f32 f43, f42; +neg.f32 f2, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ 
+mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f37, 0fBF800000; +mov.f32 f38, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r690, r687, 4064; +add.s32 r691, r689, r690; +st.shared.v4.f32 [r691], {r27, r30, r61, r70}; +st.shared.v4.f32 [r691+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r692, r686, -24, r691; 
+ld.shared.u32 r166, [r692]; +ld.shared.u32 r169, [r692+4]; +ld.shared.u32 r178, [r692+1024]; +ld.shared.u32 r181, [r692+1028]; +ld.shared.u32 r167, [r692+2048]; +ld.shared.u32 r170, [r692+2052]; +ld.shared.u32 r179, [r692+3072]; +ld.shared.u32 r182, [r692+3076]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r693, r685, 124; +bfe.u32 r694, r685, 2, 5; +cvt.rn.f32.u32 f44, r694; +mul.f32 f45, f44, 0f3D490FDB; +cos.approx.f32 f11, f45; +sin.approx.f32 f46, f45; +neg.f32 f12, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +shl.b32 r695, r685, 3; +and.b32 r696, r695, 24; +add.s32 r697, r689, r696; +barrier.sync 0; +and.b32 r698, r687, 3968; +add.s32 r699, r697, r698; +st.shared.u32 [r699], r191; +st.shared.u32 [r699+4], r194; +st.shared.u32 [r699+32], r225; +st.shared.u32 [r699+36], r234; +st.shared.u32 [r699+64], r262; +st.shared.u32 [r699+68], r271; +st.shared.u32 [r699+96], r299; +st.shared.u32 [r699+100], r308; +barrier.sync 0; +mad.lo.s32 r700, r693, -24, r699; +ld.shared.u32 r330, [r700]; +ld.shared.u32 r333, [r700+4]; +ld.shared.u32 r342, [r700+1024]; +ld.shared.u32 r345, [r700+1028]; +ld.shared.u32 r331, [r700+2048]; +ld.shared.u32 r334, [r700+2052]; +ld.shared.u32 r343, [r700+3072]; +ld.shared.u32 r346, [r700+3076]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 
r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r701, r685, 112; +bfe.u32 r702, r685, 4, 3; +cvt.rn.f32.u32 f47, r702; +mul.f32 f48, f47, 0f3E490FDB; +cos.approx.f32 f21, f48; +sin.approx.f32 f49, f48; +neg.f32 f22, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; 
+mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +and.b32 r703, r695, 120; +add.s32 r704, r689, r703; +barrier.sync 0; +and.b32 r705, r687, 3584; +add.s32 r706, r704, r705; +st.shared.u32 [r706], r355; +st.shared.u32 [r706+4], r358; +st.shared.u32 [r706+128], r389; +st.shared.u32 [r706+132], r398; +st.shared.u32 [r706+256], r426; +st.shared.u32 [r706+260], r435; +st.shared.u32 [r706+384], r463; +st.shared.u32 [r706+388], r472; +barrier.sync 0; +mad.lo.s32 r707, r701, -24, r706; +ld.shared.u32 r494, [r707]; +ld.shared.u32 r497, [r707+4]; +ld.shared.u32 r506, [r707+1024]; +ld.shared.u32 r509, [r707+1028]; +ld.shared.u32 r495, [r707+2048]; +ld.shared.u32 r498, [r707+2052]; +ld.shared.u32 r507, [r707+3072]; +ld.shared.u32 r510, [r707+3076]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r514; +} +{ +add.f16x2 r519, r493, r505; +} +{ +add.f16x2 r522, r496, r508; +} +{ +sub.f16x2 r525, r493, r505; +} +{ +sub.f16x2 r528, r496, r508; +} +{ +add.f16x2 r531, r499, r517; +} +{ +add.f16x2 r534, r502, r511; +} +{ 
+sub.f16x2 r537, r499, r517; +} +{ +sub.f16x2 r540, r502, r511; +} +and.b32 r708, r685, 64; +bfe.u32 r709, r685, 6, 1; +cvt.rn.f32.u32 f50, r709; +mul.f32 f51, f50, 0f3F490FDB; +cos.approx.f32 f31, f51; +sin.approx.f32 f52, f51; +neg.f32 f32, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r543, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r546, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r548, {high, high}; +} +{ +mul.f16x2 r550, r534, r548; +} +{ +fma.rn.f16x2 r553, r531, r546, r550; +} +{ +mul.f16x2 r557, r531, r548; +} +{ +neg.f16x2 r560, r557; +} +{ +fma.rn.f16x2 r562, r534, r546, r560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r566, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r568, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r570, {low, high}; +} +{ +mul.f16x2 r571, r568, r570; +} +{ +mul.f16x2 r574, r543, r566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r577, {high, low}; +} +{ +fma.rn.f16x2 r579, r571, r577, r574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r583, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r585, {high, high}; +} +{ +mul.f16x2 r587, r528, r585; +} +{ +fma.rn.f16x2 r590, r525, r583, r587; +} +{ +mul.f16x2 r594, r525, r585; +} +{ +neg.f16x2 r597, r594; +} +{ +fma.rn.f16x2 r599, r528, r583, r597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r603, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r605, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r605, r607; +} +{ +mul.f16x2 r611, r579, r603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r614, {high, low}; +} +{ +fma.rn.f16x2 r616, 
r608, r614, r611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r540, r622; +} +{ +fma.rn.f16x2 r627, r537, r620, r624; +} +{ +mul.f16x2 r631, r537, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r540, r620, r634; +} +and.b32 r710, r695, 504; +add.s32 r711, r689, r710; +barrier.sync 0; +and.b32 r712, r687, 2048; +add.s32 r713, r711, r712; +st.shared.u32 [r713], r519; +st.shared.u32 [r713+4], r522; +st.shared.u32 [r713+512], r553; +st.shared.u32 [r713+516], r562; +st.shared.u32 [r713+1024], r590; +st.shared.u32 [r713+1028], r599; +st.shared.u32 [r713+1536], r627; +st.shared.u32 [r713+1540], r636; +barrier.sync 0; +mad.lo.s32 r714, r708, -24, r713; +ld.shared.u32 r658, [r714]; +ld.shared.u32 r661, [r714+4]; +ld.shared.u32 r670, [r714+1024]; +ld.shared.u32 r673, [r714+1028]; +ld.shared.u32 r659, [r714+2048]; +ld.shared.u32 r662, [r714+2052]; +ld.shared.u32 r671, [r714+3072]; +ld.shared.u32 r674, [r714+3076]; +{ +add.f16x2 %0, r658, r659; +} +{ +add.f16x2 %1, r661, r662; +} +{ +sub.f16x2 %4, r658, r659; +} +{ +sub.f16x2 %5, r661, r662; +} +{ +add.f16x2 %2, r670, r671; +} +{ +add.f16x2 %3, r673, r674; +} +{ +sub.f16x2 %6, r670, r671; +} +{ +sub.f16x2 %7, r673, r674; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1026, __half2, 1>(cufftdx::detail::complex<__half2> 
*rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<53>; +.reg .b32 r<715>; +.reg .b64 rd<2>; +mov.u32 r681, %tid.y; +shl.b32 r682, r681, 11; +mov.u32 r683, %8; +add.s32 r684, r683, r682; +mov.u32 r685, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r686, r685, 127; +shl.b32 r687, r685, 4; +and.b32 r688, r687, -2048; +add.s32 r689, r684, r688; +cvt.rn.f32.u32 f41, r686; +mul.f32 f42, f41, 0f3C490FDB; +cos.approx.f32 f1, f42; +sin.approx.f32 f43, f42; +neg.f32 f2, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f37, 0fBF800000; +mov.f32 f38, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r690, r687, 2032; +add.s32 r691, r689, r690; +st.shared.v4.f32 [r691], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r692, r686, -12, r691; +ld.shared.u32 r166, [r692]; +ld.shared.u32 r178, [r692+512]; +ld.shared.u32 r167, [r692+1024]; +ld.shared.u32 r179, [r692+1536]; +barrier.sync 0; +st.shared.v4.f32 [r691], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r692]; +ld.shared.u32 r181, [r692+512]; +ld.shared.u32 r170, [r692+1024]; +ld.shared.u32 r182, [r692+1536]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 
r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r693, r685, 124; +bfe.u32 r694, r685, 2, 5; +shl.b32 r695, r685, 2; +and.b32 r696, r695, 12; +add.s32 r697, r689, r696; +cvt.rn.f32.u32 f44, r694; +mul.f32 f45, f44, 0f3D490FDB; +cos.approx.f32 f11, f45; +sin.approx.f32 f46, f45; +neg.f32 f12, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; 
+mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +barrier.sync 0; +and.b32 r698, r687, 1984; +add.s32 r699, r697, r698; +st.shared.u32 [r699], r191; +st.shared.u32 [r699+16], r225; +st.shared.u32 [r699+32], r262; +st.shared.u32 [r699+48], r299; +barrier.sync 0; +mad.lo.s32 r700, r693, -12, r699; +ld.shared.u32 r330, [r700]; +ld.shared.u32 r342, [r700+512]; +ld.shared.u32 r331, [r700+1024]; +ld.shared.u32 r343, [r700+1536]; +barrier.sync 0; +st.shared.u32 [r699], r194; +st.shared.u32 [r699+16], r234; +st.shared.u32 [r699+32], r271; +st.shared.u32 [r699+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r700]; +ld.shared.u32 r345, [r700+512]; +ld.shared.u32 r334, [r700+1024]; +ld.shared.u32 r346, [r700+1536]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 r355, r329, r341; +} +{ +add.f16x2 r358, r332, r344; +} +{ +sub.f16x2 r361, r329, r341; +} +{ +sub.f16x2 r364, r332, r344; +} +{ +add.f16x2 r367, r335, r353; +} +{ +add.f16x2 r370, r338, r347; +} +{ +sub.f16x2 r373, r335, r353; +} +{ +sub.f16x2 r376, r338, r347; +} +and.b32 r701, r685, 112; +bfe.u32 r702, r685, 4, 3; +and.b32 r703, r695, 60; +add.s32 r704, r689, r703; +cvt.rn.f32.u32 f47, r702; +mul.f32 f48, f47, 
0f3E490FDB; +cos.approx.f32 f21, f48; +sin.approx.f32 f49, f48; +neg.f32 f22, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f21; +cvt.rn.f16.f32 high, f22; +mov.b32 r379, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r384, {high, high}; +} +{ +mul.f16x2 r386, r370, r384; +} +{ +fma.rn.f16x2 r389, r367, r382, r386; +} +{ +mul.f16x2 r393, r367, r384; +} +{ +neg.f16x2 r396, r393; +} +{ +fma.rn.f16x2 r398, r370, r382, r396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r407, r404, r406; +} +{ +mul.f16x2 r410, r379, r402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r413, {high, low}; +} +{ +fma.rn.f16x2 r415, r407, r413, r410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r421, {high, high}; +} +{ +mul.f16x2 r423, r364, r421; +} +{ +fma.rn.f16x2 r426, r361, r419, r423; +} +{ +mul.f16x2 r430, r361, r421; +} +{ +neg.f16x2 r433, r430; +} +{ +fma.rn.f16x2 r435, r364, r419, r433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r379; +mov.b32 r441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r443, {low, high}; +} +{ +mul.f16x2 r444, r441, r443; +} +{ +mul.f16x2 r447, r415, r439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r415; +mov.b32 r450, {high, low}; +} +{ +fma.rn.f16x2 r452, r444, r450, r447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 r456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r452; +mov.b32 
r458, {high, high}; +} +{ +mul.f16x2 r460, r376, r458; +} +{ +fma.rn.f16x2 r463, r373, r456, r460; +} +{ +mul.f16x2 r467, r373, r458; +} +{ +neg.f16x2 r470, r467; +} +{ +fma.rn.f16x2 r472, r376, r456, r470; +} +barrier.sync 0; +and.b32 r705, r687, 1792; +add.s32 r706, r704, r705; +st.shared.u32 [r706], r355; +st.shared.u32 [r706+64], r389; +st.shared.u32 [r706+128], r426; +st.shared.u32 [r706+192], r463; +barrier.sync 0; +mad.lo.s32 r707, r701, -12, r706; +ld.shared.u32 r494, [r707]; +ld.shared.u32 r506, [r707+512]; +ld.shared.u32 r495, [r707+1024]; +ld.shared.u32 r507, [r707+1536]; +barrier.sync 0; +st.shared.u32 [r706], r358; +st.shared.u32 [r706+64], r398; +st.shared.u32 [r706+128], r435; +st.shared.u32 [r706+192], r472; +barrier.sync 0; +ld.shared.u32 r497, [r707]; +ld.shared.u32 r509, [r707+512]; +ld.shared.u32 r498, [r707+1024]; +ld.shared.u32 r510, [r707+1536]; +{ +add.f16x2 r493, r494, r495; +} +{ +add.f16x2 r496, r497, r498; +} +{ +sub.f16x2 r499, r494, r495; +} +{ +sub.f16x2 r502, r497, r498; +} +{ +add.f16x2 r505, r506, r507; +} +{ +add.f16x2 r508, r509, r510; +} +{ +sub.f16x2 r511, r506, r507; +} +{ +sub.f16x2 r514, r509, r510; +} +{ +neg.f16x2 r517, r514; +} +{ +add.f16x2 r519, r493, r505; +} +{ +add.f16x2 r522, r496, r508; +} +{ +sub.f16x2 r525, r493, r505; +} +{ +sub.f16x2 r528, r496, r508; +} +{ +add.f16x2 r531, r499, r517; +} +{ +add.f16x2 r534, r502, r511; +} +{ +sub.f16x2 r537, r499, r517; +} +{ +sub.f16x2 r540, r502, r511; +} +and.b32 r708, r685, 64; +bfe.u32 r709, r685, 6, 1; +and.b32 r710, r695, 252; +add.s32 r711, r689, r710; +cvt.rn.f32.u32 f50, r709; +mul.f32 f51, f50, 0f3F490FDB; +cos.approx.f32 f31, f51; +sin.approx.f32 f52, f51; +neg.f32 f32, f52; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r543, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r546, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r548, {high, high}; +} +{ +mul.f16x2 
r550, r534, r548; +} +{ +fma.rn.f16x2 r553, r531, r546, r550; +} +{ +mul.f16x2 r557, r531, r548; +} +{ +neg.f16x2 r560, r557; +} +{ +fma.rn.f16x2 r562, r534, r546, r560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r566, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r568, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r570, {low, high}; +} +{ +mul.f16x2 r571, r568, r570; +} +{ +mul.f16x2 r574, r543, r566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r577, {high, low}; +} +{ +fma.rn.f16x2 r579, r571, r577, r574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r583, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r585, {high, high}; +} +{ +mul.f16x2 r587, r528, r585; +} +{ +fma.rn.f16x2 r590, r525, r583, r587; +} +{ +mul.f16x2 r594, r525, r585; +} +{ +neg.f16x2 r597, r594; +} +{ +fma.rn.f16x2 r599, r528, r583, r597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r603, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r543; +mov.b32 r605, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r607, {low, high}; +} +{ +mul.f16x2 r608, r605, r607; +} +{ +mul.f16x2 r611, r579, r603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r579; +mov.b32 r614, {high, low}; +} +{ +fma.rn.f16x2 r616, r608, r614, r611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r616; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r540, r622; +} +{ +fma.rn.f16x2 r627, r537, r620, r624; +} +{ +mul.f16x2 r631, r537, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r540, r620, r634; +} +barrier.sync 0; +and.b32 r712, r687, 1024; +add.s32 r713, r711, r712; +st.shared.u32 [r713], r519; +st.shared.u32 [r713+256], r553; +st.shared.u32 [r713+512], r590; 
+st.shared.u32 [r713+768], r627; +barrier.sync 0; +mad.lo.s32 r714, r708, -12, r713; +ld.shared.u32 r658, [r714]; +ld.shared.u32 r670, [r714+512]; +ld.shared.u32 r659, [r714+1024]; +ld.shared.u32 r671, [r714+1536]; +barrier.sync 0; +st.shared.u32 [r713], r522; +st.shared.u32 [r713+256], r562; +st.shared.u32 [r713+512], r599; +st.shared.u32 [r713+768], r636; +barrier.sync 0; +ld.shared.u32 r661, [r714]; +ld.shared.u32 r673, [r714+512]; +ld.shared.u32 r662, [r714+1024]; +ld.shared.u32 r674, [r714+1536]; +{ +add.f16x2 %0, r658, r659; +} +{ +add.f16x2 %1, r661, r662; +} +{ +sub.f16x2 %4, r658, r659; +} +{ +sub.f16x2 %5, r661, r662; +} +{ +add.f16x2 %2, r670, r671; +} +{ +add.f16x2 %3, r673, r674; +} +{ +sub.f16x2 %6, r670, r671; +} +{ +sub.f16x2 %7, r673, r674; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1027, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<73>; +.reg .b32 r<492>; +.reg .b64 rd<2>; +mov.u32 r429, %tid.y; +shl.b32 r430, r429, 12; +mov.u32 r431, %4; +add.s32 r432, r431, r430; +mov.u32 r433, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r434, r433, 255; +shl.b32 r435, r433, 4; +and.b32 r436, r435, -4096; +add.s32 r437, r432, r436; +cvt.rn.f32.u32 f49, r434; +mul.f32 f50, f49, 0f3C490FDB; +cos.approx.f32 f1, f50; +sin.approx.f32 f51, f50; +neg.f32 f2, f51; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r438, r435, 4080; +add.s32 r439, r437, r438; +st.shared.v2.f32 [r439], {r1, r4}; +st.shared.v2.f32 [r439+8], {r23, r32}; +barrier.sync 0; +shl.b32 r440, r433, 3; +and.b32 r441, r440, 2040; +sub.s32 r442, r439, r441; +ld.shared.u32 r54, [r442]; +ld.shared.u32 r57, [r442+4]; +ld.shared.u32 r55, [r442+2048]; +ld.shared.u32 r58, [r442+2052]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r443, r433, 1, 7; +cvt.rn.f32.u32 f52, r443; +mul.f32 f53, f52, 0f3CC90FDB; +cos.approx.f32 f7, f53; +sin.approx.f32 f54, f53; +neg.f32 f8, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r444, r440, 8; +add.s32 r445, r437, r444; +barrier.sync 0; +and.b32 r446, r435, 4064; +add.s32 r447, r445, r446; +st.shared.u32 [r447], r53; +st.shared.u32 [r447+4], r56; +st.shared.u32 [r447+16], r75; +st.shared.u32 [r447+20], r84; +barrier.sync 0; +and.b32 r448, r440, 2032; +sub.s32 r449, r447, r448; +ld.shared.u32 r106, [r449]; +ld.shared.u32 r109, [r449+4]; +ld.shared.u32 r107, [r449+2048]; +ld.shared.u32 r110, [r449+2052]; +{ +add.f16x2 r105, r106, r107; 
+} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r450, r433, 2, 6; +cvt.rn.f32.u32 f55, r450; +mul.f32 f56, f55, 0f3D490FDB; +cos.approx.f32 f13, f56; +sin.approx.f32 f57, f56; +neg.f32 f14, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r451, r440, 24; +add.s32 r452, r437, r451; +barrier.sync 0; +and.b32 r453, r435, 4032; +add.s32 r454, r452, r453; +st.shared.u32 [r454], r105; +st.shared.u32 [r454+4], r108; +st.shared.u32 [r454+32], r127; +st.shared.u32 [r454+36], r136; +barrier.sync 0; +and.b32 r455, r440, 2016; +sub.s32 r456, r454, r455; +ld.shared.u32 r158, [r456]; +ld.shared.u32 r161, [r456+4]; +ld.shared.u32 r159, [r456+2048]; +ld.shared.u32 r162, [r456+2052]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r457, r433, 3, 5; +cvt.rn.f32.u32 f58, r457; +mul.f32 f59, f58, 0f3DC90FDB; +cos.approx.f32 f19, f59; +sin.approx.f32 f60, f59; +neg.f32 f20, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +and.b32 r458, r440, 56; +add.s32 r459, r437, r458; +barrier.sync 
0; +and.b32 r460, r435, 3968; +add.s32 r461, r459, r460; +st.shared.u32 [r461], r157; +st.shared.u32 [r461+4], r160; +st.shared.u32 [r461+64], r179; +st.shared.u32 [r461+68], r188; +barrier.sync 0; +and.b32 r462, r440, 1984; +sub.s32 r463, r461, r462; +ld.shared.u32 r210, [r463]; +ld.shared.u32 r213, [r463+4]; +ld.shared.u32 r211, [r463+2048]; +ld.shared.u32 r214, [r463+2052]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r464, r433, 4, 4; +cvt.rn.f32.u32 f61, r464; +mul.f32 f62, f61, 0f3E490FDB; +cos.approx.f32 f25, f62; +sin.approx.f32 f63, f62; +neg.f32 f26, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +and.b32 r465, r440, 120; +add.s32 r466, r437, r465; +barrier.sync 0; +and.b32 r467, r435, 3840; +add.s32 r468, r466, r467; +st.shared.u32 [r468], r209; +st.shared.u32 [r468+4], r212; +st.shared.u32 [r468+128], r231; +st.shared.u32 [r468+132], r240; +barrier.sync 0; +and.b32 r469, r440, 1920; +sub.s32 r470, r468, r469; +ld.shared.u32 r262, [r470]; +ld.shared.u32 r265, [r470+4]; +ld.shared.u32 r263, [r470+2048]; +ld.shared.u32 r266, [r470+2052]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r471, r433, 5, 3; +cvt.rn.f32.u32 f64, r471; +mul.f32 f65, f64, 0f3EC90FDB; +cos.approx.f32 f31, f65; +sin.approx.f32 f66, f65; +neg.f32 f32, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +and.b32 r472, r440, 248; +add.s32 r473, r437, r472; +barrier.sync 0; +and.b32 r474, r435, 3584; +add.s32 r475, r473, r474; +st.shared.u32 [r475], r261; +st.shared.u32 [r475+4], r264; +st.shared.u32 [r475+256], r283; +st.shared.u32 [r475+260], r292; +barrier.sync 0; +and.b32 r476, r440, 1792; +sub.s32 r477, r475, r476; +ld.shared.u32 r314, [r477]; +ld.shared.u32 r317, [r477+4]; +ld.shared.u32 r315, [r477+2048]; +ld.shared.u32 r318, [r477+2052]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r478, r433, 6, 2; +cvt.rn.f32.u32 f67, r478; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f37, f68; +sin.approx.f32 f69, f68; +neg.f32 f38, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +fma.rn.f16x2 r335, r319, r328, r332; +} +{ +mul.f16x2 r339, r319, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r322, r328, r342; +} +and.b32 r479, r440, 504; +add.s32 r480, r437, r479; +barrier.sync 0; +and.b32 r481, r435, 3072; +add.s32 r482, r480, r481; +st.shared.u32 [r482], r313; +st.shared.u32 [r482+4], r316; +st.shared.u32 [r482+512], r335; +st.shared.u32 [r482+516], r344; +barrier.sync 0; +and.b32 r483, r440, 1536; +sub.s32 r484, r482, r483; +ld.shared.u32 r366, [r484]; +ld.shared.u32 r369, [r484+4]; +ld.shared.u32 r367, [r484+2048]; +ld.shared.u32 r370, 
[r484+2052]; +{ +add.f16x2 r365, r366, r367; +} +{ +add.f16x2 r368, r369, r370; +} +{ +sub.f16x2 r371, r366, r367; +} +{ +sub.f16x2 r374, r369, r370; +} +bfe.u32 r485, r433, 7, 1; +cvt.rn.f32.u32 f70, r485; +mul.f32 f71, f70, 0f3FC90FDB; +cos.approx.f32 f43, f71; +sin.approx.f32 f72, f71; +neg.f32 f44, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r377, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r380, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r382, {high, high}; +} +{ +mul.f16x2 r384, r374, r382; +} +{ +fma.rn.f16x2 r387, r371, r380, r384; +} +{ +mul.f16x2 r391, r371, r382; +} +{ +neg.f16x2 r394, r391; +} +{ +fma.rn.f16x2 r396, r374, r380, r394; +} +and.b32 r486, r440, 1016; +add.s32 r487, r437, r486; +barrier.sync 0; +and.b32 r488, r435, 2048; +add.s32 r489, r487, r488; +st.shared.u32 [r489], r365; +st.shared.u32 [r489+4], r368; +st.shared.u32 [r489+1024], r387; +st.shared.u32 [r489+1028], r396; +barrier.sync 0; +and.b32 r490, r440, 1024; +sub.s32 r491, r489, r490; +ld.shared.u32 r418, [r491]; +ld.shared.u32 r421, [r491+4]; +ld.shared.u32 r419, [r491+2048]; +ld.shared.u32 r422, [r491+2052]; +{ +add.f16x2 %0, r418, r419; +} +{ +add.f16x2 %1, r421, r422; +} +{ +sub.f16x2 %2, r418, r419; +} +{ +sub.f16x2 %3, r421, r422; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1028, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<73>; +.reg .b32 r<492>; +.reg .b64 rd<2>; +mov.u32 r429, %tid.y; +shl.b32 r430, r429, 11; +mov.u32 r431, %4; +add.s32 r432, r431, r430; +mov.u32 r433, %tid.x; +{ +add.f16x2 r1, 
%5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r434, r433, 255; +shl.b32 r435, r433, 3; +and.b32 r436, r435, -2048; +add.s32 r437, r432, r436; +cvt.rn.f32.u32 f49, r434; +mul.f32 f50, f49, 0f3C490FDB; +cos.approx.f32 f1, f50; +sin.approx.f32 f51, f50; +neg.f32 f2, f51; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r438, r435, 2040; +add.s32 r439, r437, r438; +st.shared.v2.f32 [r439], {r1, r23}; +barrier.sync 0; +shl.b32 r440, r433, 2; +and.b32 r441, r440, 1020; +sub.s32 r442, r439, r441; +ld.shared.u32 r54, [r442]; +ld.shared.u32 r55, [r442+1024]; +barrier.sync 0; +st.shared.v2.f32 [r439], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r442]; +ld.shared.u32 r58, [r442+1024]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r443, r433, 1, 7; +and.b32 r444, r440, 4; +add.s32 r445, r437, r444; +cvt.rn.f32.u32 f52, r443; +mul.f32 f53, f52, 0f3CC90FDB; +cos.approx.f32 f7, f53; +sin.approx.f32 f54, f53; +neg.f32 f8, f54; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r446, r435, 2032; +add.s32 r447, 
r445, r446; +st.shared.u32 [r447], r53; +st.shared.u32 [r447+8], r75; +barrier.sync 0; +and.b32 r448, r440, 1016; +sub.s32 r449, r447, r448; +ld.shared.u32 r106, [r449]; +ld.shared.u32 r107, [r449+1024]; +barrier.sync 0; +st.shared.u32 [r447], r56; +st.shared.u32 [r447+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r449]; +ld.shared.u32 r110, [r449+1024]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r450, r433, 2, 6; +and.b32 r451, r440, 12; +add.s32 r452, r437, r451; +cvt.rn.f32.u32 f55, r450; +mul.f32 f56, f55, 0f3D490FDB; +cos.approx.f32 f13, f56; +sin.approx.f32 f57, f56; +neg.f32 f14, f57; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r453, r435, 2016; +add.s32 r454, r452, r453; +st.shared.u32 [r454], r105; +st.shared.u32 [r454+16], r127; +barrier.sync 0; +and.b32 r455, r440, 1008; +sub.s32 r456, r454, r455; +ld.shared.u32 r158, [r456]; +ld.shared.u32 r159, [r456+1024]; +barrier.sync 0; +st.shared.u32 [r454], r108; +st.shared.u32 [r454+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r456]; +ld.shared.u32 r162, [r456+1024]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r457, r433, 3, 5; +and.b32 r458, r440, 28; +add.s32 r459, r437, r458; +cvt.rn.f32.u32 f58, r457; +mul.f32 f59, f58, 0f3DC90FDB; +cos.approx.f32 f19, f59; +sin.approx.f32 f60, f59; +neg.f32 f20, f60; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; 
+cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +barrier.sync 0; +and.b32 r460, r435, 1984; +add.s32 r461, r459, r460; +st.shared.u32 [r461], r157; +st.shared.u32 [r461+32], r179; +barrier.sync 0; +and.b32 r462, r440, 992; +sub.s32 r463, r461, r462; +ld.shared.u32 r210, [r463]; +ld.shared.u32 r211, [r463+1024]; +barrier.sync 0; +st.shared.u32 [r461], r160; +st.shared.u32 [r461+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r463]; +ld.shared.u32 r214, [r463+1024]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r464, r433, 4, 4; +and.b32 r465, r440, 60; +add.s32 r466, r437, r465; +cvt.rn.f32.u32 f61, r464; +mul.f32 f62, f61, 0f3E490FDB; +cos.approx.f32 f25, f62; +sin.approx.f32 f63, f62; +neg.f32 f26, f63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +barrier.sync 0; +and.b32 r467, r435, 1920; +add.s32 r468, r466, r467; +st.shared.u32 [r468], r209; +st.shared.u32 [r468+64], r231; +barrier.sync 0; +and.b32 r469, r440, 960; +sub.s32 r470, r468, r469; +ld.shared.u32 r262, [r470]; +ld.shared.u32 r263, [r470+1024]; +barrier.sync 0; +st.shared.u32 [r468], r212; +st.shared.u32 [r468+64], r240; 
+barrier.sync 0; +ld.shared.u32 r265, [r470]; +ld.shared.u32 r266, [r470+1024]; +{ +add.f16x2 r261, r262, r263; +} +{ +add.f16x2 r264, r265, r266; +} +{ +sub.f16x2 r267, r262, r263; +} +{ +sub.f16x2 r270, r265, r266; +} +bfe.u32 r471, r433, 5, 3; +and.b32 r472, r440, 124; +add.s32 r473, r437, r472; +cvt.rn.f32.u32 f64, r471; +mul.f32 f65, f64, 0f3EC90FDB; +cos.approx.f32 f31, f65; +sin.approx.f32 f66, f65; +neg.f32 f32, f66; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f31; +cvt.rn.f16.f32 high, f32; +mov.b32 r273, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r276, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r273; +mov.b32 r278, {high, high}; +} +{ +mul.f16x2 r280, r270, r278; +} +{ +fma.rn.f16x2 r283, r267, r276, r280; +} +{ +mul.f16x2 r287, r267, r278; +} +{ +neg.f16x2 r290, r287; +} +{ +fma.rn.f16x2 r292, r270, r276, r290; +} +barrier.sync 0; +and.b32 r474, r435, 1792; +add.s32 r475, r473, r474; +st.shared.u32 [r475], r261; +st.shared.u32 [r475+128], r283; +barrier.sync 0; +and.b32 r476, r440, 896; +sub.s32 r477, r475, r476; +ld.shared.u32 r314, [r477]; +ld.shared.u32 r315, [r477+1024]; +barrier.sync 0; +st.shared.u32 [r475], r264; +st.shared.u32 [r475+128], r292; +barrier.sync 0; +ld.shared.u32 r317, [r477]; +ld.shared.u32 r318, [r477+1024]; +{ +add.f16x2 r313, r314, r315; +} +{ +add.f16x2 r316, r317, r318; +} +{ +sub.f16x2 r319, r314, r315; +} +{ +sub.f16x2 r322, r317, r318; +} +bfe.u32 r478, r433, 6, 2; +and.b32 r479, r440, 252; +add.s32 r480, r437, r479; +cvt.rn.f32.u32 f67, r478; +mul.f32 f68, f67, 0f3F490FDB; +cos.approx.f32 f37, f68; +sin.approx.f32 f69, f68; +neg.f32 f38, f69; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r325; +mov.b32 r330, {high, high}; +} +{ +mul.f16x2 r332, r322, r330; +} +{ +fma.rn.f16x2 
r335, r319, r328, r332; +} +{ +mul.f16x2 r339, r319, r330; +} +{ +neg.f16x2 r342, r339; +} +{ +fma.rn.f16x2 r344, r322, r328, r342; +} +barrier.sync 0; +and.b32 r481, r435, 1536; +add.s32 r482, r480, r481; +st.shared.u32 [r482], r313; +st.shared.u32 [r482+256], r335; +barrier.sync 0; +and.b32 r483, r440, 768; +sub.s32 r484, r482, r483; +ld.shared.u32 r366, [r484]; +ld.shared.u32 r367, [r484+1024]; +barrier.sync 0; +st.shared.u32 [r482], r316; +st.shared.u32 [r482+256], r344; +barrier.sync 0; +ld.shared.u32 r369, [r484]; +ld.shared.u32 r370, [r484+1024]; +{ +add.f16x2 r365, r366, r367; +} +{ +add.f16x2 r368, r369, r370; +} +{ +sub.f16x2 r371, r366, r367; +} +{ +sub.f16x2 r374, r369, r370; +} +bfe.u32 r485, r433, 7, 1; +and.b32 r486, r440, 508; +add.s32 r487, r437, r486; +cvt.rn.f32.u32 f70, r485; +mul.f32 f71, f70, 0f3FC90FDB; +cos.approx.f32 f43, f71; +sin.approx.f32 f72, f71; +neg.f32 f44, f72; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r377, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r380, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r377; +mov.b32 r382, {high, high}; +} +{ +mul.f16x2 r384, r374, r382; +} +{ +fma.rn.f16x2 r387, r371, r380, r384; +} +{ +mul.f16x2 r391, r371, r382; +} +{ +neg.f16x2 r394, r391; +} +{ +fma.rn.f16x2 r396, r374, r380, r394; +} +barrier.sync 0; +and.b32 r488, r435, 1024; +add.s32 r489, r487, r488; +st.shared.u32 [r489], r365; +st.shared.u32 [r489+512], r387; +barrier.sync 0; +and.b32 r490, r440, 512; +sub.s32 r491, r489, r490; +ld.shared.u32 r418, [r491]; +ld.shared.u32 r419, [r491+1024]; +barrier.sync 0; +st.shared.u32 [r489], r368; +st.shared.u32 [r489+512], r396; +barrier.sync 0; +ld.shared.u32 r421, [r491]; +ld.shared.u32 r422, [r491+1024]; +{ +add.f16x2 %0, r418, r419; +} +{ +add.f16x2 %1, r421, r422; +} +{ +sub.f16x2 %2, r418, r419; +} +{ +sub.f16x2 %3, r421, r422; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..d03e837085ac8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_fwd.hpp.inc @@ -0,0 +1,5972 @@ +#ifndef CUFFTDX_FFT_512_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_512_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<71, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<406>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 
f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; 
+fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 4032; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+512]; +ld.shared.v2.f32 {f167, f168}, [r13+1024]; +ld.shared.v2.f32 {f171, f172}, [r13+1536]; +ld.shared.v2.f32 {f175, f176}, [r13+2048]; +ld.shared.v2.f32 {f179, f180}, [r13+2560]; +ld.shared.v2.f32 {f183, f184}, [r13+3072]; +ld.shared.v2.f32 {f187, f188}, [r13+3584]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; 
+sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 56; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 
f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 3584; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+512]; +ld.shared.v2.f32 {f325, f326}, [r19+1024]; +ld.shared.v2.f32 {f329, f330}, [r19+1536]; +ld.shared.v2.f32 {f333, f334}, [r19+2048]; +ld.shared.v2.f32 {f337, f338}, [r19+2560]; +ld.shared.v2.f32 {f341, f342}, [r19+3072]; +ld.shared.v2.f32 {f345, f346}, [r19+3584]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; 
+sub.f32 f360, f350, f354; +add.f32 f361, f351, f356; +sub.f32 f362, f352, f355; +sub.f32 f363, f351, f356; +add.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +add.f32 f377, f367, f372; +sub.f32 f378, f368, f371; +sub.f32 f379, f367, f372; +add.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0fBF3504F3; +sub.f32 f383, f381, f382; +mul.f32 f384, f378, 0f3F3504F3; +fma.rn.f32 f385, f377, 0fBF3504F3, f384; +mul.f32 f386, f379, 0fBF3504F3; +mul.f32 f387, f380, 0fBF3504F3; +sub.f32 f388, f386, f387; +add.f32 f389, f386, f387; +add.f32 %1, f358, f374; +add.f32 %0, f357, f373; +add.f32 %3, f362, f385; +add.f32 %2, f361, f383; +sub.f32 %5, f360, f375; +add.f32 %4, f359, f376; +add.f32 %7, f364, f389; +add.f32 %6, f363, f388; +sub.f32 %9, f358, f374; +sub.f32 %8, f357, f373; +sub.f32 %11, f362, f385; +sub.f32 %10, f361, f383; +add.f32 %13, f360, f375; +sub.f32 %12, f359, f376; +sub.f32 %15, f364, f389; +sub.f32 %14, f363, f388; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<72, 
float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<374>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; 
+sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2016; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+256]; +ld.shared.f32 f161, [r13+512]; 
+ld.shared.f32 f162, [r13+768]; +ld.shared.f32 f163, [r13+1024]; +ld.shared.f32 f164, [r13+1280]; +ld.shared.f32 f165, [r13+1536]; +ld.shared.f32 f166, [r13+1792]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+256]; +ld.shared.f32 f169, [r13+512]; +ld.shared.f32 f170, [r13+768]; +ld.shared.f32 f171, [r13+1024]; +ld.shared.f32 f172, [r13+1280]; +ld.shared.f32 f173, [r13+1536]; +ld.shared.f32 f174, [r13+1792]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, 
f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 56; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; 
+mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 1792; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+256]; +ld.shared.f32 f303, [r20+512]; +ld.shared.f32 f304, [r20+768]; +ld.shared.f32 f305, [r20+1024]; +ld.shared.f32 f306, [r20+1280]; +ld.shared.f32 f307, [r20+1536]; +ld.shared.f32 f308, [r20+1792]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+256]; +ld.shared.f32 f311, [r20+512]; +ld.shared.f32 f312, [r20+768]; +ld.shared.f32 f313, [r20+1024]; +ld.shared.f32 f314, [r20+1280]; +ld.shared.f32 f315, [r20+1536]; +ld.shared.f32 f316, [r20+1792]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +add.f32 f329, f319, f324; +sub.f32 f330, f320, f323; +sub.f32 f331, f319, f324; +add.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 
f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +add.f32 f345, f335, f340; +sub.f32 f346, f336, f339; +sub.f32 f347, f335, f340; +add.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0fBF3504F3; +sub.f32 f351, f349, f350; +mul.f32 f352, f346, 0f3F3504F3; +fma.rn.f32 f353, f345, 0fBF3504F3, f352; +mul.f32 f354, f347, 0fBF3504F3; +mul.f32 f355, f348, 0fBF3504F3; +sub.f32 f356, f354, f355; +add.f32 f357, f354, f355; +add.f32 %0, f325, f341; +add.f32 %1, f326, f342; +add.f32 %3, f330, f353; +add.f32 %2, f329, f351; +sub.f32 %5, f328, f343; +add.f32 %4, f327, f344; +add.f32 %7, f332, f357; +add.f32 %6, f331, f356; +sub.f32 %8, f325, f341; +sub.f32 %9, f326, f342; +sub.f32 %11, f330, f353; +sub.f32 %10, f329, f351; +add.f32 %13, f328, f343; +sub.f32 %12, f327, f344; +sub.f32 %15, f332, f357; +sub.f32 %14, f331, f356; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<74, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1703>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1698, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, 
%82, %114; +sub.f32 f135, %82, %114; +add.f32 f1696, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1695, f1698, f1696; +sub.f32 f140, f1698, f1696; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1694, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1691, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1689, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1688, f1691, f1689; +sub.f32 f156, f1691, f1689; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1687, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1687, 0fBF3504F3; +mul.f32 f1686, f157, 0f3F3504F3; +sub.f32 f163, f1686, f162; +mul.f32 f164, f1687, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1685, f1695, f1688; +sub.f32 f173, f1695, f1688; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1684, f1694, f165; +sub.f32 f177, f1694, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1683, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1682, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1680, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1677, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1676, f1680, f1677; +sub.f32 f197, f1680, f1677; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1675, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1673, %79, %138; 
+sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1671, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1670, f1673, f1671; +sub.f32 f213, f1673, f1671; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1669, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1669, 0fBF3504F3; +mul.f32 f1668, f214, 0f3F3504F3; +sub.f32 f220, f1668, f219; +mul.f32 f221, f1669, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1667, f1676, f1670; +sub.f32 f230, f1676, f1670; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1666, f1675, f222; +sub.f32 f234, f1675, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1665, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1664, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1662, f231, 0f3F6C835E; +mul.f32 f1663, f1666, 0fBEC3EF15; +sub.f32 f245, f1662, f1663; +mul.f32 f246, f1666, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1660, f235, 0f3F3504F3; +mul.f32 f1661, f1665, 0fBF3504F3; +sub.f32 f250, f1660, f1661; +mul.f32 f251, f1665, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1658, f239, 0f3EC3EF15; +mul.f32 f1659, f1664, 0fBF6C835E; +sub.f32 f255, f1658, f1659; +mul.f32 f256, f1664, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1656, f233, 0fBEC3EF15; +mul.f32 f1657, f234, 0fBF6C835E; +sub.f32 f260, f1656, f1657; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1654, f241, 0fBF6C835E; +mul.f32 f1655, f242, 0fBEC3EF15; +sub.f32 f269, f1654, f1655; 
+mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1653, f1685, f1667; +sub.f32 f275, f1685, f1667; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1652, f1684, f247; +sub.f32 f279, f1684, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1651, f1683, f252; +sub.f32 f283, f1683, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1650, f1682, f257; +sub.f32 f287, f1682, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1649, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1648, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1647, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1646, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1643, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1641, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1640, f1643, f1641; +sub.f32 f315, f1643, f1641; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1639, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1637, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1634, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1633, f1637, f1634; +sub.f32 f331, f1637, f1634; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1632, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1630, f332, 0f3F3504F3; +mul.f32 f1631, f1632, 0fBF3504F3; +sub.f32 f338, f1630, f1631; +mul.f32 f339, f1632, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, 
f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1629, f1640, f1633; +sub.f32 f348, f1640, f1633; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1628, f1639, f340; +sub.f32 f352, f1639, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1627, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1626, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1624, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1622, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1621, f1624, f1622; +sub.f32 f372, f1624, f1622; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1620, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1617, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1616, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1615, f1617, f1616; +sub.f32 f388, f1617, f1616; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1614, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1612, f389, 0f3F3504F3; +mul.f32 f1613, f1614, 0fBF3504F3; +sub.f32 f395, f1612, f1613; +mul.f32 f396, f1614, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1611, f1621, f1615; +sub.f32 f405, f1621, f1615; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1610, f1620, f397; +sub.f32 f409, f1620, f397; +add.f32 f410, f371, f388; +sub.f32 f412, 
f371, f388; +sub.f32 f1609, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1608, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1610, 0fBEC3EF15; +mul.f32 f1607, f406, 0f3F6C835E; +sub.f32 f420, f1607, f419; +mul.f32 f421, f1610, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1609, 0fBF3504F3; +mul.f32 f1606, f410, 0f3F3504F3; +sub.f32 f425, f1606, f424; +mul.f32 f426, f1609, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1604, f414, 0f3EC3EF15; +mul.f32 f1605, f1608, 0fBF6C835E; +sub.f32 f430, f1604, f1605; +mul.f32 f431, f1608, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1602, f408, 0fBEC3EF15; +mul.f32 f1603, f409, 0fBF6C835E; +sub.f32 f435, f1602, f1603; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1601, f416, 0fBF6C835E; +sub.f32 f444, f1601, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1600, f1629, f1611; +sub.f32 f450, f1629, f1611; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1599, f1628, f422; +sub.f32 f454, f1628, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1598, f1627, f427; +sub.f32 f458, f1627, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1597, f1626, f432; +sub.f32 f462, f1626, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1596, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1595, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1594, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1593, 
f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1599, 0fBE47C5C2; +mul.f32 f1592, f451, 0f3F7B14BE; +sub.f32 f481, f1592, f480; +mul.f32 f482, f1599, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1598, 0fBEC3EF15; +mul.f32 f1591, f455, 0f3F6C835E; +sub.f32 f486, f1591, f485; +mul.f32 f487, f1598, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1597, 0fBF0E39DA; +mul.f32 f1590, f459, 0f3F54DB31; +sub.f32 f491, f1590, f490; +mul.f32 f492, f1597, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1596, 0fBF3504F3; +mul.f32 f1589, f463, 0f3F3504F3; +sub.f32 f496, f1589, f495; +mul.f32 f497, f1596, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1587, f467, 0f3F0E39DA; +mul.f32 f1588, f1595, 0fBF54DB31; +sub.f32 f501, f1587, f1588; +mul.f32 f502, f1595, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1585, f471, 0f3EC3EF15; +mul.f32 f1586, f1594, 0fBF6C835E; +sub.f32 f506, f1585, f1586; +mul.f32 f507, f1594, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1583, f475, 0f3E47C5C2; +mul.f32 f1584, f1593, 0fBF7B14BE; +sub.f32 f511, f1583, f1584; +mul.f32 f512, f1593, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1581, f453, 0fBE47C5C2; +mul.f32 f1582, f454, 0fBF7B14BE; +sub.f32 f516, f1581, f1582; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1580, f457, 0fBEC3EF15; +sub.f32 f521, f1580, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1579, f461, 0fBF0E39DA; +sub.f32 f526, f1579, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1577, f469, 0fBF54DB31; +mul.f32 f1578, f470, 0fBF0E39DA; +sub.f32 f535, f1577, f1578; +mul.f32 f536, f470, 
0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1576, f473, 0fBF6C835E; +sub.f32 f540, f1576, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1575, f477, 0fBF7B14BE; +sub.f32 f545, f1575, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1574, f1652, f483; +sub.f32 f553, f1652, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1573, f1651, f488; +sub.f32 f557, f1651, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1572, f1650, f493; +sub.f32 f561, f1650, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1571, f1649, f498; +sub.f32 f565, f1649, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f1570, f1648, f503; +sub.f32 f569, f1648, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f1569, f1647, f508; +sub.f32 f573, f1647, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f1568, f1646, f513; +sub.f32 f577, f1646, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f1567, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f1566, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f1565, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f1564, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f1563, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1562, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1561, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 
f1560, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f1574; +mul.f32 f616, f610, f1574; +mul.f32 f618, f611, f611; +mul.f32 f1559, f610, f610; +sub.f32 f619, f1559, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f1573; +mul.f32 f624, f619, f1573; +mul.f32 f626, f611, f621; +mul.f32 f1558, f610, f619; +sub.f32 f627, f1558, f626; +mul.f32 f1557, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f1572; +mul.f32 f632, f627, f1572; +mul.f32 f1555, f610, f627; +mul.f32 f1556, f611, f629; +sub.f32 f635, f1555, f1556; +mul.f32 f1554, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f1571; +mul.f32 f640, f635, f1571; +mul.f32 f642, f611, f637; +mul.f32 f1553, f610, f635; +sub.f32 f643, f1553, f642; +mul.f32 f1552, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f1570; +mul.f32 f648, f643, f1570; +mul.f32 f1550, f610, f643; +mul.f32 f1551, f611, f645; +sub.f32 f651, f1550, f1551; +mul.f32 f1549, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f1569; +mul.f32 f656, f651, f1569; +mul.f32 f658, f611, f653; +mul.f32 f1548, f610, f651; +sub.f32 f659, f1548, f658; +mul.f32 f1547, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f1568; +mul.f32 f664, f659, f1568; +mul.f32 f666, f611, f661; +mul.f32 f1546, f610, f659; +sub.f32 f667, f1546, f666; +mul.f32 f1545, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f1567; +mul.f32 f672, f667, f1567; +mul.f32 f1543, f610, f667; +mul.f32 f1544, f611, 
f669; +sub.f32 f675, f1543, f1544; +mul.f32 f1542, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f1566; +mul.f32 f680, f675, f1566; +mul.f32 f682, f611, f677; +mul.f32 f1541, f610, f675; +sub.f32 f683, f1541, f682; +mul.f32 f1540, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f1565; +mul.f32 f688, f683, f1565; +mul.f32 f690, f611, f685; +mul.f32 f1539, f610, f683; +sub.f32 f691, f1539, f690; +mul.f32 f1538, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f1564; +mul.f32 f696, f691, f1564; +mul.f32 f1536, f610, f691; +mul.f32 f1537, f611, f693; +sub.f32 f699, f1536, f1537; +mul.f32 f1535, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f1563; +mul.f32 f704, f699, f1563; +mul.f32 f706, f611, f701; +mul.f32 f1534, f610, f699; +sub.f32 f707, f1534, f706; +mul.f32 f1533, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f1562; +mul.f32 f712, f707, f1562; +mul.f32 f1531, f610, f707; +mul.f32 f1532, f611, f709; +sub.f32 f715, f1531, f1532; +mul.f32 f1530, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f1561; +mul.f32 f720, f715, f1561; +mul.f32 f722, f611, f717; +mul.f32 f1529, f610, f715; +sub.f32 f723, f1529, f722; +mul.f32 f1528, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f1560; +mul.f32 f728, f723, f1560; +mul.f32 f730, f611, f725; +mul.f32 f1527, f610, f723; +sub.f32 f731, f1527, f730; +mul.f32 f1526, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1525, f1653, f1600; +mul.f32 f735, f733, f1525; +mul.f32 f736, f731, f1525; +mul.f32 f1523, f610, f731; +mul.f32 f1524, f611, f733; +sub.f32 f739, f1523, f1524; +sub.f32 f1522, f272, f447; +mul.f32 f1521, f731, f1522; +mul.f32 f740, f610, f733; 
+fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1520, f610, f739; +sub.f32 f747, f1520, f746; +mul.f32 f1519, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1518, f610, f747; +sub.f32 f755, f1518, f754; +mul.f32 f1517, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f1515, f610, f755; +mul.f32 f1516, f611, f757; +sub.f32 f763, f1515, f1516; +mul.f32 f1514, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1513, f610, f763; +sub.f32 f771, f1513, f770; +mul.f32 f1512, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f1510, f610, f771; +mul.f32 f1511, f611, f773; +sub.f32 f779, f1510, f1511; +mul.f32 f1509, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1508, f610, f779; +sub.f32 f787, f1508, f786; +mul.f32 f1507, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1506, f610, f787; +sub.f32 f795, f1506, f794; +mul.f32 f1505, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f1503, f610, f795; +mul.f32 f1504, f611, f797; +sub.f32 f803, f1503, f1504; +mul.f32 f1502, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1501, f610, f803; +sub.f32 f811, f1501, 
f810; +mul.f32 f1500, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1499, f610, f811; +sub.f32 f819, f1499, f818; +mul.f32 f1498, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f1496, f610, f819; +mul.f32 f1497, f611, f821; +sub.f32 f827, f1496, f1497; +mul.f32 f1495, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1494, f610, f827; +sub.f32 f835, f1494, f834; +mul.f32 f1493, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f1491, f610, f835; +mul.f32 f1492, f611, f837; +sub.f32 f843, f1491, f1492; +mul.f32 f1490, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1489, f610, f843; +sub.f32 f851, f1489, f850; +mul.f32 f1488, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f1487, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r17, %tid.x; +shl.b32 r16, r17, 8; +barrier.sync 0; +and.b32 r11, r16, 3840; +add.s32 r12, r9, r11; +sub.f32 f1702, f1653, f1600; +mul.f32 f1701, f733, f1702; +add.f32 f857, f1653, f1600; +sub.f32 f1700, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 15; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f1488, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f1557, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f1554, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; 
+fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f1552, f639; +sub.f32 f867, f1549, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f1547, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f1545, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f1542, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f1540, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f1538, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f1535, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f1533, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f1530, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f1528, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f1526, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f1700, f736; +sub.f32 f890, f1521, f1701; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f1519, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f1517, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f1514, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f1512, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f1509, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f1507, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f1505, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f1502, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f1500, f807; +st.shared.v4.f32 [r12+192], {f906, f905, 
f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f1498, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f1495, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f1493, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f1490, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f1487, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +mad.lo.s32 r13, r20, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+128]; +ld.shared.v2.f32 {f929, f930}, [r13+256]; +ld.shared.v2.f32 {f933, f934}, [r13+384]; +ld.shared.v2.f32 {f937, f938}, [r13+512]; +ld.shared.v2.f32 {f941, f942}, [r13+640]; +ld.shared.v2.f32 {f945, f946}, [r13+768]; +ld.shared.v2.f32 {f949, f950}, [r13+896]; +ld.shared.v2.f32 {f953, f954}, [r13+1024]; +ld.shared.v2.f32 {f957, f958}, [r13+1152]; +ld.shared.v2.f32 {f961, f962}, [r13+1280]; +ld.shared.v2.f32 {f965, f966}, [r13+1408]; +ld.shared.v2.f32 {f969, f970}, [r13+1536]; +ld.shared.v2.f32 {f973, f974}, [r13+1664]; +ld.shared.v2.f32 {f977, f978}, [r13+1792]; +ld.shared.v2.f32 {f981, f982}, [r13+1920]; +ld.shared.v2.f32 {f985, f986}, [r13+2048]; +ld.shared.v2.f32 {f989, f990}, [r13+2176]; +ld.shared.v2.f32 {f993, f994}, [r13+2304]; +ld.shared.v2.f32 {f997, f998}, [r13+2432]; +ld.shared.v2.f32 {f1001, f1002}, [r13+2560]; +ld.shared.v2.f32 {f1005, f1006}, [r13+2688]; +ld.shared.v2.f32 {f1009, f1010}, [r13+2816]; +ld.shared.v2.f32 {f1013, f1014}, [r13+2944]; +ld.shared.v2.f32 {f1017, f1018}, [r13+3072]; +ld.shared.v2.f32 {f1021, f1022}, [r13+3200]; +ld.shared.v2.f32 {f1025, f1026}, [r13+3328]; +ld.shared.v2.f32 {f1029, f1030}, [r13+3456]; +ld.shared.v2.f32 {f1033, f1034}, [r13+3584]; +ld.shared.v2.f32 {f1037, f1038}, [r13+3712]; +ld.shared.v2.f32 {f1041, f1042}, [r13+3840]; 
+ld.shared.v2.f32 {f1045, f1046}, [r13+3968]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1486, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1485, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1484, f1486, f1485; +sub.f32 f1060, f1486, f1485; +add.f32 f1061, f1051, f1056; +sub.f32 f1063, f1051, f1056; +sub.f32 f1483, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f1482, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1481, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1480, f1482, f1481; +sub.f32 f1076, f1482, f1481; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f1479, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f1479, 0fBF3504F3; +mul.f32 f1478, f1077, 0f3F3504F3; +sub.f32 f1083, f1478, f1082; +mul.f32 f1084, f1479, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f1477, f1484, f1480; +sub.f32 f1093, f1484, f1480; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f1476, f1483, f1085; +sub.f32 f1097, f1483, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f1475, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f1474, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f1473, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f1472, f962, f1026; +sub.f32 f1113, f962, 
f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f1471, f1473, f1472; +sub.f32 f1117, f1473, f1472; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f1470, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f1469, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f1468, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f1467, f1469, f1468; +sub.f32 f1133, f1469, f1468; +add.f32 f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f1466, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f1466, 0fBF3504F3; +mul.f32 f1465, f1134, 0f3F3504F3; +sub.f32 f1140, f1465, f1139; +mul.f32 f1141, f1466, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f1464, f1471, f1467; +sub.f32 f1150, f1471, f1467; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f1463, f1470, f1142; +sub.f32 f1154, f1470, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f1462, f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f1461, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f1459, f1151, 0f3F6C835E; +mul.f32 f1460, f1463, 0fBEC3EF15; +sub.f32 f1165, f1459, f1460; +mul.f32 f1166, f1463, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f1457, f1155, 0f3F3504F3; +mul.f32 f1458, f1462, 0fBF3504F3; +sub.f32 f1170, f1457, f1458; +mul.f32 f1171, f1462, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f1461, 0fBF6C835E; +mul.f32 f1456, f1159, 0f3EC3EF15; +sub.f32 f1175, f1456, f1174; +mul.f32 f1176, f1461, 
0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f1455, f1153, 0fBEC3EF15; +sub.f32 f1180, f1455, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f1453, f1161, 0fBF6C835E; +mul.f32 f1454, f1162, 0fBEC3EF15; +sub.f32 f1189, f1453, f1454; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190; +add.f32 f1192, f925, f989; +sub.f32 f1194, f925, f989; +add.f32 f1452, f926, f990; +sub.f32 f1195, f926, f990; +add.f32 f1196, f957, f1021; +sub.f32 f1198, f957, f1021; +add.f32 f1451, f958, f1022; +sub.f32 f1199, f958, f1022; +add.f32 f1200, f1192, f1196; +sub.f32 f1202, f1192, f1196; +add.f32 f1450, f1452, f1451; +sub.f32 f1203, f1452, f1451; +add.f32 f1204, f1194, f1199; +sub.f32 f1206, f1194, f1199; +sub.f32 f1449, f1195, f1198; +add.f32 f1207, f1195, f1198; +add.f32 f1208, f941, f1005; +sub.f32 f1210, f941, f1005; +add.f32 f1448, f942, f1006; +sub.f32 f1211, f942, f1006; +add.f32 f1212, f973, f1037; +sub.f32 f1214, f973, f1037; +add.f32 f1447, f974, f1038; +sub.f32 f1215, f974, f1038; +add.f32 f1216, f1208, f1212; +sub.f32 f1218, f1208, f1212; +add.f32 f1446, f1448, f1447; +sub.f32 f1219, f1448, f1447; +add.f32 f1220, f1210, f1215; +sub.f32 f1222, f1210, f1215; +sub.f32 f1445, f1211, f1214; +add.f32 f1223, f1211, f1214; +mul.f32 f1225, f1445, 0fBF3504F3; +mul.f32 f1444, f1220, 0f3F3504F3; +sub.f32 f1226, f1444, f1225; +mul.f32 f1227, f1445, 0f3F3504F3; +fma.rn.f32 f1228, f1220, 0fBF3504F3, f1227; +mul.f32 f1229, f1222, 0fBF3504F3; +mul.f32 f1230, f1223, 0fBF3504F3; +sub.f32 f1231, f1229, f1230; +add.f32 f1232, f1229, f1230; +add.f32 f1233, f1200, f1216; +sub.f32 f1235, f1200, f1216; +add.f32 f1443, f1450, f1446; +sub.f32 f1236, f1450, f1446; +add.f32 f1237, f1204, f1226; +sub.f32 f1239, f1204, f1226; +add.f32 f1442, f1449, 
f1228; +sub.f32 f1240, f1449, f1228; +add.f32 f1241, f1202, f1219; +sub.f32 f1243, f1202, f1219; +sub.f32 f1441, f1203, f1218; +add.f32 f1244, f1203, f1218; +add.f32 f1245, f1206, f1231; +sub.f32 f1247, f1206, f1231; +add.f32 f1440, f1207, f1232; +sub.f32 f1248, f1207, f1232; +add.f32 f1249, f933, f997; +sub.f32 f1251, f933, f997; +add.f32 f1439, f934, f998; +sub.f32 f1252, f934, f998; +add.f32 f1253, f965, f1029; +sub.f32 f1255, f965, f1029; +add.f32 f1438, f966, f1030; +sub.f32 f1256, f966, f1030; +add.f32 f1257, f1249, f1253; +sub.f32 f1259, f1249, f1253; +add.f32 f1437, f1439, f1438; +sub.f32 f1260, f1439, f1438; +add.f32 f1261, f1251, f1256; +sub.f32 f1263, f1251, f1256; +sub.f32 f1436, f1252, f1255; +add.f32 f1264, f1252, f1255; +add.f32 f1265, f949, f1013; +sub.f32 f1267, f949, f1013; +add.f32 f1435, f950, f1014; +sub.f32 f1268, f950, f1014; +add.f32 f1269, f981, f1045; +sub.f32 f1271, f981, f1045; +add.f32 f1434, f982, f1046; +sub.f32 f1272, f982, f1046; +add.f32 f1273, f1265, f1269; +sub.f32 f1275, f1265, f1269; +add.f32 f1433, f1435, f1434; +sub.f32 f1276, f1435, f1434; +add.f32 f1277, f1267, f1272; +sub.f32 f1279, f1267, f1272; +sub.f32 f1432, f1268, f1271; +add.f32 f1280, f1268, f1271; +mul.f32 f1430, f1277, 0f3F3504F3; +mul.f32 f1431, f1432, 0fBF3504F3; +sub.f32 f1283, f1430, f1431; +mul.f32 f1284, f1432, 0f3F3504F3; +fma.rn.f32 f1285, f1277, 0fBF3504F3, f1284; +mul.f32 f1286, f1279, 0fBF3504F3; +mul.f32 f1287, f1280, 0fBF3504F3; +sub.f32 f1288, f1286, f1287; +add.f32 f1289, f1286, f1287; +add.f32 f1290, f1257, f1273; +sub.f32 f1292, f1257, f1273; +add.f32 f1429, f1437, f1433; +sub.f32 f1293, f1437, f1433; +add.f32 f1294, f1261, f1283; +sub.f32 f1296, f1261, f1283; +add.f32 f1428, f1436, f1285; +sub.f32 f1297, f1436, f1285; +add.f32 f1298, f1259, f1276; +sub.f32 f1300, f1259, f1276; +sub.f32 f1427, f1260, f1275; +add.f32 f1301, f1260, f1275; +add.f32 f1302, f1263, f1288; +sub.f32 f1304, f1263, f1288; +add.f32 f1426, f1264, f1289; +sub.f32 f1305, f1264, 
f1289; +mul.f32 f1307, f1428, 0fBEC3EF15; +mul.f32 f1425, f1294, 0f3F6C835E; +sub.f32 f1308, f1425, f1307; +mul.f32 f1309, f1428, 0f3F6C835E; +fma.rn.f32 f1310, f1294, 0fBEC3EF15, f1309; +mul.f32 f1312, f1427, 0fBF3504F3; +mul.f32 f1424, f1298, 0f3F3504F3; +sub.f32 f1313, f1424, f1312; +mul.f32 f1314, f1427, 0f3F3504F3; +fma.rn.f32 f1315, f1298, 0fBF3504F3, f1314; +mul.f32 f1317, f1426, 0fBF6C835E; +mul.f32 f1423, f1302, 0f3EC3EF15; +sub.f32 f1318, f1423, f1317; +mul.f32 f1319, f1426, 0f3EC3EF15; +fma.rn.f32 f1320, f1302, 0fBF6C835E, f1319; +mul.f32 f1322, f1297, 0fBF6C835E; +mul.f32 f1422, f1296, 0fBEC3EF15; +sub.f32 f1323, f1422, f1322; +mul.f32 f1324, f1297, 0fBEC3EF15; +fma.rn.f32 f1325, f1296, 0fBF6C835E, f1324; +mul.f32 f1326, f1300, 0fBF3504F3; +mul.f32 f1327, f1301, 0fBF3504F3; +sub.f32 f1328, f1326, f1327; +add.f32 f1329, f1326, f1327; +mul.f32 f1331, f1305, 0fBEC3EF15; +mul.f32 f1421, f1304, 0fBF6C835E; +sub.f32 f1332, f1421, f1331; +mul.f32 f1333, f1305, 0fBF6C835E; +fma.rn.f32 f1334, f1304, 0fBEC3EF15, f1333; +add.f32 %0, f1090, f1147; +add.f32 %1, f1477, f1464; +add.f32 %2, f1233, f1290; +add.f32 %3, f1443, f1429; +add.f32 %4, f1094, f1165; +add.f32 %5, f1476, f1167; +add.f32 %7, f1442, f1310; +add.f32 %6, f1237, f1308; +add.f32 %9, f1475, f1172; +add.f32 %8, f1098, f1170; +add.f32 %11, f1441, f1315; +add.f32 %10, f1241, f1313; +add.f32 %12, f1102, f1175; +add.f32 %13, f1474, f1177; +add.f32 %14, f1245, f1318; +add.f32 %15, f1440, f1320; +add.f32 %16, f1092, f1150; +sub.f32 %17, f1093, f1149; +sub.f32 %19, f1236, f1292; +add.f32 %18, f1235, f1293; +add.f32 %21, f1097, f1182; +add.f32 %20, f1096, f1180; +add.f32 %23, f1240, f1325; +add.f32 %22, f1239, f1323; +add.f32 %24, f1100, f1185; +add.f32 %25, f1101, f1186; +add.f32 %26, f1243, f1328; +add.f32 %27, f1244, f1329; +add.f32 %28, f1104, f1189; +add.f32 %29, f1105, f1191; +add.f32 %30, f1247, f1332; +add.f32 %31, f1248, f1334; +sub.f32 %33, f1477, f1464; +sub.f32 %32, f1090, f1147; +sub.f32 %35, f1443, 
f1429; +sub.f32 %34, f1233, f1290; +sub.f32 %37, f1476, f1167; +sub.f32 %36, f1094, f1165; +sub.f32 %39, f1442, f1310; +sub.f32 %38, f1237, f1308; +sub.f32 %41, f1475, f1172; +sub.f32 %40, f1098, f1170; +sub.f32 %43, f1441, f1315; +sub.f32 %42, f1241, f1313; +sub.f32 %45, f1474, f1177; +sub.f32 %44, f1102, f1175; +sub.f32 %47, f1440, f1320; +sub.f32 %46, f1245, f1318; +add.f32 %49, f1093, f1149; +sub.f32 %48, f1092, f1150; +add.f32 %51, f1236, f1292; +sub.f32 %50, f1235, f1293; +sub.f32 %53, f1097, f1182; +sub.f32 %52, f1096, f1180; +sub.f32 %55, f1240, f1325; +sub.f32 %54, f1239, f1323; +sub.f32 %57, f1101, f1186; +sub.f32 %56, f1100, f1185; +sub.f32 %59, f1244, f1329; +sub.f32 %58, f1243, f1328; +sub.f32 %61, f1105, f1191; +sub.f32 %60, f1104, f1189; +sub.f32 %63, f1248, f1334; +sub.f32 %62, f1247, f1332; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_512), 
"f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +/* Explicit specialization cufftdx_private_function<73, float, 1>. The whole body is a single asm volatile statement holding machine-generated-looking PTX; treat the instruction stream as opaque and do not hand-edit it. Parameters: rmem - complex register array, elements rmem[0]..rmem[15] (.x and .y) are read as inputs and overwritten with the results; smem - 32-bit base address used by the st.shared and ld.shared instructions, offset per thread by tid.y << 11 plus a tid.x-derived term. The PTX also reads one float pair from each of the tables lut_sp_16_512 (byte offset (tid.x * 8) & 248) and lut_sp_16_32 (entry selected by bit 4 of tid.x) via ld.global.v2.f32 and multiplies the intermediate values by successive powers of that pair. Between the arithmetic stages values are exchanged through shared memory twice, each time real parts first and imaginary parts second through the same addresses, separated by barrier.sync 0 (eight barriers in total), so presumably all threads that share barrier 0 must reach this call together - confirm with callers. The float literals 0f3F3504F3, 0f3F6C835E and 0f3EC3EF15 decode to about 0.70710677, 0.92387953 and 0.38268343; they look like 16-point butterfly twiddle factors - NOTE(review): confirm against the generator. NOTE(review): the asm statement declares no memory clobber even though it reads and writes shared memory - verify callers do not depend on compiler-visible ordering of smem accesses around this call. */template<> __forceinline__ __device__ void cufftdx_private_function<73, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<809>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 
f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, 
f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 
f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 
f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, 
f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1984; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+128]; +ld.shared.f32 f391, [r13+256]; +ld.shared.f32 f392, [r13+384]; +ld.shared.f32 f393, [r13+512]; +ld.shared.f32 f394, [r13+640]; +ld.shared.f32 f395, [r13+768]; +ld.shared.f32 f396, [r13+896]; +ld.shared.f32 f397, [r13+1024]; +ld.shared.f32 f398, [r13+1152]; +ld.shared.f32 f399, [r13+1280]; +ld.shared.f32 f400, [r13+1408]; +ld.shared.f32 f401, [r13+1536]; +ld.shared.f32 f402, [r13+1664]; +ld.shared.f32 f403, [r13+1792]; +ld.shared.f32 f404, [r13+1920]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; 
+st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+128]; +ld.shared.f32 f407, [r13+256]; +ld.shared.f32 f408, [r13+384]; +ld.shared.f32 f409, [r13+512]; +ld.shared.f32 f410, [r13+640]; +ld.shared.f32 f411, [r13+768]; +ld.shared.f32 f412, [r13+896]; +ld.shared.f32 f413, [r13+1024]; +ld.shared.f32 f414, [r13+1152]; +ld.shared.f32 f415, [r13+1280]; +ld.shared.f32 f416, [r13+1408]; +ld.shared.f32 f417, [r13+1536]; +ld.shared.f32 f418, [r13+1664]; +ld.shared.f32 f419, [r13+1792]; +ld.shared.f32 f420, [r13+1920]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, 
f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; 
+sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; +add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; +add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 16; +bfe.u32 r15, r5, 4, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; 
+fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; +mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 
f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; +mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; 
+barrier.sync 0; +and.b32 r19, r8, 1024; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+128]; +ld.shared.f32 f747, [r21+256]; +ld.shared.f32 f748, [r21+384]; +ld.shared.f32 f749, [r21+512]; +ld.shared.f32 f750, [r21+640]; +ld.shared.f32 f751, [r21+768]; +ld.shared.f32 f752, [r21+896]; +ld.shared.f32 f753, [r21+1024]; +ld.shared.f32 f754, [r21+1152]; +ld.shared.f32 f755, [r21+1280]; +ld.shared.f32 f756, [r21+1408]; +ld.shared.f32 f757, [r21+1536]; +ld.shared.f32 f758, [r21+1664]; +ld.shared.f32 f759, [r21+1792]; +ld.shared.f32 f760, [r21+1920]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+128]; +ld.shared.f32 f763, [r21+256]; +ld.shared.f32 f764, [r21+384]; +ld.shared.f32 f765, [r21+512]; +ld.shared.f32 f766, [r21+640]; +ld.shared.f32 f767, [r21+768]; +ld.shared.f32 f768, [r21+896]; +ld.shared.f32 f769, [r21+1024]; +ld.shared.f32 f770, [r21+1152]; 
+ld.shared.f32 f771, [r21+1280]; +ld.shared.f32 f772, [r21+1408]; +ld.shared.f32 f773, [r21+1536]; +ld.shared.f32 f774, [r21+1664]; +ld.shared.f32 f775, [r21+1792]; +ld.shared.f32 f776, [r21+1920]; +add.f32 %0, f745, f753; +add.f32 %1, f761, f769; +add.f32 %2, f746, f754; +add.f32 %3, f762, f770; +add.f32 %4, f747, f755; +add.f32 %5, f763, f771; +add.f32 %6, f748, f756; +add.f32 %7, f764, f772; +add.f32 %8, f749, f757; +add.f32 %9, f765, f773; +add.f32 %10, f750, f758; +add.f32 %11, f766, f774; +add.f32 %12, f751, f759; +add.f32 %13, f767, f775; +add.f32 %14, f752, f760; +add.f32 %15, f768, f776; +sub.f32 %16, f745, f753; +sub.f32 %17, f761, f769; +sub.f32 %18, f746, f754; +sub.f32 %19, f762, f770; +sub.f32 %20, f747, f755; +sub.f32 %21, f763, f771; +sub.f32 %22, f748, f756; +sub.f32 %23, f764, f772; +sub.f32 %24, f749, f757; +sub.f32 %25, f765, f773; +sub.f32 %26, f750, f758; +sub.f32 %27, f766, f774; +sub.f32 %28, f751, f759; +sub.f32 %29, f767, f775; +sub.f32 %30, f752, f760; +sub.f32 %31, f768, f776; +})" + /* Output operands %0..%31: rmem[0].x, rmem[0].y, ... rmem[15].x, rmem[15].y in that order. */: "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y)/* Input operands: %32 = smem, %33 = lut_sp_16_512, %34 = lut_sp_16_32, %35..%76 = components of rmem[0]..rmem[15]. Some .y components appear twice (for example rmem[1].y is both %38 and %39); the PTX refers to operands by position, so this list must not be reordered or deduplicated. */: "r"(smem), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), 
"f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<76, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1057>; +.reg .b32 r<35>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1046, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; +add.f32 f1044, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1043, f1046, f1044; +sub.f32 f76, f1046, f1044; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1042, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1039, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1037, %48, %71; +sub.f32 f88, %48, %71; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1036, f1039, f1037; +sub.f32 f92, f1039, f1037; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1035, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1035, 0fBF3504F3; +mul.f32 f1034, f93, 0f3F3504F3; +sub.f32 f99, f1034, f98; +mul.f32 f100, f1035, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1033, f1043, f1036; +sub.f32 f109, f1043, f1036; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1032, f1042, f101; +sub.f32 f113, f1042, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 
f1031, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f1030, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1028, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1025, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1024, f1028, f1025; +sub.f32 f133, f1028, f1025; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1023, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1021, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1019, %76, %66; +sub.f32 f145, %76, %66; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1018, f1021, f1019; +sub.f32 f149, f1021, f1019; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f1017, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1017, 0fBF3504F3; +mul.f32 f1016, f150, 0f3F3504F3; +sub.f32 f156, f1016, f155; +mul.f32 f157, f1017, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1015, f1024, f1018; +sub.f32 f166, f1024, f1018; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1014, f1023, f158; +sub.f32 f170, f1023, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1013, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1012, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1010, f167, 0f3F6C835E; +mul.f32 f1011, f1014, 0fBEC3EF15; +sub.f32 f181, f1010, f1011; +mul.f32 f182, f1014, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1008, f171, 0f3F3504F3; +mul.f32 f1009, f1013, 0fBF3504F3; +sub.f32 f186, f1008, 
f1009; +mul.f32 f187, f1013, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1006, f175, 0f3EC3EF15; +mul.f32 f1007, f1012, 0fBF6C835E; +sub.f32 f191, f1006, f1007; +mul.f32 f192, f1012, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1004, f169, 0fBEC3EF15; +mul.f32 f1005, f170, 0fBF6C835E; +sub.f32 f196, f1004, f1005; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1002, f177, 0fBF6C835E; +mul.f32 f1003, f178, 0fBEC3EF15; +sub.f32 f205, f1002, f1003; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1001, f1032, f183; +sub.f32 f213, f1032, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1000, f1031, f188; +sub.f32 f217, f1031, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f999, f1030, f193; +sub.f32 f221, f1030, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f998, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f997, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f996, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f995, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1001; +mul.f32 f244, f238, f1001; +mul.f32 f246, f239, f239; +mul.f32 f994, f238, f238; +sub.f32 f247, f994, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1000; +mul.f32 f252, f247, f1000; +mul.f32 f992, 
f238, f247; +mul.f32 f993, f239, f249; +sub.f32 f255, f992, f993; +mul.f32 f991, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f999; +mul.f32 f260, f255, f999; +mul.f32 f262, f239, f257; +mul.f32 f990, f238, f255; +sub.f32 f263, f990, f262; +mul.f32 f989, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f998; +mul.f32 f268, f263, f998; +mul.f32 f270, f239, f265; +mul.f32 f988, f238, f263; +sub.f32 f271, f988, f270; +mul.f32 f987, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f997; +mul.f32 f276, f271, f997; +mul.f32 f985, f238, f271; +mul.f32 f986, f239, f273; +sub.f32 f279, f985, f986; +mul.f32 f984, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f996; +mul.f32 f284, f279, f996; +mul.f32 f286, f239, f281; +mul.f32 f983, f238, f279; +sub.f32 f287, f983, f286; +mul.f32 f982, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f995; +mul.f32 f292, f287, f995; +mul.f32 f294, f239, f289; +mul.f32 f981, f238, f287; +sub.f32 f295, f981, f294; +mul.f32 f980, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f979, f1033, f1015; +mul.f32 f299, f297, f979; +mul.f32 f300, f295, f979; +mul.f32 f977, f238, f295; +mul.f32 f978, f239, f297; +sub.f32 f303, f977, f978; +sub.f32 f976, f106, f163; +mul.f32 f975, f295, f976; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f974, f238, f303; +sub.f32 f311, f974, f310; +mul.f32 f973, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f971, f238, f311; +mul.f32 f972, f239, f313; +sub.f32 f319, f971, f972; +mul.f32 f970, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, 
f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f969, f238, f319; +sub.f32 f327, f969, f326; +mul.f32 f968, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f967, f238, f327; +sub.f32 f335, f967, f334; +mul.f32 f966, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f964, f238, f335; +mul.f32 f965, f239, f337; +sub.f32 f343, f964, f965; +mul.f32 f963, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f962, f238, f343; +sub.f32 f351, f962, f350; +mul.f32 f961, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f960, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1054, f1033, f1015; +mul.f32 f1053, f297, f1054; +barrier.sync 0; +and.b32 r11, r7, 3968; +add.s32 r12, r9, r11; +add.f32 f357, f1033, f1015; +sub.f32 f1049, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f961, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f991, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f989, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f987, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f984, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f982, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f980, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1049, f300; +sub.f32 f374, f975, f1053; +fma.rn.f32 f375, f305, f212, 
f308; +sub.f32 f376, f973, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f970, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f968, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f966, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f963, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f960, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r21, r34, 31; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+256]; +ld.shared.v2.f32 {f397, f398}, [r13+512]; +ld.shared.v2.f32 {f401, f402}, [r13+768]; +ld.shared.v2.f32 {f405, f406}, [r13+1024]; +ld.shared.v2.f32 {f409, f410}, [r13+1280]; +ld.shared.v2.f32 {f413, f414}, [r13+1536]; +ld.shared.v2.f32 {f417, f418}, [r13+1792]; +ld.shared.v2.f32 {f421, f422}, [r13+2048]; +ld.shared.v2.f32 {f425, f426}, [r13+2304]; +ld.shared.v2.f32 {f429, f430}, [r13+2560]; +ld.shared.v2.f32 {f433, f434}, [r13+2816]; +ld.shared.v2.f32 {f437, f438}, [r13+3072]; +ld.shared.v2.f32 {f441, f442}, [r13+3328]; +ld.shared.v2.f32 {f445, f446}, [r13+3584]; +ld.shared.v2.f32 {f449, f450}, [r13+3840]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f959, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f958, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f957, f959, f958; +sub.f32 f464, f959, f958; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f956, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f955, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 
f954, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f953, f955, f954; +sub.f32 f480, f955, f954; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f952, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f950, f481, 0f3F3504F3; +mul.f32 f951, f952, 0fBF3504F3; +sub.f32 f487, f950, f951; +mul.f32 f488, f952, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f949, f957, f953; +sub.f32 f497, f957, f953; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f948, f956, f489; +sub.f32 f501, f956, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f947, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f946, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f945, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f944, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f943, f945, f944; +sub.f32 f521, f945, f944; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f942, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f941, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f940, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f939, f941, f940; +sub.f32 f537, f941, f940; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f938, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f936, f538, 0f3F3504F3; +mul.f32 f937, f938, 0fBF3504F3; +sub.f32 f544, f936, f937; +mul.f32 f545, f938, 0f3F3504F3; +fma.rn.f32 f546, f538, 
0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f935, f943, f939; +sub.f32 f554, f943, f939; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f934, f942, f546; +sub.f32 f558, f942, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f933, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f932, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f934, 0fBEC3EF15; +mul.f32 f931, f555, 0f3F6C835E; +sub.f32 f569, f931, f568; +mul.f32 f570, f934, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f933, 0fBF3504F3; +mul.f32 f930, f559, 0f3F3504F3; +sub.f32 f574, f930, f573; +mul.f32 f575, f933, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 f928, f563, 0f3EC3EF15; +mul.f32 f929, f932, 0fBF6C835E; +sub.f32 f579, f928, f929; +mul.f32 f580, f932, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f926, f557, 0fBEC3EF15; +mul.f32 f927, f558, 0fBF6C835E; +sub.f32 f584, f926, f927; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f925, f565, 0fBF6C835E; +sub.f32 f593, f925, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f924, f948, f571; +sub.f32 f601, f948, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f923, f947, f576; +sub.f32 f605, f947, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f922, f946, f581; +sub.f32 f609, f946, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, f554; +sub.f32 f921, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 
f616, f500, f584; +add.f32 f920, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f919, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f918, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 16; +bfe.u32 r15, r34, 4, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f924; +mul.f32 f632, f626, f924; +mul.f32 f634, f627, f627; +mul.f32 f917, f626, f626; +sub.f32 f635, f917, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f639, f637, f923; +mul.f32 f640, f635, f923; +mul.f32 f915, f626, f635; +mul.f32 f916, f627, f637; +sub.f32 f643, f915, f916; +mul.f32 f914, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f922; +mul.f32 f648, f643, f922; +mul.f32 f650, f627, f645; +mul.f32 f913, f626, f643; +sub.f32 f651, f913, f650; +mul.f32 f912, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f921; +mul.f32 f656, f651, f921; +mul.f32 f658, f627, f653; +mul.f32 f911, f626, f651; +sub.f32 f659, f911, f658; +mul.f32 f910, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f920; +mul.f32 f664, f659, f920; +mul.f32 f908, f626, f659; +mul.f32 f909, f627, f661; +sub.f32 f667, f908, f909; +mul.f32 f907, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f919; +mul.f32 f672, f667, f919; +mul.f32 f674, f627, f669; +mul.f32 f906, f626, f667; +sub.f32 f675, f906, f674; +mul.f32 f905, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f918; +mul.f32 f680, f675, f918; +mul.f32 f682, f627, f677; +mul.f32 f904, f626, f675; +sub.f32 f683, f904, f682; +mul.f32 f903, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; 
+sub.f32 f902, f949, f935; +mul.f32 f687, f685, f902; +mul.f32 f688, f683, f902; +sub.f32 f901, f494, f551; +mul.f32 f899, f626, f683; +mul.f32 f900, f627, f685; +sub.f32 f691, f899, f900; +mul.f32 f898, f683, f901; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f897, f626, f691; +sub.f32 f699, f897, f698; +mul.f32 f896, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f894, f626, f699; +mul.f32 f895, f627, f701; +sub.f32 f707, f894, f895; +mul.f32 f893, f699, f604; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f892, f626, f707; +sub.f32 f715, f892, f714; +mul.f32 f891, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f890, f626, f715; +sub.f32 f723, f890, f722; +mul.f32 f889, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f887, f626, f723; +mul.f32 f888, f627, f725; +sub.f32 f731, f887, f888; +mul.f32 f886, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f885, f626, f731; +sub.f32 f739, f885, f738; +mul.f32 f884, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f883, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 3; +and.b32 r16, r23, 120; +add.s32 r17, r9, r16; +mov.u32 r26, %tid.x; +shl.b32 r25, r26, 7; +barrier.sync 0; +and.b32 r18, r25, 2048; +add.s32 r19, r17, r18; +mov.u32 r28, %tid.x; +and.b32 r27, r28, 16; +add.f32 f745, 
f949, f935; +sub.f32 f1052, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 16; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f884, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f914, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f912, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f910, f655; +st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f907, f663; +fma.rn.f32 f756, f661, f614, f664; +st.shared.v2.f32 [r19+640], {f755, f756}; +fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f905, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f903, f679; +st.shared.v2.f32 [r19+896], {f760, f759}; +fma.rn.f32 f761, f685, f1052, f688; +sub.f32 f762, f898, f687; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f896, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f893, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f891, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f889, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f886, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f883, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +mad.lo.s32 r20, r30, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+256]; +ld.shared.v2.f32 {f785, f786}, [r20+512]; +ld.shared.v2.f32 {f789, f790}, [r20+768]; +ld.shared.v2.f32 {f793, f794}, 
[r20+1024]; +ld.shared.v2.f32 {f797, f798}, [r20+1280]; +ld.shared.v2.f32 {f801, f802}, [r20+1536]; +ld.shared.v2.f32 {f805, f806}, [r20+1792]; +ld.shared.v2.f32 {f809, f810}, [r20+2048]; +ld.shared.v2.f32 {f813, f814}, [r20+2304]; +ld.shared.v2.f32 {f817, f818}, [r20+2560]; +ld.shared.v2.f32 {f821, f822}, [r20+2816]; +ld.shared.v2.f32 {f825, f826}, [r20+3072]; +ld.shared.v2.f32 {f829, f830}, [r20+3328]; +ld.shared.v2.f32 {f833, f834}, [r20+3584]; +ld.shared.v2.f32 {f837, f838}, [r20+3840]; +add.f32 %0, f777, f809; +add.f32 %1, f778, f810; +add.f32 %2, f781, f813; +add.f32 %3, f782, f814; +add.f32 %5, f786, f818; +add.f32 %4, f785, f817; +add.f32 %7, f790, f822; +add.f32 %6, f789, f821; +add.f32 %8, f793, f825; +add.f32 %9, f794, f826; +add.f32 %10, f797, f829; +add.f32 %11, f798, f830; +add.f32 %12, f801, f833; +add.f32 %13, f802, f834; +add.f32 %14, f805, f837; +add.f32 %15, f806, f838; +sub.f32 %17, f778, f810; +sub.f32 %16, f777, f809; +sub.f32 %19, f782, f814; +sub.f32 %18, f781, f813; +sub.f32 %21, f786, f818; +sub.f32 %20, f785, f817; +sub.f32 %23, f790, f822; +sub.f32 %22, f789, f821; +sub.f32 %25, f794, f826; +sub.f32 %24, f793, f825; +sub.f32 %27, f798, f830; +sub.f32 %26, f797, f829; +sub.f32 %29, f802, f834; +sub.f32 %28, f801, f833; +sub.f32 %31, f806, f838; +sub.f32 %30, f805, f837; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), 
"f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<75, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1652>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1650, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1648, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1647, f1650, f1648; +sub.f32 f140, f1650, f1648; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1646, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1643, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1641, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1640, f1643, f1641; +sub.f32 f156, f1643, f1641; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1639, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1639, 0fBF3504F3; +mul.f32 f1638, f157, 0f3F3504F3; +sub.f32 f163, f1638, f162; +mul.f32 f164, f1639, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; 
+mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1637, f1647, f1640; +sub.f32 f173, f1647, f1640; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1636, f1646, f165; +sub.f32 f177, f1646, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1635, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1634, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1632, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1629, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1628, f1632, f1629; +sub.f32 f197, f1632, f1629; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1627, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1625, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1623, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1622, f1625, f1623; +sub.f32 f213, f1625, f1623; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1621, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1621, 0fBF3504F3; +mul.f32 f1620, f214, 0f3F3504F3; +sub.f32 f220, f1620, f219; +mul.f32 f221, f1621, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1619, f1628, f1622; +sub.f32 f230, f1628, f1622; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1618, f1627, f222; +sub.f32 f234, f1627, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1617, f197, f212; 
+add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1616, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1614, f231, 0f3F6C835E; +mul.f32 f1615, f1618, 0fBEC3EF15; +sub.f32 f245, f1614, f1615; +mul.f32 f246, f1618, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1612, f235, 0f3F3504F3; +mul.f32 f1613, f1617, 0fBF3504F3; +sub.f32 f250, f1612, f1613; +mul.f32 f251, f1617, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1610, f239, 0f3EC3EF15; +mul.f32 f1611, f1616, 0fBF6C835E; +sub.f32 f255, f1610, f1611; +mul.f32 f256, f1616, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1608, f233, 0fBEC3EF15; +mul.f32 f1609, f234, 0fBF6C835E; +sub.f32 f260, f1608, f1609; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1606, f241, 0fBF6C835E; +mul.f32 f1607, f242, 0fBEC3EF15; +sub.f32 f269, f1606, f1607; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1605, f1637, f1619; +sub.f32 f275, f1637, f1619; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1604, f1636, f247; +sub.f32 f279, f1636, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1603, f1635, f252; +sub.f32 f283, f1635, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1602, f1634, f257; +sub.f32 f287, f1634, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1601, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1600, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1599, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1598, f185, f271; +sub.f32 f303, f185, 
f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1595, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1593, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1592, f1595, f1593; +sub.f32 f315, f1595, f1593; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1591, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1589, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1586, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1585, f1589, f1586; +sub.f32 f331, f1589, f1586; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1584, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1582, f332, 0f3F3504F3; +mul.f32 f1583, f1584, 0fBF3504F3; +sub.f32 f338, f1582, f1583; +mul.f32 f339, f1584, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1581, f1592, f1585; +sub.f32 f348, f1592, f1585; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1580, f1591, f340; +sub.f32 f352, f1591, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1579, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1578, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1576, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1574, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1573, f1576, f1574; +sub.f32 f372, f1576, f1574; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 
f1572, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1569, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1568, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1567, f1569, f1568; +sub.f32 f388, f1569, f1568; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1566, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1564, f389, 0f3F3504F3; +mul.f32 f1565, f1566, 0fBF3504F3; +sub.f32 f395, f1564, f1565; +mul.f32 f396, f1566, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1563, f1573, f1567; +sub.f32 f405, f1573, f1567; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1562, f1572, f397; +sub.f32 f409, f1572, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1561, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1560, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1562, 0fBEC3EF15; +mul.f32 f1559, f406, 0f3F6C835E; +sub.f32 f420, f1559, f419; +mul.f32 f421, f1562, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1561, 0fBF3504F3; +mul.f32 f1558, f410, 0f3F3504F3; +sub.f32 f425, f1558, f424; +mul.f32 f426, f1561, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1556, f414, 0f3EC3EF15; +mul.f32 f1557, f1560, 0fBF6C835E; +sub.f32 f430, f1556, f1557; +mul.f32 f431, f1560, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1554, f408, 0fBEC3EF15; +mul.f32 f1555, f409, 0fBF6C835E; +sub.f32 f435, f1554, f1555; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; 
+add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1553, f416, 0fBF6C835E; +sub.f32 f444, f1553, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1552, f1581, f1563; +sub.f32 f450, f1581, f1563; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1551, f1580, f422; +sub.f32 f454, f1580, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1550, f1579, f427; +sub.f32 f458, f1579, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1549, f1578, f432; +sub.f32 f462, f1578, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1548, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1547, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1546, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1545, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1551, 0fBE47C5C2; +mul.f32 f1544, f451, 0f3F7B14BE; +sub.f32 f481, f1544, f480; +mul.f32 f482, f1551, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1550, 0fBEC3EF15; +mul.f32 f1543, f455, 0f3F6C835E; +sub.f32 f486, f1543, f485; +mul.f32 f487, f1550, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1549, 0fBF0E39DA; +mul.f32 f1542, f459, 0f3F54DB31; +sub.f32 f491, f1542, f490; +mul.f32 f492, f1549, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1548, 0fBF3504F3; +mul.f32 f1541, f463, 0f3F3504F3; +sub.f32 f496, f1541, f495; +mul.f32 f497, f1548, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1539, f467, 0f3F0E39DA; +mul.f32 f1540, f1547, 0fBF54DB31; +sub.f32 f501, f1539, f1540; +mul.f32 f502, f1547, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1537, f471, 0f3EC3EF15; +mul.f32 f1538, f1546, 
0fBF6C835E; +sub.f32 f506, f1537, f1538; +mul.f32 f507, f1546, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1535, f475, 0f3E47C5C2; +mul.f32 f1536, f1545, 0fBF7B14BE; +sub.f32 f511, f1535, f1536; +mul.f32 f512, f1545, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1533, f453, 0fBE47C5C2; +mul.f32 f1534, f454, 0fBF7B14BE; +sub.f32 f516, f1533, f1534; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1532, f457, 0fBEC3EF15; +sub.f32 f521, f1532, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1531, f461, 0fBF0E39DA; +sub.f32 f526, f1531, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1529, f469, 0fBF54DB31; +mul.f32 f1530, f470, 0fBF0E39DA; +sub.f32 f535, f1529, f1530; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1528, f473, 0fBF6C835E; +sub.f32 f540, f1528, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1527, f477, 0fBF7B14BE; +sub.f32 f545, f1527, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1526, f1605, f1552; +sub.f32 f551, f1605, f1552; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1525, f1604, f483; +sub.f32 f555, f1604, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1524, f1603, f488; +sub.f32 f559, f1603, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1523, f1602, f493; +sub.f32 f563, f1602, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1522, f1601, f498; +sub.f32 f567, f1601, f498; 
+add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f1521, f1600, f503; +sub.f32 f571, f1600, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f1520, f1599, f508; +sub.f32 f575, f1599, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f1519, f1598, f513; +sub.f32 f579, f1598, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f1518, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f1517, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f1516, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f1515, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f1514, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1513, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1512, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1511, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f1525; +mul.f32 f1510, f612, f552; +sub.f32 f618, f1510, f617; +mul.f32 f619, f612, f1525; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f1508, f612, f612; +mul.f32 f1509, f613, f613; +sub.f32 f623, f1508, f1509; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f1506, f623, f556; +mul.f32 f1507, f625, f1524; +sub.f32 f628, f1506, f1507; +mul.f32 f629, f623, f1524; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f1504, f612, f623; +mul.f32 f1505, f613, f625; +sub.f32 f633, f1504, f1505; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f1502, f633, f560; 
+mul.f32 f1503, f635, f1523; +sub.f32 f638, f1502, f1503; +mul.f32 f639, f633, f1523; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f1501, f612, f633; +sub.f32 f643, f1501, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f1522; +mul.f32 f1500, f643, f564; +sub.f32 f648, f1500, f647; +mul.f32 f649, f643, f1522; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f1499, f612, f643; +sub.f32 f653, f1499, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f1521; +mul.f32 f1498, f653, f568; +sub.f32 f658, f1498, f657; +mul.f32 f659, f653, f1521; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f1497, f612, f653; +sub.f32 f663, f1497, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f1495, f663, f572; +mul.f32 f1496, f665, f1520; +sub.f32 f668, f1495, f1496; +mul.f32 f669, f663, f1520; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f1493, f612, f663; +mul.f32 f1494, f613, f665; +sub.f32 f673, f1493, f1494; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f1491, f673, f576; +mul.f32 f1492, f675, f1519; +sub.f32 f678, f1491, f1492; +mul.f32 f679, f673, f1519; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f1489, f612, f673; +mul.f32 f1490, f613, f675; +sub.f32 f683, f1489, f1490; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f1518; +mul.f32 f1488, f683, f580; +sub.f32 f688, f1488, f687; +mul.f32 f689, f683, f1518; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f1487, f612, f683; +sub.f32 f693, f1487, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f1517; +mul.f32 f1486, f693, f584; +sub.f32 f698, f1486, f697; +mul.f32 f699, f693, f1517; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f1485, f612, f693; +sub.f32 f703, f1485, f702; +mul.f32 f704, f612, 
f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f1516; +mul.f32 f1484, f703, f588; +sub.f32 f708, f1484, f707; +mul.f32 f709, f703, f1516; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f1482, f612, f703; +mul.f32 f1483, f613, f705; +sub.f32 f713, f1482, f1483; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f1480, f713, f592; +mul.f32 f1481, f715, f1515; +sub.f32 f718, f1480, f1481; +mul.f32 f719, f713, f1515; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f1478, f612, f713; +mul.f32 f1479, f613, f715; +sub.f32 f723, f1478, f1479; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f1476, f723, f596; +mul.f32 f1477, f725, f1514; +sub.f32 f728, f1476, f1477; +mul.f32 f729, f723, f1514; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f1475, f612, f723; +sub.f32 f733, f1475, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f1513; +mul.f32 f1474, f733, f600; +sub.f32 f738, f1474, f737; +mul.f32 f739, f733, f1513; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f1473, f612, f733; +sub.f32 f743, f1473, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f1512; +mul.f32 f1472, f743, f604; +sub.f32 f748, f1472, f747; +mul.f32 f749, f743, f1512; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f1471, f612, f743; +sub.f32 f753, f1471, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f1511; +mul.f32 f1470, f753, f608; +sub.f32 f758, f1470, f757; +mul.f32 f759, f753, f1511; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f1468, f612, f753; +mul.f32 f1469, f613, f755; +sub.f32 f763, f1468, f1469; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f1466, f763, f550; +mul.f32 f1467, f765, f551; +sub.f32 f768, f1466, f1467; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f1464, f612, f763; 
+mul.f32 f1465, f613, f765; +sub.f32 f773, f1464, f1465; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f1463, f773, f554; +sub.f32 f778, f1463, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f1462, f612, f773; +sub.f32 f783, f1462, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f1461, f783, f558; +sub.f32 f788, f1461, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f1460, f612, f783; +sub.f32 f793, f1460, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f1459, f793, f562; +sub.f32 f798, f1459, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f1458, f612, f793; +sub.f32 f803, f1458, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f1456, f803, f566; +mul.f32 f1457, f805, f567; +sub.f32 f808, f1456, f1457; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f1454, f612, f803; +mul.f32 f1455, f613, f805; +sub.f32 f813, f1454, f1455; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f1452, f813, f570; +mul.f32 f1453, f815, f571; +sub.f32 f818, f1452, f1453; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f1450, f612, f813; +mul.f32 f1451, f613, f815; +sub.f32 f823, f1450, f1451; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f1449, f823, f574; +sub.f32 f828, f1449, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f1448, f612, f823; +sub.f32 f833, f1448, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f1447, f833, f578; +sub.f32 f838, f1447, f837; +mul.f32 f839, f833, f579; 
+fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f1446, f612, f833; +sub.f32 f843, f1446, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f1445, f843, f582; +sub.f32 f848, f1445, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f1443, f612, f843; +mul.f32 f1444, f613, f845; +sub.f32 f853, f1443, f1444; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f1441, f853, f586; +mul.f32 f1442, f855, f587; +sub.f32 f858, f1441, f1442; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f1439, f612, f853; +mul.f32 f1440, f613, f855; +sub.f32 f863, f1439, f1440; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f1437, f863, f590; +mul.f32 f1438, f865, f591; +sub.f32 f868, f1437, f1438; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f1436, f612, f863; +sub.f32 f873, f1436, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f1435, f873, f594; +sub.f32 f878, f1435, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f1434, f612, f873; +sub.f32 f883, f1434, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f1433, f883, f598; +sub.f32 f888, f1433, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f1432, f612, f883; +sub.f32 f893, f1432, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f1430, f893, f602; +mul.f32 f1431, f895, f603; +sub.f32 f898, f1430, f1431; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f1428, f612, f893; +mul.f32 f1429, f613, f895; +sub.f32 f903, f1428, f1429; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f1426, f903, f606; +mul.f32 f1427, f905, 
f607; +sub.f32 f908, f1426, f1427; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f1424, f612, f903; +mul.f32 f1425, f613, f905; +sub.f32 f913, f1424, f1425; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f1423, f913, f610; +sub.f32 f918, f1423, f917; +mov.u32 r17, %tid.x; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +and.b32 r14, r17, 15; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1920; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+64]; +ld.shared.f32 f923, [r13+128]; +ld.shared.f32 f924, [r13+192]; +ld.shared.f32 f925, [r13+256]; +ld.shared.f32 f926, [r13+320]; +ld.shared.f32 f927, [r13+384]; +ld.shared.f32 f928, [r13+448]; +ld.shared.f32 f929, [r13+512]; +ld.shared.f32 f930, [r13+576]; +ld.shared.f32 f931, [r13+640]; +ld.shared.f32 f932, [r13+704]; +ld.shared.f32 f933, [r13+768]; +ld.shared.f32 f934, [r13+832]; +ld.shared.f32 f935, [r13+896]; +ld.shared.f32 f936, [r13+960]; +ld.shared.f32 f937, [r13+1024]; +ld.shared.f32 f938, [r13+1088]; +ld.shared.f32 f939, [r13+1152]; +ld.shared.f32 f940, [r13+1216]; +ld.shared.f32 f941, [r13+1280]; +ld.shared.f32 f942, [r13+1344]; +ld.shared.f32 f943, [r13+1408]; +ld.shared.f32 f944, [r13+1472]; +ld.shared.f32 f945, [r13+1536]; +ld.shared.f32 f946, [r13+1600]; +ld.shared.f32 f947, [r13+1664]; +ld.shared.f32 f948, [r13+1728]; +ld.shared.f32 f949, [r13+1792]; +ld.shared.f32 
f950, [r13+1856]; +ld.shared.f32 f951, [r13+1920]; +ld.shared.f32 f952, [r13+1984]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1526, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+64]; +ld.shared.f32 f955, [r13+128]; +ld.shared.f32 f956, [r13+192]; +ld.shared.f32 f957, [r13+256]; +ld.shared.f32 f958, [r13+320]; +ld.shared.f32 f959, [r13+384]; +ld.shared.f32 f960, [r13+448]; +ld.shared.f32 f961, [r13+512]; +ld.shared.f32 f962, [r13+576]; +ld.shared.f32 f963, [r13+640]; +ld.shared.f32 f964, [r13+704]; +ld.shared.f32 f965, [r13+768]; +ld.shared.f32 f966, [r13+832]; +ld.shared.f32 f967, [r13+896]; +ld.shared.f32 f968, [r13+960]; +ld.shared.f32 f969, [r13+1024]; +ld.shared.f32 f970, [r13+1088]; +ld.shared.f32 f971, [r13+1152]; +ld.shared.f32 f972, [r13+1216]; +ld.shared.f32 f973, [r13+1280]; +ld.shared.f32 f974, [r13+1344]; +ld.shared.f32 f975, [r13+1408]; +ld.shared.f32 f976, [r13+1472]; +ld.shared.f32 f977, [r13+1536]; +ld.shared.f32 f978, [r13+1600]; +ld.shared.f32 f979, [r13+1664]; +ld.shared.f32 f980, [r13+1728]; +ld.shared.f32 f981, [r13+1792]; +ld.shared.f32 f982, [r13+1856]; +ld.shared.f32 f983, [r13+1920]; +ld.shared.f32 f984, [r13+1984]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1422, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1421, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f1420, f1422, f1421; +sub.f32 f996, f1422, f1421; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f1419, f988, f991; 
+add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1418, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f1417, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1416, f1418, f1417; +sub.f32 f1012, f1418, f1417; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f1415, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f1415, 0fBF3504F3; +mul.f32 f1414, f1013, 0f3F3504F3; +sub.f32 f1019, f1414, f1018; +mul.f32 f1020, f1415, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f1413, f1420, f1416; +sub.f32 f1029, f1420, f1416; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f1412, f1419, f1021; +sub.f32 f1033, f1419, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f1411, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f1410, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f1409, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f1408, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1407, f1409, f1408; +sub.f32 f1053, f1409, f1408; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f1406, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f1405, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f1404, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 
f1068, f1058, f1062; +add.f32 f1403, f1405, f1404; +sub.f32 f1069, f1405, f1404; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f1402, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f1402, 0fBF3504F3; +mul.f32 f1401, f1070, 0f3F3504F3; +sub.f32 f1076, f1401, f1075; +mul.f32 f1077, f1402, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f1400, f1407, f1403; +sub.f32 f1086, f1407, f1403; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f1399, f1406, f1078; +sub.f32 f1090, f1406, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f1398, f1053, f1068; +add.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f1397, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f1395, f1087, 0f3F6C835E; +mul.f32 f1396, f1399, 0fBEC3EF15; +sub.f32 f1101, f1395, f1396; +mul.f32 f1102, f1399, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f1393, f1091, 0f3F3504F3; +mul.f32 f1394, f1398, 0fBF3504F3; +sub.f32 f1106, f1393, f1394; +mul.f32 f1107, f1398, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f1397, 0fBF6C835E; +mul.f32 f1392, f1095, 0f3EC3EF15; +sub.f32 f1111, f1392, f1110; +mul.f32 f1112, f1397, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f1391, f1089, 0fBEC3EF15; +sub.f32 f1116, f1391, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f1389, f1097, 0fBF6C835E; +mul.f32 f1390, f1098, 0fBEC3EF15; +sub.f32 f1125, f1389, f1390; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, 
f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f922, f938; +sub.f32 f1130, f922, f938; +add.f32 f1388, f954, f970; +sub.f32 f1131, f954, f970; +add.f32 f1132, f930, f946; +sub.f32 f1134, f930, f946; +add.f32 f1387, f962, f978; +sub.f32 f1135, f962, f978; +add.f32 f1136, f1128, f1132; +sub.f32 f1138, f1128, f1132; +add.f32 f1386, f1388, f1387; +sub.f32 f1139, f1388, f1387; +add.f32 f1140, f1130, f1135; +sub.f32 f1142, f1130, f1135; +sub.f32 f1385, f1131, f1134; +add.f32 f1143, f1131, f1134; +add.f32 f1144, f926, f942; +sub.f32 f1146, f926, f942; +add.f32 f1384, f958, f974; +sub.f32 f1147, f958, f974; +add.f32 f1148, f934, f950; +sub.f32 f1150, f934, f950; +add.f32 f1383, f966, f982; +sub.f32 f1151, f966, f982; +add.f32 f1152, f1144, f1148; +sub.f32 f1154, f1144, f1148; +add.f32 f1382, f1384, f1383; +sub.f32 f1155, f1384, f1383; +add.f32 f1156, f1146, f1151; +sub.f32 f1158, f1146, f1151; +sub.f32 f1381, f1147, f1150; +add.f32 f1159, f1147, f1150; +mul.f32 f1161, f1381, 0fBF3504F3; +mul.f32 f1380, f1156, 0f3F3504F3; +sub.f32 f1162, f1380, f1161; +mul.f32 f1163, f1381, 0f3F3504F3; +fma.rn.f32 f1164, f1156, 0fBF3504F3, f1163; +mul.f32 f1165, f1158, 0fBF3504F3; +mul.f32 f1166, f1159, 0fBF3504F3; +sub.f32 f1167, f1165, f1166; +add.f32 f1168, f1165, f1166; +add.f32 f1169, f1136, f1152; +sub.f32 f1171, f1136, f1152; +add.f32 f1379, f1386, f1382; +sub.f32 f1172, f1386, f1382; +add.f32 f1173, f1140, f1162; +sub.f32 f1175, f1140, f1162; +add.f32 f1378, f1385, f1164; +sub.f32 f1176, f1385, f1164; +add.f32 f1177, f1138, f1155; +sub.f32 f1179, f1138, f1155; +sub.f32 f1377, f1139, f1154; +add.f32 f1180, f1139, f1154; +add.f32 f1181, f1142, f1167; +sub.f32 f1183, f1142, f1167; +add.f32 f1376, f1143, f1168; +sub.f32 f1184, f1143, f1168; +add.f32 f1185, f924, f940; +sub.f32 f1187, f924, f940; +add.f32 f1375, f956, f972; +sub.f32 f1188, f956, f972; +add.f32 f1189, f932, f948; +sub.f32 f1191, f932, f948; +add.f32 f1374, f964, f980; +sub.f32 f1192, f964, f980; +add.f32 f1193, f1185, f1189; 
+sub.f32 f1195, f1185, f1189; +add.f32 f1373, f1375, f1374; +sub.f32 f1196, f1375, f1374; +add.f32 f1197, f1187, f1192; +sub.f32 f1199, f1187, f1192; +sub.f32 f1372, f1188, f1191; +add.f32 f1200, f1188, f1191; +add.f32 f1201, f928, f944; +sub.f32 f1203, f928, f944; +add.f32 f1371, f960, f976; +sub.f32 f1204, f960, f976; +add.f32 f1205, f936, f952; +sub.f32 f1207, f936, f952; +add.f32 f1370, f968, f984; +sub.f32 f1208, f968, f984; +add.f32 f1209, f1201, f1205; +sub.f32 f1211, f1201, f1205; +add.f32 f1369, f1371, f1370; +sub.f32 f1212, f1371, f1370; +add.f32 f1213, f1203, f1208; +sub.f32 f1215, f1203, f1208; +sub.f32 f1368, f1204, f1207; +add.f32 f1216, f1204, f1207; +mul.f32 f1366, f1213, 0f3F3504F3; +mul.f32 f1367, f1368, 0fBF3504F3; +sub.f32 f1219, f1366, f1367; +mul.f32 f1220, f1368, 0f3F3504F3; +fma.rn.f32 f1221, f1213, 0fBF3504F3, f1220; +mul.f32 f1222, f1215, 0fBF3504F3; +mul.f32 f1223, f1216, 0fBF3504F3; +sub.f32 f1224, f1222, f1223; +add.f32 f1225, f1222, f1223; +add.f32 f1226, f1193, f1209; +sub.f32 f1228, f1193, f1209; +add.f32 f1365, f1373, f1369; +sub.f32 f1229, f1373, f1369; +add.f32 f1230, f1197, f1219; +sub.f32 f1232, f1197, f1219; +add.f32 f1364, f1372, f1221; +sub.f32 f1233, f1372, f1221; +add.f32 f1234, f1195, f1212; +sub.f32 f1236, f1195, f1212; +sub.f32 f1363, f1196, f1211; +add.f32 f1237, f1196, f1211; +add.f32 f1238, f1199, f1224; +sub.f32 f1240, f1199, f1224; +add.f32 f1362, f1200, f1225; +sub.f32 f1241, f1200, f1225; +mul.f32 f1243, f1364, 0fBEC3EF15; +mul.f32 f1361, f1230, 0f3F6C835E; +sub.f32 f1244, f1361, f1243; +mul.f32 f1245, f1364, 0f3F6C835E; +fma.rn.f32 f1246, f1230, 0fBEC3EF15, f1245; +mul.f32 f1248, f1363, 0fBF3504F3; +mul.f32 f1360, f1234, 0f3F3504F3; +sub.f32 f1249, f1360, f1248; +mul.f32 f1250, f1363, 0f3F3504F3; +fma.rn.f32 f1251, f1234, 0fBF3504F3, f1250; +mul.f32 f1253, f1362, 0fBF6C835E; +mul.f32 f1359, f1238, 0f3EC3EF15; +sub.f32 f1254, f1359, f1253; +mul.f32 f1255, f1362, 0f3EC3EF15; +fma.rn.f32 f1256, f1238, 0fBF6C835E, 
f1255; +mul.f32 f1258, f1233, 0fBF6C835E; +mul.f32 f1358, f1232, 0fBEC3EF15; +sub.f32 f1259, f1358, f1258; +mul.f32 f1260, f1233, 0fBEC3EF15; +fma.rn.f32 f1261, f1232, 0fBF6C835E, f1260; +mul.f32 f1262, f1236, 0fBF3504F3; +mul.f32 f1263, f1237, 0fBF3504F3; +sub.f32 f1264, f1262, f1263; +add.f32 f1265, f1262, f1263; +mul.f32 f1267, f1241, 0fBEC3EF15; +mul.f32 f1357, f1240, 0fBF6C835E; +sub.f32 f1268, f1357, f1267; +mul.f32 f1269, f1241, 0fBF6C835E; +fma.rn.f32 f1270, f1240, 0fBEC3EF15, f1269; +add.f32 %1, f1413, f1400; +add.f32 %0, f1026, f1083; +add.f32 %3, f1379, f1365; +add.f32 %2, f1169, f1226; +add.f32 %4, f1030, f1101; +add.f32 %5, f1412, f1103; +add.f32 %7, f1378, f1246; +add.f32 %6, f1173, f1244; +add.f32 %9, f1411, f1108; +add.f32 %8, f1034, f1106; +add.f32 %11, f1377, f1251; +add.f32 %10, f1177, f1249; +add.f32 %12, f1038, f1111; +add.f32 %13, f1410, f1113; +add.f32 %14, f1181, f1254; +add.f32 %15, f1376, f1256; +add.f32 %16, f1028, f1086; +sub.f32 %17, f1029, f1085; +sub.f32 %19, f1172, f1228; +add.f32 %18, f1171, f1229; +add.f32 %21, f1033, f1118; +add.f32 %20, f1032, f1116; +add.f32 %23, f1176, f1261; +add.f32 %22, f1175, f1259; +add.f32 %24, f1036, f1121; +add.f32 %25, f1037, f1122; +add.f32 %26, f1179, f1264; +add.f32 %27, f1180, f1265; +add.f32 %28, f1040, f1125; +add.f32 %29, f1041, f1127; +add.f32 %30, f1183, f1268; +add.f32 %31, f1184, f1270; +sub.f32 %32, f1026, f1083; +sub.f32 %33, f1413, f1400; +sub.f32 %34, f1169, f1226; +sub.f32 %35, f1379, f1365; +sub.f32 %37, f1412, f1103; +sub.f32 %36, f1030, f1101; +sub.f32 %39, f1378, f1246; +sub.f32 %38, f1173, f1244; +sub.f32 %41, f1411, f1108; +sub.f32 %40, f1034, f1106; +sub.f32 %43, f1377, f1251; +sub.f32 %42, f1177, f1249; +sub.f32 %45, f1410, f1113; +sub.f32 %44, f1038, f1111; +sub.f32 %47, f1376, f1256; +sub.f32 %46, f1181, f1254; +add.f32 %49, f1029, f1085; +sub.f32 %48, f1028, f1086; +add.f32 %51, f1172, f1228; +sub.f32 %50, f1171, f1229; +sub.f32 %53, f1033, f1118; +sub.f32 %52, f1032, f1116; 
+sub.f32 %55, f1176, f1261; +sub.f32 %54, f1175, f1259; +sub.f32 %57, f1037, f1122; +sub.f32 %56, f1036, f1121; +sub.f32 %59, f1180, f1265; +sub.f32 %58, f1179, f1264; +sub.f32 %61, f1041, f1127; +sub.f32 %60, f1040, f1125; +sub.f32 %63, f1184, f1270; +sub.f32 %62, f1183, f1268; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), 
"f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<77, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<269>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, 
f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 4064; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+1024]; +ld.shared.v2.f32 {f70, f71}, [r13+2048]; +ld.shared.v2.f32 {f74, f75}, [r13+3072]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3968; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], 
{f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+1024]; +ld.shared.v2.f32 {f131, f132}, [r20+2048]; +ld.shared.v2.f32 {f135, f136}, [r20+3072]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +add.f32 f149, f141, f146; +sub.f32 f150, f142, f145; +sub.f32 f151, f141, f146; +add.f32 f152, f142, f145; +and.b32 r21, r5, 112; +bfe.u32 r22, r5, 4, 3; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f153, f149; +mul.f32 f158, f154, f150; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f162, f147; +mul.f32 f166, f164, f148; +mul.f32 f167, f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f170, f151; +mul.f32 f174, f172, f152; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 3584; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f154, f149, f159; +sub.f32 f179, f157, f158; +st.shared.v2.f32 [r26+128], {f179, f178}; +fma.rn.f32 f180, f164, f147, f167; +sub.f32 f181, f165, f166; 
+st.shared.v2.f32 [r26+256], {f181, f180}; +sub.f32 f182, f173, f174; +fma.rn.f32 f183, f172, f151, f175; +st.shared.v2.f32 [r26+384], {f182, f183}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+1024]; +ld.shared.v2.f32 {f192, f193}, [r27+2048]; +ld.shared.v2.f32 {f196, f197}, [r27+3072]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +add.f32 f210, f202, f207; +sub.f32 f211, f203, f206; +sub.f32 f212, f202, f207; +add.f32 f213, f203, f206; +and.b32 r28, r5, 64; +bfe.u32 r29, r5, 6, 1; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f214, f210; +mul.f32 f219, f215, f211; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f223, f208; +mul.f32 f227, f225, f209; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f231, f212; +mul.f32 f235, f233, f213; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 2048; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f215, f210, f220; +sub.f32 f240, f218, f219; +st.shared.v2.f32 [r33+512], {f240, f239}; +fma.rn.f32 f241, f225, f208, f228; +sub.f32 f242, f226, f227; +st.shared.v2.f32 [r33+1024], {f242, f241}; +sub.f32 f243, f234, f235; +fma.rn.f32 f244, f233, f212, f236; +st.shared.v2.f32 [r33+1536], {f243, f244}; +barrier.sync 0; +mad.lo.s32 r34, 
r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+1024]; +ld.shared.v2.f32 {f253, f254}, [r34+2048]; +ld.shared.v2.f32 {f257, f258}, [r34+3072]; +add.f32 %1, f246, f254; +add.f32 %0, f245, f253; +add.f32 %3, f250, f258; +add.f32 %2, f249, f257; +sub.f32 %5, f246, f254; +sub.f32 %4, f245, f253; +sub.f32 %7, f250, f258; +sub.f32 %6, f249, f257; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<78, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<237>; +.reg .b32 r<36>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, 
f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2032; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+512]; +ld.shared.f32 f64, [r13+1024]; +ld.shared.f32 f65, [r13+1536]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+512]; +ld.shared.f32 f68, [r13+1024]; +ld.shared.f32 f69, [r13+1536]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 
f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1984; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+512]; +ld.shared.f32 f117, [r21+1024]; +ld.shared.f32 f118, [r21+1536]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+512]; +ld.shared.f32 f121, [r21+1024]; +ld.shared.f32 f122, [r21+1536]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +add.f32 f135, f125, f130; +sub.f32 f136, f126, f129; +sub.f32 f137, f125, f130; +add.f32 f138, f126, f129; +and.b32 r22, r5, 112; +bfe.u32 r23, r5, 4, 3; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f139, f135; +mul.f32 f144, f140, f136; +sub.f32 f145, f143, f144; +mul.f32 f146, f139, f136; +fma.rn.f32 f147, f140, f135, f146; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f150, f133; +mul.f32 f154, f152, f134; +sub.f32 f155, f153, f154; +mul.f32 f156, f150, f134; +fma.rn.f32 f157, f152, f133, f156; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f160, f137; +mul.f32 f164, f162, f138; +sub.f32 f165, 
f163, f164; +mul.f32 f166, f160, f138; +fma.rn.f32 f167, f162, f137, f166; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 1792; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f145; +st.shared.f32 [r27+128], f155; +st.shared.f32 [r27+192], f165; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+512]; +ld.shared.f32 f170, [r28+1024]; +ld.shared.f32 f171, [r28+1536]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+512]; +ld.shared.f32 f174, [r28+1024]; +ld.shared.f32 f175, [r28+1536]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +add.f32 f188, f178, f183; +sub.f32 f189, f179, f182; +sub.f32 f190, f178, f183; +add.f32 f191, f179, f182; +and.b32 r29, r5, 64; +bfe.u32 r30, r5, 6, 1; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f192, f188; +mul.f32 f197, f193, f189; +sub.f32 f198, f196, f197; +mul.f32 f199, f192, f189; +fma.rn.f32 f200, f193, f188, f199; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f203, f186; +mul.f32 f207, f205, f187; +sub.f32 f208, f206, f207; +mul.f32 f209, f203, f187; +fma.rn.f32 f210, f205, f186, f209; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f213, f190; +mul.f32 f217, f215, f191; +sub.f32 
f218, f216, f217; +mul.f32 f219, f213, f191; +fma.rn.f32 f220, f215, f190, f219; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 1024; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f198; +st.shared.f32 [r34+512], f208; +st.shared.f32 [r34+768], f218; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+512]; +ld.shared.f32 f223, [r35+1024]; +ld.shared.f32 f224, [r35+1536]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+512]; +ld.shared.f32 f227, [r35+1024]; +ld.shared.f32 f228, [r35+1536]; +add.f32 %0, f221, f223; +add.f32 %1, f225, f227; +add.f32 %2, f222, f224; +add.f32 %3, f226, f228; +sub.f32 %4, f221, f223; +sub.f32 %5, f225, f227; +sub.f32 %6, f222, f224; +sub.f32 %7, f226, f228; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<79, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<181>; +.reg .b32 r<63>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %13, %15; +sub.f32 f10, %14, %16; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; 
+mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 4080; +add.s32 r11, r8, r10; +add.f32 f18, %14, %16; +add.f32 f19, %13, %15; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 2040; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+2048]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4064; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 2032; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+2048]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4032; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 2016; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+2048]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 248; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 
f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3968; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 1984; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+2048]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 4; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3840; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 1920; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+2048]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; +mul.f32 f120, f116, f114; +mul.f32 f121, f117, f115; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3584; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f117, f114, f122; +sub.f32 f126, f120, f121; +st.shared.v2.f32 [r46+256], {f126, f125}; +barrier.sync 0; +and.b32 r47, r9, 1792; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+2048]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 2; 
+mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f137, f135; +mul.f32 f142, f138, f136; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3072; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f138, f135, f143; +sub.f32 f147, f141, f142; +st.shared.v2.f32 [r53+512], {f147, f146}; +barrier.sync 0; +and.b32 r54, r9, 1536; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+2048]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f158, f156; +mul.f32 f163, f159, f157; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 2048; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f159, f156, f164; +sub.f32 f168, f162, f163; +st.shared.v2.f32 [r60+1024], {f168, f167}; +barrier.sync 0; +and.b32 r61, r9, 1024; +sub.s32 r62, r60, r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, [r62+2048]; +add.f32 %1, f170, f174; +add.f32 %0, f169, f173; +sub.f32 %3, f170, f174; +sub.f32 %2, f169, f173; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<80, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<149>; +.reg .b32 r<63>; +.reg .b64 
rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %13, %15; +add.f32 f10, %14, %16; +sub.f32 f11, %13, %15; +sub.f32 f12, %14, %16; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 2040; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 1020; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+1024]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+1024]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2032; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 1016; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+1024]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+1024]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; 
+mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 2016; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 1008; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+1024]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+1024]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 248; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1984; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 992; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+1024]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+1024]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 4; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1920; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 960; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+1024]; +barrier.sync 0; +st.shared.f32 [r39], f78; 
+st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+1024]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f98, f96; +mul.f32 f103, f99, f97; +sub.f32 f104, f102, f103; +mul.f32 f105, f98, f97; +fma.rn.f32 f106, f99, f96, f105; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1792; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f104; +barrier.sync 0; +and.b32 r47, r11, 896; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+1024]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+1024]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f115, f113; +mul.f32 f120, f116, f114; +sub.f32 f121, f119, f120; +mul.f32 f122, f115, f114; +fma.rn.f32 f123, f116, f113, f122; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 1536; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f121; +barrier.sync 0; +and.b32 r54, r11, 768; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+1024]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+1024]; +add.f32 f128, f124, f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 
{f132, f133}, [rd26]; +mul.f32 f136, f132, f130; +mul.f32 f137, f133, f131; +sub.f32 f138, f136, f137; +mul.f32 f139, f132, f131; +fma.rn.f32 f140, f133, f130, f139; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 1024; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 [r60+512], f138; +barrier.sync 0; +and.b32 r61, r11, 512; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+1024]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, [r62+1024]; +add.f32 %0, f141, f142; +add.f32 %1, f143, f144; +sub.f32 %2, f141, f142; +sub.f32 %3, f143, f144; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..c365514992e75 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp32_inv.hpp.inc @@ -0,0 +1,5968 @@ +#ifndef CUFFTDX_FFT_512_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_512_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<273, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<406>; +.reg .b32 r<20>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; 
+sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; 
+mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 4032; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+512]; +ld.shared.v2.f32 {f167, f168}, [r13+1024]; +ld.shared.v2.f32 {f171, f172}, [r13+1536]; +ld.shared.v2.f32 {f175, f176}, [r13+2048]; +ld.shared.v2.f32 {f179, f180}, [r13+2560]; +ld.shared.v2.f32 {f183, f184}, 
[r13+3072]; +ld.shared.v2.f32 {f187, f188}, [r13+3584]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 56; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; 
+mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 3584; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; 
+fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+512]; +ld.shared.v2.f32 {f325, f326}, [r19+1024]; +ld.shared.v2.f32 {f329, f330}, [r19+1536]; +ld.shared.v2.f32 {f333, f334}, [r19+2048]; +ld.shared.v2.f32 {f337, f338}, [r19+2560]; +ld.shared.v2.f32 {f341, f342}, [r19+3072]; +ld.shared.v2.f32 {f345, f346}, [r19+3584]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +sub.f32 f361, f351, f356; +add.f32 f362, f352, f355; +add.f32 f363, f351, f356; +sub.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +sub.f32 f377, f367, f372; +add.f32 f378, f368, f371; +add.f32 f379, f367, f372; +sub.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0f3F3504F3; +sub.f32 f383, f381, f382; +add.f32 f384, f381, f382; +mul.f32 f385, f379, 0fBF3504F3; +mul.f32 f386, f380, 0f3F3504F3; +sub.f32 f387, f385, f386; +mul.f32 f388, f380, 0fBF3504F3; +fma.rn.f32 f389, f379, 0f3F3504F3, f388; +add.f32 %1, f358, f374; +add.f32 %0, f357, f373; +add.f32 %3, f362, f384; +add.f32 %2, f361, f383; +add.f32 %5, f360, f375; +sub.f32 %4, f359, f376; +add.f32 %7, f364, f389; +add.f32 %6, f363, f387; +sub.f32 %9, f358, f374; +sub.f32 %8, f357, f373; +sub.f32 %11, f362, f384; +sub.f32 %10, f361, f383; +sub.f32 %13, f360, f375; +add.f32 %12, 
f359, f376; +sub.f32 %15, f364, f389; +sub.f32 %14, f363, f387; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<274, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<374>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %19, %29; +add.f32 f34, %20, %31; +sub.f32 f35, %19, %29; +sub.f32 f36, %20, %31; +add.f32 f37, %24, %35; +add.f32 f38, %26, %36; +sub.f32 f39, %24, %35; +sub.f32 f40, %26, %36; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %21, %32; +add.f32 f50, %23, %34; +sub.f32 f51, %21, %32; +sub.f32 f52, %23, %34; +add.f32 f53, %27, %37; +add.f32 f54, %28, %38; +sub.f32 f55, %27, %37; +sub.f32 f56, %28, %38; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, 
f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 504; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, 
f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2016; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+256]; +ld.shared.f32 f161, [r13+512]; +ld.shared.f32 f162, [r13+768]; +ld.shared.f32 f163, [r13+1024]; +ld.shared.f32 f164, [r13+1280]; +ld.shared.f32 f165, [r13+1536]; +ld.shared.f32 f166, [r13+1792]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+256]; +ld.shared.f32 f169, [r13+512]; +ld.shared.f32 f170, [r13+768]; +ld.shared.f32 f171, [r13+1024]; +ld.shared.f32 f172, [r13+1280]; +ld.shared.f32 f173, [r13+1536]; +ld.shared.f32 f174, [r13+1792]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, 
f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 56; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, 
f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 1792; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+256]; +ld.shared.f32 f303, [r20+512]; +ld.shared.f32 f304, [r20+768]; +ld.shared.f32 f305, [r20+1024]; +ld.shared.f32 f306, [r20+1280]; +ld.shared.f32 f307, [r20+1536]; +ld.shared.f32 f308, [r20+1792]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+256]; +ld.shared.f32 
f311, [r20+512]; +ld.shared.f32 f312, [r20+768]; +ld.shared.f32 f313, [r20+1024]; +ld.shared.f32 f314, [r20+1280]; +ld.shared.f32 f315, [r20+1536]; +ld.shared.f32 f316, [r20+1792]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +sub.f32 f329, f319, f324; +add.f32 f330, f320, f323; +add.f32 f331, f319, f324; +sub.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +sub.f32 f345, f335, f340; +add.f32 f346, f336, f339; +add.f32 f347, f335, f340; +sub.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0f3F3504F3; +sub.f32 f351, f349, f350; +add.f32 f352, f349, f350; +mul.f32 f353, f347, 0fBF3504F3; +mul.f32 f354, f348, 0f3F3504F3; +sub.f32 f355, f353, f354; +mul.f32 f356, f348, 0fBF3504F3; +fma.rn.f32 f357, f347, 0f3F3504F3, f356; +add.f32 %0, f325, f341; +add.f32 %1, f326, f342; +add.f32 %3, f330, f352; +add.f32 %2, f329, f351; +add.f32 %5, f328, f343; +sub.f32 %4, f327, f344; +add.f32 %7, f332, f357; +add.f32 %6, f331, f355; +sub.f32 %8, f325, f341; +sub.f32 %9, f326, f342; +sub.f32 %11, f330, f352; +sub.f32 %10, f329, f351; +sub.f32 %13, f328, f343; +add.f32 %12, f327, f344; +sub.f32 %15, f332, f357; +sub.f32 %14, f331, f355; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), 
"=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_512), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<276, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1704>; +.reg .b32 r<20>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1698, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1696, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1695, f1698, f1696; +sub.f32 f140, f1698, f1696; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1694, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1691, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1689, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1688, f1691, f1689; +sub.f32 f156, f1691, f1689; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1687, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1687, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1685, f159, 0fBF3504F3; +mul.f32 f1686, f160, 0f3F3504F3; +sub.f32 f167, f1685, f1686; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1684, f1695, f1688; +sub.f32 f173, f1695, f1688; +add.f32 f174, f141, 
f163; +sub.f32 f176, f141, f163; +add.f32 f1683, f1694, f164; +sub.f32 f177, f1694, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1682, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1681, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1679, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1676, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1675, f1679, f1676; +sub.f32 f197, f1679, f1676; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1674, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1672, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1670, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1669, f1672, f1670; +sub.f32 f213, f1672, f1670; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1668, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1668, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1666, f216, 0fBF3504F3; +mul.f32 f1667, f217, 0f3F3504F3; +sub.f32 f224, f1666, f1667; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1665, f1675, f1669; +sub.f32 f230, f1675, f1669; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1664, f1674, f221; +sub.f32 f234, f1674, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1663, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1662, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1660, f231, 0f3F6C835E; +mul.f32 f1661, f1664, 0f3EC3EF15; +sub.f32 f245, 
f1660, f1661; +mul.f32 f246, f1664, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1663, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1662, 0f3F6C835E; +mul.f32 f1659, f239, 0f3EC3EF15; +sub.f32 f254, f1659, f253; +mul.f32 f255, f1662, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1658, f233, 0fBEC3EF15; +sub.f32 f259, f1658, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1656, f237, 0fBF3504F3; +mul.f32 f1657, f238, 0f3F3504F3; +sub.f32 f264, f1656, f1657; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1654, f241, 0fBF6C835E; +mul.f32 f1655, f242, 0f3EC3EF15; +sub.f32 f269, f1654, f1655; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1653, f1684, f1665; +sub.f32 f275, f1684, f1665; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1652, f1683, f247; +sub.f32 f279, f1683, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1651, f1682, f251; +sub.f32 f283, f1682, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1650, f1681, f256; +sub.f32 f287, f1681, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1649, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1648, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1647, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1646, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1643, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1641, %85, %142; +sub.f32 f311, %85, %142; +add.f32 
f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1640, f1643, f1641; +sub.f32 f315, f1643, f1641; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1639, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1637, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1634, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1633, f1637, f1634; +sub.f32 f331, f1637, f1634; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1632, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1632, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1631, f334, 0fBF3504F3; +sub.f32 f342, f1631, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1630, f1640, f1633; +sub.f32 f348, f1640, f1633; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1629, f1639, f339; +sub.f32 f352, f1639, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1628, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1627, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1625, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1623, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1622, f1625, f1623; +sub.f32 f372, f1625, f1623; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1621, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1618, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1617, %97, 
%129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1616, f1618, f1617; +sub.f32 f388, f1618, f1617; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1615, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1615, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1613, f391, 0fBF3504F3; +mul.f32 f1614, f392, 0f3F3504F3; +sub.f32 f399, f1613, f1614; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1612, f1622, f1616; +sub.f32 f405, f1622, f1616; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1611, f1621, f396; +sub.f32 f409, f1621, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1610, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1609, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1611, 0f3EC3EF15; +mul.f32 f1608, f406, 0f3F6C835E; +sub.f32 f420, f1608, f419; +mul.f32 f421, f1611, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1610, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1609, 0f3F6C835E; +mul.f32 f1607, f414, 0f3EC3EF15; +sub.f32 f429, f1607, f428; +mul.f32 f430, f1609, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1606, f408, 0fBEC3EF15; +sub.f32 f434, f1606, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1605, f412, 0fBF3504F3; +sub.f32 f439, f1605, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1604, f416, 0fBF6C835E; +sub.f32 f444, f1604, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 
f449, f345, f402; +add.f32 f1603, f1630, f1612; +sub.f32 f450, f1630, f1612; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1602, f1629, f422; +sub.f32 f454, f1629, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1601, f1628, f426; +sub.f32 f458, f1628, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1600, f1627, f431; +sub.f32 f462, f1627, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1599, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1598, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1597, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1596, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1602, 0f3E47C5C2; +mul.f32 f1595, f451, 0f3F7B14BE; +sub.f32 f481, f1595, f480; +mul.f32 f482, f1602, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1601, 0f3EC3EF15; +mul.f32 f1594, f455, 0f3F6C835E; +sub.f32 f486, f1594, f485; +mul.f32 f487, f1601, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1600, 0f3F0E39DA; +mul.f32 f1593, f459, 0f3F54DB31; +sub.f32 f491, f1593, f490; +mul.f32 f492, f1600, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1599, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1598, 0f3F54DB31; +mul.f32 f1592, f467, 0f3F0E39DA; +sub.f32 f500, f1592, f499; +mul.f32 f501, f1598, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1597, 0f3F6C835E; +mul.f32 f1591, f471, 0f3EC3EF15; +sub.f32 f505, f1591, f504; +mul.f32 f506, f1597, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1596, 0f3F7B14BE; +mul.f32 f1590, f475, 0f3E47C5C2; +sub.f32 f510, f1590, f509; +mul.f32 f511, f1596, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; 
+mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1589, f453, 0fBE47C5C2; +sub.f32 f515, f1589, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1587, f457, 0fBEC3EF15; +mul.f32 f1588, f458, 0f3F6C835E; +sub.f32 f520, f1587, f1588; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1585, f461, 0fBF0E39DA; +mul.f32 f1586, f462, 0f3F54DB31; +sub.f32 f525, f1585, f1586; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1583, f465, 0fBF3504F3; +mul.f32 f1584, f466, 0f3F3504F3; +sub.f32 f530, f1583, f1584; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1581, f469, 0fBF54DB31; +mul.f32 f1582, f470, 0f3F0E39DA; +sub.f32 f535, f1581, f1582; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1580, f473, 0fBF6C835E; +sub.f32 f540, f1580, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1579, f477, 0fBF7B14BE; +sub.f32 f545, f1579, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1578, f1652, f483; +sub.f32 f553, f1652, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1577, f1651, f488; +sub.f32 f557, f1651, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1576, f1650, f493; +sub.f32 f561, f1650, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1575, f1649, f497; +sub.f32 f565, f1649, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f1574, f1648, f502; +sub.f32 f569, f1648, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f1573, f1647, f507; +sub.f32 f573, f1647, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f1572, f1646, f512; +sub.f32 f577, f1646, f512; +sub.f32 f578, f274, 
f450; +add.f32 f580, f274, f450; +add.f32 f1571, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f1570, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f1569, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f1568, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f1567, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1566, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1565, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1564, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +and.b32 r14, r15, 15; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f1578, f611; +mul.f32 f616, f610, f1578; +mul.f32 f618, f611, f611; +mul.f32 f1563, f610, f610; +sub.f32 f619, f1563, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f1577, f621; +mul.f32 f624, f619, f1577; +mul.f32 f626, f611, f621; +mul.f32 f1562, f610, f619; +sub.f32 f627, f1562, f626; +mul.f32 f1561, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f1576, f629; +mul.f32 f632, f627, f1576; +mul.f32 f1559, f610, f627; +mul.f32 f1560, f611, f629; +sub.f32 f635, f1559, f1560; +mul.f32 f1558, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f1575, f637; +mul.f32 f640, f635, f1575; +mul.f32 f642, f611, f637; +mul.f32 f1557, f610, f635; +sub.f32 f643, f1557, f642; +mul.f32 f1556, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, 
f635, f644; +mul.f32 f646, f1574, f645; +mul.f32 f648, f643, f1574; +mul.f32 f1554, f610, f643; +mul.f32 f1555, f611, f645; +sub.f32 f651, f1554, f1555; +mul.f32 f1553, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f1573, f653; +mul.f32 f656, f651, f1573; +mul.f32 f658, f611, f653; +mul.f32 f1552, f610, f651; +sub.f32 f659, f1552, f658; +mul.f32 f1551, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f1572, f661; +mul.f32 f664, f659, f1572; +mul.f32 f666, f611, f661; +mul.f32 f1550, f610, f659; +sub.f32 f667, f1550, f666; +mul.f32 f1549, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f1571, f669; +mul.f32 f672, f667, f1571; +mul.f32 f1547, f610, f667; +mul.f32 f1548, f611, f669; +sub.f32 f675, f1547, f1548; +mul.f32 f1546, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f1570, f677; +mul.f32 f680, f675, f1570; +mul.f32 f682, f611, f677; +mul.f32 f1545, f610, f675; +sub.f32 f683, f1545, f682; +mul.f32 f1544, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f1569, f685; +mul.f32 f688, f683, f1569; +mul.f32 f690, f611, f685; +mul.f32 f1543, f610, f683; +sub.f32 f691, f1543, f690; +mul.f32 f1542, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f1568, f693; +mul.f32 f696, f691, f1568; +mul.f32 f1540, f610, f691; +mul.f32 f1541, f611, f693; +sub.f32 f699, f1540, f1541; +mul.f32 f1539, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f1567, f701; +mul.f32 f704, f699, f1567; +mul.f32 f706, f611, f701; +mul.f32 f1538, f610, f699; +sub.f32 f707, f1538, f706; +mul.f32 f1537, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f1566, f709; +mul.f32 f712, f707, f1566; +mul.f32 f1535, f610, f707; +mul.f32 f1536, f611, f709; +sub.f32 f715, f1535, f1536; 
+mul.f32 f1534, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f1565, f717; +mul.f32 f720, f715, f1565; +mul.f32 f722, f611, f717; +mul.f32 f1533, f610, f715; +sub.f32 f723, f1533, f722; +mul.f32 f1532, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f1564, f725; +mul.f32 f728, f723, f1564; +mul.f32 f730, f611, f725; +mul.f32 f1531, f610, f723; +sub.f32 f731, f1531, f730; +mul.f32 f1530, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1529, f1653, f1603; +mul.f32 f734, f1529, f733; +mul.f32 f736, f731, f1529; +mul.f32 f1527, f610, f731; +mul.f32 f1528, f611, f733; +sub.f32 f739, f1527, f1528; +sub.f32 f1526, f272, f447; +mul.f32 f1525, f1526, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1524, f610, f739; +sub.f32 f747, f1524, f746; +mul.f32 f1523, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1522, f610, f747; +sub.f32 f755, f1522, f754; +mul.f32 f1521, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f1519, f610, f755; +mul.f32 f1520, f611, f757; +sub.f32 f763, f1519, f1520; +mul.f32 f1518, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1517, f610, f763; +sub.f32 f771, f1517, f770; +mul.f32 f1516, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f1514, f610, f771; +mul.f32 f1515, f611, f773; +sub.f32 f779, f1514, f1515; +mul.f32 f1513, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 
f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1512, f610, f779; +sub.f32 f787, f1512, f786; +mul.f32 f1511, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1510, f610, f787; +sub.f32 f795, f1510, f794; +mul.f32 f1509, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f1507, f610, f795; +mul.f32 f1508, f611, f797; +sub.f32 f803, f1507, f1508; +mul.f32 f1506, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1505, f610, f803; +sub.f32 f811, f1505, f810; +mul.f32 f1504, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1503, f610, f811; +sub.f32 f819, f1503, f818; +mul.f32 f1502, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f1500, f610, f819; +mul.f32 f1501, f611, f821; +sub.f32 f827, f1500, f1501; +mul.f32 f1499, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1498, f610, f827; +sub.f32 f835, f1498, f834; +mul.f32 f1497, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f1495, f610, f835; +mul.f32 f1496, f611, f837; +sub.f32 f843, f1495, f1496; +mul.f32 f1494, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1493, f610, f843; +sub.f32 f851, f1493, f850; +mul.f32 f1492, f604, f845; +mul.f32 
f852, f610, f845; +mul.f32 f1491, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 3840; +add.s32 r12, r9, r11; +sub.f32 f1703, f1653, f1603; +mul.f32 f1702, f731, f1703; +add.f32 f857, f1653, f1603; +sub.f32 f1701, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 15; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 15; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f1491; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f1561; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f1558; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f1556; +sub.f32 f867, f648, f1553; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f1551; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f1549; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f1546; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f1544; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f1542; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f1539; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f1537; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f1534; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f1532; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f1530; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f1701, f734; +sub.f32 f890, f1702, f1525; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f1523; +st.shared.v4.f32 
[r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f1521; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f1518; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f1516; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f1513; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f1511; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f1509; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f1506; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f1504; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f1502; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f1499; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f1497; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f1494; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f1492; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +mad.lo.s32 r13, r18, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+128]; +ld.shared.v2.f32 {f929, f930}, [r13+256]; +ld.shared.v2.f32 {f933, f934}, [r13+384]; +ld.shared.v2.f32 {f937, f938}, [r13+512]; +ld.shared.v2.f32 {f941, f942}, [r13+640]; +ld.shared.v2.f32 {f945, f946}, [r13+768]; +ld.shared.v2.f32 {f949, f950}, [r13+896]; +ld.shared.v2.f32 {f953, f954}, [r13+1024]; +ld.shared.v2.f32 {f957, f958}, [r13+1152]; +ld.shared.v2.f32 {f961, f962}, [r13+1280]; +ld.shared.v2.f32 {f965, f966}, [r13+1408]; +ld.shared.v2.f32 {f969, f970}, [r13+1536]; +ld.shared.v2.f32 {f973, f974}, [r13+1664]; +ld.shared.v2.f32 {f977, 
f978}, [r13+1792]; +ld.shared.v2.f32 {f981, f982}, [r13+1920]; +ld.shared.v2.f32 {f985, f986}, [r13+2048]; +ld.shared.v2.f32 {f989, f990}, [r13+2176]; +ld.shared.v2.f32 {f993, f994}, [r13+2304]; +ld.shared.v2.f32 {f997, f998}, [r13+2432]; +ld.shared.v2.f32 {f1001, f1002}, [r13+2560]; +ld.shared.v2.f32 {f1005, f1006}, [r13+2688]; +ld.shared.v2.f32 {f1009, f1010}, [r13+2816]; +ld.shared.v2.f32 {f1013, f1014}, [r13+2944]; +ld.shared.v2.f32 {f1017, f1018}, [r13+3072]; +ld.shared.v2.f32 {f1021, f1022}, [r13+3200]; +ld.shared.v2.f32 {f1025, f1026}, [r13+3328]; +ld.shared.v2.f32 {f1029, f1030}, [r13+3456]; +ld.shared.v2.f32 {f1033, f1034}, [r13+3584]; +ld.shared.v2.f32 {f1037, f1038}, [r13+3712]; +ld.shared.v2.f32 {f1041, f1042}, [r13+3840]; +ld.shared.v2.f32 {f1045, f1046}, [r13+3968]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f1490, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f1489, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f1488, f1490, f1489; +sub.f32 f1060, f1490, f1489; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f1487, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f1486, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f1485, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f1484, f1486, f1485; +sub.f32 f1076, f1486, f1485; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f1483, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f1483, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f1481, f1079, 0fBF3504F3; +mul.f32 f1482, f1080, 0f3F3504F3; +sub.f32 f1087, f1481, f1482; +mul.f32 f1088, f1080, 0fBF3504F3; 
+fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f1480, f1488, f1484; +sub.f32 f1093, f1488, f1484; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f1479, f1487, f1084; +sub.f32 f1097, f1487, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f1478, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, f1087; +add.f32 f1477, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f1476, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f1475, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f1474, f1476, f1475; +sub.f32 f1117, f1476, f1475; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f1473, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f1472, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f1471, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f1470, f1472, f1471; +sub.f32 f1133, f1472, f1471; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f1469, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f1469, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f1467, f1136, 0fBF3504F3; +mul.f32 f1468, f1137, 0f3F3504F3; +sub.f32 f1144, f1467, f1468; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f1466, f1474, f1470; +sub.f32 f1150, f1474, f1470; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f1465, f1473, f1141; +sub.f32 
f1154, f1473, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f1464, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f1463, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f1461, f1151, 0f3F6C835E; +mul.f32 f1462, f1465, 0f3EC3EF15; +sub.f32 f1165, f1461, f1462; +mul.f32 f1166, f1465, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 0f3F3504F3; +mul.f32 f1169, f1464, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f1459, f1159, 0f3EC3EF15; +mul.f32 f1460, f1463, 0f3F6C835E; +sub.f32 f1174, f1459, f1460; +mul.f32 f1175, f1463, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f1457, f1153, 0fBEC3EF15; +mul.f32 f1458, f1154, 0f3F6C835E; +sub.f32 f1179, f1457, f1458; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f1455, f1157, 0fBF3504F3; +mul.f32 f1456, f1158, 0f3F3504F3; +sub.f32 f1184, f1455, f1456; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f1453, f1161, 0fBF6C835E; +mul.f32 f1454, f1162, 0f3EC3EF15; +sub.f32 f1189, f1453, f1454; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f925, f989; +sub.f32 f1194, f925, f989; +add.f32 f1452, f926, f990; +sub.f32 f1195, f926, f990; +add.f32 f1196, f957, f1021; +sub.f32 f1198, f957, f1021; +add.f32 f1451, f958, f1022; +sub.f32 f1199, f958, f1022; +add.f32 f1200, f1192, f1196; +sub.f32 f1202, f1192, f1196; +add.f32 f1450, f1452, f1451; +sub.f32 f1203, f1452, f1451; +sub.f32 f1204, f1194, f1199; +add.f32 f1206, f1194, f1199; +add.f32 f1449, f1195, f1198; +sub.f32 f1207, f1195, f1198; +add.f32 f1208, f941, f1005; +sub.f32 f1210, f941, f1005; +add.f32 f1448, f942, f1006; +sub.f32 f1211, f942, f1006; +add.f32 f1212, f973, f1037; +sub.f32 f1214, f973, f1037; +add.f32 f1447, f974, f1038; +sub.f32 f1215, f974, f1038; +add.f32 
f1216, f1208, f1212; +sub.f32 f1218, f1208, f1212; +add.f32 f1446, f1448, f1447; +sub.f32 f1219, f1448, f1447; +sub.f32 f1220, f1210, f1215; +add.f32 f1222, f1210, f1215; +add.f32 f1445, f1211, f1214; +sub.f32 f1223, f1211, f1214; +mul.f32 f1224, f1220, 0f3F3504F3; +mul.f32 f1225, f1445, 0f3F3504F3; +sub.f32 f1226, f1224, f1225; +add.f32 f1227, f1224, f1225; +mul.f32 f1229, f1223, 0f3F3504F3; +mul.f32 f1444, f1222, 0fBF3504F3; +sub.f32 f1230, f1444, f1229; +mul.f32 f1231, f1223, 0fBF3504F3; +fma.rn.f32 f1232, f1222, 0f3F3504F3, f1231; +add.f32 f1233, f1200, f1216; +sub.f32 f1235, f1200, f1216; +add.f32 f1443, f1450, f1446; +sub.f32 f1236, f1450, f1446; +add.f32 f1237, f1204, f1226; +sub.f32 f1239, f1204, f1226; +add.f32 f1442, f1449, f1227; +sub.f32 f1240, f1449, f1227; +sub.f32 f1241, f1202, f1219; +add.f32 f1243, f1202, f1219; +add.f32 f1441, f1203, f1218; +sub.f32 f1244, f1203, f1218; +add.f32 f1245, f1206, f1230; +sub.f32 f1247, f1206, f1230; +add.f32 f1440, f1207, f1232; +sub.f32 f1248, f1207, f1232; +add.f32 f1249, f933, f997; +sub.f32 f1251, f933, f997; +add.f32 f1439, f934, f998; +sub.f32 f1252, f934, f998; +add.f32 f1253, f965, f1029; +sub.f32 f1255, f965, f1029; +add.f32 f1438, f966, f1030; +sub.f32 f1256, f966, f1030; +add.f32 f1257, f1249, f1253; +sub.f32 f1259, f1249, f1253; +add.f32 f1437, f1439, f1438; +sub.f32 f1260, f1439, f1438; +sub.f32 f1261, f1251, f1256; +add.f32 f1263, f1251, f1256; +add.f32 f1436, f1252, f1255; +sub.f32 f1264, f1252, f1255; +add.f32 f1265, f949, f1013; +sub.f32 f1267, f949, f1013; +add.f32 f1435, f950, f1014; +sub.f32 f1268, f950, f1014; +add.f32 f1269, f981, f1045; +sub.f32 f1271, f981, f1045; +add.f32 f1434, f982, f1046; +sub.f32 f1272, f982, f1046; +add.f32 f1273, f1265, f1269; +sub.f32 f1275, f1265, f1269; +add.f32 f1433, f1435, f1434; +sub.f32 f1276, f1435, f1434; +sub.f32 f1277, f1267, f1272; +add.f32 f1279, f1267, f1272; +add.f32 f1432, f1268, f1271; +sub.f32 f1280, f1268, f1271; +mul.f32 f1281, f1277, 0f3F3504F3; 
+mul.f32 f1282, f1432, 0f3F3504F3; +sub.f32 f1283, f1281, f1282; +add.f32 f1284, f1281, f1282; +mul.f32 f1286, f1280, 0f3F3504F3; +mul.f32 f1431, f1279, 0fBF3504F3; +sub.f32 f1287, f1431, f1286; +mul.f32 f1288, f1280, 0fBF3504F3; +fma.rn.f32 f1289, f1279, 0f3F3504F3, f1288; +add.f32 f1290, f1257, f1273; +sub.f32 f1292, f1257, f1273; +add.f32 f1430, f1437, f1433; +sub.f32 f1293, f1437, f1433; +add.f32 f1294, f1261, f1283; +sub.f32 f1296, f1261, f1283; +add.f32 f1429, f1436, f1284; +sub.f32 f1297, f1436, f1284; +sub.f32 f1298, f1259, f1276; +add.f32 f1300, f1259, f1276; +add.f32 f1428, f1260, f1275; +sub.f32 f1301, f1260, f1275; +add.f32 f1302, f1263, f1287; +sub.f32 f1304, f1263, f1287; +add.f32 f1427, f1264, f1289; +sub.f32 f1305, f1264, f1289; +mul.f32 f1307, f1429, 0f3EC3EF15; +mul.f32 f1426, f1294, 0f3F6C835E; +sub.f32 f1308, f1426, f1307; +mul.f32 f1309, f1429, 0f3F6C835E; +fma.rn.f32 f1310, f1294, 0f3EC3EF15, f1309; +mul.f32 f1311, f1298, 0f3F3504F3; +mul.f32 f1312, f1428, 0f3F3504F3; +sub.f32 f1313, f1311, f1312; +add.f32 f1314, f1311, f1312; +mul.f32 f1424, f1302, 0f3EC3EF15; +mul.f32 f1425, f1427, 0f3F6C835E; +sub.f32 f1317, f1424, f1425; +mul.f32 f1318, f1427, 0f3EC3EF15; +fma.rn.f32 f1319, f1302, 0f3F6C835E, f1318; +mul.f32 f1321, f1297, 0f3F6C835E; +mul.f32 f1423, f1296, 0fBEC3EF15; +sub.f32 f1322, f1423, f1321; +mul.f32 f1323, f1297, 0fBEC3EF15; +fma.rn.f32 f1324, f1296, 0f3F6C835E, f1323; +mul.f32 f1326, f1301, 0f3F3504F3; +mul.f32 f1422, f1300, 0fBF3504F3; +sub.f32 f1327, f1422, f1326; +mul.f32 f1328, f1301, 0fBF3504F3; +fma.rn.f32 f1329, f1300, 0f3F3504F3, f1328; +mul.f32 f1331, f1305, 0f3EC3EF15; +mul.f32 f1421, f1304, 0fBF6C835E; +sub.f32 f1332, f1421, f1331; +mul.f32 f1333, f1305, 0fBF6C835E; +fma.rn.f32 f1334, f1304, 0f3EC3EF15, f1333; +add.f32 %0, f1090, f1147; +add.f32 %1, f1480, f1466; +add.f32 %2, f1233, f1290; +add.f32 %3, f1443, f1430; +add.f32 %4, f1094, f1165; +add.f32 %5, f1479, f1167; +add.f32 %7, f1442, f1310; +add.f32 %6, f1237, 
f1308; +add.f32 %9, f1478, f1171; +add.f32 %8, f1098, f1170; +add.f32 %11, f1441, f1314; +add.f32 %10, f1241, f1313; +add.f32 %12, f1102, f1174; +add.f32 %13, f1477, f1176; +add.f32 %14, f1245, f1317; +add.f32 %15, f1440, f1319; +sub.f32 %16, f1092, f1150; +add.f32 %17, f1093, f1149; +add.f32 %19, f1236, f1292; +sub.f32 %18, f1235, f1293; +add.f32 %21, f1097, f1181; +add.f32 %20, f1096, f1179; +add.f32 %23, f1240, f1324; +add.f32 %22, f1239, f1322; +add.f32 %24, f1100, f1184; +add.f32 %25, f1101, f1186; +add.f32 %26, f1243, f1327; +add.f32 %27, f1244, f1329; +add.f32 %28, f1104, f1189; +add.f32 %29, f1105, f1191; +add.f32 %30, f1247, f1332; +add.f32 %31, f1248, f1334; +sub.f32 %33, f1480, f1466; +sub.f32 %32, f1090, f1147; +sub.f32 %35, f1443, f1430; +sub.f32 %34, f1233, f1290; +sub.f32 %37, f1479, f1167; +sub.f32 %36, f1094, f1165; +sub.f32 %39, f1442, f1310; +sub.f32 %38, f1237, f1308; +sub.f32 %41, f1478, f1171; +sub.f32 %40, f1098, f1170; +sub.f32 %43, f1441, f1314; +sub.f32 %42, f1241, f1313; +sub.f32 %45, f1477, f1176; +sub.f32 %44, f1102, f1174; +sub.f32 %47, f1440, f1319; +sub.f32 %46, f1245, f1317; +sub.f32 %49, f1093, f1149; +add.f32 %48, f1092, f1150; +sub.f32 %51, f1236, f1292; +add.f32 %50, f1235, f1293; +sub.f32 %53, f1097, f1181; +sub.f32 %52, f1096, f1179; +sub.f32 %55, f1240, f1324; +sub.f32 %54, f1239, f1322; +sub.f32 %57, f1101, f1186; +sub.f32 %56, f1100, f1184; +sub.f32 %59, f1244, f1329; +sub.f32 %58, f1243, f1327; +sub.f32 %61, f1105, f1191; +sub.f32 %60, f1104, f1189; +sub.f32 %63, f1248, f1334; +sub.f32 %62, f1247, f1332; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), 
"=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), 
"f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<275, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<809>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %35, %56; +add.f32 f66, %36, %58; +sub.f32 f67, %35, %56; +sub.f32 f68, %36, %58; +add.f32 f69, %45, %67; +add.f32 f70, %47, %68; +sub.f32 f71, %45, %67; +sub.f32 f72, %47, %68; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %40, %61; +add.f32 f82, %42, %63; +sub.f32 f83, %40, %61; +sub.f32 f84, %42, %63; +add.f32 f85, %51, %72; +add.f32 f86, %52, %74; +sub.f32 f87, %51, %72; +sub.f32 f88, %52, %74; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %59; +add.f32 f123, %39, %60; +sub.f32 
f124, %37, %59; +sub.f32 f125, %39, %60; +add.f32 f126, %48, %69; +add.f32 f127, %50, %71; +sub.f32 f128, %48, %69; +sub.f32 f129, %50, %71; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %43, %64; +add.f32 f139, %44, %66; +sub.f32 f140, %43, %64; +sub.f32 f141, %44, %66; +add.f32 f142, %53, %75; +add.f32 f143, %55, %76; +sub.f32 f144, %53, %75; +sub.f32 f145, %55, %76; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, 
f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, 
f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; 
+sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1984; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, 
f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+128]; +ld.shared.f32 f391, [r13+256]; +ld.shared.f32 f392, [r13+384]; +ld.shared.f32 f393, [r13+512]; +ld.shared.f32 f394, [r13+640]; +ld.shared.f32 f395, [r13+768]; +ld.shared.f32 f396, [r13+896]; +ld.shared.f32 f397, [r13+1024]; +ld.shared.f32 f398, [r13+1152]; +ld.shared.f32 f399, [r13+1280]; +ld.shared.f32 f400, [r13+1408]; +ld.shared.f32 f401, [r13+1536]; +ld.shared.f32 f402, [r13+1664]; +ld.shared.f32 f403, [r13+1792]; +ld.shared.f32 f404, [r13+1920]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+128]; +ld.shared.f32 f407, [r13+256]; +ld.shared.f32 f408, [r13+384]; +ld.shared.f32 f409, [r13+512]; +ld.shared.f32 f410, [r13+640]; +ld.shared.f32 f411, [r13+768]; +ld.shared.f32 f412, [r13+896]; +ld.shared.f32 f413, [r13+1024]; +ld.shared.f32 f414, [r13+1152]; +ld.shared.f32 f415, [r13+1280]; +ld.shared.f32 f416, [r13+1408]; +ld.shared.f32 f417, [r13+1536]; +ld.shared.f32 f418, [r13+1664]; +ld.shared.f32 f419, [r13+1792]; +ld.shared.f32 f420, [r13+1920]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; 
+add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; +sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 
f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; +sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; 
+sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 16; +bfe.u32 r15, r5, 4, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; +mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, 
f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f593, f659; +fma.rn.f32 f661, f657, f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; 
+fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f591, f729; +fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1024; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+128]; +ld.shared.f32 f747, [r21+256]; +ld.shared.f32 f748, [r21+384]; +ld.shared.f32 f749, [r21+512]; +ld.shared.f32 f750, [r21+640]; +ld.shared.f32 f751, [r21+768]; +ld.shared.f32 f752, [r21+896]; +ld.shared.f32 f753, [r21+1024]; +ld.shared.f32 f754, [r21+1152]; +ld.shared.f32 f755, [r21+1280]; +ld.shared.f32 f756, [r21+1408]; +ld.shared.f32 f757, [r21+1536]; +ld.shared.f32 f758, [r21+1664]; +ld.shared.f32 f759, [r21+1792]; +ld.shared.f32 f760, [r21+1920]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], 
f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+128]; +ld.shared.f32 f763, [r21+256]; +ld.shared.f32 f764, [r21+384]; +ld.shared.f32 f765, [r21+512]; +ld.shared.f32 f766, [r21+640]; +ld.shared.f32 f767, [r21+768]; +ld.shared.f32 f768, [r21+896]; +ld.shared.f32 f769, [r21+1024]; +ld.shared.f32 f770, [r21+1152]; +ld.shared.f32 f771, [r21+1280]; +ld.shared.f32 f772, [r21+1408]; +ld.shared.f32 f773, [r21+1536]; +ld.shared.f32 f774, [r21+1664]; +ld.shared.f32 f775, [r21+1792]; +ld.shared.f32 f776, [r21+1920]; +add.f32 %0, f745, f753; +add.f32 %1, f761, f769; +add.f32 %2, f746, f754; +add.f32 %3, f762, f770; +add.f32 %4, f747, f755; +add.f32 %5, f763, f771; +add.f32 %6, f748, f756; +add.f32 %7, f764, f772; +add.f32 %8, f749, f757; +add.f32 %9, f765, f773; +add.f32 %10, f750, f758; +add.f32 %11, f766, f774; +add.f32 %12, f751, f759; +add.f32 %13, f767, f775; +add.f32 %14, f752, f760; +add.f32 %15, f768, f776; +sub.f32 %16, f745, f753; +sub.f32 %17, f761, f769; +sub.f32 %18, f746, f754; +sub.f32 %19, f762, f770; +sub.f32 %20, f747, f755; +sub.f32 %21, f763, f771; +sub.f32 %22, f748, f756; +sub.f32 %23, f764, f772; +sub.f32 %24, f749, f757; +sub.f32 %25, f765, f773; +sub.f32 %26, f750, f758; +sub.f32 %27, f766, f774; +sub.f32 %28, f751, f759; +sub.f32 %29, f767, f775; +sub.f32 %30, f752, f760; +sub.f32 %31, f768, f776; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), 
"=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<278, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1051>; +.reg .b32 r<35>; +.reg .b64 rd<11>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %35, %51; +sub.f32 f67, %35, %51; +add.f32 f1042, %36, %67; +sub.f32 f68, %36, %67; +add.f32 f69, %43, %59; +sub.f32 f71, %43, %59; +add.f32 f1040, %68, %60; +sub.f32 f72, %68, %60; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1039, f1042, f1040; +sub.f32 f76, f1042, f1040; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1038, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %55; +sub.f32 f83, %39, %55; +add.f32 f1035, %70, %69; +sub.f32 f84, %70, %69; +add.f32 f85, %47, %63; +sub.f32 f87, %47, %63; +add.f32 f1033, %48, %71; +sub.f32 f88, %48, %71; +add.f32 
f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1032, f1035, f1033; +sub.f32 f92, f1035, f1033; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1031, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1031, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1029, f95, 0fBF3504F3; +mul.f32 f1030, f96, 0f3F3504F3; +sub.f32 f103, f1029, f1030; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1028, f1039, f1032; +sub.f32 f109, f1039, f1032; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1027, f1038, f100; +sub.f32 f113, f1038, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1026, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1025, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %37, %53; +sub.f32 f124, %37, %53; +add.f32 f1023, %72, %54; +sub.f32 f125, %72, %54; +add.f32 f126, %45, %61; +sub.f32 f128, %45, %61; +add.f32 f1020, %73, %74; +sub.f32 f129, %73, %74; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1019, f1023, f1020; +sub.f32 f133, f1023, f1020; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1018, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %41, %57; +sub.f32 f140, %41, %57; +add.f32 f1016, %42, %75; +sub.f32 f141, %42, %75; +add.f32 f142, %49, %65; +sub.f32 f144, %49, %65; +add.f32 f1014, %76, %66; +sub.f32 f145, %76, %66; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1013, f1016, f1014; +sub.f32 f149, f1016, f1014; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1012, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f1012, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1010, f152, 0fBF3504F3; +mul.f32 f1011, f153, 0f3F3504F3; +sub.f32 f160, f1010, f1011; +mul.f32 f161, f153, 0fBF3504F3; 
+fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1009, f1019, f1013; +sub.f32 f166, f1019, f1013; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1008, f1018, f157; +sub.f32 f170, f1018, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1007, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1006, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1004, f167, 0f3F6C835E; +mul.f32 f1005, f1008, 0f3EC3EF15; +sub.f32 f181, f1004, f1005; +mul.f32 f182, f1008, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1007, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1006, 0f3F6C835E; +mul.f32 f1003, f175, 0f3EC3EF15; +sub.f32 f190, f1003, f189; +mul.f32 f191, f1006, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f1002, f169, 0fBEC3EF15; +sub.f32 f195, f1002, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1000, f173, 0fBF3504F3; +mul.f32 f1001, f174, 0f3F3504F3; +sub.f32 f200, f1000, f1001; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f998, f177, 0fBF6C835E; +mul.f32 f999, f178, 0f3EC3EF15; +sub.f32 f205, f998, f999; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f997, f1027, f183; +sub.f32 f213, f1027, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f996, f1026, f187; +sub.f32 f217, f1026, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f995, f1025, f192; +sub.f32 f221, f1025, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f994, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f993, f113, f197; +sub.f32 f229, 
f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f992, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f991, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 7; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f997, f239; +mul.f32 f244, f238, f997; +mul.f32 f246, f239, f239; +mul.f32 f990, f238, f238; +sub.f32 f247, f990, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f996, f249; +mul.f32 f252, f247, f996; +mul.f32 f988, f238, f247; +mul.f32 f989, f239, f249; +sub.f32 f255, f988, f989; +mul.f32 f987, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f995, f257; +mul.f32 f260, f255, f995; +mul.f32 f262, f239, f257; +mul.f32 f986, f238, f255; +sub.f32 f263, f986, f262; +mul.f32 f985, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f994, f265; +mul.f32 f268, f263, f994; +mul.f32 f270, f239, f265; +mul.f32 f984, f238, f263; +sub.f32 f271, f984, f270; +mul.f32 f983, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f993, f273; +mul.f32 f276, f271, f993; +mul.f32 f981, f238, f271; +mul.f32 f982, f239, f273; +sub.f32 f279, f981, f982; +mul.f32 f980, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f992, f281; +mul.f32 f284, f279, f992; +mul.f32 f286, f239, f281; +mul.f32 f979, f238, f279; +sub.f32 f287, f979, f286; +mul.f32 f978, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f991, f289; +mul.f32 f292, f287, f991; +mul.f32 f294, f239, f289; +mul.f32 f977, f238, f287; +sub.f32 f295, f977, f294; +mul.f32 f976, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, 
f239, f287, f296; +sub.f32 f975, f1028, f1009; +mul.f32 f298, f975, f297; +mul.f32 f300, f295, f975; +mul.f32 f973, f238, f295; +mul.f32 f974, f239, f297; +sub.f32 f303, f973, f974; +sub.f32 f972, f106, f163; +mul.f32 f971, f972, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f970, f238, f303; +sub.f32 f311, f970, f310; +mul.f32 f969, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f967, f238, f311; +mul.f32 f968, f239, f313; +sub.f32 f319, f967, f968; +mul.f32 f966, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f965, f238, f319; +sub.f32 f327, f965, f326; +mul.f32 f964, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f963, f238, f327; +sub.f32 f335, f963, f334; +mul.f32 f962, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f960, f238, f335; +mul.f32 f961, f239, f337; +sub.f32 f343, f960, f961; +mul.f32 f959, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f958, f238, f343; +sub.f32 f351, f958, f350; +mul.f32 f957, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f956, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 3968; +add.s32 r12, r9, r11; +add.f32 f357, f1028, f1009; +sub.f32 f1044, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r34, %tid.x; +shl.b32 r28, r34, 7; +shl.b32 r24, r34, 3; +fma.rn.f32 f359, f238, 
f210, f242; +sub.f32 f360, f244, f956; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f987; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f985; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f983; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f980; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f978; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f976; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1044, f298; +sub.f32 f374, f300, f971; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f969; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f966; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f964; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f962; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f959; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f957; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r21, r34, 31; +mad.lo.s32 r13, r21, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+256]; +ld.shared.v2.f32 {f397, f398}, [r13+512]; +ld.shared.v2.f32 {f401, f402}, [r13+768]; +ld.shared.v2.f32 {f405, f406}, [r13+1024]; +ld.shared.v2.f32 {f409, f410}, [r13+1280]; +ld.shared.v2.f32 {f413, f414}, [r13+1536]; +ld.shared.v2.f32 {f417, f418}, [r13+1792]; +ld.shared.v2.f32 {f421, f422}, [r13+2048]; +ld.shared.v2.f32 {f425, f426}, [r13+2304]; +ld.shared.v2.f32 {f429, f430}, [r13+2560]; +ld.shared.v2.f32 {f433, f434}, [r13+2816]; +ld.shared.v2.f32 {f437, f438}, [r13+3072]; 
+ld.shared.v2.f32 {f441, f442}, [r13+3328]; +ld.shared.v2.f32 {f445, f446}, [r13+3584]; +ld.shared.v2.f32 {f449, f450}, [r13+3840]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f955, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f954, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f953, f955, f954; +sub.f32 f464, f955, f954; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f952, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f951, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f950, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f949, f951, f950; +sub.f32 f480, f951, f950; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f948, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f948, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f947, f483, 0fBF3504F3; +sub.f32 f491, f947, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f946, f953, f949; +sub.f32 f497, f953, f949; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f945, f952, f488; +sub.f32 f501, f952, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f944, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f943, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f942, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f941, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 
f940, f942, f941; +sub.f32 f521, f942, f941; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f939, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f938, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f937, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f936, f938, f937; +sub.f32 f537, f938, f937; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f935, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f935, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f934, f540, 0fBF3504F3; +sub.f32 f548, f934, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f933, f940, f936; +sub.f32 f554, f940, f936; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f932, f939, f545; +sub.f32 f558, f939, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f931, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f930, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f932, 0f3EC3EF15; +mul.f32 f929, f555, 0f3F6C835E; +sub.f32 f569, f929, f568; +mul.f32 f570, f932, 0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f931, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f930, 0f3F6C835E; +mul.f32 f928, f563, 0f3EC3EF15; +sub.f32 f578, f928, f577; +mul.f32 f579, f930, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f927, f557, 0fBEC3EF15; +sub.f32 f583, f927, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f926, f561, 0fBF3504F3; 
+sub.f32 f588, f926, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f925, f565, 0fBF6C835E; +sub.f32 f593, f925, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f924, f945, f571; +sub.f32 f601, f945, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f923, f944, f575; +sub.f32 f605, f944, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f922, f943, f580; +sub.f32 f609, f943, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f921, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f920, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 f919, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f918, f509, f595; +sub.f32 f625, f509, f595; +and.b32 r14, r34, 16; +bfe.u32 r15, r34, 4, 1; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f924, f627; +mul.f32 f632, f626, f924; +mul.f32 f634, f627, f627; +mul.f32 f917, f626, f626; +sub.f32 f635, f917, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f638, f923, f637; +mul.f32 f640, f635, f923; +mul.f32 f915, f626, f635; +mul.f32 f916, f627, f637; +sub.f32 f643, f915, f916; +mul.f32 f914, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f922, f645; +mul.f32 f648, f643, f922; +mul.f32 f650, f627, f645; +mul.f32 f913, f626, f643; +sub.f32 f651, f913, f650; +mul.f32 f912, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f921, f653; +mul.f32 f656, f651, f921; +mul.f32 f658, f627, f653; +mul.f32 f911, f626, f651; +sub.f32 f659, f911, f658; +mul.f32 f910, f610, f653; 
+mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f920, f661; +mul.f32 f664, f659, f920; +mul.f32 f908, f626, f659; +mul.f32 f909, f627, f661; +sub.f32 f667, f908, f909; +mul.f32 f907, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f919, f669; +mul.f32 f672, f667, f919; +mul.f32 f674, f627, f669; +mul.f32 f906, f626, f667; +sub.f32 f675, f906, f674; +mul.f32 f905, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f918, f677; +mul.f32 f680, f675, f918; +mul.f32 f682, f627, f677; +mul.f32 f904, f626, f675; +sub.f32 f683, f904, f682; +mul.f32 f903, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f902, f946, f933; +mul.f32 f686, f902, f685; +mul.f32 f688, f683, f902; +sub.f32 f901, f494, f551; +mul.f32 f899, f626, f683; +mul.f32 f900, f627, f685; +sub.f32 f691, f899, f900; +mul.f32 f898, f901, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f897, f626, f691; +sub.f32 f699, f897, f698; +mul.f32 f896, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, f605; +mul.f32 f894, f626, f699; +mul.f32 f895, f627, f701; +sub.f32 f707, f894, f895; +mul.f32 f893, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f892, f626, f707; +sub.f32 f715, f892, f714; +mul.f32 f891, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f890, f626, f715; +sub.f32 f723, f890, f722; +mul.f32 f889, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f887, f626, f723; 
+mul.f32 f888, f627, f725; +sub.f32 f731, f887, f888; +mul.f32 f886, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f885, f626, f731; +sub.f32 f739, f885, f738; +mul.f32 f884, f620, f733; +mul.f32 f740, f626, f733; +mul.f32 f883, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r24, 120; +add.s32 r17, r9, r16; +sub.f32 f1046, f946, f933; +mul.f32 f1045, f683, f1046; +barrier.sync 0; +and.b32 r18, r28, 2048; +add.s32 r19, r17, r18; +sub.f32 f1050, f946, f933; +mul.f32 f1049, f683, f1050; +add.f32 f745, f946, f933; +sub.f32 f1048, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 16; +fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f883; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f914; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f912; +st.shared.v2.f32 [r19+384], {f751, f752}; +fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f910; +st.shared.v2.f32 [r19+512], {f753, f754}; +sub.f32 f755, f664, f907; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f905; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f903; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1048, f686; +sub.f32 f762, f1049, f898; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f896; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f893; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f891; 
+st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f889; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f886; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, f884; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +mad.lo.s32 r20, r26, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+256]; +ld.shared.v2.f32 {f785, f786}, [r20+512]; +ld.shared.v2.f32 {f789, f790}, [r20+768]; +ld.shared.v2.f32 {f793, f794}, [r20+1024]; +ld.shared.v2.f32 {f797, f798}, [r20+1280]; +ld.shared.v2.f32 {f801, f802}, [r20+1536]; +ld.shared.v2.f32 {f805, f806}, [r20+1792]; +ld.shared.v2.f32 {f809, f810}, [r20+2048]; +ld.shared.v2.f32 {f813, f814}, [r20+2304]; +ld.shared.v2.f32 {f817, f818}, [r20+2560]; +ld.shared.v2.f32 {f821, f822}, [r20+2816]; +ld.shared.v2.f32 {f825, f826}, [r20+3072]; +ld.shared.v2.f32 {f829, f830}, [r20+3328]; +ld.shared.v2.f32 {f833, f834}, [r20+3584]; +ld.shared.v2.f32 {f837, f838}, [r20+3840]; +add.f32 %0, f777, f809; +add.f32 %1, f778, f810; +add.f32 %2, f781, f813; +add.f32 %3, f782, f814; +add.f32 %5, f786, f818; +add.f32 %4, f785, f817; +add.f32 %7, f790, f822; +add.f32 %6, f789, f821; +add.f32 %8, f793, f825; +add.f32 %9, f794, f826; +add.f32 %10, f797, f829; +add.f32 %11, f798, f830; +add.f32 %12, f801, f833; +add.f32 %13, f802, f834; +add.f32 %14, f805, f837; +add.f32 %15, f806, f838; +sub.f32 %17, f778, f810; +sub.f32 %16, f777, f809; +sub.f32 %19, f782, f814; +sub.f32 %18, f781, f813; +sub.f32 %21, f786, f818; +sub.f32 %20, f785, f817; +sub.f32 %23, f790, f822; +sub.f32 %22, f789, f821; +sub.f32 %25, f794, f826; +sub.f32 %24, f793, f825; +sub.f32 %27, f798, f830; +sub.f32 %26, f797, f829; +sub.f32 %29, f802, f834; +sub.f32 %28, f801, f833; +sub.f32 
%31, f806, f838; +sub.f32 %30, f805, f837; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<277, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1608>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1606, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1604, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1603, f1606, f1604; +sub.f32 f140, f1606, f1604; +sub.f32 f141, f131, f136; 
+add.f32 f143, f131, f136; +add.f32 f1602, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1599, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1597, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1596, f1599, f1597; +sub.f32 f156, f1599, f1597; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1595, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1595, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1593, f159, 0fBF3504F3; +mul.f32 f1594, f160, 0f3F3504F3; +sub.f32 f167, f1593, f1594; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1592, f1603, f1596; +sub.f32 f173, f1603, f1596; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1591, f1602, f164; +sub.f32 f177, f1602, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1590, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1589, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1587, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1584, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1583, f1587, f1584; +sub.f32 f197, f1587, f1584; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1582, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1580, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1578, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1577, f1580, f1578; +sub.f32 f213, 
f1580, f1578; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1576, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1576, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1574, f216, 0fBF3504F3; +mul.f32 f1575, f217, 0f3F3504F3; +sub.f32 f224, f1574, f1575; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1573, f1583, f1577; +sub.f32 f230, f1583, f1577; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1572, f1582, f221; +sub.f32 f234, f1582, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1571, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1570, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1568, f231, 0f3F6C835E; +mul.f32 f1569, f1572, 0f3EC3EF15; +sub.f32 f245, f1568, f1569; +mul.f32 f246, f1572, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1571, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1570, 0f3F6C835E; +mul.f32 f1567, f239, 0f3EC3EF15; +sub.f32 f254, f1567, f253; +mul.f32 f255, f1570, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1566, f233, 0fBEC3EF15; +sub.f32 f259, f1566, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1564, f237, 0fBF3504F3; +mul.f32 f1565, f238, 0f3F3504F3; +sub.f32 f264, f1564, f1565; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1562, f241, 0fBF6C835E; +mul.f32 f1563, f242, 0f3EC3EF15; +sub.f32 f269, f1562, f1563; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1561, f1592, f1573; +sub.f32 f275, f1592, f1573; +add.f32 f276, f174, f245; +sub.f32 f278, f174, 
f245; +add.f32 f1560, f1591, f247; +sub.f32 f279, f1591, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1559, f1590, f251; +sub.f32 f283, f1590, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1558, f1589, f256; +sub.f32 f287, f1589, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1557, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1556, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1555, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1554, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1551, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1549, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1548, f1551, f1549; +sub.f32 f315, f1551, f1549; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1547, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1545, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1542, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1541, f1545, f1542; +sub.f32 f331, f1545, f1542; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1540, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1540, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1539, f334, 0fBF3504F3; +sub.f32 f342, f1539, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1538, f1548, f1541; +sub.f32 f348, f1548, f1541; 
+add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1537, f1547, f339; +sub.f32 f352, f1547, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1536, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1535, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1533, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1531, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1530, f1533, f1531; +sub.f32 f372, f1533, f1531; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1529, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1526, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1525, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1524, f1526, f1525; +sub.f32 f388, f1526, f1525; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1523, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1523, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1521, f391, 0fBF3504F3; +mul.f32 f1522, f392, 0f3F3504F3; +sub.f32 f399, f1521, f1522; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1520, f1530, f1524; +sub.f32 f405, f1530, f1524; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1519, f1529, f396; +sub.f32 f409, f1529, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1518, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1517, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1519, 0f3EC3EF15; +mul.f32 f1516, f406, 0f3F6C835E; 
+sub.f32 f420, f1516, f419; +mul.f32 f421, f1519, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1518, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1517, 0f3F6C835E; +mul.f32 f1515, f414, 0f3EC3EF15; +sub.f32 f429, f1515, f428; +mul.f32 f430, f1517, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1514, f408, 0fBEC3EF15; +sub.f32 f434, f1514, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1513, f412, 0fBF3504F3; +sub.f32 f439, f1513, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1512, f416, 0fBF6C835E; +sub.f32 f444, f1512, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1511, f1538, f1520; +sub.f32 f450, f1538, f1520; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1510, f1537, f422; +sub.f32 f454, f1537, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1509, f1536, f426; +sub.f32 f458, f1536, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1508, f1535, f431; +sub.f32 f462, f1535, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1507, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1506, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1505, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1504, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1510, 0f3E47C5C2; +mul.f32 f1503, f451, 0f3F7B14BE; +sub.f32 f481, f1503, f480; +mul.f32 f482, f1510, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1509, 0f3EC3EF15; 
+mul.f32 f1502, f455, 0f3F6C835E; +sub.f32 f486, f1502, f485; +mul.f32 f487, f1509, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1508, 0f3F0E39DA; +mul.f32 f1501, f459, 0f3F54DB31; +sub.f32 f491, f1501, f490; +mul.f32 f492, f1508, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1507, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1506, 0f3F54DB31; +mul.f32 f1500, f467, 0f3F0E39DA; +sub.f32 f500, f1500, f499; +mul.f32 f501, f1506, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1505, 0f3F6C835E; +mul.f32 f1499, f471, 0f3EC3EF15; +sub.f32 f505, f1499, f504; +mul.f32 f506, f1505, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1504, 0f3F7B14BE; +mul.f32 f1498, f475, 0f3E47C5C2; +sub.f32 f510, f1498, f509; +mul.f32 f511, f1504, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1497, f453, 0fBE47C5C2; +sub.f32 f515, f1497, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1495, f457, 0fBEC3EF15; +mul.f32 f1496, f458, 0f3F6C835E; +sub.f32 f520, f1495, f1496; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1493, f461, 0fBF0E39DA; +mul.f32 f1494, f462, 0f3F54DB31; +sub.f32 f525, f1493, f1494; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1491, f465, 0fBF3504F3; +mul.f32 f1492, f466, 0f3F3504F3; +sub.f32 f530, f1491, f1492; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1489, f469, 0fBF54DB31; +mul.f32 f1490, f470, 0f3F0E39DA; +sub.f32 f535, f1489, f1490; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1488, f473, 0fBF6C835E; +sub.f32 f540, f1488, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 
0f3E47C5C2; +mul.f32 f1487, f477, 0fBF7B14BE; +sub.f32 f545, f1487, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1486, f1561, f1511; +sub.f32 f551, f1561, f1511; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1485, f1560, f483; +sub.f32 f555, f1560, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1484, f1559, f488; +sub.f32 f559, f1559, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f1483, f1558, f493; +sub.f32 f563, f1558, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1482, f1557, f497; +sub.f32 f567, f1557, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f1481, f1556, f502; +sub.f32 f571, f1556, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f1480, f1555, f507; +sub.f32 f575, f1555, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f1479, f1554, f512; +sub.f32 f579, f1554, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f1478, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f1477, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f1476, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f1475, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f1474, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1473, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1472, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1471, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; 
+mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f1485, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f1485; +sub.f32 f620, f619, f618; +mul.f32 f1469, f612, f612; +mul.f32 f1470, f613, f613; +sub.f32 f623, f1469, f1470; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f1484, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f1484; +sub.f32 f630, f629, f628; +mul.f32 f1467, f612, f623; +mul.f32 f1468, f613, f625; +sub.f32 f633, f1467, f1468; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f1483, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f1483; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f1466, f612, f633; +sub.f32 f643, f1466, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f1482, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f1482; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f1465, f612, f643; +sub.f32 f653, f1465, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f1481, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f1481; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f1464, f612, f653; +sub.f32 f663, f1464, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f1480, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f1480; +sub.f32 f670, f669, f668; +mul.f32 f1462, f612, f663; +mul.f32 f1463, f613, f665; +sub.f32 f673, f1462, f1463; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f1479, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f1479; +sub.f32 f680, f679, f678; +mul.f32 
f1460, f612, f673; +mul.f32 f1461, f613, f675; +sub.f32 f683, f1460, f1461; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f1478, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f1478; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f1459, f612, f683; +sub.f32 f693, f1459, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f1477, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f1477; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f1458, f612, f693; +sub.f32 f703, f1458, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f1476, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f1476; +sub.f32 f710, f709, f708; +mul.f32 f1456, f612, f703; +mul.f32 f1457, f613, f705; +sub.f32 f713, f1456, f1457; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f1475, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f1475; +sub.f32 f720, f719, f718; +mul.f32 f1454, f612, f713; +mul.f32 f1455, f613, f715; +sub.f32 f723, f1454, f1455; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f1474, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f1474; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f1453, f612, f723; +sub.f32 f733, f1453, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f1473, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f1473; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f1452, f612, f733; +sub.f32 f743, f1452, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f1472, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 
f749, f743, f1472; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f1451, f612, f743; +sub.f32 f753, f1451, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f1471, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f1471; +sub.f32 f760, f759, f758; +mul.f32 f1449, f612, f753; +mul.f32 f1450, f613, f755; +sub.f32 f763, f1449, f1450; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f1447, f612, f763; +mul.f32 f1448, f613, f765; +sub.f32 f773, f1447, f1448; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f1446, f612, f773; +sub.f32 f783, f1446, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f1445, f612, f783; +sub.f32 f793, f1445, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f1444, f612, f793; +sub.f32 f803, f1444, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f1442, f612, f803; +mul.f32 f1443, f613, f805; +sub.f32 f813, f1442, f1443; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, 
f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f1440, f612, f813; +mul.f32 f1441, f613, f815; +sub.f32 f823, f1440, f1441; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f1439, f612, f823; +sub.f32 f833, f1439, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f1438, f612, f833; +sub.f32 f843, f1438, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f1436, f612, f843; +mul.f32 f1437, f613, f845; +sub.f32 f853, f1436, f1437; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f1434, f612, f853; +mul.f32 f1435, f613, f855; +sub.f32 f863, f1434, f1435; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f1433, f612, f863; +sub.f32 f873, f1433, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f1432, f612, f873; +sub.f32 f883, f1432, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, 
f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f1431, f612, f883; +sub.f32 f893, f1431, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f1429, f612, f893; +mul.f32 f1430, f613, f895; +sub.f32 f903, f1429, f1430; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f1427, f612, f903; +mul.f32 f1428, f613, f905; +sub.f32 f913, f1427, f1428; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mov.u32 r17, %tid.x; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +and.b32 r14, r17, 15; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 1920; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+64]; +ld.shared.f32 f923, [r13+128]; +ld.shared.f32 f924, [r13+192]; +ld.shared.f32 f925, [r13+256]; +ld.shared.f32 f926, [r13+320]; +ld.shared.f32 f927, [r13+384]; +ld.shared.f32 f928, [r13+448]; +ld.shared.f32 f929, [r13+512]; +ld.shared.f32 f930, [r13+576]; +ld.shared.f32 f931, 
[r13+640]; +ld.shared.f32 f932, [r13+704]; +ld.shared.f32 f933, [r13+768]; +ld.shared.f32 f934, [r13+832]; +ld.shared.f32 f935, [r13+896]; +ld.shared.f32 f936, [r13+960]; +ld.shared.f32 f937, [r13+1024]; +ld.shared.f32 f938, [r13+1088]; +ld.shared.f32 f939, [r13+1152]; +ld.shared.f32 f940, [r13+1216]; +ld.shared.f32 f941, [r13+1280]; +ld.shared.f32 f942, [r13+1344]; +ld.shared.f32 f943, [r13+1408]; +ld.shared.f32 f944, [r13+1472]; +ld.shared.f32 f945, [r13+1536]; +ld.shared.f32 f946, [r13+1600]; +ld.shared.f32 f947, [r13+1664]; +ld.shared.f32 f948, [r13+1728]; +ld.shared.f32 f949, [r13+1792]; +ld.shared.f32 f950, [r13+1856]; +ld.shared.f32 f951, [r13+1920]; +ld.shared.f32 f952, [r13+1984]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1486, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+64]; +ld.shared.f32 f955, [r13+128]; +ld.shared.f32 f956, [r13+192]; +ld.shared.f32 f957, [r13+256]; +ld.shared.f32 f958, [r13+320]; +ld.shared.f32 f959, [r13+384]; +ld.shared.f32 f960, [r13+448]; +ld.shared.f32 f961, [r13+512]; +ld.shared.f32 f962, [r13+576]; +ld.shared.f32 f963, [r13+640]; +ld.shared.f32 f964, [r13+704]; +ld.shared.f32 f965, [r13+768]; +ld.shared.f32 f966, [r13+832]; +ld.shared.f32 f967, [r13+896]; +ld.shared.f32 f968, [r13+960]; +ld.shared.f32 f969, [r13+1024]; +ld.shared.f32 f970, [r13+1088]; +ld.shared.f32 f971, [r13+1152]; +ld.shared.f32 f972, [r13+1216]; +ld.shared.f32 f973, [r13+1280]; +ld.shared.f32 f974, [r13+1344]; +ld.shared.f32 f975, [r13+1408]; +ld.shared.f32 f976, [r13+1472]; +ld.shared.f32 f977, [r13+1536]; +ld.shared.f32 f978, 
[r13+1600]; +ld.shared.f32 f979, [r13+1664]; +ld.shared.f32 f980, [r13+1728]; +ld.shared.f32 f981, [r13+1792]; +ld.shared.f32 f982, [r13+1856]; +ld.shared.f32 f983, [r13+1920]; +ld.shared.f32 f984, [r13+1984]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f1426, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f1425, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f1424, f1426, f1425; +sub.f32 f996, f1426, f1425; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f1423, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f1422, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f1421, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f1420, f1422, f1421; +sub.f32 f1012, f1422, f1421; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f1419, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f1419, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; +mul.f32 f1417, f1015, 0fBF3504F3; +mul.f32 f1418, f1016, 0f3F3504F3; +sub.f32 f1023, f1417, f1418; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f1416, f1424, f1420; +sub.f32 f1029, f1424, f1420; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f1415, f1423, f1020; +sub.f32 f1033, f1423, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f1414, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f1413, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f1412, f955, f971; +sub.f32 f1045, 
f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f1411, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f1410, f1412, f1411; +sub.f32 f1053, f1412, f1411; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f1409, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f1408, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f1407, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f1406, f1408, f1407; +sub.f32 f1069, f1408, f1407; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f1405, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f1405, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f1403, f1072, 0fBF3504F3; +mul.f32 f1404, f1073, 0f3F3504F3; +sub.f32 f1080, f1403, f1404; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f1402, f1410, f1406; +sub.f32 f1086, f1410, f1406; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f1401, f1409, f1077; +sub.f32 f1090, f1409, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f1400, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f1399, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f1397, f1087, 0f3F6C835E; +mul.f32 f1398, f1401, 0f3EC3EF15; +sub.f32 f1101, f1397, f1398; +mul.f32 f1102, f1401, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f1400, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f1395, f1095, 0f3EC3EF15; +mul.f32 f1396, f1399, 0f3F6C835E; 
+sub.f32 f1110, f1395, f1396; +mul.f32 f1111, f1399, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f1393, f1089, 0fBEC3EF15; +mul.f32 f1394, f1090, 0f3F6C835E; +sub.f32 f1115, f1393, f1394; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f1391, f1093, 0fBF3504F3; +mul.f32 f1392, f1094, 0f3F3504F3; +sub.f32 f1120, f1391, f1392; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f1389, f1097, 0fBF6C835E; +mul.f32 f1390, f1098, 0f3EC3EF15; +sub.f32 f1125, f1389, f1390; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f922, f938; +sub.f32 f1130, f922, f938; +add.f32 f1388, f954, f970; +sub.f32 f1131, f954, f970; +add.f32 f1132, f930, f946; +sub.f32 f1134, f930, f946; +add.f32 f1387, f962, f978; +sub.f32 f1135, f962, f978; +add.f32 f1136, f1128, f1132; +sub.f32 f1138, f1128, f1132; +add.f32 f1386, f1388, f1387; +sub.f32 f1139, f1388, f1387; +sub.f32 f1140, f1130, f1135; +add.f32 f1142, f1130, f1135; +add.f32 f1385, f1131, f1134; +sub.f32 f1143, f1131, f1134; +add.f32 f1144, f926, f942; +sub.f32 f1146, f926, f942; +add.f32 f1384, f958, f974; +sub.f32 f1147, f958, f974; +add.f32 f1148, f934, f950; +sub.f32 f1150, f934, f950; +add.f32 f1383, f966, f982; +sub.f32 f1151, f966, f982; +add.f32 f1152, f1144, f1148; +sub.f32 f1154, f1144, f1148; +add.f32 f1382, f1384, f1383; +sub.f32 f1155, f1384, f1383; +sub.f32 f1156, f1146, f1151; +add.f32 f1158, f1146, f1151; +add.f32 f1381, f1147, f1150; +sub.f32 f1159, f1147, f1150; +mul.f32 f1160, f1156, 0f3F3504F3; +mul.f32 f1161, f1381, 0f3F3504F3; +sub.f32 f1162, f1160, f1161; +add.f32 f1163, f1160, f1161; +mul.f32 f1165, f1159, 0f3F3504F3; +mul.f32 f1380, f1158, 0fBF3504F3; +sub.f32 f1166, f1380, f1165; +mul.f32 f1167, f1159, 0fBF3504F3; +fma.rn.f32 f1168, f1158, 0f3F3504F3, f1167; +add.f32 f1169, f1136, f1152; +sub.f32 f1171, f1136, f1152; +add.f32 f1379, f1386, f1382; +sub.f32 f1172, 
f1386, f1382; +add.f32 f1173, f1140, f1162; +sub.f32 f1175, f1140, f1162; +add.f32 f1378, f1385, f1163; +sub.f32 f1176, f1385, f1163; +sub.f32 f1177, f1138, f1155; +add.f32 f1179, f1138, f1155; +add.f32 f1377, f1139, f1154; +sub.f32 f1180, f1139, f1154; +add.f32 f1181, f1142, f1166; +sub.f32 f1183, f1142, f1166; +add.f32 f1376, f1143, f1168; +sub.f32 f1184, f1143, f1168; +add.f32 f1185, f924, f940; +sub.f32 f1187, f924, f940; +add.f32 f1375, f956, f972; +sub.f32 f1188, f956, f972; +add.f32 f1189, f932, f948; +sub.f32 f1191, f932, f948; +add.f32 f1374, f964, f980; +sub.f32 f1192, f964, f980; +add.f32 f1193, f1185, f1189; +sub.f32 f1195, f1185, f1189; +add.f32 f1373, f1375, f1374; +sub.f32 f1196, f1375, f1374; +sub.f32 f1197, f1187, f1192; +add.f32 f1199, f1187, f1192; +add.f32 f1372, f1188, f1191; +sub.f32 f1200, f1188, f1191; +add.f32 f1201, f928, f944; +sub.f32 f1203, f928, f944; +add.f32 f1371, f960, f976; +sub.f32 f1204, f960, f976; +add.f32 f1205, f936, f952; +sub.f32 f1207, f936, f952; +add.f32 f1370, f968, f984; +sub.f32 f1208, f968, f984; +add.f32 f1209, f1201, f1205; +sub.f32 f1211, f1201, f1205; +add.f32 f1369, f1371, f1370; +sub.f32 f1212, f1371, f1370; +sub.f32 f1213, f1203, f1208; +add.f32 f1215, f1203, f1208; +add.f32 f1368, f1204, f1207; +sub.f32 f1216, f1204, f1207; +mul.f32 f1217, f1213, 0f3F3504F3; +mul.f32 f1218, f1368, 0f3F3504F3; +sub.f32 f1219, f1217, f1218; +add.f32 f1220, f1217, f1218; +mul.f32 f1222, f1216, 0f3F3504F3; +mul.f32 f1367, f1215, 0fBF3504F3; +sub.f32 f1223, f1367, f1222; +mul.f32 f1224, f1216, 0fBF3504F3; +fma.rn.f32 f1225, f1215, 0f3F3504F3, f1224; +add.f32 f1226, f1193, f1209; +sub.f32 f1228, f1193, f1209; +add.f32 f1366, f1373, f1369; +sub.f32 f1229, f1373, f1369; +add.f32 f1230, f1197, f1219; +sub.f32 f1232, f1197, f1219; +add.f32 f1365, f1372, f1220; +sub.f32 f1233, f1372, f1220; +sub.f32 f1234, f1195, f1212; +add.f32 f1236, f1195, f1212; +add.f32 f1364, f1196, f1211; +sub.f32 f1237, f1196, f1211; +add.f32 f1238, f1199, 
f1223; +sub.f32 f1240, f1199, f1223; +add.f32 f1363, f1200, f1225; +sub.f32 f1241, f1200, f1225; +mul.f32 f1243, f1365, 0f3EC3EF15; +mul.f32 f1362, f1230, 0f3F6C835E; +sub.f32 f1244, f1362, f1243; +mul.f32 f1245, f1365, 0f3F6C835E; +fma.rn.f32 f1246, f1230, 0f3EC3EF15, f1245; +mul.f32 f1247, f1234, 0f3F3504F3; +mul.f32 f1248, f1364, 0f3F3504F3; +sub.f32 f1249, f1247, f1248; +add.f32 f1250, f1247, f1248; +mul.f32 f1360, f1238, 0f3EC3EF15; +mul.f32 f1361, f1363, 0f3F6C835E; +sub.f32 f1253, f1360, f1361; +mul.f32 f1254, f1363, 0f3EC3EF15; +fma.rn.f32 f1255, f1238, 0f3F6C835E, f1254; +mul.f32 f1257, f1233, 0f3F6C835E; +mul.f32 f1359, f1232, 0fBEC3EF15; +sub.f32 f1258, f1359, f1257; +mul.f32 f1259, f1233, 0fBEC3EF15; +fma.rn.f32 f1260, f1232, 0f3F6C835E, f1259; +mul.f32 f1262, f1237, 0f3F3504F3; +mul.f32 f1358, f1236, 0fBF3504F3; +sub.f32 f1263, f1358, f1262; +mul.f32 f1264, f1237, 0fBF3504F3; +fma.rn.f32 f1265, f1236, 0f3F3504F3, f1264; +mul.f32 f1267, f1241, 0f3EC3EF15; +mul.f32 f1357, f1240, 0fBF6C835E; +sub.f32 f1268, f1357, f1267; +mul.f32 f1269, f1241, 0fBF6C835E; +fma.rn.f32 f1270, f1240, 0f3EC3EF15, f1269; +add.f32 %1, f1416, f1402; +add.f32 %0, f1026, f1083; +add.f32 %3, f1379, f1366; +add.f32 %2, f1169, f1226; +add.f32 %4, f1030, f1101; +add.f32 %5, f1415, f1103; +add.f32 %7, f1378, f1246; +add.f32 %6, f1173, f1244; +add.f32 %9, f1414, f1107; +add.f32 %8, f1034, f1106; +add.f32 %11, f1377, f1250; +add.f32 %10, f1177, f1249; +add.f32 %12, f1038, f1110; +add.f32 %13, f1413, f1112; +add.f32 %14, f1181, f1253; +add.f32 %15, f1376, f1255; +sub.f32 %16, f1028, f1086; +add.f32 %17, f1029, f1085; +add.f32 %19, f1172, f1228; +sub.f32 %18, f1171, f1229; +add.f32 %21, f1033, f1117; +add.f32 %20, f1032, f1115; +add.f32 %23, f1176, f1260; +add.f32 %22, f1175, f1258; +add.f32 %24, f1036, f1120; +add.f32 %25, f1037, f1122; +add.f32 %26, f1179, f1263; +add.f32 %27, f1180, f1265; +add.f32 %28, f1040, f1125; +add.f32 %29, f1041, f1127; +add.f32 %30, f1183, f1268; +add.f32 %31, 
f1184, f1270; +sub.f32 %32, f1026, f1083; +sub.f32 %33, f1416, f1402; +sub.f32 %34, f1169, f1226; +sub.f32 %35, f1379, f1366; +sub.f32 %37, f1415, f1103; +sub.f32 %36, f1030, f1101; +sub.f32 %39, f1378, f1246; +sub.f32 %38, f1173, f1244; +sub.f32 %41, f1414, f1107; +sub.f32 %40, f1034, f1106; +sub.f32 %43, f1377, f1250; +sub.f32 %42, f1177, f1249; +sub.f32 %45, f1413, f1112; +sub.f32 %44, f1038, f1110; +sub.f32 %47, f1376, f1255; +sub.f32 %46, f1181, f1253; +sub.f32 %49, f1029, f1085; +add.f32 %48, f1028, f1086; +sub.f32 %51, f1172, f1228; +add.f32 %50, f1171, f1229; +sub.f32 %53, f1033, f1117; +sub.f32 %52, f1032, f1115; +sub.f32 %55, f1176, f1260; +sub.f32 %54, f1175, f1258; +sub.f32 %57, f1037, f1122; +sub.f32 %56, f1036, f1120; +sub.f32 %59, f1180, f1265; +sub.f32 %58, f1179, f1263; +sub.f32 %61, f1041, f1127; +sub.f32 %60, f1040, f1125; +sub.f32 %63, f1184, f1270; +sub.f32 %62, f1183, f1268; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), 
"=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_512), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<279, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<269>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, 
%22; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -4096; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 4064; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+1024]; +ld.shared.v2.f32 {f70, f71}, [r13+2048]; +ld.shared.v2.f32 {f74, f75}, [r13+3072]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, 
f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 3968; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+1024]; +ld.shared.v2.f32 {f131, f132}, [r20+2048]; +ld.shared.v2.f32 {f135, f136}, [r20+3072]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +sub.f32 f147, f139, f143; +sub.f32 f148, f140, f144; +sub.f32 f149, f141, f146; +add.f32 f150, f142, f145; +add.f32 f151, f141, f146; +sub.f32 f152, f142, f145; +and.b32 r21, r5, 112; +bfe.u32 r22, r5, 4, 3; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f153, f154}, [rd11]; +mul.f32 f157, f150, f154; +mul.f32 f158, f149, f154; +mul.f32 f159, f153, f150; +mul.f32 f160, f153, f153; +mul.f32 f161, f154, f154; +sub.f32 f162, f160, f161; +mul.f32 f163, f154, f153; +fma.rn.f32 f164, f154, f153, f163; +mul.f32 f165, f148, f164; +mul.f32 f166, f147, f164; +mul.f32 f167, 
f162, f148; +mul.f32 f168, f153, f162; +mul.f32 f169, f154, f164; +sub.f32 f170, f168, f169; +mul.f32 f171, f153, f164; +fma.rn.f32 f172, f154, f162, f171; +mul.f32 f173, f152, f172; +mul.f32 f174, f151, f172; +mul.f32 f175, f170, f152; +and.b32 r23, r10, 120; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 3584; +add.s32 r26, r24, r25; +add.f32 f176, f140, f144; +add.f32 f177, f139, f143; +st.shared.v2.f32 [r26], {f177, f176}; +fma.rn.f32 f178, f153, f149, f157; +sub.f32 f179, f159, f158; +st.shared.v2.f32 [r26+128], {f178, f179}; +fma.rn.f32 f180, f162, f147, f165; +sub.f32 f181, f167, f166; +st.shared.v2.f32 [r26+256], {f180, f181}; +sub.f32 f182, f175, f174; +fma.rn.f32 f183, f170, f151, f173; +st.shared.v2.f32 [r26+384], {f183, f182}; +barrier.sync 0; +mad.lo.s32 r27, r21, -24, r26; +ld.shared.v2.f32 {f184, f185}, [r27]; +ld.shared.v2.f32 {f188, f189}, [r27+1024]; +ld.shared.v2.f32 {f192, f193}, [r27+2048]; +ld.shared.v2.f32 {f196, f197}, [r27+3072]; +add.f32 f200, f184, f192; +add.f32 f201, f185, f193; +sub.f32 f202, f184, f192; +sub.f32 f203, f185, f193; +add.f32 f204, f188, f196; +add.f32 f205, f189, f197; +sub.f32 f206, f188, f196; +sub.f32 f207, f189, f197; +sub.f32 f208, f200, f204; +sub.f32 f209, f201, f205; +sub.f32 f210, f202, f207; +add.f32 f211, f203, f206; +add.f32 f212, f202, f207; +sub.f32 f213, f203, f206; +and.b32 r28, r5, 64; +bfe.u32 r29, r5, 6, 1; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f214, f215}, [rd14]; +mul.f32 f218, f211, f215; +mul.f32 f219, f210, f215; +mul.f32 f220, f214, f211; +mul.f32 f221, f214, f214; +mul.f32 f222, f215, f215; +sub.f32 f223, f221, f222; +mul.f32 f224, f215, f214; +fma.rn.f32 f225, f215, f214, f224; +mul.f32 f226, f209, f225; +mul.f32 f227, f208, f225; +mul.f32 f228, f223, f209; +mul.f32 f229, f214, f223; +mul.f32 f230, f215, f225; +sub.f32 f231, f229, f230; +mul.f32 f232, f214, f225; +fma.rn.f32 f233, f215, f223, f232; +mul.f32 f234, f213, f233; 
+mul.f32 f235, f212, f233; +mul.f32 f236, f231, f213; +and.b32 r30, r10, 504; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 2048; +add.s32 r33, r31, r32; +add.f32 f237, f201, f205; +add.f32 f238, f200, f204; +st.shared.v2.f32 [r33], {f238, f237}; +fma.rn.f32 f239, f214, f210, f218; +sub.f32 f240, f220, f219; +st.shared.v2.f32 [r33+512], {f239, f240}; +fma.rn.f32 f241, f223, f208, f226; +sub.f32 f242, f228, f227; +st.shared.v2.f32 [r33+1024], {f241, f242}; +sub.f32 f243, f236, f235; +fma.rn.f32 f244, f231, f212, f234; +st.shared.v2.f32 [r33+1536], {f244, f243}; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.v2.f32 {f245, f246}, [r34]; +ld.shared.v2.f32 {f249, f250}, [r34+1024]; +ld.shared.v2.f32 {f253, f254}, [r34+2048]; +ld.shared.v2.f32 {f257, f258}, [r34+3072]; +add.f32 %1, f246, f254; +add.f32 %0, f245, f253; +add.f32 %3, f250, f258; +add.f32 %2, f249, f257; +sub.f32 %5, f246, f254; +sub.f32 %4, f245, f253; +sub.f32 %7, f250, f258; +sub.f32 %6, f249, f257; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<280, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<237>; +.reg .b32 r<36>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %13, %18; +add.f32 f18, %14, %20; +sub.f32 f19, %13, %18; +sub.f32 f20, %14, %20; +add.f32 f21, %15, %21; +add.f32 f22, %17, %22; +sub.f32 f23, %15, %21; +sub.f32 f24, %17, %22; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; 
+sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1016; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -2048; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 2032; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+512]; +ld.shared.f32 f64, [r13+1024]; +ld.shared.f32 f65, [r13+1536]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+512]; +ld.shared.f32 f68, [r13+1024]; +ld.shared.f32 f69, [r13+1536]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; 
+mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 1984; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+512]; +ld.shared.f32 f117, [r21+1024]; +ld.shared.f32 f118, [r21+1536]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+512]; +ld.shared.f32 f121, [r21+1024]; +ld.shared.f32 f122, [r21+1536]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 f131, f123, f127; +add.f32 f132, f124, f128; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f128; +sub.f32 f135, f125, f130; +add.f32 f136, f126, f129; +add.f32 f137, f125, f130; +sub.f32 f138, f126, f129; +and.b32 r22, r5, 112; +bfe.u32 r23, r5, 4, 3; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f139, f140}, [rd11]; +mul.f32 f143, f136, f140; 
+fma.rn.f32 f144, f139, f135, f143; +mul.f32 f145, f135, f140; +mul.f32 f146, f139, f136; +sub.f32 f147, f146, f145; +mul.f32 f148, f139, f139; +mul.f32 f149, f140, f140; +sub.f32 f150, f148, f149; +mul.f32 f151, f140, f139; +fma.rn.f32 f152, f140, f139, f151; +mul.f32 f153, f134, f152; +fma.rn.f32 f154, f150, f133, f153; +mul.f32 f155, f133, f152; +mul.f32 f156, f150, f134; +sub.f32 f157, f156, f155; +mul.f32 f158, f139, f150; +mul.f32 f159, f140, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f139, f152; +fma.rn.f32 f162, f140, f150, f161; +mul.f32 f163, f138, f162; +fma.rn.f32 f164, f160, f137, f163; +mul.f32 f165, f137, f162; +mul.f32 f166, f160, f138; +sub.f32 f167, f166, f165; +and.b32 r24, r16, 60; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 1792; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f131; +st.shared.f32 [r27+64], f144; +st.shared.f32 [r27+128], f154; +st.shared.f32 [r27+192], f164; +barrier.sync 0; +mad.lo.s32 r28, r22, -12, r27; +ld.shared.f32 f168, [r28]; +ld.shared.f32 f169, [r28+512]; +ld.shared.f32 f170, [r28+1024]; +ld.shared.f32 f171, [r28+1536]; +barrier.sync 0; +st.shared.f32 [r27], f132; +st.shared.f32 [r27+64], f147; +st.shared.f32 [r27+128], f157; +st.shared.f32 [r27+192], f167; +barrier.sync 0; +ld.shared.f32 f172, [r28]; +ld.shared.f32 f173, [r28+512]; +ld.shared.f32 f174, [r28+1024]; +ld.shared.f32 f175, [r28+1536]; +add.f32 f176, f168, f170; +add.f32 f177, f172, f174; +sub.f32 f178, f168, f170; +sub.f32 f179, f172, f174; +add.f32 f180, f169, f171; +add.f32 f181, f173, f175; +sub.f32 f182, f169, f171; +sub.f32 f183, f173, f175; +add.f32 f184, f176, f180; +add.f32 f185, f177, f181; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f181; +sub.f32 f188, f178, f183; +add.f32 f189, f179, f182; +add.f32 f190, f178, f183; +sub.f32 f191, f179, f182; +and.b32 r29, r5, 64; +bfe.u32 r30, r5, 6, 1; +mul.wide.u32 rd12, r30, 8; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f192, f193}, [rd14]; +mul.f32 f196, f189, 
f193; +fma.rn.f32 f197, f192, f188, f196; +mul.f32 f198, f188, f193; +mul.f32 f199, f192, f189; +sub.f32 f200, f199, f198; +mul.f32 f201, f192, f192; +mul.f32 f202, f193, f193; +sub.f32 f203, f201, f202; +mul.f32 f204, f193, f192; +fma.rn.f32 f205, f193, f192, f204; +mul.f32 f206, f187, f205; +fma.rn.f32 f207, f203, f186, f206; +mul.f32 f208, f186, f205; +mul.f32 f209, f203, f187; +sub.f32 f210, f209, f208; +mul.f32 f211, f192, f203; +mul.f32 f212, f193, f205; +sub.f32 f213, f211, f212; +mul.f32 f214, f192, f205; +fma.rn.f32 f215, f193, f203, f214; +mul.f32 f216, f191, f215; +fma.rn.f32 f217, f213, f190, f216; +mul.f32 f218, f190, f215; +mul.f32 f219, f213, f191; +sub.f32 f220, f219, f218; +and.b32 r31, r16, 252; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 1024; +add.s32 r34, r32, r33; +st.shared.f32 [r34], f184; +st.shared.f32 [r34+256], f197; +st.shared.f32 [r34+512], f207; +st.shared.f32 [r34+768], f217; +barrier.sync 0; +mad.lo.s32 r35, r29, -12, r34; +ld.shared.f32 f221, [r35]; +ld.shared.f32 f222, [r35+512]; +ld.shared.f32 f223, [r35+1024]; +ld.shared.f32 f224, [r35+1536]; +barrier.sync 0; +st.shared.f32 [r34], f185; +st.shared.f32 [r34+256], f200; +st.shared.f32 [r34+512], f210; +st.shared.f32 [r34+768], f220; +barrier.sync 0; +ld.shared.f32 f225, [r35]; +ld.shared.f32 f226, [r35+512]; +ld.shared.f32 f227, [r35+1024]; +ld.shared.f32 f228, [r35+1536]; +add.f32 %0, f221, f223; +add.f32 %1, f225, f227; +add.f32 %2, f222, f224; +add.f32 %3, f226, f228; +sub.f32 %4, f221, f223; +sub.f32 %5, f225, f227; +sub.f32 %6, f222, f224; +sub.f32 %7, f226, f228; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_512), "l"(lut_sp_4_128), "l"(lut_sp_4_32), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<281, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<181>; +.reg .b32 r<63>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %13, %15; +sub.f32 f10, %14, %16; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 4080; +add.s32 r11, r8, r10; +add.f32 f18, %14, %16; +add.f32 f19, %13, %15; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 2040; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+2048]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4064; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 2032; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+2048]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 
r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4032; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 2016; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+2048]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 248; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3968; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 1984; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+2048]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 4; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3840; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 1920; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+2048]; +sub.f32 f114, f106, f110; +sub.f32 f115, f107, f111; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f116, f117}, [rd20]; 
+mul.f32 f120, f115, f117; +mul.f32 f121, f114, f117; +mul.f32 f122, f116, f115; +and.b32 r43, r9, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3584; +add.s32 r46, r44, r45; +add.f32 f123, f107, f111; +add.f32 f124, f106, f110; +st.shared.v2.f32 [r46], {f124, f123}; +fma.rn.f32 f125, f116, f114, f120; +sub.f32 f126, f122, f121; +st.shared.v2.f32 [r46+256], {f125, f126}; +barrier.sync 0; +and.b32 r47, r9, 1792; +sub.s32 r48, r46, r47; +ld.shared.v2.f32 {f127, f128}, [r48]; +ld.shared.v2.f32 {f131, f132}, [r48+2048]; +sub.f32 f135, f127, f131; +sub.f32 f136, f128, f132; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f137, f138}, [rd23]; +mul.f32 f141, f136, f138; +mul.f32 f142, f135, f138; +mul.f32 f143, f137, f136; +and.b32 r50, r9, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3072; +add.s32 r53, r51, r52; +add.f32 f144, f128, f132; +add.f32 f145, f127, f131; +st.shared.v2.f32 [r53], {f145, f144}; +fma.rn.f32 f146, f137, f135, f141; +sub.f32 f147, f143, f142; +st.shared.v2.f32 [r53+512], {f146, f147}; +barrier.sync 0; +and.b32 r54, r9, 1536; +sub.s32 r55, r53, r54; +ld.shared.v2.f32 {f148, f149}, [r55]; +ld.shared.v2.f32 {f152, f153}, [r55+2048]; +sub.f32 f156, f148, f152; +sub.f32 f157, f149, f153; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f158, f159}, [rd26]; +mul.f32 f162, f157, f159; +mul.f32 f163, f156, f159; +mul.f32 f164, f158, f157; +and.b32 r57, r9, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 2048; +add.s32 r60, r58, r59; +add.f32 f165, f149, f153; +add.f32 f166, f148, f152; +st.shared.v2.f32 [r60], {f166, f165}; +fma.rn.f32 f167, f158, f156, f162; +sub.f32 f168, f164, f163; +st.shared.v2.f32 [r60+1024], {f167, f168}; +barrier.sync 0; +and.b32 r61, r9, 1024; +sub.s32 r62, r60, r61; +ld.shared.v2.f32 {f169, f170}, [r62]; +ld.shared.v2.f32 {f173, f174}, 
[r62+2048]; +add.f32 %1, f170, f174; +add.f32 %0, f169, f173; +sub.f32 %3, f170, f174; +sub.f32 %2, f169, f173; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<282, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<149>; +.reg .b32 r<63>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 11; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %13, %15; +add.f32 f10, %14, %16; +sub.f32 f11, %13, %15; +sub.f32 f12, %14, %16; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -2048; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 2040; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 1020; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+1024]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+1024]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 2032; +add.s32 r18, r16, r17; +st.shared.f32 
[r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 1016; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+1024]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+1024]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 2016; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 1008; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+1024]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+1024]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 248; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 1984; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 992; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+1024]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+1024]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, 
f75, f76; +bfe.u32 r35, r5, 4, 4; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, f79, f85; +mul.f32 f87, f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 1920; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 960; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+1024]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+1024]; +add.f32 f94, f90, f91; +add.f32 f95, f92, f93; +sub.f32 f96, f90, f91; +sub.f32 f97, f92, f93; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 8; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f32 {f98, f99}, [rd20]; +mul.f32 f102, f97, f99; +fma.rn.f32 f103, f98, f96, f102; +mul.f32 f104, f96, f99; +mul.f32 f105, f98, f97; +sub.f32 f106, f105, f104; +and.b32 r43, r11, 124; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 1792; +add.s32 r46, r44, r45; +st.shared.f32 [r46], f94; +st.shared.f32 [r46+128], f103; +barrier.sync 0; +and.b32 r47, r11, 896; +sub.s32 r48, r46, r47; +ld.shared.f32 f107, [r48]; +ld.shared.f32 f108, [r48+1024]; +barrier.sync 0; +st.shared.f32 [r46], f95; +st.shared.f32 [r46+128], f106; +barrier.sync 0; +ld.shared.f32 f109, [r48]; +ld.shared.f32 f110, [r48+1024]; +add.f32 f111, f107, f108; +add.f32 f112, f109, f110; +sub.f32 f113, f107, f108; +sub.f32 f114, f109, f110; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 8; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f32 {f115, f116}, [rd23]; +mul.f32 f119, f114, f116; +fma.rn.f32 f120, f115, f113, f119; +mul.f32 f121, f113, f116; +mul.f32 f122, f115, f114; +sub.f32 f123, f122, f121; +and.b32 r50, r11, 252; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 
r52, r6, 1536; +add.s32 r53, r51, r52; +st.shared.f32 [r53], f111; +st.shared.f32 [r53+256], f120; +barrier.sync 0; +and.b32 r54, r11, 768; +sub.s32 r55, r53, r54; +ld.shared.f32 f124, [r55]; +ld.shared.f32 f125, [r55+1024]; +barrier.sync 0; +st.shared.f32 [r53], f112; +st.shared.f32 [r53+256], f123; +barrier.sync 0; +ld.shared.f32 f126, [r55]; +ld.shared.f32 f127, [r55+1024]; +add.f32 f128, f124, f125; +add.f32 f129, f126, f127; +sub.f32 f130, f124, f125; +sub.f32 f131, f126, f127; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 8; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f132, f133}, [rd26]; +mul.f32 f136, f131, f133; +fma.rn.f32 f137, f132, f130, f136; +mul.f32 f138, f130, f133; +mul.f32 f139, f132, f131; +sub.f32 f140, f139, f138; +and.b32 r57, r11, 508; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 1024; +add.s32 r60, r58, r59; +st.shared.f32 [r60], f128; +st.shared.f32 [r60+512], f137; +barrier.sync 0; +and.b32 r61, r11, 512; +sub.s32 r62, r60, r61; +ld.shared.f32 f141, [r62]; +ld.shared.f32 f142, [r62+1024]; +barrier.sync 0; +st.shared.f32 [r60], f129; +st.shared.f32 [r60+512], f140; +barrier.sync 0; +ld.shared.f32 f143, [r62]; +ld.shared.f32 f144, [r62+1024]; +add.f32 %0, f141, f142; +add.f32 %1, f143, f144; +sub.f32 %2, f141, f142; +sub.f32 %3, f143, f144; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_512), "l"(lut_sp_2_256), "l"(lut_sp_2_128), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..c31767be8f345 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_fwd.hpp.inc @@ -0,0 +1,3461 @@ +#ifndef CUFFTDX_FFT_512_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_512_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<468, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<372>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 
fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+1024]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; 
+mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4032; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+512]; +ld.shared.f64 fd160, [r13+1024]; +ld.shared.f64 fd161, [r13+1536]; +ld.shared.f64 fd162, [r13+2048]; +ld.shared.f64 fd163, [r13+2560]; +ld.shared.f64 fd164, [r13+3072]; +ld.shared.f64 fd165, [r13+3584]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+512]; +ld.shared.f64 fd168, [r13+1024]; +ld.shared.f64 fd169, [r13+1536]; +ld.shared.f64 fd170, [r13+2048]; +ld.shared.f64 fd171, [r13+2560]; +ld.shared.f64 fd172, [r13+3072]; +ld.shared.f64 fd173, [r13+3584]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, 
fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 56; +bfe.u32 r15, r5, 3, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, 
fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+128]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3584; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+512]; +ld.shared.f64 fd301, [r21+1024]; +ld.shared.f64 fd302, [r21+1536]; +ld.shared.f64 fd303, [r21+2048]; +ld.shared.f64 fd304, [r21+2560]; +ld.shared.f64 fd305, [r21+3072]; +ld.shared.f64 fd306, [r21+3584]; +barrier.sync 0; +st.shared.f64 [r20], fd216; 
+st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+512]; +ld.shared.f64 fd309, [r21+1024]; +ld.shared.f64 fd310, [r21+1536]; +ld.shared.f64 fd311, [r21+2048]; +ld.shared.f64 fd312, [r21+2560]; +ld.shared.f64 fd313, [r21+3072]; +ld.shared.f64 fd314, [r21+3584]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +add.f64 fd327, fd317, fd322; +sub.f64 fd328, fd318, fd321; +sub.f64 fd329, fd317, fd322; +add.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +add.f64 fd343, fd333, fd338; +sub.f64 fd344, fd334, fd337; +sub.f64 fd345, fd333, fd338; +add.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0dBFE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd344, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd351, fd343, 0dBFE6A09E667F3BCD, fd350; +mul.f64 fd352, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd353, fd346, 0dBFE6A09E667F3BCD; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd352, fd353; +add.f64 %0, fd323, fd339; +add.f64 %1, fd324, fd340; +add.f64 %3, fd328, fd351; +add.f64 %2, fd327, fd349; +sub.f64 %5, fd326, fd341; +add.f64 %4, fd325, fd342; 
+add.f64 %7, fd330, fd355; +add.f64 %6, fd329, fd354; +sub.f64 %8, fd323, fd339; +sub.f64 %9, fd324, fd340; +sub.f64 %11, fd328, fd351; +sub.f64 %10, fd327, fd349; +add.f64 %13, fd326, fd341; +sub.f64 %12, fd325, fd342; +sub.f64 %15, fd330, fd355; +sub.f64 %14, fd329, fd354; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<469, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<404>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, 
fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+1024]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 
fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 8064; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+1024]; +ld.shared.v2.f64 {fd166, fd167}, [r13+2048]; +ld.shared.v2.f64 {fd170, fd171}, [r13+3072]; +ld.shared.v2.f64 {fd174, fd175}, [r13+4096]; +ld.shared.v2.f64 {fd178, fd179}, [r13+5120]; +ld.shared.v2.f64 {fd182, fd183}, [r13+6144]; +ld.shared.v2.f64 {fd186, fd187}, [r13+7168]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 
fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 56; +bfe.u32 r15, r5, 3, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 
fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+128]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 7168; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; +st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, 
fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+1024]; +ld.shared.v2.f64 {fd323, fd324}, [r20+2048]; +ld.shared.v2.f64 {fd327, fd328}, [r20+3072]; +ld.shared.v2.f64 {fd331, fd332}, [r20+4096]; +ld.shared.v2.f64 {fd335, fd336}, [r20+5120]; +ld.shared.v2.f64 {fd339, fd340}, [r20+6144]; +ld.shared.v2.f64 {fd343, fd344}, [r20+7168]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +add.f64 fd359, fd349, fd354; +sub.f64 fd360, fd350, fd353; +sub.f64 fd361, fd349, fd354; +add.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +add.f64 fd375, fd365, fd370; +sub.f64 fd376, fd366, fd369; +sub.f64 fd377, fd365, fd370; +add.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0dBFE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd383, fd375, 0dBFE6A09E667F3BCD, fd382; +mul.f64 fd384, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd385, fd378, 0dBFE6A09E667F3BCD; +sub.f64 fd386, fd384, fd385; +add.f64 fd387, fd384, fd385; +add.f64 %1, fd356, fd372; +add.f64 %0, fd355, fd371; +add.f64 
%3, fd360, fd383; +add.f64 %2, fd359, fd381; +sub.f64 %5, fd358, fd373; +add.f64 %4, fd357, fd374; +add.f64 %7, fd362, fd387; +add.f64 %6, fd361, fd386; +sub.f64 %9, fd356, fd372; +sub.f64 %8, fd355, fd371; +sub.f64 %11, fd360, fd383; +sub.f64 %10, fd359, fd381; +add.f64 %13, fd358, fd373; +sub.f64 %12, fd357, fd374; +sub.f64 %15, fd362, fd387; +sub.f64 %14, fd361, fd386; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<470, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<45>; +.reg .f64 fd<1054>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1042, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1040, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1039, fd1042, fd1040; +sub.f64 fd76, fd1042, fd1040; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd1038, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1035, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1033, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; 
+sub.f64 fd91, fd81, fd85; +add.f64 fd1032, fd1035, fd1033; +sub.f64 fd92, fd1035, fd1033; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd1031, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd1031, 0dBFE6A09E667F3BCD; +mul.f64 fd1030, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd1030, fd98; +mul.f64 fd100, fd1031, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1029, fd1039, fd1032; +sub.f64 fd109, fd1039, fd1032; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1028, fd1038, fd101; +sub.f64 fd113, fd1038, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd1027, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd1026, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1024, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1021, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1020, fd1024, fd1021; +sub.f64 fd133, fd1024, fd1021; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd1019, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1017, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1015, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1014, fd1017, fd1015; +sub.f64 fd149, fd1017, fd1015; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd1013, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd1013, 0dBFE6A09E667F3BCD; +mul.f64 fd1012, 
fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd1012, fd155; +mul.f64 fd157, fd1013, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1011, fd1020, fd1014; +sub.f64 fd166, fd1020, fd1014; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1010, fd1019, fd158; +sub.f64 fd170, fd1019, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd1009, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd1008, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1006, fd167, 0d3FED906BCF328D46; +mul.f64 fd1007, fd1010, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd1006, fd1007; +mul.f64 fd182, fd1010, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd1004, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd1005, fd1009, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd1004, fd1005; +mul.f64 fd187, fd1009, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd1002, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd1003, fd1008, 0dBFED906BCF328D46; +sub.f64 fd191, fd1002, fd1003; +mul.f64 fd192, fd1008, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd1000, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd1001, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd1000, fd1001; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd998, fd177, 0dBFED906BCF328D46; +mul.f64 fd999, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd998, fd999; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 
0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd997, fd1028, fd183; +sub.f64 fd213, fd1028, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd996, fd1027, fd188; +sub.f64 fd217, fd1027, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd995, fd1026, fd193; +sub.f64 fd221, fd1026, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd994, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd993, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd992, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd991, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd997; +mul.f64 fd244, fd238, fd997; +mul.f64 fd246, fd239, fd239; +mul.f64 fd990, fd238, fd238; +sub.f64 fd247, fd990, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd996; +mul.f64 fd252, fd247, fd996; +mul.f64 fd988, fd238, fd247; +mul.f64 fd989, fd239, fd249; +sub.f64 fd255, fd988, fd989; +mul.f64 fd987, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd995; +mul.f64 fd260, fd255, fd995; +mul.f64 fd262, fd239, fd257; +mul.f64 fd986, fd238, fd255; +sub.f64 fd263, fd986, fd262; +mul.f64 fd985, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd994; +mul.f64 fd268, fd263, fd994; +mul.f64 fd270, fd239, fd265; +mul.f64 fd984, fd238, fd263; +sub.f64 fd271, fd984, fd270; +mul.f64 fd983, 
fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd993; +mul.f64 fd276, fd271, fd993; +mul.f64 fd981, fd238, fd271; +mul.f64 fd982, fd239, fd273; +sub.f64 fd279, fd981, fd982; +mul.f64 fd980, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd992; +mul.f64 fd284, fd279, fd992; +mul.f64 fd286, fd239, fd281; +mul.f64 fd979, fd238, fd279; +sub.f64 fd287, fd979, fd286; +mul.f64 fd978, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd991; +mul.f64 fd292, fd287, fd991; +mul.f64 fd294, fd239, fd289; +mul.f64 fd977, fd238, fd287; +sub.f64 fd295, fd977, fd294; +mul.f64 fd976, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd975, fd1029, fd1011; +sub.f64 fd974, fd106, fd163; +mul.f64 fd298, fd295, fd974; +mul.f64 fd299, fd297, fd975; +mul.f64 fd300, fd295, fd975; +ld.global.v2.f64 {fd301, fd302}, [rd5+512]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd972, fd238, fd301; +mul.f64 fd973, fd239, fd302; +sub.f64 fd310, fd972, fd973; +mul.f64 fd971, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd970, fd238, fd310; +sub.f64 fd318, fd970, fd317; +mul.f64 fd969, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd967, fd238, fd318; +mul.f64 fd968, fd239, fd320; +sub.f64 fd326, fd967, fd968; +mul.f64 fd966, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd964, fd238, fd326; +mul.f64 fd965, fd239, fd328; +sub.f64 fd334, fd964, fd965; +mul.f64 fd963, fd326, fd224; +mul.f64 fd335, fd238, fd328; 
+fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd962, fd238, fd334; +sub.f64 fd342, fd962, fd341; +mul.f64 fd961, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd959, fd238, fd342; +mul.f64 fd960, fd239, fd344; +sub.f64 fd350, fd959, fd960; +mul.f64 fd958, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd957, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r39, %tid.x; +shl.b32 r31, r39, 8; +barrier.sync 0; +and.b32 r11, r31, 7936; +add.s32 r12, r9, r11; +sub.f64 fd1050, fd1029, fd1011; +mul.f64 fd1049, fd297, fd1050; +add.f64 fd356, fd1029, fd1011; +mov.u32 r38, %tid.x; +shl.b32 r30, r38, 8; +and.b32 r23, r30, 7936; +add.s32 r22, r9, r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r44, %tid.x; +shl.b32 r36, r44, 8; +shl.b32 r28, r44, 4; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd958, fd243; +st.shared.v2.f64 [r22+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd987, fd251; +st.shared.v2.f64 [r22+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd985, fd259; +st.shared.v2.f64 [r22+48], {fd363, fd362}; +sub.f64 fd364, fd983, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r22+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd980, fd275; +st.shared.v2.f64 [r22+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd978, fd283; +st.shared.v2.f64 [r22+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd976, fd291; +st.shared.v2.f64 [r22+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd974, fd300; +sub.f64 fd373, fd298, fd1049; +st.shared.v2.f64 [r22+128], 
{fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd971, fd306; +st.shared.v2.f64 [r22+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd969, fd314; +st.shared.v2.f64 [r22+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd966, fd322; +st.shared.v2.f64 [r22+176], {fd379, fd378}; +sub.f64 fd380, fd963, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r22+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd961, fd338; +st.shared.v2.f64 [r22+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd957, fd346; +st.shared.v2.f64 [r22+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r22+240], {fd387, fd386}; +barrier.sync 0; +and.b32 r20, r44, 31; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+512]; +ld.shared.v2.f64 {fd396, fd397}, [r13+1024]; +ld.shared.v2.f64 {fd400, fd401}, [r13+1536]; +ld.shared.v2.f64 {fd404, fd405}, [r13+2048]; +ld.shared.v2.f64 {fd408, fd409}, [r13+2560]; +ld.shared.v2.f64 {fd412, fd413}, [r13+3072]; +ld.shared.v2.f64 {fd416, fd417}, [r13+3584]; +ld.shared.v2.f64 {fd420, fd421}, [r13+4096]; +ld.shared.v2.f64 {fd424, fd425}, [r13+4608]; +ld.shared.v2.f64 {fd428, fd429}, [r13+5120]; +ld.shared.v2.f64 {fd432, fd433}, [r13+5632]; +ld.shared.v2.f64 {fd436, fd437}, [r13+6144]; +ld.shared.v2.f64 {fd440, fd441}, [r13+6656]; +ld.shared.v2.f64 {fd444, fd445}, [r13+7168]; +ld.shared.v2.f64 {fd448, fd449}, [r13+7680]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd956, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd955, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd954, fd956, fd955; +sub.f64 fd463, fd956, fd955; +add.f64 fd464, 
fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd953, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd952, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd951, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd950, fd952, fd951; +sub.f64 fd479, fd952, fd951; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd949, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd949, 0dBFE6A09E667F3BCD; +mul.f64 fd948, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd948, fd485; +mul.f64 fd487, fd949, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd947, fd954, fd950; +sub.f64 fd496, fd954, fd950; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd946, fd953, fd488; +sub.f64 fd500, fd953, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd945, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd944, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd943, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd942, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd941, fd943, fd942; +sub.f64 fd520, fd943, fd942; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd940, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd939, fd401, fd433; 
+sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd938, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd937, fd939, fd938; +sub.f64 fd536, fd939, fd938; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd936, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd936, 0dBFE6A09E667F3BCD; +mul.f64 fd935, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd935, fd542; +mul.f64 fd544, fd936, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; +sub.f64 fd548, fd546, fd547; +add.f64 fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd934, fd941, fd937; +sub.f64 fd553, fd941, fd937; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd933, fd940, fd545; +sub.f64 fd557, fd940, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, fd519, fd536; +sub.f64 fd932, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd931, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd929, fd554, 0d3FED906BCF328D46; +mul.f64 fd930, fd933, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd929, fd930; +mul.f64 fd569, fd933, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd932, 0dBFE6A09E667F3BCD; +mul.f64 fd928, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd928, fd572; +mul.f64 fd574, fd932, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd931, 0dBFED906BCF328D46; +mul.f64 fd927, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd927, fd577; +mul.f64 fd579, fd931, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd926, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd926, fd582; +mul.f64 fd584, 
fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; +mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd925, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd925, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd924, fd946, fd570; +sub.f64 fd600, fd946, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd923, fd945, fd575; +sub.f64 fd604, fd945, fd575; +add.f64 fd605, fd505, fd578; +sub.f64 fd607, fd505, fd578; +add.f64 fd922, fd944, fd580; +sub.f64 fd608, fd944, fd580; +add.f64 fd609, fd495, fd553; +sub.f64 fd611, fd495, fd553; +sub.f64 fd921, fd496, fd552; +add.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd583; +sub.f64 fd615, fd499, fd583; +add.f64 fd920, fd500, fd585; +sub.f64 fd616, fd500, fd585; +add.f64 fd617, fd503, fd588; +sub.f64 fd619, fd503, fd588; +add.f64 fd919, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd918, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r44, 16; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd630, fd626, fd924; +mul.f64 fd631, fd625, fd924; +mul.f64 fd633, fd626, fd626; +mul.f64 fd917, fd625, fd625; +sub.f64 fd634, fd917, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd638, fd636, fd923; +mul.f64 fd639, fd634, fd923; +mul.f64 fd915, fd625, fd634; +mul.f64 fd916, fd626, fd636; +sub.f64 fd642, fd915, fd916; +mul.f64 fd914, fd634, fd601; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd646, fd644, fd922; +mul.f64 fd647, fd642, fd922; +mul.f64 fd649, fd626, fd644; +mul.f64 fd913, fd625, fd642; 
+sub.f64 fd650, fd913, fd649; +mul.f64 fd912, fd642, fd605; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd654, fd652, fd921; +mul.f64 fd655, fd650, fd921; +mul.f64 fd910, fd625, fd650; +mul.f64 fd911, fd626, fd652; +sub.f64 fd658, fd910, fd911; +mul.f64 fd909, fd650, fd609; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd662, fd660, fd920; +mul.f64 fd663, fd658, fd920; +mul.f64 fd907, fd625, fd658; +mul.f64 fd908, fd626, fd660; +sub.f64 fd666, fd907, fd908; +mul.f64 fd906, fd658, fd613; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd670, fd668, fd919; +mul.f64 fd671, fd666, fd919; +mul.f64 fd673, fd626, fd668; +mul.f64 fd905, fd625, fd666; +sub.f64 fd674, fd905, fd673; +mul.f64 fd904, fd666, fd617; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd678, fd676, fd918; +mul.f64 fd679, fd674, fd918; +mul.f64 fd902, fd625, fd674; +mul.f64 fd903, fd626, fd676; +sub.f64 fd682, fd902, fd903; +mul.f64 fd901, fd674, fd621; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd900, fd947, fd934; +sub.f64 fd899, fd493, fd550; +mul.f64 fd685, fd682, fd899; +mul.f64 fd686, fd684, fd900; +mul.f64 fd687, fd682, fd900; +ld.global.v2.f64 {fd688, fd689}, [rd8+32]; +mul.f64 fd693, fd689, fd600; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd898, fd625, fd688; +sub.f64 fd697, fd898, fd696; +mul.f64 fd897, fd688, fd599; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd701, fd699, fd604; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd896, fd625, fd697; +sub.f64 fd705, fd896, fd704; +mul.f64 fd895, fd697, fd603; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd709, fd707, fd608; +mul.f64 fd710, fd705, fd608; +mul.f64 fd893, fd625, fd705; +mul.f64 fd894, fd626, fd707; +sub.f64 fd713, fd893, fd894; +mul.f64 fd892, fd705, 
fd607; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd717, fd715, fd612; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd891, fd625, fd713; +sub.f64 fd721, fd891, fd720; +mul.f64 fd890, fd713, fd611; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd725, fd723, fd616; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd889, fd625, fd721; +sub.f64 fd729, fd889, fd728; +mul.f64 fd888, fd721, fd615; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd733, fd731, fd620; +mul.f64 fd734, fd729, fd620; +mul.f64 fd886, fd625, fd729; +mul.f64 fd887, fd626, fd731; +sub.f64 fd737, fd886, fd887; +mul.f64 fd885, fd625, fd597; +mul.f64 fd738, fd625, fd731; +mul.f64 fd884, fd729, fd619; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd737, fd623; +mul.f64 fd741, fd739, fd624; +mul.f64 fd742, fd737, fd624; +sub.f64 fd1045, fd947, fd934; +mul.f64 fd1044, fd684, fd1045; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 4; +and.b32 r15, r34, 240; +add.s32 r16, r9, r15; +sub.f64 fd1047, fd947, fd934; +mul.f64 fd1046, fd684, fd1047; +barrier.sync 0; +and.b32 r17, r36, 4096; +add.s32 r18, r16, r17; +mov.u32 r27, %tid.x; +and.b32 r26, r27, 16; +add.f64 fd743, fd947, fd934; +sub.f64 fd1051, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r33, %tid.x; +and.b32 r32, r33, 16; +fma.rn.f64 fd745, fd626, fd597, fd631; +sub.f64 fd746, fd885, fd630; +st.shared.v2.f64 [r18+256], {fd746, fd745}; +fma.rn.f64 fd747, fd636, fd601, fd639; +sub.f64 fd748, fd914, fd638; +st.shared.v2.f64 [r18+512], {fd748, fd747}; +fma.rn.f64 fd749, fd644, fd605, fd647; +sub.f64 fd750, fd912, fd646; +st.shared.v2.f64 [r18+768], {fd750, fd749}; +fma.rn.f64 fd751, fd652, fd609, fd655; +sub.f64 fd752, fd909, fd654; +st.shared.v2.f64 [r18+1024], {fd752, fd751}; +sub.f64 fd753, fd906, fd662; +fma.rn.f64 fd754, fd660, fd613, fd663; 
+st.shared.v2.f64 [r18+1280], {fd753, fd754}; +fma.rn.f64 fd755, fd668, fd617, fd671; +sub.f64 fd756, fd904, fd670; +st.shared.v2.f64 [r18+1536], {fd756, fd755}; +fma.rn.f64 fd757, fd676, fd621, fd679; +sub.f64 fd758, fd901, fd678; +st.shared.v2.f64 [r18+1792], {fd758, fd757}; +fma.rn.f64 fd759, fd684, fd1051, fd687; +sub.f64 fd760, fd685, fd1046; +st.shared.v2.f64 [r18+2048], {fd760, fd759}; +fma.rn.f64 fd761, fd689, fd599, fd694; +sub.f64 fd762, fd897, fd693; +st.shared.v2.f64 [r18+2304], {fd762, fd761}; +fma.rn.f64 fd763, fd699, fd603, fd702; +sub.f64 fd764, fd895, fd701; +st.shared.v2.f64 [r18+2560], {fd764, fd763}; +fma.rn.f64 fd765, fd707, fd607, fd710; +sub.f64 fd766, fd892, fd709; +st.shared.v2.f64 [r18+2816], {fd766, fd765}; +fma.rn.f64 fd767, fd715, fd611, fd718; +sub.f64 fd768, fd890, fd717; +st.shared.v2.f64 [r18+3072], {fd768, fd767}; +sub.f64 fd769, fd888, fd725; +fma.rn.f64 fd770, fd723, fd615, fd726; +st.shared.v2.f64 [r18+3328], {fd769, fd770}; +fma.rn.f64 fd771, fd731, fd619, fd734; +sub.f64 fd772, fd884, fd733; +st.shared.v2.f64 [r18+3584], {fd772, fd771}; +fma.rn.f64 fd773, fd739, fd623, fd742; +sub.f64 fd774, fd740, fd741; +st.shared.v2.f64 [r18+3840], {fd774, fd773}; +barrier.sync 0; +mad.lo.s32 r19, r32, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+512]; +ld.shared.v2.f64 {fd783, fd784}, [r19+1024]; +ld.shared.v2.f64 {fd787, fd788}, [r19+1536]; +ld.shared.v2.f64 {fd791, fd792}, [r19+2048]; +ld.shared.v2.f64 {fd795, fd796}, [r19+2560]; +ld.shared.v2.f64 {fd799, fd800}, [r19+3072]; +ld.shared.v2.f64 {fd803, fd804}, [r19+3584]; +ld.shared.v2.f64 {fd807, fd808}, [r19+4096]; +ld.shared.v2.f64 {fd811, fd812}, [r19+4608]; +ld.shared.v2.f64 {fd815, fd816}, [r19+5120]; +ld.shared.v2.f64 {fd819, fd820}, [r19+5632]; +ld.shared.v2.f64 {fd823, fd824}, [r19+6144]; +ld.shared.v2.f64 {fd827, fd828}, [r19+6656]; +ld.shared.v2.f64 {fd831, fd832}, [r19+7168]; +ld.shared.v2.f64 {fd835, fd836}, [r19+7680]; +add.f64 
%1, fd776, fd808; +add.f64 %0, fd775, fd807; +add.f64 %3, fd780, fd812; +add.f64 %2, fd779, fd811; +add.f64 %5, fd784, fd816; +add.f64 %4, fd783, fd815; +add.f64 %6, fd787, fd819; +add.f64 %7, fd788, fd820; +add.f64 %8, fd791, fd823; +add.f64 %9, fd792, fd824; +add.f64 %10, fd795, fd827; +add.f64 %11, fd796, fd828; +add.f64 %13, fd800, fd832; +add.f64 %12, fd799, fd831; +add.f64 %15, fd804, fd836; +add.f64 %14, fd803, fd835; +sub.f64 %17, fd776, fd808; +sub.f64 %16, fd775, fd807; +sub.f64 %19, fd780, fd812; +sub.f64 %18, fd779, fd811; +sub.f64 %21, fd784, fd816; +sub.f64 %20, fd783, fd815; +sub.f64 %23, fd788, fd820; +sub.f64 %22, fd787, fd819; +sub.f64 %25, fd792, fd824; +sub.f64 %24, fd791, fd823; +sub.f64 %27, fd796, fd828; +sub.f64 %26, fd795, fd827; +sub.f64 %29, fd800, fd832; +sub.f64 %28, fd799, fd831; +sub.f64 %31, fd804, fd836; +sub.f64 %30, fd803, fd835; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), 
"d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<471, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<807>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; 
+add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; 
+mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 
fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, 
fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+512]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, 
fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 3968; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+256]; +ld.shared.f64 fd390, [r13+512]; +ld.shared.f64 fd391, [r13+768]; +ld.shared.f64 fd392, [r13+1024]; +ld.shared.f64 fd393, [r13+1280]; +ld.shared.f64 fd394, [r13+1536]; +ld.shared.f64 fd395, [r13+1792]; +ld.shared.f64 fd396, [r13+2048]; +ld.shared.f64 fd397, [r13+2304]; +ld.shared.f64 fd398, [r13+2560]; +ld.shared.f64 fd399, [r13+2816]; +ld.shared.f64 fd400, [r13+3072]; +ld.shared.f64 fd401, [r13+3328]; +ld.shared.f64 fd402, [r13+3584]; +ld.shared.f64 fd403, [r13+3840]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], 
{fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+256]; +ld.shared.f64 fd406, [r13+512]; +ld.shared.f64 fd407, [r13+768]; +ld.shared.f64 fd408, [r13+1024]; +ld.shared.f64 fd409, [r13+1280]; +ld.shared.f64 fd410, [r13+1536]; +ld.shared.f64 fd411, [r13+1792]; +ld.shared.f64 fd412, [r13+2048]; +ld.shared.f64 fd413, [r13+2304]; +ld.shared.f64 fd414, [r13+2560]; +ld.shared.f64 fd415, [r13+2816]; +ld.shared.f64 fd416, [r13+3072]; +ld.shared.f64 fd417, [r13+3328]; +ld.shared.f64 fd418, [r13+3584]; +ld.shared.f64 fd419, [r13+3840]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 
0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; +sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; 
+add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 
fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 16; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; 
+sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+32]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; 
+mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 2048; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+256]; +ld.shared.f64 fd745, 
[r20+512]; +ld.shared.f64 fd746, [r20+768]; +ld.shared.f64 fd747, [r20+1024]; +ld.shared.f64 fd748, [r20+1280]; +ld.shared.f64 fd749, [r20+1536]; +ld.shared.f64 fd750, [r20+1792]; +ld.shared.f64 fd751, [r20+2048]; +ld.shared.f64 fd752, [r20+2304]; +ld.shared.f64 fd753, [r20+2560]; +ld.shared.f64 fd754, [r20+2816]; +ld.shared.f64 fd755, [r20+3072]; +ld.shared.f64 fd756, [r20+3328]; +ld.shared.f64 fd757, [r20+3584]; +ld.shared.f64 fd758, [r20+3840]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+256]; +ld.shared.f64 fd761, [r20+512]; +ld.shared.f64 fd762, [r20+768]; +ld.shared.f64 fd763, [r20+1024]; +ld.shared.f64 fd764, [r20+1280]; +ld.shared.f64 fd765, [r20+1536]; +ld.shared.f64 fd766, [r20+1792]; +ld.shared.f64 fd767, [r20+2048]; +ld.shared.f64 fd768, [r20+2304]; +ld.shared.f64 fd769, [r20+2560]; +ld.shared.f64 fd770, [r20+2816]; +ld.shared.f64 fd771, [r20+3072]; +ld.shared.f64 fd772, [r20+3328]; +ld.shared.f64 fd773, [r20+3584]; +ld.shared.f64 fd774, [r20+3840]; +add.f64 %0, fd743, fd751; +add.f64 %1, fd759, fd767; +add.f64 %2, fd744, fd752; +add.f64 %3, fd760, fd768; +add.f64 %4, fd745, fd753; +add.f64 %5, fd761, fd769; +add.f64 %6, fd746, fd754; +add.f64 %7, fd762, fd770; +add.f64 %8, fd747, fd755; +add.f64 %9, fd763, fd771; +add.f64 %10, fd748, fd756; +add.f64 %11, fd764, fd772; +add.f64 %12, fd749, fd757; +add.f64 %13, fd765, fd773; +add.f64 %14, fd750, fd758; +add.f64 %15, fd766, fd774; 
+sub.f64 %16, fd743, fd751; +sub.f64 %17, fd759, fd767; +sub.f64 %18, fd744, fd752; +sub.f64 %19, fd760, fd768; +sub.f64 %20, fd745, fd753; +sub.f64 %21, fd761, fd769; +sub.f64 %22, fd746, fd754; +sub.f64 %23, fd762, fd770; +sub.f64 %24, fd747, fd755; +sub.f64 %25, fd763, fd771; +sub.f64 %26, fd748, fd756; +sub.f64 %27, fd764, fd772; +sub.f64 %28, fd749, fd757; +sub.f64 %29, fd765, fd773; +sub.f64 %30, fd750, fd758; +sub.f64 %31, fd766, fd774; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<473, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<265>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; 
+shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+2048]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 8128; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+2048]; +ld.shared.v2.f64 {fd69, fd70}, [r13+4096]; +ld.shared.v2.f64 {fd73, fd74}, [r13+6144]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 
fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+512]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 7936; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+2048]; +ld.shared.v2.f64 {fd129, fd130}, [r20+4096]; +ld.shared.v2.f64 {fd133, fd134}, [r20+6144]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd144; +sub.f64 fd148, fd140, fd143; +sub.f64 fd149, fd139, fd144; +add.f64 fd150, fd140, fd143; +and.b32 r21, r5, 
112; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd151, fd147; +mul.f64 fd156, fd152, fd148; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd145; +mul.f64 fd164, fd162, fd146; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+128]; +mul.f64 fd170, fd166, fd149; +mul.f64 fd171, fd167, fd150; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 7168; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd152, fd147, fd157; +sub.f64 fd176, fd155, fd156; +st.shared.v2.f64 [r25+256], {fd176, fd175}; +fma.rn.f64 fd177, fd162, fd145, fd165; +sub.f64 fd178, fd163, fd164; +st.shared.v2.f64 [r25+512], {fd178, fd177}; +fma.rn.f64 fd179, fd167, fd149, fd172; +sub.f64 fd180, fd170, fd171; +st.shared.v2.f64 [r25+768], {fd180, fd179}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+2048]; +ld.shared.v2.f64 {fd189, fd190}, [r26+4096]; +ld.shared.v2.f64 {fd193, fd194}, [r26+6144]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +add.f64 fd207, fd199, fd204; +sub.f64 fd208, fd200, fd203; +sub.f64 fd209, fd199, fd204; +add.f64 fd210, fd200, fd203; +and.b32 r27, r5, 64; +bfe.u32 r28, r5, 6, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd211, fd207; 
+mul.f64 fd216, fd212, fd208; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd220, fd205; +mul.f64 fd224, fd222, fd206; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+32]; +mul.f64 fd230, fd226, fd209; +mul.f64 fd231, fd227, fd210; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 4096; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd212, fd207, fd217; +sub.f64 fd236, fd215, fd216; +st.shared.v2.f64 [r32+1024], {fd236, fd235}; +fma.rn.f64 fd237, fd222, fd205, fd225; +sub.f64 fd238, fd223, fd224; +st.shared.v2.f64 [r32+2048], {fd238, fd237}; +fma.rn.f64 fd239, fd227, fd209, fd232; +sub.f64 fd240, fd230, fd231; +st.shared.v2.f64 [r32+3072], {fd240, fd239}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+2048]; +ld.shared.v2.f64 {fd249, fd250}, [r33+4096]; +ld.shared.v2.f64 {fd253, fd254}, [r33+6144]; +add.f64 %1, fd242, fd250; +add.f64 %0, fd241, fd249; +add.f64 %3, fd246, fd254; +add.f64 %2, fd245, fd253; +sub.f64 %5, fd242, fd250; +sub.f64 %4, fd241, fd249; +sub.f64 %7, fd246, fd254; +sub.f64 %6, fd245, fd253; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<472, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm 
volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<233>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+2048]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4064; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+1024]; +ld.shared.f64 fd63, [r13+2048]; +ld.shared.f64 fd64, [r13+3072]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+1024]; +ld.shared.f64 fd67, [r13+2048]; +ld.shared.f64 fd68, 
[r13+3072]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+512]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3968; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+1024]; +ld.shared.f64 fd115, [r21+2048]; +ld.shared.f64 fd116, [r21+3072]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+1024]; +ld.shared.f64 fd119, [r21+2048]; +ld.shared.f64 fd120, [r21+3072]; +add.f64 fd121, fd113, fd115; 
+add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +add.f64 fd133, fd123, fd128; +sub.f64 fd134, fd124, fd127; +sub.f64 fd135, fd123, fd128; +add.f64 fd136, fd124, fd127; +and.b32 r22, r5, 112; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd137, fd133; +mul.f64 fd142, fd138, fd134; +sub.f64 fd143, fd141, fd142; +mul.f64 fd144, fd137, fd134; +fma.rn.f64 fd145, fd138, fd133, fd144; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd148, fd131; +mul.f64 fd152, fd150, fd132; +sub.f64 fd153, fd151, fd152; +mul.f64 fd154, fd148, fd132; +fma.rn.f64 fd155, fd150, fd131, fd154; +ld.global.v2.f64 {fd156, fd157}, [rd11+128]; +mul.f64 fd160, fd156, fd135; +mul.f64 fd161, fd157, fd136; +sub.f64 fd162, fd160, fd161; +mul.f64 fd163, fd156, fd136; +fma.rn.f64 fd164, fd157, fd135, fd163; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 3584; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd143; +st.shared.f64 [r26+256], fd153; +st.shared.f64 [r26+384], fd162; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+1024]; +ld.shared.f64 fd167, [r27+2048]; +ld.shared.f64 fd168, [r27+3072]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+1024]; +ld.shared.f64 fd171, [r27+2048]; +ld.shared.f64 fd172, [r27+3072]; +add.f64 fd173, 
fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +add.f64 fd185, fd175, fd180; +sub.f64 fd186, fd176, fd179; +sub.f64 fd187, fd175, fd180; +add.f64 fd188, fd176, fd179; +and.b32 r28, r5, 64; +bfe.u32 r29, r5, 6, 1; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd189, fd185; +mul.f64 fd194, fd190, fd186; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd186; +fma.rn.f64 fd197, fd190, fd185, fd196; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; +mul.f64 fd203, fd200, fd183; +mul.f64 fd204, fd202, fd184; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd200, fd184; +fma.rn.f64 fd207, fd202, fd183, fd206; +ld.global.v2.f64 {fd208, fd209}, [rd14+32]; +mul.f64 fd212, fd208, fd187; +mul.f64 fd213, fd209, fd188; +sub.f64 fd214, fd212, fd213; +mul.f64 fd215, fd208, fd188; +fma.rn.f64 fd216, fd209, fd187, fd215; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 2048; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd195; +st.shared.f64 [r33+1024], fd205; +st.shared.f64 [r33+1536], fd214; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+1024]; +ld.shared.f64 fd219, [r34+2048]; +ld.shared.f64 fd220, [r34+3072]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+1024]; +ld.shared.f64 fd223, [r34+2048]; 
+ld.shared.f64 fd224, [r34+3072]; +add.f64 %0, fd217, fd219; +add.f64 %1, fd221, fd223; +add.f64 %2, fd218, fd220; +add.f64 %3, fd222, fd224; +sub.f64 %4, fd217, fd219; +sub.f64 %5, fd221, fd223; +sub.f64 %6, fd218, fd220; +sub.f64 %7, fd222, fd224; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<475, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<63>; +.reg .f64 fd<181>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %13, %15; +sub.f64 fd10, %14, %16; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 8160; +add.s32 r11, r8, r10; +add.f64 fd18, %14, %16; +add.f64 fd19, %13, %15; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 4080; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+4096]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 
16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8128; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 4064; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+4096]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8064; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 4032; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+4096]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 5; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 7936; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 3968; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+4096]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 240; +cvt.u64.u32 rd15, r35; 
+mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7680; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 3840; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+4096]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd116, fd114; +mul.f64 fd121, fd117, fd115; +mul.f64 fd122, fd116, fd115; +and.b32 r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7168; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd117, fd114, fd122; +sub.f64 fd126, fd120, fd121; +st.shared.v2.f64 [r46+512], {fd126, fd125}; +barrier.sync 0; +and.b32 r47, r9, 3584; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+4096]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd137, fd135; +mul.f64 fd142, fd138, fd136; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 6144; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd138, fd135, fd143; +sub.f64 fd147, fd141, fd142; 
+st.shared.v2.f64 [r53+1024], {fd147, fd146}; +barrier.sync 0; +and.b32 r54, r9, 3072; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+4096]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd158, fd156; +mul.f64 fd163, fd159, fd157; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 4096; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd159, fd156, fd164; +sub.f64 fd168, fd162, fd163; +st.shared.v2.f64 [r60+2048], {fd168, fd167}; +barrier.sync 0; +and.b32 r61, r9, 2048; +sub.s32 r62, r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, fd174}, [r62+4096]; +add.f64 %1, fd170, fd174; +add.f64 %0, fd169, fd173; +sub.f64 %3, fd170, fd174; +sub.f64 %2, fd169, fd173; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<474, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<63>; +.reg .f64 fd<149>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %13, %15; +add.f64 fd10, %14, %16; +sub.f64 fd11, %13, %15; +sub.f64 fd12, %14, %16; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, 
fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 4080; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 2040; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+2048]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+2048]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 4064; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 2032; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+2048]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+2048]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4032; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 2016; +sub.s32 r27, r25, r26; +ld.shared.f64 
fd56, [r27]; +ld.shared.f64 fd57, [r27+2048]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+2048]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 5; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3968; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 1984; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+2048]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+2048]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 240; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3840; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 1920; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+2048]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+2048]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 
r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd98, fd96; +mul.f64 fd103, fd99, fd97; +sub.f64 fd104, fd102, fd103; +mul.f64 fd105, fd98, fd97; +fma.rn.f64 fd106, fd99, fd96, fd105; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3584; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd104; +barrier.sync 0; +and.b32 r47, r11, 1792; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+2048]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+2048]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd115, fd113; +mul.f64 fd120, fd116, fd114; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd115, fd114; +fma.rn.f64 fd123, fd116, fd113, fd122; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3072; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd121; +barrier.sync 0; +and.b32 r54, r11, 1536; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+2048]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+2048]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd132, fd130; +mul.f64 fd137, fd133, fd131; +sub.f64 fd138, fd136, fd137; +mul.f64 fd139, 
fd132, fd131; +fma.rn.f64 fd140, fd133, fd130, fd139; +and.b32 r57, r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 2048; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd138; +barrier.sync 0; +and.b32 r61, r11, 1024; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+2048]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+2048]; +add.f64 %0, fd141, fd142; +add.f64 %1, fd143, fd144; +sub.f64 %2, fd141, fd142; +sub.f64 %3, fd143, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f6447123465ea --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_512_fp64_inv.hpp.inc @@ -0,0 +1,3453 @@ +#ifndef CUFFTDX_FFT_512_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_512_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<639, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<372>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, 
%24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; 
+mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+1024]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4032; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+512]; +ld.shared.f64 fd160, [r13+1024]; +ld.shared.f64 fd161, [r13+1536]; 
+ld.shared.f64 fd162, [r13+2048]; +ld.shared.f64 fd163, [r13+2560]; +ld.shared.f64 fd164, [r13+3072]; +ld.shared.f64 fd165, [r13+3584]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+512]; +ld.shared.f64 fd168, [r13+1024]; +ld.shared.f64 fd169, [r13+1536]; +ld.shared.f64 fd170, [r13+2048]; +ld.shared.f64 fd171, [r13+2560]; +ld.shared.f64 fd172, [r13+3072]; +ld.shared.f64 fd173, [r13+3584]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 
fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 56; +bfe.u32 r15, r5, 3, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+128]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, 
fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3584; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+512]; +ld.shared.f64 fd301, [r21+1024]; +ld.shared.f64 fd302, [r21+1536]; +ld.shared.f64 fd303, [r21+2048]; +ld.shared.f64 fd304, [r21+2560]; +ld.shared.f64 fd305, [r21+3072]; +ld.shared.f64 fd306, [r21+3584]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+512]; +ld.shared.f64 fd309, [r21+1024]; +ld.shared.f64 fd310, [r21+1536]; +ld.shared.f64 fd311, [r21+2048]; +ld.shared.f64 fd312, [r21+2560]; +ld.shared.f64 fd313, [r21+3072]; +ld.shared.f64 fd314, [r21+3584]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, 
fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +sub.f64 fd327, fd317, fd322; +add.f64 fd328, fd318, fd321; +add.f64 fd329, fd317, fd322; +sub.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +sub.f64 fd343, fd333, fd338; +add.f64 fd344, fd334, fd337; +add.f64 fd345, fd333, fd338; +sub.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0d3FE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +add.f64 fd350, fd347, fd348; +mul.f64 fd351, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd352, fd346, 0d3FE6A09E667F3BCD; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd346, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd355, fd345, 0d3FE6A09E667F3BCD, fd354; +add.f64 %0, fd323, fd339; +add.f64 %1, fd324, fd340; +add.f64 %3, fd328, fd350; +add.f64 %2, fd327, fd349; +add.f64 %5, fd326, fd341; +sub.f64 %4, fd325, fd342; +add.f64 %7, fd330, fd355; +add.f64 %6, fd329, fd353; +sub.f64 %8, fd323, fd339; +sub.f64 %9, fd324, fd340; +sub.f64 %11, fd328, fd350; +sub.f64 %10, fd327, fd349; +sub.f64 %13, fd326, fd341; +add.f64 %12, fd325, fd342; +sub.f64 %15, fd330, fd355; +sub.f64 %14, fd329, fd353; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_512), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), 
"d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<640, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<404>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %19, %29; +add.f64 fd34, %20, %31; +sub.f64 fd35, %19, %29; +sub.f64 fd36, %20, %31; +add.f64 fd37, %24, %35; +add.f64 fd38, %26, %36; +sub.f64 fd39, %24, %35; +sub.f64 fd40, %26, %36; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %21, %32; +add.f64 fd50, %23, %34; +sub.f64 fd51, %21, %32; +sub.f64 fd52, %23, %34; +add.f64 fd53, %27, %37; +add.f64 fd54, %28, %38; +sub.f64 fd55, %27, %37; +sub.f64 fd56, %28, %38; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 
fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 63; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 1008; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+1024]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 8064; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], 
{fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+1024]; +ld.shared.v2.f64 {fd166, fd167}, [r13+2048]; +ld.shared.v2.f64 {fd170, fd171}, [r13+3072]; +ld.shared.v2.f64 {fd174, fd175}, [r13+4096]; +ld.shared.v2.f64 {fd178, fd179}, [r13+5120]; +ld.shared.v2.f64 {fd182, fd183}, [r13+6144]; +ld.shared.v2.f64 {fd186, fd187}, [r13+7168]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, 
fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; +add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 56; +bfe.u32 r15, r5, 3, 3; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+128]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, 
fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 7168; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+1024]; +ld.shared.v2.f64 {fd323, fd324}, [r20+2048]; +ld.shared.v2.f64 {fd327, fd328}, [r20+3072]; +ld.shared.v2.f64 {fd331, fd332}, [r20+4096]; +ld.shared.v2.f64 {fd335, fd336}, [r20+5120]; +ld.shared.v2.f64 {fd339, fd340}, [r20+6144]; +ld.shared.v2.f64 {fd343, fd344}, [r20+7168]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, 
fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +sub.f64 fd359, fd349, fd354; +add.f64 fd360, fd350, fd353; +add.f64 fd361, fd349, fd354; +sub.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +sub.f64 fd375, fd365, fd370; +add.f64 fd376, fd366, fd369; +add.f64 fd377, fd365, fd370; +sub.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0d3FE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +add.f64 fd382, fd379, fd380; +mul.f64 fd383, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd384, fd378, 0d3FE6A09E667F3BCD; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd378, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd387, fd377, 0d3FE6A09E667F3BCD, fd386; +add.f64 %1, fd356, fd372; +add.f64 %0, fd355, fd371; +add.f64 %3, fd360, fd382; +add.f64 %2, fd359, fd381; +add.f64 %5, fd358, fd373; +sub.f64 %4, fd357, fd374; +add.f64 %7, fd362, fd387; +add.f64 %6, fd361, fd385; +sub.f64 %9, fd356, fd372; +sub.f64 %8, fd355, fd371; +sub.f64 %11, fd360, fd382; +sub.f64 %10, fd359, fd381; +sub.f64 %13, fd358, fd373; +add.f64 %12, fd357, fd374; +sub.f64 %15, fd362, fd387; +sub.f64 %14, fd361, fd385; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_512), 
"l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<641, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<1054>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %35, %51; +sub.f64 fd67, %35, %51; +add.f64 fd1045, %36, %67; +sub.f64 fd68, %36, %67; +add.f64 fd69, %43, %59; +sub.f64 fd71, %43, %59; +add.f64 fd1043, %68, %60; +sub.f64 fd72, %68, %60; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1042, fd1045, fd1043; +sub.f64 fd76, fd1045, fd1043; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd1041, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %55; +sub.f64 fd83, %39, %55; +add.f64 fd1038, %70, %69; +sub.f64 fd84, %70, %69; +add.f64 fd85, %47, %63; +sub.f64 fd87, %47, %63; +add.f64 fd1036, %48, %71; +sub.f64 fd88, %48, %71; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1035, fd1038, fd1036; +sub.f64 fd92, fd1038, fd1036; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd1034, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd1034, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd1032, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd1033, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd1032, fd1033; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1031, fd1042, fd1035; +sub.f64 fd109, fd1042, fd1035; +add.f64 fd110, 
fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1030, fd1041, fd100; +sub.f64 fd113, fd1041, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd1029, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd1028, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %53; +sub.f64 fd124, %37, %53; +add.f64 fd1026, %72, %54; +sub.f64 fd125, %72, %54; +add.f64 fd126, %45, %61; +sub.f64 fd128, %45, %61; +add.f64 fd1023, %73, %74; +sub.f64 fd129, %73, %74; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1022, fd1026, fd1023; +sub.f64 fd133, fd1026, fd1023; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd1021, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %41, %57; +sub.f64 fd140, %41, %57; +add.f64 fd1019, %42, %75; +sub.f64 fd141, %42, %75; +add.f64 fd142, %49, %65; +sub.f64 fd144, %49, %65; +add.f64 fd1017, %76, %66; +sub.f64 fd145, %76, %66; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1016, fd1019, fd1017; +sub.f64 fd149, fd1019, fd1017; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd1015, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd1015, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd1013, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd1014, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd1013, fd1014; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1012, fd1022, fd1016; +sub.f64 fd166, fd1022, fd1016; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1011, fd1021, fd157; +sub.f64 fd170, fd1021, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd1010, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, 
fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd1009, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1007, fd167, 0d3FED906BCF328D46; +mul.f64 fd1008, fd1011, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd1007, fd1008; +mul.f64 fd182, fd1011, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd1010, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd1009, 0d3FED906BCF328D46; +mul.f64 fd1006, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd1006, fd189; +mul.f64 fd191, fd1009, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd1005, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd1005, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd1003, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd1004, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd1003, fd1004; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd1001, fd177, 0dBFED906BCF328D46; +mul.f64 fd1002, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd1001, fd1002; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1000, fd1030, fd183; +sub.f64 fd213, fd1030, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd999, fd1029, fd187; +sub.f64 fd217, fd1029, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd998, fd1028, fd192; +sub.f64 fd221, fd1028, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd997, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd996, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; 
+add.f64 fd995, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd994, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r21, %tid.x; +shl.b32 r7, r21, 8; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r21, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd1000, fd239; +mul.f64 fd244, fd238, fd1000; +mul.f64 fd246, fd239, fd239; +mul.f64 fd993, fd238, fd238; +sub.f64 fd247, fd993, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd999, fd249; +mul.f64 fd252, fd247, fd999; +mul.f64 fd991, fd238, fd247; +mul.f64 fd992, fd239, fd249; +sub.f64 fd255, fd991, fd992; +mul.f64 fd990, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd998, fd257; +mul.f64 fd260, fd255, fd998; +mul.f64 fd262, fd239, fd257; +mul.f64 fd989, fd238, fd255; +sub.f64 fd263, fd989, fd262; +mul.f64 fd988, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd997, fd265; +mul.f64 fd268, fd263, fd997; +mul.f64 fd270, fd239, fd265; +mul.f64 fd987, fd238, fd263; +sub.f64 fd271, fd987, fd270; +mul.f64 fd986, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd996, fd273; +mul.f64 fd276, fd271, fd996; +mul.f64 fd984, fd238, fd271; +mul.f64 fd985, fd239, fd273; +sub.f64 fd279, fd984, fd985; +mul.f64 fd983, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd995, fd281; +mul.f64 fd284, fd279, fd995; +mul.f64 fd286, fd239, fd281; +mul.f64 fd982, fd238, fd279; +sub.f64 fd287, fd982, fd286; +mul.f64 fd981, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd994, fd289; +mul.f64 fd292, fd287, fd994; +mul.f64 fd294, fd239, fd289; +mul.f64 fd980, 
fd238, fd287; +sub.f64 fd295, fd980, fd294; +mul.f64 fd979, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd978, fd1031, fd1012; +mul.f64 fd298, fd978, fd297; +sub.f64 fd977, fd106, fd163; +mul.f64 fd299, fd977, fd297; +mul.f64 fd300, fd295, fd978; +ld.global.v2.f64 {fd301, fd302}, [rd5+512]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd975, fd238, fd301; +mul.f64 fd976, fd239, fd302; +sub.f64 fd310, fd975, fd976; +mul.f64 fd974, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd973, fd238, fd310; +sub.f64 fd318, fd973, fd317; +mul.f64 fd972, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd970, fd238, fd318; +mul.f64 fd971, fd239, fd320; +sub.f64 fd326, fd970, fd971; +mul.f64 fd969, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd967, fd238, fd326; +mul.f64 fd968, fd239, fd328; +sub.f64 fd334, fd967, fd968; +mul.f64 fd966, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd965, fd238, fd334; +sub.f64 fd342, fd965, fd341; +mul.f64 fd964, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd962, fd238, fd342; +mul.f64 fd963, fd239, fd344; +sub.f64 fd350, fd962, fd963; +mul.f64 fd961, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd960, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 
7936; +add.s32 r12, r9, r11; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 8; +add.f64 fd356, fd1031, fd1012; +sub.f64 fd1051, fd106, fd163; +and.b32 r23, r34, 7936; +add.s32 r22, r9, r23; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r22], {fd357, fd356}; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 4; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd960; +st.shared.v2.f64 [r22+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd990; +st.shared.v2.f64 [r22+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd988; +st.shared.v2.f64 [r22+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd986; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r22+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd983; +st.shared.v2.f64 [r22+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd981; +st.shared.v2.f64 [r22+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd979; +st.shared.v2.f64 [r22+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd1051, fd298; +sub.f64 fd373, fd300, fd299; +st.shared.v2.f64 [r22+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd974; +st.shared.v2.f64 [r22+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd972; +st.shared.v2.f64 [r22+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd969; +st.shared.v2.f64 [r22+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd966; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r22+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd964; +st.shared.v2.f64 [r22+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd961; +st.shared.v2.f64 [r22+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 
[r22+240], {fd386, fd387}; +barrier.sync 0; +and.b32 r20, r33, 31; +mad.lo.s32 r13, r20, -240, r22; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+512]; +ld.shared.v2.f64 {fd396, fd397}, [r13+1024]; +ld.shared.v2.f64 {fd400, fd401}, [r13+1536]; +ld.shared.v2.f64 {fd404, fd405}, [r13+2048]; +ld.shared.v2.f64 {fd408, fd409}, [r13+2560]; +ld.shared.v2.f64 {fd412, fd413}, [r13+3072]; +ld.shared.v2.f64 {fd416, fd417}, [r13+3584]; +ld.shared.v2.f64 {fd420, fd421}, [r13+4096]; +ld.shared.v2.f64 {fd424, fd425}, [r13+4608]; +ld.shared.v2.f64 {fd428, fd429}, [r13+5120]; +ld.shared.v2.f64 {fd432, fd433}, [r13+5632]; +ld.shared.v2.f64 {fd436, fd437}, [r13+6144]; +ld.shared.v2.f64 {fd440, fd441}, [r13+6656]; +ld.shared.v2.f64 {fd444, fd445}, [r13+7168]; +ld.shared.v2.f64 {fd448, fd449}, [r13+7680]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd959, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd958, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd957, fd959, fd958; +sub.f64 fd463, fd959, fd958; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd956, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd955, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd954, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd953, fd955, fd954; +sub.f64 fd479, fd955, fd954; +sub.f64 fd480, fd470, fd475; +add.f64 fd482, fd470, fd475; +add.f64 fd952, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd952, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd951, 
fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd951, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd950, fd957, fd953; +sub.f64 fd496, fd957, fd953; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd949, fd956, fd487; +sub.f64 fd500, fd956, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd948, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd947, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd946, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd945, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd944, fd946, fd945; +sub.f64 fd520, fd946, fd945; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd943, fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd942, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd941, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd940, fd942, fd941; +sub.f64 fd536, fd942, fd941; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd939, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd939, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd938, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd938, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, 
fd517, fd533; +add.f64 fd937, fd944, fd940; +sub.f64 fd553, fd944, fd940; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd936, fd943, fd544; +sub.f64 fd557, fd943, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd935, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd934, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd932, fd554, 0d3FED906BCF328D46; +mul.f64 fd933, fd936, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd932, fd933; +mul.f64 fd569, fd936, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; +mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd935, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd930, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd931, fd934, 0d3FED906BCF328D46; +sub.f64 fd577, fd930, fd931; +mul.f64 fd578, fd934, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd928, fd556, 0dBFD87DE2A6AEA963; +mul.f64 fd929, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd928, fd929; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd926, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd927, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd926, fd927; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd925, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd925, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd924, fd949, fd570; +sub.f64 fd600, fd949, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd923, fd948, fd574; +sub.f64 fd604, fd948, fd574; +add.f64 fd605, fd505, fd577; +sub.f64 fd607, fd505, fd577; +add.f64 fd922, fd947, fd579; +sub.f64 fd608, fd947, fd579; 
+sub.f64 fd609, fd495, fd553; +add.f64 fd611, fd495, fd553; +add.f64 fd921, fd496, fd552; +sub.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd582; +sub.f64 fd615, fd499, fd582; +add.f64 fd920, fd500, fd584; +sub.f64 fd616, fd500, fd584; +add.f64 fd617, fd503, fd587; +sub.f64 fd619, fd503, fd587; +add.f64 fd919, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd918, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r33, 16; +mov.u64 rd7, %34; +cvt.u64.u32 rd9, r14; +add.s64 rd8, rd7, rd9; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd629, fd924, fd626; +mul.f64 fd631, fd625, fd924; +mul.f64 fd633, fd626, fd626; +mul.f64 fd917, fd625, fd625; +sub.f64 fd634, fd917, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd637, fd923, fd636; +mul.f64 fd639, fd634, fd923; +mul.f64 fd915, fd625, fd634; +mul.f64 fd916, fd626, fd636; +sub.f64 fd642, fd915, fd916; +mul.f64 fd914, fd601, fd636; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd645, fd922, fd644; +mul.f64 fd647, fd642, fd922; +mul.f64 fd649, fd626, fd644; +mul.f64 fd913, fd625, fd642; +sub.f64 fd650, fd913, fd649; +mul.f64 fd912, fd605, fd644; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd653, fd921, fd652; +mul.f64 fd655, fd650, fd921; +mul.f64 fd910, fd625, fd650; +mul.f64 fd911, fd626, fd652; +sub.f64 fd658, fd910, fd911; +mul.f64 fd909, fd609, fd652; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd661, fd920, fd660; +mul.f64 fd663, fd658, fd920; +mul.f64 fd907, fd625, fd658; +mul.f64 fd908, fd626, fd660; +sub.f64 fd666, fd907, fd908; +mul.f64 fd906, fd613, fd660; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd669, fd919, fd668; +mul.f64 fd671, fd666, fd919; +mul.f64 fd673, fd626, fd668; +mul.f64 fd905, fd625, fd666; +sub.f64 fd674, fd905, fd673; +mul.f64 fd904, 
fd617, fd668; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd677, fd918, fd676; +mul.f64 fd679, fd674, fd918; +mul.f64 fd902, fd625, fd674; +mul.f64 fd903, fd626, fd676; +sub.f64 fd682, fd902, fd903; +mul.f64 fd901, fd621, fd676; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd900, fd950, fd937; +mul.f64 fd685, fd900, fd684; +sub.f64 fd899, fd493, fd550; +mul.f64 fd686, fd899, fd684; +mul.f64 fd687, fd682, fd900; +ld.global.v2.f64 {fd688, fd689}, [rd8+32]; +mul.f64 fd692, fd600, fd689; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd898, fd625, fd688; +sub.f64 fd697, fd898, fd696; +mul.f64 fd897, fd599, fd689; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd700, fd604, fd699; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd896, fd625, fd697; +sub.f64 fd705, fd896, fd704; +mul.f64 fd895, fd603, fd699; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd708, fd608, fd707; +mul.f64 fd710, fd705, fd608; +mul.f64 fd893, fd625, fd705; +mul.f64 fd894, fd626, fd707; +sub.f64 fd713, fd893, fd894; +mul.f64 fd892, fd607, fd707; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd716, fd612, fd715; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd891, fd625, fd713; +sub.f64 fd721, fd891, fd720; +mul.f64 fd890, fd611, fd715; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd724, fd616, fd723; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd889, fd625, fd721; +sub.f64 fd729, fd889, fd728; +mul.f64 fd888, fd615, fd723; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd732, fd620, fd731; +mul.f64 fd734, fd729, fd620; +mul.f64 fd886, fd625, fd729; +mul.f64 fd887, fd626, fd731; +sub.f64 fd737, fd886, fd887; +mul.f64 fd885, fd619, fd731; +mul.f64 fd738, fd625, fd731; +mul.f64 
fd884, fd597, fd626; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd624, fd739; +mul.f64 fd741, fd623, fd739; +mul.f64 fd742, fd737, fd624; +and.b32 r15, r32, 240; +add.s32 r16, r9, r15; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 8; +barrier.sync 0; +and.b32 r17, r27, 4096; +add.s32 r18, r16, r17; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 16; +add.f64 fd743, fd950, fd937; +sub.f64 fd1050, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r37, %tid.x; +and.b32 r36, r37, 16; +fma.rn.f64 fd745, fd625, fd597, fd629; +sub.f64 fd746, fd631, fd884; +st.shared.v2.f64 [r18+256], {fd745, fd746}; +fma.rn.f64 fd747, fd634, fd601, fd637; +sub.f64 fd748, fd639, fd914; +st.shared.v2.f64 [r18+512], {fd747, fd748}; +fma.rn.f64 fd749, fd642, fd605, fd645; +sub.f64 fd750, fd647, fd912; +st.shared.v2.f64 [r18+768], {fd749, fd750}; +fma.rn.f64 fd751, fd650, fd609, fd653; +sub.f64 fd752, fd655, fd909; +st.shared.v2.f64 [r18+1024], {fd751, fd752}; +sub.f64 fd753, fd663, fd906; +fma.rn.f64 fd754, fd658, fd613, fd661; +st.shared.v2.f64 [r18+1280], {fd754, fd753}; +fma.rn.f64 fd755, fd666, fd617, fd669; +sub.f64 fd756, fd671, fd904; +st.shared.v2.f64 [r18+1536], {fd755, fd756}; +fma.rn.f64 fd757, fd674, fd621, fd677; +sub.f64 fd758, fd679, fd901; +st.shared.v2.f64 [r18+1792], {fd757, fd758}; +fma.rn.f64 fd759, fd682, fd1050, fd685; +sub.f64 fd760, fd687, fd686; +st.shared.v2.f64 [r18+2048], {fd759, fd760}; +fma.rn.f64 fd761, fd688, fd599, fd692; +sub.f64 fd762, fd694, fd897; +st.shared.v2.f64 [r18+2304], {fd761, fd762}; +fma.rn.f64 fd763, fd697, fd603, fd700; +sub.f64 fd764, fd702, fd895; +st.shared.v2.f64 [r18+2560], {fd763, fd764}; +fma.rn.f64 fd765, fd705, fd607, fd708; +sub.f64 fd766, fd710, fd892; +st.shared.v2.f64 [r18+2816], {fd765, fd766}; +fma.rn.f64 fd767, fd713, fd611, fd716; +sub.f64 fd768, fd718, fd890; +st.shared.v2.f64 [r18+3072], {fd767, fd768}; +sub.f64 fd769, fd726, fd888; +fma.rn.f64 fd770, fd721, fd615, fd724; 
+st.shared.v2.f64 [r18+3328], {fd770, fd769}; +fma.rn.f64 fd771, fd729, fd619, fd732; +sub.f64 fd772, fd734, fd885; +st.shared.v2.f64 [r18+3584], {fd771, fd772}; +fma.rn.f64 fd773, fd737, fd623, fd740; +sub.f64 fd774, fd742, fd741; +st.shared.v2.f64 [r18+3840], {fd773, fd774}; +barrier.sync 0; +mad.lo.s32 r19, r36, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+512]; +ld.shared.v2.f64 {fd783, fd784}, [r19+1024]; +ld.shared.v2.f64 {fd787, fd788}, [r19+1536]; +ld.shared.v2.f64 {fd791, fd792}, [r19+2048]; +ld.shared.v2.f64 {fd795, fd796}, [r19+2560]; +ld.shared.v2.f64 {fd799, fd800}, [r19+3072]; +ld.shared.v2.f64 {fd803, fd804}, [r19+3584]; +ld.shared.v2.f64 {fd807, fd808}, [r19+4096]; +ld.shared.v2.f64 {fd811, fd812}, [r19+4608]; +ld.shared.v2.f64 {fd815, fd816}, [r19+5120]; +ld.shared.v2.f64 {fd819, fd820}, [r19+5632]; +ld.shared.v2.f64 {fd823, fd824}, [r19+6144]; +ld.shared.v2.f64 {fd827, fd828}, [r19+6656]; +ld.shared.v2.f64 {fd831, fd832}, [r19+7168]; +ld.shared.v2.f64 {fd835, fd836}, [r19+7680]; +add.f64 %1, fd776, fd808; +add.f64 %0, fd775, fd807; +add.f64 %3, fd780, fd812; +add.f64 %2, fd779, fd811; +add.f64 %5, fd784, fd816; +add.f64 %4, fd783, fd815; +add.f64 %6, fd787, fd819; +add.f64 %7, fd788, fd820; +add.f64 %8, fd791, fd823; +add.f64 %9, fd792, fd824; +add.f64 %10, fd795, fd827; +add.f64 %11, fd796, fd828; +add.f64 %13, fd800, fd832; +add.f64 %12, fd799, fd831; +add.f64 %15, fd804, fd836; +add.f64 %14, fd803, fd835; +sub.f64 %17, fd776, fd808; +sub.f64 %16, fd775, fd807; +sub.f64 %19, fd780, fd812; +sub.f64 %18, fd779, fd811; +sub.f64 %21, fd784, fd816; +sub.f64 %20, fd783, fd815; +sub.f64 %23, fd788, fd820; +sub.f64 %22, fd787, fd819; +sub.f64 %25, fd792, fd824; +sub.f64 %24, fd791, fd823; +sub.f64 %27, fd796, fd828; +sub.f64 %26, fd795, fd827; +sub.f64 %29, fd800, fd832; +sub.f64 %28, fd799, fd831; +sub.f64 %31, fd804, fd836; +sub.f64 %30, fd803, fd835; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), 
"=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<642, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<807>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %35, %56; +add.f64 fd66, %36, %58; +sub.f64 fd67, %35, %56; +sub.f64 fd68, %36, %58; +add.f64 fd69, %45, %67; +add.f64 fd70, %47, %68; +sub.f64 fd71, %45, %67; +sub.f64 fd72, %47, %68; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, 
fd71; +add.f64 fd81, %40, %61; +add.f64 fd82, %42, %63; +sub.f64 fd83, %40, %61; +sub.f64 fd84, %42, %63; +add.f64 fd85, %51, %72; +add.f64 fd86, %52, %74; +sub.f64 fd87, %51, %72; +sub.f64 fd88, %52, %74; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %37, %59; +add.f64 fd123, %39, %60; +sub.f64 fd124, %37, %59; +sub.f64 fd125, %39, %60; +add.f64 fd126, %48, %69; +add.f64 fd127, %50, %71; +sub.f64 fd128, %48, %69; +sub.f64 fd129, %50, %71; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %43, %64; +add.f64 fd139, %44, %66; +sub.f64 fd140, %43, %64; +sub.f64 fd141, %44, %66; +add.f64 fd142, %53, %75; +add.f64 fd143, %55, %76; +sub.f64 fd144, %53, %75; +sub.f64 fd145, %55, %76; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; 
+sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; 
+mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 31; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, 
fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+512]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; 
+mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 3968; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, 
fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+256]; +ld.shared.f64 fd390, [r13+512]; +ld.shared.f64 fd391, [r13+768]; +ld.shared.f64 fd392, [r13+1024]; +ld.shared.f64 fd393, [r13+1280]; +ld.shared.f64 fd394, [r13+1536]; +ld.shared.f64 fd395, [r13+1792]; +ld.shared.f64 fd396, [r13+2048]; +ld.shared.f64 fd397, [r13+2304]; +ld.shared.f64 fd398, [r13+2560]; +ld.shared.f64 fd399, [r13+2816]; +ld.shared.f64 fd400, [r13+3072]; +ld.shared.f64 fd401, [r13+3328]; +ld.shared.f64 fd402, [r13+3584]; +ld.shared.f64 fd403, [r13+3840]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+256]; +ld.shared.f64 fd406, [r13+512]; +ld.shared.f64 fd407, [r13+768]; +ld.shared.f64 fd408, [r13+1024]; +ld.shared.f64 fd409, [r13+1280]; +ld.shared.f64 fd410, [r13+1536]; +ld.shared.f64 fd411, [r13+1792]; +ld.shared.f64 fd412, [r13+2048]; +ld.shared.f64 fd413, [r13+2304]; +ld.shared.f64 fd414, [r13+2560]; +ld.shared.f64 fd415, [r13+2816]; +ld.shared.f64 fd416, [r13+3072]; +ld.shared.f64 fd417, [r13+3328]; +ld.shared.f64 fd418, [r13+3584]; +ld.shared.f64 fd419, [r13+3840]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, 
fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 
fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; +sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 0d3FED906BCF328D46, fd546; 
+mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 16; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, 
fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd566, fd668; 
+fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+32]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; 
+sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 2048; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; +st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+256]; +ld.shared.f64 fd745, [r20+512]; +ld.shared.f64 fd746, [r20+768]; +ld.shared.f64 fd747, [r20+1024]; +ld.shared.f64 fd748, [r20+1280]; +ld.shared.f64 fd749, [r20+1536]; +ld.shared.f64 fd750, [r20+1792]; +ld.shared.f64 fd751, [r20+2048]; +ld.shared.f64 fd752, [r20+2304]; +ld.shared.f64 fd753, [r20+2560]; +ld.shared.f64 fd754, [r20+2816]; +ld.shared.f64 fd755, [r20+3072]; +ld.shared.f64 fd756, [r20+3328]; +ld.shared.f64 fd757, [r20+3584]; +ld.shared.f64 fd758, [r20+3840]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; 
+st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+256]; +ld.shared.f64 fd761, [r20+512]; +ld.shared.f64 fd762, [r20+768]; +ld.shared.f64 fd763, [r20+1024]; +ld.shared.f64 fd764, [r20+1280]; +ld.shared.f64 fd765, [r20+1536]; +ld.shared.f64 fd766, [r20+1792]; +ld.shared.f64 fd767, [r20+2048]; +ld.shared.f64 fd768, [r20+2304]; +ld.shared.f64 fd769, [r20+2560]; +ld.shared.f64 fd770, [r20+2816]; +ld.shared.f64 fd771, [r20+3072]; +ld.shared.f64 fd772, [r20+3328]; +ld.shared.f64 fd773, [r20+3584]; +ld.shared.f64 fd774, [r20+3840]; +add.f64 %0, fd743, fd751; +add.f64 %1, fd759, fd767; +add.f64 %2, fd744, fd752; +add.f64 %3, fd760, fd768; +add.f64 %4, fd745, fd753; +add.f64 %5, fd761, fd769; +add.f64 %6, fd746, fd754; +add.f64 %7, fd762, fd770; +add.f64 %8, fd747, fd755; +add.f64 %9, fd763, fd771; +add.f64 %10, fd748, fd756; +add.f64 %11, fd764, fd772; +add.f64 %12, fd749, fd757; +add.f64 %13, fd765, fd773; +add.f64 %14, fd750, fd758; +add.f64 %15, fd766, fd774; +sub.f64 %16, fd743, fd751; +sub.f64 %17, fd759, fd767; +sub.f64 %18, fd744, fd752; +sub.f64 %19, fd760, fd768; +sub.f64 %20, fd745, fd753; +sub.f64 %21, fd761, fd769; +sub.f64 %22, fd746, fd754; +sub.f64 %23, fd762, fd770; +sub.f64 %24, fd747, fd755; +sub.f64 %25, fd763, fd771; +sub.f64 %26, fd748, fd756; +sub.f64 %27, fd764, fd772; +sub.f64 %28, fd749, fd757; +sub.f64 %29, fd765, fd773; +sub.f64 %30, fd750, fd758; +sub.f64 %31, fd766, fd774; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), 
"=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<644, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<34>; +.reg .f64 fd<265>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -8192; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 
fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+2048]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 8128; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+2048]; +ld.shared.v2.f64 {fd69, fd70}, [r13+4096]; +ld.shared.v2.f64 {fd73, fd74}, [r13+6144]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+512]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 7936; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, 
fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+2048]; +ld.shared.v2.f64 {fd129, fd130}, [r20+4096]; +ld.shared.v2.f64 {fd133, fd134}, [r20+6144]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +sub.f64 fd145, fd137, fd141; +sub.f64 fd146, fd138, fd142; +sub.f64 fd147, fd139, fd144; +add.f64 fd148, fd140, fd143; +add.f64 fd149, fd139, fd144; +sub.f64 fd150, fd140, fd143; +and.b32 r21, r5, 112; +cvt.u64.u32 rd9, r21; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd151, fd152}, [rd11]; +mul.f64 fd155, fd148, fd152; +mul.f64 fd156, fd147, fd152; +mul.f64 fd157, fd151, fd148; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd146, fd162; +mul.f64 fd164, fd145, fd162; +mul.f64 fd165, fd160, fd146; +ld.global.v2.f64 {fd166, fd167}, [rd11+128]; +mul.f64 fd170, fd150, fd167; +mul.f64 fd171, fd149, fd167; +mul.f64 fd172, fd166, fd150; +and.b32 r22, r10, 240; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 7168; +add.s32 r25, r23, r24; +add.f64 fd173, fd138, fd142; +add.f64 fd174, fd137, fd141; +st.shared.v2.f64 [r25], {fd174, fd173}; +fma.rn.f64 fd175, fd151, fd147, fd155; +sub.f64 fd176, fd157, fd156; +st.shared.v2.f64 [r25+256], {fd175, fd176}; +fma.rn.f64 fd177, 
fd160, fd145, fd163; +sub.f64 fd178, fd165, fd164; +st.shared.v2.f64 [r25+512], {fd177, fd178}; +fma.rn.f64 fd179, fd166, fd149, fd170; +sub.f64 fd180, fd172, fd171; +st.shared.v2.f64 [r25+768], {fd179, fd180}; +barrier.sync 0; +mad.lo.s32 r26, r21, -48, r25; +ld.shared.v2.f64 {fd181, fd182}, [r26]; +ld.shared.v2.f64 {fd185, fd186}, [r26+2048]; +ld.shared.v2.f64 {fd189, fd190}, [r26+4096]; +ld.shared.v2.f64 {fd193, fd194}, [r26+6144]; +add.f64 fd197, fd181, fd189; +add.f64 fd198, fd182, fd190; +sub.f64 fd199, fd181, fd189; +sub.f64 fd200, fd182, fd190; +add.f64 fd201, fd185, fd193; +add.f64 fd202, fd186, fd194; +sub.f64 fd203, fd185, fd193; +sub.f64 fd204, fd186, fd194; +sub.f64 fd205, fd197, fd201; +sub.f64 fd206, fd198, fd202; +sub.f64 fd207, fd199, fd204; +add.f64 fd208, fd200, fd203; +add.f64 fd209, fd199, fd204; +sub.f64 fd210, fd200, fd203; +and.b32 r27, r5, 64; +bfe.u32 r28, r5, 6, 1; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd211, fd212}, [rd14]; +mul.f64 fd215, fd208, fd212; +mul.f64 fd216, fd207, fd212; +mul.f64 fd217, fd211, fd208; +mul.f64 fd218, fd211, fd211; +mul.f64 fd219, fd212, fd212; +sub.f64 fd220, fd218, fd219; +mul.f64 fd221, fd212, fd211; +fma.rn.f64 fd222, fd212, fd211, fd221; +mul.f64 fd223, fd206, fd222; +mul.f64 fd224, fd205, fd222; +mul.f64 fd225, fd220, fd206; +ld.global.v2.f64 {fd226, fd227}, [rd14+32]; +mul.f64 fd230, fd210, fd227; +mul.f64 fd231, fd209, fd227; +mul.f64 fd232, fd226, fd210; +and.b32 r29, r10, 1008; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 4096; +add.s32 r32, r30, r31; +add.f64 fd233, fd198, fd202; +add.f64 fd234, fd197, fd201; +st.shared.v2.f64 [r32], {fd234, fd233}; +fma.rn.f64 fd235, fd211, fd207, fd215; +sub.f64 fd236, fd217, fd216; +st.shared.v2.f64 [r32+1024], {fd235, fd236}; +fma.rn.f64 fd237, fd220, fd205, fd223; +sub.f64 fd238, fd225, fd224; +st.shared.v2.f64 [r32+2048], {fd237, fd238}; +fma.rn.f64 fd239, fd226, fd209, fd230; +sub.f64 fd240, 
fd232, fd231; +st.shared.v2.f64 [r32+3072], {fd239, fd240}; +barrier.sync 0; +mad.lo.s32 r33, r27, -48, r32; +ld.shared.v2.f64 {fd241, fd242}, [r33]; +ld.shared.v2.f64 {fd245, fd246}, [r33+2048]; +ld.shared.v2.f64 {fd249, fd250}, [r33+4096]; +ld.shared.v2.f64 {fd253, fd254}, [r33+6144]; +add.f64 %1, fd242, fd250; +add.f64 %0, fd241, fd249; +add.f64 %3, fd246, fd254; +add.f64 %2, fd245, fd253; +sub.f64 %5, fd242, fd250; +sub.f64 %4, fd241, fd249; +sub.f64 %7, fd246, fd254; +sub.f64 %6, fd245, fd253; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<643, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<233>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %13, %18; +add.f64 fd18, %14, %20; +sub.f64 fd19, %13, %18; +sub.f64 fd20, %14, %20; +add.f64 fd21, %15, %21; +add.f64 fd22, %17, %22; +sub.f64 fd23, %15, %21; +sub.f64 fd24, %17, %22; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 127; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2032; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, 
fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+2048]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -4096; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 4064; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+1024]; +ld.shared.f64 fd63, [r13+2048]; +ld.shared.f64 fd64, [r13+3072]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+1024]; +ld.shared.f64 fd67, [r13+2048]; +ld.shared.f64 fd68, [r13+3072]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 124; +bfe.u32 r15, r5, 2, 5; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; 
+mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+512]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 3968; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+1024]; +ld.shared.f64 fd115, [r21+2048]; +ld.shared.f64 fd116, [r21+3072]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+1024]; +ld.shared.f64 fd119, [r21+2048]; +ld.shared.f64 fd120, [r21+3072]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 fd129, fd121, fd125; +add.f64 fd130, fd122, fd126; +sub.f64 fd131, fd121, fd125; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd128; +add.f64 fd134, fd124, fd127; +add.f64 fd135, fd123, fd128; +sub.f64 fd136, fd124, fd127; +and.b32 r22, r5, 112; +cvt.u64.u32 rd9, r22; +mov.u64 rd10, %11; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd137, fd138}, [rd11]; +mul.f64 fd141, fd134, fd138; +fma.rn.f64 fd142, fd137, fd133, fd141; +mul.f64 fd143, fd133, fd138; +mul.f64 fd144, fd137, fd134; +sub.f64 fd145, fd144, fd143; +mul.f64 fd146, fd137, fd137; +mul.f64 fd147, fd138, fd138; +sub.f64 fd148, fd146, fd147; +mul.f64 fd149, fd138, fd137; +fma.rn.f64 fd150, fd138, fd137, fd149; +mul.f64 fd151, fd132, fd150; +fma.rn.f64 fd152, 
fd148, fd131, fd151; +mul.f64 fd153, fd131, fd150; +mul.f64 fd154, fd148, fd132; +sub.f64 fd155, fd154, fd153; +ld.global.v2.f64 {fd156, fd157}, [rd11+128]; +mul.f64 fd160, fd136, fd157; +fma.rn.f64 fd161, fd156, fd135, fd160; +mul.f64 fd162, fd135, fd157; +mul.f64 fd163, fd156, fd136; +sub.f64 fd164, fd163, fd162; +and.b32 r23, r16, 120; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 3584; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd129; +st.shared.f64 [r26+128], fd142; +st.shared.f64 [r26+256], fd152; +st.shared.f64 [r26+384], fd161; +barrier.sync 0; +mad.lo.s32 r27, r22, -24, r26; +ld.shared.f64 fd165, [r27]; +ld.shared.f64 fd166, [r27+1024]; +ld.shared.f64 fd167, [r27+2048]; +ld.shared.f64 fd168, [r27+3072]; +barrier.sync 0; +st.shared.f64 [r26], fd130; +st.shared.f64 [r26+128], fd145; +st.shared.f64 [r26+256], fd155; +st.shared.f64 [r26+384], fd164; +barrier.sync 0; +ld.shared.f64 fd169, [r27]; +ld.shared.f64 fd170, [r27+1024]; +ld.shared.f64 fd171, [r27+2048]; +ld.shared.f64 fd172, [r27+3072]; +add.f64 fd173, fd165, fd167; +add.f64 fd174, fd169, fd171; +sub.f64 fd175, fd165, fd167; +sub.f64 fd176, fd169, fd171; +add.f64 fd177, fd166, fd168; +add.f64 fd178, fd170, fd172; +sub.f64 fd179, fd166, fd168; +sub.f64 fd180, fd170, fd172; +add.f64 fd181, fd173, fd177; +add.f64 fd182, fd174, fd178; +sub.f64 fd183, fd173, fd177; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd180; +add.f64 fd186, fd176, fd179; +add.f64 fd187, fd175, fd180; +sub.f64 fd188, fd176, fd179; +and.b32 r28, r5, 64; +bfe.u32 r29, r5, 6, 1; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %12; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd189, fd190}, [rd14]; +mul.f64 fd193, fd186, fd190; +fma.rn.f64 fd194, fd189, fd185, fd193; +mul.f64 fd195, fd185, fd190; +mul.f64 fd196, fd189, fd186; +sub.f64 fd197, fd196, fd195; +mul.f64 fd198, fd189, fd189; +mul.f64 fd199, fd190, fd190; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd190, fd189; +fma.rn.f64 fd202, fd190, fd189, fd201; 
+mul.f64 fd203, fd184, fd202; +fma.rn.f64 fd204, fd200, fd183, fd203; +mul.f64 fd205, fd183, fd202; +mul.f64 fd206, fd200, fd184; +sub.f64 fd207, fd206, fd205; +ld.global.v2.f64 {fd208, fd209}, [rd14+32]; +mul.f64 fd212, fd188, fd209; +fma.rn.f64 fd213, fd208, fd187, fd212; +mul.f64 fd214, fd187, fd209; +mul.f64 fd215, fd208, fd188; +sub.f64 fd216, fd215, fd214; +and.b32 r30, r16, 504; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 2048; +add.s32 r33, r31, r32; +st.shared.f64 [r33], fd181; +st.shared.f64 [r33+512], fd194; +st.shared.f64 [r33+1024], fd204; +st.shared.f64 [r33+1536], fd213; +barrier.sync 0; +mad.lo.s32 r34, r28, -24, r33; +ld.shared.f64 fd217, [r34]; +ld.shared.f64 fd218, [r34+1024]; +ld.shared.f64 fd219, [r34+2048]; +ld.shared.f64 fd220, [r34+3072]; +barrier.sync 0; +st.shared.f64 [r33], fd182; +st.shared.f64 [r33+512], fd197; +st.shared.f64 [r33+1024], fd207; +st.shared.f64 [r33+1536], fd216; +barrier.sync 0; +ld.shared.f64 fd221, [r34]; +ld.shared.f64 fd222, [r34+1024]; +ld.shared.f64 fd223, [r34+2048]; +ld.shared.f64 fd224, [r34+3072]; +add.f64 %0, fd217, fd219; +add.f64 %1, fd221, fd223; +add.f64 %2, fd218, fd220; +add.f64 %3, fd222, fd224; +sub.f64 %4, fd217, fd219; +sub.f64 %5, fd221, fd223; +sub.f64 %6, fd218, fd220; +sub.f64 %7, fd222, fd224; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_512), "l"(lut_dp_4_128), "l"(lut_dp_4_32), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<646, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<63>; +.reg .f64 fd<181>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 13; +mov.u32 r3, %4; +add.s32 r4, r3, r2; 
+mov.u32 r5, %tid.x; +sub.f64 fd9, %13, %15; +sub.f64 fd10, %14, %16; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -8192; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 8160; +add.s32 r11, r8, r10; +add.f64 fd18, %14, %16; +add.f64 fd19, %13, %15; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 4080; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+4096]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 8128; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 4064; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+4096]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 8064; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, 
fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 4032; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+4096]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 5; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 7936; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 3968; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+4096]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 240; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 7680; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; +sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 3840; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+4096]; +sub.f64 fd114, fd106, fd110; +sub.f64 fd115, fd107, fd111; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd116, fd117}, [rd20]; +mul.f64 fd120, fd115, fd117; +mul.f64 fd121, fd114, fd117; +mul.f64 fd122, fd116, fd115; +and.b32 
r43, r9, 496; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 7168; +add.s32 r46, r44, r45; +add.f64 fd123, fd107, fd111; +add.f64 fd124, fd106, fd110; +st.shared.v2.f64 [r46], {fd124, fd123}; +fma.rn.f64 fd125, fd116, fd114, fd120; +sub.f64 fd126, fd122, fd121; +st.shared.v2.f64 [r46+512], {fd125, fd126}; +barrier.sync 0; +and.b32 r47, r9, 3584; +sub.s32 r48, r46, r47; +ld.shared.v2.f64 {fd127, fd128}, [r48]; +ld.shared.v2.f64 {fd131, fd132}, [r48+4096]; +sub.f64 fd135, fd127, fd131; +sub.f64 fd136, fd128, fd132; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 rd23, rd22, rd21; +ld.global.v2.f64 {fd137, fd138}, [rd23]; +mul.f64 fd141, fd136, fd138; +mul.f64 fd142, fd135, fd138; +mul.f64 fd143, fd137, fd136; +and.b32 r50, r9, 1008; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 6144; +add.s32 r53, r51, r52; +add.f64 fd144, fd128, fd132; +add.f64 fd145, fd127, fd131; +st.shared.v2.f64 [r53], {fd145, fd144}; +fma.rn.f64 fd146, fd137, fd135, fd141; +sub.f64 fd147, fd143, fd142; +st.shared.v2.f64 [r53+1024], {fd146, fd147}; +barrier.sync 0; +and.b32 r54, r9, 3072; +sub.s32 r55, r53, r54; +ld.shared.v2.f64 {fd148, fd149}, [r55]; +ld.shared.v2.f64 {fd152, fd153}, [r55+4096]; +sub.f64 fd156, fd148, fd152; +sub.f64 fd157, fd149, fd153; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd158, fd159}, [rd26]; +mul.f64 fd162, fd157, fd159; +mul.f64 fd163, fd156, fd159; +mul.f64 fd164, fd158, fd157; +and.b32 r57, r9, 2032; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 4096; +add.s32 r60, r58, r59; +add.f64 fd165, fd149, fd153; +add.f64 fd166, fd148, fd152; +st.shared.v2.f64 [r60], {fd166, fd165}; +fma.rn.f64 fd167, fd158, fd156, fd162; +sub.f64 fd168, fd164, fd163; +st.shared.v2.f64 [r60+2048], {fd167, fd168}; +barrier.sync 0; +and.b32 r61, r9, 2048; +sub.s32 r62, r60, r61; +ld.shared.v2.f64 {fd169, fd170}, [r62]; +ld.shared.v2.f64 {fd173, 
fd174}, [r62+4096]; +add.f64 %1, fd170, fd174; +add.f64 %0, fd169, fd173; +sub.f64 %3, fd170, fd174; +sub.f64 %2, fd169, fd173; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<645, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<63>; +.reg .f64 fd<149>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 12; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %13, %15; +add.f64 fd10, %14, %16; +sub.f64 fd11, %13, %15; +sub.f64 fd12, %14, %16; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 4080; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -4096; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 4080; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 2040; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+2048]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+2048]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 7; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; 
+barrier.sync 0; +and.b32 r17, r6, 4064; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 2032; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+2048]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+2048]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 6; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 4032; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 2016; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+2048]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+2048]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 5; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 3968; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 1984; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+2048]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 
[r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+2048]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 240; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 3840; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 1920; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+2048]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+2048]; +add.f64 fd94, fd90, fd91; +add.f64 fd95, fd92, fd93; +sub.f64 fd96, fd90, fd91; +sub.f64 fd97, fd92, fd93; +bfe.u32 r42, r5, 5, 3; +mul.wide.u32 rd18, r42, 16; +mov.u64 rd19, %10; +add.s64 rd20, rd19, rd18; +ld.global.v2.f64 {fd98, fd99}, [rd20]; +mul.f64 fd102, fd97, fd99; +fma.rn.f64 fd103, fd98, fd96, fd102; +mul.f64 fd104, fd96, fd99; +mul.f64 fd105, fd98, fd97; +sub.f64 fd106, fd105, fd104; +and.b32 r43, r11, 248; +add.s32 r44, r8, r43; +barrier.sync 0; +and.b32 r45, r6, 3584; +add.s32 r46, r44, r45; +st.shared.f64 [r46], fd94; +st.shared.f64 [r46+256], fd103; +barrier.sync 0; +and.b32 r47, r11, 1792; +sub.s32 r48, r46, r47; +ld.shared.f64 fd107, [r48]; +ld.shared.f64 fd108, [r48+2048]; +barrier.sync 0; +st.shared.f64 [r46], fd95; +st.shared.f64 [r46+256], fd106; +barrier.sync 0; +ld.shared.f64 fd109, [r48]; +ld.shared.f64 fd110, [r48+2048]; +add.f64 fd111, fd107, fd108; +add.f64 fd112, fd109, fd110; +sub.f64 fd113, fd107, fd108; +sub.f64 fd114, fd109, fd110; +bfe.u32 r49, r5, 6, 2; +mul.wide.u32 rd21, r49, 16; +mov.u64 rd22, %11; +add.s64 
rd23, rd22, rd21; +ld.global.v2.f64 {fd115, fd116}, [rd23]; +mul.f64 fd119, fd114, fd116; +fma.rn.f64 fd120, fd115, fd113, fd119; +mul.f64 fd121, fd113, fd116; +mul.f64 fd122, fd115, fd114; +sub.f64 fd123, fd122, fd121; +and.b32 r50, r11, 504; +add.s32 r51, r8, r50; +barrier.sync 0; +and.b32 r52, r6, 3072; +add.s32 r53, r51, r52; +st.shared.f64 [r53], fd111; +st.shared.f64 [r53+512], fd120; +barrier.sync 0; +and.b32 r54, r11, 1536; +sub.s32 r55, r53, r54; +ld.shared.f64 fd124, [r55]; +ld.shared.f64 fd125, [r55+2048]; +barrier.sync 0; +st.shared.f64 [r53], fd112; +st.shared.f64 [r53+512], fd123; +barrier.sync 0; +ld.shared.f64 fd126, [r55]; +ld.shared.f64 fd127, [r55+2048]; +add.f64 fd128, fd124, fd125; +add.f64 fd129, fd126, fd127; +sub.f64 fd130, fd124, fd125; +sub.f64 fd131, fd126, fd127; +bfe.u32 r56, r5, 7, 1; +mul.wide.u32 rd24, r56, 16; +mov.u64 rd25, %12; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd132, fd133}, [rd26]; +mul.f64 fd136, fd131, fd133; +fma.rn.f64 fd137, fd132, fd130, fd136; +mul.f64 fd138, fd130, fd133; +mul.f64 fd139, fd132, fd131; +sub.f64 fd140, fd139, fd138; +and.b32 r57, r11, 1016; +add.s32 r58, r8, r57; +barrier.sync 0; +and.b32 r59, r6, 2048; +add.s32 r60, r58, r59; +st.shared.f64 [r60], fd128; +st.shared.f64 [r60+1024], fd137; +barrier.sync 0; +and.b32 r61, r11, 1024; +sub.s32 r62, r60, r61; +ld.shared.f64 fd141, [r62]; +ld.shared.f64 fd142, [r62+2048]; +barrier.sync 0; +st.shared.f64 [r60], fd129; +st.shared.f64 [r60+1024], fd140; +barrier.sync 0; +ld.shared.f64 fd143, [r62]; +ld.shared.f64 fd144, [r62+2048]; +add.f64 %0, fd141, fd142; +add.f64 %1, fd143, fd144; +sub.f64 %2, fd141, fd142; +sub.f64 %3, fd143, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_512), "l"(lut_dp_2_256), "l"(lut_dp_2_128), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff 
--git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..39529e1fd5754 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_fwd.hpp.inc @@ -0,0 +1,361 @@ +#ifndef CUFFTDX_FFT_5_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_5_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<900, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<9>; +.reg .b32 r<313>; +.reg .f64 fd<7>; +.reg .b64 rd<2>; +mov.f64 fd5, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd5; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd6, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd6; +} +mov.b32 r228, {rs2, rs2}; +mov.f64 fd3, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs3, fd3; +} +mov.b32 r282, {rs3, rs3}; +mov.f64 fd4, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs4, fd4; +} +mov.b32 r300, {rs4, rs4}; +{ +cvt.rn.f16.f64 rs5, fd5; +} +mov.b32 r291, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd6; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r306, {rs7, rs7}; +{ +add.f16x2 r1, %11, %12; +} +{ +add.f16x2 r4, %13, r1; +} +{ +add.f16x2 r7, %14, %15; +} +{ +add.f16x2 %0, r4, r7; +} +{ +add.f16x2 r13, %16, %10; +} +{ +add.f16x2 r16, %17, r13; +} +{ +add.f16x2 r19, %18, %19; +} +{ +add.f16x2 %1, r16, r19; +} +{ +add.f16x2 r25, %11, %12; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %13, r28; +} +{ +add.f16x2 r34, %14, %15; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %16, %10; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %18, %19; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 %2, r40, r55; +} +{ +add.f16x2 r61, %11, %12; +} +{ +mul.f16x2 
r64, r61, r210; +} +{ +add.f16x2 r67, %13, r64; +} +{ +add.f16x2 r70, %14, %15; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %16, %10; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %18, %19; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 %8, r76, r91; +} +{ +add.f16x2 r97, %11, %12; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %13, r100; +} +{ +add.f16x2 r106, %14, %15; +} +{ +mul.f16x2 r109, r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %16, %10; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %18, %19; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 %4, r112, r127; +} +{ +add.f16x2 r133, %11, %12; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %13, r136; +} +{ +add.f16x2 r142, %14, %15; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %16, %10; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %18, %19; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 %6, r148, r163; +} +{ +add.f16x2 r169, %16, %10; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %17, r172; +} +{ +add.f16x2 r178, %18, %19; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %11, %12; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %14, %15; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 %3, r184, r199; +} +{ +add.f16x2 r205, %16, %10; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %17, r208; +} +{ +add.f16x2 r214, %18, %19; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %11, %12; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %14, %15; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 %9, r220, r235; +} +{ +add.f16x2 r241, %16, 
%10; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %17, r244; +} +{ +add.f16x2 r250, %18, %19; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %11, %12; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %14, %15; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 %5, r256, r271; +} +{ +add.f16x2 r277, %16, %10; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %17, r280; +} +{ +add.f16x2 r286, %18, %19; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %11, %12; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %14, %15; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 %7, r292, r307; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..1d62029706537 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp16_inv.hpp.inc @@ -0,0 +1,364 @@ +#ifndef CUFFTDX_FFT_5_FP16_INV_PTX_HPP +#define 
CUFFTDX_FFT_5_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1102, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<11>; +.reg .b32 r<313>; +.reg .f64 fd<7>; +.reg .b64 rd<2>; +mov.f64 fd5, 0d3FD3C6EF372FE950; +{ +cvt.rn.f16.f64 rs1, fd5; +} +mov.b32 r210, {rs1, rs1}; +mov.f64 fd6, 0dBFEE6F0E134454FF; +{ +cvt.rn.f16.f64 rs2, fd6; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r228, {rs3, rs3}; +mov.f64 fd3, 0dBFE9E3779B97F4A8; +{ +cvt.rn.f16.f64 rs5, fd3; +} +mov.b32 r282, {rs5, rs5}; +mov.f64 fd4, 0dBFE2CF2304755A5E; +{ +cvt.rn.f16.f64 rs6, fd4; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r300, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs9, fd5; +} +mov.b32 r291, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd6; +} +mov.b32 r306, {rs10, rs10}; +{ +add.f16x2 r1, %11, %12; +} +{ +add.f16x2 r4, %13, r1; +} +{ +add.f16x2 r7, %14, %15; +} +{ +add.f16x2 %0, r4, r7; +} +{ +add.f16x2 r13, %16, %10; +} +{ +add.f16x2 r16, %17, r13; +} +{ +add.f16x2 r19, %18, %19; +} +{ +add.f16x2 %1, r16, r19; +} +{ +add.f16x2 r25, %11, %12; +} +{ +mul.f16x2 r28, r25, r210; +} +{ +add.f16x2 r31, %13, r28; +} +{ +add.f16x2 r34, %14, %15; +} +{ +mul.f16x2 r37, r34, r282; +} +{ +add.f16x2 r40, r31, r37; +} +{ +sub.f16x2 r43, %16, %10; +} +{ +mul.f16x2 r46, r43, r228; +} +{ +sub.f16x2 r49, %18, %19; +} +{ +mul.f16x2 r52, r49, r300; +} +{ +add.f16x2 r55, r46, r52; +} +{ +sub.f16x2 %2, r40, r55; +} +{ +add.f16x2 r61, %11, %12; +} +{ +mul.f16x2 r64, r61, r210; +} +{ +add.f16x2 r67, %13, r64; +} +{ +add.f16x2 r70, %14, %15; +} +{ +mul.f16x2 r73, r70, r282; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %16, %10; +} +{ +mul.f16x2 r82, r79, r228; +} +{ +sub.f16x2 r85, %18, %19; +} +{ +mul.f16x2 r88, r85, r300; +} +{ +add.f16x2 r91, r82, r88; +} +{ +add.f16x2 %8, r76, r91; +} +{ +add.f16x2 r97, %11, %12; +} +{ +mul.f16x2 r100, r97, r282; +} +{ +add.f16x2 r103, %13, r100; +} +{ +add.f16x2 r106, %14, %15; +} +{ +mul.f16x2 r109, 
r106, r291; +} +{ +add.f16x2 r112, r103, r109; +} +{ +sub.f16x2 r115, %16, %10; +} +{ +mul.f16x2 r118, r115, r300; +} +{ +sub.f16x2 r121, %18, %19; +} +{ +mul.f16x2 r124, r121, r306; +} +{ +add.f16x2 r127, r118, r124; +} +{ +sub.f16x2 %4, r112, r127; +} +{ +add.f16x2 r133, %11, %12; +} +{ +mul.f16x2 r136, r133, r282; +} +{ +add.f16x2 r139, %13, r136; +} +{ +add.f16x2 r142, %14, %15; +} +{ +mul.f16x2 r145, r142, r291; +} +{ +add.f16x2 r148, r139, r145; +} +{ +sub.f16x2 r151, %16, %10; +} +{ +mul.f16x2 r154, r151, r300; +} +{ +sub.f16x2 r157, %18, %19; +} +{ +mul.f16x2 r160, r157, r306; +} +{ +add.f16x2 r163, r154, r160; +} +{ +add.f16x2 %6, r148, r163; +} +{ +add.f16x2 r169, %16, %10; +} +{ +mul.f16x2 r172, r169, r210; +} +{ +add.f16x2 r175, %17, r172; +} +{ +add.f16x2 r178, %18, %19; +} +{ +mul.f16x2 r181, r178, r282; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %11, %12; +} +{ +mul.f16x2 r190, r187, r228; +} +{ +sub.f16x2 r193, %14, %15; +} +{ +mul.f16x2 r196, r193, r300; +} +{ +add.f16x2 r199, r190, r196; +} +{ +add.f16x2 %3, r184, r199; +} +{ +add.f16x2 r205, %16, %10; +} +{ +mul.f16x2 r208, r205, r210; +} +{ +add.f16x2 r211, %17, r208; +} +{ +add.f16x2 r214, %18, %19; +} +{ +mul.f16x2 r217, r214, r282; +} +{ +add.f16x2 r220, r211, r217; +} +{ +sub.f16x2 r223, %11, %12; +} +{ +mul.f16x2 r226, r223, r228; +} +{ +sub.f16x2 r229, %14, %15; +} +{ +mul.f16x2 r232, r229, r300; +} +{ +add.f16x2 r235, r226, r232; +} +{ +sub.f16x2 %9, r220, r235; +} +{ +add.f16x2 r241, %16, %10; +} +{ +mul.f16x2 r244, r241, r282; +} +{ +add.f16x2 r247, %17, r244; +} +{ +add.f16x2 r250, %18, %19; +} +{ +mul.f16x2 r253, r250, r291; +} +{ +add.f16x2 r256, r247, r253; +} +{ +sub.f16x2 r259, %11, %12; +} +{ +mul.f16x2 r262, r259, r300; +} +{ +sub.f16x2 r265, %14, %15; +} +{ +mul.f16x2 r268, r265, r306; +} +{ +add.f16x2 r271, r262, r268; +} +{ +add.f16x2 %5, r256, r271; +} +{ +add.f16x2 r277, %16, %10; +} +{ +mul.f16x2 r280, r277, r282; +} +{ +add.f16x2 r283, %17, r280; +} +{ 
+add.f16x2 r286, %18, %19; +} +{ +mul.f16x2 r289, r286, r291; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %11, %12; +} +{ +mul.f16x2 r298, r295, r300; +} +{ +sub.f16x2 r301, %14, %15; +} +{ +mul.f16x2 r304, r301, r306; +} +{ +add.f16x2 r307, r298, r304; +} +{ +sub.f16x2 %7, r292, r307; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..fcb32ca0e79cd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_fwd.hpp.inc @@ -0,0 +1,60 @@ +#ifndef CUFFTDX_FFT_5_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_5_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<154, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<65>; +.reg .b64 rd<2>; +add.f32 f21, %12, %20; +add.f32 f22, %10, f21; +add.f32 f23, %15, %18; +add.f32 f24, %14, %21; +add.f32 f25, %11, f24; +add.f32 f26, %17, %19; +fma.rn.f32 f27, f21, 0f3E9E377A, %10; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %14, %21; +mul.f32 f31, f30, 
0f3F737871; +sub.f32 f32, %17, %19; +mul.f32 f33, f32, 0fBF167918; +sub.f32 f34, f33, f31; +mul.f32 f35, f21, 0f3F4F1BBD; +sub.f32 f36, %10, f35; +fma.rn.f32 f37, f23, 0f3E9E377A, f36; +mul.f32 f38, f30, 0f3F167918; +mul.f32 f39, f32, 0f3F737871; +sub.f32 f40, f39, f38; +fma.rn.f32 f41, f24, 0f3E9E377A, %11; +mul.f32 f42, f26, 0f3F4F1BBD; +sub.f32 f43, f41, f42; +sub.f32 f44, %12, %20; +mul.f32 f45, f44, 0f3F737871; +sub.f32 f46, %15, %18; +mul.f32 f47, f46, 0fBF167918; +sub.f32 f48, f47, f45; +mul.f32 f49, f24, 0f3F4F1BBD; +sub.f32 f50, %11, f49; +fma.rn.f32 f51, f26, 0f3E9E377A, f50; +mul.f32 f52, f44, 0f3F167918; +mul.f32 f53, f46, 0f3F737871; +sub.f32 f54, f53, f52; +add.f32 %1, f26, f25; +add.f32 %0, f23, f22; +add.f32 %3, f48, f43; +sub.f32 %2, f29, f34; +add.f32 %5, f54, f51; +sub.f32 %4, f37, f40; +sub.f32 %7, f51, f54; +add.f32 %6, f40, f37; +sub.f32 %9, f43, f48; +add.f32 %8, f34, f29; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..a35fa8c2458f4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp32_inv.hpp.inc @@ -0,0 +1,58 @@ +#ifndef CUFFTDX_FFT_5_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_5_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<356, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + 
+asm volatile (R"({ +.reg .f32 f<63>; +.reg .b64 rd<2>; +add.f32 f21, %12, %20; +add.f32 f22, %10, f21; +add.f32 f23, %15, %18; +add.f32 f24, %14, %21; +add.f32 f25, %11, f24; +add.f32 f26, %17, %19; +fma.rn.f32 f27, f21, 0f3E9E377A, %10; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %14, %21; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %17, %19; +fma.rn.f32 f33, f32, 0f3F167918, f31; +mul.f32 f34, f21, 0f3F4F1BBD; +sub.f32 f35, %10, f34; +fma.rn.f32 f36, f23, 0f3E9E377A, f35; +mul.f32 f37, f30, 0f3F167918; +mul.f32 f38, f32, 0f3F737871; +sub.f32 f39, f37, f38; +fma.rn.f32 f40, f24, 0f3E9E377A, %11; +mul.f32 f41, f26, 0f3F4F1BBD; +sub.f32 f42, f40, f41; +sub.f32 f43, %12, %20; +mul.f32 f44, f43, 0f3F737871; +sub.f32 f45, %15, %18; +fma.rn.f32 f46, f45, 0f3F167918, f44; +mul.f32 f47, f24, 0f3F4F1BBD; +sub.f32 f48, %11, f47; +fma.rn.f32 f49, f26, 0f3E9E377A, f48; +mul.f32 f50, f43, 0f3F167918; +mul.f32 f51, f45, 0f3F737871; +sub.f32 f52, f50, f51; +add.f32 %1, f26, f25; +add.f32 %0, f23, f22; +add.f32 %3, f46, f42; +sub.f32 %2, f29, f33; +add.f32 %5, f52, f49; +sub.f32 %4, f36, f39; +sub.f32 %7, f49, f52; +add.f32 %6, f39, f36; +sub.f32 %9, f42, f46; +add.f32 %8, f33, f29; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..ccbbe266987ed --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_fwd.hpp.inc @@ -0,0 +1,60 @@ +#ifndef CUFFTDX_FFT_5_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_5_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<532, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<65>; +.reg .b64 rd<2>; +add.f64 fd21, %12, %20; +add.f64 fd22, %10, fd21; +add.f64 fd23, %15, %18; +add.f64 fd24, %14, %21; +add.f64 fd25, %11, fd24; +add.f64 fd26, %17, %19; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %10; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %14, %21; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %17, %19; +mul.f64 fd33, fd32, 0dBFE2CF2304755A5E; +sub.f64 fd34, fd33, fd31; +mul.f64 fd35, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd36, %10, fd35; +fma.rn.f64 fd37, fd23, 0d3FD3C6EF372FE950, fd36; +mul.f64 fd38, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd39, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd40, fd39, fd38; +fma.rn.f64 fd41, fd24, 0d3FD3C6EF372FE950, %11; +mul.f64 fd42, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %12, %20; +mul.f64 fd45, fd44, 0d3FEE6F0E134454FF; +sub.f64 fd46, %15, %18; +mul.f64 fd47, fd46, 0dBFE2CF2304755A5E; +sub.f64 fd48, fd47, fd45; +mul.f64 fd49, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd50, %11, fd49; +fma.rn.f64 fd51, fd26, 0d3FD3C6EF372FE950, fd50; +mul.f64 fd52, fd44, 0d3FE2CF2304755A5E; +mul.f64 fd53, fd46, 0d3FEE6F0E134454FF; +sub.f64 fd54, fd53, fd52; +add.f64 %1, fd26, fd25; +add.f64 %0, fd23, fd22; +add.f64 %3, fd48, fd43; +sub.f64 %2, fd29, fd34; +add.f64 %5, fd54, fd51; +sub.f64 %4, fd37, fd40; +sub.f64 %7, fd51, fd54; +add.f64 %6, fd40, fd37; +sub.f64 %9, fd43, fd48; +add.f64 %8, fd34, fd29; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), 
"=d"(rmem[4].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..1c9ec0c5f3ad5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_5_fp64_inv.hpp.inc @@ -0,0 +1,58 @@ +#ifndef CUFFTDX_FFT_5_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_5_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<703, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<63>; +.reg .b64 rd<2>; +add.f64 fd21, %12, %20; +add.f64 fd22, %10, fd21; +add.f64 fd23, %15, %18; +add.f64 fd24, %14, %21; +add.f64 fd25, %11, fd24; +add.f64 fd26, %17, %19; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %10; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %14, %21; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %17, %19; +fma.rn.f64 fd33, fd32, 0d3FE2CF2304755A5E, fd31; +mul.f64 fd34, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd35, %10, fd34; +fma.rn.f64 fd36, fd23, 0d3FD3C6EF372FE950, fd35; +mul.f64 fd37, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd38, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd39, fd37, fd38; +fma.rn.f64 fd40, fd24, 0d3FD3C6EF372FE950, %11; +mul.f64 fd41, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd42, fd40, fd41; +sub.f64 fd43, %12, %20; +mul.f64 fd44, fd43, 0d3FEE6F0E134454FF; +sub.f64 fd45, %15, %18; +fma.rn.f64 fd46, fd45, 0d3FE2CF2304755A5E, fd44; +mul.f64 fd47, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd48, %11, fd47; +fma.rn.f64 fd49, fd26, 0d3FD3C6EF372FE950, fd48; 
+mul.f64 fd50, fd43, 0d3FE2CF2304755A5E; +mul.f64 fd51, fd45, 0d3FEE6F0E134454FF; +sub.f64 fd52, fd50, fd51; +add.f64 %1, fd26, fd25; +add.f64 %0, fd23, fd22; +add.f64 %3, fd46, fd42; +sub.f64 %2, fd29, fd33; +add.f64 %5, fd52, fd49; +sub.f64 %4, fd36, fd39; +sub.f64 %7, fd49, fd52; +add.f64 %6, fd39, fd36; +sub.f64 %9, fd42, fd46; +add.f64 %8, fd33, fd29; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..c96555a0e2897 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_fwd.hpp.inc @@ -0,0 +1,22558 @@ +#ifndef CUFFTDX_FFT_625_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_625_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<909, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<7962>; +.reg .b64 rd<4>; +mov.u32 r7960, %tid.y; +mov.u32 r7961, %50; +mad.lo.s32 r7902, r7960, 5000, r7961; +mov.u32 r7903, %tid.x; +mov.f32 f482, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1, {low, high}; +} +mov.f32 f484, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2, {low, high}; +} +mov.f32 f478, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3, {low, high}; +} +mov.f32 f480, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %60, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %60, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %60, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %60, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, %58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %60, %94; +} 
+{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %60, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %60, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %60, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %60, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r321, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, 
r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ +add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; 
+} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ +sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; +} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ 
+add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, %74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} +{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ +add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 
r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} +{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ +sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ +add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} +{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, 
r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ +add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} +{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ +add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ +mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; 
+} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ +add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ +sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %59, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %59, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, 
r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} +{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %59, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %59, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %59, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, %93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, %86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 
r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, %77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ +sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1601, {low, high}; +} +mov.f32 f332, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1602, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1603, {low, high}; +} +mov.f32 f336, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1604, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, 
f338; +mov.b32 r1605, {low, high}; +} +mov.f32 f340, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1606, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1607, {low, high}; +} +mov.f32 f344, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1608, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1611, {low, high}; +} +mov.f32 f352, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1612, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1615, {low, high}; +} +mov.f32 f360, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1616, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1617, {low, high}; +} +mov.f32 f364, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1618, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1623, {low, high}; +} +mov.f32 f376, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1631, {low, high}; +} +mov.f32 f392, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, 
r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; +} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ 
+mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ +fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1910, {low, high}; +} +{ +neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 
r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ +sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ +sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ +mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} +{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; 
+} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; +} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 
r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ +add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; +} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, 
r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} +{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ +mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ 
+sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ +add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ +sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ 
+sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ +add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, 
r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ +sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; +} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} 
+{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; +} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ +add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ +mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ +sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, 
r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3185, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; +} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; +} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; +} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, 
r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ +add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 
r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ +sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r7903, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r7904, rd3; +mul.lo.s32 r7905, r7904, 25; +sub.s32 r7906, r7903, r7905; +cvt.rn.f32.u32 f485, r7906; +mul.f32 f486, f485, 0f3C24B5BE; +cos.approx.f32 f217, f486; +sin.approx.f32 f487, f486; +neg.f32 f218, f487; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; +} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ +fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3717, {low, high}; +} +{ +mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 
r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ +fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3902, {low, high}; +} +{ 
+mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; 
+mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4050, {low, high}; +} +{ +mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ +fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, 
r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ +neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ 
+fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ 
+neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +mad.lo.s32 r7907, r7904, 5000, r7902; +barrier.sync 0; +mad.lo.s32 r7908, r7906, 200, r7907; +st.shared.v2.f32 [r7908], {r1922, r1934}; +st.shared.v2.f32 [r7908+8], {r3517, r3524}; +st.shared.v2.f32 [r7908+16], {r3554, r3561}; +st.shared.v2.f32 [r7908+24], {r3591, r3598}; +st.shared.v2.f32 [r7908+32], {r3628, r3635}; +st.shared.v2.f32 [r7908+40], {r3665, r3672}; +st.shared.v2.f32 [r7908+48], {r3702, r3709}; +st.shared.v2.f32 [r7908+56], {r3739, r3746}; +st.shared.v2.f32 [r7908+64], {r3776, r3783}; +st.shared.v2.f32 [r7908+72], {r3813, r3820}; +st.shared.v2.f32 [r7908+80], {r3850, r3857}; +st.shared.v2.f32 [r7908+88], {r3887, r3894}; +st.shared.v2.f32 [r7908+96], {r3924, r3931}; +st.shared.v2.f32 [r7908+104], {r3961, r3968}; +st.shared.v2.f32 [r7908+112], {r3998, r4005}; +st.shared.v2.f32 [r7908+120], {r4035, r4042}; +st.shared.v2.f32 [r7908+128], {r4072, r4079}; +st.shared.v2.f32 [r7908+136], {r4109, r4116}; +st.shared.v2.f32 [r7908+144], {r4146, 
r4153}; +st.shared.v2.f32 [r7908+152], {r4183, r4190}; +st.shared.v2.f32 [r7908+160], {r4220, r4227}; +st.shared.v2.f32 [r7908+168], {r4257, r4264}; +st.shared.v2.f32 [r7908+176], {r4294, r4301}; +st.shared.v2.f32 [r7908+184], {r4331, r4338}; +st.shared.v2.f32 [r7908+192], {r4368, r4375}; +barrier.sync 0; +mad.lo.s32 r7909, r7906, -192, r7908; +ld.shared.u32 r4408, [r7909]; +ld.shared.u32 r4420, [r7909+4]; +ld.shared.u32 r4728, [r7909+200]; +ld.shared.u32 r4740, [r7909+204]; +ld.shared.u32 r5048, [r7909+400]; +ld.shared.u32 r5060, [r7909+404]; +ld.shared.u32 r5368, [r7909+600]; +ld.shared.u32 r5380, [r7909+604]; +ld.shared.u32 r5688, [r7909+800]; +ld.shared.u32 r5700, [r7909+804]; +ld.shared.u32 r4405, [r7909+1000]; +ld.shared.u32 r4417, [r7909+1004]; +ld.shared.u32 r4725, [r7909+1200]; +ld.shared.u32 r4737, [r7909+1204]; +ld.shared.u32 r5045, [r7909+1400]; +ld.shared.u32 r5057, [r7909+1404]; +ld.shared.u32 r5365, [r7909+1600]; +ld.shared.u32 r5377, [r7909+1604]; +ld.shared.u32 r5685, [r7909+1800]; +ld.shared.u32 r5697, [r7909+1804]; +ld.shared.u32 r4411, [r7909+2000]; +ld.shared.u32 r4423, [r7909+2004]; +ld.shared.u32 r4731, [r7909+2200]; +ld.shared.u32 r4743, [r7909+2204]; +ld.shared.u32 r5051, [r7909+2400]; +ld.shared.u32 r5063, [r7909+2404]; +ld.shared.u32 r5371, [r7909+2600]; +ld.shared.u32 r5383, [r7909+2604]; +ld.shared.u32 r5691, [r7909+2800]; +ld.shared.u32 r5703, [r7909+2804]; +ld.shared.u32 r4412, [r7909+3000]; +ld.shared.u32 r4424, [r7909+3004]; +ld.shared.u32 r4732, [r7909+3200]; +ld.shared.u32 r4744, [r7909+3204]; +ld.shared.u32 r5052, [r7909+3400]; +ld.shared.u32 r5064, [r7909+3404]; +ld.shared.u32 r5372, [r7909+3600]; +ld.shared.u32 r5384, [r7909+3604]; +ld.shared.u32 r5692, [r7909+3800]; +ld.shared.u32 r5704, [r7909+3804]; +ld.shared.u32 r4406, [r7909+4000]; +ld.shared.u32 r4418, [r7909+4004]; +ld.shared.u32 r4726, [r7909+4200]; +ld.shared.u32 r4738, [r7909+4204]; +ld.shared.u32 r5046, [r7909+4400]; +ld.shared.u32 r5058, [r7909+4404]; 
+ld.shared.u32 r5366, [r7909+4600]; +ld.shared.u32 r5378, [r7909+4604]; +ld.shared.u32 r5686, [r7909+4800]; +ld.shared.u32 r5698, [r7909+4804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 r4413, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 r4425, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ +add.f16x2 r4458, r4449, r4455; +} +{ +sub.f16x2 r4461, r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ +mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ 
+add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 r4497, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 r4533, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 r4569, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 r4605, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, r4420, r4611; +} +{ +add.f16x2 r4617, r4423, r4424; +} +{ +mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; +} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 r4641, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, 
r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 r4677, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ +sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 r4713, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, r4725, r4726; +} +{ +add.f16x2 r4727, r4728, r4724; +} +{ +add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 r4733, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ +add.f16x2 r4745, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, 
r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 r4781, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ +sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 r4817, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 r4853, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ +mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 r4886, r4877, r4883; +} +{ +add.f16x2 r4889, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} +{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ 
+mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 r4925, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 r4961, r4943, r4958; +} +{ +add.f16x2 r4964, r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 r4997, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 r5033, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5038, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 
high, f482; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 r5053, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 r5065, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 r5101, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 r5137, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ +add.f16x2 r5155, r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ +sub.f16x2 r5173, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, 
r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 r5209, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ +mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 r5245, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 r5281, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} +{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, r5308, r5042; +} +{ +add.f16x2 r5314, r5305, r5311; +} +{ +add.f16x2 r5317, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; +} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ 
+sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 r5353, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 r5373, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 r5385, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 r5421, r5403, r5418; +} +{ +add.f16x2 r5424, r5365, r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ 
+add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 r5457, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 r5493, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 r5529, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 r5565, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ +mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; +} +{ +sub.f16x2 r5601, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, 
r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 r5637, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 r5673, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ +add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 r5693, r5687, r5690; +} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 r5705, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, 
r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 r5741, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 r5777, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 r5813, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ +mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, r5837, r5843; +} +{ +add.f16x2 r5849, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ +sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ 
+mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 r5885, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 r5921, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} +{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 r5957, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 r5993, r5975, r5990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r5996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r5997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r5998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r5999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 
high, f338; +mov.b32 r6000, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6002, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6003, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6006, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6007, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6027, {low, high}; +} +{ +mul.f16x2 r6044, r4781, r5996; +} +{ +mul.f16x2 r6047, r4925, r5997; +} +{ +sub.f16x2 r6050, r6044, r6047; +} +{ +mul.f16x2 r6053, r4781, r5997; +} +{ +fma.rn.f16x2 r6056, r4925, r5996, r6053; +} +{ +mul.f16x2 r6060, r5101, r5998; +} +{ +mul.f16x2 r6063, r5245, r5999; +} +{ +sub.f16x2 r6066, r6060, r6063; +} +{ +mul.f16x2 r6069, r5101, r5999; +} +{ +fma.rn.f16x2 r6072, r5245, r5998, r6069; +} +{ +mul.f16x2 r6076, r5421, r6000; +} +{ +mul.f16x2 r6079, r5565, r6001; +} +{ +sub.f16x2 r6082, 
r6076, r6079; +} +{ +mul.f16x2 r6085, r5421, r6001; +} +{ +fma.rn.f16x2 r6088, r5565, r6000, r6085; +} +{ +mul.f16x2 r6092, r5741, r6002; +} +{ +mul.f16x2 r6095, r5885, r6003; +} +{ +sub.f16x2 r6098, r6092, r6095; +} +{ +mul.f16x2 r6101, r5741, r6003; +} +{ +fma.rn.f16x2 r6104, r5885, r6002, r6101; +} +{ +mul.f16x2 r6108, r4853, r5998; +} +{ +mul.f16x2 r6111, r4997, r5999; +} +{ +sub.f16x2 r6114, r6108, r6111; +} +{ +mul.f16x2 r6117, r4853, r5999; +} +{ +fma.rn.f16x2 r6120, r4997, r5998, r6117; +} +{ +mul.f16x2 r6124, r5173, r6002; +} +{ +mul.f16x2 r6127, r5317, r6003; +} +{ +sub.f16x2 r6130, r6124, r6127; +} +{ +mul.f16x2 r6133, r5173, r6003; +} +{ +fma.rn.f16x2 r6136, r5317, r6002, r6133; +} +{ +mul.f16x2 r6140, r5493, r6006; +} +{ +mul.f16x2 r6143, r5637, r6007; +} +{ +sub.f16x2 r6146, r6140, r6143; +} +{ +mul.f16x2 r6149, r5493, r6007; +} +{ +fma.rn.f16x2 r6152, r5637, r6006, r6149; +} +{ +mul.f16x2 r6156, r5813, r6010; +} +{ +mul.f16x2 r6159, r5957, r6011; +} +{ +sub.f16x2 r6162, r6156, r6159; +} +{ +mul.f16x2 r6165, r5813, r6011; +} +{ +fma.rn.f16x2 r6168, r5957, r6010, r6165; +} +{ +mul.f16x2 r6172, r4889, r6000; +} +{ +mul.f16x2 r6175, r5033, r6001; +} +{ +sub.f16x2 r6178, r6172, r6175; +} +{ +mul.f16x2 r6181, r4889, r6001; +} +{ +fma.rn.f16x2 r6184, r5033, r6000, r6181; +} +{ +mul.f16x2 r6188, r5209, r6006; +} +{ +mul.f16x2 r6191, r5353, r6007; +} +{ +sub.f16x2 r6194, r6188, r6191; +} +{ +mul.f16x2 r6197, r5209, r6007; +} +{ +fma.rn.f16x2 r6200, r5353, r6006, r6197; +} +{ +mul.f16x2 r6204, r5529, r6012; +} +{ +mul.f16x2 r6207, r5673, r6013; +} +{ +sub.f16x2 r6210, r6204, r6207; +} +{ +mul.f16x2 r6213, r5529, r6013; +} +{ +fma.rn.f16x2 r6216, r5673, r6012, r6213; +} +{ +mul.f16x2 r6220, r5849, r6018; +} +{ +mul.f16x2 r6223, r5993, r6019; +} +{ +sub.f16x2 r6226, r6220, r6223; +} +{ +mul.f16x2 r6229, r5849, r6019; +} +{ +fma.rn.f16x2 r6232, r5993, r6018, r6229; +} +{ +mul.f16x2 r6236, r4817, r6002; +} +{ +mul.f16x2 r6239, r4961, r6003; +} +{ +sub.f16x2 r6242, 
r6236, r6239; +} +{ +mul.f16x2 r6245, r4817, r6003; +} +{ +fma.rn.f16x2 r6248, r4961, r6002, r6245; +} +{ +mul.f16x2 r6252, r5137, r6010; +} +{ +mul.f16x2 r6255, r5281, r6011; +} +{ +sub.f16x2 r6258, r6252, r6255; +} +{ +mul.f16x2 r6261, r5137, r6011; +} +{ +fma.rn.f16x2 r6264, r5281, r6010, r6261; +} +{ +mul.f16x2 r6268, r5457, r6018; +} +{ +mul.f16x2 r6271, r5601, r6019; +} +{ +sub.f16x2 r6274, r6268, r6271; +} +{ +mul.f16x2 r6277, r5457, r6019; +} +{ +fma.rn.f16x2 r6280, r5601, r6018, r6277; +} +{ +mul.f16x2 r6284, r5777, r6026; +} +{ +mul.f16x2 r6287, r5921, r6027; +} +{ +sub.f16x2 r6290, r6284, r6287; +} +{ +mul.f16x2 r6293, r5777, r6027; +} +{ +fma.rn.f16x2 r6296, r5921, r6026, r6293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6302, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6305, {low, high}; +} +{ +neg.f16x2 r6306, r6305; +} +{ +add.f16x2 r6308, r4733, r5693; +} +{ +add.f16x2 r6311, r4413, r6308; +} +{ +add.f16x2 r6314, r5053, r5373; +} +{ +add.f16x2 %0, r6311, r6314; +} +{ +add.f16x2 r6320, r4745, r5705; +} +{ +add.f16x2 r6323, r4425, r6320; +} +{ +add.f16x2 r6326, r5065, r5385; +} +{ +add.f16x2 %1, r6323, r6326; +} +{ +add.f16x2 r6332, r4733, r5693; +} +{ +mul.f16x2 r6335, r6332, r6300; +} +{ +add.f16x2 r6338, r4413, r6335; +} +{ +add.f16x2 r6341, r5053, r5373; +} +{ +mul.f16x2 r6344, r6341, r6302; +} +{ +add.f16x2 r6347, r6338, r6344; +} +{ +sub.f16x2 r6350, r4745, r5705; +} +{ +mul.f16x2 r6353, r6350, 
r6301; +} +{ +sub.f16x2 r6356, r5065, r5385; +} +{ +mul.f16x2 r6359, r6356, r6303; +} +{ +add.f16x2 r6362, r6353, r6359; +} +{ +sub.f16x2 %10, r6347, r6362; +} +{ +add.f16x2 r6368, r4733, r5693; +} +{ +mul.f16x2 r6371, r6368, r6300; +} +{ +add.f16x2 r6374, r4413, r6371; +} +{ +add.f16x2 r6377, r5053, r5373; +} +{ +mul.f16x2 r6380, r6377, r6302; +} +{ +add.f16x2 r6383, r6374, r6380; +} +{ +sub.f16x2 r6386, r4745, r5705; +} +{ +mul.f16x2 r6389, r6386, r6301; +} +{ +sub.f16x2 r6392, r5065, r5385; +} +{ +mul.f16x2 r6395, r6392, r6303; +} +{ +add.f16x2 r6398, r6389, r6395; +} +{ +add.f16x2 %40, r6383, r6398; +} +{ +add.f16x2 r6404, r4733, r5693; +} +{ +mul.f16x2 r6407, r6404, r6302; +} +{ +add.f16x2 r6410, r4413, r6407; +} +{ +add.f16x2 r6413, r5053, r5373; +} +{ +mul.f16x2 r6416, r6413, r6304; +} +{ +add.f16x2 r6419, r6410, r6416; +} +{ +sub.f16x2 r6422, r4745, r5705; +} +{ +mul.f16x2 r6425, r6422, r6303; +} +{ +sub.f16x2 r6428, r5065, r5385; +} +{ +mul.f16x2 r6431, r6428, r6306; +} +{ +add.f16x2 r6434, r6425, r6431; +} +{ +sub.f16x2 %20, r6419, r6434; +} +{ +add.f16x2 r6440, r4733, r5693; +} +{ +mul.f16x2 r6443, r6440, r6302; +} +{ +add.f16x2 r6446, r4413, r6443; +} +{ +add.f16x2 r6449, r5053, r5373; +} +{ +mul.f16x2 r6452, r6449, r6304; +} +{ +add.f16x2 r6455, r6446, r6452; +} +{ +sub.f16x2 r6458, r4745, r5705; +} +{ +mul.f16x2 r6461, r6458, r6303; +} +{ +sub.f16x2 r6464, r5065, r5385; +} +{ +mul.f16x2 r6467, r6464, r6306; +} +{ +add.f16x2 r6470, r6461, r6467; +} +{ +add.f16x2 %30, r6455, r6470; +} +{ +add.f16x2 r6476, r4745, r5705; +} +{ +mul.f16x2 r6479, r6476, r6300; +} +{ +add.f16x2 r6482, r4425, r6479; +} +{ +add.f16x2 r6485, r5065, r5385; +} +{ +mul.f16x2 r6488, r6485, r6302; +} +{ +add.f16x2 r6491, r6482, r6488; +} +{ +sub.f16x2 r6494, r4733, r5693; +} +{ +mul.f16x2 r6497, r6494, r6301; +} +{ +sub.f16x2 r6500, r5053, r5373; +} +{ +mul.f16x2 r6503, r6500, r6303; +} +{ +add.f16x2 r6506, r6497, r6503; +} +{ +add.f16x2 %11, r6491, r6506; +} +{ +add.f16x2 r6512, 
r4745, r5705; +} +{ +mul.f16x2 r6515, r6512, r6300; +} +{ +add.f16x2 r6518, r4425, r6515; +} +{ +add.f16x2 r6521, r5065, r5385; +} +{ +mul.f16x2 r6524, r6521, r6302; +} +{ +add.f16x2 r6527, r6518, r6524; +} +{ +sub.f16x2 r6530, r4733, r5693; +} +{ +mul.f16x2 r6533, r6530, r6301; +} +{ +sub.f16x2 r6536, r5053, r5373; +} +{ +mul.f16x2 r6539, r6536, r6303; +} +{ +add.f16x2 r6542, r6533, r6539; +} +{ +sub.f16x2 %41, r6527, r6542; +} +{ +add.f16x2 r6548, r4745, r5705; +} +{ +mul.f16x2 r6551, r6548, r6302; +} +{ +add.f16x2 r6554, r4425, r6551; +} +{ +add.f16x2 r6557, r5065, r5385; +} +{ +mul.f16x2 r6560, r6557, r6304; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +sub.f16x2 r6566, r4733, r5693; +} +{ +mul.f16x2 r6569, r6566, r6303; +} +{ +sub.f16x2 r6572, r5053, r5373; +} +{ +mul.f16x2 r6575, r6572, r6306; +} +{ +add.f16x2 r6578, r6569, r6575; +} +{ +add.f16x2 %21, r6563, r6578; +} +{ +add.f16x2 r6584, r4745, r5705; +} +{ +mul.f16x2 r6587, r6584, r6302; +} +{ +add.f16x2 r6590, r4425, r6587; +} +{ +add.f16x2 r6593, r5065, r5385; +} +{ +mul.f16x2 r6596, r6593, r6304; +} +{ +add.f16x2 r6599, r6590, r6596; +} +{ +sub.f16x2 r6602, r4733, r5693; +} +{ +mul.f16x2 r6605, r6602, r6303; +} +{ +sub.f16x2 r6608, r5053, r5373; +} +{ +mul.f16x2 r6611, r6608, r6306; +} +{ +add.f16x2 r6614, r6605, r6611; +} +{ +sub.f16x2 %31, r6599, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6621, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6623, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6625, 
{low, high}; +} +{ +neg.f16x2 r6626, r6625; +} +{ +add.f16x2 r6628, r6050, r6098; +} +{ +add.f16x2 r6631, r4461, r6628; +} +{ +add.f16x2 r6634, r6066, r6082; +} +{ +add.f16x2 %2, r6631, r6634; +} +{ +add.f16x2 r6640, r6056, r6104; +} +{ +add.f16x2 r6643, r4605, r6640; +} +{ +add.f16x2 r6646, r6072, r6088; +} +{ +add.f16x2 %3, r6643, r6646; +} +{ +add.f16x2 r6652, r6050, r6098; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4461, r6655; +} +{ +add.f16x2 r6661, r6066, r6082; +} +{ +mul.f16x2 r6664, r6661, r6622; +} +{ +add.f16x2 r6667, r6658, r6664; +} +{ +sub.f16x2 r6670, r6056, r6104; +} +{ +mul.f16x2 r6673, r6670, r6621; +} +{ +sub.f16x2 r6676, r6072, r6088; +} +{ +mul.f16x2 r6679, r6676, r6623; +} +{ +add.f16x2 r6682, r6673, r6679; +} +{ +sub.f16x2 %12, r6667, r6682; +} +{ +add.f16x2 r6688, r6050, r6098; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4461, r6691; +} +{ +add.f16x2 r6697, r6066, r6082; +} +{ +mul.f16x2 r6700, r6697, r6622; +} +{ +add.f16x2 r6703, r6694, r6700; +} +{ +sub.f16x2 r6706, r6056, r6104; +} +{ +mul.f16x2 r6709, r6706, r6621; +} +{ +sub.f16x2 r6712, r6072, r6088; +} +{ +mul.f16x2 r6715, r6712, r6623; +} +{ +add.f16x2 r6718, r6709, r6715; +} +{ +add.f16x2 %42, r6703, r6718; +} +{ +add.f16x2 r6724, r6050, r6098; +} +{ +mul.f16x2 r6727, r6724, r6622; +} +{ +add.f16x2 r6730, r4461, r6727; +} +{ +add.f16x2 r6733, r6066, r6082; +} +{ +mul.f16x2 r6736, r6733, r6624; +} +{ +add.f16x2 r6739, r6730, r6736; +} +{ +sub.f16x2 r6742, r6056, r6104; +} +{ +mul.f16x2 r6745, r6742, r6623; +} +{ +sub.f16x2 r6748, r6072, r6088; +} +{ +mul.f16x2 r6751, r6748, r6626; +} +{ +add.f16x2 r6754, r6745, r6751; +} +{ +sub.f16x2 %22, r6739, r6754; +} +{ +add.f16x2 r6760, r6050, r6098; +} +{ +mul.f16x2 r6763, r6760, r6622; +} +{ +add.f16x2 r6766, r4461, r6763; +} +{ +add.f16x2 r6769, r6066, r6082; +} +{ +mul.f16x2 r6772, r6769, r6624; +} +{ +add.f16x2 r6775, r6766, r6772; +} +{ +sub.f16x2 r6778, r6056, r6104; +} +{ +mul.f16x2 r6781, 
r6778, r6623; +} +{ +sub.f16x2 r6784, r6072, r6088; +} +{ +mul.f16x2 r6787, r6784, r6626; +} +{ +add.f16x2 r6790, r6781, r6787; +} +{ +add.f16x2 %32, r6775, r6790; +} +{ +add.f16x2 r6796, r6056, r6104; +} +{ +mul.f16x2 r6799, r6796, r6620; +} +{ +add.f16x2 r6802, r4605, r6799; +} +{ +add.f16x2 r6805, r6072, r6088; +} +{ +mul.f16x2 r6808, r6805, r6622; +} +{ +add.f16x2 r6811, r6802, r6808; +} +{ +sub.f16x2 r6814, r6050, r6098; +} +{ +mul.f16x2 r6817, r6814, r6621; +} +{ +sub.f16x2 r6820, r6066, r6082; +} +{ +mul.f16x2 r6823, r6820, r6623; +} +{ +add.f16x2 r6826, r6817, r6823; +} +{ +add.f16x2 %13, r6811, r6826; +} +{ +add.f16x2 r6832, r6056, r6104; +} +{ +mul.f16x2 r6835, r6832, r6620; +} +{ +add.f16x2 r6838, r4605, r6835; +} +{ +add.f16x2 r6841, r6072, r6088; +} +{ +mul.f16x2 r6844, r6841, r6622; +} +{ +add.f16x2 r6847, r6838, r6844; +} +{ +sub.f16x2 r6850, r6050, r6098; +} +{ +mul.f16x2 r6853, r6850, r6621; +} +{ +sub.f16x2 r6856, r6066, r6082; +} +{ +mul.f16x2 r6859, r6856, r6623; +} +{ +add.f16x2 r6862, r6853, r6859; +} +{ +sub.f16x2 %43, r6847, r6862; +} +{ +add.f16x2 r6868, r6056, r6104; +} +{ +mul.f16x2 r6871, r6868, r6622; +} +{ +add.f16x2 r6874, r4605, r6871; +} +{ +add.f16x2 r6877, r6072, r6088; +} +{ +mul.f16x2 r6880, r6877, r6624; +} +{ +add.f16x2 r6883, r6874, r6880; +} +{ +sub.f16x2 r6886, r6050, r6098; +} +{ +mul.f16x2 r6889, r6886, r6623; +} +{ +sub.f16x2 r6892, r6066, r6082; +} +{ +mul.f16x2 r6895, r6892, r6626; +} +{ +add.f16x2 r6898, r6889, r6895; +} +{ +add.f16x2 %23, r6883, r6898; +} +{ +add.f16x2 r6904, r6056, r6104; +} +{ +mul.f16x2 r6907, r6904, r6622; +} +{ +add.f16x2 r6910, r4605, r6907; +} +{ +add.f16x2 r6913, r6072, r6088; +} +{ +mul.f16x2 r6916, r6913, r6624; +} +{ +add.f16x2 r6919, r6910, r6916; +} +{ +sub.f16x2 r6922, r6050, r6098; +} +{ +mul.f16x2 r6925, r6922, r6623; +} +{ +sub.f16x2 r6928, r6066, r6082; +} +{ +mul.f16x2 r6931, r6928, r6626; +} +{ +add.f16x2 r6934, r6925, r6931; +} +{ +sub.f16x2 %33, r6919, r6934; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6945, {low, high}; +} +{ +neg.f16x2 r6946, r6945; +} +{ +add.f16x2 r6948, r6114, r6162; +} +{ +add.f16x2 r6951, r4533, r6948; +} +{ +add.f16x2 r6954, r6130, r6146; +} +{ +add.f16x2 %4, r6951, r6954; +} +{ +add.f16x2 r6960, r6120, r6168; +} +{ +add.f16x2 r6963, r4677, r6960; +} +{ +add.f16x2 r6966, r6136, r6152; +} +{ +add.f16x2 %5, r6963, r6966; +} +{ +add.f16x2 r6972, r6114, r6162; +} +{ +mul.f16x2 r6975, r6972, r6940; +} +{ +add.f16x2 r6978, r4533, r6975; +} +{ +add.f16x2 r6981, r6130, r6146; +} +{ +mul.f16x2 r6984, r6981, r6942; +} +{ +add.f16x2 r6987, r6978, r6984; +} +{ +sub.f16x2 r6990, r6120, r6168; +} +{ +mul.f16x2 r6993, r6990, r6941; +} +{ +sub.f16x2 r6996, r6136, r6152; +} +{ +mul.f16x2 r6999, r6996, r6943; +} +{ +add.f16x2 r7002, r6993, r6999; +} +{ +sub.f16x2 %14, r6987, r7002; +} +{ +add.f16x2 r7008, r6114, r6162; +} +{ +mul.f16x2 r7011, r7008, r6940; +} +{ +add.f16x2 r7014, r4533, r7011; +} +{ +add.f16x2 r7017, r6130, r6146; +} +{ +mul.f16x2 r7020, r7017, r6942; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6120, r6168; +} +{ +mul.f16x2 r7029, r7026, r6941; +} +{ +sub.f16x2 r7032, r6136, r6152; +} +{ +mul.f16x2 r7035, r7032, r6943; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +add.f16x2 %44, r7023, r7038; +} +{ +add.f16x2 r7044, r6114, r6162; +} +{ +mul.f16x2 r7047, r7044, r6942; +} +{ +add.f16x2 r7050, r4533, 
r7047; +} +{ +add.f16x2 r7053, r6130, r6146; +} +{ +mul.f16x2 r7056, r7053, r6944; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6120, r6168; +} +{ +mul.f16x2 r7065, r7062, r6943; +} +{ +sub.f16x2 r7068, r6136, r6152; +} +{ +mul.f16x2 r7071, r7068, r6946; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +sub.f16x2 %24, r7059, r7074; +} +{ +add.f16x2 r7080, r6114, r6162; +} +{ +mul.f16x2 r7083, r7080, r6942; +} +{ +add.f16x2 r7086, r4533, r7083; +} +{ +add.f16x2 r7089, r6130, r6146; +} +{ +mul.f16x2 r7092, r7089, r6944; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6120, r6168; +} +{ +mul.f16x2 r7101, r7098, r6943; +} +{ +sub.f16x2 r7104, r6136, r6152; +} +{ +mul.f16x2 r7107, r7104, r6946; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +add.f16x2 %34, r7095, r7110; +} +{ +add.f16x2 r7116, r6120, r6168; +} +{ +mul.f16x2 r7119, r7116, r6940; +} +{ +add.f16x2 r7122, r4677, r7119; +} +{ +add.f16x2 r7125, r6136, r6152; +} +{ +mul.f16x2 r7128, r7125, r6942; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6114, r6162; +} +{ +mul.f16x2 r7137, r7134, r6941; +} +{ +sub.f16x2 r7140, r6130, r6146; +} +{ +mul.f16x2 r7143, r7140, r6943; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 %15, r7131, r7146; +} +{ +add.f16x2 r7152, r6120, r6168; +} +{ +mul.f16x2 r7155, r7152, r6940; +} +{ +add.f16x2 r7158, r4677, r7155; +} +{ +add.f16x2 r7161, r6136, r6152; +} +{ +mul.f16x2 r7164, r7161, r6942; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6114, r6162; +} +{ +mul.f16x2 r7173, r7170, r6941; +} +{ +sub.f16x2 r7176, r6130, r6146; +} +{ +mul.f16x2 r7179, r7176, r6943; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +sub.f16x2 %45, r7167, r7182; +} +{ +add.f16x2 r7188, r6120, r6168; +} +{ +mul.f16x2 r7191, r7188, r6942; +} +{ +add.f16x2 r7194, r4677, r7191; +} +{ +add.f16x2 r7197, r6136, r6152; +} +{ +mul.f16x2 r7200, r7197, r6944; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6114, r6162; +} +{ +mul.f16x2 r7209, 
r7206, r6943; +} +{ +sub.f16x2 r7212, r6130, r6146; +} +{ +mul.f16x2 r7215, r7212, r6946; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +add.f16x2 %25, r7203, r7218; +} +{ +add.f16x2 r7224, r6120, r6168; +} +{ +mul.f16x2 r7227, r7224, r6942; +} +{ +add.f16x2 r7230, r4677, r7227; +} +{ +add.f16x2 r7233, r6136, r6152; +} +{ +mul.f16x2 r7236, r7233, r6944; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6114, r6162; +} +{ +mul.f16x2 r7245, r7242, r6943; +} +{ +sub.f16x2 r7248, r6130, r6146; +} +{ +mul.f16x2 r7251, r7248, r6946; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +sub.f16x2 %35, r7239, r7254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7265, {low, high}; +} +{ +neg.f16x2 r7266, r7265; +} +{ +add.f16x2 r7268, r6178, r6226; +} +{ +add.f16x2 r7271, r4569, r7268; +} +{ +add.f16x2 r7274, r6194, r6210; +} +{ +add.f16x2 %6, r7271, r7274; +} +{ +add.f16x2 r7280, r6184, r6232; +} +{ +add.f16x2 r7283, r4713, r7280; +} +{ +add.f16x2 r7286, r6200, r6216; +} +{ +add.f16x2 %7, r7283, r7286; +} +{ +add.f16x2 r7292, r6178, r6226; +} +{ +mul.f16x2 r7295, r7292, r7260; +} +{ +add.f16x2 r7298, r4569, r7295; +} +{ +add.f16x2 r7301, r6194, r6210; +} +{ +mul.f16x2 r7304, r7301, r7262; +} +{ +add.f16x2 r7307, r7298, r7304; +} +{ +sub.f16x2 r7310, r6184, r6232; +} +{ +mul.f16x2 r7313, r7310, r7261; +} +{ +sub.f16x2 r7316, r6200, r6216; +} +{ +mul.f16x2 r7319, r7316, r7263; 
+} +{ +add.f16x2 r7322, r7313, r7319; +} +{ +sub.f16x2 %16, r7307, r7322; +} +{ +add.f16x2 r7328, r6178, r6226; +} +{ +mul.f16x2 r7331, r7328, r7260; +} +{ +add.f16x2 r7334, r4569, r7331; +} +{ +add.f16x2 r7337, r6194, r6210; +} +{ +mul.f16x2 r7340, r7337, r7262; +} +{ +add.f16x2 r7343, r7334, r7340; +} +{ +sub.f16x2 r7346, r6184, r6232; +} +{ +mul.f16x2 r7349, r7346, r7261; +} +{ +sub.f16x2 r7352, r6200, r6216; +} +{ +mul.f16x2 r7355, r7352, r7263; +} +{ +add.f16x2 r7358, r7349, r7355; +} +{ +add.f16x2 %46, r7343, r7358; +} +{ +add.f16x2 r7364, r6178, r6226; +} +{ +mul.f16x2 r7367, r7364, r7262; +} +{ +add.f16x2 r7370, r4569, r7367; +} +{ +add.f16x2 r7373, r6194, r6210; +} +{ +mul.f16x2 r7376, r7373, r7264; +} +{ +add.f16x2 r7379, r7370, r7376; +} +{ +sub.f16x2 r7382, r6184, r6232; +} +{ +mul.f16x2 r7385, r7382, r7263; +} +{ +sub.f16x2 r7388, r6200, r6216; +} +{ +mul.f16x2 r7391, r7388, r7266; +} +{ +add.f16x2 r7394, r7385, r7391; +} +{ +sub.f16x2 %26, r7379, r7394; +} +{ +add.f16x2 r7400, r6178, r6226; +} +{ +mul.f16x2 r7403, r7400, r7262; +} +{ +add.f16x2 r7406, r4569, r7403; +} +{ +add.f16x2 r7409, r6194, r6210; +} +{ +mul.f16x2 r7412, r7409, r7264; +} +{ +add.f16x2 r7415, r7406, r7412; +} +{ +sub.f16x2 r7418, r6184, r6232; +} +{ +mul.f16x2 r7421, r7418, r7263; +} +{ +sub.f16x2 r7424, r6200, r6216; +} +{ +mul.f16x2 r7427, r7424, r7266; +} +{ +add.f16x2 r7430, r7421, r7427; +} +{ +add.f16x2 %36, r7415, r7430; +} +{ +add.f16x2 r7436, r6184, r6232; +} +{ +mul.f16x2 r7439, r7436, r7260; +} +{ +add.f16x2 r7442, r4713, r7439; +} +{ +add.f16x2 r7445, r6200, r6216; +} +{ +mul.f16x2 r7448, r7445, r7262; +} +{ +add.f16x2 r7451, r7442, r7448; +} +{ +sub.f16x2 r7454, r6178, r6226; +} +{ +mul.f16x2 r7457, r7454, r7261; +} +{ +sub.f16x2 r7460, r6194, r6210; +} +{ +mul.f16x2 r7463, r7460, r7263; +} +{ +add.f16x2 r7466, r7457, r7463; +} +{ +add.f16x2 %17, r7451, r7466; +} +{ +add.f16x2 r7472, r6184, r6232; +} +{ +mul.f16x2 r7475, r7472, r7260; +} +{ +add.f16x2 r7478, r4713, 
r7475; +} +{ +add.f16x2 r7481, r6200, r6216; +} +{ +mul.f16x2 r7484, r7481, r7262; +} +{ +add.f16x2 r7487, r7478, r7484; +} +{ +sub.f16x2 r7490, r6178, r6226; +} +{ +mul.f16x2 r7493, r7490, r7261; +} +{ +sub.f16x2 r7496, r6194, r6210; +} +{ +mul.f16x2 r7499, r7496, r7263; +} +{ +add.f16x2 r7502, r7493, r7499; +} +{ +sub.f16x2 %47, r7487, r7502; +} +{ +add.f16x2 r7508, r6184, r6232; +} +{ +mul.f16x2 r7511, r7508, r7262; +} +{ +add.f16x2 r7514, r4713, r7511; +} +{ +add.f16x2 r7517, r6200, r6216; +} +{ +mul.f16x2 r7520, r7517, r7264; +} +{ +add.f16x2 r7523, r7514, r7520; +} +{ +sub.f16x2 r7526, r6178, r6226; +} +{ +mul.f16x2 r7529, r7526, r7263; +} +{ +sub.f16x2 r7532, r6194, r6210; +} +{ +mul.f16x2 r7535, r7532, r7266; +} +{ +add.f16x2 r7538, r7529, r7535; +} +{ +add.f16x2 %27, r7523, r7538; +} +{ +add.f16x2 r7544, r6184, r6232; +} +{ +mul.f16x2 r7547, r7544, r7262; +} +{ +add.f16x2 r7550, r4713, r7547; +} +{ +add.f16x2 r7553, r6200, r6216; +} +{ +mul.f16x2 r7556, r7553, r7264; +} +{ +add.f16x2 r7559, r7550, r7556; +} +{ +sub.f16x2 r7562, r6178, r6226; +} +{ +mul.f16x2 r7565, r7562, r7263; +} +{ +sub.f16x2 r7568, r6194, r6210; +} +{ +mul.f16x2 r7571, r7568, r7266; +} +{ +add.f16x2 r7574, r7565, r7571; +} +{ +sub.f16x2 %37, r7559, r7574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7582, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7584, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7585, {low, high}; +} +{ +neg.f16x2 r7586, r7585; +} +{ +add.f16x2 r7588, r6242, r6290; +} 
+{ +add.f16x2 r7591, r4497, r7588; +} +{ +add.f16x2 r7594, r6258, r6274; +} +{ +add.f16x2 %8, r7591, r7594; +} +{ +add.f16x2 r7600, r6248, r6296; +} +{ +add.f16x2 r7603, r4641, r7600; +} +{ +add.f16x2 r7606, r6264, r6280; +} +{ +add.f16x2 %9, r7603, r7606; +} +{ +add.f16x2 r7612, r6242, r6290; +} +{ +mul.f16x2 r7615, r7612, r7580; +} +{ +add.f16x2 r7618, r4497, r7615; +} +{ +add.f16x2 r7621, r6258, r6274; +} +{ +mul.f16x2 r7624, r7621, r7582; +} +{ +add.f16x2 r7627, r7618, r7624; +} +{ +sub.f16x2 r7630, r6248, r6296; +} +{ +mul.f16x2 r7633, r7630, r7581; +} +{ +sub.f16x2 r7636, r6264, r6280; +} +{ +mul.f16x2 r7639, r7636, r7583; +} +{ +add.f16x2 r7642, r7633, r7639; +} +{ +sub.f16x2 %18, r7627, r7642; +} +{ +add.f16x2 r7648, r6242, r6290; +} +{ +mul.f16x2 r7651, r7648, r7580; +} +{ +add.f16x2 r7654, r4497, r7651; +} +{ +add.f16x2 r7657, r6258, r6274; +} +{ +mul.f16x2 r7660, r7657, r7582; +} +{ +add.f16x2 r7663, r7654, r7660; +} +{ +sub.f16x2 r7666, r6248, r6296; +} +{ +mul.f16x2 r7669, r7666, r7581; +} +{ +sub.f16x2 r7672, r6264, r6280; +} +{ +mul.f16x2 r7675, r7672, r7583; +} +{ +add.f16x2 r7678, r7669, r7675; +} +{ +add.f16x2 %48, r7663, r7678; +} +{ +add.f16x2 r7684, r6242, r6290; +} +{ +mul.f16x2 r7687, r7684, r7582; +} +{ +add.f16x2 r7690, r4497, r7687; +} +{ +add.f16x2 r7693, r6258, r6274; +} +{ +mul.f16x2 r7696, r7693, r7584; +} +{ +add.f16x2 r7699, r7690, r7696; +} +{ +sub.f16x2 r7702, r6248, r6296; +} +{ +mul.f16x2 r7705, r7702, r7583; +} +{ +sub.f16x2 r7708, r6264, r6280; +} +{ +mul.f16x2 r7711, r7708, r7586; +} +{ +add.f16x2 r7714, r7705, r7711; +} +{ +sub.f16x2 %28, r7699, r7714; +} +{ +add.f16x2 r7720, r6242, r6290; +} +{ +mul.f16x2 r7723, r7720, r7582; +} +{ +add.f16x2 r7726, r4497, r7723; +} +{ +add.f16x2 r7729, r6258, r6274; +} +{ +mul.f16x2 r7732, r7729, r7584; +} +{ +add.f16x2 r7735, r7726, r7732; +} +{ +sub.f16x2 r7738, r6248, r6296; +} +{ +mul.f16x2 r7741, r7738, r7583; +} +{ +sub.f16x2 r7744, r6264, r6280; +} +{ +mul.f16x2 r7747, r7744, r7586; 
+} +{ +add.f16x2 r7750, r7741, r7747; +} +{ +add.f16x2 %38, r7735, r7750; +} +{ +add.f16x2 r7756, r6248, r6296; +} +{ +mul.f16x2 r7759, r7756, r7580; +} +{ +add.f16x2 r7762, r4641, r7759; +} +{ +add.f16x2 r7765, r6264, r6280; +} +{ +mul.f16x2 r7768, r7765, r7582; +} +{ +add.f16x2 r7771, r7762, r7768; +} +{ +sub.f16x2 r7774, r6242, r6290; +} +{ +mul.f16x2 r7777, r7774, r7581; +} +{ +sub.f16x2 r7780, r6258, r6274; +} +{ +mul.f16x2 r7783, r7780, r7583; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 %19, r7771, r7786; +} +{ +add.f16x2 r7792, r6248, r6296; +} +{ +mul.f16x2 r7795, r7792, r7580; +} +{ +add.f16x2 r7798, r4641, r7795; +} +{ +add.f16x2 r7801, r6264, r6280; +} +{ +mul.f16x2 r7804, r7801, r7582; +} +{ +add.f16x2 r7807, r7798, r7804; +} +{ +sub.f16x2 r7810, r6242, r6290; +} +{ +mul.f16x2 r7813, r7810, r7581; +} +{ +sub.f16x2 r7816, r6258, r6274; +} +{ +mul.f16x2 r7819, r7816, r7583; +} +{ +add.f16x2 r7822, r7813, r7819; +} +{ +sub.f16x2 %49, r7807, r7822; +} +{ +add.f16x2 r7828, r6248, r6296; +} +{ +mul.f16x2 r7831, r7828, r7582; +} +{ +add.f16x2 r7834, r4641, r7831; +} +{ +add.f16x2 r7837, r6264, r6280; +} +{ +mul.f16x2 r7840, r7837, r7584; +} +{ +add.f16x2 r7843, r7834, r7840; +} +{ +sub.f16x2 r7846, r6242, r6290; +} +{ +mul.f16x2 r7849, r7846, r7583; +} +{ +sub.f16x2 r7852, r6258, r6274; +} +{ +mul.f16x2 r7855, r7852, r7586; +} +{ +add.f16x2 r7858, r7849, r7855; +} +{ +add.f16x2 %29, r7843, r7858; +} +{ +add.f16x2 r7864, r6248, r6296; +} +{ +mul.f16x2 r7867, r7864, r7582; +} +{ +add.f16x2 r7870, r4641, r7867; +} +{ +add.f16x2 r7873, r6264, r6280; +} +{ +mul.f16x2 r7876, r7873, r7584; +} +{ +add.f16x2 r7879, r7870, r7876; +} +{ +sub.f16x2 r7882, r6242, r6290; +} +{ +mul.f16x2 r7885, r7882, r7583; +} +{ +sub.f16x2 r7888, r6258, r6274; +} +{ +mul.f16x2 r7891, r7888, r7586; +} +{ +add.f16x2 r7894, r7885, r7891; +} +{ +sub.f16x2 %39, r7879, r7894; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), 
"r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<908, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<7962>; +.reg .b64 rd<4>; +mov.u32 r7960, %tid.y; +mov.u32 r7961, %50; +mad.lo.s32 r7902, r7960, 2500, r7961; +mov.u32 r7903, %tid.x; +mov.f32 f482, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1, {low, high}; +} +mov.f32 f484, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2, {low, high}; +} +mov.f32 f478, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3, {low, high}; +} +mov.f32 f480, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; 
+mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %95, %81; +} +{ +add.f16x2 r12, %54, r9; +} +{ +add.f16x2 r15, %60, %94; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %70, %58; +} +{ +add.f16x2 r24, %79, r21; +} +{ +add.f16x2 r27, %87, %69; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %95, %81; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %54, r36; +} +{ +add.f16x2 r42, %60, %94; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %70, %58; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %87, %69; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %95, %81; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %54, r72; +} +{ +add.f16x2 r78, %60, %94; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %70, %58; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %87, %69; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %95, %81; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %54, r108; +} +{ +add.f16x2 r114, %60, %94; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %70, %58; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %87, %69; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %95, %81; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %54, r144; +} +{ +add.f16x2 r150, %60, %94; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %70, %58; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %87, %69; +} +{ 
+mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %70, %58; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %79, r180; +} +{ +add.f16x2 r186, %87, %69; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %95, %81; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %60, %94; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %70, %58; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %79, r216; +} +{ +add.f16x2 r222, %87, %69; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %95, %81; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %60, %94; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %70, %58; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %79, r252; +} +{ +add.f16x2 r258, %87, %69; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %95, %81; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %60, %94; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %70, %58; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %79, r288; +} +{ +add.f16x2 r294, %87, %69; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %95, %81; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %60, %94; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; 
+cvt.rn.f16.f32 high, f478; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r324, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r325, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r326, {low, high}; +} +{ +neg.f16x2 r327, r326; +} +{ +add.f16x2 r329, %97, %83; +} +{ +add.f16x2 r332, %55, r329; +} +{ +add.f16x2 r335, %62, %96; +} +{ +add.f16x2 r338, r332, r335; +} +{ +add.f16x2 r341, %72, %61; +} +{ +add.f16x2 r344, %80, r341; +} +{ +add.f16x2 r347, %89, %71; +} +{ +add.f16x2 r350, r344, r347; +} +{ +add.f16x2 r353, %97, %83; +} +{ +mul.f16x2 r356, r353, r321; +} +{ +add.f16x2 r359, %55, r356; +} +{ +add.f16x2 r362, %62, %96; +} +{ +mul.f16x2 r365, r362, r323; +} +{ +add.f16x2 r368, r359, r365; +} +{ +sub.f16x2 r371, %72, %61; +} +{ +mul.f16x2 r374, r371, r322; +} +{ +sub.f16x2 r377, %89, %71; +} +{ +mul.f16x2 r380, r377, r324; +} +{ +add.f16x2 r383, r374, r380; +} +{ +sub.f16x2 r386, r368, r383; +} +{ +add.f16x2 r389, %97, %83; +} +{ +mul.f16x2 r392, r389, r321; +} +{ +add.f16x2 r395, %55, r392; +} +{ +add.f16x2 r398, %62, %96; +} +{ +mul.f16x2 r401, r398, r323; +} +{ +add.f16x2 r404, r395, r401; +} +{ +sub.f16x2 r407, %72, %61; +} +{ +mul.f16x2 r410, r407, r322; +} +{ +sub.f16x2 r413, %89, %71; +} +{ +mul.f16x2 r416, r413, r324; +} +{ +add.f16x2 r419, r410, r416; +} +{ +add.f16x2 r422, r404, r419; +} +{ +add.f16x2 r425, %97, %83; +} +{ +mul.f16x2 r428, r425, r323; +} +{ +add.f16x2 r431, %55, r428; +} +{ +add.f16x2 r434, %62, %96; +} +{ +mul.f16x2 r437, r434, r325; +} +{ +add.f16x2 r440, r431, r437; +} +{ +sub.f16x2 r443, %72, %61; +} +{ +mul.f16x2 r446, r443, r324; +} +{ +sub.f16x2 r449, %89, %71; +} +{ +mul.f16x2 r452, r449, r327; +} +{ +add.f16x2 r455, r446, r452; +} +{ +sub.f16x2 r458, r440, r455; +} +{ +add.f16x2 r461, %97, %83; +} +{ +mul.f16x2 r464, r461, r323; +} +{ 
+add.f16x2 r467, %55, r464; +} +{ +add.f16x2 r470, %62, %96; +} +{ +mul.f16x2 r473, r470, r325; +} +{ +add.f16x2 r476, r467, r473; +} +{ +sub.f16x2 r479, %72, %61; +} +{ +mul.f16x2 r482, r479, r324; +} +{ +sub.f16x2 r485, %89, %71; +} +{ +mul.f16x2 r488, r485, r327; +} +{ +add.f16x2 r491, r482, r488; +} +{ +add.f16x2 r494, r476, r491; +} +{ +add.f16x2 r497, %72, %61; +} +{ +mul.f16x2 r500, r497, r321; +} +{ +add.f16x2 r503, %80, r500; +} +{ +add.f16x2 r506, %89, %71; +} +{ +mul.f16x2 r509, r506, r323; +} +{ +add.f16x2 r512, r503, r509; +} +{ +sub.f16x2 r515, %97, %83; +} +{ +mul.f16x2 r518, r515, r322; +} +{ +sub.f16x2 r521, %62, %96; +} +{ +mul.f16x2 r524, r521, r324; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r512, r527; +} +{ +add.f16x2 r533, %72, %61; +} +{ +mul.f16x2 r536, r533, r321; +} +{ +add.f16x2 r539, %80, r536; +} +{ +add.f16x2 r542, %89, %71; +} +{ +mul.f16x2 r545, r542, r323; +} +{ +add.f16x2 r548, r539, r545; +} +{ +sub.f16x2 r551, %97, %83; +} +{ +mul.f16x2 r554, r551, r322; +} +{ +sub.f16x2 r557, %62, %96; +} +{ +mul.f16x2 r560, r557, r324; +} +{ +add.f16x2 r563, r554, r560; +} +{ +sub.f16x2 r566, r548, r563; +} +{ +add.f16x2 r569, %72, %61; +} +{ +mul.f16x2 r572, r569, r323; +} +{ +add.f16x2 r575, %80, r572; +} +{ +add.f16x2 r578, %89, %71; +} +{ +mul.f16x2 r581, r578, r325; +} +{ +add.f16x2 r584, r575, r581; +} +{ +sub.f16x2 r587, %97, %83; +} +{ +mul.f16x2 r590, r587, r324; +} +{ +sub.f16x2 r593, %62, %96; +} +{ +mul.f16x2 r596, r593, r327; +} +{ +add.f16x2 r599, r590, r596; +} +{ +add.f16x2 r602, r584, r599; +} +{ +add.f16x2 r605, %72, %61; +} +{ +mul.f16x2 r608, r605, r323; +} +{ +add.f16x2 r611, %80, r608; +} +{ +add.f16x2 r614, %89, %71; +} +{ +mul.f16x2 r617, r614, r325; +} +{ +add.f16x2 r620, r611, r617; +} +{ +sub.f16x2 r623, %97, %83; +} +{ +mul.f16x2 r626, r623, r324; +} +{ +sub.f16x2 r629, %62, %96; +} +{ +mul.f16x2 r632, r629, r327; +} +{ +add.f16x2 r635, r626, r632; +} +{ +sub.f16x2 r638, r620, r635; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r641, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r644, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +add.f16x2 r649, %99, %85; +} +{ +add.f16x2 r652, %56, r649; +} +{ +add.f16x2 r655, %64, %98; +} +{ +add.f16x2 r658, r652, r655; +} +{ +add.f16x2 r661, %74, %63; +} +{ +add.f16x2 r664, %82, r661; +} +{ +add.f16x2 r667, %91, %73; +} +{ +add.f16x2 r670, r664, r667; +} +{ +add.f16x2 r673, %99, %85; +} +{ +mul.f16x2 r676, r673, r641; +} +{ +add.f16x2 r679, %56, r676; +} +{ +add.f16x2 r682, %64, %98; +} +{ +mul.f16x2 r685, r682, r643; +} +{ +add.f16x2 r688, r679, r685; +} +{ +sub.f16x2 r691, %74, %63; +} +{ +mul.f16x2 r694, r691, r642; +} +{ +sub.f16x2 r697, %91, %73; +} +{ +mul.f16x2 r700, r697, r644; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r688, r703; +} +{ +add.f16x2 r709, %99, %85; +} +{ +mul.f16x2 r712, r709, r641; +} +{ +add.f16x2 r715, %56, r712; +} +{ +add.f16x2 r718, %64, %98; +} +{ +mul.f16x2 r721, r718, r643; +} +{ +add.f16x2 r724, r715, r721; +} +{ +sub.f16x2 r727, %74, %63; +} +{ +mul.f16x2 r730, r727, r642; +} +{ +sub.f16x2 r733, %91, %73; +} +{ +mul.f16x2 r736, r733, r644; +} +{ +add.f16x2 r739, r730, r736; +} +{ +add.f16x2 r742, r724, r739; +} +{ +add.f16x2 r745, %99, %85; +} +{ +mul.f16x2 r748, r745, r643; +} +{ +add.f16x2 r751, %56, r748; +} +{ +add.f16x2 r754, %64, %98; +} +{ +mul.f16x2 r757, r754, r645; +} +{ +add.f16x2 r760, r751, r757; +} +{ +sub.f16x2 r763, 
%74, %63; +} +{ +mul.f16x2 r766, r763, r644; +} +{ +sub.f16x2 r769, %91, %73; +} +{ +mul.f16x2 r772, r769, r647; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r760, r775; +} +{ +add.f16x2 r781, %99, %85; +} +{ +mul.f16x2 r784, r781, r643; +} +{ +add.f16x2 r787, %56, r784; +} +{ +add.f16x2 r790, %64, %98; +} +{ +mul.f16x2 r793, r790, r645; +} +{ +add.f16x2 r796, r787, r793; +} +{ +sub.f16x2 r799, %74, %63; +} +{ +mul.f16x2 r802, r799, r644; +} +{ +sub.f16x2 r805, %91, %73; +} +{ +mul.f16x2 r808, r805, r647; +} +{ +add.f16x2 r811, r802, r808; +} +{ +add.f16x2 r814, r796, r811; +} +{ +add.f16x2 r817, %74, %63; +} +{ +mul.f16x2 r820, r817, r641; +} +{ +add.f16x2 r823, %82, r820; +} +{ +add.f16x2 r826, %91, %73; +} +{ +mul.f16x2 r829, r826, r643; +} +{ +add.f16x2 r832, r823, r829; +} +{ +sub.f16x2 r835, %99, %85; +} +{ +mul.f16x2 r838, r835, r642; +} +{ +sub.f16x2 r841, %64, %98; +} +{ +mul.f16x2 r844, r841, r644; +} +{ +add.f16x2 r847, r838, r844; +} +{ +add.f16x2 r850, r832, r847; +} +{ +add.f16x2 r853, %74, %63; +} +{ +mul.f16x2 r856, r853, r641; +} +{ +add.f16x2 r859, %82, r856; +} +{ +add.f16x2 r862, %91, %73; +} +{ +mul.f16x2 r865, r862, r643; +} +{ +add.f16x2 r868, r859, r865; +} +{ +sub.f16x2 r871, %99, %85; +} +{ +mul.f16x2 r874, r871, r642; +} +{ +sub.f16x2 r877, %64, %98; +} +{ +mul.f16x2 r880, r877, r644; +} +{ +add.f16x2 r883, r874, r880; +} +{ +sub.f16x2 r886, r868, r883; +} +{ +add.f16x2 r889, %74, %63; +} +{ +mul.f16x2 r892, r889, r643; +} +{ +add.f16x2 r895, %82, r892; +} +{ +add.f16x2 r898, %91, %73; +} +{ +mul.f16x2 r901, r898, r645; +} +{ +add.f16x2 r904, r895, r901; +} +{ +sub.f16x2 r907, %99, %85; +} +{ +mul.f16x2 r910, r907, r644; +} +{ +sub.f16x2 r913, %64, %98; +} +{ +mul.f16x2 r916, r913, r647; +} +{ +add.f16x2 r919, r910, r916; +} +{ +add.f16x2 r922, r904, r919; +} +{ +add.f16x2 r925, %74, %63; +} +{ +mul.f16x2 r928, r925, r643; +} +{ +add.f16x2 r931, %82, r928; +} +{ +add.f16x2 r934, %91, %73; +} +{ +mul.f16x2 r937, r934, r645; +} 
+{ +add.f16x2 r940, r931, r937; +} +{ +sub.f16x2 r943, %99, %85; +} +{ +mul.f16x2 r946, r943, r644; +} +{ +sub.f16x2 r949, %64, %98; +} +{ +mul.f16x2 r952, r949, r647; +} +{ +add.f16x2 r955, r946, r952; +} +{ +sub.f16x2 r958, r940, r955; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r961, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r962, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r965, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r966, {low, high}; +} +{ +neg.f16x2 r967, r966; +} +{ +add.f16x2 r969, %51, %88; +} +{ +add.f16x2 r972, %57, r969; +} +{ +add.f16x2 r975, %66, %100; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, %76, %65; +} +{ +add.f16x2 r984, %84, r981; +} +{ +add.f16x2 r987, %92, %75; +} +{ +add.f16x2 r990, r984, r987; +} +{ +add.f16x2 r993, %51, %88; +} +{ +mul.f16x2 r996, r993, r961; +} +{ +add.f16x2 r999, %57, r996; +} +{ +add.f16x2 r1002, %66, %100; +} +{ +mul.f16x2 r1005, r1002, r963; +} +{ +add.f16x2 r1008, r999, r1005; +} +{ +sub.f16x2 r1011, %76, %65; +} +{ +mul.f16x2 r1014, r1011, r962; +} +{ +sub.f16x2 r1017, %92, %75; +} +{ +mul.f16x2 r1020, r1017, r964; +} +{ +add.f16x2 r1023, r1014, r1020; +} +{ +sub.f16x2 r1026, r1008, r1023; +} +{ +add.f16x2 r1029, %51, %88; +} +{ +mul.f16x2 r1032, r1029, r961; +} +{ +add.f16x2 r1035, %57, r1032; +} +{ +add.f16x2 r1038, %66, %100; +} +{ +mul.f16x2 r1041, r1038, r963; +} +{ +add.f16x2 r1044, r1035, r1041; +} +{ +sub.f16x2 r1047, %76, %65; +} +{ +mul.f16x2 r1050, r1047, r962; +} +{ +sub.f16x2 r1053, %92, %75; +} +{ +mul.f16x2 r1056, r1053, r964; +} +{ 
+add.f16x2 r1059, r1050, r1056; +} +{ +add.f16x2 r1062, r1044, r1059; +} +{ +add.f16x2 r1065, %51, %88; +} +{ +mul.f16x2 r1068, r1065, r963; +} +{ +add.f16x2 r1071, %57, r1068; +} +{ +add.f16x2 r1074, %66, %100; +} +{ +mul.f16x2 r1077, r1074, r965; +} +{ +add.f16x2 r1080, r1071, r1077; +} +{ +sub.f16x2 r1083, %76, %65; +} +{ +mul.f16x2 r1086, r1083, r964; +} +{ +sub.f16x2 r1089, %92, %75; +} +{ +mul.f16x2 r1092, r1089, r967; +} +{ +add.f16x2 r1095, r1086, r1092; +} +{ +sub.f16x2 r1098, r1080, r1095; +} +{ +add.f16x2 r1101, %51, %88; +} +{ +mul.f16x2 r1104, r1101, r963; +} +{ +add.f16x2 r1107, %57, r1104; +} +{ +add.f16x2 r1110, %66, %100; +} +{ +mul.f16x2 r1113, r1110, r965; +} +{ +add.f16x2 r1116, r1107, r1113; +} +{ +sub.f16x2 r1119, %76, %65; +} +{ +mul.f16x2 r1122, r1119, r964; +} +{ +sub.f16x2 r1125, %92, %75; +} +{ +mul.f16x2 r1128, r1125, r967; +} +{ +add.f16x2 r1131, r1122, r1128; +} +{ +add.f16x2 r1134, r1116, r1131; +} +{ +add.f16x2 r1137, %76, %65; +} +{ +mul.f16x2 r1140, r1137, r961; +} +{ +add.f16x2 r1143, %84, r1140; +} +{ +add.f16x2 r1146, %92, %75; +} +{ +mul.f16x2 r1149, r1146, r963; +} +{ +add.f16x2 r1152, r1143, r1149; +} +{ +sub.f16x2 r1155, %51, %88; +} +{ +mul.f16x2 r1158, r1155, r962; +} +{ +sub.f16x2 r1161, %66, %100; +} +{ +mul.f16x2 r1164, r1161, r964; +} +{ +add.f16x2 r1167, r1158, r1164; +} +{ +add.f16x2 r1170, r1152, r1167; +} +{ +add.f16x2 r1173, %76, %65; +} +{ +mul.f16x2 r1176, r1173, r961; +} +{ +add.f16x2 r1179, %84, r1176; +} +{ +add.f16x2 r1182, %92, %75; +} +{ +mul.f16x2 r1185, r1182, r963; +} +{ +add.f16x2 r1188, r1179, r1185; +} +{ +sub.f16x2 r1191, %51, %88; +} +{ +mul.f16x2 r1194, r1191, r962; +} +{ +sub.f16x2 r1197, %66, %100; +} +{ +mul.f16x2 r1200, r1197, r964; +} +{ +add.f16x2 r1203, r1194, r1200; +} +{ +sub.f16x2 r1206, r1188, r1203; +} +{ +add.f16x2 r1209, %76, %65; +} +{ +mul.f16x2 r1212, r1209, r963; +} +{ +add.f16x2 r1215, %84, r1212; +} +{ +add.f16x2 r1218, %92, %75; +} +{ +mul.f16x2 r1221, r1218, r965; +} +{ 
+add.f16x2 r1224, r1215, r1221; +} +{ +sub.f16x2 r1227, %51, %88; +} +{ +mul.f16x2 r1230, r1227, r964; +} +{ +sub.f16x2 r1233, %66, %100; +} +{ +mul.f16x2 r1236, r1233, r967; +} +{ +add.f16x2 r1239, r1230, r1236; +} +{ +add.f16x2 r1242, r1224, r1239; +} +{ +add.f16x2 r1245, %76, %65; +} +{ +mul.f16x2 r1248, r1245, r963; +} +{ +add.f16x2 r1251, %84, r1248; +} +{ +add.f16x2 r1254, %92, %75; +} +{ +mul.f16x2 r1257, r1254, r965; +} +{ +add.f16x2 r1260, r1251, r1257; +} +{ +sub.f16x2 r1263, %51, %88; +} +{ +mul.f16x2 r1266, r1263, r964; +} +{ +sub.f16x2 r1269, %66, %100; +} +{ +mul.f16x2 r1272, r1269, r967; +} +{ +add.f16x2 r1275, r1266, r1272; +} +{ +sub.f16x2 r1278, r1260, r1275; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1281, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1283, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1284, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1285, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1286, {low, high}; +} +{ +neg.f16x2 r1287, r1286; +} +{ +add.f16x2 r1289, %53, %90; +} +{ +add.f16x2 r1292, %59, r1289; +} +{ +add.f16x2 r1295, %68, %52; +} +{ +add.f16x2 r1298, r1292, r1295; +} +{ +add.f16x2 r1301, %78, %67; +} +{ +add.f16x2 r1304, %86, r1301; +} +{ +add.f16x2 r1307, %93, %77; +} +{ +add.f16x2 r1310, r1304, r1307; +} +{ +add.f16x2 r1313, %53, %90; +} +{ +mul.f16x2 r1316, r1313, r1281; +} +{ +add.f16x2 r1319, %59, r1316; +} +{ +add.f16x2 r1322, %68, %52; +} +{ +mul.f16x2 r1325, r1322, r1283; +} +{ +add.f16x2 r1328, r1319, r1325; +} +{ +sub.f16x2 r1331, %78, %67; +} +{ +mul.f16x2 r1334, r1331, r1282; +} +{ +sub.f16x2 r1337, %93, %77; +} 
+{ +mul.f16x2 r1340, r1337, r1284; +} +{ +add.f16x2 r1343, r1334, r1340; +} +{ +sub.f16x2 r1346, r1328, r1343; +} +{ +add.f16x2 r1349, %53, %90; +} +{ +mul.f16x2 r1352, r1349, r1281; +} +{ +add.f16x2 r1355, %59, r1352; +} +{ +add.f16x2 r1358, %68, %52; +} +{ +mul.f16x2 r1361, r1358, r1283; +} +{ +add.f16x2 r1364, r1355, r1361; +} +{ +sub.f16x2 r1367, %78, %67; +} +{ +mul.f16x2 r1370, r1367, r1282; +} +{ +sub.f16x2 r1373, %93, %77; +} +{ +mul.f16x2 r1376, r1373, r1284; +} +{ +add.f16x2 r1379, r1370, r1376; +} +{ +add.f16x2 r1382, r1364, r1379; +} +{ +add.f16x2 r1385, %53, %90; +} +{ +mul.f16x2 r1388, r1385, r1283; +} +{ +add.f16x2 r1391, %59, r1388; +} +{ +add.f16x2 r1394, %68, %52; +} +{ +mul.f16x2 r1397, r1394, r1285; +} +{ +add.f16x2 r1400, r1391, r1397; +} +{ +sub.f16x2 r1403, %78, %67; +} +{ +mul.f16x2 r1406, r1403, r1284; +} +{ +sub.f16x2 r1409, %93, %77; +} +{ +mul.f16x2 r1412, r1409, r1287; +} +{ +add.f16x2 r1415, r1406, r1412; +} +{ +sub.f16x2 r1418, r1400, r1415; +} +{ +add.f16x2 r1421, %53, %90; +} +{ +mul.f16x2 r1424, r1421, r1283; +} +{ +add.f16x2 r1427, %59, r1424; +} +{ +add.f16x2 r1430, %68, %52; +} +{ +mul.f16x2 r1433, r1430, r1285; +} +{ +add.f16x2 r1436, r1427, r1433; +} +{ +sub.f16x2 r1439, %78, %67; +} +{ +mul.f16x2 r1442, r1439, r1284; +} +{ +sub.f16x2 r1445, %93, %77; +} +{ +mul.f16x2 r1448, r1445, r1287; +} +{ +add.f16x2 r1451, r1442, r1448; +} +{ +add.f16x2 r1454, r1436, r1451; +} +{ +add.f16x2 r1457, %78, %67; +} +{ +mul.f16x2 r1460, r1457, r1281; +} +{ +add.f16x2 r1463, %86, r1460; +} +{ +add.f16x2 r1466, %93, %77; +} +{ +mul.f16x2 r1469, r1466, r1283; +} +{ +add.f16x2 r1472, r1463, r1469; +} +{ +sub.f16x2 r1475, %53, %90; +} +{ +mul.f16x2 r1478, r1475, r1282; +} +{ +sub.f16x2 r1481, %68, %52; +} +{ +mul.f16x2 r1484, r1481, r1284; +} +{ +add.f16x2 r1487, r1478, r1484; +} +{ +add.f16x2 r1490, r1472, r1487; +} +{ +add.f16x2 r1493, %78, %67; +} +{ +mul.f16x2 r1496, r1493, r1281; +} +{ +add.f16x2 r1499, %86, r1496; +} +{ +add.f16x2 r1502, %93, 
%77; +} +{ +mul.f16x2 r1505, r1502, r1283; +} +{ +add.f16x2 r1508, r1499, r1505; +} +{ +sub.f16x2 r1511, %53, %90; +} +{ +mul.f16x2 r1514, r1511, r1282; +} +{ +sub.f16x2 r1517, %68, %52; +} +{ +mul.f16x2 r1520, r1517, r1284; +} +{ +add.f16x2 r1523, r1514, r1520; +} +{ +sub.f16x2 r1526, r1508, r1523; +} +{ +add.f16x2 r1529, %78, %67; +} +{ +mul.f16x2 r1532, r1529, r1283; +} +{ +add.f16x2 r1535, %86, r1532; +} +{ +add.f16x2 r1538, %93, %77; +} +{ +mul.f16x2 r1541, r1538, r1285; +} +{ +add.f16x2 r1544, r1535, r1541; +} +{ +sub.f16x2 r1547, %53, %90; +} +{ +mul.f16x2 r1550, r1547, r1284; +} +{ +sub.f16x2 r1553, %68, %52; +} +{ +mul.f16x2 r1556, r1553, r1287; +} +{ +add.f16x2 r1559, r1550, r1556; +} +{ +add.f16x2 r1562, r1544, r1559; +} +{ +add.f16x2 r1565, %78, %67; +} +{ +mul.f16x2 r1568, r1565, r1283; +} +{ +add.f16x2 r1571, %86, r1568; +} +{ +add.f16x2 r1574, %93, %77; +} +{ +mul.f16x2 r1577, r1574, r1285; +} +{ +add.f16x2 r1580, r1571, r1577; +} +{ +sub.f16x2 r1583, %53, %90; +} +{ +mul.f16x2 r1586, r1583, r1284; +} +{ +sub.f16x2 r1589, %68, %52; +} +{ +mul.f16x2 r1592, r1589, r1287; +} +{ +add.f16x2 r1595, r1586, r1592; +} +{ +sub.f16x2 r1598, r1580, r1595; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1601, {low, high}; +} +mov.f32 f332, 0fBE7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1602, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1603, {low, high}; +} +mov.f32 f336, 0fBEF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1604, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1605, {low, high}; +} +mov.f32 f340, 0fBF2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1606, 
{low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1607, {low, high}; +} +mov.f32 f344, 0fBF5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1608, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1611, {low, high}; +} +mov.f32 f352, 0fBF7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1612, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1615, {low, high}; +} +mov.f32 f360, 0fBF67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1616, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1617, {low, high}; +} +mov.f32 f364, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1618, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1623, {low, high}; +} +mov.f32 f376, 0fBE00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1631, {low, high}; +} +mov.f32 f392, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1632, {low, high}; +} +{ +mul.f16x2 r1649, r386, r1601; +} +{ +mul.f16x2 r1652, r530, r1602; +} +{ +sub.f16x2 r1655, r1649, r1652; +} +{ +mul.f16x2 r1658, r386, r1602; +} +{ +fma.rn.f16x2 r1661, r530, r1601, r1658; +} +{ +mul.f16x2 r1665, r706, r1603; +} +{ +mul.f16x2 r1668, r850, r1604; +} +{ +sub.f16x2 r1671, r1665, r1668; +} +{ +mul.f16x2 r1674, r706, r1604; 
+} +{ +fma.rn.f16x2 r1677, r850, r1603, r1674; +} +{ +mul.f16x2 r1681, r1026, r1605; +} +{ +mul.f16x2 r1684, r1170, r1606; +} +{ +sub.f16x2 r1687, r1681, r1684; +} +{ +mul.f16x2 r1690, r1026, r1606; +} +{ +fma.rn.f16x2 r1693, r1170, r1605, r1690; +} +{ +mul.f16x2 r1697, r1346, r1607; +} +{ +mul.f16x2 r1700, r1490, r1608; +} +{ +sub.f16x2 r1703, r1697, r1700; +} +{ +mul.f16x2 r1706, r1346, r1608; +} +{ +fma.rn.f16x2 r1709, r1490, r1607, r1706; +} +{ +mul.f16x2 r1713, r458, r1603; +} +{ +mul.f16x2 r1716, r602, r1604; +} +{ +sub.f16x2 r1719, r1713, r1716; +} +{ +mul.f16x2 r1722, r458, r1604; +} +{ +fma.rn.f16x2 r1725, r602, r1603, r1722; +} +{ +mul.f16x2 r1729, r778, r1607; +} +{ +mul.f16x2 r1732, r922, r1608; +} +{ +sub.f16x2 r1735, r1729, r1732; +} +{ +mul.f16x2 r1738, r778, r1608; +} +{ +fma.rn.f16x2 r1741, r922, r1607, r1738; +} +{ +mul.f16x2 r1745, r1098, r1611; +} +{ +mul.f16x2 r1748, r1242, r1612; +} +{ +sub.f16x2 r1751, r1745, r1748; +} +{ +mul.f16x2 r1754, r1098, r1612; +} +{ +fma.rn.f16x2 r1757, r1242, r1611, r1754; +} +{ +mul.f16x2 r1761, r1418, r1615; +} +{ +mul.f16x2 r1764, r1562, r1616; +} +{ +sub.f16x2 r1767, r1761, r1764; +} +{ +mul.f16x2 r1770, r1418, r1616; +} +{ +fma.rn.f16x2 r1773, r1562, r1615, r1770; +} +{ +mul.f16x2 r1777, r494, r1605; +} +{ +mul.f16x2 r1780, r638, r1606; +} +{ +sub.f16x2 r1783, r1777, r1780; +} +{ +mul.f16x2 r1786, r494, r1606; +} +{ +fma.rn.f16x2 r1789, r638, r1605, r1786; +} +{ +mul.f16x2 r1793, r814, r1611; +} +{ +mul.f16x2 r1796, r958, r1612; +} +{ +sub.f16x2 r1799, r1793, r1796; +} +{ +mul.f16x2 r1802, r814, r1612; +} +{ +fma.rn.f16x2 r1805, r958, r1611, r1802; +} +{ +mul.f16x2 r1809, r1134, r1617; +} +{ +mul.f16x2 r1812, r1278, r1618; +} +{ +sub.f16x2 r1815, r1809, r1812; +} +{ +mul.f16x2 r1818, r1134, r1618; +} +{ +fma.rn.f16x2 r1821, r1278, r1617, r1818; +} +{ +mul.f16x2 r1825, r1454, r1623; +} +{ +mul.f16x2 r1828, r1598, r1624; +} +{ +sub.f16x2 r1831, r1825, r1828; +} +{ +mul.f16x2 r1834, r1454, r1624; +} +{ 
+fma.rn.f16x2 r1837, r1598, r1623, r1834; +} +{ +mul.f16x2 r1841, r422, r1607; +} +{ +mul.f16x2 r1844, r566, r1608; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r422, r1608; +} +{ +fma.rn.f16x2 r1853, r566, r1607, r1850; +} +{ +mul.f16x2 r1857, r742, r1615; +} +{ +mul.f16x2 r1860, r886, r1616; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r742, r1616; +} +{ +fma.rn.f16x2 r1869, r886, r1615, r1866; +} +{ +mul.f16x2 r1873, r1062, r1623; +} +{ +mul.f16x2 r1876, r1206, r1624; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1062, r1624; +} +{ +fma.rn.f16x2 r1885, r1206, r1623, r1882; +} +{ +mul.f16x2 r1889, r1382, r1631; +} +{ +mul.f16x2 r1892, r1526, r1632; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1382, r1632; +} +{ +fma.rn.f16x2 r1901, r1526, r1631, r1898; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1905, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1906, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1909, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1910, {low, high}; +} +{ +neg.f16x2 r1911, r1910; +} +{ +add.f16x2 r1913, r338, r1298; +} +{ +add.f16x2 r1916, r18, r1913; +} +{ +add.f16x2 r1919, r658, r978; +} +{ +add.f16x2 r1922, r1916, r1919; +} +{ +add.f16x2 r1925, r350, r1310; +} +{ +add.f16x2 r1928, r30, r1925; +} +{ +add.f16x2 r1931, r670, r990; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r338, r1298; +} +{ +mul.f16x2 r1940, r1937, r1905; +} +{ +add.f16x2 r1943, r18, r1940; +} +{ +add.f16x2 r1946, r658, r978; +} +{ +mul.f16x2 
r1949, r1946, r1907; +} +{ +add.f16x2 r1952, r1943, r1949; +} +{ +sub.f16x2 r1955, r350, r1310; +} +{ +mul.f16x2 r1958, r1955, r1906; +} +{ +sub.f16x2 r1961, r670, r990; +} +{ +mul.f16x2 r1964, r1961, r1908; +} +{ +add.f16x2 r1967, r1958, r1964; +} +{ +sub.f16x2 r1970, r1952, r1967; +} +{ +add.f16x2 r1973, r338, r1298; +} +{ +mul.f16x2 r1976, r1973, r1905; +} +{ +add.f16x2 r1979, r18, r1976; +} +{ +add.f16x2 r1982, r658, r978; +} +{ +mul.f16x2 r1985, r1982, r1907; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +sub.f16x2 r1991, r350, r1310; +} +{ +mul.f16x2 r1994, r1991, r1906; +} +{ +sub.f16x2 r1997, r670, r990; +} +{ +mul.f16x2 r2000, r1997, r1908; +} +{ +add.f16x2 r2003, r1994, r2000; +} +{ +add.f16x2 r2006, r1988, r2003; +} +{ +add.f16x2 r2009, r338, r1298; +} +{ +mul.f16x2 r2012, r2009, r1907; +} +{ +add.f16x2 r2015, r18, r2012; +} +{ +add.f16x2 r2018, r658, r978; +} +{ +mul.f16x2 r2021, r2018, r1909; +} +{ +add.f16x2 r2024, r2015, r2021; +} +{ +sub.f16x2 r2027, r350, r1310; +} +{ +mul.f16x2 r2030, r2027, r1908; +} +{ +sub.f16x2 r2033, r670, r990; +} +{ +mul.f16x2 r2036, r2033, r1911; +} +{ +add.f16x2 r2039, r2030, r2036; +} +{ +sub.f16x2 r2042, r2024, r2039; +} +{ +add.f16x2 r2045, r338, r1298; +} +{ +mul.f16x2 r2048, r2045, r1907; +} +{ +add.f16x2 r2051, r18, r2048; +} +{ +add.f16x2 r2054, r658, r978; +} +{ +mul.f16x2 r2057, r2054, r1909; +} +{ +add.f16x2 r2060, r2051, r2057; +} +{ +sub.f16x2 r2063, r350, r1310; +} +{ +mul.f16x2 r2066, r2063, r1908; +} +{ +sub.f16x2 r2069, r670, r990; +} +{ +mul.f16x2 r2072, r2069, r1911; +} +{ +add.f16x2 r2075, r2066, r2072; +} +{ +add.f16x2 r2078, r2060, r2075; +} +{ +add.f16x2 r2081, r350, r1310; +} +{ +mul.f16x2 r2084, r2081, r1905; +} +{ +add.f16x2 r2087, r30, r2084; +} +{ +add.f16x2 r2090, r670, r990; +} +{ +mul.f16x2 r2093, r2090, r1907; +} +{ +add.f16x2 r2096, r2087, r2093; +} +{ +sub.f16x2 r2099, r338, r1298; +} +{ +mul.f16x2 r2102, r2099, r1906; +} +{ +sub.f16x2 r2105, r658, r978; +} +{ +mul.f16x2 r2108, r2105, r1908; 
+} +{ +add.f16x2 r2111, r2102, r2108; +} +{ +add.f16x2 r2114, r2096, r2111; +} +{ +add.f16x2 r2117, r350, r1310; +} +{ +mul.f16x2 r2120, r2117, r1905; +} +{ +add.f16x2 r2123, r30, r2120; +} +{ +add.f16x2 r2126, r670, r990; +} +{ +mul.f16x2 r2129, r2126, r1907; +} +{ +add.f16x2 r2132, r2123, r2129; +} +{ +sub.f16x2 r2135, r338, r1298; +} +{ +mul.f16x2 r2138, r2135, r1906; +} +{ +sub.f16x2 r2141, r658, r978; +} +{ +mul.f16x2 r2144, r2141, r1908; +} +{ +add.f16x2 r2147, r2138, r2144; +} +{ +sub.f16x2 r2150, r2132, r2147; +} +{ +add.f16x2 r2153, r350, r1310; +} +{ +mul.f16x2 r2156, r2153, r1907; +} +{ +add.f16x2 r2159, r30, r2156; +} +{ +add.f16x2 r2162, r670, r990; +} +{ +mul.f16x2 r2165, r2162, r1909; +} +{ +add.f16x2 r2168, r2159, r2165; +} +{ +sub.f16x2 r2171, r338, r1298; +} +{ +mul.f16x2 r2174, r2171, r1908; +} +{ +sub.f16x2 r2177, r658, r978; +} +{ +mul.f16x2 r2180, r2177, r1911; +} +{ +add.f16x2 r2183, r2174, r2180; +} +{ +add.f16x2 r2186, r2168, r2183; +} +{ +add.f16x2 r2189, r350, r1310; +} +{ +mul.f16x2 r2192, r2189, r1907; +} +{ +add.f16x2 r2195, r30, r2192; +} +{ +add.f16x2 r2198, r670, r990; +} +{ +mul.f16x2 r2201, r2198, r1909; +} +{ +add.f16x2 r2204, r2195, r2201; +} +{ +sub.f16x2 r2207, r338, r1298; +} +{ +mul.f16x2 r2210, r2207, r1908; +} +{ +sub.f16x2 r2213, r658, r978; +} +{ +mul.f16x2 r2216, r2213, r1911; +} +{ +add.f16x2 r2219, r2210, r2216; +} +{ +sub.f16x2 r2222, r2204, r2219; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2225, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2226, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2227, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2228, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2229, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2230, {low, high}; +} +{ +neg.f16x2 r2231, r2230; +} +{ +add.f16x2 r2233, r1655, r1703; +} +{ +add.f16x2 r2236, r66, r2233; +} +{ +add.f16x2 r2239, r1671, r1687; +} +{ +add.f16x2 r2242, r2236, r2239; +} +{ +add.f16x2 r2245, r1661, r1709; +} +{ +add.f16x2 r2248, r210, r2245; +} +{ +add.f16x2 r2251, r1677, r1693; +} +{ +add.f16x2 r2254, r2248, r2251; +} +{ +add.f16x2 r2257, r1655, r1703; +} +{ +mul.f16x2 r2260, r2257, r2225; +} +{ +add.f16x2 r2263, r66, r2260; +} +{ +add.f16x2 r2266, r1671, r1687; +} +{ +mul.f16x2 r2269, r2266, r2227; +} +{ +add.f16x2 r2272, r2263, r2269; +} +{ +sub.f16x2 r2275, r1661, r1709; +} +{ +mul.f16x2 r2278, r2275, r2226; +} +{ +sub.f16x2 r2281, r1677, r1693; +} +{ +mul.f16x2 r2284, r2281, r2228; +} +{ +add.f16x2 r2287, r2278, r2284; +} +{ +sub.f16x2 r2290, r2272, r2287; +} +{ +add.f16x2 r2293, r1655, r1703; +} +{ +mul.f16x2 r2296, r2293, r2225; +} +{ +add.f16x2 r2299, r66, r2296; +} +{ +add.f16x2 r2302, r1671, r1687; +} +{ +mul.f16x2 r2305, r2302, r2227; +} +{ +add.f16x2 r2308, r2299, r2305; +} +{ +sub.f16x2 r2311, r1661, r1709; +} +{ +mul.f16x2 r2314, r2311, r2226; +} +{ +sub.f16x2 r2317, r1677, r1693; +} +{ +mul.f16x2 r2320, r2317, r2228; +} +{ +add.f16x2 r2323, r2314, r2320; +} +{ +add.f16x2 r2326, r2308, r2323; +} +{ +add.f16x2 r2329, r1655, r1703; +} +{ +mul.f16x2 r2332, r2329, r2227; +} +{ +add.f16x2 r2335, r66, r2332; +} +{ +add.f16x2 r2338, r1671, r1687; +} +{ +mul.f16x2 r2341, r2338, r2229; +} +{ +add.f16x2 r2344, r2335, r2341; +} +{ +sub.f16x2 r2347, r1661, r1709; +} +{ +mul.f16x2 r2350, r2347, r2228; +} +{ +sub.f16x2 r2353, r1677, r1693; +} +{ +mul.f16x2 r2356, r2353, r2231; +} +{ +add.f16x2 r2359, r2350, r2356; +} +{ +sub.f16x2 r2362, r2344, r2359; +} +{ +add.f16x2 r2365, r1655, r1703; +} +{ +mul.f16x2 r2368, r2365, r2227; +} +{ +add.f16x2 r2371, r66, r2368; +} +{ +add.f16x2 r2374, r1671, r1687; +} +{ +mul.f16x2 r2377, r2374, r2229; +} +{ +add.f16x2 
r2380, r2371, r2377; +} +{ +sub.f16x2 r2383, r1661, r1709; +} +{ +mul.f16x2 r2386, r2383, r2228; +} +{ +sub.f16x2 r2389, r1677, r1693; +} +{ +mul.f16x2 r2392, r2389, r2231; +} +{ +add.f16x2 r2395, r2386, r2392; +} +{ +add.f16x2 r2398, r2380, r2395; +} +{ +add.f16x2 r2401, r1661, r1709; +} +{ +mul.f16x2 r2404, r2401, r2225; +} +{ +add.f16x2 r2407, r210, r2404; +} +{ +add.f16x2 r2410, r1677, r1693; +} +{ +mul.f16x2 r2413, r2410, r2227; +} +{ +add.f16x2 r2416, r2407, r2413; +} +{ +sub.f16x2 r2419, r1655, r1703; +} +{ +mul.f16x2 r2422, r2419, r2226; +} +{ +sub.f16x2 r2425, r1671, r1687; +} +{ +mul.f16x2 r2428, r2425, r2228; +} +{ +add.f16x2 r2431, r2422, r2428; +} +{ +add.f16x2 r2434, r2416, r2431; +} +{ +add.f16x2 r2437, r1661, r1709; +} +{ +mul.f16x2 r2440, r2437, r2225; +} +{ +add.f16x2 r2443, r210, r2440; +} +{ +add.f16x2 r2446, r1677, r1693; +} +{ +mul.f16x2 r2449, r2446, r2227; +} +{ +add.f16x2 r2452, r2443, r2449; +} +{ +sub.f16x2 r2455, r1655, r1703; +} +{ +mul.f16x2 r2458, r2455, r2226; +} +{ +sub.f16x2 r2461, r1671, r1687; +} +{ +mul.f16x2 r2464, r2461, r2228; +} +{ +add.f16x2 r2467, r2458, r2464; +} +{ +sub.f16x2 r2470, r2452, r2467; +} +{ +add.f16x2 r2473, r1661, r1709; +} +{ +mul.f16x2 r2476, r2473, r2227; +} +{ +add.f16x2 r2479, r210, r2476; +} +{ +add.f16x2 r2482, r1677, r1693; +} +{ +mul.f16x2 r2485, r2482, r2229; +} +{ +add.f16x2 r2488, r2479, r2485; +} +{ +sub.f16x2 r2491, r1655, r1703; +} +{ +mul.f16x2 r2494, r2491, r2228; +} +{ +sub.f16x2 r2497, r1671, r1687; +} +{ +mul.f16x2 r2500, r2497, r2231; +} +{ +add.f16x2 r2503, r2494, r2500; +} +{ +add.f16x2 r2506, r2488, r2503; +} +{ +add.f16x2 r2509, r1661, r1709; +} +{ +mul.f16x2 r2512, r2509, r2227; +} +{ +add.f16x2 r2515, r210, r2512; +} +{ +add.f16x2 r2518, r1677, r1693; +} +{ +mul.f16x2 r2521, r2518, r2229; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +sub.f16x2 r2527, r1655, r1703; +} +{ +mul.f16x2 r2530, r2527, r2228; +} +{ +sub.f16x2 r2533, r1671, r1687; +} +{ +mul.f16x2 r2536, r2533, r2231; +} +{ 
+add.f16x2 r2539, r2530, r2536; +} +{ +sub.f16x2 r2542, r2524, r2539; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2545, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2546, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2547, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2548, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2549, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2550, {low, high}; +} +{ +neg.f16x2 r2551, r2550; +} +{ +add.f16x2 r2553, r1719, r1767; +} +{ +add.f16x2 r2556, r138, r2553; +} +{ +add.f16x2 r2559, r1735, r1751; +} +{ +add.f16x2 r2562, r2556, r2559; +} +{ +add.f16x2 r2565, r1725, r1773; +} +{ +add.f16x2 r2568, r282, r2565; +} +{ +add.f16x2 r2571, r1741, r1757; +} +{ +add.f16x2 r2574, r2568, r2571; +} +{ +add.f16x2 r2577, r1719, r1767; +} +{ +mul.f16x2 r2580, r2577, r2545; +} +{ +add.f16x2 r2583, r138, r2580; +} +{ +add.f16x2 r2586, r1735, r1751; +} +{ +mul.f16x2 r2589, r2586, r2547; +} +{ +add.f16x2 r2592, r2583, r2589; +} +{ +sub.f16x2 r2595, r1725, r1773; +} +{ +mul.f16x2 r2598, r2595, r2546; +} +{ +sub.f16x2 r2601, r1741, r1757; +} +{ +mul.f16x2 r2604, r2601, r2548; +} +{ +add.f16x2 r2607, r2598, r2604; +} +{ +sub.f16x2 r2610, r2592, r2607; +} +{ +add.f16x2 r2613, r1719, r1767; +} +{ +mul.f16x2 r2616, r2613, r2545; +} +{ +add.f16x2 r2619, r138, r2616; +} +{ +add.f16x2 r2622, r1735, r1751; +} +{ +mul.f16x2 r2625, r2622, r2547; +} +{ +add.f16x2 r2628, r2619, r2625; +} +{ +sub.f16x2 r2631, r1725, r1773; +} +{ +mul.f16x2 r2634, r2631, r2546; +} +{ +sub.f16x2 r2637, r1741, r1757; +} +{ +mul.f16x2 r2640, r2637, r2548; +} +{ +add.f16x2 r2643, r2634, r2640; +} +{ +add.f16x2 r2646, r2628, r2643; +} +{ 
+add.f16x2 r2649, r1719, r1767; +} +{ +mul.f16x2 r2652, r2649, r2547; +} +{ +add.f16x2 r2655, r138, r2652; +} +{ +add.f16x2 r2658, r1735, r1751; +} +{ +mul.f16x2 r2661, r2658, r2549; +} +{ +add.f16x2 r2664, r2655, r2661; +} +{ +sub.f16x2 r2667, r1725, r1773; +} +{ +mul.f16x2 r2670, r2667, r2548; +} +{ +sub.f16x2 r2673, r1741, r1757; +} +{ +mul.f16x2 r2676, r2673, r2551; +} +{ +add.f16x2 r2679, r2670, r2676; +} +{ +sub.f16x2 r2682, r2664, r2679; +} +{ +add.f16x2 r2685, r1719, r1767; +} +{ +mul.f16x2 r2688, r2685, r2547; +} +{ +add.f16x2 r2691, r138, r2688; +} +{ +add.f16x2 r2694, r1735, r1751; +} +{ +mul.f16x2 r2697, r2694, r2549; +} +{ +add.f16x2 r2700, r2691, r2697; +} +{ +sub.f16x2 r2703, r1725, r1773; +} +{ +mul.f16x2 r2706, r2703, r2548; +} +{ +sub.f16x2 r2709, r1741, r1757; +} +{ +mul.f16x2 r2712, r2709, r2551; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2700, r2715; +} +{ +add.f16x2 r2721, r1725, r1773; +} +{ +mul.f16x2 r2724, r2721, r2545; +} +{ +add.f16x2 r2727, r282, r2724; +} +{ +add.f16x2 r2730, r1741, r1757; +} +{ +mul.f16x2 r2733, r2730, r2547; +} +{ +add.f16x2 r2736, r2727, r2733; +} +{ +sub.f16x2 r2739, r1719, r1767; +} +{ +mul.f16x2 r2742, r2739, r2546; +} +{ +sub.f16x2 r2745, r1735, r1751; +} +{ +mul.f16x2 r2748, r2745, r2548; +} +{ +add.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2736, r2751; +} +{ +add.f16x2 r2757, r1725, r1773; +} +{ +mul.f16x2 r2760, r2757, r2545; +} +{ +add.f16x2 r2763, r282, r2760; +} +{ +add.f16x2 r2766, r1741, r1757; +} +{ +mul.f16x2 r2769, r2766, r2547; +} +{ +add.f16x2 r2772, r2763, r2769; +} +{ +sub.f16x2 r2775, r1719, r1767; +} +{ +mul.f16x2 r2778, r2775, r2546; +} +{ +sub.f16x2 r2781, r1735, r1751; +} +{ +mul.f16x2 r2784, r2781, r2548; +} +{ +add.f16x2 r2787, r2778, r2784; +} +{ +sub.f16x2 r2790, r2772, r2787; +} +{ +add.f16x2 r2793, r1725, r1773; +} +{ +mul.f16x2 r2796, r2793, r2547; +} +{ +add.f16x2 r2799, r282, r2796; +} +{ +add.f16x2 r2802, r1741, r1757; +} +{ +mul.f16x2 r2805, r2802, 
r2549; +} +{ +add.f16x2 r2808, r2799, r2805; +} +{ +sub.f16x2 r2811, r1719, r1767; +} +{ +mul.f16x2 r2814, r2811, r2548; +} +{ +sub.f16x2 r2817, r1735, r1751; +} +{ +mul.f16x2 r2820, r2817, r2551; +} +{ +add.f16x2 r2823, r2814, r2820; +} +{ +add.f16x2 r2826, r2808, r2823; +} +{ +add.f16x2 r2829, r1725, r1773; +} +{ +mul.f16x2 r2832, r2829, r2547; +} +{ +add.f16x2 r2835, r282, r2832; +} +{ +add.f16x2 r2838, r1741, r1757; +} +{ +mul.f16x2 r2841, r2838, r2549; +} +{ +add.f16x2 r2844, r2835, r2841; +} +{ +sub.f16x2 r2847, r1719, r1767; +} +{ +mul.f16x2 r2850, r2847, r2548; +} +{ +sub.f16x2 r2853, r1735, r1751; +} +{ +mul.f16x2 r2856, r2853, r2551; +} +{ +add.f16x2 r2859, r2850, r2856; +} +{ +sub.f16x2 r2862, r2844, r2859; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2865, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2866, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2868, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2869, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2870, {low, high}; +} +{ +neg.f16x2 r2871, r2870; +} +{ +add.f16x2 r2873, r1783, r1831; +} +{ +add.f16x2 r2876, r174, r2873; +} +{ +add.f16x2 r2879, r1799, r1815; +} +{ +add.f16x2 r2882, r2876, r2879; +} +{ +add.f16x2 r2885, r1789, r1837; +} +{ +add.f16x2 r2888, r318, r2885; +} +{ +add.f16x2 r2891, r1805, r1821; +} +{ +add.f16x2 r2894, r2888, r2891; +} +{ +add.f16x2 r2897, r1783, r1831; +} +{ +mul.f16x2 r2900, r2897, r2865; +} +{ +add.f16x2 r2903, r174, r2900; +} +{ +add.f16x2 r2906, r1799, r1815; +} +{ +mul.f16x2 r2909, r2906, r2867; +} +{ +add.f16x2 r2912, r2903, r2909; +} +{ +sub.f16x2 r2915, r1789, r1837; 
+} +{ +mul.f16x2 r2918, r2915, r2866; +} +{ +sub.f16x2 r2921, r1805, r1821; +} +{ +mul.f16x2 r2924, r2921, r2868; +} +{ +add.f16x2 r2927, r2918, r2924; +} +{ +sub.f16x2 r2930, r2912, r2927; +} +{ +add.f16x2 r2933, r1783, r1831; +} +{ +mul.f16x2 r2936, r2933, r2865; +} +{ +add.f16x2 r2939, r174, r2936; +} +{ +add.f16x2 r2942, r1799, r1815; +} +{ +mul.f16x2 r2945, r2942, r2867; +} +{ +add.f16x2 r2948, r2939, r2945; +} +{ +sub.f16x2 r2951, r1789, r1837; +} +{ +mul.f16x2 r2954, r2951, r2866; +} +{ +sub.f16x2 r2957, r1805, r1821; +} +{ +mul.f16x2 r2960, r2957, r2868; +} +{ +add.f16x2 r2963, r2954, r2960; +} +{ +add.f16x2 r2966, r2948, r2963; +} +{ +add.f16x2 r2969, r1783, r1831; +} +{ +mul.f16x2 r2972, r2969, r2867; +} +{ +add.f16x2 r2975, r174, r2972; +} +{ +add.f16x2 r2978, r1799, r1815; +} +{ +mul.f16x2 r2981, r2978, r2869; +} +{ +add.f16x2 r2984, r2975, r2981; +} +{ +sub.f16x2 r2987, r1789, r1837; +} +{ +mul.f16x2 r2990, r2987, r2868; +} +{ +sub.f16x2 r2993, r1805, r1821; +} +{ +mul.f16x2 r2996, r2993, r2871; +} +{ +add.f16x2 r2999, r2990, r2996; +} +{ +sub.f16x2 r3002, r2984, r2999; +} +{ +add.f16x2 r3005, r1783, r1831; +} +{ +mul.f16x2 r3008, r3005, r2867; +} +{ +add.f16x2 r3011, r174, r3008; +} +{ +add.f16x2 r3014, r1799, r1815; +} +{ +mul.f16x2 r3017, r3014, r2869; +} +{ +add.f16x2 r3020, r3011, r3017; +} +{ +sub.f16x2 r3023, r1789, r1837; +} +{ +mul.f16x2 r3026, r3023, r2868; +} +{ +sub.f16x2 r3029, r1805, r1821; +} +{ +mul.f16x2 r3032, r3029, r2871; +} +{ +add.f16x2 r3035, r3026, r3032; +} +{ +add.f16x2 r3038, r3020, r3035; +} +{ +add.f16x2 r3041, r1789, r1837; +} +{ +mul.f16x2 r3044, r3041, r2865; +} +{ +add.f16x2 r3047, r318, r3044; +} +{ +add.f16x2 r3050, r1805, r1821; +} +{ +mul.f16x2 r3053, r3050, r2867; +} +{ +add.f16x2 r3056, r3047, r3053; +} +{ +sub.f16x2 r3059, r1783, r1831; +} +{ +mul.f16x2 r3062, r3059, r2866; +} +{ +sub.f16x2 r3065, r1799, r1815; +} +{ +mul.f16x2 r3068, r3065, r2868; +} +{ +add.f16x2 r3071, r3062, r3068; +} +{ +add.f16x2 r3074, 
r3056, r3071; +} +{ +add.f16x2 r3077, r1789, r1837; +} +{ +mul.f16x2 r3080, r3077, r2865; +} +{ +add.f16x2 r3083, r318, r3080; +} +{ +add.f16x2 r3086, r1805, r1821; +} +{ +mul.f16x2 r3089, r3086, r2867; +} +{ +add.f16x2 r3092, r3083, r3089; +} +{ +sub.f16x2 r3095, r1783, r1831; +} +{ +mul.f16x2 r3098, r3095, r2866; +} +{ +sub.f16x2 r3101, r1799, r1815; +} +{ +mul.f16x2 r3104, r3101, r2868; +} +{ +add.f16x2 r3107, r3098, r3104; +} +{ +sub.f16x2 r3110, r3092, r3107; +} +{ +add.f16x2 r3113, r1789, r1837; +} +{ +mul.f16x2 r3116, r3113, r2867; +} +{ +add.f16x2 r3119, r318, r3116; +} +{ +add.f16x2 r3122, r1805, r1821; +} +{ +mul.f16x2 r3125, r3122, r2869; +} +{ +add.f16x2 r3128, r3119, r3125; +} +{ +sub.f16x2 r3131, r1783, r1831; +} +{ +mul.f16x2 r3134, r3131, r2868; +} +{ +sub.f16x2 r3137, r1799, r1815; +} +{ +mul.f16x2 r3140, r3137, r2871; +} +{ +add.f16x2 r3143, r3134, r3140; +} +{ +add.f16x2 r3146, r3128, r3143; +} +{ +add.f16x2 r3149, r1789, r1837; +} +{ +mul.f16x2 r3152, r3149, r2867; +} +{ +add.f16x2 r3155, r318, r3152; +} +{ +add.f16x2 r3158, r1805, r1821; +} +{ +mul.f16x2 r3161, r3158, r2869; +} +{ +add.f16x2 r3164, r3155, r3161; +} +{ +sub.f16x2 r3167, r1783, r1831; +} +{ +mul.f16x2 r3170, r3167, r2868; +} +{ +sub.f16x2 r3173, r1799, r1815; +} +{ +mul.f16x2 r3176, r3173, r2871; +} +{ +add.f16x2 r3179, r3170, r3176; +} +{ +sub.f16x2 r3182, r3164, r3179; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3185, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3186, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3187, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r3188, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3189, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; 
+cvt.rn.f16.f32 high, f484; +mov.b32 r3190, {low, high}; +} +{ +neg.f16x2 r3191, r3190; +} +{ +add.f16x2 r3193, r1847, r1895; +} +{ +add.f16x2 r3196, r102, r3193; +} +{ +add.f16x2 r3199, r1863, r1879; +} +{ +add.f16x2 r3202, r3196, r3199; +} +{ +add.f16x2 r3205, r1853, r1901; +} +{ +add.f16x2 r3208, r246, r3205; +} +{ +add.f16x2 r3211, r1869, r1885; +} +{ +add.f16x2 r3214, r3208, r3211; +} +{ +add.f16x2 r3217, r1847, r1895; +} +{ +mul.f16x2 r3220, r3217, r3185; +} +{ +add.f16x2 r3223, r102, r3220; +} +{ +add.f16x2 r3226, r1863, r1879; +} +{ +mul.f16x2 r3229, r3226, r3187; +} +{ +add.f16x2 r3232, r3223, r3229; +} +{ +sub.f16x2 r3235, r1853, r1901; +} +{ +mul.f16x2 r3238, r3235, r3186; +} +{ +sub.f16x2 r3241, r1869, r1885; +} +{ +mul.f16x2 r3244, r3241, r3188; +} +{ +add.f16x2 r3247, r3238, r3244; +} +{ +sub.f16x2 r3250, r3232, r3247; +} +{ +add.f16x2 r3253, r1847, r1895; +} +{ +mul.f16x2 r3256, r3253, r3185; +} +{ +add.f16x2 r3259, r102, r3256; +} +{ +add.f16x2 r3262, r1863, r1879; +} +{ +mul.f16x2 r3265, r3262, r3187; +} +{ +add.f16x2 r3268, r3259, r3265; +} +{ +sub.f16x2 r3271, r1853, r1901; +} +{ +mul.f16x2 r3274, r3271, r3186; +} +{ +sub.f16x2 r3277, r1869, r1885; +} +{ +mul.f16x2 r3280, r3277, r3188; +} +{ +add.f16x2 r3283, r3274, r3280; +} +{ +add.f16x2 r3286, r3268, r3283; +} +{ +add.f16x2 r3289, r1847, r1895; +} +{ +mul.f16x2 r3292, r3289, r3187; +} +{ +add.f16x2 r3295, r102, r3292; +} +{ +add.f16x2 r3298, r1863, r1879; +} +{ +mul.f16x2 r3301, r3298, r3189; +} +{ +add.f16x2 r3304, r3295, r3301; +} +{ +sub.f16x2 r3307, r1853, r1901; +} +{ +mul.f16x2 r3310, r3307, r3188; +} +{ +sub.f16x2 r3313, r1869, r1885; +} +{ +mul.f16x2 r3316, r3313, r3191; +} +{ +add.f16x2 r3319, r3310, r3316; +} +{ +sub.f16x2 r3322, r3304, r3319; +} +{ +add.f16x2 r3325, r1847, r1895; +} +{ +mul.f16x2 r3328, r3325, r3187; +} +{ +add.f16x2 r3331, r102, r3328; +} +{ +add.f16x2 r3334, r1863, r1879; +} +{ +mul.f16x2 r3337, r3334, r3189; +} +{ +add.f16x2 r3340, r3331, r3337; +} +{ +sub.f16x2 
r3343, r1853, r1901; +} +{ +mul.f16x2 r3346, r3343, r3188; +} +{ +sub.f16x2 r3349, r1869, r1885; +} +{ +mul.f16x2 r3352, r3349, r3191; +} +{ +add.f16x2 r3355, r3346, r3352; +} +{ +add.f16x2 r3358, r3340, r3355; +} +{ +add.f16x2 r3361, r1853, r1901; +} +{ +mul.f16x2 r3364, r3361, r3185; +} +{ +add.f16x2 r3367, r246, r3364; +} +{ +add.f16x2 r3370, r1869, r1885; +} +{ +mul.f16x2 r3373, r3370, r3187; +} +{ +add.f16x2 r3376, r3367, r3373; +} +{ +sub.f16x2 r3379, r1847, r1895; +} +{ +mul.f16x2 r3382, r3379, r3186; +} +{ +sub.f16x2 r3385, r1863, r1879; +} +{ +mul.f16x2 r3388, r3385, r3188; +} +{ +add.f16x2 r3391, r3382, r3388; +} +{ +add.f16x2 r3394, r3376, r3391; +} +{ +add.f16x2 r3397, r1853, r1901; +} +{ +mul.f16x2 r3400, r3397, r3185; +} +{ +add.f16x2 r3403, r246, r3400; +} +{ +add.f16x2 r3406, r1869, r1885; +} +{ +mul.f16x2 r3409, r3406, r3187; +} +{ +add.f16x2 r3412, r3403, r3409; +} +{ +sub.f16x2 r3415, r1847, r1895; +} +{ +mul.f16x2 r3418, r3415, r3186; +} +{ +sub.f16x2 r3421, r1863, r1879; +} +{ +mul.f16x2 r3424, r3421, r3188; +} +{ +add.f16x2 r3427, r3418, r3424; +} +{ +sub.f16x2 r3430, r3412, r3427; +} +{ +add.f16x2 r3433, r1853, r1901; +} +{ +mul.f16x2 r3436, r3433, r3187; +} +{ +add.f16x2 r3439, r246, r3436; +} +{ +add.f16x2 r3442, r1869, r1885; +} +{ +mul.f16x2 r3445, r3442, r3189; +} +{ +add.f16x2 r3448, r3439, r3445; +} +{ +sub.f16x2 r3451, r1847, r1895; +} +{ +mul.f16x2 r3454, r3451, r3188; +} +{ +sub.f16x2 r3457, r1863, r1879; +} +{ +mul.f16x2 r3460, r3457, r3191; +} +{ +add.f16x2 r3463, r3454, r3460; +} +{ +add.f16x2 r3466, r3448, r3463; +} +{ +add.f16x2 r3469, r1853, r1901; +} +{ +mul.f16x2 r3472, r3469, r3187; +} +{ +add.f16x2 r3475, r246, r3472; +} +{ +add.f16x2 r3478, r1869, r1885; +} +{ +mul.f16x2 r3481, r3478, r3189; +} +{ +add.f16x2 r3484, r3475, r3481; +} +{ +sub.f16x2 r3487, r1847, r1895; +} +{ +mul.f16x2 r3490, r3487, r3188; +} +{ +sub.f16x2 r3493, r1863, r1879; +} +{ +mul.f16x2 r3496, r3493, r3191; +} +{ +add.f16x2 r3499, r3490, r3496; +} +{ 
+sub.f16x2 r3502, r3484, r3499; +} +mul.wide.u32 rd2, r7903, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r7904, rd3; +mul.lo.s32 r7905, r7904, 25; +sub.s32 r7906, r7903, r7905; +mad.lo.s32 r7907, r7904, 2500, r7902; +cvt.rn.f32.u32 f485, r7906; +mul.f32 f486, f485, 0f3C24B5BE; +cos.approx.f32 f217, f486; +sin.approx.f32 f487, f486; +neg.f32 f218, f487; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3505, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3508, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3510, {high, high}; +} +{ +mul.f16x2 r3512, r2254, r3510; +} +{ +neg.f16x2 r3515, r3512; +} +{ +fma.rn.f16x2 r3517, r2242, r3508, r3515; +} +{ +mul.f16x2 r3521, r2242, r3510; +} +{ +fma.rn.f16x2 r3524, r2254, r3508, r3521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3530, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3532, {low, high}; +} +{ +mul.f16x2 r3533, r3530, r3532; +} +{ +mul.f16x2 r3536, r3505, r3528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3539, {high, low}; +} +{ +fma.rn.f16x2 r3541, r3533, r3539, r3536; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3545, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3547, {high, high}; +} +{ +mul.f16x2 r3549, r2574, r3547; +} +{ +neg.f16x2 r3552, r3549; +} +{ +fma.rn.f16x2 r3554, r2562, r3545, r3552; +} +{ +mul.f16x2 r3558, r2562, r3547; +} +{ +fma.rn.f16x2 r3561, r2574, r3545, r3558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3567, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; 
+cvt.rn.f16.f32 high, f266; +mov.b32 r3569, {low, high}; +} +{ +mul.f16x2 r3570, r3567, r3569; +} +{ +mul.f16x2 r3573, r3541, r3565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3541; +mov.b32 r3576, {high, low}; +} +{ +fma.rn.f16x2 r3578, r3570, r3576, r3573; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3582, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3584, {high, high}; +} +{ +mul.f16x2 r3586, r2894, r3584; +} +{ +neg.f16x2 r3589, r3586; +} +{ +fma.rn.f16x2 r3591, r2882, r3582, r3589; +} +{ +mul.f16x2 r3595, r2882, r3584; +} +{ +fma.rn.f16x2 r3598, r2894, r3582, r3595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3604, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3606, {low, high}; +} +{ +mul.f16x2 r3607, r3604, r3606; +} +{ +mul.f16x2 r3610, r3578, r3602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3578; +mov.b32 r3613, {high, low}; +} +{ +fma.rn.f16x2 r3615, r3607, r3613, r3610; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3619, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3621, {high, high}; +} +{ +mul.f16x2 r3623, r3214, r3621; +} +{ +neg.f16x2 r3626, r3623; +} +{ +fma.rn.f16x2 r3628, r3202, r3619, r3626; +} +{ +mul.f16x2 r3632, r3202, r3621; +} +{ +fma.rn.f16x2 r3635, r3214, r3619, r3632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3641, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3643, {low, high}; +} +{ +mul.f16x2 r3644, r3641, r3643; +} +{ +mul.f16x2 r3647, r3615, r3639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3615; +mov.b32 r3650, {high, low}; +} +{ +fma.rn.f16x2 r3652, r3644, r3650, 
r3647; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3656, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3658, {high, high}; +} +{ +mul.f16x2 r3660, r2114, r3658; +} +{ +neg.f16x2 r3663, r3660; +} +{ +fma.rn.f16x2 r3665, r1970, r3656, r3663; +} +{ +mul.f16x2 r3669, r1970, r3658; +} +{ +fma.rn.f16x2 r3672, r2114, r3656, r3669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3678, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3680, {low, high}; +} +{ +mul.f16x2 r3681, r3678, r3680; +} +{ +mul.f16x2 r3684, r3652, r3676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3652; +mov.b32 r3687, {high, low}; +} +{ +fma.rn.f16x2 r3689, r3681, r3687, r3684; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3693, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3695, {high, high}; +} +{ +mul.f16x2 r3697, r2434, r3695; +} +{ +neg.f16x2 r3700, r3697; +} +{ +fma.rn.f16x2 r3702, r2290, r3693, r3700; +} +{ +mul.f16x2 r3706, r2290, r3695; +} +{ +fma.rn.f16x2 r3709, r2434, r3693, r3706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3715, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3717, {low, high}; +} +{ +mul.f16x2 r3718, r3715, r3717; +} +{ +mul.f16x2 r3721, r3689, r3713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3689; +mov.b32 r3724, {high, low}; +} +{ +fma.rn.f16x2 r3726, r3718, r3724, r3721; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3730, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3732, {high, high}; +} +{ +mul.f16x2 r3734, r2754, r3732; +} +{ +neg.f16x2 r3737, r3734; +} +{ 
+fma.rn.f16x2 r3739, r2610, r3730, r3737; +} +{ +mul.f16x2 r3743, r2610, r3732; +} +{ +fma.rn.f16x2 r3746, r2754, r3730, r3743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3752, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3754, {low, high}; +} +{ +mul.f16x2 r3755, r3752, r3754; +} +{ +mul.f16x2 r3758, r3726, r3750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3726; +mov.b32 r3761, {high, low}; +} +{ +fma.rn.f16x2 r3763, r3755, r3761, r3758; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3767, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3769, {high, high}; +} +{ +mul.f16x2 r3771, r3074, r3769; +} +{ +neg.f16x2 r3774, r3771; +} +{ +fma.rn.f16x2 r3776, r2930, r3767, r3774; +} +{ +mul.f16x2 r3780, r2930, r3769; +} +{ +fma.rn.f16x2 r3783, r3074, r3767, r3780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3789, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3791, {low, high}; +} +{ +mul.f16x2 r3792, r3789, r3791; +} +{ +mul.f16x2 r3795, r3763, r3787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3763; +mov.b32 r3798, {high, low}; +} +{ +fma.rn.f16x2 r3800, r3792, r3798, r3795; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3804, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3806, {high, high}; +} +{ +mul.f16x2 r3808, r3394, r3806; +} +{ +neg.f16x2 r3811, r3808; +} +{ +fma.rn.f16x2 r3813, r3250, r3804, r3811; +} +{ +mul.f16x2 r3817, r3250, r3806; +} +{ +fma.rn.f16x2 r3820, r3394, r3804, r3817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3505; +mov.b32 r3826, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3828, {low, high}; +} +{ +mul.f16x2 r3829, r3826, r3828; +} +{ +mul.f16x2 r3832, r3800, r3824; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3800; +mov.b32 r3835, {high, low}; +} +{ +fma.rn.f16x2 r3837, r3829, r3835, r3832; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3841, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3843, {high, high}; +} +{ +mul.f16x2 r3845, r2186, r3843; +} +{ +neg.f16x2 r3848, r3845; +} +{ +fma.rn.f16x2 r3850, r2042, r3841, r3848; +} +{ +mul.f16x2 r3854, r2042, r3843; +} +{ +fma.rn.f16x2 r3857, r2186, r3841, r3854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3863, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3865, {low, high}; +} +{ +mul.f16x2 r3866, r3863, r3865; +} +{ +mul.f16x2 r3869, r3837, r3861; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3837; +mov.b32 r3872, {high, low}; +} +{ +fma.rn.f16x2 r3874, r3866, r3872, r3869; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3878, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3874; +mov.b32 r3880, {high, high}; +} +{ +mul.f16x2 r3882, r2506, r3880; +} +{ +neg.f16x2 r3885, r3882; +} +{ +fma.rn.f16x2 r3887, r2362, r3878, r3885; +} +{ +mul.f16x2 r3891, r2362, r3880; +} +{ +fma.rn.f16x2 r3894, r2506, r3878, r3891; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3900, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3902, {low, high}; +} +{ +mul.f16x2 r3903, r3900, r3902; +} +{ +mul.f16x2 r3906, r3874, r3898; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3874; +mov.b32 r3909, {high, low}; +} +{ +fma.rn.f16x2 r3911, r3903, r3909, r3906; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3915, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3917, {high, high}; +} +{ +mul.f16x2 r3919, r2826, r3917; +} +{ +neg.f16x2 r3922, r3919; +} +{ +fma.rn.f16x2 r3924, r2682, r3915, r3922; +} +{ +mul.f16x2 r3928, r2682, r3917; +} +{ +fma.rn.f16x2 r3931, r2826, r3915, r3928; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3937, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3939, {low, high}; +} +{ +mul.f16x2 r3940, r3937, r3939; +} +{ +mul.f16x2 r3943, r3911, r3935; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3911; +mov.b32 r3946, {high, low}; +} +{ +fma.rn.f16x2 r3948, r3940, r3946, r3943; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3952, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3954, {high, high}; +} +{ +mul.f16x2 r3956, r3146, r3954; +} +{ +neg.f16x2 r3959, r3956; +} +{ +fma.rn.f16x2 r3961, r3002, r3952, r3959; +} +{ +mul.f16x2 r3965, r3002, r3954; +} +{ +fma.rn.f16x2 r3968, r3146, r3952, r3965; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r3974, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3976, {low, high}; +} +{ +mul.f16x2 r3977, r3974, r3976; +} +{ +mul.f16x2 r3980, r3948, r3972; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3948; +mov.b32 r3983, {high, low}; +} +{ +fma.rn.f16x2 r3985, r3977, r3983, r3980; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r3989, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 
r3991, {high, high}; +} +{ +mul.f16x2 r3993, r3466, r3991; +} +{ +neg.f16x2 r3996, r3993; +} +{ +fma.rn.f16x2 r3998, r3322, r3989, r3996; +} +{ +mul.f16x2 r4002, r3322, r3991; +} +{ +fma.rn.f16x2 r4005, r3466, r3989, r4002; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4011, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4013, {low, high}; +} +{ +mul.f16x2 r4014, r4011, r4013; +} +{ +mul.f16x2 r4017, r3985, r4009; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3985; +mov.b32 r4020, {high, low}; +} +{ +fma.rn.f16x2 r4022, r4014, r4020, r4017; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4026, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4028, {high, high}; +} +{ +mul.f16x2 r4030, r2222, r4028; +} +{ +neg.f16x2 r4033, r4030; +} +{ +fma.rn.f16x2 r4035, r2078, r4026, r4033; +} +{ +mul.f16x2 r4039, r2078, r4028; +} +{ +fma.rn.f16x2 r4042, r2222, r4026, r4039; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4048, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4050, {low, high}; +} +{ +mul.f16x2 r4051, r4048, r4050; +} +{ +mul.f16x2 r4054, r4022, r4046; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4022; +mov.b32 r4057, {high, low}; +} +{ +fma.rn.f16x2 r4059, r4051, r4057, r4054; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4063, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4065, {high, high}; +} +{ +mul.f16x2 r4067, r2542, r4065; +} +{ +neg.f16x2 r4070, r4067; +} +{ +fma.rn.f16x2 r4072, r2398, r4063, r4070; +} +{ +mul.f16x2 r4076, r2398, r4065; +} +{ +fma.rn.f16x2 r4079, r2542, r4063, r4076; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3505; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4085, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4087, {low, high}; +} +{ +mul.f16x2 r4088, r4085, r4087; +} +{ +mul.f16x2 r4091, r4059, r4083; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4059; +mov.b32 r4094, {high, low}; +} +{ +fma.rn.f16x2 r4096, r4088, r4094, r4091; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4100, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4102, {high, high}; +} +{ +mul.f16x2 r4104, r2862, r4102; +} +{ +neg.f16x2 r4107, r4104; +} +{ +fma.rn.f16x2 r4109, r2718, r4100, r4107; +} +{ +mul.f16x2 r4113, r2718, r4102; +} +{ +fma.rn.f16x2 r4116, r2862, r4100, r4113; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4122, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4125, r4122, r4124; +} +{ +mul.f16x2 r4128, r4096, r4120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4096; +mov.b32 r4131, {high, low}; +} +{ +fma.rn.f16x2 r4133, r4125, r4131, r4128; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4137, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4139, {high, high}; +} +{ +mul.f16x2 r4141, r3182, r4139; +} +{ +neg.f16x2 r4144, r4141; +} +{ +fma.rn.f16x2 r4146, r3038, r4137, r4144; +} +{ +mul.f16x2 r4150, r3038, r4139; +} +{ +fma.rn.f16x2 r4153, r3182, r4137, r4150; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4159, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4161, {low, high}; +} +{ 
+mul.f16x2 r4162, r4159, r4161; +} +{ +mul.f16x2 r4165, r4133, r4157; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4133; +mov.b32 r4168, {high, low}; +} +{ +fma.rn.f16x2 r4170, r4162, r4168, r4165; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4174, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4176, {high, high}; +} +{ +mul.f16x2 r4178, r3502, r4176; +} +{ +neg.f16x2 r4181, r4178; +} +{ +fma.rn.f16x2 r4183, r3358, r4174, r4181; +} +{ +mul.f16x2 r4187, r3358, r4176; +} +{ +fma.rn.f16x2 r4190, r3502, r4174, r4187; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4196, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4198, {low, high}; +} +{ +mul.f16x2 r4199, r4196, r4198; +} +{ +mul.f16x2 r4202, r4170, r4194; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4170; +mov.b32 r4205, {high, low}; +} +{ +fma.rn.f16x2 r4207, r4199, r4205, r4202; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4211, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4213, {high, high}; +} +{ +mul.f16x2 r4215, r2150, r4213; +} +{ +neg.f16x2 r4218, r4215; +} +{ +fma.rn.f16x2 r4220, r2006, r4211, r4218; +} +{ +mul.f16x2 r4224, r2006, r4213; +} +{ +fma.rn.f16x2 r4227, r2150, r4211, r4224; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4233, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4235, {low, high}; +} +{ +mul.f16x2 r4236, r4233, r4235; +} +{ +mul.f16x2 r4239, r4207, r4231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4207; +mov.b32 r4242, {high, low}; +} +{ +fma.rn.f16x2 r4244, r4236, r4242, r4239; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; 
+mov.b32 r4248, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4250, {high, high}; +} +{ +mul.f16x2 r4252, r2470, r4250; +} +{ +neg.f16x2 r4255, r4252; +} +{ +fma.rn.f16x2 r4257, r2326, r4248, r4255; +} +{ +mul.f16x2 r4261, r2326, r4250; +} +{ +fma.rn.f16x2 r4264, r2470, r4248, r4261; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4270, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4272, {low, high}; +} +{ +mul.f16x2 r4273, r4270, r4272; +} +{ +mul.f16x2 r4276, r4244, r4268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4244; +mov.b32 r4279, {high, low}; +} +{ +fma.rn.f16x2 r4281, r4273, r4279, r4276; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4285, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4287, {high, high}; +} +{ +mul.f16x2 r4289, r2790, r4287; +} +{ +neg.f16x2 r4292, r4289; +} +{ +fma.rn.f16x2 r4294, r2646, r4285, r4292; +} +{ +mul.f16x2 r4298, r2646, r4287; +} +{ +fma.rn.f16x2 r4301, r2790, r4285, r4298; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4307, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4309, {low, high}; +} +{ +mul.f16x2 r4310, r4307, r4309; +} +{ +mul.f16x2 r4313, r4281, r4305; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4281; +mov.b32 r4316, {high, low}; +} +{ +fma.rn.f16x2 r4318, r4310, r4316, r4313; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4322, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4324, {high, high}; +} +{ +mul.f16x2 r4326, r3110, r4324; +} +{ +neg.f16x2 r4329, r4326; +} +{ +fma.rn.f16x2 r4331, r2966, r4322, r4329; +} +{ +mul.f16x2 r4335, 
r2966, r4324; +} +{ +fma.rn.f16x2 r4338, r3110, r4322, r4335; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3505; +mov.b32 r4344, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4346, {low, high}; +} +{ +mul.f16x2 r4347, r4344, r4346; +} +{ +mul.f16x2 r4350, r4318, r4342; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4318; +mov.b32 r4353, {high, low}; +} +{ +fma.rn.f16x2 r4355, r4347, r4353, r4350; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4359, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4355; +mov.b32 r4361, {high, high}; +} +{ +mul.f16x2 r4363, r3430, r4361; +} +{ +neg.f16x2 r4366, r4363; +} +{ +fma.rn.f16x2 r4368, r3286, r4359, r4366; +} +{ +mul.f16x2 r4372, r3286, r4361; +} +{ +fma.rn.f16x2 r4375, r3430, r4359, r4372; +} +barrier.sync 0; +mad.lo.s32 r7908, r7906, 100, r7907; +st.shared.u32 [r7908], r1922; +st.shared.u32 [r7908+4], r3517; +st.shared.u32 [r7908+8], r3554; +st.shared.u32 [r7908+12], r3591; +st.shared.u32 [r7908+16], r3628; +st.shared.u32 [r7908+20], r3665; +st.shared.u32 [r7908+24], r3702; +st.shared.u32 [r7908+28], r3739; +st.shared.u32 [r7908+32], r3776; +st.shared.u32 [r7908+36], r3813; +st.shared.u32 [r7908+40], r3850; +st.shared.u32 [r7908+44], r3887; +st.shared.u32 [r7908+48], r3924; +st.shared.u32 [r7908+52], r3961; +st.shared.u32 [r7908+56], r3998; +st.shared.u32 [r7908+60], r4035; +st.shared.u32 [r7908+64], r4072; +st.shared.u32 [r7908+68], r4109; +st.shared.u32 [r7908+72], r4146; +st.shared.u32 [r7908+76], r4183; +st.shared.u32 [r7908+80], r4220; +st.shared.u32 [r7908+84], r4257; +st.shared.u32 [r7908+88], r4294; +st.shared.u32 [r7908+92], r4331; +st.shared.u32 [r7908+96], r4368; +barrier.sync 0; +mad.lo.s32 r7909, r7906, -96, r7908; +ld.shared.u32 r4408, [r7909]; +ld.shared.u32 r4728, [r7909+100]; +ld.shared.u32 r5048, [r7909+200]; 
+ld.shared.u32 r5368, [r7909+300]; +ld.shared.u32 r5688, [r7909+400]; +ld.shared.u32 r4405, [r7909+500]; +ld.shared.u32 r4725, [r7909+600]; +ld.shared.u32 r5045, [r7909+700]; +ld.shared.u32 r5365, [r7909+800]; +ld.shared.u32 r5685, [r7909+900]; +ld.shared.u32 r4411, [r7909+1000]; +ld.shared.u32 r4731, [r7909+1100]; +ld.shared.u32 r5051, [r7909+1200]; +ld.shared.u32 r5371, [r7909+1300]; +ld.shared.u32 r5691, [r7909+1400]; +ld.shared.u32 r4412, [r7909+1500]; +ld.shared.u32 r4732, [r7909+1600]; +ld.shared.u32 r5052, [r7909+1700]; +ld.shared.u32 r5372, [r7909+1800]; +ld.shared.u32 r5692, [r7909+1900]; +ld.shared.u32 r4406, [r7909+2000]; +ld.shared.u32 r4726, [r7909+2100]; +ld.shared.u32 r5046, [r7909+2200]; +ld.shared.u32 r5366, [r7909+2300]; +ld.shared.u32 r5686, [r7909+2400]; +barrier.sync 0; +st.shared.u32 [r7908], r1934; +st.shared.u32 [r7908+4], r3524; +st.shared.u32 [r7908+8], r3561; +st.shared.u32 [r7908+12], r3598; +st.shared.u32 [r7908+16], r3635; +st.shared.u32 [r7908+20], r3672; +st.shared.u32 [r7908+24], r3709; +st.shared.u32 [r7908+28], r3746; +st.shared.u32 [r7908+32], r3783; +st.shared.u32 [r7908+36], r3820; +st.shared.u32 [r7908+40], r3857; +st.shared.u32 [r7908+44], r3894; +st.shared.u32 [r7908+48], r3931; +st.shared.u32 [r7908+52], r3968; +st.shared.u32 [r7908+56], r4005; +st.shared.u32 [r7908+60], r4042; +st.shared.u32 [r7908+64], r4079; +st.shared.u32 [r7908+68], r4116; +st.shared.u32 [r7908+72], r4153; +st.shared.u32 [r7908+76], r4190; +st.shared.u32 [r7908+80], r4227; +st.shared.u32 [r7908+84], r4264; +st.shared.u32 [r7908+88], r4301; +st.shared.u32 [r7908+92], r4338; +st.shared.u32 [r7908+96], r4375; +barrier.sync 0; +ld.shared.u32 r4420, [r7909]; +ld.shared.u32 r4740, [r7909+100]; +ld.shared.u32 r5060, [r7909+200]; +ld.shared.u32 r5380, [r7909+300]; +ld.shared.u32 r5700, [r7909+400]; +ld.shared.u32 r4417, [r7909+500]; +ld.shared.u32 r4737, [r7909+600]; +ld.shared.u32 r5057, [r7909+700]; +ld.shared.u32 r5377, [r7909+800]; +ld.shared.u32 r5697, 
[r7909+900]; +ld.shared.u32 r4423, [r7909+1000]; +ld.shared.u32 r4743, [r7909+1100]; +ld.shared.u32 r5063, [r7909+1200]; +ld.shared.u32 r5383, [r7909+1300]; +ld.shared.u32 r5703, [r7909+1400]; +ld.shared.u32 r4424, [r7909+1500]; +ld.shared.u32 r4744, [r7909+1600]; +ld.shared.u32 r5064, [r7909+1700]; +ld.shared.u32 r5384, [r7909+1800]; +ld.shared.u32 r5704, [r7909+1900]; +ld.shared.u32 r4418, [r7909+2000]; +ld.shared.u32 r4738, [r7909+2100]; +ld.shared.u32 r5058, [r7909+2200]; +ld.shared.u32 r5378, [r7909+2300]; +ld.shared.u32 r5698, [r7909+2400]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4399, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4400, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4401, {low, high}; +} +{ +neg.f16x2 r4402, r4401; +} +{ +add.f16x2 r4404, r4405, r4406; +} +{ +add.f16x2 r4407, r4408, r4404; +} +{ +add.f16x2 r4410, r4411, r4412; +} +{ +add.f16x2 r4413, r4407, r4410; +} +{ +add.f16x2 r4416, r4417, r4418; +} +{ +add.f16x2 r4419, r4420, r4416; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 r4425, r4419, r4422; +} +{ +add.f16x2 r4428, r4405, r4406; +} +{ +mul.f16x2 r4431, r4428, r4396; +} +{ +add.f16x2 r4434, r4408, r4431; +} +{ +add.f16x2 r4437, r4411, r4412; +} +{ +mul.f16x2 r4440, r4437, r4398; +} +{ +add.f16x2 r4443, r4434, r4440; +} +{ +sub.f16x2 r4446, r4417, r4418; +} +{ +mul.f16x2 r4449, r4446, r4397; +} +{ +sub.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4399; +} +{ +add.f16x2 r4458, r4449, r4455; +} +{ +sub.f16x2 r4461, 
r4443, r4458; +} +{ +add.f16x2 r4464, r4405, r4406; +} +{ +mul.f16x2 r4467, r4464, r4396; +} +{ +add.f16x2 r4470, r4408, r4467; +} +{ +add.f16x2 r4473, r4411, r4412; +} +{ +mul.f16x2 r4476, r4473, r4398; +} +{ +add.f16x2 r4479, r4470, r4476; +} +{ +sub.f16x2 r4482, r4417, r4418; +} +{ +mul.f16x2 r4485, r4482, r4397; +} +{ +sub.f16x2 r4488, r4423, r4424; +} +{ +mul.f16x2 r4491, r4488, r4399; +} +{ +add.f16x2 r4494, r4485, r4491; +} +{ +add.f16x2 r4497, r4479, r4494; +} +{ +add.f16x2 r4500, r4405, r4406; +} +{ +mul.f16x2 r4503, r4500, r4398; +} +{ +add.f16x2 r4506, r4408, r4503; +} +{ +add.f16x2 r4509, r4411, r4412; +} +{ +mul.f16x2 r4512, r4509, r4400; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +sub.f16x2 r4518, r4417, r4418; +} +{ +mul.f16x2 r4521, r4518, r4399; +} +{ +sub.f16x2 r4524, r4423, r4424; +} +{ +mul.f16x2 r4527, r4524, r4402; +} +{ +add.f16x2 r4530, r4521, r4527; +} +{ +sub.f16x2 r4533, r4515, r4530; +} +{ +add.f16x2 r4536, r4405, r4406; +} +{ +mul.f16x2 r4539, r4536, r4398; +} +{ +add.f16x2 r4542, r4408, r4539; +} +{ +add.f16x2 r4545, r4411, r4412; +} +{ +mul.f16x2 r4548, r4545, r4400; +} +{ +add.f16x2 r4551, r4542, r4548; +} +{ +sub.f16x2 r4554, r4417, r4418; +} +{ +mul.f16x2 r4557, r4554, r4399; +} +{ +sub.f16x2 r4560, r4423, r4424; +} +{ +mul.f16x2 r4563, r4560, r4402; +} +{ +add.f16x2 r4566, r4557, r4563; +} +{ +add.f16x2 r4569, r4551, r4566; +} +{ +add.f16x2 r4572, r4417, r4418; +} +{ +mul.f16x2 r4575, r4572, r4396; +} +{ +add.f16x2 r4578, r4420, r4575; +} +{ +add.f16x2 r4581, r4423, r4424; +} +{ +mul.f16x2 r4584, r4581, r4398; +} +{ +add.f16x2 r4587, r4578, r4584; +} +{ +sub.f16x2 r4590, r4405, r4406; +} +{ +mul.f16x2 r4593, r4590, r4397; +} +{ +sub.f16x2 r4596, r4411, r4412; +} +{ +mul.f16x2 r4599, r4596, r4399; +} +{ +add.f16x2 r4602, r4593, r4599; +} +{ +add.f16x2 r4605, r4587, r4602; +} +{ +add.f16x2 r4608, r4417, r4418; +} +{ +mul.f16x2 r4611, r4608, r4396; +} +{ +add.f16x2 r4614, r4420, r4611; +} +{ +add.f16x2 r4617, r4423, r4424; +} +{ 
+mul.f16x2 r4620, r4617, r4398; +} +{ +add.f16x2 r4623, r4614, r4620; +} +{ +sub.f16x2 r4626, r4405, r4406; +} +{ +mul.f16x2 r4629, r4626, r4397; +} +{ +sub.f16x2 r4632, r4411, r4412; +} +{ +mul.f16x2 r4635, r4632, r4399; +} +{ +add.f16x2 r4638, r4629, r4635; +} +{ +sub.f16x2 r4641, r4623, r4638; +} +{ +add.f16x2 r4644, r4417, r4418; +} +{ +mul.f16x2 r4647, r4644, r4398; +} +{ +add.f16x2 r4650, r4420, r4647; +} +{ +add.f16x2 r4653, r4423, r4424; +} +{ +mul.f16x2 r4656, r4653, r4400; +} +{ +add.f16x2 r4659, r4650, r4656; +} +{ +sub.f16x2 r4662, r4405, r4406; +} +{ +mul.f16x2 r4665, r4662, r4399; +} +{ +sub.f16x2 r4668, r4411, r4412; +} +{ +mul.f16x2 r4671, r4668, r4402; +} +{ +add.f16x2 r4674, r4665, r4671; +} +{ +add.f16x2 r4677, r4659, r4674; +} +{ +add.f16x2 r4680, r4417, r4418; +} +{ +mul.f16x2 r4683, r4680, r4398; +} +{ +add.f16x2 r4686, r4420, r4683; +} +{ +add.f16x2 r4689, r4423, r4424; +} +{ +mul.f16x2 r4692, r4689, r4400; +} +{ +add.f16x2 r4695, r4686, r4692; +} +{ +sub.f16x2 r4698, r4405, r4406; +} +{ +mul.f16x2 r4701, r4698, r4399; +} +{ +sub.f16x2 r4704, r4411, r4412; +} +{ +mul.f16x2 r4707, r4704, r4402; +} +{ +add.f16x2 r4710, r4701, r4707; +} +{ +sub.f16x2 r4713, r4695, r4710; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4721, {low, high}; +} +{ +neg.f16x2 r4722, r4721; +} +{ +add.f16x2 r4724, r4725, r4726; +} +{ +add.f16x2 r4727, r4728, r4724; +} +{ 
+add.f16x2 r4730, r4731, r4732; +} +{ +add.f16x2 r4733, r4727, r4730; +} +{ +add.f16x2 r4736, r4737, r4738; +} +{ +add.f16x2 r4739, r4740, r4736; +} +{ +add.f16x2 r4742, r4743, r4744; +} +{ +add.f16x2 r4745, r4739, r4742; +} +{ +add.f16x2 r4748, r4725, r4726; +} +{ +mul.f16x2 r4751, r4748, r4716; +} +{ +add.f16x2 r4754, r4728, r4751; +} +{ +add.f16x2 r4757, r4731, r4732; +} +{ +mul.f16x2 r4760, r4757, r4718; +} +{ +add.f16x2 r4763, r4754, r4760; +} +{ +sub.f16x2 r4766, r4737, r4738; +} +{ +mul.f16x2 r4769, r4766, r4717; +} +{ +sub.f16x2 r4772, r4743, r4744; +} +{ +mul.f16x2 r4775, r4772, r4719; +} +{ +add.f16x2 r4778, r4769, r4775; +} +{ +sub.f16x2 r4781, r4763, r4778; +} +{ +add.f16x2 r4784, r4725, r4726; +} +{ +mul.f16x2 r4787, r4784, r4716; +} +{ +add.f16x2 r4790, r4728, r4787; +} +{ +add.f16x2 r4793, r4731, r4732; +} +{ +mul.f16x2 r4796, r4793, r4718; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +sub.f16x2 r4802, r4737, r4738; +} +{ +mul.f16x2 r4805, r4802, r4717; +} +{ +sub.f16x2 r4808, r4743, r4744; +} +{ +mul.f16x2 r4811, r4808, r4719; +} +{ +add.f16x2 r4814, r4805, r4811; +} +{ +add.f16x2 r4817, r4799, r4814; +} +{ +add.f16x2 r4820, r4725, r4726; +} +{ +mul.f16x2 r4823, r4820, r4718; +} +{ +add.f16x2 r4826, r4728, r4823; +} +{ +add.f16x2 r4829, r4731, r4732; +} +{ +mul.f16x2 r4832, r4829, r4720; +} +{ +add.f16x2 r4835, r4826, r4832; +} +{ +sub.f16x2 r4838, r4737, r4738; +} +{ +mul.f16x2 r4841, r4838, r4719; +} +{ +sub.f16x2 r4844, r4743, r4744; +} +{ +mul.f16x2 r4847, r4844, r4722; +} +{ +add.f16x2 r4850, r4841, r4847; +} +{ +sub.f16x2 r4853, r4835, r4850; +} +{ +add.f16x2 r4856, r4725, r4726; +} +{ +mul.f16x2 r4859, r4856, r4718; +} +{ +add.f16x2 r4862, r4728, r4859; +} +{ +add.f16x2 r4865, r4731, r4732; +} +{ +mul.f16x2 r4868, r4865, r4720; +} +{ +add.f16x2 r4871, r4862, r4868; +} +{ +sub.f16x2 r4874, r4737, r4738; +} +{ +mul.f16x2 r4877, r4874, r4719; +} +{ +sub.f16x2 r4880, r4743, r4744; +} +{ +mul.f16x2 r4883, r4880, r4722; +} +{ +add.f16x2 r4886, 
r4877, r4883; +} +{ +add.f16x2 r4889, r4871, r4886; +} +{ +add.f16x2 r4892, r4737, r4738; +} +{ +mul.f16x2 r4895, r4892, r4716; +} +{ +add.f16x2 r4898, r4740, r4895; +} +{ +add.f16x2 r4901, r4743, r4744; +} +{ +mul.f16x2 r4904, r4901, r4718; +} +{ +add.f16x2 r4907, r4898, r4904; +} +{ +sub.f16x2 r4910, r4725, r4726; +} +{ +mul.f16x2 r4913, r4910, r4717; +} +{ +sub.f16x2 r4916, r4731, r4732; +} +{ +mul.f16x2 r4919, r4916, r4719; +} +{ +add.f16x2 r4922, r4913, r4919; +} +{ +add.f16x2 r4925, r4907, r4922; +} +{ +add.f16x2 r4928, r4737, r4738; +} +{ +mul.f16x2 r4931, r4928, r4716; +} +{ +add.f16x2 r4934, r4740, r4931; +} +{ +add.f16x2 r4937, r4743, r4744; +} +{ +mul.f16x2 r4940, r4937, r4718; +} +{ +add.f16x2 r4943, r4934, r4940; +} +{ +sub.f16x2 r4946, r4725, r4726; +} +{ +mul.f16x2 r4949, r4946, r4717; +} +{ +sub.f16x2 r4952, r4731, r4732; +} +{ +mul.f16x2 r4955, r4952, r4719; +} +{ +add.f16x2 r4958, r4949, r4955; +} +{ +sub.f16x2 r4961, r4943, r4958; +} +{ +add.f16x2 r4964, r4737, r4738; +} +{ +mul.f16x2 r4967, r4964, r4718; +} +{ +add.f16x2 r4970, r4740, r4967; +} +{ +add.f16x2 r4973, r4743, r4744; +} +{ +mul.f16x2 r4976, r4973, r4720; +} +{ +add.f16x2 r4979, r4970, r4976; +} +{ +sub.f16x2 r4982, r4725, r4726; +} +{ +mul.f16x2 r4985, r4982, r4719; +} +{ +sub.f16x2 r4988, r4731, r4732; +} +{ +mul.f16x2 r4991, r4988, r4722; +} +{ +add.f16x2 r4994, r4985, r4991; +} +{ +add.f16x2 r4997, r4979, r4994; +} +{ +add.f16x2 r5000, r4737, r4738; +} +{ +mul.f16x2 r5003, r5000, r4718; +} +{ +add.f16x2 r5006, r4740, r5003; +} +{ +add.f16x2 r5009, r4743, r4744; +} +{ +mul.f16x2 r5012, r5009, r4720; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +sub.f16x2 r5018, r4725, r4726; +} +{ +mul.f16x2 r5021, r5018, r4719; +} +{ +sub.f16x2 r5024, r4731, r4732; +} +{ +mul.f16x2 r5027, r5024, r4722; +} +{ +add.f16x2 r5030, r5021, r5027; +} +{ +sub.f16x2 r5033, r5015, r5030; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5036, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5038, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5039, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5041, {low, high}; +} +{ +neg.f16x2 r5042, r5041; +} +{ +add.f16x2 r5044, r5045, r5046; +} +{ +add.f16x2 r5047, r5048, r5044; +} +{ +add.f16x2 r5050, r5051, r5052; +} +{ +add.f16x2 r5053, r5047, r5050; +} +{ +add.f16x2 r5056, r5057, r5058; +} +{ +add.f16x2 r5059, r5060, r5056; +} +{ +add.f16x2 r5062, r5063, r5064; +} +{ +add.f16x2 r5065, r5059, r5062; +} +{ +add.f16x2 r5068, r5045, r5046; +} +{ +mul.f16x2 r5071, r5068, r5036; +} +{ +add.f16x2 r5074, r5048, r5071; +} +{ +add.f16x2 r5077, r5051, r5052; +} +{ +mul.f16x2 r5080, r5077, r5038; +} +{ +add.f16x2 r5083, r5074, r5080; +} +{ +sub.f16x2 r5086, r5057, r5058; +} +{ +mul.f16x2 r5089, r5086, r5037; +} +{ +sub.f16x2 r5092, r5063, r5064; +} +{ +mul.f16x2 r5095, r5092, r5039; +} +{ +add.f16x2 r5098, r5089, r5095; +} +{ +sub.f16x2 r5101, r5083, r5098; +} +{ +add.f16x2 r5104, r5045, r5046; +} +{ +mul.f16x2 r5107, r5104, r5036; +} +{ +add.f16x2 r5110, r5048, r5107; +} +{ +add.f16x2 r5113, r5051, r5052; +} +{ +mul.f16x2 r5116, r5113, r5038; +} +{ +add.f16x2 r5119, r5110, r5116; +} +{ +sub.f16x2 r5122, r5057, r5058; +} +{ +mul.f16x2 r5125, r5122, r5037; +} +{ +sub.f16x2 r5128, r5063, r5064; +} +{ +mul.f16x2 r5131, r5128, r5039; +} +{ +add.f16x2 r5134, r5125, r5131; +} +{ +add.f16x2 r5137, r5119, r5134; +} +{ +add.f16x2 r5140, r5045, r5046; +} +{ +mul.f16x2 r5143, r5140, r5038; +} +{ +add.f16x2 r5146, r5048, r5143; +} +{ +add.f16x2 r5149, r5051, r5052; +} +{ +mul.f16x2 r5152, r5149, r5040; +} +{ 
+add.f16x2 r5155, r5146, r5152; +} +{ +sub.f16x2 r5158, r5057, r5058; +} +{ +mul.f16x2 r5161, r5158, r5039; +} +{ +sub.f16x2 r5164, r5063, r5064; +} +{ +mul.f16x2 r5167, r5164, r5042; +} +{ +add.f16x2 r5170, r5161, r5167; +} +{ +sub.f16x2 r5173, r5155, r5170; +} +{ +add.f16x2 r5176, r5045, r5046; +} +{ +mul.f16x2 r5179, r5176, r5038; +} +{ +add.f16x2 r5182, r5048, r5179; +} +{ +add.f16x2 r5185, r5051, r5052; +} +{ +mul.f16x2 r5188, r5185, r5040; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +sub.f16x2 r5194, r5057, r5058; +} +{ +mul.f16x2 r5197, r5194, r5039; +} +{ +sub.f16x2 r5200, r5063, r5064; +} +{ +mul.f16x2 r5203, r5200, r5042; +} +{ +add.f16x2 r5206, r5197, r5203; +} +{ +add.f16x2 r5209, r5191, r5206; +} +{ +add.f16x2 r5212, r5057, r5058; +} +{ +mul.f16x2 r5215, r5212, r5036; +} +{ +add.f16x2 r5218, r5060, r5215; +} +{ +add.f16x2 r5221, r5063, r5064; +} +{ +mul.f16x2 r5224, r5221, r5038; +} +{ +add.f16x2 r5227, r5218, r5224; +} +{ +sub.f16x2 r5230, r5045, r5046; +} +{ +mul.f16x2 r5233, r5230, r5037; +} +{ +sub.f16x2 r5236, r5051, r5052; +} +{ +mul.f16x2 r5239, r5236, r5039; +} +{ +add.f16x2 r5242, r5233, r5239; +} +{ +add.f16x2 r5245, r5227, r5242; +} +{ +add.f16x2 r5248, r5057, r5058; +} +{ +mul.f16x2 r5251, r5248, r5036; +} +{ +add.f16x2 r5254, r5060, r5251; +} +{ +add.f16x2 r5257, r5063, r5064; +} +{ +mul.f16x2 r5260, r5257, r5038; +} +{ +add.f16x2 r5263, r5254, r5260; +} +{ +sub.f16x2 r5266, r5045, r5046; +} +{ +mul.f16x2 r5269, r5266, r5037; +} +{ +sub.f16x2 r5272, r5051, r5052; +} +{ +mul.f16x2 r5275, r5272, r5039; +} +{ +add.f16x2 r5278, r5269, r5275; +} +{ +sub.f16x2 r5281, r5263, r5278; +} +{ +add.f16x2 r5284, r5057, r5058; +} +{ +mul.f16x2 r5287, r5284, r5038; +} +{ +add.f16x2 r5290, r5060, r5287; +} +{ +add.f16x2 r5293, r5063, r5064; +} +{ +mul.f16x2 r5296, r5293, r5040; +} +{ +add.f16x2 r5299, r5290, r5296; +} +{ +sub.f16x2 r5302, r5045, r5046; +} +{ +mul.f16x2 r5305, r5302, r5039; +} +{ +sub.f16x2 r5308, r5051, r5052; +} +{ +mul.f16x2 r5311, 
r5308, r5042; +} +{ +add.f16x2 r5314, r5305, r5311; +} +{ +add.f16x2 r5317, r5299, r5314; +} +{ +add.f16x2 r5320, r5057, r5058; +} +{ +mul.f16x2 r5323, r5320, r5038; +} +{ +add.f16x2 r5326, r5060, r5323; +} +{ +add.f16x2 r5329, r5063, r5064; +} +{ +mul.f16x2 r5332, r5329, r5040; +} +{ +add.f16x2 r5335, r5326, r5332; +} +{ +sub.f16x2 r5338, r5045, r5046; +} +{ +mul.f16x2 r5341, r5338, r5039; +} +{ +sub.f16x2 r5344, r5051, r5052; +} +{ +mul.f16x2 r5347, r5344, r5042; +} +{ +add.f16x2 r5350, r5341, r5347; +} +{ +sub.f16x2 r5353, r5335, r5350; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5356, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5357, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5358, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5359, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5360, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5361, {low, high}; +} +{ +neg.f16x2 r5362, r5361; +} +{ +add.f16x2 r5364, r5365, r5366; +} +{ +add.f16x2 r5367, r5368, r5364; +} +{ +add.f16x2 r5370, r5371, r5372; +} +{ +add.f16x2 r5373, r5367, r5370; +} +{ +add.f16x2 r5376, r5377, r5378; +} +{ +add.f16x2 r5379, r5380, r5376; +} +{ +add.f16x2 r5382, r5383, r5384; +} +{ +add.f16x2 r5385, r5379, r5382; +} +{ +add.f16x2 r5388, r5365, r5366; +} +{ +mul.f16x2 r5391, r5388, r5356; +} +{ +add.f16x2 r5394, r5368, r5391; +} +{ +add.f16x2 r5397, r5371, r5372; +} +{ +mul.f16x2 r5400, r5397, r5358; +} +{ +add.f16x2 r5403, r5394, r5400; +} +{ +sub.f16x2 r5406, r5377, r5378; +} +{ +mul.f16x2 r5409, r5406, r5357; +} +{ +sub.f16x2 r5412, r5383, r5384; +} +{ +mul.f16x2 r5415, r5412, r5359; +} +{ +add.f16x2 r5418, r5409, r5415; +} +{ +sub.f16x2 r5421, 
r5403, r5418; +} +{ +add.f16x2 r5424, r5365, r5366; +} +{ +mul.f16x2 r5427, r5424, r5356; +} +{ +add.f16x2 r5430, r5368, r5427; +} +{ +add.f16x2 r5433, r5371, r5372; +} +{ +mul.f16x2 r5436, r5433, r5358; +} +{ +add.f16x2 r5439, r5430, r5436; +} +{ +sub.f16x2 r5442, r5377, r5378; +} +{ +mul.f16x2 r5445, r5442, r5357; +} +{ +sub.f16x2 r5448, r5383, r5384; +} +{ +mul.f16x2 r5451, r5448, r5359; +} +{ +add.f16x2 r5454, r5445, r5451; +} +{ +add.f16x2 r5457, r5439, r5454; +} +{ +add.f16x2 r5460, r5365, r5366; +} +{ +mul.f16x2 r5463, r5460, r5358; +} +{ +add.f16x2 r5466, r5368, r5463; +} +{ +add.f16x2 r5469, r5371, r5372; +} +{ +mul.f16x2 r5472, r5469, r5360; +} +{ +add.f16x2 r5475, r5466, r5472; +} +{ +sub.f16x2 r5478, r5377, r5378; +} +{ +mul.f16x2 r5481, r5478, r5359; +} +{ +sub.f16x2 r5484, r5383, r5384; +} +{ +mul.f16x2 r5487, r5484, r5362; +} +{ +add.f16x2 r5490, r5481, r5487; +} +{ +sub.f16x2 r5493, r5475, r5490; +} +{ +add.f16x2 r5496, r5365, r5366; +} +{ +mul.f16x2 r5499, r5496, r5358; +} +{ +add.f16x2 r5502, r5368, r5499; +} +{ +add.f16x2 r5505, r5371, r5372; +} +{ +mul.f16x2 r5508, r5505, r5360; +} +{ +add.f16x2 r5511, r5502, r5508; +} +{ +sub.f16x2 r5514, r5377, r5378; +} +{ +mul.f16x2 r5517, r5514, r5359; +} +{ +sub.f16x2 r5520, r5383, r5384; +} +{ +mul.f16x2 r5523, r5520, r5362; +} +{ +add.f16x2 r5526, r5517, r5523; +} +{ +add.f16x2 r5529, r5511, r5526; +} +{ +add.f16x2 r5532, r5377, r5378; +} +{ +mul.f16x2 r5535, r5532, r5356; +} +{ +add.f16x2 r5538, r5380, r5535; +} +{ +add.f16x2 r5541, r5383, r5384; +} +{ +mul.f16x2 r5544, r5541, r5358; +} +{ +add.f16x2 r5547, r5538, r5544; +} +{ +sub.f16x2 r5550, r5365, r5366; +} +{ +mul.f16x2 r5553, r5550, r5357; +} +{ +sub.f16x2 r5556, r5371, r5372; +} +{ +mul.f16x2 r5559, r5556, r5359; +} +{ +add.f16x2 r5562, r5553, r5559; +} +{ +add.f16x2 r5565, r5547, r5562; +} +{ +add.f16x2 r5568, r5377, r5378; +} +{ +mul.f16x2 r5571, r5568, r5356; +} +{ +add.f16x2 r5574, r5380, r5571; +} +{ +add.f16x2 r5577, r5383, r5384; +} +{ 
+mul.f16x2 r5580, r5577, r5358; +} +{ +add.f16x2 r5583, r5574, r5580; +} +{ +sub.f16x2 r5586, r5365, r5366; +} +{ +mul.f16x2 r5589, r5586, r5357; +} +{ +sub.f16x2 r5592, r5371, r5372; +} +{ +mul.f16x2 r5595, r5592, r5359; +} +{ +add.f16x2 r5598, r5589, r5595; +} +{ +sub.f16x2 r5601, r5583, r5598; +} +{ +add.f16x2 r5604, r5377, r5378; +} +{ +mul.f16x2 r5607, r5604, r5358; +} +{ +add.f16x2 r5610, r5380, r5607; +} +{ +add.f16x2 r5613, r5383, r5384; +} +{ +mul.f16x2 r5616, r5613, r5360; +} +{ +add.f16x2 r5619, r5610, r5616; +} +{ +sub.f16x2 r5622, r5365, r5366; +} +{ +mul.f16x2 r5625, r5622, r5359; +} +{ +sub.f16x2 r5628, r5371, r5372; +} +{ +mul.f16x2 r5631, r5628, r5362; +} +{ +add.f16x2 r5634, r5625, r5631; +} +{ +add.f16x2 r5637, r5619, r5634; +} +{ +add.f16x2 r5640, r5377, r5378; +} +{ +mul.f16x2 r5643, r5640, r5358; +} +{ +add.f16x2 r5646, r5380, r5643; +} +{ +add.f16x2 r5649, r5383, r5384; +} +{ +mul.f16x2 r5652, r5649, r5360; +} +{ +add.f16x2 r5655, r5646, r5652; +} +{ +sub.f16x2 r5658, r5365, r5366; +} +{ +mul.f16x2 r5661, r5658, r5359; +} +{ +sub.f16x2 r5664, r5371, r5372; +} +{ +mul.f16x2 r5667, r5664, r5362; +} +{ +add.f16x2 r5670, r5661, r5667; +} +{ +sub.f16x2 r5673, r5655, r5670; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5676, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5677, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5678, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5679, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5680, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5681, {low, high}; +} +{ +neg.f16x2 r5682, r5681; +} +{ +add.f16x2 r5684, r5685, r5686; +} +{ +add.f16x2 r5687, r5688, r5684; +} +{ 
+add.f16x2 r5690, r5691, r5692; +} +{ +add.f16x2 r5693, r5687, r5690; +} +{ +add.f16x2 r5696, r5697, r5698; +} +{ +add.f16x2 r5699, r5700, r5696; +} +{ +add.f16x2 r5702, r5703, r5704; +} +{ +add.f16x2 r5705, r5699, r5702; +} +{ +add.f16x2 r5708, r5685, r5686; +} +{ +mul.f16x2 r5711, r5708, r5676; +} +{ +add.f16x2 r5714, r5688, r5711; +} +{ +add.f16x2 r5717, r5691, r5692; +} +{ +mul.f16x2 r5720, r5717, r5678; +} +{ +add.f16x2 r5723, r5714, r5720; +} +{ +sub.f16x2 r5726, r5697, r5698; +} +{ +mul.f16x2 r5729, r5726, r5677; +} +{ +sub.f16x2 r5732, r5703, r5704; +} +{ +mul.f16x2 r5735, r5732, r5679; +} +{ +add.f16x2 r5738, r5729, r5735; +} +{ +sub.f16x2 r5741, r5723, r5738; +} +{ +add.f16x2 r5744, r5685, r5686; +} +{ +mul.f16x2 r5747, r5744, r5676; +} +{ +add.f16x2 r5750, r5688, r5747; +} +{ +add.f16x2 r5753, r5691, r5692; +} +{ +mul.f16x2 r5756, r5753, r5678; +} +{ +add.f16x2 r5759, r5750, r5756; +} +{ +sub.f16x2 r5762, r5697, r5698; +} +{ +mul.f16x2 r5765, r5762, r5677; +} +{ +sub.f16x2 r5768, r5703, r5704; +} +{ +mul.f16x2 r5771, r5768, r5679; +} +{ +add.f16x2 r5774, r5765, r5771; +} +{ +add.f16x2 r5777, r5759, r5774; +} +{ +add.f16x2 r5780, r5685, r5686; +} +{ +mul.f16x2 r5783, r5780, r5678; +} +{ +add.f16x2 r5786, r5688, r5783; +} +{ +add.f16x2 r5789, r5691, r5692; +} +{ +mul.f16x2 r5792, r5789, r5680; +} +{ +add.f16x2 r5795, r5786, r5792; +} +{ +sub.f16x2 r5798, r5697, r5698; +} +{ +mul.f16x2 r5801, r5798, r5679; +} +{ +sub.f16x2 r5804, r5703, r5704; +} +{ +mul.f16x2 r5807, r5804, r5682; +} +{ +add.f16x2 r5810, r5801, r5807; +} +{ +sub.f16x2 r5813, r5795, r5810; +} +{ +add.f16x2 r5816, r5685, r5686; +} +{ +mul.f16x2 r5819, r5816, r5678; +} +{ +add.f16x2 r5822, r5688, r5819; +} +{ +add.f16x2 r5825, r5691, r5692; +} +{ +mul.f16x2 r5828, r5825, r5680; +} +{ +add.f16x2 r5831, r5822, r5828; +} +{ +sub.f16x2 r5834, r5697, r5698; +} +{ +mul.f16x2 r5837, r5834, r5679; +} +{ +sub.f16x2 r5840, r5703, r5704; +} +{ +mul.f16x2 r5843, r5840, r5682; +} +{ +add.f16x2 r5846, 
r5837, r5843; +} +{ +add.f16x2 r5849, r5831, r5846; +} +{ +add.f16x2 r5852, r5697, r5698; +} +{ +mul.f16x2 r5855, r5852, r5676; +} +{ +add.f16x2 r5858, r5700, r5855; +} +{ +add.f16x2 r5861, r5703, r5704; +} +{ +mul.f16x2 r5864, r5861, r5678; +} +{ +add.f16x2 r5867, r5858, r5864; +} +{ +sub.f16x2 r5870, r5685, r5686; +} +{ +mul.f16x2 r5873, r5870, r5677; +} +{ +sub.f16x2 r5876, r5691, r5692; +} +{ +mul.f16x2 r5879, r5876, r5679; +} +{ +add.f16x2 r5882, r5873, r5879; +} +{ +add.f16x2 r5885, r5867, r5882; +} +{ +add.f16x2 r5888, r5697, r5698; +} +{ +mul.f16x2 r5891, r5888, r5676; +} +{ +add.f16x2 r5894, r5700, r5891; +} +{ +add.f16x2 r5897, r5703, r5704; +} +{ +mul.f16x2 r5900, r5897, r5678; +} +{ +add.f16x2 r5903, r5894, r5900; +} +{ +sub.f16x2 r5906, r5685, r5686; +} +{ +mul.f16x2 r5909, r5906, r5677; +} +{ +sub.f16x2 r5912, r5691, r5692; +} +{ +mul.f16x2 r5915, r5912, r5679; +} +{ +add.f16x2 r5918, r5909, r5915; +} +{ +sub.f16x2 r5921, r5903, r5918; +} +{ +add.f16x2 r5924, r5697, r5698; +} +{ +mul.f16x2 r5927, r5924, r5678; +} +{ +add.f16x2 r5930, r5700, r5927; +} +{ +add.f16x2 r5933, r5703, r5704; +} +{ +mul.f16x2 r5936, r5933, r5680; +} +{ +add.f16x2 r5939, r5930, r5936; +} +{ +sub.f16x2 r5942, r5685, r5686; +} +{ +mul.f16x2 r5945, r5942, r5679; +} +{ +sub.f16x2 r5948, r5691, r5692; +} +{ +mul.f16x2 r5951, r5948, r5682; +} +{ +add.f16x2 r5954, r5945, r5951; +} +{ +add.f16x2 r5957, r5939, r5954; +} +{ +add.f16x2 r5960, r5697, r5698; +} +{ +mul.f16x2 r5963, r5960, r5678; +} +{ +add.f16x2 r5966, r5700, r5963; +} +{ +add.f16x2 r5969, r5703, r5704; +} +{ +mul.f16x2 r5972, r5969, r5680; +} +{ +add.f16x2 r5975, r5966, r5972; +} +{ +sub.f16x2 r5978, r5685, r5686; +} +{ +mul.f16x2 r5981, r5978, r5679; +} +{ +sub.f16x2 r5984, r5691, r5692; +} +{ +mul.f16x2 r5987, r5984, r5682; +} +{ +add.f16x2 r5990, r5981, r5987; +} +{ +sub.f16x2 r5993, r5975, r5990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r5996, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r5997, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r5998, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r5999, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6000, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6001, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6002, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6003, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6006, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6007, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6027, {low, high}; +} +{ +mul.f16x2 r6044, r4781, r5996; +} +{ +mul.f16x2 r6047, r4925, r5997; +} +{ +sub.f16x2 r6050, r6044, 
r6047; +} +{ +mul.f16x2 r6053, r4781, r5997; +} +{ +fma.rn.f16x2 r6056, r4925, r5996, r6053; +} +{ +mul.f16x2 r6060, r5101, r5998; +} +{ +mul.f16x2 r6063, r5245, r5999; +} +{ +sub.f16x2 r6066, r6060, r6063; +} +{ +mul.f16x2 r6069, r5101, r5999; +} +{ +fma.rn.f16x2 r6072, r5245, r5998, r6069; +} +{ +mul.f16x2 r6076, r5421, r6000; +} +{ +mul.f16x2 r6079, r5565, r6001; +} +{ +sub.f16x2 r6082, r6076, r6079; +} +{ +mul.f16x2 r6085, r5421, r6001; +} +{ +fma.rn.f16x2 r6088, r5565, r6000, r6085; +} +{ +mul.f16x2 r6092, r5741, r6002; +} +{ +mul.f16x2 r6095, r5885, r6003; +} +{ +sub.f16x2 r6098, r6092, r6095; +} +{ +mul.f16x2 r6101, r5741, r6003; +} +{ +fma.rn.f16x2 r6104, r5885, r6002, r6101; +} +{ +mul.f16x2 r6108, r4853, r5998; +} +{ +mul.f16x2 r6111, r4997, r5999; +} +{ +sub.f16x2 r6114, r6108, r6111; +} +{ +mul.f16x2 r6117, r4853, r5999; +} +{ +fma.rn.f16x2 r6120, r4997, r5998, r6117; +} +{ +mul.f16x2 r6124, r5173, r6002; +} +{ +mul.f16x2 r6127, r5317, r6003; +} +{ +sub.f16x2 r6130, r6124, r6127; +} +{ +mul.f16x2 r6133, r5173, r6003; +} +{ +fma.rn.f16x2 r6136, r5317, r6002, r6133; +} +{ +mul.f16x2 r6140, r5493, r6006; +} +{ +mul.f16x2 r6143, r5637, r6007; +} +{ +sub.f16x2 r6146, r6140, r6143; +} +{ +mul.f16x2 r6149, r5493, r6007; +} +{ +fma.rn.f16x2 r6152, r5637, r6006, r6149; +} +{ +mul.f16x2 r6156, r5813, r6010; +} +{ +mul.f16x2 r6159, r5957, r6011; +} +{ +sub.f16x2 r6162, r6156, r6159; +} +{ +mul.f16x2 r6165, r5813, r6011; +} +{ +fma.rn.f16x2 r6168, r5957, r6010, r6165; +} +{ +mul.f16x2 r6172, r4889, r6000; +} +{ +mul.f16x2 r6175, r5033, r6001; +} +{ +sub.f16x2 r6178, r6172, r6175; +} +{ +mul.f16x2 r6181, r4889, r6001; +} +{ +fma.rn.f16x2 r6184, r5033, r6000, r6181; +} +{ +mul.f16x2 r6188, r5209, r6006; +} +{ +mul.f16x2 r6191, r5353, r6007; +} +{ +sub.f16x2 r6194, r6188, r6191; +} +{ +mul.f16x2 r6197, r5209, r6007; +} +{ +fma.rn.f16x2 r6200, r5353, r6006, r6197; +} +{ +mul.f16x2 r6204, r5529, r6012; +} +{ +mul.f16x2 r6207, r5673, r6013; +} +{ +sub.f16x2 r6210, r6204, 
r6207; +} +{ +mul.f16x2 r6213, r5529, r6013; +} +{ +fma.rn.f16x2 r6216, r5673, r6012, r6213; +} +{ +mul.f16x2 r6220, r5849, r6018; +} +{ +mul.f16x2 r6223, r5993, r6019; +} +{ +sub.f16x2 r6226, r6220, r6223; +} +{ +mul.f16x2 r6229, r5849, r6019; +} +{ +fma.rn.f16x2 r6232, r5993, r6018, r6229; +} +{ +mul.f16x2 r6236, r4817, r6002; +} +{ +mul.f16x2 r6239, r4961, r6003; +} +{ +sub.f16x2 r6242, r6236, r6239; +} +{ +mul.f16x2 r6245, r4817, r6003; +} +{ +fma.rn.f16x2 r6248, r4961, r6002, r6245; +} +{ +mul.f16x2 r6252, r5137, r6010; +} +{ +mul.f16x2 r6255, r5281, r6011; +} +{ +sub.f16x2 r6258, r6252, r6255; +} +{ +mul.f16x2 r6261, r5137, r6011; +} +{ +fma.rn.f16x2 r6264, r5281, r6010, r6261; +} +{ +mul.f16x2 r6268, r5457, r6018; +} +{ +mul.f16x2 r6271, r5601, r6019; +} +{ +sub.f16x2 r6274, r6268, r6271; +} +{ +mul.f16x2 r6277, r5457, r6019; +} +{ +fma.rn.f16x2 r6280, r5601, r6018, r6277; +} +{ +mul.f16x2 r6284, r5777, r6026; +} +{ +mul.f16x2 r6287, r5921, r6027; +} +{ +sub.f16x2 r6290, r6284, r6287; +} +{ +mul.f16x2 r6293, r5777, r6027; +} +{ +fma.rn.f16x2 r6296, r5921, r6026, r6293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6302, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6303, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6305, {low, high}; +} +{ +neg.f16x2 r6306, r6305; +} +{ +add.f16x2 r6308, r4733, r5693; +} +{ +add.f16x2 r6311, r4413, r6308; +} +{ +add.f16x2 r6314, r5053, r5373; +} +{ +add.f16x2 %0, r6311, r6314; +} +{ +add.f16x2 r6320, r4745, r5705; +} +{ +add.f16x2 
r6323, r4425, r6320; +} +{ +add.f16x2 r6326, r5065, r5385; +} +{ +add.f16x2 %1, r6323, r6326; +} +{ +add.f16x2 r6332, r4733, r5693; +} +{ +mul.f16x2 r6335, r6332, r6300; +} +{ +add.f16x2 r6338, r4413, r6335; +} +{ +add.f16x2 r6341, r5053, r5373; +} +{ +mul.f16x2 r6344, r6341, r6302; +} +{ +add.f16x2 r6347, r6338, r6344; +} +{ +sub.f16x2 r6350, r4745, r5705; +} +{ +mul.f16x2 r6353, r6350, r6301; +} +{ +sub.f16x2 r6356, r5065, r5385; +} +{ +mul.f16x2 r6359, r6356, r6303; +} +{ +add.f16x2 r6362, r6353, r6359; +} +{ +sub.f16x2 %10, r6347, r6362; +} +{ +add.f16x2 r6368, r4733, r5693; +} +{ +mul.f16x2 r6371, r6368, r6300; +} +{ +add.f16x2 r6374, r4413, r6371; +} +{ +add.f16x2 r6377, r5053, r5373; +} +{ +mul.f16x2 r6380, r6377, r6302; +} +{ +add.f16x2 r6383, r6374, r6380; +} +{ +sub.f16x2 r6386, r4745, r5705; +} +{ +mul.f16x2 r6389, r6386, r6301; +} +{ +sub.f16x2 r6392, r5065, r5385; +} +{ +mul.f16x2 r6395, r6392, r6303; +} +{ +add.f16x2 r6398, r6389, r6395; +} +{ +add.f16x2 %40, r6383, r6398; +} +{ +add.f16x2 r6404, r4733, r5693; +} +{ +mul.f16x2 r6407, r6404, r6302; +} +{ +add.f16x2 r6410, r4413, r6407; +} +{ +add.f16x2 r6413, r5053, r5373; +} +{ +mul.f16x2 r6416, r6413, r6304; +} +{ +add.f16x2 r6419, r6410, r6416; +} +{ +sub.f16x2 r6422, r4745, r5705; +} +{ +mul.f16x2 r6425, r6422, r6303; +} +{ +sub.f16x2 r6428, r5065, r5385; +} +{ +mul.f16x2 r6431, r6428, r6306; +} +{ +add.f16x2 r6434, r6425, r6431; +} +{ +sub.f16x2 %20, r6419, r6434; +} +{ +add.f16x2 r6440, r4733, r5693; +} +{ +mul.f16x2 r6443, r6440, r6302; +} +{ +add.f16x2 r6446, r4413, r6443; +} +{ +add.f16x2 r6449, r5053, r5373; +} +{ +mul.f16x2 r6452, r6449, r6304; +} +{ +add.f16x2 r6455, r6446, r6452; +} +{ +sub.f16x2 r6458, r4745, r5705; +} +{ +mul.f16x2 r6461, r6458, r6303; +} +{ +sub.f16x2 r6464, r5065, r5385; +} +{ +mul.f16x2 r6467, r6464, r6306; +} +{ +add.f16x2 r6470, r6461, r6467; +} +{ +add.f16x2 %30, r6455, r6470; +} +{ +add.f16x2 r6476, r4745, r5705; +} +{ +mul.f16x2 r6479, r6476, r6300; +} +{ 
+add.f16x2 r6482, r4425, r6479; +} +{ +add.f16x2 r6485, r5065, r5385; +} +{ +mul.f16x2 r6488, r6485, r6302; +} +{ +add.f16x2 r6491, r6482, r6488; +} +{ +sub.f16x2 r6494, r4733, r5693; +} +{ +mul.f16x2 r6497, r6494, r6301; +} +{ +sub.f16x2 r6500, r5053, r5373; +} +{ +mul.f16x2 r6503, r6500, r6303; +} +{ +add.f16x2 r6506, r6497, r6503; +} +{ +add.f16x2 %11, r6491, r6506; +} +{ +add.f16x2 r6512, r4745, r5705; +} +{ +mul.f16x2 r6515, r6512, r6300; +} +{ +add.f16x2 r6518, r4425, r6515; +} +{ +add.f16x2 r6521, r5065, r5385; +} +{ +mul.f16x2 r6524, r6521, r6302; +} +{ +add.f16x2 r6527, r6518, r6524; +} +{ +sub.f16x2 r6530, r4733, r5693; +} +{ +mul.f16x2 r6533, r6530, r6301; +} +{ +sub.f16x2 r6536, r5053, r5373; +} +{ +mul.f16x2 r6539, r6536, r6303; +} +{ +add.f16x2 r6542, r6533, r6539; +} +{ +sub.f16x2 %41, r6527, r6542; +} +{ +add.f16x2 r6548, r4745, r5705; +} +{ +mul.f16x2 r6551, r6548, r6302; +} +{ +add.f16x2 r6554, r4425, r6551; +} +{ +add.f16x2 r6557, r5065, r5385; +} +{ +mul.f16x2 r6560, r6557, r6304; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +sub.f16x2 r6566, r4733, r5693; +} +{ +mul.f16x2 r6569, r6566, r6303; +} +{ +sub.f16x2 r6572, r5053, r5373; +} +{ +mul.f16x2 r6575, r6572, r6306; +} +{ +add.f16x2 r6578, r6569, r6575; +} +{ +add.f16x2 %21, r6563, r6578; +} +{ +add.f16x2 r6584, r4745, r5705; +} +{ +mul.f16x2 r6587, r6584, r6302; +} +{ +add.f16x2 r6590, r4425, r6587; +} +{ +add.f16x2 r6593, r5065, r5385; +} +{ +mul.f16x2 r6596, r6593, r6304; +} +{ +add.f16x2 r6599, r6590, r6596; +} +{ +sub.f16x2 r6602, r4733, r5693; +} +{ +mul.f16x2 r6605, r6602, r6303; +} +{ +sub.f16x2 r6608, r5053, r5373; +} +{ +mul.f16x2 r6611, r6608, r6306; +} +{ +add.f16x2 r6614, r6605, r6611; +} +{ +sub.f16x2 %31, r6599, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6621, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6623, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6625, {low, high}; +} +{ +neg.f16x2 r6626, r6625; +} +{ +add.f16x2 r6628, r6050, r6098; +} +{ +add.f16x2 r6631, r4461, r6628; +} +{ +add.f16x2 r6634, r6066, r6082; +} +{ +add.f16x2 %2, r6631, r6634; +} +{ +add.f16x2 r6640, r6056, r6104; +} +{ +add.f16x2 r6643, r4605, r6640; +} +{ +add.f16x2 r6646, r6072, r6088; +} +{ +add.f16x2 %3, r6643, r6646; +} +{ +add.f16x2 r6652, r6050, r6098; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4461, r6655; +} +{ +add.f16x2 r6661, r6066, r6082; +} +{ +mul.f16x2 r6664, r6661, r6622; +} +{ +add.f16x2 r6667, r6658, r6664; +} +{ +sub.f16x2 r6670, r6056, r6104; +} +{ +mul.f16x2 r6673, r6670, r6621; +} +{ +sub.f16x2 r6676, r6072, r6088; +} +{ +mul.f16x2 r6679, r6676, r6623; +} +{ +add.f16x2 r6682, r6673, r6679; +} +{ +sub.f16x2 %12, r6667, r6682; +} +{ +add.f16x2 r6688, r6050, r6098; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4461, r6691; +} +{ +add.f16x2 r6697, r6066, r6082; +} +{ +mul.f16x2 r6700, r6697, r6622; +} +{ +add.f16x2 r6703, r6694, r6700; +} +{ +sub.f16x2 r6706, r6056, r6104; +} +{ +mul.f16x2 r6709, r6706, r6621; +} +{ +sub.f16x2 r6712, r6072, r6088; +} +{ +mul.f16x2 r6715, r6712, r6623; +} +{ +add.f16x2 r6718, r6709, r6715; +} +{ +add.f16x2 %42, r6703, r6718; +} +{ +add.f16x2 r6724, r6050, r6098; +} +{ +mul.f16x2 r6727, r6724, r6622; +} +{ +add.f16x2 r6730, r4461, r6727; +} +{ +add.f16x2 r6733, r6066, r6082; +} +{ +mul.f16x2 r6736, r6733, r6624; +} +{ +add.f16x2 r6739, r6730, r6736; +} +{ +sub.f16x2 r6742, r6056, r6104; +} +{ +mul.f16x2 r6745, r6742, r6623; +} +{ +sub.f16x2 r6748, r6072, r6088; 
+} +{ +mul.f16x2 r6751, r6748, r6626; +} +{ +add.f16x2 r6754, r6745, r6751; +} +{ +sub.f16x2 %22, r6739, r6754; +} +{ +add.f16x2 r6760, r6050, r6098; +} +{ +mul.f16x2 r6763, r6760, r6622; +} +{ +add.f16x2 r6766, r4461, r6763; +} +{ +add.f16x2 r6769, r6066, r6082; +} +{ +mul.f16x2 r6772, r6769, r6624; +} +{ +add.f16x2 r6775, r6766, r6772; +} +{ +sub.f16x2 r6778, r6056, r6104; +} +{ +mul.f16x2 r6781, r6778, r6623; +} +{ +sub.f16x2 r6784, r6072, r6088; +} +{ +mul.f16x2 r6787, r6784, r6626; +} +{ +add.f16x2 r6790, r6781, r6787; +} +{ +add.f16x2 %32, r6775, r6790; +} +{ +add.f16x2 r6796, r6056, r6104; +} +{ +mul.f16x2 r6799, r6796, r6620; +} +{ +add.f16x2 r6802, r4605, r6799; +} +{ +add.f16x2 r6805, r6072, r6088; +} +{ +mul.f16x2 r6808, r6805, r6622; +} +{ +add.f16x2 r6811, r6802, r6808; +} +{ +sub.f16x2 r6814, r6050, r6098; +} +{ +mul.f16x2 r6817, r6814, r6621; +} +{ +sub.f16x2 r6820, r6066, r6082; +} +{ +mul.f16x2 r6823, r6820, r6623; +} +{ +add.f16x2 r6826, r6817, r6823; +} +{ +add.f16x2 %13, r6811, r6826; +} +{ +add.f16x2 r6832, r6056, r6104; +} +{ +mul.f16x2 r6835, r6832, r6620; +} +{ +add.f16x2 r6838, r4605, r6835; +} +{ +add.f16x2 r6841, r6072, r6088; +} +{ +mul.f16x2 r6844, r6841, r6622; +} +{ +add.f16x2 r6847, r6838, r6844; +} +{ +sub.f16x2 r6850, r6050, r6098; +} +{ +mul.f16x2 r6853, r6850, r6621; +} +{ +sub.f16x2 r6856, r6066, r6082; +} +{ +mul.f16x2 r6859, r6856, r6623; +} +{ +add.f16x2 r6862, r6853, r6859; +} +{ +sub.f16x2 %43, r6847, r6862; +} +{ +add.f16x2 r6868, r6056, r6104; +} +{ +mul.f16x2 r6871, r6868, r6622; +} +{ +add.f16x2 r6874, r4605, r6871; +} +{ +add.f16x2 r6877, r6072, r6088; +} +{ +mul.f16x2 r6880, r6877, r6624; +} +{ +add.f16x2 r6883, r6874, r6880; +} +{ +sub.f16x2 r6886, r6050, r6098; +} +{ +mul.f16x2 r6889, r6886, r6623; +} +{ +sub.f16x2 r6892, r6066, r6082; +} +{ +mul.f16x2 r6895, r6892, r6626; +} +{ +add.f16x2 r6898, r6889, r6895; +} +{ +add.f16x2 %23, r6883, r6898; +} +{ +add.f16x2 r6904, r6056, r6104; +} +{ +mul.f16x2 r6907, r6904, 
r6622; +} +{ +add.f16x2 r6910, r4605, r6907; +} +{ +add.f16x2 r6913, r6072, r6088; +} +{ +mul.f16x2 r6916, r6913, r6624; +} +{ +add.f16x2 r6919, r6910, r6916; +} +{ +sub.f16x2 r6922, r6050, r6098; +} +{ +mul.f16x2 r6925, r6922, r6623; +} +{ +sub.f16x2 r6928, r6066, r6082; +} +{ +mul.f16x2 r6931, r6928, r6626; +} +{ +add.f16x2 r6934, r6925, r6931; +} +{ +sub.f16x2 %33, r6919, r6934; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6940, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6941, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6945, {low, high}; +} +{ +neg.f16x2 r6946, r6945; +} +{ +add.f16x2 r6948, r6114, r6162; +} +{ +add.f16x2 r6951, r4533, r6948; +} +{ +add.f16x2 r6954, r6130, r6146; +} +{ +add.f16x2 %4, r6951, r6954; +} +{ +add.f16x2 r6960, r6120, r6168; +} +{ +add.f16x2 r6963, r4677, r6960; +} +{ +add.f16x2 r6966, r6136, r6152; +} +{ +add.f16x2 %5, r6963, r6966; +} +{ +add.f16x2 r6972, r6114, r6162; +} +{ +mul.f16x2 r6975, r6972, r6940; +} +{ +add.f16x2 r6978, r4533, r6975; +} +{ +add.f16x2 r6981, r6130, r6146; +} +{ +mul.f16x2 r6984, r6981, r6942; +} +{ +add.f16x2 r6987, r6978, r6984; +} +{ +sub.f16x2 r6990, r6120, r6168; +} +{ +mul.f16x2 r6993, r6990, r6941; +} +{ +sub.f16x2 r6996, r6136, r6152; +} +{ +mul.f16x2 r6999, r6996, r6943; +} +{ +add.f16x2 r7002, r6993, r6999; +} +{ +sub.f16x2 %14, r6987, r7002; +} +{ +add.f16x2 r7008, r6114, r6162; +} +{ +mul.f16x2 r7011, r7008, r6940; +} +{ +add.f16x2 r7014, r4533, r7011; +} +{ +add.f16x2 r7017, r6130, r6146; +} +{ 
+mul.f16x2 r7020, r7017, r6942; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6120, r6168; +} +{ +mul.f16x2 r7029, r7026, r6941; +} +{ +sub.f16x2 r7032, r6136, r6152; +} +{ +mul.f16x2 r7035, r7032, r6943; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +add.f16x2 %44, r7023, r7038; +} +{ +add.f16x2 r7044, r6114, r6162; +} +{ +mul.f16x2 r7047, r7044, r6942; +} +{ +add.f16x2 r7050, r4533, r7047; +} +{ +add.f16x2 r7053, r6130, r6146; +} +{ +mul.f16x2 r7056, r7053, r6944; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6120, r6168; +} +{ +mul.f16x2 r7065, r7062, r6943; +} +{ +sub.f16x2 r7068, r6136, r6152; +} +{ +mul.f16x2 r7071, r7068, r6946; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +sub.f16x2 %24, r7059, r7074; +} +{ +add.f16x2 r7080, r6114, r6162; +} +{ +mul.f16x2 r7083, r7080, r6942; +} +{ +add.f16x2 r7086, r4533, r7083; +} +{ +add.f16x2 r7089, r6130, r6146; +} +{ +mul.f16x2 r7092, r7089, r6944; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6120, r6168; +} +{ +mul.f16x2 r7101, r7098, r6943; +} +{ +sub.f16x2 r7104, r6136, r6152; +} +{ +mul.f16x2 r7107, r7104, r6946; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +add.f16x2 %34, r7095, r7110; +} +{ +add.f16x2 r7116, r6120, r6168; +} +{ +mul.f16x2 r7119, r7116, r6940; +} +{ +add.f16x2 r7122, r4677, r7119; +} +{ +add.f16x2 r7125, r6136, r6152; +} +{ +mul.f16x2 r7128, r7125, r6942; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6114, r6162; +} +{ +mul.f16x2 r7137, r7134, r6941; +} +{ +sub.f16x2 r7140, r6130, r6146; +} +{ +mul.f16x2 r7143, r7140, r6943; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 %15, r7131, r7146; +} +{ +add.f16x2 r7152, r6120, r6168; +} +{ +mul.f16x2 r7155, r7152, r6940; +} +{ +add.f16x2 r7158, r4677, r7155; +} +{ +add.f16x2 r7161, r6136, r6152; +} +{ +mul.f16x2 r7164, r7161, r6942; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6114, r6162; +} +{ +mul.f16x2 r7173, r7170, r6941; +} +{ +sub.f16x2 r7176, r6130, r6146; 
+} +{ +mul.f16x2 r7179, r7176, r6943; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +sub.f16x2 %45, r7167, r7182; +} +{ +add.f16x2 r7188, r6120, r6168; +} +{ +mul.f16x2 r7191, r7188, r6942; +} +{ +add.f16x2 r7194, r4677, r7191; +} +{ +add.f16x2 r7197, r6136, r6152; +} +{ +mul.f16x2 r7200, r7197, r6944; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6114, r6162; +} +{ +mul.f16x2 r7209, r7206, r6943; +} +{ +sub.f16x2 r7212, r6130, r6146; +} +{ +mul.f16x2 r7215, r7212, r6946; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +add.f16x2 %25, r7203, r7218; +} +{ +add.f16x2 r7224, r6120, r6168; +} +{ +mul.f16x2 r7227, r7224, r6942; +} +{ +add.f16x2 r7230, r4677, r7227; +} +{ +add.f16x2 r7233, r6136, r6152; +} +{ +mul.f16x2 r7236, r7233, r6944; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6114, r6162; +} +{ +mul.f16x2 r7245, r7242, r6943; +} +{ +sub.f16x2 r7248, r6130, r6146; +} +{ +mul.f16x2 r7251, r7248, r6946; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +sub.f16x2 %35, r7239, r7254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7265, {low, high}; +} +{ +neg.f16x2 r7266, r7265; +} +{ +add.f16x2 r7268, r6178, r6226; +} +{ +add.f16x2 r7271, r4569, r7268; +} +{ +add.f16x2 r7274, r6194, r6210; +} +{ +add.f16x2 %6, r7271, r7274; +} +{ +add.f16x2 r7280, r6184, r6232; +} +{ +add.f16x2 r7283, r4713, r7280; +} +{ +add.f16x2 r7286, r6200, r6216; +} +{ 
+add.f16x2 %7, r7283, r7286; +} +{ +add.f16x2 r7292, r6178, r6226; +} +{ +mul.f16x2 r7295, r7292, r7260; +} +{ +add.f16x2 r7298, r4569, r7295; +} +{ +add.f16x2 r7301, r6194, r6210; +} +{ +mul.f16x2 r7304, r7301, r7262; +} +{ +add.f16x2 r7307, r7298, r7304; +} +{ +sub.f16x2 r7310, r6184, r6232; +} +{ +mul.f16x2 r7313, r7310, r7261; +} +{ +sub.f16x2 r7316, r6200, r6216; +} +{ +mul.f16x2 r7319, r7316, r7263; +} +{ +add.f16x2 r7322, r7313, r7319; +} +{ +sub.f16x2 %16, r7307, r7322; +} +{ +add.f16x2 r7328, r6178, r6226; +} +{ +mul.f16x2 r7331, r7328, r7260; +} +{ +add.f16x2 r7334, r4569, r7331; +} +{ +add.f16x2 r7337, r6194, r6210; +} +{ +mul.f16x2 r7340, r7337, r7262; +} +{ +add.f16x2 r7343, r7334, r7340; +} +{ +sub.f16x2 r7346, r6184, r6232; +} +{ +mul.f16x2 r7349, r7346, r7261; +} +{ +sub.f16x2 r7352, r6200, r6216; +} +{ +mul.f16x2 r7355, r7352, r7263; +} +{ +add.f16x2 r7358, r7349, r7355; +} +{ +add.f16x2 %46, r7343, r7358; +} +{ +add.f16x2 r7364, r6178, r6226; +} +{ +mul.f16x2 r7367, r7364, r7262; +} +{ +add.f16x2 r7370, r4569, r7367; +} +{ +add.f16x2 r7373, r6194, r6210; +} +{ +mul.f16x2 r7376, r7373, r7264; +} +{ +add.f16x2 r7379, r7370, r7376; +} +{ +sub.f16x2 r7382, r6184, r6232; +} +{ +mul.f16x2 r7385, r7382, r7263; +} +{ +sub.f16x2 r7388, r6200, r6216; +} +{ +mul.f16x2 r7391, r7388, r7266; +} +{ +add.f16x2 r7394, r7385, r7391; +} +{ +sub.f16x2 %26, r7379, r7394; +} +{ +add.f16x2 r7400, r6178, r6226; +} +{ +mul.f16x2 r7403, r7400, r7262; +} +{ +add.f16x2 r7406, r4569, r7403; +} +{ +add.f16x2 r7409, r6194, r6210; +} +{ +mul.f16x2 r7412, r7409, r7264; +} +{ +add.f16x2 r7415, r7406, r7412; +} +{ +sub.f16x2 r7418, r6184, r6232; +} +{ +mul.f16x2 r7421, r7418, r7263; +} +{ +sub.f16x2 r7424, r6200, r6216; +} +{ +mul.f16x2 r7427, r7424, r7266; +} +{ +add.f16x2 r7430, r7421, r7427; +} +{ +add.f16x2 %36, r7415, r7430; +} +{ +add.f16x2 r7436, r6184, r6232; +} +{ +mul.f16x2 r7439, r7436, r7260; +} +{ +add.f16x2 r7442, r4713, r7439; +} +{ +add.f16x2 r7445, r6200, r6216; +} 
+{ +mul.f16x2 r7448, r7445, r7262; +} +{ +add.f16x2 r7451, r7442, r7448; +} +{ +sub.f16x2 r7454, r6178, r6226; +} +{ +mul.f16x2 r7457, r7454, r7261; +} +{ +sub.f16x2 r7460, r6194, r6210; +} +{ +mul.f16x2 r7463, r7460, r7263; +} +{ +add.f16x2 r7466, r7457, r7463; +} +{ +add.f16x2 %17, r7451, r7466; +} +{ +add.f16x2 r7472, r6184, r6232; +} +{ +mul.f16x2 r7475, r7472, r7260; +} +{ +add.f16x2 r7478, r4713, r7475; +} +{ +add.f16x2 r7481, r6200, r6216; +} +{ +mul.f16x2 r7484, r7481, r7262; +} +{ +add.f16x2 r7487, r7478, r7484; +} +{ +sub.f16x2 r7490, r6178, r6226; +} +{ +mul.f16x2 r7493, r7490, r7261; +} +{ +sub.f16x2 r7496, r6194, r6210; +} +{ +mul.f16x2 r7499, r7496, r7263; +} +{ +add.f16x2 r7502, r7493, r7499; +} +{ +sub.f16x2 %47, r7487, r7502; +} +{ +add.f16x2 r7508, r6184, r6232; +} +{ +mul.f16x2 r7511, r7508, r7262; +} +{ +add.f16x2 r7514, r4713, r7511; +} +{ +add.f16x2 r7517, r6200, r6216; +} +{ +mul.f16x2 r7520, r7517, r7264; +} +{ +add.f16x2 r7523, r7514, r7520; +} +{ +sub.f16x2 r7526, r6178, r6226; +} +{ +mul.f16x2 r7529, r7526, r7263; +} +{ +sub.f16x2 r7532, r6194, r6210; +} +{ +mul.f16x2 r7535, r7532, r7266; +} +{ +add.f16x2 r7538, r7529, r7535; +} +{ +add.f16x2 %27, r7523, r7538; +} +{ +add.f16x2 r7544, r6184, r6232; +} +{ +mul.f16x2 r7547, r7544, r7262; +} +{ +add.f16x2 r7550, r4713, r7547; +} +{ +add.f16x2 r7553, r6200, r6216; +} +{ +mul.f16x2 r7556, r7553, r7264; +} +{ +add.f16x2 r7559, r7550, r7556; +} +{ +sub.f16x2 r7562, r6178, r6226; +} +{ +mul.f16x2 r7565, r7562, r7263; +} +{ +sub.f16x2 r7568, r6194, r6210; +} +{ +mul.f16x2 r7571, r7568, r7266; +} +{ +add.f16x2 r7574, r7565, r7571; +} +{ +sub.f16x2 %37, r7559, r7574; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7581, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7582, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7584, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7585, {low, high}; +} +{ +neg.f16x2 r7586, r7585; +} +{ +add.f16x2 r7588, r6242, r6290; +} +{ +add.f16x2 r7591, r4497, r7588; +} +{ +add.f16x2 r7594, r6258, r6274; +} +{ +add.f16x2 %8, r7591, r7594; +} +{ +add.f16x2 r7600, r6248, r6296; +} +{ +add.f16x2 r7603, r4641, r7600; +} +{ +add.f16x2 r7606, r6264, r6280; +} +{ +add.f16x2 %9, r7603, r7606; +} +{ +add.f16x2 r7612, r6242, r6290; +} +{ +mul.f16x2 r7615, r7612, r7580; +} +{ +add.f16x2 r7618, r4497, r7615; +} +{ +add.f16x2 r7621, r6258, r6274; +} +{ +mul.f16x2 r7624, r7621, r7582; +} +{ +add.f16x2 r7627, r7618, r7624; +} +{ +sub.f16x2 r7630, r6248, r6296; +} +{ +mul.f16x2 r7633, r7630, r7581; +} +{ +sub.f16x2 r7636, r6264, r6280; +} +{ +mul.f16x2 r7639, r7636, r7583; +} +{ +add.f16x2 r7642, r7633, r7639; +} +{ +sub.f16x2 %18, r7627, r7642; +} +{ +add.f16x2 r7648, r6242, r6290; +} +{ +mul.f16x2 r7651, r7648, r7580; +} +{ +add.f16x2 r7654, r4497, r7651; +} +{ +add.f16x2 r7657, r6258, r6274; +} +{ +mul.f16x2 r7660, r7657, r7582; +} +{ +add.f16x2 r7663, r7654, r7660; +} +{ +sub.f16x2 r7666, r6248, r6296; +} +{ +mul.f16x2 r7669, r7666, r7581; +} +{ +sub.f16x2 r7672, r6264, r6280; +} +{ +mul.f16x2 r7675, r7672, r7583; +} +{ +add.f16x2 r7678, r7669, r7675; +} +{ +add.f16x2 %48, r7663, r7678; +} +{ +add.f16x2 r7684, r6242, r6290; +} +{ +mul.f16x2 r7687, r7684, r7582; +} +{ +add.f16x2 r7690, r4497, r7687; +} +{ +add.f16x2 r7693, r6258, r6274; +} +{ +mul.f16x2 r7696, r7693, r7584; +} +{ +add.f16x2 r7699, r7690, r7696; +} +{ +sub.f16x2 r7702, r6248, r6296; +} +{ +mul.f16x2 r7705, r7702, r7583; +} +{ +sub.f16x2 r7708, r6264, r6280; +} +{ +mul.f16x2 r7711, r7708, r7586; +} +{ +add.f16x2 r7714, r7705, r7711; +} +{ 
+sub.f16x2 %28, r7699, r7714; +} +{ +add.f16x2 r7720, r6242, r6290; +} +{ +mul.f16x2 r7723, r7720, r7582; +} +{ +add.f16x2 r7726, r4497, r7723; +} +{ +add.f16x2 r7729, r6258, r6274; +} +{ +mul.f16x2 r7732, r7729, r7584; +} +{ +add.f16x2 r7735, r7726, r7732; +} +{ +sub.f16x2 r7738, r6248, r6296; +} +{ +mul.f16x2 r7741, r7738, r7583; +} +{ +sub.f16x2 r7744, r6264, r6280; +} +{ +mul.f16x2 r7747, r7744, r7586; +} +{ +add.f16x2 r7750, r7741, r7747; +} +{ +add.f16x2 %38, r7735, r7750; +} +{ +add.f16x2 r7756, r6248, r6296; +} +{ +mul.f16x2 r7759, r7756, r7580; +} +{ +add.f16x2 r7762, r4641, r7759; +} +{ +add.f16x2 r7765, r6264, r6280; +} +{ +mul.f16x2 r7768, r7765, r7582; +} +{ +add.f16x2 r7771, r7762, r7768; +} +{ +sub.f16x2 r7774, r6242, r6290; +} +{ +mul.f16x2 r7777, r7774, r7581; +} +{ +sub.f16x2 r7780, r6258, r6274; +} +{ +mul.f16x2 r7783, r7780, r7583; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 %19, r7771, r7786; +} +{ +add.f16x2 r7792, r6248, r6296; +} +{ +mul.f16x2 r7795, r7792, r7580; +} +{ +add.f16x2 r7798, r4641, r7795; +} +{ +add.f16x2 r7801, r6264, r6280; +} +{ +mul.f16x2 r7804, r7801, r7582; +} +{ +add.f16x2 r7807, r7798, r7804; +} +{ +sub.f16x2 r7810, r6242, r6290; +} +{ +mul.f16x2 r7813, r7810, r7581; +} +{ +sub.f16x2 r7816, r6258, r6274; +} +{ +mul.f16x2 r7819, r7816, r7583; +} +{ +add.f16x2 r7822, r7813, r7819; +} +{ +sub.f16x2 %49, r7807, r7822; +} +{ +add.f16x2 r7828, r6248, r6296; +} +{ +mul.f16x2 r7831, r7828, r7582; +} +{ +add.f16x2 r7834, r4641, r7831; +} +{ +add.f16x2 r7837, r6264, r6280; +} +{ +mul.f16x2 r7840, r7837, r7584; +} +{ +add.f16x2 r7843, r7834, r7840; +} +{ +sub.f16x2 r7846, r6242, r6290; +} +{ +mul.f16x2 r7849, r7846, r7583; +} +{ +sub.f16x2 r7852, r6258, r6274; +} +{ +mul.f16x2 r7855, r7852, r7586; +} +{ +add.f16x2 r7858, r7849, r7855; +} +{ +add.f16x2 %29, r7843, r7858; +} +{ +add.f16x2 r7864, r6248, r6296; +} +{ +mul.f16x2 r7867, r7864, r7582; +} +{ +add.f16x2 r7870, r4641, r7867; +} +{ +add.f16x2 r7873, r6264, r6280; 
+} +{ +mul.f16x2 r7876, r7873, r7584; +} +{ +add.f16x2 r7879, r7870, r7876; +} +{ +sub.f16x2 r7882, r6242, r6290; +} +{ +mul.f16x2 r7885, r7882, r7583; +} +{ +sub.f16x2 r7888, r6258, r6274; +} +{ +mul.f16x2 r7891, r7888, r7586; +} +{ +add.f16x2 r7894, r7885, r7891; +} +{ +sub.f16x2 %39, r7879, r7894; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), 
"r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[18].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<910, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<1757>; +.reg .b64 rd<8>; +mov.u32 r1734, %tid.x; +mov.f32 f82, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1, {low, high}; +} +mov.f32 f84, 0fBF737871; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r2, {low, high}; +} +mov.f32 f78, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r3, {low, high}; +} +mov.f32 f80, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, 
r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ 
+sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r1734, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1735, rd3; +mul.lo.s32 r1736, r1735, 125; +sub.s32 r1737, r1734, r1736; +cvt.rn.f32.u32 f85, r1737; +mul.f32 f86, f85, 0f3C24B5BE; +cos.approx.f32 f13, f86; +sin.approx.f32 f87, f86; +neg.f32 f14, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +mov.u32 r1738, %tid.y; +mov.u32 r1739, %10; +mad.lo.s32 r1740, r1738, 5000, r1739; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r385, 
{low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +mad.lo.s32 r1741, r1735, 5000, r1740; +barrier.sync 0; +mad.lo.s32 r1742, r1737, 40, r1741; +st.shared.v2.f32 [r1742], {r18, r30}; +st.shared.v2.f32 [r1742+8], {r333, r340}; +st.shared.v2.f32 [r1742+16], {r370, r377}; +st.shared.v2.f32 [r1742+24], {r407, r414}; +st.shared.v2.f32 [r1742+32], {r444, r451}; +barrier.sync 0; +shl.b32 r1743, r1737, 5; +sub.s32 r1744, r1742, r1743; +ld.shared.u32 r484, [r1744]; +ld.shared.u32 r496, [r1744+4]; +ld.shared.u32 r481, [r1744+1000]; +ld.shared.u32 r493, [r1744+1004]; +ld.shared.u32 r487, [r1744+2000]; +ld.shared.u32 r499, [r1744+2004]; +ld.shared.u32 
r488, [r1744+3000]; +ld.shared.u32 r500, [r1744+3004]; +ld.shared.u32 r482, [r1744+4000]; +ld.shared.u32 r494, [r1744+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} 
+{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ +add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ 
+add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r1737, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1745, rd5; +cvt.rn.f32.u32 f88, r1745; +mul.f32 f89, f88, 0f3D4DE32E; +cos.approx.f32 f37, f89; +sin.approx.f32 f90, f89; +neg.f32 f38, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +mul.lo.s32 r1746, r1745, 5; +sub.s32 r1747, r1737, r1746; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, 
r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +shl.b32 r1748, r1747, 3; +add.s32 r1749, r1741, r1748; +barrier.sync 0; +mad.lo.s32 r1750, r1745, 200, r1749; +st.shared.u32 [r1750], r489; +st.shared.u32 [r1750+4], r501; +st.shared.u32 [r1750+40], r804; +st.shared.u32 [r1750+44], r811; +st.shared.u32 [r1750+80], r841; +st.shared.u32 [r1750+84], r848; 
+st.shared.u32 [r1750+120], r878; +st.shared.u32 [r1750+124], r885; +st.shared.u32 [r1750+160], r915; +st.shared.u32 [r1750+164], r922; +barrier.sync 0; +ld.shared.u32 r955, [r1744]; +ld.shared.u32 r967, [r1744+4]; +ld.shared.u32 r952, [r1744+1000]; +ld.shared.u32 r964, [r1744+1004]; +ld.shared.u32 r958, [r1744+2000]; +ld.shared.u32 r970, [r1744+2004]; +ld.shared.u32 r959, [r1744+3000]; +ld.shared.u32 r971, [r1744+3004]; +ld.shared.u32 r953, [r1744+4000]; +ld.shared.u32 r965, [r1744+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 r1008, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ 
+add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 r1044, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ +mul.f16x2 r1068, r1065, r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ +sub.f16x2 r1080, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 r1116, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 r1152, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ 
+mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 r1188, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 r1224, r1206, r1221; +} +{ +add.f16x2 r1227, r964, r965; +} +{ +mul.f16x2 r1230, r1227, r945; +} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, r1248, r1254; +} +{ +sub.f16x2 r1260, r1242, r1257; +} +mul.wide.u32 rd6, r1737, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1751, rd7; +cvt.rn.f32.u32 f91, r1751; +mul.f32 f92, f91, 0f3E80ADFD; +cos.approx.f32 f61, f92; +sin.approx.f32 f93, f92; +neg.f32 f62, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1263, {low, high}; +} +mul.lo.s32 r1752, r1751, 25; +sub.s32 r1753, r1737, r1752; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1266, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1268, {high, high}; +} +{ +mul.f16x2 r1270, r1152, r1268; +} +{ +neg.f16x2 r1273, r1270; +} +{ +fma.rn.f16x2 r1275, r1008, r1266, r1273; +} +{ +mul.f16x2 r1279, r1008, r1268; +} +{ +fma.rn.f16x2 r1282, r1152, r1266, r1279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1286, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1288, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 
high, f70; +mov.b32 r1290, {low, high}; +} +{ +mul.f16x2 r1291, r1288, r1290; +} +{ +mul.f16x2 r1294, r1263, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1297, {high, low}; +} +{ +fma.rn.f16x2 r1299, r1291, r1297, r1294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1305, {high, high}; +} +{ +mul.f16x2 r1307, r1224, r1305; +} +{ +neg.f16x2 r1310, r1307; +} +{ +fma.rn.f16x2 r1312, r1080, r1303, r1310; +} +{ +mul.f16x2 r1316, r1080, r1305; +} +{ +fma.rn.f16x2 r1319, r1224, r1303, r1316; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1325, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1327, {low, high}; +} +{ +mul.f16x2 r1328, r1325, r1327; +} +{ +mul.f16x2 r1331, r1299, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1334, {high, low}; +} +{ +fma.rn.f16x2 r1336, r1328, r1334, r1331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1340, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1342, {high, high}; +} +{ +mul.f16x2 r1344, r1260, r1342; +} +{ +neg.f16x2 r1347, r1344; +} +{ +fma.rn.f16x2 r1349, r1116, r1340, r1347; +} +{ +mul.f16x2 r1353, r1116, r1342; +} +{ +fma.rn.f16x2 r1356, r1260, r1340, r1353; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1360, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1362, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1364, {low, high}; +} +{ +mul.f16x2 r1365, r1362, r1364; +} +{ +mul.f16x2 r1368, r1336, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1371, {high, low}; +} +{ +fma.rn.f16x2 r1373, r1365, r1371, r1368; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1377, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1379, {high, high}; +} +{ +mul.f16x2 r1381, r1188, r1379; +} +{ +neg.f16x2 r1384, r1381; +} +{ +fma.rn.f16x2 r1386, r1044, r1377, r1384; +} +{ +mul.f16x2 r1390, r1044, r1379; +} +{ +fma.rn.f16x2 r1393, r1188, r1377, r1390; +} +shl.b32 r1754, r1753, 3; +add.s32 r1755, r1741, r1754; +barrier.sync 0; +mad.lo.s32 r1756, r1751, 1000, r1755; +st.shared.u32 [r1756], r960; +st.shared.u32 [r1756+4], r972; +st.shared.u32 [r1756+200], r1275; +st.shared.u32 [r1756+204], r1282; +st.shared.u32 [r1756+400], r1312; +st.shared.u32 [r1756+404], r1319; +st.shared.u32 [r1756+600], r1349; +st.shared.u32 [r1756+604], r1356; +st.shared.u32 [r1756+800], r1386; +st.shared.u32 [r1756+804], r1393; +barrier.sync 0; +ld.shared.u32 r1426, [r1744]; +ld.shared.u32 r1438, [r1744+4]; +ld.shared.u32 r1423, [r1744+1000]; +ld.shared.u32 r1435, [r1744+1004]; +ld.shared.u32 r1429, [r1744+2000]; +ld.shared.u32 r1441, [r1744+2004]; +ld.shared.u32 r1430, [r1744+3000]; +ld.shared.u32 r1442, [r1744+3004]; +ld.shared.u32 r1424, [r1744+4000]; +ld.shared.u32 r1436, [r1744+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1415, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1419, {low, high}; +} +{ +neg.f16x2 r1420, r1419; +} +{ +add.f16x2 r1422, r1423, r1424; +} +{ +add.f16x2 r1425, r1426, r1422; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ 
+add.f16x2 %0, r1425, r1428; +} +{ +add.f16x2 r1434, r1435, r1436; +} +{ +add.f16x2 r1437, r1438, r1434; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +add.f16x2 %1, r1437, r1440; +} +{ +add.f16x2 r1446, r1423, r1424; +} +{ +mul.f16x2 r1449, r1446, r1414; +} +{ +add.f16x2 r1452, r1426, r1449; +} +{ +add.f16x2 r1455, r1429, r1430; +} +{ +mul.f16x2 r1458, r1455, r1416; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +sub.f16x2 r1464, r1435, r1436; +} +{ +mul.f16x2 r1467, r1464, r1415; +} +{ +sub.f16x2 r1470, r1441, r1442; +} +{ +mul.f16x2 r1473, r1470, r1417; +} +{ +add.f16x2 r1476, r1467, r1473; +} +{ +sub.f16x2 %2, r1461, r1476; +} +{ +add.f16x2 r1482, r1423, r1424; +} +{ +mul.f16x2 r1485, r1482, r1414; +} +{ +add.f16x2 r1488, r1426, r1485; +} +{ +add.f16x2 r1491, r1429, r1430; +} +{ +mul.f16x2 r1494, r1491, r1416; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +sub.f16x2 r1500, r1435, r1436; +} +{ +mul.f16x2 r1503, r1500, r1415; +} +{ +sub.f16x2 r1506, r1441, r1442; +} +{ +mul.f16x2 r1509, r1506, r1417; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +add.f16x2 %8, r1497, r1512; +} +{ +add.f16x2 r1518, r1423, r1424; +} +{ +mul.f16x2 r1521, r1518, r1416; +} +{ +add.f16x2 r1524, r1426, r1521; +} +{ +add.f16x2 r1527, r1429, r1430; +} +{ +mul.f16x2 r1530, r1527, r1418; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1435, r1436; +} +{ +mul.f16x2 r1539, r1536, r1417; +} +{ +sub.f16x2 r1542, r1441, r1442; +} +{ +mul.f16x2 r1545, r1542, r1420; +} +{ +add.f16x2 r1548, r1539, r1545; +} +{ +sub.f16x2 %4, r1533, r1548; +} +{ +add.f16x2 r1554, r1423, r1424; +} +{ +mul.f16x2 r1557, r1554, r1416; +} +{ +add.f16x2 r1560, r1426, r1557; +} +{ +add.f16x2 r1563, r1429, r1430; +} +{ +mul.f16x2 r1566, r1563, r1418; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +sub.f16x2 r1572, r1435, r1436; +} +{ +mul.f16x2 r1575, r1572, r1417; +} +{ +sub.f16x2 r1578, r1441, r1442; +} +{ +mul.f16x2 r1581, r1578, r1420; +} +{ +add.f16x2 r1584, r1575, r1581; +} +{ +add.f16x2 %6, r1569, r1584; +} +{ 
+add.f16x2 r1590, r1435, r1436; +} +{ +mul.f16x2 r1593, r1590, r1414; +} +{ +add.f16x2 r1596, r1438, r1593; +} +{ +add.f16x2 r1599, r1441, r1442; +} +{ +mul.f16x2 r1602, r1599, r1416; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1423, r1424; +} +{ +mul.f16x2 r1611, r1608, r1415; +} +{ +sub.f16x2 r1614, r1429, r1430; +} +{ +mul.f16x2 r1617, r1614, r1417; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +add.f16x2 %3, r1605, r1620; +} +{ +add.f16x2 r1626, r1435, r1436; +} +{ +mul.f16x2 r1629, r1626, r1414; +} +{ +add.f16x2 r1632, r1438, r1629; +} +{ +add.f16x2 r1635, r1441, r1442; +} +{ +mul.f16x2 r1638, r1635, r1416; +} +{ +add.f16x2 r1641, r1632, r1638; +} +{ +sub.f16x2 r1644, r1423, r1424; +} +{ +mul.f16x2 r1647, r1644, r1415; +} +{ +sub.f16x2 r1650, r1429, r1430; +} +{ +mul.f16x2 r1653, r1650, r1417; +} +{ +add.f16x2 r1656, r1647, r1653; +} +{ +sub.f16x2 %9, r1641, r1656; +} +{ +add.f16x2 r1662, r1435, r1436; +} +{ +mul.f16x2 r1665, r1662, r1416; +} +{ +add.f16x2 r1668, r1438, r1665; +} +{ +add.f16x2 r1671, r1441, r1442; +} +{ +mul.f16x2 r1674, r1671, r1418; +} +{ +add.f16x2 r1677, r1668, r1674; +} +{ +sub.f16x2 r1680, r1423, r1424; +} +{ +mul.f16x2 r1683, r1680, r1417; +} +{ +sub.f16x2 r1686, r1429, r1430; +} +{ +mul.f16x2 r1689, r1686, r1420; +} +{ +add.f16x2 r1692, r1683, r1689; +} +{ +add.f16x2 %5, r1677, r1692; +} +{ +add.f16x2 r1698, r1435, r1436; +} +{ +mul.f16x2 r1701, r1698, r1416; +} +{ +add.f16x2 r1704, r1438, r1701; +} +{ +add.f16x2 r1707, r1441, r1442; +} +{ +mul.f16x2 r1710, r1707, r1418; +} +{ +add.f16x2 r1713, r1704, r1710; +} +{ +sub.f16x2 r1716, r1423, r1424; +} +{ +mul.f16x2 r1719, r1716, r1417; +} +{ +sub.f16x2 r1722, r1429, r1430; +} +{ +mul.f16x2 r1725, r1722, r1420; +} +{ +add.f16x2 r1728, r1719, r1725; +} +{ +sub.f16x2 %7, r1713, r1728; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), 
"=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<911, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<1757>; +.reg .b64 rd<8>; +mov.u32 r1734, %tid.y; +mov.u32 r1735, %10; +mad.lo.s32 r1736, r1734, 2500, r1735; +mov.u32 r1737, %tid.x; +mov.f32 f82, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1, {low, high}; +} +mov.f32 f84, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r2, {low, high}; +} +mov.f32 f78, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r3, {low, high}; +} +mov.f32 f80, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r4, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r5, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +add.f16x2 r9, %13, %19; +} +{ +add.f16x2 r12, %11, r9; +} +{ +add.f16x2 r15, %15, %17; +} +{ +add.f16x2 r18, r12, r15; +} +{ +add.f16x2 r21, %14, %20; +} +{ +add.f16x2 r24, %12, r21; +} +{ +add.f16x2 r27, %16, %18; +} +{ +add.f16x2 r30, r24, r27; +} +{ +add.f16x2 r33, %13, %19; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %11, r36; +} +{ +add.f16x2 r42, %15, %17; +} +{ +mul.f16x2 r45, r42, 
r3; +} +{ +add.f16x2 r48, r39, r45; +} +{ +sub.f16x2 r51, %14, %20; +} +{ +mul.f16x2 r54, r51, r2; +} +{ +sub.f16x2 r57, %16, %18; +} +{ +mul.f16x2 r60, r57, r4; +} +{ +add.f16x2 r63, r54, r60; +} +{ +sub.f16x2 r66, r48, r63; +} +{ +add.f16x2 r69, %13, %19; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %11, r72; +} +{ +add.f16x2 r78, %15, %17; +} +{ +mul.f16x2 r81, r78, r3; +} +{ +add.f16x2 r84, r75, r81; +} +{ +sub.f16x2 r87, %14, %20; +} +{ +mul.f16x2 r90, r87, r2; +} +{ +sub.f16x2 r93, %16, %18; +} +{ +mul.f16x2 r96, r93, r4; +} +{ +add.f16x2 r99, r90, r96; +} +{ +add.f16x2 r102, r84, r99; +} +{ +add.f16x2 r105, %13, %19; +} +{ +mul.f16x2 r108, r105, r3; +} +{ +add.f16x2 r111, %11, r108; +} +{ +add.f16x2 r114, %15, %17; +} +{ +mul.f16x2 r117, r114, r5; +} +{ +add.f16x2 r120, r111, r117; +} +{ +sub.f16x2 r123, %14, %20; +} +{ +mul.f16x2 r126, r123, r4; +} +{ +sub.f16x2 r129, %16, %18; +} +{ +mul.f16x2 r132, r129, r7; +} +{ +add.f16x2 r135, r126, r132; +} +{ +sub.f16x2 r138, r120, r135; +} +{ +add.f16x2 r141, %13, %19; +} +{ +mul.f16x2 r144, r141, r3; +} +{ +add.f16x2 r147, %11, r144; +} +{ +add.f16x2 r150, %15, %17; +} +{ +mul.f16x2 r153, r150, r5; +} +{ +add.f16x2 r156, r147, r153; +} +{ +sub.f16x2 r159, %14, %20; +} +{ +mul.f16x2 r162, r159, r4; +} +{ +sub.f16x2 r165, %16, %18; +} +{ +mul.f16x2 r168, r165, r7; +} +{ +add.f16x2 r171, r162, r168; +} +{ +add.f16x2 r174, r156, r171; +} +{ +add.f16x2 r177, %14, %20; +} +{ +mul.f16x2 r180, r177, r1; +} +{ +add.f16x2 r183, %12, r180; +} +{ +add.f16x2 r186, %16, %18; +} +{ +mul.f16x2 r189, r186, r3; +} +{ +add.f16x2 r192, r183, r189; +} +{ +sub.f16x2 r195, %13, %19; +} +{ +mul.f16x2 r198, r195, r2; +} +{ +sub.f16x2 r201, %15, %17; +} +{ +mul.f16x2 r204, r201, r4; +} +{ +add.f16x2 r207, r198, r204; +} +{ +add.f16x2 r210, r192, r207; +} +{ +add.f16x2 r213, %14, %20; +} +{ +mul.f16x2 r216, r213, r1; +} +{ +add.f16x2 r219, %12, r216; +} +{ +add.f16x2 r222, %16, %18; +} +{ +mul.f16x2 r225, r222, r3; +} +{ +add.f16x2 
r228, r219, r225; +} +{ +sub.f16x2 r231, %13, %19; +} +{ +mul.f16x2 r234, r231, r2; +} +{ +sub.f16x2 r237, %15, %17; +} +{ +mul.f16x2 r240, r237, r4; +} +{ +add.f16x2 r243, r234, r240; +} +{ +sub.f16x2 r246, r228, r243; +} +{ +add.f16x2 r249, %14, %20; +} +{ +mul.f16x2 r252, r249, r3; +} +{ +add.f16x2 r255, %12, r252; +} +{ +add.f16x2 r258, %16, %18; +} +{ +mul.f16x2 r261, r258, r5; +} +{ +add.f16x2 r264, r255, r261; +} +{ +sub.f16x2 r267, %13, %19; +} +{ +mul.f16x2 r270, r267, r4; +} +{ +sub.f16x2 r273, %15, %17; +} +{ +mul.f16x2 r276, r273, r7; +} +{ +add.f16x2 r279, r270, r276; +} +{ +add.f16x2 r282, r264, r279; +} +{ +add.f16x2 r285, %14, %20; +} +{ +mul.f16x2 r288, r285, r3; +} +{ +add.f16x2 r291, %12, r288; +} +{ +add.f16x2 r294, %16, %18; +} +{ +mul.f16x2 r297, r294, r5; +} +{ +add.f16x2 r300, r291, r297; +} +{ +sub.f16x2 r303, %13, %19; +} +{ +mul.f16x2 r306, r303, r4; +} +{ +sub.f16x2 r309, %15, %17; +} +{ +mul.f16x2 r312, r309, r7; +} +{ +add.f16x2 r315, r306, r312; +} +{ +sub.f16x2 r318, r300, r315; +} +mul.wide.u32 rd2, r1737, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1738, rd3; +mul.lo.s32 r1739, r1738, 125; +sub.s32 r1740, r1737, r1739; +mad.lo.s32 r1741, r1738, 2500, r1736; +cvt.rn.f32.u32 f85, r1740; +mul.f32 f86, f85, 0f3C24B5BE; +cos.approx.f32 f13, f86; +sin.approx.f32 f87, f86; +neg.f32 f14, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r321, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r324, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r326, {high, high}; +} +{ +mul.f16x2 r328, r210, r326; +} +{ +neg.f16x2 r331, r328; +} +{ +fma.rn.f16x2 r333, r66, r324, r331; +} +{ +mul.f16x2 r337, r66, r326; +} +{ +fma.rn.f16x2 r340, r210, r324, r337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r344, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r346, {high, high}; +} +mov.f32 f69, 0fBF800000; 
+mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r348, {low, high}; +} +{ +mul.f16x2 r349, r346, r348; +} +{ +mul.f16x2 r352, r321, r344; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r355, {high, low}; +} +{ +fma.rn.f16x2 r357, r349, r355, r352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r361, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r363, {high, high}; +} +{ +mul.f16x2 r365, r282, r363; +} +{ +neg.f16x2 r368, r365; +} +{ +fma.rn.f16x2 r370, r138, r361, r368; +} +{ +mul.f16x2 r374, r138, r363; +} +{ +fma.rn.f16x2 r377, r282, r361, r374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r381, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r383, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r385, {low, high}; +} +{ +mul.f16x2 r386, r383, r385; +} +{ +mul.f16x2 r389, r357, r381; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r357; +mov.b32 r392, {high, low}; +} +{ +fma.rn.f16x2 r394, r386, r392, r389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r398, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r400, {high, high}; +} +{ +mul.f16x2 r402, r318, r400; +} +{ +neg.f16x2 r405, r402; +} +{ +fma.rn.f16x2 r407, r174, r398, r405; +} +{ +mul.f16x2 r411, r174, r400; +} +{ +fma.rn.f16x2 r414, r318, r398, r411; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r418, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r321; +mov.b32 r420, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r422, {low, high}; +} +{ +mul.f16x2 r423, r420, r422; +} +{ +mul.f16x2 r426, r394, r418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r394; +mov.b32 r429, {high, low}; +} +{ +fma.rn.f16x2 r431, r423, r429, r426; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r435, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r431; +mov.b32 r437, {high, high}; +} +{ +mul.f16x2 r439, r246, r437; +} +{ +neg.f16x2 r442, r439; +} +{ +fma.rn.f16x2 r444, r102, r435, r442; +} +{ +mul.f16x2 r448, r102, r437; +} +{ +fma.rn.f16x2 r451, r246, r435, r448; +} +barrier.sync 0; +mad.lo.s32 r1742, r1740, 20, r1741; +st.shared.u32 [r1742], r18; +st.shared.u32 [r1742+4], r333; +st.shared.u32 [r1742+8], r370; +st.shared.u32 [r1742+12], r407; +st.shared.u32 [r1742+16], r444; +barrier.sync 0; +shl.b32 r1743, r1740, 4; +sub.s32 r1744, r1742, r1743; +ld.shared.u32 r484, [r1744]; +ld.shared.u32 r481, [r1744+500]; +ld.shared.u32 r487, [r1744+1000]; +ld.shared.u32 r488, [r1744+1500]; +ld.shared.u32 r482, [r1744+2000]; +barrier.sync 0; +st.shared.u32 [r1742], r30; +st.shared.u32 [r1742+4], r340; +st.shared.u32 [r1742+8], r377; +st.shared.u32 [r1742+12], r414; +st.shared.u32 [r1742+16], r451; +barrier.sync 0; +ld.shared.u32 r496, [r1744]; +ld.shared.u32 r493, [r1744+500]; +ld.shared.u32 r499, [r1744+1000]; +ld.shared.u32 r500, [r1744+1500]; +ld.shared.u32 r494, [r1744+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r473, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r477, {low, high}; +} +{ +neg.f16x2 r478, r477; +} +{ +add.f16x2 r480, r481, r482; +} +{ +add.f16x2 r483, r484, r480; +} +{ +add.f16x2 r486, r487, r488; +} +{ +add.f16x2 r489, r483, r486; +} +{ 
+add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r495, r498; +} +{ +add.f16x2 r504, r481, r482; +} +{ +mul.f16x2 r507, r504, r472; +} +{ +add.f16x2 r510, r484, r507; +} +{ +add.f16x2 r513, r487, r488; +} +{ +mul.f16x2 r516, r513, r474; +} +{ +add.f16x2 r519, r510, r516; +} +{ +sub.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r473; +} +{ +sub.f16x2 r528, r499, r500; +} +{ +mul.f16x2 r531, r528, r475; +} +{ +add.f16x2 r534, r525, r531; +} +{ +sub.f16x2 r537, r519, r534; +} +{ +add.f16x2 r540, r481, r482; +} +{ +mul.f16x2 r543, r540, r472; +} +{ +add.f16x2 r546, r484, r543; +} +{ +add.f16x2 r549, r487, r488; +} +{ +mul.f16x2 r552, r549, r474; +} +{ +add.f16x2 r555, r546, r552; +} +{ +sub.f16x2 r558, r493, r494; +} +{ +mul.f16x2 r561, r558, r473; +} +{ +sub.f16x2 r564, r499, r500; +} +{ +mul.f16x2 r567, r564, r475; +} +{ +add.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r555, r570; +} +{ +add.f16x2 r576, r481, r482; +} +{ +mul.f16x2 r579, r576, r474; +} +{ +add.f16x2 r582, r484, r579; +} +{ +add.f16x2 r585, r487, r488; +} +{ +mul.f16x2 r588, r585, r476; +} +{ +add.f16x2 r591, r582, r588; +} +{ +sub.f16x2 r594, r493, r494; +} +{ +mul.f16x2 r597, r594, r475; +} +{ +sub.f16x2 r600, r499, r500; +} +{ +mul.f16x2 r603, r600, r478; +} +{ +add.f16x2 r606, r597, r603; +} +{ +sub.f16x2 r609, r591, r606; +} +{ +add.f16x2 r612, r481, r482; +} +{ +mul.f16x2 r615, r612, r474; +} +{ +add.f16x2 r618, r484, r615; +} +{ +add.f16x2 r621, r487, r488; +} +{ +mul.f16x2 r624, r621, r476; +} +{ +add.f16x2 r627, r618, r624; +} +{ +sub.f16x2 r630, r493, r494; +} +{ +mul.f16x2 r633, r630, r475; +} +{ +sub.f16x2 r636, r499, r500; +} +{ +mul.f16x2 r639, r636, r478; +} +{ +add.f16x2 r642, r633, r639; +} +{ +add.f16x2 r645, r627, r642; +} +{ +add.f16x2 r648, r493, r494; +} +{ +mul.f16x2 r651, r648, r472; +} +{ +add.f16x2 r654, r496, r651; +} +{ +add.f16x2 r657, r499, r500; +} +{ +mul.f16x2 r660, r657, r474; +} +{ 
+add.f16x2 r663, r654, r660; +} +{ +sub.f16x2 r666, r481, r482; +} +{ +mul.f16x2 r669, r666, r473; +} +{ +sub.f16x2 r672, r487, r488; +} +{ +mul.f16x2 r675, r672, r475; +} +{ +add.f16x2 r678, r669, r675; +} +{ +add.f16x2 r681, r663, r678; +} +{ +add.f16x2 r684, r493, r494; +} +{ +mul.f16x2 r687, r684, r472; +} +{ +add.f16x2 r690, r496, r687; +} +{ +add.f16x2 r693, r499, r500; +} +{ +mul.f16x2 r696, r693, r474; +} +{ +add.f16x2 r699, r690, r696; +} +{ +sub.f16x2 r702, r481, r482; +} +{ +mul.f16x2 r705, r702, r473; +} +{ +sub.f16x2 r708, r487, r488; +} +{ +mul.f16x2 r711, r708, r475; +} +{ +add.f16x2 r714, r705, r711; +} +{ +sub.f16x2 r717, r699, r714; +} +{ +add.f16x2 r720, r493, r494; +} +{ +mul.f16x2 r723, r720, r474; +} +{ +add.f16x2 r726, r496, r723; +} +{ +add.f16x2 r729, r499, r500; +} +{ +mul.f16x2 r732, r729, r476; +} +{ +add.f16x2 r735, r726, r732; +} +{ +sub.f16x2 r738, r481, r482; +} +{ +mul.f16x2 r741, r738, r475; +} +{ +sub.f16x2 r744, r487, r488; +} +{ +mul.f16x2 r747, r744, r478; +} +{ +add.f16x2 r750, r741, r747; +} +{ +add.f16x2 r753, r735, r750; +} +{ +add.f16x2 r756, r493, r494; +} +{ +mul.f16x2 r759, r756, r474; +} +{ +add.f16x2 r762, r496, r759; +} +{ +add.f16x2 r765, r499, r500; +} +{ +mul.f16x2 r768, r765, r476; +} +{ +add.f16x2 r771, r762, r768; +} +{ +sub.f16x2 r774, r481, r482; +} +{ +mul.f16x2 r777, r774, r475; +} +{ +sub.f16x2 r780, r487, r488; +} +{ +mul.f16x2 r783, r780, r478; +} +{ +add.f16x2 r786, r777, r783; +} +{ +sub.f16x2 r789, r771, r786; +} +mul.wide.u32 rd4, r1740, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1745, rd5; +mul.lo.s32 r1746, r1745, 5; +sub.s32 r1747, r1740, r1746; +shl.b32 r1748, r1747, 2; +add.s32 r1749, r1741, r1748; +cvt.rn.f32.u32 f88, r1745; +mul.f32 f89, f88, 0f3D4DE32E; +cos.approx.f32 f37, f89; +sin.approx.f32 f90, f89; +neg.f32 f38, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r792, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; 
+mov.b32 r795, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r797, {high, high}; +} +{ +mul.f16x2 r799, r681, r797; +} +{ +neg.f16x2 r802, r799; +} +{ +fma.rn.f16x2 r804, r537, r795, r802; +} +{ +mul.f16x2 r808, r537, r797; +} +{ +fma.rn.f16x2 r811, r681, r795, r808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r815, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r817, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r819, {low, high}; +} +{ +mul.f16x2 r820, r817, r819; +} +{ +mul.f16x2 r823, r792, r815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r826, {high, low}; +} +{ +fma.rn.f16x2 r828, r820, r826, r823; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r832, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r834, {high, high}; +} +{ +mul.f16x2 r836, r753, r834; +} +{ +neg.f16x2 r839, r836; +} +{ +fma.rn.f16x2 r841, r609, r832, r839; +} +{ +mul.f16x2 r845, r609, r834; +} +{ +fma.rn.f16x2 r848, r753, r832, r845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r852, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r854, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r856, {low, high}; +} +{ +mul.f16x2 r857, r854, r856; +} +{ +mul.f16x2 r860, r828, r852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r828; +mov.b32 r863, {high, low}; +} +{ +fma.rn.f16x2 r865, r857, r863, r860; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r869, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r871, {high, high}; +} +{ +mul.f16x2 r873, r789, r871; +} +{ +neg.f16x2 r876, r873; +} +{ +fma.rn.f16x2 r878, r645, r869, r876; +} +{ +mul.f16x2 r882, r645, r871; +} +{ +fma.rn.f16x2 r885, r789, r869, r882; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r792; +mov.b32 r889, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r792; +mov.b32 r891, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r893, {low, high}; +} +{ +mul.f16x2 r894, r891, r893; +} +{ +mul.f16x2 r897, r865, r889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r865; +mov.b32 r900, {high, low}; +} +{ +fma.rn.f16x2 r902, r894, r900, r897; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r906, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r902; +mov.b32 r908, {high, high}; +} +{ +mul.f16x2 r910, r717, r908; +} +{ +neg.f16x2 r913, r910; +} +{ +fma.rn.f16x2 r915, r573, r906, r913; +} +{ +mul.f16x2 r919, r573, r908; +} +{ +fma.rn.f16x2 r922, r717, r906, r919; +} +barrier.sync 0; +mad.lo.s32 r1750, r1745, 100, r1749; +st.shared.u32 [r1750], r489; +st.shared.u32 [r1750+20], r804; +st.shared.u32 [r1750+40], r841; +st.shared.u32 [r1750+60], r878; +st.shared.u32 [r1750+80], r915; +barrier.sync 0; +ld.shared.u32 r955, [r1744]; +ld.shared.u32 r952, [r1744+500]; +ld.shared.u32 r958, [r1744+1000]; +ld.shared.u32 r959, [r1744+1500]; +ld.shared.u32 r953, [r1744+2000]; +barrier.sync 0; +st.shared.u32 [r1750], r501; +st.shared.u32 [r1750+20], r811; +st.shared.u32 [r1750+40], r848; +st.shared.u32 [r1750+60], r885; +st.shared.u32 [r1750+80], r922; +barrier.sync 0; +ld.shared.u32 r967, [r1744]; +ld.shared.u32 r964, [r1744+500]; +ld.shared.u32 r970, [r1744+1000]; +ld.shared.u32 r971, [r1744+1500]; +ld.shared.u32 r965, [r1744+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r943, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r946, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +add.f16x2 r951, r952, r953; +} +{ +add.f16x2 r954, r955, r951; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r954, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r967, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r966, r969; +} +{ +add.f16x2 r975, r952, r953; +} +{ +mul.f16x2 r978, r975, r943; +} +{ +add.f16x2 r981, r955, r978; +} +{ +add.f16x2 r984, r958, r959; +} +{ +mul.f16x2 r987, r984, r945; +} +{ +add.f16x2 r990, r981, r987; +} +{ +sub.f16x2 r993, r964, r965; +} +{ +mul.f16x2 r996, r993, r944; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r946; +} +{ +add.f16x2 r1005, r996, r1002; +} +{ +sub.f16x2 r1008, r990, r1005; +} +{ +add.f16x2 r1011, r952, r953; +} +{ +mul.f16x2 r1014, r1011, r943; +} +{ +add.f16x2 r1017, r955, r1014; +} +{ +add.f16x2 r1020, r958, r959; +} +{ +mul.f16x2 r1023, r1020, r945; +} +{ +add.f16x2 r1026, r1017, r1023; +} +{ +sub.f16x2 r1029, r964, r965; +} +{ +mul.f16x2 r1032, r1029, r944; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r946; +} +{ +add.f16x2 r1041, r1032, r1038; +} +{ +add.f16x2 r1044, r1026, r1041; +} +{ +add.f16x2 r1047, r952, r953; +} +{ +mul.f16x2 r1050, r1047, r945; +} +{ +add.f16x2 r1053, r955, r1050; +} +{ +add.f16x2 r1056, r958, r959; +} +{ +mul.f16x2 r1059, r1056, r947; +} +{ +add.f16x2 r1062, r1053, r1059; +} +{ +sub.f16x2 r1065, r964, r965; +} +{ +mul.f16x2 r1068, r1065, r946; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r949; +} +{ +add.f16x2 r1077, r1068, r1074; +} +{ +sub.f16x2 r1080, r1062, r1077; +} +{ +add.f16x2 r1083, r952, r953; +} +{ +mul.f16x2 r1086, r1083, r945; +} +{ +add.f16x2 r1089, r955, r1086; +} +{ +add.f16x2 r1092, r958, r959; +} +{ +mul.f16x2 
r1095, r1092, r947; +} +{ +add.f16x2 r1098, r1089, r1095; +} +{ +sub.f16x2 r1101, r964, r965; +} +{ +mul.f16x2 r1104, r1101, r946; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r949; +} +{ +add.f16x2 r1113, r1104, r1110; +} +{ +add.f16x2 r1116, r1098, r1113; +} +{ +add.f16x2 r1119, r964, r965; +} +{ +mul.f16x2 r1122, r1119, r943; +} +{ +add.f16x2 r1125, r967, r1122; +} +{ +add.f16x2 r1128, r970, r971; +} +{ +mul.f16x2 r1131, r1128, r945; +} +{ +add.f16x2 r1134, r1125, r1131; +} +{ +sub.f16x2 r1137, r952, r953; +} +{ +mul.f16x2 r1140, r1137, r944; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r946; +} +{ +add.f16x2 r1149, r1140, r1146; +} +{ +add.f16x2 r1152, r1134, r1149; +} +{ +add.f16x2 r1155, r964, r965; +} +{ +mul.f16x2 r1158, r1155, r943; +} +{ +add.f16x2 r1161, r967, r1158; +} +{ +add.f16x2 r1164, r970, r971; +} +{ +mul.f16x2 r1167, r1164, r945; +} +{ +add.f16x2 r1170, r1161, r1167; +} +{ +sub.f16x2 r1173, r952, r953; +} +{ +mul.f16x2 r1176, r1173, r944; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r946; +} +{ +add.f16x2 r1185, r1176, r1182; +} +{ +sub.f16x2 r1188, r1170, r1185; +} +{ +add.f16x2 r1191, r964, r965; +} +{ +mul.f16x2 r1194, r1191, r945; +} +{ +add.f16x2 r1197, r967, r1194; +} +{ +add.f16x2 r1200, r970, r971; +} +{ +mul.f16x2 r1203, r1200, r947; +} +{ +add.f16x2 r1206, r1197, r1203; +} +{ +sub.f16x2 r1209, r952, r953; +} +{ +mul.f16x2 r1212, r1209, r946; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r949; +} +{ +add.f16x2 r1221, r1212, r1218; +} +{ +add.f16x2 r1224, r1206, r1221; +} +{ +add.f16x2 r1227, r964, r965; +} +{ +mul.f16x2 r1230, r1227, r945; +} +{ +add.f16x2 r1233, r967, r1230; +} +{ +add.f16x2 r1236, r970, r971; +} +{ +mul.f16x2 r1239, r1236, r947; +} +{ +add.f16x2 r1242, r1233, r1239; +} +{ +sub.f16x2 r1245, r952, r953; +} +{ +mul.f16x2 r1248, r1245, r946; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r949; +} +{ +add.f16x2 r1257, 
r1248, r1254; +} +{ +sub.f16x2 r1260, r1242, r1257; +} +mul.wide.u32 rd6, r1740, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1751, rd7; +mul.lo.s32 r1752, r1751, 25; +sub.s32 r1753, r1740, r1752; +shl.b32 r1754, r1753, 2; +add.s32 r1755, r1741, r1754; +cvt.rn.f32.u32 f91, r1751; +mul.f32 f92, f91, 0f3E80ADFD; +cos.approx.f32 f61, f92; +sin.approx.f32 f93, f92; +neg.f32 f62, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1263, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1266, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1268, {high, high}; +} +{ +mul.f16x2 r1270, r1152, r1268; +} +{ +neg.f16x2 r1273, r1270; +} +{ +fma.rn.f16x2 r1275, r1008, r1266, r1273; +} +{ +mul.f16x2 r1279, r1008, r1268; +} +{ +fma.rn.f16x2 r1282, r1152, r1266, r1279; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1286, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1288, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1290, {low, high}; +} +{ +mul.f16x2 r1291, r1288, r1290; +} +{ +mul.f16x2 r1294, r1263, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1297, {high, low}; +} +{ +fma.rn.f16x2 r1299, r1291, r1297, r1294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1303, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1305, {high, high}; +} +{ +mul.f16x2 r1307, r1224, r1305; +} +{ +neg.f16x2 r1310, r1307; +} +{ +fma.rn.f16x2 r1312, r1080, r1303, r1310; +} +{ +mul.f16x2 r1316, r1080, r1305; +} +{ +fma.rn.f16x2 r1319, r1224, r1303, r1316; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1323, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1325, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 
r1327, {low, high}; +} +{ +mul.f16x2 r1328, r1325, r1327; +} +{ +mul.f16x2 r1331, r1299, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1299; +mov.b32 r1334, {high, low}; +} +{ +fma.rn.f16x2 r1336, r1328, r1334, r1331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1340, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1342, {high, high}; +} +{ +mul.f16x2 r1344, r1260, r1342; +} +{ +neg.f16x2 r1347, r1344; +} +{ +fma.rn.f16x2 r1349, r1116, r1340, r1347; +} +{ +mul.f16x2 r1353, r1116, r1342; +} +{ +fma.rn.f16x2 r1356, r1260, r1340, r1353; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1360, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1263; +mov.b32 r1362, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1364, {low, high}; +} +{ +mul.f16x2 r1365, r1362, r1364; +} +{ +mul.f16x2 r1368, r1336, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1336; +mov.b32 r1371, {high, low}; +} +{ +fma.rn.f16x2 r1373, r1365, r1371, r1368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1377, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1373; +mov.b32 r1379, {high, high}; +} +{ +mul.f16x2 r1381, r1188, r1379; +} +{ +neg.f16x2 r1384, r1381; +} +{ +fma.rn.f16x2 r1386, r1044, r1377, r1384; +} +{ +mul.f16x2 r1390, r1044, r1379; +} +{ +fma.rn.f16x2 r1393, r1188, r1377, r1390; +} +barrier.sync 0; +mad.lo.s32 r1756, r1751, 500, r1755; +st.shared.u32 [r1756], r960; +st.shared.u32 [r1756+100], r1275; +st.shared.u32 [r1756+200], r1312; +st.shared.u32 [r1756+300], r1349; +st.shared.u32 [r1756+400], r1386; +barrier.sync 0; +ld.shared.u32 r1426, [r1744]; +ld.shared.u32 r1423, [r1744+500]; +ld.shared.u32 r1429, [r1744+1000]; +ld.shared.u32 r1430, [r1744+1500]; +ld.shared.u32 r1424, [r1744+2000]; +barrier.sync 0; +st.shared.u32 [r1756], r972; +st.shared.u32 [r1756+100], r1282; +st.shared.u32 
[r1756+200], r1319; +st.shared.u32 [r1756+300], r1356; +st.shared.u32 [r1756+400], r1393; +barrier.sync 0; +ld.shared.u32 r1438, [r1744]; +ld.shared.u32 r1435, [r1744+500]; +ld.shared.u32 r1441, [r1744+1000]; +ld.shared.u32 r1442, [r1744+1500]; +ld.shared.u32 r1436, [r1744+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1415, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1419, {low, high}; +} +{ +neg.f16x2 r1420, r1419; +} +{ +add.f16x2 r1422, r1423, r1424; +} +{ +add.f16x2 r1425, r1426, r1422; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +add.f16x2 %0, r1425, r1428; +} +{ +add.f16x2 r1434, r1435, r1436; +} +{ +add.f16x2 r1437, r1438, r1434; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +add.f16x2 %1, r1437, r1440; +} +{ +add.f16x2 r1446, r1423, r1424; +} +{ +mul.f16x2 r1449, r1446, r1414; +} +{ +add.f16x2 r1452, r1426, r1449; +} +{ +add.f16x2 r1455, r1429, r1430; +} +{ +mul.f16x2 r1458, r1455, r1416; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +sub.f16x2 r1464, r1435, r1436; +} +{ +mul.f16x2 r1467, r1464, r1415; +} +{ +sub.f16x2 r1470, r1441, r1442; +} +{ +mul.f16x2 r1473, r1470, r1417; +} +{ +add.f16x2 r1476, r1467, r1473; +} +{ +sub.f16x2 %2, r1461, r1476; +} +{ +add.f16x2 r1482, r1423, r1424; +} +{ +mul.f16x2 r1485, r1482, r1414; +} +{ +add.f16x2 r1488, r1426, r1485; +} +{ +add.f16x2 r1491, r1429, r1430; +} +{ +mul.f16x2 r1494, r1491, r1416; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +sub.f16x2 r1500, r1435, r1436; +} +{ 
+mul.f16x2 r1503, r1500, r1415; +} +{ +sub.f16x2 r1506, r1441, r1442; +} +{ +mul.f16x2 r1509, r1506, r1417; +} +{ +add.f16x2 r1512, r1503, r1509; +} +{ +add.f16x2 %8, r1497, r1512; +} +{ +add.f16x2 r1518, r1423, r1424; +} +{ +mul.f16x2 r1521, r1518, r1416; +} +{ +add.f16x2 r1524, r1426, r1521; +} +{ +add.f16x2 r1527, r1429, r1430; +} +{ +mul.f16x2 r1530, r1527, r1418; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1435, r1436; +} +{ +mul.f16x2 r1539, r1536, r1417; +} +{ +sub.f16x2 r1542, r1441, r1442; +} +{ +mul.f16x2 r1545, r1542, r1420; +} +{ +add.f16x2 r1548, r1539, r1545; +} +{ +sub.f16x2 %4, r1533, r1548; +} +{ +add.f16x2 r1554, r1423, r1424; +} +{ +mul.f16x2 r1557, r1554, r1416; +} +{ +add.f16x2 r1560, r1426, r1557; +} +{ +add.f16x2 r1563, r1429, r1430; +} +{ +mul.f16x2 r1566, r1563, r1418; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +sub.f16x2 r1572, r1435, r1436; +} +{ +mul.f16x2 r1575, r1572, r1417; +} +{ +sub.f16x2 r1578, r1441, r1442; +} +{ +mul.f16x2 r1581, r1578, r1420; +} +{ +add.f16x2 r1584, r1575, r1581; +} +{ +add.f16x2 %6, r1569, r1584; +} +{ +add.f16x2 r1590, r1435, r1436; +} +{ +mul.f16x2 r1593, r1590, r1414; +} +{ +add.f16x2 r1596, r1438, r1593; +} +{ +add.f16x2 r1599, r1441, r1442; +} +{ +mul.f16x2 r1602, r1599, r1416; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1423, r1424; +} +{ +mul.f16x2 r1611, r1608, r1415; +} +{ +sub.f16x2 r1614, r1429, r1430; +} +{ +mul.f16x2 r1617, r1614, r1417; +} +{ +add.f16x2 r1620, r1611, r1617; +} +{ +add.f16x2 %3, r1605, r1620; +} +{ +add.f16x2 r1626, r1435, r1436; +} +{ +mul.f16x2 r1629, r1626, r1414; +} +{ +add.f16x2 r1632, r1438, r1629; +} +{ +add.f16x2 r1635, r1441, r1442; +} +{ +mul.f16x2 r1638, r1635, r1416; +} +{ +add.f16x2 r1641, r1632, r1638; +} +{ +sub.f16x2 r1644, r1423, r1424; +} +{ +mul.f16x2 r1647, r1644, r1415; +} +{ +sub.f16x2 r1650, r1429, r1430; +} +{ +mul.f16x2 r1653, r1650, r1417; +} +{ +add.f16x2 r1656, r1647, r1653; +} +{ +sub.f16x2 %9, r1641, r1656; +} +{ 
+add.f16x2 r1662, r1435, r1436; +} +{ +mul.f16x2 r1665, r1662, r1416; +} +{ +add.f16x2 r1668, r1438, r1665; +} +{ +add.f16x2 r1671, r1441, r1442; +} +{ +mul.f16x2 r1674, r1671, r1418; +} +{ +add.f16x2 r1677, r1668, r1674; +} +{ +sub.f16x2 r1680, r1423, r1424; +} +{ +mul.f16x2 r1683, r1680, r1417; +} +{ +sub.f16x2 r1686, r1429, r1430; +} +{ +mul.f16x2 r1689, r1686, r1420; +} +{ +add.f16x2 r1692, r1683, r1689; +} +{ +add.f16x2 %5, r1677, r1692; +} +{ +add.f16x2 r1698, r1435, r1436; +} +{ +mul.f16x2 r1701, r1698, r1416; +} +{ +add.f16x2 r1704, r1438, r1701; +} +{ +add.f16x2 r1707, r1441, r1442; +} +{ +mul.f16x2 r1710, r1707, r1418; +} +{ +add.f16x2 r1713, r1704, r1710; +} +{ +sub.f16x2 r1716, r1423, r1424; +} +{ +mul.f16x2 r1719, r1716, r1417; +} +{ +sub.f16x2 r1722, r1429, r1430; +} +{ +mul.f16x2 r1725, r1722, r1420; +} +{ +add.f16x2 r1728, r1719, r1725; +} +{ +sub.f16x2 %7, r1713, r1728; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..8952b276f8d83 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp16_inv.hpp.inc @@ -0,0 +1,22702 @@ +#ifndef CUFFTDX_FFT_625_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_625_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1111, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<8002>; +.reg .b64 rd<4>; +mov.u32 r8000, %tid.y; +mov.u32 r8001, %50; +mad.lo.s32 r7942, r8000, 5000, r8001; +mov.u32 r7943, %tid.x; +mov.f32 f482, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1, {low, high}; +} +mov.f32 f484, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f478, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5, {low, high}; +} +mov.f32 f480, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %55; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %91, %79; +} +{ +add.f16x2 r26, %51, r23; +} +{ +add.f16x2 r29, %57, %93; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %67, %55; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %91, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %93; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 
r68, r50, r65; +} +{ +add.f16x2 r71, %67, %55; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %91, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %93; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %55; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %91, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %93; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %67, %55; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %91, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %93; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %91, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %93; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %55; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %91, %79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %93; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %55; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, 
r230, r245; +} +{ +add.f16x2 r251, %91, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %93; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %55; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %91, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %93; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %55; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r332, {low, high}; +} +{ +add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, %53, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %52; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %53, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ 
+add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %52; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %53, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %52; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %53, %87; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %52; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %53, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %52; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %52; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, %73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %53, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %52; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 
r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %53, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %52; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %53, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %52; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %53, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, 
%88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %54, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %54, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ +mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, r647; +} +{ +sub.f16x2 r739, %54, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ +add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %54, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ +sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %54, %86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %54, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ 
+mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %54, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %54, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ +sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %54, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r975, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %92, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %92, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; +} +{ +add.f16x2 r1043, %92, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %92, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; +} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %92, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ 
+sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ +mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ +add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1289, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %94; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, %84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %58, %94; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %94; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} +{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ +sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %94; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ 
+sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %94; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ +mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, %94; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %94; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, %70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %94; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; 
+} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %94; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1611, {low, high}; +} +mov.f32 f332, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1612, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1613, {low, high}; +} +mov.f32 f336, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1614, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1615, {low, high}; +} +mov.f32 f340, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1616, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1617, {low, high}; +} +mov.f32 f344, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1618, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1621, {low, high}; +} +mov.f32 f352, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1622, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1625, {low, high}; +} +mov.f32 f360, 0f3F67A2BF; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1626, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1627, {low, high}; +} +mov.f32 f364, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1628, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1633, {low, high}; +} +mov.f32 f376, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1641, {low, high}; +} +mov.f32 f392, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ +mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ +sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} +{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ 
+mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, 
r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ 
+add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ +mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ +sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; +} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, 
r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ +mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; +} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ 
+mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} +{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, 
r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ +sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2560, {low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2567, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ +add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; +} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; 
+} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ +add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ +mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, 
r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ +add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; +} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; 
+} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ +sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ +mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, 
r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ +sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3204, {low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; +} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; 
+} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ +add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; +} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ +mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, 
r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ +add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r7943, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r7944, rd3; +mul.lo.s32 r7945, r7944, 25; +sub.s32 r7946, r7943, r7945; +cvt.rn.f32.u32 f485, r7946; +mul.f32 f486, f485, 0f3C24B5BE; +cos.approx.f32 f217, f486; +sin.approx.f32 f487, f486; +neg.f32 f218, f487; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ 
+mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; +} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 
r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ +mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, 
r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ +mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ +fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, 
{high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4107, {low, high}; +} +{ +mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, 
r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ +fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 
r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +mad.lo.s32 r7947, r7944, 5000, r7942; +barrier.sync 0; +mad.lo.s32 r7948, r7946, 200, r7947; +st.shared.v2.f32 [r7948], {r1934, r1946}; +st.shared.v2.f32 [r7948+8], {r3535, r3544}; +st.shared.v2.f32 [r7948+16], {r3572, r3581}; +st.shared.v2.f32 [r7948+24], {r3609, r3618}; +st.shared.v2.f32 [r7948+32], {r3646, r3655}; +st.shared.v2.f32 [r7948+40], {r3683, r3692}; +st.shared.v2.f32 [r7948+48], {r3720, r3729}; +st.shared.v2.f32 [r7948+56], {r3757, r3766}; +st.shared.v2.f32 [r7948+64], {r3794, r3803}; +st.shared.v2.f32 [r7948+72], {r3831, r3840}; +st.shared.v2.f32 [r7948+80], {r3868, r3877}; +st.shared.v2.f32 [r7948+88], {r3905, r3914}; +st.shared.v2.f32 [r7948+96], {r3942, r3951}; +st.shared.v2.f32 [r7948+104], {r3979, r3988}; +st.shared.v2.f32 [r7948+112], {r4016, r4025}; +st.shared.v2.f32 [r7948+120], {r4053, r4062}; +st.shared.v2.f32 [r7948+128], {r4090, r4099}; +st.shared.v2.f32 [r7948+136], {r4127, r4136}; +st.shared.v2.f32 [r7948+144], {r4164, r4173}; +st.shared.v2.f32 [r7948+152], {r4201, r4210}; +st.shared.v2.f32 [r7948+160], {r4238, r4247}; +st.shared.v2.f32 [r7948+168], {r4275, r4284}; +st.shared.v2.f32 [r7948+176], {r4312, r4321}; +st.shared.v2.f32 [r7948+184], {r4349, r4358}; +st.shared.v2.f32 [r7948+192], {r4386, r4395}; +barrier.sync 0; +mad.lo.s32 r7949, r7946, -192, r7948; +ld.shared.u32 r4430, [r7949]; +ld.shared.u32 r4442, [r7949+4]; +ld.shared.u32 r4752, [r7949+200]; +ld.shared.u32 r4764, [r7949+204]; +ld.shared.u32 r5074, [r7949+400]; +ld.shared.u32 r5086, [r7949+404]; +ld.shared.u32 r5396, [r7949+600]; +ld.shared.u32 r5408, [r7949+604]; +ld.shared.u32 r5718, [r7949+800]; +ld.shared.u32 r5730, [r7949+804]; +ld.shared.u32 r4427, [r7949+1000]; +ld.shared.u32 r4439, [r7949+1004]; 
+ld.shared.u32 r4749, [r7949+1200]; +ld.shared.u32 r4761, [r7949+1204]; +ld.shared.u32 r5071, [r7949+1400]; +ld.shared.u32 r5083, [r7949+1404]; +ld.shared.u32 r5393, [r7949+1600]; +ld.shared.u32 r5405, [r7949+1604]; +ld.shared.u32 r5715, [r7949+1800]; +ld.shared.u32 r5727, [r7949+1804]; +ld.shared.u32 r4433, [r7949+2000]; +ld.shared.u32 r4445, [r7949+2004]; +ld.shared.u32 r4755, [r7949+2200]; +ld.shared.u32 r4767, [r7949+2204]; +ld.shared.u32 r5077, [r7949+2400]; +ld.shared.u32 r5089, [r7949+2404]; +ld.shared.u32 r5399, [r7949+2600]; +ld.shared.u32 r5411, [r7949+2604]; +ld.shared.u32 r5721, [r7949+2800]; +ld.shared.u32 r5733, [r7949+2804]; +ld.shared.u32 r4434, [r7949+3000]; +ld.shared.u32 r4446, [r7949+3004]; +ld.shared.u32 r4756, [r7949+3200]; +ld.shared.u32 r4768, [r7949+3204]; +ld.shared.u32 r5078, [r7949+3400]; +ld.shared.u32 r5090, [r7949+3404]; +ld.shared.u32 r5400, [r7949+3600]; +ld.shared.u32 r5412, [r7949+3604]; +ld.shared.u32 r5722, [r7949+3800]; +ld.shared.u32 r5734, [r7949+3804]; +ld.shared.u32 r4428, [r7949+4000]; +ld.shared.u32 r4440, [r7949+4004]; +ld.shared.u32 r4750, [r7949+4200]; +ld.shared.u32 r4762, [r7949+4204]; +ld.shared.u32 r5072, [r7949+4400]; +ld.shared.u32 r5084, [r7949+4404]; +ld.shared.u32 r5394, [r7949+4600]; +ld.shared.u32 r5406, [r7949+4604]; +ld.shared.u32 r5716, [r7949+4800]; +ld.shared.u32 r5728, [r7949+4804]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4424, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 r4435, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 r4447, r4441, r4444; +} +{ +add.f16x2 r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, r4456, r4462; +} +{ +sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 r4483, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 r4519, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 r4555, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; 
+} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 r4591, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ +add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ +add.f16x2 r4624, r4615, r4621; +} +{ +add.f16x2 r4627, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} +{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 r4663, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 r4699, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, 
r4723, r4729; +} +{ +sub.f16x2 r4735, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 r4757, r4751, r4754; +} +{ +add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 r4769, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 r4805, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 r4841, r4823, 
r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 r4877, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; +} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 r4913, r4895, r4910; +} +{ +add.f16x2 r4916, r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ +add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 r4949, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 r4985, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 
r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 r5021, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ +mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ +add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 r5057, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 r5079, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 r5091, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, 
r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 r5127, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 r5163, r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} +{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 r5199, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 r5235, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ 
+add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 r5271, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 r5304, r5295, r5301; +} +{ +sub.f16x2 r5307, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} +{ +add.f16x2 r5343, r5325, r5340; +} +{ +add.f16x2 r5346, r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 r5379, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 r5401, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 r5413, r5407, r5410; +} +{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; +} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 r5449, r5431, r5446; +} +{ +add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 r5485, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 r5521, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; 
+} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 r5557, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ +sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, r5587; +} +{ +add.f16x2 r5593, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 r5629, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 r5665, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, 
r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 r5701, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5709, {low, high}; +} +{ +neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 r5723, r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 r5735, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 r5771, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, 
r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 r5807, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ +sub.f16x2 r5843, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} +{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ +add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 r5876, r5867, r5873; +} +{ +add.f16x2 r5879, r5861, r5876; +} +{ +add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 r5915, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 r5951, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 
r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 r5987, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; +} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; +} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 r6023, r6005, r6020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r6027, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r6028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r6029, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6030, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6031, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6032, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6033, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; 
+cvt.rn.f16.f32 high, f358; +mov.b32 r6040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6042, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6048, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6057, {low, high}; +} +{ +mul.f16x2 r6074, r4805, r6026; +} +{ +mul.f16x2 r6077, r4949, r6027; +} +{ +sub.f16x2 r6080, r6074, r6077; +} +{ +mul.f16x2 r6083, r4805, r6027; +} +{ +fma.rn.f16x2 r6086, r4949, r6026, r6083; +} +{ +mul.f16x2 r6090, r5127, r6028; +} +{ +mul.f16x2 r6093, r5271, r6029; +} +{ +sub.f16x2 r6096, r6090, r6093; +} +{ +mul.f16x2 r6099, r5127, r6029; +} +{ +fma.rn.f16x2 r6102, r5271, r6028, r6099; +} +{ +mul.f16x2 r6106, r5449, r6030; +} +{ +mul.f16x2 r6109, r5593, r6031; +} +{ +sub.f16x2 r6112, r6106, r6109; +} +{ +mul.f16x2 r6115, r5449, r6031; +} +{ +fma.rn.f16x2 r6118, r5593, r6030, r6115; +} +{ +mul.f16x2 r6122, r5771, r6032; +} +{ +mul.f16x2 r6125, r5915, r6033; +} +{ +sub.f16x2 r6128, r6122, r6125; +} +{ +mul.f16x2 r6131, r5771, r6033; +} +{ +fma.rn.f16x2 r6134, r5915, r6032, r6131; +} +{ +mul.f16x2 r6138, r4877, r6028; +} +{ +mul.f16x2 r6141, r5021, r6029; +} +{ +sub.f16x2 r6144, r6138, r6141; +} +{ +mul.f16x2 r6147, r4877, r6029; +} +{ +fma.rn.f16x2 r6150, r5021, r6028, r6147; +} +{ +mul.f16x2 r6154, r5199, r6032; +} +{ +mul.f16x2 r6157, r5343, r6033; +} +{ +sub.f16x2 r6160, r6154, r6157; +} +{ +mul.f16x2 r6163, r5199, r6033; +} +{ 
+fma.rn.f16x2 r6166, r5343, r6032, r6163; +} +{ +mul.f16x2 r6170, r5521, r6036; +} +{ +mul.f16x2 r6173, r5665, r6037; +} +{ +sub.f16x2 r6176, r6170, r6173; +} +{ +mul.f16x2 r6179, r5521, r6037; +} +{ +fma.rn.f16x2 r6182, r5665, r6036, r6179; +} +{ +mul.f16x2 r6186, r5843, r6040; +} +{ +mul.f16x2 r6189, r5987, r6041; +} +{ +sub.f16x2 r6192, r6186, r6189; +} +{ +mul.f16x2 r6195, r5843, r6041; +} +{ +fma.rn.f16x2 r6198, r5987, r6040, r6195; +} +{ +mul.f16x2 r6202, r4913, r6030; +} +{ +mul.f16x2 r6205, r5057, r6031; +} +{ +sub.f16x2 r6208, r6202, r6205; +} +{ +mul.f16x2 r6211, r4913, r6031; +} +{ +fma.rn.f16x2 r6214, r5057, r6030, r6211; +} +{ +mul.f16x2 r6218, r5235, r6036; +} +{ +mul.f16x2 r6221, r5379, r6037; +} +{ +sub.f16x2 r6224, r6218, r6221; +} +{ +mul.f16x2 r6227, r5235, r6037; +} +{ +fma.rn.f16x2 r6230, r5379, r6036, r6227; +} +{ +mul.f16x2 r6234, r5557, r6042; +} +{ +mul.f16x2 r6237, r5701, r6043; +} +{ +sub.f16x2 r6240, r6234, r6237; +} +{ +mul.f16x2 r6243, r5557, r6043; +} +{ +fma.rn.f16x2 r6246, r5701, r6042, r6243; +} +{ +mul.f16x2 r6250, r5879, r6048; +} +{ +mul.f16x2 r6253, r6023, r6049; +} +{ +sub.f16x2 r6256, r6250, r6253; +} +{ +mul.f16x2 r6259, r5879, r6049; +} +{ +fma.rn.f16x2 r6262, r6023, r6048, r6259; +} +{ +mul.f16x2 r6266, r4841, r6032; +} +{ +mul.f16x2 r6269, r4985, r6033; +} +{ +sub.f16x2 r6272, r6266, r6269; +} +{ +mul.f16x2 r6275, r4841, r6033; +} +{ +fma.rn.f16x2 r6278, r4985, r6032, r6275; +} +{ +mul.f16x2 r6282, r5163, r6040; +} +{ +mul.f16x2 r6285, r5307, r6041; +} +{ +sub.f16x2 r6288, r6282, r6285; +} +{ +mul.f16x2 r6291, r5163, r6041; +} +{ +fma.rn.f16x2 r6294, r5307, r6040, r6291; +} +{ +mul.f16x2 r6298, r5485, r6048; +} +{ +mul.f16x2 r6301, r5629, r6049; +} +{ +sub.f16x2 r6304, r6298, r6301; +} +{ +mul.f16x2 r6307, r5485, r6049; +} +{ +fma.rn.f16x2 r6310, r5629, r6048, r6307; +} +{ +mul.f16x2 r6314, r5807, r6056; +} +{ +mul.f16x2 r6317, r5951, r6057; +} +{ +sub.f16x2 r6320, r6314, r6317; +} +{ +mul.f16x2 r6323, r5807, r6057; +} +{ 
+fma.rn.f16x2 r6326, r5951, r6056, r6323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6331, {low, high}; +} +{ +neg.f16x2 r6332, r6331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6335, {low, high}; +} +{ +neg.f16x2 r6336, r6335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6339, {low, high}; +} +{ +add.f16x2 r6340, r4757, r5723; +} +{ +add.f16x2 r6343, r4435, r6340; +} +{ +add.f16x2 r6346, r5079, r5401; +} +{ +add.f16x2 %0, r6343, r6346; +} +{ +add.f16x2 r6352, r4769, r5735; +} +{ +add.f16x2 r6355, r4447, r6352; +} +{ +add.f16x2 r6358, r5091, r5413; +} +{ +add.f16x2 %1, r6355, r6358; +} +{ +add.f16x2 r6364, r4757, r5723; +} +{ +mul.f16x2 r6367, r6364, r6330; +} +{ +add.f16x2 r6370, r4435, r6367; +} +{ +add.f16x2 r6373, r5079, r5401; +} +{ +mul.f16x2 r6376, r6373, r6334; +} +{ +add.f16x2 r6379, r6370, r6376; +} +{ +sub.f16x2 r6382, r4769, r5735; +} +{ +mul.f16x2 r6385, r6382, r6332; +} +{ +sub.f16x2 r6388, r5091, r5413; +} +{ +mul.f16x2 r6391, r6388, r6336; +} +{ +add.f16x2 r6394, r6385, r6391; +} +{ +sub.f16x2 %10, r6379, r6394; +} +{ +add.f16x2 r6400, r4757, r5723; +} +{ +mul.f16x2 r6403, r6400, r6330; +} +{ +add.f16x2 r6406, r4435, r6403; +} +{ +add.f16x2 r6409, r5079, r5401; +} +{ +mul.f16x2 r6412, r6409, r6334; +} +{ +add.f16x2 r6415, r6406, r6412; +} +{ +sub.f16x2 r6418, r4769, r5735; +} +{ +mul.f16x2 r6421, r6418, r6332; +} +{ +sub.f16x2 r6424, r5091, r5413; +} +{ +mul.f16x2 r6427, r6424, r6336; +} +{ +add.f16x2 r6430, r6421, r6427; +} +{ +add.f16x2 %40, r6415, r6430; +} +{ +add.f16x2 
r6436, r4757, r5723; +} +{ +mul.f16x2 r6439, r6436, r6334; +} +{ +add.f16x2 r6442, r4435, r6439; +} +{ +add.f16x2 r6445, r5079, r5401; +} +{ +mul.f16x2 r6448, r6445, r6338; +} +{ +add.f16x2 r6451, r6442, r6448; +} +{ +sub.f16x2 r6454, r4769, r5735; +} +{ +mul.f16x2 r6457, r6454, r6336; +} +{ +sub.f16x2 r6460, r5091, r5413; +} +{ +mul.f16x2 r6463, r6460, r6339; +} +{ +add.f16x2 r6466, r6457, r6463; +} +{ +sub.f16x2 %20, r6451, r6466; +} +{ +add.f16x2 r6472, r4757, r5723; +} +{ +mul.f16x2 r6475, r6472, r6334; +} +{ +add.f16x2 r6478, r4435, r6475; +} +{ +add.f16x2 r6481, r5079, r5401; +} +{ +mul.f16x2 r6484, r6481, r6338; +} +{ +add.f16x2 r6487, r6478, r6484; +} +{ +sub.f16x2 r6490, r4769, r5735; +} +{ +mul.f16x2 r6493, r6490, r6336; +} +{ +sub.f16x2 r6496, r5091, r5413; +} +{ +mul.f16x2 r6499, r6496, r6339; +} +{ +add.f16x2 r6502, r6493, r6499; +} +{ +add.f16x2 %30, r6487, r6502; +} +{ +add.f16x2 r6508, r4769, r5735; +} +{ +mul.f16x2 r6511, r6508, r6330; +} +{ +add.f16x2 r6514, r4447, r6511; +} +{ +add.f16x2 r6517, r5091, r5413; +} +{ +mul.f16x2 r6520, r6517, r6334; +} +{ +add.f16x2 r6523, r6514, r6520; +} +{ +sub.f16x2 r6526, r4757, r5723; +} +{ +mul.f16x2 r6529, r6526, r6332; +} +{ +sub.f16x2 r6532, r5079, r5401; +} +{ +mul.f16x2 r6535, r6532, r6336; +} +{ +add.f16x2 r6538, r6529, r6535; +} +{ +add.f16x2 %11, r6523, r6538; +} +{ +add.f16x2 r6544, r4769, r5735; +} +{ +mul.f16x2 r6547, r6544, r6330; +} +{ +add.f16x2 r6550, r4447, r6547; +} +{ +add.f16x2 r6553, r5091, r5413; +} +{ +mul.f16x2 r6556, r6553, r6334; +} +{ +add.f16x2 r6559, r6550, r6556; +} +{ +sub.f16x2 r6562, r4757, r5723; +} +{ +mul.f16x2 r6565, r6562, r6332; +} +{ +sub.f16x2 r6568, r5079, r5401; +} +{ +mul.f16x2 r6571, r6568, r6336; +} +{ +add.f16x2 r6574, r6565, r6571; +} +{ +sub.f16x2 %41, r6559, r6574; +} +{ +add.f16x2 r6580, r4769, r5735; +} +{ +mul.f16x2 r6583, r6580, r6334; +} +{ +add.f16x2 r6586, r4447, r6583; +} +{ +add.f16x2 r6589, r5091, r5413; +} +{ +mul.f16x2 r6592, r6589, r6338; +} +{ 
+add.f16x2 r6595, r6586, r6592; +} +{ +sub.f16x2 r6598, r4757, r5723; +} +{ +mul.f16x2 r6601, r6598, r6336; +} +{ +sub.f16x2 r6604, r5079, r5401; +} +{ +mul.f16x2 r6607, r6604, r6339; +} +{ +add.f16x2 r6610, r6601, r6607; +} +{ +add.f16x2 %21, r6595, r6610; +} +{ +add.f16x2 r6616, r4769, r5735; +} +{ +mul.f16x2 r6619, r6616, r6334; +} +{ +add.f16x2 r6622, r4447, r6619; +} +{ +add.f16x2 r6625, r5091, r5413; +} +{ +mul.f16x2 r6628, r6625, r6338; +} +{ +add.f16x2 r6631, r6622, r6628; +} +{ +sub.f16x2 r6634, r4757, r5723; +} +{ +mul.f16x2 r6637, r6634, r6336; +} +{ +sub.f16x2 r6640, r5079, r5401; +} +{ +mul.f16x2 r6643, r6640, r6339; +} +{ +add.f16x2 r6646, r6637, r6643; +} +{ +sub.f16x2 %31, r6631, r6646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6653, {low, high}; +} +{ +neg.f16x2 r6654, r6653; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6657, {low, high}; +} +{ +neg.f16x2 r6658, r6657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6660, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6661, {low, high}; +} +{ +add.f16x2 r6662, r6080, r6128; +} +{ +add.f16x2 r6665, r4483, r6662; +} +{ +add.f16x2 r6668, r6096, r6112; +} +{ +add.f16x2 %2, r6665, r6668; +} +{ +add.f16x2 r6674, r6086, r6134; +} +{ +add.f16x2 r6677, r4627, r6674; +} +{ +add.f16x2 r6680, r6102, r6118; +} +{ +add.f16x2 %3, r6677, r6680; +} +{ +add.f16x2 r6686, r6080, r6128; +} +{ +mul.f16x2 r6689, r6686, r6652; +} +{ +add.f16x2 r6692, r4483, r6689; +} +{ +add.f16x2 r6695, r6096, r6112; +} +{ +mul.f16x2 r6698, r6695, r6656; +} +{ +add.f16x2 r6701, r6692, r6698; +} +{ +sub.f16x2 r6704, 
r6086, r6134; +} +{ +mul.f16x2 r6707, r6704, r6654; +} +{ +sub.f16x2 r6710, r6102, r6118; +} +{ +mul.f16x2 r6713, r6710, r6658; +} +{ +add.f16x2 r6716, r6707, r6713; +} +{ +sub.f16x2 %12, r6701, r6716; +} +{ +add.f16x2 r6722, r6080, r6128; +} +{ +mul.f16x2 r6725, r6722, r6652; +} +{ +add.f16x2 r6728, r4483, r6725; +} +{ +add.f16x2 r6731, r6096, r6112; +} +{ +mul.f16x2 r6734, r6731, r6656; +} +{ +add.f16x2 r6737, r6728, r6734; +} +{ +sub.f16x2 r6740, r6086, r6134; +} +{ +mul.f16x2 r6743, r6740, r6654; +} +{ +sub.f16x2 r6746, r6102, r6118; +} +{ +mul.f16x2 r6749, r6746, r6658; +} +{ +add.f16x2 r6752, r6743, r6749; +} +{ +add.f16x2 %42, r6737, r6752; +} +{ +add.f16x2 r6758, r6080, r6128; +} +{ +mul.f16x2 r6761, r6758, r6656; +} +{ +add.f16x2 r6764, r4483, r6761; +} +{ +add.f16x2 r6767, r6096, r6112; +} +{ +mul.f16x2 r6770, r6767, r6660; +} +{ +add.f16x2 r6773, r6764, r6770; +} +{ +sub.f16x2 r6776, r6086, r6134; +} +{ +mul.f16x2 r6779, r6776, r6658; +} +{ +sub.f16x2 r6782, r6102, r6118; +} +{ +mul.f16x2 r6785, r6782, r6661; +} +{ +add.f16x2 r6788, r6779, r6785; +} +{ +sub.f16x2 %22, r6773, r6788; +} +{ +add.f16x2 r6794, r6080, r6128; +} +{ +mul.f16x2 r6797, r6794, r6656; +} +{ +add.f16x2 r6800, r4483, r6797; +} +{ +add.f16x2 r6803, r6096, r6112; +} +{ +mul.f16x2 r6806, r6803, r6660; +} +{ +add.f16x2 r6809, r6800, r6806; +} +{ +sub.f16x2 r6812, r6086, r6134; +} +{ +mul.f16x2 r6815, r6812, r6658; +} +{ +sub.f16x2 r6818, r6102, r6118; +} +{ +mul.f16x2 r6821, r6818, r6661; +} +{ +add.f16x2 r6824, r6815, r6821; +} +{ +add.f16x2 %32, r6809, r6824; +} +{ +add.f16x2 r6830, r6086, r6134; +} +{ +mul.f16x2 r6833, r6830, r6652; +} +{ +add.f16x2 r6836, r4627, r6833; +} +{ +add.f16x2 r6839, r6102, r6118; +} +{ +mul.f16x2 r6842, r6839, r6656; +} +{ +add.f16x2 r6845, r6836, r6842; +} +{ +sub.f16x2 r6848, r6080, r6128; +} +{ +mul.f16x2 r6851, r6848, r6654; +} +{ +sub.f16x2 r6854, r6096, r6112; +} +{ +mul.f16x2 r6857, r6854, r6658; +} +{ +add.f16x2 r6860, r6851, r6857; +} +{ +add.f16x2 
%13, r6845, r6860; +} +{ +add.f16x2 r6866, r6086, r6134; +} +{ +mul.f16x2 r6869, r6866, r6652; +} +{ +add.f16x2 r6872, r4627, r6869; +} +{ +add.f16x2 r6875, r6102, r6118; +} +{ +mul.f16x2 r6878, r6875, r6656; +} +{ +add.f16x2 r6881, r6872, r6878; +} +{ +sub.f16x2 r6884, r6080, r6128; +} +{ +mul.f16x2 r6887, r6884, r6654; +} +{ +sub.f16x2 r6890, r6096, r6112; +} +{ +mul.f16x2 r6893, r6890, r6658; +} +{ +add.f16x2 r6896, r6887, r6893; +} +{ +sub.f16x2 %43, r6881, r6896; +} +{ +add.f16x2 r6902, r6086, r6134; +} +{ +mul.f16x2 r6905, r6902, r6656; +} +{ +add.f16x2 r6908, r4627, r6905; +} +{ +add.f16x2 r6911, r6102, r6118; +} +{ +mul.f16x2 r6914, r6911, r6660; +} +{ +add.f16x2 r6917, r6908, r6914; +} +{ +sub.f16x2 r6920, r6080, r6128; +} +{ +mul.f16x2 r6923, r6920, r6658; +} +{ +sub.f16x2 r6926, r6096, r6112; +} +{ +mul.f16x2 r6929, r6926, r6661; +} +{ +add.f16x2 r6932, r6923, r6929; +} +{ +add.f16x2 %23, r6917, r6932; +} +{ +add.f16x2 r6938, r6086, r6134; +} +{ +mul.f16x2 r6941, r6938, r6656; +} +{ +add.f16x2 r6944, r4627, r6941; +} +{ +add.f16x2 r6947, r6102, r6118; +} +{ +mul.f16x2 r6950, r6947, r6660; +} +{ +add.f16x2 r6953, r6944, r6950; +} +{ +sub.f16x2 r6956, r6080, r6128; +} +{ +mul.f16x2 r6959, r6956, r6658; +} +{ +sub.f16x2 r6962, r6096, r6112; +} +{ +mul.f16x2 r6965, r6962, r6661; +} +{ +add.f16x2 r6968, r6959, r6965; +} +{ +sub.f16x2 %33, r6953, r6968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6975, {low, high}; +} +{ +neg.f16x2 r6976, r6975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6979, {low, high}; +} +{ +neg.f16x2 r6980, r6979; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6982, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6983, {low, high}; +} +{ +add.f16x2 r6984, r6144, r6192; +} +{ +add.f16x2 r6987, r4555, r6984; +} +{ +add.f16x2 r6990, r6160, r6176; +} +{ +add.f16x2 %4, r6987, r6990; +} +{ +add.f16x2 r6996, r6150, r6198; +} +{ +add.f16x2 r6999, r4699, r6996; +} +{ +add.f16x2 r7002, r6166, r6182; +} +{ +add.f16x2 %5, r6999, r7002; +} +{ +add.f16x2 r7008, r6144, r6192; +} +{ +mul.f16x2 r7011, r7008, r6974; +} +{ +add.f16x2 r7014, r4555, r7011; +} +{ +add.f16x2 r7017, r6160, r6176; +} +{ +mul.f16x2 r7020, r7017, r6978; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6150, r6198; +} +{ +mul.f16x2 r7029, r7026, r6976; +} +{ +sub.f16x2 r7032, r6166, r6182; +} +{ +mul.f16x2 r7035, r7032, r6980; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +sub.f16x2 %14, r7023, r7038; +} +{ +add.f16x2 r7044, r6144, r6192; +} +{ +mul.f16x2 r7047, r7044, r6974; +} +{ +add.f16x2 r7050, r4555, r7047; +} +{ +add.f16x2 r7053, r6160, r6176; +} +{ +mul.f16x2 r7056, r7053, r6978; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6150, r6198; +} +{ +mul.f16x2 r7065, r7062, r6976; +} +{ +sub.f16x2 r7068, r6166, r6182; +} +{ +mul.f16x2 r7071, r7068, r6980; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +add.f16x2 %44, r7059, r7074; +} +{ +add.f16x2 r7080, r6144, r6192; +} +{ +mul.f16x2 r7083, r7080, r6978; +} +{ +add.f16x2 r7086, r4555, r7083; +} +{ +add.f16x2 r7089, r6160, r6176; +} +{ +mul.f16x2 r7092, r7089, r6982; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6150, r6198; +} +{ +mul.f16x2 r7101, r7098, r6980; +} +{ +sub.f16x2 r7104, r6166, r6182; +} +{ +mul.f16x2 r7107, r7104, r6983; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +sub.f16x2 %24, r7095, r7110; +} +{ +add.f16x2 r7116, r6144, r6192; +} +{ +mul.f16x2 r7119, r7116, r6978; +} +{ +add.f16x2 r7122, r4555, r7119; +} +{ +add.f16x2 r7125, r6160, r6176; +} +{ +mul.f16x2 r7128, r7125, r6982; +} +{ +add.f16x2 r7131, r7122, 
r7128; +} +{ +sub.f16x2 r7134, r6150, r6198; +} +{ +mul.f16x2 r7137, r7134, r6980; +} +{ +sub.f16x2 r7140, r6166, r6182; +} +{ +mul.f16x2 r7143, r7140, r6983; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 %34, r7131, r7146; +} +{ +add.f16x2 r7152, r6150, r6198; +} +{ +mul.f16x2 r7155, r7152, r6974; +} +{ +add.f16x2 r7158, r4699, r7155; +} +{ +add.f16x2 r7161, r6166, r6182; +} +{ +mul.f16x2 r7164, r7161, r6978; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6144, r6192; +} +{ +mul.f16x2 r7173, r7170, r6976; +} +{ +sub.f16x2 r7176, r6160, r6176; +} +{ +mul.f16x2 r7179, r7176, r6980; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +add.f16x2 %15, r7167, r7182; +} +{ +add.f16x2 r7188, r6150, r6198; +} +{ +mul.f16x2 r7191, r7188, r6974; +} +{ +add.f16x2 r7194, r4699, r7191; +} +{ +add.f16x2 r7197, r6166, r6182; +} +{ +mul.f16x2 r7200, r7197, r6978; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6144, r6192; +} +{ +mul.f16x2 r7209, r7206, r6976; +} +{ +sub.f16x2 r7212, r6160, r6176; +} +{ +mul.f16x2 r7215, r7212, r6980; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +sub.f16x2 %45, r7203, r7218; +} +{ +add.f16x2 r7224, r6150, r6198; +} +{ +mul.f16x2 r7227, r7224, r6978; +} +{ +add.f16x2 r7230, r4699, r7227; +} +{ +add.f16x2 r7233, r6166, r6182; +} +{ +mul.f16x2 r7236, r7233, r6982; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6144, r6192; +} +{ +mul.f16x2 r7245, r7242, r6980; +} +{ +sub.f16x2 r7248, r6160, r6176; +} +{ +mul.f16x2 r7251, r7248, r6983; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +add.f16x2 %25, r7239, r7254; +} +{ +add.f16x2 r7260, r6150, r6198; +} +{ +mul.f16x2 r7263, r7260, r6978; +} +{ +add.f16x2 r7266, r4699, r7263; +} +{ +add.f16x2 r7269, r6166, r6182; +} +{ +mul.f16x2 r7272, r7269, r6982; +} +{ +add.f16x2 r7275, r7266, r7272; +} +{ +sub.f16x2 r7278, r6144, r6192; +} +{ +mul.f16x2 r7281, r7278, r6980; +} +{ +sub.f16x2 r7284, r6160, r6176; +} +{ +mul.f16x2 r7287, r7284, r6983; +} +{ +add.f16x2 r7290, 
r7281, r7287; +} +{ +sub.f16x2 %35, r7275, r7290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7296, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7297, {low, high}; +} +{ +neg.f16x2 r7298, r7297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7301, {low, high}; +} +{ +neg.f16x2 r7302, r7301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7305, {low, high}; +} +{ +add.f16x2 r7306, r6208, r6256; +} +{ +add.f16x2 r7309, r4591, r7306; +} +{ +add.f16x2 r7312, r6224, r6240; +} +{ +add.f16x2 %6, r7309, r7312; +} +{ +add.f16x2 r7318, r6214, r6262; +} +{ +add.f16x2 r7321, r4735, r7318; +} +{ +add.f16x2 r7324, r6230, r6246; +} +{ +add.f16x2 %7, r7321, r7324; +} +{ +add.f16x2 r7330, r6208, r6256; +} +{ +mul.f16x2 r7333, r7330, r7296; +} +{ +add.f16x2 r7336, r4591, r7333; +} +{ +add.f16x2 r7339, r6224, r6240; +} +{ +mul.f16x2 r7342, r7339, r7300; +} +{ +add.f16x2 r7345, r7336, r7342; +} +{ +sub.f16x2 r7348, r6214, r6262; +} +{ +mul.f16x2 r7351, r7348, r7298; +} +{ +sub.f16x2 r7354, r6230, r6246; +} +{ +mul.f16x2 r7357, r7354, r7302; +} +{ +add.f16x2 r7360, r7351, r7357; +} +{ +sub.f16x2 %16, r7345, r7360; +} +{ +add.f16x2 r7366, r6208, r6256; +} +{ +mul.f16x2 r7369, r7366, r7296; +} +{ +add.f16x2 r7372, r4591, r7369; +} +{ +add.f16x2 r7375, r6224, r6240; +} +{ +mul.f16x2 r7378, r7375, r7300; +} +{ +add.f16x2 r7381, r7372, r7378; +} +{ +sub.f16x2 r7384, r6214, r6262; +} +{ +mul.f16x2 r7387, r7384, r7298; +} +{ +sub.f16x2 r7390, r6230, r6246; +} +{ +mul.f16x2 r7393, r7390, r7302; +} +{ +add.f16x2 r7396, r7387, r7393; +} +{ +add.f16x2 %46, r7381, r7396; +} +{ 
+add.f16x2 r7402, r6208, r6256; +} +{ +mul.f16x2 r7405, r7402, r7300; +} +{ +add.f16x2 r7408, r4591, r7405; +} +{ +add.f16x2 r7411, r6224, r6240; +} +{ +mul.f16x2 r7414, r7411, r7304; +} +{ +add.f16x2 r7417, r7408, r7414; +} +{ +sub.f16x2 r7420, r6214, r6262; +} +{ +mul.f16x2 r7423, r7420, r7302; +} +{ +sub.f16x2 r7426, r6230, r6246; +} +{ +mul.f16x2 r7429, r7426, r7305; +} +{ +add.f16x2 r7432, r7423, r7429; +} +{ +sub.f16x2 %26, r7417, r7432; +} +{ +add.f16x2 r7438, r6208, r6256; +} +{ +mul.f16x2 r7441, r7438, r7300; +} +{ +add.f16x2 r7444, r4591, r7441; +} +{ +add.f16x2 r7447, r6224, r6240; +} +{ +mul.f16x2 r7450, r7447, r7304; +} +{ +add.f16x2 r7453, r7444, r7450; +} +{ +sub.f16x2 r7456, r6214, r6262; +} +{ +mul.f16x2 r7459, r7456, r7302; +} +{ +sub.f16x2 r7462, r6230, r6246; +} +{ +mul.f16x2 r7465, r7462, r7305; +} +{ +add.f16x2 r7468, r7459, r7465; +} +{ +add.f16x2 %36, r7453, r7468; +} +{ +add.f16x2 r7474, r6214, r6262; +} +{ +mul.f16x2 r7477, r7474, r7296; +} +{ +add.f16x2 r7480, r4735, r7477; +} +{ +add.f16x2 r7483, r6230, r6246; +} +{ +mul.f16x2 r7486, r7483, r7300; +} +{ +add.f16x2 r7489, r7480, r7486; +} +{ +sub.f16x2 r7492, r6208, r6256; +} +{ +mul.f16x2 r7495, r7492, r7298; +} +{ +sub.f16x2 r7498, r6224, r6240; +} +{ +mul.f16x2 r7501, r7498, r7302; +} +{ +add.f16x2 r7504, r7495, r7501; +} +{ +add.f16x2 %17, r7489, r7504; +} +{ +add.f16x2 r7510, r6214, r6262; +} +{ +mul.f16x2 r7513, r7510, r7296; +} +{ +add.f16x2 r7516, r4735, r7513; +} +{ +add.f16x2 r7519, r6230, r6246; +} +{ +mul.f16x2 r7522, r7519, r7300; +} +{ +add.f16x2 r7525, r7516, r7522; +} +{ +sub.f16x2 r7528, r6208, r6256; +} +{ +mul.f16x2 r7531, r7528, r7298; +} +{ +sub.f16x2 r7534, r6224, r6240; +} +{ +mul.f16x2 r7537, r7534, r7302; +} +{ +add.f16x2 r7540, r7531, r7537; +} +{ +sub.f16x2 %47, r7525, r7540; +} +{ +add.f16x2 r7546, r6214, r6262; +} +{ +mul.f16x2 r7549, r7546, r7300; +} +{ +add.f16x2 r7552, r4735, r7549; +} +{ +add.f16x2 r7555, r6230, r6246; +} +{ +mul.f16x2 r7558, r7555, r7304; 
+} +{ +add.f16x2 r7561, r7552, r7558; +} +{ +sub.f16x2 r7564, r6208, r6256; +} +{ +mul.f16x2 r7567, r7564, r7302; +} +{ +sub.f16x2 r7570, r6224, r6240; +} +{ +mul.f16x2 r7573, r7570, r7305; +} +{ +add.f16x2 r7576, r7567, r7573; +} +{ +add.f16x2 %27, r7561, r7576; +} +{ +add.f16x2 r7582, r6214, r6262; +} +{ +mul.f16x2 r7585, r7582, r7300; +} +{ +add.f16x2 r7588, r4735, r7585; +} +{ +add.f16x2 r7591, r6230, r6246; +} +{ +mul.f16x2 r7594, r7591, r7304; +} +{ +add.f16x2 r7597, r7588, r7594; +} +{ +sub.f16x2 r7600, r6208, r6256; +} +{ +mul.f16x2 r7603, r7600, r7302; +} +{ +sub.f16x2 r7606, r6224, r6240; +} +{ +mul.f16x2 r7609, r7606, r7305; +} +{ +add.f16x2 r7612, r7603, r7609; +} +{ +sub.f16x2 %37, r7597, r7612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7618, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7619, {low, high}; +} +{ +neg.f16x2 r7620, r7619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7623, {low, high}; +} +{ +neg.f16x2 r7624, r7623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7627, {low, high}; +} +{ +add.f16x2 r7628, r6272, r6320; +} +{ +add.f16x2 r7631, r4519, r7628; +} +{ +add.f16x2 r7634, r6288, r6304; +} +{ +add.f16x2 %8, r7631, r7634; +} +{ +add.f16x2 r7640, r6278, r6326; +} +{ +add.f16x2 r7643, r4663, r7640; +} +{ +add.f16x2 r7646, r6294, r6310; +} +{ +add.f16x2 %9, r7643, r7646; +} +{ +add.f16x2 r7652, r6272, r6320; +} +{ +mul.f16x2 r7655, r7652, r7618; +} +{ +add.f16x2 r7658, r4519, r7655; +} +{ +add.f16x2 r7661, r6288, r6304; +} +{ +mul.f16x2 r7664, r7661, r7622; +} +{ +add.f16x2 r7667, r7658, r7664; +} +{ +sub.f16x2 
r7670, r6278, r6326; +} +{ +mul.f16x2 r7673, r7670, r7620; +} +{ +sub.f16x2 r7676, r6294, r6310; +} +{ +mul.f16x2 r7679, r7676, r7624; +} +{ +add.f16x2 r7682, r7673, r7679; +} +{ +sub.f16x2 %18, r7667, r7682; +} +{ +add.f16x2 r7688, r6272, r6320; +} +{ +mul.f16x2 r7691, r7688, r7618; +} +{ +add.f16x2 r7694, r4519, r7691; +} +{ +add.f16x2 r7697, r6288, r6304; +} +{ +mul.f16x2 r7700, r7697, r7622; +} +{ +add.f16x2 r7703, r7694, r7700; +} +{ +sub.f16x2 r7706, r6278, r6326; +} +{ +mul.f16x2 r7709, r7706, r7620; +} +{ +sub.f16x2 r7712, r6294, r6310; +} +{ +mul.f16x2 r7715, r7712, r7624; +} +{ +add.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 %48, r7703, r7718; +} +{ +add.f16x2 r7724, r6272, r6320; +} +{ +mul.f16x2 r7727, r7724, r7622; +} +{ +add.f16x2 r7730, r4519, r7727; +} +{ +add.f16x2 r7733, r6288, r6304; +} +{ +mul.f16x2 r7736, r7733, r7626; +} +{ +add.f16x2 r7739, r7730, r7736; +} +{ +sub.f16x2 r7742, r6278, r6326; +} +{ +mul.f16x2 r7745, r7742, r7624; +} +{ +sub.f16x2 r7748, r6294, r6310; +} +{ +mul.f16x2 r7751, r7748, r7627; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +sub.f16x2 %28, r7739, r7754; +} +{ +add.f16x2 r7760, r6272, r6320; +} +{ +mul.f16x2 r7763, r7760, r7622; +} +{ +add.f16x2 r7766, r4519, r7763; +} +{ +add.f16x2 r7769, r6288, r6304; +} +{ +mul.f16x2 r7772, r7769, r7626; +} +{ +add.f16x2 r7775, r7766, r7772; +} +{ +sub.f16x2 r7778, r6278, r6326; +} +{ +mul.f16x2 r7781, r7778, r7624; +} +{ +sub.f16x2 r7784, r6294, r6310; +} +{ +mul.f16x2 r7787, r7784, r7627; +} +{ +add.f16x2 r7790, r7781, r7787; +} +{ +add.f16x2 %38, r7775, r7790; +} +{ +add.f16x2 r7796, r6278, r6326; +} +{ +mul.f16x2 r7799, r7796, r7618; +} +{ +add.f16x2 r7802, r4663, r7799; +} +{ +add.f16x2 r7805, r6294, r6310; +} +{ +mul.f16x2 r7808, r7805, r7622; +} +{ +add.f16x2 r7811, r7802, r7808; +} +{ +sub.f16x2 r7814, r6272, r6320; +} +{ +mul.f16x2 r7817, r7814, r7620; +} +{ +sub.f16x2 r7820, r6288, r6304; +} +{ +mul.f16x2 r7823, r7820, r7624; +} +{ +add.f16x2 r7826, r7817, r7823; +} +{ 
+add.f16x2 %19, r7811, r7826; +} +{ +add.f16x2 r7832, r6278, r6326; +} +{ +mul.f16x2 r7835, r7832, r7618; +} +{ +add.f16x2 r7838, r4663, r7835; +} +{ +add.f16x2 r7841, r6294, r6310; +} +{ +mul.f16x2 r7844, r7841, r7622; +} +{ +add.f16x2 r7847, r7838, r7844; +} +{ +sub.f16x2 r7850, r6272, r6320; +} +{ +mul.f16x2 r7853, r7850, r7620; +} +{ +sub.f16x2 r7856, r6288, r6304; +} +{ +mul.f16x2 r7859, r7856, r7624; +} +{ +add.f16x2 r7862, r7853, r7859; +} +{ +sub.f16x2 %49, r7847, r7862; +} +{ +add.f16x2 r7868, r6278, r6326; +} +{ +mul.f16x2 r7871, r7868, r7622; +} +{ +add.f16x2 r7874, r4663, r7871; +} +{ +add.f16x2 r7877, r6294, r6310; +} +{ +mul.f16x2 r7880, r7877, r7626; +} +{ +add.f16x2 r7883, r7874, r7880; +} +{ +sub.f16x2 r7886, r6272, r6320; +} +{ +mul.f16x2 r7889, r7886, r7624; +} +{ +sub.f16x2 r7892, r6288, r6304; +} +{ +mul.f16x2 r7895, r7892, r7627; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 %29, r7883, r7898; +} +{ +add.f16x2 r7904, r6278, r6326; +} +{ +mul.f16x2 r7907, r7904, r7622; +} +{ +add.f16x2 r7910, r4663, r7907; +} +{ +add.f16x2 r7913, r6294, r6310; +} +{ +mul.f16x2 r7916, r7913, r7626; +} +{ +add.f16x2 r7919, r7910, r7916; +} +{ +sub.f16x2 r7922, r6272, r6320; +} +{ +mul.f16x2 r7925, r7922, r7624; +} +{ +sub.f16x2 r7928, r6288, r6304; +} +{ +mul.f16x2 r7931, r7928, r7627; +} +{ +add.f16x2 r7934, r7925, r7931; +} +{ +sub.f16x2 %39, r7919, r7934; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), 
"=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), 
"r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1110, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<488>; +.reg .b32 r<8002>; +.reg .b64 rd<4>; +mov.u32 r8000, %tid.y; +mov.u32 r8001, %50; +mad.lo.s32 r7942, r8000, 2500, r8001; +mov.u32 r7943, %tid.x; +mov.f32 f482, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1, {low, high}; +} +mov.f32 f484, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f478, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5, {low, high}; +} +mov.f32 f480, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %67, %55; +} +{ +add.f16x2 r14, %74, r11; +} +{ +add.f16x2 r17, %81, %65; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %91, %79; +} +{ +add.f16x2 r26, 
%51, r23; +} +{ +add.f16x2 r29, %57, %93; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %67, %55; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %74, r38; +} +{ +add.f16x2 r44, %81, %65; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %91, %79; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %57, %93; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %67, %55; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %74, r74; +} +{ +add.f16x2 r80, %81, %65; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %91, %79; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %57, %93; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %67, %55; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %74, r110; +} +{ +add.f16x2 r116, %81, %65; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %91, %79; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %57, %93; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %67, %55; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %74, r146; +} +{ +add.f16x2 r152, %81, %65; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %91, %79; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %57, %93; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %91, %79; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %51, r182; +} +{ +add.f16x2 r188, %57, %93; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %67, %55; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %81, %65; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 
r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %91, %79; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %51, r218; +} +{ +add.f16x2 r224, %57, %93; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %67, %55; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %81, %65; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %91, %79; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %51, r254; +} +{ +add.f16x2 r260, %57, %93; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %67, %55; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %81, %65; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %91, %79; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %51, r290; +} +{ +add.f16x2 r296, %57, %93; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %67, %55; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %81, %65; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r324, {low, high}; +} +{ +neg.f16x2 r325, r324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r328, {low, high}; +} +{ +neg.f16x2 r329, r328; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r332, {low, high}; +} +{ 
+add.f16x2 r333, %89, %77; +} +{ +add.f16x2 r336, %96, r333; +} +{ +add.f16x2 r339, %53, %87; +} +{ +add.f16x2 r342, r336, r339; +} +{ +add.f16x2 r345, %66, %52; +} +{ +add.f16x2 r348, %73, r345; +} +{ +add.f16x2 r351, %80, %64; +} +{ +add.f16x2 r354, r348, r351; +} +{ +add.f16x2 r357, %89, %77; +} +{ +mul.f16x2 r360, r357, r323; +} +{ +add.f16x2 r363, %96, r360; +} +{ +add.f16x2 r366, %53, %87; +} +{ +mul.f16x2 r369, r366, r327; +} +{ +add.f16x2 r372, r363, r369; +} +{ +sub.f16x2 r375, %66, %52; +} +{ +mul.f16x2 r378, r375, r325; +} +{ +sub.f16x2 r381, %80, %64; +} +{ +mul.f16x2 r384, r381, r329; +} +{ +add.f16x2 r387, r378, r384; +} +{ +sub.f16x2 r390, r372, r387; +} +{ +add.f16x2 r393, %89, %77; +} +{ +mul.f16x2 r396, r393, r323; +} +{ +add.f16x2 r399, %96, r396; +} +{ +add.f16x2 r402, %53, %87; +} +{ +mul.f16x2 r405, r402, r327; +} +{ +add.f16x2 r408, r399, r405; +} +{ +sub.f16x2 r411, %66, %52; +} +{ +mul.f16x2 r414, r411, r325; +} +{ +sub.f16x2 r417, %80, %64; +} +{ +mul.f16x2 r420, r417, r329; +} +{ +add.f16x2 r423, r414, r420; +} +{ +add.f16x2 r426, r408, r423; +} +{ +add.f16x2 r429, %89, %77; +} +{ +mul.f16x2 r432, r429, r327; +} +{ +add.f16x2 r435, %96, r432; +} +{ +add.f16x2 r438, %53, %87; +} +{ +mul.f16x2 r441, r438, r331; +} +{ +add.f16x2 r444, r435, r441; +} +{ +sub.f16x2 r447, %66, %52; +} +{ +mul.f16x2 r450, r447, r329; +} +{ +sub.f16x2 r453, %80, %64; +} +{ +mul.f16x2 r456, r453, r332; +} +{ +add.f16x2 r459, r450, r456; +} +{ +sub.f16x2 r462, r444, r459; +} +{ +add.f16x2 r465, %89, %77; +} +{ +mul.f16x2 r468, r465, r327; +} +{ +add.f16x2 r471, %96, r468; +} +{ +add.f16x2 r474, %53, %87; +} +{ +mul.f16x2 r477, r474, r331; +} +{ +add.f16x2 r480, r471, r477; +} +{ +sub.f16x2 r483, %66, %52; +} +{ +mul.f16x2 r486, r483, r329; +} +{ +sub.f16x2 r489, %80, %64; +} +{ +mul.f16x2 r492, r489, r332; +} +{ +add.f16x2 r495, r486, r492; +} +{ +add.f16x2 r498, r480, r495; +} +{ +add.f16x2 r501, %66, %52; +} +{ +mul.f16x2 r504, r501, r323; +} +{ +add.f16x2 r507, 
%73, r504; +} +{ +add.f16x2 r510, %80, %64; +} +{ +mul.f16x2 r513, r510, r327; +} +{ +add.f16x2 r516, r507, r513; +} +{ +sub.f16x2 r519, %89, %77; +} +{ +mul.f16x2 r522, r519, r325; +} +{ +sub.f16x2 r525, %53, %87; +} +{ +mul.f16x2 r528, r525, r329; +} +{ +add.f16x2 r531, r522, r528; +} +{ +add.f16x2 r534, r516, r531; +} +{ +add.f16x2 r537, %66, %52; +} +{ +mul.f16x2 r540, r537, r323; +} +{ +add.f16x2 r543, %73, r540; +} +{ +add.f16x2 r546, %80, %64; +} +{ +mul.f16x2 r549, r546, r327; +} +{ +add.f16x2 r552, r543, r549; +} +{ +sub.f16x2 r555, %89, %77; +} +{ +mul.f16x2 r558, r555, r325; +} +{ +sub.f16x2 r561, %53, %87; +} +{ +mul.f16x2 r564, r561, r329; +} +{ +add.f16x2 r567, r558, r564; +} +{ +sub.f16x2 r570, r552, r567; +} +{ +add.f16x2 r573, %66, %52; +} +{ +mul.f16x2 r576, r573, r327; +} +{ +add.f16x2 r579, %73, r576; +} +{ +add.f16x2 r582, %80, %64; +} +{ +mul.f16x2 r585, r582, r331; +} +{ +add.f16x2 r588, r579, r585; +} +{ +sub.f16x2 r591, %89, %77; +} +{ +mul.f16x2 r594, r591, r329; +} +{ +sub.f16x2 r597, %53, %87; +} +{ +mul.f16x2 r600, r597, r332; +} +{ +add.f16x2 r603, r594, r600; +} +{ +add.f16x2 r606, r588, r603; +} +{ +add.f16x2 r609, %66, %52; +} +{ +mul.f16x2 r612, r609, r327; +} +{ +add.f16x2 r615, %73, r612; +} +{ +add.f16x2 r618, %80, %64; +} +{ +mul.f16x2 r621, r618, r331; +} +{ +add.f16x2 r624, r615, r621; +} +{ +sub.f16x2 r627, %89, %77; +} +{ +mul.f16x2 r630, r627, r329; +} +{ +sub.f16x2 r633, %53, %87; +} +{ +mul.f16x2 r636, r633, r332; +} +{ +add.f16x2 r639, r630, r636; +} +{ +sub.f16x2 r642, r624, r639; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r646, {low, high}; +} +{ +neg.f16x2 r647, r646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; 
+mov.b32 r650, {low, high}; +} +{ +neg.f16x2 r651, r650; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, %62, %99; +} +{ +add.f16x2 r658, %69, r655; +} +{ +add.f16x2 r661, %78, %60; +} +{ +add.f16x2 r664, r658, r661; +} +{ +add.f16x2 r667, %88, %75; +} +{ +add.f16x2 r670, %95, r667; +} +{ +add.f16x2 r673, %54, %86; +} +{ +add.f16x2 r676, r670, r673; +} +{ +add.f16x2 r679, %62, %99; +} +{ +mul.f16x2 r682, r679, r645; +} +{ +add.f16x2 r685, %69, r682; +} +{ +add.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r649; +} +{ +add.f16x2 r694, r685, r691; +} +{ +sub.f16x2 r697, %88, %75; +} +{ +mul.f16x2 r700, r697, r647; +} +{ +sub.f16x2 r703, %54, %86; +} +{ +mul.f16x2 r706, r703, r651; +} +{ +add.f16x2 r709, r700, r706; +} +{ +sub.f16x2 r712, r694, r709; +} +{ +add.f16x2 r715, %62, %99; +} +{ +mul.f16x2 r718, r715, r645; +} +{ +add.f16x2 r721, %69, r718; +} +{ +add.f16x2 r724, %78, %60; +} +{ +mul.f16x2 r727, r724, r649; +} +{ +add.f16x2 r730, r721, r727; +} +{ +sub.f16x2 r733, %88, %75; +} +{ +mul.f16x2 r736, r733, r647; +} +{ +sub.f16x2 r739, %54, %86; +} +{ +mul.f16x2 r742, r739, r651; +} +{ +add.f16x2 r745, r736, r742; +} +{ +add.f16x2 r748, r730, r745; +} +{ +add.f16x2 r751, %62, %99; +} +{ +mul.f16x2 r754, r751, r649; +} +{ +add.f16x2 r757, %69, r754; +} +{ +add.f16x2 r760, %78, %60; +} +{ +mul.f16x2 r763, r760, r653; +} +{ +add.f16x2 r766, r757, r763; +} +{ +sub.f16x2 r769, %88, %75; +} +{ +mul.f16x2 r772, r769, r651; +} +{ +sub.f16x2 r775, %54, %86; +} +{ +mul.f16x2 r778, r775, r654; +} +{ +add.f16x2 r781, r772, r778; +} +{ +sub.f16x2 r784, r766, r781; +} +{ +add.f16x2 r787, %62, %99; +} +{ +mul.f16x2 r790, r787, r649; +} +{ +add.f16x2 r793, %69, r790; +} +{ +add.f16x2 r796, %78, %60; +} +{ +mul.f16x2 r799, r796, r653; +} +{ +add.f16x2 r802, r793, r799; +} +{ 
+sub.f16x2 r805, %88, %75; +} +{ +mul.f16x2 r808, r805, r651; +} +{ +sub.f16x2 r811, %54, %86; +} +{ +mul.f16x2 r814, r811, r654; +} +{ +add.f16x2 r817, r808, r814; +} +{ +add.f16x2 r820, r802, r817; +} +{ +add.f16x2 r823, %88, %75; +} +{ +mul.f16x2 r826, r823, r645; +} +{ +add.f16x2 r829, %95, r826; +} +{ +add.f16x2 r832, %54, %86; +} +{ +mul.f16x2 r835, r832, r649; +} +{ +add.f16x2 r838, r829, r835; +} +{ +sub.f16x2 r841, %62, %99; +} +{ +mul.f16x2 r844, r841, r647; +} +{ +sub.f16x2 r847, %78, %60; +} +{ +mul.f16x2 r850, r847, r651; +} +{ +add.f16x2 r853, r844, r850; +} +{ +add.f16x2 r856, r838, r853; +} +{ +add.f16x2 r859, %88, %75; +} +{ +mul.f16x2 r862, r859, r645; +} +{ +add.f16x2 r865, %95, r862; +} +{ +add.f16x2 r868, %54, %86; +} +{ +mul.f16x2 r871, r868, r649; +} +{ +add.f16x2 r874, r865, r871; +} +{ +sub.f16x2 r877, %62, %99; +} +{ +mul.f16x2 r880, r877, r647; +} +{ +sub.f16x2 r883, %78, %60; +} +{ +mul.f16x2 r886, r883, r651; +} +{ +add.f16x2 r889, r880, r886; +} +{ +sub.f16x2 r892, r874, r889; +} +{ +add.f16x2 r895, %88, %75; +} +{ +mul.f16x2 r898, r895, r649; +} +{ +add.f16x2 r901, %95, r898; +} +{ +add.f16x2 r904, %54, %86; +} +{ +mul.f16x2 r907, r904, r653; +} +{ +add.f16x2 r910, r901, r907; +} +{ +sub.f16x2 r913, %62, %99; +} +{ +mul.f16x2 r916, r913, r651; +} +{ +sub.f16x2 r919, %78, %60; +} +{ +mul.f16x2 r922, r919, r654; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r910, r925; +} +{ +add.f16x2 r931, %88, %75; +} +{ +mul.f16x2 r934, r931, r649; +} +{ +add.f16x2 r937, %95, r934; +} +{ +add.f16x2 r940, %54, %86; +} +{ +mul.f16x2 r943, r940, r653; +} +{ +add.f16x2 r946, r937, r943; +} +{ +sub.f16x2 r949, %62, %99; +} +{ +mul.f16x2 r952, r949, r651; +} +{ +sub.f16x2 r955, %78, %60; +} +{ +mul.f16x2 r958, r955, r654; +} +{ +add.f16x2 r961, r952, r958; +} +{ +sub.f16x2 r964, r946, r961; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r968, {low, high}; +} +{ +neg.f16x2 r969, r968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r971, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r972, {low, high}; +} +{ +neg.f16x2 r973, r972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r975, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r976, {low, high}; +} +{ +add.f16x2 r977, %85, %71; +} +{ +add.f16x2 r980, %92, r977; +} +{ +add.f16x2 r983, %100, %83; +} +{ +add.f16x2 r986, r980, r983; +} +{ +add.f16x2 r989, %61, %97; +} +{ +add.f16x2 r992, %68, r989; +} +{ +add.f16x2 r995, %76, %59; +} +{ +add.f16x2 r998, r992, r995; +} +{ +add.f16x2 r1001, %85, %71; +} +{ +mul.f16x2 r1004, r1001, r967; +} +{ +add.f16x2 r1007, %92, r1004; +} +{ +add.f16x2 r1010, %100, %83; +} +{ +mul.f16x2 r1013, r1010, r971; +} +{ +add.f16x2 r1016, r1007, r1013; +} +{ +sub.f16x2 r1019, %61, %97; +} +{ +mul.f16x2 r1022, r1019, r969; +} +{ +sub.f16x2 r1025, %76, %59; +} +{ +mul.f16x2 r1028, r1025, r973; +} +{ +add.f16x2 r1031, r1022, r1028; +} +{ +sub.f16x2 r1034, r1016, r1031; +} +{ +add.f16x2 r1037, %85, %71; +} +{ +mul.f16x2 r1040, r1037, r967; +} +{ +add.f16x2 r1043, %92, r1040; +} +{ +add.f16x2 r1046, %100, %83; +} +{ +mul.f16x2 r1049, r1046, r971; +} +{ +add.f16x2 r1052, r1043, r1049; +} +{ +sub.f16x2 r1055, %61, %97; +} +{ +mul.f16x2 r1058, r1055, r969; +} +{ +sub.f16x2 r1061, %76, %59; +} +{ +mul.f16x2 r1064, r1061, r973; +} +{ +add.f16x2 r1067, r1058, r1064; +} +{ +add.f16x2 r1070, r1052, r1067; +} +{ +add.f16x2 r1073, %85, %71; +} +{ +mul.f16x2 r1076, r1073, r971; +} +{ +add.f16x2 r1079, %92, r1076; +} +{ +add.f16x2 r1082, %100, %83; +} +{ +mul.f16x2 r1085, r1082, r975; +} +{ +add.f16x2 r1088, r1079, r1085; +} +{ +sub.f16x2 r1091, %61, %97; +} +{ +mul.f16x2 r1094, r1091, r973; 
+} +{ +sub.f16x2 r1097, %76, %59; +} +{ +mul.f16x2 r1100, r1097, r976; +} +{ +add.f16x2 r1103, r1094, r1100; +} +{ +sub.f16x2 r1106, r1088, r1103; +} +{ +add.f16x2 r1109, %85, %71; +} +{ +mul.f16x2 r1112, r1109, r971; +} +{ +add.f16x2 r1115, %92, r1112; +} +{ +add.f16x2 r1118, %100, %83; +} +{ +mul.f16x2 r1121, r1118, r975; +} +{ +add.f16x2 r1124, r1115, r1121; +} +{ +sub.f16x2 r1127, %61, %97; +} +{ +mul.f16x2 r1130, r1127, r973; +} +{ +sub.f16x2 r1133, %76, %59; +} +{ +mul.f16x2 r1136, r1133, r976; +} +{ +add.f16x2 r1139, r1130, r1136; +} +{ +add.f16x2 r1142, r1124, r1139; +} +{ +add.f16x2 r1145, %61, %97; +} +{ +mul.f16x2 r1148, r1145, r967; +} +{ +add.f16x2 r1151, %68, r1148; +} +{ +add.f16x2 r1154, %76, %59; +} +{ +mul.f16x2 r1157, r1154, r971; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +sub.f16x2 r1163, %85, %71; +} +{ +mul.f16x2 r1166, r1163, r969; +} +{ +sub.f16x2 r1169, %100, %83; +} +{ +mul.f16x2 r1172, r1169, r973; +} +{ +add.f16x2 r1175, r1166, r1172; +} +{ +add.f16x2 r1178, r1160, r1175; +} +{ +add.f16x2 r1181, %61, %97; +} +{ +mul.f16x2 r1184, r1181, r967; +} +{ +add.f16x2 r1187, %68, r1184; +} +{ +add.f16x2 r1190, %76, %59; +} +{ +mul.f16x2 r1193, r1190, r971; +} +{ +add.f16x2 r1196, r1187, r1193; +} +{ +sub.f16x2 r1199, %85, %71; +} +{ +mul.f16x2 r1202, r1199, r969; +} +{ +sub.f16x2 r1205, %100, %83; +} +{ +mul.f16x2 r1208, r1205, r973; +} +{ +add.f16x2 r1211, r1202, r1208; +} +{ +sub.f16x2 r1214, r1196, r1211; +} +{ +add.f16x2 r1217, %61, %97; +} +{ +mul.f16x2 r1220, r1217, r971; +} +{ +add.f16x2 r1223, %68, r1220; +} +{ +add.f16x2 r1226, %76, %59; +} +{ +mul.f16x2 r1229, r1226, r975; +} +{ +add.f16x2 r1232, r1223, r1229; +} +{ +sub.f16x2 r1235, %85, %71; +} +{ +mul.f16x2 r1238, r1235, r973; +} +{ +sub.f16x2 r1241, %100, %83; +} +{ +mul.f16x2 r1244, r1241, r976; +} +{ +add.f16x2 r1247, r1238, r1244; +} +{ +add.f16x2 r1250, r1232, r1247; +} +{ +add.f16x2 r1253, %61, %97; +} +{ +mul.f16x2 r1256, r1253, r971; +} +{ +add.f16x2 r1259, %68, r1256; +} +{ 
+add.f16x2 r1262, %76, %59; +} +{ +mul.f16x2 r1265, r1262, r975; +} +{ +add.f16x2 r1268, r1259, r1265; +} +{ +sub.f16x2 r1271, %85, %71; +} +{ +mul.f16x2 r1274, r1271, r973; +} +{ +sub.f16x2 r1277, %100, %83; +} +{ +mul.f16x2 r1280, r1277, r976; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +sub.f16x2 r1286, r1268, r1283; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1290, {low, high}; +} +{ +neg.f16x2 r1291, r1290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1293, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1294, {low, high}; +} +{ +neg.f16x2 r1295, r1294; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1298, {low, high}; +} +{ +add.f16x2 r1299, %58, %94; +} +{ +add.f16x2 r1302, %63, r1299; +} +{ +add.f16x2 r1305, %72, %56; +} +{ +add.f16x2 r1308, r1302, r1305; +} +{ +add.f16x2 r1311, %84, %70; +} +{ +add.f16x2 r1314, %90, r1311; +} +{ +add.f16x2 r1317, %98, %82; +} +{ +add.f16x2 r1320, r1314, r1317; +} +{ +add.f16x2 r1323, %58, %94; +} +{ +mul.f16x2 r1326, r1323, r1289; +} +{ +add.f16x2 r1329, %63, r1326; +} +{ +add.f16x2 r1332, %72, %56; +} +{ +mul.f16x2 r1335, r1332, r1293; +} +{ +add.f16x2 r1338, r1329, r1335; +} +{ +sub.f16x2 r1341, %84, %70; +} +{ +mul.f16x2 r1344, r1341, r1291; +} +{ +sub.f16x2 r1347, %98, %82; +} +{ +mul.f16x2 r1350, r1347, r1295; +} +{ +add.f16x2 r1353, r1344, r1350; +} +{ +sub.f16x2 r1356, r1338, r1353; +} +{ +add.f16x2 r1359, %58, %94; +} +{ +mul.f16x2 r1362, r1359, r1289; +} +{ +add.f16x2 r1365, %63, r1362; +} +{ +add.f16x2 r1368, %72, %56; +} +{ +mul.f16x2 r1371, r1368, r1293; +} +{ +add.f16x2 r1374, r1365, r1371; +} 
+{ +sub.f16x2 r1377, %84, %70; +} +{ +mul.f16x2 r1380, r1377, r1291; +} +{ +sub.f16x2 r1383, %98, %82; +} +{ +mul.f16x2 r1386, r1383, r1295; +} +{ +add.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1374, r1389; +} +{ +add.f16x2 r1395, %58, %94; +} +{ +mul.f16x2 r1398, r1395, r1293; +} +{ +add.f16x2 r1401, %63, r1398; +} +{ +add.f16x2 r1404, %72, %56; +} +{ +mul.f16x2 r1407, r1404, r1297; +} +{ +add.f16x2 r1410, r1401, r1407; +} +{ +sub.f16x2 r1413, %84, %70; +} +{ +mul.f16x2 r1416, r1413, r1295; +} +{ +sub.f16x2 r1419, %98, %82; +} +{ +mul.f16x2 r1422, r1419, r1298; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +sub.f16x2 r1428, r1410, r1425; +} +{ +add.f16x2 r1431, %58, %94; +} +{ +mul.f16x2 r1434, r1431, r1293; +} +{ +add.f16x2 r1437, %63, r1434; +} +{ +add.f16x2 r1440, %72, %56; +} +{ +mul.f16x2 r1443, r1440, r1297; +} +{ +add.f16x2 r1446, r1437, r1443; +} +{ +sub.f16x2 r1449, %84, %70; +} +{ +mul.f16x2 r1452, r1449, r1295; +} +{ +sub.f16x2 r1455, %98, %82; +} +{ +mul.f16x2 r1458, r1455, r1298; +} +{ +add.f16x2 r1461, r1452, r1458; +} +{ +add.f16x2 r1464, r1446, r1461; +} +{ +add.f16x2 r1467, %84, %70; +} +{ +mul.f16x2 r1470, r1467, r1289; +} +{ +add.f16x2 r1473, %90, r1470; +} +{ +add.f16x2 r1476, %98, %82; +} +{ +mul.f16x2 r1479, r1476, r1293; +} +{ +add.f16x2 r1482, r1473, r1479; +} +{ +sub.f16x2 r1485, %58, %94; +} +{ +mul.f16x2 r1488, r1485, r1291; +} +{ +sub.f16x2 r1491, %72, %56; +} +{ +mul.f16x2 r1494, r1491, r1295; +} +{ +add.f16x2 r1497, r1488, r1494; +} +{ +add.f16x2 r1500, r1482, r1497; +} +{ +add.f16x2 r1503, %84, %70; +} +{ +mul.f16x2 r1506, r1503, r1289; +} +{ +add.f16x2 r1509, %90, r1506; +} +{ +add.f16x2 r1512, %98, %82; +} +{ +mul.f16x2 r1515, r1512, r1293; +} +{ +add.f16x2 r1518, r1509, r1515; +} +{ +sub.f16x2 r1521, %58, %94; +} +{ +mul.f16x2 r1524, r1521, r1291; +} +{ +sub.f16x2 r1527, %72, %56; +} +{ +mul.f16x2 r1530, r1527, r1295; +} +{ +add.f16x2 r1533, r1524, r1530; +} +{ +sub.f16x2 r1536, r1518, r1533; +} +{ +add.f16x2 r1539, %84, 
%70; +} +{ +mul.f16x2 r1542, r1539, r1293; +} +{ +add.f16x2 r1545, %90, r1542; +} +{ +add.f16x2 r1548, %98, %82; +} +{ +mul.f16x2 r1551, r1548, r1297; +} +{ +add.f16x2 r1554, r1545, r1551; +} +{ +sub.f16x2 r1557, %58, %94; +} +{ +mul.f16x2 r1560, r1557, r1295; +} +{ +sub.f16x2 r1563, %72, %56; +} +{ +mul.f16x2 r1566, r1563, r1298; +} +{ +add.f16x2 r1569, r1560, r1566; +} +{ +add.f16x2 r1572, r1554, r1569; +} +{ +add.f16x2 r1575, %84, %70; +} +{ +mul.f16x2 r1578, r1575, r1293; +} +{ +add.f16x2 r1581, %90, r1578; +} +{ +add.f16x2 r1584, %98, %82; +} +{ +mul.f16x2 r1587, r1584, r1297; +} +{ +add.f16x2 r1590, r1581, r1587; +} +{ +sub.f16x2 r1593, %58, %94; +} +{ +mul.f16x2 r1596, r1593, r1295; +} +{ +sub.f16x2 r1599, %72, %56; +} +{ +mul.f16x2 r1602, r1599, r1298; +} +{ +add.f16x2 r1605, r1596, r1602; +} +{ +sub.f16x2 r1608, r1590, r1605; +} +mov.f32 f330, 0f3F77F511; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r1611, {low, high}; +} +mov.f32 f332, 0f3E7EA890; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; +cvt.rn.f16.f32 high, f332; +mov.b32 r1612, {low, high}; +} +mov.f32 f334, 0f3F6055A2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r1613, {low, high}; +} +mov.f32 f336, 0f3EF6A86B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r1614, {low, high}; +} +mov.f32 f338, 0f3F3A9DB0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r1615, {low, high}; +} +mov.f32 f340, 0f3F2F3E7B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r1616, {low, high}; +} +mov.f32 f342, 0f3F092BF2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r1617, {low, high}; +} +mov.f32 f344, 0f3F5825E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r1618, {low, high}; +} +mov.f32 f350, 0f3D809851; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r1621, {low, high}; +} +mov.f32 f352, 0f3F7F7EAE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r1622, {low, high}; +} +mov.f32 f358, 0fBED9FFBE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1625, {low, high}; +} +mov.f32 f360, 0f3F67A2BF; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r1626, {low, high}; +} +mov.f32 f390, 0fBF232E38; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1627, {low, high}; +} +mov.f32 f364, 0f3F45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r1628, {low, high}; +} +mov.f32 f374, 0fBF7DFB3B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1633, {low, high}; +} +mov.f32 f376, 0f3E00575B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r1641, {low, high}; +} +mov.f32 f392, 0fBF45405B; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r1642, {low, high}; +} +{ +mul.f16x2 r1659, r390, r1611; +} +{ +mul.f16x2 r1662, r534, r1612; +} +{ +sub.f16x2 r1665, r1659, r1662; +} +{ +mul.f16x2 r1668, r390, r1612; +} +{ +fma.rn.f16x2 r1671, r534, r1611, r1668; +} +{ +mul.f16x2 r1675, r712, r1613; +} +{ +mul.f16x2 r1678, r856, r1614; +} +{ +sub.f16x2 r1681, r1675, r1678; +} +{ +mul.f16x2 r1684, r712, r1614; +} +{ +fma.rn.f16x2 r1687, r856, r1613, r1684; +} +{ +mul.f16x2 r1691, r1034, r1615; +} +{ +mul.f16x2 r1694, r1178, r1616; +} +{ +sub.f16x2 r1697, r1691, r1694; +} +{ +mul.f16x2 r1700, r1034, r1616; +} +{ +fma.rn.f16x2 r1703, r1178, r1615, r1700; +} +{ +mul.f16x2 r1707, r1356, r1617; +} +{ +mul.f16x2 r1710, r1500, r1618; +} +{ 
+sub.f16x2 r1713, r1707, r1710; +} +{ +mul.f16x2 r1716, r1356, r1618; +} +{ +fma.rn.f16x2 r1719, r1500, r1617, r1716; +} +{ +mul.f16x2 r1723, r462, r1613; +} +{ +mul.f16x2 r1726, r606, r1614; +} +{ +sub.f16x2 r1729, r1723, r1726; +} +{ +mul.f16x2 r1732, r462, r1614; +} +{ +fma.rn.f16x2 r1735, r606, r1613, r1732; +} +{ +mul.f16x2 r1739, r784, r1617; +} +{ +mul.f16x2 r1742, r928, r1618; +} +{ +sub.f16x2 r1745, r1739, r1742; +} +{ +mul.f16x2 r1748, r784, r1618; +} +{ +fma.rn.f16x2 r1751, r928, r1617, r1748; +} +{ +mul.f16x2 r1755, r1106, r1621; +} +{ +mul.f16x2 r1758, r1250, r1622; +} +{ +sub.f16x2 r1761, r1755, r1758; +} +{ +mul.f16x2 r1764, r1106, r1622; +} +{ +fma.rn.f16x2 r1767, r1250, r1621, r1764; +} +{ +mul.f16x2 r1771, r1428, r1625; +} +{ +mul.f16x2 r1774, r1572, r1626; +} +{ +sub.f16x2 r1777, r1771, r1774; +} +{ +mul.f16x2 r1780, r1428, r1626; +} +{ +fma.rn.f16x2 r1783, r1572, r1625, r1780; +} +{ +mul.f16x2 r1787, r498, r1615; +} +{ +mul.f16x2 r1790, r642, r1616; +} +{ +sub.f16x2 r1793, r1787, r1790; +} +{ +mul.f16x2 r1796, r498, r1616; +} +{ +fma.rn.f16x2 r1799, r642, r1615, r1796; +} +{ +mul.f16x2 r1803, r820, r1621; +} +{ +mul.f16x2 r1806, r964, r1622; +} +{ +sub.f16x2 r1809, r1803, r1806; +} +{ +mul.f16x2 r1812, r820, r1622; +} +{ +fma.rn.f16x2 r1815, r964, r1621, r1812; +} +{ +mul.f16x2 r1819, r1142, r1627; +} +{ +mul.f16x2 r1822, r1286, r1628; +} +{ +sub.f16x2 r1825, r1819, r1822; +} +{ +mul.f16x2 r1828, r1142, r1628; +} +{ +fma.rn.f16x2 r1831, r1286, r1627, r1828; +} +{ +mul.f16x2 r1835, r1464, r1633; +} +{ +mul.f16x2 r1838, r1608, r1634; +} +{ +sub.f16x2 r1841, r1835, r1838; +} +{ +mul.f16x2 r1844, r1464, r1634; +} +{ +fma.rn.f16x2 r1847, r1608, r1633, r1844; +} +{ +mul.f16x2 r1851, r426, r1617; +} +{ +mul.f16x2 r1854, r570, r1618; +} +{ +sub.f16x2 r1857, r1851, r1854; +} +{ +mul.f16x2 r1860, r426, r1618; +} +{ +fma.rn.f16x2 r1863, r570, r1617, r1860; +} +{ +mul.f16x2 r1867, r748, r1625; +} +{ +mul.f16x2 r1870, r892, r1626; +} +{ +sub.f16x2 r1873, 
r1867, r1870; +} +{ +mul.f16x2 r1876, r748, r1626; +} +{ +fma.rn.f16x2 r1879, r892, r1625, r1876; +} +{ +mul.f16x2 r1883, r1070, r1633; +} +{ +mul.f16x2 r1886, r1214, r1634; +} +{ +sub.f16x2 r1889, r1883, r1886; +} +{ +mul.f16x2 r1892, r1070, r1634; +} +{ +fma.rn.f16x2 r1895, r1214, r1633, r1892; +} +{ +mul.f16x2 r1899, r1392, r1641; +} +{ +mul.f16x2 r1902, r1536, r1642; +} +{ +sub.f16x2 r1905, r1899, r1902; +} +{ +mul.f16x2 r1908, r1392, r1642; +} +{ +fma.rn.f16x2 r1911, r1536, r1641, r1908; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1915, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1916, {low, high}; +} +{ +neg.f16x2 r1917, r1916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r1919, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r1920, {low, high}; +} +{ +neg.f16x2 r1921, r1920; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r1923, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r1924, {low, high}; +} +{ +add.f16x2 r1925, r342, r1308; +} +{ +add.f16x2 r1928, r20, r1925; +} +{ +add.f16x2 r1931, r664, r986; +} +{ +add.f16x2 r1934, r1928, r1931; +} +{ +add.f16x2 r1937, r354, r1320; +} +{ +add.f16x2 r1940, r32, r1937; +} +{ +add.f16x2 r1943, r676, r998; +} +{ +add.f16x2 r1946, r1940, r1943; +} +{ +add.f16x2 r1949, r342, r1308; +} +{ +mul.f16x2 r1952, r1949, r1915; +} +{ +add.f16x2 r1955, r20, r1952; +} +{ +add.f16x2 r1958, r664, r986; +} +{ +mul.f16x2 r1961, r1958, r1919; +} +{ +add.f16x2 r1964, r1955, r1961; +} +{ +sub.f16x2 r1967, r354, r1320; +} +{ +mul.f16x2 r1970, r1967, r1917; +} +{ +sub.f16x2 r1973, r676, r998; +} +{ +mul.f16x2 r1976, r1973, r1921; +} +{ +add.f16x2 r1979, r1970, r1976; +} +{ +sub.f16x2 r1982, r1964, r1979; +} +{ +add.f16x2 r1985, 
r342, r1308; +} +{ +mul.f16x2 r1988, r1985, r1915; +} +{ +add.f16x2 r1991, r20, r1988; +} +{ +add.f16x2 r1994, r664, r986; +} +{ +mul.f16x2 r1997, r1994, r1919; +} +{ +add.f16x2 r2000, r1991, r1997; +} +{ +sub.f16x2 r2003, r354, r1320; +} +{ +mul.f16x2 r2006, r2003, r1917; +} +{ +sub.f16x2 r2009, r676, r998; +} +{ +mul.f16x2 r2012, r2009, r1921; +} +{ +add.f16x2 r2015, r2006, r2012; +} +{ +add.f16x2 r2018, r2000, r2015; +} +{ +add.f16x2 r2021, r342, r1308; +} +{ +mul.f16x2 r2024, r2021, r1919; +} +{ +add.f16x2 r2027, r20, r2024; +} +{ +add.f16x2 r2030, r664, r986; +} +{ +mul.f16x2 r2033, r2030, r1923; +} +{ +add.f16x2 r2036, r2027, r2033; +} +{ +sub.f16x2 r2039, r354, r1320; +} +{ +mul.f16x2 r2042, r2039, r1921; +} +{ +sub.f16x2 r2045, r676, r998; +} +{ +mul.f16x2 r2048, r2045, r1924; +} +{ +add.f16x2 r2051, r2042, r2048; +} +{ +sub.f16x2 r2054, r2036, r2051; +} +{ +add.f16x2 r2057, r342, r1308; +} +{ +mul.f16x2 r2060, r2057, r1919; +} +{ +add.f16x2 r2063, r20, r2060; +} +{ +add.f16x2 r2066, r664, r986; +} +{ +mul.f16x2 r2069, r2066, r1923; +} +{ +add.f16x2 r2072, r2063, r2069; +} +{ +sub.f16x2 r2075, r354, r1320; +} +{ +mul.f16x2 r2078, r2075, r1921; +} +{ +sub.f16x2 r2081, r676, r998; +} +{ +mul.f16x2 r2084, r2081, r1924; +} +{ +add.f16x2 r2087, r2078, r2084; +} +{ +add.f16x2 r2090, r2072, r2087; +} +{ +add.f16x2 r2093, r354, r1320; +} +{ +mul.f16x2 r2096, r2093, r1915; +} +{ +add.f16x2 r2099, r32, r2096; +} +{ +add.f16x2 r2102, r676, r998; +} +{ +mul.f16x2 r2105, r2102, r1919; +} +{ +add.f16x2 r2108, r2099, r2105; +} +{ +sub.f16x2 r2111, r342, r1308; +} +{ +mul.f16x2 r2114, r2111, r1917; +} +{ +sub.f16x2 r2117, r664, r986; +} +{ +mul.f16x2 r2120, r2117, r1921; +} +{ +add.f16x2 r2123, r2114, r2120; +} +{ +add.f16x2 r2126, r2108, r2123; +} +{ +add.f16x2 r2129, r354, r1320; +} +{ +mul.f16x2 r2132, r2129, r1915; +} +{ +add.f16x2 r2135, r32, r2132; +} +{ +add.f16x2 r2138, r676, r998; +} +{ +mul.f16x2 r2141, r2138, r1919; +} +{ +add.f16x2 r2144, r2135, r2141; +} +{ 
+sub.f16x2 r2147, r342, r1308; +} +{ +mul.f16x2 r2150, r2147, r1917; +} +{ +sub.f16x2 r2153, r664, r986; +} +{ +mul.f16x2 r2156, r2153, r1921; +} +{ +add.f16x2 r2159, r2150, r2156; +} +{ +sub.f16x2 r2162, r2144, r2159; +} +{ +add.f16x2 r2165, r354, r1320; +} +{ +mul.f16x2 r2168, r2165, r1919; +} +{ +add.f16x2 r2171, r32, r2168; +} +{ +add.f16x2 r2174, r676, r998; +} +{ +mul.f16x2 r2177, r2174, r1923; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +sub.f16x2 r2183, r342, r1308; +} +{ +mul.f16x2 r2186, r2183, r1921; +} +{ +sub.f16x2 r2189, r664, r986; +} +{ +mul.f16x2 r2192, r2189, r1924; +} +{ +add.f16x2 r2195, r2186, r2192; +} +{ +add.f16x2 r2198, r2180, r2195; +} +{ +add.f16x2 r2201, r354, r1320; +} +{ +mul.f16x2 r2204, r2201, r1919; +} +{ +add.f16x2 r2207, r32, r2204; +} +{ +add.f16x2 r2210, r676, r998; +} +{ +mul.f16x2 r2213, r2210, r1923; +} +{ +add.f16x2 r2216, r2207, r2213; +} +{ +sub.f16x2 r2219, r342, r1308; +} +{ +mul.f16x2 r2222, r2219, r1921; +} +{ +sub.f16x2 r2225, r664, r986; +} +{ +mul.f16x2 r2228, r2225, r1924; +} +{ +add.f16x2 r2231, r2222, r2228; +} +{ +sub.f16x2 r2234, r2216, r2231; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2237, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2238, {low, high}; +} +{ +neg.f16x2 r2239, r2238; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2242, {low, high}; +} +{ +neg.f16x2 r2243, r2242; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2246, {low, high}; +} +{ +add.f16x2 r2247, r1665, r1713; +} +{ +add.f16x2 r2250, r68, r2247; +} +{ +add.f16x2 r2253, r1681, r1697; +} +{ +add.f16x2 r2256, r2250, r2253; 
+} +{ +add.f16x2 r2259, r1671, r1719; +} +{ +add.f16x2 r2262, r212, r2259; +} +{ +add.f16x2 r2265, r1687, r1703; +} +{ +add.f16x2 r2268, r2262, r2265; +} +{ +add.f16x2 r2271, r1665, r1713; +} +{ +mul.f16x2 r2274, r2271, r2237; +} +{ +add.f16x2 r2277, r68, r2274; +} +{ +add.f16x2 r2280, r1681, r1697; +} +{ +mul.f16x2 r2283, r2280, r2241; +} +{ +add.f16x2 r2286, r2277, r2283; +} +{ +sub.f16x2 r2289, r1671, r1719; +} +{ +mul.f16x2 r2292, r2289, r2239; +} +{ +sub.f16x2 r2295, r1687, r1703; +} +{ +mul.f16x2 r2298, r2295, r2243; +} +{ +add.f16x2 r2301, r2292, r2298; +} +{ +sub.f16x2 r2304, r2286, r2301; +} +{ +add.f16x2 r2307, r1665, r1713; +} +{ +mul.f16x2 r2310, r2307, r2237; +} +{ +add.f16x2 r2313, r68, r2310; +} +{ +add.f16x2 r2316, r1681, r1697; +} +{ +mul.f16x2 r2319, r2316, r2241; +} +{ +add.f16x2 r2322, r2313, r2319; +} +{ +sub.f16x2 r2325, r1671, r1719; +} +{ +mul.f16x2 r2328, r2325, r2239; +} +{ +sub.f16x2 r2331, r1687, r1703; +} +{ +mul.f16x2 r2334, r2331, r2243; +} +{ +add.f16x2 r2337, r2328, r2334; +} +{ +add.f16x2 r2340, r2322, r2337; +} +{ +add.f16x2 r2343, r1665, r1713; +} +{ +mul.f16x2 r2346, r2343, r2241; +} +{ +add.f16x2 r2349, r68, r2346; +} +{ +add.f16x2 r2352, r1681, r1697; +} +{ +mul.f16x2 r2355, r2352, r2245; +} +{ +add.f16x2 r2358, r2349, r2355; +} +{ +sub.f16x2 r2361, r1671, r1719; +} +{ +mul.f16x2 r2364, r2361, r2243; +} +{ +sub.f16x2 r2367, r1687, r1703; +} +{ +mul.f16x2 r2370, r2367, r2246; +} +{ +add.f16x2 r2373, r2364, r2370; +} +{ +sub.f16x2 r2376, r2358, r2373; +} +{ +add.f16x2 r2379, r1665, r1713; +} +{ +mul.f16x2 r2382, r2379, r2241; +} +{ +add.f16x2 r2385, r68, r2382; +} +{ +add.f16x2 r2388, r1681, r1697; +} +{ +mul.f16x2 r2391, r2388, r2245; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +sub.f16x2 r2397, r1671, r1719; +} +{ +mul.f16x2 r2400, r2397, r2243; +} +{ +sub.f16x2 r2403, r1687, r1703; +} +{ +mul.f16x2 r2406, r2403, r2246; +} +{ +add.f16x2 r2409, r2400, r2406; +} +{ +add.f16x2 r2412, r2394, r2409; +} +{ +add.f16x2 r2415, r1671, 
r1719; +} +{ +mul.f16x2 r2418, r2415, r2237; +} +{ +add.f16x2 r2421, r212, r2418; +} +{ +add.f16x2 r2424, r1687, r1703; +} +{ +mul.f16x2 r2427, r2424, r2241; +} +{ +add.f16x2 r2430, r2421, r2427; +} +{ +sub.f16x2 r2433, r1665, r1713; +} +{ +mul.f16x2 r2436, r2433, r2239; +} +{ +sub.f16x2 r2439, r1681, r1697; +} +{ +mul.f16x2 r2442, r2439, r2243; +} +{ +add.f16x2 r2445, r2436, r2442; +} +{ +add.f16x2 r2448, r2430, r2445; +} +{ +add.f16x2 r2451, r1671, r1719; +} +{ +mul.f16x2 r2454, r2451, r2237; +} +{ +add.f16x2 r2457, r212, r2454; +} +{ +add.f16x2 r2460, r1687, r1703; +} +{ +mul.f16x2 r2463, r2460, r2241; +} +{ +add.f16x2 r2466, r2457, r2463; +} +{ +sub.f16x2 r2469, r1665, r1713; +} +{ +mul.f16x2 r2472, r2469, r2239; +} +{ +sub.f16x2 r2475, r1681, r1697; +} +{ +mul.f16x2 r2478, r2475, r2243; +} +{ +add.f16x2 r2481, r2472, r2478; +} +{ +sub.f16x2 r2484, r2466, r2481; +} +{ +add.f16x2 r2487, r1671, r1719; +} +{ +mul.f16x2 r2490, r2487, r2241; +} +{ +add.f16x2 r2493, r212, r2490; +} +{ +add.f16x2 r2496, r1687, r1703; +} +{ +mul.f16x2 r2499, r2496, r2245; +} +{ +add.f16x2 r2502, r2493, r2499; +} +{ +sub.f16x2 r2505, r1665, r1713; +} +{ +mul.f16x2 r2508, r2505, r2243; +} +{ +sub.f16x2 r2511, r1681, r1697; +} +{ +mul.f16x2 r2514, r2511, r2246; +} +{ +add.f16x2 r2517, r2508, r2514; +} +{ +add.f16x2 r2520, r2502, r2517; +} +{ +add.f16x2 r2523, r1671, r1719; +} +{ +mul.f16x2 r2526, r2523, r2241; +} +{ +add.f16x2 r2529, r212, r2526; +} +{ +add.f16x2 r2532, r1687, r1703; +} +{ +mul.f16x2 r2535, r2532, r2245; +} +{ +add.f16x2 r2538, r2529, r2535; +} +{ +sub.f16x2 r2541, r1665, r1713; +} +{ +mul.f16x2 r2544, r2541, r2243; +} +{ +sub.f16x2 r2547, r1681, r1697; +} +{ +mul.f16x2 r2550, r2547, r2246; +} +{ +add.f16x2 r2553, r2544, r2550; +} +{ +sub.f16x2 r2556, r2538, r2553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2560, 
{low, high}; +} +{ +neg.f16x2 r2561, r2560; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2564, {low, high}; +} +{ +neg.f16x2 r2565, r2564; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2568, {low, high}; +} +{ +add.f16x2 r2569, r1729, r1777; +} +{ +add.f16x2 r2572, r140, r2569; +} +{ +add.f16x2 r2575, r1745, r1761; +} +{ +add.f16x2 r2578, r2572, r2575; +} +{ +add.f16x2 r2581, r1735, r1783; +} +{ +add.f16x2 r2584, r284, r2581; +} +{ +add.f16x2 r2587, r1751, r1767; +} +{ +add.f16x2 r2590, r2584, r2587; +} +{ +add.f16x2 r2593, r1729, r1777; +} +{ +mul.f16x2 r2596, r2593, r2559; +} +{ +add.f16x2 r2599, r140, r2596; +} +{ +add.f16x2 r2602, r1745, r1761; +} +{ +mul.f16x2 r2605, r2602, r2563; +} +{ +add.f16x2 r2608, r2599, r2605; +} +{ +sub.f16x2 r2611, r1735, r1783; +} +{ +mul.f16x2 r2614, r2611, r2561; +} +{ +sub.f16x2 r2617, r1751, r1767; +} +{ +mul.f16x2 r2620, r2617, r2565; +} +{ +add.f16x2 r2623, r2614, r2620; +} +{ +sub.f16x2 r2626, r2608, r2623; +} +{ +add.f16x2 r2629, r1729, r1777; +} +{ +mul.f16x2 r2632, r2629, r2559; +} +{ +add.f16x2 r2635, r140, r2632; +} +{ +add.f16x2 r2638, r1745, r1761; +} +{ +mul.f16x2 r2641, r2638, r2563; +} +{ +add.f16x2 r2644, r2635, r2641; +} +{ +sub.f16x2 r2647, r1735, r1783; +} +{ +mul.f16x2 r2650, r2647, r2561; +} +{ +sub.f16x2 r2653, r1751, r1767; +} +{ +mul.f16x2 r2656, r2653, r2565; +} +{ +add.f16x2 r2659, r2650, r2656; +} +{ +add.f16x2 r2662, r2644, r2659; +} +{ +add.f16x2 r2665, r1729, r1777; +} +{ +mul.f16x2 r2668, r2665, r2563; +} +{ +add.f16x2 r2671, r140, r2668; +} +{ +add.f16x2 r2674, r1745, r1761; +} +{ +mul.f16x2 r2677, r2674, r2567; +} +{ +add.f16x2 r2680, r2671, r2677; +} +{ +sub.f16x2 r2683, r1735, r1783; 
+} +{ +mul.f16x2 r2686, r2683, r2565; +} +{ +sub.f16x2 r2689, r1751, r1767; +} +{ +mul.f16x2 r2692, r2689, r2568; +} +{ +add.f16x2 r2695, r2686, r2692; +} +{ +sub.f16x2 r2698, r2680, r2695; +} +{ +add.f16x2 r2701, r1729, r1777; +} +{ +mul.f16x2 r2704, r2701, r2563; +} +{ +add.f16x2 r2707, r140, r2704; +} +{ +add.f16x2 r2710, r1745, r1761; +} +{ +mul.f16x2 r2713, r2710, r2567; +} +{ +add.f16x2 r2716, r2707, r2713; +} +{ +sub.f16x2 r2719, r1735, r1783; +} +{ +mul.f16x2 r2722, r2719, r2565; +} +{ +sub.f16x2 r2725, r1751, r1767; +} +{ +mul.f16x2 r2728, r2725, r2568; +} +{ +add.f16x2 r2731, r2722, r2728; +} +{ +add.f16x2 r2734, r2716, r2731; +} +{ +add.f16x2 r2737, r1735, r1783; +} +{ +mul.f16x2 r2740, r2737, r2559; +} +{ +add.f16x2 r2743, r284, r2740; +} +{ +add.f16x2 r2746, r1751, r1767; +} +{ +mul.f16x2 r2749, r2746, r2563; +} +{ +add.f16x2 r2752, r2743, r2749; +} +{ +sub.f16x2 r2755, r1729, r1777; +} +{ +mul.f16x2 r2758, r2755, r2561; +} +{ +sub.f16x2 r2761, r1745, r1761; +} +{ +mul.f16x2 r2764, r2761, r2565; +} +{ +add.f16x2 r2767, r2758, r2764; +} +{ +add.f16x2 r2770, r2752, r2767; +} +{ +add.f16x2 r2773, r1735, r1783; +} +{ +mul.f16x2 r2776, r2773, r2559; +} +{ +add.f16x2 r2779, r284, r2776; +} +{ +add.f16x2 r2782, r1751, r1767; +} +{ +mul.f16x2 r2785, r2782, r2563; +} +{ +add.f16x2 r2788, r2779, r2785; +} +{ +sub.f16x2 r2791, r1729, r1777; +} +{ +mul.f16x2 r2794, r2791, r2561; +} +{ +sub.f16x2 r2797, r1745, r1761; +} +{ +mul.f16x2 r2800, r2797, r2565; +} +{ +add.f16x2 r2803, r2794, r2800; +} +{ +sub.f16x2 r2806, r2788, r2803; +} +{ +add.f16x2 r2809, r1735, r1783; +} +{ +mul.f16x2 r2812, r2809, r2563; +} +{ +add.f16x2 r2815, r284, r2812; +} +{ +add.f16x2 r2818, r1751, r1767; +} +{ +mul.f16x2 r2821, r2818, r2567; +} +{ +add.f16x2 r2824, r2815, r2821; +} +{ +sub.f16x2 r2827, r1729, r1777; +} +{ +mul.f16x2 r2830, r2827, r2565; +} +{ +sub.f16x2 r2833, r1745, r1761; +} +{ +mul.f16x2 r2836, r2833, r2568; +} +{ +add.f16x2 r2839, r2830, r2836; +} +{ +add.f16x2 r2842, 
r2824, r2839; +} +{ +add.f16x2 r2845, r1735, r1783; +} +{ +mul.f16x2 r2848, r2845, r2563; +} +{ +add.f16x2 r2851, r284, r2848; +} +{ +add.f16x2 r2854, r1751, r1767; +} +{ +mul.f16x2 r2857, r2854, r2567; +} +{ +add.f16x2 r2860, r2851, r2857; +} +{ +sub.f16x2 r2863, r1729, r1777; +} +{ +mul.f16x2 r2866, r2863, r2565; +} +{ +sub.f16x2 r2869, r1745, r1761; +} +{ +mul.f16x2 r2872, r2869, r2568; +} +{ +add.f16x2 r2875, r2866, r2872; +} +{ +sub.f16x2 r2878, r2860, r2875; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2882, {low, high}; +} +{ +neg.f16x2 r2883, r2882; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r2885, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r2886, {low, high}; +} +{ +neg.f16x2 r2887, r2886; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r2889, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r2890, {low, high}; +} +{ +add.f16x2 r2891, r1793, r1841; +} +{ +add.f16x2 r2894, r176, r2891; +} +{ +add.f16x2 r2897, r1809, r1825; +} +{ +add.f16x2 r2900, r2894, r2897; +} +{ +add.f16x2 r2903, r1799, r1847; +} +{ +add.f16x2 r2906, r320, r2903; +} +{ +add.f16x2 r2909, r1815, r1831; +} +{ +add.f16x2 r2912, r2906, r2909; +} +{ +add.f16x2 r2915, r1793, r1841; +} +{ +mul.f16x2 r2918, r2915, r2881; +} +{ +add.f16x2 r2921, r176, r2918; +} +{ +add.f16x2 r2924, r1809, r1825; +} +{ +mul.f16x2 r2927, r2924, r2885; +} +{ +add.f16x2 r2930, r2921, r2927; +} +{ +sub.f16x2 r2933, r1799, r1847; +} +{ +mul.f16x2 r2936, r2933, r2883; +} +{ +sub.f16x2 r2939, r1815, r1831; +} +{ +mul.f16x2 r2942, r2939, r2887; +} +{ +add.f16x2 r2945, r2936, r2942; +} +{ +sub.f16x2 r2948, r2930, r2945; +} +{ +add.f16x2 r2951, r1793, r1841; 
+} +{ +mul.f16x2 r2954, r2951, r2881; +} +{ +add.f16x2 r2957, r176, r2954; +} +{ +add.f16x2 r2960, r1809, r1825; +} +{ +mul.f16x2 r2963, r2960, r2885; +} +{ +add.f16x2 r2966, r2957, r2963; +} +{ +sub.f16x2 r2969, r1799, r1847; +} +{ +mul.f16x2 r2972, r2969, r2883; +} +{ +sub.f16x2 r2975, r1815, r1831; +} +{ +mul.f16x2 r2978, r2975, r2887; +} +{ +add.f16x2 r2981, r2972, r2978; +} +{ +add.f16x2 r2984, r2966, r2981; +} +{ +add.f16x2 r2987, r1793, r1841; +} +{ +mul.f16x2 r2990, r2987, r2885; +} +{ +add.f16x2 r2993, r176, r2990; +} +{ +add.f16x2 r2996, r1809, r1825; +} +{ +mul.f16x2 r2999, r2996, r2889; +} +{ +add.f16x2 r3002, r2993, r2999; +} +{ +sub.f16x2 r3005, r1799, r1847; +} +{ +mul.f16x2 r3008, r3005, r2887; +} +{ +sub.f16x2 r3011, r1815, r1831; +} +{ +mul.f16x2 r3014, r3011, r2890; +} +{ +add.f16x2 r3017, r3008, r3014; +} +{ +sub.f16x2 r3020, r3002, r3017; +} +{ +add.f16x2 r3023, r1793, r1841; +} +{ +mul.f16x2 r3026, r3023, r2885; +} +{ +add.f16x2 r3029, r176, r3026; +} +{ +add.f16x2 r3032, r1809, r1825; +} +{ +mul.f16x2 r3035, r3032, r2889; +} +{ +add.f16x2 r3038, r3029, r3035; +} +{ +sub.f16x2 r3041, r1799, r1847; +} +{ +mul.f16x2 r3044, r3041, r2887; +} +{ +sub.f16x2 r3047, r1815, r1831; +} +{ +mul.f16x2 r3050, r3047, r2890; +} +{ +add.f16x2 r3053, r3044, r3050; +} +{ +add.f16x2 r3056, r3038, r3053; +} +{ +add.f16x2 r3059, r1799, r1847; +} +{ +mul.f16x2 r3062, r3059, r2881; +} +{ +add.f16x2 r3065, r320, r3062; +} +{ +add.f16x2 r3068, r1815, r1831; +} +{ +mul.f16x2 r3071, r3068, r2885; +} +{ +add.f16x2 r3074, r3065, r3071; +} +{ +sub.f16x2 r3077, r1793, r1841; +} +{ +mul.f16x2 r3080, r3077, r2883; +} +{ +sub.f16x2 r3083, r1809, r1825; +} +{ +mul.f16x2 r3086, r3083, r2887; +} +{ +add.f16x2 r3089, r3080, r3086; +} +{ +add.f16x2 r3092, r3074, r3089; +} +{ +add.f16x2 r3095, r1799, r1847; +} +{ +mul.f16x2 r3098, r3095, r2881; +} +{ +add.f16x2 r3101, r320, r3098; +} +{ +add.f16x2 r3104, r1815, r1831; +} +{ +mul.f16x2 r3107, r3104, r2885; +} +{ +add.f16x2 r3110, 
r3101, r3107; +} +{ +sub.f16x2 r3113, r1793, r1841; +} +{ +mul.f16x2 r3116, r3113, r2883; +} +{ +sub.f16x2 r3119, r1809, r1825; +} +{ +mul.f16x2 r3122, r3119, r2887; +} +{ +add.f16x2 r3125, r3116, r3122; +} +{ +sub.f16x2 r3128, r3110, r3125; +} +{ +add.f16x2 r3131, r1799, r1847; +} +{ +mul.f16x2 r3134, r3131, r2885; +} +{ +add.f16x2 r3137, r320, r3134; +} +{ +add.f16x2 r3140, r1815, r1831; +} +{ +mul.f16x2 r3143, r3140, r2889; +} +{ +add.f16x2 r3146, r3137, r3143; +} +{ +sub.f16x2 r3149, r1793, r1841; +} +{ +mul.f16x2 r3152, r3149, r2887; +} +{ +sub.f16x2 r3155, r1809, r1825; +} +{ +mul.f16x2 r3158, r3155, r2890; +} +{ +add.f16x2 r3161, r3152, r3158; +} +{ +add.f16x2 r3164, r3146, r3161; +} +{ +add.f16x2 r3167, r1799, r1847; +} +{ +mul.f16x2 r3170, r3167, r2885; +} +{ +add.f16x2 r3173, r320, r3170; +} +{ +add.f16x2 r3176, r1815, r1831; +} +{ +mul.f16x2 r3179, r3176, r2889; +} +{ +add.f16x2 r3182, r3173, r3179; +} +{ +sub.f16x2 r3185, r1793, r1841; +} +{ +mul.f16x2 r3188, r3185, r2887; +} +{ +sub.f16x2 r3191, r1809, r1825; +} +{ +mul.f16x2 r3194, r3191, r2890; +} +{ +add.f16x2 r3197, r3188, r3194; +} +{ +sub.f16x2 r3200, r3182, r3197; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3203, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3204, {low, high}; +} +{ +neg.f16x2 r3205, r3204; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r3207, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r3208, {low, high}; +} +{ +neg.f16x2 r3209, r3208; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r3211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r3212, {low, high}; +} +{ +add.f16x2 r3213, r1857, r1905; +} +{ +add.f16x2 r3216, r104, r3213; +} +{ +add.f16x2 r3219, r1873, r1889; 
+} +{ +add.f16x2 r3222, r3216, r3219; +} +{ +add.f16x2 r3225, r1863, r1911; +} +{ +add.f16x2 r3228, r248, r3225; +} +{ +add.f16x2 r3231, r1879, r1895; +} +{ +add.f16x2 r3234, r3228, r3231; +} +{ +add.f16x2 r3237, r1857, r1905; +} +{ +mul.f16x2 r3240, r3237, r3203; +} +{ +add.f16x2 r3243, r104, r3240; +} +{ +add.f16x2 r3246, r1873, r1889; +} +{ +mul.f16x2 r3249, r3246, r3207; +} +{ +add.f16x2 r3252, r3243, r3249; +} +{ +sub.f16x2 r3255, r1863, r1911; +} +{ +mul.f16x2 r3258, r3255, r3205; +} +{ +sub.f16x2 r3261, r1879, r1895; +} +{ +mul.f16x2 r3264, r3261, r3209; +} +{ +add.f16x2 r3267, r3258, r3264; +} +{ +sub.f16x2 r3270, r3252, r3267; +} +{ +add.f16x2 r3273, r1857, r1905; +} +{ +mul.f16x2 r3276, r3273, r3203; +} +{ +add.f16x2 r3279, r104, r3276; +} +{ +add.f16x2 r3282, r1873, r1889; +} +{ +mul.f16x2 r3285, r3282, r3207; +} +{ +add.f16x2 r3288, r3279, r3285; +} +{ +sub.f16x2 r3291, r1863, r1911; +} +{ +mul.f16x2 r3294, r3291, r3205; +} +{ +sub.f16x2 r3297, r1879, r1895; +} +{ +mul.f16x2 r3300, r3297, r3209; +} +{ +add.f16x2 r3303, r3294, r3300; +} +{ +add.f16x2 r3306, r3288, r3303; +} +{ +add.f16x2 r3309, r1857, r1905; +} +{ +mul.f16x2 r3312, r3309, r3207; +} +{ +add.f16x2 r3315, r104, r3312; +} +{ +add.f16x2 r3318, r1873, r1889; +} +{ +mul.f16x2 r3321, r3318, r3211; +} +{ +add.f16x2 r3324, r3315, r3321; +} +{ +sub.f16x2 r3327, r1863, r1911; +} +{ +mul.f16x2 r3330, r3327, r3209; +} +{ +sub.f16x2 r3333, r1879, r1895; +} +{ +mul.f16x2 r3336, r3333, r3212; +} +{ +add.f16x2 r3339, r3330, r3336; +} +{ +sub.f16x2 r3342, r3324, r3339; +} +{ +add.f16x2 r3345, r1857, r1905; +} +{ +mul.f16x2 r3348, r3345, r3207; +} +{ +add.f16x2 r3351, r104, r3348; +} +{ +add.f16x2 r3354, r1873, r1889; +} +{ +mul.f16x2 r3357, r3354, r3211; +} +{ +add.f16x2 r3360, r3351, r3357; +} +{ +sub.f16x2 r3363, r1863, r1911; +} +{ +mul.f16x2 r3366, r3363, r3209; +} +{ +sub.f16x2 r3369, r1879, r1895; +} +{ +mul.f16x2 r3372, r3369, r3212; +} +{ +add.f16x2 r3375, r3366, r3372; +} +{ +add.f16x2 r3378, 
r3360, r3375; +} +{ +add.f16x2 r3381, r1863, r1911; +} +{ +mul.f16x2 r3384, r3381, r3203; +} +{ +add.f16x2 r3387, r248, r3384; +} +{ +add.f16x2 r3390, r1879, r1895; +} +{ +mul.f16x2 r3393, r3390, r3207; +} +{ +add.f16x2 r3396, r3387, r3393; +} +{ +sub.f16x2 r3399, r1857, r1905; +} +{ +mul.f16x2 r3402, r3399, r3205; +} +{ +sub.f16x2 r3405, r1873, r1889; +} +{ +mul.f16x2 r3408, r3405, r3209; +} +{ +add.f16x2 r3411, r3402, r3408; +} +{ +add.f16x2 r3414, r3396, r3411; +} +{ +add.f16x2 r3417, r1863, r1911; +} +{ +mul.f16x2 r3420, r3417, r3203; +} +{ +add.f16x2 r3423, r248, r3420; +} +{ +add.f16x2 r3426, r1879, r1895; +} +{ +mul.f16x2 r3429, r3426, r3207; +} +{ +add.f16x2 r3432, r3423, r3429; +} +{ +sub.f16x2 r3435, r1857, r1905; +} +{ +mul.f16x2 r3438, r3435, r3205; +} +{ +sub.f16x2 r3441, r1873, r1889; +} +{ +mul.f16x2 r3444, r3441, r3209; +} +{ +add.f16x2 r3447, r3438, r3444; +} +{ +sub.f16x2 r3450, r3432, r3447; +} +{ +add.f16x2 r3453, r1863, r1911; +} +{ +mul.f16x2 r3456, r3453, r3207; +} +{ +add.f16x2 r3459, r248, r3456; +} +{ +add.f16x2 r3462, r1879, r1895; +} +{ +mul.f16x2 r3465, r3462, r3211; +} +{ +add.f16x2 r3468, r3459, r3465; +} +{ +sub.f16x2 r3471, r1857, r1905; +} +{ +mul.f16x2 r3474, r3471, r3209; +} +{ +sub.f16x2 r3477, r1873, r1889; +} +{ +mul.f16x2 r3480, r3477, r3212; +} +{ +add.f16x2 r3483, r3474, r3480; +} +{ +add.f16x2 r3486, r3468, r3483; +} +{ +add.f16x2 r3489, r1863, r1911; +} +{ +mul.f16x2 r3492, r3489, r3207; +} +{ +add.f16x2 r3495, r248, r3492; +} +{ +add.f16x2 r3498, r1879, r1895; +} +{ +mul.f16x2 r3501, r3498, r3211; +} +{ +add.f16x2 r3504, r3495, r3501; +} +{ +sub.f16x2 r3507, r1857, r1905; +} +{ +mul.f16x2 r3510, r3507, r3209; +} +{ +sub.f16x2 r3513, r1873, r1889; +} +{ +mul.f16x2 r3516, r3513, r3212; +} +{ +add.f16x2 r3519, r3510, r3516; +} +{ +sub.f16x2 r3522, r3504, r3519; +} +mul.wide.u32 rd2, r7943, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r7944, rd3; +mul.lo.s32 r7945, r7944, 25; +sub.s32 r7946, r7943, r7945; +mad.lo.s32 
r7947, r7944, 2500, r7942; +cvt.rn.f32.u32 f485, r7946; +mul.f32 f486, f485, 0f3C24B5BE; +cos.approx.f32 f217, f486; +sin.approx.f32 f487, f486; +neg.f32 f218, f487; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f217; +cvt.rn.f16.f32 high, f218; +mov.b32 r3525, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3528, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3530, {high, high}; +} +{ +mul.f16x2 r3532, r2268, r3530; +} +{ +fma.rn.f16x2 r3535, r2256, r3528, r3532; +} +{ +mul.f16x2 r3539, r2256, r3530; +} +{ +neg.f16x2 r3542, r3539; +} +{ +fma.rn.f16x2 r3544, r2268, r3528, r3542; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3548, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3550, {high, high}; +} +mov.f32 f265, 0fBF800000; +mov.f32 f266, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3552, {low, high}; +} +{ +mul.f16x2 r3553, r3550, r3552; +} +{ +mul.f16x2 r3556, r3525, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3559, {high, low}; +} +{ +fma.rn.f16x2 r3561, r3553, r3559, r3556; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3565, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 r3567, {high, high}; +} +{ +mul.f16x2 r3569, r2590, r3567; +} +{ +fma.rn.f16x2 r3572, r2578, r3565, r3569; +} +{ +mul.f16x2 r3576, r2578, r3567; +} +{ +neg.f16x2 r3579, r3576; +} +{ +fma.rn.f16x2 r3581, r2590, r3565, r3579; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3585, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3587, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3589, {low, high}; +} +{ +mul.f16x2 r3590, r3587, r3589; +} +{ +mul.f16x2 r3593, r3561, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3561; +mov.b32 
r3596, {high, low}; +} +{ +fma.rn.f16x2 r3598, r3590, r3596, r3593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3602, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3604, {high, high}; +} +{ +mul.f16x2 r3606, r2912, r3604; +} +{ +fma.rn.f16x2 r3609, r2900, r3602, r3606; +} +{ +mul.f16x2 r3613, r2900, r3604; +} +{ +neg.f16x2 r3616, r3613; +} +{ +fma.rn.f16x2 r3618, r2912, r3602, r3616; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3622, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3624, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3626, {low, high}; +} +{ +mul.f16x2 r3627, r3624, r3626; +} +{ +mul.f16x2 r3630, r3598, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3598; +mov.b32 r3633, {high, low}; +} +{ +fma.rn.f16x2 r3635, r3627, r3633, r3630; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3639, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3641, {high, high}; +} +{ +mul.f16x2 r3643, r3234, r3641; +} +{ +fma.rn.f16x2 r3646, r3222, r3639, r3643; +} +{ +mul.f16x2 r3650, r3222, r3641; +} +{ +neg.f16x2 r3653, r3650; +} +{ +fma.rn.f16x2 r3655, r3234, r3639, r3653; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3659, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3661, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3663, {low, high}; +} +{ +mul.f16x2 r3664, r3661, r3663; +} +{ +mul.f16x2 r3667, r3635, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3635; +mov.b32 r3670, {high, low}; +} +{ +fma.rn.f16x2 r3672, r3664, r3670, r3667; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3676, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3678, {high, high}; +} +{ +mul.f16x2 
r3680, r2126, r3678; +} +{ +fma.rn.f16x2 r3683, r1982, r3676, r3680; +} +{ +mul.f16x2 r3687, r1982, r3678; +} +{ +neg.f16x2 r3690, r3687; +} +{ +fma.rn.f16x2 r3692, r2126, r3676, r3690; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3696, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3698, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3700, {low, high}; +} +{ +mul.f16x2 r3701, r3698, r3700; +} +{ +mul.f16x2 r3704, r3672, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3672; +mov.b32 r3707, {high, low}; +} +{ +fma.rn.f16x2 r3709, r3701, r3707, r3704; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3713, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3715, {high, high}; +} +{ +mul.f16x2 r3717, r2448, r3715; +} +{ +fma.rn.f16x2 r3720, r2304, r3713, r3717; +} +{ +mul.f16x2 r3724, r2304, r3715; +} +{ +neg.f16x2 r3727, r3724; +} +{ +fma.rn.f16x2 r3729, r2448, r3713, r3727; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3733, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3735, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3737, {low, high}; +} +{ +mul.f16x2 r3738, r3735, r3737; +} +{ +mul.f16x2 r3741, r3709, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3709; +mov.b32 r3744, {high, low}; +} +{ +fma.rn.f16x2 r3746, r3738, r3744, r3741; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3750, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3752, {high, high}; +} +{ +mul.f16x2 r3754, r2770, r3752; +} +{ +fma.rn.f16x2 r3757, r2626, r3750, r3754; +} +{ +mul.f16x2 r3761, r2626, r3752; +} +{ +neg.f16x2 r3764, r3761; +} +{ +fma.rn.f16x2 r3766, r2770, r3750, r3764; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3770, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3772, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3774, {low, high}; +} +{ +mul.f16x2 r3775, r3772, r3774; +} +{ +mul.f16x2 r3778, r3746, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3746; +mov.b32 r3781, {high, low}; +} +{ +fma.rn.f16x2 r3783, r3775, r3781, r3778; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3787, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3789, {high, high}; +} +{ +mul.f16x2 r3791, r3092, r3789; +} +{ +fma.rn.f16x2 r3794, r2948, r3787, r3791; +} +{ +mul.f16x2 r3798, r2948, r3789; +} +{ +neg.f16x2 r3801, r3798; +} +{ +fma.rn.f16x2 r3803, r3092, r3787, r3801; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3807, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3809, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3811, {low, high}; +} +{ +mul.f16x2 r3812, r3809, r3811; +} +{ +mul.f16x2 r3815, r3783, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3783; +mov.b32 r3818, {high, low}; +} +{ +fma.rn.f16x2 r3820, r3812, r3818, r3815; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3824, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3826, {high, high}; +} +{ +mul.f16x2 r3828, r3414, r3826; +} +{ +fma.rn.f16x2 r3831, r3270, r3824, r3828; +} +{ +mul.f16x2 r3835, r3270, r3826; +} +{ +neg.f16x2 r3838, r3835; +} +{ +fma.rn.f16x2 r3840, r3414, r3824, r3838; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3844, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3846, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3848, {low, high}; +} +{ +mul.f16x2 r3849, r3846, r3848; +} +{ 
+mul.f16x2 r3852, r3820, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3820; +mov.b32 r3855, {high, low}; +} +{ +fma.rn.f16x2 r3857, r3849, r3855, r3852; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3861, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3863, {high, high}; +} +{ +mul.f16x2 r3865, r2198, r3863; +} +{ +fma.rn.f16x2 r3868, r2054, r3861, r3865; +} +{ +mul.f16x2 r3872, r2054, r3863; +} +{ +neg.f16x2 r3875, r3872; +} +{ +fma.rn.f16x2 r3877, r2198, r3861, r3875; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3881, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3883, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3885, {low, high}; +} +{ +mul.f16x2 r3886, r3883, r3885; +} +{ +mul.f16x2 r3889, r3857, r3881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3857; +mov.b32 r3892, {high, low}; +} +{ +fma.rn.f16x2 r3894, r3886, r3892, r3889; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3898, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3900, {high, high}; +} +{ +mul.f16x2 r3902, r2520, r3900; +} +{ +fma.rn.f16x2 r3905, r2376, r3898, r3902; +} +{ +mul.f16x2 r3909, r2376, r3900; +} +{ +neg.f16x2 r3912, r3909; +} +{ +fma.rn.f16x2 r3914, r2520, r3898, r3912; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3918, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3920, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3922, {low, high}; +} +{ +mul.f16x2 r3923, r3920, r3922; +} +{ +mul.f16x2 r3926, r3894, r3918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3894; +mov.b32 r3929, {high, low}; +} +{ +fma.rn.f16x2 r3931, r3923, r3929, r3926; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3935, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3937, {high, high}; +} +{ +mul.f16x2 r3939, r2842, r3937; +} +{ +fma.rn.f16x2 r3942, r2698, r3935, r3939; +} +{ +mul.f16x2 r3946, r2698, r3937; +} +{ +neg.f16x2 r3949, r3946; +} +{ +fma.rn.f16x2 r3951, r2842, r3935, r3949; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3955, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3957, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3959, {low, high}; +} +{ +mul.f16x2 r3960, r3957, r3959; +} +{ +mul.f16x2 r3963, r3931, r3955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3931; +mov.b32 r3966, {high, low}; +} +{ +fma.rn.f16x2 r3968, r3960, r3966, r3963; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3972, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r3974, {high, high}; +} +{ +mul.f16x2 r3976, r3164, r3974; +} +{ +fma.rn.f16x2 r3979, r3020, r3972, r3976; +} +{ +mul.f16x2 r3983, r3020, r3974; +} +{ +neg.f16x2 r3986, r3983; +} +{ +fma.rn.f16x2 r3988, r3164, r3972, r3986; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3992, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r3994, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r3996, {low, high}; +} +{ +mul.f16x2 r3997, r3994, r3996; +} +{ +mul.f16x2 r4000, r3968, r3992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3968; +mov.b32 r4003, {high, low}; +} +{ +fma.rn.f16x2 r4005, r3997, r4003, r4000; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4009, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4011, {high, high}; +} +{ +mul.f16x2 r4013, r3486, r4011; +} +{ +fma.rn.f16x2 r4016, r3342, r4009, r4013; +} +{ +mul.f16x2 r4020, r3342, r4011; +} +{ +neg.f16x2 r4023, r4020; +} +{ +fma.rn.f16x2 
r4025, r3486, r4009, r4023; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4029, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4031, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4033, {low, high}; +} +{ +mul.f16x2 r4034, r4031, r4033; +} +{ +mul.f16x2 r4037, r4005, r4029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4005; +mov.b32 r4040, {high, low}; +} +{ +fma.rn.f16x2 r4042, r4034, r4040, r4037; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4046, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4048, {high, high}; +} +{ +mul.f16x2 r4050, r2234, r4048; +} +{ +fma.rn.f16x2 r4053, r2090, r4046, r4050; +} +{ +mul.f16x2 r4057, r2090, r4048; +} +{ +neg.f16x2 r4060, r4057; +} +{ +fma.rn.f16x2 r4062, r2234, r4046, r4060; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4066, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4068, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4070, {low, high}; +} +{ +mul.f16x2 r4071, r4068, r4070; +} +{ +mul.f16x2 r4074, r4042, r4066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4042; +mov.b32 r4077, {high, low}; +} +{ +fma.rn.f16x2 r4079, r4071, r4077, r4074; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4083, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4085, {high, high}; +} +{ +mul.f16x2 r4087, r2556, r4085; +} +{ +fma.rn.f16x2 r4090, r2412, r4083, r4087; +} +{ +mul.f16x2 r4094, r2412, r4085; +} +{ +neg.f16x2 r4097, r4094; +} +{ +fma.rn.f16x2 r4099, r2556, r4083, r4097; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4103, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4105, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; 
+cvt.rn.f16.f32 high, f266; +mov.b32 r4107, {low, high}; +} +{ +mul.f16x2 r4108, r4105, r4107; +} +{ +mul.f16x2 r4111, r4079, r4103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4079; +mov.b32 r4114, {high, low}; +} +{ +fma.rn.f16x2 r4116, r4108, r4114, r4111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4122, {high, high}; +} +{ +mul.f16x2 r4124, r2878, r4122; +} +{ +fma.rn.f16x2 r4127, r2734, r4120, r4124; +} +{ +mul.f16x2 r4131, r2734, r4122; +} +{ +neg.f16x2 r4134, r4131; +} +{ +fma.rn.f16x2 r4136, r2878, r4120, r4134; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4140, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4142, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4144, {low, high}; +} +{ +mul.f16x2 r4145, r4142, r4144; +} +{ +mul.f16x2 r4148, r4116, r4140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4116; +mov.b32 r4151, {high, low}; +} +{ +fma.rn.f16x2 r4153, r4145, r4151, r4148; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4157, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4159, {high, high}; +} +{ +mul.f16x2 r4161, r3200, r4159; +} +{ +fma.rn.f16x2 r4164, r3056, r4157, r4161; +} +{ +mul.f16x2 r4168, r3056, r4159; +} +{ +neg.f16x2 r4171, r4168; +} +{ +fma.rn.f16x2 r4173, r3200, r4157, r4171; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4177, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4179, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4181, {low, high}; +} +{ +mul.f16x2 r4182, r4179, r4181; +} +{ +mul.f16x2 r4185, r4153, r4177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4153; +mov.b32 r4188, {high, low}; +} +{ +fma.rn.f16x2 r4190, r4182, r4188, 
r4185; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4194, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4196, {high, high}; +} +{ +mul.f16x2 r4198, r3522, r4196; +} +{ +fma.rn.f16x2 r4201, r3378, r4194, r4198; +} +{ +mul.f16x2 r4205, r3378, r4196; +} +{ +neg.f16x2 r4208, r4205; +} +{ +fma.rn.f16x2 r4210, r3522, r4194, r4208; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4214, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4216, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4218, {low, high}; +} +{ +mul.f16x2 r4219, r4216, r4218; +} +{ +mul.f16x2 r4222, r4190, r4214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4190; +mov.b32 r4225, {high, low}; +} +{ +fma.rn.f16x2 r4227, r4219, r4225, r4222; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4231, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4233, {high, high}; +} +{ +mul.f16x2 r4235, r2162, r4233; +} +{ +fma.rn.f16x2 r4238, r2018, r4231, r4235; +} +{ +mul.f16x2 r4242, r2018, r4233; +} +{ +neg.f16x2 r4245, r4242; +} +{ +fma.rn.f16x2 r4247, r2162, r4231, r4245; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4251, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4253, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4255, {low, high}; +} +{ +mul.f16x2 r4256, r4253, r4255; +} +{ +mul.f16x2 r4259, r4227, r4251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4227; +mov.b32 r4262, {high, low}; +} +{ +fma.rn.f16x2 r4264, r4256, r4262, r4259; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4268, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4270, {high, high}; +} +{ +mul.f16x2 r4272, r2484, r4270; +} +{ +fma.rn.f16x2 r4275, r2340, r4268, 
r4272; +} +{ +mul.f16x2 r4279, r2340, r4270; +} +{ +neg.f16x2 r4282, r4279; +} +{ +fma.rn.f16x2 r4284, r2484, r4268, r4282; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4288, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4290, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4292, {low, high}; +} +{ +mul.f16x2 r4293, r4290, r4292; +} +{ +mul.f16x2 r4296, r4264, r4288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4264; +mov.b32 r4299, {high, low}; +} +{ +fma.rn.f16x2 r4301, r4293, r4299, r4296; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4305, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4307, {high, high}; +} +{ +mul.f16x2 r4309, r2806, r4307; +} +{ +fma.rn.f16x2 r4312, r2662, r4305, r4309; +} +{ +mul.f16x2 r4316, r2662, r4307; +} +{ +neg.f16x2 r4319, r4316; +} +{ +fma.rn.f16x2 r4321, r2806, r4305, r4319; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4325, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4327, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4329, {low, high}; +} +{ +mul.f16x2 r4330, r4327, r4329; +} +{ +mul.f16x2 r4333, r4301, r4325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4301; +mov.b32 r4336, {high, low}; +} +{ +fma.rn.f16x2 r4338, r4330, r4336, r4333; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4342, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4344, {high, high}; +} +{ +mul.f16x2 r4346, r3128, r4344; +} +{ +fma.rn.f16x2 r4349, r2984, r4342, r4346; +} +{ +mul.f16x2 r4353, r2984, r4344; +} +{ +neg.f16x2 r4356, r4353; +} +{ +fma.rn.f16x2 r4358, r3128, r4342, r4356; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3525; +mov.b32 r4362, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r3525; +mov.b32 r4364, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f265; +cvt.rn.f16.f32 high, f266; +mov.b32 r4366, {low, high}; +} +{ +mul.f16x2 r4367, r4364, r4366; +} +{ +mul.f16x2 r4370, r4338, r4362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4338; +mov.b32 r4373, {high, low}; +} +{ +fma.rn.f16x2 r4375, r4367, r4373, r4370; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4379, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4375; +mov.b32 r4381, {high, high}; +} +{ +mul.f16x2 r4383, r3450, r4381; +} +{ +fma.rn.f16x2 r4386, r3306, r4379, r4383; +} +{ +mul.f16x2 r4390, r3306, r4381; +} +{ +neg.f16x2 r4393, r4390; +} +{ +fma.rn.f16x2 r4395, r3450, r4379, r4393; +} +barrier.sync 0; +mad.lo.s32 r7948, r7946, 100, r7947; +st.shared.u32 [r7948], r1934; +st.shared.u32 [r7948+4], r3535; +st.shared.u32 [r7948+8], r3572; +st.shared.u32 [r7948+12], r3609; +st.shared.u32 [r7948+16], r3646; +st.shared.u32 [r7948+20], r3683; +st.shared.u32 [r7948+24], r3720; +st.shared.u32 [r7948+28], r3757; +st.shared.u32 [r7948+32], r3794; +st.shared.u32 [r7948+36], r3831; +st.shared.u32 [r7948+40], r3868; +st.shared.u32 [r7948+44], r3905; +st.shared.u32 [r7948+48], r3942; +st.shared.u32 [r7948+52], r3979; +st.shared.u32 [r7948+56], r4016; +st.shared.u32 [r7948+60], r4053; +st.shared.u32 [r7948+64], r4090; +st.shared.u32 [r7948+68], r4127; +st.shared.u32 [r7948+72], r4164; +st.shared.u32 [r7948+76], r4201; +st.shared.u32 [r7948+80], r4238; +st.shared.u32 [r7948+84], r4275; +st.shared.u32 [r7948+88], r4312; +st.shared.u32 [r7948+92], r4349; +st.shared.u32 [r7948+96], r4386; +barrier.sync 0; +mad.lo.s32 r7949, r7946, -96, r7948; +ld.shared.u32 r4430, [r7949]; +ld.shared.u32 r4752, [r7949+100]; +ld.shared.u32 r5074, [r7949+200]; +ld.shared.u32 r5396, [r7949+300]; +ld.shared.u32 r5718, [r7949+400]; +ld.shared.u32 r4427, [r7949+500]; +ld.shared.u32 r4749, [r7949+600]; +ld.shared.u32 r5071, [r7949+700]; +ld.shared.u32 r5393, 
[r7949+800]; +ld.shared.u32 r5715, [r7949+900]; +ld.shared.u32 r4433, [r7949+1000]; +ld.shared.u32 r4755, [r7949+1100]; +ld.shared.u32 r5077, [r7949+1200]; +ld.shared.u32 r5399, [r7949+1300]; +ld.shared.u32 r5721, [r7949+1400]; +ld.shared.u32 r4434, [r7949+1500]; +ld.shared.u32 r4756, [r7949+1600]; +ld.shared.u32 r5078, [r7949+1700]; +ld.shared.u32 r5400, [r7949+1800]; +ld.shared.u32 r5722, [r7949+1900]; +ld.shared.u32 r4428, [r7949+2000]; +ld.shared.u32 r4750, [r7949+2100]; +ld.shared.u32 r5072, [r7949+2200]; +ld.shared.u32 r5394, [r7949+2300]; +ld.shared.u32 r5716, [r7949+2400]; +barrier.sync 0; +st.shared.u32 [r7948], r1946; +st.shared.u32 [r7948+4], r3544; +st.shared.u32 [r7948+8], r3581; +st.shared.u32 [r7948+12], r3618; +st.shared.u32 [r7948+16], r3655; +st.shared.u32 [r7948+20], r3692; +st.shared.u32 [r7948+24], r3729; +st.shared.u32 [r7948+28], r3766; +st.shared.u32 [r7948+32], r3803; +st.shared.u32 [r7948+36], r3840; +st.shared.u32 [r7948+40], r3877; +st.shared.u32 [r7948+44], r3914; +st.shared.u32 [r7948+48], r3951; +st.shared.u32 [r7948+52], r3988; +st.shared.u32 [r7948+56], r4025; +st.shared.u32 [r7948+60], r4062; +st.shared.u32 [r7948+64], r4099; +st.shared.u32 [r7948+68], r4136; +st.shared.u32 [r7948+72], r4173; +st.shared.u32 [r7948+76], r4210; +st.shared.u32 [r7948+80], r4247; +st.shared.u32 [r7948+84], r4284; +st.shared.u32 [r7948+88], r4321; +st.shared.u32 [r7948+92], r4358; +st.shared.u32 [r7948+96], r4395; +barrier.sync 0; +ld.shared.u32 r4442, [r7949]; +ld.shared.u32 r4764, [r7949+100]; +ld.shared.u32 r5086, [r7949+200]; +ld.shared.u32 r5408, [r7949+300]; +ld.shared.u32 r5730, [r7949+400]; +ld.shared.u32 r4439, [r7949+500]; +ld.shared.u32 r4761, [r7949+600]; +ld.shared.u32 r5083, [r7949+700]; +ld.shared.u32 r5405, [r7949+800]; +ld.shared.u32 r5727, [r7949+900]; +ld.shared.u32 r4445, [r7949+1000]; +ld.shared.u32 r4767, [r7949+1100]; +ld.shared.u32 r5089, [r7949+1200]; +ld.shared.u32 r5411, [r7949+1300]; +ld.shared.u32 r5733, [r7949+1400]; 
+ld.shared.u32 r4446, [r7949+1500]; +ld.shared.u32 r4768, [r7949+1600]; +ld.shared.u32 r5090, [r7949+1700]; +ld.shared.u32 r5412, [r7949+1800]; +ld.shared.u32 r5734, [r7949+1900]; +ld.shared.u32 r4440, [r7949+2000]; +ld.shared.u32 r4762, [r7949+2100]; +ld.shared.u32 r5084, [r7949+2200]; +ld.shared.u32 r5406, [r7949+2300]; +ld.shared.u32 r5728, [r7949+2400]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4416, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4417, {low, high}; +} +{ +neg.f16x2 r4418, r4417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4421, {low, high}; +} +{ +neg.f16x2 r4422, r4421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4425, {low, high}; +} +{ +add.f16x2 r4426, r4427, r4428; +} +{ +add.f16x2 r4429, r4430, r4426; +} +{ +add.f16x2 r4432, r4433, r4434; +} +{ +add.f16x2 r4435, r4429, r4432; +} +{ +add.f16x2 r4438, r4439, r4440; +} +{ +add.f16x2 r4441, r4442, r4438; +} +{ +add.f16x2 r4444, r4445, r4446; +} +{ +add.f16x2 r4447, r4441, r4444; +} +{ +add.f16x2 r4450, r4427, r4428; +} +{ +mul.f16x2 r4453, r4450, r4416; +} +{ +add.f16x2 r4456, r4430, r4453; +} +{ +add.f16x2 r4459, r4433, r4434; +} +{ +mul.f16x2 r4462, r4459, r4420; +} +{ +add.f16x2 r4465, r4456, r4462; +} +{ +sub.f16x2 r4468, r4439, r4440; +} +{ +mul.f16x2 r4471, r4468, r4418; +} +{ +sub.f16x2 r4474, r4445, r4446; +} +{ +mul.f16x2 r4477, r4474, r4422; +} +{ +add.f16x2 r4480, r4471, r4477; +} +{ +sub.f16x2 r4483, r4465, r4480; +} +{ +add.f16x2 r4486, r4427, r4428; +} +{ +mul.f16x2 r4489, r4486, r4416; +} +{ +add.f16x2 r4492, r4430, r4489; +} +{ +add.f16x2 r4495, r4433, 
r4434; +} +{ +mul.f16x2 r4498, r4495, r4420; +} +{ +add.f16x2 r4501, r4492, r4498; +} +{ +sub.f16x2 r4504, r4439, r4440; +} +{ +mul.f16x2 r4507, r4504, r4418; +} +{ +sub.f16x2 r4510, r4445, r4446; +} +{ +mul.f16x2 r4513, r4510, r4422; +} +{ +add.f16x2 r4516, r4507, r4513; +} +{ +add.f16x2 r4519, r4501, r4516; +} +{ +add.f16x2 r4522, r4427, r4428; +} +{ +mul.f16x2 r4525, r4522, r4420; +} +{ +add.f16x2 r4528, r4430, r4525; +} +{ +add.f16x2 r4531, r4433, r4434; +} +{ +mul.f16x2 r4534, r4531, r4424; +} +{ +add.f16x2 r4537, r4528, r4534; +} +{ +sub.f16x2 r4540, r4439, r4440; +} +{ +mul.f16x2 r4543, r4540, r4422; +} +{ +sub.f16x2 r4546, r4445, r4446; +} +{ +mul.f16x2 r4549, r4546, r4425; +} +{ +add.f16x2 r4552, r4543, r4549; +} +{ +sub.f16x2 r4555, r4537, r4552; +} +{ +add.f16x2 r4558, r4427, r4428; +} +{ +mul.f16x2 r4561, r4558, r4420; +} +{ +add.f16x2 r4564, r4430, r4561; +} +{ +add.f16x2 r4567, r4433, r4434; +} +{ +mul.f16x2 r4570, r4567, r4424; +} +{ +add.f16x2 r4573, r4564, r4570; +} +{ +sub.f16x2 r4576, r4439, r4440; +} +{ +mul.f16x2 r4579, r4576, r4422; +} +{ +sub.f16x2 r4582, r4445, r4446; +} +{ +mul.f16x2 r4585, r4582, r4425; +} +{ +add.f16x2 r4588, r4579, r4585; +} +{ +add.f16x2 r4591, r4573, r4588; +} +{ +add.f16x2 r4594, r4439, r4440; +} +{ +mul.f16x2 r4597, r4594, r4416; +} +{ +add.f16x2 r4600, r4442, r4597; +} +{ +add.f16x2 r4603, r4445, r4446; +} +{ +mul.f16x2 r4606, r4603, r4420; +} +{ +add.f16x2 r4609, r4600, r4606; +} +{ +sub.f16x2 r4612, r4427, r4428; +} +{ +mul.f16x2 r4615, r4612, r4418; +} +{ +sub.f16x2 r4618, r4433, r4434; +} +{ +mul.f16x2 r4621, r4618, r4422; +} +{ +add.f16x2 r4624, r4615, r4621; +} +{ +add.f16x2 r4627, r4609, r4624; +} +{ +add.f16x2 r4630, r4439, r4440; +} +{ +mul.f16x2 r4633, r4630, r4416; +} +{ +add.f16x2 r4636, r4442, r4633; +} +{ +add.f16x2 r4639, r4445, r4446; +} +{ +mul.f16x2 r4642, r4639, r4420; +} +{ +add.f16x2 r4645, r4636, r4642; +} +{ +sub.f16x2 r4648, r4427, r4428; +} +{ +mul.f16x2 r4651, r4648, r4418; +} +{ +sub.f16x2 
r4654, r4433, r4434; +} +{ +mul.f16x2 r4657, r4654, r4422; +} +{ +add.f16x2 r4660, r4651, r4657; +} +{ +sub.f16x2 r4663, r4645, r4660; +} +{ +add.f16x2 r4666, r4439, r4440; +} +{ +mul.f16x2 r4669, r4666, r4420; +} +{ +add.f16x2 r4672, r4442, r4669; +} +{ +add.f16x2 r4675, r4445, r4446; +} +{ +mul.f16x2 r4678, r4675, r4424; +} +{ +add.f16x2 r4681, r4672, r4678; +} +{ +sub.f16x2 r4684, r4427, r4428; +} +{ +mul.f16x2 r4687, r4684, r4422; +} +{ +sub.f16x2 r4690, r4433, r4434; +} +{ +mul.f16x2 r4693, r4690, r4425; +} +{ +add.f16x2 r4696, r4687, r4693; +} +{ +add.f16x2 r4699, r4681, r4696; +} +{ +add.f16x2 r4702, r4439, r4440; +} +{ +mul.f16x2 r4705, r4702, r4420; +} +{ +add.f16x2 r4708, r4442, r4705; +} +{ +add.f16x2 r4711, r4445, r4446; +} +{ +mul.f16x2 r4714, r4711, r4424; +} +{ +add.f16x2 r4717, r4708, r4714; +} +{ +sub.f16x2 r4720, r4427, r4428; +} +{ +mul.f16x2 r4723, r4720, r4422; +} +{ +sub.f16x2 r4726, r4433, r4434; +} +{ +mul.f16x2 r4729, r4726, r4425; +} +{ +add.f16x2 r4732, r4723, r4729; +} +{ +sub.f16x2 r4735, r4717, r4732; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4738, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4739, {low, high}; +} +{ +neg.f16x2 r4740, r4739; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r4742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r4743, {low, high}; +} +{ +neg.f16x2 r4744, r4743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r4746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r4747, {low, high}; +} +{ +add.f16x2 r4748, r4749, r4750; +} +{ +add.f16x2 r4751, r4752, r4748; +} +{ +add.f16x2 r4754, r4755, r4756; +} +{ +add.f16x2 r4757, r4751, r4754; +} +{ +add.f16x2 r4760, r4761, r4762; +} +{ +add.f16x2 r4763, 
r4764, r4760; +} +{ +add.f16x2 r4766, r4767, r4768; +} +{ +add.f16x2 r4769, r4763, r4766; +} +{ +add.f16x2 r4772, r4749, r4750; +} +{ +mul.f16x2 r4775, r4772, r4738; +} +{ +add.f16x2 r4778, r4752, r4775; +} +{ +add.f16x2 r4781, r4755, r4756; +} +{ +mul.f16x2 r4784, r4781, r4742; +} +{ +add.f16x2 r4787, r4778, r4784; +} +{ +sub.f16x2 r4790, r4761, r4762; +} +{ +mul.f16x2 r4793, r4790, r4740; +} +{ +sub.f16x2 r4796, r4767, r4768; +} +{ +mul.f16x2 r4799, r4796, r4744; +} +{ +add.f16x2 r4802, r4793, r4799; +} +{ +sub.f16x2 r4805, r4787, r4802; +} +{ +add.f16x2 r4808, r4749, r4750; +} +{ +mul.f16x2 r4811, r4808, r4738; +} +{ +add.f16x2 r4814, r4752, r4811; +} +{ +add.f16x2 r4817, r4755, r4756; +} +{ +mul.f16x2 r4820, r4817, r4742; +} +{ +add.f16x2 r4823, r4814, r4820; +} +{ +sub.f16x2 r4826, r4761, r4762; +} +{ +mul.f16x2 r4829, r4826, r4740; +} +{ +sub.f16x2 r4832, r4767, r4768; +} +{ +mul.f16x2 r4835, r4832, r4744; +} +{ +add.f16x2 r4838, r4829, r4835; +} +{ +add.f16x2 r4841, r4823, r4838; +} +{ +add.f16x2 r4844, r4749, r4750; +} +{ +mul.f16x2 r4847, r4844, r4742; +} +{ +add.f16x2 r4850, r4752, r4847; +} +{ +add.f16x2 r4853, r4755, r4756; +} +{ +mul.f16x2 r4856, r4853, r4746; +} +{ +add.f16x2 r4859, r4850, r4856; +} +{ +sub.f16x2 r4862, r4761, r4762; +} +{ +mul.f16x2 r4865, r4862, r4744; +} +{ +sub.f16x2 r4868, r4767, r4768; +} +{ +mul.f16x2 r4871, r4868, r4747; +} +{ +add.f16x2 r4874, r4865, r4871; +} +{ +sub.f16x2 r4877, r4859, r4874; +} +{ +add.f16x2 r4880, r4749, r4750; +} +{ +mul.f16x2 r4883, r4880, r4742; +} +{ +add.f16x2 r4886, r4752, r4883; +} +{ +add.f16x2 r4889, r4755, r4756; +} +{ +mul.f16x2 r4892, r4889, r4746; +} +{ +add.f16x2 r4895, r4886, r4892; +} +{ +sub.f16x2 r4898, r4761, r4762; +} +{ +mul.f16x2 r4901, r4898, r4744; +} +{ +sub.f16x2 r4904, r4767, r4768; +} +{ +mul.f16x2 r4907, r4904, r4747; +} +{ +add.f16x2 r4910, r4901, r4907; +} +{ +add.f16x2 r4913, r4895, r4910; +} +{ +add.f16x2 r4916, r4761, r4762; +} +{ +mul.f16x2 r4919, r4916, r4738; +} +{ 
+add.f16x2 r4922, r4764, r4919; +} +{ +add.f16x2 r4925, r4767, r4768; +} +{ +mul.f16x2 r4928, r4925, r4742; +} +{ +add.f16x2 r4931, r4922, r4928; +} +{ +sub.f16x2 r4934, r4749, r4750; +} +{ +mul.f16x2 r4937, r4934, r4740; +} +{ +sub.f16x2 r4940, r4755, r4756; +} +{ +mul.f16x2 r4943, r4940, r4744; +} +{ +add.f16x2 r4946, r4937, r4943; +} +{ +add.f16x2 r4949, r4931, r4946; +} +{ +add.f16x2 r4952, r4761, r4762; +} +{ +mul.f16x2 r4955, r4952, r4738; +} +{ +add.f16x2 r4958, r4764, r4955; +} +{ +add.f16x2 r4961, r4767, r4768; +} +{ +mul.f16x2 r4964, r4961, r4742; +} +{ +add.f16x2 r4967, r4958, r4964; +} +{ +sub.f16x2 r4970, r4749, r4750; +} +{ +mul.f16x2 r4973, r4970, r4740; +} +{ +sub.f16x2 r4976, r4755, r4756; +} +{ +mul.f16x2 r4979, r4976, r4744; +} +{ +add.f16x2 r4982, r4973, r4979; +} +{ +sub.f16x2 r4985, r4967, r4982; +} +{ +add.f16x2 r4988, r4761, r4762; +} +{ +mul.f16x2 r4991, r4988, r4742; +} +{ +add.f16x2 r4994, r4764, r4991; +} +{ +add.f16x2 r4997, r4767, r4768; +} +{ +mul.f16x2 r5000, r4997, r4746; +} +{ +add.f16x2 r5003, r4994, r5000; +} +{ +sub.f16x2 r5006, r4749, r4750; +} +{ +mul.f16x2 r5009, r5006, r4744; +} +{ +sub.f16x2 r5012, r4755, r4756; +} +{ +mul.f16x2 r5015, r5012, r4747; +} +{ +add.f16x2 r5018, r5009, r5015; +} +{ +add.f16x2 r5021, r5003, r5018; +} +{ +add.f16x2 r5024, r4761, r4762; +} +{ +mul.f16x2 r5027, r5024, r4742; +} +{ +add.f16x2 r5030, r4764, r5027; +} +{ +add.f16x2 r5033, r4767, r4768; +} +{ +mul.f16x2 r5036, r5033, r4746; +} +{ +add.f16x2 r5039, r5030, r5036; +} +{ +sub.f16x2 r5042, r4749, r4750; +} +{ +mul.f16x2 r5045, r5042, r4744; +} +{ +sub.f16x2 r5048, r4755, r4756; +} +{ +mul.f16x2 r5051, r5048, r4747; +} +{ +add.f16x2 r5054, r5045, r5051; +} +{ +sub.f16x2 r5057, r5039, r5054; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5060, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5061, {low, high}; +} +{ +neg.f16x2 r5062, r5061; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5064, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5065, {low, high}; +} +{ +neg.f16x2 r5066, r5065; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5069, {low, high}; +} +{ +add.f16x2 r5070, r5071, r5072; +} +{ +add.f16x2 r5073, r5074, r5070; +} +{ +add.f16x2 r5076, r5077, r5078; +} +{ +add.f16x2 r5079, r5073, r5076; +} +{ +add.f16x2 r5082, r5083, r5084; +} +{ +add.f16x2 r5085, r5086, r5082; +} +{ +add.f16x2 r5088, r5089, r5090; +} +{ +add.f16x2 r5091, r5085, r5088; +} +{ +add.f16x2 r5094, r5071, r5072; +} +{ +mul.f16x2 r5097, r5094, r5060; +} +{ +add.f16x2 r5100, r5074, r5097; +} +{ +add.f16x2 r5103, r5077, r5078; +} +{ +mul.f16x2 r5106, r5103, r5064; +} +{ +add.f16x2 r5109, r5100, r5106; +} +{ +sub.f16x2 r5112, r5083, r5084; +} +{ +mul.f16x2 r5115, r5112, r5062; +} +{ +sub.f16x2 r5118, r5089, r5090; +} +{ +mul.f16x2 r5121, r5118, r5066; +} +{ +add.f16x2 r5124, r5115, r5121; +} +{ +sub.f16x2 r5127, r5109, r5124; +} +{ +add.f16x2 r5130, r5071, r5072; +} +{ +mul.f16x2 r5133, r5130, r5060; +} +{ +add.f16x2 r5136, r5074, r5133; +} +{ +add.f16x2 r5139, r5077, r5078; +} +{ +mul.f16x2 r5142, r5139, r5064; +} +{ +add.f16x2 r5145, r5136, r5142; +} +{ +sub.f16x2 r5148, r5083, r5084; +} +{ +mul.f16x2 r5151, r5148, r5062; +} +{ +sub.f16x2 r5154, r5089, r5090; +} +{ +mul.f16x2 r5157, r5154, r5066; +} +{ +add.f16x2 r5160, r5151, r5157; +} +{ +add.f16x2 r5163, r5145, r5160; +} +{ +add.f16x2 r5166, r5071, r5072; +} +{ +mul.f16x2 r5169, r5166, r5064; +} +{ +add.f16x2 r5172, r5074, r5169; +} +{ +add.f16x2 r5175, r5077, r5078; +} +{ +mul.f16x2 r5178, r5175, r5068; +} +{ +add.f16x2 r5181, r5172, r5178; +} +{ +sub.f16x2 r5184, r5083, r5084; +} +{ +mul.f16x2 r5187, r5184, r5066; +} 
+{ +sub.f16x2 r5190, r5089, r5090; +} +{ +mul.f16x2 r5193, r5190, r5069; +} +{ +add.f16x2 r5196, r5187, r5193; +} +{ +sub.f16x2 r5199, r5181, r5196; +} +{ +add.f16x2 r5202, r5071, r5072; +} +{ +mul.f16x2 r5205, r5202, r5064; +} +{ +add.f16x2 r5208, r5074, r5205; +} +{ +add.f16x2 r5211, r5077, r5078; +} +{ +mul.f16x2 r5214, r5211, r5068; +} +{ +add.f16x2 r5217, r5208, r5214; +} +{ +sub.f16x2 r5220, r5083, r5084; +} +{ +mul.f16x2 r5223, r5220, r5066; +} +{ +sub.f16x2 r5226, r5089, r5090; +} +{ +mul.f16x2 r5229, r5226, r5069; +} +{ +add.f16x2 r5232, r5223, r5229; +} +{ +add.f16x2 r5235, r5217, r5232; +} +{ +add.f16x2 r5238, r5083, r5084; +} +{ +mul.f16x2 r5241, r5238, r5060; +} +{ +add.f16x2 r5244, r5086, r5241; +} +{ +add.f16x2 r5247, r5089, r5090; +} +{ +mul.f16x2 r5250, r5247, r5064; +} +{ +add.f16x2 r5253, r5244, r5250; +} +{ +sub.f16x2 r5256, r5071, r5072; +} +{ +mul.f16x2 r5259, r5256, r5062; +} +{ +sub.f16x2 r5262, r5077, r5078; +} +{ +mul.f16x2 r5265, r5262, r5066; +} +{ +add.f16x2 r5268, r5259, r5265; +} +{ +add.f16x2 r5271, r5253, r5268; +} +{ +add.f16x2 r5274, r5083, r5084; +} +{ +mul.f16x2 r5277, r5274, r5060; +} +{ +add.f16x2 r5280, r5086, r5277; +} +{ +add.f16x2 r5283, r5089, r5090; +} +{ +mul.f16x2 r5286, r5283, r5064; +} +{ +add.f16x2 r5289, r5280, r5286; +} +{ +sub.f16x2 r5292, r5071, r5072; +} +{ +mul.f16x2 r5295, r5292, r5062; +} +{ +sub.f16x2 r5298, r5077, r5078; +} +{ +mul.f16x2 r5301, r5298, r5066; +} +{ +add.f16x2 r5304, r5295, r5301; +} +{ +sub.f16x2 r5307, r5289, r5304; +} +{ +add.f16x2 r5310, r5083, r5084; +} +{ +mul.f16x2 r5313, r5310, r5064; +} +{ +add.f16x2 r5316, r5086, r5313; +} +{ +add.f16x2 r5319, r5089, r5090; +} +{ +mul.f16x2 r5322, r5319, r5068; +} +{ +add.f16x2 r5325, r5316, r5322; +} +{ +sub.f16x2 r5328, r5071, r5072; +} +{ +mul.f16x2 r5331, r5328, r5066; +} +{ +sub.f16x2 r5334, r5077, r5078; +} +{ +mul.f16x2 r5337, r5334, r5069; +} +{ +add.f16x2 r5340, r5331, r5337; +} +{ +add.f16x2 r5343, r5325, r5340; +} +{ +add.f16x2 r5346, 
r5083, r5084; +} +{ +mul.f16x2 r5349, r5346, r5064; +} +{ +add.f16x2 r5352, r5086, r5349; +} +{ +add.f16x2 r5355, r5089, r5090; +} +{ +mul.f16x2 r5358, r5355, r5068; +} +{ +add.f16x2 r5361, r5352, r5358; +} +{ +sub.f16x2 r5364, r5071, r5072; +} +{ +mul.f16x2 r5367, r5364, r5066; +} +{ +sub.f16x2 r5370, r5077, r5078; +} +{ +mul.f16x2 r5373, r5370, r5069; +} +{ +add.f16x2 r5376, r5367, r5373; +} +{ +sub.f16x2 r5379, r5361, r5376; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5382, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5383, {low, high}; +} +{ +neg.f16x2 r5384, r5383; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5386, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5387, {low, high}; +} +{ +neg.f16x2 r5388, r5387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5390, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5391, {low, high}; +} +{ +add.f16x2 r5392, r5393, r5394; +} +{ +add.f16x2 r5395, r5396, r5392; +} +{ +add.f16x2 r5398, r5399, r5400; +} +{ +add.f16x2 r5401, r5395, r5398; +} +{ +add.f16x2 r5404, r5405, r5406; +} +{ +add.f16x2 r5407, r5408, r5404; +} +{ +add.f16x2 r5410, r5411, r5412; +} +{ +add.f16x2 r5413, r5407, r5410; +} +{ +add.f16x2 r5416, r5393, r5394; +} +{ +mul.f16x2 r5419, r5416, r5382; +} +{ +add.f16x2 r5422, r5396, r5419; +} +{ +add.f16x2 r5425, r5399, r5400; +} +{ +mul.f16x2 r5428, r5425, r5386; +} +{ +add.f16x2 r5431, r5422, r5428; +} +{ +sub.f16x2 r5434, r5405, r5406; +} +{ +mul.f16x2 r5437, r5434, r5384; +} +{ +sub.f16x2 r5440, r5411, r5412; +} +{ +mul.f16x2 r5443, r5440, r5388; +} +{ +add.f16x2 r5446, r5437, r5443; +} +{ +sub.f16x2 r5449, r5431, r5446; +} +{ +add.f16x2 r5452, r5393, r5394; +} +{ +mul.f16x2 r5455, r5452, 
r5382; +} +{ +add.f16x2 r5458, r5396, r5455; +} +{ +add.f16x2 r5461, r5399, r5400; +} +{ +mul.f16x2 r5464, r5461, r5386; +} +{ +add.f16x2 r5467, r5458, r5464; +} +{ +sub.f16x2 r5470, r5405, r5406; +} +{ +mul.f16x2 r5473, r5470, r5384; +} +{ +sub.f16x2 r5476, r5411, r5412; +} +{ +mul.f16x2 r5479, r5476, r5388; +} +{ +add.f16x2 r5482, r5473, r5479; +} +{ +add.f16x2 r5485, r5467, r5482; +} +{ +add.f16x2 r5488, r5393, r5394; +} +{ +mul.f16x2 r5491, r5488, r5386; +} +{ +add.f16x2 r5494, r5396, r5491; +} +{ +add.f16x2 r5497, r5399, r5400; +} +{ +mul.f16x2 r5500, r5497, r5390; +} +{ +add.f16x2 r5503, r5494, r5500; +} +{ +sub.f16x2 r5506, r5405, r5406; +} +{ +mul.f16x2 r5509, r5506, r5388; +} +{ +sub.f16x2 r5512, r5411, r5412; +} +{ +mul.f16x2 r5515, r5512, r5391; +} +{ +add.f16x2 r5518, r5509, r5515; +} +{ +sub.f16x2 r5521, r5503, r5518; +} +{ +add.f16x2 r5524, r5393, r5394; +} +{ +mul.f16x2 r5527, r5524, r5386; +} +{ +add.f16x2 r5530, r5396, r5527; +} +{ +add.f16x2 r5533, r5399, r5400; +} +{ +mul.f16x2 r5536, r5533, r5390; +} +{ +add.f16x2 r5539, r5530, r5536; +} +{ +sub.f16x2 r5542, r5405, r5406; +} +{ +mul.f16x2 r5545, r5542, r5388; +} +{ +sub.f16x2 r5548, r5411, r5412; +} +{ +mul.f16x2 r5551, r5548, r5391; +} +{ +add.f16x2 r5554, r5545, r5551; +} +{ +add.f16x2 r5557, r5539, r5554; +} +{ +add.f16x2 r5560, r5405, r5406; +} +{ +mul.f16x2 r5563, r5560, r5382; +} +{ +add.f16x2 r5566, r5408, r5563; +} +{ +add.f16x2 r5569, r5411, r5412; +} +{ +mul.f16x2 r5572, r5569, r5386; +} +{ +add.f16x2 r5575, r5566, r5572; +} +{ +sub.f16x2 r5578, r5393, r5394; +} +{ +mul.f16x2 r5581, r5578, r5384; +} +{ +sub.f16x2 r5584, r5399, r5400; +} +{ +mul.f16x2 r5587, r5584, r5388; +} +{ +add.f16x2 r5590, r5581, r5587; +} +{ +add.f16x2 r5593, r5575, r5590; +} +{ +add.f16x2 r5596, r5405, r5406; +} +{ +mul.f16x2 r5599, r5596, r5382; +} +{ +add.f16x2 r5602, r5408, r5599; +} +{ +add.f16x2 r5605, r5411, r5412; +} +{ +mul.f16x2 r5608, r5605, r5386; +} +{ +add.f16x2 r5611, r5602, r5608; +} +{ +sub.f16x2 
r5614, r5393, r5394; +} +{ +mul.f16x2 r5617, r5614, r5384; +} +{ +sub.f16x2 r5620, r5399, r5400; +} +{ +mul.f16x2 r5623, r5620, r5388; +} +{ +add.f16x2 r5626, r5617, r5623; +} +{ +sub.f16x2 r5629, r5611, r5626; +} +{ +add.f16x2 r5632, r5405, r5406; +} +{ +mul.f16x2 r5635, r5632, r5386; +} +{ +add.f16x2 r5638, r5408, r5635; +} +{ +add.f16x2 r5641, r5411, r5412; +} +{ +mul.f16x2 r5644, r5641, r5390; +} +{ +add.f16x2 r5647, r5638, r5644; +} +{ +sub.f16x2 r5650, r5393, r5394; +} +{ +mul.f16x2 r5653, r5650, r5388; +} +{ +sub.f16x2 r5656, r5399, r5400; +} +{ +mul.f16x2 r5659, r5656, r5391; +} +{ +add.f16x2 r5662, r5653, r5659; +} +{ +add.f16x2 r5665, r5647, r5662; +} +{ +add.f16x2 r5668, r5405, r5406; +} +{ +mul.f16x2 r5671, r5668, r5386; +} +{ +add.f16x2 r5674, r5408, r5671; +} +{ +add.f16x2 r5677, r5411, r5412; +} +{ +mul.f16x2 r5680, r5677, r5390; +} +{ +add.f16x2 r5683, r5674, r5680; +} +{ +sub.f16x2 r5686, r5393, r5394; +} +{ +mul.f16x2 r5689, r5686, r5388; +} +{ +sub.f16x2 r5692, r5399, r5400; +} +{ +mul.f16x2 r5695, r5692, r5391; +} +{ +add.f16x2 r5698, r5689, r5695; +} +{ +sub.f16x2 r5701, r5683, r5698; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5704, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5705, {low, high}; +} +{ +neg.f16x2 r5706, r5705; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r5708, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r5709, {low, high}; +} +{ +neg.f16x2 r5710, r5709; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r5712, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r5713, {low, high}; +} +{ +add.f16x2 r5714, r5715, r5716; +} +{ +add.f16x2 r5717, r5718, r5714; +} +{ +add.f16x2 r5720, r5721, r5722; +} +{ +add.f16x2 r5723, 
r5717, r5720; +} +{ +add.f16x2 r5726, r5727, r5728; +} +{ +add.f16x2 r5729, r5730, r5726; +} +{ +add.f16x2 r5732, r5733, r5734; +} +{ +add.f16x2 r5735, r5729, r5732; +} +{ +add.f16x2 r5738, r5715, r5716; +} +{ +mul.f16x2 r5741, r5738, r5704; +} +{ +add.f16x2 r5744, r5718, r5741; +} +{ +add.f16x2 r5747, r5721, r5722; +} +{ +mul.f16x2 r5750, r5747, r5708; +} +{ +add.f16x2 r5753, r5744, r5750; +} +{ +sub.f16x2 r5756, r5727, r5728; +} +{ +mul.f16x2 r5759, r5756, r5706; +} +{ +sub.f16x2 r5762, r5733, r5734; +} +{ +mul.f16x2 r5765, r5762, r5710; +} +{ +add.f16x2 r5768, r5759, r5765; +} +{ +sub.f16x2 r5771, r5753, r5768; +} +{ +add.f16x2 r5774, r5715, r5716; +} +{ +mul.f16x2 r5777, r5774, r5704; +} +{ +add.f16x2 r5780, r5718, r5777; +} +{ +add.f16x2 r5783, r5721, r5722; +} +{ +mul.f16x2 r5786, r5783, r5708; +} +{ +add.f16x2 r5789, r5780, r5786; +} +{ +sub.f16x2 r5792, r5727, r5728; +} +{ +mul.f16x2 r5795, r5792, r5706; +} +{ +sub.f16x2 r5798, r5733, r5734; +} +{ +mul.f16x2 r5801, r5798, r5710; +} +{ +add.f16x2 r5804, r5795, r5801; +} +{ +add.f16x2 r5807, r5789, r5804; +} +{ +add.f16x2 r5810, r5715, r5716; +} +{ +mul.f16x2 r5813, r5810, r5708; +} +{ +add.f16x2 r5816, r5718, r5813; +} +{ +add.f16x2 r5819, r5721, r5722; +} +{ +mul.f16x2 r5822, r5819, r5712; +} +{ +add.f16x2 r5825, r5816, r5822; +} +{ +sub.f16x2 r5828, r5727, r5728; +} +{ +mul.f16x2 r5831, r5828, r5710; +} +{ +sub.f16x2 r5834, r5733, r5734; +} +{ +mul.f16x2 r5837, r5834, r5713; +} +{ +add.f16x2 r5840, r5831, r5837; +} +{ +sub.f16x2 r5843, r5825, r5840; +} +{ +add.f16x2 r5846, r5715, r5716; +} +{ +mul.f16x2 r5849, r5846, r5708; +} +{ +add.f16x2 r5852, r5718, r5849; +} +{ +add.f16x2 r5855, r5721, r5722; +} +{ +mul.f16x2 r5858, r5855, r5712; +} +{ +add.f16x2 r5861, r5852, r5858; +} +{ +sub.f16x2 r5864, r5727, r5728; +} +{ +mul.f16x2 r5867, r5864, r5710; +} +{ +sub.f16x2 r5870, r5733, r5734; +} +{ +mul.f16x2 r5873, r5870, r5713; +} +{ +add.f16x2 r5876, r5867, r5873; +} +{ +add.f16x2 r5879, r5861, r5876; +} +{ 
+add.f16x2 r5882, r5727, r5728; +} +{ +mul.f16x2 r5885, r5882, r5704; +} +{ +add.f16x2 r5888, r5730, r5885; +} +{ +add.f16x2 r5891, r5733, r5734; +} +{ +mul.f16x2 r5894, r5891, r5708; +} +{ +add.f16x2 r5897, r5888, r5894; +} +{ +sub.f16x2 r5900, r5715, r5716; +} +{ +mul.f16x2 r5903, r5900, r5706; +} +{ +sub.f16x2 r5906, r5721, r5722; +} +{ +mul.f16x2 r5909, r5906, r5710; +} +{ +add.f16x2 r5912, r5903, r5909; +} +{ +add.f16x2 r5915, r5897, r5912; +} +{ +add.f16x2 r5918, r5727, r5728; +} +{ +mul.f16x2 r5921, r5918, r5704; +} +{ +add.f16x2 r5924, r5730, r5921; +} +{ +add.f16x2 r5927, r5733, r5734; +} +{ +mul.f16x2 r5930, r5927, r5708; +} +{ +add.f16x2 r5933, r5924, r5930; +} +{ +sub.f16x2 r5936, r5715, r5716; +} +{ +mul.f16x2 r5939, r5936, r5706; +} +{ +sub.f16x2 r5942, r5721, r5722; +} +{ +mul.f16x2 r5945, r5942, r5710; +} +{ +add.f16x2 r5948, r5939, r5945; +} +{ +sub.f16x2 r5951, r5933, r5948; +} +{ +add.f16x2 r5954, r5727, r5728; +} +{ +mul.f16x2 r5957, r5954, r5708; +} +{ +add.f16x2 r5960, r5730, r5957; +} +{ +add.f16x2 r5963, r5733, r5734; +} +{ +mul.f16x2 r5966, r5963, r5712; +} +{ +add.f16x2 r5969, r5960, r5966; +} +{ +sub.f16x2 r5972, r5715, r5716; +} +{ +mul.f16x2 r5975, r5972, r5710; +} +{ +sub.f16x2 r5978, r5721, r5722; +} +{ +mul.f16x2 r5981, r5978, r5713; +} +{ +add.f16x2 r5984, r5975, r5981; +} +{ +add.f16x2 r5987, r5969, r5984; +} +{ +add.f16x2 r5990, r5727, r5728; +} +{ +mul.f16x2 r5993, r5990, r5708; +} +{ +add.f16x2 r5996, r5730, r5993; +} +{ +add.f16x2 r5999, r5733, r5734; +} +{ +mul.f16x2 r6002, r5999, r5712; +} +{ +add.f16x2 r6005, r5996, r6002; +} +{ +sub.f16x2 r6008, r5715, r5716; +} +{ +mul.f16x2 r6011, r6008, r5710; +} +{ +sub.f16x2 r6014, r5721, r5722; +} +{ +mul.f16x2 r6017, r6014, r5713; +} +{ +add.f16x2 r6020, r6011, r6017; +} +{ +sub.f16x2 r6023, r6005, r6020; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f330; +cvt.rn.f16.f32 high, f330; +mov.b32 r6026, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f332; 
+cvt.rn.f16.f32 high, f332; +mov.b32 r6027, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f334; +cvt.rn.f16.f32 high, f334; +mov.b32 r6028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f336; +cvt.rn.f16.f32 high, f336; +mov.b32 r6029, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f338; +cvt.rn.f16.f32 high, f338; +mov.b32 r6030, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f340; +cvt.rn.f16.f32 high, f340; +mov.b32 r6031, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f342; +cvt.rn.f16.f32 high, f342; +mov.b32 r6032, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f344; +cvt.rn.f16.f32 high, f344; +mov.b32 r6033, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f350; +cvt.rn.f16.f32 high, f350; +mov.b32 r6036, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f352; +cvt.rn.f16.f32 high, f352; +mov.b32 r6037, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r6040, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f360; +cvt.rn.f16.f32 high, f360; +mov.b32 r6041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6042, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f364; +cvt.rn.f16.f32 high, f364; +mov.b32 r6043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r6048, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r6049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f390; +cvt.rn.f16.f32 high, f390; +mov.b32 r6056, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f392; +cvt.rn.f16.f32 high, f392; +mov.b32 r6057, {low, high}; +} +{ +mul.f16x2 r6074, r4805, r6026; +} +{ +mul.f16x2 r6077, r4949, r6027; +} +{ +sub.f16x2 r6080, r6074, r6077; +} +{ +mul.f16x2 r6083, r4805, 
r6027; +} +{ +fma.rn.f16x2 r6086, r4949, r6026, r6083; +} +{ +mul.f16x2 r6090, r5127, r6028; +} +{ +mul.f16x2 r6093, r5271, r6029; +} +{ +sub.f16x2 r6096, r6090, r6093; +} +{ +mul.f16x2 r6099, r5127, r6029; +} +{ +fma.rn.f16x2 r6102, r5271, r6028, r6099; +} +{ +mul.f16x2 r6106, r5449, r6030; +} +{ +mul.f16x2 r6109, r5593, r6031; +} +{ +sub.f16x2 r6112, r6106, r6109; +} +{ +mul.f16x2 r6115, r5449, r6031; +} +{ +fma.rn.f16x2 r6118, r5593, r6030, r6115; +} +{ +mul.f16x2 r6122, r5771, r6032; +} +{ +mul.f16x2 r6125, r5915, r6033; +} +{ +sub.f16x2 r6128, r6122, r6125; +} +{ +mul.f16x2 r6131, r5771, r6033; +} +{ +fma.rn.f16x2 r6134, r5915, r6032, r6131; +} +{ +mul.f16x2 r6138, r4877, r6028; +} +{ +mul.f16x2 r6141, r5021, r6029; +} +{ +sub.f16x2 r6144, r6138, r6141; +} +{ +mul.f16x2 r6147, r4877, r6029; +} +{ +fma.rn.f16x2 r6150, r5021, r6028, r6147; +} +{ +mul.f16x2 r6154, r5199, r6032; +} +{ +mul.f16x2 r6157, r5343, r6033; +} +{ +sub.f16x2 r6160, r6154, r6157; +} +{ +mul.f16x2 r6163, r5199, r6033; +} +{ +fma.rn.f16x2 r6166, r5343, r6032, r6163; +} +{ +mul.f16x2 r6170, r5521, r6036; +} +{ +mul.f16x2 r6173, r5665, r6037; +} +{ +sub.f16x2 r6176, r6170, r6173; +} +{ +mul.f16x2 r6179, r5521, r6037; +} +{ +fma.rn.f16x2 r6182, r5665, r6036, r6179; +} +{ +mul.f16x2 r6186, r5843, r6040; +} +{ +mul.f16x2 r6189, r5987, r6041; +} +{ +sub.f16x2 r6192, r6186, r6189; +} +{ +mul.f16x2 r6195, r5843, r6041; +} +{ +fma.rn.f16x2 r6198, r5987, r6040, r6195; +} +{ +mul.f16x2 r6202, r4913, r6030; +} +{ +mul.f16x2 r6205, r5057, r6031; +} +{ +sub.f16x2 r6208, r6202, r6205; +} +{ +mul.f16x2 r6211, r4913, r6031; +} +{ +fma.rn.f16x2 r6214, r5057, r6030, r6211; +} +{ +mul.f16x2 r6218, r5235, r6036; +} +{ +mul.f16x2 r6221, r5379, r6037; +} +{ +sub.f16x2 r6224, r6218, r6221; +} +{ +mul.f16x2 r6227, r5235, r6037; +} +{ +fma.rn.f16x2 r6230, r5379, r6036, r6227; +} +{ +mul.f16x2 r6234, r5557, r6042; +} +{ +mul.f16x2 r6237, r5701, r6043; +} +{ +sub.f16x2 r6240, r6234, r6237; +} +{ +mul.f16x2 r6243, r5557, 
r6043; +} +{ +fma.rn.f16x2 r6246, r5701, r6042, r6243; +} +{ +mul.f16x2 r6250, r5879, r6048; +} +{ +mul.f16x2 r6253, r6023, r6049; +} +{ +sub.f16x2 r6256, r6250, r6253; +} +{ +mul.f16x2 r6259, r5879, r6049; +} +{ +fma.rn.f16x2 r6262, r6023, r6048, r6259; +} +{ +mul.f16x2 r6266, r4841, r6032; +} +{ +mul.f16x2 r6269, r4985, r6033; +} +{ +sub.f16x2 r6272, r6266, r6269; +} +{ +mul.f16x2 r6275, r4841, r6033; +} +{ +fma.rn.f16x2 r6278, r4985, r6032, r6275; +} +{ +mul.f16x2 r6282, r5163, r6040; +} +{ +mul.f16x2 r6285, r5307, r6041; +} +{ +sub.f16x2 r6288, r6282, r6285; +} +{ +mul.f16x2 r6291, r5163, r6041; +} +{ +fma.rn.f16x2 r6294, r5307, r6040, r6291; +} +{ +mul.f16x2 r6298, r5485, r6048; +} +{ +mul.f16x2 r6301, r5629, r6049; +} +{ +sub.f16x2 r6304, r6298, r6301; +} +{ +mul.f16x2 r6307, r5485, r6049; +} +{ +fma.rn.f16x2 r6310, r5629, r6048, r6307; +} +{ +mul.f16x2 r6314, r5807, r6056; +} +{ +mul.f16x2 r6317, r5951, r6057; +} +{ +sub.f16x2 r6320, r6314, r6317; +} +{ +mul.f16x2 r6323, r5807, r6057; +} +{ +fma.rn.f16x2 r6326, r5951, r6056, r6323; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6331, {low, high}; +} +{ +neg.f16x2 r6332, r6331; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6334, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6335, {low, high}; +} +{ +neg.f16x2 r6336, r6335; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6338, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6339, {low, high}; +} +{ +add.f16x2 r6340, r4757, r5723; +} +{ +add.f16x2 r6343, r4435, r6340; +} +{ +add.f16x2 r6346, r5079, r5401; +} +{ +add.f16x2 %0, r6343, r6346; +} +{ +add.f16x2 r6352, r4769, r5735; +} +{ +add.f16x2 r6355, 
r4447, r6352; +} +{ +add.f16x2 r6358, r5091, r5413; +} +{ +add.f16x2 %1, r6355, r6358; +} +{ +add.f16x2 r6364, r4757, r5723; +} +{ +mul.f16x2 r6367, r6364, r6330; +} +{ +add.f16x2 r6370, r4435, r6367; +} +{ +add.f16x2 r6373, r5079, r5401; +} +{ +mul.f16x2 r6376, r6373, r6334; +} +{ +add.f16x2 r6379, r6370, r6376; +} +{ +sub.f16x2 r6382, r4769, r5735; +} +{ +mul.f16x2 r6385, r6382, r6332; +} +{ +sub.f16x2 r6388, r5091, r5413; +} +{ +mul.f16x2 r6391, r6388, r6336; +} +{ +add.f16x2 r6394, r6385, r6391; +} +{ +sub.f16x2 %10, r6379, r6394; +} +{ +add.f16x2 r6400, r4757, r5723; +} +{ +mul.f16x2 r6403, r6400, r6330; +} +{ +add.f16x2 r6406, r4435, r6403; +} +{ +add.f16x2 r6409, r5079, r5401; +} +{ +mul.f16x2 r6412, r6409, r6334; +} +{ +add.f16x2 r6415, r6406, r6412; +} +{ +sub.f16x2 r6418, r4769, r5735; +} +{ +mul.f16x2 r6421, r6418, r6332; +} +{ +sub.f16x2 r6424, r5091, r5413; +} +{ +mul.f16x2 r6427, r6424, r6336; +} +{ +add.f16x2 r6430, r6421, r6427; +} +{ +add.f16x2 %40, r6415, r6430; +} +{ +add.f16x2 r6436, r4757, r5723; +} +{ +mul.f16x2 r6439, r6436, r6334; +} +{ +add.f16x2 r6442, r4435, r6439; +} +{ +add.f16x2 r6445, r5079, r5401; +} +{ +mul.f16x2 r6448, r6445, r6338; +} +{ +add.f16x2 r6451, r6442, r6448; +} +{ +sub.f16x2 r6454, r4769, r5735; +} +{ +mul.f16x2 r6457, r6454, r6336; +} +{ +sub.f16x2 r6460, r5091, r5413; +} +{ +mul.f16x2 r6463, r6460, r6339; +} +{ +add.f16x2 r6466, r6457, r6463; +} +{ +sub.f16x2 %20, r6451, r6466; +} +{ +add.f16x2 r6472, r4757, r5723; +} +{ +mul.f16x2 r6475, r6472, r6334; +} +{ +add.f16x2 r6478, r4435, r6475; +} +{ +add.f16x2 r6481, r5079, r5401; +} +{ +mul.f16x2 r6484, r6481, r6338; +} +{ +add.f16x2 r6487, r6478, r6484; +} +{ +sub.f16x2 r6490, r4769, r5735; +} +{ +mul.f16x2 r6493, r6490, r6336; +} +{ +sub.f16x2 r6496, r5091, r5413; +} +{ +mul.f16x2 r6499, r6496, r6339; +} +{ +add.f16x2 r6502, r6493, r6499; +} +{ +add.f16x2 %30, r6487, r6502; +} +{ +add.f16x2 r6508, r4769, r5735; +} +{ +mul.f16x2 r6511, r6508, r6330; +} +{ +add.f16x2 
r6514, r4447, r6511; +} +{ +add.f16x2 r6517, r5091, r5413; +} +{ +mul.f16x2 r6520, r6517, r6334; +} +{ +add.f16x2 r6523, r6514, r6520; +} +{ +sub.f16x2 r6526, r4757, r5723; +} +{ +mul.f16x2 r6529, r6526, r6332; +} +{ +sub.f16x2 r6532, r5079, r5401; +} +{ +mul.f16x2 r6535, r6532, r6336; +} +{ +add.f16x2 r6538, r6529, r6535; +} +{ +add.f16x2 %11, r6523, r6538; +} +{ +add.f16x2 r6544, r4769, r5735; +} +{ +mul.f16x2 r6547, r6544, r6330; +} +{ +add.f16x2 r6550, r4447, r6547; +} +{ +add.f16x2 r6553, r5091, r5413; +} +{ +mul.f16x2 r6556, r6553, r6334; +} +{ +add.f16x2 r6559, r6550, r6556; +} +{ +sub.f16x2 r6562, r4757, r5723; +} +{ +mul.f16x2 r6565, r6562, r6332; +} +{ +sub.f16x2 r6568, r5079, r5401; +} +{ +mul.f16x2 r6571, r6568, r6336; +} +{ +add.f16x2 r6574, r6565, r6571; +} +{ +sub.f16x2 %41, r6559, r6574; +} +{ +add.f16x2 r6580, r4769, r5735; +} +{ +mul.f16x2 r6583, r6580, r6334; +} +{ +add.f16x2 r6586, r4447, r6583; +} +{ +add.f16x2 r6589, r5091, r5413; +} +{ +mul.f16x2 r6592, r6589, r6338; +} +{ +add.f16x2 r6595, r6586, r6592; +} +{ +sub.f16x2 r6598, r4757, r5723; +} +{ +mul.f16x2 r6601, r6598, r6336; +} +{ +sub.f16x2 r6604, r5079, r5401; +} +{ +mul.f16x2 r6607, r6604, r6339; +} +{ +add.f16x2 r6610, r6601, r6607; +} +{ +add.f16x2 %21, r6595, r6610; +} +{ +add.f16x2 r6616, r4769, r5735; +} +{ +mul.f16x2 r6619, r6616, r6334; +} +{ +add.f16x2 r6622, r4447, r6619; +} +{ +add.f16x2 r6625, r5091, r5413; +} +{ +mul.f16x2 r6628, r6625, r6338; +} +{ +add.f16x2 r6631, r6622, r6628; +} +{ +sub.f16x2 r6634, r4757, r5723; +} +{ +mul.f16x2 r6637, r6634, r6336; +} +{ +sub.f16x2 r6640, r5079, r5401; +} +{ +mul.f16x2 r6643, r6640, r6339; +} +{ +add.f16x2 r6646, r6637, r6643; +} +{ +sub.f16x2 %31, r6631, r6646; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6653, {low, high}; +} +{ +neg.f16x2 r6654, r6653; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6656, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6657, {low, high}; +} +{ +neg.f16x2 r6658, r6657; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6660, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6661, {low, high}; +} +{ +add.f16x2 r6662, r6080, r6128; +} +{ +add.f16x2 r6665, r4483, r6662; +} +{ +add.f16x2 r6668, r6096, r6112; +} +{ +add.f16x2 %2, r6665, r6668; +} +{ +add.f16x2 r6674, r6086, r6134; +} +{ +add.f16x2 r6677, r4627, r6674; +} +{ +add.f16x2 r6680, r6102, r6118; +} +{ +add.f16x2 %3, r6677, r6680; +} +{ +add.f16x2 r6686, r6080, r6128; +} +{ +mul.f16x2 r6689, r6686, r6652; +} +{ +add.f16x2 r6692, r4483, r6689; +} +{ +add.f16x2 r6695, r6096, r6112; +} +{ +mul.f16x2 r6698, r6695, r6656; +} +{ +add.f16x2 r6701, r6692, r6698; +} +{ +sub.f16x2 r6704, r6086, r6134; +} +{ +mul.f16x2 r6707, r6704, r6654; +} +{ +sub.f16x2 r6710, r6102, r6118; +} +{ +mul.f16x2 r6713, r6710, r6658; +} +{ +add.f16x2 r6716, r6707, r6713; +} +{ +sub.f16x2 %12, r6701, r6716; +} +{ +add.f16x2 r6722, r6080, r6128; +} +{ +mul.f16x2 r6725, r6722, r6652; +} +{ +add.f16x2 r6728, r4483, r6725; +} +{ +add.f16x2 r6731, r6096, r6112; +} +{ +mul.f16x2 r6734, r6731, r6656; +} +{ +add.f16x2 r6737, r6728, r6734; +} +{ +sub.f16x2 r6740, r6086, r6134; +} +{ +mul.f16x2 r6743, r6740, r6654; +} +{ +sub.f16x2 r6746, r6102, r6118; +} +{ +mul.f16x2 r6749, r6746, r6658; +} +{ +add.f16x2 r6752, r6743, r6749; +} +{ +add.f16x2 %42, r6737, r6752; +} +{ +add.f16x2 r6758, r6080, r6128; +} +{ +mul.f16x2 r6761, r6758, r6656; +} +{ +add.f16x2 r6764, r4483, r6761; +} +{ +add.f16x2 r6767, r6096, r6112; +} +{ +mul.f16x2 r6770, r6767, r6660; +} +{ +add.f16x2 r6773, r6764, r6770; +} +{ +sub.f16x2 r6776, r6086, r6134; +} +{ +mul.f16x2 r6779, r6776, r6658; +} +{ +sub.f16x2 r6782, r6102, 
r6118; +} +{ +mul.f16x2 r6785, r6782, r6661; +} +{ +add.f16x2 r6788, r6779, r6785; +} +{ +sub.f16x2 %22, r6773, r6788; +} +{ +add.f16x2 r6794, r6080, r6128; +} +{ +mul.f16x2 r6797, r6794, r6656; +} +{ +add.f16x2 r6800, r4483, r6797; +} +{ +add.f16x2 r6803, r6096, r6112; +} +{ +mul.f16x2 r6806, r6803, r6660; +} +{ +add.f16x2 r6809, r6800, r6806; +} +{ +sub.f16x2 r6812, r6086, r6134; +} +{ +mul.f16x2 r6815, r6812, r6658; +} +{ +sub.f16x2 r6818, r6102, r6118; +} +{ +mul.f16x2 r6821, r6818, r6661; +} +{ +add.f16x2 r6824, r6815, r6821; +} +{ +add.f16x2 %32, r6809, r6824; +} +{ +add.f16x2 r6830, r6086, r6134; +} +{ +mul.f16x2 r6833, r6830, r6652; +} +{ +add.f16x2 r6836, r4627, r6833; +} +{ +add.f16x2 r6839, r6102, r6118; +} +{ +mul.f16x2 r6842, r6839, r6656; +} +{ +add.f16x2 r6845, r6836, r6842; +} +{ +sub.f16x2 r6848, r6080, r6128; +} +{ +mul.f16x2 r6851, r6848, r6654; +} +{ +sub.f16x2 r6854, r6096, r6112; +} +{ +mul.f16x2 r6857, r6854, r6658; +} +{ +add.f16x2 r6860, r6851, r6857; +} +{ +add.f16x2 %13, r6845, r6860; +} +{ +add.f16x2 r6866, r6086, r6134; +} +{ +mul.f16x2 r6869, r6866, r6652; +} +{ +add.f16x2 r6872, r4627, r6869; +} +{ +add.f16x2 r6875, r6102, r6118; +} +{ +mul.f16x2 r6878, r6875, r6656; +} +{ +add.f16x2 r6881, r6872, r6878; +} +{ +sub.f16x2 r6884, r6080, r6128; +} +{ +mul.f16x2 r6887, r6884, r6654; +} +{ +sub.f16x2 r6890, r6096, r6112; +} +{ +mul.f16x2 r6893, r6890, r6658; +} +{ +add.f16x2 r6896, r6887, r6893; +} +{ +sub.f16x2 %43, r6881, r6896; +} +{ +add.f16x2 r6902, r6086, r6134; +} +{ +mul.f16x2 r6905, r6902, r6656; +} +{ +add.f16x2 r6908, r4627, r6905; +} +{ +add.f16x2 r6911, r6102, r6118; +} +{ +mul.f16x2 r6914, r6911, r6660; +} +{ +add.f16x2 r6917, r6908, r6914; +} +{ +sub.f16x2 r6920, r6080, r6128; +} +{ +mul.f16x2 r6923, r6920, r6658; +} +{ +sub.f16x2 r6926, r6096, r6112; +} +{ +mul.f16x2 r6929, r6926, r6661; +} +{ +add.f16x2 r6932, r6923, r6929; +} +{ +add.f16x2 %23, r6917, r6932; +} +{ +add.f16x2 r6938, r6086, r6134; +} +{ +mul.f16x2 r6941, 
r6938, r6656; +} +{ +add.f16x2 r6944, r4627, r6941; +} +{ +add.f16x2 r6947, r6102, r6118; +} +{ +mul.f16x2 r6950, r6947, r6660; +} +{ +add.f16x2 r6953, r6944, r6950; +} +{ +sub.f16x2 r6956, r6080, r6128; +} +{ +mul.f16x2 r6959, r6956, r6658; +} +{ +sub.f16x2 r6962, r6096, r6112; +} +{ +mul.f16x2 r6965, r6962, r6661; +} +{ +add.f16x2 r6968, r6959, r6965; +} +{ +sub.f16x2 %33, r6953, r6968; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6974, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6975, {low, high}; +} +{ +neg.f16x2 r6976, r6975; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r6978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r6979, {low, high}; +} +{ +neg.f16x2 r6980, r6979; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r6982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r6983, {low, high}; +} +{ +add.f16x2 r6984, r6144, r6192; +} +{ +add.f16x2 r6987, r4555, r6984; +} +{ +add.f16x2 r6990, r6160, r6176; +} +{ +add.f16x2 %4, r6987, r6990; +} +{ +add.f16x2 r6996, r6150, r6198; +} +{ +add.f16x2 r6999, r4699, r6996; +} +{ +add.f16x2 r7002, r6166, r6182; +} +{ +add.f16x2 %5, r6999, r7002; +} +{ +add.f16x2 r7008, r6144, r6192; +} +{ +mul.f16x2 r7011, r7008, r6974; +} +{ +add.f16x2 r7014, r4555, r7011; +} +{ +add.f16x2 r7017, r6160, r6176; +} +{ +mul.f16x2 r7020, r7017, r6978; +} +{ +add.f16x2 r7023, r7014, r7020; +} +{ +sub.f16x2 r7026, r6150, r6198; +} +{ +mul.f16x2 r7029, r7026, r6976; +} +{ +sub.f16x2 r7032, r6166, r6182; +} +{ +mul.f16x2 r7035, r7032, r6980; +} +{ +add.f16x2 r7038, r7029, r7035; +} +{ +sub.f16x2 %14, r7023, r7038; +} +{ +add.f16x2 r7044, r6144, r6192; +} +{ +mul.f16x2 r7047, r7044, r6974; +} +{ +add.f16x2 r7050, r4555, r7047; +} +{ 
+add.f16x2 r7053, r6160, r6176; +} +{ +mul.f16x2 r7056, r7053, r6978; +} +{ +add.f16x2 r7059, r7050, r7056; +} +{ +sub.f16x2 r7062, r6150, r6198; +} +{ +mul.f16x2 r7065, r7062, r6976; +} +{ +sub.f16x2 r7068, r6166, r6182; +} +{ +mul.f16x2 r7071, r7068, r6980; +} +{ +add.f16x2 r7074, r7065, r7071; +} +{ +add.f16x2 %44, r7059, r7074; +} +{ +add.f16x2 r7080, r6144, r6192; +} +{ +mul.f16x2 r7083, r7080, r6978; +} +{ +add.f16x2 r7086, r4555, r7083; +} +{ +add.f16x2 r7089, r6160, r6176; +} +{ +mul.f16x2 r7092, r7089, r6982; +} +{ +add.f16x2 r7095, r7086, r7092; +} +{ +sub.f16x2 r7098, r6150, r6198; +} +{ +mul.f16x2 r7101, r7098, r6980; +} +{ +sub.f16x2 r7104, r6166, r6182; +} +{ +mul.f16x2 r7107, r7104, r6983; +} +{ +add.f16x2 r7110, r7101, r7107; +} +{ +sub.f16x2 %24, r7095, r7110; +} +{ +add.f16x2 r7116, r6144, r6192; +} +{ +mul.f16x2 r7119, r7116, r6978; +} +{ +add.f16x2 r7122, r4555, r7119; +} +{ +add.f16x2 r7125, r6160, r6176; +} +{ +mul.f16x2 r7128, r7125, r6982; +} +{ +add.f16x2 r7131, r7122, r7128; +} +{ +sub.f16x2 r7134, r6150, r6198; +} +{ +mul.f16x2 r7137, r7134, r6980; +} +{ +sub.f16x2 r7140, r6166, r6182; +} +{ +mul.f16x2 r7143, r7140, r6983; +} +{ +add.f16x2 r7146, r7137, r7143; +} +{ +add.f16x2 %34, r7131, r7146; +} +{ +add.f16x2 r7152, r6150, r6198; +} +{ +mul.f16x2 r7155, r7152, r6974; +} +{ +add.f16x2 r7158, r4699, r7155; +} +{ +add.f16x2 r7161, r6166, r6182; +} +{ +mul.f16x2 r7164, r7161, r6978; +} +{ +add.f16x2 r7167, r7158, r7164; +} +{ +sub.f16x2 r7170, r6144, r6192; +} +{ +mul.f16x2 r7173, r7170, r6976; +} +{ +sub.f16x2 r7176, r6160, r6176; +} +{ +mul.f16x2 r7179, r7176, r6980; +} +{ +add.f16x2 r7182, r7173, r7179; +} +{ +add.f16x2 %15, r7167, r7182; +} +{ +add.f16x2 r7188, r6150, r6198; +} +{ +mul.f16x2 r7191, r7188, r6974; +} +{ +add.f16x2 r7194, r4699, r7191; +} +{ +add.f16x2 r7197, r6166, r6182; +} +{ +mul.f16x2 r7200, r7197, r6978; +} +{ +add.f16x2 r7203, r7194, r7200; +} +{ +sub.f16x2 r7206, r6144, r6192; +} +{ +mul.f16x2 r7209, r7206, r6976; 
+} +{ +sub.f16x2 r7212, r6160, r6176; +} +{ +mul.f16x2 r7215, r7212, r6980; +} +{ +add.f16x2 r7218, r7209, r7215; +} +{ +sub.f16x2 %45, r7203, r7218; +} +{ +add.f16x2 r7224, r6150, r6198; +} +{ +mul.f16x2 r7227, r7224, r6978; +} +{ +add.f16x2 r7230, r4699, r7227; +} +{ +add.f16x2 r7233, r6166, r6182; +} +{ +mul.f16x2 r7236, r7233, r6982; +} +{ +add.f16x2 r7239, r7230, r7236; +} +{ +sub.f16x2 r7242, r6144, r6192; +} +{ +mul.f16x2 r7245, r7242, r6980; +} +{ +sub.f16x2 r7248, r6160, r6176; +} +{ +mul.f16x2 r7251, r7248, r6983; +} +{ +add.f16x2 r7254, r7245, r7251; +} +{ +add.f16x2 %25, r7239, r7254; +} +{ +add.f16x2 r7260, r6150, r6198; +} +{ +mul.f16x2 r7263, r7260, r6978; +} +{ +add.f16x2 r7266, r4699, r7263; +} +{ +add.f16x2 r7269, r6166, r6182; +} +{ +mul.f16x2 r7272, r7269, r6982; +} +{ +add.f16x2 r7275, r7266, r7272; +} +{ +sub.f16x2 r7278, r6144, r6192; +} +{ +mul.f16x2 r7281, r7278, r6980; +} +{ +sub.f16x2 r7284, r6160, r6176; +} +{ +mul.f16x2 r7287, r7284, r6983; +} +{ +add.f16x2 r7290, r7281, r7287; +} +{ +sub.f16x2 %35, r7275, r7290; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7296, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7297, {low, high}; +} +{ +neg.f16x2 r7298, r7297; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7300, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7301, {low, high}; +} +{ +neg.f16x2 r7302, r7301; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7304, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7305, {low, high}; +} +{ +add.f16x2 r7306, r6208, r6256; +} +{ +add.f16x2 r7309, r4591, r7306; +} +{ +add.f16x2 r7312, r6224, r6240; +} +{ +add.f16x2 %6, r7309, r7312; +} +{ +add.f16x2 r7318, r6214, r6262; +} +{ +add.f16x2 
r7321, r4735, r7318; +} +{ +add.f16x2 r7324, r6230, r6246; +} +{ +add.f16x2 %7, r7321, r7324; +} +{ +add.f16x2 r7330, r6208, r6256; +} +{ +mul.f16x2 r7333, r7330, r7296; +} +{ +add.f16x2 r7336, r4591, r7333; +} +{ +add.f16x2 r7339, r6224, r6240; +} +{ +mul.f16x2 r7342, r7339, r7300; +} +{ +add.f16x2 r7345, r7336, r7342; +} +{ +sub.f16x2 r7348, r6214, r6262; +} +{ +mul.f16x2 r7351, r7348, r7298; +} +{ +sub.f16x2 r7354, r6230, r6246; +} +{ +mul.f16x2 r7357, r7354, r7302; +} +{ +add.f16x2 r7360, r7351, r7357; +} +{ +sub.f16x2 %16, r7345, r7360; +} +{ +add.f16x2 r7366, r6208, r6256; +} +{ +mul.f16x2 r7369, r7366, r7296; +} +{ +add.f16x2 r7372, r4591, r7369; +} +{ +add.f16x2 r7375, r6224, r6240; +} +{ +mul.f16x2 r7378, r7375, r7300; +} +{ +add.f16x2 r7381, r7372, r7378; +} +{ +sub.f16x2 r7384, r6214, r6262; +} +{ +mul.f16x2 r7387, r7384, r7298; +} +{ +sub.f16x2 r7390, r6230, r6246; +} +{ +mul.f16x2 r7393, r7390, r7302; +} +{ +add.f16x2 r7396, r7387, r7393; +} +{ +add.f16x2 %46, r7381, r7396; +} +{ +add.f16x2 r7402, r6208, r6256; +} +{ +mul.f16x2 r7405, r7402, r7300; +} +{ +add.f16x2 r7408, r4591, r7405; +} +{ +add.f16x2 r7411, r6224, r6240; +} +{ +mul.f16x2 r7414, r7411, r7304; +} +{ +add.f16x2 r7417, r7408, r7414; +} +{ +sub.f16x2 r7420, r6214, r6262; +} +{ +mul.f16x2 r7423, r7420, r7302; +} +{ +sub.f16x2 r7426, r6230, r6246; +} +{ +mul.f16x2 r7429, r7426, r7305; +} +{ +add.f16x2 r7432, r7423, r7429; +} +{ +sub.f16x2 %26, r7417, r7432; +} +{ +add.f16x2 r7438, r6208, r6256; +} +{ +mul.f16x2 r7441, r7438, r7300; +} +{ +add.f16x2 r7444, r4591, r7441; +} +{ +add.f16x2 r7447, r6224, r6240; +} +{ +mul.f16x2 r7450, r7447, r7304; +} +{ +add.f16x2 r7453, r7444, r7450; +} +{ +sub.f16x2 r7456, r6214, r6262; +} +{ +mul.f16x2 r7459, r7456, r7302; +} +{ +sub.f16x2 r7462, r6230, r6246; +} +{ +mul.f16x2 r7465, r7462, r7305; +} +{ +add.f16x2 r7468, r7459, r7465; +} +{ +add.f16x2 %36, r7453, r7468; +} +{ +add.f16x2 r7474, r6214, r6262; +} +{ +mul.f16x2 r7477, r7474, r7296; +} +{ 
+add.f16x2 r7480, r4735, r7477; +} +{ +add.f16x2 r7483, r6230, r6246; +} +{ +mul.f16x2 r7486, r7483, r7300; +} +{ +add.f16x2 r7489, r7480, r7486; +} +{ +sub.f16x2 r7492, r6208, r6256; +} +{ +mul.f16x2 r7495, r7492, r7298; +} +{ +sub.f16x2 r7498, r6224, r6240; +} +{ +mul.f16x2 r7501, r7498, r7302; +} +{ +add.f16x2 r7504, r7495, r7501; +} +{ +add.f16x2 %17, r7489, r7504; +} +{ +add.f16x2 r7510, r6214, r6262; +} +{ +mul.f16x2 r7513, r7510, r7296; +} +{ +add.f16x2 r7516, r4735, r7513; +} +{ +add.f16x2 r7519, r6230, r6246; +} +{ +mul.f16x2 r7522, r7519, r7300; +} +{ +add.f16x2 r7525, r7516, r7522; +} +{ +sub.f16x2 r7528, r6208, r6256; +} +{ +mul.f16x2 r7531, r7528, r7298; +} +{ +sub.f16x2 r7534, r6224, r6240; +} +{ +mul.f16x2 r7537, r7534, r7302; +} +{ +add.f16x2 r7540, r7531, r7537; +} +{ +sub.f16x2 %47, r7525, r7540; +} +{ +add.f16x2 r7546, r6214, r6262; +} +{ +mul.f16x2 r7549, r7546, r7300; +} +{ +add.f16x2 r7552, r4735, r7549; +} +{ +add.f16x2 r7555, r6230, r6246; +} +{ +mul.f16x2 r7558, r7555, r7304; +} +{ +add.f16x2 r7561, r7552, r7558; +} +{ +sub.f16x2 r7564, r6208, r6256; +} +{ +mul.f16x2 r7567, r7564, r7302; +} +{ +sub.f16x2 r7570, r6224, r6240; +} +{ +mul.f16x2 r7573, r7570, r7305; +} +{ +add.f16x2 r7576, r7567, r7573; +} +{ +add.f16x2 %27, r7561, r7576; +} +{ +add.f16x2 r7582, r6214, r6262; +} +{ +mul.f16x2 r7585, r7582, r7300; +} +{ +add.f16x2 r7588, r4735, r7585; +} +{ +add.f16x2 r7591, r6230, r6246; +} +{ +mul.f16x2 r7594, r7591, r7304; +} +{ +add.f16x2 r7597, r7588, r7594; +} +{ +sub.f16x2 r7600, r6208, r6256; +} +{ +mul.f16x2 r7603, r7600, r7302; +} +{ +sub.f16x2 r7606, r6224, r6240; +} +{ +mul.f16x2 r7609, r7606, r7305; +} +{ +add.f16x2 r7612, r7603, r7609; +} +{ +sub.f16x2 %37, r7597, r7612; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7618, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7619, {low, high}; +} +{ +neg.f16x2 r7620, r7619; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f478; +cvt.rn.f16.f32 high, f478; +mov.b32 r7622, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f480; +cvt.rn.f16.f32 high, f480; +mov.b32 r7623, {low, high}; +} +{ +neg.f16x2 r7624, r7623; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f482; +cvt.rn.f16.f32 high, f482; +mov.b32 r7626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f484; +cvt.rn.f16.f32 high, f484; +mov.b32 r7627, {low, high}; +} +{ +add.f16x2 r7628, r6272, r6320; +} +{ +add.f16x2 r7631, r4519, r7628; +} +{ +add.f16x2 r7634, r6288, r6304; +} +{ +add.f16x2 %8, r7631, r7634; +} +{ +add.f16x2 r7640, r6278, r6326; +} +{ +add.f16x2 r7643, r4663, r7640; +} +{ +add.f16x2 r7646, r6294, r6310; +} +{ +add.f16x2 %9, r7643, r7646; +} +{ +add.f16x2 r7652, r6272, r6320; +} +{ +mul.f16x2 r7655, r7652, r7618; +} +{ +add.f16x2 r7658, r4519, r7655; +} +{ +add.f16x2 r7661, r6288, r6304; +} +{ +mul.f16x2 r7664, r7661, r7622; +} +{ +add.f16x2 r7667, r7658, r7664; +} +{ +sub.f16x2 r7670, r6278, r6326; +} +{ +mul.f16x2 r7673, r7670, r7620; +} +{ +sub.f16x2 r7676, r6294, r6310; +} +{ +mul.f16x2 r7679, r7676, r7624; +} +{ +add.f16x2 r7682, r7673, r7679; +} +{ +sub.f16x2 %18, r7667, r7682; +} +{ +add.f16x2 r7688, r6272, r6320; +} +{ +mul.f16x2 r7691, r7688, r7618; +} +{ +add.f16x2 r7694, r4519, r7691; +} +{ +add.f16x2 r7697, r6288, r6304; +} +{ +mul.f16x2 r7700, r7697, r7622; +} +{ +add.f16x2 r7703, r7694, r7700; +} +{ +sub.f16x2 r7706, r6278, r6326; +} +{ +mul.f16x2 r7709, r7706, r7620; +} +{ +sub.f16x2 r7712, r6294, r6310; +} +{ +mul.f16x2 r7715, r7712, r7624; +} +{ +add.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 %48, r7703, r7718; +} +{ +add.f16x2 r7724, r6272, r6320; +} +{ +mul.f16x2 r7727, r7724, r7622; +} +{ +add.f16x2 r7730, r4519, r7727; +} +{ +add.f16x2 r7733, r6288, r6304; +} +{ +mul.f16x2 r7736, r7733, r7626; +} +{ +add.f16x2 r7739, r7730, r7736; +} +{ +sub.f16x2 r7742, r6278, r6326; +} +{ +mul.f16x2 r7745, r7742, r7624; +} +{ +sub.f16x2 
r7748, r6294, r6310; +} +{ +mul.f16x2 r7751, r7748, r7627; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +sub.f16x2 %28, r7739, r7754; +} +{ +add.f16x2 r7760, r6272, r6320; +} +{ +mul.f16x2 r7763, r7760, r7622; +} +{ +add.f16x2 r7766, r4519, r7763; +} +{ +add.f16x2 r7769, r6288, r6304; +} +{ +mul.f16x2 r7772, r7769, r7626; +} +{ +add.f16x2 r7775, r7766, r7772; +} +{ +sub.f16x2 r7778, r6278, r6326; +} +{ +mul.f16x2 r7781, r7778, r7624; +} +{ +sub.f16x2 r7784, r6294, r6310; +} +{ +mul.f16x2 r7787, r7784, r7627; +} +{ +add.f16x2 r7790, r7781, r7787; +} +{ +add.f16x2 %38, r7775, r7790; +} +{ +add.f16x2 r7796, r6278, r6326; +} +{ +mul.f16x2 r7799, r7796, r7618; +} +{ +add.f16x2 r7802, r4663, r7799; +} +{ +add.f16x2 r7805, r6294, r6310; +} +{ +mul.f16x2 r7808, r7805, r7622; +} +{ +add.f16x2 r7811, r7802, r7808; +} +{ +sub.f16x2 r7814, r6272, r6320; +} +{ +mul.f16x2 r7817, r7814, r7620; +} +{ +sub.f16x2 r7820, r6288, r6304; +} +{ +mul.f16x2 r7823, r7820, r7624; +} +{ +add.f16x2 r7826, r7817, r7823; +} +{ +add.f16x2 %19, r7811, r7826; +} +{ +add.f16x2 r7832, r6278, r6326; +} +{ +mul.f16x2 r7835, r7832, r7618; +} +{ +add.f16x2 r7838, r4663, r7835; +} +{ +add.f16x2 r7841, r6294, r6310; +} +{ +mul.f16x2 r7844, r7841, r7622; +} +{ +add.f16x2 r7847, r7838, r7844; +} +{ +sub.f16x2 r7850, r6272, r6320; +} +{ +mul.f16x2 r7853, r7850, r7620; +} +{ +sub.f16x2 r7856, r6288, r6304; +} +{ +mul.f16x2 r7859, r7856, r7624; +} +{ +add.f16x2 r7862, r7853, r7859; +} +{ +sub.f16x2 %49, r7847, r7862; +} +{ +add.f16x2 r7868, r6278, r6326; +} +{ +mul.f16x2 r7871, r7868, r7622; +} +{ +add.f16x2 r7874, r4663, r7871; +} +{ +add.f16x2 r7877, r6294, r6310; +} +{ +mul.f16x2 r7880, r7877, r7626; +} +{ +add.f16x2 r7883, r7874, r7880; +} +{ +sub.f16x2 r7886, r6272, r6320; +} +{ +mul.f16x2 r7889, r7886, r7624; +} +{ +sub.f16x2 r7892, r6288, r6304; +} +{ +mul.f16x2 r7895, r7892, r7627; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 %29, r7883, r7898; +} +{ +add.f16x2 r7904, r6278, r6326; +} +{ 
+mul.f16x2 r7907, r7904, r7622; +} +{ +add.f16x2 r7910, r4663, r7907; +} +{ +add.f16x2 r7913, r6294, r6310; +} +{ +mul.f16x2 r7916, r7913, r7626; +} +{ +add.f16x2 r7919, r7910, r7916; +} +{ +sub.f16x2 r7922, r6272, r6320; +} +{ +mul.f16x2 r7925, r7922, r7624; +} +{ +sub.f16x2 r7928, r6288, r6304; +} +{ +mul.f16x2 r7931, r7928, r7627; +} +{ +add.f16x2 r7934, r7925, r7931; +} +{ +sub.f16x2 %39, r7919, r7934; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), 
"=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[13].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1112, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<1765>; +.reg .b64 rd<8>; +mov.u32 r1742, %tid.x; +mov.f32 f82, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; 
+cvt.rn.f16.f32 high, f82; +mov.b32 r1, {low, high}; +} +mov.f32 f84, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f78, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r5, {low, high}; +} +mov.f32 f80, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, 
r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ 
+mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r1742, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1743, rd3; +mul.lo.s32 r1744, r1743, 125; +sub.s32 r1745, r1742, r1744; +cvt.rn.f32.u32 f85, r1745; +mul.f32 f86, f85, 0f3C24B5BE; +cos.approx.f32 f13, f86; +sin.approx.f32 f87, f86; +neg.f32 f14, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +mov.u32 r1746, %tid.y; +mov.u32 r1747, %10; +mad.lo.s32 r1748, r1746, 5000, r1747; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} +{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +mad.lo.s32 r1749, r1743, 5000, r1748; +barrier.sync 0; +mad.lo.s32 r1750, r1745, 40, r1749; +st.shared.v2.f32 [r1750], {r20, r32}; +st.shared.v2.f32 [r1750+8], {r333, r342}; +st.shared.v2.f32 [r1750+16], {r370, r379}; +st.shared.v2.f32 [r1750+24], {r407, r416}; +st.shared.v2.f32 [r1750+32], {r444, r453}; +barrier.sync 0; +shl.b32 r1751, r1745, 5; +sub.s32 r1752, r1750, r1751; +ld.shared.u32 r488, [r1752]; +ld.shared.u32 r500, [r1752+4]; +ld.shared.u32 r485, 
[r1752+1000]; +ld.shared.u32 r497, [r1752+1004]; +ld.shared.u32 r491, [r1752+2000]; +ld.shared.u32 r503, [r1752+2004]; +ld.shared.u32 r492, [r1752+3000]; +ld.shared.u32 r504, [r1752+3004]; +ld.shared.u32 r486, [r1752+4000]; +ld.shared.u32 r498, [r1752+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 
r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 
r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r1745, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1753, rd5; +cvt.rn.f32.u32 f88, r1753; +mul.f32 f89, f88, 0f3D4DE32E; +cos.approx.f32 f37, f89; +sin.approx.f32 f90, f89; +neg.f32 f38, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +mul.lo.s32 r1754, r1753, 5; +sub.s32 r1755, r1745, r1754; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ +fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} 
+{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +shl.b32 r1756, r1755, 3; +add.s32 r1757, r1749, r1756; +barrier.sync 0; +mad.lo.s32 r1758, r1753, 200, r1757; +st.shared.u32 [r1758], r493; 
+st.shared.u32 [r1758+4], r505; +st.shared.u32 [r1758+40], r806; +st.shared.u32 [r1758+44], r815; +st.shared.u32 [r1758+80], r843; +st.shared.u32 [r1758+84], r852; +st.shared.u32 [r1758+120], r880; +st.shared.u32 [r1758+124], r889; +st.shared.u32 [r1758+160], r917; +st.shared.u32 [r1758+164], r926; +barrier.sync 0; +ld.shared.u32 r961, [r1752]; +ld.shared.u32 r973, [r1752+4]; +ld.shared.u32 r958, [r1752+1000]; +ld.shared.u32 r970, [r1752+1004]; +ld.shared.u32 r964, [r1752+2000]; +ld.shared.u32 r976, [r1752+2004]; +ld.shared.u32 r965, [r1752+3000]; +ld.shared.u32 r977, [r1752+3004]; +ld.shared.u32 r959, [r1752+4000]; +ld.shared.u32 r971, [r1752+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, 
r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 r1014, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ +mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 r1086, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ +add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 r1122, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r1140, r1155; +} +{ +add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; 
+} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 r1194, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 r1230, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 r1266, r1248, r1263; +} +mul.wide.u32 rd6, r1745, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1759, rd7; +cvt.rn.f32.u32 f91, r1759; +mul.f32 f92, f91, 0f3E80ADFD; +cos.approx.f32 f61, f92; +sin.approx.f32 f93, f92; +neg.f32 f62, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1269, {low, high}; +} +mul.lo.s32 r1760, r1759, 25; +sub.s32 r1761, r1745, r1760; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1272, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1274, {high, high}; +} +{ +mul.f16x2 r1276, r1158, r1274; +} +{ +fma.rn.f16x2 r1279, r1014, r1272, r1276; +} +{ +mul.f16x2 r1283, r1014, r1274; +} +{ +neg.f16x2 r1286, r1283; +} +{ +fma.rn.f16x2 r1288, r1158, r1272, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1269; +mov.b32 r1292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1294, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1296, {low, high}; +} +{ +mul.f16x2 r1297, r1294, r1296; +} +{ +mul.f16x2 r1300, r1269, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1303, {high, low}; +} +{ +fma.rn.f16x2 r1305, r1297, r1303, r1300; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1309, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1311, {high, high}; +} +{ +mul.f16x2 r1313, r1230, r1311; +} +{ +fma.rn.f16x2 r1316, r1086, r1309, r1313; +} +{ +mul.f16x2 r1320, r1086, r1311; +} +{ +neg.f16x2 r1323, r1320; +} +{ +fma.rn.f16x2 r1325, r1230, r1309, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1329, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1331, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1331, r1333; +} +{ +mul.f16x2 r1337, r1305, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1340, {high, low}; +} +{ +fma.rn.f16x2 r1342, r1334, r1340, r1337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1348, {high, high}; +} +{ +mul.f16x2 r1350, r1266, r1348; +} +{ +fma.rn.f16x2 r1353, r1122, r1346, r1350; +} +{ +mul.f16x2 r1357, r1122, r1348; +} +{ +neg.f16x2 r1360, r1357; +} +{ +fma.rn.f16x2 r1362, r1266, r1346, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1366, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1368, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1368, 
r1370; +} +{ +mul.f16x2 r1374, r1342, r1366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1377, {high, low}; +} +{ +fma.rn.f16x2 r1379, r1371, r1377, r1374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1385, {high, high}; +} +{ +mul.f16x2 r1387, r1194, r1385; +} +{ +fma.rn.f16x2 r1390, r1050, r1383, r1387; +} +{ +mul.f16x2 r1394, r1050, r1385; +} +{ +neg.f16x2 r1397, r1394; +} +{ +fma.rn.f16x2 r1399, r1194, r1383, r1397; +} +shl.b32 r1762, r1761, 3; +add.s32 r1763, r1749, r1762; +barrier.sync 0; +mad.lo.s32 r1764, r1759, 1000, r1763; +st.shared.u32 [r1764], r966; +st.shared.u32 [r1764+4], r978; +st.shared.u32 [r1764+200], r1279; +st.shared.u32 [r1764+204], r1288; +st.shared.u32 [r1764+400], r1316; +st.shared.u32 [r1764+404], r1325; +st.shared.u32 [r1764+600], r1353; +st.shared.u32 [r1764+604], r1362; +st.shared.u32 [r1764+800], r1390; +st.shared.u32 [r1764+804], r1399; +barrier.sync 0; +ld.shared.u32 r1434, [r1752]; +ld.shared.u32 r1446, [r1752+4]; +ld.shared.u32 r1431, [r1752+1000]; +ld.shared.u32 r1443, [r1752+1004]; +ld.shared.u32 r1437, [r1752+2000]; +ld.shared.u32 r1449, [r1752+2004]; +ld.shared.u32 r1438, [r1752+3000]; +ld.shared.u32 r1450, [r1752+3004]; +ld.shared.u32 r1432, [r1752+4000]; +ld.shared.u32 r1444, [r1752+4004]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1421, {low, high}; +} +{ +neg.f16x2 r1422, r1421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1425, {low, high}; +} +{ +neg.f16x2 r1426, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1428, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1429, {low, high}; +} +{ +add.f16x2 r1430, r1431, r1432; +} +{ +add.f16x2 r1433, r1434, r1430; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +add.f16x2 %0, r1433, r1436; +} +{ +add.f16x2 r1442, r1443, r1444; +} +{ +add.f16x2 r1445, r1446, r1442; +} +{ +add.f16x2 r1448, r1449, r1450; +} +{ +add.f16x2 %1, r1445, r1448; +} +{ +add.f16x2 r1454, r1431, r1432; +} +{ +mul.f16x2 r1457, r1454, r1420; +} +{ +add.f16x2 r1460, r1434, r1457; +} +{ +add.f16x2 r1463, r1437, r1438; +} +{ +mul.f16x2 r1466, r1463, r1424; +} +{ +add.f16x2 r1469, r1460, r1466; +} +{ +sub.f16x2 r1472, r1443, r1444; +} +{ +mul.f16x2 r1475, r1472, r1422; +} +{ +sub.f16x2 r1478, r1449, r1450; +} +{ +mul.f16x2 r1481, r1478, r1426; +} +{ +add.f16x2 r1484, r1475, r1481; +} +{ +sub.f16x2 %2, r1469, r1484; +} +{ +add.f16x2 r1490, r1431, r1432; +} +{ +mul.f16x2 r1493, r1490, r1420; +} +{ +add.f16x2 r1496, r1434, r1493; +} +{ +add.f16x2 r1499, r1437, r1438; +} +{ +mul.f16x2 r1502, r1499, r1424; +} +{ +add.f16x2 r1505, r1496, r1502; +} +{ +sub.f16x2 r1508, r1443, r1444; +} +{ +mul.f16x2 r1511, r1508, r1422; +} +{ +sub.f16x2 r1514, r1449, r1450; +} +{ +mul.f16x2 r1517, r1514, r1426; +} +{ +add.f16x2 r1520, r1511, r1517; +} +{ +add.f16x2 %8, r1505, r1520; +} +{ +add.f16x2 r1526, r1431, r1432; +} +{ +mul.f16x2 r1529, r1526, r1424; +} +{ +add.f16x2 r1532, r1434, r1529; +} +{ +add.f16x2 r1535, r1437, r1438; +} +{ +mul.f16x2 r1538, r1535, r1428; +} +{ +add.f16x2 r1541, r1532, r1538; +} +{ +sub.f16x2 r1544, r1443, r1444; +} +{ +mul.f16x2 r1547, r1544, r1426; +} +{ +sub.f16x2 r1550, r1449, r1450; +} +{ +mul.f16x2 r1553, r1550, r1429; +} +{ +add.f16x2 r1556, r1547, r1553; +} +{ +sub.f16x2 %4, r1541, r1556; +} +{ +add.f16x2 r1562, r1431, r1432; +} +{ +mul.f16x2 r1565, r1562, r1424; +} +{ +add.f16x2 r1568, r1434, r1565; +} +{ +add.f16x2 r1571, r1437, r1438; +} +{ +mul.f16x2 r1574, r1571, r1428; +} +{ +add.f16x2 r1577, r1568, r1574; +} +{ 
+sub.f16x2 r1580, r1443, r1444; +} +{ +mul.f16x2 r1583, r1580, r1426; +} +{ +sub.f16x2 r1586, r1449, r1450; +} +{ +mul.f16x2 r1589, r1586, r1429; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 %6, r1577, r1592; +} +{ +add.f16x2 r1598, r1443, r1444; +} +{ +mul.f16x2 r1601, r1598, r1420; +} +{ +add.f16x2 r1604, r1446, r1601; +} +{ +add.f16x2 r1607, r1449, r1450; +} +{ +mul.f16x2 r1610, r1607, r1424; +} +{ +add.f16x2 r1613, r1604, r1610; +} +{ +sub.f16x2 r1616, r1431, r1432; +} +{ +mul.f16x2 r1619, r1616, r1422; +} +{ +sub.f16x2 r1622, r1437, r1438; +} +{ +mul.f16x2 r1625, r1622, r1426; +} +{ +add.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 %3, r1613, r1628; +} +{ +add.f16x2 r1634, r1443, r1444; +} +{ +mul.f16x2 r1637, r1634, r1420; +} +{ +add.f16x2 r1640, r1446, r1637; +} +{ +add.f16x2 r1643, r1449, r1450; +} +{ +mul.f16x2 r1646, r1643, r1424; +} +{ +add.f16x2 r1649, r1640, r1646; +} +{ +sub.f16x2 r1652, r1431, r1432; +} +{ +mul.f16x2 r1655, r1652, r1422; +} +{ +sub.f16x2 r1658, r1437, r1438; +} +{ +mul.f16x2 r1661, r1658, r1426; +} +{ +add.f16x2 r1664, r1655, r1661; +} +{ +sub.f16x2 %9, r1649, r1664; +} +{ +add.f16x2 r1670, r1443, r1444; +} +{ +mul.f16x2 r1673, r1670, r1424; +} +{ +add.f16x2 r1676, r1446, r1673; +} +{ +add.f16x2 r1679, r1449, r1450; +} +{ +mul.f16x2 r1682, r1679, r1428; +} +{ +add.f16x2 r1685, r1676, r1682; +} +{ +sub.f16x2 r1688, r1431, r1432; +} +{ +mul.f16x2 r1691, r1688, r1426; +} +{ +sub.f16x2 r1694, r1437, r1438; +} +{ +mul.f16x2 r1697, r1694, r1429; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +add.f16x2 %5, r1685, r1700; +} +{ +add.f16x2 r1706, r1443, r1444; +} +{ +mul.f16x2 r1709, r1706, r1424; +} +{ +add.f16x2 r1712, r1446, r1709; +} +{ +add.f16x2 r1715, r1449, r1450; +} +{ +mul.f16x2 r1718, r1715, r1428; +} +{ +add.f16x2 r1721, r1712, r1718; +} +{ +sub.f16x2 r1724, r1431, r1432; +} +{ +mul.f16x2 r1727, r1724, r1426; +} +{ +sub.f16x2 r1730, r1437, r1438; +} +{ +mul.f16x2 r1733, r1730, r1429; +} +{ +add.f16x2 r1736, r1727, r1733; +} 
+{ +sub.f16x2 %7, r1721, r1736; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1113, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<94>; +.reg .b32 r<1765>; +.reg .b64 rd<8>; +mov.u32 r1742, %tid.y; +mov.u32 r1743, %10; +mad.lo.s32 r1744, r1742, 2500, r1743; +mov.u32 r1745, %tid.x; +mov.f32 f82, 0f3E9E377A; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1, {low, high}; +} +mov.f32 f84, 0fBF737871; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +mov.f32 f78, 0fBF4F1BBD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r5, {low, high}; +} +mov.f32 f80, 0fBF167918; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r6, {low, high}; +} +{ +neg.f16x2 r7, r6; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r9, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r10, {low, high}; +} +{ +add.f16x2 r11, %13, %19; +} +{ +add.f16x2 r14, %11, r11; +} +{ +add.f16x2 r17, %15, %17; +} +{ +add.f16x2 r20, r14, r17; +} +{ +add.f16x2 r23, %14, %20; +} +{ +add.f16x2 
r26, %12, r23; +} +{ +add.f16x2 r29, %16, %18; +} +{ +add.f16x2 r32, r26, r29; +} +{ +add.f16x2 r35, %13, %19; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %11, r38; +} +{ +add.f16x2 r44, %15, %17; +} +{ +mul.f16x2 r47, r44, r5; +} +{ +add.f16x2 r50, r41, r47; +} +{ +sub.f16x2 r53, %14, %20; +} +{ +mul.f16x2 r56, r53, r3; +} +{ +sub.f16x2 r59, %16, %18; +} +{ +mul.f16x2 r62, r59, r7; +} +{ +add.f16x2 r65, r56, r62; +} +{ +sub.f16x2 r68, r50, r65; +} +{ +add.f16x2 r71, %13, %19; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %11, r74; +} +{ +add.f16x2 r80, %15, %17; +} +{ +mul.f16x2 r83, r80, r5; +} +{ +add.f16x2 r86, r77, r83; +} +{ +sub.f16x2 r89, %14, %20; +} +{ +mul.f16x2 r92, r89, r3; +} +{ +sub.f16x2 r95, %16, %18; +} +{ +mul.f16x2 r98, r95, r7; +} +{ +add.f16x2 r101, r92, r98; +} +{ +add.f16x2 r104, r86, r101; +} +{ +add.f16x2 r107, %13, %19; +} +{ +mul.f16x2 r110, r107, r5; +} +{ +add.f16x2 r113, %11, r110; +} +{ +add.f16x2 r116, %15, %17; +} +{ +mul.f16x2 r119, r116, r9; +} +{ +add.f16x2 r122, r113, r119; +} +{ +sub.f16x2 r125, %14, %20; +} +{ +mul.f16x2 r128, r125, r7; +} +{ +sub.f16x2 r131, %16, %18; +} +{ +mul.f16x2 r134, r131, r10; +} +{ +add.f16x2 r137, r128, r134; +} +{ +sub.f16x2 r140, r122, r137; +} +{ +add.f16x2 r143, %13, %19; +} +{ +mul.f16x2 r146, r143, r5; +} +{ +add.f16x2 r149, %11, r146; +} +{ +add.f16x2 r152, %15, %17; +} +{ +mul.f16x2 r155, r152, r9; +} +{ +add.f16x2 r158, r149, r155; +} +{ +sub.f16x2 r161, %14, %20; +} +{ +mul.f16x2 r164, r161, r7; +} +{ +sub.f16x2 r167, %16, %18; +} +{ +mul.f16x2 r170, r167, r10; +} +{ +add.f16x2 r173, r164, r170; +} +{ +add.f16x2 r176, r158, r173; +} +{ +add.f16x2 r179, %14, %20; +} +{ +mul.f16x2 r182, r179, r1; +} +{ +add.f16x2 r185, %12, r182; +} +{ +add.f16x2 r188, %16, %18; +} +{ +mul.f16x2 r191, r188, r5; +} +{ +add.f16x2 r194, r185, r191; +} +{ +sub.f16x2 r197, %13, %19; +} +{ +mul.f16x2 r200, r197, r3; +} +{ +sub.f16x2 r203, %15, %17; +} +{ +mul.f16x2 r206, r203, r7; +} +{ +add.f16x2 
r209, r200, r206; +} +{ +add.f16x2 r212, r194, r209; +} +{ +add.f16x2 r215, %14, %20; +} +{ +mul.f16x2 r218, r215, r1; +} +{ +add.f16x2 r221, %12, r218; +} +{ +add.f16x2 r224, %16, %18; +} +{ +mul.f16x2 r227, r224, r5; +} +{ +add.f16x2 r230, r221, r227; +} +{ +sub.f16x2 r233, %13, %19; +} +{ +mul.f16x2 r236, r233, r3; +} +{ +sub.f16x2 r239, %15, %17; +} +{ +mul.f16x2 r242, r239, r7; +} +{ +add.f16x2 r245, r236, r242; +} +{ +sub.f16x2 r248, r230, r245; +} +{ +add.f16x2 r251, %14, %20; +} +{ +mul.f16x2 r254, r251, r5; +} +{ +add.f16x2 r257, %12, r254; +} +{ +add.f16x2 r260, %16, %18; +} +{ +mul.f16x2 r263, r260, r9; +} +{ +add.f16x2 r266, r257, r263; +} +{ +sub.f16x2 r269, %13, %19; +} +{ +mul.f16x2 r272, r269, r7; +} +{ +sub.f16x2 r275, %15, %17; +} +{ +mul.f16x2 r278, r275, r10; +} +{ +add.f16x2 r281, r272, r278; +} +{ +add.f16x2 r284, r266, r281; +} +{ +add.f16x2 r287, %14, %20; +} +{ +mul.f16x2 r290, r287, r5; +} +{ +add.f16x2 r293, %12, r290; +} +{ +add.f16x2 r296, %16, %18; +} +{ +mul.f16x2 r299, r296, r9; +} +{ +add.f16x2 r302, r293, r299; +} +{ +sub.f16x2 r305, %13, %19; +} +{ +mul.f16x2 r308, r305, r7; +} +{ +sub.f16x2 r311, %15, %17; +} +{ +mul.f16x2 r314, r311, r10; +} +{ +add.f16x2 r317, r308, r314; +} +{ +sub.f16x2 r320, r302, r317; +} +mul.wide.u32 rd2, r1745, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r1746, rd3; +mul.lo.s32 r1747, r1746, 125; +sub.s32 r1748, r1745, r1747; +mad.lo.s32 r1749, r1746, 2500, r1744; +cvt.rn.f32.u32 f85, r1748; +mul.f32 f86, f85, 0f3C24B5BE; +cos.approx.f32 f13, f86; +sin.approx.f32 f87, f86; +neg.f32 f14, f87; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r323, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r326, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r328, {high, high}; +} +{ +mul.f16x2 r330, r212, r328; +} +{ +fma.rn.f16x2 r333, r68, r326, r330; +} +{ +mul.f16x2 r337, r68, r328; +} +{ +neg.f16x2 r340, r337; +} 
+{ +fma.rn.f16x2 r342, r212, r326, r340; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r348, {high, high}; +} +mov.f32 f69, 0fBF800000; +mov.f32 f70, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r350, {low, high}; +} +{ +mul.f16x2 r351, r348, r350; +} +{ +mul.f16x2 r354, r323, r346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r357, {high, low}; +} +{ +fma.rn.f16x2 r359, r351, r357, r354; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r363, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r365, {high, high}; +} +{ +mul.f16x2 r367, r284, r365; +} +{ +fma.rn.f16x2 r370, r140, r363, r367; +} +{ +mul.f16x2 r374, r140, r365; +} +{ +neg.f16x2 r377, r374; +} +{ +fma.rn.f16x2 r379, r284, r363, r377; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r385, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r387, {low, high}; +} +{ +mul.f16x2 r388, r385, r387; +} +{ +mul.f16x2 r391, r359, r383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r359; +mov.b32 r394, {high, low}; +} +{ +fma.rn.f16x2 r396, r388, r394, r391; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r400, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r402, {high, high}; +} +{ +mul.f16x2 r404, r320, r402; +} +{ +fma.rn.f16x2 r407, r176, r400, r404; +} +{ +mul.f16x2 r411, r176, r402; +} +{ +neg.f16x2 r414, r411; +} +{ +fma.rn.f16x2 r416, r320, r400, r414; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r420, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r323; +mov.b32 r422, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, 
f70; +mov.b32 r424, {low, high}; +} +{ +mul.f16x2 r425, r422, r424; +} +{ +mul.f16x2 r428, r396, r420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r396; +mov.b32 r431, {high, low}; +} +{ +fma.rn.f16x2 r433, r425, r431, r428; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r437, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r433; +mov.b32 r439, {high, high}; +} +{ +mul.f16x2 r441, r248, r439; +} +{ +fma.rn.f16x2 r444, r104, r437, r441; +} +{ +mul.f16x2 r448, r104, r439; +} +{ +neg.f16x2 r451, r448; +} +{ +fma.rn.f16x2 r453, r248, r437, r451; +} +barrier.sync 0; +mad.lo.s32 r1750, r1748, 20, r1749; +st.shared.u32 [r1750], r20; +st.shared.u32 [r1750+4], r333; +st.shared.u32 [r1750+8], r370; +st.shared.u32 [r1750+12], r407; +st.shared.u32 [r1750+16], r444; +barrier.sync 0; +shl.b32 r1751, r1748, 4; +sub.s32 r1752, r1750, r1751; +ld.shared.u32 r488, [r1752]; +ld.shared.u32 r485, [r1752+500]; +ld.shared.u32 r491, [r1752+1000]; +ld.shared.u32 r492, [r1752+1500]; +ld.shared.u32 r486, [r1752+2000]; +barrier.sync 0; +st.shared.u32 [r1750], r32; +st.shared.u32 [r1750+4], r342; +st.shared.u32 [r1750+8], r379; +st.shared.u32 [r1750+12], r416; +st.shared.u32 [r1750+16], r453; +barrier.sync 0; +ld.shared.u32 r500, [r1752]; +ld.shared.u32 r497, [r1752+500]; +ld.shared.u32 r503, [r1752+1000]; +ld.shared.u32 r504, [r1752+1500]; +ld.shared.u32 r498, [r1752+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r474, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r475, {low, high}; +} +{ +neg.f16x2 r476, r475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r478, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r479, {low, high}; +} +{ +neg.f16x2 r480, r479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r482, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r483, {low, high}; +} +{ +add.f16x2 r484, r485, r486; +} +{ +add.f16x2 r487, r488, r484; +} +{ +add.f16x2 r490, r491, r492; +} +{ +add.f16x2 r493, r487, r490; +} +{ +add.f16x2 r496, r497, r498; +} +{ +add.f16x2 r499, r500, r496; +} +{ +add.f16x2 r502, r503, r504; +} +{ +add.f16x2 r505, r499, r502; +} +{ +add.f16x2 r508, r485, r486; +} +{ +mul.f16x2 r511, r508, r474; +} +{ +add.f16x2 r514, r488, r511; +} +{ +add.f16x2 r517, r491, r492; +} +{ +mul.f16x2 r520, r517, r478; +} +{ +add.f16x2 r523, r514, r520; +} +{ +sub.f16x2 r526, r497, r498; +} +{ +mul.f16x2 r529, r526, r476; +} +{ +sub.f16x2 r532, r503, r504; +} +{ +mul.f16x2 r535, r532, r480; +} +{ +add.f16x2 r538, r529, r535; +} +{ +sub.f16x2 r541, r523, r538; +} +{ +add.f16x2 r544, r485, r486; +} +{ +mul.f16x2 r547, r544, r474; +} +{ +add.f16x2 r550, r488, r547; +} +{ +add.f16x2 r553, r491, r492; +} +{ +mul.f16x2 r556, r553, r478; +} +{ +add.f16x2 r559, r550, r556; +} +{ +sub.f16x2 r562, r497, r498; +} +{ +mul.f16x2 r565, r562, r476; +} +{ +sub.f16x2 r568, r503, r504; +} +{ +mul.f16x2 r571, r568, r480; +} +{ +add.f16x2 r574, r565, r571; +} +{ +add.f16x2 r577, r559, r574; +} +{ +add.f16x2 r580, r485, r486; +} +{ +mul.f16x2 r583, r580, r478; +} +{ +add.f16x2 r586, r488, r583; +} +{ +add.f16x2 r589, r491, r492; +} +{ +mul.f16x2 r592, r589, r482; +} +{ +add.f16x2 r595, r586, r592; +} +{ +sub.f16x2 r598, r497, r498; +} +{ +mul.f16x2 r601, r598, r480; +} +{ +sub.f16x2 r604, r503, r504; +} +{ +mul.f16x2 r607, r604, r483; +} +{ +add.f16x2 r610, r601, r607; +} +{ +sub.f16x2 r613, r595, r610; +} +{ +add.f16x2 r616, r485, r486; +} +{ +mul.f16x2 r619, r616, r478; +} +{ +add.f16x2 r622, r488, r619; +} +{ +add.f16x2 r625, r491, r492; +} +{ +mul.f16x2 r628, r625, r482; +} +{ +add.f16x2 r631, r622, r628; +} +{ +sub.f16x2 r634, r497, r498; +} +{ +mul.f16x2 r637, r634, r480; +} +{ +sub.f16x2 r640, r503, r504; +} +{ +mul.f16x2 r643, r640, 
r483; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, r631, r646; +} +{ +add.f16x2 r652, r497, r498; +} +{ +mul.f16x2 r655, r652, r474; +} +{ +add.f16x2 r658, r500, r655; +} +{ +add.f16x2 r661, r503, r504; +} +{ +mul.f16x2 r664, r661, r478; +} +{ +add.f16x2 r667, r658, r664; +} +{ +sub.f16x2 r670, r485, r486; +} +{ +mul.f16x2 r673, r670, r476; +} +{ +sub.f16x2 r676, r491, r492; +} +{ +mul.f16x2 r679, r676, r480; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r667, r682; +} +{ +add.f16x2 r688, r497, r498; +} +{ +mul.f16x2 r691, r688, r474; +} +{ +add.f16x2 r694, r500, r691; +} +{ +add.f16x2 r697, r503, r504; +} +{ +mul.f16x2 r700, r697, r478; +} +{ +add.f16x2 r703, r694, r700; +} +{ +sub.f16x2 r706, r485, r486; +} +{ +mul.f16x2 r709, r706, r476; +} +{ +sub.f16x2 r712, r491, r492; +} +{ +mul.f16x2 r715, r712, r480; +} +{ +add.f16x2 r718, r709, r715; +} +{ +sub.f16x2 r721, r703, r718; +} +{ +add.f16x2 r724, r497, r498; +} +{ +mul.f16x2 r727, r724, r478; +} +{ +add.f16x2 r730, r500, r727; +} +{ +add.f16x2 r733, r503, r504; +} +{ +mul.f16x2 r736, r733, r482; +} +{ +add.f16x2 r739, r730, r736; +} +{ +sub.f16x2 r742, r485, r486; +} +{ +mul.f16x2 r745, r742, r480; +} +{ +sub.f16x2 r748, r491, r492; +} +{ +mul.f16x2 r751, r748, r483; +} +{ +add.f16x2 r754, r745, r751; +} +{ +add.f16x2 r757, r739, r754; +} +{ +add.f16x2 r760, r497, r498; +} +{ +mul.f16x2 r763, r760, r478; +} +{ +add.f16x2 r766, r500, r763; +} +{ +add.f16x2 r769, r503, r504; +} +{ +mul.f16x2 r772, r769, r482; +} +{ +add.f16x2 r775, r766, r772; +} +{ +sub.f16x2 r778, r485, r486; +} +{ +mul.f16x2 r781, r778, r480; +} +{ +sub.f16x2 r784, r491, r492; +} +{ +mul.f16x2 r787, r784, r483; +} +{ +add.f16x2 r790, r781, r787; +} +{ +sub.f16x2 r793, r775, r790; +} +mul.wide.u32 rd4, r1748, -858993459; +shr.u64 rd5, rd4, 34; +cvt.u32.u64 r1753, rd5; +mul.lo.s32 r1754, r1753, 5; +sub.s32 r1755, r1748, r1754; +shl.b32 r1756, r1755, 2; +add.s32 r1757, r1749, r1756; +cvt.rn.f32.u32 f88, r1753; +mul.f32 
f89, f88, 0f3D4DE32E; +cos.approx.f32 f37, f89; +sin.approx.f32 f90, f89; +neg.f32 f38, f90; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f37; +cvt.rn.f16.f32 high, f38; +mov.b32 r796, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r801, {high, high}; +} +{ +mul.f16x2 r803, r685, r801; +} +{ +fma.rn.f16x2 r806, r541, r799, r803; +} +{ +mul.f16x2 r810, r541, r801; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r685, r799, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r819, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r821, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r823, {low, high}; +} +{ +mul.f16x2 r824, r821, r823; +} +{ +mul.f16x2 r827, r796, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r830, {high, low}; +} +{ +fma.rn.f16x2 r832, r824, r830, r827; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r838, {high, high}; +} +{ +mul.f16x2 r840, r757, r838; +} +{ +fma.rn.f16x2 r843, r613, r836, r840; +} +{ +mul.f16x2 r847, r613, r838; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r757, r836, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r856, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r858, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r860, {low, high}; +} +{ +mul.f16x2 r861, r858, r860; +} +{ +mul.f16x2 r864, r832, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r832; +mov.b32 r867, {high, low}; +} +{ +fma.rn.f16x2 r869, r861, r867, r864; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r873, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; 
+mov.b32 r875, {high, high}; +} +{ +mul.f16x2 r877, r793, r875; +} +{ +fma.rn.f16x2 r880, r649, r873, r877; +} +{ +mul.f16x2 r884, r649, r875; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r793, r873, r887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r893, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r796; +mov.b32 r895, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r897, {low, high}; +} +{ +mul.f16x2 r898, r895, r897; +} +{ +mul.f16x2 r901, r869, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r869; +mov.b32 r904, {high, low}; +} +{ +fma.rn.f16x2 r906, r898, r904, r901; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r910, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r906; +mov.b32 r912, {high, high}; +} +{ +mul.f16x2 r914, r721, r912; +} +{ +fma.rn.f16x2 r917, r577, r910, r914; +} +{ +mul.f16x2 r921, r577, r912; +} +{ +neg.f16x2 r924, r921; +} +{ +fma.rn.f16x2 r926, r721, r910, r924; +} +barrier.sync 0; +mad.lo.s32 r1758, r1753, 100, r1757; +st.shared.u32 [r1758], r493; +st.shared.u32 [r1758+20], r806; +st.shared.u32 [r1758+40], r843; +st.shared.u32 [r1758+60], r880; +st.shared.u32 [r1758+80], r917; +barrier.sync 0; +ld.shared.u32 r961, [r1752]; +ld.shared.u32 r958, [r1752+500]; +ld.shared.u32 r964, [r1752+1000]; +ld.shared.u32 r965, [r1752+1500]; +ld.shared.u32 r959, [r1752+2000]; +barrier.sync 0; +st.shared.u32 [r1758], r505; +st.shared.u32 [r1758+20], r815; +st.shared.u32 [r1758+40], r852; +st.shared.u32 [r1758+60], r889; +st.shared.u32 [r1758+80], r926; +barrier.sync 0; +ld.shared.u32 r973, [r1752]; +ld.shared.u32 r970, [r1752+500]; +ld.shared.u32 r976, [r1752+1000]; +ld.shared.u32 r977, [r1752+1500]; +ld.shared.u32 r971, [r1752+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, 
f84; +mov.b32 r948, {low, high}; +} +{ +neg.f16x2 r949, r948; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r951, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r952, {low, high}; +} +{ +neg.f16x2 r953, r952; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r956, {low, high}; +} +{ +add.f16x2 r957, r958, r959; +} +{ +add.f16x2 r960, r961, r957; +} +{ +add.f16x2 r963, r964, r965; +} +{ +add.f16x2 r966, r960, r963; +} +{ +add.f16x2 r969, r970, r971; +} +{ +add.f16x2 r972, r973, r969; +} +{ +add.f16x2 r975, r976, r977; +} +{ +add.f16x2 r978, r972, r975; +} +{ +add.f16x2 r981, r958, r959; +} +{ +mul.f16x2 r984, r981, r947; +} +{ +add.f16x2 r987, r961, r984; +} +{ +add.f16x2 r990, r964, r965; +} +{ +mul.f16x2 r993, r990, r951; +} +{ +add.f16x2 r996, r987, r993; +} +{ +sub.f16x2 r999, r970, r971; +} +{ +mul.f16x2 r1002, r999, r949; +} +{ +sub.f16x2 r1005, r976, r977; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +sub.f16x2 r1014, r996, r1011; +} +{ +add.f16x2 r1017, r958, r959; +} +{ +mul.f16x2 r1020, r1017, r947; +} +{ +add.f16x2 r1023, r961, r1020; +} +{ +add.f16x2 r1026, r964, r965; +} +{ +mul.f16x2 r1029, r1026, r951; +} +{ +add.f16x2 r1032, r1023, r1029; +} +{ +sub.f16x2 r1035, r970, r971; +} +{ +mul.f16x2 r1038, r1035, r949; +} +{ +sub.f16x2 r1041, r976, r977; +} +{ +mul.f16x2 r1044, r1041, r953; +} +{ +add.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r1032, r1047; +} +{ +add.f16x2 r1053, r958, r959; +} +{ +mul.f16x2 r1056, r1053, r951; +} +{ +add.f16x2 r1059, r961, r1056; +} +{ +add.f16x2 r1062, r964, r965; +} +{ +mul.f16x2 r1065, r1062, r955; +} +{ +add.f16x2 r1068, r1059, r1065; +} +{ +sub.f16x2 r1071, r970, r971; +} +{ +mul.f16x2 r1074, r1071, r953; +} +{ +sub.f16x2 r1077, 
r976, r977; +} +{ +mul.f16x2 r1080, r1077, r956; +} +{ +add.f16x2 r1083, r1074, r1080; +} +{ +sub.f16x2 r1086, r1068, r1083; +} +{ +add.f16x2 r1089, r958, r959; +} +{ +mul.f16x2 r1092, r1089, r951; +} +{ +add.f16x2 r1095, r961, r1092; +} +{ +add.f16x2 r1098, r964, r965; +} +{ +mul.f16x2 r1101, r1098, r955; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +sub.f16x2 r1107, r970, r971; +} +{ +mul.f16x2 r1110, r1107, r953; +} +{ +sub.f16x2 r1113, r976, r977; +} +{ +mul.f16x2 r1116, r1113, r956; +} +{ +add.f16x2 r1119, r1110, r1116; +} +{ +add.f16x2 r1122, r1104, r1119; +} +{ +add.f16x2 r1125, r970, r971; +} +{ +mul.f16x2 r1128, r1125, r947; +} +{ +add.f16x2 r1131, r973, r1128; +} +{ +add.f16x2 r1134, r976, r977; +} +{ +mul.f16x2 r1137, r1134, r951; +} +{ +add.f16x2 r1140, r1131, r1137; +} +{ +sub.f16x2 r1143, r958, r959; +} +{ +mul.f16x2 r1146, r1143, r949; +} +{ +sub.f16x2 r1149, r964, r965; +} +{ +mul.f16x2 r1152, r1149, r953; +} +{ +add.f16x2 r1155, r1146, r1152; +} +{ +add.f16x2 r1158, r1140, r1155; +} +{ +add.f16x2 r1161, r970, r971; +} +{ +mul.f16x2 r1164, r1161, r947; +} +{ +add.f16x2 r1167, r973, r1164; +} +{ +add.f16x2 r1170, r976, r977; +} +{ +mul.f16x2 r1173, r1170, r951; +} +{ +add.f16x2 r1176, r1167, r1173; +} +{ +sub.f16x2 r1179, r958, r959; +} +{ +mul.f16x2 r1182, r1179, r949; +} +{ +sub.f16x2 r1185, r964, r965; +} +{ +mul.f16x2 r1188, r1185, r953; +} +{ +add.f16x2 r1191, r1182, r1188; +} +{ +sub.f16x2 r1194, r1176, r1191; +} +{ +add.f16x2 r1197, r970, r971; +} +{ +mul.f16x2 r1200, r1197, r951; +} +{ +add.f16x2 r1203, r973, r1200; +} +{ +add.f16x2 r1206, r976, r977; +} +{ +mul.f16x2 r1209, r1206, r955; +} +{ +add.f16x2 r1212, r1203, r1209; +} +{ +sub.f16x2 r1215, r958, r959; +} +{ +mul.f16x2 r1218, r1215, r953; +} +{ +sub.f16x2 r1221, r964, r965; +} +{ +mul.f16x2 r1224, r1221, r956; +} +{ +add.f16x2 r1227, r1218, r1224; +} +{ +add.f16x2 r1230, r1212, r1227; +} +{ +add.f16x2 r1233, r970, r971; +} +{ +mul.f16x2 r1236, r1233, r951; +} +{ +add.f16x2 r1239, r973, 
r1236; +} +{ +add.f16x2 r1242, r976, r977; +} +{ +mul.f16x2 r1245, r1242, r955; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +sub.f16x2 r1251, r958, r959; +} +{ +mul.f16x2 r1254, r1251, r953; +} +{ +sub.f16x2 r1257, r964, r965; +} +{ +mul.f16x2 r1260, r1257, r956; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +sub.f16x2 r1266, r1248, r1263; +} +mul.wide.u32 rd6, r1748, 1374389535; +shr.u64 rd7, rd6, 35; +cvt.u32.u64 r1759, rd7; +mul.lo.s32 r1760, r1759, 25; +sub.s32 r1761, r1748, r1760; +shl.b32 r1762, r1761, 2; +add.s32 r1763, r1749, r1762; +cvt.rn.f32.u32 f91, r1759; +mul.f32 f92, f91, 0f3E80ADFD; +cos.approx.f32 f61, f92; +sin.approx.f32 f93, f92; +neg.f32 f62, f93; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f61; +cvt.rn.f16.f32 high, f62; +mov.b32 r1269, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1272, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1274, {high, high}; +} +{ +mul.f16x2 r1276, r1158, r1274; +} +{ +fma.rn.f16x2 r1279, r1014, r1272, r1276; +} +{ +mul.f16x2 r1283, r1014, r1274; +} +{ +neg.f16x2 r1286, r1283; +} +{ +fma.rn.f16x2 r1288, r1158, r1272, r1286; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1294, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1296, {low, high}; +} +{ +mul.f16x2 r1297, r1294, r1296; +} +{ +mul.f16x2 r1300, r1269, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1303, {high, low}; +} +{ +fma.rn.f16x2 r1305, r1297, r1303, r1300; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1309, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1311, {high, high}; +} +{ +mul.f16x2 r1313, r1230, r1311; +} +{ +fma.rn.f16x2 r1316, r1086, r1309, r1313; +} +{ +mul.f16x2 r1320, r1086, r1311; +} +{ +neg.f16x2 r1323, r1320; +} +{ +fma.rn.f16x2 
r1325, r1230, r1309, r1323; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1329, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1331, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1333, {low, high}; +} +{ +mul.f16x2 r1334, r1331, r1333; +} +{ +mul.f16x2 r1337, r1305, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1305; +mov.b32 r1340, {high, low}; +} +{ +fma.rn.f16x2 r1342, r1334, r1340, r1337; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1346, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1348, {high, high}; +} +{ +mul.f16x2 r1350, r1266, r1348; +} +{ +fma.rn.f16x2 r1353, r1122, r1346, r1350; +} +{ +mul.f16x2 r1357, r1122, r1348; +} +{ +neg.f16x2 r1360, r1357; +} +{ +fma.rn.f16x2 r1362, r1266, r1346, r1360; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1366, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1269; +mov.b32 r1368, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f69; +cvt.rn.f16.f32 high, f70; +mov.b32 r1370, {low, high}; +} +{ +mul.f16x2 r1371, r1368, r1370; +} +{ +mul.f16x2 r1374, r1342, r1366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1342; +mov.b32 r1377, {high, low}; +} +{ +fma.rn.f16x2 r1379, r1371, r1377, r1374; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1383, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1379; +mov.b32 r1385, {high, high}; +} +{ +mul.f16x2 r1387, r1194, r1385; +} +{ +fma.rn.f16x2 r1390, r1050, r1383, r1387; +} +{ +mul.f16x2 r1394, r1050, r1385; +} +{ +neg.f16x2 r1397, r1394; +} +{ +fma.rn.f16x2 r1399, r1194, r1383, r1397; +} +barrier.sync 0; +mad.lo.s32 r1764, r1759, 500, r1763; +st.shared.u32 [r1764], r966; +st.shared.u32 [r1764+100], r1279; +st.shared.u32 [r1764+200], r1316; +st.shared.u32 [r1764+300], r1353; +st.shared.u32 [r1764+400], r1390; 
+barrier.sync 0; +ld.shared.u32 r1434, [r1752]; +ld.shared.u32 r1431, [r1752+500]; +ld.shared.u32 r1437, [r1752+1000]; +ld.shared.u32 r1438, [r1752+1500]; +ld.shared.u32 r1432, [r1752+2000]; +barrier.sync 0; +st.shared.u32 [r1764], r978; +st.shared.u32 [r1764+100], r1288; +st.shared.u32 [r1764+200], r1325; +st.shared.u32 [r1764+300], r1362; +st.shared.u32 [r1764+400], r1399; +barrier.sync 0; +ld.shared.u32 r1446, [r1752]; +ld.shared.u32 r1443, [r1752+500]; +ld.shared.u32 r1449, [r1752+1000]; +ld.shared.u32 r1450, [r1752+1500]; +ld.shared.u32 r1444, [r1752+2000]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1420, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1421, {low, high}; +} +{ +neg.f16x2 r1422, r1421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r1424, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r1425, {low, high}; +} +{ +neg.f16x2 r1426, r1425; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r1429, {low, high}; +} +{ +add.f16x2 r1430, r1431, r1432; +} +{ +add.f16x2 r1433, r1434, r1430; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +add.f16x2 %0, r1433, r1436; +} +{ +add.f16x2 r1442, r1443, r1444; +} +{ +add.f16x2 r1445, r1446, r1442; +} +{ +add.f16x2 r1448, r1449, r1450; +} +{ +add.f16x2 %1, r1445, r1448; +} +{ +add.f16x2 r1454, r1431, r1432; +} +{ +mul.f16x2 r1457, r1454, r1420; +} +{ +add.f16x2 r1460, r1434, r1457; +} +{ +add.f16x2 r1463, r1437, r1438; +} +{ +mul.f16x2 r1466, r1463, r1424; +} +{ +add.f16x2 r1469, r1460, r1466; +} +{ +sub.f16x2 r1472, r1443, r1444; +} +{ +mul.f16x2 r1475, r1472, r1422; +} +{ +sub.f16x2 r1478, r1449, r1450; +} +{ +mul.f16x2 r1481, r1478, r1426; +} +{ +add.f16x2 r1484, r1475, 
r1481; +} +{ +sub.f16x2 %2, r1469, r1484; +} +{ +add.f16x2 r1490, r1431, r1432; +} +{ +mul.f16x2 r1493, r1490, r1420; +} +{ +add.f16x2 r1496, r1434, r1493; +} +{ +add.f16x2 r1499, r1437, r1438; +} +{ +mul.f16x2 r1502, r1499, r1424; +} +{ +add.f16x2 r1505, r1496, r1502; +} +{ +sub.f16x2 r1508, r1443, r1444; +} +{ +mul.f16x2 r1511, r1508, r1422; +} +{ +sub.f16x2 r1514, r1449, r1450; +} +{ +mul.f16x2 r1517, r1514, r1426; +} +{ +add.f16x2 r1520, r1511, r1517; +} +{ +add.f16x2 %8, r1505, r1520; +} +{ +add.f16x2 r1526, r1431, r1432; +} +{ +mul.f16x2 r1529, r1526, r1424; +} +{ +add.f16x2 r1532, r1434, r1529; +} +{ +add.f16x2 r1535, r1437, r1438; +} +{ +mul.f16x2 r1538, r1535, r1428; +} +{ +add.f16x2 r1541, r1532, r1538; +} +{ +sub.f16x2 r1544, r1443, r1444; +} +{ +mul.f16x2 r1547, r1544, r1426; +} +{ +sub.f16x2 r1550, r1449, r1450; +} +{ +mul.f16x2 r1553, r1550, r1429; +} +{ +add.f16x2 r1556, r1547, r1553; +} +{ +sub.f16x2 %4, r1541, r1556; +} +{ +add.f16x2 r1562, r1431, r1432; +} +{ +mul.f16x2 r1565, r1562, r1424; +} +{ +add.f16x2 r1568, r1434, r1565; +} +{ +add.f16x2 r1571, r1437, r1438; +} +{ +mul.f16x2 r1574, r1571, r1428; +} +{ +add.f16x2 r1577, r1568, r1574; +} +{ +sub.f16x2 r1580, r1443, r1444; +} +{ +mul.f16x2 r1583, r1580, r1426; +} +{ +sub.f16x2 r1586, r1449, r1450; +} +{ +mul.f16x2 r1589, r1586, r1429; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 %6, r1577, r1592; +} +{ +add.f16x2 r1598, r1443, r1444; +} +{ +mul.f16x2 r1601, r1598, r1420; +} +{ +add.f16x2 r1604, r1446, r1601; +} +{ +add.f16x2 r1607, r1449, r1450; +} +{ +mul.f16x2 r1610, r1607, r1424; +} +{ +add.f16x2 r1613, r1604, r1610; +} +{ +sub.f16x2 r1616, r1431, r1432; +} +{ +mul.f16x2 r1619, r1616, r1422; +} +{ +sub.f16x2 r1622, r1437, r1438; +} +{ +mul.f16x2 r1625, r1622, r1426; +} +{ +add.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 %3, r1613, r1628; +} +{ +add.f16x2 r1634, r1443, r1444; +} +{ +mul.f16x2 r1637, r1634, r1420; +} +{ +add.f16x2 r1640, r1446, r1637; +} +{ +add.f16x2 r1643, r1449, 
r1450; +} +{ +mul.f16x2 r1646, r1643, r1424; +} +{ +add.f16x2 r1649, r1640, r1646; +} +{ +sub.f16x2 r1652, r1431, r1432; +} +{ +mul.f16x2 r1655, r1652, r1422; +} +{ +sub.f16x2 r1658, r1437, r1438; +} +{ +mul.f16x2 r1661, r1658, r1426; +} +{ +add.f16x2 r1664, r1655, r1661; +} +{ +sub.f16x2 %9, r1649, r1664; +} +{ +add.f16x2 r1670, r1443, r1444; +} +{ +mul.f16x2 r1673, r1670, r1424; +} +{ +add.f16x2 r1676, r1446, r1673; +} +{ +add.f16x2 r1679, r1449, r1450; +} +{ +mul.f16x2 r1682, r1679, r1428; +} +{ +add.f16x2 r1685, r1676, r1682; +} +{ +sub.f16x2 r1688, r1431, r1432; +} +{ +mul.f16x2 r1691, r1688, r1426; +} +{ +sub.f16x2 r1694, r1437, r1438; +} +{ +mul.f16x2 r1697, r1694, r1429; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +add.f16x2 %5, r1685, r1700; +} +{ +add.f16x2 r1706, r1443, r1444; +} +{ +mul.f16x2 r1709, r1706, r1424; +} +{ +add.f16x2 r1712, r1446, r1709; +} +{ +add.f16x2 r1715, r1449, r1450; +} +{ +mul.f16x2 r1718, r1715, r1428; +} +{ +add.f16x2 r1721, r1712, r1718; +} +{ +sub.f16x2 r1724, r1431, r1432; +} +{ +mul.f16x2 r1727, r1724, r1426; +} +{ +sub.f16x2 r1730, r1437, r1438; +} +{ +mul.f16x2 r1733, r1730, r1429; +} +{ +add.f16x2 r1736, r1727, r1733; +} +{ +sub.f16x2 %7, r1721, r1736; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..449d2802393bf --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_fwd.hpp.inc @@ -0,0 +1,3546 @@ +#ifndef CUFFTDX_FFT_625_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_625_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<163, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1788>; +.reg .b32 r<14>; +.reg .b64 rd<10>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 5000, r13; +add.f32 f101, %62, %92; +add.f32 f103, %72, %82; +add.f32 f1787, %52, f101; +add.f32 f104, f103, f1787; +add.f32 f105, %102, %104; +add.f32 f107, %103, %83; +add.f32 f1783, %53, f105; +add.f32 f108, f107, f1783; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f1782, f101, 0f3E9E377A, %52; +sub.f32 f111, f1782, f110; +sub.f32 f112, %102, %104; +sub.f32 f114, %103, %83; +mul.f32 f1780, f112, 0f3F737871; +mul.f32 f1781, f114, 0fBF167918; +sub.f32 f116, f1781, f1780; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %52, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f1778, f105, 0f3E9E377A, %53; +mul.f32 f1779, f107, 0f3F4F1BBD; +sub.f32 f129, f1778, f1779; +sub.f32 f130, %62, %92; +sub.f32 f132, %72, %82; +mul.f32 f1776, f130, 0f3F737871; +mul.f32 f1777, f132, 0fBF167918; +sub.f32 f134, f1777, f1776; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %53, f137; +fma.rn.f32 
f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %64, %94; +add.f32 f147, %74, %84; +add.f32 f1775, %54, f145; +add.f32 f148, f147, f1775; +add.f32 f149, %65, %95; +add.f32 f151, %107, %105; +add.f32 f1771, %106, f149; +add.f32 f152, f151, f1771; +fma.rn.f32 f1769, f145, 0f3E9E377A, %54; +mul.f32 f1770, f147, 0f3F4F1BBD; +sub.f32 f155, f1769, f1770; +sub.f32 f156, %65, %95; +sub.f32 f158, %107, %105; +mul.f32 f1767, f156, 0f3F737871; +mul.f32 f1768, f158, 0fBF167918; +sub.f32 f160, f1768, f1767; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %54, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +mul.f32 f172, f151, 0f3F4F1BBD; +fma.rn.f32 f1766, f149, 0f3E9E377A, %106; +sub.f32 f173, f1766, f172; +sub.f32 f174, %64, %94; +sub.f32 f176, %74, %84; +mul.f32 f177, f176, 0fBF167918; +mul.f32 f1765, f174, 0f3F737871; +sub.f32 f178, f177, f1765; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; +mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %106, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %66, %96; +add.f32 f191, %76, %86; +add.f32 f1764, %56, f189; +add.f32 f192, f191, f1764; +add.f32 f193, %110, %109; +add.f32 f195, %77, %111; +add.f32 f1759, %108, f193; +add.f32 f196, f195, f1759; +mul.f32 f198, f191, 0f3F4F1BBD; +fma.rn.f32 f1758, f189, 0f3E9E377A, %56; +sub.f32 f199, f1758, f198; +sub.f32 f200, %110, %109; +sub.f32 f202, %77, %111; +mul.f32 f203, f202, 0fBF167918; +mul.f32 f1757, f200, 0f3F737871; +sub.f32 f204, f203, f1757; +sub.f32 f205, f199, f204; +add.f32 
f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %56, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f1755, f193, 0f3E9E377A, %108; +mul.f32 f1756, f195, 0f3F4F1BBD; +sub.f32 f217, f1755, f1756; +sub.f32 f218, %66, %96; +sub.f32 f220, %76, %86; +mul.f32 f1753, f218, 0f3F737871; +mul.f32 f1754, f220, 0fBF167918; +sub.f32 f222, f1754, f1753; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %108, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %68, %98; +add.f32 f235, %78, %88; +add.f32 f1752, %58, f233; +add.f32 f236, f235, f1752; +add.f32 f237, %113, %112; +add.f32 f239, %114, %89; +add.f32 f1748, %59, f237; +add.f32 f240, f239, f1748; +fma.rn.f32 f1746, f233, 0f3E9E377A, %58; +mul.f32 f1747, f235, 0f3F4F1BBD; +sub.f32 f243, f1746, f1747; +sub.f32 f244, %113, %112; +sub.f32 f246, %114, %89; +mul.f32 f1744, f244, 0f3F737871; +mul.f32 f1745, f246, 0fBF167918; +sub.f32 f248, f1745, f1744; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %58, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +mul.f32 f260, f239, 0f3F4F1BBD; +fma.rn.f32 f1743, f237, 0f3E9E377A, %59; +sub.f32 f261, f1743, f260; +sub.f32 f262, %68, %98; +sub.f32 f264, %78, %88; +mul.f32 f1741, f262, 0f3F737871; +mul.f32 f1742, f264, 0fBF167918; +sub.f32 f266, f1742, f1741; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %59, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; 
+mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %70, %100; +add.f32 f279, %80, %90; +add.f32 f1740, %60, f277; +add.f32 f280, f279, f1740; +add.f32 f281, %71, %101; +add.f32 f283, %117, %115; +add.f32 f1736, %116, f281; +add.f32 f284, f283, f1736; +mul.f32 f286, f279, 0f3F4F1BBD; +fma.rn.f32 f1735, f277, 0f3E9E377A, %60; +sub.f32 f287, f1735, f286; +sub.f32 f288, %71, %101; +sub.f32 f290, %117, %115; +mul.f32 f1733, f288, 0f3F737871; +mul.f32 f1734, f290, 0fBF167918; +sub.f32 f292, f1734, f1733; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %60, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +mul.f32 f304, f283, 0f3F4F1BBD; +fma.rn.f32 f1732, f281, 0f3E9E377A, %116; +sub.f32 f305, f1732, f304; +sub.f32 f306, %70, %100; +sub.f32 f308, %80, %90; +mul.f32 f1730, f306, 0f3F737871; +mul.f32 f1731, f308, 0fBF167918; +sub.f32 f310, f1731, f1730; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %116, f313; +fma.rn.f32 f315, f283, 0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mul.f32 f322, f179, 0fBE7EA890; +mul.f32 f1729, f161, 0f3F77F511; +sub.f32 f323, f1729, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f327, f223, 0fBEF6A86B; +mul.f32 f1728, f205, 0f3F6055A2; +sub.f32 f328, f1728, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f332, f267, 0fBF2F3E7B; +mul.f32 f1727, f249, 0f3F3A9DB0; +sub.f32 f333, f1727, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f337, 
f311, 0fBF5825E0; +mul.f32 f1726, f293, 0f3F092BF2; +sub.f32 f338, f1726, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f342, f187, 0fBEF6A86B; +mul.f32 f1725, f169, 0f3F6055A2; +sub.f32 f343, f1725, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f1723, f213, 0f3F092BF2; +mul.f32 f1724, f231, 0fBF5825E0; +sub.f32 f348, f1723, f1724; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f1721, f257, 0f3D809851; +mul.f32 f1722, f275, 0fBF7F7EAE; +sub.f32 f353, f1721, f1722; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f1719, f301, 0fBED9FFBE; +mul.f32 f1720, f319, 0fBF67A2BF; +sub.f32 f358, f1719, f1720; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f1717, f170, 0f3F3A9DB0; +mul.f32 f1718, f188, 0fBF2F3E7B; +sub.f32 f363, f1717, f1718; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f367, f232, 0fBF7F7EAE; +mul.f32 f1716, f214, 0f3D809851; +sub.f32 f368, f1716, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f372, f276, 0fBF45405B; +mul.f32 f1715, f258, 0fBF232E38; +sub.f32 f373, f1715, f372; +mul.f32 f374, f276, 0fBF232E38; +fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f377, f320, 0fBE00575B; +mul.f32 f1714, f302, 0fBF7DFB3B; +sub.f32 f378, f1714, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f382, f180, 0fBF5825E0; +mul.f32 f1713, f162, 0f3F092BF2; +sub.f32 f383, f1713, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f387, f224, 0fBF67A2BF; +mul.f32 f1712, f206, 0fBED9FFBE; +sub.f32 f388, f1712, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f1710, f250, 0fBF7DFB3B; +mul.f32 f1711, f268, 0fBE00575B; +sub.f32 f393, f1710, f1711; +mul.f32 f394, f268, 0fBF7DFB3B; 
+fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f1708, f294, 0fBF232E38; +mul.f32 f1709, f312, 0f3F45405B; +sub.f32 f398, f1708, f1709; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f403, f192, f236; +mul.f32 f408, f403, 0f3F4F1BBD; +fma.rn.f32 f1707, f401, 0f3E9E377A, f104; +sub.f32 f409, f1707, f408; +add.f32 f1706, f152, f284; +sub.f32 f410, f152, f284; +add.f32 f1705, f196, f240; +sub.f32 f412, f196, f240; +mul.f32 f413, f412, 0fBF167918; +mul.f32 f1704, f410, 0f3F737871; +sub.f32 f414, f413, f1704; +sub.f32 f415, f409, f414; +add.f32 f416, f414, f409; +add.f32 f1703, f104, f401; +mul.f32 f417, f401, 0f3F4F1BBD; +sub.f32 f418, f104, f417; +fma.rn.f32 f419, f403, 0f3E9E377A, f418; +mul.f32 f420, f410, 0f3F167918; +mul.f32 f421, f412, 0f3F737871; +sub.f32 f422, f421, f420; +sub.f32 f423, f419, f422; +add.f32 f424, f422, f419; +fma.rn.f32 f1701, f1706, 0f3E9E377A, f108; +mul.f32 f1702, f1705, 0f3F4F1BBD; +sub.f32 f427, f1701, f1702; +sub.f32 f428, f148, f280; +sub.f32 f430, f192, f236; +mul.f32 f1699, f428, 0f3F737871; +mul.f32 f1700, f430, 0fBF167918; +sub.f32 f432, f1700, f1699; +add.f32 f433, f432, f427; +sub.f32 f434, f427, f432; +add.f32 f1698, f108, f1706; +mul.f32 f435, f1706, 0f3F4F1BBD; +sub.f32 f436, f108, f435; +fma.rn.f32 f437, f1705, 0f3E9E377A, f436; +mul.f32 f438, f428, 0f3F167918; +mul.f32 f439, f430, 0f3F737871; +sub.f32 f440, f439, f438; +add.f32 f441, f440, f437; +sub.f32 f442, f437, f440; +add.f32 f443, f323, f338; +add.f32 f445, f328, f333; +add.f32 f1697, f117, f443; +add.f32 f446, f445, f1697; +add.f32 f447, f325, f340; +add.f32 f449, f330, f335; +add.f32 f1696, f135, f447; +add.f32 f450, f449, f1696; +fma.rn.f32 f1694, f443, 0f3E9E377A, f117; +mul.f32 f1695, f445, 0f3F4F1BBD; +sub.f32 f453, f1694, f1695; +sub.f32 f454, f325, f340; +sub.f32 f456, f330, f335; +mul.f32 f1692, f454, 0f3F737871; +mul.f32 f1693, f456, 0fBF167918; +sub.f32 f458, f1693, f1692; +sub.f32 f459, f453, 
f458; +add.f32 f460, f458, f453; +mul.f32 f461, f443, 0f3F4F1BBD; +sub.f32 f462, f117, f461; +fma.rn.f32 f463, f445, 0f3E9E377A, f462; +mul.f32 f464, f454, 0f3F167918; +mul.f32 f465, f456, 0f3F737871; +sub.f32 f466, f465, f464; +sub.f32 f467, f463, f466; +add.f32 f468, f466, f463; +mul.f32 f470, f449, 0f3F4F1BBD; +fma.rn.f32 f1691, f447, 0f3E9E377A, f135; +sub.f32 f471, f1691, f470; +sub.f32 f472, f323, f338; +sub.f32 f474, f328, f333; +mul.f32 f1689, f472, 0f3F737871; +mul.f32 f1690, f474, 0fBF167918; +sub.f32 f476, f1690, f1689; +add.f32 f477, f476, f471; +sub.f32 f478, f471, f476; +mul.f32 f479, f447, 0f3F4F1BBD; +sub.f32 f480, f135, f479; +fma.rn.f32 f481, f449, 0f3E9E377A, f480; +mul.f32 f482, f472, 0f3F167918; +mul.f32 f483, f474, 0f3F737871; +sub.f32 f484, f483, f482; +add.f32 f485, f484, f481; +sub.f32 f486, f481, f484; +add.f32 f487, f343, f358; +add.f32 f489, f348, f353; +add.f32 f1688, f125, f487; +add.f32 f490, f489, f1688; +add.f32 f491, f345, f360; +add.f32 f493, f350, f355; +add.f32 f1687, f143, f491; +add.f32 f494, f493, f1687; +mul.f32 f496, f489, 0f3F4F1BBD; +fma.rn.f32 f1686, f487, 0f3E9E377A, f125; +sub.f32 f497, f1686, f496; +sub.f32 f498, f345, f360; +sub.f32 f500, f350, f355; +mul.f32 f1684, f498, 0f3F737871; +mul.f32 f1685, f500, 0fBF167918; +sub.f32 f502, f1685, f1684; +sub.f32 f503, f497, f502; +add.f32 f504, f502, f497; +mul.f32 f505, f487, 0f3F4F1BBD; +sub.f32 f506, f125, f505; +fma.rn.f32 f507, f489, 0f3E9E377A, f506; +mul.f32 f508, f498, 0f3F167918; +mul.f32 f509, f500, 0f3F737871; +sub.f32 f510, f509, f508; +sub.f32 f511, f507, f510; +add.f32 f512, f510, f507; +mul.f32 f514, f493, 0f3F4F1BBD; +fma.rn.f32 f1683, f491, 0f3E9E377A, f143; +sub.f32 f515, f1683, f514; +sub.f32 f516, f343, f358; +sub.f32 f518, f348, f353; +mul.f32 f1681, f516, 0f3F737871; +mul.f32 f1682, f518, 0fBF167918; +sub.f32 f520, f1682, f1681; +add.f32 f521, f520, f515; +sub.f32 f522, f515, f520; +mul.f32 f523, f491, 0f3F4F1BBD; +sub.f32 f524, f143, f523; +fma.rn.f32 
f525, f493, 0f3E9E377A, f524; +mul.f32 f526, f516, 0f3F167918; +mul.f32 f527, f518, 0f3F737871; +sub.f32 f528, f527, f526; +add.f32 f529, f528, f525; +sub.f32 f530, f525, f528; +add.f32 f531, f363, f378; +add.f32 f533, f368, f373; +add.f32 f1680, f126, f531; +add.f32 f534, f533, f1680; +add.f32 f535, f365, f380; +add.f32 f537, f370, f375; +add.f32 f1679, f144, f535; +add.f32 f538, f537, f1679; +mul.f32 f540, f533, 0f3F4F1BBD; +fma.rn.f32 f1678, f531, 0f3E9E377A, f126; +sub.f32 f541, f1678, f540; +sub.f32 f542, f365, f380; +sub.f32 f544, f370, f375; +mul.f32 f1676, f542, 0f3F737871; +mul.f32 f1677, f544, 0fBF167918; +sub.f32 f546, f1677, f1676; +sub.f32 f547, f541, f546; +add.f32 f548, f546, f541; +mul.f32 f549, f531, 0f3F4F1BBD; +sub.f32 f550, f126, f549; +fma.rn.f32 f551, f533, 0f3E9E377A, f550; +mul.f32 f552, f542, 0f3F167918; +mul.f32 f553, f544, 0f3F737871; +sub.f32 f554, f553, f552; +sub.f32 f555, f551, f554; +add.f32 f556, f554, f551; +fma.rn.f32 f1674, f535, 0f3E9E377A, f144; +mul.f32 f1675, f537, 0f3F4F1BBD; +sub.f32 f559, f1674, f1675; +sub.f32 f560, f363, f378; +sub.f32 f562, f368, f373; +mul.f32 f1672, f560, 0f3F737871; +mul.f32 f1673, f562, 0fBF167918; +sub.f32 f564, f1673, f1672; +add.f32 f565, f564, f559; +sub.f32 f566, f559, f564; +mul.f32 f567, f535, 0f3F4F1BBD; +sub.f32 f568, f144, f567; +fma.rn.f32 f569, f537, 0f3E9E377A, f568; +mul.f32 f570, f560, 0f3F167918; +mul.f32 f571, f562, 0f3F737871; +sub.f32 f572, f571, f570; +add.f32 f573, f572, f569; +sub.f32 f574, f569, f572; +add.f32 f575, f383, f398; +add.f32 f577, f388, f393; +add.f32 f1671, f118, f575; +add.f32 f578, f577, f1671; +add.f32 f579, f385, f400; +add.f32 f581, f390, f395; +add.f32 f1670, f136, f579; +add.f32 f582, f581, f1670; +fma.rn.f32 f1668, f575, 0f3E9E377A, f118; +mul.f32 f1669, f577, 0f3F4F1BBD; +sub.f32 f585, f1668, f1669; +sub.f32 f586, f385, f400; +sub.f32 f588, f390, f395; +mul.f32 f1666, f586, 0f3F737871; +mul.f32 f1667, f588, 0fBF167918; +sub.f32 f590, f1667, f1666; 
+sub.f32 f591, f585, f590; +add.f32 f592, f590, f585; +mul.f32 f593, f575, 0f3F4F1BBD; +sub.f32 f594, f118, f593; +fma.rn.f32 f595, f577, 0f3E9E377A, f594; +mul.f32 f596, f586, 0f3F167918; +mul.f32 f597, f588, 0f3F737871; +sub.f32 f598, f597, f596; +sub.f32 f599, f595, f598; +add.f32 f600, f598, f595; +mul.f32 f602, f581, 0f3F4F1BBD; +fma.rn.f32 f1665, f579, 0f3E9E377A, f136; +sub.f32 f603, f1665, f602; +sub.f32 f604, f383, f398; +sub.f32 f606, f388, f393; +mul.f32 f1663, f604, 0f3F737871; +mul.f32 f1664, f606, 0fBF167918; +sub.f32 f608, f1664, f1663; +add.f32 f609, f608, f603; +sub.f32 f610, f603, f608; +mul.f32 f611, f579, 0f3F4F1BBD; +sub.f32 f612, f136, f611; +fma.rn.f32 f613, f581, 0f3E9E377A, f612; +mul.f32 f614, f604, 0f3F167918; +mul.f32 f615, f606, 0f3F737871; +sub.f32 f616, f615, f614; +add.f32 f617, f616, f613; +sub.f32 f618, f613, f616; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 5000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f619, f620}, [rd6]; +mul.f32 f624, f620, f450; +mul.f32 f625, f619, f450; +mul.f32 f627, f620, f620; +mul.f32 f1662, f619, f619; +sub.f32 f628, f1662, f627; +mul.f32 f629, f620, f619; +fma.rn.f32 f630, f620, f619, f629; +mul.f32 f632, f630, f494; +mul.f32 f633, f628, f494; +mul.f32 f1660, f619, f628; +mul.f32 f1661, f620, f630; +sub.f32 f636, f1660, f1661; +mul.f32 f1659, f628, f490; +mul.f32 f637, f619, f630; +fma.rn.f32 f638, f620, f628, f637; +mul.f32 f640, f638, f538; +mul.f32 f641, f636, f538; +mul.f32 f643, f620, f638; +mul.f32 f1658, f619, f636; +sub.f32 f644, f1658, f643; +mul.f32 f1657, f636, f534; +mul.f32 f645, f619, f638; +fma.rn.f32 f646, f620, f636, f645; +mul.f32 f648, f646, f582; +mul.f32 f649, f644, f582; +mul.f32 f1655, f619, f644; +mul.f32 f1656, f620, f646; +sub.f32 f652, f1655, f1656; +mul.f32 f1654, f644, f578; +mul.f32 f653, f619, f646; 
+fma.rn.f32 f654, f620, f644, f653; +mul.f32 f656, f654, f433; +mul.f32 f657, f652, f433; +mul.f32 f659, f620, f654; +mul.f32 f1653, f619, f652; +sub.f32 f660, f1653, f659; +mul.f32 f1652, f652, f415; +mul.f32 f661, f619, f654; +fma.rn.f32 f662, f620, f652, f661; +mul.f32 f664, f662, f477; +mul.f32 f665, f660, f477; +mul.f32 f667, f620, f662; +mul.f32 f1651, f619, f660; +sub.f32 f668, f1651, f667; +mul.f32 f1650, f660, f459; +mul.f32 f669, f619, f662; +fma.rn.f32 f670, f620, f660, f669; +mul.f32 f672, f670, f521; +mul.f32 f673, f668, f521; +mul.f32 f1648, f619, f668; +mul.f32 f1649, f620, f670; +sub.f32 f676, f1648, f1649; +mul.f32 f1647, f668, f503; +mul.f32 f677, f619, f670; +fma.rn.f32 f678, f620, f668, f677; +mul.f32 f680, f678, f565; +mul.f32 f681, f676, f565; +mul.f32 f683, f620, f678; +mul.f32 f1646, f619, f676; +sub.f32 f684, f1646, f683; +mul.f32 f1645, f676, f547; +mul.f32 f685, f619, f678; +fma.rn.f32 f686, f620, f676, f685; +mul.f32 f688, f686, f609; +mul.f32 f689, f684, f609; +mul.f32 f691, f620, f686; +mul.f32 f1644, f619, f684; +sub.f32 f692, f1644, f691; +mul.f32 f1643, f684, f591; +mul.f32 f693, f619, f686; +fma.rn.f32 f694, f620, f684, f693; +mul.f32 f696, f694, f441; +mul.f32 f697, f692, f441; +mul.f32 f1641, f619, f692; +mul.f32 f1642, f620, f694; +sub.f32 f700, f1641, f1642; +mul.f32 f1640, f692, f423; +mul.f32 f701, f619, f694; +fma.rn.f32 f702, f620, f692, f701; +mul.f32 f704, f702, f485; +mul.f32 f705, f700, f485; +mul.f32 f707, f620, f702; +mul.f32 f1639, f619, f700; +sub.f32 f708, f1639, f707; +mul.f32 f1638, f700, f467; +mul.f32 f709, f619, f702; +fma.rn.f32 f710, f620, f700, f709; +mul.f32 f712, f710, f529; +mul.f32 f713, f708, f529; +mul.f32 f1636, f619, f708; +mul.f32 f1637, f620, f710; +sub.f32 f716, f1636, f1637; +mul.f32 f1635, f708, f511; +mul.f32 f717, f619, f710; +fma.rn.f32 f718, f620, f708, f717; +mul.f32 f720, f718, f573; +mul.f32 f721, f716, f573; +mul.f32 f723, f620, f718; +mul.f32 f1634, f619, f716; +sub.f32 f724, f1634, 
f723; +mul.f32 f1633, f716, f555; +mul.f32 f725, f619, f718; +fma.rn.f32 f726, f620, f716, f725; +mul.f32 f728, f726, f617; +mul.f32 f729, f724, f617; +mul.f32 f731, f620, f726; +mul.f32 f1632, f619, f724; +sub.f32 f732, f1632, f731; +mul.f32 f1631, f724, f599; +mul.f32 f733, f619, f726; +fma.rn.f32 f734, f620, f724, f733; +mul.f32 f736, f734, f442; +mul.f32 f737, f732, f442; +mul.f32 f1629, f619, f732; +mul.f32 f1630, f620, f734; +sub.f32 f740, f1629, f1630; +mul.f32 f1628, f732, f424; +mul.f32 f741, f619, f734; +fma.rn.f32 f742, f620, f732, f741; +mul.f32 f744, f742, f486; +mul.f32 f745, f740, f486; +mul.f32 f747, f620, f742; +mul.f32 f1627, f619, f740; +sub.f32 f748, f1627, f747; +mul.f32 f1626, f740, f468; +mul.f32 f749, f619, f742; +fma.rn.f32 f750, f620, f740, f749; +mul.f32 f752, f750, f530; +mul.f32 f753, f748, f530; +mul.f32 f755, f620, f750; +mul.f32 f1625, f619, f748; +sub.f32 f756, f1625, f755; +mul.f32 f1624, f748, f512; +mul.f32 f757, f619, f750; +fma.rn.f32 f758, f620, f748, f757; +mul.f32 f760, f758, f574; +mul.f32 f761, f756, f574; +mul.f32 f1622, f619, f756; +mul.f32 f1623, f620, f758; +sub.f32 f764, f1622, f1623; +mul.f32 f1621, f756, f556; +mul.f32 f765, f619, f758; +fma.rn.f32 f766, f620, f756, f765; +mul.f32 f768, f766, f618; +mul.f32 f769, f764, f618; +mul.f32 f771, f620, f766; +mul.f32 f1620, f619, f764; +sub.f32 f772, f1620, f771; +mul.f32 f1619, f764, f600; +mul.f32 f773, f619, f766; +fma.rn.f32 f774, f620, f764, f773; +mul.f32 f776, f774, f434; +mul.f32 f777, f772, f434; +mul.f32 f1617, f619, f772; +mul.f32 f1618, f620, f774; +sub.f32 f780, f1617, f1618; +mul.f32 f1616, f772, f416; +mul.f32 f781, f619, f774; +fma.rn.f32 f782, f620, f772, f781; +mul.f32 f784, f782, f478; +mul.f32 f785, f780, f478; +mul.f32 f787, f620, f782; +mul.f32 f1615, f619, f780; +sub.f32 f788, f1615, f787; +mul.f32 f1614, f780, f460; +mul.f32 f789, f619, f782; +fma.rn.f32 f790, f620, f780, f789; +mul.f32 f792, f790, f522; +mul.f32 f793, f788, f522; +mul.f32 f795, 
f620, f790; +mul.f32 f1613, f619, f788; +sub.f32 f796, f1613, f795; +mul.f32 f1612, f788, f504; +mul.f32 f797, f619, f790; +fma.rn.f32 f798, f620, f788, f797; +mul.f32 f800, f798, f566; +mul.f32 f801, f796, f566; +mul.f32 f1610, f619, f796; +mul.f32 f1611, f620, f798; +sub.f32 f804, f1610, f1611; +mul.f32 f1609, f619, f446; +mul.f32 f805, f619, f798; +mul.f32 f1608, f796, f548; +fma.rn.f32 f806, f620, f796, f805; +mul.f32 f807, f804, f592; +mul.f32 f808, f806, f610; +mul.f32 f809, f804, f610; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +add.f32 f810, f1705, f1698; +add.f32 f811, f403, f1703; +st.shared.v2.f32 [r9], {f811, f810}; +fma.rn.f32 f812, f620, f446, f625; +sub.f32 f813, f1609, f624; +st.shared.v2.f32 [r9+8], {f813, f812}; +fma.rn.f32 f814, f630, f490, f633; +sub.f32 f815, f1659, f632; +st.shared.v2.f32 [r9+16], {f815, f814}; +fma.rn.f32 f816, f638, f534, f641; +sub.f32 f817, f1657, f640; +st.shared.v2.f32 [r9+24], {f817, f816}; +fma.rn.f32 f818, f646, f578, f649; +sub.f32 f819, f1654, f648; +st.shared.v2.f32 [r9+32], {f819, f818}; +sub.f32 f820, f1652, f656; +fma.rn.f32 f821, f654, f415, f657; +st.shared.v2.f32 [r9+40], {f820, f821}; +fma.rn.f32 f822, f662, f459, f665; +sub.f32 f823, f1650, f664; +st.shared.v2.f32 [r9+48], {f823, f822}; +sub.f32 f824, f1647, f672; +fma.rn.f32 f825, f670, f503, f673; +st.shared.v2.f32 [r9+56], {f824, f825}; +fma.rn.f32 f826, f678, f547, f681; +sub.f32 f827, f1645, f680; +st.shared.v2.f32 [r9+64], {f827, f826}; +fma.rn.f32 f828, f686, f591, f689; +sub.f32 f829, f1643, f688; +st.shared.v2.f32 [r9+72], {f829, f828}; +fma.rn.f32 f830, f694, f423, f697; +sub.f32 f831, f1640, f696; +st.shared.v2.f32 [r9+80], {f831, f830}; +fma.rn.f32 f832, f702, f467, f705; +sub.f32 f833, f1638, f704; +st.shared.v2.f32 [r9+88], {f833, f832}; +fma.rn.f32 f834, f710, f511, f713; +sub.f32 f835, f1635, f712; +st.shared.v2.f32 [r9+96], {f835, f834}; +fma.rn.f32 f836, f718, f555, f721; +sub.f32 f837, f1633, f720; +st.shared.v2.f32 [r9+104], {f837, 
f836}; +fma.rn.f32 f838, f726, f599, f729; +sub.f32 f839, f1631, f728; +st.shared.v2.f32 [r9+112], {f839, f838}; +fma.rn.f32 f840, f734, f424, f737; +sub.f32 f841, f1628, f736; +st.shared.v2.f32 [r9+120], {f841, f840}; +fma.rn.f32 f842, f742, f468, f745; +sub.f32 f843, f1626, f744; +st.shared.v2.f32 [r9+128], {f843, f842}; +fma.rn.f32 f844, f750, f512, f753; +sub.f32 f845, f1624, f752; +st.shared.v2.f32 [r9+136], {f845, f844}; +fma.rn.f32 f846, f758, f556, f761; +sub.f32 f847, f1621, f760; +st.shared.v2.f32 [r9+144], {f847, f846}; +fma.rn.f32 f848, f766, f600, f769; +sub.f32 f849, f1619, f768; +st.shared.v2.f32 [r9+152], {f849, f848}; +fma.rn.f32 f850, f774, f416, f777; +sub.f32 f851, f1616, f776; +st.shared.v2.f32 [r9+160], {f851, f850}; +fma.rn.f32 f852, f782, f460, f785; +sub.f32 f853, f1614, f784; +st.shared.v2.f32 [r9+168], {f853, f852}; +fma.rn.f32 f854, f790, f504, f793; +sub.f32 f855, f1612, f792; +st.shared.v2.f32 [r9+176], {f855, f854}; +fma.rn.f32 f856, f798, f548, f801; +sub.f32 f857, f1608, f800; +st.shared.v2.f32 [r9+184], {f857, f856}; +fma.rn.f32 f858, f806, f592, f809; +sub.f32 f859, f807, f808; +st.shared.v2.f32 [r9+192], {f859, f858}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.v2.f32 {f860, f861}, [r10]; +ld.shared.v2.f32 {f864, f865}, [r10+200]; +ld.shared.v2.f32 {f868, f869}, [r10+400]; +ld.shared.v2.f32 {f872, f873}, [r10+600]; +ld.shared.v2.f32 {f876, f877}, [r10+800]; +ld.shared.v2.f32 {f880, f881}, [r10+1000]; +ld.shared.v2.f32 {f884, f885}, [r10+1200]; +ld.shared.v2.f32 {f888, f889}, [r10+1400]; +ld.shared.v2.f32 {f892, f893}, [r10+1600]; +ld.shared.v2.f32 {f896, f897}, [r10+1800]; +ld.shared.v2.f32 {f900, f901}, [r10+2000]; +ld.shared.v2.f32 {f904, f905}, [r10+2200]; +ld.shared.v2.f32 {f908, f909}, [r10+2400]; +ld.shared.v2.f32 {f912, f913}, [r10+2600]; +ld.shared.v2.f32 {f916, f917}, [r10+2800]; +ld.shared.v2.f32 {f920, f921}, [r10+3000]; +ld.shared.v2.f32 {f924, f925}, [r10+3200]; +ld.shared.v2.f32 {f928, f929}, 
[r10+3400]; +ld.shared.v2.f32 {f932, f933}, [r10+3600]; +ld.shared.v2.f32 {f936, f937}, [r10+3800]; +ld.shared.v2.f32 {f940, f941}, [r10+4000]; +ld.shared.v2.f32 {f944, f945}, [r10+4200]; +ld.shared.v2.f32 {f948, f949}, [r10+4400]; +ld.shared.v2.f32 {f952, f953}, [r10+4600]; +ld.shared.v2.f32 {f956, f957}, [r10+4800]; +add.f32 f960, f880, f940; +add.f32 f962, f900, f920; +add.f32 f1607, f860, f960; +add.f32 f963, f962, f1607; +add.f32 f964, f881, f941; +add.f32 f966, f901, f921; +add.f32 f1606, f861, f964; +add.f32 f967, f966, f1606; +fma.rn.f32 f1604, f960, 0f3E9E377A, f860; +mul.f32 f1605, f962, 0f3F4F1BBD; +sub.f32 f970, f1604, f1605; +sub.f32 f971, f881, f941; +sub.f32 f973, f901, f921; +mul.f32 f1602, f971, 0f3F737871; +mul.f32 f1603, f973, 0fBF167918; +sub.f32 f975, f1603, f1602; +sub.f32 f976, f970, f975; +add.f32 f977, f975, f970; +mul.f32 f978, f960, 0f3F4F1BBD; +sub.f32 f979, f860, f978; +fma.rn.f32 f980, f962, 0f3E9E377A, f979; +mul.f32 f981, f971, 0f3F167918; +mul.f32 f982, f973, 0f3F737871; +sub.f32 f983, f982, f981; +sub.f32 f984, f980, f983; +add.f32 f985, f983, f980; +mul.f32 f987, f966, 0f3F4F1BBD; +fma.rn.f32 f1601, f964, 0f3E9E377A, f861; +sub.f32 f988, f1601, f987; +sub.f32 f989, f880, f940; +sub.f32 f991, f900, f920; +mul.f32 f1599, f989, 0f3F737871; +mul.f32 f1600, f991, 0fBF167918; +sub.f32 f993, f1600, f1599; +add.f32 f994, f993, f988; +sub.f32 f995, f988, f993; +mul.f32 f996, f964, 0f3F4F1BBD; +sub.f32 f997, f861, f996; +fma.rn.f32 f998, f966, 0f3E9E377A, f997; +mul.f32 f999, f989, 0f3F167918; +mul.f32 f1000, f991, 0f3F737871; +sub.f32 f1001, f1000, f999; +add.f32 f1002, f1001, f998; +sub.f32 f1003, f998, f1001; +add.f32 f1004, f884, f944; +add.f32 f1006, f904, f924; +add.f32 f1598, f864, f1004; +add.f32 f1007, f1006, f1598; +add.f32 f1008, f885, f945; +add.f32 f1010, f905, f925; +add.f32 f1597, f865, f1008; +add.f32 f1011, f1010, f1597; +mul.f32 f1013, f1006, 0f3F4F1BBD; +fma.rn.f32 f1596, f1004, 0f3E9E377A, f864; +sub.f32 f1014, f1596, 
f1013; +sub.f32 f1015, f885, f945; +sub.f32 f1017, f905, f925; +mul.f32 f1594, f1015, 0f3F737871; +mul.f32 f1595, f1017, 0fBF167918; +sub.f32 f1019, f1595, f1594; +sub.f32 f1020, f1014, f1019; +add.f32 f1021, f1019, f1014; +mul.f32 f1022, f1004, 0f3F4F1BBD; +sub.f32 f1023, f864, f1022; +fma.rn.f32 f1024, f1006, 0f3E9E377A, f1023; +mul.f32 f1025, f1015, 0f3F167918; +mul.f32 f1026, f1017, 0f3F737871; +sub.f32 f1027, f1026, f1025; +sub.f32 f1028, f1024, f1027; +add.f32 f1029, f1027, f1024; +fma.rn.f32 f1592, f1008, 0f3E9E377A, f865; +mul.f32 f1593, f1010, 0f3F4F1BBD; +sub.f32 f1032, f1592, f1593; +sub.f32 f1033, f884, f944; +sub.f32 f1035, f904, f924; +mul.f32 f1590, f1033, 0f3F737871; +mul.f32 f1591, f1035, 0fBF167918; +sub.f32 f1037, f1591, f1590; +add.f32 f1038, f1037, f1032; +sub.f32 f1039, f1032, f1037; +mul.f32 f1040, f1008, 0f3F4F1BBD; +sub.f32 f1041, f865, f1040; +fma.rn.f32 f1042, f1010, 0f3E9E377A, f1041; +mul.f32 f1043, f1033, 0f3F167918; +mul.f32 f1044, f1035, 0f3F737871; +sub.f32 f1045, f1044, f1043; +add.f32 f1046, f1045, f1042; +sub.f32 f1047, f1042, f1045; +add.f32 f1048, f888, f948; +add.f32 f1050, f908, f928; +add.f32 f1589, f868, f1048; +add.f32 f1051, f1050, f1589; +add.f32 f1052, f889, f949; +add.f32 f1054, f909, f929; +add.f32 f1588, f869, f1052; +add.f32 f1055, f1054, f1588; +fma.rn.f32 f1586, f1048, 0f3E9E377A, f868; +mul.f32 f1587, f1050, 0f3F4F1BBD; +sub.f32 f1058, f1586, f1587; +sub.f32 f1059, f889, f949; +sub.f32 f1061, f909, f929; +mul.f32 f1584, f1059, 0f3F737871; +mul.f32 f1585, f1061, 0fBF167918; +sub.f32 f1063, f1585, f1584; +sub.f32 f1064, f1058, f1063; +add.f32 f1065, f1063, f1058; +mul.f32 f1066, f1048, 0f3F4F1BBD; +sub.f32 f1067, f868, f1066; +fma.rn.f32 f1068, f1050, 0f3E9E377A, f1067; +mul.f32 f1069, f1059, 0f3F167918; +mul.f32 f1070, f1061, 0f3F737871; +sub.f32 f1071, f1070, f1069; +sub.f32 f1072, f1068, f1071; +add.f32 f1073, f1071, f1068; +mul.f32 f1075, f1054, 0f3F4F1BBD; +fma.rn.f32 f1583, f1052, 0f3E9E377A, f869; +sub.f32 
f1076, f1583, f1075; +sub.f32 f1077, f888, f948; +sub.f32 f1079, f908, f928; +mul.f32 f1080, f1079, 0fBF167918; +mul.f32 f1582, f1077, 0f3F737871; +sub.f32 f1081, f1080, f1582; +add.f32 f1082, f1081, f1076; +sub.f32 f1083, f1076, f1081; +mul.f32 f1084, f1052, 0f3F4F1BBD; +sub.f32 f1085, f869, f1084; +fma.rn.f32 f1086, f1054, 0f3E9E377A, f1085; +mul.f32 f1087, f1077, 0f3F167918; +mul.f32 f1088, f1079, 0f3F737871; +sub.f32 f1089, f1088, f1087; +add.f32 f1090, f1089, f1086; +sub.f32 f1091, f1086, f1089; +add.f32 f1092, f892, f952; +add.f32 f1094, f912, f932; +add.f32 f1581, f872, f1092; +add.f32 f1095, f1094, f1581; +add.f32 f1096, f893, f953; +add.f32 f1098, f913, f933; +add.f32 f1580, f873, f1096; +add.f32 f1099, f1098, f1580; +mul.f32 f1101, f1094, 0f3F4F1BBD; +fma.rn.f32 f1579, f1092, 0f3E9E377A, f872; +sub.f32 f1102, f1579, f1101; +sub.f32 f1103, f893, f953; +sub.f32 f1105, f913, f933; +mul.f32 f1106, f1105, 0fBF167918; +mul.f32 f1578, f1103, 0f3F737871; +sub.f32 f1107, f1106, f1578; +sub.f32 f1108, f1102, f1107; +add.f32 f1109, f1107, f1102; +mul.f32 f1110, f1092, 0f3F4F1BBD; +sub.f32 f1111, f872, f1110; +fma.rn.f32 f1112, f1094, 0f3E9E377A, f1111; +mul.f32 f1113, f1103, 0f3F167918; +mul.f32 f1114, f1105, 0f3F737871; +sub.f32 f1115, f1114, f1113; +sub.f32 f1116, f1112, f1115; +add.f32 f1117, f1115, f1112; +fma.rn.f32 f1576, f1096, 0f3E9E377A, f873; +mul.f32 f1577, f1098, 0f3F4F1BBD; +sub.f32 f1120, f1576, f1577; +sub.f32 f1121, f892, f952; +sub.f32 f1123, f912, f932; +mul.f32 f1574, f1121, 0f3F737871; +mul.f32 f1575, f1123, 0fBF167918; +sub.f32 f1125, f1575, f1574; +add.f32 f1126, f1125, f1120; +sub.f32 f1127, f1120, f1125; +mul.f32 f1128, f1096, 0f3F4F1BBD; +sub.f32 f1129, f873, f1128; +fma.rn.f32 f1130, f1098, 0f3E9E377A, f1129; +mul.f32 f1131, f1121, 0f3F167918; +mul.f32 f1132, f1123, 0f3F737871; +sub.f32 f1133, f1132, f1131; +add.f32 f1134, f1133, f1130; +sub.f32 f1135, f1130, f1133; +add.f32 f1136, f896, f956; +add.f32 f1138, f916, f936; +add.f32 f1573, 
f876, f1136; +add.f32 f1139, f1138, f1573; +add.f32 f1140, f897, f957; +add.f32 f1142, f917, f937; +add.f32 f1572, f877, f1140; +add.f32 f1143, f1142, f1572; +fma.rn.f32 f1570, f1136, 0f3E9E377A, f876; +mul.f32 f1571, f1138, 0f3F4F1BBD; +sub.f32 f1146, f1570, f1571; +sub.f32 f1147, f897, f957; +sub.f32 f1149, f917, f937; +mul.f32 f1568, f1147, 0f3F737871; +mul.f32 f1569, f1149, 0fBF167918; +sub.f32 f1151, f1569, f1568; +sub.f32 f1152, f1146, f1151; +add.f32 f1153, f1151, f1146; +mul.f32 f1154, f1136, 0f3F4F1BBD; +sub.f32 f1155, f876, f1154; +fma.rn.f32 f1156, f1138, 0f3E9E377A, f1155; +mul.f32 f1157, f1147, 0f3F167918; +mul.f32 f1158, f1149, 0f3F737871; +sub.f32 f1159, f1158, f1157; +sub.f32 f1160, f1156, f1159; +add.f32 f1161, f1159, f1156; +mul.f32 f1163, f1142, 0f3F4F1BBD; +fma.rn.f32 f1567, f1140, 0f3E9E377A, f877; +sub.f32 f1164, f1567, f1163; +sub.f32 f1165, f896, f956; +sub.f32 f1167, f916, f936; +mul.f32 f1565, f1165, 0f3F737871; +mul.f32 f1566, f1167, 0fBF167918; +sub.f32 f1169, f1566, f1565; +add.f32 f1170, f1169, f1164; +sub.f32 f1171, f1164, f1169; +mul.f32 f1172, f1140, 0f3F4F1BBD; +sub.f32 f1173, f877, f1172; +fma.rn.f32 f1174, f1142, 0f3E9E377A, f1173; +mul.f32 f1175, f1165, 0f3F167918; +mul.f32 f1176, f1167, 0f3F737871; +sub.f32 f1177, f1176, f1175; +add.f32 f1178, f1177, f1174; +sub.f32 f1179, f1174, f1177; +mul.f32 f1181, f1038, 0fBE7EA890; +mul.f32 f1564, f1020, 0f3F77F511; +sub.f32 f1182, f1564, f1181; +mul.f32 f1183, f1038, 0f3F77F511; +fma.rn.f32 f1184, f1020, 0fBE7EA890, f1183; +mul.f32 f1186, f1082, 0fBEF6A86B; +mul.f32 f1563, f1064, 0f3F6055A2; +sub.f32 f1187, f1563, f1186; +mul.f32 f1188, f1082, 0f3F6055A2; +fma.rn.f32 f1189, f1064, 0fBEF6A86B, f1188; +mul.f32 f1191, f1126, 0fBF2F3E7B; +mul.f32 f1562, f1108, 0f3F3A9DB0; +sub.f32 f1192, f1562, f1191; +mul.f32 f1193, f1126, 0f3F3A9DB0; +fma.rn.f32 f1194, f1108, 0fBF2F3E7B, f1193; +mul.f32 f1196, f1170, 0fBF5825E0; +mul.f32 f1561, f1152, 0f3F092BF2; +sub.f32 f1197, f1561, f1196; +mul.f32 
f1198, f1170, 0f3F092BF2; +fma.rn.f32 f1199, f1152, 0fBF5825E0, f1198; +mul.f32 f1201, f1046, 0fBEF6A86B; +mul.f32 f1560, f1028, 0f3F6055A2; +sub.f32 f1202, f1560, f1201; +mul.f32 f1203, f1046, 0f3F6055A2; +fma.rn.f32 f1204, f1028, 0fBEF6A86B, f1203; +mul.f32 f1558, f1072, 0f3F092BF2; +mul.f32 f1559, f1090, 0fBF5825E0; +sub.f32 f1207, f1558, f1559; +mul.f32 f1208, f1090, 0f3F092BF2; +fma.rn.f32 f1209, f1072, 0fBF5825E0, f1208; +mul.f32 f1556, f1116, 0f3D809851; +mul.f32 f1557, f1134, 0fBF7F7EAE; +sub.f32 f1212, f1556, f1557; +mul.f32 f1213, f1134, 0f3D809851; +fma.rn.f32 f1214, f1116, 0fBF7F7EAE, f1213; +mul.f32 f1554, f1160, 0fBED9FFBE; +mul.f32 f1555, f1178, 0fBF67A2BF; +sub.f32 f1217, f1554, f1555; +mul.f32 f1218, f1178, 0fBED9FFBE; +fma.rn.f32 f1219, f1160, 0fBF67A2BF, f1218; +mul.f32 f1221, f1047, 0fBF2F3E7B; +mul.f32 f1553, f1029, 0f3F3A9DB0; +sub.f32 f1222, f1553, f1221; +mul.f32 f1223, f1047, 0f3F3A9DB0; +fma.rn.f32 f1224, f1029, 0fBF2F3E7B, f1223; +mul.f32 f1226, f1091, 0fBF7F7EAE; +mul.f32 f1552, f1073, 0f3D809851; +sub.f32 f1227, f1552, f1226; +mul.f32 f1228, f1091, 0f3D809851; +fma.rn.f32 f1229, f1073, 0fBF7F7EAE, f1228; +mul.f32 f1231, f1135, 0fBF45405B; +mul.f32 f1551, f1117, 0fBF232E38; +sub.f32 f1232, f1551, f1231; +mul.f32 f1233, f1135, 0fBF232E38; +fma.rn.f32 f1234, f1117, 0fBF45405B, f1233; +mul.f32 f1236, f1179, 0fBE00575B; +mul.f32 f1550, f1161, 0fBF7DFB3B; +sub.f32 f1237, f1550, f1236; +mul.f32 f1238, f1179, 0fBF7DFB3B; +fma.rn.f32 f1239, f1161, 0fBE00575B, f1238; +mul.f32 f1241, f1039, 0fBF5825E0; +mul.f32 f1549, f1021, 0f3F092BF2; +sub.f32 f1242, f1549, f1241; +mul.f32 f1243, f1039, 0f3F092BF2; +fma.rn.f32 f1244, f1021, 0fBF5825E0, f1243; +mul.f32 f1246, f1083, 0fBF67A2BF; +mul.f32 f1548, f1065, 0fBED9FFBE; +sub.f32 f1247, f1548, f1246; +mul.f32 f1248, f1083, 0fBED9FFBE; +fma.rn.f32 f1249, f1065, 0fBF67A2BF, f1248; +mul.f32 f1546, f1109, 0fBF7DFB3B; +mul.f32 f1547, f1127, 0fBE00575B; +sub.f32 f1252, f1546, f1547; +mul.f32 f1253, f1127, 
0fBF7DFB3B; +fma.rn.f32 f1254, f1109, 0fBE00575B, f1253; +mul.f32 f1544, f1153, 0fBF232E38; +mul.f32 f1545, f1171, 0f3F45405B; +sub.f32 f1257, f1544, f1545; +mul.f32 f1258, f1171, 0fBF232E38; +fma.rn.f32 f1259, f1153, 0f3F45405B, f1258; +add.f32 f1260, f1007, f1139; +add.f32 f1262, f1051, f1095; +mul.f32 f1267, f1262, 0f3F4F1BBD; +fma.rn.f32 f1543, f1260, 0f3E9E377A, f963; +sub.f32 f1268, f1543, f1267; +add.f32 f1542, f1011, f1143; +sub.f32 f1269, f1011, f1143; +add.f32 f1541, f1055, f1099; +sub.f32 f1271, f1055, f1099; +mul.f32 f1272, f1271, 0fBF167918; +mul.f32 f1540, f1269, 0f3F737871; +sub.f32 f1273, f1272, f1540; +add.f32 f1539, f963, f1260; +mul.f32 f1274, f1260, 0f3F4F1BBD; +sub.f32 f1275, f963, f1274; +fma.rn.f32 f1276, f1262, 0f3E9E377A, f1275; +mul.f32 f1277, f1269, 0f3F167918; +mul.f32 f1278, f1271, 0f3F737871; +sub.f32 f1279, f1278, f1277; +mul.f32 f1281, f1541, 0f3F4F1BBD; +fma.rn.f32 f1538, f1542, 0f3E9E377A, f967; +sub.f32 f1282, f1538, f1281; +sub.f32 f1283, f1007, f1139; +sub.f32 f1285, f1051, f1095; +mul.f32 f1286, f1285, 0fBF167918; +mul.f32 f1537, f1283, 0f3F737871; +sub.f32 f1287, f1286, f1537; +add.f32 f1536, f967, f1542; +mul.f32 f1288, f1542, 0f3F4F1BBD; +sub.f32 f1289, f967, f1288; +fma.rn.f32 f1290, f1541, 0f3E9E377A, f1289; +mul.f32 f1291, f1283, 0f3F167918; +mul.f32 f1292, f1285, 0f3F737871; +sub.f32 f1293, f1292, f1291; +add.f32 f1294, f1182, f1197; +add.f32 f1296, f1187, f1192; +fma.rn.f32 f1534, f1294, 0f3E9E377A, f976; +mul.f32 f1535, f1296, 0f3F4F1BBD; +sub.f32 f1302, f1534, f1535; +add.f32 f1533, f1184, f1199; +sub.f32 f1303, f1184, f1199; +add.f32 f1532, f1189, f1194; +sub.f32 f1305, f1189, f1194; +mul.f32 f1530, f1303, 0f3F737871; +mul.f32 f1531, f1305, 0fBF167918; +sub.f32 f1307, f1531, f1530; +add.f32 f1529, f976, f1294; +mul.f32 f1308, f1294, 0f3F4F1BBD; +sub.f32 f1309, f976, f1308; +fma.rn.f32 f1310, f1296, 0f3E9E377A, f1309; +mul.f32 f1311, f1303, 0f3F167918; +mul.f32 f1312, f1305, 0f3F737871; +sub.f32 f1313, f1312, f1311; 
+fma.rn.f32 f1527, f1533, 0f3E9E377A, f994; +mul.f32 f1528, f1532, 0f3F4F1BBD; +sub.f32 f1316, f1527, f1528; +sub.f32 f1317, f1182, f1197; +sub.f32 f1319, f1187, f1192; +mul.f32 f1525, f1317, 0f3F737871; +mul.f32 f1526, f1319, 0fBF167918; +sub.f32 f1321, f1526, f1525; +add.f32 f1524, f994, f1533; +mul.f32 f1322, f1533, 0f3F4F1BBD; +sub.f32 f1323, f994, f1322; +fma.rn.f32 f1324, f1532, 0f3E9E377A, f1323; +mul.f32 f1325, f1317, 0f3F167918; +mul.f32 f1326, f1319, 0f3F737871; +sub.f32 f1327, f1326, f1325; +add.f32 f1328, f1202, f1217; +add.f32 f1330, f1207, f1212; +mul.f32 f1335, f1330, 0f3F4F1BBD; +fma.rn.f32 f1523, f1328, 0f3E9E377A, f984; +sub.f32 f1336, f1523, f1335; +add.f32 f1522, f1204, f1219; +sub.f32 f1337, f1204, f1219; +add.f32 f1521, f1209, f1214; +sub.f32 f1339, f1209, f1214; +mul.f32 f1519, f1337, 0f3F737871; +mul.f32 f1520, f1339, 0fBF167918; +sub.f32 f1341, f1520, f1519; +add.f32 f1518, f984, f1328; +mul.f32 f1342, f1328, 0f3F4F1BBD; +sub.f32 f1343, f984, f1342; +fma.rn.f32 f1344, f1330, 0f3E9E377A, f1343; +mul.f32 f1345, f1337, 0f3F167918; +mul.f32 f1346, f1339, 0f3F737871; +sub.f32 f1347, f1346, f1345; +mul.f32 f1349, f1521, 0f3F4F1BBD; +fma.rn.f32 f1517, f1522, 0f3E9E377A, f1002; +sub.f32 f1350, f1517, f1349; +sub.f32 f1351, f1202, f1217; +sub.f32 f1353, f1207, f1212; +mul.f32 f1515, f1351, 0f3F737871; +mul.f32 f1516, f1353, 0fBF167918; +sub.f32 f1355, f1516, f1515; +add.f32 f1514, f1002, f1522; +mul.f32 f1356, f1522, 0f3F4F1BBD; +sub.f32 f1357, f1002, f1356; +fma.rn.f32 f1358, f1521, 0f3E9E377A, f1357; +mul.f32 f1359, f1351, 0f3F167918; +mul.f32 f1360, f1353, 0f3F737871; +sub.f32 f1361, f1360, f1359; +add.f32 f1362, f1222, f1237; +add.f32 f1364, f1227, f1232; +mul.f32 f1369, f1364, 0f3F4F1BBD; +fma.rn.f32 f1513, f1362, 0f3E9E377A, f985; +sub.f32 f1370, f1513, f1369; +add.f32 f1512, f1224, f1239; +sub.f32 f1371, f1224, f1239; +add.f32 f1511, f1229, f1234; +sub.f32 f1373, f1229, f1234; +mul.f32 f1374, f1373, 0fBF167918; +mul.f32 f1510, f1371, 
0f3F737871; +sub.f32 f1375, f1374, f1510; +add.f32 f1509, f985, f1362; +mul.f32 f1376, f1362, 0f3F4F1BBD; +sub.f32 f1377, f985, f1376; +fma.rn.f32 f1378, f1364, 0f3E9E377A, f1377; +mul.f32 f1379, f1371, 0f3F167918; +mul.f32 f1380, f1373, 0f3F737871; +sub.f32 f1381, f1380, f1379; +mul.f32 f1383, f1511, 0f3F4F1BBD; +fma.rn.f32 f1508, f1512, 0f3E9E377A, f1003; +sub.f32 f1384, f1508, f1383; +sub.f32 f1385, f1222, f1237; +sub.f32 f1387, f1227, f1232; +mul.f32 f1388, f1387, 0fBF167918; +mul.f32 f1507, f1385, 0f3F737871; +sub.f32 f1389, f1388, f1507; +add.f32 f1506, f1003, f1512; +mul.f32 f1390, f1512, 0f3F4F1BBD; +sub.f32 f1391, f1003, f1390; +fma.rn.f32 f1392, f1511, 0f3E9E377A, f1391; +mul.f32 f1393, f1385, 0f3F167918; +mul.f32 f1394, f1387, 0f3F737871; +sub.f32 f1395, f1394, f1393; +add.f32 f1396, f1242, f1257; +add.f32 f1398, f1247, f1252; +fma.rn.f32 f1504, f1396, 0f3E9E377A, f977; +mul.f32 f1505, f1398, 0f3F4F1BBD; +sub.f32 f1404, f1504, f1505; +add.f32 f1503, f1244, f1259; +sub.f32 f1405, f1244, f1259; +add.f32 f1502, f1249, f1254; +sub.f32 f1407, f1249, f1254; +mul.f32 f1500, f1405, 0f3F737871; +mul.f32 f1501, f1407, 0fBF167918; +sub.f32 f1409, f1501, f1500; +add.f32 f1499, f977, f1396; +mul.f32 f1410, f1396, 0f3F4F1BBD; +sub.f32 f1411, f977, f1410; +fma.rn.f32 f1412, f1398, 0f3E9E377A, f1411; +mul.f32 f1413, f1405, 0f3F167918; +mul.f32 f1414, f1407, 0f3F737871; +sub.f32 f1415, f1414, f1413; +fma.rn.f32 f1497, f1503, 0f3E9E377A, f995; +mul.f32 f1498, f1502, 0f3F4F1BBD; +sub.f32 f1418, f1497, f1498; +sub.f32 f1419, f1242, f1257; +sub.f32 f1421, f1247, f1252; +mul.f32 f1495, f1419, 0f3F737871; +mul.f32 f1496, f1421, 0fBF167918; +sub.f32 f1423, f1496, f1495; +add.f32 f1494, f995, f1503; +mul.f32 f1424, f1503, 0f3F4F1BBD; +sub.f32 f1425, f995, f1424; +fma.rn.f32 f1426, f1502, 0f3E9E377A, f1425; +mul.f32 f1427, f1419, 0f3F167918; +mul.f32 f1428, f1421, 0f3F737871; +sub.f32 f1429, f1428, f1427; +add.f32 %1, f1541, f1536; +add.f32 %0, f1262, f1539; +add.f32 %3, f1532, 
f1524; +add.f32 %2, f1296, f1529; +add.f32 %5, f1521, f1514; +add.f32 %4, f1330, f1518; +add.f32 %7, f1511, f1506; +add.f32 %6, f1364, f1509; +add.f32 %9, f1502, f1494; +add.f32 %8, f1398, f1499; +add.f32 %11, f1287, f1282; +sub.f32 %10, f1268, f1273; +add.f32 %13, f1321, f1316; +sub.f32 %12, f1302, f1307; +sub.f32 %14, f1336, f1341; +add.f32 %15, f1355, f1350; +sub.f32 %16, f1370, f1375; +add.f32 %17, f1389, f1384; +sub.f32 %18, f1404, f1409; +add.f32 %19, f1423, f1418; +sub.f32 %20, f1276, f1279; +add.f32 %21, f1293, f1290; +add.f32 %23, f1327, f1324; +sub.f32 %22, f1310, f1313; +add.f32 %25, f1361, f1358; +sub.f32 %24, f1344, f1347; +sub.f32 %26, f1378, f1381; +add.f32 %27, f1395, f1392; +sub.f32 %28, f1412, f1415; +add.f32 %29, f1429, f1426; +sub.f32 %31, f1290, f1293; +add.f32 %30, f1279, f1276; +sub.f32 %33, f1324, f1327; +add.f32 %32, f1313, f1310; +sub.f32 %35, f1358, f1361; +add.f32 %34, f1347, f1344; +sub.f32 %37, f1392, f1395; +add.f32 %36, f1381, f1378; +sub.f32 %39, f1426, f1429; +add.f32 %38, f1415, f1412; +sub.f32 %41, f1282, f1287; +add.f32 %40, f1273, f1268; +sub.f32 %43, f1316, f1321; +add.f32 %42, f1307, f1302; +sub.f32 %45, f1350, f1355; +add.f32 %44, f1341, f1336; +sub.f32 %47, f1384, f1389; +add.f32 %46, f1375, f1370; +sub.f32 %49, f1418, f1423; +add.f32 %48, f1409, f1404; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), 
"=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), "f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<162, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1430>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 2500, r2; +add.f32 f101, %65, %105; +add.f32 f102, %52, f101; +add.f32 f103, %78, %92; +add.f32 f104, f103, f102; +add.f32 f105, %67, %107; +add.f32 f106, %53, f105; +add.f32 f107, %80, %93; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %52; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %67, %107; +mul.f32 f113, f112, 
0f3F737871; +sub.f32 f114, %80, %93; +mul.f32 f115, f114, 0fBF167918; +sub.f32 f116, f115, f113; +sub.f32 f117, f111, f116; +add.f32 f118, f116, f111; +mul.f32 f119, f101, 0f3F4F1BBD; +sub.f32 f120, %52, f119; +fma.rn.f32 f121, f103, 0f3E9E377A, f120; +mul.f32 f122, f112, 0f3F167918; +mul.f32 f123, f114, 0f3F737871; +sub.f32 f124, f123, f122; +sub.f32 f125, f121, f124; +add.f32 f126, f124, f121; +fma.rn.f32 f127, f105, 0f3E9E377A, %53; +mul.f32 f128, f107, 0f3F4F1BBD; +sub.f32 f129, f127, f128; +sub.f32 f130, %65, %105; +mul.f32 f131, f130, 0f3F737871; +sub.f32 f132, %78, %92; +mul.f32 f133, f132, 0fBF167918; +sub.f32 f134, f133, f131; +add.f32 f135, f134, f129; +sub.f32 f136, f129, f134; +mul.f32 f137, f105, 0f3F4F1BBD; +sub.f32 f138, %53, f137; +fma.rn.f32 f139, f107, 0f3E9E377A, f138; +mul.f32 f140, f130, 0f3F167918; +mul.f32 f141, f132, 0f3F737871; +sub.f32 f142, f141, f140; +add.f32 f143, f142, f139; +sub.f32 f144, f139, f142; +add.f32 f145, %68, %108; +add.f32 f146, %54, f145; +add.f32 f147, %81, %94; +add.f32 f148, f147, f146; +add.f32 f149, %69, %109; +add.f32 f150, %56, f149; +add.f32 f151, %83, %96; +add.f32 f152, f151, f150; +fma.rn.f32 f153, f145, 0f3E9E377A, %54; +mul.f32 f154, f147, 0f3F4F1BBD; +sub.f32 f155, f153, f154; +sub.f32 f156, %69, %109; +mul.f32 f157, f156, 0f3F737871; +sub.f32 f158, %83, %96; +mul.f32 f159, f158, 0fBF167918; +sub.f32 f160, f159, f157; +sub.f32 f161, f155, f160; +add.f32 f162, f160, f155; +mul.f32 f163, f145, 0f3F4F1BBD; +sub.f32 f164, %54, f163; +fma.rn.f32 f165, f147, 0f3E9E377A, f164; +mul.f32 f166, f156, 0f3F167918; +mul.f32 f167, f158, 0f3F737871; +sub.f32 f168, f167, f166; +sub.f32 f169, f165, f168; +add.f32 f170, f168, f165; +fma.rn.f32 f171, f149, 0f3E9E377A, %56; +mul.f32 f172, f151, 0f3F4F1BBD; +sub.f32 f173, f171, f172; +sub.f32 f174, %68, %108; +mul.f32 f175, f174, 0f3F737871; +sub.f32 f176, %81, %94; +mul.f32 f177, f176, 0fBF167918; +sub.f32 f178, f177, f175; +add.f32 f179, f178, f173; +sub.f32 f180, f173, f178; 
+mul.f32 f181, f149, 0f3F4F1BBD; +sub.f32 f182, %56, f181; +fma.rn.f32 f183, f151, 0f3E9E377A, f182; +mul.f32 f184, f174, 0f3F167918; +mul.f32 f185, f176, 0f3F737871; +sub.f32 f186, f185, f184; +add.f32 f187, f186, f183; +sub.f32 f188, f183, f186; +add.f32 f189, %70, %110; +add.f32 f190, %57, f189; +add.f32 f191, %84, %97; +add.f32 f192, f191, f190; +add.f32 f193, %72, %112; +add.f32 f194, %59, f193; +add.f32 f195, %85, %99; +add.f32 f196, f195, f194; +fma.rn.f32 f197, f189, 0f3E9E377A, %57; +mul.f32 f198, f191, 0f3F4F1BBD; +sub.f32 f199, f197, f198; +sub.f32 f200, %72, %112; +mul.f32 f201, f200, 0f3F737871; +sub.f32 f202, %85, %99; +mul.f32 f203, f202, 0fBF167918; +sub.f32 f204, f203, f201; +sub.f32 f205, f199, f204; +add.f32 f206, f204, f199; +mul.f32 f207, f189, 0f3F4F1BBD; +sub.f32 f208, %57, f207; +fma.rn.f32 f209, f191, 0f3E9E377A, f208; +mul.f32 f210, f200, 0f3F167918; +mul.f32 f211, f202, 0f3F737871; +sub.f32 f212, f211, f210; +sub.f32 f213, f209, f212; +add.f32 f214, f212, f209; +fma.rn.f32 f215, f193, 0f3E9E377A, %59; +mul.f32 f216, f195, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, %70, %110; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, %84, %97; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +add.f32 f223, f222, f217; +sub.f32 f224, f217, f222; +mul.f32 f225, f193, 0f3F4F1BBD; +sub.f32 f226, %59, f225; +fma.rn.f32 f227, f195, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +add.f32 f231, f230, f227; +sub.f32 f232, f227, f230; +add.f32 f233, %73, %113; +add.f32 f234, %60, f233; +add.f32 f235, %86, %100; +add.f32 f236, f235, f234; +add.f32 f237, %75, %115; +add.f32 f238, %61, f237; +add.f32 f239, %88, %101; +add.f32 f240, f239, f238; +fma.rn.f32 f241, f233, 0f3E9E377A, %60; +mul.f32 f242, f235, 0f3F4F1BBD; +sub.f32 f243, f241, f242; +sub.f32 f244, %75, %115; +mul.f32 f245, f244, 0f3F737871; +sub.f32 f246, %88, %101; +mul.f32 f247, f246, 0fBF167918; +sub.f32 f248, f247, 
f245; +sub.f32 f249, f243, f248; +add.f32 f250, f248, f243; +mul.f32 f251, f233, 0f3F4F1BBD; +sub.f32 f252, %60, f251; +fma.rn.f32 f253, f235, 0f3E9E377A, f252; +mul.f32 f254, f244, 0f3F167918; +mul.f32 f255, f246, 0f3F737871; +sub.f32 f256, f255, f254; +sub.f32 f257, f253, f256; +add.f32 f258, f256, f253; +fma.rn.f32 f259, f237, 0f3E9E377A, %61; +mul.f32 f260, f239, 0f3F4F1BBD; +sub.f32 f261, f259, f260; +sub.f32 f262, %73, %113; +mul.f32 f263, f262, 0f3F737871; +sub.f32 f264, %86, %100; +mul.f32 f265, f264, 0fBF167918; +sub.f32 f266, f265, f263; +add.f32 f267, f266, f261; +sub.f32 f268, f261, f266; +mul.f32 f269, f237, 0f3F4F1BBD; +sub.f32 f270, %61, f269; +fma.rn.f32 f271, f239, 0f3E9E377A, f270; +mul.f32 f272, f262, 0f3F167918; +mul.f32 f273, f264, 0f3F737871; +sub.f32 f274, f273, f272; +add.f32 f275, f274, f271; +sub.f32 f276, f271, f274; +add.f32 f277, %76, %116; +add.f32 f278, %62, f277; +add.f32 f279, %89, %102; +add.f32 f280, f279, f278; +add.f32 f281, %77, %117; +add.f32 f282, %64, f281; +add.f32 f283, %91, %104; +add.f32 f284, f283, f282; +fma.rn.f32 f285, f277, 0f3E9E377A, %62; +mul.f32 f286, f279, 0f3F4F1BBD; +sub.f32 f287, f285, f286; +sub.f32 f288, %77, %117; +mul.f32 f289, f288, 0f3F737871; +sub.f32 f290, %91, %104; +mul.f32 f291, f290, 0fBF167918; +sub.f32 f292, f291, f289; +sub.f32 f293, f287, f292; +add.f32 f294, f292, f287; +mul.f32 f295, f277, 0f3F4F1BBD; +sub.f32 f296, %62, f295; +fma.rn.f32 f297, f279, 0f3E9E377A, f296; +mul.f32 f298, f288, 0f3F167918; +mul.f32 f299, f290, 0f3F737871; +sub.f32 f300, f299, f298; +sub.f32 f301, f297, f300; +add.f32 f302, f300, f297; +fma.rn.f32 f303, f281, 0f3E9E377A, %64; +mul.f32 f304, f283, 0f3F4F1BBD; +sub.f32 f305, f303, f304; +sub.f32 f306, %76, %116; +mul.f32 f307, f306, 0f3F737871; +sub.f32 f308, %89, %102; +mul.f32 f309, f308, 0fBF167918; +sub.f32 f310, f309, f307; +add.f32 f311, f310, f305; +sub.f32 f312, f305, f310; +mul.f32 f313, f281, 0f3F4F1BBD; +sub.f32 f314, %64, f313; +fma.rn.f32 f315, f283, 
0f3E9E377A, f314; +mul.f32 f316, f306, 0f3F167918; +mul.f32 f317, f308, 0f3F737871; +sub.f32 f318, f317, f316; +add.f32 f319, f318, f315; +sub.f32 f320, f315, f318; +mov.u32 r4, %tid.x; +mul.f32 f321, f161, 0f3F77F511; +mul.f32 f322, f179, 0fBE7EA890; +sub.f32 f323, f321, f322; +mul.f32 f324, f179, 0f3F77F511; +fma.rn.f32 f325, f161, 0fBE7EA890, f324; +mul.f32 f326, f205, 0f3F6055A2; +mul.f32 f327, f223, 0fBEF6A86B; +sub.f32 f328, f326, f327; +mul.f32 f329, f223, 0f3F6055A2; +fma.rn.f32 f330, f205, 0fBEF6A86B, f329; +mul.f32 f331, f249, 0f3F3A9DB0; +mul.f32 f332, f267, 0fBF2F3E7B; +sub.f32 f333, f331, f332; +mul.f32 f334, f267, 0f3F3A9DB0; +fma.rn.f32 f335, f249, 0fBF2F3E7B, f334; +mul.f32 f336, f293, 0f3F092BF2; +mul.f32 f337, f311, 0fBF5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f311, 0f3F092BF2; +fma.rn.f32 f340, f293, 0fBF5825E0, f339; +mul.f32 f341, f169, 0f3F6055A2; +mul.f32 f342, f187, 0fBEF6A86B; +sub.f32 f343, f341, f342; +mul.f32 f344, f187, 0f3F6055A2; +fma.rn.f32 f345, f169, 0fBEF6A86B, f344; +mul.f32 f346, f213, 0f3F092BF2; +mul.f32 f347, f231, 0fBF5825E0; +sub.f32 f348, f346, f347; +mul.f32 f349, f231, 0f3F092BF2; +fma.rn.f32 f350, f213, 0fBF5825E0, f349; +mul.f32 f351, f257, 0f3D809851; +mul.f32 f352, f275, 0fBF7F7EAE; +sub.f32 f353, f351, f352; +mul.f32 f354, f275, 0f3D809851; +fma.rn.f32 f355, f257, 0fBF7F7EAE, f354; +mul.f32 f356, f301, 0fBED9FFBE; +mul.f32 f357, f319, 0fBF67A2BF; +sub.f32 f358, f356, f357; +mul.f32 f359, f319, 0fBED9FFBE; +fma.rn.f32 f360, f301, 0fBF67A2BF, f359; +mul.f32 f361, f170, 0f3F3A9DB0; +mul.f32 f362, f188, 0fBF2F3E7B; +sub.f32 f363, f361, f362; +mul.f32 f364, f188, 0f3F3A9DB0; +fma.rn.f32 f365, f170, 0fBF2F3E7B, f364; +mul.f32 f366, f214, 0f3D809851; +mul.f32 f367, f232, 0fBF7F7EAE; +sub.f32 f368, f366, f367; +mul.f32 f369, f232, 0f3D809851; +fma.rn.f32 f370, f214, 0fBF7F7EAE, f369; +mul.f32 f371, f258, 0fBF232E38; +mul.f32 f372, f276, 0fBF45405B; +sub.f32 f373, f371, f372; +mul.f32 f374, f276, 0fBF232E38; 
+fma.rn.f32 f375, f258, 0fBF45405B, f374; +mul.f32 f376, f302, 0fBF7DFB3B; +mul.f32 f377, f320, 0fBE00575B; +sub.f32 f378, f376, f377; +mul.f32 f379, f320, 0fBF7DFB3B; +fma.rn.f32 f380, f302, 0fBE00575B, f379; +mul.f32 f381, f162, 0f3F092BF2; +mul.f32 f382, f180, 0fBF5825E0; +sub.f32 f383, f381, f382; +mul.f32 f384, f180, 0f3F092BF2; +fma.rn.f32 f385, f162, 0fBF5825E0, f384; +mul.f32 f386, f206, 0fBED9FFBE; +mul.f32 f387, f224, 0fBF67A2BF; +sub.f32 f388, f386, f387; +mul.f32 f389, f224, 0fBED9FFBE; +fma.rn.f32 f390, f206, 0fBF67A2BF, f389; +mul.f32 f391, f250, 0fBF7DFB3B; +mul.f32 f392, f268, 0fBE00575B; +sub.f32 f393, f391, f392; +mul.f32 f394, f268, 0fBF7DFB3B; +fma.rn.f32 f395, f250, 0fBE00575B, f394; +mul.f32 f396, f294, 0fBF232E38; +mul.f32 f397, f312, 0f3F45405B; +sub.f32 f398, f396, f397; +mul.f32 f399, f312, 0fBF232E38; +fma.rn.f32 f400, f294, 0f3F45405B, f399; +add.f32 f401, f148, f280; +add.f32 f402, f104, f401; +add.f32 f403, f192, f236; +add.f32 f404, f403, f402; +add.f32 f405, f152, f284; +add.f32 f406, f108, f405; +add.f32 f407, f196, f240; +add.f32 f408, f407, f406; +fma.rn.f32 f409, f401, 0f3E9E377A, f104; +mul.f32 f410, f403, 0f3F4F1BBD; +sub.f32 f411, f409, f410; +sub.f32 f412, f152, f284; +mul.f32 f413, f412, 0f3F737871; +sub.f32 f414, f196, f240; +mul.f32 f415, f414, 0fBF167918; +sub.f32 f416, f415, f413; +sub.f32 f417, f411, f416; +add.f32 f418, f416, f411; +mul.f32 f419, f401, 0f3F4F1BBD; +sub.f32 f420, f104, f419; +fma.rn.f32 f421, f403, 0f3E9E377A, f420; +mul.f32 f422, f412, 0f3F167918; +mul.f32 f423, f414, 0f3F737871; +sub.f32 f424, f423, f422; +sub.f32 f425, f421, f424; +add.f32 f426, f424, f421; +fma.rn.f32 f427, f405, 0f3E9E377A, f108; +mul.f32 f428, f407, 0f3F4F1BBD; +sub.f32 f429, f427, f428; +sub.f32 f430, f148, f280; +mul.f32 f431, f430, 0f3F737871; +sub.f32 f432, f192, f236; +mul.f32 f433, f432, 0fBF167918; +sub.f32 f434, f433, f431; +add.f32 f435, f434, f429; +sub.f32 f436, f429, f434; +mul.f32 f437, f405, 0f3F4F1BBD; +sub.f32 
f438, f108, f437; +fma.rn.f32 f439, f407, 0f3E9E377A, f438; +mul.f32 f440, f430, 0f3F167918; +mul.f32 f441, f432, 0f3F737871; +sub.f32 f442, f441, f440; +add.f32 f443, f442, f439; +sub.f32 f444, f439, f442; +add.f32 f445, f323, f338; +add.f32 f446, f117, f445; +add.f32 f447, f328, f333; +add.f32 f448, f447, f446; +add.f32 f449, f325, f340; +add.f32 f450, f135, f449; +add.f32 f451, f330, f335; +add.f32 f452, f451, f450; +fma.rn.f32 f453, f445, 0f3E9E377A, f117; +mul.f32 f454, f447, 0f3F4F1BBD; +sub.f32 f455, f453, f454; +sub.f32 f456, f325, f340; +mul.f32 f457, f456, 0f3F737871; +sub.f32 f458, f330, f335; +mul.f32 f459, f458, 0fBF167918; +sub.f32 f460, f459, f457; +sub.f32 f461, f455, f460; +add.f32 f462, f460, f455; +mul.f32 f463, f445, 0f3F4F1BBD; +sub.f32 f464, f117, f463; +fma.rn.f32 f465, f447, 0f3E9E377A, f464; +mul.f32 f466, f456, 0f3F167918; +mul.f32 f467, f458, 0f3F737871; +sub.f32 f468, f467, f466; +sub.f32 f469, f465, f468; +add.f32 f470, f468, f465; +fma.rn.f32 f471, f449, 0f3E9E377A, f135; +mul.f32 f472, f451, 0f3F4F1BBD; +sub.f32 f473, f471, f472; +sub.f32 f474, f323, f338; +mul.f32 f475, f474, 0f3F737871; +sub.f32 f476, f328, f333; +mul.f32 f477, f476, 0fBF167918; +sub.f32 f478, f477, f475; +add.f32 f479, f478, f473; +sub.f32 f480, f473, f478; +mul.f32 f481, f449, 0f3F4F1BBD; +sub.f32 f482, f135, f481; +fma.rn.f32 f483, f451, 0f3E9E377A, f482; +mul.f32 f484, f474, 0f3F167918; +mul.f32 f485, f476, 0f3F737871; +sub.f32 f486, f485, f484; +add.f32 f487, f486, f483; +sub.f32 f488, f483, f486; +add.f32 f489, f343, f358; +add.f32 f490, f125, f489; +add.f32 f491, f348, f353; +add.f32 f492, f491, f490; +add.f32 f493, f345, f360; +add.f32 f494, f143, f493; +add.f32 f495, f350, f355; +add.f32 f496, f495, f494; +fma.rn.f32 f497, f489, 0f3E9E377A, f125; +mul.f32 f498, f491, 0f3F4F1BBD; +sub.f32 f499, f497, f498; +sub.f32 f500, f345, f360; +mul.f32 f501, f500, 0f3F737871; +sub.f32 f502, f350, f355; +mul.f32 f503, f502, 0fBF167918; +sub.f32 f504, f503, f501; 
+sub.f32 f505, f499, f504; +add.f32 f506, f504, f499; +mul.f32 f507, f489, 0f3F4F1BBD; +sub.f32 f508, f125, f507; +fma.rn.f32 f509, f491, 0f3E9E377A, f508; +mul.f32 f510, f500, 0f3F167918; +mul.f32 f511, f502, 0f3F737871; +sub.f32 f512, f511, f510; +sub.f32 f513, f509, f512; +add.f32 f514, f512, f509; +fma.rn.f32 f515, f493, 0f3E9E377A, f143; +mul.f32 f516, f495, 0f3F4F1BBD; +sub.f32 f517, f515, f516; +sub.f32 f518, f343, f358; +mul.f32 f519, f518, 0f3F737871; +sub.f32 f520, f348, f353; +mul.f32 f521, f520, 0fBF167918; +sub.f32 f522, f521, f519; +add.f32 f523, f522, f517; +sub.f32 f524, f517, f522; +mul.f32 f525, f493, 0f3F4F1BBD; +sub.f32 f526, f143, f525; +fma.rn.f32 f527, f495, 0f3E9E377A, f526; +mul.f32 f528, f518, 0f3F167918; +mul.f32 f529, f520, 0f3F737871; +sub.f32 f530, f529, f528; +add.f32 f531, f530, f527; +sub.f32 f532, f527, f530; +add.f32 f533, f363, f378; +add.f32 f534, f126, f533; +add.f32 f535, f368, f373; +add.f32 f536, f535, f534; +add.f32 f537, f365, f380; +add.f32 f538, f144, f537; +add.f32 f539, f370, f375; +add.f32 f540, f539, f538; +fma.rn.f32 f541, f533, 0f3E9E377A, f126; +mul.f32 f542, f535, 0f3F4F1BBD; +sub.f32 f543, f541, f542; +sub.f32 f544, f365, f380; +mul.f32 f545, f544, 0f3F737871; +sub.f32 f546, f370, f375; +mul.f32 f547, f546, 0fBF167918; +sub.f32 f548, f547, f545; +sub.f32 f549, f543, f548; +add.f32 f550, f548, f543; +mul.f32 f551, f533, 0f3F4F1BBD; +sub.f32 f552, f126, f551; +fma.rn.f32 f553, f535, 0f3E9E377A, f552; +mul.f32 f554, f544, 0f3F167918; +mul.f32 f555, f546, 0f3F737871; +sub.f32 f556, f555, f554; +sub.f32 f557, f553, f556; +add.f32 f558, f556, f553; +fma.rn.f32 f559, f537, 0f3E9E377A, f144; +mul.f32 f560, f539, 0f3F4F1BBD; +sub.f32 f561, f559, f560; +sub.f32 f562, f363, f378; +mul.f32 f563, f562, 0f3F737871; +sub.f32 f564, f368, f373; +mul.f32 f565, f564, 0fBF167918; +sub.f32 f566, f565, f563; +add.f32 f567, f566, f561; +sub.f32 f568, f561, f566; +mul.f32 f569, f537, 0f3F4F1BBD; +sub.f32 f570, f144, f569; +fma.rn.f32 
f571, f539, 0f3E9E377A, f570; +mul.f32 f572, f562, 0f3F167918; +mul.f32 f573, f564, 0f3F737871; +sub.f32 f574, f573, f572; +add.f32 f575, f574, f571; +sub.f32 f576, f571, f574; +add.f32 f577, f383, f398; +add.f32 f578, f118, f577; +add.f32 f579, f388, f393; +add.f32 f580, f579, f578; +add.f32 f581, f385, f400; +add.f32 f582, f136, f581; +add.f32 f583, f390, f395; +add.f32 f584, f583, f582; +fma.rn.f32 f585, f577, 0f3E9E377A, f118; +mul.f32 f586, f579, 0f3F4F1BBD; +sub.f32 f587, f585, f586; +sub.f32 f588, f385, f400; +mul.f32 f589, f588, 0f3F737871; +sub.f32 f590, f390, f395; +mul.f32 f591, f590, 0fBF167918; +sub.f32 f592, f591, f589; +sub.f32 f593, f587, f592; +add.f32 f594, f592, f587; +mul.f32 f595, f577, 0f3F4F1BBD; +sub.f32 f596, f118, f595; +fma.rn.f32 f597, f579, 0f3E9E377A, f596; +mul.f32 f598, f588, 0f3F167918; +mul.f32 f599, f590, 0f3F737871; +sub.f32 f600, f599, f598; +sub.f32 f601, f597, f600; +add.f32 f602, f600, f597; +fma.rn.f32 f603, f581, 0f3E9E377A, f136; +mul.f32 f604, f583, 0f3F4F1BBD; +sub.f32 f605, f603, f604; +sub.f32 f606, f383, f398; +mul.f32 f607, f606, 0f3F737871; +sub.f32 f608, f388, f393; +mul.f32 f609, f608, 0fBF167918; +sub.f32 f610, f609, f607; +add.f32 f611, f610, f605; +sub.f32 f612, f605, f610; +mul.f32 f613, f581, 0f3F4F1BBD; +sub.f32 f614, f136, f613; +fma.rn.f32 f615, f583, 0f3E9E377A, f614; +mul.f32 f616, f606, 0f3F167918; +mul.f32 f617, f608, 0f3F737871; +sub.f32 f618, f617, f616; +add.f32 f619, f618, f615; +sub.f32 f620, f615, f618; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f621, f622}, [rd6]; +mul.f32 f625, f621, f448; +mul.f32 f626, f622, f452; +sub.f32 f627, f625, f626; +mul.f32 f628, f621, f452; +fma.rn.f32 f629, f622, f448, f628; +mul.f32 f630, f621, f621; +mul.f32 f631, f622, f622; +sub.f32 f632, f630, f631; +mul.f32 f633, f622, f621; +fma.rn.f32 f634, 
f622, f621, f633; +mul.f32 f635, f632, f492; +mul.f32 f636, f634, f496; +sub.f32 f637, f635, f636; +mul.f32 f638, f632, f496; +fma.rn.f32 f639, f634, f492, f638; +mul.f32 f640, f621, f632; +mul.f32 f641, f622, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f621, f634; +fma.rn.f32 f644, f622, f632, f643; +mul.f32 f645, f642, f536; +mul.f32 f646, f644, f540; +sub.f32 f647, f645, f646; +mul.f32 f648, f642, f540; +fma.rn.f32 f649, f644, f536, f648; +mul.f32 f650, f621, f642; +mul.f32 f651, f622, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f621, f644; +fma.rn.f32 f654, f622, f642, f653; +mul.f32 f655, f652, f580; +mul.f32 f656, f654, f584; +sub.f32 f657, f655, f656; +mul.f32 f658, f652, f584; +fma.rn.f32 f659, f654, f580, f658; +mul.f32 f660, f621, f652; +mul.f32 f661, f622, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f621, f654; +fma.rn.f32 f664, f622, f652, f663; +mul.f32 f665, f662, f417; +mul.f32 f666, f664, f435; +sub.f32 f667, f665, f666; +mul.f32 f668, f662, f435; +fma.rn.f32 f669, f664, f417, f668; +mul.f32 f670, f621, f662; +mul.f32 f671, f622, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f621, f664; +fma.rn.f32 f674, f622, f662, f673; +mul.f32 f675, f672, f461; +mul.f32 f676, f674, f479; +sub.f32 f677, f675, f676; +mul.f32 f678, f672, f479; +fma.rn.f32 f679, f674, f461, f678; +mul.f32 f680, f621, f672; +mul.f32 f681, f622, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f621, f674; +fma.rn.f32 f684, f622, f672, f683; +mul.f32 f685, f682, f505; +mul.f32 f686, f684, f523; +sub.f32 f687, f685, f686; +mul.f32 f688, f682, f523; +fma.rn.f32 f689, f684, f505, f688; +mul.f32 f690, f621, f682; +mul.f32 f691, f622, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f621, f684; +fma.rn.f32 f694, f622, f682, f693; +mul.f32 f695, f692, f549; +mul.f32 f696, f694, f567; +sub.f32 f697, f695, f696; +mul.f32 f698, f692, f567; +fma.rn.f32 f699, f694, f549, f698; +mul.f32 f700, f621, f692; +mul.f32 f701, f622, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f621, f694; 
+fma.rn.f32 f704, f622, f692, f703; +mul.f32 f705, f702, f593; +mul.f32 f706, f704, f611; +sub.f32 f707, f705, f706; +mul.f32 f708, f702, f611; +fma.rn.f32 f709, f704, f593, f708; +mul.f32 f710, f621, f702; +mul.f32 f711, f622, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f621, f704; +fma.rn.f32 f714, f622, f702, f713; +mul.f32 f715, f712, f425; +mul.f32 f716, f714, f443; +sub.f32 f717, f715, f716; +mul.f32 f718, f712, f443; +fma.rn.f32 f719, f714, f425, f718; +mul.f32 f720, f621, f712; +mul.f32 f721, f622, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f621, f714; +fma.rn.f32 f724, f622, f712, f723; +mul.f32 f725, f722, f469; +mul.f32 f726, f724, f487; +sub.f32 f727, f725, f726; +mul.f32 f728, f722, f487; +fma.rn.f32 f729, f724, f469, f728; +mul.f32 f730, f621, f722; +mul.f32 f731, f622, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f621, f724; +fma.rn.f32 f734, f622, f722, f733; +mul.f32 f735, f732, f513; +mul.f32 f736, f734, f531; +sub.f32 f737, f735, f736; +mul.f32 f738, f732, f531; +fma.rn.f32 f739, f734, f513, f738; +mul.f32 f740, f621, f732; +mul.f32 f741, f622, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f621, f734; +fma.rn.f32 f744, f622, f732, f743; +mul.f32 f745, f742, f557; +mul.f32 f746, f744, f575; +sub.f32 f747, f745, f746; +mul.f32 f748, f742, f575; +fma.rn.f32 f749, f744, f557, f748; +mul.f32 f750, f621, f742; +mul.f32 f751, f622, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f621, f744; +fma.rn.f32 f754, f622, f742, f753; +mul.f32 f755, f752, f601; +mul.f32 f756, f754, f619; +sub.f32 f757, f755, f756; +mul.f32 f758, f752, f619; +fma.rn.f32 f759, f754, f601, f758; +mul.f32 f760, f621, f752; +mul.f32 f761, f622, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f621, f754; +fma.rn.f32 f764, f622, f752, f763; +mul.f32 f765, f762, f426; +mul.f32 f766, f764, f444; +sub.f32 f767, f765, f766; +mul.f32 f768, f762, f444; +fma.rn.f32 f769, f764, f426, f768; +mul.f32 f770, f621, f762; +mul.f32 f771, f622, f764; +sub.f32 f772, f770, f771; +mul.f32 
f773, f621, f764; +fma.rn.f32 f774, f622, f762, f773; +mul.f32 f775, f772, f470; +mul.f32 f776, f774, f488; +sub.f32 f777, f775, f776; +mul.f32 f778, f772, f488; +fma.rn.f32 f779, f774, f470, f778; +mul.f32 f780, f621, f772; +mul.f32 f781, f622, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f621, f774; +fma.rn.f32 f784, f622, f772, f783; +mul.f32 f785, f782, f514; +mul.f32 f786, f784, f532; +sub.f32 f787, f785, f786; +mul.f32 f788, f782, f532; +fma.rn.f32 f789, f784, f514, f788; +mul.f32 f790, f621, f782; +mul.f32 f791, f622, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f621, f784; +fma.rn.f32 f794, f622, f782, f793; +mul.f32 f795, f792, f558; +mul.f32 f796, f794, f576; +sub.f32 f797, f795, f796; +mul.f32 f798, f792, f576; +fma.rn.f32 f799, f794, f558, f798; +mul.f32 f800, f621, f792; +mul.f32 f801, f622, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f621, f794; +fma.rn.f32 f804, f622, f792, f803; +mul.f32 f805, f802, f602; +mul.f32 f806, f804, f620; +sub.f32 f807, f805, f806; +mul.f32 f808, f802, f620; +fma.rn.f32 f809, f804, f602, f808; +mul.f32 f810, f621, f802; +mul.f32 f811, f622, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f621, f804; +fma.rn.f32 f814, f622, f802, f813; +mul.f32 f815, f812, f418; +mul.f32 f816, f814, f436; +sub.f32 f817, f815, f816; +mul.f32 f818, f812, f436; +fma.rn.f32 f819, f814, f418, f818; +mul.f32 f820, f621, f812; +mul.f32 f821, f622, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f621, f814; +fma.rn.f32 f824, f622, f812, f823; +mul.f32 f825, f822, f462; +mul.f32 f826, f824, f480; +sub.f32 f827, f825, f826; +mul.f32 f828, f822, f480; +fma.rn.f32 f829, f824, f462, f828; +mul.f32 f830, f621, f822; +mul.f32 f831, f622, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f621, f824; +fma.rn.f32 f834, f622, f822, f833; +mul.f32 f835, f832, f506; +mul.f32 f836, f834, f524; +sub.f32 f837, f835, f836; +mul.f32 f838, f832, f524; +fma.rn.f32 f839, f834, f506, f838; +mul.f32 f840, f621, f832; +mul.f32 f841, f622, f834; +sub.f32 f842, 
f840, f841; +mul.f32 f843, f621, f834; +fma.rn.f32 f844, f622, f832, f843; +mul.f32 f845, f842, f550; +mul.f32 f846, f844, f568; +sub.f32 f847, f845, f846; +mul.f32 f848, f842, f568; +fma.rn.f32 f849, f844, f550, f848; +mul.f32 f850, f621, f842; +mul.f32 f851, f622, f844; +sub.f32 f852, f850, f851; +mul.f32 f853, f621, f844; +fma.rn.f32 f854, f622, f842, f853; +mul.f32 f855, f852, f594; +mul.f32 f856, f854, f612; +sub.f32 f857, f855, f856; +mul.f32 f858, f852, f612; +fma.rn.f32 f859, f854, f594, f858; +mad.lo.s32 r8, r5, 2500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f404; +st.shared.f32 [r9+4], f627; +st.shared.f32 [r9+8], f637; +st.shared.f32 [r9+12], f647; +st.shared.f32 [r9+16], f657; +st.shared.f32 [r9+20], f667; +st.shared.f32 [r9+24], f677; +st.shared.f32 [r9+28], f687; +st.shared.f32 [r9+32], f697; +st.shared.f32 [r9+36], f707; +st.shared.f32 [r9+40], f717; +st.shared.f32 [r9+44], f727; +st.shared.f32 [r9+48], f737; +st.shared.f32 [r9+52], f747; +st.shared.f32 [r9+56], f757; +st.shared.f32 [r9+60], f767; +st.shared.f32 [r9+64], f777; +st.shared.f32 [r9+68], f787; +st.shared.f32 [r9+72], f797; +st.shared.f32 [r9+76], f807; +st.shared.f32 [r9+80], f817; +st.shared.f32 [r9+84], f827; +st.shared.f32 [r9+88], f837; +st.shared.f32 [r9+92], f847; +st.shared.f32 [r9+96], f857; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f860, [r10]; +ld.shared.f32 f861, [r10+100]; +ld.shared.f32 f862, [r10+200]; +ld.shared.f32 f863, [r10+300]; +ld.shared.f32 f864, [r10+400]; +ld.shared.f32 f865, [r10+500]; +ld.shared.f32 f866, [r10+600]; +ld.shared.f32 f867, [r10+700]; +ld.shared.f32 f868, [r10+800]; +ld.shared.f32 f869, [r10+900]; +ld.shared.f32 f870, [r10+1000]; +ld.shared.f32 f871, [r10+1100]; +ld.shared.f32 f872, [r10+1200]; +ld.shared.f32 f873, [r10+1300]; +ld.shared.f32 f874, [r10+1400]; +ld.shared.f32 f875, [r10+1500]; +ld.shared.f32 f876, [r10+1600]; +ld.shared.f32 f877, [r10+1700]; +ld.shared.f32 f878, [r10+1800]; 
+ld.shared.f32 f879, [r10+1900]; +ld.shared.f32 f880, [r10+2000]; +ld.shared.f32 f881, [r10+2100]; +ld.shared.f32 f882, [r10+2200]; +ld.shared.f32 f883, [r10+2300]; +ld.shared.f32 f884, [r10+2400]; +barrier.sync 0; +st.shared.f32 [r9], f408; +st.shared.f32 [r9+4], f629; +st.shared.f32 [r9+8], f639; +st.shared.f32 [r9+12], f649; +st.shared.f32 [r9+16], f659; +st.shared.f32 [r9+20], f669; +st.shared.f32 [r9+24], f679; +st.shared.f32 [r9+28], f689; +st.shared.f32 [r9+32], f699; +st.shared.f32 [r9+36], f709; +st.shared.f32 [r9+40], f719; +st.shared.f32 [r9+44], f729; +st.shared.f32 [r9+48], f739; +st.shared.f32 [r9+52], f749; +st.shared.f32 [r9+56], f759; +st.shared.f32 [r9+60], f769; +st.shared.f32 [r9+64], f779; +st.shared.f32 [r9+68], f789; +st.shared.f32 [r9+72], f799; +st.shared.f32 [r9+76], f809; +st.shared.f32 [r9+80], f819; +st.shared.f32 [r9+84], f829; +st.shared.f32 [r9+88], f839; +st.shared.f32 [r9+92], f849; +st.shared.f32 [r9+96], f859; +barrier.sync 0; +ld.shared.f32 f885, [r10]; +ld.shared.f32 f886, [r10+100]; +ld.shared.f32 f887, [r10+200]; +ld.shared.f32 f888, [r10+300]; +ld.shared.f32 f889, [r10+400]; +ld.shared.f32 f890, [r10+500]; +ld.shared.f32 f891, [r10+600]; +ld.shared.f32 f892, [r10+700]; +ld.shared.f32 f893, [r10+800]; +ld.shared.f32 f894, [r10+900]; +ld.shared.f32 f895, [r10+1000]; +ld.shared.f32 f896, [r10+1100]; +ld.shared.f32 f897, [r10+1200]; +ld.shared.f32 f898, [r10+1300]; +ld.shared.f32 f899, [r10+1400]; +ld.shared.f32 f900, [r10+1500]; +ld.shared.f32 f901, [r10+1600]; +ld.shared.f32 f902, [r10+1700]; +ld.shared.f32 f903, [r10+1800]; +ld.shared.f32 f904, [r10+1900]; +ld.shared.f32 f905, [r10+2000]; +ld.shared.f32 f906, [r10+2100]; +ld.shared.f32 f907, [r10+2200]; +ld.shared.f32 f908, [r10+2300]; +ld.shared.f32 f909, [r10+2400]; +add.f32 f910, f865, f880; +add.f32 f911, f860, f910; +add.f32 f912, f870, f875; +add.f32 f913, f912, f911; +add.f32 f914, f890, f905; +add.f32 f915, f885, f914; +add.f32 f916, f895, f900; +add.f32 f917, f916, 
f915; +fma.rn.f32 f918, f910, 0f3E9E377A, f860; +mul.f32 f919, f912, 0f3F4F1BBD; +sub.f32 f920, f918, f919; +sub.f32 f921, f890, f905; +mul.f32 f922, f921, 0f3F737871; +sub.f32 f923, f895, f900; +mul.f32 f924, f923, 0fBF167918; +sub.f32 f925, f924, f922; +sub.f32 f926, f920, f925; +add.f32 f927, f925, f920; +mul.f32 f928, f910, 0f3F4F1BBD; +sub.f32 f929, f860, f928; +fma.rn.f32 f930, f912, 0f3E9E377A, f929; +mul.f32 f931, f921, 0f3F167918; +mul.f32 f932, f923, 0f3F737871; +sub.f32 f933, f932, f931; +sub.f32 f934, f930, f933; +add.f32 f935, f933, f930; +fma.rn.f32 f936, f914, 0f3E9E377A, f885; +mul.f32 f937, f916, 0f3F4F1BBD; +sub.f32 f938, f936, f937; +sub.f32 f939, f865, f880; +mul.f32 f940, f939, 0f3F737871; +sub.f32 f941, f870, f875; +mul.f32 f942, f941, 0fBF167918; +sub.f32 f943, f942, f940; +add.f32 f944, f943, f938; +sub.f32 f945, f938, f943; +mul.f32 f946, f914, 0f3F4F1BBD; +sub.f32 f947, f885, f946; +fma.rn.f32 f948, f916, 0f3E9E377A, f947; +mul.f32 f949, f939, 0f3F167918; +mul.f32 f950, f941, 0f3F737871; +sub.f32 f951, f950, f949; +add.f32 f952, f951, f948; +sub.f32 f953, f948, f951; +add.f32 f954, f866, f881; +add.f32 f955, f861, f954; +add.f32 f956, f871, f876; +add.f32 f957, f956, f955; +add.f32 f958, f891, f906; +add.f32 f959, f886, f958; +add.f32 f960, f896, f901; +add.f32 f961, f960, f959; +fma.rn.f32 f962, f954, 0f3E9E377A, f861; +mul.f32 f963, f956, 0f3F4F1BBD; +sub.f32 f964, f962, f963; +sub.f32 f965, f891, f906; +mul.f32 f966, f965, 0f3F737871; +sub.f32 f967, f896, f901; +mul.f32 f968, f967, 0fBF167918; +sub.f32 f969, f968, f966; +sub.f32 f970, f964, f969; +add.f32 f971, f969, f964; +mul.f32 f972, f954, 0f3F4F1BBD; +sub.f32 f973, f861, f972; +fma.rn.f32 f974, f956, 0f3E9E377A, f973; +mul.f32 f975, f965, 0f3F167918; +mul.f32 f976, f967, 0f3F737871; +sub.f32 f977, f976, f975; +sub.f32 f978, f974, f977; +add.f32 f979, f977, f974; +fma.rn.f32 f980, f958, 0f3E9E377A, f886; +mul.f32 f981, f960, 0f3F4F1BBD; +sub.f32 f982, f980, f981; +sub.f32 f983, 
f866, f881; +mul.f32 f984, f983, 0f3F737871; +sub.f32 f985, f871, f876; +mul.f32 f986, f985, 0fBF167918; +sub.f32 f987, f986, f984; +add.f32 f988, f987, f982; +sub.f32 f989, f982, f987; +mul.f32 f990, f958, 0f3F4F1BBD; +sub.f32 f991, f886, f990; +fma.rn.f32 f992, f960, 0f3E9E377A, f991; +mul.f32 f993, f983, 0f3F167918; +mul.f32 f994, f985, 0f3F737871; +sub.f32 f995, f994, f993; +add.f32 f996, f995, f992; +sub.f32 f997, f992, f995; +add.f32 f998, f867, f882; +add.f32 f999, f862, f998; +add.f32 f1000, f872, f877; +add.f32 f1001, f1000, f999; +add.f32 f1002, f892, f907; +add.f32 f1003, f887, f1002; +add.f32 f1004, f897, f902; +add.f32 f1005, f1004, f1003; +fma.rn.f32 f1006, f998, 0f3E9E377A, f862; +mul.f32 f1007, f1000, 0f3F4F1BBD; +sub.f32 f1008, f1006, f1007; +sub.f32 f1009, f892, f907; +mul.f32 f1010, f1009, 0f3F737871; +sub.f32 f1011, f897, f902; +mul.f32 f1012, f1011, 0fBF167918; +sub.f32 f1013, f1012, f1010; +sub.f32 f1014, f1008, f1013; +add.f32 f1015, f1013, f1008; +mul.f32 f1016, f998, 0f3F4F1BBD; +sub.f32 f1017, f862, f1016; +fma.rn.f32 f1018, f1000, 0f3E9E377A, f1017; +mul.f32 f1019, f1009, 0f3F167918; +mul.f32 f1020, f1011, 0f3F737871; +sub.f32 f1021, f1020, f1019; +sub.f32 f1022, f1018, f1021; +add.f32 f1023, f1021, f1018; +fma.rn.f32 f1024, f1002, 0f3E9E377A, f887; +mul.f32 f1025, f1004, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f867, f882; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f872, f877; +mul.f32 f1030, f1029, 0fBF167918; +sub.f32 f1031, f1030, f1028; +add.f32 f1032, f1031, f1026; +sub.f32 f1033, f1026, f1031; +mul.f32 f1034, f1002, 0f3F4F1BBD; +sub.f32 f1035, f887, f1034; +fma.rn.f32 f1036, f1004, 0f3E9E377A, f1035; +mul.f32 f1037, f1027, 0f3F167918; +mul.f32 f1038, f1029, 0f3F737871; +sub.f32 f1039, f1038, f1037; +add.f32 f1040, f1039, f1036; +sub.f32 f1041, f1036, f1039; +add.f32 f1042, f868, f883; +add.f32 f1043, f863, f1042; +add.f32 f1044, f873, f878; +add.f32 f1045, f1044, f1043; +add.f32 f1046, f893, f908; +add.f32 
f1047, f888, f1046; +add.f32 f1048, f898, f903; +add.f32 f1049, f1048, f1047; +fma.rn.f32 f1050, f1042, 0f3E9E377A, f863; +mul.f32 f1051, f1044, 0f3F4F1BBD; +sub.f32 f1052, f1050, f1051; +sub.f32 f1053, f893, f908; +mul.f32 f1054, f1053, 0f3F737871; +sub.f32 f1055, f898, f903; +mul.f32 f1056, f1055, 0fBF167918; +sub.f32 f1057, f1056, f1054; +sub.f32 f1058, f1052, f1057; +add.f32 f1059, f1057, f1052; +mul.f32 f1060, f1042, 0f3F4F1BBD; +sub.f32 f1061, f863, f1060; +fma.rn.f32 f1062, f1044, 0f3E9E377A, f1061; +mul.f32 f1063, f1053, 0f3F167918; +mul.f32 f1064, f1055, 0f3F737871; +sub.f32 f1065, f1064, f1063; +sub.f32 f1066, f1062, f1065; +add.f32 f1067, f1065, f1062; +fma.rn.f32 f1068, f1046, 0f3E9E377A, f888; +mul.f32 f1069, f1048, 0f3F4F1BBD; +sub.f32 f1070, f1068, f1069; +sub.f32 f1071, f868, f883; +mul.f32 f1072, f1071, 0f3F737871; +sub.f32 f1073, f873, f878; +mul.f32 f1074, f1073, 0fBF167918; +sub.f32 f1075, f1074, f1072; +add.f32 f1076, f1075, f1070; +sub.f32 f1077, f1070, f1075; +mul.f32 f1078, f1046, 0f3F4F1BBD; +sub.f32 f1079, f888, f1078; +fma.rn.f32 f1080, f1048, 0f3E9E377A, f1079; +mul.f32 f1081, f1071, 0f3F167918; +mul.f32 f1082, f1073, 0f3F737871; +sub.f32 f1083, f1082, f1081; +add.f32 f1084, f1083, f1080; +sub.f32 f1085, f1080, f1083; +add.f32 f1086, f869, f884; +add.f32 f1087, f864, f1086; +add.f32 f1088, f874, f879; +add.f32 f1089, f1088, f1087; +add.f32 f1090, f894, f909; +add.f32 f1091, f889, f1090; +add.f32 f1092, f899, f904; +add.f32 f1093, f1092, f1091; +fma.rn.f32 f1094, f1086, 0f3E9E377A, f864; +mul.f32 f1095, f1088, 0f3F4F1BBD; +sub.f32 f1096, f1094, f1095; +sub.f32 f1097, f894, f909; +mul.f32 f1098, f1097, 0f3F737871; +sub.f32 f1099, f899, f904; +mul.f32 f1100, f1099, 0fBF167918; +sub.f32 f1101, f1100, f1098; +sub.f32 f1102, f1096, f1101; +add.f32 f1103, f1101, f1096; +mul.f32 f1104, f1086, 0f3F4F1BBD; +sub.f32 f1105, f864, f1104; +fma.rn.f32 f1106, f1088, 0f3E9E377A, f1105; +mul.f32 f1107, f1097, 0f3F167918; +mul.f32 f1108, f1099, 0f3F737871; 
+sub.f32 f1109, f1108, f1107; +sub.f32 f1110, f1106, f1109; +add.f32 f1111, f1109, f1106; +fma.rn.f32 f1112, f1090, 0f3E9E377A, f889; +mul.f32 f1113, f1092, 0f3F4F1BBD; +sub.f32 f1114, f1112, f1113; +sub.f32 f1115, f869, f884; +mul.f32 f1116, f1115, 0f3F737871; +sub.f32 f1117, f874, f879; +mul.f32 f1118, f1117, 0fBF167918; +sub.f32 f1119, f1118, f1116; +add.f32 f1120, f1119, f1114; +sub.f32 f1121, f1114, f1119; +mul.f32 f1122, f1090, 0f3F4F1BBD; +sub.f32 f1123, f889, f1122; +fma.rn.f32 f1124, f1092, 0f3E9E377A, f1123; +mul.f32 f1125, f1115, 0f3F167918; +mul.f32 f1126, f1117, 0f3F737871; +sub.f32 f1127, f1126, f1125; +add.f32 f1128, f1127, f1124; +sub.f32 f1129, f1124, f1127; +mul.f32 f1130, f970, 0f3F77F511; +mul.f32 f1131, f988, 0fBE7EA890; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f988, 0f3F77F511; +fma.rn.f32 f1134, f970, 0fBE7EA890, f1133; +mul.f32 f1135, f1014, 0f3F6055A2; +mul.f32 f1136, f1032, 0fBEF6A86B; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1032, 0f3F6055A2; +fma.rn.f32 f1139, f1014, 0fBEF6A86B, f1138; +mul.f32 f1140, f1058, 0f3F3A9DB0; +mul.f32 f1141, f1076, 0fBF2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f1076, 0f3F3A9DB0; +fma.rn.f32 f1144, f1058, 0fBF2F3E7B, f1143; +mul.f32 f1145, f1102, 0f3F092BF2; +mul.f32 f1146, f1120, 0fBF5825E0; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1120, 0f3F092BF2; +fma.rn.f32 f1149, f1102, 0fBF5825E0, f1148; +mul.f32 f1150, f978, 0f3F6055A2; +mul.f32 f1151, f996, 0fBEF6A86B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f996, 0f3F6055A2; +fma.rn.f32 f1154, f978, 0fBEF6A86B, f1153; +mul.f32 f1155, f1022, 0f3F092BF2; +mul.f32 f1156, f1040, 0fBF5825E0; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1040, 0f3F092BF2; +fma.rn.f32 f1159, f1022, 0fBF5825E0, f1158; +mul.f32 f1160, f1066, 0f3D809851; +mul.f32 f1161, f1084, 0fBF7F7EAE; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f1084, 0f3D809851; +fma.rn.f32 f1164, f1066, 0fBF7F7EAE, f1163; +mul.f32 f1165, f1110, 0fBED9FFBE; +mul.f32 f1166, f1128, 
0fBF67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1128, 0fBED9FFBE; +fma.rn.f32 f1169, f1110, 0fBF67A2BF, f1168; +mul.f32 f1170, f979, 0f3F3A9DB0; +mul.f32 f1171, f997, 0fBF2F3E7B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f997, 0f3F3A9DB0; +fma.rn.f32 f1174, f979, 0fBF2F3E7B, f1173; +mul.f32 f1175, f1023, 0f3D809851; +mul.f32 f1176, f1041, 0fBF7F7EAE; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1041, 0f3D809851; +fma.rn.f32 f1179, f1023, 0fBF7F7EAE, f1178; +mul.f32 f1180, f1067, 0fBF232E38; +mul.f32 f1181, f1085, 0fBF45405B; +sub.f32 f1182, f1180, f1181; +mul.f32 f1183, f1085, 0fBF232E38; +fma.rn.f32 f1184, f1067, 0fBF45405B, f1183; +mul.f32 f1185, f1111, 0fBF7DFB3B; +mul.f32 f1186, f1129, 0fBE00575B; +sub.f32 f1187, f1185, f1186; +mul.f32 f1188, f1129, 0fBF7DFB3B; +fma.rn.f32 f1189, f1111, 0fBE00575B, f1188; +mul.f32 f1190, f971, 0f3F092BF2; +mul.f32 f1191, f989, 0fBF5825E0; +sub.f32 f1192, f1190, f1191; +mul.f32 f1193, f989, 0f3F092BF2; +fma.rn.f32 f1194, f971, 0fBF5825E0, f1193; +mul.f32 f1195, f1015, 0fBED9FFBE; +mul.f32 f1196, f1033, 0fBF67A2BF; +sub.f32 f1197, f1195, f1196; +mul.f32 f1198, f1033, 0fBED9FFBE; +fma.rn.f32 f1199, f1015, 0fBF67A2BF, f1198; +mul.f32 f1200, f1059, 0fBF7DFB3B; +mul.f32 f1201, f1077, 0fBE00575B; +sub.f32 f1202, f1200, f1201; +mul.f32 f1203, f1077, 0fBF7DFB3B; +fma.rn.f32 f1204, f1059, 0fBE00575B, f1203; +mul.f32 f1205, f1103, 0fBF232E38; +mul.f32 f1206, f1121, 0f3F45405B; +sub.f32 f1207, f1205, f1206; +mul.f32 f1208, f1121, 0fBF232E38; +fma.rn.f32 f1209, f1103, 0f3F45405B, f1208; +add.f32 f1210, f957, f1089; +add.f32 f1211, f913, f1210; +add.f32 f1212, f1001, f1045; +add.f32 f1213, f961, f1093; +add.f32 f1214, f917, f1213; +add.f32 f1215, f1005, f1049; +fma.rn.f32 f1216, f1210, 0f3E9E377A, f913; +mul.f32 f1217, f1212, 0f3F4F1BBD; +sub.f32 f1218, f1216, f1217; +sub.f32 f1219, f961, f1093; +mul.f32 f1220, f1219, 0f3F737871; +sub.f32 f1221, f1005, f1049; +mul.f32 f1222, f1221, 0fBF167918; +sub.f32 f1223, f1222, f1220; 
+mul.f32 f1224, f1210, 0f3F4F1BBD; +sub.f32 f1225, f913, f1224; +fma.rn.f32 f1226, f1212, 0f3E9E377A, f1225; +mul.f32 f1227, f1219, 0f3F167918; +mul.f32 f1228, f1221, 0f3F737871; +sub.f32 f1229, f1228, f1227; +fma.rn.f32 f1230, f1213, 0f3E9E377A, f917; +mul.f32 f1231, f1215, 0f3F4F1BBD; +sub.f32 f1232, f1230, f1231; +sub.f32 f1233, f957, f1089; +mul.f32 f1234, f1233, 0f3F737871; +sub.f32 f1235, f1001, f1045; +mul.f32 f1236, f1235, 0fBF167918; +sub.f32 f1237, f1236, f1234; +mul.f32 f1238, f1213, 0f3F4F1BBD; +sub.f32 f1239, f917, f1238; +fma.rn.f32 f1240, f1215, 0f3E9E377A, f1239; +mul.f32 f1241, f1233, 0f3F167918; +mul.f32 f1242, f1235, 0f3F737871; +sub.f32 f1243, f1242, f1241; +add.f32 f1244, f1132, f1147; +add.f32 f1245, f926, f1244; +add.f32 f1246, f1137, f1142; +add.f32 f1247, f1134, f1149; +add.f32 f1248, f944, f1247; +add.f32 f1249, f1139, f1144; +fma.rn.f32 f1250, f1244, 0f3E9E377A, f926; +mul.f32 f1251, f1246, 0f3F4F1BBD; +sub.f32 f1252, f1250, f1251; +sub.f32 f1253, f1134, f1149; +mul.f32 f1254, f1253, 0f3F737871; +sub.f32 f1255, f1139, f1144; +mul.f32 f1256, f1255, 0fBF167918; +sub.f32 f1257, f1256, f1254; +mul.f32 f1258, f1244, 0f3F4F1BBD; +sub.f32 f1259, f926, f1258; +fma.rn.f32 f1260, f1246, 0f3E9E377A, f1259; +mul.f32 f1261, f1253, 0f3F167918; +mul.f32 f1262, f1255, 0f3F737871; +sub.f32 f1263, f1262, f1261; +fma.rn.f32 f1264, f1247, 0f3E9E377A, f944; +mul.f32 f1265, f1249, 0f3F4F1BBD; +sub.f32 f1266, f1264, f1265; +sub.f32 f1267, f1132, f1147; +mul.f32 f1268, f1267, 0f3F737871; +sub.f32 f1269, f1137, f1142; +mul.f32 f1270, f1269, 0fBF167918; +sub.f32 f1271, f1270, f1268; +mul.f32 f1272, f1247, 0f3F4F1BBD; +sub.f32 f1273, f944, f1272; +fma.rn.f32 f1274, f1249, 0f3E9E377A, f1273; +mul.f32 f1275, f1267, 0f3F167918; +mul.f32 f1276, f1269, 0f3F737871; +sub.f32 f1277, f1276, f1275; +add.f32 f1278, f1152, f1167; +add.f32 f1279, f934, f1278; +add.f32 f1280, f1157, f1162; +add.f32 f1281, f1154, f1169; +add.f32 f1282, f952, f1281; +add.f32 f1283, f1159, f1164; 
+fma.rn.f32 f1284, f1278, 0f3E9E377A, f934; +mul.f32 f1285, f1280, 0f3F4F1BBD; +sub.f32 f1286, f1284, f1285; +sub.f32 f1287, f1154, f1169; +mul.f32 f1288, f1287, 0f3F737871; +sub.f32 f1289, f1159, f1164; +mul.f32 f1290, f1289, 0fBF167918; +sub.f32 f1291, f1290, f1288; +mul.f32 f1292, f1278, 0f3F4F1BBD; +sub.f32 f1293, f934, f1292; +fma.rn.f32 f1294, f1280, 0f3E9E377A, f1293; +mul.f32 f1295, f1287, 0f3F167918; +mul.f32 f1296, f1289, 0f3F737871; +sub.f32 f1297, f1296, f1295; +fma.rn.f32 f1298, f1281, 0f3E9E377A, f952; +mul.f32 f1299, f1283, 0f3F4F1BBD; +sub.f32 f1300, f1298, f1299; +sub.f32 f1301, f1152, f1167; +mul.f32 f1302, f1301, 0f3F737871; +sub.f32 f1303, f1157, f1162; +mul.f32 f1304, f1303, 0fBF167918; +sub.f32 f1305, f1304, f1302; +mul.f32 f1306, f1281, 0f3F4F1BBD; +sub.f32 f1307, f952, f1306; +fma.rn.f32 f1308, f1283, 0f3E9E377A, f1307; +mul.f32 f1309, f1301, 0f3F167918; +mul.f32 f1310, f1303, 0f3F737871; +sub.f32 f1311, f1310, f1309; +add.f32 f1312, f1172, f1187; +add.f32 f1313, f935, f1312; +add.f32 f1314, f1177, f1182; +add.f32 f1315, f1174, f1189; +add.f32 f1316, f953, f1315; +add.f32 f1317, f1179, f1184; +fma.rn.f32 f1318, f1312, 0f3E9E377A, f935; +mul.f32 f1319, f1314, 0f3F4F1BBD; +sub.f32 f1320, f1318, f1319; +sub.f32 f1321, f1174, f1189; +mul.f32 f1322, f1321, 0f3F737871; +sub.f32 f1323, f1179, f1184; +mul.f32 f1324, f1323, 0fBF167918; +sub.f32 f1325, f1324, f1322; +mul.f32 f1326, f1312, 0f3F4F1BBD; +sub.f32 f1327, f935, f1326; +fma.rn.f32 f1328, f1314, 0f3E9E377A, f1327; +mul.f32 f1329, f1321, 0f3F167918; +mul.f32 f1330, f1323, 0f3F737871; +sub.f32 f1331, f1330, f1329; +fma.rn.f32 f1332, f1315, 0f3E9E377A, f953; +mul.f32 f1333, f1317, 0f3F4F1BBD; +sub.f32 f1334, f1332, f1333; +sub.f32 f1335, f1172, f1187; +mul.f32 f1336, f1335, 0f3F737871; +sub.f32 f1337, f1177, f1182; +mul.f32 f1338, f1337, 0fBF167918; +sub.f32 f1339, f1338, f1336; +mul.f32 f1340, f1315, 0f3F4F1BBD; +sub.f32 f1341, f953, f1340; +fma.rn.f32 f1342, f1317, 0f3E9E377A, f1341; +mul.f32 
f1343, f1335, 0f3F167918; +mul.f32 f1344, f1337, 0f3F737871; +sub.f32 f1345, f1344, f1343; +add.f32 f1346, f1192, f1207; +add.f32 f1347, f927, f1346; +add.f32 f1348, f1197, f1202; +add.f32 f1349, f1194, f1209; +add.f32 f1350, f945, f1349; +add.f32 f1351, f1199, f1204; +fma.rn.f32 f1352, f1346, 0f3E9E377A, f927; +mul.f32 f1353, f1348, 0f3F4F1BBD; +sub.f32 f1354, f1352, f1353; +sub.f32 f1355, f1194, f1209; +mul.f32 f1356, f1355, 0f3F737871; +sub.f32 f1357, f1199, f1204; +mul.f32 f1358, f1357, 0fBF167918; +sub.f32 f1359, f1358, f1356; +mul.f32 f1360, f1346, 0f3F4F1BBD; +sub.f32 f1361, f927, f1360; +fma.rn.f32 f1362, f1348, 0f3E9E377A, f1361; +mul.f32 f1363, f1355, 0f3F167918; +mul.f32 f1364, f1357, 0f3F737871; +sub.f32 f1365, f1364, f1363; +fma.rn.f32 f1366, f1349, 0f3E9E377A, f945; +mul.f32 f1367, f1351, 0f3F4F1BBD; +sub.f32 f1368, f1366, f1367; +sub.f32 f1369, f1192, f1207; +mul.f32 f1370, f1369, 0f3F737871; +sub.f32 f1371, f1197, f1202; +mul.f32 f1372, f1371, 0fBF167918; +sub.f32 f1373, f1372, f1370; +mul.f32 f1374, f1349, 0f3F4F1BBD; +sub.f32 f1375, f945, f1374; +fma.rn.f32 f1376, f1351, 0f3E9E377A, f1375; +mul.f32 f1377, f1369, 0f3F167918; +mul.f32 f1378, f1371, 0f3F737871; +sub.f32 f1379, f1378, f1377; +add.f32 %0, f1212, f1211; +add.f32 %1, f1215, f1214; +add.f32 %3, f1249, f1248; +add.f32 %2, f1246, f1245; +add.f32 %5, f1283, f1282; +add.f32 %4, f1280, f1279; +add.f32 %7, f1317, f1316; +add.f32 %6, f1314, f1313; +add.f32 %9, f1351, f1350; +add.f32 %8, f1348, f1347; +add.f32 %11, f1237, f1232; +sub.f32 %10, f1218, f1223; +add.f32 %13, f1271, f1266; +sub.f32 %12, f1252, f1257; +add.f32 %15, f1305, f1300; +sub.f32 %14, f1286, f1291; +add.f32 %17, f1339, f1334; +sub.f32 %16, f1320, f1325; +add.f32 %19, f1373, f1368; +sub.f32 %18, f1354, f1359; +sub.f32 %20, f1226, f1229; +add.f32 %21, f1243, f1240; +add.f32 %23, f1277, f1274; +sub.f32 %22, f1260, f1263; +add.f32 %25, f1311, f1308; +sub.f32 %24, f1294, f1297; +add.f32 %27, f1345, f1342; +sub.f32 %26, f1328, f1331; 
+add.f32 %29, f1379, f1376; +sub.f32 %28, f1362, f1365; +add.f32 %30, f1229, f1226; +sub.f32 %31, f1240, f1243; +sub.f32 %33, f1274, f1277; +add.f32 %32, f1263, f1260; +sub.f32 %35, f1308, f1311; +add.f32 %34, f1297, f1294; +sub.f32 %37, f1342, f1345; +add.f32 %36, f1331, f1328; +sub.f32 %39, f1376, f1379; +add.f32 %38, f1365, f1362; +sub.f32 %41, f1232, f1237; +add.f32 %40, f1223, f1218; +sub.f32 %43, f1266, f1271; +add.f32 %42, f1257, f1252; +sub.f32 %45, f1300, f1305; +add.f32 %44, f1291, f1286; +sub.f32 %47, f1334, f1339; +add.f32 %46, f1325, f1320; +sub.f32 %49, f1368, f1373; +add.f32 %48, f1359, f1354; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), 
"f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<164, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<374>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 5000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %16, %24; +add.f32 f22, %14, f21; +add.f32 f23, %19, %22; +add.f32 f24, %18, %25; +add.f32 f25, %15, f24; +add.f32 f26, %21, %23; +fma.rn.f32 f27, f21, 0f3E9E377A, %14; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %18, %25; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %21, %23; +mul.f32 f33, f32, 0fBF167918; +sub.f32 f34, f33, f31; +sub.f32 f35, f29, f34; +add.f32 f36, f34, f29; +mul.f32 f37, f21, 0f3F4F1BBD; +sub.f32 f38, %14, f37; +fma.rn.f32 f39, f23, 0f3E9E377A, f38; +mul.f32 f40, f30, 0f3F167918; +mul.f32 f41, f32, 0f3F737871; +sub.f32 f42, f41, f40; +sub.f32 f43, f39, f42; +add.f32 f44, f42, f39; +fma.rn.f32 f45, f24, 0f3E9E377A, %15; +mul.f32 f46, f26, 0f3F4F1BBD; +sub.f32 f47, f45, f46; +sub.f32 f48, %16, %24; +mul.f32 f49, f48, 0f3F737871; +sub.f32 f50, %19, %22; +mul.f32 f51, f50, 0fBF167918; +sub.f32 f52, f51, f49; +add.f32 f53, f52, f47; +sub.f32 f54, f47, f52; +mul.f32 f55, f24, 0f3F4F1BBD; +sub.f32 f56, %15, f55; +fma.rn.f32 f57, f26, 0f3E9E377A, f56; +mul.f32 f58, f48, 
0f3F167918; +mul.f32 f59, f50, 0f3F737871; +sub.f32 f60, f59, f58; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f63, f35; +mul.f32 f68, f64, f53; +mul.f32 f69, f63, f53; +mul.f32 f70, f63, f63; +mul.f32 f71, f64, f64; +sub.f32 f72, f70, f71; +mul.f32 f73, f64, f63; +fma.rn.f32 f74, f64, f63, f73; +mul.f32 f75, f72, f43; +mul.f32 f76, f74, f61; +mul.f32 f77, f72, f61; +mul.f32 f78, f63, f72; +mul.f32 f79, f64, f74; +sub.f32 f80, f78, f79; +mul.f32 f81, f63, f74; +fma.rn.f32 f82, f64, f72, f81; +mul.f32 f83, f80, f44; +mul.f32 f84, f82, f62; +mul.f32 f85, f80, f62; +mul.f32 f86, f63, f80; +mul.f32 f87, f64, f82; +sub.f32 f88, f86, f87; +mul.f32 f89, f63, f82; +fma.rn.f32 f90, f64, f80, f89; +mul.f32 f91, f88, f36; +mul.f32 f92, f90, f54; +mul.f32 f93, f88, f54; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f94, f26, f25; +add.f32 f95, f23, f22; +st.shared.v2.f32 [r9], {f95, f94}; +fma.rn.f32 f96, f64, f35, f69; +sub.f32 f97, f67, f68; +st.shared.v2.f32 [r9+8], {f97, f96}; +fma.rn.f32 f98, f74, f43, f77; +sub.f32 f99, f75, f76; +st.shared.v2.f32 [r9+16], {f99, f98}; +sub.f32 f100, f83, f84; +fma.rn.f32 f101, f82, f44, f85; +st.shared.v2.f32 [r9+24], {f100, f101}; +fma.rn.f32 f102, f90, f36, f93; +sub.f32 f103, f91, f92; +st.shared.v2.f32 [r9+32], {f103, f102}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f104, f105}, [r11]; +ld.shared.v2.f32 {f108, f109}, [r11+1000]; +ld.shared.v2.f32 {f112, f113}, [r11+2000]; +ld.shared.v2.f32 {f116, f117}, [r11+3000]; +ld.shared.v2.f32 {f120, f121}, [r11+4000]; +add.f32 f124, f108, f120; +add.f32 f125, f104, f124; +add.f32 f126, f112, f116; +add.f32 f127, f109, f121; +add.f32 f128, f105, f127; +add.f32 f129, f113, 
f117; +fma.rn.f32 f130, f124, 0f3E9E377A, f104; +mul.f32 f131, f126, 0f3F4F1BBD; +sub.f32 f132, f130, f131; +sub.f32 f133, f109, f121; +mul.f32 f134, f133, 0f3F737871; +sub.f32 f135, f113, f117; +mul.f32 f136, f135, 0fBF167918; +sub.f32 f137, f136, f134; +sub.f32 f138, f132, f137; +add.f32 f139, f137, f132; +mul.f32 f140, f124, 0f3F4F1BBD; +sub.f32 f141, f104, f140; +fma.rn.f32 f142, f126, 0f3E9E377A, f141; +mul.f32 f143, f133, 0f3F167918; +mul.f32 f144, f135, 0f3F737871; +sub.f32 f145, f144, f143; +sub.f32 f146, f142, f145; +add.f32 f147, f145, f142; +fma.rn.f32 f148, f127, 0f3E9E377A, f105; +mul.f32 f149, f129, 0f3F4F1BBD; +sub.f32 f150, f148, f149; +sub.f32 f151, f108, f120; +mul.f32 f152, f151, 0f3F737871; +sub.f32 f153, f112, f116; +mul.f32 f154, f153, 0fBF167918; +sub.f32 f155, f154, f152; +add.f32 f156, f155, f150; +sub.f32 f157, f150, f155; +mul.f32 f158, f127, 0f3F4F1BBD; +sub.f32 f159, f105, f158; +fma.rn.f32 f160, f129, 0f3E9E377A, f159; +mul.f32 f161, f151, 0f3F167918; +mul.f32 f162, f153, 0f3F737871; +sub.f32 f163, f162, f161; +add.f32 f164, f163, f160; +sub.f32 f165, f160, f163; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f166, f167}, [rd11]; +mul.f32 f170, f166, f138; +mul.f32 f171, f167, f156; +mul.f32 f172, f166, f156; +mul.f32 f173, f166, f166; +mul.f32 f174, f167, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f167, f166; +fma.rn.f32 f177, f167, f166, f176; +mul.f32 f178, f175, f146; +mul.f32 f179, f177, f164; +mul.f32 f180, f175, f164; +mul.f32 f181, f166, f175; +mul.f32 f182, f167, f177; +sub.f32 f183, f181, f182; +mul.f32 f184, f166, f177; +fma.rn.f32 f185, f167, f175, f184; +mul.f32 f186, f183, f147; +mul.f32 f187, f185, f165; +mul.f32 f188, f183, f165; +mul.f32 f189, f166, f183; +mul.f32 f190, f167, f185; +sub.f32 f191, f189, f190; +mul.f32 f192, f166, f185; 
+fma.rn.f32 f193, f167, f183, f192; +mul.f32 f194, f191, f139; +mul.f32 f195, f193, f157; +mul.f32 f196, f191, f157; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 f197, f129, f128; +add.f32 f198, f126, f125; +st.shared.v2.f32 [r17], {f198, f197}; +fma.rn.f32 f199, f167, f138, f172; +sub.f32 f200, f170, f171; +st.shared.v2.f32 [r17+40], {f200, f199}; +fma.rn.f32 f201, f177, f146, f180; +sub.f32 f202, f178, f179; +st.shared.v2.f32 [r17+80], {f202, f201}; +fma.rn.f32 f203, f185, f147, f188; +sub.f32 f204, f186, f187; +st.shared.v2.f32 [r17+120], {f204, f203}; +fma.rn.f32 f205, f193, f139, f196; +sub.f32 f206, f194, f195; +st.shared.v2.f32 [r17+160], {f206, f205}; +barrier.sync 0; +ld.shared.v2.f32 {f207, f208}, [r11]; +ld.shared.v2.f32 {f211, f212}, [r11+1000]; +ld.shared.v2.f32 {f215, f216}, [r11+2000]; +ld.shared.v2.f32 {f219, f220}, [r11+3000]; +ld.shared.v2.f32 {f223, f224}, [r11+4000]; +add.f32 f227, f211, f223; +add.f32 f228, f207, f227; +add.f32 f229, f215, f219; +add.f32 f230, f212, f224; +add.f32 f231, f208, f230; +add.f32 f232, f216, f220; +fma.rn.f32 f233, f227, 0f3E9E377A, f207; +mul.f32 f234, f229, 0f3F4F1BBD; +sub.f32 f235, f233, f234; +sub.f32 f236, f212, f224; +mul.f32 f237, f236, 0f3F737871; +sub.f32 f238, f216, f220; +mul.f32 f239, f238, 0fBF167918; +sub.f32 f240, f239, f237; +sub.f32 f241, f235, f240; +add.f32 f242, f240, f235; +mul.f32 f243, f227, 0f3F4F1BBD; +sub.f32 f244, f207, f243; +fma.rn.f32 f245, f229, 0f3E9E377A, f244; +mul.f32 f246, f236, 0f3F167918; +mul.f32 f247, f238, 0f3F737871; +sub.f32 f248, f247, f246; +sub.f32 f249, f245, f248; +add.f32 f250, f248, f245; +fma.rn.f32 f251, f230, 0f3E9E377A, f208; +mul.f32 f252, f232, 0f3F4F1BBD; +sub.f32 f253, f251, f252; +sub.f32 f254, f211, f223; +mul.f32 f255, f254, 0f3F737871; +sub.f32 f256, f215, f219; +mul.f32 f257, f256, 0fBF167918; +sub.f32 f258, f257, f255; +add.f32 f259, f258, f253; +sub.f32 f260, f253, f258; +mul.f32 f261, f230, 
0f3F4F1BBD; +sub.f32 f262, f208, f261; +fma.rn.f32 f263, f232, 0f3E9E377A, f262; +mul.f32 f264, f254, 0f3F167918; +mul.f32 f265, f256, 0f3F737871; +sub.f32 f266, f265, f264; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f269, f270}, [rd16]; +mul.f32 f273, f269, f241; +mul.f32 f274, f270, f259; +mul.f32 f275, f269, f259; +mul.f32 f276, f269, f269; +mul.f32 f277, f270, f270; +sub.f32 f278, f276, f277; +mul.f32 f279, f270, f269; +fma.rn.f32 f280, f270, f269, f279; +mul.f32 f281, f278, f249; +mul.f32 f282, f280, f267; +mul.f32 f283, f278, f267; +mul.f32 f284, f269, f278; +mul.f32 f285, f270, f280; +sub.f32 f286, f284, f285; +mul.f32 f287, f269, f280; +fma.rn.f32 f288, f270, f278, f287; +mul.f32 f289, f286, f250; +mul.f32 f290, f288, f268; +mul.f32 f291, f286, f268; +mul.f32 f292, f269, f286; +mul.f32 f293, f270, f288; +sub.f32 f294, f292, f293; +mul.f32 f295, f269, f288; +fma.rn.f32 f296, f270, f286, f295; +mul.f32 f297, f294, f242; +mul.f32 f298, f296, f260; +mul.f32 f299, f294, f260; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +add.f32 f300, f232, f231; +add.f32 f301, f229, f228; +st.shared.v2.f32 [r23], {f301, f300}; +fma.rn.f32 f302, f270, f241, f275; +sub.f32 f303, f273, f274; +st.shared.v2.f32 [r23+200], {f303, f302}; +fma.rn.f32 f304, f280, f249, f283; +sub.f32 f305, f281, f282; +st.shared.v2.f32 [r23+400], {f305, f304}; +fma.rn.f32 f306, f288, f250, f291; +sub.f32 f307, f289, f290; +st.shared.v2.f32 [r23+600], {f307, f306}; +fma.rn.f32 f308, f296, f242, f299; +sub.f32 f309, f297, f298; +st.shared.v2.f32 [r23+800], {f309, f308}; +barrier.sync 0; +ld.shared.v2.f32 {f310, f311}, [r11]; +ld.shared.v2.f32 {f314, f315}, [r11+1000]; +ld.shared.v2.f32 {f318, f319}, [r11+2000]; 
+ld.shared.v2.f32 {f322, f323}, [r11+3000]; +ld.shared.v2.f32 {f326, f327}, [r11+4000]; +add.f32 f330, f314, f326; +add.f32 f331, f310, f330; +add.f32 f332, f318, f322; +add.f32 f333, f315, f327; +add.f32 f334, f311, f333; +add.f32 f335, f319, f323; +fma.rn.f32 f336, f330, 0f3E9E377A, f310; +mul.f32 f337, f332, 0f3F4F1BBD; +sub.f32 f338, f336, f337; +sub.f32 f339, f315, f327; +mul.f32 f340, f339, 0f3F737871; +sub.f32 f341, f319, f323; +mul.f32 f342, f341, 0fBF167918; +sub.f32 f343, f342, f340; +mul.f32 f344, f330, 0f3F4F1BBD; +sub.f32 f345, f310, f344; +fma.rn.f32 f346, f332, 0f3E9E377A, f345; +mul.f32 f347, f339, 0f3F167918; +mul.f32 f348, f341, 0f3F737871; +sub.f32 f349, f348, f347; +fma.rn.f32 f350, f333, 0f3E9E377A, f311; +mul.f32 f351, f335, 0f3F4F1BBD; +sub.f32 f352, f350, f351; +sub.f32 f353, f314, f326; +mul.f32 f354, f353, 0f3F737871; +sub.f32 f355, f318, f322; +mul.f32 f356, f355, 0fBF167918; +sub.f32 f357, f356, f354; +mul.f32 f358, f333, 0f3F4F1BBD; +sub.f32 f359, f311, f358; +fma.rn.f32 f360, f335, 0f3E9E377A, f359; +mul.f32 f361, f353, 0f3F167918; +mul.f32 f362, f355, 0f3F737871; +sub.f32 f363, f362, f361; +add.f32 %1, f335, f334; +add.f32 %0, f332, f331; +add.f32 %3, f357, f352; +sub.f32 %2, f338, f343; +add.f32 %5, f363, f360; +sub.f32 %4, f346, f349; +sub.f32 %7, f360, f363; +add.f32 %6, f349, f346; +sub.f32 %9, f352, f357; +add.f32 %8, f343, f338; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<165, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg 
.f32 f<344>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 2500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %16, %24; +add.f32 f22, %14, f21; +add.f32 f23, %19, %22; +add.f32 f24, f23, f22; +add.f32 f25, %18, %25; +add.f32 f26, %15, f25; +add.f32 f27, %21, %23; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %14; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %18, %25; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %21, %23; +mul.f32 f35, f34, 0fBF167918; +sub.f32 f36, f35, f33; +sub.f32 f37, f31, f36; +add.f32 f38, f36, f31; +mul.f32 f39, f21, 0f3F4F1BBD; +sub.f32 f40, %14, f39; +fma.rn.f32 f41, f23, 0f3E9E377A, f40; +mul.f32 f42, f32, 0f3F167918; +mul.f32 f43, f34, 0f3F737871; +sub.f32 f44, f43, f42; +sub.f32 f45, f41, f44; +add.f32 f46, f44, f41; +fma.rn.f32 f47, f25, 0f3E9E377A, %15; +mul.f32 f48, f27, 0f3F4F1BBD; +sub.f32 f49, f47, f48; +sub.f32 f50, %16, %24; +mul.f32 f51, f50, 0f3F737871; +sub.f32 f52, %19, %22; +mul.f32 f53, f52, 0fBF167918; +sub.f32 f54, f53, f51; +add.f32 f55, f54, f49; +sub.f32 f56, f49, f54; +mul.f32 f57, f25, 0f3F4F1BBD; +sub.f32 f58, %15, f57; +fma.rn.f32 f59, f27, 0f3E9E377A, f58; +mul.f32 f60, f50, 0f3F167918; +mul.f32 f61, f52, 0f3F737871; +sub.f32 f62, f61, f60; +add.f32 f63, f62, f59; +sub.f32 f64, f59, f62; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f65, f66}, [rd6]; +mul.f32 f69, f65, f37; +mul.f32 f70, f66, f55; +sub.f32 f71, f69, f70; +mul.f32 f72, f65, f55; +fma.rn.f32 f73, f66, f37, f72; +mul.f32 f74, f65, f65; +mul.f32 f75, f66, f66; +sub.f32 f76, f74, f75; +mul.f32 f77, f66, f65; +fma.rn.f32 f78, f66, f65, f77; +mul.f32 f79, f76, f45; +mul.f32 f80, f78, f63; +sub.f32 f81, f79, f80; +mul.f32 f82, f76, f63; +fma.rn.f32 f83, f78, f45, f82; +mul.f32 f84, f65, f76; +mul.f32 f85, f66, 
f78; +sub.f32 f86, f84, f85; +mul.f32 f87, f65, f78; +fma.rn.f32 f88, f66, f76, f87; +mul.f32 f89, f86, f46; +mul.f32 f90, f88, f64; +sub.f32 f91, f89, f90; +mul.f32 f92, f86, f64; +fma.rn.f32 f93, f88, f46, f92; +mul.f32 f94, f65, f86; +mul.f32 f95, f66, f88; +sub.f32 f96, f94, f95; +mul.f32 f97, f65, f88; +fma.rn.f32 f98, f66, f86, f97; +mul.f32 f99, f96, f38; +mul.f32 f100, f98, f56; +sub.f32 f101, f99, f100; +mul.f32 f102, f96, f56; +fma.rn.f32 f103, f98, f38, f102; +mad.lo.s32 r8, r5, 2500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f104, [r11]; +ld.shared.f32 f105, [r11+500]; +ld.shared.f32 f106, [r11+1000]; +ld.shared.f32 f107, [r11+1500]; +ld.shared.f32 f108, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f73; +st.shared.f32 [r9+8], f83; +st.shared.f32 [r9+12], f93; +st.shared.f32 [r9+16], f103; +barrier.sync 0; +ld.shared.f32 f109, [r11]; +ld.shared.f32 f110, [r11+500]; +ld.shared.f32 f111, [r11+1000]; +ld.shared.f32 f112, [r11+1500]; +ld.shared.f32 f113, [r11+2000]; +add.f32 f114, f105, f108; +add.f32 f115, f104, f114; +add.f32 f116, f106, f107; +add.f32 f117, f116, f115; +add.f32 f118, f110, f113; +add.f32 f119, f109, f118; +add.f32 f120, f111, f112; +add.f32 f121, f120, f119; +fma.rn.f32 f122, f114, 0f3E9E377A, f104; +mul.f32 f123, f116, 0f3F4F1BBD; +sub.f32 f124, f122, f123; +sub.f32 f125, f110, f113; +mul.f32 f126, f125, 0f3F737871; +sub.f32 f127, f111, f112; +mul.f32 f128, f127, 0fBF167918; +sub.f32 f129, f128, f126; +sub.f32 f130, f124, f129; +add.f32 f131, f129, f124; +mul.f32 f132, f114, 0f3F4F1BBD; +sub.f32 f133, f104, f132; +fma.rn.f32 f134, f116, 0f3E9E377A, f133; +mul.f32 f135, f125, 0f3F167918; +mul.f32 f136, f127, 0f3F737871; +sub.f32 f137, f136, f135; +sub.f32 f138, f134, f137; +add.f32 
f139, f137, f134; +fma.rn.f32 f140, f118, 0f3E9E377A, f109; +mul.f32 f141, f120, 0f3F4F1BBD; +sub.f32 f142, f140, f141; +sub.f32 f143, f105, f108; +mul.f32 f144, f143, 0f3F737871; +sub.f32 f145, f106, f107; +mul.f32 f146, f145, 0fBF167918; +sub.f32 f147, f146, f144; +add.f32 f148, f147, f142; +sub.f32 f149, f142, f147; +mul.f32 f150, f118, 0f3F4F1BBD; +sub.f32 f151, f109, f150; +fma.rn.f32 f152, f120, 0f3E9E377A, f151; +mul.f32 f153, f143, 0f3F167918; +mul.f32 f154, f145, 0f3F737871; +sub.f32 f155, f154, f153; +add.f32 f156, f155, f152; +sub.f32 f157, f152, f155; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f158, f159}, [rd11]; +mul.f32 f162, f158, f130; +mul.f32 f163, f159, f148; +sub.f32 f164, f162, f163; +mul.f32 f165, f158, f148; +fma.rn.f32 f166, f159, f130, f165; +mul.f32 f167, f158, f158; +mul.f32 f168, f159, f159; +sub.f32 f169, f167, f168; +mul.f32 f170, f159, f158; +fma.rn.f32 f171, f159, f158, f170; +mul.f32 f172, f169, f138; +mul.f32 f173, f171, f156; +sub.f32 f174, f172, f173; +mul.f32 f175, f169, f156; +fma.rn.f32 f176, f171, f138, f175; +mul.f32 f177, f158, f169; +mul.f32 f178, f159, f171; +sub.f32 f179, f177, f178; +mul.f32 f180, f158, f171; +fma.rn.f32 f181, f159, f169, f180; +mul.f32 f182, f179, f139; +mul.f32 f183, f181, f157; +sub.f32 f184, f182, f183; +mul.f32 f185, f179, f157; +fma.rn.f32 f186, f181, f139, f185; +mul.f32 f187, f158, f179; +mul.f32 f188, f159, f181; +sub.f32 f189, f187, f188; +mul.f32 f190, f158, f181; +fma.rn.f32 f191, f159, f179, f190; +mul.f32 f192, f189, f131; +mul.f32 f193, f191, f149; +sub.f32 f194, f192, f193; +mul.f32 f195, f189, f149; +fma.rn.f32 f196, f191, f131, f195; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f117; +st.shared.f32 [r17+20], f164; +st.shared.f32 [r17+40], 
f174; +st.shared.f32 [r17+60], f184; +st.shared.f32 [r17+80], f194; +barrier.sync 0; +ld.shared.f32 f197, [r11]; +ld.shared.f32 f198, [r11+500]; +ld.shared.f32 f199, [r11+1000]; +ld.shared.f32 f200, [r11+1500]; +ld.shared.f32 f201, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r17], f121; +st.shared.f32 [r17+20], f166; +st.shared.f32 [r17+40], f176; +st.shared.f32 [r17+60], f186; +st.shared.f32 [r17+80], f196; +barrier.sync 0; +ld.shared.f32 f202, [r11]; +ld.shared.f32 f203, [r11+500]; +ld.shared.f32 f204, [r11+1000]; +ld.shared.f32 f205, [r11+1500]; +ld.shared.f32 f206, [r11+2000]; +add.f32 f207, f198, f201; +add.f32 f208, f197, f207; +add.f32 f209, f199, f200; +add.f32 f210, f209, f208; +add.f32 f211, f203, f206; +add.f32 f212, f202, f211; +add.f32 f213, f204, f205; +add.f32 f214, f213, f212; +fma.rn.f32 f215, f207, 0f3E9E377A, f197; +mul.f32 f216, f209, 0f3F4F1BBD; +sub.f32 f217, f215, f216; +sub.f32 f218, f203, f206; +mul.f32 f219, f218, 0f3F737871; +sub.f32 f220, f204, f205; +mul.f32 f221, f220, 0fBF167918; +sub.f32 f222, f221, f219; +sub.f32 f223, f217, f222; +add.f32 f224, f222, f217; +mul.f32 f225, f207, 0f3F4F1BBD; +sub.f32 f226, f197, f225; +fma.rn.f32 f227, f209, 0f3E9E377A, f226; +mul.f32 f228, f218, 0f3F167918; +mul.f32 f229, f220, 0f3F737871; +sub.f32 f230, f229, f228; +sub.f32 f231, f227, f230; +add.f32 f232, f230, f227; +fma.rn.f32 f233, f211, 0f3E9E377A, f202; +mul.f32 f234, f213, 0f3F4F1BBD; +sub.f32 f235, f233, f234; +sub.f32 f236, f198, f201; +mul.f32 f237, f236, 0f3F737871; +sub.f32 f238, f199, f200; +mul.f32 f239, f238, 0fBF167918; +sub.f32 f240, f239, f237; +add.f32 f241, f240, f235; +sub.f32 f242, f235, f240; +mul.f32 f243, f211, 0f3F4F1BBD; +sub.f32 f244, f202, f243; +fma.rn.f32 f245, f213, 0f3E9E377A, f244; +mul.f32 f246, f236, 0f3F167918; +mul.f32 f247, f238, 0f3F737871; +sub.f32 f248, f247, f246; +add.f32 f249, f248, f245; +sub.f32 f250, f245, f248; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; 
+mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f251, f252}, [rd16]; +mul.f32 f255, f251, f223; +mul.f32 f256, f252, f241; +sub.f32 f257, f255, f256; +mul.f32 f258, f251, f241; +fma.rn.f32 f259, f252, f223, f258; +mul.f32 f260, f251, f251; +mul.f32 f261, f252, f252; +sub.f32 f262, f260, f261; +mul.f32 f263, f252, f251; +fma.rn.f32 f264, f252, f251, f263; +mul.f32 f265, f262, f231; +mul.f32 f266, f264, f249; +sub.f32 f267, f265, f266; +mul.f32 f268, f262, f249; +fma.rn.f32 f269, f264, f231, f268; +mul.f32 f270, f251, f262; +mul.f32 f271, f252, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f251, f264; +fma.rn.f32 f274, f252, f262, f273; +mul.f32 f275, f272, f232; +mul.f32 f276, f274, f250; +sub.f32 f277, f275, f276; +mul.f32 f278, f272, f250; +fma.rn.f32 f279, f274, f232, f278; +mul.f32 f280, f251, f272; +mul.f32 f281, f252, f274; +sub.f32 f282, f280, f281; +mul.f32 f283, f251, f274; +fma.rn.f32 f284, f252, f272, f283; +mul.f32 f285, f282, f224; +mul.f32 f286, f284, f242; +sub.f32 f287, f285, f286; +mul.f32 f288, f282, f242; +fma.rn.f32 f289, f284, f224, f288; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 500, r22; +st.shared.f32 [r23], f210; +st.shared.f32 [r23+100], f257; +st.shared.f32 [r23+200], f267; +st.shared.f32 [r23+300], f277; +st.shared.f32 [r23+400], f287; +barrier.sync 0; +ld.shared.f32 f290, [r11]; +ld.shared.f32 f291, [r11+500]; +ld.shared.f32 f292, [r11+1000]; +ld.shared.f32 f293, [r11+1500]; +ld.shared.f32 f294, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r23], f214; +st.shared.f32 [r23+100], f259; +st.shared.f32 [r23+200], f269; +st.shared.f32 [r23+300], f279; +st.shared.f32 [r23+400], f289; +barrier.sync 0; +ld.shared.f32 f295, [r11]; +ld.shared.f32 f296, [r11+500]; +ld.shared.f32 f297, [r11+1000]; +ld.shared.f32 f298, [r11+1500]; +ld.shared.f32 f299, [r11+2000]; +add.f32 f300, f291, f294; +add.f32 f301, f290, 
f300; +add.f32 f302, f292, f293; +add.f32 f303, f296, f299; +add.f32 f304, f295, f303; +add.f32 f305, f297, f298; +fma.rn.f32 f306, f300, 0f3E9E377A, f290; +mul.f32 f307, f302, 0f3F4F1BBD; +sub.f32 f308, f306, f307; +sub.f32 f309, f296, f299; +mul.f32 f310, f309, 0f3F737871; +sub.f32 f311, f297, f298; +mul.f32 f312, f311, 0fBF167918; +sub.f32 f313, f312, f310; +mul.f32 f314, f300, 0f3F4F1BBD; +sub.f32 f315, f290, f314; +fma.rn.f32 f316, f302, 0f3E9E377A, f315; +mul.f32 f317, f309, 0f3F167918; +mul.f32 f318, f311, 0f3F737871; +sub.f32 f319, f318, f317; +fma.rn.f32 f320, f303, 0f3E9E377A, f295; +mul.f32 f321, f305, 0f3F4F1BBD; +sub.f32 f322, f320, f321; +sub.f32 f323, f291, f294; +mul.f32 f324, f323, 0f3F737871; +sub.f32 f325, f292, f293; +mul.f32 f326, f325, 0fBF167918; +sub.f32 f327, f326, f324; +mul.f32 f328, f303, 0f3F4F1BBD; +sub.f32 f329, f295, f328; +fma.rn.f32 f330, f305, 0f3E9E377A, f329; +mul.f32 f331, f323, 0f3F167918; +mul.f32 f332, f325, 0f3F737871; +sub.f32 f333, f332, f331; +add.f32 %0, f302, f301; +add.f32 %1, f305, f304; +add.f32 %3, f327, f322; +sub.f32 %2, f308, f313; +sub.f32 %4, f316, f319; +add.f32 %5, f333, f330; +add.f32 %6, f319, f316; +sub.f32 %7, f330, f333; +sub.f32 %9, f322, f327; +add.f32 %8, f313, f308; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_inv.hpp.inc new file mode 100644 
index 0000000000000..5766e705c0bf8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp32_inv.hpp.inc @@ -0,0 +1,3450 @@ +#ifndef CUFFTDX_FFT_625_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_625_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<365, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1717>; +.reg .b32 r<14>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 5000, r13; +add.f32 f101, %62, %92; +add.f32 f103, %72, %82; +add.f32 f1716, %52, f101; +add.f32 f104, f103, f1716; +add.f32 f105, %102, %104; +add.f32 f107, %103, %83; +add.f32 f1712, %53, f105; +add.f32 f108, f107, f1712; +mul.f32 f110, f103, 0f3F4F1BBD; +fma.rn.f32 f1711, f101, 0f3E9E377A, %52; +sub.f32 f111, f1711, f110; +sub.f32 f112, %102, %104; +sub.f32 f114, %103, %83; +mul.f32 f1710, f112, 0f3F737871; +fma.rn.f32 f115, f114, 0f3F167918, f1710; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %52, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +mul.f32 f127, f107, 0f3F4F1BBD; +fma.rn.f32 f1709, f105, 0f3E9E377A, %53; +sub.f32 f128, f1709, f127; +sub.f32 f129, %62, %92; +sub.f32 f131, %72, %82; +mul.f32 f1708, f129, 0f3F737871; +fma.rn.f32 f132, f131, 0f3F167918, f1708; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %53, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %64, %94; +add.f32 f145, %74, %84; +add.f32 f1707, %54, f143; +add.f32 f146, f145, f1707; +add.f32 f147, %65, %95; +add.f32 f149, 
%107, %105; +add.f32 f1703, %106, f147; +add.f32 f150, f149, f1703; +fma.rn.f32 f1701, f143, 0f3E9E377A, %54; +mul.f32 f1702, f145, 0f3F4F1BBD; +sub.f32 f153, f1701, f1702; +sub.f32 f154, %65, %95; +sub.f32 f156, %107, %105; +mul.f32 f1700, f154, 0f3F737871; +fma.rn.f32 f157, f156, 0f3F167918, f1700; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %54, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +mul.f32 f169, f149, 0f3F4F1BBD; +fma.rn.f32 f1699, f147, 0f3E9E377A, %106; +sub.f32 f170, f1699, f169; +sub.f32 f171, %64, %94; +sub.f32 f173, %74, %84; +mul.f32 f1698, f171, 0f3F737871; +fma.rn.f32 f174, f173, 0f3F167918, f1698; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %106, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %66, %96; +add.f32 f187, %76, %86; +add.f32 f1697, %56, f185; +add.f32 f188, f187, f1697; +add.f32 f189, %110, %109; +add.f32 f191, %77, %111; +add.f32 f1692, %108, f189; +add.f32 f192, f191, f1692; +fma.rn.f32 f1690, f185, 0f3E9E377A, %56; +mul.f32 f1691, f187, 0f3F4F1BBD; +sub.f32 f195, f1690, f1691; +sub.f32 f196, %110, %109; +sub.f32 f198, %77, %111; +mul.f32 f1689, f196, 0f3F737871; +fma.rn.f32 f199, f198, 0f3F167918, f1689; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %56, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f1687, f189, 0f3E9E377A, %108; +mul.f32 f1688, f191, 0f3F4F1BBD; +sub.f32 f212, f1687, f1688; 
+sub.f32 f213, %66, %96; +sub.f32 f215, %76, %86; +mul.f32 f1686, f213, 0f3F737871; +fma.rn.f32 f216, f215, 0f3F167918, f1686; +add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %108, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %68, %98; +add.f32 f229, %78, %88; +add.f32 f1685, %58, f227; +add.f32 f230, f229, f1685; +add.f32 f231, %113, %112; +add.f32 f233, %114, %89; +add.f32 f1681, %59, f231; +add.f32 f234, f233, f1681; +mul.f32 f236, f229, 0f3F4F1BBD; +fma.rn.f32 f1680, f227, 0f3E9E377A, %58; +sub.f32 f237, f1680, f236; +sub.f32 f238, %113, %112; +sub.f32 f240, %114, %89; +mul.f32 f1679, f238, 0f3F737871; +fma.rn.f32 f241, f240, 0f3F167918, f1679; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %58, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +mul.f32 f253, f233, 0f3F4F1BBD; +fma.rn.f32 f1678, f231, 0f3E9E377A, %59; +sub.f32 f254, f1678, f253; +sub.f32 f255, %68, %98; +sub.f32 f257, %78, %88; +mul.f32 f1677, f255, 0f3F737871; +fma.rn.f32 f258, f257, 0f3F167918, f1677; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %59, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %70, %100; +add.f32 f271, %80, %90; +add.f32 f1676, %60, f269; +add.f32 f272, f271, f1676; +add.f32 f273, %71, %101; +add.f32 f275, %117, %115; +add.f32 f1672, %116, f273; +add.f32 f276, f275, f1672; +mul.f32 f278, f271, 0f3F4F1BBD; +fma.rn.f32 f1671, f269, 0f3E9E377A, %60; 
+sub.f32 f279, f1671, f278; +sub.f32 f280, %71, %101; +sub.f32 f282, %117, %115; +mul.f32 f1670, f280, 0f3F737871; +fma.rn.f32 f283, f282, 0f3F167918, f1670; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %60, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +mul.f32 f295, f275, 0f3F4F1BBD; +fma.rn.f32 f1669, f273, 0f3E9E377A, %116; +sub.f32 f296, f1669, f295; +sub.f32 f297, %70, %100; +sub.f32 f299, %80, %90; +mul.f32 f1668, f297, 0f3F737871; +fma.rn.f32 f300, f299, 0f3F167918, f1668; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %116, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mul.f32 f1666, f158, 0f3F77F511; +mul.f32 f1667, f175, 0f3E7EA890; +sub.f32 f313, f1666, f1667; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f1664, f200, 0f3F6055A2; +mul.f32 f1665, f217, 0f3EF6A86B; +sub.f32 f318, f1664, f1665; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f322, f259, 0f3F2F3E7B; +mul.f32 f1663, f242, 0f3F3A9DB0; +sub.f32 f323, f1663, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f327, f301, 0f3F5825E0; +mul.f32 f1662, f284, 0f3F092BF2; +sub.f32 f328, f1662, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f332, f183, 0f3EF6A86B; +mul.f32 f1661, f166, 0f3F6055A2; +sub.f32 f333, f1661, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f337, f225, 0f3F5825E0; +mul.f32 f1660, f208, 0f3F092BF2; +sub.f32 f338, f1660, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, 
f208, 0f3F5825E0, f339; +mul.f32 f342, f267, 0f3F7F7EAE; +mul.f32 f1659, f250, 0f3D809851; +sub.f32 f343, f1659, f342; +mul.f32 f344, f267, 0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f1657, f292, 0fBED9FFBE; +mul.f32 f1658, f309, 0f3F67A2BF; +sub.f32 f348, f1657, f1658; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f1655, f167, 0f3F3A9DB0; +mul.f32 f1656, f184, 0f3F2F3E7B; +sub.f32 f353, f1655, f1656; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f1653, f209, 0f3D809851; +mul.f32 f1654, f226, 0f3F7F7EAE; +sub.f32 f358, f1653, f1654; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f1651, f251, 0fBF232E38; +mul.f32 f1652, f268, 0f3F45405B; +sub.f32 f363, f1651, f1652; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f367, f310, 0f3E00575B; +mul.f32 f1650, f293, 0fBF7DFB3B; +sub.f32 f368, f1650, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f372, f176, 0f3F5825E0; +mul.f32 f1649, f159, 0f3F092BF2; +sub.f32 f373, f1649, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f377, f218, 0f3F67A2BF; +mul.f32 f1648, f201, 0fBED9FFBE; +sub.f32 f378, f1648, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f382, f260, 0f3E00575B; +mul.f32 f1647, f243, 0fBF7DFB3B; +sub.f32 f383, f1647, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f387, f302, 0fBF45405B; +mul.f32 f1646, f285, 0fBF232E38; +sub.f32 f388, f1646, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f393, f188, f230; +mul.f32 f398, f393, 0f3F4F1BBD; +fma.rn.f32 f1645, f391, 0f3E9E377A, f104; +sub.f32 f399, f1645, f398; +add.f32 f1644, f150, f276; +sub.f32 f400, f150, f276; +add.f32 f1643, f192, f234; +sub.f32 f402, f192, f234; 
+mul.f32 f1642, f400, 0f3F737871; +fma.rn.f32 f403, f402, 0f3F167918, f1642; +sub.f32 f404, f399, f403; +add.f32 f405, f403, f399; +add.f32 f1641, f104, f391; +mul.f32 f406, f391, 0f3F4F1BBD; +sub.f32 f407, f104, f406; +fma.rn.f32 f408, f393, 0f3E9E377A, f407; +mul.f32 f409, f400, 0f3F167918; +mul.f32 f410, f402, 0f3F737871; +sub.f32 f411, f409, f410; +sub.f32 f412, f408, f411; +add.f32 f413, f411, f408; +mul.f32 f415, f1643, 0f3F4F1BBD; +fma.rn.f32 f1640, f1644, 0f3E9E377A, f108; +sub.f32 f416, f1640, f415; +sub.f32 f417, f146, f272; +sub.f32 f419, f188, f230; +mul.f32 f1639, f417, 0f3F737871; +fma.rn.f32 f420, f419, 0f3F167918, f1639; +add.f32 f421, f420, f416; +sub.f32 f422, f416, f420; +add.f32 f1638, f108, f1644; +mul.f32 f423, f1644, 0f3F4F1BBD; +sub.f32 f424, f108, f423; +fma.rn.f32 f425, f1643, 0f3E9E377A, f424; +mul.f32 f426, f417, 0f3F167918; +mul.f32 f427, f419, 0f3F737871; +sub.f32 f428, f426, f427; +add.f32 f429, f428, f425; +sub.f32 f430, f425, f428; +add.f32 f431, f313, f328; +add.f32 f433, f318, f323; +add.f32 f1637, f116, f431; +add.f32 f434, f433, f1637; +add.f32 f435, f315, f330; +add.f32 f437, f320, f325; +add.f32 f1636, f133, f435; +add.f32 f438, f437, f1636; +fma.rn.f32 f1634, f431, 0f3E9E377A, f116; +mul.f32 f1635, f433, 0f3F4F1BBD; +sub.f32 f441, f1634, f1635; +sub.f32 f442, f315, f330; +sub.f32 f444, f320, f325; +mul.f32 f1633, f442, 0f3F737871; +fma.rn.f32 f445, f444, 0f3F167918, f1633; +sub.f32 f446, f441, f445; +add.f32 f447, f445, f441; +mul.f32 f448, f431, 0f3F4F1BBD; +sub.f32 f449, f116, f448; +fma.rn.f32 f450, f433, 0f3E9E377A, f449; +mul.f32 f451, f442, 0f3F167918; +mul.f32 f452, f444, 0f3F737871; +sub.f32 f453, f451, f452; +sub.f32 f454, f450, f453; +add.f32 f455, f453, f450; +mul.f32 f457, f437, 0f3F4F1BBD; +fma.rn.f32 f1632, f435, 0f3E9E377A, f133; +sub.f32 f458, f1632, f457; +sub.f32 f459, f313, f328; +sub.f32 f461, f318, f323; +mul.f32 f1631, f459, 0f3F737871; +fma.rn.f32 f462, f461, 0f3F167918, f1631; +add.f32 f463, f462, 
f458; +sub.f32 f464, f458, f462; +mul.f32 f465, f435, 0f3F4F1BBD; +sub.f32 f466, f133, f465; +fma.rn.f32 f467, f437, 0f3E9E377A, f466; +mul.f32 f468, f459, 0f3F167918; +mul.f32 f469, f461, 0f3F737871; +sub.f32 f470, f468, f469; +add.f32 f471, f470, f467; +sub.f32 f472, f467, f470; +add.f32 f473, f333, f348; +add.f32 f475, f338, f343; +add.f32 f1630, f124, f473; +add.f32 f476, f475, f1630; +add.f32 f477, f335, f350; +add.f32 f479, f340, f345; +add.f32 f1629, f141, f477; +add.f32 f480, f479, f1629; +fma.rn.f32 f1627, f473, 0f3E9E377A, f124; +mul.f32 f1628, f475, 0f3F4F1BBD; +sub.f32 f483, f1627, f1628; +sub.f32 f484, f335, f350; +sub.f32 f486, f340, f345; +mul.f32 f1626, f484, 0f3F737871; +fma.rn.f32 f487, f486, 0f3F167918, f1626; +sub.f32 f488, f483, f487; +add.f32 f489, f487, f483; +mul.f32 f490, f473, 0f3F4F1BBD; +sub.f32 f491, f124, f490; +fma.rn.f32 f492, f475, 0f3E9E377A, f491; +mul.f32 f493, f484, 0f3F167918; +mul.f32 f494, f486, 0f3F737871; +sub.f32 f495, f493, f494; +sub.f32 f496, f492, f495; +add.f32 f497, f495, f492; +fma.rn.f32 f1624, f477, 0f3E9E377A, f141; +mul.f32 f1625, f479, 0f3F4F1BBD; +sub.f32 f500, f1624, f1625; +sub.f32 f501, f333, f348; +sub.f32 f503, f338, f343; +mul.f32 f1623, f501, 0f3F737871; +fma.rn.f32 f504, f503, 0f3F167918, f1623; +add.f32 f505, f504, f500; +sub.f32 f506, f500, f504; +mul.f32 f507, f477, 0f3F4F1BBD; +sub.f32 f508, f141, f507; +fma.rn.f32 f509, f479, 0f3E9E377A, f508; +mul.f32 f510, f501, 0f3F167918; +mul.f32 f511, f503, 0f3F737871; +sub.f32 f512, f510, f511; +add.f32 f513, f512, f509; +sub.f32 f514, f509, f512; +add.f32 f515, f353, f368; +add.f32 f517, f358, f363; +add.f32 f1622, f125, f515; +add.f32 f518, f517, f1622; +add.f32 f519, f355, f370; +add.f32 f521, f360, f365; +add.f32 f1621, f142, f519; +add.f32 f522, f521, f1621; +mul.f32 f524, f517, 0f3F4F1BBD; +fma.rn.f32 f1620, f515, 0f3E9E377A, f125; +sub.f32 f525, f1620, f524; +sub.f32 f526, f355, f370; +sub.f32 f528, f360, f365; +mul.f32 f1619, f526, 0f3F737871; 
+fma.rn.f32 f529, f528, 0f3F167918, f1619; +sub.f32 f530, f525, f529; +add.f32 f531, f529, f525; +mul.f32 f532, f515, 0f3F4F1BBD; +sub.f32 f533, f125, f532; +fma.rn.f32 f534, f517, 0f3E9E377A, f533; +mul.f32 f535, f526, 0f3F167918; +mul.f32 f536, f528, 0f3F737871; +sub.f32 f537, f535, f536; +sub.f32 f538, f534, f537; +add.f32 f539, f537, f534; +mul.f32 f541, f521, 0f3F4F1BBD; +fma.rn.f32 f1618, f519, 0f3E9E377A, f142; +sub.f32 f542, f1618, f541; +sub.f32 f543, f353, f368; +sub.f32 f545, f358, f363; +mul.f32 f1617, f543, 0f3F737871; +fma.rn.f32 f546, f545, 0f3F167918, f1617; +add.f32 f547, f546, f542; +sub.f32 f548, f542, f546; +mul.f32 f549, f519, 0f3F4F1BBD; +sub.f32 f550, f142, f549; +fma.rn.f32 f551, f521, 0f3E9E377A, f550; +mul.f32 f552, f543, 0f3F167918; +mul.f32 f553, f545, 0f3F737871; +sub.f32 f554, f552, f553; +add.f32 f555, f554, f551; +sub.f32 f556, f551, f554; +add.f32 f557, f373, f388; +add.f32 f559, f378, f383; +add.f32 f1616, f117, f557; +add.f32 f560, f559, f1616; +add.f32 f561, f375, f390; +add.f32 f563, f380, f385; +add.f32 f1615, f134, f561; +add.f32 f564, f563, f1615; +mul.f32 f566, f559, 0f3F4F1BBD; +fma.rn.f32 f1614, f557, 0f3E9E377A, f117; +sub.f32 f567, f1614, f566; +sub.f32 f568, f375, f390; +sub.f32 f570, f380, f385; +mul.f32 f1613, f568, 0f3F737871; +fma.rn.f32 f571, f570, 0f3F167918, f1613; +sub.f32 f572, f567, f571; +add.f32 f573, f571, f567; +mul.f32 f574, f557, 0f3F4F1BBD; +sub.f32 f575, f117, f574; +fma.rn.f32 f576, f559, 0f3E9E377A, f575; +mul.f32 f577, f568, 0f3F167918; +mul.f32 f578, f570, 0f3F737871; +sub.f32 f579, f577, f578; +sub.f32 f580, f576, f579; +add.f32 f581, f579, f576; +mul.f32 f583, f563, 0f3F4F1BBD; +fma.rn.f32 f1612, f561, 0f3E9E377A, f134; +sub.f32 f584, f1612, f583; +sub.f32 f585, f373, f388; +sub.f32 f587, f378, f383; +mul.f32 f1611, f585, 0f3F737871; +fma.rn.f32 f588, f587, 0f3F167918, f1611; +add.f32 f589, f588, f584; +sub.f32 f590, f584, f588; +mul.f32 f591, f561, 0f3F4F1BBD; +sub.f32 f592, f134, f591; 
+fma.rn.f32 f593, f563, 0f3E9E377A, f592; +mul.f32 f594, f585, 0f3F167918; +mul.f32 f595, f587, 0f3F737871; +sub.f32 f596, f594, f595; +add.f32 f597, f596, f593; +sub.f32 f598, f593, f596; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 5000, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %51; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f599, f600}, [rd6]; +mul.f32 f603, f438, f600; +mul.f32 f605, f599, f438; +mul.f32 f607, f600, f600; +mul.f32 f1610, f599, f599; +sub.f32 f608, f1610, f607; +mul.f32 f609, f600, f599; +fma.rn.f32 f610, f600, f599, f609; +mul.f32 f611, f480, f610; +mul.f32 f613, f608, f480; +mul.f32 f615, f600, f610; +mul.f32 f1609, f599, f608; +sub.f32 f616, f1609, f615; +mul.f32 f1608, f476, f610; +mul.f32 f617, f599, f610; +fma.rn.f32 f618, f600, f608, f617; +mul.f32 f619, f522, f618; +mul.f32 f621, f616, f522; +mul.f32 f1606, f599, f616; +mul.f32 f1607, f600, f618; +sub.f32 f624, f1606, f1607; +mul.f32 f1605, f518, f618; +mul.f32 f625, f599, f618; +fma.rn.f32 f626, f600, f616, f625; +mul.f32 f627, f564, f626; +mul.f32 f629, f624, f564; +mul.f32 f631, f600, f626; +mul.f32 f1604, f599, f624; +sub.f32 f632, f1604, f631; +mul.f32 f1603, f560, f626; +mul.f32 f633, f599, f626; +fma.rn.f32 f634, f600, f624, f633; +mul.f32 f635, f421, f634; +mul.f32 f637, f632, f421; +mul.f32 f1601, f599, f632; +mul.f32 f1602, f600, f634; +sub.f32 f640, f1601, f1602; +mul.f32 f1600, f404, f634; +mul.f32 f641, f599, f634; +fma.rn.f32 f642, f600, f632, f641; +mul.f32 f643, f463, f642; +mul.f32 f645, f640, f463; +mul.f32 f647, f600, f642; +mul.f32 f1599, f599, f640; +sub.f32 f648, f1599, f647; +mul.f32 f1598, f446, f642; +mul.f32 f649, f599, f642; +fma.rn.f32 f650, f600, f640, f649; +mul.f32 f651, f505, f650; +mul.f32 f653, f648, f505; +mul.f32 f655, f600, f650; +mul.f32 f1597, f599, f648; +sub.f32 f656, f1597, f655; +mul.f32 f1596, f488, f650; +mul.f32 f657, 
f599, f650; +fma.rn.f32 f658, f600, f648, f657; +mul.f32 f659, f547, f658; +mul.f32 f661, f656, f547; +mul.f32 f1594, f599, f656; +mul.f32 f1595, f600, f658; +sub.f32 f664, f1594, f1595; +mul.f32 f1593, f530, f658; +mul.f32 f665, f599, f658; +fma.rn.f32 f666, f600, f656, f665; +mul.f32 f667, f589, f666; +mul.f32 f669, f664, f589; +mul.f32 f671, f600, f666; +mul.f32 f1592, f599, f664; +sub.f32 f672, f1592, f671; +mul.f32 f1591, f572, f666; +mul.f32 f673, f599, f666; +fma.rn.f32 f674, f600, f664, f673; +mul.f32 f675, f429, f674; +mul.f32 f677, f672, f429; +mul.f32 f679, f600, f674; +mul.f32 f1590, f599, f672; +sub.f32 f680, f1590, f679; +mul.f32 f1589, f412, f674; +mul.f32 f681, f599, f674; +fma.rn.f32 f682, f600, f672, f681; +mul.f32 f683, f471, f682; +mul.f32 f685, f680, f471; +mul.f32 f1587, f599, f680; +mul.f32 f1588, f600, f682; +sub.f32 f688, f1587, f1588; +mul.f32 f1586, f454, f682; +mul.f32 f689, f599, f682; +fma.rn.f32 f690, f600, f680, f689; +mul.f32 f691, f513, f690; +mul.f32 f693, f688, f513; +mul.f32 f695, f600, f690; +mul.f32 f1585, f599, f688; +sub.f32 f696, f1585, f695; +mul.f32 f1584, f496, f690; +mul.f32 f697, f599, f690; +fma.rn.f32 f698, f600, f688, f697; +mul.f32 f699, f555, f698; +mul.f32 f701, f696, f555; +mul.f32 f1582, f599, f696; +mul.f32 f1583, f600, f698; +sub.f32 f704, f1582, f1583; +mul.f32 f1581, f538, f698; +mul.f32 f705, f599, f698; +fma.rn.f32 f706, f600, f696, f705; +mul.f32 f707, f597, f706; +mul.f32 f709, f704, f597; +mul.f32 f711, f600, f706; +mul.f32 f1580, f599, f704; +sub.f32 f712, f1580, f711; +mul.f32 f1579, f580, f706; +mul.f32 f713, f599, f706; +fma.rn.f32 f714, f600, f704, f713; +mul.f32 f715, f430, f714; +mul.f32 f717, f712, f430; +mul.f32 f719, f600, f714; +mul.f32 f1578, f599, f712; +sub.f32 f720, f1578, f719; +mul.f32 f1577, f413, f714; +mul.f32 f721, f599, f714; +fma.rn.f32 f722, f600, f712, f721; +mul.f32 f723, f472, f722; +mul.f32 f725, f720, f472; +mul.f32 f1575, f599, f720; +mul.f32 f1576, f600, f722; +sub.f32 
f728, f1575, f1576; +mul.f32 f1574, f455, f722; +mul.f32 f729, f599, f722; +fma.rn.f32 f730, f600, f720, f729; +mul.f32 f731, f514, f730; +mul.f32 f733, f728, f514; +mul.f32 f735, f600, f730; +mul.f32 f1573, f599, f728; +sub.f32 f736, f1573, f735; +mul.f32 f1572, f497, f730; +mul.f32 f737, f599, f730; +fma.rn.f32 f738, f600, f728, f737; +mul.f32 f739, f556, f738; +mul.f32 f741, f736, f556; +mul.f32 f743, f600, f738; +mul.f32 f1571, f599, f736; +sub.f32 f744, f1571, f743; +mul.f32 f1570, f539, f738; +mul.f32 f745, f599, f738; +fma.rn.f32 f746, f600, f736, f745; +mul.f32 f747, f598, f746; +mul.f32 f749, f744, f598; +mul.f32 f1568, f599, f744; +mul.f32 f1569, f600, f746; +sub.f32 f752, f1568, f1569; +mul.f32 f1567, f581, f746; +mul.f32 f753, f599, f746; +fma.rn.f32 f754, f600, f744, f753; +mul.f32 f755, f422, f754; +mul.f32 f757, f752, f422; +mul.f32 f759, f600, f754; +mul.f32 f1566, f599, f752; +sub.f32 f760, f1566, f759; +mul.f32 f1565, f405, f754; +mul.f32 f761, f599, f754; +fma.rn.f32 f762, f600, f752, f761; +mul.f32 f763, f464, f762; +mul.f32 f765, f760, f464; +mul.f32 f1563, f599, f760; +mul.f32 f1564, f600, f762; +sub.f32 f768, f1563, f1564; +mul.f32 f1562, f447, f762; +mul.f32 f769, f599, f762; +fma.rn.f32 f770, f600, f760, f769; +mul.f32 f771, f506, f770; +mul.f32 f773, f768, f506; +mul.f32 f775, f600, f770; +mul.f32 f1561, f599, f768; +sub.f32 f776, f1561, f775; +mul.f32 f1560, f489, f770; +mul.f32 f777, f599, f770; +fma.rn.f32 f778, f600, f768, f777; +mul.f32 f779, f548, f778; +mul.f32 f781, f776, f548; +mul.f32 f783, f600, f778; +mul.f32 f1559, f599, f776; +sub.f32 f784, f1559, f783; +mul.f32 f1558, f531, f778; +mul.f32 f785, f599, f778; +mul.f32 f1557, f434, f600; +fma.rn.f32 f786, f600, f776, f785; +mul.f32 f787, f590, f786; +mul.f32 f788, f573, f786; +mul.f32 f789, f784, f590; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +add.f32 f790, f1643, f1638; +add.f32 f791, f393, f1641; +st.shared.v2.f32 [r9], {f791, f790}; +fma.rn.f32 f792, f599, f434, f603; 
+sub.f32 f793, f605, f1557; +st.shared.v2.f32 [r9+8], {f792, f793}; +fma.rn.f32 f794, f608, f476, f611; +sub.f32 f795, f613, f1608; +st.shared.v2.f32 [r9+16], {f794, f795}; +fma.rn.f32 f796, f616, f518, f619; +sub.f32 f797, f621, f1605; +st.shared.v2.f32 [r9+24], {f796, f797}; +fma.rn.f32 f798, f624, f560, f627; +sub.f32 f799, f629, f1603; +st.shared.v2.f32 [r9+32], {f798, f799}; +sub.f32 f800, f637, f1600; +fma.rn.f32 f801, f632, f404, f635; +st.shared.v2.f32 [r9+40], {f801, f800}; +fma.rn.f32 f802, f640, f446, f643; +sub.f32 f803, f645, f1598; +st.shared.v2.f32 [r9+48], {f802, f803}; +sub.f32 f804, f653, f1596; +fma.rn.f32 f805, f648, f488, f651; +st.shared.v2.f32 [r9+56], {f805, f804}; +fma.rn.f32 f806, f656, f530, f659; +sub.f32 f807, f661, f1593; +st.shared.v2.f32 [r9+64], {f806, f807}; +fma.rn.f32 f808, f664, f572, f667; +sub.f32 f809, f669, f1591; +st.shared.v2.f32 [r9+72], {f808, f809}; +fma.rn.f32 f810, f672, f412, f675; +sub.f32 f811, f677, f1589; +st.shared.v2.f32 [r9+80], {f810, f811}; +fma.rn.f32 f812, f680, f454, f683; +sub.f32 f813, f685, f1586; +st.shared.v2.f32 [r9+88], {f812, f813}; +fma.rn.f32 f814, f688, f496, f691; +sub.f32 f815, f693, f1584; +st.shared.v2.f32 [r9+96], {f814, f815}; +fma.rn.f32 f816, f696, f538, f699; +sub.f32 f817, f701, f1581; +st.shared.v2.f32 [r9+104], {f816, f817}; +fma.rn.f32 f818, f704, f580, f707; +sub.f32 f819, f709, f1579; +st.shared.v2.f32 [r9+112], {f818, f819}; +fma.rn.f32 f820, f712, f413, f715; +sub.f32 f821, f717, f1577; +st.shared.v2.f32 [r9+120], {f820, f821}; +fma.rn.f32 f822, f720, f455, f723; +sub.f32 f823, f725, f1574; +st.shared.v2.f32 [r9+128], {f822, f823}; +fma.rn.f32 f824, f728, f497, f731; +sub.f32 f825, f733, f1572; +st.shared.v2.f32 [r9+136], {f824, f825}; +fma.rn.f32 f826, f736, f539, f739; +sub.f32 f827, f741, f1570; +st.shared.v2.f32 [r9+144], {f826, f827}; +fma.rn.f32 f828, f744, f581, f747; +sub.f32 f829, f749, f1567; +st.shared.v2.f32 [r9+152], {f828, f829}; +fma.rn.f32 f830, f752, f405, 
f755; +sub.f32 f831, f757, f1565; +st.shared.v2.f32 [r9+160], {f830, f831}; +fma.rn.f32 f832, f760, f447, f763; +sub.f32 f833, f765, f1562; +st.shared.v2.f32 [r9+168], {f832, f833}; +fma.rn.f32 f834, f768, f489, f771; +sub.f32 f835, f773, f1560; +st.shared.v2.f32 [r9+176], {f834, f835}; +fma.rn.f32 f836, f776, f531, f779; +sub.f32 f837, f781, f1558; +st.shared.v2.f32 [r9+184], {f836, f837}; +fma.rn.f32 f838, f784, f573, f787; +sub.f32 f839, f789, f788; +st.shared.v2.f32 [r9+192], {f838, f839}; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.v2.f32 {f840, f841}, [r10]; +ld.shared.v2.f32 {f844, f845}, [r10+200]; +ld.shared.v2.f32 {f848, f849}, [r10+400]; +ld.shared.v2.f32 {f852, f853}, [r10+600]; +ld.shared.v2.f32 {f856, f857}, [r10+800]; +ld.shared.v2.f32 {f860, f861}, [r10+1000]; +ld.shared.v2.f32 {f864, f865}, [r10+1200]; +ld.shared.v2.f32 {f868, f869}, [r10+1400]; +ld.shared.v2.f32 {f872, f873}, [r10+1600]; +ld.shared.v2.f32 {f876, f877}, [r10+1800]; +ld.shared.v2.f32 {f880, f881}, [r10+2000]; +ld.shared.v2.f32 {f884, f885}, [r10+2200]; +ld.shared.v2.f32 {f888, f889}, [r10+2400]; +ld.shared.v2.f32 {f892, f893}, [r10+2600]; +ld.shared.v2.f32 {f896, f897}, [r10+2800]; +ld.shared.v2.f32 {f900, f901}, [r10+3000]; +ld.shared.v2.f32 {f904, f905}, [r10+3200]; +ld.shared.v2.f32 {f908, f909}, [r10+3400]; +ld.shared.v2.f32 {f912, f913}, [r10+3600]; +ld.shared.v2.f32 {f916, f917}, [r10+3800]; +ld.shared.v2.f32 {f920, f921}, [r10+4000]; +ld.shared.v2.f32 {f924, f925}, [r10+4200]; +ld.shared.v2.f32 {f928, f929}, [r10+4400]; +ld.shared.v2.f32 {f932, f933}, [r10+4600]; +ld.shared.v2.f32 {f936, f937}, [r10+4800]; +add.f32 f940, f860, f920; +add.f32 f942, f880, f900; +add.f32 f1556, f840, f940; +add.f32 f943, f942, f1556; +add.f32 f944, f861, f921; +add.f32 f946, f881, f901; +add.f32 f1555, f841, f944; +add.f32 f947, f946, f1555; +mul.f32 f949, f942, 0f3F4F1BBD; +fma.rn.f32 f1554, f940, 0f3E9E377A, f840; +sub.f32 f950, f1554, f949; +sub.f32 f951, f861, f921; +sub.f32 
f953, f881, f901; +mul.f32 f1553, f951, 0f3F737871; +fma.rn.f32 f954, f953, 0f3F167918, f1553; +sub.f32 f955, f950, f954; +add.f32 f956, f954, f950; +mul.f32 f957, f940, 0f3F4F1BBD; +sub.f32 f958, f840, f957; +fma.rn.f32 f959, f942, 0f3E9E377A, f958; +mul.f32 f960, f951, 0f3F167918; +mul.f32 f961, f953, 0f3F737871; +sub.f32 f962, f960, f961; +sub.f32 f963, f959, f962; +add.f32 f964, f962, f959; +mul.f32 f966, f946, 0f3F4F1BBD; +fma.rn.f32 f1552, f944, 0f3E9E377A, f841; +sub.f32 f967, f1552, f966; +sub.f32 f968, f860, f920; +sub.f32 f970, f880, f900; +mul.f32 f1551, f968, 0f3F737871; +fma.rn.f32 f971, f970, 0f3F167918, f1551; +add.f32 f972, f971, f967; +sub.f32 f973, f967, f971; +mul.f32 f974, f944, 0f3F4F1BBD; +sub.f32 f975, f841, f974; +fma.rn.f32 f976, f946, 0f3E9E377A, f975; +mul.f32 f977, f968, 0f3F167918; +mul.f32 f978, f970, 0f3F737871; +sub.f32 f979, f977, f978; +add.f32 f980, f979, f976; +sub.f32 f981, f976, f979; +add.f32 f982, f864, f924; +add.f32 f984, f884, f904; +add.f32 f1550, f844, f982; +add.f32 f985, f984, f1550; +add.f32 f986, f865, f925; +add.f32 f988, f885, f905; +add.f32 f1549, f845, f986; +add.f32 f989, f988, f1549; +fma.rn.f32 f1547, f982, 0f3E9E377A, f844; +mul.f32 f1548, f984, 0f3F4F1BBD; +sub.f32 f992, f1547, f1548; +sub.f32 f993, f865, f925; +sub.f32 f995, f885, f905; +mul.f32 f1546, f993, 0f3F737871; +fma.rn.f32 f996, f995, 0f3F167918, f1546; +sub.f32 f997, f992, f996; +add.f32 f998, f996, f992; +mul.f32 f999, f982, 0f3F4F1BBD; +sub.f32 f1000, f844, f999; +fma.rn.f32 f1001, f984, 0f3E9E377A, f1000; +mul.f32 f1002, f993, 0f3F167918; +mul.f32 f1003, f995, 0f3F737871; +sub.f32 f1004, f1002, f1003; +sub.f32 f1005, f1001, f1004; +add.f32 f1006, f1004, f1001; +mul.f32 f1008, f988, 0f3F4F1BBD; +fma.rn.f32 f1545, f986, 0f3E9E377A, f845; +sub.f32 f1009, f1545, f1008; +sub.f32 f1010, f864, f924; +sub.f32 f1012, f884, f904; +mul.f32 f1544, f1010, 0f3F737871; +fma.rn.f32 f1013, f1012, 0f3F167918, f1544; +add.f32 f1014, f1013, f1009; +sub.f32 f1015, 
f1009, f1013; +mul.f32 f1016, f986, 0f3F4F1BBD; +sub.f32 f1017, f845, f1016; +fma.rn.f32 f1018, f988, 0f3E9E377A, f1017; +mul.f32 f1019, f1010, 0f3F167918; +mul.f32 f1020, f1012, 0f3F737871; +sub.f32 f1021, f1019, f1020; +add.f32 f1022, f1021, f1018; +sub.f32 f1023, f1018, f1021; +add.f32 f1024, f868, f928; +add.f32 f1026, f888, f908; +add.f32 f1543, f848, f1024; +add.f32 f1027, f1026, f1543; +add.f32 f1028, f869, f929; +add.f32 f1030, f889, f909; +add.f32 f1542, f849, f1028; +add.f32 f1031, f1030, f1542; +fma.rn.f32 f1540, f1024, 0f3E9E377A, f848; +mul.f32 f1541, f1026, 0f3F4F1BBD; +sub.f32 f1034, f1540, f1541; +sub.f32 f1035, f869, f929; +sub.f32 f1037, f889, f909; +mul.f32 f1539, f1035, 0f3F737871; +fma.rn.f32 f1038, f1037, 0f3F167918, f1539; +sub.f32 f1039, f1034, f1038; +add.f32 f1040, f1038, f1034; +mul.f32 f1041, f1024, 0f3F4F1BBD; +sub.f32 f1042, f848, f1041; +fma.rn.f32 f1043, f1026, 0f3E9E377A, f1042; +mul.f32 f1044, f1035, 0f3F167918; +mul.f32 f1045, f1037, 0f3F737871; +sub.f32 f1046, f1044, f1045; +sub.f32 f1047, f1043, f1046; +add.f32 f1048, f1046, f1043; +fma.rn.f32 f1537, f1028, 0f3E9E377A, f849; +mul.f32 f1538, f1030, 0f3F4F1BBD; +sub.f32 f1051, f1537, f1538; +sub.f32 f1052, f868, f928; +sub.f32 f1054, f888, f908; +mul.f32 f1536, f1052, 0f3F737871; +fma.rn.f32 f1055, f1054, 0f3F167918, f1536; +add.f32 f1056, f1055, f1051; +sub.f32 f1057, f1051, f1055; +mul.f32 f1058, f1028, 0f3F4F1BBD; +sub.f32 f1059, f849, f1058; +fma.rn.f32 f1060, f1030, 0f3E9E377A, f1059; +mul.f32 f1061, f1052, 0f3F167918; +mul.f32 f1062, f1054, 0f3F737871; +sub.f32 f1063, f1061, f1062; +add.f32 f1064, f1063, f1060; +sub.f32 f1065, f1060, f1063; +add.f32 f1066, f872, f932; +add.f32 f1068, f892, f912; +add.f32 f1535, f852, f1066; +add.f32 f1069, f1068, f1535; +add.f32 f1070, f873, f933; +add.f32 f1072, f893, f913; +add.f32 f1534, f853, f1070; +add.f32 f1073, f1072, f1534; +mul.f32 f1075, f1068, 0f3F4F1BBD; +fma.rn.f32 f1533, f1066, 0f3E9E377A, f852; +sub.f32 f1076, f1533, f1075; 
+sub.f32 f1077, f873, f933; +sub.f32 f1079, f893, f913; +mul.f32 f1532, f1077, 0f3F737871; +fma.rn.f32 f1080, f1079, 0f3F167918, f1532; +sub.f32 f1081, f1076, f1080; +add.f32 f1082, f1080, f1076; +mul.f32 f1083, f1066, 0f3F4F1BBD; +sub.f32 f1084, f852, f1083; +fma.rn.f32 f1085, f1068, 0f3E9E377A, f1084; +mul.f32 f1086, f1077, 0f3F167918; +mul.f32 f1087, f1079, 0f3F737871; +sub.f32 f1088, f1086, f1087; +sub.f32 f1089, f1085, f1088; +add.f32 f1090, f1088, f1085; +fma.rn.f32 f1530, f1070, 0f3E9E377A, f853; +mul.f32 f1531, f1072, 0f3F4F1BBD; +sub.f32 f1093, f1530, f1531; +sub.f32 f1094, f872, f932; +sub.f32 f1096, f892, f912; +mul.f32 f1529, f1094, 0f3F737871; +fma.rn.f32 f1097, f1096, 0f3F167918, f1529; +add.f32 f1098, f1097, f1093; +sub.f32 f1099, f1093, f1097; +mul.f32 f1100, f1070, 0f3F4F1BBD; +sub.f32 f1101, f853, f1100; +fma.rn.f32 f1102, f1072, 0f3E9E377A, f1101; +mul.f32 f1103, f1094, 0f3F167918; +mul.f32 f1104, f1096, 0f3F737871; +sub.f32 f1105, f1103, f1104; +add.f32 f1106, f1105, f1102; +sub.f32 f1107, f1102, f1105; +add.f32 f1108, f876, f936; +add.f32 f1110, f896, f916; +add.f32 f1528, f856, f1108; +add.f32 f1111, f1110, f1528; +add.f32 f1112, f877, f937; +add.f32 f1114, f897, f917; +add.f32 f1527, f857, f1112; +add.f32 f1115, f1114, f1527; +mul.f32 f1117, f1110, 0f3F4F1BBD; +fma.rn.f32 f1526, f1108, 0f3E9E377A, f856; +sub.f32 f1118, f1526, f1117; +sub.f32 f1119, f877, f937; +sub.f32 f1121, f897, f917; +mul.f32 f1525, f1119, 0f3F737871; +fma.rn.f32 f1122, f1121, 0f3F167918, f1525; +sub.f32 f1123, f1118, f1122; +add.f32 f1124, f1122, f1118; +mul.f32 f1125, f1108, 0f3F4F1BBD; +sub.f32 f1126, f856, f1125; +fma.rn.f32 f1127, f1110, 0f3E9E377A, f1126; +mul.f32 f1128, f1119, 0f3F167918; +mul.f32 f1129, f1121, 0f3F737871; +sub.f32 f1130, f1128, f1129; +sub.f32 f1131, f1127, f1130; +add.f32 f1132, f1130, f1127; +mul.f32 f1134, f1114, 0f3F4F1BBD; +fma.rn.f32 f1524, f1112, 0f3E9E377A, f857; +sub.f32 f1135, f1524, f1134; +sub.f32 f1136, f876, f936; +sub.f32 f1138, 
f896, f916; +mul.f32 f1523, f1136, 0f3F737871; +fma.rn.f32 f1139, f1138, 0f3F167918, f1523; +add.f32 f1140, f1139, f1135; +sub.f32 f1141, f1135, f1139; +mul.f32 f1142, f1112, 0f3F4F1BBD; +sub.f32 f1143, f857, f1142; +fma.rn.f32 f1144, f1114, 0f3E9E377A, f1143; +mul.f32 f1145, f1136, 0f3F167918; +mul.f32 f1146, f1138, 0f3F737871; +sub.f32 f1147, f1145, f1146; +add.f32 f1148, f1147, f1144; +sub.f32 f1149, f1144, f1147; +mul.f32 f1151, f1014, 0f3E7EA890; +mul.f32 f1522, f997, 0f3F77F511; +sub.f32 f1152, f1522, f1151; +mul.f32 f1153, f1014, 0f3F77F511; +fma.rn.f32 f1154, f997, 0f3E7EA890, f1153; +mul.f32 f1520, f1039, 0f3F6055A2; +mul.f32 f1521, f1056, 0f3EF6A86B; +sub.f32 f1157, f1520, f1521; +mul.f32 f1158, f1056, 0f3F6055A2; +fma.rn.f32 f1159, f1039, 0f3EF6A86B, f1158; +mul.f32 f1518, f1081, 0f3F3A9DB0; +mul.f32 f1519, f1098, 0f3F2F3E7B; +sub.f32 f1162, f1518, f1519; +mul.f32 f1163, f1098, 0f3F3A9DB0; +fma.rn.f32 f1164, f1081, 0f3F2F3E7B, f1163; +mul.f32 f1516, f1123, 0f3F092BF2; +mul.f32 f1517, f1140, 0f3F5825E0; +sub.f32 f1167, f1516, f1517; +mul.f32 f1168, f1140, 0f3F092BF2; +fma.rn.f32 f1169, f1123, 0f3F5825E0, f1168; +mul.f32 f1514, f1005, 0f3F6055A2; +mul.f32 f1515, f1022, 0f3EF6A86B; +sub.f32 f1172, f1514, f1515; +mul.f32 f1173, f1022, 0f3F6055A2; +fma.rn.f32 f1174, f1005, 0f3EF6A86B, f1173; +mul.f32 f1176, f1064, 0f3F5825E0; +mul.f32 f1513, f1047, 0f3F092BF2; +sub.f32 f1177, f1513, f1176; +mul.f32 f1178, f1064, 0f3F092BF2; +fma.rn.f32 f1179, f1047, 0f3F5825E0, f1178; +mul.f32 f1181, f1106, 0f3F7F7EAE; +mul.f32 f1512, f1089, 0f3D809851; +sub.f32 f1182, f1512, f1181; +mul.f32 f1183, f1106, 0f3D809851; +fma.rn.f32 f1184, f1089, 0f3F7F7EAE, f1183; +mul.f32 f1186, f1148, 0f3F67A2BF; +mul.f32 f1511, f1131, 0fBED9FFBE; +sub.f32 f1187, f1511, f1186; +mul.f32 f1188, f1148, 0fBED9FFBE; +fma.rn.f32 f1189, f1131, 0f3F67A2BF, f1188; +mul.f32 f1191, f1023, 0f3F2F3E7B; +mul.f32 f1510, f1006, 0f3F3A9DB0; +sub.f32 f1192, f1510, f1191; +mul.f32 f1193, f1023, 0f3F3A9DB0; 
+fma.rn.f32 f1194, f1006, 0f3F2F3E7B, f1193; +mul.f32 f1196, f1065, 0f3F7F7EAE; +mul.f32 f1509, f1048, 0f3D809851; +sub.f32 f1197, f1509, f1196; +mul.f32 f1198, f1065, 0f3D809851; +fma.rn.f32 f1199, f1048, 0f3F7F7EAE, f1198; +mul.f32 f1201, f1107, 0f3F45405B; +mul.f32 f1508, f1090, 0fBF232E38; +sub.f32 f1202, f1508, f1201; +mul.f32 f1203, f1107, 0fBF232E38; +fma.rn.f32 f1204, f1090, 0f3F45405B, f1203; +mul.f32 f1506, f1132, 0fBF7DFB3B; +mul.f32 f1507, f1149, 0f3E00575B; +sub.f32 f1207, f1506, f1507; +mul.f32 f1208, f1149, 0fBF7DFB3B; +fma.rn.f32 f1209, f1132, 0f3E00575B, f1208; +mul.f32 f1504, f998, 0f3F092BF2; +mul.f32 f1505, f1015, 0f3F5825E0; +sub.f32 f1212, f1504, f1505; +mul.f32 f1213, f1015, 0f3F092BF2; +fma.rn.f32 f1214, f998, 0f3F5825E0, f1213; +mul.f32 f1502, f1040, 0fBED9FFBE; +mul.f32 f1503, f1057, 0f3F67A2BF; +sub.f32 f1217, f1502, f1503; +mul.f32 f1218, f1057, 0fBED9FFBE; +fma.rn.f32 f1219, f1040, 0f3F67A2BF, f1218; +mul.f32 f1221, f1099, 0f3E00575B; +mul.f32 f1501, f1082, 0fBF7DFB3B; +sub.f32 f1222, f1501, f1221; +mul.f32 f1223, f1099, 0fBF7DFB3B; +fma.rn.f32 f1224, f1082, 0f3E00575B, f1223; +mul.f32 f1226, f1141, 0fBF45405B; +mul.f32 f1500, f1124, 0fBF232E38; +sub.f32 f1227, f1500, f1226; +mul.f32 f1228, f1141, 0fBF232E38; +fma.rn.f32 f1229, f1124, 0fBF45405B, f1228; +add.f32 f1230, f985, f1111; +add.f32 f1232, f1027, f1069; +fma.rn.f32 f1498, f1230, 0f3E9E377A, f943; +mul.f32 f1499, f1232, 0f3F4F1BBD; +sub.f32 f1238, f1498, f1499; +add.f32 f1497, f989, f1115; +sub.f32 f1239, f989, f1115; +add.f32 f1496, f1031, f1073; +sub.f32 f1241, f1031, f1073; +mul.f32 f1495, f1239, 0f3F737871; +fma.rn.f32 f1242, f1241, 0f3F167918, f1495; +add.f32 f1494, f943, f1230; +mul.f32 f1243, f1230, 0f3F4F1BBD; +sub.f32 f1244, f943, f1243; +fma.rn.f32 f1245, f1232, 0f3E9E377A, f1244; +mul.f32 f1246, f1239, 0f3F167918; +mul.f32 f1247, f1241, 0f3F737871; +sub.f32 f1248, f1246, f1247; +mul.f32 f1250, f1496, 0f3F4F1BBD; +fma.rn.f32 f1493, f1497, 0f3E9E377A, f947; +sub.f32 
f1251, f1493, f1250; +sub.f32 f1252, f985, f1111; +sub.f32 f1254, f1027, f1069; +mul.f32 f1492, f1252, 0f3F737871; +fma.rn.f32 f1255, f1254, 0f3F167918, f1492; +add.f32 f1491, f947, f1497; +mul.f32 f1256, f1497, 0f3F4F1BBD; +sub.f32 f1257, f947, f1256; +fma.rn.f32 f1258, f1496, 0f3E9E377A, f1257; +mul.f32 f1259, f1252, 0f3F167918; +mul.f32 f1260, f1254, 0f3F737871; +sub.f32 f1261, f1259, f1260; +add.f32 f1262, f1152, f1167; +add.f32 f1264, f1157, f1162; +mul.f32 f1269, f1264, 0f3F4F1BBD; +fma.rn.f32 f1490, f1262, 0f3E9E377A, f955; +sub.f32 f1270, f1490, f1269; +add.f32 f1489, f1154, f1169; +sub.f32 f1271, f1154, f1169; +add.f32 f1488, f1159, f1164; +sub.f32 f1273, f1159, f1164; +mul.f32 f1487, f1271, 0f3F737871; +fma.rn.f32 f1274, f1273, 0f3F167918, f1487; +add.f32 f1486, f955, f1262; +mul.f32 f1275, f1262, 0f3F4F1BBD; +sub.f32 f1276, f955, f1275; +fma.rn.f32 f1277, f1264, 0f3E9E377A, f1276; +mul.f32 f1278, f1271, 0f3F167918; +mul.f32 f1279, f1273, 0f3F737871; +sub.f32 f1280, f1278, f1279; +fma.rn.f32 f1484, f1489, 0f3E9E377A, f972; +mul.f32 f1485, f1488, 0f3F4F1BBD; +sub.f32 f1283, f1484, f1485; +sub.f32 f1284, f1152, f1167; +sub.f32 f1286, f1157, f1162; +mul.f32 f1483, f1284, 0f3F737871; +fma.rn.f32 f1287, f1286, 0f3F167918, f1483; +add.f32 f1482, f972, f1489; +mul.f32 f1288, f1489, 0f3F4F1BBD; +sub.f32 f1289, f972, f1288; +fma.rn.f32 f1290, f1488, 0f3E9E377A, f1289; +mul.f32 f1291, f1284, 0f3F167918; +mul.f32 f1292, f1286, 0f3F737871; +sub.f32 f1293, f1291, f1292; +add.f32 f1294, f1172, f1187; +add.f32 f1296, f1177, f1182; +fma.rn.f32 f1480, f1294, 0f3E9E377A, f963; +mul.f32 f1481, f1296, 0f3F4F1BBD; +sub.f32 f1302, f1480, f1481; +add.f32 f1479, f1174, f1189; +sub.f32 f1303, f1174, f1189; +add.f32 f1478, f1179, f1184; +sub.f32 f1305, f1179, f1184; +mul.f32 f1477, f1303, 0f3F737871; +fma.rn.f32 f1306, f1305, 0f3F167918, f1477; +add.f32 f1476, f963, f1294; +mul.f32 f1307, f1294, 0f3F4F1BBD; +sub.f32 f1308, f963, f1307; +fma.rn.f32 f1309, f1296, 0f3E9E377A, f1308; 
+mul.f32 f1310, f1303, 0f3F167918; +mul.f32 f1311, f1305, 0f3F737871; +sub.f32 f1312, f1310, f1311; +mul.f32 f1314, f1478, 0f3F4F1BBD; +fma.rn.f32 f1475, f1479, 0f3E9E377A, f980; +sub.f32 f1315, f1475, f1314; +sub.f32 f1316, f1172, f1187; +sub.f32 f1318, f1177, f1182; +mul.f32 f1474, f1316, 0f3F737871; +fma.rn.f32 f1319, f1318, 0f3F167918, f1474; +add.f32 f1473, f980, f1479; +mul.f32 f1320, f1479, 0f3F4F1BBD; +sub.f32 f1321, f980, f1320; +fma.rn.f32 f1322, f1478, 0f3E9E377A, f1321; +mul.f32 f1323, f1316, 0f3F167918; +mul.f32 f1324, f1318, 0f3F737871; +sub.f32 f1325, f1323, f1324; +add.f32 f1326, f1192, f1207; +add.f32 f1328, f1197, f1202; +mul.f32 f1333, f1328, 0f3F4F1BBD; +fma.rn.f32 f1472, f1326, 0f3E9E377A, f964; +sub.f32 f1334, f1472, f1333; +add.f32 f1471, f1194, f1209; +sub.f32 f1335, f1194, f1209; +add.f32 f1470, f1199, f1204; +sub.f32 f1337, f1199, f1204; +mul.f32 f1469, f1335, 0f3F737871; +fma.rn.f32 f1338, f1337, 0f3F167918, f1469; +add.f32 f1468, f964, f1326; +mul.f32 f1339, f1326, 0f3F4F1BBD; +sub.f32 f1340, f964, f1339; +fma.rn.f32 f1341, f1328, 0f3E9E377A, f1340; +mul.f32 f1342, f1335, 0f3F167918; +mul.f32 f1343, f1337, 0f3F737871; +sub.f32 f1344, f1342, f1343; +fma.rn.f32 f1466, f1471, 0f3E9E377A, f981; +mul.f32 f1467, f1470, 0f3F4F1BBD; +sub.f32 f1347, f1466, f1467; +sub.f32 f1348, f1192, f1207; +sub.f32 f1350, f1197, f1202; +mul.f32 f1465, f1348, 0f3F737871; +fma.rn.f32 f1351, f1350, 0f3F167918, f1465; +add.f32 f1464, f981, f1471; +mul.f32 f1352, f1471, 0f3F4F1BBD; +sub.f32 f1353, f981, f1352; +fma.rn.f32 f1354, f1470, 0f3E9E377A, f1353; +mul.f32 f1355, f1348, 0f3F167918; +mul.f32 f1356, f1350, 0f3F737871; +sub.f32 f1357, f1355, f1356; +add.f32 f1358, f1212, f1227; +add.f32 f1360, f1217, f1222; +fma.rn.f32 f1462, f1358, 0f3E9E377A, f956; +mul.f32 f1463, f1360, 0f3F4F1BBD; +sub.f32 f1366, f1462, f1463; +add.f32 f1461, f1214, f1229; +sub.f32 f1367, f1214, f1229; +add.f32 f1460, f1219, f1224; +sub.f32 f1369, f1219, f1224; +mul.f32 f1459, f1367, 
0f3F737871; +fma.rn.f32 f1370, f1369, 0f3F167918, f1459; +add.f32 f1458, f956, f1358; +mul.f32 f1371, f1358, 0f3F4F1BBD; +sub.f32 f1372, f956, f1371; +fma.rn.f32 f1373, f1360, 0f3E9E377A, f1372; +mul.f32 f1374, f1367, 0f3F167918; +mul.f32 f1375, f1369, 0f3F737871; +sub.f32 f1376, f1374, f1375; +mul.f32 f1378, f1460, 0f3F4F1BBD; +fma.rn.f32 f1457, f1461, 0f3E9E377A, f973; +sub.f32 f1379, f1457, f1378; +sub.f32 f1380, f1212, f1227; +sub.f32 f1382, f1217, f1222; +mul.f32 f1456, f1380, 0f3F737871; +fma.rn.f32 f1383, f1382, 0f3F167918, f1456; +add.f32 f1455, f973, f1461; +mul.f32 f1384, f1461, 0f3F4F1BBD; +sub.f32 f1385, f973, f1384; +fma.rn.f32 f1386, f1460, 0f3E9E377A, f1385; +mul.f32 f1387, f1380, 0f3F167918; +mul.f32 f1388, f1382, 0f3F737871; +sub.f32 f1389, f1387, f1388; +add.f32 %1, f1496, f1491; +add.f32 %0, f1232, f1494; +add.f32 %3, f1488, f1482; +add.f32 %2, f1264, f1486; +add.f32 %5, f1478, f1473; +add.f32 %4, f1296, f1476; +add.f32 %7, f1470, f1464; +add.f32 %6, f1328, f1468; +add.f32 %9, f1460, f1455; +add.f32 %8, f1360, f1458; +add.f32 %11, f1255, f1251; +sub.f32 %10, f1238, f1242; +add.f32 %13, f1287, f1283; +sub.f32 %12, f1270, f1274; +add.f32 %15, f1319, f1315; +sub.f32 %14, f1302, f1306; +sub.f32 %16, f1334, f1338; +add.f32 %17, f1351, f1347; +sub.f32 %18, f1366, f1370; +add.f32 %19, f1383, f1379; +sub.f32 %20, f1245, f1248; +add.f32 %21, f1261, f1258; +sub.f32 %22, f1277, f1280; +add.f32 %23, f1293, f1290; +add.f32 %25, f1325, f1322; +sub.f32 %24, f1309, f1312; +add.f32 %27, f1357, f1354; +sub.f32 %26, f1341, f1344; +sub.f32 %28, f1373, f1376; +add.f32 %29, f1389, f1386; +sub.f32 %31, f1258, f1261; +add.f32 %30, f1248, f1245; +sub.f32 %33, f1290, f1293; +add.f32 %32, f1280, f1277; +sub.f32 %35, f1322, f1325; +add.f32 %34, f1312, f1309; +sub.f32 %37, f1354, f1357; +add.f32 %36, f1344, f1341; +sub.f32 %39, f1386, f1389; +add.f32 %38, f1376, f1373; +sub.f32 %41, f1251, f1255; +add.f32 %40, f1242, f1238; +sub.f32 %43, f1283, f1287; +add.f32 %42, f1274, 
f1270; +sub.f32 %45, f1315, f1319; +add.f32 %44, f1306, f1302; +sub.f32 %47, f1347, f1351; +add.f32 %46, f1338, f1334; +sub.f32 %49, f1379, f1383; +add.f32 %48, f1370, f1366; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[5].y), "f"(rmem[10].y), "f"(rmem[20].y), "f"(rmem[16].y), 
"f"(rmem[1].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[22].y), "f"(rmem[7].y), "f"(rmem[17].y), "f"(rmem[23].y), "f"(rmem[8].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[4].y), "f"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<364, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1390>; +.reg .b32 r<11>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 2500, r2; +add.f32 f101, %65, %105; +add.f32 f102, %52, f101; +add.f32 f103, %78, %92; +add.f32 f104, f103, f102; +add.f32 f105, %67, %107; +add.f32 f106, %53, f105; +add.f32 f107, %80, %93; +add.f32 f108, f107, f106; +fma.rn.f32 f109, f101, 0f3E9E377A, %52; +mul.f32 f110, f103, 0f3F4F1BBD; +sub.f32 f111, f109, f110; +sub.f32 f112, %67, %107; +mul.f32 f113, f112, 0f3F737871; +sub.f32 f114, %80, %93; +fma.rn.f32 f115, f114, 0f3F167918, f113; +sub.f32 f116, f111, f115; +add.f32 f117, f115, f111; +mul.f32 f118, f101, 0f3F4F1BBD; +sub.f32 f119, %52, f118; +fma.rn.f32 f120, f103, 0f3E9E377A, f119; +mul.f32 f121, f112, 0f3F167918; +mul.f32 f122, f114, 0f3F737871; +sub.f32 f123, f121, f122; +sub.f32 f124, f120, f123; +add.f32 f125, f123, f120; +fma.rn.f32 f126, f105, 0f3E9E377A, %53; +mul.f32 f127, f107, 0f3F4F1BBD; +sub.f32 f128, f126, f127; +sub.f32 f129, %65, %105; +mul.f32 f130, f129, 0f3F737871; +sub.f32 f131, %78, %92; +fma.rn.f32 f132, f131, 0f3F167918, f130; +add.f32 f133, f132, f128; +sub.f32 f134, f128, f132; +mul.f32 f135, f105, 0f3F4F1BBD; +sub.f32 f136, %53, f135; +fma.rn.f32 f137, f107, 0f3E9E377A, f136; +mul.f32 f138, f129, 0f3F167918; +mul.f32 f139, f131, 0f3F737871; +sub.f32 f140, f138, f139; +add.f32 f141, f140, f137; +sub.f32 f142, f137, f140; +add.f32 f143, %68, %108; +add.f32 f144, %54, f143; +add.f32 f145, %81, %94; +add.f32 f146, f145, f144; +add.f32 f147, %69, %109; +add.f32 f148, %56, f147; +add.f32 f149, %83, %96; +add.f32 f150, f149, f148; +fma.rn.f32 f151, f143, 
0f3E9E377A, %54; +mul.f32 f152, f145, 0f3F4F1BBD; +sub.f32 f153, f151, f152; +sub.f32 f154, %69, %109; +mul.f32 f155, f154, 0f3F737871; +sub.f32 f156, %83, %96; +fma.rn.f32 f157, f156, 0f3F167918, f155; +sub.f32 f158, f153, f157; +add.f32 f159, f157, f153; +mul.f32 f160, f143, 0f3F4F1BBD; +sub.f32 f161, %54, f160; +fma.rn.f32 f162, f145, 0f3E9E377A, f161; +mul.f32 f163, f154, 0f3F167918; +mul.f32 f164, f156, 0f3F737871; +sub.f32 f165, f163, f164; +sub.f32 f166, f162, f165; +add.f32 f167, f165, f162; +fma.rn.f32 f168, f147, 0f3E9E377A, %56; +mul.f32 f169, f149, 0f3F4F1BBD; +sub.f32 f170, f168, f169; +sub.f32 f171, %68, %108; +mul.f32 f172, f171, 0f3F737871; +sub.f32 f173, %81, %94; +fma.rn.f32 f174, f173, 0f3F167918, f172; +add.f32 f175, f174, f170; +sub.f32 f176, f170, f174; +mul.f32 f177, f147, 0f3F4F1BBD; +sub.f32 f178, %56, f177; +fma.rn.f32 f179, f149, 0f3E9E377A, f178; +mul.f32 f180, f171, 0f3F167918; +mul.f32 f181, f173, 0f3F737871; +sub.f32 f182, f180, f181; +add.f32 f183, f182, f179; +sub.f32 f184, f179, f182; +add.f32 f185, %70, %110; +add.f32 f186, %57, f185; +add.f32 f187, %84, %97; +add.f32 f188, f187, f186; +add.f32 f189, %72, %112; +add.f32 f190, %59, f189; +add.f32 f191, %85, %99; +add.f32 f192, f191, f190; +fma.rn.f32 f193, f185, 0f3E9E377A, %57; +mul.f32 f194, f187, 0f3F4F1BBD; +sub.f32 f195, f193, f194; +sub.f32 f196, %72, %112; +mul.f32 f197, f196, 0f3F737871; +sub.f32 f198, %85, %99; +fma.rn.f32 f199, f198, 0f3F167918, f197; +sub.f32 f200, f195, f199; +add.f32 f201, f199, f195; +mul.f32 f202, f185, 0f3F4F1BBD; +sub.f32 f203, %57, f202; +fma.rn.f32 f204, f187, 0f3E9E377A, f203; +mul.f32 f205, f196, 0f3F167918; +mul.f32 f206, f198, 0f3F737871; +sub.f32 f207, f205, f206; +sub.f32 f208, f204, f207; +add.f32 f209, f207, f204; +fma.rn.f32 f210, f189, 0f3E9E377A, %59; +mul.f32 f211, f191, 0f3F4F1BBD; +sub.f32 f212, f210, f211; +sub.f32 f213, %70, %110; +mul.f32 f214, f213, 0f3F737871; +sub.f32 f215, %84, %97; +fma.rn.f32 f216, f215, 0f3F167918, f214; 
+add.f32 f217, f216, f212; +sub.f32 f218, f212, f216; +mul.f32 f219, f189, 0f3F4F1BBD; +sub.f32 f220, %59, f219; +fma.rn.f32 f221, f191, 0f3E9E377A, f220; +mul.f32 f222, f213, 0f3F167918; +mul.f32 f223, f215, 0f3F737871; +sub.f32 f224, f222, f223; +add.f32 f225, f224, f221; +sub.f32 f226, f221, f224; +add.f32 f227, %73, %113; +add.f32 f228, %60, f227; +add.f32 f229, %86, %100; +add.f32 f230, f229, f228; +add.f32 f231, %75, %115; +add.f32 f232, %61, f231; +add.f32 f233, %88, %101; +add.f32 f234, f233, f232; +fma.rn.f32 f235, f227, 0f3E9E377A, %60; +mul.f32 f236, f229, 0f3F4F1BBD; +sub.f32 f237, f235, f236; +sub.f32 f238, %75, %115; +mul.f32 f239, f238, 0f3F737871; +sub.f32 f240, %88, %101; +fma.rn.f32 f241, f240, 0f3F167918, f239; +sub.f32 f242, f237, f241; +add.f32 f243, f241, f237; +mul.f32 f244, f227, 0f3F4F1BBD; +sub.f32 f245, %60, f244; +fma.rn.f32 f246, f229, 0f3E9E377A, f245; +mul.f32 f247, f238, 0f3F167918; +mul.f32 f248, f240, 0f3F737871; +sub.f32 f249, f247, f248; +sub.f32 f250, f246, f249; +add.f32 f251, f249, f246; +fma.rn.f32 f252, f231, 0f3E9E377A, %61; +mul.f32 f253, f233, 0f3F4F1BBD; +sub.f32 f254, f252, f253; +sub.f32 f255, %73, %113; +mul.f32 f256, f255, 0f3F737871; +sub.f32 f257, %86, %100; +fma.rn.f32 f258, f257, 0f3F167918, f256; +add.f32 f259, f258, f254; +sub.f32 f260, f254, f258; +mul.f32 f261, f231, 0f3F4F1BBD; +sub.f32 f262, %61, f261; +fma.rn.f32 f263, f233, 0f3E9E377A, f262; +mul.f32 f264, f255, 0f3F167918; +mul.f32 f265, f257, 0f3F737871; +sub.f32 f266, f264, f265; +add.f32 f267, f266, f263; +sub.f32 f268, f263, f266; +add.f32 f269, %76, %116; +add.f32 f270, %62, f269; +add.f32 f271, %89, %102; +add.f32 f272, f271, f270; +add.f32 f273, %77, %117; +add.f32 f274, %64, f273; +add.f32 f275, %91, %104; +add.f32 f276, f275, f274; +fma.rn.f32 f277, f269, 0f3E9E377A, %62; +mul.f32 f278, f271, 0f3F4F1BBD; +sub.f32 f279, f277, f278; +sub.f32 f280, %77, %117; +mul.f32 f281, f280, 0f3F737871; +sub.f32 f282, %91, %104; +fma.rn.f32 f283, f282, 
0f3F167918, f281; +sub.f32 f284, f279, f283; +add.f32 f285, f283, f279; +mul.f32 f286, f269, 0f3F4F1BBD; +sub.f32 f287, %62, f286; +fma.rn.f32 f288, f271, 0f3E9E377A, f287; +mul.f32 f289, f280, 0f3F167918; +mul.f32 f290, f282, 0f3F737871; +sub.f32 f291, f289, f290; +sub.f32 f292, f288, f291; +add.f32 f293, f291, f288; +fma.rn.f32 f294, f273, 0f3E9E377A, %64; +mul.f32 f295, f275, 0f3F4F1BBD; +sub.f32 f296, f294, f295; +sub.f32 f297, %76, %116; +mul.f32 f298, f297, 0f3F737871; +sub.f32 f299, %89, %102; +fma.rn.f32 f300, f299, 0f3F167918, f298; +add.f32 f301, f300, f296; +sub.f32 f302, f296, f300; +mul.f32 f303, f273, 0f3F4F1BBD; +sub.f32 f304, %64, f303; +fma.rn.f32 f305, f275, 0f3E9E377A, f304; +mul.f32 f306, f297, 0f3F167918; +mul.f32 f307, f299, 0f3F737871; +sub.f32 f308, f306, f307; +add.f32 f309, f308, f305; +sub.f32 f310, f305, f308; +mov.u32 r4, %tid.x; +mul.f32 f311, f158, 0f3F77F511; +mul.f32 f312, f175, 0f3E7EA890; +sub.f32 f313, f311, f312; +mul.f32 f314, f175, 0f3F77F511; +fma.rn.f32 f315, f158, 0f3E7EA890, f314; +mul.f32 f316, f200, 0f3F6055A2; +mul.f32 f317, f217, 0f3EF6A86B; +sub.f32 f318, f316, f317; +mul.f32 f319, f217, 0f3F6055A2; +fma.rn.f32 f320, f200, 0f3EF6A86B, f319; +mul.f32 f321, f242, 0f3F3A9DB0; +mul.f32 f322, f259, 0f3F2F3E7B; +sub.f32 f323, f321, f322; +mul.f32 f324, f259, 0f3F3A9DB0; +fma.rn.f32 f325, f242, 0f3F2F3E7B, f324; +mul.f32 f326, f284, 0f3F092BF2; +mul.f32 f327, f301, 0f3F5825E0; +sub.f32 f328, f326, f327; +mul.f32 f329, f301, 0f3F092BF2; +fma.rn.f32 f330, f284, 0f3F5825E0, f329; +mul.f32 f331, f166, 0f3F6055A2; +mul.f32 f332, f183, 0f3EF6A86B; +sub.f32 f333, f331, f332; +mul.f32 f334, f183, 0f3F6055A2; +fma.rn.f32 f335, f166, 0f3EF6A86B, f334; +mul.f32 f336, f208, 0f3F092BF2; +mul.f32 f337, f225, 0f3F5825E0; +sub.f32 f338, f336, f337; +mul.f32 f339, f225, 0f3F092BF2; +fma.rn.f32 f340, f208, 0f3F5825E0, f339; +mul.f32 f341, f250, 0f3D809851; +mul.f32 f342, f267, 0f3F7F7EAE; +sub.f32 f343, f341, f342; +mul.f32 f344, f267, 
0f3D809851; +fma.rn.f32 f345, f250, 0f3F7F7EAE, f344; +mul.f32 f346, f292, 0fBED9FFBE; +mul.f32 f347, f309, 0f3F67A2BF; +sub.f32 f348, f346, f347; +mul.f32 f349, f309, 0fBED9FFBE; +fma.rn.f32 f350, f292, 0f3F67A2BF, f349; +mul.f32 f351, f167, 0f3F3A9DB0; +mul.f32 f352, f184, 0f3F2F3E7B; +sub.f32 f353, f351, f352; +mul.f32 f354, f184, 0f3F3A9DB0; +fma.rn.f32 f355, f167, 0f3F2F3E7B, f354; +mul.f32 f356, f209, 0f3D809851; +mul.f32 f357, f226, 0f3F7F7EAE; +sub.f32 f358, f356, f357; +mul.f32 f359, f226, 0f3D809851; +fma.rn.f32 f360, f209, 0f3F7F7EAE, f359; +mul.f32 f361, f251, 0fBF232E38; +mul.f32 f362, f268, 0f3F45405B; +sub.f32 f363, f361, f362; +mul.f32 f364, f268, 0fBF232E38; +fma.rn.f32 f365, f251, 0f3F45405B, f364; +mul.f32 f366, f293, 0fBF7DFB3B; +mul.f32 f367, f310, 0f3E00575B; +sub.f32 f368, f366, f367; +mul.f32 f369, f310, 0fBF7DFB3B; +fma.rn.f32 f370, f293, 0f3E00575B, f369; +mul.f32 f371, f159, 0f3F092BF2; +mul.f32 f372, f176, 0f3F5825E0; +sub.f32 f373, f371, f372; +mul.f32 f374, f176, 0f3F092BF2; +fma.rn.f32 f375, f159, 0f3F5825E0, f374; +mul.f32 f376, f201, 0fBED9FFBE; +mul.f32 f377, f218, 0f3F67A2BF; +sub.f32 f378, f376, f377; +mul.f32 f379, f218, 0fBED9FFBE; +fma.rn.f32 f380, f201, 0f3F67A2BF, f379; +mul.f32 f381, f243, 0fBF7DFB3B; +mul.f32 f382, f260, 0f3E00575B; +sub.f32 f383, f381, f382; +mul.f32 f384, f260, 0fBF7DFB3B; +fma.rn.f32 f385, f243, 0f3E00575B, f384; +mul.f32 f386, f285, 0fBF232E38; +mul.f32 f387, f302, 0fBF45405B; +sub.f32 f388, f386, f387; +mul.f32 f389, f302, 0fBF232E38; +fma.rn.f32 f390, f285, 0fBF45405B, f389; +add.f32 f391, f146, f272; +add.f32 f392, f104, f391; +add.f32 f393, f188, f230; +add.f32 f394, f393, f392; +add.f32 f395, f150, f276; +add.f32 f396, f108, f395; +add.f32 f397, f192, f234; +add.f32 f398, f397, f396; +fma.rn.f32 f399, f391, 0f3E9E377A, f104; +mul.f32 f400, f393, 0f3F4F1BBD; +sub.f32 f401, f399, f400; +sub.f32 f402, f150, f276; +mul.f32 f403, f402, 0f3F737871; +sub.f32 f404, f192, f234; +fma.rn.f32 f405, f404, 
0f3F167918, f403; +sub.f32 f406, f401, f405; +add.f32 f407, f405, f401; +mul.f32 f408, f391, 0f3F4F1BBD; +sub.f32 f409, f104, f408; +fma.rn.f32 f410, f393, 0f3E9E377A, f409; +mul.f32 f411, f402, 0f3F167918; +mul.f32 f412, f404, 0f3F737871; +sub.f32 f413, f411, f412; +sub.f32 f414, f410, f413; +add.f32 f415, f413, f410; +fma.rn.f32 f416, f395, 0f3E9E377A, f108; +mul.f32 f417, f397, 0f3F4F1BBD; +sub.f32 f418, f416, f417; +sub.f32 f419, f146, f272; +mul.f32 f420, f419, 0f3F737871; +sub.f32 f421, f188, f230; +fma.rn.f32 f422, f421, 0f3F167918, f420; +add.f32 f423, f422, f418; +sub.f32 f424, f418, f422; +mul.f32 f425, f395, 0f3F4F1BBD; +sub.f32 f426, f108, f425; +fma.rn.f32 f427, f397, 0f3E9E377A, f426; +mul.f32 f428, f419, 0f3F167918; +mul.f32 f429, f421, 0f3F737871; +sub.f32 f430, f428, f429; +add.f32 f431, f430, f427; +sub.f32 f432, f427, f430; +add.f32 f433, f313, f328; +add.f32 f434, f116, f433; +add.f32 f435, f318, f323; +add.f32 f436, f435, f434; +add.f32 f437, f315, f330; +add.f32 f438, f133, f437; +add.f32 f439, f320, f325; +add.f32 f440, f439, f438; +fma.rn.f32 f441, f433, 0f3E9E377A, f116; +mul.f32 f442, f435, 0f3F4F1BBD; +sub.f32 f443, f441, f442; +sub.f32 f444, f315, f330; +mul.f32 f445, f444, 0f3F737871; +sub.f32 f446, f320, f325; +fma.rn.f32 f447, f446, 0f3F167918, f445; +sub.f32 f448, f443, f447; +add.f32 f449, f447, f443; +mul.f32 f450, f433, 0f3F4F1BBD; +sub.f32 f451, f116, f450; +fma.rn.f32 f452, f435, 0f3E9E377A, f451; +mul.f32 f453, f444, 0f3F167918; +mul.f32 f454, f446, 0f3F737871; +sub.f32 f455, f453, f454; +sub.f32 f456, f452, f455; +add.f32 f457, f455, f452; +fma.rn.f32 f458, f437, 0f3E9E377A, f133; +mul.f32 f459, f439, 0f3F4F1BBD; +sub.f32 f460, f458, f459; +sub.f32 f461, f313, f328; +mul.f32 f462, f461, 0f3F737871; +sub.f32 f463, f318, f323; +fma.rn.f32 f464, f463, 0f3F167918, f462; +add.f32 f465, f464, f460; +sub.f32 f466, f460, f464; +mul.f32 f467, f437, 0f3F4F1BBD; +sub.f32 f468, f133, f467; +fma.rn.f32 f469, f439, 0f3E9E377A, f468; 
+mul.f32 f470, f461, 0f3F167918; +mul.f32 f471, f463, 0f3F737871; +sub.f32 f472, f470, f471; +add.f32 f473, f472, f469; +sub.f32 f474, f469, f472; +add.f32 f475, f333, f348; +add.f32 f476, f124, f475; +add.f32 f477, f338, f343; +add.f32 f478, f477, f476; +add.f32 f479, f335, f350; +add.f32 f480, f141, f479; +add.f32 f481, f340, f345; +add.f32 f482, f481, f480; +fma.rn.f32 f483, f475, 0f3E9E377A, f124; +mul.f32 f484, f477, 0f3F4F1BBD; +sub.f32 f485, f483, f484; +sub.f32 f486, f335, f350; +mul.f32 f487, f486, 0f3F737871; +sub.f32 f488, f340, f345; +fma.rn.f32 f489, f488, 0f3F167918, f487; +sub.f32 f490, f485, f489; +add.f32 f491, f489, f485; +mul.f32 f492, f475, 0f3F4F1BBD; +sub.f32 f493, f124, f492; +fma.rn.f32 f494, f477, 0f3E9E377A, f493; +mul.f32 f495, f486, 0f3F167918; +mul.f32 f496, f488, 0f3F737871; +sub.f32 f497, f495, f496; +sub.f32 f498, f494, f497; +add.f32 f499, f497, f494; +fma.rn.f32 f500, f479, 0f3E9E377A, f141; +mul.f32 f501, f481, 0f3F4F1BBD; +sub.f32 f502, f500, f501; +sub.f32 f503, f333, f348; +mul.f32 f504, f503, 0f3F737871; +sub.f32 f505, f338, f343; +fma.rn.f32 f506, f505, 0f3F167918, f504; +add.f32 f507, f506, f502; +sub.f32 f508, f502, f506; +mul.f32 f509, f479, 0f3F4F1BBD; +sub.f32 f510, f141, f509; +fma.rn.f32 f511, f481, 0f3E9E377A, f510; +mul.f32 f512, f503, 0f3F167918; +mul.f32 f513, f505, 0f3F737871; +sub.f32 f514, f512, f513; +add.f32 f515, f514, f511; +sub.f32 f516, f511, f514; +add.f32 f517, f353, f368; +add.f32 f518, f125, f517; +add.f32 f519, f358, f363; +add.f32 f520, f519, f518; +add.f32 f521, f355, f370; +add.f32 f522, f142, f521; +add.f32 f523, f360, f365; +add.f32 f524, f523, f522; +fma.rn.f32 f525, f517, 0f3E9E377A, f125; +mul.f32 f526, f519, 0f3F4F1BBD; +sub.f32 f527, f525, f526; +sub.f32 f528, f355, f370; +mul.f32 f529, f528, 0f3F737871; +sub.f32 f530, f360, f365; +fma.rn.f32 f531, f530, 0f3F167918, f529; +sub.f32 f532, f527, f531; +add.f32 f533, f531, f527; +mul.f32 f534, f517, 0f3F4F1BBD; +sub.f32 f535, f125, f534; 
+fma.rn.f32 f536, f519, 0f3E9E377A, f535; +mul.f32 f537, f528, 0f3F167918; +mul.f32 f538, f530, 0f3F737871; +sub.f32 f539, f537, f538; +sub.f32 f540, f536, f539; +add.f32 f541, f539, f536; +fma.rn.f32 f542, f521, 0f3E9E377A, f142; +mul.f32 f543, f523, 0f3F4F1BBD; +sub.f32 f544, f542, f543; +sub.f32 f545, f353, f368; +mul.f32 f546, f545, 0f3F737871; +sub.f32 f547, f358, f363; +fma.rn.f32 f548, f547, 0f3F167918, f546; +add.f32 f549, f548, f544; +sub.f32 f550, f544, f548; +mul.f32 f551, f521, 0f3F4F1BBD; +sub.f32 f552, f142, f551; +fma.rn.f32 f553, f523, 0f3E9E377A, f552; +mul.f32 f554, f545, 0f3F167918; +mul.f32 f555, f547, 0f3F737871; +sub.f32 f556, f554, f555; +add.f32 f557, f556, f553; +sub.f32 f558, f553, f556; +add.f32 f559, f373, f388; +add.f32 f560, f117, f559; +add.f32 f561, f378, f383; +add.f32 f562, f561, f560; +add.f32 f563, f375, f390; +add.f32 f564, f134, f563; +add.f32 f565, f380, f385; +add.f32 f566, f565, f564; +fma.rn.f32 f567, f559, 0f3E9E377A, f117; +mul.f32 f568, f561, 0f3F4F1BBD; +sub.f32 f569, f567, f568; +sub.f32 f570, f375, f390; +mul.f32 f571, f570, 0f3F737871; +sub.f32 f572, f380, f385; +fma.rn.f32 f573, f572, 0f3F167918, f571; +sub.f32 f574, f569, f573; +add.f32 f575, f573, f569; +mul.f32 f576, f559, 0f3F4F1BBD; +sub.f32 f577, f117, f576; +fma.rn.f32 f578, f561, 0f3E9E377A, f577; +mul.f32 f579, f570, 0f3F167918; +mul.f32 f580, f572, 0f3F737871; +sub.f32 f581, f579, f580; +sub.f32 f582, f578, f581; +add.f32 f583, f581, f578; +fma.rn.f32 f584, f563, 0f3E9E377A, f134; +mul.f32 f585, f565, 0f3F4F1BBD; +sub.f32 f586, f584, f585; +sub.f32 f587, f373, f388; +mul.f32 f588, f587, 0f3F737871; +sub.f32 f589, f378, f383; +fma.rn.f32 f590, f589, 0f3F167918, f588; +add.f32 f591, f590, f586; +sub.f32 f592, f586, f590; +mul.f32 f593, f563, 0f3F4F1BBD; +sub.f32 f594, f134, f593; +fma.rn.f32 f595, f565, 0f3E9E377A, f594; +mul.f32 f596, f587, 0f3F167918; +mul.f32 f597, f589, 0f3F737871; +sub.f32 f598, f596, f597; +add.f32 f599, f598, f595; +sub.f32 f600, 
f595, f598; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f601, f602}, [rd6]; +mul.f32 f605, f440, f602; +fma.rn.f32 f606, f601, f436, f605; +mul.f32 f607, f436, f602; +mul.f32 f608, f601, f440; +sub.f32 f609, f608, f607; +mul.f32 f610, f601, f601; +mul.f32 f611, f602, f602; +sub.f32 f612, f610, f611; +mul.f32 f613, f602, f601; +fma.rn.f32 f614, f602, f601, f613; +mul.f32 f615, f482, f614; +fma.rn.f32 f616, f612, f478, f615; +mul.f32 f617, f478, f614; +mul.f32 f618, f612, f482; +sub.f32 f619, f618, f617; +mul.f32 f620, f601, f612; +mul.f32 f621, f602, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f601, f614; +fma.rn.f32 f624, f602, f612, f623; +mul.f32 f625, f524, f624; +fma.rn.f32 f626, f622, f520, f625; +mul.f32 f627, f520, f624; +mul.f32 f628, f622, f524; +sub.f32 f629, f628, f627; +mul.f32 f630, f601, f622; +mul.f32 f631, f602, f624; +sub.f32 f632, f630, f631; +mul.f32 f633, f601, f624; +fma.rn.f32 f634, f602, f622, f633; +mul.f32 f635, f566, f634; +fma.rn.f32 f636, f632, f562, f635; +mul.f32 f637, f562, f634; +mul.f32 f638, f632, f566; +sub.f32 f639, f638, f637; +mul.f32 f640, f601, f632; +mul.f32 f641, f602, f634; +sub.f32 f642, f640, f641; +mul.f32 f643, f601, f634; +fma.rn.f32 f644, f602, f632, f643; +mul.f32 f645, f423, f644; +fma.rn.f32 f646, f642, f406, f645; +mul.f32 f647, f406, f644; +mul.f32 f648, f642, f423; +sub.f32 f649, f648, f647; +mul.f32 f650, f601, f642; +mul.f32 f651, f602, f644; +sub.f32 f652, f650, f651; +mul.f32 f653, f601, f644; +fma.rn.f32 f654, f602, f642, f653; +mul.f32 f655, f465, f654; +fma.rn.f32 f656, f652, f448, f655; +mul.f32 f657, f448, f654; +mul.f32 f658, f652, f465; +sub.f32 f659, f658, f657; +mul.f32 f660, f601, f652; +mul.f32 f661, f602, f654; +sub.f32 f662, f660, f661; +mul.f32 f663, f601, f654; +fma.rn.f32 f664, f602, f652, f663; +mul.f32 f665, f507, f664; 
+fma.rn.f32 f666, f662, f490, f665; +mul.f32 f667, f490, f664; +mul.f32 f668, f662, f507; +sub.f32 f669, f668, f667; +mul.f32 f670, f601, f662; +mul.f32 f671, f602, f664; +sub.f32 f672, f670, f671; +mul.f32 f673, f601, f664; +fma.rn.f32 f674, f602, f662, f673; +mul.f32 f675, f549, f674; +fma.rn.f32 f676, f672, f532, f675; +mul.f32 f677, f532, f674; +mul.f32 f678, f672, f549; +sub.f32 f679, f678, f677; +mul.f32 f680, f601, f672; +mul.f32 f681, f602, f674; +sub.f32 f682, f680, f681; +mul.f32 f683, f601, f674; +fma.rn.f32 f684, f602, f672, f683; +mul.f32 f685, f591, f684; +fma.rn.f32 f686, f682, f574, f685; +mul.f32 f687, f574, f684; +mul.f32 f688, f682, f591; +sub.f32 f689, f688, f687; +mul.f32 f690, f601, f682; +mul.f32 f691, f602, f684; +sub.f32 f692, f690, f691; +mul.f32 f693, f601, f684; +fma.rn.f32 f694, f602, f682, f693; +mul.f32 f695, f431, f694; +fma.rn.f32 f696, f692, f414, f695; +mul.f32 f697, f414, f694; +mul.f32 f698, f692, f431; +sub.f32 f699, f698, f697; +mul.f32 f700, f601, f692; +mul.f32 f701, f602, f694; +sub.f32 f702, f700, f701; +mul.f32 f703, f601, f694; +fma.rn.f32 f704, f602, f692, f703; +mul.f32 f705, f473, f704; +fma.rn.f32 f706, f702, f456, f705; +mul.f32 f707, f456, f704; +mul.f32 f708, f702, f473; +sub.f32 f709, f708, f707; +mul.f32 f710, f601, f702; +mul.f32 f711, f602, f704; +sub.f32 f712, f710, f711; +mul.f32 f713, f601, f704; +fma.rn.f32 f714, f602, f702, f713; +mul.f32 f715, f515, f714; +fma.rn.f32 f716, f712, f498, f715; +mul.f32 f717, f498, f714; +mul.f32 f718, f712, f515; +sub.f32 f719, f718, f717; +mul.f32 f720, f601, f712; +mul.f32 f721, f602, f714; +sub.f32 f722, f720, f721; +mul.f32 f723, f601, f714; +fma.rn.f32 f724, f602, f712, f723; +mul.f32 f725, f557, f724; +fma.rn.f32 f726, f722, f540, f725; +mul.f32 f727, f540, f724; +mul.f32 f728, f722, f557; +sub.f32 f729, f728, f727; +mul.f32 f730, f601, f722; +mul.f32 f731, f602, f724; +sub.f32 f732, f730, f731; +mul.f32 f733, f601, f724; +fma.rn.f32 f734, f602, f722, f733; +mul.f32 
f735, f599, f734; +fma.rn.f32 f736, f732, f582, f735; +mul.f32 f737, f582, f734; +mul.f32 f738, f732, f599; +sub.f32 f739, f738, f737; +mul.f32 f740, f601, f732; +mul.f32 f741, f602, f734; +sub.f32 f742, f740, f741; +mul.f32 f743, f601, f734; +fma.rn.f32 f744, f602, f732, f743; +mul.f32 f745, f432, f744; +fma.rn.f32 f746, f742, f415, f745; +mul.f32 f747, f415, f744; +mul.f32 f748, f742, f432; +sub.f32 f749, f748, f747; +mul.f32 f750, f601, f742; +mul.f32 f751, f602, f744; +sub.f32 f752, f750, f751; +mul.f32 f753, f601, f744; +fma.rn.f32 f754, f602, f742, f753; +mul.f32 f755, f474, f754; +fma.rn.f32 f756, f752, f457, f755; +mul.f32 f757, f457, f754; +mul.f32 f758, f752, f474; +sub.f32 f759, f758, f757; +mul.f32 f760, f601, f752; +mul.f32 f761, f602, f754; +sub.f32 f762, f760, f761; +mul.f32 f763, f601, f754; +fma.rn.f32 f764, f602, f752, f763; +mul.f32 f765, f516, f764; +fma.rn.f32 f766, f762, f499, f765; +mul.f32 f767, f499, f764; +mul.f32 f768, f762, f516; +sub.f32 f769, f768, f767; +mul.f32 f770, f601, f762; +mul.f32 f771, f602, f764; +sub.f32 f772, f770, f771; +mul.f32 f773, f601, f764; +fma.rn.f32 f774, f602, f762, f773; +mul.f32 f775, f558, f774; +fma.rn.f32 f776, f772, f541, f775; +mul.f32 f777, f541, f774; +mul.f32 f778, f772, f558; +sub.f32 f779, f778, f777; +mul.f32 f780, f601, f772; +mul.f32 f781, f602, f774; +sub.f32 f782, f780, f781; +mul.f32 f783, f601, f774; +fma.rn.f32 f784, f602, f772, f783; +mul.f32 f785, f600, f784; +fma.rn.f32 f786, f782, f583, f785; +mul.f32 f787, f583, f784; +mul.f32 f788, f782, f600; +sub.f32 f789, f788, f787; +mul.f32 f790, f601, f782; +mul.f32 f791, f602, f784; +sub.f32 f792, f790, f791; +mul.f32 f793, f601, f784; +fma.rn.f32 f794, f602, f782, f793; +mul.f32 f795, f424, f794; +fma.rn.f32 f796, f792, f407, f795; +mul.f32 f797, f407, f794; +mul.f32 f798, f792, f424; +sub.f32 f799, f798, f797; +mul.f32 f800, f601, f792; +mul.f32 f801, f602, f794; +sub.f32 f802, f800, f801; +mul.f32 f803, f601, f794; +fma.rn.f32 f804, f602, 
f792, f803; +mul.f32 f805, f466, f804; +fma.rn.f32 f806, f802, f449, f805; +mul.f32 f807, f449, f804; +mul.f32 f808, f802, f466; +sub.f32 f809, f808, f807; +mul.f32 f810, f601, f802; +mul.f32 f811, f602, f804; +sub.f32 f812, f810, f811; +mul.f32 f813, f601, f804; +fma.rn.f32 f814, f602, f802, f813; +mul.f32 f815, f508, f814; +fma.rn.f32 f816, f812, f491, f815; +mul.f32 f817, f491, f814; +mul.f32 f818, f812, f508; +sub.f32 f819, f818, f817; +mul.f32 f820, f601, f812; +mul.f32 f821, f602, f814; +sub.f32 f822, f820, f821; +mul.f32 f823, f601, f814; +fma.rn.f32 f824, f602, f812, f823; +mul.f32 f825, f550, f824; +fma.rn.f32 f826, f822, f533, f825; +mul.f32 f827, f533, f824; +mul.f32 f828, f822, f550; +sub.f32 f829, f828, f827; +mul.f32 f830, f601, f822; +mul.f32 f831, f602, f824; +sub.f32 f832, f830, f831; +mul.f32 f833, f601, f824; +fma.rn.f32 f834, f602, f822, f833; +mul.f32 f835, f592, f834; +fma.rn.f32 f836, f832, f575, f835; +mul.f32 f837, f575, f834; +mul.f32 f838, f832, f592; +sub.f32 f839, f838, f837; +mad.lo.s32 r8, r5, 2500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 100, r8; +st.shared.f32 [r9], f394; +st.shared.f32 [r9+4], f606; +st.shared.f32 [r9+8], f616; +st.shared.f32 [r9+12], f626; +st.shared.f32 [r9+16], f636; +st.shared.f32 [r9+20], f646; +st.shared.f32 [r9+24], f656; +st.shared.f32 [r9+28], f666; +st.shared.f32 [r9+32], f676; +st.shared.f32 [r9+36], f686; +st.shared.f32 [r9+40], f696; +st.shared.f32 [r9+44], f706; +st.shared.f32 [r9+48], f716; +st.shared.f32 [r9+52], f726; +st.shared.f32 [r9+56], f736; +st.shared.f32 [r9+60], f746; +st.shared.f32 [r9+64], f756; +st.shared.f32 [r9+68], f766; +st.shared.f32 [r9+72], f776; +st.shared.f32 [r9+76], f786; +st.shared.f32 [r9+80], f796; +st.shared.f32 [r9+84], f806; +st.shared.f32 [r9+88], f816; +st.shared.f32 [r9+92], f826; +st.shared.f32 [r9+96], f836; +barrier.sync 0; +mad.lo.s32 r10, r7, -96, r9; +ld.shared.f32 f840, [r10]; +ld.shared.f32 f841, [r10+100]; +ld.shared.f32 f842, [r10+200]; +ld.shared.f32 
f843, [r10+300]; +ld.shared.f32 f844, [r10+400]; +ld.shared.f32 f845, [r10+500]; +ld.shared.f32 f846, [r10+600]; +ld.shared.f32 f847, [r10+700]; +ld.shared.f32 f848, [r10+800]; +ld.shared.f32 f849, [r10+900]; +ld.shared.f32 f850, [r10+1000]; +ld.shared.f32 f851, [r10+1100]; +ld.shared.f32 f852, [r10+1200]; +ld.shared.f32 f853, [r10+1300]; +ld.shared.f32 f854, [r10+1400]; +ld.shared.f32 f855, [r10+1500]; +ld.shared.f32 f856, [r10+1600]; +ld.shared.f32 f857, [r10+1700]; +ld.shared.f32 f858, [r10+1800]; +ld.shared.f32 f859, [r10+1900]; +ld.shared.f32 f860, [r10+2000]; +ld.shared.f32 f861, [r10+2100]; +ld.shared.f32 f862, [r10+2200]; +ld.shared.f32 f863, [r10+2300]; +ld.shared.f32 f864, [r10+2400]; +barrier.sync 0; +st.shared.f32 [r9], f398; +st.shared.f32 [r9+4], f609; +st.shared.f32 [r9+8], f619; +st.shared.f32 [r9+12], f629; +st.shared.f32 [r9+16], f639; +st.shared.f32 [r9+20], f649; +st.shared.f32 [r9+24], f659; +st.shared.f32 [r9+28], f669; +st.shared.f32 [r9+32], f679; +st.shared.f32 [r9+36], f689; +st.shared.f32 [r9+40], f699; +st.shared.f32 [r9+44], f709; +st.shared.f32 [r9+48], f719; +st.shared.f32 [r9+52], f729; +st.shared.f32 [r9+56], f739; +st.shared.f32 [r9+60], f749; +st.shared.f32 [r9+64], f759; +st.shared.f32 [r9+68], f769; +st.shared.f32 [r9+72], f779; +st.shared.f32 [r9+76], f789; +st.shared.f32 [r9+80], f799; +st.shared.f32 [r9+84], f809; +st.shared.f32 [r9+88], f819; +st.shared.f32 [r9+92], f829; +st.shared.f32 [r9+96], f839; +barrier.sync 0; +ld.shared.f32 f865, [r10]; +ld.shared.f32 f866, [r10+100]; +ld.shared.f32 f867, [r10+200]; +ld.shared.f32 f868, [r10+300]; +ld.shared.f32 f869, [r10+400]; +ld.shared.f32 f870, [r10+500]; +ld.shared.f32 f871, [r10+600]; +ld.shared.f32 f872, [r10+700]; +ld.shared.f32 f873, [r10+800]; +ld.shared.f32 f874, [r10+900]; +ld.shared.f32 f875, [r10+1000]; +ld.shared.f32 f876, [r10+1100]; +ld.shared.f32 f877, [r10+1200]; +ld.shared.f32 f878, [r10+1300]; +ld.shared.f32 f879, [r10+1400]; +ld.shared.f32 f880, [r10+1500]; 
+ld.shared.f32 f881, [r10+1600]; +ld.shared.f32 f882, [r10+1700]; +ld.shared.f32 f883, [r10+1800]; +ld.shared.f32 f884, [r10+1900]; +ld.shared.f32 f885, [r10+2000]; +ld.shared.f32 f886, [r10+2100]; +ld.shared.f32 f887, [r10+2200]; +ld.shared.f32 f888, [r10+2300]; +ld.shared.f32 f889, [r10+2400]; +add.f32 f890, f845, f860; +add.f32 f891, f840, f890; +add.f32 f892, f850, f855; +add.f32 f893, f892, f891; +add.f32 f894, f870, f885; +add.f32 f895, f865, f894; +add.f32 f896, f875, f880; +add.f32 f897, f896, f895; +fma.rn.f32 f898, f890, 0f3E9E377A, f840; +mul.f32 f899, f892, 0f3F4F1BBD; +sub.f32 f900, f898, f899; +sub.f32 f901, f870, f885; +mul.f32 f902, f901, 0f3F737871; +sub.f32 f903, f875, f880; +fma.rn.f32 f904, f903, 0f3F167918, f902; +sub.f32 f905, f900, f904; +add.f32 f906, f904, f900; +mul.f32 f907, f890, 0f3F4F1BBD; +sub.f32 f908, f840, f907; +fma.rn.f32 f909, f892, 0f3E9E377A, f908; +mul.f32 f910, f901, 0f3F167918; +mul.f32 f911, f903, 0f3F737871; +sub.f32 f912, f910, f911; +sub.f32 f913, f909, f912; +add.f32 f914, f912, f909; +fma.rn.f32 f915, f894, 0f3E9E377A, f865; +mul.f32 f916, f896, 0f3F4F1BBD; +sub.f32 f917, f915, f916; +sub.f32 f918, f845, f860; +mul.f32 f919, f918, 0f3F737871; +sub.f32 f920, f850, f855; +fma.rn.f32 f921, f920, 0f3F167918, f919; +add.f32 f922, f921, f917; +sub.f32 f923, f917, f921; +mul.f32 f924, f894, 0f3F4F1BBD; +sub.f32 f925, f865, f924; +fma.rn.f32 f926, f896, 0f3E9E377A, f925; +mul.f32 f927, f918, 0f3F167918; +mul.f32 f928, f920, 0f3F737871; +sub.f32 f929, f927, f928; +add.f32 f930, f929, f926; +sub.f32 f931, f926, f929; +add.f32 f932, f846, f861; +add.f32 f933, f841, f932; +add.f32 f934, f851, f856; +add.f32 f935, f934, f933; +add.f32 f936, f871, f886; +add.f32 f937, f866, f936; +add.f32 f938, f876, f881; +add.f32 f939, f938, f937; +fma.rn.f32 f940, f932, 0f3E9E377A, f841; +mul.f32 f941, f934, 0f3F4F1BBD; +sub.f32 f942, f940, f941; +sub.f32 f943, f871, f886; +mul.f32 f944, f943, 0f3F737871; +sub.f32 f945, f876, f881; +fma.rn.f32 
f946, f945, 0f3F167918, f944; +sub.f32 f947, f942, f946; +add.f32 f948, f946, f942; +mul.f32 f949, f932, 0f3F4F1BBD; +sub.f32 f950, f841, f949; +fma.rn.f32 f951, f934, 0f3E9E377A, f950; +mul.f32 f952, f943, 0f3F167918; +mul.f32 f953, f945, 0f3F737871; +sub.f32 f954, f952, f953; +sub.f32 f955, f951, f954; +add.f32 f956, f954, f951; +fma.rn.f32 f957, f936, 0f3E9E377A, f866; +mul.f32 f958, f938, 0f3F4F1BBD; +sub.f32 f959, f957, f958; +sub.f32 f960, f846, f861; +mul.f32 f961, f960, 0f3F737871; +sub.f32 f962, f851, f856; +fma.rn.f32 f963, f962, 0f3F167918, f961; +add.f32 f964, f963, f959; +sub.f32 f965, f959, f963; +mul.f32 f966, f936, 0f3F4F1BBD; +sub.f32 f967, f866, f966; +fma.rn.f32 f968, f938, 0f3E9E377A, f967; +mul.f32 f969, f960, 0f3F167918; +mul.f32 f970, f962, 0f3F737871; +sub.f32 f971, f969, f970; +add.f32 f972, f971, f968; +sub.f32 f973, f968, f971; +add.f32 f974, f847, f862; +add.f32 f975, f842, f974; +add.f32 f976, f852, f857; +add.f32 f977, f976, f975; +add.f32 f978, f872, f887; +add.f32 f979, f867, f978; +add.f32 f980, f877, f882; +add.f32 f981, f980, f979; +fma.rn.f32 f982, f974, 0f3E9E377A, f842; +mul.f32 f983, f976, 0f3F4F1BBD; +sub.f32 f984, f982, f983; +sub.f32 f985, f872, f887; +mul.f32 f986, f985, 0f3F737871; +sub.f32 f987, f877, f882; +fma.rn.f32 f988, f987, 0f3F167918, f986; +sub.f32 f989, f984, f988; +add.f32 f990, f988, f984; +mul.f32 f991, f974, 0f3F4F1BBD; +sub.f32 f992, f842, f991; +fma.rn.f32 f993, f976, 0f3E9E377A, f992; +mul.f32 f994, f985, 0f3F167918; +mul.f32 f995, f987, 0f3F737871; +sub.f32 f996, f994, f995; +sub.f32 f997, f993, f996; +add.f32 f998, f996, f993; +fma.rn.f32 f999, f978, 0f3E9E377A, f867; +mul.f32 f1000, f980, 0f3F4F1BBD; +sub.f32 f1001, f999, f1000; +sub.f32 f1002, f847, f862; +mul.f32 f1003, f1002, 0f3F737871; +sub.f32 f1004, f852, f857; +fma.rn.f32 f1005, f1004, 0f3F167918, f1003; +add.f32 f1006, f1005, f1001; +sub.f32 f1007, f1001, f1005; +mul.f32 f1008, f978, 0f3F4F1BBD; +sub.f32 f1009, f867, f1008; +fma.rn.f32 f1010, 
f980, 0f3E9E377A, f1009; +mul.f32 f1011, f1002, 0f3F167918; +mul.f32 f1012, f1004, 0f3F737871; +sub.f32 f1013, f1011, f1012; +add.f32 f1014, f1013, f1010; +sub.f32 f1015, f1010, f1013; +add.f32 f1016, f848, f863; +add.f32 f1017, f843, f1016; +add.f32 f1018, f853, f858; +add.f32 f1019, f1018, f1017; +add.f32 f1020, f873, f888; +add.f32 f1021, f868, f1020; +add.f32 f1022, f878, f883; +add.f32 f1023, f1022, f1021; +fma.rn.f32 f1024, f1016, 0f3E9E377A, f843; +mul.f32 f1025, f1018, 0f3F4F1BBD; +sub.f32 f1026, f1024, f1025; +sub.f32 f1027, f873, f888; +mul.f32 f1028, f1027, 0f3F737871; +sub.f32 f1029, f878, f883; +fma.rn.f32 f1030, f1029, 0f3F167918, f1028; +sub.f32 f1031, f1026, f1030; +add.f32 f1032, f1030, f1026; +mul.f32 f1033, f1016, 0f3F4F1BBD; +sub.f32 f1034, f843, f1033; +fma.rn.f32 f1035, f1018, 0f3E9E377A, f1034; +mul.f32 f1036, f1027, 0f3F167918; +mul.f32 f1037, f1029, 0f3F737871; +sub.f32 f1038, f1036, f1037; +sub.f32 f1039, f1035, f1038; +add.f32 f1040, f1038, f1035; +fma.rn.f32 f1041, f1020, 0f3E9E377A, f868; +mul.f32 f1042, f1022, 0f3F4F1BBD; +sub.f32 f1043, f1041, f1042; +sub.f32 f1044, f848, f863; +mul.f32 f1045, f1044, 0f3F737871; +sub.f32 f1046, f853, f858; +fma.rn.f32 f1047, f1046, 0f3F167918, f1045; +add.f32 f1048, f1047, f1043; +sub.f32 f1049, f1043, f1047; +mul.f32 f1050, f1020, 0f3F4F1BBD; +sub.f32 f1051, f868, f1050; +fma.rn.f32 f1052, f1022, 0f3E9E377A, f1051; +mul.f32 f1053, f1044, 0f3F167918; +mul.f32 f1054, f1046, 0f3F737871; +sub.f32 f1055, f1053, f1054; +add.f32 f1056, f1055, f1052; +sub.f32 f1057, f1052, f1055; +add.f32 f1058, f849, f864; +add.f32 f1059, f844, f1058; +add.f32 f1060, f854, f859; +add.f32 f1061, f1060, f1059; +add.f32 f1062, f874, f889; +add.f32 f1063, f869, f1062; +add.f32 f1064, f879, f884; +add.f32 f1065, f1064, f1063; +fma.rn.f32 f1066, f1058, 0f3E9E377A, f844; +mul.f32 f1067, f1060, 0f3F4F1BBD; +sub.f32 f1068, f1066, f1067; +sub.f32 f1069, f874, f889; +mul.f32 f1070, f1069, 0f3F737871; +sub.f32 f1071, f879, f884; 
+fma.rn.f32 f1072, f1071, 0f3F167918, f1070; +sub.f32 f1073, f1068, f1072; +add.f32 f1074, f1072, f1068; +mul.f32 f1075, f1058, 0f3F4F1BBD; +sub.f32 f1076, f844, f1075; +fma.rn.f32 f1077, f1060, 0f3E9E377A, f1076; +mul.f32 f1078, f1069, 0f3F167918; +mul.f32 f1079, f1071, 0f3F737871; +sub.f32 f1080, f1078, f1079; +sub.f32 f1081, f1077, f1080; +add.f32 f1082, f1080, f1077; +fma.rn.f32 f1083, f1062, 0f3E9E377A, f869; +mul.f32 f1084, f1064, 0f3F4F1BBD; +sub.f32 f1085, f1083, f1084; +sub.f32 f1086, f849, f864; +mul.f32 f1087, f1086, 0f3F737871; +sub.f32 f1088, f854, f859; +fma.rn.f32 f1089, f1088, 0f3F167918, f1087; +add.f32 f1090, f1089, f1085; +sub.f32 f1091, f1085, f1089; +mul.f32 f1092, f1062, 0f3F4F1BBD; +sub.f32 f1093, f869, f1092; +fma.rn.f32 f1094, f1064, 0f3E9E377A, f1093; +mul.f32 f1095, f1086, 0f3F167918; +mul.f32 f1096, f1088, 0f3F737871; +sub.f32 f1097, f1095, f1096; +add.f32 f1098, f1097, f1094; +sub.f32 f1099, f1094, f1097; +mul.f32 f1100, f947, 0f3F77F511; +mul.f32 f1101, f964, 0f3E7EA890; +sub.f32 f1102, f1100, f1101; +mul.f32 f1103, f964, 0f3F77F511; +fma.rn.f32 f1104, f947, 0f3E7EA890, f1103; +mul.f32 f1105, f989, 0f3F6055A2; +mul.f32 f1106, f1006, 0f3EF6A86B; +sub.f32 f1107, f1105, f1106; +mul.f32 f1108, f1006, 0f3F6055A2; +fma.rn.f32 f1109, f989, 0f3EF6A86B, f1108; +mul.f32 f1110, f1031, 0f3F3A9DB0; +mul.f32 f1111, f1048, 0f3F2F3E7B; +sub.f32 f1112, f1110, f1111; +mul.f32 f1113, f1048, 0f3F3A9DB0; +fma.rn.f32 f1114, f1031, 0f3F2F3E7B, f1113; +mul.f32 f1115, f1073, 0f3F092BF2; +mul.f32 f1116, f1090, 0f3F5825E0; +sub.f32 f1117, f1115, f1116; +mul.f32 f1118, f1090, 0f3F092BF2; +fma.rn.f32 f1119, f1073, 0f3F5825E0, f1118; +mul.f32 f1120, f955, 0f3F6055A2; +mul.f32 f1121, f972, 0f3EF6A86B; +sub.f32 f1122, f1120, f1121; +mul.f32 f1123, f972, 0f3F6055A2; +fma.rn.f32 f1124, f955, 0f3EF6A86B, f1123; +mul.f32 f1125, f997, 0f3F092BF2; +mul.f32 f1126, f1014, 0f3F5825E0; +sub.f32 f1127, f1125, f1126; +mul.f32 f1128, f1014, 0f3F092BF2; +fma.rn.f32 f1129, f997, 
0f3F5825E0, f1128; +mul.f32 f1130, f1039, 0f3D809851; +mul.f32 f1131, f1056, 0f3F7F7EAE; +sub.f32 f1132, f1130, f1131; +mul.f32 f1133, f1056, 0f3D809851; +fma.rn.f32 f1134, f1039, 0f3F7F7EAE, f1133; +mul.f32 f1135, f1081, 0fBED9FFBE; +mul.f32 f1136, f1098, 0f3F67A2BF; +sub.f32 f1137, f1135, f1136; +mul.f32 f1138, f1098, 0fBED9FFBE; +fma.rn.f32 f1139, f1081, 0f3F67A2BF, f1138; +mul.f32 f1140, f956, 0f3F3A9DB0; +mul.f32 f1141, f973, 0f3F2F3E7B; +sub.f32 f1142, f1140, f1141; +mul.f32 f1143, f973, 0f3F3A9DB0; +fma.rn.f32 f1144, f956, 0f3F2F3E7B, f1143; +mul.f32 f1145, f998, 0f3D809851; +mul.f32 f1146, f1015, 0f3F7F7EAE; +sub.f32 f1147, f1145, f1146; +mul.f32 f1148, f1015, 0f3D809851; +fma.rn.f32 f1149, f998, 0f3F7F7EAE, f1148; +mul.f32 f1150, f1040, 0fBF232E38; +mul.f32 f1151, f1057, 0f3F45405B; +sub.f32 f1152, f1150, f1151; +mul.f32 f1153, f1057, 0fBF232E38; +fma.rn.f32 f1154, f1040, 0f3F45405B, f1153; +mul.f32 f1155, f1082, 0fBF7DFB3B; +mul.f32 f1156, f1099, 0f3E00575B; +sub.f32 f1157, f1155, f1156; +mul.f32 f1158, f1099, 0fBF7DFB3B; +fma.rn.f32 f1159, f1082, 0f3E00575B, f1158; +mul.f32 f1160, f948, 0f3F092BF2; +mul.f32 f1161, f965, 0f3F5825E0; +sub.f32 f1162, f1160, f1161; +mul.f32 f1163, f965, 0f3F092BF2; +fma.rn.f32 f1164, f948, 0f3F5825E0, f1163; +mul.f32 f1165, f990, 0fBED9FFBE; +mul.f32 f1166, f1007, 0f3F67A2BF; +sub.f32 f1167, f1165, f1166; +mul.f32 f1168, f1007, 0fBED9FFBE; +fma.rn.f32 f1169, f990, 0f3F67A2BF, f1168; +mul.f32 f1170, f1032, 0fBF7DFB3B; +mul.f32 f1171, f1049, 0f3E00575B; +sub.f32 f1172, f1170, f1171; +mul.f32 f1173, f1049, 0fBF7DFB3B; +fma.rn.f32 f1174, f1032, 0f3E00575B, f1173; +mul.f32 f1175, f1074, 0fBF232E38; +mul.f32 f1176, f1091, 0fBF45405B; +sub.f32 f1177, f1175, f1176; +mul.f32 f1178, f1091, 0fBF232E38; +fma.rn.f32 f1179, f1074, 0fBF45405B, f1178; +add.f32 f1180, f935, f1061; +add.f32 f1181, f893, f1180; +add.f32 f1182, f977, f1019; +add.f32 f1183, f939, f1065; +add.f32 f1184, f897, f1183; +add.f32 f1185, f981, f1023; +fma.rn.f32 f1186, 
f1180, 0f3E9E377A, f893; +mul.f32 f1187, f1182, 0f3F4F1BBD; +sub.f32 f1188, f1186, f1187; +sub.f32 f1189, f939, f1065; +mul.f32 f1190, f1189, 0f3F737871; +sub.f32 f1191, f981, f1023; +fma.rn.f32 f1192, f1191, 0f3F167918, f1190; +mul.f32 f1193, f1180, 0f3F4F1BBD; +sub.f32 f1194, f893, f1193; +fma.rn.f32 f1195, f1182, 0f3E9E377A, f1194; +mul.f32 f1196, f1189, 0f3F167918; +mul.f32 f1197, f1191, 0f3F737871; +sub.f32 f1198, f1196, f1197; +fma.rn.f32 f1199, f1183, 0f3E9E377A, f897; +mul.f32 f1200, f1185, 0f3F4F1BBD; +sub.f32 f1201, f1199, f1200; +sub.f32 f1202, f935, f1061; +mul.f32 f1203, f1202, 0f3F737871; +sub.f32 f1204, f977, f1019; +fma.rn.f32 f1205, f1204, 0f3F167918, f1203; +mul.f32 f1206, f1183, 0f3F4F1BBD; +sub.f32 f1207, f897, f1206; +fma.rn.f32 f1208, f1185, 0f3E9E377A, f1207; +mul.f32 f1209, f1202, 0f3F167918; +mul.f32 f1210, f1204, 0f3F737871; +sub.f32 f1211, f1209, f1210; +add.f32 f1212, f1102, f1117; +add.f32 f1213, f905, f1212; +add.f32 f1214, f1107, f1112; +add.f32 f1215, f1104, f1119; +add.f32 f1216, f922, f1215; +add.f32 f1217, f1109, f1114; +fma.rn.f32 f1218, f1212, 0f3E9E377A, f905; +mul.f32 f1219, f1214, 0f3F4F1BBD; +sub.f32 f1220, f1218, f1219; +sub.f32 f1221, f1104, f1119; +mul.f32 f1222, f1221, 0f3F737871; +sub.f32 f1223, f1109, f1114; +fma.rn.f32 f1224, f1223, 0f3F167918, f1222; +mul.f32 f1225, f1212, 0f3F4F1BBD; +sub.f32 f1226, f905, f1225; +fma.rn.f32 f1227, f1214, 0f3E9E377A, f1226; +mul.f32 f1228, f1221, 0f3F167918; +mul.f32 f1229, f1223, 0f3F737871; +sub.f32 f1230, f1228, f1229; +fma.rn.f32 f1231, f1215, 0f3E9E377A, f922; +mul.f32 f1232, f1217, 0f3F4F1BBD; +sub.f32 f1233, f1231, f1232; +sub.f32 f1234, f1102, f1117; +mul.f32 f1235, f1234, 0f3F737871; +sub.f32 f1236, f1107, f1112; +fma.rn.f32 f1237, f1236, 0f3F167918, f1235; +mul.f32 f1238, f1215, 0f3F4F1BBD; +sub.f32 f1239, f922, f1238; +fma.rn.f32 f1240, f1217, 0f3E9E377A, f1239; +mul.f32 f1241, f1234, 0f3F167918; +mul.f32 f1242, f1236, 0f3F737871; +sub.f32 f1243, f1241, f1242; +add.f32 
f1244, f1122, f1137; +add.f32 f1245, f913, f1244; +add.f32 f1246, f1127, f1132; +add.f32 f1247, f1124, f1139; +add.f32 f1248, f930, f1247; +add.f32 f1249, f1129, f1134; +fma.rn.f32 f1250, f1244, 0f3E9E377A, f913; +mul.f32 f1251, f1246, 0f3F4F1BBD; +sub.f32 f1252, f1250, f1251; +sub.f32 f1253, f1124, f1139; +mul.f32 f1254, f1253, 0f3F737871; +sub.f32 f1255, f1129, f1134; +fma.rn.f32 f1256, f1255, 0f3F167918, f1254; +mul.f32 f1257, f1244, 0f3F4F1BBD; +sub.f32 f1258, f913, f1257; +fma.rn.f32 f1259, f1246, 0f3E9E377A, f1258; +mul.f32 f1260, f1253, 0f3F167918; +mul.f32 f1261, f1255, 0f3F737871; +sub.f32 f1262, f1260, f1261; +fma.rn.f32 f1263, f1247, 0f3E9E377A, f930; +mul.f32 f1264, f1249, 0f3F4F1BBD; +sub.f32 f1265, f1263, f1264; +sub.f32 f1266, f1122, f1137; +mul.f32 f1267, f1266, 0f3F737871; +sub.f32 f1268, f1127, f1132; +fma.rn.f32 f1269, f1268, 0f3F167918, f1267; +mul.f32 f1270, f1247, 0f3F4F1BBD; +sub.f32 f1271, f930, f1270; +fma.rn.f32 f1272, f1249, 0f3E9E377A, f1271; +mul.f32 f1273, f1266, 0f3F167918; +mul.f32 f1274, f1268, 0f3F737871; +sub.f32 f1275, f1273, f1274; +add.f32 f1276, f1142, f1157; +add.f32 f1277, f914, f1276; +add.f32 f1278, f1147, f1152; +add.f32 f1279, f1144, f1159; +add.f32 f1280, f931, f1279; +add.f32 f1281, f1149, f1154; +fma.rn.f32 f1282, f1276, 0f3E9E377A, f914; +mul.f32 f1283, f1278, 0f3F4F1BBD; +sub.f32 f1284, f1282, f1283; +sub.f32 f1285, f1144, f1159; +mul.f32 f1286, f1285, 0f3F737871; +sub.f32 f1287, f1149, f1154; +fma.rn.f32 f1288, f1287, 0f3F167918, f1286; +mul.f32 f1289, f1276, 0f3F4F1BBD; +sub.f32 f1290, f914, f1289; +fma.rn.f32 f1291, f1278, 0f3E9E377A, f1290; +mul.f32 f1292, f1285, 0f3F167918; +mul.f32 f1293, f1287, 0f3F737871; +sub.f32 f1294, f1292, f1293; +fma.rn.f32 f1295, f1279, 0f3E9E377A, f931; +mul.f32 f1296, f1281, 0f3F4F1BBD; +sub.f32 f1297, f1295, f1296; +sub.f32 f1298, f1142, f1157; +mul.f32 f1299, f1298, 0f3F737871; +sub.f32 f1300, f1147, f1152; +fma.rn.f32 f1301, f1300, 0f3F167918, f1299; +mul.f32 f1302, f1279, 
0f3F4F1BBD; +sub.f32 f1303, f931, f1302; +fma.rn.f32 f1304, f1281, 0f3E9E377A, f1303; +mul.f32 f1305, f1298, 0f3F167918; +mul.f32 f1306, f1300, 0f3F737871; +sub.f32 f1307, f1305, f1306; +add.f32 f1308, f1162, f1177; +add.f32 f1309, f906, f1308; +add.f32 f1310, f1167, f1172; +add.f32 f1311, f1164, f1179; +add.f32 f1312, f923, f1311; +add.f32 f1313, f1169, f1174; +fma.rn.f32 f1314, f1308, 0f3E9E377A, f906; +mul.f32 f1315, f1310, 0f3F4F1BBD; +sub.f32 f1316, f1314, f1315; +sub.f32 f1317, f1164, f1179; +mul.f32 f1318, f1317, 0f3F737871; +sub.f32 f1319, f1169, f1174; +fma.rn.f32 f1320, f1319, 0f3F167918, f1318; +mul.f32 f1321, f1308, 0f3F4F1BBD; +sub.f32 f1322, f906, f1321; +fma.rn.f32 f1323, f1310, 0f3E9E377A, f1322; +mul.f32 f1324, f1317, 0f3F167918; +mul.f32 f1325, f1319, 0f3F737871; +sub.f32 f1326, f1324, f1325; +fma.rn.f32 f1327, f1311, 0f3E9E377A, f923; +mul.f32 f1328, f1313, 0f3F4F1BBD; +sub.f32 f1329, f1327, f1328; +sub.f32 f1330, f1162, f1177; +mul.f32 f1331, f1330, 0f3F737871; +sub.f32 f1332, f1167, f1172; +fma.rn.f32 f1333, f1332, 0f3F167918, f1331; +mul.f32 f1334, f1311, 0f3F4F1BBD; +sub.f32 f1335, f923, f1334; +fma.rn.f32 f1336, f1313, 0f3E9E377A, f1335; +mul.f32 f1337, f1330, 0f3F167918; +mul.f32 f1338, f1332, 0f3F737871; +sub.f32 f1339, f1337, f1338; +add.f32 %0, f1182, f1181; +add.f32 %1, f1185, f1184; +add.f32 %3, f1217, f1216; +add.f32 %2, f1214, f1213; +add.f32 %5, f1249, f1248; +add.f32 %4, f1246, f1245; +add.f32 %7, f1281, f1280; +add.f32 %6, f1278, f1277; +add.f32 %9, f1313, f1312; +add.f32 %8, f1310, f1309; +add.f32 %11, f1205, f1201; +sub.f32 %10, f1188, f1192; +add.f32 %13, f1237, f1233; +sub.f32 %12, f1220, f1224; +add.f32 %15, f1269, f1265; +sub.f32 %14, f1252, f1256; +add.f32 %17, f1301, f1297; +sub.f32 %16, f1284, f1288; +add.f32 %19, f1333, f1329; +sub.f32 %18, f1316, f1320; +sub.f32 %20, f1195, f1198; +add.f32 %21, f1211, f1208; +add.f32 %23, f1243, f1240; +sub.f32 %22, f1227, f1230; +add.f32 %25, f1275, f1272; +sub.f32 %24, f1259, f1262; 
+add.f32 %27, f1307, f1304; +sub.f32 %26, f1291, f1294; +add.f32 %29, f1339, f1336; +sub.f32 %28, f1323, f1326; +add.f32 %30, f1198, f1195; +sub.f32 %31, f1208, f1211; +sub.f32 %33, f1240, f1243; +add.f32 %32, f1230, f1227; +sub.f32 %35, f1272, f1275; +add.f32 %34, f1262, f1259; +sub.f32 %37, f1304, f1307; +add.f32 %36, f1294, f1291; +sub.f32 %39, f1336, f1339; +add.f32 %38, f1326, f1323; +sub.f32 %41, f1201, f1205; +add.f32 %40, f1192, f1188; +sub.f32 %43, f1233, f1237; +add.f32 %42, f1224, f1220; +sub.f32 %45, f1265, f1269; +add.f32 %44, f1256, f1252; +sub.f32 %47, f1297, f1301; +add.f32 %46, f1288, f1284; +sub.f32 %49, f1329, f1333; +add.f32 %48, f1320, f1316; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y): "r"(smem), "l"(lut_sp_25_625), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), 
"f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<366, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<366>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 5000, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %16, %24; +add.f32 f22, %14, f21; +add.f32 f23, %19, %22; +add.f32 f24, %18, %25; +add.f32 f25, %15, f24; +add.f32 f26, %21, %23; +fma.rn.f32 f27, f21, 0f3E9E377A, %14; +mul.f32 f28, f23, 0f3F4F1BBD; +sub.f32 f29, f27, f28; +sub.f32 f30, %18, %25; +mul.f32 f31, f30, 0f3F737871; +sub.f32 f32, %21, %23; +fma.rn.f32 f33, f32, 0f3F167918, f31; +sub.f32 f34, f29, f33; +add.f32 f35, f33, f29; +mul.f32 f36, f21, 0f3F4F1BBD; +sub.f32 f37, %14, f36; +fma.rn.f32 f38, f23, 0f3E9E377A, f37; +mul.f32 f39, f30, 0f3F167918; +mul.f32 f40, f32, 0f3F737871; +sub.f32 f41, f39, f40; +sub.f32 f42, f38, f41; +add.f32 f43, f41, f38; +fma.rn.f32 f44, f24, 0f3E9E377A, %15; +mul.f32 f45, f26, 0f3F4F1BBD; +sub.f32 f46, f44, f45; +sub.f32 f47, %16, %24; +mul.f32 f48, f47, 0f3F737871; +sub.f32 f49, %19, %22; +fma.rn.f32 f50, f49, 0f3F167918, f48; +add.f32 f51, f50, f46; +sub.f32 f52, f46, f50; +mul.f32 f53, f24, 0f3F4F1BBD; +sub.f32 f54, %15, f53; +fma.rn.f32 f55, f26, 0f3E9E377A, f54; 
+mul.f32 f56, f47, 0f3F167918; +mul.f32 f57, f49, 0f3F737871; +sub.f32 f58, f56, f57; +add.f32 f59, f58, f55; +sub.f32 f60, f55, f58; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5000, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f61, f62}, [rd6]; +mul.f32 f65, f51, f62; +mul.f32 f66, f34, f62; +mul.f32 f67, f61, f51; +mul.f32 f68, f61, f61; +mul.f32 f69, f62, f62; +sub.f32 f70, f68, f69; +mul.f32 f71, f62, f61; +fma.rn.f32 f72, f62, f61, f71; +mul.f32 f73, f59, f72; +mul.f32 f74, f42, f72; +mul.f32 f75, f70, f59; +mul.f32 f76, f61, f70; +mul.f32 f77, f62, f72; +sub.f32 f78, f76, f77; +mul.f32 f79, f61, f72; +fma.rn.f32 f80, f62, f70, f79; +mul.f32 f81, f60, f80; +mul.f32 f82, f43, f80; +mul.f32 f83, f78, f60; +mul.f32 f84, f61, f78; +mul.f32 f85, f62, f80; +sub.f32 f86, f84, f85; +mul.f32 f87, f61, f80; +fma.rn.f32 f88, f62, f78, f87; +mul.f32 f89, f52, f88; +mul.f32 f90, f35, f88; +mul.f32 f91, f86, f52; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +add.f32 f92, f26, f25; +add.f32 f93, f23, f22; +st.shared.v2.f32 [r9], {f93, f92}; +fma.rn.f32 f94, f61, f34, f65; +sub.f32 f95, f67, f66; +st.shared.v2.f32 [r9+8], {f94, f95}; +fma.rn.f32 f96, f70, f42, f73; +sub.f32 f97, f75, f74; +st.shared.v2.f32 [r9+16], {f96, f97}; +sub.f32 f98, f83, f82; +fma.rn.f32 f99, f78, f43, f81; +st.shared.v2.f32 [r9+24], {f99, f98}; +fma.rn.f32 f100, f86, f35, f89; +sub.f32 f101, f91, f90; +st.shared.v2.f32 [r9+32], {f100, f101}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f102, f103}, [r11]; +ld.shared.v2.f32 {f106, f107}, [r11+1000]; +ld.shared.v2.f32 {f110, f111}, [r11+2000]; +ld.shared.v2.f32 {f114, f115}, [r11+3000]; +ld.shared.v2.f32 {f118, f119}, [r11+4000]; +add.f32 f122, f106, f118; +add.f32 f123, f102, f122; +add.f32 f124, f110, f114; +add.f32 f125, f107, f119; +add.f32 f126, f103, f125; +add.f32 
f127, f111, f115; +fma.rn.f32 f128, f122, 0f3E9E377A, f102; +mul.f32 f129, f124, 0f3F4F1BBD; +sub.f32 f130, f128, f129; +sub.f32 f131, f107, f119; +mul.f32 f132, f131, 0f3F737871; +sub.f32 f133, f111, f115; +fma.rn.f32 f134, f133, 0f3F167918, f132; +sub.f32 f135, f130, f134; +add.f32 f136, f134, f130; +mul.f32 f137, f122, 0f3F4F1BBD; +sub.f32 f138, f102, f137; +fma.rn.f32 f139, f124, 0f3E9E377A, f138; +mul.f32 f140, f131, 0f3F167918; +mul.f32 f141, f133, 0f3F737871; +sub.f32 f142, f140, f141; +sub.f32 f143, f139, f142; +add.f32 f144, f142, f139; +fma.rn.f32 f145, f125, 0f3E9E377A, f103; +mul.f32 f146, f127, 0f3F4F1BBD; +sub.f32 f147, f145, f146; +sub.f32 f148, f106, f118; +mul.f32 f149, f148, 0f3F737871; +sub.f32 f150, f110, f114; +fma.rn.f32 f151, f150, 0f3F167918, f149; +add.f32 f152, f151, f147; +sub.f32 f153, f147, f151; +mul.f32 f154, f125, 0f3F4F1BBD; +sub.f32 f155, f103, f154; +fma.rn.f32 f156, f127, 0f3E9E377A, f155; +mul.f32 f157, f148, 0f3F167918; +mul.f32 f158, f150, 0f3F737871; +sub.f32 f159, f157, f158; +add.f32 f160, f159, f156; +sub.f32 f161, f156, f159; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f162, f163}, [rd11]; +mul.f32 f166, f152, f163; +mul.f32 f167, f135, f163; +mul.f32 f168, f162, f152; +mul.f32 f169, f162, f162; +mul.f32 f170, f163, f163; +sub.f32 f171, f169, f170; +mul.f32 f172, f163, f162; +fma.rn.f32 f173, f163, f162, f172; +mul.f32 f174, f160, f173; +mul.f32 f175, f143, f173; +mul.f32 f176, f171, f160; +mul.f32 f177, f162, f171; +mul.f32 f178, f163, f173; +sub.f32 f179, f177, f178; +mul.f32 f180, f162, f173; +fma.rn.f32 f181, f163, f171, f180; +mul.f32 f182, f161, f181; +mul.f32 f183, f144, f181; +mul.f32 f184, f179, f161; +mul.f32 f185, f162, f179; +mul.f32 f186, f163, f181; +sub.f32 f187, f185, f186; +mul.f32 f188, f162, f181; +fma.rn.f32 f189, f163, f179, 
f188; +mul.f32 f190, f153, f189; +mul.f32 f191, f136, f189; +mul.f32 f192, f187, f153; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +add.f32 f193, f127, f126; +add.f32 f194, f124, f123; +st.shared.v2.f32 [r17], {f194, f193}; +fma.rn.f32 f195, f162, f135, f166; +sub.f32 f196, f168, f167; +st.shared.v2.f32 [r17+40], {f195, f196}; +fma.rn.f32 f197, f171, f143, f174; +sub.f32 f198, f176, f175; +st.shared.v2.f32 [r17+80], {f197, f198}; +fma.rn.f32 f199, f179, f144, f182; +sub.f32 f200, f184, f183; +st.shared.v2.f32 [r17+120], {f199, f200}; +fma.rn.f32 f201, f187, f136, f190; +sub.f32 f202, f192, f191; +st.shared.v2.f32 [r17+160], {f201, f202}; +barrier.sync 0; +ld.shared.v2.f32 {f203, f204}, [r11]; +ld.shared.v2.f32 {f207, f208}, [r11+1000]; +ld.shared.v2.f32 {f211, f212}, [r11+2000]; +ld.shared.v2.f32 {f215, f216}, [r11+3000]; +ld.shared.v2.f32 {f219, f220}, [r11+4000]; +add.f32 f223, f207, f219; +add.f32 f224, f203, f223; +add.f32 f225, f211, f215; +add.f32 f226, f208, f220; +add.f32 f227, f204, f226; +add.f32 f228, f212, f216; +fma.rn.f32 f229, f223, 0f3E9E377A, f203; +mul.f32 f230, f225, 0f3F4F1BBD; +sub.f32 f231, f229, f230; +sub.f32 f232, f208, f220; +mul.f32 f233, f232, 0f3F737871; +sub.f32 f234, f212, f216; +fma.rn.f32 f235, f234, 0f3F167918, f233; +sub.f32 f236, f231, f235; +add.f32 f237, f235, f231; +mul.f32 f238, f223, 0f3F4F1BBD; +sub.f32 f239, f203, f238; +fma.rn.f32 f240, f225, 0f3E9E377A, f239; +mul.f32 f241, f232, 0f3F167918; +mul.f32 f242, f234, 0f3F737871; +sub.f32 f243, f241, f242; +sub.f32 f244, f240, f243; +add.f32 f245, f243, f240; +fma.rn.f32 f246, f226, 0f3E9E377A, f204; +mul.f32 f247, f228, 0f3F4F1BBD; +sub.f32 f248, f246, f247; +sub.f32 f249, f207, f219; +mul.f32 f250, f249, 0f3F737871; +sub.f32 f251, f211, f215; +fma.rn.f32 f252, f251, 0f3F167918, f250; +add.f32 f253, f252, f248; +sub.f32 f254, f248, f252; +mul.f32 f255, f226, 0f3F4F1BBD; +sub.f32 f256, f204, f255; +fma.rn.f32 f257, f228, 
0f3E9E377A, f256; +mul.f32 f258, f249, 0f3F167918; +mul.f32 f259, f251, 0f3F737871; +sub.f32 f260, f258, f259; +add.f32 f261, f260, f257; +sub.f32 f262, f257, f260; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f263, f264}, [rd16]; +mul.f32 f267, f253, f264; +mul.f32 f268, f236, f264; +mul.f32 f269, f263, f253; +mul.f32 f270, f263, f263; +mul.f32 f271, f264, f264; +sub.f32 f272, f270, f271; +mul.f32 f273, f264, f263; +fma.rn.f32 f274, f264, f263, f273; +mul.f32 f275, f261, f274; +mul.f32 f276, f244, f274; +mul.f32 f277, f272, f261; +mul.f32 f278, f263, f272; +mul.f32 f279, f264, f274; +sub.f32 f280, f278, f279; +mul.f32 f281, f263, f274; +fma.rn.f32 f282, f264, f272, f281; +mul.f32 f283, f262, f282; +mul.f32 f284, f245, f282; +mul.f32 f285, f280, f262; +mul.f32 f286, f263, f280; +mul.f32 f287, f264, f282; +sub.f32 f288, f286, f287; +mul.f32 f289, f263, f282; +fma.rn.f32 f290, f264, f280, f289; +mul.f32 f291, f254, f290; +mul.f32 f292, f237, f290; +mul.f32 f293, f288, f254; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +add.f32 f294, f228, f227; +add.f32 f295, f225, f224; +st.shared.v2.f32 [r23], {f295, f294}; +fma.rn.f32 f296, f263, f236, f267; +sub.f32 f297, f269, f268; +st.shared.v2.f32 [r23+200], {f296, f297}; +fma.rn.f32 f298, f272, f244, f275; +sub.f32 f299, f277, f276; +st.shared.v2.f32 [r23+400], {f298, f299}; +fma.rn.f32 f300, f280, f245, f283; +sub.f32 f301, f285, f284; +st.shared.v2.f32 [r23+600], {f300, f301}; +fma.rn.f32 f302, f288, f237, f291; +sub.f32 f303, f293, f292; +st.shared.v2.f32 [r23+800], {f302, f303}; +barrier.sync 0; +ld.shared.v2.f32 {f304, f305}, [r11]; +ld.shared.v2.f32 {f308, f309}, [r11+1000]; +ld.shared.v2.f32 {f312, f313}, [r11+2000]; +ld.shared.v2.f32 {f316, f317}, [r11+3000]; +ld.shared.v2.f32 {f320, 
f321}, [r11+4000]; +add.f32 f324, f308, f320; +add.f32 f325, f304, f324; +add.f32 f326, f312, f316; +add.f32 f327, f309, f321; +add.f32 f328, f305, f327; +add.f32 f329, f313, f317; +fma.rn.f32 f330, f324, 0f3E9E377A, f304; +mul.f32 f331, f326, 0f3F4F1BBD; +sub.f32 f332, f330, f331; +sub.f32 f333, f309, f321; +mul.f32 f334, f333, 0f3F737871; +sub.f32 f335, f313, f317; +fma.rn.f32 f336, f335, 0f3F167918, f334; +mul.f32 f337, f324, 0f3F4F1BBD; +sub.f32 f338, f304, f337; +fma.rn.f32 f339, f326, 0f3E9E377A, f338; +mul.f32 f340, f333, 0f3F167918; +mul.f32 f341, f335, 0f3F737871; +sub.f32 f342, f340, f341; +fma.rn.f32 f343, f327, 0f3E9E377A, f305; +mul.f32 f344, f329, 0f3F4F1BBD; +sub.f32 f345, f343, f344; +sub.f32 f346, f308, f320; +mul.f32 f347, f346, 0f3F737871; +sub.f32 f348, f312, f316; +fma.rn.f32 f349, f348, 0f3F167918, f347; +mul.f32 f350, f327, 0f3F4F1BBD; +sub.f32 f351, f305, f350; +fma.rn.f32 f352, f329, 0f3E9E377A, f351; +mul.f32 f353, f346, 0f3F167918; +mul.f32 f354, f348, 0f3F737871; +sub.f32 f355, f353, f354; +add.f32 %1, f329, f328; +add.f32 %0, f326, f325; +add.f32 %3, f349, f345; +sub.f32 %2, f332, f336; +add.f32 %5, f355, f352; +sub.f32 %4, f339, f342; +sub.f32 %7, f352, f355; +add.f32 %6, f342, f339; +sub.f32 %9, f345, f349; +add.f32 %8, f336, f332; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<367, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<336>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, 
r1, 2500, r2; +mov.u32 r4, %tid.x; +add.f32 f21, %16, %24; +add.f32 f22, %14, f21; +add.f32 f23, %19, %22; +add.f32 f24, f23, f22; +add.f32 f25, %18, %25; +add.f32 f26, %15, f25; +add.f32 f27, %21, %23; +add.f32 f28, f27, f26; +fma.rn.f32 f29, f21, 0f3E9E377A, %14; +mul.f32 f30, f23, 0f3F4F1BBD; +sub.f32 f31, f29, f30; +sub.f32 f32, %18, %25; +mul.f32 f33, f32, 0f3F737871; +sub.f32 f34, %21, %23; +fma.rn.f32 f35, f34, 0f3F167918, f33; +sub.f32 f36, f31, f35; +add.f32 f37, f35, f31; +mul.f32 f38, f21, 0f3F4F1BBD; +sub.f32 f39, %14, f38; +fma.rn.f32 f40, f23, 0f3E9E377A, f39; +mul.f32 f41, f32, 0f3F167918; +mul.f32 f42, f34, 0f3F737871; +sub.f32 f43, f41, f42; +sub.f32 f44, f40, f43; +add.f32 f45, f43, f40; +fma.rn.f32 f46, f25, 0f3E9E377A, %15; +mul.f32 f47, f27, 0f3F4F1BBD; +sub.f32 f48, f46, f47; +sub.f32 f49, %16, %24; +mul.f32 f50, f49, 0f3F737871; +sub.f32 f51, %19, %22; +fma.rn.f32 f52, f51, 0f3F167918, f50; +add.f32 f53, f52, f48; +sub.f32 f54, f48, f52; +mul.f32 f55, f25, 0f3F4F1BBD; +sub.f32 f56, %15, f55; +fma.rn.f32 f57, f27, 0f3E9E377A, f56; +mul.f32 f58, f49, 0f3F167918; +mul.f32 f59, f51, 0f3F737871; +sub.f32 f60, f58, f59; +add.f32 f61, f60, f57; +sub.f32 f62, f57, f60; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f63, f64}, [rd6]; +mul.f32 f67, f53, f64; +fma.rn.f32 f68, f63, f36, f67; +mul.f32 f69, f36, f64; +mul.f32 f70, f63, f53; +sub.f32 f71, f70, f69; +mul.f32 f72, f63, f63; +mul.f32 f73, f64, f64; +sub.f32 f74, f72, f73; +mul.f32 f75, f64, f63; +fma.rn.f32 f76, f64, f63, f75; +mul.f32 f77, f61, f76; +fma.rn.f32 f78, f74, f44, f77; +mul.f32 f79, f44, f76; +mul.f32 f80, f74, f61; +sub.f32 f81, f80, f79; +mul.f32 f82, f63, f74; +mul.f32 f83, f64, f76; +sub.f32 f84, f82, f83; +mul.f32 f85, f63, f76; +fma.rn.f32 f86, f64, f74, f85; +mul.f32 f87, f62, f86; +fma.rn.f32 f88, f84, f45, 
f87; +mul.f32 f89, f45, f86; +mul.f32 f90, f84, f62; +sub.f32 f91, f90, f89; +mul.f32 f92, f63, f84; +mul.f32 f93, f64, f86; +sub.f32 f94, f92, f93; +mul.f32 f95, f63, f86; +fma.rn.f32 f96, f64, f84, f95; +mul.f32 f97, f54, f96; +fma.rn.f32 f98, f94, f37, f97; +mul.f32 f99, f37, f96; +mul.f32 f100, f94, f54; +sub.f32 f101, f100, f99; +mad.lo.s32 r8, r5, 2500, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 20, r8; +st.shared.f32 [r9], f24; +st.shared.f32 [r9+4], f68; +st.shared.f32 [r9+8], f78; +st.shared.f32 [r9+12], f88; +st.shared.f32 [r9+16], f98; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f32 f102, [r11]; +ld.shared.f32 f103, [r11+500]; +ld.shared.f32 f104, [r11+1000]; +ld.shared.f32 f105, [r11+1500]; +ld.shared.f32 f106, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r9], f28; +st.shared.f32 [r9+4], f71; +st.shared.f32 [r9+8], f81; +st.shared.f32 [r9+12], f91; +st.shared.f32 [r9+16], f101; +barrier.sync 0; +ld.shared.f32 f107, [r11]; +ld.shared.f32 f108, [r11+500]; +ld.shared.f32 f109, [r11+1000]; +ld.shared.f32 f110, [r11+1500]; +ld.shared.f32 f111, [r11+2000]; +add.f32 f112, f103, f106; +add.f32 f113, f102, f112; +add.f32 f114, f104, f105; +add.f32 f115, f114, f113; +add.f32 f116, f108, f111; +add.f32 f117, f107, f116; +add.f32 f118, f109, f110; +add.f32 f119, f118, f117; +fma.rn.f32 f120, f112, 0f3E9E377A, f102; +mul.f32 f121, f114, 0f3F4F1BBD; +sub.f32 f122, f120, f121; +sub.f32 f123, f108, f111; +mul.f32 f124, f123, 0f3F737871; +sub.f32 f125, f109, f110; +fma.rn.f32 f126, f125, 0f3F167918, f124; +sub.f32 f127, f122, f126; +add.f32 f128, f126, f122; +mul.f32 f129, f112, 0f3F4F1BBD; +sub.f32 f130, f102, f129; +fma.rn.f32 f131, f114, 0f3E9E377A, f130; +mul.f32 f132, f123, 0f3F167918; +mul.f32 f133, f125, 0f3F737871; +sub.f32 f134, f132, f133; +sub.f32 f135, f131, f134; +add.f32 f136, f134, f131; +fma.rn.f32 f137, f116, 0f3E9E377A, f107; +mul.f32 f138, f118, 0f3F4F1BBD; +sub.f32 f139, f137, f138; +sub.f32 f140, f103, f106; +mul.f32 f141, 
f140, 0f3F737871; +sub.f32 f142, f104, f105; +fma.rn.f32 f143, f142, 0f3F167918, f141; +add.f32 f144, f143, f139; +sub.f32 f145, f139, f143; +mul.f32 f146, f116, 0f3F4F1BBD; +sub.f32 f147, f107, f146; +fma.rn.f32 f148, f118, 0f3E9E377A, f147; +mul.f32 f149, f140, 0f3F167918; +mul.f32 f150, f142, 0f3F737871; +sub.f32 f151, f149, f150; +add.f32 f152, f151, f148; +sub.f32 f153, f148, f151; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f154, f155}, [rd11]; +mul.f32 f158, f144, f155; +fma.rn.f32 f159, f154, f127, f158; +mul.f32 f160, f127, f155; +mul.f32 f161, f154, f144; +sub.f32 f162, f161, f160; +mul.f32 f163, f154, f154; +mul.f32 f164, f155, f155; +sub.f32 f165, f163, f164; +mul.f32 f166, f155, f154; +fma.rn.f32 f167, f155, f154, f166; +mul.f32 f168, f152, f167; +fma.rn.f32 f169, f165, f135, f168; +mul.f32 f170, f135, f167; +mul.f32 f171, f165, f152; +sub.f32 f172, f171, f170; +mul.f32 f173, f154, f165; +mul.f32 f174, f155, f167; +sub.f32 f175, f173, f174; +mul.f32 f176, f154, f167; +fma.rn.f32 f177, f155, f165, f176; +mul.f32 f178, f153, f177; +fma.rn.f32 f179, f175, f136, f178; +mul.f32 f180, f136, f177; +mul.f32 f181, f175, f153; +sub.f32 f182, f181, f180; +mul.f32 f183, f154, f175; +mul.f32 f184, f155, f177; +sub.f32 f185, f183, f184; +mul.f32 f186, f154, f177; +fma.rn.f32 f187, f155, f175, f186; +mul.f32 f188, f145, f187; +fma.rn.f32 f189, f185, f128, f188; +mul.f32 f190, f128, f187; +mul.f32 f191, f185, f145; +sub.f32 f192, f191, f190; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 100, r16; +st.shared.f32 [r17], f115; +st.shared.f32 [r17+20], f159; +st.shared.f32 [r17+40], f169; +st.shared.f32 [r17+60], f179; +st.shared.f32 [r17+80], f189; +barrier.sync 0; +ld.shared.f32 f193, [r11]; +ld.shared.f32 f194, [r11+500]; +ld.shared.f32 f195, [r11+1000]; 
+ld.shared.f32 f196, [r11+1500]; +ld.shared.f32 f197, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r17], f119; +st.shared.f32 [r17+20], f162; +st.shared.f32 [r17+40], f172; +st.shared.f32 [r17+60], f182; +st.shared.f32 [r17+80], f192; +barrier.sync 0; +ld.shared.f32 f198, [r11]; +ld.shared.f32 f199, [r11+500]; +ld.shared.f32 f200, [r11+1000]; +ld.shared.f32 f201, [r11+1500]; +ld.shared.f32 f202, [r11+2000]; +add.f32 f203, f194, f197; +add.f32 f204, f193, f203; +add.f32 f205, f195, f196; +add.f32 f206, f205, f204; +add.f32 f207, f199, f202; +add.f32 f208, f198, f207; +add.f32 f209, f200, f201; +add.f32 f210, f209, f208; +fma.rn.f32 f211, f203, 0f3E9E377A, f193; +mul.f32 f212, f205, 0f3F4F1BBD; +sub.f32 f213, f211, f212; +sub.f32 f214, f199, f202; +mul.f32 f215, f214, 0f3F737871; +sub.f32 f216, f200, f201; +fma.rn.f32 f217, f216, 0f3F167918, f215; +sub.f32 f218, f213, f217; +add.f32 f219, f217, f213; +mul.f32 f220, f203, 0f3F4F1BBD; +sub.f32 f221, f193, f220; +fma.rn.f32 f222, f205, 0f3E9E377A, f221; +mul.f32 f223, f214, 0f3F167918; +mul.f32 f224, f216, 0f3F737871; +sub.f32 f225, f223, f224; +sub.f32 f226, f222, f225; +add.f32 f227, f225, f222; +fma.rn.f32 f228, f207, 0f3E9E377A, f198; +mul.f32 f229, f209, 0f3F4F1BBD; +sub.f32 f230, f228, f229; +sub.f32 f231, f194, f197; +mul.f32 f232, f231, 0f3F737871; +sub.f32 f233, f195, f196; +fma.rn.f32 f234, f233, 0f3F167918, f232; +add.f32 f235, f234, f230; +sub.f32 f236, f230, f234; +mul.f32 f237, f207, 0f3F4F1BBD; +sub.f32 f238, f198, f237; +fma.rn.f32 f239, f209, 0f3E9E377A, f238; +mul.f32 f240, f231, 0f3F167918; +mul.f32 f241, f233, 0f3F737871; +sub.f32 f242, f240, f241; +add.f32 f243, f242, f239; +sub.f32 f244, f239, f242; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f245, f246}, [rd16]; +mul.f32 f249, f235, f246; +fma.rn.f32 f250, f245, 
f218, f249; +mul.f32 f251, f218, f246; +mul.f32 f252, f245, f235; +sub.f32 f253, f252, f251; +mul.f32 f254, f245, f245; +mul.f32 f255, f246, f246; +sub.f32 f256, f254, f255; +mul.f32 f257, f246, f245; +fma.rn.f32 f258, f246, f245, f257; +mul.f32 f259, f243, f258; +fma.rn.f32 f260, f256, f226, f259; +mul.f32 f261, f226, f258; +mul.f32 f262, f256, f243; +sub.f32 f263, f262, f261; +mul.f32 f264, f245, f256; +mul.f32 f265, f246, f258; +sub.f32 f266, f264, f265; +mul.f32 f267, f245, f258; +fma.rn.f32 f268, f246, f256, f267; +mul.f32 f269, f244, f268; +fma.rn.f32 f270, f266, f227, f269; +mul.f32 f271, f227, f268; +mul.f32 f272, f266, f244; +sub.f32 f273, f272, f271; +mul.f32 f274, f245, f266; +mul.f32 f275, f246, f268; +sub.f32 f276, f274, f275; +mul.f32 f277, f245, f268; +fma.rn.f32 f278, f246, f266, f277; +mul.f32 f279, f236, f278; +fma.rn.f32 f280, f276, f219, f279; +mul.f32 f281, f219, f278; +mul.f32 f282, f276, f236; +sub.f32 f283, f282, f281; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 500, r22; +st.shared.f32 [r23], f206; +st.shared.f32 [r23+100], f250; +st.shared.f32 [r23+200], f260; +st.shared.f32 [r23+300], f270; +st.shared.f32 [r23+400], f280; +barrier.sync 0; +ld.shared.f32 f284, [r11]; +ld.shared.f32 f285, [r11+500]; +ld.shared.f32 f286, [r11+1000]; +ld.shared.f32 f287, [r11+1500]; +ld.shared.f32 f288, [r11+2000]; +barrier.sync 0; +st.shared.f32 [r23], f210; +st.shared.f32 [r23+100], f253; +st.shared.f32 [r23+200], f263; +st.shared.f32 [r23+300], f273; +st.shared.f32 [r23+400], f283; +barrier.sync 0; +ld.shared.f32 f289, [r11]; +ld.shared.f32 f290, [r11+500]; +ld.shared.f32 f291, [r11+1000]; +ld.shared.f32 f292, [r11+1500]; +ld.shared.f32 f293, [r11+2000]; +add.f32 f294, f285, f288; +add.f32 f295, f284, f294; +add.f32 f296, f286, f287; +add.f32 f297, f290, f293; +add.f32 f298, f289, f297; +add.f32 f299, f291, f292; +fma.rn.f32 f300, f294, 0f3E9E377A, f284; +mul.f32 f301, f296, 0f3F4F1BBD; +sub.f32 f302, f300, f301; 
+sub.f32 f303, f290, f293; +mul.f32 f304, f303, 0f3F737871; +sub.f32 f305, f291, f292; +fma.rn.f32 f306, f305, 0f3F167918, f304; +mul.f32 f307, f294, 0f3F4F1BBD; +sub.f32 f308, f284, f307; +fma.rn.f32 f309, f296, 0f3E9E377A, f308; +mul.f32 f310, f303, 0f3F167918; +mul.f32 f311, f305, 0f3F737871; +sub.f32 f312, f310, f311; +fma.rn.f32 f313, f297, 0f3E9E377A, f289; +mul.f32 f314, f299, 0f3F4F1BBD; +sub.f32 f315, f313, f314; +sub.f32 f316, f285, f288; +mul.f32 f317, f316, 0f3F737871; +sub.f32 f318, f286, f287; +fma.rn.f32 f319, f318, 0f3F167918, f317; +mul.f32 f320, f297, 0f3F4F1BBD; +sub.f32 f321, f289, f320; +fma.rn.f32 f322, f299, 0f3E9E377A, f321; +mul.f32 f323, f316, 0f3F167918; +mul.f32 f324, f318, 0f3F737871; +sub.f32 f325, f323, f324; +add.f32 %0, f296, f295; +add.f32 %1, f299, f298; +add.f32 %3, f319, f315; +sub.f32 %2, f302, f306; +sub.f32 %4, f309, f312; +add.f32 %5, f325, f322; +add.f32 %6, f312, f309; +sub.f32 %7, f322, f325; +sub.f32 %9, f315, f319; +add.f32 %8, f306, f302; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y): "r"(smem), "l"(lut_sp_5_625), "l"(lut_sp_5_125), "l"(lut_sp_5_25), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..85d609ac6f672 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_fwd.hpp.inc @@ -0,0 +1,3514 @@ +#ifndef CUFFTDX_FFT_625_FP64_FWD_PTX_HPP +#define 
CUFFTDX_FFT_625_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<540, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<1429>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 5000, r2; +add.f64 fd101, %65, %105; +add.f64 fd102, %52, fd101; +add.f64 fd103, %78, %92; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %67, %107; +add.f64 fd106, %53, fd105; +add.f64 fd107, %80, %93; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %52; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %67, %107; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %80, %93; +mul.f64 fd115, fd114, 0dBFE2CF2304755A5E; +sub.f64 fd116, fd115, fd113; +sub.f64 fd117, fd111, fd116; +add.f64 fd118, fd116, fd111; +mul.f64 fd119, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd120, %52, fd119; +fma.rn.f64 fd121, fd103, 0d3FD3C6EF372FE950, fd120; +mul.f64 fd122, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd123, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd123, fd122; +sub.f64 fd125, fd121, fd124; +add.f64 fd126, fd124, fd121; +fma.rn.f64 fd127, fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd128, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, %65, %105; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, %78, %92; +mul.f64 fd133, fd132, 0dBFE2CF2304755A5E; +sub.f64 fd134, fd133, fd131; +add.f64 fd135, fd134, fd129; +sub.f64 fd136, fd129, fd134; +mul.f64 fd137, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %53, fd137; +fma.rn.f64 fd139, fd107, 0d3FD3C6EF372FE950, fd138; +mul.f64 fd140, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %68, %108; +add.f64 fd146, %54, fd145; +add.f64 fd147, %81, %94; +add.f64 fd148, fd147, fd146; +add.f64 fd149, %69, %109; +add.f64 
fd150, %56, fd149; +add.f64 fd151, %83, %96; +add.f64 fd152, fd151, fd150; +fma.rn.f64 fd153, fd145, 0d3FD3C6EF372FE950, %54; +mul.f64 fd154, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd153, fd154; +sub.f64 fd156, %69, %109; +mul.f64 fd157, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd158, %83, %96; +mul.f64 fd159, fd158, 0dBFE2CF2304755A5E; +sub.f64 fd160, fd159, fd157; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +mul.f64 fd163, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd164, %54, fd163; +fma.rn.f64 fd165, fd147, 0d3FD3C6EF372FE950, fd164; +mul.f64 fd166, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd167, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd168, fd167, fd166; +sub.f64 fd169, fd165, fd168; +add.f64 fd170, fd168, fd165; +fma.rn.f64 fd171, fd149, 0d3FD3C6EF372FE950, %56; +mul.f64 fd172, fd151, 0d3FE9E3779B97F4A8; +sub.f64 fd173, fd171, fd172; +sub.f64 fd174, %68, %108; +mul.f64 fd175, fd174, 0d3FEE6F0E134454FF; +sub.f64 fd176, %81, %94; +mul.f64 fd177, fd176, 0dBFE2CF2304755A5E; +sub.f64 fd178, fd177, fd175; +add.f64 fd179, fd178, fd173; +sub.f64 fd180, fd173, fd178; +mul.f64 fd181, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd182, %56, fd181; +fma.rn.f64 fd183, fd151, 0d3FD3C6EF372FE950, fd182; +mul.f64 fd184, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd185, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd186, fd185, fd184; +add.f64 fd187, fd186, fd183; +sub.f64 fd188, fd183, fd186; +add.f64 fd189, %70, %110; +add.f64 fd190, %57, fd189; +add.f64 fd191, %84, %97; +add.f64 fd192, fd191, fd190; +add.f64 fd193, %72, %112; +add.f64 fd194, %59, fd193; +add.f64 fd195, %85, %99; +add.f64 fd196, fd195, fd194; +fma.rn.f64 fd197, fd189, 0d3FD3C6EF372FE950, %57; +mul.f64 fd198, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd199, fd197, fd198; +sub.f64 fd200, %72, %112; +mul.f64 fd201, fd200, 0d3FEE6F0E134454FF; +sub.f64 fd202, %85, %99; +mul.f64 fd203, fd202, 0dBFE2CF2304755A5E; +sub.f64 fd204, fd203, fd201; +sub.f64 fd205, fd199, fd204; +add.f64 fd206, fd204, fd199; +mul.f64 fd207, fd189, 0d3FE9E3779B97F4A8; 
+sub.f64 fd208, %57, fd207; +fma.rn.f64 fd209, fd191, 0d3FD3C6EF372FE950, fd208; +mul.f64 fd210, fd200, 0d3FE2CF2304755A5E; +mul.f64 fd211, fd202, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd211, fd210; +sub.f64 fd213, fd209, fd212; +add.f64 fd214, fd212, fd209; +fma.rn.f64 fd215, fd193, 0d3FD3C6EF372FE950, %59; +mul.f64 fd216, fd195, 0d3FE9E3779B97F4A8; +sub.f64 fd217, fd215, fd216; +sub.f64 fd218, %70, %110; +mul.f64 fd219, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd220, %84, %97; +mul.f64 fd221, fd220, 0dBFE2CF2304755A5E; +sub.f64 fd222, fd221, fd219; +add.f64 fd223, fd222, fd217; +sub.f64 fd224, fd217, fd222; +mul.f64 fd225, fd193, 0d3FE9E3779B97F4A8; +sub.f64 fd226, %59, fd225; +fma.rn.f64 fd227, fd195, 0d3FD3C6EF372FE950, fd226; +mul.f64 fd228, fd218, 0d3FE2CF2304755A5E; +mul.f64 fd229, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd229, fd228; +add.f64 fd231, fd230, fd227; +sub.f64 fd232, fd227, fd230; +add.f64 fd233, %73, %113; +add.f64 fd234, %60, fd233; +add.f64 fd235, %86, %100; +add.f64 fd236, fd235, fd234; +add.f64 fd237, %75, %115; +add.f64 fd238, %61, fd237; +add.f64 fd239, %88, %101; +add.f64 fd240, fd239, fd238; +fma.rn.f64 fd241, fd233, 0d3FD3C6EF372FE950, %60; +mul.f64 fd242, fd235, 0d3FE9E3779B97F4A8; +sub.f64 fd243, fd241, fd242; +sub.f64 fd244, %75, %115; +mul.f64 fd245, fd244, 0d3FEE6F0E134454FF; +sub.f64 fd246, %88, %101; +mul.f64 fd247, fd246, 0dBFE2CF2304755A5E; +sub.f64 fd248, fd247, fd245; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +mul.f64 fd251, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd252, %60, fd251; +fma.rn.f64 fd253, fd235, 0d3FD3C6EF372FE950, fd252; +mul.f64 fd254, fd244, 0d3FE2CF2304755A5E; +mul.f64 fd255, fd246, 0d3FEE6F0E134454FF; +sub.f64 fd256, fd255, fd254; +sub.f64 fd257, fd253, fd256; +add.f64 fd258, fd256, fd253; +fma.rn.f64 fd259, fd237, 0d3FD3C6EF372FE950, %61; +mul.f64 fd260, fd239, 0d3FE9E3779B97F4A8; +sub.f64 fd261, fd259, fd260; +sub.f64 fd262, %73, %113; +mul.f64 fd263, fd262, 0d3FEE6F0E134454FF; +sub.f64 fd264, 
%86, %100; +mul.f64 fd265, fd264, 0dBFE2CF2304755A5E; +sub.f64 fd266, fd265, fd263; +add.f64 fd267, fd266, fd261; +sub.f64 fd268, fd261, fd266; +mul.f64 fd269, fd237, 0d3FE9E3779B97F4A8; +sub.f64 fd270, %61, fd269; +fma.rn.f64 fd271, fd239, 0d3FD3C6EF372FE950, fd270; +mul.f64 fd272, fd262, 0d3FE2CF2304755A5E; +mul.f64 fd273, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd274, fd273, fd272; +add.f64 fd275, fd274, fd271; +sub.f64 fd276, fd271, fd274; +add.f64 fd277, %76, %116; +add.f64 fd278, %62, fd277; +add.f64 fd279, %89, %102; +add.f64 fd280, fd279, fd278; +add.f64 fd281, %77, %117; +add.f64 fd282, %64, fd281; +add.f64 fd283, %91, %104; +add.f64 fd284, fd283, fd282; +fma.rn.f64 fd285, fd277, 0d3FD3C6EF372FE950, %62; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +sub.f64 fd287, fd285, fd286; +sub.f64 fd288, %77, %117; +mul.f64 fd289, fd288, 0d3FEE6F0E134454FF; +sub.f64 fd290, %91, %104; +mul.f64 fd291, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd291, fd289; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, %62, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +fma.rn.f64 fd303, fd281, 0d3FD3C6EF372FE950, %64; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, %76, %116; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, %89, %102; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, %64, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +mov.u32 r4, %tid.x; 
+mul.f64 fd321, fd161, 0d3FEEFEA21D101EE0; +mul.f64 fd322, fd179, 0dBFCFD511FA1C0796; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd179, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd325, fd161, 0dBFCFD511FA1C0796, fd324; +mul.f64 fd326, fd205, 0d3FEC0AB44E81C059; +mul.f64 fd327, fd223, 0dBFDED50D5CBFA951; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd223, 0d3FEC0AB44E81C059; +fma.rn.f64 fd330, fd205, 0dBFDED50D5CBFA951, fd329; +mul.f64 fd331, fd249, 0d3FE753B603D2B816; +mul.f64 fd332, fd267, 0dBFE5E7CF55112014; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd267, 0d3FE753B603D2B816; +fma.rn.f64 fd335, fd249, 0dBFE5E7CF55112014, fd334; +mul.f64 fd336, fd293, 0d3FE1257E3C182B51; +mul.f64 fd337, fd311, 0dBFEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd311, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd293, 0dBFEB04BBFF642E86, fd339; +mul.f64 fd341, fd169, 0d3FEC0AB44E81C059; +mul.f64 fd342, fd187, 0dBFDED50D5CBFA951; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd187, 0d3FEC0AB44E81C059; +fma.rn.f64 fd345, fd169, 0dBFDED50D5CBFA951, fd344; +mul.f64 fd346, fd213, 0d3FE1257E3C182B51; +mul.f64 fd347, fd231, 0dBFEB04BBFF642E86; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd231, 0d3FE1257E3C182B51; +fma.rn.f64 fd350, fd213, 0dBFEB04BBFF642E86, fd349; +mul.f64 fd351, fd257, 0d3FB0130A1BE09379; +mul.f64 fd352, fd275, 0dBFEFEFD5BFE443FE; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd275, 0d3FB0130A1BE09379; +fma.rn.f64 fd355, fd257, 0dBFEFEFD5BFE443FE, fd354; +mul.f64 fd356, fd301, 0dBFDB3FF7C925819C; +mul.f64 fd357, fd319, 0dBFECF457DCDC158C; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd319, 0dBFDB3FF7C925819C; +fma.rn.f64 fd360, fd301, 0dBFECF457DCDC158C, fd359; +mul.f64 fd361, fd170, 0d3FE753B603D2B816; +mul.f64 fd362, fd188, 0dBFE5E7CF55112014; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd188, 0d3FE753B603D2B816; +fma.rn.f64 fd365, fd170, 0dBFE5E7CF55112014, fd364; +mul.f64 fd366, fd214, 0d3FB0130A1BE09379; +mul.f64 fd367, fd232, 0dBFEFEFD5BFE443FE; 
+sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd232, 0d3FB0130A1BE09379; +fma.rn.f64 fd370, fd214, 0dBFEFEFD5BFE443FE, fd369; +mul.f64 fd371, fd258, 0dBFE465C6FEB501BC; +mul.f64 fd372, fd276, 0dBFE8A80B635B6BEA; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd276, 0dBFE465C6FEB501BC; +fma.rn.f64 fd375, fd258, 0dBFE8A80B635B6BEA, fd374; +mul.f64 fd376, fd302, 0dBFEFBF675480D903; +mul.f64 fd377, fd320, 0dBFC00AEB5DA15BE0; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd320, 0dBFEFBF675480D903; +fma.rn.f64 fd380, fd302, 0dBFC00AEB5DA15BE0, fd379; +mul.f64 fd381, fd162, 0d3FE1257E3C182B51; +mul.f64 fd382, fd180, 0dBFEB04BBFF642E86; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd180, 0d3FE1257E3C182B51; +fma.rn.f64 fd385, fd162, 0dBFEB04BBFF642E86, fd384; +mul.f64 fd386, fd206, 0dBFDB3FF7C925819C; +mul.f64 fd387, fd224, 0dBFECF457DCDC158C; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd224, 0dBFDB3FF7C925819C; +fma.rn.f64 fd390, fd206, 0dBFECF457DCDC158C, fd389; +mul.f64 fd391, fd250, 0dBFEFBF675480D903; +mul.f64 fd392, fd268, 0dBFC00AEB5DA15BE0; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd268, 0dBFEFBF675480D903; +fma.rn.f64 fd395, fd250, 0dBFC00AEB5DA15BE0, fd394; +mul.f64 fd396, fd294, 0dBFE465C6FEB501BC; +mul.f64 fd397, fd312, 0d3FE8A80B635B6BEA; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd312, 0dBFE465C6FEB501BC; +fma.rn.f64 fd400, fd294, 0d3FE8A80B635B6BEA, fd399; +add.f64 fd401, fd148, fd280; +add.f64 fd402, fd104, fd401; +add.f64 fd403, fd192, fd236; +add.f64 fd404, fd403, fd402; +add.f64 fd405, fd152, fd284; +add.f64 fd406, fd108, fd405; +add.f64 fd407, fd196, fd240; +add.f64 fd408, fd407, fd406; +fma.rn.f64 fd409, fd401, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd410, fd403, 0d3FE9E3779B97F4A8; +sub.f64 fd411, fd409, fd410; +sub.f64 fd412, fd152, fd284; +mul.f64 fd413, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd196, fd240; +mul.f64 fd415, fd414, 0dBFE2CF2304755A5E; +sub.f64 fd416, fd415, fd413; +sub.f64 fd417, fd411, fd416; +add.f64 fd418, fd416, fd411; 
+mul.f64 fd419, fd401, 0d3FE9E3779B97F4A8; +sub.f64 fd420, fd104, fd419; +fma.rn.f64 fd421, fd403, 0d3FD3C6EF372FE950, fd420; +mul.f64 fd422, fd412, 0d3FE2CF2304755A5E; +mul.f64 fd423, fd414, 0d3FEE6F0E134454FF; +sub.f64 fd424, fd423, fd422; +sub.f64 fd425, fd421, fd424; +add.f64 fd426, fd424, fd421; +fma.rn.f64 fd427, fd405, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd428, fd407, 0d3FE9E3779B97F4A8; +sub.f64 fd429, fd427, fd428; +sub.f64 fd430, fd148, fd280; +mul.f64 fd431, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd432, fd192, fd236; +mul.f64 fd433, fd432, 0dBFE2CF2304755A5E; +sub.f64 fd434, fd433, fd431; +add.f64 fd435, fd434, fd429; +sub.f64 fd436, fd429, fd434; +mul.f64 fd437, fd405, 0d3FE9E3779B97F4A8; +sub.f64 fd438, fd108, fd437; +fma.rn.f64 fd439, fd407, 0d3FD3C6EF372FE950, fd438; +mul.f64 fd440, fd430, 0d3FE2CF2304755A5E; +mul.f64 fd441, fd432, 0d3FEE6F0E134454FF; +sub.f64 fd442, fd441, fd440; +add.f64 fd443, fd442, fd439; +sub.f64 fd444, fd439, fd442; +add.f64 fd445, fd323, fd338; +add.f64 fd446, fd117, fd445; +add.f64 fd447, fd328, fd333; +add.f64 fd448, fd447, fd446; +add.f64 fd449, fd325, fd340; +add.f64 fd450, fd135, fd449; +add.f64 fd451, fd330, fd335; +add.f64 fd452, fd451, fd450; +fma.rn.f64 fd453, fd445, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd454, fd447, 0d3FE9E3779B97F4A8; +sub.f64 fd455, fd453, fd454; +sub.f64 fd456, fd325, fd340; +mul.f64 fd457, fd456, 0d3FEE6F0E134454FF; +sub.f64 fd458, fd330, fd335; +mul.f64 fd459, fd458, 0dBFE2CF2304755A5E; +sub.f64 fd460, fd459, fd457; +sub.f64 fd461, fd455, fd460; +add.f64 fd462, fd460, fd455; +mul.f64 fd463, fd445, 0d3FE9E3779B97F4A8; +sub.f64 fd464, fd117, fd463; +fma.rn.f64 fd465, fd447, 0d3FD3C6EF372FE950, fd464; +mul.f64 fd466, fd456, 0d3FE2CF2304755A5E; +mul.f64 fd467, fd458, 0d3FEE6F0E134454FF; +sub.f64 fd468, fd467, fd466; +sub.f64 fd469, fd465, fd468; +add.f64 fd470, fd468, fd465; +fma.rn.f64 fd471, fd449, 0d3FD3C6EF372FE950, fd135; +mul.f64 fd472, fd451, 0d3FE9E3779B97F4A8; +sub.f64 fd473, fd471, fd472; 
+sub.f64 fd474, fd323, fd338; +mul.f64 fd475, fd474, 0d3FEE6F0E134454FF; +sub.f64 fd476, fd328, fd333; +mul.f64 fd477, fd476, 0dBFE2CF2304755A5E; +sub.f64 fd478, fd477, fd475; +add.f64 fd479, fd478, fd473; +sub.f64 fd480, fd473, fd478; +mul.f64 fd481, fd449, 0d3FE9E3779B97F4A8; +sub.f64 fd482, fd135, fd481; +fma.rn.f64 fd483, fd451, 0d3FD3C6EF372FE950, fd482; +mul.f64 fd484, fd474, 0d3FE2CF2304755A5E; +mul.f64 fd485, fd476, 0d3FEE6F0E134454FF; +sub.f64 fd486, fd485, fd484; +add.f64 fd487, fd486, fd483; +sub.f64 fd488, fd483, fd486; +add.f64 fd489, fd343, fd358; +add.f64 fd490, fd125, fd489; +add.f64 fd491, fd348, fd353; +add.f64 fd492, fd491, fd490; +add.f64 fd493, fd345, fd360; +add.f64 fd494, fd143, fd493; +add.f64 fd495, fd350, fd355; +add.f64 fd496, fd495, fd494; +fma.rn.f64 fd497, fd489, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd498, fd491, 0d3FE9E3779B97F4A8; +sub.f64 fd499, fd497, fd498; +sub.f64 fd500, fd345, fd360; +mul.f64 fd501, fd500, 0d3FEE6F0E134454FF; +sub.f64 fd502, fd350, fd355; +mul.f64 fd503, fd502, 0dBFE2CF2304755A5E; +sub.f64 fd504, fd503, fd501; +sub.f64 fd505, fd499, fd504; +add.f64 fd506, fd504, fd499; +mul.f64 fd507, fd489, 0d3FE9E3779B97F4A8; +sub.f64 fd508, fd125, fd507; +fma.rn.f64 fd509, fd491, 0d3FD3C6EF372FE950, fd508; +mul.f64 fd510, fd500, 0d3FE2CF2304755A5E; +mul.f64 fd511, fd502, 0d3FEE6F0E134454FF; +sub.f64 fd512, fd511, fd510; +sub.f64 fd513, fd509, fd512; +add.f64 fd514, fd512, fd509; +fma.rn.f64 fd515, fd493, 0d3FD3C6EF372FE950, fd143; +mul.f64 fd516, fd495, 0d3FE9E3779B97F4A8; +sub.f64 fd517, fd515, fd516; +sub.f64 fd518, fd343, fd358; +mul.f64 fd519, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd520, fd348, fd353; +mul.f64 fd521, fd520, 0dBFE2CF2304755A5E; +sub.f64 fd522, fd521, fd519; +add.f64 fd523, fd522, fd517; +sub.f64 fd524, fd517, fd522; +mul.f64 fd525, fd493, 0d3FE9E3779B97F4A8; +sub.f64 fd526, fd143, fd525; +fma.rn.f64 fd527, fd495, 0d3FD3C6EF372FE950, fd526; +mul.f64 fd528, fd518, 0d3FE2CF2304755A5E; +mul.f64 fd529, fd520, 
0d3FEE6F0E134454FF; +sub.f64 fd530, fd529, fd528; +add.f64 fd531, fd530, fd527; +sub.f64 fd532, fd527, fd530; +add.f64 fd533, fd363, fd378; +add.f64 fd534, fd126, fd533; +add.f64 fd535, fd368, fd373; +add.f64 fd536, fd535, fd534; +add.f64 fd537, fd365, fd380; +add.f64 fd538, fd144, fd537; +add.f64 fd539, fd370, fd375; +add.f64 fd540, fd539, fd538; +fma.rn.f64 fd541, fd533, 0d3FD3C6EF372FE950, fd126; +mul.f64 fd542, fd535, 0d3FE9E3779B97F4A8; +sub.f64 fd543, fd541, fd542; +sub.f64 fd544, fd365, fd380; +mul.f64 fd545, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd546, fd370, fd375; +mul.f64 fd547, fd546, 0dBFE2CF2304755A5E; +sub.f64 fd548, fd547, fd545; +sub.f64 fd549, fd543, fd548; +add.f64 fd550, fd548, fd543; +mul.f64 fd551, fd533, 0d3FE9E3779B97F4A8; +sub.f64 fd552, fd126, fd551; +fma.rn.f64 fd553, fd535, 0d3FD3C6EF372FE950, fd552; +mul.f64 fd554, fd544, 0d3FE2CF2304755A5E; +mul.f64 fd555, fd546, 0d3FEE6F0E134454FF; +sub.f64 fd556, fd555, fd554; +sub.f64 fd557, fd553, fd556; +add.f64 fd558, fd556, fd553; +fma.rn.f64 fd559, fd537, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd560, fd539, 0d3FE9E3779B97F4A8; +sub.f64 fd561, fd559, fd560; +sub.f64 fd562, fd363, fd378; +mul.f64 fd563, fd562, 0d3FEE6F0E134454FF; +sub.f64 fd564, fd368, fd373; +mul.f64 fd565, fd564, 0dBFE2CF2304755A5E; +sub.f64 fd566, fd565, fd563; +add.f64 fd567, fd566, fd561; +sub.f64 fd568, fd561, fd566; +mul.f64 fd569, fd537, 0d3FE9E3779B97F4A8; +sub.f64 fd570, fd144, fd569; +fma.rn.f64 fd571, fd539, 0d3FD3C6EF372FE950, fd570; +mul.f64 fd572, fd562, 0d3FE2CF2304755A5E; +mul.f64 fd573, fd564, 0d3FEE6F0E134454FF; +sub.f64 fd574, fd573, fd572; +add.f64 fd575, fd574, fd571; +sub.f64 fd576, fd571, fd574; +add.f64 fd577, fd383, fd398; +add.f64 fd578, fd118, fd577; +add.f64 fd579, fd388, fd393; +add.f64 fd580, fd579, fd578; +add.f64 fd581, fd385, fd400; +add.f64 fd582, fd136, fd581; +add.f64 fd583, fd390, fd395; +add.f64 fd584, fd583, fd582; +fma.rn.f64 fd585, fd577, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd586, fd579, 
0d3FE9E3779B97F4A8; +sub.f64 fd587, fd585, fd586; +sub.f64 fd588, fd385, fd400; +mul.f64 fd589, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd590, fd390, fd395; +mul.f64 fd591, fd590, 0dBFE2CF2304755A5E; +sub.f64 fd592, fd591, fd589; +sub.f64 fd593, fd587, fd592; +add.f64 fd594, fd592, fd587; +mul.f64 fd595, fd577, 0d3FE9E3779B97F4A8; +sub.f64 fd596, fd118, fd595; +fma.rn.f64 fd597, fd579, 0d3FD3C6EF372FE950, fd596; +mul.f64 fd598, fd588, 0d3FE2CF2304755A5E; +mul.f64 fd599, fd590, 0d3FEE6F0E134454FF; +sub.f64 fd600, fd599, fd598; +sub.f64 fd601, fd597, fd600; +add.f64 fd602, fd600, fd597; +fma.rn.f64 fd603, fd581, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd604, fd583, 0d3FE9E3779B97F4A8; +sub.f64 fd605, fd603, fd604; +sub.f64 fd606, fd383, fd398; +mul.f64 fd607, fd606, 0d3FEE6F0E134454FF; +sub.f64 fd608, fd388, fd393; +mul.f64 fd609, fd608, 0dBFE2CF2304755A5E; +sub.f64 fd610, fd609, fd607; +add.f64 fd611, fd610, fd605; +sub.f64 fd612, fd605, fd610; +mul.f64 fd613, fd581, 0d3FE9E3779B97F4A8; +sub.f64 fd614, fd136, fd613; +fma.rn.f64 fd615, fd583, 0d3FD3C6EF372FE950, fd614; +mul.f64 fd616, fd606, 0d3FE2CF2304755A5E; +mul.f64 fd617, fd608, 0d3FEE6F0E134454FF; +sub.f64 fd618, fd617, fd616; +add.f64 fd619, fd618, fd615; +sub.f64 fd620, fd615, fd618; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd621, fd622}, [rd6]; +mul.f64 fd625, fd621, fd448; +mul.f64 fd626, fd622, fd452; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd621, fd452; +fma.rn.f64 fd629, fd622, fd448, fd628; +mul.f64 fd630, fd621, fd621; +mul.f64 fd631, fd622, fd622; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd622, fd621; +fma.rn.f64 fd634, fd622, fd621, fd633; +mul.f64 fd635, fd632, fd492; +mul.f64 fd636, fd634, fd496; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd632, fd496; +fma.rn.f64 fd639, fd634, fd492, fd638; +mul.f64 fd640, fd621, fd632; 
+mul.f64 fd641, fd622, fd634; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd621, fd634; +fma.rn.f64 fd644, fd622, fd632, fd643; +mul.f64 fd645, fd642, fd536; +mul.f64 fd646, fd644, fd540; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd642, fd540; +fma.rn.f64 fd649, fd644, fd536, fd648; +mul.f64 fd650, fd621, fd642; +mul.f64 fd651, fd622, fd644; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd621, fd644; +fma.rn.f64 fd654, fd622, fd642, fd653; +mul.f64 fd655, fd652, fd580; +mul.f64 fd656, fd654, fd584; +sub.f64 fd657, fd655, fd656; +mul.f64 fd658, fd652, fd584; +fma.rn.f64 fd659, fd654, fd580, fd658; +mul.f64 fd660, fd621, fd652; +mul.f64 fd661, fd622, fd654; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd621, fd654; +fma.rn.f64 fd664, fd622, fd652, fd663; +mul.f64 fd665, fd662, fd417; +mul.f64 fd666, fd664, fd435; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd662, fd435; +fma.rn.f64 fd669, fd664, fd417, fd668; +mul.f64 fd670, fd621, fd662; +mul.f64 fd671, fd622, fd664; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd621, fd664; +fma.rn.f64 fd674, fd622, fd662, fd673; +mul.f64 fd675, fd672, fd461; +mul.f64 fd676, fd674, fd479; +sub.f64 fd677, fd675, fd676; +mul.f64 fd678, fd672, fd479; +fma.rn.f64 fd679, fd674, fd461, fd678; +mul.f64 fd680, fd621, fd672; +mul.f64 fd681, fd622, fd674; +sub.f64 fd682, fd680, fd681; +mul.f64 fd683, fd621, fd674; +fma.rn.f64 fd684, fd622, fd672, fd683; +mul.f64 fd685, fd682, fd505; +mul.f64 fd686, fd684, fd523; +sub.f64 fd687, fd685, fd686; +mul.f64 fd688, fd682, fd523; +fma.rn.f64 fd689, fd684, fd505, fd688; +mul.f64 fd690, fd621, fd682; +mul.f64 fd691, fd622, fd684; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd621, fd684; +fma.rn.f64 fd694, fd622, fd682, fd693; +mul.f64 fd695, fd692, fd549; +mul.f64 fd696, fd694, fd567; +sub.f64 fd697, fd695, fd696; +mul.f64 fd698, fd692, fd567; +fma.rn.f64 fd699, fd694, fd549, fd698; +mul.f64 fd700, fd621, fd692; +mul.f64 fd701, fd622, fd694; +sub.f64 fd702, fd700, fd701; +mul.f64 fd703, 
fd621, fd694; +fma.rn.f64 fd704, fd622, fd692, fd703; +mul.f64 fd705, fd702, fd593; +mul.f64 fd706, fd704, fd611; +sub.f64 fd707, fd705, fd706; +mul.f64 fd708, fd702, fd611; +fma.rn.f64 fd709, fd704, fd593, fd708; +mul.f64 fd710, fd621, fd702; +mul.f64 fd711, fd622, fd704; +sub.f64 fd712, fd710, fd711; +mul.f64 fd713, fd621, fd704; +fma.rn.f64 fd714, fd622, fd702, fd713; +mul.f64 fd715, fd712, fd425; +mul.f64 fd716, fd714, fd443; +sub.f64 fd717, fd715, fd716; +mul.f64 fd718, fd712, fd443; +fma.rn.f64 fd719, fd714, fd425, fd718; +mul.f64 fd720, fd621, fd712; +mul.f64 fd721, fd622, fd714; +sub.f64 fd722, fd720, fd721; +mul.f64 fd723, fd621, fd714; +fma.rn.f64 fd724, fd622, fd712, fd723; +mul.f64 fd725, fd722, fd469; +mul.f64 fd726, fd724, fd487; +sub.f64 fd727, fd725, fd726; +mul.f64 fd728, fd722, fd487; +fma.rn.f64 fd729, fd724, fd469, fd728; +mul.f64 fd730, fd621, fd722; +mul.f64 fd731, fd622, fd724; +sub.f64 fd732, fd730, fd731; +mul.f64 fd733, fd621, fd724; +fma.rn.f64 fd734, fd622, fd722, fd733; +mul.f64 fd735, fd732, fd513; +mul.f64 fd736, fd734, fd531; +sub.f64 fd737, fd735, fd736; +mul.f64 fd738, fd732, fd531; +fma.rn.f64 fd739, fd734, fd513, fd738; +ld.global.v2.f64 {fd740, fd741}, [rd6+400]; +mul.f64 fd744, fd740, fd557; +mul.f64 fd745, fd741, fd575; +sub.f64 fd746, fd744, fd745; +mul.f64 fd747, fd740, fd575; +fma.rn.f64 fd748, fd741, fd557, fd747; +mul.f64 fd749, fd621, fd740; +mul.f64 fd750, fd622, fd741; +sub.f64 fd751, fd749, fd750; +mul.f64 fd752, fd621, fd741; +fma.rn.f64 fd753, fd622, fd740, fd752; +mul.f64 fd754, fd751, fd601; +mul.f64 fd755, fd753, fd619; +sub.f64 fd756, fd754, fd755; +mul.f64 fd757, fd751, fd619; +fma.rn.f64 fd758, fd753, fd601, fd757; +mul.f64 fd759, fd621, fd751; +mul.f64 fd760, fd622, fd753; +sub.f64 fd761, fd759, fd760; +mul.f64 fd762, fd621, fd753; +fma.rn.f64 fd763, fd622, fd751, fd762; +mul.f64 fd764, fd761, fd426; +mul.f64 fd765, fd763, fd444; +sub.f64 fd766, fd764, fd765; +mul.f64 fd767, fd761, fd444; +fma.rn.f64 fd768, 
fd763, fd426, fd767; +mul.f64 fd769, fd621, fd761; +mul.f64 fd770, fd622, fd763; +sub.f64 fd771, fd769, fd770; +mul.f64 fd772, fd621, fd763; +fma.rn.f64 fd773, fd622, fd761, fd772; +mul.f64 fd774, fd771, fd470; +mul.f64 fd775, fd773, fd488; +sub.f64 fd776, fd774, fd775; +mul.f64 fd777, fd771, fd488; +fma.rn.f64 fd778, fd773, fd470, fd777; +mul.f64 fd779, fd621, fd771; +mul.f64 fd780, fd622, fd773; +sub.f64 fd781, fd779, fd780; +mul.f64 fd782, fd621, fd773; +fma.rn.f64 fd783, fd622, fd771, fd782; +mul.f64 fd784, fd781, fd514; +mul.f64 fd785, fd783, fd532; +sub.f64 fd786, fd784, fd785; +mul.f64 fd787, fd781, fd532; +fma.rn.f64 fd788, fd783, fd514, fd787; +mul.f64 fd789, fd621, fd781; +mul.f64 fd790, fd622, fd783; +sub.f64 fd791, fd789, fd790; +mul.f64 fd792, fd621, fd783; +fma.rn.f64 fd793, fd622, fd781, fd792; +mul.f64 fd794, fd791, fd558; +mul.f64 fd795, fd793, fd576; +sub.f64 fd796, fd794, fd795; +mul.f64 fd797, fd791, fd576; +fma.rn.f64 fd798, fd793, fd558, fd797; +mul.f64 fd799, fd621, fd791; +mul.f64 fd800, fd622, fd793; +sub.f64 fd801, fd799, fd800; +mul.f64 fd802, fd621, fd793; +fma.rn.f64 fd803, fd622, fd791, fd802; +mul.f64 fd804, fd801, fd602; +mul.f64 fd805, fd803, fd620; +sub.f64 fd806, fd804, fd805; +mul.f64 fd807, fd801, fd620; +fma.rn.f64 fd808, fd803, fd602, fd807; +mul.f64 fd809, fd621, fd801; +mul.f64 fd810, fd622, fd803; +sub.f64 fd811, fd809, fd810; +mul.f64 fd812, fd621, fd803; +fma.rn.f64 fd813, fd622, fd801, fd812; +mul.f64 fd814, fd811, fd418; +mul.f64 fd815, fd813, fd436; +sub.f64 fd816, fd814, fd815; +mul.f64 fd817, fd811, fd436; +fma.rn.f64 fd818, fd813, fd418, fd817; +mul.f64 fd819, fd621, fd811; +mul.f64 fd820, fd622, fd813; +sub.f64 fd821, fd819, fd820; +mul.f64 fd822, fd621, fd813; +fma.rn.f64 fd823, fd622, fd811, fd822; +mul.f64 fd824, fd821, fd462; +mul.f64 fd825, fd823, fd480; +sub.f64 fd826, fd824, fd825; +mul.f64 fd827, fd821, fd480; +fma.rn.f64 fd828, fd823, fd462, fd827; +mul.f64 fd829, fd621, fd821; +mul.f64 fd830, fd622, 
fd823; +sub.f64 fd831, fd829, fd830; +mul.f64 fd832, fd621, fd823; +fma.rn.f64 fd833, fd622, fd821, fd832; +mul.f64 fd834, fd831, fd506; +mul.f64 fd835, fd833, fd524; +sub.f64 fd836, fd834, fd835; +mul.f64 fd837, fd831, fd524; +fma.rn.f64 fd838, fd833, fd506, fd837; +mul.f64 fd839, fd621, fd831; +mul.f64 fd840, fd622, fd833; +sub.f64 fd841, fd839, fd840; +mul.f64 fd842, fd621, fd833; +fma.rn.f64 fd843, fd622, fd831, fd842; +mul.f64 fd844, fd841, fd550; +mul.f64 fd845, fd843, fd568; +sub.f64 fd846, fd844, fd845; +mul.f64 fd847, fd841, fd568; +fma.rn.f64 fd848, fd843, fd550, fd847; +mul.f64 fd849, fd621, fd841; +mul.f64 fd850, fd622, fd843; +sub.f64 fd851, fd849, fd850; +mul.f64 fd852, fd621, fd843; +fma.rn.f64 fd853, fd622, fd841, fd852; +mul.f64 fd854, fd851, fd594; +mul.f64 fd855, fd853, fd612; +sub.f64 fd856, fd854, fd855; +mul.f64 fd857, fd851, fd612; +fma.rn.f64 fd858, fd853, fd594, fd857; +mad.lo.s32 r8, r5, 5000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +st.shared.f64 [r9], fd404; +st.shared.f64 [r9+8], fd627; +st.shared.f64 [r9+16], fd637; +st.shared.f64 [r9+24], fd647; +st.shared.f64 [r9+32], fd657; +st.shared.f64 [r9+40], fd667; +st.shared.f64 [r9+48], fd677; +st.shared.f64 [r9+56], fd687; +st.shared.f64 [r9+64], fd697; +st.shared.f64 [r9+72], fd707; +st.shared.f64 [r9+80], fd717; +st.shared.f64 [r9+88], fd727; +st.shared.f64 [r9+96], fd737; +st.shared.f64 [r9+104], fd746; +st.shared.f64 [r9+112], fd756; +st.shared.f64 [r9+120], fd766; +st.shared.f64 [r9+128], fd776; +st.shared.f64 [r9+136], fd786; +st.shared.f64 [r9+144], fd796; +st.shared.f64 [r9+152], fd806; +st.shared.f64 [r9+160], fd816; +st.shared.f64 [r9+168], fd826; +st.shared.f64 [r9+176], fd836; +st.shared.f64 [r9+184], fd846; +st.shared.f64 [r9+192], fd856; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.f64 fd859, [r10]; +ld.shared.f64 fd860, [r10+200]; +ld.shared.f64 fd861, [r10+400]; +ld.shared.f64 fd862, [r10+600]; +ld.shared.f64 fd863, [r10+800]; +ld.shared.f64 fd864, 
[r10+1000]; +ld.shared.f64 fd865, [r10+1200]; +ld.shared.f64 fd866, [r10+1400]; +ld.shared.f64 fd867, [r10+1600]; +ld.shared.f64 fd868, [r10+1800]; +ld.shared.f64 fd869, [r10+2000]; +ld.shared.f64 fd870, [r10+2200]; +ld.shared.f64 fd871, [r10+2400]; +ld.shared.f64 fd872, [r10+2600]; +ld.shared.f64 fd873, [r10+2800]; +ld.shared.f64 fd874, [r10+3000]; +ld.shared.f64 fd875, [r10+3200]; +ld.shared.f64 fd876, [r10+3400]; +ld.shared.f64 fd877, [r10+3600]; +ld.shared.f64 fd878, [r10+3800]; +ld.shared.f64 fd879, [r10+4000]; +ld.shared.f64 fd880, [r10+4200]; +ld.shared.f64 fd881, [r10+4400]; +ld.shared.f64 fd882, [r10+4600]; +ld.shared.f64 fd883, [r10+4800]; +barrier.sync 0; +st.shared.f64 [r9], fd408; +st.shared.f64 [r9+8], fd629; +st.shared.f64 [r9+16], fd639; +st.shared.f64 [r9+24], fd649; +st.shared.f64 [r9+32], fd659; +st.shared.f64 [r9+40], fd669; +st.shared.f64 [r9+48], fd679; +st.shared.f64 [r9+56], fd689; +st.shared.f64 [r9+64], fd699; +st.shared.f64 [r9+72], fd709; +st.shared.f64 [r9+80], fd719; +st.shared.f64 [r9+88], fd729; +st.shared.f64 [r9+96], fd739; +st.shared.f64 [r9+104], fd748; +st.shared.f64 [r9+112], fd758; +st.shared.f64 [r9+120], fd768; +st.shared.f64 [r9+128], fd778; +st.shared.f64 [r9+136], fd788; +st.shared.f64 [r9+144], fd798; +st.shared.f64 [r9+152], fd808; +st.shared.f64 [r9+160], fd818; +st.shared.f64 [r9+168], fd828; +st.shared.f64 [r9+176], fd838; +st.shared.f64 [r9+184], fd848; +st.shared.f64 [r9+192], fd858; +barrier.sync 0; +ld.shared.f64 fd884, [r10]; +ld.shared.f64 fd885, [r10+200]; +ld.shared.f64 fd886, [r10+400]; +ld.shared.f64 fd887, [r10+600]; +ld.shared.f64 fd888, [r10+800]; +ld.shared.f64 fd889, [r10+1000]; +ld.shared.f64 fd890, [r10+1200]; +ld.shared.f64 fd891, [r10+1400]; +ld.shared.f64 fd892, [r10+1600]; +ld.shared.f64 fd893, [r10+1800]; +ld.shared.f64 fd894, [r10+2000]; +ld.shared.f64 fd895, [r10+2200]; +ld.shared.f64 fd896, [r10+2400]; +ld.shared.f64 fd897, [r10+2600]; +ld.shared.f64 fd898, [r10+2800]; +ld.shared.f64 fd899, 
[r10+3000]; +ld.shared.f64 fd900, [r10+3200]; +ld.shared.f64 fd901, [r10+3400]; +ld.shared.f64 fd902, [r10+3600]; +ld.shared.f64 fd903, [r10+3800]; +ld.shared.f64 fd904, [r10+4000]; +ld.shared.f64 fd905, [r10+4200]; +ld.shared.f64 fd906, [r10+4400]; +ld.shared.f64 fd907, [r10+4600]; +ld.shared.f64 fd908, [r10+4800]; +add.f64 fd909, fd864, fd879; +add.f64 fd910, fd859, fd909; +add.f64 fd911, fd869, fd874; +add.f64 fd912, fd911, fd910; +add.f64 fd913, fd889, fd904; +add.f64 fd914, fd884, fd913; +add.f64 fd915, fd894, fd899; +add.f64 fd916, fd915, fd914; +fma.rn.f64 fd917, fd909, 0d3FD3C6EF372FE950, fd859; +mul.f64 fd918, fd911, 0d3FE9E3779B97F4A8; +sub.f64 fd919, fd917, fd918; +sub.f64 fd920, fd889, fd904; +mul.f64 fd921, fd920, 0d3FEE6F0E134454FF; +sub.f64 fd922, fd894, fd899; +mul.f64 fd923, fd922, 0dBFE2CF2304755A5E; +sub.f64 fd924, fd923, fd921; +sub.f64 fd925, fd919, fd924; +add.f64 fd926, fd924, fd919; +mul.f64 fd927, fd909, 0d3FE9E3779B97F4A8; +sub.f64 fd928, fd859, fd927; +fma.rn.f64 fd929, fd911, 0d3FD3C6EF372FE950, fd928; +mul.f64 fd930, fd920, 0d3FE2CF2304755A5E; +mul.f64 fd931, fd922, 0d3FEE6F0E134454FF; +sub.f64 fd932, fd931, fd930; +sub.f64 fd933, fd929, fd932; +add.f64 fd934, fd932, fd929; +fma.rn.f64 fd935, fd913, 0d3FD3C6EF372FE950, fd884; +mul.f64 fd936, fd915, 0d3FE9E3779B97F4A8; +sub.f64 fd937, fd935, fd936; +sub.f64 fd938, fd864, fd879; +mul.f64 fd939, fd938, 0d3FEE6F0E134454FF; +sub.f64 fd940, fd869, fd874; +mul.f64 fd941, fd940, 0dBFE2CF2304755A5E; +sub.f64 fd942, fd941, fd939; +add.f64 fd943, fd942, fd937; +sub.f64 fd944, fd937, fd942; +mul.f64 fd945, fd913, 0d3FE9E3779B97F4A8; +sub.f64 fd946, fd884, fd945; +fma.rn.f64 fd947, fd915, 0d3FD3C6EF372FE950, fd946; +mul.f64 fd948, fd938, 0d3FE2CF2304755A5E; +mul.f64 fd949, fd940, 0d3FEE6F0E134454FF; +sub.f64 fd950, fd949, fd948; +add.f64 fd951, fd950, fd947; +sub.f64 fd952, fd947, fd950; +add.f64 fd953, fd865, fd880; +add.f64 fd954, fd860, fd953; +add.f64 fd955, fd870, fd875; +add.f64 fd956, fd955, 
fd954; +add.f64 fd957, fd890, fd905; +add.f64 fd958, fd885, fd957; +add.f64 fd959, fd895, fd900; +add.f64 fd960, fd959, fd958; +fma.rn.f64 fd961, fd953, 0d3FD3C6EF372FE950, fd860; +mul.f64 fd962, fd955, 0d3FE9E3779B97F4A8; +sub.f64 fd963, fd961, fd962; +sub.f64 fd964, fd890, fd905; +mul.f64 fd965, fd964, 0d3FEE6F0E134454FF; +sub.f64 fd966, fd895, fd900; +mul.f64 fd967, fd966, 0dBFE2CF2304755A5E; +sub.f64 fd968, fd967, fd965; +sub.f64 fd969, fd963, fd968; +add.f64 fd970, fd968, fd963; +mul.f64 fd971, fd953, 0d3FE9E3779B97F4A8; +sub.f64 fd972, fd860, fd971; +fma.rn.f64 fd973, fd955, 0d3FD3C6EF372FE950, fd972; +mul.f64 fd974, fd964, 0d3FE2CF2304755A5E; +mul.f64 fd975, fd966, 0d3FEE6F0E134454FF; +sub.f64 fd976, fd975, fd974; +sub.f64 fd977, fd973, fd976; +add.f64 fd978, fd976, fd973; +fma.rn.f64 fd979, fd957, 0d3FD3C6EF372FE950, fd885; +mul.f64 fd980, fd959, 0d3FE9E3779B97F4A8; +sub.f64 fd981, fd979, fd980; +sub.f64 fd982, fd865, fd880; +mul.f64 fd983, fd982, 0d3FEE6F0E134454FF; +sub.f64 fd984, fd870, fd875; +mul.f64 fd985, fd984, 0dBFE2CF2304755A5E; +sub.f64 fd986, fd985, fd983; +add.f64 fd987, fd986, fd981; +sub.f64 fd988, fd981, fd986; +mul.f64 fd989, fd957, 0d3FE9E3779B97F4A8; +sub.f64 fd990, fd885, fd989; +fma.rn.f64 fd991, fd959, 0d3FD3C6EF372FE950, fd990; +mul.f64 fd992, fd982, 0d3FE2CF2304755A5E; +mul.f64 fd993, fd984, 0d3FEE6F0E134454FF; +sub.f64 fd994, fd993, fd992; +add.f64 fd995, fd994, fd991; +sub.f64 fd996, fd991, fd994; +add.f64 fd997, fd866, fd881; +add.f64 fd998, fd861, fd997; +add.f64 fd999, fd871, fd876; +add.f64 fd1000, fd999, fd998; +add.f64 fd1001, fd891, fd906; +add.f64 fd1002, fd886, fd1001; +add.f64 fd1003, fd896, fd901; +add.f64 fd1004, fd1003, fd1002; +fma.rn.f64 fd1005, fd997, 0d3FD3C6EF372FE950, fd861; +mul.f64 fd1006, fd999, 0d3FE9E3779B97F4A8; +sub.f64 fd1007, fd1005, fd1006; +sub.f64 fd1008, fd891, fd906; +mul.f64 fd1009, fd1008, 0d3FEE6F0E134454FF; +sub.f64 fd1010, fd896, fd901; +mul.f64 fd1011, fd1010, 0dBFE2CF2304755A5E; +sub.f64 
fd1012, fd1011, fd1009; +sub.f64 fd1013, fd1007, fd1012; +add.f64 fd1014, fd1012, fd1007; +mul.f64 fd1015, fd997, 0d3FE9E3779B97F4A8; +sub.f64 fd1016, fd861, fd1015; +fma.rn.f64 fd1017, fd999, 0d3FD3C6EF372FE950, fd1016; +mul.f64 fd1018, fd1008, 0d3FE2CF2304755A5E; +mul.f64 fd1019, fd1010, 0d3FEE6F0E134454FF; +sub.f64 fd1020, fd1019, fd1018; +sub.f64 fd1021, fd1017, fd1020; +add.f64 fd1022, fd1020, fd1017; +fma.rn.f64 fd1023, fd1001, 0d3FD3C6EF372FE950, fd886; +mul.f64 fd1024, fd1003, 0d3FE9E3779B97F4A8; +sub.f64 fd1025, fd1023, fd1024; +sub.f64 fd1026, fd866, fd881; +mul.f64 fd1027, fd1026, 0d3FEE6F0E134454FF; +sub.f64 fd1028, fd871, fd876; +mul.f64 fd1029, fd1028, 0dBFE2CF2304755A5E; +sub.f64 fd1030, fd1029, fd1027; +add.f64 fd1031, fd1030, fd1025; +sub.f64 fd1032, fd1025, fd1030; +mul.f64 fd1033, fd1001, 0d3FE9E3779B97F4A8; +sub.f64 fd1034, fd886, fd1033; +fma.rn.f64 fd1035, fd1003, 0d3FD3C6EF372FE950, fd1034; +mul.f64 fd1036, fd1026, 0d3FE2CF2304755A5E; +mul.f64 fd1037, fd1028, 0d3FEE6F0E134454FF; +sub.f64 fd1038, fd1037, fd1036; +add.f64 fd1039, fd1038, fd1035; +sub.f64 fd1040, fd1035, fd1038; +add.f64 fd1041, fd867, fd882; +add.f64 fd1042, fd862, fd1041; +add.f64 fd1043, fd872, fd877; +add.f64 fd1044, fd1043, fd1042; +add.f64 fd1045, fd892, fd907; +add.f64 fd1046, fd887, fd1045; +add.f64 fd1047, fd897, fd902; +add.f64 fd1048, fd1047, fd1046; +fma.rn.f64 fd1049, fd1041, 0d3FD3C6EF372FE950, fd862; +mul.f64 fd1050, fd1043, 0d3FE9E3779B97F4A8; +sub.f64 fd1051, fd1049, fd1050; +sub.f64 fd1052, fd892, fd907; +mul.f64 fd1053, fd1052, 0d3FEE6F0E134454FF; +sub.f64 fd1054, fd897, fd902; +mul.f64 fd1055, fd1054, 0dBFE2CF2304755A5E; +sub.f64 fd1056, fd1055, fd1053; +sub.f64 fd1057, fd1051, fd1056; +add.f64 fd1058, fd1056, fd1051; +mul.f64 fd1059, fd1041, 0d3FE9E3779B97F4A8; +sub.f64 fd1060, fd862, fd1059; +fma.rn.f64 fd1061, fd1043, 0d3FD3C6EF372FE950, fd1060; +mul.f64 fd1062, fd1052, 0d3FE2CF2304755A5E; +mul.f64 fd1063, fd1054, 0d3FEE6F0E134454FF; +sub.f64 fd1064, 
fd1063, fd1062; +sub.f64 fd1065, fd1061, fd1064; +add.f64 fd1066, fd1064, fd1061; +fma.rn.f64 fd1067, fd1045, 0d3FD3C6EF372FE950, fd887; +mul.f64 fd1068, fd1047, 0d3FE9E3779B97F4A8; +sub.f64 fd1069, fd1067, fd1068; +sub.f64 fd1070, fd867, fd882; +mul.f64 fd1071, fd1070, 0d3FEE6F0E134454FF; +sub.f64 fd1072, fd872, fd877; +mul.f64 fd1073, fd1072, 0dBFE2CF2304755A5E; +sub.f64 fd1074, fd1073, fd1071; +add.f64 fd1075, fd1074, fd1069; +sub.f64 fd1076, fd1069, fd1074; +mul.f64 fd1077, fd1045, 0d3FE9E3779B97F4A8; +sub.f64 fd1078, fd887, fd1077; +fma.rn.f64 fd1079, fd1047, 0d3FD3C6EF372FE950, fd1078; +mul.f64 fd1080, fd1070, 0d3FE2CF2304755A5E; +mul.f64 fd1081, fd1072, 0d3FEE6F0E134454FF; +sub.f64 fd1082, fd1081, fd1080; +add.f64 fd1083, fd1082, fd1079; +sub.f64 fd1084, fd1079, fd1082; +add.f64 fd1085, fd868, fd883; +add.f64 fd1086, fd863, fd1085; +add.f64 fd1087, fd873, fd878; +add.f64 fd1088, fd1087, fd1086; +add.f64 fd1089, fd893, fd908; +add.f64 fd1090, fd888, fd1089; +add.f64 fd1091, fd898, fd903; +add.f64 fd1092, fd1091, fd1090; +fma.rn.f64 fd1093, fd1085, 0d3FD3C6EF372FE950, fd863; +mul.f64 fd1094, fd1087, 0d3FE9E3779B97F4A8; +sub.f64 fd1095, fd1093, fd1094; +sub.f64 fd1096, fd893, fd908; +mul.f64 fd1097, fd1096, 0d3FEE6F0E134454FF; +sub.f64 fd1098, fd898, fd903; +mul.f64 fd1099, fd1098, 0dBFE2CF2304755A5E; +sub.f64 fd1100, fd1099, fd1097; +sub.f64 fd1101, fd1095, fd1100; +add.f64 fd1102, fd1100, fd1095; +mul.f64 fd1103, fd1085, 0d3FE9E3779B97F4A8; +sub.f64 fd1104, fd863, fd1103; +fma.rn.f64 fd1105, fd1087, 0d3FD3C6EF372FE950, fd1104; +mul.f64 fd1106, fd1096, 0d3FE2CF2304755A5E; +mul.f64 fd1107, fd1098, 0d3FEE6F0E134454FF; +sub.f64 fd1108, fd1107, fd1106; +sub.f64 fd1109, fd1105, fd1108; +add.f64 fd1110, fd1108, fd1105; +fma.rn.f64 fd1111, fd1089, 0d3FD3C6EF372FE950, fd888; +mul.f64 fd1112, fd1091, 0d3FE9E3779B97F4A8; +sub.f64 fd1113, fd1111, fd1112; +sub.f64 fd1114, fd868, fd883; +mul.f64 fd1115, fd1114, 0d3FEE6F0E134454FF; +sub.f64 fd1116, fd873, fd878; +mul.f64 
fd1117, fd1116, 0dBFE2CF2304755A5E; +sub.f64 fd1118, fd1117, fd1115; +add.f64 fd1119, fd1118, fd1113; +sub.f64 fd1120, fd1113, fd1118; +mul.f64 fd1121, fd1089, 0d3FE9E3779B97F4A8; +sub.f64 fd1122, fd888, fd1121; +fma.rn.f64 fd1123, fd1091, 0d3FD3C6EF372FE950, fd1122; +mul.f64 fd1124, fd1114, 0d3FE2CF2304755A5E; +mul.f64 fd1125, fd1116, 0d3FEE6F0E134454FF; +sub.f64 fd1126, fd1125, fd1124; +add.f64 fd1127, fd1126, fd1123; +sub.f64 fd1128, fd1123, fd1126; +mul.f64 fd1129, fd969, 0d3FEEFEA21D101EE0; +mul.f64 fd1130, fd987, 0dBFCFD511FA1C0796; +sub.f64 fd1131, fd1129, fd1130; +mul.f64 fd1132, fd987, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd1133, fd969, 0dBFCFD511FA1C0796, fd1132; +mul.f64 fd1134, fd1013, 0d3FEC0AB44E81C059; +mul.f64 fd1135, fd1031, 0dBFDED50D5CBFA951; +sub.f64 fd1136, fd1134, fd1135; +mul.f64 fd1137, fd1031, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1138, fd1013, 0dBFDED50D5CBFA951, fd1137; +mul.f64 fd1139, fd1057, 0d3FE753B603D2B816; +mul.f64 fd1140, fd1075, 0dBFE5E7CF55112014; +sub.f64 fd1141, fd1139, fd1140; +mul.f64 fd1142, fd1075, 0d3FE753B603D2B816; +fma.rn.f64 fd1143, fd1057, 0dBFE5E7CF55112014, fd1142; +mul.f64 fd1144, fd1101, 0d3FE1257E3C182B51; +mul.f64 fd1145, fd1119, 0dBFEB04BBFF642E86; +sub.f64 fd1146, fd1144, fd1145; +mul.f64 fd1147, fd1119, 0d3FE1257E3C182B51; +fma.rn.f64 fd1148, fd1101, 0dBFEB04BBFF642E86, fd1147; +mul.f64 fd1149, fd977, 0d3FEC0AB44E81C059; +mul.f64 fd1150, fd995, 0dBFDED50D5CBFA951; +sub.f64 fd1151, fd1149, fd1150; +mul.f64 fd1152, fd995, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1153, fd977, 0dBFDED50D5CBFA951, fd1152; +mul.f64 fd1154, fd1021, 0d3FE1257E3C182B51; +mul.f64 fd1155, fd1039, 0dBFEB04BBFF642E86; +sub.f64 fd1156, fd1154, fd1155; +mul.f64 fd1157, fd1039, 0d3FE1257E3C182B51; +fma.rn.f64 fd1158, fd1021, 0dBFEB04BBFF642E86, fd1157; +mul.f64 fd1159, fd1065, 0d3FB0130A1BE09379; +mul.f64 fd1160, fd1083, 0dBFEFEFD5BFE443FE; +sub.f64 fd1161, fd1159, fd1160; +mul.f64 fd1162, fd1083, 0d3FB0130A1BE09379; +fma.rn.f64 fd1163, fd1065, 
0dBFEFEFD5BFE443FE, fd1162; +mul.f64 fd1164, fd1109, 0dBFDB3FF7C925819C; +mul.f64 fd1165, fd1127, 0dBFECF457DCDC158C; +sub.f64 fd1166, fd1164, fd1165; +mul.f64 fd1167, fd1127, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1168, fd1109, 0dBFECF457DCDC158C, fd1167; +mul.f64 fd1169, fd978, 0d3FE753B603D2B816; +mul.f64 fd1170, fd996, 0dBFE5E7CF55112014; +sub.f64 fd1171, fd1169, fd1170; +mul.f64 fd1172, fd996, 0d3FE753B603D2B816; +fma.rn.f64 fd1173, fd978, 0dBFE5E7CF55112014, fd1172; +mul.f64 fd1174, fd1022, 0d3FB0130A1BE09379; +mul.f64 fd1175, fd1040, 0dBFEFEFD5BFE443FE; +sub.f64 fd1176, fd1174, fd1175; +mul.f64 fd1177, fd1040, 0d3FB0130A1BE09379; +fma.rn.f64 fd1178, fd1022, 0dBFEFEFD5BFE443FE, fd1177; +mul.f64 fd1179, fd1066, 0dBFE465C6FEB501BC; +mul.f64 fd1180, fd1084, 0dBFE8A80B635B6BEA; +sub.f64 fd1181, fd1179, fd1180; +mul.f64 fd1182, fd1084, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1183, fd1066, 0dBFE8A80B635B6BEA, fd1182; +mul.f64 fd1184, fd1110, 0dBFEFBF675480D903; +mul.f64 fd1185, fd1128, 0dBFC00AEB5DA15BE0; +sub.f64 fd1186, fd1184, fd1185; +mul.f64 fd1187, fd1128, 0dBFEFBF675480D903; +fma.rn.f64 fd1188, fd1110, 0dBFC00AEB5DA15BE0, fd1187; +mul.f64 fd1189, fd970, 0d3FE1257E3C182B51; +mul.f64 fd1190, fd988, 0dBFEB04BBFF642E86; +sub.f64 fd1191, fd1189, fd1190; +mul.f64 fd1192, fd988, 0d3FE1257E3C182B51; +fma.rn.f64 fd1193, fd970, 0dBFEB04BBFF642E86, fd1192; +mul.f64 fd1194, fd1014, 0dBFDB3FF7C925819C; +mul.f64 fd1195, fd1032, 0dBFECF457DCDC158C; +sub.f64 fd1196, fd1194, fd1195; +mul.f64 fd1197, fd1032, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1198, fd1014, 0dBFECF457DCDC158C, fd1197; +mul.f64 fd1199, fd1058, 0dBFEFBF675480D903; +mul.f64 fd1200, fd1076, 0dBFC00AEB5DA15BE0; +sub.f64 fd1201, fd1199, fd1200; +mul.f64 fd1202, fd1076, 0dBFEFBF675480D903; +fma.rn.f64 fd1203, fd1058, 0dBFC00AEB5DA15BE0, fd1202; +mul.f64 fd1204, fd1102, 0dBFE465C6FEB501BC; +mul.f64 fd1205, fd1120, 0d3FE8A80B635B6BEA; +sub.f64 fd1206, fd1204, fd1205; +mul.f64 fd1207, fd1120, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1208, 
fd1102, 0d3FE8A80B635B6BEA, fd1207; +add.f64 fd1209, fd956, fd1088; +add.f64 fd1210, fd912, fd1209; +add.f64 fd1211, fd1000, fd1044; +add.f64 fd1212, fd960, fd1092; +add.f64 fd1213, fd916, fd1212; +add.f64 fd1214, fd1004, fd1048; +fma.rn.f64 fd1215, fd1209, 0d3FD3C6EF372FE950, fd912; +mul.f64 fd1216, fd1211, 0d3FE9E3779B97F4A8; +sub.f64 fd1217, fd1215, fd1216; +sub.f64 fd1218, fd960, fd1092; +mul.f64 fd1219, fd1218, 0d3FEE6F0E134454FF; +sub.f64 fd1220, fd1004, fd1048; +mul.f64 fd1221, fd1220, 0dBFE2CF2304755A5E; +sub.f64 fd1222, fd1221, fd1219; +mul.f64 fd1223, fd1209, 0d3FE9E3779B97F4A8; +sub.f64 fd1224, fd912, fd1223; +fma.rn.f64 fd1225, fd1211, 0d3FD3C6EF372FE950, fd1224; +mul.f64 fd1226, fd1218, 0d3FE2CF2304755A5E; +mul.f64 fd1227, fd1220, 0d3FEE6F0E134454FF; +sub.f64 fd1228, fd1227, fd1226; +fma.rn.f64 fd1229, fd1212, 0d3FD3C6EF372FE950, fd916; +mul.f64 fd1230, fd1214, 0d3FE9E3779B97F4A8; +sub.f64 fd1231, fd1229, fd1230; +sub.f64 fd1232, fd956, fd1088; +mul.f64 fd1233, fd1232, 0d3FEE6F0E134454FF; +sub.f64 fd1234, fd1000, fd1044; +mul.f64 fd1235, fd1234, 0dBFE2CF2304755A5E; +sub.f64 fd1236, fd1235, fd1233; +mul.f64 fd1237, fd1212, 0d3FE9E3779B97F4A8; +sub.f64 fd1238, fd916, fd1237; +fma.rn.f64 fd1239, fd1214, 0d3FD3C6EF372FE950, fd1238; +mul.f64 fd1240, fd1232, 0d3FE2CF2304755A5E; +mul.f64 fd1241, fd1234, 0d3FEE6F0E134454FF; +sub.f64 fd1242, fd1241, fd1240; +add.f64 fd1243, fd1131, fd1146; +add.f64 fd1244, fd925, fd1243; +add.f64 fd1245, fd1136, fd1141; +add.f64 fd1246, fd1133, fd1148; +add.f64 fd1247, fd943, fd1246; +add.f64 fd1248, fd1138, fd1143; +fma.rn.f64 fd1249, fd1243, 0d3FD3C6EF372FE950, fd925; +mul.f64 fd1250, fd1245, 0d3FE9E3779B97F4A8; +sub.f64 fd1251, fd1249, fd1250; +sub.f64 fd1252, fd1133, fd1148; +mul.f64 fd1253, fd1252, 0d3FEE6F0E134454FF; +sub.f64 fd1254, fd1138, fd1143; +mul.f64 fd1255, fd1254, 0dBFE2CF2304755A5E; +sub.f64 fd1256, fd1255, fd1253; +mul.f64 fd1257, fd1243, 0d3FE9E3779B97F4A8; +sub.f64 fd1258, fd925, fd1257; +fma.rn.f64 fd1259, 
fd1245, 0d3FD3C6EF372FE950, fd1258; +mul.f64 fd1260, fd1252, 0d3FE2CF2304755A5E; +mul.f64 fd1261, fd1254, 0d3FEE6F0E134454FF; +sub.f64 fd1262, fd1261, fd1260; +fma.rn.f64 fd1263, fd1246, 0d3FD3C6EF372FE950, fd943; +mul.f64 fd1264, fd1248, 0d3FE9E3779B97F4A8; +sub.f64 fd1265, fd1263, fd1264; +sub.f64 fd1266, fd1131, fd1146; +mul.f64 fd1267, fd1266, 0d3FEE6F0E134454FF; +sub.f64 fd1268, fd1136, fd1141; +mul.f64 fd1269, fd1268, 0dBFE2CF2304755A5E; +sub.f64 fd1270, fd1269, fd1267; +mul.f64 fd1271, fd1246, 0d3FE9E3779B97F4A8; +sub.f64 fd1272, fd943, fd1271; +fma.rn.f64 fd1273, fd1248, 0d3FD3C6EF372FE950, fd1272; +mul.f64 fd1274, fd1266, 0d3FE2CF2304755A5E; +mul.f64 fd1275, fd1268, 0d3FEE6F0E134454FF; +sub.f64 fd1276, fd1275, fd1274; +add.f64 fd1277, fd1151, fd1166; +add.f64 fd1278, fd933, fd1277; +add.f64 fd1279, fd1156, fd1161; +add.f64 fd1280, fd1153, fd1168; +add.f64 fd1281, fd951, fd1280; +add.f64 fd1282, fd1158, fd1163; +fma.rn.f64 fd1283, fd1277, 0d3FD3C6EF372FE950, fd933; +mul.f64 fd1284, fd1279, 0d3FE9E3779B97F4A8; +sub.f64 fd1285, fd1283, fd1284; +sub.f64 fd1286, fd1153, fd1168; +mul.f64 fd1287, fd1286, 0d3FEE6F0E134454FF; +sub.f64 fd1288, fd1158, fd1163; +mul.f64 fd1289, fd1288, 0dBFE2CF2304755A5E; +sub.f64 fd1290, fd1289, fd1287; +mul.f64 fd1291, fd1277, 0d3FE9E3779B97F4A8; +sub.f64 fd1292, fd933, fd1291; +fma.rn.f64 fd1293, fd1279, 0d3FD3C6EF372FE950, fd1292; +mul.f64 fd1294, fd1286, 0d3FE2CF2304755A5E; +mul.f64 fd1295, fd1288, 0d3FEE6F0E134454FF; +sub.f64 fd1296, fd1295, fd1294; +fma.rn.f64 fd1297, fd1280, 0d3FD3C6EF372FE950, fd951; +mul.f64 fd1298, fd1282, 0d3FE9E3779B97F4A8; +sub.f64 fd1299, fd1297, fd1298; +sub.f64 fd1300, fd1151, fd1166; +mul.f64 fd1301, fd1300, 0d3FEE6F0E134454FF; +sub.f64 fd1302, fd1156, fd1161; +mul.f64 fd1303, fd1302, 0dBFE2CF2304755A5E; +sub.f64 fd1304, fd1303, fd1301; +mul.f64 fd1305, fd1280, 0d3FE9E3779B97F4A8; +sub.f64 fd1306, fd951, fd1305; +fma.rn.f64 fd1307, fd1282, 0d3FD3C6EF372FE950, fd1306; +mul.f64 fd1308, fd1300, 
0d3FE2CF2304755A5E; +mul.f64 fd1309, fd1302, 0d3FEE6F0E134454FF; +sub.f64 fd1310, fd1309, fd1308; +add.f64 fd1311, fd1171, fd1186; +add.f64 fd1312, fd934, fd1311; +add.f64 fd1313, fd1176, fd1181; +add.f64 fd1314, fd1173, fd1188; +add.f64 fd1315, fd952, fd1314; +add.f64 fd1316, fd1178, fd1183; +fma.rn.f64 fd1317, fd1311, 0d3FD3C6EF372FE950, fd934; +mul.f64 fd1318, fd1313, 0d3FE9E3779B97F4A8; +sub.f64 fd1319, fd1317, fd1318; +sub.f64 fd1320, fd1173, fd1188; +mul.f64 fd1321, fd1320, 0d3FEE6F0E134454FF; +sub.f64 fd1322, fd1178, fd1183; +mul.f64 fd1323, fd1322, 0dBFE2CF2304755A5E; +sub.f64 fd1324, fd1323, fd1321; +mul.f64 fd1325, fd1311, 0d3FE9E3779B97F4A8; +sub.f64 fd1326, fd934, fd1325; +fma.rn.f64 fd1327, fd1313, 0d3FD3C6EF372FE950, fd1326; +mul.f64 fd1328, fd1320, 0d3FE2CF2304755A5E; +mul.f64 fd1329, fd1322, 0d3FEE6F0E134454FF; +sub.f64 fd1330, fd1329, fd1328; +fma.rn.f64 fd1331, fd1314, 0d3FD3C6EF372FE950, fd952; +mul.f64 fd1332, fd1316, 0d3FE9E3779B97F4A8; +sub.f64 fd1333, fd1331, fd1332; +sub.f64 fd1334, fd1171, fd1186; +mul.f64 fd1335, fd1334, 0d3FEE6F0E134454FF; +sub.f64 fd1336, fd1176, fd1181; +mul.f64 fd1337, fd1336, 0dBFE2CF2304755A5E; +sub.f64 fd1338, fd1337, fd1335; +mul.f64 fd1339, fd1314, 0d3FE9E3779B97F4A8; +sub.f64 fd1340, fd952, fd1339; +fma.rn.f64 fd1341, fd1316, 0d3FD3C6EF372FE950, fd1340; +mul.f64 fd1342, fd1334, 0d3FE2CF2304755A5E; +mul.f64 fd1343, fd1336, 0d3FEE6F0E134454FF; +sub.f64 fd1344, fd1343, fd1342; +add.f64 fd1345, fd1191, fd1206; +add.f64 fd1346, fd926, fd1345; +add.f64 fd1347, fd1196, fd1201; +add.f64 fd1348, fd1193, fd1208; +add.f64 fd1349, fd944, fd1348; +add.f64 fd1350, fd1198, fd1203; +fma.rn.f64 fd1351, fd1345, 0d3FD3C6EF372FE950, fd926; +mul.f64 fd1352, fd1347, 0d3FE9E3779B97F4A8; +sub.f64 fd1353, fd1351, fd1352; +sub.f64 fd1354, fd1193, fd1208; +mul.f64 fd1355, fd1354, 0d3FEE6F0E134454FF; +sub.f64 fd1356, fd1198, fd1203; +mul.f64 fd1357, fd1356, 0dBFE2CF2304755A5E; +sub.f64 fd1358, fd1357, fd1355; +mul.f64 fd1359, fd1345, 
0d3FE9E3779B97F4A8; +sub.f64 fd1360, fd926, fd1359; +fma.rn.f64 fd1361, fd1347, 0d3FD3C6EF372FE950, fd1360; +mul.f64 fd1362, fd1354, 0d3FE2CF2304755A5E; +mul.f64 fd1363, fd1356, 0d3FEE6F0E134454FF; +sub.f64 fd1364, fd1363, fd1362; +fma.rn.f64 fd1365, fd1348, 0d3FD3C6EF372FE950, fd944; +mul.f64 fd1366, fd1350, 0d3FE9E3779B97F4A8; +sub.f64 fd1367, fd1365, fd1366; +sub.f64 fd1368, fd1191, fd1206; +mul.f64 fd1369, fd1368, 0d3FEE6F0E134454FF; +sub.f64 fd1370, fd1196, fd1201; +mul.f64 fd1371, fd1370, 0dBFE2CF2304755A5E; +sub.f64 fd1372, fd1371, fd1369; +mul.f64 fd1373, fd1348, 0d3FE9E3779B97F4A8; +sub.f64 fd1374, fd944, fd1373; +fma.rn.f64 fd1375, fd1350, 0d3FD3C6EF372FE950, fd1374; +mul.f64 fd1376, fd1368, 0d3FE2CF2304755A5E; +mul.f64 fd1377, fd1370, 0d3FEE6F0E134454FF; +sub.f64 fd1378, fd1377, fd1376; +add.f64 %0, fd1211, fd1210; +add.f64 %1, fd1214, fd1213; +add.f64 %3, fd1248, fd1247; +add.f64 %2, fd1245, fd1244; +add.f64 %5, fd1282, fd1281; +add.f64 %4, fd1279, fd1278; +add.f64 %7, fd1316, fd1315; +add.f64 %6, fd1313, fd1312; +add.f64 %9, fd1350, fd1349; +add.f64 %8, fd1347, fd1346; +add.f64 %11, fd1236, fd1231; +sub.f64 %10, fd1217, fd1222; +add.f64 %13, fd1270, fd1265; +sub.f64 %12, fd1251, fd1256; +add.f64 %15, fd1304, fd1299; +sub.f64 %14, fd1285, fd1290; +add.f64 %17, fd1338, fd1333; +sub.f64 %16, fd1319, fd1324; +add.f64 %19, fd1372, fd1367; +sub.f64 %18, fd1353, fd1358; +sub.f64 %20, fd1225, fd1228; +add.f64 %21, fd1242, fd1239; +add.f64 %23, fd1276, fd1273; +sub.f64 %22, fd1259, fd1262; +add.f64 %25, fd1310, fd1307; +sub.f64 %24, fd1293, fd1296; +add.f64 %27, fd1344, fd1341; +sub.f64 %26, fd1327, fd1330; +add.f64 %29, fd1378, fd1375; +sub.f64 %28, fd1361, fd1364; +add.f64 %30, fd1228, fd1225; +sub.f64 %31, fd1239, fd1242; +sub.f64 %33, fd1273, fd1276; +add.f64 %32, fd1262, fd1259; +sub.f64 %35, fd1307, fd1310; +add.f64 %34, fd1296, fd1293; +sub.f64 %37, fd1341, fd1344; +add.f64 %36, fd1330, fd1327; +sub.f64 %39, fd1375, fd1378; +add.f64 %38, fd1364, fd1361; 
+sub.f64 %41, fd1231, fd1236; +add.f64 %40, fd1222, fd1217; +sub.f64 %43, fd1265, fd1270; +add.f64 %42, fd1256, fd1251; +sub.f64 %45, fd1299, fd1304; +add.f64 %44, fd1290, fd1285; +sub.f64 %47, fd1333, fd1338; +add.f64 %46, fd1324, fd1319; +sub.f64 %49, fd1367, fd1372; +add.f64 %48, fd1358, fd1353; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_625), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), 
"d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<543, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1788>; +.reg .b64 rd<10>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 10000, r13; +add.f64 fd101, %62, %92; +add.f64 fd103, %72, %82; +add.f64 fd1787, %52, fd101; +add.f64 fd104, fd103, fd1787; +add.f64 fd105, %102, %104; +add.f64 fd107, %103, %83; +add.f64 fd1783, %53, fd105; +add.f64 fd108, fd107, fd1783; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1782, fd101, 0d3FD3C6EF372FE950, %52; +sub.f64 fd111, fd1782, fd110; +sub.f64 fd112, %102, %104; +sub.f64 fd114, %103, %83; +mul.f64 fd1780, fd112, 0d3FEE6F0E134454FF; +mul.f64 fd1781, fd114, 0dBFE2CF2304755A5E; +sub.f64 fd116, fd1781, fd1780; +sub.f64 fd117, fd111, fd116; +add.f64 fd118, fd116, fd111; +mul.f64 fd119, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd120, %52, fd119; +fma.rn.f64 fd121, fd103, 0d3FD3C6EF372FE950, fd120; +mul.f64 fd122, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd123, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd123, fd122; +sub.f64 fd125, fd121, fd124; +add.f64 fd126, fd124, fd121; +fma.rn.f64 fd1778, fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd1779, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd1778, fd1779; +sub.f64 fd130, %62, %92; +sub.f64 fd132, %72, %82; +mul.f64 fd1776, fd130, 0d3FEE6F0E134454FF; +mul.f64 fd1777, fd132, 0dBFE2CF2304755A5E; +sub.f64 fd134, fd1777, fd1776; +add.f64 fd135, fd134, fd129; +sub.f64 fd136, fd129, fd134; +mul.f64 fd137, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd138, %53, fd137; +fma.rn.f64 fd139, fd107, 0d3FD3C6EF372FE950, fd138; 
+mul.f64 fd140, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd141, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd142, fd141, fd140; +add.f64 fd143, fd142, fd139; +sub.f64 fd144, fd139, fd142; +add.f64 fd145, %64, %94; +add.f64 fd147, %74, %84; +add.f64 fd1775, %54, fd145; +add.f64 fd148, fd147, fd1775; +add.f64 fd149, %65, %95; +add.f64 fd151, %107, %105; +add.f64 fd1771, %106, fd149; +add.f64 fd152, fd151, fd1771; +fma.rn.f64 fd1769, fd145, 0d3FD3C6EF372FE950, %54; +mul.f64 fd1770, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd155, fd1769, fd1770; +sub.f64 fd156, %65, %95; +sub.f64 fd158, %107, %105; +mul.f64 fd1767, fd156, 0d3FEE6F0E134454FF; +mul.f64 fd1768, fd158, 0dBFE2CF2304755A5E; +sub.f64 fd160, fd1768, fd1767; +sub.f64 fd161, fd155, fd160; +add.f64 fd162, fd160, fd155; +mul.f64 fd163, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd164, %54, fd163; +fma.rn.f64 fd165, fd147, 0d3FD3C6EF372FE950, fd164; +mul.f64 fd166, fd156, 0d3FE2CF2304755A5E; +mul.f64 fd167, fd158, 0d3FEE6F0E134454FF; +sub.f64 fd168, fd167, fd166; +sub.f64 fd169, fd165, fd168; +add.f64 fd170, fd168, fd165; +mul.f64 fd172, fd151, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1766, fd149, 0d3FD3C6EF372FE950, %106; +sub.f64 fd173, fd1766, fd172; +sub.f64 fd174, %64, %94; +sub.f64 fd176, %74, %84; +mul.f64 fd177, fd176, 0dBFE2CF2304755A5E; +mul.f64 fd1765, fd174, 0d3FEE6F0E134454FF; +sub.f64 fd178, fd177, fd1765; +add.f64 fd179, fd178, fd173; +sub.f64 fd180, fd173, fd178; +mul.f64 fd181, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd182, %106, fd181; +fma.rn.f64 fd183, fd151, 0d3FD3C6EF372FE950, fd182; +mul.f64 fd184, fd174, 0d3FE2CF2304755A5E; +mul.f64 fd185, fd176, 0d3FEE6F0E134454FF; +sub.f64 fd186, fd185, fd184; +add.f64 fd187, fd186, fd183; +sub.f64 fd188, fd183, fd186; +add.f64 fd189, %66, %96; +add.f64 fd191, %76, %86; +add.f64 fd1764, %56, fd189; +add.f64 fd192, fd191, fd1764; +add.f64 fd193, %110, %109; +add.f64 fd195, %77, %111; +add.f64 fd1759, %108, fd193; +add.f64 fd196, fd195, fd1759; +mul.f64 fd198, fd191, 0d3FE9E3779B97F4A8; 
+fma.rn.f64 fd1758, fd189, 0d3FD3C6EF372FE950, %56; +sub.f64 fd199, fd1758, fd198; +sub.f64 fd200, %110, %109; +sub.f64 fd202, %77, %111; +mul.f64 fd203, fd202, 0dBFE2CF2304755A5E; +mul.f64 fd1757, fd200, 0d3FEE6F0E134454FF; +sub.f64 fd204, fd203, fd1757; +sub.f64 fd205, fd199, fd204; +add.f64 fd206, fd204, fd199; +mul.f64 fd207, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd208, %56, fd207; +fma.rn.f64 fd209, fd191, 0d3FD3C6EF372FE950, fd208; +mul.f64 fd210, fd200, 0d3FE2CF2304755A5E; +mul.f64 fd211, fd202, 0d3FEE6F0E134454FF; +sub.f64 fd212, fd211, fd210; +sub.f64 fd213, fd209, fd212; +add.f64 fd214, fd212, fd209; +fma.rn.f64 fd1755, fd193, 0d3FD3C6EF372FE950, %108; +mul.f64 fd1756, fd195, 0d3FE9E3779B97F4A8; +sub.f64 fd217, fd1755, fd1756; +sub.f64 fd218, %66, %96; +sub.f64 fd220, %76, %86; +mul.f64 fd1753, fd218, 0d3FEE6F0E134454FF; +mul.f64 fd1754, fd220, 0dBFE2CF2304755A5E; +sub.f64 fd222, fd1754, fd1753; +add.f64 fd223, fd222, fd217; +sub.f64 fd224, fd217, fd222; +mul.f64 fd225, fd193, 0d3FE9E3779B97F4A8; +sub.f64 fd226, %108, fd225; +fma.rn.f64 fd227, fd195, 0d3FD3C6EF372FE950, fd226; +mul.f64 fd228, fd218, 0d3FE2CF2304755A5E; +mul.f64 fd229, fd220, 0d3FEE6F0E134454FF; +sub.f64 fd230, fd229, fd228; +add.f64 fd231, fd230, fd227; +sub.f64 fd232, fd227, fd230; +add.f64 fd233, %68, %98; +add.f64 fd235, %78, %88; +add.f64 fd1752, %58, fd233; +add.f64 fd236, fd235, fd1752; +add.f64 fd237, %113, %112; +add.f64 fd239, %114, %89; +add.f64 fd1748, %59, fd237; +add.f64 fd240, fd239, fd1748; +fma.rn.f64 fd1746, fd233, 0d3FD3C6EF372FE950, %58; +mul.f64 fd1747, fd235, 0d3FE9E3779B97F4A8; +sub.f64 fd243, fd1746, fd1747; +sub.f64 fd244, %113, %112; +sub.f64 fd246, %114, %89; +mul.f64 fd1744, fd244, 0d3FEE6F0E134454FF; +mul.f64 fd1745, fd246, 0dBFE2CF2304755A5E; +sub.f64 fd248, fd1745, fd1744; +sub.f64 fd249, fd243, fd248; +add.f64 fd250, fd248, fd243; +mul.f64 fd251, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd252, %58, fd251; +fma.rn.f64 fd253, fd235, 0d3FD3C6EF372FE950, fd252; 
+mul.f64 fd254, fd244, 0d3FE2CF2304755A5E; +mul.f64 fd255, fd246, 0d3FEE6F0E134454FF; +sub.f64 fd256, fd255, fd254; +sub.f64 fd257, fd253, fd256; +add.f64 fd258, fd256, fd253; +mul.f64 fd260, fd239, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1743, fd237, 0d3FD3C6EF372FE950, %59; +sub.f64 fd261, fd1743, fd260; +sub.f64 fd262, %68, %98; +sub.f64 fd264, %78, %88; +mul.f64 fd1741, fd262, 0d3FEE6F0E134454FF; +mul.f64 fd1742, fd264, 0dBFE2CF2304755A5E; +sub.f64 fd266, fd1742, fd1741; +add.f64 fd267, fd266, fd261; +sub.f64 fd268, fd261, fd266; +mul.f64 fd269, fd237, 0d3FE9E3779B97F4A8; +sub.f64 fd270, %59, fd269; +fma.rn.f64 fd271, fd239, 0d3FD3C6EF372FE950, fd270; +mul.f64 fd272, fd262, 0d3FE2CF2304755A5E; +mul.f64 fd273, fd264, 0d3FEE6F0E134454FF; +sub.f64 fd274, fd273, fd272; +add.f64 fd275, fd274, fd271; +sub.f64 fd276, fd271, fd274; +add.f64 fd277, %70, %100; +add.f64 fd279, %80, %90; +add.f64 fd1740, %60, fd277; +add.f64 fd280, fd279, fd1740; +add.f64 fd281, %71, %101; +add.f64 fd283, %117, %115; +add.f64 fd1736, %116, fd281; +add.f64 fd284, fd283, fd1736; +mul.f64 fd286, fd279, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1735, fd277, 0d3FD3C6EF372FE950, %60; +sub.f64 fd287, fd1735, fd286; +sub.f64 fd288, %71, %101; +sub.f64 fd290, %117, %115; +mul.f64 fd1733, fd288, 0d3FEE6F0E134454FF; +mul.f64 fd1734, fd290, 0dBFE2CF2304755A5E; +sub.f64 fd292, fd1734, fd1733; +sub.f64 fd293, fd287, fd292; +add.f64 fd294, fd292, fd287; +mul.f64 fd295, fd277, 0d3FE9E3779B97F4A8; +sub.f64 fd296, %60, fd295; +fma.rn.f64 fd297, fd279, 0d3FD3C6EF372FE950, fd296; +mul.f64 fd298, fd288, 0d3FE2CF2304755A5E; +mul.f64 fd299, fd290, 0d3FEE6F0E134454FF; +sub.f64 fd300, fd299, fd298; +sub.f64 fd301, fd297, fd300; +add.f64 fd302, fd300, fd297; +mul.f64 fd304, fd283, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1732, fd281, 0d3FD3C6EF372FE950, %116; +sub.f64 fd305, fd1732, fd304; +sub.f64 fd306, %70, %100; +sub.f64 fd308, %80, %90; +mul.f64 fd1730, fd306, 0d3FEE6F0E134454FF; +mul.f64 fd1731, fd308, 0dBFE2CF2304755A5E; 
+sub.f64 fd310, fd1731, fd1730; +add.f64 fd311, fd310, fd305; +sub.f64 fd312, fd305, fd310; +mul.f64 fd313, fd281, 0d3FE9E3779B97F4A8; +sub.f64 fd314, %116, fd313; +fma.rn.f64 fd315, fd283, 0d3FD3C6EF372FE950, fd314; +mul.f64 fd316, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd317, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd318, fd317, fd316; +add.f64 fd319, fd318, fd315; +sub.f64 fd320, fd315, fd318; +mul.f64 fd322, fd179, 0dBFCFD511FA1C0796; +mul.f64 fd1729, fd161, 0d3FEEFEA21D101EE0; +sub.f64 fd323, fd1729, fd322; +mul.f64 fd324, fd179, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd325, fd161, 0dBFCFD511FA1C0796, fd324; +mul.f64 fd327, fd223, 0dBFDED50D5CBFA951; +mul.f64 fd1728, fd205, 0d3FEC0AB44E81C059; +sub.f64 fd328, fd1728, fd327; +mul.f64 fd329, fd223, 0d3FEC0AB44E81C059; +fma.rn.f64 fd330, fd205, 0dBFDED50D5CBFA951, fd329; +mul.f64 fd332, fd267, 0dBFE5E7CF55112014; +mul.f64 fd1727, fd249, 0d3FE753B603D2B816; +sub.f64 fd333, fd1727, fd332; +mul.f64 fd334, fd267, 0d3FE753B603D2B816; +fma.rn.f64 fd335, fd249, 0dBFE5E7CF55112014, fd334; +mul.f64 fd337, fd311, 0dBFEB04BBFF642E86; +mul.f64 fd1726, fd293, 0d3FE1257E3C182B51; +sub.f64 fd338, fd1726, fd337; +mul.f64 fd339, fd311, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd293, 0dBFEB04BBFF642E86, fd339; +mul.f64 fd342, fd187, 0dBFDED50D5CBFA951; +mul.f64 fd1725, fd169, 0d3FEC0AB44E81C059; +sub.f64 fd343, fd1725, fd342; +mul.f64 fd344, fd187, 0d3FEC0AB44E81C059; +fma.rn.f64 fd345, fd169, 0dBFDED50D5CBFA951, fd344; +mul.f64 fd1723, fd213, 0d3FE1257E3C182B51; +mul.f64 fd1724, fd231, 0dBFEB04BBFF642E86; +sub.f64 fd348, fd1723, fd1724; +mul.f64 fd349, fd231, 0d3FE1257E3C182B51; +fma.rn.f64 fd350, fd213, 0dBFEB04BBFF642E86, fd349; +mul.f64 fd1721, fd257, 0d3FB0130A1BE09379; +mul.f64 fd1722, fd275, 0dBFEFEFD5BFE443FE; +sub.f64 fd353, fd1721, fd1722; +mul.f64 fd354, fd275, 0d3FB0130A1BE09379; +fma.rn.f64 fd355, fd257, 0dBFEFEFD5BFE443FE, fd354; +mul.f64 fd1719, fd301, 0dBFDB3FF7C925819C; +mul.f64 fd1720, fd319, 0dBFECF457DCDC158C; +sub.f64 fd358, 
fd1719, fd1720; +mul.f64 fd359, fd319, 0dBFDB3FF7C925819C; +fma.rn.f64 fd360, fd301, 0dBFECF457DCDC158C, fd359; +mul.f64 fd1717, fd170, 0d3FE753B603D2B816; +mul.f64 fd1718, fd188, 0dBFE5E7CF55112014; +sub.f64 fd363, fd1717, fd1718; +mul.f64 fd364, fd188, 0d3FE753B603D2B816; +fma.rn.f64 fd365, fd170, 0dBFE5E7CF55112014, fd364; +mul.f64 fd367, fd232, 0dBFEFEFD5BFE443FE; +mul.f64 fd1716, fd214, 0d3FB0130A1BE09379; +sub.f64 fd368, fd1716, fd367; +mul.f64 fd369, fd232, 0d3FB0130A1BE09379; +fma.rn.f64 fd370, fd214, 0dBFEFEFD5BFE443FE, fd369; +mul.f64 fd372, fd276, 0dBFE8A80B635B6BEA; +mul.f64 fd1715, fd258, 0dBFE465C6FEB501BC; +sub.f64 fd373, fd1715, fd372; +mul.f64 fd374, fd276, 0dBFE465C6FEB501BC; +fma.rn.f64 fd375, fd258, 0dBFE8A80B635B6BEA, fd374; +mul.f64 fd377, fd320, 0dBFC00AEB5DA15BE0; +mul.f64 fd1714, fd302, 0dBFEFBF675480D903; +sub.f64 fd378, fd1714, fd377; +mul.f64 fd379, fd320, 0dBFEFBF675480D903; +fma.rn.f64 fd380, fd302, 0dBFC00AEB5DA15BE0, fd379; +mul.f64 fd382, fd180, 0dBFEB04BBFF642E86; +mul.f64 fd1713, fd162, 0d3FE1257E3C182B51; +sub.f64 fd383, fd1713, fd382; +mul.f64 fd384, fd180, 0d3FE1257E3C182B51; +fma.rn.f64 fd385, fd162, 0dBFEB04BBFF642E86, fd384; +mul.f64 fd387, fd224, 0dBFECF457DCDC158C; +mul.f64 fd1712, fd206, 0dBFDB3FF7C925819C; +sub.f64 fd388, fd1712, fd387; +mul.f64 fd389, fd224, 0dBFDB3FF7C925819C; +fma.rn.f64 fd390, fd206, 0dBFECF457DCDC158C, fd389; +mul.f64 fd1710, fd250, 0dBFEFBF675480D903; +mul.f64 fd1711, fd268, 0dBFC00AEB5DA15BE0; +sub.f64 fd393, fd1710, fd1711; +mul.f64 fd394, fd268, 0dBFEFBF675480D903; +fma.rn.f64 fd395, fd250, 0dBFC00AEB5DA15BE0, fd394; +mul.f64 fd1708, fd294, 0dBFE465C6FEB501BC; +mul.f64 fd1709, fd312, 0d3FE8A80B635B6BEA; +sub.f64 fd398, fd1708, fd1709; +mul.f64 fd399, fd312, 0dBFE465C6FEB501BC; +fma.rn.f64 fd400, fd294, 0d3FE8A80B635B6BEA, fd399; +add.f64 fd401, fd148, fd280; +add.f64 fd403, fd192, fd236; +mul.f64 fd408, fd403, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1707, fd401, 0d3FD3C6EF372FE950, fd104; +sub.f64 
fd409, fd1707, fd408; +add.f64 fd1706, fd152, fd284; +sub.f64 fd410, fd152, fd284; +add.f64 fd1705, fd196, fd240; +sub.f64 fd412, fd196, fd240; +mul.f64 fd413, fd412, 0dBFE2CF2304755A5E; +mul.f64 fd1704, fd410, 0d3FEE6F0E134454FF; +sub.f64 fd414, fd413, fd1704; +sub.f64 fd415, fd409, fd414; +add.f64 fd416, fd414, fd409; +add.f64 fd1703, fd104, fd401; +mul.f64 fd417, fd401, 0d3FE9E3779B97F4A8; +sub.f64 fd418, fd104, fd417; +fma.rn.f64 fd419, fd403, 0d3FD3C6EF372FE950, fd418; +mul.f64 fd420, fd410, 0d3FE2CF2304755A5E; +mul.f64 fd421, fd412, 0d3FEE6F0E134454FF; +sub.f64 fd422, fd421, fd420; +sub.f64 fd423, fd419, fd422; +add.f64 fd424, fd422, fd419; +fma.rn.f64 fd1701, fd1706, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd1702, fd1705, 0d3FE9E3779B97F4A8; +sub.f64 fd427, fd1701, fd1702; +sub.f64 fd428, fd148, fd280; +sub.f64 fd430, fd192, fd236; +mul.f64 fd1699, fd428, 0d3FEE6F0E134454FF; +mul.f64 fd1700, fd430, 0dBFE2CF2304755A5E; +sub.f64 fd432, fd1700, fd1699; +add.f64 fd433, fd432, fd427; +sub.f64 fd434, fd427, fd432; +add.f64 fd1698, fd108, fd1706; +mul.f64 fd435, fd1706, 0d3FE9E3779B97F4A8; +sub.f64 fd436, fd108, fd435; +fma.rn.f64 fd437, fd1705, 0d3FD3C6EF372FE950, fd436; +mul.f64 fd438, fd428, 0d3FE2CF2304755A5E; +mul.f64 fd439, fd430, 0d3FEE6F0E134454FF; +sub.f64 fd440, fd439, fd438; +add.f64 fd441, fd440, fd437; +sub.f64 fd442, fd437, fd440; +add.f64 fd443, fd323, fd338; +add.f64 fd445, fd328, fd333; +add.f64 fd1697, fd117, fd443; +add.f64 fd446, fd445, fd1697; +add.f64 fd447, fd325, fd340; +add.f64 fd449, fd330, fd335; +add.f64 fd1696, fd135, fd447; +add.f64 fd450, fd449, fd1696; +fma.rn.f64 fd1694, fd443, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd1695, fd445, 0d3FE9E3779B97F4A8; +sub.f64 fd453, fd1694, fd1695; +sub.f64 fd454, fd325, fd340; +sub.f64 fd456, fd330, fd335; +mul.f64 fd1692, fd454, 0d3FEE6F0E134454FF; +mul.f64 fd1693, fd456, 0dBFE2CF2304755A5E; +sub.f64 fd458, fd1693, fd1692; +sub.f64 fd459, fd453, fd458; +add.f64 fd460, fd458, fd453; +mul.f64 fd461, fd443, 
0d3FE9E3779B97F4A8; +sub.f64 fd462, fd117, fd461; +fma.rn.f64 fd463, fd445, 0d3FD3C6EF372FE950, fd462; +mul.f64 fd464, fd454, 0d3FE2CF2304755A5E; +mul.f64 fd465, fd456, 0d3FEE6F0E134454FF; +sub.f64 fd466, fd465, fd464; +sub.f64 fd467, fd463, fd466; +add.f64 fd468, fd466, fd463; +mul.f64 fd470, fd449, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1691, fd447, 0d3FD3C6EF372FE950, fd135; +sub.f64 fd471, fd1691, fd470; +sub.f64 fd472, fd323, fd338; +sub.f64 fd474, fd328, fd333; +mul.f64 fd1689, fd472, 0d3FEE6F0E134454FF; +mul.f64 fd1690, fd474, 0dBFE2CF2304755A5E; +sub.f64 fd476, fd1690, fd1689; +add.f64 fd477, fd476, fd471; +sub.f64 fd478, fd471, fd476; +mul.f64 fd479, fd447, 0d3FE9E3779B97F4A8; +sub.f64 fd480, fd135, fd479; +fma.rn.f64 fd481, fd449, 0d3FD3C6EF372FE950, fd480; +mul.f64 fd482, fd472, 0d3FE2CF2304755A5E; +mul.f64 fd483, fd474, 0d3FEE6F0E134454FF; +sub.f64 fd484, fd483, fd482; +add.f64 fd485, fd484, fd481; +sub.f64 fd486, fd481, fd484; +add.f64 fd487, fd343, fd358; +add.f64 fd489, fd348, fd353; +add.f64 fd1688, fd125, fd487; +add.f64 fd490, fd489, fd1688; +add.f64 fd491, fd345, fd360; +add.f64 fd493, fd350, fd355; +add.f64 fd1687, fd143, fd491; +add.f64 fd494, fd493, fd1687; +mul.f64 fd496, fd489, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1686, fd487, 0d3FD3C6EF372FE950, fd125; +sub.f64 fd497, fd1686, fd496; +sub.f64 fd498, fd345, fd360; +sub.f64 fd500, fd350, fd355; +mul.f64 fd1684, fd498, 0d3FEE6F0E134454FF; +mul.f64 fd1685, fd500, 0dBFE2CF2304755A5E; +sub.f64 fd502, fd1685, fd1684; +sub.f64 fd503, fd497, fd502; +add.f64 fd504, fd502, fd497; +mul.f64 fd505, fd487, 0d3FE9E3779B97F4A8; +sub.f64 fd506, fd125, fd505; +fma.rn.f64 fd507, fd489, 0d3FD3C6EF372FE950, fd506; +mul.f64 fd508, fd498, 0d3FE2CF2304755A5E; +mul.f64 fd509, fd500, 0d3FEE6F0E134454FF; +sub.f64 fd510, fd509, fd508; +sub.f64 fd511, fd507, fd510; +add.f64 fd512, fd510, fd507; +mul.f64 fd514, fd493, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1683, fd491, 0d3FD3C6EF372FE950, fd143; +sub.f64 fd515, fd1683, fd514; +sub.f64 
fd516, fd343, fd358; +sub.f64 fd518, fd348, fd353; +mul.f64 fd1681, fd516, 0d3FEE6F0E134454FF; +mul.f64 fd1682, fd518, 0dBFE2CF2304755A5E; +sub.f64 fd520, fd1682, fd1681; +add.f64 fd521, fd520, fd515; +sub.f64 fd522, fd515, fd520; +mul.f64 fd523, fd491, 0d3FE9E3779B97F4A8; +sub.f64 fd524, fd143, fd523; +fma.rn.f64 fd525, fd493, 0d3FD3C6EF372FE950, fd524; +mul.f64 fd526, fd516, 0d3FE2CF2304755A5E; +mul.f64 fd527, fd518, 0d3FEE6F0E134454FF; +sub.f64 fd528, fd527, fd526; +add.f64 fd529, fd528, fd525; +sub.f64 fd530, fd525, fd528; +add.f64 fd531, fd363, fd378; +add.f64 fd533, fd368, fd373; +add.f64 fd1680, fd126, fd531; +add.f64 fd534, fd533, fd1680; +add.f64 fd535, fd365, fd380; +add.f64 fd537, fd370, fd375; +add.f64 fd1679, fd144, fd535; +add.f64 fd538, fd537, fd1679; +mul.f64 fd540, fd533, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1678, fd531, 0d3FD3C6EF372FE950, fd126; +sub.f64 fd541, fd1678, fd540; +sub.f64 fd542, fd365, fd380; +sub.f64 fd544, fd370, fd375; +mul.f64 fd1676, fd542, 0d3FEE6F0E134454FF; +mul.f64 fd1677, fd544, 0dBFE2CF2304755A5E; +sub.f64 fd546, fd1677, fd1676; +sub.f64 fd547, fd541, fd546; +add.f64 fd548, fd546, fd541; +mul.f64 fd549, fd531, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd126, fd549; +fma.rn.f64 fd551, fd533, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd542, 0d3FE2CF2304755A5E; +mul.f64 fd553, fd544, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd553, fd552; +sub.f64 fd555, fd551, fd554; +add.f64 fd556, fd554, fd551; +fma.rn.f64 fd1674, fd535, 0d3FD3C6EF372FE950, fd144; +mul.f64 fd1675, fd537, 0d3FE9E3779B97F4A8; +sub.f64 fd559, fd1674, fd1675; +sub.f64 fd560, fd363, fd378; +sub.f64 fd562, fd368, fd373; +mul.f64 fd1672, fd560, 0d3FEE6F0E134454FF; +mul.f64 fd1673, fd562, 0dBFE2CF2304755A5E; +sub.f64 fd564, fd1673, fd1672; +add.f64 fd565, fd564, fd559; +sub.f64 fd566, fd559, fd564; +mul.f64 fd567, fd535, 0d3FE9E3779B97F4A8; +sub.f64 fd568, fd144, fd567; +fma.rn.f64 fd569, fd537, 0d3FD3C6EF372FE950, fd568; +mul.f64 fd570, fd560, 0d3FE2CF2304755A5E; +mul.f64 
fd571, fd562, 0d3FEE6F0E134454FF; +sub.f64 fd572, fd571, fd570; +add.f64 fd573, fd572, fd569; +sub.f64 fd574, fd569, fd572; +add.f64 fd575, fd383, fd398; +add.f64 fd577, fd388, fd393; +add.f64 fd1671, fd118, fd575; +add.f64 fd578, fd577, fd1671; +add.f64 fd579, fd385, fd400; +add.f64 fd581, fd390, fd395; +add.f64 fd1670, fd136, fd579; +add.f64 fd582, fd581, fd1670; +fma.rn.f64 fd1668, fd575, 0d3FD3C6EF372FE950, fd118; +mul.f64 fd1669, fd577, 0d3FE9E3779B97F4A8; +sub.f64 fd585, fd1668, fd1669; +sub.f64 fd586, fd385, fd400; +sub.f64 fd588, fd390, fd395; +mul.f64 fd1666, fd586, 0d3FEE6F0E134454FF; +mul.f64 fd1667, fd588, 0dBFE2CF2304755A5E; +sub.f64 fd590, fd1667, fd1666; +sub.f64 fd591, fd585, fd590; +add.f64 fd592, fd590, fd585; +mul.f64 fd593, fd575, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd118, fd593; +fma.rn.f64 fd595, fd577, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd586, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd588, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd597, fd596; +sub.f64 fd599, fd595, fd598; +add.f64 fd600, fd598, fd595; +mul.f64 fd602, fd581, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1665, fd579, 0d3FD3C6EF372FE950, fd136; +sub.f64 fd603, fd1665, fd602; +sub.f64 fd604, fd383, fd398; +sub.f64 fd606, fd388, fd393; +mul.f64 fd1663, fd604, 0d3FEE6F0E134454FF; +mul.f64 fd1664, fd606, 0dBFE2CF2304755A5E; +sub.f64 fd608, fd1664, fd1663; +add.f64 fd609, fd608, fd603; +sub.f64 fd610, fd603, fd608; +mul.f64 fd611, fd579, 0d3FE9E3779B97F4A8; +sub.f64 fd612, fd136, fd611; +fma.rn.f64 fd613, fd581, 0d3FD3C6EF372FE950, fd612; +mul.f64 fd614, fd604, 0d3FE2CF2304755A5E; +mul.f64 fd615, fd606, 0d3FEE6F0E134454FF; +sub.f64 fd616, fd615, fd614; +add.f64 fd617, fd616, fd613; +sub.f64 fd618, fd613, fd616; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 10000, r3; +mov.u64 rd5, %51; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd619, fd620}, 
[rd6]; +mul.f64 fd624, fd620, fd450; +mul.f64 fd625, fd619, fd450; +mul.f64 fd627, fd620, fd620; +mul.f64 fd1662, fd619, fd619; +sub.f64 fd628, fd1662, fd627; +mul.f64 fd629, fd620, fd619; +fma.rn.f64 fd630, fd620, fd619, fd629; +mul.f64 fd632, fd630, fd494; +mul.f64 fd633, fd628, fd494; +mul.f64 fd1660, fd619, fd628; +mul.f64 fd1661, fd620, fd630; +sub.f64 fd636, fd1660, fd1661; +mul.f64 fd1659, fd628, fd490; +mul.f64 fd637, fd619, fd630; +fma.rn.f64 fd638, fd620, fd628, fd637; +mul.f64 fd640, fd638, fd538; +mul.f64 fd641, fd636, fd538; +mul.f64 fd643, fd620, fd638; +mul.f64 fd1658, fd619, fd636; +sub.f64 fd644, fd1658, fd643; +mul.f64 fd1657, fd636, fd534; +mul.f64 fd645, fd619, fd638; +fma.rn.f64 fd646, fd620, fd636, fd645; +mul.f64 fd648, fd646, fd582; +mul.f64 fd649, fd644, fd582; +mul.f64 fd1655, fd619, fd644; +mul.f64 fd1656, fd620, fd646; +sub.f64 fd652, fd1655, fd1656; +mul.f64 fd1654, fd644, fd578; +mul.f64 fd653, fd619, fd646; +fma.rn.f64 fd654, fd620, fd644, fd653; +mul.f64 fd656, fd654, fd433; +mul.f64 fd657, fd652, fd433; +mul.f64 fd659, fd620, fd654; +mul.f64 fd1653, fd619, fd652; +sub.f64 fd660, fd1653, fd659; +mul.f64 fd1652, fd652, fd415; +mul.f64 fd661, fd619, fd654; +fma.rn.f64 fd662, fd620, fd652, fd661; +mul.f64 fd664, fd662, fd477; +mul.f64 fd665, fd660, fd477; +mul.f64 fd667, fd620, fd662; +mul.f64 fd1651, fd619, fd660; +sub.f64 fd668, fd1651, fd667; +mul.f64 fd1650, fd660, fd459; +mul.f64 fd669, fd619, fd662; +fma.rn.f64 fd670, fd620, fd660, fd669; +mul.f64 fd672, fd670, fd521; +mul.f64 fd673, fd668, fd521; +mul.f64 fd1648, fd619, fd668; +mul.f64 fd1649, fd620, fd670; +sub.f64 fd676, fd1648, fd1649; +mul.f64 fd1647, fd668, fd503; +mul.f64 fd677, fd619, fd670; +fma.rn.f64 fd678, fd620, fd668, fd677; +mul.f64 fd680, fd678, fd565; +mul.f64 fd681, fd676, fd565; +mul.f64 fd683, fd620, fd678; +mul.f64 fd1646, fd619, fd676; +sub.f64 fd684, fd1646, fd683; +mul.f64 fd1645, fd676, fd547; +mul.f64 fd685, fd619, fd678; +fma.rn.f64 fd686, fd620, fd676, 
fd685; +mul.f64 fd688, fd686, fd609; +mul.f64 fd689, fd684, fd609; +mul.f64 fd691, fd620, fd686; +mul.f64 fd1644, fd619, fd684; +sub.f64 fd692, fd1644, fd691; +mul.f64 fd1643, fd684, fd591; +mul.f64 fd693, fd619, fd686; +fma.rn.f64 fd694, fd620, fd684, fd693; +mul.f64 fd696, fd694, fd441; +mul.f64 fd697, fd692, fd441; +mul.f64 fd1641, fd619, fd692; +mul.f64 fd1642, fd620, fd694; +sub.f64 fd700, fd1641, fd1642; +mul.f64 fd1640, fd692, fd423; +mul.f64 fd701, fd619, fd694; +fma.rn.f64 fd702, fd620, fd692, fd701; +mul.f64 fd704, fd702, fd485; +mul.f64 fd705, fd700, fd485; +mul.f64 fd707, fd620, fd702; +mul.f64 fd1639, fd619, fd700; +sub.f64 fd708, fd1639, fd707; +mul.f64 fd1638, fd700, fd467; +mul.f64 fd709, fd619, fd702; +fma.rn.f64 fd710, fd620, fd700, fd709; +mul.f64 fd711, fd708, fd511; +mul.f64 fd712, fd710, fd529; +mul.f64 fd713, fd708, fd529; +ld.global.v2.f64 {fd714, fd715}, [rd6+400]; +mul.f64 fd719, fd715, fd573; +mul.f64 fd720, fd714, fd573; +mul.f64 fd1636, fd619, fd714; +mul.f64 fd1637, fd620, fd715; +sub.f64 fd723, fd1636, fd1637; +mul.f64 fd1635, fd714, fd555; +mul.f64 fd724, fd619, fd715; +fma.rn.f64 fd725, fd620, fd714, fd724; +mul.f64 fd727, fd725, fd617; +mul.f64 fd728, fd723, fd617; +mul.f64 fd730, fd620, fd725; +mul.f64 fd1634, fd619, fd723; +sub.f64 fd731, fd1634, fd730; +mul.f64 fd1633, fd723, fd599; +mul.f64 fd732, fd619, fd725; +fma.rn.f64 fd733, fd620, fd723, fd732; +mul.f64 fd735, fd733, fd442; +mul.f64 fd736, fd731, fd442; +mul.f64 fd738, fd620, fd733; +mul.f64 fd1632, fd619, fd731; +sub.f64 fd739, fd1632, fd738; +mul.f64 fd1631, fd731, fd424; +mul.f64 fd740, fd619, fd733; +fma.rn.f64 fd741, fd620, fd731, fd740; +mul.f64 fd743, fd741, fd486; +mul.f64 fd744, fd739, fd486; +mul.f64 fd1629, fd619, fd739; +mul.f64 fd1630, fd620, fd741; +sub.f64 fd747, fd1629, fd1630; +mul.f64 fd1628, fd739, fd468; +mul.f64 fd748, fd619, fd741; +fma.rn.f64 fd749, fd620, fd739, fd748; +mul.f64 fd751, fd749, fd530; +mul.f64 fd752, fd747, fd530; +mul.f64 fd754, 
fd620, fd749; +mul.f64 fd1627, fd619, fd747; +sub.f64 fd755, fd1627, fd754; +mul.f64 fd1626, fd747, fd512; +mul.f64 fd756, fd619, fd749; +fma.rn.f64 fd757, fd620, fd747, fd756; +mul.f64 fd759, fd757, fd574; +mul.f64 fd760, fd755, fd574; +mul.f64 fd762, fd620, fd757; +mul.f64 fd1625, fd619, fd755; +sub.f64 fd763, fd1625, fd762; +mul.f64 fd1624, fd755, fd556; +mul.f64 fd764, fd619, fd757; +fma.rn.f64 fd765, fd620, fd755, fd764; +mul.f64 fd767, fd765, fd618; +mul.f64 fd768, fd763, fd618; +mul.f64 fd1622, fd619, fd763; +mul.f64 fd1623, fd620, fd765; +sub.f64 fd771, fd1622, fd1623; +mul.f64 fd1621, fd763, fd600; +mul.f64 fd772, fd619, fd765; +fma.rn.f64 fd773, fd620, fd763, fd772; +mul.f64 fd775, fd773, fd434; +mul.f64 fd776, fd771, fd434; +mul.f64 fd778, fd620, fd773; +mul.f64 fd1620, fd619, fd771; +sub.f64 fd779, fd1620, fd778; +mul.f64 fd1619, fd771, fd416; +mul.f64 fd780, fd619, fd773; +fma.rn.f64 fd781, fd620, fd771, fd780; +mul.f64 fd783, fd781, fd478; +mul.f64 fd784, fd779, fd478; +mul.f64 fd1617, fd619, fd779; +mul.f64 fd1618, fd620, fd781; +sub.f64 fd787, fd1617, fd1618; +mul.f64 fd1616, fd779, fd460; +mul.f64 fd788, fd619, fd781; +fma.rn.f64 fd789, fd620, fd779, fd788; +mul.f64 fd791, fd789, fd522; +mul.f64 fd792, fd787, fd522; +mul.f64 fd794, fd620, fd789; +mul.f64 fd1615, fd619, fd787; +sub.f64 fd795, fd1615, fd794; +mul.f64 fd1614, fd787, fd504; +mul.f64 fd796, fd619, fd789; +fma.rn.f64 fd797, fd620, fd787, fd796; +mul.f64 fd799, fd797, fd566; +mul.f64 fd800, fd795, fd566; +mul.f64 fd802, fd620, fd797; +mul.f64 fd1613, fd619, fd795; +sub.f64 fd803, fd1613, fd802; +mul.f64 fd1612, fd619, fd446; +mul.f64 fd804, fd619, fd797; +mul.f64 fd1611, fd795, fd548; +fma.rn.f64 fd805, fd620, fd795, fd804; +mul.f64 fd806, fd803, fd592; +mul.f64 fd807, fd805, fd610; +mul.f64 fd808, fd803, fd610; +barrier.sync 0; +mad.lo.s32 r9, r7, 400, r8; +add.f64 fd809, fd1705, fd1698; +add.f64 fd810, fd403, fd1703; +st.shared.v2.f64 [r9], {fd810, fd809}; +fma.rn.f64 fd811, fd620, 
fd446, fd625; +sub.f64 fd812, fd1612, fd624; +st.shared.v2.f64 [r9+16], {fd812, fd811}; +fma.rn.f64 fd813, fd630, fd490, fd633; +sub.f64 fd814, fd1659, fd632; +st.shared.v2.f64 [r9+32], {fd814, fd813}; +fma.rn.f64 fd815, fd638, fd534, fd641; +sub.f64 fd816, fd1657, fd640; +st.shared.v2.f64 [r9+48], {fd816, fd815}; +fma.rn.f64 fd817, fd646, fd578, fd649; +sub.f64 fd818, fd1654, fd648; +st.shared.v2.f64 [r9+64], {fd818, fd817}; +sub.f64 fd819, fd1652, fd656; +fma.rn.f64 fd820, fd654, fd415, fd657; +st.shared.v2.f64 [r9+80], {fd819, fd820}; +fma.rn.f64 fd821, fd662, fd459, fd665; +sub.f64 fd822, fd1650, fd664; +st.shared.v2.f64 [r9+96], {fd822, fd821}; +sub.f64 fd823, fd1647, fd672; +fma.rn.f64 fd824, fd670, fd503, fd673; +st.shared.v2.f64 [r9+112], {fd823, fd824}; +fma.rn.f64 fd825, fd678, fd547, fd681; +sub.f64 fd826, fd1645, fd680; +st.shared.v2.f64 [r9+128], {fd826, fd825}; +fma.rn.f64 fd827, fd686, fd591, fd689; +sub.f64 fd828, fd1643, fd688; +st.shared.v2.f64 [r9+144], {fd828, fd827}; +fma.rn.f64 fd829, fd694, fd423, fd697; +sub.f64 fd830, fd1640, fd696; +st.shared.v2.f64 [r9+160], {fd830, fd829}; +fma.rn.f64 fd831, fd702, fd467, fd705; +sub.f64 fd832, fd1638, fd704; +st.shared.v2.f64 [r9+176], {fd832, fd831}; +fma.rn.f64 fd833, fd710, fd511, fd713; +sub.f64 fd834, fd711, fd712; +st.shared.v2.f64 [r9+192], {fd834, fd833}; +fma.rn.f64 fd835, fd715, fd555, fd720; +sub.f64 fd836, fd1635, fd719; +st.shared.v2.f64 [r9+208], {fd836, fd835}; +fma.rn.f64 fd837, fd725, fd599, fd728; +sub.f64 fd838, fd1633, fd727; +st.shared.v2.f64 [r9+224], {fd838, fd837}; +fma.rn.f64 fd839, fd733, fd424, fd736; +sub.f64 fd840, fd1631, fd735; +st.shared.v2.f64 [r9+240], {fd840, fd839}; +fma.rn.f64 fd841, fd741, fd468, fd744; +sub.f64 fd842, fd1628, fd743; +st.shared.v2.f64 [r9+256], {fd842, fd841}; +fma.rn.f64 fd843, fd749, fd512, fd752; +sub.f64 fd844, fd1626, fd751; +st.shared.v2.f64 [r9+272], {fd844, fd843}; +fma.rn.f64 fd845, fd757, fd556, fd760; +sub.f64 fd846, fd1624, fd759; 
+st.shared.v2.f64 [r9+288], {fd846, fd845}; +sub.f64 fd847, fd1621, fd767; +fma.rn.f64 fd848, fd765, fd600, fd768; +st.shared.v2.f64 [r9+304], {fd847, fd848}; +fma.rn.f64 fd849, fd773, fd416, fd776; +sub.f64 fd850, fd1619, fd775; +st.shared.v2.f64 [r9+320], {fd850, fd849}; +fma.rn.f64 fd851, fd781, fd460, fd784; +sub.f64 fd852, fd1616, fd783; +st.shared.v2.f64 [r9+336], {fd852, fd851}; +fma.rn.f64 fd853, fd789, fd504, fd792; +sub.f64 fd854, fd1614, fd791; +st.shared.v2.f64 [r9+352], {fd854, fd853}; +fma.rn.f64 fd855, fd797, fd548, fd800; +sub.f64 fd856, fd1611, fd799; +st.shared.v2.f64 [r9+368], {fd856, fd855}; +fma.rn.f64 fd857, fd805, fd592, fd808; +sub.f64 fd858, fd806, fd807; +st.shared.v2.f64 [r9+384], {fd858, fd857}; +barrier.sync 0; +mad.lo.s32 r10, r7, -384, r9; +ld.shared.v2.f64 {fd859, fd860}, [r10]; +ld.shared.v2.f64 {fd863, fd864}, [r10+400]; +ld.shared.v2.f64 {fd867, fd868}, [r10+800]; +ld.shared.v2.f64 {fd871, fd872}, [r10+1200]; +ld.shared.v2.f64 {fd875, fd876}, [r10+1600]; +ld.shared.v2.f64 {fd879, fd880}, [r10+2000]; +ld.shared.v2.f64 {fd883, fd884}, [r10+2400]; +ld.shared.v2.f64 {fd887, fd888}, [r10+2800]; +ld.shared.v2.f64 {fd891, fd892}, [r10+3200]; +ld.shared.v2.f64 {fd895, fd896}, [r10+3600]; +ld.shared.v2.f64 {fd899, fd900}, [r10+4000]; +ld.shared.v2.f64 {fd903, fd904}, [r10+4400]; +ld.shared.v2.f64 {fd907, fd908}, [r10+4800]; +ld.shared.v2.f64 {fd911, fd912}, [r10+5200]; +ld.shared.v2.f64 {fd915, fd916}, [r10+5600]; +ld.shared.v2.f64 {fd919, fd920}, [r10+6000]; +ld.shared.v2.f64 {fd923, fd924}, [r10+6400]; +ld.shared.v2.f64 {fd927, fd928}, [r10+6800]; +ld.shared.v2.f64 {fd931, fd932}, [r10+7200]; +ld.shared.v2.f64 {fd935, fd936}, [r10+7600]; +ld.shared.v2.f64 {fd939, fd940}, [r10+8000]; +ld.shared.v2.f64 {fd943, fd944}, [r10+8400]; +ld.shared.v2.f64 {fd947, fd948}, [r10+8800]; +ld.shared.v2.f64 {fd951, fd952}, [r10+9200]; +ld.shared.v2.f64 {fd955, fd956}, [r10+9600]; +add.f64 fd959, fd879, fd939; +add.f64 fd961, fd899, fd919; +add.f64 
fd1610, fd859, fd959; +add.f64 fd962, fd961, fd1610; +add.f64 fd963, fd880, fd940; +add.f64 fd965, fd900, fd920; +add.f64 fd1609, fd860, fd963; +add.f64 fd966, fd965, fd1609; +mul.f64 fd968, fd961, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1608, fd959, 0d3FD3C6EF372FE950, fd859; +sub.f64 fd969, fd1608, fd968; +sub.f64 fd970, fd880, fd940; +sub.f64 fd972, fd900, fd920; +mul.f64 fd1606, fd970, 0d3FEE6F0E134454FF; +mul.f64 fd1607, fd972, 0dBFE2CF2304755A5E; +sub.f64 fd974, fd1607, fd1606; +sub.f64 fd975, fd969, fd974; +add.f64 fd976, fd974, fd969; +mul.f64 fd977, fd959, 0d3FE9E3779B97F4A8; +sub.f64 fd978, fd859, fd977; +fma.rn.f64 fd979, fd961, 0d3FD3C6EF372FE950, fd978; +mul.f64 fd980, fd970, 0d3FE2CF2304755A5E; +mul.f64 fd981, fd972, 0d3FEE6F0E134454FF; +sub.f64 fd982, fd981, fd980; +sub.f64 fd983, fd979, fd982; +add.f64 fd984, fd982, fd979; +fma.rn.f64 fd1604, fd963, 0d3FD3C6EF372FE950, fd860; +mul.f64 fd1605, fd965, 0d3FE9E3779B97F4A8; +sub.f64 fd987, fd1604, fd1605; +sub.f64 fd988, fd879, fd939; +sub.f64 fd990, fd899, fd919; +mul.f64 fd1602, fd988, 0d3FEE6F0E134454FF; +mul.f64 fd1603, fd990, 0dBFE2CF2304755A5E; +sub.f64 fd992, fd1603, fd1602; +add.f64 fd993, fd992, fd987; +sub.f64 fd994, fd987, fd992; +mul.f64 fd995, fd963, 0d3FE9E3779B97F4A8; +sub.f64 fd996, fd860, fd995; +fma.rn.f64 fd997, fd965, 0d3FD3C6EF372FE950, fd996; +mul.f64 fd998, fd988, 0d3FE2CF2304755A5E; +mul.f64 fd999, fd990, 0d3FEE6F0E134454FF; +sub.f64 fd1000, fd999, fd998; +add.f64 fd1001, fd1000, fd997; +sub.f64 fd1002, fd997, fd1000; +add.f64 fd1003, fd883, fd943; +add.f64 fd1005, fd903, fd923; +add.f64 fd1601, fd863, fd1003; +add.f64 fd1006, fd1005, fd1601; +add.f64 fd1007, fd884, fd944; +add.f64 fd1009, fd904, fd924; +add.f64 fd1600, fd864, fd1007; +add.f64 fd1010, fd1009, fd1600; +fma.rn.f64 fd1598, fd1003, 0d3FD3C6EF372FE950, fd863; +mul.f64 fd1599, fd1005, 0d3FE9E3779B97F4A8; +sub.f64 fd1013, fd1598, fd1599; +sub.f64 fd1014, fd884, fd944; +sub.f64 fd1016, fd904, fd924; +mul.f64 fd1596, fd1014, 
0d3FEE6F0E134454FF; +mul.f64 fd1597, fd1016, 0dBFE2CF2304755A5E; +sub.f64 fd1018, fd1597, fd1596; +sub.f64 fd1019, fd1013, fd1018; +add.f64 fd1020, fd1018, fd1013; +mul.f64 fd1021, fd1003, 0d3FE9E3779B97F4A8; +sub.f64 fd1022, fd863, fd1021; +fma.rn.f64 fd1023, fd1005, 0d3FD3C6EF372FE950, fd1022; +mul.f64 fd1024, fd1014, 0d3FE2CF2304755A5E; +mul.f64 fd1025, fd1016, 0d3FEE6F0E134454FF; +sub.f64 fd1026, fd1025, fd1024; +sub.f64 fd1027, fd1023, fd1026; +add.f64 fd1028, fd1026, fd1023; +mul.f64 fd1030, fd1009, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1595, fd1007, 0d3FD3C6EF372FE950, fd864; +sub.f64 fd1031, fd1595, fd1030; +sub.f64 fd1032, fd883, fd943; +sub.f64 fd1034, fd903, fd923; +mul.f64 fd1035, fd1034, 0dBFE2CF2304755A5E; +mul.f64 fd1594, fd1032, 0d3FEE6F0E134454FF; +sub.f64 fd1036, fd1035, fd1594; +add.f64 fd1037, fd1036, fd1031; +sub.f64 fd1038, fd1031, fd1036; +mul.f64 fd1039, fd1007, 0d3FE9E3779B97F4A8; +sub.f64 fd1040, fd864, fd1039; +fma.rn.f64 fd1041, fd1009, 0d3FD3C6EF372FE950, fd1040; +mul.f64 fd1042, fd1032, 0d3FE2CF2304755A5E; +mul.f64 fd1043, fd1034, 0d3FEE6F0E134454FF; +sub.f64 fd1044, fd1043, fd1042; +add.f64 fd1045, fd1044, fd1041; +sub.f64 fd1046, fd1041, fd1044; +add.f64 fd1047, fd887, fd947; +add.f64 fd1049, fd907, fd927; +add.f64 fd1593, fd867, fd1047; +add.f64 fd1050, fd1049, fd1593; +add.f64 fd1051, fd888, fd948; +add.f64 fd1053, fd908, fd928; +add.f64 fd1592, fd868, fd1051; +add.f64 fd1054, fd1053, fd1592; +mul.f64 fd1056, fd1049, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1591, fd1047, 0d3FD3C6EF372FE950, fd867; +sub.f64 fd1057, fd1591, fd1056; +sub.f64 fd1058, fd888, fd948; +sub.f64 fd1060, fd908, fd928; +mul.f64 fd1061, fd1060, 0dBFE2CF2304755A5E; +mul.f64 fd1590, fd1058, 0d3FEE6F0E134454FF; +sub.f64 fd1062, fd1061, fd1590; +sub.f64 fd1063, fd1057, fd1062; +add.f64 fd1064, fd1062, fd1057; +mul.f64 fd1065, fd1047, 0d3FE9E3779B97F4A8; +sub.f64 fd1066, fd867, fd1065; +fma.rn.f64 fd1067, fd1049, 0d3FD3C6EF372FE950, fd1066; +mul.f64 fd1068, fd1058, 
0d3FE2CF2304755A5E; +mul.f64 fd1069, fd1060, 0d3FEE6F0E134454FF; +sub.f64 fd1070, fd1069, fd1068; +sub.f64 fd1071, fd1067, fd1070; +add.f64 fd1072, fd1070, fd1067; +fma.rn.f64 fd1588, fd1051, 0d3FD3C6EF372FE950, fd868; +mul.f64 fd1589, fd1053, 0d3FE9E3779B97F4A8; +sub.f64 fd1075, fd1588, fd1589; +sub.f64 fd1076, fd887, fd947; +sub.f64 fd1078, fd907, fd927; +mul.f64 fd1586, fd1076, 0d3FEE6F0E134454FF; +mul.f64 fd1587, fd1078, 0dBFE2CF2304755A5E; +sub.f64 fd1080, fd1587, fd1586; +add.f64 fd1081, fd1080, fd1075; +sub.f64 fd1082, fd1075, fd1080; +mul.f64 fd1083, fd1051, 0d3FE9E3779B97F4A8; +sub.f64 fd1084, fd868, fd1083; +fma.rn.f64 fd1085, fd1053, 0d3FD3C6EF372FE950, fd1084; +mul.f64 fd1086, fd1076, 0d3FE2CF2304755A5E; +mul.f64 fd1087, fd1078, 0d3FEE6F0E134454FF; +sub.f64 fd1088, fd1087, fd1086; +add.f64 fd1089, fd1088, fd1085; +sub.f64 fd1090, fd1085, fd1088; +add.f64 fd1091, fd891, fd951; +add.f64 fd1093, fd911, fd931; +add.f64 fd1585, fd871, fd1091; +add.f64 fd1094, fd1093, fd1585; +add.f64 fd1095, fd892, fd952; +add.f64 fd1097, fd912, fd932; +add.f64 fd1584, fd872, fd1095; +add.f64 fd1098, fd1097, fd1584; +fma.rn.f64 fd1582, fd1091, 0d3FD3C6EF372FE950, fd871; +mul.f64 fd1583, fd1093, 0d3FE9E3779B97F4A8; +sub.f64 fd1101, fd1582, fd1583; +sub.f64 fd1102, fd892, fd952; +sub.f64 fd1104, fd912, fd932; +mul.f64 fd1580, fd1102, 0d3FEE6F0E134454FF; +mul.f64 fd1581, fd1104, 0dBFE2CF2304755A5E; +sub.f64 fd1106, fd1581, fd1580; +sub.f64 fd1107, fd1101, fd1106; +add.f64 fd1108, fd1106, fd1101; +mul.f64 fd1109, fd1091, 0d3FE9E3779B97F4A8; +sub.f64 fd1110, fd871, fd1109; +fma.rn.f64 fd1111, fd1093, 0d3FD3C6EF372FE950, fd1110; +mul.f64 fd1112, fd1102, 0d3FE2CF2304755A5E; +mul.f64 fd1113, fd1104, 0d3FEE6F0E134454FF; +sub.f64 fd1114, fd1113, fd1112; +sub.f64 fd1115, fd1111, fd1114; +add.f64 fd1116, fd1114, fd1111; +mul.f64 fd1118, fd1097, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1579, fd1095, 0d3FD3C6EF372FE950, fd872; +sub.f64 fd1119, fd1579, fd1118; +sub.f64 fd1120, fd891, fd951; 
+sub.f64 fd1122, fd911, fd931; +mul.f64 fd1577, fd1120, 0d3FEE6F0E134454FF; +mul.f64 fd1578, fd1122, 0dBFE2CF2304755A5E; +sub.f64 fd1124, fd1578, fd1577; +add.f64 fd1125, fd1124, fd1119; +sub.f64 fd1126, fd1119, fd1124; +mul.f64 fd1127, fd1095, 0d3FE9E3779B97F4A8; +sub.f64 fd1128, fd872, fd1127; +fma.rn.f64 fd1129, fd1097, 0d3FD3C6EF372FE950, fd1128; +mul.f64 fd1130, fd1120, 0d3FE2CF2304755A5E; +mul.f64 fd1131, fd1122, 0d3FEE6F0E134454FF; +sub.f64 fd1132, fd1131, fd1130; +add.f64 fd1133, fd1132, fd1129; +sub.f64 fd1134, fd1129, fd1132; +add.f64 fd1135, fd895, fd955; +add.f64 fd1137, fd915, fd935; +add.f64 fd1576, fd875, fd1135; +add.f64 fd1138, fd1137, fd1576; +add.f64 fd1139, fd896, fd956; +add.f64 fd1141, fd916, fd936; +add.f64 fd1575, fd876, fd1139; +add.f64 fd1142, fd1141, fd1575; +mul.f64 fd1144, fd1137, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1574, fd1135, 0d3FD3C6EF372FE950, fd875; +sub.f64 fd1145, fd1574, fd1144; +sub.f64 fd1146, fd896, fd956; +sub.f64 fd1148, fd916, fd936; +mul.f64 fd1572, fd1146, 0d3FEE6F0E134454FF; +mul.f64 fd1573, fd1148, 0dBFE2CF2304755A5E; +sub.f64 fd1150, fd1573, fd1572; +sub.f64 fd1151, fd1145, fd1150; +add.f64 fd1152, fd1150, fd1145; +mul.f64 fd1153, fd1135, 0d3FE9E3779B97F4A8; +sub.f64 fd1154, fd875, fd1153; +fma.rn.f64 fd1155, fd1137, 0d3FD3C6EF372FE950, fd1154; +mul.f64 fd1156, fd1146, 0d3FE2CF2304755A5E; +mul.f64 fd1157, fd1148, 0d3FEE6F0E134454FF; +sub.f64 fd1158, fd1157, fd1156; +sub.f64 fd1159, fd1155, fd1158; +add.f64 fd1160, fd1158, fd1155; +mul.f64 fd1162, fd1141, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1571, fd1139, 0d3FD3C6EF372FE950, fd876; +sub.f64 fd1163, fd1571, fd1162; +sub.f64 fd1164, fd895, fd955; +sub.f64 fd1166, fd915, fd935; +mul.f64 fd1569, fd1164, 0d3FEE6F0E134454FF; +mul.f64 fd1570, fd1166, 0dBFE2CF2304755A5E; +sub.f64 fd1168, fd1570, fd1569; +add.f64 fd1169, fd1168, fd1163; +sub.f64 fd1170, fd1163, fd1168; +mul.f64 fd1171, fd1139, 0d3FE9E3779B97F4A8; +sub.f64 fd1172, fd876, fd1171; +fma.rn.f64 fd1173, fd1141, 
0d3FD3C6EF372FE950, fd1172; +mul.f64 fd1174, fd1164, 0d3FE2CF2304755A5E; +mul.f64 fd1175, fd1166, 0d3FEE6F0E134454FF; +sub.f64 fd1176, fd1175, fd1174; +add.f64 fd1177, fd1176, fd1173; +sub.f64 fd1178, fd1173, fd1176; +mul.f64 fd1567, fd1019, 0d3FEEFEA21D101EE0; +mul.f64 fd1568, fd1037, 0dBFCFD511FA1C0796; +sub.f64 fd1181, fd1567, fd1568; +mul.f64 fd1182, fd1037, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd1183, fd1019, 0dBFCFD511FA1C0796, fd1182; +mul.f64 fd1565, fd1063, 0d3FEC0AB44E81C059; +mul.f64 fd1566, fd1081, 0dBFDED50D5CBFA951; +sub.f64 fd1186, fd1565, fd1566; +mul.f64 fd1187, fd1081, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1188, fd1063, 0dBFDED50D5CBFA951, fd1187; +mul.f64 fd1563, fd1107, 0d3FE753B603D2B816; +mul.f64 fd1564, fd1125, 0dBFE5E7CF55112014; +sub.f64 fd1191, fd1563, fd1564; +mul.f64 fd1192, fd1125, 0d3FE753B603D2B816; +fma.rn.f64 fd1193, fd1107, 0dBFE5E7CF55112014, fd1192; +mul.f64 fd1195, fd1169, 0dBFEB04BBFF642E86; +mul.f64 fd1562, fd1151, 0d3FE1257E3C182B51; +sub.f64 fd1196, fd1562, fd1195; +mul.f64 fd1197, fd1169, 0d3FE1257E3C182B51; +fma.rn.f64 fd1198, fd1151, 0dBFEB04BBFF642E86, fd1197; +mul.f64 fd1200, fd1045, 0dBFDED50D5CBFA951; +mul.f64 fd1561, fd1027, 0d3FEC0AB44E81C059; +sub.f64 fd1201, fd1561, fd1200; +mul.f64 fd1202, fd1045, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1203, fd1027, 0dBFDED50D5CBFA951, fd1202; +mul.f64 fd1205, fd1089, 0dBFEB04BBFF642E86; +mul.f64 fd1560, fd1071, 0d3FE1257E3C182B51; +sub.f64 fd1206, fd1560, fd1205; +mul.f64 fd1207, fd1089, 0d3FE1257E3C182B51; +fma.rn.f64 fd1208, fd1071, 0dBFEB04BBFF642E86, fd1207; +mul.f64 fd1210, fd1133, 0dBFEFEFD5BFE443FE; +mul.f64 fd1559, fd1115, 0d3FB0130A1BE09379; +sub.f64 fd1211, fd1559, fd1210; +mul.f64 fd1212, fd1133, 0d3FB0130A1BE09379; +fma.rn.f64 fd1213, fd1115, 0dBFEFEFD5BFE443FE, fd1212; +mul.f64 fd1215, fd1177, 0dBFECF457DCDC158C; +mul.f64 fd1558, fd1159, 0dBFDB3FF7C925819C; +sub.f64 fd1216, fd1558, fd1215; +mul.f64 fd1217, fd1177, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1218, fd1159, 
0dBFECF457DCDC158C, fd1217; +mul.f64 fd1556, fd1028, 0d3FE753B603D2B816; +mul.f64 fd1557, fd1046, 0dBFE5E7CF55112014; +sub.f64 fd1221, fd1556, fd1557; +mul.f64 fd1222, fd1046, 0d3FE753B603D2B816; +fma.rn.f64 fd1223, fd1028, 0dBFE5E7CF55112014, fd1222; +mul.f64 fd1554, fd1072, 0d3FB0130A1BE09379; +mul.f64 fd1555, fd1090, 0dBFEFEFD5BFE443FE; +sub.f64 fd1226, fd1554, fd1555; +mul.f64 fd1227, fd1090, 0d3FB0130A1BE09379; +fma.rn.f64 fd1228, fd1072, 0dBFEFEFD5BFE443FE, fd1227; +mul.f64 fd1552, fd1116, 0dBFE465C6FEB501BC; +mul.f64 fd1553, fd1134, 0dBFE8A80B635B6BEA; +sub.f64 fd1231, fd1552, fd1553; +mul.f64 fd1232, fd1134, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1233, fd1116, 0dBFE8A80B635B6BEA, fd1232; +mul.f64 fd1550, fd1160, 0dBFEFBF675480D903; +mul.f64 fd1551, fd1178, 0dBFC00AEB5DA15BE0; +sub.f64 fd1236, fd1550, fd1551; +mul.f64 fd1237, fd1178, 0dBFEFBF675480D903; +fma.rn.f64 fd1238, fd1160, 0dBFC00AEB5DA15BE0, fd1237; +mul.f64 fd1240, fd1038, 0dBFEB04BBFF642E86; +mul.f64 fd1549, fd1020, 0d3FE1257E3C182B51; +sub.f64 fd1241, fd1549, fd1240; +mul.f64 fd1242, fd1038, 0d3FE1257E3C182B51; +fma.rn.f64 fd1243, fd1020, 0dBFEB04BBFF642E86, fd1242; +mul.f64 fd1245, fd1082, 0dBFECF457DCDC158C; +mul.f64 fd1548, fd1064, 0dBFDB3FF7C925819C; +sub.f64 fd1246, fd1548, fd1245; +mul.f64 fd1247, fd1082, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1248, fd1064, 0dBFECF457DCDC158C, fd1247; +mul.f64 fd1250, fd1126, 0dBFC00AEB5DA15BE0; +mul.f64 fd1547, fd1108, 0dBFEFBF675480D903; +sub.f64 fd1251, fd1547, fd1250; +mul.f64 fd1252, fd1126, 0dBFEFBF675480D903; +fma.rn.f64 fd1253, fd1108, 0dBFC00AEB5DA15BE0, fd1252; +mul.f64 fd1255, fd1170, 0d3FE8A80B635B6BEA; +mul.f64 fd1546, fd1152, 0dBFE465C6FEB501BC; +sub.f64 fd1256, fd1546, fd1255; +mul.f64 fd1257, fd1170, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1258, fd1152, 0d3FE8A80B635B6BEA, fd1257; +add.f64 fd1259, fd1006, fd1138; +add.f64 fd1261, fd1050, fd1094; +fma.rn.f64 fd1544, fd1259, 0d3FD3C6EF372FE950, fd962; +mul.f64 fd1545, fd1261, 0d3FE9E3779B97F4A8; +sub.f64 
fd1267, fd1544, fd1545; +add.f64 fd1543, fd1010, fd1142; +sub.f64 fd1268, fd1010, fd1142; +add.f64 fd1542, fd1054, fd1098; +sub.f64 fd1270, fd1054, fd1098; +mul.f64 fd1540, fd1268, 0d3FEE6F0E134454FF; +mul.f64 fd1541, fd1270, 0dBFE2CF2304755A5E; +sub.f64 fd1272, fd1541, fd1540; +add.f64 fd1539, fd962, fd1259; +mul.f64 fd1273, fd1259, 0d3FE9E3779B97F4A8; +sub.f64 fd1274, fd962, fd1273; +fma.rn.f64 fd1275, fd1261, 0d3FD3C6EF372FE950, fd1274; +mul.f64 fd1276, fd1268, 0d3FE2CF2304755A5E; +mul.f64 fd1277, fd1270, 0d3FEE6F0E134454FF; +sub.f64 fd1278, fd1277, fd1276; +fma.rn.f64 fd1537, fd1543, 0d3FD3C6EF372FE950, fd966; +mul.f64 fd1538, fd1542, 0d3FE9E3779B97F4A8; +sub.f64 fd1281, fd1537, fd1538; +sub.f64 fd1282, fd1006, fd1138; +sub.f64 fd1284, fd1050, fd1094; +mul.f64 fd1535, fd1282, 0d3FEE6F0E134454FF; +mul.f64 fd1536, fd1284, 0dBFE2CF2304755A5E; +sub.f64 fd1286, fd1536, fd1535; +add.f64 fd1534, fd966, fd1543; +mul.f64 fd1287, fd1543, 0d3FE9E3779B97F4A8; +sub.f64 fd1288, fd966, fd1287; +fma.rn.f64 fd1289, fd1542, 0d3FD3C6EF372FE950, fd1288; +mul.f64 fd1290, fd1282, 0d3FE2CF2304755A5E; +mul.f64 fd1291, fd1284, 0d3FEE6F0E134454FF; +sub.f64 fd1292, fd1291, fd1290; +add.f64 fd1293, fd1181, fd1196; +add.f64 fd1295, fd1186, fd1191; +mul.f64 fd1300, fd1295, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1533, fd1293, 0d3FD3C6EF372FE950, fd975; +sub.f64 fd1301, fd1533, fd1300; +add.f64 fd1532, fd1183, fd1198; +sub.f64 fd1302, fd1183, fd1198; +add.f64 fd1531, fd1188, fd1193; +sub.f64 fd1304, fd1188, fd1193; +mul.f64 fd1305, fd1304, 0dBFE2CF2304755A5E; +mul.f64 fd1530, fd1302, 0d3FEE6F0E134454FF; +sub.f64 fd1306, fd1305, fd1530; +add.f64 fd1529, fd975, fd1293; +mul.f64 fd1307, fd1293, 0d3FE9E3779B97F4A8; +sub.f64 fd1308, fd975, fd1307; +fma.rn.f64 fd1309, fd1295, 0d3FD3C6EF372FE950, fd1308; +mul.f64 fd1310, fd1302, 0d3FE2CF2304755A5E; +mul.f64 fd1311, fd1304, 0d3FEE6F0E134454FF; +sub.f64 fd1312, fd1311, fd1310; +mul.f64 fd1314, fd1531, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1528, fd1532, 
0d3FD3C6EF372FE950, fd993; +sub.f64 fd1315, fd1528, fd1314; +sub.f64 fd1316, fd1181, fd1196; +sub.f64 fd1318, fd1186, fd1191; +mul.f64 fd1526, fd1316, 0d3FEE6F0E134454FF; +mul.f64 fd1527, fd1318, 0dBFE2CF2304755A5E; +sub.f64 fd1320, fd1527, fd1526; +add.f64 fd1525, fd993, fd1532; +mul.f64 fd1321, fd1532, 0d3FE9E3779B97F4A8; +sub.f64 fd1322, fd993, fd1321; +fma.rn.f64 fd1323, fd1531, 0d3FD3C6EF372FE950, fd1322; +mul.f64 fd1324, fd1316, 0d3FE2CF2304755A5E; +mul.f64 fd1325, fd1318, 0d3FEE6F0E134454FF; +sub.f64 fd1326, fd1325, fd1324; +add.f64 fd1327, fd1201, fd1216; +add.f64 fd1329, fd1206, fd1211; +fma.rn.f64 fd1523, fd1327, 0d3FD3C6EF372FE950, fd983; +mul.f64 fd1524, fd1329, 0d3FE9E3779B97F4A8; +sub.f64 fd1335, fd1523, fd1524; +add.f64 fd1522, fd1203, fd1218; +sub.f64 fd1336, fd1203, fd1218; +add.f64 fd1521, fd1208, fd1213; +sub.f64 fd1338, fd1208, fd1213; +mul.f64 fd1519, fd1336, 0d3FEE6F0E134454FF; +mul.f64 fd1520, fd1338, 0dBFE2CF2304755A5E; +sub.f64 fd1340, fd1520, fd1519; +add.f64 fd1518, fd983, fd1327; +mul.f64 fd1341, fd1327, 0d3FE9E3779B97F4A8; +sub.f64 fd1342, fd983, fd1341; +fma.rn.f64 fd1343, fd1329, 0d3FD3C6EF372FE950, fd1342; +mul.f64 fd1344, fd1336, 0d3FE2CF2304755A5E; +mul.f64 fd1345, fd1338, 0d3FEE6F0E134454FF; +sub.f64 fd1346, fd1345, fd1344; +fma.rn.f64 fd1516, fd1522, 0d3FD3C6EF372FE950, fd1001; +mul.f64 fd1517, fd1521, 0d3FE9E3779B97F4A8; +sub.f64 fd1349, fd1516, fd1517; +sub.f64 fd1350, fd1201, fd1216; +sub.f64 fd1352, fd1206, fd1211; +mul.f64 fd1514, fd1350, 0d3FEE6F0E134454FF; +mul.f64 fd1515, fd1352, 0dBFE2CF2304755A5E; +sub.f64 fd1354, fd1515, fd1514; +add.f64 fd1513, fd1001, fd1522; +mul.f64 fd1355, fd1522, 0d3FE9E3779B97F4A8; +sub.f64 fd1356, fd1001, fd1355; +fma.rn.f64 fd1357, fd1521, 0d3FD3C6EF372FE950, fd1356; +mul.f64 fd1358, fd1350, 0d3FE2CF2304755A5E; +mul.f64 fd1359, fd1352, 0d3FEE6F0E134454FF; +sub.f64 fd1360, fd1359, fd1358; +add.f64 fd1361, fd1221, fd1236; +add.f64 fd1363, fd1226, fd1231; +mul.f64 fd1368, fd1363, 
0d3FE9E3779B97F4A8; +fma.rn.f64 fd1512, fd1361, 0d3FD3C6EF372FE950, fd984; +sub.f64 fd1369, fd1512, fd1368; +add.f64 fd1511, fd1223, fd1238; +sub.f64 fd1370, fd1223, fd1238; +add.f64 fd1510, fd1228, fd1233; +sub.f64 fd1372, fd1228, fd1233; +mul.f64 fd1508, fd1370, 0d3FEE6F0E134454FF; +mul.f64 fd1509, fd1372, 0dBFE2CF2304755A5E; +sub.f64 fd1374, fd1509, fd1508; +add.f64 fd1507, fd984, fd1361; +mul.f64 fd1375, fd1361, 0d3FE9E3779B97F4A8; +sub.f64 fd1376, fd984, fd1375; +fma.rn.f64 fd1377, fd1363, 0d3FD3C6EF372FE950, fd1376; +mul.f64 fd1378, fd1370, 0d3FE2CF2304755A5E; +mul.f64 fd1379, fd1372, 0d3FEE6F0E134454FF; +sub.f64 fd1380, fd1379, fd1378; +fma.rn.f64 fd1505, fd1511, 0d3FD3C6EF372FE950, fd1002; +mul.f64 fd1506, fd1510, 0d3FE9E3779B97F4A8; +sub.f64 fd1383, fd1505, fd1506; +sub.f64 fd1384, fd1221, fd1236; +sub.f64 fd1386, fd1226, fd1231; +mul.f64 fd1503, fd1384, 0d3FEE6F0E134454FF; +mul.f64 fd1504, fd1386, 0dBFE2CF2304755A5E; +sub.f64 fd1388, fd1504, fd1503; +add.f64 fd1502, fd1002, fd1511; +mul.f64 fd1389, fd1511, 0d3FE9E3779B97F4A8; +sub.f64 fd1390, fd1002, fd1389; +fma.rn.f64 fd1391, fd1510, 0d3FD3C6EF372FE950, fd1390; +mul.f64 fd1392, fd1384, 0d3FE2CF2304755A5E; +mul.f64 fd1393, fd1386, 0d3FEE6F0E134454FF; +sub.f64 fd1394, fd1393, fd1392; +add.f64 fd1395, fd1241, fd1256; +add.f64 fd1397, fd1246, fd1251; +mul.f64 fd1402, fd1397, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1501, fd1395, 0d3FD3C6EF372FE950, fd976; +sub.f64 fd1403, fd1501, fd1402; +add.f64 fd1500, fd1243, fd1258; +sub.f64 fd1404, fd1243, fd1258; +add.f64 fd1499, fd1248, fd1253; +sub.f64 fd1406, fd1248, fd1253; +mul.f64 fd1407, fd1406, 0dBFE2CF2304755A5E; +mul.f64 fd1498, fd1404, 0d3FEE6F0E134454FF; +sub.f64 fd1408, fd1407, fd1498; +add.f64 fd1497, fd976, fd1395; +mul.f64 fd1409, fd1395, 0d3FE9E3779B97F4A8; +sub.f64 fd1410, fd976, fd1409; +fma.rn.f64 fd1411, fd1397, 0d3FD3C6EF372FE950, fd1410; +mul.f64 fd1412, fd1404, 0d3FE2CF2304755A5E; +mul.f64 fd1413, fd1406, 0d3FEE6F0E134454FF; +sub.f64 fd1414, fd1413, 
fd1412; +mul.f64 fd1416, fd1499, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1496, fd1500, 0d3FD3C6EF372FE950, fd994; +sub.f64 fd1417, fd1496, fd1416; +sub.f64 fd1418, fd1241, fd1256; +sub.f64 fd1420, fd1246, fd1251; +mul.f64 fd1494, fd1418, 0d3FEE6F0E134454FF; +mul.f64 fd1495, fd1420, 0dBFE2CF2304755A5E; +sub.f64 fd1422, fd1495, fd1494; +add.f64 fd1493, fd994, fd1500; +mul.f64 fd1423, fd1500, 0d3FE9E3779B97F4A8; +sub.f64 fd1424, fd994, fd1423; +fma.rn.f64 fd1425, fd1499, 0d3FD3C6EF372FE950, fd1424; +mul.f64 fd1426, fd1418, 0d3FE2CF2304755A5E; +mul.f64 fd1427, fd1420, 0d3FEE6F0E134454FF; +sub.f64 fd1428, fd1427, fd1426; +add.f64 %1, fd1542, fd1534; +add.f64 %0, fd1261, fd1539; +add.f64 %3, fd1531, fd1525; +add.f64 %2, fd1295, fd1529; +add.f64 %5, fd1521, fd1513; +add.f64 %4, fd1329, fd1518; +add.f64 %7, fd1510, fd1502; +add.f64 %6, fd1363, fd1507; +add.f64 %9, fd1499, fd1493; +add.f64 %8, fd1397, fd1497; +sub.f64 %10, fd1267, fd1272; +add.f64 %11, fd1286, fd1281; +sub.f64 %12, fd1301, fd1306; +add.f64 %13, fd1320, fd1315; +sub.f64 %14, fd1335, fd1340; +add.f64 %15, fd1354, fd1349; +add.f64 %17, fd1388, fd1383; +sub.f64 %16, fd1369, fd1374; +add.f64 %19, fd1422, fd1417; +sub.f64 %18, fd1403, fd1408; +add.f64 %21, fd1292, fd1289; +sub.f64 %20, fd1275, fd1278; +sub.f64 %22, fd1309, fd1312; +add.f64 %23, fd1326, fd1323; +sub.f64 %24, fd1343, fd1346; +add.f64 %25, fd1360, fd1357; +sub.f64 %26, fd1377, fd1380; +add.f64 %27, fd1394, fd1391; +add.f64 %29, fd1428, fd1425; +sub.f64 %28, fd1411, fd1414; +sub.f64 %31, fd1289, fd1292; +add.f64 %30, fd1278, fd1275; +sub.f64 %33, fd1323, fd1326; +add.f64 %32, fd1312, fd1309; +sub.f64 %35, fd1357, fd1360; +add.f64 %34, fd1346, fd1343; +sub.f64 %37, fd1391, fd1394; +add.f64 %36, fd1380, fd1377; +sub.f64 %39, fd1425, fd1428; +add.f64 %38, fd1414, fd1411; +sub.f64 %41, fd1281, fd1286; +add.f64 %40, fd1272, fd1267; +sub.f64 %43, fd1315, fd1320; +add.f64 %42, fd1306, fd1301; +sub.f64 %45, fd1349, fd1354; +add.f64 %44, fd1340, fd1335; +sub.f64 
%47, fd1383, fd1388; +add.f64 %46, fd1374, fd1369; +sub.f64 %49, fd1417, fd1422; +add.f64 %48, fd1408, fd1403; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_625), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[5].y), "d"(rmem[10].y), "d"(rmem[20].y), "d"(rmem[16].y), "d"(rmem[1].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[22].y), 
"d"(rmem[7].y), "d"(rmem[17].y), "d"(rmem[23].y), "d"(rmem[8].y), "d"(rmem[13].y), "d"(rmem[19].y), "d"(rmem[4].y), "d"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<541, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<341>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 5000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %16, %24; +add.f64 fd22, %14, fd21; +add.f64 fd23, %19, %22; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %18, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %21, %23; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %14; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %18, %25; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %21, %23; +mul.f64 fd35, fd34, 0dBFE2CF2304755A5E; +sub.f64 fd36, fd35, fd33; +sub.f64 fd37, fd31, fd36; +add.f64 fd38, fd36, fd31; +mul.f64 fd39, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd40, %14, fd39; +fma.rn.f64 fd41, fd23, 0d3FD3C6EF372FE950, fd40; +mul.f64 fd42, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd43, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd44, fd43, fd42; +sub.f64 fd45, fd41, fd44; +add.f64 fd46, fd44, fd41; +fma.rn.f64 fd47, fd25, 0d3FD3C6EF372FE950, %15; +mul.f64 fd48, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd49, fd47, fd48; +sub.f64 fd50, %16, %24; +mul.f64 fd51, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd52, %19, %22; +mul.f64 fd53, fd52, 0dBFE2CF2304755A5E; +sub.f64 fd54, fd53, fd51; +add.f64 fd55, fd54, fd49; +sub.f64 fd56, fd49, fd54; +mul.f64 fd57, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd58, %15, fd57; +fma.rn.f64 fd59, fd27, 0d3FD3C6EF372FE950, fd58; +mul.f64 fd60, fd50, 0d3FE2CF2304755A5E; +mul.f64 fd61, fd52, 0d3FEE6F0E134454FF; +sub.f64 fd62, fd61, fd60; +add.f64 fd63, fd62, fd59; +sub.f64 fd64, fd59, fd62; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; 
+mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd65, fd66}, [rd6]; +mul.f64 fd69, fd65, fd37; +mul.f64 fd70, fd66, fd55; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd65, fd55; +fma.rn.f64 fd73, fd66, fd37, fd72; +mul.f64 fd74, fd65, fd65; +mul.f64 fd75, fd66, fd66; +sub.f64 fd76, fd74, fd75; +mul.f64 fd77, fd66, fd65; +fma.rn.f64 fd78, fd66, fd65, fd77; +mul.f64 fd79, fd76, fd45; +mul.f64 fd80, fd78, fd63; +sub.f64 fd81, fd79, fd80; +mul.f64 fd82, fd76, fd63; +fma.rn.f64 fd83, fd78, fd45, fd82; +ld.global.v2.f64 {fd84, fd85}, [rd6+2000]; +mul.f64 fd88, fd84, fd46; +mul.f64 fd89, fd85, fd64; +sub.f64 fd90, fd88, fd89; +mul.f64 fd91, fd84, fd64; +fma.rn.f64 fd92, fd85, fd46, fd91; +mul.f64 fd93, fd65, fd84; +mul.f64 fd94, fd66, fd85; +sub.f64 fd95, fd93, fd94; +mul.f64 fd96, fd65, fd85; +fma.rn.f64 fd97, fd66, fd84, fd96; +mul.f64 fd98, fd95, fd38; +mul.f64 fd99, fd97, fd56; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd95, fd56; +fma.rn.f64 fd102, fd97, fd38, fd101; +mad.lo.s32 r8, r5, 5000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd103, [r11]; +ld.shared.f64 fd104, [r11+1000]; +ld.shared.f64 fd105, [r11+2000]; +ld.shared.f64 fd106, [r11+3000]; +ld.shared.f64 fd107, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd73; +st.shared.f64 [r9+16], fd83; +st.shared.f64 [r9+24], fd92; +st.shared.f64 [r9+32], fd102; +barrier.sync 0; +ld.shared.f64 fd108, [r11]; +ld.shared.f64 fd109, [r11+1000]; +ld.shared.f64 fd110, [r11+2000]; +ld.shared.f64 fd111, [r11+3000]; +ld.shared.f64 fd112, [r11+4000]; +add.f64 fd113, fd104, fd107; +add.f64 fd114, fd103, fd113; +add.f64 fd115, fd105, fd106; +add.f64 fd116, fd115, fd114; +add.f64 fd117, fd109, fd112; +add.f64 fd118, fd108, fd117; 
+add.f64 fd119, fd110, fd111; +add.f64 fd120, fd119, fd118; +fma.rn.f64 fd121, fd113, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd122, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd109, fd112; +mul.f64 fd125, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd126, fd110, fd111; +mul.f64 fd127, fd126, 0dBFE2CF2304755A5E; +sub.f64 fd128, fd127, fd125; +sub.f64 fd129, fd123, fd128; +add.f64 fd130, fd128, fd123; +mul.f64 fd131, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd132, fd103, fd131; +fma.rn.f64 fd133, fd115, 0d3FD3C6EF372FE950, fd132; +mul.f64 fd134, fd124, 0d3FE2CF2304755A5E; +mul.f64 fd135, fd126, 0d3FEE6F0E134454FF; +sub.f64 fd136, fd135, fd134; +sub.f64 fd137, fd133, fd136; +add.f64 fd138, fd136, fd133; +fma.rn.f64 fd139, fd117, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd140, fd119, 0d3FE9E3779B97F4A8; +sub.f64 fd141, fd139, fd140; +sub.f64 fd142, fd104, fd107; +mul.f64 fd143, fd142, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd105, fd106; +mul.f64 fd145, fd144, 0dBFE2CF2304755A5E; +sub.f64 fd146, fd145, fd143; +add.f64 fd147, fd146, fd141; +sub.f64 fd148, fd141, fd146; +mul.f64 fd149, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd150, fd108, fd149; +fma.rn.f64 fd151, fd119, 0d3FD3C6EF372FE950, fd150; +mul.f64 fd152, fd142, 0d3FE2CF2304755A5E; +mul.f64 fd153, fd144, 0d3FEE6F0E134454FF; +sub.f64 fd154, fd153, fd152; +add.f64 fd155, fd154, fd151; +sub.f64 fd156, fd151, fd154; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd157, fd158}, [rd11]; +mul.f64 fd161, fd157, fd129; +mul.f64 fd162, fd158, fd147; +sub.f64 fd163, fd161, fd162; +mul.f64 fd164, fd157, fd147; +fma.rn.f64 fd165, fd158, fd129, fd164; +mul.f64 fd166, fd157, fd157; +mul.f64 fd167, fd158, fd158; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd158, fd157; +fma.rn.f64 fd170, fd158, fd157, fd169; +mul.f64 fd171, fd168, fd137; +mul.f64 fd172, 
fd170, fd155; +sub.f64 fd173, fd171, fd172; +mul.f64 fd174, fd168, fd155; +fma.rn.f64 fd175, fd170, fd137, fd174; +ld.global.v2.f64 {fd176, fd177}, [rd11+400]; +mul.f64 fd180, fd176, fd138; +mul.f64 fd181, fd177, fd156; +sub.f64 fd182, fd180, fd181; +mul.f64 fd183, fd176, fd156; +fma.rn.f64 fd184, fd177, fd138, fd183; +mul.f64 fd185, fd157, fd176; +mul.f64 fd186, fd158, fd177; +sub.f64 fd187, fd185, fd186; +mul.f64 fd188, fd157, fd177; +fma.rn.f64 fd189, fd158, fd176, fd188; +mul.f64 fd190, fd187, fd130; +mul.f64 fd191, fd189, fd148; +sub.f64 fd192, fd190, fd191; +mul.f64 fd193, fd187, fd148; +fma.rn.f64 fd194, fd189, fd130, fd193; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd116; +st.shared.f64 [r17+40], fd163; +st.shared.f64 [r17+80], fd173; +st.shared.f64 [r17+120], fd182; +st.shared.f64 [r17+160], fd192; +barrier.sync 0; +ld.shared.f64 fd195, [r11]; +ld.shared.f64 fd196, [r11+1000]; +ld.shared.f64 fd197, [r11+2000]; +ld.shared.f64 fd198, [r11+3000]; +ld.shared.f64 fd199, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r17], fd120; +st.shared.f64 [r17+40], fd165; +st.shared.f64 [r17+80], fd175; +st.shared.f64 [r17+120], fd184; +st.shared.f64 [r17+160], fd194; +barrier.sync 0; +ld.shared.f64 fd200, [r11]; +ld.shared.f64 fd201, [r11+1000]; +ld.shared.f64 fd202, [r11+2000]; +ld.shared.f64 fd203, [r11+3000]; +ld.shared.f64 fd204, [r11+4000]; +add.f64 fd205, fd196, fd199; +add.f64 fd206, fd195, fd205; +add.f64 fd207, fd197, fd198; +add.f64 fd208, fd207, fd206; +add.f64 fd209, fd201, fd204; +add.f64 fd210, fd200, fd209; +add.f64 fd211, fd202, fd203; +add.f64 fd212, fd211, fd210; +fma.rn.f64 fd213, fd205, 0d3FD3C6EF372FE950, fd195; +mul.f64 fd214, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd215, fd213, fd214; +sub.f64 fd216, fd201, fd204; +mul.f64 fd217, fd216, 0d3FEE6F0E134454FF; +sub.f64 fd218, fd202, fd203; +mul.f64 fd219, fd218, 0dBFE2CF2304755A5E; +sub.f64 fd220, fd219, fd217; +sub.f64 fd221, fd215, 
fd220; +add.f64 fd222, fd220, fd215; +mul.f64 fd223, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd224, fd195, fd223; +fma.rn.f64 fd225, fd207, 0d3FD3C6EF372FE950, fd224; +mul.f64 fd226, fd216, 0d3FE2CF2304755A5E; +mul.f64 fd227, fd218, 0d3FEE6F0E134454FF; +sub.f64 fd228, fd227, fd226; +sub.f64 fd229, fd225, fd228; +add.f64 fd230, fd228, fd225; +fma.rn.f64 fd231, fd209, 0d3FD3C6EF372FE950, fd200; +mul.f64 fd232, fd211, 0d3FE9E3779B97F4A8; +sub.f64 fd233, fd231, fd232; +sub.f64 fd234, fd196, fd199; +mul.f64 fd235, fd234, 0d3FEE6F0E134454FF; +sub.f64 fd236, fd197, fd198; +mul.f64 fd237, fd236, 0dBFE2CF2304755A5E; +sub.f64 fd238, fd237, fd235; +add.f64 fd239, fd238, fd233; +sub.f64 fd240, fd233, fd238; +mul.f64 fd241, fd209, 0d3FE9E3779B97F4A8; +sub.f64 fd242, fd200, fd241; +fma.rn.f64 fd243, fd211, 0d3FD3C6EF372FE950, fd242; +mul.f64 fd244, fd234, 0d3FE2CF2304755A5E; +mul.f64 fd245, fd236, 0d3FEE6F0E134454FF; +sub.f64 fd246, fd245, fd244; +add.f64 fd247, fd246, fd243; +sub.f64 fd248, fd243, fd246; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd249, fd250}, [rd16]; +mul.f64 fd253, fd249, fd221; +mul.f64 fd254, fd250, fd239; +sub.f64 fd255, fd253, fd254; +mul.f64 fd256, fd249, fd239; +fma.rn.f64 fd257, fd250, fd221, fd256; +mul.f64 fd258, fd249, fd249; +mul.f64 fd259, fd250, fd250; +sub.f64 fd260, fd258, fd259; +mul.f64 fd261, fd250, fd249; +fma.rn.f64 fd262, fd250, fd249, fd261; +mul.f64 fd263, fd260, fd229; +mul.f64 fd264, fd262, fd247; +sub.f64 fd265, fd263, fd264; +mul.f64 fd266, fd260, fd247; +fma.rn.f64 fd267, fd262, fd229, fd266; +ld.global.v2.f64 {fd268, fd269}, [rd16+80]; +mul.f64 fd272, fd268, fd230; +mul.f64 fd273, fd269, fd248; +sub.f64 fd274, fd272, fd273; +mul.f64 fd275, fd268, fd248; +fma.rn.f64 fd276, fd269, fd230, fd275; +mul.f64 fd277, fd249, fd268; +mul.f64 fd278, fd250, fd269; 
+sub.f64 fd279, fd277, fd278; +mul.f64 fd280, fd249, fd269; +fma.rn.f64 fd281, fd250, fd268, fd280; +mul.f64 fd282, fd279, fd222; +mul.f64 fd283, fd281, fd240; +sub.f64 fd284, fd282, fd283; +mul.f64 fd285, fd279, fd240; +fma.rn.f64 fd286, fd281, fd222, fd285; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +st.shared.f64 [r23], fd208; +st.shared.f64 [r23+200], fd255; +st.shared.f64 [r23+400], fd265; +st.shared.f64 [r23+600], fd274; +st.shared.f64 [r23+800], fd284; +barrier.sync 0; +ld.shared.f64 fd287, [r11]; +ld.shared.f64 fd288, [r11+1000]; +ld.shared.f64 fd289, [r11+2000]; +ld.shared.f64 fd290, [r11+3000]; +ld.shared.f64 fd291, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r23], fd212; +st.shared.f64 [r23+200], fd257; +st.shared.f64 [r23+400], fd267; +st.shared.f64 [r23+600], fd276; +st.shared.f64 [r23+800], fd286; +barrier.sync 0; +ld.shared.f64 fd292, [r11]; +ld.shared.f64 fd293, [r11+1000]; +ld.shared.f64 fd294, [r11+2000]; +ld.shared.f64 fd295, [r11+3000]; +ld.shared.f64 fd296, [r11+4000]; +add.f64 fd297, fd288, fd291; +add.f64 fd298, fd287, fd297; +add.f64 fd299, fd289, fd290; +add.f64 fd300, fd293, fd296; +add.f64 fd301, fd292, fd300; +add.f64 fd302, fd294, fd295; +fma.rn.f64 fd303, fd297, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd304, fd299, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd303, fd304; +sub.f64 fd306, fd293, fd296; +mul.f64 fd307, fd306, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd294, fd295; +mul.f64 fd309, fd308, 0dBFE2CF2304755A5E; +sub.f64 fd310, fd309, fd307; +mul.f64 fd311, fd297, 0d3FE9E3779B97F4A8; +sub.f64 fd312, fd287, fd311; +fma.rn.f64 fd313, fd299, 0d3FD3C6EF372FE950, fd312; +mul.f64 fd314, fd306, 0d3FE2CF2304755A5E; +mul.f64 fd315, fd308, 0d3FEE6F0E134454FF; +sub.f64 fd316, fd315, fd314; +fma.rn.f64 fd317, fd300, 0d3FD3C6EF372FE950, fd292; +mul.f64 fd318, fd302, 0d3FE9E3779B97F4A8; +sub.f64 fd319, fd317, fd318; +sub.f64 fd320, fd288, fd291; +mul.f64 fd321, fd320, 0d3FEE6F0E134454FF; +sub.f64 fd322, 
fd289, fd290; +mul.f64 fd323, fd322, 0dBFE2CF2304755A5E; +sub.f64 fd324, fd323, fd321; +mul.f64 fd325, fd300, 0d3FE9E3779B97F4A8; +sub.f64 fd326, fd292, fd325; +fma.rn.f64 fd327, fd302, 0d3FD3C6EF372FE950, fd326; +mul.f64 fd328, fd320, 0d3FE2CF2304755A5E; +mul.f64 fd329, fd322, 0d3FEE6F0E134454FF; +sub.f64 fd330, fd329, fd328; +add.f64 %0, fd299, fd298; +add.f64 %1, fd302, fd301; +add.f64 %3, fd324, fd319; +sub.f64 %2, fd305, fd310; +sub.f64 %4, fd313, fd316; +add.f64 %5, fd330, fd327; +add.f64 %6, fd316, fd313; +sub.f64 %7, fd327, fd330; +sub.f64 %9, fd319, fd324; +add.f64 %8, fd310, fd305; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<542, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<371>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 10000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %16, %24; +add.f64 fd22, %14, fd21; +add.f64 fd23, %19, %22; +add.f64 fd24, %18, %25; +add.f64 fd25, %15, fd24; +add.f64 fd26, %21, %23; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %14; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %18, %25; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %21, %23; +mul.f64 fd33, fd32, 0dBFE2CF2304755A5E; +sub.f64 fd34, fd33, fd31; +sub.f64 fd35, fd29, fd34; +add.f64 fd36, fd34, fd29; +mul.f64 fd37, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd38, %14, fd37; +fma.rn.f64 fd39, fd23, 0d3FD3C6EF372FE950, fd38; +mul.f64 fd40, fd30, 0d3FE2CF2304755A5E; 
+mul.f64 fd41, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd42, fd41, fd40; +sub.f64 fd43, fd39, fd42; +add.f64 fd44, fd42, fd39; +fma.rn.f64 fd45, fd24, 0d3FD3C6EF372FE950, %15; +mul.f64 fd46, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd47, fd45, fd46; +sub.f64 fd48, %16, %24; +mul.f64 fd49, fd48, 0d3FEE6F0E134454FF; +sub.f64 fd50, %19, %22; +mul.f64 fd51, fd50, 0dBFE2CF2304755A5E; +sub.f64 fd52, fd51, fd49; +add.f64 fd53, fd52, fd47; +sub.f64 fd54, fd47, fd52; +mul.f64 fd55, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %15, fd55; +fma.rn.f64 fd57, fd26, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd48, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd50, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd59, fd58; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 10000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd63, fd35; +mul.f64 fd68, fd64, fd53; +mul.f64 fd69, fd63, fd53; +mul.f64 fd70, fd63, fd63; +mul.f64 fd71, fd64, fd64; +sub.f64 fd72, fd70, fd71; +mul.f64 fd73, fd64, fd63; +fma.rn.f64 fd74, fd64, fd63, fd73; +mul.f64 fd75, fd72, fd43; +mul.f64 fd76, fd74, fd61; +mul.f64 fd77, fd72, fd61; +ld.global.v2.f64 {fd78, fd79}, [rd6+2000]; +mul.f64 fd82, fd78, fd44; +mul.f64 fd83, fd79, fd62; +mul.f64 fd84, fd78, fd62; +mul.f64 fd85, fd63, fd78; +mul.f64 fd86, fd64, fd79; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd63, fd79; +fma.rn.f64 fd89, fd64, fd78, fd88; +mul.f64 fd90, fd87, fd36; +mul.f64 fd91, fd89, fd54; +mul.f64 fd92, fd87, fd54; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd93, fd26, fd25; +add.f64 fd94, fd23, fd22; +st.shared.v2.f64 [r9], {fd94, fd93}; +fma.rn.f64 fd95, fd64, fd35, fd69; +sub.f64 fd96, fd67, fd68; +st.shared.v2.f64 [r9+16], {fd96, fd95}; +fma.rn.f64 fd97, fd74, fd43, fd77; +sub.f64 fd98, fd75, fd76; +st.shared.v2.f64 [r9+32], {fd98, fd97}; 
+fma.rn.f64 fd99, fd79, fd44, fd84; +sub.f64 fd100, fd82, fd83; +st.shared.v2.f64 [r9+48], {fd100, fd99}; +fma.rn.f64 fd101, fd89, fd36, fd92; +sub.f64 fd102, fd90, fd91; +st.shared.v2.f64 [r9+64], {fd102, fd101}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd103, fd104}, [r11]; +ld.shared.v2.f64 {fd107, fd108}, [r11+2000]; +ld.shared.v2.f64 {fd111, fd112}, [r11+4000]; +ld.shared.v2.f64 {fd115, fd116}, [r11+6000]; +ld.shared.v2.f64 {fd119, fd120}, [r11+8000]; +add.f64 fd123, fd107, fd119; +add.f64 fd124, fd103, fd123; +add.f64 fd125, fd111, fd115; +add.f64 fd126, fd108, fd120; +add.f64 fd127, fd104, fd126; +add.f64 fd128, fd112, fd116; +fma.rn.f64 fd129, fd123, 0d3FD3C6EF372FE950, fd103; +mul.f64 fd130, fd125, 0d3FE9E3779B97F4A8; +sub.f64 fd131, fd129, fd130; +sub.f64 fd132, fd108, fd120; +mul.f64 fd133, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd134, fd112, fd116; +mul.f64 fd135, fd134, 0dBFE2CF2304755A5E; +sub.f64 fd136, fd135, fd133; +sub.f64 fd137, fd131, fd136; +add.f64 fd138, fd136, fd131; +mul.f64 fd139, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd140, fd103, fd139; +fma.rn.f64 fd141, fd125, 0d3FD3C6EF372FE950, fd140; +mul.f64 fd142, fd132, 0d3FE2CF2304755A5E; +mul.f64 fd143, fd134, 0d3FEE6F0E134454FF; +sub.f64 fd144, fd143, fd142; +sub.f64 fd145, fd141, fd144; +add.f64 fd146, fd144, fd141; +fma.rn.f64 fd147, fd126, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd148, fd128, 0d3FE9E3779B97F4A8; +sub.f64 fd149, fd147, fd148; +sub.f64 fd150, fd107, fd119; +mul.f64 fd151, fd150, 0d3FEE6F0E134454FF; +sub.f64 fd152, fd111, fd115; +mul.f64 fd153, fd152, 0dBFE2CF2304755A5E; +sub.f64 fd154, fd153, fd151; +add.f64 fd155, fd154, fd149; +sub.f64 fd156, fd149, fd154; +mul.f64 fd157, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd158, fd104, fd157; +fma.rn.f64 fd159, fd128, 0d3FD3C6EF372FE950, fd158; +mul.f64 fd160, fd150, 0d3FE2CF2304755A5E; +mul.f64 fd161, fd152, 0d3FEE6F0E134454FF; +sub.f64 fd162, fd161, fd160; +add.f64 fd163, fd162, fd159; +sub.f64 fd164, 
fd159, fd162; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd165, fd166}, [rd11]; +mul.f64 fd169, fd165, fd137; +mul.f64 fd170, fd166, fd155; +mul.f64 fd171, fd165, fd155; +mul.f64 fd172, fd165, fd165; +mul.f64 fd173, fd166, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd166, fd165; +fma.rn.f64 fd176, fd166, fd165, fd175; +mul.f64 fd177, fd174, fd145; +mul.f64 fd178, fd176, fd163; +mul.f64 fd179, fd174, fd163; +ld.global.v2.f64 {fd180, fd181}, [rd11+400]; +mul.f64 fd184, fd180, fd146; +mul.f64 fd185, fd181, fd164; +mul.f64 fd186, fd180, fd164; +mul.f64 fd187, fd165, fd180; +mul.f64 fd188, fd166, fd181; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd165, fd181; +fma.rn.f64 fd191, fd166, fd180, fd190; +mul.f64 fd192, fd189, fd138; +mul.f64 fd193, fd191, fd156; +mul.f64 fd194, fd189, fd156; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd195, fd128, fd127; +add.f64 fd196, fd125, fd124; +st.shared.v2.f64 [r17], {fd196, fd195}; +fma.rn.f64 fd197, fd166, fd137, fd171; +sub.f64 fd198, fd169, fd170; +st.shared.v2.f64 [r17+80], {fd198, fd197}; +fma.rn.f64 fd199, fd176, fd145, fd179; +sub.f64 fd200, fd177, fd178; +st.shared.v2.f64 [r17+160], {fd200, fd199}; +fma.rn.f64 fd201, fd181, fd146, fd186; +sub.f64 fd202, fd184, fd185; +st.shared.v2.f64 [r17+240], {fd202, fd201}; +fma.rn.f64 fd203, fd191, fd138, fd194; +sub.f64 fd204, fd192, fd193; +st.shared.v2.f64 [r17+320], {fd204, fd203}; +barrier.sync 0; +ld.shared.v2.f64 {fd205, fd206}, [r11]; +ld.shared.v2.f64 {fd209, fd210}, [r11+2000]; +ld.shared.v2.f64 {fd213, fd214}, [r11+4000]; +ld.shared.v2.f64 {fd217, fd218}, [r11+6000]; +ld.shared.v2.f64 {fd221, fd222}, [r11+8000]; +add.f64 fd225, fd209, fd221; +add.f64 fd226, fd205, fd225; +add.f64 fd227, fd213, fd217; +add.f64 fd228, fd210, 
fd222; +add.f64 fd229, fd206, fd228; +add.f64 fd230, fd214, fd218; +fma.rn.f64 fd231, fd225, 0d3FD3C6EF372FE950, fd205; +mul.f64 fd232, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd233, fd231, fd232; +sub.f64 fd234, fd210, fd222; +mul.f64 fd235, fd234, 0d3FEE6F0E134454FF; +sub.f64 fd236, fd214, fd218; +mul.f64 fd237, fd236, 0dBFE2CF2304755A5E; +sub.f64 fd238, fd237, fd235; +sub.f64 fd239, fd233, fd238; +add.f64 fd240, fd238, fd233; +mul.f64 fd241, fd225, 0d3FE9E3779B97F4A8; +sub.f64 fd242, fd205, fd241; +fma.rn.f64 fd243, fd227, 0d3FD3C6EF372FE950, fd242; +mul.f64 fd244, fd234, 0d3FE2CF2304755A5E; +mul.f64 fd245, fd236, 0d3FEE6F0E134454FF; +sub.f64 fd246, fd245, fd244; +sub.f64 fd247, fd243, fd246; +add.f64 fd248, fd246, fd243; +fma.rn.f64 fd249, fd228, 0d3FD3C6EF372FE950, fd206; +mul.f64 fd250, fd230, 0d3FE9E3779B97F4A8; +sub.f64 fd251, fd249, fd250; +sub.f64 fd252, fd209, fd221; +mul.f64 fd253, fd252, 0d3FEE6F0E134454FF; +sub.f64 fd254, fd213, fd217; +mul.f64 fd255, fd254, 0dBFE2CF2304755A5E; +sub.f64 fd256, fd255, fd253; +add.f64 fd257, fd256, fd251; +sub.f64 fd258, fd251, fd256; +mul.f64 fd259, fd228, 0d3FE9E3779B97F4A8; +sub.f64 fd260, fd206, fd259; +fma.rn.f64 fd261, fd230, 0d3FD3C6EF372FE950, fd260; +mul.f64 fd262, fd252, 0d3FE2CF2304755A5E; +mul.f64 fd263, fd254, 0d3FEE6F0E134454FF; +sub.f64 fd264, fd263, fd262; +add.f64 fd265, fd264, fd261; +sub.f64 fd266, fd261, fd264; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd267, fd268}, [rd16]; +mul.f64 fd271, fd267, fd239; +mul.f64 fd272, fd268, fd257; +mul.f64 fd273, fd267, fd257; +mul.f64 fd274, fd267, fd267; +mul.f64 fd275, fd268, fd268; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd268, fd267; +fma.rn.f64 fd278, fd268, fd267, fd277; +mul.f64 fd279, fd276, fd247; +mul.f64 fd280, fd278, fd265; +mul.f64 fd281, fd276, fd265; 
+ld.global.v2.f64 {fd282, fd283}, [rd16+80]; +mul.f64 fd286, fd282, fd248; +mul.f64 fd287, fd283, fd266; +mul.f64 fd288, fd282, fd266; +mul.f64 fd289, fd267, fd282; +mul.f64 fd290, fd268, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd267, fd283; +fma.rn.f64 fd293, fd268, fd282, fd292; +mul.f64 fd294, fd291, fd240; +mul.f64 fd295, fd293, fd258; +mul.f64 fd296, fd291, fd258; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2000, r22; +add.f64 fd297, fd230, fd229; +add.f64 fd298, fd227, fd226; +st.shared.v2.f64 [r23], {fd298, fd297}; +fma.rn.f64 fd299, fd268, fd239, fd273; +sub.f64 fd300, fd271, fd272; +st.shared.v2.f64 [r23+400], {fd300, fd299}; +fma.rn.f64 fd301, fd278, fd247, fd281; +sub.f64 fd302, fd279, fd280; +st.shared.v2.f64 [r23+800], {fd302, fd301}; +fma.rn.f64 fd303, fd283, fd248, fd288; +sub.f64 fd304, fd286, fd287; +st.shared.v2.f64 [r23+1200], {fd304, fd303}; +fma.rn.f64 fd305, fd293, fd240, fd296; +sub.f64 fd306, fd294, fd295; +st.shared.v2.f64 [r23+1600], {fd306, fd305}; +barrier.sync 0; +ld.shared.v2.f64 {fd307, fd308}, [r11]; +ld.shared.v2.f64 {fd311, fd312}, [r11+2000]; +ld.shared.v2.f64 {fd315, fd316}, [r11+4000]; +ld.shared.v2.f64 {fd319, fd320}, [r11+6000]; +ld.shared.v2.f64 {fd323, fd324}, [r11+8000]; +add.f64 fd327, fd311, fd323; +add.f64 fd328, fd307, fd327; +add.f64 fd329, fd315, fd319; +add.f64 fd330, fd312, fd324; +add.f64 fd331, fd308, fd330; +add.f64 fd332, fd316, fd320; +fma.rn.f64 fd333, fd327, 0d3FD3C6EF372FE950, fd307; +mul.f64 fd334, fd329, 0d3FE9E3779B97F4A8; +sub.f64 fd335, fd333, fd334; +sub.f64 fd336, fd312, fd324; +mul.f64 fd337, fd336, 0d3FEE6F0E134454FF; +sub.f64 fd338, fd316, fd320; +mul.f64 fd339, fd338, 0dBFE2CF2304755A5E; +sub.f64 fd340, fd339, fd337; +mul.f64 fd341, fd327, 0d3FE9E3779B97F4A8; +sub.f64 fd342, fd307, fd341; +fma.rn.f64 fd343, fd329, 0d3FD3C6EF372FE950, fd342; +mul.f64 fd344, fd336, 0d3FE2CF2304755A5E; +mul.f64 fd345, fd338, 0d3FEE6F0E134454FF; +sub.f64 fd346, 
fd345, fd344; +fma.rn.f64 fd347, fd330, 0d3FD3C6EF372FE950, fd308; +mul.f64 fd348, fd332, 0d3FE9E3779B97F4A8; +sub.f64 fd349, fd347, fd348; +sub.f64 fd350, fd311, fd323; +mul.f64 fd351, fd350, 0d3FEE6F0E134454FF; +sub.f64 fd352, fd315, fd319; +mul.f64 fd353, fd352, 0dBFE2CF2304755A5E; +sub.f64 fd354, fd353, fd351; +mul.f64 fd355, fd330, 0d3FE9E3779B97F4A8; +sub.f64 fd356, fd308, fd355; +fma.rn.f64 fd357, fd332, 0d3FD3C6EF372FE950, fd356; +mul.f64 fd358, fd350, 0d3FE2CF2304755A5E; +mul.f64 fd359, fd352, 0d3FEE6F0E134454FF; +sub.f64 fd360, fd359, fd358; +add.f64 %1, fd332, fd331; +add.f64 %0, fd329, fd328; +add.f64 %3, fd354, fd349; +sub.f64 %2, fd335, fd340; +add.f64 %5, fd360, fd357; +sub.f64 %4, fd343, fd346; +sub.f64 %7, fd357, fd360; +add.f64 %6, fd346, fd343; +sub.f64 %9, fd349, fd354; +add.f64 %8, fd340, fd335; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..06cc95998e618 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_625_fp64_inv.hpp.inc @@ -0,0 +1,3418 @@ +#ifndef CUFFTDX_FFT_625_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_625_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<711, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + 
+asm volatile (R"({ +.reg .b32 r<11>; +.reg .f64 fd<1389>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %50; +mad.lo.s32 r3, r1, 5000, r2; +add.f64 fd101, %65, %105; +add.f64 fd102, %52, fd101; +add.f64 fd103, %78, %92; +add.f64 fd104, fd103, fd102; +add.f64 fd105, %67, %107; +add.f64 fd106, %53, fd105; +add.f64 fd107, %80, %93; +add.f64 fd108, fd107, fd106; +fma.rn.f64 fd109, fd101, 0d3FD3C6EF372FE950, %52; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +sub.f64 fd111, fd109, fd110; +sub.f64 fd112, %67, %107; +mul.f64 fd113, fd112, 0d3FEE6F0E134454FF; +sub.f64 fd114, %80, %93; +fma.rn.f64 fd115, fd114, 0d3FE2CF2304755A5E, fd113; +sub.f64 fd116, fd111, fd115; +add.f64 fd117, fd115, fd111; +mul.f64 fd118, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd119, %52, fd118; +fma.rn.f64 fd120, fd103, 0d3FD3C6EF372FE950, fd119; +mul.f64 fd121, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd122, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd120, fd123; +add.f64 fd125, fd123, fd120; +fma.rn.f64 fd126, fd105, 0d3FD3C6EF372FE950, %53; +mul.f64 fd127, fd107, 0d3FE9E3779B97F4A8; +sub.f64 fd128, fd126, fd127; +sub.f64 fd129, %65, %105; +mul.f64 fd130, fd129, 0d3FEE6F0E134454FF; +sub.f64 fd131, %78, %92; +fma.rn.f64 fd132, fd131, 0d3FE2CF2304755A5E, fd130; +add.f64 fd133, fd132, fd128; +sub.f64 fd134, fd128, fd132; +mul.f64 fd135, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd136, %53, fd135; +fma.rn.f64 fd137, fd107, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd138, fd129, 0d3FE2CF2304755A5E; +mul.f64 fd139, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd140, fd138, fd139; +add.f64 fd141, fd140, fd137; +sub.f64 fd142, fd137, fd140; +add.f64 fd143, %68, %108; +add.f64 fd144, %54, fd143; +add.f64 fd145, %81, %94; +add.f64 fd146, fd145, fd144; +add.f64 fd147, %69, %109; +add.f64 fd148, %56, fd147; +add.f64 fd149, %83, %96; +add.f64 fd150, fd149, fd148; +fma.rn.f64 fd151, fd143, 0d3FD3C6EF372FE950, %54; +mul.f64 fd152, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd151, fd152; +sub.f64 fd154, 
%69, %109; +mul.f64 fd155, fd154, 0d3FEE6F0E134454FF; +sub.f64 fd156, %83, %96; +fma.rn.f64 fd157, fd156, 0d3FE2CF2304755A5E, fd155; +sub.f64 fd158, fd153, fd157; +add.f64 fd159, fd157, fd153; +mul.f64 fd160, fd143, 0d3FE9E3779B97F4A8; +sub.f64 fd161, %54, fd160; +fma.rn.f64 fd162, fd145, 0d3FD3C6EF372FE950, fd161; +mul.f64 fd163, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd164, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, fd162, fd165; +add.f64 fd167, fd165, fd162; +fma.rn.f64 fd168, fd147, 0d3FD3C6EF372FE950, %56; +mul.f64 fd169, fd149, 0d3FE9E3779B97F4A8; +sub.f64 fd170, fd168, fd169; +sub.f64 fd171, %68, %108; +mul.f64 fd172, fd171, 0d3FEE6F0E134454FF; +sub.f64 fd173, %81, %94; +fma.rn.f64 fd174, fd173, 0d3FE2CF2304755A5E, fd172; +add.f64 fd175, fd174, fd170; +sub.f64 fd176, fd170, fd174; +mul.f64 fd177, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd178, %56, fd177; +fma.rn.f64 fd179, fd149, 0d3FD3C6EF372FE950, fd178; +mul.f64 fd180, fd171, 0d3FE2CF2304755A5E; +mul.f64 fd181, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd182, fd180, fd181; +add.f64 fd183, fd182, fd179; +sub.f64 fd184, fd179, fd182; +add.f64 fd185, %70, %110; +add.f64 fd186, %57, fd185; +add.f64 fd187, %84, %97; +add.f64 fd188, fd187, fd186; +add.f64 fd189, %72, %112; +add.f64 fd190, %59, fd189; +add.f64 fd191, %85, %99; +add.f64 fd192, fd191, fd190; +fma.rn.f64 fd193, fd185, 0d3FD3C6EF372FE950, %57; +mul.f64 fd194, fd187, 0d3FE9E3779B97F4A8; +sub.f64 fd195, fd193, fd194; +sub.f64 fd196, %72, %112; +mul.f64 fd197, fd196, 0d3FEE6F0E134454FF; +sub.f64 fd198, %85, %99; +fma.rn.f64 fd199, fd198, 0d3FE2CF2304755A5E, fd197; +sub.f64 fd200, fd195, fd199; +add.f64 fd201, fd199, fd195; +mul.f64 fd202, fd185, 0d3FE9E3779B97F4A8; +sub.f64 fd203, %57, fd202; +fma.rn.f64 fd204, fd187, 0d3FD3C6EF372FE950, fd203; +mul.f64 fd205, fd196, 0d3FE2CF2304755A5E; +mul.f64 fd206, fd198, 0d3FEE6F0E134454FF; +sub.f64 fd207, fd205, fd206; +sub.f64 fd208, fd204, fd207; +add.f64 fd209, fd207, fd204; +fma.rn.f64 fd210, 
fd189, 0d3FD3C6EF372FE950, %59; +mul.f64 fd211, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd212, fd210, fd211; +sub.f64 fd213, %70, %110; +mul.f64 fd214, fd213, 0d3FEE6F0E134454FF; +sub.f64 fd215, %84, %97; +fma.rn.f64 fd216, fd215, 0d3FE2CF2304755A5E, fd214; +add.f64 fd217, fd216, fd212; +sub.f64 fd218, fd212, fd216; +mul.f64 fd219, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd220, %59, fd219; +fma.rn.f64 fd221, fd191, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd213, 0d3FE2CF2304755A5E; +mul.f64 fd223, fd215, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd224, fd221; +sub.f64 fd226, fd221, fd224; +add.f64 fd227, %73, %113; +add.f64 fd228, %60, fd227; +add.f64 fd229, %86, %100; +add.f64 fd230, fd229, fd228; +add.f64 fd231, %75, %115; +add.f64 fd232, %61, fd231; +add.f64 fd233, %88, %101; +add.f64 fd234, fd233, fd232; +fma.rn.f64 fd235, fd227, 0d3FD3C6EF372FE950, %60; +mul.f64 fd236, fd229, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd235, fd236; +sub.f64 fd238, %75, %115; +mul.f64 fd239, fd238, 0d3FEE6F0E134454FF; +sub.f64 fd240, %88, %101; +fma.rn.f64 fd241, fd240, 0d3FE2CF2304755A5E, fd239; +sub.f64 fd242, fd237, fd241; +add.f64 fd243, fd241, fd237; +mul.f64 fd244, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd245, %60, fd244; +fma.rn.f64 fd246, fd229, 0d3FD3C6EF372FE950, fd245; +mul.f64 fd247, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd248, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd246, fd249; +add.f64 fd251, fd249, fd246; +fma.rn.f64 fd252, fd231, 0d3FD3C6EF372FE950, %61; +mul.f64 fd253, fd233, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd252, fd253; +sub.f64 fd255, %73, %113; +mul.f64 fd256, fd255, 0d3FEE6F0E134454FF; +sub.f64 fd257, %86, %100; +fma.rn.f64 fd258, fd257, 0d3FE2CF2304755A5E, fd256; +add.f64 fd259, fd258, fd254; +sub.f64 fd260, fd254, fd258; +mul.f64 fd261, fd231, 0d3FE9E3779B97F4A8; +sub.f64 fd262, %61, fd261; +fma.rn.f64 fd263, fd233, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd264, fd255, 0d3FE2CF2304755A5E; +mul.f64 fd265, fd257, 
0d3FEE6F0E134454FF; +sub.f64 fd266, fd264, fd265; +add.f64 fd267, fd266, fd263; +sub.f64 fd268, fd263, fd266; +add.f64 fd269, %76, %116; +add.f64 fd270, %62, fd269; +add.f64 fd271, %89, %102; +add.f64 fd272, fd271, fd270; +add.f64 fd273, %77, %117; +add.f64 fd274, %64, fd273; +add.f64 fd275, %91, %104; +add.f64 fd276, fd275, fd274; +fma.rn.f64 fd277, fd269, 0d3FD3C6EF372FE950, %62; +mul.f64 fd278, fd271, 0d3FE9E3779B97F4A8; +sub.f64 fd279, fd277, fd278; +sub.f64 fd280, %77, %117; +mul.f64 fd281, fd280, 0d3FEE6F0E134454FF; +sub.f64 fd282, %91, %104; +fma.rn.f64 fd283, fd282, 0d3FE2CF2304755A5E, fd281; +sub.f64 fd284, fd279, fd283; +add.f64 fd285, fd283, fd279; +mul.f64 fd286, fd269, 0d3FE9E3779B97F4A8; +sub.f64 fd287, %62, fd286; +fma.rn.f64 fd288, fd271, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd289, fd280, 0d3FE2CF2304755A5E; +mul.f64 fd290, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd291, fd289, fd290; +sub.f64 fd292, fd288, fd291; +add.f64 fd293, fd291, fd288; +fma.rn.f64 fd294, fd273, 0d3FD3C6EF372FE950, %64; +mul.f64 fd295, fd275, 0d3FE9E3779B97F4A8; +sub.f64 fd296, fd294, fd295; +sub.f64 fd297, %76, %116; +mul.f64 fd298, fd297, 0d3FEE6F0E134454FF; +sub.f64 fd299, %89, %102; +fma.rn.f64 fd300, fd299, 0d3FE2CF2304755A5E, fd298; +add.f64 fd301, fd300, fd296; +sub.f64 fd302, fd296, fd300; +mul.f64 fd303, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd304, %64, fd303; +fma.rn.f64 fd305, fd275, 0d3FD3C6EF372FE950, fd304; +mul.f64 fd306, fd297, 0d3FE2CF2304755A5E; +mul.f64 fd307, fd299, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd306, fd307; +add.f64 fd309, fd308, fd305; +sub.f64 fd310, fd305, fd308; +mov.u32 r4, %tid.x; +mul.f64 fd311, fd158, 0d3FEEFEA21D101EE0; +mul.f64 fd312, fd175, 0d3FCFD511FA1C0796; +sub.f64 fd313, fd311, fd312; +mul.f64 fd314, fd175, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd315, fd158, 0d3FCFD511FA1C0796, fd314; +mul.f64 fd316, fd200, 0d3FEC0AB44E81C059; +mul.f64 fd317, fd217, 0d3FDED50D5CBFA951; +sub.f64 fd318, fd316, fd317; +mul.f64 fd319, fd217, 0d3FEC0AB44E81C059; 
+fma.rn.f64 fd320, fd200, 0d3FDED50D5CBFA951, fd319; +mul.f64 fd321, fd242, 0d3FE753B603D2B816; +mul.f64 fd322, fd259, 0d3FE5E7CF55112014; +sub.f64 fd323, fd321, fd322; +mul.f64 fd324, fd259, 0d3FE753B603D2B816; +fma.rn.f64 fd325, fd242, 0d3FE5E7CF55112014, fd324; +mul.f64 fd326, fd284, 0d3FE1257E3C182B51; +mul.f64 fd327, fd301, 0d3FEB04BBFF642E86; +sub.f64 fd328, fd326, fd327; +mul.f64 fd329, fd301, 0d3FE1257E3C182B51; +fma.rn.f64 fd330, fd284, 0d3FEB04BBFF642E86, fd329; +mul.f64 fd331, fd166, 0d3FEC0AB44E81C059; +mul.f64 fd332, fd183, 0d3FDED50D5CBFA951; +sub.f64 fd333, fd331, fd332; +mul.f64 fd334, fd183, 0d3FEC0AB44E81C059; +fma.rn.f64 fd335, fd166, 0d3FDED50D5CBFA951, fd334; +mul.f64 fd336, fd208, 0d3FE1257E3C182B51; +mul.f64 fd337, fd225, 0d3FEB04BBFF642E86; +sub.f64 fd338, fd336, fd337; +mul.f64 fd339, fd225, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd208, 0d3FEB04BBFF642E86, fd339; +mul.f64 fd341, fd250, 0d3FB0130A1BE09379; +mul.f64 fd342, fd267, 0d3FEFEFD5BFE443FE; +sub.f64 fd343, fd341, fd342; +mul.f64 fd344, fd267, 0d3FB0130A1BE09379; +fma.rn.f64 fd345, fd250, 0d3FEFEFD5BFE443FE, fd344; +mul.f64 fd346, fd292, 0dBFDB3FF7C925819C; +mul.f64 fd347, fd309, 0d3FECF457DCDC158C; +sub.f64 fd348, fd346, fd347; +mul.f64 fd349, fd309, 0dBFDB3FF7C925819C; +fma.rn.f64 fd350, fd292, 0d3FECF457DCDC158C, fd349; +mul.f64 fd351, fd167, 0d3FE753B603D2B816; +mul.f64 fd352, fd184, 0d3FE5E7CF55112014; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd184, 0d3FE753B603D2B816; +fma.rn.f64 fd355, fd167, 0d3FE5E7CF55112014, fd354; +mul.f64 fd356, fd209, 0d3FB0130A1BE09379; +mul.f64 fd357, fd226, 0d3FEFEFD5BFE443FE; +sub.f64 fd358, fd356, fd357; +mul.f64 fd359, fd226, 0d3FB0130A1BE09379; +fma.rn.f64 fd360, fd209, 0d3FEFEFD5BFE443FE, fd359; +mul.f64 fd361, fd251, 0dBFE465C6FEB501BC; +mul.f64 fd362, fd268, 0d3FE8A80B635B6BEA; +sub.f64 fd363, fd361, fd362; +mul.f64 fd364, fd268, 0dBFE465C6FEB501BC; +fma.rn.f64 fd365, fd251, 0d3FE8A80B635B6BEA, fd364; +mul.f64 fd366, fd293, 
0dBFEFBF675480D903; +mul.f64 fd367, fd310, 0d3FC00AEB5DA15BE0; +sub.f64 fd368, fd366, fd367; +mul.f64 fd369, fd310, 0dBFEFBF675480D903; +fma.rn.f64 fd370, fd293, 0d3FC00AEB5DA15BE0, fd369; +mul.f64 fd371, fd159, 0d3FE1257E3C182B51; +mul.f64 fd372, fd176, 0d3FEB04BBFF642E86; +sub.f64 fd373, fd371, fd372; +mul.f64 fd374, fd176, 0d3FE1257E3C182B51; +fma.rn.f64 fd375, fd159, 0d3FEB04BBFF642E86, fd374; +mul.f64 fd376, fd201, 0dBFDB3FF7C925819C; +mul.f64 fd377, fd218, 0d3FECF457DCDC158C; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd218, 0dBFDB3FF7C925819C; +fma.rn.f64 fd380, fd201, 0d3FECF457DCDC158C, fd379; +mul.f64 fd381, fd243, 0dBFEFBF675480D903; +mul.f64 fd382, fd260, 0d3FC00AEB5DA15BE0; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd260, 0dBFEFBF675480D903; +fma.rn.f64 fd385, fd243, 0d3FC00AEB5DA15BE0, fd384; +mul.f64 fd386, fd285, 0dBFE465C6FEB501BC; +mul.f64 fd387, fd302, 0dBFE8A80B635B6BEA; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd302, 0dBFE465C6FEB501BC; +fma.rn.f64 fd390, fd285, 0dBFE8A80B635B6BEA, fd389; +add.f64 fd391, fd146, fd272; +add.f64 fd392, fd104, fd391; +add.f64 fd393, fd188, fd230; +add.f64 fd394, fd393, fd392; +add.f64 fd395, fd150, fd276; +add.f64 fd396, fd108, fd395; +add.f64 fd397, fd192, fd234; +add.f64 fd398, fd397, fd396; +fma.rn.f64 fd399, fd391, 0d3FD3C6EF372FE950, fd104; +mul.f64 fd400, fd393, 0d3FE9E3779B97F4A8; +sub.f64 fd401, fd399, fd400; +sub.f64 fd402, fd150, fd276; +mul.f64 fd403, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd404, fd192, fd234; +fma.rn.f64 fd405, fd404, 0d3FE2CF2304755A5E, fd403; +sub.f64 fd406, fd401, fd405; +add.f64 fd407, fd405, fd401; +mul.f64 fd408, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd409, fd104, fd408; +fma.rn.f64 fd410, fd393, 0d3FD3C6EF372FE950, fd409; +mul.f64 fd411, fd402, 0d3FE2CF2304755A5E; +mul.f64 fd412, fd404, 0d3FEE6F0E134454FF; +sub.f64 fd413, fd411, fd412; +sub.f64 fd414, fd410, fd413; +add.f64 fd415, fd413, fd410; +fma.rn.f64 fd416, fd395, 0d3FD3C6EF372FE950, fd108; +mul.f64 fd417, fd397, 
0d3FE9E3779B97F4A8; +sub.f64 fd418, fd416, fd417; +sub.f64 fd419, fd146, fd272; +mul.f64 fd420, fd419, 0d3FEE6F0E134454FF; +sub.f64 fd421, fd188, fd230; +fma.rn.f64 fd422, fd421, 0d3FE2CF2304755A5E, fd420; +add.f64 fd423, fd422, fd418; +sub.f64 fd424, fd418, fd422; +mul.f64 fd425, fd395, 0d3FE9E3779B97F4A8; +sub.f64 fd426, fd108, fd425; +fma.rn.f64 fd427, fd397, 0d3FD3C6EF372FE950, fd426; +mul.f64 fd428, fd419, 0d3FE2CF2304755A5E; +mul.f64 fd429, fd421, 0d3FEE6F0E134454FF; +sub.f64 fd430, fd428, fd429; +add.f64 fd431, fd430, fd427; +sub.f64 fd432, fd427, fd430; +add.f64 fd433, fd313, fd328; +add.f64 fd434, fd116, fd433; +add.f64 fd435, fd318, fd323; +add.f64 fd436, fd435, fd434; +add.f64 fd437, fd315, fd330; +add.f64 fd438, fd133, fd437; +add.f64 fd439, fd320, fd325; +add.f64 fd440, fd439, fd438; +fma.rn.f64 fd441, fd433, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd442, fd435, 0d3FE9E3779B97F4A8; +sub.f64 fd443, fd441, fd442; +sub.f64 fd444, fd315, fd330; +mul.f64 fd445, fd444, 0d3FEE6F0E134454FF; +sub.f64 fd446, fd320, fd325; +fma.rn.f64 fd447, fd446, 0d3FE2CF2304755A5E, fd445; +sub.f64 fd448, fd443, fd447; +add.f64 fd449, fd447, fd443; +mul.f64 fd450, fd433, 0d3FE9E3779B97F4A8; +sub.f64 fd451, fd116, fd450; +fma.rn.f64 fd452, fd435, 0d3FD3C6EF372FE950, fd451; +mul.f64 fd453, fd444, 0d3FE2CF2304755A5E; +mul.f64 fd454, fd446, 0d3FEE6F0E134454FF; +sub.f64 fd455, fd453, fd454; +sub.f64 fd456, fd452, fd455; +add.f64 fd457, fd455, fd452; +fma.rn.f64 fd458, fd437, 0d3FD3C6EF372FE950, fd133; +mul.f64 fd459, fd439, 0d3FE9E3779B97F4A8; +sub.f64 fd460, fd458, fd459; +sub.f64 fd461, fd313, fd328; +mul.f64 fd462, fd461, 0d3FEE6F0E134454FF; +sub.f64 fd463, fd318, fd323; +fma.rn.f64 fd464, fd463, 0d3FE2CF2304755A5E, fd462; +add.f64 fd465, fd464, fd460; +sub.f64 fd466, fd460, fd464; +mul.f64 fd467, fd437, 0d3FE9E3779B97F4A8; +sub.f64 fd468, fd133, fd467; +fma.rn.f64 fd469, fd439, 0d3FD3C6EF372FE950, fd468; +mul.f64 fd470, fd461, 0d3FE2CF2304755A5E; +mul.f64 fd471, fd463, 
0d3FEE6F0E134454FF; +sub.f64 fd472, fd470, fd471; +add.f64 fd473, fd472, fd469; +sub.f64 fd474, fd469, fd472; +add.f64 fd475, fd333, fd348; +add.f64 fd476, fd124, fd475; +add.f64 fd477, fd338, fd343; +add.f64 fd478, fd477, fd476; +add.f64 fd479, fd335, fd350; +add.f64 fd480, fd141, fd479; +add.f64 fd481, fd340, fd345; +add.f64 fd482, fd481, fd480; +fma.rn.f64 fd483, fd475, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd484, fd477, 0d3FE9E3779B97F4A8; +sub.f64 fd485, fd483, fd484; +sub.f64 fd486, fd335, fd350; +mul.f64 fd487, fd486, 0d3FEE6F0E134454FF; +sub.f64 fd488, fd340, fd345; +fma.rn.f64 fd489, fd488, 0d3FE2CF2304755A5E, fd487; +sub.f64 fd490, fd485, fd489; +add.f64 fd491, fd489, fd485; +mul.f64 fd492, fd475, 0d3FE9E3779B97F4A8; +sub.f64 fd493, fd124, fd492; +fma.rn.f64 fd494, fd477, 0d3FD3C6EF372FE950, fd493; +mul.f64 fd495, fd486, 0d3FE2CF2304755A5E; +mul.f64 fd496, fd488, 0d3FEE6F0E134454FF; +sub.f64 fd497, fd495, fd496; +sub.f64 fd498, fd494, fd497; +add.f64 fd499, fd497, fd494; +fma.rn.f64 fd500, fd479, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd501, fd481, 0d3FE9E3779B97F4A8; +sub.f64 fd502, fd500, fd501; +sub.f64 fd503, fd333, fd348; +mul.f64 fd504, fd503, 0d3FEE6F0E134454FF; +sub.f64 fd505, fd338, fd343; +fma.rn.f64 fd506, fd505, 0d3FE2CF2304755A5E, fd504; +add.f64 fd507, fd506, fd502; +sub.f64 fd508, fd502, fd506; +mul.f64 fd509, fd479, 0d3FE9E3779B97F4A8; +sub.f64 fd510, fd141, fd509; +fma.rn.f64 fd511, fd481, 0d3FD3C6EF372FE950, fd510; +mul.f64 fd512, fd503, 0d3FE2CF2304755A5E; +mul.f64 fd513, fd505, 0d3FEE6F0E134454FF; +sub.f64 fd514, fd512, fd513; +add.f64 fd515, fd514, fd511; +sub.f64 fd516, fd511, fd514; +add.f64 fd517, fd353, fd368; +add.f64 fd518, fd125, fd517; +add.f64 fd519, fd358, fd363; +add.f64 fd520, fd519, fd518; +add.f64 fd521, fd355, fd370; +add.f64 fd522, fd142, fd521; +add.f64 fd523, fd360, fd365; +add.f64 fd524, fd523, fd522; +fma.rn.f64 fd525, fd517, 0d3FD3C6EF372FE950, fd125; +mul.f64 fd526, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd527, fd525, 
fd526; +sub.f64 fd528, fd355, fd370; +mul.f64 fd529, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd530, fd360, fd365; +fma.rn.f64 fd531, fd530, 0d3FE2CF2304755A5E, fd529; +sub.f64 fd532, fd527, fd531; +add.f64 fd533, fd531, fd527; +mul.f64 fd534, fd517, 0d3FE9E3779B97F4A8; +sub.f64 fd535, fd125, fd534; +fma.rn.f64 fd536, fd519, 0d3FD3C6EF372FE950, fd535; +mul.f64 fd537, fd528, 0d3FE2CF2304755A5E; +mul.f64 fd538, fd530, 0d3FEE6F0E134454FF; +sub.f64 fd539, fd537, fd538; +sub.f64 fd540, fd536, fd539; +add.f64 fd541, fd539, fd536; +fma.rn.f64 fd542, fd521, 0d3FD3C6EF372FE950, fd142; +mul.f64 fd543, fd523, 0d3FE9E3779B97F4A8; +sub.f64 fd544, fd542, fd543; +sub.f64 fd545, fd353, fd368; +mul.f64 fd546, fd545, 0d3FEE6F0E134454FF; +sub.f64 fd547, fd358, fd363; +fma.rn.f64 fd548, fd547, 0d3FE2CF2304755A5E, fd546; +add.f64 fd549, fd548, fd544; +sub.f64 fd550, fd544, fd548; +mul.f64 fd551, fd521, 0d3FE9E3779B97F4A8; +sub.f64 fd552, fd142, fd551; +fma.rn.f64 fd553, fd523, 0d3FD3C6EF372FE950, fd552; +mul.f64 fd554, fd545, 0d3FE2CF2304755A5E; +mul.f64 fd555, fd547, 0d3FEE6F0E134454FF; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd556, fd553; +sub.f64 fd558, fd553, fd556; +add.f64 fd559, fd373, fd388; +add.f64 fd560, fd117, fd559; +add.f64 fd561, fd378, fd383; +add.f64 fd562, fd561, fd560; +add.f64 fd563, fd375, fd390; +add.f64 fd564, fd134, fd563; +add.f64 fd565, fd380, fd385; +add.f64 fd566, fd565, fd564; +fma.rn.f64 fd567, fd559, 0d3FD3C6EF372FE950, fd117; +mul.f64 fd568, fd561, 0d3FE9E3779B97F4A8; +sub.f64 fd569, fd567, fd568; +sub.f64 fd570, fd375, fd390; +mul.f64 fd571, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd572, fd380, fd385; +fma.rn.f64 fd573, fd572, 0d3FE2CF2304755A5E, fd571; +sub.f64 fd574, fd569, fd573; +add.f64 fd575, fd573, fd569; +mul.f64 fd576, fd559, 0d3FE9E3779B97F4A8; +sub.f64 fd577, fd117, fd576; +fma.rn.f64 fd578, fd561, 0d3FD3C6EF372FE950, fd577; +mul.f64 fd579, fd570, 0d3FE2CF2304755A5E; +mul.f64 fd580, fd572, 0d3FEE6F0E134454FF; +sub.f64 fd581, fd579, fd580; 
+sub.f64 fd582, fd578, fd581; +add.f64 fd583, fd581, fd578; +fma.rn.f64 fd584, fd563, 0d3FD3C6EF372FE950, fd134; +mul.f64 fd585, fd565, 0d3FE9E3779B97F4A8; +sub.f64 fd586, fd584, fd585; +sub.f64 fd587, fd373, fd388; +mul.f64 fd588, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd589, fd378, fd383; +fma.rn.f64 fd590, fd589, 0d3FE2CF2304755A5E, fd588; +add.f64 fd591, fd590, fd586; +sub.f64 fd592, fd586, fd590; +mul.f64 fd593, fd563, 0d3FE9E3779B97F4A8; +sub.f64 fd594, fd134, fd593; +fma.rn.f64 fd595, fd565, 0d3FD3C6EF372FE950, fd594; +mul.f64 fd596, fd587, 0d3FE2CF2304755A5E; +mul.f64 fd597, fd589, 0d3FEE6F0E134454FF; +sub.f64 fd598, fd596, fd597; +add.f64 fd599, fd598, fd595; +sub.f64 fd600, fd595, fd598; +mul.wide.u32 rd2, r4, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %51; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd601, fd602}, [rd6]; +mul.f64 fd605, fd440, fd602; +fma.rn.f64 fd606, fd601, fd436, fd605; +mul.f64 fd607, fd436, fd602; +mul.f64 fd608, fd601, fd440; +sub.f64 fd609, fd608, fd607; +mul.f64 fd610, fd601, fd601; +mul.f64 fd611, fd602, fd602; +sub.f64 fd612, fd610, fd611; +mul.f64 fd613, fd602, fd601; +fma.rn.f64 fd614, fd602, fd601, fd613; +mul.f64 fd615, fd482, fd614; +fma.rn.f64 fd616, fd612, fd478, fd615; +mul.f64 fd617, fd478, fd614; +mul.f64 fd618, fd612, fd482; +sub.f64 fd619, fd618, fd617; +mul.f64 fd620, fd601, fd612; +mul.f64 fd621, fd602, fd614; +sub.f64 fd622, fd620, fd621; +mul.f64 fd623, fd601, fd614; +fma.rn.f64 fd624, fd602, fd612, fd623; +mul.f64 fd625, fd524, fd624; +fma.rn.f64 fd626, fd622, fd520, fd625; +mul.f64 fd627, fd520, fd624; +mul.f64 fd628, fd622, fd524; +sub.f64 fd629, fd628, fd627; +mul.f64 fd630, fd601, fd622; +mul.f64 fd631, fd602, fd624; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd601, fd624; +fma.rn.f64 fd634, fd602, fd622, fd633; +mul.f64 fd635, fd566, fd634; +fma.rn.f64 fd636, fd632, fd562, fd635; +mul.f64 fd637, fd562, fd634; 
+mul.f64 fd638, fd632, fd566; +sub.f64 fd639, fd638, fd637; +mul.f64 fd640, fd601, fd632; +mul.f64 fd641, fd602, fd634; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd601, fd634; +fma.rn.f64 fd644, fd602, fd632, fd643; +mul.f64 fd645, fd423, fd644; +fma.rn.f64 fd646, fd642, fd406, fd645; +mul.f64 fd647, fd406, fd644; +mul.f64 fd648, fd642, fd423; +sub.f64 fd649, fd648, fd647; +mul.f64 fd650, fd601, fd642; +mul.f64 fd651, fd602, fd644; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd601, fd644; +fma.rn.f64 fd654, fd602, fd642, fd653; +mul.f64 fd655, fd465, fd654; +fma.rn.f64 fd656, fd652, fd448, fd655; +mul.f64 fd657, fd448, fd654; +mul.f64 fd658, fd652, fd465; +sub.f64 fd659, fd658, fd657; +mul.f64 fd660, fd601, fd652; +mul.f64 fd661, fd602, fd654; +sub.f64 fd662, fd660, fd661; +mul.f64 fd663, fd601, fd654; +fma.rn.f64 fd664, fd602, fd652, fd663; +mul.f64 fd665, fd507, fd664; +fma.rn.f64 fd666, fd662, fd490, fd665; +mul.f64 fd667, fd490, fd664; +mul.f64 fd668, fd662, fd507; +sub.f64 fd669, fd668, fd667; +mul.f64 fd670, fd601, fd662; +mul.f64 fd671, fd602, fd664; +sub.f64 fd672, fd670, fd671; +mul.f64 fd673, fd601, fd664; +fma.rn.f64 fd674, fd602, fd662, fd673; +mul.f64 fd675, fd549, fd674; +fma.rn.f64 fd676, fd672, fd532, fd675; +mul.f64 fd677, fd532, fd674; +mul.f64 fd678, fd672, fd549; +sub.f64 fd679, fd678, fd677; +mul.f64 fd680, fd601, fd672; +mul.f64 fd681, fd602, fd674; +sub.f64 fd682, fd680, fd681; +mul.f64 fd683, fd601, fd674; +fma.rn.f64 fd684, fd602, fd672, fd683; +mul.f64 fd685, fd591, fd684; +fma.rn.f64 fd686, fd682, fd574, fd685; +mul.f64 fd687, fd574, fd684; +mul.f64 fd688, fd682, fd591; +sub.f64 fd689, fd688, fd687; +mul.f64 fd690, fd601, fd682; +mul.f64 fd691, fd602, fd684; +sub.f64 fd692, fd690, fd691; +mul.f64 fd693, fd601, fd684; +fma.rn.f64 fd694, fd602, fd682, fd693; +mul.f64 fd695, fd431, fd694; +fma.rn.f64 fd696, fd692, fd414, fd695; +mul.f64 fd697, fd414, fd694; +mul.f64 fd698, fd692, fd431; +sub.f64 fd699, fd698, fd697; +mul.f64 fd700, 
fd601, fd692; +mul.f64 fd701, fd602, fd694; +sub.f64 fd702, fd700, fd701; +mul.f64 fd703, fd601, fd694; +fma.rn.f64 fd704, fd602, fd692, fd703; +mul.f64 fd705, fd473, fd704; +fma.rn.f64 fd706, fd702, fd456, fd705; +mul.f64 fd707, fd456, fd704; +mul.f64 fd708, fd702, fd473; +sub.f64 fd709, fd708, fd707; +mul.f64 fd710, fd601, fd702; +mul.f64 fd711, fd602, fd704; +sub.f64 fd712, fd710, fd711; +mul.f64 fd713, fd601, fd704; +fma.rn.f64 fd714, fd602, fd702, fd713; +mul.f64 fd715, fd515, fd714; +fma.rn.f64 fd716, fd712, fd498, fd715; +mul.f64 fd717, fd498, fd714; +mul.f64 fd718, fd712, fd515; +sub.f64 fd719, fd718, fd717; +ld.global.v2.f64 {fd720, fd721}, [rd6+400]; +mul.f64 fd724, fd557, fd721; +fma.rn.f64 fd725, fd720, fd540, fd724; +mul.f64 fd726, fd540, fd721; +mul.f64 fd727, fd720, fd557; +sub.f64 fd728, fd727, fd726; +mul.f64 fd729, fd601, fd720; +mul.f64 fd730, fd602, fd721; +sub.f64 fd731, fd729, fd730; +mul.f64 fd732, fd601, fd721; +fma.rn.f64 fd733, fd602, fd720, fd732; +mul.f64 fd734, fd599, fd733; +fma.rn.f64 fd735, fd731, fd582, fd734; +mul.f64 fd736, fd582, fd733; +mul.f64 fd737, fd731, fd599; +sub.f64 fd738, fd737, fd736; +mul.f64 fd739, fd601, fd731; +mul.f64 fd740, fd602, fd733; +sub.f64 fd741, fd739, fd740; +mul.f64 fd742, fd601, fd733; +fma.rn.f64 fd743, fd602, fd731, fd742; +mul.f64 fd744, fd432, fd743; +fma.rn.f64 fd745, fd741, fd415, fd744; +mul.f64 fd746, fd415, fd743; +mul.f64 fd747, fd741, fd432; +sub.f64 fd748, fd747, fd746; +mul.f64 fd749, fd601, fd741; +mul.f64 fd750, fd602, fd743; +sub.f64 fd751, fd749, fd750; +mul.f64 fd752, fd601, fd743; +fma.rn.f64 fd753, fd602, fd741, fd752; +mul.f64 fd754, fd474, fd753; +fma.rn.f64 fd755, fd751, fd457, fd754; +mul.f64 fd756, fd457, fd753; +mul.f64 fd757, fd751, fd474; +sub.f64 fd758, fd757, fd756; +mul.f64 fd759, fd601, fd751; +mul.f64 fd760, fd602, fd753; +sub.f64 fd761, fd759, fd760; +mul.f64 fd762, fd601, fd753; +fma.rn.f64 fd763, fd602, fd751, fd762; +mul.f64 fd764, fd516, fd763; +fma.rn.f64 fd765, 
fd761, fd499, fd764; +mul.f64 fd766, fd499, fd763; +mul.f64 fd767, fd761, fd516; +sub.f64 fd768, fd767, fd766; +mul.f64 fd769, fd601, fd761; +mul.f64 fd770, fd602, fd763; +sub.f64 fd771, fd769, fd770; +mul.f64 fd772, fd601, fd763; +fma.rn.f64 fd773, fd602, fd761, fd772; +mul.f64 fd774, fd558, fd773; +fma.rn.f64 fd775, fd771, fd541, fd774; +mul.f64 fd776, fd541, fd773; +mul.f64 fd777, fd771, fd558; +sub.f64 fd778, fd777, fd776; +mul.f64 fd779, fd601, fd771; +mul.f64 fd780, fd602, fd773; +sub.f64 fd781, fd779, fd780; +mul.f64 fd782, fd601, fd773; +fma.rn.f64 fd783, fd602, fd771, fd782; +mul.f64 fd784, fd600, fd783; +fma.rn.f64 fd785, fd781, fd583, fd784; +mul.f64 fd786, fd583, fd783; +mul.f64 fd787, fd781, fd600; +sub.f64 fd788, fd787, fd786; +mul.f64 fd789, fd601, fd781; +mul.f64 fd790, fd602, fd783; +sub.f64 fd791, fd789, fd790; +mul.f64 fd792, fd601, fd783; +fma.rn.f64 fd793, fd602, fd781, fd792; +mul.f64 fd794, fd424, fd793; +fma.rn.f64 fd795, fd791, fd407, fd794; +mul.f64 fd796, fd407, fd793; +mul.f64 fd797, fd791, fd424; +sub.f64 fd798, fd797, fd796; +mul.f64 fd799, fd601, fd791; +mul.f64 fd800, fd602, fd793; +sub.f64 fd801, fd799, fd800; +mul.f64 fd802, fd601, fd793; +fma.rn.f64 fd803, fd602, fd791, fd802; +mul.f64 fd804, fd466, fd803; +fma.rn.f64 fd805, fd801, fd449, fd804; +mul.f64 fd806, fd449, fd803; +mul.f64 fd807, fd801, fd466; +sub.f64 fd808, fd807, fd806; +mul.f64 fd809, fd601, fd801; +mul.f64 fd810, fd602, fd803; +sub.f64 fd811, fd809, fd810; +mul.f64 fd812, fd601, fd803; +fma.rn.f64 fd813, fd602, fd801, fd812; +mul.f64 fd814, fd508, fd813; +fma.rn.f64 fd815, fd811, fd491, fd814; +mul.f64 fd816, fd491, fd813; +mul.f64 fd817, fd811, fd508; +sub.f64 fd818, fd817, fd816; +mul.f64 fd819, fd601, fd811; +mul.f64 fd820, fd602, fd813; +sub.f64 fd821, fd819, fd820; +mul.f64 fd822, fd601, fd813; +fma.rn.f64 fd823, fd602, fd811, fd822; +mul.f64 fd824, fd550, fd823; +fma.rn.f64 fd825, fd821, fd533, fd824; +mul.f64 fd826, fd533, fd823; +mul.f64 fd827, fd821, 
fd550; +sub.f64 fd828, fd827, fd826; +mul.f64 fd829, fd601, fd821; +mul.f64 fd830, fd602, fd823; +sub.f64 fd831, fd829, fd830; +mul.f64 fd832, fd601, fd823; +fma.rn.f64 fd833, fd602, fd821, fd832; +mul.f64 fd834, fd592, fd833; +fma.rn.f64 fd835, fd831, fd575, fd834; +mul.f64 fd836, fd575, fd833; +mul.f64 fd837, fd831, fd592; +sub.f64 fd838, fd837, fd836; +mad.lo.s32 r8, r5, 5000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 200, r8; +st.shared.f64 [r9], fd394; +st.shared.f64 [r9+8], fd606; +st.shared.f64 [r9+16], fd616; +st.shared.f64 [r9+24], fd626; +st.shared.f64 [r9+32], fd636; +st.shared.f64 [r9+40], fd646; +st.shared.f64 [r9+48], fd656; +st.shared.f64 [r9+56], fd666; +st.shared.f64 [r9+64], fd676; +st.shared.f64 [r9+72], fd686; +st.shared.f64 [r9+80], fd696; +st.shared.f64 [r9+88], fd706; +st.shared.f64 [r9+96], fd716; +st.shared.f64 [r9+104], fd725; +st.shared.f64 [r9+112], fd735; +st.shared.f64 [r9+120], fd745; +st.shared.f64 [r9+128], fd755; +st.shared.f64 [r9+136], fd765; +st.shared.f64 [r9+144], fd775; +st.shared.f64 [r9+152], fd785; +st.shared.f64 [r9+160], fd795; +st.shared.f64 [r9+168], fd805; +st.shared.f64 [r9+176], fd815; +st.shared.f64 [r9+184], fd825; +st.shared.f64 [r9+192], fd835; +barrier.sync 0; +mad.lo.s32 r10, r7, -192, r9; +ld.shared.f64 fd839, [r10]; +ld.shared.f64 fd840, [r10+200]; +ld.shared.f64 fd841, [r10+400]; +ld.shared.f64 fd842, [r10+600]; +ld.shared.f64 fd843, [r10+800]; +ld.shared.f64 fd844, [r10+1000]; +ld.shared.f64 fd845, [r10+1200]; +ld.shared.f64 fd846, [r10+1400]; +ld.shared.f64 fd847, [r10+1600]; +ld.shared.f64 fd848, [r10+1800]; +ld.shared.f64 fd849, [r10+2000]; +ld.shared.f64 fd850, [r10+2200]; +ld.shared.f64 fd851, [r10+2400]; +ld.shared.f64 fd852, [r10+2600]; +ld.shared.f64 fd853, [r10+2800]; +ld.shared.f64 fd854, [r10+3000]; +ld.shared.f64 fd855, [r10+3200]; +ld.shared.f64 fd856, [r10+3400]; +ld.shared.f64 fd857, [r10+3600]; +ld.shared.f64 fd858, [r10+3800]; +ld.shared.f64 fd859, [r10+4000]; +ld.shared.f64 fd860, 
[r10+4200]; +ld.shared.f64 fd861, [r10+4400]; +ld.shared.f64 fd862, [r10+4600]; +ld.shared.f64 fd863, [r10+4800]; +barrier.sync 0; +st.shared.f64 [r9], fd398; +st.shared.f64 [r9+8], fd609; +st.shared.f64 [r9+16], fd619; +st.shared.f64 [r9+24], fd629; +st.shared.f64 [r9+32], fd639; +st.shared.f64 [r9+40], fd649; +st.shared.f64 [r9+48], fd659; +st.shared.f64 [r9+56], fd669; +st.shared.f64 [r9+64], fd679; +st.shared.f64 [r9+72], fd689; +st.shared.f64 [r9+80], fd699; +st.shared.f64 [r9+88], fd709; +st.shared.f64 [r9+96], fd719; +st.shared.f64 [r9+104], fd728; +st.shared.f64 [r9+112], fd738; +st.shared.f64 [r9+120], fd748; +st.shared.f64 [r9+128], fd758; +st.shared.f64 [r9+136], fd768; +st.shared.f64 [r9+144], fd778; +st.shared.f64 [r9+152], fd788; +st.shared.f64 [r9+160], fd798; +st.shared.f64 [r9+168], fd808; +st.shared.f64 [r9+176], fd818; +st.shared.f64 [r9+184], fd828; +st.shared.f64 [r9+192], fd838; +barrier.sync 0; +ld.shared.f64 fd864, [r10]; +ld.shared.f64 fd865, [r10+200]; +ld.shared.f64 fd866, [r10+400]; +ld.shared.f64 fd867, [r10+600]; +ld.shared.f64 fd868, [r10+800]; +ld.shared.f64 fd869, [r10+1000]; +ld.shared.f64 fd870, [r10+1200]; +ld.shared.f64 fd871, [r10+1400]; +ld.shared.f64 fd872, [r10+1600]; +ld.shared.f64 fd873, [r10+1800]; +ld.shared.f64 fd874, [r10+2000]; +ld.shared.f64 fd875, [r10+2200]; +ld.shared.f64 fd876, [r10+2400]; +ld.shared.f64 fd877, [r10+2600]; +ld.shared.f64 fd878, [r10+2800]; +ld.shared.f64 fd879, [r10+3000]; +ld.shared.f64 fd880, [r10+3200]; +ld.shared.f64 fd881, [r10+3400]; +ld.shared.f64 fd882, [r10+3600]; +ld.shared.f64 fd883, [r10+3800]; +ld.shared.f64 fd884, [r10+4000]; +ld.shared.f64 fd885, [r10+4200]; +ld.shared.f64 fd886, [r10+4400]; +ld.shared.f64 fd887, [r10+4600]; +ld.shared.f64 fd888, [r10+4800]; +add.f64 fd889, fd844, fd859; +add.f64 fd890, fd839, fd889; +add.f64 fd891, fd849, fd854; +add.f64 fd892, fd891, fd890; +add.f64 fd893, fd869, fd884; +add.f64 fd894, fd864, fd893; +add.f64 fd895, fd874, fd879; +add.f64 fd896, 
fd895, fd894; +fma.rn.f64 fd897, fd889, 0d3FD3C6EF372FE950, fd839; +mul.f64 fd898, fd891, 0d3FE9E3779B97F4A8; +sub.f64 fd899, fd897, fd898; +sub.f64 fd900, fd869, fd884; +mul.f64 fd901, fd900, 0d3FEE6F0E134454FF; +sub.f64 fd902, fd874, fd879; +fma.rn.f64 fd903, fd902, 0d3FE2CF2304755A5E, fd901; +sub.f64 fd904, fd899, fd903; +add.f64 fd905, fd903, fd899; +mul.f64 fd906, fd889, 0d3FE9E3779B97F4A8; +sub.f64 fd907, fd839, fd906; +fma.rn.f64 fd908, fd891, 0d3FD3C6EF372FE950, fd907; +mul.f64 fd909, fd900, 0d3FE2CF2304755A5E; +mul.f64 fd910, fd902, 0d3FEE6F0E134454FF; +sub.f64 fd911, fd909, fd910; +sub.f64 fd912, fd908, fd911; +add.f64 fd913, fd911, fd908; +fma.rn.f64 fd914, fd893, 0d3FD3C6EF372FE950, fd864; +mul.f64 fd915, fd895, 0d3FE9E3779B97F4A8; +sub.f64 fd916, fd914, fd915; +sub.f64 fd917, fd844, fd859; +mul.f64 fd918, fd917, 0d3FEE6F0E134454FF; +sub.f64 fd919, fd849, fd854; +fma.rn.f64 fd920, fd919, 0d3FE2CF2304755A5E, fd918; +add.f64 fd921, fd920, fd916; +sub.f64 fd922, fd916, fd920; +mul.f64 fd923, fd893, 0d3FE9E3779B97F4A8; +sub.f64 fd924, fd864, fd923; +fma.rn.f64 fd925, fd895, 0d3FD3C6EF372FE950, fd924; +mul.f64 fd926, fd917, 0d3FE2CF2304755A5E; +mul.f64 fd927, fd919, 0d3FEE6F0E134454FF; +sub.f64 fd928, fd926, fd927; +add.f64 fd929, fd928, fd925; +sub.f64 fd930, fd925, fd928; +add.f64 fd931, fd845, fd860; +add.f64 fd932, fd840, fd931; +add.f64 fd933, fd850, fd855; +add.f64 fd934, fd933, fd932; +add.f64 fd935, fd870, fd885; +add.f64 fd936, fd865, fd935; +add.f64 fd937, fd875, fd880; +add.f64 fd938, fd937, fd936; +fma.rn.f64 fd939, fd931, 0d3FD3C6EF372FE950, fd840; +mul.f64 fd940, fd933, 0d3FE9E3779B97F4A8; +sub.f64 fd941, fd939, fd940; +sub.f64 fd942, fd870, fd885; +mul.f64 fd943, fd942, 0d3FEE6F0E134454FF; +sub.f64 fd944, fd875, fd880; +fma.rn.f64 fd945, fd944, 0d3FE2CF2304755A5E, fd943; +sub.f64 fd946, fd941, fd945; +add.f64 fd947, fd945, fd941; +mul.f64 fd948, fd931, 0d3FE9E3779B97F4A8; +sub.f64 fd949, fd840, fd948; +fma.rn.f64 fd950, fd933, 
0d3FD3C6EF372FE950, fd949; +mul.f64 fd951, fd942, 0d3FE2CF2304755A5E; +mul.f64 fd952, fd944, 0d3FEE6F0E134454FF; +sub.f64 fd953, fd951, fd952; +sub.f64 fd954, fd950, fd953; +add.f64 fd955, fd953, fd950; +fma.rn.f64 fd956, fd935, 0d3FD3C6EF372FE950, fd865; +mul.f64 fd957, fd937, 0d3FE9E3779B97F4A8; +sub.f64 fd958, fd956, fd957; +sub.f64 fd959, fd845, fd860; +mul.f64 fd960, fd959, 0d3FEE6F0E134454FF; +sub.f64 fd961, fd850, fd855; +fma.rn.f64 fd962, fd961, 0d3FE2CF2304755A5E, fd960; +add.f64 fd963, fd962, fd958; +sub.f64 fd964, fd958, fd962; +mul.f64 fd965, fd935, 0d3FE9E3779B97F4A8; +sub.f64 fd966, fd865, fd965; +fma.rn.f64 fd967, fd937, 0d3FD3C6EF372FE950, fd966; +mul.f64 fd968, fd959, 0d3FE2CF2304755A5E; +mul.f64 fd969, fd961, 0d3FEE6F0E134454FF; +sub.f64 fd970, fd968, fd969; +add.f64 fd971, fd970, fd967; +sub.f64 fd972, fd967, fd970; +add.f64 fd973, fd846, fd861; +add.f64 fd974, fd841, fd973; +add.f64 fd975, fd851, fd856; +add.f64 fd976, fd975, fd974; +add.f64 fd977, fd871, fd886; +add.f64 fd978, fd866, fd977; +add.f64 fd979, fd876, fd881; +add.f64 fd980, fd979, fd978; +fma.rn.f64 fd981, fd973, 0d3FD3C6EF372FE950, fd841; +mul.f64 fd982, fd975, 0d3FE9E3779B97F4A8; +sub.f64 fd983, fd981, fd982; +sub.f64 fd984, fd871, fd886; +mul.f64 fd985, fd984, 0d3FEE6F0E134454FF; +sub.f64 fd986, fd876, fd881; +fma.rn.f64 fd987, fd986, 0d3FE2CF2304755A5E, fd985; +sub.f64 fd988, fd983, fd987; +add.f64 fd989, fd987, fd983; +mul.f64 fd990, fd973, 0d3FE9E3779B97F4A8; +sub.f64 fd991, fd841, fd990; +fma.rn.f64 fd992, fd975, 0d3FD3C6EF372FE950, fd991; +mul.f64 fd993, fd984, 0d3FE2CF2304755A5E; +mul.f64 fd994, fd986, 0d3FEE6F0E134454FF; +sub.f64 fd995, fd993, fd994; +sub.f64 fd996, fd992, fd995; +add.f64 fd997, fd995, fd992; +fma.rn.f64 fd998, fd977, 0d3FD3C6EF372FE950, fd866; +mul.f64 fd999, fd979, 0d3FE9E3779B97F4A8; +sub.f64 fd1000, fd998, fd999; +sub.f64 fd1001, fd846, fd861; +mul.f64 fd1002, fd1001, 0d3FEE6F0E134454FF; +sub.f64 fd1003, fd851, fd856; +fma.rn.f64 fd1004, fd1003, 
0d3FE2CF2304755A5E, fd1002; +add.f64 fd1005, fd1004, fd1000; +sub.f64 fd1006, fd1000, fd1004; +mul.f64 fd1007, fd977, 0d3FE9E3779B97F4A8; +sub.f64 fd1008, fd866, fd1007; +fma.rn.f64 fd1009, fd979, 0d3FD3C6EF372FE950, fd1008; +mul.f64 fd1010, fd1001, 0d3FE2CF2304755A5E; +mul.f64 fd1011, fd1003, 0d3FEE6F0E134454FF; +sub.f64 fd1012, fd1010, fd1011; +add.f64 fd1013, fd1012, fd1009; +sub.f64 fd1014, fd1009, fd1012; +add.f64 fd1015, fd847, fd862; +add.f64 fd1016, fd842, fd1015; +add.f64 fd1017, fd852, fd857; +add.f64 fd1018, fd1017, fd1016; +add.f64 fd1019, fd872, fd887; +add.f64 fd1020, fd867, fd1019; +add.f64 fd1021, fd877, fd882; +add.f64 fd1022, fd1021, fd1020; +fma.rn.f64 fd1023, fd1015, 0d3FD3C6EF372FE950, fd842; +mul.f64 fd1024, fd1017, 0d3FE9E3779B97F4A8; +sub.f64 fd1025, fd1023, fd1024; +sub.f64 fd1026, fd872, fd887; +mul.f64 fd1027, fd1026, 0d3FEE6F0E134454FF; +sub.f64 fd1028, fd877, fd882; +fma.rn.f64 fd1029, fd1028, 0d3FE2CF2304755A5E, fd1027; +sub.f64 fd1030, fd1025, fd1029; +add.f64 fd1031, fd1029, fd1025; +mul.f64 fd1032, fd1015, 0d3FE9E3779B97F4A8; +sub.f64 fd1033, fd842, fd1032; +fma.rn.f64 fd1034, fd1017, 0d3FD3C6EF372FE950, fd1033; +mul.f64 fd1035, fd1026, 0d3FE2CF2304755A5E; +mul.f64 fd1036, fd1028, 0d3FEE6F0E134454FF; +sub.f64 fd1037, fd1035, fd1036; +sub.f64 fd1038, fd1034, fd1037; +add.f64 fd1039, fd1037, fd1034; +fma.rn.f64 fd1040, fd1019, 0d3FD3C6EF372FE950, fd867; +mul.f64 fd1041, fd1021, 0d3FE9E3779B97F4A8; +sub.f64 fd1042, fd1040, fd1041; +sub.f64 fd1043, fd847, fd862; +mul.f64 fd1044, fd1043, 0d3FEE6F0E134454FF; +sub.f64 fd1045, fd852, fd857; +fma.rn.f64 fd1046, fd1045, 0d3FE2CF2304755A5E, fd1044; +add.f64 fd1047, fd1046, fd1042; +sub.f64 fd1048, fd1042, fd1046; +mul.f64 fd1049, fd1019, 0d3FE9E3779B97F4A8; +sub.f64 fd1050, fd867, fd1049; +fma.rn.f64 fd1051, fd1021, 0d3FD3C6EF372FE950, fd1050; +mul.f64 fd1052, fd1043, 0d3FE2CF2304755A5E; +mul.f64 fd1053, fd1045, 0d3FEE6F0E134454FF; +sub.f64 fd1054, fd1052, fd1053; +add.f64 fd1055, fd1054, 
fd1051; +sub.f64 fd1056, fd1051, fd1054; +add.f64 fd1057, fd848, fd863; +add.f64 fd1058, fd843, fd1057; +add.f64 fd1059, fd853, fd858; +add.f64 fd1060, fd1059, fd1058; +add.f64 fd1061, fd873, fd888; +add.f64 fd1062, fd868, fd1061; +add.f64 fd1063, fd878, fd883; +add.f64 fd1064, fd1063, fd1062; +fma.rn.f64 fd1065, fd1057, 0d3FD3C6EF372FE950, fd843; +mul.f64 fd1066, fd1059, 0d3FE9E3779B97F4A8; +sub.f64 fd1067, fd1065, fd1066; +sub.f64 fd1068, fd873, fd888; +mul.f64 fd1069, fd1068, 0d3FEE6F0E134454FF; +sub.f64 fd1070, fd878, fd883; +fma.rn.f64 fd1071, fd1070, 0d3FE2CF2304755A5E, fd1069; +sub.f64 fd1072, fd1067, fd1071; +add.f64 fd1073, fd1071, fd1067; +mul.f64 fd1074, fd1057, 0d3FE9E3779B97F4A8; +sub.f64 fd1075, fd843, fd1074; +fma.rn.f64 fd1076, fd1059, 0d3FD3C6EF372FE950, fd1075; +mul.f64 fd1077, fd1068, 0d3FE2CF2304755A5E; +mul.f64 fd1078, fd1070, 0d3FEE6F0E134454FF; +sub.f64 fd1079, fd1077, fd1078; +sub.f64 fd1080, fd1076, fd1079; +add.f64 fd1081, fd1079, fd1076; +fma.rn.f64 fd1082, fd1061, 0d3FD3C6EF372FE950, fd868; +mul.f64 fd1083, fd1063, 0d3FE9E3779B97F4A8; +sub.f64 fd1084, fd1082, fd1083; +sub.f64 fd1085, fd848, fd863; +mul.f64 fd1086, fd1085, 0d3FEE6F0E134454FF; +sub.f64 fd1087, fd853, fd858; +fma.rn.f64 fd1088, fd1087, 0d3FE2CF2304755A5E, fd1086; +add.f64 fd1089, fd1088, fd1084; +sub.f64 fd1090, fd1084, fd1088; +mul.f64 fd1091, fd1061, 0d3FE9E3779B97F4A8; +sub.f64 fd1092, fd868, fd1091; +fma.rn.f64 fd1093, fd1063, 0d3FD3C6EF372FE950, fd1092; +mul.f64 fd1094, fd1085, 0d3FE2CF2304755A5E; +mul.f64 fd1095, fd1087, 0d3FEE6F0E134454FF; +sub.f64 fd1096, fd1094, fd1095; +add.f64 fd1097, fd1096, fd1093; +sub.f64 fd1098, fd1093, fd1096; +mul.f64 fd1099, fd946, 0d3FEEFEA21D101EE0; +mul.f64 fd1100, fd963, 0d3FCFD511FA1C0796; +sub.f64 fd1101, fd1099, fd1100; +mul.f64 fd1102, fd963, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd1103, fd946, 0d3FCFD511FA1C0796, fd1102; +mul.f64 fd1104, fd988, 0d3FEC0AB44E81C059; +mul.f64 fd1105, fd1005, 0d3FDED50D5CBFA951; +sub.f64 fd1106, fd1104, 
fd1105; +mul.f64 fd1107, fd1005, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1108, fd988, 0d3FDED50D5CBFA951, fd1107; +mul.f64 fd1109, fd1030, 0d3FE753B603D2B816; +mul.f64 fd1110, fd1047, 0d3FE5E7CF55112014; +sub.f64 fd1111, fd1109, fd1110; +mul.f64 fd1112, fd1047, 0d3FE753B603D2B816; +fma.rn.f64 fd1113, fd1030, 0d3FE5E7CF55112014, fd1112; +mul.f64 fd1114, fd1072, 0d3FE1257E3C182B51; +mul.f64 fd1115, fd1089, 0d3FEB04BBFF642E86; +sub.f64 fd1116, fd1114, fd1115; +mul.f64 fd1117, fd1089, 0d3FE1257E3C182B51; +fma.rn.f64 fd1118, fd1072, 0d3FEB04BBFF642E86, fd1117; +mul.f64 fd1119, fd954, 0d3FEC0AB44E81C059; +mul.f64 fd1120, fd971, 0d3FDED50D5CBFA951; +sub.f64 fd1121, fd1119, fd1120; +mul.f64 fd1122, fd971, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1123, fd954, 0d3FDED50D5CBFA951, fd1122; +mul.f64 fd1124, fd996, 0d3FE1257E3C182B51; +mul.f64 fd1125, fd1013, 0d3FEB04BBFF642E86; +sub.f64 fd1126, fd1124, fd1125; +mul.f64 fd1127, fd1013, 0d3FE1257E3C182B51; +fma.rn.f64 fd1128, fd996, 0d3FEB04BBFF642E86, fd1127; +mul.f64 fd1129, fd1038, 0d3FB0130A1BE09379; +mul.f64 fd1130, fd1055, 0d3FEFEFD5BFE443FE; +sub.f64 fd1131, fd1129, fd1130; +mul.f64 fd1132, fd1055, 0d3FB0130A1BE09379; +fma.rn.f64 fd1133, fd1038, 0d3FEFEFD5BFE443FE, fd1132; +mul.f64 fd1134, fd1080, 0dBFDB3FF7C925819C; +mul.f64 fd1135, fd1097, 0d3FECF457DCDC158C; +sub.f64 fd1136, fd1134, fd1135; +mul.f64 fd1137, fd1097, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1138, fd1080, 0d3FECF457DCDC158C, fd1137; +mul.f64 fd1139, fd955, 0d3FE753B603D2B816; +mul.f64 fd1140, fd972, 0d3FE5E7CF55112014; +sub.f64 fd1141, fd1139, fd1140; +mul.f64 fd1142, fd972, 0d3FE753B603D2B816; +fma.rn.f64 fd1143, fd955, 0d3FE5E7CF55112014, fd1142; +mul.f64 fd1144, fd997, 0d3FB0130A1BE09379; +mul.f64 fd1145, fd1014, 0d3FEFEFD5BFE443FE; +sub.f64 fd1146, fd1144, fd1145; +mul.f64 fd1147, fd1014, 0d3FB0130A1BE09379; +fma.rn.f64 fd1148, fd997, 0d3FEFEFD5BFE443FE, fd1147; +mul.f64 fd1149, fd1039, 0dBFE465C6FEB501BC; +mul.f64 fd1150, fd1056, 0d3FE8A80B635B6BEA; +sub.f64 fd1151, 
fd1149, fd1150; +mul.f64 fd1152, fd1056, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1153, fd1039, 0d3FE8A80B635B6BEA, fd1152; +mul.f64 fd1154, fd1081, 0dBFEFBF675480D903; +mul.f64 fd1155, fd1098, 0d3FC00AEB5DA15BE0; +sub.f64 fd1156, fd1154, fd1155; +mul.f64 fd1157, fd1098, 0dBFEFBF675480D903; +fma.rn.f64 fd1158, fd1081, 0d3FC00AEB5DA15BE0, fd1157; +mul.f64 fd1159, fd947, 0d3FE1257E3C182B51; +mul.f64 fd1160, fd964, 0d3FEB04BBFF642E86; +sub.f64 fd1161, fd1159, fd1160; +mul.f64 fd1162, fd964, 0d3FE1257E3C182B51; +fma.rn.f64 fd1163, fd947, 0d3FEB04BBFF642E86, fd1162; +mul.f64 fd1164, fd989, 0dBFDB3FF7C925819C; +mul.f64 fd1165, fd1006, 0d3FECF457DCDC158C; +sub.f64 fd1166, fd1164, fd1165; +mul.f64 fd1167, fd1006, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1168, fd989, 0d3FECF457DCDC158C, fd1167; +mul.f64 fd1169, fd1031, 0dBFEFBF675480D903; +mul.f64 fd1170, fd1048, 0d3FC00AEB5DA15BE0; +sub.f64 fd1171, fd1169, fd1170; +mul.f64 fd1172, fd1048, 0dBFEFBF675480D903; +fma.rn.f64 fd1173, fd1031, 0d3FC00AEB5DA15BE0, fd1172; +mul.f64 fd1174, fd1073, 0dBFE465C6FEB501BC; +mul.f64 fd1175, fd1090, 0dBFE8A80B635B6BEA; +sub.f64 fd1176, fd1174, fd1175; +mul.f64 fd1177, fd1090, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1178, fd1073, 0dBFE8A80B635B6BEA, fd1177; +add.f64 fd1179, fd934, fd1060; +add.f64 fd1180, fd892, fd1179; +add.f64 fd1181, fd976, fd1018; +add.f64 fd1182, fd938, fd1064; +add.f64 fd1183, fd896, fd1182; +add.f64 fd1184, fd980, fd1022; +fma.rn.f64 fd1185, fd1179, 0d3FD3C6EF372FE950, fd892; +mul.f64 fd1186, fd1181, 0d3FE9E3779B97F4A8; +sub.f64 fd1187, fd1185, fd1186; +sub.f64 fd1188, fd938, fd1064; +mul.f64 fd1189, fd1188, 0d3FEE6F0E134454FF; +sub.f64 fd1190, fd980, fd1022; +fma.rn.f64 fd1191, fd1190, 0d3FE2CF2304755A5E, fd1189; +mul.f64 fd1192, fd1179, 0d3FE9E3779B97F4A8; +sub.f64 fd1193, fd892, fd1192; +fma.rn.f64 fd1194, fd1181, 0d3FD3C6EF372FE950, fd1193; +mul.f64 fd1195, fd1188, 0d3FE2CF2304755A5E; +mul.f64 fd1196, fd1190, 0d3FEE6F0E134454FF; +sub.f64 fd1197, fd1195, fd1196; +fma.rn.f64 fd1198, 
fd1182, 0d3FD3C6EF372FE950, fd896; +mul.f64 fd1199, fd1184, 0d3FE9E3779B97F4A8; +sub.f64 fd1200, fd1198, fd1199; +sub.f64 fd1201, fd934, fd1060; +mul.f64 fd1202, fd1201, 0d3FEE6F0E134454FF; +sub.f64 fd1203, fd976, fd1018; +fma.rn.f64 fd1204, fd1203, 0d3FE2CF2304755A5E, fd1202; +mul.f64 fd1205, fd1182, 0d3FE9E3779B97F4A8; +sub.f64 fd1206, fd896, fd1205; +fma.rn.f64 fd1207, fd1184, 0d3FD3C6EF372FE950, fd1206; +mul.f64 fd1208, fd1201, 0d3FE2CF2304755A5E; +mul.f64 fd1209, fd1203, 0d3FEE6F0E134454FF; +sub.f64 fd1210, fd1208, fd1209; +add.f64 fd1211, fd1101, fd1116; +add.f64 fd1212, fd904, fd1211; +add.f64 fd1213, fd1106, fd1111; +add.f64 fd1214, fd1103, fd1118; +add.f64 fd1215, fd921, fd1214; +add.f64 fd1216, fd1108, fd1113; +fma.rn.f64 fd1217, fd1211, 0d3FD3C6EF372FE950, fd904; +mul.f64 fd1218, fd1213, 0d3FE9E3779B97F4A8; +sub.f64 fd1219, fd1217, fd1218; +sub.f64 fd1220, fd1103, fd1118; +mul.f64 fd1221, fd1220, 0d3FEE6F0E134454FF; +sub.f64 fd1222, fd1108, fd1113; +fma.rn.f64 fd1223, fd1222, 0d3FE2CF2304755A5E, fd1221; +mul.f64 fd1224, fd1211, 0d3FE9E3779B97F4A8; +sub.f64 fd1225, fd904, fd1224; +fma.rn.f64 fd1226, fd1213, 0d3FD3C6EF372FE950, fd1225; +mul.f64 fd1227, fd1220, 0d3FE2CF2304755A5E; +mul.f64 fd1228, fd1222, 0d3FEE6F0E134454FF; +sub.f64 fd1229, fd1227, fd1228; +fma.rn.f64 fd1230, fd1214, 0d3FD3C6EF372FE950, fd921; +mul.f64 fd1231, fd1216, 0d3FE9E3779B97F4A8; +sub.f64 fd1232, fd1230, fd1231; +sub.f64 fd1233, fd1101, fd1116; +mul.f64 fd1234, fd1233, 0d3FEE6F0E134454FF; +sub.f64 fd1235, fd1106, fd1111; +fma.rn.f64 fd1236, fd1235, 0d3FE2CF2304755A5E, fd1234; +mul.f64 fd1237, fd1214, 0d3FE9E3779B97F4A8; +sub.f64 fd1238, fd921, fd1237; +fma.rn.f64 fd1239, fd1216, 0d3FD3C6EF372FE950, fd1238; +mul.f64 fd1240, fd1233, 0d3FE2CF2304755A5E; +mul.f64 fd1241, fd1235, 0d3FEE6F0E134454FF; +sub.f64 fd1242, fd1240, fd1241; +add.f64 fd1243, fd1121, fd1136; +add.f64 fd1244, fd912, fd1243; +add.f64 fd1245, fd1126, fd1131; +add.f64 fd1246, fd1123, fd1138; +add.f64 fd1247, fd929, 
fd1246; +add.f64 fd1248, fd1128, fd1133; +fma.rn.f64 fd1249, fd1243, 0d3FD3C6EF372FE950, fd912; +mul.f64 fd1250, fd1245, 0d3FE9E3779B97F4A8; +sub.f64 fd1251, fd1249, fd1250; +sub.f64 fd1252, fd1123, fd1138; +mul.f64 fd1253, fd1252, 0d3FEE6F0E134454FF; +sub.f64 fd1254, fd1128, fd1133; +fma.rn.f64 fd1255, fd1254, 0d3FE2CF2304755A5E, fd1253; +mul.f64 fd1256, fd1243, 0d3FE9E3779B97F4A8; +sub.f64 fd1257, fd912, fd1256; +fma.rn.f64 fd1258, fd1245, 0d3FD3C6EF372FE950, fd1257; +mul.f64 fd1259, fd1252, 0d3FE2CF2304755A5E; +mul.f64 fd1260, fd1254, 0d3FEE6F0E134454FF; +sub.f64 fd1261, fd1259, fd1260; +fma.rn.f64 fd1262, fd1246, 0d3FD3C6EF372FE950, fd929; +mul.f64 fd1263, fd1248, 0d3FE9E3779B97F4A8; +sub.f64 fd1264, fd1262, fd1263; +sub.f64 fd1265, fd1121, fd1136; +mul.f64 fd1266, fd1265, 0d3FEE6F0E134454FF; +sub.f64 fd1267, fd1126, fd1131; +fma.rn.f64 fd1268, fd1267, 0d3FE2CF2304755A5E, fd1266; +mul.f64 fd1269, fd1246, 0d3FE9E3779B97F4A8; +sub.f64 fd1270, fd929, fd1269; +fma.rn.f64 fd1271, fd1248, 0d3FD3C6EF372FE950, fd1270; +mul.f64 fd1272, fd1265, 0d3FE2CF2304755A5E; +mul.f64 fd1273, fd1267, 0d3FEE6F0E134454FF; +sub.f64 fd1274, fd1272, fd1273; +add.f64 fd1275, fd1141, fd1156; +add.f64 fd1276, fd913, fd1275; +add.f64 fd1277, fd1146, fd1151; +add.f64 fd1278, fd1143, fd1158; +add.f64 fd1279, fd930, fd1278; +add.f64 fd1280, fd1148, fd1153; +fma.rn.f64 fd1281, fd1275, 0d3FD3C6EF372FE950, fd913; +mul.f64 fd1282, fd1277, 0d3FE9E3779B97F4A8; +sub.f64 fd1283, fd1281, fd1282; +sub.f64 fd1284, fd1143, fd1158; +mul.f64 fd1285, fd1284, 0d3FEE6F0E134454FF; +sub.f64 fd1286, fd1148, fd1153; +fma.rn.f64 fd1287, fd1286, 0d3FE2CF2304755A5E, fd1285; +mul.f64 fd1288, fd1275, 0d3FE9E3779B97F4A8; +sub.f64 fd1289, fd913, fd1288; +fma.rn.f64 fd1290, fd1277, 0d3FD3C6EF372FE950, fd1289; +mul.f64 fd1291, fd1284, 0d3FE2CF2304755A5E; +mul.f64 fd1292, fd1286, 0d3FEE6F0E134454FF; +sub.f64 fd1293, fd1291, fd1292; +fma.rn.f64 fd1294, fd1278, 0d3FD3C6EF372FE950, fd930; +mul.f64 fd1295, fd1280, 
0d3FE9E3779B97F4A8; +sub.f64 fd1296, fd1294, fd1295; +sub.f64 fd1297, fd1141, fd1156; +mul.f64 fd1298, fd1297, 0d3FEE6F0E134454FF; +sub.f64 fd1299, fd1146, fd1151; +fma.rn.f64 fd1300, fd1299, 0d3FE2CF2304755A5E, fd1298; +mul.f64 fd1301, fd1278, 0d3FE9E3779B97F4A8; +sub.f64 fd1302, fd930, fd1301; +fma.rn.f64 fd1303, fd1280, 0d3FD3C6EF372FE950, fd1302; +mul.f64 fd1304, fd1297, 0d3FE2CF2304755A5E; +mul.f64 fd1305, fd1299, 0d3FEE6F0E134454FF; +sub.f64 fd1306, fd1304, fd1305; +add.f64 fd1307, fd1161, fd1176; +add.f64 fd1308, fd905, fd1307; +add.f64 fd1309, fd1166, fd1171; +add.f64 fd1310, fd1163, fd1178; +add.f64 fd1311, fd922, fd1310; +add.f64 fd1312, fd1168, fd1173; +fma.rn.f64 fd1313, fd1307, 0d3FD3C6EF372FE950, fd905; +mul.f64 fd1314, fd1309, 0d3FE9E3779B97F4A8; +sub.f64 fd1315, fd1313, fd1314; +sub.f64 fd1316, fd1163, fd1178; +mul.f64 fd1317, fd1316, 0d3FEE6F0E134454FF; +sub.f64 fd1318, fd1168, fd1173; +fma.rn.f64 fd1319, fd1318, 0d3FE2CF2304755A5E, fd1317; +mul.f64 fd1320, fd1307, 0d3FE9E3779B97F4A8; +sub.f64 fd1321, fd905, fd1320; +fma.rn.f64 fd1322, fd1309, 0d3FD3C6EF372FE950, fd1321; +mul.f64 fd1323, fd1316, 0d3FE2CF2304755A5E; +mul.f64 fd1324, fd1318, 0d3FEE6F0E134454FF; +sub.f64 fd1325, fd1323, fd1324; +fma.rn.f64 fd1326, fd1310, 0d3FD3C6EF372FE950, fd922; +mul.f64 fd1327, fd1312, 0d3FE9E3779B97F4A8; +sub.f64 fd1328, fd1326, fd1327; +sub.f64 fd1329, fd1161, fd1176; +mul.f64 fd1330, fd1329, 0d3FEE6F0E134454FF; +sub.f64 fd1331, fd1166, fd1171; +fma.rn.f64 fd1332, fd1331, 0d3FE2CF2304755A5E, fd1330; +mul.f64 fd1333, fd1310, 0d3FE9E3779B97F4A8; +sub.f64 fd1334, fd922, fd1333; +fma.rn.f64 fd1335, fd1312, 0d3FD3C6EF372FE950, fd1334; +mul.f64 fd1336, fd1329, 0d3FE2CF2304755A5E; +mul.f64 fd1337, fd1331, 0d3FEE6F0E134454FF; +sub.f64 fd1338, fd1336, fd1337; +add.f64 %0, fd1181, fd1180; +add.f64 %1, fd1184, fd1183; +add.f64 %3, fd1216, fd1215; +add.f64 %2, fd1213, fd1212; +add.f64 %5, fd1248, fd1247; +add.f64 %4, fd1245, fd1244; +add.f64 %7, fd1280, fd1279; +add.f64 %6, 
fd1277, fd1276; +add.f64 %9, fd1312, fd1311; +add.f64 %8, fd1309, fd1308; +add.f64 %11, fd1204, fd1200; +sub.f64 %10, fd1187, fd1191; +add.f64 %13, fd1236, fd1232; +sub.f64 %12, fd1219, fd1223; +add.f64 %15, fd1268, fd1264; +sub.f64 %14, fd1251, fd1255; +add.f64 %17, fd1300, fd1296; +sub.f64 %16, fd1283, fd1287; +add.f64 %19, fd1332, fd1328; +sub.f64 %18, fd1315, fd1319; +sub.f64 %20, fd1194, fd1197; +add.f64 %21, fd1210, fd1207; +add.f64 %23, fd1242, fd1239; +sub.f64 %22, fd1226, fd1229; +add.f64 %25, fd1274, fd1271; +sub.f64 %24, fd1258, fd1261; +add.f64 %27, fd1306, fd1303; +sub.f64 %26, fd1290, fd1293; +add.f64 %29, fd1338, fd1335; +sub.f64 %28, fd1322, fd1325; +add.f64 %30, fd1197, fd1194; +sub.f64 %31, fd1207, fd1210; +sub.f64 %33, fd1239, fd1242; +add.f64 %32, fd1229, fd1226; +sub.f64 %35, fd1271, fd1274; +add.f64 %34, fd1261, fd1258; +sub.f64 %37, fd1303, fd1306; +add.f64 %36, fd1293, fd1290; +sub.f64 %39, fd1335, fd1338; +add.f64 %38, fd1325, fd1322; +sub.f64 %41, fd1200, fd1204; +add.f64 %40, fd1191, fd1187; +sub.f64 %43, fd1232, fd1236; +add.f64 %42, fd1223, fd1219; +sub.f64 %45, fd1264, fd1268; +add.f64 %44, fd1255, fd1251; +sub.f64 %47, fd1296, fd1300; +add.f64 %46, fd1287, fd1283; +sub.f64 %49, fd1328, fd1332; +add.f64 %48, fd1319, fd1315; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), 
"=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_625), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<714, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1714>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %50; +mad.lo.s32 r3, r12, 10000, r13; +add.f64 fd101, %62, %92; +add.f64 fd103, %72, %82; +add.f64 fd1713, %52, fd101; +add.f64 fd104, fd103, fd1713; +add.f64 fd105, %102, %104; +add.f64 fd107, %103, %83; +add.f64 fd1709, %53, fd105; +add.f64 fd108, fd107, fd1709; +mul.f64 fd110, fd103, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1708, fd101, 0d3FD3C6EF372FE950, %52; +sub.f64 fd111, fd1708, fd110; +sub.f64 fd112, %102, %104; +sub.f64 
fd114, %103, %83; +mul.f64 fd1707, fd112, 0d3FEE6F0E134454FF; +fma.rn.f64 fd115, fd114, 0d3FE2CF2304755A5E, fd1707; +sub.f64 fd116, fd111, fd115; +add.f64 fd117, fd115, fd111; +mul.f64 fd118, fd101, 0d3FE9E3779B97F4A8; +sub.f64 fd119, %52, fd118; +fma.rn.f64 fd120, fd103, 0d3FD3C6EF372FE950, fd119; +mul.f64 fd121, fd112, 0d3FE2CF2304755A5E; +mul.f64 fd122, fd114, 0d3FEE6F0E134454FF; +sub.f64 fd123, fd121, fd122; +sub.f64 fd124, fd120, fd123; +add.f64 fd125, fd123, fd120; +mul.f64 fd127, fd107, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1706, fd105, 0d3FD3C6EF372FE950, %53; +sub.f64 fd128, fd1706, fd127; +sub.f64 fd129, %62, %92; +sub.f64 fd131, %72, %82; +mul.f64 fd1705, fd129, 0d3FEE6F0E134454FF; +fma.rn.f64 fd132, fd131, 0d3FE2CF2304755A5E, fd1705; +add.f64 fd133, fd132, fd128; +sub.f64 fd134, fd128, fd132; +mul.f64 fd135, fd105, 0d3FE9E3779B97F4A8; +sub.f64 fd136, %53, fd135; +fma.rn.f64 fd137, fd107, 0d3FD3C6EF372FE950, fd136; +mul.f64 fd138, fd129, 0d3FE2CF2304755A5E; +mul.f64 fd139, fd131, 0d3FEE6F0E134454FF; +sub.f64 fd140, fd138, fd139; +add.f64 fd141, fd140, fd137; +sub.f64 fd142, fd137, fd140; +add.f64 fd143, %64, %94; +add.f64 fd145, %74, %84; +add.f64 fd1704, %54, fd143; +add.f64 fd146, fd145, fd1704; +add.f64 fd147, %65, %95; +add.f64 fd149, %107, %105; +add.f64 fd1700, %106, fd147; +add.f64 fd150, fd149, fd1700; +fma.rn.f64 fd1698, fd143, 0d3FD3C6EF372FE950, %54; +mul.f64 fd1699, fd145, 0d3FE9E3779B97F4A8; +sub.f64 fd153, fd1698, fd1699; +sub.f64 fd154, %65, %95; +sub.f64 fd156, %107, %105; +mul.f64 fd1697, fd154, 0d3FEE6F0E134454FF; +fma.rn.f64 fd157, fd156, 0d3FE2CF2304755A5E, fd1697; +sub.f64 fd158, fd153, fd157; +add.f64 fd159, fd157, fd153; +mul.f64 fd160, fd143, 0d3FE9E3779B97F4A8; +sub.f64 fd161, %54, fd160; +fma.rn.f64 fd162, fd145, 0d3FD3C6EF372FE950, fd161; +mul.f64 fd163, fd154, 0d3FE2CF2304755A5E; +mul.f64 fd164, fd156, 0d3FEE6F0E134454FF; +sub.f64 fd165, fd163, fd164; +sub.f64 fd166, fd162, fd165; +add.f64 fd167, fd165, fd162; +mul.f64 fd169, 
fd149, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1696, fd147, 0d3FD3C6EF372FE950, %106; +sub.f64 fd170, fd1696, fd169; +sub.f64 fd171, %64, %94; +sub.f64 fd173, %74, %84; +mul.f64 fd1695, fd171, 0d3FEE6F0E134454FF; +fma.rn.f64 fd174, fd173, 0d3FE2CF2304755A5E, fd1695; +add.f64 fd175, fd174, fd170; +sub.f64 fd176, fd170, fd174; +mul.f64 fd177, fd147, 0d3FE9E3779B97F4A8; +sub.f64 fd178, %106, fd177; +fma.rn.f64 fd179, fd149, 0d3FD3C6EF372FE950, fd178; +mul.f64 fd180, fd171, 0d3FE2CF2304755A5E; +mul.f64 fd181, fd173, 0d3FEE6F0E134454FF; +sub.f64 fd182, fd180, fd181; +add.f64 fd183, fd182, fd179; +sub.f64 fd184, fd179, fd182; +add.f64 fd185, %66, %96; +add.f64 fd187, %76, %86; +add.f64 fd1694, %56, fd185; +add.f64 fd188, fd187, fd1694; +add.f64 fd189, %110, %109; +add.f64 fd191, %77, %111; +add.f64 fd1689, %108, fd189; +add.f64 fd192, fd191, fd1689; +fma.rn.f64 fd1687, fd185, 0d3FD3C6EF372FE950, %56; +mul.f64 fd1688, fd187, 0d3FE9E3779B97F4A8; +sub.f64 fd195, fd1687, fd1688; +sub.f64 fd196, %110, %109; +sub.f64 fd198, %77, %111; +mul.f64 fd1686, fd196, 0d3FEE6F0E134454FF; +fma.rn.f64 fd199, fd198, 0d3FE2CF2304755A5E, fd1686; +sub.f64 fd200, fd195, fd199; +add.f64 fd201, fd199, fd195; +mul.f64 fd202, fd185, 0d3FE9E3779B97F4A8; +sub.f64 fd203, %56, fd202; +fma.rn.f64 fd204, fd187, 0d3FD3C6EF372FE950, fd203; +mul.f64 fd205, fd196, 0d3FE2CF2304755A5E; +mul.f64 fd206, fd198, 0d3FEE6F0E134454FF; +sub.f64 fd207, fd205, fd206; +sub.f64 fd208, fd204, fd207; +add.f64 fd209, fd207, fd204; +fma.rn.f64 fd1684, fd189, 0d3FD3C6EF372FE950, %108; +mul.f64 fd1685, fd191, 0d3FE9E3779B97F4A8; +sub.f64 fd212, fd1684, fd1685; +sub.f64 fd213, %66, %96; +sub.f64 fd215, %76, %86; +mul.f64 fd1683, fd213, 0d3FEE6F0E134454FF; +fma.rn.f64 fd216, fd215, 0d3FE2CF2304755A5E, fd1683; +add.f64 fd217, fd216, fd212; +sub.f64 fd218, fd212, fd216; +mul.f64 fd219, fd189, 0d3FE9E3779B97F4A8; +sub.f64 fd220, %108, fd219; +fma.rn.f64 fd221, fd191, 0d3FD3C6EF372FE950, fd220; +mul.f64 fd222, fd213, 0d3FE2CF2304755A5E; 
+mul.f64 fd223, fd215, 0d3FEE6F0E134454FF; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd224, fd221; +sub.f64 fd226, fd221, fd224; +add.f64 fd227, %68, %98; +add.f64 fd229, %78, %88; +add.f64 fd1682, %58, fd227; +add.f64 fd230, fd229, fd1682; +add.f64 fd231, %113, %112; +add.f64 fd233, %114, %89; +add.f64 fd1678, %59, fd231; +add.f64 fd234, fd233, fd1678; +mul.f64 fd236, fd229, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1677, fd227, 0d3FD3C6EF372FE950, %58; +sub.f64 fd237, fd1677, fd236; +sub.f64 fd238, %113, %112; +sub.f64 fd240, %114, %89; +mul.f64 fd1676, fd238, 0d3FEE6F0E134454FF; +fma.rn.f64 fd241, fd240, 0d3FE2CF2304755A5E, fd1676; +sub.f64 fd242, fd237, fd241; +add.f64 fd243, fd241, fd237; +mul.f64 fd244, fd227, 0d3FE9E3779B97F4A8; +sub.f64 fd245, %58, fd244; +fma.rn.f64 fd246, fd229, 0d3FD3C6EF372FE950, fd245; +mul.f64 fd247, fd238, 0d3FE2CF2304755A5E; +mul.f64 fd248, fd240, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd247, fd248; +sub.f64 fd250, fd246, fd249; +add.f64 fd251, fd249, fd246; +mul.f64 fd253, fd233, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1675, fd231, 0d3FD3C6EF372FE950, %59; +sub.f64 fd254, fd1675, fd253; +sub.f64 fd255, %68, %98; +sub.f64 fd257, %78, %88; +mul.f64 fd1674, fd255, 0d3FEE6F0E134454FF; +fma.rn.f64 fd258, fd257, 0d3FE2CF2304755A5E, fd1674; +add.f64 fd259, fd258, fd254; +sub.f64 fd260, fd254, fd258; +mul.f64 fd261, fd231, 0d3FE9E3779B97F4A8; +sub.f64 fd262, %59, fd261; +fma.rn.f64 fd263, fd233, 0d3FD3C6EF372FE950, fd262; +mul.f64 fd264, fd255, 0d3FE2CF2304755A5E; +mul.f64 fd265, fd257, 0d3FEE6F0E134454FF; +sub.f64 fd266, fd264, fd265; +add.f64 fd267, fd266, fd263; +sub.f64 fd268, fd263, fd266; +add.f64 fd269, %70, %100; +add.f64 fd271, %80, %90; +add.f64 fd1673, %60, fd269; +add.f64 fd272, fd271, fd1673; +add.f64 fd273, %71, %101; +add.f64 fd275, %117, %115; +add.f64 fd1669, %116, fd273; +add.f64 fd276, fd275, fd1669; +mul.f64 fd278, fd271, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1668, fd269, 0d3FD3C6EF372FE950, %60; +sub.f64 fd279, fd1668, fd278; +sub.f64 
fd280, %71, %101; +sub.f64 fd282, %117, %115; +mul.f64 fd1667, fd280, 0d3FEE6F0E134454FF; +fma.rn.f64 fd283, fd282, 0d3FE2CF2304755A5E, fd1667; +sub.f64 fd284, fd279, fd283; +add.f64 fd285, fd283, fd279; +mul.f64 fd286, fd269, 0d3FE9E3779B97F4A8; +sub.f64 fd287, %60, fd286; +fma.rn.f64 fd288, fd271, 0d3FD3C6EF372FE950, fd287; +mul.f64 fd289, fd280, 0d3FE2CF2304755A5E; +mul.f64 fd290, fd282, 0d3FEE6F0E134454FF; +sub.f64 fd291, fd289, fd290; +sub.f64 fd292, fd288, fd291; +add.f64 fd293, fd291, fd288; +mul.f64 fd295, fd275, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1666, fd273, 0d3FD3C6EF372FE950, %116; +sub.f64 fd296, fd1666, fd295; +sub.f64 fd297, %70, %100; +sub.f64 fd299, %80, %90; +mul.f64 fd1665, fd297, 0d3FEE6F0E134454FF; +fma.rn.f64 fd300, fd299, 0d3FE2CF2304755A5E, fd1665; +add.f64 fd301, fd300, fd296; +sub.f64 fd302, fd296, fd300; +mul.f64 fd303, fd273, 0d3FE9E3779B97F4A8; +sub.f64 fd304, %116, fd303; +fma.rn.f64 fd305, fd275, 0d3FD3C6EF372FE950, fd304; +mul.f64 fd306, fd297, 0d3FE2CF2304755A5E; +mul.f64 fd307, fd299, 0d3FEE6F0E134454FF; +sub.f64 fd308, fd306, fd307; +add.f64 fd309, fd308, fd305; +sub.f64 fd310, fd305, fd308; +mul.f64 fd1663, fd158, 0d3FEEFEA21D101EE0; +mul.f64 fd1664, fd175, 0d3FCFD511FA1C0796; +sub.f64 fd313, fd1663, fd1664; +mul.f64 fd314, fd175, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd315, fd158, 0d3FCFD511FA1C0796, fd314; +mul.f64 fd1661, fd200, 0d3FEC0AB44E81C059; +mul.f64 fd1662, fd217, 0d3FDED50D5CBFA951; +sub.f64 fd318, fd1661, fd1662; +mul.f64 fd319, fd217, 0d3FEC0AB44E81C059; +fma.rn.f64 fd320, fd200, 0d3FDED50D5CBFA951, fd319; +mul.f64 fd322, fd259, 0d3FE5E7CF55112014; +mul.f64 fd1660, fd242, 0d3FE753B603D2B816; +sub.f64 fd323, fd1660, fd322; +mul.f64 fd324, fd259, 0d3FE753B603D2B816; +fma.rn.f64 fd325, fd242, 0d3FE5E7CF55112014, fd324; +mul.f64 fd327, fd301, 0d3FEB04BBFF642E86; +mul.f64 fd1659, fd284, 0d3FE1257E3C182B51; +sub.f64 fd328, fd1659, fd327; +mul.f64 fd329, fd301, 0d3FE1257E3C182B51; +fma.rn.f64 fd330, fd284, 0d3FEB04BBFF642E86, 
fd329; +mul.f64 fd332, fd183, 0d3FDED50D5CBFA951; +mul.f64 fd1658, fd166, 0d3FEC0AB44E81C059; +sub.f64 fd333, fd1658, fd332; +mul.f64 fd334, fd183, 0d3FEC0AB44E81C059; +fma.rn.f64 fd335, fd166, 0d3FDED50D5CBFA951, fd334; +mul.f64 fd337, fd225, 0d3FEB04BBFF642E86; +mul.f64 fd1657, fd208, 0d3FE1257E3C182B51; +sub.f64 fd338, fd1657, fd337; +mul.f64 fd339, fd225, 0d3FE1257E3C182B51; +fma.rn.f64 fd340, fd208, 0d3FEB04BBFF642E86, fd339; +mul.f64 fd342, fd267, 0d3FEFEFD5BFE443FE; +mul.f64 fd1656, fd250, 0d3FB0130A1BE09379; +sub.f64 fd343, fd1656, fd342; +mul.f64 fd344, fd267, 0d3FB0130A1BE09379; +fma.rn.f64 fd345, fd250, 0d3FEFEFD5BFE443FE, fd344; +mul.f64 fd1654, fd292, 0dBFDB3FF7C925819C; +mul.f64 fd1655, fd309, 0d3FECF457DCDC158C; +sub.f64 fd348, fd1654, fd1655; +mul.f64 fd349, fd309, 0dBFDB3FF7C925819C; +fma.rn.f64 fd350, fd292, 0d3FECF457DCDC158C, fd349; +mul.f64 fd1652, fd167, 0d3FE753B603D2B816; +mul.f64 fd1653, fd184, 0d3FE5E7CF55112014; +sub.f64 fd353, fd1652, fd1653; +mul.f64 fd354, fd184, 0d3FE753B603D2B816; +fma.rn.f64 fd355, fd167, 0d3FE5E7CF55112014, fd354; +mul.f64 fd1650, fd209, 0d3FB0130A1BE09379; +mul.f64 fd1651, fd226, 0d3FEFEFD5BFE443FE; +sub.f64 fd358, fd1650, fd1651; +mul.f64 fd359, fd226, 0d3FB0130A1BE09379; +fma.rn.f64 fd360, fd209, 0d3FEFEFD5BFE443FE, fd359; +mul.f64 fd1648, fd251, 0dBFE465C6FEB501BC; +mul.f64 fd1649, fd268, 0d3FE8A80B635B6BEA; +sub.f64 fd363, fd1648, fd1649; +mul.f64 fd364, fd268, 0dBFE465C6FEB501BC; +fma.rn.f64 fd365, fd251, 0d3FE8A80B635B6BEA, fd364; +mul.f64 fd367, fd310, 0d3FC00AEB5DA15BE0; +mul.f64 fd1647, fd293, 0dBFEFBF675480D903; +sub.f64 fd368, fd1647, fd367; +mul.f64 fd369, fd310, 0dBFEFBF675480D903; +fma.rn.f64 fd370, fd293, 0d3FC00AEB5DA15BE0, fd369; +mul.f64 fd372, fd176, 0d3FEB04BBFF642E86; +mul.f64 fd1646, fd159, 0d3FE1257E3C182B51; +sub.f64 fd373, fd1646, fd372; +mul.f64 fd374, fd176, 0d3FE1257E3C182B51; +fma.rn.f64 fd375, fd159, 0d3FEB04BBFF642E86, fd374; +mul.f64 fd377, fd218, 0d3FECF457DCDC158C; +mul.f64 
fd1645, fd201, 0dBFDB3FF7C925819C; +sub.f64 fd378, fd1645, fd377; +mul.f64 fd379, fd218, 0dBFDB3FF7C925819C; +fma.rn.f64 fd380, fd201, 0d3FECF457DCDC158C, fd379; +mul.f64 fd382, fd260, 0d3FC00AEB5DA15BE0; +mul.f64 fd1644, fd243, 0dBFEFBF675480D903; +sub.f64 fd383, fd1644, fd382; +mul.f64 fd384, fd260, 0dBFEFBF675480D903; +fma.rn.f64 fd385, fd243, 0d3FC00AEB5DA15BE0, fd384; +mul.f64 fd387, fd302, 0dBFE8A80B635B6BEA; +mul.f64 fd1643, fd285, 0dBFE465C6FEB501BC; +sub.f64 fd388, fd1643, fd387; +mul.f64 fd389, fd302, 0dBFE465C6FEB501BC; +fma.rn.f64 fd390, fd285, 0dBFE8A80B635B6BEA, fd389; +add.f64 fd391, fd146, fd272; +add.f64 fd393, fd188, fd230; +mul.f64 fd398, fd393, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1642, fd391, 0d3FD3C6EF372FE950, fd104; +sub.f64 fd399, fd1642, fd398; +add.f64 fd1641, fd150, fd276; +sub.f64 fd400, fd150, fd276; +add.f64 fd1640, fd192, fd234; +sub.f64 fd402, fd192, fd234; +mul.f64 fd1639, fd400, 0d3FEE6F0E134454FF; +fma.rn.f64 fd403, fd402, 0d3FE2CF2304755A5E, fd1639; +sub.f64 fd404, fd399, fd403; +add.f64 fd405, fd403, fd399; +add.f64 fd1638, fd104, fd391; +mul.f64 fd406, fd391, 0d3FE9E3779B97F4A8; +sub.f64 fd407, fd104, fd406; +fma.rn.f64 fd408, fd393, 0d3FD3C6EF372FE950, fd407; +mul.f64 fd409, fd400, 0d3FE2CF2304755A5E; +mul.f64 fd410, fd402, 0d3FEE6F0E134454FF; +sub.f64 fd411, fd409, fd410; +sub.f64 fd412, fd408, fd411; +add.f64 fd413, fd411, fd408; +mul.f64 fd415, fd1640, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1637, fd1641, 0d3FD3C6EF372FE950, fd108; +sub.f64 fd416, fd1637, fd415; +sub.f64 fd417, fd146, fd272; +sub.f64 fd419, fd188, fd230; +mul.f64 fd1636, fd417, 0d3FEE6F0E134454FF; +fma.rn.f64 fd420, fd419, 0d3FE2CF2304755A5E, fd1636; +add.f64 fd421, fd420, fd416; +sub.f64 fd422, fd416, fd420; +add.f64 fd1635, fd108, fd1641; +mul.f64 fd423, fd1641, 0d3FE9E3779B97F4A8; +sub.f64 fd424, fd108, fd423; +fma.rn.f64 fd425, fd1640, 0d3FD3C6EF372FE950, fd424; +mul.f64 fd426, fd417, 0d3FE2CF2304755A5E; +mul.f64 fd427, fd419, 0d3FEE6F0E134454FF; +sub.f64 
fd428, fd426, fd427; +add.f64 fd429, fd428, fd425; +sub.f64 fd430, fd425, fd428; +add.f64 fd431, fd313, fd328; +add.f64 fd433, fd318, fd323; +add.f64 fd1634, fd116, fd431; +add.f64 fd434, fd433, fd1634; +add.f64 fd435, fd315, fd330; +add.f64 fd437, fd320, fd325; +add.f64 fd1633, fd133, fd435; +add.f64 fd438, fd437, fd1633; +fma.rn.f64 fd1631, fd431, 0d3FD3C6EF372FE950, fd116; +mul.f64 fd1632, fd433, 0d3FE9E3779B97F4A8; +sub.f64 fd441, fd1631, fd1632; +sub.f64 fd442, fd315, fd330; +sub.f64 fd444, fd320, fd325; +mul.f64 fd1630, fd442, 0d3FEE6F0E134454FF; +fma.rn.f64 fd445, fd444, 0d3FE2CF2304755A5E, fd1630; +sub.f64 fd446, fd441, fd445; +add.f64 fd447, fd445, fd441; +mul.f64 fd448, fd431, 0d3FE9E3779B97F4A8; +sub.f64 fd449, fd116, fd448; +fma.rn.f64 fd450, fd433, 0d3FD3C6EF372FE950, fd449; +mul.f64 fd451, fd442, 0d3FE2CF2304755A5E; +mul.f64 fd452, fd444, 0d3FEE6F0E134454FF; +sub.f64 fd453, fd451, fd452; +sub.f64 fd454, fd450, fd453; +add.f64 fd455, fd453, fd450; +mul.f64 fd457, fd437, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1629, fd435, 0d3FD3C6EF372FE950, fd133; +sub.f64 fd458, fd1629, fd457; +sub.f64 fd459, fd313, fd328; +sub.f64 fd461, fd318, fd323; +mul.f64 fd1628, fd459, 0d3FEE6F0E134454FF; +fma.rn.f64 fd462, fd461, 0d3FE2CF2304755A5E, fd1628; +add.f64 fd463, fd462, fd458; +sub.f64 fd464, fd458, fd462; +mul.f64 fd465, fd435, 0d3FE9E3779B97F4A8; +sub.f64 fd466, fd133, fd465; +fma.rn.f64 fd467, fd437, 0d3FD3C6EF372FE950, fd466; +mul.f64 fd468, fd459, 0d3FE2CF2304755A5E; +mul.f64 fd469, fd461, 0d3FEE6F0E134454FF; +sub.f64 fd470, fd468, fd469; +add.f64 fd471, fd470, fd467; +sub.f64 fd472, fd467, fd470; +add.f64 fd473, fd333, fd348; +add.f64 fd475, fd338, fd343; +add.f64 fd1627, fd124, fd473; +add.f64 fd476, fd475, fd1627; +add.f64 fd477, fd335, fd350; +add.f64 fd479, fd340, fd345; +add.f64 fd1626, fd141, fd477; +add.f64 fd480, fd479, fd1626; +fma.rn.f64 fd1624, fd473, 0d3FD3C6EF372FE950, fd124; +mul.f64 fd1625, fd475, 0d3FE9E3779B97F4A8; +sub.f64 fd483, fd1624, fd1625; 
+sub.f64 fd484, fd335, fd350; +sub.f64 fd486, fd340, fd345; +mul.f64 fd1623, fd484, 0d3FEE6F0E134454FF; +fma.rn.f64 fd487, fd486, 0d3FE2CF2304755A5E, fd1623; +sub.f64 fd488, fd483, fd487; +add.f64 fd489, fd487, fd483; +mul.f64 fd490, fd473, 0d3FE9E3779B97F4A8; +sub.f64 fd491, fd124, fd490; +fma.rn.f64 fd492, fd475, 0d3FD3C6EF372FE950, fd491; +mul.f64 fd493, fd484, 0d3FE2CF2304755A5E; +mul.f64 fd494, fd486, 0d3FEE6F0E134454FF; +sub.f64 fd495, fd493, fd494; +sub.f64 fd496, fd492, fd495; +add.f64 fd497, fd495, fd492; +fma.rn.f64 fd1621, fd477, 0d3FD3C6EF372FE950, fd141; +mul.f64 fd1622, fd479, 0d3FE9E3779B97F4A8; +sub.f64 fd500, fd1621, fd1622; +sub.f64 fd501, fd333, fd348; +sub.f64 fd503, fd338, fd343; +mul.f64 fd1620, fd501, 0d3FEE6F0E134454FF; +fma.rn.f64 fd504, fd503, 0d3FE2CF2304755A5E, fd1620; +add.f64 fd505, fd504, fd500; +sub.f64 fd506, fd500, fd504; +mul.f64 fd507, fd477, 0d3FE9E3779B97F4A8; +sub.f64 fd508, fd141, fd507; +fma.rn.f64 fd509, fd479, 0d3FD3C6EF372FE950, fd508; +mul.f64 fd510, fd501, 0d3FE2CF2304755A5E; +mul.f64 fd511, fd503, 0d3FEE6F0E134454FF; +sub.f64 fd512, fd510, fd511; +add.f64 fd513, fd512, fd509; +sub.f64 fd514, fd509, fd512; +add.f64 fd515, fd353, fd368; +add.f64 fd517, fd358, fd363; +add.f64 fd1619, fd125, fd515; +add.f64 fd518, fd517, fd1619; +add.f64 fd519, fd355, fd370; +add.f64 fd521, fd360, fd365; +add.f64 fd1618, fd142, fd519; +add.f64 fd522, fd521, fd1618; +mul.f64 fd524, fd517, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1617, fd515, 0d3FD3C6EF372FE950, fd125; +sub.f64 fd525, fd1617, fd524; +sub.f64 fd526, fd355, fd370; +sub.f64 fd528, fd360, fd365; +mul.f64 fd1616, fd526, 0d3FEE6F0E134454FF; +fma.rn.f64 fd529, fd528, 0d3FE2CF2304755A5E, fd1616; +sub.f64 fd530, fd525, fd529; +add.f64 fd531, fd529, fd525; +mul.f64 fd532, fd515, 0d3FE9E3779B97F4A8; +sub.f64 fd533, fd125, fd532; +fma.rn.f64 fd534, fd517, 0d3FD3C6EF372FE950, fd533; +mul.f64 fd535, fd526, 0d3FE2CF2304755A5E; +mul.f64 fd536, fd528, 0d3FEE6F0E134454FF; +sub.f64 fd537, fd535, 
fd536; +sub.f64 fd538, fd534, fd537; +add.f64 fd539, fd537, fd534; +mul.f64 fd541, fd521, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1615, fd519, 0d3FD3C6EF372FE950, fd142; +sub.f64 fd542, fd1615, fd541; +sub.f64 fd543, fd353, fd368; +sub.f64 fd545, fd358, fd363; +mul.f64 fd1614, fd543, 0d3FEE6F0E134454FF; +fma.rn.f64 fd546, fd545, 0d3FE2CF2304755A5E, fd1614; +add.f64 fd547, fd546, fd542; +sub.f64 fd548, fd542, fd546; +mul.f64 fd549, fd519, 0d3FE9E3779B97F4A8; +sub.f64 fd550, fd142, fd549; +fma.rn.f64 fd551, fd521, 0d3FD3C6EF372FE950, fd550; +mul.f64 fd552, fd543, 0d3FE2CF2304755A5E; +mul.f64 fd553, fd545, 0d3FEE6F0E134454FF; +sub.f64 fd554, fd552, fd553; +add.f64 fd555, fd554, fd551; +sub.f64 fd556, fd551, fd554; +add.f64 fd557, fd373, fd388; +add.f64 fd559, fd378, fd383; +add.f64 fd1613, fd117, fd557; +add.f64 fd560, fd559, fd1613; +add.f64 fd561, fd375, fd390; +add.f64 fd563, fd380, fd385; +add.f64 fd1612, fd134, fd561; +add.f64 fd564, fd563, fd1612; +mul.f64 fd566, fd559, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1611, fd557, 0d3FD3C6EF372FE950, fd117; +sub.f64 fd567, fd1611, fd566; +sub.f64 fd568, fd375, fd390; +sub.f64 fd570, fd380, fd385; +mul.f64 fd1610, fd568, 0d3FEE6F0E134454FF; +fma.rn.f64 fd571, fd570, 0d3FE2CF2304755A5E, fd1610; +sub.f64 fd572, fd567, fd571; +add.f64 fd573, fd571, fd567; +mul.f64 fd574, fd557, 0d3FE9E3779B97F4A8; +sub.f64 fd575, fd117, fd574; +fma.rn.f64 fd576, fd559, 0d3FD3C6EF372FE950, fd575; +mul.f64 fd577, fd568, 0d3FE2CF2304755A5E; +mul.f64 fd578, fd570, 0d3FEE6F0E134454FF; +sub.f64 fd579, fd577, fd578; +sub.f64 fd580, fd576, fd579; +add.f64 fd581, fd579, fd576; +mul.f64 fd583, fd563, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1609, fd561, 0d3FD3C6EF372FE950, fd134; +sub.f64 fd584, fd1609, fd583; +sub.f64 fd585, fd373, fd388; +sub.f64 fd587, fd378, fd383; +mul.f64 fd1608, fd585, 0d3FEE6F0E134454FF; +fma.rn.f64 fd588, fd587, 0d3FE2CF2304755A5E, fd1608; +add.f64 fd589, fd588, fd584; +sub.f64 fd590, fd584, fd588; +mul.f64 fd591, fd561, 0d3FE9E3779B97F4A8; 
+sub.f64 fd592, fd134, fd591; +fma.rn.f64 fd593, fd563, 0d3FD3C6EF372FE950, fd592; +mul.f64 fd594, fd585, 0d3FE2CF2304755A5E; +mul.f64 fd595, fd587, 0d3FEE6F0E134454FF; +sub.f64 fd596, fd594, fd595; +add.f64 fd597, fd596, fd593; +sub.f64 fd598, fd593, fd596; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, 1374389535; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 25; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 10000, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %51; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd599, fd600}, [rd6]; +mul.f64 fd603, fd438, fd600; +mul.f64 fd605, fd599, fd438; +mul.f64 fd607, fd600, fd600; +mul.f64 fd1607, fd599, fd599; +sub.f64 fd608, fd1607, fd607; +mul.f64 fd609, fd600, fd599; +fma.rn.f64 fd610, fd600, fd599, fd609; +mul.f64 fd611, fd480, fd610; +mul.f64 fd613, fd608, fd480; +mul.f64 fd615, fd600, fd610; +mul.f64 fd1606, fd599, fd608; +sub.f64 fd616, fd1606, fd615; +mul.f64 fd1605, fd476, fd610; +mul.f64 fd617, fd599, fd610; +fma.rn.f64 fd618, fd600, fd608, fd617; +mul.f64 fd619, fd522, fd618; +mul.f64 fd621, fd616, fd522; +mul.f64 fd1603, fd599, fd616; +mul.f64 fd1604, fd600, fd618; +sub.f64 fd624, fd1603, fd1604; +mul.f64 fd1602, fd518, fd618; +mul.f64 fd625, fd599, fd618; +fma.rn.f64 fd626, fd600, fd616, fd625; +mul.f64 fd627, fd564, fd626; +mul.f64 fd629, fd624, fd564; +mul.f64 fd631, fd600, fd626; +mul.f64 fd1601, fd599, fd624; +sub.f64 fd632, fd1601, fd631; +mul.f64 fd1600, fd560, fd626; +mul.f64 fd633, fd599, fd626; +fma.rn.f64 fd634, fd600, fd624, fd633; +mul.f64 fd635, fd421, fd634; +mul.f64 fd637, fd632, fd421; +mul.f64 fd1598, fd599, fd632; +mul.f64 fd1599, fd600, fd634; +sub.f64 fd640, fd1598, fd1599; +mul.f64 fd1597, fd404, fd634; +mul.f64 fd641, fd599, fd634; +fma.rn.f64 fd642, fd600, fd632, fd641; +mul.f64 fd643, fd463, fd642; +mul.f64 fd645, fd640, fd463; +mul.f64 fd647, fd600, fd642; +mul.f64 fd1596, fd599, fd640; +sub.f64 fd648, fd1596, fd647; +mul.f64 fd1595, fd446, fd642; +mul.f64 fd649, fd599, 
fd642; +fma.rn.f64 fd650, fd600, fd640, fd649; +mul.f64 fd651, fd505, fd650; +mul.f64 fd653, fd648, fd505; +mul.f64 fd655, fd600, fd650; +mul.f64 fd1594, fd599, fd648; +sub.f64 fd656, fd1594, fd655; +mul.f64 fd1593, fd488, fd650; +mul.f64 fd657, fd599, fd650; +fma.rn.f64 fd658, fd600, fd648, fd657; +mul.f64 fd659, fd547, fd658; +mul.f64 fd661, fd656, fd547; +mul.f64 fd1591, fd599, fd656; +mul.f64 fd1592, fd600, fd658; +sub.f64 fd664, fd1591, fd1592; +mul.f64 fd1590, fd530, fd658; +mul.f64 fd665, fd599, fd658; +fma.rn.f64 fd666, fd600, fd656, fd665; +mul.f64 fd667, fd589, fd666; +mul.f64 fd669, fd664, fd589; +mul.f64 fd671, fd600, fd666; +mul.f64 fd1589, fd599, fd664; +sub.f64 fd672, fd1589, fd671; +mul.f64 fd1588, fd572, fd666; +mul.f64 fd673, fd599, fd666; +fma.rn.f64 fd674, fd600, fd664, fd673; +mul.f64 fd675, fd429, fd674; +mul.f64 fd677, fd672, fd429; +mul.f64 fd679, fd600, fd674; +mul.f64 fd1587, fd599, fd672; +sub.f64 fd680, fd1587, fd679; +mul.f64 fd1586, fd412, fd674; +mul.f64 fd681, fd599, fd674; +fma.rn.f64 fd682, fd600, fd672, fd681; +mul.f64 fd683, fd471, fd682; +mul.f64 fd685, fd680, fd471; +mul.f64 fd1584, fd599, fd680; +mul.f64 fd1585, fd600, fd682; +sub.f64 fd688, fd1584, fd1585; +mul.f64 fd1583, fd454, fd682; +mul.f64 fd689, fd599, fd682; +fma.rn.f64 fd690, fd600, fd680, fd689; +mul.f64 fd691, fd513, fd690; +mul.f64 fd692, fd496, fd690; +mul.f64 fd693, fd688, fd513; +ld.global.v2.f64 {fd694, fd695}, [rd6+400]; +mul.f64 fd698, fd555, fd695; +mul.f64 fd700, fd694, fd555; +mul.f64 fd702, fd600, fd695; +mul.f64 fd1582, fd599, fd694; +sub.f64 fd703, fd1582, fd702; +mul.f64 fd1581, fd538, fd695; +mul.f64 fd704, fd599, fd695; +fma.rn.f64 fd705, fd600, fd694, fd704; +mul.f64 fd706, fd597, fd705; +mul.f64 fd708, fd703, fd597; +mul.f64 fd1579, fd599, fd703; +mul.f64 fd1580, fd600, fd705; +sub.f64 fd711, fd1579, fd1580; +mul.f64 fd1578, fd580, fd705; +mul.f64 fd712, fd599, fd705; +fma.rn.f64 fd713, fd600, fd703, fd712; +mul.f64 fd714, fd430, fd713; +mul.f64 
fd716, fd711, fd430; +mul.f64 fd1576, fd599, fd711; +mul.f64 fd1577, fd600, fd713; +sub.f64 fd719, fd1576, fd1577; +mul.f64 fd1575, fd413, fd713; +mul.f64 fd720, fd599, fd713; +fma.rn.f64 fd721, fd600, fd711, fd720; +mul.f64 fd722, fd472, fd721; +mul.f64 fd724, fd719, fd472; +mul.f64 fd726, fd600, fd721; +mul.f64 fd1574, fd599, fd719; +sub.f64 fd727, fd1574, fd726; +mul.f64 fd1573, fd455, fd721; +mul.f64 fd728, fd599, fd721; +fma.rn.f64 fd729, fd600, fd719, fd728; +mul.f64 fd730, fd514, fd729; +mul.f64 fd732, fd727, fd514; +mul.f64 fd1571, fd599, fd727; +mul.f64 fd1572, fd600, fd729; +sub.f64 fd735, fd1571, fd1572; +mul.f64 fd1570, fd497, fd729; +mul.f64 fd736, fd599, fd729; +fma.rn.f64 fd737, fd600, fd727, fd736; +mul.f64 fd738, fd556, fd737; +mul.f64 fd740, fd735, fd556; +mul.f64 fd742, fd600, fd737; +mul.f64 fd1569, fd599, fd735; +sub.f64 fd743, fd1569, fd742; +mul.f64 fd1568, fd539, fd737; +mul.f64 fd744, fd599, fd737; +fma.rn.f64 fd745, fd600, fd735, fd744; +mul.f64 fd746, fd598, fd745; +mul.f64 fd748, fd743, fd598; +mul.f64 fd750, fd600, fd745; +mul.f64 fd1567, fd599, fd743; +sub.f64 fd751, fd1567, fd750; +mul.f64 fd1566, fd581, fd745; +mul.f64 fd752, fd599, fd745; +fma.rn.f64 fd753, fd600, fd743, fd752; +mul.f64 fd754, fd422, fd753; +mul.f64 fd756, fd751, fd422; +mul.f64 fd1564, fd599, fd751; +mul.f64 fd1565, fd600, fd753; +sub.f64 fd759, fd1564, fd1565; +mul.f64 fd1563, fd405, fd753; +mul.f64 fd760, fd599, fd753; +fma.rn.f64 fd761, fd600, fd751, fd760; +mul.f64 fd762, fd464, fd761; +mul.f64 fd764, fd759, fd464; +mul.f64 fd766, fd600, fd761; +mul.f64 fd1562, fd599, fd759; +sub.f64 fd767, fd1562, fd766; +mul.f64 fd1561, fd447, fd761; +mul.f64 fd768, fd599, fd761; +fma.rn.f64 fd769, fd600, fd759, fd768; +mul.f64 fd770, fd506, fd769; +mul.f64 fd772, fd767, fd506; +mul.f64 fd1559, fd599, fd767; +mul.f64 fd1560, fd600, fd769; +sub.f64 fd775, fd1559, fd1560; +mul.f64 fd1558, fd489, fd769; +mul.f64 fd776, fd599, fd769; +fma.rn.f64 fd777, fd600, fd767, fd776; 
+mul.f64 fd778, fd548, fd777; +mul.f64 fd780, fd775, fd548; +mul.f64 fd1556, fd599, fd775; +mul.f64 fd1557, fd600, fd777; +sub.f64 fd783, fd1556, fd1557; +mul.f64 fd1555, fd531, fd777; +mul.f64 fd784, fd599, fd777; +mul.f64 fd1554, fd434, fd600; +fma.rn.f64 fd785, fd600, fd775, fd784; +mul.f64 fd786, fd590, fd785; +mul.f64 fd787, fd573, fd785; +mul.f64 fd788, fd783, fd590; +barrier.sync 0; +mad.lo.s32 r9, r7, 400, r8; +add.f64 fd789, fd1640, fd1635; +add.f64 fd790, fd393, fd1638; +st.shared.v2.f64 [r9], {fd790, fd789}; +fma.rn.f64 fd791, fd599, fd434, fd603; +sub.f64 fd792, fd605, fd1554; +st.shared.v2.f64 [r9+16], {fd791, fd792}; +fma.rn.f64 fd793, fd608, fd476, fd611; +sub.f64 fd794, fd613, fd1605; +st.shared.v2.f64 [r9+32], {fd793, fd794}; +fma.rn.f64 fd795, fd616, fd518, fd619; +sub.f64 fd796, fd621, fd1602; +st.shared.v2.f64 [r9+48], {fd795, fd796}; +fma.rn.f64 fd797, fd624, fd560, fd627; +sub.f64 fd798, fd629, fd1600; +st.shared.v2.f64 [r9+64], {fd797, fd798}; +sub.f64 fd799, fd637, fd1597; +fma.rn.f64 fd800, fd632, fd404, fd635; +st.shared.v2.f64 [r9+80], {fd800, fd799}; +fma.rn.f64 fd801, fd640, fd446, fd643; +sub.f64 fd802, fd645, fd1595; +st.shared.v2.f64 [r9+96], {fd801, fd802}; +sub.f64 fd803, fd653, fd1593; +fma.rn.f64 fd804, fd648, fd488, fd651; +st.shared.v2.f64 [r9+112], {fd804, fd803}; +fma.rn.f64 fd805, fd656, fd530, fd659; +sub.f64 fd806, fd661, fd1590; +st.shared.v2.f64 [r9+128], {fd805, fd806}; +fma.rn.f64 fd807, fd664, fd572, fd667; +sub.f64 fd808, fd669, fd1588; +st.shared.v2.f64 [r9+144], {fd807, fd808}; +fma.rn.f64 fd809, fd672, fd412, fd675; +sub.f64 fd810, fd677, fd1586; +st.shared.v2.f64 [r9+160], {fd809, fd810}; +fma.rn.f64 fd811, fd680, fd454, fd683; +sub.f64 fd812, fd685, fd1583; +st.shared.v2.f64 [r9+176], {fd811, fd812}; +fma.rn.f64 fd813, fd688, fd496, fd691; +sub.f64 fd814, fd693, fd692; +st.shared.v2.f64 [r9+192], {fd813, fd814}; +fma.rn.f64 fd815, fd694, fd538, fd698; +sub.f64 fd816, fd700, fd1581; +st.shared.v2.f64 [r9+208], 
{fd815, fd816}; +fma.rn.f64 fd817, fd703, fd580, fd706; +sub.f64 fd818, fd708, fd1578; +st.shared.v2.f64 [r9+224], {fd817, fd818}; +fma.rn.f64 fd819, fd711, fd413, fd714; +sub.f64 fd820, fd716, fd1575; +st.shared.v2.f64 [r9+240], {fd819, fd820}; +fma.rn.f64 fd821, fd719, fd455, fd722; +sub.f64 fd822, fd724, fd1573; +st.shared.v2.f64 [r9+256], {fd821, fd822}; +fma.rn.f64 fd823, fd727, fd497, fd730; +sub.f64 fd824, fd732, fd1570; +st.shared.v2.f64 [r9+272], {fd823, fd824}; +fma.rn.f64 fd825, fd735, fd539, fd738; +sub.f64 fd826, fd740, fd1568; +st.shared.v2.f64 [r9+288], {fd825, fd826}; +sub.f64 fd827, fd748, fd1566; +fma.rn.f64 fd828, fd743, fd581, fd746; +st.shared.v2.f64 [r9+304], {fd828, fd827}; +fma.rn.f64 fd829, fd751, fd405, fd754; +sub.f64 fd830, fd756, fd1563; +st.shared.v2.f64 [r9+320], {fd829, fd830}; +fma.rn.f64 fd831, fd759, fd447, fd762; +sub.f64 fd832, fd764, fd1561; +st.shared.v2.f64 [r9+336], {fd831, fd832}; +fma.rn.f64 fd833, fd767, fd489, fd770; +sub.f64 fd834, fd772, fd1558; +st.shared.v2.f64 [r9+352], {fd833, fd834}; +fma.rn.f64 fd835, fd775, fd531, fd778; +sub.f64 fd836, fd780, fd1555; +st.shared.v2.f64 [r9+368], {fd835, fd836}; +fma.rn.f64 fd837, fd783, fd573, fd786; +sub.f64 fd838, fd788, fd787; +st.shared.v2.f64 [r9+384], {fd837, fd838}; +barrier.sync 0; +mad.lo.s32 r10, r7, -384, r9; +ld.shared.v2.f64 {fd839, fd840}, [r10]; +ld.shared.v2.f64 {fd843, fd844}, [r10+400]; +ld.shared.v2.f64 {fd847, fd848}, [r10+800]; +ld.shared.v2.f64 {fd851, fd852}, [r10+1200]; +ld.shared.v2.f64 {fd855, fd856}, [r10+1600]; +ld.shared.v2.f64 {fd859, fd860}, [r10+2000]; +ld.shared.v2.f64 {fd863, fd864}, [r10+2400]; +ld.shared.v2.f64 {fd867, fd868}, [r10+2800]; +ld.shared.v2.f64 {fd871, fd872}, [r10+3200]; +ld.shared.v2.f64 {fd875, fd876}, [r10+3600]; +ld.shared.v2.f64 {fd879, fd880}, [r10+4000]; +ld.shared.v2.f64 {fd883, fd884}, [r10+4400]; +ld.shared.v2.f64 {fd887, fd888}, [r10+4800]; +ld.shared.v2.f64 {fd891, fd892}, [r10+5200]; +ld.shared.v2.f64 {fd895, fd896}, 
[r10+5600]; +ld.shared.v2.f64 {fd899, fd900}, [r10+6000]; +ld.shared.v2.f64 {fd903, fd904}, [r10+6400]; +ld.shared.v2.f64 {fd907, fd908}, [r10+6800]; +ld.shared.v2.f64 {fd911, fd912}, [r10+7200]; +ld.shared.v2.f64 {fd915, fd916}, [r10+7600]; +ld.shared.v2.f64 {fd919, fd920}, [r10+8000]; +ld.shared.v2.f64 {fd923, fd924}, [r10+8400]; +ld.shared.v2.f64 {fd927, fd928}, [r10+8800]; +ld.shared.v2.f64 {fd931, fd932}, [r10+9200]; +ld.shared.v2.f64 {fd935, fd936}, [r10+9600]; +add.f64 fd939, fd859, fd919; +add.f64 fd941, fd879, fd899; +add.f64 fd1553, fd839, fd939; +add.f64 fd942, fd941, fd1553; +add.f64 fd943, fd860, fd920; +add.f64 fd945, fd880, fd900; +add.f64 fd1552, fd840, fd943; +add.f64 fd946, fd945, fd1552; +fma.rn.f64 fd1550, fd939, 0d3FD3C6EF372FE950, fd839; +mul.f64 fd1551, fd941, 0d3FE9E3779B97F4A8; +sub.f64 fd949, fd1550, fd1551; +sub.f64 fd950, fd860, fd920; +sub.f64 fd952, fd880, fd900; +mul.f64 fd1549, fd950, 0d3FEE6F0E134454FF; +fma.rn.f64 fd953, fd952, 0d3FE2CF2304755A5E, fd1549; +sub.f64 fd954, fd949, fd953; +add.f64 fd955, fd953, fd949; +mul.f64 fd956, fd939, 0d3FE9E3779B97F4A8; +sub.f64 fd957, fd839, fd956; +fma.rn.f64 fd958, fd941, 0d3FD3C6EF372FE950, fd957; +mul.f64 fd959, fd950, 0d3FE2CF2304755A5E; +mul.f64 fd960, fd952, 0d3FEE6F0E134454FF; +sub.f64 fd961, fd959, fd960; +sub.f64 fd962, fd958, fd961; +add.f64 fd963, fd961, fd958; +mul.f64 fd965, fd945, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1548, fd943, 0d3FD3C6EF372FE950, fd840; +sub.f64 fd966, fd1548, fd965; +sub.f64 fd967, fd859, fd919; +sub.f64 fd969, fd879, fd899; +mul.f64 fd1547, fd967, 0d3FEE6F0E134454FF; +fma.rn.f64 fd970, fd969, 0d3FE2CF2304755A5E, fd1547; +add.f64 fd971, fd970, fd966; +sub.f64 fd972, fd966, fd970; +mul.f64 fd973, fd943, 0d3FE9E3779B97F4A8; +sub.f64 fd974, fd840, fd973; +fma.rn.f64 fd975, fd945, 0d3FD3C6EF372FE950, fd974; +mul.f64 fd976, fd967, 0d3FE2CF2304755A5E; +mul.f64 fd977, fd969, 0d3FEE6F0E134454FF; +sub.f64 fd978, fd976, fd977; +add.f64 fd979, fd978, fd975; +sub.f64 fd980, 
fd975, fd978; +add.f64 fd981, fd863, fd923; +add.f64 fd983, fd883, fd903; +add.f64 fd1546, fd843, fd981; +add.f64 fd984, fd983, fd1546; +add.f64 fd985, fd864, fd924; +add.f64 fd987, fd884, fd904; +add.f64 fd1545, fd844, fd985; +add.f64 fd988, fd987, fd1545; +mul.f64 fd990, fd983, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1544, fd981, 0d3FD3C6EF372FE950, fd843; +sub.f64 fd991, fd1544, fd990; +sub.f64 fd992, fd864, fd924; +sub.f64 fd994, fd884, fd904; +mul.f64 fd1543, fd992, 0d3FEE6F0E134454FF; +fma.rn.f64 fd995, fd994, 0d3FE2CF2304755A5E, fd1543; +sub.f64 fd996, fd991, fd995; +add.f64 fd997, fd995, fd991; +mul.f64 fd998, fd981, 0d3FE9E3779B97F4A8; +sub.f64 fd999, fd843, fd998; +fma.rn.f64 fd1000, fd983, 0d3FD3C6EF372FE950, fd999; +mul.f64 fd1001, fd992, 0d3FE2CF2304755A5E; +mul.f64 fd1002, fd994, 0d3FEE6F0E134454FF; +sub.f64 fd1003, fd1001, fd1002; +sub.f64 fd1004, fd1000, fd1003; +add.f64 fd1005, fd1003, fd1000; +fma.rn.f64 fd1541, fd985, 0d3FD3C6EF372FE950, fd844; +mul.f64 fd1542, fd987, 0d3FE9E3779B97F4A8; +sub.f64 fd1008, fd1541, fd1542; +sub.f64 fd1009, fd863, fd923; +sub.f64 fd1011, fd883, fd903; +mul.f64 fd1540, fd1009, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1012, fd1011, 0d3FE2CF2304755A5E, fd1540; +add.f64 fd1013, fd1012, fd1008; +sub.f64 fd1014, fd1008, fd1012; +mul.f64 fd1015, fd985, 0d3FE9E3779B97F4A8; +sub.f64 fd1016, fd844, fd1015; +fma.rn.f64 fd1017, fd987, 0d3FD3C6EF372FE950, fd1016; +mul.f64 fd1018, fd1009, 0d3FE2CF2304755A5E; +mul.f64 fd1019, fd1011, 0d3FEE6F0E134454FF; +sub.f64 fd1020, fd1018, fd1019; +add.f64 fd1021, fd1020, fd1017; +sub.f64 fd1022, fd1017, fd1020; +add.f64 fd1023, fd867, fd927; +add.f64 fd1025, fd887, fd907; +add.f64 fd1539, fd847, fd1023; +add.f64 fd1026, fd1025, fd1539; +add.f64 fd1027, fd868, fd928; +add.f64 fd1029, fd888, fd908; +add.f64 fd1538, fd848, fd1027; +add.f64 fd1030, fd1029, fd1538; +mul.f64 fd1032, fd1025, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1537, fd1023, 0d3FD3C6EF372FE950, fd847; +sub.f64 fd1033, fd1537, fd1032; +sub.f64 
fd1034, fd868, fd928; +sub.f64 fd1036, fd888, fd908; +mul.f64 fd1536, fd1034, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1037, fd1036, 0d3FE2CF2304755A5E, fd1536; +sub.f64 fd1038, fd1033, fd1037; +add.f64 fd1039, fd1037, fd1033; +mul.f64 fd1040, fd1023, 0d3FE9E3779B97F4A8; +sub.f64 fd1041, fd847, fd1040; +fma.rn.f64 fd1042, fd1025, 0d3FD3C6EF372FE950, fd1041; +mul.f64 fd1043, fd1034, 0d3FE2CF2304755A5E; +mul.f64 fd1044, fd1036, 0d3FEE6F0E134454FF; +sub.f64 fd1045, fd1043, fd1044; +sub.f64 fd1046, fd1042, fd1045; +add.f64 fd1047, fd1045, fd1042; +mul.f64 fd1049, fd1029, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1535, fd1027, 0d3FD3C6EF372FE950, fd848; +sub.f64 fd1050, fd1535, fd1049; +sub.f64 fd1051, fd867, fd927; +sub.f64 fd1053, fd887, fd907; +mul.f64 fd1534, fd1051, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1054, fd1053, 0d3FE2CF2304755A5E, fd1534; +add.f64 fd1055, fd1054, fd1050; +sub.f64 fd1056, fd1050, fd1054; +mul.f64 fd1057, fd1027, 0d3FE9E3779B97F4A8; +sub.f64 fd1058, fd848, fd1057; +fma.rn.f64 fd1059, fd1029, 0d3FD3C6EF372FE950, fd1058; +mul.f64 fd1060, fd1051, 0d3FE2CF2304755A5E; +mul.f64 fd1061, fd1053, 0d3FEE6F0E134454FF; +sub.f64 fd1062, fd1060, fd1061; +add.f64 fd1063, fd1062, fd1059; +sub.f64 fd1064, fd1059, fd1062; +add.f64 fd1065, fd871, fd931; +add.f64 fd1067, fd891, fd911; +add.f64 fd1533, fd851, fd1065; +add.f64 fd1068, fd1067, fd1533; +add.f64 fd1069, fd872, fd932; +add.f64 fd1071, fd892, fd912; +add.f64 fd1532, fd852, fd1069; +add.f64 fd1072, fd1071, fd1532; +fma.rn.f64 fd1530, fd1065, 0d3FD3C6EF372FE950, fd851; +mul.f64 fd1531, fd1067, 0d3FE9E3779B97F4A8; +sub.f64 fd1075, fd1530, fd1531; +sub.f64 fd1076, fd872, fd932; +sub.f64 fd1078, fd892, fd912; +mul.f64 fd1529, fd1076, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1079, fd1078, 0d3FE2CF2304755A5E, fd1529; +sub.f64 fd1080, fd1075, fd1079; +add.f64 fd1081, fd1079, fd1075; +mul.f64 fd1082, fd1065, 0d3FE9E3779B97F4A8; +sub.f64 fd1083, fd851, fd1082; +fma.rn.f64 fd1084, fd1067, 0d3FD3C6EF372FE950, fd1083; +mul.f64 fd1085, fd1076, 
0d3FE2CF2304755A5E; +mul.f64 fd1086, fd1078, 0d3FEE6F0E134454FF; +sub.f64 fd1087, fd1085, fd1086; +sub.f64 fd1088, fd1084, fd1087; +add.f64 fd1089, fd1087, fd1084; +mul.f64 fd1091, fd1071, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1528, fd1069, 0d3FD3C6EF372FE950, fd852; +sub.f64 fd1092, fd1528, fd1091; +sub.f64 fd1093, fd871, fd931; +sub.f64 fd1095, fd891, fd911; +mul.f64 fd1527, fd1093, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1096, fd1095, 0d3FE2CF2304755A5E, fd1527; +add.f64 fd1097, fd1096, fd1092; +sub.f64 fd1098, fd1092, fd1096; +mul.f64 fd1099, fd1069, 0d3FE9E3779B97F4A8; +sub.f64 fd1100, fd852, fd1099; +fma.rn.f64 fd1101, fd1071, 0d3FD3C6EF372FE950, fd1100; +mul.f64 fd1102, fd1093, 0d3FE2CF2304755A5E; +mul.f64 fd1103, fd1095, 0d3FEE6F0E134454FF; +sub.f64 fd1104, fd1102, fd1103; +add.f64 fd1105, fd1104, fd1101; +sub.f64 fd1106, fd1101, fd1104; +add.f64 fd1107, fd875, fd935; +add.f64 fd1109, fd895, fd915; +add.f64 fd1526, fd855, fd1107; +add.f64 fd1110, fd1109, fd1526; +add.f64 fd1111, fd876, fd936; +add.f64 fd1113, fd896, fd916; +add.f64 fd1525, fd856, fd1111; +add.f64 fd1114, fd1113, fd1525; +fma.rn.f64 fd1523, fd1107, 0d3FD3C6EF372FE950, fd855; +mul.f64 fd1524, fd1109, 0d3FE9E3779B97F4A8; +sub.f64 fd1117, fd1523, fd1524; +sub.f64 fd1118, fd876, fd936; +sub.f64 fd1120, fd896, fd916; +mul.f64 fd1522, fd1118, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1121, fd1120, 0d3FE2CF2304755A5E, fd1522; +sub.f64 fd1122, fd1117, fd1121; +add.f64 fd1123, fd1121, fd1117; +mul.f64 fd1124, fd1107, 0d3FE9E3779B97F4A8; +sub.f64 fd1125, fd855, fd1124; +fma.rn.f64 fd1126, fd1109, 0d3FD3C6EF372FE950, fd1125; +mul.f64 fd1127, fd1118, 0d3FE2CF2304755A5E; +mul.f64 fd1128, fd1120, 0d3FEE6F0E134454FF; +sub.f64 fd1129, fd1127, fd1128; +sub.f64 fd1130, fd1126, fd1129; +add.f64 fd1131, fd1129, fd1126; +fma.rn.f64 fd1520, fd1111, 0d3FD3C6EF372FE950, fd856; +mul.f64 fd1521, fd1113, 0d3FE9E3779B97F4A8; +sub.f64 fd1134, fd1520, fd1521; +sub.f64 fd1135, fd875, fd935; +sub.f64 fd1137, fd895, fd915; +mul.f64 fd1519, 
fd1135, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1138, fd1137, 0d3FE2CF2304755A5E, fd1519; +add.f64 fd1139, fd1138, fd1134; +sub.f64 fd1140, fd1134, fd1138; +mul.f64 fd1141, fd1111, 0d3FE9E3779B97F4A8; +sub.f64 fd1142, fd856, fd1141; +fma.rn.f64 fd1143, fd1113, 0d3FD3C6EF372FE950, fd1142; +mul.f64 fd1144, fd1135, 0d3FE2CF2304755A5E; +mul.f64 fd1145, fd1137, 0d3FEE6F0E134454FF; +sub.f64 fd1146, fd1144, fd1145; +add.f64 fd1147, fd1146, fd1143; +sub.f64 fd1148, fd1143, fd1146; +mul.f64 fd1150, fd1013, 0d3FCFD511FA1C0796; +mul.f64 fd1518, fd996, 0d3FEEFEA21D101EE0; +sub.f64 fd1151, fd1518, fd1150; +mul.f64 fd1152, fd1013, 0d3FEEFEA21D101EE0; +fma.rn.f64 fd1153, fd996, 0d3FCFD511FA1C0796, fd1152; +mul.f64 fd1155, fd1055, 0d3FDED50D5CBFA951; +mul.f64 fd1517, fd1038, 0d3FEC0AB44E81C059; +sub.f64 fd1156, fd1517, fd1155; +mul.f64 fd1157, fd1055, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1158, fd1038, 0d3FDED50D5CBFA951, fd1157; +mul.f64 fd1160, fd1097, 0d3FE5E7CF55112014; +mul.f64 fd1516, fd1080, 0d3FE753B603D2B816; +sub.f64 fd1161, fd1516, fd1160; +mul.f64 fd1162, fd1097, 0d3FE753B603D2B816; +fma.rn.f64 fd1163, fd1080, 0d3FE5E7CF55112014, fd1162; +mul.f64 fd1165, fd1139, 0d3FEB04BBFF642E86; +mul.f64 fd1515, fd1122, 0d3FE1257E3C182B51; +sub.f64 fd1166, fd1515, fd1165; +mul.f64 fd1167, fd1139, 0d3FE1257E3C182B51; +fma.rn.f64 fd1168, fd1122, 0d3FEB04BBFF642E86, fd1167; +mul.f64 fd1170, fd1021, 0d3FDED50D5CBFA951; +mul.f64 fd1514, fd1004, 0d3FEC0AB44E81C059; +sub.f64 fd1171, fd1514, fd1170; +mul.f64 fd1172, fd1021, 0d3FEC0AB44E81C059; +fma.rn.f64 fd1173, fd1004, 0d3FDED50D5CBFA951, fd1172; +mul.f64 fd1512, fd1046, 0d3FE1257E3C182B51; +mul.f64 fd1513, fd1063, 0d3FEB04BBFF642E86; +sub.f64 fd1176, fd1512, fd1513; +mul.f64 fd1177, fd1063, 0d3FE1257E3C182B51; +fma.rn.f64 fd1178, fd1046, 0d3FEB04BBFF642E86, fd1177; +mul.f64 fd1510, fd1088, 0d3FB0130A1BE09379; +mul.f64 fd1511, fd1105, 0d3FEFEFD5BFE443FE; +sub.f64 fd1181, fd1510, fd1511; +mul.f64 fd1182, fd1105, 0d3FB0130A1BE09379; +fma.rn.f64 
fd1183, fd1088, 0d3FEFEFD5BFE443FE, fd1182; +mul.f64 fd1508, fd1130, 0dBFDB3FF7C925819C; +mul.f64 fd1509, fd1147, 0d3FECF457DCDC158C; +sub.f64 fd1186, fd1508, fd1509; +mul.f64 fd1187, fd1147, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1188, fd1130, 0d3FECF457DCDC158C, fd1187; +mul.f64 fd1506, fd1005, 0d3FE753B603D2B816; +mul.f64 fd1507, fd1022, 0d3FE5E7CF55112014; +sub.f64 fd1191, fd1506, fd1507; +mul.f64 fd1192, fd1022, 0d3FE753B603D2B816; +fma.rn.f64 fd1193, fd1005, 0d3FE5E7CF55112014, fd1192; +mul.f64 fd1195, fd1064, 0d3FEFEFD5BFE443FE; +mul.f64 fd1505, fd1047, 0d3FB0130A1BE09379; +sub.f64 fd1196, fd1505, fd1195; +mul.f64 fd1197, fd1064, 0d3FB0130A1BE09379; +fma.rn.f64 fd1198, fd1047, 0d3FEFEFD5BFE443FE, fd1197; +mul.f64 fd1200, fd1106, 0d3FE8A80B635B6BEA; +mul.f64 fd1504, fd1089, 0dBFE465C6FEB501BC; +sub.f64 fd1201, fd1504, fd1200; +mul.f64 fd1202, fd1106, 0dBFE465C6FEB501BC; +fma.rn.f64 fd1203, fd1089, 0d3FE8A80B635B6BEA, fd1202; +mul.f64 fd1205, fd1148, 0d3FC00AEB5DA15BE0; +mul.f64 fd1503, fd1131, 0dBFEFBF675480D903; +sub.f64 fd1206, fd1503, fd1205; +mul.f64 fd1207, fd1148, 0dBFEFBF675480D903; +fma.rn.f64 fd1208, fd1131, 0d3FC00AEB5DA15BE0, fd1207; +mul.f64 fd1210, fd1014, 0d3FEB04BBFF642E86; +mul.f64 fd1502, fd997, 0d3FE1257E3C182B51; +sub.f64 fd1211, fd1502, fd1210; +mul.f64 fd1212, fd1014, 0d3FE1257E3C182B51; +fma.rn.f64 fd1213, fd997, 0d3FEB04BBFF642E86, fd1212; +mul.f64 fd1215, fd1056, 0d3FECF457DCDC158C; +mul.f64 fd1501, fd1039, 0dBFDB3FF7C925819C; +sub.f64 fd1216, fd1501, fd1215; +mul.f64 fd1217, fd1056, 0dBFDB3FF7C925819C; +fma.rn.f64 fd1218, fd1039, 0d3FECF457DCDC158C, fd1217; +mul.f64 fd1499, fd1081, 0dBFEFBF675480D903; +mul.f64 fd1500, fd1098, 0d3FC00AEB5DA15BE0; +sub.f64 fd1221, fd1499, fd1500; +mul.f64 fd1222, fd1098, 0dBFEFBF675480D903; +fma.rn.f64 fd1223, fd1081, 0d3FC00AEB5DA15BE0, fd1222; +mul.f64 fd1497, fd1123, 0dBFE465C6FEB501BC; +mul.f64 fd1498, fd1140, 0dBFE8A80B635B6BEA; +sub.f64 fd1226, fd1497, fd1498; +mul.f64 fd1227, fd1140, 
0dBFE465C6FEB501BC; +fma.rn.f64 fd1228, fd1123, 0dBFE8A80B635B6BEA, fd1227; +add.f64 fd1229, fd984, fd1110; +add.f64 fd1231, fd1026, fd1068; +mul.f64 fd1236, fd1231, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1496, fd1229, 0d3FD3C6EF372FE950, fd942; +sub.f64 fd1237, fd1496, fd1236; +add.f64 fd1495, fd988, fd1114; +sub.f64 fd1238, fd988, fd1114; +add.f64 fd1494, fd1030, fd1072; +sub.f64 fd1240, fd1030, fd1072; +mul.f64 fd1493, fd1238, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1241, fd1240, 0d3FE2CF2304755A5E, fd1493; +add.f64 fd1492, fd942, fd1229; +mul.f64 fd1242, fd1229, 0d3FE9E3779B97F4A8; +sub.f64 fd1243, fd942, fd1242; +fma.rn.f64 fd1244, fd1231, 0d3FD3C6EF372FE950, fd1243; +mul.f64 fd1245, fd1238, 0d3FE2CF2304755A5E; +mul.f64 fd1246, fd1240, 0d3FEE6F0E134454FF; +sub.f64 fd1247, fd1245, fd1246; +fma.rn.f64 fd1490, fd1495, 0d3FD3C6EF372FE950, fd946; +mul.f64 fd1491, fd1494, 0d3FE9E3779B97F4A8; +sub.f64 fd1250, fd1490, fd1491; +sub.f64 fd1251, fd984, fd1110; +sub.f64 fd1253, fd1026, fd1068; +mul.f64 fd1489, fd1251, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1254, fd1253, 0d3FE2CF2304755A5E, fd1489; +add.f64 fd1488, fd946, fd1495; +mul.f64 fd1255, fd1495, 0d3FE9E3779B97F4A8; +sub.f64 fd1256, fd946, fd1255; +fma.rn.f64 fd1257, fd1494, 0d3FD3C6EF372FE950, fd1256; +mul.f64 fd1258, fd1251, 0d3FE2CF2304755A5E; +mul.f64 fd1259, fd1253, 0d3FEE6F0E134454FF; +sub.f64 fd1260, fd1258, fd1259; +add.f64 fd1261, fd1151, fd1166; +add.f64 fd1263, fd1156, fd1161; +fma.rn.f64 fd1486, fd1261, 0d3FD3C6EF372FE950, fd954; +mul.f64 fd1487, fd1263, 0d3FE9E3779B97F4A8; +sub.f64 fd1269, fd1486, fd1487; +add.f64 fd1485, fd1153, fd1168; +sub.f64 fd1270, fd1153, fd1168; +add.f64 fd1484, fd1158, fd1163; +sub.f64 fd1272, fd1158, fd1163; +mul.f64 fd1483, fd1270, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1273, fd1272, 0d3FE2CF2304755A5E, fd1483; +add.f64 fd1482, fd954, fd1261; +mul.f64 fd1274, fd1261, 0d3FE9E3779B97F4A8; +sub.f64 fd1275, fd954, fd1274; +fma.rn.f64 fd1276, fd1263, 0d3FD3C6EF372FE950, fd1275; +mul.f64 fd1277, 
fd1270, 0d3FE2CF2304755A5E; +mul.f64 fd1278, fd1272, 0d3FEE6F0E134454FF; +sub.f64 fd1279, fd1277, fd1278; +mul.f64 fd1281, fd1484, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1481, fd1485, 0d3FD3C6EF372FE950, fd971; +sub.f64 fd1282, fd1481, fd1281; +sub.f64 fd1283, fd1151, fd1166; +sub.f64 fd1285, fd1156, fd1161; +mul.f64 fd1480, fd1283, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1286, fd1285, 0d3FE2CF2304755A5E, fd1480; +add.f64 fd1479, fd971, fd1485; +mul.f64 fd1287, fd1485, 0d3FE9E3779B97F4A8; +sub.f64 fd1288, fd971, fd1287; +fma.rn.f64 fd1289, fd1484, 0d3FD3C6EF372FE950, fd1288; +mul.f64 fd1290, fd1283, 0d3FE2CF2304755A5E; +mul.f64 fd1291, fd1285, 0d3FEE6F0E134454FF; +sub.f64 fd1292, fd1290, fd1291; +add.f64 fd1293, fd1171, fd1186; +add.f64 fd1295, fd1176, fd1181; +mul.f64 fd1300, fd1295, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1478, fd1293, 0d3FD3C6EF372FE950, fd962; +sub.f64 fd1301, fd1478, fd1300; +add.f64 fd1477, fd1173, fd1188; +sub.f64 fd1302, fd1173, fd1188; +add.f64 fd1476, fd1178, fd1183; +sub.f64 fd1304, fd1178, fd1183; +mul.f64 fd1475, fd1302, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1305, fd1304, 0d3FE2CF2304755A5E, fd1475; +add.f64 fd1474, fd962, fd1293; +mul.f64 fd1306, fd1293, 0d3FE9E3779B97F4A8; +sub.f64 fd1307, fd962, fd1306; +fma.rn.f64 fd1308, fd1295, 0d3FD3C6EF372FE950, fd1307; +mul.f64 fd1309, fd1302, 0d3FE2CF2304755A5E; +mul.f64 fd1310, fd1304, 0d3FEE6F0E134454FF; +sub.f64 fd1311, fd1309, fd1310; +fma.rn.f64 fd1472, fd1477, 0d3FD3C6EF372FE950, fd979; +mul.f64 fd1473, fd1476, 0d3FE9E3779B97F4A8; +sub.f64 fd1314, fd1472, fd1473; +sub.f64 fd1315, fd1171, fd1186; +sub.f64 fd1317, fd1176, fd1181; +mul.f64 fd1471, fd1315, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1318, fd1317, 0d3FE2CF2304755A5E, fd1471; +add.f64 fd1470, fd979, fd1477; +mul.f64 fd1319, fd1477, 0d3FE9E3779B97F4A8; +sub.f64 fd1320, fd979, fd1319; +fma.rn.f64 fd1321, fd1476, 0d3FD3C6EF372FE950, fd1320; +mul.f64 fd1322, fd1315, 0d3FE2CF2304755A5E; +mul.f64 fd1323, fd1317, 0d3FEE6F0E134454FF; +sub.f64 fd1324, fd1322, 
fd1323; +add.f64 fd1325, fd1191, fd1206; +add.f64 fd1327, fd1196, fd1201; +fma.rn.f64 fd1468, fd1325, 0d3FD3C6EF372FE950, fd963; +mul.f64 fd1469, fd1327, 0d3FE9E3779B97F4A8; +sub.f64 fd1333, fd1468, fd1469; +add.f64 fd1467, fd1193, fd1208; +sub.f64 fd1334, fd1193, fd1208; +add.f64 fd1466, fd1198, fd1203; +sub.f64 fd1336, fd1198, fd1203; +mul.f64 fd1465, fd1334, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1337, fd1336, 0d3FE2CF2304755A5E, fd1465; +add.f64 fd1464, fd963, fd1325; +mul.f64 fd1338, fd1325, 0d3FE9E3779B97F4A8; +sub.f64 fd1339, fd963, fd1338; +fma.rn.f64 fd1340, fd1327, 0d3FD3C6EF372FE950, fd1339; +mul.f64 fd1341, fd1334, 0d3FE2CF2304755A5E; +mul.f64 fd1342, fd1336, 0d3FEE6F0E134454FF; +sub.f64 fd1343, fd1341, fd1342; +mul.f64 fd1345, fd1466, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1463, fd1467, 0d3FD3C6EF372FE950, fd980; +sub.f64 fd1346, fd1463, fd1345; +sub.f64 fd1347, fd1191, fd1206; +sub.f64 fd1349, fd1196, fd1201; +mul.f64 fd1462, fd1347, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1350, fd1349, 0d3FE2CF2304755A5E, fd1462; +add.f64 fd1461, fd980, fd1467; +mul.f64 fd1351, fd1467, 0d3FE9E3779B97F4A8; +sub.f64 fd1352, fd980, fd1351; +fma.rn.f64 fd1353, fd1466, 0d3FD3C6EF372FE950, fd1352; +mul.f64 fd1354, fd1347, 0d3FE2CF2304755A5E; +mul.f64 fd1355, fd1349, 0d3FEE6F0E134454FF; +sub.f64 fd1356, fd1354, fd1355; +add.f64 fd1357, fd1211, fd1226; +add.f64 fd1359, fd1216, fd1221; +mul.f64 fd1364, fd1359, 0d3FE9E3779B97F4A8; +fma.rn.f64 fd1460, fd1357, 0d3FD3C6EF372FE950, fd955; +sub.f64 fd1365, fd1460, fd1364; +add.f64 fd1459, fd1213, fd1228; +sub.f64 fd1366, fd1213, fd1228; +add.f64 fd1458, fd1218, fd1223; +sub.f64 fd1368, fd1218, fd1223; +mul.f64 fd1457, fd1366, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1369, fd1368, 0d3FE2CF2304755A5E, fd1457; +add.f64 fd1456, fd955, fd1357; +mul.f64 fd1370, fd1357, 0d3FE9E3779B97F4A8; +sub.f64 fd1371, fd955, fd1370; +fma.rn.f64 fd1372, fd1359, 0d3FD3C6EF372FE950, fd1371; +mul.f64 fd1373, fd1366, 0d3FE2CF2304755A5E; +mul.f64 fd1374, fd1368, 
0d3FEE6F0E134454FF; +sub.f64 fd1375, fd1373, fd1374; +fma.rn.f64 fd1454, fd1459, 0d3FD3C6EF372FE950, fd972; +mul.f64 fd1455, fd1458, 0d3FE9E3779B97F4A8; +sub.f64 fd1378, fd1454, fd1455; +sub.f64 fd1379, fd1211, fd1226; +sub.f64 fd1381, fd1216, fd1221; +mul.f64 fd1453, fd1379, 0d3FEE6F0E134454FF; +fma.rn.f64 fd1382, fd1381, 0d3FE2CF2304755A5E, fd1453; +add.f64 fd1452, fd972, fd1459; +mul.f64 fd1383, fd1459, 0d3FE9E3779B97F4A8; +sub.f64 fd1384, fd972, fd1383; +fma.rn.f64 fd1385, fd1458, 0d3FD3C6EF372FE950, fd1384; +mul.f64 fd1386, fd1379, 0d3FE2CF2304755A5E; +mul.f64 fd1387, fd1381, 0d3FEE6F0E134454FF; +sub.f64 fd1388, fd1386, fd1387; +add.f64 %1, fd1494, fd1488; +add.f64 %0, fd1231, fd1492; +add.f64 %3, fd1484, fd1479; +add.f64 %2, fd1263, fd1482; +add.f64 %5, fd1476, fd1470; +add.f64 %4, fd1295, fd1474; +add.f64 %7, fd1466, fd1461; +add.f64 %6, fd1327, fd1464; +add.f64 %9, fd1458, fd1452; +add.f64 %8, fd1359, fd1456; +sub.f64 %10, fd1237, fd1241; +add.f64 %11, fd1254, fd1250; +sub.f64 %12, fd1269, fd1273; +add.f64 %13, fd1286, fd1282; +sub.f64 %14, fd1301, fd1305; +add.f64 %15, fd1318, fd1314; +sub.f64 %16, fd1333, fd1337; +add.f64 %17, fd1350, fd1346; +add.f64 %19, fd1382, fd1378; +sub.f64 %18, fd1365, fd1369; +add.f64 %21, fd1260, fd1257; +sub.f64 %20, fd1244, fd1247; +add.f64 %23, fd1292, fd1289; +sub.f64 %22, fd1276, fd1279; +sub.f64 %24, fd1308, fd1311; +add.f64 %25, fd1324, fd1321; +sub.f64 %26, fd1340, fd1343; +add.f64 %27, fd1356, fd1353; +sub.f64 %28, fd1372, fd1375; +add.f64 %29, fd1388, fd1385; +sub.f64 %31, fd1257, fd1260; +add.f64 %30, fd1247, fd1244; +sub.f64 %33, fd1289, fd1292; +add.f64 %32, fd1279, fd1276; +sub.f64 %35, fd1321, fd1324; +add.f64 %34, fd1311, fd1308; +sub.f64 %37, fd1353, fd1356; +add.f64 %36, fd1343, fd1340; +sub.f64 %39, fd1385, fd1388; +add.f64 %38, fd1375, fd1372; +sub.f64 %41, fd1250, fd1254; +add.f64 %40, fd1241, fd1237; +sub.f64 %43, fd1282, fd1286; +add.f64 %42, fd1273, fd1269; +sub.f64 %45, fd1314, fd1318; +add.f64 %44, 
fd1305, fd1301; +sub.f64 %47, fd1346, fd1350; +add.f64 %46, fd1337, fd1333; +sub.f64 %49, fd1378, fd1382; +add.f64 %48, fd1369, fd1365; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y): "r"(smem), "l"(lut_dp_25_625), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[5].y), "d"(rmem[10].y), "d"(rmem[20].y), "d"(rmem[16].y), "d"(rmem[1].y), "d"(rmem[11].y), 
"d"(rmem[2].y), "d"(rmem[22].y), "d"(rmem[7].y), "d"(rmem[17].y), "d"(rmem[23].y), "d"(rmem[8].y), "d"(rmem[13].y), "d"(rmem[19].y), "d"(rmem[4].y), "d"(rmem[14].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<712, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<333>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 5000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %16, %24; +add.f64 fd22, %14, fd21; +add.f64 fd23, %19, %22; +add.f64 fd24, fd23, fd22; +add.f64 fd25, %18, %25; +add.f64 fd26, %15, fd25; +add.f64 fd27, %21, %23; +add.f64 fd28, fd27, fd26; +fma.rn.f64 fd29, fd21, 0d3FD3C6EF372FE950, %14; +mul.f64 fd30, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd31, fd29, fd30; +sub.f64 fd32, %18, %25; +mul.f64 fd33, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd34, %21, %23; +fma.rn.f64 fd35, fd34, 0d3FE2CF2304755A5E, fd33; +sub.f64 fd36, fd31, fd35; +add.f64 fd37, fd35, fd31; +mul.f64 fd38, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd39, %14, fd38; +fma.rn.f64 fd40, fd23, 0d3FD3C6EF372FE950, fd39; +mul.f64 fd41, fd32, 0d3FE2CF2304755A5E; +mul.f64 fd42, fd34, 0d3FEE6F0E134454FF; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, fd40, fd43; +add.f64 fd45, fd43, fd40; +fma.rn.f64 fd46, fd25, 0d3FD3C6EF372FE950, %15; +mul.f64 fd47, fd27, 0d3FE9E3779B97F4A8; +sub.f64 fd48, fd46, fd47; +sub.f64 fd49, %16, %24; +mul.f64 fd50, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd51, %19, %22; +fma.rn.f64 fd52, fd51, 0d3FE2CF2304755A5E, fd50; +add.f64 fd53, fd52, fd48; +sub.f64 fd54, fd48, fd52; +mul.f64 fd55, fd25, 0d3FE9E3779B97F4A8; +sub.f64 fd56, %15, fd55; +fma.rn.f64 fd57, fd27, 0d3FD3C6EF372FE950, fd56; +mul.f64 fd58, fd49, 0d3FE2CF2304755A5E; +mul.f64 fd59, fd51, 0d3FEE6F0E134454FF; +sub.f64 fd60, fd58, fd59; +add.f64 fd61, fd60, fd57; +sub.f64 fd62, fd57, fd60; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; 
+mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd63, fd64}, [rd6]; +mul.f64 fd67, fd53, fd64; +fma.rn.f64 fd68, fd63, fd36, fd67; +mul.f64 fd69, fd36, fd64; +mul.f64 fd70, fd63, fd53; +sub.f64 fd71, fd70, fd69; +mul.f64 fd72, fd63, fd63; +mul.f64 fd73, fd64, fd64; +sub.f64 fd74, fd72, fd73; +mul.f64 fd75, fd64, fd63; +fma.rn.f64 fd76, fd64, fd63, fd75; +mul.f64 fd77, fd61, fd76; +fma.rn.f64 fd78, fd74, fd44, fd77; +mul.f64 fd79, fd44, fd76; +mul.f64 fd80, fd74, fd61; +sub.f64 fd81, fd80, fd79; +ld.global.v2.f64 {fd82, fd83}, [rd6+2000]; +mul.f64 fd86, fd62, fd83; +fma.rn.f64 fd87, fd82, fd45, fd86; +mul.f64 fd88, fd45, fd83; +mul.f64 fd89, fd82, fd62; +sub.f64 fd90, fd89, fd88; +mul.f64 fd91, fd63, fd82; +mul.f64 fd92, fd64, fd83; +sub.f64 fd93, fd91, fd92; +mul.f64 fd94, fd63, fd83; +fma.rn.f64 fd95, fd64, fd82, fd94; +mul.f64 fd96, fd54, fd95; +fma.rn.f64 fd97, fd93, fd37, fd96; +mul.f64 fd98, fd37, fd95; +mul.f64 fd99, fd93, fd54; +sub.f64 fd100, fd99, fd98; +mad.lo.s32 r8, r5, 5000, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 40, r8; +st.shared.f64 [r9], fd24; +st.shared.f64 [r9+8], fd68; +st.shared.f64 [r9+16], fd78; +st.shared.f64 [r9+24], fd87; +st.shared.f64 [r9+32], fd97; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f64 fd101, [r11]; +ld.shared.f64 fd102, [r11+1000]; +ld.shared.f64 fd103, [r11+2000]; +ld.shared.f64 fd104, [r11+3000]; +ld.shared.f64 fd105, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r9], fd28; +st.shared.f64 [r9+8], fd71; +st.shared.f64 [r9+16], fd81; +st.shared.f64 [r9+24], fd90; +st.shared.f64 [r9+32], fd100; +barrier.sync 0; +ld.shared.f64 fd106, [r11]; +ld.shared.f64 fd107, [r11+1000]; +ld.shared.f64 fd108, [r11+2000]; +ld.shared.f64 fd109, [r11+3000]; +ld.shared.f64 fd110, [r11+4000]; +add.f64 fd111, fd102, fd105; +add.f64 fd112, fd101, fd111; +add.f64 fd113, fd103, fd104; +add.f64 fd114, fd113, fd112; +add.f64 fd115, fd107, fd110; +add.f64 fd116, fd106, fd115; +add.f64 
fd117, fd108, fd109; +add.f64 fd118, fd117, fd116; +fma.rn.f64 fd119, fd111, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd120, fd113, 0d3FE9E3779B97F4A8; +sub.f64 fd121, fd119, fd120; +sub.f64 fd122, fd107, fd110; +mul.f64 fd123, fd122, 0d3FEE6F0E134454FF; +sub.f64 fd124, fd108, fd109; +fma.rn.f64 fd125, fd124, 0d3FE2CF2304755A5E, fd123; +sub.f64 fd126, fd121, fd125; +add.f64 fd127, fd125, fd121; +mul.f64 fd128, fd111, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd101, fd128; +fma.rn.f64 fd130, fd113, 0d3FD3C6EF372FE950, fd129; +mul.f64 fd131, fd122, 0d3FE2CF2304755A5E; +mul.f64 fd132, fd124, 0d3FEE6F0E134454FF; +sub.f64 fd133, fd131, fd132; +sub.f64 fd134, fd130, fd133; +add.f64 fd135, fd133, fd130; +fma.rn.f64 fd136, fd115, 0d3FD3C6EF372FE950, fd106; +mul.f64 fd137, fd117, 0d3FE9E3779B97F4A8; +sub.f64 fd138, fd136, fd137; +sub.f64 fd139, fd102, fd105; +mul.f64 fd140, fd139, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd103, fd104; +fma.rn.f64 fd142, fd141, 0d3FE2CF2304755A5E, fd140; +add.f64 fd143, fd142, fd138; +sub.f64 fd144, fd138, fd142; +mul.f64 fd145, fd115, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd106, fd145; +fma.rn.f64 fd147, fd117, 0d3FD3C6EF372FE950, fd146; +mul.f64 fd148, fd139, 0d3FE2CF2304755A5E; +mul.f64 fd149, fd141, 0d3FEE6F0E134454FF; +sub.f64 fd150, fd148, fd149; +add.f64 fd151, fd150, fd147; +sub.f64 fd152, fd147, fd150; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd153, fd154}, [rd11]; +mul.f64 fd157, fd143, fd154; +fma.rn.f64 fd158, fd153, fd126, fd157; +mul.f64 fd159, fd126, fd154; +mul.f64 fd160, fd153, fd143; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd151, fd166; +fma.rn.f64 fd168, fd164, fd134, fd167; +mul.f64 fd169, fd134, 
fd166; +mul.f64 fd170, fd164, fd151; +sub.f64 fd171, fd170, fd169; +ld.global.v2.f64 {fd172, fd173}, [rd11+400]; +mul.f64 fd176, fd152, fd173; +fma.rn.f64 fd177, fd172, fd135, fd176; +mul.f64 fd178, fd135, fd173; +mul.f64 fd179, fd172, fd152; +sub.f64 fd180, fd179, fd178; +mul.f64 fd181, fd153, fd172; +mul.f64 fd182, fd154, fd173; +sub.f64 fd183, fd181, fd182; +mul.f64 fd184, fd153, fd173; +fma.rn.f64 fd185, fd154, fd172, fd184; +mul.f64 fd186, fd144, fd185; +fma.rn.f64 fd187, fd183, fd127, fd186; +mul.f64 fd188, fd127, fd185; +mul.f64 fd189, fd183, fd144; +sub.f64 fd190, fd189, fd188; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 200, r16; +st.shared.f64 [r17], fd114; +st.shared.f64 [r17+40], fd158; +st.shared.f64 [r17+80], fd168; +st.shared.f64 [r17+120], fd177; +st.shared.f64 [r17+160], fd187; +barrier.sync 0; +ld.shared.f64 fd191, [r11]; +ld.shared.f64 fd192, [r11+1000]; +ld.shared.f64 fd193, [r11+2000]; +ld.shared.f64 fd194, [r11+3000]; +ld.shared.f64 fd195, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r17], fd118; +st.shared.f64 [r17+40], fd161; +st.shared.f64 [r17+80], fd171; +st.shared.f64 [r17+120], fd180; +st.shared.f64 [r17+160], fd190; +barrier.sync 0; +ld.shared.f64 fd196, [r11]; +ld.shared.f64 fd197, [r11+1000]; +ld.shared.f64 fd198, [r11+2000]; +ld.shared.f64 fd199, [r11+3000]; +ld.shared.f64 fd200, [r11+4000]; +add.f64 fd201, fd192, fd195; +add.f64 fd202, fd191, fd201; +add.f64 fd203, fd193, fd194; +add.f64 fd204, fd203, fd202; +add.f64 fd205, fd197, fd200; +add.f64 fd206, fd196, fd205; +add.f64 fd207, fd198, fd199; +add.f64 fd208, fd207, fd206; +fma.rn.f64 fd209, fd201, 0d3FD3C6EF372FE950, fd191; +mul.f64 fd210, fd203, 0d3FE9E3779B97F4A8; +sub.f64 fd211, fd209, fd210; +sub.f64 fd212, fd197, fd200; +mul.f64 fd213, fd212, 0d3FEE6F0E134454FF; +sub.f64 fd214, fd198, fd199; +fma.rn.f64 fd215, fd214, 0d3FE2CF2304755A5E, fd213; +sub.f64 fd216, fd211, fd215; +add.f64 fd217, fd215, fd211; +mul.f64 fd218, fd201, 
0d3FE9E3779B97F4A8; +sub.f64 fd219, fd191, fd218; +fma.rn.f64 fd220, fd203, 0d3FD3C6EF372FE950, fd219; +mul.f64 fd221, fd212, 0d3FE2CF2304755A5E; +mul.f64 fd222, fd214, 0d3FEE6F0E134454FF; +sub.f64 fd223, fd221, fd222; +sub.f64 fd224, fd220, fd223; +add.f64 fd225, fd223, fd220; +fma.rn.f64 fd226, fd205, 0d3FD3C6EF372FE950, fd196; +mul.f64 fd227, fd207, 0d3FE9E3779B97F4A8; +sub.f64 fd228, fd226, fd227; +sub.f64 fd229, fd192, fd195; +mul.f64 fd230, fd229, 0d3FEE6F0E134454FF; +sub.f64 fd231, fd193, fd194; +fma.rn.f64 fd232, fd231, 0d3FE2CF2304755A5E, fd230; +add.f64 fd233, fd232, fd228; +sub.f64 fd234, fd228, fd232; +mul.f64 fd235, fd205, 0d3FE9E3779B97F4A8; +sub.f64 fd236, fd196, fd235; +fma.rn.f64 fd237, fd207, 0d3FD3C6EF372FE950, fd236; +mul.f64 fd238, fd229, 0d3FE2CF2304755A5E; +mul.f64 fd239, fd231, 0d3FEE6F0E134454FF; +sub.f64 fd240, fd238, fd239; +add.f64 fd241, fd240, fd237; +sub.f64 fd242, fd237, fd240; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd243, fd244}, [rd16]; +mul.f64 fd247, fd233, fd244; +fma.rn.f64 fd248, fd243, fd216, fd247; +mul.f64 fd249, fd216, fd244; +mul.f64 fd250, fd243, fd233; +sub.f64 fd251, fd250, fd249; +mul.f64 fd252, fd243, fd243; +mul.f64 fd253, fd244, fd244; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd244, fd243; +fma.rn.f64 fd256, fd244, fd243, fd255; +mul.f64 fd257, fd241, fd256; +fma.rn.f64 fd258, fd254, fd224, fd257; +mul.f64 fd259, fd224, fd256; +mul.f64 fd260, fd254, fd241; +sub.f64 fd261, fd260, fd259; +ld.global.v2.f64 {fd262, fd263}, [rd16+80]; +mul.f64 fd266, fd242, fd263; +fma.rn.f64 fd267, fd262, fd225, fd266; +mul.f64 fd268, fd225, fd263; +mul.f64 fd269, fd262, fd242; +sub.f64 fd270, fd269, fd268; +mul.f64 fd271, fd243, fd262; +mul.f64 fd272, fd244, fd263; +sub.f64 fd273, fd271, fd272; +mul.f64 fd274, fd243, fd263; +fma.rn.f64 fd275, 
fd244, fd262, fd274; +mul.f64 fd276, fd234, fd275; +fma.rn.f64 fd277, fd273, fd217, fd276; +mul.f64 fd278, fd217, fd275; +mul.f64 fd279, fd273, fd234; +sub.f64 fd280, fd279, fd278; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 1000, r22; +st.shared.f64 [r23], fd204; +st.shared.f64 [r23+200], fd248; +st.shared.f64 [r23+400], fd258; +st.shared.f64 [r23+600], fd267; +st.shared.f64 [r23+800], fd277; +barrier.sync 0; +ld.shared.f64 fd281, [r11]; +ld.shared.f64 fd282, [r11+1000]; +ld.shared.f64 fd283, [r11+2000]; +ld.shared.f64 fd284, [r11+3000]; +ld.shared.f64 fd285, [r11+4000]; +barrier.sync 0; +st.shared.f64 [r23], fd208; +st.shared.f64 [r23+200], fd251; +st.shared.f64 [r23+400], fd261; +st.shared.f64 [r23+600], fd270; +st.shared.f64 [r23+800], fd280; +barrier.sync 0; +ld.shared.f64 fd286, [r11]; +ld.shared.f64 fd287, [r11+1000]; +ld.shared.f64 fd288, [r11+2000]; +ld.shared.f64 fd289, [r11+3000]; +ld.shared.f64 fd290, [r11+4000]; +add.f64 fd291, fd282, fd285; +add.f64 fd292, fd281, fd291; +add.f64 fd293, fd283, fd284; +add.f64 fd294, fd287, fd290; +add.f64 fd295, fd286, fd294; +add.f64 fd296, fd288, fd289; +fma.rn.f64 fd297, fd291, 0d3FD3C6EF372FE950, fd281; +mul.f64 fd298, fd293, 0d3FE9E3779B97F4A8; +sub.f64 fd299, fd297, fd298; +sub.f64 fd300, fd287, fd290; +mul.f64 fd301, fd300, 0d3FEE6F0E134454FF; +sub.f64 fd302, fd288, fd289; +fma.rn.f64 fd303, fd302, 0d3FE2CF2304755A5E, fd301; +mul.f64 fd304, fd291, 0d3FE9E3779B97F4A8; +sub.f64 fd305, fd281, fd304; +fma.rn.f64 fd306, fd293, 0d3FD3C6EF372FE950, fd305; +mul.f64 fd307, fd300, 0d3FE2CF2304755A5E; +mul.f64 fd308, fd302, 0d3FEE6F0E134454FF; +sub.f64 fd309, fd307, fd308; +fma.rn.f64 fd310, fd294, 0d3FD3C6EF372FE950, fd286; +mul.f64 fd311, fd296, 0d3FE9E3779B97F4A8; +sub.f64 fd312, fd310, fd311; +sub.f64 fd313, fd282, fd285; +mul.f64 fd314, fd313, 0d3FEE6F0E134454FF; +sub.f64 fd315, fd283, fd284; +fma.rn.f64 fd316, fd315, 0d3FE2CF2304755A5E, fd314; +mul.f64 fd317, fd294, 
0d3FE9E3779B97F4A8; +sub.f64 fd318, fd286, fd317; +fma.rn.f64 fd319, fd296, 0d3FD3C6EF372FE950, fd318; +mul.f64 fd320, fd313, 0d3FE2CF2304755A5E; +mul.f64 fd321, fd315, 0d3FEE6F0E134454FF; +sub.f64 fd322, fd320, fd321; +add.f64 %0, fd293, fd292; +add.f64 %1, fd296, fd295; +add.f64 %3, fd316, fd312; +sub.f64 %2, fd299, fd303; +sub.f64 %4, fd306, fd309; +add.f64 %5, fd322, fd319; +add.f64 %6, fd309, fd306; +sub.f64 %7, fd319, fd322; +sub.f64 %9, fd312, fd316; +add.f64 %8, fd303, fd299; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + + +/* cufftdx_private_function<713, double, 1>: machine-generated inline-PTX kernel body that transforms the five complex double values in rmem[0..4] in place. Operands: %0-%9 are the outputs rmem[0..4].{x,y}; %10 is smem, used as the shared-memory base address; %11-%13 are the addresses of lut_dp_5_625, lut_dp_5_125 and lut_dp_5_25; %14-%25 are rmem again as inputs. rmem[1].y and rmem[2].y are each bound twice in the input list and the PTX never reads %17 or %20, so the duplicates only keep the generated operand numbering aligned - do not deduplicate them. Flow: a 5-input butterfly on the registers, then three rounds of (multiply by factors derived from ld.global.v2.f64 entries of the table at %11, %12, %13 in turn; barrier.sync 0; five st.shared.v2.f64; barrier.sync 0; five ld.shared.v2.f64 at a stride of 2000 bytes; another 5-input butterfly), with the last butterfly written to %0-%9. The four f64 immediates decode to about 0.3090, 0.8090, 0.9511 and 0.5878. Shared memory is addressed as smem + 10000 * (tid.y + tid.x / 125) bytes, so this presumably implements a 625-point transform with 125 threads each (TODO confirm against the cuFFTDx dispatch table). The body executes barrier.sync 0 six times, so every thread that barrier expects has to run it. */ +template<> __forceinline__ __device__ void cufftdx_private_function<713, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<363>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %10; +mad.lo.s32 r3, r1, 10000, r2; +mov.u32 r4, %tid.x; +add.f64 fd21, %16, %24; +add.f64 fd22, %14, fd21; +add.f64 fd23, %19, %22; +add.f64 fd24, %18, %25; +add.f64 fd25, %15, fd24; +add.f64 fd26, %21, %23; +fma.rn.f64 fd27, fd21, 0d3FD3C6EF372FE950, %14; +mul.f64 fd28, fd23, 0d3FE9E3779B97F4A8; +sub.f64 fd29, fd27, fd28; +sub.f64 fd30, %18, %25; +mul.f64 fd31, fd30, 0d3FEE6F0E134454FF; +sub.f64 fd32, %21, %23; +fma.rn.f64 fd33, fd32, 0d3FE2CF2304755A5E, fd31; +sub.f64 fd34, fd29, fd33; +add.f64 fd35, fd33, fd29; +mul.f64 fd36, fd21, 0d3FE9E3779B97F4A8; +sub.f64 fd37, %14, fd36; +fma.rn.f64 fd38, fd23, 0d3FD3C6EF372FE950, fd37; +mul.f64 fd39, fd30, 0d3FE2CF2304755A5E; +mul.f64 fd40, fd32, 0d3FEE6F0E134454FF; +sub.f64 fd41, fd39, fd40; +sub.f64 fd42, fd38, fd41; +add.f64 fd43, fd41, fd38; +fma.rn.f64
fd44, fd24, 0d3FD3C6EF372FE950, %15; +mul.f64 fd45, fd26, 0d3FE9E3779B97F4A8; +sub.f64 fd46, fd44, fd45; +sub.f64 fd47, %16, %24; +mul.f64 fd48, fd47, 0d3FEE6F0E134454FF; +sub.f64 fd49, %19, %22; +fma.rn.f64 fd50, fd49, 0d3FE2CF2304755A5E, fd48; +add.f64 fd51, fd50, fd46; +sub.f64 fd52, fd46, fd50; +mul.f64 fd53, fd24, 0d3FE9E3779B97F4A8; +sub.f64 fd54, %15, fd53; +fma.rn.f64 fd55, fd26, 0d3FD3C6EF372FE950, fd54; +mul.f64 fd56, fd47, 0d3FE2CF2304755A5E; +mul.f64 fd57, fd49, 0d3FEE6F0E134454FF; +sub.f64 fd58, fd56, fd57; +add.f64 fd59, fd58, fd55; +sub.f64 fd60, fd55, fd58; +mul.wide.u32 rd2, r4, 274877907; +shr.u64 rd3, rd2, 35; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 125; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 10000, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %11; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd61, fd62}, [rd6]; +mul.f64 fd65, fd51, fd62; +mul.f64 fd66, fd34, fd62; +mul.f64 fd67, fd61, fd51; +mul.f64 fd68, fd61, fd61; +mul.f64 fd69, fd62, fd62; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd62, fd61; +fma.rn.f64 fd72, fd62, fd61, fd71; +mul.f64 fd73, fd59, fd72; +mul.f64 fd74, fd42, fd72; +mul.f64 fd75, fd70, fd59; +ld.global.v2.f64 {fd76, fd77}, [rd6+2000]; +mul.f64 fd80, fd60, fd77; +mul.f64 fd81, fd43, fd77; +mul.f64 fd82, fd76, fd60; +mul.f64 fd83, fd61, fd76; +mul.f64 fd84, fd62, fd77; +sub.f64 fd85, fd83, fd84; +mul.f64 fd86, fd61, fd77; +fma.rn.f64 fd87, fd62, fd76, fd86; +mul.f64 fd88, fd52, fd87; +mul.f64 fd89, fd35, fd87; +mul.f64 fd90, fd85, fd52; +barrier.sync 0; +mad.lo.s32 r9, r7, 80, r8; +add.f64 fd91, fd26, fd25; +add.f64 fd92, fd23, fd22; +st.shared.v2.f64 [r9], {fd92, fd91}; +fma.rn.f64 fd93, fd61, fd34, fd65; +sub.f64 fd94, fd67, fd66; +st.shared.v2.f64 [r9+16], {fd93, fd94}; +fma.rn.f64 fd95, fd70, fd42, fd73; +sub.f64 fd96, fd75, fd74; +st.shared.v2.f64 [r9+32], {fd95, fd96}; +fma.rn.f64 fd97, fd76, fd43, fd80; +sub.f64 fd98, fd82, fd81; +st.shared.v2.f64 [r9+48], {fd97, fd98}; +fma.rn.f64 fd99, fd85, fd35, fd88; +sub.f64 
fd100, fd90, fd89; +st.shared.v2.f64 [r9+64], {fd99, fd100}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd101, fd102}, [r11]; +ld.shared.v2.f64 {fd105, fd106}, [r11+2000]; +ld.shared.v2.f64 {fd109, fd110}, [r11+4000]; +ld.shared.v2.f64 {fd113, fd114}, [r11+6000]; +ld.shared.v2.f64 {fd117, fd118}, [r11+8000]; +add.f64 fd121, fd105, fd117; +add.f64 fd122, fd101, fd121; +add.f64 fd123, fd109, fd113; +add.f64 fd124, fd106, fd118; +add.f64 fd125, fd102, fd124; +add.f64 fd126, fd110, fd114; +fma.rn.f64 fd127, fd121, 0d3FD3C6EF372FE950, fd101; +mul.f64 fd128, fd123, 0d3FE9E3779B97F4A8; +sub.f64 fd129, fd127, fd128; +sub.f64 fd130, fd106, fd118; +mul.f64 fd131, fd130, 0d3FEE6F0E134454FF; +sub.f64 fd132, fd110, fd114; +fma.rn.f64 fd133, fd132, 0d3FE2CF2304755A5E, fd131; +sub.f64 fd134, fd129, fd133; +add.f64 fd135, fd133, fd129; +mul.f64 fd136, fd121, 0d3FE9E3779B97F4A8; +sub.f64 fd137, fd101, fd136; +fma.rn.f64 fd138, fd123, 0d3FD3C6EF372FE950, fd137; +mul.f64 fd139, fd130, 0d3FE2CF2304755A5E; +mul.f64 fd140, fd132, 0d3FEE6F0E134454FF; +sub.f64 fd141, fd139, fd140; +sub.f64 fd142, fd138, fd141; +add.f64 fd143, fd141, fd138; +fma.rn.f64 fd144, fd124, 0d3FD3C6EF372FE950, fd102; +mul.f64 fd145, fd126, 0d3FE9E3779B97F4A8; +sub.f64 fd146, fd144, fd145; +sub.f64 fd147, fd105, fd117; +mul.f64 fd148, fd147, 0d3FEE6F0E134454FF; +sub.f64 fd149, fd109, fd113; +fma.rn.f64 fd150, fd149, 0d3FE2CF2304755A5E, fd148; +add.f64 fd151, fd150, fd146; +sub.f64 fd152, fd146, fd150; +mul.f64 fd153, fd124, 0d3FE9E3779B97F4A8; +sub.f64 fd154, fd102, fd153; +fma.rn.f64 fd155, fd126, 0d3FD3C6EF372FE950, fd154; +mul.f64 fd156, fd147, 0d3FE2CF2304755A5E; +mul.f64 fd157, fd149, 0d3FEE6F0E134454FF; +sub.f64 fd158, fd156, fd157; +add.f64 fd159, fd158, fd155; +sub.f64 fd160, fd155, fd158; +mul.wide.u32 rd7, r7, -858993459; +shr.u64 rd8, rd7, 34; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 5; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %12; 
+add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd161, fd162}, [rd11]; +mul.f64 fd165, fd151, fd162; +mul.f64 fd166, fd134, fd162; +mul.f64 fd167, fd161, fd151; +mul.f64 fd168, fd161, fd161; +mul.f64 fd169, fd162, fd162; +sub.f64 fd170, fd168, fd169; +mul.f64 fd171, fd162, fd161; +fma.rn.f64 fd172, fd162, fd161, fd171; +mul.f64 fd173, fd159, fd172; +mul.f64 fd174, fd142, fd172; +mul.f64 fd175, fd170, fd159; +ld.global.v2.f64 {fd176, fd177}, [rd11+400]; +mul.f64 fd180, fd160, fd177; +mul.f64 fd181, fd143, fd177; +mul.f64 fd182, fd176, fd160; +mul.f64 fd183, fd161, fd176; +mul.f64 fd184, fd162, fd177; +sub.f64 fd185, fd183, fd184; +mul.f64 fd186, fd161, fd177; +fma.rn.f64 fd187, fd162, fd176, fd186; +mul.f64 fd188, fd152, fd187; +mul.f64 fd189, fd135, fd187; +mul.f64 fd190, fd185, fd152; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 400, r16; +add.f64 fd191, fd126, fd125; +add.f64 fd192, fd123, fd122; +st.shared.v2.f64 [r17], {fd192, fd191}; +fma.rn.f64 fd193, fd161, fd134, fd165; +sub.f64 fd194, fd167, fd166; +st.shared.v2.f64 [r17+80], {fd193, fd194}; +fma.rn.f64 fd195, fd170, fd142, fd173; +sub.f64 fd196, fd175, fd174; +st.shared.v2.f64 [r17+160], {fd195, fd196}; +fma.rn.f64 fd197, fd176, fd143, fd180; +sub.f64 fd198, fd182, fd181; +st.shared.v2.f64 [r17+240], {fd197, fd198}; +fma.rn.f64 fd199, fd185, fd135, fd188; +sub.f64 fd200, fd190, fd189; +st.shared.v2.f64 [r17+320], {fd199, fd200}; +barrier.sync 0; +ld.shared.v2.f64 {fd201, fd202}, [r11]; +ld.shared.v2.f64 {fd205, fd206}, [r11+2000]; +ld.shared.v2.f64 {fd209, fd210}, [r11+4000]; +ld.shared.v2.f64 {fd213, fd214}, [r11+6000]; +ld.shared.v2.f64 {fd217, fd218}, [r11+8000]; +add.f64 fd221, fd205, fd217; +add.f64 fd222, fd201, fd221; +add.f64 fd223, fd209, fd213; +add.f64 fd224, fd206, fd218; +add.f64 fd225, fd202, fd224; +add.f64 fd226, fd210, fd214; +fma.rn.f64 fd227, fd221, 0d3FD3C6EF372FE950, fd201; +mul.f64 fd228, fd223, 0d3FE9E3779B97F4A8; +sub.f64 fd229, fd227, fd228; 
+sub.f64 fd230, fd206, fd218; +mul.f64 fd231, fd230, 0d3FEE6F0E134454FF; +sub.f64 fd232, fd210, fd214; +fma.rn.f64 fd233, fd232, 0d3FE2CF2304755A5E, fd231; +sub.f64 fd234, fd229, fd233; +add.f64 fd235, fd233, fd229; +mul.f64 fd236, fd221, 0d3FE9E3779B97F4A8; +sub.f64 fd237, fd201, fd236; +fma.rn.f64 fd238, fd223, 0d3FD3C6EF372FE950, fd237; +mul.f64 fd239, fd230, 0d3FE2CF2304755A5E; +mul.f64 fd240, fd232, 0d3FEE6F0E134454FF; +sub.f64 fd241, fd239, fd240; +sub.f64 fd242, fd238, fd241; +add.f64 fd243, fd241, fd238; +fma.rn.f64 fd244, fd224, 0d3FD3C6EF372FE950, fd202; +mul.f64 fd245, fd226, 0d3FE9E3779B97F4A8; +sub.f64 fd246, fd244, fd245; +sub.f64 fd247, fd205, fd217; +mul.f64 fd248, fd247, 0d3FEE6F0E134454FF; +sub.f64 fd249, fd209, fd213; +fma.rn.f64 fd250, fd249, 0d3FE2CF2304755A5E, fd248; +add.f64 fd251, fd250, fd246; +sub.f64 fd252, fd246, fd250; +mul.f64 fd253, fd224, 0d3FE9E3779B97F4A8; +sub.f64 fd254, fd202, fd253; +fma.rn.f64 fd255, fd226, 0d3FD3C6EF372FE950, fd254; +mul.f64 fd256, fd247, 0d3FE2CF2304755A5E; +mul.f64 fd257, fd249, 0d3FEE6F0E134454FF; +sub.f64 fd258, fd256, fd257; +add.f64 fd259, fd258, fd255; +sub.f64 fd260, fd255, fd258; +mul.wide.u32 rd12, r7, 1374389535; +shr.u64 rd13, rd12, 35; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 25; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %13; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd261, fd262}, [rd16]; +mul.f64 fd265, fd251, fd262; +mul.f64 fd266, fd234, fd262; +mul.f64 fd267, fd261, fd251; +mul.f64 fd268, fd261, fd261; +mul.f64 fd269, fd262, fd262; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd262, fd261; +fma.rn.f64 fd272, fd262, fd261, fd271; +mul.f64 fd273, fd259, fd272; +mul.f64 fd274, fd242, fd272; +mul.f64 fd275, fd270, fd259; +ld.global.v2.f64 {fd276, fd277}, [rd16+80]; +mul.f64 fd280, fd260, fd277; +mul.f64 fd281, fd243, fd277; +mul.f64 fd282, fd276, fd260; +mul.f64 fd283, fd261, fd276; +mul.f64 fd284, fd262, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, 
fd261, fd277; +fma.rn.f64 fd287, fd262, fd276, fd286; +mul.f64 fd288, fd252, fd287; +mul.f64 fd289, fd235, fd287; +mul.f64 fd290, fd285, fd252; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2000, r22; +add.f64 fd291, fd226, fd225; +add.f64 fd292, fd223, fd222; +st.shared.v2.f64 [r23], {fd292, fd291}; +fma.rn.f64 fd293, fd261, fd234, fd265; +sub.f64 fd294, fd267, fd266; +st.shared.v2.f64 [r23+400], {fd293, fd294}; +fma.rn.f64 fd295, fd270, fd242, fd273; +sub.f64 fd296, fd275, fd274; +st.shared.v2.f64 [r23+800], {fd295, fd296}; +fma.rn.f64 fd297, fd276, fd243, fd280; +sub.f64 fd298, fd282, fd281; +st.shared.v2.f64 [r23+1200], {fd297, fd298}; +fma.rn.f64 fd299, fd285, fd235, fd288; +sub.f64 fd300, fd290, fd289; +st.shared.v2.f64 [r23+1600], {fd299, fd300}; +barrier.sync 0; +ld.shared.v2.f64 {fd301, fd302}, [r11]; +ld.shared.v2.f64 {fd305, fd306}, [r11+2000]; +ld.shared.v2.f64 {fd309, fd310}, [r11+4000]; +ld.shared.v2.f64 {fd313, fd314}, [r11+6000]; +ld.shared.v2.f64 {fd317, fd318}, [r11+8000]; +add.f64 fd321, fd305, fd317; +add.f64 fd322, fd301, fd321; +add.f64 fd323, fd309, fd313; +add.f64 fd324, fd306, fd318; +add.f64 fd325, fd302, fd324; +add.f64 fd326, fd310, fd314; +fma.rn.f64 fd327, fd321, 0d3FD3C6EF372FE950, fd301; +mul.f64 fd328, fd323, 0d3FE9E3779B97F4A8; +sub.f64 fd329, fd327, fd328; +sub.f64 fd330, fd306, fd318; +mul.f64 fd331, fd330, 0d3FEE6F0E134454FF; +sub.f64 fd332, fd310, fd314; +fma.rn.f64 fd333, fd332, 0d3FE2CF2304755A5E, fd331; +mul.f64 fd334, fd321, 0d3FE9E3779B97F4A8; +sub.f64 fd335, fd301, fd334; +fma.rn.f64 fd336, fd323, 0d3FD3C6EF372FE950, fd335; +mul.f64 fd337, fd330, 0d3FE2CF2304755A5E; +mul.f64 fd338, fd332, 0d3FEE6F0E134454FF; +sub.f64 fd339, fd337, fd338; +fma.rn.f64 fd340, fd324, 0d3FD3C6EF372FE950, fd302; +mul.f64 fd341, fd326, 0d3FE9E3779B97F4A8; +sub.f64 fd342, fd340, fd341; +sub.f64 fd343, fd305, fd317; +mul.f64 fd344, fd343, 0d3FEE6F0E134454FF; +sub.f64 fd345, fd309, fd313; +fma.rn.f64 fd346, 
fd345, 0d3FE2CF2304755A5E, fd344; +mul.f64 fd347, fd324, 0d3FE9E3779B97F4A8; +sub.f64 fd348, fd302, fd347; +fma.rn.f64 fd349, fd326, 0d3FD3C6EF372FE950, fd348; +mul.f64 fd350, fd343, 0d3FE2CF2304755A5E; +mul.f64 fd351, fd345, 0d3FEE6F0E134454FF; +sub.f64 fd352, fd350, fd351; +add.f64 %1, fd326, fd325; +add.f64 %0, fd323, fd322; +add.f64 %3, fd346, fd342; +sub.f64 %2, fd329, fd333; +add.f64 %5, fd352, fd349; +sub.f64 %4, fd336, fd339; +sub.f64 %7, fd349, fd352; +add.f64 %6, fd339, fd336; +sub.f64 %9, fd342, fd346; +add.f64 %8, fd333, fd329; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y): "r"(smem), "l"(lut_dp_5_625), "l"(lut_dp_5_125), "l"(lut_dp_5_25), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..67ad57884dbaa --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_fwd.hpp.inc @@ -0,0 +1,14784 @@ +#ifndef CUFFTDX_FFT_64_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_64_FP16_FWD_PTX_HPP + + +/* cufftdx_private_function<787, __half2, 1>: machine-generated inline-PTX kernel body from the fft_64_fp16_fwd database; transforms the eight complex __half2 values in rmem[0..7] in place. Operands: %0-%15 are the outputs rmem[0..7].{x,y}, %16 is smem, %17-%32 are the same sixteen registers as inputs, all passed as 32-bit words through __HALF2_TO_UI. Flow: an 8-input butterfly on the registers; multiplication of seven of the eight results by successive powers of a factor built with cos.approx.f32 and a negated sin.approx.f32 of (tid.x & 7) * 0f3DC90FDB (about 2*pi/64); one exchange through shared memory (barrier.sync 0, four st.shared.v4.f32, barrier.sync 0, sixteen ld.shared.u32); then a second 8-input butterfly written to %0-%15. The same arithmetic is applied to both halves of every f16x2 register. Shared memory is addressed in 512-byte slabs selected by tid.y and tid.x / 8, so this presumably computes 64-point transforms with 8 threads each (TODO confirm against the cuFFTDx dispatch table). Because of the barrier.sync 0 pair, every thread that barrier expects has to run this body. */ +template<> __forceinline__ __device__ void cufftdx_private_function<787, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<667>; +.reg .b64 rd<2>; +mov.u32 r655, %tid.y; +shl.b32 r656, r655, 9; +mov.u32 r657, %16; +add.s32 r658, r657, r656; +mov.u32 r659, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17,
%25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f48, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, 
r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r660, r659, 7; +shl.b32 r661, r659, 6; +and.b32 r662, r661, -512; +add.s32 r663, r658, r662; +cvt.rn.f32.u32 f75, r660; +mul.f32 f76, f75, 0f3DC90FDB; +cos.approx.f32 f29, f76; +sin.approx.f32 f77, f76; +neg.f32 f30, f77; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 
r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r664, r661, 448; +add.s32 r665, r663, r664; +st.shared.v4.f32 [r665], {r149, r152, r209, r216}; +st.shared.v4.f32 [r665+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r665+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r665+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r666, r660, -56, r665; +ld.shared.u32 r460, [r666]; +ld.shared.u32 r463, [r666+4]; +ld.shared.u32 r510, [r666+64]; +ld.shared.u32 r513, [r666+68]; +ld.shared.u32 r472, [r666+128]; +ld.shared.u32 r475, [r666+132]; +ld.shared.u32 r522, [r666+192]; +ld.shared.u32 r525, [r666+196]; +ld.shared.u32 r461, [r666+256]; +ld.shared.u32 r464, [r666+260]; +ld.shared.u32 r511, [r666+320]; +ld.shared.u32 r514, [r666+324]; +ld.shared.u32 r473, [r666+384]; +ld.shared.u32 r476, [r666+388]; +ld.shared.u32 r523, [r666+448]; +ld.shared.u32 r526, [r666+452]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} 
+{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 %0, r485, r535; +} +{ +add.f16x2 %1, r488, r538; +} +{ +sub.f16x2 %8, r485, r535; +} +{ +sub.f16x2 %9, r488, r538; +} +{ +add.f16x2 %2, r497, r579; +} +{ +add.f16x2 %3, r500, r585; +} +{ +sub.f16x2 %10, r497, r579; +} +{ +sub.f16x2 %11, r500, r585; +} +{ +add.f16x2 %4, r491, r544; +} +{ +add.f16x2 %5, r494, r589; +} +{ +sub.f16x2 %12, r491, r544; +} +{ +sub.f16x2 %13, r494, r589; +} +{ +add.f16x2 %6, r503, r597; +} +{ +add.f16x2 %7, r506, r603; +} +{ +sub.f16x2 %14, r503, r597; +} +{ +sub.f16x2 %15, r506, r603; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), 
"=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<788, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<667>; +.reg .b64 rd<2>; +mov.u32 r655, %tid.y; +shl.b32 r656, r655, 8; +mov.u32 r657, %16; +add.s32 r658, r657, r656; +mov.u32 r659, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 
f48, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r101, {low, high}; +} +mov.f32 f58, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f44, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r660, r659, 7; +shl.b32 r661, r659, 5; +and.b32 r662, r661, -256; +add.s32 r663, r658, r662; +cvt.rn.f32.u32 f75, r660; +mul.f32 f76, f75, 0f3DC90FDB; +cos.approx.f32 f29, f76; +sin.approx.f32 f77, f76; +neg.f32 f30, f77; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} 
+{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r664, r661, 224; +add.s32 r665, r663, r664; +st.shared.v4.f32 [r665], {r149, r209, r246, r283}; +st.shared.v4.f32 [r665+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r666, r660, -28, r665; +ld.shared.u32 r460, [r666]; +ld.shared.u32 r510, [r666+32]; +ld.shared.u32 r472, [r666+64]; +ld.shared.u32 r522, [r666+96]; +ld.shared.u32 r461, [r666+128]; +ld.shared.u32 r511, [r666+160]; +ld.shared.u32 r473, [r666+192]; +ld.shared.u32 r523, [r666+224]; +barrier.sync 0; +st.shared.v4.f32 [r665], {r152, r216, r253, r290}; +st.shared.v4.f32 [r665+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r666]; +ld.shared.u32 r513, [r666+32]; +ld.shared.u32 r475, [r666+64]; +ld.shared.u32 r525, [r666+96]; +ld.shared.u32 r464, [r666+128]; +ld.shared.u32 r514, [r666+160]; +ld.shared.u32 r476, [r666+192]; 
+ld.shared.u32 r526, [r666+224]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f48; +cvt.rn.f16.f32 high, f48; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} 
+{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 %0, r485, r535; +} +{ +add.f16x2 %1, r488, r538; +} +{ +sub.f16x2 %8, r485, r535; +} +{ +sub.f16x2 %9, r488, r538; +} +{ +add.f16x2 %2, r497, r579; +} +{ +add.f16x2 %3, r500, r585; +} +{ +sub.f16x2 %10, r497, r579; +} +{ +sub.f16x2 %11, r500, r585; +} +{ +add.f16x2 %4, r491, r544; +} +{ +add.f16x2 %5, r494, r589; +} +{ +sub.f16x2 %12, r491, r544; +} +{ +sub.f16x2 %13, r494, r589; +} +{ +add.f16x2 %6, r503, r597; +} +{ +add.f16x2 %7, r506, r603; +} +{ +sub.f16x2 %14, r503, r597; +} +{ +sub.f16x2 %15, r506, r603; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<789, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<399>; +.reg .b64 rd<2>; +mov.u32 r379, %tid.y; +shl.b32 r380, r379, 9; +mov.u32 r381, %8; +add.s32 r382, r381, r380; +mov.u32 r383, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 
r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r384, r383, 15; +shl.b32 r385, r383, 5; +and.b32 r386, r385, -512; +add.s32 r387, r382, r386; +cvt.rn.f32.u32 f21, r384; +mul.f32 f22, f21, 0f3DC90FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, 
r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r388, r385, 480; +add.s32 r389, r387, r388; +st.shared.v4.f32 [r389], {r27, r30, r63, r70}; +st.shared.v4.f32 [r389+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r390, r384, -24, r389; +ld.shared.u32 r166, [r390]; +ld.shared.u32 r169, [r390+4]; +ld.shared.u32 r178, [r390+128]; +ld.shared.u32 r181, [r390+132]; +ld.shared.u32 r167, [r390+256]; +ld.shared.u32 r170, [r390+260]; +ld.shared.u32 r179, [r390+384]; +ld.shared.u32 r182, [r390+388]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r391, r383, 12; +bfe.u32 r392, r383, 2, 2; +cvt.rn.f32.u32 f24, r392; +mul.f32 f25, 
f24, 0f3EC90FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; 
+mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +shl.b32 r393, r383, 3; +and.b32 r394, r393, 24; +add.s32 r395, r387, r394; +barrier.sync 0; +and.b32 r396, r385, 384; +add.s32 r397, r395, r396; +st.shared.u32 [r397], r191; +st.shared.u32 [r397+4], r194; +st.shared.u32 [r397+32], r227; +st.shared.u32 [r397+36], r234; +st.shared.u32 [r397+64], r264; +st.shared.u32 [r397+68], r271; +st.shared.u32 [r397+96], r301; +st.shared.u32 [r397+100], r308; +barrier.sync 0; +mad.lo.s32 r398, r391, -24, r397; +ld.shared.u32 r330, [r398]; +ld.shared.u32 r333, [r398+4]; +ld.shared.u32 r342, [r398+128]; +ld.shared.u32 r345, [r398+132]; +ld.shared.u32 r331, [r398+256]; +ld.shared.u32 r334, [r398+260]; +ld.shared.u32 r343, [r398+384]; +ld.shared.u32 r346, [r398+388]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 %0, r329, r341; +} +{ +add.f16x2 %1, r332, r344; +} +{ +sub.f16x2 %4, r329, r341; +} +{ +sub.f16x2 %5, r332, r344; +} +{ +add.f16x2 %2, r335, r350; +} +{ +add.f16x2 %3, r338, r353; +} +{ +sub.f16x2 %6, r335, r350; +} +{ +sub.f16x2 %7, r338, r353; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), 
"r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<790, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<399>; +.reg .b64 rd<2>; +mov.u32 r379, %tid.y; +shl.b32 r380, r379, 8; +mov.u32 r381, %8; +add.s32 r382, r381, r380; +mov.u32 r383, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r384, r383, 15; +shl.b32 r385, r383, 4; +and.b32 r386, r385, -256; +add.s32 r387, r382, r386; +cvt.rn.f32.u32 f21, r384; +mul.f32 f22, f21, 0f3DC90FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r388, r385, 240; +add.s32 r389, r387, r388; +st.shared.v4.f32 [r389], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r390, r384, -12, r389; +ld.shared.u32 r166, [r390]; +ld.shared.u32 r178, [r390+64]; +ld.shared.u32 r167, [r390+128]; +ld.shared.u32 r179, [r390+192]; +barrier.sync 0; +st.shared.v4.f32 [r389], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r390]; +ld.shared.u32 r181, [r390+64]; +ld.shared.u32 r170, [r390+128]; +ld.shared.u32 r182, [r390+192]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, 
r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r183; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r186; +} +{ +add.f16x2 r206, r174, r189; +} +{ +sub.f16x2 r209, r171, r186; +} +{ +sub.f16x2 r212, r174, r189; +} +and.b32 r391, r383, 12; +bfe.u32 r392, r383, 2, 2; +shl.b32 r393, r383, 2; +and.b32 r394, r393, 12; +add.s32 r395, r387, r394; +cvt.rn.f32.u32 f24, r392; +mul.f32 f25, f24, 0f3EC90FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +neg.f16x2 r225, r222; +} +{ +fma.rn.f16x2 r227, r203, r218, r225; +} +{ +mul.f16x2 r231, r203, r220; +} +{ +fma.rn.f16x2 r234, r206, r218, r231; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +neg.f16x2 r262, r259; +} +{ +fma.rn.f16x2 r264, r197, r255, r262; +} +{ +mul.f16x2 r268, r197, r257; +} +{ +fma.rn.f16x2 r271, r200, r255, r268; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +neg.f16x2 r299, r296; +} +{ +fma.rn.f16x2 r301, r209, r292, r299; +} +{ +mul.f16x2 r305, r209, r294; +} +{ +fma.rn.f16x2 r308, r212, r292, r305; +} +barrier.sync 0; +and.b32 r396, r385, 192; +add.s32 r397, r395, r396; +st.shared.u32 [r397], r191; +st.shared.u32 [r397+16], r227; +st.shared.u32 [r397+32], r264; +st.shared.u32 [r397+48], r301; +barrier.sync 0; +mad.lo.s32 r398, r391, -12, r397; +ld.shared.u32 r330, [r398]; +ld.shared.u32 r342, [r398+64]; +ld.shared.u32 r331, [r398+128]; +ld.shared.u32 r343, [r398+192]; +barrier.sync 0; +st.shared.u32 [r397], r194; +st.shared.u32 [r397+16], r234; +st.shared.u32 [r397+32], r271; +st.shared.u32 [r397+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r398]; +ld.shared.u32 r345, [r398+64]; +ld.shared.u32 r334, [r398+128]; +ld.shared.u32 r346, [r398+192]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r347; +} +{ +add.f16x2 %0, r329, r341; +} +{ +add.f16x2 %1, r332, r344; +} +{ +sub.f16x2 %4, r329, r341; +} +{ +sub.f16x2 %5, r332, r344; +} +{ +add.f16x2 %2, r335, r350; +} +{ +add.f16x2 %3, r338, r353; +} +{ +sub.f16x2 %6, r335, r350; +} +{ +sub.f16x2 %7, r338, r353; +} +})" + : 
"=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<791, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1387>; +.reg .b64 rd<2>; +mov.u32 r1375, %tid.y; +shl.b32 r1376, r1375, 9; +mov.u32 r1377, %32; +add.s32 r1378, r1377, r1376; +mov.u32 r1379, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f62, 0f3F3504F3; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r101, {low, high}; +} +mov.f32 f80, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ 
+add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ 
+mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1380, r1379, 3; +shl.b32 r1381, r1379, 7; +and.b32 r1382, r1381, -512; +add.s32 r1383, r1378, r1382; +cvt.rn.f32.u32 f151, r1380; +mul.f32 f152, f151, 0f3DC90FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, 
r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} 
+{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ 
+mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ 
+fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 
r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1384, r1381, 384; +add.s32 r1385, r1383, r1384; +st.shared.v4.f32 [r1385], {r521, r524, r629, r636}; +st.shared.v4.f32 [r1385+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r1385+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r1385+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r1385+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r1385+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r1385+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r1385+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r1386, r1380, -120, r1385; +ld.shared.u32 r1176, [r1386]; +ld.shared.u32 r1179, [r1386+4]; +ld.shared.u32 r1226, [r1386+32]; +ld.shared.u32 r1229, [r1386+36]; +ld.shared.u32 r1276, [r1386+64]; +ld.shared.u32 r1279, [r1386+68]; +ld.shared.u32 r1326, [r1386+96]; +ld.shared.u32 r1329, [r1386+100]; +ld.shared.u32 r1188, [r1386+128]; +ld.shared.u32 r1191, [r1386+132]; +ld.shared.u32 r1238, [r1386+160]; +ld.shared.u32 r1241, [r1386+164]; +ld.shared.u32 r1288, [r1386+192]; +ld.shared.u32 r1291, [r1386+196]; +ld.shared.u32 r1338, [r1386+224]; +ld.shared.u32 r1341, [r1386+228]; +ld.shared.u32 r1177, [r1386+256]; +ld.shared.u32 r1180, [r1386+260]; +ld.shared.u32 r1227, [r1386+288]; +ld.shared.u32 r1230, [r1386+292]; +ld.shared.u32 r1277, [r1386+320]; +ld.shared.u32 r1280, [r1386+324]; +ld.shared.u32 r1327, [r1386+352]; +ld.shared.u32 r1330, [r1386+356]; +ld.shared.u32 r1189, [r1386+384]; +ld.shared.u32 r1192, [r1386+388]; +ld.shared.u32 r1239, [r1386+416]; +ld.shared.u32 r1242, [r1386+420]; +ld.shared.u32 r1289, [r1386+448]; +ld.shared.u32 r1292, [r1386+452]; +ld.shared.u32 r1339, [r1386+480]; +ld.shared.u32 r1342, [r1386+484]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ 
+add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 %0, r1175, r1187; +} +{ +add.f16x2 %1, r1178, r1190; +} +{ +sub.f16x2 %16, r1175, r1187; +} +{ +sub.f16x2 %17, r1178, r1190; +} +{ +add.f16x2 %8, r1181, r1196; +} +{ +add.f16x2 %9, r1184, r1199; +} +{ +sub.f16x2 %24, r1181, r1196; +} +{ +sub.f16x2 %25, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 %2, r1225, r1237; +} +{ +add.f16x2 %3, r1228, r1240; +} +{ +sub.f16x2 %18, r1225, r1237; +} +{ +sub.f16x2 %19, r1228, r1240; +} +{ +add.f16x2 %10, r1231, r1246; +} +{ +add.f16x2 %11, r1234, r1249; +} +{ +sub.f16x2 %26, r1231, r1246; +} +{ +sub.f16x2 %27, r1234, r1249; +} +{ +add.f16x2 r1275, r1276, r1277; +} +{ +add.f16x2 r1278, r1279, r1280; +} +{ +sub.f16x2 r1281, r1276, r1277; +} +{ +sub.f16x2 r1284, r1279, r1280; +} +{ +add.f16x2 r1287, r1288, r1289; +} +{ +add.f16x2 r1290, r1291, r1292; +} +{ +sub.f16x2 r1293, r1288, r1289; +} +{ +sub.f16x2 r1296, r1291, r1292; +} +{ +neg.f16x2 r1299, r1293; +} +{ +add.f16x2 %4, r1275, r1287; +} +{ +add.f16x2 %5, r1278, r1290; +} +{ +sub.f16x2 %20, r1275, r1287; +} +{ +sub.f16x2 %21, r1278, r1290; +} +{ +add.f16x2 %12, r1281, r1296; +} +{ +add.f16x2 %13, r1284, r1299; +} +{ +sub.f16x2 %28, r1281, r1296; +} +{ +sub.f16x2 %29, r1284, r1299; +} +{ +add.f16x2 r1325, r1326, r1327; +} +{ +add.f16x2 r1328, r1329, r1330; +} +{ +sub.f16x2 r1331, r1326, r1327; +} +{ +sub.f16x2 r1334, r1329, r1330; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1342; +} +{ +sub.f16x2 r1343, r1338, r1339; +} +{ +sub.f16x2 r1346, r1341, 
r1342; +} +{ +neg.f16x2 r1349, r1343; +} +{ +add.f16x2 %6, r1325, r1337; +} +{ +add.f16x2 %7, r1328, r1340; +} +{ +sub.f16x2 %22, r1325, r1337; +} +{ +sub.f16x2 %23, r1328, r1340; +} +{ +add.f16x2 %14, r1331, r1346; +} +{ +add.f16x2 %15, r1334, r1349; +} +{ +sub.f16x2 %30, r1331, r1346; +} +{ +sub.f16x2 %31, r1334, r1349; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), 
"r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<792, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1387>; +.reg .b64 rd<2>; +mov.u32 r1375, %tid.y; +shl.b32 r1376, r1375, 8; +mov.u32 r1377, %32; +add.s32 r1378, r1377, r1376; +mov.u32 r1379, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f62, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r101, {low, high}; +} +mov.f32 f80, 
0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f148, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ 
+sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f58, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r393, {low, high}; +} 
+mov.f32 f84, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +mov.f32 f66, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f66; +cvt.rn.f16.f32 high, f66; +mov.b32 r397, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} 
+{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1380, r1379, 3; +shl.b32 r1381, r1379, 6; +and.b32 r1382, r1381, -256; +add.s32 r1383, r1378, r1382; +cvt.rn.f32.u32 f151, r1380; +mul.f32 f152, f151, 0f3DC90FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, 
{high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} 
+{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 
r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 
r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ 
+fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r1384, r1381, 192; +add.s32 r1385, r1383, r1384; +st.shared.v4.f32 [r1385], {r521, r629, r666, r703}; +st.shared.v4.f32 [r1385+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r1385+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r1385+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r1386, r1380, -60, r1385; +ld.shared.u32 r1176, [r1386]; +ld.shared.u32 r1226, [r1386+16]; +ld.shared.u32 r1276, [r1386+32]; +ld.shared.u32 r1326, [r1386+48]; +ld.shared.u32 r1188, [r1386+64]; +ld.shared.u32 r1238, [r1386+80]; +ld.shared.u32 r1288, [r1386+96]; +ld.shared.u32 r1338, [r1386+112]; +ld.shared.u32 r1177, [r1386+128]; +ld.shared.u32 r1227, [r1386+144]; +ld.shared.u32 r1277, [r1386+160]; +ld.shared.u32 r1327, [r1386+176]; +ld.shared.u32 r1189, [r1386+192]; +ld.shared.u32 r1239, [r1386+208]; +ld.shared.u32 r1289, [r1386+224]; +ld.shared.u32 r1339, [r1386+240]; +barrier.sync 0; +st.shared.v4.f32 [r1385], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1385+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1385+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1385+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1386]; +ld.shared.u32 r1229, [r1386+16]; +ld.shared.u32 r1279, [r1386+32]; +ld.shared.u32 r1329, [r1386+48]; +ld.shared.u32 r1191, [r1386+64]; +ld.shared.u32 r1241, [r1386+80]; +ld.shared.u32 r1291, [r1386+96]; +ld.shared.u32 r1341, [r1386+112]; +ld.shared.u32 r1180, [r1386+128]; +ld.shared.u32 r1230, [r1386+144]; +ld.shared.u32 r1280, [r1386+160]; +ld.shared.u32 r1330, [r1386+176]; +ld.shared.u32 r1192, [r1386+192]; +ld.shared.u32 r1242, [r1386+208]; +ld.shared.u32 r1292, [r1386+224]; +ld.shared.u32 r1342, [r1386+240]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ 
+sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 %0, r1175, r1187; +} +{ +add.f16x2 %1, r1178, r1190; +} +{ +sub.f16x2 %16, r1175, r1187; +} +{ +sub.f16x2 %17, r1178, r1190; +} +{ +add.f16x2 %8, r1181, r1196; +} +{ +add.f16x2 %9, r1184, r1199; +} +{ +sub.f16x2 %24, r1181, r1196; +} +{ +sub.f16x2 %25, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 %2, r1225, r1237; +} +{ +add.f16x2 %3, r1228, r1240; +} +{ +sub.f16x2 %18, r1225, r1237; +} +{ +sub.f16x2 %19, r1228, r1240; +} +{ +add.f16x2 %10, r1231, r1246; +} +{ +add.f16x2 %11, r1234, r1249; +} +{ +sub.f16x2 %26, r1231, r1246; +} +{ +sub.f16x2 %27, r1234, r1249; +} +{ +add.f16x2 r1275, r1276, r1277; +} +{ +add.f16x2 r1278, r1279, r1280; +} +{ +sub.f16x2 r1281, r1276, r1277; +} +{ +sub.f16x2 r1284, r1279, r1280; +} +{ +add.f16x2 r1287, r1288, r1289; +} +{ +add.f16x2 r1290, r1291, r1292; +} +{ +sub.f16x2 r1293, r1288, r1289; +} +{ +sub.f16x2 r1296, r1291, r1292; +} +{ +neg.f16x2 r1299, r1293; +} +{ +add.f16x2 %4, r1275, r1287; +} +{ +add.f16x2 %5, r1278, r1290; +} +{ +sub.f16x2 %20, r1275, r1287; +} +{ +sub.f16x2 %21, r1278, r1290; +} +{ +add.f16x2 %12, r1281, r1296; +} +{ +add.f16x2 %13, r1284, r1299; +} +{ +sub.f16x2 %28, r1281, r1296; +} +{ +sub.f16x2 %29, r1284, r1299; +} +{ +add.f16x2 r1325, r1326, r1327; +} +{ +add.f16x2 r1328, r1329, r1330; +} +{ +sub.f16x2 r1331, r1326, r1327; +} +{ +sub.f16x2 r1334, r1329, r1330; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1342; +} +{ +sub.f16x2 r1343, r1338, r1339; +} +{ +sub.f16x2 r1346, r1341, r1342; +} +{ +neg.f16x2 r1349, r1343; +} +{ +add.f16x2 %6, r1325, r1337; +} 
+{ +add.f16x2 %7, r1328, r1340; +} +{ +sub.f16x2 %22, r1325, r1337; +} +{ +sub.f16x2 %23, r1328, r1340; +} +{ +add.f16x2 %14, r1331, r1346; +} +{ +add.f16x2 %15, r1334, r1349; +} +{ +sub.f16x2 %30, r1331, r1346; +} +{ +sub.f16x2 %31, r1334, r1349; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), 
"r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<793, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3131>; +.reg .b64 rd<3>; +mov.u32 r3055, %tid.y; +shl.b32 r3056, r3055, 9; +mov.u32 r3057, %64; +add.s32 r3058, r3057, r3056; +mov.u32 r3059, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f246, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r101, {low, high}; +} +mov.f32 f280, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 
high, f280; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, 
%89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; 
+mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ 
+mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ 
+sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, 
r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, 
r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; 
+} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 
r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 
r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, 
r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 
r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3060, r3059, 1; +shl.b32 r3061, r3059, 8; +and.b32 r3062, r3061, -512; +add.s32 r3063, r3058, r3062; +cvt.rn.f32.u32 f423, r3060; +mul.f32 f424, f423, 0f3DC90FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ 
+mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 
r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; 
+cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, 
r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ 
+fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 
r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ 
+mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; 
+mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, 
r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3064, r3061, 256; +add.s32 r3065, r3063, r3064; +st.shared.v4.f32 [r3065], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r3065+16], {r1762, r1769, r1799, r1806}; 
+st.shared.v4.f32 [r3065+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r3065+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r3065+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r3065+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r3065+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r3065+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r3065+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r3065+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r3065+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r3065+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r3065+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r3065+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r3065+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r3065+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r3066, r3060, -248, r3065; +ld.shared.u32 r2864, [r3066]; +ld.shared.u32 r2867, [r3066+4]; +ld.shared.u32 r2876, [r3066+16]; +ld.shared.u32 r2879, [r3066+20]; +ld.shared.u32 r2888, [r3066+32]; +ld.shared.u32 r2891, [r3066+36]; +ld.shared.u32 r2900, [r3066+48]; +ld.shared.u32 r2903, [r3066+52]; +ld.shared.u32 r2912, [r3066+64]; +ld.shared.u32 r2915, [r3066+68]; +ld.shared.u32 r2924, [r3066+80]; +ld.shared.u32 r2927, [r3066+84]; +ld.shared.u32 r2936, [r3066+96]; +ld.shared.u32 r2939, [r3066+100]; +ld.shared.u32 r2948, [r3066+112]; +ld.shared.u32 r2951, [r3066+116]; +ld.shared.u32 r2960, [r3066+128]; +ld.shared.u32 r2963, [r3066+132]; +ld.shared.u32 r2972, [r3066+144]; +ld.shared.u32 r2975, [r3066+148]; +ld.shared.u32 r2984, [r3066+160]; +ld.shared.u32 r2987, [r3066+164]; +ld.shared.u32 r2996, [r3066+176]; +ld.shared.u32 r2999, [r3066+180]; +ld.shared.u32 r3008, [r3066+192]; +ld.shared.u32 r3011, [r3066+196]; +ld.shared.u32 r3020, [r3066+208]; +ld.shared.u32 r3023, [r3066+212]; +ld.shared.u32 r3032, [r3066+224]; +ld.shared.u32 r3035, [r3066+228]; +ld.shared.u32 r3044, [r3066+240]; +ld.shared.u32 r3047, 
[r3066+244]; +ld.shared.u32 r2865, [r3066+256]; +ld.shared.u32 r2868, [r3066+260]; +ld.shared.u32 r2877, [r3066+272]; +ld.shared.u32 r2880, [r3066+276]; +ld.shared.u32 r2889, [r3066+288]; +ld.shared.u32 r2892, [r3066+292]; +ld.shared.u32 r2901, [r3066+304]; +ld.shared.u32 r2904, [r3066+308]; +ld.shared.u32 r2913, [r3066+320]; +ld.shared.u32 r2916, [r3066+324]; +ld.shared.u32 r2925, [r3066+336]; +ld.shared.u32 r2928, [r3066+340]; +ld.shared.u32 r2937, [r3066+352]; +ld.shared.u32 r2940, [r3066+356]; +ld.shared.u32 r2949, [r3066+368]; +ld.shared.u32 r2952, [r3066+372]; +ld.shared.u32 r2961, [r3066+384]; +ld.shared.u32 r2964, [r3066+388]; +ld.shared.u32 r2973, [r3066+400]; +ld.shared.u32 r2976, [r3066+404]; +ld.shared.u32 r2985, [r3066+416]; +ld.shared.u32 r2988, [r3066+420]; +ld.shared.u32 r2997, [r3066+432]; +ld.shared.u32 r3000, [r3066+436]; +ld.shared.u32 r3009, [r3066+448]; +ld.shared.u32 r3012, [r3066+452]; +ld.shared.u32 r3021, [r3066+464]; +ld.shared.u32 r3024, [r3066+468]; +ld.shared.u32 r3033, [r3066+480]; +ld.shared.u32 r3036, [r3066+484]; +ld.shared.u32 r3045, [r3066+496]; +ld.shared.u32 r3048, [r3066+500]; +{ +add.f16x2 %0, r2864, r2865; +} +{ +add.f16x2 %1, r2867, r2868; +} +{ +sub.f16x2 %32, r2864, r2865; +} +{ +sub.f16x2 %33, r2867, r2868; +} +{ +add.f16x2 %2, r2876, r2877; +} +{ +add.f16x2 %3, r2879, r2880; +} +{ +sub.f16x2 %34, r2876, r2877; +} +{ +sub.f16x2 %35, r2879, r2880; +} +{ +add.f16x2 %4, r2888, r2889; +} +{ +add.f16x2 %5, r2891, r2892; +} +{ +sub.f16x2 %36, r2888, r2889; +} +{ +sub.f16x2 %37, r2891, r2892; +} +{ +add.f16x2 %6, r2900, r2901; +} +{ +add.f16x2 %7, r2903, r2904; +} +{ +sub.f16x2 %38, r2900, r2901; +} +{ +sub.f16x2 %39, r2903, r2904; +} +{ +add.f16x2 %8, r2912, r2913; +} +{ +add.f16x2 %9, r2915, r2916; +} +{ +sub.f16x2 %40, r2912, r2913; +} +{ +sub.f16x2 %41, r2915, r2916; +} +{ +add.f16x2 %10, r2924, r2925; +} +{ +add.f16x2 %11, r2927, r2928; +} +{ +sub.f16x2 %42, r2924, r2925; +} +{ +sub.f16x2 %43, r2927, r2928; +} +{ 
+add.f16x2 %12, r2936, r2937; +} +{ +add.f16x2 %13, r2939, r2940; +} +{ +sub.f16x2 %44, r2936, r2937; +} +{ +sub.f16x2 %45, r2939, r2940; +} +{ +add.f16x2 %14, r2948, r2949; +} +{ +add.f16x2 %15, r2951, r2952; +} +{ +sub.f16x2 %46, r2948, r2949; +} +{ +sub.f16x2 %47, r2951, r2952; +} +{ +add.f16x2 %16, r2960, r2961; +} +{ +add.f16x2 %17, r2963, r2964; +} +{ +sub.f16x2 %48, r2960, r2961; +} +{ +sub.f16x2 %49, r2963, r2964; +} +{ +add.f16x2 %18, r2972, r2973; +} +{ +add.f16x2 %19, r2975, r2976; +} +{ +sub.f16x2 %50, r2972, r2973; +} +{ +sub.f16x2 %51, r2975, r2976; +} +{ +add.f16x2 %20, r2984, r2985; +} +{ +add.f16x2 %21, r2987, r2988; +} +{ +sub.f16x2 %52, r2984, r2985; +} +{ +sub.f16x2 %53, r2987, r2988; +} +{ +add.f16x2 %22, r2996, r2997; +} +{ +add.f16x2 %23, r2999, r3000; +} +{ +sub.f16x2 %54, r2996, r2997; +} +{ +sub.f16x2 %55, r2999, r3000; +} +{ +add.f16x2 %24, r3008, r3009; +} +{ +add.f16x2 %25, r3011, r3012; +} +{ +sub.f16x2 %56, r3008, r3009; +} +{ +sub.f16x2 %57, r3011, r3012; +} +{ +add.f16x2 %26, r3020, r3021; +} +{ +add.f16x2 %27, r3023, r3024; +} +{ +sub.f16x2 %58, r3020, r3021; +} +{ +sub.f16x2 %59, r3023, r3024; +} +{ +add.f16x2 %28, r3032, r3033; +} +{ +add.f16x2 %29, r3035, r3036; +} +{ +sub.f16x2 %60, r3032, r3033; +} +{ +sub.f16x2 %61, r3035, r3036; +} +{ +add.f16x2 %30, r3044, r3045; +} +{ +add.f16x2 %31, r3047, r3048; +} +{ +sub.f16x2 %62, r3044, r3045; +} +{ +sub.f16x2 %63, r3047, r3048; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), 
"=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), 
"r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<794, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<46>; +.reg .b32 r<315>; +.reg .b64 rd<2>; +mov.u32 r273, %tid.y; +shl.b32 r274, r273, 9; +mov.u32 r275, %4; +add.s32 r276, r275, r274; +mov.u32 r277, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ 
+add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r278, r277, 31; +shl.b32 r279, r277, 4; +and.b32 r280, r279, -512; +add.s32 r281, r276, r280; +cvt.rn.f32.u32 f31, r278; +mul.f32 f32, f31, 0f3DC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r282, r279, 496; +add.s32 r283, r281, r282; +st.shared.v2.f32 [r283], {r1, r4}; +st.shared.v2.f32 [r283+8], {r25, r32}; +barrier.sync 0; +shl.b32 r284, r277, 3; +and.b32 r285, r284, 248; +sub.s32 r286, r283, r285; +ld.shared.u32 r54, [r286]; +ld.shared.u32 r57, [r286+4]; +ld.shared.u32 r55, [r286+256]; +ld.shared.u32 r58, [r286+260]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r287, r277, 1, 4; +cvt.rn.f32.u32 f34, r287; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f7, f35; +sin.approx.f32 f36, f35; +neg.f32 f8, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r288, r284, 8; +add.s32 r289, r281, r288; +barrier.sync 0; +and.b32 r290, r279, 480; +add.s32 r291, r289, r290; +st.shared.u32 [r291], r53; 
+st.shared.u32 [r291+4], r56; +st.shared.u32 [r291+16], r77; +st.shared.u32 [r291+20], r84; +barrier.sync 0; +and.b32 r292, r284, 240; +sub.s32 r293, r291, r292; +ld.shared.u32 r106, [r293]; +ld.shared.u32 r109, [r293+4]; +ld.shared.u32 r107, [r293+256]; +ld.shared.u32 r110, [r293+260]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r294, r277, 2, 3; +cvt.rn.f32.u32 f37, r294; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +and.b32 r295, r284, 24; +add.s32 r296, r281, r295; +barrier.sync 0; +and.b32 r297, r279, 448; +add.s32 r298, r296, r297; +st.shared.u32 [r298], r105; +st.shared.u32 [r298+4], r108; +st.shared.u32 [r298+32], r129; +st.shared.u32 [r298+36], r136; +barrier.sync 0; +and.b32 r299, r284, 224; +sub.s32 r300, r298, r299; +ld.shared.u32 r158, [r300]; +ld.shared.u32 r161, [r300+4]; +ld.shared.u32 r159, [r300+256]; +ld.shared.u32 r162, [r300+260]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r301, r277, 3, 2; +cvt.rn.f32.u32 f40, r301; +mul.f32 f41, f40, 0f3F490FDB; +cos.approx.f32 f19, f41; +sin.approx.f32 f42, f41; +neg.f32 f20, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +and.b32 r302, r284, 56; +add.s32 r303, r281, r302; +barrier.sync 0; +and.b32 r304, r279, 384; +add.s32 r305, r303, r304; +st.shared.u32 [r305], r157; +st.shared.u32 [r305+4], r160; +st.shared.u32 [r305+64], r181; +st.shared.u32 [r305+68], r188; +barrier.sync 0; +and.b32 r306, r284, 192; +sub.s32 r307, r305, r306; +ld.shared.u32 r210, [r307]; +ld.shared.u32 r213, [r307+4]; +ld.shared.u32 r211, [r307+256]; +ld.shared.u32 r214, [r307+260]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r308, r277, 4, 1; +cvt.rn.f32.u32 f43, r308; +mul.f32 f44, f43, 0f3FC90FDB; +cos.approx.f32 f25, f44; +sin.approx.f32 f45, f44; +neg.f32 f26, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +and.b32 r309, r284, 120; +add.s32 r310, r281, r309; +barrier.sync 0; +and.b32 r311, r279, 256; +add.s32 r312, r310, r311; +st.shared.u32 [r312], r209; +st.shared.u32 [r312+4], r212; +st.shared.u32 [r312+128], r233; +st.shared.u32 [r312+132], r240; +barrier.sync 0; +and.b32 r313, r284, 128; +sub.s32 r314, r312, r313; +ld.shared.u32 r262, [r314]; +ld.shared.u32 r265, [r314+4]; +ld.shared.u32 r263, [r314+256]; +ld.shared.u32 r266, [r314+260]; +{ +add.f16x2 %0, r262, r263; +} +{ +add.f16x2 %1, r265, r266; +} +{ +sub.f16x2 %2, r262, r263; +} 
+{ +sub.f16x2 %3, r265, r266; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<795, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<46>; +.reg .b32 r<315>; +.reg .b64 rd<2>; +mov.u32 r273, %tid.y; +shl.b32 r274, r273, 8; +mov.u32 r275, %4; +add.s32 r276, r275, r274; +mov.u32 r277, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r278, r277, 31; +shl.b32 r279, r277, 3; +and.b32 r280, r279, -256; +add.s32 r281, r276, r280; +cvt.rn.f32.u32 f31, r278; +mul.f32 f32, f31, 0f3DC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r282, r279, 248; +add.s32 r283, r281, r282; +st.shared.v2.f32 [r283], {r1, r25}; +barrier.sync 0; +shl.b32 r284, r277, 2; +and.b32 r285, r284, 124; +sub.s32 r286, r283, r285; +ld.shared.u32 r54, [r286]; +ld.shared.u32 r55, [r286+128]; +barrier.sync 0; +st.shared.v2.f32 [r283], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r286]; +ld.shared.u32 r58, [r286+128]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r287, r277, 1, 4; +and.b32 
r288, r284, 4; +add.s32 r289, r281, r288; +cvt.rn.f32.u32 f34, r287; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f7, f35; +sin.approx.f32 f36, f35; +neg.f32 f8, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r290, r279, 240; +add.s32 r291, r289, r290; +st.shared.u32 [r291], r53; +st.shared.u32 [r291+8], r77; +barrier.sync 0; +and.b32 r292, r284, 120; +sub.s32 r293, r291, r292; +ld.shared.u32 r106, [r293]; +ld.shared.u32 r107, [r293+128]; +barrier.sync 0; +st.shared.u32 [r291], r56; +st.shared.u32 [r291+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r293]; +ld.shared.u32 r110, [r293+128]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r294, r277, 2, 3; +and.b32 r295, r284, 12; +add.s32 r296, r281, r295; +cvt.rn.f32.u32 f37, r294; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +neg.f16x2 r127, r124; +} +{ +fma.rn.f16x2 r129, r111, r120, r127; +} +{ +mul.f16x2 r133, r111, r122; +} +{ +fma.rn.f16x2 r136, r114, r120, r133; +} +barrier.sync 0; +and.b32 r297, r279, 224; +add.s32 r298, r296, r297; +st.shared.u32 [r298], r105; +st.shared.u32 [r298+16], r129; +barrier.sync 0; +and.b32 r299, 
r284, 112; +sub.s32 r300, r298, r299; +ld.shared.u32 r158, [r300]; +ld.shared.u32 r159, [r300+128]; +barrier.sync 0; +st.shared.u32 [r298], r108; +st.shared.u32 [r298+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r300]; +ld.shared.u32 r162, [r300+128]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r301, r277, 3, 2; +and.b32 r302, r284, 28; +add.s32 r303, r281, r302; +cvt.rn.f32.u32 f40, r301; +mul.f32 f41, f40, 0f3F490FDB; +cos.approx.f32 f19, f41; +sin.approx.f32 f42, f41; +neg.f32 f20, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +neg.f16x2 r179, r176; +} +{ +fma.rn.f16x2 r181, r163, r172, r179; +} +{ +mul.f16x2 r185, r163, r174; +} +{ +fma.rn.f16x2 r188, r166, r172, r185; +} +barrier.sync 0; +and.b32 r304, r279, 192; +add.s32 r305, r303, r304; +st.shared.u32 [r305], r157; +st.shared.u32 [r305+32], r181; +barrier.sync 0; +and.b32 r306, r284, 96; +sub.s32 r307, r305, r306; +ld.shared.u32 r210, [r307]; +ld.shared.u32 r211, [r307+128]; +barrier.sync 0; +st.shared.u32 [r305], r160; +st.shared.u32 [r305+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r307]; +ld.shared.u32 r214, [r307+128]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r308, r277, 4, 1; +and.b32 r309, r284, 60; +add.s32 r310, r281, r309; +cvt.rn.f32.u32 f43, r308; +mul.f32 f44, f43, 0f3FC90FDB; +cos.approx.f32 f25, f44; +sin.approx.f32 f45, f44; +neg.f32 f26, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; 
+mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +neg.f16x2 r231, r228; +} +{ +fma.rn.f16x2 r233, r215, r224, r231; +} +{ +mul.f16x2 r237, r215, r226; +} +{ +fma.rn.f16x2 r240, r218, r224, r237; +} +barrier.sync 0; +and.b32 r311, r279, 128; +add.s32 r312, r310, r311; +st.shared.u32 [r312], r209; +st.shared.u32 [r312+64], r233; +barrier.sync 0; +and.b32 r313, r284, 64; +sub.s32 r314, r312, r313; +ld.shared.u32 r262, [r314]; +ld.shared.u32 r263, [r314+128]; +barrier.sync 0; +st.shared.u32 [r312], r212; +st.shared.u32 [r312+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r314]; +ld.shared.u32 r266, [r314+128]; +{ +add.f16x2 %0, r262, r263; +} +{ +add.f16x2 %1, r265, r266; +} +{ +sub.f16x2 %2, r262, r263; +} +{ +sub.f16x2 %3, r265, r266; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<796, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3131>; +.reg .b64 rd<3>; +mov.u32 r3055, %tid.y; +shl.b32 r3056, r3055, 8; +mov.u32 r3057, %64; +add.s32 r3058, r3057, r3056; +mov.u32 r3059, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ 
+sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f246, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r101, {low, high}; +} +mov.f32 f280, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} 
+{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %79, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %79, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} 
+{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f238, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +mov.f32 f254, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r397, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} 
+{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, 
r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} 
+{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %78, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %78, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; 
+cvt.rn.f16.f32 high, f286; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 
r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f234, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f234; +cvt.rn.f16.f32 high, f234; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f238; +cvt.rn.f16.f32 high, f238; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f242, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 
r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +mov.f32 f250, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f250; +cvt.rn.f16.f32 high, f250; +mov.b32 r1241, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1244, {low, high}; +} +mov.f32 f258, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f258; +cvt.rn.f16.f32 high, f258; +mov.b32 r1245, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, 
f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ 
+sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ 
+sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3060, r3059, 1; +shl.b32 r3061, r3059, 7; +and.b32 r3062, r3061, -256; +add.s32 r3063, r3058, r3062; +cvt.rn.f32.u32 f423, r3060; +mul.f32 f424, f423, 0f3DC90FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 
0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; 
+} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 
r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} 
+{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ 
+neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, 
{low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r3064, r3061, 128; +add.s32 r3065, r3063, r3064; +st.shared.v4.f32 [r3065], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r3065+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r3065+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r3065+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r3065+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r3065+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 [r3065+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r3065+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r3066, r3060, -124, r3065; +ld.shared.u32 r2864, [r3066]; +ld.shared.u32 r2876, [r3066+8]; +ld.shared.u32 r2888, [r3066+16]; +ld.shared.u32 r2900, [r3066+24]; +ld.shared.u32 r2912, [r3066+32]; +ld.shared.u32 r2924, [r3066+40]; +ld.shared.u32 r2936, [r3066+48]; +ld.shared.u32 r2948, [r3066+56]; +ld.shared.u32 r2960, [r3066+64]; +ld.shared.u32 r2972, [r3066+72]; +ld.shared.u32 r2984, [r3066+80]; +ld.shared.u32 r2996, [r3066+88]; +ld.shared.u32 r3008, [r3066+96]; +ld.shared.u32 r3020, [r3066+104]; +ld.shared.u32 r3032, [r3066+112]; +ld.shared.u32 r3044, [r3066+120]; +ld.shared.u32 r2865, [r3066+128]; +ld.shared.u32 r2877, [r3066+136]; +ld.shared.u32 r2889, [r3066+144]; +ld.shared.u32 r2901, [r3066+152]; +ld.shared.u32 r2913, [r3066+160]; +ld.shared.u32 r2925, [r3066+168]; +ld.shared.u32 r2937, [r3066+176]; 
+ld.shared.u32 r2949, [r3066+184]; +ld.shared.u32 r2961, [r3066+192]; +ld.shared.u32 r2973, [r3066+200]; +ld.shared.u32 r2985, [r3066+208]; +ld.shared.u32 r2997, [r3066+216]; +ld.shared.u32 r3009, [r3066+224]; +ld.shared.u32 r3021, [r3066+232]; +ld.shared.u32 r3033, [r3066+240]; +ld.shared.u32 r3045, [r3066+248]; +barrier.sync 0; +st.shared.v4.f32 [r3065], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3065+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3065+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3065+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3065+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3065+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3065+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3065+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3066]; +ld.shared.u32 r2879, [r3066+8]; +ld.shared.u32 r2891, [r3066+16]; +ld.shared.u32 r2903, [r3066+24]; +ld.shared.u32 r2915, [r3066+32]; +ld.shared.u32 r2927, [r3066+40]; +ld.shared.u32 r2939, [r3066+48]; +ld.shared.u32 r2951, [r3066+56]; +ld.shared.u32 r2963, [r3066+64]; +ld.shared.u32 r2975, [r3066+72]; +ld.shared.u32 r2987, [r3066+80]; +ld.shared.u32 r2999, [r3066+88]; +ld.shared.u32 r3011, [r3066+96]; +ld.shared.u32 r3023, [r3066+104]; +ld.shared.u32 r3035, [r3066+112]; +ld.shared.u32 r3047, [r3066+120]; +ld.shared.u32 r2868, [r3066+128]; +ld.shared.u32 r2880, [r3066+136]; +ld.shared.u32 r2892, [r3066+144]; +ld.shared.u32 r2904, [r3066+152]; +ld.shared.u32 r2916, [r3066+160]; +ld.shared.u32 r2928, [r3066+168]; +ld.shared.u32 r2940, [r3066+176]; +ld.shared.u32 r2952, [r3066+184]; +ld.shared.u32 r2964, [r3066+192]; +ld.shared.u32 r2976, [r3066+200]; +ld.shared.u32 r2988, [r3066+208]; +ld.shared.u32 r3000, [r3066+216]; +ld.shared.u32 r3012, [r3066+224]; +ld.shared.u32 r3024, [r3066+232]; +ld.shared.u32 r3036, [r3066+240]; +ld.shared.u32 r3048, [r3066+248]; +{ +add.f16x2 %0, r2864, r2865; +} +{ +add.f16x2 %1, r2867, r2868; +} 
+{ +sub.f16x2 %32, r2864, r2865; +} +{ +sub.f16x2 %33, r2867, r2868; +} +{ +add.f16x2 %2, r2876, r2877; +} +{ +add.f16x2 %3, r2879, r2880; +} +{ +sub.f16x2 %34, r2876, r2877; +} +{ +sub.f16x2 %35, r2879, r2880; +} +{ +add.f16x2 %4, r2888, r2889; +} +{ +add.f16x2 %5, r2891, r2892; +} +{ +sub.f16x2 %36, r2888, r2889; +} +{ +sub.f16x2 %37, r2891, r2892; +} +{ +add.f16x2 %6, r2900, r2901; +} +{ +add.f16x2 %7, r2903, r2904; +} +{ +sub.f16x2 %38, r2900, r2901; +} +{ +sub.f16x2 %39, r2903, r2904; +} +{ +add.f16x2 %8, r2912, r2913; +} +{ +add.f16x2 %9, r2915, r2916; +} +{ +sub.f16x2 %40, r2912, r2913; +} +{ +sub.f16x2 %41, r2915, r2916; +} +{ +add.f16x2 %10, r2924, r2925; +} +{ +add.f16x2 %11, r2927, r2928; +} +{ +sub.f16x2 %42, r2924, r2925; +} +{ +sub.f16x2 %43, r2927, r2928; +} +{ +add.f16x2 %12, r2936, r2937; +} +{ +add.f16x2 %13, r2939, r2940; +} +{ +sub.f16x2 %44, r2936, r2937; +} +{ +sub.f16x2 %45, r2939, r2940; +} +{ +add.f16x2 %14, r2948, r2949; +} +{ +add.f16x2 %15, r2951, r2952; +} +{ +sub.f16x2 %46, r2948, r2949; +} +{ +sub.f16x2 %47, r2951, r2952; +} +{ +add.f16x2 %16, r2960, r2961; +} +{ +add.f16x2 %17, r2963, r2964; +} +{ +sub.f16x2 %48, r2960, r2961; +} +{ +sub.f16x2 %49, r2963, r2964; +} +{ +add.f16x2 %18, r2972, r2973; +} +{ +add.f16x2 %19, r2975, r2976; +} +{ +sub.f16x2 %50, r2972, r2973; +} +{ +sub.f16x2 %51, r2975, r2976; +} +{ +add.f16x2 %20, r2984, r2985; +} +{ +add.f16x2 %21, r2987, r2988; +} +{ +sub.f16x2 %52, r2984, r2985; +} +{ +sub.f16x2 %53, r2987, r2988; +} +{ +add.f16x2 %22, r2996, r2997; +} +{ +add.f16x2 %23, r2999, r3000; +} +{ +sub.f16x2 %54, r2996, r2997; +} +{ +sub.f16x2 %55, r2999, r3000; +} +{ +add.f16x2 %24, r3008, r3009; +} +{ +add.f16x2 %25, r3011, r3012; +} +{ +sub.f16x2 %56, r3008, r3009; +} +{ +sub.f16x2 %57, r3011, r3012; +} +{ +add.f16x2 %26, r3020, r3021; +} +{ +add.f16x2 %27, r3023, r3024; +} +{ +sub.f16x2 %58, r3020, r3021; +} +{ +sub.f16x2 %59, r3023, r3024; +} +{ +add.f16x2 %28, r3032, r3033; +} +{ +add.f16x2 %29, r3035, 
r3036; +} +{ +sub.f16x2 %60, r3032, r3033; +} +{ +sub.f16x2 %61, r3035, r3036; +} +{ +add.f16x2 %30, r3044, r3045; +} +{ +add.f16x2 %31, r3047, r3048; +} +{ +sub.f16x2 %62, r3044, r3045; +} +{ +sub.f16x2 %63, r3047, r3048; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), 
"=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), 
"r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ffa6a6c4de1dc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp16_inv.hpp.inc @@ -0,0 +1,14784 @@ +#ifndef CUFFTDX_FFT_64_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_64_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<989, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<667>; +.reg .b64 rd<2>; +mov.u32 r655, %tid.y; +shl.b32 r656, r655, 9; +mov.u32 r657, %16; +add.s32 r658, r657, r656; +mov.u32 r659, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} 
+{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r660, r659, 7; +shl.b32 r661, r659, 6; +and.b32 r662, r661, -512; +add.s32 r663, r658, r662; +cvt.rn.f32.u32 f75, r660; +mul.f32 f76, f75, 
0f3DC90FDB; +cos.approx.f32 f29, f76; +sin.approx.f32 f77, f76; +neg.f32 f30, f77; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 
r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r664, r661, 448; +add.s32 r665, r663, r664; +st.shared.v4.f32 [r665], {r149, r152, r207, r216}; +st.shared.v4.f32 [r665+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r665+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r665+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r666, r660, -56, r665; +ld.shared.u32 r460, [r666]; +ld.shared.u32 r463, [r666+4]; +ld.shared.u32 r510, [r666+64]; +ld.shared.u32 r513, [r666+68]; 
+ld.shared.u32 r472, [r666+128]; +ld.shared.u32 r475, [r666+132]; +ld.shared.u32 r522, [r666+192]; +ld.shared.u32 r525, [r666+196]; +ld.shared.u32 r461, [r666+256]; +ld.shared.u32 r464, [r666+260]; +ld.shared.u32 r511, [r666+320]; +ld.shared.u32 r514, [r666+324]; +ld.shared.u32 r473, [r666+384]; +ld.shared.u32 r476, [r666+388]; +ld.shared.u32 r523, [r666+448]; +ld.shared.u32 r526, [r666+452]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, 
high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 %0, r485, r535; +} +{ +add.f16x2 %1, r488, r538; +} +{ +sub.f16x2 %8, r485, r535; +} +{ +sub.f16x2 %9, r488, r538; +} +{ +add.f16x2 %2, r497, r579; +} +{ +add.f16x2 %3, r500, r585; +} +{ +sub.f16x2 %10, r497, r579; +} +{ +sub.f16x2 %11, r500, r585; +} +{ +add.f16x2 %4, r491, r589; +} +{ +add.f16x2 %5, r494, r541; +} +{ +sub.f16x2 %12, r491, r589; +} +{ +sub.f16x2 %13, r494, r541; +} +{ +add.f16x2 %6, r503, r597; +} +{ +add.f16x2 %7, r506, r603; +} +{ +sub.f16x2 %14, r503, r597; +} +{ +sub.f16x2 %15, r506, r603; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<990, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<667>; +.reg .b64 rd<2>; +mov.u32 r655, %tid.y; +shl.b32 r656, r655, 8; +mov.u32 r657, %16; +add.s32 r658, r657, r656; +mov.u32 r659, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f58, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r102, {low, high}; +} +mov.f32 f44, 0f3F800000; +mov.f32 f56, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r106, {low, high}; +} +mov.f32 f43, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; 
+} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r660, r659, 7; +shl.b32 r661, r659, 5; +and.b32 r662, r661, -256; +add.s32 r663, r658, r662; +cvt.rn.f32.u32 f75, r660; +mul.f32 f76, f75, 0f3DC90FDB; +cos.approx.f32 f29, f76; +sin.approx.f32 f77, f76; +neg.f32 f30, f77; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ 
+fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 
r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f43; +cvt.rn.f16.f32 high, f44; +mov.b32 r409, {low, high}; +} +{ 
+mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r664, r661, 224; +add.s32 r665, r663, r664; +st.shared.v4.f32 [r665], {r149, r207, r244, r281}; +st.shared.v4.f32 [r665+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 r666, r660, -28, r665; +ld.shared.u32 r460, [r666]; +ld.shared.u32 r510, [r666+32]; +ld.shared.u32 r472, [r666+64]; +ld.shared.u32 r522, [r666+96]; +ld.shared.u32 r461, [r666+128]; +ld.shared.u32 r511, [r666+160]; +ld.shared.u32 r473, [r666+192]; +ld.shared.u32 r523, [r666+224]; +barrier.sync 0; +st.shared.v4.f32 [r665], {r152, r216, r253, r290}; +st.shared.v4.f32 [r665+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r666]; +ld.shared.u32 r513, [r666+32]; +ld.shared.u32 r475, [r666+64]; +ld.shared.u32 r525, [r666+96]; +ld.shared.u32 r464, [r666+128]; +ld.shared.u32 r514, [r666+160]; +ld.shared.u32 r476, [r666+192]; +ld.shared.u32 r526, [r666+224]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ 
+sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f56; +cvt.rn.f16.f32 high, f56; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f58; +cvt.rn.f16.f32 high, f58; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 %0, r485, r535; +} +{ +add.f16x2 %1, r488, r538; +} +{ +sub.f16x2 %8, r485, r535; +} +{ +sub.f16x2 %9, r488, r538; +} +{ +add.f16x2 %2, r497, r579; +} +{ +add.f16x2 %3, r500, r585; +} +{ +sub.f16x2 %10, r497, r579; +} +{ +sub.f16x2 %11, r500, r585; +} +{ +add.f16x2 %4, r491, r589; +} +{ +add.f16x2 %5, r494, r541; +} +{ +sub.f16x2 %12, r491, r589; +} +{ +sub.f16x2 %13, r494, r541; +} +{ +add.f16x2 %6, r503, r597; +} +{ +add.f16x2 %7, r506, r603; +} +{ +sub.f16x2 %14, r503, r597; +} +{ +sub.f16x2 %15, r506, r603; +} +})" + : 
"=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<991, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<399>; +.reg .b64 rd<2>; +mov.u32 r379, %tid.y; +shl.b32 r380, r379, 9; +mov.u32 r381, %8; +add.s32 r382, r381, r380; +mov.u32 r383, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r384, r383, 15; +shl.b32 r385, r383, 5; +and.b32 r386, r385, -512; +add.s32 r387, r382, r386; 
+cvt.rn.f32.u32 f21, r384; +mul.f32 f22, f21, 0f3DC90FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r388, r385, 480; +add.s32 r389, r387, r388; +st.shared.v4.f32 [r389], {r27, r30, r61, r70}; +st.shared.v4.f32 [r389+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r390, r384, -24, r389; +ld.shared.u32 r166, [r390]; +ld.shared.u32 r169, [r390+4]; +ld.shared.u32 r178, [r390+128]; +ld.shared.u32 r181, [r390+132]; +ld.shared.u32 r167, [r390+256]; +ld.shared.u32 r170, [r390+260]; +ld.shared.u32 r179, [r390+384]; +ld.shared.u32 r182, [r390+388]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r391, r383, 12; +bfe.u32 r392, r383, 2, 2; +cvt.rn.f32.u32 f24, r392; +mul.f32 f25, f24, 0f3EC90FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ +fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +shl.b32 r393, r383, 3; +and.b32 r394, r393, 24; +add.s32 r395, r387, r394; +barrier.sync 0; +and.b32 r396, r385, 384; +add.s32 r397, r395, r396; +st.shared.u32 [r397], r191; +st.shared.u32 [r397+4], r194; +st.shared.u32 [r397+32], r225; +st.shared.u32 [r397+36], r234; +st.shared.u32 [r397+64], r262; +st.shared.u32 [r397+68], r271; +st.shared.u32 [r397+96], r299; 
+st.shared.u32 [r397+100], r308; +barrier.sync 0; +mad.lo.s32 r398, r391, -24, r397; +ld.shared.u32 r330, [r398]; +ld.shared.u32 r333, [r398+4]; +ld.shared.u32 r342, [r398+128]; +ld.shared.u32 r345, [r398+132]; +ld.shared.u32 r331, [r398+256]; +ld.shared.u32 r334, [r398+260]; +ld.shared.u32 r343, [r398+384]; +ld.shared.u32 r346, [r398+388]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 %0, r329, r341; +} +{ +add.f16x2 %1, r332, r344; +} +{ +sub.f16x2 %4, r329, r341; +} +{ +sub.f16x2 %5, r332, r344; +} +{ +add.f16x2 %2, r335, r353; +} +{ +add.f16x2 %3, r338, r347; +} +{ +sub.f16x2 %6, r335, r353; +} +{ +sub.f16x2 %7, r338, r347; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<992, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<27>; +.reg .b32 r<399>; +.reg .b64 rd<2>; +mov.u32 r379, %tid.y; +shl.b32 r380, r379, 8; +mov.u32 r381, %8; +add.s32 r382, r381, r380; +mov.u32 r383, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, 
%15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r384, r383, 15; +shl.b32 r385, r383, 4; +and.b32 r386, r385, -256; +add.s32 r387, r382, r386; +cvt.rn.f32.u32 f21, r384; +mul.f32 f22, f21, 0f3DC90FDB; +cos.approx.f32 f1, f22; +sin.approx.f32 f23, f22; +neg.f32 f2, f23; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f17, 0fBF800000; +mov.f32 f18, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r388, r385, 240; +add.s32 r389, r387, r388; +st.shared.v4.f32 [r389], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r390, r384, -12, r389; +ld.shared.u32 r166, [r390]; +ld.shared.u32 r178, [r390+64]; +ld.shared.u32 r167, [r390+128]; +ld.shared.u32 r179, [r390+192]; +barrier.sync 0; +st.shared.v4.f32 [r389], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r390]; +ld.shared.u32 r181, [r390+64]; +ld.shared.u32 r170, [r390+128]; +ld.shared.u32 r182, [r390+192]; +{ +add.f16x2 r165, r166, r167; +} +{ +add.f16x2 r168, r169, r170; +} +{ +sub.f16x2 r171, r166, r167; +} +{ +sub.f16x2 r174, r169, r170; +} +{ +add.f16x2 r177, r178, r179; +} +{ +add.f16x2 r180, r181, r182; +} +{ +sub.f16x2 r183, r178, r179; +} +{ +sub.f16x2 r186, r181, r182; +} +{ +neg.f16x2 r189, r186; +} +{ +add.f16x2 r191, r165, r177; +} +{ +add.f16x2 r194, r168, r180; +} +{ +sub.f16x2 r197, r165, r177; +} +{ +sub.f16x2 r200, r168, r180; +} +{ +add.f16x2 r203, r171, r189; +} +{ +add.f16x2 r206, r174, r183; +} +{ +sub.f16x2 r209, r171, r189; +} +{ +sub.f16x2 r212, r174, r183; +} +and.b32 r391, r383, 12; +bfe.u32 r392, r383, 2, 2; +shl.b32 r393, r383, 2; +and.b32 r394, r393, 12; +add.s32 r395, r387, r394; +cvt.rn.f32.u32 f24, r392; +mul.f32 f25, f24, 0f3EC90FDB; +cos.approx.f32 f11, f25; +sin.approx.f32 f26, 
f25; +neg.f32 f12, f26; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f11; +cvt.rn.f16.f32 high, f12; +mov.b32 r215, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r218, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r220, {high, high}; +} +{ +mul.f16x2 r222, r206, r220; +} +{ +fma.rn.f16x2 r225, r203, r218, r222; +} +{ +mul.f16x2 r229, r203, r220; +} +{ +neg.f16x2 r232, r229; +} +{ +fma.rn.f16x2 r234, r206, r218, r232; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r240, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r242, {low, high}; +} +{ +mul.f16x2 r243, r240, r242; +} +{ +mul.f16x2 r246, r215, r238; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r249, {high, low}; +} +{ +fma.rn.f16x2 r251, r243, r249, r246; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r255, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r257, {high, high}; +} +{ +mul.f16x2 r259, r200, r257; +} +{ +fma.rn.f16x2 r262, r197, r255, r259; +} +{ +mul.f16x2 r266, r197, r257; +} +{ +neg.f16x2 r269, r266; +} +{ +fma.rn.f16x2 r271, r200, r255, r269; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r215; +mov.b32 r277, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r279, {low, high}; +} +{ +mul.f16x2 r280, r277, r279; +} +{ +mul.f16x2 r283, r251, r275; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r251; +mov.b32 r286, {high, low}; +} +{ +fma.rn.f16x2 r288, r280, r286, r283; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r292, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r288; +mov.b32 r294, {high, high}; +} +{ +mul.f16x2 r296, r212, r294; +} +{ 
+fma.rn.f16x2 r299, r209, r292, r296; +} +{ +mul.f16x2 r303, r209, r294; +} +{ +neg.f16x2 r306, r303; +} +{ +fma.rn.f16x2 r308, r212, r292, r306; +} +barrier.sync 0; +and.b32 r396, r385, 192; +add.s32 r397, r395, r396; +st.shared.u32 [r397], r191; +st.shared.u32 [r397+16], r225; +st.shared.u32 [r397+32], r262; +st.shared.u32 [r397+48], r299; +barrier.sync 0; +mad.lo.s32 r398, r391, -12, r397; +ld.shared.u32 r330, [r398]; +ld.shared.u32 r342, [r398+64]; +ld.shared.u32 r331, [r398+128]; +ld.shared.u32 r343, [r398+192]; +barrier.sync 0; +st.shared.u32 [r397], r194; +st.shared.u32 [r397+16], r234; +st.shared.u32 [r397+32], r271; +st.shared.u32 [r397+48], r308; +barrier.sync 0; +ld.shared.u32 r333, [r398]; +ld.shared.u32 r345, [r398+64]; +ld.shared.u32 r334, [r398+128]; +ld.shared.u32 r346, [r398+192]; +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r334; +} +{ +sub.f16x2 r335, r330, r331; +} +{ +sub.f16x2 r338, r333, r334; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r346; +} +{ +sub.f16x2 r347, r342, r343; +} +{ +sub.f16x2 r350, r345, r346; +} +{ +neg.f16x2 r353, r350; +} +{ +add.f16x2 %0, r329, r341; +} +{ +add.f16x2 %1, r332, r344; +} +{ +sub.f16x2 %4, r329, r341; +} +{ +sub.f16x2 %5, r332, r344; +} +{ +add.f16x2 %2, r335, r353; +} +{ +add.f16x2 %3, r338, r347; +} +{ +sub.f16x2 %6, r335, r353; +} +{ +sub.f16x2 %7, r338, r347; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<993, 
__half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg .b32 r<1387>; +.reg .b64 rd<2>; +mov.u32 r1375, %tid.y; +shl.b32 r1376, r1375, 9; +mov.u32 r1377, %32; +add.s32 r1378, r1377, r1376; +mov.u32 r1379, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f80, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f78, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ 
+mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ 
+sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1380, r1379, 3; +shl.b32 r1381, r1379, 7; +and.b32 r1382, r1381, -512; +add.s32 r1383, r1378, r1382; +cvt.rn.f32.u32 f151, r1380; +mul.f32 f152, f151, 0f3DC90FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 
high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; 
+mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 
r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1384, r1381, 384; +add.s32 r1385, r1383, r1384; +st.shared.v4.f32 [r1385], {r521, r524, r627, r636}; +st.shared.v4.f32 [r1385+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r1385+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r1385+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r1385+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r1385+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r1385+96], {r1034, 
r1043, r1071, r1080}; +st.shared.v4.f32 [r1385+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r1386, r1380, -120, r1385; +ld.shared.u32 r1176, [r1386]; +ld.shared.u32 r1179, [r1386+4]; +ld.shared.u32 r1226, [r1386+32]; +ld.shared.u32 r1229, [r1386+36]; +ld.shared.u32 r1276, [r1386+64]; +ld.shared.u32 r1279, [r1386+68]; +ld.shared.u32 r1326, [r1386+96]; +ld.shared.u32 r1329, [r1386+100]; +ld.shared.u32 r1188, [r1386+128]; +ld.shared.u32 r1191, [r1386+132]; +ld.shared.u32 r1238, [r1386+160]; +ld.shared.u32 r1241, [r1386+164]; +ld.shared.u32 r1288, [r1386+192]; +ld.shared.u32 r1291, [r1386+196]; +ld.shared.u32 r1338, [r1386+224]; +ld.shared.u32 r1341, [r1386+228]; +ld.shared.u32 r1177, [r1386+256]; +ld.shared.u32 r1180, [r1386+260]; +ld.shared.u32 r1227, [r1386+288]; +ld.shared.u32 r1230, [r1386+292]; +ld.shared.u32 r1277, [r1386+320]; +ld.shared.u32 r1280, [r1386+324]; +ld.shared.u32 r1327, [r1386+352]; +ld.shared.u32 r1330, [r1386+356]; +ld.shared.u32 r1189, [r1386+384]; +ld.shared.u32 r1192, [r1386+388]; +ld.shared.u32 r1239, [r1386+416]; +ld.shared.u32 r1242, [r1386+420]; +ld.shared.u32 r1289, [r1386+448]; +ld.shared.u32 r1292, [r1386+452]; +ld.shared.u32 r1339, [r1386+480]; +ld.shared.u32 r1342, [r1386+484]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 %0, r1175, r1187; +} +{ +add.f16x2 %1, r1178, r1190; +} +{ +sub.f16x2 %16, r1175, r1187; +} +{ +sub.f16x2 %17, r1178, r1190; +} +{ +add.f16x2 %8, r1181, r1199; +} +{ +add.f16x2 %9, r1184, r1193; +} +{ +sub.f16x2 %24, r1181, r1199; +} +{ +sub.f16x2 %25, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 
r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 %2, r1225, r1237; +} +{ +add.f16x2 %3, r1228, r1240; +} +{ +sub.f16x2 %18, r1225, r1237; +} +{ +sub.f16x2 %19, r1228, r1240; +} +{ +add.f16x2 %10, r1231, r1249; +} +{ +add.f16x2 %11, r1234, r1243; +} +{ +sub.f16x2 %26, r1231, r1249; +} +{ +sub.f16x2 %27, r1234, r1243; +} +{ +add.f16x2 r1275, r1276, r1277; +} +{ +add.f16x2 r1278, r1279, r1280; +} +{ +sub.f16x2 r1281, r1276, r1277; +} +{ +sub.f16x2 r1284, r1279, r1280; +} +{ +add.f16x2 r1287, r1288, r1289; +} +{ +add.f16x2 r1290, r1291, r1292; +} +{ +sub.f16x2 r1293, r1288, r1289; +} +{ +sub.f16x2 r1296, r1291, r1292; +} +{ +neg.f16x2 r1299, r1296; +} +{ +add.f16x2 %4, r1275, r1287; +} +{ +add.f16x2 %5, r1278, r1290; +} +{ +sub.f16x2 %20, r1275, r1287; +} +{ +sub.f16x2 %21, r1278, r1290; +} +{ +add.f16x2 %12, r1281, r1299; +} +{ +add.f16x2 %13, r1284, r1293; +} +{ +sub.f16x2 %28, r1281, r1299; +} +{ +sub.f16x2 %29, r1284, r1293; +} +{ +add.f16x2 r1325, r1326, r1327; +} +{ +add.f16x2 r1328, r1329, r1330; +} +{ +sub.f16x2 r1331, r1326, r1327; +} +{ +sub.f16x2 r1334, r1329, r1330; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1342; +} +{ +sub.f16x2 r1343, r1338, r1339; +} +{ +sub.f16x2 r1346, r1341, r1342; +} +{ +neg.f16x2 r1349, r1346; +} +{ +add.f16x2 %6, r1325, r1337; +} +{ +add.f16x2 %7, r1328, r1340; +} +{ +sub.f16x2 %22, r1325, r1337; +} +{ +sub.f16x2 %23, r1328, r1340; +} +{ +add.f16x2 %14, r1331, r1349; +} +{ +add.f16x2 %15, r1334, r1343; +} +{ +sub.f16x2 %30, r1331, r1349; +} +{ +sub.f16x2 %31, r1334, r1343; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), 
"=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<994, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<154>; +.reg 
.b32 r<1387>; +.reg .b64 rd<2>; +mov.u32 r1375, %tid.y; +shl.b32 r1376, r1375, 8; +mov.u32 r1377, %32; +add.s32 r1378, r1377, r1376; +mov.u32 r1379, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f80, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r102, {low, high}; +} +mov.f32 f148, 0f3F800000; +mov.f32 f78, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r106, {low, high}; +} +mov.f32 f147, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, 
r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f76, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r393, {low, high}; +} +mov.f32 f84, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r398, {low, high}; +} +mov.f32 f74, 0fBEC3EF15; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f74; +cvt.rn.f16.f32 high, f74; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f76; +cvt.rn.f16.f32 high, f76; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f78; +cvt.rn.f16.f32 high, f78; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f80; +cvt.rn.f16.f32 high, f80; +mov.b32 r404, {low, high}; +} +mov.f32 f82, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f82; +cvt.rn.f16.f32 high, f82; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f84; +cvt.rn.f16.f32 high, f84; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} 
+{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r1380, r1379, 3; +shl.b32 r1381, r1379, 6; +and.b32 r1382, r1381, -256; +add.s32 r1383, r1378, r1382; +cvt.rn.f32.u32 f151, r1380; +mul.f32 f152, f151, 0f3DC90FDB; +cos.approx.f32 f117, f152; +sin.approx.f32 f153, f152; +neg.f32 f118, f153; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, 
r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, 
r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f147; +cvt.rn.f16.f32 high, f148; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r1384, r1381, 192; +add.s32 r1385, r1383, r1384; +st.shared.v4.f32 [r1385], {r521, r627, r664, r701}; +st.shared.v4.f32 [r1385+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r1385+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r1385+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r1386, r1380, -60, r1385; +ld.shared.u32 r1176, [r1386]; +ld.shared.u32 r1226, [r1386+16]; +ld.shared.u32 r1276, [r1386+32]; +ld.shared.u32 r1326, [r1386+48]; +ld.shared.u32 r1188, [r1386+64]; +ld.shared.u32 r1238, [r1386+80]; +ld.shared.u32 r1288, 
[r1386+96]; +ld.shared.u32 r1338, [r1386+112]; +ld.shared.u32 r1177, [r1386+128]; +ld.shared.u32 r1227, [r1386+144]; +ld.shared.u32 r1277, [r1386+160]; +ld.shared.u32 r1327, [r1386+176]; +ld.shared.u32 r1189, [r1386+192]; +ld.shared.u32 r1239, [r1386+208]; +ld.shared.u32 r1289, [r1386+224]; +ld.shared.u32 r1339, [r1386+240]; +barrier.sync 0; +st.shared.v4.f32 [r1385], {r524, r636, r673, r710}; +st.shared.v4.f32 [r1385+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r1385+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r1385+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r1386]; +ld.shared.u32 r1229, [r1386+16]; +ld.shared.u32 r1279, [r1386+32]; +ld.shared.u32 r1329, [r1386+48]; +ld.shared.u32 r1191, [r1386+64]; +ld.shared.u32 r1241, [r1386+80]; +ld.shared.u32 r1291, [r1386+96]; +ld.shared.u32 r1341, [r1386+112]; +ld.shared.u32 r1180, [r1386+128]; +ld.shared.u32 r1230, [r1386+144]; +ld.shared.u32 r1280, [r1386+160]; +ld.shared.u32 r1330, [r1386+176]; +ld.shared.u32 r1192, [r1386+192]; +ld.shared.u32 r1242, [r1386+208]; +ld.shared.u32 r1292, [r1386+224]; +ld.shared.u32 r1342, [r1386+240]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 %0, r1175, r1187; +} +{ +add.f16x2 %1, r1178, r1190; +} +{ +sub.f16x2 %16, r1175, r1187; +} +{ +sub.f16x2 %17, r1178, r1190; +} +{ +add.f16x2 %8, r1181, r1199; +} +{ +add.f16x2 %9, r1184, r1193; +} +{ +sub.f16x2 %24, r1181, r1199; +} +{ +sub.f16x2 %25, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 
r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 %2, r1225, r1237; +} +{ +add.f16x2 %3, r1228, r1240; +} +{ +sub.f16x2 %18, r1225, r1237; +} +{ +sub.f16x2 %19, r1228, r1240; +} +{ +add.f16x2 %10, r1231, r1249; +} +{ +add.f16x2 %11, r1234, r1243; +} +{ +sub.f16x2 %26, r1231, r1249; +} +{ +sub.f16x2 %27, r1234, r1243; +} +{ +add.f16x2 r1275, r1276, r1277; +} +{ +add.f16x2 r1278, r1279, r1280; +} +{ +sub.f16x2 r1281, r1276, r1277; +} +{ +sub.f16x2 r1284, r1279, r1280; +} +{ +add.f16x2 r1287, r1288, r1289; +} +{ +add.f16x2 r1290, r1291, r1292; +} +{ +sub.f16x2 r1293, r1288, r1289; +} +{ +sub.f16x2 r1296, r1291, r1292; +} +{ +neg.f16x2 r1299, r1296; +} +{ +add.f16x2 %4, r1275, r1287; +} +{ +add.f16x2 %5, r1278, r1290; +} +{ +sub.f16x2 %20, r1275, r1287; +} +{ +sub.f16x2 %21, r1278, r1290; +} +{ +add.f16x2 %12, r1281, r1299; +} +{ +add.f16x2 %13, r1284, r1293; +} +{ +sub.f16x2 %28, r1281, r1299; +} +{ +sub.f16x2 %29, r1284, r1293; +} +{ +add.f16x2 r1325, r1326, r1327; +} +{ +add.f16x2 r1328, r1329, r1330; +} +{ +sub.f16x2 r1331, r1326, r1327; +} +{ +sub.f16x2 r1334, r1329, r1330; +} +{ +add.f16x2 r1337, r1338, r1339; +} +{ +add.f16x2 r1340, r1341, r1342; +} +{ +sub.f16x2 r1343, r1338, r1339; +} +{ +sub.f16x2 r1346, r1341, r1342; +} +{ +neg.f16x2 r1349, r1346; +} +{ +add.f16x2 %6, r1325, r1337; +} +{ +add.f16x2 %7, r1328, r1340; +} +{ +sub.f16x2 %22, r1325, r1337; +} +{ +sub.f16x2 %23, r1328, r1340; +} +{ +add.f16x2 %14, r1331, r1349; +} +{ +add.f16x2 %15, r1334, r1343; +} +{ +sub.f16x2 %30, r1331, r1349; +} +{ +sub.f16x2 %31, r1334, r1343; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<995, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3131>; +.reg .b64 rd<3>; +mov.u32 r3055, %tid.y; +shl.b32 r3056, r3055, 9; +mov.u32 r3057, 
%64; +add.s32 r3058, r3057, r3056; +mov.u32 r3059, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f280, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +mov.f32 f278, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ 
+fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %84, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %84, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ 
+add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 
r863, %92, %83; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %83; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, 
r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, 
r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, 
r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, 
r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3060, r3059, 1; +shl.b32 r3061, r3059, 8; +and.b32 r3062, r3061, -512; +add.s32 r3063, r3058, r3062; +cvt.rn.f32.u32 f423, r3060; +mul.f32 f424, f423, 0f3DC90FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ 
+mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 
r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; 
+cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, 
r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, 
r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, 
high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, 
r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r3064, r3061, 256; +add.s32 r3065, r3063, r3064; +st.shared.v4.f32 [r3065], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r3065+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r3065+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r3065+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r3065+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r3065+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r3065+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r3065+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r3065+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r3065+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r3065+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r3065+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 
[r3065+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r3065+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r3065+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r3065+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r3066, r3060, -248, r3065; +ld.shared.u32 r2864, [r3066]; +ld.shared.u32 r2867, [r3066+4]; +ld.shared.u32 r2876, [r3066+16]; +ld.shared.u32 r2879, [r3066+20]; +ld.shared.u32 r2888, [r3066+32]; +ld.shared.u32 r2891, [r3066+36]; +ld.shared.u32 r2900, [r3066+48]; +ld.shared.u32 r2903, [r3066+52]; +ld.shared.u32 r2912, [r3066+64]; +ld.shared.u32 r2915, [r3066+68]; +ld.shared.u32 r2924, [r3066+80]; +ld.shared.u32 r2927, [r3066+84]; +ld.shared.u32 r2936, [r3066+96]; +ld.shared.u32 r2939, [r3066+100]; +ld.shared.u32 r2948, [r3066+112]; +ld.shared.u32 r2951, [r3066+116]; +ld.shared.u32 r2960, [r3066+128]; +ld.shared.u32 r2963, [r3066+132]; +ld.shared.u32 r2972, [r3066+144]; +ld.shared.u32 r2975, [r3066+148]; +ld.shared.u32 r2984, [r3066+160]; +ld.shared.u32 r2987, [r3066+164]; +ld.shared.u32 r2996, [r3066+176]; +ld.shared.u32 r2999, [r3066+180]; +ld.shared.u32 r3008, [r3066+192]; +ld.shared.u32 r3011, [r3066+196]; +ld.shared.u32 r3020, [r3066+208]; +ld.shared.u32 r3023, [r3066+212]; +ld.shared.u32 r3032, [r3066+224]; +ld.shared.u32 r3035, [r3066+228]; +ld.shared.u32 r3044, [r3066+240]; +ld.shared.u32 r3047, [r3066+244]; +ld.shared.u32 r2865, [r3066+256]; +ld.shared.u32 r2868, [r3066+260]; +ld.shared.u32 r2877, [r3066+272]; +ld.shared.u32 r2880, [r3066+276]; +ld.shared.u32 r2889, [r3066+288]; +ld.shared.u32 r2892, [r3066+292]; +ld.shared.u32 r2901, [r3066+304]; +ld.shared.u32 r2904, [r3066+308]; +ld.shared.u32 r2913, [r3066+320]; +ld.shared.u32 r2916, [r3066+324]; +ld.shared.u32 r2925, [r3066+336]; +ld.shared.u32 r2928, [r3066+340]; +ld.shared.u32 r2937, [r3066+352]; +ld.shared.u32 r2940, [r3066+356]; +ld.shared.u32 r2949, [r3066+368]; +ld.shared.u32 r2952, [r3066+372]; +ld.shared.u32 r2961, [r3066+384]; +ld.shared.u32 
r2964, [r3066+388]; +ld.shared.u32 r2973, [r3066+400]; +ld.shared.u32 r2976, [r3066+404]; +ld.shared.u32 r2985, [r3066+416]; +ld.shared.u32 r2988, [r3066+420]; +ld.shared.u32 r2997, [r3066+432]; +ld.shared.u32 r3000, [r3066+436]; +ld.shared.u32 r3009, [r3066+448]; +ld.shared.u32 r3012, [r3066+452]; +ld.shared.u32 r3021, [r3066+464]; +ld.shared.u32 r3024, [r3066+468]; +ld.shared.u32 r3033, [r3066+480]; +ld.shared.u32 r3036, [r3066+484]; +ld.shared.u32 r3045, [r3066+496]; +ld.shared.u32 r3048, [r3066+500]; +{ +add.f16x2 %0, r2864, r2865; +} +{ +add.f16x2 %1, r2867, r2868; +} +{ +sub.f16x2 %32, r2864, r2865; +} +{ +sub.f16x2 %33, r2867, r2868; +} +{ +add.f16x2 %2, r2876, r2877; +} +{ +add.f16x2 %3, r2879, r2880; +} +{ +sub.f16x2 %34, r2876, r2877; +} +{ +sub.f16x2 %35, r2879, r2880; +} +{ +add.f16x2 %4, r2888, r2889; +} +{ +add.f16x2 %5, r2891, r2892; +} +{ +sub.f16x2 %36, r2888, r2889; +} +{ +sub.f16x2 %37, r2891, r2892; +} +{ +add.f16x2 %6, r2900, r2901; +} +{ +add.f16x2 %7, r2903, r2904; +} +{ +sub.f16x2 %38, r2900, r2901; +} +{ +sub.f16x2 %39, r2903, r2904; +} +{ +add.f16x2 %8, r2912, r2913; +} +{ +add.f16x2 %9, r2915, r2916; +} +{ +sub.f16x2 %40, r2912, r2913; +} +{ +sub.f16x2 %41, r2915, r2916; +} +{ +add.f16x2 %10, r2924, r2925; +} +{ +add.f16x2 %11, r2927, r2928; +} +{ +sub.f16x2 %42, r2924, r2925; +} +{ +sub.f16x2 %43, r2927, r2928; +} +{ +add.f16x2 %12, r2936, r2937; +} +{ +add.f16x2 %13, r2939, r2940; +} +{ +sub.f16x2 %44, r2936, r2937; +} +{ +sub.f16x2 %45, r2939, r2940; +} +{ +add.f16x2 %14, r2948, r2949; +} +{ +add.f16x2 %15, r2951, r2952; +} +{ +sub.f16x2 %46, r2948, r2949; +} +{ +sub.f16x2 %47, r2951, r2952; +} +{ +add.f16x2 %16, r2960, r2961; +} +{ +add.f16x2 %17, r2963, r2964; +} +{ +sub.f16x2 %48, r2960, r2961; +} +{ +sub.f16x2 %49, r2963, r2964; +} +{ +add.f16x2 %18, r2972, r2973; +} +{ +add.f16x2 %19, r2975, r2976; +} +{ +sub.f16x2 %50, r2972, r2973; +} +{ +sub.f16x2 %51, r2975, r2976; +} +{ +add.f16x2 %20, r2984, r2985; +} +{ +add.f16x2 %21, 
r2987, r2988; +} +{ +sub.f16x2 %52, r2984, r2985; +} +{ +sub.f16x2 %53, r2987, r2988; +} +{ +add.f16x2 %22, r2996, r2997; +} +{ +add.f16x2 %23, r2999, r3000; +} +{ +sub.f16x2 %54, r2996, r2997; +} +{ +sub.f16x2 %55, r2999, r3000; +} +{ +add.f16x2 %24, r3008, r3009; +} +{ +add.f16x2 %25, r3011, r3012; +} +{ +sub.f16x2 %56, r3008, r3009; +} +{ +sub.f16x2 %57, r3011, r3012; +} +{ +add.f16x2 %26, r3020, r3021; +} +{ +add.f16x2 %27, r3023, r3024; +} +{ +sub.f16x2 %58, r3020, r3021; +} +{ +sub.f16x2 %59, r3023, r3024; +} +{ +add.f16x2 %28, r3032, r3033; +} +{ +add.f16x2 %29, r3035, r3036; +} +{ +sub.f16x2 %60, r3032, r3033; +} +{ +sub.f16x2 %61, r3035, r3036; +} +{ +add.f16x2 %30, r3044, r3045; +} +{ +add.f16x2 %31, r3047, r3048; +} +{ +sub.f16x2 %62, r3044, r3045; +} +{ +sub.f16x2 %63, r3047, r3048; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), 
"=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), 
"r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + /* cufftdx_private_function<996, __half2, 1>: a single inline-PTX asm block that transforms rmem[0] and rmem[1] in place, using smem as a shared-memory base address. Operands: %0-%3 are the outputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y; %4 is smem; %5-%8 are the same four rmem values read as inputs. Per-thread base address = smem + (tid.y << 9) + ((tid.x << 4) & -512). Flow: an add/sub butterfly on the inputs, then five rounds of [scale the difference by a twiddle packed as (cos, -sin) from cos.approx/sin.approx, barrier.sync 0, st.shared, barrier.sync 0, ld.shared at offsets 0, 4, 256, 260, add/sub butterfly]; the final butterfly writes %0-%3. Each twiddle angle is a bit-field of tid.x times 0f3DC90FDB, 0f3E490FDB, 0f3EC90FDB, 0f3F490FDB, 0f3FC90FDB in turn (float bit patterns of roughly pi/32, pi/16, pi/8, pi/4, pi/2). NOTE(review): the ten barrier.sync 0 instructions presumably require every participating thread of the block to reach this call together - TODO confirm against callers. NOTE(review): looks like machine-generated code for a radix-2 FFT step; confirm and prefer regenerating over hand-editing. */ +template<> __forceinline__ __device__ void cufftdx_private_function<996, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<46>; +.reg .b32 r<315>; +.reg .b64 rd<2>; +mov.u32 r273, %tid.y; +shl.b32 r274, r273, 9; +mov.u32 r275, %4; +add.s32 r276, r275, r274; +mov.u32 r277, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r278, r277, 31; +shl.b32 r279, r277, 4; +and.b32 r280, r279, -512; +add.s32 r281, r276, r280; +cvt.rn.f32.u32 f31, r278; +mul.f32 f32, f31, 0f3DC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2
r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r282, r279, 496; +add.s32 r283, r281, r282; +st.shared.v2.f32 [r283], {r1, r4}; +st.shared.v2.f32 [r283+8], {r23, r32}; +barrier.sync 0; +shl.b32 r284, r277, 3; +and.b32 r285, r284, 248; +sub.s32 r286, r283, r285; +ld.shared.u32 r54, [r286]; +ld.shared.u32 r57, [r286+4]; +ld.shared.u32 r55, [r286+256]; +ld.shared.u32 r58, [r286+260]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r287, r277, 1, 4; +cvt.rn.f32.u32 f34, r287; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f7, f35; +sin.approx.f32 f36, f35; +neg.f32 f8, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r288, r284, 8; +add.s32 r289, r281, r288; +barrier.sync 0; +and.b32 r290, r279, 480; +add.s32 r291, r289, r290; +st.shared.u32 [r291], r53; +st.shared.u32 [r291+4], r56; +st.shared.u32 [r291+16], r75; +st.shared.u32 [r291+20], r84; +barrier.sync 0; +and.b32 r292, r284, 240; +sub.s32 r293, r291, r292; +ld.shared.u32 r106, [r293]; +ld.shared.u32 r109, [r293+4]; +ld.shared.u32 r107, [r293+256]; +ld.shared.u32 r110, [r293+260]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r294, r277, 2, 3; +cvt.rn.f32.u32 f37, r294; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13;
+cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +and.b32 r295, r284, 24; +add.s32 r296, r281, r295; +barrier.sync 0; +and.b32 r297, r279, 448; +add.s32 r298, r296, r297; +st.shared.u32 [r298], r105; +st.shared.u32 [r298+4], r108; +st.shared.u32 [r298+32], r127; +st.shared.u32 [r298+36], r136; +barrier.sync 0; +and.b32 r299, r284, 224; +sub.s32 r300, r298, r299; +ld.shared.u32 r158, [r300]; +ld.shared.u32 r161, [r300+4]; +ld.shared.u32 r159, [r300+256]; +ld.shared.u32 r162, [r300+260]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r301, r277, 3, 2; +cvt.rn.f32.u32 f40, r301; +mul.f32 f41, f40, 0f3F490FDB; +cos.approx.f32 f19, f41; +sin.approx.f32 f42, f41; +neg.f32 f20, f42; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +and.b32 r302, r284, 56; +add.s32 r303, r281, r302; +barrier.sync 0; +and.b32 r304, r279, 384; +add.s32 r305, r303, r304; +st.shared.u32 [r305], r157; +st.shared.u32 [r305+4], r160; +st.shared.u32 [r305+64], r179; +st.shared.u32 [r305+68], r188; +barrier.sync 0; +and.b32 r306, r284, 192; +sub.s32 r307, r305, r306; +ld.shared.u32 r210, [r307]; +ld.shared.u32 r213, [r307+4];
+ld.shared.u32 r211, [r307+256]; +ld.shared.u32 r214, [r307+260]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r308, r277, 4, 1; +cvt.rn.f32.u32 f43, r308; +mul.f32 f44, f43, 0f3FC90FDB; +cos.approx.f32 f25, f44; +sin.approx.f32 f45, f44; +neg.f32 f26, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +and.b32 r309, r284, 120; +add.s32 r310, r281, r309; +barrier.sync 0; +and.b32 r311, r279, 256; +add.s32 r312, r310, r311; +st.shared.u32 [r312], r209; +st.shared.u32 [r312+4], r212; +st.shared.u32 [r312+128], r231; +st.shared.u32 [r312+132], r240; +barrier.sync 0; +and.b32 r313, r284, 128; +sub.s32 r314, r312, r313; +ld.shared.u32 r262, [r314]; +ld.shared.u32 r265, [r314+4]; +ld.shared.u32 r263, [r314+256]; +ld.shared.u32 r266, [r314+260]; +{ +add.f16x2 %0, r262, r263; +} +{ +add.f16x2 %1, r265, r266; +} +{ +sub.f16x2 %2, r262, r263; +} +{ +sub.f16x2 %3, r265, r266; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + /* cufftdx_private_function<997, __half2, 1>: a single inline-PTX asm block that transforms rmem[0] and rmem[1] in place, using smem as a shared-memory base address. Operands: %0-%3 are the outputs rmem[0].x, rmem[0].y, rmem[1].x, rmem[1].y; %4 is smem; %5-%8 are the same four rmem values read as inputs. Per-thread base address = smem + (tid.y << 8) + ((tid.x << 3) & -256). Flow: an add/sub butterfly on the inputs, then five rounds in which the difference is scaled by a twiddle packed as (cos, -sin) from cos.approx/sin.approx and the values are exchanged through shared memory in two passes over the same addresses: first the x-derived pair (barrier.sync 0, st.shared, barrier.sync 0, ld.shared at offsets 0 and 128), then the y-derived pair (same sequence), followed by an add/sub butterfly; the final butterfly writes %0-%3. Each twiddle angle is a bit-field of tid.x times 0f3DC90FDB, 0f3E490FDB, 0f3EC90FDB, 0f3F490FDB, 0f3FC90FDB in turn (float bit patterns of roughly pi/32, pi/16, pi/8, pi/4, pi/2). NOTE(review): the twenty barrier.sync 0 instructions presumably require every participating thread of the block to reach this call together - TODO confirm against callers. NOTE(review): looks like machine-generated code for a radix-2 FFT step; confirm and prefer regenerating over hand-editing. */ +template<> __forceinline__ __device__ void cufftdx_private_function<997, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<46>; +.reg .b32 r<315>; +.reg .b64 rd<2>; +mov.u32 r273, %tid.y; +shl.b32 r274, r273, 8; +mov.u32 r275, %4; +add.s32 r276, r275,
r274; +mov.u32 r277, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r278, r277, 31; +shl.b32 r279, r277, 3; +and.b32 r280, r279, -256; +add.s32 r281, r276, r280; +cvt.rn.f32.u32 f31, r278; +mul.f32 f32, f31, 0f3DC90FDB; +cos.approx.f32 f1, f32; +sin.approx.f32 f33, f32; +neg.f32 f2, f33; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r282, r279, 248; +add.s32 r283, r281, r282; +st.shared.v2.f32 [r283], {r1, r23}; +barrier.sync 0; +shl.b32 r284, r277, 2; +and.b32 r285, r284, 124; +sub.s32 r286, r283, r285; +ld.shared.u32 r54, [r286]; +ld.shared.u32 r55, [r286+128]; +barrier.sync 0; +st.shared.v2.f32 [r283], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r286]; +ld.shared.u32 r58, [r286+128]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r287, r277, 1, 4; +and.b32 r288, r284, 4; +add.s32 r289, r281, r288; +cvt.rn.f32.u32 f34, r287; +mul.f32 f35, f34, 0f3E490FDB; +cos.approx.f32 f7, f35; +sin.approx.f32 f36, f35; +neg.f32 f8, f36; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0;
+and.b32 r290, r279, 240; +add.s32 r291, r289, r290; +st.shared.u32 [r291], r53; +st.shared.u32 [r291+8], r75; +barrier.sync 0; +and.b32 r292, r284, 120; +sub.s32 r293, r291, r292; +ld.shared.u32 r106, [r293]; +ld.shared.u32 r107, [r293+128]; +barrier.sync 0; +st.shared.u32 [r291], r56; +st.shared.u32 [r291+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r293]; +ld.shared.u32 r110, [r293+128]; +{ +add.f16x2 r105, r106, r107; +} +{ +add.f16x2 r108, r109, r110; +} +{ +sub.f16x2 r111, r106, r107; +} +{ +sub.f16x2 r114, r109, r110; +} +bfe.u32 r294, r277, 2, 3; +and.b32 r295, r284, 12; +add.s32 r296, r281, r295; +cvt.rn.f32.u32 f37, r294; +mul.f32 f38, f37, 0f3EC90FDB; +cos.approx.f32 f13, f38; +sin.approx.f32 f39, f38; +neg.f32 f14, f39; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f13; +cvt.rn.f16.f32 high, f14; +mov.b32 r117, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r120, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r117; +mov.b32 r122, {high, high}; +} +{ +mul.f16x2 r124, r114, r122; +} +{ +fma.rn.f16x2 r127, r111, r120, r124; +} +{ +mul.f16x2 r131, r111, r122; +} +{ +neg.f16x2 r134, r131; +} +{ +fma.rn.f16x2 r136, r114, r120, r134; +} +barrier.sync 0; +and.b32 r297, r279, 224; +add.s32 r298, r296, r297; +st.shared.u32 [r298], r105; +st.shared.u32 [r298+16], r127; +barrier.sync 0; +and.b32 r299, r284, 112; +sub.s32 r300, r298, r299; +ld.shared.u32 r158, [r300]; +ld.shared.u32 r159, [r300+128]; +barrier.sync 0; +st.shared.u32 [r298], r108; +st.shared.u32 [r298+16], r136; +barrier.sync 0; +ld.shared.u32 r161, [r300]; +ld.shared.u32 r162, [r300+128]; +{ +add.f16x2 r157, r158, r159; +} +{ +add.f16x2 r160, r161, r162; +} +{ +sub.f16x2 r163, r158, r159; +} +{ +sub.f16x2 r166, r161, r162; +} +bfe.u32 r301, r277, 3, 2; +and.b32 r302, r284, 28; +add.s32 r303, r281, r302; +cvt.rn.f32.u32 f40, r301; +mul.f32 f41, f40, 0f3F490FDB; +cos.approx.f32 f19, f41; +sin.approx.f32 f42, f41; +neg.f32 f20, f42; +{ +.reg .f16 low,
high; +cvt.rn.f16.f32 low, f19; +cvt.rn.f16.f32 high, f20; +mov.b32 r169, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r172, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r169; +mov.b32 r174, {high, high}; +} +{ +mul.f16x2 r176, r166, r174; +} +{ +fma.rn.f16x2 r179, r163, r172, r176; +} +{ +mul.f16x2 r183, r163, r174; +} +{ +neg.f16x2 r186, r183; +} +{ +fma.rn.f16x2 r188, r166, r172, r186; +} +barrier.sync 0; +and.b32 r304, r279, 192; +add.s32 r305, r303, r304; +st.shared.u32 [r305], r157; +st.shared.u32 [r305+32], r179; +barrier.sync 0; +and.b32 r306, r284, 96; +sub.s32 r307, r305, r306; +ld.shared.u32 r210, [r307]; +ld.shared.u32 r211, [r307+128]; +barrier.sync 0; +st.shared.u32 [r305], r160; +st.shared.u32 [r305+32], r188; +barrier.sync 0; +ld.shared.u32 r213, [r307]; +ld.shared.u32 r214, [r307+128]; +{ +add.f16x2 r209, r210, r211; +} +{ +add.f16x2 r212, r213, r214; +} +{ +sub.f16x2 r215, r210, r211; +} +{ +sub.f16x2 r218, r213, r214; +} +bfe.u32 r308, r277, 4, 1; +and.b32 r309, r284, 60; +add.s32 r310, r281, r309; +cvt.rn.f32.u32 f43, r308; +mul.f32 f44, f43, 0f3FC90FDB; +cos.approx.f32 f25, f44; +sin.approx.f32 f45, f44; +neg.f32 f26, f45; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f25; +cvt.rn.f16.f32 high, f26; +mov.b32 r221, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r221; +mov.b32 r226, {high, high}; +} +{ +mul.f16x2 r228, r218, r226; +} +{ +fma.rn.f16x2 r231, r215, r224, r228; +} +{ +mul.f16x2 r235, r215, r226; +} +{ +neg.f16x2 r238, r235; +} +{ +fma.rn.f16x2 r240, r218, r224, r238; +} +barrier.sync 0; +and.b32 r311, r279, 128; +add.s32 r312, r310, r311; +st.shared.u32 [r312], r209; +st.shared.u32 [r312+64], r231; +barrier.sync 0; +and.b32 r313, r284, 64; +sub.s32 r314, r312, r313; +ld.shared.u32 r262, [r314]; +ld.shared.u32 r263, [r314+128]; +barrier.sync 0; +st.shared.u32 [r312], r212; +st.shared.u32
[r312+64], r240; +barrier.sync 0; +ld.shared.u32 r265, [r314]; +ld.shared.u32 r266, [r314+128]; +{ +add.f16x2 %0, r262, r263; +} +{ +add.f16x2 %1, r265, r266; +} +{ +sub.f16x2 %2, r262, r263; +} +{ +sub.f16x2 %3, r265, r266; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<998, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<428>; +.reg .b32 r<3131>; +.reg .b64 rd<3>; +mov.u32 r3055, %tid.y; +shl.b32 r3056, r3055, 8; +mov.u32 r3057, %64; +add.s32 r3058, r3057, r3056; +mov.u32 r3059, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %106, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %106, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f280, 0f3F3504F3; +{ +.reg .f16 low, high;
+cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r102, {low, high}; +} +mov.f32 f278, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %84, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %84, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 
r250, %117, %108; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %108; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f272, 0f3F6C835E; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r393, {low, high}; +} +mov.f32 f288, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r398, {low, high}; +} +mov.f32 f270, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r404, {low, high}; +} +mov.f32 f286, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, 
r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 
r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %107, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %107, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; 
+} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %109, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %109, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %83; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %83; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; 
+} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1021, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; 
+} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f268, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1233, {low, high}; +} +mov.f32 f292, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1236, {low, high}; +} +mov.f32 f276, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1237, {low, high}; +} +mov.f32 f284, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1246, {low, high}; +} +mov.f32 f266, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f266; +cvt.rn.f16.f32 high, f266; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f268; +cvt.rn.f16.f32 high, f268; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f270; +cvt.rn.f16.f32 high, f270; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f272; +cvt.rn.f16.f32 high, f272; +mov.b32 r1252, {low, high}; +} +mov.f32 f274, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f274; +cvt.rn.f16.f32 high, f274; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f276; +cvt.rn.f16.f32 high, f276; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f278; +cvt.rn.f16.f32 high, f278; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f280; +cvt.rn.f16.f32 high, f280; +mov.b32 r1256, {low, high}; +} +mov.f32 f282, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f286; +cvt.rn.f16.f32 high, f286; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f288; +cvt.rn.f16.f32 high, f288; +mov.b32 r1260, {low, high}; +} +mov.f32 f290, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f290; +cvt.rn.f16.f32 high, f290; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f292; +cvt.rn.f16.f32 high, f292; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ 
+mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ 
+sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, 
r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +and.b32 r3060, r3059, 1; +shl.b32 r3061, r3059, 7; +and.b32 r3062, r3061, -256; +add.s32 r3063, r3058, r3062; +cvt.rn.f32.u32 f423, r3060; +mul.f32 f424, f423, 0f3DC90FDB; +cos.approx.f32 f357, f424; +sin.approx.f32 f425, f424; +neg.f32 f358, f425; +mov.f32 f427, 0fBF800000; +mov.f32 f426, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; 
+mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2073, {low, high}; +} +{ 
+mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; 
+mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 
r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ 
+fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ 
+fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f427; +cvt.rn.f16.f32 high, f426; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} 
+barrier.sync 0; +and.b32 r3064, r3061, 128; +add.s32 r3065, r3063, r3064; +st.shared.v4.f32 [r3065], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r3065+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r3065+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r3065+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r3065+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r3065+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r3065+96], {r2574, r2611, r2648, r2685}; +st.shared.v4.f32 [r3065+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r3066, r3060, -124, r3065; +ld.shared.u32 r2864, [r3066]; +ld.shared.u32 r2876, [r3066+8]; +ld.shared.u32 r2888, [r3066+16]; +ld.shared.u32 r2900, [r3066+24]; +ld.shared.u32 r2912, [r3066+32]; +ld.shared.u32 r2924, [r3066+40]; +ld.shared.u32 r2936, [r3066+48]; +ld.shared.u32 r2948, [r3066+56]; +ld.shared.u32 r2960, [r3066+64]; +ld.shared.u32 r2972, [r3066+72]; +ld.shared.u32 r2984, [r3066+80]; +ld.shared.u32 r2996, [r3066+88]; +ld.shared.u32 r3008, [r3066+96]; +ld.shared.u32 r3020, [r3066+104]; +ld.shared.u32 r3032, [r3066+112]; +ld.shared.u32 r3044, [r3066+120]; +ld.shared.u32 r2865, [r3066+128]; +ld.shared.u32 r2877, [r3066+136]; +ld.shared.u32 r2889, [r3066+144]; +ld.shared.u32 r2901, [r3066+152]; +ld.shared.u32 r2913, [r3066+160]; +ld.shared.u32 r2925, [r3066+168]; +ld.shared.u32 r2937, [r3066+176]; +ld.shared.u32 r2949, [r3066+184]; +ld.shared.u32 r2961, [r3066+192]; +ld.shared.u32 r2973, [r3066+200]; +ld.shared.u32 r2985, [r3066+208]; +ld.shared.u32 r2997, [r3066+216]; +ld.shared.u32 r3009, [r3066+224]; +ld.shared.u32 r3021, [r3066+232]; +ld.shared.u32 r3033, [r3066+240]; +ld.shared.u32 r3045, [r3066+248]; +barrier.sync 0; +st.shared.v4.f32 [r3065], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r3065+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r3065+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r3065+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r3065+64], 
{r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r3065+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r3065+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r3065+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r3066]; +ld.shared.u32 r2879, [r3066+8]; +ld.shared.u32 r2891, [r3066+16]; +ld.shared.u32 r2903, [r3066+24]; +ld.shared.u32 r2915, [r3066+32]; +ld.shared.u32 r2927, [r3066+40]; +ld.shared.u32 r2939, [r3066+48]; +ld.shared.u32 r2951, [r3066+56]; +ld.shared.u32 r2963, [r3066+64]; +ld.shared.u32 r2975, [r3066+72]; +ld.shared.u32 r2987, [r3066+80]; +ld.shared.u32 r2999, [r3066+88]; +ld.shared.u32 r3011, [r3066+96]; +ld.shared.u32 r3023, [r3066+104]; +ld.shared.u32 r3035, [r3066+112]; +ld.shared.u32 r3047, [r3066+120]; +ld.shared.u32 r2868, [r3066+128]; +ld.shared.u32 r2880, [r3066+136]; +ld.shared.u32 r2892, [r3066+144]; +ld.shared.u32 r2904, [r3066+152]; +ld.shared.u32 r2916, [r3066+160]; +ld.shared.u32 r2928, [r3066+168]; +ld.shared.u32 r2940, [r3066+176]; +ld.shared.u32 r2952, [r3066+184]; +ld.shared.u32 r2964, [r3066+192]; +ld.shared.u32 r2976, [r3066+200]; +ld.shared.u32 r2988, [r3066+208]; +ld.shared.u32 r3000, [r3066+216]; +ld.shared.u32 r3012, [r3066+224]; +ld.shared.u32 r3024, [r3066+232]; +ld.shared.u32 r3036, [r3066+240]; +ld.shared.u32 r3048, [r3066+248]; +{ +add.f16x2 %0, r2864, r2865; +} +{ +add.f16x2 %1, r2867, r2868; +} +{ +sub.f16x2 %32, r2864, r2865; +} +{ +sub.f16x2 %33, r2867, r2868; +} +{ +add.f16x2 %2, r2876, r2877; +} +{ +add.f16x2 %3, r2879, r2880; +} +{ +sub.f16x2 %34, r2876, r2877; +} +{ +sub.f16x2 %35, r2879, r2880; +} +{ +add.f16x2 %4, r2888, r2889; +} +{ +add.f16x2 %5, r2891, r2892; +} +{ +sub.f16x2 %36, r2888, r2889; +} +{ +sub.f16x2 %37, r2891, r2892; +} +{ +add.f16x2 %6, r2900, r2901; +} +{ +add.f16x2 %7, r2903, r2904; +} +{ +sub.f16x2 %38, r2900, r2901; +} +{ +sub.f16x2 %39, r2903, r2904; +} +{ +add.f16x2 %8, r2912, r2913; +} +{ +add.f16x2 %9, r2915, r2916; +} +{ +sub.f16x2 %40, r2912, 
r2913; +} +{ +sub.f16x2 %41, r2915, r2916; +} +{ +add.f16x2 %10, r2924, r2925; +} +{ +add.f16x2 %11, r2927, r2928; +} +{ +sub.f16x2 %42, r2924, r2925; +} +{ +sub.f16x2 %43, r2927, r2928; +} +{ +add.f16x2 %12, r2936, r2937; +} +{ +add.f16x2 %13, r2939, r2940; +} +{ +sub.f16x2 %44, r2936, r2937; +} +{ +sub.f16x2 %45, r2939, r2940; +} +{ +add.f16x2 %14, r2948, r2949; +} +{ +add.f16x2 %15, r2951, r2952; +} +{ +sub.f16x2 %46, r2948, r2949; +} +{ +sub.f16x2 %47, r2951, r2952; +} +{ +add.f16x2 %16, r2960, r2961; +} +{ +add.f16x2 %17, r2963, r2964; +} +{ +sub.f16x2 %48, r2960, r2961; +} +{ +sub.f16x2 %49, r2963, r2964; +} +{ +add.f16x2 %18, r2972, r2973; +} +{ +add.f16x2 %19, r2975, r2976; +} +{ +sub.f16x2 %50, r2972, r2973; +} +{ +sub.f16x2 %51, r2975, r2976; +} +{ +add.f16x2 %20, r2984, r2985; +} +{ +add.f16x2 %21, r2987, r2988; +} +{ +sub.f16x2 %52, r2984, r2985; +} +{ +sub.f16x2 %53, r2987, r2988; +} +{ +add.f16x2 %22, r2996, r2997; +} +{ +add.f16x2 %23, r2999, r3000; +} +{ +sub.f16x2 %54, r2996, r2997; +} +{ +sub.f16x2 %55, r2999, r3000; +} +{ +add.f16x2 %24, r3008, r3009; +} +{ +add.f16x2 %25, r3011, r3012; +} +{ +sub.f16x2 %56, r3008, r3009; +} +{ +sub.f16x2 %57, r3011, r3012; +} +{ +add.f16x2 %26, r3020, r3021; +} +{ +add.f16x2 %27, r3023, r3024; +} +{ +sub.f16x2 %58, r3020, r3021; +} +{ +sub.f16x2 %59, r3023, r3024; +} +{ +add.f16x2 %28, r3032, r3033; +} +{ +add.f16x2 %29, r3035, r3036; +} +{ +sub.f16x2 %60, r3032, r3033; +} +{ +sub.f16x2 %61, r3035, r3036; +} +{ +add.f16x2 %30, r3044, r3045; +} +{ +add.f16x2 %31, r3047, r3048; +} +{ +sub.f16x2 %62, r3044, r3045; +} +{ +sub.f16x2 %63, r3047, r3048; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), 
"r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..f726ed64b74a8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_fwd.hpp.inc @@ -0,0 +1,3954 @@ +#ifndef CUFFTDX_FFT_64_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_64_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<41, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<248>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; 
+add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 448; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; 
+sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+64]; +ld.shared.v2.f32 {f167, f168}, [r13+128]; +ld.shared.v2.f32 {f171, f172}, [r13+192]; +ld.shared.v2.f32 {f175, f176}, [r13+256]; +ld.shared.v2.f32 {f179, f180}, [r13+320]; +ld.shared.v2.f32 {f183, f184}, [r13+384]; +ld.shared.v2.f32 {f187, f188}, [r13+448]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, 
f228, f229; +add.f32 %1, f200, f216; +add.f32 %0, f199, f215; +add.f32 %3, f204, f227; +add.f32 %2, f203, f225; +sub.f32 %5, f202, f217; +add.f32 %4, f201, f218; +add.f32 %7, f206, f231; +add.f32 %6, f205, f230; +sub.f32 %9, f200, f216; +sub.f32 %8, f199, f215; +sub.f32 %11, f204, f227; +sub.f32 %10, f203, f225; +add.f32 %13, f202, f217; +sub.f32 %12, f201, f218; +sub.f32 %15, f206, f231; +sub.f32 %14, f205, f230; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<42, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<232>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; 
+sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; 
+mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 224; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+32]; +ld.shared.f32 f161, [r13+64]; +ld.shared.f32 f162, [r13+96]; +ld.shared.f32 f163, [r13+128]; +ld.shared.f32 f164, [r13+160]; +ld.shared.f32 f165, [r13+192]; +ld.shared.f32 f166, [r13+224]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+32]; +ld.shared.f32 f169, [r13+64]; +ld.shared.f32 f170, [r13+96]; +ld.shared.f32 f171, [r13+128]; +ld.shared.f32 f172, [r13+160]; +ld.shared.f32 f173, [r13+192]; +ld.shared.f32 f174, [r13+224]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; 
+add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 %0, f183, f199; +add.f32 %1, f184, f200; +add.f32 %3, f188, f211; +add.f32 %2, f187, f209; +sub.f32 %5, f186, f201; +add.f32 %4, f185, f202; +add.f32 %7, f190, f215; +add.f32 %6, f189, f214; +sub.f32 %8, f183, f199; +sub.f32 %9, f184, f200; +sub.f32 %11, f188, f211; +sub.f32 %10, f187, f209; +add.f32 %13, f186, f201; +sub.f32 %12, f185, f202; +sub.f32 %15, f190, f215; +sub.f32 %14, f189, f214; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<43, float, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<155>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 480; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+128]; +ld.shared.v2.f32 {f70, f71}, [r13+256]; +ld.shared.v2.f32 {f74, f75}, [r13+384]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, 
f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +add.f32 f88, f80, f85; +sub.f32 f89, f81, f84; +sub.f32 f90, f80, f85; +add.f32 f91, f81, f84; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f92, f88; +mul.f32 f97, f93, f89; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f101, f86; +mul.f32 f105, f103, f87; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f109, f90; +mul.f32 f113, f111, f91; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 384; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f93, f88, f98; +sub.f32 f118, f96, f97; +st.shared.v2.f32 [r19+32], {f118, f117}; +fma.rn.f32 f119, f103, f86, f106; +sub.f32 f120, f104, f105; +st.shared.v2.f32 [r19+64], {f120, f119}; +sub.f32 f121, f112, f113; +fma.rn.f32 f122, f111, f90, f114; +st.shared.v2.f32 [r19+96], {f121, f122}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+128]; +ld.shared.v2.f32 {f131, f132}, [r20+256]; +ld.shared.v2.f32 {f135, f136}, [r20+384]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +add.f32 %1, f140, f144; +add.f32 %0, f139, f143; +sub.f32 %3, f142, f145; +add.f32 %2, f141, f146; +sub.f32 %5, f140, f144; +sub.f32 %4, f139, f143; +add.f32 %7, f142, f145; +sub.f32 %6, f141, f146; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), 
"=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<44, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<139>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 240; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; 
+barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+64]; +ld.shared.f32 f64, [r13+128]; +ld.shared.f32 f65, [r13+192]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+64]; +ld.shared.f32 f68, [r13+128]; +ld.shared.f32 f69, [r13+192]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +add.f32 f82, f72, f77; +sub.f32 f83, f73, f76; +sub.f32 f84, f72, f77; +add.f32 f85, f73, f76; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f86, f82; +mul.f32 f91, f87, f83; +sub.f32 f92, f90, f91; +mul.f32 f93, f86, f83; +fma.rn.f32 f94, f87, f82, f93; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +sub.f32 f102, f100, f101; +mul.f32 f103, f97, f81; +fma.rn.f32 f104, f99, f80, f103; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f107, f84; +mul.f32 f111, f109, f85; +sub.f32 f112, f110, f111; +mul.f32 f113, f107, f85; +fma.rn.f32 f114, f109, f84, f113; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 192; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f92; +st.shared.f32 [r20+32], f102; +st.shared.f32 [r20+48], f112; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; +ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+64]; +ld.shared.f32 f117, [r21+128]; +ld.shared.f32 f118, [r21+192]; 
+barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+64]; +ld.shared.f32 f121, [r21+128]; +ld.shared.f32 f122, [r21+192]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 %0, f123, f127; +add.f32 %1, f124, f128; +sub.f32 %3, f126, f129; +add.f32 %2, f125, f130; +sub.f32 %4, f123, f127; +sub.f32 %5, f124, f128; +add.f32 %7, f126, f129; +sub.f32 %6, f125, f130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<45, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<632>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f624, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f622, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f621, f624, f622; +sub.f32 f76, f624, f622; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f620, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f617, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, %46, %62; +add.f32 f615, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f614, f617, 
f615; +sub.f32 f92, f617, f615; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f613, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f613, 0fBF3504F3; +mul.f32 f612, f93, 0f3F3504F3; +sub.f32 f99, f612, f98; +mul.f32 f100, f613, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f611, f621, f614; +sub.f32 f109, f621, f614; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f610, f620, f101; +sub.f32 f113, f620, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f609, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f608, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f606, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f603, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f602, f606, f603; +sub.f32 f133, f606, f603; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f601, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f599, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f597, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f596, f599, f597; +sub.f32 f149, f599, f597; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f595, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f595, 0fBF3504F3; +mul.f32 f594, f150, 0f3F3504F3; +sub.f32 f156, f594, f155; +mul.f32 f157, f595, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; 
+add.f32 f593, f602, f596; +sub.f32 f166, f602, f596; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f592, f601, f158; +sub.f32 f170, f601, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f591, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f590, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f588, f167, 0f3F6C835E; +mul.f32 f589, f592, 0fBEC3EF15; +sub.f32 f181, f588, f589; +mul.f32 f182, f592, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f586, f171, 0f3F3504F3; +mul.f32 f587, f591, 0fBF3504F3; +sub.f32 f186, f586, f587; +mul.f32 f187, f591, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f584, f175, 0f3EC3EF15; +mul.f32 f585, f590, 0fBF6C835E; +sub.f32 f191, f584, f585; +mul.f32 f192, f590, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f582, f169, 0fBEC3EF15; +mul.f32 f583, f170, 0fBF6C835E; +sub.f32 f196, f582, f583; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f580, f177, 0fBF6C835E; +mul.f32 f581, f178, 0fBEC3EF15; +sub.f32 f205, f580, f581; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f579, f610, f183; +sub.f32 f213, f610, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f578, f609, f188; +sub.f32 f217, f609, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f577, f608, f193; +sub.f32 f221, f608, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f576, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, f112, f196; +add.f32 f575, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f574, f117, f202; +sub.f32 f233, f117, f202; +add.f32 
f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f573, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f579; +mul.f32 f244, f238, f579; +mul.f32 f246, f239, f239; +mul.f32 f572, f238, f238; +sub.f32 f247, f572, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f578; +mul.f32 f252, f247, f578; +mul.f32 f570, f238, f247; +mul.f32 f571, f239, f249; +sub.f32 f255, f570, f571; +mul.f32 f569, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f577; +mul.f32 f260, f255, f577; +mul.f32 f262, f239, f257; +mul.f32 f568, f238, f255; +sub.f32 f263, f568, f262; +mul.f32 f567, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f576; +mul.f32 f268, f263, f576; +mul.f32 f270, f239, f265; +mul.f32 f566, f238, f263; +sub.f32 f271, f566, f270; +mul.f32 f565, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f575; +mul.f32 f276, f271, f575; +mul.f32 f563, f238, f271; +mul.f32 f564, f239, f273; +sub.f32 f279, f563, f564; +mul.f32 f562, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f574; +mul.f32 f284, f279, f574; +mul.f32 f286, f239, f281; +mul.f32 f561, f238, f279; +sub.f32 f287, f561, f286; +mul.f32 f560, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f573; +mul.f32 f292, f287, f573; +mul.f32 f294, f239, f289; +mul.f32 f559, f238, f287; +sub.f32 f295, f559, f294; +mul.f32 f558, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f557, f611, f593; +mul.f32 f299, f297, f557; +mul.f32 f300, f295, f557; +mul.f32 
f555, f238, f295; +mul.f32 f556, f239, f297; +sub.f32 f303, f555, f556; +sub.f32 f554, f106, f163; +mul.f32 f553, f295, f554; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f552, f238, f303; +sub.f32 f311, f552, f310; +mul.f32 f551, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f549, f238, f311; +mul.f32 f550, f239, f313; +sub.f32 f319, f549, f550; +mul.f32 f548, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f547, f238, f319; +sub.f32 f327, f547, f326; +mul.f32 f546, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f545, f238, f327; +sub.f32 f335, f545, f334; +mul.f32 f544, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f542, f238, f335; +mul.f32 f543, f239, f337; +sub.f32 f343, f542, f543; +mul.f32 f541, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f540, f238, f343; +sub.f32 f351, f540, f350; +mul.f32 f539, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f538, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f629, f611, f593; +mul.f32 f628, f297, f629; +mov.u32 r21, %tid.x; +shl.b32 r20, r21, 7; +barrier.sync 0; +and.b32 r11, r20, 384; +add.s32 r12, r9, r11; +sub.f32 f631, f611, f593; +mul.f32 f630, f297, f631; +add.f32 f357, f611, f593; +sub.f32 f627, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 3; +mov.u32 
r19, %tid.x; +and.b32 r18, r19, 3; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f539, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f569, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f567, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f565, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f562, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f560, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f558, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f627, f300; +sub.f32 f374, f553, f630; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f551, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f548, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f546, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f544, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f541, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f538, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +mad.lo.s32 r13, r18, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+32]; +ld.shared.v2.f32 {f397, f398}, [r13+64]; +ld.shared.v2.f32 {f401, f402}, [r13+96]; +ld.shared.v2.f32 {f405, f406}, [r13+128]; +ld.shared.v2.f32 {f409, f410}, [r13+160]; +ld.shared.v2.f32 {f413, f414}, [r13+192]; +ld.shared.v2.f32 {f417, f418}, [r13+224]; +ld.shared.v2.f32 {f421, f422}, [r13+256]; +ld.shared.v2.f32 {f425, f426}, [r13+288]; +ld.shared.v2.f32 {f429, f430}, [r13+320]; +ld.shared.v2.f32 {f433, f434}, [r13+352]; +ld.shared.v2.f32 {f437, 
f438}, [r13+384]; +ld.shared.v2.f32 {f441, f442}, [r13+416]; +ld.shared.v2.f32 {f445, f446}, [r13+448]; +ld.shared.v2.f32 {f449, f450}, [r13+480]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f537, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f536, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f393, f425; +sub.f32 f463, f393, f425; +add.f32 f535, f394, f426; +sub.f32 f464, f394, f426; +add.f32 f465, f409, f441; +sub.f32 f467, f409, f441; +add.f32 f534, f410, f442; +sub.f32 f468, f410, f442; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f533, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f532, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f401, f433; +sub.f32 f479, f401, f433; +add.f32 f531, f402, f434; +sub.f32 f480, f402, f434; +add.f32 f481, f417, f449; +sub.f32 f483, f417, f449; +add.f32 f530, f418, f450; +sub.f32 f484, f418, f450; +add.f32 %1, f537, f536; +add.f32 %0, f453, f457; +add.f32 %3, f535, f534; +add.f32 %2, f461, f465; +add.f32 %4, f469, f473; +add.f32 %5, f533, f532; +add.f32 %6, f477, f481; +add.f32 %7, f531, f530; +add.f32 %8, f455, f460; +sub.f32 %9, f456, f459; +sub.f32 %11, f464, f467; +add.f32 %10, f463, f468; +sub.f32 %13, f472, f475; +add.f32 %12, f471, f476; +sub.f32 %15, f480, f483; +add.f32 %14, f479, f484; +sub.f32 %17, f537, f536; +sub.f32 %16, f453, f457; +sub.f32 %19, f535, f534; +sub.f32 %18, f461, f465; +sub.f32 %21, f533, f532; +sub.f32 %20, f469, f473; +sub.f32 %23, f531, f530; +sub.f32 %22, f477, f481; +add.f32 %25, f456, f459; +sub.f32 %24, f455, f460; +add.f32 %27, f464, f467; +sub.f32 %26, f463, f468; +add.f32 %29, f472, f475; +sub.f32 %28, f471, f476; +add.f32 %31, f480, f483; +sub.f32 %30, f479, f484; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<46, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<485>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; +add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; 
+sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; 
+add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 
f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, 
f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; 
+mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 192; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+16]; +ld.shared.f32 f391, [r13+32]; +ld.shared.f32 f392, [r13+48]; +ld.shared.f32 f393, [r13+64]; +ld.shared.f32 f394, [r13+80]; +ld.shared.f32 f395, [r13+96]; +ld.shared.f32 f396, [r13+112]; +ld.shared.f32 f397, [r13+128]; +ld.shared.f32 f398, [r13+144]; +ld.shared.f32 f399, [r13+160]; +ld.shared.f32 f400, [r13+176]; +ld.shared.f32 f401, [r13+192]; +ld.shared.f32 f402, [r13+208]; +ld.shared.f32 f403, [r13+224]; +ld.shared.f32 f404, [r13+240]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+16]; +ld.shared.f32 f407, [r13+32]; +ld.shared.f32 f408, [r13+48]; +ld.shared.f32 f409, [r13+64]; +ld.shared.f32 f410, [r13+80]; +ld.shared.f32 f411, [r13+96]; 
+ld.shared.f32 f412, [r13+112]; +ld.shared.f32 f413, [r13+128]; +ld.shared.f32 f414, [r13+144]; +ld.shared.f32 f415, [r13+160]; +ld.shared.f32 f416, [r13+176]; +ld.shared.f32 f417, [r13+192]; +ld.shared.f32 f418, [r13+208]; +ld.shared.f32 f419, [r13+224]; +ld.shared.f32 f420, [r13+240]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f390, f398; +add.f32 f430, f406, f414; +sub.f32 f431, f390, f398; +sub.f32 f432, f406, f414; +add.f32 f433, f394, f402; +add.f32 f434, f410, f418; +sub.f32 f435, f394, f402; +sub.f32 f436, f410, f418; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f392, f400; +add.f32 f446, f408, f416; +sub.f32 f447, f392, f400; +sub.f32 f448, f408, f416; +add.f32 f449, f396, f404; +add.f32 f450, f412, f420; +sub.f32 f451, f396, f404; +sub.f32 f452, f412, f420; +add.f32 %0, f421, f425; +add.f32 %1, f422, f426; +add.f32 %2, f429, f433; +add.f32 %3, f430, f434; +add.f32 %4, f437, f441; +add.f32 %5, f438, f442; +add.f32 %6, f445, f449; +add.f32 %7, f446, f450; +sub.f32 %9, f424, f427; +add.f32 %8, f423, f428; +sub.f32 %11, f432, f435; +add.f32 %10, f431, f436; +sub.f32 %13, f440, f443; +add.f32 %12, f439, f444; +sub.f32 %15, f448, f451; +add.f32 %14, f447, f452; +sub.f32 %16, f421, f425; +sub.f32 %17, f422, f426; +sub.f32 %18, f429, f433; +sub.f32 %19, f430, f434; +sub.f32 %20, f437, f441; +sub.f32 %21, f438, f442; +sub.f32 %22, f445, f449; +sub.f32 %23, f446, f450; +add.f32 %25, f424, f427; +sub.f32 %24, f423, f428; +add.f32 %27, f432, f435; +sub.f32 %26, f431, f436; +add.f32 %29, f440, f443; +sub.f32 %28, f439, f444; +add.f32 %31, f448, f451; +sub.f32 %30, f447, f452; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<47, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1352>; +.reg .b32 r<24>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1347, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1345, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1344, f1347, f1345; +sub.f32 f140, f1347, f1345; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1343, f132, f135; +add.f32 f144, f132, f135; 
+add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1340, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1338, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1337, f1340, f1338; +sub.f32 f156, f1340, f1338; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1336, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1336, 0fBF3504F3; +mul.f32 f1335, f157, 0f3F3504F3; +sub.f32 f163, f1335, f162; +mul.f32 f164, f1336, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1334, f1344, f1337; +sub.f32 f173, f1344, f1337; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1333, f1343, f165; +sub.f32 f177, f1343, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1332, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1331, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1329, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1326, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1325, f1329, f1326; +sub.f32 f197, f1329, f1326; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1324, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1322, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1320, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1319, f1322, f1320; +sub.f32 f213, f1322, f1320; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1318, 
f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1318, 0fBF3504F3; +mul.f32 f1317, f214, 0f3F3504F3; +sub.f32 f220, f1317, f219; +mul.f32 f221, f1318, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1316, f1325, f1319; +sub.f32 f230, f1325, f1319; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1315, f1324, f222; +sub.f32 f234, f1324, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1314, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1313, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1311, f231, 0f3F6C835E; +mul.f32 f1312, f1315, 0fBEC3EF15; +sub.f32 f245, f1311, f1312; +mul.f32 f246, f1315, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1309, f235, 0f3F3504F3; +mul.f32 f1310, f1314, 0fBF3504F3; +sub.f32 f250, f1309, f1310; +mul.f32 f251, f1314, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1307, f239, 0f3EC3EF15; +mul.f32 f1308, f1313, 0fBF6C835E; +sub.f32 f255, f1307, f1308; +mul.f32 f256, f1313, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1305, f233, 0fBEC3EF15; +mul.f32 f1306, f234, 0fBF6C835E; +sub.f32 f260, f1305, f1306; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1303, f241, 0fBF6C835E; +mul.f32 f1304, f242, 0fBEC3EF15; +sub.f32 f269, f1303, f1304; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1302, f1334, f1316; +sub.f32 f275, f1334, f1316; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1301, f1333, f247; +sub.f32 f279, f1333, f247; +add.f32 f280, 
f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1300, f1332, f252; +sub.f32 f283, f1332, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1299, f1331, f257; +sub.f32 f287, f1331, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1298, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1297, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f1296, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1295, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1292, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1290, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1289, f1292, f1290; +sub.f32 f315, f1292, f1290; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1288, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1286, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1283, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1282, f1286, f1283; +sub.f32 f331, f1286, f1283; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1281, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1279, f332, 0f3F3504F3; +mul.f32 f1280, f1281, 0fBF3504F3; +sub.f32 f338, f1279, f1280; +mul.f32 f339, f1281, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1278, f1289, f1282; +sub.f32 f348, f1289, f1282; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1277, f1288, 
f340; +sub.f32 f352, f1288, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1276, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1275, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1273, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1271, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1270, f1273, f1271; +sub.f32 f372, f1273, f1271; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1269, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1266, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1265, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1264, f1266, f1265; +sub.f32 f388, f1266, f1265; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1263, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1261, f389, 0f3F3504F3; +mul.f32 f1262, f1263, 0fBF3504F3; +sub.f32 f395, f1261, f1262; +mul.f32 f396, f1263, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1260, f1270, f1264; +sub.f32 f405, f1270, f1264; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1259, f1269, f397; +sub.f32 f409, f1269, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1258, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1257, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1259, 0fBEC3EF15; +mul.f32 f1256, f406, 0f3F6C835E; +sub.f32 f420, f1256, f419; +mul.f32 f421, f1259, 0f3F6C835E; +fma.rn.f32 
f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1258, 0fBF3504F3; +mul.f32 f1255, f410, 0f3F3504F3; +sub.f32 f425, f1255, f424; +mul.f32 f426, f1258, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1253, f414, 0f3EC3EF15; +mul.f32 f1254, f1257, 0fBF6C835E; +sub.f32 f430, f1253, f1254; +mul.f32 f431, f1257, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1251, f408, 0fBEC3EF15; +mul.f32 f1252, f409, 0fBF6C835E; +sub.f32 f435, f1251, f1252; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1250, f416, 0fBF6C835E; +sub.f32 f444, f1250, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1249, f1278, f1260; +sub.f32 f450, f1278, f1260; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1248, f1277, f422; +sub.f32 f454, f1277, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1247, f1276, f427; +sub.f32 f458, f1276, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1246, f1275, f432; +sub.f32 f462, f1275, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1245, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1244, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1243, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1242, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1248, 0fBE47C5C2; +mul.f32 f1241, f451, 0f3F7B14BE; +sub.f32 f481, f1241, f480; +mul.f32 f482, f1248, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1247, 0fBEC3EF15; +mul.f32 f1240, f455, 0f3F6C835E; +sub.f32 f486, f1240, f485; +mul.f32 
f487, f1247, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1246, 0fBF0E39DA; +mul.f32 f1239, f459, 0f3F54DB31; +sub.f32 f491, f1239, f490; +mul.f32 f492, f1246, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1245, 0fBF3504F3; +mul.f32 f1238, f463, 0f3F3504F3; +sub.f32 f496, f1238, f495; +mul.f32 f497, f1245, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1236, f467, 0f3F0E39DA; +mul.f32 f1237, f1244, 0fBF54DB31; +sub.f32 f501, f1236, f1237; +mul.f32 f502, f1244, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1234, f471, 0f3EC3EF15; +mul.f32 f1235, f1243, 0fBF6C835E; +sub.f32 f506, f1234, f1235; +mul.f32 f507, f1243, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1232, f475, 0f3E47C5C2; +mul.f32 f1233, f1242, 0fBF7B14BE; +sub.f32 f511, f1232, f1233; +mul.f32 f512, f1242, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1230, f453, 0fBE47C5C2; +mul.f32 f1231, f454, 0fBF7B14BE; +sub.f32 f516, f1230, f1231; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1229, f457, 0fBEC3EF15; +sub.f32 f521, f1229, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1228, f461, 0fBF0E39DA; +sub.f32 f526, f1228, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1226, f469, 0fBF54DB31; +mul.f32 f1227, f470, 0fBF0E39DA; +sub.f32 f535, f1226, f1227; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1225, f473, 0fBF6C835E; +sub.f32 f540, f1225, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1224, f477, 0fBF7B14BE; +sub.f32 f545, f1224, 
f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1223, f1301, f483; +sub.f32 f553, f1301, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1222, f1300, f488; +sub.f32 f557, f1300, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1221, f1299, f493; +sub.f32 f561, f1299, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1220, f1298, f498; +sub.f32 f565, f1298, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f1219, f1297, f503; +sub.f32 f569, f1297, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f1218, f1296, f508; +sub.f32 f573, f1296, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f1217, f1295, f513; +sub.f32 f577, f1295, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f1216, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f1215, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f1214, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f1213, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f1212, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1211, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1210, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1209, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 
f615, f611, f1223; +mul.f32 f616, f610, f1223; +mul.f32 f618, f611, f611; +mul.f32 f1208, f610, f610; +sub.f32 f619, f1208, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f1222; +mul.f32 f624, f619, f1222; +mul.f32 f626, f611, f621; +mul.f32 f1207, f610, f619; +sub.f32 f627, f1207, f626; +mul.f32 f1206, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f1221; +mul.f32 f632, f627, f1221; +mul.f32 f1204, f610, f627; +mul.f32 f1205, f611, f629; +sub.f32 f635, f1204, f1205; +mul.f32 f1203, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f1220; +mul.f32 f640, f635, f1220; +mul.f32 f642, f611, f637; +mul.f32 f1202, f610, f635; +sub.f32 f643, f1202, f642; +mul.f32 f1201, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f1219; +mul.f32 f648, f643, f1219; +mul.f32 f1199, f610, f643; +mul.f32 f1200, f611, f645; +sub.f32 f651, f1199, f1200; +mul.f32 f1198, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f1218; +mul.f32 f656, f651, f1218; +mul.f32 f658, f611, f653; +mul.f32 f1197, f610, f651; +sub.f32 f659, f1197, f658; +mul.f32 f1196, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f1217; +mul.f32 f664, f659, f1217; +mul.f32 f666, f611, f661; +mul.f32 f1195, f610, f659; +sub.f32 f667, f1195, f666; +mul.f32 f1194, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f1216; +mul.f32 f672, f667, f1216; +mul.f32 f1192, f610, f667; +mul.f32 f1193, f611, f669; +sub.f32 f675, f1192, f1193; +mul.f32 f1191, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f1215; +mul.f32 f680, f675, f1215; +mul.f32 f682, f611, f677; +mul.f32 f1190, f610, f675; +sub.f32 f683, f1190, f682; +mul.f32 f1189, f675, f582; +mul.f32 f684, f610, 
f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f1214; +mul.f32 f688, f683, f1214; +mul.f32 f690, f611, f685; +mul.f32 f1188, f610, f683; +sub.f32 f691, f1188, f690; +mul.f32 f1187, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f1213; +mul.f32 f696, f691, f1213; +mul.f32 f1185, f610, f691; +mul.f32 f1186, f611, f693; +sub.f32 f699, f1185, f1186; +mul.f32 f1184, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f1212; +mul.f32 f704, f699, f1212; +mul.f32 f706, f611, f701; +mul.f32 f1183, f610, f699; +sub.f32 f707, f1183, f706; +mul.f32 f1182, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f1211; +mul.f32 f712, f707, f1211; +mul.f32 f1180, f610, f707; +mul.f32 f1181, f611, f709; +sub.f32 f715, f1180, f1181; +mul.f32 f1179, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f1210; +mul.f32 f720, f715, f1210; +mul.f32 f722, f611, f717; +mul.f32 f1178, f610, f715; +sub.f32 f723, f1178, f722; +mul.f32 f1177, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f1209; +mul.f32 f728, f723, f1209; +mul.f32 f730, f611, f725; +mul.f32 f1176, f610, f723; +sub.f32 f731, f1176, f730; +mul.f32 f1175, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1174, f1302, f1249; +mul.f32 f735, f733, f1174; +mul.f32 f736, f731, f1174; +mul.f32 f1172, f610, f731; +mul.f32 f1173, f611, f733; +sub.f32 f739, f1172, f1173; +sub.f32 f1171, f272, f447; +mul.f32 f1170, f731, f1171; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1169, f610, f739; +sub.f32 f747, f1169, f746; +mul.f32 f1168, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; 
+mul.f32 f754, f611, f749; +mul.f32 f1167, f610, f747; +sub.f32 f755, f1167, f754; +mul.f32 f1166, f747, f556; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f1164, f610, f755; +mul.f32 f1165, f611, f757; +sub.f32 f763, f1164, f1165; +mul.f32 f1163, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1162, f610, f763; +sub.f32 f771, f1162, f770; +mul.f32 f1161, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f1159, f610, f771; +mul.f32 f1160, f611, f773; +sub.f32 f779, f1159, f1160; +mul.f32 f1158, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1157, f610, f779; +sub.f32 f787, f1157, f786; +mul.f32 f1156, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1155, f610, f787; +sub.f32 f795, f1155, f794; +mul.f32 f1154, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f1152, f610, f795; +mul.f32 f1153, f611, f797; +sub.f32 f803, f1152, f1153; +mul.f32 f1151, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1150, f610, f803; +sub.f32 f811, f1150, f810; +mul.f32 f1149, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1148, f610, f811; +sub.f32 f819, f1148, f818; +mul.f32 f1147, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, 
f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f1145, f610, f819; +mul.f32 f1146, f611, f821; +sub.f32 f827, f1145, f1146; +mul.f32 f1144, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1143, f610, f827; +sub.f32 f835, f1143, f834; +mul.f32 f1142, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f1140, f610, f835; +mul.f32 f1141, f611, f837; +sub.f32 f843, f1140, f1141; +mul.f32 f1139, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1138, f610, f843; +sub.f32 f851, f1138, f850; +mul.f32 f1137, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f1136, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r17, %tid.x; +shl.b32 r16, r17, 8; +barrier.sync 0; +and.b32 r11, r16, 256; +add.s32 r12, r9, r11; +sub.f32 f1350, f1302, f1249; +mul.f32 f1349, f733, f1350; +add.f32 f857, f1302, f1249; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +sub.f32 f1351, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 1; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f1137, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f1206, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f1203, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f1201, f639; +sub.f32 f867, f1198, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f1196, f655; +fma.rn.f32 f871, f661, f574, f664; 
+sub.f32 f872, f1194, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f1191, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, f1189, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f1187, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f1184, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f1182, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f1179, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f1177, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f1175, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f1351, f736; +sub.f32 f890, f1170, f1349; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f1168, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f1166, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f1163, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f1161, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f1158, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f1156, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f1154, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f1151, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f1149, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f1147, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f1144, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f1142, f831; +fma.rn.f32 f915, f837, 
f600, f840; +sub.f32 f916, f1139, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f1136, f847; +fma.rn.f32 f919, f853, f608, f856; +sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +mad.lo.s32 r13, r22, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+16]; +ld.shared.v2.f32 {f929, f930}, [r13+32]; +ld.shared.v2.f32 {f933, f934}, [r13+48]; +ld.shared.v2.f32 {f937, f938}, [r13+64]; +ld.shared.v2.f32 {f941, f942}, [r13+80]; +ld.shared.v2.f32 {f945, f946}, [r13+96]; +ld.shared.v2.f32 {f949, f950}, [r13+112]; +ld.shared.v2.f32 {f953, f954}, [r13+128]; +ld.shared.v2.f32 {f957, f958}, [r13+144]; +ld.shared.v2.f32 {f961, f962}, [r13+160]; +ld.shared.v2.f32 {f965, f966}, [r13+176]; +ld.shared.v2.f32 {f969, f970}, [r13+192]; +ld.shared.v2.f32 {f973, f974}, [r13+208]; +ld.shared.v2.f32 {f977, f978}, [r13+224]; +ld.shared.v2.f32 {f981, f982}, [r13+240]; +ld.shared.v2.f32 {f985, f986}, [r13+256]; +ld.shared.v2.f32 {f989, f990}, [r13+272]; +ld.shared.v2.f32 {f993, f994}, [r13+288]; +ld.shared.v2.f32 {f997, f998}, [r13+304]; +ld.shared.v2.f32 {f1001, f1002}, [r13+320]; +ld.shared.v2.f32 {f1005, f1006}, [r13+336]; +ld.shared.v2.f32 {f1009, f1010}, [r13+352]; +ld.shared.v2.f32 {f1013, f1014}, [r13+368]; +ld.shared.v2.f32 {f1017, f1018}, [r13+384]; +ld.shared.v2.f32 {f1021, f1022}, [r13+400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+416]; +ld.shared.v2.f32 {f1029, f1030}, [r13+432]; +ld.shared.v2.f32 {f1033, f1034}, [r13+448]; +ld.shared.v2.f32 {f1037, f1038}, [r13+464]; +ld.shared.v2.f32 {f1041, f1042}, [r13+480]; +ld.shared.v2.f32 {f1045, f1046}, [r13+496]; +add.f32 %1, f922, f986; +add.f32 %0, f921, f985; +add.f32 %2, f925, f989; +add.f32 %3, f926, f990; +add.f32 %4, f929, f993; +add.f32 %5, f930, f994; +add.f32 %6, f933, f997; +add.f32 %7, f934, f998; +add.f32 %8, f937, f1001; +add.f32 %9, f938, f1002; +add.f32 %11, f942, f1006; 
+add.f32 %10, f941, f1005; +add.f32 %13, f946, f1010; +add.f32 %12, f945, f1009; +add.f32 %15, f950, f1014; +add.f32 %14, f949, f1013; +add.f32 %16, f953, f1017; +add.f32 %17, f954, f1018; +add.f32 %18, f957, f1021; +add.f32 %19, f958, f1022; +add.f32 %20, f961, f1025; +add.f32 %21, f962, f1026; +add.f32 %23, f966, f1030; +add.f32 %22, f965, f1029; +add.f32 %25, f970, f1034; +add.f32 %24, f969, f1033; +add.f32 %27, f974, f1038; +add.f32 %26, f973, f1037; +add.f32 %28, f977, f1041; +add.f32 %29, f978, f1042; +add.f32 %30, f981, f1045; +add.f32 %31, f982, f1046; +sub.f32 %33, f922, f986; +sub.f32 %32, f921, f985; +sub.f32 %35, f926, f990; +sub.f32 %34, f925, f989; +sub.f32 %37, f930, f994; +sub.f32 %36, f929, f993; +sub.f32 %39, f934, f998; +sub.f32 %38, f933, f997; +sub.f32 %41, f938, f1002; +sub.f32 %40, f937, f1001; +sub.f32 %43, f942, f1006; +sub.f32 %42, f941, f1005; +sub.f32 %45, f946, f1010; +sub.f32 %44, f945, f1009; +sub.f32 %47, f950, f1014; +sub.f32 %46, f949, f1013; +sub.f32 %49, f954, f1018; +sub.f32 %48, f953, f1017; +sub.f32 %51, f958, f1022; +sub.f32 %50, f957, f1021; +sub.f32 %53, f962, f1026; +sub.f32 %52, f961, f1025; +sub.f32 %55, f966, f1030; +sub.f32 %54, f965, f1029; +sub.f32 %57, f970, f1034; +sub.f32 %56, f969, f1033; +sub.f32 %59, f974, f1038; +sub.f32 %58, f973, f1037; +sub.f32 %61, f978, f1042; +sub.f32 %60, f977, f1041; +sub.f32 %63, f982, f1046; +sub.f32 %62, f981, f1045; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), 
"=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), 
"f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<48, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<118>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %10, %12; +sub.f32 f10, %11, %13; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 496; +add.s32 r11, r8, r10; +add.f32 f18, %11, %13; +add.f32 f19, %10, %12; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 248; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+256]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 480; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 240; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+256]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f53, f51; +mul.f32 f58, f54, f52; +mul.f32 f59, f53, f52; +and.b32 r22, 
r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 448; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f54, f51, f59; +sub.f32 f63, f57, f58; +st.shared.v2.f32 [r25+32], {f63, f62}; +barrier.sync 0; +and.b32 r26, r9, 224; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+256]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 24; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f74, f72; +mul.f32 f79, f75, f73; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 384; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f75, f72, f80; +sub.f32 f84, f78, f79; +st.shared.v2.f32 [r32+64], {f84, f83}; +barrier.sync 0; +and.b32 r33, r9, 192; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+256]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 1; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f95, f93; +mul.f32 f100, f96, f94; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 256; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f96, f93, f101; +sub.f32 f105, f99, f100; +st.shared.v2.f32 [r39+128], {f105, f104}; +barrier.sync 0; +and.b32 r40, r9, 128; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+256]; +add.f32 %1, f107, f111; +add.f32 %0, f106, f110; +sub.f32 %3, f107, f111; +sub.f32 %2, f106, f110; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), 
"l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<49, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<98>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %10, %12; +add.f32 f10, %11, %13; +sub.f32 f11, %10, %12; +sub.f32 f12, %11, %13; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 248; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 124; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+128]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+128]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 240; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 120; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+128]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; 
+ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+128]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f47, f45; +mul.f32 f52, f48, f46; +sub.f32 f53, f51, f52; +mul.f32 f54, f47, f46; +fma.rn.f32 f55, f48, f45, f54; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 224; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f53; +barrier.sync 0; +and.b32 r26, r11, 112; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+128]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+128]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 24; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f64, f62; +mul.f32 f69, f65, f63; +sub.f32 f70, f68, f69; +mul.f32 f71, f64, f63; +fma.rn.f32 f72, f65, f62, f71; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 192; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f70; +barrier.sync 0; +and.b32 r33, r11, 96; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+128]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+128]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 1; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f81, f79; +mul.f32 f86, f82, f80; +sub.f32 f87, f85, f86; +mul.f32 f88, f81, f80; +fma.rn.f32 f89, f82, 
f79, f88; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 128; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f87; +barrier.sync 0; +and.b32 r40, r11, 64; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+128]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+128]; +add.f32 %0, f90, f91; +add.f32 %1, f92, f93; +sub.f32 %2, f90, f91; +sub.f32 %3, f92, f93; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<50, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1301>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1299, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1297, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1296, f1299, f1297; +sub.f32 f140, f1299, f1297; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f1295, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1292, %132, %133; +sub.f32 f148, %132, %133; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1290, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1289, f1292, f1290; +sub.f32 f156, f1292, f1290; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f1288, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f1288, 0fBF3504F3; +mul.f32 f1287, 
f157, 0f3F3504F3; +sub.f32 f163, f1287, f162; +mul.f32 f164, f1288, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1286, f1296, f1289; +sub.f32 f173, f1296, f1289; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1285, f1295, f165; +sub.f32 f177, f1295, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f1284, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f1283, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1281, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1278, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1277, f1281, f1278; +sub.f32 f197, f1281, f1278; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f1276, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1274, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1272, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1271, f1274, f1272; +sub.f32 f213, f1274, f1272; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f1270, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f1270, 0fBF3504F3; +mul.f32 f1269, f214, 0f3F3504F3; +sub.f32 f220, f1269, f219; +mul.f32 f221, f1270, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1268, f1277, f1271; +sub.f32 f230, f1277, f1271; +add.f32 f231, f198, f220; +sub.f32 
f233, f198, f220; +add.f32 f1267, f1276, f222; +sub.f32 f234, f1276, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f1266, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f1265, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1263, f231, 0f3F6C835E; +mul.f32 f1264, f1267, 0fBEC3EF15; +sub.f32 f245, f1263, f1264; +mul.f32 f246, f1267, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f1261, f235, 0f3F3504F3; +mul.f32 f1262, f1266, 0fBF3504F3; +sub.f32 f250, f1261, f1262; +mul.f32 f251, f1266, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f1259, f239, 0f3EC3EF15; +mul.f32 f1260, f1265, 0fBF6C835E; +sub.f32 f255, f1259, f1260; +mul.f32 f256, f1265, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f1257, f233, 0fBEC3EF15; +mul.f32 f1258, f234, 0fBF6C835E; +sub.f32 f260, f1257, f1258; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f1255, f241, 0fBF6C835E; +mul.f32 f1256, f242, 0fBEC3EF15; +sub.f32 f269, f1255, f1256; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1254, f1286, f1268; +sub.f32 f275, f1286, f1268; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1253, f1285, f247; +sub.f32 f279, f1285, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1252, f1284, f252; +sub.f32 f283, f1284, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f1251, f1283, f257; +sub.f32 f287, f1283, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f1250, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f1249, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; 
+add.f32 f1248, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1247, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1244, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1242, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1241, f1244, f1242; +sub.f32 f315, f1244, f1242; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f1240, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1238, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1235, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1234, f1238, f1235; +sub.f32 f331, f1238, f1235; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f1233, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f1231, f332, 0f3F3504F3; +mul.f32 f1232, f1233, 0fBF3504F3; +sub.f32 f338, f1231, f1232; +mul.f32 f339, f1233, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1230, f1241, f1234; +sub.f32 f348, f1241, f1234; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1229, f1240, f340; +sub.f32 f352, f1240, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f1228, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f1227, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1225, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1223, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, 
f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1222, f1225, f1223; +sub.f32 f372, f1225, f1223; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f1221, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1218, %148, %149; +sub.f32 f380, %148, %149; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1217, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1216, f1218, f1217; +sub.f32 f388, f1218, f1217; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f1215, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f1213, f389, 0f3F3504F3; +mul.f32 f1214, f1215, 0fBF3504F3; +sub.f32 f395, f1213, f1214; +mul.f32 f396, f1215, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1212, f1222, f1216; +sub.f32 f405, f1222, f1216; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1211, f1221, f397; +sub.f32 f409, f1221, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f1210, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f1209, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1211, 0fBEC3EF15; +mul.f32 f1208, f406, 0f3F6C835E; +sub.f32 f420, f1208, f419; +mul.f32 f421, f1211, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f1210, 0fBF3504F3; +mul.f32 f1207, f410, 0f3F3504F3; +sub.f32 f425, f1207, f424; +mul.f32 f426, f1210, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f1205, f414, 0f3EC3EF15; +mul.f32 f1206, f1209, 0fBF6C835E; +sub.f32 f430, f1205, f1206; +mul.f32 f431, f1209, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f1203, f408, 0fBEC3EF15; +mul.f32 f1204, f409, 0fBF6C835E; +sub.f32 f435, f1203, f1204; 
+mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f1202, f416, 0fBF6C835E; +sub.f32 f444, f1202, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1201, f1230, f1212; +sub.f32 f450, f1230, f1212; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1200, f1229, f422; +sub.f32 f454, f1229, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1199, f1228, f427; +sub.f32 f458, f1228, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f1198, f1227, f432; +sub.f32 f462, f1227, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f1197, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f1196, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f1195, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1194, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1200, 0fBE47C5C2; +mul.f32 f1193, f451, 0f3F7B14BE; +sub.f32 f481, f1193, f480; +mul.f32 f482, f1200, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f1199, 0fBEC3EF15; +mul.f32 f1192, f455, 0f3F6C835E; +sub.f32 f486, f1192, f485; +mul.f32 f487, f1199, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f1198, 0fBF0E39DA; +mul.f32 f1191, f459, 0f3F54DB31; +sub.f32 f491, f1191, f490; +mul.f32 f492, f1198, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f1197, 0fBF3504F3; +mul.f32 f1190, f463, 0f3F3504F3; +sub.f32 f496, f1190, f495; +mul.f32 f497, f1197, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f1188, f467, 0f3F0E39DA; +mul.f32 f1189, f1196, 
0fBF54DB31; +sub.f32 f501, f1188, f1189; +mul.f32 f502, f1196, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f1186, f471, 0f3EC3EF15; +mul.f32 f1187, f1195, 0fBF6C835E; +sub.f32 f506, f1186, f1187; +mul.f32 f507, f1195, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f1184, f475, 0f3E47C5C2; +mul.f32 f1185, f1194, 0fBF7B14BE; +sub.f32 f511, f1184, f1185; +mul.f32 f512, f1194, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f1182, f453, 0fBE47C5C2; +mul.f32 f1183, f454, 0fBF7B14BE; +sub.f32 f516, f1182, f1183; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f1181, f457, 0fBEC3EF15; +sub.f32 f521, f1181, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f1180, f461, 0fBF0E39DA; +sub.f32 f526, f1180, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f1178, f469, 0fBF54DB31; +mul.f32 f1179, f470, 0fBF0E39DA; +sub.f32 f535, f1178, f1179; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f1177, f473, 0fBF6C835E; +sub.f32 f540, f1177, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f1176, f477, 0fBF7B14BE; +sub.f32 f545, f1176, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1175, f1254, f1201; +sub.f32 f551, f1254, f1201; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1174, f1253, f483; +sub.f32 f555, f1253, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1173, f1252, f488; +sub.f32 f559, f1252, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; 
+add.f32 f1172, f1251, f493; +sub.f32 f563, f1251, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1171, f1250, f498; +sub.f32 f567, f1250, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f1170, f1249, f503; +sub.f32 f571, f1249, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f1169, f1248, f508; +sub.f32 f575, f1248, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f1168, f1247, f513; +sub.f32 f579, f1247, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f1167, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f1166, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f1165, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f1164, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f1163, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1162, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1161, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1160, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f1174; +mul.f32 f1159, f612, f552; +sub.f32 f618, f1159, f617; +mul.f32 f619, f612, f1174; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f1157, f612, f612; +mul.f32 f1158, f613, f613; +sub.f32 f623, f1157, f1158; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f1155, f623, f556; +mul.f32 f1156, f625, f1173; +sub.f32 f628, f1155, f1156; +mul.f32 f629, f623, f1173; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 
f1153, f612, f623; +mul.f32 f1154, f613, f625; +sub.f32 f633, f1153, f1154; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f1151, f633, f560; +mul.f32 f1152, f635, f1172; +sub.f32 f638, f1151, f1152; +mul.f32 f639, f633, f1172; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f1150, f612, f633; +sub.f32 f643, f1150, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f1171; +mul.f32 f1149, f643, f564; +sub.f32 f648, f1149, f647; +mul.f32 f649, f643, f1171; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f1148, f612, f643; +sub.f32 f653, f1148, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f1170; +mul.f32 f1147, f653, f568; +sub.f32 f658, f1147, f657; +mul.f32 f659, f653, f1170; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f1146, f612, f653; +sub.f32 f663, f1146, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f1144, f663, f572; +mul.f32 f1145, f665, f1169; +sub.f32 f668, f1144, f1145; +mul.f32 f669, f663, f1169; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f1142, f612, f663; +mul.f32 f1143, f613, f665; +sub.f32 f673, f1142, f1143; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f1140, f673, f576; +mul.f32 f1141, f675, f1168; +sub.f32 f678, f1140, f1141; +mul.f32 f679, f673, f1168; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f1138, f612, f673; +mul.f32 f1139, f613, f675; +sub.f32 f683, f1138, f1139; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f1167; +mul.f32 f1137, f683, f580; +sub.f32 f688, f1137, f687; +mul.f32 f689, f683, f1167; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f1136, f612, f683; +sub.f32 f693, f1136, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f1166; +mul.f32 f1135, f693, f584; +sub.f32 f698, f1135, f697; 
+mul.f32 f699, f693, f1166; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f1134, f612, f693; +sub.f32 f703, f1134, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f1165; +mul.f32 f1133, f703, f588; +sub.f32 f708, f1133, f707; +mul.f32 f709, f703, f1165; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f1131, f612, f703; +mul.f32 f1132, f613, f705; +sub.f32 f713, f1131, f1132; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f1129, f713, f592; +mul.f32 f1130, f715, f1164; +sub.f32 f718, f1129, f1130; +mul.f32 f719, f713, f1164; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f1127, f612, f713; +mul.f32 f1128, f613, f715; +sub.f32 f723, f1127, f1128; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f1125, f723, f596; +mul.f32 f1126, f725, f1163; +sub.f32 f728, f1125, f1126; +mul.f32 f729, f723, f1163; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f1124, f612, f723; +sub.f32 f733, f1124, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f737, f735, f1162; +mul.f32 f1123, f733, f600; +sub.f32 f738, f1123, f737; +mul.f32 f739, f733, f1162; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f1122, f612, f733; +sub.f32 f743, f1122, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f1161; +mul.f32 f1121, f743, f604; +sub.f32 f748, f1121, f747; +mul.f32 f749, f743, f1161; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f1120, f612, f743; +sub.f32 f753, f1120, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f1160; +mul.f32 f1119, f753, f608; +sub.f32 f758, f1119, f757; +mul.f32 f759, f753, f1160; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f1117, f612, f753; +mul.f32 f1118, f613, f755; +sub.f32 f763, f1117, f1118; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f1115, 
f763, f550; +mul.f32 f1116, f765, f551; +sub.f32 f768, f1115, f1116; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f1113, f612, f763; +mul.f32 f1114, f613, f765; +sub.f32 f773, f1113, f1114; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f1112, f773, f554; +sub.f32 f778, f1112, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f1111, f612, f773; +sub.f32 f783, f1111, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f1110, f783, f558; +sub.f32 f788, f1110, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f1109, f612, f783; +sub.f32 f793, f1109, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f1108, f793, f562; +sub.f32 f798, f1108, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f1107, f612, f793; +sub.f32 f803, f1107, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f1105, f803, f566; +mul.f32 f1106, f805, f567; +sub.f32 f808, f1105, f1106; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f1103, f612, f803; +mul.f32 f1104, f613, f805; +sub.f32 f813, f1103, f1104; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f1101, f813, f570; +mul.f32 f1102, f815, f571; +sub.f32 f818, f1101, f1102; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f1099, f612, f813; +mul.f32 f1100, f613, f815; +sub.f32 f823, f1099, f1100; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f1098, f823, f574; +sub.f32 f828, f1098, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f1097, f612, f823; +sub.f32 f833, f1097, f832; +mul.f32 f834, f612, 
f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f1096, f833, f578; +sub.f32 f838, f1096, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f1095, f612, f833; +sub.f32 f843, f1095, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f1094, f843, f582; +sub.f32 f848, f1094, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f1092, f612, f843; +mul.f32 f1093, f613, f845; +sub.f32 f853, f1092, f1093; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f1090, f853, f586; +mul.f32 f1091, f855, f587; +sub.f32 f858, f1090, f1091; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f1088, f612, f853; +mul.f32 f1089, f613, f855; +sub.f32 f863, f1088, f1089; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f1086, f863, f590; +mul.f32 f1087, f865, f591; +sub.f32 f868, f1086, f1087; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 f1085, f612, f863; +sub.f32 f873, f1085, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f1084, f873, f594; +sub.f32 f878, f1084, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f1083, f612, f873; +sub.f32 f883, f1083, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f1082, f883, f598; +sub.f32 f888, f1082, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f1081, f612, f883; +sub.f32 f893, f1081, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f1079, f893, f602; +mul.f32 f1080, f895, f603; +sub.f32 f898, f1079, f1080; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f1077, f612, f893; +mul.f32 f1078, f613, 
f895; +sub.f32 f903, f1077, f1078; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f1075, f903, f606; +mul.f32 f1076, f905, f607; +sub.f32 f908, f1075, f1076; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f1073, f612, f903; +mul.f32 f1074, f613, f905; +sub.f32 f913, f1073, f1074; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f1072, f913, f610; +sub.f32 f918, f1072, f917; +mov.u32 r17, %tid.x; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +and.b32 r14, r17, 1; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 128; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, f898, f908, f918}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+8]; +ld.shared.f32 f923, [r13+16]; +ld.shared.f32 f924, [r13+24]; +ld.shared.f32 f925, [r13+32]; +ld.shared.f32 f926, [r13+40]; +ld.shared.f32 f927, [r13+48]; +ld.shared.f32 f928, [r13+56]; +ld.shared.f32 f929, [r13+64]; +ld.shared.f32 f930, [r13+72]; +ld.shared.f32 f931, [r13+80]; +ld.shared.f32 f932, [r13+88]; +ld.shared.f32 f933, [r13+96]; +ld.shared.f32 f934, [r13+104]; +ld.shared.f32 f935, [r13+112]; +ld.shared.f32 f936, [r13+120]; +ld.shared.f32 f937, [r13+128]; +ld.shared.f32 f938, [r13+136]; +ld.shared.f32 f939, [r13+144]; +ld.shared.f32 f940, [r13+152]; +ld.shared.f32 f941, [r13+160]; +ld.shared.f32 f942, [r13+168]; +ld.shared.f32 f943, [r13+176]; +ld.shared.f32 f944, [r13+184]; +ld.shared.f32 f945, [r13+192]; +ld.shared.f32 f946, 
[r13+200]; +ld.shared.f32 f947, [r13+208]; +ld.shared.f32 f948, [r13+216]; +ld.shared.f32 f949, [r13+224]; +ld.shared.f32 f950, [r13+232]; +ld.shared.f32 f951, [r13+240]; +ld.shared.f32 f952, [r13+248]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1175, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+8]; +ld.shared.f32 f955, [r13+16]; +ld.shared.f32 f956, [r13+24]; +ld.shared.f32 f957, [r13+32]; +ld.shared.f32 f958, [r13+40]; +ld.shared.f32 f959, [r13+48]; +ld.shared.f32 f960, [r13+56]; +ld.shared.f32 f961, [r13+64]; +ld.shared.f32 f962, [r13+72]; +ld.shared.f32 f963, [r13+80]; +ld.shared.f32 f964, [r13+88]; +ld.shared.f32 f965, [r13+96]; +ld.shared.f32 f966, [r13+104]; +ld.shared.f32 f967, [r13+112]; +ld.shared.f32 f968, [r13+120]; +ld.shared.f32 f969, [r13+128]; +ld.shared.f32 f970, [r13+136]; +ld.shared.f32 f971, [r13+144]; +ld.shared.f32 f972, [r13+152]; +ld.shared.f32 f973, [r13+160]; +ld.shared.f32 f974, [r13+168]; +ld.shared.f32 f975, [r13+176]; +ld.shared.f32 f976, [r13+184]; +ld.shared.f32 f977, [r13+192]; +ld.shared.f32 f978, [r13+200]; +ld.shared.f32 f979, [r13+208]; +ld.shared.f32 f980, [r13+216]; +ld.shared.f32 f981, [r13+224]; +ld.shared.f32 f982, [r13+232]; +ld.shared.f32 f983, [r13+240]; +ld.shared.f32 f984, [r13+248]; +add.f32 %0, f921, f937; +add.f32 %1, f953, f969; +add.f32 %3, f954, f970; +add.f32 %2, f922, f938; +add.f32 %5, f955, f971; +add.f32 %4, f923, f939; +add.f32 %7, f956, f972; +add.f32 %6, f924, f940; +add.f32 %9, f957, f973; +add.f32 %8, f925, f941; +add.f32 %10, f926, f942; +add.f32 %11, f958, f974; +add.f32 %12, f927, 
f943; +add.f32 %13, f959, f975; +add.f32 %14, f928, f944; +add.f32 %15, f960, f976; +add.f32 %17, f961, f977; +add.f32 %16, f929, f945; +add.f32 %19, f962, f978; +add.f32 %18, f930, f946; +add.f32 %21, f963, f979; +add.f32 %20, f931, f947; +add.f32 %22, f932, f948; +add.f32 %23, f964, f980; +add.f32 %24, f933, f949; +add.f32 %25, f965, f981; +add.f32 %26, f934, f950; +add.f32 %27, f966, f982; +add.f32 %29, f967, f983; +add.f32 %28, f935, f951; +add.f32 %31, f968, f984; +add.f32 %30, f936, f952; +sub.f32 %32, f921, f937; +sub.f32 %33, f953, f969; +sub.f32 %34, f922, f938; +sub.f32 %35, f954, f970; +sub.f32 %36, f923, f939; +sub.f32 %37, f955, f971; +sub.f32 %38, f924, f940; +sub.f32 %39, f956, f972; +sub.f32 %40, f925, f941; +sub.f32 %41, f957, f973; +sub.f32 %42, f926, f942; +sub.f32 %43, f958, f974; +sub.f32 %44, f927, f943; +sub.f32 %45, f959, f975; +sub.f32 %46, f928, f944; +sub.f32 %47, f960, f976; +sub.f32 %48, f929, f945; +sub.f32 %49, f961, f977; +sub.f32 %50, f930, f946; +sub.f32 %51, f962, f978; +sub.f32 %52, f931, f947; +sub.f32 %53, f963, f979; +sub.f32 %54, f932, f948; +sub.f32 %55, f964, f980; +sub.f32 %56, f933, f949; +sub.f32 %57, f965, f981; +sub.f32 %58, f934, f950; +sub.f32 %59, f966, f982; +sub.f32 %60, f935, f951; +sub.f32 %61, f967, f983; +sub.f32 %62, f936, f952; +sub.f32 %63, f968, f984; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), 
"=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..cb0ef76fc4c60 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp32_inv.hpp.inc @@ -0,0 +1,3952 @@ +#ifndef CUFFTDX_FFT_64_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_64_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<243, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<248>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; +add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, 
f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 448; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, 
f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; +fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+64]; +ld.shared.v2.f32 {f167, f168}, [r13+128]; +ld.shared.v2.f32 {f171, f172}, [r13+192]; +ld.shared.v2.f32 {f175, f176}, [r13+256]; +ld.shared.v2.f32 {f179, f180}, [r13+320]; +ld.shared.v2.f32 {f183, f184}, [r13+384]; +ld.shared.v2.f32 {f187, f188}, [r13+448]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; 
+mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +add.f32 %1, f200, f216; +add.f32 %0, f199, f215; +add.f32 %3, f204, f226; +add.f32 %2, f203, f225; +add.f32 %5, f202, f217; +sub.f32 %4, f201, f218; +add.f32 %7, f206, f231; +add.f32 %6, f205, f229; +sub.f32 %9, f200, f216; +sub.f32 %8, f199, f215; +sub.f32 %11, f204, f226; +sub.f32 %10, f203, f225; +sub.f32 %13, f202, f217; +add.f32 %12, f201, f218; +sub.f32 %15, f206, f231; +sub.f32 %14, f205, f229; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<244, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<232>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %18, %28; +add.f32 f34, %19, %30; +sub.f32 f35, %18, %28; +sub.f32 f36, %19, %30; +add.f32 f37, %23, %34; +add.f32 f38, %25, %35; +sub.f32 f39, %23, %34; +sub.f32 f40, %25, %35; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %20, %31; +add.f32 f50, %22, %33; +sub.f32 f51, %20, %31; +sub.f32 f52, %22, %33; +add.f32 f53, %26, %36; 
+add.f32 f54, %27, %37; +sub.f32 f55, %26, %36; +sub.f32 f56, %27, %37; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 56; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 
f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 224; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+32]; +ld.shared.f32 f161, [r13+64]; +ld.shared.f32 f162, [r13+96]; +ld.shared.f32 f163, [r13+128]; +ld.shared.f32 f164, [r13+160]; +ld.shared.f32 f165, [r13+192]; +ld.shared.f32 f166, [r13+224]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+32]; +ld.shared.f32 f169, [r13+64]; +ld.shared.f32 f170, [r13+96]; +ld.shared.f32 f171, [r13+128]; +ld.shared.f32 f172, [r13+160]; +ld.shared.f32 f173, [r13+192]; +ld.shared.f32 f174, [r13+224]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 
f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 %0, f183, f199; +add.f32 %1, f184, f200; +add.f32 %3, f188, f210; +add.f32 %2, f187, f209; +add.f32 %5, f186, f201; +sub.f32 %4, f185, f202; +add.f32 %7, f190, f215; +add.f32 %6, f189, f213; +sub.f32 %8, f183, f199; +sub.f32 %9, f184, f200; +sub.f32 %11, f188, f210; +sub.f32 %10, f187, f209; +sub.f32 %13, f186, f201; +add.f32 %12, f185, f202; +sub.f32 %15, f190, f215; +sub.f32 %14, f189, f213; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + 
+ + +template<> __forceinline__ __device__ void cufftdx_private_function<245, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<155>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 480; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+128]; +ld.shared.v2.f32 {f70, f71}, [r13+256]; +ld.shared.v2.f32 {f74, f75}, [r13+384]; +add.f32 f78, f62, f70; +add.f32 f79, f63, f71; +sub.f32 f80, 
f62, f70; +sub.f32 f81, f63, f71; +add.f32 f82, f66, f74; +add.f32 f83, f67, f75; +sub.f32 f84, f66, f74; +sub.f32 f85, f67, f75; +sub.f32 f86, f78, f82; +sub.f32 f87, f79, f83; +sub.f32 f88, f80, f85; +add.f32 f89, f81, f84; +add.f32 f90, f80, f85; +sub.f32 f91, f81, f84; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f92, f93}, [rd8]; +mul.f32 f96, f89, f93; +mul.f32 f97, f88, f93; +mul.f32 f98, f92, f89; +mul.f32 f99, f92, f92; +mul.f32 f100, f93, f93; +sub.f32 f101, f99, f100; +mul.f32 f102, f93, f92; +fma.rn.f32 f103, f93, f92, f102; +mul.f32 f104, f87, f103; +mul.f32 f105, f86, f103; +mul.f32 f106, f101, f87; +mul.f32 f107, f92, f101; +mul.f32 f108, f93, f103; +sub.f32 f109, f107, f108; +mul.f32 f110, f92, f103; +fma.rn.f32 f111, f93, f101, f110; +mul.f32 f112, f91, f111; +mul.f32 f113, f90, f111; +mul.f32 f114, f109, f91; +and.b32 r16, r10, 24; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 384; +add.s32 r19, r17, r18; +add.f32 f115, f79, f83; +add.f32 f116, f78, f82; +st.shared.v2.f32 [r19], {f116, f115}; +fma.rn.f32 f117, f92, f88, f96; +sub.f32 f118, f98, f97; +st.shared.v2.f32 [r19+32], {f117, f118}; +fma.rn.f32 f119, f101, f86, f104; +sub.f32 f120, f106, f105; +st.shared.v2.f32 [r19+64], {f119, f120}; +sub.f32 f121, f114, f113; +fma.rn.f32 f122, f109, f90, f112; +st.shared.v2.f32 [r19+96], {f122, f121}; +barrier.sync 0; +mad.lo.s32 r20, r14, -24, r19; +ld.shared.v2.f32 {f123, f124}, [r20]; +ld.shared.v2.f32 {f127, f128}, [r20+128]; +ld.shared.v2.f32 {f131, f132}, [r20+256]; +ld.shared.v2.f32 {f135, f136}, [r20+384]; +add.f32 f139, f123, f131; +add.f32 f140, f124, f132; +sub.f32 f141, f123, f131; +sub.f32 f142, f124, f132; +add.f32 f143, f127, f135; +add.f32 f144, f128, f136; +sub.f32 f145, f127, f135; +sub.f32 f146, f128, f136; +add.f32 %1, f140, f144; +add.f32 %0, f139, f143; +add.f32 %3, f142, f145; +sub.f32 %2, f141, f146; +sub.f32 %5, f140, f144; 
+sub.f32 %4, f139, f143; +sub.f32 %7, f142, f145; +add.f32 %6, f141, f146; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<246, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<139>; +.reg .b32 r<22>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %11, %16; +add.f32 f18, %12, %18; +sub.f32 f19, %11, %16; +sub.f32 f20, %12, %18; +add.f32 f21, %13, %19; +add.f32 f22, %15, %20; +sub.f32 f23, %13, %19; +sub.f32 f24, %15, %20; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 120; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -256; +add.s32 r10, r4, 
r9; +barrier.sync 0; +and.b32 r11, r8, 240; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+64]; +ld.shared.f32 f64, [r13+128]; +ld.shared.f32 f65, [r13+192]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+64]; +ld.shared.f32 f68, [r13+128]; +ld.shared.f32 f69, [r13+192]; +add.f32 f70, f62, f64; +add.f32 f71, f66, f68; +sub.f32 f72, f62, f64; +sub.f32 f73, f66, f68; +add.f32 f74, f63, f65; +add.f32 f75, f67, f69; +sub.f32 f76, f63, f65; +sub.f32 f77, f67, f69; +add.f32 f78, f70, f74; +add.f32 f79, f71, f75; +sub.f32 f80, f70, f74; +sub.f32 f81, f71, f75; +sub.f32 f82, f72, f77; +add.f32 f83, f73, f76; +add.f32 f84, f72, f77; +sub.f32 f85, f73, f76; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f86, f87}, [rd8]; +mul.f32 f90, f83, f87; +fma.rn.f32 f91, f86, f82, f90; +mul.f32 f92, f82, f87; +mul.f32 f93, f86, f83; +sub.f32 f94, f93, f92; +mul.f32 f95, f86, f86; +mul.f32 f96, f87, f87; +sub.f32 f97, f95, f96; +mul.f32 f98, f87, f86; +fma.rn.f32 f99, f87, f86, f98; +mul.f32 f100, f81, f99; +fma.rn.f32 f101, f97, f80, f100; +mul.f32 f102, f80, f99; +mul.f32 f103, f97, f81; +sub.f32 f104, f103, f102; +mul.f32 f105, f86, f97; +mul.f32 f106, f87, f99; +sub.f32 f107, f105, f106; +mul.f32 f108, f86, f99; +fma.rn.f32 f109, f87, f97, f108; +mul.f32 f110, f85, f109; +fma.rn.f32 f111, f107, f84, f110; +mul.f32 f112, f84, f109; +mul.f32 f113, f107, f85; +sub.f32 f114, f113, f112; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 12; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 192; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f78; +st.shared.f32 [r20+16], f91; +st.shared.f32 [r20+32], f101; +st.shared.f32 [r20+48], f111; +barrier.sync 0; +mad.lo.s32 r21, r14, -12, r20; 
+ld.shared.f32 f115, [r21]; +ld.shared.f32 f116, [r21+64]; +ld.shared.f32 f117, [r21+128]; +ld.shared.f32 f118, [r21+192]; +barrier.sync 0; +st.shared.f32 [r20], f79; +st.shared.f32 [r20+16], f94; +st.shared.f32 [r20+32], f104; +st.shared.f32 [r20+48], f114; +barrier.sync 0; +ld.shared.f32 f119, [r21]; +ld.shared.f32 f120, [r21+64]; +ld.shared.f32 f121, [r21+128]; +ld.shared.f32 f122, [r21+192]; +add.f32 f123, f115, f117; +add.f32 f124, f119, f121; +sub.f32 f125, f115, f117; +sub.f32 f126, f119, f121; +add.f32 f127, f116, f118; +add.f32 f128, f120, f122; +sub.f32 f129, f116, f118; +sub.f32 f130, f120, f122; +add.f32 %0, f123, f127; +add.f32 %1, f124, f128; +add.f32 %3, f126, f129; +sub.f32 %2, f125, f130; +sub.f32 %4, f123, f127; +sub.f32 %5, f124, f128; +sub.f32 %7, f126, f129; +add.f32 %6, f125, f130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_64), "l"(lut_sp_4_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<247, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<630>; +.reg .b32 r<22>; +.reg .b64 rd<8>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %34, %50; +sub.f32 f67, %34, %50; +add.f32 f624, %35, %66; +sub.f32 f68, %35, %66; +add.f32 f69, %42, %58; +sub.f32 f71, %42, %58; +add.f32 f622, %67, %59; +sub.f32 f72, %67, %59; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f621, f624, f622; +sub.f32 f76, f624, f622; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f620, f68, f71; +sub.f32 f80, f68, f71; +add.f32 f81, %38, %54; +sub.f32 f83, %38, %54; +add.f32 f617, %69, %68; +sub.f32 f84, %69, %68; +add.f32 f85, %46, %62; +sub.f32 f87, 
%46, %62; +add.f32 f615, %47, %70; +sub.f32 f88, %47, %70; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f614, f617, f615; +sub.f32 f92, f617, f615; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f613, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f613, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f611, f95, 0fBF3504F3; +mul.f32 f612, f96, 0f3F3504F3; +sub.f32 f103, f611, f612; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f610, f621, f614; +sub.f32 f109, f621, f614; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f609, f620, f100; +sub.f32 f113, f620, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f608, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f607, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %52; +sub.f32 f124, %36, %52; +add.f32 f605, %71, %53; +sub.f32 f125, %71, %53; +add.f32 f126, %44, %60; +sub.f32 f128, %44, %60; +add.f32 f602, %72, %73; +sub.f32 f129, %72, %73; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f601, f605, f602; +sub.f32 f133, f605, f602; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f600, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %40, %56; +sub.f32 f140, %40, %56; +add.f32 f598, %41, %74; +sub.f32 f141, %41, %74; +add.f32 f142, %48, %64; +sub.f32 f144, %48, %64; +add.f32 f596, %75, %65; +sub.f32 f145, %75, %65; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f595, f598, f596; +sub.f32 f149, f598, f596; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f594, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f594, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f592, f152, 0fBF3504F3; +mul.f32 f593, f153, 0f3F3504F3; +sub.f32 f160, f592, f593; +mul.f32 
f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f591, f601, f595; +sub.f32 f166, f601, f595; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f590, f600, f157; +sub.f32 f170, f600, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f589, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f588, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f586, f167, 0f3F6C835E; +mul.f32 f587, f590, 0f3EC3EF15; +sub.f32 f181, f586, f587; +mul.f32 f182, f590, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f589, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f588, 0f3F6C835E; +mul.f32 f585, f175, 0f3EC3EF15; +sub.f32 f190, f585, f189; +mul.f32 f191, f588, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f584, f169, 0fBEC3EF15; +sub.f32 f195, f584, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f582, f173, 0fBF3504F3; +mul.f32 f583, f174, 0f3F3504F3; +sub.f32 f200, f582, f583; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f580, f177, 0fBF6C835E; +mul.f32 f581, f178, 0f3EC3EF15; +sub.f32 f205, f580, f581; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f579, f609, f183; +sub.f32 f213, f609, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f578, f608, f187; +sub.f32 f217, f608, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f577, f607, f192; +sub.f32 f221, f607, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f576, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f575, f113, f197; +sub.f32 f229, f113, f197; 
+add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f574, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f573, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 7; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f579, f239; +mul.f32 f244, f238, f579; +mul.f32 f246, f239, f239; +mul.f32 f572, f238, f238; +sub.f32 f247, f572, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f578, f249; +mul.f32 f252, f247, f578; +mul.f32 f570, f238, f247; +mul.f32 f571, f239, f249; +sub.f32 f255, f570, f571; +mul.f32 f569, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f577, f257; +mul.f32 f260, f255, f577; +mul.f32 f262, f239, f257; +mul.f32 f568, f238, f255; +sub.f32 f263, f568, f262; +mul.f32 f567, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f576, f265; +mul.f32 f268, f263, f576; +mul.f32 f270, f239, f265; +mul.f32 f566, f238, f263; +sub.f32 f271, f566, f270; +mul.f32 f565, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f575, f273; +mul.f32 f276, f271, f575; +mul.f32 f563, f238, f271; +mul.f32 f564, f239, f273; +sub.f32 f279, f563, f564; +mul.f32 f562, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f574, f281; +mul.f32 f284, f279, f574; +mul.f32 f286, f239, f281; +mul.f32 f561, f238, f279; +sub.f32 f287, f561, f286; +mul.f32 f560, f230, f281; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f573, f289; +mul.f32 f292, f287, f573; +mul.f32 f294, f239, f289; +mul.f32 f559, f238, f287; +sub.f32 f295, f559, f294; +mul.f32 f558, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 
f297, f239, f287, f296; +sub.f32 f557, f610, f591; +mul.f32 f298, f557, f297; +mul.f32 f300, f295, f557; +mul.f32 f555, f238, f295; +mul.f32 f556, f239, f297; +sub.f32 f303, f555, f556; +sub.f32 f554, f106, f163; +mul.f32 f553, f554, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f552, f238, f303; +sub.f32 f311, f552, f310; +mul.f32 f551, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f549, f238, f311; +mul.f32 f550, f239, f313; +sub.f32 f319, f549, f550; +mul.f32 f548, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f547, f238, f319; +sub.f32 f327, f547, f326; +mul.f32 f546, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f545, f238, f327; +sub.f32 f335, f545, f334; +mul.f32 f544, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f542, f238, f335; +mul.f32 f543, f239, f337; +sub.f32 f343, f542, f543; +mul.f32 f541, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f540, f238, f343; +sub.f32 f351, f540, f350; +mul.f32 f539, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f538, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 384; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 3; +sub.f32 f629, f610, f591; +mul.f32 f628, f295, f629; +add.f32 f357, f610, f591; +sub.f32 f627, f106, f163; +add.f32 f358, f106, 
f163; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 3; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 3; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f538; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f569; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f567; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f565; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f562; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f560; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f558; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f627, f298; +sub.f32 f374, f628, f553; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f551; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f548; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f546; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f544; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f541; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f539; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +mad.lo.s32 r13, r20, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+32]; +ld.shared.v2.f32 {f397, f398}, [r13+64]; +ld.shared.v2.f32 {f401, f402}, [r13+96]; +ld.shared.v2.f32 {f405, f406}, [r13+128]; +ld.shared.v2.f32 {f409, f410}, [r13+160]; +ld.shared.v2.f32 {f413, f414}, [r13+192]; +ld.shared.v2.f32 {f417, f418}, [r13+224]; +ld.shared.v2.f32 {f421, f422}, [r13+256]; +ld.shared.v2.f32 {f425, f426}, [r13+288]; +ld.shared.v2.f32 {f429, f430}, [r13+320]; 
+ld.shared.v2.f32 {f433, f434}, [r13+352]; +ld.shared.v2.f32 {f437, f438}, [r13+384]; +ld.shared.v2.f32 {f441, f442}, [r13+416]; +ld.shared.v2.f32 {f445, f446}, [r13+448]; +ld.shared.v2.f32 {f449, f450}, [r13+480]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f537, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f536, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f393, f425; +sub.f32 f463, f393, f425; +add.f32 f535, f394, f426; +sub.f32 f464, f394, f426; +add.f32 f465, f409, f441; +sub.f32 f467, f409, f441; +add.f32 f534, f410, f442; +sub.f32 f468, f410, f442; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f533, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f532, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f401, f433; +sub.f32 f479, f401, f433; +add.f32 f531, f402, f434; +sub.f32 f480, f402, f434; +add.f32 f481, f417, f449; +sub.f32 f483, f417, f449; +add.f32 f530, f418, f450; +sub.f32 f484, f418, f450; +add.f32 %1, f537, f536; +add.f32 %0, f453, f457; +add.f32 %3, f535, f534; +add.f32 %2, f461, f465; +add.f32 %4, f469, f473; +add.f32 %5, f533, f532; +add.f32 %6, f477, f481; +add.f32 %7, f531, f530; +sub.f32 %8, f455, f460; +add.f32 %9, f456, f459; +add.f32 %11, f464, f467; +sub.f32 %10, f463, f468; +add.f32 %13, f472, f475; +sub.f32 %12, f471, f476; +add.f32 %15, f480, f483; +sub.f32 %14, f479, f484; +sub.f32 %17, f537, f536; +sub.f32 %16, f453, f457; +sub.f32 %19, f535, f534; +sub.f32 %18, f461, f465; +sub.f32 %21, f533, f532; +sub.f32 %20, f469, f473; +sub.f32 %23, f531, f530; +sub.f32 %22, f477, f481; +sub.f32 %25, f456, f459; +add.f32 %24, f455, f460; +sub.f32 %27, f464, f467; +add.f32 %26, f463, f468; +sub.f32 %29, f472, f475; +add.f32 %28, f471, f476; +sub.f32 %31, f480, f483; +add.f32 %30, f479, f484; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), 
"=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +/* cufftdx_private_function<248, float, 1>: explicit specialization whose entire body is one inline-PTX `asm volatile` block (appears machine-generated; NOTE(review): presumably vendored cuFFTDx output - confirm before hand-editing). Operands are positional: %0-%31 write rmem[0..15].x/.y, %32 is smem (32-bit "r"), %33 is lut_sp_16_64 (64-bit "l"), %34-%75 read rmem[0..15].x/.y. Flow: (1) add/sub butterflies over the 16 inputs in registers, using the constants 0f3F3504F3 (~0.70710678), 0f3F6C835E (~0.92387953) and 0f3EC3EF15 (~0.38268343); (2) one ld.global.v2.f32 pair w = (f240, f241) is read from %33 + ((tid.x << 3) & 24), successive complex powers of w are built by repeated multiplication, and 15 of the 16 intermediate values (a, b) are combined with a power (c, d) as (a*c + b*d, b*c - a*d); (3) the results are exchanged through shared memory at %32 + (tid.y << 8) + ((tid.x << 6) & -256) + ((tid.x << 6) & 192) in two store/load rounds (first components f208, f245, ..., then second components f209, f248, ...), each thread storing 64 bytes and reading back 16 floats at a 16-byte stride starting 60 * (tid.x & 3) bytes below its store address; (4) a last add/sub stage writes %0-%31. The block executes `barrier.sync 0` four times, so - assuming the usual PTX meaning of a CTA-wide barrier - it must not be called under divergent control flow; NOTE(review): confirm against the call sites. */ template<> __forceinline__ __device__ void cufftdx_private_function<248, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<485>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %34, %55; +add.f32 f66, %35, %57; +sub.f32 f67, %34, %55; +sub.f32 f68, %35, %57; +add.f32 f69, %44, %66; +add.f32 f70, %46, %67; +sub.f32 f71, %44, %66; +sub.f32 f72, %46, %67; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %39, %60; +add.f32 f82, %41, %62; +sub.f32 f83, %39, %60; +sub.f32 f84, %41, %62; 
+add.f32 f85, %50, %71; +add.f32 f86, %51, %73; +sub.f32 f87, %50, %71; +sub.f32 f88, %51, %73; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; +sub.f32 f121, f80, f105; +add.f32 f122, %36, %58; +add.f32 f123, %38, %59; +sub.f32 f124, %36, %58; +sub.f32 f125, %38, %59; +add.f32 f126, %47, %68; +add.f32 f127, %49, %70; +sub.f32 f128, %47, %68; +sub.f32 f129, %49, %70; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %63; +add.f32 f139, %43, %65; +sub.f32 f140, %42, %63; +sub.f32 f141, %43, %65; +add.f32 f142, %52, %74; +add.f32 f143, %54, %75; +sub.f32 f144, %52, %74; +sub.f32 f145, %54, %75; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 
f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; 
+sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 
f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, 
f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 192; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+16]; +ld.shared.f32 f391, [r13+32]; +ld.shared.f32 f392, [r13+48]; +ld.shared.f32 f393, [r13+64]; +ld.shared.f32 f394, [r13+80]; +ld.shared.f32 f395, [r13+96]; +ld.shared.f32 f396, [r13+112]; +ld.shared.f32 f397, [r13+128]; +ld.shared.f32 f398, [r13+144]; +ld.shared.f32 f399, [r13+160]; +ld.shared.f32 f400, [r13+176]; +ld.shared.f32 f401, [r13+192]; +ld.shared.f32 f402, [r13+208]; +ld.shared.f32 f403, [r13+224]; +ld.shared.f32 f404, [r13+240]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+16]; +ld.shared.f32 f407, [r13+32]; +ld.shared.f32 f408, [r13+48]; +ld.shared.f32 f409, [r13+64]; 
+ld.shared.f32 f410, [r13+80]; +ld.shared.f32 f411, [r13+96]; +ld.shared.f32 f412, [r13+112]; +ld.shared.f32 f413, [r13+128]; +ld.shared.f32 f414, [r13+144]; +ld.shared.f32 f415, [r13+160]; +ld.shared.f32 f416, [r13+176]; +ld.shared.f32 f417, [r13+192]; +ld.shared.f32 f418, [r13+208]; +ld.shared.f32 f419, [r13+224]; +ld.shared.f32 f420, [r13+240]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f390, f398; +add.f32 f430, f406, f414; +sub.f32 f431, f390, f398; +sub.f32 f432, f406, f414; +add.f32 f433, f394, f402; +add.f32 f434, f410, f418; +sub.f32 f435, f394, f402; +sub.f32 f436, f410, f418; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f392, f400; +add.f32 f446, f408, f416; +sub.f32 f447, f392, f400; +sub.f32 f448, f408, f416; +add.f32 f449, f396, f404; +add.f32 f450, f412, f420; +sub.f32 f451, f396, f404; +sub.f32 f452, f412, f420; +add.f32 %0, f421, f425; +add.f32 %1, f422, f426; +add.f32 %2, f429, f433; +add.f32 %3, f430, f434; +add.f32 %4, f437, f441; +add.f32 %5, f438, f442; +add.f32 %6, f445, f449; +add.f32 %7, f446, f450; +add.f32 %9, f424, f427; +sub.f32 %8, f423, f428; +add.f32 %11, f432, f435; +sub.f32 %10, f431, f436; +add.f32 %13, f440, f443; +sub.f32 %12, f439, f444; +add.f32 %15, f448, f451; +sub.f32 %14, f447, f452; +sub.f32 %16, f421, f425; +sub.f32 %17, f422, f426; +sub.f32 %18, f429, f433; +sub.f32 %19, f430, f434; +sub.f32 %20, f437, f441; +sub.f32 %21, f438, f442; +sub.f32 %22, f445, f449; +sub.f32 %23, f446, f450; +sub.f32 %25, f424, f427; +add.f32 %24, f423, f428; +sub.f32 %27, f432, f435; +add.f32 %26, f431, f436; +sub.f32 %29, f440, f443; +add.f32 %28, f439, f444; +sub.f32 %31, 
f448, f451; +add.f32 %30, f447, f452; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): /* inputs are numbered positionally from %32; the repeated "f"(rmem[i].y) entries occupy operand numbers that the PTX above refers to (e.g. %38, %41), so they must be kept */ "r"(smem), "l"(lut_sp_16_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<249, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1348>; +.reg .b32 r<24>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1343, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1341, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1340, f1343, f1341; +sub.f32 f140, f1343, f1341; +sub.f32 f141, f131, f136; +add.f32 f143, f131, 
f136; +add.f32 f1339, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1336, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1334, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1333, f1336, f1334; +sub.f32 f156, f1336, f1334; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1332, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1332, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1330, f159, 0fBF3504F3; +mul.f32 f1331, f160, 0f3F3504F3; +sub.f32 f167, f1330, f1331; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1329, f1340, f1333; +sub.f32 f173, f1340, f1333; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1328, f1339, f164; +sub.f32 f177, f1339, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1327, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1326, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1324, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1321, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1320, f1324, f1321; +sub.f32 f197, f1324, f1321; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1319, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1317, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1315, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1314, f1317, f1315; +sub.f32 f213, f1317, f1315; +sub.f32 
f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1313, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1313, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1311, f216, 0fBF3504F3; +mul.f32 f1312, f217, 0f3F3504F3; +sub.f32 f224, f1311, f1312; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1310, f1320, f1314; +sub.f32 f230, f1320, f1314; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1309, f1319, f221; +sub.f32 f234, f1319, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1308, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1307, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1305, f231, 0f3F6C835E; +mul.f32 f1306, f1309, 0f3EC3EF15; +sub.f32 f245, f1305, f1306; +mul.f32 f246, f1309, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1308, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1307, 0f3F6C835E; +mul.f32 f1304, f239, 0f3EC3EF15; +sub.f32 f254, f1304, f253; +mul.f32 f255, f1307, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1303, f233, 0fBEC3EF15; +sub.f32 f259, f1303, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1301, f237, 0fBF3504F3; +mul.f32 f1302, f238, 0f3F3504F3; +sub.f32 f264, f1301, f1302; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1299, f241, 0fBF6C835E; +mul.f32 f1300, f242, 0f3EC3EF15; +sub.f32 f269, f1299, f1300; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1298, f1329, f1310; +sub.f32 f275, f1329, f1310; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1297, 
f1328, f247; +sub.f32 f279, f1328, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1296, f1327, f251; +sub.f32 f283, f1327, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1295, f1326, f256; +sub.f32 f287, f1326, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1294, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1293, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1292, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1291, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1288, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1286, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1285, f1288, f1286; +sub.f32 f315, f1288, f1286; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1284, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1282, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1279, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1278, f1282, f1279; +sub.f32 f331, f1282, f1279; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1277, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1277, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1276, f334, 0fBF3504F3; +sub.f32 f342, f1276, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1275, f1285, f1278; +sub.f32 f348, f1285, f1278; +add.f32 f349, f316, f338; 
+sub.f32 f351, f316, f338; +add.f32 f1274, f1284, f339; +sub.f32 f352, f1284, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1273, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1272, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1270, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1268, %147, %121; +sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1267, f1270, f1268; +sub.f32 f372, f1270, f1268; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1266, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1263, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1262, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1261, f1263, f1262; +sub.f32 f388, f1263, f1262; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1260, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1260, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1258, f391, 0fBF3504F3; +mul.f32 f1259, f392, 0f3F3504F3; +sub.f32 f399, f1258, f1259; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1257, f1267, f1261; +sub.f32 f405, f1267, f1261; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1256, f1266, f396; +sub.f32 f409, f1266, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1255, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1254, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1256, 0f3EC3EF15; +mul.f32 f1253, f406, 0f3F6C835E; +sub.f32 f420, f1253, 
f419; +mul.f32 f421, f1256, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1255, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1254, 0f3F6C835E; +mul.f32 f1252, f414, 0f3EC3EF15; +sub.f32 f429, f1252, f428; +mul.f32 f430, f1254, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1251, f408, 0fBEC3EF15; +sub.f32 f434, f1251, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1250, f412, 0fBF3504F3; +sub.f32 f439, f1250, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1249, f416, 0fBF6C835E; +sub.f32 f444, f1249, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1248, f1275, f1257; +sub.f32 f450, f1275, f1257; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1247, f1274, f422; +sub.f32 f454, f1274, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1246, f1273, f426; +sub.f32 f458, f1273, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1245, f1272, f431; +sub.f32 f462, f1272, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1244, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1243, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1242, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1241, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1247, 0f3E47C5C2; +mul.f32 f1240, f451, 0f3F7B14BE; +sub.f32 f481, f1240, f480; +mul.f32 f482, f1247, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1246, 0f3EC3EF15; +mul.f32 f1239, f455, 
0f3F6C835E; +sub.f32 f486, f1239, f485; +mul.f32 f487, f1246, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1245, 0f3F0E39DA; +mul.f32 f1238, f459, 0f3F54DB31; +sub.f32 f491, f1238, f490; +mul.f32 f492, f1245, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1244, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1243, 0f3F54DB31; +mul.f32 f1237, f467, 0f3F0E39DA; +sub.f32 f500, f1237, f499; +mul.f32 f501, f1243, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1242, 0f3F6C835E; +mul.f32 f1236, f471, 0f3EC3EF15; +sub.f32 f505, f1236, f504; +mul.f32 f506, f1242, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1241, 0f3F7B14BE; +mul.f32 f1235, f475, 0f3E47C5C2; +sub.f32 f510, f1235, f509; +mul.f32 f511, f1241, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1234, f453, 0fBE47C5C2; +sub.f32 f515, f1234, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1232, f457, 0fBEC3EF15; +mul.f32 f1233, f458, 0f3F6C835E; +sub.f32 f520, f1232, f1233; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1230, f461, 0fBF0E39DA; +mul.f32 f1231, f462, 0f3F54DB31; +sub.f32 f525, f1230, f1231; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1228, f465, 0fBF3504F3; +mul.f32 f1229, f466, 0f3F3504F3; +sub.f32 f530, f1228, f1229; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1226, f469, 0fBF54DB31; +mul.f32 f1227, f470, 0f3F0E39DA; +sub.f32 f535, f1226, f1227; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1225, f473, 0fBF6C835E; +sub.f32 f540, f1225, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 
f1224, f477, 0fBF7B14BE; +sub.f32 f545, f1224, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f1223, f1297, f483; +sub.f32 f553, f1297, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f1222, f1296, f488; +sub.f32 f557, f1296, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f1221, f1295, f493; +sub.f32 f561, f1295, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f1220, f1294, f497; +sub.f32 f565, f1294, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f1219, f1293, f502; +sub.f32 f569, f1293, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f1218, f1292, f507; +sub.f32 f573, f1292, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f1217, f1291, f512; +sub.f32 f577, f1291, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f1216, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f1215, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f1214, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f1213, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f1212, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f1211, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f1210, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f1209, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -512; +add.s32 r9, r4, r8; +and.b32 r14, r15, 1; +shl.b32 r10, r15, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; 
+ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f1223, f611; +mul.f32 f616, f610, f1223; +mul.f32 f618, f611, f611; +mul.f32 f1208, f610, f610; +sub.f32 f619, f1208, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f1222, f621; +mul.f32 f624, f619, f1222; +mul.f32 f626, f611, f621; +mul.f32 f1207, f610, f619; +sub.f32 f627, f1207, f626; +mul.f32 f1206, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f1221, f629; +mul.f32 f632, f627, f1221; +mul.f32 f1204, f610, f627; +mul.f32 f1205, f611, f629; +sub.f32 f635, f1204, f1205; +mul.f32 f1203, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f1220, f637; +mul.f32 f640, f635, f1220; +mul.f32 f642, f611, f637; +mul.f32 f1202, f610, f635; +sub.f32 f643, f1202, f642; +mul.f32 f1201, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f1219, f645; +mul.f32 f648, f643, f1219; +mul.f32 f1199, f610, f643; +mul.f32 f1200, f611, f645; +sub.f32 f651, f1199, f1200; +mul.f32 f1198, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f1218, f653; +mul.f32 f656, f651, f1218; +mul.f32 f658, f611, f653; +mul.f32 f1197, f610, f651; +sub.f32 f659, f1197, f658; +mul.f32 f1196, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f1217, f661; +mul.f32 f664, f659, f1217; +mul.f32 f666, f611, f661; +mul.f32 f1195, f610, f659; +sub.f32 f667, f1195, f666; +mul.f32 f1194, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f1216, f669; +mul.f32 f672, f667, f1216; +mul.f32 f1192, f610, f667; +mul.f32 f1193, f611, f669; +sub.f32 f675, f1192, f1193; +mul.f32 f1191, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f1215, f677; +mul.f32 f680, f675, f1215; +mul.f32 f682, f611, f677; +mul.f32 f1190, f610, f675; +sub.f32 f683, f1190, f682; 
+mul.f32 f1189, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f1214, f685; +mul.f32 f688, f683, f1214; +mul.f32 f690, f611, f685; +mul.f32 f1188, f610, f683; +sub.f32 f691, f1188, f690; +mul.f32 f1187, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f1213, f693; +mul.f32 f696, f691, f1213; +mul.f32 f1185, f610, f691; +mul.f32 f1186, f611, f693; +sub.f32 f699, f1185, f1186; +mul.f32 f1184, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f1212, f701; +mul.f32 f704, f699, f1212; +mul.f32 f706, f611, f701; +mul.f32 f1183, f610, f699; +sub.f32 f707, f1183, f706; +mul.f32 f1182, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f1211, f709; +mul.f32 f712, f707, f1211; +mul.f32 f1180, f610, f707; +mul.f32 f1181, f611, f709; +sub.f32 f715, f1180, f1181; +mul.f32 f1179, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f1210, f717; +mul.f32 f720, f715, f1210; +mul.f32 f722, f611, f717; +mul.f32 f1178, f610, f715; +sub.f32 f723, f1178, f722; +mul.f32 f1177, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f1209, f725; +mul.f32 f728, f723, f1209; +mul.f32 f730, f611, f725; +mul.f32 f1176, f610, f723; +sub.f32 f731, f1176, f730; +mul.f32 f1175, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f1174, f1298, f1248; +mul.f32 f734, f1174, f733; +mul.f32 f736, f731, f1174; +mul.f32 f1172, f610, f731; +mul.f32 f1173, f611, f733; +sub.f32 f739, f1172, f1173; +sub.f32 f1171, f272, f447; +mul.f32 f1170, f1171, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f1169, f610, f739; +sub.f32 f747, f1169, f746; +mul.f32 f1168, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; 
+mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f1167, f610, f747; +sub.f32 f755, f1167, f754; +mul.f32 f1166, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f1164, f610, f755; +mul.f32 f1165, f611, f757; +sub.f32 f763, f1164, f1165; +mul.f32 f1163, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f1162, f610, f763; +sub.f32 f771, f1162, f770; +mul.f32 f1161, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f1159, f610, f771; +mul.f32 f1160, f611, f773; +sub.f32 f779, f1159, f1160; +mul.f32 f1158, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f1157, f610, f779; +sub.f32 f787, f1157, f786; +mul.f32 f1156, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f1155, f610, f787; +sub.f32 f795, f1155, f794; +mul.f32 f1154, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f1152, f610, f795; +mul.f32 f1153, f611, f797; +sub.f32 f803, f1152, f1153; +mul.f32 f1151, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f1150, f610, f803; +sub.f32 f811, f1150, f810; +mul.f32 f1149, f584, f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f1148, f610, f811; +sub.f32 f819, f1148, f818; +mul.f32 f1147, f588, f813; 
+mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f1145, f610, f819; +mul.f32 f1146, f611, f821; +sub.f32 f827, f1145, f1146; +mul.f32 f1144, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f1143, f610, f827; +sub.f32 f835, f1143, f834; +mul.f32 f1142, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f1140, f610, f835; +mul.f32 f1141, f611, f837; +sub.f32 f843, f1140, f1141; +mul.f32 f1139, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f1138, f610, f843; +sub.f32 f851, f1138, f850; +mul.f32 f1137, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f1136, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 256; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 1; +sub.f32 f1346, f1298, f1248; +mul.f32 f1345, f731, f1346; +add.f32 f857, f1298, f1248; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 1; +sub.f32 f1347, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 1; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 1; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f1136; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f1206; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f1203; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f1201; +sub.f32 f867, f648, f1198; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, 
f656, f1196; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f1194; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f1191; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f1189; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f1187; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f1184; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f1182; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f1179; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f1177; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f1175; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f1347, f734; +sub.f32 f890, f1345, f1170; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f1168; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f1166; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f1163; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f1161; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f1158; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f1156; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f1154; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f1151; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f1149; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f1147; +fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f1144; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; 
+sub.f32 f914, f832, f1142; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f1139; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f1137; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +mad.lo.s32 r13, r22, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+16]; +ld.shared.v2.f32 {f929, f930}, [r13+32]; +ld.shared.v2.f32 {f933, f934}, [r13+48]; +ld.shared.v2.f32 {f937, f938}, [r13+64]; +ld.shared.v2.f32 {f941, f942}, [r13+80]; +ld.shared.v2.f32 {f945, f946}, [r13+96]; +ld.shared.v2.f32 {f949, f950}, [r13+112]; +ld.shared.v2.f32 {f953, f954}, [r13+128]; +ld.shared.v2.f32 {f957, f958}, [r13+144]; +ld.shared.v2.f32 {f961, f962}, [r13+160]; +ld.shared.v2.f32 {f965, f966}, [r13+176]; +ld.shared.v2.f32 {f969, f970}, [r13+192]; +ld.shared.v2.f32 {f973, f974}, [r13+208]; +ld.shared.v2.f32 {f977, f978}, [r13+224]; +ld.shared.v2.f32 {f981, f982}, [r13+240]; +ld.shared.v2.f32 {f985, f986}, [r13+256]; +ld.shared.v2.f32 {f989, f990}, [r13+272]; +ld.shared.v2.f32 {f993, f994}, [r13+288]; +ld.shared.v2.f32 {f997, f998}, [r13+304]; +ld.shared.v2.f32 {f1001, f1002}, [r13+320]; +ld.shared.v2.f32 {f1005, f1006}, [r13+336]; +ld.shared.v2.f32 {f1009, f1010}, [r13+352]; +ld.shared.v2.f32 {f1013, f1014}, [r13+368]; +ld.shared.v2.f32 {f1017, f1018}, [r13+384]; +ld.shared.v2.f32 {f1021, f1022}, [r13+400]; +ld.shared.v2.f32 {f1025, f1026}, [r13+416]; +ld.shared.v2.f32 {f1029, f1030}, [r13+432]; +ld.shared.v2.f32 {f1033, f1034}, [r13+448]; +ld.shared.v2.f32 {f1037, f1038}, [r13+464]; +ld.shared.v2.f32 {f1041, f1042}, [r13+480]; +ld.shared.v2.f32 {f1045, f1046}, [r13+496]; +add.f32 %1, f922, f986; +add.f32 %0, f921, f985; +add.f32 %2, f925, f989; +add.f32 %3, f926, f990; +add.f32 %4, f929, f993; +add.f32 %5, f930, f994; +add.f32 %6, f933, f997; +add.f32 %7, f934, f998; +add.f32 %8, f937, f1001; 
+add.f32 %9, f938, f1002; +add.f32 %11, f942, f1006; +add.f32 %10, f941, f1005; +add.f32 %13, f946, f1010; +add.f32 %12, f945, f1009; +add.f32 %15, f950, f1014; +add.f32 %14, f949, f1013; +add.f32 %16, f953, f1017; +add.f32 %17, f954, f1018; +add.f32 %18, f957, f1021; +add.f32 %19, f958, f1022; +add.f32 %20, f961, f1025; +add.f32 %21, f962, f1026; +add.f32 %23, f966, f1030; +add.f32 %22, f965, f1029; +add.f32 %25, f970, f1034; +add.f32 %24, f969, f1033; +add.f32 %27, f974, f1038; +add.f32 %26, f973, f1037; +add.f32 %28, f977, f1041; +add.f32 %29, f978, f1042; +add.f32 %30, f981, f1045; +add.f32 %31, f982, f1046; +sub.f32 %33, f922, f986; +sub.f32 %32, f921, f985; +sub.f32 %35, f926, f990; +sub.f32 %34, f925, f989; +sub.f32 %37, f930, f994; +sub.f32 %36, f929, f993; +sub.f32 %39, f934, f998; +sub.f32 %38, f933, f997; +sub.f32 %41, f938, f1002; +sub.f32 %40, f937, f1001; +sub.f32 %43, f942, f1006; +sub.f32 %42, f941, f1005; +sub.f32 %45, f946, f1010; +sub.f32 %44, f945, f1009; +sub.f32 %47, f950, f1014; +sub.f32 %46, f949, f1013; +sub.f32 %49, f954, f1018; +sub.f32 %48, f953, f1017; +sub.f32 %51, f958, f1022; +sub.f32 %50, f957, f1021; +sub.f32 %53, f962, f1026; +sub.f32 %52, f961, f1025; +sub.f32 %55, f966, f1030; +sub.f32 %54, f965, f1029; +sub.f32 %57, f970, f1034; +sub.f32 %56, f969, f1033; +sub.f32 %59, f974, f1038; +sub.f32 %58, f973, f1037; +sub.f32 %61, f978, f1042; +sub.f32 %60, f977, f1041; +sub.f32 %63, f982, f1046; +sub.f32 %62, f981, f1045; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), 
"=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), 
"f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<250, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<118>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %10, %12; +sub.f32 f10, %11, %13; +shl.b32 r6, r5, 4; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 496; +add.s32 r11, r8, r10; +add.f32 f18, %11, %13; +add.f32 f19, %10, %12; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 248; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+256]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 480; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 240; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+256]; +sub.f32 f51, f43, f47; +sub.f32 f52, f44, f48; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f53, f54}, [rd11]; +mul.f32 f57, f52, f54; +mul.f32 
f58, f51, f54; +mul.f32 f59, f53, f52; +and.b32 r22, r9, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 448; +add.s32 r25, r23, r24; +add.f32 f60, f44, f48; +add.f32 f61, f43, f47; +st.shared.v2.f32 [r25], {f61, f60}; +fma.rn.f32 f62, f53, f51, f57; +sub.f32 f63, f59, f58; +st.shared.v2.f32 [r25+32], {f62, f63}; +barrier.sync 0; +and.b32 r26, r9, 224; +sub.s32 r27, r25, r26; +ld.shared.v2.f32 {f64, f65}, [r27]; +ld.shared.v2.f32 {f68, f69}, [r27+256]; +sub.f32 f72, f64, f68; +sub.f32 f73, f65, f69; +and.b32 r28, r5, 24; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f74, f75}, [rd14]; +mul.f32 f78, f73, f75; +mul.f32 f79, f72, f75; +mul.f32 f80, f74, f73; +and.b32 r29, r9, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 384; +add.s32 r32, r30, r31; +add.f32 f81, f65, f69; +add.f32 f82, f64, f68; +st.shared.v2.f32 [r32], {f82, f81}; +fma.rn.f32 f83, f74, f72, f78; +sub.f32 f84, f80, f79; +st.shared.v2.f32 [r32+64], {f83, f84}; +barrier.sync 0; +and.b32 r33, r9, 192; +sub.s32 r34, r32, r33; +ld.shared.v2.f32 {f85, f86}, [r34]; +ld.shared.v2.f32 {f89, f90}, [r34+256]; +sub.f32 f93, f85, f89; +sub.f32 f94, f86, f90; +bfe.u32 r35, r5, 4, 1; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f95, f96}, [rd17]; +mul.f32 f99, f94, f96; +mul.f32 f100, f93, f96; +mul.f32 f101, f95, f94; +and.b32 r36, r9, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 256; +add.s32 r39, r37, r38; +add.f32 f102, f86, f90; +add.f32 f103, f85, f89; +st.shared.v2.f32 [r39], {f103, f102}; +fma.rn.f32 f104, f95, f93, f99; +sub.f32 f105, f101, f100; +st.shared.v2.f32 [r39+128], {f104, f105}; +barrier.sync 0; +and.b32 r40, r9, 128; +sub.s32 r41, r39, r40; +ld.shared.v2.f32 {f106, f107}, [r41]; +ld.shared.v2.f32 {f110, f111}, [r41+256]; +add.f32 %1, f107, f111; +add.f32 %0, f106, f110; +sub.f32 %3, f107, f111; +sub.f32 %2, f106, f110; +})" + : "=f"(rmem[0].x), 
"=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<251, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<98>; +.reg .b32 r<42>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %10, %12; +add.f32 f10, %11, %13; +sub.f32 f11, %10, %12; +sub.f32 f12, %11, %13; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 248; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -256; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 248; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 124; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+128]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+128]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 240; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 120; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+128]; +barrier.sync 0; +st.shared.f32 [r18], 
f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+128]; +add.f32 f43, f39, f40; +add.f32 f44, f41, f42; +sub.f32 f45, f39, f40; +sub.f32 f46, f41, f42; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f47, f48}, [rd11]; +mul.f32 f51, f46, f48; +fma.rn.f32 f52, f47, f45, f51; +mul.f32 f53, f45, f48; +mul.f32 f54, f47, f46; +sub.f32 f55, f54, f53; +and.b32 r22, r11, 12; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 224; +add.s32 r25, r23, r24; +st.shared.f32 [r25], f43; +st.shared.f32 [r25+16], f52; +barrier.sync 0; +and.b32 r26, r11, 112; +sub.s32 r27, r25, r26; +ld.shared.f32 f56, [r27]; +ld.shared.f32 f57, [r27+128]; +barrier.sync 0; +st.shared.f32 [r25], f44; +st.shared.f32 [r25+16], f55; +barrier.sync 0; +ld.shared.f32 f58, [r27]; +ld.shared.f32 f59, [r27+128]; +add.f32 f60, f56, f57; +add.f32 f61, f58, f59; +sub.f32 f62, f56, f57; +sub.f32 f63, f58, f59; +and.b32 r28, r5, 24; +cvt.u64.u32 rd12, r28; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f64, f65}, [rd14]; +mul.f32 f68, f63, f65; +fma.rn.f32 f69, f64, f62, f68; +mul.f32 f70, f62, f65; +mul.f32 f71, f64, f63; +sub.f32 f72, f71, f70; +and.b32 r29, r11, 28; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 192; +add.s32 r32, r30, r31; +st.shared.f32 [r32], f60; +st.shared.f32 [r32+32], f69; +barrier.sync 0; +and.b32 r33, r11, 96; +sub.s32 r34, r32, r33; +ld.shared.f32 f73, [r34]; +ld.shared.f32 f74, [r34+128]; +barrier.sync 0; +st.shared.f32 [r32], f61; +st.shared.f32 [r32+32], f72; +barrier.sync 0; +ld.shared.f32 f75, [r34]; +ld.shared.f32 f76, [r34+128]; +add.f32 f77, f73, f74; +add.f32 f78, f75, f76; +sub.f32 f79, f73, f74; +sub.f32 f80, f75, f76; +bfe.u32 r35, r5, 4, 1; +mul.wide.u32 rd15, r35, 8; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f32 {f81, f82}, [rd17]; +mul.f32 f85, f80, f82; +fma.rn.f32 f86, f81, f79, f85; +mul.f32 f87, 
f79, f82; +mul.f32 f88, f81, f80; +sub.f32 f89, f88, f87; +and.b32 r36, r11, 60; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 128; +add.s32 r39, r37, r38; +st.shared.f32 [r39], f77; +st.shared.f32 [r39+64], f86; +barrier.sync 0; +and.b32 r40, r11, 64; +sub.s32 r41, r39, r40; +ld.shared.f32 f90, [r41]; +ld.shared.f32 f91, [r41+128]; +barrier.sync 0; +st.shared.f32 [r39], f78; +st.shared.f32 [r39+64], f89; +barrier.sync 0; +ld.shared.f32 f92, [r41]; +ld.shared.f32 f93, [r41+128]; +add.f32 %0, f90, f91; +add.f32 %1, f92, f93; +sub.f32 %2, f90, f91; +sub.f32 %3, f92, f93; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_64), "l"(lut_sp_2_32), "l"(lut_sp_2_16), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<252, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1253>; +.reg .b32 r<18>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 8; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %66, %98; +sub.f32 f131, %66, %98; +add.f32 f1251, %67, %130; +sub.f32 f132, %67, %130; +add.f32 f133, %82, %114; +sub.f32 f135, %82, %114; +add.f32 f1249, %131, %115; +sub.f32 f136, %131, %115; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f1248, f1251, f1249; +sub.f32 f140, f1251, f1249; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f1247, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %74, %106; +sub.f32 f147, %74, %106; +add.f32 f1244, %133, %132; +sub.f32 f148, %133, %132; +add.f32 f149, %90, %122; +sub.f32 f151, %90, %122; +add.f32 f1242, %91, %134; +sub.f32 f152, %91, %134; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f1241, f1244, f1242; +sub.f32 f156, f1244, f1242; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f1240, f148, f151; +sub.f32 f160, f148, f151; 
+mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f1240, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f1238, f159, 0fBF3504F3; +mul.f32 f1239, f160, 0f3F3504F3; +sub.f32 f167, f1238, f1239; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f1237, f1248, f1241; +sub.f32 f173, f1248, f1241; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f1236, f1247, f164; +sub.f32 f177, f1247, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f1235, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f1234, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %70, %102; +sub.f32 f188, %70, %102; +add.f32 f1232, %135, %103; +sub.f32 f189, %135, %103; +add.f32 f190, %86, %118; +sub.f32 f192, %86, %118; +add.f32 f1229, %137, %136; +sub.f32 f193, %137, %136; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f1228, f1232, f1229; +sub.f32 f197, f1232, f1229; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f1227, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %78, %110; +sub.f32 f204, %78, %110; +add.f32 f1225, %79, %138; +sub.f32 f205, %79, %138; +add.f32 f206, %94, %126; +sub.f32 f208, %94, %126; +add.f32 f1223, %139, %127; +sub.f32 f209, %139, %127; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f1222, f1225, f1223; +sub.f32 f213, f1225, f1223; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f1221, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f1221, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f1219, f216, 0fBF3504F3; +mul.f32 f1220, f217, 0f3F3504F3; +sub.f32 f224, f1219, f1220; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f1218, f1228, f1222; +sub.f32 f230, 
f1228, f1222; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f1217, f1227, f221; +sub.f32 f234, f1227, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f1216, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f1215, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f1213, f231, 0f3F6C835E; +mul.f32 f1214, f1217, 0f3EC3EF15; +sub.f32 f245, f1213, f1214; +mul.f32 f246, f1217, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f1216, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f1215, 0f3F6C835E; +mul.f32 f1212, f239, 0f3EC3EF15; +sub.f32 f254, f1212, f253; +mul.f32 f255, f1215, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f1211, f233, 0fBEC3EF15; +sub.f32 f259, f1211, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f1209, f237, 0fBF3504F3; +mul.f32 f1210, f238, 0f3F3504F3; +sub.f32 f264, f1209, f1210; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f1207, f241, 0fBF6C835E; +mul.f32 f1208, f242, 0f3EC3EF15; +sub.f32 f269, f1207, f1208; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f1206, f1237, f1218; +sub.f32 f275, f1237, f1218; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f1205, f1236, f247; +sub.f32 f279, f1236, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f1204, f1235, f251; +sub.f32 f283, f1235, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f1203, f1234, f256; +sub.f32 f287, f1234, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f1202, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f1201, f177, f261; +sub.f32 f295, f177, f261; +add.f32 
f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f1200, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f1199, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %68, %100; +sub.f32 f306, %68, %100; +add.f32 f1196, %141, %140; +sub.f32 f307, %141, %140; +add.f32 f308, %84, %116; +sub.f32 f310, %84, %116; +add.f32 f1194, %85, %142; +sub.f32 f311, %85, %142; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f1193, f1196, f1194; +sub.f32 f315, f1196, f1194; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f1192, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %76, %108; +sub.f32 f322, %76, %108; +add.f32 f1190, %143, %109; +sub.f32 f323, %143, %109; +add.f32 f324, %92, %124; +sub.f32 f326, %92, %124; +add.f32 f1187, %145, %144; +sub.f32 f327, %145, %144; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f1186, f1190, f1187; +sub.f32 f331, f1190, f1187; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f1185, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f1185, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f1184, f334, 0fBF3504F3; +sub.f32 f342, f1184, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f1183, f1193, f1186; +sub.f32 f348, f1193, f1186; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f1182, f1192, f339; +sub.f32 f352, f1192, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f1181, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f1180, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %72, %104; +sub.f32 f363, %72, %104; +add.f32 f1178, %73, %146; +sub.f32 f364, %73, %146; +add.f32 f365, %88, %120; +sub.f32 f367, %88, %120; +add.f32 f1176, %147, %121; 
+sub.f32 f368, %147, %121; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f1175, f1178, f1176; +sub.f32 f372, f1178, f1176; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f1174, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %80, %112; +sub.f32 f379, %80, %112; +add.f32 f1171, %149, %148; +sub.f32 f380, %149, %148; +add.f32 f381, %96, %128; +sub.f32 f383, %96, %128; +add.f32 f1170, %97, %129; +sub.f32 f384, %97, %129; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f1169, f1171, f1170; +sub.f32 f388, f1171, f1170; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f1168, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f1168, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f1166, f391, 0fBF3504F3; +mul.f32 f1167, f392, 0f3F3504F3; +sub.f32 f399, f1166, f1167; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f1165, f1175, f1169; +sub.f32 f405, f1175, f1169; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f1164, f1174, f396; +sub.f32 f409, f1174, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f1163, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f1162, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f1164, 0f3EC3EF15; +mul.f32 f1161, f406, 0f3F6C835E; +sub.f32 f420, f1161, f419; +mul.f32 f421, f1164, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f1163, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f1162, 0f3F6C835E; +mul.f32 f1160, f414, 0f3EC3EF15; +sub.f32 f429, f1160, f428; +mul.f32 f430, f1162, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f1159, f408, 0fBEC3EF15; +sub.f32 f434, f1159, f433; +mul.f32 f435, f409, 
0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f1158, f412, 0fBF3504F3; +sub.f32 f439, f1158, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f1157, f416, 0fBF6C835E; +sub.f32 f444, f1157, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f1156, f1183, f1165; +sub.f32 f450, f1183, f1165; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f1155, f1182, f422; +sub.f32 f454, f1182, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f1154, f1181, f426; +sub.f32 f458, f1181, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f1153, f1180, f431; +sub.f32 f462, f1180, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f1152, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f1151, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f1150, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f1149, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f1155, 0f3E47C5C2; +mul.f32 f1148, f451, 0f3F7B14BE; +sub.f32 f481, f1148, f480; +mul.f32 f482, f1155, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f1154, 0f3EC3EF15; +mul.f32 f1147, f455, 0f3F6C835E; +sub.f32 f486, f1147, f485; +mul.f32 f487, f1154, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f1153, 0f3F0E39DA; +mul.f32 f1146, f459, 0f3F54DB31; +sub.f32 f491, f1146, f490; +mul.f32 f492, f1153, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f1152, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f1151, 0f3F54DB31; +mul.f32 f1145, f467, 0f3F0E39DA; +sub.f32 f500, 
f1145, f499; +mul.f32 f501, f1151, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f1150, 0f3F6C835E; +mul.f32 f1144, f471, 0f3EC3EF15; +sub.f32 f505, f1144, f504; +mul.f32 f506, f1150, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f1149, 0f3F7B14BE; +mul.f32 f1143, f475, 0f3E47C5C2; +sub.f32 f510, f1143, f509; +mul.f32 f511, f1149, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f1142, f453, 0fBE47C5C2; +sub.f32 f515, f1142, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f1140, f457, 0fBEC3EF15; +mul.f32 f1141, f458, 0f3F6C835E; +sub.f32 f520, f1140, f1141; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f1138, f461, 0fBF0E39DA; +mul.f32 f1139, f462, 0f3F54DB31; +sub.f32 f525, f1138, f1139; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f1136, f465, 0fBF3504F3; +mul.f32 f1137, f466, 0f3F3504F3; +sub.f32 f530, f1136, f1137; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f1134, f469, 0fBF54DB31; +mul.f32 f1135, f470, 0f3F0E39DA; +sub.f32 f535, f1134, f1135; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f1133, f473, 0fBF6C835E; +sub.f32 f540, f1133, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f1132, f477, 0fBF7B14BE; +sub.f32 f545, f1132, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f1131, f1206, f1156; +sub.f32 f551, f1206, f1156; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f1130, f1205, f483; +sub.f32 f555, f1205, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f1129, f1204, f488; +sub.f32 f559, f1204, f488; +add.f32 f560, f284, f491; 
+sub.f32 f562, f284, f491; +add.f32 f1128, f1203, f493; +sub.f32 f563, f1203, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f1127, f1202, f497; +sub.f32 f567, f1202, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f1126, f1201, f502; +sub.f32 f571, f1201, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f1125, f1200, f507; +sub.f32 f575, f1200, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f1124, f1199, f512; +sub.f32 f579, f1199, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f1123, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f1122, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 f590, f282, f520; +add.f32 f1121, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f1120, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f1119, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f1118, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f1117, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f1116, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f1130, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f1130; +sub.f32 f620, f619, f618; +mul.f32 f1114, f612, f612; +mul.f32 f1115, f613, f613; +sub.f32 f623, f1114, f1115; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f1129, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f1129; +sub.f32 f630, 
f629, f628; +mul.f32 f1112, f612, f623; +mul.f32 f1113, f613, f625; +sub.f32 f633, f1112, f1113; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f1128, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f1128; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f1111, f612, f633; +sub.f32 f643, f1111, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f1127, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f1127; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f1110, f612, f643; +sub.f32 f653, f1110, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f1126, f655; +fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f1126; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f1109, f612, f653; +sub.f32 f663, f1109, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f1125, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f1125; +sub.f32 f670, f669, f668; +mul.f32 f1107, f612, f663; +mul.f32 f1108, f613, f665; +sub.f32 f673, f1107, f1108; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f1124, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f1124; +sub.f32 f680, f679, f678; +mul.f32 f1105, f612, f673; +mul.f32 f1106, f613, f675; +sub.f32 f683, f1105, f1106; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f1123, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f1123; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f1104, f612, f683; +sub.f32 f693, f1104, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f1122, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, 
f584, f695; +mul.f32 f699, f693, f1122; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f1103, f612, f693; +sub.f32 f703, f1103, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f1121, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f1121; +sub.f32 f710, f709, f708; +mul.f32 f1101, f612, f703; +mul.f32 f1102, f613, f705; +sub.f32 f713, f1101, f1102; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f1120, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f1120; +sub.f32 f720, f719, f718; +mul.f32 f1099, f612, f713; +mul.f32 f1100, f613, f715; +sub.f32 f723, f1099, f1100; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f726, f1119, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f1119; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f1098, f612, f723; +sub.f32 f733, f1098, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f1118, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f1118; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f1097, f612, f733; +sub.f32 f743, f1097, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f1117, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f1117; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f1096, f612, f743; +sub.f32 f753, f1096, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f1116, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f1116; +sub.f32 f760, f759, f758; +mul.f32 f1094, f612, f753; +mul.f32 f1095, f613, f755; +sub.f32 f763, f1094, f1095; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; 
+fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f1092, f612, f763; +mul.f32 f1093, f613, f765; +sub.f32 f773, f1092, f1093; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f1091, f612, f773; +sub.f32 f783, f1091, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f1090, f612, f783; +sub.f32 f793, f1090, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f1089, f612, f793; +sub.f32 f803, f1089, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f1087, f612, f803; +mul.f32 f1088, f613, f805; +sub.f32 f813, f1087, f1088; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f1085, f612, f813; +mul.f32 f1086, f613, f815; +sub.f32 f823, f1085, f1086; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f1084, f612, f823; +sub.f32 f833, f1084, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, 
f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f1083, f612, f833; +sub.f32 f843, f1083, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f1081, f612, f843; +mul.f32 f1082, f613, f845; +sub.f32 f853, f1081, f1082; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f1079, f612, f853; +mul.f32 f1080, f613, f855; +sub.f32 f863, f1079, f1080; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f1078, f612, f863; +sub.f32 f873, f1078, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f1077, f612, f873; +sub.f32 f883, f1077, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f1076, f612, f883; +sub.f32 f893, f1076, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f1074, f612, f893; +mul.f32 f1075, f613, f895; +sub.f32 f903, f1074, f1075; +mul.f32 f904, 
f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f1072, f612, f903; +mul.f32 f1073, f613, f905; +sub.f32 f913, f1072, f1073; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mov.u32 r17, %tid.x; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +and.b32 r14, r17, 1; +shl.b32 r8, r17, 7; +and.b32 r9, r8, -256; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 128; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +mad.lo.s32 r13, r14, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+8]; +ld.shared.f32 f923, [r13+16]; +ld.shared.f32 f924, [r13+24]; +ld.shared.f32 f925, [r13+32]; +ld.shared.f32 f926, [r13+40]; +ld.shared.f32 f927, [r13+48]; +ld.shared.f32 f928, [r13+56]; +ld.shared.f32 f929, [r13+64]; +ld.shared.f32 f930, [r13+72]; +ld.shared.f32 f931, [r13+80]; +ld.shared.f32 f932, [r13+88]; +ld.shared.f32 f933, [r13+96]; +ld.shared.f32 f934, [r13+104]; +ld.shared.f32 f935, [r13+112]; +ld.shared.f32 f936, [r13+120]; +ld.shared.f32 f937, [r13+128]; +ld.shared.f32 f938, [r13+136]; +ld.shared.f32 f939, [r13+144]; +ld.shared.f32 f940, [r13+152]; +ld.shared.f32 f941, [r13+160]; +ld.shared.f32 f942, [r13+168]; +ld.shared.f32 f943, [r13+176]; +ld.shared.f32 f944, [r13+184]; +ld.shared.f32 f945, [r13+192]; +ld.shared.f32 f946, [r13+200]; +ld.shared.f32 f947, [r13+208]; +ld.shared.f32 
f948, [r13+216]; +ld.shared.f32 f949, [r13+224]; +ld.shared.f32 f950, [r13+232]; +ld.shared.f32 f951, [r13+240]; +ld.shared.f32 f952, [r13+248]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f1131, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+8]; +ld.shared.f32 f955, [r13+16]; +ld.shared.f32 f956, [r13+24]; +ld.shared.f32 f957, [r13+32]; +ld.shared.f32 f958, [r13+40]; +ld.shared.f32 f959, [r13+48]; +ld.shared.f32 f960, [r13+56]; +ld.shared.f32 f961, [r13+64]; +ld.shared.f32 f962, [r13+72]; +ld.shared.f32 f963, [r13+80]; +ld.shared.f32 f964, [r13+88]; +ld.shared.f32 f965, [r13+96]; +ld.shared.f32 f966, [r13+104]; +ld.shared.f32 f967, [r13+112]; +ld.shared.f32 f968, [r13+120]; +ld.shared.f32 f969, [r13+128]; +ld.shared.f32 f970, [r13+136]; +ld.shared.f32 f971, [r13+144]; +ld.shared.f32 f972, [r13+152]; +ld.shared.f32 f973, [r13+160]; +ld.shared.f32 f974, [r13+168]; +ld.shared.f32 f975, [r13+176]; +ld.shared.f32 f976, [r13+184]; +ld.shared.f32 f977, [r13+192]; +ld.shared.f32 f978, [r13+200]; +ld.shared.f32 f979, [r13+208]; +ld.shared.f32 f980, [r13+216]; +ld.shared.f32 f981, [r13+224]; +ld.shared.f32 f982, [r13+232]; +ld.shared.f32 f983, [r13+240]; +ld.shared.f32 f984, [r13+248]; +add.f32 %0, f921, f937; +add.f32 %1, f953, f969; +add.f32 %3, f954, f970; +add.f32 %2, f922, f938; +add.f32 %5, f955, f971; +add.f32 %4, f923, f939; +add.f32 %7, f956, f972; +add.f32 %6, f924, f940; +add.f32 %9, f957, f973; +add.f32 %8, f925, f941; +add.f32 %10, f926, f942; +add.f32 %11, f958, f974; +add.f32 %12, f927, f943; +add.f32 %13, f959, f975; +add.f32 %14, f928, f944; 
+add.f32 %15, f960, f976; +add.f32 %17, f961, f977; +add.f32 %16, f929, f945; +add.f32 %19, f962, f978; +add.f32 %18, f930, f946; +add.f32 %21, f963, f979; +add.f32 %20, f931, f947; +add.f32 %22, f932, f948; +add.f32 %23, f964, f980; +add.f32 %24, f933, f949; +add.f32 %25, f965, f981; +add.f32 %26, f934, f950; +add.f32 %27, f966, f982; +add.f32 %29, f967, f983; +add.f32 %28, f935, f951; +add.f32 %31, f968, f984; +add.f32 %30, f936, f952; +sub.f32 %32, f921, f937; +sub.f32 %33, f953, f969; +sub.f32 %34, f922, f938; +sub.f32 %35, f954, f970; +sub.f32 %36, f923, f939; +sub.f32 %37, f955, f971; +sub.f32 %38, f924, f940; +sub.f32 %39, f956, f972; +sub.f32 %40, f925, f941; +sub.f32 %41, f957, f973; +sub.f32 %42, f926, f942; +sub.f32 %43, f958, f974; +sub.f32 %44, f927, f943; +sub.f32 %45, f959, f975; +sub.f32 %46, f928, f944; +sub.f32 %47, f960, f976; +sub.f32 %48, f929, f945; +sub.f32 %49, f961, f977; +sub.f32 %50, f930, f946; +sub.f32 %51, f962, f978; +sub.f32 %52, f931, f947; +sub.f32 %53, f963, f979; +sub.f32 %54, f932, f948; +sub.f32 %55, f964, f980; +sub.f32 %56, f933, f949; +sub.f32 %57, f965, f981; +sub.f32 %58, f934, f950; +sub.f32 %59, f966, f982; +sub.f32 %60, f935, f951; +sub.f32 %61, f967, f983; +sub.f32 %62, f936, f952; +sub.f32 %63, f968, f984; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), 
"=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_64), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[20].y), "f"(rmem[4].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[23].y), "f"(rmem[7].y)); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..4974676e1dfd7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_fwd.hpp.inc @@ -0,0 +1,2040 @@ +#ifndef CUFFTDX_FFT_64_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_64_FP64_FWD_PTX_HPP + + + +/* Explicit specialization of cufftdx_private_function for <444, double, 1>; the whole body is one asm volatile PTX block. Operands: %0-%15 are the outputs rmem[0..7].x/.y (all 16 doubles are overwritten), %16 is smem (a 32-bit register operand used as the shared-memory base address), %17 is the lut_dp_8_64 table pointer, and %18-%37 are the current rmem values passed as inputs (the .y of elements 1, 2, 4 and 5 is listed twice; the PTX never reads %21, %24, %29 or %32). Flow as written: (1) add/sub butterflies over the 8 input values, using 0d3FE6A09E667F3BCD (sqrt(2)/2) and its negation as constants; (2) two v2.f64 loads from lut_dp_8_64 at byte offset ((tid.x << 4) & 112) and at that offset plus 128, with further factors built from them by repeated complex multiplication and applied to the butterfly results; (3) two store/load round trips through shared memory, storing 64 bytes at smem + tid.y * 512 + tid.x * 64 and loading 8 doubles at a 64-byte stride starting 56 * (tid.x & 7) bytes below that address, with a barrier.sync 0 before each store phase and between each store phase and its loads; (4) a second set of butterflies whose results are written to %0-%15. NOTE(review): the four barrier.sync 0 instructions mean every thread of the block presumably has to reach this call together, and the addressing looks like it expects 512 bytes of shared memory per tid.y slice - confirm against the caller. Going by the file name (fft_64_fp64_fwd) this is presumably a 64-point forward double-precision FFT with 8 complex elements per thread - not verifiable from this block alone. */ template<> __forceinline__ __device__ void cufftdx_private_function<444, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<231>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64
fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+128]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140,
fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; +mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 448; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+64]; +ld.shared.f64 fd160, [r13+128]; +ld.shared.f64 fd161, [r13+192]; +ld.shared.f64 fd162, [r13+256]; +ld.shared.f64 fd163, [r13+320]; +ld.shared.f64 fd164, [r13+384]; +ld.shared.f64 fd165, [r13+448]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+64]; +ld.shared.f64 fd168, [r13+128]; +ld.shared.f64 fd169, [r13+192]; +ld.shared.f64 fd170, [r13+256]; +ld.shared.f64 fd171, [r13+320]; +ld.shared.f64 fd172, [r13+384]; +ld.shared.f64 fd173, [r13+448]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186,
fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; +add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 %0, fd182, fd198; +add.f64 %1, fd183, fd199; +add.f64 %3, fd187, fd210; +add.f64 %2, fd186, fd208; +sub.f64 %5, fd185, fd200; +add.f64 %4, fd184, fd201; +add.f64 %7, fd189, fd214; +add.f64 %6, fd188, fd213; +sub.f64 %8, fd182, fd198; +sub.f64 %9, fd183, fd199; +sub.f64 %11, fd187, fd210; +sub.f64 %10, fd186, fd208; +add.f64 %13, fd185, fd200; +sub.f64 %12, fd184, fd201; +sub.f64 %15, fd189, fd214; +sub.f64 %14, fd188, fd213; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x),
"d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<446, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<153>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+256]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 960; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+256]; +ld.shared.v2.f64 {fd69,
fd70}, [r13+512]; +ld.shared.v2.f64 {fd73, fd74}, [r13+768]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +add.f64 fd87, fd79, fd84; +sub.f64 fd88, fd80, fd83; +sub.f64 fd89, fd79, fd84; +add.f64 fd90, fd80, fd83; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd91, fd87; +mul.f64 fd96, fd92, fd88; +mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd100, fd85; +mul.f64 fd104, fd102, fd86; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+64]; +mul.f64 fd110, fd106, fd89; +mul.f64 fd111, fd107, fd90; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 768; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd92, fd87, fd97; +sub.f64 fd116, fd95, fd96; +st.shared.v2.f64 [r19+64], {fd116, fd115}; +fma.rn.f64 fd117, fd102, fd85, fd105; +sub.f64 fd118, fd103, fd104; +st.shared.v2.f64 [r19+128], {fd118, fd117}; +fma.rn.f64 fd119, fd107, fd89, fd112; +sub.f64 fd120, fd110, fd111; +st.shared.v2.f64 [r19+192], {fd120, fd119}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+256]; +ld.shared.v2.f64 {fd129, fd130}, [r20+512]; +ld.shared.v2.f64 {fd133, fd134}, [r20+768]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 
fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +add.f64 %1, fd138, fd142; +add.f64 %0, fd137, fd141; +sub.f64 %3, fd140, fd143; +add.f64 %2, fd139, fd144; +sub.f64 %5, fd138, fd142; +sub.f64 %4, fd137, fd141; +add.f64 %7, fd140, fd143; +sub.f64 %6, fd139, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<447, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<247>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 
0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+128]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, 
fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 896; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+128]; +ld.shared.v2.f64 {fd166, fd167}, [r13+256]; +ld.shared.v2.f64 {fd170, fd171}, [r13+384]; +ld.shared.v2.f64 {fd174, fd175}, [r13+512]; +ld.shared.v2.f64 {fd178, fd179}, [r13+640]; +ld.shared.v2.f64 {fd182, fd183}, [r13+768]; +ld.shared.v2.f64 {fd186, fd187}, [r13+896]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, 
fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +add.f64 %1, fd199, fd215; +add.f64 %0, fd198, fd214; +add.f64 %3, fd203, fd226; +add.f64 %2, fd202, fd224; +sub.f64 %5, fd201, fd216; +add.f64 %4, fd200, fd217; +add.f64 %7, fd205, fd230; +add.f64 %6, fd204, fd229; +sub.f64 %9, fd199, fd215; +sub.f64 %8, fd198, fd214; +sub.f64 %11, fd203, fd226; +sub.f64 %10, fd202, fd224; +add.f64 %13, fd201, fd216; +sub.f64 %12, fd200, fd217; +sub.f64 %15, fd205, fd230; +sub.f64 %14, fd204, fd229; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<445, double, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<137>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+256]; +mul.f64 fd56, fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 480; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+128]; +ld.shared.f64 fd63, [r13+256]; +ld.shared.f64 fd64, [r13+384]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+128]; 
+ld.shared.f64 fd67, [r13+256]; +ld.shared.f64 fd68, [r13+384]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +add.f64 fd81, fd71, fd76; +sub.f64 fd82, fd72, fd75; +sub.f64 fd83, fd71, fd76; +add.f64 fd84, fd72, fd75; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd85, fd81; +mul.f64 fd90, fd86, fd82; +sub.f64 fd91, fd89, fd90; +mul.f64 fd92, fd85, fd82; +fma.rn.f64 fd93, fd86, fd81, fd92; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd96, fd79; +mul.f64 fd100, fd98, fd80; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd96, fd80; +fma.rn.f64 fd103, fd98, fd79, fd102; +ld.global.v2.f64 {fd104, fd105}, [rd8+64]; +mul.f64 fd108, fd104, fd83; +mul.f64 fd109, fd105, fd84; +sub.f64 fd110, fd108, fd109; +mul.f64 fd111, fd104, fd84; +fma.rn.f64 fd112, fd105, fd83, fd111; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 384; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd91; +st.shared.f64 [r20+64], fd101; +st.shared.f64 [r20+96], fd110; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+128]; +ld.shared.f64 fd115, [r21+256]; +ld.shared.f64 fd116, [r21+384]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+128]; +ld.shared.f64 fd119, [r21+256]; +ld.shared.f64 fd120, 
[r21+384]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 %0, fd121, fd125; +add.f64 %1, fd122, fd126; +sub.f64 %3, fd124, fd127; +add.f64 %2, fd123, fd128; +sub.f64 %4, fd121, fd125; +sub.f64 %5, fd122, fd126; +add.f64 %7, fd124, fd127; +sub.f64 %6, fd123, fd128; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<448, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<626>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd619, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd617, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd616, fd619, fd617; +sub.f64 fd76, fd619, fd617; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd615, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd612, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd610, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd609, fd612, fd610; +sub.f64 fd92, fd612, fd610; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd608, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd608, 
0dBFE6A09E667F3BCD; +mul.f64 fd607, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd607, fd98; +mul.f64 fd100, fd608, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd606, fd616, fd609; +sub.f64 fd109, fd616, fd609; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd605, fd615, fd101; +sub.f64 fd113, fd615, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd604, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd603, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd601, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd598, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd597, fd601, fd598; +sub.f64 fd133, fd601, fd598; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd596, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd594, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd592, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd591, fd594, fd592; +sub.f64 fd149, fd594, fd592; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd590, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd590, 0dBFE6A09E667F3BCD; +mul.f64 fd589, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd589, fd155; +mul.f64 fd157, fd590, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, 
fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd588, fd597, fd591; +sub.f64 fd166, fd597, fd591; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd587, fd596, fd158; +sub.f64 fd170, fd596, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd586, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd585, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd583, fd167, 0d3FED906BCF328D46; +mul.f64 fd584, fd587, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd583, fd584; +mul.f64 fd182, fd587, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd581, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd582, fd586, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd581, fd582; +mul.f64 fd187, fd586, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd579, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd580, fd585, 0dBFED906BCF328D46; +sub.f64 fd191, fd579, fd580; +mul.f64 fd192, fd585, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd577, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd578, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd577, fd578; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd575, fd177, 0dBFED906BCF328D46; +mul.f64 fd576, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd575, fd576; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd574, fd605, fd183; +sub.f64 fd213, fd605, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd573, fd604, fd188; +sub.f64 fd217, fd604, fd188; +add.f64 fd218, fd118, fd191; 
+sub.f64 fd220, fd118, fd191; +add.f64 fd572, fd603, fd193; +sub.f64 fd221, fd603, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd571, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd570, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd569, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd568, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; +shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd574; +mul.f64 fd244, fd238, fd574; +mul.f64 fd246, fd239, fd239; +mul.f64 fd567, fd238, fd238; +sub.f64 fd247, fd567, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd573; +mul.f64 fd252, fd247, fd573; +mul.f64 fd565, fd238, fd247; +mul.f64 fd566, fd239, fd249; +sub.f64 fd255, fd565, fd566; +mul.f64 fd564, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd572; +mul.f64 fd260, fd255, fd572; +mul.f64 fd262, fd239, fd257; +mul.f64 fd563, fd238, fd255; +sub.f64 fd263, fd563, fd262; +mul.f64 fd562, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd571; +mul.f64 fd268, fd263, fd571; +mul.f64 fd270, fd239, fd265; +mul.f64 fd561, fd238, fd263; +sub.f64 fd271, fd561, fd270; +mul.f64 fd560, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd570; +mul.f64 fd276, fd271, fd570; +mul.f64 fd558, fd238, fd271; +mul.f64 fd559, fd239, fd273; +sub.f64 fd279, fd558, fd559; +mul.f64 fd557, fd271, fd226; +mul.f64 fd280, 
fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd569; +mul.f64 fd284, fd279, fd569; +mul.f64 fd286, fd239, fd281; +mul.f64 fd556, fd238, fd279; +sub.f64 fd287, fd556, fd286; +mul.f64 fd555, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd568; +mul.f64 fd292, fd287, fd568; +mul.f64 fd294, fd239, fd289; +mul.f64 fd554, fd238, fd287; +sub.f64 fd295, fd554, fd294; +mul.f64 fd553, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd552, fd606, fd588; +sub.f64 fd551, fd106, fd163; +mul.f64 fd298, fd295, fd551; +mul.f64 fd299, fd297, fd552; +mul.f64 fd300, fd295, fd552; +ld.global.v2.f64 {fd301, fd302}, [rd5+64]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 fd549, fd238, fd301; +mul.f64 fd550, fd239, fd302; +sub.f64 fd310, fd549, fd550; +mul.f64 fd548, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd547, fd238, fd310; +sub.f64 fd318, fd547, fd317; +mul.f64 fd546, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd544, fd238, fd318; +mul.f64 fd545, fd239, fd320; +sub.f64 fd326, fd544, fd545; +mul.f64 fd543, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd541, fd238, fd326; +mul.f64 fd542, fd239, fd328; +sub.f64 fd334, fd541, fd542; +mul.f64 fd540, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd539, fd238, fd334; +sub.f64 fd342, fd539, fd341; +mul.f64 fd538, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; 
+mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd536, fd238, fd342; +mul.f64 fd537, fd239, fd344; +sub.f64 fd350, fd536, fd537; +mul.f64 fd535, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd534, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 3; +sub.f64 fd623, fd606, fd588; +mul.f64 fd622, fd297, fd623; +mov.u32 r23, %tid.x; +shl.b32 r22, r23, 8; +barrier.sync 0; +and.b32 r11, r22, 768; +add.s32 r12, r9, r11; +sub.f64 fd625, fd606, fd588; +mul.f64 fd624, fd297, fd625; +add.f64 fd356, fd606, fd588; +sub.f64 fd621, fd106, fd163; +add.f64 fd357, fd106, fd163; +st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 3; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 3; +fma.rn.f64 fd358, fd239, fd210, fd244; +sub.f64 fd359, fd535, fd243; +st.shared.v2.f64 [r12+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd564, fd251; +st.shared.v2.f64 [r12+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd562, fd259; +st.shared.v2.f64 [r12+48], {fd363, fd362}; +sub.f64 fd364, fd560, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r12+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd557, fd275; +st.shared.v2.f64 [r12+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd555, fd283; +st.shared.v2.f64 [r12+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd553, fd291; +st.shared.v2.f64 [r12+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd621, fd300; +sub.f64 fd373, fd298, fd624; +st.shared.v2.f64 [r12+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd548, fd306; +st.shared.v2.f64 [r12+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd546, fd314; +st.shared.v2.f64 [r12+160], {fd377, 
fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd543, fd322; +st.shared.v2.f64 [r12+176], {fd379, fd378}; +sub.f64 fd380, fd540, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r12+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd538, fd338; +st.shared.v2.f64 [r12+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd534, fd346; +st.shared.v2.f64 [r12+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r12+240], {fd387, fd386}; +barrier.sync 0; +mad.lo.s32 r13, r20, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+64]; +ld.shared.v2.f64 {fd396, fd397}, [r13+128]; +ld.shared.v2.f64 {fd400, fd401}, [r13+192]; +ld.shared.v2.f64 {fd404, fd405}, [r13+256]; +ld.shared.v2.f64 {fd408, fd409}, [r13+320]; +ld.shared.v2.f64 {fd412, fd413}, [r13+384]; +ld.shared.v2.f64 {fd416, fd417}, [r13+448]; +ld.shared.v2.f64 {fd420, fd421}, [r13+512]; +ld.shared.v2.f64 {fd424, fd425}, [r13+576]; +ld.shared.v2.f64 {fd428, fd429}, [r13+640]; +ld.shared.v2.f64 {fd432, fd433}, [r13+704]; +ld.shared.v2.f64 {fd436, fd437}, [r13+768]; +ld.shared.v2.f64 {fd440, fd441}, [r13+832]; +ld.shared.v2.f64 {fd444, fd445}, [r13+896]; +ld.shared.v2.f64 {fd448, fd449}, [r13+960]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd533, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd532, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd392, fd424; +sub.f64 fd462, fd392, fd424; +add.f64 fd531, fd393, fd425; +sub.f64 fd463, fd393, fd425; +add.f64 fd464, fd408, fd440; +sub.f64 fd466, fd408, fd440; +add.f64 fd530, fd409, fd441; +sub.f64 fd467, fd409, fd441; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd529, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, 
fd412, fd444; +add.f64 fd528, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd400, fd432; +sub.f64 fd478, fd400, fd432; +add.f64 fd527, fd401, fd433; +sub.f64 fd479, fd401, fd433; +add.f64 fd480, fd416, fd448; +sub.f64 fd482, fd416, fd448; +add.f64 fd526, fd417, fd449; +sub.f64 fd483, fd417, fd449; +add.f64 %0, fd452, fd456; +add.f64 %1, fd533, fd532; +add.f64 %2, fd460, fd464; +add.f64 %3, fd531, fd530; +add.f64 %4, fd468, fd472; +add.f64 %5, fd529, fd528; +add.f64 %7, fd527, fd526; +add.f64 %6, fd476, fd480; +sub.f64 %9, fd455, fd458; +add.f64 %8, fd454, fd459; +add.f64 %10, fd462, fd467; +sub.f64 %11, fd463, fd466; +add.f64 %12, fd470, fd475; +sub.f64 %13, fd471, fd474; +add.f64 %14, fd478, fd483; +sub.f64 %15, fd479, fd482; +sub.f64 %17, fd533, fd532; +sub.f64 %16, fd452, fd456; +sub.f64 %19, fd531, fd530; +sub.f64 %18, fd460, fd464; +sub.f64 %21, fd529, fd528; +sub.f64 %20, fd468, fd472; +sub.f64 %23, fd527, fd526; +sub.f64 %22, fd476, fd480; +add.f64 %25, fd455, fd458; +sub.f64 %24, fd454, fd459; +add.f64 %27, fd463, fd466; +sub.f64 %26, fd462, fd467; +add.f64 %29, fd471, fd474; +sub.f64 %28, fd470, fd475; +add.f64 %31, fd479, fd482; +sub.f64 %30, fd478, fd483; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), 
"d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<449, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<484>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; 
+add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, 
fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; 
+add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; 
+mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+64]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; 
+sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 384; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+32]; +ld.shared.f64 fd390, [r13+64]; +ld.shared.f64 fd391, [r13+96]; +ld.shared.f64 fd392, [r13+128]; +ld.shared.f64 fd393, [r13+160]; +ld.shared.f64 fd394, [r13+192]; +ld.shared.f64 fd395, [r13+224]; +ld.shared.f64 fd396, [r13+256]; +ld.shared.f64 fd397, [r13+288]; +ld.shared.f64 fd398, [r13+320]; +ld.shared.f64 fd399, [r13+352]; +ld.shared.f64 fd400, 
[r13+384]; +ld.shared.f64 fd401, [r13+416]; +ld.shared.f64 fd402, [r13+448]; +ld.shared.f64 fd403, [r13+480]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+32]; +ld.shared.f64 fd406, [r13+64]; +ld.shared.f64 fd407, [r13+96]; +ld.shared.f64 fd408, [r13+128]; +ld.shared.f64 fd409, [r13+160]; +ld.shared.f64 fd410, [r13+192]; +ld.shared.f64 fd411, [r13+224]; +ld.shared.f64 fd412, [r13+256]; +ld.shared.f64 fd413, [r13+288]; +ld.shared.f64 fd414, [r13+320]; +ld.shared.f64 fd415, [r13+352]; +ld.shared.f64 fd416, [r13+384]; +ld.shared.f64 fd417, [r13+416]; +ld.shared.f64 fd418, [r13+448]; +ld.shared.f64 fd419, [r13+480]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd389, fd397; +add.f64 fd429, fd405, fd413; +sub.f64 fd430, fd389, fd397; +sub.f64 fd431, fd405, fd413; +add.f64 fd432, fd393, fd401; +add.f64 fd433, fd409, fd417; +sub.f64 fd434, fd393, fd401; +sub.f64 fd435, fd409, fd417; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd391, fd399; +add.f64 fd445, fd407, fd415; +sub.f64 fd446, fd391, fd399; +sub.f64 fd447, fd407, fd415; +add.f64 fd448, fd395, fd403; +add.f64 fd449, fd411, fd419; +sub.f64 fd450, fd395, fd403; +sub.f64 fd451, fd411, fd419; +add.f64 %0, fd420, 
fd424; +add.f64 %1, fd421, fd425; +add.f64 %2, fd428, fd432; +add.f64 %3, fd429, fd433; +add.f64 %4, fd436, fd440; +add.f64 %5, fd437, fd441; +add.f64 %6, fd444, fd448; +add.f64 %7, fd445, fd449; +sub.f64 %9, fd423, fd426; +add.f64 %8, fd422, fd427; +sub.f64 %11, fd431, fd434; +add.f64 %10, fd430, fd435; +sub.f64 %13, fd439, fd442; +add.f64 %12, fd438, fd443; +sub.f64 %15, fd447, fd450; +add.f64 %14, fd446, fd451; +sub.f64 %16, fd420, fd424; +sub.f64 %17, fd421, fd425; +sub.f64 %18, fd428, fd432; +sub.f64 %19, fd429, fd433; +sub.f64 %20, fd436, fd440; +sub.f64 %21, fd437, fd441; +sub.f64 %22, fd444, fd448; +sub.f64 %23, fd445, fd449; +add.f64 %25, fd423, fd426; +sub.f64 %24, fd422, fd427; +add.f64 %27, fd431, fd434; +sub.f64 %26, fd430, fd435; +add.f64 %29, fd439, fd442; +sub.f64 %28, fd438, fd443; +add.f64 %31, fd447, fd450; +sub.f64 %30, fd446, fd451; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), 
"d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<451, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<98>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %10, %12; +add.f64 fd10, %11, %13; +sub.f64 fd11, %10, %12; +sub.f64 fd12, %11, %13; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 496; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 248; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+256]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+256]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 480; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 240; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+256]; +barrier.sync 0; +st.shared.f64 
[r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+256]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd47, fd45; +mul.f64 fd52, fd48, fd46; +sub.f64 fd53, fd51, fd52; +mul.f64 fd54, fd47, fd46; +fma.rn.f64 fd55, fd48, fd45, fd54; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 448; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd53; +barrier.sync 0; +and.b32 r26, r11, 224; +sub.s32 r27, r25, r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+256]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+256]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd64, fd62; +mul.f64 fd69, fd65, fd63; +sub.f64 fd70, fd68, fd69; +mul.f64 fd71, fd64, fd63; +fma.rn.f64 fd72, fd65, fd62, fd71; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 384; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd70; +barrier.sync 0; +and.b32 r33, r11, 192; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+256]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+256]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 16; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; 
+ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd81, fd79; +mul.f64 fd86, fd82, fd80; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd81, fd80; +fma.rn.f64 fd89, fd82, fd79, fd88; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 256; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd87; +barrier.sync 0; +and.b32 r40, r11, 128; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+256]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+256]; +add.f64 %0, fd90, fd91; +add.f64 %1, fd92, fd93; +sub.f64 %2, fd90, fd91; +sub.f64 %3, fd92, fd93; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<450, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<118>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %10, %12; +sub.f64 fd10, %11, %13; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 992; +add.s32 r11, r8, r10; +add.f64 fd18, %11, %13; +add.f64 fd19, %10, %12; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 496; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; 
+ld.shared.v2.f64 {fd26, fd27}, [r13+512]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 960; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 480; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+512]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd53, fd51; +mul.f64 fd58, fd54, fd52; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 896; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd54, fd51, fd59; +sub.f64 fd63, fd57, fd58; +st.shared.v2.f64 [r25+64], {fd63, fd62}; +barrier.sync 0; +and.b32 r26, r9, 448; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+512]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd74, fd72; +mul.f64 fd79, fd75, fd73; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 768; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd75, fd72, 
fd80; +sub.f64 fd84, fd78, fd79; +st.shared.v2.f64 [r32+128], {fd84, fd83}; +barrier.sync 0; +and.b32 r33, r9, 384; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+512]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 16; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd95, fd93; +mul.f64 fd100, fd96, fd94; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 512; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd96, fd93, fd101; +sub.f64 fd105, fd99, fd100; +st.shared.v2.f64 [r39+256], {fd105, fd104}; +barrier.sync 0; +and.b32 r40, r9, 256; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+512]; +add.f64 %1, fd107, fd111; +add.f64 %0, fd106, fd110; +sub.f64 %3, fd107, fd111; +sub.f64 %2, fd106, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f2d4953eb2366 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_64_fp64_inv.hpp.inc @@ -0,0 +1,2038 @@ +#ifndef CUFFTDX_FFT_64_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_64_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<615, double, 
1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<231>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 112; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; 
+mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+128]; +mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 
448; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+64]; +ld.shared.f64 fd160, [r13+128]; +ld.shared.f64 fd161, [r13+192]; +ld.shared.f64 fd162, [r13+256]; +ld.shared.f64 fd163, [r13+320]; +ld.shared.f64 fd164, [r13+384]; +ld.shared.f64 fd165, [r13+448]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+64]; +ld.shared.f64 fd168, [r13+128]; +ld.shared.f64 fd169, [r13+192]; +ld.shared.f64 fd170, [r13+256]; +ld.shared.f64 fd171, [r13+320]; +ld.shared.f64 fd172, [r13+384]; +ld.shared.f64 fd173, [r13+448]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 
0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 %0, fd182, fd198; +add.f64 %1, fd183, fd199; +add.f64 %3, fd187, fd209; +add.f64 %2, fd186, fd208; +add.f64 %5, fd185, fd200; +sub.f64 %4, fd184, fd201; +add.f64 %7, fd189, fd214; +add.f64 %6, fd188, fd212; +sub.f64 %8, fd182, fd198; +sub.f64 %9, fd183, fd199; +sub.f64 %11, fd187, fd209; +sub.f64 %10, fd186, fd208; +sub.f64 %13, fd185, fd200; +add.f64 %12, fd184, fd201; +sub.f64 %15, fd189, fd214; +sub.f64 %14, fd188, fd212; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<617, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<153>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; 
+add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 15; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+256]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 960; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+256]; +ld.shared.v2.f64 {fd69, fd70}, [r13+512]; +ld.shared.v2.f64 {fd73, fd74}, [r13+768]; +add.f64 fd77, fd61, fd69; +add.f64 fd78, fd62, fd70; +sub.f64 fd79, fd61, fd69; +sub.f64 fd80, fd62, fd70; +add.f64 fd81, fd65, fd73; +add.f64 fd82, fd66, fd74; +sub.f64 fd83, fd65, fd73; +sub.f64 fd84, fd66, fd74; +sub.f64 fd85, fd77, fd81; +sub.f64 fd86, fd78, fd82; +sub.f64 fd87, fd79, fd84; +add.f64 fd88, fd80, fd83; +add.f64 fd89, fd79, fd84; +sub.f64 fd90, fd80, fd83; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd91, fd92}, [rd8]; +mul.f64 fd95, fd88, fd92; +mul.f64 fd96, fd87, fd92; 
+mul.f64 fd97, fd91, fd88; +mul.f64 fd98, fd91, fd91; +mul.f64 fd99, fd92, fd92; +sub.f64 fd100, fd98, fd99; +mul.f64 fd101, fd92, fd91; +fma.rn.f64 fd102, fd92, fd91, fd101; +mul.f64 fd103, fd86, fd102; +mul.f64 fd104, fd85, fd102; +mul.f64 fd105, fd100, fd86; +ld.global.v2.f64 {fd106, fd107}, [rd8+64]; +mul.f64 fd110, fd90, fd107; +mul.f64 fd111, fd89, fd107; +mul.f64 fd112, fd106, fd90; +and.b32 r16, r10, 48; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 768; +add.s32 r19, r17, r18; +add.f64 fd113, fd78, fd82; +add.f64 fd114, fd77, fd81; +st.shared.v2.f64 [r19], {fd114, fd113}; +fma.rn.f64 fd115, fd91, fd87, fd95; +sub.f64 fd116, fd97, fd96; +st.shared.v2.f64 [r19+64], {fd115, fd116}; +fma.rn.f64 fd117, fd100, fd85, fd103; +sub.f64 fd118, fd105, fd104; +st.shared.v2.f64 [r19+128], {fd117, fd118}; +fma.rn.f64 fd119, fd106, fd89, fd110; +sub.f64 fd120, fd112, fd111; +st.shared.v2.f64 [r19+192], {fd119, fd120}; +barrier.sync 0; +mad.lo.s32 r20, r14, -48, r19; +ld.shared.v2.f64 {fd121, fd122}, [r20]; +ld.shared.v2.f64 {fd125, fd126}, [r20+256]; +ld.shared.v2.f64 {fd129, fd130}, [r20+512]; +ld.shared.v2.f64 {fd133, fd134}, [r20+768]; +add.f64 fd137, fd121, fd129; +add.f64 fd138, fd122, fd130; +sub.f64 fd139, fd121, fd129; +sub.f64 fd140, fd122, fd130; +add.f64 fd141, fd125, fd133; +add.f64 fd142, fd126, fd134; +sub.f64 fd143, fd125, fd133; +sub.f64 fd144, fd126, fd134; +add.f64 %1, fd138, fd142; +add.f64 %0, fd137, fd141; +add.f64 %3, fd140, fd143; +sub.f64 %2, fd139, fd144; +sub.f64 %5, fd138, fd142; +sub.f64 %4, fd137, fd141; +sub.f64 %7, fd140, fd143; +add.f64 %6, fd139, fd144; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<618, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<247>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %18, %28; +add.f64 fd34, %19, %30; +sub.f64 fd35, %18, %28; +sub.f64 fd36, %19, %30; +add.f64 fd37, %23, %34; +add.f64 fd38, %25, %35; +sub.f64 fd39, %23, %34; +sub.f64 fd40, %25, %35; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %20, %31; +add.f64 fd50, %22, %33; +sub.f64 fd51, %20, %31; +sub.f64 fd52, %22, %33; +add.f64 fd53, %26, %36; +add.f64 fd54, %27, %37; +sub.f64 fd55, %26, %36; +sub.f64 fd56, %27, %37; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 7; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 112; 
+mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+128]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 896; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 
fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; +st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+128]; +ld.shared.v2.f64 {fd166, fd167}, [r13+256]; +ld.shared.v2.f64 {fd170, fd171}, [r13+384]; +ld.shared.v2.f64 {fd174, fd175}, [r13+512]; +ld.shared.v2.f64 {fd178, fd179}, [r13+640]; +ld.shared.v2.f64 {fd182, fd183}, [r13+768]; +ld.shared.v2.f64 {fd186, fd187}, [r13+896]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; 
+mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +add.f64 %1, fd199, fd215; +add.f64 %0, fd198, fd214; +add.f64 %3, fd203, fd225; +add.f64 %2, fd202, fd224; +add.f64 %5, fd201, fd216; +sub.f64 %4, fd200, fd217; +add.f64 %7, fd205, fd230; +add.f64 %6, fd204, fd228; +sub.f64 %9, fd199, fd215; +sub.f64 %8, fd198, fd214; +sub.f64 %11, fd203, fd225; +sub.f64 %10, fd202, fd224; +sub.f64 %13, fd201, fd216; +add.f64 %12, fd200, fd217; +sub.f64 %15, fd205, fd230; +sub.f64 %14, fd204, fd228; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<616, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<22>; +.reg .f64 fd<137>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %11, %16; +add.f64 fd18, %12, %18; +sub.f64 fd19, %11, %16; +sub.f64 fd20, %12, %18; +add.f64 fd21, %13, %19; +add.f64 fd22, %15, %20; +sub.f64 fd23, %13, %19; +sub.f64 fd24, %15, %20; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 15; 
+shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 240; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+256]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 480; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+128]; +ld.shared.f64 fd63, [r13+256]; +ld.shared.f64 fd64, [r13+384]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+128]; +ld.shared.f64 fd67, [r13+256]; +ld.shared.f64 fd68, [r13+384]; +add.f64 fd69, fd61, fd63; +add.f64 fd70, fd65, fd67; +sub.f64 fd71, fd61, fd63; +sub.f64 fd72, fd65, fd67; +add.f64 fd73, fd62, fd64; +add.f64 fd74, fd66, fd68; +sub.f64 fd75, fd62, fd64; +sub.f64 fd76, fd66, fd68; +add.f64 fd77, fd69, fd73; +add.f64 fd78, fd70, fd74; +sub.f64 fd79, fd69, fd73; +sub.f64 fd80, fd70, fd74; +sub.f64 fd81, fd71, fd76; +add.f64 fd82, fd72, fd75; +add.f64 fd83, fd71, fd76; +sub.f64 fd84, fd72, fd75; +and.b32 r14, r5, 12; +bfe.u32 r15, r5, 2, 2; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %10; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd85, fd86}, [rd8]; +mul.f64 fd89, fd82, 
fd86; +fma.rn.f64 fd90, fd85, fd81, fd89; +mul.f64 fd91, fd81, fd86; +mul.f64 fd92, fd85, fd82; +sub.f64 fd93, fd92, fd91; +mul.f64 fd94, fd85, fd85; +mul.f64 fd95, fd86, fd86; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd86, fd85; +fma.rn.f64 fd98, fd86, fd85, fd97; +mul.f64 fd99, fd80, fd98; +fma.rn.f64 fd100, fd96, fd79, fd99; +mul.f64 fd101, fd79, fd98; +mul.f64 fd102, fd96, fd80; +sub.f64 fd103, fd102, fd101; +ld.global.v2.f64 {fd104, fd105}, [rd8+64]; +mul.f64 fd108, fd84, fd105; +fma.rn.f64 fd109, fd104, fd83, fd108; +mul.f64 fd110, fd83, fd105; +mul.f64 fd111, fd104, fd84; +sub.f64 fd112, fd111, fd110; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 24; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 384; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd77; +st.shared.f64 [r20+32], fd90; +st.shared.f64 [r20+64], fd100; +st.shared.f64 [r20+96], fd109; +barrier.sync 0; +mad.lo.s32 r21, r14, -24, r20; +ld.shared.f64 fd113, [r21]; +ld.shared.f64 fd114, [r21+128]; +ld.shared.f64 fd115, [r21+256]; +ld.shared.f64 fd116, [r21+384]; +barrier.sync 0; +st.shared.f64 [r20], fd78; +st.shared.f64 [r20+32], fd93; +st.shared.f64 [r20+64], fd103; +st.shared.f64 [r20+96], fd112; +barrier.sync 0; +ld.shared.f64 fd117, [r21]; +ld.shared.f64 fd118, [r21+128]; +ld.shared.f64 fd119, [r21+256]; +ld.shared.f64 fd120, [r21+384]; +add.f64 fd121, fd113, fd115; +add.f64 fd122, fd117, fd119; +sub.f64 fd123, fd113, fd115; +sub.f64 fd124, fd117, fd119; +add.f64 fd125, fd114, fd116; +add.f64 fd126, fd118, fd120; +sub.f64 fd127, fd114, fd116; +sub.f64 fd128, fd118, fd120; +add.f64 %0, fd121, fd125; +add.f64 %1, fd122, fd126; +add.f64 %3, fd124, fd127; +sub.f64 %2, fd123, fd128; +sub.f64 %4, fd121, fd125; +sub.f64 %5, fd122, fd126; +sub.f64 %7, fd124, fd127; +add.f64 %6, fd123, fd128; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_64), "l"(lut_dp_4_16), "d"(rmem[0].x), 
"d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<619, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<625>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %34, %50; +sub.f64 fd67, %34, %50; +add.f64 fd619, %35, %66; +sub.f64 fd68, %35, %66; +add.f64 fd69, %42, %58; +sub.f64 fd71, %42, %58; +add.f64 fd617, %67, %59; +sub.f64 fd72, %67, %59; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd616, fd619, fd617; +sub.f64 fd76, fd619, fd617; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd615, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %38, %54; +sub.f64 fd83, %38, %54; +add.f64 fd612, %69, %68; +sub.f64 fd84, %69, %68; +add.f64 fd85, %46, %62; +sub.f64 fd87, %46, %62; +add.f64 fd610, %47, %70; +sub.f64 fd88, %47, %70; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd609, fd612, fd610; +sub.f64 fd92, fd612, fd610; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd608, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd608, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd606, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd607, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd606, fd607; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd605, fd616, fd609; +sub.f64 fd109, fd616, fd609; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd604, fd615, fd100; +sub.f64 fd113, fd615, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd603, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 
fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd602, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %52; +sub.f64 fd124, %36, %52; +add.f64 fd600, %71, %53; +sub.f64 fd125, %71, %53; +add.f64 fd126, %44, %60; +sub.f64 fd128, %44, %60; +add.f64 fd597, %72, %73; +sub.f64 fd129, %72, %73; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd596, fd600, fd597; +sub.f64 fd133, fd600, fd597; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd595, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %40, %56; +sub.f64 fd140, %40, %56; +add.f64 fd593, %41, %74; +sub.f64 fd141, %41, %74; +add.f64 fd142, %48, %64; +sub.f64 fd144, %48, %64; +add.f64 fd591, %75, %65; +sub.f64 fd145, %75, %65; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd590, fd593, fd591; +sub.f64 fd149, fd593, fd591; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd589, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd589, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd587, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd588, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd587, fd588; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd586, fd596, fd590; +sub.f64 fd166, fd596, fd590; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd585, fd595, fd157; +sub.f64 fd170, fd595, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, fd132, fd149; +add.f64 fd584, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd583, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd581, fd167, 0d3FED906BCF328D46; +mul.f64 fd582, fd585, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd581, fd582; +mul.f64 fd182, fd585, 0d3FED906BCF328D46; 
+fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd584, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd583, 0d3FED906BCF328D46; +mul.f64 fd580, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd580, fd189; +mul.f64 fd191, fd583, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd579, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd579, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd577, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd578, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd577, fd578; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd575, fd177, 0dBFED906BCF328D46; +mul.f64 fd576, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd575, fd576; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd574, fd604, fd183; +sub.f64 fd213, fd604, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd573, fd603, fd187; +sub.f64 fd217, fd603, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd572, fd602, fd192; +sub.f64 fd221, fd602, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd571, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd570, fd113, fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd569, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd568, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r15, %tid.x; +shl.b32 r7, r15, 8; +and.b32 r8, r7, -1024; +add.s32 r9, r4, r8; +and.b32 r14, r15, 3; 
+shl.b32 r10, r15, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd574, fd239; +mul.f64 fd244, fd238, fd574; +mul.f64 fd246, fd239, fd239; +mul.f64 fd567, fd238, fd238; +sub.f64 fd247, fd567, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd573, fd249; +mul.f64 fd252, fd247, fd573; +mul.f64 fd565, fd238, fd247; +mul.f64 fd566, fd239, fd249; +sub.f64 fd255, fd565, fd566; +mul.f64 fd564, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd572, fd257; +mul.f64 fd260, fd255, fd572; +mul.f64 fd262, fd239, fd257; +mul.f64 fd563, fd238, fd255; +sub.f64 fd263, fd563, fd262; +mul.f64 fd562, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd571, fd265; +mul.f64 fd268, fd263, fd571; +mul.f64 fd270, fd239, fd265; +mul.f64 fd561, fd238, fd263; +sub.f64 fd271, fd561, fd270; +mul.f64 fd560, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd570, fd273; +mul.f64 fd276, fd271, fd570; +mul.f64 fd558, fd238, fd271; +mul.f64 fd559, fd239, fd273; +sub.f64 fd279, fd558, fd559; +mul.f64 fd557, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd569, fd281; +mul.f64 fd284, fd279, fd569; +mul.f64 fd286, fd239, fd281; +mul.f64 fd556, fd238, fd279; +sub.f64 fd287, fd556, fd286; +mul.f64 fd555, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd290, fd568, fd289; +mul.f64 fd292, fd287, fd568; +mul.f64 fd294, fd239, fd289; +mul.f64 fd554, fd238, fd287; +sub.f64 fd295, fd554, fd294; +mul.f64 fd553, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd552, fd605, fd586; +mul.f64 fd298, fd552, fd297; +sub.f64 fd551, fd106, fd163; +mul.f64 fd299, fd551, fd297; 
+mul.f64 fd300, fd295, fd552; +ld.global.v2.f64 {fd301, fd302}, [rd5+64]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd549, fd238, fd301; +mul.f64 fd550, fd239, fd302; +sub.f64 fd310, fd549, fd550; +mul.f64 fd548, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd547, fd238, fd310; +sub.f64 fd318, fd547, fd317; +mul.f64 fd546, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd544, fd238, fd318; +mul.f64 fd545, fd239, fd320; +sub.f64 fd326, fd544, fd545; +mul.f64 fd543, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd541, fd238, fd326; +mul.f64 fd542, fd239, fd328; +sub.f64 fd334, fd541, fd542; +mul.f64 fd540, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd539, fd238, fd334; +sub.f64 fd342, fd539, fd341; +mul.f64 fd538, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd536, fd238, fd342; +mul.f64 fd537, fd239, fd344; +sub.f64 fd350, fd536, fd537; +mul.f64 fd535, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd534, fd210, fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 768; +add.s32 r12, r9, r11; +mov.u32 r17, %tid.x; +and.b32 r16, r17, 3; +sub.f64 fd624, fd605, fd586; +mul.f64 fd623, fd295, fd624; +add.f64 fd356, fd605, fd586; +mov.u32 r19, %tid.x; +and.b32 r18, r19, 3; +sub.f64 fd622, fd106, fd163; +add.f64 fd357, fd106, fd163; 
+st.shared.v2.f64 [r12], {fd357, fd356}; +mov.u32 r21, %tid.x; +and.b32 r20, r21, 3; +mov.u32 r23, %tid.x; +and.b32 r22, r23, 3; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd534; +st.shared.v2.f64 [r12+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd564; +st.shared.v2.f64 [r12+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd562; +st.shared.v2.f64 [r12+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd560; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r12+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd557; +st.shared.v2.f64 [r12+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd555; +st.shared.v2.f64 [r12+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd553; +st.shared.v2.f64 [r12+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd622, fd298; +sub.f64 fd373, fd623, fd299; +st.shared.v2.f64 [r12+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd548; +st.shared.v2.f64 [r12+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd546; +st.shared.v2.f64 [r12+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd543; +st.shared.v2.f64 [r12+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd540; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r12+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, fd538; +st.shared.v2.f64 [r12+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd535; +st.shared.v2.f64 [r12+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r12+240], {fd386, fd387}; +barrier.sync 0; +mad.lo.s32 r13, r22, -240, r12; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+64]; 
+ld.shared.v2.f64 {fd396, fd397}, [r13+128]; +ld.shared.v2.f64 {fd400, fd401}, [r13+192]; +ld.shared.v2.f64 {fd404, fd405}, [r13+256]; +ld.shared.v2.f64 {fd408, fd409}, [r13+320]; +ld.shared.v2.f64 {fd412, fd413}, [r13+384]; +ld.shared.v2.f64 {fd416, fd417}, [r13+448]; +ld.shared.v2.f64 {fd420, fd421}, [r13+512]; +ld.shared.v2.f64 {fd424, fd425}, [r13+576]; +ld.shared.v2.f64 {fd428, fd429}, [r13+640]; +ld.shared.v2.f64 {fd432, fd433}, [r13+704]; +ld.shared.v2.f64 {fd436, fd437}, [r13+768]; +ld.shared.v2.f64 {fd440, fd441}, [r13+832]; +ld.shared.v2.f64 {fd444, fd445}, [r13+896]; +ld.shared.v2.f64 {fd448, fd449}, [r13+960]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd533, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd532, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd392, fd424; +sub.f64 fd462, fd392, fd424; +add.f64 fd531, fd393, fd425; +sub.f64 fd463, fd393, fd425; +add.f64 fd464, fd408, fd440; +sub.f64 fd466, fd408, fd440; +add.f64 fd530, fd409, fd441; +sub.f64 fd467, fd409, fd441; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd529, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd528, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd400, fd432; +sub.f64 fd478, fd400, fd432; +add.f64 fd527, fd401, fd433; +sub.f64 fd479, fd401, fd433; +add.f64 fd480, fd416, fd448; +sub.f64 fd482, fd416, fd448; +add.f64 fd526, fd417, fd449; +sub.f64 fd483, fd417, fd449; +add.f64 %0, fd452, fd456; +add.f64 %1, fd533, fd532; +add.f64 %2, fd460, fd464; +add.f64 %3, fd531, fd530; +add.f64 %4, fd468, fd472; +add.f64 %5, fd529, fd528; +add.f64 %7, fd527, fd526; +add.f64 %6, fd476, fd480; +add.f64 %9, fd455, fd458; +sub.f64 %8, fd454, fd459; +sub.f64 %10, fd462, fd467; +add.f64 %11, fd463, fd466; +sub.f64 %12, fd470, fd475; +add.f64 %13, fd471, fd474; +sub.f64 %14, fd478, fd483; 
+add.f64 %15, fd479, fd482; +sub.f64 %17, fd533, fd532; +sub.f64 %16, fd452, fd456; +sub.f64 %19, fd531, fd530; +sub.f64 %18, fd460, fd464; +sub.f64 %21, fd529, fd528; +sub.f64 %20, fd468, fd472; +sub.f64 %23, fd527, fd526; +sub.f64 %22, fd476, fd480; +sub.f64 %25, fd455, fd458; +add.f64 %24, fd454, fd459; +sub.f64 %27, fd463, fd466; +add.f64 %26, fd462, fd467; +sub.f64 %29, fd471, fd474; +add.f64 %28, fd470, fd475; +sub.f64 %31, fd479, fd482; +add.f64 %30, fd478, fd483; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<620, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<484>; +.reg .b64 rd<6>; +mov.u32 r1, 
%tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %34, %55; +add.f64 fd66, %35, %57; +sub.f64 fd67, %34, %55; +sub.f64 fd68, %35, %57; +add.f64 fd69, %44, %66; +add.f64 fd70, %46, %67; +sub.f64 fd71, %44, %66; +sub.f64 fd72, %46, %67; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %39, %60; +add.f64 fd82, %41, %62; +sub.f64 fd83, %39, %60; +sub.f64 fd84, %41, %62; +add.f64 fd85, %50, %71; +add.f64 fd86, %51, %73; +sub.f64 fd87, %50, %71; +sub.f64 fd88, %51, %73; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %36, %58; +add.f64 fd123, %38, %59; +sub.f64 fd124, %36, %58; +sub.f64 fd125, %38, %59; +add.f64 fd126, %47, %68; +add.f64 fd127, %49, %70; +sub.f64 fd128, %47, %68; +sub.f64 fd129, %49, %70; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 
fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %63; +add.f64 fd139, %43, %65; +sub.f64 fd140, %42, %63; +sub.f64 fd141, %43, %65; +add.f64 fd142, %52, %74; +add.f64 fd143, %54, %75; +sub.f64 fd144, %52, %74; +sub.f64 fd145, %54, %75; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 
0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; +sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 3; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, 
fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; 
+fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+64]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, 
fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -512; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 384; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+32]; +ld.shared.f64 fd390, [r13+64]; +ld.shared.f64 fd391, [r13+96]; +ld.shared.f64 fd392, [r13+128]; +ld.shared.f64 fd393, [r13+160]; +ld.shared.f64 fd394, [r13+192]; +ld.shared.f64 fd395, [r13+224]; +ld.shared.f64 fd396, [r13+256]; +ld.shared.f64 fd397, [r13+288]; +ld.shared.f64 fd398, [r13+320]; +ld.shared.f64 fd399, [r13+352]; +ld.shared.f64 fd400, [r13+384]; +ld.shared.f64 fd401, [r13+416]; +ld.shared.f64 fd402, [r13+448]; +ld.shared.f64 fd403, [r13+480]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+32]; +ld.shared.f64 fd406, [r13+64]; +ld.shared.f64 fd407, [r13+96]; +ld.shared.f64 fd408, [r13+128]; +ld.shared.f64 fd409, [r13+160]; +ld.shared.f64 fd410, [r13+192]; +ld.shared.f64 
fd411, [r13+224]; +ld.shared.f64 fd412, [r13+256]; +ld.shared.f64 fd413, [r13+288]; +ld.shared.f64 fd414, [r13+320]; +ld.shared.f64 fd415, [r13+352]; +ld.shared.f64 fd416, [r13+384]; +ld.shared.f64 fd417, [r13+416]; +ld.shared.f64 fd418, [r13+448]; +ld.shared.f64 fd419, [r13+480]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd389, fd397; +add.f64 fd429, fd405, fd413; +sub.f64 fd430, fd389, fd397; +sub.f64 fd431, fd405, fd413; +add.f64 fd432, fd393, fd401; +add.f64 fd433, fd409, fd417; +sub.f64 fd434, fd393, fd401; +sub.f64 fd435, fd409, fd417; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd391, fd399; +add.f64 fd445, fd407, fd415; +sub.f64 fd446, fd391, fd399; +sub.f64 fd447, fd407, fd415; +add.f64 fd448, fd395, fd403; +add.f64 fd449, fd411, fd419; +sub.f64 fd450, fd395, fd403; +sub.f64 fd451, fd411, fd419; +add.f64 %0, fd420, fd424; +add.f64 %1, fd421, fd425; +add.f64 %2, fd428, fd432; +add.f64 %3, fd429, fd433; +add.f64 %4, fd436, fd440; +add.f64 %5, fd437, fd441; +add.f64 %6, fd444, fd448; +add.f64 %7, fd445, fd449; +add.f64 %9, fd423, fd426; +sub.f64 %8, fd422, fd427; +add.f64 %11, fd431, fd434; +sub.f64 %10, fd430, fd435; +add.f64 %13, fd439, fd442; +sub.f64 %12, fd438, fd443; +add.f64 %15, fd447, fd450; +sub.f64 %14, fd446, fd451; +sub.f64 %16, fd420, fd424; +sub.f64 %17, fd421, fd425; +sub.f64 %18, fd428, fd432; +sub.f64 %19, fd429, fd433; +sub.f64 %20, fd436, fd440; +sub.f64 %21, fd437, fd441; +sub.f64 %22, fd444, fd448; +sub.f64 %23, fd445, fd449; +sub.f64 %25, fd423, fd426; +add.f64 %24, fd422, fd427; +sub.f64 %27, fd431, fd434; +add.f64 
%26, fd430, fd435; +sub.f64 %29, fd439, fd442; +add.f64 %28, fd438, fd443; +sub.f64 %31, fd447, fd450; +add.f64 %30, fd446, fd451; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_64), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<622, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<98>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 9; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %10, %12; +add.f64 fd10, %11, %13; +sub.f64 fd11, %10, %12; +sub.f64 fd12, %11, %13; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, 
fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -512; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 496; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 248; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+256]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+256]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 480; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 240; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+256]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+256]; +add.f64 fd43, fd39, fd40; +add.f64 fd44, fd41, fd42; +sub.f64 fd45, fd39, fd40; +sub.f64 fd46, fd41, fd42; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd47, fd48}, [rd11]; +mul.f64 fd51, fd46, fd48; +fma.rn.f64 fd52, fd47, fd45, fd51; +mul.f64 fd53, fd45, fd48; +mul.f64 fd54, fd47, fd46; +sub.f64 fd55, fd54, fd53; +and.b32 r22, r11, 24; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 448; +add.s32 r25, r23, r24; +st.shared.f64 [r25], fd43; +st.shared.f64 [r25+32], fd52; +barrier.sync 0; +and.b32 r26, r11, 224; +sub.s32 r27, r25, 
r26; +ld.shared.f64 fd56, [r27]; +ld.shared.f64 fd57, [r27+256]; +barrier.sync 0; +st.shared.f64 [r25], fd44; +st.shared.f64 [r25+32], fd55; +barrier.sync 0; +ld.shared.f64 fd58, [r27]; +ld.shared.f64 fd59, [r27+256]; +add.f64 fd60, fd56, fd57; +add.f64 fd61, fd58, fd59; +sub.f64 fd62, fd56, fd57; +sub.f64 fd63, fd58, fd59; +bfe.u32 r28, r5, 3, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd64, fd65}, [rd14]; +mul.f64 fd68, fd63, fd65; +fma.rn.f64 fd69, fd64, fd62, fd68; +mul.f64 fd70, fd62, fd65; +mul.f64 fd71, fd64, fd63; +sub.f64 fd72, fd71, fd70; +and.b32 r29, r11, 56; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 384; +add.s32 r32, r30, r31; +st.shared.f64 [r32], fd60; +st.shared.f64 [r32+64], fd69; +barrier.sync 0; +and.b32 r33, r11, 192; +sub.s32 r34, r32, r33; +ld.shared.f64 fd73, [r34]; +ld.shared.f64 fd74, [r34+256]; +barrier.sync 0; +st.shared.f64 [r32], fd61; +st.shared.f64 [r32+64], fd72; +barrier.sync 0; +ld.shared.f64 fd75, [r34]; +ld.shared.f64 fd76, [r34+256]; +add.f64 fd77, fd73, fd74; +add.f64 fd78, fd75, fd76; +sub.f64 fd79, fd73, fd74; +sub.f64 fd80, fd75, fd76; +and.b32 r35, r5, 16; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd81, fd82}, [rd17]; +mul.f64 fd85, fd80, fd82; +fma.rn.f64 fd86, fd81, fd79, fd85; +mul.f64 fd87, fd79, fd82; +mul.f64 fd88, fd81, fd80; +sub.f64 fd89, fd88, fd87; +and.b32 r36, r11, 120; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 256; +add.s32 r39, r37, r38; +st.shared.f64 [r39], fd77; +st.shared.f64 [r39+128], fd86; +barrier.sync 0; +and.b32 r40, r11, 128; +sub.s32 r41, r39, r40; +ld.shared.f64 fd90, [r41]; +ld.shared.f64 fd91, [r41+256]; +barrier.sync 0; +st.shared.f64 [r39], fd78; +st.shared.f64 [r39+128], fd89; +barrier.sync 0; +ld.shared.f64 fd92, [r41]; +ld.shared.f64 fd93, [r41+256]; +add.f64 %0, fd90, fd91; +add.f64 %1, fd92, fd93; +sub.f64 %2, fd90, fd91; +sub.f64 %3, fd92, fd93; +})" + : 
"=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<621, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<42>; +.reg .f64 fd<118>; +.reg .b64 rd<18>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 10; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %10, %12; +sub.f64 fd10, %11, %13; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -1024; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 496; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 992; +add.s32 r11, r8, r10; +add.f64 fd18, %11, %13; +add.f64 fd19, %10, %12; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 496; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+512]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 4; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 960; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 480; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 
{fd47, fd48}, [r20+512]; +sub.f64 fd51, fd43, fd47; +sub.f64 fd52, fd44, fd48; +bfe.u32 r21, r5, 2, 3; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %7; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd53, fd54}, [rd11]; +mul.f64 fd57, fd52, fd54; +mul.f64 fd58, fd51, fd54; +mul.f64 fd59, fd53, fd52; +and.b32 r22, r9, 48; +add.s32 r23, r8, r22; +barrier.sync 0; +and.b32 r24, r6, 896; +add.s32 r25, r23, r24; +add.f64 fd60, fd44, fd48; +add.f64 fd61, fd43, fd47; +st.shared.v2.f64 [r25], {fd61, fd60}; +fma.rn.f64 fd62, fd53, fd51, fd57; +sub.f64 fd63, fd59, fd58; +st.shared.v2.f64 [r25+64], {fd62, fd63}; +barrier.sync 0; +and.b32 r26, r9, 448; +sub.s32 r27, r25, r26; +ld.shared.v2.f64 {fd64, fd65}, [r27]; +ld.shared.v2.f64 {fd68, fd69}, [r27+512]; +sub.f64 fd72, fd64, fd68; +sub.f64 fd73, fd65, fd69; +bfe.u32 r28, r5, 3, 2; +mul.wide.u32 rd12, r28, 16; +mov.u64 rd13, %8; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd74, fd75}, [rd14]; +mul.f64 fd78, fd73, fd75; +mul.f64 fd79, fd72, fd75; +mul.f64 fd80, fd74, fd73; +and.b32 r29, r9, 112; +add.s32 r30, r8, r29; +barrier.sync 0; +and.b32 r31, r6, 768; +add.s32 r32, r30, r31; +add.f64 fd81, fd65, fd69; +add.f64 fd82, fd64, fd68; +st.shared.v2.f64 [r32], {fd82, fd81}; +fma.rn.f64 fd83, fd74, fd72, fd78; +sub.f64 fd84, fd80, fd79; +st.shared.v2.f64 [r32+128], {fd83, fd84}; +barrier.sync 0; +and.b32 r33, r9, 384; +sub.s32 r34, r32, r33; +ld.shared.v2.f64 {fd85, fd86}, [r34]; +ld.shared.v2.f64 {fd89, fd90}, [r34+512]; +sub.f64 fd93, fd85, fd89; +sub.f64 fd94, fd86, fd90; +and.b32 r35, r5, 16; +cvt.u64.u32 rd15, r35; +mov.u64 rd16, %9; +add.s64 rd17, rd16, rd15; +ld.global.v2.f64 {fd95, fd96}, [rd17]; +mul.f64 fd99, fd94, fd96; +mul.f64 fd100, fd93, fd96; +mul.f64 fd101, fd95, fd94; +and.b32 r36, r9, 240; +add.s32 r37, r8, r36; +barrier.sync 0; +and.b32 r38, r6, 512; +add.s32 r39, r37, r38; +add.f64 fd102, fd86, fd90; +add.f64 fd103, fd85, fd89; +st.shared.v2.f64 [r39], {fd103, fd102}; +fma.rn.f64 fd104, fd95, fd93, fd99; 
+sub.f64 fd105, fd101, fd100; +st.shared.v2.f64 [r39+256], {fd104, fd105}; +barrier.sync 0; +and.b32 r40, r9, 256; +sub.s32 r41, r39, r40; +ld.shared.v2.f64 {fd106, fd107}, [r41]; +ld.shared.v2.f64 {fd110, fd111}, [r41+512]; +add.f64 %1, fd107, fd111; +add.f64 %0, fd106, fd110; +sub.f64 %3, fd107, fd111; +sub.f64 %2, fd106, fd110; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_64), "l"(lut_dp_2_32), "l"(lut_dp_2_16), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..8acdb52f4f7cc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_fwd.hpp.inc @@ -0,0 +1,32090 @@ +#ifndef CUFFTDX_FFT_6561_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_6561_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<897, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<903>; +.reg .b32 r<9678>; +.reg .b64 rd<6>; +mov.u32 r9604, %54; +mov.u32 r9677, %tid.y; +mad.lo.s32 r9605, r9677, 52488, r9604; +mov.u32 r9606, %tid.x; +mov.f32 f894, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1, {low, high}; +} +mov.f32 f896, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ 
+add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r177, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f854, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r265, {low, high}; +} +mov.f32 f856, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r266, {low, high}; +} +mov.f32 f858, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r267, {low, high}; +} +mov.f32 f860, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r268, {low, high}; +} +mov.f32 f866, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r271, {low, high}; +} +mov.f32 f868, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ 
+sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r433, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 
r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} 
+{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, 
r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, 
f894; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, 
r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %81, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %82; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %81, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %82; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %81, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %82; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %82; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %81, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %82; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %81, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 
r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 
r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; 
+cvt.rn.f16.f32 high, f896; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 
r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 
high, f860; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, 
r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, 
r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, 
r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} 
+{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ 
+sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 
r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 
r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, 
r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r9606, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r9607, rd3; +mul.lo.s32 r9608, r9607, 243; +sub.s32 r9609, r9606, r9608; +cvt.rn.f32.u32 f897, r9609; +mul.f32 f898, f897, 0f3A7B0B40; +cos.approx.f32 f309, f898; +sin.approx.f32 f899, f898; +neg.f32 f310, f899; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, 
r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ 
+fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 
r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ 
+mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; 
+mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, 
r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ 
+fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r9610, r9607, 52488, r9605; +barrier.sync 0; +mad.lo.s32 r9611, r9609, 216, r9610; +st.shared.v2.f32 [r9611], {r2140, r2146}; +st.shared.v2.f32 [r9611+8], {r2937, r2944}; +st.shared.v2.f32 [r9611+16], {r2974, r2981}; +st.shared.v2.f32 [r9611+24], {r3011, r3018}; +st.shared.v2.f32 [r9611+32], {r3048, r3055}; +st.shared.v2.f32 [r9611+40], {r3085, r3092}; +st.shared.v2.f32 [r9611+48], {r3122, r3129}; +st.shared.v2.f32 [r9611+56], {r3159, r3166}; +st.shared.v2.f32 [r9611+64], {r3196, r3203}; +st.shared.v2.f32 [r9611+72], {r3233, r3240}; +st.shared.v2.f32 [r9611+80], {r3270, r3277}; +st.shared.v2.f32 [r9611+88], {r3307, r3314}; +st.shared.v2.f32 [r9611+96], {r3344, r3351}; +st.shared.v2.f32 
[r9611+104], {r3381, r3388}; +st.shared.v2.f32 [r9611+112], {r3418, r3425}; +st.shared.v2.f32 [r9611+120], {r3455, r3462}; +st.shared.v2.f32 [r9611+128], {r3492, r3499}; +st.shared.v2.f32 [r9611+136], {r3529, r3536}; +st.shared.v2.f32 [r9611+144], {r3566, r3573}; +st.shared.v2.f32 [r9611+152], {r3603, r3610}; +st.shared.v2.f32 [r9611+160], {r3640, r3647}; +st.shared.v2.f32 [r9611+168], {r3677, r3684}; +st.shared.v2.f32 [r9611+176], {r3714, r3721}; +st.shared.v2.f32 [r9611+184], {r3751, r3758}; +st.shared.v2.f32 [r9611+192], {r3788, r3795}; +st.shared.v2.f32 [r9611+200], {r3825, r3832}; +st.shared.v2.f32 [r9611+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r9612, r9609, -208, r9611; +ld.shared.u32 r3898, [r9612]; +ld.shared.u32 r3904, [r9612+4]; +ld.shared.u32 r4506, [r9612+1944]; +ld.shared.u32 r4512, [r9612+1948]; +ld.shared.u32 r5114, [r9612+3888]; +ld.shared.u32 r5120, [r9612+3892]; +ld.shared.u32 r3986, [r9612+5832]; +ld.shared.u32 r3992, [r9612+5836]; +ld.shared.u32 r4594, [r9612+7776]; +ld.shared.u32 r4600, [r9612+7780]; +ld.shared.u32 r5202, [r9612+9720]; +ld.shared.u32 r5208, [r9612+9724]; +ld.shared.u32 r4074, [r9612+11664]; +ld.shared.u32 r4080, [r9612+11668]; +ld.shared.u32 r4682, [r9612+13608]; +ld.shared.u32 r4688, [r9612+13612]; +ld.shared.u32 r5290, [r9612+15552]; +ld.shared.u32 r5296, [r9612+15556]; +ld.shared.u32 r3895, [r9612+17496]; +ld.shared.u32 r3901, [r9612+17500]; +ld.shared.u32 r4503, [r9612+19440]; +ld.shared.u32 r4509, [r9612+19444]; +ld.shared.u32 r5111, [r9612+21384]; +ld.shared.u32 r5117, [r9612+21388]; +ld.shared.u32 r3983, [r9612+23328]; +ld.shared.u32 r3989, [r9612+23332]; +ld.shared.u32 r4591, [r9612+25272]; +ld.shared.u32 r4597, [r9612+25276]; +ld.shared.u32 r5199, [r9612+27216]; +ld.shared.u32 r5205, [r9612+27220]; +ld.shared.u32 r4071, [r9612+29160]; +ld.shared.u32 r4077, [r9612+29164]; +ld.shared.u32 r4679, [r9612+31104]; +ld.shared.u32 r4685, [r9612+31108]; +ld.shared.u32 r5287, [r9612+33048]; +ld.shared.u32 r5293, 
[r9612+33052]; +ld.shared.u32 r3896, [r9612+34992]; +ld.shared.u32 r3902, [r9612+34996]; +ld.shared.u32 r4504, [r9612+36936]; +ld.shared.u32 r4510, [r9612+36940]; +ld.shared.u32 r5112, [r9612+38880]; +ld.shared.u32 r5118, [r9612+38884]; +ld.shared.u32 r3984, [r9612+40824]; +ld.shared.u32 r3990, [r9612+40828]; +ld.shared.u32 r4592, [r9612+42768]; +ld.shared.u32 r4598, [r9612+42772]; +ld.shared.u32 r5200, [r9612+44712]; +ld.shared.u32 r5206, [r9612+44716]; +ld.shared.u32 r4072, [r9612+46656]; +ld.shared.u32 r4078, [r9612+46660]; +ld.shared.u32 r4680, [r9612+48600]; +ld.shared.u32 r4686, [r9612+48604]; +ld.shared.u32 r5288, [r9612+50544]; +ld.shared.u32 r5294, [r9612+50548]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ 
+mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ 
+sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} 
+{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; 
+} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ 
+add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} 
+{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ 
+add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} 
+{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ 
+sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} 
+{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5376, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} 
+{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ 
+mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ 
+mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ 
+mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; 
+} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ 
+add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} 
+{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ 
+add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ 
+mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r9609, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r9613, rd5; +sub.s32 r9614, r9609, r9613; +shr.u32 r9615, r9614, 1; +add.s32 r9616, r9615, r9613; +shr.u32 r9617, r9616, 4; +cvt.rn.f32.u32 f900, r9617; +mul.f32 f901, f900, 0f3CD3D17E; +cos.approx.f32 f673, f901; +sin.approx.f32 f902, f901; +neg.f32 f674, f902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +mul.lo.s32 r9618, r9617, 27; +sub.s32 r9619, r9609, r9618; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, 
r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; 
+} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ +mul.f16x2 r6941, r6381, r6930; +} +{ +fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6814; +mov.b32 r7024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ +fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ +fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, low}; +} +{ +fma.rn.f16x2 r7109, r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; +} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ +neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 
r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, r7189; +} +{ +neg.f16x2 r7194, r7191; +} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r6814; +mov.b32 r7281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ 
+mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, r7331, r7355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; 
+mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} +{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ +mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, 
r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ +neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, {low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7618, {low, high}; +} +{ +mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ 
+fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ +mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +shl.b32 r9620, r9619, 3; +add.s32 r9621, r9610, r9620; +barrier.sync 0; +mad.lo.s32 r9622, r9617, 5832, r9621; +st.shared.u32 [r9622], r6029; +st.shared.u32 [r9622+4], r6035; +st.shared.u32 [r9622+216], r6826; +st.shared.u32 [r9622+220], r6833; +st.shared.u32 [r9622+432], r6863; +st.shared.u32 [r9622+436], r6870; +st.shared.u32 [r9622+648], r6900; +st.shared.u32 [r9622+652], r6907; +st.shared.u32 [r9622+864], r6937; +st.shared.u32 [r9622+868], r6944; +st.shared.u32 [r9622+1080], r6974; +st.shared.u32 [r9622+1084], r6981; +st.shared.u32 [r9622+1296], r7011; +st.shared.u32 [r9622+1300], r7018; +st.shared.u32 [r9622+1512], r7048; +st.shared.u32 [r9622+1516], r7055; +st.shared.u32 [r9622+1728], r7085; 
+st.shared.u32 [r9622+1732], r7092; +st.shared.u32 [r9622+1944], r7122; +st.shared.u32 [r9622+1948], r7129; +st.shared.u32 [r9622+2160], r7159; +st.shared.u32 [r9622+2164], r7166; +st.shared.u32 [r9622+2376], r7196; +st.shared.u32 [r9622+2380], r7203; +st.shared.u32 [r9622+2592], r7233; +st.shared.u32 [r9622+2596], r7240; +st.shared.u32 [r9622+2808], r7270; +st.shared.u32 [r9622+2812], r7277; +st.shared.u32 [r9622+3024], r7307; +st.shared.u32 [r9622+3028], r7314; +st.shared.u32 [r9622+3240], r7344; +st.shared.u32 [r9622+3244], r7351; +st.shared.u32 [r9622+3456], r7381; +st.shared.u32 [r9622+3460], r7388; +st.shared.u32 [r9622+3672], r7418; +st.shared.u32 [r9622+3676], r7425; +st.shared.u32 [r9622+3888], r7455; +st.shared.u32 [r9622+3892], r7462; +st.shared.u32 [r9622+4104], r7492; +st.shared.u32 [r9622+4108], r7499; +st.shared.u32 [r9622+4320], r7529; +st.shared.u32 [r9622+4324], r7536; +st.shared.u32 [r9622+4536], r7566; +st.shared.u32 [r9622+4540], r7573; +st.shared.u32 [r9622+4752], r7603; +st.shared.u32 [r9622+4756], r7610; +st.shared.u32 [r9622+4968], r7640; +st.shared.u32 [r9622+4972], r7647; +st.shared.u32 [r9622+5184], r7677; +st.shared.u32 [r9622+5188], r7684; +st.shared.u32 [r9622+5400], r7714; +st.shared.u32 [r9622+5404], r7721; +st.shared.u32 [r9622+5616], r7751; +st.shared.u32 [r9622+5620], r7758; +barrier.sync 0; +ld.shared.u32 r7787, [r9612]; +ld.shared.u32 r7793, [r9612+4]; +ld.shared.u32 r8395, [r9612+1944]; +ld.shared.u32 r8401, [r9612+1948]; +ld.shared.u32 r9003, [r9612+3888]; +ld.shared.u32 r9009, [r9612+3892]; +ld.shared.u32 r7875, [r9612+5832]; +ld.shared.u32 r7881, [r9612+5836]; +ld.shared.u32 r8483, [r9612+7776]; +ld.shared.u32 r8489, [r9612+7780]; +ld.shared.u32 r9091, [r9612+9720]; +ld.shared.u32 r9097, [r9612+9724]; +ld.shared.u32 r7963, [r9612+11664]; +ld.shared.u32 r7969, [r9612+11668]; +ld.shared.u32 r8571, [r9612+13608]; +ld.shared.u32 r8577, [r9612+13612]; +ld.shared.u32 r9179, [r9612+15552]; +ld.shared.u32 r9185, [r9612+15556]; 
+ld.shared.u32 r7784, [r9612+17496]; +ld.shared.u32 r7790, [r9612+17500]; +ld.shared.u32 r8392, [r9612+19440]; +ld.shared.u32 r8398, [r9612+19444]; +ld.shared.u32 r9000, [r9612+21384]; +ld.shared.u32 r9006, [r9612+21388]; +ld.shared.u32 r7872, [r9612+23328]; +ld.shared.u32 r7878, [r9612+23332]; +ld.shared.u32 r8480, [r9612+25272]; +ld.shared.u32 r8486, [r9612+25276]; +ld.shared.u32 r9088, [r9612+27216]; +ld.shared.u32 r9094, [r9612+27220]; +ld.shared.u32 r7960, [r9612+29160]; +ld.shared.u32 r7966, [r9612+29164]; +ld.shared.u32 r8568, [r9612+31104]; +ld.shared.u32 r8574, [r9612+31108]; +ld.shared.u32 r9176, [r9612+33048]; +ld.shared.u32 r9182, [r9612+33052]; +ld.shared.u32 r7785, [r9612+34992]; +ld.shared.u32 r7791, [r9612+34996]; +ld.shared.u32 r8393, [r9612+36936]; +ld.shared.u32 r8399, [r9612+36940]; +ld.shared.u32 r9001, [r9612+38880]; +ld.shared.u32 r9007, [r9612+38884]; +ld.shared.u32 r7873, [r9612+40824]; +ld.shared.u32 r7879, [r9612+40828]; +ld.shared.u32 r8481, [r9612+42768]; +ld.shared.u32 r8487, [r9612+42772]; +ld.shared.u32 r9089, [r9612+44712]; +ld.shared.u32 r9095, [r9612+44716]; +ld.shared.u32 r7961, [r9612+46656]; +ld.shared.u32 r7967, [r9612+46660]; +ld.shared.u32 r8569, [r9612+48600]; +ld.shared.u32 r8575, [r9612+48604]; +ld.shared.u32 r9177, [r9612+50544]; +ld.shared.u32 r9183, [r9612+50548]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 r7786, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 r7792, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, r7781; +} +{ +add.f16x2 r7810, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, 
r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ +mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 r7828, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 r7846, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 r7864, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 r7874, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 r7880, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 r7916, r7907, r7913; +} +{ +add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 r7934, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, r7946, r7869; +} +{ +add.f16x2 r7952, r7943, r7949; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 r7962, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 r7968, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 r7986, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ +add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 r8004, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 r8022, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 r8040, r8031, r8037; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8044, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8045, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8046, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8050, {low, high}; +} +{ +mul.f16x2 r8059, r7898, 
r8043; +} +{ +mul.f16x2 r8062, r7934, r8044; +} +{ +sub.f16x2 r8065, r8059, r8062; +} +{ +mul.f16x2 r8068, r7898, r8044; +} +{ +fma.rn.f16x2 r8071, r7934, r8043, r8068; +} +{ +mul.f16x2 r8075, r7986, r8045; +} +{ +mul.f16x2 r8078, r8022, r8046; +} +{ +sub.f16x2 r8081, r8075, r8078; +} +{ +mul.f16x2 r8084, r7986, r8046; +} +{ +fma.rn.f16x2 r8087, r8022, r8045, r8084; +} +{ +mul.f16x2 r8091, r7916, r8045; +} +{ +mul.f16x2 r8094, r7952, r8046; +} +{ +sub.f16x2 r8097, r8091, r8094; +} +{ +mul.f16x2 r8100, r7916, r8046; +} +{ +fma.rn.f16x2 r8103, r7952, r8045, r8100; +} +{ +mul.f16x2 r8107, r8004, r8049; +} +{ +mul.f16x2 r8110, r8040, r8050; +} +{ +sub.f16x2 r8113, r8107, r8110; +} +{ +mul.f16x2 r8116, r8004, r8050; +} +{ +fma.rn.f16x2 r8119, r8040, r8049, r8116; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8124, {low, high}; +} +{ +neg.f16x2 r8125, r8124; +} +{ +add.f16x2 r8127, r7874, r7962; +} +{ +add.f16x2 %0, r7786, r8127; +} +{ +add.f16x2 r8133, r7880, r7968; +} +{ +add.f16x2 %1, r7792, r8133; +} +{ +add.f16x2 r8139, r7874, r7962; +} +{ +mul.f16x2 r8142, r8139, r8123; +} +{ +add.f16x2 r8145, r7786, r8142; +} +{ +sub.f16x2 r8148, r7880, r7968; +} +{ +mul.f16x2 r8151, r8148, r8125; +} +{ +add.f16x2 %18, r8145, r8151; +} +{ +add.f16x2 r8157, r7874, r7962; +} +{ +mul.f16x2 r8160, r8157, r8123; +} +{ +add.f16x2 r8163, r7786, r8160; +} +{ +sub.f16x2 r8166, r7880, r7968; +} +{ +mul.f16x2 r8169, r8166, r8125; +} +{ +sub.f16x2 %36, r8163, r8169; +} +{ +add.f16x2 r8175, r7880, r7968; +} +{ +mul.f16x2 r8178, r8175, r8123; +} +{ +add.f16x2 r8181, r7792, r8178; +} +{ +sub.f16x2 r8184, r7874, r7962; +} +{ +mul.f16x2 r8187, r8184, r8125; +} +{ +sub.f16x2 %19, r8181, r8187; +} +{ +add.f16x2 r8193, r7880, r7968; +} +{ +mul.f16x2 r8196, r8193, r8123; +} +{ +add.f16x2 r8199, r7792, r8196; +} +{ +sub.f16x2 r8202, r7874, r7962; 
+} +{ +mul.f16x2 r8205, r8202, r8125; +} +{ +add.f16x2 %37, r8199, r8205; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8212, {low, high}; +} +{ +neg.f16x2 r8213, r8212; +} +{ +add.f16x2 r8215, r8065, r8081; +} +{ +add.f16x2 %6, r7810, r8215; +} +{ +add.f16x2 r8221, r8071, r8087; +} +{ +add.f16x2 %7, r7846, r8221; +} +{ +add.f16x2 r8227, r8065, r8081; +} +{ +mul.f16x2 r8230, r8227, r8211; +} +{ +add.f16x2 r8233, r7810, r8230; +} +{ +sub.f16x2 r8236, r8071, r8087; +} +{ +mul.f16x2 r8239, r8236, r8213; +} +{ +add.f16x2 %24, r8233, r8239; +} +{ +add.f16x2 r8245, r8065, r8081; +} +{ +mul.f16x2 r8248, r8245, r8211; +} +{ +add.f16x2 r8251, r7810, r8248; +} +{ +sub.f16x2 r8254, r8071, r8087; +} +{ +mul.f16x2 r8257, r8254, r8213; +} +{ +sub.f16x2 %42, r8251, r8257; +} +{ +add.f16x2 r8263, r8071, r8087; +} +{ +mul.f16x2 r8266, r8263, r8211; +} +{ +add.f16x2 r8269, r7846, r8266; +} +{ +sub.f16x2 r8272, r8065, r8081; +} +{ +mul.f16x2 r8275, r8272, r8213; +} +{ +sub.f16x2 %25, r8269, r8275; +} +{ +add.f16x2 r8281, r8071, r8087; +} +{ +mul.f16x2 r8284, r8281, r8211; +} +{ +add.f16x2 r8287, r7846, r8284; +} +{ +sub.f16x2 r8290, r8065, r8081; +} +{ +mul.f16x2 r8293, r8290, r8213; +} +{ +add.f16x2 %43, r8287, r8293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8299, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8300, {low, high}; +} +{ +neg.f16x2 r8301, r8300; +} +{ +add.f16x2 r8303, r8097, r8113; +} +{ +add.f16x2 %12, r7828, r8303; +} +{ +add.f16x2 r8309, r8103, r8119; +} +{ +add.f16x2 %13, r7864, r8309; +} +{ +add.f16x2 r8315, r8097, r8113; +} +{ +mul.f16x2 r8318, r8315, r8299; +} +{ +add.f16x2 r8321, r7828, r8318; +} +{ +sub.f16x2 r8324, r8103, r8119; +} +{ +mul.f16x2 r8327, r8324, r8301; +} +{ +add.f16x2 %30, 
r8321, r8327; +} +{ +add.f16x2 r8333, r8097, r8113; +} +{ +mul.f16x2 r8336, r8333, r8299; +} +{ +add.f16x2 r8339, r7828, r8336; +} +{ +sub.f16x2 r8342, r8103, r8119; +} +{ +mul.f16x2 r8345, r8342, r8301; +} +{ +sub.f16x2 %48, r8339, r8345; +} +{ +add.f16x2 r8351, r8103, r8119; +} +{ +mul.f16x2 r8354, r8351, r8299; +} +{ +add.f16x2 r8357, r7864, r8354; +} +{ +sub.f16x2 r8360, r8097, r8113; +} +{ +mul.f16x2 r8363, r8360, r8301; +} +{ +sub.f16x2 %31, r8357, r8363; +} +{ +add.f16x2 r8369, r8103, r8119; +} +{ +mul.f16x2 r8372, r8369, r8299; +} +{ +add.f16x2 r8375, r7864, r8372; +} +{ +sub.f16x2 r8378, r8097, r8113; +} +{ +mul.f16x2 r8381, r8378, r8301; +} +{ +add.f16x2 %49, r8375, r8381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8387, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8388, {low, high}; +} +{ +neg.f16x2 r8389, r8388; +} +{ +add.f16x2 r8391, r8392, r8393; +} +{ +add.f16x2 r8394, r8395, r8391; +} +{ +add.f16x2 r8397, r8398, r8399; +} +{ +add.f16x2 r8400, r8401, r8397; +} +{ +add.f16x2 r8403, r8392, r8393; +} +{ +mul.f16x2 r8406, r8403, r8387; +} +{ +add.f16x2 r8409, r8395, r8406; +} +{ +sub.f16x2 r8412, r8398, r8399; +} +{ +mul.f16x2 r8415, r8412, r8389; +} +{ +add.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8392, r8393; +} +{ +mul.f16x2 r8424, r8421, r8387; +} +{ +add.f16x2 r8427, r8395, r8424; +} +{ +sub.f16x2 r8430, r8398, r8399; +} +{ +mul.f16x2 r8433, r8430, r8389; +} +{ +sub.f16x2 r8436, r8427, r8433; +} +{ +add.f16x2 r8439, r8398, r8399; +} +{ +mul.f16x2 r8442, r8439, r8387; +} +{ +add.f16x2 r8445, r8401, r8442; +} +{ +sub.f16x2 r8448, r8392, r8393; +} +{ +mul.f16x2 r8451, r8448, r8389; +} +{ +sub.f16x2 r8454, r8445, r8451; +} +{ +add.f16x2 r8457, r8398, r8399; +} +{ +mul.f16x2 r8460, r8457, r8387; +} +{ +add.f16x2 r8463, r8401, r8460; +} +{ +sub.f16x2 r8466, r8392, r8393; +} +{ +mul.f16x2 r8469, r8466, r8389; +} +{ +add.f16x2 r8472, 
r8463, r8469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8476, {low, high}; +} +{ +neg.f16x2 r8477, r8476; +} +{ +add.f16x2 r8479, r8480, r8481; +} +{ +add.f16x2 r8482, r8483, r8479; +} +{ +add.f16x2 r8485, r8486, r8487; +} +{ +add.f16x2 r8488, r8489, r8485; +} +{ +add.f16x2 r8491, r8480, r8481; +} +{ +mul.f16x2 r8494, r8491, r8475; +} +{ +add.f16x2 r8497, r8483, r8494; +} +{ +sub.f16x2 r8500, r8486, r8487; +} +{ +mul.f16x2 r8503, r8500, r8477; +} +{ +add.f16x2 r8506, r8497, r8503; +} +{ +add.f16x2 r8509, r8480, r8481; +} +{ +mul.f16x2 r8512, r8509, r8475; +} +{ +add.f16x2 r8515, r8483, r8512; +} +{ +sub.f16x2 r8518, r8486, r8487; +} +{ +mul.f16x2 r8521, r8518, r8477; +} +{ +sub.f16x2 r8524, r8515, r8521; +} +{ +add.f16x2 r8527, r8486, r8487; +} +{ +mul.f16x2 r8530, r8527, r8475; +} +{ +add.f16x2 r8533, r8489, r8530; +} +{ +sub.f16x2 r8536, r8480, r8481; +} +{ +mul.f16x2 r8539, r8536, r8477; +} +{ +sub.f16x2 r8542, r8533, r8539; +} +{ +add.f16x2 r8545, r8486, r8487; +} +{ +mul.f16x2 r8548, r8545, r8475; +} +{ +add.f16x2 r8551, r8489, r8548; +} +{ +sub.f16x2 r8554, r8480, r8481; +} +{ +mul.f16x2 r8557, r8554, r8477; +} +{ +add.f16x2 r8560, r8551, r8557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8564, {low, high}; +} +{ +neg.f16x2 r8565, r8564; +} +{ +add.f16x2 r8567, r8568, r8569; +} +{ +add.f16x2 r8570, r8571, r8567; +} +{ +add.f16x2 r8573, r8574, r8575; +} +{ +add.f16x2 r8576, r8577, r8573; +} +{ +add.f16x2 r8579, r8568, r8569; +} +{ +mul.f16x2 r8582, r8579, r8563; +} +{ +add.f16x2 r8585, r8571, r8582; +} +{ +sub.f16x2 r8588, r8574, r8575; +} +{ +mul.f16x2 r8591, r8588, r8565; +} +{ +add.f16x2 r8594, r8585, r8591; +} +{ +add.f16x2 r8597, r8568, 
r8569; +} +{ +mul.f16x2 r8600, r8597, r8563; +} +{ +add.f16x2 r8603, r8571, r8600; +} +{ +sub.f16x2 r8606, r8574, r8575; +} +{ +mul.f16x2 r8609, r8606, r8565; +} +{ +sub.f16x2 r8612, r8603, r8609; +} +{ +add.f16x2 r8615, r8574, r8575; +} +{ +mul.f16x2 r8618, r8615, r8563; +} +{ +add.f16x2 r8621, r8577, r8618; +} +{ +sub.f16x2 r8624, r8568, r8569; +} +{ +mul.f16x2 r8627, r8624, r8565; +} +{ +sub.f16x2 r8630, r8621, r8627; +} +{ +add.f16x2 r8633, r8574, r8575; +} +{ +mul.f16x2 r8636, r8633, r8563; +} +{ +add.f16x2 r8639, r8577, r8636; +} +{ +sub.f16x2 r8642, r8568, r8569; +} +{ +mul.f16x2 r8645, r8642, r8565; +} +{ +add.f16x2 r8648, r8639, r8645; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8657, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8658, {low, high}; +} +{ +mul.f16x2 r8667, r8506, r8651; +} +{ +mul.f16x2 r8670, r8542, r8652; +} +{ +sub.f16x2 r8673, r8667, r8670; +} +{ +mul.f16x2 r8676, r8506, r8652; +} +{ +fma.rn.f16x2 r8679, r8542, r8651, r8676; +} +{ +mul.f16x2 r8683, r8594, r8653; +} +{ +mul.f16x2 r8686, r8630, r8654; +} +{ +sub.f16x2 r8689, r8683, r8686; +} +{ +mul.f16x2 r8692, r8594, r8654; +} +{ +fma.rn.f16x2 r8695, r8630, r8653, r8692; +} +{ +mul.f16x2 r8699, r8524, r8653; +} +{ +mul.f16x2 r8702, r8560, r8654; +} +{ +sub.f16x2 r8705, r8699, r8702; +} +{ +mul.f16x2 r8708, r8524, r8654; +} +{ +fma.rn.f16x2 r8711, r8560, r8653, r8708; +} +{ +mul.f16x2 r8715, r8612, r8657; +} +{ +mul.f16x2 r8718, r8648, 
r8658; +} +{ +sub.f16x2 r8721, r8715, r8718; +} +{ +mul.f16x2 r8724, r8612, r8658; +} +{ +fma.rn.f16x2 r8727, r8648, r8657, r8724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8732, {low, high}; +} +{ +neg.f16x2 r8733, r8732; +} +{ +add.f16x2 r8735, r8482, r8570; +} +{ +add.f16x2 %2, r8394, r8735; +} +{ +add.f16x2 r8741, r8488, r8576; +} +{ +add.f16x2 %3, r8400, r8741; +} +{ +add.f16x2 r8747, r8482, r8570; +} +{ +mul.f16x2 r8750, r8747, r8731; +} +{ +add.f16x2 r8753, r8394, r8750; +} +{ +sub.f16x2 r8756, r8488, r8576; +} +{ +mul.f16x2 r8759, r8756, r8733; +} +{ +add.f16x2 %20, r8753, r8759; +} +{ +add.f16x2 r8765, r8482, r8570; +} +{ +mul.f16x2 r8768, r8765, r8731; +} +{ +add.f16x2 r8771, r8394, r8768; +} +{ +sub.f16x2 r8774, r8488, r8576; +} +{ +mul.f16x2 r8777, r8774, r8733; +} +{ +sub.f16x2 %38, r8771, r8777; +} +{ +add.f16x2 r8783, r8488, r8576; +} +{ +mul.f16x2 r8786, r8783, r8731; +} +{ +add.f16x2 r8789, r8400, r8786; +} +{ +sub.f16x2 r8792, r8482, r8570; +} +{ +mul.f16x2 r8795, r8792, r8733; +} +{ +sub.f16x2 %21, r8789, r8795; +} +{ +add.f16x2 r8801, r8488, r8576; +} +{ +mul.f16x2 r8804, r8801, r8731; +} +{ +add.f16x2 r8807, r8400, r8804; +} +{ +sub.f16x2 r8810, r8482, r8570; +} +{ +mul.f16x2 r8813, r8810, r8733; +} +{ +add.f16x2 %39, r8807, r8813; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8819, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8820, {low, high}; +} +{ +neg.f16x2 r8821, r8820; +} +{ +add.f16x2 r8823, r8673, r8689; +} +{ +add.f16x2 %8, r8418, r8823; +} +{ +add.f16x2 r8829, r8679, r8695; +} +{ +add.f16x2 %9, r8454, r8829; +} +{ +add.f16x2 r8835, r8673, r8689; +} +{ +mul.f16x2 r8838, r8835, r8819; +} +{ +add.f16x2 r8841, r8418, r8838; +} +{ +sub.f16x2 r8844, r8679, r8695; +} +{ 
+mul.f16x2 r8847, r8844, r8821; +} +{ +add.f16x2 %26, r8841, r8847; +} +{ +add.f16x2 r8853, r8673, r8689; +} +{ +mul.f16x2 r8856, r8853, r8819; +} +{ +add.f16x2 r8859, r8418, r8856; +} +{ +sub.f16x2 r8862, r8679, r8695; +} +{ +mul.f16x2 r8865, r8862, r8821; +} +{ +sub.f16x2 %44, r8859, r8865; +} +{ +add.f16x2 r8871, r8679, r8695; +} +{ +mul.f16x2 r8874, r8871, r8819; +} +{ +add.f16x2 r8877, r8454, r8874; +} +{ +sub.f16x2 r8880, r8673, r8689; +} +{ +mul.f16x2 r8883, r8880, r8821; +} +{ +sub.f16x2 %27, r8877, r8883; +} +{ +add.f16x2 r8889, r8679, r8695; +} +{ +mul.f16x2 r8892, r8889, r8819; +} +{ +add.f16x2 r8895, r8454, r8892; +} +{ +sub.f16x2 r8898, r8673, r8689; +} +{ +mul.f16x2 r8901, r8898, r8821; +} +{ +add.f16x2 %45, r8895, r8901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8908, {low, high}; +} +{ +neg.f16x2 r8909, r8908; +} +{ +add.f16x2 r8911, r8705, r8721; +} +{ +add.f16x2 %14, r8436, r8911; +} +{ +add.f16x2 r8917, r8711, r8727; +} +{ +add.f16x2 %15, r8472, r8917; +} +{ +add.f16x2 r8923, r8705, r8721; +} +{ +mul.f16x2 r8926, r8923, r8907; +} +{ +add.f16x2 r8929, r8436, r8926; +} +{ +sub.f16x2 r8932, r8711, r8727; +} +{ +mul.f16x2 r8935, r8932, r8909; +} +{ +add.f16x2 %32, r8929, r8935; +} +{ +add.f16x2 r8941, r8705, r8721; +} +{ +mul.f16x2 r8944, r8941, r8907; +} +{ +add.f16x2 r8947, r8436, r8944; +} +{ +sub.f16x2 r8950, r8711, r8727; +} +{ +mul.f16x2 r8953, r8950, r8909; +} +{ +sub.f16x2 %50, r8947, r8953; +} +{ +add.f16x2 r8959, r8711, r8727; +} +{ +mul.f16x2 r8962, r8959, r8907; +} +{ +add.f16x2 r8965, r8472, r8962; +} +{ +sub.f16x2 r8968, r8705, r8721; +} +{ +mul.f16x2 r8971, r8968, r8909; +} +{ +sub.f16x2 %33, r8965, r8971; +} +{ +add.f16x2 r8977, r8711, r8727; +} +{ +mul.f16x2 r8980, r8977, r8907; +} +{ +add.f16x2 r8983, r8472, r8980; +} +{ +sub.f16x2 r8986, r8705, r8721; +} +{ +mul.f16x2 
r8989, r8986, r8909; +} +{ +add.f16x2 %51, r8983, r8989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8996, {low, high}; +} +{ +neg.f16x2 r8997, r8996; +} +{ +add.f16x2 r8999, r9000, r9001; +} +{ +add.f16x2 r9002, r9003, r8999; +} +{ +add.f16x2 r9005, r9006, r9007; +} +{ +add.f16x2 r9008, r9009, r9005; +} +{ +add.f16x2 r9011, r9000, r9001; +} +{ +mul.f16x2 r9014, r9011, r8995; +} +{ +add.f16x2 r9017, r9003, r9014; +} +{ +sub.f16x2 r9020, r9006, r9007; +} +{ +mul.f16x2 r9023, r9020, r8997; +} +{ +add.f16x2 r9026, r9017, r9023; +} +{ +add.f16x2 r9029, r9000, r9001; +} +{ +mul.f16x2 r9032, r9029, r8995; +} +{ +add.f16x2 r9035, r9003, r9032; +} +{ +sub.f16x2 r9038, r9006, r9007; +} +{ +mul.f16x2 r9041, r9038, r8997; +} +{ +sub.f16x2 r9044, r9035, r9041; +} +{ +add.f16x2 r9047, r9006, r9007; +} +{ +mul.f16x2 r9050, r9047, r8995; +} +{ +add.f16x2 r9053, r9009, r9050; +} +{ +sub.f16x2 r9056, r9000, r9001; +} +{ +mul.f16x2 r9059, r9056, r8997; +} +{ +sub.f16x2 r9062, r9053, r9059; +} +{ +add.f16x2 r9065, r9006, r9007; +} +{ +mul.f16x2 r9068, r9065, r8995; +} +{ +add.f16x2 r9071, r9009, r9068; +} +{ +sub.f16x2 r9074, r9000, r9001; +} +{ +mul.f16x2 r9077, r9074, r8997; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9083, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9084, {low, high}; +} +{ +neg.f16x2 r9085, r9084; +} +{ +add.f16x2 r9087, r9088, r9089; +} +{ +add.f16x2 r9090, r9091, r9087; +} +{ +add.f16x2 r9093, r9094, r9095; +} +{ +add.f16x2 r9096, r9097, r9093; +} +{ +add.f16x2 r9099, r9088, r9089; +} +{ +mul.f16x2 r9102, r9099, r9083; +} +{ +add.f16x2 r9105, r9091, r9102; +} +{ +sub.f16x2 r9108, r9094, r9095; +} +{ +mul.f16x2 r9111, r9108, r9085; +} +{ +add.f16x2 r9114, 
r9105, r9111; +} +{ +add.f16x2 r9117, r9088, r9089; +} +{ +mul.f16x2 r9120, r9117, r9083; +} +{ +add.f16x2 r9123, r9091, r9120; +} +{ +sub.f16x2 r9126, r9094, r9095; +} +{ +mul.f16x2 r9129, r9126, r9085; +} +{ +sub.f16x2 r9132, r9123, r9129; +} +{ +add.f16x2 r9135, r9094, r9095; +} +{ +mul.f16x2 r9138, r9135, r9083; +} +{ +add.f16x2 r9141, r9097, r9138; +} +{ +sub.f16x2 r9144, r9088, r9089; +} +{ +mul.f16x2 r9147, r9144, r9085; +} +{ +sub.f16x2 r9150, r9141, r9147; +} +{ +add.f16x2 r9153, r9094, r9095; +} +{ +mul.f16x2 r9156, r9153, r9083; +} +{ +add.f16x2 r9159, r9097, r9156; +} +{ +sub.f16x2 r9162, r9088, r9089; +} +{ +mul.f16x2 r9165, r9162, r9085; +} +{ +add.f16x2 r9168, r9159, r9165; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9171, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9172, {low, high}; +} +{ +neg.f16x2 r9173, r9172; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 r9184, r9185, r9181; +} +{ +add.f16x2 r9187, r9176, r9177; +} +{ +mul.f16x2 r9190, r9187, r9171; +} +{ +add.f16x2 r9193, r9179, r9190; +} +{ +sub.f16x2 r9196, r9182, r9183; +} +{ +mul.f16x2 r9199, r9196, r9173; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +add.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9171; +} +{ +add.f16x2 r9211, r9179, r9208; +} +{ +sub.f16x2 r9214, r9182, r9183; +} +{ +mul.f16x2 r9217, r9214, r9173; +} +{ +sub.f16x2 r9220, r9211, r9217; +} +{ +add.f16x2 r9223, r9182, r9183; +} +{ +mul.f16x2 r9226, r9223, r9171; +} +{ +add.f16x2 r9229, r9185, r9226; +} +{ +sub.f16x2 r9232, r9176, r9177; +} +{ +mul.f16x2 r9235, r9232, r9173; +} +{ +sub.f16x2 r9238, r9229, r9235; +} +{ +add.f16x2 r9241, r9182, r9183; +} +{ +mul.f16x2 r9244, r9241, r9171; +} +{ +add.f16x2 r9247, r9185, r9244; +} +{ +sub.f16x2 r9250, r9176, r9177; +} +{ +mul.f16x2 r9253, r9250, r9173; +} +{ +add.f16x2 
r9256, r9247, r9253; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r9259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r9260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r9261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r9262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r9265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r9266, {low, high}; +} +{ +mul.f16x2 r9275, r9114, r9259; +} +{ +mul.f16x2 r9278, r9150, r9260; +} +{ +sub.f16x2 r9281, r9275, r9278; +} +{ +mul.f16x2 r9284, r9114, r9260; +} +{ +fma.rn.f16x2 r9287, r9150, r9259, r9284; +} +{ +mul.f16x2 r9291, r9202, r9261; +} +{ +mul.f16x2 r9294, r9238, r9262; +} +{ +sub.f16x2 r9297, r9291, r9294; +} +{ +mul.f16x2 r9300, r9202, r9262; +} +{ +fma.rn.f16x2 r9303, r9238, r9261, r9300; +} +{ +mul.f16x2 r9307, r9132, r9261; +} +{ +mul.f16x2 r9310, r9168, r9262; +} +{ +sub.f16x2 r9313, r9307, r9310; +} +{ +mul.f16x2 r9316, r9132, r9262; +} +{ +fma.rn.f16x2 r9319, r9168, r9261, r9316; +} +{ +mul.f16x2 r9323, r9220, r9265; +} +{ +mul.f16x2 r9326, r9256, r9266; +} +{ +sub.f16x2 r9329, r9323, r9326; +} +{ +mul.f16x2 r9332, r9220, r9266; +} +{ +fma.rn.f16x2 r9335, r9256, r9265, r9332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9340, {low, high}; +} +{ +neg.f16x2 r9341, r9340; +} +{ +add.f16x2 r9343, r9090, r9178; +} +{ +add.f16x2 %4, r9002, r9343; +} +{ +add.f16x2 r9349, r9096, r9184; +} +{ +add.f16x2 %5, r9008, r9349; +} +{ +add.f16x2 r9355, r9090, r9178; +} +{ +mul.f16x2 r9358, r9355, r9339; +} +{ +add.f16x2 r9361, 
r9002, r9358; +} +{ +sub.f16x2 r9364, r9096, r9184; +} +{ +mul.f16x2 r9367, r9364, r9341; +} +{ +add.f16x2 %22, r9361, r9367; +} +{ +add.f16x2 r9373, r9090, r9178; +} +{ +mul.f16x2 r9376, r9373, r9339; +} +{ +add.f16x2 r9379, r9002, r9376; +} +{ +sub.f16x2 r9382, r9096, r9184; +} +{ +mul.f16x2 r9385, r9382, r9341; +} +{ +sub.f16x2 %40, r9379, r9385; +} +{ +add.f16x2 r9391, r9096, r9184; +} +{ +mul.f16x2 r9394, r9391, r9339; +} +{ +add.f16x2 r9397, r9008, r9394; +} +{ +sub.f16x2 r9400, r9090, r9178; +} +{ +mul.f16x2 r9403, r9400, r9341; +} +{ +sub.f16x2 %23, r9397, r9403; +} +{ +add.f16x2 r9409, r9096, r9184; +} +{ +mul.f16x2 r9412, r9409, r9339; +} +{ +add.f16x2 r9415, r9008, r9412; +} +{ +sub.f16x2 r9418, r9090, r9178; +} +{ +mul.f16x2 r9421, r9418, r9341; +} +{ +add.f16x2 %41, r9415, r9421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9427, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9428, {low, high}; +} +{ +neg.f16x2 r9429, r9428; +} +{ +add.f16x2 r9431, r9281, r9297; +} +{ +add.f16x2 %10, r9026, r9431; +} +{ +add.f16x2 r9437, r9287, r9303; +} +{ +add.f16x2 %11, r9062, r9437; +} +{ +add.f16x2 r9443, r9281, r9297; +} +{ +mul.f16x2 r9446, r9443, r9427; +} +{ +add.f16x2 r9449, r9026, r9446; +} +{ +sub.f16x2 r9452, r9287, r9303; +} +{ +mul.f16x2 r9455, r9452, r9429; +} +{ +add.f16x2 %28, r9449, r9455; +} +{ +add.f16x2 r9461, r9281, r9297; +} +{ +mul.f16x2 r9464, r9461, r9427; +} +{ +add.f16x2 r9467, r9026, r9464; +} +{ +sub.f16x2 r9470, r9287, r9303; +} +{ +mul.f16x2 r9473, r9470, r9429; +} +{ +sub.f16x2 %46, r9467, r9473; +} +{ +add.f16x2 r9479, r9287, r9303; +} +{ +mul.f16x2 r9482, r9479, r9427; +} +{ +add.f16x2 r9485, r9062, r9482; +} +{ +sub.f16x2 r9488, r9281, r9297; +} +{ +mul.f16x2 r9491, r9488, r9429; +} +{ +sub.f16x2 %29, r9485, r9491; +} +{ +add.f16x2 r9497, r9287, r9303; +} +{ +mul.f16x2 r9500, r9497, r9427; +} +{ +add.f16x2 r9503, r9062, r9500; 
+} +{ +sub.f16x2 r9506, r9281, r9297; +} +{ +mul.f16x2 r9509, r9506, r9429; +} +{ +add.f16x2 %47, r9503, r9509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9516, {low, high}; +} +{ +neg.f16x2 r9517, r9516; +} +{ +add.f16x2 r9519, r9313, r9329; +} +{ +add.f16x2 %16, r9044, r9519; +} +{ +add.f16x2 r9525, r9319, r9335; +} +{ +add.f16x2 %17, r9080, r9525; +} +{ +add.f16x2 r9531, r9313, r9329; +} +{ +mul.f16x2 r9534, r9531, r9515; +} +{ +add.f16x2 r9537, r9044, r9534; +} +{ +sub.f16x2 r9540, r9319, r9335; +} +{ +mul.f16x2 r9543, r9540, r9517; +} +{ +add.f16x2 %34, r9537, r9543; +} +{ +add.f16x2 r9549, r9313, r9329; +} +{ +mul.f16x2 r9552, r9549, r9515; +} +{ +add.f16x2 r9555, r9044, r9552; +} +{ +sub.f16x2 r9558, r9319, r9335; +} +{ +mul.f16x2 r9561, r9558, r9517; +} +{ +sub.f16x2 %52, r9555, r9561; +} +{ +add.f16x2 r9567, r9319, r9335; +} +{ +mul.f16x2 r9570, r9567, r9515; +} +{ +add.f16x2 r9573, r9080, r9570; +} +{ +sub.f16x2 r9576, r9313, r9329; +} +{ +mul.f16x2 r9579, r9576, r9517; +} +{ +sub.f16x2 %35, r9573, r9579; +} +{ +add.f16x2 r9585, r9319, r9335; +} +{ +mul.f16x2 r9588, r9585, r9515; +} +{ +add.f16x2 r9591, r9080, r9588; +} +{ +sub.f16x2 r9594, r9313, r9329; +} +{ +mul.f16x2 r9597, r9594, r9517; +} +{ +add.f16x2 %53, r9591, r9597; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), 
"=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), 
"r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<896, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<903>; +.reg .b32 r<9678>; +.reg .b64 rd<6>; +mov.u32 r9604, %54; +mov.u32 r9677, %tid.y; +mad.lo.s32 r9605, r9677, 26244, r9604; +mov.u32 r9606, %tid.x; +mov.f32 f894, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1, {low, high}; +} +mov.f32 f896, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; 
+} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ 
+add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f854, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r265, {low, high}; +} +mov.f32 f856, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r266, {low, high}; +} +mov.f32 f858, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r267, {low, high}; +} +mov.f32 f860, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r268, {low, high}; +} +mov.f32 f866, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r271, {low, high}; +} +mov.f32 f868, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, 
r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ 
+add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ 
+add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, 
r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r874, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ 
+add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ 
+add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, 
%98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %81, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %82; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %81, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %82; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %81, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %82; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %82; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %81, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %82; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %81, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, 
%55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ 
+sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, 
r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, 
r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 
0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, 
r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, 
r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ 
+add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; 
+mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, 
r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ 
+sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r9606, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r9607, rd3; +mul.lo.s32 r9608, r9607, 243; +sub.s32 r9609, r9606, r9608; +mad.lo.s32 r9610, r9607, 26244, r9605; +cvt.rn.f32.u32 f897, r9609; +mul.f32 f898, f897, 0f3A7B0B40; +cos.approx.f32 f309, f898; +sin.approx.f32 f899, f898; +neg.f32 f310, f899; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 
r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3137, {low, high}; +} +{ 
+mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; 
+mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, 
r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ 
+fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ 
+neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r9611, r9609, 108, r9610; +st.shared.u32 [r9611], r2140; +st.shared.u32 [r9611+4], r2937; +st.shared.u32 [r9611+8], r2974; +st.shared.u32 [r9611+12], r3011; +st.shared.u32 [r9611+16], r3048; +st.shared.u32 [r9611+20], r3085; +st.shared.u32 [r9611+24], r3122; +st.shared.u32 [r9611+28], r3159; +st.shared.u32 [r9611+32], r3196; +st.shared.u32 [r9611+36], r3233; +st.shared.u32 [r9611+40], r3270; +st.shared.u32 [r9611+44], r3307; +st.shared.u32 [r9611+48], r3344; +st.shared.u32 [r9611+52], r3381; +st.shared.u32 [r9611+56], r3418; +st.shared.u32 [r9611+60], r3455; +st.shared.u32 [r9611+64], r3492; +st.shared.u32 [r9611+68], r3529; +st.shared.u32 [r9611+72], r3566; +st.shared.u32 [r9611+76], r3603; +st.shared.u32 [r9611+80], r3640; +st.shared.u32 [r9611+84], r3677; +st.shared.u32 [r9611+88], r3714; +st.shared.u32 [r9611+92], r3751; +st.shared.u32 
[r9611+96], r3788; +st.shared.u32 [r9611+100], r3825; +st.shared.u32 [r9611+104], r3862; +barrier.sync 0; +mad.lo.s32 r9612, r9609, -104, r9611; +ld.shared.u32 r3898, [r9612]; +ld.shared.u32 r4506, [r9612+972]; +ld.shared.u32 r5114, [r9612+1944]; +ld.shared.u32 r3986, [r9612+2916]; +ld.shared.u32 r4594, [r9612+3888]; +ld.shared.u32 r5202, [r9612+4860]; +ld.shared.u32 r4074, [r9612+5832]; +ld.shared.u32 r4682, [r9612+6804]; +ld.shared.u32 r5290, [r9612+7776]; +ld.shared.u32 r3895, [r9612+8748]; +ld.shared.u32 r4503, [r9612+9720]; +ld.shared.u32 r5111, [r9612+10692]; +ld.shared.u32 r3983, [r9612+11664]; +ld.shared.u32 r4591, [r9612+12636]; +ld.shared.u32 r5199, [r9612+13608]; +ld.shared.u32 r4071, [r9612+14580]; +ld.shared.u32 r4679, [r9612+15552]; +ld.shared.u32 r5287, [r9612+16524]; +ld.shared.u32 r3896, [r9612+17496]; +ld.shared.u32 r4504, [r9612+18468]; +ld.shared.u32 r5112, [r9612+19440]; +ld.shared.u32 r3984, [r9612+20412]; +ld.shared.u32 r4592, [r9612+21384]; +ld.shared.u32 r5200, [r9612+22356]; +ld.shared.u32 r4072, [r9612+23328]; +ld.shared.u32 r4680, [r9612+24300]; +ld.shared.u32 r5288, [r9612+25272]; +barrier.sync 0; +st.shared.u32 [r9611], r2146; +st.shared.u32 [r9611+4], r2944; +st.shared.u32 [r9611+8], r2981; +st.shared.u32 [r9611+12], r3018; +st.shared.u32 [r9611+16], r3055; +st.shared.u32 [r9611+20], r3092; +st.shared.u32 [r9611+24], r3129; +st.shared.u32 [r9611+28], r3166; +st.shared.u32 [r9611+32], r3203; +st.shared.u32 [r9611+36], r3240; +st.shared.u32 [r9611+40], r3277; +st.shared.u32 [r9611+44], r3314; +st.shared.u32 [r9611+48], r3351; +st.shared.u32 [r9611+52], r3388; +st.shared.u32 [r9611+56], r3425; +st.shared.u32 [r9611+60], r3462; +st.shared.u32 [r9611+64], r3499; +st.shared.u32 [r9611+68], r3536; +st.shared.u32 [r9611+72], r3573; +st.shared.u32 [r9611+76], r3610; +st.shared.u32 [r9611+80], r3647; +st.shared.u32 [r9611+84], r3684; +st.shared.u32 [r9611+88], r3721; +st.shared.u32 [r9611+92], r3758; +st.shared.u32 [r9611+96], r3795; 
+st.shared.u32 [r9611+100], r3832; +st.shared.u32 [r9611+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r9612]; +ld.shared.u32 r4512, [r9612+972]; +ld.shared.u32 r5120, [r9612+1944]; +ld.shared.u32 r3992, [r9612+2916]; +ld.shared.u32 r4600, [r9612+3888]; +ld.shared.u32 r5208, [r9612+4860]; +ld.shared.u32 r4080, [r9612+5832]; +ld.shared.u32 r4688, [r9612+6804]; +ld.shared.u32 r5296, [r9612+7776]; +ld.shared.u32 r3901, [r9612+8748]; +ld.shared.u32 r4509, [r9612+9720]; +ld.shared.u32 r5117, [r9612+10692]; +ld.shared.u32 r3989, [r9612+11664]; +ld.shared.u32 r4597, [r9612+12636]; +ld.shared.u32 r5205, [r9612+13608]; +ld.shared.u32 r4077, [r9612+14580]; +ld.shared.u32 r4685, [r9612+15552]; +ld.shared.u32 r5293, [r9612+16524]; +ld.shared.u32 r3902, [r9612+17496]; +ld.shared.u32 r4510, [r9612+18468]; +ld.shared.u32 r5118, [r9612+19440]; +ld.shared.u32 r3990, [r9612+20412]; +ld.shared.u32 r4598, [r9612+21384]; +ld.shared.u32 r5206, [r9612+22356]; +ld.shared.u32 r4078, [r9612+23328]; +ld.shared.u32 r4686, [r9612+24300]; +ld.shared.u32 r5294, [r9612+25272]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, 
r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; 
+} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, 
r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; 
+cvt.rn.f16.f32 high, f896; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 
r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 
r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 
r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; 
+mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ 
+sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5106, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ 
+mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, 
r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ 
+mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ 
+mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 r6029, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 r6035, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 r6053, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 
r6068, r6065, r6024; +} +{ +sub.f16x2 r6071, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 r6089, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 r6107, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 r6117, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 r6123, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 r6141, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 r6159, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 r6177, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 r6195, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 r6205, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 r6211, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 r6229, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 r6247, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 r6265, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 r6283, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 r6293, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 r6299, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 r6317, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 
r6335, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 r6353, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 r6371, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 r6381, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 r6387, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 r6405, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 r6423, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 r6441, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 r6459, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 
r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 r6469, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 r6475, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 r6493, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 r6511, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 r6529, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 r6547, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 r6557, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 r6563, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 
r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 r6617, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 r6635, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 r6645, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 r6651, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 r6669, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 r6687, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 r6705, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 r6723, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 
r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 r6733, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 r6739, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 r6757, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 r6775, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 r6793, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 r6811, r6802, r6808; +} +mul.wide.u32 rd4, r9609, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r9613, rd5; +sub.s32 r9614, r9609, r9613; +shr.u32 r9615, r9614, 1; +add.s32 r9616, r9615, r9613; +shr.u32 r9617, r9616, 4; +mul.lo.s32 r9618, r9617, 27; +sub.s32 r9619, r9609, r9618; +shl.b32 r9620, r9619, 2; +add.s32 r9621, r9610, r9620; +cvt.rn.f32.u32 f900, r9617; +mul.f32 f901, f900, 0f3CD3D17E; +cos.approx.f32 f673, f901; +sin.approx.f32 f902, f901; +neg.f32 f674, f902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6814, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6819, {high, high}; +} +{ +mul.f16x2 r6821, r6123, r6819; +} +{ +neg.f16x2 r6824, r6821; +} +{ +fma.rn.f16x2 r6826, r6117, r6817, r6824; +} +{ +mul.f16x2 r6830, r6117, r6819; +} +{ +fma.rn.f16x2 r6833, 
r6123, r6817, r6830; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6839, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6841, {low, high}; +} +{ +mul.f16x2 r6842, r6839, r6841; +} +{ +mul.f16x2 r6845, r6814, r6837; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6848, {high, low}; +} +{ +fma.rn.f16x2 r6850, r6842, r6848, r6845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6856, {high, high}; +} +{ +mul.f16x2 r6858, r6211, r6856; +} +{ +neg.f16x2 r6861, r6858; +} +{ +fma.rn.f16x2 r6863, r6205, r6854, r6861; +} +{ +mul.f16x2 r6867, r6205, r6856; +} +{ +fma.rn.f16x2 r6870, r6211, r6854, r6867; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6876, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6878, {low, high}; +} +{ +mul.f16x2 r6879, r6876, r6878; +} +{ +mul.f16x2 r6882, r6850, r6874; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6850; +mov.b32 r6885, {high, low}; +} +{ +fma.rn.f16x2 r6887, r6879, r6885, r6882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6891, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6893, {high, high}; +} +{ +mul.f16x2 r6895, r6299, r6893; +} +{ +neg.f16x2 r6898, r6895; +} +{ +fma.rn.f16x2 r6900, r6293, r6891, r6898; +} +{ +mul.f16x2 r6904, r6293, r6893; +} +{ +fma.rn.f16x2 r6907, r6299, r6891, r6904; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6913, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; 
+cvt.rn.f16.f32 high, f726; +mov.b32 r6915, {low, high}; +} +{ +mul.f16x2 r6916, r6913, r6915; +} +{ +mul.f16x2 r6919, r6887, r6911; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6887; +mov.b32 r6922, {high, low}; +} +{ +fma.rn.f16x2 r6924, r6916, r6922, r6919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6930, {high, high}; +} +{ +mul.f16x2 r6932, r6387, r6930; +} +{ +neg.f16x2 r6935, r6932; +} +{ +fma.rn.f16x2 r6937, r6381, r6928, r6935; +} +{ +mul.f16x2 r6941, r6381, r6930; +} +{ +fma.rn.f16x2 r6944, r6387, r6928, r6941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6950, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6952, {low, high}; +} +{ +mul.f16x2 r6953, r6950, r6952; +} +{ +mul.f16x2 r6956, r6924, r6948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6924; +mov.b32 r6959, {high, low}; +} +{ +fma.rn.f16x2 r6961, r6953, r6959, r6956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6967, {high, high}; +} +{ +mul.f16x2 r6969, r6475, r6967; +} +{ +neg.f16x2 r6972, r6969; +} +{ +fma.rn.f16x2 r6974, r6469, r6965, r6972; +} +{ +mul.f16x2 r6978, r6469, r6967; +} +{ +fma.rn.f16x2 r6981, r6475, r6965, r6978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r6987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6989, {low, high}; +} +{ +mul.f16x2 r6990, r6987, r6989; +} +{ +mul.f16x2 r6993, r6961, r6985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6961; +mov.b32 r6996, {high, low}; +} +{ +fma.rn.f16x2 r6998, r6990, r6996, 
r6993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7004, {high, high}; +} +{ +mul.f16x2 r7006, r6563, r7004; +} +{ +neg.f16x2 r7009, r7006; +} +{ +fma.rn.f16x2 r7011, r6557, r7002, r7009; +} +{ +mul.f16x2 r7015, r6557, r7004; +} +{ +fma.rn.f16x2 r7018, r6563, r7002, r7015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7026, {low, high}; +} +{ +mul.f16x2 r7027, r7024, r7026; +} +{ +mul.f16x2 r7030, r6998, r7022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6998; +mov.b32 r7033, {high, low}; +} +{ +fma.rn.f16x2 r7035, r7027, r7033, r7030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7041, {high, high}; +} +{ +mul.f16x2 r7043, r6651, r7041; +} +{ +neg.f16x2 r7046, r7043; +} +{ +fma.rn.f16x2 r7048, r6645, r7039, r7046; +} +{ +mul.f16x2 r7052, r6645, r7041; +} +{ +fma.rn.f16x2 r7055, r6651, r7039, r7052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7063, {low, high}; +} +{ +mul.f16x2 r7064, r7061, r7063; +} +{ +mul.f16x2 r7067, r7035, r7059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7035; +mov.b32 r7070, {high, low}; +} +{ +fma.rn.f16x2 r7072, r7064, r7070, r7067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7078, {high, high}; +} +{ +mul.f16x2 r7080, r6739, r7078; +} +{ +neg.f16x2 r7083, r7080; +} +{ 
+fma.rn.f16x2 r7085, r6733, r7076, r7083; +} +{ +mul.f16x2 r7089, r6733, r7078; +} +{ +fma.rn.f16x2 r7092, r6739, r7076, r7089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7100, {low, high}; +} +{ +mul.f16x2 r7101, r7098, r7100; +} +{ +mul.f16x2 r7104, r7072, r7096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7072; +mov.b32 r7107, {high, low}; +} +{ +fma.rn.f16x2 r7109, r7101, r7107, r7104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7115, {high, high}; +} +{ +mul.f16x2 r7117, r6089, r7115; +} +{ +neg.f16x2 r7120, r7117; +} +{ +fma.rn.f16x2 r7122, r6053, r7113, r7120; +} +{ +mul.f16x2 r7126, r6053, r7115; +} +{ +fma.rn.f16x2 r7129, r6089, r7113, r7126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7137, {low, high}; +} +{ +mul.f16x2 r7138, r7135, r7137; +} +{ +mul.f16x2 r7141, r7109, r7133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7109; +mov.b32 r7144, {high, low}; +} +{ +fma.rn.f16x2 r7146, r7138, r7144, r7141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7152, {high, high}; +} +{ +mul.f16x2 r7154, r6177, r7152; +} +{ +neg.f16x2 r7157, r7154; +} +{ +fma.rn.f16x2 r7159, r6141, r7150, r7157; +} +{ +mul.f16x2 r7163, r6141, r7152; +} +{ +fma.rn.f16x2 r7166, r6177, r7150, r7163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6814; +mov.b32 r7172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7174, {low, high}; +} +{ +mul.f16x2 r7175, r7172, r7174; +} +{ +mul.f16x2 r7178, r7146, r7170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7146; +mov.b32 r7181, {high, low}; +} +{ +fma.rn.f16x2 r7183, r7175, r7181, r7178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7189, {high, high}; +} +{ +mul.f16x2 r7191, r6265, r7189; +} +{ +neg.f16x2 r7194, r7191; +} +{ +fma.rn.f16x2 r7196, r6229, r7187, r7194; +} +{ +mul.f16x2 r7200, r6229, r7189; +} +{ +fma.rn.f16x2 r7203, r6265, r7187, r7200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7211, {low, high}; +} +{ +mul.f16x2 r7212, r7209, r7211; +} +{ +mul.f16x2 r7215, r7183, r7207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7183; +mov.b32 r7218, {high, low}; +} +{ +fma.rn.f16x2 r7220, r7212, r7218, r7215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7220; +mov.b32 r7226, {high, high}; +} +{ +mul.f16x2 r7228, r6353, r7226; +} +{ +neg.f16x2 r7231, r7228; +} +{ +fma.rn.f16x2 r7233, r6317, r7224, r7231; +} +{ +mul.f16x2 r7237, r6317, r7226; +} +{ +fma.rn.f16x2 r7240, r6353, r7224, r7237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7248, {low, high}; +} +{ +mul.f16x2 r7249, r7246, r7248; +} +{ +mul.f16x2 r7252, r7220, r7244; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r7220; +mov.b32 r7255, {high, low}; +} +{ +fma.rn.f16x2 r7257, r7249, r7255, r7252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7263, {high, high}; +} +{ +mul.f16x2 r7265, r6441, r7263; +} +{ +neg.f16x2 r7268, r7265; +} +{ +fma.rn.f16x2 r7270, r6405, r7261, r7268; +} +{ +mul.f16x2 r7274, r6405, r7263; +} +{ +fma.rn.f16x2 r7277, r6441, r7261, r7274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7285, {low, high}; +} +{ +mul.f16x2 r7286, r7283, r7285; +} +{ +mul.f16x2 r7289, r7257, r7281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7257; +mov.b32 r7292, {high, low}; +} +{ +fma.rn.f16x2 r7294, r7286, r7292, r7289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7300, {high, high}; +} +{ +mul.f16x2 r7302, r6529, r7300; +} +{ +neg.f16x2 r7305, r7302; +} +{ +fma.rn.f16x2 r7307, r6493, r7298, r7305; +} +{ +mul.f16x2 r7311, r6493, r7300; +} +{ +fma.rn.f16x2 r7314, r6529, r7298, r7311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7322, {low, high}; +} +{ +mul.f16x2 r7323, r7320, r7322; +} +{ +mul.f16x2 r7326, r7294, r7318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7294; +mov.b32 r7329, {high, low}; +} +{ +fma.rn.f16x2 r7331, r7323, r7329, r7326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 
r7337, {high, high}; +} +{ +mul.f16x2 r7339, r6617, r7337; +} +{ +neg.f16x2 r7342, r7339; +} +{ +fma.rn.f16x2 r7344, r6581, r7335, r7342; +} +{ +mul.f16x2 r7348, r6581, r7337; +} +{ +fma.rn.f16x2 r7351, r6617, r7335, r7348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7359, {low, high}; +} +{ +mul.f16x2 r7360, r7357, r7359; +} +{ +mul.f16x2 r7363, r7331, r7355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7331; +mov.b32 r7366, {high, low}; +} +{ +fma.rn.f16x2 r7368, r7360, r7366, r7363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7374, {high, high}; +} +{ +mul.f16x2 r7376, r6705, r7374; +} +{ +neg.f16x2 r7379, r7376; +} +{ +fma.rn.f16x2 r7381, r6669, r7372, r7379; +} +{ +mul.f16x2 r7385, r6669, r7374; +} +{ +fma.rn.f16x2 r7388, r6705, r7372, r7385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7396, {low, high}; +} +{ +mul.f16x2 r7397, r7394, r7396; +} +{ +mul.f16x2 r7400, r7368, r7392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7368; +mov.b32 r7403, {high, low}; +} +{ +fma.rn.f16x2 r7405, r7397, r7403, r7400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7411, {high, high}; +} +{ +mul.f16x2 r7413, r6793, r7411; +} +{ +neg.f16x2 r7416, r7413; +} +{ +fma.rn.f16x2 r7418, r6757, r7409, r7416; +} +{ +mul.f16x2 r7422, r6757, r7411; +} +{ +fma.rn.f16x2 r7425, r6793, r7409, r7422; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r6814; +mov.b32 r7429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7433, {low, high}; +} +{ +mul.f16x2 r7434, r7431, r7433; +} +{ +mul.f16x2 r7437, r7405, r7429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7405; +mov.b32 r7440, {high, low}; +} +{ +fma.rn.f16x2 r7442, r7434, r7440, r7437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7448, {high, high}; +} +{ +mul.f16x2 r7450, r6107, r7448; +} +{ +neg.f16x2 r7453, r7450; +} +{ +fma.rn.f16x2 r7455, r6071, r7446, r7453; +} +{ +mul.f16x2 r7459, r6071, r7448; +} +{ +fma.rn.f16x2 r7462, r6107, r7446, r7459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7470, {low, high}; +} +{ +mul.f16x2 r7471, r7468, r7470; +} +{ +mul.f16x2 r7474, r7442, r7466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7442; +mov.b32 r7477, {high, low}; +} +{ +fma.rn.f16x2 r7479, r7471, r7477, r7474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7485, {high, high}; +} +{ +mul.f16x2 r7487, r6195, r7485; +} +{ +neg.f16x2 r7490, r7487; +} +{ +fma.rn.f16x2 r7492, r6159, r7483, r7490; +} +{ +mul.f16x2 r7496, r6159, r7485; +} +{ +fma.rn.f16x2 r7499, r6195, r7483, r7496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7507, {low, high}; +} +{ 
+mul.f16x2 r7508, r7505, r7507; +} +{ +mul.f16x2 r7511, r7479, r7503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7479; +mov.b32 r7514, {high, low}; +} +{ +fma.rn.f16x2 r7516, r7508, r7514, r7511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7522, {high, high}; +} +{ +mul.f16x2 r7524, r6283, r7522; +} +{ +neg.f16x2 r7527, r7524; +} +{ +fma.rn.f16x2 r7529, r6247, r7520, r7527; +} +{ +mul.f16x2 r7533, r6247, r7522; +} +{ +fma.rn.f16x2 r7536, r6283, r7520, r7533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7544, {low, high}; +} +{ +mul.f16x2 r7545, r7542, r7544; +} +{ +mul.f16x2 r7548, r7516, r7540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7516; +mov.b32 r7551, {high, low}; +} +{ +fma.rn.f16x2 r7553, r7545, r7551, r7548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7559, {high, high}; +} +{ +mul.f16x2 r7561, r6371, r7559; +} +{ +neg.f16x2 r7564, r7561; +} +{ +fma.rn.f16x2 r7566, r6335, r7557, r7564; +} +{ +mul.f16x2 r7570, r6335, r7559; +} +{ +fma.rn.f16x2 r7573, r6371, r7557, r7570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7581, {low, high}; +} +{ +mul.f16x2 r7582, r7579, r7581; +} +{ +mul.f16x2 r7585, r7553, r7577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7553; +mov.b32 r7588, {high, low}; +} +{ +fma.rn.f16x2 r7590, r7582, r7588, r7585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; 
+mov.b32 r7594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7596, {high, high}; +} +{ +mul.f16x2 r7598, r6459, r7596; +} +{ +neg.f16x2 r7601, r7598; +} +{ +fma.rn.f16x2 r7603, r6423, r7594, r7601; +} +{ +mul.f16x2 r7607, r6423, r7596; +} +{ +fma.rn.f16x2 r7610, r6459, r7594, r7607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7618, {low, high}; +} +{ +mul.f16x2 r7619, r7616, r7618; +} +{ +mul.f16x2 r7622, r7590, r7614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7590; +mov.b32 r7625, {high, low}; +} +{ +fma.rn.f16x2 r7627, r7619, r7625, r7622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7633, {high, high}; +} +{ +mul.f16x2 r7635, r6547, r7633; +} +{ +neg.f16x2 r7638, r7635; +} +{ +fma.rn.f16x2 r7640, r6511, r7631, r7638; +} +{ +mul.f16x2 r7644, r6511, r7633; +} +{ +fma.rn.f16x2 r7647, r6547, r7631, r7644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7655, {low, high}; +} +{ +mul.f16x2 r7656, r7653, r7655; +} +{ +mul.f16x2 r7659, r7627, r7651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7627; +mov.b32 r7662, {high, low}; +} +{ +fma.rn.f16x2 r7664, r7656, r7662, r7659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7670, {high, high}; +} +{ +mul.f16x2 r7672, r6635, r7670; +} +{ +neg.f16x2 r7675, r7672; +} +{ +fma.rn.f16x2 r7677, r6599, r7668, r7675; +} +{ +mul.f16x2 r7681, 
r6599, r7670; +} +{ +fma.rn.f16x2 r7684, r6635, r7668, r7681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7692, {low, high}; +} +{ +mul.f16x2 r7693, r7690, r7692; +} +{ +mul.f16x2 r7696, r7664, r7688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7664; +mov.b32 r7699, {high, low}; +} +{ +fma.rn.f16x2 r7701, r7693, r7699, r7696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7707, {high, high}; +} +{ +mul.f16x2 r7709, r6723, r7707; +} +{ +neg.f16x2 r7712, r7709; +} +{ +fma.rn.f16x2 r7714, r6687, r7705, r7712; +} +{ +mul.f16x2 r7718, r6687, r7707; +} +{ +fma.rn.f16x2 r7721, r6723, r7705, r7718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6814; +mov.b32 r7727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7729, {low, high}; +} +{ +mul.f16x2 r7730, r7727, r7729; +} +{ +mul.f16x2 r7733, r7701, r7725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7701; +mov.b32 r7736, {high, low}; +} +{ +fma.rn.f16x2 r7738, r7730, r7736, r7733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7738; +mov.b32 r7744, {high, high}; +} +{ +mul.f16x2 r7746, r6811, r7744; +} +{ +neg.f16x2 r7749, r7746; +} +{ +fma.rn.f16x2 r7751, r6775, r7742, r7749; +} +{ +mul.f16x2 r7755, r6775, r7744; +} +{ +fma.rn.f16x2 r7758, r6811, r7742, r7755; +} +barrier.sync 0; +mad.lo.s32 r9622, r9617, 2916, r9621; +st.shared.u32 [r9622], r6029; +st.shared.u32 [r9622+108], r6826; +st.shared.u32 [r9622+216], r6863; +st.shared.u32 [r9622+324], r6900; 
+st.shared.u32 [r9622+432], r6937; +st.shared.u32 [r9622+540], r6974; +st.shared.u32 [r9622+648], r7011; +st.shared.u32 [r9622+756], r7048; +st.shared.u32 [r9622+864], r7085; +st.shared.u32 [r9622+972], r7122; +st.shared.u32 [r9622+1080], r7159; +st.shared.u32 [r9622+1188], r7196; +st.shared.u32 [r9622+1296], r7233; +st.shared.u32 [r9622+1404], r7270; +st.shared.u32 [r9622+1512], r7307; +st.shared.u32 [r9622+1620], r7344; +st.shared.u32 [r9622+1728], r7381; +st.shared.u32 [r9622+1836], r7418; +st.shared.u32 [r9622+1944], r7455; +st.shared.u32 [r9622+2052], r7492; +st.shared.u32 [r9622+2160], r7529; +st.shared.u32 [r9622+2268], r7566; +st.shared.u32 [r9622+2376], r7603; +st.shared.u32 [r9622+2484], r7640; +st.shared.u32 [r9622+2592], r7677; +st.shared.u32 [r9622+2700], r7714; +st.shared.u32 [r9622+2808], r7751; +barrier.sync 0; +ld.shared.u32 r7787, [r9612]; +ld.shared.u32 r8395, [r9612+972]; +ld.shared.u32 r9003, [r9612+1944]; +ld.shared.u32 r7875, [r9612+2916]; +ld.shared.u32 r8483, [r9612+3888]; +ld.shared.u32 r9091, [r9612+4860]; +ld.shared.u32 r7963, [r9612+5832]; +ld.shared.u32 r8571, [r9612+6804]; +ld.shared.u32 r9179, [r9612+7776]; +ld.shared.u32 r7784, [r9612+8748]; +ld.shared.u32 r8392, [r9612+9720]; +ld.shared.u32 r9000, [r9612+10692]; +ld.shared.u32 r7872, [r9612+11664]; +ld.shared.u32 r8480, [r9612+12636]; +ld.shared.u32 r9088, [r9612+13608]; +ld.shared.u32 r7960, [r9612+14580]; +ld.shared.u32 r8568, [r9612+15552]; +ld.shared.u32 r9176, [r9612+16524]; +ld.shared.u32 r7785, [r9612+17496]; +ld.shared.u32 r8393, [r9612+18468]; +ld.shared.u32 r9001, [r9612+19440]; +ld.shared.u32 r7873, [r9612+20412]; +ld.shared.u32 r8481, [r9612+21384]; +ld.shared.u32 r9089, [r9612+22356]; +ld.shared.u32 r7961, [r9612+23328]; +ld.shared.u32 r8569, [r9612+24300]; +ld.shared.u32 r9177, [r9612+25272]; +barrier.sync 0; +st.shared.u32 [r9622], r6035; +st.shared.u32 [r9622+108], r6833; +st.shared.u32 [r9622+216], r6870; +st.shared.u32 [r9622+324], r6907; +st.shared.u32 
[r9622+432], r6944; +st.shared.u32 [r9622+540], r6981; +st.shared.u32 [r9622+648], r7018; +st.shared.u32 [r9622+756], r7055; +st.shared.u32 [r9622+864], r7092; +st.shared.u32 [r9622+972], r7129; +st.shared.u32 [r9622+1080], r7166; +st.shared.u32 [r9622+1188], r7203; +st.shared.u32 [r9622+1296], r7240; +st.shared.u32 [r9622+1404], r7277; +st.shared.u32 [r9622+1512], r7314; +st.shared.u32 [r9622+1620], r7351; +st.shared.u32 [r9622+1728], r7388; +st.shared.u32 [r9622+1836], r7425; +st.shared.u32 [r9622+1944], r7462; +st.shared.u32 [r9622+2052], r7499; +st.shared.u32 [r9622+2160], r7536; +st.shared.u32 [r9622+2268], r7573; +st.shared.u32 [r9622+2376], r7610; +st.shared.u32 [r9622+2484], r7647; +st.shared.u32 [r9622+2592], r7684; +st.shared.u32 [r9622+2700], r7721; +st.shared.u32 [r9622+2808], r7758; +barrier.sync 0; +ld.shared.u32 r7793, [r9612]; +ld.shared.u32 r8401, [r9612+972]; +ld.shared.u32 r9009, [r9612+1944]; +ld.shared.u32 r7881, [r9612+2916]; +ld.shared.u32 r8489, [r9612+3888]; +ld.shared.u32 r9097, [r9612+4860]; +ld.shared.u32 r7969, [r9612+5832]; +ld.shared.u32 r8577, [r9612+6804]; +ld.shared.u32 r9185, [r9612+7776]; +ld.shared.u32 r7790, [r9612+8748]; +ld.shared.u32 r8398, [r9612+9720]; +ld.shared.u32 r9006, [r9612+10692]; +ld.shared.u32 r7878, [r9612+11664]; +ld.shared.u32 r8486, [r9612+12636]; +ld.shared.u32 r9094, [r9612+13608]; +ld.shared.u32 r7966, [r9612+14580]; +ld.shared.u32 r8574, [r9612+15552]; +ld.shared.u32 r9182, [r9612+16524]; +ld.shared.u32 r7791, [r9612+17496]; +ld.shared.u32 r8399, [r9612+18468]; +ld.shared.u32 r9007, [r9612+19440]; +ld.shared.u32 r7879, [r9612+20412]; +ld.shared.u32 r8487, [r9612+21384]; +ld.shared.u32 r9095, [r9612+22356]; +ld.shared.u32 r7967, [r9612+23328]; +ld.shared.u32 r8575, [r9612+24300]; +ld.shared.u32 r9183, [r9612+25272]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; 
+mov.b32 r7780, {low, high}; +} +{ +neg.f16x2 r7781, r7780; +} +{ +add.f16x2 r7783, r7784, r7785; +} +{ +add.f16x2 r7786, r7787, r7783; +} +{ +add.f16x2 r7789, r7790, r7791; +} +{ +add.f16x2 r7792, r7793, r7789; +} +{ +add.f16x2 r7795, r7784, r7785; +} +{ +mul.f16x2 r7798, r7795, r7779; +} +{ +add.f16x2 r7801, r7787, r7798; +} +{ +sub.f16x2 r7804, r7790, r7791; +} +{ +mul.f16x2 r7807, r7804, r7781; +} +{ +add.f16x2 r7810, r7801, r7807; +} +{ +add.f16x2 r7813, r7784, r7785; +} +{ +mul.f16x2 r7816, r7813, r7779; +} +{ +add.f16x2 r7819, r7787, r7816; +} +{ +sub.f16x2 r7822, r7790, r7791; +} +{ +mul.f16x2 r7825, r7822, r7781; +} +{ +sub.f16x2 r7828, r7819, r7825; +} +{ +add.f16x2 r7831, r7790, r7791; +} +{ +mul.f16x2 r7834, r7831, r7779; +} +{ +add.f16x2 r7837, r7793, r7834; +} +{ +sub.f16x2 r7840, r7784, r7785; +} +{ +mul.f16x2 r7843, r7840, r7781; +} +{ +sub.f16x2 r7846, r7837, r7843; +} +{ +add.f16x2 r7849, r7790, r7791; +} +{ +mul.f16x2 r7852, r7849, r7779; +} +{ +add.f16x2 r7855, r7793, r7852; +} +{ +sub.f16x2 r7858, r7784, r7785; +} +{ +mul.f16x2 r7861, r7858, r7781; +} +{ +add.f16x2 r7864, r7855, r7861; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7867, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7868, {low, high}; +} +{ +neg.f16x2 r7869, r7868; +} +{ +add.f16x2 r7871, r7872, r7873; +} +{ +add.f16x2 r7874, r7875, r7871; +} +{ +add.f16x2 r7877, r7878, r7879; +} +{ +add.f16x2 r7880, r7881, r7877; +} +{ +add.f16x2 r7883, r7872, r7873; +} +{ +mul.f16x2 r7886, r7883, r7867; +} +{ +add.f16x2 r7889, r7875, r7886; +} +{ +sub.f16x2 r7892, r7878, r7879; +} +{ +mul.f16x2 r7895, r7892, r7869; +} +{ +add.f16x2 r7898, r7889, r7895; +} +{ +add.f16x2 r7901, r7872, r7873; +} +{ +mul.f16x2 r7904, r7901, r7867; +} +{ +add.f16x2 r7907, r7875, r7904; +} +{ +sub.f16x2 r7910, r7878, r7879; +} +{ +mul.f16x2 r7913, r7910, r7869; +} +{ +sub.f16x2 r7916, r7907, r7913; +} +{ 
+add.f16x2 r7919, r7878, r7879; +} +{ +mul.f16x2 r7922, r7919, r7867; +} +{ +add.f16x2 r7925, r7881, r7922; +} +{ +sub.f16x2 r7928, r7872, r7873; +} +{ +mul.f16x2 r7931, r7928, r7869; +} +{ +sub.f16x2 r7934, r7925, r7931; +} +{ +add.f16x2 r7937, r7878, r7879; +} +{ +mul.f16x2 r7940, r7937, r7867; +} +{ +add.f16x2 r7943, r7881, r7940; +} +{ +sub.f16x2 r7946, r7872, r7873; +} +{ +mul.f16x2 r7949, r7946, r7869; +} +{ +add.f16x2 r7952, r7943, r7949; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7955, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7956, {low, high}; +} +{ +neg.f16x2 r7957, r7956; +} +{ +add.f16x2 r7959, r7960, r7961; +} +{ +add.f16x2 r7962, r7963, r7959; +} +{ +add.f16x2 r7965, r7966, r7967; +} +{ +add.f16x2 r7968, r7969, r7965; +} +{ +add.f16x2 r7971, r7960, r7961; +} +{ +mul.f16x2 r7974, r7971, r7955; +} +{ +add.f16x2 r7977, r7963, r7974; +} +{ +sub.f16x2 r7980, r7966, r7967; +} +{ +mul.f16x2 r7983, r7980, r7957; +} +{ +add.f16x2 r7986, r7977, r7983; +} +{ +add.f16x2 r7989, r7960, r7961; +} +{ +mul.f16x2 r7992, r7989, r7955; +} +{ +add.f16x2 r7995, r7963, r7992; +} +{ +sub.f16x2 r7998, r7966, r7967; +} +{ +mul.f16x2 r8001, r7998, r7957; +} +{ +sub.f16x2 r8004, r7995, r8001; +} +{ +add.f16x2 r8007, r7966, r7967; +} +{ +mul.f16x2 r8010, r8007, r7955; +} +{ +add.f16x2 r8013, r7969, r8010; +} +{ +sub.f16x2 r8016, r7960, r7961; +} +{ +mul.f16x2 r8019, r8016, r7957; +} +{ +sub.f16x2 r8022, r8013, r8019; +} +{ +add.f16x2 r8025, r7966, r7967; +} +{ +mul.f16x2 r8028, r8025, r7955; +} +{ +add.f16x2 r8031, r7969, r8028; +} +{ +sub.f16x2 r8034, r7960, r7961; +} +{ +mul.f16x2 r8037, r8034, r7957; +} +{ +add.f16x2 r8040, r8031, r8037; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8043, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8044, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8045, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8046, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8050, {low, high}; +} +{ +mul.f16x2 r8059, r7898, r8043; +} +{ +mul.f16x2 r8062, r7934, r8044; +} +{ +sub.f16x2 r8065, r8059, r8062; +} +{ +mul.f16x2 r8068, r7898, r8044; +} +{ +fma.rn.f16x2 r8071, r7934, r8043, r8068; +} +{ +mul.f16x2 r8075, r7986, r8045; +} +{ +mul.f16x2 r8078, r8022, r8046; +} +{ +sub.f16x2 r8081, r8075, r8078; +} +{ +mul.f16x2 r8084, r7986, r8046; +} +{ +fma.rn.f16x2 r8087, r8022, r8045, r8084; +} +{ +mul.f16x2 r8091, r7916, r8045; +} +{ +mul.f16x2 r8094, r7952, r8046; +} +{ +sub.f16x2 r8097, r8091, r8094; +} +{ +mul.f16x2 r8100, r7916, r8046; +} +{ +fma.rn.f16x2 r8103, r7952, r8045, r8100; +} +{ +mul.f16x2 r8107, r8004, r8049; +} +{ +mul.f16x2 r8110, r8040, r8050; +} +{ +sub.f16x2 r8113, r8107, r8110; +} +{ +mul.f16x2 r8116, r8004, r8050; +} +{ +fma.rn.f16x2 r8119, r8040, r8049, r8116; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8124, {low, high}; +} +{ +neg.f16x2 r8125, r8124; +} +{ +add.f16x2 r8127, r7874, r7962; +} +{ +add.f16x2 %0, r7786, r8127; +} +{ +add.f16x2 r8133, r7880, r7968; +} +{ +add.f16x2 %1, r7792, r8133; +} +{ +add.f16x2 r8139, r7874, r7962; +} +{ +mul.f16x2 r8142, r8139, r8123; +} +{ +add.f16x2 r8145, r7786, r8142; +} +{ +sub.f16x2 r8148, r7880, r7968; +} +{ +mul.f16x2 r8151, r8148, r8125; +} +{ +add.f16x2 %18, r8145, r8151; +} +{ +add.f16x2 r8157, r7874, r7962; +} +{ +mul.f16x2 r8160, r8157, r8123; +} +{ +add.f16x2 r8163, r7786, r8160; +} +{ +sub.f16x2 
r8166, r7880, r7968; +} +{ +mul.f16x2 r8169, r8166, r8125; +} +{ +sub.f16x2 %36, r8163, r8169; +} +{ +add.f16x2 r8175, r7880, r7968; +} +{ +mul.f16x2 r8178, r8175, r8123; +} +{ +add.f16x2 r8181, r7792, r8178; +} +{ +sub.f16x2 r8184, r7874, r7962; +} +{ +mul.f16x2 r8187, r8184, r8125; +} +{ +sub.f16x2 %19, r8181, r8187; +} +{ +add.f16x2 r8193, r7880, r7968; +} +{ +mul.f16x2 r8196, r8193, r8123; +} +{ +add.f16x2 r8199, r7792, r8196; +} +{ +sub.f16x2 r8202, r7874, r7962; +} +{ +mul.f16x2 r8205, r8202, r8125; +} +{ +add.f16x2 %37, r8199, r8205; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8211, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8212, {low, high}; +} +{ +neg.f16x2 r8213, r8212; +} +{ +add.f16x2 r8215, r8065, r8081; +} +{ +add.f16x2 %6, r7810, r8215; +} +{ +add.f16x2 r8221, r8071, r8087; +} +{ +add.f16x2 %7, r7846, r8221; +} +{ +add.f16x2 r8227, r8065, r8081; +} +{ +mul.f16x2 r8230, r8227, r8211; +} +{ +add.f16x2 r8233, r7810, r8230; +} +{ +sub.f16x2 r8236, r8071, r8087; +} +{ +mul.f16x2 r8239, r8236, r8213; +} +{ +add.f16x2 %24, r8233, r8239; +} +{ +add.f16x2 r8245, r8065, r8081; +} +{ +mul.f16x2 r8248, r8245, r8211; +} +{ +add.f16x2 r8251, r7810, r8248; +} +{ +sub.f16x2 r8254, r8071, r8087; +} +{ +mul.f16x2 r8257, r8254, r8213; +} +{ +sub.f16x2 %42, r8251, r8257; +} +{ +add.f16x2 r8263, r8071, r8087; +} +{ +mul.f16x2 r8266, r8263, r8211; +} +{ +add.f16x2 r8269, r7846, r8266; +} +{ +sub.f16x2 r8272, r8065, r8081; +} +{ +mul.f16x2 r8275, r8272, r8213; +} +{ +sub.f16x2 %25, r8269, r8275; +} +{ +add.f16x2 r8281, r8071, r8087; +} +{ +mul.f16x2 r8284, r8281, r8211; +} +{ +add.f16x2 r8287, r7846, r8284; +} +{ +sub.f16x2 r8290, r8065, r8081; +} +{ +mul.f16x2 r8293, r8290, r8213; +} +{ +add.f16x2 %43, r8287, r8293; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8299, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8300, {low, high}; +} +{ +neg.f16x2 r8301, r8300; +} +{ +add.f16x2 r8303, r8097, r8113; +} +{ +add.f16x2 %12, r7828, r8303; +} +{ +add.f16x2 r8309, r8103, r8119; +} +{ +add.f16x2 %13, r7864, r8309; +} +{ +add.f16x2 r8315, r8097, r8113; +} +{ +mul.f16x2 r8318, r8315, r8299; +} +{ +add.f16x2 r8321, r7828, r8318; +} +{ +sub.f16x2 r8324, r8103, r8119; +} +{ +mul.f16x2 r8327, r8324, r8301; +} +{ +add.f16x2 %30, r8321, r8327; +} +{ +add.f16x2 r8333, r8097, r8113; +} +{ +mul.f16x2 r8336, r8333, r8299; +} +{ +add.f16x2 r8339, r7828, r8336; +} +{ +sub.f16x2 r8342, r8103, r8119; +} +{ +mul.f16x2 r8345, r8342, r8301; +} +{ +sub.f16x2 %48, r8339, r8345; +} +{ +add.f16x2 r8351, r8103, r8119; +} +{ +mul.f16x2 r8354, r8351, r8299; +} +{ +add.f16x2 r8357, r7864, r8354; +} +{ +sub.f16x2 r8360, r8097, r8113; +} +{ +mul.f16x2 r8363, r8360, r8301; +} +{ +sub.f16x2 %31, r8357, r8363; +} +{ +add.f16x2 r8369, r8103, r8119; +} +{ +mul.f16x2 r8372, r8369, r8299; +} +{ +add.f16x2 r8375, r7864, r8372; +} +{ +sub.f16x2 r8378, r8097, r8113; +} +{ +mul.f16x2 r8381, r8378, r8301; +} +{ +add.f16x2 %49, r8375, r8381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8387, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8388, {low, high}; +} +{ +neg.f16x2 r8389, r8388; +} +{ +add.f16x2 r8391, r8392, r8393; +} +{ +add.f16x2 r8394, r8395, r8391; +} +{ +add.f16x2 r8397, r8398, r8399; +} +{ +add.f16x2 r8400, r8401, r8397; +} +{ +add.f16x2 r8403, r8392, r8393; +} +{ +mul.f16x2 r8406, r8403, r8387; +} +{ +add.f16x2 r8409, r8395, r8406; +} +{ +sub.f16x2 r8412, r8398, r8399; +} +{ +mul.f16x2 r8415, r8412, r8389; +} +{ +add.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8392, r8393; +} +{ +mul.f16x2 r8424, r8421, r8387; +} +{ +add.f16x2 r8427, r8395, r8424; +} +{ +sub.f16x2 r8430, r8398, r8399; +} +{ +mul.f16x2 r8433, r8430, r8389; +} +{ 
+sub.f16x2 r8436, r8427, r8433; +} +{ +add.f16x2 r8439, r8398, r8399; +} +{ +mul.f16x2 r8442, r8439, r8387; +} +{ +add.f16x2 r8445, r8401, r8442; +} +{ +sub.f16x2 r8448, r8392, r8393; +} +{ +mul.f16x2 r8451, r8448, r8389; +} +{ +sub.f16x2 r8454, r8445, r8451; +} +{ +add.f16x2 r8457, r8398, r8399; +} +{ +mul.f16x2 r8460, r8457, r8387; +} +{ +add.f16x2 r8463, r8401, r8460; +} +{ +sub.f16x2 r8466, r8392, r8393; +} +{ +mul.f16x2 r8469, r8466, r8389; +} +{ +add.f16x2 r8472, r8463, r8469; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8476, {low, high}; +} +{ +neg.f16x2 r8477, r8476; +} +{ +add.f16x2 r8479, r8480, r8481; +} +{ +add.f16x2 r8482, r8483, r8479; +} +{ +add.f16x2 r8485, r8486, r8487; +} +{ +add.f16x2 r8488, r8489, r8485; +} +{ +add.f16x2 r8491, r8480, r8481; +} +{ +mul.f16x2 r8494, r8491, r8475; +} +{ +add.f16x2 r8497, r8483, r8494; +} +{ +sub.f16x2 r8500, r8486, r8487; +} +{ +mul.f16x2 r8503, r8500, r8477; +} +{ +add.f16x2 r8506, r8497, r8503; +} +{ +add.f16x2 r8509, r8480, r8481; +} +{ +mul.f16x2 r8512, r8509, r8475; +} +{ +add.f16x2 r8515, r8483, r8512; +} +{ +sub.f16x2 r8518, r8486, r8487; +} +{ +mul.f16x2 r8521, r8518, r8477; +} +{ +sub.f16x2 r8524, r8515, r8521; +} +{ +add.f16x2 r8527, r8486, r8487; +} +{ +mul.f16x2 r8530, r8527, r8475; +} +{ +add.f16x2 r8533, r8489, r8530; +} +{ +sub.f16x2 r8536, r8480, r8481; +} +{ +mul.f16x2 r8539, r8536, r8477; +} +{ +sub.f16x2 r8542, r8533, r8539; +} +{ +add.f16x2 r8545, r8486, r8487; +} +{ +mul.f16x2 r8548, r8545, r8475; +} +{ +add.f16x2 r8551, r8489, r8548; +} +{ +sub.f16x2 r8554, r8480, r8481; +} +{ +mul.f16x2 r8557, r8554, r8477; +} +{ +add.f16x2 r8560, r8551, r8557; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, 
f896; +mov.b32 r8564, {low, high}; +} +{ +neg.f16x2 r8565, r8564; +} +{ +add.f16x2 r8567, r8568, r8569; +} +{ +add.f16x2 r8570, r8571, r8567; +} +{ +add.f16x2 r8573, r8574, r8575; +} +{ +add.f16x2 r8576, r8577, r8573; +} +{ +add.f16x2 r8579, r8568, r8569; +} +{ +mul.f16x2 r8582, r8579, r8563; +} +{ +add.f16x2 r8585, r8571, r8582; +} +{ +sub.f16x2 r8588, r8574, r8575; +} +{ +mul.f16x2 r8591, r8588, r8565; +} +{ +add.f16x2 r8594, r8585, r8591; +} +{ +add.f16x2 r8597, r8568, r8569; +} +{ +mul.f16x2 r8600, r8597, r8563; +} +{ +add.f16x2 r8603, r8571, r8600; +} +{ +sub.f16x2 r8606, r8574, r8575; +} +{ +mul.f16x2 r8609, r8606, r8565; +} +{ +sub.f16x2 r8612, r8603, r8609; +} +{ +add.f16x2 r8615, r8574, r8575; +} +{ +mul.f16x2 r8618, r8615, r8563; +} +{ +add.f16x2 r8621, r8577, r8618; +} +{ +sub.f16x2 r8624, r8568, r8569; +} +{ +mul.f16x2 r8627, r8624, r8565; +} +{ +sub.f16x2 r8630, r8621, r8627; +} +{ +add.f16x2 r8633, r8574, r8575; +} +{ +mul.f16x2 r8636, r8633, r8563; +} +{ +add.f16x2 r8639, r8577, r8636; +} +{ +sub.f16x2 r8642, r8568, r8569; +} +{ +mul.f16x2 r8645, r8642, r8565; +} +{ +add.f16x2 r8648, r8639, r8645; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8652, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8657, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8658, {low, high}; +} +{ +mul.f16x2 r8667, r8506, r8651; +} +{ +mul.f16x2 r8670, r8542, r8652; +} +{ +sub.f16x2 r8673, r8667, r8670; +} +{ +mul.f16x2 r8676, r8506, r8652; +} +{ +fma.rn.f16x2 r8679, r8542, r8651, 
r8676; +} +{ +mul.f16x2 r8683, r8594, r8653; +} +{ +mul.f16x2 r8686, r8630, r8654; +} +{ +sub.f16x2 r8689, r8683, r8686; +} +{ +mul.f16x2 r8692, r8594, r8654; +} +{ +fma.rn.f16x2 r8695, r8630, r8653, r8692; +} +{ +mul.f16x2 r8699, r8524, r8653; +} +{ +mul.f16x2 r8702, r8560, r8654; +} +{ +sub.f16x2 r8705, r8699, r8702; +} +{ +mul.f16x2 r8708, r8524, r8654; +} +{ +fma.rn.f16x2 r8711, r8560, r8653, r8708; +} +{ +mul.f16x2 r8715, r8612, r8657; +} +{ +mul.f16x2 r8718, r8648, r8658; +} +{ +sub.f16x2 r8721, r8715, r8718; +} +{ +mul.f16x2 r8724, r8612, r8658; +} +{ +fma.rn.f16x2 r8727, r8648, r8657, r8724; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8731, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8732, {low, high}; +} +{ +neg.f16x2 r8733, r8732; +} +{ +add.f16x2 r8735, r8482, r8570; +} +{ +add.f16x2 %2, r8394, r8735; +} +{ +add.f16x2 r8741, r8488, r8576; +} +{ +add.f16x2 %3, r8400, r8741; +} +{ +add.f16x2 r8747, r8482, r8570; +} +{ +mul.f16x2 r8750, r8747, r8731; +} +{ +add.f16x2 r8753, r8394, r8750; +} +{ +sub.f16x2 r8756, r8488, r8576; +} +{ +mul.f16x2 r8759, r8756, r8733; +} +{ +add.f16x2 %20, r8753, r8759; +} +{ +add.f16x2 r8765, r8482, r8570; +} +{ +mul.f16x2 r8768, r8765, r8731; +} +{ +add.f16x2 r8771, r8394, r8768; +} +{ +sub.f16x2 r8774, r8488, r8576; +} +{ +mul.f16x2 r8777, r8774, r8733; +} +{ +sub.f16x2 %38, r8771, r8777; +} +{ +add.f16x2 r8783, r8488, r8576; +} +{ +mul.f16x2 r8786, r8783, r8731; +} +{ +add.f16x2 r8789, r8400, r8786; +} +{ +sub.f16x2 r8792, r8482, r8570; +} +{ +mul.f16x2 r8795, r8792, r8733; +} +{ +sub.f16x2 %21, r8789, r8795; +} +{ +add.f16x2 r8801, r8488, r8576; +} +{ +mul.f16x2 r8804, r8801, r8731; +} +{ +add.f16x2 r8807, r8400, r8804; +} +{ +sub.f16x2 r8810, r8482, r8570; +} +{ +mul.f16x2 r8813, r8810, r8733; +} +{ +add.f16x2 %39, r8807, r8813; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; 
+mov.b32 r8819, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8820, {low, high}; +} +{ +neg.f16x2 r8821, r8820; +} +{ +add.f16x2 r8823, r8673, r8689; +} +{ +add.f16x2 %8, r8418, r8823; +} +{ +add.f16x2 r8829, r8679, r8695; +} +{ +add.f16x2 %9, r8454, r8829; +} +{ +add.f16x2 r8835, r8673, r8689; +} +{ +mul.f16x2 r8838, r8835, r8819; +} +{ +add.f16x2 r8841, r8418, r8838; +} +{ +sub.f16x2 r8844, r8679, r8695; +} +{ +mul.f16x2 r8847, r8844, r8821; +} +{ +add.f16x2 %26, r8841, r8847; +} +{ +add.f16x2 r8853, r8673, r8689; +} +{ +mul.f16x2 r8856, r8853, r8819; +} +{ +add.f16x2 r8859, r8418, r8856; +} +{ +sub.f16x2 r8862, r8679, r8695; +} +{ +mul.f16x2 r8865, r8862, r8821; +} +{ +sub.f16x2 %44, r8859, r8865; +} +{ +add.f16x2 r8871, r8679, r8695; +} +{ +mul.f16x2 r8874, r8871, r8819; +} +{ +add.f16x2 r8877, r8454, r8874; +} +{ +sub.f16x2 r8880, r8673, r8689; +} +{ +mul.f16x2 r8883, r8880, r8821; +} +{ +sub.f16x2 %27, r8877, r8883; +} +{ +add.f16x2 r8889, r8679, r8695; +} +{ +mul.f16x2 r8892, r8889, r8819; +} +{ +add.f16x2 r8895, r8454, r8892; +} +{ +sub.f16x2 r8898, r8673, r8689; +} +{ +mul.f16x2 r8901, r8898, r8821; +} +{ +add.f16x2 %45, r8895, r8901; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8907, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8908, {low, high}; +} +{ +neg.f16x2 r8909, r8908; +} +{ +add.f16x2 r8911, r8705, r8721; +} +{ +add.f16x2 %14, r8436, r8911; +} +{ +add.f16x2 r8917, r8711, r8727; +} +{ +add.f16x2 %15, r8472, r8917; +} +{ +add.f16x2 r8923, r8705, r8721; +} +{ +mul.f16x2 r8926, r8923, r8907; +} +{ +add.f16x2 r8929, r8436, r8926; +} +{ +sub.f16x2 r8932, r8711, r8727; +} +{ +mul.f16x2 r8935, r8932, r8909; +} +{ +add.f16x2 %32, r8929, r8935; +} +{ +add.f16x2 r8941, r8705, r8721; +} +{ +mul.f16x2 r8944, r8941, r8907; +} +{ +add.f16x2 r8947, r8436, r8944; +} +{ +sub.f16x2 r8950, r8711, 
r8727; +} +{ +mul.f16x2 r8953, r8950, r8909; +} +{ +sub.f16x2 %50, r8947, r8953; +} +{ +add.f16x2 r8959, r8711, r8727; +} +{ +mul.f16x2 r8962, r8959, r8907; +} +{ +add.f16x2 r8965, r8472, r8962; +} +{ +sub.f16x2 r8968, r8705, r8721; +} +{ +mul.f16x2 r8971, r8968, r8909; +} +{ +sub.f16x2 %33, r8965, r8971; +} +{ +add.f16x2 r8977, r8711, r8727; +} +{ +mul.f16x2 r8980, r8977, r8907; +} +{ +add.f16x2 r8983, r8472, r8980; +} +{ +sub.f16x2 r8986, r8705, r8721; +} +{ +mul.f16x2 r8989, r8986, r8909; +} +{ +add.f16x2 %51, r8983, r8989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8995, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8996, {low, high}; +} +{ +neg.f16x2 r8997, r8996; +} +{ +add.f16x2 r8999, r9000, r9001; +} +{ +add.f16x2 r9002, r9003, r8999; +} +{ +add.f16x2 r9005, r9006, r9007; +} +{ +add.f16x2 r9008, r9009, r9005; +} +{ +add.f16x2 r9011, r9000, r9001; +} +{ +mul.f16x2 r9014, r9011, r8995; +} +{ +add.f16x2 r9017, r9003, r9014; +} +{ +sub.f16x2 r9020, r9006, r9007; +} +{ +mul.f16x2 r9023, r9020, r8997; +} +{ +add.f16x2 r9026, r9017, r9023; +} +{ +add.f16x2 r9029, r9000, r9001; +} +{ +mul.f16x2 r9032, r9029, r8995; +} +{ +add.f16x2 r9035, r9003, r9032; +} +{ +sub.f16x2 r9038, r9006, r9007; +} +{ +mul.f16x2 r9041, r9038, r8997; +} +{ +sub.f16x2 r9044, r9035, r9041; +} +{ +add.f16x2 r9047, r9006, r9007; +} +{ +mul.f16x2 r9050, r9047, r8995; +} +{ +add.f16x2 r9053, r9009, r9050; +} +{ +sub.f16x2 r9056, r9000, r9001; +} +{ +mul.f16x2 r9059, r9056, r8997; +} +{ +sub.f16x2 r9062, r9053, r9059; +} +{ +add.f16x2 r9065, r9006, r9007; +} +{ +mul.f16x2 r9068, r9065, r8995; +} +{ +add.f16x2 r9071, r9009, r9068; +} +{ +sub.f16x2 r9074, r9000, r9001; +} +{ +mul.f16x2 r9077, r9074, r8997; +} +{ +add.f16x2 r9080, r9071, r9077; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9083, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9084, {low, high}; +} +{ +neg.f16x2 r9085, r9084; +} +{ +add.f16x2 r9087, r9088, r9089; +} +{ +add.f16x2 r9090, r9091, r9087; +} +{ +add.f16x2 r9093, r9094, r9095; +} +{ +add.f16x2 r9096, r9097, r9093; +} +{ +add.f16x2 r9099, r9088, r9089; +} +{ +mul.f16x2 r9102, r9099, r9083; +} +{ +add.f16x2 r9105, r9091, r9102; +} +{ +sub.f16x2 r9108, r9094, r9095; +} +{ +mul.f16x2 r9111, r9108, r9085; +} +{ +add.f16x2 r9114, r9105, r9111; +} +{ +add.f16x2 r9117, r9088, r9089; +} +{ +mul.f16x2 r9120, r9117, r9083; +} +{ +add.f16x2 r9123, r9091, r9120; +} +{ +sub.f16x2 r9126, r9094, r9095; +} +{ +mul.f16x2 r9129, r9126, r9085; +} +{ +sub.f16x2 r9132, r9123, r9129; +} +{ +add.f16x2 r9135, r9094, r9095; +} +{ +mul.f16x2 r9138, r9135, r9083; +} +{ +add.f16x2 r9141, r9097, r9138; +} +{ +sub.f16x2 r9144, r9088, r9089; +} +{ +mul.f16x2 r9147, r9144, r9085; +} +{ +sub.f16x2 r9150, r9141, r9147; +} +{ +add.f16x2 r9153, r9094, r9095; +} +{ +mul.f16x2 r9156, r9153, r9083; +} +{ +add.f16x2 r9159, r9097, r9156; +} +{ +sub.f16x2 r9162, r9088, r9089; +} +{ +mul.f16x2 r9165, r9162, r9085; +} +{ +add.f16x2 r9168, r9159, r9165; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9171, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9172, {low, high}; +} +{ +neg.f16x2 r9173, r9172; +} +{ +add.f16x2 r9175, r9176, r9177; +} +{ +add.f16x2 r9178, r9179, r9175; +} +{ +add.f16x2 r9181, r9182, r9183; +} +{ +add.f16x2 r9184, r9185, r9181; +} +{ +add.f16x2 r9187, r9176, r9177; +} +{ +mul.f16x2 r9190, r9187, r9171; +} +{ +add.f16x2 r9193, r9179, r9190; +} +{ +sub.f16x2 r9196, r9182, r9183; +} +{ +mul.f16x2 r9199, r9196, r9173; +} +{ +add.f16x2 r9202, r9193, r9199; +} +{ +add.f16x2 r9205, r9176, r9177; +} +{ +mul.f16x2 r9208, r9205, r9171; +} +{ +add.f16x2 r9211, r9179, r9208; +} +{ +sub.f16x2 r9214, r9182, r9183; +} +{ +mul.f16x2 r9217, r9214, 
r9173; +} +{ +sub.f16x2 r9220, r9211, r9217; +} +{ +add.f16x2 r9223, r9182, r9183; +} +{ +mul.f16x2 r9226, r9223, r9171; +} +{ +add.f16x2 r9229, r9185, r9226; +} +{ +sub.f16x2 r9232, r9176, r9177; +} +{ +mul.f16x2 r9235, r9232, r9173; +} +{ +sub.f16x2 r9238, r9229, r9235; +} +{ +add.f16x2 r9241, r9182, r9183; +} +{ +mul.f16x2 r9244, r9241, r9171; +} +{ +add.f16x2 r9247, r9185, r9244; +} +{ +sub.f16x2 r9250, r9176, r9177; +} +{ +mul.f16x2 r9253, r9250, r9173; +} +{ +add.f16x2 r9256, r9247, r9253; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r9259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r9260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r9261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r9262, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r9265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r9266, {low, high}; +} +{ +mul.f16x2 r9275, r9114, r9259; +} +{ +mul.f16x2 r9278, r9150, r9260; +} +{ +sub.f16x2 r9281, r9275, r9278; +} +{ +mul.f16x2 r9284, r9114, r9260; +} +{ +fma.rn.f16x2 r9287, r9150, r9259, r9284; +} +{ +mul.f16x2 r9291, r9202, r9261; +} +{ +mul.f16x2 r9294, r9238, r9262; +} +{ +sub.f16x2 r9297, r9291, r9294; +} +{ +mul.f16x2 r9300, r9202, r9262; +} +{ +fma.rn.f16x2 r9303, r9238, r9261, r9300; +} +{ +mul.f16x2 r9307, r9132, r9261; +} +{ +mul.f16x2 r9310, r9168, r9262; +} +{ +sub.f16x2 r9313, r9307, r9310; +} +{ +mul.f16x2 r9316, r9132, r9262; +} +{ +fma.rn.f16x2 r9319, r9168, r9261, r9316; +} +{ +mul.f16x2 r9323, r9220, r9265; +} +{ +mul.f16x2 r9326, r9256, r9266; +} +{ +sub.f16x2 r9329, r9323, r9326; +} +{ +mul.f16x2 r9332, r9220, r9266; +} +{ +fma.rn.f16x2 r9335, r9256, r9265, r9332; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9340, {low, high}; +} +{ +neg.f16x2 r9341, r9340; +} +{ +add.f16x2 r9343, r9090, r9178; +} +{ +add.f16x2 %4, r9002, r9343; +} +{ +add.f16x2 r9349, r9096, r9184; +} +{ +add.f16x2 %5, r9008, r9349; +} +{ +add.f16x2 r9355, r9090, r9178; +} +{ +mul.f16x2 r9358, r9355, r9339; +} +{ +add.f16x2 r9361, r9002, r9358; +} +{ +sub.f16x2 r9364, r9096, r9184; +} +{ +mul.f16x2 r9367, r9364, r9341; +} +{ +add.f16x2 %22, r9361, r9367; +} +{ +add.f16x2 r9373, r9090, r9178; +} +{ +mul.f16x2 r9376, r9373, r9339; +} +{ +add.f16x2 r9379, r9002, r9376; +} +{ +sub.f16x2 r9382, r9096, r9184; +} +{ +mul.f16x2 r9385, r9382, r9341; +} +{ +sub.f16x2 %40, r9379, r9385; +} +{ +add.f16x2 r9391, r9096, r9184; +} +{ +mul.f16x2 r9394, r9391, r9339; +} +{ +add.f16x2 r9397, r9008, r9394; +} +{ +sub.f16x2 r9400, r9090, r9178; +} +{ +mul.f16x2 r9403, r9400, r9341; +} +{ +sub.f16x2 %23, r9397, r9403; +} +{ +add.f16x2 r9409, r9096, r9184; +} +{ +mul.f16x2 r9412, r9409, r9339; +} +{ +add.f16x2 r9415, r9008, r9412; +} +{ +sub.f16x2 r9418, r9090, r9178; +} +{ +mul.f16x2 r9421, r9418, r9341; +} +{ +add.f16x2 %41, r9415, r9421; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9427, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9428, {low, high}; +} +{ +neg.f16x2 r9429, r9428; +} +{ +add.f16x2 r9431, r9281, r9297; +} +{ +add.f16x2 %10, r9026, r9431; +} +{ +add.f16x2 r9437, r9287, r9303; +} +{ +add.f16x2 %11, r9062, r9437; +} +{ +add.f16x2 r9443, r9281, r9297; +} +{ +mul.f16x2 r9446, r9443, r9427; +} +{ +add.f16x2 r9449, r9026, r9446; +} +{ +sub.f16x2 r9452, r9287, r9303; +} +{ +mul.f16x2 r9455, r9452, r9429; +} +{ +add.f16x2 %28, r9449, r9455; +} +{ +add.f16x2 r9461, r9281, r9297; +} +{ +mul.f16x2 r9464, r9461, r9427; +} +{ 
+add.f16x2 r9467, r9026, r9464; +} +{ +sub.f16x2 r9470, r9287, r9303; +} +{ +mul.f16x2 r9473, r9470, r9429; +} +{ +sub.f16x2 %46, r9467, r9473; +} +{ +add.f16x2 r9479, r9287, r9303; +} +{ +mul.f16x2 r9482, r9479, r9427; +} +{ +add.f16x2 r9485, r9062, r9482; +} +{ +sub.f16x2 r9488, r9281, r9297; +} +{ +mul.f16x2 r9491, r9488, r9429; +} +{ +sub.f16x2 %29, r9485, r9491; +} +{ +add.f16x2 r9497, r9287, r9303; +} +{ +mul.f16x2 r9500, r9497, r9427; +} +{ +add.f16x2 r9503, r9062, r9500; +} +{ +sub.f16x2 r9506, r9281, r9297; +} +{ +mul.f16x2 r9509, r9506, r9429; +} +{ +add.f16x2 %47, r9503, r9509; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9515, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9516, {low, high}; +} +{ +neg.f16x2 r9517, r9516; +} +{ +add.f16x2 r9519, r9313, r9329; +} +{ +add.f16x2 %16, r9044, r9519; +} +{ +add.f16x2 r9525, r9319, r9335; +} +{ +add.f16x2 %17, r9080, r9525; +} +{ +add.f16x2 r9531, r9313, r9329; +} +{ +mul.f16x2 r9534, r9531, r9515; +} +{ +add.f16x2 r9537, r9044, r9534; +} +{ +sub.f16x2 r9540, r9319, r9335; +} +{ +mul.f16x2 r9543, r9540, r9517; +} +{ +add.f16x2 %34, r9537, r9543; +} +{ +add.f16x2 r9549, r9313, r9329; +} +{ +mul.f16x2 r9552, r9549, r9515; +} +{ +add.f16x2 r9555, r9044, r9552; +} +{ +sub.f16x2 r9558, r9319, r9335; +} +{ +mul.f16x2 r9561, r9558, r9517; +} +{ +sub.f16x2 %52, r9555, r9561; +} +{ +add.f16x2 r9567, r9319, r9335; +} +{ +mul.f16x2 r9570, r9567, r9515; +} +{ +add.f16x2 r9573, r9080, r9570; +} +{ +sub.f16x2 r9576, r9313, r9329; +} +{ +mul.f16x2 r9579, r9576, r9517; +} +{ +sub.f16x2 %35, r9573, r9579; +} +{ +add.f16x2 r9585, r9319, r9335; +} +{ +mul.f16x2 r9588, r9585, r9515; +} +{ +add.f16x2 r9591, r9080, r9588; +} +{ +sub.f16x2 r9594, r9313, r9329; +} +{ +mul.f16x2 r9597, r9594, r9517; +} +{ +add.f16x2 %53, r9591, r9597; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), 
"=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), 
"r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<899, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<294>; +.reg .b32 r<3353>; +.reg .b64 rd<8>; +mov.u32 r3330, %tid.y; +mov.u32 r3331, %18; +mad.lo.s32 r3332, r3330, 52488, r3331; +mov.u32 r3333, %tid.x; +mov.f32 f282, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1, {low, high}; +} +mov.f32 f284, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 
high, f284; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; 
+} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f242, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r265, {low, high}; +} +mov.f32 f244, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r266, {low, high}; +} +mov.f32 f246, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r267, {low, high}; +} +mov.f32 f248, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r268, {low, high}; +} +mov.f32 f254, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, 
f254; +mov.b32 r271, {low, high}; +} +mov.f32 f256, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ 
+add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ 
+mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r3333, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3334, rd3; +mul.lo.s32 r3335, r3334, 729; +sub.s32 r3336, r3333, r3335; +cvt.rn.f32.u32 f285, r3336; +mul.f32 f286, f285, 0f3A7B0B40; +cos.approx.f32 f57, f286; +sin.approx.f32 f287, f286; +neg.f32 f58, f287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ 
+fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r3337, r3334, 52488, r3332; +barrier.sync 0; +mad.lo.s32 r3338, r3336, 72, r3337; +st.shared.v2.f32 [r3338], {r352, r358}; +st.shared.v2.f32 [r3338+8], {r621, r628}; +st.shared.v2.f32 [r3338+16], {r658, r665}; +st.shared.v2.f32 [r3338+24], {r695, r702}; +st.shared.v2.f32 [r3338+32], {r732, r739}; +st.shared.v2.f32 [r3338+40], {r769, r776}; +st.shared.v2.f32 [r3338+48], {r806, r813}; +st.shared.v2.f32 [r3338+56], {r843, r850}; +st.shared.v2.f32 [r3338+64], {r880, r887}; +barrier.sync 0; +shl.b32 r3339, r3336, 6; +sub.s32 r3340, r3338, r3339; +ld.shared.u32 r916, [r3340]; +ld.shared.u32 r922, [r3340+4]; +ld.shared.u32 r1004, [r3340+5832]; +ld.shared.u32 r1010, [r3340+5836]; +ld.shared.u32 r1092, [r3340+11664]; +ld.shared.u32 r1098, [r3340+11668]; 
+ld.shared.u32 r913, [r3340+17496]; +ld.shared.u32 r919, [r3340+17500]; +ld.shared.u32 r1001, [r3340+23328]; +ld.shared.u32 r1007, [r3340+23332]; +ld.shared.u32 r1089, [r3340+29160]; +ld.shared.u32 r1095, [r3340+29164]; +ld.shared.u32 r914, [r3340+34992]; +ld.shared.u32 r920, [r3340+34996]; +ld.shared.u32 r1002, [r3340+40824]; +ld.shared.u32 r1008, [r3340+40828]; +ld.shared.u32 r1090, [r3340+46656]; +ld.shared.u32 r1096, [r3340+46660]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} 
+{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ 
+mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ 
+neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 
r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r3336, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r3341, rd5; +cvt.rn.f32.u32 f288, r3341; +mul.f32 f289, f288, 0f3C0D3654; +cos.approx.f32 f133, f289; +sin.approx.f32 f290, f289; +neg.f32 f134, f290; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; 
+cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +mul.lo.s32 r3342, r3341, 9; +sub.s32 r3343, r3336, r3342; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; 
+mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1765, {low, high}; +} +{ 
+mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +shl.b32 r3344, r3343, 3; +add.s32 r3345, r3337, r3344; +barrier.sync 0; +mad.lo.s32 r3346, r3341, 648, r3345; +st.shared.u32 [r3346], r1259; +st.shared.u32 [r3346+4], r1265; +st.shared.u32 [r3346+72], r1528; +st.shared.u32 [r3346+76], r1535; +st.shared.u32 [r3346+144], r1565; +st.shared.u32 [r3346+148], r1572; +st.shared.u32 [r3346+216], r1602; +st.shared.u32 [r3346+220], r1609; +st.shared.u32 [r3346+288], r1639; +st.shared.u32 [r3346+292], r1646; +st.shared.u32 [r3346+360], r1676; +st.shared.u32 [r3346+364], r1683; +st.shared.u32 [r3346+432], r1713; +st.shared.u32 [r3346+436], r1720; +st.shared.u32 [r3346+504], r1750; +st.shared.u32 [r3346+508], r1757; +st.shared.u32 [r3346+576], r1787; +st.shared.u32 [r3346+580], r1794; +barrier.sync 0; +ld.shared.u32 r1823, [r3340]; +ld.shared.u32 r1829, [r3340+4]; +ld.shared.u32 r1911, [r3340+5832]; +ld.shared.u32 r1917, [r3340+5836]; +ld.shared.u32 r1999, [r3340+11664]; +ld.shared.u32 r2005, [r3340+11668]; +ld.shared.u32 r1820, [r3340+17496]; +ld.shared.u32 r1826, [r3340+17500]; +ld.shared.u32 r1908, [r3340+23328]; +ld.shared.u32 r1914, [r3340+23332]; +ld.shared.u32 r1996, [r3340+29160]; +ld.shared.u32 r2002, [r3340+29164]; +ld.shared.u32 r1821, [r3340+34992]; +ld.shared.u32 r1827, [r3340+34996]; +ld.shared.u32 r1909, [r3340+40824]; +ld.shared.u32 r1915, [r3340+40828]; +ld.shared.u32 r1997, [r3340+46656]; +ld.shared.u32 r2003, [r3340+46660]; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, 
r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 r1998, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; 
+cvt.rn.f16.f32 high, f242; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, r2114; +} +{ +mul.f16x2 r2120, r2022, r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ +mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 r2166, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 r2172, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ +sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, 
r2184, r2161; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 r2208, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 r2226, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2248, {low, high}; +} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 r2254, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 r2260, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 r2278, r2269, r2275; +} +{ +add.f16x2 r2281, r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 r2314, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, r2101, r2117; +} +{ +mul.f16x2 
r2329, r2326, r2249; +} +{ +add.f16x2 r2332, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 r2342, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 r2348, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r2139, r2155; +} +{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} +{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 r2420, r2411, r2417; +} +mul.wide.u32 rd6, r3336, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3347, rd7; +cvt.rn.f32.u32 f291, r3347; +mul.f32 f292, f291, 0f3D9EDD1F; +cos.approx.f32 f209, f292; +sin.approx.f32 f293, f292; +neg.f32 f210, f293; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2423, {low, high}; +} +mul.lo.s32 r3348, r3347, 81; +sub.s32 r3349, r3336, r3348; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2426, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2428, {high, high}; +} +{ +mul.f16x2 r2430, r2260, r2428; +} +{ 
+neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r2254, r2426, r2433; +} +{ +mul.f16x2 r2439, r2254, r2428; +} +{ +fma.rn.f16x2 r2442, r2260, r2426, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2448, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2450, {low, high}; +} +{ +mul.f16x2 r2451, r2448, r2450; +} +{ +mul.f16x2 r2454, r2423, r2446; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2457, {high, low}; +} +{ +fma.rn.f16x2 r2459, r2451, r2457, r2454; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2463, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2465, {high, high}; +} +{ +mul.f16x2 r2467, r2348, r2465; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r2342, r2463, r2470; +} +{ +mul.f16x2 r2476, r2342, r2465; +} +{ +fma.rn.f16x2 r2479, r2348, r2463, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2485, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2487, {low, high}; +} +{ +mul.f16x2 r2488, r2485, r2487; +} +{ +mul.f16x2 r2491, r2459, r2483; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2494, {high, low}; +} +{ +fma.rn.f16x2 r2496, r2488, r2494, r2491; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2500, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2502, {high, high}; +} +{ +mul.f16x2 r2504, r2226, r2502; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r2190, r2500, r2507; +} +{ +mul.f16x2 r2513, r2190, r2502; +} +{ +fma.rn.f16x2 r2516, r2226, r2500, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2520, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2522, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2524, {low, high}; +} +{ +mul.f16x2 r2525, r2522, r2524; +} +{ +mul.f16x2 r2528, r2496, r2520; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2531, {high, low}; +} +{ +fma.rn.f16x2 r2533, r2525, r2531, r2528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2537, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2539, {high, high}; +} +{ +mul.f16x2 r2541, r2314, r2539; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r2278, r2537, r2544; +} +{ +mul.f16x2 r2550, r2278, r2539; +} +{ +fma.rn.f16x2 r2553, r2314, r2537, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2559, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2561, {low, high}; +} +{ +mul.f16x2 r2562, r2559, r2561; +} +{ +mul.f16x2 r2565, r2533, r2557; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2568, {high, low}; +} +{ +fma.rn.f16x2 r2570, r2562, r2568, r2565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2576, {high, high}; +} +{ +mul.f16x2 r2578, r2402, r2576; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r2366, r2574, r2581; +} +{ +mul.f16x2 r2587, r2366, r2576; +} +{ +fma.rn.f16x2 r2590, r2402, r2574, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2598, {low, high}; +} +{ +mul.f16x2 r2599, r2596, r2598; +} +{ +mul.f16x2 r2602, r2570, r2594; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2605, {high, low}; +} +{ +fma.rn.f16x2 r2607, r2599, r2605, r2602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2613, {high, high}; +} +{ +mul.f16x2 r2615, r2244, r2613; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r2208, r2611, r2618; +} +{ +mul.f16x2 r2624, r2208, r2613; +} +{ +fma.rn.f16x2 r2627, r2244, r2611, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2635, {low, high}; +} +{ +mul.f16x2 r2636, r2633, r2635; +} +{ +mul.f16x2 r2639, r2607, r2631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2642, {high, low}; +} +{ +fma.rn.f16x2 r2644, r2636, r2642, r2639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2650, {high, high}; +} +{ +mul.f16x2 r2652, r2332, r2650; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r2296, r2648, r2655; +} +{ +mul.f16x2 r2661, r2296, r2650; +} +{ +fma.rn.f16x2 r2664, r2332, r2648, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2672, {low, high}; +} +{ +mul.f16x2 r2673, r2670, r2672; +} +{ +mul.f16x2 r2676, r2644, r2668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2679, {high, low}; +} +{ +fma.rn.f16x2 r2681, r2673, r2679, r2676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2681; +mov.b32 r2687, {high, high}; +} +{ +mul.f16x2 r2689, r2420, r2687; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r2384, r2685, r2692; +} +{ +mul.f16x2 r2698, r2384, r2687; +} +{ +fma.rn.f16x2 r2701, r2420, r2685, r2698; +} +shl.b32 r3350, r3349, 3; +add.s32 r3351, r3337, r3350; +barrier.sync 0; +mad.lo.s32 r3352, r3347, 5832, r3351; +st.shared.u32 [r3352], r2166; +st.shared.u32 [r3352+4], r2172; +st.shared.u32 [r3352+648], r2435; +st.shared.u32 [r3352+652], r2442; +st.shared.u32 [r3352+1296], r2472; +st.shared.u32 [r3352+1300], r2479; +st.shared.u32 [r3352+1944], r2509; +st.shared.u32 [r3352+1948], r2516; +st.shared.u32 [r3352+2592], r2546; +st.shared.u32 [r3352+2596], r2553; +st.shared.u32 [r3352+3240], r2583; +st.shared.u32 [r3352+3244], r2590; +st.shared.u32 [r3352+3888], r2620; +st.shared.u32 [r3352+3892], r2627; +st.shared.u32 [r3352+4536], r2657; +st.shared.u32 [r3352+4540], r2664; +st.shared.u32 [r3352+5184], r2694; +st.shared.u32 [r3352+5188], r2701; +barrier.sync 0; +ld.shared.u32 r2730, [r3340]; +ld.shared.u32 r2736, [r3340+4]; +ld.shared.u32 r2818, [r3340+5832]; +ld.shared.u32 r2824, [r3340+5836]; +ld.shared.u32 r2906, [r3340+11664]; +ld.shared.u32 r2912, [r3340+11668]; +ld.shared.u32 r2727, [r3340+17496]; +ld.shared.u32 r2733, [r3340+17500]; +ld.shared.u32 r2815, [r3340+23328]; +ld.shared.u32 r2821, [r3340+23332]; +ld.shared.u32 r2903, [r3340+29160]; +ld.shared.u32 r2909, [r3340+29164]; +ld.shared.u32 r2728, [r3340+34992]; +ld.shared.u32 r2734, [r3340+34996]; +ld.shared.u32 r2816, [r3340+40824]; +ld.shared.u32 r2822, [r3340+40828]; +ld.shared.u32 r2904, [r3340+46656]; +ld.shared.u32 r2910, [r3340+46660]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2723, {low, high}; +} +{ +neg.f16x2 r2724, r2723; +} +{ +add.f16x2 r2726, r2727, r2728; +} +{ +add.f16x2 r2729, r2730, r2726; 
+} +{ +add.f16x2 r2732, r2733, r2734; +} +{ +add.f16x2 r2735, r2736, r2732; +} +{ +add.f16x2 r2738, r2727, r2728; +} +{ +mul.f16x2 r2741, r2738, r2722; +} +{ +add.f16x2 r2744, r2730, r2741; +} +{ +sub.f16x2 r2747, r2733, r2734; +} +{ +mul.f16x2 r2750, r2747, r2724; +} +{ +add.f16x2 r2753, r2744, r2750; +} +{ +add.f16x2 r2756, r2727, r2728; +} +{ +mul.f16x2 r2759, r2756, r2722; +} +{ +add.f16x2 r2762, r2730, r2759; +} +{ +sub.f16x2 r2765, r2733, r2734; +} +{ +mul.f16x2 r2768, r2765, r2724; +} +{ +sub.f16x2 r2771, r2762, r2768; +} +{ +add.f16x2 r2774, r2733, r2734; +} +{ +mul.f16x2 r2777, r2774, r2722; +} +{ +add.f16x2 r2780, r2736, r2777; +} +{ +sub.f16x2 r2783, r2727, r2728; +} +{ +mul.f16x2 r2786, r2783, r2724; +} +{ +sub.f16x2 r2789, r2780, r2786; +} +{ +add.f16x2 r2792, r2733, r2734; +} +{ +mul.f16x2 r2795, r2792, r2722; +} +{ +add.f16x2 r2798, r2736, r2795; +} +{ +sub.f16x2 r2801, r2727, r2728; +} +{ +mul.f16x2 r2804, r2801, r2724; +} +{ +add.f16x2 r2807, r2798, r2804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2810, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2811, {low, high}; +} +{ +neg.f16x2 r2812, r2811; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 r2817, r2818, r2814; +} +{ +add.f16x2 r2820, r2821, r2822; +} +{ +add.f16x2 r2823, r2824, r2820; +} +{ +add.f16x2 r2826, r2815, r2816; +} +{ +mul.f16x2 r2829, r2826, r2810; +} +{ +add.f16x2 r2832, r2818, r2829; +} +{ +sub.f16x2 r2835, r2821, r2822; +} +{ +mul.f16x2 r2838, r2835, r2812; +} +{ +add.f16x2 r2841, r2832, r2838; +} +{ +add.f16x2 r2844, r2815, r2816; +} +{ +mul.f16x2 r2847, r2844, r2810; +} +{ +add.f16x2 r2850, r2818, r2847; +} +{ +sub.f16x2 r2853, r2821, r2822; +} +{ +mul.f16x2 r2856, r2853, r2812; +} +{ +sub.f16x2 r2859, r2850, r2856; +} +{ +add.f16x2 r2862, r2821, r2822; +} +{ +mul.f16x2 r2865, r2862, r2810; +} +{ +add.f16x2 r2868, r2824, r2865; +} +{ +sub.f16x2 r2871, r2815, 
r2816; +} +{ +mul.f16x2 r2874, r2871, r2812; +} +{ +sub.f16x2 r2877, r2868, r2874; +} +{ +add.f16x2 r2880, r2821, r2822; +} +{ +mul.f16x2 r2883, r2880, r2810; +} +{ +add.f16x2 r2886, r2824, r2883; +} +{ +sub.f16x2 r2889, r2815, r2816; +} +{ +mul.f16x2 r2892, r2889, r2812; +} +{ +add.f16x2 r2895, r2886, r2892; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2898, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2899, {low, high}; +} +{ +neg.f16x2 r2900, r2899; +} +{ +add.f16x2 r2902, r2903, r2904; +} +{ +add.f16x2 r2905, r2906, r2902; +} +{ +add.f16x2 r2908, r2909, r2910; +} +{ +add.f16x2 r2911, r2912, r2908; +} +{ +add.f16x2 r2914, r2903, r2904; +} +{ +mul.f16x2 r2917, r2914, r2898; +} +{ +add.f16x2 r2920, r2906, r2917; +} +{ +sub.f16x2 r2923, r2909, r2910; +} +{ +mul.f16x2 r2926, r2923, r2900; +} +{ +add.f16x2 r2929, r2920, r2926; +} +{ +add.f16x2 r2932, r2903, r2904; +} +{ +mul.f16x2 r2935, r2932, r2898; +} +{ +add.f16x2 r2938, r2906, r2935; +} +{ +sub.f16x2 r2941, r2909, r2910; +} +{ +mul.f16x2 r2944, r2941, r2900; +} +{ +sub.f16x2 r2947, r2938, r2944; +} +{ +add.f16x2 r2950, r2909, r2910; +} +{ +mul.f16x2 r2953, r2950, r2898; +} +{ +add.f16x2 r2956, r2912, r2953; +} +{ +sub.f16x2 r2959, r2903, r2904; +} +{ +mul.f16x2 r2962, r2959, r2900; +} +{ +sub.f16x2 r2965, r2956, r2962; +} +{ +add.f16x2 r2968, r2909, r2910; +} +{ +mul.f16x2 r2971, r2968, r2898; +} +{ +add.f16x2 r2974, r2912, r2971; +} +{ +sub.f16x2 r2977, r2903, r2904; +} +{ +mul.f16x2 r2980, r2977, r2900; +} +{ +add.f16x2 r2983, r2974, r2980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2986, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2987, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2988, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2989, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2992, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2993, {low, high}; +} +{ +mul.f16x2 r3002, r2841, r2986; +} +{ +mul.f16x2 r3005, r2877, r2987; +} +{ +sub.f16x2 r3008, r3002, r3005; +} +{ +mul.f16x2 r3011, r2841, r2987; +} +{ +fma.rn.f16x2 r3014, r2877, r2986, r3011; +} +{ +mul.f16x2 r3018, r2929, r2988; +} +{ +mul.f16x2 r3021, r2965, r2989; +} +{ +sub.f16x2 r3024, r3018, r3021; +} +{ +mul.f16x2 r3027, r2929, r2989; +} +{ +fma.rn.f16x2 r3030, r2965, r2988, r3027; +} +{ +mul.f16x2 r3034, r2859, r2988; +} +{ +mul.f16x2 r3037, r2895, r2989; +} +{ +sub.f16x2 r3040, r3034, r3037; +} +{ +mul.f16x2 r3043, r2859, r2989; +} +{ +fma.rn.f16x2 r3046, r2895, r2988, r3043; +} +{ +mul.f16x2 r3050, r2947, r2992; +} +{ +mul.f16x2 r3053, r2983, r2993; +} +{ +sub.f16x2 r3056, r3050, r3053; +} +{ +mul.f16x2 r3059, r2947, r2993; +} +{ +fma.rn.f16x2 r3062, r2983, r2992, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3067, {low, high}; +} +{ +neg.f16x2 r3068, r3067; +} +{ +add.f16x2 r3070, r2817, r2905; +} +{ +add.f16x2 %0, r2729, r3070; +} +{ +add.f16x2 r3076, r2823, r2911; +} +{ +add.f16x2 %1, r2735, r3076; +} +{ +add.f16x2 r3082, r2817, r2905; +} +{ +mul.f16x2 r3085, r3082, r3066; +} +{ +add.f16x2 r3088, r2729, r3085; +} +{ +sub.f16x2 r3091, r2823, r2911; +} +{ +mul.f16x2 r3094, r3091, r3068; +} +{ +add.f16x2 %6, r3088, r3094; +} +{ +add.f16x2 r3100, r2817, r2905; +} +{ +mul.f16x2 r3103, r3100, r3066; +} +{ +add.f16x2 r3106, r2729, r3103; +} +{ +sub.f16x2 r3109, r2823, r2911; +} +{ +mul.f16x2 r3112, r3109, r3068; +} +{ +sub.f16x2 %12, r3106, r3112; +} +{ +add.f16x2 r3118, r2823, r2911; 
+} +{ +mul.f16x2 r3121, r3118, r3066; +} +{ +add.f16x2 r3124, r2735, r3121; +} +{ +sub.f16x2 r3127, r2817, r2905; +} +{ +mul.f16x2 r3130, r3127, r3068; +} +{ +sub.f16x2 %7, r3124, r3130; +} +{ +add.f16x2 r3136, r2823, r2911; +} +{ +mul.f16x2 r3139, r3136, r3066; +} +{ +add.f16x2 r3142, r2735, r3139; +} +{ +sub.f16x2 r3145, r2817, r2905; +} +{ +mul.f16x2 r3148, r3145, r3068; +} +{ +add.f16x2 %13, r3142, r3148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3155, {low, high}; +} +{ +neg.f16x2 r3156, r3155; +} +{ +add.f16x2 r3158, r3008, r3024; +} +{ +add.f16x2 %2, r2753, r3158; +} +{ +add.f16x2 r3164, r3014, r3030; +} +{ +add.f16x2 %3, r2789, r3164; +} +{ +add.f16x2 r3170, r3008, r3024; +} +{ +mul.f16x2 r3173, r3170, r3154; +} +{ +add.f16x2 r3176, r2753, r3173; +} +{ +sub.f16x2 r3179, r3014, r3030; +} +{ +mul.f16x2 r3182, r3179, r3156; +} +{ +add.f16x2 %8, r3176, r3182; +} +{ +add.f16x2 r3188, r3008, r3024; +} +{ +mul.f16x2 r3191, r3188, r3154; +} +{ +add.f16x2 r3194, r2753, r3191; +} +{ +sub.f16x2 r3197, r3014, r3030; +} +{ +mul.f16x2 r3200, r3197, r3156; +} +{ +sub.f16x2 %14, r3194, r3200; +} +{ +add.f16x2 r3206, r3014, r3030; +} +{ +mul.f16x2 r3209, r3206, r3154; +} +{ +add.f16x2 r3212, r2789, r3209; +} +{ +sub.f16x2 r3215, r3008, r3024; +} +{ +mul.f16x2 r3218, r3215, r3156; +} +{ +sub.f16x2 %9, r3212, r3218; +} +{ +add.f16x2 r3224, r3014, r3030; +} +{ +mul.f16x2 r3227, r3224, r3154; +} +{ +add.f16x2 r3230, r2789, r3227; +} +{ +sub.f16x2 r3233, r3008, r3024; +} +{ +mul.f16x2 r3236, r3233, r3156; +} +{ +add.f16x2 %15, r3230, r3236; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3243, {low, high}; +} +{ +neg.f16x2 r3244, r3243; +} +{ +add.f16x2 r3246, 
r3040, r3056; +} +{ +add.f16x2 %4, r2771, r3246; +} +{ +add.f16x2 r3252, r3046, r3062; +} +{ +add.f16x2 %5, r2807, r3252; +} +{ +add.f16x2 r3258, r3040, r3056; +} +{ +mul.f16x2 r3261, r3258, r3242; +} +{ +add.f16x2 r3264, r2771, r3261; +} +{ +sub.f16x2 r3267, r3046, r3062; +} +{ +mul.f16x2 r3270, r3267, r3244; +} +{ +add.f16x2 %10, r3264, r3270; +} +{ +add.f16x2 r3276, r3040, r3056; +} +{ +mul.f16x2 r3279, r3276, r3242; +} +{ +add.f16x2 r3282, r2771, r3279; +} +{ +sub.f16x2 r3285, r3046, r3062; +} +{ +mul.f16x2 r3288, r3285, r3244; +} +{ +sub.f16x2 %16, r3282, r3288; +} +{ +add.f16x2 r3294, r3046, r3062; +} +{ +mul.f16x2 r3297, r3294, r3242; +} +{ +add.f16x2 r3300, r2807, r3297; +} +{ +sub.f16x2 r3303, r3040, r3056; +} +{ +mul.f16x2 r3306, r3303, r3244; +} +{ +sub.f16x2 %11, r3300, r3306; +} +{ +add.f16x2 r3312, r3046, r3062; +} +{ +mul.f16x2 r3315, r3312, r3242; +} +{ +add.f16x2 r3318, r2807, r3315; +} +{ +sub.f16x2 r3321, r3040, r3056; +} +{ +mul.f16x2 r3324, r3321, r3244; +} +{ +add.f16x2 %17, r3318, r3324; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), 
"r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<898, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<294>; +.reg .b32 r<3353>; +.reg .b64 rd<8>; +mov.u32 r3330, %tid.y; +mov.u32 r3331, %18; +mad.lo.s32 r3332, r3330, 26244, r3331; +mov.u32 r3333, %tid.x; +mov.f32 f282, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1, {low, high}; +} +mov.f32 f284, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ 
+add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, 
r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f242, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r265, {low, high}; +} +mov.f32 f244, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r266, {low, high}; +} +mov.f32 f246, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r267, {low, high}; +} +mov.f32 f248, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r268, {low, high}; +} +mov.f32 f254, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r271, {low, high}; +} +mov.f32 f256, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ 
+add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 
r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r3333, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3334, rd3; +mul.lo.s32 r3335, r3334, 729; +sub.s32 r3336, r3333, r3335; +mad.lo.s32 r3337, r3334, 26244, r3332; +cvt.rn.f32.u32 f285, r3336; +mul.f32 f286, f285, 0f3A7B0B40; +cos.approx.f32 f57, f286; +sin.approx.f32 f287, f286; +neg.f32 f58, f287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 
r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; 
+mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, 
r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ 
+fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r3338, r3336, 36, r3337; +st.shared.u32 [r3338], r352; +st.shared.u32 [r3338+4], r621; +st.shared.u32 [r3338+8], r658; +st.shared.u32 [r3338+12], r695; +st.shared.u32 [r3338+16], r732; +st.shared.u32 [r3338+20], r769; +st.shared.u32 [r3338+24], r806; +st.shared.u32 [r3338+28], r843; +st.shared.u32 [r3338+32], r880; +barrier.sync 0; +shl.b32 r3339, r3336, 5; +sub.s32 r3340, r3338, r3339; +ld.shared.u32 r916, [r3340]; +ld.shared.u32 r1004, [r3340+2916]; +ld.shared.u32 r1092, [r3340+5832]; +ld.shared.u32 r913, [r3340+8748]; +ld.shared.u32 r1001, [r3340+11664]; +ld.shared.u32 r1089, [r3340+14580]; +ld.shared.u32 r914, [r3340+17496]; +ld.shared.u32 r1002, [r3340+20412]; +ld.shared.u32 r1090, [r3340+23328]; +barrier.sync 0; +st.shared.u32 [r3338], r358; +st.shared.u32 [r3338+4], r628; +st.shared.u32 [r3338+8], r665; +st.shared.u32 [r3338+12], r702; +st.shared.u32 [r3338+16], r739; +st.shared.u32 [r3338+20], r776; +st.shared.u32 [r3338+24], r813; +st.shared.u32 [r3338+28], r850; +st.shared.u32 [r3338+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r3340]; +ld.shared.u32 r1010, [r3340+2916]; +ld.shared.u32 r1098, [r3340+5832]; +ld.shared.u32 r919, [r3340+8748]; +ld.shared.u32 r1007, [r3340+11664]; +ld.shared.u32 r1095, [r3340+14580]; +ld.shared.u32 r920, [r3340+17496]; +ld.shared.u32 r1008, [r3340+20412]; +ld.shared.u32 r1096, [r3340+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ 
+add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, 
r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, 
r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ 
+add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r3336, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r3341, rd5; +mul.lo.s32 r3342, r3341, 9; +sub.s32 r3343, r3336, r3342; +shl.b32 r3344, r3343, 2; +add.s32 r3345, r3337, r3344; +cvt.rn.f32.u32 f288, r3341; +mul.f32 f289, f288, 0f3C0D3654; +cos.approx.f32 f133, f289; +sin.approx.f32 f290, f289; +neg.f32 f134, f290; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ 
+fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ 
+neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +barrier.sync 0; +mad.lo.s32 r3346, r3341, 324, r3345; +st.shared.u32 [r3346], r1259; +st.shared.u32 [r3346+36], r1528; +st.shared.u32 [r3346+72], r1565; +st.shared.u32 [r3346+108], r1602; +st.shared.u32 [r3346+144], r1639; +st.shared.u32 [r3346+180], r1676; +st.shared.u32 [r3346+216], r1713; +st.shared.u32 [r3346+252], r1750; +st.shared.u32 [r3346+288], 
r1787; +barrier.sync 0; +ld.shared.u32 r1823, [r3340]; +ld.shared.u32 r1911, [r3340+2916]; +ld.shared.u32 r1999, [r3340+5832]; +ld.shared.u32 r1820, [r3340+8748]; +ld.shared.u32 r1908, [r3340+11664]; +ld.shared.u32 r1996, [r3340+14580]; +ld.shared.u32 r1821, [r3340+17496]; +ld.shared.u32 r1909, [r3340+20412]; +ld.shared.u32 r1997, [r3340+23328]; +barrier.sync 0; +st.shared.u32 [r3346], r1265; +st.shared.u32 [r3346+36], r1535; +st.shared.u32 [r3346+72], r1572; +st.shared.u32 [r3346+108], r1609; +st.shared.u32 [r3346+144], r1646; +st.shared.u32 [r3346+180], r1683; +st.shared.u32 [r3346+216], r1720; +st.shared.u32 [r3346+252], r1757; +st.shared.u32 [r3346+288], r1794; +barrier.sync 0; +ld.shared.u32 r1829, [r3340]; +ld.shared.u32 r1917, [r3340+2916]; +ld.shared.u32 r2005, [r3340+5832]; +ld.shared.u32 r1826, [r3340+8748]; +ld.shared.u32 r1914, [r3340+11664]; +ld.shared.u32 r2002, [r3340+14580]; +ld.shared.u32 r1827, [r3340+17496]; +ld.shared.u32 r1915, [r3340+20412]; +ld.shared.u32 r2003, [r3340+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ 
+sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 r1998, r1999, r1995; +} +{ 
+add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, r2114; +} +{ +mul.f16x2 r2120, r2022, 
r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ +mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 r2166, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 r2172, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ +sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, r2184, r2161; +} +{ +add.f16x2 r2190, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 r2208, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 r2226, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2248, {low, high}; 
+} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 r2254, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 r2260, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 r2278, r2269, r2275; +} +{ +add.f16x2 r2281, r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 r2296, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 r2314, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, r2101, r2117; +} +{ +mul.f16x2 r2329, r2326, r2249; +} +{ +add.f16x2 r2332, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 r2342, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 r2348, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r2139, r2155; +} 
+{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} +{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 r2420, r2411, r2417; +} +mul.wide.u32 rd6, r3336, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3347, rd7; +mul.lo.s32 r3348, r3347, 81; +sub.s32 r3349, r3336, r3348; +shl.b32 r3350, r3349, 2; +add.s32 r3351, r3337, r3350; +cvt.rn.f32.u32 f291, r3347; +mul.f32 f292, f291, 0f3D9EDD1F; +cos.approx.f32 f209, f292; +sin.approx.f32 f293, f292; +neg.f32 f210, f293; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2423, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2426, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2428, {high, high}; +} +{ +mul.f16x2 r2430, r2260, r2428; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r2254, r2426, r2433; +} +{ +mul.f16x2 r2439, r2254, r2428; +} +{ +fma.rn.f16x2 r2442, r2260, r2426, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2448, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2450, {low, high}; +} +{ +mul.f16x2 r2451, r2448, r2450; +} +{ +mul.f16x2 r2454, r2423, r2446; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2457, {high, low}; +} +{ +fma.rn.f16x2 r2459, r2451, r2457, r2454; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2463, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2465, {high, high}; +} +{ +mul.f16x2 r2467, r2348, r2465; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 
r2472, r2342, r2463, r2470; +} +{ +mul.f16x2 r2476, r2342, r2465; +} +{ +fma.rn.f16x2 r2479, r2348, r2463, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2485, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2487, {low, high}; +} +{ +mul.f16x2 r2488, r2485, r2487; +} +{ +mul.f16x2 r2491, r2459, r2483; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2459; +mov.b32 r2494, {high, low}; +} +{ +fma.rn.f16x2 r2496, r2488, r2494, r2491; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2500, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2502, {high, high}; +} +{ +mul.f16x2 r2504, r2226, r2502; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r2190, r2500, r2507; +} +{ +mul.f16x2 r2513, r2190, r2502; +} +{ +fma.rn.f16x2 r2516, r2226, r2500, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2522, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2524, {low, high}; +} +{ +mul.f16x2 r2525, r2522, r2524; +} +{ +mul.f16x2 r2528, r2496, r2520; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2496; +mov.b32 r2531, {high, low}; +} +{ +fma.rn.f16x2 r2533, r2525, r2531, r2528; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2537, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2539, {high, high}; +} +{ +mul.f16x2 r2541, r2314, r2539; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r2278, r2537, r2544; +} +{ +mul.f16x2 r2550, r2278, r2539; +} +{ +fma.rn.f16x2 r2553, r2314, r2537, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; 
+mov.b32 r2559, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2561, {low, high}; +} +{ +mul.f16x2 r2562, r2559, r2561; +} +{ +mul.f16x2 r2565, r2533, r2557; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2533; +mov.b32 r2568, {high, low}; +} +{ +fma.rn.f16x2 r2570, r2562, r2568, r2565; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2576, {high, high}; +} +{ +mul.f16x2 r2578, r2402, r2576; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r2366, r2574, r2581; +} +{ +mul.f16x2 r2587, r2366, r2576; +} +{ +fma.rn.f16x2 r2590, r2402, r2574, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2598, {low, high}; +} +{ +mul.f16x2 r2599, r2596, r2598; +} +{ +mul.f16x2 r2602, r2570, r2594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2570; +mov.b32 r2605, {high, low}; +} +{ +fma.rn.f16x2 r2607, r2599, r2605, r2602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2607; +mov.b32 r2613, {high, high}; +} +{ +mul.f16x2 r2615, r2244, r2613; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r2208, r2611, r2618; +} +{ +mul.f16x2 r2624, r2208, r2613; +} +{ +fma.rn.f16x2 r2627, r2244, r2611, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2635, {low, high}; +} +{ +mul.f16x2 r2636, r2633, r2635; +} +{ +mul.f16x2 r2639, r2607, r2631; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2607; +mov.b32 r2642, {high, low}; +} +{ +fma.rn.f16x2 r2644, r2636, r2642, r2639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2650, {high, high}; +} +{ +mul.f16x2 r2652, r2332, r2650; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r2296, r2648, r2655; +} +{ +mul.f16x2 r2661, r2296, r2650; +} +{ +fma.rn.f16x2 r2664, r2332, r2648, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2672, {low, high}; +} +{ +mul.f16x2 r2673, r2670, r2672; +} +{ +mul.f16x2 r2676, r2644, r2668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2644; +mov.b32 r2679, {high, low}; +} +{ +fma.rn.f16x2 r2681, r2673, r2679, r2676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2681; +mov.b32 r2687, {high, high}; +} +{ +mul.f16x2 r2689, r2420, r2687; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r2384, r2685, r2692; +} +{ +mul.f16x2 r2698, r2384, r2687; +} +{ +fma.rn.f16x2 r2701, r2420, r2685, r2698; +} +barrier.sync 0; +mad.lo.s32 r3352, r3347, 2916, r3351; +st.shared.u32 [r3352], r2166; +st.shared.u32 [r3352+324], r2435; +st.shared.u32 [r3352+648], r2472; +st.shared.u32 [r3352+972], r2509; +st.shared.u32 [r3352+1296], r2546; +st.shared.u32 [r3352+1620], r2583; +st.shared.u32 [r3352+1944], r2620; +st.shared.u32 [r3352+2268], r2657; +st.shared.u32 [r3352+2592], r2694; +barrier.sync 0; +ld.shared.u32 r2730, [r3340]; +ld.shared.u32 r2818, [r3340+2916]; +ld.shared.u32 r2906, [r3340+5832]; +ld.shared.u32 r2727, [r3340+8748]; +ld.shared.u32 r2815, [r3340+11664]; +ld.shared.u32 r2903, [r3340+14580]; +ld.shared.u32 r2728, [r3340+17496]; +ld.shared.u32 r2816, 
[r3340+20412]; +ld.shared.u32 r2904, [r3340+23328]; +barrier.sync 0; +st.shared.u32 [r3352], r2172; +st.shared.u32 [r3352+324], r2442; +st.shared.u32 [r3352+648], r2479; +st.shared.u32 [r3352+972], r2516; +st.shared.u32 [r3352+1296], r2553; +st.shared.u32 [r3352+1620], r2590; +st.shared.u32 [r3352+1944], r2627; +st.shared.u32 [r3352+2268], r2664; +st.shared.u32 [r3352+2592], r2701; +barrier.sync 0; +ld.shared.u32 r2736, [r3340]; +ld.shared.u32 r2824, [r3340+2916]; +ld.shared.u32 r2912, [r3340+5832]; +ld.shared.u32 r2733, [r3340+8748]; +ld.shared.u32 r2821, [r3340+11664]; +ld.shared.u32 r2909, [r3340+14580]; +ld.shared.u32 r2734, [r3340+17496]; +ld.shared.u32 r2822, [r3340+20412]; +ld.shared.u32 r2910, [r3340+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2723, {low, high}; +} +{ +neg.f16x2 r2724, r2723; +} +{ +add.f16x2 r2726, r2727, r2728; +} +{ +add.f16x2 r2729, r2730, r2726; +} +{ +add.f16x2 r2732, r2733, r2734; +} +{ +add.f16x2 r2735, r2736, r2732; +} +{ +add.f16x2 r2738, r2727, r2728; +} +{ +mul.f16x2 r2741, r2738, r2722; +} +{ +add.f16x2 r2744, r2730, r2741; +} +{ +sub.f16x2 r2747, r2733, r2734; +} +{ +mul.f16x2 r2750, r2747, r2724; +} +{ +add.f16x2 r2753, r2744, r2750; +} +{ +add.f16x2 r2756, r2727, r2728; +} +{ +mul.f16x2 r2759, r2756, r2722; +} +{ +add.f16x2 r2762, r2730, r2759; +} +{ +sub.f16x2 r2765, r2733, r2734; +} +{ +mul.f16x2 r2768, r2765, r2724; +} +{ +sub.f16x2 r2771, r2762, r2768; +} +{ +add.f16x2 r2774, r2733, r2734; +} +{ +mul.f16x2 r2777, r2774, r2722; +} +{ +add.f16x2 r2780, r2736, r2777; +} +{ +sub.f16x2 r2783, r2727, r2728; +} +{ +mul.f16x2 r2786, r2783, r2724; +} +{ +sub.f16x2 r2789, r2780, r2786; +} +{ +add.f16x2 r2792, r2733, r2734; +} +{ +mul.f16x2 r2795, r2792, r2722; +} +{ +add.f16x2 r2798, r2736, r2795; +} +{ +sub.f16x2 r2801, r2727, r2728; +} +{ +mul.f16x2 r2804, r2801, 
r2724; +} +{ +add.f16x2 r2807, r2798, r2804; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2810, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2811, {low, high}; +} +{ +neg.f16x2 r2812, r2811; +} +{ +add.f16x2 r2814, r2815, r2816; +} +{ +add.f16x2 r2817, r2818, r2814; +} +{ +add.f16x2 r2820, r2821, r2822; +} +{ +add.f16x2 r2823, r2824, r2820; +} +{ +add.f16x2 r2826, r2815, r2816; +} +{ +mul.f16x2 r2829, r2826, r2810; +} +{ +add.f16x2 r2832, r2818, r2829; +} +{ +sub.f16x2 r2835, r2821, r2822; +} +{ +mul.f16x2 r2838, r2835, r2812; +} +{ +add.f16x2 r2841, r2832, r2838; +} +{ +add.f16x2 r2844, r2815, r2816; +} +{ +mul.f16x2 r2847, r2844, r2810; +} +{ +add.f16x2 r2850, r2818, r2847; +} +{ +sub.f16x2 r2853, r2821, r2822; +} +{ +mul.f16x2 r2856, r2853, r2812; +} +{ +sub.f16x2 r2859, r2850, r2856; +} +{ +add.f16x2 r2862, r2821, r2822; +} +{ +mul.f16x2 r2865, r2862, r2810; +} +{ +add.f16x2 r2868, r2824, r2865; +} +{ +sub.f16x2 r2871, r2815, r2816; +} +{ +mul.f16x2 r2874, r2871, r2812; +} +{ +sub.f16x2 r2877, r2868, r2874; +} +{ +add.f16x2 r2880, r2821, r2822; +} +{ +mul.f16x2 r2883, r2880, r2810; +} +{ +add.f16x2 r2886, r2824, r2883; +} +{ +sub.f16x2 r2889, r2815, r2816; +} +{ +mul.f16x2 r2892, r2889, r2812; +} +{ +add.f16x2 r2895, r2886, r2892; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2898, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2899, {low, high}; +} +{ +neg.f16x2 r2900, r2899; +} +{ +add.f16x2 r2902, r2903, r2904; +} +{ +add.f16x2 r2905, r2906, r2902; +} +{ +add.f16x2 r2908, r2909, r2910; +} +{ +add.f16x2 r2911, r2912, r2908; +} +{ +add.f16x2 r2914, r2903, r2904; +} +{ +mul.f16x2 r2917, r2914, r2898; +} +{ +add.f16x2 r2920, r2906, r2917; +} +{ +sub.f16x2 r2923, r2909, r2910; +} +{ +mul.f16x2 r2926, r2923, r2900; +} +{ +add.f16x2 r2929, r2920, r2926; 
+} +{ +add.f16x2 r2932, r2903, r2904; +} +{ +mul.f16x2 r2935, r2932, r2898; +} +{ +add.f16x2 r2938, r2906, r2935; +} +{ +sub.f16x2 r2941, r2909, r2910; +} +{ +mul.f16x2 r2944, r2941, r2900; +} +{ +sub.f16x2 r2947, r2938, r2944; +} +{ +add.f16x2 r2950, r2909, r2910; +} +{ +mul.f16x2 r2953, r2950, r2898; +} +{ +add.f16x2 r2956, r2912, r2953; +} +{ +sub.f16x2 r2959, r2903, r2904; +} +{ +mul.f16x2 r2962, r2959, r2900; +} +{ +sub.f16x2 r2965, r2956, r2962; +} +{ +add.f16x2 r2968, r2909, r2910; +} +{ +mul.f16x2 r2971, r2968, r2898; +} +{ +add.f16x2 r2974, r2912, r2971; +} +{ +sub.f16x2 r2977, r2903, r2904; +} +{ +mul.f16x2 r2980, r2977, r2900; +} +{ +add.f16x2 r2983, r2974, r2980; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2986, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2987, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2988, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2989, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2992, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2993, {low, high}; +} +{ +mul.f16x2 r3002, r2841, r2986; +} +{ +mul.f16x2 r3005, r2877, r2987; +} +{ +sub.f16x2 r3008, r3002, r3005; +} +{ +mul.f16x2 r3011, r2841, r2987; +} +{ +fma.rn.f16x2 r3014, r2877, r2986, r3011; +} +{ +mul.f16x2 r3018, r2929, r2988; +} +{ +mul.f16x2 r3021, r2965, r2989; +} +{ +sub.f16x2 r3024, r3018, r3021; +} +{ +mul.f16x2 r3027, r2929, r2989; +} +{ +fma.rn.f16x2 r3030, r2965, r2988, r3027; +} +{ +mul.f16x2 r3034, r2859, r2988; +} +{ +mul.f16x2 r3037, r2895, r2989; +} +{ +sub.f16x2 r3040, r3034, r3037; +} +{ +mul.f16x2 r3043, r2859, r2989; +} +{ +fma.rn.f16x2 r3046, r2895, r2988, r3043; +} +{ +mul.f16x2 r3050, r2947, r2992; +} +{ 
+mul.f16x2 r3053, r2983, r2993; +} +{ +sub.f16x2 r3056, r3050, r3053; +} +{ +mul.f16x2 r3059, r2947, r2993; +} +{ +fma.rn.f16x2 r3062, r2983, r2992, r3059; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3067, {low, high}; +} +{ +neg.f16x2 r3068, r3067; +} +{ +add.f16x2 r3070, r2817, r2905; +} +{ +add.f16x2 %0, r2729, r3070; +} +{ +add.f16x2 r3076, r2823, r2911; +} +{ +add.f16x2 %1, r2735, r3076; +} +{ +add.f16x2 r3082, r2817, r2905; +} +{ +mul.f16x2 r3085, r3082, r3066; +} +{ +add.f16x2 r3088, r2729, r3085; +} +{ +sub.f16x2 r3091, r2823, r2911; +} +{ +mul.f16x2 r3094, r3091, r3068; +} +{ +add.f16x2 %6, r3088, r3094; +} +{ +add.f16x2 r3100, r2817, r2905; +} +{ +mul.f16x2 r3103, r3100, r3066; +} +{ +add.f16x2 r3106, r2729, r3103; +} +{ +sub.f16x2 r3109, r2823, r2911; +} +{ +mul.f16x2 r3112, r3109, r3068; +} +{ +sub.f16x2 %12, r3106, r3112; +} +{ +add.f16x2 r3118, r2823, r2911; +} +{ +mul.f16x2 r3121, r3118, r3066; +} +{ +add.f16x2 r3124, r2735, r3121; +} +{ +sub.f16x2 r3127, r2817, r2905; +} +{ +mul.f16x2 r3130, r3127, r3068; +} +{ +sub.f16x2 %7, r3124, r3130; +} +{ +add.f16x2 r3136, r2823, r2911; +} +{ +mul.f16x2 r3139, r3136, r3066; +} +{ +add.f16x2 r3142, r2735, r3139; +} +{ +sub.f16x2 r3145, r2817, r2905; +} +{ +mul.f16x2 r3148, r3145, r3068; +} +{ +add.f16x2 %13, r3142, r3148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3155, {low, high}; +} +{ +neg.f16x2 r3156, r3155; +} +{ +add.f16x2 r3158, r3008, r3024; +} +{ +add.f16x2 %2, r2753, r3158; +} +{ +add.f16x2 r3164, r3014, r3030; +} +{ +add.f16x2 %3, r2789, r3164; +} +{ +add.f16x2 r3170, r3008, r3024; +} +{ +mul.f16x2 r3173, r3170, r3154; +} +{ +add.f16x2 r3176, r2753, r3173; +} +{ +sub.f16x2 r3179, 
r3014, r3030; +} +{ +mul.f16x2 r3182, r3179, r3156; +} +{ +add.f16x2 %8, r3176, r3182; +} +{ +add.f16x2 r3188, r3008, r3024; +} +{ +mul.f16x2 r3191, r3188, r3154; +} +{ +add.f16x2 r3194, r2753, r3191; +} +{ +sub.f16x2 r3197, r3014, r3030; +} +{ +mul.f16x2 r3200, r3197, r3156; +} +{ +sub.f16x2 %14, r3194, r3200; +} +{ +add.f16x2 r3206, r3014, r3030; +} +{ +mul.f16x2 r3209, r3206, r3154; +} +{ +add.f16x2 r3212, r2789, r3209; +} +{ +sub.f16x2 r3215, r3008, r3024; +} +{ +mul.f16x2 r3218, r3215, r3156; +} +{ +sub.f16x2 %9, r3212, r3218; +} +{ +add.f16x2 r3224, r3014, r3030; +} +{ +mul.f16x2 r3227, r3224, r3154; +} +{ +add.f16x2 r3230, r2789, r3227; +} +{ +sub.f16x2 r3233, r3008, r3024; +} +{ +mul.f16x2 r3236, r3233, r3156; +} +{ +add.f16x2 %15, r3230, r3236; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3243, {low, high}; +} +{ +neg.f16x2 r3244, r3243; +} +{ +add.f16x2 r3246, r3040, r3056; +} +{ +add.f16x2 %4, r2771, r3246; +} +{ +add.f16x2 r3252, r3046, r3062; +} +{ +add.f16x2 %5, r2807, r3252; +} +{ +add.f16x2 r3258, r3040, r3056; +} +{ +mul.f16x2 r3261, r3258, r3242; +} +{ +add.f16x2 r3264, r2771, r3261; +} +{ +sub.f16x2 r3267, r3046, r3062; +} +{ +mul.f16x2 r3270, r3267, r3244; +} +{ +add.f16x2 %10, r3264, r3270; +} +{ +add.f16x2 r3276, r3040, r3056; +} +{ +mul.f16x2 r3279, r3276, r3242; +} +{ +add.f16x2 r3282, r2771, r3279; +} +{ +sub.f16x2 r3285, r3046, r3062; +} +{ +mul.f16x2 r3288, r3285, r3244; +} +{ +sub.f16x2 %16, r3282, r3288; +} +{ +add.f16x2 r3294, r3046, r3062; +} +{ +mul.f16x2 r3297, r3294, r3242; +} +{ +add.f16x2 r3300, r2807, r3297; +} +{ +sub.f16x2 r3303, r3040, r3056; +} +{ +mul.f16x2 r3306, r3303, r3244; +} +{ +sub.f16x2 %11, r3300, r3306; +} +{ +add.f16x2 r3312, r3046, r3062; +} +{ +mul.f16x2 r3315, r3312, r3242; +} +{ +add.f16x2 r3318, r2807, r3315; +} +{ +sub.f16x2 r3321, r3040, r3056; +} 
+{ +mul.f16x2 r3324, r3321, r3244; +} +{ +add.f16x2 %17, r3318, r3324; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..a92af2ab9d655 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp16_inv.hpp.inc @@ -0,0 +1,31514 @@ +#ifndef CUFFTDX_FFT_6561_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_6561_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1099, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ 
+.reg .f32 f<903>; +.reg .b32 r<9534>; +.reg .b64 rd<6>; +mov.u32 r9460, %54; +mov.u32 r9533, %tid.y; +mad.lo.s32 r9461, r9533, 52488, r9460; +mov.u32 r9462, %tid.x; +mov.f32 f894, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1, {low, high}; +} +mov.f32 f896, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ 
+sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f854, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r259, {low, high}; +} +mov.f32 f856, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r260, {low, high}; +} +mov.f32 f858, 0f3E31D0D4; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r261, {low, high}; +} +mov.f32 f860, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r262, {low, high}; +} +mov.f32 f866, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r265, {low, high}; +} +mov.f32 f868, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ 
+sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; 
+} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 
r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, 
%91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r936, {low, 
high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; 
+} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} 
+{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 
r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, 
r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1794, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; 
+mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} +mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ 
+mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ 
+mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, 
r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, 
r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, 
r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, 
r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; 
+cvt.rn.f16.f32 high, f894; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 
r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r9462, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r9463, rd3; +mul.lo.s32 r9464, r9463, 243; +sub.s32 r9465, r9462, r9464; +cvt.rn.f32.u32 f897, r9465; +mul.f32 f898, f897, 0f3A7B0B40; +cos.approx.f32 f309, f898; +sin.approx.f32 f899, f898; +neg.f32 f310, f899; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, 
high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, 
r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ 
+fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, 
r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, 
r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; 
+mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r9466, r9463, 52488, r9461; +barrier.sync 0; +mad.lo.s32 r9467, r9465, 216, r9466; +st.shared.v2.f32 [r9467], {r2102, r2108}; +st.shared.v2.f32 [r9467+8], {r2881, r2890}; +st.shared.v2.f32 [r9467+16], {r2918, r2927}; +st.shared.v2.f32 [r9467+24], {r2955, r2964}; +st.shared.v2.f32 [r9467+32], {r2992, r3001}; +st.shared.v2.f32 [r9467+40], {r3029, r3038}; +st.shared.v2.f32 [r9467+48], {r3066, r3075}; +st.shared.v2.f32 [r9467+56], {r3103, r3112}; +st.shared.v2.f32 [r9467+64], {r3140, r3149}; +st.shared.v2.f32 [r9467+72], {r3177, r3186}; +st.shared.v2.f32 [r9467+80], {r3214, r3223}; +st.shared.v2.f32 [r9467+88], {r3251, r3260}; +st.shared.v2.f32 [r9467+96], {r3288, r3297}; +st.shared.v2.f32 [r9467+104], {r3325, r3334}; +st.shared.v2.f32 [r9467+112], {r3362, r3371}; +st.shared.v2.f32 [r9467+120], {r3399, r3408}; +st.shared.v2.f32 [r9467+128], {r3436, r3445}; +st.shared.v2.f32 
[r9467+136], {r3473, r3482}; +st.shared.v2.f32 [r9467+144], {r3510, r3519}; +st.shared.v2.f32 [r9467+152], {r3547, r3556}; +st.shared.v2.f32 [r9467+160], {r3584, r3593}; +st.shared.v2.f32 [r9467+168], {r3621, r3630}; +st.shared.v2.f32 [r9467+176], {r3658, r3667}; +st.shared.v2.f32 [r9467+184], {r3695, r3704}; +st.shared.v2.f32 [r9467+192], {r3732, r3741}; +st.shared.v2.f32 [r9467+200], {r3769, r3778}; +st.shared.v2.f32 [r9467+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r9468, r9465, -208, r9467; +ld.shared.u32 r3842, [r9468]; +ld.shared.u32 r3848, [r9468+4]; +ld.shared.u32 r4438, [r9468+1944]; +ld.shared.u32 r4444, [r9468+1948]; +ld.shared.u32 r5034, [r9468+3888]; +ld.shared.u32 r5040, [r9468+3892]; +ld.shared.u32 r3928, [r9468+5832]; +ld.shared.u32 r3934, [r9468+5836]; +ld.shared.u32 r4524, [r9468+7776]; +ld.shared.u32 r4530, [r9468+7780]; +ld.shared.u32 r5120, [r9468+9720]; +ld.shared.u32 r5126, [r9468+9724]; +ld.shared.u32 r4014, [r9468+11664]; +ld.shared.u32 r4020, [r9468+11668]; +ld.shared.u32 r4610, [r9468+13608]; +ld.shared.u32 r4616, [r9468+13612]; +ld.shared.u32 r5206, [r9468+15552]; +ld.shared.u32 r5212, [r9468+15556]; +ld.shared.u32 r3839, [r9468+17496]; +ld.shared.u32 r3845, [r9468+17500]; +ld.shared.u32 r4435, [r9468+19440]; +ld.shared.u32 r4441, [r9468+19444]; +ld.shared.u32 r5031, [r9468+21384]; +ld.shared.u32 r5037, [r9468+21388]; +ld.shared.u32 r3925, [r9468+23328]; +ld.shared.u32 r3931, [r9468+23332]; +ld.shared.u32 r4521, [r9468+25272]; +ld.shared.u32 r4527, [r9468+25276]; +ld.shared.u32 r5117, [r9468+27216]; +ld.shared.u32 r5123, [r9468+27220]; +ld.shared.u32 r4011, [r9468+29160]; +ld.shared.u32 r4017, [r9468+29164]; +ld.shared.u32 r4607, [r9468+31104]; +ld.shared.u32 r4613, [r9468+31108]; +ld.shared.u32 r5203, [r9468+33048]; +ld.shared.u32 r5209, [r9468+33052]; +ld.shared.u32 r3840, [r9468+34992]; +ld.shared.u32 r3846, [r9468+34996]; +ld.shared.u32 r4436, [r9468+36936]; +ld.shared.u32 r4442, [r9468+36940]; +ld.shared.u32 r5032, 
[r9468+38880]; +ld.shared.u32 r5038, [r9468+38884]; +ld.shared.u32 r3926, [r9468+40824]; +ld.shared.u32 r3932, [r9468+40828]; +ld.shared.u32 r4522, [r9468+42768]; +ld.shared.u32 r4528, [r9468+42772]; +ld.shared.u32 r5118, [r9468+44712]; +ld.shared.u32 r5124, [r9468+44716]; +ld.shared.u32 r4012, [r9468+46656]; +ld.shared.u32 r4018, [r9468+46660]; +ld.shared.u32 r4608, [r9468+48600]; +ld.shared.u32 r4614, [r9468+48604]; +ld.shared.u32 r5204, [r9468+50544]; +ld.shared.u32 r5210, [r9468+50548]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3923, {low, 
high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 
r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} 
+{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, 
r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 
r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; 
+mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; 
+} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, 
r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 
r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ 
+sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 
r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, 
r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ 
+add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; 
+cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ 
+fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ 
+fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, 
r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; 
+mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; 
+} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6449, {low, 
high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 
r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r9465, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r9469, rd5; +sub.s32 r9470, r9465, r9469; +shr.u32 r9471, r9470, 1; +add.s32 r9472, r9471, r9469; +shr.u32 r9473, r9472, 4; +cvt.rn.f32.u32 f900, r9473; +mul.f32 f901, f900, 0f3CD3D17E; +cos.approx.f32 f673, f901; +sin.approx.f32 f902, f901; +neg.f32 f674, f902; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +mul.lo.s32 r9474, r9473, 27; +sub.s32 r9475, r9465, r9474; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, 
r6857, r6871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 
high, f726; +mov.b32 r6955, {low, high}; +} +{ +mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ +mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ +mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ 
+mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ +fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 
r7212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r7260; +mov.b32 r7295, {high, low}; +} +{ +fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ +mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ 
+mul.f16x2 r7379, r6101, r7377; +} +{ +fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ +fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; 
+mov.b32 r7469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ +fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, 
r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ +fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +shl.b32 r9476, r9475, 3; +add.s32 r9477, r9466, r9476; +barrier.sync 0; +mad.lo.s32 r9478, r9473, 5832, r9477; +st.shared.u32 [r9478], r5937; +st.shared.u32 [r9478+4], r5943; +st.shared.u32 [r9478+216], r6716; +st.shared.u32 [r9478+220], r6725; +st.shared.u32 [r9478+432], r6753; +st.shared.u32 [r9478+436], r6762; +st.shared.u32 [r9478+648], r6790; +st.shared.u32 [r9478+652], r6799; +st.shared.u32 [r9478+864], r6827; +st.shared.u32 [r9478+868], r6836; +st.shared.u32 [r9478+1080], r6864; +st.shared.u32 [r9478+1084], r6873; +st.shared.u32 [r9478+1296], r6901; +st.shared.u32 [r9478+1300], r6910; +st.shared.u32 [r9478+1512], r6938; +st.shared.u32 [r9478+1516], r6947; +st.shared.u32 [r9478+1728], r6975; +st.shared.u32 [r9478+1732], r6984; +st.shared.u32 [r9478+1944], r7012; +st.shared.u32 [r9478+1948], r7021; +st.shared.u32 [r9478+2160], r7049; +st.shared.u32 [r9478+2164], r7058; +st.shared.u32 [r9478+2376], r7086; +st.shared.u32 [r9478+2380], r7095; +st.shared.u32 [r9478+2592], r7123; +st.shared.u32 [r9478+2596], r7132; +st.shared.u32 [r9478+2808], r7160; +st.shared.u32 [r9478+2812], r7169; +st.shared.u32 [r9478+3024], r7197; +st.shared.u32 [r9478+3028], r7206; +st.shared.u32 [r9478+3240], r7234; +st.shared.u32 [r9478+3244], r7243; +st.shared.u32 [r9478+3456], r7271; +st.shared.u32 [r9478+3460], r7280; +st.shared.u32 [r9478+3672], r7308; +st.shared.u32 [r9478+3676], r7317; +st.shared.u32 [r9478+3888], r7345; +st.shared.u32 [r9478+3892], r7354; +st.shared.u32 [r9478+4104], r7382; +st.shared.u32 [r9478+4108], r7391; +st.shared.u32 [r9478+4320], r7419; +st.shared.u32 [r9478+4324], r7428; +st.shared.u32 [r9478+4536], r7456; +st.shared.u32 [r9478+4540], r7465; +st.shared.u32 
[r9478+4752], r7493; +st.shared.u32 [r9478+4756], r7502; +st.shared.u32 [r9478+4968], r7530; +st.shared.u32 [r9478+4972], r7539; +st.shared.u32 [r9478+5184], r7567; +st.shared.u32 [r9478+5188], r7576; +st.shared.u32 [r9478+5400], r7604; +st.shared.u32 [r9478+5404], r7613; +st.shared.u32 [r9478+5616], r7641; +st.shared.u32 [r9478+5620], r7650; +barrier.sync 0; +ld.shared.u32 r7677, [r9468]; +ld.shared.u32 r7683, [r9468+4]; +ld.shared.u32 r8273, [r9468+1944]; +ld.shared.u32 r8279, [r9468+1948]; +ld.shared.u32 r8869, [r9468+3888]; +ld.shared.u32 r8875, [r9468+3892]; +ld.shared.u32 r7763, [r9468+5832]; +ld.shared.u32 r7769, [r9468+5836]; +ld.shared.u32 r8359, [r9468+7776]; +ld.shared.u32 r8365, [r9468+7780]; +ld.shared.u32 r8955, [r9468+9720]; +ld.shared.u32 r8961, [r9468+9724]; +ld.shared.u32 r7849, [r9468+11664]; +ld.shared.u32 r7855, [r9468+11668]; +ld.shared.u32 r8445, [r9468+13608]; +ld.shared.u32 r8451, [r9468+13612]; +ld.shared.u32 r9041, [r9468+15552]; +ld.shared.u32 r9047, [r9468+15556]; +ld.shared.u32 r7674, [r9468+17496]; +ld.shared.u32 r7680, [r9468+17500]; +ld.shared.u32 r8270, [r9468+19440]; +ld.shared.u32 r8276, [r9468+19444]; +ld.shared.u32 r8866, [r9468+21384]; +ld.shared.u32 r8872, [r9468+21388]; +ld.shared.u32 r7760, [r9468+23328]; +ld.shared.u32 r7766, [r9468+23332]; +ld.shared.u32 r8356, [r9468+25272]; +ld.shared.u32 r8362, [r9468+25276]; +ld.shared.u32 r8952, [r9468+27216]; +ld.shared.u32 r8958, [r9468+27220]; +ld.shared.u32 r7846, [r9468+29160]; +ld.shared.u32 r7852, [r9468+29164]; +ld.shared.u32 r8442, [r9468+31104]; +ld.shared.u32 r8448, [r9468+31108]; +ld.shared.u32 r9038, [r9468+33048]; +ld.shared.u32 r9044, [r9468+33052]; +ld.shared.u32 r7675, [r9468+34992]; +ld.shared.u32 r7681, [r9468+34996]; +ld.shared.u32 r8271, [r9468+36936]; +ld.shared.u32 r8277, [r9468+36940]; +ld.shared.u32 r8867, [r9468+38880]; +ld.shared.u32 r8873, [r9468+38884]; +ld.shared.u32 r7761, [r9468+40824]; +ld.shared.u32 r7767, [r9468+40828]; +ld.shared.u32 r8357, 
[r9468+42768]; +ld.shared.u32 r8363, [r9468+42772]; +ld.shared.u32 r8953, [r9468+44712]; +ld.shared.u32 r8959, [r9468+44716]; +ld.shared.u32 r7847, [r9468+46656]; +ld.shared.u32 r7853, [r9468+46660]; +ld.shared.u32 r8443, [r9468+48600]; +ld.shared.u32 r8449, [r9468+48604]; +ld.shared.u32 r9039, [r9468+50544]; +ld.shared.u32 r9045, [r9468+50548]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 r7676, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 r7682, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 r7700, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 r7736, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ +mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7758, {low, high}; +} +{ +add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 r7762, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 r7768, 
r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 r7804, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 r7822, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 r7840, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 r7848, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 r7854, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 r7872, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; +} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 r7890, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 r7908, r7899, r7905; +} +{ 
+add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} +{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, r7844; +} +{ +add.f16x2 r7926, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r7930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r7931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r7932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r7935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r7936, {low, high}; +} +{ +mul.f16x2 r7945, r7786, r7929; +} +{ +mul.f16x2 r7948, r7822, r7930; +} +{ +sub.f16x2 r7951, r7945, r7948; +} +{ +mul.f16x2 r7954, r7786, r7930; +} +{ +fma.rn.f16x2 r7957, r7822, r7929, r7954; +} +{ +mul.f16x2 r7961, r7872, r7931; +} +{ +mul.f16x2 r7964, r7908, r7932; +} +{ +sub.f16x2 r7967, r7961, r7964; +} +{ +mul.f16x2 r7970, r7872, r7932; +} +{ +fma.rn.f16x2 r7973, r7908, r7931, r7970; +} +{ +mul.f16x2 r7977, r7804, r7931; +} +{ +mul.f16x2 r7980, r7840, r7932; +} +{ +sub.f16x2 r7983, r7977, r7980; +} +{ +mul.f16x2 r7986, r7804, r7932; +} +{ +fma.rn.f16x2 r7989, r7840, r7931, r7986; +} +{ +mul.f16x2 r7993, r7890, r7935; +} +{ +mul.f16x2 r7996, r7926, r7936; +} +{ +sub.f16x2 r7999, r7993, r7996; +} +{ +mul.f16x2 r8002, r7890, r7936; +} +{ +fma.rn.f16x2 r8005, r7926, r7935, r8002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8010, {low, high}; +} +{ +add.f16x2 r8011, r7762, r7848; +} +{ +add.f16x2 %0, r7676, r8011; +} +{ 
+add.f16x2 r8017, r7768, r7854; +} +{ +add.f16x2 %1, r7682, r8017; +} +{ +add.f16x2 r8023, r7762, r7848; +} +{ +mul.f16x2 r8026, r8023, r8009; +} +{ +add.f16x2 r8029, r7676, r8026; +} +{ +sub.f16x2 r8032, r7768, r7854; +} +{ +mul.f16x2 r8035, r8032, r8010; +} +{ +add.f16x2 %18, r8029, r8035; +} +{ +add.f16x2 r8041, r7762, r7848; +} +{ +mul.f16x2 r8044, r8041, r8009; +} +{ +add.f16x2 r8047, r7676, r8044; +} +{ +sub.f16x2 r8050, r7768, r7854; +} +{ +mul.f16x2 r8053, r8050, r8010; +} +{ +sub.f16x2 %36, r8047, r8053; +} +{ +add.f16x2 r8059, r7768, r7854; +} +{ +mul.f16x2 r8062, r8059, r8009; +} +{ +add.f16x2 r8065, r7682, r8062; +} +{ +sub.f16x2 r8068, r7762, r7848; +} +{ +mul.f16x2 r8071, r8068, r8010; +} +{ +sub.f16x2 %19, r8065, r8071; +} +{ +add.f16x2 r8077, r7768, r7854; +} +{ +mul.f16x2 r8080, r8077, r8009; +} +{ +add.f16x2 r8083, r7682, r8080; +} +{ +sub.f16x2 r8086, r7762, r7848; +} +{ +mul.f16x2 r8089, r8086, r8010; +} +{ +add.f16x2 %37, r8083, r8089; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8096, {low, high}; +} +{ +add.f16x2 r8097, r7951, r7967; +} +{ +add.f16x2 %6, r7700, r8097; +} +{ +add.f16x2 r8103, r7957, r7973; +} +{ +add.f16x2 %7, r7736, r8103; +} +{ +add.f16x2 r8109, r7951, r7967; +} +{ +mul.f16x2 r8112, r8109, r8095; +} +{ +add.f16x2 r8115, r7700, r8112; +} +{ +sub.f16x2 r8118, r7957, r7973; +} +{ +mul.f16x2 r8121, r8118, r8096; +} +{ +add.f16x2 %24, r8115, r8121; +} +{ +add.f16x2 r8127, r7951, r7967; +} +{ +mul.f16x2 r8130, r8127, r8095; +} +{ +add.f16x2 r8133, r7700, r8130; +} +{ +sub.f16x2 r8136, r7957, r7973; +} +{ +mul.f16x2 r8139, r8136, r8096; +} +{ +sub.f16x2 %42, r8133, r8139; +} +{ +add.f16x2 r8145, r7957, r7973; +} +{ +mul.f16x2 r8148, r8145, r8095; +} +{ +add.f16x2 r8151, r7736, r8148; +} +{ +sub.f16x2 r8154, r7951, r7967; +} +{ +mul.f16x2 r8157, r8154, r8096; +} +{ +sub.f16x2 
%25, r8151, r8157; +} +{ +add.f16x2 r8163, r7957, r7973; +} +{ +mul.f16x2 r8166, r8163, r8095; +} +{ +add.f16x2 r8169, r7736, r8166; +} +{ +sub.f16x2 r8172, r7951, r7967; +} +{ +mul.f16x2 r8175, r8172, r8096; +} +{ +add.f16x2 %43, r8169, r8175; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8181, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8182, {low, high}; +} +{ +add.f16x2 r8183, r7983, r7999; +} +{ +add.f16x2 %12, r7718, r8183; +} +{ +add.f16x2 r8189, r7989, r8005; +} +{ +add.f16x2 %13, r7754, r8189; +} +{ +add.f16x2 r8195, r7983, r7999; +} +{ +mul.f16x2 r8198, r8195, r8181; +} +{ +add.f16x2 r8201, r7718, r8198; +} +{ +sub.f16x2 r8204, r7989, r8005; +} +{ +mul.f16x2 r8207, r8204, r8182; +} +{ +add.f16x2 %30, r8201, r8207; +} +{ +add.f16x2 r8213, r7983, r7999; +} +{ +mul.f16x2 r8216, r8213, r8181; +} +{ +add.f16x2 r8219, r7718, r8216; +} +{ +sub.f16x2 r8222, r7989, r8005; +} +{ +mul.f16x2 r8225, r8222, r8182; +} +{ +sub.f16x2 %48, r8219, r8225; +} +{ +add.f16x2 r8231, r7989, r8005; +} +{ +mul.f16x2 r8234, r8231, r8181; +} +{ +add.f16x2 r8237, r7754, r8234; +} +{ +sub.f16x2 r8240, r7983, r7999; +} +{ +mul.f16x2 r8243, r8240, r8182; +} +{ +sub.f16x2 %31, r8237, r8243; +} +{ +add.f16x2 r8249, r7989, r8005; +} +{ +mul.f16x2 r8252, r8249, r8181; +} +{ +add.f16x2 r8255, r7754, r8252; +} +{ +sub.f16x2 r8258, r7983, r7999; +} +{ +mul.f16x2 r8261, r8258, r8182; +} +{ +add.f16x2 %49, r8255, r8261; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8268, {low, high}; +} +{ +add.f16x2 r8269, r8270, r8271; +} +{ +add.f16x2 r8272, r8273, r8269; +} +{ +add.f16x2 r8275, r8276, r8277; +} +{ +add.f16x2 r8278, r8279, r8275; +} +{ +add.f16x2 r8281, r8270, r8271; +} +{ +mul.f16x2 r8284, r8281, r8267; +} +{ +add.f16x2 r8287, 
r8273, r8284; +} +{ +sub.f16x2 r8290, r8276, r8277; +} +{ +mul.f16x2 r8293, r8290, r8268; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +add.f16x2 r8299, r8270, r8271; +} +{ +mul.f16x2 r8302, r8299, r8267; +} +{ +add.f16x2 r8305, r8273, r8302; +} +{ +sub.f16x2 r8308, r8276, r8277; +} +{ +mul.f16x2 r8311, r8308, r8268; +} +{ +sub.f16x2 r8314, r8305, r8311; +} +{ +add.f16x2 r8317, r8276, r8277; +} +{ +mul.f16x2 r8320, r8317, r8267; +} +{ +add.f16x2 r8323, r8279, r8320; +} +{ +sub.f16x2 r8326, r8270, r8271; +} +{ +mul.f16x2 r8329, r8326, r8268; +} +{ +sub.f16x2 r8332, r8323, r8329; +} +{ +add.f16x2 r8335, r8276, r8277; +} +{ +mul.f16x2 r8338, r8335, r8267; +} +{ +add.f16x2 r8341, r8279, r8338; +} +{ +sub.f16x2 r8344, r8270, r8271; +} +{ +mul.f16x2 r8347, r8344, r8268; +} +{ +add.f16x2 r8350, r8341, r8347; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8354, {low, high}; +} +{ +add.f16x2 r8355, r8356, r8357; +} +{ +add.f16x2 r8358, r8359, r8355; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 r8364, r8365, r8361; +} +{ +add.f16x2 r8367, r8356, r8357; +} +{ +mul.f16x2 r8370, r8367, r8353; +} +{ +add.f16x2 r8373, r8359, r8370; +} +{ +sub.f16x2 r8376, r8362, r8363; +} +{ +mul.f16x2 r8379, r8376, r8354; +} +{ +add.f16x2 r8382, r8373, r8379; +} +{ +add.f16x2 r8385, r8356, r8357; +} +{ +mul.f16x2 r8388, r8385, r8353; +} +{ +add.f16x2 r8391, r8359, r8388; +} +{ +sub.f16x2 r8394, r8362, r8363; +} +{ +mul.f16x2 r8397, r8394, r8354; +} +{ +sub.f16x2 r8400, r8391, r8397; +} +{ +add.f16x2 r8403, r8362, r8363; +} +{ +mul.f16x2 r8406, r8403, r8353; +} +{ +add.f16x2 r8409, r8365, r8406; +} +{ +sub.f16x2 r8412, r8356, r8357; +} +{ +mul.f16x2 r8415, r8412, r8354; +} +{ +sub.f16x2 r8418, r8409, r8415; +} +{ +add.f16x2 r8421, r8362, r8363; +} +{ +mul.f16x2 r8424, r8421, r8353; +} +{ +add.f16x2 r8427, r8365, r8424; +} +{ 
+sub.f16x2 r8430, r8356, r8357; +} +{ +mul.f16x2 r8433, r8430, r8354; +} +{ +add.f16x2 r8436, r8427, r8433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8440, {low, high}; +} +{ +add.f16x2 r8441, r8442, r8443; +} +{ +add.f16x2 r8444, r8445, r8441; +} +{ +add.f16x2 r8447, r8448, r8449; +} +{ +add.f16x2 r8450, r8451, r8447; +} +{ +add.f16x2 r8453, r8442, r8443; +} +{ +mul.f16x2 r8456, r8453, r8439; +} +{ +add.f16x2 r8459, r8445, r8456; +} +{ +sub.f16x2 r8462, r8448, r8449; +} +{ +mul.f16x2 r8465, r8462, r8440; +} +{ +add.f16x2 r8468, r8459, r8465; +} +{ +add.f16x2 r8471, r8442, r8443; +} +{ +mul.f16x2 r8474, r8471, r8439; +} +{ +add.f16x2 r8477, r8445, r8474; +} +{ +sub.f16x2 r8480, r8448, r8449; +} +{ +mul.f16x2 r8483, r8480, r8440; +} +{ +sub.f16x2 r8486, r8477, r8483; +} +{ +add.f16x2 r8489, r8448, r8449; +} +{ +mul.f16x2 r8492, r8489, r8439; +} +{ +add.f16x2 r8495, r8451, r8492; +} +{ +sub.f16x2 r8498, r8442, r8443; +} +{ +mul.f16x2 r8501, r8498, r8440; +} +{ +sub.f16x2 r8504, r8495, r8501; +} +{ +add.f16x2 r8507, r8448, r8449; +} +{ +mul.f16x2 r8510, r8507, r8439; +} +{ +add.f16x2 r8513, r8451, r8510; +} +{ +sub.f16x2 r8516, r8442, r8443; +} +{ +mul.f16x2 r8519, r8516, r8440; +} +{ +add.f16x2 r8522, r8513, r8519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8526, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8528, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8532, {low, high}; +} +{ +mul.f16x2 r8541, r8382, r8525; +} +{ +mul.f16x2 r8544, r8418, r8526; +} +{ +sub.f16x2 r8547, r8541, r8544; +} +{ +mul.f16x2 r8550, r8382, r8526; +} +{ +fma.rn.f16x2 r8553, r8418, r8525, r8550; +} +{ +mul.f16x2 r8557, r8468, r8527; +} +{ +mul.f16x2 r8560, r8504, r8528; +} +{ +sub.f16x2 r8563, r8557, r8560; +} +{ +mul.f16x2 r8566, r8468, r8528; +} +{ +fma.rn.f16x2 r8569, r8504, r8527, r8566; +} +{ +mul.f16x2 r8573, r8400, r8527; +} +{ +mul.f16x2 r8576, r8436, r8528; +} +{ +sub.f16x2 r8579, r8573, r8576; +} +{ +mul.f16x2 r8582, r8400, r8528; +} +{ +fma.rn.f16x2 r8585, r8436, r8527, r8582; +} +{ +mul.f16x2 r8589, r8486, r8531; +} +{ +mul.f16x2 r8592, r8522, r8532; +} +{ +sub.f16x2 r8595, r8589, r8592; +} +{ +mul.f16x2 r8598, r8486, r8532; +} +{ +fma.rn.f16x2 r8601, r8522, r8531, r8598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8606, {low, high}; +} +{ +add.f16x2 r8607, r8358, r8444; +} +{ +add.f16x2 %2, r8272, r8607; +} +{ +add.f16x2 r8613, r8364, r8450; +} +{ +add.f16x2 %3, r8278, r8613; +} +{ +add.f16x2 r8619, r8358, r8444; +} +{ +mul.f16x2 r8622, r8619, r8605; +} +{ +add.f16x2 r8625, r8272, r8622; +} +{ +sub.f16x2 r8628, r8364, r8450; +} +{ +mul.f16x2 r8631, r8628, r8606; +} +{ +add.f16x2 %20, r8625, r8631; +} +{ +add.f16x2 r8637, r8358, r8444; +} +{ +mul.f16x2 r8640, r8637, r8605; +} +{ +add.f16x2 r8643, r8272, r8640; +} +{ +sub.f16x2 r8646, r8364, r8450; +} +{ +mul.f16x2 r8649, r8646, r8606; +} +{ +sub.f16x2 %38, r8643, r8649; +} +{ +add.f16x2 r8655, r8364, r8450; +} +{ +mul.f16x2 r8658, r8655, r8605; +} +{ +add.f16x2 r8661, r8278, r8658; +} +{ +sub.f16x2 r8664, r8358, r8444; +} +{ +mul.f16x2 r8667, r8664, r8606; +} +{ +sub.f16x2 %21, r8661, r8667; +} +{ +add.f16x2 r8673, r8364, r8450; +} +{ +mul.f16x2 r8676, r8673, r8605; +} +{ 
+add.f16x2 r8679, r8278, r8676; +} +{ +sub.f16x2 r8682, r8358, r8444; +} +{ +mul.f16x2 r8685, r8682, r8606; +} +{ +add.f16x2 %39, r8679, r8685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8692, {low, high}; +} +{ +add.f16x2 r8693, r8547, r8563; +} +{ +add.f16x2 %8, r8296, r8693; +} +{ +add.f16x2 r8699, r8553, r8569; +} +{ +add.f16x2 %9, r8332, r8699; +} +{ +add.f16x2 r8705, r8547, r8563; +} +{ +mul.f16x2 r8708, r8705, r8691; +} +{ +add.f16x2 r8711, r8296, r8708; +} +{ +sub.f16x2 r8714, r8553, r8569; +} +{ +mul.f16x2 r8717, r8714, r8692; +} +{ +add.f16x2 %26, r8711, r8717; +} +{ +add.f16x2 r8723, r8547, r8563; +} +{ +mul.f16x2 r8726, r8723, r8691; +} +{ +add.f16x2 r8729, r8296, r8726; +} +{ +sub.f16x2 r8732, r8553, r8569; +} +{ +mul.f16x2 r8735, r8732, r8692; +} +{ +sub.f16x2 %44, r8729, r8735; +} +{ +add.f16x2 r8741, r8553, r8569; +} +{ +mul.f16x2 r8744, r8741, r8691; +} +{ +add.f16x2 r8747, r8332, r8744; +} +{ +sub.f16x2 r8750, r8547, r8563; +} +{ +mul.f16x2 r8753, r8750, r8692; +} +{ +sub.f16x2 %27, r8747, r8753; +} +{ +add.f16x2 r8759, r8553, r8569; +} +{ +mul.f16x2 r8762, r8759, r8691; +} +{ +add.f16x2 r8765, r8332, r8762; +} +{ +sub.f16x2 r8768, r8547, r8563; +} +{ +mul.f16x2 r8771, r8768, r8692; +} +{ +add.f16x2 %45, r8765, r8771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8777, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8778, {low, high}; +} +{ +add.f16x2 r8779, r8579, r8595; +} +{ +add.f16x2 %14, r8314, r8779; +} +{ +add.f16x2 r8785, r8585, r8601; +} +{ +add.f16x2 %15, r8350, r8785; +} +{ +add.f16x2 r8791, r8579, r8595; +} +{ +mul.f16x2 r8794, r8791, r8777; +} +{ +add.f16x2 r8797, r8314, r8794; +} +{ +sub.f16x2 r8800, r8585, r8601; +} +{ +mul.f16x2 r8803, r8800, r8778; +} +{ +add.f16x2 
%32, r8797, r8803; +} +{ +add.f16x2 r8809, r8579, r8595; +} +{ +mul.f16x2 r8812, r8809, r8777; +} +{ +add.f16x2 r8815, r8314, r8812; +} +{ +sub.f16x2 r8818, r8585, r8601; +} +{ +mul.f16x2 r8821, r8818, r8778; +} +{ +sub.f16x2 %50, r8815, r8821; +} +{ +add.f16x2 r8827, r8585, r8601; +} +{ +mul.f16x2 r8830, r8827, r8777; +} +{ +add.f16x2 r8833, r8350, r8830; +} +{ +sub.f16x2 r8836, r8579, r8595; +} +{ +mul.f16x2 r8839, r8836, r8778; +} +{ +sub.f16x2 %33, r8833, r8839; +} +{ +add.f16x2 r8845, r8585, r8601; +} +{ +mul.f16x2 r8848, r8845, r8777; +} +{ +add.f16x2 r8851, r8350, r8848; +} +{ +sub.f16x2 r8854, r8579, r8595; +} +{ +mul.f16x2 r8857, r8854, r8778; +} +{ +add.f16x2 %51, r8851, r8857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8864, {low, high}; +} +{ +add.f16x2 r8865, r8866, r8867; +} +{ +add.f16x2 r8868, r8869, r8865; +} +{ +add.f16x2 r8871, r8872, r8873; +} +{ +add.f16x2 r8874, r8875, r8871; +} +{ +add.f16x2 r8877, r8866, r8867; +} +{ +mul.f16x2 r8880, r8877, r8863; +} +{ +add.f16x2 r8883, r8869, r8880; +} +{ +sub.f16x2 r8886, r8872, r8873; +} +{ +mul.f16x2 r8889, r8886, r8864; +} +{ +add.f16x2 r8892, r8883, r8889; +} +{ +add.f16x2 r8895, r8866, r8867; +} +{ +mul.f16x2 r8898, r8895, r8863; +} +{ +add.f16x2 r8901, r8869, r8898; +} +{ +sub.f16x2 r8904, r8872, r8873; +} +{ +mul.f16x2 r8907, r8904, r8864; +} +{ +sub.f16x2 r8910, r8901, r8907; +} +{ +add.f16x2 r8913, r8872, r8873; +} +{ +mul.f16x2 r8916, r8913, r8863; +} +{ +add.f16x2 r8919, r8875, r8916; +} +{ +sub.f16x2 r8922, r8866, r8867; +} +{ +mul.f16x2 r8925, r8922, r8864; +} +{ +sub.f16x2 r8928, r8919, r8925; +} +{ +add.f16x2 r8931, r8872, r8873; +} +{ +mul.f16x2 r8934, r8931, r8863; +} +{ +add.f16x2 r8937, r8875, r8934; +} +{ +sub.f16x2 r8940, r8866, r8867; +} +{ +mul.f16x2 r8943, r8940, r8864; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8949, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8950, {low, high}; +} +{ +add.f16x2 r8951, r8952, r8953; +} +{ +add.f16x2 r8954, r8955, r8951; +} +{ +add.f16x2 r8957, r8958, r8959; +} +{ +add.f16x2 r8960, r8961, r8957; +} +{ +add.f16x2 r8963, r8952, r8953; +} +{ +mul.f16x2 r8966, r8963, r8949; +} +{ +add.f16x2 r8969, r8955, r8966; +} +{ +sub.f16x2 r8972, r8958, r8959; +} +{ +mul.f16x2 r8975, r8972, r8950; +} +{ +add.f16x2 r8978, r8969, r8975; +} +{ +add.f16x2 r8981, r8952, r8953; +} +{ +mul.f16x2 r8984, r8981, r8949; +} +{ +add.f16x2 r8987, r8955, r8984; +} +{ +sub.f16x2 r8990, r8958, r8959; +} +{ +mul.f16x2 r8993, r8990, r8950; +} +{ +sub.f16x2 r8996, r8987, r8993; +} +{ +add.f16x2 r8999, r8958, r8959; +} +{ +mul.f16x2 r9002, r8999, r8949; +} +{ +add.f16x2 r9005, r8961, r9002; +} +{ +sub.f16x2 r9008, r8952, r8953; +} +{ +mul.f16x2 r9011, r9008, r8950; +} +{ +sub.f16x2 r9014, r9005, r9011; +} +{ +add.f16x2 r9017, r8958, r8959; +} +{ +mul.f16x2 r9020, r9017, r8949; +} +{ +add.f16x2 r9023, r8961, r9020; +} +{ +sub.f16x2 r9026, r8952, r8953; +} +{ +mul.f16x2 r9029, r9026, r8950; +} +{ +add.f16x2 r9032, r9023, r9029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9035, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9036, {low, high}; +} +{ +add.f16x2 r9037, r9038, r9039; +} +{ +add.f16x2 r9040, r9041, r9037; +} +{ +add.f16x2 r9043, r9044, r9045; +} +{ +add.f16x2 r9046, r9047, r9043; +} +{ +add.f16x2 r9049, r9038, r9039; +} +{ +mul.f16x2 r9052, r9049, r9035; +} +{ +add.f16x2 r9055, r9041, r9052; +} +{ +sub.f16x2 r9058, r9044, r9045; +} +{ +mul.f16x2 r9061, r9058, r9036; +} +{ +add.f16x2 r9064, r9055, r9061; +} +{ +add.f16x2 r9067, r9038, r9039; +} +{ +mul.f16x2 r9070, r9067, r9035; +} +{ +add.f16x2 r9073, r9041, r9070; +} +{ 
+sub.f16x2 r9076, r9044, r9045; +} +{ +mul.f16x2 r9079, r9076, r9036; +} +{ +sub.f16x2 r9082, r9073, r9079; +} +{ +add.f16x2 r9085, r9044, r9045; +} +{ +mul.f16x2 r9088, r9085, r9035; +} +{ +add.f16x2 r9091, r9047, r9088; +} +{ +sub.f16x2 r9094, r9038, r9039; +} +{ +mul.f16x2 r9097, r9094, r9036; +} +{ +sub.f16x2 r9100, r9091, r9097; +} +{ +add.f16x2 r9103, r9044, r9045; +} +{ +mul.f16x2 r9106, r9103, r9035; +} +{ +add.f16x2 r9109, r9047, r9106; +} +{ +sub.f16x2 r9112, r9038, r9039; +} +{ +mul.f16x2 r9115, r9112, r9036; +} +{ +add.f16x2 r9118, r9109, r9115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r9121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r9122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r9123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r9124, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r9127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r9128, {low, high}; +} +{ +mul.f16x2 r9137, r8978, r9121; +} +{ +mul.f16x2 r9140, r9014, r9122; +} +{ +sub.f16x2 r9143, r9137, r9140; +} +{ +mul.f16x2 r9146, r8978, r9122; +} +{ +fma.rn.f16x2 r9149, r9014, r9121, r9146; +} +{ +mul.f16x2 r9153, r9064, r9123; +} +{ +mul.f16x2 r9156, r9100, r9124; +} +{ +sub.f16x2 r9159, r9153, r9156; +} +{ +mul.f16x2 r9162, r9064, r9124; +} +{ +fma.rn.f16x2 r9165, r9100, r9123, r9162; +} +{ +mul.f16x2 r9169, r8996, r9123; +} +{ +mul.f16x2 r9172, r9032, r9124; +} +{ +sub.f16x2 r9175, r9169, r9172; +} +{ +mul.f16x2 r9178, r8996, r9124; +} +{ +fma.rn.f16x2 r9181, r9032, r9123, r9178; +} +{ +mul.f16x2 r9185, r9082, r9127; +} +{ +mul.f16x2 r9188, r9118, r9128; +} +{ +sub.f16x2 r9191, r9185, r9188; +} +{ +mul.f16x2 r9194, r9082, r9128; +} +{ 
+fma.rn.f16x2 r9197, r9118, r9127, r9194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9201, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9202, {low, high}; +} +{ +add.f16x2 r9203, r8954, r9040; +} +{ +add.f16x2 %4, r8868, r9203; +} +{ +add.f16x2 r9209, r8960, r9046; +} +{ +add.f16x2 %5, r8874, r9209; +} +{ +add.f16x2 r9215, r8954, r9040; +} +{ +mul.f16x2 r9218, r9215, r9201; +} +{ +add.f16x2 r9221, r8868, r9218; +} +{ +sub.f16x2 r9224, r8960, r9046; +} +{ +mul.f16x2 r9227, r9224, r9202; +} +{ +add.f16x2 %22, r9221, r9227; +} +{ +add.f16x2 r9233, r8954, r9040; +} +{ +mul.f16x2 r9236, r9233, r9201; +} +{ +add.f16x2 r9239, r8868, r9236; +} +{ +sub.f16x2 r9242, r8960, r9046; +} +{ +mul.f16x2 r9245, r9242, r9202; +} +{ +sub.f16x2 %40, r9239, r9245; +} +{ +add.f16x2 r9251, r8960, r9046; +} +{ +mul.f16x2 r9254, r9251, r9201; +} +{ +add.f16x2 r9257, r8874, r9254; +} +{ +sub.f16x2 r9260, r8954, r9040; +} +{ +mul.f16x2 r9263, r9260, r9202; +} +{ +sub.f16x2 %23, r9257, r9263; +} +{ +add.f16x2 r9269, r8960, r9046; +} +{ +mul.f16x2 r9272, r9269, r9201; +} +{ +add.f16x2 r9275, r8874, r9272; +} +{ +sub.f16x2 r9278, r8954, r9040; +} +{ +mul.f16x2 r9281, r9278, r9202; +} +{ +add.f16x2 %41, r9275, r9281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9288, {low, high}; +} +{ +add.f16x2 r9289, r9143, r9159; +} +{ +add.f16x2 %10, r8892, r9289; +} +{ +add.f16x2 r9295, r9149, r9165; +} +{ +add.f16x2 %11, r8928, r9295; +} +{ +add.f16x2 r9301, r9143, r9159; +} +{ +mul.f16x2 r9304, r9301, r9287; +} +{ +add.f16x2 r9307, r8892, r9304; +} +{ +sub.f16x2 r9310, r9149, r9165; +} +{ +mul.f16x2 r9313, r9310, r9288; +} +{ +add.f16x2 %28, r9307, r9313; +} +{ +add.f16x2 r9319, r9143, r9159; +} +{ +mul.f16x2 r9322, r9319, r9287; +} +{ 
+add.f16x2 r9325, r8892, r9322; +} +{ +sub.f16x2 r9328, r9149, r9165; +} +{ +mul.f16x2 r9331, r9328, r9288; +} +{ +sub.f16x2 %46, r9325, r9331; +} +{ +add.f16x2 r9337, r9149, r9165; +} +{ +mul.f16x2 r9340, r9337, r9287; +} +{ +add.f16x2 r9343, r8928, r9340; +} +{ +sub.f16x2 r9346, r9143, r9159; +} +{ +mul.f16x2 r9349, r9346, r9288; +} +{ +sub.f16x2 %29, r9343, r9349; +} +{ +add.f16x2 r9355, r9149, r9165; +} +{ +mul.f16x2 r9358, r9355, r9287; +} +{ +add.f16x2 r9361, r8928, r9358; +} +{ +sub.f16x2 r9364, r9143, r9159; +} +{ +mul.f16x2 r9367, r9364, r9288; +} +{ +add.f16x2 %47, r9361, r9367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9374, {low, high}; +} +{ +add.f16x2 r9375, r9175, r9191; +} +{ +add.f16x2 %16, r8910, r9375; +} +{ +add.f16x2 r9381, r9181, r9197; +} +{ +add.f16x2 %17, r8946, r9381; +} +{ +add.f16x2 r9387, r9175, r9191; +} +{ +mul.f16x2 r9390, r9387, r9373; +} +{ +add.f16x2 r9393, r8910, r9390; +} +{ +sub.f16x2 r9396, r9181, r9197; +} +{ +mul.f16x2 r9399, r9396, r9374; +} +{ +add.f16x2 %34, r9393, r9399; +} +{ +add.f16x2 r9405, r9175, r9191; +} +{ +mul.f16x2 r9408, r9405, r9373; +} +{ +add.f16x2 r9411, r8910, r9408; +} +{ +sub.f16x2 r9414, r9181, r9197; +} +{ +mul.f16x2 r9417, r9414, r9374; +} +{ +sub.f16x2 %52, r9411, r9417; +} +{ +add.f16x2 r9423, r9181, r9197; +} +{ +mul.f16x2 r9426, r9423, r9373; +} +{ +add.f16x2 r9429, r8946, r9426; +} +{ +sub.f16x2 r9432, r9175, r9191; +} +{ +mul.f16x2 r9435, r9432, r9374; +} +{ +sub.f16x2 %35, r9429, r9435; +} +{ +add.f16x2 r9441, r9181, r9197; +} +{ +mul.f16x2 r9444, r9441, r9373; +} +{ +add.f16x2 r9447, r8946, r9444; +} +{ +sub.f16x2 r9450, r9175, r9191; +} +{ +mul.f16x2 r9453, r9450, r9374; +} +{ +add.f16x2 %53, r9447, r9453; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), 
"=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), 
"r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1098, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<903>; +.reg .b32 r<9534>; +.reg .b64 rd<6>; +mov.u32 r9460, %54; +mov.u32 r9533, %tid.y; +mad.lo.s32 r9461, r9533, 26244, r9460; +mov.u32 r9462, %tid.x; +mov.f32 f894, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1, {low, high}; +} +mov.f32 f896, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2, 
{low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, 
%107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f854, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r259, {low, high}; +} +mov.f32 f856, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r260, {low, high}; +} +mov.f32 f858, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r261, {low, high}; +} +mov.f32 f860, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r262, {low, high}; +} +mov.f32 f866, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r265, {low, high}; +} +mov.f32 f868, 0f3EAF1D44; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 
r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ 
+add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 
r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; 
+} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %76; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %76; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %76; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %76; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, 
%63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %76; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %74; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %74; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %74; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %74; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %74; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %75, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %75, %67; +} +{ +mul.f16x2 r1382, 
r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %75, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %75, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %75, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; 
+} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ 
+mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, 
r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r1794, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} 
+mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} +mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, 
r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, 
r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, 
f896; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} 
+{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ 
+add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ 
+add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ 
+sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} 
+{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r9462, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r9463, rd3; +mul.lo.s32 r9464, r9463, 243; +sub.s32 r9465, r9462, r9464; +mad.lo.s32 r9466, r9463, 26244, r9461; +cvt.rn.f32.u32 f897, r9465; +mul.f32 f898, f897, 0f3A7B0B40; +cos.approx.f32 f309, f898; +sin.approx.f32 f899, f898; +neg.f32 f310, f899; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f725, 0fBF800000; +mov.f32 f726, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} 
+{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ 
+fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3527, 
{low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} 
+{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r9467, r9465, 108, r9466; +st.shared.u32 [r9467], r2102; +st.shared.u32 [r9467+4], r2881; +st.shared.u32 [r9467+8], r2918; +st.shared.u32 [r9467+12], r2955; +st.shared.u32 [r9467+16], r2992; +st.shared.u32 [r9467+20], r3029; +st.shared.u32 [r9467+24], r3066; +st.shared.u32 [r9467+28], r3103; +st.shared.u32 [r9467+32], r3140; +st.shared.u32 [r9467+36], r3177; +st.shared.u32 [r9467+40], r3214; +st.shared.u32 [r9467+44], r3251; +st.shared.u32 [r9467+48], r3288; +st.shared.u32 [r9467+52], r3325; +st.shared.u32 [r9467+56], r3362; +st.shared.u32 [r9467+60], r3399; +st.shared.u32 [r9467+64], r3436; +st.shared.u32 [r9467+68], r3473; +st.shared.u32 [r9467+72], r3510; +st.shared.u32 [r9467+76], r3547; +st.shared.u32 [r9467+80], r3584; +st.shared.u32 [r9467+84], r3621; +st.shared.u32 [r9467+88], r3658; +st.shared.u32 [r9467+92], r3695; +st.shared.u32 [r9467+96], r3732; +st.shared.u32 [r9467+100], r3769; +st.shared.u32 [r9467+104], r3806; +barrier.sync 0; +mad.lo.s32 r9468, r9465, -104, r9467; +ld.shared.u32 r3842, [r9468]; +ld.shared.u32 r4438, [r9468+972]; +ld.shared.u32 r5034, [r9468+1944]; +ld.shared.u32 r3928, [r9468+2916]; +ld.shared.u32 r4524, [r9468+3888]; +ld.shared.u32 r5120, [r9468+4860]; +ld.shared.u32 r4014, [r9468+5832]; +ld.shared.u32 r4610, [r9468+6804]; 
+ld.shared.u32 r5206, [r9468+7776]; +ld.shared.u32 r3839, [r9468+8748]; +ld.shared.u32 r4435, [r9468+9720]; +ld.shared.u32 r5031, [r9468+10692]; +ld.shared.u32 r3925, [r9468+11664]; +ld.shared.u32 r4521, [r9468+12636]; +ld.shared.u32 r5117, [r9468+13608]; +ld.shared.u32 r4011, [r9468+14580]; +ld.shared.u32 r4607, [r9468+15552]; +ld.shared.u32 r5203, [r9468+16524]; +ld.shared.u32 r3840, [r9468+17496]; +ld.shared.u32 r4436, [r9468+18468]; +ld.shared.u32 r5032, [r9468+19440]; +ld.shared.u32 r3926, [r9468+20412]; +ld.shared.u32 r4522, [r9468+21384]; +ld.shared.u32 r5118, [r9468+22356]; +ld.shared.u32 r4012, [r9468+23328]; +ld.shared.u32 r4608, [r9468+24300]; +ld.shared.u32 r5204, [r9468+25272]; +barrier.sync 0; +st.shared.u32 [r9467], r2108; +st.shared.u32 [r9467+4], r2890; +st.shared.u32 [r9467+8], r2927; +st.shared.u32 [r9467+12], r2964; +st.shared.u32 [r9467+16], r3001; +st.shared.u32 [r9467+20], r3038; +st.shared.u32 [r9467+24], r3075; +st.shared.u32 [r9467+28], r3112; +st.shared.u32 [r9467+32], r3149; +st.shared.u32 [r9467+36], r3186; +st.shared.u32 [r9467+40], r3223; +st.shared.u32 [r9467+44], r3260; +st.shared.u32 [r9467+48], r3297; +st.shared.u32 [r9467+52], r3334; +st.shared.u32 [r9467+56], r3371; +st.shared.u32 [r9467+60], r3408; +st.shared.u32 [r9467+64], r3445; +st.shared.u32 [r9467+68], r3482; +st.shared.u32 [r9467+72], r3519; +st.shared.u32 [r9467+76], r3556; +st.shared.u32 [r9467+80], r3593; +st.shared.u32 [r9467+84], r3630; +st.shared.u32 [r9467+88], r3667; +st.shared.u32 [r9467+92], r3704; +st.shared.u32 [r9467+96], r3741; +st.shared.u32 [r9467+100], r3778; +st.shared.u32 [r9467+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r9468]; +ld.shared.u32 r4444, [r9468+972]; +ld.shared.u32 r5040, [r9468+1944]; +ld.shared.u32 r3934, [r9468+2916]; +ld.shared.u32 r4530, [r9468+3888]; +ld.shared.u32 r5126, [r9468+4860]; +ld.shared.u32 r4020, [r9468+5832]; +ld.shared.u32 r4616, [r9468+6804]; +ld.shared.u32 r5212, [r9468+7776]; +ld.shared.u32 r3845, 
[r9468+8748]; +ld.shared.u32 r4441, [r9468+9720]; +ld.shared.u32 r5037, [r9468+10692]; +ld.shared.u32 r3931, [r9468+11664]; +ld.shared.u32 r4527, [r9468+12636]; +ld.shared.u32 r5123, [r9468+13608]; +ld.shared.u32 r4017, [r9468+14580]; +ld.shared.u32 r4613, [r9468+15552]; +ld.shared.u32 r5209, [r9468+16524]; +ld.shared.u32 r3846, [r9468+17496]; +ld.shared.u32 r4442, [r9468+18468]; +ld.shared.u32 r5038, [r9468+19440]; +ld.shared.u32 r3932, [r9468+20412]; +ld.shared.u32 r4528, [r9468+21384]; +ld.shared.u32 r5124, [r9468+22356]; +ld.shared.u32 r4018, [r9468+23328]; +ld.shared.u32 r4614, [r9468+24300]; +ld.shared.u32 r5210, [r9468+25272]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 
high, f894; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, 
r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ 
+sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 
r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ 
+sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r4943, {low, high}; 
+} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, 
r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 
r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, 
r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} 
+{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, 
r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, 
r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, 
r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 r5937, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 r5943, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 r5961, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 r5979, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 r5997, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 r6015, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 r6023, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 r6029, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ 
+mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 r6047, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 r6065, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 r6083, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 r6101, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 r6109, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 r6115, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 r6133, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 r6151, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 r6169, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, 
r6105; +} +{ +add.f16x2 r6187, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 r6195, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 r6201, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 r6219, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 r6237, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 r6255, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 r6273, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 r6281, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 r6287, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 r6305, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, 
r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 r6323, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 r6341, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 r6359, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 r6367, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 r6373, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 r6391, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 r6409, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 r6427, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 r6445, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; 
+mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 r6453, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 r6459, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 r6477, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 r6495, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 r6513, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 r6531, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 r6539, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 r6545, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 r6563, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; 
+} +{ +sub.f16x2 r6581, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 r6599, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 r6617, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 r6625, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 r6631, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 r6649, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 r6667, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 r6685, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 r6703, r6694, r6700; +} +mul.wide.u32 rd4, r9465, 795364315; +shr.u64 rd5, rd4, 32; +cvt.u32.u64 r9469, rd5; +sub.s32 r9470, r9465, r9469; +shr.u32 r9471, r9470, 1; +add.s32 r9472, r9471, r9469; +shr.u32 r9473, r9472, 4; +mul.lo.s32 
r9474, r9473, 27; +sub.s32 r9475, r9465, r9474; +shl.b32 r9476, r9475, 2; +add.s32 r9477, r9466, r9476; +cvt.rn.f32.u32 f900, r9473; +mul.f32 f901, f900, 0f3CD3D17E; +cos.approx.f32 f673, f901; +sin.approx.f32 f902, f901; +neg.f32 f674, f902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f673; +cvt.rn.f16.f32 high, f674; +mov.b32 r6706, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6711, {high, high}; +} +{ +mul.f16x2 r6713, r6029, r6711; +} +{ +fma.rn.f16x2 r6716, r6023, r6709, r6713; +} +{ +mul.f16x2 r6720, r6023, r6711; +} +{ +neg.f16x2 r6723, r6720; +} +{ +fma.rn.f16x2 r6725, r6029, r6709, r6723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6729, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6731, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6733, {low, high}; +} +{ +mul.f16x2 r6734, r6731, r6733; +} +{ +mul.f16x2 r6737, r6706, r6729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6740, {high, low}; +} +{ +fma.rn.f16x2 r6742, r6734, r6740, r6737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6742; +mov.b32 r6748, {high, high}; +} +{ +mul.f16x2 r6750, r6115, r6748; +} +{ +fma.rn.f16x2 r6753, r6109, r6746, r6750; +} +{ +mul.f16x2 r6757, r6109, r6748; +} +{ +neg.f16x2 r6760, r6757; +} +{ +fma.rn.f16x2 r6762, r6115, r6746, r6760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6766, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6768, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6770, {low, high}; +} +{ +mul.f16x2 r6771, r6768, r6770; +} +{ +mul.f16x2 r6774, r6742, r6766; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r6742; +mov.b32 r6777, {high, low}; +} +{ +fma.rn.f16x2 r6779, r6771, r6777, r6774; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6785, {high, high}; +} +{ +mul.f16x2 r6787, r6201, r6785; +} +{ +fma.rn.f16x2 r6790, r6195, r6783, r6787; +} +{ +mul.f16x2 r6794, r6195, r6785; +} +{ +neg.f16x2 r6797, r6794; +} +{ +fma.rn.f16x2 r6799, r6201, r6783, r6797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6803, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6805, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6807, {low, high}; +} +{ +mul.f16x2 r6808, r6805, r6807; +} +{ +mul.f16x2 r6811, r6779, r6803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6779; +mov.b32 r6814, {high, low}; +} +{ +fma.rn.f16x2 r6816, r6808, r6814, r6811; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6822, {high, high}; +} +{ +mul.f16x2 r6824, r6287, r6822; +} +{ +fma.rn.f16x2 r6827, r6281, r6820, r6824; +} +{ +mul.f16x2 r6831, r6281, r6822; +} +{ +neg.f16x2 r6834, r6831; +} +{ +fma.rn.f16x2 r6836, r6287, r6820, r6834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6840, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6842, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6844, {low, high}; +} +{ +mul.f16x2 r6845, r6842, r6844; +} +{ +mul.f16x2 r6848, r6816, r6840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6816; +mov.b32 r6851, {high, low}; +} +{ +fma.rn.f16x2 r6853, r6845, r6851, r6848; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6859, {high, 
high}; +} +{ +mul.f16x2 r6861, r6373, r6859; +} +{ +fma.rn.f16x2 r6864, r6367, r6857, r6861; +} +{ +mul.f16x2 r6868, r6367, r6859; +} +{ +neg.f16x2 r6871, r6868; +} +{ +fma.rn.f16x2 r6873, r6373, r6857, r6871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6877, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6879, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6881, {low, high}; +} +{ +mul.f16x2 r6882, r6879, r6881; +} +{ +mul.f16x2 r6885, r6853, r6877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6853; +mov.b32 r6888, {high, low}; +} +{ +fma.rn.f16x2 r6890, r6882, r6888, r6885; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6896, {high, high}; +} +{ +mul.f16x2 r6898, r6459, r6896; +} +{ +fma.rn.f16x2 r6901, r6453, r6894, r6898; +} +{ +mul.f16x2 r6905, r6453, r6896; +} +{ +neg.f16x2 r6908, r6905; +} +{ +fma.rn.f16x2 r6910, r6459, r6894, r6908; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6914, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6916, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6918, {low, high}; +} +{ +mul.f16x2 r6919, r6916, r6918; +} +{ +mul.f16x2 r6922, r6890, r6914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6890; +mov.b32 r6925, {high, low}; +} +{ +fma.rn.f16x2 r6927, r6919, r6925, r6922; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6933, {high, high}; +} +{ +mul.f16x2 r6935, r6545, r6933; +} +{ +fma.rn.f16x2 r6938, r6539, r6931, r6935; +} +{ +mul.f16x2 r6942, r6539, r6933; +} +{ +neg.f16x2 r6945, r6942; +} +{ +fma.rn.f16x2 r6947, r6545, r6931, r6945; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r6706; +mov.b32 r6951, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6953, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6955, {low, high}; +} +{ +mul.f16x2 r6956, r6953, r6955; +} +{ +mul.f16x2 r6959, r6927, r6951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6927; +mov.b32 r6962, {high, low}; +} +{ +fma.rn.f16x2 r6964, r6956, r6962, r6959; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6970, {high, high}; +} +{ +mul.f16x2 r6972, r6631, r6970; +} +{ +fma.rn.f16x2 r6975, r6625, r6968, r6972; +} +{ +mul.f16x2 r6979, r6625, r6970; +} +{ +neg.f16x2 r6982, r6979; +} +{ +fma.rn.f16x2 r6984, r6631, r6968, r6982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r6990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r6992, {low, high}; +} +{ +mul.f16x2 r6993, r6990, r6992; +} +{ +mul.f16x2 r6996, r6964, r6988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6964; +mov.b32 r6999, {high, low}; +} +{ +fma.rn.f16x2 r7001, r6993, r6999, r6996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7007, {high, high}; +} +{ +mul.f16x2 r7009, r5997, r7007; +} +{ +fma.rn.f16x2 r7012, r5961, r7005, r7009; +} +{ +mul.f16x2 r7016, r5961, r7007; +} +{ +neg.f16x2 r7019, r7016; +} +{ +fma.rn.f16x2 r7021, r5997, r7005, r7019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7029, {low, high}; +} +{ +mul.f16x2 r7030, 
r7027, r7029; +} +{ +mul.f16x2 r7033, r7001, r7025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7001; +mov.b32 r7036, {high, low}; +} +{ +fma.rn.f16x2 r7038, r7030, r7036, r7033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7044, {high, high}; +} +{ +mul.f16x2 r7046, r6083, r7044; +} +{ +fma.rn.f16x2 r7049, r6047, r7042, r7046; +} +{ +mul.f16x2 r7053, r6047, r7044; +} +{ +neg.f16x2 r7056, r7053; +} +{ +fma.rn.f16x2 r7058, r6083, r7042, r7056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7066, {low, high}; +} +{ +mul.f16x2 r7067, r7064, r7066; +} +{ +mul.f16x2 r7070, r7038, r7062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7038; +mov.b32 r7073, {high, low}; +} +{ +fma.rn.f16x2 r7075, r7067, r7073, r7070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7081, {high, high}; +} +{ +mul.f16x2 r7083, r6169, r7081; +} +{ +fma.rn.f16x2 r7086, r6133, r7079, r7083; +} +{ +mul.f16x2 r7090, r6133, r7081; +} +{ +neg.f16x2 r7093, r7090; +} +{ +fma.rn.f16x2 r7095, r6169, r7079, r7093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7103, {low, high}; +} +{ +mul.f16x2 r7104, r7101, r7103; +} +{ +mul.f16x2 r7107, r7075, r7099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7075; +mov.b32 r7110, {high, low}; +} +{ +fma.rn.f16x2 r7112, r7104, r7110, r7107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7116, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7118, {high, high}; +} +{ +mul.f16x2 r7120, r6255, r7118; +} +{ +fma.rn.f16x2 r7123, r6219, r7116, r7120; +} +{ +mul.f16x2 r7127, r6219, r7118; +} +{ +neg.f16x2 r7130, r7127; +} +{ +fma.rn.f16x2 r7132, r6255, r7116, r7130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7140, {low, high}; +} +{ +mul.f16x2 r7141, r7138, r7140; +} +{ +mul.f16x2 r7144, r7112, r7136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7112; +mov.b32 r7147, {high, low}; +} +{ +fma.rn.f16x2 r7149, r7141, r7147, r7144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7155, {high, high}; +} +{ +mul.f16x2 r7157, r6341, r7155; +} +{ +fma.rn.f16x2 r7160, r6305, r7153, r7157; +} +{ +mul.f16x2 r7164, r6305, r7155; +} +{ +neg.f16x2 r7167, r7164; +} +{ +fma.rn.f16x2 r7169, r6341, r7153, r7167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7177, {low, high}; +} +{ +mul.f16x2 r7178, r7175, r7177; +} +{ +mul.f16x2 r7181, r7149, r7173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7149; +mov.b32 r7184, {high, low}; +} +{ +fma.rn.f16x2 r7186, r7178, r7184, r7181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7192, {high, high}; +} +{ +mul.f16x2 r7194, r6427, r7192; +} +{ +fma.rn.f16x2 r7197, r6391, r7190, r7194; +} +{ +mul.f16x2 r7201, r6391, r7192; +} +{ +neg.f16x2 r7204, r7201; +} +{ 
+fma.rn.f16x2 r7206, r6427, r7190, r7204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7214, {low, high}; +} +{ +mul.f16x2 r7215, r7212, r7214; +} +{ +mul.f16x2 r7218, r7186, r7210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7186; +mov.b32 r7221, {high, low}; +} +{ +fma.rn.f16x2 r7223, r7215, r7221, r7218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7229, {high, high}; +} +{ +mul.f16x2 r7231, r6513, r7229; +} +{ +fma.rn.f16x2 r7234, r6477, r7227, r7231; +} +{ +mul.f16x2 r7238, r6477, r7229; +} +{ +neg.f16x2 r7241, r7238; +} +{ +fma.rn.f16x2 r7243, r6513, r7227, r7241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7251, {low, high}; +} +{ +mul.f16x2 r7252, r7249, r7251; +} +{ +mul.f16x2 r7255, r7223, r7247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7223; +mov.b32 r7258, {high, low}; +} +{ +fma.rn.f16x2 r7260, r7252, r7258, r7255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7266, {high, high}; +} +{ +mul.f16x2 r7268, r6599, r7266; +} +{ +fma.rn.f16x2 r7271, r6563, r7264, r7268; +} +{ +mul.f16x2 r7275, r6563, r7266; +} +{ +neg.f16x2 r7278, r7275; +} +{ +fma.rn.f16x2 r7280, r6599, r7264, r7278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7288, {low, high}; +} +{ +mul.f16x2 r7289, r7286, r7288; +} +{ +mul.f16x2 r7292, r7260, r7284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7260; +mov.b32 r7295, {high, low}; +} +{ +fma.rn.f16x2 r7297, r7289, r7295, r7292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7303, {high, high}; +} +{ +mul.f16x2 r7305, r6685, r7303; +} +{ +fma.rn.f16x2 r7308, r6649, r7301, r7305; +} +{ +mul.f16x2 r7312, r6649, r7303; +} +{ +neg.f16x2 r7315, r7312; +} +{ +fma.rn.f16x2 r7317, r6685, r7301, r7315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7325, {low, high}; +} +{ +mul.f16x2 r7326, r7323, r7325; +} +{ +mul.f16x2 r7329, r7297, r7321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7297; +mov.b32 r7332, {high, low}; +} +{ +fma.rn.f16x2 r7334, r7326, r7332, r7329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7340, {high, high}; +} +{ +mul.f16x2 r7342, r6015, r7340; +} +{ +fma.rn.f16x2 r7345, r5979, r7338, r7342; +} +{ +mul.f16x2 r7349, r5979, r7340; +} +{ +neg.f16x2 r7352, r7349; +} +{ +fma.rn.f16x2 r7354, r6015, r7338, r7352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7362, {low, high}; +} +{ +mul.f16x2 r7363, r7360, r7362; +} +{ +mul.f16x2 r7366, r7334, r7358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7334; +mov.b32 r7369, {high, low}; +} +{ +fma.rn.f16x2 r7371, 
r7363, r7369, r7366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7377, {high, high}; +} +{ +mul.f16x2 r7379, r6101, r7377; +} +{ +fma.rn.f16x2 r7382, r6065, r7375, r7379; +} +{ +mul.f16x2 r7386, r6065, r7377; +} +{ +neg.f16x2 r7389, r7386; +} +{ +fma.rn.f16x2 r7391, r6101, r7375, r7389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7399, {low, high}; +} +{ +mul.f16x2 r7400, r7397, r7399; +} +{ +mul.f16x2 r7403, r7371, r7395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7371; +mov.b32 r7406, {high, low}; +} +{ +fma.rn.f16x2 r7408, r7400, r7406, r7403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7414, {high, high}; +} +{ +mul.f16x2 r7416, r6187, r7414; +} +{ +fma.rn.f16x2 r7419, r6151, r7412, r7416; +} +{ +mul.f16x2 r7423, r6151, r7414; +} +{ +neg.f16x2 r7426, r7423; +} +{ +fma.rn.f16x2 r7428, r6187, r7412, r7426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7436, {low, high}; +} +{ +mul.f16x2 r7437, r7434, r7436; +} +{ +mul.f16x2 r7440, r7408, r7432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7408; +mov.b32 r7443, {high, low}; +} +{ +fma.rn.f16x2 r7445, r7437, r7443, r7440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7451, {high, high}; +} +{ +mul.f16x2 r7453, r6273, r7451; +} +{ +fma.rn.f16x2 r7456, 
r6237, r7449, r7453; +} +{ +mul.f16x2 r7460, r6237, r7451; +} +{ +neg.f16x2 r7463, r7460; +} +{ +fma.rn.f16x2 r7465, r6273, r7449, r7463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7473, {low, high}; +} +{ +mul.f16x2 r7474, r7471, r7473; +} +{ +mul.f16x2 r7477, r7445, r7469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7445; +mov.b32 r7480, {high, low}; +} +{ +fma.rn.f16x2 r7482, r7474, r7480, r7477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7488, {high, high}; +} +{ +mul.f16x2 r7490, r6359, r7488; +} +{ +fma.rn.f16x2 r7493, r6323, r7486, r7490; +} +{ +mul.f16x2 r7497, r6323, r7488; +} +{ +neg.f16x2 r7500, r7497; +} +{ +fma.rn.f16x2 r7502, r6359, r7486, r7500; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7506, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7508, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7510, {low, high}; +} +{ +mul.f16x2 r7511, r7508, r7510; +} +{ +mul.f16x2 r7514, r7482, r7506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7482; +mov.b32 r7517, {high, low}; +} +{ +fma.rn.f16x2 r7519, r7511, r7517, r7514; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7525, {high, high}; +} +{ +mul.f16x2 r7527, r6445, r7525; +} +{ +fma.rn.f16x2 r7530, r6409, r7523, r7527; +} +{ +mul.f16x2 r7534, r6409, r7525; +} +{ +neg.f16x2 r7537, r7534; +} +{ +fma.rn.f16x2 r7539, r6445, r7523, r7537; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7543, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r6706; +mov.b32 r7545, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7547, {low, high}; +} +{ +mul.f16x2 r7548, r7545, r7547; +} +{ +mul.f16x2 r7551, r7519, r7543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7519; +mov.b32 r7554, {high, low}; +} +{ +fma.rn.f16x2 r7556, r7548, r7554, r7551; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7562, {high, high}; +} +{ +mul.f16x2 r7564, r6531, r7562; +} +{ +fma.rn.f16x2 r7567, r6495, r7560, r7564; +} +{ +mul.f16x2 r7571, r6495, r7562; +} +{ +neg.f16x2 r7574, r7571; +} +{ +fma.rn.f16x2 r7576, r6531, r7560, r7574; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7580, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7582, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7584, {low, high}; +} +{ +mul.f16x2 r7585, r7582, r7584; +} +{ +mul.f16x2 r7588, r7556, r7580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7556; +mov.b32 r7591, {high, low}; +} +{ +fma.rn.f16x2 r7593, r7585, r7591, r7588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7593; +mov.b32 r7599, {high, high}; +} +{ +mul.f16x2 r7601, r6617, r7599; +} +{ +fma.rn.f16x2 r7604, r6581, r7597, r7601; +} +{ +mul.f16x2 r7608, r6581, r7599; +} +{ +neg.f16x2 r7611, r7608; +} +{ +fma.rn.f16x2 r7613, r6617, r7597, r7611; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7617, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r6706; +mov.b32 r7619, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f725; +cvt.rn.f16.f32 high, f726; +mov.b32 r7621, {low, high}; +} +{ +mul.f16x2 r7622, r7619, r7621; +} +{ +mul.f16x2 r7625, r7593, r7617; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r7593; +mov.b32 r7628, {high, low}; +} +{ +fma.rn.f16x2 r7630, r7622, r7628, r7625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r7630; +mov.b32 r7636, {high, high}; +} +{ +mul.f16x2 r7638, r6703, r7636; +} +{ +fma.rn.f16x2 r7641, r6667, r7634, r7638; +} +{ +mul.f16x2 r7645, r6667, r7636; +} +{ +neg.f16x2 r7648, r7645; +} +{ +fma.rn.f16x2 r7650, r6703, r7634, r7648; +} +barrier.sync 0; +mad.lo.s32 r9478, r9473, 2916, r9477; +st.shared.u32 [r9478], r5937; +st.shared.u32 [r9478+108], r6716; +st.shared.u32 [r9478+216], r6753; +st.shared.u32 [r9478+324], r6790; +st.shared.u32 [r9478+432], r6827; +st.shared.u32 [r9478+540], r6864; +st.shared.u32 [r9478+648], r6901; +st.shared.u32 [r9478+756], r6938; +st.shared.u32 [r9478+864], r6975; +st.shared.u32 [r9478+972], r7012; +st.shared.u32 [r9478+1080], r7049; +st.shared.u32 [r9478+1188], r7086; +st.shared.u32 [r9478+1296], r7123; +st.shared.u32 [r9478+1404], r7160; +st.shared.u32 [r9478+1512], r7197; +st.shared.u32 [r9478+1620], r7234; +st.shared.u32 [r9478+1728], r7271; +st.shared.u32 [r9478+1836], r7308; +st.shared.u32 [r9478+1944], r7345; +st.shared.u32 [r9478+2052], r7382; +st.shared.u32 [r9478+2160], r7419; +st.shared.u32 [r9478+2268], r7456; +st.shared.u32 [r9478+2376], r7493; +st.shared.u32 [r9478+2484], r7530; +st.shared.u32 [r9478+2592], r7567; +st.shared.u32 [r9478+2700], r7604; +st.shared.u32 [r9478+2808], r7641; +barrier.sync 0; +ld.shared.u32 r7677, [r9468]; +ld.shared.u32 r8273, [r9468+972]; +ld.shared.u32 r8869, [r9468+1944]; +ld.shared.u32 r7763, [r9468+2916]; +ld.shared.u32 r8359, [r9468+3888]; +ld.shared.u32 r8955, [r9468+4860]; +ld.shared.u32 r7849, [r9468+5832]; +ld.shared.u32 r8445, [r9468+6804]; +ld.shared.u32 r9041, [r9468+7776]; +ld.shared.u32 r7674, [r9468+8748]; +ld.shared.u32 r8270, [r9468+9720]; +ld.shared.u32 r8866, [r9468+10692]; +ld.shared.u32 r7760, [r9468+11664]; +ld.shared.u32 
r8356, [r9468+12636]; +ld.shared.u32 r8952, [r9468+13608]; +ld.shared.u32 r7846, [r9468+14580]; +ld.shared.u32 r8442, [r9468+15552]; +ld.shared.u32 r9038, [r9468+16524]; +ld.shared.u32 r7675, [r9468+17496]; +ld.shared.u32 r8271, [r9468+18468]; +ld.shared.u32 r8867, [r9468+19440]; +ld.shared.u32 r7761, [r9468+20412]; +ld.shared.u32 r8357, [r9468+21384]; +ld.shared.u32 r8953, [r9468+22356]; +ld.shared.u32 r7847, [r9468+23328]; +ld.shared.u32 r8443, [r9468+24300]; +ld.shared.u32 r9039, [r9468+25272]; +barrier.sync 0; +st.shared.u32 [r9478], r5943; +st.shared.u32 [r9478+108], r6725; +st.shared.u32 [r9478+216], r6762; +st.shared.u32 [r9478+324], r6799; +st.shared.u32 [r9478+432], r6836; +st.shared.u32 [r9478+540], r6873; +st.shared.u32 [r9478+648], r6910; +st.shared.u32 [r9478+756], r6947; +st.shared.u32 [r9478+864], r6984; +st.shared.u32 [r9478+972], r7021; +st.shared.u32 [r9478+1080], r7058; +st.shared.u32 [r9478+1188], r7095; +st.shared.u32 [r9478+1296], r7132; +st.shared.u32 [r9478+1404], r7169; +st.shared.u32 [r9478+1512], r7206; +st.shared.u32 [r9478+1620], r7243; +st.shared.u32 [r9478+1728], r7280; +st.shared.u32 [r9478+1836], r7317; +st.shared.u32 [r9478+1944], r7354; +st.shared.u32 [r9478+2052], r7391; +st.shared.u32 [r9478+2160], r7428; +st.shared.u32 [r9478+2268], r7465; +st.shared.u32 [r9478+2376], r7502; +st.shared.u32 [r9478+2484], r7539; +st.shared.u32 [r9478+2592], r7576; +st.shared.u32 [r9478+2700], r7613; +st.shared.u32 [r9478+2808], r7650; +barrier.sync 0; +ld.shared.u32 r7683, [r9468]; +ld.shared.u32 r8279, [r9468+972]; +ld.shared.u32 r8875, [r9468+1944]; +ld.shared.u32 r7769, [r9468+2916]; +ld.shared.u32 r8365, [r9468+3888]; +ld.shared.u32 r8961, [r9468+4860]; +ld.shared.u32 r7855, [r9468+5832]; +ld.shared.u32 r8451, [r9468+6804]; +ld.shared.u32 r9047, [r9468+7776]; +ld.shared.u32 r7680, [r9468+8748]; +ld.shared.u32 r8276, [r9468+9720]; +ld.shared.u32 r8872, [r9468+10692]; +ld.shared.u32 r7766, [r9468+11664]; +ld.shared.u32 r8362, [r9468+12636]; 
+ld.shared.u32 r8958, [r9468+13608]; +ld.shared.u32 r7852, [r9468+14580]; +ld.shared.u32 r8448, [r9468+15552]; +ld.shared.u32 r9044, [r9468+16524]; +ld.shared.u32 r7681, [r9468+17496]; +ld.shared.u32 r8277, [r9468+18468]; +ld.shared.u32 r8873, [r9468+19440]; +ld.shared.u32 r7767, [r9468+20412]; +ld.shared.u32 r8363, [r9468+21384]; +ld.shared.u32 r8959, [r9468+22356]; +ld.shared.u32 r7853, [r9468+23328]; +ld.shared.u32 r8449, [r9468+24300]; +ld.shared.u32 r9045, [r9468+25272]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7671, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7672, {low, high}; +} +{ +add.f16x2 r7673, r7674, r7675; +} +{ +add.f16x2 r7676, r7677, r7673; +} +{ +add.f16x2 r7679, r7680, r7681; +} +{ +add.f16x2 r7682, r7683, r7679; +} +{ +add.f16x2 r7685, r7674, r7675; +} +{ +mul.f16x2 r7688, r7685, r7671; +} +{ +add.f16x2 r7691, r7677, r7688; +} +{ +sub.f16x2 r7694, r7680, r7681; +} +{ +mul.f16x2 r7697, r7694, r7672; +} +{ +add.f16x2 r7700, r7691, r7697; +} +{ +add.f16x2 r7703, r7674, r7675; +} +{ +mul.f16x2 r7706, r7703, r7671; +} +{ +add.f16x2 r7709, r7677, r7706; +} +{ +sub.f16x2 r7712, r7680, r7681; +} +{ +mul.f16x2 r7715, r7712, r7672; +} +{ +sub.f16x2 r7718, r7709, r7715; +} +{ +add.f16x2 r7721, r7680, r7681; +} +{ +mul.f16x2 r7724, r7721, r7671; +} +{ +add.f16x2 r7727, r7683, r7724; +} +{ +sub.f16x2 r7730, r7674, r7675; +} +{ +mul.f16x2 r7733, r7730, r7672; +} +{ +sub.f16x2 r7736, r7727, r7733; +} +{ +add.f16x2 r7739, r7680, r7681; +} +{ +mul.f16x2 r7742, r7739, r7671; +} +{ +add.f16x2 r7745, r7683, r7742; +} +{ +sub.f16x2 r7748, r7674, r7675; +} +{ +mul.f16x2 r7751, r7748, r7672; +} +{ +add.f16x2 r7754, r7745, r7751; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7757, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7758, {low, high}; +} +{ 
+add.f16x2 r7759, r7760, r7761; +} +{ +add.f16x2 r7762, r7763, r7759; +} +{ +add.f16x2 r7765, r7766, r7767; +} +{ +add.f16x2 r7768, r7769, r7765; +} +{ +add.f16x2 r7771, r7760, r7761; +} +{ +mul.f16x2 r7774, r7771, r7757; +} +{ +add.f16x2 r7777, r7763, r7774; +} +{ +sub.f16x2 r7780, r7766, r7767; +} +{ +mul.f16x2 r7783, r7780, r7758; +} +{ +add.f16x2 r7786, r7777, r7783; +} +{ +add.f16x2 r7789, r7760, r7761; +} +{ +mul.f16x2 r7792, r7789, r7757; +} +{ +add.f16x2 r7795, r7763, r7792; +} +{ +sub.f16x2 r7798, r7766, r7767; +} +{ +mul.f16x2 r7801, r7798, r7758; +} +{ +sub.f16x2 r7804, r7795, r7801; +} +{ +add.f16x2 r7807, r7766, r7767; +} +{ +mul.f16x2 r7810, r7807, r7757; +} +{ +add.f16x2 r7813, r7769, r7810; +} +{ +sub.f16x2 r7816, r7760, r7761; +} +{ +mul.f16x2 r7819, r7816, r7758; +} +{ +sub.f16x2 r7822, r7813, r7819; +} +{ +add.f16x2 r7825, r7766, r7767; +} +{ +mul.f16x2 r7828, r7825, r7757; +} +{ +add.f16x2 r7831, r7769, r7828; +} +{ +sub.f16x2 r7834, r7760, r7761; +} +{ +mul.f16x2 r7837, r7834, r7758; +} +{ +add.f16x2 r7840, r7831, r7837; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r7843, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r7844, {low, high}; +} +{ +add.f16x2 r7845, r7846, r7847; +} +{ +add.f16x2 r7848, r7849, r7845; +} +{ +add.f16x2 r7851, r7852, r7853; +} +{ +add.f16x2 r7854, r7855, r7851; +} +{ +add.f16x2 r7857, r7846, r7847; +} +{ +mul.f16x2 r7860, r7857, r7843; +} +{ +add.f16x2 r7863, r7849, r7860; +} +{ +sub.f16x2 r7866, r7852, r7853; +} +{ +mul.f16x2 r7869, r7866, r7844; +} +{ +add.f16x2 r7872, r7863, r7869; +} +{ +add.f16x2 r7875, r7846, r7847; +} +{ +mul.f16x2 r7878, r7875, r7843; +} +{ +add.f16x2 r7881, r7849, r7878; +} +{ +sub.f16x2 r7884, r7852, r7853; +} +{ +mul.f16x2 r7887, r7884, r7844; +} +{ +sub.f16x2 r7890, r7881, r7887; +} +{ +add.f16x2 r7893, r7852, r7853; +} +{ +mul.f16x2 r7896, r7893, r7843; +} +{ +add.f16x2 r7899, r7855, 
r7896; +} +{ +sub.f16x2 r7902, r7846, r7847; +} +{ +mul.f16x2 r7905, r7902, r7844; +} +{ +sub.f16x2 r7908, r7899, r7905; +} +{ +add.f16x2 r7911, r7852, r7853; +} +{ +mul.f16x2 r7914, r7911, r7843; +} +{ +add.f16x2 r7917, r7855, r7914; +} +{ +sub.f16x2 r7920, r7846, r7847; +} +{ +mul.f16x2 r7923, r7920, r7844; +} +{ +add.f16x2 r7926, r7917, r7923; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r7929, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r7930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r7931, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r7932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r7935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r7936, {low, high}; +} +{ +mul.f16x2 r7945, r7786, r7929; +} +{ +mul.f16x2 r7948, r7822, r7930; +} +{ +sub.f16x2 r7951, r7945, r7948; +} +{ +mul.f16x2 r7954, r7786, r7930; +} +{ +fma.rn.f16x2 r7957, r7822, r7929, r7954; +} +{ +mul.f16x2 r7961, r7872, r7931; +} +{ +mul.f16x2 r7964, r7908, r7932; +} +{ +sub.f16x2 r7967, r7961, r7964; +} +{ +mul.f16x2 r7970, r7872, r7932; +} +{ +fma.rn.f16x2 r7973, r7908, r7931, r7970; +} +{ +mul.f16x2 r7977, r7804, r7931; +} +{ +mul.f16x2 r7980, r7840, r7932; +} +{ +sub.f16x2 r7983, r7977, r7980; +} +{ +mul.f16x2 r7986, r7804, r7932; +} +{ +fma.rn.f16x2 r7989, r7840, r7931, r7986; +} +{ +mul.f16x2 r7993, r7890, r7935; +} +{ +mul.f16x2 r7996, r7926, r7936; +} +{ +sub.f16x2 r7999, r7993, r7996; +} +{ +mul.f16x2 r8002, r7890, r7936; +} +{ +fma.rn.f16x2 r8005, r7926, r7935, r8002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; 
+cvt.rn.f16.f32 high, f896; +mov.b32 r8010, {low, high}; +} +{ +add.f16x2 r8011, r7762, r7848; +} +{ +add.f16x2 %0, r7676, r8011; +} +{ +add.f16x2 r8017, r7768, r7854; +} +{ +add.f16x2 %1, r7682, r8017; +} +{ +add.f16x2 r8023, r7762, r7848; +} +{ +mul.f16x2 r8026, r8023, r8009; +} +{ +add.f16x2 r8029, r7676, r8026; +} +{ +sub.f16x2 r8032, r7768, r7854; +} +{ +mul.f16x2 r8035, r8032, r8010; +} +{ +add.f16x2 %18, r8029, r8035; +} +{ +add.f16x2 r8041, r7762, r7848; +} +{ +mul.f16x2 r8044, r8041, r8009; +} +{ +add.f16x2 r8047, r7676, r8044; +} +{ +sub.f16x2 r8050, r7768, r7854; +} +{ +mul.f16x2 r8053, r8050, r8010; +} +{ +sub.f16x2 %36, r8047, r8053; +} +{ +add.f16x2 r8059, r7768, r7854; +} +{ +mul.f16x2 r8062, r8059, r8009; +} +{ +add.f16x2 r8065, r7682, r8062; +} +{ +sub.f16x2 r8068, r7762, r7848; +} +{ +mul.f16x2 r8071, r8068, r8010; +} +{ +sub.f16x2 %19, r8065, r8071; +} +{ +add.f16x2 r8077, r7768, r7854; +} +{ +mul.f16x2 r8080, r8077, r8009; +} +{ +add.f16x2 r8083, r7682, r8080; +} +{ +sub.f16x2 r8086, r7762, r7848; +} +{ +mul.f16x2 r8089, r8086, r8010; +} +{ +add.f16x2 %37, r8083, r8089; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8096, {low, high}; +} +{ +add.f16x2 r8097, r7951, r7967; +} +{ +add.f16x2 %6, r7700, r8097; +} +{ +add.f16x2 r8103, r7957, r7973; +} +{ +add.f16x2 %7, r7736, r8103; +} +{ +add.f16x2 r8109, r7951, r7967; +} +{ +mul.f16x2 r8112, r8109, r8095; +} +{ +add.f16x2 r8115, r7700, r8112; +} +{ +sub.f16x2 r8118, r7957, r7973; +} +{ +mul.f16x2 r8121, r8118, r8096; +} +{ +add.f16x2 %24, r8115, r8121; +} +{ +add.f16x2 r8127, r7951, r7967; +} +{ +mul.f16x2 r8130, r8127, r8095; +} +{ +add.f16x2 r8133, r7700, r8130; +} +{ +sub.f16x2 r8136, r7957, r7973; +} +{ +mul.f16x2 r8139, r8136, r8096; +} +{ +sub.f16x2 %42, r8133, r8139; +} +{ +add.f16x2 r8145, r7957, r7973; +} +{ +mul.f16x2 r8148, r8145, 
r8095; +} +{ +add.f16x2 r8151, r7736, r8148; +} +{ +sub.f16x2 r8154, r7951, r7967; +} +{ +mul.f16x2 r8157, r8154, r8096; +} +{ +sub.f16x2 %25, r8151, r8157; +} +{ +add.f16x2 r8163, r7957, r7973; +} +{ +mul.f16x2 r8166, r8163, r8095; +} +{ +add.f16x2 r8169, r7736, r8166; +} +{ +sub.f16x2 r8172, r7951, r7967; +} +{ +mul.f16x2 r8175, r8172, r8096; +} +{ +add.f16x2 %43, r8169, r8175; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8181, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8182, {low, high}; +} +{ +add.f16x2 r8183, r7983, r7999; +} +{ +add.f16x2 %12, r7718, r8183; +} +{ +add.f16x2 r8189, r7989, r8005; +} +{ +add.f16x2 %13, r7754, r8189; +} +{ +add.f16x2 r8195, r7983, r7999; +} +{ +mul.f16x2 r8198, r8195, r8181; +} +{ +add.f16x2 r8201, r7718, r8198; +} +{ +sub.f16x2 r8204, r7989, r8005; +} +{ +mul.f16x2 r8207, r8204, r8182; +} +{ +add.f16x2 %30, r8201, r8207; +} +{ +add.f16x2 r8213, r7983, r7999; +} +{ +mul.f16x2 r8216, r8213, r8181; +} +{ +add.f16x2 r8219, r7718, r8216; +} +{ +sub.f16x2 r8222, r7989, r8005; +} +{ +mul.f16x2 r8225, r8222, r8182; +} +{ +sub.f16x2 %48, r8219, r8225; +} +{ +add.f16x2 r8231, r7989, r8005; +} +{ +mul.f16x2 r8234, r8231, r8181; +} +{ +add.f16x2 r8237, r7754, r8234; +} +{ +sub.f16x2 r8240, r7983, r7999; +} +{ +mul.f16x2 r8243, r8240, r8182; +} +{ +sub.f16x2 %31, r8237, r8243; +} +{ +add.f16x2 r8249, r7989, r8005; +} +{ +mul.f16x2 r8252, r8249, r8181; +} +{ +add.f16x2 r8255, r7754, r8252; +} +{ +sub.f16x2 r8258, r7983, r7999; +} +{ +mul.f16x2 r8261, r8258, r8182; +} +{ +add.f16x2 %49, r8255, r8261; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8268, {low, high}; +} +{ +add.f16x2 r8269, r8270, r8271; +} +{ +add.f16x2 r8272, r8273, r8269; +} +{ +add.f16x2 r8275, r8276, r8277; +} 
+{ +add.f16x2 r8278, r8279, r8275; +} +{ +add.f16x2 r8281, r8270, r8271; +} +{ +mul.f16x2 r8284, r8281, r8267; +} +{ +add.f16x2 r8287, r8273, r8284; +} +{ +sub.f16x2 r8290, r8276, r8277; +} +{ +mul.f16x2 r8293, r8290, r8268; +} +{ +add.f16x2 r8296, r8287, r8293; +} +{ +add.f16x2 r8299, r8270, r8271; +} +{ +mul.f16x2 r8302, r8299, r8267; +} +{ +add.f16x2 r8305, r8273, r8302; +} +{ +sub.f16x2 r8308, r8276, r8277; +} +{ +mul.f16x2 r8311, r8308, r8268; +} +{ +sub.f16x2 r8314, r8305, r8311; +} +{ +add.f16x2 r8317, r8276, r8277; +} +{ +mul.f16x2 r8320, r8317, r8267; +} +{ +add.f16x2 r8323, r8279, r8320; +} +{ +sub.f16x2 r8326, r8270, r8271; +} +{ +mul.f16x2 r8329, r8326, r8268; +} +{ +sub.f16x2 r8332, r8323, r8329; +} +{ +add.f16x2 r8335, r8276, r8277; +} +{ +mul.f16x2 r8338, r8335, r8267; +} +{ +add.f16x2 r8341, r8279, r8338; +} +{ +sub.f16x2 r8344, r8270, r8271; +} +{ +mul.f16x2 r8347, r8344, r8268; +} +{ +add.f16x2 r8350, r8341, r8347; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8353, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8354, {low, high}; +} +{ +add.f16x2 r8355, r8356, r8357; +} +{ +add.f16x2 r8358, r8359, r8355; +} +{ +add.f16x2 r8361, r8362, r8363; +} +{ +add.f16x2 r8364, r8365, r8361; +} +{ +add.f16x2 r8367, r8356, r8357; +} +{ +mul.f16x2 r8370, r8367, r8353; +} +{ +add.f16x2 r8373, r8359, r8370; +} +{ +sub.f16x2 r8376, r8362, r8363; +} +{ +mul.f16x2 r8379, r8376, r8354; +} +{ +add.f16x2 r8382, r8373, r8379; +} +{ +add.f16x2 r8385, r8356, r8357; +} +{ +mul.f16x2 r8388, r8385, r8353; +} +{ +add.f16x2 r8391, r8359, r8388; +} +{ +sub.f16x2 r8394, r8362, r8363; +} +{ +mul.f16x2 r8397, r8394, r8354; +} +{ +sub.f16x2 r8400, r8391, r8397; +} +{ +add.f16x2 r8403, r8362, r8363; +} +{ +mul.f16x2 r8406, r8403, r8353; +} +{ +add.f16x2 r8409, r8365, r8406; +} +{ +sub.f16x2 r8412, r8356, r8357; +} +{ +mul.f16x2 r8415, r8412, r8354; +} +{ +sub.f16x2 r8418, r8409, 
r8415; +} +{ +add.f16x2 r8421, r8362, r8363; +} +{ +mul.f16x2 r8424, r8421, r8353; +} +{ +add.f16x2 r8427, r8365, r8424; +} +{ +sub.f16x2 r8430, r8356, r8357; +} +{ +mul.f16x2 r8433, r8430, r8354; +} +{ +add.f16x2 r8436, r8427, r8433; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8439, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8440, {low, high}; +} +{ +add.f16x2 r8441, r8442, r8443; +} +{ +add.f16x2 r8444, r8445, r8441; +} +{ +add.f16x2 r8447, r8448, r8449; +} +{ +add.f16x2 r8450, r8451, r8447; +} +{ +add.f16x2 r8453, r8442, r8443; +} +{ +mul.f16x2 r8456, r8453, r8439; +} +{ +add.f16x2 r8459, r8445, r8456; +} +{ +sub.f16x2 r8462, r8448, r8449; +} +{ +mul.f16x2 r8465, r8462, r8440; +} +{ +add.f16x2 r8468, r8459, r8465; +} +{ +add.f16x2 r8471, r8442, r8443; +} +{ +mul.f16x2 r8474, r8471, r8439; +} +{ +add.f16x2 r8477, r8445, r8474; +} +{ +sub.f16x2 r8480, r8448, r8449; +} +{ +mul.f16x2 r8483, r8480, r8440; +} +{ +sub.f16x2 r8486, r8477, r8483; +} +{ +add.f16x2 r8489, r8448, r8449; +} +{ +mul.f16x2 r8492, r8489, r8439; +} +{ +add.f16x2 r8495, r8451, r8492; +} +{ +sub.f16x2 r8498, r8442, r8443; +} +{ +mul.f16x2 r8501, r8498, r8440; +} +{ +sub.f16x2 r8504, r8495, r8501; +} +{ +add.f16x2 r8507, r8448, r8449; +} +{ +mul.f16x2 r8510, r8507, r8439; +} +{ +add.f16x2 r8513, r8451, r8510; +} +{ +sub.f16x2 r8516, r8442, r8443; +} +{ +mul.f16x2 r8519, r8516, r8440; +} +{ +add.f16x2 r8522, r8513, r8519; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r8525, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r8526, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r8527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r8528, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r8531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r8532, {low, high}; +} +{ +mul.f16x2 r8541, r8382, r8525; +} +{ +mul.f16x2 r8544, r8418, r8526; +} +{ +sub.f16x2 r8547, r8541, r8544; +} +{ +mul.f16x2 r8550, r8382, r8526; +} +{ +fma.rn.f16x2 r8553, r8418, r8525, r8550; +} +{ +mul.f16x2 r8557, r8468, r8527; +} +{ +mul.f16x2 r8560, r8504, r8528; +} +{ +sub.f16x2 r8563, r8557, r8560; +} +{ +mul.f16x2 r8566, r8468, r8528; +} +{ +fma.rn.f16x2 r8569, r8504, r8527, r8566; +} +{ +mul.f16x2 r8573, r8400, r8527; +} +{ +mul.f16x2 r8576, r8436, r8528; +} +{ +sub.f16x2 r8579, r8573, r8576; +} +{ +mul.f16x2 r8582, r8400, r8528; +} +{ +fma.rn.f16x2 r8585, r8436, r8527, r8582; +} +{ +mul.f16x2 r8589, r8486, r8531; +} +{ +mul.f16x2 r8592, r8522, r8532; +} +{ +sub.f16x2 r8595, r8589, r8592; +} +{ +mul.f16x2 r8598, r8486, r8532; +} +{ +fma.rn.f16x2 r8601, r8522, r8531, r8598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8605, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8606, {low, high}; +} +{ +add.f16x2 r8607, r8358, r8444; +} +{ +add.f16x2 %2, r8272, r8607; +} +{ +add.f16x2 r8613, r8364, r8450; +} +{ +add.f16x2 %3, r8278, r8613; +} +{ +add.f16x2 r8619, r8358, r8444; +} +{ +mul.f16x2 r8622, r8619, r8605; +} +{ +add.f16x2 r8625, r8272, r8622; +} +{ +sub.f16x2 r8628, r8364, r8450; +} +{ +mul.f16x2 r8631, r8628, r8606; +} +{ +add.f16x2 %20, r8625, r8631; +} +{ +add.f16x2 r8637, r8358, r8444; +} +{ +mul.f16x2 r8640, r8637, r8605; +} +{ +add.f16x2 r8643, r8272, r8640; +} +{ +sub.f16x2 r8646, r8364, r8450; +} +{ +mul.f16x2 r8649, r8646, r8606; +} +{ +sub.f16x2 %38, r8643, r8649; +} +{ +add.f16x2 r8655, r8364, r8450; +} +{ +mul.f16x2 r8658, r8655, r8605; +} +{ +add.f16x2 r8661, r8278, r8658; +} +{ +sub.f16x2 r8664, r8358, r8444; +} +{ +mul.f16x2 r8667, 
r8664, r8606; +} +{ +sub.f16x2 %21, r8661, r8667; +} +{ +add.f16x2 r8673, r8364, r8450; +} +{ +mul.f16x2 r8676, r8673, r8605; +} +{ +add.f16x2 r8679, r8278, r8676; +} +{ +sub.f16x2 r8682, r8358, r8444; +} +{ +mul.f16x2 r8685, r8682, r8606; +} +{ +add.f16x2 %39, r8679, r8685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8692, {low, high}; +} +{ +add.f16x2 r8693, r8547, r8563; +} +{ +add.f16x2 %8, r8296, r8693; +} +{ +add.f16x2 r8699, r8553, r8569; +} +{ +add.f16x2 %9, r8332, r8699; +} +{ +add.f16x2 r8705, r8547, r8563; +} +{ +mul.f16x2 r8708, r8705, r8691; +} +{ +add.f16x2 r8711, r8296, r8708; +} +{ +sub.f16x2 r8714, r8553, r8569; +} +{ +mul.f16x2 r8717, r8714, r8692; +} +{ +add.f16x2 %26, r8711, r8717; +} +{ +add.f16x2 r8723, r8547, r8563; +} +{ +mul.f16x2 r8726, r8723, r8691; +} +{ +add.f16x2 r8729, r8296, r8726; +} +{ +sub.f16x2 r8732, r8553, r8569; +} +{ +mul.f16x2 r8735, r8732, r8692; +} +{ +sub.f16x2 %44, r8729, r8735; +} +{ +add.f16x2 r8741, r8553, r8569; +} +{ +mul.f16x2 r8744, r8741, r8691; +} +{ +add.f16x2 r8747, r8332, r8744; +} +{ +sub.f16x2 r8750, r8547, r8563; +} +{ +mul.f16x2 r8753, r8750, r8692; +} +{ +sub.f16x2 %27, r8747, r8753; +} +{ +add.f16x2 r8759, r8553, r8569; +} +{ +mul.f16x2 r8762, r8759, r8691; +} +{ +add.f16x2 r8765, r8332, r8762; +} +{ +sub.f16x2 r8768, r8547, r8563; +} +{ +mul.f16x2 r8771, r8768, r8692; +} +{ +add.f16x2 %45, r8765, r8771; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8777, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8778, {low, high}; +} +{ +add.f16x2 r8779, r8579, r8595; +} +{ +add.f16x2 %14, r8314, r8779; +} +{ +add.f16x2 r8785, r8585, r8601; +} +{ +add.f16x2 %15, r8350, r8785; +} +{ +add.f16x2 r8791, r8579, r8595; +} +{ +mul.f16x2 r8794, r8791, r8777; 
+} +{ +add.f16x2 r8797, r8314, r8794; +} +{ +sub.f16x2 r8800, r8585, r8601; +} +{ +mul.f16x2 r8803, r8800, r8778; +} +{ +add.f16x2 %32, r8797, r8803; +} +{ +add.f16x2 r8809, r8579, r8595; +} +{ +mul.f16x2 r8812, r8809, r8777; +} +{ +add.f16x2 r8815, r8314, r8812; +} +{ +sub.f16x2 r8818, r8585, r8601; +} +{ +mul.f16x2 r8821, r8818, r8778; +} +{ +sub.f16x2 %50, r8815, r8821; +} +{ +add.f16x2 r8827, r8585, r8601; +} +{ +mul.f16x2 r8830, r8827, r8777; +} +{ +add.f16x2 r8833, r8350, r8830; +} +{ +sub.f16x2 r8836, r8579, r8595; +} +{ +mul.f16x2 r8839, r8836, r8778; +} +{ +sub.f16x2 %33, r8833, r8839; +} +{ +add.f16x2 r8845, r8585, r8601; +} +{ +mul.f16x2 r8848, r8845, r8777; +} +{ +add.f16x2 r8851, r8350, r8848; +} +{ +sub.f16x2 r8854, r8579, r8595; +} +{ +mul.f16x2 r8857, r8854, r8778; +} +{ +add.f16x2 %51, r8851, r8857; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8863, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8864, {low, high}; +} +{ +add.f16x2 r8865, r8866, r8867; +} +{ +add.f16x2 r8868, r8869, r8865; +} +{ +add.f16x2 r8871, r8872, r8873; +} +{ +add.f16x2 r8874, r8875, r8871; +} +{ +add.f16x2 r8877, r8866, r8867; +} +{ +mul.f16x2 r8880, r8877, r8863; +} +{ +add.f16x2 r8883, r8869, r8880; +} +{ +sub.f16x2 r8886, r8872, r8873; +} +{ +mul.f16x2 r8889, r8886, r8864; +} +{ +add.f16x2 r8892, r8883, r8889; +} +{ +add.f16x2 r8895, r8866, r8867; +} +{ +mul.f16x2 r8898, r8895, r8863; +} +{ +add.f16x2 r8901, r8869, r8898; +} +{ +sub.f16x2 r8904, r8872, r8873; +} +{ +mul.f16x2 r8907, r8904, r8864; +} +{ +sub.f16x2 r8910, r8901, r8907; +} +{ +add.f16x2 r8913, r8872, r8873; +} +{ +mul.f16x2 r8916, r8913, r8863; +} +{ +add.f16x2 r8919, r8875, r8916; +} +{ +sub.f16x2 r8922, r8866, r8867; +} +{ +mul.f16x2 r8925, r8922, r8864; +} +{ +sub.f16x2 r8928, r8919, r8925; +} +{ +add.f16x2 r8931, r8872, r8873; +} +{ +mul.f16x2 r8934, r8931, r8863; +} +{ +add.f16x2 r8937, r8875, 
r8934; +} +{ +sub.f16x2 r8940, r8866, r8867; +} +{ +mul.f16x2 r8943, r8940, r8864; +} +{ +add.f16x2 r8946, r8937, r8943; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r8949, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r8950, {low, high}; +} +{ +add.f16x2 r8951, r8952, r8953; +} +{ +add.f16x2 r8954, r8955, r8951; +} +{ +add.f16x2 r8957, r8958, r8959; +} +{ +add.f16x2 r8960, r8961, r8957; +} +{ +add.f16x2 r8963, r8952, r8953; +} +{ +mul.f16x2 r8966, r8963, r8949; +} +{ +add.f16x2 r8969, r8955, r8966; +} +{ +sub.f16x2 r8972, r8958, r8959; +} +{ +mul.f16x2 r8975, r8972, r8950; +} +{ +add.f16x2 r8978, r8969, r8975; +} +{ +add.f16x2 r8981, r8952, r8953; +} +{ +mul.f16x2 r8984, r8981, r8949; +} +{ +add.f16x2 r8987, r8955, r8984; +} +{ +sub.f16x2 r8990, r8958, r8959; +} +{ +mul.f16x2 r8993, r8990, r8950; +} +{ +sub.f16x2 r8996, r8987, r8993; +} +{ +add.f16x2 r8999, r8958, r8959; +} +{ +mul.f16x2 r9002, r8999, r8949; +} +{ +add.f16x2 r9005, r8961, r9002; +} +{ +sub.f16x2 r9008, r8952, r8953; +} +{ +mul.f16x2 r9011, r9008, r8950; +} +{ +sub.f16x2 r9014, r9005, r9011; +} +{ +add.f16x2 r9017, r8958, r8959; +} +{ +mul.f16x2 r9020, r9017, r8949; +} +{ +add.f16x2 r9023, r8961, r9020; +} +{ +sub.f16x2 r9026, r8952, r8953; +} +{ +mul.f16x2 r9029, r9026, r8950; +} +{ +add.f16x2 r9032, r9023, r9029; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9035, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9036, {low, high}; +} +{ +add.f16x2 r9037, r9038, r9039; +} +{ +add.f16x2 r9040, r9041, r9037; +} +{ +add.f16x2 r9043, r9044, r9045; +} +{ +add.f16x2 r9046, r9047, r9043; +} +{ +add.f16x2 r9049, r9038, r9039; +} +{ +mul.f16x2 r9052, r9049, r9035; +} +{ +add.f16x2 r9055, r9041, r9052; +} +{ +sub.f16x2 r9058, r9044, r9045; +} +{ +mul.f16x2 r9061, r9058, r9036; +} +{ +add.f16x2 r9064, 
r9055, r9061; +} +{ +add.f16x2 r9067, r9038, r9039; +} +{ +mul.f16x2 r9070, r9067, r9035; +} +{ +add.f16x2 r9073, r9041, r9070; +} +{ +sub.f16x2 r9076, r9044, r9045; +} +{ +mul.f16x2 r9079, r9076, r9036; +} +{ +sub.f16x2 r9082, r9073, r9079; +} +{ +add.f16x2 r9085, r9044, r9045; +} +{ +mul.f16x2 r9088, r9085, r9035; +} +{ +add.f16x2 r9091, r9047, r9088; +} +{ +sub.f16x2 r9094, r9038, r9039; +} +{ +mul.f16x2 r9097, r9094, r9036; +} +{ +sub.f16x2 r9100, r9091, r9097; +} +{ +add.f16x2 r9103, r9044, r9045; +} +{ +mul.f16x2 r9106, r9103, r9035; +} +{ +add.f16x2 r9109, r9047, r9106; +} +{ +sub.f16x2 r9112, r9038, r9039; +} +{ +mul.f16x2 r9115, r9112, r9036; +} +{ +add.f16x2 r9118, r9109, r9115; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f854; +cvt.rn.f16.f32 high, f854; +mov.b32 r9121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f856; +cvt.rn.f16.f32 high, f856; +mov.b32 r9122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f858; +cvt.rn.f16.f32 high, f858; +mov.b32 r9123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f860; +cvt.rn.f16.f32 high, f860; +mov.b32 r9124, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f866; +cvt.rn.f16.f32 high, f866; +mov.b32 r9127, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f868; +cvt.rn.f16.f32 high, f868; +mov.b32 r9128, {low, high}; +} +{ +mul.f16x2 r9137, r8978, r9121; +} +{ +mul.f16x2 r9140, r9014, r9122; +} +{ +sub.f16x2 r9143, r9137, r9140; +} +{ +mul.f16x2 r9146, r8978, r9122; +} +{ +fma.rn.f16x2 r9149, r9014, r9121, r9146; +} +{ +mul.f16x2 r9153, r9064, r9123; +} +{ +mul.f16x2 r9156, r9100, r9124; +} +{ +sub.f16x2 r9159, r9153, r9156; +} +{ +mul.f16x2 r9162, r9064, r9124; +} +{ +fma.rn.f16x2 r9165, r9100, r9123, r9162; +} +{ +mul.f16x2 r9169, r8996, r9123; +} +{ +mul.f16x2 r9172, r9032, r9124; +} +{ +sub.f16x2 r9175, r9169, r9172; +} +{ +mul.f16x2 r9178, r8996, r9124; +} +{ +fma.rn.f16x2 r9181, r9032, r9123, r9178; +} +{ +mul.f16x2 r9185, 
r9082, r9127; +} +{ +mul.f16x2 r9188, r9118, r9128; +} +{ +sub.f16x2 r9191, r9185, r9188; +} +{ +mul.f16x2 r9194, r9082, r9128; +} +{ +fma.rn.f16x2 r9197, r9118, r9127, r9194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9201, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9202, {low, high}; +} +{ +add.f16x2 r9203, r8954, r9040; +} +{ +add.f16x2 %4, r8868, r9203; +} +{ +add.f16x2 r9209, r8960, r9046; +} +{ +add.f16x2 %5, r8874, r9209; +} +{ +add.f16x2 r9215, r8954, r9040; +} +{ +mul.f16x2 r9218, r9215, r9201; +} +{ +add.f16x2 r9221, r8868, r9218; +} +{ +sub.f16x2 r9224, r8960, r9046; +} +{ +mul.f16x2 r9227, r9224, r9202; +} +{ +add.f16x2 %22, r9221, r9227; +} +{ +add.f16x2 r9233, r8954, r9040; +} +{ +mul.f16x2 r9236, r9233, r9201; +} +{ +add.f16x2 r9239, r8868, r9236; +} +{ +sub.f16x2 r9242, r8960, r9046; +} +{ +mul.f16x2 r9245, r9242, r9202; +} +{ +sub.f16x2 %40, r9239, r9245; +} +{ +add.f16x2 r9251, r8960, r9046; +} +{ +mul.f16x2 r9254, r9251, r9201; +} +{ +add.f16x2 r9257, r8874, r9254; +} +{ +sub.f16x2 r9260, r8954, r9040; +} +{ +mul.f16x2 r9263, r9260, r9202; +} +{ +sub.f16x2 %23, r9257, r9263; +} +{ +add.f16x2 r9269, r8960, r9046; +} +{ +mul.f16x2 r9272, r9269, r9201; +} +{ +add.f16x2 r9275, r8874, r9272; +} +{ +sub.f16x2 r9278, r8954, r9040; +} +{ +mul.f16x2 r9281, r9278, r9202; +} +{ +add.f16x2 %41, r9275, r9281; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9288, {low, high}; +} +{ +add.f16x2 r9289, r9143, r9159; +} +{ +add.f16x2 %10, r8892, r9289; +} +{ +add.f16x2 r9295, r9149, r9165; +} +{ +add.f16x2 %11, r8928, r9295; +} +{ +add.f16x2 r9301, r9143, r9159; +} +{ +mul.f16x2 r9304, r9301, r9287; +} +{ +add.f16x2 r9307, r8892, r9304; +} +{ +sub.f16x2 r9310, r9149, r9165; +} +{ +mul.f16x2 r9313, 
r9310, r9288; +} +{ +add.f16x2 %28, r9307, r9313; +} +{ +add.f16x2 r9319, r9143, r9159; +} +{ +mul.f16x2 r9322, r9319, r9287; +} +{ +add.f16x2 r9325, r8892, r9322; +} +{ +sub.f16x2 r9328, r9149, r9165; +} +{ +mul.f16x2 r9331, r9328, r9288; +} +{ +sub.f16x2 %46, r9325, r9331; +} +{ +add.f16x2 r9337, r9149, r9165; +} +{ +mul.f16x2 r9340, r9337, r9287; +} +{ +add.f16x2 r9343, r8928, r9340; +} +{ +sub.f16x2 r9346, r9143, r9159; +} +{ +mul.f16x2 r9349, r9346, r9288; +} +{ +sub.f16x2 %29, r9343, r9349; +} +{ +add.f16x2 r9355, r9149, r9165; +} +{ +mul.f16x2 r9358, r9355, r9287; +} +{ +add.f16x2 r9361, r8928, r9358; +} +{ +sub.f16x2 r9364, r9143, r9159; +} +{ +mul.f16x2 r9367, r9364, r9288; +} +{ +add.f16x2 %47, r9361, r9367; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f894; +cvt.rn.f16.f32 high, f894; +mov.b32 r9373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f896; +cvt.rn.f16.f32 high, f896; +mov.b32 r9374, {low, high}; +} +{ +add.f16x2 r9375, r9175, r9191; +} +{ +add.f16x2 %16, r8910, r9375; +} +{ +add.f16x2 r9381, r9181, r9197; +} +{ +add.f16x2 %17, r8946, r9381; +} +{ +add.f16x2 r9387, r9175, r9191; +} +{ +mul.f16x2 r9390, r9387, r9373; +} +{ +add.f16x2 r9393, r8910, r9390; +} +{ +sub.f16x2 r9396, r9181, r9197; +} +{ +mul.f16x2 r9399, r9396, r9374; +} +{ +add.f16x2 %34, r9393, r9399; +} +{ +add.f16x2 r9405, r9175, r9191; +} +{ +mul.f16x2 r9408, r9405, r9373; +} +{ +add.f16x2 r9411, r8910, r9408; +} +{ +sub.f16x2 r9414, r9181, r9197; +} +{ +mul.f16x2 r9417, r9414, r9374; +} +{ +sub.f16x2 %52, r9411, r9417; +} +{ +add.f16x2 r9423, r9181, r9197; +} +{ +mul.f16x2 r9426, r9423, r9373; +} +{ +add.f16x2 r9429, r8946, r9426; +} +{ +sub.f16x2 r9432, r9175, r9191; +} +{ +mul.f16x2 r9435, r9432, r9374; +} +{ +sub.f16x2 %35, r9429, r9435; +} +{ +add.f16x2 r9441, r9181, r9197; +} +{ +mul.f16x2 r9444, r9441, r9373; +} +{ +add.f16x2 r9447, r8946, r9444; +} +{ +sub.f16x2 r9450, r9175, r9191; +} +{ +mul.f16x2 r9453, r9450, r9374; +} +{ +add.f16x2 %53, r9447, 
r9453; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), 
"r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1101, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<294>; +.reg .b32 r<3305>; +.reg .b64 rd<8>; +mov.u32 r3282, %tid.y; +mov.u32 r3283, %18; +mad.lo.s32 r3284, r3282, 52488, r3283; +mov.u32 r3285, %tid.x; +mov.f32 f282, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1, {low, high}; 
+} +mov.f32 f284, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, 
%28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f242, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r259, {low, high}; +} +mov.f32 f244, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r260, {low, high}; +} +mov.f32 f246, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r261, {low, high}; +} +mov.f32 f248, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r262, {low, high}; +} +mov.f32 f254, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; 
+cvt.rn.f16.f32 high, f254; +mov.b32 r265, {low, high}; +} +mov.f32 f256, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, 
r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ 
+sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r3285, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3286, rd3; +mul.lo.s32 r3287, r3286, 729; +sub.s32 r3288, r3285, r3287; +cvt.rn.f32.u32 f285, r3288; +mul.f32 f286, f285, 0f3A7B0B40; +cos.approx.f32 f57, f286; +sin.approx.f32 f287, f286; +neg.f32 f58, f287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; 
+} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r3289, r3286, 52488, r3284; +barrier.sync 0; +mad.lo.s32 r3290, r3288, 72, r3289; +st.shared.v2.f32 [r3290], {r344, r350}; +st.shared.v2.f32 [r3290+8], {r607, r616}; +st.shared.v2.f32 [r3290+16], {r644, r653}; +st.shared.v2.f32 [r3290+24], {r681, r690}; +st.shared.v2.f32 [r3290+32], {r718, r727}; +st.shared.v2.f32 [r3290+40], {r755, r764}; +st.shared.v2.f32 [r3290+48], {r792, r801}; +st.shared.v2.f32 [r3290+56], {r829, r838}; +st.shared.v2.f32 [r3290+64], {r866, r875}; +barrier.sync 0; +shl.b32 r3291, r3288, 6; +sub.s32 r3292, r3290, r3291; +ld.shared.u32 r902, [r3292]; +ld.shared.u32 r908, [r3292+4]; +ld.shared.u32 r988, [r3292+5832]; +ld.shared.u32 r994, [r3292+5836]; +ld.shared.u32 r1074, [r3292+11664]; +ld.shared.u32 r1080, [r3292+11668]; +ld.shared.u32 r899, [r3292+17496]; +ld.shared.u32 r905, [r3292+17500]; 
+ld.shared.u32 r985, [r3292+23328]; +ld.shared.u32 r991, [r3292+23332]; +ld.shared.u32 r1071, [r3292+29160]; +ld.shared.u32 r1077, [r3292+29164]; +ld.shared.u32 r900, [r3292+34992]; +ld.shared.u32 r906, [r3292+34996]; +ld.shared.u32 r986, [r3292+40824]; +ld.shared.u32 r992, [r3292+40828]; +ld.shared.u32 r1072, [r3292+46656]; +ld.shared.u32 r1078, [r3292+46660]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} 
+{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 
r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 
r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 
r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r3288, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r3293, rd5; +cvt.rn.f32.u32 f288, r3293; +mul.f32 f289, f288, 0f3C0D3654; +cos.approx.f32 f133, f289; +sin.approx.f32 f290, f289; +neg.f32 f134, f290; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +mul.lo.s32 r3294, r3293, 9; +sub.s32 r3295, r3288, r3294; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 
r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ 
+mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +shl.b32 r3296, r3295, 3; +add.s32 r3297, r3289, r3296; +barrier.sync 0; +mad.lo.s32 r3298, r3293, 648, r3297; +st.shared.u32 [r3298], r1239; +st.shared.u32 [r3298+4], r1245; +st.shared.u32 [r3298+72], r1502; +st.shared.u32 [r3298+76], r1511; +st.shared.u32 [r3298+144], r1539; +st.shared.u32 [r3298+148], r1548; +st.shared.u32 [r3298+216], r1576; +st.shared.u32 [r3298+220], r1585; +st.shared.u32 [r3298+288], r1613; +st.shared.u32 [r3298+292], r1622; +st.shared.u32 [r3298+360], r1650; +st.shared.u32 [r3298+364], r1659; +st.shared.u32 [r3298+432], r1687; +st.shared.u32 [r3298+436], r1696; +st.shared.u32 [r3298+504], r1724; +st.shared.u32 [r3298+508], r1733; +st.shared.u32 [r3298+576], r1761; +st.shared.u32 [r3298+580], r1770; +barrier.sync 0; +ld.shared.u32 r1797, [r3292]; +ld.shared.u32 r1803, [r3292+4]; +ld.shared.u32 r1883, [r3292+5832]; +ld.shared.u32 r1889, [r3292+5836]; +ld.shared.u32 r1969, [r3292+11664]; +ld.shared.u32 r1975, [r3292+11668]; +ld.shared.u32 r1794, [r3292+17496]; +ld.shared.u32 r1800, [r3292+17500]; +ld.shared.u32 r1880, [r3292+23328]; +ld.shared.u32 r1886, [r3292+23332]; +ld.shared.u32 r1966, [r3292+29160]; +ld.shared.u32 r1972, [r3292+29164]; +ld.shared.u32 r1795, [r3292+34992]; +ld.shared.u32 r1801, [r3292+34996]; +ld.shared.u32 r1881, [r3292+40824]; +ld.shared.u32 r1887, [r3292+40828]; +ld.shared.u32 r1967, [r3292+46656]; +ld.shared.u32 r1973, [r3292+46660]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ 
+add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, 
r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2052, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ +fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2130, {low, high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 r2134, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 r2140, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ +sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 r2176, r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 r2185, r1802, r2182; +} +{ +sub.f16x2 
r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 r2220, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 r2226, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 r2262, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ +sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 r2280, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 r2303, r2103, r2119; +} +{ +add.f16x2 r2306, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 r2312, r1874, r2309; +} +{ 
+add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 r2348, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 r2384, r2375, r2381; +} +mul.wide.u32 rd6, r3288, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3299, rd7; +cvt.rn.f32.u32 f291, r3299; +mul.f32 f292, f291, 0f3D9EDD1F; +cos.approx.f32 f209, f292; +sin.approx.f32 f293, f292; +neg.f32 f210, f293; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2387, {low, high}; +} +mul.lo.s32 r3300, r3299, 81; +sub.s32 r3301, r3288, r3300; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2390, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2392, {high, high}; +} +{ +mul.f16x2 r2394, r2226, r2392; +} +{ +fma.rn.f16x2 r2397, r2220, r2390, r2394; +} +{ +mul.f16x2 r2401, r2220, r2392; +} +{ +neg.f16x2 r2404, r2401; +} +{ +fma.rn.f16x2 r2406, r2226, r2390, r2404; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2410, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2412, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2414, {low, high}; +} +{ +mul.f16x2 r2415, r2412, r2414; 
+} +{ +mul.f16x2 r2418, r2387, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2421, {high, low}; +} +{ +fma.rn.f16x2 r2423, r2415, r2421, r2418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2427, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2429, {high, high}; +} +{ +mul.f16x2 r2431, r2312, r2429; +} +{ +fma.rn.f16x2 r2434, r2306, r2427, r2431; +} +{ +mul.f16x2 r2438, r2306, r2429; +} +{ +neg.f16x2 r2441, r2438; +} +{ +fma.rn.f16x2 r2443, r2312, r2427, r2441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2447, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2449, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2451, {low, high}; +} +{ +mul.f16x2 r2452, r2449, r2451; +} +{ +mul.f16x2 r2455, r2423, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2458, {high, low}; +} +{ +fma.rn.f16x2 r2460, r2452, r2458, r2455; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2464, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2466, {high, high}; +} +{ +mul.f16x2 r2468, r2194, r2466; +} +{ +fma.rn.f16x2 r2471, r2158, r2464, r2468; +} +{ +mul.f16x2 r2475, r2158, r2466; +} +{ +neg.f16x2 r2478, r2475; +} +{ +fma.rn.f16x2 r2480, r2194, r2464, r2478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2484, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2486, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2488, {low, high}; +} +{ +mul.f16x2 r2489, r2486, r2488; +} +{ +mul.f16x2 r2492, r2460, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2495, {high, low}; +} +{ +fma.rn.f16x2 r2497, r2489, r2495, r2492; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2501, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2503, {high, high}; +} +{ +mul.f16x2 r2505, r2280, r2503; +} +{ +fma.rn.f16x2 r2508, r2244, r2501, r2505; +} +{ +mul.f16x2 r2512, r2244, r2503; +} +{ +neg.f16x2 r2515, r2512; +} +{ +fma.rn.f16x2 r2517, r2280, r2501, r2515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2521, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2523, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2525, {low, high}; +} +{ +mul.f16x2 r2526, r2523, r2525; +} +{ +mul.f16x2 r2529, r2497, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2532, {high, low}; +} +{ +fma.rn.f16x2 r2534, r2526, r2532, r2529; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2538, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2540, {high, high}; +} +{ +mul.f16x2 r2542, r2366, r2540; +} +{ +fma.rn.f16x2 r2545, r2330, r2538, r2542; +} +{ +mul.f16x2 r2549, r2330, r2540; +} +{ +neg.f16x2 r2552, r2549; +} +{ +fma.rn.f16x2 r2554, r2366, r2538, r2552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2558, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2560, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2562, {low, high}; +} +{ +mul.f16x2 r2563, r2560, r2562; +} +{ +mul.f16x2 r2566, r2534, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2569, {high, low}; +} +{ +fma.rn.f16x2 r2571, r2563, r2569, r2566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2575, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2577, {high, high}; +} +{ +mul.f16x2 r2579, r2212, r2577; +} +{ +fma.rn.f16x2 r2582, r2176, r2575, r2579; +} +{ +mul.f16x2 r2586, r2176, r2577; +} +{ +neg.f16x2 r2589, r2586; +} +{ +fma.rn.f16x2 
r2591, r2212, r2575, r2589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2595, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2597, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2599, {low, high}; +} +{ +mul.f16x2 r2600, r2597, r2599; +} +{ +mul.f16x2 r2603, r2571, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2606, {high, low}; +} +{ +fma.rn.f16x2 r2608, r2600, r2606, r2603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2614, {high, high}; +} +{ +mul.f16x2 r2616, r2298, r2614; +} +{ +fma.rn.f16x2 r2619, r2262, r2612, r2616; +} +{ +mul.f16x2 r2623, r2262, r2614; +} +{ +neg.f16x2 r2626, r2623; +} +{ +fma.rn.f16x2 r2628, r2298, r2612, r2626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2634, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2636, {low, high}; +} +{ +mul.f16x2 r2637, r2634, r2636; +} +{ +mul.f16x2 r2640, r2608, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2643, {high, low}; +} +{ +fma.rn.f16x2 r2645, r2637, r2643, r2640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2651, {high, high}; +} +{ +mul.f16x2 r2653, r2384, r2651; +} +{ +fma.rn.f16x2 r2656, r2348, r2649, r2653; +} +{ +mul.f16x2 r2660, r2348, r2651; +} +{ +neg.f16x2 r2663, r2660; +} +{ +fma.rn.f16x2 r2665, r2384, r2649, r2663; +} +shl.b32 r3302, r3301, 3; +add.s32 r3303, r3289, r3302; +barrier.sync 0; +mad.lo.s32 r3304, r3299, 5832, r3303; +st.shared.u32 [r3304], r2134; +st.shared.u32 [r3304+4], r2140; +st.shared.u32 [r3304+648], r2397; +st.shared.u32 
[r3304+652], r2406; +st.shared.u32 [r3304+1296], r2434; +st.shared.u32 [r3304+1300], r2443; +st.shared.u32 [r3304+1944], r2471; +st.shared.u32 [r3304+1948], r2480; +st.shared.u32 [r3304+2592], r2508; +st.shared.u32 [r3304+2596], r2517; +st.shared.u32 [r3304+3240], r2545; +st.shared.u32 [r3304+3244], r2554; +st.shared.u32 [r3304+3888], r2582; +st.shared.u32 [r3304+3892], r2591; +st.shared.u32 [r3304+4536], r2619; +st.shared.u32 [r3304+4540], r2628; +st.shared.u32 [r3304+5184], r2656; +st.shared.u32 [r3304+5188], r2665; +barrier.sync 0; +ld.shared.u32 r2692, [r3292]; +ld.shared.u32 r2698, [r3292+4]; +ld.shared.u32 r2778, [r3292+5832]; +ld.shared.u32 r2784, [r3292+5836]; +ld.shared.u32 r2864, [r3292+11664]; +ld.shared.u32 r2870, [r3292+11668]; +ld.shared.u32 r2689, [r3292+17496]; +ld.shared.u32 r2695, [r3292+17500]; +ld.shared.u32 r2775, [r3292+23328]; +ld.shared.u32 r2781, [r3292+23332]; +ld.shared.u32 r2861, [r3292+29160]; +ld.shared.u32 r2867, [r3292+29164]; +ld.shared.u32 r2690, [r3292+34992]; +ld.shared.u32 r2696, [r3292+34996]; +ld.shared.u32 r2776, [r3292+40824]; +ld.shared.u32 r2782, [r3292+40828]; +ld.shared.u32 r2862, [r3292+46656]; +ld.shared.u32 r2868, [r3292+46660]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2686, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2687, {low, high}; +} +{ +add.f16x2 r2688, r2689, r2690; +} +{ +add.f16x2 r2691, r2692, r2688; +} +{ +add.f16x2 r2694, r2695, r2696; +} +{ +add.f16x2 r2697, r2698, r2694; +} +{ +add.f16x2 r2700, r2689, r2690; +} +{ +mul.f16x2 r2703, r2700, r2686; +} +{ +add.f16x2 r2706, r2692, r2703; +} +{ +sub.f16x2 r2709, r2695, r2696; +} +{ +mul.f16x2 r2712, r2709, r2687; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2689, r2690; +} +{ +mul.f16x2 r2721, r2718, r2686; +} +{ +add.f16x2 r2724, r2692, r2721; +} +{ +sub.f16x2 r2727, r2695, r2696; +} +{ +mul.f16x2 r2730, r2727, r2687; +} +{ 
+sub.f16x2 r2733, r2724, r2730; +} +{ +add.f16x2 r2736, r2695, r2696; +} +{ +mul.f16x2 r2739, r2736, r2686; +} +{ +add.f16x2 r2742, r2698, r2739; +} +{ +sub.f16x2 r2745, r2689, r2690; +} +{ +mul.f16x2 r2748, r2745, r2687; +} +{ +sub.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2695, r2696; +} +{ +mul.f16x2 r2757, r2754, r2686; +} +{ +add.f16x2 r2760, r2698, r2757; +} +{ +sub.f16x2 r2763, r2689, r2690; +} +{ +mul.f16x2 r2766, r2763, r2687; +} +{ +add.f16x2 r2769, r2760, r2766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2773, {low, high}; +} +{ +add.f16x2 r2774, r2775, r2776; +} +{ +add.f16x2 r2777, r2778, r2774; +} +{ +add.f16x2 r2780, r2781, r2782; +} +{ +add.f16x2 r2783, r2784, r2780; +} +{ +add.f16x2 r2786, r2775, r2776; +} +{ +mul.f16x2 r2789, r2786, r2772; +} +{ +add.f16x2 r2792, r2778, r2789; +} +{ +sub.f16x2 r2795, r2781, r2782; +} +{ +mul.f16x2 r2798, r2795, r2773; +} +{ +add.f16x2 r2801, r2792, r2798; +} +{ +add.f16x2 r2804, r2775, r2776; +} +{ +mul.f16x2 r2807, r2804, r2772; +} +{ +add.f16x2 r2810, r2778, r2807; +} +{ +sub.f16x2 r2813, r2781, r2782; +} +{ +mul.f16x2 r2816, r2813, r2773; +} +{ +sub.f16x2 r2819, r2810, r2816; +} +{ +add.f16x2 r2822, r2781, r2782; +} +{ +mul.f16x2 r2825, r2822, r2772; +} +{ +add.f16x2 r2828, r2784, r2825; +} +{ +sub.f16x2 r2831, r2775, r2776; +} +{ +mul.f16x2 r2834, r2831, r2773; +} +{ +sub.f16x2 r2837, r2828, r2834; +} +{ +add.f16x2 r2840, r2781, r2782; +} +{ +mul.f16x2 r2843, r2840, r2772; +} +{ +add.f16x2 r2846, r2784, r2843; +} +{ +sub.f16x2 r2849, r2775, r2776; +} +{ +mul.f16x2 r2852, r2849, r2773; +} +{ +add.f16x2 r2855, r2846, r2852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2859, {low, high}; 
+} +{ +add.f16x2 r2860, r2861, r2862; +} +{ +add.f16x2 r2863, r2864, r2860; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +add.f16x2 r2869, r2870, r2866; +} +{ +add.f16x2 r2872, r2861, r2862; +} +{ +mul.f16x2 r2875, r2872, r2858; +} +{ +add.f16x2 r2878, r2864, r2875; +} +{ +sub.f16x2 r2881, r2867, r2868; +} +{ +mul.f16x2 r2884, r2881, r2859; +} +{ +add.f16x2 r2887, r2878, r2884; +} +{ +add.f16x2 r2890, r2861, r2862; +} +{ +mul.f16x2 r2893, r2890, r2858; +} +{ +add.f16x2 r2896, r2864, r2893; +} +{ +sub.f16x2 r2899, r2867, r2868; +} +{ +mul.f16x2 r2902, r2899, r2859; +} +{ +sub.f16x2 r2905, r2896, r2902; +} +{ +add.f16x2 r2908, r2867, r2868; +} +{ +mul.f16x2 r2911, r2908, r2858; +} +{ +add.f16x2 r2914, r2870, r2911; +} +{ +sub.f16x2 r2917, r2861, r2862; +} +{ +mul.f16x2 r2920, r2917, r2859; +} +{ +sub.f16x2 r2923, r2914, r2920; +} +{ +add.f16x2 r2926, r2867, r2868; +} +{ +mul.f16x2 r2929, r2926, r2858; +} +{ +add.f16x2 r2932, r2870, r2929; +} +{ +sub.f16x2 r2935, r2861, r2862; +} +{ +mul.f16x2 r2938, r2935, r2859; +} +{ +add.f16x2 r2941, r2932, r2938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2950, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2951, {low, high}; +} +{ +mul.f16x2 r2960, r2801, r2944; +} +{ +mul.f16x2 r2963, r2837, r2945; +} +{ +sub.f16x2 r2966, r2960, r2963; +} +{ +mul.f16x2 r2969, r2801, r2945; +} +{ +fma.rn.f16x2 r2972, r2837, r2944, r2969; +} +{ +mul.f16x2 r2976, r2887, r2946; +} +{ +mul.f16x2 r2979, 
r2923, r2947; +} +{ +sub.f16x2 r2982, r2976, r2979; +} +{ +mul.f16x2 r2985, r2887, r2947; +} +{ +fma.rn.f16x2 r2988, r2923, r2946, r2985; +} +{ +mul.f16x2 r2992, r2819, r2946; +} +{ +mul.f16x2 r2995, r2855, r2947; +} +{ +sub.f16x2 r2998, r2992, r2995; +} +{ +mul.f16x2 r3001, r2819, r2947; +} +{ +fma.rn.f16x2 r3004, r2855, r2946, r3001; +} +{ +mul.f16x2 r3008, r2905, r2950; +} +{ +mul.f16x2 r3011, r2941, r2951; +} +{ +sub.f16x2 r3014, r3008, r3011; +} +{ +mul.f16x2 r3017, r2905, r2951; +} +{ +fma.rn.f16x2 r3020, r2941, r2950, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3024, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3025, {low, high}; +} +{ +add.f16x2 r3026, r2777, r2863; +} +{ +add.f16x2 %0, r2691, r3026; +} +{ +add.f16x2 r3032, r2783, r2869; +} +{ +add.f16x2 %1, r2697, r3032; +} +{ +add.f16x2 r3038, r2777, r2863; +} +{ +mul.f16x2 r3041, r3038, r3024; +} +{ +add.f16x2 r3044, r2691, r3041; +} +{ +sub.f16x2 r3047, r2783, r2869; +} +{ +mul.f16x2 r3050, r3047, r3025; +} +{ +add.f16x2 %6, r3044, r3050; +} +{ +add.f16x2 r3056, r2777, r2863; +} +{ +mul.f16x2 r3059, r3056, r3024; +} +{ +add.f16x2 r3062, r2691, r3059; +} +{ +sub.f16x2 r3065, r2783, r2869; +} +{ +mul.f16x2 r3068, r3065, r3025; +} +{ +sub.f16x2 %12, r3062, r3068; +} +{ +add.f16x2 r3074, r2783, r2869; +} +{ +mul.f16x2 r3077, r3074, r3024; +} +{ +add.f16x2 r3080, r2697, r3077; +} +{ +sub.f16x2 r3083, r2777, r2863; +} +{ +mul.f16x2 r3086, r3083, r3025; +} +{ +sub.f16x2 %7, r3080, r3086; +} +{ +add.f16x2 r3092, r2783, r2869; +} +{ +mul.f16x2 r3095, r3092, r3024; +} +{ +add.f16x2 r3098, r2697, r3095; +} +{ +sub.f16x2 r3101, r2777, r2863; +} +{ +mul.f16x2 r3104, r3101, r3025; +} +{ +add.f16x2 %13, r3098, r3104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, 
f284; +mov.b32 r3111, {low, high}; +} +{ +add.f16x2 r3112, r2966, r2982; +} +{ +add.f16x2 %2, r2715, r3112; +} +{ +add.f16x2 r3118, r2972, r2988; +} +{ +add.f16x2 %3, r2751, r3118; +} +{ +add.f16x2 r3124, r2966, r2982; +} +{ +mul.f16x2 r3127, r3124, r3110; +} +{ +add.f16x2 r3130, r2715, r3127; +} +{ +sub.f16x2 r3133, r2972, r2988; +} +{ +mul.f16x2 r3136, r3133, r3111; +} +{ +add.f16x2 %8, r3130, r3136; +} +{ +add.f16x2 r3142, r2966, r2982; +} +{ +mul.f16x2 r3145, r3142, r3110; +} +{ +add.f16x2 r3148, r2715, r3145; +} +{ +sub.f16x2 r3151, r2972, r2988; +} +{ +mul.f16x2 r3154, r3151, r3111; +} +{ +sub.f16x2 %14, r3148, r3154; +} +{ +add.f16x2 r3160, r2972, r2988; +} +{ +mul.f16x2 r3163, r3160, r3110; +} +{ +add.f16x2 r3166, r2751, r3163; +} +{ +sub.f16x2 r3169, r2966, r2982; +} +{ +mul.f16x2 r3172, r3169, r3111; +} +{ +sub.f16x2 %9, r3166, r3172; +} +{ +add.f16x2 r3178, r2972, r2988; +} +{ +mul.f16x2 r3181, r3178, r3110; +} +{ +add.f16x2 r3184, r2751, r3181; +} +{ +sub.f16x2 r3187, r2966, r2982; +} +{ +mul.f16x2 r3190, r3187, r3111; +} +{ +add.f16x2 %15, r3184, r3190; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3196, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3197, {low, high}; +} +{ +add.f16x2 r3198, r2998, r3014; +} +{ +add.f16x2 %4, r2733, r3198; +} +{ +add.f16x2 r3204, r3004, r3020; +} +{ +add.f16x2 %5, r2769, r3204; +} +{ +add.f16x2 r3210, r2998, r3014; +} +{ +mul.f16x2 r3213, r3210, r3196; +} +{ +add.f16x2 r3216, r2733, r3213; +} +{ +sub.f16x2 r3219, r3004, r3020; +} +{ +mul.f16x2 r3222, r3219, r3197; +} +{ +add.f16x2 %10, r3216, r3222; +} +{ +add.f16x2 r3228, r2998, r3014; +} +{ +mul.f16x2 r3231, r3228, r3196; +} +{ +add.f16x2 r3234, r2733, r3231; +} +{ +sub.f16x2 r3237, r3004, r3020; +} +{ +mul.f16x2 r3240, r3237, r3197; +} +{ +sub.f16x2 %16, r3234, r3240; +} +{ +add.f16x2 r3246, r3004, r3020; +} +{ +mul.f16x2 r3249, r3246, r3196; +} +{ +add.f16x2 
r3252, r2769, r3249; +} +{ +sub.f16x2 r3255, r2998, r3014; +} +{ +mul.f16x2 r3258, r3255, r3197; +} +{ +sub.f16x2 %11, r3252, r3258; +} +{ +add.f16x2 r3264, r3004, r3020; +} +{ +mul.f16x2 r3267, r3264, r3196; +} +{ +add.f16x2 r3270, r2769, r3267; +} +{ +sub.f16x2 r3273, r2998, r3014; +} +{ +mul.f16x2 r3276, r3273, r3197; +} +{ +add.f16x2 %17, r3270, r3276; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1100, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<294>; +.reg .b32 r<3305>; +.reg .b64 rd<8>; +mov.u32 r3282, %tid.y; +mov.u32 r3283, %18; +mad.lo.s32 r3284, r3282, 26244, r3283; +mov.u32 r3285, %tid.x; +mov.f32 f282, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1, {low, high}; +} 
+mov.f32 f284, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, 
%28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f242, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r259, {low, high}; +} +mov.f32 f244, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r260, {low, high}; +} +mov.f32 f246, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r261, {low, high}; +} +mov.f32 f248, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r262, {low, high}; +} +mov.f32 f254, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; 
+cvt.rn.f16.f32 high, f254; +mov.b32 r265, {low, high}; +} +mov.f32 f256, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, 
r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ 
+sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r3285, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r3286, rd3; +mul.lo.s32 r3287, r3286, 729; +sub.s32 r3288, r3285, r3287; +mad.lo.s32 r3289, r3286, 26244, r3284; +cvt.rn.f32.u32 f285, r3288; +mul.f32 f286, f285, 0f3A7B0B40; +cos.approx.f32 f57, f286; +sin.approx.f32 f287, f286; +neg.f32 f58, f287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f225, 0fBF800000; +mov.f32 f226, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, 
high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; 
+mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, 
r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; +mad.lo.s32 r3290, r3288, 36, r3289; +st.shared.u32 [r3290], r344; +st.shared.u32 [r3290+4], r607; +st.shared.u32 [r3290+8], r644; +st.shared.u32 [r3290+12], r681; +st.shared.u32 [r3290+16], r718; +st.shared.u32 [r3290+20], r755; +st.shared.u32 [r3290+24], r792; +st.shared.u32 [r3290+28], r829; +st.shared.u32 [r3290+32], r866; +barrier.sync 0; +shl.b32 r3291, r3288, 5; +sub.s32 r3292, r3290, r3291; +ld.shared.u32 r902, [r3292]; +ld.shared.u32 r988, [r3292+2916]; +ld.shared.u32 r1074, [r3292+5832]; +ld.shared.u32 r899, [r3292+8748]; +ld.shared.u32 r985, [r3292+11664]; +ld.shared.u32 r1071, [r3292+14580]; +ld.shared.u32 r900, [r3292+17496]; +ld.shared.u32 r986, [r3292+20412]; +ld.shared.u32 r1072, [r3292+23328]; +barrier.sync 0; +st.shared.u32 [r3290], r350; +st.shared.u32 
[r3290+4], r616; +st.shared.u32 [r3290+8], r653; +st.shared.u32 [r3290+12], r690; +st.shared.u32 [r3290+16], r727; +st.shared.u32 [r3290+20], r764; +st.shared.u32 [r3290+24], r801; +st.shared.u32 [r3290+28], r838; +st.shared.u32 [r3290+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r3292]; +ld.shared.u32 r994, [r3292+2916]; +ld.shared.u32 r1080, [r3292+5832]; +ld.shared.u32 r905, [r3292+8748]; +ld.shared.u32 r991, [r3292+11664]; +ld.shared.u32 r1077, [r3292+14580]; +ld.shared.u32 r906, [r3292+17496]; +ld.shared.u32 r992, [r3292+20412]; +ld.shared.u32 r1078, [r3292+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r983, {low, 
high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, 
r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 
r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, 
r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r3288, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r3293, rd5; +mul.lo.s32 r3294, r3293, 9; +sub.s32 r3295, r3288, r3294; +shl.b32 r3296, r3295, 2; +add.s32 r3297, r3289, r3296; +cvt.rn.f32.u32 f288, r3293; +mul.f32 f289, f288, 0f3C0D3654; +cos.approx.f32 f133, f289; +sin.approx.f32 f290, f289; +neg.f32 f134, f290; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; 
+} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; 
+mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, 
r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +barrier.sync 0; +mad.lo.s32 r3298, r3293, 324, r3297; +st.shared.u32 [r3298], r1239; +st.shared.u32 [r3298+36], r1502; +st.shared.u32 [r3298+72], r1539; +st.shared.u32 [r3298+108], r1576; +st.shared.u32 [r3298+144], r1613; +st.shared.u32 [r3298+180], r1650; +st.shared.u32 [r3298+216], r1687; +st.shared.u32 [r3298+252], r1724; +st.shared.u32 [r3298+288], r1761; +barrier.sync 0; +ld.shared.u32 r1797, [r3292]; +ld.shared.u32 r1883, [r3292+2916]; +ld.shared.u32 r1969, [r3292+5832]; +ld.shared.u32 r1794, [r3292+8748]; +ld.shared.u32 r1880, [r3292+11664]; +ld.shared.u32 r1966, [r3292+14580]; +ld.shared.u32 r1795, [r3292+17496]; +ld.shared.u32 r1881, [r3292+20412]; +ld.shared.u32 r1967, [r3292+23328]; +barrier.sync 0; +st.shared.u32 [r3298], r1245; +st.shared.u32 [r3298+36], r1511; +st.shared.u32 [r3298+72], r1548; +st.shared.u32 [r3298+108], r1585; +st.shared.u32 [r3298+144], r1622; +st.shared.u32 [r3298+180], r1659; +st.shared.u32 [r3298+216], r1696; +st.shared.u32 [r3298+252], r1733; +st.shared.u32 [r3298+288], r1770; +barrier.sync 0; +ld.shared.u32 r1803, [r3292]; +ld.shared.u32 r1889, [r3292+2916]; +ld.shared.u32 r1975, [r3292+5832]; +ld.shared.u32 r1800, [r3292+8748]; +ld.shared.u32 r1886, [r3292+11664]; +ld.shared.u32 r1972, [r3292+14580]; +ld.shared.u32 r1801, [r3292+17496]; +ld.shared.u32 r1887, [r3292+20412]; +ld.shared.u32 r1973, [r3292+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 
high, f282; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, 
r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 
r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ +fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2130, {low, high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 r2134, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 r2140, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 r2158, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ 
+sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 r2176, r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 r2185, r1802, r2182; +} +{ +sub.f16x2 r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 r2194, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 r2220, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 r2226, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 r2244, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 r2262, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ +sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 r2280, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 r2303, r2103, r2119; +} +{ +add.f16x2 r2306, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 r2312, r1874, r2309; +} +{ +add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 r2330, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 r2348, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 r2366, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 r2384, r2375, r2381; +} +mul.wide.u32 rd6, r3288, -901412889; +shr.u64 rd7, rd6, 38; +cvt.u32.u64 r3299, rd7; +mul.lo.s32 r3300, r3299, 81; +sub.s32 r3301, r3288, r3300; +shl.b32 r3302, r3301, 2; +add.s32 r3303, r3289, r3302; +cvt.rn.f32.u32 f291, r3299; +mul.f32 f292, f291, 0f3D9EDD1F; +cos.approx.f32 f209, f292; +sin.approx.f32 f293, f292; +neg.f32 f210, f293; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f209; +cvt.rn.f16.f32 high, f210; +mov.b32 r2387, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2390, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2392, {high, high}; +} +{ +mul.f16x2 r2394, r2226, r2392; +} +{ +fma.rn.f16x2 r2397, r2220, r2390, r2394; +} +{ +mul.f16x2 r2401, r2220, r2392; +} +{ +neg.f16x2 r2404, r2401; +} +{ +fma.rn.f16x2 r2406, r2226, r2390, r2404; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2387; +mov.b32 r2410, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2412, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2414, {low, high}; +} +{ +mul.f16x2 r2415, r2412, r2414; +} +{ +mul.f16x2 r2418, r2387, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2421, {high, low}; +} +{ +fma.rn.f16x2 r2423, r2415, r2421, r2418; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2427, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2429, {high, high}; +} +{ +mul.f16x2 r2431, r2312, r2429; +} +{ +fma.rn.f16x2 r2434, r2306, r2427, r2431; +} +{ +mul.f16x2 r2438, r2306, r2429; +} +{ +neg.f16x2 r2441, r2438; +} +{ +fma.rn.f16x2 r2443, r2312, r2427, r2441; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2447, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2449, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2451, {low, high}; +} +{ +mul.f16x2 r2452, r2449, r2451; +} +{ +mul.f16x2 r2455, r2423, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2423; +mov.b32 r2458, {high, low}; +} +{ +fma.rn.f16x2 r2460, r2452, r2458, r2455; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2464, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2466, {high, high}; +} +{ +mul.f16x2 r2468, r2194, r2466; +} +{ +fma.rn.f16x2 r2471, r2158, r2464, r2468; +} +{ +mul.f16x2 r2475, r2158, r2466; +} +{ +neg.f16x2 r2478, r2475; +} +{ +fma.rn.f16x2 r2480, r2194, r2464, r2478; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2484, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2486, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2488, {low, high}; 
+} +{ +mul.f16x2 r2489, r2486, r2488; +} +{ +mul.f16x2 r2492, r2460, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2460; +mov.b32 r2495, {high, low}; +} +{ +fma.rn.f16x2 r2497, r2489, r2495, r2492; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2501, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2503, {high, high}; +} +{ +mul.f16x2 r2505, r2280, r2503; +} +{ +fma.rn.f16x2 r2508, r2244, r2501, r2505; +} +{ +mul.f16x2 r2512, r2244, r2503; +} +{ +neg.f16x2 r2515, r2512; +} +{ +fma.rn.f16x2 r2517, r2280, r2501, r2515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2521, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2523, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2525, {low, high}; +} +{ +mul.f16x2 r2526, r2523, r2525; +} +{ +mul.f16x2 r2529, r2497, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2497; +mov.b32 r2532, {high, low}; +} +{ +fma.rn.f16x2 r2534, r2526, r2532, r2529; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2538, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2540, {high, high}; +} +{ +mul.f16x2 r2542, r2366, r2540; +} +{ +fma.rn.f16x2 r2545, r2330, r2538, r2542; +} +{ +mul.f16x2 r2549, r2330, r2540; +} +{ +neg.f16x2 r2552, r2549; +} +{ +fma.rn.f16x2 r2554, r2366, r2538, r2552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2558, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2560, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2562, {low, high}; +} +{ +mul.f16x2 r2563, r2560, r2562; +} +{ +mul.f16x2 r2566, r2534, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2534; +mov.b32 r2569, {high, low}; +} +{ +fma.rn.f16x2 r2571, r2563, r2569, r2566; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2571; +mov.b32 r2575, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2577, {high, high}; +} +{ +mul.f16x2 r2579, r2212, r2577; +} +{ +fma.rn.f16x2 r2582, r2176, r2575, r2579; +} +{ +mul.f16x2 r2586, r2176, r2577; +} +{ +neg.f16x2 r2589, r2586; +} +{ +fma.rn.f16x2 r2591, r2212, r2575, r2589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2595, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2597, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2599, {low, high}; +} +{ +mul.f16x2 r2600, r2597, r2599; +} +{ +mul.f16x2 r2603, r2571, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2571; +mov.b32 r2606, {high, low}; +} +{ +fma.rn.f16x2 r2608, r2600, r2606, r2603; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2614, {high, high}; +} +{ +mul.f16x2 r2616, r2298, r2614; +} +{ +fma.rn.f16x2 r2619, r2262, r2612, r2616; +} +{ +mul.f16x2 r2623, r2262, r2614; +} +{ +neg.f16x2 r2626, r2623; +} +{ +fma.rn.f16x2 r2628, r2298, r2612, r2626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2387; +mov.b32 r2634, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f225; +cvt.rn.f16.f32 high, f226; +mov.b32 r2636, {low, high}; +} +{ +mul.f16x2 r2637, r2634, r2636; +} +{ +mul.f16x2 r2640, r2608, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2608; +mov.b32 r2643, {high, low}; +} +{ +fma.rn.f16x2 r2645, r2637, r2643, r2640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2645; +mov.b32 r2651, {high, high}; +} +{ +mul.f16x2 r2653, r2384, r2651; +} +{ +fma.rn.f16x2 r2656, r2348, r2649, r2653; +} +{ +mul.f16x2 r2660, r2348, r2651; +} +{ 
+neg.f16x2 r2663, r2660; +} +{ +fma.rn.f16x2 r2665, r2384, r2649, r2663; +} +barrier.sync 0; +mad.lo.s32 r3304, r3299, 2916, r3303; +st.shared.u32 [r3304], r2134; +st.shared.u32 [r3304+324], r2397; +st.shared.u32 [r3304+648], r2434; +st.shared.u32 [r3304+972], r2471; +st.shared.u32 [r3304+1296], r2508; +st.shared.u32 [r3304+1620], r2545; +st.shared.u32 [r3304+1944], r2582; +st.shared.u32 [r3304+2268], r2619; +st.shared.u32 [r3304+2592], r2656; +barrier.sync 0; +ld.shared.u32 r2692, [r3292]; +ld.shared.u32 r2778, [r3292+2916]; +ld.shared.u32 r2864, [r3292+5832]; +ld.shared.u32 r2689, [r3292+8748]; +ld.shared.u32 r2775, [r3292+11664]; +ld.shared.u32 r2861, [r3292+14580]; +ld.shared.u32 r2690, [r3292+17496]; +ld.shared.u32 r2776, [r3292+20412]; +ld.shared.u32 r2862, [r3292+23328]; +barrier.sync 0; +st.shared.u32 [r3304], r2140; +st.shared.u32 [r3304+324], r2406; +st.shared.u32 [r3304+648], r2443; +st.shared.u32 [r3304+972], r2480; +st.shared.u32 [r3304+1296], r2517; +st.shared.u32 [r3304+1620], r2554; +st.shared.u32 [r3304+1944], r2591; +st.shared.u32 [r3304+2268], r2628; +st.shared.u32 [r3304+2592], r2665; +barrier.sync 0; +ld.shared.u32 r2698, [r3292]; +ld.shared.u32 r2784, [r3292+2916]; +ld.shared.u32 r2870, [r3292+5832]; +ld.shared.u32 r2695, [r3292+8748]; +ld.shared.u32 r2781, [r3292+11664]; +ld.shared.u32 r2867, [r3292+14580]; +ld.shared.u32 r2696, [r3292+17496]; +ld.shared.u32 r2782, [r3292+20412]; +ld.shared.u32 r2868, [r3292+23328]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2686, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2687, {low, high}; +} +{ +add.f16x2 r2688, r2689, r2690; +} +{ +add.f16x2 r2691, r2692, r2688; +} +{ +add.f16x2 r2694, r2695, r2696; +} +{ +add.f16x2 r2697, r2698, r2694; +} +{ +add.f16x2 r2700, r2689, r2690; +} +{ +mul.f16x2 r2703, r2700, r2686; +} +{ +add.f16x2 r2706, r2692, r2703; +} +{ +sub.f16x2 r2709, r2695, r2696; +} +{ 
+mul.f16x2 r2712, r2709, r2687; +} +{ +add.f16x2 r2715, r2706, r2712; +} +{ +add.f16x2 r2718, r2689, r2690; +} +{ +mul.f16x2 r2721, r2718, r2686; +} +{ +add.f16x2 r2724, r2692, r2721; +} +{ +sub.f16x2 r2727, r2695, r2696; +} +{ +mul.f16x2 r2730, r2727, r2687; +} +{ +sub.f16x2 r2733, r2724, r2730; +} +{ +add.f16x2 r2736, r2695, r2696; +} +{ +mul.f16x2 r2739, r2736, r2686; +} +{ +add.f16x2 r2742, r2698, r2739; +} +{ +sub.f16x2 r2745, r2689, r2690; +} +{ +mul.f16x2 r2748, r2745, r2687; +} +{ +sub.f16x2 r2751, r2742, r2748; +} +{ +add.f16x2 r2754, r2695, r2696; +} +{ +mul.f16x2 r2757, r2754, r2686; +} +{ +add.f16x2 r2760, r2698, r2757; +} +{ +sub.f16x2 r2763, r2689, r2690; +} +{ +mul.f16x2 r2766, r2763, r2687; +} +{ +add.f16x2 r2769, r2760, r2766; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2772, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2773, {low, high}; +} +{ +add.f16x2 r2774, r2775, r2776; +} +{ +add.f16x2 r2777, r2778, r2774; +} +{ +add.f16x2 r2780, r2781, r2782; +} +{ +add.f16x2 r2783, r2784, r2780; +} +{ +add.f16x2 r2786, r2775, r2776; +} +{ +mul.f16x2 r2789, r2786, r2772; +} +{ +add.f16x2 r2792, r2778, r2789; +} +{ +sub.f16x2 r2795, r2781, r2782; +} +{ +mul.f16x2 r2798, r2795, r2773; +} +{ +add.f16x2 r2801, r2792, r2798; +} +{ +add.f16x2 r2804, r2775, r2776; +} +{ +mul.f16x2 r2807, r2804, r2772; +} +{ +add.f16x2 r2810, r2778, r2807; +} +{ +sub.f16x2 r2813, r2781, r2782; +} +{ +mul.f16x2 r2816, r2813, r2773; +} +{ +sub.f16x2 r2819, r2810, r2816; +} +{ +add.f16x2 r2822, r2781, r2782; +} +{ +mul.f16x2 r2825, r2822, r2772; +} +{ +add.f16x2 r2828, r2784, r2825; +} +{ +sub.f16x2 r2831, r2775, r2776; +} +{ +mul.f16x2 r2834, r2831, r2773; +} +{ +sub.f16x2 r2837, r2828, r2834; +} +{ +add.f16x2 r2840, r2781, r2782; +} +{ +mul.f16x2 r2843, r2840, r2772; +} +{ +add.f16x2 r2846, r2784, r2843; +} +{ +sub.f16x2 r2849, r2775, r2776; +} +{ +mul.f16x2 r2852, r2849, 
r2773; +} +{ +add.f16x2 r2855, r2846, r2852; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r2858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r2859, {low, high}; +} +{ +add.f16x2 r2860, r2861, r2862; +} +{ +add.f16x2 r2863, r2864, r2860; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +add.f16x2 r2869, r2870, r2866; +} +{ +add.f16x2 r2872, r2861, r2862; +} +{ +mul.f16x2 r2875, r2872, r2858; +} +{ +add.f16x2 r2878, r2864, r2875; +} +{ +sub.f16x2 r2881, r2867, r2868; +} +{ +mul.f16x2 r2884, r2881, r2859; +} +{ +add.f16x2 r2887, r2878, r2884; +} +{ +add.f16x2 r2890, r2861, r2862; +} +{ +mul.f16x2 r2893, r2890, r2858; +} +{ +add.f16x2 r2896, r2864, r2893; +} +{ +sub.f16x2 r2899, r2867, r2868; +} +{ +mul.f16x2 r2902, r2899, r2859; +} +{ +sub.f16x2 r2905, r2896, r2902; +} +{ +add.f16x2 r2908, r2867, r2868; +} +{ +mul.f16x2 r2911, r2908, r2858; +} +{ +add.f16x2 r2914, r2870, r2911; +} +{ +sub.f16x2 r2917, r2861, r2862; +} +{ +mul.f16x2 r2920, r2917, r2859; +} +{ +sub.f16x2 r2923, r2914, r2920; +} +{ +add.f16x2 r2926, r2867, r2868; +} +{ +mul.f16x2 r2929, r2926, r2858; +} +{ +add.f16x2 r2932, r2870, r2929; +} +{ +sub.f16x2 r2935, r2861, r2862; +} +{ +mul.f16x2 r2938, r2935, r2859; +} +{ +add.f16x2 r2941, r2932, r2938; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f242; +cvt.rn.f16.f32 high, f242; +mov.b32 r2944, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f244; +cvt.rn.f16.f32 high, f244; +mov.b32 r2945, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f246; +cvt.rn.f16.f32 high, f246; +mov.b32 r2946, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f248; +cvt.rn.f16.f32 high, f248; +mov.b32 r2947, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f254; +cvt.rn.f16.f32 high, f254; +mov.b32 r2950, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f256; +cvt.rn.f16.f32 high, f256; +mov.b32 r2951, {low, 
high}; +} +{ +mul.f16x2 r2960, r2801, r2944; +} +{ +mul.f16x2 r2963, r2837, r2945; +} +{ +sub.f16x2 r2966, r2960, r2963; +} +{ +mul.f16x2 r2969, r2801, r2945; +} +{ +fma.rn.f16x2 r2972, r2837, r2944, r2969; +} +{ +mul.f16x2 r2976, r2887, r2946; +} +{ +mul.f16x2 r2979, r2923, r2947; +} +{ +sub.f16x2 r2982, r2976, r2979; +} +{ +mul.f16x2 r2985, r2887, r2947; +} +{ +fma.rn.f16x2 r2988, r2923, r2946, r2985; +} +{ +mul.f16x2 r2992, r2819, r2946; +} +{ +mul.f16x2 r2995, r2855, r2947; +} +{ +sub.f16x2 r2998, r2992, r2995; +} +{ +mul.f16x2 r3001, r2819, r2947; +} +{ +fma.rn.f16x2 r3004, r2855, r2946, r3001; +} +{ +mul.f16x2 r3008, r2905, r2950; +} +{ +mul.f16x2 r3011, r2941, r2951; +} +{ +sub.f16x2 r3014, r3008, r3011; +} +{ +mul.f16x2 r3017, r2905, r2951; +} +{ +fma.rn.f16x2 r3020, r2941, r2950, r3017; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3024, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3025, {low, high}; +} +{ +add.f16x2 r3026, r2777, r2863; +} +{ +add.f16x2 %0, r2691, r3026; +} +{ +add.f16x2 r3032, r2783, r2869; +} +{ +add.f16x2 %1, r2697, r3032; +} +{ +add.f16x2 r3038, r2777, r2863; +} +{ +mul.f16x2 r3041, r3038, r3024; +} +{ +add.f16x2 r3044, r2691, r3041; +} +{ +sub.f16x2 r3047, r2783, r2869; +} +{ +mul.f16x2 r3050, r3047, r3025; +} +{ +add.f16x2 %6, r3044, r3050; +} +{ +add.f16x2 r3056, r2777, r2863; +} +{ +mul.f16x2 r3059, r3056, r3024; +} +{ +add.f16x2 r3062, r2691, r3059; +} +{ +sub.f16x2 r3065, r2783, r2869; +} +{ +mul.f16x2 r3068, r3065, r3025; +} +{ +sub.f16x2 %12, r3062, r3068; +} +{ +add.f16x2 r3074, r2783, r2869; +} +{ +mul.f16x2 r3077, r3074, r3024; +} +{ +add.f16x2 r3080, r2697, r3077; +} +{ +sub.f16x2 r3083, r2777, r2863; +} +{ +mul.f16x2 r3086, r3083, r3025; +} +{ +sub.f16x2 %7, r3080, r3086; +} +{ +add.f16x2 r3092, r2783, r2869; +} +{ +mul.f16x2 r3095, r3092, r3024; +} +{ +add.f16x2 r3098, r2697, r3095; +} +{ +sub.f16x2 r3101, r2777, 
r2863; +} +{ +mul.f16x2 r3104, r3101, r3025; +} +{ +add.f16x2 %13, r3098, r3104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3111, {low, high}; +} +{ +add.f16x2 r3112, r2966, r2982; +} +{ +add.f16x2 %2, r2715, r3112; +} +{ +add.f16x2 r3118, r2972, r2988; +} +{ +add.f16x2 %3, r2751, r3118; +} +{ +add.f16x2 r3124, r2966, r2982; +} +{ +mul.f16x2 r3127, r3124, r3110; +} +{ +add.f16x2 r3130, r2715, r3127; +} +{ +sub.f16x2 r3133, r2972, r2988; +} +{ +mul.f16x2 r3136, r3133, r3111; +} +{ +add.f16x2 %8, r3130, r3136; +} +{ +add.f16x2 r3142, r2966, r2982; +} +{ +mul.f16x2 r3145, r3142, r3110; +} +{ +add.f16x2 r3148, r2715, r3145; +} +{ +sub.f16x2 r3151, r2972, r2988; +} +{ +mul.f16x2 r3154, r3151, r3111; +} +{ +sub.f16x2 %14, r3148, r3154; +} +{ +add.f16x2 r3160, r2972, r2988; +} +{ +mul.f16x2 r3163, r3160, r3110; +} +{ +add.f16x2 r3166, r2751, r3163; +} +{ +sub.f16x2 r3169, r2966, r2982; +} +{ +mul.f16x2 r3172, r3169, r3111; +} +{ +sub.f16x2 %9, r3166, r3172; +} +{ +add.f16x2 r3178, r2972, r2988; +} +{ +mul.f16x2 r3181, r3178, r3110; +} +{ +add.f16x2 r3184, r2751, r3181; +} +{ +sub.f16x2 r3187, r2966, r2982; +} +{ +mul.f16x2 r3190, r3187, r3111; +} +{ +add.f16x2 %15, r3184, r3190; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f282; +cvt.rn.f16.f32 high, f282; +mov.b32 r3196, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f284; +cvt.rn.f16.f32 high, f284; +mov.b32 r3197, {low, high}; +} +{ +add.f16x2 r3198, r2998, r3014; +} +{ +add.f16x2 %4, r2733, r3198; +} +{ +add.f16x2 r3204, r3004, r3020; +} +{ +add.f16x2 %5, r2769, r3204; +} +{ +add.f16x2 r3210, r2998, r3014; +} +{ +mul.f16x2 r3213, r3210, r3196; +} +{ +add.f16x2 r3216, r2733, r3213; +} +{ +sub.f16x2 r3219, r3004, r3020; +} +{ +mul.f16x2 r3222, r3219, r3197; +} +{ +add.f16x2 %10, r3216, r3222; +} +{ +add.f16x2 r3228, r2998, r3014; +} +{ 
+mul.f16x2 r3231, r3228, r3196; +} +{ +add.f16x2 r3234, r2733, r3231; +} +{ +sub.f16x2 r3237, r3004, r3020; +} +{ +mul.f16x2 r3240, r3237, r3197; +} +{ +sub.f16x2 %16, r3234, r3240; +} +{ +add.f16x2 r3246, r3004, r3020; +} +{ +mul.f16x2 r3249, r3246, r3196; +} +{ +add.f16x2 r3252, r2769, r3249; +} +{ +sub.f16x2 r3255, r2998, r3014; +} +{ +mul.f16x2 r3258, r3255, r3197; +} +{ +sub.f16x2 %11, r3252, r3258; +} +{ +add.f16x2 r3264, r3004, r3020; +} +{ +mul.f16x2 r3267, r3264, r3196; +} +{ +add.f16x2 r3270, r2769, r3267; +} +{ +sub.f16x2 r3273, r2998, r3014; +} +{ +mul.f16x2 r3276, r3273, r3197; +} +{ +add.f16x2 %17, r3270, r3276; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_fwd.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..d9760fa023f0e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_fwd.hpp.inc @@ -0,0 +1,6108 @@ +#ifndef CUFFTDX_FFT_6561_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_6561_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<151, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2721>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 52488, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2720, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2719, %58, f2720; +mul.f32 f119, f2720, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2718, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2717, %64, f2718; +mul.f32 f135, f2718, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2716, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2715, %70, f2716; +mul.f32 f151, f2716, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 
f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2714, f133, 0f3F441B7D; +sub.f32 f159, f2714, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2712, f149, 0f3E31D0D4; +mul.f32 f2713, f155, 0fBF7C1C5C; +sub.f32 f164, f2712, f2713; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2710, f134, 0f3E31D0D4; +mul.f32 f2711, f140, 0fBF7C1C5C; +sub.f32 f169, f2710, f2711; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2708, f150, 0fBF708FB2; +mul.f32 f2709, f156, 0fBEAF1D44; +sub.f32 f174, f2708, f2709; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2707, f2717, f2715; +sub.f32 f183, f2717, f2715; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2706, f2719, f2707; +mul.f32 f187, f2707, 0f3F000000; +sub.f32 f188, f2719, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2705, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2704, f123, f2705; +mul.f32 f203, f2705, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2703, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2702, f124, f2703; +mul.f32 f219, f2703, 
0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2699, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2697, %113, f2699; +mul.f32 f235, f2699, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2694, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2692, %116, f2694; +mul.f32 f251, f2694, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2689, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2687, %119, f2689; +mul.f32 f267, f2689, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2686, f249, 0f3F441B7D; +sub.f32 f275, f2686, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2685, f265, 0f3E31D0D4; +sub.f32 f280, f2685, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f2683, f250, 0f3E31D0D4; +mul.f32 f2684, f256, 0fBF7C1C5C; +sub.f32 f285, f2683, f2684; +mul.f32 f286, f256, 0f3E31D0D4; 
+fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2681, f266, 0fBF708FB2; +mul.f32 f2682, f272, 0fBEAF1D44; +sub.f32 f290, f2681, f2682; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2680, f2692, f2687; +sub.f32 f299, f2692, f2687; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2679, f2697, f2680; +mul.f32 f303, f2680, 0f3F000000; +sub.f32 f304, f2697, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2678, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2677, f239, f2678; +mul.f32 f319, f2678, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2676, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2675, f240, f2676; +mul.f32 f335, f2676, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2672, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2670, %122, f2672; +mul.f32 f351, f2672, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; 
+sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2667, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2665, %125, f2667; +mul.f32 f367, f2667, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2663, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2661, %127, f2663; +mul.f32 f383, f2663, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2660, f365, 0f3F441B7D; +sub.f32 f391, f2660, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2659, f381, 0f3E31D0D4; +sub.f32 f396, f2659, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2657, f366, 0f3E31D0D4; +mul.f32 f2658, f372, 0fBF7C1C5C; +sub.f32 f401, f2657, f2658; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2655, f382, 0fBF708FB2; +mul.f32 f2656, f388, 0fBEAF1D44; +sub.f32 f406, f2655, f2656; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2654, f2665, f2661; +sub.f32 f415, f2665, f2661; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2653, f2670, f2654; +mul.f32 f419, f2654, 0f3F000000; +sub.f32 f420, f2670, 
f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2652, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2651, f355, f2652; +mul.f32 f435, f2652, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2650, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2649, f356, f2650; +mul.f32 f451, f2650, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2677, 0fBE6C2691; +mul.f32 f2648, f310, 0f3F791978; +sub.f32 f459, f2648, f458; +mul.f32 f460, f2677, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2646, f426, 0f3F64C51C; +mul.f32 f2647, f2651, 0fBEE5C902; +sub.f32 f464, f2646, f2647; +mul.f32 f465, f2651, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2644, f326, 0f3F64C51C; +mul.f32 f2645, f2675, 0fBEE5C902; +sub.f32 f469, f2644, f2645; +mul.f32 f470, f2675, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2642, f442, 0f3F18DF63; +mul.f32 f2643, f2649, 0fBF4D57F2; +sub.f32 f474, f2642, f2643; +mul.f32 f475, f2649, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2640, f301, 0f3F441B7D; +mul.f32 f2641, f307, 0fBF248DBB; +sub.f32 f479, f2640, f2641; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2639, f417, 0f3E31D0D4; +sub.f32 f484, f2639, 
f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2638, f317, 0f3F18DF63; +sub.f32 f489, f2638, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2637, f433, 0fBE92D7E0; +sub.f32 f494, f2637, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2636, f333, 0f3ECACAF8; +sub.f32 f499, f2636, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2635, f449, 0fBF2FAD88; +sub.f32 f504, f2635, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2634, f302, 0f3E31D0D4; +sub.f32 f509, f2634, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2632, f418, 0fBF708FB2; +mul.f32 f2633, f424, 0fBEAF1D44; +sub.f32 f514, f2632, f2633; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2630, f318, 0fBD6E2946; +mul.f32 f2631, f324, 0fBF7F9120; +sub.f32 f519, f2630, f2631; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2628, f434, 0fBF7E44DE; +mul.f32 f2629, f440, 0f3DEDC21F; +sub.f32 f524, f2628, f2629; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2627, f334, 0fBE92D7E0; +sub.f32 f529, f2627, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2626, f450, 0fBF55E287; +sub.f32 f534, f2626, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f2625, f2679, f2653; +sub.f32 f541, f2679, f2653; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; 
+sub.f32 f544, f540, f542; +mul.f32 f545, f2625, 0f3F000000; +sub.f32 f546, f2706, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2624, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2623, f2704, f2624; +mul.f32 f561, f2624, 0f3F000000; +sub.f32 f562, f2704, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2622, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2621, f2702, f2622; +mul.f32 f577, f2622, 0f3F000000; +sub.f32 f578, f2702, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2620, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2619, f191, f2620; +mul.f32 f593, f2620, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2618, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2617, f207, f2618; +mul.f32 f609, f2618, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; 
+sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2616, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2615, f223, f2616; +mul.f32 f625, f2616, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2614, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2613, f192, f2614; +mul.f32 f641, f2614, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2612, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2611, f208, f2612; +mul.f32 f657, f2612, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2610, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2609, f224, f2610; +mul.f32 f673, f2610, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; 
+mul.lo.s32 r6, r5, 243; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 52488, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f2623; +mul.f32 f685, f679, f2623; +mul.f32 f2607, f679, f679; +mul.f32 f2608, f680, f680; +sub.f32 f688, f2607, f2608; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f2621; +mul.f32 f693, f688, f2621; +mul.f32 f695, f680, f690; +mul.f32 f2606, f679, f688; +sub.f32 f696, f2606, f695; +mul.f32 f2605, f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f2619; +mul.f32 f701, f696, f2619; +mul.f32 f2603, f679, f696; +mul.f32 f2604, f680, f698; +sub.f32 f704, f2603, f2604; +mul.f32 f2602, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f2617; +mul.f32 f709, f704, f2617; +mul.f32 f711, f680, f706; +mul.f32 f2601, f679, f704; +sub.f32 f712, f2601, f711; +mul.f32 f2600, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f2615; +mul.f32 f717, f712, f2615; +mul.f32 f719, f680, f714; +mul.f32 f2599, f679, f712; +sub.f32 f720, f2599, f719; +mul.f32 f2598, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f2613; +mul.f32 f725, f720, f2613; +mul.f32 f2596, f679, f720; +mul.f32 f2597, f680, f722; +sub.f32 f728, f2596, f2597; +mul.f32 f2595, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f2611; +mul.f32 f733, f728, f2611; +mul.f32 f735, f680, f730; +mul.f32 f2594, f679, f728; +sub.f32 f736, f2594, f735; +mul.f32 f2593, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f2609; +mul.f32 f741, f736, f2609; +mul.f32 f743, f680, f738; +mul.f32 f2592, f679, f736; +sub.f32 f744, f2592, f743; +mul.f32 f2591, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, 
f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f2589, f679, f744; +mul.f32 f2590, f680, f746; +sub.f32 f752, f2589, f2590; +mul.f32 f2588, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2587, f679, f752; +sub.f32 f760, f2587, f759; +mul.f32 f2586, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f2584, f679, f760; +mul.f32 f2585, f680, f762; +sub.f32 f768, f2584, f2585; +mul.f32 f2583, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2582, f679, f768; +sub.f32 f776, f2582, f775; +mul.f32 f2581, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2580, f679, f776; +sub.f32 f784, f2580, f783; +mul.f32 f2579, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f2577, f679, f784; +mul.f32 f2578, f680, f786; +sub.f32 f792, f2577, f2578; +mul.f32 f2576, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2575, f679, f792; +sub.f32 f800, f2575, f799; +mul.f32 f2574, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2573, f679, f800; +sub.f32 f808, f2573, f807; +mul.f32 f2572, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 f813, f808, f677; +mul.f32 f2570, f679, f808; +mul.f32 f2571, f680, f810; +sub.f32 f816, f2570, f2571; +mul.f32 f2569, 
f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2568, f679, f816; +sub.f32 f824, f2568, f823; +mul.f32 f2567, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f2565, f679, f824; +mul.f32 f2566, f680, f826; +sub.f32 f832, f2565, f2566; +mul.f32 f2564, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, f582; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2563, f679, f832; +sub.f32 f840, f2563, f839; +mul.f32 f2562, f832, f576; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2561, f679, f840; +sub.f32 f848, f2561, f847; +mul.f32 f2560, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f2558, f679, f848; +mul.f32 f2559, f680, f850; +sub.f32 f856, f2558, f2559; +mul.f32 f2557, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2556, f679, f856; +sub.f32 f864, f2556, f863; +mul.f32 f2555, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2554, f679, f864; +sub.f32 f872, f2554, f871; +mul.f32 f2553, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f2551, f679, f872; +mul.f32 f2552, f680, f874; +sub.f32 f880, f2551, f2552; +mul.f32 f2550, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 f2549, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 
f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2706, f2625; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f2550, f684; +st.shared.v2.f32 [r21+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f2605, f692; +st.shared.v2.f32 [r21+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f2602, f700; +st.shared.v2.f32 [r21+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f2600, f708; +st.shared.v2.f32 [r21+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f2598, f716; +st.shared.v2.f32 [r21+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; +sub.f32 f899, f2595, f724; +st.shared.v2.f32 [r21+48], {f899, f898}; +sub.f32 f900, f2593, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r21+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f2591, f740; +st.shared.v2.f32 [r21+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f2588, f748; +st.shared.v2.f32 [r21+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f2586, f756; +st.shared.v2.f32 [r21+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f2583, f764; +st.shared.v2.f32 [r21+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f2581, f772; +st.shared.v2.f32 [r21+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f2579, f780; +st.shared.v2.f32 [r21+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f2576, f788; +st.shared.v2.f32 [r21+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f2574, f796; +st.shared.v2.f32 [r21+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f2572, f804; +st.shared.v2.f32 [r21+128], {f919, f918}; +fma.rn.f32 f920, f810, f671, f813; +sub.f32 f921, f2569, f812; +st.shared.v2.f32 [r21+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, 
f821; +sub.f32 f923, f2567, f820; +st.shared.v2.f32 [r21+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f2564, f828; +st.shared.v2.f32 [r21+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f2562, f836; +st.shared.v2.f32 [r21+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, f2560, f844; +st.shared.v2.f32 [r21+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f2557, f852; +st.shared.v2.f32 [r21+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f2555, f860; +st.shared.v2.f32 [r21+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f2553, f868; +st.shared.v2.f32 [r21+192], {f935, f934}; +fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f2549, f876; +st.shared.v2.f32 [r21+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r21+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+1944]; +ld.shared.v2.f32 {f948, f949}, [r10+3888]; +ld.shared.v2.f32 {f952, f953}, [r10+5832]; +ld.shared.v2.f32 {f956, f957}, [r10+7776]; +ld.shared.v2.f32 {f960, f961}, [r10+9720]; +ld.shared.v2.f32 {f964, f965}, [r10+11664]; +ld.shared.v2.f32 {f968, f969}, [r10+13608]; +ld.shared.v2.f32 {f972, f973}, [r10+15552]; +ld.shared.v2.f32 {f976, f977}, [r10+17496]; +ld.shared.v2.f32 {f980, f981}, [r10+19440]; +ld.shared.v2.f32 {f984, f985}, [r10+21384]; +ld.shared.v2.f32 {f988, f989}, [r10+23328]; +ld.shared.v2.f32 {f992, f993}, [r10+25272]; +ld.shared.v2.f32 {f996, f997}, [r10+27216]; +ld.shared.v2.f32 {f1000, f1001}, [r10+29160]; +ld.shared.v2.f32 {f1004, f1005}, [r10+31104]; +ld.shared.v2.f32 {f1008, f1009}, [r10+33048]; +ld.shared.v2.f32 {f1012, f1013}, [r10+34992]; +ld.shared.v2.f32 {f1016, f1017}, [r10+36936]; +ld.shared.v2.f32 {f1020, f1021}, [r10+38880]; +ld.shared.v2.f32 {f1024, f1025}, [r10+40824]; +ld.shared.v2.f32 {f1028, 
f1029}, [r10+42768]; +ld.shared.v2.f32 {f1032, f1033}, [r10+44712]; +ld.shared.v2.f32 {f1036, f1037}, [r10+46656]; +ld.shared.v2.f32 {f1040, f1041}, [r10+48600]; +ld.shared.v2.f32 {f1044, f1045}, [r10+50544]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2548, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0f3F5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f2547, f941, f2548; +mul.f32 f1058, f2548, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0f3F5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2546, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2545, f953, f2546; +mul.f32 f1074, f2546, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0f3F5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2544, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2543, f965, f2544; +mul.f32 f1090, f2544, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0f3F5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2541, f1072, 0f3F441B7D; +mul.f32 f2542, f1078, 0fBF248DBB; +sub.f32 f1098, f2541, f2542; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0fBF248DBB, f1099; +mul.f32 f2539, f1088, 0f3E31D0D4; +mul.f32 f2540, f1094, 0fBF7C1C5C; +sub.f32 f1103, f2539, f2540; +mul.f32 f1104, f1094, 
0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0fBF7C1C5C, f1104; +mul.f32 f2537, f1073, 0f3E31D0D4; +mul.f32 f2538, f1079, 0fBF7C1C5C; +sub.f32 f1108, f2537, f2538; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0fBF7C1C5C, f1109; +mul.f32 f1112, f1095, 0fBEAF1D44; +mul.f32 f2536, f1089, 0fBF708FB2; +sub.f32 f1113, f2536, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0fBEAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2535, f2545, f2543; +sub.f32 f1122, f2545, f2543; +mul.f32 f1123, f1122, 0f3F5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2534, f2547, f2535; +mul.f32 f1126, f2535, 0f3F000000; +sub.f32 f1127, f2547, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0f3F5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2533, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0f3F5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2532, f1062, f2533; +mul.f32 f1142, f2533, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0f3F5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2531, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0f3F5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2530, f1063, f2531; +mul.f32 f1158, f2531, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0f3F5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 
0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2529, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0f3F5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2528, f945, f2529; +mul.f32 f1174, f2529, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0f3F5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2527, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0f3F5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2526, f957, f2527; +mul.f32 f1190, f2527, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0f3F5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2525, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0f3F5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2524, f969, f2525; +mul.f32 f1206, f2525, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0f3F5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2522, f1188, 0f3F441B7D; +mul.f32 f2523, f1194, 0fBF248DBB; +sub.f32 f1214, f2522, f2523; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0fBF248DBB, f1215; +mul.f32 f2520, f1204, 0f3E31D0D4; +mul.f32 f2521, f1210, 0fBF7C1C5C; +sub.f32 f1219, f2520, f2521; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0fBF7C1C5C, f1220; +mul.f32 f2518, f1189, 0f3E31D0D4; +mul.f32 f2519, f1195, 0fBF7C1C5C; +sub.f32 f1224, f2518, f2519; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0fBF7C1C5C, f1225; +mul.f32 f2516, f1205, 0fBF708FB2; +mul.f32 f2517, f1211, 
0fBEAF1D44; +sub.f32 f1229, f2516, f2517; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0fBEAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2515, f2526, f2524; +sub.f32 f1238, f2526, f2524; +mul.f32 f1239, f1238, 0f3F5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2514, f2528, f2515; +mul.f32 f1242, f2515, 0f3F000000; +sub.f32 f1243, f2528, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0f3F5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f2513, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0f3F5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f2512, f1178, f2513; +mul.f32 f1258, f2513, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0f3F5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2511, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0f3F5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2510, f1179, f2511; +mul.f32 f1274, f2511, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0f3F5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2509, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0f3F5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2508, f949, f2509; +mul.f32 f1290, f2509, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 
f1292, f984, f1020; +mul.f32 f1293, f1292, 0f3F5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2507, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0f3F5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2506, f961, f2507; +mul.f32 f1306, f2507, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0f3F5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f2505, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0f3F5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f2504, f973, f2505; +mul.f32 f1322, f2505, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0f3F5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0fBF248DBB; +mul.f32 f2503, f1304, 0f3F441B7D; +sub.f32 f1330, f2503, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0fBF248DBB, f1331; +mul.f32 f2501, f1320, 0f3E31D0D4; +mul.f32 f2502, f1326, 0fBF7C1C5C; +sub.f32 f1335, f2501, f2502; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0fBF7C1C5C, f1336; +mul.f32 f2499, f1305, 0f3E31D0D4; +mul.f32 f2500, f1311, 0fBF7C1C5C; +sub.f32 f1340, f2499, f2500; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0fBF7C1C5C, f1341; +mul.f32 f2497, f1321, 0fBF708FB2; +mul.f32 f2498, f1327, 0fBEAF1D44; +sub.f32 f1345, f2497, f2498; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0fBEAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2496, f2506, f2504; +sub.f32 f1354, 
f2506, f2504; +mul.f32 f1355, f1354, 0f3F5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2495, f2508, f2496; +mul.f32 f1358, f2496, 0f3F000000; +sub.f32 f1359, f2508, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0f3F5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2494, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0f3F5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2493, f1294, f2494; +mul.f32 f1374, f2494, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0f3F5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2492, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0f3F5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2491, f1295, f2492; +mul.f32 f1390, f2492, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0f3F5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2512, 0fBE6C2691; +mul.f32 f2490, f1249, 0f3F791978; +sub.f32 f1398, f2490, f1397; +mul.f32 f1399, f2512, 0f3F791978; +fma.rn.f32 f1400, f1249, 0fBE6C2691, f1399; +mul.f32 f1402, f2493, 0fBEE5C902; +mul.f32 f2489, f1365, 0f3F64C51C; +sub.f32 f1403, f2489, f1402; +mul.f32 f1404, f2493, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0fBEE5C902, f1404; +mul.f32 f1407, f2510, 0fBEE5C902; +mul.f32 f2488, f1265, 0f3F64C51C; +sub.f32 f1408, f2488, f1407; +mul.f32 f1409, f2510, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0fBEE5C902, f1409; +mul.f32 f2486, f1381, 0f3F18DF63; +mul.f32 f2487, f2491, 0fBF4D57F2; +sub.f32 f1413, f2486, f2487; +mul.f32 f1414, f2491, 
0f3F18DF63; +fma.rn.f32 f1415, f1381, 0fBF4D57F2, f1414; +mul.f32 f2484, f1240, 0f3F441B7D; +mul.f32 f2485, f1246, 0fBF248DBB; +sub.f32 f1418, f2484, f2485; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0fBF248DBB, f1419; +mul.f32 f2482, f1356, 0f3E31D0D4; +mul.f32 f2483, f1362, 0fBF7C1C5C; +sub.f32 f1423, f2482, f2483; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0fBF7C1C5C, f1424; +mul.f32 f2480, f1256, 0f3F18DF63; +mul.f32 f2481, f1262, 0fBF4D57F2; +sub.f32 f1428, f2480, f2481; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0fBF4D57F2, f1429; +mul.f32 f1432, f1378, 0fBF753ECD; +mul.f32 f2479, f1372, 0fBE92D7E0; +sub.f32 f1433, f2479, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0fBF753ECD, f1434; +mul.f32 f1437, f1278, 0fBF6B1036; +mul.f32 f2478, f1272, 0f3ECACAF8; +sub.f32 f1438, f2478, f1437; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0fBF6B1036, f1439; +mul.f32 f1442, f1394, 0fBF3A3529; +mul.f32 f2477, f1388, 0fBF2FAD88; +sub.f32 f1443, f2477, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0fBF3A3529, f1444; +mul.f32 f1447, f1247, 0fBF7C1C5C; +mul.f32 f2476, f1241, 0f3E31D0D4; +sub.f32 f1448, f2476, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0fBF7C1C5C, f1449; +mul.f32 f1452, f1363, 0fBEAF1D44; +mul.f32 f2475, f1357, 0fBF708FB2; +sub.f32 f1453, f2475, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0fBEAF1D44, f1454; +mul.f32 f1457, f1263, 0fBF7F9120; +mul.f32 f2474, f1257, 0fBD6E2946; +sub.f32 f1458, f2474, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0fBF7F9120, f1459; +mul.f32 f2472, f1373, 0fBF7E44DE; +mul.f32 f2473, f1379, 0f3DEDC21F; +sub.f32 f1463, f2472, f2473; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0f3DEDC21F, f1464; +mul.f32 f2470, f1273, 0fBE92D7E0; +mul.f32 f2471, f1279, 0fBF753ECD; +sub.f32 f1468, f2470, f2471; +mul.f32 f1469, f1279, 0fBE92D7E0; 
+fma.rn.f32 f1470, f1273, 0fBF753ECD, f1469; +mul.f32 f2468, f1389, 0fBF55E287; +mul.f32 f2469, f1395, 0f3F0CAC9F; +sub.f32 f1473, f2468, f2469; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0f3F0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2467, f2514, f2495; +sub.f32 f1480, f2514, f2495; +mul.f32 f1481, f1480, 0f3F5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, f2467, 0f3F000000; +sub.f32 f1485, f2534, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0f3F5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2466, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0f3F5DB3D7; +add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2465, f2532, f2466; +mul.f32 f1500, f2466, 0f3F000000; +sub.f32 f1501, f2532, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0f3F5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2464, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0f3F5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, f1511, f1513; +add.f32 f2463, f2530, f2464; +mul.f32 f1516, f2464, 0f3F000000; +sub.f32 f1517, f2530, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0f3F5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2462, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 f1529, f1528, 0f3F5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2461, f1130, f2462; +mul.f32 f1532, f2462, 
0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0f3F5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2460, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0f3F5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2459, f1146, f2460; +mul.f32 f1548, f2460, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0f3F5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2458, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0f3F5DB3D7; +add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2457, f1162, f2458; +mul.f32 f1564, f2458, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0f3F5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2456, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0f3F5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, f1575, f1577; +add.f32 f2455, f1131, f2456; +mul.f32 f1580, f2456, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0f3F5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2454, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 f1593, f1592, 0f3F5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2453, f1147, f2454; +mul.f32 f1596, f2454, 
0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0f3F5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2452, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0f3F5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2451, f1163, f2452; +mul.f32 f1612, f2452, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0f3F5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1623, f1619, f2465; +mul.f32 f1624, f1618, f2465; +mul.f32 f2449, f1618, f1618; +mul.f32 f2450, f1619, f1619; +sub.f32 f1627, f2449, f2450; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1631, f1629, f2463; +mul.f32 f1632, f1627, f2463; +mul.f32 f1634, f1619, f1629; +mul.f32 f2448, f1618, f1627; +sub.f32 f1635, f2448, f1634; +mul.f32 f2447, f1627, f1507; +mul.f32 f1636, f1618, f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1639, f1637, f2461; +mul.f32 f1640, f1635, f2461; +mul.f32 f2445, f1618, f1635; +mul.f32 f2446, f1619, f1637; +sub.f32 f1643, f2445, f2446; +mul.f32 f2444, f1635, f1523; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1647, f1645, f2459; +mul.f32 f1648, f1643, f2459; +mul.f32 f1650, f1619, f1645; +mul.f32 f2443, f1618, f1643; +sub.f32 f1651, f2443, f1650; +mul.f32 f2442, f1643, f1539; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1655, f1653, 
f2457; +mul.f32 f1656, f1651, f2457; +mul.f32 f1658, f1619, f1653; +mul.f32 f2441, f1618, f1651; +sub.f32 f1659, f2441, f1658; +mul.f32 f2440, f1651, f1555; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1663, f1661, f2455; +mul.f32 f1664, f1659, f2455; +mul.f32 f2438, f1618, f1659; +mul.f32 f2439, f1619, f1661; +sub.f32 f1667, f2438, f2439; +mul.f32 f2437, f1659, f1571; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1671, f1669, f2453; +mul.f32 f1672, f1667, f2453; +mul.f32 f1674, f1619, f1669; +mul.f32 f2436, f1618, f1667; +sub.f32 f1675, f2436, f1674; +mul.f32 f2435, f1667, f1587; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1679, f1677, f2451; +mul.f32 f1680, f1675, f2451; +mul.f32 f1682, f1619, f1677; +mul.f32 f2434, f1618, f1675; +sub.f32 f1683, f2434, f1682; +mul.f32 f2433, f1675, f1603; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1687, f1685, f1488; +mul.f32 f1688, f1683, f1488; +mul.f32 f2431, f1618, f1683; +mul.f32 f2432, f1619, f1685; +sub.f32 f1691, f2431, f2432; +mul.f32 f2430, f1683, f1482; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1695, f1693, f1504; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2429, f1618, f1691; +sub.f32 f1699, f2429, f1698; +mul.f32 f2428, f1691, f1498; +mul.f32 f1700, f1618, f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1703, f1701, f1520; +mul.f32 f1704, f1699, f1520; +mul.f32 f2426, f1618, f1699; +mul.f32 f2427, f1619, f1701; +sub.f32 f1707, f2426, f2427; +mul.f32 f2425, f1699, f1514; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1711, f1709, f1536; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2424, f1618, f1707; +sub.f32 f1715, f2424, f1714; +mul.f32 f2423, f1707, f1530; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1719, f1717, 
f1552; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2422, f1618, f1715; +sub.f32 f1723, f2422, f1722; +mul.f32 f2421, f1715, f1546; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1727, f1725, f1568; +mul.f32 f1728, f1723, f1568; +mul.f32 f2419, f1618, f1723; +mul.f32 f2420, f1619, f1725; +sub.f32 f1731, f2419, f2420; +mul.f32 f2418, f1723, f1562; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1735, f1733, f1584; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2417, f1618, f1731; +sub.f32 f1739, f2417, f1738; +mul.f32 f2416, f1731, f1578; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1743, f1741, f1600; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2415, f1618, f1739; +sub.f32 f1747, f2415, f1746; +mul.f32 f2414, f1739, f1594; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1751, f1749, f1616; +mul.f32 f1752, f1747, f1616; +mul.f32 f2412, f1618, f1747; +mul.f32 f2413, f1619, f1749; +sub.f32 f1755, f2412, f2413; +mul.f32 f2411, f1747, f1610; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1759, f1757, f1489; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2410, f1618, f1755; +sub.f32 f1763, f2410, f1762; +mul.f32 f2409, f1755, f1483; +mul.f32 f1764, f1618, f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1767, f1765, f1505; +mul.f32 f1768, f1763, f1505; +mul.f32 f2407, f1618, f1763; +mul.f32 f2408, f1619, f1765; +sub.f32 f1771, f2407, f2408; +mul.f32 f2406, f1763, f1499; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1775, f1773, f1521; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2405, f1618, f1771; +sub.f32 f1779, f2405, f1778; +mul.f32 f2404, f1771, f1515; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1783, f1781, 
f1537; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2403, f1618, f1779; +sub.f32 f1787, f2403, f1786; +mul.f32 f2402, f1779, f1531; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1791, f1789, f1553; +mul.f32 f1792, f1787, f1553; +mul.f32 f2400, f1618, f1787; +mul.f32 f2401, f1619, f1789; +sub.f32 f1795, f2400, f2401; +mul.f32 f2399, f1787, f1547; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1799, f1797, f1569; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2398, f1618, f1795; +sub.f32 f1803, f2398, f1802; +mul.f32 f2397, f1795, f1563; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1807, f1805, f1585; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2396, f1618, f1803; +sub.f32 f1811, f2396, f1810; +mul.f32 f2395, f1803, f1579; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1815, f1813, f1601; +mul.f32 f1816, f1811, f1601; +mul.f32 f2393, f1618, f1811; +mul.f32 f2394, f1619, f1813; +sub.f32 f1819, f2393, f2394; +mul.f32 f2392, f1618, f1491; +mul.f32 f1820, f1618, f1813; +mul.f32 f2391, f1811, f1595; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1819, f1611; +mul.f32 f1823, f1821, f1617; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 5832, r19; +add.f32 f1825, f2534, f2467; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1619, f1491, f1624; +sub.f32 f1828, f2392, f1623; +st.shared.v2.f32 [r20+216], {f1828, f1827}; +fma.rn.f32 f1829, f1629, f1507, f1632; +sub.f32 f1830, f2447, f1631; +st.shared.v2.f32 [r20+432], {f1830, f1829}; +fma.rn.f32 f1831, f1637, f1523, f1640; +sub.f32 f1832, f2444, f1639; +st.shared.v2.f32 [r20+648], {f1832, f1831}; +fma.rn.f32 f1833, f1645, f1539, f1648; +sub.f32 f1834, f2442, f1647; +st.shared.v2.f32 [r20+864], {f1834, 
f1833}; +fma.rn.f32 f1835, f1653, f1555, f1656; +sub.f32 f1836, f2440, f1655; +st.shared.v2.f32 [r20+1080], {f1836, f1835}; +sub.f32 f1837, f2437, f1663; +fma.rn.f32 f1838, f1661, f1571, f1664; +st.shared.v2.f32 [r20+1296], {f1837, f1838}; +fma.rn.f32 f1839, f1669, f1587, f1672; +sub.f32 f1840, f2435, f1671; +st.shared.v2.f32 [r20+1512], {f1840, f1839}; +sub.f32 f1841, f2433, f1679; +fma.rn.f32 f1842, f1677, f1603, f1680; +st.shared.v2.f32 [r20+1728], {f1841, f1842}; +fma.rn.f32 f1843, f1685, f1482, f1688; +sub.f32 f1844, f2430, f1687; +st.shared.v2.f32 [r20+1944], {f1844, f1843}; +fma.rn.f32 f1845, f1693, f1498, f1696; +sub.f32 f1846, f2428, f1695; +st.shared.v2.f32 [r20+2160], {f1846, f1845}; +fma.rn.f32 f1847, f1701, f1514, f1704; +sub.f32 f1848, f2425, f1703; +st.shared.v2.f32 [r20+2376], {f1848, f1847}; +fma.rn.f32 f1849, f1709, f1530, f1712; +sub.f32 f1850, f2423, f1711; +st.shared.v2.f32 [r20+2592], {f1850, f1849}; +fma.rn.f32 f1851, f1717, f1546, f1720; +sub.f32 f1852, f2421, f1719; +st.shared.v2.f32 [r20+2808], {f1852, f1851}; +fma.rn.f32 f1853, f1725, f1562, f1728; +sub.f32 f1854, f2418, f1727; +st.shared.v2.f32 [r20+3024], {f1854, f1853}; +fma.rn.f32 f1855, f1733, f1578, f1736; +sub.f32 f1856, f2416, f1735; +st.shared.v2.f32 [r20+3240], {f1856, f1855}; +fma.rn.f32 f1857, f1741, f1594, f1744; +sub.f32 f1858, f2414, f1743; +st.shared.v2.f32 [r20+3456], {f1858, f1857}; +fma.rn.f32 f1859, f1749, f1610, f1752; +sub.f32 f1860, f2411, f1751; +st.shared.v2.f32 [r20+3672], {f1860, f1859}; +fma.rn.f32 f1861, f1757, f1483, f1760; +sub.f32 f1862, f2409, f1759; +st.shared.v2.f32 [r20+3888], {f1862, f1861}; +fma.rn.f32 f1863, f1765, f1499, f1768; +sub.f32 f1864, f2406, f1767; +st.shared.v2.f32 [r20+4104], {f1864, f1863}; +fma.rn.f32 f1865, f1773, f1515, f1776; +sub.f32 f1866, f2404, f1775; +st.shared.v2.f32 [r20+4320], {f1866, f1865}; +fma.rn.f32 f1867, f1781, f1531, f1784; +sub.f32 f1868, f2402, f1783; +st.shared.v2.f32 [r20+4536], {f1868, f1867}; +fma.rn.f32 f1869, 
f1789, f1547, f1792; +sub.f32 f1870, f2399, f1791; +st.shared.v2.f32 [r20+4752], {f1870, f1869}; +fma.rn.f32 f1871, f1797, f1563, f1800; +sub.f32 f1872, f2397, f1799; +st.shared.v2.f32 [r20+4968], {f1872, f1871}; +fma.rn.f32 f1873, f1805, f1579, f1808; +sub.f32 f1874, f2395, f1807; +st.shared.v2.f32 [r20+5184], {f1874, f1873}; +fma.rn.f32 f1875, f1813, f1595, f1816; +sub.f32 f1876, f2391, f1815; +st.shared.v2.f32 [r20+5400], {f1876, f1875}; +fma.rn.f32 f1877, f1821, f1611, f1824; +sub.f32 f1878, f1822, f1823; +st.shared.v2.f32 [r20+5616], {f1878, f1877}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+1944]; +ld.shared.v2.f32 {f1887, f1888}, [r10+3888]; +ld.shared.v2.f32 {f1891, f1892}, [r10+5832]; +ld.shared.v2.f32 {f1895, f1896}, [r10+7776]; +ld.shared.v2.f32 {f1899, f1900}, [r10+9720]; +ld.shared.v2.f32 {f1903, f1904}, [r10+11664]; +ld.shared.v2.f32 {f1907, f1908}, [r10+13608]; +ld.shared.v2.f32 {f1911, f1912}, [r10+15552]; +ld.shared.v2.f32 {f1915, f1916}, [r10+17496]; +ld.shared.v2.f32 {f1919, f1920}, [r10+19440]; +ld.shared.v2.f32 {f1923, f1924}, [r10+21384]; +ld.shared.v2.f32 {f1927, f1928}, [r10+23328]; +ld.shared.v2.f32 {f1931, f1932}, [r10+25272]; +ld.shared.v2.f32 {f1935, f1936}, [r10+27216]; +ld.shared.v2.f32 {f1939, f1940}, [r10+29160]; +ld.shared.v2.f32 {f1943, f1944}, [r10+31104]; +ld.shared.v2.f32 {f1947, f1948}, [r10+33048]; +ld.shared.v2.f32 {f1951, f1952}, [r10+34992]; +ld.shared.v2.f32 {f1955, f1956}, [r10+36936]; +ld.shared.v2.f32 {f1959, f1960}, [r10+38880]; +ld.shared.v2.f32 {f1963, f1964}, [r10+40824]; +ld.shared.v2.f32 {f1967, f1968}, [r10+42768]; +ld.shared.v2.f32 {f1971, f1972}, [r10+44712]; +ld.shared.v2.f32 {f1975, f1976}, [r10+46656]; +ld.shared.v2.f32 {f1979, f1980}, [r10+48600]; +ld.shared.v2.f32 {f1983, f1984}, [r10+50544]; +add.f32 f1987, f1915, f1951; +add.f32 f1988, f1879, f1987; +mul.f32 f1991, f1987, 0f3F000000; +sub.f32 f1992, f1879, f1991; +add.f32 f2390, f1916, f1952; +sub.f32 
f1993, f1916, f1952; +mul.f32 f1994, f1993, 0f3F5DB3D7; +add.f32 f1995, f1994, f1992; +sub.f32 f1996, f1992, f1994; +add.f32 f2389, f1880, f2390; +mul.f32 f1997, f2390, 0f3F000000; +sub.f32 f1998, f1880, f1997; +sub.f32 f1999, f1915, f1951; +mul.f32 f2000, f1999, 0f3F5DB3D7; +sub.f32 f2001, f1998, f2000; +add.f32 f2002, f2000, f1998; +add.f32 f2003, f1927, f1963; +add.f32 f2004, f1891, f2003; +mul.f32 f2007, f2003, 0f3F000000; +sub.f32 f2008, f1891, f2007; +add.f32 f2388, f1928, f1964; +sub.f32 f2009, f1928, f1964; +mul.f32 f2010, f2009, 0f3F5DB3D7; +add.f32 f2011, f2010, f2008; +sub.f32 f2012, f2008, f2010; +add.f32 f2387, f1892, f2388; +mul.f32 f2013, f2388, 0f3F000000; +sub.f32 f2014, f1892, f2013; +sub.f32 f2015, f1927, f1963; +mul.f32 f2016, f2015, 0f3F5DB3D7; +sub.f32 f2017, f2014, f2016; +add.f32 f2018, f2016, f2014; +add.f32 f2019, f1939, f1975; +add.f32 f2020, f1903, f2019; +mul.f32 f2023, f2019, 0f3F000000; +sub.f32 f2024, f1903, f2023; +add.f32 f2386, f1940, f1976; +sub.f32 f2025, f1940, f1976; +mul.f32 f2026, f2025, 0f3F5DB3D7; +add.f32 f2027, f2026, f2024; +sub.f32 f2028, f2024, f2026; +add.f32 f2385, f1904, f2386; +mul.f32 f2029, f2386, 0f3F000000; +sub.f32 f2030, f1904, f2029; +sub.f32 f2031, f1939, f1975; +mul.f32 f2032, f2031, 0f3F5DB3D7; +sub.f32 f2033, f2030, f2032; +add.f32 f2034, f2032, f2030; +mul.f32 f2036, f2017, 0fBF248DBB; +mul.f32 f2384, f2011, 0f3F441B7D; +sub.f32 f2037, f2384, f2036; +mul.f32 f2038, f2017, 0f3F441B7D; +fma.rn.f32 f2039, f2011, 0fBF248DBB, f2038; +mul.f32 f2041, f2033, 0fBF7C1C5C; +mul.f32 f2383, f2027, 0f3E31D0D4; +sub.f32 f2042, f2383, f2041; +mul.f32 f2043, f2033, 0f3E31D0D4; +fma.rn.f32 f2044, f2027, 0fBF7C1C5C, f2043; +mul.f32 f2046, f2018, 0fBF7C1C5C; +mul.f32 f2382, f2012, 0f3E31D0D4; +sub.f32 f2047, f2382, f2046; +mul.f32 f2048, f2018, 0f3E31D0D4; +fma.rn.f32 f2049, f2012, 0fBF7C1C5C, f2048; +mul.f32 f2051, f2034, 0fBEAF1D44; +mul.f32 f2381, f2028, 0fBF708FB2; +sub.f32 f2052, f2381, f2051; +mul.f32 f2053, f2034, 
0fBF708FB2; +fma.rn.f32 f2054, f2028, 0fBEAF1D44, f2053; +add.f32 f2055, f2004, f2020; +mul.f32 f2057, f2055, 0f3F000000; +sub.f32 f2058, f1988, f2057; +add.f32 f2380, f2387, f2385; +sub.f32 f2059, f2387, f2385; +mul.f32 f2060, f2059, 0f3F5DB3D7; +mul.f32 f2061, f2380, 0f3F000000; +sub.f32 f2062, f2389, f2061; +sub.f32 f2063, f2004, f2020; +mul.f32 f2064, f2063, 0f3F5DB3D7; +add.f32 f2065, f2037, f2042; +mul.f32 f2067, f2065, 0f3F000000; +sub.f32 f2068, f1995, f2067; +add.f32 f2379, f2039, f2044; +sub.f32 f2069, f2039, f2044; +mul.f32 f2070, f2069, 0f3F5DB3D7; +mul.f32 f2071, f2379, 0f3F000000; +sub.f32 f2072, f2001, f2071; +sub.f32 f2073, f2037, f2042; +mul.f32 f2074, f2073, 0f3F5DB3D7; +add.f32 f2075, f2047, f2052; +mul.f32 f2077, f2075, 0f3F000000; +sub.f32 f2078, f1996, f2077; +add.f32 f2378, f2049, f2054; +sub.f32 f2079, f2049, f2054; +mul.f32 f2080, f2079, 0f3F5DB3D7; +mul.f32 f2081, f2378, 0f3F000000; +sub.f32 f2082, f2002, f2081; +sub.f32 f2083, f2047, f2052; +mul.f32 f2084, f2083, 0f3F5DB3D7; +add.f32 f2085, f1919, f1955; +add.f32 f2086, f1883, f2085; +mul.f32 f2089, f2085, 0f3F000000; +sub.f32 f2090, f1883, f2089; +add.f32 f2377, f1920, f1956; +sub.f32 f2091, f1920, f1956; +mul.f32 f2092, f2091, 0f3F5DB3D7; +add.f32 f2093, f2092, f2090; +sub.f32 f2094, f2090, f2092; +add.f32 f2376, f1884, f2377; +mul.f32 f2095, f2377, 0f3F000000; +sub.f32 f2096, f1884, f2095; +sub.f32 f2097, f1919, f1955; +mul.f32 f2098, f2097, 0f3F5DB3D7; +sub.f32 f2099, f2096, f2098; +add.f32 f2100, f2098, f2096; +add.f32 f2101, f1931, f1967; +add.f32 f2102, f1895, f2101; +mul.f32 f2105, f2101, 0f3F000000; +sub.f32 f2106, f1895, f2105; +add.f32 f2375, f1932, f1968; +sub.f32 f2107, f1932, f1968; +mul.f32 f2108, f2107, 0f3F5DB3D7; +add.f32 f2109, f2108, f2106; +sub.f32 f2110, f2106, f2108; +add.f32 f2374, f1896, f2375; +mul.f32 f2111, f2375, 0f3F000000; +sub.f32 f2112, f1896, f2111; +sub.f32 f2113, f1931, f1967; +mul.f32 f2114, f2113, 0f3F5DB3D7; +sub.f32 f2115, f2112, f2114; +add.f32 
f2116, f2114, f2112; +add.f32 f2117, f1943, f1979; +add.f32 f2118, f1907, f2117; +mul.f32 f2121, f2117, 0f3F000000; +sub.f32 f2122, f1907, f2121; +add.f32 f2373, f1944, f1980; +sub.f32 f2123, f1944, f1980; +mul.f32 f2124, f2123, 0f3F5DB3D7; +add.f32 f2125, f2124, f2122; +sub.f32 f2126, f2122, f2124; +add.f32 f2372, f1908, f2373; +mul.f32 f2127, f2373, 0f3F000000; +sub.f32 f2128, f1908, f2127; +sub.f32 f2129, f1943, f1979; +mul.f32 f2130, f2129, 0f3F5DB3D7; +sub.f32 f2131, f2128, f2130; +add.f32 f2132, f2130, f2128; +mul.f32 f2134, f2115, 0fBF248DBB; +mul.f32 f2371, f2109, 0f3F441B7D; +sub.f32 f2135, f2371, f2134; +mul.f32 f2136, f2115, 0f3F441B7D; +fma.rn.f32 f2137, f2109, 0fBF248DBB, f2136; +mul.f32 f2369, f2125, 0f3E31D0D4; +mul.f32 f2370, f2131, 0fBF7C1C5C; +sub.f32 f2140, f2369, f2370; +mul.f32 f2141, f2131, 0f3E31D0D4; +fma.rn.f32 f2142, f2125, 0fBF7C1C5C, f2141; +mul.f32 f2367, f2110, 0f3E31D0D4; +mul.f32 f2368, f2116, 0fBF7C1C5C; +sub.f32 f2145, f2367, f2368; +mul.f32 f2146, f2116, 0f3E31D0D4; +fma.rn.f32 f2147, f2110, 0fBF7C1C5C, f2146; +mul.f32 f2365, f2126, 0fBF708FB2; +mul.f32 f2366, f2132, 0fBEAF1D44; +sub.f32 f2150, f2365, f2366; +mul.f32 f2151, f2132, 0fBF708FB2; +fma.rn.f32 f2152, f2126, 0fBEAF1D44, f2151; +add.f32 f2153, f2102, f2118; +mul.f32 f2155, f2153, 0f3F000000; +sub.f32 f2156, f2086, f2155; +add.f32 f2364, f2374, f2372; +sub.f32 f2157, f2374, f2372; +mul.f32 f2158, f2157, 0f3F5DB3D7; +mul.f32 f2159, f2364, 0f3F000000; +sub.f32 f2160, f2376, f2159; +sub.f32 f2161, f2102, f2118; +mul.f32 f2162, f2161, 0f3F5DB3D7; +add.f32 f2163, f2135, f2140; +mul.f32 f2165, f2163, 0f3F000000; +sub.f32 f2166, f2093, f2165; +add.f32 f2363, f2137, f2142; +sub.f32 f2167, f2137, f2142; +mul.f32 f2168, f2167, 0f3F5DB3D7; +mul.f32 f2169, f2363, 0f3F000000; +sub.f32 f2170, f2099, f2169; +sub.f32 f2171, f2135, f2140; +mul.f32 f2172, f2171, 0f3F5DB3D7; +add.f32 f2173, f2145, f2150; +mul.f32 f2175, f2173, 0f3F000000; +sub.f32 f2176, f2094, f2175; +add.f32 f2362, f2147, 
f2152; +sub.f32 f2177, f2147, f2152; +mul.f32 f2178, f2177, 0f3F5DB3D7; +mul.f32 f2179, f2362, 0f3F000000; +sub.f32 f2180, f2100, f2179; +sub.f32 f2181, f2145, f2150; +mul.f32 f2182, f2181, 0f3F5DB3D7; +add.f32 f2183, f1923, f1959; +add.f32 f2184, f1887, f2183; +mul.f32 f2187, f2183, 0f3F000000; +sub.f32 f2188, f1887, f2187; +add.f32 f2361, f1924, f1960; +sub.f32 f2189, f1924, f1960; +mul.f32 f2190, f2189, 0f3F5DB3D7; +add.f32 f2191, f2190, f2188; +sub.f32 f2192, f2188, f2190; +add.f32 f2360, f1888, f2361; +mul.f32 f2193, f2361, 0f3F000000; +sub.f32 f2194, f1888, f2193; +sub.f32 f2195, f1923, f1959; +mul.f32 f2196, f2195, 0f3F5DB3D7; +sub.f32 f2197, f2194, f2196; +add.f32 f2198, f2196, f2194; +add.f32 f2199, f1935, f1971; +add.f32 f2200, f1899, f2199; +mul.f32 f2203, f2199, 0f3F000000; +sub.f32 f2204, f1899, f2203; +add.f32 f2359, f1936, f1972; +sub.f32 f2205, f1936, f1972; +mul.f32 f2206, f2205, 0f3F5DB3D7; +add.f32 f2207, f2206, f2204; +sub.f32 f2208, f2204, f2206; +add.f32 f2358, f1900, f2359; +mul.f32 f2209, f2359, 0f3F000000; +sub.f32 f2210, f1900, f2209; +sub.f32 f2211, f1935, f1971; +mul.f32 f2212, f2211, 0f3F5DB3D7; +sub.f32 f2213, f2210, f2212; +add.f32 f2214, f2212, f2210; +add.f32 f2215, f1947, f1983; +add.f32 f2216, f1911, f2215; +mul.f32 f2219, f2215, 0f3F000000; +sub.f32 f2220, f1911, f2219; +add.f32 f2357, f1948, f1984; +sub.f32 f2221, f1948, f1984; +mul.f32 f2222, f2221, 0f3F5DB3D7; +add.f32 f2223, f2222, f2220; +sub.f32 f2224, f2220, f2222; +add.f32 f2356, f1912, f2357; +mul.f32 f2225, f2357, 0f3F000000; +sub.f32 f2226, f1912, f2225; +sub.f32 f2227, f1947, f1983; +mul.f32 f2228, f2227, 0f3F5DB3D7; +sub.f32 f2229, f2226, f2228; +add.f32 f2230, f2228, f2226; +mul.f32 f2354, f2207, 0f3F441B7D; +mul.f32 f2355, f2213, 0fBF248DBB; +sub.f32 f2233, f2354, f2355; +mul.f32 f2234, f2213, 0f3F441B7D; +fma.rn.f32 f2235, f2207, 0fBF248DBB, f2234; +mul.f32 f2352, f2223, 0f3E31D0D4; +mul.f32 f2353, f2229, 0fBF7C1C5C; +sub.f32 f2238, f2352, f2353; +mul.f32 f2239, 
f2229, 0f3E31D0D4; +fma.rn.f32 f2240, f2223, 0fBF7C1C5C, f2239; +mul.f32 f2242, f2214, 0fBF7C1C5C; +mul.f32 f2351, f2208, 0f3E31D0D4; +sub.f32 f2243, f2351, f2242; +mul.f32 f2244, f2214, 0f3E31D0D4; +fma.rn.f32 f2245, f2208, 0fBF7C1C5C, f2244; +mul.f32 f2247, f2230, 0fBEAF1D44; +mul.f32 f2350, f2224, 0fBF708FB2; +sub.f32 f2248, f2350, f2247; +mul.f32 f2249, f2230, 0fBF708FB2; +fma.rn.f32 f2250, f2224, 0fBEAF1D44, f2249; +add.f32 f2251, f2200, f2216; +mul.f32 f2253, f2251, 0f3F000000; +sub.f32 f2254, f2184, f2253; +add.f32 f2349, f2358, f2356; +sub.f32 f2255, f2358, f2356; +mul.f32 f2256, f2255, 0f3F5DB3D7; +mul.f32 f2257, f2349, 0f3F000000; +sub.f32 f2258, f2360, f2257; +sub.f32 f2259, f2200, f2216; +mul.f32 f2260, f2259, 0f3F5DB3D7; +add.f32 f2261, f2233, f2238; +mul.f32 f2263, f2261, 0f3F000000; +sub.f32 f2264, f2191, f2263; +add.f32 f2348, f2235, f2240; +sub.f32 f2265, f2235, f2240; +mul.f32 f2266, f2265, 0f3F5DB3D7; +mul.f32 f2267, f2348, 0f3F000000; +sub.f32 f2268, f2197, f2267; +sub.f32 f2269, f2233, f2238; +mul.f32 f2270, f2269, 0f3F5DB3D7; +add.f32 f2271, f2243, f2248; +mul.f32 f2273, f2271, 0f3F000000; +sub.f32 f2274, f2192, f2273; +add.f32 f2347, f2245, f2250; +sub.f32 f2275, f2245, f2250; +mul.f32 f2276, f2275, 0f3F5DB3D7; +mul.f32 f2277, f2347, 0f3F000000; +sub.f32 f2278, f2198, f2277; +sub.f32 f2279, f2243, f2248; +mul.f32 f2280, f2279, 0f3F5DB3D7; +add.f32 %1, f2389, f2380; +add.f32 %0, f1988, f2055; +add.f32 %3, f2376, f2364; +add.f32 %2, f2086, f2153; +add.f32 %5, f2360, f2349; +add.f32 %4, f2184, f2251; +add.f32 %7, f2001, f2379; +add.f32 %6, f1995, f2065; +add.f32 %9, f2099, f2363; +add.f32 %8, f2093, f2163; +add.f32 %11, f2197, f2348; +add.f32 %10, f2191, f2261; +add.f32 %13, f2002, f2378; +add.f32 %12, f1996, f2075; +add.f32 %15, f2100, f2362; +add.f32 %14, f2094, f2173; +add.f32 %17, f2198, f2347; +add.f32 %16, f2192, f2271; +add.f32 %18, f2060, f2058; +sub.f32 %19, f2062, f2064; +add.f32 %20, f2158, f2156; +sub.f32 %21, f2160, f2162; +add.f32 
%22, f2256, f2254; +sub.f32 %23, f2258, f2260; +add.f32 %24, f2070, f2068; +sub.f32 %25, f2072, f2074; +sub.f32 %27, f2170, f2172; +add.f32 %26, f2168, f2166; +sub.f32 %29, f2268, f2270; +add.f32 %28, f2266, f2264; +sub.f32 %31, f2082, f2084; +add.f32 %30, f2080, f2078; +add.f32 %32, f2178, f2176; +sub.f32 %33, f2180, f2182; +add.f32 %34, f2276, f2274; +sub.f32 %35, f2278, f2280; +add.f32 %37, f2064, f2062; +sub.f32 %36, f2058, f2060; +add.f32 %39, f2162, f2160; +sub.f32 %38, f2156, f2158; +add.f32 %41, f2260, f2258; +sub.f32 %40, f2254, f2256; +add.f32 %43, f2074, f2072; +sub.f32 %42, f2068, f2070; +add.f32 %45, f2172, f2170; +sub.f32 %44, f2166, f2168; +add.f32 %47, f2270, f2268; +sub.f32 %46, f2264, f2266; +add.f32 %49, f2084, f2082; +sub.f32 %48, f2078, f2080; +add.f32 %51, f2182, f2180; +sub.f32 %50, f2176, f2178; +add.f32 %53, f2280, f2278; +sub.f32 %52, f2274, f2276; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_6561), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), 
"f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<150, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2671>; +.reg .b32 r<24>; +.reg .b64 rd<15>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 26244, r23; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2662, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2661, %58, f2662; +mul.f32 f119, f2662, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2660, %82, %100; +sub.f32 f131, %82, %100; 
+mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2659, %64, f2660; +mul.f32 f135, f2660, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2658, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2657, %70, f2658; +mul.f32 f151, f2658, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f2656, f133, 0f3F441B7D; +sub.f32 f159, f2656, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f2654, f149, 0f3E31D0D4; +mul.f32 f2655, f155, 0fBF7C1C5C; +sub.f32 f164, f2654, f2655; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f2652, f134, 0f3E31D0D4; +mul.f32 f2653, f140, 0fBF7C1C5C; +sub.f32 f169, f2652, f2653; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f2650, f150, 0fBF708FB2; +mul.f32 f2651, f156, 0fBEAF1D44; +sub.f32 f174, f2650, f2651; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2649, f2659, f2657; +sub.f32 f183, f2659, f2657; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2648, f2661, f2649; +mul.f32 f187, f2649, 0f3F000000; +sub.f32 f188, f2661, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, 
f197; +add.f32 f2647, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2646, f123, f2647; +mul.f32 f203, f2647, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2645, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2644, f124, f2645; +mul.f32 f219, f2645, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2641, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2639, %113, f2641; +mul.f32 f235, f2641, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2636, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2634, %116, f2636; +mul.f32 f251, f2636, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2631, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2629, %119, 
f2631; +mul.f32 f267, f2631, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f2628, f249, 0f3F441B7D; +sub.f32 f275, f2628, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f2627, f265, 0f3E31D0D4; +sub.f32 f280, f2627, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f2625, f250, 0f3E31D0D4; +mul.f32 f2626, f256, 0fBF7C1C5C; +sub.f32 f285, f2625, f2626; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f2623, f266, 0fBF708FB2; +mul.f32 f2624, f272, 0fBEAF1D44; +sub.f32 f290, f2623, f2624; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2622, f2634, f2629; +sub.f32 f299, f2634, f2629; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2621, f2639, f2622; +mul.f32 f303, f2622, 0f3F000000; +sub.f32 f304, f2639, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2620, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2619, f239, f2620; +mul.f32 f319, f2620, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2618, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; 
+add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2617, f240, f2618; +mul.f32 f335, f2618, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2614, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2612, %122, f2614; +mul.f32 f351, f2614, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2609, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2607, %125, f2609; +mul.f32 f367, f2609, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2605, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2603, %127, f2605; +mul.f32 f383, f2605, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f2602, f365, 0f3F441B7D; +sub.f32 f391, f2602, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f2601, f381, 0f3E31D0D4; +sub.f32 f396, f2601, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f2599, f366, 0f3E31D0D4; 
+mul.f32 f2600, f372, 0fBF7C1C5C; +sub.f32 f401, f2599, f2600; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f2597, f382, 0fBF708FB2; +mul.f32 f2598, f388, 0fBEAF1D44; +sub.f32 f406, f2597, f2598; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2596, f2607, f2603; +sub.f32 f415, f2607, f2603; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2595, f2612, f2596; +mul.f32 f419, f2596, 0f3F000000; +sub.f32 f420, f2612, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2594, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2593, f355, f2594; +mul.f32 f435, f2594, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2592, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2591, f356, f2592; +mul.f32 f451, f2592, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2619, 0fBE6C2691; +mul.f32 f2590, f310, 0f3F791978; +sub.f32 f459, f2590, f458; +mul.f32 f460, f2619, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f2588, f426, 0f3F64C51C; +mul.f32 f2589, f2593, 0fBEE5C902; +sub.f32 f464, f2588, f2589; +mul.f32 f465, f2593, 0f3F64C51C; 
+fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f2586, f326, 0f3F64C51C; +mul.f32 f2587, f2617, 0fBEE5C902; +sub.f32 f469, f2586, f2587; +mul.f32 f470, f2617, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f2584, f442, 0f3F18DF63; +mul.f32 f2585, f2591, 0fBF4D57F2; +sub.f32 f474, f2584, f2585; +mul.f32 f475, f2591, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f2582, f301, 0f3F441B7D; +mul.f32 f2583, f307, 0fBF248DBB; +sub.f32 f479, f2582, f2583; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f2581, f417, 0f3E31D0D4; +sub.f32 f484, f2581, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f2580, f317, 0f3F18DF63; +sub.f32 f489, f2580, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f2579, f433, 0fBE92D7E0; +sub.f32 f494, f2579, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f2578, f333, 0f3ECACAF8; +sub.f32 f499, f2578, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f2577, f449, 0fBF2FAD88; +sub.f32 f504, f2577, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f2576, f302, 0f3E31D0D4; +sub.f32 f509, f2576, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f2574, f418, 0fBF708FB2; +mul.f32 f2575, f424, 0fBEAF1D44; +sub.f32 f514, f2574, f2575; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f2572, f318, 0fBD6E2946; +mul.f32 f2573, f324, 0fBF7F9120; +sub.f32 f519, f2572, f2573; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f2570, f434, 0fBF7E44DE; +mul.f32 f2571, f440, 0f3DEDC21F; 
+sub.f32 f524, f2570, f2571; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f2569, f334, 0fBE92D7E0; +sub.f32 f529, f2569, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f2568, f450, 0fBF55E287; +sub.f32 f534, f2568, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2567, f2621, f2595; +sub.f32 f543, f2621, f2595; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2566, f2648, f2567; +mul.f32 f547, f2567, 0f3F000000; +sub.f32 f548, f2648, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2565, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2564, f2646, f2565; +mul.f32 f563, f2565, 0f3F000000; +sub.f32 f564, f2646, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f2563, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2562, f2644, f2563; +mul.f32 f579, f2563, 0f3F000000; +sub.f32 f580, f2644, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2561, f481, f486; +sub.f32 f591, f481, 
f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2560, f191, f2561; +mul.f32 f595, f2561, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2559, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2558, f207, f2559; +mul.f32 f611, f2559, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2557, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2556, f223, f2557; +mul.f32 f627, f2557, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2555, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f2554, f192, f2555; +mul.f32 f643, f2555, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2553, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2552, f208, f2553; +mul.f32 f659, f2553, 0f3F000000; 
+sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2551, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2550, f224, f2551; +mul.f32 f675, f2551, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f686, f682, f2564; +mul.f32 f2549, f681, f554; +sub.f32 f687, f2549, f686; +mul.f32 f688, f681, f2564; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f691, f682, f682; +mul.f32 f2548, f681, f681; +sub.f32 f692, f2548, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f696, f694, f2562; +mul.f32 f2547, f692, f570; +sub.f32 f697, f2547, f696; +mul.f32 f698, f692, f2562; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f2546, f681, f692; +sub.f32 f702, f2546, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f2560; +mul.f32 f2545, f702, f586; +sub.f32 f707, f2545, f706; +mul.f32 f708, f702, f2560; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f2543, f681, f702; +mul.f32 f2544, f682, f704; +sub.f32 f712, f2543, f2544; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f2541, f712, f602; +mul.f32 f2542, f714, f2558; +sub.f32 f717, f2541, f2542; +mul.f32 f718, f712, f2558; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f2539, f681, f712; +mul.f32 f2540, f682, f714; +sub.f32 f722, f2539, f2540; +mul.f32 f723, 
f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f2537, f722, f618; +mul.f32 f2538, f724, f2556; +sub.f32 f727, f2537, f2538; +mul.f32 f728, f722, f2556; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f731, f682, f724; +mul.f32 f2536, f681, f722; +sub.f32 f732, f2536, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f736, f734, f2554; +mul.f32 f2535, f732, f634; +sub.f32 f737, f2535, f736; +mul.f32 f738, f732, f2554; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f741, f682, f734; +mul.f32 f2534, f681, f732; +sub.f32 f742, f2534, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f2552; +mul.f32 f2533, f742, f650; +sub.f32 f747, f2533, f746; +mul.f32 f748, f742, f2552; +fma.rn.f32 f749, f744, f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f2532, f681, f742; +sub.f32 f752, f2532, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f2550; +mul.f32 f2531, f752, f666; +sub.f32 f757, f2531, f756; +mul.f32 f758, f752, f2550; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f2529, f681, f752; +mul.f32 f2530, f682, f754; +sub.f32 f762, f2529, f2530; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f2527, f762, f545; +mul.f32 f2528, f764, f551; +sub.f32 f767, f2527, f2528; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f2525, f681, f762; +mul.f32 f2526, f682, f764; +sub.f32 f772, f2525, f2526; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f776, f774, f567; +mul.f32 f2524, f772, f561; +sub.f32 f777, f2524, f776; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f781, f682, f774; +mul.f32 f2523, f681, f772; +sub.f32 f782, f2523, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f786, f784, f583; +mul.f32 f2522, f782, f577; +sub.f32 f787, f2522, f786; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 
f2521, f681, f782; +sub.f32 f792, f2521, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f2520, f792, f593; +sub.f32 f797, f2520, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f2519, f681, f792; +sub.f32 f802, f2519, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f2517, f802, f609; +mul.f32 f2518, f804, f615; +sub.f32 f807, f2517, f2518; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f2515, f681, f802; +mul.f32 f2516, f682, f804; +sub.f32 f812, f2515, f2516; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f2513, f812, f625; +mul.f32 f2514, f814, f631; +sub.f32 f817, f2513, f2514; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f2511, f681, f812; +mul.f32 f2512, f682, f814; +sub.f32 f822, f2511, f2512; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f826, f824, f647; +mul.f32 f2510, f822, f641; +sub.f32 f827, f2510, f826; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f831, f682, f824; +mul.f32 f2509, f681, f822; +sub.f32 f832, f2509, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f2508, f832, f657; +sub.f32 f837, f2508, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f2507, f681, f832; +sub.f32 f842, f2507, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f2506, f842, f673; +sub.f32 f847, f2506, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f2504, f681, f842; +mul.f32 f2505, f682, f844; +sub.f32 f852, f2504, f2505; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f2502, f852, f546; +mul.f32 f2503, f854, f552; +sub.f32 f857, f2502, f2503; +mul.f32 f858, f852, f552; +fma.rn.f32 
f859, f854, f546, f858; +mul.f32 f2500, f681, f852; +mul.f32 f2501, f682, f854; +sub.f32 f862, f2500, f2501; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f2498, f862, f562; +mul.f32 f2499, f864, f568; +sub.f32 f867, f2498, f2499; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, f562, f868; +mul.f32 f871, f682, f864; +mul.f32 f2497, f681, f862; +sub.f32 f872, f2497, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f876, f874, f584; +mul.f32 f2496, f872, f578; +sub.f32 f877, f2496, f876; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f881, f682, f874; +mul.f32 f2495, f681, f872; +sub.f32 f882, f2495, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, f600; +mul.f32 f2494, f882, f594; +sub.f32 f887, f2494, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f2493, f681, f882; +sub.f32 f892, f2493, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f2491, f892, f610; +mul.f32 f2492, f894, f616; +sub.f32 f897, f2491, f2492; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f2489, f681, f892; +mul.f32 f2490, f682, f894; +sub.f32 f902, f2489, f2490; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f2487, f902, f626; +mul.f32 f2488, f904, f632; +sub.f32 f907, f2487, f2488; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, f908; +mul.f32 f2485, f681, f902; +mul.f32 f2486, f682, f904; +sub.f32 f912, f2485, f2486; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f916, f914, f648; +mul.f32 f2484, f912, f642; +sub.f32 f917, f2484, f916; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f921, f682, f914; +mul.f32 f2483, f681, f912; +sub.f32 f922, f2483, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f926, f924, f664; +mul.f32 f2482, f922, f658; +sub.f32 
f927, f2482, f926; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f2481, f681, f922; +sub.f32 f932, f2481, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f2480, f932, f674; +sub.f32 f937, f2480, f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 r8, r5, 26244, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f687; +st.shared.f32 [r9+8], f697; +st.shared.f32 [r9+12], f707; +st.shared.f32 [r9+16], f717; +st.shared.f32 [r9+20], f727; +st.shared.f32 [r9+24], f737; +st.shared.f32 [r9+28], f747; +st.shared.f32 [r9+32], f757; +st.shared.f32 [r9+36], f767; +st.shared.f32 [r9+40], f777; +st.shared.f32 [r9+44], f787; +st.shared.f32 [r9+48], f797; +st.shared.f32 [r9+52], f807; +st.shared.f32 [r9+56], f817; +st.shared.f32 [r9+60], f827; +st.shared.f32 [r9+64], f837; +st.shared.f32 [r9+68], f847; +st.shared.f32 [r9+72], f857; +st.shared.f32 [r9+76], f867; +st.shared.f32 [r9+80], f877; +st.shared.f32 [r9+84], f887; +st.shared.f32 [r9+88], f897; +st.shared.f32 [r9+92], f907; +st.shared.f32 [r9+96], f917; +st.shared.f32 [r9+100], f927; +st.shared.f32 [r9+104], f937; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+972]; +ld.shared.f32 f942, [r10+1944]; +ld.shared.f32 f943, [r10+2916]; +ld.shared.f32 f944, [r10+3888]; +ld.shared.f32 f945, [r10+4860]; +ld.shared.f32 f946, [r10+5832]; +ld.shared.f32 f947, [r10+6804]; +ld.shared.f32 f948, [r10+7776]; +ld.shared.f32 f949, [r10+8748]; +ld.shared.f32 f950, [r10+9720]; +ld.shared.f32 f951, [r10+10692]; +ld.shared.f32 f952, [r10+11664]; +ld.shared.f32 f953, [r10+12636]; +ld.shared.f32 f954, [r10+13608]; +ld.shared.f32 f955, [r10+14580]; +ld.shared.f32 f956, [r10+15552]; +ld.shared.f32 f957, [r10+16524]; +ld.shared.f32 f958, [r10+17496]; +ld.shared.f32 f959, [r10+18468]; +ld.shared.f32 f960, 
[r10+19440]; +ld.shared.f32 f961, [r10+20412]; +ld.shared.f32 f962, [r10+21384]; +ld.shared.f32 f963, [r10+22356]; +ld.shared.f32 f964, [r10+23328]; +ld.shared.f32 f965, [r10+24300]; +ld.shared.f32 f966, [r10+25272]; +barrier.sync 0; +st.shared.f32 [r9], f2566; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +ld.shared.f32 f2479, [r10+8748]; +ld.shared.f32 f2478, [r10+17496]; +add.f32 f2477, f2479, f2478; +sub.f32 f1000, f2479, f2478; +mul.f32 f1001, f1000, 0f3F5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2476, [r10]; +add.f32 f2475, f2476, f2477; +mul.f32 f1004, f2477, 0f3F000000; +sub.f32 f1005, f2476, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0f3F5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2474, [r10+11664]; +ld.shared.f32 f2473, [r10+20412]; +add.f32 f2472, f2474, f2473; +sub.f32 f1016, f2474, f2473; +mul.f32 f1017, f1016, 0f3F5DB3D7; +ld.shared.f32 f2471, [r10+2916]; +add.f32 f1018, f1017, f1015; 
+sub.f32 f1019, f1015, f1017; +add.f32 f2470, f2471, f2472; +mul.f32 f1020, f2472, 0f3F000000; +sub.f32 f1021, f2471, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0f3F5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2469, [r10+14580]; +sub.f32 f1031, f946, f1030; +ld.shared.f32 f2468, [r10+23328]; +add.f32 f2467, f2469, f2468; +sub.f32 f1032, f2469, f2468; +mul.f32 f1033, f1032, 0f3F5DB3D7; +ld.shared.f32 f2466, [r10+5832]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2465, f2466, f2467; +mul.f32 f1036, f2467, 0f3F000000; +sub.f32 f1037, f2466, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0f3F5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2463, f1018, 0f3F441B7D; +mul.f32 f2464, f1024, 0fBF248DBB; +sub.f32 f1044, f2463, f2464; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0fBF248DBB, f1045; +mul.f32 f1048, f1040, 0fBF7C1C5C; +mul.f32 f2462, f1034, 0f3E31D0D4; +sub.f32 f1049, f2462, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0fBF7C1C5C, f1050; +mul.f32 f1053, f1025, 0fBF7C1C5C; +mul.f32 f2461, f1019, 0f3E31D0D4; +sub.f32 f1054, f2461, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0fBF7C1C5C, f1055; +mul.f32 f1058, f1041, 0fBEAF1D44; +mul.f32 f2460, f1035, 0fBF708FB2; +sub.f32 f1059, f2460, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0fBEAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2459, f2470, f2465; +sub.f32 f1068, f2470, f2465; +mul.f32 f1069, f1068, 0f3F5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2458, f2475, f2459; +mul.f32 f1072, f2459, 0f3F000000; +sub.f32 f1073, f2475, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 
0f3F5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2457, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0f3F5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2456, f1008, f2457; +mul.f32 f1088, f2457, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0f3F5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2455, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0f3F5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2454, f1009, f2455; +mul.f32 f1104, f2455, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0f3F5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +ld.shared.f32 f2453, [r10+18468]; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2452, [r10+9720]; +add.f32 f2451, f2452, f2453; +sub.f32 f1116, f2452, f2453; +mul.f32 f1117, f1116, 0f3F5DB3D7; +ld.shared.f32 f2450, [r10+972]; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +add.f32 f2449, f2450, f2451; +mul.f32 f1120, f2451, 0f3F000000; +sub.f32 f1121, f2450, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0f3F5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +ld.shared.f32 f2448, [r10+21384]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2447, [r10+12636]; +add.f32 f2446, f2447, f2448; +sub.f32 f1132, f2447, f2448; +mul.f32 f1133, f1132, 0f3F5DB3D7; +ld.shared.f32 f2445, [r10+3888]; +add.f32 
f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +add.f32 f2444, f2445, f2446; +mul.f32 f1136, f2446, 0f3F000000; +sub.f32 f1137, f2445, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0f3F5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2443, [r10+15552]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 f2442, [r10+24300]; +add.f32 f2441, f2443, f2442; +sub.f32 f1148, f2443, f2442; +mul.f32 f1149, f1148, 0f3F5DB3D7; +ld.shared.f32 f2440, [r10+6804]; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +add.f32 f2439, f2440, f2441; +mul.f32 f1152, f2441, 0f3F000000; +sub.f32 f1153, f2440, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0f3F5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2437, f1134, 0f3F441B7D; +mul.f32 f2438, f1140, 0fBF248DBB; +sub.f32 f1160, f2437, f2438; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0fBF248DBB, f1161; +mul.f32 f2435, f1150, 0f3E31D0D4; +mul.f32 f2436, f1156, 0fBF7C1C5C; +sub.f32 f1165, f2435, f2436; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0fBF7C1C5C, f1166; +mul.f32 f1169, f1141, 0fBF7C1C5C; +mul.f32 f2434, f1135, 0f3E31D0D4; +sub.f32 f1170, f2434, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0fBF7C1C5C, f1171; +mul.f32 f1174, f1157, 0fBEAF1D44; +mul.f32 f2433, f1151, 0fBF708FB2; +sub.f32 f1175, f2433, f1174; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0fBEAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2432, f2444, f2439; +sub.f32 f1184, f2444, f2439; +mul.f32 f1185, f1184, 0f3F5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2431, f2449, f2432; +mul.f32 f1188, f2432, 0f3F000000; +sub.f32 f1189, f2449, f1188; +sub.f32 f1190, f1127, f1143; 
+mul.f32 f1191, f1190, 0f3F5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2430, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0f3F5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2429, f1124, f2430; +mul.f32 f1204, f2430, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0f3F5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2428, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0f3F5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2427, f1125, f2428; +mul.f32 f1220, f2428, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0f3F5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2426, [r10+10692]; +ld.shared.f32 f2425, [r10+19440]; +sub.f32 f1231, f942, f1230; +add.f32 f2424, f2426, f2425; +sub.f32 f1232, f2426, f2425; +mul.f32 f1233, f1232, 0f3F5DB3D7; +ld.shared.f32 f2423, [r10+1944]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f2422, f2423, f2424; +mul.f32 f1236, f2424, 0f3F000000; +sub.f32 f1237, f2423, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0f3F5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +ld.shared.f32 f2421, [r10+13608]; +ld.shared.f32 f2420, [r10+22356]; +sub.f32 f1247, f945, f1246; +add.f32 f2419, f2421, f2420; +sub.f32 f1248, f2421, f2420; +mul.f32 f1249, f1248, 0f3F5DB3D7; +add.f32 f1250, 
f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2418, [r10+4860]; +add.f32 f2417, f2418, f2419; +mul.f32 f1252, f2419, 0f3F000000; +sub.f32 f1253, f2418, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0f3F5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2416, [r10+25272]; +ld.shared.f32 f2415, [r10+16524]; +add.f32 f2414, f2415, f2416; +sub.f32 f1264, f2415, f2416; +mul.f32 f1265, f1264, 0f3F5DB3D7; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +ld.shared.f32 f2413, [r10+7776]; +add.f32 f2412, f2413, f2414; +mul.f32 f1268, f2414, 0f3F000000; +sub.f32 f1269, f2413, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0f3F5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2410, f1250, 0f3F441B7D; +mul.f32 f2411, f1256, 0fBF248DBB; +sub.f32 f1276, f2410, f2411; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0fBF248DBB, f1277; +mul.f32 f2408, f1266, 0f3E31D0D4; +mul.f32 f2409, f1272, 0fBF7C1C5C; +sub.f32 f1281, f2408, f2409; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0fBF7C1C5C, f1282; +mul.f32 f1285, f1257, 0fBF7C1C5C; +mul.f32 f2407, f1251, 0f3E31D0D4; +sub.f32 f1286, f2407, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0fBF7C1C5C, f1287; +mul.f32 f1290, f1273, 0fBEAF1D44; +mul.f32 f2406, f1267, 0fBF708FB2; +sub.f32 f1291, f2406, f1290; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0fBEAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2405, f2417, f2412; +sub.f32 f1300, f2417, f2412; +mul.f32 f1301, f1300, 0f3F5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2404, f2422, f2405; +mul.f32 f1304, f2405, 0f3F000000; +sub.f32 f1305, f2422, f1304; +sub.f32 
f1306, f1243, f1259; +mul.f32 f1307, f1306, 0f3F5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2403, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0f3F5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2402, f1240, f2403; +mul.f32 f1320, f2403, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0f3F5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2401, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0f3F5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2400, f1241, f2401; +mul.f32 f1336, f2401, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0f3F5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2429, 0fBE6C2691; +mul.f32 f2399, f1195, 0f3F791978; +sub.f32 f1344, f2399, f1343; +mul.f32 f1345, f2429, 0f3F791978; +fma.rn.f32 f1346, f1195, 0fBE6C2691, f1345; +mul.f32 f2397, f1311, 0f3F64C51C; +mul.f32 f2398, f2402, 0fBEE5C902; +sub.f32 f1349, f2397, f2398; +mul.f32 f1350, f2402, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0fBEE5C902, f1350; +mul.f32 f2395, f1211, 0f3F64C51C; +mul.f32 f2396, f2427, 0fBEE5C902; +sub.f32 f1354, f2395, f2396; +mul.f32 f1355, f2427, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0fBEE5C902, f1355; +mul.f32 f2393, f1327, 0f3F18DF63; +mul.f32 f2394, f2400, 0fBF4D57F2; +sub.f32 f1359, f2393, f2394; +mul.f32 f1360, f2400, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0fBF4D57F2, f1360; +mul.f32 f2391, f1186, 0f3F441B7D; +mul.f32 f2392, f1192, 0fBF248DBB; +sub.f32 f1364, f2391, f2392; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 
0fBF248DBB, f1365; +mul.f32 f1368, f1308, 0fBF7C1C5C; +mul.f32 f2390, f1302, 0f3E31D0D4; +sub.f32 f1369, f2390, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0fBF7C1C5C, f1370; +mul.f32 f1373, f1208, 0fBF4D57F2; +mul.f32 f2389, f1202, 0f3F18DF63; +sub.f32 f1374, f2389, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0fBF4D57F2, f1375; +mul.f32 f1378, f1324, 0fBF753ECD; +mul.f32 f2388, f1318, 0fBE92D7E0; +sub.f32 f1379, f2388, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0fBF753ECD, f1380; +mul.f32 f1383, f1224, 0fBF6B1036; +mul.f32 f2387, f1218, 0f3ECACAF8; +sub.f32 f1384, f2387, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0fBF6B1036, f1385; +mul.f32 f1388, f1340, 0fBF3A3529; +mul.f32 f2386, f1334, 0fBF2FAD88; +sub.f32 f1389, f2386, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0fBF3A3529, f1390; +mul.f32 f1393, f1193, 0fBF7C1C5C; +mul.f32 f2385, f1187, 0f3E31D0D4; +sub.f32 f1394, f2385, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0fBF7C1C5C, f1395; +mul.f32 f2383, f1303, 0fBF708FB2; +mul.f32 f2384, f1309, 0fBEAF1D44; +sub.f32 f1399, f2383, f2384; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0fBEAF1D44, f1400; +mul.f32 f2381, f1203, 0fBD6E2946; +mul.f32 f2382, f1209, 0fBF7F9120; +sub.f32 f1404, f2381, f2382; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0fBF7F9120, f1405; +mul.f32 f2379, f1319, 0fBF7E44DE; +mul.f32 f2380, f1325, 0f3DEDC21F; +sub.f32 f1409, f2379, f2380; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0f3DEDC21F, f1410; +mul.f32 f1413, f1225, 0fBF753ECD; +mul.f32 f2378, f1219, 0fBE92D7E0; +sub.f32 f1414, f2378, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0fBF753ECD, f1415; +mul.f32 f1418, f1341, 0f3F0CAC9F; +mul.f32 f2377, f1335, 0fBF55E287; +sub.f32 f1419, f2377, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0f3F0CAC9F, f1420; 
+add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2376, f2431, f2404; +sub.f32 f1428, f2431, f2404; +mul.f32 f1429, f1428, 0f3F5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2375, f2458, f2376; +mul.f32 f1432, f2376, 0f3F000000; +sub.f32 f1433, f2458, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 0f3F5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2374, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0f3F5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2373, f2456, f2374; +mul.f32 f1448, f2374, 0f3F000000; +sub.f32 f1449, f2456, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0f3F5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2372, f1356, f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0f3F5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2371, f2454, f2372; +mul.f32 f1464, f2372, 0f3F000000; +sub.f32 f1465, f2454, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0f3F5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; +add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2370, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0f3F5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, f1477; +add.f32 f2369, f1076, f2370; +mul.f32 f1480, f2370, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0f3F5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; 
+add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2368, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0f3F5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2367, f1092, f2368; +mul.f32 f1496, f2368, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 0f3F5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2366, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0f3F5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2365, f1108, f2366; +mul.f32 f1512, f2366, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0f3F5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2364, f1396, f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0f3F5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2363, f1077, f2364; +mul.f32 f1528, f2364, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0f3F5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; +add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2362, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0f3F5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, f1541; +add.f32 f2361, f1093, f2362; +mul.f32 f1544, f2362, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0f3F5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; 
+add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2360, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0f3F5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2359, f1109, f2360; +mul.f32 f1560, f2360, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 0f3F5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f2357, f1566, f1439; +mul.f32 f2358, f1567, f2373; +sub.f32 f1572, f2357, f2358; +mul.f32 f1573, f1566, f2373; +fma.rn.f32 f1574, f1567, f1439, f1573; +mul.f32 f2355, f1566, f1566; +mul.f32 f2356, f1567, f1567; +sub.f32 f1577, f2355, f2356; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, f1566, f1578; +mul.f32 f2353, f1577, f1455; +mul.f32 f2354, f1579, f2371; +sub.f32 f1582, f2353, f2354; +mul.f32 f1583, f1577, f2371; +fma.rn.f32 f1584, f1579, f1455, f1583; +mul.f32 f1586, f1567, f1579; +mul.f32 f2352, f1566, f1577; +sub.f32 f1587, f2352, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; +mul.f32 f1591, f1589, f2369; +mul.f32 f2351, f1587, f1471; +sub.f32 f1592, f2351, f1591; +mul.f32 f1593, f1587, f2369; +fma.rn.f32 f1594, f1589, f1471, f1593; +mul.f32 f1596, f1567, f1589; +mul.f32 f2350, f1566, f1587; +sub.f32 f1597, f2350, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 f1599, f1567, f1587, f1598; +mul.f32 f1601, f1599, f2367; +mul.f32 f2349, f1597, f1487; +sub.f32 f1602, f2349, f1601; +mul.f32 f1603, f1597, f2367; +fma.rn.f32 f1604, f1599, f1487, f1603; +mul.f32 f1606, f1567, f1599; +mul.f32 
f2348, f1566, f1597; +sub.f32 f1607, f2348, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f2346, f1607, f1503; +mul.f32 f2347, f1609, f2365; +sub.f32 f1612, f2346, f2347; +mul.f32 f1613, f1607, f2365; +fma.rn.f32 f1614, f1609, f1503, f1613; +mul.f32 f2344, f1566, f1607; +mul.f32 f2345, f1567, f1609; +sub.f32 f1617, f2344, f2345; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, f1607, f1618; +mul.f32 f2342, f1617, f1519; +mul.f32 f2343, f1619, f2363; +sub.f32 f1622, f2342, f2343; +mul.f32 f1623, f1617, f2363; +fma.rn.f32 f1624, f1619, f1519, f1623; +mul.f32 f2340, f1566, f1617; +mul.f32 f2341, f1567, f1619; +sub.f32 f1627, f2340, f2341; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1631, f1629, f2361; +mul.f32 f2339, f1627, f1535; +sub.f32 f1632, f2339, f1631; +mul.f32 f1633, f1627, f2361; +fma.rn.f32 f1634, f1629, f1535, f1633; +mul.f32 f1636, f1567, f1629; +mul.f32 f2338, f1566, f1627; +sub.f32 f1637, f2338, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1641, f1639, f2359; +mul.f32 f2337, f1637, f1551; +sub.f32 f1642, f2337, f1641; +mul.f32 f1643, f1637, f2359; +fma.rn.f32 f1644, f1639, f1551, f1643; +mul.f32 f1646, f1567, f1639; +mul.f32 f2336, f1566, f1637; +sub.f32 f1647, f2336, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1651, f1649, f1436; +mul.f32 f2335, f1647, f1430; +sub.f32 f1652, f2335, f1651; +mul.f32 f1653, f1647, f1436; +fma.rn.f32 f1654, f1649, f1430, f1653; +mul.f32 f2333, f1566, f1647; +mul.f32 f2334, f1567, f1649; +sub.f32 f1657, f2333, f2334; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f2331, f1657, f1446; +mul.f32 f2332, f1659, f1452; +sub.f32 f1662, f2331, f2332; +mul.f32 f1663, f1657, f1452; +fma.rn.f32 f1664, f1659, f1446, f1663; +mul.f32 f2329, f1566, f1657; +mul.f32 f2330, f1567, f1659; +sub.f32 f1667, f2329, f2330; +mul.f32 f1668, f1566, 
f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f2327, f1667, f1462; +mul.f32 f2328, f1669, f1468; +sub.f32 f1672, f2327, f2328; +mul.f32 f1673, f1667, f1468; +fma.rn.f32 f1674, f1669, f1462, f1673; +mul.f32 f1676, f1567, f1669; +mul.f32 f2326, f1566, f1667; +sub.f32 f1677, f2326, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1681, f1679, f1484; +mul.f32 f2325, f1677, f1478; +sub.f32 f1682, f2325, f1681; +mul.f32 f1683, f1677, f1484; +fma.rn.f32 f1684, f1679, f1478, f1683; +mul.f32 f1686, f1567, f1679; +mul.f32 f2324, f1566, f1677; +sub.f32 f1687, f2324, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1691, f1689, f1500; +mul.f32 f2323, f1687, f1494; +sub.f32 f1692, f2323, f1691; +mul.f32 f1693, f1687, f1500; +fma.rn.f32 f1694, f1689, f1494, f1693; +mul.f32 f1696, f1567, f1689; +mul.f32 f2322, f1566, f1687; +sub.f32 f1697, f2322, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1701, f1699, f1516; +mul.f32 f2321, f1697, f1510; +sub.f32 f1702, f2321, f1701; +mul.f32 f1703, f1697, f1516; +fma.rn.f32 f1704, f1699, f1510, f1703; +mul.f32 f2319, f1566, f1697; +mul.f32 f2320, f1567, f1699; +sub.f32 f1707, f2319, f2320; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f2317, f1707, f1526; +mul.f32 f2318, f1709, f1532; +sub.f32 f1712, f2317, f2318; +mul.f32 f1713, f1707, f1532; +fma.rn.f32 f1714, f1709, f1526, f1713; +mul.f32 f2315, f1566, f1707; +mul.f32 f2316, f1567, f1709; +sub.f32 f1717, f2315, f2316; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1721, f1719, f1548; +mul.f32 f2314, f1717, f1542; +sub.f32 f1722, f2314, f1721; +mul.f32 f1723, f1717, f1548; +fma.rn.f32 f1724, f1719, f1542, f1723; +mul.f32 f1726, f1567, f1719; +mul.f32 f2313, f1566, f1717; +sub.f32 f1727, f2313, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1731, f1729, f1564; 
+mul.f32 f2312, f1727, f1558; +sub.f32 f1732, f2312, f1731; +mul.f32 f1733, f1727, f1564; +fma.rn.f32 f1734, f1729, f1558, f1733; +mul.f32 f1736, f1567, f1729; +mul.f32 f2311, f1566, f1727; +sub.f32 f1737, f2311, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1741, f1739, f1437; +mul.f32 f2310, f1737, f1431; +sub.f32 f1742, f2310, f1741; +mul.f32 f1743, f1737, f1437; +fma.rn.f32 f1744, f1739, f1431, f1743; +mul.f32 f1746, f1567, f1739; +mul.f32 f2309, f1566, f1737; +sub.f32 f1747, f2309, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f2307, f1747, f1447; +mul.f32 f2308, f1749, f1453; +sub.f32 f1752, f2307, f2308; +mul.f32 f1753, f1747, f1453; +fma.rn.f32 f1754, f1749, f1447, f1753; +mul.f32 f2305, f1566, f1747; +mul.f32 f2306, f1567, f1749; +sub.f32 f1757, f2305, f2306; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f2303, f1757, f1463; +mul.f32 f2304, f1759, f1469; +sub.f32 f1762, f2303, f2304; +mul.f32 f1763, f1757, f1469; +fma.rn.f32 f1764, f1759, f1463, f1763; +mul.f32 f2301, f1566, f1757; +mul.f32 f2302, f1567, f1759; +sub.f32 f1767, f2301, f2302; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1771, f1769, f1485; +mul.f32 f2300, f1767, f1479; +sub.f32 f1772, f2300, f1771; +mul.f32 f1773, f1767, f1485; +fma.rn.f32 f1774, f1769, f1479, f1773; +mul.f32 f1776, f1567, f1769; +mul.f32 f2299, f1566, f1767; +sub.f32 f1777, f2299, f1776; +mul.f32 f1778, f1566, f1769; +fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1781, f1779, f1501; +mul.f32 f2298, f1777, f1495; +sub.f32 f1782, f2298, f1781; +mul.f32 f1783, f1777, f1501; +fma.rn.f32 f1784, f1779, f1495, f1783; +mul.f32 f1786, f1567, f1779; +mul.f32 f2297, f1566, f1777; +sub.f32 f1787, f2297, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1791, f1789, f1517; +mul.f32 f2296, f1787, f1511; +sub.f32 f1792, f2296, f1791; +mul.f32 f1793, 
f1787, f1517; +fma.rn.f32 f1794, f1789, f1511, f1793; +mul.f32 f2294, f1566, f1787; +mul.f32 f2295, f1567, f1789; +sub.f32 f1797, f2294, f2295; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f2292, f1797, f1527; +mul.f32 f2293, f1799, f1533; +sub.f32 f1802, f2292, f2293; +mul.f32 f1803, f1797, f1533; +fma.rn.f32 f1804, f1799, f1527, f1803; +mul.f32 f2290, f1566, f1797; +mul.f32 f2291, f1567, f1799; +sub.f32 f1807, f2290, f2291; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f2288, f1807, f1543; +mul.f32 f2289, f1809, f1549; +sub.f32 f1812, f2288, f2289; +mul.f32 f1813, f1807, f1549; +fma.rn.f32 f1814, f1809, f1543, f1813; +mul.f32 f1816, f1567, f1809; +mul.f32 f2287, f1566, f1807; +sub.f32 f1817, f2287, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1821, f1819, f1565; +mul.f32 f2286, f1817, f1559; +sub.f32 f1822, f2286, f1821; +mul.f32 f1823, f1817, f1565; +fma.rn.f32 f1824, f1819, f1559, f1823; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; +st.shared.f32 [r20+108], f1572; +st.shared.f32 [r20+216], f1582; +st.shared.f32 [r20+324], f1592; +st.shared.f32 [r20+432], f1602; +st.shared.f32 [r20+540], f1612; +st.shared.f32 [r20+648], f1622; +st.shared.f32 [r20+756], f1632; +st.shared.f32 [r20+864], f1642; +st.shared.f32 [r20+972], f1652; +st.shared.f32 [r20+1080], f1662; +st.shared.f32 [r20+1188], f1672; +st.shared.f32 [r20+1296], f1682; +st.shared.f32 [r20+1404], f1692; +st.shared.f32 [r20+1512], f1702; +st.shared.f32 [r20+1620], f1712; +st.shared.f32 [r20+1728], f1722; +st.shared.f32 [r20+1836], f1732; +st.shared.f32 [r20+1944], f1742; +st.shared.f32 [r20+2052], f1752; +st.shared.f32 [r20+2160], f1762; +st.shared.f32 [r20+2268], f1772; +st.shared.f32 [r20+2376], f1782; +st.shared.f32 [r20+2484], f1792; +st.shared.f32 [r20+2592], f1802; +st.shared.f32 [r20+2700], f1812; +st.shared.f32 
[r20+2808], f1822; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+972]; +ld.shared.f32 f1827, [r10+1944]; +ld.shared.f32 f1828, [r10+2916]; +ld.shared.f32 f1829, [r10+3888]; +ld.shared.f32 f1830, [r10+4860]; +ld.shared.f32 f1831, [r10+5832]; +ld.shared.f32 f1832, [r10+6804]; +ld.shared.f32 f1833, [r10+7776]; +ld.shared.f32 f1834, [r10+8748]; +ld.shared.f32 f1835, [r10+9720]; +ld.shared.f32 f1836, [r10+10692]; +ld.shared.f32 f1837, [r10+11664]; +ld.shared.f32 f1838, [r10+12636]; +ld.shared.f32 f1839, [r10+13608]; +ld.shared.f32 f1840, [r10+14580]; +ld.shared.f32 f1841, [r10+15552]; +ld.shared.f32 f1842, [r10+16524]; +ld.shared.f32 f1843, [r10+17496]; +ld.shared.f32 f1844, [r10+18468]; +ld.shared.f32 f1845, [r10+19440]; +ld.shared.f32 f1846, [r10+20412]; +ld.shared.f32 f1847, [r10+21384]; +ld.shared.f32 f1848, [r10+22356]; +ld.shared.f32 f1849, [r10+23328]; +ld.shared.f32 f1850, [r10+24300]; +ld.shared.f32 f1851, [r10+25272]; +barrier.sync 0; +st.shared.f32 [r20], f2375; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; +st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; +st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+972]; +ld.shared.f32 f1854, [r10+1944]; 
+ld.shared.f32 f1855, [r10+2916]; +ld.shared.f32 f1856, [r10+3888]; +ld.shared.f32 f1857, [r10+4860]; +ld.shared.f32 f1858, [r10+5832]; +ld.shared.f32 f1859, [r10+6804]; +ld.shared.f32 f1860, [r10+7776]; +ld.shared.f32 f1861, [r10+8748]; +ld.shared.f32 f1862, [r10+9720]; +ld.shared.f32 f1863, [r10+10692]; +ld.shared.f32 f1864, [r10+11664]; +ld.shared.f32 f1865, [r10+12636]; +ld.shared.f32 f1866, [r10+13608]; +ld.shared.f32 f1867, [r10+14580]; +ld.shared.f32 f1868, [r10+15552]; +ld.shared.f32 f1869, [r10+16524]; +ld.shared.f32 f1870, [r10+17496]; +ld.shared.f32 f1871, [r10+18468]; +ld.shared.f32 f1872, [r10+19440]; +ld.shared.f32 f1873, [r10+20412]; +ld.shared.f32 f1874, [r10+21384]; +ld.shared.f32 f1875, [r10+22356]; +ld.shared.f32 f1876, [r10+23328]; +ld.shared.f32 f1877, [r10+24300]; +ld.shared.f32 f1878, [r10+25272]; +add.f32 f1879, f1834, f1843; +add.f32 f1880, f1825, f1879; +mul.f32 f1883, f1879, 0f3F000000; +sub.f32 f1884, f1825, f1883; +add.f32 f2285, f1861, f1870; +sub.f32 f1885, f1861, f1870; +mul.f32 f1886, f1885, 0f3F5DB3D7; +add.f32 f1887, f1886, f1884; +sub.f32 f1888, f1884, f1886; +add.f32 f2284, f1852, f2285; +mul.f32 f1889, f2285, 0f3F000000; +sub.f32 f1890, f1852, f1889; +sub.f32 f1891, f1834, f1843; +mul.f32 f1892, f1891, 0f3F5DB3D7; +sub.f32 f1893, f1890, f1892; +add.f32 f1894, f1892, f1890; +add.f32 f1895, f1837, f1846; +add.f32 f1896, f1828, f1895; +mul.f32 f1899, f1895, 0f3F000000; +sub.f32 f1900, f1828, f1899; +add.f32 f2283, f1864, f1873; +sub.f32 f1901, f1864, f1873; +mul.f32 f1902, f1901, 0f3F5DB3D7; +add.f32 f1903, f1902, f1900; +sub.f32 f1904, f1900, f1902; +add.f32 f2282, f1855, f2283; +mul.f32 f1905, f2283, 0f3F000000; +sub.f32 f1906, f1855, f1905; +sub.f32 f1907, f1837, f1846; +mul.f32 f1908, f1907, 0f3F5DB3D7; +sub.f32 f1909, f1906, f1908; +add.f32 f1910, f1908, f1906; +add.f32 f1911, f1840, f1849; +add.f32 f1912, f1831, f1911; +mul.f32 f1915, f1911, 0f3F000000; +sub.f32 f1916, f1831, f1915; +add.f32 f2281, f1867, f1876; +sub.f32 
f1917, f1867, f1876; +mul.f32 f1918, f1917, 0f3F5DB3D7; +add.f32 f1919, f1918, f1916; +sub.f32 f1920, f1916, f1918; +add.f32 f2280, f1858, f2281; +mul.f32 f1921, f2281, 0f3F000000; +sub.f32 f1922, f1858, f1921; +sub.f32 f1923, f1840, f1849; +mul.f32 f1924, f1923, 0f3F5DB3D7; +sub.f32 f1925, f1922, f1924; +add.f32 f1926, f1924, f1922; +mul.f32 f2278, f1903, 0f3F441B7D; +mul.f32 f2279, f1909, 0fBF248DBB; +sub.f32 f1929, f2278, f2279; +mul.f32 f1930, f1909, 0f3F441B7D; +fma.rn.f32 f1931, f1903, 0fBF248DBB, f1930; +mul.f32 f2276, f1919, 0f3E31D0D4; +mul.f32 f2277, f1925, 0fBF7C1C5C; +sub.f32 f1934, f2276, f2277; +mul.f32 f1935, f1925, 0f3E31D0D4; +fma.rn.f32 f1936, f1919, 0fBF7C1C5C, f1935; +mul.f32 f2274, f1904, 0f3E31D0D4; +mul.f32 f2275, f1910, 0fBF7C1C5C; +sub.f32 f1939, f2274, f2275; +mul.f32 f1940, f1910, 0f3E31D0D4; +fma.rn.f32 f1941, f1904, 0fBF7C1C5C, f1940; +mul.f32 f2272, f1920, 0fBF708FB2; +mul.f32 f2273, f1926, 0fBEAF1D44; +sub.f32 f1944, f2272, f2273; +mul.f32 f1945, f1926, 0fBF708FB2; +fma.rn.f32 f1946, f1920, 0fBEAF1D44, f1945; +add.f32 f1947, f1896, f1912; +mul.f32 f1949, f1947, 0f3F000000; +sub.f32 f1950, f1880, f1949; +add.f32 f2271, f2282, f2280; +sub.f32 f1951, f2282, f2280; +mul.f32 f1952, f1951, 0f3F5DB3D7; +mul.f32 f1953, f2271, 0f3F000000; +sub.f32 f1954, f2284, f1953; +sub.f32 f1955, f1896, f1912; +mul.f32 f1956, f1955, 0f3F5DB3D7; +add.f32 f1957, f1929, f1934; +mul.f32 f1959, f1957, 0f3F000000; +sub.f32 f1960, f1887, f1959; +add.f32 f2270, f1931, f1936; +sub.f32 f1961, f1931, f1936; +mul.f32 f1962, f1961, 0f3F5DB3D7; +mul.f32 f1963, f2270, 0f3F000000; +sub.f32 f1964, f1893, f1963; +sub.f32 f1965, f1929, f1934; +mul.f32 f1966, f1965, 0f3F5DB3D7; +add.f32 f1967, f1939, f1944; +mul.f32 f1969, f1967, 0f3F000000; +sub.f32 f1970, f1888, f1969; +add.f32 f2269, f1941, f1946; +sub.f32 f1971, f1941, f1946; +mul.f32 f1972, f1971, 0f3F5DB3D7; +mul.f32 f1973, f2269, 0f3F000000; +sub.f32 f1974, f1894, f1973; +sub.f32 f1975, f1939, f1944; +mul.f32 f1976, 
f1975, 0f3F5DB3D7; +add.f32 f1977, f1835, f1844; +add.f32 f1978, f1826, f1977; +mul.f32 f1981, f1977, 0f3F000000; +sub.f32 f1982, f1826, f1981; +add.f32 f2268, f1862, f1871; +sub.f32 f1983, f1862, f1871; +mul.f32 f1984, f1983, 0f3F5DB3D7; +add.f32 f1985, f1984, f1982; +sub.f32 f1986, f1982, f1984; +add.f32 f2267, f1853, f2268; +mul.f32 f1987, f2268, 0f3F000000; +sub.f32 f1988, f1853, f1987; +sub.f32 f1989, f1835, f1844; +mul.f32 f1990, f1989, 0f3F5DB3D7; +sub.f32 f1991, f1988, f1990; +add.f32 f1992, f1990, f1988; +add.f32 f1993, f1838, f1847; +add.f32 f1994, f1829, f1993; +mul.f32 f1997, f1993, 0f3F000000; +sub.f32 f1998, f1829, f1997; +add.f32 f2266, f1865, f1874; +sub.f32 f1999, f1865, f1874; +mul.f32 f2000, f1999, 0f3F5DB3D7; +add.f32 f2001, f2000, f1998; +sub.f32 f2002, f1998, f2000; +add.f32 f2265, f1856, f2266; +mul.f32 f2003, f2266, 0f3F000000; +sub.f32 f2004, f1856, f2003; +sub.f32 f2005, f1838, f1847; +mul.f32 f2006, f2005, 0f3F5DB3D7; +sub.f32 f2007, f2004, f2006; +add.f32 f2008, f2006, f2004; +add.f32 f2009, f1841, f1850; +add.f32 f2010, f1832, f2009; +mul.f32 f2013, f2009, 0f3F000000; +sub.f32 f2014, f1832, f2013; +add.f32 f2264, f1868, f1877; +sub.f32 f2015, f1868, f1877; +mul.f32 f2016, f2015, 0f3F5DB3D7; +add.f32 f2017, f2016, f2014; +sub.f32 f2018, f2014, f2016; +add.f32 f2263, f1859, f2264; +mul.f32 f2019, f2264, 0f3F000000; +sub.f32 f2020, f1859, f2019; +sub.f32 f2021, f1841, f1850; +mul.f32 f2022, f2021, 0f3F5DB3D7; +sub.f32 f2023, f2020, f2022; +add.f32 f2024, f2022, f2020; +mul.f32 f2261, f2001, 0f3F441B7D; +mul.f32 f2262, f2007, 0fBF248DBB; +sub.f32 f2027, f2261, f2262; +mul.f32 f2028, f2007, 0f3F441B7D; +fma.rn.f32 f2029, f2001, 0fBF248DBB, f2028; +mul.f32 f2031, f2023, 0fBF7C1C5C; +mul.f32 f2260, f2017, 0f3E31D0D4; +sub.f32 f2032, f2260, f2031; +mul.f32 f2033, f2023, 0f3E31D0D4; +fma.rn.f32 f2034, f2017, 0fBF7C1C5C, f2033; +mul.f32 f2036, f2008, 0fBF7C1C5C; +mul.f32 f2259, f2002, 0f3E31D0D4; +sub.f32 f2037, f2259, f2036; +mul.f32 f2038, 
f2008, 0f3E31D0D4; +fma.rn.f32 f2039, f2002, 0fBF7C1C5C, f2038; +mul.f32 f2041, f2024, 0fBEAF1D44; +mul.f32 f2258, f2018, 0fBF708FB2; +sub.f32 f2042, f2258, f2041; +mul.f32 f2043, f2024, 0fBF708FB2; +fma.rn.f32 f2044, f2018, 0fBEAF1D44, f2043; +add.f32 f2045, f1994, f2010; +mul.f32 f2047, f2045, 0f3F000000; +sub.f32 f2048, f1978, f2047; +add.f32 f2257, f2265, f2263; +sub.f32 f2049, f2265, f2263; +mul.f32 f2050, f2049, 0f3F5DB3D7; +mul.f32 f2051, f2257, 0f3F000000; +sub.f32 f2052, f2267, f2051; +sub.f32 f2053, f1994, f2010; +mul.f32 f2054, f2053, 0f3F5DB3D7; +add.f32 f2055, f2027, f2032; +mul.f32 f2057, f2055, 0f3F000000; +sub.f32 f2058, f1985, f2057; +add.f32 f2256, f2029, f2034; +sub.f32 f2059, f2029, f2034; +mul.f32 f2060, f2059, 0f3F5DB3D7; +mul.f32 f2061, f2256, 0f3F000000; +sub.f32 f2062, f1991, f2061; +sub.f32 f2063, f2027, f2032; +mul.f32 f2064, f2063, 0f3F5DB3D7; +add.f32 f2065, f2037, f2042; +mul.f32 f2067, f2065, 0f3F000000; +sub.f32 f2068, f1986, f2067; +add.f32 f2255, f2039, f2044; +sub.f32 f2069, f2039, f2044; +mul.f32 f2070, f2069, 0f3F5DB3D7; +mul.f32 f2071, f2255, 0f3F000000; +sub.f32 f2072, f1992, f2071; +sub.f32 f2073, f2037, f2042; +mul.f32 f2074, f2073, 0f3F5DB3D7; +add.f32 f2075, f1836, f1845; +add.f32 f2076, f1827, f2075; +mul.f32 f2079, f2075, 0f3F000000; +sub.f32 f2080, f1827, f2079; +add.f32 f2254, f1863, f1872; +sub.f32 f2081, f1863, f1872; +mul.f32 f2082, f2081, 0f3F5DB3D7; +add.f32 f2083, f2082, f2080; +sub.f32 f2084, f2080, f2082; +add.f32 f2253, f1854, f2254; +mul.f32 f2085, f2254, 0f3F000000; +sub.f32 f2086, f1854, f2085; +sub.f32 f2087, f1836, f1845; +mul.f32 f2088, f2087, 0f3F5DB3D7; +sub.f32 f2089, f2086, f2088; +add.f32 f2090, f2088, f2086; +add.f32 f2091, f1839, f1848; +add.f32 f2092, f1830, f2091; +mul.f32 f2095, f2091, 0f3F000000; +sub.f32 f2096, f1830, f2095; +add.f32 f2252, f1866, f1875; +sub.f32 f2097, f1866, f1875; +mul.f32 f2098, f2097, 0f3F5DB3D7; +add.f32 f2099, f2098, f2096; +sub.f32 f2100, f2096, f2098; +add.f32 f2251, 
f1857, f2252; +mul.f32 f2101, f2252, 0f3F000000; +sub.f32 f2102, f1857, f2101; +sub.f32 f2103, f1839, f1848; +mul.f32 f2104, f2103, 0f3F5DB3D7; +sub.f32 f2105, f2102, f2104; +add.f32 f2106, f2104, f2102; +add.f32 f2107, f1842, f1851; +add.f32 f2108, f1833, f2107; +mul.f32 f2111, f2107, 0f3F000000; +sub.f32 f2112, f1833, f2111; +add.f32 f2250, f1869, f1878; +sub.f32 f2113, f1869, f1878; +mul.f32 f2114, f2113, 0f3F5DB3D7; +add.f32 f2115, f2114, f2112; +sub.f32 f2116, f2112, f2114; +add.f32 f2249, f1860, f2250; +mul.f32 f2117, f2250, 0f3F000000; +sub.f32 f2118, f1860, f2117; +sub.f32 f2119, f1842, f1851; +mul.f32 f2120, f2119, 0f3F5DB3D7; +sub.f32 f2121, f2118, f2120; +add.f32 f2122, f2120, f2118; +mul.f32 f2124, f2105, 0fBF248DBB; +mul.f32 f2248, f2099, 0f3F441B7D; +sub.f32 f2125, f2248, f2124; +mul.f32 f2126, f2105, 0f3F441B7D; +fma.rn.f32 f2127, f2099, 0fBF248DBB, f2126; +mul.f32 f2129, f2121, 0fBF7C1C5C; +mul.f32 f2247, f2115, 0f3E31D0D4; +sub.f32 f2130, f2247, f2129; +mul.f32 f2131, f2121, 0f3E31D0D4; +fma.rn.f32 f2132, f2115, 0fBF7C1C5C, f2131; +mul.f32 f2134, f2106, 0fBF7C1C5C; +mul.f32 f2246, f2100, 0f3E31D0D4; +sub.f32 f2135, f2246, f2134; +mul.f32 f2136, f2106, 0f3E31D0D4; +fma.rn.f32 f2137, f2100, 0fBF7C1C5C, f2136; +mul.f32 f2244, f2116, 0fBF708FB2; +mul.f32 f2245, f2122, 0fBEAF1D44; +sub.f32 f2140, f2244, f2245; +mul.f32 f2141, f2122, 0fBF708FB2; +fma.rn.f32 f2142, f2116, 0fBEAF1D44, f2141; +add.f32 f2143, f2092, f2108; +mul.f32 f2145, f2143, 0f3F000000; +sub.f32 f2146, f2076, f2145; +add.f32 f2243, f2251, f2249; +sub.f32 f2147, f2251, f2249; +mul.f32 f2148, f2147, 0f3F5DB3D7; +mul.f32 f2149, f2243, 0f3F000000; +sub.f32 f2150, f2253, f2149; +sub.f32 f2151, f2092, f2108; +mul.f32 f2152, f2151, 0f3F5DB3D7; +add.f32 f2153, f2125, f2130; +mul.f32 f2155, f2153, 0f3F000000; +sub.f32 f2156, f2083, f2155; +add.f32 f2242, f2127, f2132; +sub.f32 f2157, f2127, f2132; +mul.f32 f2158, f2157, 0f3F5DB3D7; +mul.f32 f2159, f2242, 0f3F000000; +sub.f32 f2160, f2089, f2159; 
+sub.f32 f2161, f2125, f2130; +mul.f32 f2162, f2161, 0f3F5DB3D7; +add.f32 f2163, f2135, f2140; +mul.f32 f2165, f2163, 0f3F000000; +sub.f32 f2166, f2084, f2165; +add.f32 f2241, f2137, f2142; +sub.f32 f2167, f2137, f2142; +mul.f32 f2168, f2167, 0f3F5DB3D7; +mul.f32 f2169, f2241, 0f3F000000; +sub.f32 f2170, f2090, f2169; +sub.f32 f2171, f2135, f2140; +mul.f32 f2664, f2143, 0f3F000000; +sub.f32 f2663, f2076, f2664; +mul.f32 f2172, f2171, 0f3F5DB3D7; +add.f32 %0, f1880, f1947; +mul.f32 f2666, f2242, 0f3F000000; +sub.f32 f2665, f2089, f2666; +add.f32 %1, f2284, f2271; +mul.f32 f2668, f1967, 0f3F000000; +sub.f32 f2667, f1888, f2668; +mul.f32 f2670, f2243, 0f3F000000; +sub.f32 f2669, f2253, f2670; +add.f32 %2, f1978, f2045; +add.f32 %3, f2267, f2257; +add.f32 %4, f2076, f2143; +add.f32 %5, f2253, f2243; +add.f32 %7, f1893, f2270; +add.f32 %6, f1887, f1957; +add.f32 %9, f1991, f2256; +add.f32 %8, f1985, f2055; +add.f32 %11, f2089, f2242; +add.f32 %10, f2083, f2153; +add.f32 %13, f1894, f2269; +add.f32 %12, f1888, f1967; +add.f32 %15, f1992, f2255; +add.f32 %14, f1986, f2065; +add.f32 %17, f2090, f2241; +add.f32 %16, f2084, f2163; +add.f32 %18, f1952, f1950; +sub.f32 %19, f1954, f1956; +add.f32 %20, f2050, f2048; +sub.f32 %21, f2052, f2054; +add.f32 %22, f2148, f2663; +sub.f32 %23, f2669, f2152; +add.f32 %24, f1962, f1960; +sub.f32 %25, f1964, f1966; +add.f32 %26, f2060, f2058; +sub.f32 %27, f2062, f2064; +add.f32 %28, f2158, f2156; +sub.f32 %29, f2665, f2162; +add.f32 %30, f1972, f2667; +sub.f32 %31, f1974, f1976; +sub.f32 %33, f2072, f2074; +add.f32 %32, f2070, f2068; +sub.f32 %35, f2170, f2172; +add.f32 %34, f2168, f2166; +sub.f32 %36, f1950, f1952; +add.f32 %37, f1956, f1954; +sub.f32 %38, f2048, f2050; +add.f32 %39, f2054, f2052; +sub.f32 %40, f2663, f2148; +add.f32 %41, f2152, f2669; +add.f32 %43, f1966, f1964; +sub.f32 %42, f1960, f1962; +add.f32 %45, f2064, f2062; +sub.f32 %44, f2058, f2060; +add.f32 %47, f2162, f2665; +sub.f32 %46, f2156, f2158; +add.f32 %49, f1976, 
f1974; +sub.f32 %48, f2667, f1972; +add.f32 %51, f2074, f2072; +sub.f32 %50, f2068, f2070; +add.f32 %53, f2172, f2170; +sub.f32 %52, f2166, f2168; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_6561), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), 
"f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<153, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<846>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 52488, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 
f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 52488, r3; +mul.wide.u32 rd4, 
r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r9+8], {f217, f216}; +fma.rn.f32 
f218, f162, f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r9+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r9+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r9+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r9+40], {f224, f225}; +fma.rn.f32 f226, f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r9+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r9+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; +st.shared.v2.f32 [r9+64], {f231, f230}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+5832]; +ld.shared.v2.f32 {f240, f241}, [r11+11664]; +ld.shared.v2.f32 {f244, f245}, [r11+17496]; +ld.shared.v2.f32 {f248, f249}, [r11+23328]; +ld.shared.v2.f32 {f252, f253}, [r11+29160]; +ld.shared.v2.f32 {f256, f257}, [r11+34992]; +ld.shared.v2.f32 {f260, f261}, [r11+40824]; +ld.shared.v2.f32 {f264, f265}, [r11+46656]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0f3F5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0f3F5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0f3F5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; 
+mul.f32 f297, f296, 0f3F5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0f3F5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0f3F5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0fBF248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0fBF248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0fBF7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0fBF7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0fBF7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0fBF7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0fBEAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0fBEAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 
f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0f3F5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0f3F5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f382, f351; +mul.f32 f387, f383, f353; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f391, f367; +mul.f32 f395, f393, f369; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f399, f342; +mul.f32 f403, f401, f348; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f415, f374; +mul.f32 f419, f417, f380; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f423, f343; +mul.f32 f427, f425, f349; +mul.f32 f428, f423, f349; +mul.f32 f429, 
f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f431, f359; +mul.f32 f435, f433, f365; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f439, f375; +mul.f32 f443, f441, f381; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f383, f351, f388; +sub.f32 f448, f386, f387; +st.shared.v2.f32 [r17+72], {f448, f447}; +fma.rn.f32 f449, f393, f367, f396; +sub.f32 f450, f394, f395; +st.shared.v2.f32 [r17+144], {f450, f449}; +fma.rn.f32 f451, f401, f342, f404; +sub.f32 f452, f402, f403; +st.shared.v2.f32 [r17+216], {f452, f451}; +fma.rn.f32 f453, f409, f358, f412; +sub.f32 f454, f410, f411; +st.shared.v2.f32 [r17+288], {f454, f453}; +fma.rn.f32 f455, f417, f374, f420; +sub.f32 f456, f418, f419; +st.shared.v2.f32 [r17+360], {f456, f455}; +fma.rn.f32 f457, f425, f343, f428; +sub.f32 f458, f426, f427; +st.shared.v2.f32 [r17+432], {f458, f457}; +sub.f32 f459, f434, f435; +fma.rn.f32 f460, f433, f359, f436; +st.shared.v2.f32 [r17+504], {f459, f460}; +fma.rn.f32 f461, f441, f375, f444; +sub.f32 f462, f442, f443; +st.shared.v2.f32 [r17+576], {f462, f461}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+5832]; +ld.shared.v2.f32 {f471, f472}, [r11+11664]; +ld.shared.v2.f32 {f475, f476}, [r11+17496]; +ld.shared.v2.f32 {f479, f480}, [r11+23328]; +ld.shared.v2.f32 {f483, f484}, [r11+29160]; +ld.shared.v2.f32 {f487, f488}, [r11+34992]; +ld.shared.v2.f32 {f491, f492}, [r11+40824]; +ld.shared.v2.f32 {f495, f496}, [r11+46656]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; 
+mul.f32 f503, f499, 0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0f3F5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0f3F5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0f3F5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; +sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0f3F5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0f3F5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0f3F5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0fBF248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0fBF248DBB, f550; +mul.f32 f552, f539, 0f3E31D0D4; +mul.f32 f553, f545, 0fBF7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0fBF7C1C5C, f555; +mul.f32 f557, f524, 0f3E31D0D4; +mul.f32 f558, f530, 0fBF7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0fBF7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0fBEAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0fBEAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 
f569, f567, 0f3F000000; +sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0f3F5DB3D7; +add.f32 f573, f572, f570; +sub.f32 f574, f570, f572; +mul.f32 f575, f568, 0f3F000000; +sub.f32 f576, f502, f575; +sub.f32 f577, f516, f532; +mul.f32 f578, f577, 0f3F5DB3D7; +sub.f32 f579, f576, f578; +add.f32 f580, f578, f576; +add.f32 f581, f549, f554; +add.f32 f582, f507, f581; +add.f32 f583, f551, f556; +add.f32 f584, f513, f583; +mul.f32 f585, f581, 0f3F000000; +sub.f32 f586, f507, f585; +sub.f32 f587, f551, f556; +mul.f32 f588, f587, 0f3F5DB3D7; +add.f32 f589, f588, f586; +sub.f32 f590, f586, f588; +mul.f32 f591, f583, 0f3F000000; +sub.f32 f592, f513, f591; +sub.f32 f593, f549, f554; +mul.f32 f594, f593, 0f3F5DB3D7; +sub.f32 f595, f592, f594; +add.f32 f596, f594, f592; +add.f32 f597, f559, f564; +add.f32 f598, f508, f597; +add.f32 f599, f561, f566; +add.f32 f600, f514, f599; +mul.f32 f601, f597, 0f3F000000; +sub.f32 f602, f508, f601; +sub.f32 f603, f561, f566; +mul.f32 f604, f603, 0f3F5DB3D7; +add.f32 f605, f604, f602; +sub.f32 f606, f602, f604; +mul.f32 f607, f599, 0f3F000000; +sub.f32 f608, f514, f607; +sub.f32 f609, f559, f564; +mul.f32 f610, f609, 0f3F5DB3D7; +sub.f32 f611, f608, f610; +add.f32 f612, f610, f608; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f613, f614}, [rd16]; +mul.f32 f617, f613, f582; +mul.f32 f618, f614, f584; +mul.f32 f619, f613, f584; +mul.f32 f620, f613, f613; +mul.f32 f621, f614, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f614, f613; +fma.rn.f32 f624, f614, f613, f623; +mul.f32 f625, f622, f598; +mul.f32 f626, f624, f600; +mul.f32 f627, f622, f600; +mul.f32 f628, f613, f622; +mul.f32 f629, f614, f624; +sub.f32 f630, f628, f629; +mul.f32 f631, f613, f624; +fma.rn.f32 f632, f614, f622, f631; +mul.f32 f633, f630, f573; +mul.f32 f634, f632, 
f579; +mul.f32 f635, f630, f579; +mul.f32 f636, f613, f630; +mul.f32 f637, f614, f632; +sub.f32 f638, f636, f637; +mul.f32 f639, f613, f632; +fma.rn.f32 f640, f614, f630, f639; +mul.f32 f641, f638, f589; +mul.f32 f642, f640, f595; +mul.f32 f643, f638, f595; +mul.f32 f644, f613, f638; +mul.f32 f645, f614, f640; +sub.f32 f646, f644, f645; +mul.f32 f647, f613, f640; +fma.rn.f32 f648, f614, f638, f647; +mul.f32 f649, f646, f605; +mul.f32 f650, f648, f611; +mul.f32 f651, f646, f611; +mul.f32 f652, f613, f646; +mul.f32 f653, f614, f648; +sub.f32 f654, f652, f653; +mul.f32 f655, f613, f648; +fma.rn.f32 f656, f614, f646, f655; +mul.f32 f657, f654, f574; +mul.f32 f658, f656, f580; +mul.f32 f659, f654, f580; +mul.f32 f660, f613, f654; +mul.f32 f661, f614, f656; +sub.f32 f662, f660, f661; +mul.f32 f663, f613, f656; +fma.rn.f32 f664, f614, f654, f663; +mul.f32 f665, f662, f590; +mul.f32 f666, f664, f596; +mul.f32 f667, f662, f596; +mul.f32 f668, f613, f662; +mul.f32 f669, f614, f664; +sub.f32 f670, f668, f669; +mul.f32 f671, f613, f664; +fma.rn.f32 f672, f614, f662, f671; +mul.f32 f673, f670, f606; +mul.f32 f674, f672, f612; +mul.f32 f675, f670, f612; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +add.f32 f676, f502, f568; +add.f32 f677, f500, f567; +st.shared.v2.f32 [r23], {f677, f676}; +fma.rn.f32 f678, f614, f582, f619; +sub.f32 f679, f617, f618; +st.shared.v2.f32 [r23+648], {f679, f678}; +fma.rn.f32 f680, f624, f598, f627; +sub.f32 f681, f625, f626; +st.shared.v2.f32 [r23+1296], {f681, f680}; +fma.rn.f32 f682, f632, f573, f635; +sub.f32 f683, f633, f634; +st.shared.v2.f32 [r23+1944], {f683, f682}; +fma.rn.f32 f684, f640, f589, f643; +sub.f32 f685, f641, f642; +st.shared.v2.f32 [r23+2592], {f685, f684}; +fma.rn.f32 f686, f648, f605, f651; +sub.f32 f687, f649, f650; +st.shared.v2.f32 [r23+3240], {f687, f686}; +fma.rn.f32 f688, f656, f574, f659; +sub.f32 f689, f657, f658; +st.shared.v2.f32 [r23+3888], {f689, f688}; +sub.f32 
f690, f665, f666; +fma.rn.f32 f691, f664, f590, f667; +st.shared.v2.f32 [r23+4536], {f690, f691}; +fma.rn.f32 f692, f672, f606, f675; +sub.f32 f693, f673, f674; +st.shared.v2.f32 [r23+5184], {f693, f692}; +barrier.sync 0; +ld.shared.v2.f32 {f694, f695}, [r11]; +ld.shared.v2.f32 {f698, f699}, [r11+5832]; +ld.shared.v2.f32 {f702, f703}, [r11+11664]; +ld.shared.v2.f32 {f706, f707}, [r11+17496]; +ld.shared.v2.f32 {f710, f711}, [r11+23328]; +ld.shared.v2.f32 {f714, f715}, [r11+29160]; +ld.shared.v2.f32 {f718, f719}, [r11+34992]; +ld.shared.v2.f32 {f722, f723}, [r11+40824]; +ld.shared.v2.f32 {f726, f727}, [r11+46656]; +add.f32 f730, f706, f718; +add.f32 f731, f694, f730; +add.f32 f732, f707, f719; +add.f32 f733, f695, f732; +mul.f32 f734, f730, 0f3F000000; +sub.f32 f735, f694, f734; +sub.f32 f736, f707, f719; +mul.f32 f737, f736, 0f3F5DB3D7; +add.f32 f738, f737, f735; +sub.f32 f739, f735, f737; +mul.f32 f740, f732, 0f3F000000; +sub.f32 f741, f695, f740; +sub.f32 f742, f706, f718; +mul.f32 f743, f742, 0f3F5DB3D7; +sub.f32 f744, f741, f743; +add.f32 f745, f743, f741; +add.f32 f746, f710, f722; +add.f32 f747, f698, f746; +add.f32 f748, f711, f723; +add.f32 f749, f699, f748; +mul.f32 f750, f746, 0f3F000000; +sub.f32 f751, f698, f750; +sub.f32 f752, f711, f723; +mul.f32 f753, f752, 0f3F5DB3D7; +add.f32 f754, f753, f751; +sub.f32 f755, f751, f753; +mul.f32 f756, f748, 0f3F000000; +sub.f32 f757, f699, f756; +sub.f32 f758, f710, f722; +mul.f32 f759, f758, 0f3F5DB3D7; +sub.f32 f760, f757, f759; +add.f32 f761, f759, f757; +add.f32 f762, f714, f726; +add.f32 f763, f702, f762; +add.f32 f764, f715, f727; +add.f32 f765, f703, f764; +mul.f32 f766, f762, 0f3F000000; +sub.f32 f767, f702, f766; +sub.f32 f768, f715, f727; +mul.f32 f769, f768, 0f3F5DB3D7; +add.f32 f770, f769, f767; +sub.f32 f771, f767, f769; +mul.f32 f772, f764, 0f3F000000; +sub.f32 f773, f703, f772; +sub.f32 f774, f714, f726; +mul.f32 f775, f774, 0f3F5DB3D7; +sub.f32 f776, f773, f775; +add.f32 f777, f775, f773; +mul.f32 
f778, f754, 0f3F441B7D; +mul.f32 f779, f760, 0fBF248DBB; +sub.f32 f780, f778, f779; +mul.f32 f781, f760, 0f3F441B7D; +fma.rn.f32 f782, f754, 0fBF248DBB, f781; +mul.f32 f783, f770, 0f3E31D0D4; +mul.f32 f784, f776, 0fBF7C1C5C; +sub.f32 f785, f783, f784; +mul.f32 f786, f776, 0f3E31D0D4; +fma.rn.f32 f787, f770, 0fBF7C1C5C, f786; +mul.f32 f788, f755, 0f3E31D0D4; +mul.f32 f789, f761, 0fBF7C1C5C; +sub.f32 f790, f788, f789; +mul.f32 f791, f761, 0f3E31D0D4; +fma.rn.f32 f792, f755, 0fBF7C1C5C, f791; +mul.f32 f793, f771, 0fBF708FB2; +mul.f32 f794, f777, 0fBEAF1D44; +sub.f32 f795, f793, f794; +mul.f32 f796, f777, 0fBF708FB2; +fma.rn.f32 f797, f771, 0fBEAF1D44, f796; +add.f32 f798, f747, f763; +add.f32 f799, f749, f765; +mul.f32 f800, f798, 0f3F000000; +sub.f32 f801, f731, f800; +sub.f32 f802, f749, f765; +mul.f32 f803, f802, 0f3F5DB3D7; +mul.f32 f804, f799, 0f3F000000; +sub.f32 f805, f733, f804; +sub.f32 f806, f747, f763; +mul.f32 f807, f806, 0f3F5DB3D7; +add.f32 f808, f780, f785; +add.f32 f809, f782, f787; +mul.f32 f810, f808, 0f3F000000; +sub.f32 f811, f738, f810; +sub.f32 f812, f782, f787; +mul.f32 f813, f812, 0f3F5DB3D7; +mul.f32 f814, f809, 0f3F000000; +sub.f32 f815, f744, f814; +sub.f32 f816, f780, f785; +mul.f32 f817, f816, 0f3F5DB3D7; +add.f32 f818, f790, f795; +add.f32 f819, f792, f797; +mul.f32 f820, f818, 0f3F000000; +sub.f32 f821, f739, f820; +sub.f32 f822, f792, f797; +mul.f32 f823, f822, 0f3F5DB3D7; +mul.f32 f824, f819, 0f3F000000; +sub.f32 f825, f745, f824; +sub.f32 f826, f790, f795; +mul.f32 f827, f826, 0f3F5DB3D7; +add.f32 %1, f733, f799; +add.f32 %0, f731, f798; +add.f32 %3, f744, f809; +add.f32 %2, f738, f808; +add.f32 %5, f745, f819; +add.f32 %4, f739, f818; +sub.f32 %7, f805, f807; +add.f32 %6, f803, f801; +sub.f32 %9, f815, f817; +add.f32 %8, f813, f811; +sub.f32 %11, f825, f827; +add.f32 %10, f823, f821; +add.f32 %13, f807, f805; +sub.f32 %12, f801, f803; +add.f32 %15, f817, f815; +sub.f32 %14, f811, f813; +add.f32 %17, f827, f825; +sub.f32 %16, f821, 
f823; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_6561), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<152, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<792>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 26244, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 
0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, 
f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; 
+mul.f32 f208, f206, f120; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r8, r5, 26244, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f159; +st.shared.f32 [r9+8], f169; +st.shared.f32 [r9+12], f179; +st.shared.f32 [r9+16], f189; +st.shared.f32 [r9+20], f199; +st.shared.f32 [r9+24], f209; +st.shared.f32 [r9+28], f219; +st.shared.f32 [r9+32], f229; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+2916]; +ld.shared.f32 f234, [r11+5832]; +ld.shared.f32 f235, [r11+8748]; +ld.shared.f32 f236, [r11+11664]; +ld.shared.f32 f237, [r11+14580]; +ld.shared.f32 f238, [r11+17496]; +ld.shared.f32 f239, [r11+20412]; +ld.shared.f32 f240, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+2916]; +ld.shared.f32 f243, [r11+5832]; +ld.shared.f32 f244, [r11+8748]; +ld.shared.f32 f245, [r11+11664]; +ld.shared.f32 f246, [r11+14580]; +ld.shared.f32 f247, [r11+17496]; +ld.shared.f32 f248, [r11+20412]; +ld.shared.f32 f249, [r11+23328]; +add.f32 
f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0f3F5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0f3F5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0f3F5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0f3F5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0f3F5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0fBF248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0fBF248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0fBF7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0fBF7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0fBF7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0fBF7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0fBEAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; 
+fma.rn.f32 f317, f291, 0fBEAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0f3F5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0f3F5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f366, f335; +mul.f32 f371, f367, f337; +sub.f32 f372, f370, f371; +mul.f32 f373, f366, f337; +fma.rn.f32 f374, f367, f335, f373; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f377, f351; +mul.f32 f381, f379, f353; 
+sub.f32 f382, f380, f381; +mul.f32 f383, f377, f353; +fma.rn.f32 f384, f379, f351, f383; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f387, f326; +mul.f32 f391, f389, f332; +sub.f32 f392, f390, f391; +mul.f32 f393, f387, f332; +fma.rn.f32 f394, f389, f326, f393; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f397, f342; +mul.f32 f401, f399, f348; +sub.f32 f402, f400, f401; +mul.f32 f403, f397, f348; +fma.rn.f32 f404, f399, f342, f403; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +sub.f32 f412, f410, f411; +mul.f32 f413, f407, f364; +fma.rn.f32 f414, f409, f358, f413; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f417, f327; +mul.f32 f421, f419, f333; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f333; +fma.rn.f32 f424, f419, f327, f423; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f427, f343; +mul.f32 f431, f429, f349; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f349; +fma.rn.f32 f434, f429, f343, f433; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f437, f359; +mul.f32 f441, f439, f365; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f365; +fma.rn.f32 f444, f439, f359, f443; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f372; +st.shared.f32 [r17+72], 
f382; +st.shared.f32 [r17+108], f392; +st.shared.f32 [r17+144], f402; +st.shared.f32 [r17+180], f412; +st.shared.f32 [r17+216], f422; +st.shared.f32 [r17+252], f432; +st.shared.f32 [r17+288], f442; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+2916]; +ld.shared.f32 f447, [r11+5832]; +ld.shared.f32 f448, [r11+8748]; +ld.shared.f32 f449, [r11+11664]; +ld.shared.f32 f450, [r11+14580]; +ld.shared.f32 f451, [r11+17496]; +ld.shared.f32 f452, [r11+20412]; +ld.shared.f32 f453, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; +st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+2916]; +ld.shared.f32 f456, [r11+5832]; +ld.shared.f32 f457, [r11+8748]; +ld.shared.f32 f458, [r11+11664]; +ld.shared.f32 f459, [r11+14580]; +ld.shared.f32 f460, [r11+17496]; +ld.shared.f32 f461, [r11+20412]; +ld.shared.f32 f462, [r11+23328]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0f3F5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; +sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0f3F5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0f3F5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0f3F5DB3D7; +sub.f32 f493, 
f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0f3F5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, f507, 0f3F5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0fBF248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0fBF248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0fBF7C1C5C; +sub.f32 f518, f516, f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 0fBF7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0fBF7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0fBF7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0fBEAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0fBEAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f464, f531; +add.f32 f533, f482, f498; +add.f32 f534, f466, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f464, f535; +sub.f32 f537, f482, f498; +mul.f32 f538, f537, 0f3F5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f466, f541; +sub.f32 f543, f480, f496; +mul.f32 f544, f543, 0f3F5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +add.f32 f547, f513, f518; +add.f32 f548, f471, f547; +add.f32 f549, f515, f520; +add.f32 f550, f477, f549; +mul.f32 f551, f547, 0f3F000000; +sub.f32 f552, f471, f551; +sub.f32 f553, f515, f520; +mul.f32 f554, f553, 0f3F5DB3D7; +add.f32 f555, f554, f552; +sub.f32 f556, f552, f554; +mul.f32 f557, f549, 0f3F000000; +sub.f32 f558, f477, f557; +sub.f32 f559, f513, f518; 
+mul.f32 f560, f559, 0f3F5DB3D7; +sub.f32 f561, f558, f560; +add.f32 f562, f560, f558; +add.f32 f563, f523, f528; +add.f32 f564, f472, f563; +add.f32 f565, f525, f530; +add.f32 f566, f478, f565; +mul.f32 f567, f563, 0f3F000000; +sub.f32 f568, f472, f567; +sub.f32 f569, f525, f530; +mul.f32 f570, f569, 0f3F5DB3D7; +add.f32 f571, f570, f568; +sub.f32 f572, f568, f570; +mul.f32 f573, f565, 0f3F000000; +sub.f32 f574, f478, f573; +sub.f32 f575, f523, f528; +mul.f32 f576, f575, 0f3F5DB3D7; +sub.f32 f577, f574, f576; +add.f32 f578, f576, f574; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f579, f580}, [rd16]; +mul.f32 f583, f579, f548; +mul.f32 f584, f580, f550; +sub.f32 f585, f583, f584; +mul.f32 f586, f579, f550; +fma.rn.f32 f587, f580, f548, f586; +mul.f32 f588, f579, f579; +mul.f32 f589, f580, f580; +sub.f32 f590, f588, f589; +mul.f32 f591, f580, f579; +fma.rn.f32 f592, f580, f579, f591; +mul.f32 f593, f590, f564; +mul.f32 f594, f592, f566; +sub.f32 f595, f593, f594; +mul.f32 f596, f590, f566; +fma.rn.f32 f597, f592, f564, f596; +mul.f32 f598, f579, f590; +mul.f32 f599, f580, f592; +sub.f32 f600, f598, f599; +mul.f32 f601, f579, f592; +fma.rn.f32 f602, f580, f590, f601; +mul.f32 f603, f600, f539; +mul.f32 f604, f602, f545; +sub.f32 f605, f603, f604; +mul.f32 f606, f600, f545; +fma.rn.f32 f607, f602, f539, f606; +mul.f32 f608, f579, f600; +mul.f32 f609, f580, f602; +sub.f32 f610, f608, f609; +mul.f32 f611, f579, f602; +fma.rn.f32 f612, f580, f600, f611; +mul.f32 f613, f610, f555; +mul.f32 f614, f612, f561; +sub.f32 f615, f613, f614; +mul.f32 f616, f610, f561; +fma.rn.f32 f617, f612, f555, f616; +mul.f32 f618, f579, f610; +mul.f32 f619, f580, f612; +sub.f32 f620, f618, f619; +mul.f32 f621, f579, f612; +fma.rn.f32 f622, f580, f610, f621; +mul.f32 f623, f620, f571; +mul.f32 f624, f622, f577; 
+sub.f32 f625, f623, f624; +mul.f32 f626, f620, f577; +fma.rn.f32 f627, f622, f571, f626; +mul.f32 f628, f579, f620; +mul.f32 f629, f580, f622; +sub.f32 f630, f628, f629; +mul.f32 f631, f579, f622; +fma.rn.f32 f632, f580, f620, f631; +mul.f32 f633, f630, f540; +mul.f32 f634, f632, f546; +sub.f32 f635, f633, f634; +mul.f32 f636, f630, f546; +fma.rn.f32 f637, f632, f540, f636; +mul.f32 f638, f579, f630; +mul.f32 f639, f580, f632; +sub.f32 f640, f638, f639; +mul.f32 f641, f579, f632; +fma.rn.f32 f642, f580, f630, f641; +mul.f32 f643, f640, f556; +mul.f32 f644, f642, f562; +sub.f32 f645, f643, f644; +mul.f32 f646, f640, f562; +fma.rn.f32 f647, f642, f556, f646; +mul.f32 f648, f579, f640; +mul.f32 f649, f580, f642; +sub.f32 f650, f648, f649; +mul.f32 f651, f579, f642; +fma.rn.f32 f652, f580, f640, f651; +mul.f32 f653, f650, f572; +mul.f32 f654, f652, f578; +sub.f32 f655, f653, f654; +mul.f32 f656, f650, f578; +fma.rn.f32 f657, f652, f572, f656; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2916, r22; +st.shared.f32 [r23], f532; +st.shared.f32 [r23+324], f585; +st.shared.f32 [r23+648], f595; +st.shared.f32 [r23+972], f605; +st.shared.f32 [r23+1296], f615; +st.shared.f32 [r23+1620], f625; +st.shared.f32 [r23+1944], f635; +st.shared.f32 [r23+2268], f645; +st.shared.f32 [r23+2592], f655; +barrier.sync 0; +ld.shared.f32 f658, [r11]; +ld.shared.f32 f659, [r11+2916]; +ld.shared.f32 f660, [r11+5832]; +ld.shared.f32 f661, [r11+8748]; +ld.shared.f32 f662, [r11+11664]; +ld.shared.f32 f663, [r11+14580]; +ld.shared.f32 f664, [r11+17496]; +ld.shared.f32 f665, [r11+20412]; +ld.shared.f32 f666, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r23], f534; +st.shared.f32 [r23+324], f587; +st.shared.f32 [r23+648], f597; +st.shared.f32 [r23+972], f607; +st.shared.f32 [r23+1296], f617; +st.shared.f32 [r23+1620], f627; +st.shared.f32 [r23+1944], f637; +st.shared.f32 [r23+2268], f647; +st.shared.f32 [r23+2592], f657; +barrier.sync 0; +ld.shared.f32 f667, 
[r11]; +ld.shared.f32 f668, [r11+2916]; +ld.shared.f32 f669, [r11+5832]; +ld.shared.f32 f670, [r11+8748]; +ld.shared.f32 f671, [r11+11664]; +ld.shared.f32 f672, [r11+14580]; +ld.shared.f32 f673, [r11+17496]; +ld.shared.f32 f674, [r11+20412]; +ld.shared.f32 f675, [r11+23328]; +add.f32 f676, f661, f664; +add.f32 f677, f658, f676; +add.f32 f678, f670, f673; +add.f32 f679, f667, f678; +mul.f32 f680, f676, 0f3F000000; +sub.f32 f681, f658, f680; +sub.f32 f682, f670, f673; +mul.f32 f683, f682, 0f3F5DB3D7; +add.f32 f684, f683, f681; +sub.f32 f685, f681, f683; +mul.f32 f686, f678, 0f3F000000; +sub.f32 f687, f667, f686; +sub.f32 f688, f661, f664; +mul.f32 f689, f688, 0f3F5DB3D7; +sub.f32 f690, f687, f689; +add.f32 f691, f689, f687; +add.f32 f692, f662, f665; +add.f32 f693, f659, f692; +add.f32 f694, f671, f674; +add.f32 f695, f668, f694; +mul.f32 f696, f692, 0f3F000000; +sub.f32 f697, f659, f696; +sub.f32 f698, f671, f674; +mul.f32 f699, f698, 0f3F5DB3D7; +add.f32 f700, f699, f697; +sub.f32 f701, f697, f699; +mul.f32 f702, f694, 0f3F000000; +sub.f32 f703, f668, f702; +sub.f32 f704, f662, f665; +mul.f32 f705, f704, 0f3F5DB3D7; +sub.f32 f706, f703, f705; +add.f32 f707, f705, f703; +add.f32 f708, f663, f666; +add.f32 f709, f660, f708; +add.f32 f710, f672, f675; +add.f32 f711, f669, f710; +mul.f32 f712, f708, 0f3F000000; +sub.f32 f713, f660, f712; +sub.f32 f714, f672, f675; +mul.f32 f715, f714, 0f3F5DB3D7; +add.f32 f716, f715, f713; +sub.f32 f717, f713, f715; +mul.f32 f718, f710, 0f3F000000; +sub.f32 f719, f669, f718; +sub.f32 f720, f663, f666; +mul.f32 f721, f720, 0f3F5DB3D7; +sub.f32 f722, f719, f721; +add.f32 f723, f721, f719; +mul.f32 f724, f700, 0f3F441B7D; +mul.f32 f725, f706, 0fBF248DBB; +sub.f32 f726, f724, f725; +mul.f32 f727, f706, 0f3F441B7D; +fma.rn.f32 f728, f700, 0fBF248DBB, f727; +mul.f32 f729, f716, 0f3E31D0D4; +mul.f32 f730, f722, 0fBF7C1C5C; +sub.f32 f731, f729, f730; +mul.f32 f732, f722, 0f3E31D0D4; +fma.rn.f32 f733, f716, 0fBF7C1C5C, f732; +mul.f32 f734, 
f701, 0f3E31D0D4; +mul.f32 f735, f707, 0fBF7C1C5C; +sub.f32 f736, f734, f735; +mul.f32 f737, f707, 0f3E31D0D4; +fma.rn.f32 f738, f701, 0fBF7C1C5C, f737; +mul.f32 f739, f717, 0fBF708FB2; +mul.f32 f740, f723, 0fBEAF1D44; +sub.f32 f741, f739, f740; +mul.f32 f742, f723, 0fBF708FB2; +fma.rn.f32 f743, f717, 0fBEAF1D44, f742; +add.f32 f744, f693, f709; +add.f32 f745, f695, f711; +mul.f32 f746, f744, 0f3F000000; +sub.f32 f747, f677, f746; +sub.f32 f748, f695, f711; +mul.f32 f749, f748, 0f3F5DB3D7; +mul.f32 f750, f745, 0f3F000000; +sub.f32 f751, f679, f750; +sub.f32 f752, f693, f709; +mul.f32 f753, f752, 0f3F5DB3D7; +add.f32 f754, f726, f731; +add.f32 f755, f728, f733; +mul.f32 f756, f754, 0f3F000000; +sub.f32 f757, f684, f756; +sub.f32 f758, f728, f733; +mul.f32 f759, f758, 0f3F5DB3D7; +mul.f32 f760, f755, 0f3F000000; +sub.f32 f761, f690, f760; +sub.f32 f762, f726, f731; +mul.f32 f763, f762, 0f3F5DB3D7; +add.f32 f764, f736, f741; +add.f32 f765, f738, f743; +mul.f32 f766, f764, 0f3F000000; +sub.f32 f767, f685, f766; +sub.f32 f768, f738, f743; +mul.f32 f769, f768, 0f3F5DB3D7; +mul.f32 f770, f765, 0f3F000000; +sub.f32 f771, f691, f770; +sub.f32 f772, f736, f741; +mul.f32 f773, f772, 0f3F5DB3D7; +add.f32 %0, f677, f744; +add.f32 %1, f679, f745; +add.f32 %3, f690, f755; +add.f32 %2, f684, f754; +add.f32 %5, f691, f765; +add.f32 %4, f685, f764; +add.f32 %6, f749, f747; +sub.f32 %7, f751, f753; +sub.f32 %9, f761, f763; +add.f32 %8, f759, f757; +sub.f32 %11, f771, f773; +add.f32 %10, f769, f767; +sub.f32 %12, f747, f749; +add.f32 %13, f753, f751; +add.f32 %15, f763, f761; +sub.f32 %14, f757, f759; +add.f32 %17, f773, f771; +sub.f32 %16, f767, f769; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), 
"l"(lut_sp_9_6561), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..6a23abbe84af2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp32_inv.hpp.inc @@ -0,0 +1,6108 @@ +#ifndef CUFFTDX_FFT_6561_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_6561_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<353, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2721>; +.reg .b32 r<25>; +.reg .b64 rd<16>; +mov.u32 r23, %tid.y; +mov.u32 r24, %54; +mad.lo.s32 r3, r23, 52488, r24; +add.f32 f109, %75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2720, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2719, %58, f2720; +mul.f32 f119, f2720, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2718, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, 
f130, f132; +add.f32 f2717, %64, f2718; +mul.f32 f135, f2718, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2716, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2715, %70, f2716; +mul.f32 f151, f2716, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2714, f133, 0f3F441B7D; +sub.f32 f159, f2714, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2712, f149, 0f3E31D0D4; +mul.f32 f2713, f155, 0f3F7C1C5C; +sub.f32 f164, f2712, f2713; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f2710, f134, 0f3E31D0D4; +mul.f32 f2711, f140, 0f3F7C1C5C; +sub.f32 f169, f2710, f2711; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2708, f150, 0fBF708FB2; +mul.f32 f2709, f156, 0f3EAF1D44; +sub.f32 f174, f2708, f2709; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2707, f2717, f2715; +sub.f32 f183, f2717, f2715; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2706, f2719, f2707; +mul.f32 f187, f2707, 0f3F000000; +sub.f32 f188, f2719, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2705, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, 
f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2704, f123, f2705; +mul.f32 f203, f2705, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2703, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2702, f124, f2703; +mul.f32 f219, f2703, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2699, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2697, %113, f2699; +mul.f32 f235, f2699, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2694, %115, %114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2692, %116, f2694; +mul.f32 f251, f2694, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2689, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2687, %119, f2689; +mul.f32 f267, f2689, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 
f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2686, f249, 0f3F441B7D; +sub.f32 f275, f2686, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2685, f265, 0f3E31D0D4; +sub.f32 f280, f2685, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2683, f250, 0f3E31D0D4; +mul.f32 f2684, f256, 0f3F7C1C5C; +sub.f32 f285, f2683, f2684; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2681, f266, 0fBF708FB2; +mul.f32 f2682, f272, 0f3EAF1D44; +sub.f32 f290, f2681, f2682; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2680, f2692, f2687; +sub.f32 f299, f2692, f2687; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2679, f2697, f2680; +mul.f32 f303, f2680, 0f3F000000; +sub.f32 f304, f2697, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2678, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2677, f239, f2678; +mul.f32 f319, f2678, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2676, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2675, f240, f2676; 
+mul.f32 f335, f2676, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2672, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2670, %122, f2672; +mul.f32 f351, f2672, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2667, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2665, %125, f2667; +mul.f32 f367, f2667, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2663, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f2661, %127, f2663; +mul.f32 f383, f2663, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2660, f365, 0f3F441B7D; +sub.f32 f391, f2660, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2659, f381, 0f3E31D0D4; +sub.f32 f396, f2659, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2657, f366, 0f3E31D0D4; +mul.f32 f2658, f372, 0f3F7C1C5C; +sub.f32 f401, f2657, f2658; +mul.f32 f402, f372, 
0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2655, f382, 0fBF708FB2; +mul.f32 f2656, f388, 0f3EAF1D44; +sub.f32 f406, f2655, f2656; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2654, f2665, f2661; +sub.f32 f415, f2665, f2661; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2653, f2670, f2654; +mul.f32 f419, f2654, 0f3F000000; +sub.f32 f420, f2670, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2652, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2651, f355, f2652; +mul.f32 f435, f2652, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2650, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2649, f356, f2650; +mul.f32 f451, f2650, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2677, 0f3E6C2691; +mul.f32 f2648, f310, 0f3F791978; +sub.f32 f459, f2648, f458; +mul.f32 f460, f2677, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2646, f426, 0f3F64C51C; +mul.f32 f2647, f2651, 0f3EE5C902; +sub.f32 f464, f2646, f2647; +mul.f32 f465, f2651, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2644, f326, 0f3F64C51C; 
+mul.f32 f2645, f2675, 0f3EE5C902; +sub.f32 f469, f2644, f2645; +mul.f32 f470, f2675, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2642, f442, 0f3F18DF63; +mul.f32 f2643, f2649, 0f3F4D57F2; +sub.f32 f474, f2642, f2643; +mul.f32 f475, f2649, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2640, f301, 0f3F441B7D; +mul.f32 f2641, f307, 0f3F248DBB; +sub.f32 f479, f2640, f2641; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2639, f417, 0f3E31D0D4; +sub.f32 f484, f2639, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2638, f317, 0f3F18DF63; +sub.f32 f489, f2638, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2637, f433, 0fBE92D7E0; +sub.f32 f494, f2637, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f2636, f333, 0f3ECACAF8; +sub.f32 f499, f2636, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2635, f449, 0fBF2FAD88; +sub.f32 f504, f2635, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2634, f302, 0f3E31D0D4; +sub.f32 f509, f2634, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2632, f418, 0fBF708FB2; +mul.f32 f2633, f424, 0f3EAF1D44; +sub.f32 f514, f2632, f2633; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2630, f318, 0fBD6E2946; +mul.f32 f2631, f324, 0f3F7F9120; +sub.f32 f519, f2630, f2631; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2628, f434, 0fBF7E44DE; +mul.f32 f2629, f440, 0fBDEDC21F; +sub.f32 f524, f2628, f2629; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, 
f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2627, f334, 0fBE92D7E0; +sub.f32 f529, f2627, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2626, f450, 0fBF55E287; +sub.f32 f534, f2626, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f2625, f2679, f2653; +sub.f32 f541, f2679, f2653; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f2625, 0f3F000000; +sub.f32 f546, f2706, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f2624, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f2623, f2704, f2624; +mul.f32 f561, f2624, 0f3F000000; +sub.f32 f562, f2704, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f2622, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f2621, f2702, f2622; +mul.f32 f577, f2622, 0f3F000000; +sub.f32 f578, f2702, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f2620, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f2619, f191, f2620; +mul.f32 f593, 
f2620, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f2618, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f2617, f207, f2618; +mul.f32 f609, f2618, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f2616, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f2615, f223, f2616; +mul.f32 f625, f2616, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f2614, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f2613, f192, f2614; +mul.f32 f641, f2614, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f2612, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f2611, f208, f2612; +mul.f32 f657, f2612, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; 
+add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f2610, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f2609, f224, f2610; +mul.f32 f673, f2610, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r22, %tid.x; +mul.wide.u32 rd2, r22, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r22, r6; +mad.lo.s32 r8, r5, 52488, r3; +mul.wide.u32 rd14, r7, 8; +mov.u64 rd15, %55; +add.s64 rd6, rd15, rd14; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f2623, f680; +mul.f32 f685, f679, f2623; +mul.f32 f2607, f679, f679; +mul.f32 f2608, f680, f680; +sub.f32 f688, f2607, f2608; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f2621, f690; +mul.f32 f693, f688, f2621; +mul.f32 f695, f680, f690; +mul.f32 f2606, f679, f688; +sub.f32 f696, f2606, f695; +mul.f32 f2605, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f699, f2619, f698; +mul.f32 f701, f696, f2619; +mul.f32 f2603, f679, f696; +mul.f32 f2604, f680, f698; +sub.f32 f704, f2603, f2604; +mul.f32 f2602, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f707, f2617, f706; +mul.f32 f709, f704, f2617; +mul.f32 f711, f680, f706; +mul.f32 f2601, f679, f704; +sub.f32 f712, f2601, f711; +mul.f32 f2600, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f2615, f714; +mul.f32 f717, f712, f2615; +mul.f32 f719, f680, f714; +mul.f32 f2599, f679, f712; +sub.f32 f720, f2599, f719; +mul.f32 f2598, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f2613, f722; +mul.f32 f725, f720, f2613; +mul.f32 f2596, f679, f720; +mul.f32 f2597, f680, f722; 
+sub.f32 f728, f2596, f2597; +mul.f32 f2595, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f2611, f730; +mul.f32 f733, f728, f2611; +mul.f32 f735, f680, f730; +mul.f32 f2594, f679, f728; +sub.f32 f736, f2594, f735; +mul.f32 f2593, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f2609, f738; +mul.f32 f741, f736, f2609; +mul.f32 f743, f680, f738; +mul.f32 f2592, f679, f736; +sub.f32 f744, f2592, f743; +mul.f32 f2591, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f2589, f679, f744; +mul.f32 f2590, f680, f746; +sub.f32 f752, f2589, f2590; +mul.f32 f2588, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f2587, f679, f752; +sub.f32 f760, f2587, f759; +mul.f32 f2586, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; +mul.f32 f765, f760, f581; +mul.f32 f2584, f679, f760; +mul.f32 f2585, f680, f762; +sub.f32 f768, f2584, f2585; +mul.f32 f2583, f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f2582, f679, f768; +sub.f32 f776, f2582, f775; +mul.f32 f2581, f591, f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f2580, f679, f776; +sub.f32 f784, f2580, f783; +mul.f32 f2579, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f2577, f679, f784; +mul.f32 f2578, f680, f786; +sub.f32 f792, f2577, f2578; +mul.f32 f2576, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, 
f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f2575, f679, f792; +sub.f32 f800, f2575, f799; +mul.f32 f2574, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f2573, f679, f800; +sub.f32 f808, f2573, f807; +mul.f32 f2572, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f2570, f679, f808; +mul.f32 f2571, f680, f810; +sub.f32 f816, f2570, f2571; +mul.f32 f2569, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f2568, f679, f816; +sub.f32 f824, f2568, f823; +mul.f32 f2567, f544, f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f2565, f679, f824; +mul.f32 f2566, f680, f826; +sub.f32 f832, f2565, f2566; +mul.f32 f2564, f560, f826; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f2563, f679, f832; +sub.f32 f840, f2563, f839; +mul.f32 f2562, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f2561, f679, f840; +sub.f32 f848, f2561, f847; +mul.f32 f2560, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f2558, f679, f848; +mul.f32 f2559, f680, f850; +sub.f32 f856, f2558, f2559; +mul.f32 f2557, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f2556, f679, f856; +sub.f32 f864, f2556, f863; +mul.f32 f2555, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, 
f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f2554, f679, f864; +sub.f32 f872, f2554, f871; +mul.f32 f2553, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, f874; +mul.f32 f877, f872, f662; +mul.f32 f2551, f679, f872; +mul.f32 f2552, f680, f874; +sub.f32 f880, f2551, f2552; +mul.f32 f2550, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f2549, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +add.f32 f886, f2706, f2625; +add.f32 f887, f178, f537; +mad.lo.s32 r21, r7, 216, r8; +st.shared.v2.f32 [r21], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f2549; +st.shared.v2.f32 [r21+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f2605; +st.shared.v2.f32 [r21+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f2602; +st.shared.v2.f32 [r21+24], {f892, f893}; +fma.rn.f32 f894, f704, f600, f707; +sub.f32 f895, f709, f2600; +st.shared.v2.f32 [r21+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 f897, f717, f2598; +st.shared.v2.f32 [r21+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f2595; +st.shared.v2.f32 [r21+48], {f898, f899}; +sub.f32 f900, f733, f2593; +fma.rn.f32 f901, f728, f648, f731; +st.shared.v2.f32 [r21+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f2591; +st.shared.v2.f32 [r21+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f2588; +st.shared.v2.f32 [r21+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f2586; +st.shared.v2.f32 [r21+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f2583; +st.shared.v2.f32 [r21+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f2581; +st.shared.v2.f32 [r21+96], {f910, f911}; 
+fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f2579; +st.shared.v2.f32 [r21+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f2576; +st.shared.v2.f32 [r21+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, f797, f2574; +st.shared.v2.f32 [r21+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f2572; +st.shared.v2.f32 [r21+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f2569; +st.shared.v2.f32 [r21+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f2567; +st.shared.v2.f32 [r21+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f2564; +st.shared.v2.f32 [r21+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, f2562; +st.shared.v2.f32 [r21+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f2560; +st.shared.v2.f32 [r21+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f2557; +st.shared.v2.f32 [r21+176], {f930, f931}; +fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f2555; +st.shared.v2.f32 [r21+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; +sub.f32 f935, f869, f2553; +st.shared.v2.f32 [r21+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f2550; +st.shared.v2.f32 [r21+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; +sub.f32 f939, f885, f884; +st.shared.v2.f32 [r21+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r21; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+1944]; +ld.shared.v2.f32 {f948, f949}, [r10+3888]; +ld.shared.v2.f32 {f952, f953}, [r10+5832]; +ld.shared.v2.f32 {f956, f957}, [r10+7776]; +ld.shared.v2.f32 {f960, f961}, [r10+9720]; +ld.shared.v2.f32 {f964, f965}, [r10+11664]; +ld.shared.v2.f32 {f968, f969}, [r10+13608]; +ld.shared.v2.f32 {f972, f973}, [r10+15552]; +ld.shared.v2.f32 {f976, f977}, [r10+17496]; 
+ld.shared.v2.f32 {f980, f981}, [r10+19440]; +ld.shared.v2.f32 {f984, f985}, [r10+21384]; +ld.shared.v2.f32 {f988, f989}, [r10+23328]; +ld.shared.v2.f32 {f992, f993}, [r10+25272]; +ld.shared.v2.f32 {f996, f997}, [r10+27216]; +ld.shared.v2.f32 {f1000, f1001}, [r10+29160]; +ld.shared.v2.f32 {f1004, f1005}, [r10+31104]; +ld.shared.v2.f32 {f1008, f1009}, [r10+33048]; +ld.shared.v2.f32 {f1012, f1013}, [r10+34992]; +ld.shared.v2.f32 {f1016, f1017}, [r10+36936]; +ld.shared.v2.f32 {f1020, f1021}, [r10+38880]; +ld.shared.v2.f32 {f1024, f1025}, [r10+40824]; +ld.shared.v2.f32 {f1028, f1029}, [r10+42768]; +ld.shared.v2.f32 {f1032, f1033}, [r10+44712]; +ld.shared.v2.f32 {f1036, f1037}, [r10+46656]; +ld.shared.v2.f32 {f1040, f1041}, [r10+48600]; +ld.shared.v2.f32 {f1044, f1045}, [r10+50544]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f2548, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0fBF5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f2547, f941, f2548; +mul.f32 f1058, f2548, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0fBF5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f2546, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f2545, f953, f2546; +mul.f32 f1074, f2546, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0fBF5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f2544, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 
f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f2543, f965, f2544; +mul.f32 f1090, f2544, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0fBF5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f2541, f1072, 0f3F441B7D; +mul.f32 f2542, f1078, 0f3F248DBB; +sub.f32 f1098, f2541, f2542; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099; +mul.f32 f2539, f1088, 0f3E31D0D4; +mul.f32 f2540, f1094, 0f3F7C1C5C; +sub.f32 f1103, f2539, f2540; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104; +mul.f32 f2537, f1073, 0f3E31D0D4; +mul.f32 f2538, f1079, 0f3F7C1C5C; +sub.f32 f1108, f2537, f2538; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109; +mul.f32 f1112, f1095, 0f3EAF1D44; +mul.f32 f2536, f1089, 0fBF708FB2; +sub.f32 f1113, f2536, f1112; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f2535, f2545, f2543; +sub.f32 f1122, f2545, f2543; +mul.f32 f1123, f1122, 0fBF5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f2534, f2547, f2535; +mul.f32 f1126, f2535, 0f3F000000; +sub.f32 f1127, f2547, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0fBF5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f2533, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0fBF5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f2532, f1062, f2533; +mul.f32 f1142, f2533, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0fBF5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 
f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f2531, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0fBF5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f2530, f1063, f2531; +mul.f32 f1158, f2531, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0fBF5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f2529, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0fBF5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f2528, f945, f2529; +mul.f32 f1174, f2529, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0fBF5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f2527, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0fBF5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f2526, f957, f2527; +mul.f32 f1190, f2527, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0fBF5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f2525, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0fBF5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f2524, f969, f2525; +mul.f32 f1206, f2525, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0fBF5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f2522, f1188, 
0f3F441B7D; +mul.f32 f2523, f1194, 0f3F248DBB; +sub.f32 f1214, f2522, f2523; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0f3F248DBB, f1215; +mul.f32 f2520, f1204, 0f3E31D0D4; +mul.f32 f2521, f1210, 0f3F7C1C5C; +sub.f32 f1219, f2520, f2521; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0f3F7C1C5C, f1220; +mul.f32 f2518, f1189, 0f3E31D0D4; +mul.f32 f2519, f1195, 0f3F7C1C5C; +sub.f32 f1224, f2518, f2519; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0f3F7C1C5C, f1225; +mul.f32 f2516, f1205, 0fBF708FB2; +mul.f32 f2517, f1211, 0f3EAF1D44; +sub.f32 f1229, f2516, f2517; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0f3EAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f2515, f2526, f2524; +sub.f32 f1238, f2526, f2524; +mul.f32 f1239, f1238, 0fBF5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f2514, f2528, f2515; +mul.f32 f1242, f2515, 0f3F000000; +sub.f32 f1243, f2528, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0fBF5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f2513, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0fBF5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f2512, f1178, f2513; +mul.f32 f1258, f2513, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0fBF5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f2511, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0fBF5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f2510, f1179, 
f2511; +mul.f32 f1274, f2511, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0fBF5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f2509, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0fBF5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f2508, f949, f2509; +mul.f32 f1290, f2509, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0fBF5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f2507, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0fBF5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f2506, f961, f2507; +mul.f32 f1306, f2507, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0fBF5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f2505, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0fBF5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f2504, f973, f2505; +mul.f32 f1322, f2505, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0fBF5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0f3F248DBB; +mul.f32 f2503, f1304, 0f3F441B7D; +sub.f32 f1330, f2503, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0f3F248DBB, f1331; +mul.f32 f2501, f1320, 0f3E31D0D4; +mul.f32 f2502, f1326, 0f3F7C1C5C; +sub.f32 f1335, f2501, f2502; +mul.f32 f1336, f1326, 0f3E31D0D4; 
+fma.rn.f32 f1337, f1320, 0f3F7C1C5C, f1336; +mul.f32 f2499, f1305, 0f3E31D0D4; +mul.f32 f2500, f1311, 0f3F7C1C5C; +sub.f32 f1340, f2499, f2500; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0f3F7C1C5C, f1341; +mul.f32 f2497, f1321, 0fBF708FB2; +mul.f32 f2498, f1327, 0f3EAF1D44; +sub.f32 f1345, f2497, f2498; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0f3EAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f2496, f2506, f2504; +sub.f32 f1354, f2506, f2504; +mul.f32 f1355, f1354, 0fBF5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f2495, f2508, f2496; +mul.f32 f1358, f2496, 0f3F000000; +sub.f32 f1359, f2508, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0fBF5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f2494, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0fBF5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f2493, f1294, f2494; +mul.f32 f1374, f2494, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0fBF5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f2492, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0fBF5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f2491, f1295, f2492; +mul.f32 f1390, f2492, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0fBF5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1397, f2512, 0f3E6C2691; +mul.f32 f2490, f1249, 0f3F791978; +sub.f32 f1398, f2490, 
f1397; +mul.f32 f1399, f2512, 0f3F791978; +fma.rn.f32 f1400, f1249, 0f3E6C2691, f1399; +mul.f32 f1402, f2493, 0f3EE5C902; +mul.f32 f2489, f1365, 0f3F64C51C; +sub.f32 f1403, f2489, f1402; +mul.f32 f1404, f2493, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0f3EE5C902, f1404; +mul.f32 f1407, f2510, 0f3EE5C902; +mul.f32 f2488, f1265, 0f3F64C51C; +sub.f32 f1408, f2488, f1407; +mul.f32 f1409, f2510, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0f3EE5C902, f1409; +mul.f32 f2486, f1381, 0f3F18DF63; +mul.f32 f2487, f2491, 0f3F4D57F2; +sub.f32 f1413, f2486, f2487; +mul.f32 f1414, f2491, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0f3F4D57F2, f1414; +mul.f32 f2484, f1240, 0f3F441B7D; +mul.f32 f2485, f1246, 0f3F248DBB; +sub.f32 f1418, f2484, f2485; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0f3F248DBB, f1419; +mul.f32 f2482, f1356, 0f3E31D0D4; +mul.f32 f2483, f1362, 0f3F7C1C5C; +sub.f32 f1423, f2482, f2483; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0f3F7C1C5C, f1424; +mul.f32 f2480, f1256, 0f3F18DF63; +mul.f32 f2481, f1262, 0f3F4D57F2; +sub.f32 f1428, f2480, f2481; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0f3F4D57F2, f1429; +mul.f32 f1432, f1378, 0f3F753ECD; +mul.f32 f2479, f1372, 0fBE92D7E0; +sub.f32 f1433, f2479, f1432; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0f3F753ECD, f1434; +mul.f32 f1437, f1278, 0f3F6B1036; +mul.f32 f2478, f1272, 0f3ECACAF8; +sub.f32 f1438, f2478, f1437; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0f3F6B1036, f1439; +mul.f32 f1442, f1394, 0f3F3A3529; +mul.f32 f2477, f1388, 0fBF2FAD88; +sub.f32 f1443, f2477, f1442; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0f3F3A3529, f1444; +mul.f32 f1447, f1247, 0f3F7C1C5C; +mul.f32 f2476, f1241, 0f3E31D0D4; +sub.f32 f1448, f2476, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0f3F7C1C5C, f1449; +mul.f32 f1452, f1363, 0f3EAF1D44; +mul.f32 f2475, f1357, 0fBF708FB2; +sub.f32 f1453, f2475, f1452; +mul.f32 
f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0f3EAF1D44, f1454; +mul.f32 f1457, f1263, 0f3F7F9120; +mul.f32 f2474, f1257, 0fBD6E2946; +sub.f32 f1458, f2474, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0f3F7F9120, f1459; +mul.f32 f2472, f1373, 0fBF7E44DE; +mul.f32 f2473, f1379, 0fBDEDC21F; +sub.f32 f1463, f2472, f2473; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0fBDEDC21F, f1464; +mul.f32 f2470, f1273, 0fBE92D7E0; +mul.f32 f2471, f1279, 0f3F753ECD; +sub.f32 f1468, f2470, f2471; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0f3F753ECD, f1469; +mul.f32 f2468, f1389, 0fBF55E287; +mul.f32 f2469, f1395, 0fBF0CAC9F; +sub.f32 f1473, f2468, f2469; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0fBF0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f2467, f2514, f2495; +sub.f32 f1480, f2514, f2495; +mul.f32 f1481, f1480, 0fBF5DB3D7; +add.f32 f1482, f1481, f1479; +sub.f32 f1483, f1479, f1481; +mul.f32 f1484, f2467, 0f3F000000; +sub.f32 f1485, f2534, f1484; +sub.f32 f1486, f1233, f1349; +mul.f32 f1487, f1486, 0fBF5DB3D7; +sub.f32 f1488, f1485, f1487; +add.f32 f1489, f1487, f1485; +add.f32 f1490, f1398, f1403; +add.f32 f1491, f1133, f1490; +mul.f32 f1494, f1490, 0f3F000000; +sub.f32 f1495, f1133, f1494; +add.f32 f2466, f1400, f1405; +sub.f32 f1496, f1400, f1405; +mul.f32 f1497, f1496, 0fBF5DB3D7; +add.f32 f1498, f1497, f1495; +sub.f32 f1499, f1495, f1497; +add.f32 f2465, f2532, f2466; +mul.f32 f1500, f2466, 0f3F000000; +sub.f32 f1501, f2532, f1500; +sub.f32 f1502, f1398, f1403; +mul.f32 f1503, f1502, 0fBF5DB3D7; +sub.f32 f1504, f1501, f1503; +add.f32 f1505, f1503, f1501; +add.f32 f1506, f1408, f1413; +add.f32 f1507, f1149, f1506; +mul.f32 f1510, f1506, 0f3F000000; +sub.f32 f1511, f1149, f1510; +add.f32 f2464, f1410, f1415; +sub.f32 f1512, f1410, f1415; +mul.f32 f1513, f1512, 0fBF5DB3D7; +add.f32 f1514, f1513, f1511; +sub.f32 f1515, 
f1511, f1513; +add.f32 f2463, f2530, f2464; +mul.f32 f1516, f2464, 0f3F000000; +sub.f32 f1517, f2530, f1516; +sub.f32 f1518, f1408, f1413; +mul.f32 f1519, f1518, 0fBF5DB3D7; +sub.f32 f1520, f1517, f1519; +add.f32 f1521, f1519, f1517; +add.f32 f1522, f1418, f1423; +add.f32 f1523, f1124, f1522; +mul.f32 f1526, f1522, 0f3F000000; +sub.f32 f1527, f1124, f1526; +add.f32 f2462, f1420, f1425; +sub.f32 f1528, f1420, f1425; +mul.f32 f1529, f1528, 0fBF5DB3D7; +add.f32 f1530, f1529, f1527; +sub.f32 f1531, f1527, f1529; +add.f32 f2461, f1130, f2462; +mul.f32 f1532, f2462, 0f3F000000; +sub.f32 f1533, f1130, f1532; +sub.f32 f1534, f1418, f1423; +mul.f32 f1535, f1534, 0fBF5DB3D7; +sub.f32 f1536, f1533, f1535; +add.f32 f1537, f1535, f1533; +add.f32 f1538, f1428, f1433; +add.f32 f1539, f1140, f1538; +mul.f32 f1542, f1538, 0f3F000000; +sub.f32 f1543, f1140, f1542; +add.f32 f2460, f1430, f1435; +sub.f32 f1544, f1430, f1435; +mul.f32 f1545, f1544, 0fBF5DB3D7; +add.f32 f1546, f1545, f1543; +sub.f32 f1547, f1543, f1545; +add.f32 f2459, f1146, f2460; +mul.f32 f1548, f2460, 0f3F000000; +sub.f32 f1549, f1146, f1548; +sub.f32 f1550, f1428, f1433; +mul.f32 f1551, f1550, 0fBF5DB3D7; +sub.f32 f1552, f1549, f1551; +add.f32 f1553, f1551, f1549; +add.f32 f1554, f1438, f1443; +add.f32 f1555, f1156, f1554; +mul.f32 f1558, f1554, 0f3F000000; +sub.f32 f1559, f1156, f1558; +add.f32 f2458, f1440, f1445; +sub.f32 f1560, f1440, f1445; +mul.f32 f1561, f1560, 0fBF5DB3D7; +add.f32 f1562, f1561, f1559; +sub.f32 f1563, f1559, f1561; +add.f32 f2457, f1162, f2458; +mul.f32 f1564, f2458, 0f3F000000; +sub.f32 f1565, f1162, f1564; +sub.f32 f1566, f1438, f1443; +mul.f32 f1567, f1566, 0fBF5DB3D7; +sub.f32 f1568, f1565, f1567; +add.f32 f1569, f1567, f1565; +add.f32 f1570, f1448, f1453; +add.f32 f1571, f1125, f1570; +mul.f32 f1574, f1570, 0f3F000000; +sub.f32 f1575, f1125, f1574; +add.f32 f2456, f1450, f1455; +sub.f32 f1576, f1450, f1455; +mul.f32 f1577, f1576, 0fBF5DB3D7; +add.f32 f1578, f1577, f1575; +sub.f32 f1579, 
f1575, f1577; +add.f32 f2455, f1131, f2456; +mul.f32 f1580, f2456, 0f3F000000; +sub.f32 f1581, f1131, f1580; +sub.f32 f1582, f1448, f1453; +mul.f32 f1583, f1582, 0fBF5DB3D7; +sub.f32 f1584, f1581, f1583; +add.f32 f1585, f1583, f1581; +add.f32 f1586, f1458, f1463; +add.f32 f1587, f1141, f1586; +mul.f32 f1590, f1586, 0f3F000000; +sub.f32 f1591, f1141, f1590; +add.f32 f2454, f1460, f1465; +sub.f32 f1592, f1460, f1465; +mul.f32 f1593, f1592, 0fBF5DB3D7; +add.f32 f1594, f1593, f1591; +sub.f32 f1595, f1591, f1593; +add.f32 f2453, f1147, f2454; +mul.f32 f1596, f2454, 0f3F000000; +sub.f32 f1597, f1147, f1596; +sub.f32 f1598, f1458, f1463; +mul.f32 f1599, f1598, 0fBF5DB3D7; +sub.f32 f1600, f1597, f1599; +add.f32 f1601, f1599, f1597; +add.f32 f1602, f1468, f1473; +add.f32 f1603, f1157, f1602; +mul.f32 f1606, f1602, 0f3F000000; +sub.f32 f1607, f1157, f1606; +add.f32 f2452, f1470, f1475; +sub.f32 f1608, f1470, f1475; +mul.f32 f1609, f1608, 0fBF5DB3D7; +add.f32 f1610, f1609, f1607; +sub.f32 f1611, f1607, f1609; +add.f32 f2451, f1163, f2452; +mul.f32 f1612, f2452, 0f3F000000; +sub.f32 f1613, f1163, f1612; +sub.f32 f1614, f1468, f1473; +mul.f32 f1615, f1614, 0fBF5DB3D7; +sub.f32 f1616, f1613, f1615; +add.f32 f1617, f1615, f1613; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1618, f1619}, [rd11]; +mul.f32 f1622, f2465, f1619; +mul.f32 f1624, f1618, f2465; +mul.f32 f2449, f1618, f1618; +mul.f32 f2450, f1619, f1619; +sub.f32 f1627, f2449, f2450; +mul.f32 f1628, f1619, f1618; +fma.rn.f32 f1629, f1619, f1618, f1628; +mul.f32 f1630, f2463, f1629; +mul.f32 f1632, f1627, f2463; +mul.f32 f1634, f1619, f1629; +mul.f32 f2448, f1618, f1627; +sub.f32 f1635, f2448, f1634; +mul.f32 f2447, f1507, f1629; +mul.f32 f1636, f1618, 
f1629; +fma.rn.f32 f1637, f1619, f1627, f1636; +mul.f32 f1638, f2461, f1637; +mul.f32 f1640, f1635, f2461; +mul.f32 f2445, f1618, f1635; +mul.f32 f2446, f1619, f1637; +sub.f32 f1643, f2445, f2446; +mul.f32 f2444, f1523, f1637; +mul.f32 f1644, f1618, f1637; +fma.rn.f32 f1645, f1619, f1635, f1644; +mul.f32 f1646, f2459, f1645; +mul.f32 f1648, f1643, f2459; +mul.f32 f1650, f1619, f1645; +mul.f32 f2443, f1618, f1643; +sub.f32 f1651, f2443, f1650; +mul.f32 f2442, f1539, f1645; +mul.f32 f1652, f1618, f1645; +fma.rn.f32 f1653, f1619, f1643, f1652; +mul.f32 f1654, f2457, f1653; +mul.f32 f1656, f1651, f2457; +mul.f32 f1658, f1619, f1653; +mul.f32 f2441, f1618, f1651; +sub.f32 f1659, f2441, f1658; +mul.f32 f2440, f1555, f1653; +mul.f32 f1660, f1618, f1653; +fma.rn.f32 f1661, f1619, f1651, f1660; +mul.f32 f1662, f2455, f1661; +mul.f32 f1664, f1659, f2455; +mul.f32 f2438, f1618, f1659; +mul.f32 f2439, f1619, f1661; +sub.f32 f1667, f2438, f2439; +mul.f32 f2437, f1571, f1661; +mul.f32 f1668, f1618, f1661; +fma.rn.f32 f1669, f1619, f1659, f1668; +mul.f32 f1670, f2453, f1669; +mul.f32 f1672, f1667, f2453; +mul.f32 f1674, f1619, f1669; +mul.f32 f2436, f1618, f1667; +sub.f32 f1675, f2436, f1674; +mul.f32 f2435, f1587, f1669; +mul.f32 f1676, f1618, f1669; +fma.rn.f32 f1677, f1619, f1667, f1676; +mul.f32 f1678, f2451, f1677; +mul.f32 f1680, f1675, f2451; +mul.f32 f1682, f1619, f1677; +mul.f32 f2434, f1618, f1675; +sub.f32 f1683, f2434, f1682; +mul.f32 f2433, f1603, f1677; +mul.f32 f1684, f1618, f1677; +fma.rn.f32 f1685, f1619, f1675, f1684; +mul.f32 f1686, f1488, f1685; +mul.f32 f1688, f1683, f1488; +mul.f32 f2431, f1618, f1683; +mul.f32 f2432, f1619, f1685; +sub.f32 f1691, f2431, f2432; +mul.f32 f2430, f1482, f1685; +mul.f32 f1692, f1618, f1685; +fma.rn.f32 f1693, f1619, f1683, f1692; +mul.f32 f1694, f1504, f1693; +mul.f32 f1696, f1691, f1504; +mul.f32 f1698, f1619, f1693; +mul.f32 f2429, f1618, f1691; +sub.f32 f1699, f2429, f1698; +mul.f32 f2428, f1498, f1693; +mul.f32 f1700, f1618, 
f1693; +fma.rn.f32 f1701, f1619, f1691, f1700; +mul.f32 f1702, f1520, f1701; +mul.f32 f1704, f1699, f1520; +mul.f32 f2426, f1618, f1699; +mul.f32 f2427, f1619, f1701; +sub.f32 f1707, f2426, f2427; +mul.f32 f2425, f1514, f1701; +mul.f32 f1708, f1618, f1701; +fma.rn.f32 f1709, f1619, f1699, f1708; +mul.f32 f1710, f1536, f1709; +mul.f32 f1712, f1707, f1536; +mul.f32 f1714, f1619, f1709; +mul.f32 f2424, f1618, f1707; +sub.f32 f1715, f2424, f1714; +mul.f32 f2423, f1530, f1709; +mul.f32 f1716, f1618, f1709; +fma.rn.f32 f1717, f1619, f1707, f1716; +mul.f32 f1718, f1552, f1717; +mul.f32 f1720, f1715, f1552; +mul.f32 f1722, f1619, f1717; +mul.f32 f2422, f1618, f1715; +sub.f32 f1723, f2422, f1722; +mul.f32 f2421, f1546, f1717; +mul.f32 f1724, f1618, f1717; +fma.rn.f32 f1725, f1619, f1715, f1724; +mul.f32 f1726, f1568, f1725; +mul.f32 f1728, f1723, f1568; +mul.f32 f2419, f1618, f1723; +mul.f32 f2420, f1619, f1725; +sub.f32 f1731, f2419, f2420; +mul.f32 f2418, f1562, f1725; +mul.f32 f1732, f1618, f1725; +fma.rn.f32 f1733, f1619, f1723, f1732; +mul.f32 f1734, f1584, f1733; +mul.f32 f1736, f1731, f1584; +mul.f32 f1738, f1619, f1733; +mul.f32 f2417, f1618, f1731; +sub.f32 f1739, f2417, f1738; +mul.f32 f2416, f1578, f1733; +mul.f32 f1740, f1618, f1733; +fma.rn.f32 f1741, f1619, f1731, f1740; +mul.f32 f1742, f1600, f1741; +mul.f32 f1744, f1739, f1600; +mul.f32 f1746, f1619, f1741; +mul.f32 f2415, f1618, f1739; +sub.f32 f1747, f2415, f1746; +mul.f32 f2414, f1594, f1741; +mul.f32 f1748, f1618, f1741; +fma.rn.f32 f1749, f1619, f1739, f1748; +mul.f32 f1750, f1616, f1749; +mul.f32 f1752, f1747, f1616; +mul.f32 f2412, f1618, f1747; +mul.f32 f2413, f1619, f1749; +sub.f32 f1755, f2412, f2413; +mul.f32 f2411, f1610, f1749; +mul.f32 f1756, f1618, f1749; +fma.rn.f32 f1757, f1619, f1747, f1756; +mul.f32 f1758, f1489, f1757; +mul.f32 f1760, f1755, f1489; +mul.f32 f1762, f1619, f1757; +mul.f32 f2410, f1618, f1755; +sub.f32 f1763, f2410, f1762; +mul.f32 f2409, f1483, f1757; +mul.f32 f1764, f1618, 
f1757; +fma.rn.f32 f1765, f1619, f1755, f1764; +mul.f32 f1766, f1505, f1765; +mul.f32 f1768, f1763, f1505; +mul.f32 f2407, f1618, f1763; +mul.f32 f2408, f1619, f1765; +sub.f32 f1771, f2407, f2408; +mul.f32 f2406, f1499, f1765; +mul.f32 f1772, f1618, f1765; +fma.rn.f32 f1773, f1619, f1763, f1772; +mul.f32 f1774, f1521, f1773; +mul.f32 f1776, f1771, f1521; +mul.f32 f1778, f1619, f1773; +mul.f32 f2405, f1618, f1771; +sub.f32 f1779, f2405, f1778; +mul.f32 f2404, f1515, f1773; +mul.f32 f1780, f1618, f1773; +fma.rn.f32 f1781, f1619, f1771, f1780; +mul.f32 f1782, f1537, f1781; +mul.f32 f1784, f1779, f1537; +mul.f32 f1786, f1619, f1781; +mul.f32 f2403, f1618, f1779; +sub.f32 f1787, f2403, f1786; +mul.f32 f2402, f1531, f1781; +mul.f32 f1788, f1618, f1781; +fma.rn.f32 f1789, f1619, f1779, f1788; +mul.f32 f1790, f1553, f1789; +mul.f32 f1792, f1787, f1553; +mul.f32 f2400, f1618, f1787; +mul.f32 f2401, f1619, f1789; +sub.f32 f1795, f2400, f2401; +mul.f32 f2399, f1547, f1789; +mul.f32 f1796, f1618, f1789; +fma.rn.f32 f1797, f1619, f1787, f1796; +mul.f32 f1798, f1569, f1797; +mul.f32 f1800, f1795, f1569; +mul.f32 f1802, f1619, f1797; +mul.f32 f2398, f1618, f1795; +sub.f32 f1803, f2398, f1802; +mul.f32 f2397, f1563, f1797; +mul.f32 f1804, f1618, f1797; +fma.rn.f32 f1805, f1619, f1795, f1804; +mul.f32 f1806, f1585, f1805; +mul.f32 f1808, f1803, f1585; +mul.f32 f1810, f1619, f1805; +mul.f32 f2396, f1618, f1803; +sub.f32 f1811, f2396, f1810; +mul.f32 f2395, f1579, f1805; +mul.f32 f1812, f1618, f1805; +fma.rn.f32 f1813, f1619, f1803, f1812; +mul.f32 f1814, f1601, f1813; +mul.f32 f1816, f1811, f1601; +mul.f32 f2393, f1618, f1811; +mul.f32 f2394, f1619, f1813; +sub.f32 f1819, f2393, f2394; +mul.f32 f2392, f1595, f1813; +mul.f32 f1820, f1618, f1813; +mul.f32 f2391, f1491, f1619; +fma.rn.f32 f1821, f1619, f1811, f1820; +mul.f32 f1822, f1617, f1821; +mul.f32 f1823, f1611, f1821; +mul.f32 f1824, f1819, f1617; +shl.b32 r18, r17, 3; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, 
r15, 5832, r19; +add.f32 f1825, f2534, f2467; +add.f32 f1826, f1117, f1476; +st.shared.v2.f32 [r20], {f1826, f1825}; +fma.rn.f32 f1827, f1618, f1491, f1622; +sub.f32 f1828, f1624, f2391; +st.shared.v2.f32 [r20+216], {f1827, f1828}; +fma.rn.f32 f1829, f1627, f1507, f1630; +sub.f32 f1830, f1632, f2447; +st.shared.v2.f32 [r20+432], {f1829, f1830}; +fma.rn.f32 f1831, f1635, f1523, f1638; +sub.f32 f1832, f1640, f2444; +st.shared.v2.f32 [r20+648], {f1831, f1832}; +fma.rn.f32 f1833, f1643, f1539, f1646; +sub.f32 f1834, f1648, f2442; +st.shared.v2.f32 [r20+864], {f1833, f1834}; +fma.rn.f32 f1835, f1651, f1555, f1654; +sub.f32 f1836, f1656, f2440; +st.shared.v2.f32 [r20+1080], {f1835, f1836}; +sub.f32 f1837, f1664, f2437; +fma.rn.f32 f1838, f1659, f1571, f1662; +st.shared.v2.f32 [r20+1296], {f1838, f1837}; +fma.rn.f32 f1839, f1667, f1587, f1670; +sub.f32 f1840, f1672, f2435; +st.shared.v2.f32 [r20+1512], {f1839, f1840}; +sub.f32 f1841, f1680, f2433; +fma.rn.f32 f1842, f1675, f1603, f1678; +st.shared.v2.f32 [r20+1728], {f1842, f1841}; +fma.rn.f32 f1843, f1683, f1482, f1686; +sub.f32 f1844, f1688, f2430; +st.shared.v2.f32 [r20+1944], {f1843, f1844}; +fma.rn.f32 f1845, f1691, f1498, f1694; +sub.f32 f1846, f1696, f2428; +st.shared.v2.f32 [r20+2160], {f1845, f1846}; +fma.rn.f32 f1847, f1699, f1514, f1702; +sub.f32 f1848, f1704, f2425; +st.shared.v2.f32 [r20+2376], {f1847, f1848}; +fma.rn.f32 f1849, f1707, f1530, f1710; +sub.f32 f1850, f1712, f2423; +st.shared.v2.f32 [r20+2592], {f1849, f1850}; +fma.rn.f32 f1851, f1715, f1546, f1718; +sub.f32 f1852, f1720, f2421; +st.shared.v2.f32 [r20+2808], {f1851, f1852}; +fma.rn.f32 f1853, f1723, f1562, f1726; +sub.f32 f1854, f1728, f2418; +st.shared.v2.f32 [r20+3024], {f1853, f1854}; +fma.rn.f32 f1855, f1731, f1578, f1734; +sub.f32 f1856, f1736, f2416; +st.shared.v2.f32 [r20+3240], {f1855, f1856}; +fma.rn.f32 f1857, f1739, f1594, f1742; +sub.f32 f1858, f1744, f2414; +st.shared.v2.f32 [r20+3456], {f1857, f1858}; +fma.rn.f32 f1859, f1747, 
f1610, f1750; +sub.f32 f1860, f1752, f2411; +st.shared.v2.f32 [r20+3672], {f1859, f1860}; +fma.rn.f32 f1861, f1755, f1483, f1758; +sub.f32 f1862, f1760, f2409; +st.shared.v2.f32 [r20+3888], {f1861, f1862}; +fma.rn.f32 f1863, f1763, f1499, f1766; +sub.f32 f1864, f1768, f2406; +st.shared.v2.f32 [r20+4104], {f1863, f1864}; +fma.rn.f32 f1865, f1771, f1515, f1774; +sub.f32 f1866, f1776, f2404; +st.shared.v2.f32 [r20+4320], {f1865, f1866}; +fma.rn.f32 f1867, f1779, f1531, f1782; +sub.f32 f1868, f1784, f2402; +st.shared.v2.f32 [r20+4536], {f1867, f1868}; +fma.rn.f32 f1869, f1787, f1547, f1790; +sub.f32 f1870, f1792, f2399; +st.shared.v2.f32 [r20+4752], {f1869, f1870}; +fma.rn.f32 f1871, f1795, f1563, f1798; +sub.f32 f1872, f1800, f2397; +st.shared.v2.f32 [r20+4968], {f1871, f1872}; +fma.rn.f32 f1873, f1803, f1579, f1806; +sub.f32 f1874, f1808, f2395; +st.shared.v2.f32 [r20+5184], {f1873, f1874}; +fma.rn.f32 f1875, f1811, f1595, f1814; +sub.f32 f1876, f1816, f2392; +st.shared.v2.f32 [r20+5400], {f1875, f1876}; +fma.rn.f32 f1877, f1819, f1611, f1822; +sub.f32 f1878, f1824, f1823; +st.shared.v2.f32 [r20+5616], {f1877, f1878}; +barrier.sync 0; +ld.shared.v2.f32 {f1879, f1880}, [r10]; +ld.shared.v2.f32 {f1883, f1884}, [r10+1944]; +ld.shared.v2.f32 {f1887, f1888}, [r10+3888]; +ld.shared.v2.f32 {f1891, f1892}, [r10+5832]; +ld.shared.v2.f32 {f1895, f1896}, [r10+7776]; +ld.shared.v2.f32 {f1899, f1900}, [r10+9720]; +ld.shared.v2.f32 {f1903, f1904}, [r10+11664]; +ld.shared.v2.f32 {f1907, f1908}, [r10+13608]; +ld.shared.v2.f32 {f1911, f1912}, [r10+15552]; +ld.shared.v2.f32 {f1915, f1916}, [r10+17496]; +ld.shared.v2.f32 {f1919, f1920}, [r10+19440]; +ld.shared.v2.f32 {f1923, f1924}, [r10+21384]; +ld.shared.v2.f32 {f1927, f1928}, [r10+23328]; +ld.shared.v2.f32 {f1931, f1932}, [r10+25272]; +ld.shared.v2.f32 {f1935, f1936}, [r10+27216]; +ld.shared.v2.f32 {f1939, f1940}, [r10+29160]; +ld.shared.v2.f32 {f1943, f1944}, [r10+31104]; +ld.shared.v2.f32 {f1947, f1948}, [r10+33048]; 
+ld.shared.v2.f32 {f1951, f1952}, [r10+34992]; +ld.shared.v2.f32 {f1955, f1956}, [r10+36936]; +ld.shared.v2.f32 {f1959, f1960}, [r10+38880]; +ld.shared.v2.f32 {f1963, f1964}, [r10+40824]; +ld.shared.v2.f32 {f1967, f1968}, [r10+42768]; +ld.shared.v2.f32 {f1971, f1972}, [r10+44712]; +ld.shared.v2.f32 {f1975, f1976}, [r10+46656]; +ld.shared.v2.f32 {f1979, f1980}, [r10+48600]; +ld.shared.v2.f32 {f1983, f1984}, [r10+50544]; +add.f32 f1987, f1915, f1951; +add.f32 f1988, f1879, f1987; +mul.f32 f1991, f1987, 0f3F000000; +sub.f32 f1992, f1879, f1991; +add.f32 f2390, f1916, f1952; +sub.f32 f1993, f1916, f1952; +mul.f32 f1994, f1993, 0fBF5DB3D7; +add.f32 f1995, f1994, f1992; +sub.f32 f1996, f1992, f1994; +add.f32 f2389, f1880, f2390; +mul.f32 f1997, f2390, 0f3F000000; +sub.f32 f1998, f1880, f1997; +sub.f32 f1999, f1915, f1951; +mul.f32 f2000, f1999, 0fBF5DB3D7; +sub.f32 f2001, f1998, f2000; +add.f32 f2002, f2000, f1998; +add.f32 f2003, f1927, f1963; +add.f32 f2004, f1891, f2003; +mul.f32 f2007, f2003, 0f3F000000; +sub.f32 f2008, f1891, f2007; +add.f32 f2388, f1928, f1964; +sub.f32 f2009, f1928, f1964; +mul.f32 f2010, f2009, 0fBF5DB3D7; +add.f32 f2011, f2010, f2008; +sub.f32 f2012, f2008, f2010; +add.f32 f2387, f1892, f2388; +mul.f32 f2013, f2388, 0f3F000000; +sub.f32 f2014, f1892, f2013; +sub.f32 f2015, f1927, f1963; +mul.f32 f2016, f2015, 0fBF5DB3D7; +sub.f32 f2017, f2014, f2016; +add.f32 f2018, f2016, f2014; +add.f32 f2019, f1939, f1975; +add.f32 f2020, f1903, f2019; +mul.f32 f2023, f2019, 0f3F000000; +sub.f32 f2024, f1903, f2023; +add.f32 f2386, f1940, f1976; +sub.f32 f2025, f1940, f1976; +mul.f32 f2026, f2025, 0fBF5DB3D7; +add.f32 f2027, f2026, f2024; +sub.f32 f2028, f2024, f2026; +add.f32 f2385, f1904, f2386; +mul.f32 f2029, f2386, 0f3F000000; +sub.f32 f2030, f1904, f2029; +sub.f32 f2031, f1939, f1975; +mul.f32 f2032, f2031, 0fBF5DB3D7; +sub.f32 f2033, f2030, f2032; +add.f32 f2034, f2032, f2030; +mul.f32 f2036, f2017, 0f3F248DBB; +mul.f32 f2384, f2011, 0f3F441B7D; 
+sub.f32 f2037, f2384, f2036; +mul.f32 f2038, f2017, 0f3F441B7D; +fma.rn.f32 f2039, f2011, 0f3F248DBB, f2038; +mul.f32 f2041, f2033, 0f3F7C1C5C; +mul.f32 f2383, f2027, 0f3E31D0D4; +sub.f32 f2042, f2383, f2041; +mul.f32 f2043, f2033, 0f3E31D0D4; +fma.rn.f32 f2044, f2027, 0f3F7C1C5C, f2043; +mul.f32 f2046, f2018, 0f3F7C1C5C; +mul.f32 f2382, f2012, 0f3E31D0D4; +sub.f32 f2047, f2382, f2046; +mul.f32 f2048, f2018, 0f3E31D0D4; +fma.rn.f32 f2049, f2012, 0f3F7C1C5C, f2048; +mul.f32 f2051, f2034, 0f3EAF1D44; +mul.f32 f2381, f2028, 0fBF708FB2; +sub.f32 f2052, f2381, f2051; +mul.f32 f2053, f2034, 0fBF708FB2; +fma.rn.f32 f2054, f2028, 0f3EAF1D44, f2053; +add.f32 f2055, f2004, f2020; +mul.f32 f2057, f2055, 0f3F000000; +sub.f32 f2058, f1988, f2057; +add.f32 f2380, f2387, f2385; +sub.f32 f2059, f2387, f2385; +mul.f32 f2060, f2059, 0fBF5DB3D7; +mul.f32 f2061, f2380, 0f3F000000; +sub.f32 f2062, f2389, f2061; +sub.f32 f2063, f2004, f2020; +mul.f32 f2064, f2063, 0fBF5DB3D7; +add.f32 f2065, f2037, f2042; +mul.f32 f2067, f2065, 0f3F000000; +sub.f32 f2068, f1995, f2067; +add.f32 f2379, f2039, f2044; +sub.f32 f2069, f2039, f2044; +mul.f32 f2070, f2069, 0fBF5DB3D7; +mul.f32 f2071, f2379, 0f3F000000; +sub.f32 f2072, f2001, f2071; +sub.f32 f2073, f2037, f2042; +mul.f32 f2074, f2073, 0fBF5DB3D7; +add.f32 f2075, f2047, f2052; +mul.f32 f2077, f2075, 0f3F000000; +sub.f32 f2078, f1996, f2077; +add.f32 f2378, f2049, f2054; +sub.f32 f2079, f2049, f2054; +mul.f32 f2080, f2079, 0fBF5DB3D7; +mul.f32 f2081, f2378, 0f3F000000; +sub.f32 f2082, f2002, f2081; +sub.f32 f2083, f2047, f2052; +mul.f32 f2084, f2083, 0fBF5DB3D7; +add.f32 f2085, f1919, f1955; +add.f32 f2086, f1883, f2085; +mul.f32 f2089, f2085, 0f3F000000; +sub.f32 f2090, f1883, f2089; +add.f32 f2377, f1920, f1956; +sub.f32 f2091, f1920, f1956; +mul.f32 f2092, f2091, 0fBF5DB3D7; +add.f32 f2093, f2092, f2090; +sub.f32 f2094, f2090, f2092; +add.f32 f2376, f1884, f2377; +mul.f32 f2095, f2377, 0f3F000000; +sub.f32 f2096, f1884, f2095; +sub.f32 
f2097, f1919, f1955; +mul.f32 f2098, f2097, 0fBF5DB3D7; +sub.f32 f2099, f2096, f2098; +add.f32 f2100, f2098, f2096; +add.f32 f2101, f1931, f1967; +add.f32 f2102, f1895, f2101; +mul.f32 f2105, f2101, 0f3F000000; +sub.f32 f2106, f1895, f2105; +add.f32 f2375, f1932, f1968; +sub.f32 f2107, f1932, f1968; +mul.f32 f2108, f2107, 0fBF5DB3D7; +add.f32 f2109, f2108, f2106; +sub.f32 f2110, f2106, f2108; +add.f32 f2374, f1896, f2375; +mul.f32 f2111, f2375, 0f3F000000; +sub.f32 f2112, f1896, f2111; +sub.f32 f2113, f1931, f1967; +mul.f32 f2114, f2113, 0fBF5DB3D7; +sub.f32 f2115, f2112, f2114; +add.f32 f2116, f2114, f2112; +add.f32 f2117, f1943, f1979; +add.f32 f2118, f1907, f2117; +mul.f32 f2121, f2117, 0f3F000000; +sub.f32 f2122, f1907, f2121; +add.f32 f2373, f1944, f1980; +sub.f32 f2123, f1944, f1980; +mul.f32 f2124, f2123, 0fBF5DB3D7; +add.f32 f2125, f2124, f2122; +sub.f32 f2126, f2122, f2124; +add.f32 f2372, f1908, f2373; +mul.f32 f2127, f2373, 0f3F000000; +sub.f32 f2128, f1908, f2127; +sub.f32 f2129, f1943, f1979; +mul.f32 f2130, f2129, 0fBF5DB3D7; +sub.f32 f2131, f2128, f2130; +add.f32 f2132, f2130, f2128; +mul.f32 f2134, f2115, 0f3F248DBB; +mul.f32 f2371, f2109, 0f3F441B7D; +sub.f32 f2135, f2371, f2134; +mul.f32 f2136, f2115, 0f3F441B7D; +fma.rn.f32 f2137, f2109, 0f3F248DBB, f2136; +mul.f32 f2369, f2125, 0f3E31D0D4; +mul.f32 f2370, f2131, 0f3F7C1C5C; +sub.f32 f2140, f2369, f2370; +mul.f32 f2141, f2131, 0f3E31D0D4; +fma.rn.f32 f2142, f2125, 0f3F7C1C5C, f2141; +mul.f32 f2367, f2110, 0f3E31D0D4; +mul.f32 f2368, f2116, 0f3F7C1C5C; +sub.f32 f2145, f2367, f2368; +mul.f32 f2146, f2116, 0f3E31D0D4; +fma.rn.f32 f2147, f2110, 0f3F7C1C5C, f2146; +mul.f32 f2365, f2126, 0fBF708FB2; +mul.f32 f2366, f2132, 0f3EAF1D44; +sub.f32 f2150, f2365, f2366; +mul.f32 f2151, f2132, 0fBF708FB2; +fma.rn.f32 f2152, f2126, 0f3EAF1D44, f2151; +add.f32 f2153, f2102, f2118; +mul.f32 f2155, f2153, 0f3F000000; +sub.f32 f2156, f2086, f2155; +add.f32 f2364, f2374, f2372; +sub.f32 f2157, f2374, f2372; +mul.f32 
f2158, f2157, 0fBF5DB3D7; +mul.f32 f2159, f2364, 0f3F000000; +sub.f32 f2160, f2376, f2159; +sub.f32 f2161, f2102, f2118; +mul.f32 f2162, f2161, 0fBF5DB3D7; +add.f32 f2163, f2135, f2140; +mul.f32 f2165, f2163, 0f3F000000; +sub.f32 f2166, f2093, f2165; +add.f32 f2363, f2137, f2142; +sub.f32 f2167, f2137, f2142; +mul.f32 f2168, f2167, 0fBF5DB3D7; +mul.f32 f2169, f2363, 0f3F000000; +sub.f32 f2170, f2099, f2169; +sub.f32 f2171, f2135, f2140; +mul.f32 f2172, f2171, 0fBF5DB3D7; +add.f32 f2173, f2145, f2150; +mul.f32 f2175, f2173, 0f3F000000; +sub.f32 f2176, f2094, f2175; +add.f32 f2362, f2147, f2152; +sub.f32 f2177, f2147, f2152; +mul.f32 f2178, f2177, 0fBF5DB3D7; +mul.f32 f2179, f2362, 0f3F000000; +sub.f32 f2180, f2100, f2179; +sub.f32 f2181, f2145, f2150; +mul.f32 f2182, f2181, 0fBF5DB3D7; +add.f32 f2183, f1923, f1959; +add.f32 f2184, f1887, f2183; +mul.f32 f2187, f2183, 0f3F000000; +sub.f32 f2188, f1887, f2187; +add.f32 f2361, f1924, f1960; +sub.f32 f2189, f1924, f1960; +mul.f32 f2190, f2189, 0fBF5DB3D7; +add.f32 f2191, f2190, f2188; +sub.f32 f2192, f2188, f2190; +add.f32 f2360, f1888, f2361; +mul.f32 f2193, f2361, 0f3F000000; +sub.f32 f2194, f1888, f2193; +sub.f32 f2195, f1923, f1959; +mul.f32 f2196, f2195, 0fBF5DB3D7; +sub.f32 f2197, f2194, f2196; +add.f32 f2198, f2196, f2194; +add.f32 f2199, f1935, f1971; +add.f32 f2200, f1899, f2199; +mul.f32 f2203, f2199, 0f3F000000; +sub.f32 f2204, f1899, f2203; +add.f32 f2359, f1936, f1972; +sub.f32 f2205, f1936, f1972; +mul.f32 f2206, f2205, 0fBF5DB3D7; +add.f32 f2207, f2206, f2204; +sub.f32 f2208, f2204, f2206; +add.f32 f2358, f1900, f2359; +mul.f32 f2209, f2359, 0f3F000000; +sub.f32 f2210, f1900, f2209; +sub.f32 f2211, f1935, f1971; +mul.f32 f2212, f2211, 0fBF5DB3D7; +sub.f32 f2213, f2210, f2212; +add.f32 f2214, f2212, f2210; +add.f32 f2215, f1947, f1983; +add.f32 f2216, f1911, f2215; +mul.f32 f2219, f2215, 0f3F000000; +sub.f32 f2220, f1911, f2219; +add.f32 f2357, f1948, f1984; +sub.f32 f2221, f1948, f1984; +mul.f32 f2222, 
f2221, 0fBF5DB3D7; +add.f32 f2223, f2222, f2220; +sub.f32 f2224, f2220, f2222; +add.f32 f2356, f1912, f2357; +mul.f32 f2225, f2357, 0f3F000000; +sub.f32 f2226, f1912, f2225; +sub.f32 f2227, f1947, f1983; +mul.f32 f2228, f2227, 0fBF5DB3D7; +sub.f32 f2229, f2226, f2228; +add.f32 f2230, f2228, f2226; +mul.f32 f2354, f2207, 0f3F441B7D; +mul.f32 f2355, f2213, 0f3F248DBB; +sub.f32 f2233, f2354, f2355; +mul.f32 f2234, f2213, 0f3F441B7D; +fma.rn.f32 f2235, f2207, 0f3F248DBB, f2234; +mul.f32 f2352, f2223, 0f3E31D0D4; +mul.f32 f2353, f2229, 0f3F7C1C5C; +sub.f32 f2238, f2352, f2353; +mul.f32 f2239, f2229, 0f3E31D0D4; +fma.rn.f32 f2240, f2223, 0f3F7C1C5C, f2239; +mul.f32 f2242, f2214, 0f3F7C1C5C; +mul.f32 f2351, f2208, 0f3E31D0D4; +sub.f32 f2243, f2351, f2242; +mul.f32 f2244, f2214, 0f3E31D0D4; +fma.rn.f32 f2245, f2208, 0f3F7C1C5C, f2244; +mul.f32 f2247, f2230, 0f3EAF1D44; +mul.f32 f2350, f2224, 0fBF708FB2; +sub.f32 f2248, f2350, f2247; +mul.f32 f2249, f2230, 0fBF708FB2; +fma.rn.f32 f2250, f2224, 0f3EAF1D44, f2249; +add.f32 f2251, f2200, f2216; +mul.f32 f2253, f2251, 0f3F000000; +sub.f32 f2254, f2184, f2253; +add.f32 f2349, f2358, f2356; +sub.f32 f2255, f2358, f2356; +mul.f32 f2256, f2255, 0fBF5DB3D7; +mul.f32 f2257, f2349, 0f3F000000; +sub.f32 f2258, f2360, f2257; +sub.f32 f2259, f2200, f2216; +mul.f32 f2260, f2259, 0fBF5DB3D7; +add.f32 f2261, f2233, f2238; +mul.f32 f2263, f2261, 0f3F000000; +sub.f32 f2264, f2191, f2263; +add.f32 f2348, f2235, f2240; +sub.f32 f2265, f2235, f2240; +mul.f32 f2266, f2265, 0fBF5DB3D7; +mul.f32 f2267, f2348, 0f3F000000; +sub.f32 f2268, f2197, f2267; +sub.f32 f2269, f2233, f2238; +mul.f32 f2270, f2269, 0fBF5DB3D7; +add.f32 f2271, f2243, f2248; +mul.f32 f2273, f2271, 0f3F000000; +sub.f32 f2274, f2192, f2273; +add.f32 f2347, f2245, f2250; +sub.f32 f2275, f2245, f2250; +mul.f32 f2276, f2275, 0fBF5DB3D7; +mul.f32 f2277, f2347, 0f3F000000; +sub.f32 f2278, f2198, f2277; +sub.f32 f2279, f2243, f2248; +mul.f32 f2280, f2279, 0fBF5DB3D7; +add.f32 %1, f2389, 
f2380; +add.f32 %0, f1988, f2055; +add.f32 %3, f2376, f2364; +add.f32 %2, f2086, f2153; +add.f32 %5, f2360, f2349; +add.f32 %4, f2184, f2251; +add.f32 %7, f2001, f2379; +add.f32 %6, f1995, f2065; +add.f32 %9, f2099, f2363; +add.f32 %8, f2093, f2163; +add.f32 %11, f2197, f2348; +add.f32 %10, f2191, f2261; +add.f32 %13, f2002, f2378; +add.f32 %12, f1996, f2075; +add.f32 %15, f2100, f2362; +add.f32 %14, f2094, f2173; +add.f32 %17, f2198, f2347; +add.f32 %16, f2192, f2271; +add.f32 %18, f2060, f2058; +sub.f32 %19, f2062, f2064; +add.f32 %20, f2158, f2156; +sub.f32 %21, f2160, f2162; +add.f32 %22, f2256, f2254; +sub.f32 %23, f2258, f2260; +add.f32 %24, f2070, f2068; +sub.f32 %25, f2072, f2074; +sub.f32 %27, f2170, f2172; +add.f32 %26, f2168, f2166; +sub.f32 %29, f2268, f2270; +add.f32 %28, f2266, f2264; +sub.f32 %31, f2082, f2084; +add.f32 %30, f2080, f2078; +add.f32 %32, f2178, f2176; +sub.f32 %33, f2180, f2182; +add.f32 %34, f2276, f2274; +sub.f32 %35, f2278, f2280; +add.f32 %37, f2064, f2062; +sub.f32 %36, f2058, f2060; +add.f32 %39, f2162, f2160; +sub.f32 %38, f2156, f2158; +add.f32 %41, f2260, f2258; +sub.f32 %40, f2254, f2256; +add.f32 %43, f2074, f2072; +sub.f32 %42, f2068, f2070; +add.f32 %45, f2172, f2170; +sub.f32 %44, f2166, f2168; +add.f32 %47, f2270, f2268; +sub.f32 %46, f2264, f2266; +add.f32 %49, f2084, f2082; +sub.f32 %48, f2078, f2080; +add.f32 %51, f2182, f2180; +sub.f32 %50, f2176, f2178; +add.f32 %53, f2280, f2278; +sub.f32 %52, f2274, f2276; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), 
"=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_6561), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<352, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2599>; +.reg .b32 r<24>; +.reg .b64 rd<15>; +mov.u32 r22, %tid.y; +mov.u32 r23, %54; +mad.lo.s32 r3, r22, 26244, r23; +add.f32 f109, 
%75, %93; +add.f32 f110, %57, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %57, f113; +add.f32 f2590, %76, %94; +sub.f32 f115, %76, %94; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f2589, %58, f2590; +mul.f32 f119, f2590, 0f3F000000; +sub.f32 f120, %58, f119; +sub.f32 f121, %75, %93; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %81, %99; +add.f32 f126, %63, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %63, f129; +add.f32 f2588, %82, %100; +sub.f32 f131, %82, %100; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f2587, %64, f2588; +mul.f32 f135, f2588, 0f3F000000; +sub.f32 f136, %64, f135; +sub.f32 f137, %81, %99; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %87, %105; +add.f32 f142, %69, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %69, f145; +add.f32 f2586, %88, %106; +sub.f32 f147, %88, %106; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f2585, %70, f2586; +mul.f32 f151, f2586, 0f3F000000; +sub.f32 f152, %70, f151; +sub.f32 f153, %87, %105; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f2584, f133, 0f3F441B7D; +sub.f32 f159, f2584, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f2582, f149, 0f3E31D0D4; +mul.f32 f2583, f155, 0f3F7C1C5C; +sub.f32 f164, f2582, f2583; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f2580, f134, 0f3E31D0D4; +mul.f32 f2581, f140, 0f3F7C1C5C; +sub.f32 f169, f2580, f2581; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f2578, f150, 0fBF708FB2; +mul.f32 f2579, f156, 0f3EAF1D44; +sub.f32 f174, f2578, f2579; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, 
f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f2577, f2587, f2585; +sub.f32 f183, f2587, f2585; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f2576, f2589, f2577; +mul.f32 f187, f2577, 0f3F000000; +sub.f32 f188, f2589, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f2575, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f2574, f123, f2575; +mul.f32 f203, f2575, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f2573, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f2572, f124, f2573; +mul.f32 f219, f2573, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %77, %95; +add.f32 f226, %59, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %59, f229; +add.f32 f2569, %111, %112; +sub.f32 f231, %111, %112; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f2567, %113, f2569; +mul.f32 f235, f2569, 0f3F000000; +sub.f32 f236, %113, f235; +sub.f32 f237, %77, %95; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %83, %101; +add.f32 f242, %65, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %65, f245; +add.f32 f2564, %115, 
%114; +sub.f32 f247, %115, %114; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f2562, %116, f2564; +mul.f32 f251, f2564, 0f3F000000; +sub.f32 f252, %116, f251; +sub.f32 f253, %83, %101; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %89, %107; +add.f32 f258, %71, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %71, f261; +add.f32 f2559, %117, %118; +sub.f32 f263, %117, %118; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f2557, %119, f2559; +mul.f32 f267, f2559, 0f3F000000; +sub.f32 f268, %119, f267; +sub.f32 f269, %89, %107; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f2556, f249, 0f3F441B7D; +sub.f32 f275, f2556, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f2555, f265, 0f3E31D0D4; +sub.f32 f280, f2555, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f2553, f250, 0f3E31D0D4; +mul.f32 f2554, f256, 0f3F7C1C5C; +sub.f32 f285, f2553, f2554; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f2551, f266, 0fBF708FB2; +mul.f32 f2552, f272, 0f3EAF1D44; +sub.f32 f290, f2551, f2552; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f2550, f2562, f2557; +sub.f32 f299, f2562, f2557; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f2549, f2567, f2550; +mul.f32 f303, f2550, 0f3F000000; +sub.f32 f304, f2567, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, 
f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f2548, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f2547, f239, f2548; +mul.f32 f319, f2548, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f2546, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f2545, f240, f2546; +mul.f32 f335, f2546, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %79, %97; +add.f32 f342, %61, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %61, f345; +add.f32 f2542, %120, %121; +sub.f32 f347, %120, %121; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f2540, %122, f2542; +mul.f32 f351, f2542, 0f3F000000; +sub.f32 f352, %122, f351; +sub.f32 f353, %79, %97; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %85, %103; +add.f32 f358, %67, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %67, f361; +add.f32 f2537, %124, %123; +sub.f32 f363, %124, %123; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f2535, %125, f2537; +mul.f32 f367, f2537, 0f3F000000; +sub.f32 f368, %125, f367; +sub.f32 f369, %85, %103; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %91, %109; +add.f32 f374, %73, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %73, f377; +add.f32 f2533, %126, %110; +sub.f32 f379, %126, %110; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 
f382, f378, f380; +add.f32 f2531, %127, f2533; +mul.f32 f383, f2533, 0f3F000000; +sub.f32 f384, %127, f383; +sub.f32 f385, %91, %109; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f2530, f365, 0f3F441B7D; +sub.f32 f391, f2530, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f2529, f381, 0f3E31D0D4; +sub.f32 f396, f2529, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f2527, f366, 0f3E31D0D4; +mul.f32 f2528, f372, 0f3F7C1C5C; +sub.f32 f401, f2527, f2528; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f2525, f382, 0fBF708FB2; +mul.f32 f2526, f388, 0f3EAF1D44; +sub.f32 f406, f2525, f2526; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f2524, f2535, f2531; +sub.f32 f415, f2535, f2531; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f2523, f2540, f2524; +mul.f32 f419, f2524, 0f3F000000; +sub.f32 f420, f2540, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f2522, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f2521, f355, f2522; +mul.f32 f435, f2522, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f2520, f403, f408; +sub.f32 f447, f403, f408; 
+mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f2519, f356, f2520; +mul.f32 f451, f2520, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f2547, 0f3E6C2691; +mul.f32 f2518, f310, 0f3F791978; +sub.f32 f459, f2518, f458; +mul.f32 f460, f2547, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f2516, f426, 0f3F64C51C; +mul.f32 f2517, f2521, 0f3EE5C902; +sub.f32 f464, f2516, f2517; +mul.f32 f465, f2521, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f2514, f326, 0f3F64C51C; +mul.f32 f2515, f2545, 0f3EE5C902; +sub.f32 f469, f2514, f2515; +mul.f32 f470, f2545, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f2512, f442, 0f3F18DF63; +mul.f32 f2513, f2519, 0f3F4D57F2; +sub.f32 f474, f2512, f2513; +mul.f32 f475, f2519, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f2510, f301, 0f3F441B7D; +mul.f32 f2511, f307, 0f3F248DBB; +sub.f32 f479, f2510, f2511; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f2509, f417, 0f3E31D0D4; +sub.f32 f484, f2509, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f2508, f317, 0f3F18DF63; +sub.f32 f489, f2508, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f2507, f433, 0fBE92D7E0; +sub.f32 f494, f2507, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f2506, f333, 0f3ECACAF8; +sub.f32 f499, f2506, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f2505, f449, 0fBF2FAD88; +sub.f32 f504, f2505, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 
0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f2504, f302, 0f3E31D0D4; +sub.f32 f509, f2504, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f2502, f418, 0fBF708FB2; +mul.f32 f2503, f424, 0f3EAF1D44; +sub.f32 f514, f2502, f2503; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f2500, f318, 0fBD6E2946; +mul.f32 f2501, f324, 0f3F7F9120; +sub.f32 f519, f2500, f2501; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f2498, f434, 0fBF7E44DE; +mul.f32 f2499, f440, 0fBDEDC21F; +sub.f32 f524, f2498, f2499; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f2497, f334, 0fBE92D7E0; +sub.f32 f529, f2497, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f2496, f450, 0fBF55E287; +sub.f32 f534, f2496, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f2495, f2549, f2523; +sub.f32 f543, f2549, f2523; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f2494, f2576, f2495; +mul.f32 f547, f2495, 0f3F000000; +sub.f32 f548, f2576, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f2493, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f2492, f2574, f2493; +mul.f32 f563, f2493, 0f3F000000; +sub.f32 f564, f2574, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; 
+add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f2491, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f2490, f2572, f2491; +mul.f32 f579, f2491, 0f3F000000; +sub.f32 f580, f2572, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f2489, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f2488, f191, f2489; +mul.f32 f595, f2489, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f2487, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f2486, f207, f2487; +mul.f32 f611, f2487, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f2485, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f2484, f223, f2485; +mul.f32 f627, f2485, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f2483, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, 
f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f2482, f192, f2483; +mul.f32 f643, f2483, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f2481, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f2480, f208, f2481; +mul.f32 f659, f2481, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f2479, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f2478, f224, f2479; +mul.f32 f675, f2479, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r21, %tid.x; +mul.wide.u32 rd2, r21, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r21, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd14, r7, 8; +add.s64 rd6, rd5, rd14; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f2492, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f2492; +sub.f32 f689, f688, f687; +mul.f32 f691, f682, f682; +mul.f32 f2477, f681, f681; +sub.f32 f692, f2477, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f2490, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f2490; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f2476, f681, f692; +sub.f32 f702, f2476, f701; +mul.f32 f703, f681, 
f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f2488, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f2488; +sub.f32 f709, f708, f707; +mul.f32 f2474, f681, f702; +mul.f32 f2475, f682, f704; +sub.f32 f712, f2474, f2475; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f2486, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f2486; +sub.f32 f719, f718, f717; +mul.f32 f2472, f681, f712; +mul.f32 f2473, f682, f714; +sub.f32 f722, f2472, f2473; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f2484, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f2484; +sub.f32 f729, f728, f727; +mul.f32 f731, f682, f724; +mul.f32 f2471, f681, f722; +sub.f32 f732, f2471, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f2482, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f2482; +sub.f32 f739, f738, f737; +mul.f32 f741, f682, f734; +mul.f32 f2470, f681, f732; +sub.f32 f742, f2470, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f2480, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f2480; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f2469, f681, f742; +sub.f32 f752, f2469, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f2478, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f2478; +sub.f32 f759, f758, f757; +mul.f32 f2467, f681, f752; +mul.f32 f2468, f682, f754; +sub.f32 f762, f2467, f2468; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f2465, f681, f762; +mul.f32 f2466, f682, f764; 
+sub.f32 f772, f2465, f2466; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f781, f682, f774; +mul.f32 f2464, f681, f772; +sub.f32 f782, f2464, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; +mul.f32 f791, f682, f784; +mul.f32 f2463, f681, f782; +sub.f32 f792, f2463, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f2462, f681, f792; +sub.f32 f802, f2462, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f2460, f681, f802; +mul.f32 f2461, f682, f804; +sub.f32 f812, f2460, f2461; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f2458, f681, f812; +mul.f32 f2459, f682, f814; +sub.f32 f822, f2458, f2459; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 f829, f828, f827; +mul.f32 f831, f682, f824; +mul.f32 f2457, f681, f822; +sub.f32 f832, f2457, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, 
f682, f834; +mul.f32 f2456, f681, f832; +sub.f32 f842, f2456, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, f848, f847; +mul.f32 f2454, f681, f842; +mul.f32 f2455, f682, f844; +sub.f32 f852, f2454, f2455; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f2452, f681, f852; +mul.f32 f2453, f682, f854; +sub.f32 f862, f2452, f2453; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f871, f682, f864; +mul.f32 f2451, f681, f862; +sub.f32 f872, f2451, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f881, f682, f874; +mul.f32 f2450, f681, f872; +sub.f32 f882, f2450, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f2449, f681, f882; +sub.f32 f892, f2449, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; +mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f2447, f681, f892; +mul.f32 f2448, f682, f894; +sub.f32 f902, f2447, f2448; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; 
+sub.f32 f909, f908, f907; +mul.f32 f2445, f681, f902; +mul.f32 f2446, f682, f904; +sub.f32 f912, f2445, f2446; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; +mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f921, f682, f914; +mul.f32 f2444, f681, f912; +sub.f32 f922, f2444, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; +mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f2443, f681, f922; +sub.f32 f932, f2443, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r8, r5, 26244, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f686; +st.shared.f32 [r9+8], f696; +st.shared.f32 [r9+12], f706; +st.shared.f32 [r9+16], f716; +st.shared.f32 [r9+20], f726; +st.shared.f32 [r9+24], f736; +st.shared.f32 [r9+28], f746; +st.shared.f32 [r9+32], f756; +st.shared.f32 [r9+36], f766; +st.shared.f32 [r9+40], f776; +st.shared.f32 [r9+44], f786; +st.shared.f32 [r9+48], f796; +st.shared.f32 [r9+52], f806; +st.shared.f32 [r9+56], f816; +st.shared.f32 [r9+60], f826; +st.shared.f32 [r9+64], f836; +st.shared.f32 [r9+68], f846; +st.shared.f32 [r9+72], f856; +st.shared.f32 [r9+76], f866; +st.shared.f32 [r9+80], f876; +st.shared.f32 [r9+84], f886; +st.shared.f32 [r9+88], f896; +st.shared.f32 [r9+92], f906; +st.shared.f32 [r9+96], f916; +st.shared.f32 [r9+100], f926; +st.shared.f32 [r9+104], f936; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+972]; +ld.shared.f32 f942, [r10+1944]; +ld.shared.f32 f943, [r10+2916]; +ld.shared.f32 f944, [r10+3888]; 
+ld.shared.f32 f945, [r10+4860]; +ld.shared.f32 f946, [r10+5832]; +ld.shared.f32 f947, [r10+6804]; +ld.shared.f32 f948, [r10+7776]; +ld.shared.f32 f949, [r10+8748]; +ld.shared.f32 f950, [r10+9720]; +ld.shared.f32 f951, [r10+10692]; +ld.shared.f32 f952, [r10+11664]; +ld.shared.f32 f953, [r10+12636]; +ld.shared.f32 f954, [r10+13608]; +ld.shared.f32 f955, [r10+14580]; +ld.shared.f32 f956, [r10+15552]; +ld.shared.f32 f957, [r10+16524]; +ld.shared.f32 f958, [r10+17496]; +ld.shared.f32 f959, [r10+18468]; +ld.shared.f32 f960, [r10+19440]; +ld.shared.f32 f961, [r10+20412]; +ld.shared.f32 f962, [r10+21384]; +ld.shared.f32 f963, [r10+22356]; +ld.shared.f32 f964, [r10+23328]; +ld.shared.f32 f965, [r10+24300]; +ld.shared.f32 f966, [r10+25272]; +barrier.sync 0; +st.shared.f32 [r9], f2494; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +ld.shared.f32 f2442, [r10+17496]; +ld.shared.f32 f2441, [r10+8748]; +add.f32 f2440, f2441, f2442; +sub.f32 f1000, f2441, f2442; +mul.f32 f1001, f1000, 0fBF5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +ld.shared.f32 f2439, [r10]; +add.f32 f2438, f2439, f2440; +mul.f32 f1004, 
f2440, 0f3F000000; +sub.f32 f1005, f2439, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0fBF5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +ld.shared.f32 f2437, [r10+20412]; +ld.shared.f32 f2436, [r10+11664]; +add.f32 f2435, f2436, f2437; +sub.f32 f1016, f2436, f2437; +mul.f32 f1017, f1016, 0fBF5DB3D7; +ld.shared.f32 f2434, [r10+2916]; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f2433, f2434, f2435; +mul.f32 f1020, f2435, 0f3F000000; +sub.f32 f1021, f2434, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0fBF5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +ld.shared.f32 f2432, [r10+23328]; +ld.shared.f32 f2431, [r10+14580]; +sub.f32 f1031, f946, f1030; +add.f32 f2430, f2431, f2432; +sub.f32 f1032, f2431, f2432; +mul.f32 f1033, f1032, 0fBF5DB3D7; +ld.shared.f32 f2429, [r10+5832]; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f2428, f2429, f2430; +mul.f32 f1036, f2430, 0f3F000000; +sub.f32 f1037, f2429, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0fBF5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f2426, f1018, 0f3F441B7D; +mul.f32 f2427, f1024, 0f3F248DBB; +sub.f32 f1044, f2426, f2427; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045; +mul.f32 f1048, f1040, 0f3F7C1C5C; +mul.f32 f2425, f1034, 0f3E31D0D4; +sub.f32 f1049, f2425, f1048; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050; +mul.f32 f1053, f1025, 0f3F7C1C5C; +mul.f32 f2424, f1019, 0f3E31D0D4; +sub.f32 f1054, f2424, f1053; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055; +mul.f32 f1058, f1041, 0f3EAF1D44; +mul.f32 f2423, f1035, 0fBF708FB2; +sub.f32 f1059, 
f2423, f1058; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f2422, f2433, f2428; +sub.f32 f1068, f2433, f2428; +mul.f32 f1069, f1068, 0fBF5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f2421, f2438, f2422; +mul.f32 f1072, f2422, 0f3F000000; +sub.f32 f1073, f2438, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0fBF5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f2420, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0fBF5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f2419, f1008, f2420; +mul.f32 f1088, f2420, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0fBF5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f2418, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0fBF5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f2417, f1009, f2418; +mul.f32 f1104, f2418, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0fBF5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +sub.f32 f1115, f941, f1114; +ld.shared.f32 f2416, [r10+18468]; +ld.shared.f32 f2415, [r10+9720]; +add.f32 f2414, f2415, f2416; +sub.f32 f1116, f2415, f2416; +mul.f32 f1117, f1116, 0fBF5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +ld.shared.f32 f2413, [r10+972]; +add.f32 f2412, f2413, f2414; 
+mul.f32 f1120, f2414, 0f3F000000; +sub.f32 f1121, f2413, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0fBF5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +ld.shared.f32 f2411, [r10+21384]; +sub.f32 f1131, f944, f1130; +ld.shared.f32 f2410, [r10+12636]; +add.f32 f2409, f2410, f2411; +sub.f32 f1132, f2410, f2411; +mul.f32 f1133, f1132, 0fBF5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +ld.shared.f32 f2408, [r10+3888]; +add.f32 f2407, f2408, f2409; +mul.f32 f1136, f2409, 0f3F000000; +sub.f32 f1137, f2408, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0fBF5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +ld.shared.f32 f2406, [r10+15552]; +sub.f32 f1147, f947, f1146; +ld.shared.f32 f2405, [r10+24300]; +add.f32 f2404, f2406, f2405; +sub.f32 f1148, f2406, f2405; +mul.f32 f1149, f1148, 0fBF5DB3D7; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +ld.shared.f32 f2403, [r10+6804]; +add.f32 f2402, f2403, f2404; +mul.f32 f1152, f2404, 0f3F000000; +sub.f32 f1153, f2403, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0fBF5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f2400, f1134, 0f3F441B7D; +mul.f32 f2401, f1140, 0f3F248DBB; +sub.f32 f1160, f2400, f2401; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0f3F248DBB, f1161; +mul.f32 f2398, f1150, 0f3E31D0D4; +mul.f32 f2399, f1156, 0f3F7C1C5C; +sub.f32 f1165, f2398, f2399; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0f3F7C1C5C, f1166; +mul.f32 f1169, f1141, 0f3F7C1C5C; +mul.f32 f2397, f1135, 0f3E31D0D4; +sub.f32 f1170, f2397, f1169; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0f3F7C1C5C, f1171; +mul.f32 f1174, f1157, 0f3EAF1D44; +mul.f32 f2396, f1151, 0fBF708FB2; 
+sub.f32 f1175, f2396, f1174; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0f3EAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f2395, f2407, f2402; +sub.f32 f1184, f2407, f2402; +mul.f32 f1185, f1184, 0fBF5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f2394, f2412, f2395; +mul.f32 f1188, f2395, 0f3F000000; +sub.f32 f1189, f2412, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0fBF5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f2393, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0fBF5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f2392, f1124, f2393; +mul.f32 f1204, f2393, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0fBF5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f2391, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0fBF5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f2390, f1125, f2391; +mul.f32 f1220, f2391, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0fBF5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +ld.shared.f32 f2389, [r10+10692]; +sub.f32 f1231, f942, f1230; +ld.shared.f32 f2388, [r10+19440]; +add.f32 f2387, f2389, f2388; +sub.f32 f1232, f2389, f2388; +mul.f32 f1233, f1232, 0fBF5DB3D7; +ld.shared.f32 f2386, [r10+1944]; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 
f2385, f2386, f2387; +mul.f32 f1236, f2387, 0f3F000000; +sub.f32 f1237, f2386, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0fBF5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +ld.shared.f32 f2384, [r10+13608]; +ld.shared.f32 f2383, [r10+22356]; +sub.f32 f1247, f945, f1246; +add.f32 f2382, f2384, f2383; +sub.f32 f1248, f2384, f2383; +mul.f32 f1249, f1248, 0fBF5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +ld.shared.f32 f2381, [r10+4860]; +add.f32 f2380, f2381, f2382; +mul.f32 f1252, f2382, 0f3F000000; +sub.f32 f1253, f2381, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0fBF5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +ld.shared.f32 f2379, [r10+25272]; +ld.shared.f32 f2378, [r10+16524]; +add.f32 f2377, f2378, f2379; +sub.f32 f1264, f2378, f2379; +mul.f32 f1265, f1264, 0fBF5DB3D7; +ld.shared.f32 f2376, [r10+7776]; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f2375, f2376, f2377; +mul.f32 f1268, f2377, 0f3F000000; +sub.f32 f1269, f2376, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0fBF5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f2373, f1250, 0f3F441B7D; +mul.f32 f2374, f1256, 0f3F248DBB; +sub.f32 f1276, f2373, f2374; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0f3F248DBB, f1277; +mul.f32 f2371, f1266, 0f3E31D0D4; +mul.f32 f2372, f1272, 0f3F7C1C5C; +sub.f32 f1281, f2371, f2372; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0f3F7C1C5C, f1282; +mul.f32 f1285, f1257, 0f3F7C1C5C; +mul.f32 f2370, f1251, 0f3E31D0D4; +sub.f32 f1286, f2370, f1285; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0f3F7C1C5C, f1287; +mul.f32 f1290, f1273, 0f3EAF1D44; +mul.f32 f2369, 
f1267, 0fBF708FB2; +sub.f32 f1291, f2369, f1290; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0f3EAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f2368, f2380, f2375; +sub.f32 f1300, f2380, f2375; +mul.f32 f1301, f1300, 0fBF5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f2367, f2385, f2368; +mul.f32 f1304, f2368, 0f3F000000; +sub.f32 f1305, f2385, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0fBF5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f2366, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0fBF5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f2365, f1240, f2366; +mul.f32 f1320, f2366, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0fBF5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f2364, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0fBF5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f2363, f1241, f2364; +mul.f32 f1336, f2364, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0fBF5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f2392, 0f3E6C2691; +mul.f32 f2362, f1195, 0f3F791978; +sub.f32 f1344, f2362, f1343; +mul.f32 f1345, f2392, 0f3F791978; +fma.rn.f32 f1346, f1195, 0f3E6C2691, f1345; +mul.f32 f2360, f1311, 0f3F64C51C; +mul.f32 f2361, f2365, 0f3EE5C902; +sub.f32 f1349, f2360, f2361; +mul.f32 f1350, f2365, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0f3EE5C902, f1350; +mul.f32 
f2358, f1211, 0f3F64C51C; +mul.f32 f2359, f2390, 0f3EE5C902; +sub.f32 f1354, f2358, f2359; +mul.f32 f1355, f2390, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0f3EE5C902, f1355; +mul.f32 f2356, f1327, 0f3F18DF63; +mul.f32 f2357, f2363, 0f3F4D57F2; +sub.f32 f1359, f2356, f2357; +mul.f32 f1360, f2363, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0f3F4D57F2, f1360; +mul.f32 f2354, f1186, 0f3F441B7D; +mul.f32 f2355, f1192, 0f3F248DBB; +sub.f32 f1364, f2354, f2355; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0f3F248DBB, f1365; +mul.f32 f1368, f1308, 0f3F7C1C5C; +mul.f32 f2353, f1302, 0f3E31D0D4; +sub.f32 f1369, f2353, f1368; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0f3F7C1C5C, f1370; +mul.f32 f1373, f1208, 0f3F4D57F2; +mul.f32 f2352, f1202, 0f3F18DF63; +sub.f32 f1374, f2352, f1373; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0f3F4D57F2, f1375; +mul.f32 f1378, f1324, 0f3F753ECD; +mul.f32 f2351, f1318, 0fBE92D7E0; +sub.f32 f1379, f2351, f1378; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0f3F753ECD, f1380; +mul.f32 f1383, f1224, 0f3F6B1036; +mul.f32 f2350, f1218, 0f3ECACAF8; +sub.f32 f1384, f2350, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0f3F6B1036, f1385; +mul.f32 f1388, f1340, 0f3F3A3529; +mul.f32 f2349, f1334, 0fBF2FAD88; +sub.f32 f1389, f2349, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0f3F3A3529, f1390; +mul.f32 f1393, f1193, 0f3F7C1C5C; +mul.f32 f2348, f1187, 0f3E31D0D4; +sub.f32 f1394, f2348, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0f3F7C1C5C, f1395; +mul.f32 f2346, f1303, 0fBF708FB2; +mul.f32 f2347, f1309, 0f3EAF1D44; +sub.f32 f1399, f2346, f2347; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0f3EAF1D44, f1400; +mul.f32 f2344, f1203, 0fBD6E2946; +mul.f32 f2345, f1209, 0f3F7F9120; +sub.f32 f1404, f2344, f2345; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0f3F7F9120, f1405; +mul.f32 f2342, f1319, 
0fBF7E44DE; +mul.f32 f2343, f1325, 0fBDEDC21F; +sub.f32 f1409, f2342, f2343; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0fBDEDC21F, f1410; +mul.f32 f1413, f1225, 0f3F753ECD; +mul.f32 f2341, f1219, 0fBE92D7E0; +sub.f32 f1414, f2341, f1413; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0f3F753ECD, f1415; +mul.f32 f1418, f1341, 0fBF0CAC9F; +mul.f32 f2340, f1335, 0fBF55E287; +sub.f32 f1419, f2340, f1418; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0fBF0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +add.f32 f1423, f1063, f1422; +mul.f32 f1426, f1422, 0f3F000000; +sub.f32 f1427, f1063, f1426; +add.f32 f2339, f2394, f2367; +sub.f32 f1428, f2394, f2367; +mul.f32 f1429, f1428, 0fBF5DB3D7; +add.f32 f1430, f1429, f1427; +sub.f32 f1431, f1427, f1429; +add.f32 f2338, f2421, f2339; +mul.f32 f1432, f2339, 0f3F000000; +sub.f32 f1433, f2421, f1432; +sub.f32 f1434, f1179, f1295; +mul.f32 f1435, f1434, 0fBF5DB3D7; +sub.f32 f1436, f1433, f1435; +add.f32 f1437, f1435, f1433; +add.f32 f1438, f1344, f1349; +add.f32 f1439, f1079, f1438; +mul.f32 f1442, f1438, 0f3F000000; +sub.f32 f1443, f1079, f1442; +add.f32 f2337, f1346, f1351; +sub.f32 f1444, f1346, f1351; +mul.f32 f1445, f1444, 0fBF5DB3D7; +add.f32 f1446, f1445, f1443; +sub.f32 f1447, f1443, f1445; +add.f32 f2336, f2419, f2337; +mul.f32 f1448, f2337, 0f3F000000; +sub.f32 f1449, f2419, f1448; +sub.f32 f1450, f1344, f1349; +mul.f32 f1451, f1450, 0fBF5DB3D7; +sub.f32 f1452, f1449, f1451; +add.f32 f1453, f1451, f1449; +add.f32 f1454, f1354, f1359; +add.f32 f1455, f1095, f1454; +mul.f32 f1458, f1454, 0f3F000000; +sub.f32 f1459, f1095, f1458; +add.f32 f2335, f1356, f1361; +sub.f32 f1460, f1356, f1361; +mul.f32 f1461, f1460, 0fBF5DB3D7; +add.f32 f1462, f1461, f1459; +sub.f32 f1463, f1459, f1461; +add.f32 f2334, f2417, f2335; +mul.f32 f1464, f2335, 0f3F000000; +sub.f32 f1465, f2417, f1464; +sub.f32 f1466, f1354, f1359; +mul.f32 f1467, f1466, 0fBF5DB3D7; +sub.f32 f1468, f1465, f1467; +add.f32 
f1469, f1467, f1465; +add.f32 f1470, f1364, f1369; +add.f32 f1471, f1070, f1470; +mul.f32 f1474, f1470, 0f3F000000; +sub.f32 f1475, f1070, f1474; +add.f32 f2333, f1366, f1371; +sub.f32 f1476, f1366, f1371; +mul.f32 f1477, f1476, 0fBF5DB3D7; +add.f32 f1478, f1477, f1475; +sub.f32 f1479, f1475, f1477; +add.f32 f2332, f1076, f2333; +mul.f32 f1480, f2333, 0f3F000000; +sub.f32 f1481, f1076, f1480; +sub.f32 f1482, f1364, f1369; +mul.f32 f1483, f1482, 0fBF5DB3D7; +sub.f32 f1484, f1481, f1483; +add.f32 f1485, f1483, f1481; +add.f32 f1486, f1374, f1379; +add.f32 f1487, f1086, f1486; +mul.f32 f1490, f1486, 0f3F000000; +sub.f32 f1491, f1086, f1490; +add.f32 f2331, f1376, f1381; +sub.f32 f1492, f1376, f1381; +mul.f32 f1493, f1492, 0fBF5DB3D7; +add.f32 f1494, f1493, f1491; +sub.f32 f1495, f1491, f1493; +add.f32 f2330, f1092, f2331; +mul.f32 f1496, f2331, 0f3F000000; +sub.f32 f1497, f1092, f1496; +sub.f32 f1498, f1374, f1379; +mul.f32 f1499, f1498, 0fBF5DB3D7; +sub.f32 f1500, f1497, f1499; +add.f32 f1501, f1499, f1497; +add.f32 f1502, f1384, f1389; +add.f32 f1503, f1102, f1502; +mul.f32 f1506, f1502, 0f3F000000; +sub.f32 f1507, f1102, f1506; +add.f32 f2329, f1386, f1391; +sub.f32 f1508, f1386, f1391; +mul.f32 f1509, f1508, 0fBF5DB3D7; +add.f32 f1510, f1509, f1507; +sub.f32 f1511, f1507, f1509; +add.f32 f2328, f1108, f2329; +mul.f32 f1512, f2329, 0f3F000000; +sub.f32 f1513, f1108, f1512; +sub.f32 f1514, f1384, f1389; +mul.f32 f1515, f1514, 0fBF5DB3D7; +sub.f32 f1516, f1513, f1515; +add.f32 f1517, f1515, f1513; +add.f32 f1518, f1394, f1399; +add.f32 f1519, f1071, f1518; +mul.f32 f1522, f1518, 0f3F000000; +sub.f32 f1523, f1071, f1522; +add.f32 f2327, f1396, f1401; +sub.f32 f1524, f1396, f1401; +mul.f32 f1525, f1524, 0fBF5DB3D7; +add.f32 f1526, f1525, f1523; +sub.f32 f1527, f1523, f1525; +add.f32 f2326, f1077, f2327; +mul.f32 f1528, f2327, 0f3F000000; +sub.f32 f1529, f1077, f1528; +sub.f32 f1530, f1394, f1399; +mul.f32 f1531, f1530, 0fBF5DB3D7; +sub.f32 f1532, f1529, f1531; +add.f32 
f1533, f1531, f1529; +add.f32 f1534, f1404, f1409; +add.f32 f1535, f1087, f1534; +mul.f32 f1538, f1534, 0f3F000000; +sub.f32 f1539, f1087, f1538; +add.f32 f2325, f1406, f1411; +sub.f32 f1540, f1406, f1411; +mul.f32 f1541, f1540, 0fBF5DB3D7; +add.f32 f1542, f1541, f1539; +sub.f32 f1543, f1539, f1541; +add.f32 f2324, f1093, f2325; +mul.f32 f1544, f2325, 0f3F000000; +sub.f32 f1545, f1093, f1544; +sub.f32 f1546, f1404, f1409; +mul.f32 f1547, f1546, 0fBF5DB3D7; +sub.f32 f1548, f1545, f1547; +add.f32 f1549, f1547, f1545; +add.f32 f1550, f1414, f1419; +add.f32 f1551, f1103, f1550; +mul.f32 f1554, f1550, 0f3F000000; +sub.f32 f1555, f1103, f1554; +add.f32 f2323, f1416, f1421; +sub.f32 f1556, f1416, f1421; +mul.f32 f1557, f1556, 0fBF5DB3D7; +add.f32 f1558, f1557, f1555; +sub.f32 f1559, f1555, f1557; +add.f32 f2322, f1109, f2323; +mul.f32 f1560, f2323, 0f3F000000; +sub.f32 f1561, f1109, f1560; +sub.f32 f1562, f1414, f1419; +mul.f32 f1563, f1562, 0fBF5DB3D7; +sub.f32 f1564, f1561, f1563; +add.f32 f1565, f1563, f1561; +mul.wide.u32 rd7, r7, 795364315; +shr.u64 rd8, rd7, 32; +cvt.u32.u64 r11, rd8; +sub.s32 r12, r7, r11; +shr.u32 r13, r12, 1; +add.s32 r14, r13, r11; +shr.u32 r15, r14, 4; +mul.lo.s32 r16, r15, 27; +sub.s32 r17, r7, r16; +mul.wide.u32 rd12, r15, 8; +mov.u64 rd13, %56; +add.s64 rd11, rd13, rd12; +ld.global.v2.f32 {f1566, f1567}, [rd11]; +mul.f32 f1570, f2336, f1567; +fma.rn.f32 f1571, f1566, f1439, f1570; +mul.f32 f1572, f1439, f1567; +mul.f32 f1573, f1566, f2336; +sub.f32 f1574, f1573, f1572; +mul.f32 f2320, f1566, f1566; +mul.f32 f2321, f1567, f1567; +sub.f32 f1577, f2320, f2321; +mul.f32 f1578, f1567, f1566; +fma.rn.f32 f1579, f1567, f1566, f1578; +mul.f32 f1580, f2334, f1579; +fma.rn.f32 f1581, f1577, f1455, f1580; +mul.f32 f1582, f1455, f1579; +mul.f32 f1583, f1577, f2334; +sub.f32 f1584, f1583, f1582; +mul.f32 f1586, f1567, f1579; +mul.f32 f2319, f1566, f1577; +sub.f32 f1587, f2319, f1586; +mul.f32 f1588, f1566, f1579; +fma.rn.f32 f1589, f1567, f1577, f1588; 
+mul.f32 f1590, f2332, f1589; +fma.rn.f32 f1591, f1587, f1471, f1590; +mul.f32 f1592, f1471, f1589; +mul.f32 f1593, f1587, f2332; +sub.f32 f1594, f1593, f1592; +mul.f32 f1596, f1567, f1589; +mul.f32 f2318, f1566, f1587; +sub.f32 f1597, f2318, f1596; +mul.f32 f1598, f1566, f1589; +fma.rn.f32 f1599, f1567, f1587, f1598; +mul.f32 f1600, f2330, f1599; +fma.rn.f32 f1601, f1597, f1487, f1600; +mul.f32 f1602, f1487, f1599; +mul.f32 f1603, f1597, f2330; +sub.f32 f1604, f1603, f1602; +mul.f32 f1606, f1567, f1599; +mul.f32 f2317, f1566, f1597; +sub.f32 f1607, f2317, f1606; +mul.f32 f1608, f1566, f1599; +fma.rn.f32 f1609, f1567, f1597, f1608; +mul.f32 f1610, f2328, f1609; +fma.rn.f32 f1611, f1607, f1503, f1610; +mul.f32 f1612, f1503, f1609; +mul.f32 f1613, f1607, f2328; +sub.f32 f1614, f1613, f1612; +mul.f32 f2315, f1566, f1607; +mul.f32 f2316, f1567, f1609; +sub.f32 f1617, f2315, f2316; +mul.f32 f1618, f1566, f1609; +fma.rn.f32 f1619, f1567, f1607, f1618; +mul.f32 f1620, f2326, f1619; +fma.rn.f32 f1621, f1617, f1519, f1620; +mul.f32 f1622, f1519, f1619; +mul.f32 f1623, f1617, f2326; +sub.f32 f1624, f1623, f1622; +mul.f32 f2313, f1566, f1617; +mul.f32 f2314, f1567, f1619; +sub.f32 f1627, f2313, f2314; +mul.f32 f1628, f1566, f1619; +fma.rn.f32 f1629, f1567, f1617, f1628; +mul.f32 f1630, f2324, f1629; +fma.rn.f32 f1631, f1627, f1535, f1630; +mul.f32 f1632, f1535, f1629; +mul.f32 f1633, f1627, f2324; +sub.f32 f1634, f1633, f1632; +mul.f32 f1636, f1567, f1629; +mul.f32 f2312, f1566, f1627; +sub.f32 f1637, f2312, f1636; +mul.f32 f1638, f1566, f1629; +fma.rn.f32 f1639, f1567, f1627, f1638; +mul.f32 f1640, f2322, f1639; +fma.rn.f32 f1641, f1637, f1551, f1640; +mul.f32 f1642, f1551, f1639; +mul.f32 f1643, f1637, f2322; +sub.f32 f1644, f1643, f1642; +mul.f32 f1646, f1567, f1639; +mul.f32 f2311, f1566, f1637; +sub.f32 f1647, f2311, f1646; +mul.f32 f1648, f1566, f1639; +fma.rn.f32 f1649, f1567, f1637, f1648; +mul.f32 f1650, f1436, f1649; +fma.rn.f32 f1651, f1647, f1430, f1650; +mul.f32 
f1652, f1430, f1649; +mul.f32 f1653, f1647, f1436; +sub.f32 f1654, f1653, f1652; +mul.f32 f2309, f1566, f1647; +mul.f32 f2310, f1567, f1649; +sub.f32 f1657, f2309, f2310; +mul.f32 f1658, f1566, f1649; +fma.rn.f32 f1659, f1567, f1647, f1658; +mul.f32 f1660, f1452, f1659; +fma.rn.f32 f1661, f1657, f1446, f1660; +mul.f32 f1662, f1446, f1659; +mul.f32 f1663, f1657, f1452; +sub.f32 f1664, f1663, f1662; +mul.f32 f2307, f1566, f1657; +mul.f32 f2308, f1567, f1659; +sub.f32 f1667, f2307, f2308; +mul.f32 f1668, f1566, f1659; +fma.rn.f32 f1669, f1567, f1657, f1668; +mul.f32 f1670, f1468, f1669; +fma.rn.f32 f1671, f1667, f1462, f1670; +mul.f32 f1672, f1462, f1669; +mul.f32 f1673, f1667, f1468; +sub.f32 f1674, f1673, f1672; +mul.f32 f1676, f1567, f1669; +mul.f32 f2306, f1566, f1667; +sub.f32 f1677, f2306, f1676; +mul.f32 f1678, f1566, f1669; +fma.rn.f32 f1679, f1567, f1667, f1678; +mul.f32 f1680, f1484, f1679; +fma.rn.f32 f1681, f1677, f1478, f1680; +mul.f32 f1682, f1478, f1679; +mul.f32 f1683, f1677, f1484; +sub.f32 f1684, f1683, f1682; +mul.f32 f1686, f1567, f1679; +mul.f32 f2305, f1566, f1677; +sub.f32 f1687, f2305, f1686; +mul.f32 f1688, f1566, f1679; +fma.rn.f32 f1689, f1567, f1677, f1688; +mul.f32 f1690, f1500, f1689; +fma.rn.f32 f1691, f1687, f1494, f1690; +mul.f32 f1692, f1494, f1689; +mul.f32 f1693, f1687, f1500; +sub.f32 f1694, f1693, f1692; +mul.f32 f1696, f1567, f1689; +mul.f32 f2304, f1566, f1687; +sub.f32 f1697, f2304, f1696; +mul.f32 f1698, f1566, f1689; +fma.rn.f32 f1699, f1567, f1687, f1698; +mul.f32 f1700, f1516, f1699; +fma.rn.f32 f1701, f1697, f1510, f1700; +mul.f32 f1702, f1510, f1699; +mul.f32 f1703, f1697, f1516; +sub.f32 f1704, f1703, f1702; +mul.f32 f2302, f1566, f1697; +mul.f32 f2303, f1567, f1699; +sub.f32 f1707, f2302, f2303; +mul.f32 f1708, f1566, f1699; +fma.rn.f32 f1709, f1567, f1697, f1708; +mul.f32 f1710, f1532, f1709; +fma.rn.f32 f1711, f1707, f1526, f1710; +mul.f32 f1712, f1526, f1709; +mul.f32 f1713, f1707, f1532; +sub.f32 f1714, f1713, 
f1712; +mul.f32 f2300, f1566, f1707; +mul.f32 f2301, f1567, f1709; +sub.f32 f1717, f2300, f2301; +mul.f32 f1718, f1566, f1709; +fma.rn.f32 f1719, f1567, f1707, f1718; +mul.f32 f1720, f1548, f1719; +fma.rn.f32 f1721, f1717, f1542, f1720; +mul.f32 f1722, f1542, f1719; +mul.f32 f1723, f1717, f1548; +sub.f32 f1724, f1723, f1722; +mul.f32 f1726, f1567, f1719; +mul.f32 f2299, f1566, f1717; +sub.f32 f1727, f2299, f1726; +mul.f32 f1728, f1566, f1719; +fma.rn.f32 f1729, f1567, f1717, f1728; +mul.f32 f1730, f1564, f1729; +fma.rn.f32 f1731, f1727, f1558, f1730; +mul.f32 f1732, f1558, f1729; +mul.f32 f1733, f1727, f1564; +sub.f32 f1734, f1733, f1732; +mul.f32 f1736, f1567, f1729; +mul.f32 f2298, f1566, f1727; +sub.f32 f1737, f2298, f1736; +mul.f32 f1738, f1566, f1729; +fma.rn.f32 f1739, f1567, f1727, f1738; +mul.f32 f1740, f1437, f1739; +fma.rn.f32 f1741, f1737, f1431, f1740; +mul.f32 f1742, f1431, f1739; +mul.f32 f1743, f1737, f1437; +sub.f32 f1744, f1743, f1742; +mul.f32 f1746, f1567, f1739; +mul.f32 f2297, f1566, f1737; +sub.f32 f1747, f2297, f1746; +mul.f32 f1748, f1566, f1739; +fma.rn.f32 f1749, f1567, f1737, f1748; +mul.f32 f1750, f1453, f1749; +fma.rn.f32 f1751, f1747, f1447, f1750; +mul.f32 f1752, f1447, f1749; +mul.f32 f1753, f1747, f1453; +sub.f32 f1754, f1753, f1752; +mul.f32 f2295, f1566, f1747; +mul.f32 f2296, f1567, f1749; +sub.f32 f1757, f2295, f2296; +mul.f32 f1758, f1566, f1749; +fma.rn.f32 f1759, f1567, f1747, f1758; +mul.f32 f1760, f1469, f1759; +fma.rn.f32 f1761, f1757, f1463, f1760; +mul.f32 f1762, f1463, f1759; +mul.f32 f1763, f1757, f1469; +sub.f32 f1764, f1763, f1762; +mul.f32 f2293, f1566, f1757; +mul.f32 f2294, f1567, f1759; +sub.f32 f1767, f2293, f2294; +mul.f32 f1768, f1566, f1759; +fma.rn.f32 f1769, f1567, f1757, f1768; +mul.f32 f1770, f1485, f1769; +fma.rn.f32 f1771, f1767, f1479, f1770; +mul.f32 f1772, f1479, f1769; +mul.f32 f1773, f1767, f1485; +sub.f32 f1774, f1773, f1772; +mul.f32 f1776, f1567, f1769; +mul.f32 f2292, f1566, f1767; +sub.f32 
f1777, f2292, f1776; +mul.f32 f1778, f1566, f1769; +fma.rn.f32 f1779, f1567, f1767, f1778; +mul.f32 f1780, f1501, f1779; +fma.rn.f32 f1781, f1777, f1495, f1780; +mul.f32 f1782, f1495, f1779; +mul.f32 f1783, f1777, f1501; +sub.f32 f1784, f1783, f1782; +mul.f32 f1786, f1567, f1779; +mul.f32 f2291, f1566, f1777; +sub.f32 f1787, f2291, f1786; +mul.f32 f1788, f1566, f1779; +fma.rn.f32 f1789, f1567, f1777, f1788; +mul.f32 f1790, f1517, f1789; +fma.rn.f32 f1791, f1787, f1511, f1790; +mul.f32 f1792, f1511, f1789; +mul.f32 f1793, f1787, f1517; +sub.f32 f1794, f1793, f1792; +mul.f32 f2289, f1566, f1787; +mul.f32 f2290, f1567, f1789; +sub.f32 f1797, f2289, f2290; +mul.f32 f1798, f1566, f1789; +fma.rn.f32 f1799, f1567, f1787, f1798; +mul.f32 f1800, f1533, f1799; +fma.rn.f32 f1801, f1797, f1527, f1800; +mul.f32 f1802, f1527, f1799; +mul.f32 f1803, f1797, f1533; +sub.f32 f1804, f1803, f1802; +mul.f32 f2287, f1566, f1797; +mul.f32 f2288, f1567, f1799; +sub.f32 f1807, f2287, f2288; +mul.f32 f1808, f1566, f1799; +fma.rn.f32 f1809, f1567, f1797, f1808; +mul.f32 f1810, f1549, f1809; +fma.rn.f32 f1811, f1807, f1543, f1810; +mul.f32 f1812, f1543, f1809; +mul.f32 f1813, f1807, f1549; +sub.f32 f1814, f1813, f1812; +mul.f32 f1816, f1567, f1809; +mul.f32 f2286, f1566, f1807; +sub.f32 f1817, f2286, f1816; +mul.f32 f1818, f1566, f1809; +fma.rn.f32 f1819, f1567, f1807, f1818; +mul.f32 f1820, f1565, f1819; +fma.rn.f32 f1821, f1817, f1559, f1820; +mul.f32 f1822, f1559, f1819; +mul.f32 f1823, f1817, f1565; +sub.f32 f1824, f1823, f1822; +shl.b32 r18, r17, 2; +add.s32 r19, r8, r18; +barrier.sync 0; +mad.lo.s32 r20, r15, 2916, r19; +st.shared.f32 [r20], f1423; +st.shared.f32 [r20+108], f1571; +st.shared.f32 [r20+216], f1581; +st.shared.f32 [r20+324], f1591; +st.shared.f32 [r20+432], f1601; +st.shared.f32 [r20+540], f1611; +st.shared.f32 [r20+648], f1621; +st.shared.f32 [r20+756], f1631; +st.shared.f32 [r20+864], f1641; +st.shared.f32 [r20+972], f1651; +st.shared.f32 [r20+1080], f1661; 
+st.shared.f32 [r20+1188], f1671; +st.shared.f32 [r20+1296], f1681; +st.shared.f32 [r20+1404], f1691; +st.shared.f32 [r20+1512], f1701; +st.shared.f32 [r20+1620], f1711; +st.shared.f32 [r20+1728], f1721; +st.shared.f32 [r20+1836], f1731; +st.shared.f32 [r20+1944], f1741; +st.shared.f32 [r20+2052], f1751; +st.shared.f32 [r20+2160], f1761; +st.shared.f32 [r20+2268], f1771; +st.shared.f32 [r20+2376], f1781; +st.shared.f32 [r20+2484], f1791; +st.shared.f32 [r20+2592], f1801; +st.shared.f32 [r20+2700], f1811; +st.shared.f32 [r20+2808], f1821; +barrier.sync 0; +ld.shared.f32 f1825, [r10]; +ld.shared.f32 f1826, [r10+972]; +ld.shared.f32 f1827, [r10+1944]; +ld.shared.f32 f1828, [r10+2916]; +ld.shared.f32 f1829, [r10+3888]; +ld.shared.f32 f1830, [r10+4860]; +ld.shared.f32 f1831, [r10+5832]; +ld.shared.f32 f1832, [r10+6804]; +ld.shared.f32 f1833, [r10+7776]; +ld.shared.f32 f1834, [r10+8748]; +ld.shared.f32 f1835, [r10+9720]; +ld.shared.f32 f1836, [r10+10692]; +ld.shared.f32 f1837, [r10+11664]; +ld.shared.f32 f1838, [r10+12636]; +ld.shared.f32 f1839, [r10+13608]; +ld.shared.f32 f1840, [r10+14580]; +ld.shared.f32 f1841, [r10+15552]; +ld.shared.f32 f1842, [r10+16524]; +ld.shared.f32 f1843, [r10+17496]; +ld.shared.f32 f1844, [r10+18468]; +ld.shared.f32 f1845, [r10+19440]; +ld.shared.f32 f1846, [r10+20412]; +ld.shared.f32 f1847, [r10+21384]; +ld.shared.f32 f1848, [r10+22356]; +ld.shared.f32 f1849, [r10+23328]; +ld.shared.f32 f1850, [r10+24300]; +ld.shared.f32 f1851, [r10+25272]; +barrier.sync 0; +st.shared.f32 [r20], f2338; +st.shared.f32 [r20+108], f1574; +st.shared.f32 [r20+216], f1584; +st.shared.f32 [r20+324], f1594; +st.shared.f32 [r20+432], f1604; +st.shared.f32 [r20+540], f1614; +st.shared.f32 [r20+648], f1624; +st.shared.f32 [r20+756], f1634; +st.shared.f32 [r20+864], f1644; +st.shared.f32 [r20+972], f1654; +st.shared.f32 [r20+1080], f1664; +st.shared.f32 [r20+1188], f1674; +st.shared.f32 [r20+1296], f1684; +st.shared.f32 [r20+1404], f1694; +st.shared.f32 [r20+1512], 
f1704; +st.shared.f32 [r20+1620], f1714; +st.shared.f32 [r20+1728], f1724; +st.shared.f32 [r20+1836], f1734; +st.shared.f32 [r20+1944], f1744; +st.shared.f32 [r20+2052], f1754; +st.shared.f32 [r20+2160], f1764; +st.shared.f32 [r20+2268], f1774; +st.shared.f32 [r20+2376], f1784; +st.shared.f32 [r20+2484], f1794; +st.shared.f32 [r20+2592], f1804; +st.shared.f32 [r20+2700], f1814; +st.shared.f32 [r20+2808], f1824; +barrier.sync 0; +ld.shared.f32 f1852, [r10]; +ld.shared.f32 f1853, [r10+972]; +ld.shared.f32 f1854, [r10+1944]; +ld.shared.f32 f1855, [r10+2916]; +ld.shared.f32 f1856, [r10+3888]; +ld.shared.f32 f1857, [r10+4860]; +ld.shared.f32 f1858, [r10+5832]; +ld.shared.f32 f1859, [r10+6804]; +ld.shared.f32 f1860, [r10+7776]; +ld.shared.f32 f1861, [r10+8748]; +ld.shared.f32 f1862, [r10+9720]; +ld.shared.f32 f1863, [r10+10692]; +ld.shared.f32 f1864, [r10+11664]; +ld.shared.f32 f1865, [r10+12636]; +ld.shared.f32 f1866, [r10+13608]; +ld.shared.f32 f1867, [r10+14580]; +ld.shared.f32 f1868, [r10+15552]; +ld.shared.f32 f1869, [r10+16524]; +ld.shared.f32 f1870, [r10+17496]; +ld.shared.f32 f1871, [r10+18468]; +ld.shared.f32 f1872, [r10+19440]; +ld.shared.f32 f1873, [r10+20412]; +ld.shared.f32 f1874, [r10+21384]; +ld.shared.f32 f1875, [r10+22356]; +ld.shared.f32 f1876, [r10+23328]; +ld.shared.f32 f1877, [r10+24300]; +ld.shared.f32 f1878, [r10+25272]; +add.f32 f1879, f1834, f1843; +add.f32 f1880, f1825, f1879; +mul.f32 f1883, f1879, 0f3F000000; +sub.f32 f1884, f1825, f1883; +add.f32 f2285, f1861, f1870; +sub.f32 f1885, f1861, f1870; +mul.f32 f1886, f1885, 0fBF5DB3D7; +add.f32 f1887, f1886, f1884; +sub.f32 f1888, f1884, f1886; +add.f32 f2284, f1852, f2285; +mul.f32 f1889, f2285, 0f3F000000; +sub.f32 f1890, f1852, f1889; +sub.f32 f1891, f1834, f1843; +mul.f32 f1892, f1891, 0fBF5DB3D7; +sub.f32 f1893, f1890, f1892; +add.f32 f1894, f1892, f1890; +add.f32 f1895, f1837, f1846; +add.f32 f1896, f1828, f1895; +mul.f32 f1899, f1895, 0f3F000000; +sub.f32 f1900, f1828, f1899; +add.f32 
f2283, f1864, f1873; +sub.f32 f1901, f1864, f1873; +mul.f32 f1902, f1901, 0fBF5DB3D7; +add.f32 f1903, f1902, f1900; +sub.f32 f1904, f1900, f1902; +add.f32 f2282, f1855, f2283; +mul.f32 f1905, f2283, 0f3F000000; +sub.f32 f1906, f1855, f1905; +sub.f32 f1907, f1837, f1846; +mul.f32 f1908, f1907, 0fBF5DB3D7; +sub.f32 f1909, f1906, f1908; +add.f32 f1910, f1908, f1906; +add.f32 f1911, f1840, f1849; +add.f32 f1912, f1831, f1911; +mul.f32 f1915, f1911, 0f3F000000; +sub.f32 f1916, f1831, f1915; +add.f32 f2281, f1867, f1876; +sub.f32 f1917, f1867, f1876; +mul.f32 f1918, f1917, 0fBF5DB3D7; +add.f32 f1919, f1918, f1916; +sub.f32 f1920, f1916, f1918; +add.f32 f2280, f1858, f2281; +mul.f32 f1921, f2281, 0f3F000000; +sub.f32 f1922, f1858, f1921; +sub.f32 f1923, f1840, f1849; +mul.f32 f1924, f1923, 0fBF5DB3D7; +sub.f32 f1925, f1922, f1924; +add.f32 f1926, f1924, f1922; +mul.f32 f2278, f1903, 0f3F441B7D; +mul.f32 f2279, f1909, 0f3F248DBB; +sub.f32 f1929, f2278, f2279; +mul.f32 f1930, f1909, 0f3F441B7D; +fma.rn.f32 f1931, f1903, 0f3F248DBB, f1930; +mul.f32 f2276, f1919, 0f3E31D0D4; +mul.f32 f2277, f1925, 0f3F7C1C5C; +sub.f32 f1934, f2276, f2277; +mul.f32 f1935, f1925, 0f3E31D0D4; +fma.rn.f32 f1936, f1919, 0f3F7C1C5C, f1935; +mul.f32 f2274, f1904, 0f3E31D0D4; +mul.f32 f2275, f1910, 0f3F7C1C5C; +sub.f32 f1939, f2274, f2275; +mul.f32 f1940, f1910, 0f3E31D0D4; +fma.rn.f32 f1941, f1904, 0f3F7C1C5C, f1940; +mul.f32 f2272, f1920, 0fBF708FB2; +mul.f32 f2273, f1926, 0f3EAF1D44; +sub.f32 f1944, f2272, f2273; +mul.f32 f1945, f1926, 0fBF708FB2; +fma.rn.f32 f1946, f1920, 0f3EAF1D44, f1945; +add.f32 f1947, f1896, f1912; +mul.f32 f1949, f1947, 0f3F000000; +sub.f32 f1950, f1880, f1949; +add.f32 f2271, f2282, f2280; +sub.f32 f1951, f2282, f2280; +mul.f32 f1952, f1951, 0fBF5DB3D7; +mul.f32 f1953, f2271, 0f3F000000; +sub.f32 f1954, f2284, f1953; +sub.f32 f1955, f1896, f1912; +mul.f32 f1956, f1955, 0fBF5DB3D7; +add.f32 f1957, f1929, f1934; +mul.f32 f1959, f1957, 0f3F000000; +sub.f32 f1960, f1887, 
f1959; +add.f32 f2270, f1931, f1936; +sub.f32 f1961, f1931, f1936; +mul.f32 f1962, f1961, 0fBF5DB3D7; +mul.f32 f1963, f2270, 0f3F000000; +sub.f32 f1964, f1893, f1963; +sub.f32 f1965, f1929, f1934; +mul.f32 f1966, f1965, 0fBF5DB3D7; +add.f32 f1967, f1939, f1944; +mul.f32 f1969, f1967, 0f3F000000; +sub.f32 f1970, f1888, f1969; +add.f32 f2269, f1941, f1946; +sub.f32 f1971, f1941, f1946; +mul.f32 f1972, f1971, 0fBF5DB3D7; +mul.f32 f1973, f2269, 0f3F000000; +sub.f32 f1974, f1894, f1973; +sub.f32 f1975, f1939, f1944; +mul.f32 f1976, f1975, 0fBF5DB3D7; +add.f32 f1977, f1835, f1844; +add.f32 f1978, f1826, f1977; +mul.f32 f1981, f1977, 0f3F000000; +sub.f32 f1982, f1826, f1981; +add.f32 f2268, f1862, f1871; +sub.f32 f1983, f1862, f1871; +mul.f32 f1984, f1983, 0fBF5DB3D7; +add.f32 f1985, f1984, f1982; +sub.f32 f1986, f1982, f1984; +add.f32 f2267, f1853, f2268; +mul.f32 f1987, f2268, 0f3F000000; +sub.f32 f1988, f1853, f1987; +sub.f32 f1989, f1835, f1844; +mul.f32 f1990, f1989, 0fBF5DB3D7; +sub.f32 f1991, f1988, f1990; +add.f32 f1992, f1990, f1988; +add.f32 f1993, f1838, f1847; +add.f32 f1994, f1829, f1993; +mul.f32 f1997, f1993, 0f3F000000; +sub.f32 f1998, f1829, f1997; +add.f32 f2266, f1865, f1874; +sub.f32 f1999, f1865, f1874; +mul.f32 f2000, f1999, 0fBF5DB3D7; +add.f32 f2001, f2000, f1998; +sub.f32 f2002, f1998, f2000; +add.f32 f2265, f1856, f2266; +mul.f32 f2003, f2266, 0f3F000000; +sub.f32 f2004, f1856, f2003; +sub.f32 f2005, f1838, f1847; +mul.f32 f2006, f2005, 0fBF5DB3D7; +sub.f32 f2007, f2004, f2006; +add.f32 f2008, f2006, f2004; +add.f32 f2009, f1841, f1850; +add.f32 f2010, f1832, f2009; +mul.f32 f2013, f2009, 0f3F000000; +sub.f32 f2014, f1832, f2013; +add.f32 f2264, f1868, f1877; +sub.f32 f2015, f1868, f1877; +mul.f32 f2016, f2015, 0fBF5DB3D7; +add.f32 f2017, f2016, f2014; +sub.f32 f2018, f2014, f2016; +add.f32 f2263, f1859, f2264; +mul.f32 f2019, f2264, 0f3F000000; +sub.f32 f2020, f1859, f2019; +sub.f32 f2021, f1841, f1850; +mul.f32 f2022, f2021, 0fBF5DB3D7; 
+sub.f32 f2023, f2020, f2022; +add.f32 f2024, f2022, f2020; +mul.f32 f2261, f2001, 0f3F441B7D; +mul.f32 f2262, f2007, 0f3F248DBB; +sub.f32 f2027, f2261, f2262; +mul.f32 f2028, f2007, 0f3F441B7D; +fma.rn.f32 f2029, f2001, 0f3F248DBB, f2028; +mul.f32 f2031, f2023, 0f3F7C1C5C; +mul.f32 f2260, f2017, 0f3E31D0D4; +sub.f32 f2032, f2260, f2031; +mul.f32 f2033, f2023, 0f3E31D0D4; +fma.rn.f32 f2034, f2017, 0f3F7C1C5C, f2033; +mul.f32 f2036, f2008, 0f3F7C1C5C; +mul.f32 f2259, f2002, 0f3E31D0D4; +sub.f32 f2037, f2259, f2036; +mul.f32 f2038, f2008, 0f3E31D0D4; +fma.rn.f32 f2039, f2002, 0f3F7C1C5C, f2038; +mul.f32 f2041, f2024, 0f3EAF1D44; +mul.f32 f2258, f2018, 0fBF708FB2; +sub.f32 f2042, f2258, f2041; +mul.f32 f2043, f2024, 0fBF708FB2; +fma.rn.f32 f2044, f2018, 0f3EAF1D44, f2043; +add.f32 f2045, f1994, f2010; +mul.f32 f2047, f2045, 0f3F000000; +sub.f32 f2048, f1978, f2047; +add.f32 f2257, f2265, f2263; +sub.f32 f2049, f2265, f2263; +mul.f32 f2050, f2049, 0fBF5DB3D7; +mul.f32 f2051, f2257, 0f3F000000; +sub.f32 f2052, f2267, f2051; +sub.f32 f2053, f1994, f2010; +mul.f32 f2054, f2053, 0fBF5DB3D7; +add.f32 f2055, f2027, f2032; +mul.f32 f2057, f2055, 0f3F000000; +sub.f32 f2058, f1985, f2057; +add.f32 f2256, f2029, f2034; +sub.f32 f2059, f2029, f2034; +mul.f32 f2060, f2059, 0fBF5DB3D7; +mul.f32 f2061, f2256, 0f3F000000; +sub.f32 f2062, f1991, f2061; +sub.f32 f2063, f2027, f2032; +mul.f32 f2064, f2063, 0fBF5DB3D7; +add.f32 f2065, f2037, f2042; +mul.f32 f2067, f2065, 0f3F000000; +sub.f32 f2068, f1986, f2067; +add.f32 f2255, f2039, f2044; +sub.f32 f2069, f2039, f2044; +mul.f32 f2070, f2069, 0fBF5DB3D7; +mul.f32 f2071, f2255, 0f3F000000; +sub.f32 f2072, f1992, f2071; +sub.f32 f2073, f2037, f2042; +mul.f32 f2074, f2073, 0fBF5DB3D7; +add.f32 f2075, f1836, f1845; +add.f32 f2076, f1827, f2075; +mul.f32 f2079, f2075, 0f3F000000; +sub.f32 f2080, f1827, f2079; +add.f32 f2254, f1863, f1872; +sub.f32 f2081, f1863, f1872; +mul.f32 f2082, f2081, 0fBF5DB3D7; +add.f32 f2083, f2082, f2080; +sub.f32 
f2084, f2080, f2082; +add.f32 f2253, f1854, f2254; +mul.f32 f2085, f2254, 0f3F000000; +sub.f32 f2086, f1854, f2085; +sub.f32 f2087, f1836, f1845; +mul.f32 f2088, f2087, 0fBF5DB3D7; +sub.f32 f2089, f2086, f2088; +add.f32 f2090, f2088, f2086; +add.f32 f2091, f1839, f1848; +add.f32 f2092, f1830, f2091; +mul.f32 f2095, f2091, 0f3F000000; +sub.f32 f2096, f1830, f2095; +add.f32 f2252, f1866, f1875; +sub.f32 f2097, f1866, f1875; +mul.f32 f2098, f2097, 0fBF5DB3D7; +add.f32 f2099, f2098, f2096; +sub.f32 f2100, f2096, f2098; +add.f32 f2251, f1857, f2252; +mul.f32 f2101, f2252, 0f3F000000; +sub.f32 f2102, f1857, f2101; +sub.f32 f2103, f1839, f1848; +mul.f32 f2104, f2103, 0fBF5DB3D7; +sub.f32 f2105, f2102, f2104; +add.f32 f2106, f2104, f2102; +add.f32 f2107, f1842, f1851; +add.f32 f2108, f1833, f2107; +mul.f32 f2111, f2107, 0f3F000000; +sub.f32 f2112, f1833, f2111; +add.f32 f2250, f1869, f1878; +sub.f32 f2113, f1869, f1878; +mul.f32 f2114, f2113, 0fBF5DB3D7; +add.f32 f2115, f2114, f2112; +sub.f32 f2116, f2112, f2114; +add.f32 f2249, f1860, f2250; +mul.f32 f2117, f2250, 0f3F000000; +sub.f32 f2118, f1860, f2117; +sub.f32 f2119, f1842, f1851; +mul.f32 f2120, f2119, 0fBF5DB3D7; +sub.f32 f2121, f2118, f2120; +add.f32 f2122, f2120, f2118; +mul.f32 f2124, f2105, 0f3F248DBB; +mul.f32 f2248, f2099, 0f3F441B7D; +sub.f32 f2125, f2248, f2124; +mul.f32 f2126, f2105, 0f3F441B7D; +fma.rn.f32 f2127, f2099, 0f3F248DBB, f2126; +mul.f32 f2129, f2121, 0f3F7C1C5C; +mul.f32 f2247, f2115, 0f3E31D0D4; +sub.f32 f2130, f2247, f2129; +mul.f32 f2131, f2121, 0f3E31D0D4; +fma.rn.f32 f2132, f2115, 0f3F7C1C5C, f2131; +mul.f32 f2134, f2106, 0f3F7C1C5C; +mul.f32 f2246, f2100, 0f3E31D0D4; +sub.f32 f2135, f2246, f2134; +mul.f32 f2136, f2106, 0f3E31D0D4; +fma.rn.f32 f2137, f2100, 0f3F7C1C5C, f2136; +mul.f32 f2244, f2116, 0fBF708FB2; +mul.f32 f2245, f2122, 0f3EAF1D44; +sub.f32 f2140, f2244, f2245; +mul.f32 f2141, f2122, 0fBF708FB2; +fma.rn.f32 f2142, f2116, 0f3EAF1D44, f2141; +add.f32 f2143, f2092, f2108; +mul.f32 
f2145, f2143, 0f3F000000; +sub.f32 f2146, f2076, f2145; +add.f32 f2243, f2251, f2249; +sub.f32 f2147, f2251, f2249; +mul.f32 f2148, f2147, 0fBF5DB3D7; +mul.f32 f2149, f2243, 0f3F000000; +sub.f32 f2150, f2253, f2149; +sub.f32 f2151, f2092, f2108; +mul.f32 f2152, f2151, 0fBF5DB3D7; +add.f32 f2153, f2125, f2130; +mul.f32 f2155, f2153, 0f3F000000; +sub.f32 f2156, f2083, f2155; +add.f32 f2242, f2127, f2132; +sub.f32 f2157, f2127, f2132; +mul.f32 f2158, f2157, 0fBF5DB3D7; +mul.f32 f2159, f2242, 0f3F000000; +sub.f32 f2160, f2089, f2159; +sub.f32 f2161, f2125, f2130; +mul.f32 f2162, f2161, 0fBF5DB3D7; +add.f32 f2163, f2135, f2140; +mul.f32 f2165, f2163, 0f3F000000; +sub.f32 f2166, f2084, f2165; +add.f32 f2241, f2137, f2142; +sub.f32 f2167, f2137, f2142; +mul.f32 f2168, f2167, 0fBF5DB3D7; +mul.f32 f2169, f2241, 0f3F000000; +sub.f32 f2170, f2090, f2169; +sub.f32 f2171, f2135, f2140; +mul.f32 f2592, f2143, 0f3F000000; +sub.f32 f2591, f2076, f2592; +mul.f32 f2172, f2171, 0fBF5DB3D7; +add.f32 %0, f1880, f1947; +mul.f32 f2594, f2242, 0f3F000000; +sub.f32 f2593, f2089, f2594; +add.f32 %1, f2284, f2271; +mul.f32 f2596, f1967, 0f3F000000; +sub.f32 f2595, f1888, f2596; +mul.f32 f2598, f2243, 0f3F000000; +sub.f32 f2597, f2253, f2598; +add.f32 %2, f1978, f2045; +add.f32 %3, f2267, f2257; +add.f32 %4, f2076, f2143; +add.f32 %5, f2253, f2243; +add.f32 %7, f1893, f2270; +add.f32 %6, f1887, f1957; +add.f32 %9, f1991, f2256; +add.f32 %8, f1985, f2055; +add.f32 %11, f2089, f2242; +add.f32 %10, f2083, f2153; +add.f32 %13, f1894, f2269; +add.f32 %12, f1888, f1967; +add.f32 %15, f1992, f2255; +add.f32 %14, f1986, f2065; +add.f32 %17, f2090, f2241; +add.f32 %16, f2084, f2163; +add.f32 %18, f1952, f1950; +sub.f32 %19, f1954, f1956; +add.f32 %20, f2050, f2048; +sub.f32 %21, f2052, f2054; +add.f32 %22, f2148, f2591; +sub.f32 %23, f2597, f2152; +add.f32 %24, f1962, f1960; +sub.f32 %25, f1964, f1966; +add.f32 %26, f2060, f2058; +sub.f32 %27, f2062, f2064; +add.f32 %28, f2158, f2156; +sub.f32 %29, 
f2593, f2162; +add.f32 %30, f1972, f2595; +sub.f32 %31, f1974, f1976; +sub.f32 %33, f2072, f2074; +add.f32 %32, f2070, f2068; +sub.f32 %35, f2170, f2172; +add.f32 %34, f2168, f2166; +sub.f32 %36, f1950, f1952; +add.f32 %37, f1956, f1954; +sub.f32 %38, f2048, f2050; +add.f32 %39, f2054, f2052; +sub.f32 %40, f2591, f2148; +add.f32 %41, f2152, f2597; +add.f32 %43, f1966, f1964; +sub.f32 %42, f1960, f1962; +add.f32 %45, f2064, f2062; +sub.f32 %44, f2058, f2060; +add.f32 %47, f2162, f2593; +sub.f32 %46, f2156, f2158; +add.f32 %49, f1976, f1974; +sub.f32 %48, f2595, f1972; +add.f32 %51, f2074, f2072; +sub.f32 %50, f2068, f2070; +add.f32 %53, f2172, f2170; +sub.f32 %52, f2166, f2168; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_6561), "l"(lut_sp_27_243), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), 
"f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<355, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<846>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 52488, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, 
f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 
f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 52488, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, 
f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r9+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r9+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r9+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r9+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r9+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r9+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r9+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r9+64], {f230, f231}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+5832]; +ld.shared.v2.f32 {f240, f241}, [r11+11664]; +ld.shared.v2.f32 {f244, f245}, [r11+17496]; +ld.shared.v2.f32 {f248, f249}, [r11+23328]; +ld.shared.v2.f32 {f252, f253}, [r11+29160]; +ld.shared.v2.f32 {f256, f257}, [r11+34992]; +ld.shared.v2.f32 {f260, f261}, [r11+40824]; +ld.shared.v2.f32 {f264, f265}, [r11+46656]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0fBF5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; 
+mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0fBF5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0fBF5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0fBF5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0fBF5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0fBF5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0f3F248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0f3F248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0f3F7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0f3F7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0f3F7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0f3F7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0f3EAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0f3EAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 
f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0fBF5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0fBF5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f353, f383; +mul.f32 f387, f351, f383; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f369, f393; +mul.f32 f395, f367, f393; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f348, f401; +mul.f32 f403, f342, f401; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; 
+mul.f32 f410, f364, f409; +mul.f32 f411, f358, f409; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f380, f417; +mul.f32 f419, f374, f417; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f349, f425; +mul.f32 f427, f343, f425; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f365, f433; +mul.f32 f435, f359, f433; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f381, f441; +mul.f32 f443, f375, f441; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f382, f351, f386; +sub.f32 f448, f388, f387; +st.shared.v2.f32 [r17+72], {f447, f448}; +fma.rn.f32 f449, f391, f367, f394; +sub.f32 f450, f396, f395; +st.shared.v2.f32 [r17+144], {f449, f450}; +fma.rn.f32 f451, f399, f342, f402; +sub.f32 f452, f404, f403; +st.shared.v2.f32 [r17+216], {f451, f452}; +fma.rn.f32 f453, f407, f358, f410; +sub.f32 f454, f412, f411; +st.shared.v2.f32 [r17+288], {f453, f454}; +fma.rn.f32 f455, f415, f374, f418; +sub.f32 f456, f420, f419; +st.shared.v2.f32 [r17+360], {f455, f456}; +fma.rn.f32 f457, f423, f343, f426; +sub.f32 f458, f428, f427; +st.shared.v2.f32 [r17+432], {f457, f458}; +sub.f32 f459, f436, f435; +fma.rn.f32 f460, f431, f359, f434; +st.shared.v2.f32 [r17+504], {f460, f459}; +fma.rn.f32 f461, f439, f375, f442; +sub.f32 f462, f444, f443; +st.shared.v2.f32 
[r17+576], {f461, f462}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+5832]; +ld.shared.v2.f32 {f471, f472}, [r11+11664]; +ld.shared.v2.f32 {f475, f476}, [r11+17496]; +ld.shared.v2.f32 {f479, f480}, [r11+23328]; +ld.shared.v2.f32 {f483, f484}, [r11+29160]; +ld.shared.v2.f32 {f487, f488}, [r11+34992]; +ld.shared.v2.f32 {f491, f492}, [r11+40824]; +ld.shared.v2.f32 {f495, f496}, [r11+46656]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; +mul.f32 f503, f499, 0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0fBF5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0fBF5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0fBF5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; +sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0fBF5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0fBF5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0fBF5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0f3F248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0f3F248DBB, f550; +mul.f32 f552, f539, 
0f3E31D0D4; +mul.f32 f553, f545, 0f3F7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0f3F7C1C5C, f555; +mul.f32 f557, f524, 0f3E31D0D4; +mul.f32 f558, f530, 0f3F7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0f3F7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0f3EAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0f3EAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0fBF5DB3D7; +add.f32 f573, f572, f570; +sub.f32 f574, f570, f572; +mul.f32 f575, f568, 0f3F000000; +sub.f32 f576, f502, f575; +sub.f32 f577, f516, f532; +mul.f32 f578, f577, 0fBF5DB3D7; +sub.f32 f579, f576, f578; +add.f32 f580, f578, f576; +add.f32 f581, f549, f554; +add.f32 f582, f507, f581; +add.f32 f583, f551, f556; +add.f32 f584, f513, f583; +mul.f32 f585, f581, 0f3F000000; +sub.f32 f586, f507, f585; +sub.f32 f587, f551, f556; +mul.f32 f588, f587, 0fBF5DB3D7; +add.f32 f589, f588, f586; +sub.f32 f590, f586, f588; +mul.f32 f591, f583, 0f3F000000; +sub.f32 f592, f513, f591; +sub.f32 f593, f549, f554; +mul.f32 f594, f593, 0fBF5DB3D7; +sub.f32 f595, f592, f594; +add.f32 f596, f594, f592; +add.f32 f597, f559, f564; +add.f32 f598, f508, f597; +add.f32 f599, f561, f566; +add.f32 f600, f514, f599; +mul.f32 f601, f597, 0f3F000000; +sub.f32 f602, f508, f601; +sub.f32 f603, f561, f566; +mul.f32 f604, f603, 0fBF5DB3D7; +add.f32 f605, f604, f602; +sub.f32 f606, f602, f604; +mul.f32 f607, f599, 0f3F000000; +sub.f32 f608, f514, f607; +sub.f32 f609, f559, f564; +mul.f32 f610, f609, 0fBF5DB3D7; +sub.f32 f611, f608, f610; +add.f32 f612, f610, f608; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, 
rd14; +ld.global.v2.f32 {f613, f614}, [rd16]; +mul.f32 f617, f584, f614; +mul.f32 f618, f582, f614; +mul.f32 f619, f613, f584; +mul.f32 f620, f613, f613; +mul.f32 f621, f614, f614; +sub.f32 f622, f620, f621; +mul.f32 f623, f614, f613; +fma.rn.f32 f624, f614, f613, f623; +mul.f32 f625, f600, f624; +mul.f32 f626, f598, f624; +mul.f32 f627, f622, f600; +mul.f32 f628, f613, f622; +mul.f32 f629, f614, f624; +sub.f32 f630, f628, f629; +mul.f32 f631, f613, f624; +fma.rn.f32 f632, f614, f622, f631; +mul.f32 f633, f579, f632; +mul.f32 f634, f573, f632; +mul.f32 f635, f630, f579; +mul.f32 f636, f613, f630; +mul.f32 f637, f614, f632; +sub.f32 f638, f636, f637; +mul.f32 f639, f613, f632; +fma.rn.f32 f640, f614, f630, f639; +mul.f32 f641, f595, f640; +mul.f32 f642, f589, f640; +mul.f32 f643, f638, f595; +mul.f32 f644, f613, f638; +mul.f32 f645, f614, f640; +sub.f32 f646, f644, f645; +mul.f32 f647, f613, f640; +fma.rn.f32 f648, f614, f638, f647; +mul.f32 f649, f611, f648; +mul.f32 f650, f605, f648; +mul.f32 f651, f646, f611; +mul.f32 f652, f613, f646; +mul.f32 f653, f614, f648; +sub.f32 f654, f652, f653; +mul.f32 f655, f613, f648; +fma.rn.f32 f656, f614, f646, f655; +mul.f32 f657, f580, f656; +mul.f32 f658, f574, f656; +mul.f32 f659, f654, f580; +mul.f32 f660, f613, f654; +mul.f32 f661, f614, f656; +sub.f32 f662, f660, f661; +mul.f32 f663, f613, f656; +fma.rn.f32 f664, f614, f654, f663; +mul.f32 f665, f596, f664; +mul.f32 f666, f590, f664; +mul.f32 f667, f662, f596; +mul.f32 f668, f613, f662; +mul.f32 f669, f614, f664; +sub.f32 f670, f668, f669; +mul.f32 f671, f613, f664; +fma.rn.f32 f672, f614, f662, f671; +mul.f32 f673, f612, f672; +mul.f32 f674, f606, f672; +mul.f32 f675, f670, f612; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +add.f32 f676, f502, f568; +add.f32 f677, f500, f567; +st.shared.v2.f32 [r23], {f677, f676}; +fma.rn.f32 f678, f613, f582, f617; +sub.f32 f679, f619, f618; +st.shared.v2.f32 [r23+648], {f678, f679}; 
+fma.rn.f32 f680, f622, f598, f625; +sub.f32 f681, f627, f626; +st.shared.v2.f32 [r23+1296], {f680, f681}; +fma.rn.f32 f682, f630, f573, f633; +sub.f32 f683, f635, f634; +st.shared.v2.f32 [r23+1944], {f682, f683}; +fma.rn.f32 f684, f638, f589, f641; +sub.f32 f685, f643, f642; +st.shared.v2.f32 [r23+2592], {f684, f685}; +fma.rn.f32 f686, f646, f605, f649; +sub.f32 f687, f651, f650; +st.shared.v2.f32 [r23+3240], {f686, f687}; +fma.rn.f32 f688, f654, f574, f657; +sub.f32 f689, f659, f658; +st.shared.v2.f32 [r23+3888], {f688, f689}; +sub.f32 f690, f667, f666; +fma.rn.f32 f691, f662, f590, f665; +st.shared.v2.f32 [r23+4536], {f691, f690}; +fma.rn.f32 f692, f670, f606, f673; +sub.f32 f693, f675, f674; +st.shared.v2.f32 [r23+5184], {f692, f693}; +barrier.sync 0; +ld.shared.v2.f32 {f694, f695}, [r11]; +ld.shared.v2.f32 {f698, f699}, [r11+5832]; +ld.shared.v2.f32 {f702, f703}, [r11+11664]; +ld.shared.v2.f32 {f706, f707}, [r11+17496]; +ld.shared.v2.f32 {f710, f711}, [r11+23328]; +ld.shared.v2.f32 {f714, f715}, [r11+29160]; +ld.shared.v2.f32 {f718, f719}, [r11+34992]; +ld.shared.v2.f32 {f722, f723}, [r11+40824]; +ld.shared.v2.f32 {f726, f727}, [r11+46656]; +add.f32 f730, f706, f718; +add.f32 f731, f694, f730; +add.f32 f732, f707, f719; +add.f32 f733, f695, f732; +mul.f32 f734, f730, 0f3F000000; +sub.f32 f735, f694, f734; +sub.f32 f736, f707, f719; +mul.f32 f737, f736, 0fBF5DB3D7; +add.f32 f738, f737, f735; +sub.f32 f739, f735, f737; +mul.f32 f740, f732, 0f3F000000; +sub.f32 f741, f695, f740; +sub.f32 f742, f706, f718; +mul.f32 f743, f742, 0fBF5DB3D7; +sub.f32 f744, f741, f743; +add.f32 f745, f743, f741; +add.f32 f746, f710, f722; +add.f32 f747, f698, f746; +add.f32 f748, f711, f723; +add.f32 f749, f699, f748; +mul.f32 f750, f746, 0f3F000000; +sub.f32 f751, f698, f750; +sub.f32 f752, f711, f723; +mul.f32 f753, f752, 0fBF5DB3D7; +add.f32 f754, f753, f751; +sub.f32 f755, f751, f753; +mul.f32 f756, f748, 0f3F000000; +sub.f32 f757, f699, f756; +sub.f32 f758, f710, f722; +mul.f32 
f759, f758, 0fBF5DB3D7; +sub.f32 f760, f757, f759; +add.f32 f761, f759, f757; +add.f32 f762, f714, f726; +add.f32 f763, f702, f762; +add.f32 f764, f715, f727; +add.f32 f765, f703, f764; +mul.f32 f766, f762, 0f3F000000; +sub.f32 f767, f702, f766; +sub.f32 f768, f715, f727; +mul.f32 f769, f768, 0fBF5DB3D7; +add.f32 f770, f769, f767; +sub.f32 f771, f767, f769; +mul.f32 f772, f764, 0f3F000000; +sub.f32 f773, f703, f772; +sub.f32 f774, f714, f726; +mul.f32 f775, f774, 0fBF5DB3D7; +sub.f32 f776, f773, f775; +add.f32 f777, f775, f773; +mul.f32 f778, f754, 0f3F441B7D; +mul.f32 f779, f760, 0f3F248DBB; +sub.f32 f780, f778, f779; +mul.f32 f781, f760, 0f3F441B7D; +fma.rn.f32 f782, f754, 0f3F248DBB, f781; +mul.f32 f783, f770, 0f3E31D0D4; +mul.f32 f784, f776, 0f3F7C1C5C; +sub.f32 f785, f783, f784; +mul.f32 f786, f776, 0f3E31D0D4; +fma.rn.f32 f787, f770, 0f3F7C1C5C, f786; +mul.f32 f788, f755, 0f3E31D0D4; +mul.f32 f789, f761, 0f3F7C1C5C; +sub.f32 f790, f788, f789; +mul.f32 f791, f761, 0f3E31D0D4; +fma.rn.f32 f792, f755, 0f3F7C1C5C, f791; +mul.f32 f793, f771, 0fBF708FB2; +mul.f32 f794, f777, 0f3EAF1D44; +sub.f32 f795, f793, f794; +mul.f32 f796, f777, 0fBF708FB2; +fma.rn.f32 f797, f771, 0f3EAF1D44, f796; +add.f32 f798, f747, f763; +add.f32 f799, f749, f765; +mul.f32 f800, f798, 0f3F000000; +sub.f32 f801, f731, f800; +sub.f32 f802, f749, f765; +mul.f32 f803, f802, 0fBF5DB3D7; +mul.f32 f804, f799, 0f3F000000; +sub.f32 f805, f733, f804; +sub.f32 f806, f747, f763; +mul.f32 f807, f806, 0fBF5DB3D7; +add.f32 f808, f780, f785; +add.f32 f809, f782, f787; +mul.f32 f810, f808, 0f3F000000; +sub.f32 f811, f738, f810; +sub.f32 f812, f782, f787; +mul.f32 f813, f812, 0fBF5DB3D7; +mul.f32 f814, f809, 0f3F000000; +sub.f32 f815, f744, f814; +sub.f32 f816, f780, f785; +mul.f32 f817, f816, 0fBF5DB3D7; +add.f32 f818, f790, f795; +add.f32 f819, f792, f797; +mul.f32 f820, f818, 0f3F000000; +sub.f32 f821, f739, f820; +sub.f32 f822, f792, f797; +mul.f32 f823, f822, 0fBF5DB3D7; +mul.f32 f824, f819, 
0f3F000000; +sub.f32 f825, f745, f824; +sub.f32 f826, f790, f795; +mul.f32 f827, f826, 0fBF5DB3D7; +add.f32 %1, f733, f799; +add.f32 %0, f731, f798; +add.f32 %3, f744, f809; +add.f32 %2, f738, f808; +add.f32 %5, f745, f819; +add.f32 %4, f739, f818; +sub.f32 %7, f805, f807; +add.f32 %6, f803, f801; +sub.f32 %9, f815, f817; +add.f32 %8, f813, f811; +sub.f32 %11, f825, f827; +add.f32 %10, f823, f821; +add.f32 %13, f807, f805; +sub.f32 %12, f801, f803; +add.f32 %15, f817, f815; +sub.f32 %14, f811, f813; +add.f32 %17, f827, f825; +sub.f32 %16, f821, f823; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_6561), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<354, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<792>; +.reg .b32 r<24>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 26244, r2; +add.f32 f37, %30, %38; +add.f32 f38, %22, f37; +add.f32 f39, %31, %39; +add.f32 f40, %23, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %22, f41; +sub.f32 f43, %31, %39; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %23, f47; +sub.f32 f49, %30, %38; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 
f52, f50, f48; +add.f32 f53, %32, %40; +add.f32 f54, %24, f53; +add.f32 f55, %34, %42; +add.f32 f56, %26, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %24, f57; +sub.f32 f59, %34, %42; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %26, f63; +sub.f32 f65, %32, %40; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %35, %43; +add.f32 f70, %27, f69; +add.f32 f71, %37, %44; +add.f32 f72, %29, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %27, f73; +sub.f32 f75, %37, %44; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %29, f79; +sub.f32 f81, %35, %43; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 
f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; 
+mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r8, r5, 26244, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f158; +st.shared.f32 [r9+8], f168; +st.shared.f32 [r9+12], f178; +st.shared.f32 [r9+16], f188; +st.shared.f32 [r9+20], f198; +st.shared.f32 [r9+24], f208; +st.shared.f32 [r9+28], f218; +st.shared.f32 [r9+32], f228; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+2916]; +ld.shared.f32 f234, [r11+5832]; +ld.shared.f32 f235, [r11+8748]; +ld.shared.f32 f236, [r11+11664]; +ld.shared.f32 f237, [r11+14580]; +ld.shared.f32 f238, [r11+17496]; +ld.shared.f32 f239, [r11+20412]; +ld.shared.f32 f240, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], 
f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+2916]; +ld.shared.f32 f243, [r11+5832]; +ld.shared.f32 f244, [r11+8748]; +ld.shared.f32 f245, [r11+11664]; +ld.shared.f32 f246, [r11+14580]; +ld.shared.f32 f247, [r11+17496]; +ld.shared.f32 f248, [r11+20412]; +ld.shared.f32 f249, [r11+23328]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0fBF5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0fBF5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0fBF5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0fBF5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0fBF5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0f3F248DBB; +sub.f32 f300, f298, f299; +mul.f32 
f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0f3F248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0f3F7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0f3F7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0f3F7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0f3F7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0f3EAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0f3EAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0fBF5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0fBF5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; 
+cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f337, f367; +fma.rn.f32 f371, f366, f335, f370; +mul.f32 f372, f335, f367; +mul.f32 f373, f366, f337; +sub.f32 f374, f373, f372; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f353, f379; +fma.rn.f32 f381, f377, f351, f380; +mul.f32 f382, f351, f379; +mul.f32 f383, f377, f353; +sub.f32 f384, f383, f382; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f332, f389; +fma.rn.f32 f391, f387, f326, f390; +mul.f32 f392, f326, f389; +mul.f32 f393, f387, f332; +sub.f32 f394, f393, f392; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f348, f399; +fma.rn.f32 f401, f397, f342, f400; +mul.f32 f402, f342, f399; +mul.f32 f403, f397, f348; +sub.f32 f404, f403, f402; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f364, f409; +fma.rn.f32 f411, f407, f358, f410; +mul.f32 f412, f358, f409; +mul.f32 f413, f407, f364; +sub.f32 f414, f413, f412; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f333, f419; +fma.rn.f32 f421, f417, f327, f420; +mul.f32 f422, f327, f419; +mul.f32 f423, f417, f333; +sub.f32 f424, f423, f422; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f349, f429; +fma.rn.f32 f431, f427, f343, f430; +mul.f32 f432, f343, 
f429; +mul.f32 f433, f427, f349; +sub.f32 f434, f433, f432; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f365, f439; +fma.rn.f32 f441, f437, f359, f440; +mul.f32 f442, f359, f439; +mul.f32 f443, f437, f365; +sub.f32 f444, f443, f442; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f371; +st.shared.f32 [r17+72], f381; +st.shared.f32 [r17+108], f391; +st.shared.f32 [r17+144], f401; +st.shared.f32 [r17+180], f411; +st.shared.f32 [r17+216], f421; +st.shared.f32 [r17+252], f431; +st.shared.f32 [r17+288], f441; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+2916]; +ld.shared.f32 f447, [r11+5832]; +ld.shared.f32 f448, [r11+8748]; +ld.shared.f32 f449, [r11+11664]; +ld.shared.f32 f450, [r11+14580]; +ld.shared.f32 f451, [r11+17496]; +ld.shared.f32 f452, [r11+20412]; +ld.shared.f32 f453, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; +st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+2916]; +ld.shared.f32 f456, [r11+5832]; +ld.shared.f32 f457, [r11+8748]; +ld.shared.f32 f458, [r11+11664]; +ld.shared.f32 f459, [r11+14580]; +ld.shared.f32 f460, [r11+17496]; +ld.shared.f32 f461, [r11+20412]; +ld.shared.f32 f462, [r11+23328]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0fBF5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; 
+sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0fBF5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0fBF5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0fBF5DB3D7; +sub.f32 f493, f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0fBF5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, f507, 0fBF5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0f3F248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0f3F248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0f3F7C1C5C; +sub.f32 f518, f516, f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 0f3F7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0f3F7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0f3F7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0f3EAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0f3EAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f464, f531; +add.f32 f533, f482, f498; +add.f32 f534, f466, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f464, f535; +sub.f32 f537, f482, f498; +mul.f32 f538, f537, 0fBF5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, 
f533, 0f3F000000; +sub.f32 f542, f466, f541; +sub.f32 f543, f480, f496; +mul.f32 f544, f543, 0fBF5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +add.f32 f547, f513, f518; +add.f32 f548, f471, f547; +add.f32 f549, f515, f520; +add.f32 f550, f477, f549; +mul.f32 f551, f547, 0f3F000000; +sub.f32 f552, f471, f551; +sub.f32 f553, f515, f520; +mul.f32 f554, f553, 0fBF5DB3D7; +add.f32 f555, f554, f552; +sub.f32 f556, f552, f554; +mul.f32 f557, f549, 0f3F000000; +sub.f32 f558, f477, f557; +sub.f32 f559, f513, f518; +mul.f32 f560, f559, 0fBF5DB3D7; +sub.f32 f561, f558, f560; +add.f32 f562, f560, f558; +add.f32 f563, f523, f528; +add.f32 f564, f472, f563; +add.f32 f565, f525, f530; +add.f32 f566, f478, f565; +mul.f32 f567, f563, 0f3F000000; +sub.f32 f568, f472, f567; +sub.f32 f569, f525, f530; +mul.f32 f570, f569, 0fBF5DB3D7; +add.f32 f571, f570, f568; +sub.f32 f572, f568, f570; +mul.f32 f573, f565, 0f3F000000; +sub.f32 f574, f478, f573; +sub.f32 f575, f523, f528; +mul.f32 f576, f575, 0fBF5DB3D7; +sub.f32 f577, f574, f576; +add.f32 f578, f576, f574; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f579, f580}, [rd16]; +mul.f32 f583, f550, f580; +fma.rn.f32 f584, f579, f548, f583; +mul.f32 f585, f548, f580; +mul.f32 f586, f579, f550; +sub.f32 f587, f586, f585; +mul.f32 f588, f579, f579; +mul.f32 f589, f580, f580; +sub.f32 f590, f588, f589; +mul.f32 f591, f580, f579; +fma.rn.f32 f592, f580, f579, f591; +mul.f32 f593, f566, f592; +fma.rn.f32 f594, f590, f564, f593; +mul.f32 f595, f564, f592; +mul.f32 f596, f590, f566; +sub.f32 f597, f596, f595; +mul.f32 f598, f579, f590; +mul.f32 f599, f580, f592; +sub.f32 f600, f598, f599; +mul.f32 f601, f579, f592; +fma.rn.f32 f602, f580, f590, f601; +mul.f32 f603, f545, f602; +fma.rn.f32 f604, f600, f539, f603; +mul.f32 f605, f539, f602; +mul.f32 
f606, f600, f545; +sub.f32 f607, f606, f605; +mul.f32 f608, f579, f600; +mul.f32 f609, f580, f602; +sub.f32 f610, f608, f609; +mul.f32 f611, f579, f602; +fma.rn.f32 f612, f580, f600, f611; +mul.f32 f613, f561, f612; +fma.rn.f32 f614, f610, f555, f613; +mul.f32 f615, f555, f612; +mul.f32 f616, f610, f561; +sub.f32 f617, f616, f615; +mul.f32 f618, f579, f610; +mul.f32 f619, f580, f612; +sub.f32 f620, f618, f619; +mul.f32 f621, f579, f612; +fma.rn.f32 f622, f580, f610, f621; +mul.f32 f623, f577, f622; +fma.rn.f32 f624, f620, f571, f623; +mul.f32 f625, f571, f622; +mul.f32 f626, f620, f577; +sub.f32 f627, f626, f625; +mul.f32 f628, f579, f620; +mul.f32 f629, f580, f622; +sub.f32 f630, f628, f629; +mul.f32 f631, f579, f622; +fma.rn.f32 f632, f580, f620, f631; +mul.f32 f633, f546, f632; +fma.rn.f32 f634, f630, f540, f633; +mul.f32 f635, f540, f632; +mul.f32 f636, f630, f546; +sub.f32 f637, f636, f635; +mul.f32 f638, f579, f630; +mul.f32 f639, f580, f632; +sub.f32 f640, f638, f639; +mul.f32 f641, f579, f632; +fma.rn.f32 f642, f580, f630, f641; +mul.f32 f643, f562, f642; +fma.rn.f32 f644, f640, f556, f643; +mul.f32 f645, f556, f642; +mul.f32 f646, f640, f562; +sub.f32 f647, f646, f645; +mul.f32 f648, f579, f640; +mul.f32 f649, f580, f642; +sub.f32 f650, f648, f649; +mul.f32 f651, f579, f642; +fma.rn.f32 f652, f580, f640, f651; +mul.f32 f653, f578, f652; +fma.rn.f32 f654, f650, f572, f653; +mul.f32 f655, f572, f652; +mul.f32 f656, f650, f578; +sub.f32 f657, f656, f655; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 2916, r22; +st.shared.f32 [r23], f532; +st.shared.f32 [r23+324], f584; +st.shared.f32 [r23+648], f594; +st.shared.f32 [r23+972], f604; +st.shared.f32 [r23+1296], f614; +st.shared.f32 [r23+1620], f624; +st.shared.f32 [r23+1944], f634; +st.shared.f32 [r23+2268], f644; +st.shared.f32 [r23+2592], f654; +barrier.sync 0; +ld.shared.f32 f658, [r11]; +ld.shared.f32 f659, [r11+2916]; +ld.shared.f32 f660, [r11+5832]; +ld.shared.f32 
f661, [r11+8748]; +ld.shared.f32 f662, [r11+11664]; +ld.shared.f32 f663, [r11+14580]; +ld.shared.f32 f664, [r11+17496]; +ld.shared.f32 f665, [r11+20412]; +ld.shared.f32 f666, [r11+23328]; +barrier.sync 0; +st.shared.f32 [r23], f534; +st.shared.f32 [r23+324], f587; +st.shared.f32 [r23+648], f597; +st.shared.f32 [r23+972], f607; +st.shared.f32 [r23+1296], f617; +st.shared.f32 [r23+1620], f627; +st.shared.f32 [r23+1944], f637; +st.shared.f32 [r23+2268], f647; +st.shared.f32 [r23+2592], f657; +barrier.sync 0; +ld.shared.f32 f667, [r11]; +ld.shared.f32 f668, [r11+2916]; +ld.shared.f32 f669, [r11+5832]; +ld.shared.f32 f670, [r11+8748]; +ld.shared.f32 f671, [r11+11664]; +ld.shared.f32 f672, [r11+14580]; +ld.shared.f32 f673, [r11+17496]; +ld.shared.f32 f674, [r11+20412]; +ld.shared.f32 f675, [r11+23328]; +add.f32 f676, f661, f664; +add.f32 f677, f658, f676; +add.f32 f678, f670, f673; +add.f32 f679, f667, f678; +mul.f32 f680, f676, 0f3F000000; +sub.f32 f681, f658, f680; +sub.f32 f682, f670, f673; +mul.f32 f683, f682, 0fBF5DB3D7; +add.f32 f684, f683, f681; +sub.f32 f685, f681, f683; +mul.f32 f686, f678, 0f3F000000; +sub.f32 f687, f667, f686; +sub.f32 f688, f661, f664; +mul.f32 f689, f688, 0fBF5DB3D7; +sub.f32 f690, f687, f689; +add.f32 f691, f689, f687; +add.f32 f692, f662, f665; +add.f32 f693, f659, f692; +add.f32 f694, f671, f674; +add.f32 f695, f668, f694; +mul.f32 f696, f692, 0f3F000000; +sub.f32 f697, f659, f696; +sub.f32 f698, f671, f674; +mul.f32 f699, f698, 0fBF5DB3D7; +add.f32 f700, f699, f697; +sub.f32 f701, f697, f699; +mul.f32 f702, f694, 0f3F000000; +sub.f32 f703, f668, f702; +sub.f32 f704, f662, f665; +mul.f32 f705, f704, 0fBF5DB3D7; +sub.f32 f706, f703, f705; +add.f32 f707, f705, f703; +add.f32 f708, f663, f666; +add.f32 f709, f660, f708; +add.f32 f710, f672, f675; +add.f32 f711, f669, f710; +mul.f32 f712, f708, 0f3F000000; +sub.f32 f713, f660, f712; +sub.f32 f714, f672, f675; +mul.f32 f715, f714, 0fBF5DB3D7; +add.f32 f716, f715, f713; +sub.f32 f717, f713, 
f715; +mul.f32 f718, f710, 0f3F000000; +sub.f32 f719, f669, f718; +sub.f32 f720, f663, f666; +mul.f32 f721, f720, 0fBF5DB3D7; +sub.f32 f722, f719, f721; +add.f32 f723, f721, f719; +mul.f32 f724, f700, 0f3F441B7D; +mul.f32 f725, f706, 0f3F248DBB; +sub.f32 f726, f724, f725; +mul.f32 f727, f706, 0f3F441B7D; +fma.rn.f32 f728, f700, 0f3F248DBB, f727; +mul.f32 f729, f716, 0f3E31D0D4; +mul.f32 f730, f722, 0f3F7C1C5C; +sub.f32 f731, f729, f730; +mul.f32 f732, f722, 0f3E31D0D4; +fma.rn.f32 f733, f716, 0f3F7C1C5C, f732; +mul.f32 f734, f701, 0f3E31D0D4; +mul.f32 f735, f707, 0f3F7C1C5C; +sub.f32 f736, f734, f735; +mul.f32 f737, f707, 0f3E31D0D4; +fma.rn.f32 f738, f701, 0f3F7C1C5C, f737; +mul.f32 f739, f717, 0fBF708FB2; +mul.f32 f740, f723, 0f3EAF1D44; +sub.f32 f741, f739, f740; +mul.f32 f742, f723, 0fBF708FB2; +fma.rn.f32 f743, f717, 0f3EAF1D44, f742; +add.f32 f744, f693, f709; +add.f32 f745, f695, f711; +mul.f32 f746, f744, 0f3F000000; +sub.f32 f747, f677, f746; +sub.f32 f748, f695, f711; +mul.f32 f749, f748, 0fBF5DB3D7; +mul.f32 f750, f745, 0f3F000000; +sub.f32 f751, f679, f750; +sub.f32 f752, f693, f709; +mul.f32 f753, f752, 0fBF5DB3D7; +add.f32 f754, f726, f731; +add.f32 f755, f728, f733; +mul.f32 f756, f754, 0f3F000000; +sub.f32 f757, f684, f756; +sub.f32 f758, f728, f733; +mul.f32 f759, f758, 0fBF5DB3D7; +mul.f32 f760, f755, 0f3F000000; +sub.f32 f761, f690, f760; +sub.f32 f762, f726, f731; +mul.f32 f763, f762, 0fBF5DB3D7; +add.f32 f764, f736, f741; +add.f32 f765, f738, f743; +mul.f32 f766, f764, 0f3F000000; +sub.f32 f767, f685, f766; +sub.f32 f768, f738, f743; +mul.f32 f769, f768, 0fBF5DB3D7; +mul.f32 f770, f765, 0f3F000000; +sub.f32 f771, f691, f770; +sub.f32 f772, f736, f741; +mul.f32 f773, f772, 0fBF5DB3D7; +add.f32 %0, f677, f744; +add.f32 %1, f679, f745; +add.f32 %3, f690, f755; +add.f32 %2, f684, f754; +add.f32 %5, f691, f765; +add.f32 %4, f685, f764; +add.f32 %6, f749, f747; +sub.f32 %7, f751, f753; +sub.f32 %9, f761, f763; +add.f32 %8, f759, f757; +sub.f32 %11, 
f771, f773; +add.f32 %10, f769, f767; +sub.f32 %12, f747, f749; +add.f32 %13, f753, f751; +add.f32 %15, f763, f761; +sub.f32 %14, f757, f759; +add.f32 %17, f773, f771; +sub.f32 %16, f767, f769; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_6561), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..a18a7a1699be8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_fwd.hpp.inc @@ -0,0 +1,1646 @@ +#ifndef CUFFTDX_FFT_6561_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_6561_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1165, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<843>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 104976, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; 
+sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; 
+mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 104976, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 
fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+11664]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r9+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r9+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; 
+fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r9+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r9+64], {fd222, fd221}; +fma.rn.f64 fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 [r9+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r9+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r9+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r9+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+11664]; +ld.shared.v2.f64 {fd239, fd240}, [r11+23328]; +ld.shared.v2.f64 {fd243, fd244}, [r11+34992]; +ld.shared.v2.f64 {fd247, fd248}, [r11+46656]; +ld.shared.v2.f64 {fd251, fd252}, [r11+58320]; +ld.shared.v2.f64 {fd255, fd256}, [r11+69984]; +ld.shared.v2.f64 {fd259, fd260}, [r11+81648]; +ld.shared.v2.f64 {fd263, fd264}, [r11+93312]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 
fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0d3FEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0dBFE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0dBFE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0dBFD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0dBFD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 
fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0d3FEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0d3FEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd381, fd350; +mul.f64 fd386, fd382, fd352; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd390, fd366; +mul.f64 fd394, fd392, fd368; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd398, fd341; +mul.f64 fd402, fd400, fd347; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, 
fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd406, fd357; +mul.f64 fd410, fd408, fd363; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+1296]; +mul.f64 fd416, fd412, fd373; +mul.f64 fd417, fd413, fd379; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd421, fd342; +mul.f64 fd425, fd423, fd348; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd429, fd358; +mul.f64 fd433, fd431, fd364; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd437, fd374; +mul.f64 fd441, fd439, fd380; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; +fma.rn.f64 fd445, fd382, fd350, fd387; +sub.f64 fd446, fd385, fd386; +st.shared.v2.f64 [r17+144], {fd446, fd445}; +fma.rn.f64 fd447, fd392, fd366, fd395; +sub.f64 fd448, fd393, fd394; +st.shared.v2.f64 [r17+288], {fd448, fd447}; +fma.rn.f64 fd449, fd400, fd341, fd403; +sub.f64 fd450, fd401, fd402; +st.shared.v2.f64 [r17+432], {fd450, fd449}; +fma.rn.f64 fd451, fd408, fd357, fd411; +sub.f64 fd452, fd409, fd410; +st.shared.v2.f64 [r17+576], {fd452, fd451}; +fma.rn.f64 fd453, fd413, fd373, fd418; +sub.f64 fd454, fd416, fd417; +st.shared.v2.f64 [r17+720], {fd454, fd453}; +fma.rn.f64 fd455, fd423, fd342, fd426; +sub.f64 fd456, fd424, fd425; +st.shared.v2.f64 [r17+864], {fd456, fd455}; +fma.rn.f64 fd457, 
fd431, fd358, fd434; +sub.f64 fd458, fd432, fd433; +st.shared.v2.f64 [r17+1008], {fd458, fd457}; +fma.rn.f64 fd459, fd439, fd374, fd442; +sub.f64 fd460, fd440, fd441; +st.shared.v2.f64 [r17+1152], {fd460, fd459}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+11664]; +ld.shared.v2.f64 {fd469, fd470}, [r11+23328]; +ld.shared.v2.f64 {fd473, fd474}, [r11+34992]; +ld.shared.v2.f64 {fd477, fd478}, [r11+46656]; +ld.shared.v2.f64 {fd481, fd482}, [r11+58320]; +ld.shared.v2.f64 {fd485, fd486}, [r11+69984]; +ld.shared.v2.f64 {fd489, fd490}, [r11+81648]; +ld.shared.v2.f64 {fd493, fd494}, [r11+93312]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0d3FEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0d3FEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0d3FEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0d3FEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, fd482, fd494; +mul.f64 fd536, fd535, 0d3FEBB67AE8584CAA; +add.f64 fd537, fd536, 
fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0dBFE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0dBFE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0dBFEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0dBFEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0dBFEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0dBFEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0dBFD5E3A8748A0BF5; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0dBFD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 0d3FEBB67AE8584CAA; +add.f64 fd571, fd570, fd568; +sub.f64 fd572, fd568, fd570; +mul.f64 fd573, fd566, 0d3FE0000000000000; +sub.f64 fd574, fd500, fd573; +sub.f64 fd575, fd514, fd530; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +sub.f64 fd577, fd574, fd576; +add.f64 fd578, fd576, fd574; +add.f64 fd579, fd547, fd552; +add.f64 fd580, fd505, fd579; +add.f64 fd581, fd549, fd554; +add.f64 fd582, fd511, fd581; +mul.f64 fd583, fd579, 0d3FE0000000000000; +sub.f64 fd584, fd505, fd583; +sub.f64 fd585, fd549, fd554; +mul.f64 fd586, fd585, 0d3FEBB67AE8584CAA; +add.f64 fd587, fd586, fd584; +sub.f64 fd588, fd584, fd586; +mul.f64 fd589, fd581, 0d3FE0000000000000; +sub.f64 fd590, fd511, fd589; +sub.f64 fd591, fd547, fd552; 
+mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +sub.f64 fd593, fd590, fd592; +add.f64 fd594, fd592, fd590; +add.f64 fd595, fd557, fd562; +add.f64 fd596, fd506, fd595; +add.f64 fd597, fd559, fd564; +add.f64 fd598, fd512, fd597; +mul.f64 fd599, fd595, 0d3FE0000000000000; +sub.f64 fd600, fd506, fd599; +sub.f64 fd601, fd559, fd564; +mul.f64 fd602, fd601, 0d3FEBB67AE8584CAA; +add.f64 fd603, fd602, fd600; +sub.f64 fd604, fd600, fd602; +mul.f64 fd605, fd597, 0d3FE0000000000000; +sub.f64 fd606, fd512, fd605; +sub.f64 fd607, fd557, fd562; +mul.f64 fd608, fd607, 0d3FEBB67AE8584CAA; +sub.f64 fd609, fd606, fd608; +add.f64 fd610, fd608, fd606; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd611, fd612}, [rd16]; +mul.f64 fd615, fd611, fd580; +mul.f64 fd616, fd612, fd582; +mul.f64 fd617, fd611, fd582; +mul.f64 fd618, fd611, fd611; +mul.f64 fd619, fd612, fd612; +sub.f64 fd620, fd618, fd619; +mul.f64 fd621, fd612, fd611; +fma.rn.f64 fd622, fd612, fd611, fd621; +mul.f64 fd623, fd620, fd596; +mul.f64 fd624, fd622, fd598; +mul.f64 fd625, fd620, fd598; +mul.f64 fd626, fd611, fd620; +mul.f64 fd627, fd612, fd622; +sub.f64 fd628, fd626, fd627; +mul.f64 fd629, fd611, fd622; +fma.rn.f64 fd630, fd612, fd620, fd629; +mul.f64 fd631, fd628, fd571; +mul.f64 fd632, fd630, fd577; +mul.f64 fd633, fd628, fd577; +mul.f64 fd634, fd611, fd628; +mul.f64 fd635, fd612, fd630; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd611, fd630; +fma.rn.f64 fd638, fd612, fd628, fd637; +mul.f64 fd639, fd636, fd587; +mul.f64 fd640, fd638, fd593; +mul.f64 fd641, fd636, fd593; +ld.global.v2.f64 {fd642, fd643}, [rd16+144]; +mul.f64 fd646, fd642, fd603; +mul.f64 fd647, fd643, fd609; +mul.f64 fd648, fd642, fd609; +mul.f64 fd649, fd611, fd642; +mul.f64 fd650, fd612, fd643; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd611, fd643; +fma.rn.f64 
fd653, fd612, fd642, fd652; +mul.f64 fd654, fd651, fd572; +mul.f64 fd655, fd653, fd578; +mul.f64 fd656, fd651, fd578; +mul.f64 fd657, fd611, fd651; +mul.f64 fd658, fd612, fd653; +sub.f64 fd659, fd657, fd658; +mul.f64 fd660, fd611, fd653; +fma.rn.f64 fd661, fd612, fd651, fd660; +mul.f64 fd662, fd659, fd588; +mul.f64 fd663, fd661, fd594; +mul.f64 fd664, fd659, fd594; +mul.f64 fd665, fd611, fd659; +mul.f64 fd666, fd612, fd661; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd611, fd661; +fma.rn.f64 fd669, fd612, fd659, fd668; +mul.f64 fd670, fd667, fd604; +mul.f64 fd671, fd669, fd610; +mul.f64 fd672, fd667, fd610; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 11664, r22; +add.f64 fd673, fd500, fd566; +add.f64 fd674, fd498, fd565; +st.shared.v2.f64 [r23], {fd674, fd673}; +fma.rn.f64 fd675, fd612, fd580, fd617; +sub.f64 fd676, fd615, fd616; +st.shared.v2.f64 [r23+1296], {fd676, fd675}; +fma.rn.f64 fd677, fd622, fd596, fd625; +sub.f64 fd678, fd623, fd624; +st.shared.v2.f64 [r23+2592], {fd678, fd677}; +fma.rn.f64 fd679, fd630, fd571, fd633; +sub.f64 fd680, fd631, fd632; +st.shared.v2.f64 [r23+3888], {fd680, fd679}; +fma.rn.f64 fd681, fd638, fd587, fd641; +sub.f64 fd682, fd639, fd640; +st.shared.v2.f64 [r23+5184], {fd682, fd681}; +fma.rn.f64 fd683, fd643, fd603, fd648; +sub.f64 fd684, fd646, fd647; +st.shared.v2.f64 [r23+6480], {fd684, fd683}; +fma.rn.f64 fd685, fd653, fd572, fd656; +sub.f64 fd686, fd654, fd655; +st.shared.v2.f64 [r23+7776], {fd686, fd685}; +fma.rn.f64 fd687, fd661, fd588, fd664; +sub.f64 fd688, fd662, fd663; +st.shared.v2.f64 [r23+9072], {fd688, fd687}; +fma.rn.f64 fd689, fd669, fd604, fd672; +sub.f64 fd690, fd670, fd671; +st.shared.v2.f64 [r23+10368], {fd690, fd689}; +barrier.sync 0; +ld.shared.v2.f64 {fd691, fd692}, [r11]; +ld.shared.v2.f64 {fd695, fd696}, [r11+11664]; +ld.shared.v2.f64 {fd699, fd700}, [r11+23328]; +ld.shared.v2.f64 {fd703, fd704}, [r11+34992]; +ld.shared.v2.f64 {fd707, fd708}, [r11+46656]; 
+ld.shared.v2.f64 {fd711, fd712}, [r11+58320]; +ld.shared.v2.f64 {fd715, fd716}, [r11+69984]; +ld.shared.v2.f64 {fd719, fd720}, [r11+81648]; +ld.shared.v2.f64 {fd723, fd724}, [r11+93312]; +add.f64 fd727, fd703, fd715; +add.f64 fd728, fd691, fd727; +add.f64 fd729, fd704, fd716; +add.f64 fd730, fd692, fd729; +mul.f64 fd731, fd727, 0d3FE0000000000000; +sub.f64 fd732, fd691, fd731; +sub.f64 fd733, fd704, fd716; +mul.f64 fd734, fd733, 0d3FEBB67AE8584CAA; +add.f64 fd735, fd734, fd732; +sub.f64 fd736, fd732, fd734; +mul.f64 fd737, fd729, 0d3FE0000000000000; +sub.f64 fd738, fd692, fd737; +sub.f64 fd739, fd703, fd715; +mul.f64 fd740, fd739, 0d3FEBB67AE8584CAA; +sub.f64 fd741, fd738, fd740; +add.f64 fd742, fd740, fd738; +add.f64 fd743, fd707, fd719; +add.f64 fd744, fd695, fd743; +add.f64 fd745, fd708, fd720; +add.f64 fd746, fd696, fd745; +mul.f64 fd747, fd743, 0d3FE0000000000000; +sub.f64 fd748, fd695, fd747; +sub.f64 fd749, fd708, fd720; +mul.f64 fd750, fd749, 0d3FEBB67AE8584CAA; +add.f64 fd751, fd750, fd748; +sub.f64 fd752, fd748, fd750; +mul.f64 fd753, fd745, 0d3FE0000000000000; +sub.f64 fd754, fd696, fd753; +sub.f64 fd755, fd707, fd719; +mul.f64 fd756, fd755, 0d3FEBB67AE8584CAA; +sub.f64 fd757, fd754, fd756; +add.f64 fd758, fd756, fd754; +add.f64 fd759, fd711, fd723; +add.f64 fd760, fd699, fd759; +add.f64 fd761, fd712, fd724; +add.f64 fd762, fd700, fd761; +mul.f64 fd763, fd759, 0d3FE0000000000000; +sub.f64 fd764, fd699, fd763; +sub.f64 fd765, fd712, fd724; +mul.f64 fd766, fd765, 0d3FEBB67AE8584CAA; +add.f64 fd767, fd766, fd764; +sub.f64 fd768, fd764, fd766; +mul.f64 fd769, fd761, 0d3FE0000000000000; +sub.f64 fd770, fd700, fd769; +sub.f64 fd771, fd711, fd723; +mul.f64 fd772, fd771, 0d3FEBB67AE8584CAA; +sub.f64 fd773, fd770, fd772; +add.f64 fd774, fd772, fd770; +mul.f64 fd775, fd751, 0d3FE8836FA2CF5039; +mul.f64 fd776, fd757, 0dBFE491B7523C161D; +sub.f64 fd777, fd775, fd776; +mul.f64 fd778, fd757, 0d3FE8836FA2CF5039; +fma.rn.f64 fd779, fd751, 0dBFE491B7523C161D, fd778; 
+mul.f64 fd780, fd767, 0d3FC63A1A7E0B738A; +mul.f64 fd781, fd773, 0dBFEF838B8C811C17; +sub.f64 fd782, fd780, fd781; +mul.f64 fd783, fd773, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd784, fd767, 0dBFEF838B8C811C17, fd783; +mul.f64 fd785, fd752, 0d3FC63A1A7E0B738A; +mul.f64 fd786, fd758, 0dBFEF838B8C811C17; +sub.f64 fd787, fd785, fd786; +mul.f64 fd788, fd758, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd789, fd752, 0dBFEF838B8C811C17, fd788; +mul.f64 fd790, fd768, 0dBFEE11F642522D1C; +mul.f64 fd791, fd774, 0dBFD5E3A8748A0BF5; +sub.f64 fd792, fd790, fd791; +mul.f64 fd793, fd774, 0dBFEE11F642522D1C; +fma.rn.f64 fd794, fd768, 0dBFD5E3A8748A0BF5, fd793; +add.f64 fd795, fd744, fd760; +add.f64 fd796, fd746, fd762; +mul.f64 fd797, fd795, 0d3FE0000000000000; +sub.f64 fd798, fd728, fd797; +sub.f64 fd799, fd746, fd762; +mul.f64 fd800, fd799, 0d3FEBB67AE8584CAA; +mul.f64 fd801, fd796, 0d3FE0000000000000; +sub.f64 fd802, fd730, fd801; +sub.f64 fd803, fd744, fd760; +mul.f64 fd804, fd803, 0d3FEBB67AE8584CAA; +add.f64 fd805, fd777, fd782; +add.f64 fd806, fd779, fd784; +mul.f64 fd807, fd805, 0d3FE0000000000000; +sub.f64 fd808, fd735, fd807; +sub.f64 fd809, fd779, fd784; +mul.f64 fd810, fd809, 0d3FEBB67AE8584CAA; +mul.f64 fd811, fd806, 0d3FE0000000000000; +sub.f64 fd812, fd741, fd811; +sub.f64 fd813, fd777, fd782; +mul.f64 fd814, fd813, 0d3FEBB67AE8584CAA; +add.f64 fd815, fd787, fd792; +add.f64 fd816, fd789, fd794; +mul.f64 fd817, fd815, 0d3FE0000000000000; +sub.f64 fd818, fd736, fd817; +sub.f64 fd819, fd789, fd794; +mul.f64 fd820, fd819, 0d3FEBB67AE8584CAA; +mul.f64 fd821, fd816, 0d3FE0000000000000; +sub.f64 fd822, fd742, fd821; +sub.f64 fd823, fd787, fd792; +mul.f64 fd824, fd823, 0d3FEBB67AE8584CAA; +add.f64 %1, fd730, fd796; +add.f64 %0, fd728, fd795; +add.f64 %3, fd741, fd806; +add.f64 %2, fd735, fd805; +add.f64 %5, fd742, fd816; +add.f64 %4, fd736, fd815; +sub.f64 %7, fd802, fd804; +add.f64 %6, fd800, fd798; +sub.f64 %9, fd812, fd814; +add.f64 %8, fd810, fd808; +sub.f64 %11, fd822, fd824; +add.f64 
%10, fd820, fd818; +add.f64 %13, fd804, fd802; +sub.f64 %12, fd798, fd800; +add.f64 %15, fd814, fd812; +sub.f64 %14, fd808, fd810; +add.f64 %17, fd824, fd822; +sub.f64 %16, fd818, fd820; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_6561), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<531, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<789>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 52488, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; 
+mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, 
fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; 
+sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+11664]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r8, r5, 52488, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd159; +st.shared.f64 [r9+16], fd169; +st.shared.f64 [r9+24], fd179; +st.shared.f64 [r9+32], fd189; +st.shared.f64 [r9+40], fd198; +st.shared.f64 [r9+48], fd208; +st.shared.f64 [r9+56], fd218; +st.shared.f64 [r9+64], fd228; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; 
+ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+5832]; +ld.shared.f64 fd233, [r11+11664]; +ld.shared.f64 fd234, [r11+17496]; +ld.shared.f64 fd235, [r11+23328]; +ld.shared.f64 fd236, [r11+29160]; +ld.shared.f64 fd237, [r11+34992]; +ld.shared.f64 fd238, [r11+40824]; +ld.shared.f64 fd239, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+5832]; +ld.shared.f64 fd242, [r11+11664]; +ld.shared.f64 fd243, [r11+17496]; +ld.shared.f64 fd244, [r11+23328]; +ld.shared.f64 fd245, [r11+29160]; +ld.shared.f64 fd246, [r11+34992]; +ld.shared.f64 fd247, [r11+40824]; +ld.shared.f64 fd248, [r11+46656]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 
fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0d3FEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0dBFE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0dBFE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0dBFEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0dBFEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0dBFEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0dBFEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0dBFD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0dBFD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0d3FEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0d3FEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; 
+add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd365, fd334; +mul.f64 fd370, fd366, fd336; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, fd365, fd336; +fma.rn.f64 fd373, fd366, fd334, fd372; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd376, fd350; +mul.f64 fd380, fd378, fd352; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, fd352; +fma.rn.f64 fd383, fd378, fd350, fd382; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd386, fd325; +mul.f64 fd390, fd388, fd331; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd386, fd331; +fma.rn.f64 fd393, 
fd388, fd325, fd392; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd396, fd341; +mul.f64 fd400, fd398, fd347; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd396, fd347; +fma.rn.f64 fd403, fd398, fd341, fd402; +ld.global.v2.f64 {fd404, fd405}, [rd11+1296]; +mul.f64 fd408, fd404, fd357; +mul.f64 fd409, fd405, fd363; +sub.f64 fd410, fd408, fd409; +mul.f64 fd411, fd404, fd363; +fma.rn.f64 fd412, fd405, fd357, fd411; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd415, fd326; +mul.f64 fd419, fd417, fd332; +sub.f64 fd420, fd418, fd419; +mul.f64 fd421, fd415, fd332; +fma.rn.f64 fd422, fd417, fd326, fd421; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd425, fd342; +mul.f64 fd429, fd427, fd348; +sub.f64 fd430, fd428, fd429; +mul.f64 fd431, fd425, fd348; +fma.rn.f64 fd432, fd427, fd342, fd431; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd435, fd358; +mul.f64 fd439, fd437, fd364; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd435, fd364; +fma.rn.f64 fd442, fd437, fd358, fd441; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd371; +st.shared.f64 [r17+144], fd381; +st.shared.f64 [r17+216], fd391; +st.shared.f64 [r17+288], fd401; +st.shared.f64 [r17+360], fd410; +st.shared.f64 [r17+432], fd420; +st.shared.f64 [r17+504], fd430; +st.shared.f64 [r17+576], fd440; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+5832]; +ld.shared.f64 fd445, 
[r11+11664]; +ld.shared.f64 fd446, [r11+17496]; +ld.shared.f64 fd447, [r11+23328]; +ld.shared.f64 fd448, [r11+29160]; +ld.shared.f64 fd449, [r11+34992]; +ld.shared.f64 fd450, [r11+40824]; +ld.shared.f64 fd451, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+5832]; +ld.shared.f64 fd454, [r11+11664]; +ld.shared.f64 fd455, [r11+17496]; +ld.shared.f64 fd456, [r11+23328]; +ld.shared.f64 fd457, [r11+29160]; +ld.shared.f64 fd458, [r11+34992]; +ld.shared.f64 fd459, [r11+40824]; +ld.shared.f64 fd460, [r11+46656]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0d3FEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, fd449; +mul.f64 fd474, fd473, 0d3FEBB67AE8584CAA; +sub.f64 fd475, fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; +mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0d3FEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0d3FEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, 
fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0d3FEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0d3FEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0dBFE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0dBFE491B7523C161D, fd512; +mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0dBFEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0dBFEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0dBFEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0dBFEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0dBFD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 0dBFD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 fd530, fd462, fd529; +add.f64 fd531, fd480, fd496; +add.f64 fd532, fd464, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd462, fd533; +sub.f64 fd535, fd480, fd496; +mul.f64 fd536, fd535, 0d3FEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd464, fd539; +sub.f64 fd541, fd478, fd494; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +add.f64 fd545, fd511, fd516; +add.f64 fd546, fd469, fd545; +add.f64 fd547, fd513, fd518; +add.f64 fd548, fd475, fd547; +mul.f64 fd549, fd545, 0d3FE0000000000000; 
+sub.f64 fd550, fd469, fd549; +sub.f64 fd551, fd513, fd518; +mul.f64 fd552, fd551, 0d3FEBB67AE8584CAA; +add.f64 fd553, fd552, fd550; +sub.f64 fd554, fd550, fd552; +mul.f64 fd555, fd547, 0d3FE0000000000000; +sub.f64 fd556, fd475, fd555; +sub.f64 fd557, fd511, fd516; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +sub.f64 fd559, fd556, fd558; +add.f64 fd560, fd558, fd556; +add.f64 fd561, fd521, fd526; +add.f64 fd562, fd470, fd561; +add.f64 fd563, fd523, fd528; +add.f64 fd564, fd476, fd563; +mul.f64 fd565, fd561, 0d3FE0000000000000; +sub.f64 fd566, fd470, fd565; +sub.f64 fd567, fd523, fd528; +mul.f64 fd568, fd567, 0d3FEBB67AE8584CAA; +add.f64 fd569, fd568, fd566; +sub.f64 fd570, fd566, fd568; +mul.f64 fd571, fd563, 0d3FE0000000000000; +sub.f64 fd572, fd476, fd571; +sub.f64 fd573, fd521, fd526; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +sub.f64 fd575, fd572, fd574; +add.f64 fd576, fd574, fd572; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd577, fd578}, [rd16]; +mul.f64 fd581, fd577, fd546; +mul.f64 fd582, fd578, fd548; +sub.f64 fd583, fd581, fd582; +mul.f64 fd584, fd577, fd548; +fma.rn.f64 fd585, fd578, fd546, fd584; +mul.f64 fd586, fd577, fd577; +mul.f64 fd587, fd578, fd578; +sub.f64 fd588, fd586, fd587; +mul.f64 fd589, fd578, fd577; +fma.rn.f64 fd590, fd578, fd577, fd589; +mul.f64 fd591, fd588, fd562; +mul.f64 fd592, fd590, fd564; +sub.f64 fd593, fd591, fd592; +mul.f64 fd594, fd588, fd564; +fma.rn.f64 fd595, fd590, fd562, fd594; +mul.f64 fd596, fd577, fd588; +mul.f64 fd597, fd578, fd590; +sub.f64 fd598, fd596, fd597; +mul.f64 fd599, fd577, fd590; +fma.rn.f64 fd600, fd578, fd588, fd599; +mul.f64 fd601, fd598, fd537; +mul.f64 fd602, fd600, fd543; +sub.f64 fd603, fd601, fd602; +mul.f64 fd604, fd598, fd543; +fma.rn.f64 fd605, fd600, fd537, fd604; +mul.f64 fd606, fd577, fd598; +mul.f64 fd607, 
fd578, fd600; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd577, fd600; +fma.rn.f64 fd610, fd578, fd598, fd609; +mul.f64 fd611, fd608, fd553; +mul.f64 fd612, fd610, fd559; +sub.f64 fd613, fd611, fd612; +mul.f64 fd614, fd608, fd559; +fma.rn.f64 fd615, fd610, fd553, fd614; +ld.global.v2.f64 {fd616, fd617}, [rd16+144]; +mul.f64 fd620, fd616, fd569; +mul.f64 fd621, fd617, fd575; +sub.f64 fd622, fd620, fd621; +mul.f64 fd623, fd616, fd575; +fma.rn.f64 fd624, fd617, fd569, fd623; +mul.f64 fd625, fd577, fd616; +mul.f64 fd626, fd578, fd617; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd577, fd617; +fma.rn.f64 fd629, fd578, fd616, fd628; +mul.f64 fd630, fd627, fd538; +mul.f64 fd631, fd629, fd544; +sub.f64 fd632, fd630, fd631; +mul.f64 fd633, fd627, fd544; +fma.rn.f64 fd634, fd629, fd538, fd633; +mul.f64 fd635, fd577, fd627; +mul.f64 fd636, fd578, fd629; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd577, fd629; +fma.rn.f64 fd639, fd578, fd627, fd638; +mul.f64 fd640, fd637, fd554; +mul.f64 fd641, fd639, fd560; +sub.f64 fd642, fd640, fd641; +mul.f64 fd643, fd637, fd560; +fma.rn.f64 fd644, fd639, fd554, fd643; +mul.f64 fd645, fd577, fd637; +mul.f64 fd646, fd578, fd639; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd577, fd639; +fma.rn.f64 fd649, fd578, fd637, fd648; +mul.f64 fd650, fd647, fd570; +mul.f64 fd651, fd649, fd576; +sub.f64 fd652, fd650, fd651; +mul.f64 fd653, fd647, fd576; +fma.rn.f64 fd654, fd649, fd570, fd653; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +st.shared.f64 [r23], fd530; +st.shared.f64 [r23+648], fd583; +st.shared.f64 [r23+1296], fd593; +st.shared.f64 [r23+1944], fd603; +st.shared.f64 [r23+2592], fd613; +st.shared.f64 [r23+3240], fd622; +st.shared.f64 [r23+3888], fd632; +st.shared.f64 [r23+4536], fd642; +st.shared.f64 [r23+5184], fd652; +barrier.sync 0; +ld.shared.f64 fd655, [r11]; +ld.shared.f64 fd656, [r11+5832]; +ld.shared.f64 fd657, [r11+11664]; +ld.shared.f64 fd658, [r11+17496]; +ld.shared.f64 
fd659, [r11+23328]; +ld.shared.f64 fd660, [r11+29160]; +ld.shared.f64 fd661, [r11+34992]; +ld.shared.f64 fd662, [r11+40824]; +ld.shared.f64 fd663, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r23], fd532; +st.shared.f64 [r23+648], fd585; +st.shared.f64 [r23+1296], fd595; +st.shared.f64 [r23+1944], fd605; +st.shared.f64 [r23+2592], fd615; +st.shared.f64 [r23+3240], fd624; +st.shared.f64 [r23+3888], fd634; +st.shared.f64 [r23+4536], fd644; +st.shared.f64 [r23+5184], fd654; +barrier.sync 0; +ld.shared.f64 fd664, [r11]; +ld.shared.f64 fd665, [r11+5832]; +ld.shared.f64 fd666, [r11+11664]; +ld.shared.f64 fd667, [r11+17496]; +ld.shared.f64 fd668, [r11+23328]; +ld.shared.f64 fd669, [r11+29160]; +ld.shared.f64 fd670, [r11+34992]; +ld.shared.f64 fd671, [r11+40824]; +ld.shared.f64 fd672, [r11+46656]; +add.f64 fd673, fd658, fd661; +add.f64 fd674, fd655, fd673; +add.f64 fd675, fd667, fd670; +add.f64 fd676, fd664, fd675; +mul.f64 fd677, fd673, 0d3FE0000000000000; +sub.f64 fd678, fd655, fd677; +sub.f64 fd679, fd667, fd670; +mul.f64 fd680, fd679, 0d3FEBB67AE8584CAA; +add.f64 fd681, fd680, fd678; +sub.f64 fd682, fd678, fd680; +mul.f64 fd683, fd675, 0d3FE0000000000000; +sub.f64 fd684, fd664, fd683; +sub.f64 fd685, fd658, fd661; +mul.f64 fd686, fd685, 0d3FEBB67AE8584CAA; +sub.f64 fd687, fd684, fd686; +add.f64 fd688, fd686, fd684; +add.f64 fd689, fd659, fd662; +add.f64 fd690, fd656, fd689; +add.f64 fd691, fd668, fd671; +add.f64 fd692, fd665, fd691; +mul.f64 fd693, fd689, 0d3FE0000000000000; +sub.f64 fd694, fd656, fd693; +sub.f64 fd695, fd668, fd671; +mul.f64 fd696, fd695, 0d3FEBB67AE8584CAA; +add.f64 fd697, fd696, fd694; +sub.f64 fd698, fd694, fd696; +mul.f64 fd699, fd691, 0d3FE0000000000000; +sub.f64 fd700, fd665, fd699; +sub.f64 fd701, fd659, fd662; +mul.f64 fd702, fd701, 0d3FEBB67AE8584CAA; +sub.f64 fd703, fd700, fd702; +add.f64 fd704, fd702, fd700; +add.f64 fd705, fd660, fd663; +add.f64 fd706, fd657, fd705; +add.f64 fd707, fd669, fd672; +add.f64 fd708, fd666, fd707; +mul.f64 
fd709, fd705, 0d3FE0000000000000; +sub.f64 fd710, fd657, fd709; +sub.f64 fd711, fd669, fd672; +mul.f64 fd712, fd711, 0d3FEBB67AE8584CAA; +add.f64 fd713, fd712, fd710; +sub.f64 fd714, fd710, fd712; +mul.f64 fd715, fd707, 0d3FE0000000000000; +sub.f64 fd716, fd666, fd715; +sub.f64 fd717, fd660, fd663; +mul.f64 fd718, fd717, 0d3FEBB67AE8584CAA; +sub.f64 fd719, fd716, fd718; +add.f64 fd720, fd718, fd716; +mul.f64 fd721, fd697, 0d3FE8836FA2CF5039; +mul.f64 fd722, fd703, 0dBFE491B7523C161D; +sub.f64 fd723, fd721, fd722; +mul.f64 fd724, fd703, 0d3FE8836FA2CF5039; +fma.rn.f64 fd725, fd697, 0dBFE491B7523C161D, fd724; +mul.f64 fd726, fd713, 0d3FC63A1A7E0B738A; +mul.f64 fd727, fd719, 0dBFEF838B8C811C17; +sub.f64 fd728, fd726, fd727; +mul.f64 fd729, fd719, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd730, fd713, 0dBFEF838B8C811C17, fd729; +mul.f64 fd731, fd698, 0d3FC63A1A7E0B738A; +mul.f64 fd732, fd704, 0dBFEF838B8C811C17; +sub.f64 fd733, fd731, fd732; +mul.f64 fd734, fd704, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd735, fd698, 0dBFEF838B8C811C17, fd734; +mul.f64 fd736, fd714, 0dBFEE11F642522D1C; +mul.f64 fd737, fd720, 0dBFD5E3A8748A0BF5; +sub.f64 fd738, fd736, fd737; +mul.f64 fd739, fd720, 0dBFEE11F642522D1C; +fma.rn.f64 fd740, fd714, 0dBFD5E3A8748A0BF5, fd739; +add.f64 fd741, fd690, fd706; +add.f64 fd742, fd692, fd708; +mul.f64 fd743, fd741, 0d3FE0000000000000; +sub.f64 fd744, fd674, fd743; +sub.f64 fd745, fd692, fd708; +mul.f64 fd746, fd745, 0d3FEBB67AE8584CAA; +mul.f64 fd747, fd742, 0d3FE0000000000000; +sub.f64 fd748, fd676, fd747; +sub.f64 fd749, fd690, fd706; +mul.f64 fd750, fd749, 0d3FEBB67AE8584CAA; +add.f64 fd751, fd723, fd728; +add.f64 fd752, fd725, fd730; +mul.f64 fd753, fd751, 0d3FE0000000000000; +sub.f64 fd754, fd681, fd753; +sub.f64 fd755, fd725, fd730; +mul.f64 fd756, fd755, 0d3FEBB67AE8584CAA; +mul.f64 fd757, fd752, 0d3FE0000000000000; +sub.f64 fd758, fd687, fd757; +sub.f64 fd759, fd723, fd728; +mul.f64 fd760, fd759, 0d3FEBB67AE8584CAA; +add.f64 fd761, fd733, fd738; +add.f64 
fd762, fd735, fd740; +mul.f64 fd763, fd761, 0d3FE0000000000000; +sub.f64 fd764, fd682, fd763; +sub.f64 fd765, fd735, fd740; +mul.f64 fd766, fd765, 0d3FEBB67AE8584CAA; +mul.f64 fd767, fd762, 0d3FE0000000000000; +sub.f64 fd768, fd688, fd767; +sub.f64 fd769, fd733, fd738; +mul.f64 fd770, fd769, 0d3FEBB67AE8584CAA; +add.f64 %0, fd674, fd741; +add.f64 %1, fd676, fd742; +add.f64 %3, fd687, fd752; +add.f64 %2, fd681, fd751; +add.f64 %5, fd688, fd762; +add.f64 %4, fd682, fd761; +add.f64 %6, fd746, fd744; +sub.f64 %7, fd748, fd750; +sub.f64 %9, fd758, fd760; +add.f64 %8, fd756, fd754; +sub.f64 %11, fd768, fd770; +add.f64 %10, fd766, fd764; +sub.f64 %12, fd744, fd746; +add.f64 %13, fd750, fd748; +add.f64 %15, fd760, fd758; +sub.f64 %14, fd754, fd756; +add.f64 %17, fd770, fd768; +sub.f64 %16, fd764, fd766; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_6561), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..9e0d0f600dbf4 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6561_fp64_inv.hpp.inc @@ -0,0 +1,1646 @@ +#ifndef CUFFTDX_FFT_6561_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_6561_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1171, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<843>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 104976, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; +add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 
0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 
fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 104976, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+11664]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, 
fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; +st.shared.v2.f64 [r9+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, fd165, fd164; +st.shared.v2.f64 [r9+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r9+48], {fd220, fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; +sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r9+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r9+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r9+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r9+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r9+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+11664]; +ld.shared.v2.f64 {fd239, fd240}, [r11+23328]; +ld.shared.v2.f64 {fd243, fd244}, [r11+34992]; +ld.shared.v2.f64 {fd247, fd248}, [r11+46656]; +ld.shared.v2.f64 {fd251, fd252}, [r11+58320]; +ld.shared.v2.f64 {fd255, fd256}, [r11+69984]; +ld.shared.v2.f64 {fd259, fd260}, [r11+81648]; +ld.shared.v2.f64 {fd263, fd264}, [r11+93312]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 
fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0dBFEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0dBFEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0d3FE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0d3FE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 
0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0d3FD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0d3FD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0dBFEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0dBFEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; 
+add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd352, fd382; +mul.f64 fd386, fd350, fd382; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd368, fd392; +mul.f64 fd394, fd366, fd392; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd347, fd400; +mul.f64 fd402, fd341, fd400; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd363, fd408; +mul.f64 fd410, fd357, fd408; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+1296]; +mul.f64 fd416, fd379, fd413; +mul.f64 fd417, fd373, fd413; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd348, fd423; +mul.f64 fd425, fd342, fd423; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd364, fd431; +mul.f64 fd433, fd358, fd431; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd380, fd439; +mul.f64 fd441, fd374, fd439; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; 
+fma.rn.f64 fd445, fd381, fd350, fd385; +sub.f64 fd446, fd387, fd386; +st.shared.v2.f64 [r17+144], {fd445, fd446}; +fma.rn.f64 fd447, fd390, fd366, fd393; +sub.f64 fd448, fd395, fd394; +st.shared.v2.f64 [r17+288], {fd447, fd448}; +fma.rn.f64 fd449, fd398, fd341, fd401; +sub.f64 fd450, fd403, fd402; +st.shared.v2.f64 [r17+432], {fd449, fd450}; +fma.rn.f64 fd451, fd406, fd357, fd409; +sub.f64 fd452, fd411, fd410; +st.shared.v2.f64 [r17+576], {fd451, fd452}; +fma.rn.f64 fd453, fd412, fd373, fd416; +sub.f64 fd454, fd418, fd417; +st.shared.v2.f64 [r17+720], {fd453, fd454}; +fma.rn.f64 fd455, fd421, fd342, fd424; +sub.f64 fd456, fd426, fd425; +st.shared.v2.f64 [r17+864], {fd455, fd456}; +fma.rn.f64 fd457, fd429, fd358, fd432; +sub.f64 fd458, fd434, fd433; +st.shared.v2.f64 [r17+1008], {fd457, fd458}; +fma.rn.f64 fd459, fd437, fd374, fd440; +sub.f64 fd460, fd442, fd441; +st.shared.v2.f64 [r17+1152], {fd459, fd460}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+11664]; +ld.shared.v2.f64 {fd469, fd470}, [r11+23328]; +ld.shared.v2.f64 {fd473, fd474}, [r11+34992]; +ld.shared.v2.f64 {fd477, fd478}, [r11+46656]; +ld.shared.v2.f64 {fd481, fd482}, [r11+58320]; +ld.shared.v2.f64 {fd485, fd486}, [r11+69984]; +ld.shared.v2.f64 {fd489, fd490}, [r11+81648]; +ld.shared.v2.f64 {fd493, fd494}, [r11+93312]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0dBFEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0dBFEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, 
fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0dBFEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0dBFEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, fd482, fd494; +mul.f64 fd536, fd535, 0dBFEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0d3FE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0d3FE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0d3FEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0d3FEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0d3FEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0d3FEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0d3FD5E3A8748A0BF5; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0d3FD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 
0dBFEBB67AE8584CAA; +add.f64 fd571, fd570, fd568; +sub.f64 fd572, fd568, fd570; +mul.f64 fd573, fd566, 0d3FE0000000000000; +sub.f64 fd574, fd500, fd573; +sub.f64 fd575, fd514, fd530; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +sub.f64 fd577, fd574, fd576; +add.f64 fd578, fd576, fd574; +add.f64 fd579, fd547, fd552; +add.f64 fd580, fd505, fd579; +add.f64 fd581, fd549, fd554; +add.f64 fd582, fd511, fd581; +mul.f64 fd583, fd579, 0d3FE0000000000000; +sub.f64 fd584, fd505, fd583; +sub.f64 fd585, fd549, fd554; +mul.f64 fd586, fd585, 0dBFEBB67AE8584CAA; +add.f64 fd587, fd586, fd584; +sub.f64 fd588, fd584, fd586; +mul.f64 fd589, fd581, 0d3FE0000000000000; +sub.f64 fd590, fd511, fd589; +sub.f64 fd591, fd547, fd552; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +sub.f64 fd593, fd590, fd592; +add.f64 fd594, fd592, fd590; +add.f64 fd595, fd557, fd562; +add.f64 fd596, fd506, fd595; +add.f64 fd597, fd559, fd564; +add.f64 fd598, fd512, fd597; +mul.f64 fd599, fd595, 0d3FE0000000000000; +sub.f64 fd600, fd506, fd599; +sub.f64 fd601, fd559, fd564; +mul.f64 fd602, fd601, 0dBFEBB67AE8584CAA; +add.f64 fd603, fd602, fd600; +sub.f64 fd604, fd600, fd602; +mul.f64 fd605, fd597, 0d3FE0000000000000; +sub.f64 fd606, fd512, fd605; +sub.f64 fd607, fd557, fd562; +mul.f64 fd608, fd607, 0dBFEBB67AE8584CAA; +sub.f64 fd609, fd606, fd608; +add.f64 fd610, fd608, fd606; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd611, fd612}, [rd16]; +mul.f64 fd615, fd582, fd612; +mul.f64 fd616, fd580, fd612; +mul.f64 fd617, fd611, fd582; +mul.f64 fd618, fd611, fd611; +mul.f64 fd619, fd612, fd612; +sub.f64 fd620, fd618, fd619; +mul.f64 fd621, fd612, fd611; +fma.rn.f64 fd622, fd612, fd611, fd621; +mul.f64 fd623, fd598, fd622; +mul.f64 fd624, fd596, fd622; +mul.f64 fd625, fd620, fd598; +mul.f64 fd626, fd611, fd620; +mul.f64 fd627, fd612, 
fd622; +sub.f64 fd628, fd626, fd627; +mul.f64 fd629, fd611, fd622; +fma.rn.f64 fd630, fd612, fd620, fd629; +mul.f64 fd631, fd577, fd630; +mul.f64 fd632, fd571, fd630; +mul.f64 fd633, fd628, fd577; +mul.f64 fd634, fd611, fd628; +mul.f64 fd635, fd612, fd630; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd611, fd630; +fma.rn.f64 fd638, fd612, fd628, fd637; +mul.f64 fd639, fd593, fd638; +mul.f64 fd640, fd587, fd638; +mul.f64 fd641, fd636, fd593; +ld.global.v2.f64 {fd642, fd643}, [rd16+144]; +mul.f64 fd646, fd609, fd643; +mul.f64 fd647, fd603, fd643; +mul.f64 fd648, fd642, fd609; +mul.f64 fd649, fd611, fd642; +mul.f64 fd650, fd612, fd643; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd611, fd643; +fma.rn.f64 fd653, fd612, fd642, fd652; +mul.f64 fd654, fd578, fd653; +mul.f64 fd655, fd572, fd653; +mul.f64 fd656, fd651, fd578; +mul.f64 fd657, fd611, fd651; +mul.f64 fd658, fd612, fd653; +sub.f64 fd659, fd657, fd658; +mul.f64 fd660, fd611, fd653; +fma.rn.f64 fd661, fd612, fd651, fd660; +mul.f64 fd662, fd594, fd661; +mul.f64 fd663, fd588, fd661; +mul.f64 fd664, fd659, fd594; +mul.f64 fd665, fd611, fd659; +mul.f64 fd666, fd612, fd661; +sub.f64 fd667, fd665, fd666; +mul.f64 fd668, fd611, fd661; +fma.rn.f64 fd669, fd612, fd659, fd668; +mul.f64 fd670, fd610, fd669; +mul.f64 fd671, fd604, fd669; +mul.f64 fd672, fd667, fd610; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 11664, r22; +add.f64 fd673, fd500, fd566; +add.f64 fd674, fd498, fd565; +st.shared.v2.f64 [r23], {fd674, fd673}; +fma.rn.f64 fd675, fd611, fd580, fd615; +sub.f64 fd676, fd617, fd616; +st.shared.v2.f64 [r23+1296], {fd675, fd676}; +fma.rn.f64 fd677, fd620, fd596, fd623; +sub.f64 fd678, fd625, fd624; +st.shared.v2.f64 [r23+2592], {fd677, fd678}; +fma.rn.f64 fd679, fd628, fd571, fd631; +sub.f64 fd680, fd633, fd632; +st.shared.v2.f64 [r23+3888], {fd679, fd680}; +fma.rn.f64 fd681, fd636, fd587, fd639; +sub.f64 fd682, fd641, fd640; +st.shared.v2.f64 [r23+5184], {fd681, fd682}; 
+fma.rn.f64 fd683, fd642, fd603, fd646; +sub.f64 fd684, fd648, fd647; +st.shared.v2.f64 [r23+6480], {fd683, fd684}; +fma.rn.f64 fd685, fd651, fd572, fd654; +sub.f64 fd686, fd656, fd655; +st.shared.v2.f64 [r23+7776], {fd685, fd686}; +fma.rn.f64 fd687, fd659, fd588, fd662; +sub.f64 fd688, fd664, fd663; +st.shared.v2.f64 [r23+9072], {fd687, fd688}; +fma.rn.f64 fd689, fd667, fd604, fd670; +sub.f64 fd690, fd672, fd671; +st.shared.v2.f64 [r23+10368], {fd689, fd690}; +barrier.sync 0; +ld.shared.v2.f64 {fd691, fd692}, [r11]; +ld.shared.v2.f64 {fd695, fd696}, [r11+11664]; +ld.shared.v2.f64 {fd699, fd700}, [r11+23328]; +ld.shared.v2.f64 {fd703, fd704}, [r11+34992]; +ld.shared.v2.f64 {fd707, fd708}, [r11+46656]; +ld.shared.v2.f64 {fd711, fd712}, [r11+58320]; +ld.shared.v2.f64 {fd715, fd716}, [r11+69984]; +ld.shared.v2.f64 {fd719, fd720}, [r11+81648]; +ld.shared.v2.f64 {fd723, fd724}, [r11+93312]; +add.f64 fd727, fd703, fd715; +add.f64 fd728, fd691, fd727; +add.f64 fd729, fd704, fd716; +add.f64 fd730, fd692, fd729; +mul.f64 fd731, fd727, 0d3FE0000000000000; +sub.f64 fd732, fd691, fd731; +sub.f64 fd733, fd704, fd716; +mul.f64 fd734, fd733, 0dBFEBB67AE8584CAA; +add.f64 fd735, fd734, fd732; +sub.f64 fd736, fd732, fd734; +mul.f64 fd737, fd729, 0d3FE0000000000000; +sub.f64 fd738, fd692, fd737; +sub.f64 fd739, fd703, fd715; +mul.f64 fd740, fd739, 0dBFEBB67AE8584CAA; +sub.f64 fd741, fd738, fd740; +add.f64 fd742, fd740, fd738; +add.f64 fd743, fd707, fd719; +add.f64 fd744, fd695, fd743; +add.f64 fd745, fd708, fd720; +add.f64 fd746, fd696, fd745; +mul.f64 fd747, fd743, 0d3FE0000000000000; +sub.f64 fd748, fd695, fd747; +sub.f64 fd749, fd708, fd720; +mul.f64 fd750, fd749, 0dBFEBB67AE8584CAA; +add.f64 fd751, fd750, fd748; +sub.f64 fd752, fd748, fd750; +mul.f64 fd753, fd745, 0d3FE0000000000000; +sub.f64 fd754, fd696, fd753; +sub.f64 fd755, fd707, fd719; +mul.f64 fd756, fd755, 0dBFEBB67AE8584CAA; +sub.f64 fd757, fd754, fd756; +add.f64 fd758, fd756, fd754; +add.f64 fd759, fd711, fd723; 
+add.f64 fd760, fd699, fd759; +add.f64 fd761, fd712, fd724; +add.f64 fd762, fd700, fd761; +mul.f64 fd763, fd759, 0d3FE0000000000000; +sub.f64 fd764, fd699, fd763; +sub.f64 fd765, fd712, fd724; +mul.f64 fd766, fd765, 0dBFEBB67AE8584CAA; +add.f64 fd767, fd766, fd764; +sub.f64 fd768, fd764, fd766; +mul.f64 fd769, fd761, 0d3FE0000000000000; +sub.f64 fd770, fd700, fd769; +sub.f64 fd771, fd711, fd723; +mul.f64 fd772, fd771, 0dBFEBB67AE8584CAA; +sub.f64 fd773, fd770, fd772; +add.f64 fd774, fd772, fd770; +mul.f64 fd775, fd751, 0d3FE8836FA2CF5039; +mul.f64 fd776, fd757, 0d3FE491B7523C161D; +sub.f64 fd777, fd775, fd776; +mul.f64 fd778, fd757, 0d3FE8836FA2CF5039; +fma.rn.f64 fd779, fd751, 0d3FE491B7523C161D, fd778; +mul.f64 fd780, fd767, 0d3FC63A1A7E0B738A; +mul.f64 fd781, fd773, 0d3FEF838B8C811C17; +sub.f64 fd782, fd780, fd781; +mul.f64 fd783, fd773, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd784, fd767, 0d3FEF838B8C811C17, fd783; +mul.f64 fd785, fd752, 0d3FC63A1A7E0B738A; +mul.f64 fd786, fd758, 0d3FEF838B8C811C17; +sub.f64 fd787, fd785, fd786; +mul.f64 fd788, fd758, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd789, fd752, 0d3FEF838B8C811C17, fd788; +mul.f64 fd790, fd768, 0dBFEE11F642522D1C; +mul.f64 fd791, fd774, 0d3FD5E3A8748A0BF5; +sub.f64 fd792, fd790, fd791; +mul.f64 fd793, fd774, 0dBFEE11F642522D1C; +fma.rn.f64 fd794, fd768, 0d3FD5E3A8748A0BF5, fd793; +add.f64 fd795, fd744, fd760; +add.f64 fd796, fd746, fd762; +mul.f64 fd797, fd795, 0d3FE0000000000000; +sub.f64 fd798, fd728, fd797; +sub.f64 fd799, fd746, fd762; +mul.f64 fd800, fd799, 0dBFEBB67AE8584CAA; +mul.f64 fd801, fd796, 0d3FE0000000000000; +sub.f64 fd802, fd730, fd801; +sub.f64 fd803, fd744, fd760; +mul.f64 fd804, fd803, 0dBFEBB67AE8584CAA; +add.f64 fd805, fd777, fd782; +add.f64 fd806, fd779, fd784; +mul.f64 fd807, fd805, 0d3FE0000000000000; +sub.f64 fd808, fd735, fd807; +sub.f64 fd809, fd779, fd784; +mul.f64 fd810, fd809, 0dBFEBB67AE8584CAA; +mul.f64 fd811, fd806, 0d3FE0000000000000; +sub.f64 fd812, fd741, fd811; +sub.f64 fd813, 
fd777, fd782; +mul.f64 fd814, fd813, 0dBFEBB67AE8584CAA; +add.f64 fd815, fd787, fd792; +add.f64 fd816, fd789, fd794; +mul.f64 fd817, fd815, 0d3FE0000000000000; +sub.f64 fd818, fd736, fd817; +sub.f64 fd819, fd789, fd794; +mul.f64 fd820, fd819, 0dBFEBB67AE8584CAA; +mul.f64 fd821, fd816, 0d3FE0000000000000; +sub.f64 fd822, fd742, fd821; +sub.f64 fd823, fd787, fd792; +mul.f64 fd824, fd823, 0dBFEBB67AE8584CAA; +add.f64 %1, fd730, fd796; +add.f64 %0, fd728, fd795; +add.f64 %3, fd741, fd806; +add.f64 %2, fd735, fd805; +add.f64 %5, fd742, fd816; +add.f64 %4, fd736, fd815; +sub.f64 %7, fd802, fd804; +add.f64 %6, fd800, fd798; +sub.f64 %9, fd812, fd814; +add.f64 %8, fd810, fd808; +sub.f64 %11, fd822, fd824; +add.f64 %10, fd820, fd818; +add.f64 %13, fd804, fd802; +sub.f64 %12, fd798, fd800; +add.f64 %15, fd814, fd812; +sub.f64 %14, fd808, fd810; +add.f64 %17, fd824, fd822; +sub.f64 %16, fd818, fd820; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_6561), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<702, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<24>; +.reg .f64 fd<789>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 52488, r2; +add.f64 fd37, %30, %38; +add.f64 fd38, %22, fd37; 
+add.f64 fd39, %31, %39; +add.f64 fd40, %23, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %22, fd41; +sub.f64 fd43, %31, %39; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %23, fd47; +sub.f64 fd49, %30, %38; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %32, %40; +add.f64 fd54, %24, fd53; +add.f64 fd55, %34, %42; +add.f64 fd56, %26, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %24, fd57; +sub.f64 fd59, %34, %42; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %26, fd63; +sub.f64 fd65, %32, %40; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %35, %43; +add.f64 fd70, %27, fd69; +add.f64 fd71, %37, %44; +add.f64 fd72, %29, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %27, fd73; +sub.f64 fd75, %37, %44; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %29, fd79; +sub.f64 fd81, %35, %43; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; 
+mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 1508246403; +shr.u64 rd3, rd2, 40; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 729; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, 
fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 fd159, fd122, fd154; +mul.f64 fd160, fd153, fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+11664]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 fd220, fd219, fd218; +mul.f64 fd221, 
fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r8, r5, 52488, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd158; +st.shared.f64 [r9+16], fd168; +st.shared.f64 [r9+24], fd178; +st.shared.f64 [r9+32], fd188; +st.shared.f64 [r9+40], fd197; +st.shared.f64 [r9+48], fd207; +st.shared.f64 [r9+56], fd217; +st.shared.f64 [r9+64], fd227; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+5832]; +ld.shared.f64 fd233, [r11+11664]; +ld.shared.f64 fd234, [r11+17496]; +ld.shared.f64 fd235, [r11+23328]; +ld.shared.f64 fd236, [r11+29160]; +ld.shared.f64 fd237, [r11+34992]; +ld.shared.f64 fd238, [r11+40824]; +ld.shared.f64 fd239, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+5832]; +ld.shared.f64 fd242, [r11+11664]; +ld.shared.f64 fd243, [r11+17496]; +ld.shared.f64 fd244, [r11+23328]; +ld.shared.f64 fd245, [r11+29160]; +ld.shared.f64 fd246, [r11+34992]; +ld.shared.f64 fd247, [r11+40824]; +ld.shared.f64 fd248, [r11+46656]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 
0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0dBFEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0d3FE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0d3FE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0d3FEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0d3FEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0d3FEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0d3FEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0d3FD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; 
+mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0d3FD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0dBFEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0dBFEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd336, fd366; +fma.rn.f64 fd370, fd365, fd334, 
fd369; +mul.f64 fd371, fd334, fd366; +mul.f64 fd372, fd365, fd336; +sub.f64 fd373, fd372, fd371; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd352, fd378; +fma.rn.f64 fd380, fd376, fd350, fd379; +mul.f64 fd381, fd350, fd378; +mul.f64 fd382, fd376, fd352; +sub.f64 fd383, fd382, fd381; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd331, fd388; +fma.rn.f64 fd390, fd386, fd325, fd389; +mul.f64 fd391, fd325, fd388; +mul.f64 fd392, fd386, fd331; +sub.f64 fd393, fd392, fd391; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd347, fd398; +fma.rn.f64 fd400, fd396, fd341, fd399; +mul.f64 fd401, fd341, fd398; +mul.f64 fd402, fd396, fd347; +sub.f64 fd403, fd402, fd401; +ld.global.v2.f64 {fd404, fd405}, [rd11+1296]; +mul.f64 fd408, fd363, fd405; +fma.rn.f64 fd409, fd404, fd357, fd408; +mul.f64 fd410, fd357, fd405; +mul.f64 fd411, fd404, fd363; +sub.f64 fd412, fd411, fd410; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd332, fd417; +fma.rn.f64 fd419, fd415, fd326, fd418; +mul.f64 fd420, fd326, fd417; +mul.f64 fd421, fd415, fd332; +sub.f64 fd422, fd421, fd420; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd348, fd427; +fma.rn.f64 fd429, fd425, fd342, fd428; +mul.f64 fd430, fd342, fd427; +mul.f64 fd431, fd425, fd348; +sub.f64 fd432, fd431, fd430; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; 
+mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd364, fd437; +fma.rn.f64 fd439, fd435, fd358, fd438; +mul.f64 fd440, fd358, fd437; +mul.f64 fd441, fd435, fd364; +sub.f64 fd442, fd441, fd440; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd370; +st.shared.f64 [r17+144], fd380; +st.shared.f64 [r17+216], fd390; +st.shared.f64 [r17+288], fd400; +st.shared.f64 [r17+360], fd409; +st.shared.f64 [r17+432], fd419; +st.shared.f64 [r17+504], fd429; +st.shared.f64 [r17+576], fd439; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+5832]; +ld.shared.f64 fd445, [r11+11664]; +ld.shared.f64 fd446, [r11+17496]; +ld.shared.f64 fd447, [r11+23328]; +ld.shared.f64 fd448, [r11+29160]; +ld.shared.f64 fd449, [r11+34992]; +ld.shared.f64 fd450, [r11+40824]; +ld.shared.f64 fd451, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+5832]; +ld.shared.f64 fd454, [r11+11664]; +ld.shared.f64 fd455, [r11+17496]; +ld.shared.f64 fd456, [r11+23328]; +ld.shared.f64 fd457, [r11+29160]; +ld.shared.f64 fd458, [r11+34992]; +ld.shared.f64 fd459, [r11+40824]; +ld.shared.f64 fd460, [r11+46656]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0dBFEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, 
fd449; +mul.f64 fd474, fd473, 0dBFEBB67AE8584CAA; +sub.f64 fd475, fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; +mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0dBFEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0dBFEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0dBFEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0dBFEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0d3FE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0d3FE491B7523C161D, fd512; +mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0d3FEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0d3FEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0d3FEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0d3FEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0d3FD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 
0d3FD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 fd530, fd462, fd529; +add.f64 fd531, fd480, fd496; +add.f64 fd532, fd464, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd462, fd533; +sub.f64 fd535, fd480, fd496; +mul.f64 fd536, fd535, 0dBFEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd464, fd539; +sub.f64 fd541, fd478, fd494; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +add.f64 fd545, fd511, fd516; +add.f64 fd546, fd469, fd545; +add.f64 fd547, fd513, fd518; +add.f64 fd548, fd475, fd547; +mul.f64 fd549, fd545, 0d3FE0000000000000; +sub.f64 fd550, fd469, fd549; +sub.f64 fd551, fd513, fd518; +mul.f64 fd552, fd551, 0dBFEBB67AE8584CAA; +add.f64 fd553, fd552, fd550; +sub.f64 fd554, fd550, fd552; +mul.f64 fd555, fd547, 0d3FE0000000000000; +sub.f64 fd556, fd475, fd555; +sub.f64 fd557, fd511, fd516; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +sub.f64 fd559, fd556, fd558; +add.f64 fd560, fd558, fd556; +add.f64 fd561, fd521, fd526; +add.f64 fd562, fd470, fd561; +add.f64 fd563, fd523, fd528; +add.f64 fd564, fd476, fd563; +mul.f64 fd565, fd561, 0d3FE0000000000000; +sub.f64 fd566, fd470, fd565; +sub.f64 fd567, fd523, fd528; +mul.f64 fd568, fd567, 0dBFEBB67AE8584CAA; +add.f64 fd569, fd568, fd566; +sub.f64 fd570, fd566, fd568; +mul.f64 fd571, fd563, 0d3FE0000000000000; +sub.f64 fd572, fd476, fd571; +sub.f64 fd573, fd521, fd526; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +sub.f64 fd575, fd572, fd574; +add.f64 fd576, fd574, fd572; +mul.wide.u32 rd12, r7, -901412889; +shr.u64 rd13, rd12, 38; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 81; +sub.s32 r20, r7, r19; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %21; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd577, fd578}, [rd16]; +mul.f64 fd581, fd548, fd578; +fma.rn.f64 fd582, fd577, fd546, fd581; +mul.f64 fd583, fd546, fd578; +mul.f64 fd584, fd577, 
fd548; +sub.f64 fd585, fd584, fd583; +mul.f64 fd586, fd577, fd577; +mul.f64 fd587, fd578, fd578; +sub.f64 fd588, fd586, fd587; +mul.f64 fd589, fd578, fd577; +fma.rn.f64 fd590, fd578, fd577, fd589; +mul.f64 fd591, fd564, fd590; +fma.rn.f64 fd592, fd588, fd562, fd591; +mul.f64 fd593, fd562, fd590; +mul.f64 fd594, fd588, fd564; +sub.f64 fd595, fd594, fd593; +mul.f64 fd596, fd577, fd588; +mul.f64 fd597, fd578, fd590; +sub.f64 fd598, fd596, fd597; +mul.f64 fd599, fd577, fd590; +fma.rn.f64 fd600, fd578, fd588, fd599; +mul.f64 fd601, fd543, fd600; +fma.rn.f64 fd602, fd598, fd537, fd601; +mul.f64 fd603, fd537, fd600; +mul.f64 fd604, fd598, fd543; +sub.f64 fd605, fd604, fd603; +mul.f64 fd606, fd577, fd598; +mul.f64 fd607, fd578, fd600; +sub.f64 fd608, fd606, fd607; +mul.f64 fd609, fd577, fd600; +fma.rn.f64 fd610, fd578, fd598, fd609; +mul.f64 fd611, fd559, fd610; +fma.rn.f64 fd612, fd608, fd553, fd611; +mul.f64 fd613, fd553, fd610; +mul.f64 fd614, fd608, fd559; +sub.f64 fd615, fd614, fd613; +ld.global.v2.f64 {fd616, fd617}, [rd16+144]; +mul.f64 fd620, fd575, fd617; +fma.rn.f64 fd621, fd616, fd569, fd620; +mul.f64 fd622, fd569, fd617; +mul.f64 fd623, fd616, fd575; +sub.f64 fd624, fd623, fd622; +mul.f64 fd625, fd577, fd616; +mul.f64 fd626, fd578, fd617; +sub.f64 fd627, fd625, fd626; +mul.f64 fd628, fd577, fd617; +fma.rn.f64 fd629, fd578, fd616, fd628; +mul.f64 fd630, fd544, fd629; +fma.rn.f64 fd631, fd627, fd538, fd630; +mul.f64 fd632, fd538, fd629; +mul.f64 fd633, fd627, fd544; +sub.f64 fd634, fd633, fd632; +mul.f64 fd635, fd577, fd627; +mul.f64 fd636, fd578, fd629; +sub.f64 fd637, fd635, fd636; +mul.f64 fd638, fd577, fd629; +fma.rn.f64 fd639, fd578, fd627, fd638; +mul.f64 fd640, fd560, fd639; +fma.rn.f64 fd641, fd637, fd554, fd640; +mul.f64 fd642, fd554, fd639; +mul.f64 fd643, fd637, fd560; +sub.f64 fd644, fd643, fd642; +mul.f64 fd645, fd577, fd637; +mul.f64 fd646, fd578, fd639; +sub.f64 fd647, fd645, fd646; +mul.f64 fd648, fd577, fd639; +fma.rn.f64 fd649, fd578, fd637, 
fd648; +mul.f64 fd650, fd576, fd649; +fma.rn.f64 fd651, fd647, fd570, fd650; +mul.f64 fd652, fd570, fd649; +mul.f64 fd653, fd647, fd576; +sub.f64 fd654, fd653, fd652; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +barrier.sync 0; +mad.lo.s32 r23, r18, 5832, r22; +st.shared.f64 [r23], fd530; +st.shared.f64 [r23+648], fd582; +st.shared.f64 [r23+1296], fd592; +st.shared.f64 [r23+1944], fd602; +st.shared.f64 [r23+2592], fd612; +st.shared.f64 [r23+3240], fd621; +st.shared.f64 [r23+3888], fd631; +st.shared.f64 [r23+4536], fd641; +st.shared.f64 [r23+5184], fd651; +barrier.sync 0; +ld.shared.f64 fd655, [r11]; +ld.shared.f64 fd656, [r11+5832]; +ld.shared.f64 fd657, [r11+11664]; +ld.shared.f64 fd658, [r11+17496]; +ld.shared.f64 fd659, [r11+23328]; +ld.shared.f64 fd660, [r11+29160]; +ld.shared.f64 fd661, [r11+34992]; +ld.shared.f64 fd662, [r11+40824]; +ld.shared.f64 fd663, [r11+46656]; +barrier.sync 0; +st.shared.f64 [r23], fd532; +st.shared.f64 [r23+648], fd585; +st.shared.f64 [r23+1296], fd595; +st.shared.f64 [r23+1944], fd605; +st.shared.f64 [r23+2592], fd615; +st.shared.f64 [r23+3240], fd624; +st.shared.f64 [r23+3888], fd634; +st.shared.f64 [r23+4536], fd644; +st.shared.f64 [r23+5184], fd654; +barrier.sync 0; +ld.shared.f64 fd664, [r11]; +ld.shared.f64 fd665, [r11+5832]; +ld.shared.f64 fd666, [r11+11664]; +ld.shared.f64 fd667, [r11+17496]; +ld.shared.f64 fd668, [r11+23328]; +ld.shared.f64 fd669, [r11+29160]; +ld.shared.f64 fd670, [r11+34992]; +ld.shared.f64 fd671, [r11+40824]; +ld.shared.f64 fd672, [r11+46656]; +add.f64 fd673, fd658, fd661; +add.f64 fd674, fd655, fd673; +add.f64 fd675, fd667, fd670; +add.f64 fd676, fd664, fd675; +mul.f64 fd677, fd673, 0d3FE0000000000000; +sub.f64 fd678, fd655, fd677; +sub.f64 fd679, fd667, fd670; +mul.f64 fd680, fd679, 0dBFEBB67AE8584CAA; +add.f64 fd681, fd680, fd678; +sub.f64 fd682, fd678, fd680; +mul.f64 fd683, fd675, 0d3FE0000000000000; +sub.f64 fd684, fd664, fd683; +sub.f64 fd685, fd658, fd661; +mul.f64 fd686, fd685, 
0dBFEBB67AE8584CAA; +sub.f64 fd687, fd684, fd686; +add.f64 fd688, fd686, fd684; +add.f64 fd689, fd659, fd662; +add.f64 fd690, fd656, fd689; +add.f64 fd691, fd668, fd671; +add.f64 fd692, fd665, fd691; +mul.f64 fd693, fd689, 0d3FE0000000000000; +sub.f64 fd694, fd656, fd693; +sub.f64 fd695, fd668, fd671; +mul.f64 fd696, fd695, 0dBFEBB67AE8584CAA; +add.f64 fd697, fd696, fd694; +sub.f64 fd698, fd694, fd696; +mul.f64 fd699, fd691, 0d3FE0000000000000; +sub.f64 fd700, fd665, fd699; +sub.f64 fd701, fd659, fd662; +mul.f64 fd702, fd701, 0dBFEBB67AE8584CAA; +sub.f64 fd703, fd700, fd702; +add.f64 fd704, fd702, fd700; +add.f64 fd705, fd660, fd663; +add.f64 fd706, fd657, fd705; +add.f64 fd707, fd669, fd672; +add.f64 fd708, fd666, fd707; +mul.f64 fd709, fd705, 0d3FE0000000000000; +sub.f64 fd710, fd657, fd709; +sub.f64 fd711, fd669, fd672; +mul.f64 fd712, fd711, 0dBFEBB67AE8584CAA; +add.f64 fd713, fd712, fd710; +sub.f64 fd714, fd710, fd712; +mul.f64 fd715, fd707, 0d3FE0000000000000; +sub.f64 fd716, fd666, fd715; +sub.f64 fd717, fd660, fd663; +mul.f64 fd718, fd717, 0dBFEBB67AE8584CAA; +sub.f64 fd719, fd716, fd718; +add.f64 fd720, fd718, fd716; +mul.f64 fd721, fd697, 0d3FE8836FA2CF5039; +mul.f64 fd722, fd703, 0d3FE491B7523C161D; +sub.f64 fd723, fd721, fd722; +mul.f64 fd724, fd703, 0d3FE8836FA2CF5039; +fma.rn.f64 fd725, fd697, 0d3FE491B7523C161D, fd724; +mul.f64 fd726, fd713, 0d3FC63A1A7E0B738A; +mul.f64 fd727, fd719, 0d3FEF838B8C811C17; +sub.f64 fd728, fd726, fd727; +mul.f64 fd729, fd719, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd730, fd713, 0d3FEF838B8C811C17, fd729; +mul.f64 fd731, fd698, 0d3FC63A1A7E0B738A; +mul.f64 fd732, fd704, 0d3FEF838B8C811C17; +sub.f64 fd733, fd731, fd732; +mul.f64 fd734, fd704, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd735, fd698, 0d3FEF838B8C811C17, fd734; +mul.f64 fd736, fd714, 0dBFEE11F642522D1C; +mul.f64 fd737, fd720, 0d3FD5E3A8748A0BF5; +sub.f64 fd738, fd736, fd737; +mul.f64 fd739, fd720, 0dBFEE11F642522D1C; +fma.rn.f64 fd740, fd714, 0d3FD5E3A8748A0BF5, fd739; 
+add.f64 fd741, fd690, fd706; +add.f64 fd742, fd692, fd708; +mul.f64 fd743, fd741, 0d3FE0000000000000; +sub.f64 fd744, fd674, fd743; +sub.f64 fd745, fd692, fd708; +mul.f64 fd746, fd745, 0dBFEBB67AE8584CAA; +mul.f64 fd747, fd742, 0d3FE0000000000000; +sub.f64 fd748, fd676, fd747; +sub.f64 fd749, fd690, fd706; +mul.f64 fd750, fd749, 0dBFEBB67AE8584CAA; +add.f64 fd751, fd723, fd728; +add.f64 fd752, fd725, fd730; +mul.f64 fd753, fd751, 0d3FE0000000000000; +sub.f64 fd754, fd681, fd753; +sub.f64 fd755, fd725, fd730; +mul.f64 fd756, fd755, 0dBFEBB67AE8584CAA; +mul.f64 fd757, fd752, 0d3FE0000000000000; +sub.f64 fd758, fd687, fd757; +sub.f64 fd759, fd723, fd728; +mul.f64 fd760, fd759, 0dBFEBB67AE8584CAA; +add.f64 fd761, fd733, fd738; +add.f64 fd762, fd735, fd740; +mul.f64 fd763, fd761, 0d3FE0000000000000; +sub.f64 fd764, fd682, fd763; +sub.f64 fd765, fd735, fd740; +mul.f64 fd766, fd765, 0dBFEBB67AE8584CAA; +mul.f64 fd767, fd762, 0d3FE0000000000000; +sub.f64 fd768, fd688, fd767; +sub.f64 fd769, fd733, fd738; +mul.f64 fd770, fd769, 0dBFEBB67AE8584CAA; +add.f64 %0, fd674, fd741; +add.f64 %1, fd676, fd742; +add.f64 %3, fd687, fd752; +add.f64 %2, fd681, fd751; +add.f64 %5, fd688, fd762; +add.f64 %4, fd682, fd761; +add.f64 %6, fd746, fd744; +sub.f64 %7, fd748, fd750; +sub.f64 %9, fd758, fd760; +add.f64 %8, fd756, fd754; +sub.f64 %11, fd768, fd770; +add.f64 %10, fd766, fd764; +sub.f64 %12, fd744, fd746; +add.f64 %13, fd750, fd748; +add.f64 %15, fd760, fd758; +sub.f64 %14, fd754, fd756; +add.f64 %17, fd770, fd768; +sub.f64 %16, fd764, fd766; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_6561), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), 
"d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..e76943f4bde92 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_fwd.hpp.inc @@ -0,0 +1,293 @@ +#ifndef CUFFTDX_FFT_6_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_6_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<929, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<19>; +.reg .b32 r<237>; +.reg .f64 fd<15>; +.reg .b64 rd<2>; +mov.f64 fd7, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd7; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd8, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd8; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %17, %13; +} +{ +add.f16x2 r4, %18, r1; +} +{ +add.f16x2 r7, %19, %14; +} +{ +add.f16x2 r10, %20, r7; +} +{ +add.f16x2 r13, %17, %13; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %18, r16; +} +{ +sub.f16x2 r22, %19, %14; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %17, %13; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %18, r34; +} +{ +sub.f16x2 r40, %19, %14; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %19, %14; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %20, r52; +} +{ +sub.f16x2 r58, %17, %13; +} +{ +mul.f16x2 r61, r58, 
r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %19, %14; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %20, r70; +} +{ +sub.f16x2 r76, %17, %13; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd7; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd8; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %21, %15; +} +{ +add.f16x2 r88, %22, r85; +} +{ +add.f16x2 r91, %12, %16; +} +{ +add.f16x2 r94, %23, r91; +} +{ +add.f16x2 r97, %21, %15; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %22, r100; +} +{ +sub.f16x2 r106, %12, %16; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %21, %15; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %22, r118; +} +{ +sub.f16x2 r124, %12, %16; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %12, %16; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %23, r136; +} +{ +sub.f16x2 r142, %21, %15; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %12, %16; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %23, r154; +} +{ +sub.f16x2 r160, %21, %15; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd5, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs9, fd5; +} +{ +cvt.rn.f16.f64 rs10, fd8; +} +{ +cvt.rn.f16.f64 rs11, fd7; +} +{ +cvt.rn.f16.f64 rs12, fd8; +} +mov.b32 r183, {rs9, rs9}; +{ +mul.f16x2 r169, r112, r183; +} +mov.b32 r180, {rs10, rs10}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs11, rs11}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs12, rs12}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ +fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 %0, r4, r88; +} 
+{ +add.f16x2 %1, r10, r94; +} +{ +sub.f16x2 %6, r4, r88; +} +{ +sub.f16x2 %7, r10, r94; +} +{ +add.f16x2 %2, r28, r175; +} +{ +add.f16x2 %3, r64, r181; +} +{ +sub.f16x2 %8, r28, r175; +} +{ +sub.f16x2 %9, r64, r181; +} +{ +add.f16x2 %4, r46, r191; +} +{ +add.f16x2 %5, r82, r197; +} +{ +sub.f16x2 %10, r46, r191; +} +{ +sub.f16x2 %11, r82, r197; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ec71a933dd8a1 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp16_inv.hpp.inc @@ -0,0 +1,288 @@ +#ifndef CUFFTDX_FFT_6_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_6_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1131, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<15>; +.reg .b32 r<237>; +.reg .f64 fd<15>; +.reg .b64 rd<2>; +mov.f64 fd7, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, 
fd7; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd4, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd4; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %17, %13; +} +{ +add.f16x2 r4, %18, r1; +} +{ +add.f16x2 r7, %19, %14; +} +{ +add.f16x2 r10, %20, r7; +} +{ +add.f16x2 r13, %17, %13; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %18, r16; +} +{ +sub.f16x2 r22, %19, %14; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %17, %13; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %18, r34; +} +{ +sub.f16x2 r40, %19, %14; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %19, %14; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %20, r52; +} +{ +sub.f16x2 r58, %17, %13; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %19, %14; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %20, r70; +} +{ +sub.f16x2 r76, %17, %13; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd7; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd4; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %21, %15; +} +{ +add.f16x2 r88, %22, r85; +} +{ +add.f16x2 r91, %12, %16; +} +{ +add.f16x2 r94, %23, r91; +} +{ +add.f16x2 r97, %21, %15; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %22, r100; +} +{ +sub.f16x2 r106, %12, %16; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %21, %15; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %22, r118; +} +{ +sub.f16x2 r124, %12, %16; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %12, %16; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %23, r136; +} +{ +sub.f16x2 r142, %21, %15; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %12, %16; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %23, r154; +} +{ +sub.f16x2 r160, 
%21, %15; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +mov.f64 fd5, 0d3FE0000000000000; +{ +cvt.rn.f16.f64 rs5, fd5; +} +mov.f64 fd8, 0d3FEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs6, fd8; +} +{ +cvt.rn.f16.f64 rs7, fd7; +} +{ +cvt.rn.f16.f64 rs8, fd8; +} +mov.b32 r183, {rs5, rs5}; +{ +mul.f16x2 r169, r112, r183; +} +mov.b32 r180, {rs6, rs6}; +{ +mul.f16x2 r172, r148, r180; +} +{ +sub.f16x2 r175, r169, r172; +} +{ +mul.f16x2 r178, r112, r180; +} +{ +fma.rn.f16x2 r181, r148, r183, r178; +} +mov.b32 r199, {rs7, rs7}; +{ +mul.f16x2 r185, r130, r199; +} +mov.b32 r196, {rs8, rs8}; +{ +mul.f16x2 r188, r166, r196; +} +{ +sub.f16x2 r191, r185, r188; +} +{ +mul.f16x2 r194, r130, r196; +} +{ +fma.rn.f16x2 r197, r166, r199, r194; +} +{ +add.f16x2 %0, r4, r88; +} +{ +add.f16x2 %1, r10, r94; +} +{ +sub.f16x2 %6, r4, r88; +} +{ +sub.f16x2 %7, r10, r94; +} +{ +add.f16x2 %2, r28, r175; +} +{ +add.f16x2 %3, r64, r181; +} +{ +sub.f16x2 %8, r28, r175; +} +{ +sub.f16x2 %9, r64, r181; +} +{ +add.f16x2 %4, r46, r191; +} +{ +add.f16x2 %5, r82, r197; +} +{ +sub.f16x2 %10, r46, r191; +} +{ +sub.f16x2 %11, r82, r197; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)): "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..45a4efd0fab73 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_fwd.hpp.inc @@ -0,0 +1,70 @@ +#ifndef CUFFTDX_FFT_6_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_6_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<183, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<79>; +.reg .b64 rd<2>; +add.f32 f25, %17, %22; +add.f32 f26, %12, f25; +add.f32 f27, %19, %24; +add.f32 f28, %13, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %12, f29; +sub.f32 f31, %19, %24; +mul.f32 f32, f31, 0f3F5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %13, f35; +sub.f32 f37, %17, %22; +mul.f32 f38, f37, 0f3F5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %20, %25; +add.f32 f42, %14, f41; +add.f32 f43, %21, %26; +add.f32 f44, %16, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %14, f45; +sub.f32 f47, %21, %26; +mul.f32 f48, f47, 0f3F5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %16, f51; +sub.f32 f53, %20, %25; +mul.f32 f54, f53, 0f3F5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0fBF5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0fBF5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0fBF5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0fBF5DB3D7, f65; +add.f32 %1, f28, f44; +add.f32 %0, f26, f42; +add.f32 %3, f39, f61; +add.f32 %2, f33, f59; +add.f32 %5, f40, f66; 
+add.f32 %4, f34, f64; +sub.f32 %7, f28, f44; +sub.f32 %6, f26, f42; +sub.f32 %9, f39, f61; +sub.f32 %8, f33, f59; +sub.f32 %11, f40, f66; +sub.f32 %10, f34, f64; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..624105d221d86 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp32_inv.hpp.inc @@ -0,0 +1,70 @@ +#ifndef CUFFTDX_FFT_6_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_6_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<385, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<79>; +.reg .b64 rd<2>; +add.f32 f25, %17, %22; +add.f32 f26, %12, f25; +add.f32 f27, %19, %24; +add.f32 f28, %13, f27; +mul.f32 f29, f25, 0f3F000000; +sub.f32 f30, %12, f29; +sub.f32 f31, %19, %24; +mul.f32 f32, f31, 0fBF5DB3D7; +add.f32 f33, f32, f30; +sub.f32 f34, f30, f32; +mul.f32 f35, f27, 0f3F000000; +sub.f32 f36, %13, f35; +sub.f32 f37, %17, %22; +mul.f32 f38, f37, 0fBF5DB3D7; +sub.f32 f39, f36, f38; +add.f32 f40, f38, f36; +add.f32 f41, %20, %25; +add.f32 f42, %14, f41; +add.f32 f43, %21, %26; +add.f32 f44, %16, f43; +mul.f32 f45, f41, 0f3F000000; +sub.f32 f46, %14, f45; +sub.f32 f47, %21, %26; +mul.f32 f48, f47, 
0fBF5DB3D7; +add.f32 f49, f48, f46; +sub.f32 f50, f46, f48; +mul.f32 f51, f43, 0f3F000000; +sub.f32 f52, %16, f51; +sub.f32 f53, %20, %25; +mul.f32 f54, f53, 0fBF5DB3D7; +sub.f32 f55, f52, f54; +add.f32 f56, f54, f52; +mul.f32 f57, f49, 0f3F000000; +mul.f32 f58, f55, 0f3F5DB3D7; +sub.f32 f59, f57, f58; +mul.f32 f60, f55, 0f3F000000; +fma.rn.f32 f61, f49, 0f3F5DB3D7, f60; +mul.f32 f62, f50, 0fBF000000; +mul.f32 f63, f56, 0f3F5DB3D7; +sub.f32 f64, f62, f63; +mul.f32 f65, f56, 0fBF000000; +fma.rn.f32 f66, f50, 0f3F5DB3D7, f65; +add.f32 %1, f28, f44; +add.f32 %0, f26, f42; +add.f32 %3, f39, f61; +add.f32 %2, f33, f59; +add.f32 %5, f40, f66; +add.f32 %4, f34, f64; +sub.f32 %7, f28, f44; +sub.f32 %6, f26, f42; +sub.f32 %9, f39, f61; +sub.f32 %8, f33, f59; +sub.f32 %11, f40, f66; +sub.f32 %10, f34, f64; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..95cf78fbe7d0d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_fwd.hpp.inc @@ -0,0 +1,70 @@ +#ifndef CUFFTDX_FFT_6_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_6_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<558, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ 
+.reg .f64 fd<79>; +.reg .b64 rd<2>; +add.f64 fd25, %17, %22; +add.f64 fd26, %12, fd25; +add.f64 fd27, %19, %24; +add.f64 fd28, %13, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %12, fd29; +sub.f64 fd31, %19, %24; +mul.f64 fd32, fd31, 0d3FEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %13, fd35; +sub.f64 fd37, %17, %22; +mul.f64 fd38, fd37, 0d3FEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %20, %25; +add.f64 fd42, %14, fd41; +add.f64 fd43, %21, %26; +add.f64 fd44, %16, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %14, fd45; +sub.f64 fd47, %21, %26; +mul.f64 fd48, fd47, 0d3FEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %16, fd51; +sub.f64 fd53, %20, %25; +mul.f64 fd54, fd53, 0d3FEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0dBFEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0dBFEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0dBFEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0dBFEBB67AE8584CAA, fd65; +add.f64 %1, fd28, fd44; +add.f64 %0, fd26, fd42; +add.f64 %3, fd39, fd61; +add.f64 %2, fd33, fd59; +add.f64 %5, fd40, fd66; +add.f64 %4, fd34, fd64; +sub.f64 %7, fd28, fd44; +sub.f64 %6, fd26, fd42; +sub.f64 %9, fd39, fd61; +sub.f64 %8, fd33, fd59; +sub.f64 %11, fd40, fd66; +sub.f64 %10, fd34, fd64; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), 
"d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..0599e5fe23b42 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_6_fp64_inv.hpp.inc @@ -0,0 +1,70 @@ +#ifndef CUFFTDX_FFT_6_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_6_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<729, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<79>; +.reg .b64 rd<2>; +add.f64 fd25, %17, %22; +add.f64 fd26, %12, fd25; +add.f64 fd27, %19, %24; +add.f64 fd28, %13, fd27; +mul.f64 fd29, fd25, 0d3FE0000000000000; +sub.f64 fd30, %12, fd29; +sub.f64 fd31, %19, %24; +mul.f64 fd32, fd31, 0dBFEBB67AE8584CAA; +add.f64 fd33, fd32, fd30; +sub.f64 fd34, fd30, fd32; +mul.f64 fd35, fd27, 0d3FE0000000000000; +sub.f64 fd36, %13, fd35; +sub.f64 fd37, %17, %22; +mul.f64 fd38, fd37, 0dBFEBB67AE8584CAA; +sub.f64 fd39, fd36, fd38; +add.f64 fd40, fd38, fd36; +add.f64 fd41, %20, %25; +add.f64 fd42, %14, fd41; +add.f64 fd43, %21, %26; +add.f64 fd44, %16, fd43; +mul.f64 fd45, fd41, 0d3FE0000000000000; +sub.f64 fd46, %14, fd45; +sub.f64 fd47, %21, %26; +mul.f64 fd48, fd47, 0dBFEBB67AE8584CAA; +add.f64 fd49, fd48, fd46; +sub.f64 fd50, fd46, fd48; +mul.f64 fd51, fd43, 0d3FE0000000000000; +sub.f64 fd52, %16, fd51; +sub.f64 fd53, %20, %25; +mul.f64 fd54, fd53, 0dBFEBB67AE8584CAA; +sub.f64 fd55, fd52, fd54; +add.f64 fd56, fd54, fd52; +mul.f64 fd57, fd49, 0d3FE0000000000000; +mul.f64 fd58, fd55, 0d3FEBB67AE8584CAA; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, 
fd55, 0d3FE0000000000000; +fma.rn.f64 fd61, fd49, 0d3FEBB67AE8584CAA, fd60; +mul.f64 fd62, fd50, 0dBFE0000000000000; +mul.f64 fd63, fd56, 0d3FEBB67AE8584CAA; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd56, 0dBFE0000000000000; +fma.rn.f64 fd66, fd50, 0d3FEBB67AE8584CAA, fd65; +add.f64 %1, fd28, fd44; +add.f64 %0, fd26, fd42; +add.f64 %3, fd39, fd61; +add.f64 %2, fd33, fd59; +add.f64 %5, fd40, fd66; +add.f64 %4, fd34, fd64; +sub.f64 %7, fd28, fd44; +sub.f64 %6, fd26, fd42; +sub.f64 %9, fd39, fd61; +sub.f64 %8, fd33, fd59; +sub.f64 %11, fd40, fd66; +sub.f64 %10, fd34, fd64; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..2be11c8316238 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_fwd.hpp.inc @@ -0,0 +1,24951 @@ +#ifndef CUFFTDX_FFT_729_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_729_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<884, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<676>; +.reg .b32 r<6883>; +.reg .b64 rd<4>; +mov.u32 r6815, %54; +mov.u32 r6882, %tid.y; +mad.lo.s32 r6816, r6882, 5832, r6815; +mov.u32 r6817, %tid.x; +mov.f32 f670, 0fBF000000; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1, {low, high}; +} +mov.f32 f672, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} 
+{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r265, {low, high}; +} +mov.f32 f544, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r266, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r267, {low, high}; +} +mov.f32 f556, 0fBF7C1C5C; +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r268, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r271, {low, high}; +} +mov.f32 f580, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 
r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 
r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, 
r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; 
+cvt.rn.f16.f32 high, f672; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, 
r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, 
r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %82, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %82, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %82, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %82, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, 
r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %82, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; 
+cvt.rn.f16.f32 high, f578; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ 
+mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ 
+add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, 
high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 
f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, 
r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} 
+{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, 
r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2573, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ 
+mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r6817, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6818, rd3; +sub.s32 r6819, r6817, r6818; +shr.u32 r6820, r6819, 1; +add.s32 r6821, r6820, r6818; +shr.u32 r6822, r6821, 4; +mul.lo.s32 r6823, r6822, 27; +sub.s32 r6824, r6817, r6823; +cvt.rn.f32.u32 f673, r6824; +mul.f32 f674, f673, 0f3C0D3654; +cos.approx.f32 f309, f674; +sin.approx.f32 f675, f674; +neg.f32 f310, f675; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ 
+fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 
r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 
r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ 
+mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 
r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; 
+cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, 
r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ 
+fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r6825, r6822, 5832, r6816; +barrier.sync 0; +mad.lo.s32 r6826, r6824, 216, r6825; +st.shared.v2.f32 [r6826], {r2140, 
r2146}; +st.shared.v2.f32 [r6826+8], {r2937, r2944}; +st.shared.v2.f32 [r6826+16], {r2974, r2981}; +st.shared.v2.f32 [r6826+24], {r3011, r3018}; +st.shared.v2.f32 [r6826+32], {r3048, r3055}; +st.shared.v2.f32 [r6826+40], {r3085, r3092}; +st.shared.v2.f32 [r6826+48], {r3122, r3129}; +st.shared.v2.f32 [r6826+56], {r3159, r3166}; +st.shared.v2.f32 [r6826+64], {r3196, r3203}; +st.shared.v2.f32 [r6826+72], {r3233, r3240}; +st.shared.v2.f32 [r6826+80], {r3270, r3277}; +st.shared.v2.f32 [r6826+88], {r3307, r3314}; +st.shared.v2.f32 [r6826+96], {r3344, r3351}; +st.shared.v2.f32 [r6826+104], {r3381, r3388}; +st.shared.v2.f32 [r6826+112], {r3418, r3425}; +st.shared.v2.f32 [r6826+120], {r3455, r3462}; +st.shared.v2.f32 [r6826+128], {r3492, r3499}; +st.shared.v2.f32 [r6826+136], {r3529, r3536}; +st.shared.v2.f32 [r6826+144], {r3566, r3573}; +st.shared.v2.f32 [r6826+152], {r3603, r3610}; +st.shared.v2.f32 [r6826+160], {r3640, r3647}; +st.shared.v2.f32 [r6826+168], {r3677, r3684}; +st.shared.v2.f32 [r6826+176], {r3714, r3721}; +st.shared.v2.f32 [r6826+184], {r3751, r3758}; +st.shared.v2.f32 [r6826+192], {r3788, r3795}; +st.shared.v2.f32 [r6826+200], {r3825, r3832}; +st.shared.v2.f32 [r6826+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r6827, r6824, -208, r6826; +ld.shared.u32 r3898, [r6827]; +ld.shared.u32 r3904, [r6827+4]; +ld.shared.u32 r4506, [r6827+216]; +ld.shared.u32 r4512, [r6827+220]; +ld.shared.u32 r5114, [r6827+432]; +ld.shared.u32 r5120, [r6827+436]; +ld.shared.u32 r3986, [r6827+648]; +ld.shared.u32 r3992, [r6827+652]; +ld.shared.u32 r4594, [r6827+864]; +ld.shared.u32 r4600, [r6827+868]; +ld.shared.u32 r5202, [r6827+1080]; +ld.shared.u32 r5208, [r6827+1084]; +ld.shared.u32 r4074, [r6827+1296]; +ld.shared.u32 r4080, [r6827+1300]; +ld.shared.u32 r4682, [r6827+1512]; +ld.shared.u32 r4688, [r6827+1516]; +ld.shared.u32 r5290, [r6827+1728]; +ld.shared.u32 r5296, [r6827+1732]; +ld.shared.u32 r3895, [r6827+1944]; +ld.shared.u32 r3901, [r6827+1948]; +ld.shared.u32 r4503, 
[r6827+2160]; +ld.shared.u32 r4509, [r6827+2164]; +ld.shared.u32 r5111, [r6827+2376]; +ld.shared.u32 r5117, [r6827+2380]; +ld.shared.u32 r3983, [r6827+2592]; +ld.shared.u32 r3989, [r6827+2596]; +ld.shared.u32 r4591, [r6827+2808]; +ld.shared.u32 r4597, [r6827+2812]; +ld.shared.u32 r5199, [r6827+3024]; +ld.shared.u32 r5205, [r6827+3028]; +ld.shared.u32 r4071, [r6827+3240]; +ld.shared.u32 r4077, [r6827+3244]; +ld.shared.u32 r4679, [r6827+3456]; +ld.shared.u32 r4685, [r6827+3460]; +ld.shared.u32 r5287, [r6827+3672]; +ld.shared.u32 r5293, [r6827+3676]; +ld.shared.u32 r3896, [r6827+3888]; +ld.shared.u32 r3902, [r6827+3892]; +ld.shared.u32 r4504, [r6827+4104]; +ld.shared.u32 r4510, [r6827+4108]; +ld.shared.u32 r5112, [r6827+4320]; +ld.shared.u32 r5118, [r6827+4324]; +ld.shared.u32 r3984, [r6827+4536]; +ld.shared.u32 r3990, [r6827+4540]; +ld.shared.u32 r4592, [r6827+4752]; +ld.shared.u32 r4598, [r6827+4756]; +ld.shared.u32 r5200, [r6827+4968]; +ld.shared.u32 r5206, [r6827+4972]; +ld.shared.u32 r4072, [r6827+5184]; +ld.shared.u32 r4078, [r6827+5188]; +ld.shared.u32 r4680, [r6827+5400]; +ld.shared.u32 r4686, [r6827+5404]; +ld.shared.u32 r5288, [r6827+5616]; +ld.shared.u32 r5294, [r6827+5620]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ 
+mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, 
r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ +fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, 
r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; 
+cvt.rn.f16.f32 high, f670; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, 
r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, 
r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 
r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ 
+add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ 
+add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5373, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ 
+sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} 
+{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5718, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, 
r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 %0, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 %1, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 %18, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} 
+{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 %36, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 %19, r6080, r6086; +} +{ +add.f16x2 r6092, r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 %37, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 %2, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 %3, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 %20, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 %38, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 %21, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 %39, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; 
+cvt.rn.f16.f32 high, f670; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 %4, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 %5, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ +add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 %22, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 %40, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 %23, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 %41, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 %6, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 %7, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 %24, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ 
+sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 %42, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 %25, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 %43, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 %8, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 %9, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 %26, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 %44, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 %27, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 %45, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 %10, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 %11, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ +mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 %28, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 %46, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 %29, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 %47, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 %12, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 %13, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 %30, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ 
+sub.f16x2 %48, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 %31, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 r6632, r6629, r6552; +} +{ +add.f16x2 %49, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 %14, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 %15, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 %32, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 %50, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 %33, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 %51, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6727, 
{low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 %16, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 %17, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 %34, r6748, r6754; +} +{ +add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 %52, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 %35, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 %53, r6802, r6808; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), 
"=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), 
"r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<885, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<676>; +.reg .b32 r<6883>; +.reg .b64 rd<4>; +mov.u32 r6815, %54; +mov.u32 r6882, %tid.y; +mad.lo.s32 r6816, r6882, 2916, r6815; +mov.u32 r6817, %tid.x; +mov.f32 f670, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1, {low, high}; +} +mov.f32 f672, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %92, %85; +} +{ +add.f16x2 r8, %68, r5; +} +{ +add.f16x2 r11, %100, %91; +} +{ +add.f16x2 r14, %76, r11; +} +{ +add.f16x2 r17, %92, %85; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %68, r20; +} +{ +sub.f16x2 r26, %100, %91; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %92, %85; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %68, r38; +} +{ +sub.f16x2 r44, %100, %91; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %100, %91; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %76, r56; +} +{ +sub.f16x2 r62, %92, %85; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %100, %91; 
+} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %76, r74; +} +{ +sub.f16x2 r80, %92, %85; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %77, %67; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %84, %75; +} +{ +add.f16x2 r102, %58, r99; +} +{ +add.f16x2 r105, %77, %67; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %84, %75; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %77, %67; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %84, %75; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %84, %75; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %58, r144; +} +{ +sub.f16x2 r150, %77, %67; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %84, %75; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %58, r162; +} +{ +sub.f16x2 r168, %77, %67; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %59, %105; +} +{ +add.f16x2 r184, %90, r181; +} +{ +add.f16x2 r187, %66, %57; +} +{ +add.f16x2 r190, %99, r187; +} +{ +add.f16x2 r193, %59, %105; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %90, r196; +} +{ +sub.f16x2 r202, %66, %57; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %59, %105; +} +{ +mul.f16x2 r214, 
r211, r177; +} +{ +add.f16x2 r217, %90, r214; +} +{ +sub.f16x2 r220, %66, %57; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %66, %57; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %99, r232; +} +{ +sub.f16x2 r238, %59, %105; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %66, %57; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %99, r250; +} +{ +sub.f16x2 r256, %59, %105; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r265, {low, high}; +} +mov.f32 f544, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r266, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r267, {low, high}; +} +mov.f32 f556, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r268, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r271, {low, high}; +} +mov.f32 f580, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, 
r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, 
r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 
r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %95, %87; +} +{ +add.f16x2 r616, %71, r613; +} +{ +add.f16x2 r619, %102, %94; +} +{ +add.f16x2 r622, %79, r619; +} +{ +add.f16x2 r625, %95, %87; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %71, r628; +} +{ +sub.f16x2 r634, %102, %94; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %95, %87; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %71, r646; +} +{ +sub.f16x2 r652, %102, %94; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %102, %94; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %79, r664; +} +{ +sub.f16x2 r670, %95, %87; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %102, %94; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %79, r682; +} +{ +sub.f16x2 r688, %95, %87; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %80, %70; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %86, %78; +} +{ +add.f16x2 r710, %61, r707; +} +{ +add.f16x2 r713, %80, %70; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %86, %78; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %80, %70; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %86, %78; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %86, %78; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %61, r752; +} +{ +sub.f16x2 r758, %80, %70; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, 
r755, r761; +} +{ +add.f16x2 r767, %86, %78; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %61, r770; +} +{ +sub.f16x2 r776, %80, %70; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %62, %107; +} +{ +add.f16x2 r792, %93, r789; +} +{ +add.f16x2 r795, %69, %60; +} +{ +add.f16x2 r798, %101, r795; +} +{ +add.f16x2 r801, %62, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %93, r804; +} +{ +sub.f16x2 r810, %69, %60; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %62, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %93, r822; +} +{ +sub.f16x2 r828, %69, %60; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %69, %60; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %101, r840; +} +{ +sub.f16x2 r846, %62, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %69, %60; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %101, r858; +} +{ +sub.f16x2 r864, %62, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r879, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 
r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, 
r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %98, %89; +} +{ +add.f16x2 r1224, %74, r1221; +} +{ +add.f16x2 r1227, %104, %97; +} +{ +add.f16x2 r1230, %83, r1227; +} +{ +add.f16x2 r1233, %98, %89; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %74, r1236; +} +{ +sub.f16x2 r1242, %104, %97; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %98, %89; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %74, r1254; +} +{ +sub.f16x2 r1260, %104, %97; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %104, %97; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %83, r1272; +} +{ +sub.f16x2 r1278, %98, %89; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %104, %97; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %83, r1290; +} +{ +sub.f16x2 r1296, %98, %89; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %82, %73; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %88, %81; +} +{ +add.f16x2 r1318, %64, r1315; +} +{ +add.f16x2 r1321, %82, %73; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %88, %81; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %82, %73; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %88, %81; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %88, %81; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %64, r1360; +} +{ +sub.f16x2 r1366, %82, %73; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %88, %81; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %64, r1378; +} +{ +sub.f16x2 r1384, %82, %73; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %65, %55; +} +{ +add.f16x2 r1400, %96, r1397; +} +{ +add.f16x2 r1403, %72, %63; +} +{ +add.f16x2 r1406, %103, r1403; +} +{ +add.f16x2 r1409, %65, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %96, r1412; +} +{ +sub.f16x2 r1418, %72, %63; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %65, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %96, r1430; +} +{ +sub.f16x2 r1436, %72, %63; +} +{ +mul.f16x2 
r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %72, %63; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %103, r1448; +} +{ +sub.f16x2 r1454, %65, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %72, %63; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %103, r1466; +} +{ +sub.f16x2 r1472, %65, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, 
r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1825, {low, high}; +} +mov.f32 f536, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1826, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1827, {low, high}; +} +mov.f32 f540, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1830, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1831, {low, high}; +} +mov.f32 f548, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1832, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1833, {low, high}; +} +mov.f32 f552, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1836, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1837, {low, high}; +} +mov.f32 f560, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1838, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1839, {low, high}; +} +mov.f32 f564, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1840, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1843, {low, high}; +} +mov.f32 f572, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1844, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1848, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1851, {low, high}; +} +mov.f32 f588, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1855, {low, high}; +} +mov.f32 f596, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} 
+{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} 
+{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ 
+add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ 
+sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 
r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 
r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, 
r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, 
r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r6817, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6818, rd3; +sub.s32 r6819, r6817, r6818; +shr.u32 r6820, r6819, 1; +add.s32 r6821, r6820, r6818; +shr.u32 r6822, r6821, 4; +mul.lo.s32 r6823, r6822, 27; +sub.s32 r6824, r6817, r6823; +mad.lo.s32 r6825, r6822, 2916, r6816; +cvt.rn.f32.u32 f673, r6824; +mul.f32 f674, f673, 0f3C0D3654; +cos.approx.f32 f309, f674; +sin.approx.f32 f675, f674; +neg.f32 f310, f675; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, 
r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; 
+mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, 
r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 
r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 
r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ 
+mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 
r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r6826, r6824, 108, r6825; +st.shared.u32 [r6826], r2140; +st.shared.u32 [r6826+4], r2937; +st.shared.u32 [r6826+8], r2974; +st.shared.u32 [r6826+12], r3011; +st.shared.u32 [r6826+16], r3048; +st.shared.u32 [r6826+20], r3085; +st.shared.u32 [r6826+24], r3122; +st.shared.u32 [r6826+28], r3159; +st.shared.u32 [r6826+32], r3196; +st.shared.u32 [r6826+36], r3233; +st.shared.u32 [r6826+40], r3270; +st.shared.u32 [r6826+44], r3307; +st.shared.u32 [r6826+48], r3344; +st.shared.u32 [r6826+52], r3381; +st.shared.u32 [r6826+56], r3418; +st.shared.u32 [r6826+60], r3455; +st.shared.u32 [r6826+64], r3492; +st.shared.u32 [r6826+68], r3529; +st.shared.u32 [r6826+72], r3566; +st.shared.u32 [r6826+76], r3603; +st.shared.u32 [r6826+80], r3640; +st.shared.u32 [r6826+84], r3677; +st.shared.u32 [r6826+88], r3714; +st.shared.u32 [r6826+92], r3751; +st.shared.u32 [r6826+96], r3788; +st.shared.u32 [r6826+100], r3825; +st.shared.u32 [r6826+104], r3862; +barrier.sync 0; +mad.lo.s32 r6827, r6824, -104, r6826; +ld.shared.u32 r3898, [r6827]; +ld.shared.u32 r4506, [r6827+108]; 
+ld.shared.u32 r5114, [r6827+216]; +ld.shared.u32 r3986, [r6827+324]; +ld.shared.u32 r4594, [r6827+432]; +ld.shared.u32 r5202, [r6827+540]; +ld.shared.u32 r4074, [r6827+648]; +ld.shared.u32 r4682, [r6827+756]; +ld.shared.u32 r5290, [r6827+864]; +ld.shared.u32 r3895, [r6827+972]; +ld.shared.u32 r4503, [r6827+1080]; +ld.shared.u32 r5111, [r6827+1188]; +ld.shared.u32 r3983, [r6827+1296]; +ld.shared.u32 r4591, [r6827+1404]; +ld.shared.u32 r5199, [r6827+1512]; +ld.shared.u32 r4071, [r6827+1620]; +ld.shared.u32 r4679, [r6827+1728]; +ld.shared.u32 r5287, [r6827+1836]; +ld.shared.u32 r3896, [r6827+1944]; +ld.shared.u32 r4504, [r6827+2052]; +ld.shared.u32 r5112, [r6827+2160]; +ld.shared.u32 r3984, [r6827+2268]; +ld.shared.u32 r4592, [r6827+2376]; +ld.shared.u32 r5200, [r6827+2484]; +ld.shared.u32 r4072, [r6827+2592]; +ld.shared.u32 r4680, [r6827+2700]; +ld.shared.u32 r5288, [r6827+2808]; +barrier.sync 0; +st.shared.u32 [r6826], r2146; +st.shared.u32 [r6826+4], r2944; +st.shared.u32 [r6826+8], r2981; +st.shared.u32 [r6826+12], r3018; +st.shared.u32 [r6826+16], r3055; +st.shared.u32 [r6826+20], r3092; +st.shared.u32 [r6826+24], r3129; +st.shared.u32 [r6826+28], r3166; +st.shared.u32 [r6826+32], r3203; +st.shared.u32 [r6826+36], r3240; +st.shared.u32 [r6826+40], r3277; +st.shared.u32 [r6826+44], r3314; +st.shared.u32 [r6826+48], r3351; +st.shared.u32 [r6826+52], r3388; +st.shared.u32 [r6826+56], r3425; +st.shared.u32 [r6826+60], r3462; +st.shared.u32 [r6826+64], r3499; +st.shared.u32 [r6826+68], r3536; +st.shared.u32 [r6826+72], r3573; +st.shared.u32 [r6826+76], r3610; +st.shared.u32 [r6826+80], r3647; +st.shared.u32 [r6826+84], r3684; +st.shared.u32 [r6826+88], r3721; +st.shared.u32 [r6826+92], r3758; +st.shared.u32 [r6826+96], r3795; +st.shared.u32 [r6826+100], r3832; +st.shared.u32 [r6826+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r6827]; +ld.shared.u32 r4512, [r6827+108]; +ld.shared.u32 r5120, [r6827+216]; +ld.shared.u32 r3992, [r6827+324]; +ld.shared.u32 r4600, 
[r6827+432]; +ld.shared.u32 r5208, [r6827+540]; +ld.shared.u32 r4080, [r6827+648]; +ld.shared.u32 r4688, [r6827+756]; +ld.shared.u32 r5296, [r6827+864]; +ld.shared.u32 r3901, [r6827+972]; +ld.shared.u32 r4509, [r6827+1080]; +ld.shared.u32 r5117, [r6827+1188]; +ld.shared.u32 r3989, [r6827+1296]; +ld.shared.u32 r4597, [r6827+1404]; +ld.shared.u32 r5205, [r6827+1512]; +ld.shared.u32 r4077, [r6827+1620]; +ld.shared.u32 r4685, [r6827+1728]; +ld.shared.u32 r5293, [r6827+1836]; +ld.shared.u32 r3902, [r6827+1944]; +ld.shared.u32 r4510, [r6827+2052]; +ld.shared.u32 r5118, [r6827+2160]; +ld.shared.u32 r3990, [r6827+2268]; +ld.shared.u32 r4598, [r6827+2376]; +ld.shared.u32 r5206, [r6827+2484]; +ld.shared.u32 r4078, [r6827+2592]; +ld.shared.u32 r4686, [r6827+2700]; +ld.shared.u32 r5294, [r6827+2808]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 r3897, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 r3903, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 r3921, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 r3939, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 r3957, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, 
r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 r3975, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 r3985, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 r3991, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 r4009, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 r4027, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 r4045, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 r4063, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 r4073, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 r4079, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; 
+} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 r4097, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 r4115, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 r4133, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 r4151, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4161, {low, high}; +} +{ +mul.f16x2 r4170, r4009, r4154; +} +{ +mul.f16x2 r4173, r4045, r4155; +} +{ +sub.f16x2 r4176, r4170, r4173; +} +{ +mul.f16x2 r4179, r4009, r4155; +} +{ +fma.rn.f16x2 r4182, r4045, r4154, r4179; +} +{ +mul.f16x2 r4186, r4097, r4156; +} +{ +mul.f16x2 r4189, r4133, r4157; +} +{ +sub.f16x2 r4192, r4186, r4189; +} +{ +mul.f16x2 r4195, r4097, r4157; +} +{ +fma.rn.f16x2 r4198, r4133, r4156, r4195; +} +{ +mul.f16x2 r4202, r4027, r4156; +} +{ +mul.f16x2 r4205, r4063, r4157; +} +{ +sub.f16x2 r4208, r4202, r4205; +} +{ +mul.f16x2 r4211, r4027, r4157; +} +{ 
+fma.rn.f16x2 r4214, r4063, r4156, r4211; +} +{ +mul.f16x2 r4218, r4115, r4160; +} +{ +mul.f16x2 r4221, r4151, r4161; +} +{ +sub.f16x2 r4224, r4218, r4221; +} +{ +mul.f16x2 r4227, r4115, r4161; +} +{ +fma.rn.f16x2 r4230, r4151, r4160, r4227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4235, {low, high}; +} +{ +neg.f16x2 r4236, r4235; +} +{ +add.f16x2 r4238, r3985, r4073; +} +{ +add.f16x2 r4241, r3897, r4238; +} +{ +add.f16x2 r4244, r3991, r4079; +} +{ +add.f16x2 r4247, r3903, r4244; +} +{ +add.f16x2 r4250, r3985, r4073; +} +{ +mul.f16x2 r4253, r4250, r4234; +} +{ +add.f16x2 r4256, r3897, r4253; +} +{ +sub.f16x2 r4259, r3991, r4079; +} +{ +mul.f16x2 r4262, r4259, r4236; +} +{ +add.f16x2 r4265, r4256, r4262; +} +{ +add.f16x2 r4268, r3985, r4073; +} +{ +mul.f16x2 r4271, r4268, r4234; +} +{ +add.f16x2 r4274, r3897, r4271; +} +{ +sub.f16x2 r4277, r3991, r4079; +} +{ +mul.f16x2 r4280, r4277, r4236; +} +{ +sub.f16x2 r4283, r4274, r4280; +} +{ +add.f16x2 r4286, r3991, r4079; +} +{ +mul.f16x2 r4289, r4286, r4234; +} +{ +add.f16x2 r4292, r3903, r4289; +} +{ +sub.f16x2 r4295, r3985, r4073; +} +{ +mul.f16x2 r4298, r4295, r4236; +} +{ +sub.f16x2 r4301, r4292, r4298; +} +{ +add.f16x2 r4304, r3991, r4079; +} +{ +mul.f16x2 r4307, r4304, r4234; +} +{ +add.f16x2 r4310, r3903, r4307; +} +{ +sub.f16x2 r4313, r3985, r4073; +} +{ +mul.f16x2 r4316, r4313, r4236; +} +{ +add.f16x2 r4319, r4310, r4316; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4322, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4323, {low, high}; +} +{ +neg.f16x2 r4324, r4323; +} +{ +add.f16x2 r4326, r4176, r4192; +} +{ +add.f16x2 r4329, r3921, r4326; +} +{ +add.f16x2 r4332, r4182, r4198; +} +{ +add.f16x2 r4335, r3957, r4332; +} +{ +add.f16x2 r4338, r4176, 
r4192; +} +{ +mul.f16x2 r4341, r4338, r4322; +} +{ +add.f16x2 r4344, r3921, r4341; +} +{ +sub.f16x2 r4347, r4182, r4198; +} +{ +mul.f16x2 r4350, r4347, r4324; +} +{ +add.f16x2 r4353, r4344, r4350; +} +{ +add.f16x2 r4356, r4176, r4192; +} +{ +mul.f16x2 r4359, r4356, r4322; +} +{ +add.f16x2 r4362, r3921, r4359; +} +{ +sub.f16x2 r4365, r4182, r4198; +} +{ +mul.f16x2 r4368, r4365, r4324; +} +{ +sub.f16x2 r4371, r4362, r4368; +} +{ +add.f16x2 r4374, r4182, r4198; +} +{ +mul.f16x2 r4377, r4374, r4322; +} +{ +add.f16x2 r4380, r3957, r4377; +} +{ +sub.f16x2 r4383, r4176, r4192; +} +{ +mul.f16x2 r4386, r4383, r4324; +} +{ +sub.f16x2 r4389, r4380, r4386; +} +{ +add.f16x2 r4392, r4182, r4198; +} +{ +mul.f16x2 r4395, r4392, r4322; +} +{ +add.f16x2 r4398, r3957, r4395; +} +{ +sub.f16x2 r4401, r4176, r4192; +} +{ +mul.f16x2 r4404, r4401, r4324; +} +{ +add.f16x2 r4407, r4398, r4404; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4410, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4411, {low, high}; +} +{ +neg.f16x2 r4412, r4411; +} +{ +add.f16x2 r4414, r4208, r4224; +} +{ +add.f16x2 r4417, r3939, r4414; +} +{ +add.f16x2 r4420, r4214, r4230; +} +{ +add.f16x2 r4423, r3975, r4420; +} +{ +add.f16x2 r4426, r4208, r4224; +} +{ +mul.f16x2 r4429, r4426, r4410; +} +{ +add.f16x2 r4432, r3939, r4429; +} +{ +sub.f16x2 r4435, r4214, r4230; +} +{ +mul.f16x2 r4438, r4435, r4412; +} +{ +add.f16x2 r4441, r4432, r4438; +} +{ +add.f16x2 r4444, r4208, r4224; +} +{ +mul.f16x2 r4447, r4444, r4410; +} +{ +add.f16x2 r4450, r3939, r4447; +} +{ +sub.f16x2 r4453, r4214, r4230; +} +{ +mul.f16x2 r4456, r4453, r4412; +} +{ +sub.f16x2 r4459, r4450, r4456; +} +{ +add.f16x2 r4462, r4214, r4230; +} +{ +mul.f16x2 r4465, r4462, r4410; +} +{ +add.f16x2 r4468, r3975, r4465; +} +{ +sub.f16x2 r4471, r4208, r4224; +} +{ +mul.f16x2 r4474, r4471, r4412; +} +{ +sub.f16x2 r4477, r4468, r4474; +} +{ +add.f16x2 r4480, 
r4214, r4230; +} +{ +mul.f16x2 r4483, r4480, r4410; +} +{ +add.f16x2 r4486, r3975, r4483; +} +{ +sub.f16x2 r4489, r4208, r4224; +} +{ +mul.f16x2 r4492, r4489, r4412; +} +{ +add.f16x2 r4495, r4486, r4492; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4498, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4499, {low, high}; +} +{ +neg.f16x2 r4500, r4499; +} +{ +add.f16x2 r4502, r4503, r4504; +} +{ +add.f16x2 r4505, r4506, r4502; +} +{ +add.f16x2 r4508, r4509, r4510; +} +{ +add.f16x2 r4511, r4512, r4508; +} +{ +add.f16x2 r4514, r4503, r4504; +} +{ +mul.f16x2 r4517, r4514, r4498; +} +{ +add.f16x2 r4520, r4506, r4517; +} +{ +sub.f16x2 r4523, r4509, r4510; +} +{ +mul.f16x2 r4526, r4523, r4500; +} +{ +add.f16x2 r4529, r4520, r4526; +} +{ +add.f16x2 r4532, r4503, r4504; +} +{ +mul.f16x2 r4535, r4532, r4498; +} +{ +add.f16x2 r4538, r4506, r4535; +} +{ +sub.f16x2 r4541, r4509, r4510; +} +{ +mul.f16x2 r4544, r4541, r4500; +} +{ +sub.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4509, r4510; +} +{ +mul.f16x2 r4553, r4550, r4498; +} +{ +add.f16x2 r4556, r4512, r4553; +} +{ +sub.f16x2 r4559, r4503, r4504; +} +{ +mul.f16x2 r4562, r4559, r4500; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4509, r4510; +} +{ +mul.f16x2 r4571, r4568, r4498; +} +{ +add.f16x2 r4574, r4512, r4571; +} +{ +sub.f16x2 r4577, r4503, r4504; +} +{ +mul.f16x2 r4580, r4577, r4500; +} +{ +add.f16x2 r4583, r4574, r4580; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4586, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4587, {low, high}; +} +{ +neg.f16x2 r4588, r4587; +} +{ +add.f16x2 r4590, r4591, r4592; +} +{ +add.f16x2 r4593, r4594, r4590; +} +{ +add.f16x2 r4596, r4597, r4598; +} +{ +add.f16x2 r4599, r4600, r4596; +} +{ +add.f16x2 r4602, r4591, r4592; +} +{ +mul.f16x2 r4605, r4602, 
r4586; +} +{ +add.f16x2 r4608, r4594, r4605; +} +{ +sub.f16x2 r4611, r4597, r4598; +} +{ +mul.f16x2 r4614, r4611, r4588; +} +{ +add.f16x2 r4617, r4608, r4614; +} +{ +add.f16x2 r4620, r4591, r4592; +} +{ +mul.f16x2 r4623, r4620, r4586; +} +{ +add.f16x2 r4626, r4594, r4623; +} +{ +sub.f16x2 r4629, r4597, r4598; +} +{ +mul.f16x2 r4632, r4629, r4588; +} +{ +sub.f16x2 r4635, r4626, r4632; +} +{ +add.f16x2 r4638, r4597, r4598; +} +{ +mul.f16x2 r4641, r4638, r4586; +} +{ +add.f16x2 r4644, r4600, r4641; +} +{ +sub.f16x2 r4647, r4591, r4592; +} +{ +mul.f16x2 r4650, r4647, r4588; +} +{ +sub.f16x2 r4653, r4644, r4650; +} +{ +add.f16x2 r4656, r4597, r4598; +} +{ +mul.f16x2 r4659, r4656, r4586; +} +{ +add.f16x2 r4662, r4600, r4659; +} +{ +sub.f16x2 r4665, r4591, r4592; +} +{ +mul.f16x2 r4668, r4665, r4588; +} +{ +add.f16x2 r4671, r4662, r4668; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4674, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4675, {low, high}; +} +{ +neg.f16x2 r4676, r4675; +} +{ +add.f16x2 r4678, r4679, r4680; +} +{ +add.f16x2 r4681, r4682, r4678; +} +{ +add.f16x2 r4684, r4685, r4686; +} +{ +add.f16x2 r4687, r4688, r4684; +} +{ +add.f16x2 r4690, r4679, r4680; +} +{ +mul.f16x2 r4693, r4690, r4674; +} +{ +add.f16x2 r4696, r4682, r4693; +} +{ +sub.f16x2 r4699, r4685, r4686; +} +{ +mul.f16x2 r4702, r4699, r4676; +} +{ +add.f16x2 r4705, r4696, r4702; +} +{ +add.f16x2 r4708, r4679, r4680; +} +{ +mul.f16x2 r4711, r4708, r4674; +} +{ +add.f16x2 r4714, r4682, r4711; +} +{ +sub.f16x2 r4717, r4685, r4686; +} +{ +mul.f16x2 r4720, r4717, r4676; +} +{ +sub.f16x2 r4723, r4714, r4720; +} +{ +add.f16x2 r4726, r4685, r4686; +} +{ +mul.f16x2 r4729, r4726, r4674; +} +{ +add.f16x2 r4732, r4688, r4729; +} +{ +sub.f16x2 r4735, r4679, r4680; +} +{ +mul.f16x2 r4738, r4735, r4676; +} +{ +sub.f16x2 r4741, r4732, r4738; +} +{ +add.f16x2 r4744, r4685, r4686; +} +{ +mul.f16x2 r4747, 
r4744, r4674; +} +{ +add.f16x2 r4750, r4688, r4747; +} +{ +sub.f16x2 r4753, r4679, r4680; +} +{ +mul.f16x2 r4756, r4753, r4676; +} +{ +add.f16x2 r4759, r4750, r4756; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4762, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4763, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4764, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4765, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4768, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4769, {low, high}; +} +{ +mul.f16x2 r4778, r4617, r4762; +} +{ +mul.f16x2 r4781, r4653, r4763; +} +{ +sub.f16x2 r4784, r4778, r4781; +} +{ +mul.f16x2 r4787, r4617, r4763; +} +{ +fma.rn.f16x2 r4790, r4653, r4762, r4787; +} +{ +mul.f16x2 r4794, r4705, r4764; +} +{ +mul.f16x2 r4797, r4741, r4765; +} +{ +sub.f16x2 r4800, r4794, r4797; +} +{ +mul.f16x2 r4803, r4705, r4765; +} +{ +fma.rn.f16x2 r4806, r4741, r4764, r4803; +} +{ +mul.f16x2 r4810, r4635, r4764; +} +{ +mul.f16x2 r4813, r4671, r4765; +} +{ +sub.f16x2 r4816, r4810, r4813; +} +{ +mul.f16x2 r4819, r4635, r4765; +} +{ +fma.rn.f16x2 r4822, r4671, r4764, r4819; +} +{ +mul.f16x2 r4826, r4723, r4768; +} +{ +mul.f16x2 r4829, r4759, r4769; +} +{ +sub.f16x2 r4832, r4826, r4829; +} +{ +mul.f16x2 r4835, r4723, r4769; +} +{ +fma.rn.f16x2 r4838, r4759, r4768, r4835; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4842, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4843, {low, high}; +} +{ +neg.f16x2 r4844, r4843; +} +{ +add.f16x2 r4846, r4593, r4681; +} +{ +add.f16x2 r4849, r4505, r4846; +} +{ +add.f16x2 r4852, r4599, 
r4687; +} +{ +add.f16x2 r4855, r4511, r4852; +} +{ +add.f16x2 r4858, r4593, r4681; +} +{ +mul.f16x2 r4861, r4858, r4842; +} +{ +add.f16x2 r4864, r4505, r4861; +} +{ +sub.f16x2 r4867, r4599, r4687; +} +{ +mul.f16x2 r4870, r4867, r4844; +} +{ +add.f16x2 r4873, r4864, r4870; +} +{ +add.f16x2 r4876, r4593, r4681; +} +{ +mul.f16x2 r4879, r4876, r4842; +} +{ +add.f16x2 r4882, r4505, r4879; +} +{ +sub.f16x2 r4885, r4599, r4687; +} +{ +mul.f16x2 r4888, r4885, r4844; +} +{ +sub.f16x2 r4891, r4882, r4888; +} +{ +add.f16x2 r4894, r4599, r4687; +} +{ +mul.f16x2 r4897, r4894, r4842; +} +{ +add.f16x2 r4900, r4511, r4897; +} +{ +sub.f16x2 r4903, r4593, r4681; +} +{ +mul.f16x2 r4906, r4903, r4844; +} +{ +sub.f16x2 r4909, r4900, r4906; +} +{ +add.f16x2 r4912, r4599, r4687; +} +{ +mul.f16x2 r4915, r4912, r4842; +} +{ +add.f16x2 r4918, r4511, r4915; +} +{ +sub.f16x2 r4921, r4593, r4681; +} +{ +mul.f16x2 r4924, r4921, r4844; +} +{ +add.f16x2 r4927, r4918, r4924; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4930, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4931, {low, high}; +} +{ +neg.f16x2 r4932, r4931; +} +{ +add.f16x2 r4934, r4784, r4800; +} +{ +add.f16x2 r4937, r4529, r4934; +} +{ +add.f16x2 r4940, r4790, r4806; +} +{ +add.f16x2 r4943, r4565, r4940; +} +{ +add.f16x2 r4946, r4784, r4800; +} +{ +mul.f16x2 r4949, r4946, r4930; +} +{ +add.f16x2 r4952, r4529, r4949; +} +{ +sub.f16x2 r4955, r4790, r4806; +} +{ +mul.f16x2 r4958, r4955, r4932; +} +{ +add.f16x2 r4961, r4952, r4958; +} +{ +add.f16x2 r4964, r4784, r4800; +} +{ +mul.f16x2 r4967, r4964, r4930; +} +{ +add.f16x2 r4970, r4529, r4967; +} +{ +sub.f16x2 r4973, r4790, r4806; +} +{ +mul.f16x2 r4976, r4973, r4932; +} +{ +sub.f16x2 r4979, r4970, r4976; +} +{ +add.f16x2 r4982, r4790, r4806; +} +{ +mul.f16x2 r4985, r4982, r4930; +} +{ +add.f16x2 r4988, r4565, r4985; +} +{ +sub.f16x2 r4991, r4784, r4800; +} +{ +mul.f16x2 r4994, 
r4991, r4932; +} +{ +sub.f16x2 r4997, r4988, r4994; +} +{ +add.f16x2 r5000, r4790, r4806; +} +{ +mul.f16x2 r5003, r5000, r4930; +} +{ +add.f16x2 r5006, r4565, r5003; +} +{ +sub.f16x2 r5009, r4784, r4800; +} +{ +mul.f16x2 r5012, r5009, r4932; +} +{ +add.f16x2 r5015, r5006, r5012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5019, {low, high}; +} +{ +neg.f16x2 r5020, r5019; +} +{ +add.f16x2 r5022, r4816, r4832; +} +{ +add.f16x2 r5025, r4547, r5022; +} +{ +add.f16x2 r5028, r4822, r4838; +} +{ +add.f16x2 r5031, r4583, r5028; +} +{ +add.f16x2 r5034, r4816, r4832; +} +{ +mul.f16x2 r5037, r5034, r5018; +} +{ +add.f16x2 r5040, r4547, r5037; +} +{ +sub.f16x2 r5043, r4822, r4838; +} +{ +mul.f16x2 r5046, r5043, r5020; +} +{ +add.f16x2 r5049, r5040, r5046; +} +{ +add.f16x2 r5052, r4816, r4832; +} +{ +mul.f16x2 r5055, r5052, r5018; +} +{ +add.f16x2 r5058, r4547, r5055; +} +{ +sub.f16x2 r5061, r4822, r4838; +} +{ +mul.f16x2 r5064, r5061, r5020; +} +{ +sub.f16x2 r5067, r5058, r5064; +} +{ +add.f16x2 r5070, r4822, r4838; +} +{ +mul.f16x2 r5073, r5070, r5018; +} +{ +add.f16x2 r5076, r4583, r5073; +} +{ +sub.f16x2 r5079, r4816, r4832; +} +{ +mul.f16x2 r5082, r5079, r5020; +} +{ +sub.f16x2 r5085, r5076, r5082; +} +{ +add.f16x2 r5088, r4822, r4838; +} +{ +mul.f16x2 r5091, r5088, r5018; +} +{ +add.f16x2 r5094, r4583, r5091; +} +{ +sub.f16x2 r5097, r4816, r4832; +} +{ +mul.f16x2 r5100, r5097, r5020; +} +{ +add.f16x2 r5103, r5094, r5100; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5107, {low, high}; +} +{ +neg.f16x2 r5108, r5107; +} +{ +add.f16x2 r5110, r5111, r5112; +} +{ +add.f16x2 r5113, r5114, r5110; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, 
r5116; +} +{ +add.f16x2 r5122, r5111, r5112; +} +{ +mul.f16x2 r5125, r5122, r5106; +} +{ +add.f16x2 r5128, r5114, r5125; +} +{ +sub.f16x2 r5131, r5117, r5118; +} +{ +mul.f16x2 r5134, r5131, r5108; +} +{ +add.f16x2 r5137, r5128, r5134; +} +{ +add.f16x2 r5140, r5111, r5112; +} +{ +mul.f16x2 r5143, r5140, r5106; +} +{ +add.f16x2 r5146, r5114, r5143; +} +{ +sub.f16x2 r5149, r5117, r5118; +} +{ +mul.f16x2 r5152, r5149, r5108; +} +{ +sub.f16x2 r5155, r5146, r5152; +} +{ +add.f16x2 r5158, r5117, r5118; +} +{ +mul.f16x2 r5161, r5158, r5106; +} +{ +add.f16x2 r5164, r5120, r5161; +} +{ +sub.f16x2 r5167, r5111, r5112; +} +{ +mul.f16x2 r5170, r5167, r5108; +} +{ +sub.f16x2 r5173, r5164, r5170; +} +{ +add.f16x2 r5176, r5117, r5118; +} +{ +mul.f16x2 r5179, r5176, r5106; +} +{ +add.f16x2 r5182, r5120, r5179; +} +{ +sub.f16x2 r5185, r5111, r5112; +} +{ +mul.f16x2 r5188, r5185, r5108; +} +{ +add.f16x2 r5191, r5182, r5188; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5194, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5195, {low, high}; +} +{ +neg.f16x2 r5196, r5195; +} +{ +add.f16x2 r5198, r5199, r5200; +} +{ +add.f16x2 r5201, r5202, r5198; +} +{ +add.f16x2 r5204, r5205, r5206; +} +{ +add.f16x2 r5207, r5208, r5204; +} +{ +add.f16x2 r5210, r5199, r5200; +} +{ +mul.f16x2 r5213, r5210, r5194; +} +{ +add.f16x2 r5216, r5202, r5213; +} +{ +sub.f16x2 r5219, r5205, r5206; +} +{ +mul.f16x2 r5222, r5219, r5196; +} +{ +add.f16x2 r5225, r5216, r5222; +} +{ +add.f16x2 r5228, r5199, r5200; +} +{ +mul.f16x2 r5231, r5228, r5194; +} +{ +add.f16x2 r5234, r5202, r5231; +} +{ +sub.f16x2 r5237, r5205, r5206; +} +{ +mul.f16x2 r5240, r5237, r5196; +} +{ +sub.f16x2 r5243, r5234, r5240; +} +{ +add.f16x2 r5246, r5205, r5206; +} +{ +mul.f16x2 r5249, r5246, r5194; +} +{ +add.f16x2 r5252, r5208, r5249; +} +{ +sub.f16x2 r5255, r5199, r5200; +} +{ +mul.f16x2 r5258, r5255, r5196; +} +{ +sub.f16x2 r5261, 
r5252, r5258; +} +{ +add.f16x2 r5264, r5205, r5206; +} +{ +mul.f16x2 r5267, r5264, r5194; +} +{ +add.f16x2 r5270, r5208, r5267; +} +{ +sub.f16x2 r5273, r5199, r5200; +} +{ +mul.f16x2 r5276, r5273, r5196; +} +{ +add.f16x2 r5279, r5270, r5276; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5282, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5283, {low, high}; +} +{ +neg.f16x2 r5284, r5283; +} +{ +add.f16x2 r5286, r5287, r5288; +} +{ +add.f16x2 r5289, r5290, r5286; +} +{ +add.f16x2 r5292, r5293, r5294; +} +{ +add.f16x2 r5295, r5296, r5292; +} +{ +add.f16x2 r5298, r5287, r5288; +} +{ +mul.f16x2 r5301, r5298, r5282; +} +{ +add.f16x2 r5304, r5290, r5301; +} +{ +sub.f16x2 r5307, r5293, r5294; +} +{ +mul.f16x2 r5310, r5307, r5284; +} +{ +add.f16x2 r5313, r5304, r5310; +} +{ +add.f16x2 r5316, r5287, r5288; +} +{ +mul.f16x2 r5319, r5316, r5282; +} +{ +add.f16x2 r5322, r5290, r5319; +} +{ +sub.f16x2 r5325, r5293, r5294; +} +{ +mul.f16x2 r5328, r5325, r5284; +} +{ +sub.f16x2 r5331, r5322, r5328; +} +{ +add.f16x2 r5334, r5293, r5294; +} +{ +mul.f16x2 r5337, r5334, r5282; +} +{ +add.f16x2 r5340, r5296, r5337; +} +{ +sub.f16x2 r5343, r5287, r5288; +} +{ +mul.f16x2 r5346, r5343, r5284; +} +{ +sub.f16x2 r5349, r5340, r5346; +} +{ +add.f16x2 r5352, r5293, r5294; +} +{ +mul.f16x2 r5355, r5352, r5282; +} +{ +add.f16x2 r5358, r5296, r5355; +} +{ +sub.f16x2 r5361, r5287, r5288; +} +{ +mul.f16x2 r5364, r5361, r5284; +} +{ +add.f16x2 r5367, r5358, r5364; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5370, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5371, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5372, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5373, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5376, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5377, {low, high}; +} +{ +mul.f16x2 r5386, r5225, r5370; +} +{ +mul.f16x2 r5389, r5261, r5371; +} +{ +sub.f16x2 r5392, r5386, r5389; +} +{ +mul.f16x2 r5395, r5225, r5371; +} +{ +fma.rn.f16x2 r5398, r5261, r5370, r5395; +} +{ +mul.f16x2 r5402, r5313, r5372; +} +{ +mul.f16x2 r5405, r5349, r5373; +} +{ +sub.f16x2 r5408, r5402, r5405; +} +{ +mul.f16x2 r5411, r5313, r5373; +} +{ +fma.rn.f16x2 r5414, r5349, r5372, r5411; +} +{ +mul.f16x2 r5418, r5243, r5372; +} +{ +mul.f16x2 r5421, r5279, r5373; +} +{ +sub.f16x2 r5424, r5418, r5421; +} +{ +mul.f16x2 r5427, r5243, r5373; +} +{ +fma.rn.f16x2 r5430, r5279, r5372, r5427; +} +{ +mul.f16x2 r5434, r5331, r5376; +} +{ +mul.f16x2 r5437, r5367, r5377; +} +{ +sub.f16x2 r5440, r5434, r5437; +} +{ +mul.f16x2 r5443, r5331, r5377; +} +{ +fma.rn.f16x2 r5446, r5367, r5376, r5443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5451, {low, high}; +} +{ +neg.f16x2 r5452, r5451; +} +{ +add.f16x2 r5454, r5201, r5289; +} +{ +add.f16x2 r5457, r5113, r5454; +} +{ +add.f16x2 r5460, r5207, r5295; +} +{ +add.f16x2 r5463, r5119, r5460; +} +{ +add.f16x2 r5466, r5201, r5289; +} +{ +mul.f16x2 r5469, r5466, r5450; +} +{ +add.f16x2 r5472, r5113, r5469; +} +{ +sub.f16x2 r5475, r5207, r5295; +} +{ +mul.f16x2 r5478, r5475, r5452; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5201, r5289; +} +{ +mul.f16x2 r5487, r5484, r5450; +} +{ +add.f16x2 r5490, r5113, r5487; +} +{ +sub.f16x2 r5493, r5207, r5295; +} +{ +mul.f16x2 r5496, r5493, r5452; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5207, r5295; +} +{ +mul.f16x2 r5505, r5502, r5450; +} +{ +add.f16x2 r5508, 
r5119, r5505; +} +{ +sub.f16x2 r5511, r5201, r5289; +} +{ +mul.f16x2 r5514, r5511, r5452; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5207, r5295; +} +{ +mul.f16x2 r5523, r5520, r5450; +} +{ +add.f16x2 r5526, r5119, r5523; +} +{ +sub.f16x2 r5529, r5201, r5289; +} +{ +mul.f16x2 r5532, r5529, r5452; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5539, {low, high}; +} +{ +neg.f16x2 r5540, r5539; +} +{ +add.f16x2 r5542, r5392, r5408; +} +{ +add.f16x2 r5545, r5137, r5542; +} +{ +add.f16x2 r5548, r5398, r5414; +} +{ +add.f16x2 r5551, r5173, r5548; +} +{ +add.f16x2 r5554, r5392, r5408; +} +{ +mul.f16x2 r5557, r5554, r5538; +} +{ +add.f16x2 r5560, r5137, r5557; +} +{ +sub.f16x2 r5563, r5398, r5414; +} +{ +mul.f16x2 r5566, r5563, r5540; +} +{ +add.f16x2 r5569, r5560, r5566; +} +{ +add.f16x2 r5572, r5392, r5408; +} +{ +mul.f16x2 r5575, r5572, r5538; +} +{ +add.f16x2 r5578, r5137, r5575; +} +{ +sub.f16x2 r5581, r5398, r5414; +} +{ +mul.f16x2 r5584, r5581, r5540; +} +{ +sub.f16x2 r5587, r5578, r5584; +} +{ +add.f16x2 r5590, r5398, r5414; +} +{ +mul.f16x2 r5593, r5590, r5538; +} +{ +add.f16x2 r5596, r5173, r5593; +} +{ +sub.f16x2 r5599, r5392, r5408; +} +{ +mul.f16x2 r5602, r5599, r5540; +} +{ +sub.f16x2 r5605, r5596, r5602; +} +{ +add.f16x2 r5608, r5398, r5414; +} +{ +mul.f16x2 r5611, r5608, r5538; +} +{ +add.f16x2 r5614, r5173, r5611; +} +{ +sub.f16x2 r5617, r5392, r5408; +} +{ +mul.f16x2 r5620, r5617, r5540; +} +{ +add.f16x2 r5623, r5614, r5620; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5627, {low, high}; +} +{ +neg.f16x2 r5628, r5627; +} +{ +add.f16x2 r5630, r5424, r5440; +} +{ +add.f16x2 r5633, r5155, 
r5630; +} +{ +add.f16x2 r5636, r5430, r5446; +} +{ +add.f16x2 r5639, r5191, r5636; +} +{ +add.f16x2 r5642, r5424, r5440; +} +{ +mul.f16x2 r5645, r5642, r5626; +} +{ +add.f16x2 r5648, r5155, r5645; +} +{ +sub.f16x2 r5651, r5430, r5446; +} +{ +mul.f16x2 r5654, r5651, r5628; +} +{ +add.f16x2 r5657, r5648, r5654; +} +{ +add.f16x2 r5660, r5424, r5440; +} +{ +mul.f16x2 r5663, r5660, r5626; +} +{ +add.f16x2 r5666, r5155, r5663; +} +{ +sub.f16x2 r5669, r5430, r5446; +} +{ +mul.f16x2 r5672, r5669, r5628; +} +{ +sub.f16x2 r5675, r5666, r5672; +} +{ +add.f16x2 r5678, r5430, r5446; +} +{ +mul.f16x2 r5681, r5678, r5626; +} +{ +add.f16x2 r5684, r5191, r5681; +} +{ +sub.f16x2 r5687, r5424, r5440; +} +{ +mul.f16x2 r5690, r5687, r5628; +} +{ +sub.f16x2 r5693, r5684, r5690; +} +{ +add.f16x2 r5696, r5430, r5446; +} +{ +mul.f16x2 r5699, r5696, r5626; +} +{ +add.f16x2 r5702, r5191, r5699; +} +{ +sub.f16x2 r5705, r5424, r5440; +} +{ +mul.f16x2 r5708, r5705, r5628; +} +{ +add.f16x2 r5711, r5702, r5708; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5714, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5715, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5716, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5719, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5720, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5722, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5723, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5724, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5725, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5727, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5728, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5729, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5732, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5733, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5736, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5740, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5745, {low, high}; +} +{ +mul.f16x2 r5766, r4937, r5714; +} +{ +mul.f16x2 r5769, r4943, r5715; +} +{ +sub.f16x2 r5772, r5766, r5769; +} +{ +mul.f16x2 r5775, r4937, r5715; +} +{ +fma.rn.f16x2 r5778, r4943, r5714, r5775; +} +{ +mul.f16x2 r5782, r5545, r5716; +} +{ +mul.f16x2 r5785, r5551, r5717; +} +{ +sub.f16x2 r5788, r5782, 
r5785; +} +{ +mul.f16x2 r5791, r5545, r5717; +} +{ +fma.rn.f16x2 r5794, r5551, r5716, r5791; +} +{ +mul.f16x2 r5798, r5025, r5716; +} +{ +mul.f16x2 r5801, r5031, r5717; +} +{ +sub.f16x2 r5804, r5798, r5801; +} +{ +mul.f16x2 r5807, r5025, r5717; +} +{ +fma.rn.f16x2 r5810, r5031, r5716, r5807; +} +{ +mul.f16x2 r5814, r5633, r5720; +} +{ +mul.f16x2 r5817, r5639, r5721; +} +{ +sub.f16x2 r5820, r5814, r5817; +} +{ +mul.f16x2 r5823, r5633, r5721; +} +{ +fma.rn.f16x2 r5826, r5639, r5720, r5823; +} +{ +mul.f16x2 r5830, r4873, r5718; +} +{ +mul.f16x2 r5833, r4909, r5719; +} +{ +sub.f16x2 r5836, r5830, r5833; +} +{ +mul.f16x2 r5839, r4873, r5719; +} +{ +fma.rn.f16x2 r5842, r4909, r5718, r5839; +} +{ +mul.f16x2 r5846, r5481, r5724; +} +{ +mul.f16x2 r5849, r5517, r5725; +} +{ +sub.f16x2 r5852, r5846, r5849; +} +{ +mul.f16x2 r5855, r5481, r5725; +} +{ +fma.rn.f16x2 r5858, r5517, r5724, r5855; +} +{ +mul.f16x2 r5862, r4961, r5720; +} +{ +mul.f16x2 r5865, r4997, r5721; +} +{ +sub.f16x2 r5868, r5862, r5865; +} +{ +mul.f16x2 r5871, r4961, r5721; +} +{ +fma.rn.f16x2 r5874, r4997, r5720, r5871; +} +{ +mul.f16x2 r5878, r5569, r5728; +} +{ +mul.f16x2 r5881, r5605, r5729; +} +{ +sub.f16x2 r5884, r5878, r5881; +} +{ +mul.f16x2 r5887, r5569, r5729; +} +{ +fma.rn.f16x2 r5890, r5605, r5728, r5887; +} +{ +mul.f16x2 r5894, r5049, r5722; +} +{ +mul.f16x2 r5897, r5085, r5723; +} +{ +sub.f16x2 r5900, r5894, r5897; +} +{ +mul.f16x2 r5903, r5049, r5723; +} +{ +fma.rn.f16x2 r5906, r5085, r5722, r5903; +} +{ +mul.f16x2 r5910, r5657, r5732; +} +{ +mul.f16x2 r5913, r5693, r5733; +} +{ +sub.f16x2 r5916, r5910, r5913; +} +{ +mul.f16x2 r5919, r5657, r5733; +} +{ +fma.rn.f16x2 r5922, r5693, r5732, r5919; +} +{ +mul.f16x2 r5926, r4891, r5724; +} +{ +mul.f16x2 r5929, r4927, r5725; +} +{ +sub.f16x2 r5932, r5926, r5929; +} +{ +mul.f16x2 r5935, r4891, r5725; +} +{ +fma.rn.f16x2 r5938, r4927, r5724, r5935; +} +{ +mul.f16x2 r5942, r5499, r5736; +} +{ +mul.f16x2 r5945, r5535, r5737; +} +{ +sub.f16x2 r5948, r5942, 
r5945; +} +{ +mul.f16x2 r5951, r5499, r5737; +} +{ +fma.rn.f16x2 r5954, r5535, r5736, r5951; +} +{ +mul.f16x2 r5958, r4979, r5726; +} +{ +mul.f16x2 r5961, r5015, r5727; +} +{ +sub.f16x2 r5964, r5958, r5961; +} +{ +mul.f16x2 r5967, r4979, r5727; +} +{ +fma.rn.f16x2 r5970, r5015, r5726, r5967; +} +{ +mul.f16x2 r5974, r5587, r5740; +} +{ +mul.f16x2 r5977, r5623, r5741; +} +{ +sub.f16x2 r5980, r5974, r5977; +} +{ +mul.f16x2 r5983, r5587, r5741; +} +{ +fma.rn.f16x2 r5986, r5623, r5740, r5983; +} +{ +mul.f16x2 r5990, r5067, r5728; +} +{ +mul.f16x2 r5993, r5103, r5729; +} +{ +sub.f16x2 r5996, r5990, r5993; +} +{ +mul.f16x2 r5999, r5067, r5729; +} +{ +fma.rn.f16x2 r6002, r5103, r5728, r5999; +} +{ +mul.f16x2 r6006, r5675, r5744; +} +{ +mul.f16x2 r6009, r5711, r5745; +} +{ +sub.f16x2 r6012, r6006, r6009; +} +{ +mul.f16x2 r6015, r5675, r5745; +} +{ +fma.rn.f16x2 r6018, r5711, r5744, r6015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6023, {low, high}; +} +{ +neg.f16x2 r6024, r6023; +} +{ +add.f16x2 r6026, r4849, r5457; +} +{ +add.f16x2 %0, r4241, r6026; +} +{ +add.f16x2 r6032, r4855, r5463; +} +{ +add.f16x2 %1, r4247, r6032; +} +{ +add.f16x2 r6038, r4849, r5457; +} +{ +mul.f16x2 r6041, r6038, r6022; +} +{ +add.f16x2 r6044, r4241, r6041; +} +{ +sub.f16x2 r6047, r4855, r5463; +} +{ +mul.f16x2 r6050, r6047, r6024; +} +{ +add.f16x2 %18, r6044, r6050; +} +{ +add.f16x2 r6056, r4849, r5457; +} +{ +mul.f16x2 r6059, r6056, r6022; +} +{ +add.f16x2 r6062, r4241, r6059; +} +{ +sub.f16x2 r6065, r4855, r5463; +} +{ +mul.f16x2 r6068, r6065, r6024; +} +{ +sub.f16x2 %36, r6062, r6068; +} +{ +add.f16x2 r6074, r4855, r5463; +} +{ +mul.f16x2 r6077, r6074, r6022; +} +{ +add.f16x2 r6080, r4247, r6077; +} +{ +sub.f16x2 r6083, r4849, r5457; +} +{ +mul.f16x2 r6086, r6083, r6024; +} +{ +sub.f16x2 %19, r6080, r6086; +} +{ +add.f16x2 r6092, 
r4855, r5463; +} +{ +mul.f16x2 r6095, r6092, r6022; +} +{ +add.f16x2 r6098, r4247, r6095; +} +{ +sub.f16x2 r6101, r4849, r5457; +} +{ +mul.f16x2 r6104, r6101, r6024; +} +{ +add.f16x2 %37, r6098, r6104; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6110, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6111, {low, high}; +} +{ +neg.f16x2 r6112, r6111; +} +{ +add.f16x2 r6114, r5772, r5788; +} +{ +add.f16x2 %2, r4329, r6114; +} +{ +add.f16x2 r6120, r5778, r5794; +} +{ +add.f16x2 %3, r4335, r6120; +} +{ +add.f16x2 r6126, r5772, r5788; +} +{ +mul.f16x2 r6129, r6126, r6110; +} +{ +add.f16x2 r6132, r4329, r6129; +} +{ +sub.f16x2 r6135, r5778, r5794; +} +{ +mul.f16x2 r6138, r6135, r6112; +} +{ +add.f16x2 %20, r6132, r6138; +} +{ +add.f16x2 r6144, r5772, r5788; +} +{ +mul.f16x2 r6147, r6144, r6110; +} +{ +add.f16x2 r6150, r4329, r6147; +} +{ +sub.f16x2 r6153, r5778, r5794; +} +{ +mul.f16x2 r6156, r6153, r6112; +} +{ +sub.f16x2 %38, r6150, r6156; +} +{ +add.f16x2 r6162, r5778, r5794; +} +{ +mul.f16x2 r6165, r6162, r6110; +} +{ +add.f16x2 r6168, r4335, r6165; +} +{ +sub.f16x2 r6171, r5772, r5788; +} +{ +mul.f16x2 r6174, r6171, r6112; +} +{ +sub.f16x2 %21, r6168, r6174; +} +{ +add.f16x2 r6180, r5778, r5794; +} +{ +mul.f16x2 r6183, r6180, r6110; +} +{ +add.f16x2 r6186, r4335, r6183; +} +{ +sub.f16x2 r6189, r5772, r5788; +} +{ +mul.f16x2 r6192, r6189, r6112; +} +{ +add.f16x2 %39, r6186, r6192; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6198, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6199, {low, high}; +} +{ +neg.f16x2 r6200, r6199; +} +{ +add.f16x2 r6202, r5804, r5820; +} +{ +add.f16x2 %4, r4417, r6202; +} +{ +add.f16x2 r6208, r5810, r5826; +} +{ +add.f16x2 %5, r4423, r6208; +} +{ +add.f16x2 r6214, r5804, r5820; +} +{ +mul.f16x2 r6217, r6214, r6198; +} +{ 
+add.f16x2 r6220, r4417, r6217; +} +{ +sub.f16x2 r6223, r5810, r5826; +} +{ +mul.f16x2 r6226, r6223, r6200; +} +{ +add.f16x2 %22, r6220, r6226; +} +{ +add.f16x2 r6232, r5804, r5820; +} +{ +mul.f16x2 r6235, r6232, r6198; +} +{ +add.f16x2 r6238, r4417, r6235; +} +{ +sub.f16x2 r6241, r5810, r5826; +} +{ +mul.f16x2 r6244, r6241, r6200; +} +{ +sub.f16x2 %40, r6238, r6244; +} +{ +add.f16x2 r6250, r5810, r5826; +} +{ +mul.f16x2 r6253, r6250, r6198; +} +{ +add.f16x2 r6256, r4423, r6253; +} +{ +sub.f16x2 r6259, r5804, r5820; +} +{ +mul.f16x2 r6262, r6259, r6200; +} +{ +sub.f16x2 %23, r6256, r6262; +} +{ +add.f16x2 r6268, r5810, r5826; +} +{ +mul.f16x2 r6271, r6268, r6198; +} +{ +add.f16x2 r6274, r4423, r6271; +} +{ +sub.f16x2 r6277, r5804, r5820; +} +{ +mul.f16x2 r6280, r6277, r6200; +} +{ +add.f16x2 %41, r6274, r6280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6287, {low, high}; +} +{ +neg.f16x2 r6288, r6287; +} +{ +add.f16x2 r6290, r5836, r5852; +} +{ +add.f16x2 %6, r4265, r6290; +} +{ +add.f16x2 r6296, r5842, r5858; +} +{ +add.f16x2 %7, r4301, r6296; +} +{ +add.f16x2 r6302, r5836, r5852; +} +{ +mul.f16x2 r6305, r6302, r6286; +} +{ +add.f16x2 r6308, r4265, r6305; +} +{ +sub.f16x2 r6311, r5842, r5858; +} +{ +mul.f16x2 r6314, r6311, r6288; +} +{ +add.f16x2 %24, r6308, r6314; +} +{ +add.f16x2 r6320, r5836, r5852; +} +{ +mul.f16x2 r6323, r6320, r6286; +} +{ +add.f16x2 r6326, r4265, r6323; +} +{ +sub.f16x2 r6329, r5842, r5858; +} +{ +mul.f16x2 r6332, r6329, r6288; +} +{ +sub.f16x2 %42, r6326, r6332; +} +{ +add.f16x2 r6338, r5842, r5858; +} +{ +mul.f16x2 r6341, r6338, r6286; +} +{ +add.f16x2 r6344, r4301, r6341; +} +{ +sub.f16x2 r6347, r5836, r5852; +} +{ +mul.f16x2 r6350, r6347, r6288; +} +{ +sub.f16x2 %25, r6344, r6350; +} +{ +add.f16x2 r6356, r5842, r5858; +} +{ +mul.f16x2 r6359, r6356, r6286; +} +{ +add.f16x2 
r6362, r4301, r6359; +} +{ +sub.f16x2 r6365, r5836, r5852; +} +{ +mul.f16x2 r6368, r6365, r6288; +} +{ +add.f16x2 %43, r6362, r6368; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6374, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6375, {low, high}; +} +{ +neg.f16x2 r6376, r6375; +} +{ +add.f16x2 r6378, r5868, r5884; +} +{ +add.f16x2 %8, r4353, r6378; +} +{ +add.f16x2 r6384, r5874, r5890; +} +{ +add.f16x2 %9, r4389, r6384; +} +{ +add.f16x2 r6390, r5868, r5884; +} +{ +mul.f16x2 r6393, r6390, r6374; +} +{ +add.f16x2 r6396, r4353, r6393; +} +{ +sub.f16x2 r6399, r5874, r5890; +} +{ +mul.f16x2 r6402, r6399, r6376; +} +{ +add.f16x2 %26, r6396, r6402; +} +{ +add.f16x2 r6408, r5868, r5884; +} +{ +mul.f16x2 r6411, r6408, r6374; +} +{ +add.f16x2 r6414, r4353, r6411; +} +{ +sub.f16x2 r6417, r5874, r5890; +} +{ +mul.f16x2 r6420, r6417, r6376; +} +{ +sub.f16x2 %44, r6414, r6420; +} +{ +add.f16x2 r6426, r5874, r5890; +} +{ +mul.f16x2 r6429, r6426, r6374; +} +{ +add.f16x2 r6432, r4389, r6429; +} +{ +sub.f16x2 r6435, r5868, r5884; +} +{ +mul.f16x2 r6438, r6435, r6376; +} +{ +sub.f16x2 %27, r6432, r6438; +} +{ +add.f16x2 r6444, r5874, r5890; +} +{ +mul.f16x2 r6447, r6444, r6374; +} +{ +add.f16x2 r6450, r4389, r6447; +} +{ +sub.f16x2 r6453, r5868, r5884; +} +{ +mul.f16x2 r6456, r6453, r6376; +} +{ +add.f16x2 %45, r6450, r6456; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6462, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6463, {low, high}; +} +{ +neg.f16x2 r6464, r6463; +} +{ +add.f16x2 r6466, r5900, r5916; +} +{ +add.f16x2 %10, r4441, r6466; +} +{ +add.f16x2 r6472, r5906, r5922; +} +{ +add.f16x2 %11, r4477, r6472; +} +{ +add.f16x2 r6478, r5900, r5916; +} +{ +mul.f16x2 r6481, r6478, r6462; +} +{ +add.f16x2 r6484, r4441, r6481; +} +{ +sub.f16x2 r6487, r5906, r5922; +} +{ 
+mul.f16x2 r6490, r6487, r6464; +} +{ +add.f16x2 %28, r6484, r6490; +} +{ +add.f16x2 r6496, r5900, r5916; +} +{ +mul.f16x2 r6499, r6496, r6462; +} +{ +add.f16x2 r6502, r4441, r6499; +} +{ +sub.f16x2 r6505, r5906, r5922; +} +{ +mul.f16x2 r6508, r6505, r6464; +} +{ +sub.f16x2 %46, r6502, r6508; +} +{ +add.f16x2 r6514, r5906, r5922; +} +{ +mul.f16x2 r6517, r6514, r6462; +} +{ +add.f16x2 r6520, r4477, r6517; +} +{ +sub.f16x2 r6523, r5900, r5916; +} +{ +mul.f16x2 r6526, r6523, r6464; +} +{ +sub.f16x2 %29, r6520, r6526; +} +{ +add.f16x2 r6532, r5906, r5922; +} +{ +mul.f16x2 r6535, r6532, r6462; +} +{ +add.f16x2 r6538, r4477, r6535; +} +{ +sub.f16x2 r6541, r5900, r5916; +} +{ +mul.f16x2 r6544, r6541, r6464; +} +{ +add.f16x2 %47, r6538, r6544; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6550, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6551, {low, high}; +} +{ +neg.f16x2 r6552, r6551; +} +{ +add.f16x2 r6554, r5932, r5948; +} +{ +add.f16x2 %12, r4283, r6554; +} +{ +add.f16x2 r6560, r5938, r5954; +} +{ +add.f16x2 %13, r4319, r6560; +} +{ +add.f16x2 r6566, r5932, r5948; +} +{ +mul.f16x2 r6569, r6566, r6550; +} +{ +add.f16x2 r6572, r4283, r6569; +} +{ +sub.f16x2 r6575, r5938, r5954; +} +{ +mul.f16x2 r6578, r6575, r6552; +} +{ +add.f16x2 %30, r6572, r6578; +} +{ +add.f16x2 r6584, r5932, r5948; +} +{ +mul.f16x2 r6587, r6584, r6550; +} +{ +add.f16x2 r6590, r4283, r6587; +} +{ +sub.f16x2 r6593, r5938, r5954; +} +{ +mul.f16x2 r6596, r6593, r6552; +} +{ +sub.f16x2 %48, r6590, r6596; +} +{ +add.f16x2 r6602, r5938, r5954; +} +{ +mul.f16x2 r6605, r6602, r6550; +} +{ +add.f16x2 r6608, r4319, r6605; +} +{ +sub.f16x2 r6611, r5932, r5948; +} +{ +mul.f16x2 r6614, r6611, r6552; +} +{ +sub.f16x2 %31, r6608, r6614; +} +{ +add.f16x2 r6620, r5938, r5954; +} +{ +mul.f16x2 r6623, r6620, r6550; +} +{ +add.f16x2 r6626, r4319, r6623; +} +{ +sub.f16x2 r6629, r5932, r5948; +} +{ +mul.f16x2 
r6632, r6629, r6552; +} +{ +add.f16x2 %49, r6626, r6632; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6639, {low, high}; +} +{ +neg.f16x2 r6640, r6639; +} +{ +add.f16x2 r6642, r5964, r5980; +} +{ +add.f16x2 %14, r4371, r6642; +} +{ +add.f16x2 r6648, r5970, r5986; +} +{ +add.f16x2 %15, r4407, r6648; +} +{ +add.f16x2 r6654, r5964, r5980; +} +{ +mul.f16x2 r6657, r6654, r6638; +} +{ +add.f16x2 r6660, r4371, r6657; +} +{ +sub.f16x2 r6663, r5970, r5986; +} +{ +mul.f16x2 r6666, r6663, r6640; +} +{ +add.f16x2 %32, r6660, r6666; +} +{ +add.f16x2 r6672, r5964, r5980; +} +{ +mul.f16x2 r6675, r6672, r6638; +} +{ +add.f16x2 r6678, r4371, r6675; +} +{ +sub.f16x2 r6681, r5970, r5986; +} +{ +mul.f16x2 r6684, r6681, r6640; +} +{ +sub.f16x2 %50, r6678, r6684; +} +{ +add.f16x2 r6690, r5970, r5986; +} +{ +mul.f16x2 r6693, r6690, r6638; +} +{ +add.f16x2 r6696, r4407, r6693; +} +{ +sub.f16x2 r6699, r5964, r5980; +} +{ +mul.f16x2 r6702, r6699, r6640; +} +{ +sub.f16x2 %33, r6696, r6702; +} +{ +add.f16x2 r6708, r5970, r5986; +} +{ +mul.f16x2 r6711, r6708, r6638; +} +{ +add.f16x2 r6714, r4407, r6711; +} +{ +sub.f16x2 r6717, r5964, r5980; +} +{ +mul.f16x2 r6720, r6717, r6640; +} +{ +add.f16x2 %51, r6714, r6720; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6726, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6727, {low, high}; +} +{ +neg.f16x2 r6728, r6727; +} +{ +add.f16x2 r6730, r5996, r6012; +} +{ +add.f16x2 %16, r4459, r6730; +} +{ +add.f16x2 r6736, r6002, r6018; +} +{ +add.f16x2 %17, r4495, r6736; +} +{ +add.f16x2 r6742, r5996, r6012; +} +{ +mul.f16x2 r6745, r6742, r6726; +} +{ +add.f16x2 r6748, r4459, r6745; +} +{ +sub.f16x2 r6751, r6002, r6018; +} +{ +mul.f16x2 r6754, r6751, r6728; +} +{ +add.f16x2 %34, r6748, r6754; +} +{ 
+add.f16x2 r6760, r5996, r6012; +} +{ +mul.f16x2 r6763, r6760, r6726; +} +{ +add.f16x2 r6766, r4459, r6763; +} +{ +sub.f16x2 r6769, r6002, r6018; +} +{ +mul.f16x2 r6772, r6769, r6728; +} +{ +sub.f16x2 %52, r6766, r6772; +} +{ +add.f16x2 r6778, r6002, r6018; +} +{ +mul.f16x2 r6781, r6778, r6726; +} +{ +add.f16x2 r6784, r4495, r6781; +} +{ +sub.f16x2 r6787, r5996, r6012; +} +{ +mul.f16x2 r6790, r6787, r6728; +} +{ +sub.f16x2 %35, r6784, r6790; +} +{ +add.f16x2 r6796, r6002, r6018; +} +{ +mul.f16x2 r6799, r6796, r6726; +} +{ +add.f16x2 r6802, r4495, r6799; +} +{ +sub.f16x2 r6805, r5996, r6012; +} +{ +mul.f16x2 r6808, r6805, r6728; +} +{ +add.f16x2 %53, r6802, r6808; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), 
"=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), 
"r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<886, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<215>; +.reg .b32 r<2440>; +.reg .b64 rd<6>; +mov.u32 r2423, %tid.y; +mov.u32 r2424, %18; +mad.lo.s32 r2425, r2423, 5832, r2424; +mov.u32 r2426, %tid.x; +mov.f32 f206, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1, {low, high}; +} +mov.f32 f208, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ 
+add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, 
r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r265, {low, high}; +} +mov.f32 f168, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r266, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r267, {low, high}; +} +mov.f32 f172, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r268, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r271, {low, high}; +} +mov.f32 f180, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ 
+add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 
r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2426, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r2427, rd3; +mul.lo.s32 r2428, r2427, 81; +sub.s32 r2429, r2426, r2428; +cvt.rn.f32.u32 f209, r2429; +mul.f32 f210, f209, 0f3C0D3654; +cos.approx.f32 f57, f210; +sin.approx.f32 f211, f210; +neg.f32 f58, f211; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, 
r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r2430, 
r2427, 5832, r2425; +barrier.sync 0; +mad.lo.s32 r2431, r2429, 72, r2430; +st.shared.v2.f32 [r2431], {r352, r358}; +st.shared.v2.f32 [r2431+8], {r621, r628}; +st.shared.v2.f32 [r2431+16], {r658, r665}; +st.shared.v2.f32 [r2431+24], {r695, r702}; +st.shared.v2.f32 [r2431+32], {r732, r739}; +st.shared.v2.f32 [r2431+40], {r769, r776}; +st.shared.v2.f32 [r2431+48], {r806, r813}; +st.shared.v2.f32 [r2431+56], {r843, r850}; +st.shared.v2.f32 [r2431+64], {r880, r887}; +barrier.sync 0; +shl.b32 r2432, r2429, 6; +sub.s32 r2433, r2431, r2432; +ld.shared.u32 r916, [r2433]; +ld.shared.u32 r922, [r2433+4]; +ld.shared.u32 r1004, [r2433+648]; +ld.shared.u32 r1010, [r2433+652]; +ld.shared.u32 r1092, [r2433+1296]; +ld.shared.u32 r1098, [r2433+1300]; +ld.shared.u32 r913, [r2433+1944]; +ld.shared.u32 r919, [r2433+1948]; +ld.shared.u32 r1001, [r2433+2592]; +ld.shared.u32 r1007, [r2433+2596]; +ld.shared.u32 r1089, [r2433+3240]; +ld.shared.u32 r1095, [r2433+3244]; +ld.shared.u32 r914, [r2433+3888]; +ld.shared.u32 r920, [r2433+3892]; +ld.shared.u32 r1002, [r2433+4536]; +ld.shared.u32 r1008, [r2433+4540]; +ld.shared.u32 r1090, [r2433+5184]; +ld.shared.u32 r1096, [r2433+5188]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 
r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, 
r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 
r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; 
+cvt.rn.f16.f32 high, f208; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, 
r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2429, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2434, rd5; +cvt.rn.f32.u32 f212, r2434; +mul.f32 f213, f212, 0f3D9EDD1F; +cos.approx.f32 f133, f213; +sin.approx.f32 f214, f213; +neg.f32 f134, f214; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +mul.lo.s32 r2435, r2434, 9; +sub.s32 r2436, r2429, r2435; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, 
r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; 
+mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +shl.b32 r2437, r2436, 3; +add.s32 r2438, r2430, r2437; +barrier.sync 0; +mad.lo.s32 r2439, r2434, 648, r2438; +st.shared.u32 [r2439], r1259; +st.shared.u32 [r2439+4], r1265; +st.shared.u32 [r2439+72], r1528; +st.shared.u32 [r2439+76], r1535; +st.shared.u32 [r2439+144], r1565; +st.shared.u32 [r2439+148], r1572; +st.shared.u32 [r2439+216], r1602; +st.shared.u32 [r2439+220], r1609; +st.shared.u32 [r2439+288], r1639; +st.shared.u32 [r2439+292], r1646; +st.shared.u32 [r2439+360], r1676; +st.shared.u32 [r2439+364], r1683; +st.shared.u32 [r2439+432], r1713; +st.shared.u32 [r2439+436], r1720; +st.shared.u32 [r2439+504], r1750; +st.shared.u32 [r2439+508], r1757; 
+st.shared.u32 [r2439+576], r1787; +st.shared.u32 [r2439+580], r1794; +barrier.sync 0; +ld.shared.u32 r1823, [r2433]; +ld.shared.u32 r1829, [r2433+4]; +ld.shared.u32 r1911, [r2433+648]; +ld.shared.u32 r1917, [r2433+652]; +ld.shared.u32 r1999, [r2433+1296]; +ld.shared.u32 r2005, [r2433+1300]; +ld.shared.u32 r1820, [r2433+1944]; +ld.shared.u32 r1826, [r2433+1948]; +ld.shared.u32 r1908, [r2433+2592]; +ld.shared.u32 r1914, [r2433+2596]; +ld.shared.u32 r1996, [r2433+3240]; +ld.shared.u32 r2002, [r2433+3244]; +ld.shared.u32 r1821, [r2433+3888]; +ld.shared.u32 r1827, [r2433+3892]; +ld.shared.u32 r1909, [r2433+4536]; +ld.shared.u32 r1915, [r2433+4540]; +ld.shared.u32 r1997, [r2433+5184]; +ld.shared.u32 r2003, [r2433+5188]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, 
r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ +add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 r1998, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; 
+} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, r2114; +} +{ +mul.f16x2 r2120, r2022, r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ 
+mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 %0, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 %1, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ +sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, r2184, r2161; +} +{ +add.f16x2 %6, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 %12, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 %7, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 %13, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2248, {low, high}; +} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 %2, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 %3, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, 
r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 %8, r2269, r2275; +} +{ +add.f16x2 r2281, r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 %14, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 %9, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, r2101, r2117; +} +{ +mul.f16x2 r2329, r2326, r2249; +} +{ +add.f16x2 %15, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 %4, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 %5, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 %10, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 %16, r2375, r2381; +} +{ +add.f16x2 r2387, r2139, r2155; +} +{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 %11, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} 
+{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 %17, r2411, r2417; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<887, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<215>; +.reg .b32 r<2440>; +.reg .b64 rd<6>; +mov.u32 r2423, %tid.y; +mov.u32 r2424, %18; +mad.lo.s32 r2425, r2423, 2916, r2424; +mov.u32 r2426, %tid.x; +mov.f32 f206, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1, {low, high}; +} +mov.f32 f208, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ 
+add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; 
+cvt.rn.f16.f32 high, f206; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r265, {low, high}; +} +mov.f32 f168, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r266, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r267, {low, high}; +} +mov.f32 f172, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r268, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r271, {low, high}; +} +mov.f32 f180, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 
r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; 
+cvt.rn.f16.f32 high, f206; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, 
r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r2426, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r2427, rd3; +mul.lo.s32 r2428, r2427, 81; +sub.s32 r2429, r2426, r2428; +mad.lo.s32 r2430, r2427, 2916, r2425; +cvt.rn.f32.u32 f209, r2429; +mul.f32 f210, f209, 0f3C0D3654; +cos.approx.f32 f57, f210; +sin.approx.f32 f211, f210; +neg.f32 f58, f211; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ 
+mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 
r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r2431, r2429, 36, r2430; +st.shared.u32 [r2431], r352; +st.shared.u32 [r2431+4], r621; +st.shared.u32 [r2431+8], r658; +st.shared.u32 [r2431+12], r695; +st.shared.u32 [r2431+16], r732; +st.shared.u32 [r2431+20], r769; +st.shared.u32 [r2431+24], r806; +st.shared.u32 [r2431+28], r843; +st.shared.u32 [r2431+32], r880; +barrier.sync 0; +shl.b32 r2432, r2429, 5; +sub.s32 r2433, r2431, r2432; +ld.shared.u32 r916, [r2433]; +ld.shared.u32 r1004, [r2433+324]; +ld.shared.u32 r1092, [r2433+648]; +ld.shared.u32 r913, [r2433+972]; +ld.shared.u32 r1001, [r2433+1296]; +ld.shared.u32 r1089, [r2433+1620]; +ld.shared.u32 r914, [r2433+1944]; +ld.shared.u32 r1002, [r2433+2268]; +ld.shared.u32 r1090, [r2433+2592]; +barrier.sync 0; +st.shared.u32 [r2431], r358; +st.shared.u32 [r2431+4], r628; +st.shared.u32 [r2431+8], r665; +st.shared.u32 [r2431+12], r702; +st.shared.u32 [r2431+16], r739; 
+st.shared.u32 [r2431+20], r776; +st.shared.u32 [r2431+24], r813; +st.shared.u32 [r2431+28], r850; +st.shared.u32 [r2431+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r2433]; +ld.shared.u32 r1010, [r2433+324]; +ld.shared.u32 r1098, [r2433+648]; +ld.shared.u32 r919, [r2433+972]; +ld.shared.u32 r1007, [r2433+1296]; +ld.shared.u32 r1095, [r2433+1620]; +ld.shared.u32 r920, [r2433+1944]; +ld.shared.u32 r1008, [r2433+2268]; +ld.shared.u32 r1096, [r2433+2592]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 
r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 
r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; 
+mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 r1259, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 r1265, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 r1283, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 r1301, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 r1319, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 r1337, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 r1347, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 r1353, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 r1371, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 r1389, r1380, r1386; +} +{ +add.f16x2 
r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 r1407, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 r1425, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 r1435, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 r1441, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 r1459, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 r1477, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 r1495, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 r1513, r1504, r1510; +} +mul.wide.u32 rd4, r2429, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2434, rd5; +mul.lo.s32 r2435, r2434, 9; +sub.s32 r2436, r2429, r2435; +shl.b32 r2437, r2436, 2; +add.s32 r2438, r2430, r2437; +cvt.rn.f32.u32 f212, r2434; +mul.f32 f213, 
f212, 0f3D9EDD1F; +cos.approx.f32 f133, f213; +sin.approx.f32 f214, f213; +neg.f32 f134, f214; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1516, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1519, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1521, {high, high}; +} +{ +mul.f16x2 r1523, r1353, r1521; +} +{ +neg.f16x2 r1526, r1523; +} +{ +fma.rn.f16x2 r1528, r1347, r1519, r1526; +} +{ +mul.f16x2 r1532, r1347, r1521; +} +{ +fma.rn.f16x2 r1535, r1353, r1519, r1532; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1539, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1541, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1543, {low, high}; +} +{ +mul.f16x2 r1544, r1541, r1543; +} +{ +mul.f16x2 r1547, r1516, r1539; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1550, {high, low}; +} +{ +fma.rn.f16x2 r1552, r1544, r1550, r1547; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1556, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1558, {high, high}; +} +{ +mul.f16x2 r1560, r1441, r1558; +} +{ +neg.f16x2 r1563, r1560; +} +{ +fma.rn.f16x2 r1565, r1435, r1556, r1563; +} +{ +mul.f16x2 r1569, r1435, r1558; +} +{ +fma.rn.f16x2 r1572, r1441, r1556, r1569; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1576, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1578, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1581, r1578, r1580; +} +{ +mul.f16x2 r1584, r1552, r1576; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1552; +mov.b32 r1587, {high, low}; +} +{ +fma.rn.f16x2 r1589, r1581, r1587, r1584; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; 
+mov.b32 r1593, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1595, {high, high}; +} +{ +mul.f16x2 r1597, r1319, r1595; +} +{ +neg.f16x2 r1600, r1597; +} +{ +fma.rn.f16x2 r1602, r1283, r1593, r1600; +} +{ +mul.f16x2 r1606, r1283, r1595; +} +{ +fma.rn.f16x2 r1609, r1319, r1593, r1606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1613, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1615, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1617, {low, high}; +} +{ +mul.f16x2 r1618, r1615, r1617; +} +{ +mul.f16x2 r1621, r1589, r1613; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1589; +mov.b32 r1624, {high, low}; +} +{ +fma.rn.f16x2 r1626, r1618, r1624, r1621; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1630, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1632, {high, high}; +} +{ +mul.f16x2 r1634, r1407, r1632; +} +{ +neg.f16x2 r1637, r1634; +} +{ +fma.rn.f16x2 r1639, r1371, r1630, r1637; +} +{ +mul.f16x2 r1643, r1371, r1632; +} +{ +fma.rn.f16x2 r1646, r1407, r1630, r1643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1650, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1652, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1654, {low, high}; +} +{ +mul.f16x2 r1655, r1652, r1654; +} +{ +mul.f16x2 r1658, r1626, r1650; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1626; +mov.b32 r1661, {high, low}; +} +{ +fma.rn.f16x2 r1663, r1655, r1661, r1658; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1667, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1669, {high, high}; +} +{ +mul.f16x2 r1671, r1495, r1669; +} +{ +neg.f16x2 r1674, r1671; +} +{ +fma.rn.f16x2 r1676, r1459, r1667, r1674; +} +{ +mul.f16x2 r1680, 
r1459, r1669; +} +{ +fma.rn.f16x2 r1683, r1495, r1667, r1680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1687, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1689, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1691, {low, high}; +} +{ +mul.f16x2 r1692, r1689, r1691; +} +{ +mul.f16x2 r1695, r1663, r1687; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1663; +mov.b32 r1698, {high, low}; +} +{ +fma.rn.f16x2 r1700, r1692, r1698, r1695; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1704, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1706, {high, high}; +} +{ +mul.f16x2 r1708, r1337, r1706; +} +{ +neg.f16x2 r1711, r1708; +} +{ +fma.rn.f16x2 r1713, r1301, r1704, r1711; +} +{ +mul.f16x2 r1717, r1301, r1706; +} +{ +fma.rn.f16x2 r1720, r1337, r1704, r1717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1724, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1726, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1728, {low, high}; +} +{ +mul.f16x2 r1729, r1726, r1728; +} +{ +mul.f16x2 r1732, r1700, r1724; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1700; +mov.b32 r1735, {high, low}; +} +{ +fma.rn.f16x2 r1737, r1729, r1735, r1732; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1741, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1743, {high, high}; +} +{ +mul.f16x2 r1745, r1425, r1743; +} +{ +neg.f16x2 r1748, r1745; +} +{ +fma.rn.f16x2 r1750, r1389, r1741, r1748; +} +{ +mul.f16x2 r1754, r1389, r1743; +} +{ +fma.rn.f16x2 r1757, r1425, r1741, r1754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1761, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1516; +mov.b32 r1763, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1765, {low, high}; +} +{ +mul.f16x2 r1766, r1763, r1765; +} +{ +mul.f16x2 r1769, r1737, r1761; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1737; +mov.b32 r1772, {high, low}; +} +{ +fma.rn.f16x2 r1774, r1766, r1772, r1769; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1778, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1774; +mov.b32 r1780, {high, high}; +} +{ +mul.f16x2 r1782, r1513, r1780; +} +{ +neg.f16x2 r1785, r1782; +} +{ +fma.rn.f16x2 r1787, r1477, r1778, r1785; +} +{ +mul.f16x2 r1791, r1477, r1780; +} +{ +fma.rn.f16x2 r1794, r1513, r1778, r1791; +} +barrier.sync 0; +mad.lo.s32 r2439, r2434, 324, r2438; +st.shared.u32 [r2439], r1259; +st.shared.u32 [r2439+36], r1528; +st.shared.u32 [r2439+72], r1565; +st.shared.u32 [r2439+108], r1602; +st.shared.u32 [r2439+144], r1639; +st.shared.u32 [r2439+180], r1676; +st.shared.u32 [r2439+216], r1713; +st.shared.u32 [r2439+252], r1750; +st.shared.u32 [r2439+288], r1787; +barrier.sync 0; +ld.shared.u32 r1823, [r2433]; +ld.shared.u32 r1911, [r2433+324]; +ld.shared.u32 r1999, [r2433+648]; +ld.shared.u32 r1820, [r2433+972]; +ld.shared.u32 r1908, [r2433+1296]; +ld.shared.u32 r1996, [r2433+1620]; +ld.shared.u32 r1821, [r2433+1944]; +ld.shared.u32 r1909, [r2433+2268]; +ld.shared.u32 r1997, [r2433+2592]; +barrier.sync 0; +st.shared.u32 [r2439], r1265; +st.shared.u32 [r2439+36], r1535; +st.shared.u32 [r2439+72], r1572; +st.shared.u32 [r2439+108], r1609; +st.shared.u32 [r2439+144], r1646; +st.shared.u32 [r2439+180], r1683; +st.shared.u32 [r2439+216], r1720; +st.shared.u32 [r2439+252], r1757; +st.shared.u32 [r2439+288], r1794; +barrier.sync 0; +ld.shared.u32 r1829, [r2433]; +ld.shared.u32 r1917, [r2433+324]; +ld.shared.u32 r2005, [r2433+648]; +ld.shared.u32 r1826, [r2433+972]; +ld.shared.u32 r1914, [r2433+1296]; +ld.shared.u32 r2002, [r2433+1620]; +ld.shared.u32 r1827, [r2433+1944]; +ld.shared.u32 r1915, [r2433+2268]; 
+ld.shared.u32 r2003, [r2433+2592]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1815, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1816, {low, high}; +} +{ +neg.f16x2 r1817, r1816; +} +{ +add.f16x2 r1819, r1820, r1821; +} +{ +add.f16x2 r1822, r1823, r1819; +} +{ +add.f16x2 r1825, r1826, r1827; +} +{ +add.f16x2 r1828, r1829, r1825; +} +{ +add.f16x2 r1831, r1820, r1821; +} +{ +mul.f16x2 r1834, r1831, r1815; +} +{ +add.f16x2 r1837, r1823, r1834; +} +{ +sub.f16x2 r1840, r1826, r1827; +} +{ +mul.f16x2 r1843, r1840, r1817; +} +{ +add.f16x2 r1846, r1837, r1843; +} +{ +add.f16x2 r1849, r1820, r1821; +} +{ +mul.f16x2 r1852, r1849, r1815; +} +{ +add.f16x2 r1855, r1823, r1852; +} +{ +sub.f16x2 r1858, r1826, r1827; +} +{ +mul.f16x2 r1861, r1858, r1817; +} +{ +sub.f16x2 r1864, r1855, r1861; +} +{ +add.f16x2 r1867, r1826, r1827; +} +{ +mul.f16x2 r1870, r1867, r1815; +} +{ +add.f16x2 r1873, r1829, r1870; +} +{ +sub.f16x2 r1876, r1820, r1821; +} +{ +mul.f16x2 r1879, r1876, r1817; +} +{ +sub.f16x2 r1882, r1873, r1879; +} +{ +add.f16x2 r1885, r1826, r1827; +} +{ +mul.f16x2 r1888, r1885, r1815; +} +{ +add.f16x2 r1891, r1829, r1888; +} +{ +sub.f16x2 r1894, r1820, r1821; +} +{ +mul.f16x2 r1897, r1894, r1817; +} +{ +add.f16x2 r1900, r1891, r1897; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1903, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1904, {low, high}; +} +{ +neg.f16x2 r1905, r1904; +} +{ +add.f16x2 r1907, r1908, r1909; +} +{ +add.f16x2 r1910, r1911, r1907; +} +{ +add.f16x2 r1913, r1914, r1915; +} +{ +add.f16x2 r1916, r1917, r1913; +} +{ +add.f16x2 r1919, r1908, r1909; +} +{ +mul.f16x2 r1922, r1919, r1903; +} +{ +add.f16x2 r1925, r1911, r1922; +} +{ +sub.f16x2 r1928, r1914, r1915; +} +{ +mul.f16x2 r1931, r1928, r1905; +} +{ +add.f16x2 r1934, r1925, r1931; +} +{ 
+add.f16x2 r1937, r1908, r1909; +} +{ +mul.f16x2 r1940, r1937, r1903; +} +{ +add.f16x2 r1943, r1911, r1940; +} +{ +sub.f16x2 r1946, r1914, r1915; +} +{ +mul.f16x2 r1949, r1946, r1905; +} +{ +sub.f16x2 r1952, r1943, r1949; +} +{ +add.f16x2 r1955, r1914, r1915; +} +{ +mul.f16x2 r1958, r1955, r1903; +} +{ +add.f16x2 r1961, r1917, r1958; +} +{ +sub.f16x2 r1964, r1908, r1909; +} +{ +mul.f16x2 r1967, r1964, r1905; +} +{ +sub.f16x2 r1970, r1961, r1967; +} +{ +add.f16x2 r1973, r1914, r1915; +} +{ +mul.f16x2 r1976, r1973, r1903; +} +{ +add.f16x2 r1979, r1917, r1976; +} +{ +sub.f16x2 r1982, r1908, r1909; +} +{ +mul.f16x2 r1985, r1982, r1905; +} +{ +add.f16x2 r1988, r1979, r1985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1991, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1992, {low, high}; +} +{ +neg.f16x2 r1993, r1992; +} +{ +add.f16x2 r1995, r1996, r1997; +} +{ +add.f16x2 r1998, r1999, r1995; +} +{ +add.f16x2 r2001, r2002, r2003; +} +{ +add.f16x2 r2004, r2005, r2001; +} +{ +add.f16x2 r2007, r1996, r1997; +} +{ +mul.f16x2 r2010, r2007, r1991; +} +{ +add.f16x2 r2013, r1999, r2010; +} +{ +sub.f16x2 r2016, r2002, r2003; +} +{ +mul.f16x2 r2019, r2016, r1993; +} +{ +add.f16x2 r2022, r2013, r2019; +} +{ +add.f16x2 r2025, r1996, r1997; +} +{ +mul.f16x2 r2028, r2025, r1991; +} +{ +add.f16x2 r2031, r1999, r2028; +} +{ +sub.f16x2 r2034, r2002, r2003; +} +{ +mul.f16x2 r2037, r2034, r1993; +} +{ +sub.f16x2 r2040, r2031, r2037; +} +{ +add.f16x2 r2043, r2002, r2003; +} +{ +mul.f16x2 r2046, r2043, r1991; +} +{ +add.f16x2 r2049, r2005, r2046; +} +{ +sub.f16x2 r2052, r1996, r1997; +} +{ +mul.f16x2 r2055, r2052, r1993; +} +{ +sub.f16x2 r2058, r2049, r2055; +} +{ +add.f16x2 r2061, r2002, r2003; +} +{ +mul.f16x2 r2064, r2061, r1991; +} +{ +add.f16x2 r2067, r2005, r2064; +} +{ +sub.f16x2 r2070, r1996, r1997; +} +{ +mul.f16x2 r2073, r2070, r1993; +} +{ +add.f16x2 r2076, r2067, r2073; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2079, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2080, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2081, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2082, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2085, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2086, {low, high}; +} +{ +mul.f16x2 r2095, r1934, r2079; +} +{ +mul.f16x2 r2098, r1970, r2080; +} +{ +sub.f16x2 r2101, r2095, r2098; +} +{ +mul.f16x2 r2104, r1934, r2080; +} +{ +fma.rn.f16x2 r2107, r1970, r2079, r2104; +} +{ +mul.f16x2 r2111, r2022, r2081; +} +{ +mul.f16x2 r2114, r2058, r2082; +} +{ +sub.f16x2 r2117, r2111, r2114; +} +{ +mul.f16x2 r2120, r2022, r2082; +} +{ +fma.rn.f16x2 r2123, r2058, r2081, r2120; +} +{ +mul.f16x2 r2127, r1952, r2081; +} +{ +mul.f16x2 r2130, r1988, r2082; +} +{ +sub.f16x2 r2133, r2127, r2130; +} +{ +mul.f16x2 r2136, r1952, r2082; +} +{ +fma.rn.f16x2 r2139, r1988, r2081, r2136; +} +{ +mul.f16x2 r2143, r2040, r2085; +} +{ +mul.f16x2 r2146, r2076, r2086; +} +{ +sub.f16x2 r2149, r2143, r2146; +} +{ +mul.f16x2 r2152, r2040, r2086; +} +{ +fma.rn.f16x2 r2155, r2076, r2085, r2152; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2160, {low, high}; +} +{ +neg.f16x2 r2161, r2160; +} +{ +add.f16x2 r2163, r1910, r1998; +} +{ +add.f16x2 %0, r1822, r2163; +} +{ +add.f16x2 r2169, r1916, r2004; +} +{ +add.f16x2 %1, r1828, r2169; +} +{ +add.f16x2 r2175, r1910, r1998; +} +{ +mul.f16x2 r2178, r2175, r2159; +} +{ +add.f16x2 r2181, r1822, r2178; +} +{ 
+sub.f16x2 r2184, r1916, r2004; +} +{ +mul.f16x2 r2187, r2184, r2161; +} +{ +add.f16x2 %6, r2181, r2187; +} +{ +add.f16x2 r2193, r1910, r1998; +} +{ +mul.f16x2 r2196, r2193, r2159; +} +{ +add.f16x2 r2199, r1822, r2196; +} +{ +sub.f16x2 r2202, r1916, r2004; +} +{ +mul.f16x2 r2205, r2202, r2161; +} +{ +sub.f16x2 %12, r2199, r2205; +} +{ +add.f16x2 r2211, r1916, r2004; +} +{ +mul.f16x2 r2214, r2211, r2159; +} +{ +add.f16x2 r2217, r1828, r2214; +} +{ +sub.f16x2 r2220, r1910, r1998; +} +{ +mul.f16x2 r2223, r2220, r2161; +} +{ +sub.f16x2 %7, r2217, r2223; +} +{ +add.f16x2 r2229, r1916, r2004; +} +{ +mul.f16x2 r2232, r2229, r2159; +} +{ +add.f16x2 r2235, r1828, r2232; +} +{ +sub.f16x2 r2238, r1910, r1998; +} +{ +mul.f16x2 r2241, r2238, r2161; +} +{ +add.f16x2 %13, r2235, r2241; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2247, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2248, {low, high}; +} +{ +neg.f16x2 r2249, r2248; +} +{ +add.f16x2 r2251, r2101, r2117; +} +{ +add.f16x2 %2, r1846, r2251; +} +{ +add.f16x2 r2257, r2107, r2123; +} +{ +add.f16x2 %3, r1882, r2257; +} +{ +add.f16x2 r2263, r2101, r2117; +} +{ +mul.f16x2 r2266, r2263, r2247; +} +{ +add.f16x2 r2269, r1846, r2266; +} +{ +sub.f16x2 r2272, r2107, r2123; +} +{ +mul.f16x2 r2275, r2272, r2249; +} +{ +add.f16x2 %8, r2269, r2275; +} +{ +add.f16x2 r2281, r2101, r2117; +} +{ +mul.f16x2 r2284, r2281, r2247; +} +{ +add.f16x2 r2287, r1846, r2284; +} +{ +sub.f16x2 r2290, r2107, r2123; +} +{ +mul.f16x2 r2293, r2290, r2249; +} +{ +sub.f16x2 %14, r2287, r2293; +} +{ +add.f16x2 r2299, r2107, r2123; +} +{ +mul.f16x2 r2302, r2299, r2247; +} +{ +add.f16x2 r2305, r1882, r2302; +} +{ +sub.f16x2 r2308, r2101, r2117; +} +{ +mul.f16x2 r2311, r2308, r2249; +} +{ +sub.f16x2 %9, r2305, r2311; +} +{ +add.f16x2 r2317, r2107, r2123; +} +{ +mul.f16x2 r2320, r2317, r2247; +} +{ +add.f16x2 r2323, r1882, r2320; +} +{ +sub.f16x2 r2326, 
r2101, r2117; +} +{ +mul.f16x2 r2329, r2326, r2249; +} +{ +add.f16x2 %15, r2323, r2329; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2335, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2336, {low, high}; +} +{ +neg.f16x2 r2337, r2336; +} +{ +add.f16x2 r2339, r2133, r2149; +} +{ +add.f16x2 %4, r1864, r2339; +} +{ +add.f16x2 r2345, r2139, r2155; +} +{ +add.f16x2 %5, r1900, r2345; +} +{ +add.f16x2 r2351, r2133, r2149; +} +{ +mul.f16x2 r2354, r2351, r2335; +} +{ +add.f16x2 r2357, r1864, r2354; +} +{ +sub.f16x2 r2360, r2139, r2155; +} +{ +mul.f16x2 r2363, r2360, r2337; +} +{ +add.f16x2 %10, r2357, r2363; +} +{ +add.f16x2 r2369, r2133, r2149; +} +{ +mul.f16x2 r2372, r2369, r2335; +} +{ +add.f16x2 r2375, r1864, r2372; +} +{ +sub.f16x2 r2378, r2139, r2155; +} +{ +mul.f16x2 r2381, r2378, r2337; +} +{ +sub.f16x2 %16, r2375, r2381; +} +{ +add.f16x2 r2387, r2139, r2155; +} +{ +mul.f16x2 r2390, r2387, r2335; +} +{ +add.f16x2 r2393, r1900, r2390; +} +{ +sub.f16x2 r2396, r2133, r2149; +} +{ +mul.f16x2 r2399, r2396, r2337; +} +{ +sub.f16x2 %11, r2393, r2399; +} +{ +add.f16x2 r2405, r2139, r2155; +} +{ +mul.f16x2 r2408, r2405, r2335; +} +{ +add.f16x2 r2411, r1900, r2408; +} +{ +sub.f16x2 r2414, r2133, r2149; +} +{ +mul.f16x2 r2417, r2414, r2337; +} +{ +add.f16x2 %17, r2411, r2417; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<888, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<80>; +.reg .b32 r<953>; +.reg .b64 rd<12>; +mov.u32 r914, %tid.y; +mov.u32 r915, %6; +mad.lo.s32 r916, r914, 5832, r915; +mov.u32 r917, %tid.x; +mov.f32 f62, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1, {low, high}; +} +mov.f32 f64, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ 
+mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r917, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r918, rd3; +mul.lo.s32 r919, r918, 243; +sub.s32 r920, r917, r919; +mad.lo.s32 r921, r918, 5832, r916; +cvt.rn.f32.u32 f65, r920; +mul.f32 f66, f65, 0f3C0D3654; +cos.approx.f32 f5, f66; +sin.approx.f32 f67, f66; +neg.f32 f6, f67; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f57, 0fBF800000; +mov.f32 f58, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r922, r920, 24, r921; +st.shared.v2.f32 [r922], {r8, r14}; +st.shared.v2.f32 [r922+8], {r101, r108}; +st.shared.v2.f32 [r922+16], {r138, r145}; +barrier.sync 0; +shl.b32 r923, r920, 4; +sub.s32 r924, r922, r923; +ld.shared.u32 r174, [r924]; +ld.shared.u32 r180, [r924+4]; +ld.shared.u32 
r171, [r924+1944]; +ld.shared.u32 r177, [r924+1948]; +ld.shared.u32 r172, [r924+3888]; +ld.shared.u32 r178, [r924+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r920, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r925, rd5; +mul.lo.s32 r926, r925, 3; +sub.s32 r927, r920, r926; +shl.b32 r928, r927, 3; +add.s32 r929, r921, r928; +cvt.rn.f32.u32 f68, r925; +mul.f32 f69, f68, 0f3CD3D17E; +cos.approx.f32 f17, f69; +sin.approx.f32 f70, f69; +neg.f32 f18, f70; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, 
r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r930, r925, 72, r929; +st.shared.u32 [r930], r173; +st.shared.u32 [r930+4], r179; +st.shared.u32 [r930+24], r266; +st.shared.u32 [r930+28], r273; +st.shared.u32 [r930+48], r303; +st.shared.u32 [r930+52], r310; +barrier.sync 0; +ld.shared.u32 r339, [r924]; +ld.shared.u32 r345, [r924+4]; +ld.shared.u32 r336, [r924+1944]; +ld.shared.u32 r342, [r924+1948]; +ld.shared.u32 r337, [r924+3888]; +ld.shared.u32 r343, [r924+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, 
r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r920, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r931, rd7; +mul.lo.s32 r932, r931, 9; +sub.s32 r933, r920, r932; +shl.b32 r934, r933, 3; +add.s32 r935, r921, r934; +cvt.rn.f32.u32 f71, r931; +mul.f32 f72, f71, 0f3D9EDD1F; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r936, r931, 216, r935; +st.shared.u32 [r936], r338; +st.shared.u32 [r936+4], r344; +st.shared.u32 [r936+72], r431; +st.shared.u32 [r936+76], r438; +st.shared.u32 [r936+144], r468; +st.shared.u32 [r936+148], r475; +barrier.sync 0; +ld.shared.u32 r504, [r924]; +ld.shared.u32 r510, [r924+4]; +ld.shared.u32 r501, [r924+1944]; +ld.shared.u32 r507, [r924+1948]; +ld.shared.u32 r502, [r924+3888]; +ld.shared.u32 r508, [r924+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r920, 795364315; +shr.u64 rd9, rd8, 32; 
+cvt.u32.u64 r937, rd9; +sub.s32 r938, r920, r937; +shr.u32 r939, r938, 1; +add.s32 r940, r939, r937; +shr.u32 r941, r940, 4; +mul.lo.s32 r942, r941, 27; +sub.s32 r943, r920, r942; +shl.b32 r944, r943, 3; +add.s32 r945, r921, r944; +cvt.rn.f32.u32 f74, r941; +mul.f32 f75, f74, 0f3E6E4BAE; +cos.approx.f32 f41, f75; +sin.approx.f32 f76, f75; +neg.f32 f42, f76; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r946, r941, 648, r945; +st.shared.u32 [r946], r503; +st.shared.u32 [r946+4], r509; +st.shared.u32 [r946+216], r596; +st.shared.u32 [r946+220], r603; +st.shared.u32 [r946+432], r633; +st.shared.u32 [r946+436], r640; +barrier.sync 0; +ld.shared.u32 r669, [r924]; +ld.shared.u32 r675, [r924+4]; +ld.shared.u32 r666, 
[r924+1944]; +ld.shared.u32 r672, [r924+1948]; +ld.shared.u32 r667, [r924+3888]; +ld.shared.u32 r673, [r924+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 r674, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 r692, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ +mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 r710, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 r725, r722, r663; +} +{ +sub.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 r746, r737, r743; +} +mul.wide.u32 rd10, r920, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r947, rd11; +mul.lo.s32 r948, r947, 81; +sub.s32 r949, r920, r948; +shl.b32 r950, r949, 3; +add.s32 r951, r921, r950; +cvt.rn.f32.u32 f77, r947; +mul.f32 f78, f77, 0f3F32B8C2; +cos.approx.f32 f53, f78; +sin.approx.f32 f79, f78; +neg.f32 f54, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r749, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r754, {high, high}; +} +{ +mul.f16x2 r756, r728, r754; +} +{ +neg.f16x2 r759, 
r756; +} +{ +fma.rn.f16x2 r761, r692, r752, r759; +} +{ +mul.f16x2 r765, r692, r754; +} +{ +fma.rn.f16x2 r768, r728, r752, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r776, {low, high}; +} +{ +mul.f16x2 r777, r774, r776; +} +{ +mul.f16x2 r780, r749, r772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r783, {high, low}; +} +{ +fma.rn.f16x2 r785, r777, r783, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r791, {high, high}; +} +{ +mul.f16x2 r793, r746, r791; +} +{ +neg.f16x2 r796, r793; +} +{ +fma.rn.f16x2 r798, r710, r789, r796; +} +{ +mul.f16x2 r802, r710, r791; +} +{ +fma.rn.f16x2 r805, r746, r789, r802; +} +barrier.sync 0; +mad.lo.s32 r952, r947, 1944, r951; +st.shared.u32 [r952], r668; +st.shared.u32 [r952+4], r674; +st.shared.u32 [r952+648], r761; +st.shared.u32 [r952+652], r768; +st.shared.u32 [r952+1296], r798; +st.shared.u32 [r952+1300], r805; +barrier.sync 0; +ld.shared.u32 r834, [r924]; +ld.shared.u32 r840, [r924+4]; +ld.shared.u32 r831, [r924+1944]; +ld.shared.u32 r837, [r924+1948]; +ld.shared.u32 r832, [r924+3888]; +ld.shared.u32 r838, [r924+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r827, {low, high}; +} +{ +neg.f16x2 r828, r827; +} +{ +add.f16x2 r830, r831, r832; +} +{ +add.f16x2 %0, r834, r830; +} +{ +add.f16x2 r836, r837, r838; +} +{ +add.f16x2 %1, r840, r836; +} +{ +add.f16x2 r842, r831, r832; +} +{ +mul.f16x2 r845, r842, r826; +} +{ +add.f16x2 r848, r834, r845; +} +{ +sub.f16x2 r851, r837, r838; +} +{ +mul.f16x2 r854, r851, r828; +} +{ +add.f16x2 %2, 
r848, r854; +} +{ +add.f16x2 r860, r831, r832; +} +{ +mul.f16x2 r863, r860, r826; +} +{ +add.f16x2 r866, r834, r863; +} +{ +sub.f16x2 r869, r837, r838; +} +{ +mul.f16x2 r872, r869, r828; +} +{ +sub.f16x2 %4, r866, r872; +} +{ +add.f16x2 r878, r837, r838; +} +{ +mul.f16x2 r881, r878, r826; +} +{ +add.f16x2 r884, r840, r881; +} +{ +sub.f16x2 r887, r831, r832; +} +{ +mul.f16x2 r890, r887, r828; +} +{ +sub.f16x2 %3, r884, r890; +} +{ +add.f16x2 r896, r837, r838; +} +{ +mul.f16x2 r899, r896, r826; +} +{ +add.f16x2 r902, r840, r899; +} +{ +sub.f16x2 r905, r831, r832; +} +{ +mul.f16x2 r908, r905, r828; +} +{ +add.f16x2 %5, r902, r908; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<889, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<80>; +.reg .b32 r<953>; +.reg .b64 rd<12>; +mov.u32 r914, %tid.y; +mov.u32 r915, %6; +mad.lo.s32 r916, r914, 2916, r915; +mov.u32 r917, %tid.x; +mov.f32 f62, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1, {low, high}; +} +mov.f32 f64, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, 
%11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r917, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r918, rd3; +mul.lo.s32 r919, r918, 243; +sub.s32 r920, r917, r919; +mad.lo.s32 r921, r918, 2916, r916; +cvt.rn.f32.u32 f65, r920; +mul.f32 f66, f65, 0f3C0D3654; +cos.approx.f32 f5, f66; +sin.approx.f32 f67, f66; +neg.f32 f6, f67; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f57, 0fBF800000; +mov.f32 f58, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 
r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r922, r920, 12, r921; +st.shared.u32 [r922], r8; +st.shared.u32 [r922+4], r101; +st.shared.u32 [r922+8], r138; +barrier.sync 0; +shl.b32 r923, r920, 3; +sub.s32 r924, r922, r923; +ld.shared.u32 r174, [r924]; +ld.shared.u32 r171, [r924+972]; +ld.shared.u32 r172, [r924+1944]; +barrier.sync 0; +st.shared.u32 [r922], r14; +st.shared.u32 [r922+4], r108; +st.shared.u32 [r922+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r924]; +ld.shared.u32 r177, [r924+972]; +ld.shared.u32 r178, [r924+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r920, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r925, 
rd5; +mul.lo.s32 r926, r925, 3; +sub.s32 r927, r920, r926; +shl.b32 r928, r927, 2; +add.s32 r929, r921, r928; +cvt.rn.f32.u32 f68, r925; +mul.f32 f69, f68, 0f3CD3D17E; +cos.approx.f32 f17, f69; +sin.approx.f32 f70, f69; +neg.f32 f18, f70; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r930, r925, 36, r929; +st.shared.u32 [r930], r173; +st.shared.u32 [r930+12], r266; +st.shared.u32 [r930+24], r303; +barrier.sync 0; +ld.shared.u32 r339, [r924]; +ld.shared.u32 r336, [r924+972]; +ld.shared.u32 r337, [r924+1944]; +barrier.sync 0; +st.shared.u32 [r930], r179; +st.shared.u32 [r930+12], r273; +st.shared.u32 [r930+24], r310; +barrier.sync 0; +ld.shared.u32 r345, [r924]; +ld.shared.u32 r342, [r924+972]; +ld.shared.u32 r343, 
[r924+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r920, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r931, rd7; +mul.lo.s32 r932, r931, 9; +sub.s32 r933, r920, r932; +shl.b32 r934, r933, 2; +add.s32 r935, r921, r934; +cvt.rn.f32.u32 f71, r931; +mul.f32 f72, f71, 0f3D9EDD1F; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, 
r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r936, r931, 108, r935; +st.shared.u32 [r936], r338; +st.shared.u32 [r936+36], r431; +st.shared.u32 [r936+72], r468; +barrier.sync 0; +ld.shared.u32 r504, [r924]; +ld.shared.u32 r501, [r924+972]; +ld.shared.u32 r502, [r924+1944]; +barrier.sync 0; +st.shared.u32 [r936], r344; +st.shared.u32 [r936+36], r438; +st.shared.u32 [r936+72], r475; +barrier.sync 0; +ld.shared.u32 r510, [r924]; +ld.shared.u32 r507, [r924+972]; +ld.shared.u32 r508, [r924+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 r503, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 r509, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 r527, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} 
+{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 r545, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 r563, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 r581, r572, r578; +} +mul.wide.u32 rd8, r920, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r937, rd9; +sub.s32 r938, r920, r937; +shr.u32 r939, r938, 1; +add.s32 r940, r939, r937; +shr.u32 r941, r940, 4; +mul.lo.s32 r942, r941, 27; +sub.s32 r943, r920, r942; +shl.b32 r944, r943, 2; +add.s32 r945, r921, r944; +cvt.rn.f32.u32 f74, r941; +mul.f32 f75, f74, 0f3E6E4BAE; +cos.approx.f32 f41, f75; +sin.approx.f32 f76, f75; +neg.f32 f42, f76; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r584, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r589, {high, high}; +} +{ +mul.f16x2 r591, r563, r589; +} +{ +neg.f16x2 r594, r591; +} +{ +fma.rn.f16x2 r596, r527, r587, r594; +} +{ +mul.f16x2 r600, r527, r589; +} +{ +fma.rn.f16x2 r603, r563, r587, r600; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r607, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r609, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r611, {low, high}; +} +{ +mul.f16x2 r612, r609, r611; +} +{ +mul.f16x2 r615, r584, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r584; +mov.b32 r618, {high, low}; +} +{ +fma.rn.f16x2 r620, r612, r618, r615; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r624, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r620; +mov.b32 r626, {high, high}; +} +{ +mul.f16x2 r628, r581, r626; +} +{ +neg.f16x2 r631, r628; +} +{ +fma.rn.f16x2 r633, r545, r624, r631; +} +{ +mul.f16x2 r637, r545, r626; +} +{ +fma.rn.f16x2 r640, r581, r624, r637; +} +barrier.sync 0; +mad.lo.s32 r946, r941, 324, r945; +st.shared.u32 [r946], r503; +st.shared.u32 [r946+108], r596; +st.shared.u32 [r946+216], r633; +barrier.sync 0; +ld.shared.u32 r669, [r924]; +ld.shared.u32 r666, [r924+972]; +ld.shared.u32 r667, [r924+1944]; +barrier.sync 0; +st.shared.u32 [r946], r509; +st.shared.u32 [r946+108], r603; +st.shared.u32 [r946+216], r640; +barrier.sync 0; +ld.shared.u32 r675, [r924]; +ld.shared.u32 r672, [r924+972]; +ld.shared.u32 r673, [r924+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r662, {low, high}; +} +{ +neg.f16x2 r663, r662; +} +{ +add.f16x2 r665, r666, r667; +} +{ +add.f16x2 r668, r669, r665; +} +{ +add.f16x2 r671, r672, r673; +} +{ +add.f16x2 r674, r675, r671; +} +{ +add.f16x2 r677, r666, r667; +} +{ +mul.f16x2 r680, r677, r661; +} +{ +add.f16x2 r683, r669, r680; +} +{ +sub.f16x2 r686, r672, r673; +} +{ +mul.f16x2 r689, r686, r663; +} +{ +add.f16x2 r692, r683, r689; +} +{ +add.f16x2 r695, r666, r667; +} +{ +mul.f16x2 r698, r695, r661; +} +{ +add.f16x2 r701, r669, r698; +} +{ +sub.f16x2 r704, r672, r673; +} +{ +mul.f16x2 r707, r704, r663; +} +{ +sub.f16x2 r710, r701, r707; +} +{ +add.f16x2 r713, r672, r673; +} +{ +mul.f16x2 r716, r713, r661; +} +{ +add.f16x2 r719, r675, r716; +} +{ +sub.f16x2 r722, r666, r667; +} +{ +mul.f16x2 r725, r722, r663; +} +{ +sub.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, r672, r673; +} +{ +mul.f16x2 r734, r731, r661; +} +{ +add.f16x2 r737, r675, r734; +} +{ +sub.f16x2 r740, r666, r667; +} +{ +mul.f16x2 r743, r740, r663; +} +{ +add.f16x2 r746, r737, r743; +} 
+mul.wide.u32 rd10, r920, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r947, rd11; +mul.lo.s32 r948, r947, 81; +sub.s32 r949, r920, r948; +shl.b32 r950, r949, 2; +add.s32 r951, r921, r950; +cvt.rn.f32.u32 f77, r947; +mul.f32 f78, f77, 0f3F32B8C2; +cos.approx.f32 f53, f78; +sin.approx.f32 f79, f78; +neg.f32 f54, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r749, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r754, {high, high}; +} +{ +mul.f16x2 r756, r728, r754; +} +{ +neg.f16x2 r759, r756; +} +{ +fma.rn.f16x2 r761, r692, r752, r759; +} +{ +mul.f16x2 r765, r692, r754; +} +{ +fma.rn.f16x2 r768, r728, r752, r765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r776, {low, high}; +} +{ +mul.f16x2 r777, r774, r776; +} +{ +mul.f16x2 r780, r749, r772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r749; +mov.b32 r783, {high, low}; +} +{ +fma.rn.f16x2 r785, r777, r783, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r785; +mov.b32 r791, {high, high}; +} +{ +mul.f16x2 r793, r746, r791; +} +{ +neg.f16x2 r796, r793; +} +{ +fma.rn.f16x2 r798, r710, r789, r796; +} +{ +mul.f16x2 r802, r710, r791; +} +{ +fma.rn.f16x2 r805, r746, r789, r802; +} +barrier.sync 0; +mad.lo.s32 r952, r947, 972, r951; +st.shared.u32 [r952], r668; +st.shared.u32 [r952+324], r761; +st.shared.u32 [r952+648], r798; +barrier.sync 0; +ld.shared.u32 r834, [r924]; +ld.shared.u32 r831, [r924+972]; +ld.shared.u32 r832, [r924+1944]; +barrier.sync 0; +st.shared.u32 [r952], r674; +st.shared.u32 [r952+324], r768; +st.shared.u32 [r952+648], r805; 
+barrier.sync 0; +ld.shared.u32 r840, [r924]; +ld.shared.u32 r837, [r924+972]; +ld.shared.u32 r838, [r924+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r827, {low, high}; +} +{ +neg.f16x2 r828, r827; +} +{ +add.f16x2 r830, r831, r832; +} +{ +add.f16x2 %0, r834, r830; +} +{ +add.f16x2 r836, r837, r838; +} +{ +add.f16x2 %1, r840, r836; +} +{ +add.f16x2 r842, r831, r832; +} +{ +mul.f16x2 r845, r842, r826; +} +{ +add.f16x2 r848, r834, r845; +} +{ +sub.f16x2 r851, r837, r838; +} +{ +mul.f16x2 r854, r851, r828; +} +{ +add.f16x2 %2, r848, r854; +} +{ +add.f16x2 r860, r831, r832; +} +{ +mul.f16x2 r863, r860, r826; +} +{ +add.f16x2 r866, r834, r863; +} +{ +sub.f16x2 r869, r837, r838; +} +{ +mul.f16x2 r872, r869, r828; +} +{ +sub.f16x2 %4, r866, r872; +} +{ +add.f16x2 r878, r837, r838; +} +{ +mul.f16x2 r881, r878, r826; +} +{ +add.f16x2 r884, r840, r881; +} +{ +sub.f16x2 r887, r831, r832; +} +{ +mul.f16x2 r890, r887, r828; +} +{ +sub.f16x2 %3, r884, r890; +} +{ +add.f16x2 r896, r837, r838; +} +{ +mul.f16x2 r899, r896, r826; +} +{ +add.f16x2 r902, r840, r899; +} +{ +sub.f16x2 r905, r831, r832; +} +{ +mul.f16x2 r908, r905, r828; +} +{ +add.f16x2 %5, r902, r908; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..d3542c5632d39 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp16_inv.hpp.inc @@ -0,0 +1,24483 @@ +#ifndef CUFFTDX_FFT_729_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_729_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1086, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<676>; +.reg .b32 r<6776>; +.reg .b64 rd<4>; +mov.u32 r6774, %tid.y; +mov.u32 r6775, %54; +mad.lo.s32 r6708, r6774, 5832, r6775; +mov.u32 r6709, %tid.x; +mov.f32 f670, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1, {low, high}; +} +mov.f32 f672, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; 
+cvt.rn.f16.f32 high, f670; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ 
+sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r259, {low, high}; +} +mov.f32 f544, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r260, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r261, {low, high}; +} +mov.f32 f556, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r262, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r265, {low, high}; +} +mov.f32 f580, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, 
r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 
r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 
high, f672; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, %93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ 
+fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ 
+add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; 
+} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %74; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %74; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %74; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %74; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %74; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %75; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %75; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, 
%68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %75; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %75; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %75; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %76, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %76, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %76, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %76, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %76, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 
r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; 
+mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1794, {low, high}; +} +mov.f32 
f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} 
+mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ 
+mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, 
r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, 
r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, 
r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, 
r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, 
r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r6709, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6710, rd3; +sub.s32 r6711, r6709, r6710; +shr.u32 r6712, r6711, 1; +add.s32 r6713, r6712, r6710; +shr.u32 r6714, r6713, 4; +mul.lo.s32 r6715, r6714, 27; +sub.s32 r6716, r6709, r6715; +cvt.rn.f32.u32 f673, r6716; +mul.f32 f674, f673, 0f3C0D3654; +cos.approx.f32 f309, f674; +sin.approx.f32 f675, f674; +neg.f32 f310, f675; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ 
+fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 
r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 
r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ 
+mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 
r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; 
+cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, 
r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, 
r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r6717, r6714, 5832, r6708; +barrier.sync 0; +mad.lo.s32 r6718, r6716, 216, r6717; +st.shared.v2.f32 [r6718], {r2102, 
r2108}; +st.shared.v2.f32 [r6718+8], {r2881, r2890}; +st.shared.v2.f32 [r6718+16], {r2918, r2927}; +st.shared.v2.f32 [r6718+24], {r2955, r2964}; +st.shared.v2.f32 [r6718+32], {r2992, r3001}; +st.shared.v2.f32 [r6718+40], {r3029, r3038}; +st.shared.v2.f32 [r6718+48], {r3066, r3075}; +st.shared.v2.f32 [r6718+56], {r3103, r3112}; +st.shared.v2.f32 [r6718+64], {r3140, r3149}; +st.shared.v2.f32 [r6718+72], {r3177, r3186}; +st.shared.v2.f32 [r6718+80], {r3214, r3223}; +st.shared.v2.f32 [r6718+88], {r3251, r3260}; +st.shared.v2.f32 [r6718+96], {r3288, r3297}; +st.shared.v2.f32 [r6718+104], {r3325, r3334}; +st.shared.v2.f32 [r6718+112], {r3362, r3371}; +st.shared.v2.f32 [r6718+120], {r3399, r3408}; +st.shared.v2.f32 [r6718+128], {r3436, r3445}; +st.shared.v2.f32 [r6718+136], {r3473, r3482}; +st.shared.v2.f32 [r6718+144], {r3510, r3519}; +st.shared.v2.f32 [r6718+152], {r3547, r3556}; +st.shared.v2.f32 [r6718+160], {r3584, r3593}; +st.shared.v2.f32 [r6718+168], {r3621, r3630}; +st.shared.v2.f32 [r6718+176], {r3658, r3667}; +st.shared.v2.f32 [r6718+184], {r3695, r3704}; +st.shared.v2.f32 [r6718+192], {r3732, r3741}; +st.shared.v2.f32 [r6718+200], {r3769, r3778}; +st.shared.v2.f32 [r6718+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r6719, r6716, -208, r6718; +ld.shared.u32 r3842, [r6719]; +ld.shared.u32 r3848, [r6719+4]; +ld.shared.u32 r4438, [r6719+216]; +ld.shared.u32 r4444, [r6719+220]; +ld.shared.u32 r5034, [r6719+432]; +ld.shared.u32 r5040, [r6719+436]; +ld.shared.u32 r3928, [r6719+648]; +ld.shared.u32 r3934, [r6719+652]; +ld.shared.u32 r4524, [r6719+864]; +ld.shared.u32 r4530, [r6719+868]; +ld.shared.u32 r5120, [r6719+1080]; +ld.shared.u32 r5126, [r6719+1084]; +ld.shared.u32 r4014, [r6719+1296]; +ld.shared.u32 r4020, [r6719+1300]; +ld.shared.u32 r4610, [r6719+1512]; +ld.shared.u32 r4616, [r6719+1516]; +ld.shared.u32 r5206, [r6719+1728]; +ld.shared.u32 r5212, [r6719+1732]; +ld.shared.u32 r3839, [r6719+1944]; +ld.shared.u32 r3845, [r6719+1948]; +ld.shared.u32 r4435, 
[r6719+2160]; +ld.shared.u32 r4441, [r6719+2164]; +ld.shared.u32 r5031, [r6719+2376]; +ld.shared.u32 r5037, [r6719+2380]; +ld.shared.u32 r3925, [r6719+2592]; +ld.shared.u32 r3931, [r6719+2596]; +ld.shared.u32 r4521, [r6719+2808]; +ld.shared.u32 r4527, [r6719+2812]; +ld.shared.u32 r5117, [r6719+3024]; +ld.shared.u32 r5123, [r6719+3028]; +ld.shared.u32 r4011, [r6719+3240]; +ld.shared.u32 r4017, [r6719+3244]; +ld.shared.u32 r4607, [r6719+3456]; +ld.shared.u32 r4613, [r6719+3460]; +ld.shared.u32 r5203, [r6719+3672]; +ld.shared.u32 r5209, [r6719+3676]; +ld.shared.u32 r3840, [r6719+3888]; +ld.shared.u32 r3846, [r6719+3892]; +ld.shared.u32 r4436, [r6719+4104]; +ld.shared.u32 r4442, [r6719+4108]; +ld.shared.u32 r5032, [r6719+4320]; +ld.shared.u32 r5038, [r6719+4324]; +ld.shared.u32 r3926, [r6719+4536]; +ld.shared.u32 r3932, [r6719+4540]; +ld.shared.u32 r4522, [r6719+4752]; +ld.shared.u32 r4528, [r6719+4756]; +ld.shared.u32 r5118, [r6719+4968]; +ld.shared.u32 r5124, [r6719+4972]; +ld.shared.u32 r4012, [r6719+5184]; +ld.shared.u32 r4018, [r6719+5188]; +ld.shared.u32 r4608, [r6719+5400]; +ld.shared.u32 r4614, [r6719+5404]; +ld.shared.u32 r5204, [r6719+5616]; +ld.shared.u32 r5210, [r6719+5620]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ 
+sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4009, {low, high}; 
+} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, 
r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; 
+cvt.rn.f16.f32 high, f672; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ 
+mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ +mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; 
+} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; +} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, 
r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, 
r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ 
+sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ +add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; 
+} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; +} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, 
r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; 
+cvt.rn.f16.f32 high, f578; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ 
+sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ +add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; 
+} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; +} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; 
+} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; 
+} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 %0, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 %1, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 %18, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 %36, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 %19, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 %37, r6006, r6012; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 %2, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 %3, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 %20, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 %38, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 %21, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 %39, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 %4, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 %5, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 %22, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ 
+sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 %40, r6142, r6148; +} +{ +add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 %23, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 %41, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 %6, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 %7, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 %24, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 %42, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 %25, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 %43, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; 
+cvt.rn.f16.f32 high, f672; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 %8, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 %9, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 %26, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 %44, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 %27, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 %45, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 %10, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 %11, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 %28, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 %46, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, 
r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, r6421, r6363; +} +{ +sub.f16x2 %29, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 %47, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 %12, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 %13, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 %30, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 %48, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 %31, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 %49, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 %14, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} 
+{ +add.f16x2 %15, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; +} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 %32, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 %50, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 %33, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 %51, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 %16, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 %17, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 %34, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 %52, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 %35, r6676, r6682; +} +{ 
+add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ +sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 %53, r6694, r6700; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): 
"r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1087, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<676>; +.reg .b32 r<6776>; +.reg .b64 rd<4>; +mov.u32 r6774, %tid.y; +mov.u32 r6775, %54; 
+mad.lo.s32 r6708, r6774, 2916, r6775; +mov.u32 r6709, %tid.x; +mov.f32 f670, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1, {low, high}; +} +mov.f32 f672, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %108, %99; +} +{ +add.f16x2 r6, %81, r3; +} +{ +add.f16x2 r9, %60, %106; +} +{ +add.f16x2 r12, %90, r9; +} +{ +add.f16x2 r15, %108, %99; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %81, r18; +} +{ +sub.f16x2 r24, %60, %106; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %108, %99; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %81, r36; +} +{ +sub.f16x2 r42, %60, %106; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %60, %106; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %90, r54; +} +{ +sub.f16x2 r60, %108, %99; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %60, %106; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %90, r72; +} +{ +sub.f16x2 r78, %108, %99; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %107, %98; +} +{ +add.f16x2 r92, %80, r89; +} +{ +add.f16x2 r95, %59, %104; +} +{ +add.f16x2 r98, %89, r95; +} +{ +add.f16x2 r101, %107, %98; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %80, r104; +} +{ +sub.f16x2 r110, %59, %104; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %107, %98; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %80, r122; +} +{ +sub.f16x2 r128, %59, %104; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ 
+add.f16x2 r137, %59, %104; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %89, r140; +} +{ +sub.f16x2 r146, %107, %98; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %59, %104; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %89, r158; +} +{ +sub.f16x2 r164, %107, %98; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %105, %97; +} +{ +add.f16x2 r178, %79, r175; +} +{ +add.f16x2 r181, %58, %103; +} +{ +add.f16x2 r184, %88, r181; +} +{ +add.f16x2 r187, %105, %97; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %79, r190; +} +{ +sub.f16x2 r196, %58, %103; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %105, %97; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %79, r208; +} +{ +sub.f16x2 r214, %58, %103; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %58, %103; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %88, r226; +} +{ +sub.f16x2 r232, %105, %97; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %58, %103; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %88, r244; +} +{ +sub.f16x2 r250, %105, %97; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f542, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r259, {low, high}; +} +mov.f32 f544, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r260, {low, high}; +} +mov.f32 f554, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r261, {low, 
high}; +} +mov.f32 f556, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r262, {low, high}; +} +mov.f32 f578, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r265, {low, high}; +} +mov.f32 f580, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} 
+{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, 
r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %96, %84; +} +{ +add.f16x2 r602, %66, r599; +} +{ +add.f16x2 r605, %102, %94; +} +{ +add.f16x2 r608, %72, r605; +} +{ +add.f16x2 r611, %96, %84; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %66, r614; +} +{ +sub.f16x2 r620, %102, %94; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %96, %84; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %66, r632; +} +{ +sub.f16x2 r638, %102, %94; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %102, %94; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %72, r650; +} +{ +sub.f16x2 r656, %96, %84; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %102, %94; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %72, r668; +} +{ +sub.f16x2 r674, %96, %84; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, 
f670; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %95, %83; +} +{ +add.f16x2 r688, %65, r685; +} +{ +add.f16x2 r691, %101, %92; +} +{ +add.f16x2 r694, %71, r691; +} +{ +add.f16x2 r697, %95, %83; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %65, r700; +} +{ +sub.f16x2 r706, %101, %92; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %95, %83; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %65, r718; +} +{ +sub.f16x2 r724, %101, %92; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %101, %92; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %71, r736; +} +{ +sub.f16x2 r742, %95, %83; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %101, %92; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %71, r754; +} +{ +sub.f16x2 r760, %95, %83; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %93, %82; +} +{ +add.f16x2 r774, %64, r771; +} +{ +add.f16x2 r777, %100, %91; +} +{ +add.f16x2 r780, %70, r777; +} +{ +add.f16x2 r783, %93, %82; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %64, r786; +} +{ +sub.f16x2 r792, %100, %91; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %93, %82; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %64, r804; +} +{ +sub.f16x2 r810, %100, %91; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %100, %91; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %70, r822; +} +{ +sub.f16x2 r828, 
%93, %82; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %100, %91; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %70, r840; +} +{ +sub.f16x2 r846, %93, %82; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 
r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 
r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %78, %69; +} +{ +add.f16x2 r1198, %57, r1195; +} +{ +add.f16x2 r1201, %87, %74; +} +{ +add.f16x2 r1204, %63, r1201; +} +{ +add.f16x2 r1207, %78, %69; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %57, r1210; +} +{ +sub.f16x2 r1216, %87, %74; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, 
r1213, r1219; +} +{ +add.f16x2 r1225, %78, %69; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %57, r1228; +} +{ +sub.f16x2 r1234, %87, %74; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %87, %74; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %63, r1246; +} +{ +sub.f16x2 r1252, %78, %69; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %87, %74; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %63, r1264; +} +{ +sub.f16x2 r1270, %78, %69; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %77, %68; +} +{ +add.f16x2 r1284, %56, r1281; +} +{ +add.f16x2 r1287, %86, %75; +} +{ +add.f16x2 r1290, %62, r1287; +} +{ +add.f16x2 r1293, %77, %68; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %56, r1296; +} +{ +sub.f16x2 r1302, %86, %75; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %77, %68; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %56, r1314; +} +{ +sub.f16x2 r1320, %86, %75; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %86, %75; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %62, r1332; +} +{ +sub.f16x2 r1338, %77, %68; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %86, %75; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %62, r1350; +} +{ +sub.f16x2 r1356, %77, %68; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 
r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %76, %67; +} +{ +add.f16x2 r1370, %55, r1367; +} +{ +add.f16x2 r1373, %85, %73; +} +{ +add.f16x2 r1376, %61, r1373; +} +{ +add.f16x2 r1379, %76, %67; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %55, r1382; +} +{ +sub.f16x2 r1388, %85, %73; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %76, %67; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %55, r1400; +} +{ +sub.f16x2 r1406, %85, %73; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %85, %73; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %61, r1418; +} +{ +sub.f16x2 r1424, %76, %67; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %85, %73; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %61, r1436; +} +{ +sub.f16x2 r1442, %76, %67; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, 
r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; 
+mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; 
+} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f534, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r1789, {low, high}; +} +mov.f32 f536, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r1790, {low, high}; +} +mov.f32 f538, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r1791, {low, high}; +} +mov.f32 f540, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r1794, {low, high}; +} +mov.f32 f546, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r1795, {low, high}; +} +mov.f32 f548, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r1796, {low, high}; +} +mov.f32 f550, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r1797, {low, high}; +} +mov.f32 f552, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; 
+cvt.rn.f16.f32 high, f556; +mov.b32 r1800, {low, high}; +} +mov.f32 f558, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r1801, {low, high}; +} +mov.f32 f560, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r1802, {low, high}; +} +mov.f32 f562, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r1803, {low, high}; +} +mov.f32 f564, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r1804, {low, high}; +} +mov.f32 f570, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r1807, {low, high}; +} +mov.f32 f572, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r1808, {low, high}; +} +mov.f32 f594, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r1812, {low, high}; +} +mov.f32 f586, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r1815, {low, high}; +} +mov.f32 f588, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r1819, {low, high}; +} +mov.f32 f596, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ 
+mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ 
+mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, 
r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, 
r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, 
r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, 
r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, 
r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r6709, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r6710, rd3; +sub.s32 r6711, r6709, r6710; +shr.u32 r6712, r6711, 1; +add.s32 r6713, r6712, r6710; +shr.u32 r6714, r6713, 4; +mul.lo.s32 r6715, r6714, 27; +sub.s32 r6716, r6709, r6715; +mad.lo.s32 r6717, r6714, 2916, r6708; +cvt.rn.f32.u32 f673, r6716; +mul.f32 f674, f673, 0f3C0D3654; +cos.approx.f32 f309, f674; +sin.approx.f32 f675, f674; +neg.f32 f310, f675; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, 
r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; 
+cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, 
r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, 
r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, 
high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, 
r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r6718, r6716, 108, r6717; +st.shared.u32 [r6718], r2102; +st.shared.u32 [r6718+4], r2881; +st.shared.u32 [r6718+8], r2918; +st.shared.u32 [r6718+12], r2955; +st.shared.u32 [r6718+16], r2992; +st.shared.u32 [r6718+20], r3029; +st.shared.u32 [r6718+24], r3066; +st.shared.u32 [r6718+28], r3103; +st.shared.u32 [r6718+32], r3140; +st.shared.u32 [r6718+36], r3177; +st.shared.u32 [r6718+40], r3214; +st.shared.u32 [r6718+44], r3251; +st.shared.u32 [r6718+48], r3288; +st.shared.u32 [r6718+52], r3325; +st.shared.u32 [r6718+56], r3362; +st.shared.u32 [r6718+60], r3399; +st.shared.u32 [r6718+64], r3436; +st.shared.u32 [r6718+68], r3473; +st.shared.u32 [r6718+72], r3510; +st.shared.u32 [r6718+76], r3547; +st.shared.u32 [r6718+80], r3584; +st.shared.u32 [r6718+84], r3621; +st.shared.u32 [r6718+88], 
r3658; +st.shared.u32 [r6718+92], r3695; +st.shared.u32 [r6718+96], r3732; +st.shared.u32 [r6718+100], r3769; +st.shared.u32 [r6718+104], r3806; +barrier.sync 0; +mad.lo.s32 r6719, r6716, -104, r6718; +ld.shared.u32 r3842, [r6719]; +ld.shared.u32 r4438, [r6719+108]; +ld.shared.u32 r5034, [r6719+216]; +ld.shared.u32 r3928, [r6719+324]; +ld.shared.u32 r4524, [r6719+432]; +ld.shared.u32 r5120, [r6719+540]; +ld.shared.u32 r4014, [r6719+648]; +ld.shared.u32 r4610, [r6719+756]; +ld.shared.u32 r5206, [r6719+864]; +ld.shared.u32 r3839, [r6719+972]; +ld.shared.u32 r4435, [r6719+1080]; +ld.shared.u32 r5031, [r6719+1188]; +ld.shared.u32 r3925, [r6719+1296]; +ld.shared.u32 r4521, [r6719+1404]; +ld.shared.u32 r5117, [r6719+1512]; +ld.shared.u32 r4011, [r6719+1620]; +ld.shared.u32 r4607, [r6719+1728]; +ld.shared.u32 r5203, [r6719+1836]; +ld.shared.u32 r3840, [r6719+1944]; +ld.shared.u32 r4436, [r6719+2052]; +ld.shared.u32 r5032, [r6719+2160]; +ld.shared.u32 r3926, [r6719+2268]; +ld.shared.u32 r4522, [r6719+2376]; +ld.shared.u32 r5118, [r6719+2484]; +ld.shared.u32 r4012, [r6719+2592]; +ld.shared.u32 r4608, [r6719+2700]; +ld.shared.u32 r5204, [r6719+2808]; +barrier.sync 0; +st.shared.u32 [r6718], r2108; +st.shared.u32 [r6718+4], r2890; +st.shared.u32 [r6718+8], r2927; +st.shared.u32 [r6718+12], r2964; +st.shared.u32 [r6718+16], r3001; +st.shared.u32 [r6718+20], r3038; +st.shared.u32 [r6718+24], r3075; +st.shared.u32 [r6718+28], r3112; +st.shared.u32 [r6718+32], r3149; +st.shared.u32 [r6718+36], r3186; +st.shared.u32 [r6718+40], r3223; +st.shared.u32 [r6718+44], r3260; +st.shared.u32 [r6718+48], r3297; +st.shared.u32 [r6718+52], r3334; +st.shared.u32 [r6718+56], r3371; +st.shared.u32 [r6718+60], r3408; +st.shared.u32 [r6718+64], r3445; +st.shared.u32 [r6718+68], r3482; +st.shared.u32 [r6718+72], r3519; +st.shared.u32 [r6718+76], r3556; +st.shared.u32 [r6718+80], r3593; +st.shared.u32 [r6718+84], r3630; +st.shared.u32 [r6718+88], r3667; +st.shared.u32 [r6718+92], r3704; 
+st.shared.u32 [r6718+96], r3741; +st.shared.u32 [r6718+100], r3778; +st.shared.u32 [r6718+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r6719]; +ld.shared.u32 r4444, [r6719+108]; +ld.shared.u32 r5040, [r6719+216]; +ld.shared.u32 r3934, [r6719+324]; +ld.shared.u32 r4530, [r6719+432]; +ld.shared.u32 r5126, [r6719+540]; +ld.shared.u32 r4020, [r6719+648]; +ld.shared.u32 r4616, [r6719+756]; +ld.shared.u32 r5212, [r6719+864]; +ld.shared.u32 r3845, [r6719+972]; +ld.shared.u32 r4441, [r6719+1080]; +ld.shared.u32 r5037, [r6719+1188]; +ld.shared.u32 r3931, [r6719+1296]; +ld.shared.u32 r4527, [r6719+1404]; +ld.shared.u32 r5123, [r6719+1512]; +ld.shared.u32 r4017, [r6719+1620]; +ld.shared.u32 r4613, [r6719+1728]; +ld.shared.u32 r5209, [r6719+1836]; +ld.shared.u32 r3846, [r6719+1944]; +ld.shared.u32 r4442, [r6719+2052]; +ld.shared.u32 r5038, [r6719+2160]; +ld.shared.u32 r3932, [r6719+2268]; +ld.shared.u32 r4528, [r6719+2376]; +ld.shared.u32 r5124, [r6719+2484]; +ld.shared.u32 r4018, [r6719+2592]; +ld.shared.u32 r4614, [r6719+2700]; +ld.shared.u32 r5210, [r6719+2808]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 r3841, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 r3847, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 r3865, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 r3883, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ 
+add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 r3901, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 r3919, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 r3927, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 r3933, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 r3951, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 r3969, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 r3987, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 r4005, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 r4013, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; 
+} +{ +add.f16x2 r4019, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 r4037, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 r4055, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 r4073, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 r4091, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4101, {low, high}; +} +{ +mul.f16x2 r4110, r3951, r4094; +} +{ +mul.f16x2 r4113, r3987, r4095; +} +{ +sub.f16x2 r4116, r4110, r4113; +} +{ +mul.f16x2 r4119, r3951, r4095; +} +{ +fma.rn.f16x2 r4122, r3987, r4094, r4119; +} +{ +mul.f16x2 r4126, r4037, r4096; +} +{ +mul.f16x2 r4129, r4073, r4097; +} +{ +sub.f16x2 r4132, r4126, r4129; +} +{ +mul.f16x2 r4135, r4037, r4097; +} +{ +fma.rn.f16x2 
r4138, r4073, r4096, r4135; +} +{ +mul.f16x2 r4142, r3969, r4096; +} +{ +mul.f16x2 r4145, r4005, r4097; +} +{ +sub.f16x2 r4148, r4142, r4145; +} +{ +mul.f16x2 r4151, r3969, r4097; +} +{ +fma.rn.f16x2 r4154, r4005, r4096, r4151; +} +{ +mul.f16x2 r4158, r4055, r4100; +} +{ +mul.f16x2 r4161, r4091, r4101; +} +{ +sub.f16x2 r4164, r4158, r4161; +} +{ +mul.f16x2 r4167, r4055, r4101; +} +{ +fma.rn.f16x2 r4170, r4091, r4100, r4167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4175, {low, high}; +} +{ +add.f16x2 r4176, r3927, r4013; +} +{ +add.f16x2 r4179, r3841, r4176; +} +{ +add.f16x2 r4182, r3933, r4019; +} +{ +add.f16x2 r4185, r3847, r4182; +} +{ +add.f16x2 r4188, r3927, r4013; +} +{ +mul.f16x2 r4191, r4188, r4174; +} +{ +add.f16x2 r4194, r3841, r4191; +} +{ +sub.f16x2 r4197, r3933, r4019; +} +{ +mul.f16x2 r4200, r4197, r4175; +} +{ +add.f16x2 r4203, r4194, r4200; +} +{ +add.f16x2 r4206, r3927, r4013; +} +{ +mul.f16x2 r4209, r4206, r4174; +} +{ +add.f16x2 r4212, r3841, r4209; +} +{ +sub.f16x2 r4215, r3933, r4019; +} +{ +mul.f16x2 r4218, r4215, r4175; +} +{ +sub.f16x2 r4221, r4212, r4218; +} +{ +add.f16x2 r4224, r3933, r4019; +} +{ +mul.f16x2 r4227, r4224, r4174; +} +{ +add.f16x2 r4230, r3847, r4227; +} +{ +sub.f16x2 r4233, r3927, r4013; +} +{ +mul.f16x2 r4236, r4233, r4175; +} +{ +sub.f16x2 r4239, r4230, r4236; +} +{ +add.f16x2 r4242, r3933, r4019; +} +{ +mul.f16x2 r4245, r4242, r4174; +} +{ +add.f16x2 r4248, r3847, r4245; +} +{ +sub.f16x2 r4251, r3927, r4013; +} +{ +mul.f16x2 r4254, r4251, r4175; +} +{ +add.f16x2 r4257, r4248, r4254; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4261, {low, high}; +} +{ +add.f16x2 r4262, r4116, r4132; +} +{ +add.f16x2 
r4265, r3865, r4262; +} +{ +add.f16x2 r4268, r4122, r4138; +} +{ +add.f16x2 r4271, r3901, r4268; +} +{ +add.f16x2 r4274, r4116, r4132; +} +{ +mul.f16x2 r4277, r4274, r4260; +} +{ +add.f16x2 r4280, r3865, r4277; +} +{ +sub.f16x2 r4283, r4122, r4138; +} +{ +mul.f16x2 r4286, r4283, r4261; +} +{ +add.f16x2 r4289, r4280, r4286; +} +{ +add.f16x2 r4292, r4116, r4132; +} +{ +mul.f16x2 r4295, r4292, r4260; +} +{ +add.f16x2 r4298, r3865, r4295; +} +{ +sub.f16x2 r4301, r4122, r4138; +} +{ +mul.f16x2 r4304, r4301, r4261; +} +{ +sub.f16x2 r4307, r4298, r4304; +} +{ +add.f16x2 r4310, r4122, r4138; +} +{ +mul.f16x2 r4313, r4310, r4260; +} +{ +add.f16x2 r4316, r3901, r4313; +} +{ +sub.f16x2 r4319, r4116, r4132; +} +{ +mul.f16x2 r4322, r4319, r4261; +} +{ +sub.f16x2 r4325, r4316, r4322; +} +{ +add.f16x2 r4328, r4122, r4138; +} +{ +mul.f16x2 r4331, r4328, r4260; +} +{ +add.f16x2 r4334, r3901, r4331; +} +{ +sub.f16x2 r4337, r4116, r4132; +} +{ +mul.f16x2 r4340, r4337, r4261; +} +{ +add.f16x2 r4343, r4334, r4340; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4346, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4347, {low, high}; +} +{ +add.f16x2 r4348, r4148, r4164; +} +{ +add.f16x2 r4351, r3883, r4348; +} +{ +add.f16x2 r4354, r4154, r4170; +} +{ +add.f16x2 r4357, r3919, r4354; +} +{ +add.f16x2 r4360, r4148, r4164; +} +{ +mul.f16x2 r4363, r4360, r4346; +} +{ +add.f16x2 r4366, r3883, r4363; +} +{ +sub.f16x2 r4369, r4154, r4170; +} +{ +mul.f16x2 r4372, r4369, r4347; +} +{ +add.f16x2 r4375, r4366, r4372; +} +{ +add.f16x2 r4378, r4148, r4164; +} +{ +mul.f16x2 r4381, r4378, r4346; +} +{ +add.f16x2 r4384, r3883, r4381; +} +{ +sub.f16x2 r4387, r4154, r4170; +} +{ +mul.f16x2 r4390, r4387, r4347; +} +{ +sub.f16x2 r4393, r4384, r4390; +} +{ +add.f16x2 r4396, r4154, r4170; +} +{ +mul.f16x2 r4399, r4396, r4346; +} +{ +add.f16x2 r4402, r3919, r4399; +} +{ +sub.f16x2 r4405, r4148, r4164; +} +{ 
+mul.f16x2 r4408, r4405, r4347; +} +{ +sub.f16x2 r4411, r4402, r4408; +} +{ +add.f16x2 r4414, r4154, r4170; +} +{ +mul.f16x2 r4417, r4414, r4346; +} +{ +add.f16x2 r4420, r3919, r4417; +} +{ +sub.f16x2 r4423, r4148, r4164; +} +{ +mul.f16x2 r4426, r4423, r4347; +} +{ +add.f16x2 r4429, r4420, r4426; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4432, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4433, {low, high}; +} +{ +add.f16x2 r4434, r4435, r4436; +} +{ +add.f16x2 r4437, r4438, r4434; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 r4443, r4444, r4440; +} +{ +add.f16x2 r4446, r4435, r4436; +} +{ +mul.f16x2 r4449, r4446, r4432; +} +{ +add.f16x2 r4452, r4438, r4449; +} +{ +sub.f16x2 r4455, r4441, r4442; +} +{ +mul.f16x2 r4458, r4455, r4433; +} +{ +add.f16x2 r4461, r4452, r4458; +} +{ +add.f16x2 r4464, r4435, r4436; +} +{ +mul.f16x2 r4467, r4464, r4432; +} +{ +add.f16x2 r4470, r4438, r4467; +} +{ +sub.f16x2 r4473, r4441, r4442; +} +{ +mul.f16x2 r4476, r4473, r4433; +} +{ +sub.f16x2 r4479, r4470, r4476; +} +{ +add.f16x2 r4482, r4441, r4442; +} +{ +mul.f16x2 r4485, r4482, r4432; +} +{ +add.f16x2 r4488, r4444, r4485; +} +{ +sub.f16x2 r4491, r4435, r4436; +} +{ +mul.f16x2 r4494, r4491, r4433; +} +{ +sub.f16x2 r4497, r4488, r4494; +} +{ +add.f16x2 r4500, r4441, r4442; +} +{ +mul.f16x2 r4503, r4500, r4432; +} +{ +add.f16x2 r4506, r4444, r4503; +} +{ +sub.f16x2 r4509, r4435, r4436; +} +{ +mul.f16x2 r4512, r4509, r4433; +} +{ +add.f16x2 r4515, r4506, r4512; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4518, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4519, {low, high}; +} +{ +add.f16x2 r4520, r4521, r4522; +} +{ +add.f16x2 r4523, r4524, r4520; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 r4529, r4530, r4526; +} +{ +add.f16x2 r4532, r4521, r4522; 
+} +{ +mul.f16x2 r4535, r4532, r4518; +} +{ +add.f16x2 r4538, r4524, r4535; +} +{ +sub.f16x2 r4541, r4527, r4528; +} +{ +mul.f16x2 r4544, r4541, r4519; +} +{ +add.f16x2 r4547, r4538, r4544; +} +{ +add.f16x2 r4550, r4521, r4522; +} +{ +mul.f16x2 r4553, r4550, r4518; +} +{ +add.f16x2 r4556, r4524, r4553; +} +{ +sub.f16x2 r4559, r4527, r4528; +} +{ +mul.f16x2 r4562, r4559, r4519; +} +{ +sub.f16x2 r4565, r4556, r4562; +} +{ +add.f16x2 r4568, r4527, r4528; +} +{ +mul.f16x2 r4571, r4568, r4518; +} +{ +add.f16x2 r4574, r4530, r4571; +} +{ +sub.f16x2 r4577, r4521, r4522; +} +{ +mul.f16x2 r4580, r4577, r4519; +} +{ +sub.f16x2 r4583, r4574, r4580; +} +{ +add.f16x2 r4586, r4527, r4528; +} +{ +mul.f16x2 r4589, r4586, r4518; +} +{ +add.f16x2 r4592, r4530, r4589; +} +{ +sub.f16x2 r4595, r4521, r4522; +} +{ +mul.f16x2 r4598, r4595, r4519; +} +{ +add.f16x2 r4601, r4592, r4598; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4604, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4605, {low, high}; +} +{ +add.f16x2 r4606, r4607, r4608; +} +{ +add.f16x2 r4609, r4610, r4606; +} +{ +add.f16x2 r4612, r4613, r4614; +} +{ +add.f16x2 r4615, r4616, r4612; +} +{ +add.f16x2 r4618, r4607, r4608; +} +{ +mul.f16x2 r4621, r4618, r4604; +} +{ +add.f16x2 r4624, r4610, r4621; +} +{ +sub.f16x2 r4627, r4613, r4614; +} +{ +mul.f16x2 r4630, r4627, r4605; +} +{ +add.f16x2 r4633, r4624, r4630; +} +{ +add.f16x2 r4636, r4607, r4608; +} +{ +mul.f16x2 r4639, r4636, r4604; +} +{ +add.f16x2 r4642, r4610, r4639; +} +{ +sub.f16x2 r4645, r4613, r4614; +} +{ +mul.f16x2 r4648, r4645, r4605; +} +{ +sub.f16x2 r4651, r4642, r4648; +} +{ +add.f16x2 r4654, r4613, r4614; +} +{ +mul.f16x2 r4657, r4654, r4604; +} +{ +add.f16x2 r4660, r4616, r4657; +} +{ +sub.f16x2 r4663, r4607, r4608; +} +{ +mul.f16x2 r4666, r4663, r4605; +} +{ +sub.f16x2 r4669, r4660, r4666; +} +{ +add.f16x2 r4672, r4613, r4614; +} +{ +mul.f16x2 r4675, 
r4672, r4604; +} +{ +add.f16x2 r4678, r4616, r4675; +} +{ +sub.f16x2 r4681, r4607, r4608; +} +{ +mul.f16x2 r4684, r4681, r4605; +} +{ +add.f16x2 r4687, r4678, r4684; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r4690, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r4691, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r4692, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r4693, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r4696, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r4697, {low, high}; +} +{ +mul.f16x2 r4706, r4547, r4690; +} +{ +mul.f16x2 r4709, r4583, r4691; +} +{ +sub.f16x2 r4712, r4706, r4709; +} +{ +mul.f16x2 r4715, r4547, r4691; +} +{ +fma.rn.f16x2 r4718, r4583, r4690, r4715; +} +{ +mul.f16x2 r4722, r4633, r4692; +} +{ +mul.f16x2 r4725, r4669, r4693; +} +{ +sub.f16x2 r4728, r4722, r4725; +} +{ +mul.f16x2 r4731, r4633, r4693; +} +{ +fma.rn.f16x2 r4734, r4669, r4692, r4731; +} +{ +mul.f16x2 r4738, r4565, r4692; +} +{ +mul.f16x2 r4741, r4601, r4693; +} +{ +sub.f16x2 r4744, r4738, r4741; +} +{ +mul.f16x2 r4747, r4565, r4693; +} +{ +fma.rn.f16x2 r4750, r4601, r4692, r4747; +} +{ +mul.f16x2 r4754, r4651, r4696; +} +{ +mul.f16x2 r4757, r4687, r4697; +} +{ +sub.f16x2 r4760, r4754, r4757; +} +{ +mul.f16x2 r4763, r4651, r4697; +} +{ +fma.rn.f16x2 r4766, r4687, r4696, r4763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4770, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4771, {low, high}; +} +{ +add.f16x2 r4772, r4523, r4609; +} +{ +add.f16x2 r4775, r4437, r4772; +} +{ +add.f16x2 r4778, r4529, r4615; +} +{ +add.f16x2 r4781, 
r4443, r4778; +} +{ +add.f16x2 r4784, r4523, r4609; +} +{ +mul.f16x2 r4787, r4784, r4770; +} +{ +add.f16x2 r4790, r4437, r4787; +} +{ +sub.f16x2 r4793, r4529, r4615; +} +{ +mul.f16x2 r4796, r4793, r4771; +} +{ +add.f16x2 r4799, r4790, r4796; +} +{ +add.f16x2 r4802, r4523, r4609; +} +{ +mul.f16x2 r4805, r4802, r4770; +} +{ +add.f16x2 r4808, r4437, r4805; +} +{ +sub.f16x2 r4811, r4529, r4615; +} +{ +mul.f16x2 r4814, r4811, r4771; +} +{ +sub.f16x2 r4817, r4808, r4814; +} +{ +add.f16x2 r4820, r4529, r4615; +} +{ +mul.f16x2 r4823, r4820, r4770; +} +{ +add.f16x2 r4826, r4443, r4823; +} +{ +sub.f16x2 r4829, r4523, r4609; +} +{ +mul.f16x2 r4832, r4829, r4771; +} +{ +sub.f16x2 r4835, r4826, r4832; +} +{ +add.f16x2 r4838, r4529, r4615; +} +{ +mul.f16x2 r4841, r4838, r4770; +} +{ +add.f16x2 r4844, r4443, r4841; +} +{ +sub.f16x2 r4847, r4523, r4609; +} +{ +mul.f16x2 r4850, r4847, r4771; +} +{ +add.f16x2 r4853, r4844, r4850; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4857, {low, high}; +} +{ +add.f16x2 r4858, r4712, r4728; +} +{ +add.f16x2 r4861, r4461, r4858; +} +{ +add.f16x2 r4864, r4718, r4734; +} +{ +add.f16x2 r4867, r4497, r4864; +} +{ +add.f16x2 r4870, r4712, r4728; +} +{ +mul.f16x2 r4873, r4870, r4856; +} +{ +add.f16x2 r4876, r4461, r4873; +} +{ +sub.f16x2 r4879, r4718, r4734; +} +{ +mul.f16x2 r4882, r4879, r4857; +} +{ +add.f16x2 r4885, r4876, r4882; +} +{ +add.f16x2 r4888, r4712, r4728; +} +{ +mul.f16x2 r4891, r4888, r4856; +} +{ +add.f16x2 r4894, r4461, r4891; +} +{ +sub.f16x2 r4897, r4718, r4734; +} +{ +mul.f16x2 r4900, r4897, r4857; +} +{ +sub.f16x2 r4903, r4894, r4900; +} +{ +add.f16x2 r4906, r4718, r4734; +} +{ +mul.f16x2 r4909, r4906, r4856; +} +{ +add.f16x2 r4912, r4497, r4909; +} +{ +sub.f16x2 r4915, r4712, r4728; +} +{ +mul.f16x2 r4918, r4915, r4857; +} +{ +sub.f16x2 r4921, r4912, r4918; +} +{ 
+add.f16x2 r4924, r4718, r4734; +} +{ +mul.f16x2 r4927, r4924, r4856; +} +{ +add.f16x2 r4930, r4497, r4927; +} +{ +sub.f16x2 r4933, r4712, r4728; +} +{ +mul.f16x2 r4936, r4933, r4857; +} +{ +add.f16x2 r4939, r4930, r4936; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r4942, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4943, {low, high}; +} +{ +add.f16x2 r4944, r4744, r4760; +} +{ +add.f16x2 r4947, r4479, r4944; +} +{ +add.f16x2 r4950, r4750, r4766; +} +{ +add.f16x2 r4953, r4515, r4950; +} +{ +add.f16x2 r4956, r4744, r4760; +} +{ +mul.f16x2 r4959, r4956, r4942; +} +{ +add.f16x2 r4962, r4479, r4959; +} +{ +sub.f16x2 r4965, r4750, r4766; +} +{ +mul.f16x2 r4968, r4965, r4943; +} +{ +add.f16x2 r4971, r4962, r4968; +} +{ +add.f16x2 r4974, r4744, r4760; +} +{ +mul.f16x2 r4977, r4974, r4942; +} +{ +add.f16x2 r4980, r4479, r4977; +} +{ +sub.f16x2 r4983, r4750, r4766; +} +{ +mul.f16x2 r4986, r4983, r4943; +} +{ +sub.f16x2 r4989, r4980, r4986; +} +{ +add.f16x2 r4992, r4750, r4766; +} +{ +mul.f16x2 r4995, r4992, r4942; +} +{ +add.f16x2 r4998, r4515, r4995; +} +{ +sub.f16x2 r5001, r4744, r4760; +} +{ +mul.f16x2 r5004, r5001, r4943; +} +{ +sub.f16x2 r5007, r4998, r5004; +} +{ +add.f16x2 r5010, r4750, r4766; +} +{ +mul.f16x2 r5013, r5010, r4942; +} +{ +add.f16x2 r5016, r4515, r5013; +} +{ +sub.f16x2 r5019, r4744, r4760; +} +{ +mul.f16x2 r5022, r5019, r4943; +} +{ +add.f16x2 r5025, r5016, r5022; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5028, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5029, {low, high}; +} +{ +add.f16x2 r5030, r5031, r5032; +} +{ +add.f16x2 r5033, r5034, r5030; +} +{ +add.f16x2 r5036, r5037, r5038; +} +{ +add.f16x2 r5039, r5040, r5036; +} +{ +add.f16x2 r5042, r5031, r5032; +} +{ +mul.f16x2 r5045, r5042, r5028; +} +{ +add.f16x2 r5048, r5034, r5045; 
+} +{ +sub.f16x2 r5051, r5037, r5038; +} +{ +mul.f16x2 r5054, r5051, r5029; +} +{ +add.f16x2 r5057, r5048, r5054; +} +{ +add.f16x2 r5060, r5031, r5032; +} +{ +mul.f16x2 r5063, r5060, r5028; +} +{ +add.f16x2 r5066, r5034, r5063; +} +{ +sub.f16x2 r5069, r5037, r5038; +} +{ +mul.f16x2 r5072, r5069, r5029; +} +{ +sub.f16x2 r5075, r5066, r5072; +} +{ +add.f16x2 r5078, r5037, r5038; +} +{ +mul.f16x2 r5081, r5078, r5028; +} +{ +add.f16x2 r5084, r5040, r5081; +} +{ +sub.f16x2 r5087, r5031, r5032; +} +{ +mul.f16x2 r5090, r5087, r5029; +} +{ +sub.f16x2 r5093, r5084, r5090; +} +{ +add.f16x2 r5096, r5037, r5038; +} +{ +mul.f16x2 r5099, r5096, r5028; +} +{ +add.f16x2 r5102, r5040, r5099; +} +{ +sub.f16x2 r5105, r5031, r5032; +} +{ +mul.f16x2 r5108, r5105, r5029; +} +{ +add.f16x2 r5111, r5102, r5108; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5115, {low, high}; +} +{ +add.f16x2 r5116, r5117, r5118; +} +{ +add.f16x2 r5119, r5120, r5116; +} +{ +add.f16x2 r5122, r5123, r5124; +} +{ +add.f16x2 r5125, r5126, r5122; +} +{ +add.f16x2 r5128, r5117, r5118; +} +{ +mul.f16x2 r5131, r5128, r5114; +} +{ +add.f16x2 r5134, r5120, r5131; +} +{ +sub.f16x2 r5137, r5123, r5124; +} +{ +mul.f16x2 r5140, r5137, r5115; +} +{ +add.f16x2 r5143, r5134, r5140; +} +{ +add.f16x2 r5146, r5117, r5118; +} +{ +mul.f16x2 r5149, r5146, r5114; +} +{ +add.f16x2 r5152, r5120, r5149; +} +{ +sub.f16x2 r5155, r5123, r5124; +} +{ +mul.f16x2 r5158, r5155, r5115; +} +{ +sub.f16x2 r5161, r5152, r5158; +} +{ +add.f16x2 r5164, r5123, r5124; +} +{ +mul.f16x2 r5167, r5164, r5114; +} +{ +add.f16x2 r5170, r5126, r5167; +} +{ +sub.f16x2 r5173, r5117, r5118; +} +{ +mul.f16x2 r5176, r5173, r5115; +} +{ +sub.f16x2 r5179, r5170, r5176; +} +{ +add.f16x2 r5182, r5123, r5124; +} +{ +mul.f16x2 r5185, r5182, r5114; +} +{ +add.f16x2 r5188, r5126, r5185; +} +{ +sub.f16x2 r5191, 
r5117, r5118; +} +{ +mul.f16x2 r5194, r5191, r5115; +} +{ +add.f16x2 r5197, r5188, r5194; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5200, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5201, {low, high}; +} +{ +add.f16x2 r5202, r5203, r5204; +} +{ +add.f16x2 r5205, r5206, r5202; +} +{ +add.f16x2 r5208, r5209, r5210; +} +{ +add.f16x2 r5211, r5212, r5208; +} +{ +add.f16x2 r5214, r5203, r5204; +} +{ +mul.f16x2 r5217, r5214, r5200; +} +{ +add.f16x2 r5220, r5206, r5217; +} +{ +sub.f16x2 r5223, r5209, r5210; +} +{ +mul.f16x2 r5226, r5223, r5201; +} +{ +add.f16x2 r5229, r5220, r5226; +} +{ +add.f16x2 r5232, r5203, r5204; +} +{ +mul.f16x2 r5235, r5232, r5200; +} +{ +add.f16x2 r5238, r5206, r5235; +} +{ +sub.f16x2 r5241, r5209, r5210; +} +{ +mul.f16x2 r5244, r5241, r5201; +} +{ +sub.f16x2 r5247, r5238, r5244; +} +{ +add.f16x2 r5250, r5209, r5210; +} +{ +mul.f16x2 r5253, r5250, r5200; +} +{ +add.f16x2 r5256, r5212, r5253; +} +{ +sub.f16x2 r5259, r5203, r5204; +} +{ +mul.f16x2 r5262, r5259, r5201; +} +{ +sub.f16x2 r5265, r5256, r5262; +} +{ +add.f16x2 r5268, r5209, r5210; +} +{ +mul.f16x2 r5271, r5268, r5200; +} +{ +add.f16x2 r5274, r5212, r5271; +} +{ +sub.f16x2 r5277, r5203, r5204; +} +{ +mul.f16x2 r5280, r5277, r5201; +} +{ +add.f16x2 r5283, r5274, r5280; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5286, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5287, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5288, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5289, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5292, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; 
+cvt.rn.f16.f32 high, f580; +mov.b32 r5293, {low, high}; +} +{ +mul.f16x2 r5302, r5143, r5286; +} +{ +mul.f16x2 r5305, r5179, r5287; +} +{ +sub.f16x2 r5308, r5302, r5305; +} +{ +mul.f16x2 r5311, r5143, r5287; +} +{ +fma.rn.f16x2 r5314, r5179, r5286, r5311; +} +{ +mul.f16x2 r5318, r5229, r5288; +} +{ +mul.f16x2 r5321, r5265, r5289; +} +{ +sub.f16x2 r5324, r5318, r5321; +} +{ +mul.f16x2 r5327, r5229, r5289; +} +{ +fma.rn.f16x2 r5330, r5265, r5288, r5327; +} +{ +mul.f16x2 r5334, r5161, r5288; +} +{ +mul.f16x2 r5337, r5197, r5289; +} +{ +sub.f16x2 r5340, r5334, r5337; +} +{ +mul.f16x2 r5343, r5161, r5289; +} +{ +fma.rn.f16x2 r5346, r5197, r5288, r5343; +} +{ +mul.f16x2 r5350, r5247, r5292; +} +{ +mul.f16x2 r5353, r5283, r5293; +} +{ +sub.f16x2 r5356, r5350, r5353; +} +{ +mul.f16x2 r5359, r5247, r5293; +} +{ +fma.rn.f16x2 r5362, r5283, r5292, r5359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5366, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5367, {low, high}; +} +{ +add.f16x2 r5368, r5119, r5205; +} +{ +add.f16x2 r5371, r5033, r5368; +} +{ +add.f16x2 r5374, r5125, r5211; +} +{ +add.f16x2 r5377, r5039, r5374; +} +{ +add.f16x2 r5380, r5119, r5205; +} +{ +mul.f16x2 r5383, r5380, r5366; +} +{ +add.f16x2 r5386, r5033, r5383; +} +{ +sub.f16x2 r5389, r5125, r5211; +} +{ +mul.f16x2 r5392, r5389, r5367; +} +{ +add.f16x2 r5395, r5386, r5392; +} +{ +add.f16x2 r5398, r5119, r5205; +} +{ +mul.f16x2 r5401, r5398, r5366; +} +{ +add.f16x2 r5404, r5033, r5401; +} +{ +sub.f16x2 r5407, r5125, r5211; +} +{ +mul.f16x2 r5410, r5407, r5367; +} +{ +sub.f16x2 r5413, r5404, r5410; +} +{ +add.f16x2 r5416, r5125, r5211; +} +{ +mul.f16x2 r5419, r5416, r5366; +} +{ +add.f16x2 r5422, r5039, r5419; +} +{ +sub.f16x2 r5425, r5119, r5205; +} +{ +mul.f16x2 r5428, r5425, r5367; +} +{ +sub.f16x2 r5431, r5422, r5428; +} +{ +add.f16x2 r5434, r5125, r5211; +} +{ +mul.f16x2 r5437, r5434, r5366; +} +{ 
+add.f16x2 r5440, r5039, r5437; +} +{ +sub.f16x2 r5443, r5119, r5205; +} +{ +mul.f16x2 r5446, r5443, r5367; +} +{ +add.f16x2 r5449, r5440, r5446; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5453, {low, high}; +} +{ +add.f16x2 r5454, r5308, r5324; +} +{ +add.f16x2 r5457, r5057, r5454; +} +{ +add.f16x2 r5460, r5314, r5330; +} +{ +add.f16x2 r5463, r5093, r5460; +} +{ +add.f16x2 r5466, r5308, r5324; +} +{ +mul.f16x2 r5469, r5466, r5452; +} +{ +add.f16x2 r5472, r5057, r5469; +} +{ +sub.f16x2 r5475, r5314, r5330; +} +{ +mul.f16x2 r5478, r5475, r5453; +} +{ +add.f16x2 r5481, r5472, r5478; +} +{ +add.f16x2 r5484, r5308, r5324; +} +{ +mul.f16x2 r5487, r5484, r5452; +} +{ +add.f16x2 r5490, r5057, r5487; +} +{ +sub.f16x2 r5493, r5314, r5330; +} +{ +mul.f16x2 r5496, r5493, r5453; +} +{ +sub.f16x2 r5499, r5490, r5496; +} +{ +add.f16x2 r5502, r5314, r5330; +} +{ +mul.f16x2 r5505, r5502, r5452; +} +{ +add.f16x2 r5508, r5093, r5505; +} +{ +sub.f16x2 r5511, r5308, r5324; +} +{ +mul.f16x2 r5514, r5511, r5453; +} +{ +sub.f16x2 r5517, r5508, r5514; +} +{ +add.f16x2 r5520, r5314, r5330; +} +{ +mul.f16x2 r5523, r5520, r5452; +} +{ +add.f16x2 r5526, r5093, r5523; +} +{ +sub.f16x2 r5529, r5308, r5324; +} +{ +mul.f16x2 r5532, r5529, r5453; +} +{ +add.f16x2 r5535, r5526, r5532; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5538, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5539, {low, high}; +} +{ +add.f16x2 r5540, r5340, r5356; +} +{ +add.f16x2 r5543, r5075, r5540; +} +{ +add.f16x2 r5546, r5346, r5362; +} +{ +add.f16x2 r5549, r5111, r5546; +} +{ +add.f16x2 r5552, r5340, r5356; +} +{ +mul.f16x2 r5555, r5552, r5538; +} +{ +add.f16x2 r5558, r5075, r5555; +} +{ +sub.f16x2 r5561, r5346, r5362; +} +{ +mul.f16x2 r5564, r5561, r5539; 
+} +{ +add.f16x2 r5567, r5558, r5564; +} +{ +add.f16x2 r5570, r5340, r5356; +} +{ +mul.f16x2 r5573, r5570, r5538; +} +{ +add.f16x2 r5576, r5075, r5573; +} +{ +sub.f16x2 r5579, r5346, r5362; +} +{ +mul.f16x2 r5582, r5579, r5539; +} +{ +sub.f16x2 r5585, r5576, r5582; +} +{ +add.f16x2 r5588, r5346, r5362; +} +{ +mul.f16x2 r5591, r5588, r5538; +} +{ +add.f16x2 r5594, r5111, r5591; +} +{ +sub.f16x2 r5597, r5340, r5356; +} +{ +mul.f16x2 r5600, r5597, r5539; +} +{ +sub.f16x2 r5603, r5594, r5600; +} +{ +add.f16x2 r5606, r5346, r5362; +} +{ +mul.f16x2 r5609, r5606, r5538; +} +{ +add.f16x2 r5612, r5111, r5609; +} +{ +sub.f16x2 r5615, r5340, r5356; +} +{ +mul.f16x2 r5618, r5615, r5539; +} +{ +add.f16x2 r5621, r5612, r5618; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f534; +cvt.rn.f16.f32 high, f534; +mov.b32 r5624, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f536; +cvt.rn.f16.f32 high, f536; +mov.b32 r5625, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f538; +cvt.rn.f16.f32 high, f538; +mov.b32 r5626, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f540; +cvt.rn.f16.f32 high, f540; +mov.b32 r5627, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f542; +cvt.rn.f16.f32 high, f542; +mov.b32 r5628, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f544; +cvt.rn.f16.f32 high, f544; +mov.b32 r5629, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f546; +cvt.rn.f16.f32 high, f546; +mov.b32 r5630, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f548; +cvt.rn.f16.f32 high, f548; +mov.b32 r5631, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f550; +cvt.rn.f16.f32 high, f550; +mov.b32 r5632, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f552; +cvt.rn.f16.f32 high, f552; +mov.b32 r5633, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f554; +cvt.rn.f16.f32 high, f554; +mov.b32 r5634, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f556; +cvt.rn.f16.f32 high, f556; +mov.b32 r5635, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f558; +cvt.rn.f16.f32 high, f558; +mov.b32 r5636, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f560; +cvt.rn.f16.f32 high, f560; +mov.b32 r5637, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f562; +cvt.rn.f16.f32 high, f562; +mov.b32 r5638, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f564; +cvt.rn.f16.f32 high, f564; +mov.b32 r5639, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f570; +cvt.rn.f16.f32 high, f570; +mov.b32 r5642, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f572; +cvt.rn.f16.f32 high, f572; +mov.b32 r5643, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f578; +cvt.rn.f16.f32 high, f578; +mov.b32 r5646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f580; +cvt.rn.f16.f32 high, f580; +mov.b32 r5647, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f586; +cvt.rn.f16.f32 high, f586; +mov.b32 r5650, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f588; +cvt.rn.f16.f32 high, f588; +mov.b32 r5651, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f594; +cvt.rn.f16.f32 high, f594; +mov.b32 r5654, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f596; +cvt.rn.f16.f32 high, f596; +mov.b32 r5655, {low, high}; +} +{ +mul.f16x2 r5676, r4861, r5624; +} +{ +mul.f16x2 r5679, r4867, r5625; +} +{ +sub.f16x2 r5682, r5676, r5679; +} +{ +mul.f16x2 r5685, r4861, r5625; +} +{ +fma.rn.f16x2 r5688, r4867, r5624, r5685; +} +{ +mul.f16x2 r5692, r5457, r5626; +} +{ +mul.f16x2 r5695, r5463, r5627; +} +{ +sub.f16x2 r5698, r5692, r5695; +} +{ +mul.f16x2 r5701, r5457, r5627; +} +{ +fma.rn.f16x2 r5704, r5463, r5626, r5701; +} +{ +mul.f16x2 r5708, r4947, r5626; +} +{ +mul.f16x2 r5711, r4953, r5627; +} +{ +sub.f16x2 r5714, r5708, r5711; +} +{ +mul.f16x2 r5717, r4947, r5627; +} +{ +fma.rn.f16x2 r5720, r4953, 
r5626, r5717; +} +{ +mul.f16x2 r5724, r5543, r5630; +} +{ +mul.f16x2 r5727, r5549, r5631; +} +{ +sub.f16x2 r5730, r5724, r5727; +} +{ +mul.f16x2 r5733, r5543, r5631; +} +{ +fma.rn.f16x2 r5736, r5549, r5630, r5733; +} +{ +mul.f16x2 r5740, r4799, r5628; +} +{ +mul.f16x2 r5743, r4835, r5629; +} +{ +sub.f16x2 r5746, r5740, r5743; +} +{ +mul.f16x2 r5749, r4799, r5629; +} +{ +fma.rn.f16x2 r5752, r4835, r5628, r5749; +} +{ +mul.f16x2 r5756, r5395, r5634; +} +{ +mul.f16x2 r5759, r5431, r5635; +} +{ +sub.f16x2 r5762, r5756, r5759; +} +{ +mul.f16x2 r5765, r5395, r5635; +} +{ +fma.rn.f16x2 r5768, r5431, r5634, r5765; +} +{ +mul.f16x2 r5772, r4885, r5630; +} +{ +mul.f16x2 r5775, r4921, r5631; +} +{ +sub.f16x2 r5778, r5772, r5775; +} +{ +mul.f16x2 r5781, r4885, r5631; +} +{ +fma.rn.f16x2 r5784, r4921, r5630, r5781; +} +{ +mul.f16x2 r5788, r5481, r5638; +} +{ +mul.f16x2 r5791, r5517, r5639; +} +{ +sub.f16x2 r5794, r5788, r5791; +} +{ +mul.f16x2 r5797, r5481, r5639; +} +{ +fma.rn.f16x2 r5800, r5517, r5638, r5797; +} +{ +mul.f16x2 r5804, r4971, r5632; +} +{ +mul.f16x2 r5807, r5007, r5633; +} +{ +sub.f16x2 r5810, r5804, r5807; +} +{ +mul.f16x2 r5813, r4971, r5633; +} +{ +fma.rn.f16x2 r5816, r5007, r5632, r5813; +} +{ +mul.f16x2 r5820, r5567, r5642; +} +{ +mul.f16x2 r5823, r5603, r5643; +} +{ +sub.f16x2 r5826, r5820, r5823; +} +{ +mul.f16x2 r5829, r5567, r5643; +} +{ +fma.rn.f16x2 r5832, r5603, r5642, r5829; +} +{ +mul.f16x2 r5836, r4817, r5634; +} +{ +mul.f16x2 r5839, r4853, r5635; +} +{ +sub.f16x2 r5842, r5836, r5839; +} +{ +mul.f16x2 r5845, r4817, r5635; +} +{ +fma.rn.f16x2 r5848, r4853, r5634, r5845; +} +{ +mul.f16x2 r5852, r5413, r5646; +} +{ +mul.f16x2 r5855, r5449, r5647; +} +{ +sub.f16x2 r5858, r5852, r5855; +} +{ +mul.f16x2 r5861, r5413, r5647; +} +{ +fma.rn.f16x2 r5864, r5449, r5646, r5861; +} +{ +mul.f16x2 r5868, r4903, r5636; +} +{ +mul.f16x2 r5871, r4939, r5637; +} +{ +sub.f16x2 r5874, r5868, r5871; +} +{ +mul.f16x2 r5877, r4903, r5637; +} +{ +fma.rn.f16x2 r5880, r4939, 
r5636, r5877; +} +{ +mul.f16x2 r5884, r5499, r5650; +} +{ +mul.f16x2 r5887, r5535, r5651; +} +{ +sub.f16x2 r5890, r5884, r5887; +} +{ +mul.f16x2 r5893, r5499, r5651; +} +{ +fma.rn.f16x2 r5896, r5535, r5650, r5893; +} +{ +mul.f16x2 r5900, r4989, r5638; +} +{ +mul.f16x2 r5903, r5025, r5639; +} +{ +sub.f16x2 r5906, r5900, r5903; +} +{ +mul.f16x2 r5909, r4989, r5639; +} +{ +fma.rn.f16x2 r5912, r5025, r5638, r5909; +} +{ +mul.f16x2 r5916, r5585, r5654; +} +{ +mul.f16x2 r5919, r5621, r5655; +} +{ +sub.f16x2 r5922, r5916, r5919; +} +{ +mul.f16x2 r5925, r5585, r5655; +} +{ +fma.rn.f16x2 r5928, r5621, r5654, r5925; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r5932, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r5933, {low, high}; +} +{ +add.f16x2 r5934, r4775, r5371; +} +{ +add.f16x2 %0, r4179, r5934; +} +{ +add.f16x2 r5940, r4781, r5377; +} +{ +add.f16x2 %1, r4185, r5940; +} +{ +add.f16x2 r5946, r4775, r5371; +} +{ +mul.f16x2 r5949, r5946, r5932; +} +{ +add.f16x2 r5952, r4179, r5949; +} +{ +sub.f16x2 r5955, r4781, r5377; +} +{ +mul.f16x2 r5958, r5955, r5933; +} +{ +add.f16x2 %18, r5952, r5958; +} +{ +add.f16x2 r5964, r4775, r5371; +} +{ +mul.f16x2 r5967, r5964, r5932; +} +{ +add.f16x2 r5970, r4179, r5967; +} +{ +sub.f16x2 r5973, r4781, r5377; +} +{ +mul.f16x2 r5976, r5973, r5933; +} +{ +sub.f16x2 %36, r5970, r5976; +} +{ +add.f16x2 r5982, r4781, r5377; +} +{ +mul.f16x2 r5985, r5982, r5932; +} +{ +add.f16x2 r5988, r4185, r5985; +} +{ +sub.f16x2 r5991, r4775, r5371; +} +{ +mul.f16x2 r5994, r5991, r5933; +} +{ +sub.f16x2 %19, r5988, r5994; +} +{ +add.f16x2 r6000, r4781, r5377; +} +{ +mul.f16x2 r6003, r6000, r5932; +} +{ +add.f16x2 r6006, r4185, r6003; +} +{ +sub.f16x2 r6009, r4775, r5371; +} +{ +mul.f16x2 r6012, r6009, r5933; +} +{ +add.f16x2 %37, r6006, r6012; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6018, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6019, {low, high}; +} +{ +add.f16x2 r6020, r5682, r5698; +} +{ +add.f16x2 %2, r4265, r6020; +} +{ +add.f16x2 r6026, r5688, r5704; +} +{ +add.f16x2 %3, r4271, r6026; +} +{ +add.f16x2 r6032, r5682, r5698; +} +{ +mul.f16x2 r6035, r6032, r6018; +} +{ +add.f16x2 r6038, r4265, r6035; +} +{ +sub.f16x2 r6041, r5688, r5704; +} +{ +mul.f16x2 r6044, r6041, r6019; +} +{ +add.f16x2 %20, r6038, r6044; +} +{ +add.f16x2 r6050, r5682, r5698; +} +{ +mul.f16x2 r6053, r6050, r6018; +} +{ +add.f16x2 r6056, r4265, r6053; +} +{ +sub.f16x2 r6059, r5688, r5704; +} +{ +mul.f16x2 r6062, r6059, r6019; +} +{ +sub.f16x2 %38, r6056, r6062; +} +{ +add.f16x2 r6068, r5688, r5704; +} +{ +mul.f16x2 r6071, r6068, r6018; +} +{ +add.f16x2 r6074, r4271, r6071; +} +{ +sub.f16x2 r6077, r5682, r5698; +} +{ +mul.f16x2 r6080, r6077, r6019; +} +{ +sub.f16x2 %21, r6074, r6080; +} +{ +add.f16x2 r6086, r5688, r5704; +} +{ +mul.f16x2 r6089, r6086, r6018; +} +{ +add.f16x2 r6092, r4271, r6089; +} +{ +sub.f16x2 r6095, r5682, r5698; +} +{ +mul.f16x2 r6098, r6095, r6019; +} +{ +add.f16x2 %39, r6092, r6098; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6105, {low, high}; +} +{ +add.f16x2 r6106, r5714, r5730; +} +{ +add.f16x2 %4, r4351, r6106; +} +{ +add.f16x2 r6112, r5720, r5736; +} +{ +add.f16x2 %5, r4357, r6112; +} +{ +add.f16x2 r6118, r5714, r5730; +} +{ +mul.f16x2 r6121, r6118, r6104; +} +{ +add.f16x2 r6124, r4351, r6121; +} +{ +sub.f16x2 r6127, r5720, r5736; +} +{ +mul.f16x2 r6130, r6127, r6105; +} +{ +add.f16x2 %22, r6124, r6130; +} +{ +add.f16x2 r6136, r5714, r5730; +} +{ +mul.f16x2 r6139, r6136, r6104; +} +{ +add.f16x2 r6142, r4351, r6139; +} +{ +sub.f16x2 r6145, r5720, r5736; +} +{ +mul.f16x2 r6148, r6145, r6105; +} +{ +sub.f16x2 %40, r6142, r6148; +} +{ 
+add.f16x2 r6154, r5720, r5736; +} +{ +mul.f16x2 r6157, r6154, r6104; +} +{ +add.f16x2 r6160, r4357, r6157; +} +{ +sub.f16x2 r6163, r5714, r5730; +} +{ +mul.f16x2 r6166, r6163, r6105; +} +{ +sub.f16x2 %23, r6160, r6166; +} +{ +add.f16x2 r6172, r5720, r5736; +} +{ +mul.f16x2 r6175, r6172, r6104; +} +{ +add.f16x2 r6178, r4357, r6175; +} +{ +sub.f16x2 r6181, r5714, r5730; +} +{ +mul.f16x2 r6184, r6181, r6105; +} +{ +add.f16x2 %41, r6178, r6184; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6190, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6191, {low, high}; +} +{ +add.f16x2 r6192, r5746, r5762; +} +{ +add.f16x2 %6, r4203, r6192; +} +{ +add.f16x2 r6198, r5752, r5768; +} +{ +add.f16x2 %7, r4239, r6198; +} +{ +add.f16x2 r6204, r5746, r5762; +} +{ +mul.f16x2 r6207, r6204, r6190; +} +{ +add.f16x2 r6210, r4203, r6207; +} +{ +sub.f16x2 r6213, r5752, r5768; +} +{ +mul.f16x2 r6216, r6213, r6191; +} +{ +add.f16x2 %24, r6210, r6216; +} +{ +add.f16x2 r6222, r5746, r5762; +} +{ +mul.f16x2 r6225, r6222, r6190; +} +{ +add.f16x2 r6228, r4203, r6225; +} +{ +sub.f16x2 r6231, r5752, r5768; +} +{ +mul.f16x2 r6234, r6231, r6191; +} +{ +sub.f16x2 %42, r6228, r6234; +} +{ +add.f16x2 r6240, r5752, r5768; +} +{ +mul.f16x2 r6243, r6240, r6190; +} +{ +add.f16x2 r6246, r4239, r6243; +} +{ +sub.f16x2 r6249, r5746, r5762; +} +{ +mul.f16x2 r6252, r6249, r6191; +} +{ +sub.f16x2 %25, r6246, r6252; +} +{ +add.f16x2 r6258, r5752, r5768; +} +{ +mul.f16x2 r6261, r6258, r6190; +} +{ +add.f16x2 r6264, r4239, r6261; +} +{ +sub.f16x2 r6267, r5746, r5762; +} +{ +mul.f16x2 r6270, r6267, r6191; +} +{ +add.f16x2 %43, r6264, r6270; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6277, {low, high}; +} +{ +add.f16x2 r6278, r5778, r5794; +} +{ +add.f16x2 
%8, r4289, r6278; +} +{ +add.f16x2 r6284, r5784, r5800; +} +{ +add.f16x2 %9, r4325, r6284; +} +{ +add.f16x2 r6290, r5778, r5794; +} +{ +mul.f16x2 r6293, r6290, r6276; +} +{ +add.f16x2 r6296, r4289, r6293; +} +{ +sub.f16x2 r6299, r5784, r5800; +} +{ +mul.f16x2 r6302, r6299, r6277; +} +{ +add.f16x2 %26, r6296, r6302; +} +{ +add.f16x2 r6308, r5778, r5794; +} +{ +mul.f16x2 r6311, r6308, r6276; +} +{ +add.f16x2 r6314, r4289, r6311; +} +{ +sub.f16x2 r6317, r5784, r5800; +} +{ +mul.f16x2 r6320, r6317, r6277; +} +{ +sub.f16x2 %44, r6314, r6320; +} +{ +add.f16x2 r6326, r5784, r5800; +} +{ +mul.f16x2 r6329, r6326, r6276; +} +{ +add.f16x2 r6332, r4325, r6329; +} +{ +sub.f16x2 r6335, r5778, r5794; +} +{ +mul.f16x2 r6338, r6335, r6277; +} +{ +sub.f16x2 %27, r6332, r6338; +} +{ +add.f16x2 r6344, r5784, r5800; +} +{ +mul.f16x2 r6347, r6344, r6276; +} +{ +add.f16x2 r6350, r4325, r6347; +} +{ +sub.f16x2 r6353, r5778, r5794; +} +{ +mul.f16x2 r6356, r6353, r6277; +} +{ +add.f16x2 %45, r6350, r6356; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6362, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6363, {low, high}; +} +{ +add.f16x2 r6364, r5810, r5826; +} +{ +add.f16x2 %10, r4375, r6364; +} +{ +add.f16x2 r6370, r5816, r5832; +} +{ +add.f16x2 %11, r4411, r6370; +} +{ +add.f16x2 r6376, r5810, r5826; +} +{ +mul.f16x2 r6379, r6376, r6362; +} +{ +add.f16x2 r6382, r4375, r6379; +} +{ +sub.f16x2 r6385, r5816, r5832; +} +{ +mul.f16x2 r6388, r6385, r6363; +} +{ +add.f16x2 %28, r6382, r6388; +} +{ +add.f16x2 r6394, r5810, r5826; +} +{ +mul.f16x2 r6397, r6394, r6362; +} +{ +add.f16x2 r6400, r4375, r6397; +} +{ +sub.f16x2 r6403, r5816, r5832; +} +{ +mul.f16x2 r6406, r6403, r6363; +} +{ +sub.f16x2 %46, r6400, r6406; +} +{ +add.f16x2 r6412, r5816, r5832; +} +{ +mul.f16x2 r6415, r6412, r6362; +} +{ +add.f16x2 r6418, r4411, r6415; +} +{ +sub.f16x2 r6421, r5810, r5826; +} +{ +mul.f16x2 r6424, 
r6421, r6363; +} +{ +sub.f16x2 %29, r6418, r6424; +} +{ +add.f16x2 r6430, r5816, r5832; +} +{ +mul.f16x2 r6433, r6430, r6362; +} +{ +add.f16x2 r6436, r4411, r6433; +} +{ +sub.f16x2 r6439, r5810, r5826; +} +{ +mul.f16x2 r6442, r6439, r6363; +} +{ +add.f16x2 %47, r6436, r6442; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6448, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6449, {low, high}; +} +{ +add.f16x2 r6450, r5842, r5858; +} +{ +add.f16x2 %12, r4221, r6450; +} +{ +add.f16x2 r6456, r5848, r5864; +} +{ +add.f16x2 %13, r4257, r6456; +} +{ +add.f16x2 r6462, r5842, r5858; +} +{ +mul.f16x2 r6465, r6462, r6448; +} +{ +add.f16x2 r6468, r4221, r6465; +} +{ +sub.f16x2 r6471, r5848, r5864; +} +{ +mul.f16x2 r6474, r6471, r6449; +} +{ +add.f16x2 %30, r6468, r6474; +} +{ +add.f16x2 r6480, r5842, r5858; +} +{ +mul.f16x2 r6483, r6480, r6448; +} +{ +add.f16x2 r6486, r4221, r6483; +} +{ +sub.f16x2 r6489, r5848, r5864; +} +{ +mul.f16x2 r6492, r6489, r6449; +} +{ +sub.f16x2 %48, r6486, r6492; +} +{ +add.f16x2 r6498, r5848, r5864; +} +{ +mul.f16x2 r6501, r6498, r6448; +} +{ +add.f16x2 r6504, r4257, r6501; +} +{ +sub.f16x2 r6507, r5842, r5858; +} +{ +mul.f16x2 r6510, r6507, r6449; +} +{ +sub.f16x2 %31, r6504, r6510; +} +{ +add.f16x2 r6516, r5848, r5864; +} +{ +mul.f16x2 r6519, r6516, r6448; +} +{ +add.f16x2 r6522, r4257, r6519; +} +{ +sub.f16x2 r6525, r5842, r5858; +} +{ +mul.f16x2 r6528, r6525, r6449; +} +{ +add.f16x2 %49, r6522, r6528; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6534, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6535, {low, high}; +} +{ +add.f16x2 r6536, r5874, r5890; +} +{ +add.f16x2 %14, r4307, r6536; +} +{ +add.f16x2 r6542, r5880, r5896; +} +{ +add.f16x2 %15, r4343, r6542; +} +{ +add.f16x2 r6548, r5874, r5890; +} +{ +mul.f16x2 r6551, r6548, r6534; 
+} +{ +add.f16x2 r6554, r4307, r6551; +} +{ +sub.f16x2 r6557, r5880, r5896; +} +{ +mul.f16x2 r6560, r6557, r6535; +} +{ +add.f16x2 %32, r6554, r6560; +} +{ +add.f16x2 r6566, r5874, r5890; +} +{ +mul.f16x2 r6569, r6566, r6534; +} +{ +add.f16x2 r6572, r4307, r6569; +} +{ +sub.f16x2 r6575, r5880, r5896; +} +{ +mul.f16x2 r6578, r6575, r6535; +} +{ +sub.f16x2 %50, r6572, r6578; +} +{ +add.f16x2 r6584, r5880, r5896; +} +{ +mul.f16x2 r6587, r6584, r6534; +} +{ +add.f16x2 r6590, r4343, r6587; +} +{ +sub.f16x2 r6593, r5874, r5890; +} +{ +mul.f16x2 r6596, r6593, r6535; +} +{ +sub.f16x2 %33, r6590, r6596; +} +{ +add.f16x2 r6602, r5880, r5896; +} +{ +mul.f16x2 r6605, r6602, r6534; +} +{ +add.f16x2 r6608, r4343, r6605; +} +{ +sub.f16x2 r6611, r5874, r5890; +} +{ +mul.f16x2 r6614, r6611, r6535; +} +{ +add.f16x2 %51, r6608, r6614; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f670; +cvt.rn.f16.f32 high, f670; +mov.b32 r6620, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r6621, {low, high}; +} +{ +add.f16x2 r6622, r5906, r5922; +} +{ +add.f16x2 %16, r4393, r6622; +} +{ +add.f16x2 r6628, r5912, r5928; +} +{ +add.f16x2 %17, r4429, r6628; +} +{ +add.f16x2 r6634, r5906, r5922; +} +{ +mul.f16x2 r6637, r6634, r6620; +} +{ +add.f16x2 r6640, r4393, r6637; +} +{ +sub.f16x2 r6643, r5912, r5928; +} +{ +mul.f16x2 r6646, r6643, r6621; +} +{ +add.f16x2 %34, r6640, r6646; +} +{ +add.f16x2 r6652, r5906, r5922; +} +{ +mul.f16x2 r6655, r6652, r6620; +} +{ +add.f16x2 r6658, r4393, r6655; +} +{ +sub.f16x2 r6661, r5912, r5928; +} +{ +mul.f16x2 r6664, r6661, r6621; +} +{ +sub.f16x2 %52, r6658, r6664; +} +{ +add.f16x2 r6670, r5912, r5928; +} +{ +mul.f16x2 r6673, r6670, r6620; +} +{ +add.f16x2 r6676, r4429, r6673; +} +{ +sub.f16x2 r6679, r5906, r5922; +} +{ +mul.f16x2 r6682, r6679, r6621; +} +{ +sub.f16x2 %35, r6676, r6682; +} +{ +add.f16x2 r6688, r5912, r5928; +} +{ +mul.f16x2 r6691, r6688, r6620; +} +{ +add.f16x2 r6694, r4429, r6691; +} +{ 
+sub.f16x2 r6697, r5906, r5922; +} +{ +mul.f16x2 r6700, r6697, r6621; +} +{ +add.f16x2 %53, r6694, r6700; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1088, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<215>; +.reg .b32 r<2404>; +.reg .b64 rd<6>; +mov.u32 r2387, %tid.y; +mov.u32 r2388, %18; +mad.lo.s32 r2389, r2387, 5832, r2388; +mov.u32 r2390, %tid.x; +mov.f32 f206, 0fBF000000; +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1, {low, high}; +} +mov.f32 f208, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; 
+} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r259, {low, high}; +} +mov.f32 f168, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r260, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r261, {low, high}; +} +mov.f32 f172, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r262, {low, high}; 
+} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ 
+add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, 
r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2390, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r2391, rd3; +mul.lo.s32 r2392, r2391, 81; +sub.s32 r2393, r2390, r2392; +cvt.rn.f32.u32 f209, r2393; +mul.f32 f210, f209, 0f3C0D3654; +cos.approx.f32 f57, f210; +sin.approx.f32 f211, f210; +neg.f32 f58, f211; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} 
+{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r2394, r2391, 5832, r2389; +barrier.sync 0; +mad.lo.s32 r2395, r2393, 72, r2394; +st.shared.v2.f32 [r2395], {r344, r350}; +st.shared.v2.f32 [r2395+8], {r607, r616}; +st.shared.v2.f32 [r2395+16], {r644, r653}; +st.shared.v2.f32 [r2395+24], {r681, r690}; +st.shared.v2.f32 [r2395+32], {r718, r727}; +st.shared.v2.f32 [r2395+40], {r755, r764}; +st.shared.v2.f32 [r2395+48], {r792, r801}; +st.shared.v2.f32 [r2395+56], {r829, r838}; +st.shared.v2.f32 [r2395+64], {r866, r875}; +barrier.sync 0; +shl.b32 r2396, r2393, 6; +sub.s32 r2397, r2395, r2396; +ld.shared.u32 r902, [r2397]; +ld.shared.u32 r908, [r2397+4]; +ld.shared.u32 r988, [r2397+648]; +ld.shared.u32 r994, [r2397+652]; +ld.shared.u32 r1074, [r2397+1296]; +ld.shared.u32 r1080, [r2397+1300]; 
+ld.shared.u32 r899, [r2397+1944]; +ld.shared.u32 r905, [r2397+1948]; +ld.shared.u32 r985, [r2397+2592]; +ld.shared.u32 r991, [r2397+2596]; +ld.shared.u32 r1071, [r2397+3240]; +ld.shared.u32 r1077, [r2397+3244]; +ld.shared.u32 r900, [r2397+3888]; +ld.shared.u32 r906, [r2397+3892]; +ld.shared.u32 r986, [r2397+4536]; +ld.shared.u32 r992, [r2397+4540]; +ld.shared.u32 r1072, [r2397+5184]; +ld.shared.u32 r1078, [r2397+5188]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ 
+add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, 
r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, 
r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, 
r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2393, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2398, rd5; +cvt.rn.f32.u32 f212, r2398; +mul.f32 f213, f212, 0f3D9EDD1F; +cos.approx.f32 f133, f213; +sin.approx.f32 f214, f213; +neg.f32 f134, f214; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +mul.lo.s32 r2399, r2398, 9; +sub.s32 r2400, r2393, r2399; +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 
r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ +mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +shl.b32 r2401, r2400, 3; +add.s32 r2402, r2394, r2401; +barrier.sync 0; +mad.lo.s32 r2403, r2398, 648, r2402; +st.shared.u32 [r2403], r1239; +st.shared.u32 [r2403+4], r1245; +st.shared.u32 [r2403+72], r1502; +st.shared.u32 [r2403+76], r1511; +st.shared.u32 [r2403+144], r1539; +st.shared.u32 [r2403+148], r1548; +st.shared.u32 [r2403+216], r1576; +st.shared.u32 [r2403+220], r1585; +st.shared.u32 [r2403+288], r1613; +st.shared.u32 [r2403+292], r1622; +st.shared.u32 [r2403+360], r1650; +st.shared.u32 [r2403+364], r1659; +st.shared.u32 [r2403+432], r1687; +st.shared.u32 [r2403+436], r1696; +st.shared.u32 [r2403+504], r1724; +st.shared.u32 [r2403+508], r1733; +st.shared.u32 [r2403+576], r1761; +st.shared.u32 [r2403+580], r1770; +barrier.sync 0; +ld.shared.u32 r1797, [r2397]; +ld.shared.u32 r1803, [r2397+4]; +ld.shared.u32 r1883, [r2397+648]; +ld.shared.u32 r1889, [r2397+652]; +ld.shared.u32 r1969, [r2397+1296]; +ld.shared.u32 r1975, [r2397+1300]; +ld.shared.u32 r1794, [r2397+1944]; +ld.shared.u32 r1800, [r2397+1948]; +ld.shared.u32 r1880, [r2397+2592]; +ld.shared.u32 r1886, [r2397+2596]; +ld.shared.u32 r1966, [r2397+3240]; +ld.shared.u32 r1972, [r2397+3244]; +ld.shared.u32 r1795, [r2397+3888]; +ld.shared.u32 r1801, [r2397+3892]; +ld.shared.u32 r1881, [r2397+4536]; +ld.shared.u32 r1887, [r2397+4540]; +ld.shared.u32 r1967, [r2397+5184]; +ld.shared.u32 r1973, [r2397+5188]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} 
+{ +add.f16x2 r1796, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ +add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, 
r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; 
+cvt.rn.f16.f32 high, f172; +mov.b32 r2052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ +fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2130, {low, high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 %0, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 %1, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 %6, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ +sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 %12, r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 
r2185, r1802, r2182; +} +{ +sub.f16x2 r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 %7, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 %13, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 %2, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 %3, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 %8, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 %14, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ +sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 %9, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 %15, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 r2303, r2103, r2119; +} +{ +add.f16x2 %4, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 %5, r1874, r2309; 
+} +{ +add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 %10, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 %16, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 %11, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 %17, r2375, r2381; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), 
"r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1089, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<215>; +.reg .b32 r<2404>; +.reg .b64 rd<6>; +mov.u32 r2387, %tid.y; +mov.u32 r2388, %18; +mad.lo.s32 r2389, r2387, 2916, r2388; +mov.u32 r2390, %tid.x; +mov.f32 f206, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1, {low, high}; +} +mov.f32 f208, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, 
%21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f166, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 
high, f166; +mov.b32 r259, {low, high}; +} +mov.f32 f168, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r260, {low, high}; +} +mov.f32 f170, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r261, {low, high}; +} +mov.f32 f172, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r262, {low, high}; +} +mov.f32 f178, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ 
+add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r2390, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r2391, rd3; +mul.lo.s32 r2392, r2391, 81; +sub.s32 r2393, r2390, r2392; +mad.lo.s32 r2394, r2391, 2916, r2389; +cvt.rn.f32.u32 f209, r2393; +mul.f32 f210, f209, 0f3C0D3654; +cos.approx.f32 f57, f210; +sin.approx.f32 f211, f210; +neg.f32 f58, f211; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} 
+mov.f32 f149, 0fBF800000; +mov.f32 f150, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 
r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; 
+} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; +mad.lo.s32 r2395, r2393, 36, r2394; +st.shared.u32 [r2395], r344; +st.shared.u32 [r2395+4], r607; +st.shared.u32 [r2395+8], r644; +st.shared.u32 [r2395+12], r681; +st.shared.u32 [r2395+16], r718; +st.shared.u32 [r2395+20], r755; +st.shared.u32 [r2395+24], r792; 
+st.shared.u32 [r2395+28], r829; +st.shared.u32 [r2395+32], r866; +barrier.sync 0; +shl.b32 r2396, r2393, 5; +sub.s32 r2397, r2395, r2396; +ld.shared.u32 r902, [r2397]; +ld.shared.u32 r988, [r2397+324]; +ld.shared.u32 r1074, [r2397+648]; +ld.shared.u32 r899, [r2397+972]; +ld.shared.u32 r985, [r2397+1296]; +ld.shared.u32 r1071, [r2397+1620]; +ld.shared.u32 r900, [r2397+1944]; +ld.shared.u32 r986, [r2397+2268]; +ld.shared.u32 r1072, [r2397+2592]; +barrier.sync 0; +st.shared.u32 [r2395], r350; +st.shared.u32 [r2395+4], r616; +st.shared.u32 [r2395+8], r653; +st.shared.u32 [r2395+12], r690; +st.shared.u32 [r2395+16], r727; +st.shared.u32 [r2395+20], r764; +st.shared.u32 [r2395+24], r801; +st.shared.u32 [r2395+28], r838; +st.shared.u32 [r2395+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r2397]; +ld.shared.u32 r994, [r2397+324]; +ld.shared.u32 r1080, [r2397+648]; +ld.shared.u32 r905, [r2397+972]; +ld.shared.u32 r991, [r2397+1296]; +ld.shared.u32 r1077, [r2397+1620]; +ld.shared.u32 r906, [r2397+1944]; +ld.shared.u32 r992, [r2397+2268]; +ld.shared.u32 r1078, [r2397+2592]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, 
r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 
r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, 
r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 r1239, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 r1245, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 r1263, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 r1281, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 r1299, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 r1317, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 r1325, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 r1331, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 
r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 r1349, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 r1367, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 r1385, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 r1403, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 r1411, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 r1417, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 r1435, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 r1453, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 r1471, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ 
+add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 r1489, r1480, r1486; +} +mul.wide.u32 rd4, r2393, 954437177; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r2398, rd5; +mul.lo.s32 r2399, r2398, 9; +sub.s32 r2400, r2393, r2399; +shl.b32 r2401, r2400, 2; +add.s32 r2402, r2394, r2401; +cvt.rn.f32.u32 f212, r2398; +mul.f32 f213, f212, 0f3D9EDD1F; +cos.approx.f32 f133, f213; +sin.approx.f32 f214, f213; +neg.f32 f134, f214; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f133; +cvt.rn.f16.f32 high, f134; +mov.b32 r1492, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1495, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1497, {high, high}; +} +{ +mul.f16x2 r1499, r1331, r1497; +} +{ +fma.rn.f16x2 r1502, r1325, r1495, r1499; +} +{ +mul.f16x2 r1506, r1325, r1497; +} +{ +neg.f16x2 r1509, r1506; +} +{ +fma.rn.f16x2 r1511, r1331, r1495, r1509; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1515, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1517, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1519, {low, high}; +} +{ +mul.f16x2 r1520, r1517, r1519; +} +{ +mul.f16x2 r1523, r1492, r1515; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1526, {high, low}; +} +{ +fma.rn.f16x2 r1528, r1520, r1526, r1523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1532, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1534, {high, high}; +} +{ +mul.f16x2 r1536, r1417, r1534; +} +{ +fma.rn.f16x2 r1539, r1411, r1532, r1536; +} +{ +mul.f16x2 r1543, r1411, r1534; +} +{ +neg.f16x2 r1546, r1543; +} +{ +fma.rn.f16x2 r1548, r1417, r1532, r1546; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1552, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1554, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1556, {low, high}; +} +{ +mul.f16x2 r1557, r1554, r1556; +} +{ +mul.f16x2 r1560, r1528, r1552; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1528; +mov.b32 r1563, {high, low}; +} +{ +fma.rn.f16x2 r1565, r1557, r1563, r1560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1569, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1571, {high, high}; +} +{ +mul.f16x2 r1573, r1299, r1571; +} +{ +fma.rn.f16x2 r1576, r1263, r1569, r1573; +} +{ +mul.f16x2 r1580, r1263, r1571; +} +{ +neg.f16x2 r1583, r1580; +} +{ +fma.rn.f16x2 r1585, r1299, r1569, r1583; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1589, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1591, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1593, {low, high}; +} +{ +mul.f16x2 r1594, r1591, r1593; +} +{ +mul.f16x2 r1597, r1565, r1589; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1565; +mov.b32 r1600, {high, low}; +} +{ +fma.rn.f16x2 r1602, r1594, r1600, r1597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1606, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; +mov.b32 r1608, {high, high}; +} +{ +mul.f16x2 r1610, r1385, r1608; +} +{ +fma.rn.f16x2 r1613, r1349, r1606, r1610; +} +{ +mul.f16x2 r1617, r1349, r1608; +} +{ +neg.f16x2 r1620, r1617; +} +{ +fma.rn.f16x2 r1622, r1385, r1606, r1620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1626, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1628, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1630, {low, high}; +} +{ +mul.f16x2 r1631, r1628, r1630; +} +{ +mul.f16x2 r1634, r1602, r1626; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1602; 
+mov.b32 r1637, {high, low}; +} +{ +fma.rn.f16x2 r1639, r1631, r1637, r1634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1643, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1645, {high, high}; +} +{ +mul.f16x2 r1647, r1471, r1645; +} +{ +fma.rn.f16x2 r1650, r1435, r1643, r1647; +} +{ +mul.f16x2 r1654, r1435, r1645; +} +{ +neg.f16x2 r1657, r1654; +} +{ +fma.rn.f16x2 r1659, r1471, r1643, r1657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1663, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1665, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1667, {low, high}; +} +{ +mul.f16x2 r1668, r1665, r1667; +} +{ +mul.f16x2 r1671, r1639, r1663; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1639; +mov.b32 r1674, {high, low}; +} +{ +fma.rn.f16x2 r1676, r1668, r1674, r1671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1680, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1682, {high, high}; +} +{ +mul.f16x2 r1684, r1317, r1682; +} +{ +fma.rn.f16x2 r1687, r1281, r1680, r1684; +} +{ +mul.f16x2 r1691, r1281, r1682; +} +{ +neg.f16x2 r1694, r1691; +} +{ +fma.rn.f16x2 r1696, r1317, r1680, r1694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1700, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1702, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1704, {low, high}; +} +{ +mul.f16x2 r1705, r1702, r1704; +} +{ +mul.f16x2 r1708, r1676, r1700; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1676; +mov.b32 r1711, {high, low}; +} +{ +fma.rn.f16x2 r1713, r1705, r1711, r1708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1717, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1719, {high, high}; +} +{ 
+mul.f16x2 r1721, r1403, r1719; +} +{ +fma.rn.f16x2 r1724, r1367, r1717, r1721; +} +{ +mul.f16x2 r1728, r1367, r1719; +} +{ +neg.f16x2 r1731, r1728; +} +{ +fma.rn.f16x2 r1733, r1403, r1717, r1731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1737, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1492; +mov.b32 r1739, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f149; +cvt.rn.f16.f32 high, f150; +mov.b32 r1741, {low, high}; +} +{ +mul.f16x2 r1742, r1739, r1741; +} +{ +mul.f16x2 r1745, r1713, r1737; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1748, {high, low}; +} +{ +fma.rn.f16x2 r1750, r1742, r1748, r1745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1754, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1750; +mov.b32 r1756, {high, high}; +} +{ +mul.f16x2 r1758, r1489, r1756; +} +{ +fma.rn.f16x2 r1761, r1453, r1754, r1758; +} +{ +mul.f16x2 r1765, r1453, r1756; +} +{ +neg.f16x2 r1768, r1765; +} +{ +fma.rn.f16x2 r1770, r1489, r1754, r1768; +} +barrier.sync 0; +mad.lo.s32 r2403, r2398, 324, r2402; +st.shared.u32 [r2403], r1239; +st.shared.u32 [r2403+36], r1502; +st.shared.u32 [r2403+72], r1539; +st.shared.u32 [r2403+108], r1576; +st.shared.u32 [r2403+144], r1613; +st.shared.u32 [r2403+180], r1650; +st.shared.u32 [r2403+216], r1687; +st.shared.u32 [r2403+252], r1724; +st.shared.u32 [r2403+288], r1761; +barrier.sync 0; +ld.shared.u32 r1797, [r2397]; +ld.shared.u32 r1883, [r2397+324]; +ld.shared.u32 r1969, [r2397+648]; +ld.shared.u32 r1794, [r2397+972]; +ld.shared.u32 r1880, [r2397+1296]; +ld.shared.u32 r1966, [r2397+1620]; +ld.shared.u32 r1795, [r2397+1944]; +ld.shared.u32 r1881, [r2397+2268]; +ld.shared.u32 r1967, [r2397+2592]; +barrier.sync 0; +st.shared.u32 [r2403], r1245; +st.shared.u32 [r2403+36], r1511; +st.shared.u32 [r2403+72], r1548; +st.shared.u32 [r2403+108], r1585; +st.shared.u32 [r2403+144], r1622; +st.shared.u32 [r2403+180], r1659; 
+st.shared.u32 [r2403+216], r1696; +st.shared.u32 [r2403+252], r1733; +st.shared.u32 [r2403+288], r1770; +barrier.sync 0; +ld.shared.u32 r1803, [r2397]; +ld.shared.u32 r1889, [r2397+324]; +ld.shared.u32 r1975, [r2397+648]; +ld.shared.u32 r1800, [r2397+972]; +ld.shared.u32 r1886, [r2397+1296]; +ld.shared.u32 r1972, [r2397+1620]; +ld.shared.u32 r1801, [r2397+1944]; +ld.shared.u32 r1887, [r2397+2268]; +ld.shared.u32 r1973, [r2397+2592]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1792, {low, high}; +} +{ +add.f16x2 r1793, r1794, r1795; +} +{ +add.f16x2 r1796, r1797, r1793; +} +{ +add.f16x2 r1799, r1800, r1801; +} +{ +add.f16x2 r1802, r1803, r1799; +} +{ +add.f16x2 r1805, r1794, r1795; +} +{ +mul.f16x2 r1808, r1805, r1791; +} +{ +add.f16x2 r1811, r1797, r1808; +} +{ +sub.f16x2 r1814, r1800, r1801; +} +{ +mul.f16x2 r1817, r1814, r1792; +} +{ +add.f16x2 r1820, r1811, r1817; +} +{ +add.f16x2 r1823, r1794, r1795; +} +{ +mul.f16x2 r1826, r1823, r1791; +} +{ +add.f16x2 r1829, r1797, r1826; +} +{ +sub.f16x2 r1832, r1800, r1801; +} +{ +mul.f16x2 r1835, r1832, r1792; +} +{ +sub.f16x2 r1838, r1829, r1835; +} +{ +add.f16x2 r1841, r1800, r1801; +} +{ +mul.f16x2 r1844, r1841, r1791; +} +{ +add.f16x2 r1847, r1803, r1844; +} +{ +sub.f16x2 r1850, r1794, r1795; +} +{ +mul.f16x2 r1853, r1850, r1792; +} +{ +sub.f16x2 r1856, r1847, r1853; +} +{ +add.f16x2 r1859, r1800, r1801; +} +{ +mul.f16x2 r1862, r1859, r1791; +} +{ +add.f16x2 r1865, r1803, r1862; +} +{ +sub.f16x2 r1868, r1794, r1795; +} +{ +mul.f16x2 r1871, r1868, r1792; +} +{ +add.f16x2 r1874, r1865, r1871; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1877, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1878, {low, high}; +} +{ +add.f16x2 r1879, r1880, r1881; +} +{ 
+add.f16x2 r1882, r1883, r1879; +} +{ +add.f16x2 r1885, r1886, r1887; +} +{ +add.f16x2 r1888, r1889, r1885; +} +{ +add.f16x2 r1891, r1880, r1881; +} +{ +mul.f16x2 r1894, r1891, r1877; +} +{ +add.f16x2 r1897, r1883, r1894; +} +{ +sub.f16x2 r1900, r1886, r1887; +} +{ +mul.f16x2 r1903, r1900, r1878; +} +{ +add.f16x2 r1906, r1897, r1903; +} +{ +add.f16x2 r1909, r1880, r1881; +} +{ +mul.f16x2 r1912, r1909, r1877; +} +{ +add.f16x2 r1915, r1883, r1912; +} +{ +sub.f16x2 r1918, r1886, r1887; +} +{ +mul.f16x2 r1921, r1918, r1878; +} +{ +sub.f16x2 r1924, r1915, r1921; +} +{ +add.f16x2 r1927, r1886, r1887; +} +{ +mul.f16x2 r1930, r1927, r1877; +} +{ +add.f16x2 r1933, r1889, r1930; +} +{ +sub.f16x2 r1936, r1880, r1881; +} +{ +mul.f16x2 r1939, r1936, r1878; +} +{ +sub.f16x2 r1942, r1933, r1939; +} +{ +add.f16x2 r1945, r1886, r1887; +} +{ +mul.f16x2 r1948, r1945, r1877; +} +{ +add.f16x2 r1951, r1889, r1948; +} +{ +sub.f16x2 r1954, r1880, r1881; +} +{ +mul.f16x2 r1957, r1954, r1878; +} +{ +add.f16x2 r1960, r1951, r1957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1964, {low, high}; +} +{ +add.f16x2 r1965, r1966, r1967; +} +{ +add.f16x2 r1968, r1969, r1965; +} +{ +add.f16x2 r1971, r1972, r1973; +} +{ +add.f16x2 r1974, r1975, r1971; +} +{ +add.f16x2 r1977, r1966, r1967; +} +{ +mul.f16x2 r1980, r1977, r1963; +} +{ +add.f16x2 r1983, r1969, r1980; +} +{ +sub.f16x2 r1986, r1972, r1973; +} +{ +mul.f16x2 r1989, r1986, r1964; +} +{ +add.f16x2 r1992, r1983, r1989; +} +{ +add.f16x2 r1995, r1966, r1967; +} +{ +mul.f16x2 r1998, r1995, r1963; +} +{ +add.f16x2 r2001, r1969, r1998; +} +{ +sub.f16x2 r2004, r1972, r1973; +} +{ +mul.f16x2 r2007, r2004, r1964; +} +{ +sub.f16x2 r2010, r2001, r2007; +} +{ +add.f16x2 r2013, r1972, r1973; +} +{ +mul.f16x2 r2016, r2013, r1963; +} +{ +add.f16x2 r2019, r1975, r2016; +} +{ +sub.f16x2 r2022, r1966, 
r1967; +} +{ +mul.f16x2 r2025, r2022, r1964; +} +{ +sub.f16x2 r2028, r2019, r2025; +} +{ +add.f16x2 r2031, r1972, r1973; +} +{ +mul.f16x2 r2034, r2031, r1963; +} +{ +add.f16x2 r2037, r1975, r2034; +} +{ +sub.f16x2 r2040, r1966, r1967; +} +{ +mul.f16x2 r2043, r2040, r1964; +} +{ +add.f16x2 r2046, r2037, r2043; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f166; +cvt.rn.f16.f32 high, f166; +mov.b32 r2049, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f168; +cvt.rn.f16.f32 high, f168; +mov.b32 r2050, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r2051, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r2052, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r2055, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r2056, {low, high}; +} +{ +mul.f16x2 r2065, r1906, r2049; +} +{ +mul.f16x2 r2068, r1942, r2050; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1906, r2050; +} +{ +fma.rn.f16x2 r2077, r1942, r2049, r2074; +} +{ +mul.f16x2 r2081, r1992, r2051; +} +{ +mul.f16x2 r2084, r2028, r2052; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1992, r2052; +} +{ +fma.rn.f16x2 r2093, r2028, r2051, r2090; +} +{ +mul.f16x2 r2097, r1924, r2051; +} +{ +mul.f16x2 r2100, r1960, r2052; +} +{ +sub.f16x2 r2103, r2097, r2100; +} +{ +mul.f16x2 r2106, r1924, r2052; +} +{ +fma.rn.f16x2 r2109, r1960, r2051, r2106; +} +{ +mul.f16x2 r2113, r2010, r2055; +} +{ +mul.f16x2 r2116, r2046, r2056; +} +{ +sub.f16x2 r2119, r2113, r2116; +} +{ +mul.f16x2 r2122, r2010, r2056; +} +{ +fma.rn.f16x2 r2125, r2046, r2055, r2122; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2130, {low, 
high}; +} +{ +add.f16x2 r2131, r1882, r1968; +} +{ +add.f16x2 %0, r1796, r2131; +} +{ +add.f16x2 r2137, r1888, r1974; +} +{ +add.f16x2 %1, r1802, r2137; +} +{ +add.f16x2 r2143, r1882, r1968; +} +{ +mul.f16x2 r2146, r2143, r2129; +} +{ +add.f16x2 r2149, r1796, r2146; +} +{ +sub.f16x2 r2152, r1888, r1974; +} +{ +mul.f16x2 r2155, r2152, r2130; +} +{ +add.f16x2 %6, r2149, r2155; +} +{ +add.f16x2 r2161, r1882, r1968; +} +{ +mul.f16x2 r2164, r2161, r2129; +} +{ +add.f16x2 r2167, r1796, r2164; +} +{ +sub.f16x2 r2170, r1888, r1974; +} +{ +mul.f16x2 r2173, r2170, r2130; +} +{ +sub.f16x2 %12, r2167, r2173; +} +{ +add.f16x2 r2179, r1888, r1974; +} +{ +mul.f16x2 r2182, r2179, r2129; +} +{ +add.f16x2 r2185, r1802, r2182; +} +{ +sub.f16x2 r2188, r1882, r1968; +} +{ +mul.f16x2 r2191, r2188, r2130; +} +{ +sub.f16x2 %7, r2185, r2191; +} +{ +add.f16x2 r2197, r1888, r1974; +} +{ +mul.f16x2 r2200, r2197, r2129; +} +{ +add.f16x2 r2203, r1802, r2200; +} +{ +sub.f16x2 r2206, r1882, r1968; +} +{ +mul.f16x2 r2209, r2206, r2130; +} +{ +add.f16x2 %13, r2203, r2209; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2215, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2216, {low, high}; +} +{ +add.f16x2 r2217, r2071, r2087; +} +{ +add.f16x2 %2, r1820, r2217; +} +{ +add.f16x2 r2223, r2077, r2093; +} +{ +add.f16x2 %3, r1856, r2223; +} +{ +add.f16x2 r2229, r2071, r2087; +} +{ +mul.f16x2 r2232, r2229, r2215; +} +{ +add.f16x2 r2235, r1820, r2232; +} +{ +sub.f16x2 r2238, r2077, r2093; +} +{ +mul.f16x2 r2241, r2238, r2216; +} +{ +add.f16x2 %8, r2235, r2241; +} +{ +add.f16x2 r2247, r2071, r2087; +} +{ +mul.f16x2 r2250, r2247, r2215; +} +{ +add.f16x2 r2253, r1820, r2250; +} +{ +sub.f16x2 r2256, r2077, r2093; +} +{ +mul.f16x2 r2259, r2256, r2216; +} +{ +sub.f16x2 %14, r2253, r2259; +} +{ +add.f16x2 r2265, r2077, r2093; +} +{ +mul.f16x2 r2268, r2265, r2215; +} +{ +add.f16x2 r2271, r1856, r2268; +} +{ 
+sub.f16x2 r2274, r2071, r2087; +} +{ +mul.f16x2 r2277, r2274, r2216; +} +{ +sub.f16x2 %9, r2271, r2277; +} +{ +add.f16x2 r2283, r2077, r2093; +} +{ +mul.f16x2 r2286, r2283, r2215; +} +{ +add.f16x2 r2289, r1856, r2286; +} +{ +sub.f16x2 r2292, r2071, r2087; +} +{ +mul.f16x2 r2295, r2292, r2216; +} +{ +add.f16x2 %15, r2289, r2295; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r2301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r2302, {low, high}; +} +{ +add.f16x2 r2303, r2103, r2119; +} +{ +add.f16x2 %4, r1838, r2303; +} +{ +add.f16x2 r2309, r2109, r2125; +} +{ +add.f16x2 %5, r1874, r2309; +} +{ +add.f16x2 r2315, r2103, r2119; +} +{ +mul.f16x2 r2318, r2315, r2301; +} +{ +add.f16x2 r2321, r1838, r2318; +} +{ +sub.f16x2 r2324, r2109, r2125; +} +{ +mul.f16x2 r2327, r2324, r2302; +} +{ +add.f16x2 %10, r2321, r2327; +} +{ +add.f16x2 r2333, r2103, r2119; +} +{ +mul.f16x2 r2336, r2333, r2301; +} +{ +add.f16x2 r2339, r1838, r2336; +} +{ +sub.f16x2 r2342, r2109, r2125; +} +{ +mul.f16x2 r2345, r2342, r2302; +} +{ +sub.f16x2 %16, r2339, r2345; +} +{ +add.f16x2 r2351, r2109, r2125; +} +{ +mul.f16x2 r2354, r2351, r2301; +} +{ +add.f16x2 r2357, r1874, r2354; +} +{ +sub.f16x2 r2360, r2103, r2119; +} +{ +mul.f16x2 r2363, r2360, r2302; +} +{ +sub.f16x2 %11, r2357, r2363; +} +{ +add.f16x2 r2369, r2109, r2125; +} +{ +mul.f16x2 r2372, r2369, r2301; +} +{ +add.f16x2 r2375, r1874, r2372; +} +{ +sub.f16x2 r2378, r2103, r2119; +} +{ +mul.f16x2 r2381, r2378, r2302; +} +{ +add.f16x2 %17, r2375, r2381; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), 
"=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1090, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<80>; +.reg .b32 r<941>; +.reg .b64 rd<12>; +mov.u32 r902, %tid.y; +mov.u32 r903, %6; +mad.lo.s32 r904, r902, 5832, r903; +mov.u32 r905, %tid.x; +mov.f32 f62, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1, {low, high}; +} +mov.f32 f64, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 
r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r905, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r906, rd3; +mul.lo.s32 r907, r906, 243; +sub.s32 r908, r905, r907; +mad.lo.s32 r909, r906, 5832, r904; +cvt.rn.f32.u32 f65, r908; +mul.f32 f66, f65, 0f3C0D3654; +cos.approx.f32 f5, f66; +sin.approx.f32 f67, f66; +neg.f32 f6, f67; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f57, 0fBF800000; +mov.f32 f58, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r910, r908, 24, r909; +st.shared.v2.f32 [r910], {r6, r12}; +st.shared.v2.f32 
[r910+8], {r97, r106}; +st.shared.v2.f32 [r910+16], {r134, r143}; +barrier.sync 0; +shl.b32 r911, r908, 4; +sub.s32 r912, r910, r911; +ld.shared.u32 r170, [r912]; +ld.shared.u32 r176, [r912+4]; +ld.shared.u32 r167, [r912+1944]; +ld.shared.u32 r173, [r912+1948]; +ld.shared.u32 r168, [r912+3888]; +ld.shared.u32 r174, [r912+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r908, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r913, rd5; +mul.lo.s32 r914, r913, 3; +sub.s32 r915, r908, r914; +shl.b32 r916, r915, 3; +add.s32 r917, r909, r916; +cvt.rn.f32.u32 f68, r913; +mul.f32 f69, f68, 0f3CD3D17E; +cos.approx.f32 f17, f69; +sin.approx.f32 f70, f69; +neg.f32 f18, f70; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r918, r913, 72, r917; +st.shared.u32 [r918], r169; +st.shared.u32 [r918+4], r175; +st.shared.u32 [r918+24], r260; +st.shared.u32 [r918+28], r269; +st.shared.u32 [r918+48], r297; +st.shared.u32 [r918+52], r306; +barrier.sync 0; +ld.shared.u32 r333, [r912]; +ld.shared.u32 r339, [r912+4]; +ld.shared.u32 r330, [r912+1944]; +ld.shared.u32 r336, [r912+1948]; +ld.shared.u32 r331, [r912+3888]; +ld.shared.u32 r337, [r912+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ 
+mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r908, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r919, rd7; +mul.lo.s32 r920, r919, 9; +sub.s32 r921, r908, r920; +shl.b32 r922, r921, 3; +add.s32 r923, r909, r922; +cvt.rn.f32.u32 f71, r919; +mul.f32 f72, f71, 0f3D9EDD1F; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, 
{high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r924, r919, 216, r923; +st.shared.u32 [r924], r332; +st.shared.u32 [r924+4], r338; +st.shared.u32 [r924+72], r423; +st.shared.u32 [r924+76], r432; +st.shared.u32 [r924+144], r460; +st.shared.u32 [r924+148], r469; +barrier.sync 0; +ld.shared.u32 r496, [r912]; +ld.shared.u32 r502, [r912+4]; +ld.shared.u32 r493, [r912+1944]; +ld.shared.u32 r499, [r912+1948]; +ld.shared.u32 r494, [r912+3888]; +ld.shared.u32 r500, [r912+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} 
+{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r908, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r925, rd9; +sub.s32 r926, r908, r925; +shr.u32 r927, r926, 1; +add.s32 r928, r927, r925; +shr.u32 r929, r928, 4; +mul.lo.s32 r930, r929, 27; +sub.s32 r931, r908, r930; +shl.b32 r932, r931, 3; +add.s32 r933, r909, r932; +cvt.rn.f32.u32 f74, r929; +mul.f32 f75, f74, 0f3E6E4BAE; +cos.approx.f32 f41, f75; +sin.approx.f32 f76, f75; +neg.f32 f42, f76; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ +neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r934, r929, 648, r933; +st.shared.u32 [r934], r495; +st.shared.u32 [r934+4], r501; +st.shared.u32 [r934+216], r586; +st.shared.u32 [r934+220], r595; +st.shared.u32 [r934+432], r623; 
+st.shared.u32 [r934+436], r632; +barrier.sync 0; +ld.shared.u32 r659, [r912]; +ld.shared.u32 r665, [r912+4]; +ld.shared.u32 r656, [r912+1944]; +ld.shared.u32 r662, [r912+1948]; +ld.shared.u32 r657, [r912+3888]; +ld.shared.u32 r663, [r912+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 r658, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 r664, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 r700, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 r718, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; +} +{ +add.f16x2 r736, r727, r733; +} +mul.wide.u32 rd10, r908, -901412889; +shr.u64 rd11, rd10, 38; +cvt.u32.u64 r935, rd11; +mul.lo.s32 r936, r935, 81; +sub.s32 r937, r908, r936; +shl.b32 r938, r937, 3; +add.s32 r939, r909, r938; +cvt.rn.f32.u32 f77, r935; +mul.f32 f78, f77, 0f3F32B8C2; +cos.approx.f32 f53, f78; +sin.approx.f32 f79, f78; +neg.f32 f54, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r739, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r739; +mov.b32 r744, {high, high}; +} +{ +mul.f16x2 r746, r718, r744; +} +{ +fma.rn.f16x2 r749, r682, r742, r746; +} +{ +mul.f16x2 r753, r682, r744; +} +{ +neg.f16x2 r756, r753; +} +{ +fma.rn.f16x2 r758, r718, r742, r756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r766, {low, high}; +} +{ +mul.f16x2 r767, r764, r766; +} +{ +mul.f16x2 r770, r739, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r773, {high, low}; +} +{ +fma.rn.f16x2 r775, r767, r773, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r781, {high, high}; +} +{ +mul.f16x2 r783, r736, r781; +} +{ +fma.rn.f16x2 r786, r700, r779, r783; +} +{ +mul.f16x2 r790, r700, r781; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r736, r779, r793; +} +barrier.sync 0; +mad.lo.s32 r940, r935, 1944, r939; +st.shared.u32 [r940], r658; +st.shared.u32 [r940+4], r664; +st.shared.u32 [r940+648], r749; +st.shared.u32 [r940+652], r758; +st.shared.u32 [r940+1296], r786; +st.shared.u32 [r940+1300], r795; +barrier.sync 0; +ld.shared.u32 r822, [r912]; +ld.shared.u32 r828, [r912+4]; +ld.shared.u32 r819, [r912+1944]; +ld.shared.u32 r825, [r912+1948]; +ld.shared.u32 r820, [r912+3888]; +ld.shared.u32 r826, [r912+3892]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r817, {low, high}; +} +{ +add.f16x2 r818, r819, r820; +} +{ +add.f16x2 %0, r822, r818; +} +{ +add.f16x2 r824, r825, r826; +} +{ +add.f16x2 %1, r828, r824; +} +{ +add.f16x2 r830, r819, r820; +} +{ +mul.f16x2 r833, r830, r816; +} +{ +add.f16x2 r836, r822, r833; +} +{ +sub.f16x2 
r839, r825, r826; +} +{ +mul.f16x2 r842, r839, r817; +} +{ +add.f16x2 %2, r836, r842; +} +{ +add.f16x2 r848, r819, r820; +} +{ +mul.f16x2 r851, r848, r816; +} +{ +add.f16x2 r854, r822, r851; +} +{ +sub.f16x2 r857, r825, r826; +} +{ +mul.f16x2 r860, r857, r817; +} +{ +sub.f16x2 %4, r854, r860; +} +{ +add.f16x2 r866, r825, r826; +} +{ +mul.f16x2 r869, r866, r816; +} +{ +add.f16x2 r872, r828, r869; +} +{ +sub.f16x2 r875, r819, r820; +} +{ +mul.f16x2 r878, r875, r817; +} +{ +sub.f16x2 %3, r872, r878; +} +{ +add.f16x2 r884, r825, r826; +} +{ +mul.f16x2 r887, r884, r816; +} +{ +add.f16x2 r890, r828, r887; +} +{ +sub.f16x2 r893, r819, r820; +} +{ +mul.f16x2 r896, r893, r817; +} +{ +add.f16x2 %5, r890, r896; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1091, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<80>; +.reg .b32 r<941>; +.reg .b64 rd<12>; +mov.u32 r902, %tid.y; +mov.u32 r903, %6; +mad.lo.s32 r904, r902, 2916, r903; +mov.u32 r905, %tid.x; +mov.f32 f62, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r1, {low, high}; +} +mov.f32 f64, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ 
+add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r905, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r906, rd3; +mul.lo.s32 r907, r906, 243; +sub.s32 r908, r905, r907; +mad.lo.s32 r909, r906, 2916, r904; +cvt.rn.f32.u32 f65, r908; +mul.f32 f66, f65, 0f3C0D3654; +cos.approx.f32 f5, f66; +sin.approx.f32 f67, f66; +neg.f32 f6, f67; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f57, 0fBF800000; +mov.f32 f58, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r910, r908, 12, r909; +st.shared.u32 [r910], r6; +st.shared.u32 [r910+4], r97; +st.shared.u32 [r910+8], r134; +barrier.sync 0; +shl.b32 r911, r908, 3; +sub.s32 r912, r910, r911; +ld.shared.u32 r170, [r912]; +ld.shared.u32 r167, [r912+972]; +ld.shared.u32 r168, [r912+1944]; +barrier.sync 0; +st.shared.u32 [r910], r12; +st.shared.u32 [r910+4], r106; +st.shared.u32 [r910+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r912]; +ld.shared.u32 r173, [r912+972]; +ld.shared.u32 r174, [r912+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r908, -1431655765; +shr.u64 rd5, rd4, 
33; +cvt.u32.u64 r913, rd5; +mul.lo.s32 r914, r913, 3; +sub.s32 r915, r908, r914; +shl.b32 r916, r915, 2; +add.s32 r917, r909, r916; +cvt.rn.f32.u32 f68, r913; +mul.f32 f69, f68, 0f3CD3D17E; +cos.approx.f32 f17, f69; +sin.approx.f32 f70, f69; +neg.f32 f18, f70; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r918, r913, 36, r917; +st.shared.u32 [r918], r169; +st.shared.u32 [r918+12], r260; +st.shared.u32 [r918+24], r297; +barrier.sync 0; +ld.shared.u32 r333, [r912]; +ld.shared.u32 r330, [r912+972]; +ld.shared.u32 r331, [r912+1944]; +barrier.sync 0; +st.shared.u32 [r918], r175; +st.shared.u32 [r918+12], r269; +st.shared.u32 [r918+24], r306; +barrier.sync 0; +ld.shared.u32 r339, [r912]; +ld.shared.u32 r336, 
[r912+972]; +ld.shared.u32 r337, [r912+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r908, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r919, rd7; +mul.lo.s32 r920, r919, 9; +sub.s32 r921, r908, r920; +shl.b32 r922, r921, 2; +add.s32 r923, r909, r922; +cvt.rn.f32.u32 f71, r919; +mul.f32 f72, f71, 0f3D9EDD1F; +cos.approx.f32 f29, f72; +sin.approx.f32 f73, f72; +neg.f32 f30, f73; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 
r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r924, r919, 108, r923; +st.shared.u32 [r924], r332; +st.shared.u32 [r924+36], r423; +st.shared.u32 [r924+72], r460; +barrier.sync 0; +ld.shared.u32 r496, [r912]; +ld.shared.u32 r493, [r912+972]; +ld.shared.u32 r494, [r912+1944]; +barrier.sync 0; +st.shared.u32 [r924], r338; +st.shared.u32 [r924+36], r432; +st.shared.u32 [r924+72], r469; +barrier.sync 0; +ld.shared.u32 r502, [r912]; +ld.shared.u32 r499, [r912+972]; +ld.shared.u32 r500, [r912+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 r495, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 r501, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 r519, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, 
r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 r537, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 r555, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 r573, r564, r570; +} +mul.wide.u32 rd8, r908, 795364315; +shr.u64 rd9, rd8, 32; +cvt.u32.u64 r925, rd9; +sub.s32 r926, r908, r925; +shr.u32 r927, r926, 1; +add.s32 r928, r927, r925; +shr.u32 r929, r928, 4; +mul.lo.s32 r930, r929, 27; +sub.s32 r931, r908, r930; +shl.b32 r932, r931, 2; +add.s32 r933, r909, r932; +cvt.rn.f32.u32 f74, r929; +mul.f32 f75, f74, 0f3E6E4BAE; +cos.approx.f32 f41, f75; +sin.approx.f32 f76, f75; +neg.f32 f42, f76; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f41; +cvt.rn.f16.f32 high, f42; +mov.b32 r576, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r579, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r581, {high, high}; +} +{ +mul.f16x2 r583, r555, r581; +} +{ +fma.rn.f16x2 r586, r519, r579, r583; +} +{ +mul.f16x2 r590, r519, r581; +} +{ +neg.f16x2 r593, r590; +} +{ +fma.rn.f16x2 r595, r555, r579, r593; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r599, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r601, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r603, {low, high}; +} +{ +mul.f16x2 r604, r601, r603; +} +{ +mul.f16x2 r607, r576, r599; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r576; +mov.b32 r610, {high, low}; +} +{ +fma.rn.f16x2 r612, r604, r610, r607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r612; +mov.b32 r616, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r612; +mov.b32 r618, {high, high}; +} +{ +mul.f16x2 r620, r573, r618; +} +{ +fma.rn.f16x2 r623, r537, r616, r620; +} +{ +mul.f16x2 r627, r537, r618; +} +{ +neg.f16x2 r630, r627; +} +{ +fma.rn.f16x2 r632, r573, r616, r630; +} +barrier.sync 0; +mad.lo.s32 r934, r929, 324, r933; +st.shared.u32 [r934], r495; +st.shared.u32 [r934+108], r586; +st.shared.u32 [r934+216], r623; +barrier.sync 0; +ld.shared.u32 r659, [r912]; +ld.shared.u32 r656, [r912+972]; +ld.shared.u32 r657, [r912+1944]; +barrier.sync 0; +st.shared.u32 [r934], r501; +st.shared.u32 [r934+108], r595; +st.shared.u32 [r934+216], r632; +barrier.sync 0; +ld.shared.u32 r665, [r912]; +ld.shared.u32 r662, [r912+972]; +ld.shared.u32 r663, [r912+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r653, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r654, {low, high}; +} +{ +add.f16x2 r655, r656, r657; +} +{ +add.f16x2 r658, r659, r655; +} +{ +add.f16x2 r661, r662, r663; +} +{ +add.f16x2 r664, r665, r661; +} +{ +add.f16x2 r667, r656, r657; +} +{ +mul.f16x2 r670, r667, r653; +} +{ +add.f16x2 r673, r659, r670; +} +{ +sub.f16x2 r676, r662, r663; +} +{ +mul.f16x2 r679, r676, r654; +} +{ +add.f16x2 r682, r673, r679; +} +{ +add.f16x2 r685, r656, r657; +} +{ +mul.f16x2 r688, r685, r653; +} +{ +add.f16x2 r691, r659, r688; +} +{ +sub.f16x2 r694, r662, r663; +} +{ +mul.f16x2 r697, r694, r654; +} +{ +sub.f16x2 r700, r691, r697; +} +{ +add.f16x2 r703, r662, r663; +} +{ +mul.f16x2 r706, r703, r653; +} +{ +add.f16x2 r709, r665, r706; +} +{ +sub.f16x2 r712, r656, r657; +} +{ +mul.f16x2 r715, r712, r654; +} +{ +sub.f16x2 r718, r709, r715; +} +{ +add.f16x2 r721, r662, r663; +} +{ +mul.f16x2 r724, r721, r653; +} +{ +add.f16x2 r727, r665, r724; +} +{ +sub.f16x2 r730, r656, r657; +} +{ +mul.f16x2 r733, r730, r654; +} +{ +add.f16x2 r736, r727, r733; +} +mul.wide.u32 rd10, r908, -901412889; +shr.u64 rd11, rd10, 
38; +cvt.u32.u64 r935, rd11; +mul.lo.s32 r936, r935, 81; +sub.s32 r937, r908, r936; +shl.b32 r938, r937, 2; +add.s32 r939, r909, r938; +cvt.rn.f32.u32 f77, r935; +mul.f32 f78, f77, 0f3F32B8C2; +cos.approx.f32 f53, f78; +sin.approx.f32 f79, f78; +neg.f32 f54, f79; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f53; +cvt.rn.f16.f32 high, f54; +mov.b32 r739, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r744, {high, high}; +} +{ +mul.f16x2 r746, r718, r744; +} +{ +fma.rn.f16x2 r749, r682, r742, r746; +} +{ +mul.f16x2 r753, r682, r744; +} +{ +neg.f16x2 r756, r753; +} +{ +fma.rn.f16x2 r758, r718, r742, r756; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r766, {low, high}; +} +{ +mul.f16x2 r767, r764, r766; +} +{ +mul.f16x2 r770, r739, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r739; +mov.b32 r773, {high, low}; +} +{ +fma.rn.f16x2 r775, r767, r773, r770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r775; +mov.b32 r781, {high, high}; +} +{ +mul.f16x2 r783, r736, r781; +} +{ +fma.rn.f16x2 r786, r700, r779, r783; +} +{ +mul.f16x2 r790, r700, r781; +} +{ +neg.f16x2 r793, r790; +} +{ +fma.rn.f16x2 r795, r736, r779, r793; +} +barrier.sync 0; +mad.lo.s32 r940, r935, 972, r939; +st.shared.u32 [r940], r658; +st.shared.u32 [r940+324], r749; +st.shared.u32 [r940+648], r786; +barrier.sync 0; +ld.shared.u32 r822, [r912]; +ld.shared.u32 r819, [r912+972]; +ld.shared.u32 r820, [r912+1944]; +barrier.sync 0; +st.shared.u32 [r940], r664; +st.shared.u32 [r940+324], r758; +st.shared.u32 [r940+648], r795; +barrier.sync 0; +ld.shared.u32 r828, [r912]; +ld.shared.u32 r825, 
[r912+972]; +ld.shared.u32 r826, [r912+1944]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f62; +cvt.rn.f16.f32 high, f62; +mov.b32 r816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f64; +cvt.rn.f16.f32 high, f64; +mov.b32 r817, {low, high}; +} +{ +add.f16x2 r818, r819, r820; +} +{ +add.f16x2 %0, r822, r818; +} +{ +add.f16x2 r824, r825, r826; +} +{ +add.f16x2 %1, r828, r824; +} +{ +add.f16x2 r830, r819, r820; +} +{ +mul.f16x2 r833, r830, r816; +} +{ +add.f16x2 r836, r822, r833; +} +{ +sub.f16x2 r839, r825, r826; +} +{ +mul.f16x2 r842, r839, r817; +} +{ +add.f16x2 %2, r836, r842; +} +{ +add.f16x2 r848, r819, r820; +} +{ +mul.f16x2 r851, r848, r816; +} +{ +add.f16x2 r854, r822, r851; +} +{ +sub.f16x2 r857, r825, r826; +} +{ +mul.f16x2 r860, r857, r817; +} +{ +sub.f16x2 %4, r854, r860; +} +{ +add.f16x2 r866, r825, r826; +} +{ +mul.f16x2 r869, r866, r816; +} +{ +add.f16x2 r872, r828, r869; +} +{ +sub.f16x2 r875, r819, r820; +} +{ +mul.f16x2 r878, r875, r817; +} +{ +sub.f16x2 %3, r872, r878; +} +{ +add.f16x2 r884, r825, r826; +} +{ +mul.f16x2 r887, r884, r816; +} +{ +add.f16x2 r890, r828, r887; +} +{ +sub.f16x2 r893, r819, r820; +} +{ +mul.f16x2 r896, r893, r817; +} +{ +add.f16x2 %5, r890, r896; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..bf33d4b874487 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_fwd.hpp.inc @@ -0,0 +1,4870 @@ +#ifndef CUFFTDX_FFT_729_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_729_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<138, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1886>; +.reg .b32 r<18>; +.reg .b64 rd<10>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 5832, r17; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1885, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1884, %57, f1885; +mul.f32 f119, f1885, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1883, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1882, %63, f1883; +mul.f32 f135, f1883, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1881, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1880, %69, f1881; +mul.f32 f151, f1881, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f1879, f133, 0f3F441B7D; +sub.f32 f159, f1879, f158; +mul.f32 f160, f139, 0f3F441B7D; 
+fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1877, f149, 0f3E31D0D4; +mul.f32 f1878, f155, 0fBF7C1C5C; +sub.f32 f164, f1877, f1878; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1875, f134, 0f3E31D0D4; +mul.f32 f1876, f140, 0fBF7C1C5C; +sub.f32 f169, f1875, f1876; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1873, f150, 0fBF708FB2; +mul.f32 f1874, f156, 0fBEAF1D44; +sub.f32 f174, f1873, f1874; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1872, f1882, f1880; +sub.f32 f183, f1882, f1880; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1871, f1884, f1872; +mul.f32 f187, f1872, 0f3F000000; +sub.f32 f188, f1884, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1870, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1869, f123, f1870; +mul.f32 f203, f1870, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1868, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1867, f124, f1868; +mul.f32 f219, f1868, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; 
+mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1864, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1862, %112, f1864; +mul.f32 f235, f1864, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1859, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1857, %115, f1859; +mul.f32 f251, f1859, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1854, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1852, %118, f1854; +mul.f32 f267, f1854, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1851, f249, 0f3F441B7D; +sub.f32 f275, f1851, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1850, f265, 0f3E31D0D4; +sub.f32 f280, f1850, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1848, f250, 0f3E31D0D4; +mul.f32 f1849, f256, 0fBF7C1C5C; +sub.f32 f285, f1848, f1849; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1846, f266, 0fBF708FB2; +mul.f32 f1847, f272, 0fBEAF1D44; +sub.f32 f290, f1846, f1847; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; 
+add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1845, f1857, f1852; +sub.f32 f299, f1857, f1852; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1844, f1862, f1845; +mul.f32 f303, f1845, 0f3F000000; +sub.f32 f304, f1862, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1843, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1842, f239, f1843; +mul.f32 f319, f1843, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1841, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1840, f240, f1841; +mul.f32 f335, f1841, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1837, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1835, %121, f1837; +mul.f32 f351, f1837, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1832, %123, %122; +sub.f32 f363, %123, 
%122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1830, %124, f1832; +mul.f32 f367, f1832, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1828, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1826, %126, f1828; +mul.f32 f383, f1828, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1825, f365, 0f3F441B7D; +sub.f32 f391, f1825, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1824, f381, 0f3E31D0D4; +sub.f32 f396, f1824, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1822, f366, 0f3E31D0D4; +mul.f32 f1823, f372, 0fBF7C1C5C; +sub.f32 f401, f1822, f1823; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1820, f382, 0fBF708FB2; +mul.f32 f1821, f388, 0fBEAF1D44; +sub.f32 f406, f1820, f1821; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1819, f1830, f1826; +sub.f32 f415, f1830, f1826; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1818, f1835, f1819; +mul.f32 f419, f1819, 0f3F000000; +sub.f32 f420, f1835, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 
f430, f349, f429; +add.f32 f1817, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1816, f355, f1817; +mul.f32 f435, f1817, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1815, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1814, f356, f1815; +mul.f32 f451, f1815, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1842, 0fBE6C2691; +mul.f32 f1813, f310, 0f3F791978; +sub.f32 f459, f1813, f458; +mul.f32 f460, f1842, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1811, f426, 0f3F64C51C; +mul.f32 f1812, f1816, 0fBEE5C902; +sub.f32 f464, f1811, f1812; +mul.f32 f465, f1816, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1809, f326, 0f3F64C51C; +mul.f32 f1810, f1840, 0fBEE5C902; +sub.f32 f469, f1809, f1810; +mul.f32 f470, f1840, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1807, f442, 0f3F18DF63; +mul.f32 f1808, f1814, 0fBF4D57F2; +sub.f32 f474, f1807, f1808; +mul.f32 f475, f1814, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1805, f301, 0f3F441B7D; +mul.f32 f1806, f307, 0fBF248DBB; +sub.f32 f479, f1805, f1806; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1804, f417, 0f3E31D0D4; +sub.f32 f484, f1804, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f1803, f317, 0f3F18DF63; +sub.f32 f489, f1803, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 
f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1802, f433, 0fBE92D7E0; +sub.f32 f494, f1802, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1801, f333, 0f3ECACAF8; +sub.f32 f499, f1801, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1800, f449, 0fBF2FAD88; +sub.f32 f504, f1800, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1799, f302, 0f3E31D0D4; +sub.f32 f509, f1799, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1797, f418, 0fBF708FB2; +mul.f32 f1798, f424, 0fBEAF1D44; +sub.f32 f514, f1797, f1798; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1795, f318, 0fBD6E2946; +mul.f32 f1796, f324, 0fBF7F9120; +sub.f32 f519, f1795, f1796; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1793, f434, 0fBF7E44DE; +mul.f32 f1794, f440, 0f3DEDC21F; +sub.f32 f524, f1793, f1794; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1792, f334, 0fBE92D7E0; +sub.f32 f529, f1792, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f1791, f450, 0fBF55E287; +sub.f32 f534, f1791, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1790, f1844, f1818; +sub.f32 f541, f1844, f1818; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1790, 0f3F000000; +sub.f32 f546, f1871, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, 
f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f1789, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1788, f1869, f1789; +mul.f32 f561, f1789, 0f3F000000; +sub.f32 f562, f1869, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1787, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1786, f1867, f1787; +mul.f32 f577, f1787, 0f3F000000; +sub.f32 f578, f1867, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1785, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1784, f191, f1785; +mul.f32 f593, f1785, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1783, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1782, f207, f1783; +mul.f32 f609, f1783, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1781, f501, f506; +sub.f32 f621, f501, f506; 
+mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f1780, f223, f1781; +mul.f32 f625, f1781, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f1779, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1778, f192, f1779; +mul.f32 f641, f1779, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1777, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1776, f208, f1777; +mul.f32 f657, f1777, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1775, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1774, f224, f1775; +mul.f32 f673, f1775, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mad.lo.s32 r12, r9, 5832, r3; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r11, 8; +add.s64 rd6, 
rd5, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f1788; +mul.f32 f685, f679, f1788; +mul.f32 f687, f680, f680; +mul.f32 f1773, f679, f679; +sub.f32 f688, f1773, f687; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f1786; +mul.f32 f693, f688, f1786; +mul.f32 f1771, f679, f688; +mul.f32 f1772, f680, f690; +sub.f32 f696, f1771, f1772; +mul.f32 f1770, f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f1784; +mul.f32 f701, f696, f1784; +mul.f32 f703, f680, f698; +mul.f32 f1769, f679, f696; +sub.f32 f704, f1769, f703; +mul.f32 f1768, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f1782; +mul.f32 f709, f704, f1782; +mul.f32 f1766, f679, f704; +mul.f32 f1767, f680, f706; +sub.f32 f712, f1766, f1767; +mul.f32 f1765, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f1780; +mul.f32 f717, f712, f1780; +mul.f32 f719, f680, f714; +mul.f32 f1764, f679, f712; +sub.f32 f720, f1764, f719; +mul.f32 f1763, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f1778; +mul.f32 f725, f720, f1778; +mul.f32 f727, f680, f722; +mul.f32 f1762, f679, f720; +sub.f32 f728, f1762, f727; +mul.f32 f1761, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f1776; +mul.f32 f733, f728, f1776; +mul.f32 f1759, f679, f728; +mul.f32 f1760, f680, f730; +sub.f32 f736, f1759, f1760; +mul.f32 f1758, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f1774; +mul.f32 f741, f736, f1774; +mul.f32 f743, f680, f738; +mul.f32 f1757, f679, f736; +sub.f32 f744, f1757, f743; +mul.f32 f1756, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f751, f680, f746; +mul.f32 f1755, f679, f744; +sub.f32 f752, f1755, 
f751; +mul.f32 f1754, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f1752, f679, f752; +mul.f32 f1753, f680, f754; +sub.f32 f760, f1752, f1753; +mul.f32 f1751, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f767, f680, f762; +mul.f32 f1750, f679, f760; +sub.f32 f768, f1750, f767; +mul.f32 f1749, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; +mul.f32 f773, f768, f597; +mul.f32 f1747, f679, f768; +mul.f32 f1748, f680, f770; +sub.f32 f776, f1747, f1748; +mul.f32 f1746, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1745, f679, f776; +sub.f32 f784, f1745, f783; +mul.f32 f1744, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f791, f680, f786; +mul.f32 f1743, f679, f784; +sub.f32 f792, f1743, f791; +mul.f32 f1742, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f1740, f679, f792; +mul.f32 f1741, f680, f794; +sub.f32 f800, f1740, f1741; +mul.f32 f1739, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f1738, f679, f800; +sub.f32 f808, f1738, f807; +mul.f32 f1737, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 f813, f808, f677; +mul.f32 f815, f680, f810; +mul.f32 f1736, f679, f808; +sub.f32 f816, f1736, f815; +mul.f32 f1735, f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f1733, 
f679, f816; +mul.f32 f1734, f680, f818; +sub.f32 f824, f1733, f1734; +mul.f32 f1732, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f831, f680, f826; +mul.f32 f1731, f679, f824; +sub.f32 f832, f1731, f831; +mul.f32 f1730, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, f582; +mul.f32 f837, f832, f582; +mul.f32 f1728, f679, f832; +mul.f32 f1729, f680, f834; +sub.f32 f840, f1728, f1729; +mul.f32 f1727, f832, f576; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1726, f679, f840; +sub.f32 f848, f1726, f847; +mul.f32 f1725, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f855, f680, f850; +mul.f32 f1724, f679, f848; +sub.f32 f856, f1724, f855; +mul.f32 f1723, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f1721, f679, f856; +mul.f32 f1722, f680, f858; +sub.f32 f864, f1721, f1722; +mul.f32 f1720, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1719, f679, f864; +sub.f32 f872, f1719, f871; +mul.f32 f1718, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f879, f680, f874; +mul.f32 f1717, f679, f872; +sub.f32 f880, f1717, f879; +mul.f32 f1716, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 f1715, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r13, r11, 216, r12; +add.f32 f886, f1871, f1790; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r13], 
{f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f1716, f684; +st.shared.v2.f32 [r13+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f1770, f692; +st.shared.v2.f32 [r13+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f1768, f700; +st.shared.v2.f32 [r13+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f1765, f708; +st.shared.v2.f32 [r13+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f1763, f716; +st.shared.v2.f32 [r13+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; +sub.f32 f899, f1761, f724; +st.shared.v2.f32 [r13+48], {f899, f898}; +sub.f32 f900, f1758, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r13+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f1756, f740; +st.shared.v2.f32 [r13+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f1754, f748; +st.shared.v2.f32 [r13+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f1751, f756; +st.shared.v2.f32 [r13+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f1749, f764; +st.shared.v2.f32 [r13+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f1746, f772; +st.shared.v2.f32 [r13+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f1744, f780; +st.shared.v2.f32 [r13+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f1742, f788; +st.shared.v2.f32 [r13+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f1739, f796; +st.shared.v2.f32 [r13+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f1737, f804; +st.shared.v2.f32 [r13+128], {f919, f918}; +fma.rn.f32 f920, f810, f671, f813; +sub.f32 f921, f1735, f812; +st.shared.v2.f32 [r13+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, f821; +sub.f32 f923, f1732, f820; +st.shared.v2.f32 [r13+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f1730, f828; 
+st.shared.v2.f32 [r13+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f1727, f836; +st.shared.v2.f32 [r13+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, f1725, f844; +st.shared.v2.f32 [r13+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f1723, f852; +st.shared.v2.f32 [r13+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f1720, f860; +st.shared.v2.f32 [r13+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f1718, f868; +st.shared.v2.f32 [r13+192], {f935, f934}; +fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f1715, f876; +st.shared.v2.f32 [r13+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r13+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r14, r11, -208, r13; +ld.shared.v2.f32 {f940, f941}, [r14]; +ld.shared.v2.f32 {f944, f945}, [r14+216]; +ld.shared.v2.f32 {f948, f949}, [r14+432]; +ld.shared.v2.f32 {f952, f953}, [r14+648]; +ld.shared.v2.f32 {f956, f957}, [r14+864]; +ld.shared.v2.f32 {f960, f961}, [r14+1080]; +ld.shared.v2.f32 {f964, f965}, [r14+1296]; +ld.shared.v2.f32 {f968, f969}, [r14+1512]; +ld.shared.v2.f32 {f972, f973}, [r14+1728]; +ld.shared.v2.f32 {f976, f977}, [r14+1944]; +ld.shared.v2.f32 {f980, f981}, [r14+2160]; +ld.shared.v2.f32 {f984, f985}, [r14+2376]; +ld.shared.v2.f32 {f988, f989}, [r14+2592]; +ld.shared.v2.f32 {f992, f993}, [r14+2808]; +ld.shared.v2.f32 {f996, f997}, [r14+3024]; +ld.shared.v2.f32 {f1000, f1001}, [r14+3240]; +ld.shared.v2.f32 {f1004, f1005}, [r14+3456]; +ld.shared.v2.f32 {f1008, f1009}, [r14+3672]; +ld.shared.v2.f32 {f1012, f1013}, [r14+3888]; +ld.shared.v2.f32 {f1016, f1017}, [r14+4104]; +ld.shared.v2.f32 {f1020, f1021}, [r14+4320]; +ld.shared.v2.f32 {f1024, f1025}, [r14+4536]; +ld.shared.v2.f32 {f1028, f1029}, [r14+4752]; +ld.shared.v2.f32 {f1032, f1033}, [r14+4968]; +ld.shared.v2.f32 {f1036, f1037}, [r14+5184]; +ld.shared.v2.f32 {f1040, f1041}, [r14+5400]; 
+ld.shared.v2.f32 {f1044, f1045}, [r14+5616]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f1714, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0f3F5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f1713, f941, f1714; +mul.f32 f1058, f1714, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0f3F5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f1712, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0f3F5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f1711, f953, f1712; +mul.f32 f1074, f1712, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0f3F5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f1710, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f1709, f965, f1710; +mul.f32 f1090, f1710, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0f3F5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f1097, f1078, 0fBF248DBB; +mul.f32 f1708, f1072, 0f3F441B7D; +sub.f32 f1098, f1708, f1097; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0fBF248DBB, f1099; +mul.f32 f1102, f1094, 0fBF7C1C5C; +mul.f32 f1707, f1088, 0f3E31D0D4; +sub.f32 f1103, f1707, f1102; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0fBF7C1C5C, f1104; +mul.f32 f1705, f1073, 0f3E31D0D4; +mul.f32 f1706, f1079, 0fBF7C1C5C; +sub.f32 f1108, f1705, f1706; +mul.f32 
f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0fBF7C1C5C, f1109; +mul.f32 f1703, f1089, 0fBF708FB2; +mul.f32 f1704, f1095, 0fBEAF1D44; +sub.f32 f1113, f1703, f1704; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0fBEAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f1702, f1711, f1709; +sub.f32 f1122, f1711, f1709; +mul.f32 f1123, f1122, 0f3F5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f1701, f1713, f1702; +mul.f32 f1126, f1702, 0f3F000000; +sub.f32 f1127, f1713, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0f3F5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f1700, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0f3F5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f1699, f1062, f1700; +mul.f32 f1142, f1700, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0f3F5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f1698, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0f3F5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f1697, f1063, f1698; +mul.f32 f1158, f1698, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0f3F5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f1696, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0f3F5DB3D7; +add.f32 f1172, f1171, f1169; 
+sub.f32 f1173, f1169, f1171; +add.f32 f1695, f945, f1696; +mul.f32 f1174, f1696, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0f3F5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f1694, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0f3F5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f1693, f957, f1694; +mul.f32 f1190, f1694, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0f3F5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f1692, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0f3F5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f1691, f969, f1692; +mul.f32 f1206, f1692, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0f3F5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f1213, f1194, 0fBF248DBB; +mul.f32 f1690, f1188, 0f3F441B7D; +sub.f32 f1214, f1690, f1213; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0fBF248DBB, f1215; +mul.f32 f1218, f1210, 0fBF7C1C5C; +mul.f32 f1689, f1204, 0f3E31D0D4; +sub.f32 f1219, f1689, f1218; +mul.f32 f1220, f1210, 0f3E31D0D4; +fma.rn.f32 f1221, f1204, 0fBF7C1C5C, f1220; +mul.f32 f1223, f1195, 0fBF7C1C5C; +mul.f32 f1688, f1189, 0f3E31D0D4; +sub.f32 f1224, f1688, f1223; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0fBF7C1C5C, f1225; +mul.f32 f1686, f1205, 0fBF708FB2; +mul.f32 f1687, f1211, 0fBEAF1D44; +sub.f32 f1229, f1686, f1687; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0fBEAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 
f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f1685, f1693, f1691; +sub.f32 f1238, f1693, f1691; +mul.f32 f1239, f1238, 0f3F5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f1684, f1695, f1685; +mul.f32 f1242, f1685, 0f3F000000; +sub.f32 f1243, f1695, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0f3F5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f1683, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0f3F5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f1682, f1178, f1683; +mul.f32 f1258, f1683, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0f3F5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f1681, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0f3F5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f1680, f1179, f1681; +mul.f32 f1274, f1681, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0f3F5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; +sub.f32 f1285, f948, f1284; +add.f32 f1679, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0f3F5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f1678, f949, f1679; +mul.f32 f1290, f1679, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0f3F5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, 
f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f1677, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0f3F5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f1676, f961, f1677; +mul.f32 f1306, f1677, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0f3F5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f1675, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0f3F5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f1674, f973, f1675; +mul.f32 f1322, f1675, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0f3F5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0fBF248DBB; +mul.f32 f1673, f1304, 0f3F441B7D; +sub.f32 f1330, f1673, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0fBF248DBB, f1331; +mul.f32 f1334, f1326, 0fBF7C1C5C; +mul.f32 f1672, f1320, 0f3E31D0D4; +sub.f32 f1335, f1672, f1334; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0fBF7C1C5C, f1336; +mul.f32 f1339, f1311, 0fBF7C1C5C; +mul.f32 f1671, f1305, 0f3E31D0D4; +sub.f32 f1340, f1671, f1339; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0fBF7C1C5C, f1341; +mul.f32 f1669, f1321, 0fBF708FB2; +mul.f32 f1670, f1327, 0fBEAF1D44; +sub.f32 f1345, f1669, f1670; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0fBEAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f1668, f1676, f1674; +sub.f32 f1354, f1676, f1674; +mul.f32 f1355, f1354, 0f3F5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f1667, f1678, f1668; +mul.f32 f1358, f1668, 
0f3F000000; +sub.f32 f1359, f1678, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0f3F5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f1666, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0f3F5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f1665, f1294, f1666; +mul.f32 f1374, f1666, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0f3F5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f1664, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0f3F5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f1663, f1295, f1664; +mul.f32 f1390, f1664, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0f3F5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1661, f1249, 0f3F791978; +mul.f32 f1662, f1682, 0fBE6C2691; +sub.f32 f1398, f1661, f1662; +mul.f32 f1399, f1682, 0f3F791978; +fma.rn.f32 f1400, f1249, 0fBE6C2691, f1399; +mul.f32 f1402, f1665, 0fBEE5C902; +mul.f32 f1660, f1365, 0f3F64C51C; +sub.f32 f1403, f1660, f1402; +mul.f32 f1404, f1665, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0fBEE5C902, f1404; +mul.f32 f1407, f1680, 0fBEE5C902; +mul.f32 f1659, f1265, 0f3F64C51C; +sub.f32 f1408, f1659, f1407; +mul.f32 f1409, f1680, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0fBEE5C902, f1409; +mul.f32 f1412, f1663, 0fBF4D57F2; +mul.f32 f1658, f1381, 0f3F18DF63; +sub.f32 f1413, f1658, f1412; +mul.f32 f1414, f1663, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0fBF4D57F2, f1414; +mul.f32 f1417, f1246, 0fBF248DBB; +mul.f32 f1657, f1240, 0f3F441B7D; +sub.f32 f1418, f1657, f1417; +mul.f32 
f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0fBF248DBB, f1419; +mul.f32 f1422, f1362, 0fBF7C1C5C; +mul.f32 f1656, f1356, 0f3E31D0D4; +sub.f32 f1423, f1656, f1422; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0fBF7C1C5C, f1424; +mul.f32 f1654, f1256, 0f3F18DF63; +mul.f32 f1655, f1262, 0fBF4D57F2; +sub.f32 f1428, f1654, f1655; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0fBF4D57F2, f1429; +mul.f32 f1652, f1372, 0fBE92D7E0; +mul.f32 f1653, f1378, 0fBF753ECD; +sub.f32 f1433, f1652, f1653; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0fBF753ECD, f1434; +mul.f32 f1650, f1272, 0f3ECACAF8; +mul.f32 f1651, f1278, 0fBF6B1036; +sub.f32 f1438, f1650, f1651; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0fBF6B1036, f1439; +mul.f32 f1648, f1388, 0fBF2FAD88; +mul.f32 f1649, f1394, 0fBF3A3529; +sub.f32 f1443, f1648, f1649; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0fBF3A3529, f1444; +mul.f32 f1447, f1247, 0fBF7C1C5C; +mul.f32 f1647, f1241, 0f3E31D0D4; +sub.f32 f1448, f1647, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0fBF7C1C5C, f1449; +mul.f32 f1452, f1363, 0fBEAF1D44; +mul.f32 f1646, f1357, 0fBF708FB2; +sub.f32 f1453, f1646, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0fBEAF1D44, f1454; +mul.f32 f1457, f1263, 0fBF7F9120; +mul.f32 f1645, f1257, 0fBD6E2946; +sub.f32 f1458, f1645, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0fBF7F9120, f1459; +mul.f32 f1462, f1379, 0f3DEDC21F; +mul.f32 f1644, f1373, 0fBF7E44DE; +sub.f32 f1463, f1644, f1462; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0f3DEDC21F, f1464; +mul.f32 f1467, f1279, 0fBF753ECD; +mul.f32 f1643, f1273, 0fBE92D7E0; +sub.f32 f1468, f1643, f1467; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0fBF753ECD, f1469; +mul.f32 f1641, f1389, 0fBF55E287; +mul.f32 f1642, f1395, 0f3F0CAC9F; +sub.f32 f1473, f1641, f1642; +mul.f32 f1474, f1395, 
0fBF55E287; +fma.rn.f32 f1475, f1389, 0f3F0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f1640, f1684, f1667; +sub.f32 f1480, f1684, f1667; +mul.f32 f1481, f1480, 0f3F5DB3D7; +mul.f32 f1482, f1640, 0f3F000000; +sub.f32 f1483, f1701, f1482; +sub.f32 f1484, f1233, f1349; +mul.f32 f1485, f1484, 0f3F5DB3D7; +add.f32 f1486, f1398, f1403; +mul.f32 f1488, f1486, 0f3F000000; +sub.f32 f1489, f1133, f1488; +add.f32 f1639, f1400, f1405; +sub.f32 f1490, f1400, f1405; +mul.f32 f1491, f1490, 0f3F5DB3D7; +mul.f32 f1492, f1639, 0f3F000000; +sub.f32 f1493, f1699, f1492; +sub.f32 f1494, f1398, f1403; +mul.f32 f1495, f1494, 0f3F5DB3D7; +add.f32 f1496, f1408, f1413; +mul.f32 f1498, f1496, 0f3F000000; +sub.f32 f1499, f1149, f1498; +add.f32 f1638, f1410, f1415; +sub.f32 f1500, f1410, f1415; +mul.f32 f1501, f1500, 0f3F5DB3D7; +mul.f32 f1502, f1638, 0f3F000000; +sub.f32 f1503, f1697, f1502; +sub.f32 f1504, f1408, f1413; +mul.f32 f1505, f1504, 0f3F5DB3D7; +add.f32 f1506, f1418, f1423; +mul.f32 f1508, f1506, 0f3F000000; +sub.f32 f1509, f1124, f1508; +add.f32 f1637, f1420, f1425; +sub.f32 f1510, f1420, f1425; +mul.f32 f1511, f1510, 0f3F5DB3D7; +mul.f32 f1512, f1637, 0f3F000000; +sub.f32 f1513, f1130, f1512; +sub.f32 f1514, f1418, f1423; +mul.f32 f1515, f1514, 0f3F5DB3D7; +add.f32 f1516, f1428, f1433; +mul.f32 f1518, f1516, 0f3F000000; +sub.f32 f1519, f1140, f1518; +add.f32 f1636, f1430, f1435; +sub.f32 f1520, f1430, f1435; +mul.f32 f1521, f1520, 0f3F5DB3D7; +mul.f32 f1522, f1636, 0f3F000000; +sub.f32 f1523, f1146, f1522; +sub.f32 f1524, f1428, f1433; +mul.f32 f1525, f1524, 0f3F5DB3D7; +add.f32 f1526, f1438, f1443; +mul.f32 f1528, f1526, 0f3F000000; +sub.f32 f1529, f1156, f1528; +add.f32 f1635, f1440, f1445; +sub.f32 f1530, f1440, f1445; +mul.f32 f1531, f1530, 0f3F5DB3D7; +mul.f32 f1532, f1635, 0f3F000000; +sub.f32 f1533, f1162, f1532; +sub.f32 f1534, f1438, f1443; +mul.f32 f1535, f1534, 0f3F5DB3D7; +add.f32 f1536, f1448, 
f1453; +mul.f32 f1538, f1536, 0f3F000000; +sub.f32 f1539, f1125, f1538; +add.f32 f1634, f1450, f1455; +sub.f32 f1540, f1450, f1455; +mul.f32 f1541, f1540, 0f3F5DB3D7; +mul.f32 f1542, f1634, 0f3F000000; +sub.f32 f1543, f1131, f1542; +sub.f32 f1544, f1448, f1453; +mul.f32 f1545, f1544, 0f3F5DB3D7; +add.f32 f1546, f1458, f1463; +mul.f32 f1548, f1546, 0f3F000000; +sub.f32 f1549, f1141, f1548; +add.f32 f1633, f1460, f1465; +sub.f32 f1550, f1460, f1465; +mul.f32 f1551, f1550, 0f3F5DB3D7; +mul.f32 f1552, f1633, 0f3F000000; +sub.f32 f1553, f1147, f1552; +sub.f32 f1554, f1458, f1463; +mul.f32 f1555, f1554, 0f3F5DB3D7; +add.f32 f1556, f1468, f1473; +mul.f32 f1558, f1556, 0f3F000000; +sub.f32 f1559, f1157, f1558; +add.f32 f1632, f1470, f1475; +sub.f32 f1560, f1470, f1475; +mul.f32 f1561, f1560, 0f3F5DB3D7; +mul.f32 f1562, f1632, 0f3F000000; +sub.f32 f1563, f1163, f1562; +sub.f32 f1564, f1468, f1473; +mul.f32 f1565, f1564, 0f3F5DB3D7; +add.f32 %1, f1701, f1640; +add.f32 %0, f1117, f1476; +add.f32 %3, f1699, f1639; +add.f32 %2, f1133, f1486; +add.f32 %5, f1697, f1638; +add.f32 %4, f1149, f1496; +add.f32 %7, f1130, f1637; +add.f32 %6, f1124, f1506; +add.f32 %9, f1146, f1636; +add.f32 %8, f1140, f1516; +add.f32 %11, f1162, f1635; +add.f32 %10, f1156, f1526; +add.f32 %13, f1131, f1634; +add.f32 %12, f1125, f1536; +add.f32 %15, f1147, f1633; +add.f32 %14, f1141, f1546; +add.f32 %17, f1163, f1632; +add.f32 %16, f1157, f1556; +add.f32 %18, f1481, f1479; +sub.f32 %19, f1483, f1485; +add.f32 %20, f1491, f1489; +sub.f32 %21, f1493, f1495; +sub.f32 %23, f1503, f1505; +add.f32 %22, f1501, f1499; +sub.f32 %25, f1513, f1515; +add.f32 %24, f1511, f1509; +sub.f32 %27, f1523, f1525; +add.f32 %26, f1521, f1519; +add.f32 %28, f1531, f1529; +sub.f32 %29, f1533, f1535; +add.f32 %30, f1541, f1539; +sub.f32 %31, f1543, f1545; +add.f32 %32, f1551, f1549; +sub.f32 %33, f1553, f1555; +add.f32 %34, f1561, f1559; +sub.f32 %35, f1563, f1565; +add.f32 %37, f1485, f1483; +sub.f32 %36, f1479, f1481; +add.f32 
%39, f1495, f1493; +sub.f32 %38, f1489, f1491; +add.f32 %41, f1505, f1503; +sub.f32 %40, f1499, f1501; +add.f32 %43, f1515, f1513; +sub.f32 %42, f1509, f1511; +add.f32 %45, f1525, f1523; +sub.f32 %44, f1519, f1521; +add.f32 %47, f1535, f1533; +sub.f32 %46, f1529, f1531; +add.f32 %49, f1545, f1543; +sub.f32 %48, f1539, f1541; +add.f32 %51, f1555, f1553; +sub.f32 %50, f1549, f1551; +add.f32 %53, f1565, f1563; +sub.f32 %52, f1559, f1561; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), 
"f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<139, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1861>; +.reg .b32 r<18>; +.reg .b64 rd<10>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 2916, r17; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1852, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1851, %57, f1852; +mul.f32 f119, f1852, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1850, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1849, %63, f1850; +mul.f32 f135, f1850, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1848, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, 
f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1847, %69, f1848; +mul.f32 f151, f1848, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f1846, f133, 0f3F441B7D; +sub.f32 f159, f1846, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1844, f149, 0f3E31D0D4; +mul.f32 f1845, f155, 0fBF7C1C5C; +sub.f32 f164, f1844, f1845; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1842, f134, 0f3E31D0D4; +mul.f32 f1843, f140, 0fBF7C1C5C; +sub.f32 f169, f1842, f1843; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1840, f150, 0fBF708FB2; +mul.f32 f1841, f156, 0fBEAF1D44; +sub.f32 f174, f1840, f1841; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1839, f1849, f1847; +sub.f32 f183, f1849, f1847; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1838, f1851, f1839; +mul.f32 f187, f1839, 0f3F000000; +sub.f32 f188, f1851, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1837, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1836, f123, f1837; +mul.f32 f203, f1837, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; 
+add.f32 f1835, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1834, f124, f1835; +mul.f32 f219, f1835, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1831, %110, %111; +sub.f32 f231, %110, %111; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1829, %112, f1831; +mul.f32 f235, f1831, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1826, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1824, %115, f1826; +mul.f32 f251, f1826, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1821, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1819, %118, f1821; +mul.f32 f267, f1821, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1818, f249, 0f3F441B7D; +sub.f32 f275, f1818, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1817, f265, 0f3E31D0D4; +sub.f32 f280, f1817, f279; +mul.f32 f281, f271, 
0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1815, f250, 0f3E31D0D4; +mul.f32 f1816, f256, 0fBF7C1C5C; +sub.f32 f285, f1815, f1816; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1813, f266, 0fBF708FB2; +mul.f32 f1814, f272, 0fBEAF1D44; +sub.f32 f290, f1813, f1814; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1812, f1824, f1819; +sub.f32 f299, f1824, f1819; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1811, f1829, f1812; +mul.f32 f303, f1812, 0f3F000000; +sub.f32 f304, f1829, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1810, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1809, f239, f1810; +mul.f32 f319, f1810, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1808, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1807, f240, f1808; +mul.f32 f335, f1808, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1804, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, 
f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1802, %121, f1804; +mul.f32 f351, f1804, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1799, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1797, %124, f1799; +mul.f32 f367, f1799, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1795, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1793, %126, f1795; +mul.f32 f383, f1795, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1792, f365, 0f3F441B7D; +sub.f32 f391, f1792, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1791, f381, 0f3E31D0D4; +sub.f32 f396, f1791, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1789, f366, 0f3E31D0D4; +mul.f32 f1790, f372, 0fBF7C1C5C; +sub.f32 f401, f1789, f1790; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1787, f382, 0fBF708FB2; +mul.f32 f1788, f388, 0fBEAF1D44; +sub.f32 f406, f1787, f1788; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1786, f1797, f1793; +sub.f32 f415, 
f1797, f1793; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1785, f1802, f1786; +mul.f32 f419, f1786, 0f3F000000; +sub.f32 f420, f1802, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1784, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1783, f355, f1784; +mul.f32 f435, f1784, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1782, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1781, f356, f1782; +mul.f32 f451, f1782, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1809, 0fBE6C2691; +mul.f32 f1780, f310, 0f3F791978; +sub.f32 f459, f1780, f458; +mul.f32 f460, f1809, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1778, f426, 0f3F64C51C; +mul.f32 f1779, f1783, 0fBEE5C902; +sub.f32 f464, f1778, f1779; +mul.f32 f465, f1783, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1776, f326, 0f3F64C51C; +mul.f32 f1777, f1807, 0fBEE5C902; +sub.f32 f469, f1776, f1777; +mul.f32 f470, f1807, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1774, f442, 0f3F18DF63; +mul.f32 f1775, f1781, 0fBF4D57F2; +sub.f32 f474, f1774, f1775; +mul.f32 f475, f1781, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1772, f301, 0f3F441B7D; +mul.f32 f1773, f307, 0fBF248DBB; +sub.f32 
f479, f1772, f1773; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1771, f417, 0f3E31D0D4; +sub.f32 f484, f1771, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f1770, f317, 0f3F18DF63; +sub.f32 f489, f1770, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1769, f433, 0fBE92D7E0; +sub.f32 f494, f1769, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1768, f333, 0f3ECACAF8; +sub.f32 f499, f1768, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1767, f449, 0fBF2FAD88; +sub.f32 f504, f1767, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1766, f302, 0f3E31D0D4; +sub.f32 f509, f1766, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1764, f418, 0fBF708FB2; +mul.f32 f1765, f424, 0fBEAF1D44; +sub.f32 f514, f1764, f1765; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1762, f318, 0fBD6E2946; +mul.f32 f1763, f324, 0fBF7F9120; +sub.f32 f519, f1762, f1763; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1760, f434, 0fBF7E44DE; +mul.f32 f1761, f440, 0f3DEDC21F; +sub.f32 f524, f1760, f1761; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1759, f334, 0fBE92D7E0; +sub.f32 f529, f1759, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f1758, f450, 0fBF55E287; +sub.f32 f534, f1758, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; 
+add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1757, f1811, f1785; +sub.f32 f543, f1811, f1785; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1756, f1838, f1757; +mul.f32 f547, f1757, 0f3F000000; +sub.f32 f548, f1838, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1755, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1754, f1836, f1755; +mul.f32 f563, f1755, 0f3F000000; +sub.f32 f564, f1836, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1753, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1752, f1834, f1753; +mul.f32 f579, f1753, 0f3F000000; +sub.f32 f580, f1834, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1751, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1750, f191, f1751; +mul.f32 f595, f1751, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1749, f491, f496; +sub.f32 f607, f491, f496; 
+mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1748, f207, f1749; +mul.f32 f611, f1749, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1747, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1746, f223, f1747; +mul.f32 f627, f1747, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1745, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1744, f192, f1745; +mul.f32 f643, f1745, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1743, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1742, f208, f1743; +mul.f32 f659, f1743, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1741, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1740, f224, f1741; +mul.f32 f675, f1741, 0f3F000000; +sub.f32 
f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mul.wide.u32 rd7, r11, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f1738, f681, f554; +mul.f32 f1739, f682, f1754; +sub.f32 f687, f1738, f1739; +mul.f32 f688, f681, f1754; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f1736, f681, f681; +mul.f32 f1737, f682, f682; +sub.f32 f692, f1736, f1737; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f1734, f692, f570; +mul.f32 f1735, f694, f1752; +sub.f32 f697, f1734, f1735; +mul.f32 f698, f692, f1752; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f1733, f681, f692; +sub.f32 f702, f1733, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f1750; +mul.f32 f1732, f702, f586; +sub.f32 f707, f1732, f706; +mul.f32 f708, f702, f1750; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f711, f682, f704; +mul.f32 f1731, f681, f702; +sub.f32 f712, f1731, f711; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f716, f714, f1748; +mul.f32 f1730, f712, f602; +sub.f32 f717, f1730, f716; +mul.f32 f718, f712, f1748; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f721, f682, f714; +mul.f32 f1729, f681, f712; +sub.f32 f722, f1729, f721; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f1727, f722, f618; +mul.f32 f1728, f724, f1746; +sub.f32 f727, f1727, f1728; +mul.f32 f728, f722, f1746; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f1725, f681, f722; +mul.f32 f1726, f682, f724; +sub.f32 f732, f1725, f1726; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f1723, f732, f634; +mul.f32 f1724, 
f734, f1744; +sub.f32 f737, f1723, f1724; +mul.f32 f738, f732, f1744; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f1721, f681, f732; +mul.f32 f1722, f682, f734; +sub.f32 f742, f1721, f1722; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f1742; +mul.f32 f1720, f742, f650; +sub.f32 f747, f1720, f746; +mul.f32 f748, f742, f1742; +fma.rn.f32 f749, f744, f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f1719, f681, f742; +sub.f32 f752, f1719, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f1740; +mul.f32 f1718, f752, f666; +sub.f32 f757, f1718, f756; +mul.f32 f758, f752, f1740; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f761, f682, f754; +mul.f32 f1717, f681, f752; +sub.f32 f762, f1717, f761; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f766, f764, f551; +mul.f32 f1716, f762, f545; +sub.f32 f767, f1716, f766; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f1714, f681, f762; +mul.f32 f1715, f682, f764; +sub.f32 f772, f1714, f1715; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f1712, f772, f561; +mul.f32 f1713, f774, f567; +sub.f32 f777, f1712, f1713; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f1710, f681, f772; +mul.f32 f1711, f682, f774; +sub.f32 f782, f1710, f1711; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f1708, f782, f577; +mul.f32 f1709, f784, f583; +sub.f32 f787, f1708, f1709; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 f1707, f681, f782; +sub.f32 f792, f1707, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f1706, f792, f593; +sub.f32 f797, f1706, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f1705, f681, f792; +sub.f32 f802, f1705, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, 
f682, f792, f803; +mul.f32 f806, f804, f615; +mul.f32 f1704, f802, f609; +sub.f32 f807, f1704, f806; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f811, f682, f804; +mul.f32 f1703, f681, f802; +sub.f32 f812, f1703, f811; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f816, f814, f631; +mul.f32 f1702, f812, f625; +sub.f32 f817, f1702, f816; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f1700, f681, f812; +mul.f32 f1701, f682, f814; +sub.f32 f822, f1700, f1701; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f1698, f822, f641; +mul.f32 f1699, f824, f647; +sub.f32 f827, f1698, f1699; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f1696, f681, f822; +mul.f32 f1697, f682, f824; +sub.f32 f832, f1696, f1697; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f1695, f832, f657; +sub.f32 f837, f1695, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f1694, f681, f832; +sub.f32 f842, f1694, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f1693, f842, f673; +sub.f32 f847, f1693, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f851, f682, f844; +mul.f32 f1692, f681, f842; +sub.f32 f852, f1692, f851; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f856, f854, f552; +mul.f32 f1691, f852, f546; +sub.f32 f857, f1691, f856; +mul.f32 f858, f852, f552; +fma.rn.f32 f859, f854, f546, f858; +mul.f32 f861, f682, f854; +mul.f32 f1690, f681, f852; +sub.f32 f862, f1690, f861; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f1688, f862, f562; +mul.f32 f1689, f864, f568; +sub.f32 f867, f1688, f1689; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, f562, f868; +mul.f32 f1686, f681, f862; +mul.f32 f1687, f682, f864; +sub.f32 f872, 
f1686, f1687; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f1684, f872, f578; +mul.f32 f1685, f874, f584; +sub.f32 f877, f1684, f1685; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f1682, f681, f872; +mul.f32 f1683, f682, f874; +sub.f32 f882, f1682, f1683; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, f600; +mul.f32 f1681, f882, f594; +sub.f32 f887, f1681, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f1680, f681, f882; +sub.f32 f892, f1680, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f896, f894, f616; +mul.f32 f1679, f892, f610; +sub.f32 f897, f1679, f896; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f901, f682, f894; +mul.f32 f1678, f681, f892; +sub.f32 f902, f1678, f901; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f906, f904, f632; +mul.f32 f1677, f902, f626; +sub.f32 f907, f1677, f906; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, f908; +mul.f32 f1675, f681, f902; +mul.f32 f1676, f682, f904; +sub.f32 f912, f1675, f1676; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f1673, f912, f642; +mul.f32 f1674, f914, f648; +sub.f32 f917, f1673, f1674; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f1671, f681, f912; +mul.f32 f1672, f682, f914; +sub.f32 f922, f1671, f1672; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f1669, f922, f658; +mul.f32 f1670, f924, f664; +sub.f32 f927, f1669, f1670; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f1668, f681, f922; +sub.f32 f932, f1668, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f1667, f932, f674; +sub.f32 f937, f1667, f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 
r12, r9, 2916, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 108, r12; +st.shared.f32 [r13], f538; +st.shared.f32 [r13+4], f687; +st.shared.f32 [r13+8], f697; +st.shared.f32 [r13+12], f707; +st.shared.f32 [r13+16], f717; +st.shared.f32 [r13+20], f727; +st.shared.f32 [r13+24], f737; +st.shared.f32 [r13+28], f747; +st.shared.f32 [r13+32], f757; +st.shared.f32 [r13+36], f767; +st.shared.f32 [r13+40], f777; +st.shared.f32 [r13+44], f787; +st.shared.f32 [r13+48], f797; +st.shared.f32 [r13+52], f807; +st.shared.f32 [r13+56], f817; +st.shared.f32 [r13+60], f827; +st.shared.f32 [r13+64], f837; +st.shared.f32 [r13+68], f847; +st.shared.f32 [r13+72], f857; +st.shared.f32 [r13+76], f867; +st.shared.f32 [r13+80], f877; +st.shared.f32 [r13+84], f887; +st.shared.f32 [r13+88], f897; +st.shared.f32 [r13+92], f907; +st.shared.f32 [r13+96], f917; +st.shared.f32 [r13+100], f927; +st.shared.f32 [r13+104], f937; +barrier.sync 0; +mad.lo.s32 r14, r11, -104, r13; +ld.shared.f32 f940, [r14]; +ld.shared.f32 f941, [r14+108]; +ld.shared.f32 f942, [r14+216]; +ld.shared.f32 f943, [r14+324]; +ld.shared.f32 f944, [r14+432]; +ld.shared.f32 f945, [r14+540]; +ld.shared.f32 f946, [r14+648]; +ld.shared.f32 f947, [r14+756]; +ld.shared.f32 f948, [r14+864]; +ld.shared.f32 f949, [r14+972]; +ld.shared.f32 f950, [r14+1080]; +ld.shared.f32 f951, [r14+1188]; +ld.shared.f32 f952, [r14+1296]; +ld.shared.f32 f953, [r14+1404]; +ld.shared.f32 f954, [r14+1512]; +ld.shared.f32 f955, [r14+1620]; +ld.shared.f32 f956, [r14+1728]; +ld.shared.f32 f957, [r14+1836]; +ld.shared.f32 f958, [r14+1944]; +ld.shared.f32 f959, [r14+2052]; +ld.shared.f32 f960, [r14+2160]; +ld.shared.f32 f961, [r14+2268]; +ld.shared.f32 f962, [r14+2376]; +ld.shared.f32 f963, [r14+2484]; +ld.shared.f32 f964, [r14+2592]; +ld.shared.f32 f965, [r14+2700]; +ld.shared.f32 f966, [r14+2808]; +barrier.sync 0; +st.shared.f32 [r13], f1756; +st.shared.f32 [r13+4], f689; +st.shared.f32 [r13+8], f699; +st.shared.f32 [r13+12], f709; +st.shared.f32 [r13+16], f719; 
+st.shared.f32 [r13+20], f729; +st.shared.f32 [r13+24], f739; +st.shared.f32 [r13+28], f749; +st.shared.f32 [r13+32], f759; +st.shared.f32 [r13+36], f769; +st.shared.f32 [r13+40], f779; +st.shared.f32 [r13+44], f789; +st.shared.f32 [r13+48], f799; +st.shared.f32 [r13+52], f809; +st.shared.f32 [r13+56], f819; +st.shared.f32 [r13+60], f829; +st.shared.f32 [r13+64], f839; +st.shared.f32 [r13+68], f849; +st.shared.f32 [r13+72], f859; +st.shared.f32 [r13+76], f869; +st.shared.f32 [r13+80], f879; +st.shared.f32 [r13+84], f889; +st.shared.f32 [r13+88], f899; +st.shared.f32 [r13+92], f909; +st.shared.f32 [r13+96], f919; +st.shared.f32 [r13+100], f929; +st.shared.f32 [r13+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r14]; +ld.shared.f32 f968, [r14+108]; +ld.shared.f32 f969, [r14+216]; +ld.shared.f32 f970, [r14+324]; +ld.shared.f32 f971, [r14+432]; +ld.shared.f32 f972, [r14+540]; +ld.shared.f32 f973, [r14+648]; +ld.shared.f32 f974, [r14+756]; +ld.shared.f32 f975, [r14+864]; +ld.shared.f32 f976, [r14+972]; +ld.shared.f32 f977, [r14+1080]; +ld.shared.f32 f978, [r14+1188]; +ld.shared.f32 f979, [r14+1296]; +ld.shared.f32 f980, [r14+1404]; +ld.shared.f32 f981, [r14+1512]; +ld.shared.f32 f982, [r14+1620]; +ld.shared.f32 f983, [r14+1728]; +ld.shared.f32 f984, [r14+1836]; +ld.shared.f32 f985, [r14+1944]; +ld.shared.f32 f986, [r14+2052]; +ld.shared.f32 f987, [r14+2160]; +ld.shared.f32 f988, [r14+2268]; +ld.shared.f32 f989, [r14+2376]; +ld.shared.f32 f990, [r14+2484]; +ld.shared.f32 f991, [r14+2592]; +ld.shared.f32 f992, [r14+2700]; +ld.shared.f32 f993, [r14+2808]; +add.f32 f994, f949, f958; +add.f32 f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +add.f32 f1666, f976, f985; +sub.f32 f1000, f976, f985; +mul.f32 f1001, f1000, 0f3F5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +add.f32 f1665, f967, f1666; +mul.f32 f1004, f1666, 0f3F000000; +sub.f32 f1005, f967, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0f3F5DB3D7; 
+sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +add.f32 f1664, f979, f988; +sub.f32 f1016, f979, f988; +mul.f32 f1017, f1016, 0f3F5DB3D7; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f1663, f970, f1664; +mul.f32 f1020, f1664, 0f3F000000; +sub.f32 f1021, f970, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0f3F5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +sub.f32 f1031, f946, f1030; +add.f32 f1662, f982, f991; +sub.f32 f1032, f982, f991; +mul.f32 f1033, f1032, 0f3F5DB3D7; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f1661, f973, f1662; +mul.f32 f1036, f1662, 0f3F000000; +sub.f32 f1037, f973, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0f3F5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f1659, f1018, 0f3F441B7D; +mul.f32 f1660, f1024, 0fBF248DBB; +sub.f32 f1044, f1659, f1660; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0fBF248DBB, f1045; +mul.f32 f1657, f1034, 0f3E31D0D4; +mul.f32 f1658, f1040, 0fBF7C1C5C; +sub.f32 f1049, f1657, f1658; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0fBF7C1C5C, f1050; +mul.f32 f1655, f1019, 0f3E31D0D4; +mul.f32 f1656, f1025, 0fBF7C1C5C; +sub.f32 f1054, f1655, f1656; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0fBF7C1C5C, f1055; +mul.f32 f1653, f1035, 0fBF708FB2; +mul.f32 f1654, f1041, 0fBEAF1D44; +sub.f32 f1059, f1653, f1654; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0fBEAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f1652, f1663, f1661; +sub.f32 f1068, f1663, f1661; +mul.f32 f1069, f1068, 0f3F5DB3D7; +add.f32 f1070, f1069, 
f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f1651, f1665, f1652; +mul.f32 f1072, f1652, 0f3F000000; +sub.f32 f1073, f1665, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0f3F5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f1650, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0f3F5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f1649, f1008, f1650; +mul.f32 f1088, f1650, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0f3F5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f1648, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0f3F5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f1647, f1009, f1648; +mul.f32 f1104, f1648, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0f3F5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +sub.f32 f1115, f941, f1114; +add.f32 f1646, f977, f986; +sub.f32 f1116, f977, f986; +mul.f32 f1117, f1116, 0f3F5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +add.f32 f1645, f968, f1646; +mul.f32 f1120, f1646, 0f3F000000; +sub.f32 f1121, f968, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0f3F5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +sub.f32 f1131, f944, f1130; +add.f32 f1644, f980, f989; +sub.f32 f1132, f980, f989; +mul.f32 f1133, f1132, 0f3F5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 
f1135, f1131, f1133; +add.f32 f1643, f971, f1644; +mul.f32 f1136, f1644, 0f3F000000; +sub.f32 f1137, f971, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0f3F5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +sub.f32 f1147, f947, f1146; +add.f32 f1642, f983, f992; +sub.f32 f1148, f983, f992; +mul.f32 f1149, f1148, 0f3F5DB3D7; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +add.f32 f1641, f974, f1642; +mul.f32 f1152, f1642, 0f3F000000; +sub.f32 f1153, f974, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0f3F5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f1159, f1140, 0fBF248DBB; +mul.f32 f1640, f1134, 0f3F441B7D; +sub.f32 f1160, f1640, f1159; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0fBF248DBB, f1161; +mul.f32 f1638, f1150, 0f3E31D0D4; +mul.f32 f1639, f1156, 0fBF7C1C5C; +sub.f32 f1165, f1638, f1639; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0fBF7C1C5C, f1166; +mul.f32 f1636, f1135, 0f3E31D0D4; +mul.f32 f1637, f1141, 0fBF7C1C5C; +sub.f32 f1170, f1636, f1637; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0fBF7C1C5C, f1171; +mul.f32 f1634, f1151, 0fBF708FB2; +mul.f32 f1635, f1157, 0fBEAF1D44; +sub.f32 f1175, f1634, f1635; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0fBEAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f1633, f1643, f1641; +sub.f32 f1184, f1643, f1641; +mul.f32 f1185, f1184, 0f3F5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f1632, f1645, f1633; +mul.f32 f1188, f1633, 0f3F000000; +sub.f32 f1189, f1645, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0f3F5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, 
f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f1631, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0f3F5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f1630, f1124, f1631; +mul.f32 f1204, f1631, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0f3F5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f1629, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0f3F5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f1628, f1125, f1629; +mul.f32 f1220, f1629, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0f3F5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +sub.f32 f1231, f942, f1230; +add.f32 f1627, f978, f987; +sub.f32 f1232, f978, f987; +mul.f32 f1233, f1232, 0f3F5DB3D7; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f1626, f969, f1627; +mul.f32 f1236, f1627, 0f3F000000; +sub.f32 f1237, f969, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0f3F5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +sub.f32 f1247, f945, f1246; +add.f32 f1625, f981, f990; +sub.f32 f1248, f981, f990; +mul.f32 f1249, f1248, 0f3F5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +add.f32 f1624, f972, f1625; +mul.f32 f1252, f1625, 0f3F000000; +sub.f32 f1253, f972, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0f3F5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, 
f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +add.f32 f1623, f984, f993; +sub.f32 f1264, f984, f993; +mul.f32 f1265, f1264, 0f3F5DB3D7; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f1622, f975, f1623; +mul.f32 f1268, f1623, 0f3F000000; +sub.f32 f1269, f975, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0f3F5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f1275, f1256, 0fBF248DBB; +mul.f32 f1621, f1250, 0f3F441B7D; +sub.f32 f1276, f1621, f1275; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0fBF248DBB, f1277; +mul.f32 f1619, f1266, 0f3E31D0D4; +mul.f32 f1620, f1272, 0fBF7C1C5C; +sub.f32 f1281, f1619, f1620; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0fBF7C1C5C, f1282; +mul.f32 f1617, f1251, 0f3E31D0D4; +mul.f32 f1618, f1257, 0fBF7C1C5C; +sub.f32 f1286, f1617, f1618; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0fBF7C1C5C, f1287; +mul.f32 f1615, f1267, 0fBF708FB2; +mul.f32 f1616, f1273, 0fBEAF1D44; +sub.f32 f1291, f1615, f1616; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0fBEAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f1614, f1624, f1622; +sub.f32 f1300, f1624, f1622; +mul.f32 f1301, f1300, 0f3F5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f1613, f1626, f1614; +mul.f32 f1304, f1614, 0f3F000000; +sub.f32 f1305, f1626, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0f3F5DB3D7; +sub.f32 f1308, f1305, f1307; +add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f1612, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0f3F5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f1611, f1240, f1612; +mul.f32 f1320, f1612, 0f3F000000; +sub.f32 f1321, 
f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0f3F5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f1610, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0f3F5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f1609, f1241, f1610; +mul.f32 f1336, f1610, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0f3F5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f1630, 0fBE6C2691; +mul.f32 f1608, f1195, 0f3F791978; +sub.f32 f1344, f1608, f1343; +mul.f32 f1345, f1630, 0f3F791978; +fma.rn.f32 f1346, f1195, 0fBE6C2691, f1345; +mul.f32 f1348, f1611, 0fBEE5C902; +mul.f32 f1607, f1311, 0f3F64C51C; +sub.f32 f1349, f1607, f1348; +mul.f32 f1350, f1611, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0fBEE5C902, f1350; +mul.f32 f1353, f1628, 0fBEE5C902; +mul.f32 f1606, f1211, 0f3F64C51C; +sub.f32 f1354, f1606, f1353; +mul.f32 f1355, f1628, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0fBEE5C902, f1355; +mul.f32 f1358, f1609, 0fBF4D57F2; +mul.f32 f1605, f1327, 0f3F18DF63; +sub.f32 f1359, f1605, f1358; +mul.f32 f1360, f1609, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0fBF4D57F2, f1360; +mul.f32 f1603, f1186, 0f3F441B7D; +mul.f32 f1604, f1192, 0fBF248DBB; +sub.f32 f1364, f1603, f1604; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0fBF248DBB, f1365; +mul.f32 f1601, f1302, 0f3E31D0D4; +mul.f32 f1602, f1308, 0fBF7C1C5C; +sub.f32 f1369, f1601, f1602; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0fBF7C1C5C, f1370; +mul.f32 f1599, f1202, 0f3F18DF63; +mul.f32 f1600, f1208, 0fBF4D57F2; +sub.f32 f1374, f1599, f1600; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0fBF4D57F2, f1375; +mul.f32 f1597, f1318, 0fBE92D7E0; +mul.f32 f1598, f1324, 0fBF753ECD; +sub.f32 f1379, f1597, f1598; 
+mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0fBF753ECD, f1380; +mul.f32 f1383, f1224, 0fBF6B1036; +mul.f32 f1596, f1218, 0f3ECACAF8; +sub.f32 f1384, f1596, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0fBF6B1036, f1385; +mul.f32 f1388, f1340, 0fBF3A3529; +mul.f32 f1595, f1334, 0fBF2FAD88; +sub.f32 f1389, f1595, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0fBF3A3529, f1390; +mul.f32 f1393, f1193, 0fBF7C1C5C; +mul.f32 f1594, f1187, 0f3E31D0D4; +sub.f32 f1394, f1594, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0fBF7C1C5C, f1395; +mul.f32 f1398, f1309, 0fBEAF1D44; +mul.f32 f1593, f1303, 0fBF708FB2; +sub.f32 f1399, f1593, f1398; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0fBEAF1D44, f1400; +mul.f32 f1403, f1209, 0fBF7F9120; +mul.f32 f1592, f1203, 0fBD6E2946; +sub.f32 f1404, f1592, f1403; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0fBF7F9120, f1405; +mul.f32 f1590, f1319, 0fBF7E44DE; +mul.f32 f1591, f1325, 0f3DEDC21F; +sub.f32 f1409, f1590, f1591; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0f3DEDC21F, f1410; +mul.f32 f1588, f1219, 0fBE92D7E0; +mul.f32 f1589, f1225, 0fBF753ECD; +sub.f32 f1414, f1588, f1589; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0fBF753ECD, f1415; +mul.f32 f1586, f1335, 0fBF55E287; +mul.f32 f1587, f1341, 0f3F0CAC9F; +sub.f32 f1419, f1586, f1587; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0f3F0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +mul.f32 f1424, f1422, 0f3F000000; +sub.f32 f1425, f1063, f1424; +add.f32 f1585, f1632, f1613; +sub.f32 f1426, f1632, f1613; +mul.f32 f1427, f1426, 0f3F5DB3D7; +mul.f32 f1428, f1585, 0f3F000000; +sub.f32 f1429, f1651, f1428; +sub.f32 f1430, f1179, f1295; +mul.f32 f1431, f1430, 0f3F5DB3D7; +add.f32 f1432, f1344, f1349; +mul.f32 f1434, f1432, 0f3F000000; +sub.f32 f1435, f1079, f1434; +add.f32 f1584, f1346, f1351; +sub.f32 f1436, f1346, f1351; 
+mul.f32 f1437, f1436, 0f3F5DB3D7; +mul.f32 f1438, f1584, 0f3F000000; +sub.f32 f1439, f1649, f1438; +sub.f32 f1440, f1344, f1349; +mul.f32 f1441, f1440, 0f3F5DB3D7; +add.f32 f1442, f1354, f1359; +mul.f32 f1444, f1442, 0f3F000000; +sub.f32 f1445, f1095, f1444; +add.f32 f1583, f1356, f1361; +sub.f32 f1446, f1356, f1361; +mul.f32 f1447, f1446, 0f3F5DB3D7; +mul.f32 f1448, f1583, 0f3F000000; +sub.f32 f1449, f1647, f1448; +sub.f32 f1450, f1354, f1359; +mul.f32 f1451, f1450, 0f3F5DB3D7; +add.f32 f1452, f1364, f1369; +mul.f32 f1454, f1452, 0f3F000000; +sub.f32 f1455, f1070, f1454; +add.f32 f1582, f1366, f1371; +sub.f32 f1456, f1366, f1371; +mul.f32 f1457, f1456, 0f3F5DB3D7; +mul.f32 f1458, f1582, 0f3F000000; +sub.f32 f1459, f1076, f1458; +sub.f32 f1460, f1364, f1369; +mul.f32 f1461, f1460, 0f3F5DB3D7; +add.f32 f1462, f1374, f1379; +mul.f32 f1464, f1462, 0f3F000000; +sub.f32 f1465, f1086, f1464; +add.f32 f1581, f1376, f1381; +sub.f32 f1466, f1376, f1381; +mul.f32 f1467, f1466, 0f3F5DB3D7; +mul.f32 f1468, f1581, 0f3F000000; +sub.f32 f1469, f1092, f1468; +sub.f32 f1470, f1374, f1379; +mul.f32 f1471, f1470, 0f3F5DB3D7; +add.f32 f1472, f1384, f1389; +mul.f32 f1474, f1472, 0f3F000000; +sub.f32 f1475, f1102, f1474; +add.f32 f1580, f1386, f1391; +sub.f32 f1476, f1386, f1391; +mul.f32 f1477, f1476, 0f3F5DB3D7; +mul.f32 f1478, f1580, 0f3F000000; +sub.f32 f1479, f1108, f1478; +sub.f32 f1480, f1384, f1389; +mul.f32 f1481, f1480, 0f3F5DB3D7; +add.f32 f1482, f1394, f1399; +mul.f32 f1484, f1482, 0f3F000000; +sub.f32 f1485, f1071, f1484; +add.f32 f1579, f1396, f1401; +sub.f32 f1486, f1396, f1401; +mul.f32 f1487, f1486, 0f3F5DB3D7; +mul.f32 f1488, f1579, 0f3F000000; +sub.f32 f1489, f1077, f1488; +sub.f32 f1490, f1394, f1399; +mul.f32 f1491, f1490, 0f3F5DB3D7; +add.f32 f1492, f1404, f1409; +mul.f32 f1494, f1492, 0f3F000000; +sub.f32 f1495, f1087, f1494; +add.f32 f1578, f1406, f1411; +sub.f32 f1496, f1406, f1411; +mul.f32 f1497, f1496, 0f3F5DB3D7; +mul.f32 f1498, f1578, 0f3F000000; +sub.f32 
f1499, f1093, f1498; +sub.f32 f1500, f1404, f1409; +mul.f32 f1501, f1500, 0f3F5DB3D7; +add.f32 f1502, f1414, f1419; +mul.f32 f1504, f1502, 0f3F000000; +sub.f32 f1505, f1103, f1504; +add.f32 f1577, f1416, f1421; +sub.f32 f1506, f1416, f1421; +mul.f32 f1507, f1506, 0f3F5DB3D7; +mul.f32 f1508, f1577, 0f3F000000; +sub.f32 f1509, f1109, f1508; +sub.f32 f1510, f1414, f1419; +mul.f32 f1854, f1580, 0f3F000000; +sub.f32 f1853, f1108, f1854; +mul.f32 f1511, f1510, 0f3F5DB3D7; +add.f32 %0, f1063, f1422; +mul.f32 f1856, f1452, 0f3F000000; +sub.f32 f1855, f1070, f1856; +add.f32 %1, f1651, f1585; +mul.f32 f1858, f1581, 0f3F000000; +sub.f32 f1857, f1092, f1858; +mul.f32 f1860, f1452, 0f3F000000; +sub.f32 f1859, f1070, f1860; +add.f32 %3, f1649, f1584; +add.f32 %2, f1079, f1432; +add.f32 %5, f1647, f1583; +add.f32 %4, f1095, f1442; +add.f32 %7, f1076, f1582; +add.f32 %6, f1070, f1452; +add.f32 %9, f1092, f1581; +add.f32 %8, f1086, f1462; +add.f32 %11, f1108, f1580; +add.f32 %10, f1102, f1472; +add.f32 %13, f1077, f1579; +add.f32 %12, f1071, f1482; +add.f32 %15, f1093, f1578; +add.f32 %14, f1087, f1492; +add.f32 %17, f1109, f1577; +add.f32 %16, f1103, f1502; +sub.f32 %19, f1429, f1431; +add.f32 %18, f1427, f1425; +add.f32 %20, f1437, f1435; +sub.f32 %21, f1439, f1441; +add.f32 %22, f1447, f1445; +sub.f32 %23, f1449, f1451; +add.f32 %24, f1457, f1859; +sub.f32 %25, f1459, f1461; +sub.f32 %27, f1857, f1471; +add.f32 %26, f1467, f1465; +sub.f32 %29, f1853, f1481; +add.f32 %28, f1477, f1475; +add.f32 %30, f1487, f1485; +sub.f32 %31, f1489, f1491; +add.f32 %32, f1497, f1495; +sub.f32 %33, f1499, f1501; +add.f32 %34, f1507, f1505; +sub.f32 %35, f1509, f1511; +sub.f32 %36, f1425, f1427; +add.f32 %37, f1431, f1429; +add.f32 %39, f1441, f1439; +sub.f32 %38, f1435, f1437; +add.f32 %41, f1451, f1449; +sub.f32 %40, f1445, f1447; +add.f32 %43, f1461, f1459; +sub.f32 %42, f1859, f1457; +add.f32 %45, f1471, f1857; +sub.f32 %44, f1465, f1467; +add.f32 %47, f1481, f1853; +sub.f32 %46, f1475, f1477; 
+add.f32 %49, f1491, f1489; +sub.f32 %48, f1485, f1487; +add.f32 %51, f1501, f1499; +sub.f32 %50, f1495, f1497; +add.f32 %53, f1511, f1509; +sub.f32 %52, f1505, f1507; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), 
"f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[19].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<140, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<615>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 5832, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, 
f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 8; 
+mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r9+8], {f217, f216}; +fma.rn.f32 f218, f162, 
f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r9+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r9+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r9+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r9+40], {f224, f225}; +fma.rn.f32 f226, f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r9+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r9+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; +st.shared.v2.f32 [r9+64], {f231, f230}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+648]; +ld.shared.v2.f32 {f240, f241}, [r11+1296]; +ld.shared.v2.f32 {f244, f245}, [r11+1944]; +ld.shared.v2.f32 {f248, f249}, [r11+2592]; +ld.shared.v2.f32 {f252, f253}, [r11+3240]; +ld.shared.v2.f32 {f256, f257}, [r11+3888]; +ld.shared.v2.f32 {f260, f261}, [r11+4536]; +ld.shared.v2.f32 {f264, f265}, [r11+5184]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0f3F5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0f3F5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0f3F5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, 
f296, 0f3F5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0f3F5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0f3F5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0fBF248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0fBF248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0fBF7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0fBF7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0fBF7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0fBF7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0fBEAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0fBEAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 
0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0f3F5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0f3F5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f382, f351; +mul.f32 f387, f383, f353; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f391, f367; +mul.f32 f395, f393, f369; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f399, f342; +mul.f32 f403, f401, f348; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f415, f374; +mul.f32 f419, f417, f380; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f423, f343; +mul.f32 f427, f425, f349; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; 
+mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f431, f359; +mul.f32 f435, f433, f365; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f439, f375; +mul.f32 f443, f441, f381; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f383, f351, f388; +sub.f32 f448, f386, f387; +st.shared.v2.f32 [r17+72], {f448, f447}; +fma.rn.f32 f449, f393, f367, f396; +sub.f32 f450, f394, f395; +st.shared.v2.f32 [r17+144], {f450, f449}; +fma.rn.f32 f451, f401, f342, f404; +sub.f32 f452, f402, f403; +st.shared.v2.f32 [r17+216], {f452, f451}; +fma.rn.f32 f453, f409, f358, f412; +sub.f32 f454, f410, f411; +st.shared.v2.f32 [r17+288], {f454, f453}; +fma.rn.f32 f455, f417, f374, f420; +sub.f32 f456, f418, f419; +st.shared.v2.f32 [r17+360], {f456, f455}; +fma.rn.f32 f457, f425, f343, f428; +sub.f32 f458, f426, f427; +st.shared.v2.f32 [r17+432], {f458, f457}; +sub.f32 f459, f434, f435; +fma.rn.f32 f460, f433, f359, f436; +st.shared.v2.f32 [r17+504], {f459, f460}; +fma.rn.f32 f461, f441, f375, f444; +sub.f32 f462, f442, f443; +st.shared.v2.f32 [r17+576], {f462, f461}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+648]; +ld.shared.v2.f32 {f471, f472}, [r11+1296]; +ld.shared.v2.f32 {f475, f476}, [r11+1944]; +ld.shared.v2.f32 {f479, f480}, [r11+2592]; +ld.shared.v2.f32 {f483, f484}, [r11+3240]; +ld.shared.v2.f32 {f487, f488}, [r11+3888]; +ld.shared.v2.f32 {f491, f492}, [r11+4536]; +ld.shared.v2.f32 {f495, f496}, [r11+5184]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; +mul.f32 f503, f499, 
0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0f3F5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0f3F5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0f3F5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; +sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0f3F5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0f3F5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0f3F5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0fBF248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0fBF248DBB, f550; +mul.f32 f552, f539, 0f3E31D0D4; +mul.f32 f553, f545, 0fBF7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0fBF7C1C5C, f555; +mul.f32 f557, f524, 0f3E31D0D4; +mul.f32 f558, f530, 0fBF7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0fBF7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0fBEAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0fBEAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 f569, f567, 0f3F000000; 
+sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0f3F5DB3D7; +mul.f32 f573, f568, 0f3F000000; +sub.f32 f574, f502, f573; +sub.f32 f575, f516, f532; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f549, f554; +add.f32 f578, f551, f556; +mul.f32 f579, f577, 0f3F000000; +sub.f32 f580, f507, f579; +sub.f32 f581, f551, f556; +mul.f32 f582, f581, 0f3F5DB3D7; +mul.f32 f583, f578, 0f3F000000; +sub.f32 f584, f513, f583; +sub.f32 f585, f549, f554; +mul.f32 f586, f585, 0f3F5DB3D7; +add.f32 f587, f559, f564; +add.f32 f588, f561, f566; +mul.f32 f589, f587, 0f3F000000; +sub.f32 f590, f508, f589; +sub.f32 f591, f561, f566; +mul.f32 f592, f591, 0f3F5DB3D7; +mul.f32 f593, f588, 0f3F000000; +sub.f32 f594, f514, f593; +sub.f32 f595, f559, f564; +mul.f32 f596, f595, 0f3F5DB3D7; +add.f32 %1, f502, f568; +add.f32 %0, f500, f567; +add.f32 %3, f513, f578; +add.f32 %2, f507, f577; +add.f32 %5, f514, f588; +add.f32 %4, f508, f587; +sub.f32 %7, f574, f576; +add.f32 %6, f572, f570; +sub.f32 %9, f584, f586; +add.f32 %8, f582, f580; +sub.f32 %11, f594, f596; +add.f32 %10, f592, f590; +add.f32 %13, f576, f574; +sub.f32 %12, f570, f572; +add.f32 %15, f586, f584; +sub.f32 %14, f580, f582; +add.f32 %17, f596, f594; +sub.f32 %16, f590, f592; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + 
+/* cufftdx_private_function<141, float, 1>: explicit specialization whose whole body is one inline-PTX asm statement. Operands: %0-%17 are outputs written to rmem[0..8].x/.y; %18 is smem, used as the 32-bit base of every st.shared/ld.shared address (offset by %tid.y * 2916 and by (%tid.x / 81) * 2916 bytes); %19 = lut_sp_9_729 and %20 = lut_sp_9_81 are each read once with ld.global.v2.f32, indexed by (%tid.x % 81) and by (%tid.x % 81) / 9 respectively, and the loaded complex value is multiplied by itself repeatedly to build the factors applied to the other eight values before they are stored. Data is exchanged through shared memory as scalar f32: the real parts are stored and reloaded first, then the imaginary parts reuse the same addresses. barrier.sync 0 is executed unconditionally eight times, so callers presumably have to reach this function convergently. Immediates 0f3F000000 and 0f3F5DB3D7 are 0.5 and 0.8660254 (sqrt(3)/2). The input list repeats rmem[1].y, rmem[2].y, rmem[4].y, rmem[5].y and rmem[7].y; the PTX references only the second copy of each pair (%25, %28, %33, %36, %41), so removing a duplicate would shift every later %N. NOTE(review): presumably two exchange stages of a 729-point FFT built from 9-point butterflies with 81 threads per transform - inferred from the LUT names and strides, confirm against the generator. NOTE(review): the asm statement has no clobber list although it writes shared memory and synchronizes - confirm this matches the upstream generated code. */template<> __forceinline__ __device__ void cufftdx_private_function<141, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<579>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 2916, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, 
f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; 
+fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; +mul.f32 f208, f206, f120; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r8, r5, 2916, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 
[r9], f106; +st.shared.f32 [r9+4], f159; +st.shared.f32 [r9+8], f169; +st.shared.f32 [r9+12], f179; +st.shared.f32 [r9+16], f189; +st.shared.f32 [r9+20], f199; +st.shared.f32 [r9+24], f209; +st.shared.f32 [r9+28], f219; +st.shared.f32 [r9+32], f229; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+324]; +ld.shared.f32 f234, [r11+648]; +ld.shared.f32 f235, [r11+972]; +ld.shared.f32 f236, [r11+1296]; +ld.shared.f32 f237, [r11+1620]; +ld.shared.f32 f238, [r11+1944]; +ld.shared.f32 f239, [r11+2268]; +ld.shared.f32 f240, [r11+2592]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+324]; +ld.shared.f32 f243, [r11+648]; +ld.shared.f32 f244, [r11+972]; +ld.shared.f32 f245, [r11+1296]; +ld.shared.f32 f246, [r11+1620]; +ld.shared.f32 f247, [r11+1944]; +ld.shared.f32 f248, [r11+2268]; +ld.shared.f32 f249, [r11+2592]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0f3F5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0f3F5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 
f278, f236, f239; +mul.f32 f279, f278, 0f3F5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0f3F5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0f3F5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0fBF248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0fBF248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0fBF7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0fBF7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0fBF7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0fBF7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0fBEAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0fBEAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0f3F5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; +mul.f32 f331, f330, 0f3F5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0f3F5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 
0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0f3F5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0f3F5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0f3F5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f366, f335; +mul.f32 f371, f367, f337; +sub.f32 f372, f370, f371; +mul.f32 f373, f366, f337; +fma.rn.f32 f374, f367, f335, f373; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f377, f351; +mul.f32 f381, f379, f353; +sub.f32 f382, f380, f381; +mul.f32 f383, f377, f353; +fma.rn.f32 f384, f379, f351, f383; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f387, f326; +mul.f32 f391, f389, f332; +sub.f32 f392, f390, f391; +mul.f32 f393, f387, f332; +fma.rn.f32 f394, f389, f326, f393; +mul.f32 f395, f366, f387; +mul.f32 f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f397, f342; +mul.f32 f401, f399, f348; +sub.f32 f402, f400, f401; +mul.f32 f403, f397, f348; +fma.rn.f32 f404, f399, f342, f403; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, 
f408; +mul.f32 f410, f407, f358; +mul.f32 f411, f409, f364; +sub.f32 f412, f410, f411; +mul.f32 f413, f407, f364; +fma.rn.f32 f414, f409, f358, f413; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f417, f327; +mul.f32 f421, f419, f333; +sub.f32 f422, f420, f421; +mul.f32 f423, f417, f333; +fma.rn.f32 f424, f419, f327, f423; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f427, f343; +mul.f32 f431, f429, f349; +sub.f32 f432, f430, f431; +mul.f32 f433, f427, f349; +fma.rn.f32 f434, f429, f343, f433; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f437, f359; +mul.f32 f441, f439, f365; +sub.f32 f442, f440, f441; +mul.f32 f443, f437, f365; +fma.rn.f32 f444, f439, f359, f443; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f372; +st.shared.f32 [r17+72], f382; +st.shared.f32 [r17+108], f392; +st.shared.f32 [r17+144], f402; +st.shared.f32 [r17+180], f412; +st.shared.f32 [r17+216], f422; +st.shared.f32 [r17+252], f432; +st.shared.f32 [r17+288], f442; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+324]; +ld.shared.f32 f447, [r11+648]; +ld.shared.f32 f448, [r11+972]; +ld.shared.f32 f449, [r11+1296]; +ld.shared.f32 f450, [r11+1620]; +ld.shared.f32 f451, [r11+1944]; +ld.shared.f32 f452, [r11+2268]; +ld.shared.f32 f453, [r11+2592]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; +st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; 
+barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+324]; +ld.shared.f32 f456, [r11+648]; +ld.shared.f32 f457, [r11+972]; +ld.shared.f32 f458, [r11+1296]; +ld.shared.f32 f459, [r11+1620]; +ld.shared.f32 f460, [r11+1944]; +ld.shared.f32 f461, [r11+2268]; +ld.shared.f32 f462, [r11+2592]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0f3F5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; +sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0f3F5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0f3F5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0f3F5DB3D7; +sub.f32 f493, f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0f3F5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, f507, 0f3F5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0fBF248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0fBF248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0fBF7C1C5C; +sub.f32 f518, f516, f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 
0fBF7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0fBF7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0fBF7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0fBEAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0fBEAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f482, f498; +mul.f32 f533, f531, 0f3F000000; +sub.f32 f534, f464, f533; +sub.f32 f535, f482, f498; +mul.f32 f536, f535, 0f3F5DB3D7; +mul.f32 f537, f532, 0f3F000000; +sub.f32 f538, f466, f537; +sub.f32 f539, f480, f496; +mul.f32 f540, f539, 0f3F5DB3D7; +add.f32 f541, f513, f518; +add.f32 f542, f515, f520; +mul.f32 f543, f541, 0f3F000000; +sub.f32 f544, f471, f543; +sub.f32 f545, f515, f520; +mul.f32 f546, f545, 0f3F5DB3D7; +mul.f32 f547, f542, 0f3F000000; +sub.f32 f548, f477, f547; +sub.f32 f549, f513, f518; +mul.f32 f550, f549, 0f3F5DB3D7; +add.f32 f551, f523, f528; +add.f32 f552, f525, f530; +mul.f32 f553, f551, 0f3F000000; +sub.f32 f554, f472, f553; +sub.f32 f555, f525, f530; +mul.f32 f556, f555, 0f3F5DB3D7; +mul.f32 f557, f552, 0f3F000000; +sub.f32 f558, f478, f557; +sub.f32 f559, f523, f528; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 %0, f464, f531; +add.f32 %1, f466, f532; +add.f32 %3, f477, f542; +add.f32 %2, f471, f541; +add.f32 %5, f478, f552; +add.f32 %4, f472, f551; +add.f32 %6, f536, f534; +sub.f32 %7, f538, f540; +sub.f32 %9, f548, f550; +add.f32 %8, f546, f544; +sub.f32 %11, f558, f560; +add.f32 %10, f556, f554; +sub.f32 %12, f534, f536; +add.f32 %13, f540, f538; +add.f32 %15, f550, f548; +sub.f32 %14, f544, f546; +add.f32 %17, f560, f558; +sub.f32 %16, f554, f556; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), 
"=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +/* cufftdx_private_function<142, float, 1>: explicit specialization whose whole body is one inline-PTX asm statement. Operands: %0-%5 are outputs written to rmem[0..2].x/.y; %6 is smem, the 32-bit shared-memory base (offset by %tid.y * 5832 and by (%tid.x / 243) * 5832 bytes); %7-%11 are lut_sp_3_729, lut_sp_3_243, lut_sp_3_81, lut_sp_3_27 and lut_sp_3_9, each read once with ld.global.v2.f32. Complex values are exchanged through shared memory as v2.f32 pairs, with an unconditional barrier.sync 0 before and after every store group. The input list repeats rmem[1].y; the PTX references only the second copy (%16), so the duplicate has to stay to keep %17 and %18 aligned. NOTE(review): presumably a 729-point FFT done as six 3-point butterfly stages with 243 threads per transform - inferred from the LUT names and strides, confirm against the generator. */template<> __forceinline__ __device__ void cufftdx_private_function<142, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<264>; +.reg .b32 r<40>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 5832, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %14, %17; +add.f32 f14, %16, %18; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %12, f15; +sub.f32 f17, %16, %18; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %13, f21; +sub.f32 f23, %14, %17; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %13, f14; +add.f32 f43, %12, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r9+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r9+16], {f46, f47}; +barrier.sync 0; +shl.b32 r10, r7, 4; 
+sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+1944]; +ld.shared.v2.f32 {f56, f57}, [r11+3888]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0f3F5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f74, f66; +mul.f32 f79, f75, f72; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f83, f67; +mul.f32 f87, f85, f73; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f75, f66, f80; +sub.f32 f92, f78, f79; +st.shared.v2.f32 [r17+24], {f92, f91}; +fma.rn.f32 f93, f85, f67, f88; +sub.f32 f94, f86, f87; +st.shared.v2.f32 [r17+48], {f94, f93}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+1944]; +ld.shared.v2.f32 {f103, f104}, [r11+3888]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; 
+cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f121, f113; +mul.f32 f126, f122, f119; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f130, f114; +mul.f32 f134, f132, f120; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f122, f113, f127; +sub.f32 f139, f125, f126; +st.shared.v2.f32 [r23+72], {f139, f138}; +fma.rn.f32 f140, f132, f114, f135; +sub.f32 f141, f133, f134; +st.shared.v2.f32 [r23+144], {f141, f140}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+1944]; +ld.shared.v2.f32 {f150, f151}, [r11+3888]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0f3F5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; +sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0f3F5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f168, f160; +mul.f32 f173, f169, f166; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, 
f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f177, f161; +mul.f32 f181, f179, f167; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f169, f160, f174; +sub.f32 f186, f172, f173; +st.shared.v2.f32 [r33+216], {f186, f185}; +fma.rn.f32 f187, f179, f161, f182; +sub.f32 f188, f180, f181; +st.shared.v2.f32 [r33+432], {f188, f187}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 {f193, f194}, [r11+1944]; +ld.shared.v2.f32 {f197, f198}, [r11+3888]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0f3F5DB3D7; +add.f32 f207, f206, f204; +sub.f32 f208, f204, f206; +mul.f32 f209, f202, 0f3F000000; +sub.f32 f210, f190, f209; +sub.f32 f211, f193, f197; +mul.f32 f212, f211, 0f3F5DB3D7; +sub.f32 f213, f210, f212; +add.f32 f214, f212, f210; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f215, f216}, [rd26]; +mul.f32 f219, f215, f207; +mul.f32 f220, f216, f213; +mul.f32 f221, f215, f213; +mul.f32 f222, f215, f215; +mul.f32 f223, f216, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f216, f215; +fma.rn.f32 f226, f216, f215, f225; +mul.f32 f227, f224, f208; +mul.f32 f228, f226, f214; +mul.f32 f229, f224, f214; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +add.f32 f230, f190, f202; +add.f32 f231, f189, f201; +st.shared.v2.f32 [r39], {f231, f230}; +fma.rn.f32 f232, f216, f207, f221; +sub.f32 f233, f219, f220; +st.shared.v2.f32 [r39+648], {f233, f232}; +fma.rn.f32 f234, f226, f208, f229; +sub.f32 f235, f227, f228; +st.shared.v2.f32 [r39+1296], {f235, f234}; +barrier.sync 
0; +ld.shared.v2.f32 {f236, f237}, [r11]; +ld.shared.v2.f32 {f240, f241}, [r11+1944]; +ld.shared.v2.f32 {f244, f245}, [r11+3888]; +add.f32 f248, f240, f244; +add.f32 f249, f241, f245; +mul.f32 f250, f248, 0f3F000000; +sub.f32 f251, f236, f250; +sub.f32 f252, f241, f245; +mul.f32 f253, f252, 0f3F5DB3D7; +mul.f32 f254, f249, 0f3F000000; +sub.f32 f255, f237, f254; +sub.f32 f256, f240, f244; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 %1, f237, f249; +add.f32 %0, f236, f248; +sub.f32 %3, f255, f257; +add.f32 %2, f253, f251; +add.f32 %5, f257, f255; +sub.f32 %4, f251, f253; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<143, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<234>; +.reg .b32 r<40>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 2916, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %14, %17; +add.f32 f14, %12, f13; +add.f32 f15, %16, %18; +add.f32 f16, %13, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %12, f17; +sub.f32 f19, %16, %18; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %13, f23; +sub.f32 f25, %14, %17; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2916, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, 
f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f35; +st.shared.f32 [r9+8], f45; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+972]; +ld.shared.f32 f50, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+972]; +ld.shared.f32 f53, [r11+1944]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0f3F5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0f3F5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f70, f62; +mul.f32 f75, f71, f68; +sub.f32 f76, f74, f75; +mul.f32 f77, f70, f68; +fma.rn.f32 f78, f71, f62, f77; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f81, f63; +mul.f32 f85, f83, f69; +sub.f32 f86, f84, f85; +mul.f32 f87, f81, f69; +fma.rn.f32 f88, f83, f63, f87; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f76; +st.shared.f32 [r17+24], f86; +barrier.sync 0; 
+ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+972]; +ld.shared.f32 f91, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+972]; +ld.shared.f32 f94, [r11+1944]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0f3F5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0f3F5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f111, f103; +mul.f32 f116, f112, f109; +sub.f32 f117, f115, f116; +mul.f32 f118, f111, f109; +fma.rn.f32 f119, f112, f103, f118; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f122, f104; +mul.f32 f126, f124, f110; +sub.f32 f127, f125, f126; +mul.f32 f128, f122, f110; +fma.rn.f32 f129, f124, f104, f128; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f117; +st.shared.f32 [r23+72], f127; +barrier.sync 0; +ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+972]; +ld.shared.f32 f132, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+972]; +ld.shared.f32 f135, [r11+1944]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, 
f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0f3F5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0f3F5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f152, f144; +mul.f32 f157, f153, f150; +sub.f32 f158, f156, f157; +mul.f32 f159, f152, f150; +fma.rn.f32 f160, f153, f144, f159; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, f153, f152, f164; +mul.f32 f166, f163, f145; +mul.f32 f167, f165, f151; +sub.f32 f168, f166, f167; +mul.f32 f169, f163, f151; +fma.rn.f32 f170, f165, f145, f169; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f158; +st.shared.f32 [r33+216], f168; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+972]; +ld.shared.f32 f173, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; +ld.shared.f32 f175, [r11+972]; +ld.shared.f32 f176, [r11+1944]; +add.f32 f177, f172, f173; +add.f32 f178, f171, f177; +add.f32 f179, f175, f176; +add.f32 f180, f174, f179; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f171, f181; +sub.f32 f183, f175, f176; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +mul.f32 f187, f179, 0f3F000000; +sub.f32 f188, 
f174, f187; +sub.f32 f189, f172, f173; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 2; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f193, f194}, [rd26]; +mul.f32 f197, f193, f185; +mul.f32 f198, f194, f191; +sub.f32 f199, f197, f198; +mul.f32 f200, f193, f191; +fma.rn.f32 f201, f194, f185, f200; +mul.f32 f202, f193, f193; +mul.f32 f203, f194, f194; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, f193; +fma.rn.f32 f206, f194, f193, f205; +mul.f32 f207, f204, f186; +mul.f32 f208, f206, f192; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f192; +fma.rn.f32 f211, f206, f186, f210; +barrier.sync 0; +mad.lo.s32 r39, r34, 972, r38; +st.shared.f32 [r39], f178; +st.shared.f32 [r39+324], f199; +st.shared.f32 [r39+648], f209; +barrier.sync 0; +ld.shared.f32 f212, [r11]; +ld.shared.f32 f213, [r11+972]; +ld.shared.f32 f214, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r39], f180; +st.shared.f32 [r39+324], f201; +st.shared.f32 [r39+648], f211; +barrier.sync 0; +ld.shared.f32 f215, [r11]; +ld.shared.f32 f216, [r11+972]; +ld.shared.f32 f217, [r11+1944]; +add.f32 f218, f213, f214; +add.f32 f219, f216, f217; +mul.f32 f220, f218, 0f3F000000; +sub.f32 f221, f212, f220; +sub.f32 f222, f216, f217; +mul.f32 f223, f222, 0f3F5DB3D7; +mul.f32 f224, f219, 0f3F000000; +sub.f32 f225, f215, f224; +sub.f32 f226, f213, f214; +mul.f32 f227, f226, 0f3F5DB3D7; +add.f32 %0, f212, f218; +add.f32 %1, f215, f219; +add.f32 %2, f223, f221; +sub.f32 %3, f225, f227; +sub.f32 %4, f221, f223; +add.f32 %5, f227, f225; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), 
"f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..4c4ef0efcc41d --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp32_inv.hpp.inc @@ -0,0 +1,4870 @@ +#ifndef CUFFTDX_FFT_729_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_729_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<340, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1886>; +.reg .b32 r<18>; +.reg .b64 rd<8>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 5832, r17; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1885, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1884, %57, f1885; +mul.f32 f119, f1885, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1883, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1882, %63, f1883; +mul.f32 f135, f1883, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1881, 
%87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1880, %69, f1881; +mul.f32 f151, f1881, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1879, f133, 0f3F441B7D; +sub.f32 f159, f1879, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1877, f149, 0f3E31D0D4; +mul.f32 f1878, f155, 0f3F7C1C5C; +sub.f32 f164, f1877, f1878; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f1875, f134, 0f3E31D0D4; +mul.f32 f1876, f140, 0f3F7C1C5C; +sub.f32 f169, f1875, f1876; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1873, f150, 0fBF708FB2; +mul.f32 f1874, f156, 0f3EAF1D44; +sub.f32 f174, f1873, f1874; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1872, f1882, f1880; +sub.f32 f183, f1882, f1880; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1871, f1884, f1872; +mul.f32 f187, f1872, 0f3F000000; +sub.f32 f188, f1884, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1870, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1869, f123, f1870; +mul.f32 f203, f1870, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 
f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1868, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1867, f124, f1868; +mul.f32 f219, f1868, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1864, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1862, %112, f1864; +mul.f32 f235, f1864, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1859, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1857, %115, f1859; +mul.f32 f251, f1859, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1854, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1852, %118, f1854; +mul.f32 f267, f1854, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1851, f249, 0f3F441B7D; +sub.f32 f275, f1851, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1850, f265, 0f3E31D0D4; 
+sub.f32 f280, f1850, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1848, f250, 0f3E31D0D4; +mul.f32 f1849, f256, 0f3F7C1C5C; +sub.f32 f285, f1848, f1849; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1846, f266, 0fBF708FB2; +mul.f32 f1847, f272, 0f3EAF1D44; +sub.f32 f290, f1846, f1847; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1845, f1857, f1852; +sub.f32 f299, f1857, f1852; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1844, f1862, f1845; +mul.f32 f303, f1845, 0f3F000000; +sub.f32 f304, f1862, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1843, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1842, f239, f1843; +mul.f32 f319, f1843, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1841, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1840, f240, f1841; +mul.f32 f335, f1841, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1837, %119, %120; +sub.f32 f347, %119, %120; 
+mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1835, %121, f1837; +mul.f32 f351, f1837, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1832, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1830, %124, f1832; +mul.f32 f367, f1832, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1828, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1826, %126, f1828; +mul.f32 f383, f1828, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1825, f365, 0f3F441B7D; +sub.f32 f391, f1825, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1824, f381, 0f3E31D0D4; +sub.f32 f396, f1824, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1822, f366, 0f3E31D0D4; +mul.f32 f1823, f372, 0f3F7C1C5C; +sub.f32 f401, f1822, f1823; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1820, f382, 0fBF708FB2; +mul.f32 f1821, f388, 0f3EAF1D44; +sub.f32 f406, f1820, f1821; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; 
+add.f32 f1819, f1830, f1826; +sub.f32 f415, f1830, f1826; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1818, f1835, f1819; +mul.f32 f419, f1819, 0f3F000000; +sub.f32 f420, f1835, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1817, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1816, f355, f1817; +mul.f32 f435, f1817, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1815, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1814, f356, f1815; +mul.f32 f451, f1815, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1842, 0f3E6C2691; +mul.f32 f1813, f310, 0f3F791978; +sub.f32 f459, f1813, f458; +mul.f32 f460, f1842, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1811, f426, 0f3F64C51C; +mul.f32 f1812, f1816, 0f3EE5C902; +sub.f32 f464, f1811, f1812; +mul.f32 f465, f1816, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f1809, f326, 0f3F64C51C; +mul.f32 f1810, f1840, 0f3EE5C902; +sub.f32 f469, f1809, f1810; +mul.f32 f470, f1840, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1807, f442, 0f3F18DF63; +mul.f32 f1808, f1814, 0f3F4D57F2; +sub.f32 f474, f1807, f1808; +mul.f32 f475, f1814, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1805, f301, 0f3F441B7D; 
+mul.f32 f1806, f307, 0f3F248DBB; +sub.f32 f479, f1805, f1806; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1804, f417, 0f3E31D0D4; +sub.f32 f484, f1804, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1803, f317, 0f3F18DF63; +sub.f32 f489, f1803, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1802, f433, 0fBE92D7E0; +sub.f32 f494, f1802, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1801, f333, 0f3ECACAF8; +sub.f32 f499, f1801, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f1800, f449, 0fBF2FAD88; +sub.f32 f504, f1800, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1799, f302, 0f3E31D0D4; +sub.f32 f509, f1799, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1797, f418, 0fBF708FB2; +mul.f32 f1798, f424, 0f3EAF1D44; +sub.f32 f514, f1797, f1798; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1795, f318, 0fBD6E2946; +mul.f32 f1796, f324, 0f3F7F9120; +sub.f32 f519, f1795, f1796; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1793, f434, 0fBF7E44DE; +mul.f32 f1794, f440, 0fBDEDC21F; +sub.f32 f524, f1793, f1794; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1792, f334, 0fBE92D7E0; +sub.f32 f529, f1792, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1791, f450, 0fBF55E287; +sub.f32 f534, f1791, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 
0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1790, f1844, f1818; +sub.f32 f541, f1844, f1818; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1790, 0f3F000000; +sub.f32 f546, f1871, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f1789, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1788, f1869, f1789; +mul.f32 f561, f1789, 0f3F000000; +sub.f32 f562, f1869, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1787, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1786, f1867, f1787; +mul.f32 f577, f1787, 0f3F000000; +sub.f32 f578, f1867, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1785, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1784, f191, f1785; +mul.f32 f593, f1785, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1783, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, 
f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1782, f207, f1783; +mul.f32 f609, f1783, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1781, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f1780, f223, f1781; +mul.f32 f625, f1781, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f1779, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1778, f192, f1779; +mul.f32 f641, f1779, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1777, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1776, f208, f1777; +mul.f32 f657, f1777, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1775, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1774, f224, f1775; +mul.f32 f673, f1775, 0f3F000000; +sub.f32 f674, f224, 
f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mad.lo.s32 r12, r9, 5832, r3; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r11, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f1788, f680; +mul.f32 f685, f679, f1788; +mul.f32 f687, f680, f680; +mul.f32 f1773, f679, f679; +sub.f32 f688, f1773, f687; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f1786, f690; +mul.f32 f693, f688, f1786; +mul.f32 f1771, f679, f688; +mul.f32 f1772, f680, f690; +sub.f32 f696, f1771, f1772; +mul.f32 f1770, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f699, f1784, f698; +mul.f32 f701, f696, f1784; +mul.f32 f703, f680, f698; +mul.f32 f1769, f679, f696; +sub.f32 f704, f1769, f703; +mul.f32 f1768, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f707, f1782, f706; +mul.f32 f709, f704, f1782; +mul.f32 f1766, f679, f704; +mul.f32 f1767, f680, f706; +sub.f32 f712, f1766, f1767; +mul.f32 f1765, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f1780, f714; +mul.f32 f717, f712, f1780; +mul.f32 f719, f680, f714; +mul.f32 f1764, f679, f712; +sub.f32 f720, f1764, f719; +mul.f32 f1763, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f1778, f722; +mul.f32 f725, f720, f1778; +mul.f32 f727, f680, f722; +mul.f32 f1762, f679, f720; +sub.f32 f728, f1762, f727; +mul.f32 f1761, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f1776, f730; +mul.f32 f733, f728, f1776; +mul.f32 f1759, f679, f728; +mul.f32 f1760, f680, f730; +sub.f32 f736, f1759, f1760; +mul.f32 
f1758, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f1774, f738; +mul.f32 f741, f736, f1774; +mul.f32 f743, f680, f738; +mul.f32 f1757, f679, f736; +sub.f32 f744, f1757, f743; +mul.f32 f1756, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f751, f680, f746; +mul.f32 f1755, f679, f744; +sub.f32 f752, f1755, f751; +mul.f32 f1754, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f1752, f679, f752; +mul.f32 f1753, f680, f754; +sub.f32 f760, f1752, f1753; +mul.f32 f1751, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; +mul.f32 f765, f760, f581; +mul.f32 f767, f680, f762; +mul.f32 f1750, f679, f760; +sub.f32 f768, f1750, f767; +mul.f32 f1749, f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f1747, f679, f768; +mul.f32 f1748, f680, f770; +sub.f32 f776, f1747, f1748; +mul.f32 f1746, f591, f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1745, f679, f776; +sub.f32 f784, f1745, f783; +mul.f32 f1744, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f791, f680, f786; +mul.f32 f1743, f679, f784; +sub.f32 f792, f1743, f791; +mul.f32 f1742, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, f792, f645; +mul.f32 f1740, f679, f792; +mul.f32 f1741, f680, f794; +sub.f32 f800, f1740, f1741; +mul.f32 f1739, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; 
+mul.f32 f1738, f679, f800; +sub.f32 f808, f1738, f807; +mul.f32 f1737, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f815, f680, f810; +mul.f32 f1736, f679, f808; +sub.f32 f816, f1736, f815; +mul.f32 f1735, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f1733, f679, f816; +mul.f32 f1734, f680, f818; +sub.f32 f824, f1733, f1734; +mul.f32 f1732, f544, f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f831, f680, f826; +mul.f32 f1731, f679, f824; +sub.f32 f832, f1731, f831; +mul.f32 f1730, f560, f826; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f1728, f679, f832; +mul.f32 f1729, f680, f834; +sub.f32 f840, f1728, f1729; +mul.f32 f1727, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1726, f679, f840; +sub.f32 f848, f1726, f847; +mul.f32 f1725, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f855, f680, f850; +mul.f32 f1724, f679, f848; +sub.f32 f856, f1724, f855; +mul.f32 f1723, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f1721, f679, f856; +mul.f32 f1722, f680, f858; +sub.f32 f864, f1721, f1722; +mul.f32 f1720, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1719, f679, f864; +sub.f32 f872, f1719, f871; +mul.f32 f1718, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, 
f874; +mul.f32 f877, f872, f662; +mul.f32 f879, f680, f874; +mul.f32 f1717, f679, f872; +sub.f32 f880, f1717, f879; +mul.f32 f1716, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f1715, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r13, r11, 216, r12; +add.f32 f886, f1871, f1790; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r13], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f1715; +st.shared.v2.f32 [r13+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f1770; +st.shared.v2.f32 [r13+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f1768; +st.shared.v2.f32 [r13+24], {f892, f893}; +fma.rn.f32 f894, f704, f600, f707; +sub.f32 f895, f709, f1765; +st.shared.v2.f32 [r13+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 f897, f717, f1763; +st.shared.v2.f32 [r13+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f1761; +st.shared.v2.f32 [r13+48], {f898, f899}; +sub.f32 f900, f733, f1758; +fma.rn.f32 f901, f728, f648, f731; +st.shared.v2.f32 [r13+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f1756; +st.shared.v2.f32 [r13+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f1754; +st.shared.v2.f32 [r13+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f1751; +st.shared.v2.f32 [r13+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f1749; +st.shared.v2.f32 [r13+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f1746; +st.shared.v2.f32 [r13+96], {f910, f911}; +fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f1744; +st.shared.v2.f32 [r13+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f1742; +st.shared.v2.f32 [r13+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, 
f797, f1739; +st.shared.v2.f32 [r13+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f1737; +st.shared.v2.f32 [r13+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f1735; +st.shared.v2.f32 [r13+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f1732; +st.shared.v2.f32 [r13+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f1730; +st.shared.v2.f32 [r13+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, f1727; +st.shared.v2.f32 [r13+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f1725; +st.shared.v2.f32 [r13+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f1723; +st.shared.v2.f32 [r13+176], {f930, f931}; +fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f1720; +st.shared.v2.f32 [r13+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; +sub.f32 f935, f869, f1718; +st.shared.v2.f32 [r13+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f1716; +st.shared.v2.f32 [r13+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; +sub.f32 f939, f885, f884; +st.shared.v2.f32 [r13+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r14, r11, -208, r13; +ld.shared.v2.f32 {f940, f941}, [r14]; +ld.shared.v2.f32 {f944, f945}, [r14+216]; +ld.shared.v2.f32 {f948, f949}, [r14+432]; +ld.shared.v2.f32 {f952, f953}, [r14+648]; +ld.shared.v2.f32 {f956, f957}, [r14+864]; +ld.shared.v2.f32 {f960, f961}, [r14+1080]; +ld.shared.v2.f32 {f964, f965}, [r14+1296]; +ld.shared.v2.f32 {f968, f969}, [r14+1512]; +ld.shared.v2.f32 {f972, f973}, [r14+1728]; +ld.shared.v2.f32 {f976, f977}, [r14+1944]; +ld.shared.v2.f32 {f980, f981}, [r14+2160]; +ld.shared.v2.f32 {f984, f985}, [r14+2376]; +ld.shared.v2.f32 {f988, f989}, [r14+2592]; +ld.shared.v2.f32 {f992, f993}, [r14+2808]; +ld.shared.v2.f32 {f996, f997}, [r14+3024]; +ld.shared.v2.f32 {f1000, f1001}, [r14+3240]; +ld.shared.v2.f32 
{f1004, f1005}, [r14+3456]; +ld.shared.v2.f32 {f1008, f1009}, [r14+3672]; +ld.shared.v2.f32 {f1012, f1013}, [r14+3888]; +ld.shared.v2.f32 {f1016, f1017}, [r14+4104]; +ld.shared.v2.f32 {f1020, f1021}, [r14+4320]; +ld.shared.v2.f32 {f1024, f1025}, [r14+4536]; +ld.shared.v2.f32 {f1028, f1029}, [r14+4752]; +ld.shared.v2.f32 {f1032, f1033}, [r14+4968]; +ld.shared.v2.f32 {f1036, f1037}, [r14+5184]; +ld.shared.v2.f32 {f1040, f1041}, [r14+5400]; +ld.shared.v2.f32 {f1044, f1045}, [r14+5616]; +add.f32 f1048, f976, f1012; +add.f32 f1049, f940, f1048; +mul.f32 f1052, f1048, 0f3F000000; +sub.f32 f1053, f940, f1052; +add.f32 f1714, f977, f1013; +sub.f32 f1054, f977, f1013; +mul.f32 f1055, f1054, 0fBF5DB3D7; +add.f32 f1056, f1055, f1053; +sub.f32 f1057, f1053, f1055; +add.f32 f1713, f941, f1714; +mul.f32 f1058, f1714, 0f3F000000; +sub.f32 f1059, f941, f1058; +sub.f32 f1060, f976, f1012; +mul.f32 f1061, f1060, 0fBF5DB3D7; +sub.f32 f1062, f1059, f1061; +add.f32 f1063, f1061, f1059; +add.f32 f1064, f988, f1024; +add.f32 f1065, f952, f1064; +mul.f32 f1068, f1064, 0f3F000000; +sub.f32 f1069, f952, f1068; +add.f32 f1712, f989, f1025; +sub.f32 f1070, f989, f1025; +mul.f32 f1071, f1070, 0fBF5DB3D7; +add.f32 f1072, f1071, f1069; +sub.f32 f1073, f1069, f1071; +add.f32 f1711, f953, f1712; +mul.f32 f1074, f1712, 0f3F000000; +sub.f32 f1075, f953, f1074; +sub.f32 f1076, f988, f1024; +mul.f32 f1077, f1076, 0fBF5DB3D7; +sub.f32 f1078, f1075, f1077; +add.f32 f1079, f1077, f1075; +add.f32 f1080, f1000, f1036; +add.f32 f1081, f964, f1080; +mul.f32 f1084, f1080, 0f3F000000; +sub.f32 f1085, f964, f1084; +add.f32 f1710, f1001, f1037; +sub.f32 f1086, f1001, f1037; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 f1088, f1087, f1085; +sub.f32 f1089, f1085, f1087; +add.f32 f1709, f965, f1710; +mul.f32 f1090, f1710, 0f3F000000; +sub.f32 f1091, f965, f1090; +sub.f32 f1092, f1000, f1036; +mul.f32 f1093, f1092, 0fBF5DB3D7; +sub.f32 f1094, f1091, f1093; +add.f32 f1095, f1093, f1091; +mul.f32 f1097, f1078, 
0f3F248DBB; +mul.f32 f1708, f1072, 0f3F441B7D; +sub.f32 f1098, f1708, f1097; +mul.f32 f1099, f1078, 0f3F441B7D; +fma.rn.f32 f1100, f1072, 0f3F248DBB, f1099; +mul.f32 f1102, f1094, 0f3F7C1C5C; +mul.f32 f1707, f1088, 0f3E31D0D4; +sub.f32 f1103, f1707, f1102; +mul.f32 f1104, f1094, 0f3E31D0D4; +fma.rn.f32 f1105, f1088, 0f3F7C1C5C, f1104; +mul.f32 f1705, f1073, 0f3E31D0D4; +mul.f32 f1706, f1079, 0f3F7C1C5C; +sub.f32 f1108, f1705, f1706; +mul.f32 f1109, f1079, 0f3E31D0D4; +fma.rn.f32 f1110, f1073, 0f3F7C1C5C, f1109; +mul.f32 f1703, f1089, 0fBF708FB2; +mul.f32 f1704, f1095, 0f3EAF1D44; +sub.f32 f1113, f1703, f1704; +mul.f32 f1114, f1095, 0fBF708FB2; +fma.rn.f32 f1115, f1089, 0f3EAF1D44, f1114; +add.f32 f1116, f1065, f1081; +add.f32 f1117, f1049, f1116; +mul.f32 f1120, f1116, 0f3F000000; +sub.f32 f1121, f1049, f1120; +add.f32 f1702, f1711, f1709; +sub.f32 f1122, f1711, f1709; +mul.f32 f1123, f1122, 0fBF5DB3D7; +add.f32 f1124, f1123, f1121; +sub.f32 f1125, f1121, f1123; +add.f32 f1701, f1713, f1702; +mul.f32 f1126, f1702, 0f3F000000; +sub.f32 f1127, f1713, f1126; +sub.f32 f1128, f1065, f1081; +mul.f32 f1129, f1128, 0fBF5DB3D7; +sub.f32 f1130, f1127, f1129; +add.f32 f1131, f1129, f1127; +add.f32 f1132, f1098, f1103; +add.f32 f1133, f1056, f1132; +mul.f32 f1136, f1132, 0f3F000000; +sub.f32 f1137, f1056, f1136; +add.f32 f1700, f1100, f1105; +sub.f32 f1138, f1100, f1105; +mul.f32 f1139, f1138, 0fBF5DB3D7; +add.f32 f1140, f1139, f1137; +sub.f32 f1141, f1137, f1139; +add.f32 f1699, f1062, f1700; +mul.f32 f1142, f1700, 0f3F000000; +sub.f32 f1143, f1062, f1142; +sub.f32 f1144, f1098, f1103; +mul.f32 f1145, f1144, 0fBF5DB3D7; +sub.f32 f1146, f1143, f1145; +add.f32 f1147, f1145, f1143; +add.f32 f1148, f1108, f1113; +add.f32 f1149, f1057, f1148; +mul.f32 f1152, f1148, 0f3F000000; +sub.f32 f1153, f1057, f1152; +add.f32 f1698, f1110, f1115; +sub.f32 f1154, f1110, f1115; +mul.f32 f1155, f1154, 0fBF5DB3D7; +add.f32 f1156, f1155, f1153; +sub.f32 f1157, f1153, f1155; +add.f32 f1697, f1063, 
f1698; +mul.f32 f1158, f1698, 0f3F000000; +sub.f32 f1159, f1063, f1158; +sub.f32 f1160, f1108, f1113; +mul.f32 f1161, f1160, 0fBF5DB3D7; +sub.f32 f1162, f1159, f1161; +add.f32 f1163, f1161, f1159; +add.f32 f1164, f980, f1016; +add.f32 f1165, f944, f1164; +mul.f32 f1168, f1164, 0f3F000000; +sub.f32 f1169, f944, f1168; +add.f32 f1696, f981, f1017; +sub.f32 f1170, f981, f1017; +mul.f32 f1171, f1170, 0fBF5DB3D7; +add.f32 f1172, f1171, f1169; +sub.f32 f1173, f1169, f1171; +add.f32 f1695, f945, f1696; +mul.f32 f1174, f1696, 0f3F000000; +sub.f32 f1175, f945, f1174; +sub.f32 f1176, f980, f1016; +mul.f32 f1177, f1176, 0fBF5DB3D7; +sub.f32 f1178, f1175, f1177; +add.f32 f1179, f1177, f1175; +add.f32 f1180, f992, f1028; +add.f32 f1181, f956, f1180; +mul.f32 f1184, f1180, 0f3F000000; +sub.f32 f1185, f956, f1184; +add.f32 f1694, f993, f1029; +sub.f32 f1186, f993, f1029; +mul.f32 f1187, f1186, 0fBF5DB3D7; +add.f32 f1188, f1187, f1185; +sub.f32 f1189, f1185, f1187; +add.f32 f1693, f957, f1694; +mul.f32 f1190, f1694, 0f3F000000; +sub.f32 f1191, f957, f1190; +sub.f32 f1192, f992, f1028; +mul.f32 f1193, f1192, 0fBF5DB3D7; +sub.f32 f1194, f1191, f1193; +add.f32 f1195, f1193, f1191; +add.f32 f1196, f1004, f1040; +add.f32 f1197, f968, f1196; +mul.f32 f1200, f1196, 0f3F000000; +sub.f32 f1201, f968, f1200; +add.f32 f1692, f1005, f1041; +sub.f32 f1202, f1005, f1041; +mul.f32 f1203, f1202, 0fBF5DB3D7; +add.f32 f1204, f1203, f1201; +sub.f32 f1205, f1201, f1203; +add.f32 f1691, f969, f1692; +mul.f32 f1206, f1692, 0f3F000000; +sub.f32 f1207, f969, f1206; +sub.f32 f1208, f1004, f1040; +mul.f32 f1209, f1208, 0fBF5DB3D7; +sub.f32 f1210, f1207, f1209; +add.f32 f1211, f1209, f1207; +mul.f32 f1213, f1194, 0f3F248DBB; +mul.f32 f1690, f1188, 0f3F441B7D; +sub.f32 f1214, f1690, f1213; +mul.f32 f1215, f1194, 0f3F441B7D; +fma.rn.f32 f1216, f1188, 0f3F248DBB, f1215; +mul.f32 f1218, f1210, 0f3F7C1C5C; +mul.f32 f1689, f1204, 0f3E31D0D4; +sub.f32 f1219, f1689, f1218; +mul.f32 f1220, f1210, 0f3E31D0D4; 
+fma.rn.f32 f1221, f1204, 0f3F7C1C5C, f1220; +mul.f32 f1223, f1195, 0f3F7C1C5C; +mul.f32 f1688, f1189, 0f3E31D0D4; +sub.f32 f1224, f1688, f1223; +mul.f32 f1225, f1195, 0f3E31D0D4; +fma.rn.f32 f1226, f1189, 0f3F7C1C5C, f1225; +mul.f32 f1686, f1205, 0fBF708FB2; +mul.f32 f1687, f1211, 0f3EAF1D44; +sub.f32 f1229, f1686, f1687; +mul.f32 f1230, f1211, 0fBF708FB2; +fma.rn.f32 f1231, f1205, 0f3EAF1D44, f1230; +add.f32 f1232, f1181, f1197; +add.f32 f1233, f1165, f1232; +mul.f32 f1236, f1232, 0f3F000000; +sub.f32 f1237, f1165, f1236; +add.f32 f1685, f1693, f1691; +sub.f32 f1238, f1693, f1691; +mul.f32 f1239, f1238, 0fBF5DB3D7; +add.f32 f1240, f1239, f1237; +sub.f32 f1241, f1237, f1239; +add.f32 f1684, f1695, f1685; +mul.f32 f1242, f1685, 0f3F000000; +sub.f32 f1243, f1695, f1242; +sub.f32 f1244, f1181, f1197; +mul.f32 f1245, f1244, 0fBF5DB3D7; +sub.f32 f1246, f1243, f1245; +add.f32 f1247, f1245, f1243; +add.f32 f1248, f1214, f1219; +add.f32 f1249, f1172, f1248; +mul.f32 f1252, f1248, 0f3F000000; +sub.f32 f1253, f1172, f1252; +add.f32 f1683, f1216, f1221; +sub.f32 f1254, f1216, f1221; +mul.f32 f1255, f1254, 0fBF5DB3D7; +add.f32 f1256, f1255, f1253; +sub.f32 f1257, f1253, f1255; +add.f32 f1682, f1178, f1683; +mul.f32 f1258, f1683, 0f3F000000; +sub.f32 f1259, f1178, f1258; +sub.f32 f1260, f1214, f1219; +mul.f32 f1261, f1260, 0fBF5DB3D7; +sub.f32 f1262, f1259, f1261; +add.f32 f1263, f1261, f1259; +add.f32 f1264, f1224, f1229; +add.f32 f1265, f1173, f1264; +mul.f32 f1268, f1264, 0f3F000000; +sub.f32 f1269, f1173, f1268; +add.f32 f1681, f1226, f1231; +sub.f32 f1270, f1226, f1231; +mul.f32 f1271, f1270, 0fBF5DB3D7; +add.f32 f1272, f1271, f1269; +sub.f32 f1273, f1269, f1271; +add.f32 f1680, f1179, f1681; +mul.f32 f1274, f1681, 0f3F000000; +sub.f32 f1275, f1179, f1274; +sub.f32 f1276, f1224, f1229; +mul.f32 f1277, f1276, 0fBF5DB3D7; +sub.f32 f1278, f1275, f1277; +add.f32 f1279, f1277, f1275; +add.f32 f1280, f984, f1020; +add.f32 f1281, f948, f1280; +mul.f32 f1284, f1280, 0f3F000000; 
+sub.f32 f1285, f948, f1284; +add.f32 f1679, f985, f1021; +sub.f32 f1286, f985, f1021; +mul.f32 f1287, f1286, 0fBF5DB3D7; +add.f32 f1288, f1287, f1285; +sub.f32 f1289, f1285, f1287; +add.f32 f1678, f949, f1679; +mul.f32 f1290, f1679, 0f3F000000; +sub.f32 f1291, f949, f1290; +sub.f32 f1292, f984, f1020; +mul.f32 f1293, f1292, 0fBF5DB3D7; +sub.f32 f1294, f1291, f1293; +add.f32 f1295, f1293, f1291; +add.f32 f1296, f996, f1032; +add.f32 f1297, f960, f1296; +mul.f32 f1300, f1296, 0f3F000000; +sub.f32 f1301, f960, f1300; +add.f32 f1677, f997, f1033; +sub.f32 f1302, f997, f1033; +mul.f32 f1303, f1302, 0fBF5DB3D7; +add.f32 f1304, f1303, f1301; +sub.f32 f1305, f1301, f1303; +add.f32 f1676, f961, f1677; +mul.f32 f1306, f1677, 0f3F000000; +sub.f32 f1307, f961, f1306; +sub.f32 f1308, f996, f1032; +mul.f32 f1309, f1308, 0fBF5DB3D7; +sub.f32 f1310, f1307, f1309; +add.f32 f1311, f1309, f1307; +add.f32 f1312, f1008, f1044; +add.f32 f1313, f972, f1312; +mul.f32 f1316, f1312, 0f3F000000; +sub.f32 f1317, f972, f1316; +add.f32 f1675, f1009, f1045; +sub.f32 f1318, f1009, f1045; +mul.f32 f1319, f1318, 0fBF5DB3D7; +add.f32 f1320, f1319, f1317; +sub.f32 f1321, f1317, f1319; +add.f32 f1674, f973, f1675; +mul.f32 f1322, f1675, 0f3F000000; +sub.f32 f1323, f973, f1322; +sub.f32 f1324, f1008, f1044; +mul.f32 f1325, f1324, 0fBF5DB3D7; +sub.f32 f1326, f1323, f1325; +add.f32 f1327, f1325, f1323; +mul.f32 f1329, f1310, 0f3F248DBB; +mul.f32 f1673, f1304, 0f3F441B7D; +sub.f32 f1330, f1673, f1329; +mul.f32 f1331, f1310, 0f3F441B7D; +fma.rn.f32 f1332, f1304, 0f3F248DBB, f1331; +mul.f32 f1334, f1326, 0f3F7C1C5C; +mul.f32 f1672, f1320, 0f3E31D0D4; +sub.f32 f1335, f1672, f1334; +mul.f32 f1336, f1326, 0f3E31D0D4; +fma.rn.f32 f1337, f1320, 0f3F7C1C5C, f1336; +mul.f32 f1339, f1311, 0f3F7C1C5C; +mul.f32 f1671, f1305, 0f3E31D0D4; +sub.f32 f1340, f1671, f1339; +mul.f32 f1341, f1311, 0f3E31D0D4; +fma.rn.f32 f1342, f1305, 0f3F7C1C5C, f1341; +mul.f32 f1669, f1321, 0fBF708FB2; +mul.f32 f1670, f1327, 0f3EAF1D44; 
+sub.f32 f1345, f1669, f1670; +mul.f32 f1346, f1327, 0fBF708FB2; +fma.rn.f32 f1347, f1321, 0f3EAF1D44, f1346; +add.f32 f1348, f1297, f1313; +add.f32 f1349, f1281, f1348; +mul.f32 f1352, f1348, 0f3F000000; +sub.f32 f1353, f1281, f1352; +add.f32 f1668, f1676, f1674; +sub.f32 f1354, f1676, f1674; +mul.f32 f1355, f1354, 0fBF5DB3D7; +add.f32 f1356, f1355, f1353; +sub.f32 f1357, f1353, f1355; +add.f32 f1667, f1678, f1668; +mul.f32 f1358, f1668, 0f3F000000; +sub.f32 f1359, f1678, f1358; +sub.f32 f1360, f1297, f1313; +mul.f32 f1361, f1360, 0fBF5DB3D7; +sub.f32 f1362, f1359, f1361; +add.f32 f1363, f1361, f1359; +add.f32 f1364, f1330, f1335; +add.f32 f1365, f1288, f1364; +mul.f32 f1368, f1364, 0f3F000000; +sub.f32 f1369, f1288, f1368; +add.f32 f1666, f1332, f1337; +sub.f32 f1370, f1332, f1337; +mul.f32 f1371, f1370, 0fBF5DB3D7; +add.f32 f1372, f1371, f1369; +sub.f32 f1373, f1369, f1371; +add.f32 f1665, f1294, f1666; +mul.f32 f1374, f1666, 0f3F000000; +sub.f32 f1375, f1294, f1374; +sub.f32 f1376, f1330, f1335; +mul.f32 f1377, f1376, 0fBF5DB3D7; +sub.f32 f1378, f1375, f1377; +add.f32 f1379, f1377, f1375; +add.f32 f1380, f1340, f1345; +add.f32 f1381, f1289, f1380; +mul.f32 f1384, f1380, 0f3F000000; +sub.f32 f1385, f1289, f1384; +add.f32 f1664, f1342, f1347; +sub.f32 f1386, f1342, f1347; +mul.f32 f1387, f1386, 0fBF5DB3D7; +add.f32 f1388, f1387, f1385; +sub.f32 f1389, f1385, f1387; +add.f32 f1663, f1295, f1664; +mul.f32 f1390, f1664, 0f3F000000; +sub.f32 f1391, f1295, f1390; +sub.f32 f1392, f1340, f1345; +mul.f32 f1393, f1392, 0fBF5DB3D7; +sub.f32 f1394, f1391, f1393; +add.f32 f1395, f1393, f1391; +mul.f32 f1661, f1249, 0f3F791978; +mul.f32 f1662, f1682, 0f3E6C2691; +sub.f32 f1398, f1661, f1662; +mul.f32 f1399, f1682, 0f3F791978; +fma.rn.f32 f1400, f1249, 0f3E6C2691, f1399; +mul.f32 f1402, f1665, 0f3EE5C902; +mul.f32 f1660, f1365, 0f3F64C51C; +sub.f32 f1403, f1660, f1402; +mul.f32 f1404, f1665, 0f3F64C51C; +fma.rn.f32 f1405, f1365, 0f3EE5C902, f1404; +mul.f32 f1407, f1680, 
0f3EE5C902; +mul.f32 f1659, f1265, 0f3F64C51C; +sub.f32 f1408, f1659, f1407; +mul.f32 f1409, f1680, 0f3F64C51C; +fma.rn.f32 f1410, f1265, 0f3EE5C902, f1409; +mul.f32 f1412, f1663, 0f3F4D57F2; +mul.f32 f1658, f1381, 0f3F18DF63; +sub.f32 f1413, f1658, f1412; +mul.f32 f1414, f1663, 0f3F18DF63; +fma.rn.f32 f1415, f1381, 0f3F4D57F2, f1414; +mul.f32 f1417, f1246, 0f3F248DBB; +mul.f32 f1657, f1240, 0f3F441B7D; +sub.f32 f1418, f1657, f1417; +mul.f32 f1419, f1246, 0f3F441B7D; +fma.rn.f32 f1420, f1240, 0f3F248DBB, f1419; +mul.f32 f1422, f1362, 0f3F7C1C5C; +mul.f32 f1656, f1356, 0f3E31D0D4; +sub.f32 f1423, f1656, f1422; +mul.f32 f1424, f1362, 0f3E31D0D4; +fma.rn.f32 f1425, f1356, 0f3F7C1C5C, f1424; +mul.f32 f1654, f1256, 0f3F18DF63; +mul.f32 f1655, f1262, 0f3F4D57F2; +sub.f32 f1428, f1654, f1655; +mul.f32 f1429, f1262, 0f3F18DF63; +fma.rn.f32 f1430, f1256, 0f3F4D57F2, f1429; +mul.f32 f1652, f1372, 0fBE92D7E0; +mul.f32 f1653, f1378, 0f3F753ECD; +sub.f32 f1433, f1652, f1653; +mul.f32 f1434, f1378, 0fBE92D7E0; +fma.rn.f32 f1435, f1372, 0f3F753ECD, f1434; +mul.f32 f1650, f1272, 0f3ECACAF8; +mul.f32 f1651, f1278, 0f3F6B1036; +sub.f32 f1438, f1650, f1651; +mul.f32 f1439, f1278, 0f3ECACAF8; +fma.rn.f32 f1440, f1272, 0f3F6B1036, f1439; +mul.f32 f1648, f1388, 0fBF2FAD88; +mul.f32 f1649, f1394, 0f3F3A3529; +sub.f32 f1443, f1648, f1649; +mul.f32 f1444, f1394, 0fBF2FAD88; +fma.rn.f32 f1445, f1388, 0f3F3A3529, f1444; +mul.f32 f1447, f1247, 0f3F7C1C5C; +mul.f32 f1647, f1241, 0f3E31D0D4; +sub.f32 f1448, f1647, f1447; +mul.f32 f1449, f1247, 0f3E31D0D4; +fma.rn.f32 f1450, f1241, 0f3F7C1C5C, f1449; +mul.f32 f1452, f1363, 0f3EAF1D44; +mul.f32 f1646, f1357, 0fBF708FB2; +sub.f32 f1453, f1646, f1452; +mul.f32 f1454, f1363, 0fBF708FB2; +fma.rn.f32 f1455, f1357, 0f3EAF1D44, f1454; +mul.f32 f1457, f1263, 0f3F7F9120; +mul.f32 f1645, f1257, 0fBD6E2946; +sub.f32 f1458, f1645, f1457; +mul.f32 f1459, f1263, 0fBD6E2946; +fma.rn.f32 f1460, f1257, 0f3F7F9120, f1459; +mul.f32 f1462, f1379, 0fBDEDC21F; 
+mul.f32 f1644, f1373, 0fBF7E44DE; +sub.f32 f1463, f1644, f1462; +mul.f32 f1464, f1379, 0fBF7E44DE; +fma.rn.f32 f1465, f1373, 0fBDEDC21F, f1464; +mul.f32 f1467, f1279, 0f3F753ECD; +mul.f32 f1643, f1273, 0fBE92D7E0; +sub.f32 f1468, f1643, f1467; +mul.f32 f1469, f1279, 0fBE92D7E0; +fma.rn.f32 f1470, f1273, 0f3F753ECD, f1469; +mul.f32 f1641, f1389, 0fBF55E287; +mul.f32 f1642, f1395, 0fBF0CAC9F; +sub.f32 f1473, f1641, f1642; +mul.f32 f1474, f1395, 0fBF55E287; +fma.rn.f32 f1475, f1389, 0fBF0CAC9F, f1474; +add.f32 f1476, f1233, f1349; +mul.f32 f1478, f1476, 0f3F000000; +sub.f32 f1479, f1117, f1478; +add.f32 f1640, f1684, f1667; +sub.f32 f1480, f1684, f1667; +mul.f32 f1481, f1480, 0fBF5DB3D7; +mul.f32 f1482, f1640, 0f3F000000; +sub.f32 f1483, f1701, f1482; +sub.f32 f1484, f1233, f1349; +mul.f32 f1485, f1484, 0fBF5DB3D7; +add.f32 f1486, f1398, f1403; +mul.f32 f1488, f1486, 0f3F000000; +sub.f32 f1489, f1133, f1488; +add.f32 f1639, f1400, f1405; +sub.f32 f1490, f1400, f1405; +mul.f32 f1491, f1490, 0fBF5DB3D7; +mul.f32 f1492, f1639, 0f3F000000; +sub.f32 f1493, f1699, f1492; +sub.f32 f1494, f1398, f1403; +mul.f32 f1495, f1494, 0fBF5DB3D7; +add.f32 f1496, f1408, f1413; +mul.f32 f1498, f1496, 0f3F000000; +sub.f32 f1499, f1149, f1498; +add.f32 f1638, f1410, f1415; +sub.f32 f1500, f1410, f1415; +mul.f32 f1501, f1500, 0fBF5DB3D7; +mul.f32 f1502, f1638, 0f3F000000; +sub.f32 f1503, f1697, f1502; +sub.f32 f1504, f1408, f1413; +mul.f32 f1505, f1504, 0fBF5DB3D7; +add.f32 f1506, f1418, f1423; +mul.f32 f1508, f1506, 0f3F000000; +sub.f32 f1509, f1124, f1508; +add.f32 f1637, f1420, f1425; +sub.f32 f1510, f1420, f1425; +mul.f32 f1511, f1510, 0fBF5DB3D7; +mul.f32 f1512, f1637, 0f3F000000; +sub.f32 f1513, f1130, f1512; +sub.f32 f1514, f1418, f1423; +mul.f32 f1515, f1514, 0fBF5DB3D7; +add.f32 f1516, f1428, f1433; +mul.f32 f1518, f1516, 0f3F000000; +sub.f32 f1519, f1140, f1518; +add.f32 f1636, f1430, f1435; +sub.f32 f1520, f1430, f1435; +mul.f32 f1521, f1520, 0fBF5DB3D7; +mul.f32 f1522, f1636, 
0f3F000000; +sub.f32 f1523, f1146, f1522; +sub.f32 f1524, f1428, f1433; +mul.f32 f1525, f1524, 0fBF5DB3D7; +add.f32 f1526, f1438, f1443; +mul.f32 f1528, f1526, 0f3F000000; +sub.f32 f1529, f1156, f1528; +add.f32 f1635, f1440, f1445; +sub.f32 f1530, f1440, f1445; +mul.f32 f1531, f1530, 0fBF5DB3D7; +mul.f32 f1532, f1635, 0f3F000000; +sub.f32 f1533, f1162, f1532; +sub.f32 f1534, f1438, f1443; +mul.f32 f1535, f1534, 0fBF5DB3D7; +add.f32 f1536, f1448, f1453; +mul.f32 f1538, f1536, 0f3F000000; +sub.f32 f1539, f1125, f1538; +add.f32 f1634, f1450, f1455; +sub.f32 f1540, f1450, f1455; +mul.f32 f1541, f1540, 0fBF5DB3D7; +mul.f32 f1542, f1634, 0f3F000000; +sub.f32 f1543, f1131, f1542; +sub.f32 f1544, f1448, f1453; +mul.f32 f1545, f1544, 0fBF5DB3D7; +add.f32 f1546, f1458, f1463; +mul.f32 f1548, f1546, 0f3F000000; +sub.f32 f1549, f1141, f1548; +add.f32 f1633, f1460, f1465; +sub.f32 f1550, f1460, f1465; +mul.f32 f1551, f1550, 0fBF5DB3D7; +mul.f32 f1552, f1633, 0f3F000000; +sub.f32 f1553, f1147, f1552; +sub.f32 f1554, f1458, f1463; +mul.f32 f1555, f1554, 0fBF5DB3D7; +add.f32 f1556, f1468, f1473; +mul.f32 f1558, f1556, 0f3F000000; +sub.f32 f1559, f1157, f1558; +add.f32 f1632, f1470, f1475; +sub.f32 f1560, f1470, f1475; +mul.f32 f1561, f1560, 0fBF5DB3D7; +mul.f32 f1562, f1632, 0f3F000000; +sub.f32 f1563, f1163, f1562; +sub.f32 f1564, f1468, f1473; +mul.f32 f1565, f1564, 0fBF5DB3D7; +add.f32 %1, f1701, f1640; +add.f32 %0, f1117, f1476; +add.f32 %3, f1699, f1639; +add.f32 %2, f1133, f1486; +add.f32 %5, f1697, f1638; +add.f32 %4, f1149, f1496; +add.f32 %7, f1130, f1637; +add.f32 %6, f1124, f1506; +add.f32 %9, f1146, f1636; +add.f32 %8, f1140, f1516; +add.f32 %11, f1162, f1635; +add.f32 %10, f1156, f1526; +add.f32 %13, f1131, f1634; +add.f32 %12, f1125, f1536; +add.f32 %15, f1147, f1633; +add.f32 %14, f1141, f1546; +add.f32 %17, f1163, f1632; +add.f32 %16, f1157, f1556; +add.f32 %18, f1481, f1479; +sub.f32 %19, f1483, f1485; +add.f32 %20, f1491, f1489; +sub.f32 %21, f1493, f1495; 
+sub.f32 %23, f1503, f1505; +add.f32 %22, f1501, f1499; +sub.f32 %25, f1513, f1515; +add.f32 %24, f1511, f1509; +sub.f32 %27, f1523, f1525; +add.f32 %26, f1521, f1519; +add.f32 %28, f1531, f1529; +sub.f32 %29, f1533, f1535; +add.f32 %30, f1541, f1539; +sub.f32 %31, f1543, f1545; +add.f32 %32, f1551, f1549; +sub.f32 %33, f1553, f1555; +add.f32 %34, f1561, f1559; +sub.f32 %35, f1563, f1565; +add.f32 %37, f1485, f1483; +sub.f32 %36, f1479, f1481; +add.f32 %39, f1495, f1493; +sub.f32 %38, f1489, f1491; +add.f32 %41, f1505, f1503; +sub.f32 %40, f1499, f1501; +add.f32 %43, f1515, f1513; +sub.f32 %42, f1509, f1511; +add.f32 %45, f1525, f1523; +sub.f32 %44, f1519, f1521; +add.f32 %47, f1535, f1533; +sub.f32 %46, f1529, f1531; +add.f32 %49, f1545, f1543; +sub.f32 %48, f1539, f1541; +add.f32 %51, f1555, f1553; +sub.f32 %50, f1549, f1551; +add.f32 %53, f1565, f1563; +sub.f32 %52, f1559, f1561; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), 
"f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<341, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1824>; +.reg .b32 r<18>; +.reg .b64 rd<10>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 2916, r17; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1815, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1814, %57, f1815; +mul.f32 f119, f1815, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1813, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 
0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1812, %63, f1813; +mul.f32 f135, f1813, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1811, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1810, %69, f1811; +mul.f32 f151, f1811, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1809, f133, 0f3F441B7D; +sub.f32 f159, f1809, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1807, f149, 0f3E31D0D4; +mul.f32 f1808, f155, 0f3F7C1C5C; +sub.f32 f164, f1807, f1808; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f1805, f134, 0f3E31D0D4; +mul.f32 f1806, f140, 0f3F7C1C5C; +sub.f32 f169, f1805, f1806; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1803, f150, 0fBF708FB2; +mul.f32 f1804, f156, 0f3EAF1D44; +sub.f32 f174, f1803, f1804; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1802, f1812, f1810; +sub.f32 f183, f1812, f1810; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1801, f1814, f1802; +mul.f32 f187, f1802, 0f3F000000; +sub.f32 f188, f1814, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1800, 
f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1799, f123, f1800; +mul.f32 f203, f1800, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1798, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1797, f124, f1798; +mul.f32 f219, f1798, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1794, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1792, %112, f1794; +mul.f32 f235, f1794, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1789, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1787, %115, f1789; +mul.f32 f251, f1789, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1784, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1782, %118, f1784; +mul.f32 f267, 
f1784, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1781, f249, 0f3F441B7D; +sub.f32 f275, f1781, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1780, f265, 0f3E31D0D4; +sub.f32 f280, f1780, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1778, f250, 0f3E31D0D4; +mul.f32 f1779, f256, 0f3F7C1C5C; +sub.f32 f285, f1778, f1779; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1776, f266, 0fBF708FB2; +mul.f32 f1777, f272, 0f3EAF1D44; +sub.f32 f290, f1776, f1777; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1775, f1787, f1782; +sub.f32 f299, f1787, f1782; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1774, f1792, f1775; +mul.f32 f303, f1775, 0f3F000000; +sub.f32 f304, f1792, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1773, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1772, f239, f1773; +mul.f32 f319, f1773, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1771, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; 
+sub.f32 f334, f330, f332; +add.f32 f1770, f240, f1771; +mul.f32 f335, f1771, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1767, %120, %119; +sub.f32 f347, %120, %119; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1765, %121, f1767; +mul.f32 f351, f1767, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1762, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1760, %124, f1762; +mul.f32 f367, f1762, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1758, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1756, %126, f1758; +mul.f32 f383, f1758, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1755, f365, 0f3F441B7D; +sub.f32 f391, f1755, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1754, f381, 0f3E31D0D4; +sub.f32 f396, f1754, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1752, f366, 0f3E31D0D4; +mul.f32 f1753, f372, 
0f3F7C1C5C; +sub.f32 f401, f1752, f1753; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1750, f382, 0fBF708FB2; +mul.f32 f1751, f388, 0f3EAF1D44; +sub.f32 f406, f1750, f1751; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1749, f1760, f1756; +sub.f32 f415, f1760, f1756; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1748, f1765, f1749; +mul.f32 f419, f1749, 0f3F000000; +sub.f32 f420, f1765, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1747, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1746, f355, f1747; +mul.f32 f435, f1747, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1745, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1744, f356, f1745; +mul.f32 f451, f1745, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1772, 0f3E6C2691; +mul.f32 f1743, f310, 0f3F791978; +sub.f32 f459, f1743, f458; +mul.f32 f460, f1772, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1741, f426, 0f3F64C51C; +mul.f32 f1742, f1746, 0f3EE5C902; +sub.f32 f464, f1741, f1742; +mul.f32 f465, f1746, 0f3F64C51C; +fma.rn.f32 f466, 
f426, 0f3EE5C902, f465; +mul.f32 f1739, f326, 0f3F64C51C; +mul.f32 f1740, f1770, 0f3EE5C902; +sub.f32 f469, f1739, f1740; +mul.f32 f470, f1770, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1737, f442, 0f3F18DF63; +mul.f32 f1738, f1744, 0f3F4D57F2; +sub.f32 f474, f1737, f1738; +mul.f32 f475, f1744, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1735, f301, 0f3F441B7D; +mul.f32 f1736, f307, 0f3F248DBB; +sub.f32 f479, f1735, f1736; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1734, f417, 0f3E31D0D4; +sub.f32 f484, f1734, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1733, f317, 0f3F18DF63; +sub.f32 f489, f1733, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1732, f433, 0fBE92D7E0; +sub.f32 f494, f1732, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1731, f333, 0f3ECACAF8; +sub.f32 f499, f1731, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f1730, f449, 0fBF2FAD88; +sub.f32 f504, f1730, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1729, f302, 0f3E31D0D4; +sub.f32 f509, f1729, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1727, f418, 0fBF708FB2; +mul.f32 f1728, f424, 0f3EAF1D44; +sub.f32 f514, f1727, f1728; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1725, f318, 0fBD6E2946; +mul.f32 f1726, f324, 0f3F7F9120; +sub.f32 f519, f1725, f1726; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1723, f434, 0fBF7E44DE; +mul.f32 f1724, f440, 0fBDEDC21F; +sub.f32 f524, f1723, 
f1724; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1722, f334, 0fBE92D7E0; +sub.f32 f529, f1722, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1721, f450, 0fBF55E287; +sub.f32 f534, f1721, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1720, f1774, f1748; +sub.f32 f543, f1774, f1748; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1719, f1801, f1720; +mul.f32 f547, f1720, 0f3F000000; +sub.f32 f548, f1801, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1718, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1717, f1799, f1718; +mul.f32 f563, f1718, 0f3F000000; +sub.f32 f564, f1799, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1716, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1715, f1797, f1716; +mul.f32 f579, f1716, 0f3F000000; +sub.f32 f580, f1797, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1714, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, 
f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1713, f191, f1714; +mul.f32 f595, f1714, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1712, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1711, f207, f1712; +mul.f32 f611, f1712, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1710, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1709, f223, f1710; +mul.f32 f627, f1710, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1708, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1707, f192, f1708; +mul.f32 f643, f1708, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1706, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1705, f208, f1706; +mul.f32 f659, f1706, 0f3F000000; +sub.f32 f660, f208, 
f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1704, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1703, f224, f1704; +mul.f32 f675, f1704, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mul.wide.u32 rd7, r11, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f1717, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f1717; +sub.f32 f689, f688, f687; +mul.f32 f1701, f681, f681; +mul.f32 f1702, f682, f682; +sub.f32 f692, f1701, f1702; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f1715, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f1715; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f1700, f681, f692; +sub.f32 f702, f1700, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f1713, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f1713; +sub.f32 f709, f708, f707; +mul.f32 f711, f682, f704; +mul.f32 f1699, f681, f702; +sub.f32 f712, f1699, f711; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f1711, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f1711; +sub.f32 f719, f718, f717; +mul.f32 f721, f682, f714; +mul.f32 f1698, f681, 
f712; +sub.f32 f722, f1698, f721; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f1709, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f1709; +sub.f32 f729, f728, f727; +mul.f32 f1696, f681, f722; +mul.f32 f1697, f682, f724; +sub.f32 f732, f1696, f1697; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f1707, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f1707; +sub.f32 f739, f738, f737; +mul.f32 f1694, f681, f732; +mul.f32 f1695, f682, f734; +sub.f32 f742, f1694, f1695; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f1705, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f1705; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f1693, f681, f742; +sub.f32 f752, f1693, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f1703, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f1703; +sub.f32 f759, f758, f757; +mul.f32 f761, f682, f754; +mul.f32 f1692, f681, f752; +sub.f32 f762, f1692, f761; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f1690, f681, f762; +mul.f32 f1691, f682, f764; +sub.f32 f772, f1690, f1691; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f1688, f681, f772; +mul.f32 f1689, f682, f774; +sub.f32 f782, f1688, f1689; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; 
+mul.f32 f791, f682, f784; +mul.f32 f1687, f681, f782; +sub.f32 f792, f1687, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f1686, f681, f792; +sub.f32 f802, f1686, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f811, f682, f804; +mul.f32 f1685, f681, f802; +sub.f32 f812, f1685, f811; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f1683, f681, f812; +mul.f32 f1684, f682, f814; +sub.f32 f822, f1683, f1684; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 f829, f828, f827; +mul.f32 f1681, f681, f822; +mul.f32 f1682, f682, f824; +sub.f32 f832, f1681, f1682; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, f682, f834; +mul.f32 f1680, f681, f832; +sub.f32 f842, f1680, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, f848, f847; +mul.f32 f851, f682, f844; +mul.f32 f1679, f681, f842; +sub.f32 f852, f1679, f851; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, 
f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f861, f682, f854; +mul.f32 f1678, f681, f852; +sub.f32 f862, f1678, f861; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f1676, f681, f862; +mul.f32 f1677, f682, f864; +sub.f32 f872, f1676, f1677; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f1674, f681, f872; +mul.f32 f1675, f682, f874; +sub.f32 f882, f1674, f1675; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f1673, f681, f882; +sub.f32 f892, f1673, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; +mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f901, f682, f894; +mul.f32 f1672, f681, f892; +sub.f32 f902, f1672, f901; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; +sub.f32 f909, f908, f907; +mul.f32 f1670, f681, f902; +mul.f32 f1671, f682, f904; +sub.f32 f912, f1670, f1671; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; +mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f1668, f681, f912; +mul.f32 f1669, f682, f914; +sub.f32 f922, f1668, f1669; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; 
+mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f1667, f681, f922; +sub.f32 f932, f1667, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r12, r9, 2916, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 108, r12; +st.shared.f32 [r13], f538; +st.shared.f32 [r13+4], f686; +st.shared.f32 [r13+8], f696; +st.shared.f32 [r13+12], f706; +st.shared.f32 [r13+16], f716; +st.shared.f32 [r13+20], f726; +st.shared.f32 [r13+24], f736; +st.shared.f32 [r13+28], f746; +st.shared.f32 [r13+32], f756; +st.shared.f32 [r13+36], f766; +st.shared.f32 [r13+40], f776; +st.shared.f32 [r13+44], f786; +st.shared.f32 [r13+48], f796; +st.shared.f32 [r13+52], f806; +st.shared.f32 [r13+56], f816; +st.shared.f32 [r13+60], f826; +st.shared.f32 [r13+64], f836; +st.shared.f32 [r13+68], f846; +st.shared.f32 [r13+72], f856; +st.shared.f32 [r13+76], f866; +st.shared.f32 [r13+80], f876; +st.shared.f32 [r13+84], f886; +st.shared.f32 [r13+88], f896; +st.shared.f32 [r13+92], f906; +st.shared.f32 [r13+96], f916; +st.shared.f32 [r13+100], f926; +st.shared.f32 [r13+104], f936; +barrier.sync 0; +mad.lo.s32 r14, r11, -104, r13; +ld.shared.f32 f940, [r14]; +ld.shared.f32 f941, [r14+108]; +ld.shared.f32 f942, [r14+216]; +ld.shared.f32 f943, [r14+324]; +ld.shared.f32 f944, [r14+432]; +ld.shared.f32 f945, [r14+540]; +ld.shared.f32 f946, [r14+648]; +ld.shared.f32 f947, [r14+756]; +ld.shared.f32 f948, [r14+864]; +ld.shared.f32 f949, [r14+972]; +ld.shared.f32 f950, [r14+1080]; +ld.shared.f32 f951, [r14+1188]; +ld.shared.f32 f952, [r14+1296]; +ld.shared.f32 f953, [r14+1404]; +ld.shared.f32 f954, [r14+1512]; +ld.shared.f32 f955, [r14+1620]; +ld.shared.f32 f956, [r14+1728]; +ld.shared.f32 f957, [r14+1836]; +ld.shared.f32 f958, [r14+1944]; +ld.shared.f32 f959, [r14+2052]; +ld.shared.f32 
f960, [r14+2160]; +ld.shared.f32 f961, [r14+2268]; +ld.shared.f32 f962, [r14+2376]; +ld.shared.f32 f963, [r14+2484]; +ld.shared.f32 f964, [r14+2592]; +ld.shared.f32 f965, [r14+2700]; +ld.shared.f32 f966, [r14+2808]; +barrier.sync 0; +st.shared.f32 [r13], f1719; +st.shared.f32 [r13+4], f689; +st.shared.f32 [r13+8], f699; +st.shared.f32 [r13+12], f709; +st.shared.f32 [r13+16], f719; +st.shared.f32 [r13+20], f729; +st.shared.f32 [r13+24], f739; +st.shared.f32 [r13+28], f749; +st.shared.f32 [r13+32], f759; +st.shared.f32 [r13+36], f769; +st.shared.f32 [r13+40], f779; +st.shared.f32 [r13+44], f789; +st.shared.f32 [r13+48], f799; +st.shared.f32 [r13+52], f809; +st.shared.f32 [r13+56], f819; +st.shared.f32 [r13+60], f829; +st.shared.f32 [r13+64], f839; +st.shared.f32 [r13+68], f849; +st.shared.f32 [r13+72], f859; +st.shared.f32 [r13+76], f869; +st.shared.f32 [r13+80], f879; +st.shared.f32 [r13+84], f889; +st.shared.f32 [r13+88], f899; +st.shared.f32 [r13+92], f909; +st.shared.f32 [r13+96], f919; +st.shared.f32 [r13+100], f929; +st.shared.f32 [r13+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r14]; +ld.shared.f32 f968, [r14+108]; +ld.shared.f32 f969, [r14+216]; +ld.shared.f32 f970, [r14+324]; +ld.shared.f32 f971, [r14+432]; +ld.shared.f32 f972, [r14+540]; +ld.shared.f32 f973, [r14+648]; +ld.shared.f32 f974, [r14+756]; +ld.shared.f32 f975, [r14+864]; +ld.shared.f32 f976, [r14+972]; +ld.shared.f32 f977, [r14+1080]; +ld.shared.f32 f978, [r14+1188]; +ld.shared.f32 f979, [r14+1296]; +ld.shared.f32 f980, [r14+1404]; +ld.shared.f32 f981, [r14+1512]; +ld.shared.f32 f982, [r14+1620]; +ld.shared.f32 f983, [r14+1728]; +ld.shared.f32 f984, [r14+1836]; +ld.shared.f32 f985, [r14+1944]; +ld.shared.f32 f986, [r14+2052]; +ld.shared.f32 f987, [r14+2160]; +ld.shared.f32 f988, [r14+2268]; +ld.shared.f32 f989, [r14+2376]; +ld.shared.f32 f990, [r14+2484]; +ld.shared.f32 f991, [r14+2592]; +ld.shared.f32 f992, [r14+2700]; +ld.shared.f32 f993, [r14+2808]; +add.f32 f994, f949, f958; +add.f32 
f995, f940, f994; +mul.f32 f998, f994, 0f3F000000; +sub.f32 f999, f940, f998; +add.f32 f1666, f976, f985; +sub.f32 f1000, f976, f985; +mul.f32 f1001, f1000, 0fBF5DB3D7; +add.f32 f1002, f1001, f999; +sub.f32 f1003, f999, f1001; +add.f32 f1665, f967, f1666; +mul.f32 f1004, f1666, 0f3F000000; +sub.f32 f1005, f967, f1004; +sub.f32 f1006, f949, f958; +mul.f32 f1007, f1006, 0fBF5DB3D7; +sub.f32 f1008, f1005, f1007; +add.f32 f1009, f1007, f1005; +add.f32 f1010, f952, f961; +add.f32 f1011, f943, f1010; +mul.f32 f1014, f1010, 0f3F000000; +sub.f32 f1015, f943, f1014; +add.f32 f1664, f979, f988; +sub.f32 f1016, f979, f988; +mul.f32 f1017, f1016, 0fBF5DB3D7; +add.f32 f1018, f1017, f1015; +sub.f32 f1019, f1015, f1017; +add.f32 f1663, f970, f1664; +mul.f32 f1020, f1664, 0f3F000000; +sub.f32 f1021, f970, f1020; +sub.f32 f1022, f952, f961; +mul.f32 f1023, f1022, 0fBF5DB3D7; +sub.f32 f1024, f1021, f1023; +add.f32 f1025, f1023, f1021; +add.f32 f1026, f955, f964; +add.f32 f1027, f946, f1026; +mul.f32 f1030, f1026, 0f3F000000; +sub.f32 f1031, f946, f1030; +add.f32 f1662, f982, f991; +sub.f32 f1032, f982, f991; +mul.f32 f1033, f1032, 0fBF5DB3D7; +add.f32 f1034, f1033, f1031; +sub.f32 f1035, f1031, f1033; +add.f32 f1661, f973, f1662; +mul.f32 f1036, f1662, 0f3F000000; +sub.f32 f1037, f973, f1036; +sub.f32 f1038, f955, f964; +mul.f32 f1039, f1038, 0fBF5DB3D7; +sub.f32 f1040, f1037, f1039; +add.f32 f1041, f1039, f1037; +mul.f32 f1659, f1018, 0f3F441B7D; +mul.f32 f1660, f1024, 0f3F248DBB; +sub.f32 f1044, f1659, f1660; +mul.f32 f1045, f1024, 0f3F441B7D; +fma.rn.f32 f1046, f1018, 0f3F248DBB, f1045; +mul.f32 f1657, f1034, 0f3E31D0D4; +mul.f32 f1658, f1040, 0f3F7C1C5C; +sub.f32 f1049, f1657, f1658; +mul.f32 f1050, f1040, 0f3E31D0D4; +fma.rn.f32 f1051, f1034, 0f3F7C1C5C, f1050; +mul.f32 f1655, f1019, 0f3E31D0D4; +mul.f32 f1656, f1025, 0f3F7C1C5C; +sub.f32 f1054, f1655, f1656; +mul.f32 f1055, f1025, 0f3E31D0D4; +fma.rn.f32 f1056, f1019, 0f3F7C1C5C, f1055; +mul.f32 f1653, f1035, 0fBF708FB2; 
+mul.f32 f1654, f1041, 0f3EAF1D44; +sub.f32 f1059, f1653, f1654; +mul.f32 f1060, f1041, 0fBF708FB2; +fma.rn.f32 f1061, f1035, 0f3EAF1D44, f1060; +add.f32 f1062, f1011, f1027; +add.f32 f1063, f995, f1062; +mul.f32 f1066, f1062, 0f3F000000; +sub.f32 f1067, f995, f1066; +add.f32 f1652, f1663, f1661; +sub.f32 f1068, f1663, f1661; +mul.f32 f1069, f1068, 0fBF5DB3D7; +add.f32 f1070, f1069, f1067; +sub.f32 f1071, f1067, f1069; +add.f32 f1651, f1665, f1652; +mul.f32 f1072, f1652, 0f3F000000; +sub.f32 f1073, f1665, f1072; +sub.f32 f1074, f1011, f1027; +mul.f32 f1075, f1074, 0fBF5DB3D7; +sub.f32 f1076, f1073, f1075; +add.f32 f1077, f1075, f1073; +add.f32 f1078, f1044, f1049; +add.f32 f1079, f1002, f1078; +mul.f32 f1082, f1078, 0f3F000000; +sub.f32 f1083, f1002, f1082; +add.f32 f1650, f1046, f1051; +sub.f32 f1084, f1046, f1051; +mul.f32 f1085, f1084, 0fBF5DB3D7; +add.f32 f1086, f1085, f1083; +sub.f32 f1087, f1083, f1085; +add.f32 f1649, f1008, f1650; +mul.f32 f1088, f1650, 0f3F000000; +sub.f32 f1089, f1008, f1088; +sub.f32 f1090, f1044, f1049; +mul.f32 f1091, f1090, 0fBF5DB3D7; +sub.f32 f1092, f1089, f1091; +add.f32 f1093, f1091, f1089; +add.f32 f1094, f1054, f1059; +add.f32 f1095, f1003, f1094; +mul.f32 f1098, f1094, 0f3F000000; +sub.f32 f1099, f1003, f1098; +add.f32 f1648, f1056, f1061; +sub.f32 f1100, f1056, f1061; +mul.f32 f1101, f1100, 0fBF5DB3D7; +add.f32 f1102, f1101, f1099; +sub.f32 f1103, f1099, f1101; +add.f32 f1647, f1009, f1648; +mul.f32 f1104, f1648, 0f3F000000; +sub.f32 f1105, f1009, f1104; +sub.f32 f1106, f1054, f1059; +mul.f32 f1107, f1106, 0fBF5DB3D7; +sub.f32 f1108, f1105, f1107; +add.f32 f1109, f1107, f1105; +add.f32 f1110, f950, f959; +add.f32 f1111, f941, f1110; +mul.f32 f1114, f1110, 0f3F000000; +sub.f32 f1115, f941, f1114; +add.f32 f1646, f977, f986; +sub.f32 f1116, f977, f986; +mul.f32 f1117, f1116, 0fBF5DB3D7; +add.f32 f1118, f1117, f1115; +sub.f32 f1119, f1115, f1117; +add.f32 f1645, f968, f1646; +mul.f32 f1120, f1646, 0f3F000000; +sub.f32 f1121, 
f968, f1120; +sub.f32 f1122, f950, f959; +mul.f32 f1123, f1122, 0fBF5DB3D7; +sub.f32 f1124, f1121, f1123; +add.f32 f1125, f1123, f1121; +add.f32 f1126, f953, f962; +add.f32 f1127, f944, f1126; +mul.f32 f1130, f1126, 0f3F000000; +sub.f32 f1131, f944, f1130; +add.f32 f1644, f980, f989; +sub.f32 f1132, f980, f989; +mul.f32 f1133, f1132, 0fBF5DB3D7; +add.f32 f1134, f1133, f1131; +sub.f32 f1135, f1131, f1133; +add.f32 f1643, f971, f1644; +mul.f32 f1136, f1644, 0f3F000000; +sub.f32 f1137, f971, f1136; +sub.f32 f1138, f953, f962; +mul.f32 f1139, f1138, 0fBF5DB3D7; +sub.f32 f1140, f1137, f1139; +add.f32 f1141, f1139, f1137; +add.f32 f1142, f956, f965; +add.f32 f1143, f947, f1142; +mul.f32 f1146, f1142, 0f3F000000; +sub.f32 f1147, f947, f1146; +add.f32 f1642, f983, f992; +sub.f32 f1148, f983, f992; +mul.f32 f1149, f1148, 0fBF5DB3D7; +add.f32 f1150, f1149, f1147; +sub.f32 f1151, f1147, f1149; +add.f32 f1641, f974, f1642; +mul.f32 f1152, f1642, 0f3F000000; +sub.f32 f1153, f974, f1152; +sub.f32 f1154, f956, f965; +mul.f32 f1155, f1154, 0fBF5DB3D7; +sub.f32 f1156, f1153, f1155; +add.f32 f1157, f1155, f1153; +mul.f32 f1159, f1140, 0f3F248DBB; +mul.f32 f1640, f1134, 0f3F441B7D; +sub.f32 f1160, f1640, f1159; +mul.f32 f1161, f1140, 0f3F441B7D; +fma.rn.f32 f1162, f1134, 0f3F248DBB, f1161; +mul.f32 f1638, f1150, 0f3E31D0D4; +mul.f32 f1639, f1156, 0f3F7C1C5C; +sub.f32 f1165, f1638, f1639; +mul.f32 f1166, f1156, 0f3E31D0D4; +fma.rn.f32 f1167, f1150, 0f3F7C1C5C, f1166; +mul.f32 f1636, f1135, 0f3E31D0D4; +mul.f32 f1637, f1141, 0f3F7C1C5C; +sub.f32 f1170, f1636, f1637; +mul.f32 f1171, f1141, 0f3E31D0D4; +fma.rn.f32 f1172, f1135, 0f3F7C1C5C, f1171; +mul.f32 f1634, f1151, 0fBF708FB2; +mul.f32 f1635, f1157, 0f3EAF1D44; +sub.f32 f1175, f1634, f1635; +mul.f32 f1176, f1157, 0fBF708FB2; +fma.rn.f32 f1177, f1151, 0f3EAF1D44, f1176; +add.f32 f1178, f1127, f1143; +add.f32 f1179, f1111, f1178; +mul.f32 f1182, f1178, 0f3F000000; +sub.f32 f1183, f1111, f1182; +add.f32 f1633, f1643, f1641; +sub.f32 
f1184, f1643, f1641; +mul.f32 f1185, f1184, 0fBF5DB3D7; +add.f32 f1186, f1185, f1183; +sub.f32 f1187, f1183, f1185; +add.f32 f1632, f1645, f1633; +mul.f32 f1188, f1633, 0f3F000000; +sub.f32 f1189, f1645, f1188; +sub.f32 f1190, f1127, f1143; +mul.f32 f1191, f1190, 0fBF5DB3D7; +sub.f32 f1192, f1189, f1191; +add.f32 f1193, f1191, f1189; +add.f32 f1194, f1160, f1165; +add.f32 f1195, f1118, f1194; +mul.f32 f1198, f1194, 0f3F000000; +sub.f32 f1199, f1118, f1198; +add.f32 f1631, f1162, f1167; +sub.f32 f1200, f1162, f1167; +mul.f32 f1201, f1200, 0fBF5DB3D7; +add.f32 f1202, f1201, f1199; +sub.f32 f1203, f1199, f1201; +add.f32 f1630, f1124, f1631; +mul.f32 f1204, f1631, 0f3F000000; +sub.f32 f1205, f1124, f1204; +sub.f32 f1206, f1160, f1165; +mul.f32 f1207, f1206, 0fBF5DB3D7; +sub.f32 f1208, f1205, f1207; +add.f32 f1209, f1207, f1205; +add.f32 f1210, f1170, f1175; +add.f32 f1211, f1119, f1210; +mul.f32 f1214, f1210, 0f3F000000; +sub.f32 f1215, f1119, f1214; +add.f32 f1629, f1172, f1177; +sub.f32 f1216, f1172, f1177; +mul.f32 f1217, f1216, 0fBF5DB3D7; +add.f32 f1218, f1217, f1215; +sub.f32 f1219, f1215, f1217; +add.f32 f1628, f1125, f1629; +mul.f32 f1220, f1629, 0f3F000000; +sub.f32 f1221, f1125, f1220; +sub.f32 f1222, f1170, f1175; +mul.f32 f1223, f1222, 0fBF5DB3D7; +sub.f32 f1224, f1221, f1223; +add.f32 f1225, f1223, f1221; +add.f32 f1226, f951, f960; +add.f32 f1227, f942, f1226; +mul.f32 f1230, f1226, 0f3F000000; +sub.f32 f1231, f942, f1230; +add.f32 f1627, f978, f987; +sub.f32 f1232, f978, f987; +mul.f32 f1233, f1232, 0fBF5DB3D7; +add.f32 f1234, f1233, f1231; +sub.f32 f1235, f1231, f1233; +add.f32 f1626, f969, f1627; +mul.f32 f1236, f1627, 0f3F000000; +sub.f32 f1237, f969, f1236; +sub.f32 f1238, f951, f960; +mul.f32 f1239, f1238, 0fBF5DB3D7; +sub.f32 f1240, f1237, f1239; +add.f32 f1241, f1239, f1237; +add.f32 f1242, f954, f963; +add.f32 f1243, f945, f1242; +mul.f32 f1246, f1242, 0f3F000000; +sub.f32 f1247, f945, f1246; +add.f32 f1625, f981, f990; +sub.f32 f1248, f981, 
f990; +mul.f32 f1249, f1248, 0fBF5DB3D7; +add.f32 f1250, f1249, f1247; +sub.f32 f1251, f1247, f1249; +add.f32 f1624, f972, f1625; +mul.f32 f1252, f1625, 0f3F000000; +sub.f32 f1253, f972, f1252; +sub.f32 f1254, f954, f963; +mul.f32 f1255, f1254, 0fBF5DB3D7; +sub.f32 f1256, f1253, f1255; +add.f32 f1257, f1255, f1253; +add.f32 f1258, f957, f966; +add.f32 f1259, f948, f1258; +mul.f32 f1262, f1258, 0f3F000000; +sub.f32 f1263, f948, f1262; +add.f32 f1623, f984, f993; +sub.f32 f1264, f984, f993; +mul.f32 f1265, f1264, 0fBF5DB3D7; +add.f32 f1266, f1265, f1263; +sub.f32 f1267, f1263, f1265; +add.f32 f1622, f975, f1623; +mul.f32 f1268, f1623, 0f3F000000; +sub.f32 f1269, f975, f1268; +sub.f32 f1270, f957, f966; +mul.f32 f1271, f1270, 0fBF5DB3D7; +sub.f32 f1272, f1269, f1271; +add.f32 f1273, f1271, f1269; +mul.f32 f1275, f1256, 0f3F248DBB; +mul.f32 f1621, f1250, 0f3F441B7D; +sub.f32 f1276, f1621, f1275; +mul.f32 f1277, f1256, 0f3F441B7D; +fma.rn.f32 f1278, f1250, 0f3F248DBB, f1277; +mul.f32 f1619, f1266, 0f3E31D0D4; +mul.f32 f1620, f1272, 0f3F7C1C5C; +sub.f32 f1281, f1619, f1620; +mul.f32 f1282, f1272, 0f3E31D0D4; +fma.rn.f32 f1283, f1266, 0f3F7C1C5C, f1282; +mul.f32 f1617, f1251, 0f3E31D0D4; +mul.f32 f1618, f1257, 0f3F7C1C5C; +sub.f32 f1286, f1617, f1618; +mul.f32 f1287, f1257, 0f3E31D0D4; +fma.rn.f32 f1288, f1251, 0f3F7C1C5C, f1287; +mul.f32 f1615, f1267, 0fBF708FB2; +mul.f32 f1616, f1273, 0f3EAF1D44; +sub.f32 f1291, f1615, f1616; +mul.f32 f1292, f1273, 0fBF708FB2; +fma.rn.f32 f1293, f1267, 0f3EAF1D44, f1292; +add.f32 f1294, f1243, f1259; +add.f32 f1295, f1227, f1294; +mul.f32 f1298, f1294, 0f3F000000; +sub.f32 f1299, f1227, f1298; +add.f32 f1614, f1624, f1622; +sub.f32 f1300, f1624, f1622; +mul.f32 f1301, f1300, 0fBF5DB3D7; +add.f32 f1302, f1301, f1299; +sub.f32 f1303, f1299, f1301; +add.f32 f1613, f1626, f1614; +mul.f32 f1304, f1614, 0f3F000000; +sub.f32 f1305, f1626, f1304; +sub.f32 f1306, f1243, f1259; +mul.f32 f1307, f1306, 0fBF5DB3D7; +sub.f32 f1308, f1305, f1307; 
+add.f32 f1309, f1307, f1305; +add.f32 f1310, f1276, f1281; +add.f32 f1311, f1234, f1310; +mul.f32 f1314, f1310, 0f3F000000; +sub.f32 f1315, f1234, f1314; +add.f32 f1612, f1278, f1283; +sub.f32 f1316, f1278, f1283; +mul.f32 f1317, f1316, 0fBF5DB3D7; +add.f32 f1318, f1317, f1315; +sub.f32 f1319, f1315, f1317; +add.f32 f1611, f1240, f1612; +mul.f32 f1320, f1612, 0f3F000000; +sub.f32 f1321, f1240, f1320; +sub.f32 f1322, f1276, f1281; +mul.f32 f1323, f1322, 0fBF5DB3D7; +sub.f32 f1324, f1321, f1323; +add.f32 f1325, f1323, f1321; +add.f32 f1326, f1286, f1291; +add.f32 f1327, f1235, f1326; +mul.f32 f1330, f1326, 0f3F000000; +sub.f32 f1331, f1235, f1330; +add.f32 f1610, f1288, f1293; +sub.f32 f1332, f1288, f1293; +mul.f32 f1333, f1332, 0fBF5DB3D7; +add.f32 f1334, f1333, f1331; +sub.f32 f1335, f1331, f1333; +add.f32 f1609, f1241, f1610; +mul.f32 f1336, f1610, 0f3F000000; +sub.f32 f1337, f1241, f1336; +sub.f32 f1338, f1286, f1291; +mul.f32 f1339, f1338, 0fBF5DB3D7; +sub.f32 f1340, f1337, f1339; +add.f32 f1341, f1339, f1337; +mul.f32 f1343, f1630, 0f3E6C2691; +mul.f32 f1608, f1195, 0f3F791978; +sub.f32 f1344, f1608, f1343; +mul.f32 f1345, f1630, 0f3F791978; +fma.rn.f32 f1346, f1195, 0f3E6C2691, f1345; +mul.f32 f1348, f1611, 0f3EE5C902; +mul.f32 f1607, f1311, 0f3F64C51C; +sub.f32 f1349, f1607, f1348; +mul.f32 f1350, f1611, 0f3F64C51C; +fma.rn.f32 f1351, f1311, 0f3EE5C902, f1350; +mul.f32 f1353, f1628, 0f3EE5C902; +mul.f32 f1606, f1211, 0f3F64C51C; +sub.f32 f1354, f1606, f1353; +mul.f32 f1355, f1628, 0f3F64C51C; +fma.rn.f32 f1356, f1211, 0f3EE5C902, f1355; +mul.f32 f1358, f1609, 0f3F4D57F2; +mul.f32 f1605, f1327, 0f3F18DF63; +sub.f32 f1359, f1605, f1358; +mul.f32 f1360, f1609, 0f3F18DF63; +fma.rn.f32 f1361, f1327, 0f3F4D57F2, f1360; +mul.f32 f1603, f1186, 0f3F441B7D; +mul.f32 f1604, f1192, 0f3F248DBB; +sub.f32 f1364, f1603, f1604; +mul.f32 f1365, f1192, 0f3F441B7D; +fma.rn.f32 f1366, f1186, 0f3F248DBB, f1365; +mul.f32 f1601, f1302, 0f3E31D0D4; +mul.f32 f1602, f1308, 0f3F7C1C5C; 
+sub.f32 f1369, f1601, f1602; +mul.f32 f1370, f1308, 0f3E31D0D4; +fma.rn.f32 f1371, f1302, 0f3F7C1C5C, f1370; +mul.f32 f1599, f1202, 0f3F18DF63; +mul.f32 f1600, f1208, 0f3F4D57F2; +sub.f32 f1374, f1599, f1600; +mul.f32 f1375, f1208, 0f3F18DF63; +fma.rn.f32 f1376, f1202, 0f3F4D57F2, f1375; +mul.f32 f1597, f1318, 0fBE92D7E0; +mul.f32 f1598, f1324, 0f3F753ECD; +sub.f32 f1379, f1597, f1598; +mul.f32 f1380, f1324, 0fBE92D7E0; +fma.rn.f32 f1381, f1318, 0f3F753ECD, f1380; +mul.f32 f1383, f1224, 0f3F6B1036; +mul.f32 f1596, f1218, 0f3ECACAF8; +sub.f32 f1384, f1596, f1383; +mul.f32 f1385, f1224, 0f3ECACAF8; +fma.rn.f32 f1386, f1218, 0f3F6B1036, f1385; +mul.f32 f1388, f1340, 0f3F3A3529; +mul.f32 f1595, f1334, 0fBF2FAD88; +sub.f32 f1389, f1595, f1388; +mul.f32 f1390, f1340, 0fBF2FAD88; +fma.rn.f32 f1391, f1334, 0f3F3A3529, f1390; +mul.f32 f1393, f1193, 0f3F7C1C5C; +mul.f32 f1594, f1187, 0f3E31D0D4; +sub.f32 f1394, f1594, f1393; +mul.f32 f1395, f1193, 0f3E31D0D4; +fma.rn.f32 f1396, f1187, 0f3F7C1C5C, f1395; +mul.f32 f1398, f1309, 0f3EAF1D44; +mul.f32 f1593, f1303, 0fBF708FB2; +sub.f32 f1399, f1593, f1398; +mul.f32 f1400, f1309, 0fBF708FB2; +fma.rn.f32 f1401, f1303, 0f3EAF1D44, f1400; +mul.f32 f1403, f1209, 0f3F7F9120; +mul.f32 f1592, f1203, 0fBD6E2946; +sub.f32 f1404, f1592, f1403; +mul.f32 f1405, f1209, 0fBD6E2946; +fma.rn.f32 f1406, f1203, 0f3F7F9120, f1405; +mul.f32 f1590, f1319, 0fBF7E44DE; +mul.f32 f1591, f1325, 0fBDEDC21F; +sub.f32 f1409, f1590, f1591; +mul.f32 f1410, f1325, 0fBF7E44DE; +fma.rn.f32 f1411, f1319, 0fBDEDC21F, f1410; +mul.f32 f1588, f1219, 0fBE92D7E0; +mul.f32 f1589, f1225, 0f3F753ECD; +sub.f32 f1414, f1588, f1589; +mul.f32 f1415, f1225, 0fBE92D7E0; +fma.rn.f32 f1416, f1219, 0f3F753ECD, f1415; +mul.f32 f1586, f1335, 0fBF55E287; +mul.f32 f1587, f1341, 0fBF0CAC9F; +sub.f32 f1419, f1586, f1587; +mul.f32 f1420, f1341, 0fBF55E287; +fma.rn.f32 f1421, f1335, 0fBF0CAC9F, f1420; +add.f32 f1422, f1179, f1295; +mul.f32 f1424, f1422, 0f3F000000; +sub.f32 f1425, f1063, 
f1424; +add.f32 f1585, f1632, f1613; +sub.f32 f1426, f1632, f1613; +mul.f32 f1427, f1426, 0fBF5DB3D7; +mul.f32 f1428, f1585, 0f3F000000; +sub.f32 f1429, f1651, f1428; +sub.f32 f1430, f1179, f1295; +mul.f32 f1431, f1430, 0fBF5DB3D7; +add.f32 f1432, f1344, f1349; +mul.f32 f1434, f1432, 0f3F000000; +sub.f32 f1435, f1079, f1434; +add.f32 f1584, f1346, f1351; +sub.f32 f1436, f1346, f1351; +mul.f32 f1437, f1436, 0fBF5DB3D7; +mul.f32 f1438, f1584, 0f3F000000; +sub.f32 f1439, f1649, f1438; +sub.f32 f1440, f1344, f1349; +mul.f32 f1441, f1440, 0fBF5DB3D7; +add.f32 f1442, f1354, f1359; +mul.f32 f1444, f1442, 0f3F000000; +sub.f32 f1445, f1095, f1444; +add.f32 f1583, f1356, f1361; +sub.f32 f1446, f1356, f1361; +mul.f32 f1447, f1446, 0fBF5DB3D7; +mul.f32 f1448, f1583, 0f3F000000; +sub.f32 f1449, f1647, f1448; +sub.f32 f1450, f1354, f1359; +mul.f32 f1451, f1450, 0fBF5DB3D7; +add.f32 f1452, f1364, f1369; +mul.f32 f1454, f1452, 0f3F000000; +sub.f32 f1455, f1070, f1454; +add.f32 f1582, f1366, f1371; +sub.f32 f1456, f1366, f1371; +mul.f32 f1457, f1456, 0fBF5DB3D7; +mul.f32 f1458, f1582, 0f3F000000; +sub.f32 f1459, f1076, f1458; +sub.f32 f1460, f1364, f1369; +mul.f32 f1461, f1460, 0fBF5DB3D7; +add.f32 f1462, f1374, f1379; +mul.f32 f1464, f1462, 0f3F000000; +sub.f32 f1465, f1086, f1464; +add.f32 f1581, f1376, f1381; +sub.f32 f1466, f1376, f1381; +mul.f32 f1467, f1466, 0fBF5DB3D7; +mul.f32 f1468, f1581, 0f3F000000; +sub.f32 f1469, f1092, f1468; +sub.f32 f1470, f1374, f1379; +mul.f32 f1471, f1470, 0fBF5DB3D7; +add.f32 f1472, f1384, f1389; +mul.f32 f1474, f1472, 0f3F000000; +sub.f32 f1475, f1102, f1474; +add.f32 f1580, f1386, f1391; +sub.f32 f1476, f1386, f1391; +mul.f32 f1477, f1476, 0fBF5DB3D7; +mul.f32 f1478, f1580, 0f3F000000; +sub.f32 f1479, f1108, f1478; +sub.f32 f1480, f1384, f1389; +mul.f32 f1481, f1480, 0fBF5DB3D7; +add.f32 f1482, f1394, f1399; +mul.f32 f1484, f1482, 0f3F000000; +sub.f32 f1485, f1071, f1484; +add.f32 f1579, f1396, f1401; +sub.f32 f1486, f1396, f1401; +mul.f32 
f1487, f1486, 0fBF5DB3D7; +mul.f32 f1488, f1579, 0f3F000000; +sub.f32 f1489, f1077, f1488; +sub.f32 f1490, f1394, f1399; +mul.f32 f1491, f1490, 0fBF5DB3D7; +add.f32 f1492, f1404, f1409; +mul.f32 f1494, f1492, 0f3F000000; +sub.f32 f1495, f1087, f1494; +add.f32 f1578, f1406, f1411; +sub.f32 f1496, f1406, f1411; +mul.f32 f1497, f1496, 0fBF5DB3D7; +mul.f32 f1498, f1578, 0f3F000000; +sub.f32 f1499, f1093, f1498; +sub.f32 f1500, f1404, f1409; +mul.f32 f1501, f1500, 0fBF5DB3D7; +add.f32 f1502, f1414, f1419; +mul.f32 f1504, f1502, 0f3F000000; +sub.f32 f1505, f1103, f1504; +add.f32 f1577, f1416, f1421; +sub.f32 f1506, f1416, f1421; +mul.f32 f1507, f1506, 0fBF5DB3D7; +mul.f32 f1508, f1577, 0f3F000000; +sub.f32 f1509, f1109, f1508; +sub.f32 f1510, f1414, f1419; +mul.f32 f1817, f1580, 0f3F000000; +sub.f32 f1816, f1108, f1817; +mul.f32 f1511, f1510, 0fBF5DB3D7; +add.f32 %0, f1063, f1422; +mul.f32 f1819, f1452, 0f3F000000; +sub.f32 f1818, f1070, f1819; +add.f32 %1, f1651, f1585; +mul.f32 f1821, f1581, 0f3F000000; +sub.f32 f1820, f1092, f1821; +mul.f32 f1823, f1581, 0f3F000000; +sub.f32 f1822, f1092, f1823; +add.f32 %3, f1649, f1584; +add.f32 %2, f1079, f1432; +add.f32 %5, f1647, f1583; +add.f32 %4, f1095, f1442; +add.f32 %7, f1076, f1582; +add.f32 %6, f1070, f1452; +add.f32 %9, f1092, f1581; +add.f32 %8, f1086, f1462; +add.f32 %11, f1108, f1580; +add.f32 %10, f1102, f1472; +add.f32 %13, f1077, f1579; +add.f32 %12, f1071, f1482; +add.f32 %15, f1093, f1578; +add.f32 %14, f1087, f1492; +add.f32 %17, f1109, f1577; +add.f32 %16, f1103, f1502; +sub.f32 %19, f1429, f1431; +add.f32 %18, f1427, f1425; +add.f32 %20, f1437, f1435; +sub.f32 %21, f1439, f1441; +add.f32 %22, f1447, f1445; +sub.f32 %23, f1449, f1451; +add.f32 %24, f1457, f1818; +sub.f32 %25, f1459, f1461; +sub.f32 %27, f1822, f1471; +add.f32 %26, f1467, f1465; +sub.f32 %29, f1816, f1481; +add.f32 %28, f1477, f1475; +add.f32 %30, f1487, f1485; +sub.f32 %31, f1489, f1491; +add.f32 %32, f1497, f1495; +sub.f32 %33, f1499, f1501; 
+add.f32 %34, f1507, f1505; +sub.f32 %35, f1509, f1511; +sub.f32 %36, f1425, f1427; +add.f32 %37, f1431, f1429; +add.f32 %39, f1441, f1439; +sub.f32 %38, f1435, f1437; +add.f32 %41, f1451, f1449; +sub.f32 %40, f1445, f1447; +add.f32 %43, f1461, f1459; +sub.f32 %42, f1818, f1457; +add.f32 %45, f1471, f1822; +sub.f32 %44, f1465, f1467; +add.f32 %47, f1481, f1816; +sub.f32 %46, f1475, f1477; +add.f32 %49, f1491, f1489; +sub.f32 %48, f1485, f1487; +add.f32 %51, f1501, f1499; +sub.f32 %50, f1495, f1497; +add.f32 %53, f1511, f1509; +sub.f32 %52, f1505, f1507; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_729), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), 
"f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[20].y), "f"(rmem[11].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<342, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<615>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 5832, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, 
%26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, 
f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 
f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r9+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r9+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r9+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r9+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r9+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r9+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r9+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r9+64], {f230, f231}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+648]; +ld.shared.v2.f32 {f240, f241}, [r11+1296]; +ld.shared.v2.f32 {f244, f245}, [r11+1944]; +ld.shared.v2.f32 {f248, f249}, [r11+2592]; +ld.shared.v2.f32 {f252, f253}, [r11+3240]; +ld.shared.v2.f32 {f256, f257}, [r11+3888]; +ld.shared.v2.f32 {f260, f261}, [r11+4536]; +ld.shared.v2.f32 {f264, f265}, [r11+5184]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0fBF5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0fBF5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, 
f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0fBF5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0fBF5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0fBF5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0fBF5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0f3F248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0f3F248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0f3F7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0f3F7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0f3F7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0f3F7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0f3EAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0f3EAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f337, 0f3F000000; +sub.f32 f345, f271, f344; +sub.f32 f346, f285, f301; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; 
+add.f32 f350, f318, f323; +add.f32 f351, f276, f350; +add.f32 f352, f320, f325; +add.f32 f353, f282, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f276, f354; +sub.f32 f356, f320, f325; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f282, f360; +sub.f32 f362, f318, f323; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +add.f32 f366, f328, f333; +add.f32 f367, f277, f366; +add.f32 f368, f330, f335; +add.f32 f369, f283, f368; +mul.f32 f370, f366, 0f3F000000; +sub.f32 f371, f277, f370; +sub.f32 f372, f330, f335; +mul.f32 f373, f372, 0fBF5DB3D7; +add.f32 f374, f373, f371; +sub.f32 f375, f371, f373; +mul.f32 f376, f368, 0f3F000000; +sub.f32 f377, f283, f376; +sub.f32 f378, f328, f333; +mul.f32 f379, f378, 0fBF5DB3D7; +sub.f32 f380, f377, f379; +add.f32 f381, f379, f377; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f382, f383}, [rd11]; +mul.f32 f386, f353, f383; +mul.f32 f387, f351, f383; +mul.f32 f388, f382, f353; +mul.f32 f389, f382, f382; +mul.f32 f390, f383, f383; +sub.f32 f391, f389, f390; +mul.f32 f392, f383, f382; +fma.rn.f32 f393, f383, f382, f392; +mul.f32 f394, f369, f393; +mul.f32 f395, f367, f393; +mul.f32 f396, f391, f369; +mul.f32 f397, f382, f391; +mul.f32 f398, f383, f393; +sub.f32 f399, f397, f398; +mul.f32 f400, f382, f393; +fma.rn.f32 f401, f383, f391, f400; +mul.f32 f402, f348, f401; +mul.f32 f403, f342, f401; +mul.f32 f404, f399, f348; +mul.f32 f405, f382, f399; +mul.f32 f406, f383, f401; +sub.f32 f407, f405, f406; +mul.f32 f408, f382, f401; +fma.rn.f32 f409, f383, f399, f408; +mul.f32 f410, f364, f409; +mul.f32 f411, f358, f409; +mul.f32 f412, f407, f364; +mul.f32 f413, f382, f407; +mul.f32 f414, f383, f409; +sub.f32 f415, f413, f414; +mul.f32 
f416, f382, f409; +fma.rn.f32 f417, f383, f407, f416; +mul.f32 f418, f380, f417; +mul.f32 f419, f374, f417; +mul.f32 f420, f415, f380; +mul.f32 f421, f382, f415; +mul.f32 f422, f383, f417; +sub.f32 f423, f421, f422; +mul.f32 f424, f382, f417; +fma.rn.f32 f425, f383, f415, f424; +mul.f32 f426, f349, f425; +mul.f32 f427, f343, f425; +mul.f32 f428, f423, f349; +mul.f32 f429, f382, f423; +mul.f32 f430, f383, f425; +sub.f32 f431, f429, f430; +mul.f32 f432, f382, f425; +fma.rn.f32 f433, f383, f423, f432; +mul.f32 f434, f365, f433; +mul.f32 f435, f359, f433; +mul.f32 f436, f431, f365; +mul.f32 f437, f382, f431; +mul.f32 f438, f383, f433; +sub.f32 f439, f437, f438; +mul.f32 f440, f382, f433; +fma.rn.f32 f441, f383, f431, f440; +mul.f32 f442, f381, f441; +mul.f32 f443, f375, f441; +mul.f32 f444, f439, f381; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +add.f32 f445, f271, f337; +add.f32 f446, f269, f336; +st.shared.v2.f32 [r17], {f446, f445}; +fma.rn.f32 f447, f382, f351, f386; +sub.f32 f448, f388, f387; +st.shared.v2.f32 [r17+72], {f447, f448}; +fma.rn.f32 f449, f391, f367, f394; +sub.f32 f450, f396, f395; +st.shared.v2.f32 [r17+144], {f449, f450}; +fma.rn.f32 f451, f399, f342, f402; +sub.f32 f452, f404, f403; +st.shared.v2.f32 [r17+216], {f451, f452}; +fma.rn.f32 f453, f407, f358, f410; +sub.f32 f454, f412, f411; +st.shared.v2.f32 [r17+288], {f453, f454}; +fma.rn.f32 f455, f415, f374, f418; +sub.f32 f456, f420, f419; +st.shared.v2.f32 [r17+360], {f455, f456}; +fma.rn.f32 f457, f423, f343, f426; +sub.f32 f458, f428, f427; +st.shared.v2.f32 [r17+432], {f457, f458}; +sub.f32 f459, f436, f435; +fma.rn.f32 f460, f431, f359, f434; +st.shared.v2.f32 [r17+504], {f460, f459}; +fma.rn.f32 f461, f439, f375, f442; +sub.f32 f462, f444, f443; +st.shared.v2.f32 [r17+576], {f461, f462}; +barrier.sync 0; +ld.shared.v2.f32 {f463, f464}, [r11]; +ld.shared.v2.f32 {f467, f468}, [r11+648]; +ld.shared.v2.f32 {f471, f472}, [r11+1296]; 
+ld.shared.v2.f32 {f475, f476}, [r11+1944]; +ld.shared.v2.f32 {f479, f480}, [r11+2592]; +ld.shared.v2.f32 {f483, f484}, [r11+3240]; +ld.shared.v2.f32 {f487, f488}, [r11+3888]; +ld.shared.v2.f32 {f491, f492}, [r11+4536]; +ld.shared.v2.f32 {f495, f496}, [r11+5184]; +add.f32 f499, f475, f487; +add.f32 f500, f463, f499; +add.f32 f501, f476, f488; +add.f32 f502, f464, f501; +mul.f32 f503, f499, 0f3F000000; +sub.f32 f504, f463, f503; +sub.f32 f505, f476, f488; +mul.f32 f506, f505, 0fBF5DB3D7; +add.f32 f507, f506, f504; +sub.f32 f508, f504, f506; +mul.f32 f509, f501, 0f3F000000; +sub.f32 f510, f464, f509; +sub.f32 f511, f475, f487; +mul.f32 f512, f511, 0fBF5DB3D7; +sub.f32 f513, f510, f512; +add.f32 f514, f512, f510; +add.f32 f515, f479, f491; +add.f32 f516, f467, f515; +add.f32 f517, f480, f492; +add.f32 f518, f468, f517; +mul.f32 f519, f515, 0f3F000000; +sub.f32 f520, f467, f519; +sub.f32 f521, f480, f492; +mul.f32 f522, f521, 0fBF5DB3D7; +add.f32 f523, f522, f520; +sub.f32 f524, f520, f522; +mul.f32 f525, f517, 0f3F000000; +sub.f32 f526, f468, f525; +sub.f32 f527, f479, f491; +mul.f32 f528, f527, 0fBF5DB3D7; +sub.f32 f529, f526, f528; +add.f32 f530, f528, f526; +add.f32 f531, f483, f495; +add.f32 f532, f471, f531; +add.f32 f533, f484, f496; +add.f32 f534, f472, f533; +mul.f32 f535, f531, 0f3F000000; +sub.f32 f536, f471, f535; +sub.f32 f537, f484, f496; +mul.f32 f538, f537, 0fBF5DB3D7; +add.f32 f539, f538, f536; +sub.f32 f540, f536, f538; +mul.f32 f541, f533, 0f3F000000; +sub.f32 f542, f472, f541; +sub.f32 f543, f483, f495; +mul.f32 f544, f543, 0fBF5DB3D7; +sub.f32 f545, f542, f544; +add.f32 f546, f544, f542; +mul.f32 f547, f523, 0f3F441B7D; +mul.f32 f548, f529, 0f3F248DBB; +sub.f32 f549, f547, f548; +mul.f32 f550, f529, 0f3F441B7D; +fma.rn.f32 f551, f523, 0f3F248DBB, f550; +mul.f32 f552, f539, 0f3E31D0D4; +mul.f32 f553, f545, 0f3F7C1C5C; +sub.f32 f554, f552, f553; +mul.f32 f555, f545, 0f3E31D0D4; +fma.rn.f32 f556, f539, 0f3F7C1C5C, f555; +mul.f32 f557, f524, 
0f3E31D0D4; +mul.f32 f558, f530, 0f3F7C1C5C; +sub.f32 f559, f557, f558; +mul.f32 f560, f530, 0f3E31D0D4; +fma.rn.f32 f561, f524, 0f3F7C1C5C, f560; +mul.f32 f562, f540, 0fBF708FB2; +mul.f32 f563, f546, 0f3EAF1D44; +sub.f32 f564, f562, f563; +mul.f32 f565, f546, 0fBF708FB2; +fma.rn.f32 f566, f540, 0f3EAF1D44, f565; +add.f32 f567, f516, f532; +add.f32 f568, f518, f534; +mul.f32 f569, f567, 0f3F000000; +sub.f32 f570, f500, f569; +sub.f32 f571, f518, f534; +mul.f32 f572, f571, 0fBF5DB3D7; +mul.f32 f573, f568, 0f3F000000; +sub.f32 f574, f502, f573; +sub.f32 f575, f516, f532; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f549, f554; +add.f32 f578, f551, f556; +mul.f32 f579, f577, 0f3F000000; +sub.f32 f580, f507, f579; +sub.f32 f581, f551, f556; +mul.f32 f582, f581, 0fBF5DB3D7; +mul.f32 f583, f578, 0f3F000000; +sub.f32 f584, f513, f583; +sub.f32 f585, f549, f554; +mul.f32 f586, f585, 0fBF5DB3D7; +add.f32 f587, f559, f564; +add.f32 f588, f561, f566; +mul.f32 f589, f587, 0f3F000000; +sub.f32 f590, f508, f589; +sub.f32 f591, f561, f566; +mul.f32 f592, f591, 0fBF5DB3D7; +mul.f32 f593, f588, 0f3F000000; +sub.f32 f594, f514, f593; +sub.f32 f595, f559, f564; +mul.f32 f596, f595, 0fBF5DB3D7; +add.f32 %1, f502, f568; +add.f32 %0, f500, f567; +add.f32 %3, f513, f578; +add.f32 %2, f507, f577; +add.f32 %5, f514, f588; +add.f32 %4, f508, f587; +sub.f32 %7, f574, f576; +add.f32 %6, f572, f570; +sub.f32 %9, f584, f586; +add.f32 %8, f582, f580; +sub.f32 %11, f594, f596; +add.f32 %10, f592, f590; +add.f32 %13, f576, f574; +sub.f32 %12, f570, f572; +add.f32 %15, f586, f584; +sub.f32 %14, f580, f582; +add.f32 %17, f596, f594; +sub.f32 %16, f590, f592; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_729), 
"l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<343, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<579>; +.reg .b32 r<18>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 2916, r2; +add.f32 f37, %29, %37; +add.f32 f38, %21, f37; +add.f32 f39, %30, %38; +add.f32 f40, %22, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %21, f41; +sub.f32 f43, %30, %38; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %22, f47; +sub.f32 f49, %29, %37; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %31, %39; +add.f32 f54, %23, f53; +add.f32 f55, %33, %41; +add.f32 f56, %25, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %23, f57; +sub.f32 f59, %33, %41; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %25, f63; +sub.f32 f65, %31, %39; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %34, %42; +add.f32 f70, %26, f69; +add.f32 f71, %36, %43; +add.f32 f72, %28, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %26, f73; +sub.f32 f75, %36, %43; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %28, f79; +sub.f32 f81, %34, %42; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; 
+sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 
r6, r5, 81; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; +mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 
f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r8, r5, 2916, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f158; +st.shared.f32 [r9+8], f168; +st.shared.f32 [r9+12], f178; +st.shared.f32 [r9+16], f188; +st.shared.f32 [r9+20], f198; +st.shared.f32 [r9+24], f208; +st.shared.f32 [r9+28], f218; +st.shared.f32 [r9+32], f228; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+324]; +ld.shared.f32 f234, [r11+648]; +ld.shared.f32 f235, [r11+972]; +ld.shared.f32 f236, [r11+1296]; +ld.shared.f32 f237, [r11+1620]; +ld.shared.f32 f238, [r11+1944]; +ld.shared.f32 f239, [r11+2268]; +ld.shared.f32 f240, [r11+2592]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+324]; +ld.shared.f32 f243, [r11+648]; +ld.shared.f32 f244, [r11+972]; +ld.shared.f32 f245, [r11+1296]; +ld.shared.f32 f246, [r11+1620]; +ld.shared.f32 f247, [r11+1944]; +ld.shared.f32 f248, [r11+2268]; +ld.shared.f32 f249, [r11+2592]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0fBF5DB3D7; +sub.f32 
f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0fBF5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0fBF5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0fBF5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0fBF5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0f3F248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0f3F248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0f3F7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0f3F7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0f3F7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0f3F7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0f3EAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0f3EAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f251, f318; +add.f32 f320, f269, f285; +add.f32 f321, f253, f320; +mul.f32 f322, f318, 0f3F000000; +sub.f32 f323, f251, f322; +sub.f32 f324, f269, f285; +mul.f32 f325, f324, 0fBF5DB3D7; +add.f32 f326, f325, f323; +sub.f32 f327, f323, f325; +mul.f32 f328, f320, 0f3F000000; +sub.f32 f329, f253, f328; +sub.f32 f330, f267, f283; 
+mul.f32 f331, f330, 0fBF5DB3D7; +sub.f32 f332, f329, f331; +add.f32 f333, f331, f329; +add.f32 f334, f300, f305; +add.f32 f335, f258, f334; +add.f32 f336, f302, f307; +add.f32 f337, f264, f336; +mul.f32 f338, f334, 0f3F000000; +sub.f32 f339, f258, f338; +sub.f32 f340, f302, f307; +mul.f32 f341, f340, 0fBF5DB3D7; +add.f32 f342, f341, f339; +sub.f32 f343, f339, f341; +mul.f32 f344, f336, 0f3F000000; +sub.f32 f345, f264, f344; +sub.f32 f346, f300, f305; +mul.f32 f347, f346, 0fBF5DB3D7; +sub.f32 f348, f345, f347; +add.f32 f349, f347, f345; +add.f32 f350, f310, f315; +add.f32 f351, f259, f350; +add.f32 f352, f312, f317; +add.f32 f353, f265, f352; +mul.f32 f354, f350, 0f3F000000; +sub.f32 f355, f259, f354; +sub.f32 f356, f312, f317; +mul.f32 f357, f356, 0fBF5DB3D7; +add.f32 f358, f357, f355; +sub.f32 f359, f355, f357; +mul.f32 f360, f352, 0f3F000000; +sub.f32 f361, f265, f360; +sub.f32 f362, f310, f315; +mul.f32 f363, f362, 0fBF5DB3D7; +sub.f32 f364, f361, f363; +add.f32 f365, f363, f361; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f366, f367}, [rd11]; +mul.f32 f370, f337, f367; +fma.rn.f32 f371, f366, f335, f370; +mul.f32 f372, f335, f367; +mul.f32 f373, f366, f337; +sub.f32 f374, f373, f372; +mul.f32 f375, f366, f366; +mul.f32 f376, f367, f367; +sub.f32 f377, f375, f376; +mul.f32 f378, f367, f366; +fma.rn.f32 f379, f367, f366, f378; +mul.f32 f380, f353, f379; +fma.rn.f32 f381, f377, f351, f380; +mul.f32 f382, f351, f379; +mul.f32 f383, f377, f353; +sub.f32 f384, f383, f382; +mul.f32 f385, f366, f377; +mul.f32 f386, f367, f379; +sub.f32 f387, f385, f386; +mul.f32 f388, f366, f379; +fma.rn.f32 f389, f367, f377, f388; +mul.f32 f390, f332, f389; +fma.rn.f32 f391, f387, f326, f390; +mul.f32 f392, f326, f389; +mul.f32 f393, f387, f332; +sub.f32 f394, f393, f392; +mul.f32 f395, f366, f387; +mul.f32 
f396, f367, f389; +sub.f32 f397, f395, f396; +mul.f32 f398, f366, f389; +fma.rn.f32 f399, f367, f387, f398; +mul.f32 f400, f348, f399; +fma.rn.f32 f401, f397, f342, f400; +mul.f32 f402, f342, f399; +mul.f32 f403, f397, f348; +sub.f32 f404, f403, f402; +mul.f32 f405, f366, f397; +mul.f32 f406, f367, f399; +sub.f32 f407, f405, f406; +mul.f32 f408, f366, f399; +fma.rn.f32 f409, f367, f397, f408; +mul.f32 f410, f364, f409; +fma.rn.f32 f411, f407, f358, f410; +mul.f32 f412, f358, f409; +mul.f32 f413, f407, f364; +sub.f32 f414, f413, f412; +mul.f32 f415, f366, f407; +mul.f32 f416, f367, f409; +sub.f32 f417, f415, f416; +mul.f32 f418, f366, f409; +fma.rn.f32 f419, f367, f407, f418; +mul.f32 f420, f333, f419; +fma.rn.f32 f421, f417, f327, f420; +mul.f32 f422, f327, f419; +mul.f32 f423, f417, f333; +sub.f32 f424, f423, f422; +mul.f32 f425, f366, f417; +mul.f32 f426, f367, f419; +sub.f32 f427, f425, f426; +mul.f32 f428, f366, f419; +fma.rn.f32 f429, f367, f417, f428; +mul.f32 f430, f349, f429; +fma.rn.f32 f431, f427, f343, f430; +mul.f32 f432, f343, f429; +mul.f32 f433, f427, f349; +sub.f32 f434, f433, f432; +mul.f32 f435, f366, f427; +mul.f32 f436, f367, f429; +sub.f32 f437, f435, f436; +mul.f32 f438, f366, f429; +fma.rn.f32 f439, f367, f427, f438; +mul.f32 f440, f365, f439; +fma.rn.f32 f441, f437, f359, f440; +mul.f32 f442, f359, f439; +mul.f32 f443, f437, f365; +sub.f32 f444, f443, f442; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 324, r16; +st.shared.f32 [r17], f319; +st.shared.f32 [r17+36], f371; +st.shared.f32 [r17+72], f381; +st.shared.f32 [r17+108], f391; +st.shared.f32 [r17+144], f401; +st.shared.f32 [r17+180], f411; +st.shared.f32 [r17+216], f421; +st.shared.f32 [r17+252], f431; +st.shared.f32 [r17+288], f441; +barrier.sync 0; +ld.shared.f32 f445, [r11]; +ld.shared.f32 f446, [r11+324]; +ld.shared.f32 f447, [r11+648]; +ld.shared.f32 f448, [r11+972]; +ld.shared.f32 f449, [r11+1296]; +ld.shared.f32 f450, [r11+1620]; 
+ld.shared.f32 f451, [r11+1944]; +ld.shared.f32 f452, [r11+2268]; +ld.shared.f32 f453, [r11+2592]; +barrier.sync 0; +st.shared.f32 [r17], f321; +st.shared.f32 [r17+36], f374; +st.shared.f32 [r17+72], f384; +st.shared.f32 [r17+108], f394; +st.shared.f32 [r17+144], f404; +st.shared.f32 [r17+180], f414; +st.shared.f32 [r17+216], f424; +st.shared.f32 [r17+252], f434; +st.shared.f32 [r17+288], f444; +barrier.sync 0; +ld.shared.f32 f454, [r11]; +ld.shared.f32 f455, [r11+324]; +ld.shared.f32 f456, [r11+648]; +ld.shared.f32 f457, [r11+972]; +ld.shared.f32 f458, [r11+1296]; +ld.shared.f32 f459, [r11+1620]; +ld.shared.f32 f460, [r11+1944]; +ld.shared.f32 f461, [r11+2268]; +ld.shared.f32 f462, [r11+2592]; +add.f32 f463, f448, f451; +add.f32 f464, f445, f463; +add.f32 f465, f457, f460; +add.f32 f466, f454, f465; +mul.f32 f467, f463, 0f3F000000; +sub.f32 f468, f445, f467; +sub.f32 f469, f457, f460; +mul.f32 f470, f469, 0fBF5DB3D7; +add.f32 f471, f470, f468; +sub.f32 f472, f468, f470; +mul.f32 f473, f465, 0f3F000000; +sub.f32 f474, f454, f473; +sub.f32 f475, f448, f451; +mul.f32 f476, f475, 0fBF5DB3D7; +sub.f32 f477, f474, f476; +add.f32 f478, f476, f474; +add.f32 f479, f449, f452; +add.f32 f480, f446, f479; +add.f32 f481, f458, f461; +add.f32 f482, f455, f481; +mul.f32 f483, f479, 0f3F000000; +sub.f32 f484, f446, f483; +sub.f32 f485, f458, f461; +mul.f32 f486, f485, 0fBF5DB3D7; +add.f32 f487, f486, f484; +sub.f32 f488, f484, f486; +mul.f32 f489, f481, 0f3F000000; +sub.f32 f490, f455, f489; +sub.f32 f491, f449, f452; +mul.f32 f492, f491, 0fBF5DB3D7; +sub.f32 f493, f490, f492; +add.f32 f494, f492, f490; +add.f32 f495, f450, f453; +add.f32 f496, f447, f495; +add.f32 f497, f459, f462; +add.f32 f498, f456, f497; +mul.f32 f499, f495, 0f3F000000; +sub.f32 f500, f447, f499; +sub.f32 f501, f459, f462; +mul.f32 f502, f501, 0fBF5DB3D7; +add.f32 f503, f502, f500; +sub.f32 f504, f500, f502; +mul.f32 f505, f497, 0f3F000000; +sub.f32 f506, f456, f505; +sub.f32 f507, f450, f453; +mul.f32 f508, 
f507, 0fBF5DB3D7; +sub.f32 f509, f506, f508; +add.f32 f510, f508, f506; +mul.f32 f511, f487, 0f3F441B7D; +mul.f32 f512, f493, 0f3F248DBB; +sub.f32 f513, f511, f512; +mul.f32 f514, f493, 0f3F441B7D; +fma.rn.f32 f515, f487, 0f3F248DBB, f514; +mul.f32 f516, f503, 0f3E31D0D4; +mul.f32 f517, f509, 0f3F7C1C5C; +sub.f32 f518, f516, f517; +mul.f32 f519, f509, 0f3E31D0D4; +fma.rn.f32 f520, f503, 0f3F7C1C5C, f519; +mul.f32 f521, f488, 0f3E31D0D4; +mul.f32 f522, f494, 0f3F7C1C5C; +sub.f32 f523, f521, f522; +mul.f32 f524, f494, 0f3E31D0D4; +fma.rn.f32 f525, f488, 0f3F7C1C5C, f524; +mul.f32 f526, f504, 0fBF708FB2; +mul.f32 f527, f510, 0f3EAF1D44; +sub.f32 f528, f526, f527; +mul.f32 f529, f510, 0fBF708FB2; +fma.rn.f32 f530, f504, 0f3EAF1D44, f529; +add.f32 f531, f480, f496; +add.f32 f532, f482, f498; +mul.f32 f533, f531, 0f3F000000; +sub.f32 f534, f464, f533; +sub.f32 f535, f482, f498; +mul.f32 f536, f535, 0fBF5DB3D7; +mul.f32 f537, f532, 0f3F000000; +sub.f32 f538, f466, f537; +sub.f32 f539, f480, f496; +mul.f32 f540, f539, 0fBF5DB3D7; +add.f32 f541, f513, f518; +add.f32 f542, f515, f520; +mul.f32 f543, f541, 0f3F000000; +sub.f32 f544, f471, f543; +sub.f32 f545, f515, f520; +mul.f32 f546, f545, 0fBF5DB3D7; +mul.f32 f547, f542, 0f3F000000; +sub.f32 f548, f477, f547; +sub.f32 f549, f513, f518; +mul.f32 f550, f549, 0fBF5DB3D7; +add.f32 f551, f523, f528; +add.f32 f552, f525, f530; +mul.f32 f553, f551, 0f3F000000; +sub.f32 f554, f472, f553; +sub.f32 f555, f525, f530; +mul.f32 f556, f555, 0fBF5DB3D7; +mul.f32 f557, f552, 0f3F000000; +sub.f32 f558, f478, f557; +sub.f32 f559, f523, f528; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 %0, f464, f531; +add.f32 %1, f466, f532; +add.f32 %3, f477, f542; +add.f32 %2, f471, f541; +add.f32 %5, f478, f552; +add.f32 %4, f472, f551; +add.f32 %6, f536, f534; +sub.f32 %7, f538, f540; +sub.f32 %9, f548, f550; +add.f32 %8, f546, f544; +sub.f32 %11, f558, f560; +add.f32 %10, f556, f554; +sub.f32 %12, f534, f536; +add.f32 %13, f540, f538; +add.f32 %15, f550, 
f548; +sub.f32 %14, f544, f546; +add.f32 %17, f560, f558; +sub.f32 %16, f554, f556; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_729), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<344, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<264>; +.reg .b32 r<40>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 5832, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %14, %17; +add.f32 f14, %16, %18; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %12, f15; +sub.f32 f17, %16, %18; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %13, f21; +sub.f32 f23, %14, %17; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, 
f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %13, f14; +add.f32 f43, %12, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r9+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r9+16], {f47, f46}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+1944]; +ld.shared.v2.f32 {f56, f57}, [r11+3888]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0fBF5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f72, f75; +mul.f32 f79, f66, f75; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f73, f85; +mul.f32 f87, f67, f85; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r17], {f90, f89}; +fma.rn.f32 f91, f74, f66, f78; +sub.f32 f92, f80, f79; +st.shared.v2.f32 [r17+24], {f91, f92}; +fma.rn.f32 f93, f83, f67, f86; +sub.f32 f94, f88, f87; +st.shared.v2.f32 [r17+48], {f93, f94}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r11]; +ld.shared.v2.f32 {f99, f100}, [r11+1944]; +ld.shared.v2.f32 {f103, f104}, [r11+3888]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 
0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f119, f122; +mul.f32 f126, f113, f122; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f120, f132; +mul.f32 f134, f114, f132; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r23], {f137, f136}; +fma.rn.f32 f138, f121, f113, f125; +sub.f32 f139, f127, f126; +st.shared.v2.f32 [r23+72], {f138, f139}; +fma.rn.f32 f140, f130, f114, f133; +sub.f32 f141, f135, f134; +st.shared.v2.f32 [r23+144], {f140, f141}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r11]; +ld.shared.v2.f32 {f146, f147}, [r11+1944]; +ld.shared.v2.f32 {f150, f151}, [r11+3888]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0fBF5DB3D7; +add.f32 f160, f159, f157; +sub.f32 f161, f157, f159; +mul.f32 f162, f155, 0f3F000000; +sub.f32 f163, f143, f162; +sub.f32 f164, f146, f150; +mul.f32 f165, f164, 0fBF5DB3D7; +sub.f32 f166, f163, f165; +add.f32 f167, f165, f163; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; 
+mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f168, f169}, [rd21]; +mul.f32 f172, f166, f169; +mul.f32 f173, f160, f169; +mul.f32 f174, f168, f166; +mul.f32 f175, f168, f168; +mul.f32 f176, f169, f169; +sub.f32 f177, f175, f176; +mul.f32 f178, f169, f168; +fma.rn.f32 f179, f169, f168, f178; +mul.f32 f180, f167, f179; +mul.f32 f181, f161, f179; +mul.f32 f182, f177, f167; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +add.f32 f183, f143, f155; +add.f32 f184, f142, f154; +st.shared.v2.f32 [r33], {f184, f183}; +fma.rn.f32 f185, f168, f160, f172; +sub.f32 f186, f174, f173; +st.shared.v2.f32 [r33+216], {f185, f186}; +fma.rn.f32 f187, f177, f161, f180; +sub.f32 f188, f182, f181; +st.shared.v2.f32 [r33+432], {f187, f188}; +barrier.sync 0; +ld.shared.v2.f32 {f189, f190}, [r11]; +ld.shared.v2.f32 {f193, f194}, [r11+1944]; +ld.shared.v2.f32 {f197, f198}, [r11+3888]; +add.f32 f201, f193, f197; +add.f32 f202, f194, f198; +mul.f32 f203, f201, 0f3F000000; +sub.f32 f204, f189, f203; +sub.f32 f205, f194, f198; +mul.f32 f206, f205, 0fBF5DB3D7; +add.f32 f207, f206, f204; +sub.f32 f208, f204, f206; +mul.f32 f209, f202, 0f3F000000; +sub.f32 f210, f190, f209; +sub.f32 f211, f193, f197; +mul.f32 f212, f211, 0fBF5DB3D7; +sub.f32 f213, f210, f212; +add.f32 f214, f212, f210; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f215, f216}, [rd26]; +mul.f32 f219, f213, f216; +mul.f32 f220, f207, f216; +mul.f32 f221, f215, f213; +mul.f32 f222, f215, f215; +mul.f32 f223, f216, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f216, f215; +fma.rn.f32 f226, f216, f215, f225; +mul.f32 f227, f214, f226; +mul.f32 f228, f208, f226; +mul.f32 
f229, f224, f214; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +add.f32 f230, f190, f202; +add.f32 f231, f189, f201; +st.shared.v2.f32 [r39], {f231, f230}; +fma.rn.f32 f232, f215, f207, f219; +sub.f32 f233, f221, f220; +st.shared.v2.f32 [r39+648], {f232, f233}; +fma.rn.f32 f234, f224, f208, f227; +sub.f32 f235, f229, f228; +st.shared.v2.f32 [r39+1296], {f234, f235}; +barrier.sync 0; +ld.shared.v2.f32 {f236, f237}, [r11]; +ld.shared.v2.f32 {f240, f241}, [r11+1944]; +ld.shared.v2.f32 {f244, f245}, [r11+3888]; +add.f32 f248, f240, f244; +add.f32 f249, f241, f245; +mul.f32 f250, f248, 0f3F000000; +sub.f32 f251, f236, f250; +sub.f32 f252, f241, f245; +mul.f32 f253, f252, 0fBF5DB3D7; +mul.f32 f254, f249, 0f3F000000; +sub.f32 f255, f237, f254; +sub.f32 f256, f240, f244; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 %1, f237, f249; +add.f32 %0, f236, f248; +sub.f32 %3, f255, f257; +add.f32 %2, f253, f251; +add.f32 %5, f257, f255; +sub.f32 %4, f251, f253; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<345, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<234>; +.reg .b32 r<40>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 2916, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %14, %17; +add.f32 f14, %12, f13; +add.f32 f15, %16, %18; +add.f32 f16, %13, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %12, f17; +sub.f32 f19, %16, %18; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %13, f23; +sub.f32 f25, %14, %17; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 
f28, f26, f24; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 2916, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; +sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f34; +st.shared.f32 [r9+8], f44; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+972]; +ld.shared.f32 f50, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+972]; +ld.shared.f32 f53, [r11+1944]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0fBF5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0fBF5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 2; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f68, f71; +fma.rn.f32 f75, f70, f62, f74; +mul.f32 f76, f62, f71; +mul.f32 f77, f70, f68; +sub.f32 f78, f77, f76; +mul.f32 f79, f70, f70; 
+mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f69, f83; +fma.rn.f32 f85, f81, f63, f84; +mul.f32 f86, f63, f83; +mul.f32 f87, f81, f69; +sub.f32 f88, f87, f86; +barrier.sync 0; +mad.lo.s32 r17, r12, 36, r16; +st.shared.f32 [r17], f55; +st.shared.f32 [r17+12], f75; +st.shared.f32 [r17+24], f85; +barrier.sync 0; +ld.shared.f32 f89, [r11]; +ld.shared.f32 f90, [r11+972]; +ld.shared.f32 f91, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r17], f57; +st.shared.f32 [r17+12], f78; +st.shared.f32 [r17+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r11]; +ld.shared.f32 f93, [r11+972]; +ld.shared.f32 f94, [r11+1944]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0fBF5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0fBF5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 2; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f109, f112; +fma.rn.f32 f116, f111, f103, f115; +mul.f32 f117, f103, f112; +mul.f32 f118, f111, f109; +sub.f32 f119, f118, f117; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f110, f124; +fma.rn.f32 f126, f122, f104, f125; +mul.f32 f127, f104, f124; +mul.f32 f128, f122, f110; +sub.f32 f129, f128, f127; +barrier.sync 0; +mad.lo.s32 r23, r18, 108, r22; +st.shared.f32 [r23], f96; +st.shared.f32 [r23+36], f116; +st.shared.f32 [r23+72], f126; +barrier.sync 0; 
+ld.shared.f32 f130, [r11]; +ld.shared.f32 f131, [r11+972]; +ld.shared.f32 f132, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r23], f98; +st.shared.f32 [r23+36], f119; +st.shared.f32 [r23+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r11]; +ld.shared.f32 f134, [r11+972]; +ld.shared.f32 f135, [r11+1944]; +add.f32 f136, f131, f132; +add.f32 f137, f130, f136; +add.f32 f138, f134, f135; +add.f32 f139, f133, f138; +mul.f32 f140, f136, 0f3F000000; +sub.f32 f141, f130, f140; +sub.f32 f142, f134, f135; +mul.f32 f143, f142, 0fBF5DB3D7; +add.f32 f144, f143, f141; +sub.f32 f145, f141, f143; +mul.f32 f146, f138, 0f3F000000; +sub.f32 f147, f133, f146; +sub.f32 f148, f131, f132; +mul.f32 f149, f148, 0fBF5DB3D7; +sub.f32 f150, f147, f149; +add.f32 f151, f149, f147; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 2; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 8; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f32 {f152, f153}, [rd21]; +mul.f32 f156, f150, f153; +fma.rn.f32 f157, f152, f144, f156; +mul.f32 f158, f144, f153; +mul.f32 f159, f152, f150; +sub.f32 f160, f159, f158; +mul.f32 f161, f152, f152; +mul.f32 f162, f153, f153; +sub.f32 f163, f161, f162; +mul.f32 f164, f153, f152; +fma.rn.f32 f165, f153, f152, f164; +mul.f32 f166, f151, f165; +fma.rn.f32 f167, f163, f145, f166; +mul.f32 f168, f145, f165; +mul.f32 f169, f163, f151; +sub.f32 f170, f169, f168; +barrier.sync 0; +mad.lo.s32 r33, r28, 324, r32; +st.shared.f32 [r33], f137; +st.shared.f32 [r33+108], f157; +st.shared.f32 [r33+216], f167; +barrier.sync 0; +ld.shared.f32 f171, [r11]; +ld.shared.f32 f172, [r11+972]; +ld.shared.f32 f173, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r33], f139; +st.shared.f32 [r33+108], f160; +st.shared.f32 [r33+216], f170; +barrier.sync 0; +ld.shared.f32 f174, [r11]; 
+ld.shared.f32 f175, [r11+972]; +ld.shared.f32 f176, [r11+1944]; +add.f32 f177, f172, f173; +add.f32 f178, f171, f177; +add.f32 f179, f175, f176; +add.f32 f180, f174, f179; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f171, f181; +sub.f32 f183, f175, f176; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +mul.f32 f187, f179, 0f3F000000; +sub.f32 f188, f174, f187; +sub.f32 f189, f172, f173; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 2; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 8; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f32 {f193, f194}, [rd26]; +mul.f32 f197, f191, f194; +fma.rn.f32 f198, f193, f185, f197; +mul.f32 f199, f185, f194; +mul.f32 f200, f193, f191; +sub.f32 f201, f200, f199; +mul.f32 f202, f193, f193; +mul.f32 f203, f194, f194; +sub.f32 f204, f202, f203; +mul.f32 f205, f194, f193; +fma.rn.f32 f206, f194, f193, f205; +mul.f32 f207, f192, f206; +fma.rn.f32 f208, f204, f186, f207; +mul.f32 f209, f186, f206; +mul.f32 f210, f204, f192; +sub.f32 f211, f210, f209; +barrier.sync 0; +mad.lo.s32 r39, r34, 972, r38; +st.shared.f32 [r39], f178; +st.shared.f32 [r39+324], f198; +st.shared.f32 [r39+648], f208; +barrier.sync 0; +ld.shared.f32 f212, [r11]; +ld.shared.f32 f213, [r11+972]; +ld.shared.f32 f214, [r11+1944]; +barrier.sync 0; +st.shared.f32 [r39], f180; +st.shared.f32 [r39+324], f201; +st.shared.f32 [r39+648], f211; +barrier.sync 0; +ld.shared.f32 f215, [r11]; +ld.shared.f32 f216, [r11+972]; +ld.shared.f32 f217, [r11+1944]; +add.f32 f218, f213, f214; +add.f32 f219, f216, f217; +mul.f32 f220, f218, 0f3F000000; +sub.f32 f221, f212, f220; +sub.f32 f222, f216, f217; +mul.f32 f223, f222, 0fBF5DB3D7; +mul.f32 f224, f219, 0f3F000000; +sub.f32 f225, f215, f224; +sub.f32 f226, f213, f214; +mul.f32 f227, f226, 
0fBF5DB3D7; +add.f32 %0, f212, f218; +add.f32 %1, f215, f219; +add.f32 %2, f223, f221; +sub.f32 %3, f225, f227; +sub.f32 %4, f221, f223; +add.f32 %5, f227, f225; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_729), "l"(lut_sp_3_243), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..c1d306fdbd16f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_fwd.hpp.inc @@ -0,0 +1,4806 @@ +#ifndef CUFFTDX_FFT_729_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_729_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<523, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<1850>; +.reg .b64 rd<9>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 5832, r17; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1841, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1840, %57, fd1841; +mul.f64 fd119, fd1841, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, 
fd129; +add.f64 fd1839, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1838, %63, fd1839; +mul.f64 fd135, fd1839, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1837, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1836, %69, fd1837; +mul.f64 fd151, fd1837, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd1835, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1835, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1833, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1834, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1833, fd1834; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1831, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1832, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1831, fd1832; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1829, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1830, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1829, fd1830; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1828, fd1838, fd1836; +sub.f64 fd183, fd1838, fd1836; +mul.f64 fd184, fd183, 
0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1827, fd1840, fd1828; +mul.f64 fd187, fd1828, 0d3FE0000000000000; +sub.f64 fd188, fd1840, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1826, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1825, fd123, fd1826; +mul.f64 fd203, fd1826, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1824, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1823, fd124, fd1824; +mul.f64 fd219, fd1824, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1820, %110, %111; +sub.f64 fd231, %110, %111; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1818, %112, fd1820; +mul.f64 fd235, fd1820, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 
0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1815, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1813, %115, fd1815; +mul.f64 fd251, fd1815, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1810, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1808, %118, fd1810; +mul.f64 fd267, fd1810, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1807, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1807, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1806, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1806, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1804, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1805, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1804, fd1805; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1802, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1803, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1802, fd1803; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1801, fd1813, fd1808; +sub.f64 
fd299, fd1813, fd1808; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1800, fd1818, fd1801; +mul.f64 fd303, fd1801, 0d3FE0000000000000; +sub.f64 fd304, fd1818, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1799, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1798, fd239, fd1799; +mul.f64 fd319, fd1799, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1797, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1796, fd240, fd1797; +mul.f64 fd335, fd1797, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1793, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1791, %121, fd1793; +mul.f64 fd351, fd1793, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, 
%66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1788, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1786, %124, fd1788; +mul.f64 fd367, fd1788, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1784, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1782, %126, fd1784; +mul.f64 fd383, fd1784, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1781, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1781, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1780, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1780, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1778, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1779, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1778, fd1779; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1776, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1777, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1776, fd1777; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 
fd1775, fd1786, fd1782; +sub.f64 fd415, fd1786, fd1782; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1774, fd1791, fd1775; +mul.f64 fd419, fd1775, 0d3FE0000000000000; +sub.f64 fd420, fd1791, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1773, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1772, fd355, fd1773; +mul.f64 fd435, fd1773, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1771, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1770, fd356, fd1771; +mul.f64 fd451, fd1771, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1798, 0dBFCD84D223638000; +mul.f64 fd1769, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1769, fd458; +mul.f64 fd460, fd1798, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1767, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1768, fd1772, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1767, fd1768; +mul.f64 fd465, fd1772, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1765, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1766, fd1796, 0dBFDCB920325BAFA6; 
+sub.f64 fd469, fd1765, fd1766; +mul.f64 fd470, fd1796, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1763, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1764, fd1770, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1763, fd1764; +mul.f64 fd475, fd1770, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1761, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1762, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1761, fd1762; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1760, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1760, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1759, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1759, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1758, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1758, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1757, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1757, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1756, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1756, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1755, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1755, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1753, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1754, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1753, fd1754; +mul.f64 fd515, fd424, 
0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1751, fd318, 0dBFADC528B5343A86; +mul.f64 fd1752, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1751, fd1752; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1749, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1750, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1749, fd1750; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1748, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1748, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1747, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1747, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1746, fd1800, fd1774; +sub.f64 fd543, fd1800, fd1774; +mul.f64 fd544, fd543, 0d3FEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1745, fd1827, fd1746; +mul.f64 fd547, fd1746, 0d3FE0000000000000; +sub.f64 fd548, fd1827, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0d3FEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1744, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0d3FEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, fd560; +add.f64 fd1743, fd1825, fd1744; +mul.f64 fd563, fd1744, 0d3FE0000000000000; +sub.f64 fd564, fd1825, fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 fd566, fd565, 0d3FEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, 
fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, fd573; +add.f64 fd1742, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1741, fd1823, fd1742; +mul.f64 fd579, fd1742, 0d3FE0000000000000; +sub.f64 fd580, fd1823, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0d3FEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1740, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1739, fd191, fd1740; +mul.f64 fd595, fd1740, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0d3FEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1738, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0d3FEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1737, fd207, fd1738; +mul.f64 fd611, fd1738, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0d3FEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 fd1736, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0d3FEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +add.f64 fd1735, fd223, fd1736; +mul.f64 fd627, fd1736, 0d3FE0000000000000; 
+sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0d3FEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1734, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0d3FEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1733, fd192, fd1734; +mul.f64 fd643, fd1734, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0d3FEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1732, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0d3FEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1731, fd208, fd1732; +mul.f64 fd659, fd1732, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0d3FEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1730, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0d3FEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1729, fd224, fd1730; +mul.f64 fd675, fd1730, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, 
r10; +mul.wide.u32 rd7, r11, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd1727, fd681, fd554; +mul.f64 fd1728, fd682, fd1743; +sub.f64 fd687, fd1727, fd1728; +mul.f64 fd688, fd681, fd1743; +fma.rn.f64 fd689, fd682, fd554, fd688; +mul.f64 fd1725, fd681, fd681; +mul.f64 fd1726, fd682, fd682; +sub.f64 fd692, fd1725, fd1726; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd1723, fd692, fd570; +mul.f64 fd1724, fd694, fd1741; +sub.f64 fd697, fd1723, fd1724; +mul.f64 fd698, fd692, fd1741; +fma.rn.f64 fd699, fd694, fd570, fd698; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1722, fd681, fd692; +sub.f64 fd702, fd1722, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd706, fd704, fd1739; +mul.f64 fd1721, fd702, fd586; +sub.f64 fd707, fd1721, fd706; +mul.f64 fd708, fd702, fd1739; +fma.rn.f64 fd709, fd704, fd586, fd708; +mul.f64 fd711, fd682, fd704; +mul.f64 fd1720, fd681, fd702; +sub.f64 fd712, fd1720, fd711; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd716, fd714, fd1737; +mul.f64 fd1719, fd712, fd602; +sub.f64 fd717, fd1719, fd716; +mul.f64 fd718, fd712, fd1737; +fma.rn.f64 fd719, fd714, fd602, fd718; +mul.f64 fd721, fd682, fd714; +mul.f64 fd1718, fd681, fd712; +sub.f64 fd722, fd1718, fd721; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd1716, fd722, fd618; +mul.f64 fd1717, fd724, fd1735; +sub.f64 fd727, fd1716, fd1717; +mul.f64 fd728, fd722, fd1735; +fma.rn.f64 fd729, fd724, fd618, fd728; +mul.f64 fd1714, fd681, fd722; +mul.f64 fd1715, fd682, fd724; +sub.f64 fd732, fd1714, fd1715; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd1712, fd732, fd634; +mul.f64 fd1713, fd734, fd1733; +sub.f64 fd737, fd1712, fd1713; +mul.f64 fd738, fd732, fd1733; +fma.rn.f64 fd739, fd734, fd634, fd738; +mul.f64 fd1710, fd681, fd732; +mul.f64 fd1711, fd682, fd734; +sub.f64 
fd742, fd1710, fd1711; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, fd743; +mul.f64 fd746, fd744, fd1731; +mul.f64 fd1709, fd742, fd650; +sub.f64 fd747, fd1709, fd746; +mul.f64 fd748, fd742, fd1731; +fma.rn.f64 fd749, fd744, fd650, fd748; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1708, fd681, fd742; +sub.f64 fd752, fd1708, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd756, fd754, fd1729; +mul.f64 fd1707, fd752, fd666; +sub.f64 fd757, fd1707, fd756; +mul.f64 fd758, fd752, fd1729; +fma.rn.f64 fd759, fd754, fd666, fd758; +mul.f64 fd761, fd682, fd754; +mul.f64 fd1706, fd681, fd752; +sub.f64 fd762, fd1706, fd761; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd766, fd764, fd551; +mul.f64 fd1705, fd762, fd545; +sub.f64 fd767, fd1705, fd766; +mul.f64 fd768, fd762, fd551; +fma.rn.f64 fd769, fd764, fd545, fd768; +mul.f64 fd1703, fd681, fd762; +mul.f64 fd1704, fd682, fd764; +sub.f64 fd772, fd1703, fd1704; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd1701, fd772, fd561; +mul.f64 fd1702, fd774, fd567; +sub.f64 fd777, fd1701, fd1702; +mul.f64 fd778, fd772, fd567; +fma.rn.f64 fd779, fd774, fd561, fd778; +mul.f64 fd1699, fd681, fd772; +mul.f64 fd1700, fd682, fd774; +sub.f64 fd782, fd1699, fd1700; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd1697, fd782, fd577; +mul.f64 fd1698, fd784, fd583; +sub.f64 fd787, fd1697, fd1698; +mul.f64 fd788, fd782, fd583; +fma.rn.f64 fd789, fd784, fd577, fd788; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1696, fd681, fd782; +sub.f64 fd792, fd1696, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd796, fd794, fd599; +mul.f64 fd1695, fd792, fd593; +sub.f64 fd797, fd1695, fd796; +mul.f64 fd798, fd792, fd599; +fma.rn.f64 fd799, fd794, fd593, fd798; +mul.f64 fd801, fd682, fd794; +mul.f64 fd1694, fd681, fd792; +sub.f64 fd802, fd1694, fd801; +mul.f64 fd803, 
fd681, fd794; +fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd806, fd804, fd615; +mul.f64 fd1693, fd802, fd609; +sub.f64 fd807, fd1693, fd806; +mul.f64 fd808, fd802, fd615; +fma.rn.f64 fd809, fd804, fd609, fd808; +ld.global.v2.f64 {fd810, fd811}, [rd6+432]; +mul.f64 fd815, fd811, fd631; +mul.f64 fd1692, fd810, fd625; +sub.f64 fd816, fd1692, fd815; +mul.f64 fd817, fd810, fd631; +fma.rn.f64 fd818, fd811, fd625, fd817; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1691, fd681, fd810; +sub.f64 fd821, fd1691, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd825, fd823, fd647; +mul.f64 fd1690, fd821, fd641; +sub.f64 fd826, fd1690, fd825; +mul.f64 fd827, fd821, fd647; +fma.rn.f64 fd828, fd823, fd641, fd827; +mul.f64 fd830, fd682, fd823; +mul.f64 fd1689, fd681, fd821; +sub.f64 fd831, fd1689, fd830; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd1687, fd831, fd657; +mul.f64 fd1688, fd833, fd663; +sub.f64 fd836, fd1687, fd1688; +mul.f64 fd837, fd831, fd663; +fma.rn.f64 fd838, fd833, fd657, fd837; +mul.f64 fd1685, fd681, fd831; +mul.f64 fd1686, fd682, fd833; +sub.f64 fd841, fd1685, fd1686; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd1683, fd841, fd673; +mul.f64 fd1684, fd843, fd679; +sub.f64 fd846, fd1683, fd1684; +mul.f64 fd847, fd841, fd679; +fma.rn.f64 fd848, fd843, fd673, fd847; +mul.f64 fd1681, fd681, fd841; +mul.f64 fd1682, fd682, fd843; +sub.f64 fd851, fd1681, fd1682; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd855, fd853, fd552; +mul.f64 fd1680, fd851, fd546; +sub.f64 fd856, fd1680, fd855; +mul.f64 fd857, fd851, fd552; +fma.rn.f64 fd858, fd853, fd546, fd857; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1679, fd681, fd851; +sub.f64 fd861, fd1679, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd865, fd863, fd568; +mul.f64 fd1678, fd861, fd562; +sub.f64 fd866, fd1678, fd865; +mul.f64 fd867, 
fd861, fd568; +fma.rn.f64 fd868, fd863, fd562, fd867; +mul.f64 fd870, fd682, fd863; +mul.f64 fd1677, fd681, fd861; +sub.f64 fd871, fd1677, fd870; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, fd861, fd872; +mul.f64 fd875, fd873, fd584; +mul.f64 fd1676, fd871, fd578; +sub.f64 fd876, fd1676, fd875; +mul.f64 fd877, fd871, fd584; +fma.rn.f64 fd878, fd873, fd578, fd877; +mul.f64 fd880, fd682, fd873; +mul.f64 fd1675, fd681, fd871; +sub.f64 fd881, fd1675, fd880; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd1673, fd881, fd594; +mul.f64 fd1674, fd883, fd600; +sub.f64 fd886, fd1673, fd1674; +mul.f64 fd887, fd881, fd600; +fma.rn.f64 fd888, fd883, fd594, fd887; +mul.f64 fd1671, fd681, fd881; +mul.f64 fd1672, fd682, fd883; +sub.f64 fd891, fd1671, fd1672; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd1669, fd891, fd610; +mul.f64 fd1670, fd893, fd616; +sub.f64 fd896, fd1669, fd1670; +mul.f64 fd897, fd891, fd616; +fma.rn.f64 fd898, fd893, fd610, fd897; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1668, fd681, fd891; +sub.f64 fd901, fd1668, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd905, fd903, fd632; +mul.f64 fd1667, fd901, fd626; +sub.f64 fd906, fd1667, fd905; +mul.f64 fd907, fd901, fd632; +fma.rn.f64 fd908, fd903, fd626, fd907; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1666, fd681, fd901; +sub.f64 fd911, fd1666, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd915, fd913, fd648; +mul.f64 fd1665, fd911, fd642; +sub.f64 fd916, fd1665, fd915; +mul.f64 fd917, fd911, fd648; +fma.rn.f64 fd918, fd913, fd642, fd917; +mul.f64 fd920, fd682, fd913; +mul.f64 fd1664, fd681, fd911; +sub.f64 fd921, fd1664, fd920; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd925, fd923, fd664; +mul.f64 fd1663, fd921, fd658; +sub.f64 fd926, fd1663, fd925; +mul.f64 fd927, fd921, fd664; +fma.rn.f64 fd928, fd923, fd658, 
fd927; +mul.f64 fd1661, fd681, fd921; +mul.f64 fd1662, fd682, fd923; +sub.f64 fd931, fd1661, fd1662; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd1659, fd931, fd674; +mul.f64 fd1660, fd933, fd680; +sub.f64 fd936, fd1659, fd1660; +mul.f64 fd937, fd931, fd680; +fma.rn.f64 fd938, fd933, fd674, fd937; +mad.lo.s32 r12, r9, 5832, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 216, r12; +st.shared.f64 [r13], fd538; +st.shared.f64 [r13+8], fd687; +st.shared.f64 [r13+16], fd697; +st.shared.f64 [r13+24], fd707; +st.shared.f64 [r13+32], fd717; +st.shared.f64 [r13+40], fd727; +st.shared.f64 [r13+48], fd737; +st.shared.f64 [r13+56], fd747; +st.shared.f64 [r13+64], fd757; +st.shared.f64 [r13+72], fd767; +st.shared.f64 [r13+80], fd777; +st.shared.f64 [r13+88], fd787; +st.shared.f64 [r13+96], fd797; +st.shared.f64 [r13+104], fd807; +st.shared.f64 [r13+112], fd816; +st.shared.f64 [r13+120], fd826; +st.shared.f64 [r13+128], fd836; +st.shared.f64 [r13+136], fd846; +st.shared.f64 [r13+144], fd856; +st.shared.f64 [r13+152], fd866; +st.shared.f64 [r13+160], fd876; +st.shared.f64 [r13+168], fd886; +st.shared.f64 [r13+176], fd896; +st.shared.f64 [r13+184], fd906; +st.shared.f64 [r13+192], fd916; +st.shared.f64 [r13+200], fd926; +st.shared.f64 [r13+208], fd936; +barrier.sync 0; +mad.lo.s32 r14, r11, -208, r13; +ld.shared.f64 fd939, [r14]; +ld.shared.f64 fd940, [r14+216]; +ld.shared.f64 fd941, [r14+432]; +ld.shared.f64 fd942, [r14+648]; +ld.shared.f64 fd943, [r14+864]; +ld.shared.f64 fd944, [r14+1080]; +ld.shared.f64 fd945, [r14+1296]; +ld.shared.f64 fd946, [r14+1512]; +ld.shared.f64 fd947, [r14+1728]; +ld.shared.f64 fd948, [r14+1944]; +ld.shared.f64 fd949, [r14+2160]; +ld.shared.f64 fd950, [r14+2376]; +ld.shared.f64 fd951, [r14+2592]; +ld.shared.f64 fd952, [r14+2808]; +ld.shared.f64 fd953, [r14+3024]; +ld.shared.f64 fd954, [r14+3240]; +ld.shared.f64 fd955, [r14+3456]; +ld.shared.f64 fd956, [r14+3672]; +ld.shared.f64 fd957, [r14+3888]; +ld.shared.f64 fd958, 
[r14+4104]; +ld.shared.f64 fd959, [r14+4320]; +ld.shared.f64 fd960, [r14+4536]; +ld.shared.f64 fd961, [r14+4752]; +ld.shared.f64 fd962, [r14+4968]; +ld.shared.f64 fd963, [r14+5184]; +ld.shared.f64 fd964, [r14+5400]; +ld.shared.f64 fd965, [r14+5616]; +barrier.sync 0; +st.shared.f64 [r13], fd1745; +st.shared.f64 [r13+8], fd689; +st.shared.f64 [r13+16], fd699; +st.shared.f64 [r13+24], fd709; +st.shared.f64 [r13+32], fd719; +st.shared.f64 [r13+40], fd729; +st.shared.f64 [r13+48], fd739; +st.shared.f64 [r13+56], fd749; +st.shared.f64 [r13+64], fd759; +st.shared.f64 [r13+72], fd769; +st.shared.f64 [r13+80], fd779; +st.shared.f64 [r13+88], fd789; +st.shared.f64 [r13+96], fd799; +st.shared.f64 [r13+104], fd809; +st.shared.f64 [r13+112], fd818; +st.shared.f64 [r13+120], fd828; +st.shared.f64 [r13+128], fd838; +st.shared.f64 [r13+136], fd848; +st.shared.f64 [r13+144], fd858; +st.shared.f64 [r13+152], fd868; +st.shared.f64 [r13+160], fd878; +st.shared.f64 [r13+168], fd888; +st.shared.f64 [r13+176], fd898; +st.shared.f64 [r13+184], fd908; +st.shared.f64 [r13+192], fd918; +st.shared.f64 [r13+200], fd928; +st.shared.f64 [r13+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r14]; +ld.shared.f64 fd967, [r14+216]; +ld.shared.f64 fd968, [r14+432]; +ld.shared.f64 fd969, [r14+648]; +ld.shared.f64 fd970, [r14+864]; +ld.shared.f64 fd971, [r14+1080]; +ld.shared.f64 fd972, [r14+1296]; +ld.shared.f64 fd973, [r14+1512]; +ld.shared.f64 fd974, [r14+1728]; +ld.shared.f64 fd975, [r14+1944]; +ld.shared.f64 fd976, [r14+2160]; +ld.shared.f64 fd977, [r14+2376]; +ld.shared.f64 fd978, [r14+2592]; +ld.shared.f64 fd979, [r14+2808]; +ld.shared.f64 fd980, [r14+3024]; +ld.shared.f64 fd981, [r14+3240]; +ld.shared.f64 fd982, [r14+3456]; +ld.shared.f64 fd983, [r14+3672]; +ld.shared.f64 fd984, [r14+3888]; +ld.shared.f64 fd985, [r14+4104]; +ld.shared.f64 fd986, [r14+4320]; +ld.shared.f64 fd987, [r14+4536]; +ld.shared.f64 fd988, [r14+4752]; +ld.shared.f64 fd989, [r14+4968]; +ld.shared.f64 fd990, [r14+5184]; 
+ld.shared.f64 fd991, [r14+5400]; +ld.shared.f64 fd992, [r14+5616]; +add.f64 fd993, fd948, fd957; +add.f64 fd994, fd939, fd993; +mul.f64 fd997, fd993, 0d3FE0000000000000; +sub.f64 fd998, fd939, fd997; +add.f64 fd1658, fd975, fd984; +sub.f64 fd999, fd975, fd984; +mul.f64 fd1000, fd999, 0d3FEBB67AE8584CAA; +add.f64 fd1001, fd1000, fd998; +sub.f64 fd1002, fd998, fd1000; +add.f64 fd1657, fd966, fd1658; +mul.f64 fd1003, fd1658, 0d3FE0000000000000; +sub.f64 fd1004, fd966, fd1003; +sub.f64 fd1005, fd948, fd957; +mul.f64 fd1006, fd1005, 0d3FEBB67AE8584CAA; +sub.f64 fd1007, fd1004, fd1006; +add.f64 fd1008, fd1006, fd1004; +add.f64 fd1009, fd951, fd960; +add.f64 fd1010, fd942, fd1009; +mul.f64 fd1013, fd1009, 0d3FE0000000000000; +sub.f64 fd1014, fd942, fd1013; +add.f64 fd1656, fd978, fd987; +sub.f64 fd1015, fd978, fd987; +mul.f64 fd1016, fd1015, 0d3FEBB67AE8584CAA; +add.f64 fd1017, fd1016, fd1014; +sub.f64 fd1018, fd1014, fd1016; +add.f64 fd1655, fd969, fd1656; +mul.f64 fd1019, fd1656, 0d3FE0000000000000; +sub.f64 fd1020, fd969, fd1019; +sub.f64 fd1021, fd951, fd960; +mul.f64 fd1022, fd1021, 0d3FEBB67AE8584CAA; +sub.f64 fd1023, fd1020, fd1022; +add.f64 fd1024, fd1022, fd1020; +add.f64 fd1025, fd954, fd963; +add.f64 fd1026, fd945, fd1025; +mul.f64 fd1029, fd1025, 0d3FE0000000000000; +sub.f64 fd1030, fd945, fd1029; +add.f64 fd1654, fd981, fd990; +sub.f64 fd1031, fd981, fd990; +mul.f64 fd1032, fd1031, 0d3FEBB67AE8584CAA; +add.f64 fd1033, fd1032, fd1030; +sub.f64 fd1034, fd1030, fd1032; +add.f64 fd1653, fd972, fd1654; +mul.f64 fd1035, fd1654, 0d3FE0000000000000; +sub.f64 fd1036, fd972, fd1035; +sub.f64 fd1037, fd954, fd963; +mul.f64 fd1038, fd1037, 0d3FEBB67AE8584CAA; +sub.f64 fd1039, fd1036, fd1038; +add.f64 fd1040, fd1038, fd1036; +mul.f64 fd1042, fd1023, 0dBFE491B7523C161D; +mul.f64 fd1652, fd1017, 0d3FE8836FA2CF5039; +sub.f64 fd1043, fd1652, fd1042; +mul.f64 fd1044, fd1023, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1045, fd1017, 0dBFE491B7523C161D, fd1044; +mul.f64 fd1047, fd1039, 
0dBFEF838B8C811C17; +mul.f64 fd1651, fd1033, 0d3FC63A1A7E0B738A; +sub.f64 fd1048, fd1651, fd1047; +mul.f64 fd1049, fd1039, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1050, fd1033, 0dBFEF838B8C811C17, fd1049; +mul.f64 fd1052, fd1024, 0dBFEF838B8C811C17; +mul.f64 fd1650, fd1018, 0d3FC63A1A7E0B738A; +sub.f64 fd1053, fd1650, fd1052; +mul.f64 fd1054, fd1024, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1055, fd1018, 0dBFEF838B8C811C17, fd1054; +mul.f64 fd1057, fd1040, 0dBFD5E3A8748A0BF5; +mul.f64 fd1649, fd1034, 0dBFEE11F642522D1C; +sub.f64 fd1058, fd1649, fd1057; +mul.f64 fd1059, fd1040, 0dBFEE11F642522D1C; +fma.rn.f64 fd1060, fd1034, 0dBFD5E3A8748A0BF5, fd1059; +add.f64 fd1061, fd1010, fd1026; +add.f64 fd1062, fd994, fd1061; +mul.f64 fd1065, fd1061, 0d3FE0000000000000; +sub.f64 fd1066, fd994, fd1065; +add.f64 fd1648, fd1655, fd1653; +sub.f64 fd1067, fd1655, fd1653; +mul.f64 fd1068, fd1067, 0d3FEBB67AE8584CAA; +add.f64 fd1069, fd1068, fd1066; +sub.f64 fd1070, fd1066, fd1068; +add.f64 fd1647, fd1657, fd1648; +mul.f64 fd1071, fd1648, 0d3FE0000000000000; +sub.f64 fd1072, fd1657, fd1071; +sub.f64 fd1073, fd1010, fd1026; +mul.f64 fd1074, fd1073, 0d3FEBB67AE8584CAA; +sub.f64 fd1075, fd1072, fd1074; +add.f64 fd1076, fd1074, fd1072; +add.f64 fd1077, fd1043, fd1048; +add.f64 fd1078, fd1001, fd1077; +mul.f64 fd1081, fd1077, 0d3FE0000000000000; +sub.f64 fd1082, fd1001, fd1081; +add.f64 fd1646, fd1045, fd1050; +sub.f64 fd1083, fd1045, fd1050; +mul.f64 fd1084, fd1083, 0d3FEBB67AE8584CAA; +add.f64 fd1085, fd1084, fd1082; +sub.f64 fd1086, fd1082, fd1084; +add.f64 fd1645, fd1007, fd1646; +mul.f64 fd1087, fd1646, 0d3FE0000000000000; +sub.f64 fd1088, fd1007, fd1087; +sub.f64 fd1089, fd1043, fd1048; +mul.f64 fd1090, fd1089, 0d3FEBB67AE8584CAA; +sub.f64 fd1091, fd1088, fd1090; +add.f64 fd1092, fd1090, fd1088; +add.f64 fd1093, fd1053, fd1058; +add.f64 fd1094, fd1002, fd1093; +mul.f64 fd1097, fd1093, 0d3FE0000000000000; +sub.f64 fd1098, fd1002, fd1097; +add.f64 fd1644, fd1055, fd1060; +sub.f64 fd1099, fd1055, 
fd1060; +mul.f64 fd1100, fd1099, 0d3FEBB67AE8584CAA; +add.f64 fd1101, fd1100, fd1098; +sub.f64 fd1102, fd1098, fd1100; +add.f64 fd1643, fd1008, fd1644; +mul.f64 fd1103, fd1644, 0d3FE0000000000000; +sub.f64 fd1104, fd1008, fd1103; +sub.f64 fd1105, fd1053, fd1058; +mul.f64 fd1106, fd1105, 0d3FEBB67AE8584CAA; +sub.f64 fd1107, fd1104, fd1106; +add.f64 fd1108, fd1106, fd1104; +add.f64 fd1109, fd949, fd958; +add.f64 fd1110, fd940, fd1109; +mul.f64 fd1113, fd1109, 0d3FE0000000000000; +sub.f64 fd1114, fd940, fd1113; +add.f64 fd1642, fd976, fd985; +sub.f64 fd1115, fd976, fd985; +mul.f64 fd1116, fd1115, 0d3FEBB67AE8584CAA; +add.f64 fd1117, fd1116, fd1114; +sub.f64 fd1118, fd1114, fd1116; +add.f64 fd1641, fd967, fd1642; +mul.f64 fd1119, fd1642, 0d3FE0000000000000; +sub.f64 fd1120, fd967, fd1119; +sub.f64 fd1121, fd949, fd958; +mul.f64 fd1122, fd1121, 0d3FEBB67AE8584CAA; +sub.f64 fd1123, fd1120, fd1122; +add.f64 fd1124, fd1122, fd1120; +add.f64 fd1125, fd952, fd961; +add.f64 fd1126, fd943, fd1125; +mul.f64 fd1129, fd1125, 0d3FE0000000000000; +sub.f64 fd1130, fd943, fd1129; +add.f64 fd1640, fd979, fd988; +sub.f64 fd1131, fd979, fd988; +mul.f64 fd1132, fd1131, 0d3FEBB67AE8584CAA; +add.f64 fd1133, fd1132, fd1130; +sub.f64 fd1134, fd1130, fd1132; +add.f64 fd1639, fd970, fd1640; +mul.f64 fd1135, fd1640, 0d3FE0000000000000; +sub.f64 fd1136, fd970, fd1135; +sub.f64 fd1137, fd952, fd961; +mul.f64 fd1138, fd1137, 0d3FEBB67AE8584CAA; +sub.f64 fd1139, fd1136, fd1138; +add.f64 fd1140, fd1138, fd1136; +add.f64 fd1141, fd955, fd964; +add.f64 fd1142, fd946, fd1141; +mul.f64 fd1145, fd1141, 0d3FE0000000000000; +sub.f64 fd1146, fd946, fd1145; +add.f64 fd1638, fd982, fd991; +sub.f64 fd1147, fd982, fd991; +mul.f64 fd1148, fd1147, 0d3FEBB67AE8584CAA; +add.f64 fd1149, fd1148, fd1146; +sub.f64 fd1150, fd1146, fd1148; +add.f64 fd1637, fd973, fd1638; +mul.f64 fd1151, fd1638, 0d3FE0000000000000; +sub.f64 fd1152, fd973, fd1151; +sub.f64 fd1153, fd955, fd964; +mul.f64 fd1154, fd1153, 0d3FEBB67AE8584CAA; 
+sub.f64 fd1155, fd1152, fd1154; +add.f64 fd1156, fd1154, fd1152; +mul.f64 fd1158, fd1139, 0dBFE491B7523C161D; +mul.f64 fd1636, fd1133, 0d3FE8836FA2CF5039; +sub.f64 fd1159, fd1636, fd1158; +mul.f64 fd1160, fd1139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1161, fd1133, 0dBFE491B7523C161D, fd1160; +mul.f64 fd1163, fd1155, 0dBFEF838B8C811C17; +mul.f64 fd1635, fd1149, 0d3FC63A1A7E0B738A; +sub.f64 fd1164, fd1635, fd1163; +mul.f64 fd1165, fd1155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1166, fd1149, 0dBFEF838B8C811C17, fd1165; +mul.f64 fd1168, fd1140, 0dBFEF838B8C811C17; +mul.f64 fd1634, fd1134, 0d3FC63A1A7E0B738A; +sub.f64 fd1169, fd1634, fd1168; +mul.f64 fd1170, fd1140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1171, fd1134, 0dBFEF838B8C811C17, fd1170; +mul.f64 fd1173, fd1156, 0dBFD5E3A8748A0BF5; +mul.f64 fd1633, fd1150, 0dBFEE11F642522D1C; +sub.f64 fd1174, fd1633, fd1173; +mul.f64 fd1175, fd1156, 0dBFEE11F642522D1C; +fma.rn.f64 fd1176, fd1150, 0dBFD5E3A8748A0BF5, fd1175; +add.f64 fd1177, fd1126, fd1142; +add.f64 fd1178, fd1110, fd1177; +mul.f64 fd1181, fd1177, 0d3FE0000000000000; +sub.f64 fd1182, fd1110, fd1181; +add.f64 fd1632, fd1639, fd1637; +sub.f64 fd1183, fd1639, fd1637; +mul.f64 fd1184, fd1183, 0d3FEBB67AE8584CAA; +add.f64 fd1185, fd1184, fd1182; +sub.f64 fd1186, fd1182, fd1184; +add.f64 fd1631, fd1641, fd1632; +mul.f64 fd1187, fd1632, 0d3FE0000000000000; +sub.f64 fd1188, fd1641, fd1187; +sub.f64 fd1189, fd1126, fd1142; +mul.f64 fd1190, fd1189, 0d3FEBB67AE8584CAA; +sub.f64 fd1191, fd1188, fd1190; +add.f64 fd1192, fd1190, fd1188; +add.f64 fd1193, fd1159, fd1164; +add.f64 fd1194, fd1117, fd1193; +mul.f64 fd1197, fd1193, 0d3FE0000000000000; +sub.f64 fd1198, fd1117, fd1197; +add.f64 fd1630, fd1161, fd1166; +sub.f64 fd1199, fd1161, fd1166; +mul.f64 fd1200, fd1199, 0d3FEBB67AE8584CAA; +add.f64 fd1201, fd1200, fd1198; +sub.f64 fd1202, fd1198, fd1200; +add.f64 fd1629, fd1123, fd1630; +mul.f64 fd1203, fd1630, 0d3FE0000000000000; +sub.f64 fd1204, fd1123, fd1203; +sub.f64 fd1205, fd1159, 
fd1164; +mul.f64 fd1206, fd1205, 0d3FEBB67AE8584CAA; +sub.f64 fd1207, fd1204, fd1206; +add.f64 fd1208, fd1206, fd1204; +add.f64 fd1209, fd1169, fd1174; +add.f64 fd1210, fd1118, fd1209; +mul.f64 fd1213, fd1209, 0d3FE0000000000000; +sub.f64 fd1214, fd1118, fd1213; +add.f64 fd1628, fd1171, fd1176; +sub.f64 fd1215, fd1171, fd1176; +mul.f64 fd1216, fd1215, 0d3FEBB67AE8584CAA; +add.f64 fd1217, fd1216, fd1214; +sub.f64 fd1218, fd1214, fd1216; +add.f64 fd1627, fd1124, fd1628; +mul.f64 fd1219, fd1628, 0d3FE0000000000000; +sub.f64 fd1220, fd1124, fd1219; +sub.f64 fd1221, fd1169, fd1174; +mul.f64 fd1222, fd1221, 0d3FEBB67AE8584CAA; +sub.f64 fd1223, fd1220, fd1222; +add.f64 fd1224, fd1222, fd1220; +add.f64 fd1225, fd950, fd959; +add.f64 fd1226, fd941, fd1225; +mul.f64 fd1229, fd1225, 0d3FE0000000000000; +sub.f64 fd1230, fd941, fd1229; +add.f64 fd1626, fd977, fd986; +sub.f64 fd1231, fd977, fd986; +mul.f64 fd1232, fd1231, 0d3FEBB67AE8584CAA; +add.f64 fd1233, fd1232, fd1230; +sub.f64 fd1234, fd1230, fd1232; +add.f64 fd1625, fd968, fd1626; +mul.f64 fd1235, fd1626, 0d3FE0000000000000; +sub.f64 fd1236, fd968, fd1235; +sub.f64 fd1237, fd950, fd959; +mul.f64 fd1238, fd1237, 0d3FEBB67AE8584CAA; +sub.f64 fd1239, fd1236, fd1238; +add.f64 fd1240, fd1238, fd1236; +add.f64 fd1241, fd953, fd962; +add.f64 fd1242, fd944, fd1241; +mul.f64 fd1245, fd1241, 0d3FE0000000000000; +sub.f64 fd1246, fd944, fd1245; +add.f64 fd1624, fd980, fd989; +sub.f64 fd1247, fd980, fd989; +mul.f64 fd1248, fd1247, 0d3FEBB67AE8584CAA; +add.f64 fd1249, fd1248, fd1246; +sub.f64 fd1250, fd1246, fd1248; +add.f64 fd1623, fd971, fd1624; +mul.f64 fd1251, fd1624, 0d3FE0000000000000; +sub.f64 fd1252, fd971, fd1251; +sub.f64 fd1253, fd953, fd962; +mul.f64 fd1254, fd1253, 0d3FEBB67AE8584CAA; +sub.f64 fd1255, fd1252, fd1254; +add.f64 fd1256, fd1254, fd1252; +add.f64 fd1257, fd956, fd965; +add.f64 fd1258, fd947, fd1257; +mul.f64 fd1261, fd1257, 0d3FE0000000000000; +sub.f64 fd1262, fd947, fd1261; +add.f64 fd1622, fd983, fd992; 
+sub.f64 fd1263, fd983, fd992; +mul.f64 fd1264, fd1263, 0d3FEBB67AE8584CAA; +add.f64 fd1265, fd1264, fd1262; +sub.f64 fd1266, fd1262, fd1264; +add.f64 fd1621, fd974, fd1622; +mul.f64 fd1267, fd1622, 0d3FE0000000000000; +sub.f64 fd1268, fd974, fd1267; +sub.f64 fd1269, fd956, fd965; +mul.f64 fd1270, fd1269, 0d3FEBB67AE8584CAA; +sub.f64 fd1271, fd1268, fd1270; +add.f64 fd1272, fd1270, fd1268; +mul.f64 fd1274, fd1255, 0dBFE491B7523C161D; +mul.f64 fd1620, fd1249, 0d3FE8836FA2CF5039; +sub.f64 fd1275, fd1620, fd1274; +mul.f64 fd1276, fd1255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1277, fd1249, 0dBFE491B7523C161D, fd1276; +mul.f64 fd1279, fd1271, 0dBFEF838B8C811C17; +mul.f64 fd1619, fd1265, 0d3FC63A1A7E0B738A; +sub.f64 fd1280, fd1619, fd1279; +mul.f64 fd1281, fd1271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1282, fd1265, 0dBFEF838B8C811C17, fd1281; +mul.f64 fd1284, fd1256, 0dBFEF838B8C811C17; +mul.f64 fd1618, fd1250, 0d3FC63A1A7E0B738A; +sub.f64 fd1285, fd1618, fd1284; +mul.f64 fd1286, fd1256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1287, fd1250, 0dBFEF838B8C811C17, fd1286; +mul.f64 fd1289, fd1272, 0dBFD5E3A8748A0BF5; +mul.f64 fd1617, fd1266, 0dBFEE11F642522D1C; +sub.f64 fd1290, fd1617, fd1289; +mul.f64 fd1291, fd1272, 0dBFEE11F642522D1C; +fma.rn.f64 fd1292, fd1266, 0dBFD5E3A8748A0BF5, fd1291; +add.f64 fd1293, fd1242, fd1258; +add.f64 fd1294, fd1226, fd1293; +mul.f64 fd1297, fd1293, 0d3FE0000000000000; +sub.f64 fd1298, fd1226, fd1297; +add.f64 fd1616, fd1623, fd1621; +sub.f64 fd1299, fd1623, fd1621; +mul.f64 fd1300, fd1299, 0d3FEBB67AE8584CAA; +add.f64 fd1301, fd1300, fd1298; +sub.f64 fd1302, fd1298, fd1300; +add.f64 fd1615, fd1625, fd1616; +mul.f64 fd1303, fd1616, 0d3FE0000000000000; +sub.f64 fd1304, fd1625, fd1303; +sub.f64 fd1305, fd1242, fd1258; +mul.f64 fd1306, fd1305, 0d3FEBB67AE8584CAA; +sub.f64 fd1307, fd1304, fd1306; +add.f64 fd1308, fd1306, fd1304; +add.f64 fd1309, fd1275, fd1280; +add.f64 fd1310, fd1233, fd1309; +mul.f64 fd1313, fd1309, 0d3FE0000000000000; +sub.f64 fd1314, 
fd1233, fd1313; +add.f64 fd1614, fd1277, fd1282; +sub.f64 fd1315, fd1277, fd1282; +mul.f64 fd1316, fd1315, 0d3FEBB67AE8584CAA; +add.f64 fd1317, fd1316, fd1314; +sub.f64 fd1318, fd1314, fd1316; +add.f64 fd1613, fd1239, fd1614; +mul.f64 fd1319, fd1614, 0d3FE0000000000000; +sub.f64 fd1320, fd1239, fd1319; +sub.f64 fd1321, fd1275, fd1280; +mul.f64 fd1322, fd1321, 0d3FEBB67AE8584CAA; +sub.f64 fd1323, fd1320, fd1322; +add.f64 fd1324, fd1322, fd1320; +add.f64 fd1325, fd1285, fd1290; +add.f64 fd1326, fd1234, fd1325; +mul.f64 fd1329, fd1325, 0d3FE0000000000000; +sub.f64 fd1330, fd1234, fd1329; +add.f64 fd1612, fd1287, fd1292; +sub.f64 fd1331, fd1287, fd1292; +mul.f64 fd1332, fd1331, 0d3FEBB67AE8584CAA; +add.f64 fd1333, fd1332, fd1330; +sub.f64 fd1334, fd1330, fd1332; +add.f64 fd1611, fd1240, fd1612; +mul.f64 fd1335, fd1612, 0d3FE0000000000000; +sub.f64 fd1336, fd1240, fd1335; +sub.f64 fd1337, fd1285, fd1290; +mul.f64 fd1338, fd1337, 0d3FEBB67AE8584CAA; +sub.f64 fd1339, fd1336, fd1338; +add.f64 fd1340, fd1338, fd1336; +mul.f64 fd1609, fd1194, 0d3FEF232EFF15C9E6; +mul.f64 fd1610, fd1629, 0dBFCD84D223638000; +sub.f64 fd1343, fd1609, fd1610; +mul.f64 fd1344, fd1629, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd1345, fd1194, 0dBFCD84D223638000, fd1344; +mul.f64 fd1607, fd1310, 0d3FEC98A37A9A7850; +mul.f64 fd1608, fd1613, 0dBFDCB920325BAFA6; +sub.f64 fd1348, fd1607, fd1608; +mul.f64 fd1349, fd1613, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1350, fd1310, 0dBFDCB920325BAFA6, fd1349; +mul.f64 fd1605, fd1210, 0d3FEC98A37A9A7850; +mul.f64 fd1606, fd1627, 0dBFDCB920325BAFA6; +sub.f64 fd1353, fd1605, fd1606; +mul.f64 fd1354, fd1627, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1355, fd1210, 0dBFDCB920325BAFA6, fd1354; +mul.f64 fd1357, fd1611, 0dBFE9AAFE4207DF5F; +mul.f64 fd1604, fd1326, 0d3FE31BEC55BC71BC; +sub.f64 fd1358, fd1604, fd1357; +mul.f64 fd1359, fd1611, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1360, fd1326, 0dBFE9AAFE4207DF5F, fd1359; +mul.f64 fd1362, fd1191, 0dBFE491B7523C161D; +mul.f64 fd1603, fd1185, 
0d3FE8836FA2CF5039; +sub.f64 fd1363, fd1603, fd1362; +mul.f64 fd1364, fd1191, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1365, fd1185, 0dBFE491B7523C161D, fd1364; +mul.f64 fd1367, fd1307, 0dBFEF838B8C811C17; +mul.f64 fd1602, fd1301, 0d3FC63A1A7E0B738A; +sub.f64 fd1368, fd1602, fd1367; +mul.f64 fd1369, fd1307, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1370, fd1301, 0dBFEF838B8C811C17, fd1369; +mul.f64 fd1372, fd1207, 0dBFE9AAFE4207DF5F; +mul.f64 fd1601, fd1201, 0d3FE31BEC55BC71BC; +sub.f64 fd1373, fd1601, fd1372; +mul.f64 fd1374, fd1207, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1375, fd1201, 0dBFE9AAFE4207DF5F, fd1374; +mul.f64 fd1377, fd1323, 0dBFEEA7D99F29CADE; +mul.f64 fd1600, fd1317, 0dBFD25AFBF23865BF; +sub.f64 fd1378, fd1600, fd1377; +mul.f64 fd1379, fd1323, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1380, fd1317, 0dBFEEA7D99F29CADE, fd1379; +mul.f64 fd1598, fd1217, 0d3FD9595EF26FB670; +mul.f64 fd1599, fd1223, 0dBFED6206BEB6C24B; +sub.f64 fd1383, fd1598, fd1599; +mul.f64 fd1384, fd1223, 0d3FD9595EF26FB670; +fma.rn.f64 fd1385, fd1217, 0dBFED6206BEB6C24B, fd1384; +mul.f64 fd1596, fd1333, 0dBFE5F5B105F99707; +mul.f64 fd1597, fd1339, 0dBFE746A51650EADE; +sub.f64 fd1388, fd1596, fd1597; +mul.f64 fd1389, fd1339, 0dBFE5F5B105F99707; +fma.rn.f64 fd1390, fd1333, 0dBFE746A51650EADE, fd1389; +mul.f64 fd1594, fd1186, 0d3FC63A1A7E0B738A; +mul.f64 fd1595, fd1192, 0dBFEF838B8C811C17; +sub.f64 fd1393, fd1594, fd1595; +mul.f64 fd1394, fd1192, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1395, fd1186, 0dBFEF838B8C811C17, fd1394; +mul.f64 fd1592, fd1302, 0dBFEE11F642522D1C; +mul.f64 fd1593, fd1308, 0dBFD5E3A8748A0BF5; +sub.f64 fd1398, fd1592, fd1593; +mul.f64 fd1399, fd1308, 0dBFEE11F642522D1C; +fma.rn.f64 fd1400, fd1302, 0dBFD5E3A8748A0BF5, fd1399; +mul.f64 fd1402, fd1208, 0dBFEFF223F3635CE3; +mul.f64 fd1591, fd1202, 0dBFADC528B5343A86; +sub.f64 fd1403, fd1591, fd1402; +mul.f64 fd1404, fd1208, 0dBFADC528B5343A86; +fma.rn.f64 fd1405, fd1202, 0dBFEFF223F3635CE3, fd1404; +mul.f64 fd1407, fd1324, 0d3FBDB843E577175E; +mul.f64 
fd1590, fd1318, 0dBFEFC89BCEF44CF4; +sub.f64 fd1408, fd1590, fd1407; +mul.f64 fd1409, fd1324, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd1410, fd1318, 0d3FBDB843E577175E, fd1409; +mul.f64 fd1412, fd1224, 0dBFEEA7D99F29CADE; +mul.f64 fd1589, fd1218, 0dBFD25AFBF23865BF; +sub.f64 fd1413, fd1589, fd1412; +mul.f64 fd1414, fd1224, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1415, fd1218, 0dBFEEA7D99F29CADE, fd1414; +mul.f64 fd1417, fd1340, 0d3FE19593DA358510; +mul.f64 fd1588, fd1334, 0dBFEABC50EF4734A7; +sub.f64 fd1418, fd1588, fd1417; +mul.f64 fd1419, fd1340, 0dBFEABC50EF4734A7; +fma.rn.f64 fd1420, fd1334, 0d3FE19593DA358510, fd1419; +add.f64 fd1421, fd1178, fd1294; +mul.f64 fd1423, fd1421, 0d3FE0000000000000; +sub.f64 fd1424, fd1062, fd1423; +add.f64 fd1587, fd1631, fd1615; +sub.f64 fd1425, fd1631, fd1615; +mul.f64 fd1426, fd1425, 0d3FEBB67AE8584CAA; +mul.f64 fd1427, fd1587, 0d3FE0000000000000; +sub.f64 fd1428, fd1647, fd1427; +sub.f64 fd1429, fd1178, fd1294; +mul.f64 fd1430, fd1429, 0d3FEBB67AE8584CAA; +add.f64 fd1431, fd1343, fd1348; +mul.f64 fd1433, fd1431, 0d3FE0000000000000; +sub.f64 fd1434, fd1078, fd1433; +add.f64 fd1586, fd1345, fd1350; +sub.f64 fd1435, fd1345, fd1350; +mul.f64 fd1436, fd1435, 0d3FEBB67AE8584CAA; +mul.f64 fd1437, fd1586, 0d3FE0000000000000; +sub.f64 fd1438, fd1645, fd1437; +sub.f64 fd1439, fd1343, fd1348; +mul.f64 fd1440, fd1439, 0d3FEBB67AE8584CAA; +add.f64 fd1441, fd1353, fd1358; +mul.f64 fd1443, fd1441, 0d3FE0000000000000; +sub.f64 fd1444, fd1094, fd1443; +add.f64 fd1585, fd1355, fd1360; +sub.f64 fd1445, fd1355, fd1360; +mul.f64 fd1446, fd1445, 0d3FEBB67AE8584CAA; +mul.f64 fd1447, fd1585, 0d3FE0000000000000; +sub.f64 fd1448, fd1643, fd1447; +sub.f64 fd1449, fd1353, fd1358; +mul.f64 fd1450, fd1449, 0d3FEBB67AE8584CAA; +add.f64 fd1451, fd1363, fd1368; +mul.f64 fd1453, fd1451, 0d3FE0000000000000; +sub.f64 fd1454, fd1069, fd1453; +add.f64 fd1584, fd1365, fd1370; +sub.f64 fd1455, fd1365, fd1370; +mul.f64 fd1456, fd1455, 0d3FEBB67AE8584CAA; +mul.f64 fd1457, fd1584, 
0d3FE0000000000000; +sub.f64 fd1458, fd1075, fd1457; +sub.f64 fd1459, fd1363, fd1368; +mul.f64 fd1460, fd1459, 0d3FEBB67AE8584CAA; +add.f64 fd1461, fd1373, fd1378; +mul.f64 fd1463, fd1461, 0d3FE0000000000000; +sub.f64 fd1464, fd1085, fd1463; +add.f64 fd1583, fd1375, fd1380; +sub.f64 fd1465, fd1375, fd1380; +mul.f64 fd1466, fd1465, 0d3FEBB67AE8584CAA; +mul.f64 fd1467, fd1583, 0d3FE0000000000000; +sub.f64 fd1468, fd1091, fd1467; +sub.f64 fd1469, fd1373, fd1378; +mul.f64 fd1470, fd1469, 0d3FEBB67AE8584CAA; +add.f64 fd1471, fd1383, fd1388; +mul.f64 fd1473, fd1471, 0d3FE0000000000000; +sub.f64 fd1474, fd1101, fd1473; +add.f64 fd1582, fd1385, fd1390; +sub.f64 fd1475, fd1385, fd1390; +mul.f64 fd1476, fd1475, 0d3FEBB67AE8584CAA; +mul.f64 fd1477, fd1582, 0d3FE0000000000000; +sub.f64 fd1478, fd1107, fd1477; +sub.f64 fd1479, fd1383, fd1388; +mul.f64 fd1480, fd1479, 0d3FEBB67AE8584CAA; +add.f64 fd1481, fd1393, fd1398; +mul.f64 fd1483, fd1481, 0d3FE0000000000000; +sub.f64 fd1484, fd1070, fd1483; +add.f64 fd1581, fd1395, fd1400; +sub.f64 fd1485, fd1395, fd1400; +mul.f64 fd1486, fd1485, 0d3FEBB67AE8584CAA; +mul.f64 fd1487, fd1581, 0d3FE0000000000000; +sub.f64 fd1488, fd1076, fd1487; +sub.f64 fd1489, fd1393, fd1398; +mul.f64 fd1490, fd1489, 0d3FEBB67AE8584CAA; +add.f64 fd1491, fd1403, fd1408; +mul.f64 fd1493, fd1491, 0d3FE0000000000000; +sub.f64 fd1494, fd1086, fd1493; +add.f64 fd1580, fd1405, fd1410; +sub.f64 fd1495, fd1405, fd1410; +mul.f64 fd1496, fd1495, 0d3FEBB67AE8584CAA; +mul.f64 fd1497, fd1580, 0d3FE0000000000000; +sub.f64 fd1498, fd1092, fd1497; +sub.f64 fd1499, fd1403, fd1408; +mul.f64 fd1500, fd1499, 0d3FEBB67AE8584CAA; +add.f64 fd1501, fd1413, fd1418; +mul.f64 fd1503, fd1501, 0d3FE0000000000000; +sub.f64 fd1504, fd1102, fd1503; +add.f64 fd1579, fd1415, fd1420; +sub.f64 fd1505, fd1415, fd1420; +mul.f64 fd1506, fd1505, 0d3FEBB67AE8584CAA; +mul.f64 fd1507, fd1579, 0d3FE0000000000000; +sub.f64 fd1508, fd1108, fd1507; +sub.f64 fd1509, fd1413, fd1418; +mul.f64 fd1843, 
fd1431, 0d3FE0000000000000; +sub.f64 fd1842, fd1078, fd1843; +mul.f64 fd1510, fd1509, 0d3FEBB67AE8584CAA; +add.f64 %0, fd1062, fd1421; +mul.f64 fd1845, fd1585, 0d3FE0000000000000; +sub.f64 fd1844, fd1643, fd1845; +add.f64 %1, fd1647, fd1587; +mul.f64 fd1847, fd1586, 0d3FE0000000000000; +sub.f64 fd1846, fd1645, fd1847; +mul.f64 fd1849, fd1501, 0d3FE0000000000000; +sub.f64 fd1848, fd1102, fd1849; +add.f64 %3, fd1645, fd1586; +add.f64 %2, fd1078, fd1431; +add.f64 %5, fd1643, fd1585; +add.f64 %4, fd1094, fd1441; +add.f64 %7, fd1075, fd1584; +add.f64 %6, fd1069, fd1451; +add.f64 %9, fd1091, fd1583; +add.f64 %8, fd1085, fd1461; +add.f64 %11, fd1107, fd1582; +add.f64 %10, fd1101, fd1471; +add.f64 %13, fd1076, fd1581; +add.f64 %12, fd1070, fd1481; +add.f64 %15, fd1092, fd1580; +add.f64 %14, fd1086, fd1491; +add.f64 %17, fd1108, fd1579; +add.f64 %16, fd1102, fd1501; +sub.f64 %19, fd1428, fd1430; +add.f64 %18, fd1426, fd1424; +sub.f64 %21, fd1846, fd1440; +add.f64 %20, fd1436, fd1842; +sub.f64 %23, fd1844, fd1450; +add.f64 %22, fd1446, fd1444; +sub.f64 %25, fd1458, fd1460; +add.f64 %24, fd1456, fd1454; +add.f64 %26, fd1466, fd1464; +sub.f64 %27, fd1468, fd1470; +add.f64 %28, fd1476, fd1474; +sub.f64 %29, fd1478, fd1480; +add.f64 %30, fd1486, fd1484; +sub.f64 %31, fd1488, fd1490; +sub.f64 %33, fd1498, fd1500; +add.f64 %32, fd1496, fd1494; +sub.f64 %35, fd1508, fd1510; +add.f64 %34, fd1506, fd1848; +sub.f64 %36, fd1424, fd1426; +add.f64 %37, fd1430, fd1428; +add.f64 %39, fd1440, fd1846; +sub.f64 %38, fd1842, fd1436; +add.f64 %41, fd1450, fd1844; +sub.f64 %40, fd1444, fd1446; +add.f64 %43, fd1460, fd1458; +sub.f64 %42, fd1454, fd1456; +add.f64 %45, fd1470, fd1468; +sub.f64 %44, fd1464, fd1466; +add.f64 %47, fd1480, fd1478; +sub.f64 %46, fd1474, fd1476; +add.f64 %49, fd1490, fd1488; +sub.f64 %48, fd1484, fd1486; +add.f64 %51, fd1500, fd1498; +sub.f64 %50, fd1494, fd1496; +add.f64 %53, fd1510, fd1508; +sub.f64 %52, fd1848, fd1506; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), 
"=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_729), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[10].y), "d"(rmem[19].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), 
"d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<524, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<1887>; +.reg .b64 rd<10>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 11664, r17; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1886, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1885, %57, fd1886; +mul.f64 fd119, fd1886, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1884, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1883, %63, fd1884; +mul.f64 fd135, fd1884, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1882, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1881, %69, fd1882; +mul.f64 fd151, fd1882, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 
0dBFE491B7523C161D; +mul.f64 fd1880, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1880, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1878, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1879, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1878, fd1879; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1876, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1877, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1876, fd1877; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1874, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1875, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1874, fd1875; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1873, fd1883, fd1881; +sub.f64 fd183, fd1883, fd1881; +mul.f64 fd184, fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1872, fd1885, fd1873; +mul.f64 fd187, fd1873, 0d3FE0000000000000; +sub.f64 fd188, fd1885, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1871, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1870, fd123, fd1871; +mul.f64 fd203, fd1871, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 
0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1869, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1868, fd124, fd1869; +mul.f64 fd219, fd1869, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1865, %110, %111; +sub.f64 fd231, %110, %111; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1863, %112, fd1865; +mul.f64 fd235, fd1865, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1860, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1858, %115, fd1860; +mul.f64 fd251, fd1860, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1855, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1853, %118, fd1855; +mul.f64 fd267, fd1855, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, 
fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1852, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1852, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1851, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1851, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1849, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1850, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1849, fd1850; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1847, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1848, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1847, fd1848; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1846, fd1858, fd1853; +sub.f64 fd299, fd1858, fd1853; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1845, fd1863, fd1846; +mul.f64 fd303, fd1846, 0d3FE0000000000000; +sub.f64 fd304, fd1863, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1844, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1843, fd239, fd1844; +mul.f64 fd319, fd1844, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, 
fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1842, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1841, fd240, fd1842; +mul.f64 fd335, fd1842, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1838, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1836, %121, fd1838; +mul.f64 fd351, fd1838, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1833, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1831, %124, fd1833; +mul.f64 fd367, fd1833, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1829, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1827, %126, fd1829; +mul.f64 fd383, fd1829, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 
fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1826, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1826, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1825, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1825, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1823, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1824, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1823, fd1824; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1821, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1822, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1821, fd1822; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1820, fd1831, fd1827; +sub.f64 fd415, fd1831, fd1827; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1819, fd1836, fd1820; +mul.f64 fd419, fd1820, 0d3FE0000000000000; +sub.f64 fd420, fd1836, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1818, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1817, fd355, fd1818; +mul.f64 fd435, fd1818, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 
fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1816, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1815, fd356, fd1816; +mul.f64 fd451, fd1816, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1843, 0dBFCD84D223638000; +mul.f64 fd1814, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1814, fd458; +mul.f64 fd460, fd1843, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1812, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1813, fd1817, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1812, fd1813; +mul.f64 fd465, fd1817, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1810, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1811, fd1841, 0dBFDCB920325BAFA6; +sub.f64 fd469, fd1810, fd1811; +mul.f64 fd470, fd1841, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1808, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1809, fd1815, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1808, fd1809; +mul.f64 fd475, fd1815, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1806, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1807, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1806, fd1807; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1805, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1805, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1804, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1804, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 
fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1803, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1803, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1802, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1802, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1801, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1801, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1800, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1800, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1798, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1799, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1798, fd1799; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1796, fd318, 0dBFADC528B5343A86; +mul.f64 fd1797, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1796, fd1797; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1794, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1795, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1794, fd1795; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1793, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1793, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1792, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1792, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, 
fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1791, fd1845, fd1819; +sub.f64 fd541, fd1845, fd1819; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1791, 0d3FE0000000000000; +sub.f64 fd546, fd1872, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0d3FEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1790, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1789, fd1870, fd1790; +mul.f64 fd561, fd1790, 0d3FE0000000000000; +sub.f64 fd562, fd1870, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0d3FEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1788, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1787, fd1868, fd1788; +mul.f64 fd577, fd1788, 0d3FE0000000000000; +sub.f64 fd578, fd1868, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0d3FEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1786, fd481, fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 0d3FEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1785, fd191, fd1786; +mul.f64 fd593, fd1786, 0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 
0d3FEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1784, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0d3FEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1783, fd207, fd1784; +mul.f64 fd609, fd1784, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0d3FEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1782, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0d3FEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1781, fd223, fd1782; +mul.f64 fd625, fd1782, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0d3FEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1780, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0d3FEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1779, fd192, fd1780; +mul.f64 fd641, fd1780, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 fd643, fd509, fd514; +mul.f64 fd644, fd643, 0d3FEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1778, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 fd654, fd653, 0d3FEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 
fd1777, fd208, fd1778; +mul.f64 fd657, fd1778, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 0d3FEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1776, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0d3FEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1775, fd224, fd1776; +mul.f64 fd673, fd1776, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0d3FEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mad.lo.s32 r12, r9, 11664, r3; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r11, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd684, fd680, fd1789; +mul.f64 fd685, fd679, fd1789; +mul.f64 fd687, fd680, fd680; +mul.f64 fd1774, fd679, fd679; +sub.f64 fd688, fd1774, fd687; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd692, fd690, fd1787; +mul.f64 fd693, fd688, fd1787; +mul.f64 fd1772, fd679, fd688; +mul.f64 fd1773, fd680, fd690; +sub.f64 fd696, fd1772, fd1773; +mul.f64 fd1771, fd688, fd568; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd700, fd698, fd1785; +mul.f64 fd701, fd696, fd1785; +mul.f64 fd703, fd680, fd698; +mul.f64 fd1770, fd679, fd696; +sub.f64 fd704, fd1770, fd703; +mul.f64 fd1769, fd696, fd584; +mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd708, fd706, fd1783; +mul.f64 fd709, fd704, fd1783; +mul.f64 fd1767, fd679, fd704; +mul.f64 fd1768, fd680, 
fd706; +sub.f64 fd712, fd1767, fd1768; +mul.f64 fd1766, fd704, fd600; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd716, fd714, fd1781; +mul.f64 fd717, fd712, fd1781; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1765, fd679, fd712; +sub.f64 fd720, fd1765, fd719; +mul.f64 fd1764, fd712, fd616; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd724, fd722, fd1779; +mul.f64 fd725, fd720, fd1779; +mul.f64 fd727, fd680, fd722; +mul.f64 fd1763, fd679, fd720; +sub.f64 fd728, fd1763, fd727; +mul.f64 fd1762, fd720, fd632; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd732, fd730, fd1777; +mul.f64 fd733, fd728, fd1777; +mul.f64 fd1760, fd679, fd728; +mul.f64 fd1761, fd680, fd730; +sub.f64 fd736, fd1760, fd1761; +mul.f64 fd1759, fd728, fd648; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd740, fd738, fd1775; +mul.f64 fd741, fd736, fd1775; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1758, fd679, fd736; +sub.f64 fd744, fd1758, fd743; +mul.f64 fd1757, fd736, fd664; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd748, fd746, fd549; +mul.f64 fd749, fd744, fd549; +mul.f64 fd751, fd680, fd746; +mul.f64 fd1756, fd679, fd744; +sub.f64 fd752, fd1756, fd751; +mul.f64 fd1755, fd744, fd543; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd756, fd754, fd565; +mul.f64 fd757, fd752, fd565; +mul.f64 fd1753, fd679, fd752; +mul.f64 fd1754, fd680, fd754; +sub.f64 fd760, fd1753, fd1754; +mul.f64 fd1752, fd752, fd559; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, fd752, fd761; +mul.f64 fd764, fd762, fd581; +mul.f64 fd765, fd760, fd581; +mul.f64 fd767, fd680, fd762; +mul.f64 fd1751, fd679, fd760; +sub.f64 fd768, fd1751, fd767; +mul.f64 fd1750, fd760, fd575; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd772, fd770, fd597; +mul.f64 fd773, fd768, fd597; +mul.f64 fd1748, 
fd679, fd768; +mul.f64 fd1749, fd680, fd770; +sub.f64 fd776, fd1748, fd1749; +mul.f64 fd1747, fd768, fd591; +mul.f64 fd777, fd679, fd770; +fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd776, fd607; +mul.f64 fd780, fd778, fd613; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+432]; +mul.f64 fd787, fd783, fd629; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1746, fd679, fd782; +sub.f64 fd791, fd1746, fd790; +mul.f64 fd1745, fd782, fd623; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd795, fd793, fd645; +mul.f64 fd796, fd791, fd645; +mul.f64 fd798, fd680, fd793; +mul.f64 fd1744, fd679, fd791; +sub.f64 fd799, fd1744, fd798; +mul.f64 fd1743, fd791, fd639; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd803, fd801, fd661; +mul.f64 fd804, fd799, fd661; +mul.f64 fd1741, fd679, fd799; +mul.f64 fd1742, fd680, fd801; +sub.f64 fd807, fd1741, fd1742; +mul.f64 fd1740, fd799, fd655; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd811, fd809, fd677; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1739, fd679, fd807; +sub.f64 fd815, fd1739, fd814; +mul.f64 fd1738, fd807, fd671; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd819, fd817, fd550; +mul.f64 fd820, fd815, fd550; +mul.f64 fd822, fd680, fd817; +mul.f64 fd1737, fd679, fd815; +sub.f64 fd823, fd1737, fd822; +mul.f64 fd1736, fd815, fd544; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd827, fd825, fd566; +mul.f64 fd828, fd823, fd566; +mul.f64 fd1734, fd679, fd823; +mul.f64 fd1735, fd680, fd825; +sub.f64 fd831, fd1734, fd1735; +mul.f64 fd1733, fd823, fd560; +mul.f64 fd832, fd679, fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; +mul.f64 fd835, fd833, fd582; +mul.f64 fd836, fd831, fd582; +mul.f64 fd838, fd680, fd833; +mul.f64 fd1732, fd679, fd831; +sub.f64 fd839, fd1732, fd838; +mul.f64 
fd1731, fd831, fd576; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd843, fd841, fd598; +mul.f64 fd844, fd839, fd598; +mul.f64 fd1729, fd679, fd839; +mul.f64 fd1730, fd680, fd841; +sub.f64 fd847, fd1729, fd1730; +mul.f64 fd1728, fd839, fd592; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd851, fd849, fd614; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1727, fd679, fd847; +sub.f64 fd855, fd1727, fd854; +mul.f64 fd1726, fd847, fd608; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd859, fd857, fd630; +mul.f64 fd860, fd855, fd630; +mul.f64 fd862, fd680, fd857; +mul.f64 fd1725, fd679, fd855; +sub.f64 fd863, fd1725, fd862; +mul.f64 fd1724, fd855, fd624; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd867, fd865, fd646; +mul.f64 fd868, fd863, fd646; +mul.f64 fd1722, fd679, fd863; +mul.f64 fd1723, fd680, fd865; +sub.f64 fd871, fd1722, fd1723; +mul.f64 fd1721, fd863, fd640; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd875, fd873, fd662; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1720, fd679, fd871; +sub.f64 fd879, fd1720, fd878; +mul.f64 fd1719, fd679, fd552; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1718, fd871, fd656; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd879, fd672; +mul.f64 fd883, fd881, fd678; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r13, r11, 432, r12; +add.f64 fd885, fd1872, fd1791; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r13], {fd886, fd885}; +fma.rn.f64 fd887, fd680, fd552, fd685; +sub.f64 fd888, fd1719, fd684; +st.shared.v2.f64 [r13+16], {fd888, fd887}; +fma.rn.f64 fd889, fd690, fd568, fd693; +sub.f64 fd890, fd1771, fd692; +st.shared.v2.f64 [r13+32], {fd890, fd889}; +fma.rn.f64 fd891, fd698, fd584, fd701; +sub.f64 fd892, fd1769, fd700; +st.shared.v2.f64 [r13+48], {fd892, fd891}; +fma.rn.f64 fd893, 
fd706, fd600, fd709; +sub.f64 fd894, fd1766, fd708; +st.shared.v2.f64 [r13+64], {fd894, fd893}; +fma.rn.f64 fd895, fd714, fd616, fd717; +sub.f64 fd896, fd1764, fd716; +st.shared.v2.f64 [r13+80], {fd896, fd895}; +fma.rn.f64 fd897, fd722, fd632, fd725; +sub.f64 fd898, fd1762, fd724; +st.shared.v2.f64 [r13+96], {fd898, fd897}; +sub.f64 fd899, fd1759, fd732; +fma.rn.f64 fd900, fd730, fd648, fd733; +st.shared.v2.f64 [r13+112], {fd899, fd900}; +fma.rn.f64 fd901, fd738, fd664, fd741; +sub.f64 fd902, fd1757, fd740; +st.shared.v2.f64 [r13+128], {fd902, fd901}; +fma.rn.f64 fd903, fd746, fd543, fd749; +sub.f64 fd904, fd1755, fd748; +st.shared.v2.f64 [r13+144], {fd904, fd903}; +fma.rn.f64 fd905, fd754, fd559, fd757; +sub.f64 fd906, fd1752, fd756; +st.shared.v2.f64 [r13+160], {fd906, fd905}; +fma.rn.f64 fd907, fd762, fd575, fd765; +sub.f64 fd908, fd1750, fd764; +st.shared.v2.f64 [r13+176], {fd908, fd907}; +fma.rn.f64 fd909, fd770, fd591, fd773; +sub.f64 fd910, fd1747, fd772; +st.shared.v2.f64 [r13+192], {fd910, fd909}; +fma.rn.f64 fd911, fd778, fd607, fd781; +sub.f64 fd912, fd779, fd780; +st.shared.v2.f64 [r13+208], {fd912, fd911}; +fma.rn.f64 fd913, fd783, fd623, fd788; +sub.f64 fd914, fd1745, fd787; +st.shared.v2.f64 [r13+224], {fd914, fd913}; +fma.rn.f64 fd915, fd793, fd639, fd796; +sub.f64 fd916, fd1743, fd795; +st.shared.v2.f64 [r13+240], {fd916, fd915}; +fma.rn.f64 fd917, fd801, fd655, fd804; +sub.f64 fd918, fd1740, fd803; +st.shared.v2.f64 [r13+256], {fd918, fd917}; +fma.rn.f64 fd919, fd809, fd671, fd812; +sub.f64 fd920, fd1738, fd811; +st.shared.v2.f64 [r13+272], {fd920, fd919}; +fma.rn.f64 fd921, fd817, fd544, fd820; +sub.f64 fd922, fd1736, fd819; +st.shared.v2.f64 [r13+288], {fd922, fd921}; +fma.rn.f64 fd923, fd825, fd560, fd828; +sub.f64 fd924, fd1733, fd827; +st.shared.v2.f64 [r13+304], {fd924, fd923}; +sub.f64 fd925, fd1731, fd835; +fma.rn.f64 fd926, fd833, fd576, fd836; +st.shared.v2.f64 [r13+320], {fd925, fd926}; +fma.rn.f64 fd927, fd841, fd592, fd844; +sub.f64 
fd928, fd1728, fd843; +st.shared.v2.f64 [r13+336], {fd928, fd927}; +fma.rn.f64 fd929, fd849, fd608, fd852; +sub.f64 fd930, fd1726, fd851; +st.shared.v2.f64 [r13+352], {fd930, fd929}; +fma.rn.f64 fd931, fd857, fd624, fd860; +sub.f64 fd932, fd1724, fd859; +st.shared.v2.f64 [r13+368], {fd932, fd931}; +fma.rn.f64 fd933, fd865, fd640, fd868; +sub.f64 fd934, fd1721, fd867; +st.shared.v2.f64 [r13+384], {fd934, fd933}; +fma.rn.f64 fd935, fd873, fd656, fd876; +sub.f64 fd936, fd1718, fd875; +st.shared.v2.f64 [r13+400], {fd936, fd935}; +fma.rn.f64 fd937, fd881, fd672, fd884; +sub.f64 fd938, fd882, fd883; +st.shared.v2.f64 [r13+416], {fd938, fd937}; +barrier.sync 0; +mad.lo.s32 r14, r11, -416, r13; +ld.shared.v2.f64 {fd939, fd940}, [r14]; +ld.shared.v2.f64 {fd943, fd944}, [r14+432]; +ld.shared.v2.f64 {fd947, fd948}, [r14+864]; +ld.shared.v2.f64 {fd951, fd952}, [r14+1296]; +ld.shared.v2.f64 {fd955, fd956}, [r14+1728]; +ld.shared.v2.f64 {fd959, fd960}, [r14+2160]; +ld.shared.v2.f64 {fd963, fd964}, [r14+2592]; +ld.shared.v2.f64 {fd967, fd968}, [r14+3024]; +ld.shared.v2.f64 {fd971, fd972}, [r14+3456]; +ld.shared.v2.f64 {fd975, fd976}, [r14+3888]; +ld.shared.v2.f64 {fd979, fd980}, [r14+4320]; +ld.shared.v2.f64 {fd983, fd984}, [r14+4752]; +ld.shared.v2.f64 {fd987, fd988}, [r14+5184]; +ld.shared.v2.f64 {fd991, fd992}, [r14+5616]; +ld.shared.v2.f64 {fd995, fd996}, [r14+6048]; +ld.shared.v2.f64 {fd999, fd1000}, [r14+6480]; +ld.shared.v2.f64 {fd1003, fd1004}, [r14+6912]; +ld.shared.v2.f64 {fd1007, fd1008}, [r14+7344]; +ld.shared.v2.f64 {fd1011, fd1012}, [r14+7776]; +ld.shared.v2.f64 {fd1015, fd1016}, [r14+8208]; +ld.shared.v2.f64 {fd1019, fd1020}, [r14+8640]; +ld.shared.v2.f64 {fd1023, fd1024}, [r14+9072]; +ld.shared.v2.f64 {fd1027, fd1028}, [r14+9504]; +ld.shared.v2.f64 {fd1031, fd1032}, [r14+9936]; +ld.shared.v2.f64 {fd1035, fd1036}, [r14+10368]; +ld.shared.v2.f64 {fd1039, fd1040}, [r14+10800]; +ld.shared.v2.f64 {fd1043, fd1044}, [r14+11232]; +add.f64 fd1047, fd975, fd1011; +add.f64 
fd1048, fd939, fd1047; +mul.f64 fd1051, fd1047, 0d3FE0000000000000; +sub.f64 fd1052, fd939, fd1051; +add.f64 fd1717, fd976, fd1012; +sub.f64 fd1053, fd976, fd1012; +mul.f64 fd1054, fd1053, 0d3FEBB67AE8584CAA; +add.f64 fd1055, fd1054, fd1052; +sub.f64 fd1056, fd1052, fd1054; +add.f64 fd1716, fd940, fd1717; +mul.f64 fd1057, fd1717, 0d3FE0000000000000; +sub.f64 fd1058, fd940, fd1057; +sub.f64 fd1059, fd975, fd1011; +mul.f64 fd1060, fd1059, 0d3FEBB67AE8584CAA; +sub.f64 fd1061, fd1058, fd1060; +add.f64 fd1062, fd1060, fd1058; +add.f64 fd1063, fd987, fd1023; +add.f64 fd1064, fd951, fd1063; +mul.f64 fd1067, fd1063, 0d3FE0000000000000; +sub.f64 fd1068, fd951, fd1067; +add.f64 fd1715, fd988, fd1024; +sub.f64 fd1069, fd988, fd1024; +mul.f64 fd1070, fd1069, 0d3FEBB67AE8584CAA; +add.f64 fd1071, fd1070, fd1068; +sub.f64 fd1072, fd1068, fd1070; +add.f64 fd1714, fd952, fd1715; +mul.f64 fd1073, fd1715, 0d3FE0000000000000; +sub.f64 fd1074, fd952, fd1073; +sub.f64 fd1075, fd987, fd1023; +mul.f64 fd1076, fd1075, 0d3FEBB67AE8584CAA; +sub.f64 fd1077, fd1074, fd1076; +add.f64 fd1078, fd1076, fd1074; +add.f64 fd1079, fd999, fd1035; +add.f64 fd1080, fd963, fd1079; +mul.f64 fd1083, fd1079, 0d3FE0000000000000; +sub.f64 fd1084, fd963, fd1083; +add.f64 fd1713, fd1000, fd1036; +sub.f64 fd1085, fd1000, fd1036; +mul.f64 fd1086, fd1085, 0d3FEBB67AE8584CAA; +add.f64 fd1087, fd1086, fd1084; +sub.f64 fd1088, fd1084, fd1086; +add.f64 fd1712, fd964, fd1713; +mul.f64 fd1089, fd1713, 0d3FE0000000000000; +sub.f64 fd1090, fd964, fd1089; +sub.f64 fd1091, fd999, fd1035; +mul.f64 fd1092, fd1091, 0d3FEBB67AE8584CAA; +sub.f64 fd1093, fd1090, fd1092; +add.f64 fd1094, fd1092, fd1090; +mul.f64 fd1710, fd1071, 0d3FE8836FA2CF5039; +mul.f64 fd1711, fd1077, 0dBFE491B7523C161D; +sub.f64 fd1097, fd1710, fd1711; +mul.f64 fd1098, fd1077, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1099, fd1071, 0dBFE491B7523C161D, fd1098; +mul.f64 fd1101, fd1093, 0dBFEF838B8C811C17; +mul.f64 fd1709, fd1087, 0d3FC63A1A7E0B738A; +sub.f64 fd1102, 
fd1709, fd1101; +mul.f64 fd1103, fd1093, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1104, fd1087, 0dBFEF838B8C811C17, fd1103; +mul.f64 fd1106, fd1078, 0dBFEF838B8C811C17; +mul.f64 fd1708, fd1072, 0d3FC63A1A7E0B738A; +sub.f64 fd1107, fd1708, fd1106; +mul.f64 fd1108, fd1078, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1109, fd1072, 0dBFEF838B8C811C17, fd1108; +mul.f64 fd1111, fd1094, 0dBFD5E3A8748A0BF5; +mul.f64 fd1707, fd1088, 0dBFEE11F642522D1C; +sub.f64 fd1112, fd1707, fd1111; +mul.f64 fd1113, fd1094, 0dBFEE11F642522D1C; +fma.rn.f64 fd1114, fd1088, 0dBFD5E3A8748A0BF5, fd1113; +add.f64 fd1115, fd1064, fd1080; +add.f64 fd1116, fd1048, fd1115; +mul.f64 fd1119, fd1115, 0d3FE0000000000000; +sub.f64 fd1120, fd1048, fd1119; +add.f64 fd1706, fd1714, fd1712; +sub.f64 fd1121, fd1714, fd1712; +mul.f64 fd1122, fd1121, 0d3FEBB67AE8584CAA; +add.f64 fd1123, fd1122, fd1120; +sub.f64 fd1124, fd1120, fd1122; +add.f64 fd1705, fd1716, fd1706; +mul.f64 fd1125, fd1706, 0d3FE0000000000000; +sub.f64 fd1126, fd1716, fd1125; +sub.f64 fd1127, fd1064, fd1080; +mul.f64 fd1128, fd1127, 0d3FEBB67AE8584CAA; +sub.f64 fd1129, fd1126, fd1128; +add.f64 fd1130, fd1128, fd1126; +add.f64 fd1131, fd1097, fd1102; +add.f64 fd1132, fd1055, fd1131; +mul.f64 fd1135, fd1131, 0d3FE0000000000000; +sub.f64 fd1136, fd1055, fd1135; +add.f64 fd1704, fd1099, fd1104; +sub.f64 fd1137, fd1099, fd1104; +mul.f64 fd1138, fd1137, 0d3FEBB67AE8584CAA; +add.f64 fd1139, fd1138, fd1136; +sub.f64 fd1140, fd1136, fd1138; +add.f64 fd1703, fd1061, fd1704; +mul.f64 fd1141, fd1704, 0d3FE0000000000000; +sub.f64 fd1142, fd1061, fd1141; +sub.f64 fd1143, fd1097, fd1102; +mul.f64 fd1144, fd1143, 0d3FEBB67AE8584CAA; +sub.f64 fd1145, fd1142, fd1144; +add.f64 fd1146, fd1144, fd1142; +add.f64 fd1147, fd1107, fd1112; +add.f64 fd1148, fd1056, fd1147; +mul.f64 fd1151, fd1147, 0d3FE0000000000000; +sub.f64 fd1152, fd1056, fd1151; +add.f64 fd1702, fd1109, fd1114; +sub.f64 fd1153, fd1109, fd1114; +mul.f64 fd1154, fd1153, 0d3FEBB67AE8584CAA; +add.f64 fd1155, fd1154, 
fd1152; +sub.f64 fd1156, fd1152, fd1154; +add.f64 fd1701, fd1062, fd1702; +mul.f64 fd1157, fd1702, 0d3FE0000000000000; +sub.f64 fd1158, fd1062, fd1157; +sub.f64 fd1159, fd1107, fd1112; +mul.f64 fd1160, fd1159, 0d3FEBB67AE8584CAA; +sub.f64 fd1161, fd1158, fd1160; +add.f64 fd1162, fd1160, fd1158; +add.f64 fd1163, fd979, fd1015; +add.f64 fd1164, fd943, fd1163; +mul.f64 fd1167, fd1163, 0d3FE0000000000000; +sub.f64 fd1168, fd943, fd1167; +add.f64 fd1700, fd980, fd1016; +sub.f64 fd1169, fd980, fd1016; +mul.f64 fd1170, fd1169, 0d3FEBB67AE8584CAA; +add.f64 fd1171, fd1170, fd1168; +sub.f64 fd1172, fd1168, fd1170; +add.f64 fd1699, fd944, fd1700; +mul.f64 fd1173, fd1700, 0d3FE0000000000000; +sub.f64 fd1174, fd944, fd1173; +sub.f64 fd1175, fd979, fd1015; +mul.f64 fd1176, fd1175, 0d3FEBB67AE8584CAA; +sub.f64 fd1177, fd1174, fd1176; +add.f64 fd1178, fd1176, fd1174; +add.f64 fd1179, fd991, fd1027; +add.f64 fd1180, fd955, fd1179; +mul.f64 fd1183, fd1179, 0d3FE0000000000000; +sub.f64 fd1184, fd955, fd1183; +add.f64 fd1698, fd992, fd1028; +sub.f64 fd1185, fd992, fd1028; +mul.f64 fd1186, fd1185, 0d3FEBB67AE8584CAA; +add.f64 fd1187, fd1186, fd1184; +sub.f64 fd1188, fd1184, fd1186; +add.f64 fd1697, fd956, fd1698; +mul.f64 fd1189, fd1698, 0d3FE0000000000000; +sub.f64 fd1190, fd956, fd1189; +sub.f64 fd1191, fd991, fd1027; +mul.f64 fd1192, fd1191, 0d3FEBB67AE8584CAA; +sub.f64 fd1193, fd1190, fd1192; +add.f64 fd1194, fd1192, fd1190; +add.f64 fd1195, fd1003, fd1039; +add.f64 fd1196, fd967, fd1195; +mul.f64 fd1199, fd1195, 0d3FE0000000000000; +sub.f64 fd1200, fd967, fd1199; +add.f64 fd1696, fd1004, fd1040; +sub.f64 fd1201, fd1004, fd1040; +mul.f64 fd1202, fd1201, 0d3FEBB67AE8584CAA; +add.f64 fd1203, fd1202, fd1200; +sub.f64 fd1204, fd1200, fd1202; +add.f64 fd1695, fd968, fd1696; +mul.f64 fd1205, fd1696, 0d3FE0000000000000; +sub.f64 fd1206, fd968, fd1205; +sub.f64 fd1207, fd1003, fd1039; +mul.f64 fd1208, fd1207, 0d3FEBB67AE8584CAA; +sub.f64 fd1209, fd1206, fd1208; +add.f64 fd1210, fd1208, 
fd1206; +mul.f64 fd1693, fd1187, 0d3FE8836FA2CF5039; +mul.f64 fd1694, fd1193, 0dBFE491B7523C161D; +sub.f64 fd1213, fd1693, fd1694; +mul.f64 fd1214, fd1193, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1215, fd1187, 0dBFE491B7523C161D, fd1214; +mul.f64 fd1217, fd1209, 0dBFEF838B8C811C17; +mul.f64 fd1692, fd1203, 0d3FC63A1A7E0B738A; +sub.f64 fd1218, fd1692, fd1217; +mul.f64 fd1219, fd1209, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1220, fd1203, 0dBFEF838B8C811C17, fd1219; +mul.f64 fd1222, fd1194, 0dBFEF838B8C811C17; +mul.f64 fd1691, fd1188, 0d3FC63A1A7E0B738A; +sub.f64 fd1223, fd1691, fd1222; +mul.f64 fd1224, fd1194, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1225, fd1188, 0dBFEF838B8C811C17, fd1224; +mul.f64 fd1227, fd1210, 0dBFD5E3A8748A0BF5; +mul.f64 fd1690, fd1204, 0dBFEE11F642522D1C; +sub.f64 fd1228, fd1690, fd1227; +mul.f64 fd1229, fd1210, 0dBFEE11F642522D1C; +fma.rn.f64 fd1230, fd1204, 0dBFD5E3A8748A0BF5, fd1229; +add.f64 fd1231, fd1180, fd1196; +add.f64 fd1232, fd1164, fd1231; +mul.f64 fd1235, fd1231, 0d3FE0000000000000; +sub.f64 fd1236, fd1164, fd1235; +add.f64 fd1689, fd1697, fd1695; +sub.f64 fd1237, fd1697, fd1695; +mul.f64 fd1238, fd1237, 0d3FEBB67AE8584CAA; +add.f64 fd1239, fd1238, fd1236; +sub.f64 fd1240, fd1236, fd1238; +add.f64 fd1688, fd1699, fd1689; +mul.f64 fd1241, fd1689, 0d3FE0000000000000; +sub.f64 fd1242, fd1699, fd1241; +sub.f64 fd1243, fd1180, fd1196; +mul.f64 fd1244, fd1243, 0d3FEBB67AE8584CAA; +sub.f64 fd1245, fd1242, fd1244; +add.f64 fd1246, fd1244, fd1242; +add.f64 fd1247, fd1213, fd1218; +add.f64 fd1248, fd1171, fd1247; +mul.f64 fd1251, fd1247, 0d3FE0000000000000; +sub.f64 fd1252, fd1171, fd1251; +add.f64 fd1687, fd1215, fd1220; +sub.f64 fd1253, fd1215, fd1220; +mul.f64 fd1254, fd1253, 0d3FEBB67AE8584CAA; +add.f64 fd1255, fd1254, fd1252; +sub.f64 fd1256, fd1252, fd1254; +add.f64 fd1686, fd1177, fd1687; +mul.f64 fd1257, fd1687, 0d3FE0000000000000; +sub.f64 fd1258, fd1177, fd1257; +sub.f64 fd1259, fd1213, fd1218; +mul.f64 fd1260, fd1259, 0d3FEBB67AE8584CAA; +sub.f64 
fd1261, fd1258, fd1260; +add.f64 fd1262, fd1260, fd1258; +add.f64 fd1263, fd1223, fd1228; +add.f64 fd1264, fd1172, fd1263; +mul.f64 fd1267, fd1263, 0d3FE0000000000000; +sub.f64 fd1268, fd1172, fd1267; +add.f64 fd1685, fd1225, fd1230; +sub.f64 fd1269, fd1225, fd1230; +mul.f64 fd1270, fd1269, 0d3FEBB67AE8584CAA; +add.f64 fd1271, fd1270, fd1268; +sub.f64 fd1272, fd1268, fd1270; +add.f64 fd1684, fd1178, fd1685; +mul.f64 fd1273, fd1685, 0d3FE0000000000000; +sub.f64 fd1274, fd1178, fd1273; +sub.f64 fd1275, fd1223, fd1228; +mul.f64 fd1276, fd1275, 0d3FEBB67AE8584CAA; +sub.f64 fd1277, fd1274, fd1276; +add.f64 fd1278, fd1276, fd1274; +add.f64 fd1279, fd983, fd1019; +add.f64 fd1280, fd947, fd1279; +mul.f64 fd1283, fd1279, 0d3FE0000000000000; +sub.f64 fd1284, fd947, fd1283; +add.f64 fd1683, fd984, fd1020; +sub.f64 fd1285, fd984, fd1020; +mul.f64 fd1286, fd1285, 0d3FEBB67AE8584CAA; +add.f64 fd1287, fd1286, fd1284; +sub.f64 fd1288, fd1284, fd1286; +add.f64 fd1682, fd948, fd1683; +mul.f64 fd1289, fd1683, 0d3FE0000000000000; +sub.f64 fd1290, fd948, fd1289; +sub.f64 fd1291, fd983, fd1019; +mul.f64 fd1292, fd1291, 0d3FEBB67AE8584CAA; +sub.f64 fd1293, fd1290, fd1292; +add.f64 fd1294, fd1292, fd1290; +add.f64 fd1295, fd995, fd1031; +add.f64 fd1296, fd959, fd1295; +mul.f64 fd1299, fd1295, 0d3FE0000000000000; +sub.f64 fd1300, fd959, fd1299; +add.f64 fd1681, fd996, fd1032; +sub.f64 fd1301, fd996, fd1032; +mul.f64 fd1302, fd1301, 0d3FEBB67AE8584CAA; +add.f64 fd1303, fd1302, fd1300; +sub.f64 fd1304, fd1300, fd1302; +add.f64 fd1680, fd960, fd1681; +mul.f64 fd1305, fd1681, 0d3FE0000000000000; +sub.f64 fd1306, fd960, fd1305; +sub.f64 fd1307, fd995, fd1031; +mul.f64 fd1308, fd1307, 0d3FEBB67AE8584CAA; +sub.f64 fd1309, fd1306, fd1308; +add.f64 fd1310, fd1308, fd1306; +add.f64 fd1311, fd1007, fd1043; +add.f64 fd1312, fd971, fd1311; +mul.f64 fd1315, fd1311, 0d3FE0000000000000; +sub.f64 fd1316, fd971, fd1315; +add.f64 fd1679, fd1008, fd1044; +sub.f64 fd1317, fd1008, fd1044; +mul.f64 fd1318, 
fd1317, 0d3FEBB67AE8584CAA; +add.f64 fd1319, fd1318, fd1316; +sub.f64 fd1320, fd1316, fd1318; +add.f64 fd1678, fd972, fd1679; +mul.f64 fd1321, fd1679, 0d3FE0000000000000; +sub.f64 fd1322, fd972, fd1321; +sub.f64 fd1323, fd1007, fd1043; +mul.f64 fd1324, fd1323, 0d3FEBB67AE8584CAA; +sub.f64 fd1325, fd1322, fd1324; +add.f64 fd1326, fd1324, fd1322; +mul.f64 fd1676, fd1303, 0d3FE8836FA2CF5039; +mul.f64 fd1677, fd1309, 0dBFE491B7523C161D; +sub.f64 fd1329, fd1676, fd1677; +mul.f64 fd1330, fd1309, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1331, fd1303, 0dBFE491B7523C161D, fd1330; +mul.f64 fd1674, fd1319, 0d3FC63A1A7E0B738A; +mul.f64 fd1675, fd1325, 0dBFEF838B8C811C17; +sub.f64 fd1334, fd1674, fd1675; +mul.f64 fd1335, fd1325, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1336, fd1319, 0dBFEF838B8C811C17, fd1335; +mul.f64 fd1338, fd1310, 0dBFEF838B8C811C17; +mul.f64 fd1673, fd1304, 0d3FC63A1A7E0B738A; +sub.f64 fd1339, fd1673, fd1338; +mul.f64 fd1340, fd1310, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1341, fd1304, 0dBFEF838B8C811C17, fd1340; +mul.f64 fd1343, fd1326, 0dBFD5E3A8748A0BF5; +mul.f64 fd1672, fd1320, 0dBFEE11F642522D1C; +sub.f64 fd1344, fd1672, fd1343; +mul.f64 fd1345, fd1326, 0dBFEE11F642522D1C; +fma.rn.f64 fd1346, fd1320, 0dBFD5E3A8748A0BF5, fd1345; +add.f64 fd1347, fd1296, fd1312; +add.f64 fd1348, fd1280, fd1347; +mul.f64 fd1351, fd1347, 0d3FE0000000000000; +sub.f64 fd1352, fd1280, fd1351; +add.f64 fd1671, fd1680, fd1678; +sub.f64 fd1353, fd1680, fd1678; +mul.f64 fd1354, fd1353, 0d3FEBB67AE8584CAA; +add.f64 fd1355, fd1354, fd1352; +sub.f64 fd1356, fd1352, fd1354; +add.f64 fd1670, fd1682, fd1671; +mul.f64 fd1357, fd1671, 0d3FE0000000000000; +sub.f64 fd1358, fd1682, fd1357; +sub.f64 fd1359, fd1296, fd1312; +mul.f64 fd1360, fd1359, 0d3FEBB67AE8584CAA; +sub.f64 fd1361, fd1358, fd1360; +add.f64 fd1362, fd1360, fd1358; +add.f64 fd1363, fd1329, fd1334; +add.f64 fd1364, fd1287, fd1363; +mul.f64 fd1367, fd1363, 0d3FE0000000000000; +sub.f64 fd1368, fd1287, fd1367; +add.f64 fd1669, fd1331, fd1336; 
+sub.f64 fd1369, fd1331, fd1336; +mul.f64 fd1370, fd1369, 0d3FEBB67AE8584CAA; +add.f64 fd1371, fd1370, fd1368; +sub.f64 fd1372, fd1368, fd1370; +add.f64 fd1668, fd1293, fd1669; +mul.f64 fd1373, fd1669, 0d3FE0000000000000; +sub.f64 fd1374, fd1293, fd1373; +sub.f64 fd1375, fd1329, fd1334; +mul.f64 fd1376, fd1375, 0d3FEBB67AE8584CAA; +sub.f64 fd1377, fd1374, fd1376; +add.f64 fd1378, fd1376, fd1374; +add.f64 fd1379, fd1339, fd1344; +add.f64 fd1380, fd1288, fd1379; +mul.f64 fd1383, fd1379, 0d3FE0000000000000; +sub.f64 fd1384, fd1288, fd1383; +add.f64 fd1667, fd1341, fd1346; +sub.f64 fd1385, fd1341, fd1346; +mul.f64 fd1386, fd1385, 0d3FEBB67AE8584CAA; +add.f64 fd1387, fd1386, fd1384; +sub.f64 fd1388, fd1384, fd1386; +add.f64 fd1666, fd1294, fd1667; +mul.f64 fd1389, fd1667, 0d3FE0000000000000; +sub.f64 fd1390, fd1294, fd1389; +sub.f64 fd1391, fd1339, fd1344; +mul.f64 fd1392, fd1391, 0d3FEBB67AE8584CAA; +sub.f64 fd1393, fd1390, fd1392; +add.f64 fd1394, fd1392, fd1390; +mul.f64 fd1396, fd1686, 0dBFCD84D223638000; +mul.f64 fd1665, fd1248, 0d3FEF232EFF15C9E6; +sub.f64 fd1397, fd1665, fd1396; +mul.f64 fd1398, fd1686, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd1399, fd1248, 0dBFCD84D223638000, fd1398; +mul.f64 fd1663, fd1364, 0d3FEC98A37A9A7850; +mul.f64 fd1664, fd1668, 0dBFDCB920325BAFA6; +sub.f64 fd1402, fd1663, fd1664; +mul.f64 fd1403, fd1668, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1404, fd1364, 0dBFDCB920325BAFA6, fd1403; +mul.f64 fd1661, fd1264, 0d3FEC98A37A9A7850; +mul.f64 fd1662, fd1684, 0dBFDCB920325BAFA6; +sub.f64 fd1407, fd1661, fd1662; +mul.f64 fd1408, fd1684, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1409, fd1264, 0dBFDCB920325BAFA6, fd1408; +mul.f64 fd1659, fd1380, 0d3FE31BEC55BC71BC; +mul.f64 fd1660, fd1666, 0dBFE9AAFE4207DF5F; +sub.f64 fd1412, fd1659, fd1660; +mul.f64 fd1413, fd1666, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1414, fd1380, 0dBFE9AAFE4207DF5F, fd1413; +mul.f64 fd1657, fd1239, 0d3FE8836FA2CF5039; +mul.f64 fd1658, fd1245, 0dBFE491B7523C161D; +sub.f64 fd1417, fd1657, fd1658; 
+mul.f64 fd1418, fd1245, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1419, fd1239, 0dBFE491B7523C161D, fd1418; +mul.f64 fd1421, fd1361, 0dBFEF838B8C811C17; +mul.f64 fd1656, fd1355, 0d3FC63A1A7E0B738A; +sub.f64 fd1422, fd1656, fd1421; +mul.f64 fd1423, fd1361, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1424, fd1355, 0dBFEF838B8C811C17, fd1423; +mul.f64 fd1426, fd1261, 0dBFE9AAFE4207DF5F; +mul.f64 fd1655, fd1255, 0d3FE31BEC55BC71BC; +sub.f64 fd1427, fd1655, fd1426; +mul.f64 fd1428, fd1261, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1429, fd1255, 0dBFE9AAFE4207DF5F, fd1428; +mul.f64 fd1431, fd1377, 0dBFEEA7D99F29CADE; +mul.f64 fd1654, fd1371, 0dBFD25AFBF23865BF; +sub.f64 fd1432, fd1654, fd1431; +mul.f64 fd1433, fd1377, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1434, fd1371, 0dBFEEA7D99F29CADE, fd1433; +mul.f64 fd1436, fd1277, 0dBFED6206BEB6C24B; +mul.f64 fd1653, fd1271, 0d3FD9595EF26FB670; +sub.f64 fd1437, fd1653, fd1436; +mul.f64 fd1438, fd1277, 0d3FD9595EF26FB670; +fma.rn.f64 fd1439, fd1271, 0dBFED6206BEB6C24B, fd1438; +mul.f64 fd1441, fd1393, 0dBFE746A51650EADE; +mul.f64 fd1652, fd1387, 0dBFE5F5B105F99707; +sub.f64 fd1442, fd1652, fd1441; +mul.f64 fd1443, fd1393, 0dBFE5F5B105F99707; +fma.rn.f64 fd1444, fd1387, 0dBFE746A51650EADE, fd1443; +mul.f64 fd1650, fd1240, 0d3FC63A1A7E0B738A; +mul.f64 fd1651, fd1246, 0dBFEF838B8C811C17; +sub.f64 fd1447, fd1650, fd1651; +mul.f64 fd1448, fd1246, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1449, fd1240, 0dBFEF838B8C811C17, fd1448; +mul.f64 fd1648, fd1356, 0dBFEE11F642522D1C; +mul.f64 fd1649, fd1362, 0dBFD5E3A8748A0BF5; +sub.f64 fd1452, fd1648, fd1649; +mul.f64 fd1453, fd1362, 0dBFEE11F642522D1C; +fma.rn.f64 fd1454, fd1356, 0dBFD5E3A8748A0BF5, fd1453; +mul.f64 fd1646, fd1256, 0dBFADC528B5343A86; +mul.f64 fd1647, fd1262, 0dBFEFF223F3635CE3; +sub.f64 fd1457, fd1646, fd1647; +mul.f64 fd1458, fd1262, 0dBFADC528B5343A86; +fma.rn.f64 fd1459, fd1256, 0dBFEFF223F3635CE3, fd1458; +mul.f64 fd1644, fd1372, 0dBFEFC89BCEF44CF4; +mul.f64 fd1645, fd1378, 0d3FBDB843E577175E; +sub.f64 fd1462, 
fd1644, fd1645; +mul.f64 fd1463, fd1378, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd1464, fd1372, 0d3FBDB843E577175E, fd1463; +mul.f64 fd1466, fd1278, 0dBFEEA7D99F29CADE; +mul.f64 fd1643, fd1272, 0dBFD25AFBF23865BF; +sub.f64 fd1467, fd1643, fd1466; +mul.f64 fd1468, fd1278, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1469, fd1272, 0dBFEEA7D99F29CADE, fd1468; +mul.f64 fd1471, fd1394, 0d3FE19593DA358510; +mul.f64 fd1642, fd1388, 0dBFEABC50EF4734A7; +sub.f64 fd1472, fd1642, fd1471; +mul.f64 fd1473, fd1394, 0dBFEABC50EF4734A7; +fma.rn.f64 fd1474, fd1388, 0d3FE19593DA358510, fd1473; +add.f64 fd1475, fd1232, fd1348; +mul.f64 fd1477, fd1475, 0d3FE0000000000000; +sub.f64 fd1478, fd1116, fd1477; +add.f64 fd1641, fd1688, fd1670; +sub.f64 fd1479, fd1688, fd1670; +mul.f64 fd1480, fd1479, 0d3FEBB67AE8584CAA; +mul.f64 fd1481, fd1641, 0d3FE0000000000000; +sub.f64 fd1482, fd1705, fd1481; +sub.f64 fd1483, fd1232, fd1348; +mul.f64 fd1484, fd1483, 0d3FEBB67AE8584CAA; +add.f64 fd1485, fd1397, fd1402; +mul.f64 fd1487, fd1485, 0d3FE0000000000000; +sub.f64 fd1488, fd1132, fd1487; +add.f64 fd1640, fd1399, fd1404; +sub.f64 fd1489, fd1399, fd1404; +mul.f64 fd1490, fd1489, 0d3FEBB67AE8584CAA; +mul.f64 fd1491, fd1640, 0d3FE0000000000000; +sub.f64 fd1492, fd1703, fd1491; +sub.f64 fd1493, fd1397, fd1402; +mul.f64 fd1494, fd1493, 0d3FEBB67AE8584CAA; +add.f64 fd1495, fd1407, fd1412; +mul.f64 fd1497, fd1495, 0d3FE0000000000000; +sub.f64 fd1498, fd1148, fd1497; +add.f64 fd1639, fd1409, fd1414; +sub.f64 fd1499, fd1409, fd1414; +mul.f64 fd1500, fd1499, 0d3FEBB67AE8584CAA; +mul.f64 fd1501, fd1639, 0d3FE0000000000000; +sub.f64 fd1502, fd1701, fd1501; +sub.f64 fd1503, fd1407, fd1412; +mul.f64 fd1504, fd1503, 0d3FEBB67AE8584CAA; +add.f64 fd1505, fd1417, fd1422; +mul.f64 fd1507, fd1505, 0d3FE0000000000000; +sub.f64 fd1508, fd1123, fd1507; +add.f64 fd1638, fd1419, fd1424; +sub.f64 fd1509, fd1419, fd1424; +mul.f64 fd1510, fd1509, 0d3FEBB67AE8584CAA; +mul.f64 fd1511, fd1638, 0d3FE0000000000000; +sub.f64 fd1512, fd1129, fd1511; 
+sub.f64 fd1513, fd1417, fd1422; +mul.f64 fd1514, fd1513, 0d3FEBB67AE8584CAA; +add.f64 fd1515, fd1427, fd1432; +mul.f64 fd1517, fd1515, 0d3FE0000000000000; +sub.f64 fd1518, fd1139, fd1517; +add.f64 fd1637, fd1429, fd1434; +sub.f64 fd1519, fd1429, fd1434; +mul.f64 fd1520, fd1519, 0d3FEBB67AE8584CAA; +mul.f64 fd1521, fd1637, 0d3FE0000000000000; +sub.f64 fd1522, fd1145, fd1521; +sub.f64 fd1523, fd1427, fd1432; +mul.f64 fd1524, fd1523, 0d3FEBB67AE8584CAA; +add.f64 fd1525, fd1437, fd1442; +mul.f64 fd1527, fd1525, 0d3FE0000000000000; +sub.f64 fd1528, fd1155, fd1527; +add.f64 fd1636, fd1439, fd1444; +sub.f64 fd1529, fd1439, fd1444; +mul.f64 fd1530, fd1529, 0d3FEBB67AE8584CAA; +mul.f64 fd1531, fd1636, 0d3FE0000000000000; +sub.f64 fd1532, fd1161, fd1531; +sub.f64 fd1533, fd1437, fd1442; +mul.f64 fd1534, fd1533, 0d3FEBB67AE8584CAA; +add.f64 fd1535, fd1447, fd1452; +mul.f64 fd1537, fd1535, 0d3FE0000000000000; +sub.f64 fd1538, fd1124, fd1537; +add.f64 fd1635, fd1449, fd1454; +sub.f64 fd1539, fd1449, fd1454; +mul.f64 fd1540, fd1539, 0d3FEBB67AE8584CAA; +mul.f64 fd1541, fd1635, 0d3FE0000000000000; +sub.f64 fd1542, fd1130, fd1541; +sub.f64 fd1543, fd1447, fd1452; +mul.f64 fd1544, fd1543, 0d3FEBB67AE8584CAA; +add.f64 fd1545, fd1457, fd1462; +mul.f64 fd1547, fd1545, 0d3FE0000000000000; +sub.f64 fd1548, fd1140, fd1547; +add.f64 fd1634, fd1459, fd1464; +sub.f64 fd1549, fd1459, fd1464; +mul.f64 fd1550, fd1549, 0d3FEBB67AE8584CAA; +mul.f64 fd1551, fd1634, 0d3FE0000000000000; +sub.f64 fd1552, fd1146, fd1551; +sub.f64 fd1553, fd1457, fd1462; +mul.f64 fd1554, fd1553, 0d3FEBB67AE8584CAA; +add.f64 fd1555, fd1467, fd1472; +mul.f64 fd1557, fd1555, 0d3FE0000000000000; +sub.f64 fd1558, fd1156, fd1557; +add.f64 fd1633, fd1469, fd1474; +sub.f64 fd1559, fd1469, fd1474; +mul.f64 fd1560, fd1559, 0d3FEBB67AE8584CAA; +mul.f64 fd1561, fd1633, 0d3FE0000000000000; +sub.f64 fd1562, fd1162, fd1561; +sub.f64 fd1563, fd1467, fd1472; +mul.f64 fd1564, fd1563, 0d3FEBB67AE8584CAA; +add.f64 %1, fd1705, fd1641; 
+add.f64 %0, fd1116, fd1475; +add.f64 %3, fd1703, fd1640; +add.f64 %2, fd1132, fd1485; +add.f64 %5, fd1701, fd1639; +add.f64 %4, fd1148, fd1495; +add.f64 %7, fd1129, fd1638; +add.f64 %6, fd1123, fd1505; +add.f64 %9, fd1145, fd1637; +add.f64 %8, fd1139, fd1515; +add.f64 %11, fd1161, fd1636; +add.f64 %10, fd1155, fd1525; +add.f64 %13, fd1130, fd1635; +add.f64 %12, fd1124, fd1535; +add.f64 %15, fd1146, fd1634; +add.f64 %14, fd1140, fd1545; +add.f64 %17, fd1162, fd1633; +add.f64 %16, fd1156, fd1555; +sub.f64 %19, fd1482, fd1484; +add.f64 %18, fd1480, fd1478; +sub.f64 %21, fd1492, fd1494; +add.f64 %20, fd1490, fd1488; +add.f64 %22, fd1500, fd1498; +sub.f64 %23, fd1502, fd1504; +add.f64 %24, fd1510, fd1508; +sub.f64 %25, fd1512, fd1514; +add.f64 %26, fd1520, fd1518; +sub.f64 %27, fd1522, fd1524; +add.f64 %28, fd1530, fd1528; +sub.f64 %29, fd1532, fd1534; +sub.f64 %31, fd1542, fd1544; +add.f64 %30, fd1540, fd1538; +sub.f64 %33, fd1552, fd1554; +add.f64 %32, fd1550, fd1548; +sub.f64 %35, fd1562, fd1564; +add.f64 %34, fd1560, fd1558; +add.f64 %37, fd1484, fd1482; +sub.f64 %36, fd1478, fd1480; +add.f64 %39, fd1494, fd1492; +sub.f64 %38, fd1488, fd1490; +add.f64 %41, fd1504, fd1502; +sub.f64 %40, fd1498, fd1500; +add.f64 %43, fd1514, fd1512; +sub.f64 %42, fd1508, fd1510; +add.f64 %45, fd1524, fd1522; +sub.f64 %44, fd1518, fd1520; +add.f64 %47, fd1534, fd1532; +sub.f64 %46, fd1528, fd1530; +add.f64 %49, fd1544, fd1542; +sub.f64 %48, fd1538, fd1540; +add.f64 %51, fd1554, fd1552; +sub.f64 %50, fd1548, fd1550; +add.f64 %53, fd1564, fd1562; +sub.f64 %52, fd1558, fd1560; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), 
"=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_729), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[10].y), "d"(rmem[19].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<521, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<613>; +.reg .b64 rd<12>; 
+mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 11664, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 
0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 
11664, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+1296]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; 
+fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r9+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r9+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; +fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r9+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r9+64], {fd222, fd221}; +fma.rn.f64 fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 [r9+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r9+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r9+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r9+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+1296]; +ld.shared.v2.f64 {fd239, fd240}, [r11+2592]; +ld.shared.v2.f64 {fd243, fd244}, [r11+3888]; +ld.shared.v2.f64 {fd247, fd248}, [r11+5184]; +ld.shared.v2.f64 {fd251, fd252}, [r11+6480]; +ld.shared.v2.f64 {fd255, fd256}, [r11+7776]; +ld.shared.v2.f64 {fd259, fd260}, [r11+9072]; +ld.shared.v2.f64 {fd263, fd264}, [r11+10368]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; 
+add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0d3FEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0dBFE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0dBFE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0dBFD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0dBFD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 
fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0d3FEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0d3FEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd381, fd350; +mul.f64 fd386, fd382, fd352; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd390, fd366; +mul.f64 fd394, fd392, fd368; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, 
fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd398, fd341; +mul.f64 fd402, fd400, fd347; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd406, fd357; +mul.f64 fd410, fd408, fd363; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+144]; +mul.f64 fd416, fd412, fd373; +mul.f64 fd417, fd413, fd379; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd421, fd342; +mul.f64 fd425, fd423, fd348; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd429, fd358; +mul.f64 fd433, fd431, fd364; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd437, fd374; +mul.f64 fd441, fd439, fd380; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; +fma.rn.f64 fd445, fd382, fd350, fd387; +sub.f64 fd446, fd385, fd386; +st.shared.v2.f64 [r17+144], {fd446, fd445}; +fma.rn.f64 fd447, fd392, fd366, fd395; +sub.f64 fd448, fd393, fd394; +st.shared.v2.f64 [r17+288], {fd448, fd447}; +fma.rn.f64 fd449, fd400, fd341, fd403; +sub.f64 fd450, fd401, fd402; +st.shared.v2.f64 [r17+432], {fd450, fd449}; +fma.rn.f64 fd451, fd408, fd357, fd411; +sub.f64 fd452, fd409, fd410; +st.shared.v2.f64 [r17+576], {fd452, fd451}; 
+fma.rn.f64 fd453, fd413, fd373, fd418; +sub.f64 fd454, fd416, fd417; +st.shared.v2.f64 [r17+720], {fd454, fd453}; +fma.rn.f64 fd455, fd423, fd342, fd426; +sub.f64 fd456, fd424, fd425; +st.shared.v2.f64 [r17+864], {fd456, fd455}; +fma.rn.f64 fd457, fd431, fd358, fd434; +sub.f64 fd458, fd432, fd433; +st.shared.v2.f64 [r17+1008], {fd458, fd457}; +fma.rn.f64 fd459, fd439, fd374, fd442; +sub.f64 fd460, fd440, fd441; +st.shared.v2.f64 [r17+1152], {fd460, fd459}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+1296]; +ld.shared.v2.f64 {fd469, fd470}, [r11+2592]; +ld.shared.v2.f64 {fd473, fd474}, [r11+3888]; +ld.shared.v2.f64 {fd477, fd478}, [r11+5184]; +ld.shared.v2.f64 {fd481, fd482}, [r11+6480]; +ld.shared.v2.f64 {fd485, fd486}, [r11+7776]; +ld.shared.v2.f64 {fd489, fd490}, [r11+9072]; +ld.shared.v2.f64 {fd493, fd494}, [r11+10368]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0d3FEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0d3FEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0d3FEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0d3FEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, 
fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, fd482, fd494; +mul.f64 fd536, fd535, 0d3FEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0dBFE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0dBFE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0dBFEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0dBFEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0dBFEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0dBFEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0dBFD5E3A8748A0BF5; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0dBFD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 0d3FEBB67AE8584CAA; +mul.f64 fd571, fd566, 0d3FE0000000000000; +sub.f64 fd572, fd500, fd571; +sub.f64 fd573, fd514, fd530; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +add.f64 fd575, fd547, fd552; +add.f64 fd576, fd549, fd554; +mul.f64 fd577, fd575, 0d3FE0000000000000; +sub.f64 fd578, fd505, fd577; +sub.f64 fd579, fd549, fd554; +mul.f64 fd580, fd579, 0d3FEBB67AE8584CAA; +mul.f64 fd581, fd576, 0d3FE0000000000000; +sub.f64 fd582, fd511, fd581; +sub.f64 fd583, fd547, fd552; 
+mul.f64 fd584, fd583, 0d3FEBB67AE8584CAA; +add.f64 fd585, fd557, fd562; +add.f64 fd586, fd559, fd564; +mul.f64 fd587, fd585, 0d3FE0000000000000; +sub.f64 fd588, fd506, fd587; +sub.f64 fd589, fd559, fd564; +mul.f64 fd590, fd589, 0d3FEBB67AE8584CAA; +mul.f64 fd591, fd586, 0d3FE0000000000000; +sub.f64 fd592, fd512, fd591; +sub.f64 fd593, fd557, fd562; +mul.f64 fd594, fd593, 0d3FEBB67AE8584CAA; +add.f64 %1, fd500, fd566; +add.f64 %0, fd498, fd565; +add.f64 %3, fd511, fd576; +add.f64 %2, fd505, fd575; +add.f64 %5, fd512, fd586; +add.f64 %4, fd506, fd585; +sub.f64 %7, fd572, fd574; +add.f64 %6, fd570, fd568; +sub.f64 %9, fd582, fd584; +add.f64 %8, fd580, fd578; +sub.f64 %11, fd592, fd594; +add.f64 %10, fd590, fd588; +add.f64 %13, fd574, fd572; +sub.f64 %12, fd568, fd570; +add.f64 %15, fd584, fd582; +sub.f64 %14, fd578, fd580; +add.f64 %17, fd594, fd592; +sub.f64 %16, fd588, fd590; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<522, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<577>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 5832, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 
fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 
0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 
fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; +sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+1296]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 
fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r8, r5, 5832, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd159; +st.shared.f64 [r9+16], fd169; +st.shared.f64 [r9+24], fd179; +st.shared.f64 [r9+32], fd189; +st.shared.f64 [r9+40], fd198; +st.shared.f64 [r9+48], fd208; +st.shared.f64 [r9+56], fd218; +st.shared.f64 [r9+64], fd228; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+648]; +ld.shared.f64 fd233, [r11+1296]; +ld.shared.f64 fd234, [r11+1944]; +ld.shared.f64 fd235, [r11+2592]; +ld.shared.f64 fd236, [r11+3240]; +ld.shared.f64 fd237, [r11+3888]; +ld.shared.f64 fd238, [r11+4536]; +ld.shared.f64 fd239, [r11+5184]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+648]; +ld.shared.f64 fd242, [r11+1296]; +ld.shared.f64 fd243, [r11+1944]; +ld.shared.f64 fd244, [r11+2592]; +ld.shared.f64 fd245, [r11+3240]; +ld.shared.f64 fd246, [r11+3888]; +ld.shared.f64 fd247, [r11+4536]; +ld.shared.f64 fd248, [r11+5184]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; 
+sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0d3FEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0dBFE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0dBFE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0dBFEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0dBFEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0dBFEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0dBFEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0dBFD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; 
+fma.rn.f64 fd316, fd290, 0dBFD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0d3FEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0d3FEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0d3FEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0d3FEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd365, fd334; +mul.f64 fd370, fd366, fd336; +sub.f64 fd371, fd369, fd370; +mul.f64 fd372, 
fd365, fd336; +fma.rn.f64 fd373, fd366, fd334, fd372; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd376, fd350; +mul.f64 fd380, fd378, fd352; +sub.f64 fd381, fd379, fd380; +mul.f64 fd382, fd376, fd352; +fma.rn.f64 fd383, fd378, fd350, fd382; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd386, fd325; +mul.f64 fd390, fd388, fd331; +sub.f64 fd391, fd389, fd390; +mul.f64 fd392, fd386, fd331; +fma.rn.f64 fd393, fd388, fd325, fd392; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd396, fd341; +mul.f64 fd400, fd398, fd347; +sub.f64 fd401, fd399, fd400; +mul.f64 fd402, fd396, fd347; +fma.rn.f64 fd403, fd398, fd341, fd402; +ld.global.v2.f64 {fd404, fd405}, [rd11+144]; +mul.f64 fd408, fd404, fd357; +mul.f64 fd409, fd405, fd363; +sub.f64 fd410, fd408, fd409; +mul.f64 fd411, fd404, fd363; +fma.rn.f64 fd412, fd405, fd357, fd411; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd415, fd326; +mul.f64 fd419, fd417, fd332; +sub.f64 fd420, fd418, fd419; +mul.f64 fd421, fd415, fd332; +fma.rn.f64 fd422, fd417, fd326, fd421; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd425, fd342; +mul.f64 fd429, fd427, fd348; +sub.f64 fd430, fd428, fd429; +mul.f64 fd431, fd425, fd348; +fma.rn.f64 fd432, fd427, fd342, fd431; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, 
fd366, fd425, fd436; +mul.f64 fd438, fd435, fd358; +mul.f64 fd439, fd437, fd364; +sub.f64 fd440, fd438, fd439; +mul.f64 fd441, fd435, fd364; +fma.rn.f64 fd442, fd437, fd358, fd441; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd371; +st.shared.f64 [r17+144], fd381; +st.shared.f64 [r17+216], fd391; +st.shared.f64 [r17+288], fd401; +st.shared.f64 [r17+360], fd410; +st.shared.f64 [r17+432], fd420; +st.shared.f64 [r17+504], fd430; +st.shared.f64 [r17+576], fd440; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+648]; +ld.shared.f64 fd445, [r11+1296]; +ld.shared.f64 fd446, [r11+1944]; +ld.shared.f64 fd447, [r11+2592]; +ld.shared.f64 fd448, [r11+3240]; +ld.shared.f64 fd449, [r11+3888]; +ld.shared.f64 fd450, [r11+4536]; +ld.shared.f64 fd451, [r11+5184]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+648]; +ld.shared.f64 fd454, [r11+1296]; +ld.shared.f64 fd455, [r11+1944]; +ld.shared.f64 fd456, [r11+2592]; +ld.shared.f64 fd457, [r11+3240]; +ld.shared.f64 fd458, [r11+3888]; +ld.shared.f64 fd459, [r11+4536]; +ld.shared.f64 fd460, [r11+5184]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0d3FEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, fd449; +mul.f64 fd474, fd473, 0d3FEBB67AE8584CAA; +sub.f64 fd475, 
fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; +mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0d3FEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0d3FEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0d3FEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0d3FEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0dBFE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0dBFE491B7523C161D, fd512; +mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0dBFEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0dBFEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0dBFEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0dBFEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0dBFD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 0dBFD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 
fd530, fd480, fd496; +mul.f64 fd531, fd529, 0d3FE0000000000000; +sub.f64 fd532, fd462, fd531; +sub.f64 fd533, fd480, fd496; +mul.f64 fd534, fd533, 0d3FEBB67AE8584CAA; +mul.f64 fd535, fd530, 0d3FE0000000000000; +sub.f64 fd536, fd464, fd535; +sub.f64 fd537, fd478, fd494; +mul.f64 fd538, fd537, 0d3FEBB67AE8584CAA; +add.f64 fd539, fd511, fd516; +add.f64 fd540, fd513, fd518; +mul.f64 fd541, fd539, 0d3FE0000000000000; +sub.f64 fd542, fd469, fd541; +sub.f64 fd543, fd513, fd518; +mul.f64 fd544, fd543, 0d3FEBB67AE8584CAA; +mul.f64 fd545, fd540, 0d3FE0000000000000; +sub.f64 fd546, fd475, fd545; +sub.f64 fd547, fd511, fd516; +mul.f64 fd548, fd547, 0d3FEBB67AE8584CAA; +add.f64 fd549, fd521, fd526; +add.f64 fd550, fd523, fd528; +mul.f64 fd551, fd549, 0d3FE0000000000000; +sub.f64 fd552, fd470, fd551; +sub.f64 fd553, fd523, fd528; +mul.f64 fd554, fd553, 0d3FEBB67AE8584CAA; +mul.f64 fd555, fd550, 0d3FE0000000000000; +sub.f64 fd556, fd476, fd555; +sub.f64 fd557, fd521, fd526; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +add.f64 %0, fd462, fd529; +add.f64 %1, fd464, fd530; +add.f64 %3, fd475, fd540; +add.f64 %2, fd469, fd539; +add.f64 %5, fd476, fd550; +add.f64 %4, fd470, fd549; +add.f64 %6, fd534, fd532; +sub.f64 %7, fd536, fd538; +sub.f64 %9, fd546, fd548; +add.f64 %8, fd544, fd542; +sub.f64 %11, fd556, fd558; +add.f64 %10, fd554, fd552; +sub.f64 %12, fd532, fd534; +add.f64 %13, fd538, fd536; +add.f64 %15, fd548, fd546; +sub.f64 %14, fd542, fd544; +add.f64 %17, fd558, fd556; +sub.f64 %16, fd552, fd554; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), 
"d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<525, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<229>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 5832, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %14, %17; +add.f64 fd14, %12, fd13; +add.f64 fd15, %16, %18; +add.f64 fd16, %13, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %12, fd17; +sub.f64 fd19, %16, %18; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %13, fd23; +sub.f64 fd25, %14, %17; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+3888]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd35; +st.shared.f64 [r9+16], fd44; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+1944]; +ld.shared.f64 fd49, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], 
fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+1944]; +ld.shared.f64 fd52, [r11+3888]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd69, fd61; +mul.f64 fd74, fd70, fd67; +sub.f64 fd75, fd73, fd74; +mul.f64 fd76, fd69, fd67; +fma.rn.f64 fd77, fd70, fd61, fd76; +ld.global.v2.f64 {fd78, fd79}, [rd11+1296]; +mul.f64 fd82, fd78, fd62; +mul.f64 fd83, fd79, fd68; +sub.f64 fd84, fd82, fd83; +mul.f64 fd85, fd78, fd68; +fma.rn.f64 fd86, fd79, fd62, fd85; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd75; +st.shared.f64 [r17+48], fd84; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+1944]; +ld.shared.f64 fd89, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+1944]; +ld.shared.f64 fd92, [r11+3888]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0d3FEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; 
+sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0d3FEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd109, fd101; +mul.f64 fd114, fd110, fd107; +sub.f64 fd115, fd113, fd114; +mul.f64 fd116, fd109, fd107; +fma.rn.f64 fd117, fd110, fd101, fd116; +ld.global.v2.f64 {fd118, fd119}, [rd16+432]; +mul.f64 fd122, fd118, fd102; +mul.f64 fd123, fd119, fd108; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd118, fd108; +fma.rn.f64 fd126, fd119, fd102, fd125; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd115; +st.shared.f64 [r23+144], fd124; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+1944]; +ld.shared.f64 fd129, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; +st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+1944]; +ld.shared.f64 fd132, [r11+3888]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0d3FEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, fd129; +mul.f64 fd146, fd145, 0d3FEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 
r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd149, fd141; +mul.f64 fd154, fd150, fd147; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd149, fd147; +fma.rn.f64 fd157, fd150, fd141, fd156; +ld.global.v2.f64 {fd158, fd159}, [rd21+144]; +mul.f64 fd162, fd158, fd142; +mul.f64 fd163, fd159, fd148; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd158, fd148; +fma.rn.f64 fd166, fd159, fd142, fd165; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd155; +st.shared.f64 [r33+432], fd164; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+1944]; +ld.shared.f64 fd169, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+1944]; +ld.shared.f64 fd172, [r11+3888]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd167, fd173; +add.f64 fd175, fd171, fd172; +add.f64 fd176, fd170, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd167, fd177; +sub.f64 fd179, fd171, fd172; +mul.f64 fd180, fd179, 0d3FEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd170, fd183; +sub.f64 fd185, fd168, fd169; +mul.f64 fd186, fd185, 0d3FEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; +add.f64 fd188, fd186, fd184; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd189, fd190}, [rd26]; +mul.f64 fd193, fd189, 
fd181; +mul.f64 fd194, fd190, fd187; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd189, fd187; +fma.rn.f64 fd197, fd190, fd181, fd196; +ld.global.v2.f64 {fd198, fd199}, [rd26+48]; +mul.f64 fd202, fd198, fd182; +mul.f64 fd203, fd199, fd188; +sub.f64 fd204, fd202, fd203; +mul.f64 fd205, fd198, fd188; +fma.rn.f64 fd206, fd199, fd182, fd205; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +st.shared.f64 [r39], fd174; +st.shared.f64 [r39+648], fd195; +st.shared.f64 [r39+1296], fd204; +barrier.sync 0; +ld.shared.f64 fd207, [r11]; +ld.shared.f64 fd208, [r11+1944]; +ld.shared.f64 fd209, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r39], fd176; +st.shared.f64 [r39+648], fd197; +st.shared.f64 [r39+1296], fd206; +barrier.sync 0; +ld.shared.f64 fd210, [r11]; +ld.shared.f64 fd211, [r11+1944]; +ld.shared.f64 fd212, [r11+3888]; +add.f64 fd213, fd208, fd209; +add.f64 fd214, fd211, fd212; +mul.f64 fd215, fd213, 0d3FE0000000000000; +sub.f64 fd216, fd207, fd215; +sub.f64 fd217, fd211, fd212; +mul.f64 fd218, fd217, 0d3FEBB67AE8584CAA; +mul.f64 fd219, fd214, 0d3FE0000000000000; +sub.f64 fd220, fd210, fd219; +sub.f64 fd221, fd208, fd209; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +add.f64 %0, fd207, fd213; +add.f64 %1, fd210, fd214; +add.f64 %2, fd218, fd216; +sub.f64 %3, fd220, fd222; +sub.f64 %4, fd216, fd218; +add.f64 %5, fd222, fd220; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<526, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<259>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 11664, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, 
%14, %17; +add.f64 fd14, %16, %18; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %12, fd15; +sub.f64 fd17, %16, %18; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %13, fd21; +sub.f64 fd23, %14, %17; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 11664, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+3888]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %13, fd14; +add.f64 fd42, %12, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r9+16], {fd44, fd43}; +fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r9+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+3888]; +ld.shared.v2.f64 {fd55, fd56}, [r11+7776]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0d3FEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; 
+add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd73, fd65; +mul.f64 fd78, fd74, fd71; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+1296]; +mul.f64 fd84, fd80, fd66; +mul.f64 fd85, fd81, fd72; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd74, fd65, fd79; +sub.f64 fd90, fd77, fd78; +st.shared.v2.f64 [r17+48], {fd90, fd89}; +fma.rn.f64 fd91, fd81, fd66, fd86; +sub.f64 fd92, fd84, fd85; +st.shared.v2.f64 [r17+96], {fd92, fd91}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+3888]; +ld.shared.v2.f64 {fd101, fd102}, [r11+7776]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd119, fd111; +mul.f64 fd124, fd120, fd117; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+432]; +mul.f64 fd130, fd126, fd112; +mul.f64 fd131, fd127, fd118; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, 
fd120, fd111, fd125; +sub.f64 fd136, fd123, fd124; +st.shared.v2.f64 [r23+144], {fd136, fd135}; +fma.rn.f64 fd137, fd127, fd112, fd132; +sub.f64 fd138, fd130, fd131; +st.shared.v2.f64 [r23+288], {fd138, fd137}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+3888]; +ld.shared.v2.f64 {fd147, fd148}, [r11+7776]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0d3FEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0d3FEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd165, fd157; +mul.f64 fd170, fd166, fd163; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+144]; +mul.f64 fd176, fd172, fd158; +mul.f64 fd177, fd173, fd164; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd166, fd157, fd171; +sub.f64 fd182, fd169, fd170; +st.shared.v2.f64 [r33+432], {fd182, fd181}; +fma.rn.f64 fd183, fd173, fd158, fd178; +sub.f64 fd184, fd176, fd177; +st.shared.v2.f64 [r33+864], {fd184, fd183}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+3888]; +ld.shared.v2.f64 {fd193, fd194}, [r11+7776]; +add.f64 fd197, fd189, 
fd193; +add.f64 fd198, fd190, fd194; +mul.f64 fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 0d3FEBB67AE8584CAA; +add.f64 fd203, fd202, fd200; +sub.f64 fd204, fd200, fd202; +mul.f64 fd205, fd198, 0d3FE0000000000000; +sub.f64 fd206, fd186, fd205; +sub.f64 fd207, fd189, fd193; +mul.f64 fd208, fd207, 0d3FEBB67AE8584CAA; +sub.f64 fd209, fd206, fd208; +add.f64 fd210, fd208, fd206; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 4; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd211, fd212}, [rd26]; +mul.f64 fd215, fd211, fd203; +mul.f64 fd216, fd212, fd209; +mul.f64 fd217, fd211, fd209; +ld.global.v2.f64 {fd218, fd219}, [rd26+48]; +mul.f64 fd222, fd218, fd204; +mul.f64 fd223, fd219, fd210; +mul.f64 fd224, fd218, fd210; +barrier.sync 0; +mad.lo.s32 r39, r34, 3888, r38; +add.f64 fd225, fd186, fd198; +add.f64 fd226, fd185, fd197; +st.shared.v2.f64 [r39], {fd226, fd225}; +fma.rn.f64 fd227, fd212, fd203, fd217; +sub.f64 fd228, fd215, fd216; +st.shared.v2.f64 [r39+1296], {fd228, fd227}; +fma.rn.f64 fd229, fd219, fd204, fd224; +sub.f64 fd230, fd222, fd223; +st.shared.v2.f64 [r39+2592], {fd230, fd229}; +barrier.sync 0; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+3888]; +ld.shared.v2.f64 {fd239, fd240}, [r11+7776]; +add.f64 fd243, fd235, fd239; +add.f64 fd244, fd236, fd240; +mul.f64 fd245, fd243, 0d3FE0000000000000; +sub.f64 fd246, fd231, fd245; +sub.f64 fd247, fd236, fd240; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +mul.f64 fd249, fd244, 0d3FE0000000000000; +sub.f64 fd250, fd232, fd249; +sub.f64 fd251, fd235, fd239; +mul.f64 fd252, fd251, 0d3FEBB67AE8584CAA; +add.f64 %1, fd232, fd244; +add.f64 %0, fd231, fd243; +sub.f64 %3, fd250, fd252; +add.f64 %2, fd248, fd246; +add.f64 %5, fd252, fd250; +sub.f64 
%4, fd246, fd248; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..1cf029770fafc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_729_fp64_inv.hpp.inc @@ -0,0 +1,4806 @@ +#ifndef CUFFTDX_FFT_729_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_729_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<694, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<1813>; +.reg .b64 rd<9>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 5832, r17; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1804, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1803, %57, fd1804; +mul.f64 fd119, fd1804, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1802, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 
fd134, fd130, fd132; +add.f64 fd1801, %63, fd1802; +mul.f64 fd135, fd1802, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1800, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1799, %69, fd1800; +mul.f64 fd151, fd1800, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1798, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1798, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd1796, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1797, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1796, fd1797; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1794, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1795, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1794, fd1795; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1792, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1793, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1792, fd1793; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1791, fd1801, fd1799; +sub.f64 fd183, fd1801, fd1799; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1790, fd1803, fd1791; +mul.f64 fd187, fd1791, 
0d3FE0000000000000; +sub.f64 fd188, fd1803, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1789, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1788, fd123, fd1789; +mul.f64 fd203, fd1789, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1787, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1786, fd124, fd1787; +mul.f64 fd219, fd1787, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1783, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1781, %112, fd1783; +mul.f64 fd235, fd1783, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1778, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; 
+add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1776, %115, fd1778; +mul.f64 fd251, fd1778, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1773, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1771, %118, fd1773; +mul.f64 fd267, fd1773, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1770, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1770, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1769, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1769, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1767, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1768, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1767, fd1768; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1765, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1766, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1765, fd1766; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1764, fd1776, fd1771; +sub.f64 fd299, fd1776, fd1771; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1763, fd1781, 
fd1764; +mul.f64 fd303, fd1764, 0d3FE0000000000000; +sub.f64 fd304, fd1781, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1762, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1761, fd239, fd1762; +mul.f64 fd319, fd1762, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1760, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1759, fd240, fd1760; +mul.f64 fd335, fd1760, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1756, %120, %119; +sub.f64 fd347, %120, %119; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1754, %121, fd1756; +mul.f64 fd351, fd1756, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1751, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 
fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1749, %124, fd1751; +mul.f64 fd367, fd1751, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1747, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1745, %126, fd1747; +mul.f64 fd383, fd1747, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1744, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1744, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1743, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1743, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1741, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1742, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1741, fd1742; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1739, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1740, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1739, fd1740; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1738, fd1749, fd1745; +sub.f64 fd415, fd1749, fd1745; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, 
fd414, fd416; +add.f64 fd1737, fd1754, fd1738; +mul.f64 fd419, fd1738, 0d3FE0000000000000; +sub.f64 fd420, fd1754, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1736, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1735, fd355, fd1736; +mul.f64 fd435, fd1736, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1734, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1733, fd356, fd1734; +mul.f64 fd451, fd1734, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1761, 0d3FCD84D223638000; +mul.f64 fd1732, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1732, fd458; +mul.f64 fd460, fd1761, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1730, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1731, fd1735, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd1730, fd1731; +mul.f64 fd465, fd1735, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1728, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1729, fd1759, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1728, fd1729; +mul.f64 fd470, fd1759, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1726, 
fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1727, fd1733, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1726, fd1727; +mul.f64 fd475, fd1733, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1724, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1725, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1724, fd1725; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1723, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1723, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1722, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1722, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1721, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1721, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1720, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1720, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1719, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1719, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1718, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1718, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1716, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1717, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1716, fd1717; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1714, fd318, 0dBFADC528B5343A86; +mul.f64 fd1715, fd324, 
0d3FEFF223F3635CE3; +sub.f64 fd519, fd1714, fd1715; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1712, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1713, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1712, fd1713; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1711, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1711, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1710, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1710, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1709, fd1763, fd1737; +sub.f64 fd543, fd1763, fd1737; +mul.f64 fd544, fd543, 0dBFEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1708, fd1790, fd1709; +mul.f64 fd547, fd1709, 0d3FE0000000000000; +sub.f64 fd548, fd1790, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0dBFEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1707, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0dBFEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, fd560; +add.f64 fd1706, fd1788, fd1707; +mul.f64 fd563, fd1707, 0d3FE0000000000000; +sub.f64 fd564, fd1788, fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 fd566, fd565, 0dBFEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, 
fd573; +add.f64 fd1705, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1704, fd1786, fd1705; +mul.f64 fd579, fd1705, 0d3FE0000000000000; +sub.f64 fd580, fd1786, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0dBFEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1703, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1702, fd191, fd1703; +mul.f64 fd595, fd1703, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0dBFEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1701, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0dBFEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1700, fd207, fd1701; +mul.f64 fd611, fd1701, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0dBFEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 fd1699, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0dBFEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +add.f64 fd1698, fd223, fd1699; +mul.f64 fd627, fd1699, 0d3FE0000000000000; +sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0dBFEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 
fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1697, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0dBFEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1696, fd192, fd1697; +mul.f64 fd643, fd1697, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0dBFEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1695, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0dBFEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1694, fd208, fd1695; +mul.f64 fd659, fd1695, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0dBFEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1693, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0dBFEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1692, fd224, fd1693; +mul.f64 fd675, fd1693, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mul.wide.u32 rd7, r11, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd685, fd1706, 
fd682; +fma.rn.f64 fd686, fd681, fd554, fd685; +mul.f64 fd687, fd554, fd682; +mul.f64 fd688, fd681, fd1706; +sub.f64 fd689, fd688, fd687; +mul.f64 fd1690, fd681, fd681; +mul.f64 fd1691, fd682, fd682; +sub.f64 fd692, fd1690, fd1691; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd695, fd1704, fd694; +fma.rn.f64 fd696, fd692, fd570, fd695; +mul.f64 fd697, fd570, fd694; +mul.f64 fd698, fd692, fd1704; +sub.f64 fd699, fd698, fd697; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1689, fd681, fd692; +sub.f64 fd702, fd1689, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd705, fd1702, fd704; +fma.rn.f64 fd706, fd702, fd586, fd705; +mul.f64 fd707, fd586, fd704; +mul.f64 fd708, fd702, fd1702; +sub.f64 fd709, fd708, fd707; +mul.f64 fd711, fd682, fd704; +mul.f64 fd1688, fd681, fd702; +sub.f64 fd712, fd1688, fd711; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd715, fd1700, fd714; +fma.rn.f64 fd716, fd712, fd602, fd715; +mul.f64 fd717, fd602, fd714; +mul.f64 fd718, fd712, fd1700; +sub.f64 fd719, fd718, fd717; +mul.f64 fd721, fd682, fd714; +mul.f64 fd1687, fd681, fd712; +sub.f64 fd722, fd1687, fd721; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd725, fd1698, fd724; +fma.rn.f64 fd726, fd722, fd618, fd725; +mul.f64 fd727, fd618, fd724; +mul.f64 fd728, fd722, fd1698; +sub.f64 fd729, fd728, fd727; +mul.f64 fd1685, fd681, fd722; +mul.f64 fd1686, fd682, fd724; +sub.f64 fd732, fd1685, fd1686; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd735, fd1696, fd734; +fma.rn.f64 fd736, fd732, fd634, fd735; +mul.f64 fd737, fd634, fd734; +mul.f64 fd738, fd732, fd1696; +sub.f64 fd739, fd738, fd737; +mul.f64 fd1683, fd681, fd732; +mul.f64 fd1684, fd682, fd734; +sub.f64 fd742, fd1683, fd1684; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, fd743; +mul.f64 fd745, fd1694, fd744; +fma.rn.f64 fd746, fd742, fd650, fd745; 
+mul.f64 fd747, fd650, fd744; +mul.f64 fd748, fd742, fd1694; +sub.f64 fd749, fd748, fd747; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1682, fd681, fd742; +sub.f64 fd752, fd1682, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd755, fd1692, fd754; +fma.rn.f64 fd756, fd752, fd666, fd755; +mul.f64 fd757, fd666, fd754; +mul.f64 fd758, fd752, fd1692; +sub.f64 fd759, fd758, fd757; +mul.f64 fd761, fd682, fd754; +mul.f64 fd1681, fd681, fd752; +sub.f64 fd762, fd1681, fd761; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd765, fd551, fd764; +fma.rn.f64 fd766, fd762, fd545, fd765; +mul.f64 fd767, fd545, fd764; +mul.f64 fd768, fd762, fd551; +sub.f64 fd769, fd768, fd767; +mul.f64 fd1679, fd681, fd762; +mul.f64 fd1680, fd682, fd764; +sub.f64 fd772, fd1679, fd1680; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd775, fd567, fd774; +fma.rn.f64 fd776, fd772, fd561, fd775; +mul.f64 fd777, fd561, fd774; +mul.f64 fd778, fd772, fd567; +sub.f64 fd779, fd778, fd777; +mul.f64 fd1677, fd681, fd772; +mul.f64 fd1678, fd682, fd774; +sub.f64 fd782, fd1677, fd1678; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd785, fd583, fd784; +fma.rn.f64 fd786, fd782, fd577, fd785; +mul.f64 fd787, fd577, fd784; +mul.f64 fd788, fd782, fd583; +sub.f64 fd789, fd788, fd787; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1676, fd681, fd782; +sub.f64 fd792, fd1676, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd795, fd599, fd794; +fma.rn.f64 fd796, fd792, fd593, fd795; +mul.f64 fd797, fd593, fd794; +mul.f64 fd798, fd792, fd599; +sub.f64 fd799, fd798, fd797; +mul.f64 fd801, fd682, fd794; +mul.f64 fd1675, fd681, fd792; +sub.f64 fd802, fd1675, fd801; +mul.f64 fd803, fd681, fd794; +fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd805, fd615, fd804; +fma.rn.f64 fd806, fd802, fd609, fd805; +mul.f64 fd807, fd609, fd804; +mul.f64 fd808, fd802, fd615; 
+sub.f64 fd809, fd808, fd807; +ld.global.v2.f64 {fd810, fd811}, [rd6+432]; +mul.f64 fd814, fd631, fd811; +fma.rn.f64 fd815, fd810, fd625, fd814; +mul.f64 fd816, fd625, fd811; +mul.f64 fd817, fd810, fd631; +sub.f64 fd818, fd817, fd816; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1674, fd681, fd810; +sub.f64 fd821, fd1674, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd824, fd647, fd823; +fma.rn.f64 fd825, fd821, fd641, fd824; +mul.f64 fd826, fd641, fd823; +mul.f64 fd827, fd821, fd647; +sub.f64 fd828, fd827, fd826; +mul.f64 fd830, fd682, fd823; +mul.f64 fd1673, fd681, fd821; +sub.f64 fd831, fd1673, fd830; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd834, fd663, fd833; +fma.rn.f64 fd835, fd831, fd657, fd834; +mul.f64 fd836, fd657, fd833; +mul.f64 fd837, fd831, fd663; +sub.f64 fd838, fd837, fd836; +mul.f64 fd1671, fd681, fd831; +mul.f64 fd1672, fd682, fd833; +sub.f64 fd841, fd1671, fd1672; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd844, fd679, fd843; +fma.rn.f64 fd845, fd841, fd673, fd844; +mul.f64 fd846, fd673, fd843; +mul.f64 fd847, fd841, fd679; +sub.f64 fd848, fd847, fd846; +mul.f64 fd1669, fd681, fd841; +mul.f64 fd1670, fd682, fd843; +sub.f64 fd851, fd1669, fd1670; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd854, fd552, fd853; +fma.rn.f64 fd855, fd851, fd546, fd854; +mul.f64 fd856, fd546, fd853; +mul.f64 fd857, fd851, fd552; +sub.f64 fd858, fd857, fd856; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1668, fd681, fd851; +sub.f64 fd861, fd1668, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd864, fd568, fd863; +fma.rn.f64 fd865, fd861, fd562, fd864; +mul.f64 fd866, fd562, fd863; +mul.f64 fd867, fd861, fd568; +sub.f64 fd868, fd867, fd866; +mul.f64 fd870, fd682, fd863; +mul.f64 fd1667, fd681, fd861; +sub.f64 fd871, fd1667, fd870; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, 
fd861, fd872; +mul.f64 fd874, fd584, fd873; +fma.rn.f64 fd875, fd871, fd578, fd874; +mul.f64 fd876, fd578, fd873; +mul.f64 fd877, fd871, fd584; +sub.f64 fd878, fd877, fd876; +mul.f64 fd880, fd682, fd873; +mul.f64 fd1666, fd681, fd871; +sub.f64 fd881, fd1666, fd880; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd884, fd600, fd883; +fma.rn.f64 fd885, fd881, fd594, fd884; +mul.f64 fd886, fd594, fd883; +mul.f64 fd887, fd881, fd600; +sub.f64 fd888, fd887, fd886; +mul.f64 fd1664, fd681, fd881; +mul.f64 fd1665, fd682, fd883; +sub.f64 fd891, fd1664, fd1665; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd894, fd616, fd893; +fma.rn.f64 fd895, fd891, fd610, fd894; +mul.f64 fd896, fd610, fd893; +mul.f64 fd897, fd891, fd616; +sub.f64 fd898, fd897, fd896; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1663, fd681, fd891; +sub.f64 fd901, fd1663, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd904, fd632, fd903; +fma.rn.f64 fd905, fd901, fd626, fd904; +mul.f64 fd906, fd626, fd903; +mul.f64 fd907, fd901, fd632; +sub.f64 fd908, fd907, fd906; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1662, fd681, fd901; +sub.f64 fd911, fd1662, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd914, fd648, fd913; +fma.rn.f64 fd915, fd911, fd642, fd914; +mul.f64 fd916, fd642, fd913; +mul.f64 fd917, fd911, fd648; +sub.f64 fd918, fd917, fd916; +mul.f64 fd920, fd682, fd913; +mul.f64 fd1661, fd681, fd911; +sub.f64 fd921, fd1661, fd920; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd924, fd664, fd923; +fma.rn.f64 fd925, fd921, fd658, fd924; +mul.f64 fd926, fd658, fd923; +mul.f64 fd927, fd921, fd664; +sub.f64 fd928, fd927, fd926; +mul.f64 fd1659, fd681, fd921; +mul.f64 fd1660, fd682, fd923; +sub.f64 fd931, fd1659, fd1660; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd934, fd680, fd933; +fma.rn.f64 fd935, 
fd931, fd674, fd934; +mul.f64 fd936, fd674, fd933; +mul.f64 fd937, fd931, fd680; +sub.f64 fd938, fd937, fd936; +mad.lo.s32 r12, r9, 5832, r3; +barrier.sync 0; +mad.lo.s32 r13, r11, 216, r12; +st.shared.f64 [r13], fd538; +st.shared.f64 [r13+8], fd686; +st.shared.f64 [r13+16], fd696; +st.shared.f64 [r13+24], fd706; +st.shared.f64 [r13+32], fd716; +st.shared.f64 [r13+40], fd726; +st.shared.f64 [r13+48], fd736; +st.shared.f64 [r13+56], fd746; +st.shared.f64 [r13+64], fd756; +st.shared.f64 [r13+72], fd766; +st.shared.f64 [r13+80], fd776; +st.shared.f64 [r13+88], fd786; +st.shared.f64 [r13+96], fd796; +st.shared.f64 [r13+104], fd806; +st.shared.f64 [r13+112], fd815; +st.shared.f64 [r13+120], fd825; +st.shared.f64 [r13+128], fd835; +st.shared.f64 [r13+136], fd845; +st.shared.f64 [r13+144], fd855; +st.shared.f64 [r13+152], fd865; +st.shared.f64 [r13+160], fd875; +st.shared.f64 [r13+168], fd885; +st.shared.f64 [r13+176], fd895; +st.shared.f64 [r13+184], fd905; +st.shared.f64 [r13+192], fd915; +st.shared.f64 [r13+200], fd925; +st.shared.f64 [r13+208], fd935; +barrier.sync 0; +mad.lo.s32 r14, r11, -208, r13; +ld.shared.f64 fd939, [r14]; +ld.shared.f64 fd940, [r14+216]; +ld.shared.f64 fd941, [r14+432]; +ld.shared.f64 fd942, [r14+648]; +ld.shared.f64 fd943, [r14+864]; +ld.shared.f64 fd944, [r14+1080]; +ld.shared.f64 fd945, [r14+1296]; +ld.shared.f64 fd946, [r14+1512]; +ld.shared.f64 fd947, [r14+1728]; +ld.shared.f64 fd948, [r14+1944]; +ld.shared.f64 fd949, [r14+2160]; +ld.shared.f64 fd950, [r14+2376]; +ld.shared.f64 fd951, [r14+2592]; +ld.shared.f64 fd952, [r14+2808]; +ld.shared.f64 fd953, [r14+3024]; +ld.shared.f64 fd954, [r14+3240]; +ld.shared.f64 fd955, [r14+3456]; +ld.shared.f64 fd956, [r14+3672]; +ld.shared.f64 fd957, [r14+3888]; +ld.shared.f64 fd958, [r14+4104]; +ld.shared.f64 fd959, [r14+4320]; +ld.shared.f64 fd960, [r14+4536]; +ld.shared.f64 fd961, [r14+4752]; +ld.shared.f64 fd962, [r14+4968]; +ld.shared.f64 fd963, [r14+5184]; +ld.shared.f64 fd964, [r14+5400]; 
+ld.shared.f64 fd965, [r14+5616]; +barrier.sync 0; +st.shared.f64 [r13], fd1708; +st.shared.f64 [r13+8], fd689; +st.shared.f64 [r13+16], fd699; +st.shared.f64 [r13+24], fd709; +st.shared.f64 [r13+32], fd719; +st.shared.f64 [r13+40], fd729; +st.shared.f64 [r13+48], fd739; +st.shared.f64 [r13+56], fd749; +st.shared.f64 [r13+64], fd759; +st.shared.f64 [r13+72], fd769; +st.shared.f64 [r13+80], fd779; +st.shared.f64 [r13+88], fd789; +st.shared.f64 [r13+96], fd799; +st.shared.f64 [r13+104], fd809; +st.shared.f64 [r13+112], fd818; +st.shared.f64 [r13+120], fd828; +st.shared.f64 [r13+128], fd838; +st.shared.f64 [r13+136], fd848; +st.shared.f64 [r13+144], fd858; +st.shared.f64 [r13+152], fd868; +st.shared.f64 [r13+160], fd878; +st.shared.f64 [r13+168], fd888; +st.shared.f64 [r13+176], fd898; +st.shared.f64 [r13+184], fd908; +st.shared.f64 [r13+192], fd918; +st.shared.f64 [r13+200], fd928; +st.shared.f64 [r13+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r14]; +ld.shared.f64 fd967, [r14+216]; +ld.shared.f64 fd968, [r14+432]; +ld.shared.f64 fd969, [r14+648]; +ld.shared.f64 fd970, [r14+864]; +ld.shared.f64 fd971, [r14+1080]; +ld.shared.f64 fd972, [r14+1296]; +ld.shared.f64 fd973, [r14+1512]; +ld.shared.f64 fd974, [r14+1728]; +ld.shared.f64 fd975, [r14+1944]; +ld.shared.f64 fd976, [r14+2160]; +ld.shared.f64 fd977, [r14+2376]; +ld.shared.f64 fd978, [r14+2592]; +ld.shared.f64 fd979, [r14+2808]; +ld.shared.f64 fd980, [r14+3024]; +ld.shared.f64 fd981, [r14+3240]; +ld.shared.f64 fd982, [r14+3456]; +ld.shared.f64 fd983, [r14+3672]; +ld.shared.f64 fd984, [r14+3888]; +ld.shared.f64 fd985, [r14+4104]; +ld.shared.f64 fd986, [r14+4320]; +ld.shared.f64 fd987, [r14+4536]; +ld.shared.f64 fd988, [r14+4752]; +ld.shared.f64 fd989, [r14+4968]; +ld.shared.f64 fd990, [r14+5184]; +ld.shared.f64 fd991, [r14+5400]; +ld.shared.f64 fd992, [r14+5616]; +add.f64 fd993, fd948, fd957; +add.f64 fd994, fd939, fd993; +mul.f64 fd997, fd993, 0d3FE0000000000000; +sub.f64 fd998, fd939, fd997; +add.f64 
fd1658, fd975, fd984; +sub.f64 fd999, fd975, fd984; +mul.f64 fd1000, fd999, 0dBFEBB67AE8584CAA; +add.f64 fd1001, fd1000, fd998; +sub.f64 fd1002, fd998, fd1000; +add.f64 fd1657, fd966, fd1658; +mul.f64 fd1003, fd1658, 0d3FE0000000000000; +sub.f64 fd1004, fd966, fd1003; +sub.f64 fd1005, fd948, fd957; +mul.f64 fd1006, fd1005, 0dBFEBB67AE8584CAA; +sub.f64 fd1007, fd1004, fd1006; +add.f64 fd1008, fd1006, fd1004; +add.f64 fd1009, fd951, fd960; +add.f64 fd1010, fd942, fd1009; +mul.f64 fd1013, fd1009, 0d3FE0000000000000; +sub.f64 fd1014, fd942, fd1013; +add.f64 fd1656, fd978, fd987; +sub.f64 fd1015, fd978, fd987; +mul.f64 fd1016, fd1015, 0dBFEBB67AE8584CAA; +add.f64 fd1017, fd1016, fd1014; +sub.f64 fd1018, fd1014, fd1016; +add.f64 fd1655, fd969, fd1656; +mul.f64 fd1019, fd1656, 0d3FE0000000000000; +sub.f64 fd1020, fd969, fd1019; +sub.f64 fd1021, fd951, fd960; +mul.f64 fd1022, fd1021, 0dBFEBB67AE8584CAA; +sub.f64 fd1023, fd1020, fd1022; +add.f64 fd1024, fd1022, fd1020; +add.f64 fd1025, fd954, fd963; +add.f64 fd1026, fd945, fd1025; +mul.f64 fd1029, fd1025, 0d3FE0000000000000; +sub.f64 fd1030, fd945, fd1029; +add.f64 fd1654, fd981, fd990; +sub.f64 fd1031, fd981, fd990; +mul.f64 fd1032, fd1031, 0dBFEBB67AE8584CAA; +add.f64 fd1033, fd1032, fd1030; +sub.f64 fd1034, fd1030, fd1032; +add.f64 fd1653, fd972, fd1654; +mul.f64 fd1035, fd1654, 0d3FE0000000000000; +sub.f64 fd1036, fd972, fd1035; +sub.f64 fd1037, fd954, fd963; +mul.f64 fd1038, fd1037, 0dBFEBB67AE8584CAA; +sub.f64 fd1039, fd1036, fd1038; +add.f64 fd1040, fd1038, fd1036; +mul.f64 fd1042, fd1023, 0d3FE491B7523C161D; +mul.f64 fd1652, fd1017, 0d3FE8836FA2CF5039; +sub.f64 fd1043, fd1652, fd1042; +mul.f64 fd1044, fd1023, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1045, fd1017, 0d3FE491B7523C161D, fd1044; +mul.f64 fd1047, fd1039, 0d3FEF838B8C811C17; +mul.f64 fd1651, fd1033, 0d3FC63A1A7E0B738A; +sub.f64 fd1048, fd1651, fd1047; +mul.f64 fd1049, fd1039, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1050, fd1033, 0d3FEF838B8C811C17, fd1049; +mul.f64 
fd1052, fd1024, 0d3FEF838B8C811C17; +mul.f64 fd1650, fd1018, 0d3FC63A1A7E0B738A; +sub.f64 fd1053, fd1650, fd1052; +mul.f64 fd1054, fd1024, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1055, fd1018, 0d3FEF838B8C811C17, fd1054; +mul.f64 fd1057, fd1040, 0d3FD5E3A8748A0BF5; +mul.f64 fd1649, fd1034, 0dBFEE11F642522D1C; +sub.f64 fd1058, fd1649, fd1057; +mul.f64 fd1059, fd1040, 0dBFEE11F642522D1C; +fma.rn.f64 fd1060, fd1034, 0d3FD5E3A8748A0BF5, fd1059; +add.f64 fd1061, fd1010, fd1026; +add.f64 fd1062, fd994, fd1061; +mul.f64 fd1065, fd1061, 0d3FE0000000000000; +sub.f64 fd1066, fd994, fd1065; +add.f64 fd1648, fd1655, fd1653; +sub.f64 fd1067, fd1655, fd1653; +mul.f64 fd1068, fd1067, 0dBFEBB67AE8584CAA; +add.f64 fd1069, fd1068, fd1066; +sub.f64 fd1070, fd1066, fd1068; +add.f64 fd1647, fd1657, fd1648; +mul.f64 fd1071, fd1648, 0d3FE0000000000000; +sub.f64 fd1072, fd1657, fd1071; +sub.f64 fd1073, fd1010, fd1026; +mul.f64 fd1074, fd1073, 0dBFEBB67AE8584CAA; +sub.f64 fd1075, fd1072, fd1074; +add.f64 fd1076, fd1074, fd1072; +add.f64 fd1077, fd1043, fd1048; +add.f64 fd1078, fd1001, fd1077; +mul.f64 fd1081, fd1077, 0d3FE0000000000000; +sub.f64 fd1082, fd1001, fd1081; +add.f64 fd1646, fd1045, fd1050; +sub.f64 fd1083, fd1045, fd1050; +mul.f64 fd1084, fd1083, 0dBFEBB67AE8584CAA; +add.f64 fd1085, fd1084, fd1082; +sub.f64 fd1086, fd1082, fd1084; +add.f64 fd1645, fd1007, fd1646; +mul.f64 fd1087, fd1646, 0d3FE0000000000000; +sub.f64 fd1088, fd1007, fd1087; +sub.f64 fd1089, fd1043, fd1048; +mul.f64 fd1090, fd1089, 0dBFEBB67AE8584CAA; +sub.f64 fd1091, fd1088, fd1090; +add.f64 fd1092, fd1090, fd1088; +add.f64 fd1093, fd1053, fd1058; +add.f64 fd1094, fd1002, fd1093; +mul.f64 fd1097, fd1093, 0d3FE0000000000000; +sub.f64 fd1098, fd1002, fd1097; +add.f64 fd1644, fd1055, fd1060; +sub.f64 fd1099, fd1055, fd1060; +mul.f64 fd1100, fd1099, 0dBFEBB67AE8584CAA; +add.f64 fd1101, fd1100, fd1098; +sub.f64 fd1102, fd1098, fd1100; +add.f64 fd1643, fd1008, fd1644; +mul.f64 fd1103, fd1644, 0d3FE0000000000000; +sub.f64 
fd1104, fd1008, fd1103; +sub.f64 fd1105, fd1053, fd1058; +mul.f64 fd1106, fd1105, 0dBFEBB67AE8584CAA; +sub.f64 fd1107, fd1104, fd1106; +add.f64 fd1108, fd1106, fd1104; +add.f64 fd1109, fd949, fd958; +add.f64 fd1110, fd940, fd1109; +mul.f64 fd1113, fd1109, 0d3FE0000000000000; +sub.f64 fd1114, fd940, fd1113; +add.f64 fd1642, fd976, fd985; +sub.f64 fd1115, fd976, fd985; +mul.f64 fd1116, fd1115, 0dBFEBB67AE8584CAA; +add.f64 fd1117, fd1116, fd1114; +sub.f64 fd1118, fd1114, fd1116; +add.f64 fd1641, fd967, fd1642; +mul.f64 fd1119, fd1642, 0d3FE0000000000000; +sub.f64 fd1120, fd967, fd1119; +sub.f64 fd1121, fd949, fd958; +mul.f64 fd1122, fd1121, 0dBFEBB67AE8584CAA; +sub.f64 fd1123, fd1120, fd1122; +add.f64 fd1124, fd1122, fd1120; +add.f64 fd1125, fd952, fd961; +add.f64 fd1126, fd943, fd1125; +mul.f64 fd1129, fd1125, 0d3FE0000000000000; +sub.f64 fd1130, fd943, fd1129; +add.f64 fd1640, fd979, fd988; +sub.f64 fd1131, fd979, fd988; +mul.f64 fd1132, fd1131, 0dBFEBB67AE8584CAA; +add.f64 fd1133, fd1132, fd1130; +sub.f64 fd1134, fd1130, fd1132; +add.f64 fd1639, fd970, fd1640; +mul.f64 fd1135, fd1640, 0d3FE0000000000000; +sub.f64 fd1136, fd970, fd1135; +sub.f64 fd1137, fd952, fd961; +mul.f64 fd1138, fd1137, 0dBFEBB67AE8584CAA; +sub.f64 fd1139, fd1136, fd1138; +add.f64 fd1140, fd1138, fd1136; +add.f64 fd1141, fd955, fd964; +add.f64 fd1142, fd946, fd1141; +mul.f64 fd1145, fd1141, 0d3FE0000000000000; +sub.f64 fd1146, fd946, fd1145; +add.f64 fd1638, fd982, fd991; +sub.f64 fd1147, fd982, fd991; +mul.f64 fd1148, fd1147, 0dBFEBB67AE8584CAA; +add.f64 fd1149, fd1148, fd1146; +sub.f64 fd1150, fd1146, fd1148; +add.f64 fd1637, fd973, fd1638; +mul.f64 fd1151, fd1638, 0d3FE0000000000000; +sub.f64 fd1152, fd973, fd1151; +sub.f64 fd1153, fd955, fd964; +mul.f64 fd1154, fd1153, 0dBFEBB67AE8584CAA; +sub.f64 fd1155, fd1152, fd1154; +add.f64 fd1156, fd1154, fd1152; +mul.f64 fd1158, fd1139, 0d3FE491B7523C161D; +mul.f64 fd1636, fd1133, 0d3FE8836FA2CF5039; +sub.f64 fd1159, fd1636, fd1158; +mul.f64 fd1160, 
fd1139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1161, fd1133, 0d3FE491B7523C161D, fd1160; +mul.f64 fd1163, fd1155, 0d3FEF838B8C811C17; +mul.f64 fd1635, fd1149, 0d3FC63A1A7E0B738A; +sub.f64 fd1164, fd1635, fd1163; +mul.f64 fd1165, fd1155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1166, fd1149, 0d3FEF838B8C811C17, fd1165; +mul.f64 fd1168, fd1140, 0d3FEF838B8C811C17; +mul.f64 fd1634, fd1134, 0d3FC63A1A7E0B738A; +sub.f64 fd1169, fd1634, fd1168; +mul.f64 fd1170, fd1140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1171, fd1134, 0d3FEF838B8C811C17, fd1170; +mul.f64 fd1173, fd1156, 0d3FD5E3A8748A0BF5; +mul.f64 fd1633, fd1150, 0dBFEE11F642522D1C; +sub.f64 fd1174, fd1633, fd1173; +mul.f64 fd1175, fd1156, 0dBFEE11F642522D1C; +fma.rn.f64 fd1176, fd1150, 0d3FD5E3A8748A0BF5, fd1175; +add.f64 fd1177, fd1126, fd1142; +add.f64 fd1178, fd1110, fd1177; +mul.f64 fd1181, fd1177, 0d3FE0000000000000; +sub.f64 fd1182, fd1110, fd1181; +add.f64 fd1632, fd1639, fd1637; +sub.f64 fd1183, fd1639, fd1637; +mul.f64 fd1184, fd1183, 0dBFEBB67AE8584CAA; +add.f64 fd1185, fd1184, fd1182; +sub.f64 fd1186, fd1182, fd1184; +add.f64 fd1631, fd1641, fd1632; +mul.f64 fd1187, fd1632, 0d3FE0000000000000; +sub.f64 fd1188, fd1641, fd1187; +sub.f64 fd1189, fd1126, fd1142; +mul.f64 fd1190, fd1189, 0dBFEBB67AE8584CAA; +sub.f64 fd1191, fd1188, fd1190; +add.f64 fd1192, fd1190, fd1188; +add.f64 fd1193, fd1159, fd1164; +add.f64 fd1194, fd1117, fd1193; +mul.f64 fd1197, fd1193, 0d3FE0000000000000; +sub.f64 fd1198, fd1117, fd1197; +add.f64 fd1630, fd1161, fd1166; +sub.f64 fd1199, fd1161, fd1166; +mul.f64 fd1200, fd1199, 0dBFEBB67AE8584CAA; +add.f64 fd1201, fd1200, fd1198; +sub.f64 fd1202, fd1198, fd1200; +add.f64 fd1629, fd1123, fd1630; +mul.f64 fd1203, fd1630, 0d3FE0000000000000; +sub.f64 fd1204, fd1123, fd1203; +sub.f64 fd1205, fd1159, fd1164; +mul.f64 fd1206, fd1205, 0dBFEBB67AE8584CAA; +sub.f64 fd1207, fd1204, fd1206; +add.f64 fd1208, fd1206, fd1204; +add.f64 fd1209, fd1169, fd1174; +add.f64 fd1210, fd1118, fd1209; +mul.f64 fd1213, fd1209, 
0d3FE0000000000000; +sub.f64 fd1214, fd1118, fd1213; +add.f64 fd1628, fd1171, fd1176; +sub.f64 fd1215, fd1171, fd1176; +mul.f64 fd1216, fd1215, 0dBFEBB67AE8584CAA; +add.f64 fd1217, fd1216, fd1214; +sub.f64 fd1218, fd1214, fd1216; +add.f64 fd1627, fd1124, fd1628; +mul.f64 fd1219, fd1628, 0d3FE0000000000000; +sub.f64 fd1220, fd1124, fd1219; +sub.f64 fd1221, fd1169, fd1174; +mul.f64 fd1222, fd1221, 0dBFEBB67AE8584CAA; +sub.f64 fd1223, fd1220, fd1222; +add.f64 fd1224, fd1222, fd1220; +add.f64 fd1225, fd950, fd959; +add.f64 fd1226, fd941, fd1225; +mul.f64 fd1229, fd1225, 0d3FE0000000000000; +sub.f64 fd1230, fd941, fd1229; +add.f64 fd1626, fd977, fd986; +sub.f64 fd1231, fd977, fd986; +mul.f64 fd1232, fd1231, 0dBFEBB67AE8584CAA; +add.f64 fd1233, fd1232, fd1230; +sub.f64 fd1234, fd1230, fd1232; +add.f64 fd1625, fd968, fd1626; +mul.f64 fd1235, fd1626, 0d3FE0000000000000; +sub.f64 fd1236, fd968, fd1235; +sub.f64 fd1237, fd950, fd959; +mul.f64 fd1238, fd1237, 0dBFEBB67AE8584CAA; +sub.f64 fd1239, fd1236, fd1238; +add.f64 fd1240, fd1238, fd1236; +add.f64 fd1241, fd953, fd962; +add.f64 fd1242, fd944, fd1241; +mul.f64 fd1245, fd1241, 0d3FE0000000000000; +sub.f64 fd1246, fd944, fd1245; +add.f64 fd1624, fd980, fd989; +sub.f64 fd1247, fd980, fd989; +mul.f64 fd1248, fd1247, 0dBFEBB67AE8584CAA; +add.f64 fd1249, fd1248, fd1246; +sub.f64 fd1250, fd1246, fd1248; +add.f64 fd1623, fd971, fd1624; +mul.f64 fd1251, fd1624, 0d3FE0000000000000; +sub.f64 fd1252, fd971, fd1251; +sub.f64 fd1253, fd953, fd962; +mul.f64 fd1254, fd1253, 0dBFEBB67AE8584CAA; +sub.f64 fd1255, fd1252, fd1254; +add.f64 fd1256, fd1254, fd1252; +add.f64 fd1257, fd956, fd965; +add.f64 fd1258, fd947, fd1257; +mul.f64 fd1261, fd1257, 0d3FE0000000000000; +sub.f64 fd1262, fd947, fd1261; +add.f64 fd1622, fd983, fd992; +sub.f64 fd1263, fd983, fd992; +mul.f64 fd1264, fd1263, 0dBFEBB67AE8584CAA; +add.f64 fd1265, fd1264, fd1262; +sub.f64 fd1266, fd1262, fd1264; +add.f64 fd1621, fd974, fd1622; +mul.f64 fd1267, fd1622, 
0d3FE0000000000000; +sub.f64 fd1268, fd974, fd1267; +sub.f64 fd1269, fd956, fd965; +mul.f64 fd1270, fd1269, 0dBFEBB67AE8584CAA; +sub.f64 fd1271, fd1268, fd1270; +add.f64 fd1272, fd1270, fd1268; +mul.f64 fd1274, fd1255, 0d3FE491B7523C161D; +mul.f64 fd1620, fd1249, 0d3FE8836FA2CF5039; +sub.f64 fd1275, fd1620, fd1274; +mul.f64 fd1276, fd1255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1277, fd1249, 0d3FE491B7523C161D, fd1276; +mul.f64 fd1279, fd1271, 0d3FEF838B8C811C17; +mul.f64 fd1619, fd1265, 0d3FC63A1A7E0B738A; +sub.f64 fd1280, fd1619, fd1279; +mul.f64 fd1281, fd1271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1282, fd1265, 0d3FEF838B8C811C17, fd1281; +mul.f64 fd1284, fd1256, 0d3FEF838B8C811C17; +mul.f64 fd1618, fd1250, 0d3FC63A1A7E0B738A; +sub.f64 fd1285, fd1618, fd1284; +mul.f64 fd1286, fd1256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1287, fd1250, 0d3FEF838B8C811C17, fd1286; +mul.f64 fd1289, fd1272, 0d3FD5E3A8748A0BF5; +mul.f64 fd1617, fd1266, 0dBFEE11F642522D1C; +sub.f64 fd1290, fd1617, fd1289; +mul.f64 fd1291, fd1272, 0dBFEE11F642522D1C; +fma.rn.f64 fd1292, fd1266, 0d3FD5E3A8748A0BF5, fd1291; +add.f64 fd1293, fd1242, fd1258; +add.f64 fd1294, fd1226, fd1293; +mul.f64 fd1297, fd1293, 0d3FE0000000000000; +sub.f64 fd1298, fd1226, fd1297; +add.f64 fd1616, fd1623, fd1621; +sub.f64 fd1299, fd1623, fd1621; +mul.f64 fd1300, fd1299, 0dBFEBB67AE8584CAA; +add.f64 fd1301, fd1300, fd1298; +sub.f64 fd1302, fd1298, fd1300; +add.f64 fd1615, fd1625, fd1616; +mul.f64 fd1303, fd1616, 0d3FE0000000000000; +sub.f64 fd1304, fd1625, fd1303; +sub.f64 fd1305, fd1242, fd1258; +mul.f64 fd1306, fd1305, 0dBFEBB67AE8584CAA; +sub.f64 fd1307, fd1304, fd1306; +add.f64 fd1308, fd1306, fd1304; +add.f64 fd1309, fd1275, fd1280; +add.f64 fd1310, fd1233, fd1309; +mul.f64 fd1313, fd1309, 0d3FE0000000000000; +sub.f64 fd1314, fd1233, fd1313; +add.f64 fd1614, fd1277, fd1282; +sub.f64 fd1315, fd1277, fd1282; +mul.f64 fd1316, fd1315, 0dBFEBB67AE8584CAA; +add.f64 fd1317, fd1316, fd1314; +sub.f64 fd1318, fd1314, fd1316; +add.f64 
fd1613, fd1239, fd1614; +mul.f64 fd1319, fd1614, 0d3FE0000000000000; +sub.f64 fd1320, fd1239, fd1319; +sub.f64 fd1321, fd1275, fd1280; +mul.f64 fd1322, fd1321, 0dBFEBB67AE8584CAA; +sub.f64 fd1323, fd1320, fd1322; +add.f64 fd1324, fd1322, fd1320; +add.f64 fd1325, fd1285, fd1290; +add.f64 fd1326, fd1234, fd1325; +mul.f64 fd1329, fd1325, 0d3FE0000000000000; +sub.f64 fd1330, fd1234, fd1329; +add.f64 fd1612, fd1287, fd1292; +sub.f64 fd1331, fd1287, fd1292; +mul.f64 fd1332, fd1331, 0dBFEBB67AE8584CAA; +add.f64 fd1333, fd1332, fd1330; +sub.f64 fd1334, fd1330, fd1332; +add.f64 fd1611, fd1240, fd1612; +mul.f64 fd1335, fd1612, 0d3FE0000000000000; +sub.f64 fd1336, fd1240, fd1335; +sub.f64 fd1337, fd1285, fd1290; +mul.f64 fd1338, fd1337, 0dBFEBB67AE8584CAA; +sub.f64 fd1339, fd1336, fd1338; +add.f64 fd1340, fd1338, fd1336; +mul.f64 fd1609, fd1194, 0d3FEF232EFF15C9E6; +mul.f64 fd1610, fd1629, 0d3FCD84D223638000; +sub.f64 fd1343, fd1609, fd1610; +mul.f64 fd1344, fd1629, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd1345, fd1194, 0d3FCD84D223638000, fd1344; +mul.f64 fd1607, fd1310, 0d3FEC98A37A9A7850; +mul.f64 fd1608, fd1613, 0d3FDCB920325BAFA6; +sub.f64 fd1348, fd1607, fd1608; +mul.f64 fd1349, fd1613, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1350, fd1310, 0d3FDCB920325BAFA6, fd1349; +mul.f64 fd1605, fd1210, 0d3FEC98A37A9A7850; +mul.f64 fd1606, fd1627, 0d3FDCB920325BAFA6; +sub.f64 fd1353, fd1605, fd1606; +mul.f64 fd1354, fd1627, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1355, fd1210, 0d3FDCB920325BAFA6, fd1354; +mul.f64 fd1357, fd1611, 0d3FE9AAFE4207DF5F; +mul.f64 fd1604, fd1326, 0d3FE31BEC55BC71BC; +sub.f64 fd1358, fd1604, fd1357; +mul.f64 fd1359, fd1611, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1360, fd1326, 0d3FE9AAFE4207DF5F, fd1359; +mul.f64 fd1362, fd1191, 0d3FE491B7523C161D; +mul.f64 fd1603, fd1185, 0d3FE8836FA2CF5039; +sub.f64 fd1363, fd1603, fd1362; +mul.f64 fd1364, fd1191, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1365, fd1185, 0d3FE491B7523C161D, fd1364; +mul.f64 fd1367, fd1307, 0d3FEF838B8C811C17; +mul.f64 
fd1602, fd1301, 0d3FC63A1A7E0B738A; +sub.f64 fd1368, fd1602, fd1367; +mul.f64 fd1369, fd1307, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1370, fd1301, 0d3FEF838B8C811C17, fd1369; +mul.f64 fd1372, fd1207, 0d3FE9AAFE4207DF5F; +mul.f64 fd1601, fd1201, 0d3FE31BEC55BC71BC; +sub.f64 fd1373, fd1601, fd1372; +mul.f64 fd1374, fd1207, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1375, fd1201, 0d3FE9AAFE4207DF5F, fd1374; +mul.f64 fd1377, fd1323, 0d3FEEA7D99F29CADE; +mul.f64 fd1600, fd1317, 0dBFD25AFBF23865BF; +sub.f64 fd1378, fd1600, fd1377; +mul.f64 fd1379, fd1323, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1380, fd1317, 0d3FEEA7D99F29CADE, fd1379; +mul.f64 fd1598, fd1217, 0d3FD9595EF26FB670; +mul.f64 fd1599, fd1223, 0d3FED6206BEB6C24B; +sub.f64 fd1383, fd1598, fd1599; +mul.f64 fd1384, fd1223, 0d3FD9595EF26FB670; +fma.rn.f64 fd1385, fd1217, 0d3FED6206BEB6C24B, fd1384; +mul.f64 fd1596, fd1333, 0dBFE5F5B105F99707; +mul.f64 fd1597, fd1339, 0d3FE746A51650EADE; +sub.f64 fd1388, fd1596, fd1597; +mul.f64 fd1389, fd1339, 0dBFE5F5B105F99707; +fma.rn.f64 fd1390, fd1333, 0d3FE746A51650EADE, fd1389; +mul.f64 fd1594, fd1186, 0d3FC63A1A7E0B738A; +mul.f64 fd1595, fd1192, 0d3FEF838B8C811C17; +sub.f64 fd1393, fd1594, fd1595; +mul.f64 fd1394, fd1192, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1395, fd1186, 0d3FEF838B8C811C17, fd1394; +mul.f64 fd1592, fd1302, 0dBFEE11F642522D1C; +mul.f64 fd1593, fd1308, 0d3FD5E3A8748A0BF5; +sub.f64 fd1398, fd1592, fd1593; +mul.f64 fd1399, fd1308, 0dBFEE11F642522D1C; +fma.rn.f64 fd1400, fd1302, 0d3FD5E3A8748A0BF5, fd1399; +mul.f64 fd1402, fd1208, 0d3FEFF223F3635CE3; +mul.f64 fd1591, fd1202, 0dBFADC528B5343A86; +sub.f64 fd1403, fd1591, fd1402; +mul.f64 fd1404, fd1208, 0dBFADC528B5343A86; +fma.rn.f64 fd1405, fd1202, 0d3FEFF223F3635CE3, fd1404; +mul.f64 fd1407, fd1324, 0dBFBDB843E577175E; +mul.f64 fd1590, fd1318, 0dBFEFC89BCEF44CF4; +sub.f64 fd1408, fd1590, fd1407; +mul.f64 fd1409, fd1324, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd1410, fd1318, 0dBFBDB843E577175E, fd1409; +mul.f64 fd1412, fd1224, 
0d3FEEA7D99F29CADE; +mul.f64 fd1589, fd1218, 0dBFD25AFBF23865BF; +sub.f64 fd1413, fd1589, fd1412; +mul.f64 fd1414, fd1224, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1415, fd1218, 0d3FEEA7D99F29CADE, fd1414; +mul.f64 fd1417, fd1340, 0dBFE19593DA358510; +mul.f64 fd1588, fd1334, 0dBFEABC50EF4734A7; +sub.f64 fd1418, fd1588, fd1417; +mul.f64 fd1419, fd1340, 0dBFEABC50EF4734A7; +fma.rn.f64 fd1420, fd1334, 0dBFE19593DA358510, fd1419; +add.f64 fd1421, fd1178, fd1294; +mul.f64 fd1423, fd1421, 0d3FE0000000000000; +sub.f64 fd1424, fd1062, fd1423; +add.f64 fd1587, fd1631, fd1615; +sub.f64 fd1425, fd1631, fd1615; +mul.f64 fd1426, fd1425, 0dBFEBB67AE8584CAA; +mul.f64 fd1427, fd1587, 0d3FE0000000000000; +sub.f64 fd1428, fd1647, fd1427; +sub.f64 fd1429, fd1178, fd1294; +mul.f64 fd1430, fd1429, 0dBFEBB67AE8584CAA; +add.f64 fd1431, fd1343, fd1348; +mul.f64 fd1433, fd1431, 0d3FE0000000000000; +sub.f64 fd1434, fd1078, fd1433; +add.f64 fd1586, fd1345, fd1350; +sub.f64 fd1435, fd1345, fd1350; +mul.f64 fd1436, fd1435, 0dBFEBB67AE8584CAA; +mul.f64 fd1437, fd1586, 0d3FE0000000000000; +sub.f64 fd1438, fd1645, fd1437; +sub.f64 fd1439, fd1343, fd1348; +mul.f64 fd1440, fd1439, 0dBFEBB67AE8584CAA; +add.f64 fd1441, fd1353, fd1358; +mul.f64 fd1443, fd1441, 0d3FE0000000000000; +sub.f64 fd1444, fd1094, fd1443; +add.f64 fd1585, fd1355, fd1360; +sub.f64 fd1445, fd1355, fd1360; +mul.f64 fd1446, fd1445, 0dBFEBB67AE8584CAA; +mul.f64 fd1447, fd1585, 0d3FE0000000000000; +sub.f64 fd1448, fd1643, fd1447; +sub.f64 fd1449, fd1353, fd1358; +mul.f64 fd1450, fd1449, 0dBFEBB67AE8584CAA; +add.f64 fd1451, fd1363, fd1368; +mul.f64 fd1453, fd1451, 0d3FE0000000000000; +sub.f64 fd1454, fd1069, fd1453; +add.f64 fd1584, fd1365, fd1370; +sub.f64 fd1455, fd1365, fd1370; +mul.f64 fd1456, fd1455, 0dBFEBB67AE8584CAA; +mul.f64 fd1457, fd1584, 0d3FE0000000000000; +sub.f64 fd1458, fd1075, fd1457; +sub.f64 fd1459, fd1363, fd1368; +mul.f64 fd1460, fd1459, 0dBFEBB67AE8584CAA; +add.f64 fd1461, fd1373, fd1378; +mul.f64 fd1463, fd1461, 
0d3FE0000000000000; +sub.f64 fd1464, fd1085, fd1463; +add.f64 fd1583, fd1375, fd1380; +sub.f64 fd1465, fd1375, fd1380; +mul.f64 fd1466, fd1465, 0dBFEBB67AE8584CAA; +mul.f64 fd1467, fd1583, 0d3FE0000000000000; +sub.f64 fd1468, fd1091, fd1467; +sub.f64 fd1469, fd1373, fd1378; +mul.f64 fd1470, fd1469, 0dBFEBB67AE8584CAA; +add.f64 fd1471, fd1383, fd1388; +mul.f64 fd1473, fd1471, 0d3FE0000000000000; +sub.f64 fd1474, fd1101, fd1473; +add.f64 fd1582, fd1385, fd1390; +sub.f64 fd1475, fd1385, fd1390; +mul.f64 fd1476, fd1475, 0dBFEBB67AE8584CAA; +mul.f64 fd1477, fd1582, 0d3FE0000000000000; +sub.f64 fd1478, fd1107, fd1477; +sub.f64 fd1479, fd1383, fd1388; +mul.f64 fd1480, fd1479, 0dBFEBB67AE8584CAA; +add.f64 fd1481, fd1393, fd1398; +mul.f64 fd1483, fd1481, 0d3FE0000000000000; +sub.f64 fd1484, fd1070, fd1483; +add.f64 fd1581, fd1395, fd1400; +sub.f64 fd1485, fd1395, fd1400; +mul.f64 fd1486, fd1485, 0dBFEBB67AE8584CAA; +mul.f64 fd1487, fd1581, 0d3FE0000000000000; +sub.f64 fd1488, fd1076, fd1487; +sub.f64 fd1489, fd1393, fd1398; +mul.f64 fd1490, fd1489, 0dBFEBB67AE8584CAA; +add.f64 fd1491, fd1403, fd1408; +mul.f64 fd1493, fd1491, 0d3FE0000000000000; +sub.f64 fd1494, fd1086, fd1493; +add.f64 fd1580, fd1405, fd1410; +sub.f64 fd1495, fd1405, fd1410; +mul.f64 fd1496, fd1495, 0dBFEBB67AE8584CAA; +mul.f64 fd1497, fd1580, 0d3FE0000000000000; +sub.f64 fd1498, fd1092, fd1497; +sub.f64 fd1499, fd1403, fd1408; +mul.f64 fd1500, fd1499, 0dBFEBB67AE8584CAA; +add.f64 fd1501, fd1413, fd1418; +mul.f64 fd1503, fd1501, 0d3FE0000000000000; +sub.f64 fd1504, fd1102, fd1503; +add.f64 fd1579, fd1415, fd1420; +sub.f64 fd1505, fd1415, fd1420; +mul.f64 fd1506, fd1505, 0dBFEBB67AE8584CAA; +mul.f64 fd1507, fd1579, 0d3FE0000000000000; +sub.f64 fd1508, fd1108, fd1507; +sub.f64 fd1509, fd1413, fd1418; +mul.f64 fd1806, fd1431, 0d3FE0000000000000; +sub.f64 fd1805, fd1078, fd1806; +mul.f64 fd1510, fd1509, 0dBFEBB67AE8584CAA; +add.f64 %0, fd1062, fd1421; +mul.f64 fd1808, fd1585, 0d3FE0000000000000; +sub.f64 
fd1807, fd1643, fd1808; +add.f64 %1, fd1647, fd1587; +mul.f64 fd1810, fd1586, 0d3FE0000000000000; +sub.f64 fd1809, fd1645, fd1810; +mul.f64 fd1812, fd1501, 0d3FE0000000000000; +sub.f64 fd1811, fd1102, fd1812; +add.f64 %3, fd1645, fd1586; +add.f64 %2, fd1078, fd1431; +add.f64 %5, fd1643, fd1585; +add.f64 %4, fd1094, fd1441; +add.f64 %7, fd1075, fd1584; +add.f64 %6, fd1069, fd1451; +add.f64 %9, fd1091, fd1583; +add.f64 %8, fd1085, fd1461; +add.f64 %11, fd1107, fd1582; +add.f64 %10, fd1101, fd1471; +add.f64 %13, fd1076, fd1581; +add.f64 %12, fd1070, fd1481; +add.f64 %15, fd1092, fd1580; +add.f64 %14, fd1086, fd1491; +add.f64 %17, fd1108, fd1579; +add.f64 %16, fd1102, fd1501; +sub.f64 %19, fd1428, fd1430; +add.f64 %18, fd1426, fd1424; +sub.f64 %21, fd1809, fd1440; +add.f64 %20, fd1436, fd1805; +sub.f64 %23, fd1807, fd1450; +add.f64 %22, fd1446, fd1444; +sub.f64 %25, fd1458, fd1460; +add.f64 %24, fd1456, fd1454; +add.f64 %26, fd1466, fd1464; +sub.f64 %27, fd1468, fd1470; +add.f64 %28, fd1476, fd1474; +sub.f64 %29, fd1478, fd1480; +add.f64 %30, fd1486, fd1484; +sub.f64 %31, fd1488, fd1490; +sub.f64 %33, fd1498, fd1500; +add.f64 %32, fd1496, fd1494; +sub.f64 %35, fd1508, fd1510; +add.f64 %34, fd1506, fd1811; +sub.f64 %36, fd1424, fd1426; +add.f64 %37, fd1430, fd1428; +add.f64 %39, fd1440, fd1809; +sub.f64 %38, fd1805, fd1436; +add.f64 %41, fd1450, fd1807; +sub.f64 %40, fd1444, fd1446; +add.f64 %43, fd1460, fd1458; +sub.f64 %42, fd1454, fd1456; +add.f64 %45, fd1470, fd1468; +sub.f64 %44, fd1464, fd1466; +add.f64 %47, fd1480, fd1478; +sub.f64 %46, fd1474, fd1476; +add.f64 %49, fd1490, fd1488; +sub.f64 %48, fd1484, fd1486; +add.f64 %51, fd1500, fd1498; +sub.f64 %50, fd1494, fd1496; +add.f64 %53, fd1510, fd1508; +sub.f64 %52, fd1811, fd1506; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), 
"=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_729), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[20].y), "d"(rmem[11].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<695, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<1887>; +.reg .b64 rd<8>; +mov.u32 r16, %tid.y; +mov.u32 r17, %54; +mad.lo.s32 r3, r16, 11664, r17; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1886, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1885, %57, fd1886; +mul.f64 fd119, fd1886, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1884, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1883, %63, fd1884; +mul.f64 fd135, fd1884, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1882, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1881, %69, fd1882; +mul.f64 fd151, fd1882, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1880, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1880, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; 
+mul.f64 fd1878, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1879, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1878, fd1879; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1876, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1877, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1876, fd1877; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1874, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1875, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1874, fd1875; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1873, fd1883, fd1881; +sub.f64 fd183, fd1883, fd1881; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1872, fd1885, fd1873; +mul.f64 fd187, fd1873, 0d3FE0000000000000; +sub.f64 fd188, fd1885, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1871, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1870, fd123, fd1871; +mul.f64 fd203, fd1871, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1869, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; 
+sub.f64 fd218, fd214, fd216; +add.f64 fd1868, fd124, fd1869; +mul.f64 fd219, fd1869, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1865, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1863, %112, fd1865; +mul.f64 fd235, fd1865, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1860, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1858, %115, fd1860; +mul.f64 fd251, fd1860, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1855, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1853, %118, fd1855; +mul.f64 fd267, fd1855, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1852, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1852, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 
fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1851, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1851, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1849, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1850, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1849, fd1850; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1847, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1848, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1847, fd1848; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1846, fd1858, fd1853; +sub.f64 fd299, fd1858, fd1853; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1845, fd1863, fd1846; +mul.f64 fd303, fd1846, 0d3FE0000000000000; +sub.f64 fd304, fd1863, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1844, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1843, fd239, fd1844; +mul.f64 fd319, fd1844, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1842, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 
0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1841, fd240, fd1842; +mul.f64 fd335, fd1842, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1838, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1836, %121, fd1838; +mul.f64 fd351, fd1838, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1833, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1831, %124, fd1833; +mul.f64 fd367, fd1833, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1829, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1827, %126, fd1829; +mul.f64 fd383, fd1829, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1826, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1826, fd390; 
+mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1825, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1825, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1823, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1824, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1823, fd1824; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1821, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1822, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1821, fd1822; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1820, fd1831, fd1827; +sub.f64 fd415, fd1831, fd1827; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1819, fd1836, fd1820; +mul.f64 fd419, fd1820, 0d3FE0000000000000; +sub.f64 fd420, fd1836, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1818, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1817, fd355, fd1818; +mul.f64 fd435, fd1818, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1816, fd403, fd408; +sub.f64 fd447, 
fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1815, fd356, fd1816; +mul.f64 fd451, fd1816, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1843, 0d3FCD84D223638000; +mul.f64 fd1814, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1814, fd458; +mul.f64 fd460, fd1843, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1812, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1813, fd1817, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd1812, fd1813; +mul.f64 fd465, fd1817, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1810, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1811, fd1841, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1810, fd1811; +mul.f64 fd470, fd1841, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1808, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1809, fd1815, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1808, fd1809; +mul.f64 fd475, fd1815, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1806, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1807, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1806, fd1807; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1805, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1805, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1804, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1804, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1803, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1803, fd493; +mul.f64 
fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1802, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1802, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1801, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1801, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1800, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1800, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1798, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1799, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1798, fd1799; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1796, fd318, 0dBFADC528B5343A86; +mul.f64 fd1797, fd324, 0d3FEFF223F3635CE3; +sub.f64 fd519, fd1796, fd1797; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1794, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1795, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1794, fd1795; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1793, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1793, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1792, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1792, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1791, fd1845, fd1819; +sub.f64 fd541, fd1845, fd1819; +mul.f64 fd542, fd541, 
0dBFEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1791, 0d3FE0000000000000; +sub.f64 fd546, fd1872, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0dBFEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1790, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1789, fd1870, fd1790; +mul.f64 fd561, fd1790, 0d3FE0000000000000; +sub.f64 fd562, fd1870, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0dBFEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1788, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1787, fd1868, fd1788; +mul.f64 fd577, fd1788, 0d3FE0000000000000; +sub.f64 fd578, fd1868, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0dBFEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1786, fd481, fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 0dBFEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1785, fd191, fd1786; +mul.f64 fd593, fd1786, 0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 0dBFEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 
0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1784, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0dBFEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1783, fd207, fd1784; +mul.f64 fd609, fd1784, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0dBFEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1782, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0dBFEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1781, fd223, fd1782; +mul.f64 fd625, fd1782, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0dBFEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1780, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0dBFEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1779, fd192, fd1780; +mul.f64 fd641, fd1780, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 fd643, fd509, fd514; +mul.f64 fd644, fd643, 0dBFEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1778, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 fd654, fd653, 0dBFEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 fd1777, fd208, fd1778; +mul.f64 fd657, fd1778, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 
0dBFEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1776, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0dBFEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1775, fd224, fd1776; +mul.f64 fd673, fd1776, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0dBFEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r15, %tid.x; +mul.wide.u32 rd2, r15, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r15, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r15, r10; +mad.lo.s32 r12, r9, 11664, r3; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r11, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd683, fd1789, fd680; +mul.f64 fd685, fd679, fd1789; +mul.f64 fd687, fd680, fd680; +mul.f64 fd1774, fd679, fd679; +sub.f64 fd688, fd1774, fd687; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd691, fd1787, fd690; +mul.f64 fd693, fd688, fd1787; +mul.f64 fd1772, fd679, fd688; +mul.f64 fd1773, fd680, fd690; +sub.f64 fd696, fd1772, fd1773; +mul.f64 fd1771, fd568, fd690; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd699, fd1785, fd698; +mul.f64 fd701, fd696, fd1785; +mul.f64 fd703, fd680, fd698; +mul.f64 fd1770, fd679, fd696; +sub.f64 fd704, fd1770, fd703; +mul.f64 fd1769, fd584, fd698; +mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd707, fd1783, fd706; +mul.f64 fd709, fd704, fd1783; +mul.f64 fd1767, fd679, fd704; +mul.f64 fd1768, fd680, fd706; +sub.f64 fd712, fd1767, fd1768; +mul.f64 fd1766, fd600, fd706; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 
fd715, fd1781, fd714; +mul.f64 fd717, fd712, fd1781; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1765, fd679, fd712; +sub.f64 fd720, fd1765, fd719; +mul.f64 fd1764, fd616, fd714; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd723, fd1779, fd722; +mul.f64 fd725, fd720, fd1779; +mul.f64 fd727, fd680, fd722; +mul.f64 fd1763, fd679, fd720; +sub.f64 fd728, fd1763, fd727; +mul.f64 fd1762, fd632, fd722; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd731, fd1777, fd730; +mul.f64 fd733, fd728, fd1777; +mul.f64 fd1760, fd679, fd728; +mul.f64 fd1761, fd680, fd730; +sub.f64 fd736, fd1760, fd1761; +mul.f64 fd1759, fd648, fd730; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd739, fd1775, fd738; +mul.f64 fd741, fd736, fd1775; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1758, fd679, fd736; +sub.f64 fd744, fd1758, fd743; +mul.f64 fd1757, fd664, fd738; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd747, fd549, fd746; +mul.f64 fd749, fd744, fd549; +mul.f64 fd751, fd680, fd746; +mul.f64 fd1756, fd679, fd744; +sub.f64 fd752, fd1756, fd751; +mul.f64 fd1755, fd543, fd746; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd755, fd565, fd754; +mul.f64 fd757, fd752, fd565; +mul.f64 fd1753, fd679, fd752; +mul.f64 fd1754, fd680, fd754; +sub.f64 fd760, fd1753, fd1754; +mul.f64 fd1752, fd559, fd754; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, fd752, fd761; +mul.f64 fd763, fd581, fd762; +mul.f64 fd765, fd760, fd581; +mul.f64 fd767, fd680, fd762; +mul.f64 fd1751, fd679, fd760; +sub.f64 fd768, fd1751, fd767; +mul.f64 fd1750, fd575, fd762; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd771, fd597, fd770; +mul.f64 fd773, fd768, fd597; +mul.f64 fd1748, fd679, fd768; +mul.f64 fd1749, fd680, fd770; +sub.f64 fd776, fd1748, fd1749; +mul.f64 fd1747, fd591, fd770; +mul.f64 fd777, fd679, fd770; 
+fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd613, fd778; +mul.f64 fd780, fd607, fd778; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+432]; +mul.f64 fd786, fd629, fd783; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1746, fd679, fd782; +sub.f64 fd791, fd1746, fd790; +mul.f64 fd1745, fd623, fd783; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd794, fd645, fd793; +mul.f64 fd796, fd791, fd645; +mul.f64 fd798, fd680, fd793; +mul.f64 fd1744, fd679, fd791; +sub.f64 fd799, fd1744, fd798; +mul.f64 fd1743, fd639, fd793; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd802, fd661, fd801; +mul.f64 fd804, fd799, fd661; +mul.f64 fd1741, fd679, fd799; +mul.f64 fd1742, fd680, fd801; +sub.f64 fd807, fd1741, fd1742; +mul.f64 fd1740, fd655, fd801; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd810, fd677, fd809; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1739, fd679, fd807; +sub.f64 fd815, fd1739, fd814; +mul.f64 fd1738, fd671, fd809; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd818, fd550, fd817; +mul.f64 fd820, fd815, fd550; +mul.f64 fd822, fd680, fd817; +mul.f64 fd1737, fd679, fd815; +sub.f64 fd823, fd1737, fd822; +mul.f64 fd1736, fd544, fd817; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd826, fd566, fd825; +mul.f64 fd828, fd823, fd566; +mul.f64 fd1734, fd679, fd823; +mul.f64 fd1735, fd680, fd825; +sub.f64 fd831, fd1734, fd1735; +mul.f64 fd1733, fd560, fd825; +mul.f64 fd832, fd679, fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; +mul.f64 fd834, fd582, fd833; +mul.f64 fd836, fd831, fd582; +mul.f64 fd838, fd680, fd833; +mul.f64 fd1732, fd679, fd831; +sub.f64 fd839, fd1732, fd838; +mul.f64 fd1731, fd576, fd833; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd842, fd598, fd841; +mul.f64 fd844, 
fd839, fd598; +mul.f64 fd1729, fd679, fd839; +mul.f64 fd1730, fd680, fd841; +sub.f64 fd847, fd1729, fd1730; +mul.f64 fd1728, fd592, fd841; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd850, fd614, fd849; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1727, fd679, fd847; +sub.f64 fd855, fd1727, fd854; +mul.f64 fd1726, fd608, fd849; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd858, fd630, fd857; +mul.f64 fd860, fd855, fd630; +mul.f64 fd862, fd680, fd857; +mul.f64 fd1725, fd679, fd855; +sub.f64 fd863, fd1725, fd862; +mul.f64 fd1724, fd624, fd857; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd866, fd646, fd865; +mul.f64 fd868, fd863, fd646; +mul.f64 fd1722, fd679, fd863; +mul.f64 fd1723, fd680, fd865; +sub.f64 fd871, fd1722, fd1723; +mul.f64 fd1721, fd640, fd865; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd874, fd662, fd873; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1720, fd679, fd871; +sub.f64 fd879, fd1720, fd878; +mul.f64 fd1719, fd656, fd873; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1718, fd552, fd680; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd678, fd881; +mul.f64 fd883, fd672, fd881; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r13, r11, 432, r12; +add.f64 fd885, fd1872, fd1791; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r13], {fd886, fd885}; +fma.rn.f64 fd887, fd679, fd552, fd683; +sub.f64 fd888, fd685, fd1718; +st.shared.v2.f64 [r13+16], {fd887, fd888}; +fma.rn.f64 fd889, fd688, fd568, fd691; +sub.f64 fd890, fd693, fd1771; +st.shared.v2.f64 [r13+32], {fd889, fd890}; +fma.rn.f64 fd891, fd696, fd584, fd699; +sub.f64 fd892, fd701, fd1769; +st.shared.v2.f64 [r13+48], {fd891, fd892}; +fma.rn.f64 fd893, fd704, fd600, fd707; +sub.f64 fd894, fd709, fd1766; +st.shared.v2.f64 [r13+64], {fd893, fd894}; +fma.rn.f64 fd895, fd712, fd616, fd715; 
+sub.f64 fd896, fd717, fd1764; +st.shared.v2.f64 [r13+80], {fd895, fd896}; +fma.rn.f64 fd897, fd720, fd632, fd723; +sub.f64 fd898, fd725, fd1762; +st.shared.v2.f64 [r13+96], {fd897, fd898}; +sub.f64 fd899, fd733, fd1759; +fma.rn.f64 fd900, fd728, fd648, fd731; +st.shared.v2.f64 [r13+112], {fd900, fd899}; +fma.rn.f64 fd901, fd736, fd664, fd739; +sub.f64 fd902, fd741, fd1757; +st.shared.v2.f64 [r13+128], {fd901, fd902}; +fma.rn.f64 fd903, fd744, fd543, fd747; +sub.f64 fd904, fd749, fd1755; +st.shared.v2.f64 [r13+144], {fd903, fd904}; +fma.rn.f64 fd905, fd752, fd559, fd755; +sub.f64 fd906, fd757, fd1752; +st.shared.v2.f64 [r13+160], {fd905, fd906}; +fma.rn.f64 fd907, fd760, fd575, fd763; +sub.f64 fd908, fd765, fd1750; +st.shared.v2.f64 [r13+176], {fd907, fd908}; +fma.rn.f64 fd909, fd768, fd591, fd771; +sub.f64 fd910, fd773, fd1747; +st.shared.v2.f64 [r13+192], {fd909, fd910}; +fma.rn.f64 fd911, fd776, fd607, fd779; +sub.f64 fd912, fd781, fd780; +st.shared.v2.f64 [r13+208], {fd911, fd912}; +fma.rn.f64 fd913, fd782, fd623, fd786; +sub.f64 fd914, fd788, fd1745; +st.shared.v2.f64 [r13+224], {fd913, fd914}; +fma.rn.f64 fd915, fd791, fd639, fd794; +sub.f64 fd916, fd796, fd1743; +st.shared.v2.f64 [r13+240], {fd915, fd916}; +fma.rn.f64 fd917, fd799, fd655, fd802; +sub.f64 fd918, fd804, fd1740; +st.shared.v2.f64 [r13+256], {fd917, fd918}; +fma.rn.f64 fd919, fd807, fd671, fd810; +sub.f64 fd920, fd812, fd1738; +st.shared.v2.f64 [r13+272], {fd919, fd920}; +fma.rn.f64 fd921, fd815, fd544, fd818; +sub.f64 fd922, fd820, fd1736; +st.shared.v2.f64 [r13+288], {fd921, fd922}; +fma.rn.f64 fd923, fd823, fd560, fd826; +sub.f64 fd924, fd828, fd1733; +st.shared.v2.f64 [r13+304], {fd923, fd924}; +sub.f64 fd925, fd836, fd1731; +fma.rn.f64 fd926, fd831, fd576, fd834; +st.shared.v2.f64 [r13+320], {fd926, fd925}; +fma.rn.f64 fd927, fd839, fd592, fd842; +sub.f64 fd928, fd844, fd1728; +st.shared.v2.f64 [r13+336], {fd927, fd928}; +fma.rn.f64 fd929, fd847, fd608, fd850; +sub.f64 fd930, fd852, fd1726; 
+st.shared.v2.f64 [r13+352], {fd929, fd930}; +fma.rn.f64 fd931, fd855, fd624, fd858; +sub.f64 fd932, fd860, fd1724; +st.shared.v2.f64 [r13+368], {fd931, fd932}; +fma.rn.f64 fd933, fd863, fd640, fd866; +sub.f64 fd934, fd868, fd1721; +st.shared.v2.f64 [r13+384], {fd933, fd934}; +fma.rn.f64 fd935, fd871, fd656, fd874; +sub.f64 fd936, fd876, fd1719; +st.shared.v2.f64 [r13+400], {fd935, fd936}; +fma.rn.f64 fd937, fd879, fd672, fd882; +sub.f64 fd938, fd884, fd883; +st.shared.v2.f64 [r13+416], {fd937, fd938}; +barrier.sync 0; +mad.lo.s32 r14, r11, -416, r13; +ld.shared.v2.f64 {fd939, fd940}, [r14]; +ld.shared.v2.f64 {fd943, fd944}, [r14+432]; +ld.shared.v2.f64 {fd947, fd948}, [r14+864]; +ld.shared.v2.f64 {fd951, fd952}, [r14+1296]; +ld.shared.v2.f64 {fd955, fd956}, [r14+1728]; +ld.shared.v2.f64 {fd959, fd960}, [r14+2160]; +ld.shared.v2.f64 {fd963, fd964}, [r14+2592]; +ld.shared.v2.f64 {fd967, fd968}, [r14+3024]; +ld.shared.v2.f64 {fd971, fd972}, [r14+3456]; +ld.shared.v2.f64 {fd975, fd976}, [r14+3888]; +ld.shared.v2.f64 {fd979, fd980}, [r14+4320]; +ld.shared.v2.f64 {fd983, fd984}, [r14+4752]; +ld.shared.v2.f64 {fd987, fd988}, [r14+5184]; +ld.shared.v2.f64 {fd991, fd992}, [r14+5616]; +ld.shared.v2.f64 {fd995, fd996}, [r14+6048]; +ld.shared.v2.f64 {fd999, fd1000}, [r14+6480]; +ld.shared.v2.f64 {fd1003, fd1004}, [r14+6912]; +ld.shared.v2.f64 {fd1007, fd1008}, [r14+7344]; +ld.shared.v2.f64 {fd1011, fd1012}, [r14+7776]; +ld.shared.v2.f64 {fd1015, fd1016}, [r14+8208]; +ld.shared.v2.f64 {fd1019, fd1020}, [r14+8640]; +ld.shared.v2.f64 {fd1023, fd1024}, [r14+9072]; +ld.shared.v2.f64 {fd1027, fd1028}, [r14+9504]; +ld.shared.v2.f64 {fd1031, fd1032}, [r14+9936]; +ld.shared.v2.f64 {fd1035, fd1036}, [r14+10368]; +ld.shared.v2.f64 {fd1039, fd1040}, [r14+10800]; +ld.shared.v2.f64 {fd1043, fd1044}, [r14+11232]; +add.f64 fd1047, fd975, fd1011; +add.f64 fd1048, fd939, fd1047; +mul.f64 fd1051, fd1047, 0d3FE0000000000000; +sub.f64 fd1052, fd939, fd1051; +add.f64 fd1717, fd976, fd1012; 
+sub.f64 fd1053, fd976, fd1012; +mul.f64 fd1054, fd1053, 0dBFEBB67AE8584CAA; +add.f64 fd1055, fd1054, fd1052; +sub.f64 fd1056, fd1052, fd1054; +add.f64 fd1716, fd940, fd1717; +mul.f64 fd1057, fd1717, 0d3FE0000000000000; +sub.f64 fd1058, fd940, fd1057; +sub.f64 fd1059, fd975, fd1011; +mul.f64 fd1060, fd1059, 0dBFEBB67AE8584CAA; +sub.f64 fd1061, fd1058, fd1060; +add.f64 fd1062, fd1060, fd1058; +add.f64 fd1063, fd987, fd1023; +add.f64 fd1064, fd951, fd1063; +mul.f64 fd1067, fd1063, 0d3FE0000000000000; +sub.f64 fd1068, fd951, fd1067; +add.f64 fd1715, fd988, fd1024; +sub.f64 fd1069, fd988, fd1024; +mul.f64 fd1070, fd1069, 0dBFEBB67AE8584CAA; +add.f64 fd1071, fd1070, fd1068; +sub.f64 fd1072, fd1068, fd1070; +add.f64 fd1714, fd952, fd1715; +mul.f64 fd1073, fd1715, 0d3FE0000000000000; +sub.f64 fd1074, fd952, fd1073; +sub.f64 fd1075, fd987, fd1023; +mul.f64 fd1076, fd1075, 0dBFEBB67AE8584CAA; +sub.f64 fd1077, fd1074, fd1076; +add.f64 fd1078, fd1076, fd1074; +add.f64 fd1079, fd999, fd1035; +add.f64 fd1080, fd963, fd1079; +mul.f64 fd1083, fd1079, 0d3FE0000000000000; +sub.f64 fd1084, fd963, fd1083; +add.f64 fd1713, fd1000, fd1036; +sub.f64 fd1085, fd1000, fd1036; +mul.f64 fd1086, fd1085, 0dBFEBB67AE8584CAA; +add.f64 fd1087, fd1086, fd1084; +sub.f64 fd1088, fd1084, fd1086; +add.f64 fd1712, fd964, fd1713; +mul.f64 fd1089, fd1713, 0d3FE0000000000000; +sub.f64 fd1090, fd964, fd1089; +sub.f64 fd1091, fd999, fd1035; +mul.f64 fd1092, fd1091, 0dBFEBB67AE8584CAA; +sub.f64 fd1093, fd1090, fd1092; +add.f64 fd1094, fd1092, fd1090; +mul.f64 fd1710, fd1071, 0d3FE8836FA2CF5039; +mul.f64 fd1711, fd1077, 0d3FE491B7523C161D; +sub.f64 fd1097, fd1710, fd1711; +mul.f64 fd1098, fd1077, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1099, fd1071, 0d3FE491B7523C161D, fd1098; +mul.f64 fd1101, fd1093, 0d3FEF838B8C811C17; +mul.f64 fd1709, fd1087, 0d3FC63A1A7E0B738A; +sub.f64 fd1102, fd1709, fd1101; +mul.f64 fd1103, fd1093, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1104, fd1087, 0d3FEF838B8C811C17, fd1103; +mul.f64 fd1106, 
fd1078, 0d3FEF838B8C811C17; +mul.f64 fd1708, fd1072, 0d3FC63A1A7E0B738A; +sub.f64 fd1107, fd1708, fd1106; +mul.f64 fd1108, fd1078, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1109, fd1072, 0d3FEF838B8C811C17, fd1108; +mul.f64 fd1111, fd1094, 0d3FD5E3A8748A0BF5; +mul.f64 fd1707, fd1088, 0dBFEE11F642522D1C; +sub.f64 fd1112, fd1707, fd1111; +mul.f64 fd1113, fd1094, 0dBFEE11F642522D1C; +fma.rn.f64 fd1114, fd1088, 0d3FD5E3A8748A0BF5, fd1113; +add.f64 fd1115, fd1064, fd1080; +add.f64 fd1116, fd1048, fd1115; +mul.f64 fd1119, fd1115, 0d3FE0000000000000; +sub.f64 fd1120, fd1048, fd1119; +add.f64 fd1706, fd1714, fd1712; +sub.f64 fd1121, fd1714, fd1712; +mul.f64 fd1122, fd1121, 0dBFEBB67AE8584CAA; +add.f64 fd1123, fd1122, fd1120; +sub.f64 fd1124, fd1120, fd1122; +add.f64 fd1705, fd1716, fd1706; +mul.f64 fd1125, fd1706, 0d3FE0000000000000; +sub.f64 fd1126, fd1716, fd1125; +sub.f64 fd1127, fd1064, fd1080; +mul.f64 fd1128, fd1127, 0dBFEBB67AE8584CAA; +sub.f64 fd1129, fd1126, fd1128; +add.f64 fd1130, fd1128, fd1126; +add.f64 fd1131, fd1097, fd1102; +add.f64 fd1132, fd1055, fd1131; +mul.f64 fd1135, fd1131, 0d3FE0000000000000; +sub.f64 fd1136, fd1055, fd1135; +add.f64 fd1704, fd1099, fd1104; +sub.f64 fd1137, fd1099, fd1104; +mul.f64 fd1138, fd1137, 0dBFEBB67AE8584CAA; +add.f64 fd1139, fd1138, fd1136; +sub.f64 fd1140, fd1136, fd1138; +add.f64 fd1703, fd1061, fd1704; +mul.f64 fd1141, fd1704, 0d3FE0000000000000; +sub.f64 fd1142, fd1061, fd1141; +sub.f64 fd1143, fd1097, fd1102; +mul.f64 fd1144, fd1143, 0dBFEBB67AE8584CAA; +sub.f64 fd1145, fd1142, fd1144; +add.f64 fd1146, fd1144, fd1142; +add.f64 fd1147, fd1107, fd1112; +add.f64 fd1148, fd1056, fd1147; +mul.f64 fd1151, fd1147, 0d3FE0000000000000; +sub.f64 fd1152, fd1056, fd1151; +add.f64 fd1702, fd1109, fd1114; +sub.f64 fd1153, fd1109, fd1114; +mul.f64 fd1154, fd1153, 0dBFEBB67AE8584CAA; +add.f64 fd1155, fd1154, fd1152; +sub.f64 fd1156, fd1152, fd1154; +add.f64 fd1701, fd1062, fd1702; +mul.f64 fd1157, fd1702, 0d3FE0000000000000; +sub.f64 fd1158, 
fd1062, fd1157; +sub.f64 fd1159, fd1107, fd1112; +mul.f64 fd1160, fd1159, 0dBFEBB67AE8584CAA; +sub.f64 fd1161, fd1158, fd1160; +add.f64 fd1162, fd1160, fd1158; +add.f64 fd1163, fd979, fd1015; +add.f64 fd1164, fd943, fd1163; +mul.f64 fd1167, fd1163, 0d3FE0000000000000; +sub.f64 fd1168, fd943, fd1167; +add.f64 fd1700, fd980, fd1016; +sub.f64 fd1169, fd980, fd1016; +mul.f64 fd1170, fd1169, 0dBFEBB67AE8584CAA; +add.f64 fd1171, fd1170, fd1168; +sub.f64 fd1172, fd1168, fd1170; +add.f64 fd1699, fd944, fd1700; +mul.f64 fd1173, fd1700, 0d3FE0000000000000; +sub.f64 fd1174, fd944, fd1173; +sub.f64 fd1175, fd979, fd1015; +mul.f64 fd1176, fd1175, 0dBFEBB67AE8584CAA; +sub.f64 fd1177, fd1174, fd1176; +add.f64 fd1178, fd1176, fd1174; +add.f64 fd1179, fd991, fd1027; +add.f64 fd1180, fd955, fd1179; +mul.f64 fd1183, fd1179, 0d3FE0000000000000; +sub.f64 fd1184, fd955, fd1183; +add.f64 fd1698, fd992, fd1028; +sub.f64 fd1185, fd992, fd1028; +mul.f64 fd1186, fd1185, 0dBFEBB67AE8584CAA; +add.f64 fd1187, fd1186, fd1184; +sub.f64 fd1188, fd1184, fd1186; +add.f64 fd1697, fd956, fd1698; +mul.f64 fd1189, fd1698, 0d3FE0000000000000; +sub.f64 fd1190, fd956, fd1189; +sub.f64 fd1191, fd991, fd1027; +mul.f64 fd1192, fd1191, 0dBFEBB67AE8584CAA; +sub.f64 fd1193, fd1190, fd1192; +add.f64 fd1194, fd1192, fd1190; +add.f64 fd1195, fd1003, fd1039; +add.f64 fd1196, fd967, fd1195; +mul.f64 fd1199, fd1195, 0d3FE0000000000000; +sub.f64 fd1200, fd967, fd1199; +add.f64 fd1696, fd1004, fd1040; +sub.f64 fd1201, fd1004, fd1040; +mul.f64 fd1202, fd1201, 0dBFEBB67AE8584CAA; +add.f64 fd1203, fd1202, fd1200; +sub.f64 fd1204, fd1200, fd1202; +add.f64 fd1695, fd968, fd1696; +mul.f64 fd1205, fd1696, 0d3FE0000000000000; +sub.f64 fd1206, fd968, fd1205; +sub.f64 fd1207, fd1003, fd1039; +mul.f64 fd1208, fd1207, 0dBFEBB67AE8584CAA; +sub.f64 fd1209, fd1206, fd1208; +add.f64 fd1210, fd1208, fd1206; +mul.f64 fd1693, fd1187, 0d3FE8836FA2CF5039; +mul.f64 fd1694, fd1193, 0d3FE491B7523C161D; +sub.f64 fd1213, fd1693, fd1694; +mul.f64 
fd1214, fd1193, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1215, fd1187, 0d3FE491B7523C161D, fd1214; +mul.f64 fd1217, fd1209, 0d3FEF838B8C811C17; +mul.f64 fd1692, fd1203, 0d3FC63A1A7E0B738A; +sub.f64 fd1218, fd1692, fd1217; +mul.f64 fd1219, fd1209, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1220, fd1203, 0d3FEF838B8C811C17, fd1219; +mul.f64 fd1222, fd1194, 0d3FEF838B8C811C17; +mul.f64 fd1691, fd1188, 0d3FC63A1A7E0B738A; +sub.f64 fd1223, fd1691, fd1222; +mul.f64 fd1224, fd1194, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1225, fd1188, 0d3FEF838B8C811C17, fd1224; +mul.f64 fd1227, fd1210, 0d3FD5E3A8748A0BF5; +mul.f64 fd1690, fd1204, 0dBFEE11F642522D1C; +sub.f64 fd1228, fd1690, fd1227; +mul.f64 fd1229, fd1210, 0dBFEE11F642522D1C; +fma.rn.f64 fd1230, fd1204, 0d3FD5E3A8748A0BF5, fd1229; +add.f64 fd1231, fd1180, fd1196; +add.f64 fd1232, fd1164, fd1231; +mul.f64 fd1235, fd1231, 0d3FE0000000000000; +sub.f64 fd1236, fd1164, fd1235; +add.f64 fd1689, fd1697, fd1695; +sub.f64 fd1237, fd1697, fd1695; +mul.f64 fd1238, fd1237, 0dBFEBB67AE8584CAA; +add.f64 fd1239, fd1238, fd1236; +sub.f64 fd1240, fd1236, fd1238; +add.f64 fd1688, fd1699, fd1689; +mul.f64 fd1241, fd1689, 0d3FE0000000000000; +sub.f64 fd1242, fd1699, fd1241; +sub.f64 fd1243, fd1180, fd1196; +mul.f64 fd1244, fd1243, 0dBFEBB67AE8584CAA; +sub.f64 fd1245, fd1242, fd1244; +add.f64 fd1246, fd1244, fd1242; +add.f64 fd1247, fd1213, fd1218; +add.f64 fd1248, fd1171, fd1247; +mul.f64 fd1251, fd1247, 0d3FE0000000000000; +sub.f64 fd1252, fd1171, fd1251; +add.f64 fd1687, fd1215, fd1220; +sub.f64 fd1253, fd1215, fd1220; +mul.f64 fd1254, fd1253, 0dBFEBB67AE8584CAA; +add.f64 fd1255, fd1254, fd1252; +sub.f64 fd1256, fd1252, fd1254; +add.f64 fd1686, fd1177, fd1687; +mul.f64 fd1257, fd1687, 0d3FE0000000000000; +sub.f64 fd1258, fd1177, fd1257; +sub.f64 fd1259, fd1213, fd1218; +mul.f64 fd1260, fd1259, 0dBFEBB67AE8584CAA; +sub.f64 fd1261, fd1258, fd1260; +add.f64 fd1262, fd1260, fd1258; +add.f64 fd1263, fd1223, fd1228; +add.f64 fd1264, fd1172, fd1263; +mul.f64 fd1267, 
fd1263, 0d3FE0000000000000; +sub.f64 fd1268, fd1172, fd1267; +add.f64 fd1685, fd1225, fd1230; +sub.f64 fd1269, fd1225, fd1230; +mul.f64 fd1270, fd1269, 0dBFEBB67AE8584CAA; +add.f64 fd1271, fd1270, fd1268; +sub.f64 fd1272, fd1268, fd1270; +add.f64 fd1684, fd1178, fd1685; +mul.f64 fd1273, fd1685, 0d3FE0000000000000; +sub.f64 fd1274, fd1178, fd1273; +sub.f64 fd1275, fd1223, fd1228; +mul.f64 fd1276, fd1275, 0dBFEBB67AE8584CAA; +sub.f64 fd1277, fd1274, fd1276; +add.f64 fd1278, fd1276, fd1274; +add.f64 fd1279, fd983, fd1019; +add.f64 fd1280, fd947, fd1279; +mul.f64 fd1283, fd1279, 0d3FE0000000000000; +sub.f64 fd1284, fd947, fd1283; +add.f64 fd1683, fd984, fd1020; +sub.f64 fd1285, fd984, fd1020; +mul.f64 fd1286, fd1285, 0dBFEBB67AE8584CAA; +add.f64 fd1287, fd1286, fd1284; +sub.f64 fd1288, fd1284, fd1286; +add.f64 fd1682, fd948, fd1683; +mul.f64 fd1289, fd1683, 0d3FE0000000000000; +sub.f64 fd1290, fd948, fd1289; +sub.f64 fd1291, fd983, fd1019; +mul.f64 fd1292, fd1291, 0dBFEBB67AE8584CAA; +sub.f64 fd1293, fd1290, fd1292; +add.f64 fd1294, fd1292, fd1290; +add.f64 fd1295, fd995, fd1031; +add.f64 fd1296, fd959, fd1295; +mul.f64 fd1299, fd1295, 0d3FE0000000000000; +sub.f64 fd1300, fd959, fd1299; +add.f64 fd1681, fd996, fd1032; +sub.f64 fd1301, fd996, fd1032; +mul.f64 fd1302, fd1301, 0dBFEBB67AE8584CAA; +add.f64 fd1303, fd1302, fd1300; +sub.f64 fd1304, fd1300, fd1302; +add.f64 fd1680, fd960, fd1681; +mul.f64 fd1305, fd1681, 0d3FE0000000000000; +sub.f64 fd1306, fd960, fd1305; +sub.f64 fd1307, fd995, fd1031; +mul.f64 fd1308, fd1307, 0dBFEBB67AE8584CAA; +sub.f64 fd1309, fd1306, fd1308; +add.f64 fd1310, fd1308, fd1306; +add.f64 fd1311, fd1007, fd1043; +add.f64 fd1312, fd971, fd1311; +mul.f64 fd1315, fd1311, 0d3FE0000000000000; +sub.f64 fd1316, fd971, fd1315; +add.f64 fd1679, fd1008, fd1044; +sub.f64 fd1317, fd1008, fd1044; +mul.f64 fd1318, fd1317, 0dBFEBB67AE8584CAA; +add.f64 fd1319, fd1318, fd1316; +sub.f64 fd1320, fd1316, fd1318; +add.f64 fd1678, fd972, fd1679; +mul.f64 fd1321, 
fd1679, 0d3FE0000000000000; +sub.f64 fd1322, fd972, fd1321; +sub.f64 fd1323, fd1007, fd1043; +mul.f64 fd1324, fd1323, 0dBFEBB67AE8584CAA; +sub.f64 fd1325, fd1322, fd1324; +add.f64 fd1326, fd1324, fd1322; +mul.f64 fd1676, fd1303, 0d3FE8836FA2CF5039; +mul.f64 fd1677, fd1309, 0d3FE491B7523C161D; +sub.f64 fd1329, fd1676, fd1677; +mul.f64 fd1330, fd1309, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1331, fd1303, 0d3FE491B7523C161D, fd1330; +mul.f64 fd1674, fd1319, 0d3FC63A1A7E0B738A; +mul.f64 fd1675, fd1325, 0d3FEF838B8C811C17; +sub.f64 fd1334, fd1674, fd1675; +mul.f64 fd1335, fd1325, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1336, fd1319, 0d3FEF838B8C811C17, fd1335; +mul.f64 fd1338, fd1310, 0d3FEF838B8C811C17; +mul.f64 fd1673, fd1304, 0d3FC63A1A7E0B738A; +sub.f64 fd1339, fd1673, fd1338; +mul.f64 fd1340, fd1310, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1341, fd1304, 0d3FEF838B8C811C17, fd1340; +mul.f64 fd1343, fd1326, 0d3FD5E3A8748A0BF5; +mul.f64 fd1672, fd1320, 0dBFEE11F642522D1C; +sub.f64 fd1344, fd1672, fd1343; +mul.f64 fd1345, fd1326, 0dBFEE11F642522D1C; +fma.rn.f64 fd1346, fd1320, 0d3FD5E3A8748A0BF5, fd1345; +add.f64 fd1347, fd1296, fd1312; +add.f64 fd1348, fd1280, fd1347; +mul.f64 fd1351, fd1347, 0d3FE0000000000000; +sub.f64 fd1352, fd1280, fd1351; +add.f64 fd1671, fd1680, fd1678; +sub.f64 fd1353, fd1680, fd1678; +mul.f64 fd1354, fd1353, 0dBFEBB67AE8584CAA; +add.f64 fd1355, fd1354, fd1352; +sub.f64 fd1356, fd1352, fd1354; +add.f64 fd1670, fd1682, fd1671; +mul.f64 fd1357, fd1671, 0d3FE0000000000000; +sub.f64 fd1358, fd1682, fd1357; +sub.f64 fd1359, fd1296, fd1312; +mul.f64 fd1360, fd1359, 0dBFEBB67AE8584CAA; +sub.f64 fd1361, fd1358, fd1360; +add.f64 fd1362, fd1360, fd1358; +add.f64 fd1363, fd1329, fd1334; +add.f64 fd1364, fd1287, fd1363; +mul.f64 fd1367, fd1363, 0d3FE0000000000000; +sub.f64 fd1368, fd1287, fd1367; +add.f64 fd1669, fd1331, fd1336; +sub.f64 fd1369, fd1331, fd1336; +mul.f64 fd1370, fd1369, 0dBFEBB67AE8584CAA; +add.f64 fd1371, fd1370, fd1368; +sub.f64 fd1372, fd1368, fd1370; 
+add.f64 fd1668, fd1293, fd1669; +mul.f64 fd1373, fd1669, 0d3FE0000000000000; +sub.f64 fd1374, fd1293, fd1373; +sub.f64 fd1375, fd1329, fd1334; +mul.f64 fd1376, fd1375, 0dBFEBB67AE8584CAA; +sub.f64 fd1377, fd1374, fd1376; +add.f64 fd1378, fd1376, fd1374; +add.f64 fd1379, fd1339, fd1344; +add.f64 fd1380, fd1288, fd1379; +mul.f64 fd1383, fd1379, 0d3FE0000000000000; +sub.f64 fd1384, fd1288, fd1383; +add.f64 fd1667, fd1341, fd1346; +sub.f64 fd1385, fd1341, fd1346; +mul.f64 fd1386, fd1385, 0dBFEBB67AE8584CAA; +add.f64 fd1387, fd1386, fd1384; +sub.f64 fd1388, fd1384, fd1386; +add.f64 fd1666, fd1294, fd1667; +mul.f64 fd1389, fd1667, 0d3FE0000000000000; +sub.f64 fd1390, fd1294, fd1389; +sub.f64 fd1391, fd1339, fd1344; +mul.f64 fd1392, fd1391, 0dBFEBB67AE8584CAA; +sub.f64 fd1393, fd1390, fd1392; +add.f64 fd1394, fd1392, fd1390; +mul.f64 fd1396, fd1686, 0d3FCD84D223638000; +mul.f64 fd1665, fd1248, 0d3FEF232EFF15C9E6; +sub.f64 fd1397, fd1665, fd1396; +mul.f64 fd1398, fd1686, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd1399, fd1248, 0d3FCD84D223638000, fd1398; +mul.f64 fd1663, fd1364, 0d3FEC98A37A9A7850; +mul.f64 fd1664, fd1668, 0d3FDCB920325BAFA6; +sub.f64 fd1402, fd1663, fd1664; +mul.f64 fd1403, fd1668, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1404, fd1364, 0d3FDCB920325BAFA6, fd1403; +mul.f64 fd1661, fd1264, 0d3FEC98A37A9A7850; +mul.f64 fd1662, fd1684, 0d3FDCB920325BAFA6; +sub.f64 fd1407, fd1661, fd1662; +mul.f64 fd1408, fd1684, 0d3FEC98A37A9A7850; +fma.rn.f64 fd1409, fd1264, 0d3FDCB920325BAFA6, fd1408; +mul.f64 fd1659, fd1380, 0d3FE31BEC55BC71BC; +mul.f64 fd1660, fd1666, 0d3FE9AAFE4207DF5F; +sub.f64 fd1412, fd1659, fd1660; +mul.f64 fd1413, fd1666, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1414, fd1380, 0d3FE9AAFE4207DF5F, fd1413; +mul.f64 fd1657, fd1239, 0d3FE8836FA2CF5039; +mul.f64 fd1658, fd1245, 0d3FE491B7523C161D; +sub.f64 fd1417, fd1657, fd1658; +mul.f64 fd1418, fd1245, 0d3FE8836FA2CF5039; +fma.rn.f64 fd1419, fd1239, 0d3FE491B7523C161D, fd1418; +mul.f64 fd1421, fd1361, 0d3FEF838B8C811C17; 
+mul.f64 fd1656, fd1355, 0d3FC63A1A7E0B738A; +sub.f64 fd1422, fd1656, fd1421; +mul.f64 fd1423, fd1361, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1424, fd1355, 0d3FEF838B8C811C17, fd1423; +mul.f64 fd1426, fd1261, 0d3FE9AAFE4207DF5F; +mul.f64 fd1655, fd1255, 0d3FE31BEC55BC71BC; +sub.f64 fd1427, fd1655, fd1426; +mul.f64 fd1428, fd1261, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd1429, fd1255, 0d3FE9AAFE4207DF5F, fd1428; +mul.f64 fd1431, fd1377, 0d3FEEA7D99F29CADE; +mul.f64 fd1654, fd1371, 0dBFD25AFBF23865BF; +sub.f64 fd1432, fd1654, fd1431; +mul.f64 fd1433, fd1377, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1434, fd1371, 0d3FEEA7D99F29CADE, fd1433; +mul.f64 fd1436, fd1277, 0d3FED6206BEB6C24B; +mul.f64 fd1653, fd1271, 0d3FD9595EF26FB670; +sub.f64 fd1437, fd1653, fd1436; +mul.f64 fd1438, fd1277, 0d3FD9595EF26FB670; +fma.rn.f64 fd1439, fd1271, 0d3FED6206BEB6C24B, fd1438; +mul.f64 fd1441, fd1393, 0d3FE746A51650EADE; +mul.f64 fd1652, fd1387, 0dBFE5F5B105F99707; +sub.f64 fd1442, fd1652, fd1441; +mul.f64 fd1443, fd1393, 0dBFE5F5B105F99707; +fma.rn.f64 fd1444, fd1387, 0d3FE746A51650EADE, fd1443; +mul.f64 fd1650, fd1240, 0d3FC63A1A7E0B738A; +mul.f64 fd1651, fd1246, 0d3FEF838B8C811C17; +sub.f64 fd1447, fd1650, fd1651; +mul.f64 fd1448, fd1246, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd1449, fd1240, 0d3FEF838B8C811C17, fd1448; +mul.f64 fd1648, fd1356, 0dBFEE11F642522D1C; +mul.f64 fd1649, fd1362, 0d3FD5E3A8748A0BF5; +sub.f64 fd1452, fd1648, fd1649; +mul.f64 fd1453, fd1362, 0dBFEE11F642522D1C; +fma.rn.f64 fd1454, fd1356, 0d3FD5E3A8748A0BF5, fd1453; +mul.f64 fd1646, fd1256, 0dBFADC528B5343A86; +mul.f64 fd1647, fd1262, 0d3FEFF223F3635CE3; +sub.f64 fd1457, fd1646, fd1647; +mul.f64 fd1458, fd1262, 0dBFADC528B5343A86; +fma.rn.f64 fd1459, fd1256, 0d3FEFF223F3635CE3, fd1458; +mul.f64 fd1644, fd1372, 0dBFEFC89BCEF44CF4; +mul.f64 fd1645, fd1378, 0dBFBDB843E577175E; +sub.f64 fd1462, fd1644, fd1645; +mul.f64 fd1463, fd1378, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd1464, fd1372, 0dBFBDB843E577175E, fd1463; +mul.f64 fd1466, fd1278, 
0d3FEEA7D99F29CADE; +mul.f64 fd1643, fd1272, 0dBFD25AFBF23865BF; +sub.f64 fd1467, fd1643, fd1466; +mul.f64 fd1468, fd1278, 0dBFD25AFBF23865BF; +fma.rn.f64 fd1469, fd1272, 0d3FEEA7D99F29CADE, fd1468; +mul.f64 fd1471, fd1394, 0dBFE19593DA358510; +mul.f64 fd1642, fd1388, 0dBFEABC50EF4734A7; +sub.f64 fd1472, fd1642, fd1471; +mul.f64 fd1473, fd1394, 0dBFEABC50EF4734A7; +fma.rn.f64 fd1474, fd1388, 0dBFE19593DA358510, fd1473; +add.f64 fd1475, fd1232, fd1348; +mul.f64 fd1477, fd1475, 0d3FE0000000000000; +sub.f64 fd1478, fd1116, fd1477; +add.f64 fd1641, fd1688, fd1670; +sub.f64 fd1479, fd1688, fd1670; +mul.f64 fd1480, fd1479, 0dBFEBB67AE8584CAA; +mul.f64 fd1481, fd1641, 0d3FE0000000000000; +sub.f64 fd1482, fd1705, fd1481; +sub.f64 fd1483, fd1232, fd1348; +mul.f64 fd1484, fd1483, 0dBFEBB67AE8584CAA; +add.f64 fd1485, fd1397, fd1402; +mul.f64 fd1487, fd1485, 0d3FE0000000000000; +sub.f64 fd1488, fd1132, fd1487; +add.f64 fd1640, fd1399, fd1404; +sub.f64 fd1489, fd1399, fd1404; +mul.f64 fd1490, fd1489, 0dBFEBB67AE8584CAA; +mul.f64 fd1491, fd1640, 0d3FE0000000000000; +sub.f64 fd1492, fd1703, fd1491; +sub.f64 fd1493, fd1397, fd1402; +mul.f64 fd1494, fd1493, 0dBFEBB67AE8584CAA; +add.f64 fd1495, fd1407, fd1412; +mul.f64 fd1497, fd1495, 0d3FE0000000000000; +sub.f64 fd1498, fd1148, fd1497; +add.f64 fd1639, fd1409, fd1414; +sub.f64 fd1499, fd1409, fd1414; +mul.f64 fd1500, fd1499, 0dBFEBB67AE8584CAA; +mul.f64 fd1501, fd1639, 0d3FE0000000000000; +sub.f64 fd1502, fd1701, fd1501; +sub.f64 fd1503, fd1407, fd1412; +mul.f64 fd1504, fd1503, 0dBFEBB67AE8584CAA; +add.f64 fd1505, fd1417, fd1422; +mul.f64 fd1507, fd1505, 0d3FE0000000000000; +sub.f64 fd1508, fd1123, fd1507; +add.f64 fd1638, fd1419, fd1424; +sub.f64 fd1509, fd1419, fd1424; +mul.f64 fd1510, fd1509, 0dBFEBB67AE8584CAA; +mul.f64 fd1511, fd1638, 0d3FE0000000000000; +sub.f64 fd1512, fd1129, fd1511; +sub.f64 fd1513, fd1417, fd1422; +mul.f64 fd1514, fd1513, 0dBFEBB67AE8584CAA; +add.f64 fd1515, fd1427, fd1432; +mul.f64 fd1517, fd1515, 
0d3FE0000000000000; +sub.f64 fd1518, fd1139, fd1517; +add.f64 fd1637, fd1429, fd1434; +sub.f64 fd1519, fd1429, fd1434; +mul.f64 fd1520, fd1519, 0dBFEBB67AE8584CAA; +mul.f64 fd1521, fd1637, 0d3FE0000000000000; +sub.f64 fd1522, fd1145, fd1521; +sub.f64 fd1523, fd1427, fd1432; +mul.f64 fd1524, fd1523, 0dBFEBB67AE8584CAA; +add.f64 fd1525, fd1437, fd1442; +mul.f64 fd1527, fd1525, 0d3FE0000000000000; +sub.f64 fd1528, fd1155, fd1527; +add.f64 fd1636, fd1439, fd1444; +sub.f64 fd1529, fd1439, fd1444; +mul.f64 fd1530, fd1529, 0dBFEBB67AE8584CAA; +mul.f64 fd1531, fd1636, 0d3FE0000000000000; +sub.f64 fd1532, fd1161, fd1531; +sub.f64 fd1533, fd1437, fd1442; +mul.f64 fd1534, fd1533, 0dBFEBB67AE8584CAA; +add.f64 fd1535, fd1447, fd1452; +mul.f64 fd1537, fd1535, 0d3FE0000000000000; +sub.f64 fd1538, fd1124, fd1537; +add.f64 fd1635, fd1449, fd1454; +sub.f64 fd1539, fd1449, fd1454; +mul.f64 fd1540, fd1539, 0dBFEBB67AE8584CAA; +mul.f64 fd1541, fd1635, 0d3FE0000000000000; +sub.f64 fd1542, fd1130, fd1541; +sub.f64 fd1543, fd1447, fd1452; +mul.f64 fd1544, fd1543, 0dBFEBB67AE8584CAA; +add.f64 fd1545, fd1457, fd1462; +mul.f64 fd1547, fd1545, 0d3FE0000000000000; +sub.f64 fd1548, fd1140, fd1547; +add.f64 fd1634, fd1459, fd1464; +sub.f64 fd1549, fd1459, fd1464; +mul.f64 fd1550, fd1549, 0dBFEBB67AE8584CAA; +mul.f64 fd1551, fd1634, 0d3FE0000000000000; +sub.f64 fd1552, fd1146, fd1551; +sub.f64 fd1553, fd1457, fd1462; +mul.f64 fd1554, fd1553, 0dBFEBB67AE8584CAA; +add.f64 fd1555, fd1467, fd1472; +mul.f64 fd1557, fd1555, 0d3FE0000000000000; +sub.f64 fd1558, fd1156, fd1557; +add.f64 fd1633, fd1469, fd1474; +sub.f64 fd1559, fd1469, fd1474; +mul.f64 fd1560, fd1559, 0dBFEBB67AE8584CAA; +mul.f64 fd1561, fd1633, 0d3FE0000000000000; +sub.f64 fd1562, fd1162, fd1561; +sub.f64 fd1563, fd1467, fd1472; +mul.f64 fd1564, fd1563, 0dBFEBB67AE8584CAA; +add.f64 %1, fd1705, fd1641; +add.f64 %0, fd1116, fd1475; +add.f64 %3, fd1703, fd1640; +add.f64 %2, fd1132, fd1485; +add.f64 %5, fd1701, fd1639; +add.f64 %4, fd1148, 
fd1495; +add.f64 %7, fd1129, fd1638; +add.f64 %6, fd1123, fd1505; +add.f64 %9, fd1145, fd1637; +add.f64 %8, fd1139, fd1515; +add.f64 %11, fd1161, fd1636; +add.f64 %10, fd1155, fd1525; +add.f64 %13, fd1130, fd1635; +add.f64 %12, fd1124, fd1535; +add.f64 %15, fd1146, fd1634; +add.f64 %14, fd1140, fd1545; +add.f64 %17, fd1162, fd1633; +add.f64 %16, fd1156, fd1555; +sub.f64 %19, fd1482, fd1484; +add.f64 %18, fd1480, fd1478; +sub.f64 %21, fd1492, fd1494; +add.f64 %20, fd1490, fd1488; +add.f64 %22, fd1500, fd1498; +sub.f64 %23, fd1502, fd1504; +add.f64 %24, fd1510, fd1508; +sub.f64 %25, fd1512, fd1514; +add.f64 %26, fd1520, fd1518; +sub.f64 %27, fd1522, fd1524; +add.f64 %28, fd1530, fd1528; +sub.f64 %29, fd1532, fd1534; +sub.f64 %31, fd1542, fd1544; +add.f64 %30, fd1540, fd1538; +sub.f64 %33, fd1552, fd1554; +add.f64 %32, fd1550, fd1548; +sub.f64 %35, fd1562, fd1564; +add.f64 %34, fd1560, fd1558; +add.f64 %37, fd1484, fd1482; +sub.f64 %36, fd1478, fd1480; +add.f64 %39, fd1494, fd1492; +sub.f64 %38, fd1488, fd1490; +add.f64 %41, fd1504, fd1502; +sub.f64 %40, fd1498, fd1500; +add.f64 %43, fd1514, fd1512; +sub.f64 %42, fd1508, fd1510; +add.f64 %45, fd1524, fd1522; +sub.f64 %44, fd1518, fd1520; +add.f64 %47, fd1534, fd1532; +sub.f64 %46, fd1528, fd1530; +add.f64 %49, fd1544, fd1542; +sub.f64 %48, fd1538, fd1540; +add.f64 %51, fd1554, fd1552; +sub.f64 %50, fd1548, fd1550; +add.f64 %53, fd1564, fd1562; +sub.f64 %52, fd1558, fd1560; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), 
"=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_729), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<692, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<613>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 11664, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; 
+add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 
0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 11664, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 
fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+1296]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; +st.shared.v2.f64 [r9+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, 
fd165, fd164; +st.shared.v2.f64 [r9+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r9+48], {fd220, fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; +sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r9+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r9+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r9+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r9+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r9+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+1296]; +ld.shared.v2.f64 {fd239, fd240}, [r11+2592]; +ld.shared.v2.f64 {fd243, fd244}, [r11+3888]; +ld.shared.v2.f64 {fd247, fd248}, [r11+5184]; +ld.shared.v2.f64 {fd251, fd252}, [r11+6480]; +ld.shared.v2.f64 {fd255, fd256}, [r11+7776]; +ld.shared.v2.f64 {fd259, fd260}, [r11+9072]; +ld.shared.v2.f64 {fd263, fd264}, [r11+10368]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 
0dBFEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0dBFEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0d3FE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0d3FE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0d3FD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0d3FD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd336, 0d3FE0000000000000; +sub.f64 fd344, fd270, 
fd343; +sub.f64 fd345, fd284, fd300; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd317, fd322; +add.f64 fd350, fd275, fd349; +add.f64 fd351, fd319, fd324; +add.f64 fd352, fd281, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd275, fd353; +sub.f64 fd355, fd319, fd324; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd281, fd359; +sub.f64 fd361, fd317, fd322; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +add.f64 fd365, fd327, fd332; +add.f64 fd366, fd276, fd365; +add.f64 fd367, fd329, fd334; +add.f64 fd368, fd282, fd367; +mul.f64 fd369, fd365, 0d3FE0000000000000; +sub.f64 fd370, fd276, fd369; +sub.f64 fd371, fd329, fd334; +mul.f64 fd372, fd371, 0dBFEBB67AE8584CAA; +add.f64 fd373, fd372, fd370; +sub.f64 fd374, fd370, fd372; +mul.f64 fd375, fd367, 0d3FE0000000000000; +sub.f64 fd376, fd282, fd375; +sub.f64 fd377, fd327, fd332; +mul.f64 fd378, fd377, 0dBFEBB67AE8584CAA; +sub.f64 fd379, fd376, fd378; +add.f64 fd380, fd378, fd376; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd381, fd382}, [rd11]; +mul.f64 fd385, fd352, fd382; +mul.f64 fd386, fd350, fd382; +mul.f64 fd387, fd381, fd352; +mul.f64 fd388, fd381, fd381; +mul.f64 fd389, fd382, fd382; +sub.f64 fd390, fd388, fd389; +mul.f64 fd391, fd382, fd381; +fma.rn.f64 fd392, fd382, fd381, fd391; +mul.f64 fd393, fd368, fd392; +mul.f64 fd394, fd366, fd392; +mul.f64 fd395, fd390, fd368; +mul.f64 fd396, fd381, fd390; +mul.f64 fd397, fd382, fd392; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd381, fd392; +fma.rn.f64 fd400, fd382, fd390, fd399; +mul.f64 fd401, fd347, fd400; +mul.f64 fd402, 
fd341, fd400; +mul.f64 fd403, fd398, fd347; +mul.f64 fd404, fd381, fd398; +mul.f64 fd405, fd382, fd400; +sub.f64 fd406, fd404, fd405; +mul.f64 fd407, fd381, fd400; +fma.rn.f64 fd408, fd382, fd398, fd407; +mul.f64 fd409, fd363, fd408; +mul.f64 fd410, fd357, fd408; +mul.f64 fd411, fd406, fd363; +ld.global.v2.f64 {fd412, fd413}, [rd11+144]; +mul.f64 fd416, fd379, fd413; +mul.f64 fd417, fd373, fd413; +mul.f64 fd418, fd412, fd379; +mul.f64 fd419, fd381, fd412; +mul.f64 fd420, fd382, fd413; +sub.f64 fd421, fd419, fd420; +mul.f64 fd422, fd381, fd413; +fma.rn.f64 fd423, fd382, fd412, fd422; +mul.f64 fd424, fd348, fd423; +mul.f64 fd425, fd342, fd423; +mul.f64 fd426, fd421, fd348; +mul.f64 fd427, fd381, fd421; +mul.f64 fd428, fd382, fd423; +sub.f64 fd429, fd427, fd428; +mul.f64 fd430, fd381, fd423; +fma.rn.f64 fd431, fd382, fd421, fd430; +mul.f64 fd432, fd364, fd431; +mul.f64 fd433, fd358, fd431; +mul.f64 fd434, fd429, fd364; +mul.f64 fd435, fd381, fd429; +mul.f64 fd436, fd382, fd431; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd381, fd431; +fma.rn.f64 fd439, fd382, fd429, fd438; +mul.f64 fd440, fd380, fd439; +mul.f64 fd441, fd374, fd439; +mul.f64 fd442, fd437, fd380; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 1296, r16; +add.f64 fd443, fd270, fd336; +add.f64 fd444, fd268, fd335; +st.shared.v2.f64 [r17], {fd444, fd443}; +fma.rn.f64 fd445, fd381, fd350, fd385; +sub.f64 fd446, fd387, fd386; +st.shared.v2.f64 [r17+144], {fd445, fd446}; +fma.rn.f64 fd447, fd390, fd366, fd393; +sub.f64 fd448, fd395, fd394; +st.shared.v2.f64 [r17+288], {fd447, fd448}; +fma.rn.f64 fd449, fd398, fd341, fd401; +sub.f64 fd450, fd403, fd402; +st.shared.v2.f64 [r17+432], {fd449, fd450}; +fma.rn.f64 fd451, fd406, fd357, fd409; +sub.f64 fd452, fd411, fd410; +st.shared.v2.f64 [r17+576], {fd451, fd452}; +fma.rn.f64 fd453, fd412, fd373, fd416; +sub.f64 fd454, fd418, fd417; +st.shared.v2.f64 [r17+720], {fd453, fd454}; +fma.rn.f64 fd455, fd421, fd342, fd424; +sub.f64 
fd456, fd426, fd425; +st.shared.v2.f64 [r17+864], {fd455, fd456}; +fma.rn.f64 fd457, fd429, fd358, fd432; +sub.f64 fd458, fd434, fd433; +st.shared.v2.f64 [r17+1008], {fd457, fd458}; +fma.rn.f64 fd459, fd437, fd374, fd440; +sub.f64 fd460, fd442, fd441; +st.shared.v2.f64 [r17+1152], {fd459, fd460}; +barrier.sync 0; +ld.shared.v2.f64 {fd461, fd462}, [r11]; +ld.shared.v2.f64 {fd465, fd466}, [r11+1296]; +ld.shared.v2.f64 {fd469, fd470}, [r11+2592]; +ld.shared.v2.f64 {fd473, fd474}, [r11+3888]; +ld.shared.v2.f64 {fd477, fd478}, [r11+5184]; +ld.shared.v2.f64 {fd481, fd482}, [r11+6480]; +ld.shared.v2.f64 {fd485, fd486}, [r11+7776]; +ld.shared.v2.f64 {fd489, fd490}, [r11+9072]; +ld.shared.v2.f64 {fd493, fd494}, [r11+10368]; +add.f64 fd497, fd473, fd485; +add.f64 fd498, fd461, fd497; +add.f64 fd499, fd474, fd486; +add.f64 fd500, fd462, fd499; +mul.f64 fd501, fd497, 0d3FE0000000000000; +sub.f64 fd502, fd461, fd501; +sub.f64 fd503, fd474, fd486; +mul.f64 fd504, fd503, 0dBFEBB67AE8584CAA; +add.f64 fd505, fd504, fd502; +sub.f64 fd506, fd502, fd504; +mul.f64 fd507, fd499, 0d3FE0000000000000; +sub.f64 fd508, fd462, fd507; +sub.f64 fd509, fd473, fd485; +mul.f64 fd510, fd509, 0dBFEBB67AE8584CAA; +sub.f64 fd511, fd508, fd510; +add.f64 fd512, fd510, fd508; +add.f64 fd513, fd477, fd489; +add.f64 fd514, fd465, fd513; +add.f64 fd515, fd478, fd490; +add.f64 fd516, fd466, fd515; +mul.f64 fd517, fd513, 0d3FE0000000000000; +sub.f64 fd518, fd465, fd517; +sub.f64 fd519, fd478, fd490; +mul.f64 fd520, fd519, 0dBFEBB67AE8584CAA; +add.f64 fd521, fd520, fd518; +sub.f64 fd522, fd518, fd520; +mul.f64 fd523, fd515, 0d3FE0000000000000; +sub.f64 fd524, fd466, fd523; +sub.f64 fd525, fd477, fd489; +mul.f64 fd526, fd525, 0dBFEBB67AE8584CAA; +sub.f64 fd527, fd524, fd526; +add.f64 fd528, fd526, fd524; +add.f64 fd529, fd481, fd493; +add.f64 fd530, fd469, fd529; +add.f64 fd531, fd482, fd494; +add.f64 fd532, fd470, fd531; +mul.f64 fd533, fd529, 0d3FE0000000000000; +sub.f64 fd534, fd469, fd533; +sub.f64 fd535, 
fd482, fd494; +mul.f64 fd536, fd535, 0dBFEBB67AE8584CAA; +add.f64 fd537, fd536, fd534; +sub.f64 fd538, fd534, fd536; +mul.f64 fd539, fd531, 0d3FE0000000000000; +sub.f64 fd540, fd470, fd539; +sub.f64 fd541, fd481, fd493; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +sub.f64 fd543, fd540, fd542; +add.f64 fd544, fd542, fd540; +mul.f64 fd545, fd521, 0d3FE8836FA2CF5039; +mul.f64 fd546, fd527, 0d3FE491B7523C161D; +sub.f64 fd547, fd545, fd546; +mul.f64 fd548, fd527, 0d3FE8836FA2CF5039; +fma.rn.f64 fd549, fd521, 0d3FE491B7523C161D, fd548; +mul.f64 fd550, fd537, 0d3FC63A1A7E0B738A; +mul.f64 fd551, fd543, 0d3FEF838B8C811C17; +sub.f64 fd552, fd550, fd551; +mul.f64 fd553, fd543, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd554, fd537, 0d3FEF838B8C811C17, fd553; +mul.f64 fd555, fd522, 0d3FC63A1A7E0B738A; +mul.f64 fd556, fd528, 0d3FEF838B8C811C17; +sub.f64 fd557, fd555, fd556; +mul.f64 fd558, fd528, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd559, fd522, 0d3FEF838B8C811C17, fd558; +mul.f64 fd560, fd538, 0dBFEE11F642522D1C; +mul.f64 fd561, fd544, 0d3FD5E3A8748A0BF5; +sub.f64 fd562, fd560, fd561; +mul.f64 fd563, fd544, 0dBFEE11F642522D1C; +fma.rn.f64 fd564, fd538, 0d3FD5E3A8748A0BF5, fd563; +add.f64 fd565, fd514, fd530; +add.f64 fd566, fd516, fd532; +mul.f64 fd567, fd565, 0d3FE0000000000000; +sub.f64 fd568, fd498, fd567; +sub.f64 fd569, fd516, fd532; +mul.f64 fd570, fd569, 0dBFEBB67AE8584CAA; +mul.f64 fd571, fd566, 0d3FE0000000000000; +sub.f64 fd572, fd500, fd571; +sub.f64 fd573, fd514, fd530; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +add.f64 fd575, fd547, fd552; +add.f64 fd576, fd549, fd554; +mul.f64 fd577, fd575, 0d3FE0000000000000; +sub.f64 fd578, fd505, fd577; +sub.f64 fd579, fd549, fd554; +mul.f64 fd580, fd579, 0dBFEBB67AE8584CAA; +mul.f64 fd581, fd576, 0d3FE0000000000000; +sub.f64 fd582, fd511, fd581; +sub.f64 fd583, fd547, fd552; +mul.f64 fd584, fd583, 0dBFEBB67AE8584CAA; +add.f64 fd585, fd557, fd562; +add.f64 fd586, fd559, fd564; +mul.f64 fd587, fd585, 0d3FE0000000000000; +sub.f64 fd588, 
fd506, fd587; +sub.f64 fd589, fd559, fd564; +mul.f64 fd590, fd589, 0dBFEBB67AE8584CAA; +mul.f64 fd591, fd586, 0d3FE0000000000000; +sub.f64 fd592, fd512, fd591; +sub.f64 fd593, fd557, fd562; +mul.f64 fd594, fd593, 0dBFEBB67AE8584CAA; +add.f64 %1, fd500, fd566; +add.f64 %0, fd498, fd565; +add.f64 %3, fd511, fd576; +add.f64 %2, fd505, fd575; +add.f64 %5, fd512, fd586; +add.f64 %4, fd506, fd585; +sub.f64 %7, fd572, fd574; +add.f64 %6, fd570, fd568; +sub.f64 %9, fd582, fd584; +add.f64 %8, fd580, fd578; +sub.f64 %11, fd592, fd594; +add.f64 %10, fd590, fd588; +add.f64 %13, fd574, fd572; +sub.f64 %12, fd568, fd570; +add.f64 %15, fd584, fd582; +sub.f64 %14, fd578, fd580; +add.f64 %17, fd594, fd592; +sub.f64 %16, fd588, fd590; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<693, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<18>; +.reg .f64 fd<577>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 5832, r2; +add.f64 fd37, %29, %37; +add.f64 fd38, %21, fd37; +add.f64 fd39, %30, %38; +add.f64 fd40, %22, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %21, fd41; +sub.f64 fd43, %30, %38; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 
fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %22, fd47; +sub.f64 fd49, %29, %37; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %31, %39; +add.f64 fd54, %23, fd53; +add.f64 fd55, %33, %41; +add.f64 fd56, %25, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %23, fd57; +sub.f64 fd59, %33, %41; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %25, fd63; +sub.f64 fd65, %31, %39; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %34, %42; +add.f64 fd70, %26, fd69; +add.f64 fd71, %36, %43; +add.f64 fd72, %28, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %26, fd73; +sub.f64 fd75, %36, %43; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %28, fd79; +sub.f64 fd81, %34, %42; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 
0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, -901412889; +shr.u64 rd3, rd2, 38; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 81; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 fd159, fd122, fd154; +mul.f64 fd160, fd153, fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 
fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+1296]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 fd220, fd219, fd218; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 
fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r8, r5, 5832, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd158; +st.shared.f64 [r9+16], fd168; +st.shared.f64 [r9+24], fd178; +st.shared.f64 [r9+32], fd188; +st.shared.f64 [r9+40], fd197; +st.shared.f64 [r9+48], fd207; +st.shared.f64 [r9+56], fd217; +st.shared.f64 [r9+64], fd227; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+648]; +ld.shared.f64 fd233, [r11+1296]; +ld.shared.f64 fd234, [r11+1944]; +ld.shared.f64 fd235, [r11+2592]; +ld.shared.f64 fd236, [r11+3240]; +ld.shared.f64 fd237, [r11+3888]; +ld.shared.f64 fd238, [r11+4536]; +ld.shared.f64 fd239, [r11+5184]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+648]; +ld.shared.f64 fd242, [r11+1296]; +ld.shared.f64 fd243, [r11+1944]; +ld.shared.f64 fd244, [r11+2592]; +ld.shared.f64 fd245, [r11+3240]; +ld.shared.f64 fd246, [r11+3888]; +ld.shared.f64 fd247, [r11+4536]; +ld.shared.f64 fd248, [r11+5184]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; 
+add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0dBFEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0d3FE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0d3FE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0d3FEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0d3FEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0d3FEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0d3FEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0d3FD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0d3FD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd250, fd317; +add.f64 fd319, fd268, fd284; +add.f64 fd320, fd252, 
fd319; +mul.f64 fd321, fd317, 0d3FE0000000000000; +sub.f64 fd322, fd250, fd321; +sub.f64 fd323, fd268, fd284; +mul.f64 fd324, fd323, 0dBFEBB67AE8584CAA; +add.f64 fd325, fd324, fd322; +sub.f64 fd326, fd322, fd324; +mul.f64 fd327, fd319, 0d3FE0000000000000; +sub.f64 fd328, fd252, fd327; +sub.f64 fd329, fd266, fd282; +mul.f64 fd330, fd329, 0dBFEBB67AE8584CAA; +sub.f64 fd331, fd328, fd330; +add.f64 fd332, fd330, fd328; +add.f64 fd333, fd299, fd304; +add.f64 fd334, fd257, fd333; +add.f64 fd335, fd301, fd306; +add.f64 fd336, fd263, fd335; +mul.f64 fd337, fd333, 0d3FE0000000000000; +sub.f64 fd338, fd257, fd337; +sub.f64 fd339, fd301, fd306; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +add.f64 fd341, fd340, fd338; +sub.f64 fd342, fd338, fd340; +mul.f64 fd343, fd335, 0d3FE0000000000000; +sub.f64 fd344, fd263, fd343; +sub.f64 fd345, fd299, fd304; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +sub.f64 fd347, fd344, fd346; +add.f64 fd348, fd346, fd344; +add.f64 fd349, fd309, fd314; +add.f64 fd350, fd258, fd349; +add.f64 fd351, fd311, fd316; +add.f64 fd352, fd264, fd351; +mul.f64 fd353, fd349, 0d3FE0000000000000; +sub.f64 fd354, fd258, fd353; +sub.f64 fd355, fd311, fd316; +mul.f64 fd356, fd355, 0dBFEBB67AE8584CAA; +add.f64 fd357, fd356, fd354; +sub.f64 fd358, fd354, fd356; +mul.f64 fd359, fd351, 0d3FE0000000000000; +sub.f64 fd360, fd264, fd359; +sub.f64 fd361, fd309, fd314; +mul.f64 fd362, fd361, 0dBFEBB67AE8584CAA; +sub.f64 fd363, fd360, fd362; +add.f64 fd364, fd362, fd360; +mul.wide.u32 rd7, r7, 954437177; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 9; +sub.s32 r14, r7, r13; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %20; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd365, fd366}, [rd11]; +mul.f64 fd369, fd336, fd366; +fma.rn.f64 fd370, fd365, fd334, fd369; +mul.f64 fd371, fd334, fd366; +mul.f64 fd372, fd365, fd336; +sub.f64 fd373, fd372, fd371; +mul.f64 fd374, fd365, fd365; +mul.f64 fd375, fd366, fd366; +sub.f64 fd376, fd374, fd375; +mul.f64 fd377, fd366, 
fd365; +fma.rn.f64 fd378, fd366, fd365, fd377; +mul.f64 fd379, fd352, fd378; +fma.rn.f64 fd380, fd376, fd350, fd379; +mul.f64 fd381, fd350, fd378; +mul.f64 fd382, fd376, fd352; +sub.f64 fd383, fd382, fd381; +mul.f64 fd384, fd365, fd376; +mul.f64 fd385, fd366, fd378; +sub.f64 fd386, fd384, fd385; +mul.f64 fd387, fd365, fd378; +fma.rn.f64 fd388, fd366, fd376, fd387; +mul.f64 fd389, fd331, fd388; +fma.rn.f64 fd390, fd386, fd325, fd389; +mul.f64 fd391, fd325, fd388; +mul.f64 fd392, fd386, fd331; +sub.f64 fd393, fd392, fd391; +mul.f64 fd394, fd365, fd386; +mul.f64 fd395, fd366, fd388; +sub.f64 fd396, fd394, fd395; +mul.f64 fd397, fd365, fd388; +fma.rn.f64 fd398, fd366, fd386, fd397; +mul.f64 fd399, fd347, fd398; +fma.rn.f64 fd400, fd396, fd341, fd399; +mul.f64 fd401, fd341, fd398; +mul.f64 fd402, fd396, fd347; +sub.f64 fd403, fd402, fd401; +ld.global.v2.f64 {fd404, fd405}, [rd11+144]; +mul.f64 fd408, fd363, fd405; +fma.rn.f64 fd409, fd404, fd357, fd408; +mul.f64 fd410, fd357, fd405; +mul.f64 fd411, fd404, fd363; +sub.f64 fd412, fd411, fd410; +mul.f64 fd413, fd365, fd404; +mul.f64 fd414, fd366, fd405; +sub.f64 fd415, fd413, fd414; +mul.f64 fd416, fd365, fd405; +fma.rn.f64 fd417, fd366, fd404, fd416; +mul.f64 fd418, fd332, fd417; +fma.rn.f64 fd419, fd415, fd326, fd418; +mul.f64 fd420, fd326, fd417; +mul.f64 fd421, fd415, fd332; +sub.f64 fd422, fd421, fd420; +mul.f64 fd423, fd365, fd415; +mul.f64 fd424, fd366, fd417; +sub.f64 fd425, fd423, fd424; +mul.f64 fd426, fd365, fd417; +fma.rn.f64 fd427, fd366, fd415, fd426; +mul.f64 fd428, fd348, fd427; +fma.rn.f64 fd429, fd425, fd342, fd428; +mul.f64 fd430, fd342, fd427; +mul.f64 fd431, fd425, fd348; +sub.f64 fd432, fd431, fd430; +mul.f64 fd433, fd365, fd425; +mul.f64 fd434, fd366, fd427; +sub.f64 fd435, fd433, fd434; +mul.f64 fd436, fd365, fd427; +fma.rn.f64 fd437, fd366, fd425, fd436; +mul.f64 fd438, fd364, fd437; +fma.rn.f64 fd439, fd435, fd358, fd438; +mul.f64 fd440, fd358, fd437; +mul.f64 fd441, fd435, fd364; +sub.f64 fd442, 
fd441, fd440; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +barrier.sync 0; +mad.lo.s32 r17, r12, 648, r16; +st.shared.f64 [r17], fd318; +st.shared.f64 [r17+72], fd370; +st.shared.f64 [r17+144], fd380; +st.shared.f64 [r17+216], fd390; +st.shared.f64 [r17+288], fd400; +st.shared.f64 [r17+360], fd409; +st.shared.f64 [r17+432], fd419; +st.shared.f64 [r17+504], fd429; +st.shared.f64 [r17+576], fd439; +barrier.sync 0; +ld.shared.f64 fd443, [r11]; +ld.shared.f64 fd444, [r11+648]; +ld.shared.f64 fd445, [r11+1296]; +ld.shared.f64 fd446, [r11+1944]; +ld.shared.f64 fd447, [r11+2592]; +ld.shared.f64 fd448, [r11+3240]; +ld.shared.f64 fd449, [r11+3888]; +ld.shared.f64 fd450, [r11+4536]; +ld.shared.f64 fd451, [r11+5184]; +barrier.sync 0; +st.shared.f64 [r17], fd320; +st.shared.f64 [r17+72], fd373; +st.shared.f64 [r17+144], fd383; +st.shared.f64 [r17+216], fd393; +st.shared.f64 [r17+288], fd403; +st.shared.f64 [r17+360], fd412; +st.shared.f64 [r17+432], fd422; +st.shared.f64 [r17+504], fd432; +st.shared.f64 [r17+576], fd442; +barrier.sync 0; +ld.shared.f64 fd452, [r11]; +ld.shared.f64 fd453, [r11+648]; +ld.shared.f64 fd454, [r11+1296]; +ld.shared.f64 fd455, [r11+1944]; +ld.shared.f64 fd456, [r11+2592]; +ld.shared.f64 fd457, [r11+3240]; +ld.shared.f64 fd458, [r11+3888]; +ld.shared.f64 fd459, [r11+4536]; +ld.shared.f64 fd460, [r11+5184]; +add.f64 fd461, fd446, fd449; +add.f64 fd462, fd443, fd461; +add.f64 fd463, fd455, fd458; +add.f64 fd464, fd452, fd463; +mul.f64 fd465, fd461, 0d3FE0000000000000; +sub.f64 fd466, fd443, fd465; +sub.f64 fd467, fd455, fd458; +mul.f64 fd468, fd467, 0dBFEBB67AE8584CAA; +add.f64 fd469, fd468, fd466; +sub.f64 fd470, fd466, fd468; +mul.f64 fd471, fd463, 0d3FE0000000000000; +sub.f64 fd472, fd452, fd471; +sub.f64 fd473, fd446, fd449; +mul.f64 fd474, fd473, 0dBFEBB67AE8584CAA; +sub.f64 fd475, fd472, fd474; +add.f64 fd476, fd474, fd472; +add.f64 fd477, fd447, fd450; +add.f64 fd478, fd444, fd477; +add.f64 fd479, fd456, fd459; +add.f64 fd480, fd453, fd479; 
+mul.f64 fd481, fd477, 0d3FE0000000000000; +sub.f64 fd482, fd444, fd481; +sub.f64 fd483, fd456, fd459; +mul.f64 fd484, fd483, 0dBFEBB67AE8584CAA; +add.f64 fd485, fd484, fd482; +sub.f64 fd486, fd482, fd484; +mul.f64 fd487, fd479, 0d3FE0000000000000; +sub.f64 fd488, fd453, fd487; +sub.f64 fd489, fd447, fd450; +mul.f64 fd490, fd489, 0dBFEBB67AE8584CAA; +sub.f64 fd491, fd488, fd490; +add.f64 fd492, fd490, fd488; +add.f64 fd493, fd448, fd451; +add.f64 fd494, fd445, fd493; +add.f64 fd495, fd457, fd460; +add.f64 fd496, fd454, fd495; +mul.f64 fd497, fd493, 0d3FE0000000000000; +sub.f64 fd498, fd445, fd497; +sub.f64 fd499, fd457, fd460; +mul.f64 fd500, fd499, 0dBFEBB67AE8584CAA; +add.f64 fd501, fd500, fd498; +sub.f64 fd502, fd498, fd500; +mul.f64 fd503, fd495, 0d3FE0000000000000; +sub.f64 fd504, fd454, fd503; +sub.f64 fd505, fd448, fd451; +mul.f64 fd506, fd505, 0dBFEBB67AE8584CAA; +sub.f64 fd507, fd504, fd506; +add.f64 fd508, fd506, fd504; +mul.f64 fd509, fd485, 0d3FE8836FA2CF5039; +mul.f64 fd510, fd491, 0d3FE491B7523C161D; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd491, 0d3FE8836FA2CF5039; +fma.rn.f64 fd513, fd485, 0d3FE491B7523C161D, fd512; +mul.f64 fd514, fd501, 0d3FC63A1A7E0B738A; +mul.f64 fd515, fd507, 0d3FEF838B8C811C17; +sub.f64 fd516, fd514, fd515; +mul.f64 fd517, fd507, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd518, fd501, 0d3FEF838B8C811C17, fd517; +mul.f64 fd519, fd486, 0d3FC63A1A7E0B738A; +mul.f64 fd520, fd492, 0d3FEF838B8C811C17; +sub.f64 fd521, fd519, fd520; +mul.f64 fd522, fd492, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd523, fd486, 0d3FEF838B8C811C17, fd522; +mul.f64 fd524, fd502, 0dBFEE11F642522D1C; +mul.f64 fd525, fd508, 0d3FD5E3A8748A0BF5; +sub.f64 fd526, fd524, fd525; +mul.f64 fd527, fd508, 0dBFEE11F642522D1C; +fma.rn.f64 fd528, fd502, 0d3FD5E3A8748A0BF5, fd527; +add.f64 fd529, fd478, fd494; +add.f64 fd530, fd480, fd496; +mul.f64 fd531, fd529, 0d3FE0000000000000; +sub.f64 fd532, fd462, fd531; +sub.f64 fd533, fd480, fd496; +mul.f64 fd534, fd533, 0dBFEBB67AE8584CAA; 
+mul.f64 fd535, fd530, 0d3FE0000000000000; +sub.f64 fd536, fd464, fd535; +sub.f64 fd537, fd478, fd494; +mul.f64 fd538, fd537, 0dBFEBB67AE8584CAA; +add.f64 fd539, fd511, fd516; +add.f64 fd540, fd513, fd518; +mul.f64 fd541, fd539, 0d3FE0000000000000; +sub.f64 fd542, fd469, fd541; +sub.f64 fd543, fd513, fd518; +mul.f64 fd544, fd543, 0dBFEBB67AE8584CAA; +mul.f64 fd545, fd540, 0d3FE0000000000000; +sub.f64 fd546, fd475, fd545; +sub.f64 fd547, fd511, fd516; +mul.f64 fd548, fd547, 0dBFEBB67AE8584CAA; +add.f64 fd549, fd521, fd526; +add.f64 fd550, fd523, fd528; +mul.f64 fd551, fd549, 0d3FE0000000000000; +sub.f64 fd552, fd470, fd551; +sub.f64 fd553, fd523, fd528; +mul.f64 fd554, fd553, 0dBFEBB67AE8584CAA; +mul.f64 fd555, fd550, 0d3FE0000000000000; +sub.f64 fd556, fd476, fd555; +sub.f64 fd557, fd521, fd526; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +add.f64 %0, fd462, fd529; +add.f64 %1, fd464, fd530; +add.f64 %3, fd475, fd540; +add.f64 %2, fd469, fd539; +add.f64 %5, fd476, fd550; +add.f64 %4, fd470, fd549; +add.f64 %6, fd534, fd532; +sub.f64 %7, fd536, fd538; +sub.f64 %9, fd546, fd548; +add.f64 %8, fd544, fd542; +sub.f64 %11, fd556, fd558; +add.f64 %10, fd554, fd552; +sub.f64 %12, fd532, fd534; +add.f64 %13, fd538, fd536; +add.f64 %15, fd548, fd546; +sub.f64 %14, fd542, fd544; +add.f64 %17, fd558, fd556; +sub.f64 %16, fd552, fd554; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_729), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), 
"d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<696, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<229>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 5832, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %14, %17; +add.f64 fd14, %12, fd13; +add.f64 fd15, %16, %18; +add.f64 fd16, %13, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %12, fd17; +sub.f64 fd19, %16, %18; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %13, fd23; +sub.f64 fd25, %14, %17; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 5832, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+3888]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd34; +st.shared.f64 [r9+16], fd43; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+1944]; +ld.shared.f64 fd49, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+1944]; +ld.shared.f64 fd52, [r11+3888]; +add.f64 fd53, fd48, fd49; 
+add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 3; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd67, fd70; +fma.rn.f64 fd74, fd69, fd61, fd73; +mul.f64 fd75, fd61, fd70; +mul.f64 fd76, fd69, fd67; +sub.f64 fd77, fd76, fd75; +ld.global.v2.f64 {fd78, fd79}, [rd11+1296]; +mul.f64 fd82, fd68, fd79; +fma.rn.f64 fd83, fd78, fd62, fd82; +mul.f64 fd84, fd62, fd79; +mul.f64 fd85, fd78, fd68; +sub.f64 fd86, fd85, fd84; +barrier.sync 0; +mad.lo.s32 r17, r12, 72, r16; +st.shared.f64 [r17], fd54; +st.shared.f64 [r17+24], fd74; +st.shared.f64 [r17+48], fd83; +barrier.sync 0; +ld.shared.f64 fd87, [r11]; +ld.shared.f64 fd88, [r11+1944]; +ld.shared.f64 fd89, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r17], fd56; +st.shared.f64 [r17+24], fd77; +st.shared.f64 [r17+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r11]; +ld.shared.f64 fd91, [r11+1944]; +ld.shared.f64 fd92, [r11+3888]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0dBFEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0dBFEBB67AE8584CAA; 
+sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 3; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd107, fd110; +fma.rn.f64 fd114, fd109, fd101, fd113; +mul.f64 fd115, fd101, fd110; +mul.f64 fd116, fd109, fd107; +sub.f64 fd117, fd116, fd115; +ld.global.v2.f64 {fd118, fd119}, [rd16+432]; +mul.f64 fd122, fd108, fd119; +fma.rn.f64 fd123, fd118, fd102, fd122; +mul.f64 fd124, fd102, fd119; +mul.f64 fd125, fd118, fd108; +sub.f64 fd126, fd125, fd124; +barrier.sync 0; +mad.lo.s32 r23, r18, 216, r22; +st.shared.f64 [r23], fd94; +st.shared.f64 [r23+72], fd114; +st.shared.f64 [r23+144], fd123; +barrier.sync 0; +ld.shared.f64 fd127, [r11]; +ld.shared.f64 fd128, [r11+1944]; +ld.shared.f64 fd129, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r23], fd96; +st.shared.f64 [r23+72], fd117; +st.shared.f64 [r23+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r11]; +ld.shared.f64 fd131, [r11+1944]; +ld.shared.f64 fd132, [r11+3888]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd127, fd133; +add.f64 fd135, fd131, fd132; +add.f64 fd136, fd130, fd135; +mul.f64 fd137, fd133, 0d3FE0000000000000; +sub.f64 fd138, fd127, fd137; +sub.f64 fd139, fd131, fd132; +mul.f64 fd140, fd139, 0dBFEBB67AE8584CAA; +add.f64 fd141, fd140, fd138; +sub.f64 fd142, fd138, fd140; +mul.f64 fd143, fd135, 0d3FE0000000000000; +sub.f64 fd144, fd130, fd143; +sub.f64 fd145, fd128, fd129; +mul.f64 fd146, fd145, 0dBFEBB67AE8584CAA; +sub.f64 fd147, fd144, fd146; +add.f64 fd148, fd146, fd144; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 3; +add.s32 r32, r8, 
r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd149, fd150}, [rd21]; +mul.f64 fd153, fd147, fd150; +fma.rn.f64 fd154, fd149, fd141, fd153; +mul.f64 fd155, fd141, fd150; +mul.f64 fd156, fd149, fd147; +sub.f64 fd157, fd156, fd155; +ld.global.v2.f64 {fd158, fd159}, [rd21+144]; +mul.f64 fd162, fd148, fd159; +fma.rn.f64 fd163, fd158, fd142, fd162; +mul.f64 fd164, fd142, fd159; +mul.f64 fd165, fd158, fd148; +sub.f64 fd166, fd165, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 648, r32; +st.shared.f64 [r33], fd134; +st.shared.f64 [r33+216], fd154; +st.shared.f64 [r33+432], fd163; +barrier.sync 0; +ld.shared.f64 fd167, [r11]; +ld.shared.f64 fd168, [r11+1944]; +ld.shared.f64 fd169, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r33], fd136; +st.shared.f64 [r33+216], fd157; +st.shared.f64 [r33+432], fd166; +barrier.sync 0; +ld.shared.f64 fd170, [r11]; +ld.shared.f64 fd171, [r11+1944]; +ld.shared.f64 fd172, [r11+3888]; +add.f64 fd173, fd168, fd169; +add.f64 fd174, fd167, fd173; +add.f64 fd175, fd171, fd172; +add.f64 fd176, fd170, fd175; +mul.f64 fd177, fd173, 0d3FE0000000000000; +sub.f64 fd178, fd167, fd177; +sub.f64 fd179, fd171, fd172; +mul.f64 fd180, fd179, 0dBFEBB67AE8584CAA; +add.f64 fd181, fd180, fd178; +sub.f64 fd182, fd178, fd180; +mul.f64 fd183, fd175, 0d3FE0000000000000; +sub.f64 fd184, fd170, fd183; +sub.f64 fd185, fd168, fd169; +mul.f64 fd186, fd185, 0dBFEBB67AE8584CAA; +sub.f64 fd187, fd184, fd186; +add.f64 fd188, fd186, fd184; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 3; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd189, fd190}, [rd26]; +mul.f64 fd193, fd187, fd190; +fma.rn.f64 fd194, fd189, fd181, fd193; +mul.f64 fd195, fd181, fd190; +mul.f64 fd196, fd189, fd187; +sub.f64 fd197, fd196, fd195; +ld.global.v2.f64 {fd198, fd199}, 
[rd26+48]; +mul.f64 fd202, fd188, fd199; +fma.rn.f64 fd203, fd198, fd182, fd202; +mul.f64 fd204, fd182, fd199; +mul.f64 fd205, fd198, fd188; +sub.f64 fd206, fd205, fd204; +barrier.sync 0; +mad.lo.s32 r39, r34, 1944, r38; +st.shared.f64 [r39], fd174; +st.shared.f64 [r39+648], fd194; +st.shared.f64 [r39+1296], fd203; +barrier.sync 0; +ld.shared.f64 fd207, [r11]; +ld.shared.f64 fd208, [r11+1944]; +ld.shared.f64 fd209, [r11+3888]; +barrier.sync 0; +st.shared.f64 [r39], fd176; +st.shared.f64 [r39+648], fd197; +st.shared.f64 [r39+1296], fd206; +barrier.sync 0; +ld.shared.f64 fd210, [r11]; +ld.shared.f64 fd211, [r11+1944]; +ld.shared.f64 fd212, [r11+3888]; +add.f64 fd213, fd208, fd209; +add.f64 fd214, fd211, fd212; +mul.f64 fd215, fd213, 0d3FE0000000000000; +sub.f64 fd216, fd207, fd215; +sub.f64 fd217, fd211, fd212; +mul.f64 fd218, fd217, 0dBFEBB67AE8584CAA; +mul.f64 fd219, fd214, 0d3FE0000000000000; +sub.f64 fd220, fd210, fd219; +sub.f64 fd221, fd208, fd209; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +add.f64 %0, fd207, fd213; +add.f64 %1, fd210, fd214; +add.f64 %2, fd218, fd216; +sub.f64 %3, fd220, fd222; +sub.f64 %4, fd216, fd218; +add.f64 %5, fd222, fd220; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_729), "l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<697, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<40>; +.reg .f64 fd<259>; +.reg .b64 rd<27>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 11664, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %14, %17; +add.f64 fd14, %16, %18; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %12, fd15; +sub.f64 fd17, %16, %18; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; 
+add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %13, fd21; +sub.f64 fd23, %14, %17; +mul.f64 fd24, fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -2032597691; +shr.u64 rd3, rd2, 39; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 243; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 11664, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+3888]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, fd20, fd35; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %13, fd14; +add.f64 fd42, %12, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r9+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r9+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+3888]; +ld.shared.v2.f64 {fd55, fd56}, [r11+7776]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0dBFEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r7, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r12, rd8; +mul.lo.s32 r13, r12, 3; +sub.s32 r14, r7, r13; +shl.b32 r15, r14, 4; +add.s32 r16, r8, r15; +mul.wide.u32 rd9, r12, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd71, fd74; 
+mul.f64 fd78, fd65, fd74; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+1296]; +mul.f64 fd84, fd72, fd81; +mul.f64 fd85, fd66, fd81; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r17, r12, 144, r16; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r17], {fd88, fd87}; +fma.rn.f64 fd89, fd73, fd65, fd77; +sub.f64 fd90, fd79, fd78; +st.shared.v2.f64 [r17+48], {fd89, fd90}; +fma.rn.f64 fd91, fd80, fd66, fd84; +sub.f64 fd92, fd86, fd85; +st.shared.v2.f64 [r17+96], {fd91, fd92}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r11]; +ld.shared.v2.f64 {fd97, fd98}, [r11+3888]; +ld.shared.v2.f64 {fd101, fd102}, [r11+7776]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r7, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r18, rd13; +mul.lo.s32 r19, r18, 9; +sub.s32 r20, r7, r19; +shl.b32 r21, r20, 4; +add.s32 r22, r8, r21; +mul.wide.u32 rd14, r18, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd117, fd120; +mul.f64 fd124, fd111, fd120; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+432]; +mul.f64 fd130, fd118, fd127; +mul.f64 fd131, fd112, fd127; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r23, r18, 432, r22; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r23], {fd134, fd133}; +fma.rn.f64 fd135, fd119, fd111, fd123; +sub.f64 fd136, fd125, fd124; +st.shared.v2.f64 [r23+144], {fd135, fd136}; +fma.rn.f64 fd137, fd126, fd112, fd130; +sub.f64 fd138, fd132, fd131; 
+st.shared.v2.f64 [r23+288], {fd137, fd138}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r11]; +ld.shared.v2.f64 {fd143, fd144}, [r11+3888]; +ld.shared.v2.f64 {fd147, fd148}, [r11+7776]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0dBFEBB67AE8584CAA; +add.f64 fd157, fd156, fd154; +sub.f64 fd158, fd154, fd156; +mul.f64 fd159, fd152, 0d3FE0000000000000; +sub.f64 fd160, fd140, fd159; +sub.f64 fd161, fd143, fd147; +mul.f64 fd162, fd161, 0dBFEBB67AE8584CAA; +sub.f64 fd163, fd160, fd162; +add.f64 fd164, fd162, fd160; +mul.wide.u32 rd17, r7, 795364315; +shr.u64 rd18, rd17, 32; +cvt.u32.u64 r24, rd18; +sub.s32 r25, r7, r24; +shr.u32 r26, r25, 1; +add.s32 r27, r26, r24; +shr.u32 r28, r27, 4; +mul.lo.s32 r29, r28, 27; +sub.s32 r30, r7, r29; +shl.b32 r31, r30, 4; +add.s32 r32, r8, r31; +mul.wide.u32 rd19, r28, 16; +mov.u64 rd20, %10; +add.s64 rd21, rd20, rd19; +ld.global.v2.f64 {fd165, fd166}, [rd21]; +mul.f64 fd169, fd163, fd166; +mul.f64 fd170, fd157, fd166; +mul.f64 fd171, fd165, fd163; +ld.global.v2.f64 {fd172, fd173}, [rd21+144]; +mul.f64 fd176, fd164, fd173; +mul.f64 fd177, fd158, fd173; +mul.f64 fd178, fd172, fd164; +barrier.sync 0; +mad.lo.s32 r33, r28, 1296, r32; +add.f64 fd179, fd140, fd152; +add.f64 fd180, fd139, fd151; +st.shared.v2.f64 [r33], {fd180, fd179}; +fma.rn.f64 fd181, fd165, fd157, fd169; +sub.f64 fd182, fd171, fd170; +st.shared.v2.f64 [r33+432], {fd181, fd182}; +fma.rn.f64 fd183, fd172, fd158, fd176; +sub.f64 fd184, fd178, fd177; +st.shared.v2.f64 [r33+864], {fd183, fd184}; +barrier.sync 0; +ld.shared.v2.f64 {fd185, fd186}, [r11]; +ld.shared.v2.f64 {fd189, fd190}, [r11+3888]; +ld.shared.v2.f64 {fd193, fd194}, [r11+7776]; +add.f64 fd197, fd189, fd193; +add.f64 fd198, fd190, fd194; +mul.f64 fd199, fd197, 0d3FE0000000000000; +sub.f64 fd200, fd185, fd199; +sub.f64 fd201, fd190, fd194; +mul.f64 fd202, fd201, 
0dBFEBB67AE8584CAA; +add.f64 fd203, fd202, fd200; +sub.f64 fd204, fd200, fd202; +mul.f64 fd205, fd198, 0d3FE0000000000000; +sub.f64 fd206, fd186, fd205; +sub.f64 fd207, fd189, fd193; +mul.f64 fd208, fd207, 0dBFEBB67AE8584CAA; +sub.f64 fd209, fd206, fd208; +add.f64 fd210, fd208, fd206; +mul.wide.u32 rd22, r7, -901412889; +shr.u64 rd23, rd22, 38; +cvt.u32.u64 r34, rd23; +mul.lo.s32 r35, r34, 81; +sub.s32 r36, r7, r35; +shl.b32 r37, r36, 4; +add.s32 r38, r8, r37; +mul.wide.u32 rd24, r34, 16; +mov.u64 rd25, %11; +add.s64 rd26, rd25, rd24; +ld.global.v2.f64 {fd211, fd212}, [rd26]; +mul.f64 fd215, fd209, fd212; +mul.f64 fd216, fd203, fd212; +mul.f64 fd217, fd211, fd209; +ld.global.v2.f64 {fd218, fd219}, [rd26+48]; +mul.f64 fd222, fd210, fd219; +mul.f64 fd223, fd204, fd219; +mul.f64 fd224, fd218, fd210; +barrier.sync 0; +mad.lo.s32 r39, r34, 3888, r38; +add.f64 fd225, fd186, fd198; +add.f64 fd226, fd185, fd197; +st.shared.v2.f64 [r39], {fd226, fd225}; +fma.rn.f64 fd227, fd211, fd203, fd215; +sub.f64 fd228, fd217, fd216; +st.shared.v2.f64 [r39+1296], {fd227, fd228}; +fma.rn.f64 fd229, fd218, fd204, fd222; +sub.f64 fd230, fd224, fd223; +st.shared.v2.f64 [r39+2592], {fd229, fd230}; +barrier.sync 0; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+3888]; +ld.shared.v2.f64 {fd239, fd240}, [r11+7776]; +add.f64 fd243, fd235, fd239; +add.f64 fd244, fd236, fd240; +mul.f64 fd245, fd243, 0d3FE0000000000000; +sub.f64 fd246, fd231, fd245; +sub.f64 fd247, fd236, fd240; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +mul.f64 fd249, fd244, 0d3FE0000000000000; +sub.f64 fd250, fd232, fd249; +sub.f64 fd251, fd235, fd239; +mul.f64 fd252, fd251, 0dBFEBB67AE8584CAA; +add.f64 %1, fd232, fd244; +add.f64 %0, fd231, fd243; +sub.f64 %3, fd250, fd252; +add.f64 %2, fd248, fd246; +add.f64 %5, fd252, fd250; +sub.f64 %4, fd246, fd248; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_729), 
"l"(lut_dp_3_243), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..360634d0ba536 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_fwd.hpp.inc @@ -0,0 +1,754 @@ +#ifndef CUFFTDX_FFT_7_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_7_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<917, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<15>; +.reg .b32 r<685>; +.reg .f64 fd<11>; +.reg .b64 rd<2>; +mov.f64 fd9, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd9; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd10, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd10; +} +mov.b32 r447, {rs2, rs2}; +mov.f64 fd3, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs3, fd3; +} +mov.b32 r654, {rs3, rs3}; +mov.f64 fd4, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs4, fd4; +} +mov.b32 r678, {rs4, rs4}; +mov.f64 fd7, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs5, fd7; +} +mov.b32 r636, {rs5, rs5}; +mov.f64 fd8, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs6, fd8; +} +mov.b32 r663, {rs6, rs6}; +{ +cvt.rn.f16.f64 rs7, fd7; +} +mov.b32 r537, {rs7, rs7}; +{ +cvt.rn.f16.f64 rs8, fd8; +} +{ +neg.f16 rs9, rs8; +} +mov.b32 r561, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs11, fd9; +} +mov.b32 r645, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs12, fd10; +} +{ +neg.f16 rs13, rs12; +} +mov.b32 r669, {rs13, rs13}; +{ +add.f16x2 r1, %23, %21; +} +{ +add.f16x2 r4, %24, r1; +} +{ +add.f16x2 r7, %25, %19; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 
r13, %15, %17; +} +{ +add.f16x2 %0, r10, r13; +} +{ +add.f16x2 r19, %26, %22; +} +{ +add.f16x2 r22, %27, r19; +} +{ +add.f16x2 r25, %14, %20; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %16, %18; +} +{ +add.f16x2 %1, r28, r31; +} +{ +add.f16x2 r37, %23, %21; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %24, r40; +} +{ +add.f16x2 r46, %25, %19; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %15, %17; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %26, %22; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %14, %20; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %16, %18; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 %2, r61, r85; +} +{ +add.f16x2 r91, %23, %21; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %24, r94; +} +{ +add.f16x2 r100, %25, %19; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %15, %17; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %26, %22; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %14, %20; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %16, %18; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 %12, r115, r139; +} +{ +add.f16x2 r145, %23, %21; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %24, r148; +} +{ +add.f16x2 r154, %25, %19; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ +add.f16x2 r163, %15, %17; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %26, %22; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %14, %20; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %16, %18; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, 
r184, r190; +} +{ +sub.f16x2 %4, r169, r193; +} +{ +add.f16x2 r199, %23, %21; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %24, r202; +} +{ +add.f16x2 r208, %25, %19; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %15, %17; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %26, %22; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %14, %20; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %16, %18; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 %10, r223, r247; +} +{ +add.f16x2 r253, %23, %21; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %24, r256; +} +{ +add.f16x2 r262, %25, %19; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %15, %17; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %26, %22; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %14, %20; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %16, %18; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 %6, r277, r301; +} +{ +add.f16x2 r307, %23, %21; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %24, r310; +} +{ +add.f16x2 r316, %25, %19; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %15, %17; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %26, %22; +} +{ +mul.f16x2 r337, r334, r663; +} +{ +sub.f16x2 r340, %14, %20; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %16, %18; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 %8, r331, r355; +} +{ +add.f16x2 r361, %26, %22; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %27, r364; +} +{ 
+add.f16x2 r370, %14, %20; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %16, %18; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %23, %21; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %25, %19; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %15, %17; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 %3, r385, r409; +} +{ +add.f16x2 r415, %26, %22; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %27, r418; +} +{ +add.f16x2 r424, %14, %20; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %16, %18; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %23, %21; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %25, %19; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %15, %17; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 %13, r439, r463; +} +{ +add.f16x2 r469, %26, %22; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %27, r472; +} +{ +add.f16x2 r478, %14, %20; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %16, %18; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %23, %21; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %25, %19; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %15, %17; +} +{ +mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 %5, r493, r517; +} +{ +add.f16x2 r523, %26, %22; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %27, r526; +} +{ +add.f16x2 r532, %14, %20; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %16, %18; +} +{ +mul.f16x2 r544, 
r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %23, %21; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %25, %19; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %15, %17; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 %11, r547, r571; +} +{ +add.f16x2 r577, %26, %22; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %27, r580; +} +{ +add.f16x2 r586, %14, %20; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %16, %18; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %23, %21; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %25, %19; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %15, %17; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 %7, r601, r625; +} +{ +add.f16x2 r631, %26, %22; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %27, r634; +} +{ +add.f16x2 r640, %14, %20; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %16, %18; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %23, %21; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %25, %19; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %15, %17; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 %9, r655, r679; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..52d0444af7da2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp16_inv.hpp.inc @@ -0,0 +1,757 @@ +#ifndef CUFFTDX_FFT_7_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_7_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1119, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<17>; +.reg .b32 r<685>; +.reg .f64 fd<11>; +.reg .b64 rd<2>; +mov.f64 fd9, 0d3FE3F3A0E28BEDD1; +{ +cvt.rn.f16.f64 rs1, fd9; +} +mov.b32 r420, {rs1, rs1}; +mov.f64 fd10, 0dBFE904C37505DE4B; +{ +cvt.rn.f16.f64 rs2, fd10; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r447, {rs3, rs3}; +mov.f64 fd3, 0dBFCC7B90E3024582; +{ +cvt.rn.f16.f64 rs5, fd3; +} +mov.b32 r654, {rs5, rs5}; +mov.f64 fd4, 0dBFEF329C0558E969; +{ +cvt.rn.f16.f64 rs6, fd4; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r678, {rs7, rs7}; +mov.f64 fd7, 0dBFECD4BCA9CB5C71; +{ +cvt.rn.f16.f64 rs9, fd7; +} +mov.b32 r636, {rs9, rs9}; +mov.f64 fd8, 0dBFDBC4C04D71ABC1; +{ +cvt.rn.f16.f64 rs10, fd8; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r663, {rs11, rs11}; +{ +cvt.rn.f16.f64 rs13, fd7; +} 
+mov.b32 r537, {rs13, rs13}; +{ +cvt.rn.f16.f64 rs14, fd8; +} +mov.b32 r561, {rs14, rs14}; +{ +cvt.rn.f16.f64 rs15, fd9; +} +mov.b32 r645, {rs15, rs15}; +{ +cvt.rn.f16.f64 rs16, fd10; +} +mov.b32 r669, {rs16, rs16}; +{ +add.f16x2 r1, %23, %21; +} +{ +add.f16x2 r4, %24, r1; +} +{ +add.f16x2 r7, %25, %19; +} +{ +add.f16x2 r10, r4, r7; +} +{ +add.f16x2 r13, %15, %17; +} +{ +add.f16x2 %0, r10, r13; +} +{ +add.f16x2 r19, %26, %22; +} +{ +add.f16x2 r22, %27, r19; +} +{ +add.f16x2 r25, %14, %20; +} +{ +add.f16x2 r28, r22, r25; +} +{ +add.f16x2 r31, %16, %18; +} +{ +add.f16x2 %1, r28, r31; +} +{ +add.f16x2 r37, %23, %21; +} +{ +mul.f16x2 r40, r37, r420; +} +{ +add.f16x2 r43, %24, r40; +} +{ +add.f16x2 r46, %25, %19; +} +{ +mul.f16x2 r49, r46, r654; +} +{ +add.f16x2 r52, r43, r49; +} +{ +add.f16x2 r55, %15, %17; +} +{ +mul.f16x2 r58, r55, r636; +} +{ +add.f16x2 r61, r52, r58; +} +{ +sub.f16x2 r64, %26, %22; +} +{ +mul.f16x2 r67, r64, r447; +} +{ +sub.f16x2 r70, %14, %20; +} +{ +mul.f16x2 r73, r70, r678; +} +{ +add.f16x2 r76, r67, r73; +} +{ +sub.f16x2 r79, %16, %18; +} +{ +mul.f16x2 r82, r79, r663; +} +{ +add.f16x2 r85, r76, r82; +} +{ +sub.f16x2 %2, r61, r85; +} +{ +add.f16x2 r91, %23, %21; +} +{ +mul.f16x2 r94, r91, r420; +} +{ +add.f16x2 r97, %24, r94; +} +{ +add.f16x2 r100, %25, %19; +} +{ +mul.f16x2 r103, r100, r654; +} +{ +add.f16x2 r106, r97, r103; +} +{ +add.f16x2 r109, %15, %17; +} +{ +mul.f16x2 r112, r109, r636; +} +{ +add.f16x2 r115, r106, r112; +} +{ +sub.f16x2 r118, %26, %22; +} +{ +mul.f16x2 r121, r118, r447; +} +{ +sub.f16x2 r124, %14, %20; +} +{ +mul.f16x2 r127, r124, r678; +} +{ +add.f16x2 r130, r121, r127; +} +{ +sub.f16x2 r133, %16, %18; +} +{ +mul.f16x2 r136, r133, r663; +} +{ +add.f16x2 r139, r130, r136; +} +{ +add.f16x2 %12, r115, r139; +} +{ +add.f16x2 r145, %23, %21; +} +{ +mul.f16x2 r148, r145, r654; +} +{ +add.f16x2 r151, %24, r148; +} +{ +add.f16x2 r154, %25, %19; +} +{ +mul.f16x2 r157, r154, r537; +} +{ +add.f16x2 r160, r151, r157; +} +{ 
+add.f16x2 r163, %15, %17; +} +{ +mul.f16x2 r166, r163, r645; +} +{ +add.f16x2 r169, r160, r166; +} +{ +sub.f16x2 r172, %26, %22; +} +{ +mul.f16x2 r175, r172, r678; +} +{ +sub.f16x2 r178, %14, %20; +} +{ +mul.f16x2 r181, r178, r561; +} +{ +add.f16x2 r184, r175, r181; +} +{ +sub.f16x2 r187, %16, %18; +} +{ +mul.f16x2 r190, r187, r669; +} +{ +add.f16x2 r193, r184, r190; +} +{ +sub.f16x2 %4, r169, r193; +} +{ +add.f16x2 r199, %23, %21; +} +{ +mul.f16x2 r202, r199, r654; +} +{ +add.f16x2 r205, %24, r202; +} +{ +add.f16x2 r208, %25, %19; +} +{ +mul.f16x2 r211, r208, r537; +} +{ +add.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %15, %17; +} +{ +mul.f16x2 r220, r217, r645; +} +{ +add.f16x2 r223, r214, r220; +} +{ +sub.f16x2 r226, %26, %22; +} +{ +mul.f16x2 r229, r226, r678; +} +{ +sub.f16x2 r232, %14, %20; +} +{ +mul.f16x2 r235, r232, r561; +} +{ +add.f16x2 r238, r229, r235; +} +{ +sub.f16x2 r241, %16, %18; +} +{ +mul.f16x2 r244, r241, r669; +} +{ +add.f16x2 r247, r238, r244; +} +{ +add.f16x2 %10, r223, r247; +} +{ +add.f16x2 r253, %23, %21; +} +{ +mul.f16x2 r256, r253, r636; +} +{ +add.f16x2 r259, %24, r256; +} +{ +add.f16x2 r262, %25, %19; +} +{ +mul.f16x2 r265, r262, r645; +} +{ +add.f16x2 r268, r259, r265; +} +{ +add.f16x2 r271, %15, %17; +} +{ +mul.f16x2 r274, r271, r654; +} +{ +add.f16x2 r277, r268, r274; +} +{ +sub.f16x2 r280, %26, %22; +} +{ +mul.f16x2 r283, r280, r663; +} +{ +sub.f16x2 r286, %14, %20; +} +{ +mul.f16x2 r289, r286, r669; +} +{ +add.f16x2 r292, r283, r289; +} +{ +sub.f16x2 r295, %16, %18; +} +{ +mul.f16x2 r298, r295, r678; +} +{ +add.f16x2 r301, r292, r298; +} +{ +sub.f16x2 %6, r277, r301; +} +{ +add.f16x2 r307, %23, %21; +} +{ +mul.f16x2 r310, r307, r636; +} +{ +add.f16x2 r313, %24, r310; +} +{ +add.f16x2 r316, %25, %19; +} +{ +mul.f16x2 r319, r316, r645; +} +{ +add.f16x2 r322, r313, r319; +} +{ +add.f16x2 r325, %15, %17; +} +{ +mul.f16x2 r328, r325, r654; +} +{ +add.f16x2 r331, r322, r328; +} +{ +sub.f16x2 r334, %26, %22; +} +{ +mul.f16x2 r337, 
r334, r663; +} +{ +sub.f16x2 r340, %14, %20; +} +{ +mul.f16x2 r343, r340, r669; +} +{ +add.f16x2 r346, r337, r343; +} +{ +sub.f16x2 r349, %16, %18; +} +{ +mul.f16x2 r352, r349, r678; +} +{ +add.f16x2 r355, r346, r352; +} +{ +add.f16x2 %8, r331, r355; +} +{ +add.f16x2 r361, %26, %22; +} +{ +mul.f16x2 r364, r361, r420; +} +{ +add.f16x2 r367, %27, r364; +} +{ +add.f16x2 r370, %14, %20; +} +{ +mul.f16x2 r373, r370, r654; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, %16, %18; +} +{ +mul.f16x2 r382, r379, r636; +} +{ +add.f16x2 r385, r376, r382; +} +{ +sub.f16x2 r388, %23, %21; +} +{ +mul.f16x2 r391, r388, r447; +} +{ +sub.f16x2 r394, %25, %19; +} +{ +mul.f16x2 r397, r394, r678; +} +{ +add.f16x2 r400, r391, r397; +} +{ +sub.f16x2 r403, %15, %17; +} +{ +mul.f16x2 r406, r403, r663; +} +{ +add.f16x2 r409, r400, r406; +} +{ +add.f16x2 %3, r385, r409; +} +{ +add.f16x2 r415, %26, %22; +} +{ +mul.f16x2 r418, r415, r420; +} +{ +add.f16x2 r421, %27, r418; +} +{ +add.f16x2 r424, %14, %20; +} +{ +mul.f16x2 r427, r424, r654; +} +{ +add.f16x2 r430, r421, r427; +} +{ +add.f16x2 r433, %16, %18; +} +{ +mul.f16x2 r436, r433, r636; +} +{ +add.f16x2 r439, r430, r436; +} +{ +sub.f16x2 r442, %23, %21; +} +{ +mul.f16x2 r445, r442, r447; +} +{ +sub.f16x2 r448, %25, %19; +} +{ +mul.f16x2 r451, r448, r678; +} +{ +add.f16x2 r454, r445, r451; +} +{ +sub.f16x2 r457, %15, %17; +} +{ +mul.f16x2 r460, r457, r663; +} +{ +add.f16x2 r463, r454, r460; +} +{ +sub.f16x2 %13, r439, r463; +} +{ +add.f16x2 r469, %26, %22; +} +{ +mul.f16x2 r472, r469, r654; +} +{ +add.f16x2 r475, %27, r472; +} +{ +add.f16x2 r478, %14, %20; +} +{ +mul.f16x2 r481, r478, r537; +} +{ +add.f16x2 r484, r475, r481; +} +{ +add.f16x2 r487, %16, %18; +} +{ +mul.f16x2 r490, r487, r645; +} +{ +add.f16x2 r493, r484, r490; +} +{ +sub.f16x2 r496, %23, %21; +} +{ +mul.f16x2 r499, r496, r678; +} +{ +sub.f16x2 r502, %25, %19; +} +{ +mul.f16x2 r505, r502, r561; +} +{ +add.f16x2 r508, r499, r505; +} +{ +sub.f16x2 r511, %15, %17; +} +{ 
+mul.f16x2 r514, r511, r669; +} +{ +add.f16x2 r517, r508, r514; +} +{ +add.f16x2 %5, r493, r517; +} +{ +add.f16x2 r523, %26, %22; +} +{ +mul.f16x2 r526, r523, r654; +} +{ +add.f16x2 r529, %27, r526; +} +{ +add.f16x2 r532, %14, %20; +} +{ +mul.f16x2 r535, r532, r537; +} +{ +add.f16x2 r538, r529, r535; +} +{ +add.f16x2 r541, %16, %18; +} +{ +mul.f16x2 r544, r541, r645; +} +{ +add.f16x2 r547, r538, r544; +} +{ +sub.f16x2 r550, %23, %21; +} +{ +mul.f16x2 r553, r550, r678; +} +{ +sub.f16x2 r556, %25, %19; +} +{ +mul.f16x2 r559, r556, r561; +} +{ +add.f16x2 r562, r553, r559; +} +{ +sub.f16x2 r565, %15, %17; +} +{ +mul.f16x2 r568, r565, r669; +} +{ +add.f16x2 r571, r562, r568; +} +{ +sub.f16x2 %11, r547, r571; +} +{ +add.f16x2 r577, %26, %22; +} +{ +mul.f16x2 r580, r577, r636; +} +{ +add.f16x2 r583, %27, r580; +} +{ +add.f16x2 r586, %14, %20; +} +{ +mul.f16x2 r589, r586, r645; +} +{ +add.f16x2 r592, r583, r589; +} +{ +add.f16x2 r595, %16, %18; +} +{ +mul.f16x2 r598, r595, r654; +} +{ +add.f16x2 r601, r592, r598; +} +{ +sub.f16x2 r604, %23, %21; +} +{ +mul.f16x2 r607, r604, r663; +} +{ +sub.f16x2 r610, %25, %19; +} +{ +mul.f16x2 r613, r610, r669; +} +{ +add.f16x2 r616, r607, r613; +} +{ +sub.f16x2 r619, %15, %17; +} +{ +mul.f16x2 r622, r619, r678; +} +{ +add.f16x2 r625, r616, r622; +} +{ +add.f16x2 %7, r601, r625; +} +{ +add.f16x2 r631, %26, %22; +} +{ +mul.f16x2 r634, r631, r636; +} +{ +add.f16x2 r637, %27, r634; +} +{ +add.f16x2 r640, %14, %20; +} +{ +mul.f16x2 r643, r640, r645; +} +{ +add.f16x2 r646, r637, r643; +} +{ +add.f16x2 r649, %16, %18; +} +{ +mul.f16x2 r652, r649, r654; +} +{ +add.f16x2 r655, r646, r652; +} +{ +sub.f16x2 r658, %23, %21; +} +{ +mul.f16x2 r661, r658, r663; +} +{ +sub.f16x2 r664, %25, %19; +} +{ +mul.f16x2 r667, r664, r669; +} +{ +add.f16x2 r670, r661, r667; +} +{ +sub.f16x2 r673, %15, %17; +} +{ +mul.f16x2 r676, r673, r678; +} +{ +add.f16x2 r679, r670, r676; +} +{ +sub.f16x2 %9, r655, r679; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)): "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[0].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..acf2ae96e4bdb --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_fwd.hpp.inc @@ -0,0 +1,104 @@ +#ifndef CUFFTDX_FFT_7_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_7_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<171, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<117>; +.reg .b64 rd<2>; +add.f32 f29, %16, %30; +add.f32 f30, %14, f29; +add.f32 f31, %19, %27; +add.f32 f32, f31, f30; +add.f32 f33, %22, %24; +add.f32 f34, %18, %31; +add.f32 f35, %15, f34; +add.f32 f36, %21, %29; +add.f32 f37, f36, f35; +add.f32 f38, %23, %26; +fma.rn.f32 f39, f29, 0f3F1F9D07, %14; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 
f43, f41, f42; +sub.f32 f44, %18, %31; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %21, %29; +mul.f32 f47, f46, 0fBF7994E0; +sub.f32 f48, f47, f45; +sub.f32 f49, %23, %26; +mul.f32 f50, f49, 0f3EDE2602; +sub.f32 f51, f48, f50; +mul.f32 f52, f29, 0f3E63DC87; +sub.f32 f53, %14, f52; +mul.f32 f54, f31, 0f3F66A5E5; +sub.f32 f55, f53, f54; +fma.rn.f32 f56, f33, 0f3F1F9D07, f55; +mul.f32 f57, f44, 0f3F7994E0; +mul.f32 f58, f46, 0f3EDE2602; +sub.f32 f59, f58, f57; +fma.rn.f32 f60, f49, 0f3F48261C, f59; +mul.f32 f61, f29, 0f3F66A5E5; +sub.f32 f62, %14, f61; +fma.rn.f32 f63, f31, 0f3F1F9D07, f62; +mul.f32 f64, f33, 0f3E63DC87; +sub.f32 f65, f63, f64; +mul.f32 f66, f44, 0f3EDE2602; +mul.f32 f67, f46, 0f3F48261C; +sub.f32 f68, f67, f66; +mul.f32 f69, f49, 0f3F7994E0; +sub.f32 f70, f68, f69; +fma.rn.f32 f71, f34, 0f3F1F9D07, %15; +mul.f32 f72, f36, 0f3E63DC87; +sub.f32 f73, f71, f72; +mul.f32 f74, f38, 0f3F66A5E5; +sub.f32 f75, f73, f74; +sub.f32 f76, %16, %30; +mul.f32 f77, f76, 0f3F48261C; +sub.f32 f78, %19, %27; +mul.f32 f79, f78, 0fBF7994E0; +sub.f32 f80, f79, f77; +sub.f32 f81, %22, %24; +mul.f32 f82, f81, 0f3EDE2602; +sub.f32 f83, f80, f82; +mul.f32 f84, f34, 0f3E63DC87; +sub.f32 f85, %15, f84; +mul.f32 f86, f36, 0f3F66A5E5; +sub.f32 f87, f85, f86; +fma.rn.f32 f88, f38, 0f3F1F9D07, f87; +mul.f32 f89, f76, 0f3F7994E0; +mul.f32 f90, f78, 0f3EDE2602; +sub.f32 f91, f90, f89; +fma.rn.f32 f92, f81, 0f3F48261C, f91; +mul.f32 f93, f34, 0f3F66A5E5; +sub.f32 f94, %15, f93; +fma.rn.f32 f95, f36, 0f3F1F9D07, f94; +mul.f32 f96, f38, 0f3E63DC87; +sub.f32 f97, f95, f96; +mul.f32 f98, f76, 0f3EDE2602; +mul.f32 f99, f78, 0f3F48261C; +sub.f32 f100, f99, f98; +mul.f32 f101, f81, 0f3F7994E0; +sub.f32 f102, f100, f101; +add.f32 %1, f38, f37; +add.f32 %0, f33, f32; +add.f32 %3, f83, f75; +sub.f32 %2, f43, f51; +add.f32 %5, f92, f88; +sub.f32 %4, f56, f60; +add.f32 %7, f102, f97; +sub.f32 %6, f65, f70; +sub.f32 %9, f97, f102; +add.f32 %8, f70, f65; +sub.f32 %11, f88, f92; +add.f32 %10, f60, 
f56; +sub.f32 %13, f75, f83; +add.f32 %12, f51, f43; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..61e07b20fe07a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp32_inv.hpp.inc @@ -0,0 +1,100 @@ +#ifndef CUFFTDX_FFT_7_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_7_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<373, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<113>; +.reg .b64 rd<2>; +add.f32 f29, %16, %30; +add.f32 f30, %14, f29; +add.f32 f31, %19, %27; +add.f32 f32, f31, f30; +add.f32 f33, %22, %24; +add.f32 f34, %18, %31; +add.f32 f35, %15, f34; +add.f32 f36, %21, %29; +add.f32 f37, f36, f35; +add.f32 f38, %23, %26; +fma.rn.f32 f39, f29, 0f3F1F9D07, %14; +mul.f32 f40, f31, 0f3E63DC87; +sub.f32 f41, f39, f40; +mul.f32 f42, f33, 0f3F66A5E5; +sub.f32 f43, f41, f42; +sub.f32 f44, %18, %31; +mul.f32 f45, f44, 0f3F48261C; +sub.f32 f46, %21, %29; +fma.rn.f32 f47, f46, 0f3F7994E0, f45; +sub.f32 f48, %23, %26; +fma.rn.f32 f49, f48, 0f3EDE2602, f47; +mul.f32 f50, f29, 0f3E63DC87; +sub.f32 f51, %14, f50; +mul.f32 
f52, f31, 0f3F66A5E5; +sub.f32 f53, f51, f52; +fma.rn.f32 f54, f33, 0f3F1F9D07, f53; +mul.f32 f55, f44, 0f3F7994E0; +mul.f32 f56, f46, 0f3EDE2602; +sub.f32 f57, f55, f56; +mul.f32 f58, f48, 0f3F48261C; +sub.f32 f59, f57, f58; +mul.f32 f60, f29, 0f3F66A5E5; +sub.f32 f61, %14, f60; +fma.rn.f32 f62, f31, 0f3F1F9D07, f61; +mul.f32 f63, f33, 0f3E63DC87; +sub.f32 f64, f62, f63; +mul.f32 f65, f44, 0f3EDE2602; +mul.f32 f66, f46, 0f3F48261C; +sub.f32 f67, f65, f66; +fma.rn.f32 f68, f48, 0f3F7994E0, f67; +fma.rn.f32 f69, f34, 0f3F1F9D07, %15; +mul.f32 f70, f36, 0f3E63DC87; +sub.f32 f71, f69, f70; +mul.f32 f72, f38, 0f3F66A5E5; +sub.f32 f73, f71, f72; +sub.f32 f74, %16, %30; +mul.f32 f75, f74, 0f3F48261C; +sub.f32 f76, %19, %27; +fma.rn.f32 f77, f76, 0f3F7994E0, f75; +sub.f32 f78, %22, %24; +fma.rn.f32 f79, f78, 0f3EDE2602, f77; +mul.f32 f80, f34, 0f3E63DC87; +sub.f32 f81, %15, f80; +mul.f32 f82, f36, 0f3F66A5E5; +sub.f32 f83, f81, f82; +fma.rn.f32 f84, f38, 0f3F1F9D07, f83; +mul.f32 f85, f74, 0f3F7994E0; +mul.f32 f86, f76, 0f3EDE2602; +sub.f32 f87, f85, f86; +mul.f32 f88, f78, 0f3F48261C; +sub.f32 f89, f87, f88; +mul.f32 f90, f34, 0f3F66A5E5; +sub.f32 f91, %15, f90; +fma.rn.f32 f92, f36, 0f3F1F9D07, f91; +mul.f32 f93, f38, 0f3E63DC87; +sub.f32 f94, f92, f93; +mul.f32 f95, f74, 0f3EDE2602; +mul.f32 f96, f76, 0f3F48261C; +sub.f32 f97, f95, f96; +fma.rn.f32 f98, f78, 0f3F7994E0, f97; +add.f32 %1, f38, f37; +add.f32 %0, f33, f32; +add.f32 %3, f79, f73; +sub.f32 %2, f43, f49; +add.f32 %5, f89, f84; +sub.f32 %4, f54, f59; +add.f32 %7, f98, f94; +sub.f32 %6, f64, f68; +sub.f32 %9, f94, f98; +add.f32 %8, f68, f64; +sub.f32 %11, f84, f89; +add.f32 %10, f59, f54; +sub.f32 %13, f73, f79; +add.f32 %12, f49, f43; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y): "f"(rmem[0].x), "f"(rmem[0].y), 
"f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..b1c297e52e826 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_fwd.hpp.inc @@ -0,0 +1,104 @@ +#ifndef CUFFTDX_FFT_7_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_7_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<546, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<117>; +.reg .b64 rd<2>; +add.f64 fd29, %16, %30; +add.f64 fd30, %14, fd29; +add.f64 fd31, %19, %27; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %22, %24; +add.f64 fd34, %18, %31; +add.f64 fd35, %15, fd34; +add.f64 fd36, %21, %29; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %23, %26; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %14; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %18, %31; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %21, %29; +mul.f64 fd47, fd46, 0dBFEF329C0558E969; +sub.f64 fd48, fd47, fd45; +sub.f64 fd49, %23, %26; +mul.f64 fd50, fd49, 0d3FDBC4C04D71ABC1; +sub.f64 fd51, fd48, fd50; +mul.f64 fd52, fd29, 0d3FCC7B90E3024582; +sub.f64 fd53, %14, fd52; +mul.f64 fd54, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd55, fd53, fd54; +fma.rn.f64 fd56, fd33, 0d3FE3F3A0E28BEDD1, fd55; +mul.f64 fd57, fd44, 0d3FEF329C0558E969; +mul.f64 fd58, fd46, 0d3FDBC4C04D71ABC1; 
+sub.f64 fd59, fd58, fd57; +fma.rn.f64 fd60, fd49, 0d3FE904C37505DE4B, fd59; +mul.f64 fd61, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd62, %14, fd61; +fma.rn.f64 fd63, fd31, 0d3FE3F3A0E28BEDD1, fd62; +mul.f64 fd64, fd33, 0d3FCC7B90E3024582; +sub.f64 fd65, fd63, fd64; +mul.f64 fd66, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd67, fd46, 0d3FE904C37505DE4B; +sub.f64 fd68, fd67, fd66; +mul.f64 fd69, fd49, 0d3FEF329C0558E969; +sub.f64 fd70, fd68, fd69; +fma.rn.f64 fd71, fd34, 0d3FE3F3A0E28BEDD1, %15; +mul.f64 fd72, fd36, 0d3FCC7B90E3024582; +sub.f64 fd73, fd71, fd72; +mul.f64 fd74, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd75, fd73, fd74; +sub.f64 fd76, %16, %30; +mul.f64 fd77, fd76, 0d3FE904C37505DE4B; +sub.f64 fd78, %19, %27; +mul.f64 fd79, fd78, 0dBFEF329C0558E969; +sub.f64 fd80, fd79, fd77; +sub.f64 fd81, %22, %24; +mul.f64 fd82, fd81, 0d3FDBC4C04D71ABC1; +sub.f64 fd83, fd80, fd82; +mul.f64 fd84, fd34, 0d3FCC7B90E3024582; +sub.f64 fd85, %15, fd84; +mul.f64 fd86, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd87, fd85, fd86; +fma.rn.f64 fd88, fd38, 0d3FE3F3A0E28BEDD1, fd87; +mul.f64 fd89, fd76, 0d3FEF329C0558E969; +mul.f64 fd90, fd78, 0d3FDBC4C04D71ABC1; +sub.f64 fd91, fd90, fd89; +fma.rn.f64 fd92, fd81, 0d3FE904C37505DE4B, fd91; +mul.f64 fd93, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd94, %15, fd93; +fma.rn.f64 fd95, fd36, 0d3FE3F3A0E28BEDD1, fd94; +mul.f64 fd96, fd38, 0d3FCC7B90E3024582; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd76, 0d3FDBC4C04D71ABC1; +mul.f64 fd99, fd78, 0d3FE904C37505DE4B; +sub.f64 fd100, fd99, fd98; +mul.f64 fd101, fd81, 0d3FEF329C0558E969; +sub.f64 fd102, fd100, fd101; +add.f64 %1, fd38, fd37; +add.f64 %0, fd33, fd32; +add.f64 %3, fd83, fd75; +sub.f64 %2, fd43, fd51; +add.f64 %5, fd92, fd88; +sub.f64 %4, fd56, fd60; +add.f64 %7, fd102, fd97; +sub.f64 %6, fd65, fd70; +sub.f64 %9, fd97, fd102; +add.f64 %8, fd70, fd65; +sub.f64 %11, fd88, fd92; +add.f64 %10, fd60, fd56; +sub.f64 %13, fd75, fd83; +add.f64 %12, fd51, fd43; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), 
"=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..2f322584713d7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_7_fp64_inv.hpp.inc @@ -0,0 +1,100 @@ +#ifndef CUFFTDX_FFT_7_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_7_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<717, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<113>; +.reg .b64 rd<2>; +add.f64 fd29, %16, %30; +add.f64 fd30, %14, fd29; +add.f64 fd31, %19, %27; +add.f64 fd32, fd31, fd30; +add.f64 fd33, %22, %24; +add.f64 fd34, %18, %31; +add.f64 fd35, %15, fd34; +add.f64 fd36, %21, %29; +add.f64 fd37, fd36, fd35; +add.f64 fd38, %23, %26; +fma.rn.f64 fd39, fd29, 0d3FE3F3A0E28BEDD1, %14; +mul.f64 fd40, fd31, 0d3FCC7B90E3024582; +sub.f64 fd41, fd39, fd40; +mul.f64 fd42, fd33, 0d3FECD4BCA9CB5C71; +sub.f64 fd43, fd41, fd42; +sub.f64 fd44, %18, %31; +mul.f64 fd45, fd44, 0d3FE904C37505DE4B; +sub.f64 fd46, %21, %29; +fma.rn.f64 fd47, fd46, 0d3FEF329C0558E969, fd45; +sub.f64 fd48, %23, %26; +fma.rn.f64 fd49, fd48, 0d3FDBC4C04D71ABC1, fd47; +mul.f64 fd50, fd29, 0d3FCC7B90E3024582; +sub.f64 fd51, %14, fd50; 
+mul.f64 fd52, fd31, 0d3FECD4BCA9CB5C71; +sub.f64 fd53, fd51, fd52; +fma.rn.f64 fd54, fd33, 0d3FE3F3A0E28BEDD1, fd53; +mul.f64 fd55, fd44, 0d3FEF329C0558E969; +mul.f64 fd56, fd46, 0d3FDBC4C04D71ABC1; +sub.f64 fd57, fd55, fd56; +mul.f64 fd58, fd48, 0d3FE904C37505DE4B; +sub.f64 fd59, fd57, fd58; +mul.f64 fd60, fd29, 0d3FECD4BCA9CB5C71; +sub.f64 fd61, %14, fd60; +fma.rn.f64 fd62, fd31, 0d3FE3F3A0E28BEDD1, fd61; +mul.f64 fd63, fd33, 0d3FCC7B90E3024582; +sub.f64 fd64, fd62, fd63; +mul.f64 fd65, fd44, 0d3FDBC4C04D71ABC1; +mul.f64 fd66, fd46, 0d3FE904C37505DE4B; +sub.f64 fd67, fd65, fd66; +fma.rn.f64 fd68, fd48, 0d3FEF329C0558E969, fd67; +fma.rn.f64 fd69, fd34, 0d3FE3F3A0E28BEDD1, %15; +mul.f64 fd70, fd36, 0d3FCC7B90E3024582; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd38, 0d3FECD4BCA9CB5C71; +sub.f64 fd73, fd71, fd72; +sub.f64 fd74, %16, %30; +mul.f64 fd75, fd74, 0d3FE904C37505DE4B; +sub.f64 fd76, %19, %27; +fma.rn.f64 fd77, fd76, 0d3FEF329C0558E969, fd75; +sub.f64 fd78, %22, %24; +fma.rn.f64 fd79, fd78, 0d3FDBC4C04D71ABC1, fd77; +mul.f64 fd80, fd34, 0d3FCC7B90E3024582; +sub.f64 fd81, %15, fd80; +mul.f64 fd82, fd36, 0d3FECD4BCA9CB5C71; +sub.f64 fd83, fd81, fd82; +fma.rn.f64 fd84, fd38, 0d3FE3F3A0E28BEDD1, fd83; +mul.f64 fd85, fd74, 0d3FEF329C0558E969; +mul.f64 fd86, fd76, 0d3FDBC4C04D71ABC1; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd78, 0d3FE904C37505DE4B; +sub.f64 fd89, fd87, fd88; +mul.f64 fd90, fd34, 0d3FECD4BCA9CB5C71; +sub.f64 fd91, %15, fd90; +fma.rn.f64 fd92, fd36, 0d3FE3F3A0E28BEDD1, fd91; +mul.f64 fd93, fd38, 0d3FCC7B90E3024582; +sub.f64 fd94, fd92, fd93; +mul.f64 fd95, fd74, 0d3FDBC4C04D71ABC1; +mul.f64 fd96, fd76, 0d3FE904C37505DE4B; +sub.f64 fd97, fd95, fd96; +fma.rn.f64 fd98, fd78, 0d3FEF329C0558E969, fd97; +add.f64 %1, fd38, fd37; +add.f64 %0, fd33, fd32; +add.f64 %3, fd79, fd73; +sub.f64 %2, fd43, fd49; +add.f64 %5, fd89, fd84; +sub.f64 %4, fd54, fd59; +add.f64 %7, fd98, fd94; +sub.f64 %6, fd64, fd68; +sub.f64 %9, fd94, fd98; +add.f64 %8, fd68, fd64; 
+sub.f64 %11, fd84, fd89; +add.f64 %10, fd59, fd54; +sub.f64 %13, fd73, fd79; +add.f64 %12, fd49, fd43; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..1e01e2d446120 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_fwd.hpp.inc @@ -0,0 +1,31188 @@ +#ifndef CUFFTDX_FFT_8192_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_8192_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<856, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<965>; +.reg .b32 r<6595>; +.reg .b64 rd<3>; +mov.u32 r6509, %tid.y; +shl.b32 r6510, r6509, 16; +mov.u32 r6511, %64; +add.s32 r6512, r6511, r6510; +mov.u32 r6513, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %109, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %109, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; 
+} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f930, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r101, {low, high}; +} +mov.f32 f940, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ 
+add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %107; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %107; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 
r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, 
r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %106, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %106, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} 
+{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %108, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %108, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, 
r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; 
+cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, 
r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; 
+mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ 
+neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 
r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6515, r6513, 8; +and.b32 r6516, r6515, -65536; +add.s32 r6517, r6512, r6516; +and.b32 r6530, r6513, 255; +cvt.rn.f32.u32 f957, r6530; +mul.f32 f958, f957, 0f3A490FDB; +cos.approx.f32 f357, 
f958; +sin.approx.f32 f959, f958; +neg.f32 f358, f959; +mov.f32 f964, 0f3F800000; +mov.f32 f963, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ 
+mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, 
low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ +fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, 
r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, 
r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; 
+mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, 
r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6518, r6515, 65280; +add.s32 r6519, r6517, r6518; +st.shared.v4.f32 [r6519], {r1521, r1524, r1725, r1732}; +st.shared.v4.f32 [r6519+16], {r1762, r1769, r1799, r1806}; +st.shared.v4.f32 [r6519+32], {r1836, r1843, r1873, r1880}; +st.shared.v4.f32 [r6519+48], {r1910, r1917, r1947, r1954}; +st.shared.v4.f32 [r6519+64], {r1984, r1991, r2021, r2028}; +st.shared.v4.f32 [r6519+80], {r2058, r2065, r2095, r2102}; +st.shared.v4.f32 [r6519+96], {r2132, r2139, r2169, r2176}; +st.shared.v4.f32 [r6519+112], {r2206, r2213, r2243, r2250}; +st.shared.v4.f32 [r6519+128], {r2280, r2287, r2317, r2324}; +st.shared.v4.f32 [r6519+144], {r2354, r2361, r2391, r2398}; +st.shared.v4.f32 [r6519+160], {r2428, r2435, r2465, r2472}; +st.shared.v4.f32 [r6519+176], {r2502, r2509, r2539, r2546}; +st.shared.v4.f32 [r6519+192], {r2576, r2583, r2613, r2620}; +st.shared.v4.f32 [r6519+208], {r2650, r2657, r2687, r2694}; +st.shared.v4.f32 [r6519+224], {r2724, r2731, r2761, r2768}; +st.shared.v4.f32 [r6519+240], {r2798, r2805, r2835, r2842}; +barrier.sync 0; +mad.lo.s32 r6520, r6530, -248, r6519; +ld.shared.u32 r2864, [r6520]; +ld.shared.u32 r2867, [r6520+4]; +ld.shared.u32 r3480, [r6520+2048]; +ld.shared.u32 r3483, [r6520+2052]; +ld.shared.u32 r3060, [r6520+4096]; +ld.shared.u32 r3063, 
[r6520+4100]; +ld.shared.u32 r3676, [r6520+6144]; +ld.shared.u32 r3679, [r6520+6148]; +ld.shared.u32 r2914, [r6520+8192]; +ld.shared.u32 r2917, [r6520+8196]; +ld.shared.u32 r3530, [r6520+10240]; +ld.shared.u32 r3533, [r6520+10244]; +ld.shared.u32 r3110, [r6520+12288]; +ld.shared.u32 r3113, [r6520+12292]; +ld.shared.u32 r3726, [r6520+14336]; +ld.shared.u32 r3729, [r6520+14340]; +ld.shared.u32 r2876, [r6520+16384]; +ld.shared.u32 r2879, [r6520+16388]; +ld.shared.u32 r3492, [r6520+18432]; +ld.shared.u32 r3495, [r6520+18436]; +ld.shared.u32 r3072, [r6520+20480]; +ld.shared.u32 r3075, [r6520+20484]; +ld.shared.u32 r3688, [r6520+22528]; +ld.shared.u32 r3691, [r6520+22532]; +ld.shared.u32 r2926, [r6520+24576]; +ld.shared.u32 r2929, [r6520+24580]; +ld.shared.u32 r3542, [r6520+26624]; +ld.shared.u32 r3545, [r6520+26628]; +ld.shared.u32 r3122, [r6520+28672]; +ld.shared.u32 r3125, [r6520+28676]; +ld.shared.u32 r3738, [r6520+30720]; +ld.shared.u32 r3741, [r6520+30724]; +ld.shared.u32 r2865, [r6520+32768]; +ld.shared.u32 r2868, [r6520+32772]; +ld.shared.u32 r3481, [r6520+34816]; +ld.shared.u32 r3484, [r6520+34820]; +ld.shared.u32 r3061, [r6520+36864]; +ld.shared.u32 r3064, [r6520+36868]; +ld.shared.u32 r3677, [r6520+38912]; +ld.shared.u32 r3680, [r6520+38916]; +ld.shared.u32 r2915, [r6520+40960]; +ld.shared.u32 r2918, [r6520+40964]; +ld.shared.u32 r3531, [r6520+43008]; +ld.shared.u32 r3534, [r6520+43012]; +ld.shared.u32 r3111, [r6520+45056]; +ld.shared.u32 r3114, [r6520+45060]; +ld.shared.u32 r3727, [r6520+47104]; +ld.shared.u32 r3730, [r6520+47108]; +ld.shared.u32 r2877, [r6520+49152]; +ld.shared.u32 r2880, [r6520+49156]; +ld.shared.u32 r3493, [r6520+51200]; +ld.shared.u32 r3496, [r6520+51204]; +ld.shared.u32 r3073, [r6520+53248]; +ld.shared.u32 r3076, [r6520+53252]; +ld.shared.u32 r3689, [r6520+55296]; +ld.shared.u32 r3692, [r6520+55300]; +ld.shared.u32 r2927, [r6520+57344]; +ld.shared.u32 r2930, [r6520+57348]; +ld.shared.u32 r3543, [r6520+59392]; +ld.shared.u32 r3546, 
[r6520+59396]; +ld.shared.u32 r3123, [r6520+61440]; +ld.shared.u32 r3126, [r6520+61444]; +ld.shared.u32 r3739, [r6520+63488]; +ld.shared.u32 r3742, [r6520+63492]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; 
+} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, 
r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; 
+mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 
r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, 
r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ 
+add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3780, {low, high}; +} +{ 
+mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 
high, f940; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ 
+sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, 
r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; 
+} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 
r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6522, r6513, 5, 3; +cvt.rn.f32.u32 f960, r6522; +mul.f32 f961, f960, 0f3CC90FDB; +cos.approx.f32 f779, f961; +sin.approx.f32 f962, f961; +neg.f32 f780, f962; +and.b32 r6529, r6513, 224; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 
r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, 
r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; 
+mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, 
r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 
r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 
r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ 
+mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; +} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 
r5704, r4572, r5688, r5701; +} +shl.b32 r6523, r6513, 3; +and.b32 r6524, r6523, 248; +add.s32 r6525, r6517, r6524; +barrier.sync 0; +and.b32 r6526, r6515, 57344; +add.s32 r6527, r6525, r6526; +st.shared.u32 [r6527], r4383; +st.shared.u32 [r6527+4], r4386; +st.shared.u32 [r6527+256], r4587; +st.shared.u32 [r6527+260], r4594; +st.shared.u32 [r6527+512], r4624; +st.shared.u32 [r6527+516], r4631; +st.shared.u32 [r6527+768], r4661; +st.shared.u32 [r6527+772], r4668; +st.shared.u32 [r6527+1024], r4698; +st.shared.u32 [r6527+1028], r4705; +st.shared.u32 [r6527+1280], r4735; +st.shared.u32 [r6527+1284], r4742; +st.shared.u32 [r6527+1536], r4772; +st.shared.u32 [r6527+1540], r4779; +st.shared.u32 [r6527+1792], r4809; +st.shared.u32 [r6527+1796], r4816; +st.shared.u32 [r6527+2048], r4846; +st.shared.u32 [r6527+2052], r4853; +st.shared.u32 [r6527+2304], r4883; +st.shared.u32 [r6527+2308], r4890; +st.shared.u32 [r6527+2560], r4920; +st.shared.u32 [r6527+2564], r4927; +st.shared.u32 [r6527+2816], r4957; +st.shared.u32 [r6527+2820], r4964; +st.shared.u32 [r6527+3072], r4994; +st.shared.u32 [r6527+3076], r5001; +st.shared.u32 [r6527+3328], r5031; +st.shared.u32 [r6527+3332], r5038; +st.shared.u32 [r6527+3584], r5068; +st.shared.u32 [r6527+3588], r5075; +st.shared.u32 [r6527+3840], r5105; +st.shared.u32 [r6527+3844], r5112; +st.shared.u32 [r6527+4096], r5142; +st.shared.u32 [r6527+4100], r5149; +st.shared.u32 [r6527+4352], r5179; +st.shared.u32 [r6527+4356], r5186; +st.shared.u32 [r6527+4608], r5216; +st.shared.u32 [r6527+4612], r5223; +st.shared.u32 [r6527+4864], r5253; +st.shared.u32 [r6527+4868], r5260; +st.shared.u32 [r6527+5120], r5290; +st.shared.u32 [r6527+5124], r5297; +st.shared.u32 [r6527+5376], r5327; +st.shared.u32 [r6527+5380], r5334; +st.shared.u32 [r6527+5632], r5364; +st.shared.u32 [r6527+5636], r5371; +st.shared.u32 [r6527+5888], r5401; +st.shared.u32 [r6527+5892], r5408; +st.shared.u32 [r6527+6144], r5438; +st.shared.u32 [r6527+6148], r5445; +st.shared.u32 
[r6527+6400], r5475; +st.shared.u32 [r6527+6404], r5482; +st.shared.u32 [r6527+6656], r5512; +st.shared.u32 [r6527+6660], r5519; +st.shared.u32 [r6527+6912], r5549; +st.shared.u32 [r6527+6916], r5556; +st.shared.u32 [r6527+7168], r5586; +st.shared.u32 [r6527+7172], r5593; +st.shared.u32 [r6527+7424], r5623; +st.shared.u32 [r6527+7428], r5630; +st.shared.u32 [r6527+7680], r5660; +st.shared.u32 [r6527+7684], r5667; +st.shared.u32 [r6527+7936], r5697; +st.shared.u32 [r6527+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6528, r6529, -248, r6527; +ld.shared.u32 r5726, [r6528]; +ld.shared.u32 r5729, [r6528+4]; +ld.shared.u32 r5922, [r6528+2048]; +ld.shared.u32 r5925, [r6528+2052]; +ld.shared.u32 r6118, [r6528+4096]; +ld.shared.u32 r6121, [r6528+4100]; +ld.shared.u32 r6314, [r6528+6144]; +ld.shared.u32 r6317, [r6528+6148]; +ld.shared.u32 r5776, [r6528+8192]; +ld.shared.u32 r5779, [r6528+8196]; +ld.shared.u32 r5972, [r6528+10240]; +ld.shared.u32 r5975, [r6528+10244]; +ld.shared.u32 r6168, [r6528+12288]; +ld.shared.u32 r6171, [r6528+12292]; +ld.shared.u32 r6364, [r6528+14336]; +ld.shared.u32 r6367, [r6528+14340]; +ld.shared.u32 r5738, [r6528+16384]; +ld.shared.u32 r5741, [r6528+16388]; +ld.shared.u32 r5934, [r6528+18432]; +ld.shared.u32 r5937, [r6528+18436]; +ld.shared.u32 r6130, [r6528+20480]; +ld.shared.u32 r6133, [r6528+20484]; +ld.shared.u32 r6326, [r6528+22528]; +ld.shared.u32 r6329, [r6528+22532]; +ld.shared.u32 r5788, [r6528+24576]; +ld.shared.u32 r5791, [r6528+24580]; +ld.shared.u32 r5984, [r6528+26624]; +ld.shared.u32 r5987, [r6528+26628]; +ld.shared.u32 r6180, [r6528+28672]; +ld.shared.u32 r6183, [r6528+28676]; +ld.shared.u32 r6376, [r6528+30720]; +ld.shared.u32 r6379, [r6528+30724]; +ld.shared.u32 r5727, [r6528+32768]; +ld.shared.u32 r5730, [r6528+32772]; +ld.shared.u32 r5923, [r6528+34816]; +ld.shared.u32 r5926, [r6528+34820]; +ld.shared.u32 r6119, [r6528+36864]; +ld.shared.u32 r6122, [r6528+36868]; +ld.shared.u32 r6315, [r6528+38912]; +ld.shared.u32 r6318, 
[r6528+38916]; +ld.shared.u32 r5777, [r6528+40960]; +ld.shared.u32 r5780, [r6528+40964]; +ld.shared.u32 r5973, [r6528+43008]; +ld.shared.u32 r5976, [r6528+43012]; +ld.shared.u32 r6169, [r6528+45056]; +ld.shared.u32 r6172, [r6528+45060]; +ld.shared.u32 r6365, [r6528+47104]; +ld.shared.u32 r6368, [r6528+47108]; +ld.shared.u32 r5739, [r6528+49152]; +ld.shared.u32 r5742, [r6528+49156]; +ld.shared.u32 r5935, [r6528+51200]; +ld.shared.u32 r5938, [r6528+51204]; +ld.shared.u32 r6131, [r6528+53248]; +ld.shared.u32 r6134, [r6528+53252]; +ld.shared.u32 r6327, [r6528+55296]; +ld.shared.u32 r6330, [r6528+55300]; +ld.shared.u32 r5789, [r6528+57344]; +ld.shared.u32 r5792, [r6528+57348]; +ld.shared.u32 r5985, [r6528+59392]; +ld.shared.u32 r5988, [r6528+59396]; +ld.shared.u32 r6181, [r6528+61440]; +ld.shared.u32 r6184, [r6528+61444]; +ld.shared.u32 r6377, [r6528+63488]; +ld.shared.u32 r6380, [r6528+63492]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5746; +} +{ +add.f16x2 r5766, r5734, r5749; +} +{ +sub.f16x2 r5769, r5731, r5746; +} +{ +sub.f16x2 r5772, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ 
+sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5796; +} +{ +add.f16x2 r5816, r5784, r5799; +} +{ +sub.f16x2 r5819, r5781, r5796; +} +{ +sub.f16x2 r5822, r5784, r5799; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5807; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 %0, r5751, r5801; +} +{ +add.f16x2 %1, r5754, r5804; +} +{ +sub.f16x2 %32, r5751, r5801; +} +{ +sub.f16x2 %33, r5754, r5804; +} +{ +add.f16x2 %8, r5763, r5845; +} +{ +add.f16x2 %9, r5766, r5851; +} +{ +sub.f16x2 %40, r5763, r5845; +} +{ +sub.f16x2 %41, r5766, r5851; +} +{ +add.f16x2 %16, r5757, r5810; +} +{ +add.f16x2 %17, r5760, r5855; +} +{ +sub.f16x2 %48, r5757, r5810; +} +{ +sub.f16x2 %49, r5760, r5855; +} +{ +add.f16x2 %24, r5769, r5863; +} +{ +add.f16x2 %25, r5772, r5869; +} +{ +sub.f16x2 %56, r5769, r5863; +} +{ +sub.f16x2 %57, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5939; +} +{ +add.f16x2 r5947, 
r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5942; +} +{ +add.f16x2 r5962, r5930, r5945; +} +{ +sub.f16x2 r5965, r5927, r5942; +} +{ +sub.f16x2 r5968, r5930, r5945; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5989; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5992; +} +{ +add.f16x2 r6012, r5980, r5995; +} +{ +sub.f16x2 r6015, r5977, r5992; +} +{ +sub.f16x2 r6018, r5980, r5995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6003; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 %2, r5947, r5997; +} +{ +add.f16x2 %3, r5950, r6000; +} +{ +sub.f16x2 %34, r5947, r5997; +} +{ +sub.f16x2 %35, r5950, r6000; +} +{ +add.f16x2 %10, r5959, r6041; +} +{ +add.f16x2 %11, 
r5962, r6047; +} +{ +sub.f16x2 %42, r5959, r6041; +} +{ +sub.f16x2 %43, r5962, r6047; +} +{ +add.f16x2 %18, r5953, r6006; +} +{ +add.f16x2 %19, r5956, r6051; +} +{ +sub.f16x2 %50, r5953, r6006; +} +{ +sub.f16x2 %51, r5956, r6051; +} +{ +add.f16x2 %26, r5965, r6059; +} +{ +add.f16x2 %27, r5968, r6065; +} +{ +sub.f16x2 %58, r5965, r6059; +} +{ +sub.f16x2 %59, r5968, r6065; +} +{ +add.f16x2 r6117, r6118, r6119; +} +{ +add.f16x2 r6120, r6121, r6122; +} +{ +sub.f16x2 r6123, r6118, r6119; +} +{ +sub.f16x2 r6126, r6121, r6122; +} +{ +add.f16x2 r6129, r6130, r6131; +} +{ +add.f16x2 r6132, r6133, r6134; +} +{ +sub.f16x2 r6135, r6130, r6131; +} +{ +sub.f16x2 r6138, r6133, r6134; +} +{ +neg.f16x2 r6141, r6135; +} +{ +add.f16x2 r6143, r6117, r6129; +} +{ +add.f16x2 r6146, r6120, r6132; +} +{ +sub.f16x2 r6149, r6117, r6129; +} +{ +sub.f16x2 r6152, r6120, r6132; +} +{ +add.f16x2 r6155, r6123, r6138; +} +{ +add.f16x2 r6158, r6126, r6141; +} +{ +sub.f16x2 r6161, r6123, r6138; +} +{ +sub.f16x2 r6164, r6126, r6141; +} +{ +add.f16x2 r6167, r6168, r6169; +} +{ +add.f16x2 r6170, r6171, r6172; +} +{ +sub.f16x2 r6173, r6168, r6169; +} +{ +sub.f16x2 r6176, r6171, r6172; +} +{ +add.f16x2 r6179, r6180, r6181; +} +{ +add.f16x2 r6182, r6183, r6184; +} +{ +sub.f16x2 r6185, r6180, r6181; +} +{ +sub.f16x2 r6188, r6183, r6184; +} +{ +neg.f16x2 r6191, r6185; +} +{ +add.f16x2 r6193, r6167, r6179; +} +{ +add.f16x2 r6196, r6170, r6182; +} +{ +sub.f16x2 r6199, r6167, r6179; +} +{ +sub.f16x2 r6202, r6170, r6182; +} +{ +add.f16x2 r6205, r6173, r6188; +} +{ +add.f16x2 r6208, r6176, r6191; +} +{ +sub.f16x2 r6211, r6173, r6188; +} +{ +sub.f16x2 r6214, r6176, r6191; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6218, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6221, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6222, {low, high}; +} +{ +mul.f16x2 r6231, r6205, r6217; +} +{ +mul.f16x2 r6234, r6208, r6218; +} +{ +sub.f16x2 r6237, r6231, r6234; +} +{ +mul.f16x2 r6240, r6205, r6218; +} +{ +fma.rn.f16x2 r6243, r6208, r6217, r6240; +} +{ +neg.f16x2 r6247, r6199; +} +{ +mul.f16x2 r6249, r6211, r6221; +} +{ +mul.f16x2 r6252, r6214, r6222; +} +{ +sub.f16x2 r6255, r6249, r6252; +} +{ +mul.f16x2 r6258, r6211, r6222; +} +{ +fma.rn.f16x2 r6261, r6214, r6221, r6258; +} +{ +add.f16x2 %4, r6143, r6193; +} +{ +add.f16x2 %5, r6146, r6196; +} +{ +sub.f16x2 %36, r6143, r6193; +} +{ +sub.f16x2 %37, r6146, r6196; +} +{ +add.f16x2 %12, r6155, r6237; +} +{ +add.f16x2 %13, r6158, r6243; +} +{ +sub.f16x2 %44, r6155, r6237; +} +{ +sub.f16x2 %45, r6158, r6243; +} +{ +add.f16x2 %20, r6149, r6202; +} +{ +add.f16x2 %21, r6152, r6247; +} +{ +sub.f16x2 %52, r6149, r6202; +} +{ +sub.f16x2 %53, r6152, r6247; +} +{ +add.f16x2 %28, r6161, r6255; +} +{ +add.f16x2 %29, r6164, r6261; +} +{ +sub.f16x2 %60, r6161, r6255; +} +{ +sub.f16x2 %61, r6164, r6261; +} +{ +add.f16x2 r6313, r6314, r6315; +} +{ +add.f16x2 r6316, r6317, r6318; +} +{ +sub.f16x2 r6319, r6314, r6315; +} +{ +sub.f16x2 r6322, r6317, r6318; +} +{ +add.f16x2 r6325, r6326, r6327; +} +{ +add.f16x2 r6328, r6329, r6330; +} +{ +sub.f16x2 r6331, r6326, r6327; +} +{ +sub.f16x2 r6334, r6329, r6330; +} +{ +neg.f16x2 r6337, r6331; +} +{ +add.f16x2 r6339, r6313, r6325; +} +{ +add.f16x2 r6342, r6316, r6328; +} +{ +sub.f16x2 r6345, r6313, r6325; +} +{ +sub.f16x2 r6348, r6316, r6328; +} +{ +add.f16x2 r6351, r6319, r6334; +} +{ +add.f16x2 r6354, r6322, r6337; +} +{ +sub.f16x2 r6357, r6319, r6334; +} +{ +sub.f16x2 r6360, r6322, r6337; +} +{ +add.f16x2 r6363, r6364, r6365; +} +{ +add.f16x2 r6366, r6367, r6368; +} +{ +sub.f16x2 r6369, r6364, r6365; +} +{ +sub.f16x2 r6372, r6367, r6368; +} +{ +add.f16x2 r6375, r6376, r6377; +} +{ +add.f16x2 r6378, r6379, r6380; +} +{ +sub.f16x2 r6381, 
r6376, r6377; +} +{ +sub.f16x2 r6384, r6379, r6380; +} +{ +neg.f16x2 r6387, r6381; +} +{ +add.f16x2 r6389, r6363, r6375; +} +{ +add.f16x2 r6392, r6366, r6378; +} +{ +sub.f16x2 r6395, r6363, r6375; +} +{ +sub.f16x2 r6398, r6366, r6378; +} +{ +add.f16x2 r6401, r6369, r6384; +} +{ +add.f16x2 r6404, r6372, r6387; +} +{ +sub.f16x2 r6407, r6369, r6384; +} +{ +sub.f16x2 r6410, r6372, r6387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6413, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6418, {low, high}; +} +{ +mul.f16x2 r6427, r6401, r6413; +} +{ +mul.f16x2 r6430, r6404, r6414; +} +{ +sub.f16x2 r6433, r6427, r6430; +} +{ +mul.f16x2 r6436, r6401, r6414; +} +{ +fma.rn.f16x2 r6439, r6404, r6413, r6436; +} +{ +neg.f16x2 r6443, r6395; +} +{ +mul.f16x2 r6445, r6407, r6417; +} +{ +mul.f16x2 r6448, r6410, r6418; +} +{ +sub.f16x2 r6451, r6445, r6448; +} +{ +mul.f16x2 r6454, r6407, r6418; +} +{ +fma.rn.f16x2 r6457, r6410, r6417, r6454; +} +{ +add.f16x2 %6, r6339, r6389; +} +{ +add.f16x2 %7, r6342, r6392; +} +{ +sub.f16x2 %38, r6339, r6389; +} +{ +sub.f16x2 %39, r6342, r6392; +} +{ +add.f16x2 %14, r6351, r6433; +} +{ +add.f16x2 %15, r6354, r6439; +} +{ +sub.f16x2 %46, r6351, r6433; +} +{ +sub.f16x2 %47, r6354, r6439; +} +{ +add.f16x2 %22, r6345, r6398; +} +{ +add.f16x2 %23, r6348, r6443; +} +{ +sub.f16x2 %54, r6345, r6398; +} +{ +sub.f16x2 %55, r6348, r6443; +} +{ +add.f16x2 %30, r6357, r6451; +} +{ +add.f16x2 %31, r6360, r6457; +} +{ +sub.f16x2 %62, r6357, r6451; +} +{ +sub.f16x2 %63, r6360, r6457; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), 
"=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), 
"r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<857, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<965>; +.reg .b32 r<6594>; +.reg .b64 rd<3>; +mov.u32 r6509, %tid.y; +shl.b32 r6510, r6509, 15; +mov.u32 r6511, %64; +add.s32 r6512, r6511, r6510; +mov.u32 r6513, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %109, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %109, %101; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f930, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r101, {low, high}; +} +mov.f32 f940, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, 
r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %107; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %107; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ 
+sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f660, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r396, {low, high}; +} +mov.f32 f676, 0f3EC3EF15; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r397, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; 
+} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r635; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r638; +} +{ +add.f16x2 r658, r626, r641; +} +{ +sub.f16x2 r661, r623, r638; +} +{ +sub.f16x2 r664, r626, r641; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %106, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 
r688, %106, %99; +} +{ +neg.f16x2 r691, r685; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r688; +} +{ +add.f16x2 r708, r676, r691; +} +{ +sub.f16x2 r711, r673, r688; +} +{ +sub.f16x2 r714, r676, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r699; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r702; +} +{ +add.f16x2 r792, r652, r747; +} +{ +sub.f16x2 r795, r649, r702; +} +{ +sub.f16x2 r798, r652, r747; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %108, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %108, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; 
+} +{ +neg.f16x2 r837, r831; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r834; +} +{ +add.f16x2 r854, r822, r837; +} +{ +sub.f16x2 r857, r819, r834; +} +{ +sub.f16x2 r860, r822, r837; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r881; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r884; +} +{ +add.f16x2 r904, r872, r887; +} +{ +sub.f16x2 r907, r869, r884; +} +{ +sub.f16x2 r910, r872, r887; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r895; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, 
r854, r939; +} +{ +add.f16x2 r985, r845, r898; +} +{ +add.f16x2 r988, r848, r943; +} +{ +sub.f16x2 r991, r845, r898; +} +{ +sub.f16x2 r994, r848, r943; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ 
+fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r967; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r970; +} +{ +add.f16x2 r1188, r774, r1087; +} +{ +sub.f16x2 r1191, r771, r970; +} +{ +sub.f16x2 r1194, r774, r1087; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, 
r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f656, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f664, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1240, {low, high}; +} +mov.f32 f672, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r1241, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1244, {low, high}; +} +mov.f32 f680, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r1245, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1246, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; 
+cvt.rn.f16.f32 high, f712; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, 
r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1143; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ 
+fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1146; +} +{ +add.f16x2 r1620, r530, r1407; +} +{ +sub.f16x2 r1623, r527, r1146; +} +{ +sub.f16x2 r1626, r530, r1407; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 
r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6515, r6513, 7; +and.b32 r6516, r6515, -32768; +add.s32 r6517, r6512, r6516; +and.b32 r6529, r6513, 255; +cvt.rn.f32.u32 f957, r6529; +mul.f32 f958, f957, 0f3A490FDB; +cos.approx.f32 f357, f958; +sin.approx.f32 f959, f958; +neg.f32 f358, f959; +mov.f32 f964, 0f3F800000; +mov.f32 f963, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +neg.f16x2 r1723, r1720; +} +{ +fma.rn.f16x2 r1725, r1533, r1716, r1723; +} +{ +mul.f16x2 r1729, r1533, r1718; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +neg.f16x2 r1760, r1757; +} +{ +fma.rn.f16x2 r1762, r1545, r1753, r1760; +} +{ +mul.f16x2 r1766, r1545, r1755; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +neg.f16x2 r1797, r1794; +} +{ +fma.rn.f16x2 r1799, r1557, r1790, r1797; +} +{ +mul.f16x2 r1803, r1557, r1792; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +neg.f16x2 r1834, r1831; +} +{ +fma.rn.f16x2 r1836, r1569, r1827, r1834; +} +{ +mul.f16x2 r1840, r1569, r1829; +} +{ +fma.rn.f16x2 r1843, 
r1572, r1827, r1840; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +neg.f16x2 r1871, r1868; +} +{ +fma.rn.f16x2 r1873, r1581, r1864, r1871; +} +{ +mul.f16x2 r1877, r1581, r1866; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1877; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +neg.f16x2 r1908, r1905; +} +{ +fma.rn.f16x2 r1910, r1593, r1901, r1908; +} +{ +mul.f16x2 r1914, r1593, r1903; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1914; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; 
+cvt.rn.f16.f32 high, f964; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +neg.f16x2 r1945, r1942; +} +{ +fma.rn.f16x2 r1947, r1605, r1938, r1945; +} +{ +mul.f16x2 r1951, r1605, r1940; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1951; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +neg.f16x2 r1982, r1979; +} +{ +fma.rn.f16x2 r1984, r1617, r1975, r1982; +} +{ +mul.f16x2 r1988, r1617, r1977; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, 
r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +neg.f16x2 r2019, r2016; +} +{ +fma.rn.f16x2 r2021, r1629, r2012, r2019; +} +{ +mul.f16x2 r2025, r1629, r2014; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +neg.f16x2 r2056, r2053; +} +{ +fma.rn.f16x2 r2058, r1641, r2049, r2056; +} +{ +mul.f16x2 r2062, r1641, r2051; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +neg.f16x2 r2093, r2090; +} +{ 
+fma.rn.f16x2 r2095, r1653, r2086, r2093; +} +{ +mul.f16x2 r2099, r1653, r2088; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +neg.f16x2 r2130, r2127; +} +{ +fma.rn.f16x2 r2132, r1665, r2123, r2130; +} +{ +mul.f16x2 r2136, r1665, r2125; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +neg.f16x2 r2167, r2164; +} +{ +fma.rn.f16x2 r2169, r1677, r2160, r2167; +} +{ +mul.f16x2 r2173, r1677, r2162; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +neg.f16x2 r2204, r2201; +} +{ +fma.rn.f16x2 r2206, r1689, r2197, r2204; +} +{ +mul.f16x2 r2210, r1689, r2199; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +neg.f16x2 r2241, r2238; +} +{ +fma.rn.f16x2 r2243, r1701, r2234, r2241; +} +{ +mul.f16x2 r2247, r1701, r2236; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +neg.f16x2 r2278, r2275; +} +{ +fma.rn.f16x2 r2280, r1527, r2271, r2278; +} +{ +mul.f16x2 r2284, r1527, r2273; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +neg.f16x2 r2315, r2312; +} +{ +fma.rn.f16x2 r2317, r1539, r2308, r2315; +} +{ +mul.f16x2 r2321, r1539, r2310; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 
r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +neg.f16x2 r2352, r2349; +} +{ +fma.rn.f16x2 r2354, r1551, r2345, r2352; +} +{ +mul.f16x2 r2358, r1551, r2347; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +neg.f16x2 r2389, r2386; +} +{ +fma.rn.f16x2 r2391, r1563, r2382, r2389; +} +{ +mul.f16x2 r2395, r1563, r2384; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +neg.f16x2 r2426, r2423; +} +{ +fma.rn.f16x2 r2428, r1575, r2419, r2426; +} +{ +mul.f16x2 r2432, r1575, r2421; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2432; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +neg.f16x2 r2463, r2460; +} +{ +fma.rn.f16x2 r2465, r1587, r2456, r2463; +} +{ +mul.f16x2 r2469, r1587, r2458; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +neg.f16x2 r2500, r2497; +} +{ +fma.rn.f16x2 r2502, r1599, r2493, r2500; +} +{ +mul.f16x2 r2506, r1599, r2495; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2506; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2517, {low, high}; +} +{ 
+mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +neg.f16x2 r2537, r2534; +} +{ +fma.rn.f16x2 r2539, r1611, r2530, r2537; +} +{ +mul.f16x2 r2543, r1611, r2532; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2543; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +neg.f16x2 r2574, r2571; +} +{ +fma.rn.f16x2 r2576, r1623, r2567, r2574; +} +{ +mul.f16x2 r2580, r1623, r2569; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2580; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; 
+mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +neg.f16x2 r2611, r2608; +} +{ +fma.rn.f16x2 r2613, r1635, r2604, r2611; +} +{ +mul.f16x2 r2617, r1635, r2606; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2617; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +neg.f16x2 r2648, r2645; +} +{ +fma.rn.f16x2 r2650, r1647, r2641, r2648; +} +{ +mul.f16x2 r2654, r1647, r2643; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2654; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +neg.f16x2 r2685, r2682; +} +{ +fma.rn.f16x2 r2687, r1659, r2678, r2685; +} +{ +mul.f16x2 r2691, 
r1659, r2680; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2691; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +neg.f16x2 r2722, r2719; +} +{ +fma.rn.f16x2 r2724, r1671, r2715, r2722; +} +{ +mul.f16x2 r2728, r1671, r2717; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2728; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +neg.f16x2 r2759, r2756; +} +{ +fma.rn.f16x2 r2761, r1683, r2752, r2759; +} +{ +mul.f16x2 r2765, r1683, r2754; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2765; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +neg.f16x2 r2796, r2793; +} +{ +fma.rn.f16x2 r2798, r1695, r2789, r2796; +} +{ +mul.f16x2 r2802, r1695, r2791; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2802; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +neg.f16x2 r2833, r2830; +} +{ +fma.rn.f16x2 r2835, r1707, r2826, r2833; +} +{ +mul.f16x2 r2839, r1707, r2828; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2839; +} +barrier.sync 0; +and.b32 r6518, r6515, 32640; +add.s32 r6519, r6517, r6518; +st.shared.v4.f32 [r6519], {r1521, r1725, r1762, r1799}; +st.shared.v4.f32 [r6519+16], {r1836, r1873, r1910, r1947}; +st.shared.v4.f32 [r6519+32], {r1984, r2021, r2058, r2095}; +st.shared.v4.f32 [r6519+48], {r2132, r2169, r2206, r2243}; +st.shared.v4.f32 [r6519+64], {r2280, r2317, r2354, r2391}; +st.shared.v4.f32 [r6519+80], {r2428, r2465, r2502, r2539}; +st.shared.v4.f32 
[r6519+96], {r2576, r2613, r2650, r2687}; +st.shared.v4.f32 [r6519+112], {r2724, r2761, r2798, r2835}; +barrier.sync 0; +mad.lo.s32 r6520, r6529, -124, r6519; +ld.shared.u32 r2864, [r6520]; +ld.shared.u32 r3480, [r6520+1024]; +ld.shared.u32 r3060, [r6520+2048]; +ld.shared.u32 r3676, [r6520+3072]; +ld.shared.u32 r2914, [r6520+4096]; +ld.shared.u32 r3530, [r6520+5120]; +ld.shared.u32 r3110, [r6520+6144]; +ld.shared.u32 r3726, [r6520+7168]; +ld.shared.u32 r2876, [r6520+8192]; +ld.shared.u32 r3492, [r6520+9216]; +ld.shared.u32 r3072, [r6520+10240]; +ld.shared.u32 r3688, [r6520+11264]; +ld.shared.u32 r2926, [r6520+12288]; +ld.shared.u32 r3542, [r6520+13312]; +ld.shared.u32 r3122, [r6520+14336]; +ld.shared.u32 r3738, [r6520+15360]; +ld.shared.u32 r2865, [r6520+16384]; +ld.shared.u32 r3481, [r6520+17408]; +ld.shared.u32 r3061, [r6520+18432]; +ld.shared.u32 r3677, [r6520+19456]; +ld.shared.u32 r2915, [r6520+20480]; +ld.shared.u32 r3531, [r6520+21504]; +ld.shared.u32 r3111, [r6520+22528]; +ld.shared.u32 r3727, [r6520+23552]; +ld.shared.u32 r2877, [r6520+24576]; +ld.shared.u32 r3493, [r6520+25600]; +ld.shared.u32 r3073, [r6520+26624]; +ld.shared.u32 r3689, [r6520+27648]; +ld.shared.u32 r2927, [r6520+28672]; +ld.shared.u32 r3543, [r6520+29696]; +ld.shared.u32 r3123, [r6520+30720]; +ld.shared.u32 r3739, [r6520+31744]; +barrier.sync 0; +st.shared.v4.f32 [r6519], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6519+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6519+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6519+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6519+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6519+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r6519+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6519+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6520]; +ld.shared.u32 r3483, [r6520+1024]; +ld.shared.u32 r3063, [r6520+2048]; +ld.shared.u32 r3679, [r6520+3072]; +ld.shared.u32 
r2917, [r6520+4096]; +ld.shared.u32 r3533, [r6520+5120]; +ld.shared.u32 r3113, [r6520+6144]; +ld.shared.u32 r3729, [r6520+7168]; +ld.shared.u32 r2879, [r6520+8192]; +ld.shared.u32 r3495, [r6520+9216]; +ld.shared.u32 r3075, [r6520+10240]; +ld.shared.u32 r3691, [r6520+11264]; +ld.shared.u32 r2929, [r6520+12288]; +ld.shared.u32 r3545, [r6520+13312]; +ld.shared.u32 r3125, [r6520+14336]; +ld.shared.u32 r3741, [r6520+15360]; +ld.shared.u32 r2868, [r6520+16384]; +ld.shared.u32 r3484, [r6520+17408]; +ld.shared.u32 r3064, [r6520+18432]; +ld.shared.u32 r3680, [r6520+19456]; +ld.shared.u32 r2918, [r6520+20480]; +ld.shared.u32 r3534, [r6520+21504]; +ld.shared.u32 r3114, [r6520+22528]; +ld.shared.u32 r3730, [r6520+23552]; +ld.shared.u32 r2880, [r6520+24576]; +ld.shared.u32 r3496, [r6520+25600]; +ld.shared.u32 r3076, [r6520+26624]; +ld.shared.u32 r3692, [r6520+27648]; +ld.shared.u32 r2930, [r6520+28672]; +ld.shared.u32 r3546, [r6520+29696]; +ld.shared.u32 r3126, [r6520+30720]; +ld.shared.u32 r3742, [r6520+31744]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2881; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2884; +} +{ +add.f16x2 r2904, r2872, r2887; +} +{ +sub.f16x2 r2907, r2869, r2884; +} +{ +sub.f16x2 r2910, r2872, r2887; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2931; +} +{ 
+add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2934; +} +{ +add.f16x2 r2954, r2922, r2937; +} +{ +sub.f16x2 r2957, r2919, r2934; +} +{ +sub.f16x2 r2960, r2922, r2937; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2945; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2948; +} +{ +add.f16x2 r3038, r2898, r2993; +} +{ +sub.f16x2 r3041, r2895, r2948; +} +{ +sub.f16x2 r3044, r2898, r2993; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, 
r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3077; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3080; +} +{ +add.f16x2 r3100, r3068, r3083; +} +{ +sub.f16x2 r3103, r3065, r3080; +} +{ +sub.f16x2 r3106, r3068, r3083; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3127; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3130; +} +{ +add.f16x2 r3150, r3118, r3133; +} +{ +sub.f16x2 r3153, r3115, r3130; +} +{ +sub.f16x2 r3156, r3118, r3133; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3141; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 
r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3144; +} +{ +add.f16x2 r3234, r3094, r3189; +} +{ +sub.f16x2 r3237, r3091, r3144; +} +{ +sub.f16x2 r3240, r3094, r3189; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, 
r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3213; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3216; +} +{ +add.f16x2 r3434, r3020, r3333; +} +{ +sub.f16x2 r3437, r3017, r3216; +} +{ +sub.f16x2 r3440, r3020, r3333; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, 
r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3497; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3500; +} +{ +add.f16x2 r3520, r3488, r3503; +} +{ +sub.f16x2 r3523, r3485, r3500; +} +{ +sub.f16x2 r3526, r3488, r3503; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3547; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3550; +} +{ +add.f16x2 r3570, r3538, r3553; +} +{ +sub.f16x2 r3573, r3535, r3550; +} +{ +sub.f16x2 r3576, r3538, r3553; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3561; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3564; +} +{ +add.f16x2 r3654, r3514, r3609; +} +{ +sub.f16x2 r3657, r3511, r3564; +} +{ +sub.f16x2 r3660, r3514, r3609; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3693; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3696; +} +{ +add.f16x2 r3716, r3684, r3699; +} +{ +sub.f16x2 r3719, r3681, r3696; +} +{ +sub.f16x2 r3722, r3684, r3699; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ 
+sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3743; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3746; +} +{ +add.f16x2 r3766, r3734, r3749; +} +{ +sub.f16x2 r3769, r3731, r3746; +} +{ +sub.f16x2 r3772, r3734, r3749; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3757; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3760; +} +{ +add.f16x2 r3850, r3710, r3805; +} +{ +sub.f16x2 r3853, r3707, r3760; +} +{ +sub.f16x2 r3856, r3710, r3805; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, f660; +mov.b32 r3871, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3829; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ 
+mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3832; +} +{ +add.f16x2 r4050, r3636, r3949; +} +{ +sub.f16x2 r4053, r3633, r3832; +} +{ +sub.f16x2 r4056, r3636, r3949; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f656; +cvt.rn.f16.f32 high, f656; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f660; +cvt.rn.f16.f32 high, 
f660; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f664; +cvt.rn.f16.f32 high, f664; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f672; +cvt.rn.f16.f32 high, f672; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f676; +cvt.rn.f16.f32 high, f676; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f680; +cvt.rn.f16.f32 high, f680; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; 
+cvt.rn.f16.f32 high, f940; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ 
+fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4005; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 
r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4008; +} +{ +add.f16x2 r4482, r3392, r4269; +} +{ +sub.f16x2 r4485, r3389, r4008; +} +{ +sub.f16x2 r4488, r3392, r4269; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} 
+{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6521, r6513, 224; +bfe.u32 r6522, r6513, 5, 3; +shl.b32 r6523, r6513, 2; +and.b32 r6524, r6523, 124; +add.s32 r6525, r6517, r6524; +cvt.rn.f32.u32 f960, r6522; +mul.f32 f961, f960, 0f3CC90FDB; +cos.approx.f32 f779, f961; +sin.approx.f32 f962, f961; +neg.f32 f780, f962; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +neg.f16x2 r4585, r4582; +} +{ +fma.rn.f16x2 r4587, r4395, r4578, r4585; +} +{ +mul.f16x2 r4591, r4395, r4580; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +neg.f16x2 r4622, r4619; +} +{ +fma.rn.f16x2 r4624, r4407, r4615, r4622; +} +{ +mul.f16x2 r4628, r4407, r4617; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, 
f964; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +neg.f16x2 r4659, r4656; +} +{ +fma.rn.f16x2 r4661, r4419, r4652, r4659; +} +{ +mul.f16x2 r4665, r4419, r4654; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +neg.f16x2 r4696, r4693; +} +{ +fma.rn.f16x2 r4698, r4431, r4689, r4696; +} +{ +mul.f16x2 r4702, r4431, r4691; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +neg.f16x2 r4733, r4730; +} +{ +fma.rn.f16x2 r4735, r4443, r4726, r4733; +} +{ +mul.f16x2 r4739, r4443, r4728; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +neg.f16x2 r4770, r4767; +} +{ +fma.rn.f16x2 r4772, r4455, r4763, r4770; +} +{ +mul.f16x2 r4776, r4455, r4765; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +neg.f16x2 r4807, r4804; +} +{ +fma.rn.f16x2 r4809, r4467, 
r4800, r4807; +} +{ +mul.f16x2 r4813, r4467, r4802; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +neg.f16x2 r4844, r4841; +} +{ +fma.rn.f16x2 r4846, r4479, r4837, r4844; +} +{ +mul.f16x2 r4850, r4479, r4839; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +neg.f16x2 r4881, r4878; +} +{ +fma.rn.f16x2 r4883, r4491, r4874, r4881; +} +{ +mul.f16x2 r4887, r4491, r4876; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4887; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +neg.f16x2 r4918, r4915; +} +{ +fma.rn.f16x2 r4920, r4503, r4911, r4918; +} +{ +mul.f16x2 r4924, r4503, r4913; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4924; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +neg.f16x2 r4955, r4952; +} +{ +fma.rn.f16x2 r4957, r4515, r4948, r4955; +} +{ +mul.f16x2 r4961, r4515, r4950; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4961; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; 
+mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +neg.f16x2 r4992, r4989; +} +{ +fma.rn.f16x2 r4994, r4527, r4985, r4992; +} +{ +mul.f16x2 r4998, r4527, r4987; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4998; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +neg.f16x2 r5029, r5026; +} +{ +fma.rn.f16x2 r5031, r4539, r5022, r5029; +} +{ +mul.f16x2 r5035, r4539, r5024; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5035; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ 
+mul.f16x2 r5063, r4554, r5061; +} +{ +neg.f16x2 r5066, r5063; +} +{ +fma.rn.f16x2 r5068, r4551, r5059, r5066; +} +{ +mul.f16x2 r5072, r4551, r5061; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5072; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +neg.f16x2 r5103, r5100; +} +{ +fma.rn.f16x2 r5105, r4563, r5096, r5103; +} +{ +mul.f16x2 r5109, r4563, r5098; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5109; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +neg.f16x2 r5140, r5137; +} +{ +fma.rn.f16x2 r5142, r4389, r5133, r5140; +} +{ +mul.f16x2 r5146, r4389, r5135; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5146; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; 
+mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +neg.f16x2 r5177, r5174; +} +{ +fma.rn.f16x2 r5179, r4401, r5170, r5177; +} +{ +mul.f16x2 r5183, r4401, r5172; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5183; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +neg.f16x2 r5214, r5211; +} +{ +fma.rn.f16x2 r5216, r4413, r5207, r5214; +} +{ +mul.f16x2 r5220, r4413, r5209; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, 
r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +neg.f16x2 r5251, r5248; +} +{ +fma.rn.f16x2 r5253, r4425, r5244, r5251; +} +{ +mul.f16x2 r5257, r4425, r5246; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +neg.f16x2 r5288, r5285; +} +{ +fma.rn.f16x2 r5290, r4437, r5281, r5288; +} +{ +mul.f16x2 r5294, r4437, r5283; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +neg.f16x2 r5325, r5322; +} +{ +fma.rn.f16x2 r5327, r4449, r5318, r5325; +} +{ +mul.f16x2 r5331, r4449, r5320; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +neg.f16x2 r5362, r5359; +} +{ +fma.rn.f16x2 r5364, r4461, r5355, r5362; +} +{ +mul.f16x2 r5368, r4461, r5357; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +neg.f16x2 r5399, r5396; +} +{ +fma.rn.f16x2 r5401, r4473, r5392, r5399; +} +{ +mul.f16x2 r5405, r4473, r5394; +} +{ 
+fma.rn.f16x2 r5408, r4476, r5392, r5405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +neg.f16x2 r5436, r5433; +} +{ +fma.rn.f16x2 r5438, r4485, r5429, r5436; +} +{ +mul.f16x2 r5442, r4485, r5431; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +neg.f16x2 r5473, r5470; +} +{ +fma.rn.f16x2 r5475, r4497, r5466, r5473; +} +{ +mul.f16x2 r5479, r4497, r5468; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5479; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +neg.f16x2 r5510, r5507; +} +{ +fma.rn.f16x2 r5512, r4509, r5503, r5510; +} +{ +mul.f16x2 r5516, r4509, r5505; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5516; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +neg.f16x2 r5547, r5544; +} +{ +fma.rn.f16x2 r5549, r4521, r5540, r5547; +} +{ +mul.f16x2 r5553, r4521, r5542; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5553; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, 
r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +neg.f16x2 r5584, r5581; +} +{ +fma.rn.f16x2 r5586, r4533, r5577, r5584; +} +{ +mul.f16x2 r5590, r4533, r5579; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5590; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +neg.f16x2 r5621, r5618; +} +{ +fma.rn.f16x2 r5623, r4545, r5614, r5621; +} +{ +mul.f16x2 r5627, r4545, r5616; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5627; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +neg.f16x2 r5658, r5655; 
+} +{ +fma.rn.f16x2 r5660, r4557, r5651, r5658; +} +{ +mul.f16x2 r5664, r4557, r5653; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5664; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +neg.f16x2 r5695, r5692; +} +{ +fma.rn.f16x2 r5697, r4569, r5688, r5695; +} +{ +mul.f16x2 r5701, r4569, r5690; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5701; +} +barrier.sync 0; +and.b32 r6526, r6515, 28672; +add.s32 r6527, r6525, r6526; +st.shared.u32 [r6527], r4383; +st.shared.u32 [r6527+128], r4587; +st.shared.u32 [r6527+256], r4624; +st.shared.u32 [r6527+384], r4661; +st.shared.u32 [r6527+512], r4698; +st.shared.u32 [r6527+640], r4735; +st.shared.u32 [r6527+768], r4772; +st.shared.u32 [r6527+896], r4809; +st.shared.u32 [r6527+1024], r4846; +st.shared.u32 [r6527+1152], r4883; +st.shared.u32 [r6527+1280], r4920; +st.shared.u32 [r6527+1408], r4957; +st.shared.u32 [r6527+1536], r4994; +st.shared.u32 [r6527+1664], r5031; +st.shared.u32 [r6527+1792], r5068; +st.shared.u32 [r6527+1920], r5105; +st.shared.u32 [r6527+2048], r5142; +st.shared.u32 [r6527+2176], r5179; +st.shared.u32 [r6527+2304], r5216; +st.shared.u32 [r6527+2432], r5253; +st.shared.u32 [r6527+2560], r5290; +st.shared.u32 [r6527+2688], r5327; +st.shared.u32 [r6527+2816], r5364; +st.shared.u32 [r6527+2944], r5401; +st.shared.u32 [r6527+3072], r5438; +st.shared.u32 [r6527+3200], 
r5475; +st.shared.u32 [r6527+3328], r5512; +st.shared.u32 [r6527+3456], r5549; +st.shared.u32 [r6527+3584], r5586; +st.shared.u32 [r6527+3712], r5623; +st.shared.u32 [r6527+3840], r5660; +st.shared.u32 [r6527+3968], r5697; +barrier.sync 0; +mad.lo.s32 r6528, r6521, -124, r6527; +ld.shared.u32 r5726, [r6528]; +ld.shared.u32 r5922, [r6528+1024]; +ld.shared.u32 r6118, [r6528+2048]; +ld.shared.u32 r6314, [r6528+3072]; +ld.shared.u32 r5776, [r6528+4096]; +ld.shared.u32 r5972, [r6528+5120]; +ld.shared.u32 r6168, [r6528+6144]; +ld.shared.u32 r6364, [r6528+7168]; +ld.shared.u32 r5738, [r6528+8192]; +ld.shared.u32 r5934, [r6528+9216]; +ld.shared.u32 r6130, [r6528+10240]; +ld.shared.u32 r6326, [r6528+11264]; +ld.shared.u32 r5788, [r6528+12288]; +ld.shared.u32 r5984, [r6528+13312]; +ld.shared.u32 r6180, [r6528+14336]; +ld.shared.u32 r6376, [r6528+15360]; +ld.shared.u32 r5727, [r6528+16384]; +ld.shared.u32 r5923, [r6528+17408]; +ld.shared.u32 r6119, [r6528+18432]; +ld.shared.u32 r6315, [r6528+19456]; +ld.shared.u32 r5777, [r6528+20480]; +ld.shared.u32 r5973, [r6528+21504]; +ld.shared.u32 r6169, [r6528+22528]; +ld.shared.u32 r6365, [r6528+23552]; +ld.shared.u32 r5739, [r6528+24576]; +ld.shared.u32 r5935, [r6528+25600]; +ld.shared.u32 r6131, [r6528+26624]; +ld.shared.u32 r6327, [r6528+27648]; +ld.shared.u32 r5789, [r6528+28672]; +ld.shared.u32 r5985, [r6528+29696]; +ld.shared.u32 r6181, [r6528+30720]; +ld.shared.u32 r6377, [r6528+31744]; +barrier.sync 0; +st.shared.u32 [r6527], r4386; +st.shared.u32 [r6527+128], r4594; +st.shared.u32 [r6527+256], r4631; +st.shared.u32 [r6527+384], r4668; +st.shared.u32 [r6527+512], r4705; +st.shared.u32 [r6527+640], r4742; +st.shared.u32 [r6527+768], r4779; +st.shared.u32 [r6527+896], r4816; +st.shared.u32 [r6527+1024], r4853; +st.shared.u32 [r6527+1152], r4890; +st.shared.u32 [r6527+1280], r4927; +st.shared.u32 [r6527+1408], r4964; +st.shared.u32 [r6527+1536], r5001; +st.shared.u32 [r6527+1664], r5038; +st.shared.u32 [r6527+1792], r5075; 
+st.shared.u32 [r6527+1920], r5112; +st.shared.u32 [r6527+2048], r5149; +st.shared.u32 [r6527+2176], r5186; +st.shared.u32 [r6527+2304], r5223; +st.shared.u32 [r6527+2432], r5260; +st.shared.u32 [r6527+2560], r5297; +st.shared.u32 [r6527+2688], r5334; +st.shared.u32 [r6527+2816], r5371; +st.shared.u32 [r6527+2944], r5408; +st.shared.u32 [r6527+3072], r5445; +st.shared.u32 [r6527+3200], r5482; +st.shared.u32 [r6527+3328], r5519; +st.shared.u32 [r6527+3456], r5556; +st.shared.u32 [r6527+3584], r5593; +st.shared.u32 [r6527+3712], r5630; +st.shared.u32 [r6527+3840], r5667; +st.shared.u32 [r6527+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6528]; +ld.shared.u32 r5925, [r6528+1024]; +ld.shared.u32 r6121, [r6528+2048]; +ld.shared.u32 r6317, [r6528+3072]; +ld.shared.u32 r5779, [r6528+4096]; +ld.shared.u32 r5975, [r6528+5120]; +ld.shared.u32 r6171, [r6528+6144]; +ld.shared.u32 r6367, [r6528+7168]; +ld.shared.u32 r5741, [r6528+8192]; +ld.shared.u32 r5937, [r6528+9216]; +ld.shared.u32 r6133, [r6528+10240]; +ld.shared.u32 r6329, [r6528+11264]; +ld.shared.u32 r5791, [r6528+12288]; +ld.shared.u32 r5987, [r6528+13312]; +ld.shared.u32 r6183, [r6528+14336]; +ld.shared.u32 r6379, [r6528+15360]; +ld.shared.u32 r5730, [r6528+16384]; +ld.shared.u32 r5926, [r6528+17408]; +ld.shared.u32 r6122, [r6528+18432]; +ld.shared.u32 r6318, [r6528+19456]; +ld.shared.u32 r5780, [r6528+20480]; +ld.shared.u32 r5976, [r6528+21504]; +ld.shared.u32 r6172, [r6528+22528]; +ld.shared.u32 r6368, [r6528+23552]; +ld.shared.u32 r5742, [r6528+24576]; +ld.shared.u32 r5938, [r6528+25600]; +ld.shared.u32 r6134, [r6528+26624]; +ld.shared.u32 r6330, [r6528+27648]; +ld.shared.u32 r5792, [r6528+28672]; +ld.shared.u32 r5988, [r6528+29696]; +ld.shared.u32 r6184, [r6528+30720]; +ld.shared.u32 r6380, [r6528+31744]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ 
+add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5743; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5746; +} +{ +add.f16x2 r5766, r5734, r5749; +} +{ +sub.f16x2 r5769, r5731, r5746; +} +{ +sub.f16x2 r5772, r5734, r5749; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5793; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5796; +} +{ +add.f16x2 r5816, r5784, r5799; +} +{ +sub.f16x2 r5819, r5781, r5796; +} +{ +sub.f16x2 r5822, r5784, r5799; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5807; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 %0, r5751, r5801; +} 
+{ +add.f16x2 %1, r5754, r5804; +} +{ +sub.f16x2 %32, r5751, r5801; +} +{ +sub.f16x2 %33, r5754, r5804; +} +{ +add.f16x2 %8, r5763, r5845; +} +{ +add.f16x2 %9, r5766, r5851; +} +{ +sub.f16x2 %40, r5763, r5845; +} +{ +sub.f16x2 %41, r5766, r5851; +} +{ +add.f16x2 %16, r5757, r5810; +} +{ +add.f16x2 %17, r5760, r5855; +} +{ +sub.f16x2 %48, r5757, r5810; +} +{ +sub.f16x2 %49, r5760, r5855; +} +{ +add.f16x2 %24, r5769, r5863; +} +{ +add.f16x2 %25, r5772, r5869; +} +{ +sub.f16x2 %56, r5769, r5863; +} +{ +sub.f16x2 %57, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5939; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5942; +} +{ +add.f16x2 r5962, r5930, r5945; +} +{ +sub.f16x2 r5965, r5927, r5942; +} +{ +sub.f16x2 r5968, r5930, r5945; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5989; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5992; +} +{ +add.f16x2 r6012, r5980, r5995; +} +{ +sub.f16x2 r6015, r5977, r5992; +} +{ +sub.f16x2 r6018, r5980, r5995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 
high, f940; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6003; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 %2, r5947, r5997; +} +{ +add.f16x2 %3, r5950, r6000; +} +{ +sub.f16x2 %34, r5947, r5997; +} +{ +sub.f16x2 %35, r5950, r6000; +} +{ +add.f16x2 %10, r5959, r6041; +} +{ +add.f16x2 %11, r5962, r6047; +} +{ +sub.f16x2 %42, r5959, r6041; +} +{ +sub.f16x2 %43, r5962, r6047; +} +{ +add.f16x2 %18, r5953, r6006; +} +{ +add.f16x2 %19, r5956, r6051; +} +{ +sub.f16x2 %50, r5953, r6006; +} +{ +sub.f16x2 %51, r5956, r6051; +} +{ +add.f16x2 %26, r5965, r6059; +} +{ +add.f16x2 %27, r5968, r6065; +} +{ +sub.f16x2 %58, r5965, r6059; +} +{ +sub.f16x2 %59, r5968, r6065; +} +{ +add.f16x2 r6117, r6118, r6119; +} +{ +add.f16x2 r6120, r6121, r6122; +} +{ +sub.f16x2 r6123, r6118, r6119; +} +{ +sub.f16x2 r6126, r6121, r6122; +} +{ +add.f16x2 r6129, r6130, r6131; +} +{ +add.f16x2 r6132, r6133, r6134; +} +{ +sub.f16x2 r6135, r6130, r6131; +} +{ +sub.f16x2 r6138, r6133, r6134; +} +{ +neg.f16x2 r6141, r6135; +} +{ +add.f16x2 r6143, r6117, r6129; +} +{ +add.f16x2 r6146, r6120, r6132; +} +{ +sub.f16x2 r6149, r6117, r6129; +} +{ +sub.f16x2 r6152, r6120, r6132; +} +{ +add.f16x2 r6155, r6123, r6138; +} +{ +add.f16x2 r6158, r6126, r6141; +} +{ +sub.f16x2 r6161, r6123, r6138; +} +{ +sub.f16x2 r6164, r6126, r6141; +} +{ +add.f16x2 r6167, r6168, r6169; +} +{ +add.f16x2 r6170, r6171, r6172; +} +{ 
+sub.f16x2 r6173, r6168, r6169; +} +{ +sub.f16x2 r6176, r6171, r6172; +} +{ +add.f16x2 r6179, r6180, r6181; +} +{ +add.f16x2 r6182, r6183, r6184; +} +{ +sub.f16x2 r6185, r6180, r6181; +} +{ +sub.f16x2 r6188, r6183, r6184; +} +{ +neg.f16x2 r6191, r6185; +} +{ +add.f16x2 r6193, r6167, r6179; +} +{ +add.f16x2 r6196, r6170, r6182; +} +{ +sub.f16x2 r6199, r6167, r6179; +} +{ +sub.f16x2 r6202, r6170, r6182; +} +{ +add.f16x2 r6205, r6173, r6188; +} +{ +add.f16x2 r6208, r6176, r6191; +} +{ +sub.f16x2 r6211, r6173, r6188; +} +{ +sub.f16x2 r6214, r6176, r6191; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6218, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6222, {low, high}; +} +{ +mul.f16x2 r6231, r6205, r6217; +} +{ +mul.f16x2 r6234, r6208, r6218; +} +{ +sub.f16x2 r6237, r6231, r6234; +} +{ +mul.f16x2 r6240, r6205, r6218; +} +{ +fma.rn.f16x2 r6243, r6208, r6217, r6240; +} +{ +neg.f16x2 r6247, r6199; +} +{ +mul.f16x2 r6249, r6211, r6221; +} +{ +mul.f16x2 r6252, r6214, r6222; +} +{ +sub.f16x2 r6255, r6249, r6252; +} +{ +mul.f16x2 r6258, r6211, r6222; +} +{ +fma.rn.f16x2 r6261, r6214, r6221, r6258; +} +{ +add.f16x2 %4, r6143, r6193; +} +{ +add.f16x2 %5, r6146, r6196; +} +{ +sub.f16x2 %36, r6143, r6193; +} +{ +sub.f16x2 %37, r6146, r6196; +} +{ +add.f16x2 %12, r6155, r6237; +} +{ +add.f16x2 %13, r6158, r6243; +} +{ +sub.f16x2 %44, r6155, r6237; +} +{ +sub.f16x2 %45, r6158, r6243; +} +{ +add.f16x2 %20, r6149, r6202; +} +{ +add.f16x2 %21, r6152, r6247; +} +{ +sub.f16x2 %52, r6149, r6202; +} +{ +sub.f16x2 %53, r6152, r6247; +} +{ +add.f16x2 %28, r6161, r6255; +} +{ +add.f16x2 %29, r6164, r6261; +} +{ +sub.f16x2 %60, r6161, r6255; +} +{ +sub.f16x2 %61, 
r6164, r6261; +} +{ +add.f16x2 r6313, r6314, r6315; +} +{ +add.f16x2 r6316, r6317, r6318; +} +{ +sub.f16x2 r6319, r6314, r6315; +} +{ +sub.f16x2 r6322, r6317, r6318; +} +{ +add.f16x2 r6325, r6326, r6327; +} +{ +add.f16x2 r6328, r6329, r6330; +} +{ +sub.f16x2 r6331, r6326, r6327; +} +{ +sub.f16x2 r6334, r6329, r6330; +} +{ +neg.f16x2 r6337, r6331; +} +{ +add.f16x2 r6339, r6313, r6325; +} +{ +add.f16x2 r6342, r6316, r6328; +} +{ +sub.f16x2 r6345, r6313, r6325; +} +{ +sub.f16x2 r6348, r6316, r6328; +} +{ +add.f16x2 r6351, r6319, r6334; +} +{ +add.f16x2 r6354, r6322, r6337; +} +{ +sub.f16x2 r6357, r6319, r6334; +} +{ +sub.f16x2 r6360, r6322, r6337; +} +{ +add.f16x2 r6363, r6364, r6365; +} +{ +add.f16x2 r6366, r6367, r6368; +} +{ +sub.f16x2 r6369, r6364, r6365; +} +{ +sub.f16x2 r6372, r6367, r6368; +} +{ +add.f16x2 r6375, r6376, r6377; +} +{ +add.f16x2 r6378, r6379, r6380; +} +{ +sub.f16x2 r6381, r6376, r6377; +} +{ +sub.f16x2 r6384, r6379, r6380; +} +{ +neg.f16x2 r6387, r6381; +} +{ +add.f16x2 r6389, r6363, r6375; +} +{ +add.f16x2 r6392, r6366, r6378; +} +{ +sub.f16x2 r6395, r6363, r6375; +} +{ +sub.f16x2 r6398, r6366, r6378; +} +{ +add.f16x2 r6401, r6369, r6384; +} +{ +add.f16x2 r6404, r6372, r6387; +} +{ +sub.f16x2 r6407, r6369, r6384; +} +{ +sub.f16x2 r6410, r6372, r6387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f930; +cvt.rn.f16.f32 high, f930; +mov.b32 r6413, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6418, {low, high}; +} +{ +mul.f16x2 r6427, r6401, r6413; +} +{ +mul.f16x2 r6430, r6404, r6414; +} +{ +sub.f16x2 r6433, r6427, r6430; +} +{ +mul.f16x2 r6436, r6401, r6414; +} +{ +fma.rn.f16x2 r6439, r6404, r6413, r6436; +} +{ +neg.f16x2 r6443, r6395; +} +{ +mul.f16x2 r6445, 
r6407, r6417; +} +{ +mul.f16x2 r6448, r6410, r6418; +} +{ +sub.f16x2 r6451, r6445, r6448; +} +{ +mul.f16x2 r6454, r6407, r6418; +} +{ +fma.rn.f16x2 r6457, r6410, r6417, r6454; +} +{ +add.f16x2 %6, r6339, r6389; +} +{ +add.f16x2 %7, r6342, r6392; +} +{ +sub.f16x2 %38, r6339, r6389; +} +{ +sub.f16x2 %39, r6342, r6392; +} +{ +add.f16x2 %14, r6351, r6433; +} +{ +add.f16x2 %15, r6354, r6439; +} +{ +sub.f16x2 %46, r6351, r6433; +} +{ +sub.f16x2 %47, r6354, r6439; +} +{ +add.f16x2 %22, r6345, r6398; +} +{ +add.f16x2 %23, r6348, r6443; +} +{ +sub.f16x2 %54, r6345, r6398; +} +{ +sub.f16x2 %55, r6348, r6443; +} +{ +add.f16x2 %30, r6357, r6451; +} +{ +add.f16x2 %31, r6360, r6457; +} +{ +sub.f16x2 %62, r6357, r6451; +} +{ +sub.f16x2 %63, r6360, r6457; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), 
"=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), 
"r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<855, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3646>; +.reg .b64 rd<2>; +mov.u32 r3619, %tid.y; +shl.b32 r3620, r3619, 15; +mov.u32 r3621, %32; +add.s32 r3622, r3621, r3620; +mov.u32 r3623, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; 
+} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f448, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ 
+add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ 
+add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, 
r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, 
r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3624, r3623, 511; +shl.b32 r3625, r3623, 6; +and.b32 r3626, r3625, -32768; +add.s32 r3627, r3622, r3626; +cvt.rn.f32.u32 f451, r3624; +mul.f32 f452, f451, 0f3A490FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, 
r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, 
r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 
r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r3628, r3625, 32704; +add.s32 r3629, r3627, r3628; +st.shared.v4.f32 [r3629], {r521, r629, r666, r703}; +st.shared.v4.f32 [r3629+16], {r740, r777, r814, r851}; +st.shared.v4.f32 [r3629+32], {r888, r925, r962, r999}; +st.shared.v4.f32 [r3629+48], {r1036, r1073, r1110, r1147}; +barrier.sync 0; +mad.lo.s32 r3630, r3624, -60, r3629; +ld.shared.u32 r1176, [r3630]; +ld.shared.u32 r1372, [r3630+2048]; +ld.shared.u32 r1226, [r3630+4096]; +ld.shared.u32 r1422, [r3630+6144]; +ld.shared.u32 r1188, [r3630+8192]; +ld.shared.u32 r1384, [r3630+10240]; +ld.shared.u32 r1238, [r3630+12288]; +ld.shared.u32 r1434, [r3630+14336]; +ld.shared.u32 r1177, [r3630+16384]; +ld.shared.u32 r1373, [r3630+18432]; +ld.shared.u32 r1227, [r3630+20480]; +ld.shared.u32 r1423, [r3630+22528]; +ld.shared.u32 r1189, [r3630+24576]; +ld.shared.u32 r1385, [r3630+26624]; +ld.shared.u32 r1239, [r3630+28672]; +ld.shared.u32 r1435, [r3630+30720]; +barrier.sync 0; +st.shared.v4.f32 [r3629], {r524, r636, r673, r710}; +st.shared.v4.f32 [r3629+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r3629+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r3629+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r3630]; +ld.shared.u32 r1375, [r3630+2048]; +ld.shared.u32 r1229, [r3630+4096]; +ld.shared.u32 r1425, [r3630+6144]; +ld.shared.u32 r1191, [r3630+8192]; +ld.shared.u32 r1387, 
[r3630+10240]; +ld.shared.u32 r1241, [r3630+12288]; +ld.shared.u32 r1437, [r3630+14336]; +ld.shared.u32 r1180, [r3630+16384]; +ld.shared.u32 r1376, [r3630+18432]; +ld.shared.u32 r1230, [r3630+20480]; +ld.shared.u32 r1426, [r3630+22528]; +ld.shared.u32 r1192, [r3630+24576]; +ld.shared.u32 r1388, [r3630+26624]; +ld.shared.u32 r1242, [r3630+28672]; +ld.shared.u32 r1438, [r3630+30720]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 
r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, 
f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, 
r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3631, r3623, 496; +bfe.u32 r3632, r3623, 4, 5; +shl.b32 r3633, r3623, 2; +and.b32 r3634, r3633, 60; +add.s32 r3635, r3627, r3634; +cvt.rn.f32.u32 f454, r3632; +mul.f32 f455, f454, 0f3C490FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 
r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ 
+mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; 
+mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ +fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, 
r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +barrier.sync 0; +and.b32 r3636, r3625, 31744; +add.s32 r3637, r3635, r3636; +st.shared.u32 [r3637], r1695; +st.shared.u32 [r3637+64], r1803; +st.shared.u32 [r3637+128], r1840; +st.shared.u32 [r3637+192], r1877; +st.shared.u32 [r3637+256], r1914; +st.shared.u32 [r3637+320], r1951; +st.shared.u32 [r3637+384], r1988; +st.shared.u32 [r3637+448], r2025; +st.shared.u32 [r3637+512], r2062; +st.shared.u32 [r3637+576], r2099; +st.shared.u32 [r3637+640], r2136; +st.shared.u32 [r3637+704], r2173; +st.shared.u32 [r3637+768], r2210; +st.shared.u32 [r3637+832], r2247; +st.shared.u32 [r3637+896], r2284; +st.shared.u32 [r3637+960], r2321; +barrier.sync 0; +mad.lo.s32 r3638, r3631, -60, r3637; +ld.shared.u32 r2350, [r3638]; +ld.shared.u32 r2546, [r3638+2048]; +ld.shared.u32 r2400, [r3638+4096]; +ld.shared.u32 r2596, [r3638+6144]; +ld.shared.u32 r2362, [r3638+8192]; +ld.shared.u32 r2558, [r3638+10240]; +ld.shared.u32 r2412, [r3638+12288]; +ld.shared.u32 r2608, [r3638+14336]; +ld.shared.u32 r2351, [r3638+16384]; +ld.shared.u32 r2547, [r3638+18432]; +ld.shared.u32 r2401, [r3638+20480]; +ld.shared.u32 r2597, [r3638+22528]; +ld.shared.u32 r2363, [r3638+24576]; +ld.shared.u32 r2559, [r3638+26624]; +ld.shared.u32 r2413, [r3638+28672]; +ld.shared.u32 r2609, [r3638+30720]; +barrier.sync 0; +st.shared.u32 [r3637], r1698; 
+st.shared.u32 [r3637+64], r1810; +st.shared.u32 [r3637+128], r1847; +st.shared.u32 [r3637+192], r1884; +st.shared.u32 [r3637+256], r1921; +st.shared.u32 [r3637+320], r1958; +st.shared.u32 [r3637+384], r1995; +st.shared.u32 [r3637+448], r2032; +st.shared.u32 [r3637+512], r2069; +st.shared.u32 [r3637+576], r2106; +st.shared.u32 [r3637+640], r2143; +st.shared.u32 [r3637+704], r2180; +st.shared.u32 [r3637+768], r2217; +st.shared.u32 [r3637+832], r2254; +st.shared.u32 [r3637+896], r2291; +st.shared.u32 [r3637+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r3638]; +ld.shared.u32 r2549, [r3638+2048]; +ld.shared.u32 r2403, [r3638+4096]; +ld.shared.u32 r2599, [r3638+6144]; +ld.shared.u32 r2365, [r3638+8192]; +ld.shared.u32 r2561, [r3638+10240]; +ld.shared.u32 r2415, [r3638+12288]; +ld.shared.u32 r2611, [r3638+14336]; +ld.shared.u32 r2354, [r3638+16384]; +ld.shared.u32 r2550, [r3638+18432]; +ld.shared.u32 r2404, [r3638+20480]; +ld.shared.u32 r2600, [r3638+22528]; +ld.shared.u32 r2366, [r3638+24576]; +ld.shared.u32 r2562, [r3638+26624]; +ld.shared.u32 r2416, [r3638+28672]; +ld.shared.u32 r2612, [r3638+30720]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ 
+sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, 
r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, 
r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, 
high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2702; +} +{ +add.f16x2 r2920, r2506, r2819; +} +{ +sub.f16x2 r2923, r2503, r2702; +} +{ +sub.f16x2 r2926, r2506, r2819; 
+} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3639, r3623, 256; +bfe.u32 r3640, r3623, 8, 1; +and.b32 r3641, r3633, 1020; +add.s32 r3642, r3627, r3641; +cvt.rn.f32.u32 f457, r3640; +mul.f32 f458, f457, 0f3E490FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +neg.f16x2 r2975, r2972; +} +{ +fma.rn.f16x2 r2977, r2881, r2968, r2975; +} +{ +mul.f16x2 r2981, r2881, r2970; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +neg.f16x2 r3012, r3009; +} +{ +fma.rn.f16x2 r3014, r2893, r3005, r3012; +} +{ +mul.f16x2 
r3018, r2893, r3007; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +neg.f16x2 r3049, r3046; +} +{ +fma.rn.f16x2 r3051, r2905, r3042, r3049; +} +{ +mul.f16x2 r3055, r2905, r3044; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +neg.f16x2 r3086, r3083; +} +{ +fma.rn.f16x2 r3088, r2917, r3079, r3086; +} +{ +mul.f16x2 r3092, r2917, r3081; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +neg.f16x2 r3123, r3120; +} +{ +fma.rn.f16x2 r3125, r2929, r3116, r3123; +} +{ +mul.f16x2 r3129, r2929, r3118; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +neg.f16x2 r3160, r3157; +} +{ +fma.rn.f16x2 r3162, r2941, r3153, r3160; +} +{ +mul.f16x2 r3166, r2941, r3155; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} 
+{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +neg.f16x2 r3197, r3194; +} +{ +fma.rn.f16x2 r3199, r2953, r3190, r3197; +} +{ +mul.f16x2 r3203, r2953, r3192; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +neg.f16x2 r3234, r3231; +} +{ +fma.rn.f16x2 r3236, r2875, r3227, r3234; +} +{ +mul.f16x2 r3240, r2875, r3229; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ 
+neg.f16x2 r3271, r3268; +} +{ +fma.rn.f16x2 r3273, r2887, r3264, r3271; +} +{ +mul.f16x2 r3277, r2887, r3266; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +neg.f16x2 r3308, r3305; +} +{ +fma.rn.f16x2 r3310, r2899, r3301, r3308; +} +{ +mul.f16x2 r3314, r2899, r3303; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +neg.f16x2 r3345, r3342; +} +{ +fma.rn.f16x2 r3347, r2911, r3338, r3345; +} +{ +mul.f16x2 r3351, r2911, r3340; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3351; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +neg.f16x2 r3382, r3379; +} +{ +fma.rn.f16x2 r3384, r2923, r3375, r3382; +} +{ +mul.f16x2 r3388, r2923, r3377; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3388; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +neg.f16x2 r3419, r3416; +} +{ +fma.rn.f16x2 r3421, r2935, r3412, r3419; +} +{ +mul.f16x2 r3425, r2935, r3414; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3425; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +neg.f16x2 r3456, r3453; +} +{ +fma.rn.f16x2 r3458, r2947, r3449, r3456; +} +{ +mul.f16x2 r3462, r2947, r3451; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +neg.f16x2 r3493, r3490; +} +{ +fma.rn.f16x2 r3495, r2959, r3486, r3493; +} +{ +mul.f16x2 r3499, r2959, r3488; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3499; +} +barrier.sync 0; +and.b32 r3643, r3625, 16384; +add.s32 r3644, r3642, r3643; +st.shared.u32 [r3644], r2869; +st.shared.u32 [r3644+1024], r2977; +st.shared.u32 [r3644+2048], r3014; +st.shared.u32 [r3644+3072], r3051; +st.shared.u32 [r3644+4096], r3088; +st.shared.u32 [r3644+5120], r3125; +st.shared.u32 [r3644+6144], r3162; +st.shared.u32 [r3644+7168], r3199; +st.shared.u32 [r3644+8192], r3236; +st.shared.u32 [r3644+9216], r3273; +st.shared.u32 [r3644+10240], r3310; +st.shared.u32 [r3644+11264], r3347; +st.shared.u32 [r3644+12288], r3384; +st.shared.u32 [r3644+13312], r3421; +st.shared.u32 [r3644+14336], r3458; 
+st.shared.u32 [r3644+15360], r3495; +barrier.sync 0; +mad.lo.s32 r3645, r3639, -60, r3644; +ld.shared.u32 r3524, [r3645]; +ld.shared.u32 r3536, [r3645+2048]; +ld.shared.u32 r3548, [r3645+4096]; +ld.shared.u32 r3560, [r3645+6144]; +ld.shared.u32 r3572, [r3645+8192]; +ld.shared.u32 r3584, [r3645+10240]; +ld.shared.u32 r3596, [r3645+12288]; +ld.shared.u32 r3608, [r3645+14336]; +ld.shared.u32 r3525, [r3645+16384]; +ld.shared.u32 r3537, [r3645+18432]; +ld.shared.u32 r3549, [r3645+20480]; +ld.shared.u32 r3561, [r3645+22528]; +ld.shared.u32 r3573, [r3645+24576]; +ld.shared.u32 r3585, [r3645+26624]; +ld.shared.u32 r3597, [r3645+28672]; +ld.shared.u32 r3609, [r3645+30720]; +barrier.sync 0; +st.shared.u32 [r3644], r2872; +st.shared.u32 [r3644+1024], r2984; +st.shared.u32 [r3644+2048], r3021; +st.shared.u32 [r3644+3072], r3058; +st.shared.u32 [r3644+4096], r3095; +st.shared.u32 [r3644+5120], r3132; +st.shared.u32 [r3644+6144], r3169; +st.shared.u32 [r3644+7168], r3206; +st.shared.u32 [r3644+8192], r3243; +st.shared.u32 [r3644+9216], r3280; +st.shared.u32 [r3644+10240], r3317; +st.shared.u32 [r3644+11264], r3354; +st.shared.u32 [r3644+12288], r3391; +st.shared.u32 [r3644+13312], r3428; +st.shared.u32 [r3644+14336], r3465; +st.shared.u32 [r3644+15360], r3502; +barrier.sync 0; +ld.shared.u32 r3527, [r3645]; +ld.shared.u32 r3539, [r3645+2048]; +ld.shared.u32 r3551, [r3645+4096]; +ld.shared.u32 r3563, [r3645+6144]; +ld.shared.u32 r3575, [r3645+8192]; +ld.shared.u32 r3587, [r3645+10240]; +ld.shared.u32 r3599, [r3645+12288]; +ld.shared.u32 r3611, [r3645+14336]; +ld.shared.u32 r3528, [r3645+16384]; +ld.shared.u32 r3540, [r3645+18432]; +ld.shared.u32 r3552, [r3645+20480]; +ld.shared.u32 r3564, [r3645+22528]; +ld.shared.u32 r3576, [r3645+24576]; +ld.shared.u32 r3588, [r3645+26624]; +ld.shared.u32 r3600, [r3645+28672]; +ld.shared.u32 r3612, [r3645+30720]; +{ +add.f16x2 %0, r3524, r3525; +} +{ +add.f16x2 %1, r3527, r3528; +} +{ +sub.f16x2 %16, r3524, r3525; +} +{ +sub.f16x2 %17, r3527, 
r3528; +} +{ +add.f16x2 %2, r3536, r3537; +} +{ +add.f16x2 %3, r3539, r3540; +} +{ +sub.f16x2 %18, r3536, r3537; +} +{ +sub.f16x2 %19, r3539, r3540; +} +{ +add.f16x2 %4, r3548, r3549; +} +{ +add.f16x2 %5, r3551, r3552; +} +{ +sub.f16x2 %20, r3548, r3549; +} +{ +sub.f16x2 %21, r3551, r3552; +} +{ +add.f16x2 %6, r3560, r3561; +} +{ +add.f16x2 %7, r3563, r3564; +} +{ +sub.f16x2 %22, r3560, r3561; +} +{ +sub.f16x2 %23, r3563, r3564; +} +{ +add.f16x2 %8, r3572, r3573; +} +{ +add.f16x2 %9, r3575, r3576; +} +{ +sub.f16x2 %24, r3572, r3573; +} +{ +sub.f16x2 %25, r3575, r3576; +} +{ +add.f16x2 %10, r3584, r3585; +} +{ +add.f16x2 %11, r3587, r3588; +} +{ +sub.f16x2 %26, r3584, r3585; +} +{ +sub.f16x2 %27, r3587, r3588; +} +{ +add.f16x2 %12, r3596, r3597; +} +{ +add.f16x2 %13, r3599, r3600; +} +{ +sub.f16x2 %28, r3596, r3597; +} +{ +sub.f16x2 %29, r3599, r3600; +} +{ +add.f16x2 %14, r3608, r3609; +} +{ +add.f16x2 %15, r3611, r3612; +} +{ +sub.f16x2 %30, r3608, r3609; +} +{ +sub.f16x2 %31, r3611, r3612; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), 
"=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<859, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3646>; +.reg .b64 rd<2>; +mov.u32 r3619, %tid.y; +shl.b32 r3620, r3619, 16; +mov.u32 r3621, %32; +add.s32 r3622, r3621, r3620; +mov.u32 r3623, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %37, 
%53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f362, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r101, {low, high}; +} +mov.f32 f380, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f448, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ 
+add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r215; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r218; +} +{ +add.f16x2 r238, r206, r221; +} +{ +sub.f16x2 r241, r203, r218; +} +{ +sub.f16x2 r244, r206, r221; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r265; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r268; +} +{ +add.f16x2 r288, r256, r271; +} +{ +sub.f16x2 r291, r253, r268; +} +{ +sub.f16x2 r294, r256, r271; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r279; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 
r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r282; +} +{ +add.f16x2 r372, r232, r327; +} +{ +sub.f16x2 r375, r229, r282; +} +{ +sub.f16x2 r378, r232, r327; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f358, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +mov.f32 f366, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r397, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r398, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 
r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r351; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r354; +} +{ +add.f16x2 r572, r158, r471; +} +{ +sub.f16x2 r575, r155, r354; +} +{ +sub.f16x2 r578, r158, r471; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ 
+add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3624, r3623, 511; +shl.b32 r3625, r3623, 7; +and.b32 r3626, r3625, -65536; +add.s32 r3627, r3622, r3626; +cvt.rn.f32.u32 f451, r3624; +mul.f32 f452, f451, 0f3A490FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +neg.f16x2 r627, r624; +} +{ +fma.rn.f16x2 r629, r533, r620, r627; +} +{ +mul.f16x2 r633, r533, r622; +} +{ +fma.rn.f16x2 r636, r536, r620, r633; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +neg.f16x2 r664, r661; +} +{ +fma.rn.f16x2 r666, r545, r657, r664; +} +{ +mul.f16x2 r670, r545, r659; +} +{ +fma.rn.f16x2 r673, r548, r657, r670; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +neg.f16x2 r701, r698; +} +{ +fma.rn.f16x2 r703, r557, r694, r701; +} +{ +mul.f16x2 r707, r557, r696; +} +{ +fma.rn.f16x2 r710, r560, r694, r707; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +neg.f16x2 r738, r735; +} +{ +fma.rn.f16x2 r740, r569, r731, r738; +} +{ +mul.f16x2 r744, r569, r733; +} +{ +fma.rn.f16x2 r747, r572, r731, r744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +neg.f16x2 r775, r772; +} +{ +fma.rn.f16x2 r777, r581, r768, r775; +} +{ +mul.f16x2 r781, r581, r770; +} +{ +fma.rn.f16x2 r784, r584, r768, r781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +neg.f16x2 r812, r809; +} +{ +fma.rn.f16x2 r814, r593, r805, r812; +} +{ +mul.f16x2 r818, r593, r807; +} +{ +fma.rn.f16x2 r821, r596, r805, r818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +neg.f16x2 r849, r846; +} +{ +fma.rn.f16x2 r851, r605, r842, r849; +} +{ +mul.f16x2 r855, r605, r844; +} +{ +fma.rn.f16x2 r858, r608, r842, r855; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +neg.f16x2 r886, r883; +} +{ +fma.rn.f16x2 r888, r527, r879, r886; +} +{ +mul.f16x2 r892, r527, r881; +} +{ +fma.rn.f16x2 r895, r530, r879, r892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +neg.f16x2 r923, r920; +} +{ +fma.rn.f16x2 r925, r539, r916, r923; +} +{ +mul.f16x2 r929, r539, r918; +} +{ +fma.rn.f16x2 r932, r542, r916, r929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +neg.f16x2 r960, r957; +} +{ +fma.rn.f16x2 r962, r551, r953, r960; +} +{ +mul.f16x2 r966, r551, r955; +} +{ +fma.rn.f16x2 r969, r554, r953, r966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +neg.f16x2 r997, r994; +} +{ +fma.rn.f16x2 r999, r563, r990, r997; +} +{ +mul.f16x2 r1003, r563, r992; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, 
r1029; +} +{ +neg.f16x2 r1034, r1031; +} +{ +fma.rn.f16x2 r1036, r575, r1027, r1034; +} +{ +mul.f16x2 r1040, r575, r1029; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +neg.f16x2 r1071, r1068; +} +{ +fma.rn.f16x2 r1073, r587, r1064, r1071; +} +{ +mul.f16x2 r1077, r587, r1066; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +neg.f16x2 r1108, r1105; +} +{ +fma.rn.f16x2 r1110, r599, r1101, r1108; +} +{ +mul.f16x2 r1114, r599, r1103; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +neg.f16x2 r1145, r1142; +} +{ +fma.rn.f16x2 r1147, r611, r1138, r1145; +} +{ +mul.f16x2 r1151, r611, r1140; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1151; +} +barrier.sync 0; +and.b32 r3628, r3625, 65408; +add.s32 r3629, r3627, r3628; +st.shared.v4.f32 [r3629], {r521, r524, r629, r636}; +st.shared.v4.f32 [r3629+16], {r666, r673, r703, r710}; +st.shared.v4.f32 [r3629+32], {r740, r747, r777, r784}; +st.shared.v4.f32 [r3629+48], {r814, r821, r851, r858}; +st.shared.v4.f32 [r3629+64], {r888, r895, r925, r932}; +st.shared.v4.f32 [r3629+80], {r962, r969, r999, r1006}; +st.shared.v4.f32 [r3629+96], {r1036, r1043, r1073, r1080}; +st.shared.v4.f32 [r3629+112], {r1110, r1117, r1147, r1154}; +barrier.sync 0; +mad.lo.s32 r3630, r3624, -120, r3629; +ld.shared.u32 r1176, [r3630]; +ld.shared.u32 r1179, [r3630+4]; +ld.shared.u32 r1372, [r3630+4096]; +ld.shared.u32 r1375, [r3630+4100]; +ld.shared.u32 r1226, [r3630+8192]; +ld.shared.u32 r1229, [r3630+8196]; +ld.shared.u32 r1422, [r3630+12288]; +ld.shared.u32 r1425, [r3630+12292]; +ld.shared.u32 r1188, [r3630+16384]; +ld.shared.u32 r1191, [r3630+16388]; +ld.shared.u32 r1384, [r3630+20480]; +ld.shared.u32 r1387, [r3630+20484]; +ld.shared.u32 r1238, [r3630+24576]; +ld.shared.u32 r1241, [r3630+24580]; +ld.shared.u32 r1434, [r3630+28672]; +ld.shared.u32 r1437, [r3630+28676]; +ld.shared.u32 r1177, [r3630+32768]; +ld.shared.u32 r1180, 
[r3630+32772]; +ld.shared.u32 r1373, [r3630+36864]; +ld.shared.u32 r1376, [r3630+36868]; +ld.shared.u32 r1227, [r3630+40960]; +ld.shared.u32 r1230, [r3630+40964]; +ld.shared.u32 r1423, [r3630+45056]; +ld.shared.u32 r1426, [r3630+45060]; +ld.shared.u32 r1189, [r3630+49152]; +ld.shared.u32 r1192, [r3630+49156]; +ld.shared.u32 r1385, [r3630+53248]; +ld.shared.u32 r1388, [r3630+53252]; +ld.shared.u32 r1239, [r3630+57344]; +ld.shared.u32 r1242, [r3630+57348]; +ld.shared.u32 r1435, [r3630+61440]; +ld.shared.u32 r1438, [r3630+61444]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1193; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1196; +} +{ +add.f16x2 r1216, r1184, r1199; +} +{ +sub.f16x2 r1219, r1181, r1196; +} +{ +sub.f16x2 r1222, r1184, r1199; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1243; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1246; +} +{ +add.f16x2 r1266, r1234, r1249; +} +{ +sub.f16x2 r1269, r1231, r1246; +} +{ +sub.f16x2 r1272, r1234, r1249; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, 
f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1257; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1260; +} +{ +add.f16x2 r1350, r1210, r1305; +} +{ +sub.f16x2 r1353, r1207, r1260; +} +{ +sub.f16x2 r1356, r1210, r1305; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1389; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1392; +} +{ +add.f16x2 r1412, r1380, r1395; +} +{ +sub.f16x2 r1415, r1377, r1392; +} +{ +sub.f16x2 r1418, r1380, r1395; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, 
r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1439; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1442; +} +{ +add.f16x2 r1462, r1430, r1445; +} +{ +sub.f16x2 r1465, r1427, r1442; +} +{ +sub.f16x2 r1468, r1430, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1453; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1456; +} +{ +add.f16x2 r1546, r1406, r1501; +} +{ +sub.f16x2 r1549, r1403, r1456; +} +{ +sub.f16x2 r1552, r1406, r1501; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ 
+sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, 
r1571, r1638; +} +{ +neg.f16x2 r1645, r1525; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1528; +} +{ +add.f16x2 r1746, r1332, r1645; +} +{ +sub.f16x2 r1749, r1329, r1528; +} +{ +sub.f16x2 r1752, r1332, r1645; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3631, r3623, 496; +bfe.u32 r3632, r3623, 4, 5; +cvt.rn.f32.u32 f454, r3632; +mul.f32 f455, f454, 0f3C490FDB; +cos.approx.f32 
f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +neg.f16x2 r1801, r1798; +} +{ +fma.rn.f16x2 r1803, r1707, r1794, r1801; +} +{ +mul.f16x2 r1807, r1707, r1796; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +neg.f16x2 r1838, r1835; +} +{ +fma.rn.f16x2 r1840, r1719, r1831, r1838; +} +{ +mul.f16x2 r1844, r1719, r1833; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +neg.f16x2 r1875, r1872; +} +{ +fma.rn.f16x2 r1877, r1731, r1868, r1875; +} +{ +mul.f16x2 r1881, r1731, r1870; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1881; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +neg.f16x2 r1912, r1909; +} +{ +fma.rn.f16x2 r1914, r1743, r1905, r1912; +} +{ +mul.f16x2 r1918, r1743, r1907; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1918; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +neg.f16x2 r1949, r1946; +} +{ +fma.rn.f16x2 r1951, r1755, r1942, r1949; +} +{ +mul.f16x2 r1955, r1755, r1944; +} +{ +fma.rn.f16x2 
r1958, r1758, r1942, r1955; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +neg.f16x2 r1986, r1983; +} +{ +fma.rn.f16x2 r1988, r1767, r1979, r1986; +} +{ +mul.f16x2 r1992, r1767, r1981; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1992; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +neg.f16x2 r2023, r2020; +} +{ +fma.rn.f16x2 r2025, r1779, r2016, r2023; +} +{ +mul.f16x2 r2029, r1779, r2018; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2029; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; 
+cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +neg.f16x2 r2060, r2057; +} +{ +fma.rn.f16x2 r2062, r1701, r2053, r2060; +} +{ +mul.f16x2 r2066, r1701, r2055; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2066; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +neg.f16x2 r2097, r2094; +} +{ +fma.rn.f16x2 r2099, r1713, r2090, r2097; +} +{ +mul.f16x2 r2103, r1713, r2092; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2103; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, 
r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +neg.f16x2 r2134, r2131; +} +{ +fma.rn.f16x2 r2136, r1725, r2127, r2134; +} +{ +mul.f16x2 r2140, r1725, r2129; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2140; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +neg.f16x2 r2171, r2168; +} +{ +fma.rn.f16x2 r2173, r1737, r2164, r2171; +} +{ +mul.f16x2 r2177, r1737, r2166; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2177; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +neg.f16x2 r2208, r2205; +} +{ 
+fma.rn.f16x2 r2210, r1749, r2201, r2208; +} +{ +mul.f16x2 r2214, r1749, r2203; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +neg.f16x2 r2245, r2242; +} +{ +fma.rn.f16x2 r2247, r1761, r2238, r2245; +} +{ +mul.f16x2 r2251, r1761, r2240; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +neg.f16x2 r2282, r2279; +} +{ +fma.rn.f16x2 r2284, r1773, r2275, r2282; +} +{ +mul.f16x2 r2288, r1773, r2277; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +neg.f16x2 r2319, r2316; +} +{ +fma.rn.f16x2 r2321, r1785, r2312, r2319; +} +{ +mul.f16x2 r2325, r1785, r2314; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2325; +} +shl.b32 r3633, r3623, 3; +and.b32 r3634, r3633, 120; +add.s32 r3635, r3627, r3634; +barrier.sync 0; +and.b32 r3636, r3625, 63488; +add.s32 r3637, r3635, r3636; +st.shared.u32 [r3637], r1695; +st.shared.u32 [r3637+4], r1698; +st.shared.u32 [r3637+128], r1803; +st.shared.u32 [r3637+132], r1810; +st.shared.u32 [r3637+256], r1840; +st.shared.u32 [r3637+260], r1847; +st.shared.u32 [r3637+384], r1877; +st.shared.u32 [r3637+388], r1884; +st.shared.u32 [r3637+512], r1914; +st.shared.u32 [r3637+516], r1921; +st.shared.u32 [r3637+640], r1951; +st.shared.u32 [r3637+644], r1958; +st.shared.u32 [r3637+768], r1988; +st.shared.u32 [r3637+772], r1995; +st.shared.u32 [r3637+896], r2025; +st.shared.u32 [r3637+900], r2032; +st.shared.u32 [r3637+1024], r2062; +st.shared.u32 [r3637+1028], r2069; +st.shared.u32 [r3637+1152], r2099; +st.shared.u32 [r3637+1156], r2106; +st.shared.u32 [r3637+1280], r2136; +st.shared.u32 [r3637+1284], r2143; +st.shared.u32 [r3637+1408], r2173; +st.shared.u32 [r3637+1412], r2180; +st.shared.u32 [r3637+1536], r2210; +st.shared.u32 [r3637+1540], r2217; +st.shared.u32 [r3637+1664], r2247; +st.shared.u32 [r3637+1668], r2254; +st.shared.u32 [r3637+1792], r2284; +st.shared.u32 [r3637+1796], r2291; +st.shared.u32 [r3637+1920], r2321; 
+st.shared.u32 [r3637+1924], r2328; +barrier.sync 0; +mad.lo.s32 r3638, r3631, -120, r3637; +ld.shared.u32 r2350, [r3638]; +ld.shared.u32 r2353, [r3638+4]; +ld.shared.u32 r2546, [r3638+4096]; +ld.shared.u32 r2549, [r3638+4100]; +ld.shared.u32 r2400, [r3638+8192]; +ld.shared.u32 r2403, [r3638+8196]; +ld.shared.u32 r2596, [r3638+12288]; +ld.shared.u32 r2599, [r3638+12292]; +ld.shared.u32 r2362, [r3638+16384]; +ld.shared.u32 r2365, [r3638+16388]; +ld.shared.u32 r2558, [r3638+20480]; +ld.shared.u32 r2561, [r3638+20484]; +ld.shared.u32 r2412, [r3638+24576]; +ld.shared.u32 r2415, [r3638+24580]; +ld.shared.u32 r2608, [r3638+28672]; +ld.shared.u32 r2611, [r3638+28676]; +ld.shared.u32 r2351, [r3638+32768]; +ld.shared.u32 r2354, [r3638+32772]; +ld.shared.u32 r2547, [r3638+36864]; +ld.shared.u32 r2550, [r3638+36868]; +ld.shared.u32 r2401, [r3638+40960]; +ld.shared.u32 r2404, [r3638+40964]; +ld.shared.u32 r2597, [r3638+45056]; +ld.shared.u32 r2600, [r3638+45060]; +ld.shared.u32 r2363, [r3638+49152]; +ld.shared.u32 r2366, [r3638+49156]; +ld.shared.u32 r2559, [r3638+53248]; +ld.shared.u32 r2562, [r3638+53252]; +ld.shared.u32 r2413, [r3638+57344]; +ld.shared.u32 r2416, [r3638+57348]; +ld.shared.u32 r2609, [r3638+61440]; +ld.shared.u32 r2612, [r3638+61444]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2367; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2370; +} +{ +add.f16x2 r2390, r2358, r2373; +} +{ +sub.f16x2 r2393, r2355, r2370; +} +{ +sub.f16x2 r2396, r2358, r2373; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, 
r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2417; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2420; +} +{ +add.f16x2 r2440, r2408, r2423; +} +{ +sub.f16x2 r2443, r2405, r2420; +} +{ +sub.f16x2 r2446, r2408, r2423; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2431; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2434; +} +{ +add.f16x2 r2524, r2384, r2479; +} +{ +sub.f16x2 r2527, r2381, r2434; +} +{ +sub.f16x2 r2530, r2384, r2479; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ 
+sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2563; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2566; +} +{ +add.f16x2 r2586, r2554, r2569; +} +{ +sub.f16x2 r2589, r2551, r2566; +} +{ +sub.f16x2 r2592, r2554, r2569; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2613; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2616; +} +{ +add.f16x2 r2636, r2604, r2619; +} +{ +sub.f16x2 r2639, r2601, r2616; +} +{ +sub.f16x2 r2642, r2604, r2619; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2627; +} +{ 
+mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2630; +} +{ +add.f16x2 r2720, r2580, r2675; +} +{ +sub.f16x2 r2723, r2577, r2630; +} +{ +sub.f16x2 r2726, r2580, r2675; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f358; +cvt.rn.f16.f32 high, f358; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f362; +cvt.rn.f16.f32 high, f362; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f366; +cvt.rn.f16.f32 high, f366; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, 
f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2699; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ 
+add.f16x2 r2917, r2503, r2702; +} +{ +add.f16x2 r2920, r2506, r2819; +} +{ +sub.f16x2 r2923, r2503, r2702; +} +{ +sub.f16x2 r2926, r2506, r2819; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3639, r3623, 256; +bfe.u32 r3640, r3623, 8, 1; +cvt.rn.f32.u32 f457, r3640; +mul.f32 f458, f457, 0f3E490FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +neg.f16x2 r2975, r2972; +} +{ +fma.rn.f16x2 r2977, r2881, r2968, r2975; +} +{ +mul.f16x2 r2981, r2881, r2970; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ 
+neg.f16x2 r3012, r3009; +} +{ +fma.rn.f16x2 r3014, r2893, r3005, r3012; +} +{ +mul.f16x2 r3018, r2893, r3007; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +neg.f16x2 r3049, r3046; +} +{ +fma.rn.f16x2 r3051, r2905, r3042, r3049; +} +{ +mul.f16x2 r3055, r2905, r3044; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +neg.f16x2 r3086, r3083; +} +{ +fma.rn.f16x2 r3088, r2917, r3079, r3086; +} +{ +mul.f16x2 r3092, r2917, r3081; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +neg.f16x2 r3123, r3120; +} +{ +fma.rn.f16x2 r3125, r2929, r3116, r3123; +} +{ +mul.f16x2 r3129, r2929, r3118; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +neg.f16x2 r3160, r3157; +} +{ +fma.rn.f16x2 r3162, r2941, r3153, r3160; +} +{ +mul.f16x2 r3166, r2941, r3155; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +neg.f16x2 r3197, r3194; +} +{ +fma.rn.f16x2 r3199, r2953, r3190, r3197; +} +{ +mul.f16x2 r3203, r2953, r3192; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +neg.f16x2 r3234, r3231; +} +{ +fma.rn.f16x2 r3236, r2875, r3227, r3234; +} +{ +mul.f16x2 r3240, r2875, r3229; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +neg.f16x2 r3271, r3268; +} +{ +fma.rn.f16x2 r3273, r2887, r3264, r3271; +} +{ +mul.f16x2 r3277, r2887, r3266; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +neg.f16x2 r3308, r3305; +} +{ +fma.rn.f16x2 r3310, r2899, r3301, r3308; +} +{ +mul.f16x2 r3314, r2899, r3303; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +neg.f16x2 r3345, r3342; +} +{ +fma.rn.f16x2 r3347, r2911, r3338, r3345; +} +{ +mul.f16x2 r3351, r2911, r3340; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3351; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +neg.f16x2 r3382, r3379; +} +{ +fma.rn.f16x2 r3384, r2923, r3375, r3382; +} +{ +mul.f16x2 r3388, r2923, r3377; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3388; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +neg.f16x2 r3419, r3416; +} +{ +fma.rn.f16x2 r3421, r2935, r3412, r3419; +} +{ +mul.f16x2 r3425, r2935, r3414; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3425; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, 
{low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +neg.f16x2 r3456, r3453; +} +{ +fma.rn.f16x2 r3458, r2947, r3449, r3456; +} +{ +mul.f16x2 r3462, r2947, r3451; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3462; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +neg.f16x2 r3493, r3490; +} +{ +fma.rn.f16x2 r3495, r2959, r3486, r3493; +} +{ +mul.f16x2 r3499, r2959, r3488; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3499; +} +and.b32 r3641, r3633, 2040; +add.s32 r3642, r3627, r3641; +barrier.sync 0; +and.b32 r3643, r3625, 32768; +add.s32 r3644, r3642, r3643; +st.shared.u32 [r3644], r2869; +st.shared.u32 [r3644+4], r2872; +st.shared.u32 [r3644+2048], r2977; +st.shared.u32 [r3644+2052], r2984; +st.shared.u32 [r3644+4096], r3014; +st.shared.u32 [r3644+4100], r3021; +st.shared.u32 [r3644+6144], r3051; +st.shared.u32 [r3644+6148], r3058; +st.shared.u32 [r3644+8192], r3088; +st.shared.u32 [r3644+8196], r3095; +st.shared.u32 [r3644+10240], r3125; 
+st.shared.u32 [r3644+10244], r3132; +st.shared.u32 [r3644+12288], r3162; +st.shared.u32 [r3644+12292], r3169; +st.shared.u32 [r3644+14336], r3199; +st.shared.u32 [r3644+14340], r3206; +st.shared.u32 [r3644+16384], r3236; +st.shared.u32 [r3644+16388], r3243; +st.shared.u32 [r3644+18432], r3273; +st.shared.u32 [r3644+18436], r3280; +st.shared.u32 [r3644+20480], r3310; +st.shared.u32 [r3644+20484], r3317; +st.shared.u32 [r3644+22528], r3347; +st.shared.u32 [r3644+22532], r3354; +st.shared.u32 [r3644+24576], r3384; +st.shared.u32 [r3644+24580], r3391; +st.shared.u32 [r3644+26624], r3421; +st.shared.u32 [r3644+26628], r3428; +st.shared.u32 [r3644+28672], r3458; +st.shared.u32 [r3644+28676], r3465; +st.shared.u32 [r3644+30720], r3495; +st.shared.u32 [r3644+30724], r3502; +barrier.sync 0; +mad.lo.s32 r3645, r3639, -120, r3644; +ld.shared.u32 r3524, [r3645]; +ld.shared.u32 r3527, [r3645+4]; +ld.shared.u32 r3536, [r3645+4096]; +ld.shared.u32 r3539, [r3645+4100]; +ld.shared.u32 r3548, [r3645+8192]; +ld.shared.u32 r3551, [r3645+8196]; +ld.shared.u32 r3560, [r3645+12288]; +ld.shared.u32 r3563, [r3645+12292]; +ld.shared.u32 r3572, [r3645+16384]; +ld.shared.u32 r3575, [r3645+16388]; +ld.shared.u32 r3584, [r3645+20480]; +ld.shared.u32 r3587, [r3645+20484]; +ld.shared.u32 r3596, [r3645+24576]; +ld.shared.u32 r3599, [r3645+24580]; +ld.shared.u32 r3608, [r3645+28672]; +ld.shared.u32 r3611, [r3645+28676]; +ld.shared.u32 r3525, [r3645+32768]; +ld.shared.u32 r3528, [r3645+32772]; +ld.shared.u32 r3537, [r3645+36864]; +ld.shared.u32 r3540, [r3645+36868]; +ld.shared.u32 r3549, [r3645+40960]; +ld.shared.u32 r3552, [r3645+40964]; +ld.shared.u32 r3561, [r3645+45056]; +ld.shared.u32 r3564, [r3645+45060]; +ld.shared.u32 r3573, [r3645+49152]; +ld.shared.u32 r3576, [r3645+49156]; +ld.shared.u32 r3585, [r3645+53248]; +ld.shared.u32 r3588, [r3645+53252]; +ld.shared.u32 r3597, [r3645+57344]; +ld.shared.u32 r3600, [r3645+57348]; +ld.shared.u32 r3609, [r3645+61440]; +ld.shared.u32 r3612, 
[r3645+61444]; +{ +add.f16x2 %0, r3524, r3525; +} +{ +add.f16x2 %1, r3527, r3528; +} +{ +sub.f16x2 %16, r3524, r3525; +} +{ +sub.f16x2 %17, r3527, r3528; +} +{ +add.f16x2 %2, r3536, r3537; +} +{ +add.f16x2 %3, r3539, r3540; +} +{ +sub.f16x2 %18, r3536, r3537; +} +{ +sub.f16x2 %19, r3539, r3540; +} +{ +add.f16x2 %4, r3548, r3549; +} +{ +add.f16x2 %5, r3551, r3552; +} +{ +sub.f16x2 %20, r3548, r3549; +} +{ +sub.f16x2 %21, r3551, r3552; +} +{ +add.f16x2 %6, r3560, r3561; +} +{ +add.f16x2 %7, r3563, r3564; +} +{ +sub.f16x2 %22, r3560, r3561; +} +{ +sub.f16x2 %23, r3563, r3564; +} +{ +add.f16x2 %8, r3572, r3573; +} +{ +add.f16x2 %9, r3575, r3576; +} +{ +sub.f16x2 %24, r3572, r3573; +} +{ +sub.f16x2 %25, r3575, r3576; +} +{ +add.f16x2 %10, r3584, r3585; +} +{ +add.f16x2 %11, r3587, r3588; +} +{ +sub.f16x2 %26, r3584, r3585; +} +{ +sub.f16x2 %27, r3587, r3588; +} +{ +add.f16x2 %12, r3596, r3597; +} +{ +add.f16x2 %13, r3599, r3600; +} +{ +sub.f16x2 %28, r3596, r3597; +} +{ +sub.f16x2 %29, r3599, r3600; +} +{ +add.f16x2 %14, r3608, r3609; +} +{ +add.f16x2 %15, r3611, r3612; +} +{ +sub.f16x2 %30, r3608, r3609; +} +{ +sub.f16x2 %31, r3611, r3612; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), 
"=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<860, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<197>; +.reg .b32 r<1915>; +.reg .b64 rd<2>; +mov.u32 r1881, %tid.y; +shl.b32 r1882, r1881, 15; +mov.u32 r1883, %16; +add.s32 r1884, r1883, r1882; +mov.u32 r1885, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, 
r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f140, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r101, {low, high}; +} +mov.f32 f150, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f181, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f182, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 
r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1886, r1885, 1023; +shl.b32 r1887, r1885, 5; +and.b32 r1888, r1887, -32768; +add.s32 r1889, r1884, r1888; +cvt.rn.f32.u32 f185, r1886; +mul.f32 f186, f185, 0f3A490FDB; +cos.approx.f32 f29, f186; +sin.approx.f32 f187, f186; +neg.f32 f30, f187; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 
r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ +fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, 
{high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1890, r1887, 32736; +add.s32 r1891, r1889, r1890; +st.shared.v4.f32 [r1891], {r149, r209, r246, r283}; 
+st.shared.v4.f32 [r1891+16], {r320, r357, r394, r431}; +barrier.sync 0; +mad.lo.s32 r1892, r1886, -28, r1891; +ld.shared.u32 r460, [r1892]; +ld.shared.u32 r510, [r1892+4096]; +ld.shared.u32 r472, [r1892+8192]; +ld.shared.u32 r522, [r1892+12288]; +ld.shared.u32 r461, [r1892+16384]; +ld.shared.u32 r511, [r1892+20480]; +ld.shared.u32 r473, [r1892+24576]; +ld.shared.u32 r523, [r1892+28672]; +barrier.sync 0; +st.shared.v4.f32 [r1891], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1891+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1892]; +ld.shared.u32 r513, [r1892+4096]; +ld.shared.u32 r475, [r1892+8192]; +ld.shared.u32 r525, [r1892+12288]; +ld.shared.u32 r464, [r1892+16384]; +ld.shared.u32 r514, [r1892+20480]; +ld.shared.u32 r476, [r1892+24576]; +ld.shared.u32 r526, [r1892+28672]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1893, r1885, 1016; +bfe.u32 r1894, r1885, 3, 7; +shl.b32 r1895, r1885, 2; +and.b32 r1896, r1895, 28; +add.s32 r1897, r1889, r1896; +cvt.rn.f32.u32 f188, r1894; +mul.f32 f189, f188, 0f3BC90FDB; +cos.approx.f32 f75, f189; +sin.approx.f32 f190, f189; +neg.f32 f76, f190; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ 
+neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ +fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +barrier.sync 0; +and.b32 r1898, r1887, 32512; +add.s32 r1899, r1897, r1898; +st.shared.u32 [r1899], r607; +st.shared.u32 [r1899+32], r667; +st.shared.u32 [r1899+64], r704; +st.shared.u32 [r1899+96], r741; +st.shared.u32 [r1899+128], r778; +st.shared.u32 [r1899+160], r815; +st.shared.u32 [r1899+192], r852; +st.shared.u32 [r1899+224], r889; +barrier.sync 0; +mad.lo.s32 r1900, r1893, -28, r1899; +ld.shared.u32 r918, [r1900]; +ld.shared.u32 r968, [r1900+4096]; +ld.shared.u32 r930, [r1900+8192]; +ld.shared.u32 r980, [r1900+12288]; +ld.shared.u32 r919, [r1900+16384]; +ld.shared.u32 r969, [r1900+20480]; +ld.shared.u32 r931, [r1900+24576]; +ld.shared.u32 r981, [r1900+28672]; +barrier.sync 0; +st.shared.u32 [r1899], r610; +st.shared.u32 [r1899+32], r674; +st.shared.u32 [r1899+64], r711; +st.shared.u32 [r1899+96], r748; +st.shared.u32 [r1899+128], 
r785; +st.shared.u32 [r1899+160], r822; +st.shared.u32 [r1899+192], r859; +st.shared.u32 [r1899+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1900]; +ld.shared.u32 r971, [r1900+4096]; +ld.shared.u32 r933, [r1900+8192]; +ld.shared.u32 r983, [r1900+12288]; +ld.shared.u32 r922, [r1900+16384]; +ld.shared.u32 r972, [r1900+20480]; +ld.shared.u32 r934, [r1900+24576]; +ld.shared.u32 r984, [r1900+28672]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, 
f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1901, r1885, 960; +bfe.u32 r1902, r1885, 6, 4; +and.b32 r1903, r1895, 252; +add.s32 r1904, r1889, r1903; +cvt.rn.f32.u32 f191, r1902; +mul.f32 f192, f191, 0f3D490FDB; +cos.approx.f32 f121, f192; +sin.approx.f32 f193, f192; +neg.f32 f122, f193; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ 
+fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ 
+neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +barrier.sync 0; +and.b32 r1905, r1887, 30720; +add.s32 r1906, r1904, r1905; +st.shared.u32 [r1906], r1065; +st.shared.u32 [r1906+256], r1125; +st.shared.u32 [r1906+512], r1162; +st.shared.u32 [r1906+768], r1199; +st.shared.u32 [r1906+1024], r1236; +st.shared.u32 [r1906+1280], r1273; +st.shared.u32 [r1906+1536], r1310; +st.shared.u32 [r1906+1792], r1347; +barrier.sync 0; +mad.lo.s32 r1907, r1901, -28, r1906; +ld.shared.u32 r1376, [r1907]; +ld.shared.u32 r1426, [r1907+4096]; +ld.shared.u32 r1388, [r1907+8192]; +ld.shared.u32 r1438, [r1907+12288]; +ld.shared.u32 r1377, [r1907+16384]; +ld.shared.u32 r1427, [r1907+20480]; +ld.shared.u32 r1389, [r1907+24576]; +ld.shared.u32 r1439, [r1907+28672]; +barrier.sync 0; +st.shared.u32 [r1906], r1068; +st.shared.u32 [r1906+256], r1132; +st.shared.u32 [r1906+512], r1169; +st.shared.u32 [r1906+768], r1206; +st.shared.u32 [r1906+1024], r1243; +st.shared.u32 [r1906+1280], r1280; +st.shared.u32 [r1906+1536], r1317; 
+st.shared.u32 [r1906+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1907]; +ld.shared.u32 r1429, [r1907+4096]; +ld.shared.u32 r1391, [r1907+8192]; +ld.shared.u32 r1441, [r1907+12288]; +ld.shared.u32 r1380, [r1907+16384]; +ld.shared.u32 r1430, [r1907+20480]; +ld.shared.u32 r1392, [r1907+24576]; +ld.shared.u32 r1442, [r1907+28672]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1393; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1396; +} +{ +add.f16x2 r1416, r1384, r1399; +} +{ +sub.f16x2 r1419, r1381, r1396; +} +{ +sub.f16x2 r1422, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1446; +} +{ +add.f16x2 r1466, r1434, r1449; +} +{ +sub.f16x2 r1469, r1431, r1446; +} +{ +sub.f16x2 r1472, r1434, r1449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1457; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 r1523, r1401, r1451; +} +{ +add.f16x2 r1526, r1404, r1454; +} +{ +sub.f16x2 r1529, r1401, r1451; +} +{ +sub.f16x2 r1532, r1404, r1454; +} +{ +add.f16x2 r1535, r1413, r1495; +} +{ +add.f16x2 r1538, r1416, r1501; +} +{ +sub.f16x2 r1541, r1413, r1495; +} +{ +sub.f16x2 r1544, r1416, r1501; +} +{ +add.f16x2 r1547, r1407, r1460; +} +{ +add.f16x2 r1550, r1410, r1505; +} +{ +sub.f16x2 r1553, r1407, r1460; +} +{ +sub.f16x2 r1556, r1410, r1505; +} +{ +add.f16x2 r1559, r1419, r1513; +} +{ +add.f16x2 r1562, r1422, r1519; +} +{ +sub.f16x2 r1565, r1419, r1513; +} +{ +sub.f16x2 r1568, r1422, r1519; +} +and.b32 r1908, r1885, 512; +bfe.u32 r1909, r1885, 9, 1; +and.b32 r1910, r1895, 2044; +add.s32 r1911, r1889, r1910; +cvt.rn.f32.u32 f194, r1909; +mul.f32 f195, f194, 0f3EC90FDB; +cos.approx.f32 f167, f195; +sin.approx.f32 f196, f195; +neg.f32 f168, f196; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f167; +cvt.rn.f16.f32 high, f168; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1576, {high, high}; +} +{ +mul.f16x2 r1578, r1538, r1576; +} +{ +neg.f16x2 r1581, r1578; +} +{ +fma.rn.f16x2 r1583, r1535, r1574, r1581; +} +{ +mul.f16x2 r1587, r1535, r1576; +} +{ +fma.rn.f16x2 r1590, r1538, r1574, r1587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 
r1596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1598, {low, high}; +} +{ +mul.f16x2 r1599, r1596, r1598; +} +{ +mul.f16x2 r1602, r1571, r1594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1605, {high, low}; +} +{ +fma.rn.f16x2 r1607, r1599, r1605, r1602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1613, {high, high}; +} +{ +mul.f16x2 r1615, r1550, r1613; +} +{ +neg.f16x2 r1618, r1615; +} +{ +fma.rn.f16x2 r1620, r1547, r1611, r1618; +} +{ +mul.f16x2 r1624, r1547, r1613; +} +{ +fma.rn.f16x2 r1627, r1550, r1611, r1624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1635, {low, high}; +} +{ +mul.f16x2 r1636, r1633, r1635; +} +{ +mul.f16x2 r1639, r1607, r1631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1642, {high, low}; +} +{ +fma.rn.f16x2 r1644, r1636, r1642, r1639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1650, {high, high}; +} +{ +mul.f16x2 r1652, r1562, r1650; +} +{ +neg.f16x2 r1655, r1652; +} +{ +fma.rn.f16x2 r1657, r1559, r1648, r1655; +} +{ +mul.f16x2 r1661, r1559, r1650; +} +{ +fma.rn.f16x2 r1664, r1562, r1648, r1661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1672, {low, high}; +} +{ +mul.f16x2 r1673, r1670, r1672; +} +{ +mul.f16x2 r1676, r1644, r1668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1644; +mov.b32 r1679, {high, low}; +} +{ +fma.rn.f16x2 r1681, r1673, r1679, r1676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1687, {high, high}; +} +{ +mul.f16x2 r1689, r1532, r1687; +} +{ +neg.f16x2 r1692, r1689; +} +{ +fma.rn.f16x2 r1694, r1529, r1685, r1692; +} +{ +mul.f16x2 r1698, r1529, r1687; +} +{ +fma.rn.f16x2 r1701, r1532, r1685, r1698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1707, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1709, {low, high}; +} +{ +mul.f16x2 r1710, r1707, r1709; +} +{ +mul.f16x2 r1713, r1681, r1705; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1716, {high, low}; +} +{ +fma.rn.f16x2 r1718, r1710, r1716, r1713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1724, {high, high}; +} +{ +mul.f16x2 r1726, r1544, r1724; +} +{ +neg.f16x2 r1729, r1726; +} +{ +fma.rn.f16x2 r1731, r1541, r1722, r1729; +} +{ +mul.f16x2 r1735, r1541, r1724; +} +{ +fma.rn.f16x2 r1738, r1544, r1722, r1735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1744, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1744, r1746; +} +{ +mul.f16x2 r1750, r1718, r1742; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1753, {high, low}; +} +{ +fma.rn.f16x2 r1755, r1747, r1753, r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1761, {high, high}; +} +{ 
+mul.f16x2 r1763, r1556, r1761; +} +{ +neg.f16x2 r1766, r1763; +} +{ +fma.rn.f16x2 r1768, r1553, r1759, r1766; +} +{ +mul.f16x2 r1772, r1553, r1761; +} +{ +fma.rn.f16x2 r1775, r1556, r1759, r1772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1781, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1781, r1783; +} +{ +mul.f16x2 r1787, r1755, r1779; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1790, {high, low}; +} +{ +fma.rn.f16x2 r1792, r1784, r1790, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1798, {high, high}; +} +{ +mul.f16x2 r1800, r1568, r1798; +} +{ +neg.f16x2 r1803, r1800; +} +{ +fma.rn.f16x2 r1805, r1565, r1796, r1803; +} +{ +mul.f16x2 r1809, r1565, r1798; +} +{ +fma.rn.f16x2 r1812, r1568, r1796, r1809; +} +barrier.sync 0; +and.b32 r1912, r1887, 16384; +add.s32 r1913, r1911, r1912; +st.shared.u32 [r1913], r1523; +st.shared.u32 [r1913+2048], r1583; +st.shared.u32 [r1913+4096], r1620; +st.shared.u32 [r1913+6144], r1657; +st.shared.u32 [r1913+8192], r1694; +st.shared.u32 [r1913+10240], r1731; +st.shared.u32 [r1913+12288], r1768; +st.shared.u32 [r1913+14336], r1805; +barrier.sync 0; +mad.lo.s32 r1914, r1908, -28, r1913; +ld.shared.u32 r1834, [r1914]; +ld.shared.u32 r1846, [r1914+4096]; +ld.shared.u32 r1858, [r1914+8192]; +ld.shared.u32 r1870, [r1914+12288]; +ld.shared.u32 r1835, [r1914+16384]; +ld.shared.u32 r1847, [r1914+20480]; +ld.shared.u32 r1859, [r1914+24576]; +ld.shared.u32 r1871, [r1914+28672]; +barrier.sync 0; +st.shared.u32 [r1913], r1526; +st.shared.u32 [r1913+2048], r1590; +st.shared.u32 [r1913+4096], r1627; +st.shared.u32 [r1913+6144], r1664; +st.shared.u32 [r1913+8192], r1701; +st.shared.u32 [r1913+10240], 
r1738; +st.shared.u32 [r1913+12288], r1775; +st.shared.u32 [r1913+14336], r1812; +barrier.sync 0; +ld.shared.u32 r1837, [r1914]; +ld.shared.u32 r1849, [r1914+4096]; +ld.shared.u32 r1861, [r1914+8192]; +ld.shared.u32 r1873, [r1914+12288]; +ld.shared.u32 r1838, [r1914+16384]; +ld.shared.u32 r1850, [r1914+20480]; +ld.shared.u32 r1862, [r1914+24576]; +ld.shared.u32 r1874, [r1914+28672]; +{ +add.f16x2 %0, r1834, r1835; +} +{ +add.f16x2 %1, r1837, r1838; +} +{ +sub.f16x2 %8, r1834, r1835; +} +{ +sub.f16x2 %9, r1837, r1838; +} +{ +add.f16x2 %2, r1846, r1847; +} +{ +add.f16x2 %3, r1849, r1850; +} +{ +sub.f16x2 %10, r1846, r1847; +} +{ +sub.f16x2 %11, r1849, r1850; +} +{ +add.f16x2 %4, r1858, r1859; +} +{ +add.f16x2 %5, r1861, r1862; +} +{ +sub.f16x2 %12, r1858, r1859; +} +{ +sub.f16x2 %13, r1861, r1862; +} +{ +add.f16x2 %6, r1870, r1871; +} +{ +add.f16x2 %7, r1873, r1874; +} +{ +sub.f16x2 %14, r1870, r1871; +} +{ +sub.f16x2 %15, r1873, r1874; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<858, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<197>; +.reg .b32 r<1915>; +.reg .b64 rd<2>; +mov.u32 r1881, %tid.y; +shl.b32 r1882, r1881, 16; +mov.u32 r1883, %16; +add.s32 r1884, r1883, r1882; +mov.u32 r1885, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f32 f140, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r101, {low, high}; +} +mov.f32 f150, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f181, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f182, 0f3F800000; +{ +mul.f16x2 r115, r89, r101; 
+} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r83; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r86; +} +{ +add.f16x2 r176, r36, r131; +} +{ +sub.f16x2 r179, r33, r86; +} +{ +sub.f16x2 r182, r36, r131; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1886, r1885, 1023; +shl.b32 r1887, r1885, 6; +and.b32 r1888, r1887, -65536; +add.s32 r1889, r1884, r1888; +cvt.rn.f32.u32 f185, r1886; +mul.f32 f186, f185, 0f3A490FDB; +cos.approx.f32 f29, f186; +sin.approx.f32 f187, f186; +neg.f32 f30, f187; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +neg.f16x2 r207, r204; +} +{ +fma.rn.f16x2 r209, r161, r200, r207; +} +{ +mul.f16x2 r213, r161, r202; +} +{ +fma.rn.f16x2 r216, r164, r200, r213; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +neg.f16x2 r244, r241; +} +{ +fma.rn.f16x2 r246, r173, r237, r244; +} +{ +mul.f16x2 r250, r173, r239; +} +{ +fma.rn.f16x2 r253, r176, r237, r250; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +neg.f16x2 r281, r278; +} +{ +fma.rn.f16x2 r283, r185, r274, r281; +} +{ +mul.f16x2 r287, r185, r276; +} +{ +fma.rn.f16x2 r290, r188, r274, r287; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +neg.f16x2 r318, r315; +} +{ 
+fma.rn.f16x2 r320, r155, r311, r318; +} +{ +mul.f16x2 r324, r155, r313; +} +{ +fma.rn.f16x2 r327, r158, r311, r324; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +neg.f16x2 r355, r352; +} +{ +fma.rn.f16x2 r357, r167, r348, r355; +} +{ +mul.f16x2 r361, r167, r350; +} +{ +fma.rn.f16x2 r364, r170, r348, r361; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +neg.f16x2 r392, r389; +} +{ +fma.rn.f16x2 r394, r179, r385, r392; +} +{ +mul.f16x2 r398, r179, r387; +} +{ +fma.rn.f16x2 r401, r182, r385, r398; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r191, r422, r429; +} +{ +mul.f16x2 r435, r191, r424; +} +{ +fma.rn.f16x2 r438, r194, r422, r435; +} +barrier.sync 0; +and.b32 r1890, r1887, 65472; +add.s32 r1891, r1889, r1890; +st.shared.v4.f32 [r1891], {r149, r152, r209, r216}; +st.shared.v4.f32 [r1891+16], {r246, r253, r283, r290}; +st.shared.v4.f32 [r1891+32], {r320, r327, r357, r364}; +st.shared.v4.f32 [r1891+48], {r394, r401, r431, r438}; +barrier.sync 0; +mad.lo.s32 r1892, r1886, -56, r1891; +ld.shared.u32 r460, [r1892]; +ld.shared.u32 r463, [r1892+4]; +ld.shared.u32 r510, [r1892+8192]; +ld.shared.u32 r513, [r1892+8196]; +ld.shared.u32 r472, [r1892+16384]; +ld.shared.u32 r475, [r1892+16388]; +ld.shared.u32 r522, [r1892+24576]; +ld.shared.u32 r525, [r1892+24580]; +ld.shared.u32 r461, [r1892+32768]; +ld.shared.u32 r464, [r1892+32772]; +ld.shared.u32 r511, [r1892+40960]; +ld.shared.u32 r514, [r1892+40964]; +ld.shared.u32 r473, [r1892+49152]; +ld.shared.u32 r476, [r1892+49156]; +ld.shared.u32 r523, [r1892+57344]; +ld.shared.u32 r526, [r1892+57348]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r477; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 
r497, r465, r480; +} +{ +add.f16x2 r500, r468, r483; +} +{ +sub.f16x2 r503, r465, r480; +} +{ +sub.f16x2 r506, r468, r483; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r527; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r530; +} +{ +add.f16x2 r550, r518, r533; +} +{ +sub.f16x2 r553, r515, r530; +} +{ +sub.f16x2 r556, r518, r533; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r541; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r544; +} +{ +add.f16x2 r634, r494, r589; +} +{ +sub.f16x2 r637, r491, r544; +} +{ +sub.f16x2 r640, r494, r589; +} +{ +add.f16x2 r643, 
r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1893, r1885, 1016; +bfe.u32 r1894, r1885, 3, 7; +cvt.rn.f32.u32 f188, r1894; +mul.f32 f189, f188, 0f3BC90FDB; +cos.approx.f32 f75, f189; +sin.approx.f32 f190, f189; +neg.f32 f76, f190; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +neg.f16x2 r665, r662; +} +{ +fma.rn.f16x2 r667, r619, r658, r665; +} +{ +mul.f16x2 r671, r619, r660; +} +{ +fma.rn.f16x2 r674, r622, r658, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +neg.f16x2 r702, r699; +} +{ +fma.rn.f16x2 r704, r631, r695, r702; +} +{ +mul.f16x2 r708, r631, r697; +} +{ +fma.rn.f16x2 r711, r634, r695, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +neg.f16x2 r739, r736; +} +{ +fma.rn.f16x2 r741, r643, r732, r739; +} +{ +mul.f16x2 r745, r643, r734; +} +{ +fma.rn.f16x2 r748, r646, r732, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +neg.f16x2 r776, r773; +} +{ +fma.rn.f16x2 r778, r613, r769, r776; +} +{ +mul.f16x2 r782, r613, r771; +} +{ +fma.rn.f16x2 r785, r616, r769, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +neg.f16x2 r813, r810; +} +{ 
+fma.rn.f16x2 r815, r625, r806, r813; +} +{ +mul.f16x2 r819, r625, r808; +} +{ +fma.rn.f16x2 r822, r628, r806, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +neg.f16x2 r850, r847; +} +{ +fma.rn.f16x2 r852, r637, r843, r850; +} +{ +mul.f16x2 r856, r637, r845; +} +{ +fma.rn.f16x2 r859, r640, r843, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +neg.f16x2 r887, r884; +} +{ +fma.rn.f16x2 r889, r649, r880, r887; +} +{ +mul.f16x2 r893, r649, r882; +} +{ +fma.rn.f16x2 r896, r652, r880, r893; +} +shl.b32 r1895, r1885, 3; +and.b32 r1896, r1895, 56; +add.s32 r1897, r1889, r1896; +barrier.sync 0; +and.b32 r1898, r1887, 65024; +add.s32 r1899, r1897, r1898; +st.shared.u32 [r1899], r607; +st.shared.u32 [r1899+4], 
r610; +st.shared.u32 [r1899+64], r667; +st.shared.u32 [r1899+68], r674; +st.shared.u32 [r1899+128], r704; +st.shared.u32 [r1899+132], r711; +st.shared.u32 [r1899+192], r741; +st.shared.u32 [r1899+196], r748; +st.shared.u32 [r1899+256], r778; +st.shared.u32 [r1899+260], r785; +st.shared.u32 [r1899+320], r815; +st.shared.u32 [r1899+324], r822; +st.shared.u32 [r1899+384], r852; +st.shared.u32 [r1899+388], r859; +st.shared.u32 [r1899+448], r889; +st.shared.u32 [r1899+452], r896; +barrier.sync 0; +mad.lo.s32 r1900, r1893, -56, r1899; +ld.shared.u32 r918, [r1900]; +ld.shared.u32 r921, [r1900+4]; +ld.shared.u32 r968, [r1900+8192]; +ld.shared.u32 r971, [r1900+8196]; +ld.shared.u32 r930, [r1900+16384]; +ld.shared.u32 r933, [r1900+16388]; +ld.shared.u32 r980, [r1900+24576]; +ld.shared.u32 r983, [r1900+24580]; +ld.shared.u32 r919, [r1900+32768]; +ld.shared.u32 r922, [r1900+32772]; +ld.shared.u32 r969, [r1900+40960]; +ld.shared.u32 r972, [r1900+40964]; +ld.shared.u32 r931, [r1900+49152]; +ld.shared.u32 r934, [r1900+49156]; +ld.shared.u32 r981, [r1900+57344]; +ld.shared.u32 r984, [r1900+57348]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r935; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r938; +} +{ +add.f16x2 r958, r926, r941; +} +{ +sub.f16x2 r961, r923, r938; +} +{ +sub.f16x2 r964, r926, r941; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r985; +} +{ 
+add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r988; +} +{ +add.f16x2 r1008, r976, r991; +} +{ +sub.f16x2 r1011, r973, r988; +} +{ +sub.f16x2 r1014, r976, r991; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r999; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1002; +} +{ +add.f16x2 r1092, r952, r1047; +} +{ +sub.f16x2 r1095, r949, r1002; +} +{ +sub.f16x2 r1098, r952, r1047; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1901, r1885, 960; +bfe.u32 r1902, r1885, 6, 4; +cvt.rn.f32.u32 f191, r1902; +mul.f32 f192, f191, 0f3D490FDB; +cos.approx.f32 f121, f192; +sin.approx.f32 f193, f192; +neg.f32 f122, f193; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; 
+cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +neg.f16x2 r1123, r1120; +} +{ +fma.rn.f16x2 r1125, r1077, r1116, r1123; +} +{ +mul.f16x2 r1129, r1077, r1118; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +neg.f16x2 r1160, r1157; +} +{ +fma.rn.f16x2 r1162, r1089, r1153, r1160; +} +{ +mul.f16x2 r1166, r1089, r1155; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1166; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, 
r1192; +} +{ +neg.f16x2 r1197, r1194; +} +{ +fma.rn.f16x2 r1199, r1101, r1190, r1197; +} +{ +mul.f16x2 r1203, r1101, r1192; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1203; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +neg.f16x2 r1234, r1231; +} +{ +fma.rn.f16x2 r1236, r1071, r1227, r1234; +} +{ +mul.f16x2 r1240, r1071, r1229; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1240; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +neg.f16x2 r1271, r1268; +} +{ +fma.rn.f16x2 r1273, r1083, r1264, r1271; +} +{ +mul.f16x2 r1277, r1083, r1266; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +neg.f16x2 r1308, r1305; +} +{ +fma.rn.f16x2 r1310, r1095, r1301, r1308; +} +{ +mul.f16x2 r1314, r1095, r1303; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1314; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +neg.f16x2 r1345, r1342; +} +{ +fma.rn.f16x2 r1347, r1107, r1338, r1345; +} +{ +mul.f16x2 r1351, r1107, r1340; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1351; +} +and.b32 r1903, r1895, 504; +add.s32 r1904, r1889, r1903; +barrier.sync 0; +and.b32 r1905, r1887, 61440; +add.s32 r1906, r1904, r1905; +st.shared.u32 [r1906], r1065; +st.shared.u32 [r1906+4], r1068; +st.shared.u32 [r1906+512], r1125; +st.shared.u32 [r1906+516], r1132; +st.shared.u32 [r1906+1024], r1162; +st.shared.u32 [r1906+1028], r1169; 
+st.shared.u32 [r1906+1536], r1199; +st.shared.u32 [r1906+1540], r1206; +st.shared.u32 [r1906+2048], r1236; +st.shared.u32 [r1906+2052], r1243; +st.shared.u32 [r1906+2560], r1273; +st.shared.u32 [r1906+2564], r1280; +st.shared.u32 [r1906+3072], r1310; +st.shared.u32 [r1906+3076], r1317; +st.shared.u32 [r1906+3584], r1347; +st.shared.u32 [r1906+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1907, r1901, -56, r1906; +ld.shared.u32 r1376, [r1907]; +ld.shared.u32 r1379, [r1907+4]; +ld.shared.u32 r1426, [r1907+8192]; +ld.shared.u32 r1429, [r1907+8196]; +ld.shared.u32 r1388, [r1907+16384]; +ld.shared.u32 r1391, [r1907+16388]; +ld.shared.u32 r1438, [r1907+24576]; +ld.shared.u32 r1441, [r1907+24580]; +ld.shared.u32 r1377, [r1907+32768]; +ld.shared.u32 r1380, [r1907+32772]; +ld.shared.u32 r1427, [r1907+40960]; +ld.shared.u32 r1430, [r1907+40964]; +ld.shared.u32 r1389, [r1907+49152]; +ld.shared.u32 r1392, [r1907+49156]; +ld.shared.u32 r1439, [r1907+57344]; +ld.shared.u32 r1442, [r1907+57348]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1393; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1396; +} +{ +add.f16x2 r1416, r1384, r1399; +} +{ +sub.f16x2 r1419, r1381, r1396; +} +{ +sub.f16x2 r1422, r1384, r1399; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1443; +} +{ +add.f16x2 r1451, r1425, 
r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1446; +} +{ +add.f16x2 r1466, r1434, r1449; +} +{ +sub.f16x2 r1469, r1431, r1446; +} +{ +sub.f16x2 r1472, r1434, r1449; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f140; +cvt.rn.f16.f32 high, f140; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1457; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 r1523, r1401, r1451; +} +{ +add.f16x2 r1526, r1404, r1454; +} +{ +sub.f16x2 r1529, r1401, r1451; +} +{ +sub.f16x2 r1532, r1404, r1454; +} +{ +add.f16x2 r1535, r1413, r1495; +} +{ +add.f16x2 r1538, r1416, r1501; +} +{ +sub.f16x2 r1541, r1413, r1495; +} +{ +sub.f16x2 r1544, r1416, r1501; +} +{ +add.f16x2 r1547, r1407, r1460; +} +{ +add.f16x2 r1550, r1410, r1505; +} +{ +sub.f16x2 r1553, r1407, r1460; +} +{ +sub.f16x2 r1556, r1410, r1505; +} +{ +add.f16x2 r1559, r1419, r1513; +} +{ +add.f16x2 r1562, r1422, r1519; +} +{ +sub.f16x2 r1565, r1419, r1513; +} +{ +sub.f16x2 r1568, r1422, r1519; +} +and.b32 r1908, r1885, 512; +bfe.u32 r1909, r1885, 9, 1; +cvt.rn.f32.u32 f194, r1909; +mul.f32 f195, f194, 0f3EC90FDB; +cos.approx.f32 f167, f195; +sin.approx.f32 f196, f195; +neg.f32 f168, f196; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f167; +cvt.rn.f16.f32 high, f168; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1576, {high, high}; +} +{ +mul.f16x2 r1578, r1538, r1576; +} +{ +neg.f16x2 r1581, r1578; +} +{ +fma.rn.f16x2 r1583, r1535, r1574, r1581; +} +{ +mul.f16x2 r1587, r1535, r1576; +} +{ +fma.rn.f16x2 r1590, r1538, r1574, r1587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1598, {low, high}; +} +{ +mul.f16x2 r1599, r1596, r1598; +} +{ +mul.f16x2 r1602, r1571, r1594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1605, {high, low}; +} +{ +fma.rn.f16x2 r1607, r1599, r1605, r1602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1613, {high, high}; +} +{ +mul.f16x2 r1615, r1550, r1613; +} +{ +neg.f16x2 r1618, r1615; +} +{ +fma.rn.f16x2 r1620, r1547, r1611, r1618; +} +{ +mul.f16x2 r1624, r1547, r1613; +} +{ +fma.rn.f16x2 r1627, r1550, r1611, r1624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1635, {low, high}; +} +{ +mul.f16x2 r1636, r1633, r1635; +} +{ +mul.f16x2 r1639, r1607, r1631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1642, {high, low}; +} +{ +fma.rn.f16x2 r1644, r1636, r1642, r1639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1650, {high, high}; +} +{ +mul.f16x2 
r1652, r1562, r1650; +} +{ +neg.f16x2 r1655, r1652; +} +{ +fma.rn.f16x2 r1657, r1559, r1648, r1655; +} +{ +mul.f16x2 r1661, r1559, r1650; +} +{ +fma.rn.f16x2 r1664, r1562, r1648, r1661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1672, {low, high}; +} +{ +mul.f16x2 r1673, r1670, r1672; +} +{ +mul.f16x2 r1676, r1644, r1668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1679, {high, low}; +} +{ +fma.rn.f16x2 r1681, r1673, r1679, r1676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1687, {high, high}; +} +{ +mul.f16x2 r1689, r1532, r1687; +} +{ +neg.f16x2 r1692, r1689; +} +{ +fma.rn.f16x2 r1694, r1529, r1685, r1692; +} +{ +mul.f16x2 r1698, r1529, r1687; +} +{ +fma.rn.f16x2 r1701, r1532, r1685, r1698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1707, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1709, {low, high}; +} +{ +mul.f16x2 r1710, r1707, r1709; +} +{ +mul.f16x2 r1713, r1681, r1705; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1716, {high, low}; +} +{ +fma.rn.f16x2 r1718, r1710, r1716, r1713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1724, {high, high}; +} +{ +mul.f16x2 r1726, r1544, r1724; +} +{ +neg.f16x2 r1729, r1726; +} +{ +fma.rn.f16x2 r1731, r1541, r1722, r1729; +} +{ +mul.f16x2 r1735, r1541, r1724; +} +{ +fma.rn.f16x2 r1738, r1544, r1722, r1735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1742, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1744, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1744, r1746; +} +{ +mul.f16x2 r1750, r1718, r1742; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1753, {high, low}; +} +{ +fma.rn.f16x2 r1755, r1747, r1753, r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1761, {high, high}; +} +{ +mul.f16x2 r1763, r1556, r1761; +} +{ +neg.f16x2 r1766, r1763; +} +{ +fma.rn.f16x2 r1768, r1553, r1759, r1766; +} +{ +mul.f16x2 r1772, r1553, r1761; +} +{ +fma.rn.f16x2 r1775, r1556, r1759, r1772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1781, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1781, r1783; +} +{ +mul.f16x2 r1787, r1755, r1779; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1790, {high, low}; +} +{ +fma.rn.f16x2 r1792, r1784, r1790, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1798, {high, high}; +} +{ +mul.f16x2 r1800, r1568, r1798; +} +{ +neg.f16x2 r1803, r1800; +} +{ +fma.rn.f16x2 r1805, r1565, r1796, r1803; +} +{ +mul.f16x2 r1809, r1565, r1798; +} +{ +fma.rn.f16x2 r1812, r1568, r1796, r1809; +} +and.b32 r1910, r1895, 4088; +add.s32 r1911, r1889, r1910; +barrier.sync 0; +and.b32 r1912, r1887, 32768; +add.s32 r1913, r1911, r1912; +st.shared.u32 [r1913], r1523; +st.shared.u32 [r1913+4], r1526; +st.shared.u32 [r1913+4096], r1583; +st.shared.u32 [r1913+4100], r1590; +st.shared.u32 [r1913+8192], r1620; +st.shared.u32 
[r1913+8196], r1627; +st.shared.u32 [r1913+12288], r1657; +st.shared.u32 [r1913+12292], r1664; +st.shared.u32 [r1913+16384], r1694; +st.shared.u32 [r1913+16388], r1701; +st.shared.u32 [r1913+20480], r1731; +st.shared.u32 [r1913+20484], r1738; +st.shared.u32 [r1913+24576], r1768; +st.shared.u32 [r1913+24580], r1775; +st.shared.u32 [r1913+28672], r1805; +st.shared.u32 [r1913+28676], r1812; +barrier.sync 0; +mad.lo.s32 r1914, r1908, -56, r1913; +ld.shared.u32 r1834, [r1914]; +ld.shared.u32 r1837, [r1914+4]; +ld.shared.u32 r1846, [r1914+8192]; +ld.shared.u32 r1849, [r1914+8196]; +ld.shared.u32 r1858, [r1914+16384]; +ld.shared.u32 r1861, [r1914+16388]; +ld.shared.u32 r1870, [r1914+24576]; +ld.shared.u32 r1873, [r1914+24580]; +ld.shared.u32 r1835, [r1914+32768]; +ld.shared.u32 r1838, [r1914+32772]; +ld.shared.u32 r1847, [r1914+40960]; +ld.shared.u32 r1850, [r1914+40964]; +ld.shared.u32 r1859, [r1914+49152]; +ld.shared.u32 r1862, [r1914+49156]; +ld.shared.u32 r1871, [r1914+57344]; +ld.shared.u32 r1874, [r1914+57348]; +{ +add.f16x2 %0, r1834, r1835; +} +{ +add.f16x2 %1, r1837, r1838; +} +{ +sub.f16x2 %8, r1834, r1835; +} +{ +sub.f16x2 %9, r1837, r1838; +} +{ +add.f16x2 %2, r1846, r1847; +} +{ +add.f16x2 %3, r1849, r1850; +} +{ +sub.f16x2 %10, r1846, r1847; +} +{ +sub.f16x2 %11, r1849, r1850; +} +{ +add.f16x2 %4, r1858, r1859; +} +{ +add.f16x2 %5, r1861, r1862; +} +{ +sub.f16x2 %12, r1858, r1859; +} +{ +sub.f16x2 %13, r1861, r1862; +} +{ +add.f16x2 %6, r1870, r1871; +} +{ +add.f16x2 %7, r1873, r1874; +} +{ +sub.f16x2 %14, r1870, r1871; +} +{ +sub.f16x2 %15, r1873, r1874; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), 
"=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ccd7cbecf1d0b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp16_inv.hpp.inc @@ -0,0 +1,31188 @@ +#ifndef CUFFTDX_FFT_8192_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_8192_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1058, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<965>; +.reg .b32 r<6595>; +.reg .b64 rd<3>; +mov.u32 r6509, %tid.y; +shl.b32 r6510, r6509, 16; +mov.u32 r6511, %64; +add.s32 r6512, r6511, r6510; +mov.u32 r6513, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %109, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %109, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} 
+{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f940, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r102, {low, high}; +} +mov.f32 f938, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ 
+add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %107; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %107; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 
r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, 
r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %106, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ +sub.f16x2 r688, %106, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} 
+{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %108, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %108, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, %98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, 
r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ +sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; 
+cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, 
r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ 
+neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} +{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 
r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ +sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6515, r6513, 8; +and.b32 r6516, r6515, -65536; +add.s32 r6517, r6512, r6516; +and.b32 r6530, r6513, 255; +cvt.rn.f32.u32 f957, r6530; +mul.f32 f958, f957, 0f3A490FDB; +cos.approx.f32 f357, 
f958; +sin.approx.f32 f959, f958; +neg.f32 f358, f959; +mov.f32 f964, 0f3F800000; +mov.f32 f963, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ 
+neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} 
+{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ 
+fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2554, 
{low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ +fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} 
+{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6518, r6515, 65280; +add.s32 r6519, r6517, r6518; +st.shared.v4.f32 [r6519], {r1521, r1524, r1723, r1732}; +st.shared.v4.f32 [r6519+16], {r1760, r1769, r1797, r1806}; +st.shared.v4.f32 [r6519+32], {r1834, r1843, r1871, r1880}; +st.shared.v4.f32 [r6519+48], {r1908, r1917, r1945, r1954}; +st.shared.v4.f32 [r6519+64], {r1982, r1991, r2019, r2028}; +st.shared.v4.f32 [r6519+80], {r2056, r2065, r2093, r2102}; +st.shared.v4.f32 [r6519+96], {r2130, r2139, r2167, r2176}; +st.shared.v4.f32 [r6519+112], {r2204, r2213, r2241, r2250}; +st.shared.v4.f32 [r6519+128], {r2278, r2287, r2315, r2324}; +st.shared.v4.f32 [r6519+144], {r2352, r2361, r2389, r2398}; +st.shared.v4.f32 [r6519+160], {r2426, r2435, r2463, r2472}; +st.shared.v4.f32 [r6519+176], {r2500, r2509, r2537, r2546}; +st.shared.v4.f32 [r6519+192], {r2574, r2583, r2611, r2620}; +st.shared.v4.f32 [r6519+208], {r2648, r2657, r2685, r2694}; +st.shared.v4.f32 [r6519+224], {r2722, r2731, r2759, r2768}; +st.shared.v4.f32 [r6519+240], {r2796, r2805, r2833, r2842}; +barrier.sync 0; +mad.lo.s32 r6520, r6530, -248, r6519; +ld.shared.u32 r2864, [r6520]; +ld.shared.u32 r2867, [r6520+4]; +ld.shared.u32 r3480, [r6520+2048]; +ld.shared.u32 r3483, [r6520+2052]; +ld.shared.u32 r3060, [r6520+4096]; +ld.shared.u32 r3063, [r6520+4100]; 
+ld.shared.u32 r3676, [r6520+6144]; +ld.shared.u32 r3679, [r6520+6148]; +ld.shared.u32 r2914, [r6520+8192]; +ld.shared.u32 r2917, [r6520+8196]; +ld.shared.u32 r3530, [r6520+10240]; +ld.shared.u32 r3533, [r6520+10244]; +ld.shared.u32 r3110, [r6520+12288]; +ld.shared.u32 r3113, [r6520+12292]; +ld.shared.u32 r3726, [r6520+14336]; +ld.shared.u32 r3729, [r6520+14340]; +ld.shared.u32 r2876, [r6520+16384]; +ld.shared.u32 r2879, [r6520+16388]; +ld.shared.u32 r3492, [r6520+18432]; +ld.shared.u32 r3495, [r6520+18436]; +ld.shared.u32 r3072, [r6520+20480]; +ld.shared.u32 r3075, [r6520+20484]; +ld.shared.u32 r3688, [r6520+22528]; +ld.shared.u32 r3691, [r6520+22532]; +ld.shared.u32 r2926, [r6520+24576]; +ld.shared.u32 r2929, [r6520+24580]; +ld.shared.u32 r3542, [r6520+26624]; +ld.shared.u32 r3545, [r6520+26628]; +ld.shared.u32 r3122, [r6520+28672]; +ld.shared.u32 r3125, [r6520+28676]; +ld.shared.u32 r3738, [r6520+30720]; +ld.shared.u32 r3741, [r6520+30724]; +ld.shared.u32 r2865, [r6520+32768]; +ld.shared.u32 r2868, [r6520+32772]; +ld.shared.u32 r3481, [r6520+34816]; +ld.shared.u32 r3484, [r6520+34820]; +ld.shared.u32 r3061, [r6520+36864]; +ld.shared.u32 r3064, [r6520+36868]; +ld.shared.u32 r3677, [r6520+38912]; +ld.shared.u32 r3680, [r6520+38916]; +ld.shared.u32 r2915, [r6520+40960]; +ld.shared.u32 r2918, [r6520+40964]; +ld.shared.u32 r3531, [r6520+43008]; +ld.shared.u32 r3534, [r6520+43012]; +ld.shared.u32 r3111, [r6520+45056]; +ld.shared.u32 r3114, [r6520+45060]; +ld.shared.u32 r3727, [r6520+47104]; +ld.shared.u32 r3730, [r6520+47108]; +ld.shared.u32 r2877, [r6520+49152]; +ld.shared.u32 r2880, [r6520+49156]; +ld.shared.u32 r3493, [r6520+51200]; +ld.shared.u32 r3496, [r6520+51204]; +ld.shared.u32 r3073, [r6520+53248]; +ld.shared.u32 r3076, [r6520+53252]; +ld.shared.u32 r3689, [r6520+55296]; +ld.shared.u32 r3692, [r6520+55300]; +ld.shared.u32 r2927, [r6520+57344]; +ld.shared.u32 r2930, [r6520+57348]; +ld.shared.u32 r3543, [r6520+59392]; +ld.shared.u32 r3546, [r6520+59396]; 
+ld.shared.u32 r3123, [r6520+61440]; +ld.shared.u32 r3126, [r6520+61444]; +ld.shared.u32 r3739, [r6520+63488]; +ld.shared.u32 r3742, [r6520+63492]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ 
+mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ +sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, 
r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; +} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 
r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ +mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, 
r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ +sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, 
r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 
r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, 
r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; 
+mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; +} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, 
r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; 
+cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; 
+} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ 
+mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, 
r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} +{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +bfe.u32 r6522, r6513, 5, 3; +cvt.rn.f32.u32 f960, r6522; +mul.f32 f961, f960, 0f3CC90FDB; +cos.approx.f32 f779, f961; +sin.approx.f32 f962, f961; +neg.f32 f780, f962; +and.b32 r6529, r6513, 224; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4639, {low, high}; +} +{ +mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, 
r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; 
+mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ +fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ +fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, 
r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 
r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 
r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5490, {low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ 
+mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ 
+.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} +{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 
r5704, r4572, r5688, r5702; +} +shl.b32 r6523, r6513, 3; +and.b32 r6524, r6523, 248; +add.s32 r6525, r6517, r6524; +barrier.sync 0; +and.b32 r6526, r6515, 57344; +add.s32 r6527, r6525, r6526; +st.shared.u32 [r6527], r4383; +st.shared.u32 [r6527+4], r4386; +st.shared.u32 [r6527+256], r4585; +st.shared.u32 [r6527+260], r4594; +st.shared.u32 [r6527+512], r4622; +st.shared.u32 [r6527+516], r4631; +st.shared.u32 [r6527+768], r4659; +st.shared.u32 [r6527+772], r4668; +st.shared.u32 [r6527+1024], r4696; +st.shared.u32 [r6527+1028], r4705; +st.shared.u32 [r6527+1280], r4733; +st.shared.u32 [r6527+1284], r4742; +st.shared.u32 [r6527+1536], r4770; +st.shared.u32 [r6527+1540], r4779; +st.shared.u32 [r6527+1792], r4807; +st.shared.u32 [r6527+1796], r4816; +st.shared.u32 [r6527+2048], r4844; +st.shared.u32 [r6527+2052], r4853; +st.shared.u32 [r6527+2304], r4881; +st.shared.u32 [r6527+2308], r4890; +st.shared.u32 [r6527+2560], r4918; +st.shared.u32 [r6527+2564], r4927; +st.shared.u32 [r6527+2816], r4955; +st.shared.u32 [r6527+2820], r4964; +st.shared.u32 [r6527+3072], r4992; +st.shared.u32 [r6527+3076], r5001; +st.shared.u32 [r6527+3328], r5029; +st.shared.u32 [r6527+3332], r5038; +st.shared.u32 [r6527+3584], r5066; +st.shared.u32 [r6527+3588], r5075; +st.shared.u32 [r6527+3840], r5103; +st.shared.u32 [r6527+3844], r5112; +st.shared.u32 [r6527+4096], r5140; +st.shared.u32 [r6527+4100], r5149; +st.shared.u32 [r6527+4352], r5177; +st.shared.u32 [r6527+4356], r5186; +st.shared.u32 [r6527+4608], r5214; +st.shared.u32 [r6527+4612], r5223; +st.shared.u32 [r6527+4864], r5251; +st.shared.u32 [r6527+4868], r5260; +st.shared.u32 [r6527+5120], r5288; +st.shared.u32 [r6527+5124], r5297; +st.shared.u32 [r6527+5376], r5325; +st.shared.u32 [r6527+5380], r5334; +st.shared.u32 [r6527+5632], r5362; +st.shared.u32 [r6527+5636], r5371; +st.shared.u32 [r6527+5888], r5399; +st.shared.u32 [r6527+5892], r5408; +st.shared.u32 [r6527+6144], r5436; +st.shared.u32 [r6527+6148], r5445; +st.shared.u32 
[r6527+6400], r5473; +st.shared.u32 [r6527+6404], r5482; +st.shared.u32 [r6527+6656], r5510; +st.shared.u32 [r6527+6660], r5519; +st.shared.u32 [r6527+6912], r5547; +st.shared.u32 [r6527+6916], r5556; +st.shared.u32 [r6527+7168], r5584; +st.shared.u32 [r6527+7172], r5593; +st.shared.u32 [r6527+7424], r5621; +st.shared.u32 [r6527+7428], r5630; +st.shared.u32 [r6527+7680], r5658; +st.shared.u32 [r6527+7684], r5667; +st.shared.u32 [r6527+7936], r5695; +st.shared.u32 [r6527+7940], r5704; +barrier.sync 0; +mad.lo.s32 r6528, r6529, -248, r6527; +ld.shared.u32 r5726, [r6528]; +ld.shared.u32 r5729, [r6528+4]; +ld.shared.u32 r5922, [r6528+2048]; +ld.shared.u32 r5925, [r6528+2052]; +ld.shared.u32 r6118, [r6528+4096]; +ld.shared.u32 r6121, [r6528+4100]; +ld.shared.u32 r6314, [r6528+6144]; +ld.shared.u32 r6317, [r6528+6148]; +ld.shared.u32 r5776, [r6528+8192]; +ld.shared.u32 r5779, [r6528+8196]; +ld.shared.u32 r5972, [r6528+10240]; +ld.shared.u32 r5975, [r6528+10244]; +ld.shared.u32 r6168, [r6528+12288]; +ld.shared.u32 r6171, [r6528+12292]; +ld.shared.u32 r6364, [r6528+14336]; +ld.shared.u32 r6367, [r6528+14340]; +ld.shared.u32 r5738, [r6528+16384]; +ld.shared.u32 r5741, [r6528+16388]; +ld.shared.u32 r5934, [r6528+18432]; +ld.shared.u32 r5937, [r6528+18436]; +ld.shared.u32 r6130, [r6528+20480]; +ld.shared.u32 r6133, [r6528+20484]; +ld.shared.u32 r6326, [r6528+22528]; +ld.shared.u32 r6329, [r6528+22532]; +ld.shared.u32 r5788, [r6528+24576]; +ld.shared.u32 r5791, [r6528+24580]; +ld.shared.u32 r5984, [r6528+26624]; +ld.shared.u32 r5987, [r6528+26628]; +ld.shared.u32 r6180, [r6528+28672]; +ld.shared.u32 r6183, [r6528+28676]; +ld.shared.u32 r6376, [r6528+30720]; +ld.shared.u32 r6379, [r6528+30724]; +ld.shared.u32 r5727, [r6528+32768]; +ld.shared.u32 r5730, [r6528+32772]; +ld.shared.u32 r5923, [r6528+34816]; +ld.shared.u32 r5926, [r6528+34820]; +ld.shared.u32 r6119, [r6528+36864]; +ld.shared.u32 r6122, [r6528+36868]; +ld.shared.u32 r6315, [r6528+38912]; +ld.shared.u32 r6318, 
[r6528+38916]; +ld.shared.u32 r5777, [r6528+40960]; +ld.shared.u32 r5780, [r6528+40964]; +ld.shared.u32 r5973, [r6528+43008]; +ld.shared.u32 r5976, [r6528+43012]; +ld.shared.u32 r6169, [r6528+45056]; +ld.shared.u32 r6172, [r6528+45060]; +ld.shared.u32 r6365, [r6528+47104]; +ld.shared.u32 r6368, [r6528+47108]; +ld.shared.u32 r5739, [r6528+49152]; +ld.shared.u32 r5742, [r6528+49156]; +ld.shared.u32 r5935, [r6528+51200]; +ld.shared.u32 r5938, [r6528+51204]; +ld.shared.u32 r6131, [r6528+53248]; +ld.shared.u32 r6134, [r6528+53252]; +ld.shared.u32 r6327, [r6528+55296]; +ld.shared.u32 r6330, [r6528+55300]; +ld.shared.u32 r5789, [r6528+57344]; +ld.shared.u32 r5792, [r6528+57348]; +ld.shared.u32 r5985, [r6528+59392]; +ld.shared.u32 r5988, [r6528+59396]; +ld.shared.u32 r6181, [r6528+61440]; +ld.shared.u32 r6184, [r6528+61444]; +ld.shared.u32 r6377, [r6528+63488]; +ld.shared.u32 r6380, [r6528+63492]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} +{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5749; +} +{ +add.f16x2 r5766, r5734, r5743; +} +{ +sub.f16x2 r5769, r5731, r5749; +} +{ +sub.f16x2 r5772, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ 
+sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5799; +} +{ +add.f16x2 r5816, r5784, r5793; +} +{ +sub.f16x2 r5819, r5781, r5799; +} +{ +sub.f16x2 r5822, r5784, r5793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5810; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 %0, r5751, r5801; +} +{ +add.f16x2 %1, r5754, r5804; +} +{ +sub.f16x2 %32, r5751, r5801; +} +{ +sub.f16x2 %33, r5754, r5804; +} +{ +add.f16x2 %8, r5763, r5845; +} +{ +add.f16x2 %9, r5766, r5851; +} +{ +sub.f16x2 %40, r5763, r5845; +} +{ +sub.f16x2 %41, r5766, r5851; +} +{ +add.f16x2 %16, r5757, r5855; +} +{ +add.f16x2 %17, r5760, r5807; +} +{ +sub.f16x2 %48, r5757, r5855; +} +{ +sub.f16x2 %49, r5760, r5807; +} +{ +add.f16x2 %24, r5769, r5863; +} +{ +add.f16x2 %25, r5772, r5869; +} +{ +sub.f16x2 %56, r5769, r5863; +} +{ +sub.f16x2 %57, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5942; +} +{ +add.f16x2 r5947, 
r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5945; +} +{ +add.f16x2 r5962, r5930, r5939; +} +{ +sub.f16x2 r5965, r5927, r5945; +} +{ +sub.f16x2 r5968, r5930, r5939; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5992; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5995; +} +{ +add.f16x2 r6012, r5980, r5989; +} +{ +sub.f16x2 r6015, r5977, r5995; +} +{ +sub.f16x2 r6018, r5980, r5989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6006; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 %2, r5947, r5997; +} +{ +add.f16x2 %3, r5950, r6000; +} +{ +sub.f16x2 %34, r5947, r5997; +} +{ +sub.f16x2 %35, r5950, r6000; +} +{ +add.f16x2 %10, r5959, r6041; +} +{ +add.f16x2 %11, 
r5962, r6047; +} +{ +sub.f16x2 %42, r5959, r6041; +} +{ +sub.f16x2 %43, r5962, r6047; +} +{ +add.f16x2 %18, r5953, r6051; +} +{ +add.f16x2 %19, r5956, r6003; +} +{ +sub.f16x2 %50, r5953, r6051; +} +{ +sub.f16x2 %51, r5956, r6003; +} +{ +add.f16x2 %26, r5965, r6059; +} +{ +add.f16x2 %27, r5968, r6065; +} +{ +sub.f16x2 %58, r5965, r6059; +} +{ +sub.f16x2 %59, r5968, r6065; +} +{ +add.f16x2 r6117, r6118, r6119; +} +{ +add.f16x2 r6120, r6121, r6122; +} +{ +sub.f16x2 r6123, r6118, r6119; +} +{ +sub.f16x2 r6126, r6121, r6122; +} +{ +add.f16x2 r6129, r6130, r6131; +} +{ +add.f16x2 r6132, r6133, r6134; +} +{ +sub.f16x2 r6135, r6130, r6131; +} +{ +sub.f16x2 r6138, r6133, r6134; +} +{ +neg.f16x2 r6141, r6138; +} +{ +add.f16x2 r6143, r6117, r6129; +} +{ +add.f16x2 r6146, r6120, r6132; +} +{ +sub.f16x2 r6149, r6117, r6129; +} +{ +sub.f16x2 r6152, r6120, r6132; +} +{ +add.f16x2 r6155, r6123, r6141; +} +{ +add.f16x2 r6158, r6126, r6135; +} +{ +sub.f16x2 r6161, r6123, r6141; +} +{ +sub.f16x2 r6164, r6126, r6135; +} +{ +add.f16x2 r6167, r6168, r6169; +} +{ +add.f16x2 r6170, r6171, r6172; +} +{ +sub.f16x2 r6173, r6168, r6169; +} +{ +sub.f16x2 r6176, r6171, r6172; +} +{ +add.f16x2 r6179, r6180, r6181; +} +{ +add.f16x2 r6182, r6183, r6184; +} +{ +sub.f16x2 r6185, r6180, r6181; +} +{ +sub.f16x2 r6188, r6183, r6184; +} +{ +neg.f16x2 r6191, r6188; +} +{ +add.f16x2 r6193, r6167, r6179; +} +{ +add.f16x2 r6196, r6170, r6182; +} +{ +sub.f16x2 r6199, r6167, r6179; +} +{ +sub.f16x2 r6202, r6170, r6182; +} +{ +add.f16x2 r6205, r6173, r6191; +} +{ +add.f16x2 r6208, r6176, r6185; +} +{ +sub.f16x2 r6211, r6173, r6191; +} +{ +sub.f16x2 r6214, r6176, r6185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6218, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6221, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6222, {low, high}; +} +{ +mul.f16x2 r6231, r6205, r6217; +} +{ +mul.f16x2 r6234, r6208, r6218; +} +{ +sub.f16x2 r6237, r6231, r6234; +} +{ +mul.f16x2 r6240, r6205, r6218; +} +{ +fma.rn.f16x2 r6243, r6208, r6217, r6240; +} +{ +neg.f16x2 r6247, r6202; +} +{ +mul.f16x2 r6249, r6211, r6221; +} +{ +mul.f16x2 r6252, r6214, r6222; +} +{ +sub.f16x2 r6255, r6249, r6252; +} +{ +mul.f16x2 r6258, r6211, r6222; +} +{ +fma.rn.f16x2 r6261, r6214, r6221, r6258; +} +{ +add.f16x2 %4, r6143, r6193; +} +{ +add.f16x2 %5, r6146, r6196; +} +{ +sub.f16x2 %36, r6143, r6193; +} +{ +sub.f16x2 %37, r6146, r6196; +} +{ +add.f16x2 %12, r6155, r6237; +} +{ +add.f16x2 %13, r6158, r6243; +} +{ +sub.f16x2 %44, r6155, r6237; +} +{ +sub.f16x2 %45, r6158, r6243; +} +{ +add.f16x2 %20, r6149, r6247; +} +{ +add.f16x2 %21, r6152, r6199; +} +{ +sub.f16x2 %52, r6149, r6247; +} +{ +sub.f16x2 %53, r6152, r6199; +} +{ +add.f16x2 %28, r6161, r6255; +} +{ +add.f16x2 %29, r6164, r6261; +} +{ +sub.f16x2 %60, r6161, r6255; +} +{ +sub.f16x2 %61, r6164, r6261; +} +{ +add.f16x2 r6313, r6314, r6315; +} +{ +add.f16x2 r6316, r6317, r6318; +} +{ +sub.f16x2 r6319, r6314, r6315; +} +{ +sub.f16x2 r6322, r6317, r6318; +} +{ +add.f16x2 r6325, r6326, r6327; +} +{ +add.f16x2 r6328, r6329, r6330; +} +{ +sub.f16x2 r6331, r6326, r6327; +} +{ +sub.f16x2 r6334, r6329, r6330; +} +{ +neg.f16x2 r6337, r6334; +} +{ +add.f16x2 r6339, r6313, r6325; +} +{ +add.f16x2 r6342, r6316, r6328; +} +{ +sub.f16x2 r6345, r6313, r6325; +} +{ +sub.f16x2 r6348, r6316, r6328; +} +{ +add.f16x2 r6351, r6319, r6337; +} +{ +add.f16x2 r6354, r6322, r6331; +} +{ +sub.f16x2 r6357, r6319, r6337; +} +{ +sub.f16x2 r6360, r6322, r6331; +} +{ +add.f16x2 r6363, r6364, r6365; +} +{ +add.f16x2 r6366, r6367, r6368; +} +{ +sub.f16x2 r6369, r6364, r6365; +} +{ +sub.f16x2 r6372, r6367, r6368; +} +{ +add.f16x2 r6375, r6376, r6377; +} +{ +add.f16x2 r6378, r6379, r6380; +} +{ +sub.f16x2 r6381, 
r6376, r6377; +} +{ +sub.f16x2 r6384, r6379, r6380; +} +{ +neg.f16x2 r6387, r6384; +} +{ +add.f16x2 r6389, r6363, r6375; +} +{ +add.f16x2 r6392, r6366, r6378; +} +{ +sub.f16x2 r6395, r6363, r6375; +} +{ +sub.f16x2 r6398, r6366, r6378; +} +{ +add.f16x2 r6401, r6369, r6387; +} +{ +add.f16x2 r6404, r6372, r6381; +} +{ +sub.f16x2 r6407, r6369, r6387; +} +{ +sub.f16x2 r6410, r6372, r6381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6413, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6418, {low, high}; +} +{ +mul.f16x2 r6427, r6401, r6413; +} +{ +mul.f16x2 r6430, r6404, r6414; +} +{ +sub.f16x2 r6433, r6427, r6430; +} +{ +mul.f16x2 r6436, r6401, r6414; +} +{ +fma.rn.f16x2 r6439, r6404, r6413, r6436; +} +{ +neg.f16x2 r6443, r6398; +} +{ +mul.f16x2 r6445, r6407, r6417; +} +{ +mul.f16x2 r6448, r6410, r6418; +} +{ +sub.f16x2 r6451, r6445, r6448; +} +{ +mul.f16x2 r6454, r6407, r6418; +} +{ +fma.rn.f16x2 r6457, r6410, r6417, r6454; +} +{ +add.f16x2 %6, r6339, r6389; +} +{ +add.f16x2 %7, r6342, r6392; +} +{ +sub.f16x2 %38, r6339, r6389; +} +{ +sub.f16x2 %39, r6342, r6392; +} +{ +add.f16x2 %14, r6351, r6433; +} +{ +add.f16x2 %15, r6354, r6439; +} +{ +sub.f16x2 %46, r6351, r6433; +} +{ +sub.f16x2 %47, r6354, r6439; +} +{ +add.f16x2 %22, r6345, r6443; +} +{ +add.f16x2 %23, r6348, r6395; +} +{ +sub.f16x2 %54, r6345, r6443; +} +{ +sub.f16x2 %55, r6348, r6395; +} +{ +add.f16x2 %30, r6357, r6451; +} +{ +add.f16x2 %31, r6360, r6457; +} +{ +sub.f16x2 %62, r6357, r6451; +} +{ +sub.f16x2 %63, r6360, r6457; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), 
"=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), 
"r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1059, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<965>; +.reg .b32 r<6594>; +.reg .b64 rd<3>; +mov.u32 r6509, %tid.y; +shl.b32 r6510, r6509, 15; +mov.u32 r6511, %64; +add.s32 r6512, r6511, r6510; +mov.u32 r6513, %tid.x; +{ +add.f16x2 r1, %119, %111; +} +{ +add.f16x2 r4, %91, %81; +} +{ +sub.f16x2 r7, %119, %111; +} +{ +sub.f16x2 r10, %91, %81; +} +{ +add.f16x2 r13, %73, %128; +} +{ +add.f16x2 r16, %109, %101; +} +{ +sub.f16x2 r19, %73, %128; +} +{ +sub.f16x2 r22, %109, %101; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %105, %96; +} +{ +add.f16x2 r54, %77, %67; +} +{ +sub.f16x2 r57, %105, %96; +} +{ +sub.f16x2 r60, %77, %67; +} +{ +add.f16x2 r63, %122, %115; +} +{ +add.f16x2 r66, %93, %85; +} +{ +sub.f16x2 r69, %122, %115; +} +{ +sub.f16x2 r72, %93, %85; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f940, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r102, {low, high}; +} +mov.f32 f938, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r106, {low, high}; +} +{ +mul.f16x2 r115, 
r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %94, %87; +} +{ +add.f16x2 r200, %66, %123; +} +{ +sub.f16x2 r203, %94, %87; +} +{ +sub.f16x2 r206, %66, %123; +} +{ +add.f16x2 r209, %113, %103; +} +{ +add.f16x2 r212, %83, %75; +} +{ +sub.f16x2 r215, %113, %103; +} +{ +sub.f16x2 r218, %83, %75; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %78, %72; +} +{ +add.f16x2 r250, %117, %107; +} +{ +sub.f16x2 r253, %78, %72; +} +{ +sub.f16x2 r256, %117, %107; +} +{ +add.f16x2 r259, %97, %89; +} +{ +add.f16x2 r262, %69, %125; +} +{ +sub.f16x2 r265, %97, %89; +} +{ +sub.f16x2 r268, %69, %125; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ 
+sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f694, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r393, {low, high}; +} +mov.f32 f710, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r398, {low, high}; +} +mov.f32 f692, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r404, {low, high}; +} +mov.f32 f708, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, 
r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +{ +add.f16x2 r617, %68, %124; +} +{ +add.f16x2 r620, %104, %95; +} +{ +sub.f16x2 r623, %68, %124; +} +{ +sub.f16x2 r626, %104, %95; +} +{ +add.f16x2 r629, %86, %76; +} +{ +add.f16x2 r632, %121, %114; +} +{ +sub.f16x2 r635, %86, %76; +} +{ +sub.f16x2 r638, %121, %114; +} +{ +neg.f16x2 r641, r638; +} +{ +add.f16x2 r643, r617, r629; +} +{ +add.f16x2 r646, r620, r632; +} +{ +sub.f16x2 r649, r617, r629; +} +{ +sub.f16x2 r652, r620, r632; +} +{ +add.f16x2 r655, r623, r641; +} +{ +add.f16x2 r658, r626, r635; +} +{ +sub.f16x2 r661, r623, r641; +} +{ +sub.f16x2 r664, r626, r635; +} +{ +add.f16x2 r667, %118, %110; +} +{ +add.f16x2 r670, %90, %80; +} +{ +sub.f16x2 r673, %118, %110; +} +{ +sub.f16x2 r676, %90, %80; +} +{ +add.f16x2 r679, %70, %127; +} +{ +add.f16x2 r682, %106, %99; +} +{ +sub.f16x2 r685, %70, %127; +} +{ 
+sub.f16x2 r688, %106, %99; +} +{ +neg.f16x2 r691, r688; +} +{ +add.f16x2 r693, r667, r679; +} +{ +add.f16x2 r696, r670, r682; +} +{ +sub.f16x2 r699, r667, r679; +} +{ +sub.f16x2 r702, r670, r682; +} +{ +add.f16x2 r705, r673, r691; +} +{ +add.f16x2 r708, r676, r685; +} +{ +sub.f16x2 r711, r673, r691; +} +{ +sub.f16x2 r714, r676, r685; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r717, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r718, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r721, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r722, {low, high}; +} +{ +mul.f16x2 r731, r705, r717; +} +{ +mul.f16x2 r734, r708, r718; +} +{ +sub.f16x2 r737, r731, r734; +} +{ +mul.f16x2 r740, r705, r718; +} +{ +fma.rn.f16x2 r743, r708, r717, r740; +} +{ +neg.f16x2 r747, r702; +} +{ +mul.f16x2 r749, r711, r721; +} +{ +mul.f16x2 r752, r714, r722; +} +{ +sub.f16x2 r755, r749, r752; +} +{ +mul.f16x2 r758, r711, r722; +} +{ +fma.rn.f16x2 r761, r714, r721, r758; +} +{ +add.f16x2 r765, r643, r693; +} +{ +add.f16x2 r768, r646, r696; +} +{ +sub.f16x2 r771, r643, r693; +} +{ +sub.f16x2 r774, r646, r696; +} +{ +add.f16x2 r777, r655, r737; +} +{ +add.f16x2 r780, r658, r743; +} +{ +sub.f16x2 r783, r655, r737; +} +{ +sub.f16x2 r786, r658, r743; +} +{ +add.f16x2 r789, r649, r747; +} +{ +add.f16x2 r792, r652, r699; +} +{ +sub.f16x2 r795, r649, r747; +} +{ +sub.f16x2 r798, r652, r699; +} +{ +add.f16x2 r801, r661, r755; +} +{ +add.f16x2 r804, r664, r761; +} +{ +sub.f16x2 r807, r661, r755; +} +{ +sub.f16x2 r810, r664, r761; +} +{ +add.f16x2 r813, %108, %100; +} +{ +add.f16x2 r816, %79, %71; +} +{ +sub.f16x2 r819, %108, %100; +} +{ +sub.f16x2 r822, %79, %71; +} +{ +add.f16x2 r825, %126, %116; +} +{ +add.f16x2 r828, %98, %88; +} +{ +sub.f16x2 r831, %126, %116; +} +{ +sub.f16x2 r834, 
%98, %88; +} +{ +neg.f16x2 r837, r834; +} +{ +add.f16x2 r839, r813, r825; +} +{ +add.f16x2 r842, r816, r828; +} +{ +sub.f16x2 r845, r813, r825; +} +{ +sub.f16x2 r848, r816, r828; +} +{ +add.f16x2 r851, r819, r837; +} +{ +add.f16x2 r854, r822, r831; +} +{ +sub.f16x2 r857, r819, r837; +} +{ +sub.f16x2 r860, r822, r831; +} +{ +add.f16x2 r863, %92, %84; +} +{ +add.f16x2 r866, %65, %120; +} +{ +sub.f16x2 r869, %92, %84; +} +{ +sub.f16x2 r872, %65, %120; +} +{ +add.f16x2 r875, %112, %102; +} +{ +add.f16x2 r878, %82, %74; +} +{ +sub.f16x2 r881, %112, %102; +} +{ +sub.f16x2 r884, %82, %74; +} +{ +neg.f16x2 r887, r884; +} +{ +add.f16x2 r889, r863, r875; +} +{ +add.f16x2 r892, r866, r878; +} +{ +sub.f16x2 r895, r863, r875; +} +{ +sub.f16x2 r898, r866, r878; +} +{ +add.f16x2 r901, r869, r887; +} +{ +add.f16x2 r904, r872, r881; +} +{ +sub.f16x2 r907, r869, r887; +} +{ +sub.f16x2 r910, r872, r881; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r913, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r914, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r917, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r918, {low, high}; +} +{ +mul.f16x2 r927, r901, r913; +} +{ +mul.f16x2 r930, r904, r914; +} +{ +sub.f16x2 r933, r927, r930; +} +{ +mul.f16x2 r936, r901, r914; +} +{ +fma.rn.f16x2 r939, r904, r913, r936; +} +{ +neg.f16x2 r943, r898; +} +{ +mul.f16x2 r945, r907, r917; +} +{ +mul.f16x2 r948, r910, r918; +} +{ +sub.f16x2 r951, r945, r948; +} +{ +mul.f16x2 r954, r907, r918; +} +{ +fma.rn.f16x2 r957, r910, r917, r954; +} +{ +add.f16x2 r961, r839, r889; +} +{ +add.f16x2 r964, r842, r892; +} +{ +sub.f16x2 r967, r839, r889; +} +{ +sub.f16x2 r970, r842, r892; +} +{ +add.f16x2 r973, r851, r933; +} +{ +add.f16x2 r976, r854, r939; +} +{ +sub.f16x2 r979, r851, r933; +} +{ 
+sub.f16x2 r982, r854, r939; +} +{ +add.f16x2 r985, r845, r943; +} +{ +add.f16x2 r988, r848, r895; +} +{ +sub.f16x2 r991, r845, r943; +} +{ +sub.f16x2 r994, r848, r895; +} +{ +add.f16x2 r997, r857, r951; +} +{ +add.f16x2 r1000, r860, r957; +} +{ +sub.f16x2 r1003, r857, r951; +} +{ +sub.f16x2 r1006, r860, r957; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1009, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1010, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1011, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1012, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1013, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1014, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r1019, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1020, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1039, r973, r1009; +} +{ +mul.f16x2 r1042, r976, r1010; +} +{ +sub.f16x2 r1045, r1039, r1042; +} +{ +mul.f16x2 r1048, r973, r1010; +} +{ +fma.rn.f16x2 r1051, r976, r1009, r1048; +} +{ +mul.f16x2 r1055, r985, r1011; +} +{ +mul.f16x2 r1058, r988, r1012; +} +{ +sub.f16x2 r1061, r1055, r1058; +} +{ +mul.f16x2 r1064, r985, 
r1012; +} +{ +fma.rn.f16x2 r1067, r988, r1011, r1064; +} +{ +mul.f16x2 r1071, r997, r1013; +} +{ +mul.f16x2 r1074, r1000, r1014; +} +{ +sub.f16x2 r1077, r1071, r1074; +} +{ +mul.f16x2 r1080, r997, r1014; +} +{ +fma.rn.f16x2 r1083, r1000, r1013, r1080; +} +{ +neg.f16x2 r1087, r970; +} +{ +mul.f16x2 r1089, r979, r1017; +} +{ +mul.f16x2 r1092, r982, r1018; +} +{ +sub.f16x2 r1095, r1089, r1092; +} +{ +mul.f16x2 r1098, r979, r1018; +} +{ +fma.rn.f16x2 r1101, r982, r1017, r1098; +} +{ +mul.f16x2 r1105, r991, r1019; +} +{ +mul.f16x2 r1108, r994, r1020; +} +{ +sub.f16x2 r1111, r1105, r1108; +} +{ +mul.f16x2 r1114, r991, r1020; +} +{ +fma.rn.f16x2 r1117, r994, r1019, r1114; +} +{ +mul.f16x2 r1121, r1003, r1021; +} +{ +mul.f16x2 r1124, r1006, r1022; +} +{ +sub.f16x2 r1127, r1121, r1124; +} +{ +mul.f16x2 r1130, r1003, r1022; +} +{ +fma.rn.f16x2 r1133, r1006, r1021, r1130; +} +{ +add.f16x2 r1137, r765, r961; +} +{ +add.f16x2 r1140, r768, r964; +} +{ +sub.f16x2 r1143, r765, r961; +} +{ +sub.f16x2 r1146, r768, r964; +} +{ +add.f16x2 r1149, r777, r1045; +} +{ +add.f16x2 r1152, r780, r1051; +} +{ +sub.f16x2 r1155, r777, r1045; +} +{ +sub.f16x2 r1158, r780, r1051; +} +{ +add.f16x2 r1161, r789, r1061; +} +{ +add.f16x2 r1164, r792, r1067; +} +{ +sub.f16x2 r1167, r789, r1061; +} +{ +sub.f16x2 r1170, r792, r1067; +} +{ +add.f16x2 r1173, r801, r1077; +} +{ +add.f16x2 r1176, r804, r1083; +} +{ +sub.f16x2 r1179, r801, r1077; +} +{ +sub.f16x2 r1182, r804, r1083; +} +{ +add.f16x2 r1185, r771, r1087; +} +{ +add.f16x2 r1188, r774, r967; +} +{ +sub.f16x2 r1191, r771, r1087; +} +{ +sub.f16x2 r1194, r774, r967; +} +{ +add.f16x2 r1197, r783, r1095; +} +{ +add.f16x2 r1200, r786, r1101; +} +{ +sub.f16x2 r1203, r783, r1095; +} +{ +sub.f16x2 r1206, r786, r1101; +} +{ +add.f16x2 r1209, r795, r1111; +} +{ +add.f16x2 r1212, r798, r1117; +} +{ +sub.f16x2 r1215, r795, r1111; +} +{ +sub.f16x2 r1218, r798, r1117; +} +{ +add.f16x2 r1221, r807, r1127; +} +{ +add.f16x2 r1224, r810, r1133; +} +{ +sub.f16x2 
r1227, r807, r1127; +} +{ +sub.f16x2 r1230, r810, r1133; +} +mov.f32 f690, 0f3F7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1233, {low, high}; +} +mov.f32 f714, 0f3E47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1235, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1236, {low, high}; +} +mov.f32 f698, 0f3F54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1237, {low, high}; +} +mov.f32 f706, 0f3F0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1238, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1239, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1240, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1241, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1243, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1244, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1245, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1246, {low, high}; +} +mov.f32 f688, 0fBE47C5C2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r1249, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r1250, {low, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r1251, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r1252, {low, high}; +} +mov.f32 f696, 0fBF0E39DA; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r1253, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r1254, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r1255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r1256, {low, high}; +} +mov.f32 f704, 0fBF54DB31; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r1257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r1258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r1259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r1260, {low, high}; +} +mov.f32 f712, 0fBF7B14BE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r1261, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r1262, {low, high}; +} +{ +mul.f16x2 r1295, r1149, r1233; +} +{ +mul.f16x2 r1298, r1152, r1234; +} +{ +sub.f16x2 r1301, r1295, r1298; +} +{ +mul.f16x2 r1304, r1149, r1234; +} +{ +fma.rn.f16x2 r1307, r1152, r1233, r1304; +} +{ +mul.f16x2 r1311, r1161, r1235; +} +{ +mul.f16x2 r1314, r1164, r1236; +} +{ +sub.f16x2 r1317, r1311, r1314; +} +{ +mul.f16x2 r1320, r1161, r1236; +} +{ +fma.rn.f16x2 r1323, r1164, r1235, r1320; +} +{ +mul.f16x2 r1327, r1173, r1237; +} +{ +mul.f16x2 r1330, r1176, r1238; +} +{ +sub.f16x2 r1333, r1327, r1330; +} +{ +mul.f16x2 r1336, r1173, r1238; +} +{ +fma.rn.f16x2 r1339, r1176, 
r1237, r1336; +} +{ +mul.f16x2 r1343, r1185, r1239; +} +{ +mul.f16x2 r1346, r1188, r1240; +} +{ +sub.f16x2 r1349, r1343, r1346; +} +{ +mul.f16x2 r1352, r1185, r1240; +} +{ +fma.rn.f16x2 r1355, r1188, r1239, r1352; +} +{ +mul.f16x2 r1359, r1197, r1241; +} +{ +mul.f16x2 r1362, r1200, r1242; +} +{ +sub.f16x2 r1365, r1359, r1362; +} +{ +mul.f16x2 r1368, r1197, r1242; +} +{ +fma.rn.f16x2 r1371, r1200, r1241, r1368; +} +{ +mul.f16x2 r1375, r1209, r1243; +} +{ +mul.f16x2 r1378, r1212, r1244; +} +{ +sub.f16x2 r1381, r1375, r1378; +} +{ +mul.f16x2 r1384, r1209, r1244; +} +{ +fma.rn.f16x2 r1387, r1212, r1243, r1384; +} +{ +mul.f16x2 r1391, r1221, r1245; +} +{ +mul.f16x2 r1394, r1224, r1246; +} +{ +sub.f16x2 r1397, r1391, r1394; +} +{ +mul.f16x2 r1400, r1221, r1246; +} +{ +fma.rn.f16x2 r1403, r1224, r1245, r1400; +} +{ +neg.f16x2 r1407, r1146; +} +{ +mul.f16x2 r1409, r1155, r1249; +} +{ +mul.f16x2 r1412, r1158, r1250; +} +{ +sub.f16x2 r1415, r1409, r1412; +} +{ +mul.f16x2 r1418, r1155, r1250; +} +{ +fma.rn.f16x2 r1421, r1158, r1249, r1418; +} +{ +mul.f16x2 r1425, r1167, r1251; +} +{ +mul.f16x2 r1428, r1170, r1252; +} +{ +sub.f16x2 r1431, r1425, r1428; +} +{ +mul.f16x2 r1434, r1167, r1252; +} +{ +fma.rn.f16x2 r1437, r1170, r1251, r1434; +} +{ +mul.f16x2 r1441, r1179, r1253; +} +{ +mul.f16x2 r1444, r1182, r1254; +} +{ +sub.f16x2 r1447, r1441, r1444; +} +{ +mul.f16x2 r1450, r1179, r1254; +} +{ +fma.rn.f16x2 r1453, r1182, r1253, r1450; +} +{ +mul.f16x2 r1457, r1191, r1255; +} +{ +mul.f16x2 r1460, r1194, r1256; +} +{ +sub.f16x2 r1463, r1457, r1460; +} +{ +mul.f16x2 r1466, r1191, r1256; +} +{ +fma.rn.f16x2 r1469, r1194, r1255, r1466; +} +{ +mul.f16x2 r1473, r1203, r1257; +} +{ +mul.f16x2 r1476, r1206, r1258; +} +{ +sub.f16x2 r1479, r1473, r1476; +} +{ +mul.f16x2 r1482, r1203, r1258; +} +{ +fma.rn.f16x2 r1485, r1206, r1257, r1482; +} +{ +mul.f16x2 r1489, r1215, r1259; +} +{ +mul.f16x2 r1492, r1218, r1260; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1215, r1260; +} 
+{ +fma.rn.f16x2 r1501, r1218, r1259, r1498; +} +{ +mul.f16x2 r1505, r1227, r1261; +} +{ +mul.f16x2 r1508, r1230, r1262; +} +{ +sub.f16x2 r1511, r1505, r1508; +} +{ +mul.f16x2 r1514, r1227, r1262; +} +{ +fma.rn.f16x2 r1517, r1230, r1261, r1514; +} +{ +add.f16x2 r1521, r521, r1137; +} +{ +add.f16x2 r1524, r524, r1140; +} +{ +sub.f16x2 r1527, r521, r1137; +} +{ +sub.f16x2 r1530, r524, r1140; +} +{ +add.f16x2 r1533, r533, r1301; +} +{ +add.f16x2 r1536, r536, r1307; +} +{ +sub.f16x2 r1539, r533, r1301; +} +{ +sub.f16x2 r1542, r536, r1307; +} +{ +add.f16x2 r1545, r545, r1317; +} +{ +add.f16x2 r1548, r548, r1323; +} +{ +sub.f16x2 r1551, r545, r1317; +} +{ +sub.f16x2 r1554, r548, r1323; +} +{ +add.f16x2 r1557, r557, r1333; +} +{ +add.f16x2 r1560, r560, r1339; +} +{ +sub.f16x2 r1563, r557, r1333; +} +{ +sub.f16x2 r1566, r560, r1339; +} +{ +add.f16x2 r1569, r569, r1349; +} +{ +add.f16x2 r1572, r572, r1355; +} +{ +sub.f16x2 r1575, r569, r1349; +} +{ +sub.f16x2 r1578, r572, r1355; +} +{ +add.f16x2 r1581, r581, r1365; +} +{ +add.f16x2 r1584, r584, r1371; +} +{ +sub.f16x2 r1587, r581, r1365; +} +{ +sub.f16x2 r1590, r584, r1371; +} +{ +add.f16x2 r1593, r593, r1381; +} +{ +add.f16x2 r1596, r596, r1387; +} +{ +sub.f16x2 r1599, r593, r1381; +} +{ +sub.f16x2 r1602, r596, r1387; +} +{ +add.f16x2 r1605, r605, r1397; +} +{ +add.f16x2 r1608, r608, r1403; +} +{ +sub.f16x2 r1611, r605, r1397; +} +{ +sub.f16x2 r1614, r608, r1403; +} +{ +add.f16x2 r1617, r527, r1407; +} +{ +add.f16x2 r1620, r530, r1143; +} +{ +sub.f16x2 r1623, r527, r1407; +} +{ +sub.f16x2 r1626, r530, r1143; +} +{ +add.f16x2 r1629, r539, r1415; +} +{ +add.f16x2 r1632, r542, r1421; +} +{ +sub.f16x2 r1635, r539, r1415; +} +{ +sub.f16x2 r1638, r542, r1421; +} +{ +add.f16x2 r1641, r551, r1431; +} +{ +add.f16x2 r1644, r554, r1437; +} +{ +sub.f16x2 r1647, r551, r1431; +} +{ +sub.f16x2 r1650, r554, r1437; +} +{ +add.f16x2 r1653, r563, r1447; +} +{ +add.f16x2 r1656, r566, r1453; +} +{ +sub.f16x2 r1659, r563, r1447; +} +{ 
+sub.f16x2 r1662, r566, r1453; +} +{ +add.f16x2 r1665, r575, r1463; +} +{ +add.f16x2 r1668, r578, r1469; +} +{ +sub.f16x2 r1671, r575, r1463; +} +{ +sub.f16x2 r1674, r578, r1469; +} +{ +add.f16x2 r1677, r587, r1479; +} +{ +add.f16x2 r1680, r590, r1485; +} +{ +sub.f16x2 r1683, r587, r1479; +} +{ +sub.f16x2 r1686, r590, r1485; +} +{ +add.f16x2 r1689, r599, r1495; +} +{ +add.f16x2 r1692, r602, r1501; +} +{ +sub.f16x2 r1695, r599, r1495; +} +{ +sub.f16x2 r1698, r602, r1501; +} +{ +add.f16x2 r1701, r611, r1511; +} +{ +add.f16x2 r1704, r614, r1517; +} +{ +sub.f16x2 r1707, r611, r1511; +} +{ +sub.f16x2 r1710, r614, r1517; +} +shl.b32 r6515, r6513, 7; +and.b32 r6516, r6515, -32768; +add.s32 r6517, r6512, r6516; +and.b32 r6529, r6513, 255; +cvt.rn.f32.u32 f957, r6529; +mul.f32 f958, f957, 0f3A490FDB; +cos.approx.f32 f357, f958; +sin.approx.f32 f959, f958; +neg.f32 f358, f959; +mov.f32 f964, 0f3F800000; +mov.f32 f963, 0fBF800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f357; +cvt.rn.f16.f32 high, f358; +mov.b32 r1713, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1716, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1718, {high, high}; +} +{ +mul.f16x2 r1720, r1536, r1718; +} +{ +fma.rn.f16x2 r1723, r1533, r1716, r1720; +} +{ +mul.f16x2 r1727, r1533, r1718; +} +{ +neg.f16x2 r1730, r1727; +} +{ +fma.rn.f16x2 r1732, r1536, r1716, r1730; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1736, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1738, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1740, {low, high}; +} +{ +mul.f16x2 r1741, r1738, r1740; +} +{ +mul.f16x2 r1744, r1713, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1747, {high, low}; +} +{ +fma.rn.f16x2 r1749, r1741, r1747, r1744; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1753, {low, low}; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1755, {high, high}; +} +{ +mul.f16x2 r1757, r1548, r1755; +} +{ +fma.rn.f16x2 r1760, r1545, r1753, r1757; +} +{ +mul.f16x2 r1764, r1545, r1755; +} +{ +neg.f16x2 r1767, r1764; +} +{ +fma.rn.f16x2 r1769, r1548, r1753, r1767; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1773, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1775, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1777, {low, high}; +} +{ +mul.f16x2 r1778, r1775, r1777; +} +{ +mul.f16x2 r1781, r1749, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1749; +mov.b32 r1784, {high, low}; +} +{ +fma.rn.f16x2 r1786, r1778, r1784, r1781; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1790, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1792, {high, high}; +} +{ +mul.f16x2 r1794, r1560, r1792; +} +{ +fma.rn.f16x2 r1797, r1557, r1790, r1794; +} +{ +mul.f16x2 r1801, r1557, r1792; +} +{ +neg.f16x2 r1804, r1801; +} +{ +fma.rn.f16x2 r1806, r1560, r1790, r1804; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1810, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1812, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1814, {low, high}; +} +{ +mul.f16x2 r1815, r1812, r1814; +} +{ +mul.f16x2 r1818, r1786, r1810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1786; +mov.b32 r1821, {high, low}; +} +{ +fma.rn.f16x2 r1823, r1815, r1821, r1818; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1827, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1829, {high, high}; +} +{ +mul.f16x2 r1831, r1572, r1829; +} +{ +fma.rn.f16x2 r1834, r1569, r1827, r1831; +} +{ +mul.f16x2 r1838, r1569, r1829; +} +{ +neg.f16x2 r1841, r1838; +} +{ +fma.rn.f16x2 
r1843, r1572, r1827, r1841; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1847, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1849, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1851, {low, high}; +} +{ +mul.f16x2 r1852, r1849, r1851; +} +{ +mul.f16x2 r1855, r1823, r1847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1823; +mov.b32 r1858, {high, low}; +} +{ +fma.rn.f16x2 r1860, r1852, r1858, r1855; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1864, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1866, {high, high}; +} +{ +mul.f16x2 r1868, r1584, r1866; +} +{ +fma.rn.f16x2 r1871, r1581, r1864, r1868; +} +{ +mul.f16x2 r1875, r1581, r1866; +} +{ +neg.f16x2 r1878, r1875; +} +{ +fma.rn.f16x2 r1880, r1584, r1864, r1878; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1884, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1886, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1888, {low, high}; +} +{ +mul.f16x2 r1889, r1886, r1888; +} +{ +mul.f16x2 r1892, r1860, r1884; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1860; +mov.b32 r1895, {high, low}; +} +{ +fma.rn.f16x2 r1897, r1889, r1895, r1892; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1901, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1903, {high, high}; +} +{ +mul.f16x2 r1905, r1596, r1903; +} +{ +fma.rn.f16x2 r1908, r1593, r1901, r1905; +} +{ +mul.f16x2 r1912, r1593, r1903; +} +{ +neg.f16x2 r1915, r1912; +} +{ +fma.rn.f16x2 r1917, r1596, r1901, r1915; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1921, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1923, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; 
+cvt.rn.f16.f32 high, f964; +mov.b32 r1925, {low, high}; +} +{ +mul.f16x2 r1926, r1923, r1925; +} +{ +mul.f16x2 r1929, r1897, r1921; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1897; +mov.b32 r1932, {high, low}; +} +{ +fma.rn.f16x2 r1934, r1926, r1932, r1929; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1938, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1940, {high, high}; +} +{ +mul.f16x2 r1942, r1608, r1940; +} +{ +fma.rn.f16x2 r1945, r1605, r1938, r1942; +} +{ +mul.f16x2 r1949, r1605, r1940; +} +{ +neg.f16x2 r1952, r1949; +} +{ +fma.rn.f16x2 r1954, r1608, r1938, r1952; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1958, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1960, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1962, {low, high}; +} +{ +mul.f16x2 r1963, r1960, r1962; +} +{ +mul.f16x2 r1966, r1934, r1958; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1934; +mov.b32 r1969, {high, low}; +} +{ +fma.rn.f16x2 r1971, r1963, r1969, r1966; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1975, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r1977, {high, high}; +} +{ +mul.f16x2 r1979, r1620, r1977; +} +{ +fma.rn.f16x2 r1982, r1617, r1975, r1979; +} +{ +mul.f16x2 r1986, r1617, r1977; +} +{ +neg.f16x2 r1989, r1986; +} +{ +fma.rn.f16x2 r1991, r1620, r1975, r1989; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1995, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r1997, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r1999, {low, high}; +} +{ +mul.f16x2 r2000, r1997, r1999; +} +{ +mul.f16x2 r2003, r1971, r1995; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1971; +mov.b32 r2006, {high, low}; +} +{ +fma.rn.f16x2 r2008, r2000, r2006, 
r2003; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2012, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2014, {high, high}; +} +{ +mul.f16x2 r2016, r1632, r2014; +} +{ +fma.rn.f16x2 r2019, r1629, r2012, r2016; +} +{ +mul.f16x2 r2023, r1629, r2014; +} +{ +neg.f16x2 r2026, r2023; +} +{ +fma.rn.f16x2 r2028, r1632, r2012, r2026; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2032, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2034, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2036, {low, high}; +} +{ +mul.f16x2 r2037, r2034, r2036; +} +{ +mul.f16x2 r2040, r2008, r2032; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2008; +mov.b32 r2043, {high, low}; +} +{ +fma.rn.f16x2 r2045, r2037, r2043, r2040; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2049, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2051, {high, high}; +} +{ +mul.f16x2 r2053, r1644, r2051; +} +{ +fma.rn.f16x2 r2056, r1641, r2049, r2053; +} +{ +mul.f16x2 r2060, r1641, r2051; +} +{ +neg.f16x2 r2063, r2060; +} +{ +fma.rn.f16x2 r2065, r1644, r2049, r2063; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2069, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2071, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2073, {low, high}; +} +{ +mul.f16x2 r2074, r2071, r2073; +} +{ +mul.f16x2 r2077, r2045, r2069; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2045; +mov.b32 r2080, {high, low}; +} +{ +fma.rn.f16x2 r2082, r2074, r2080, r2077; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2086, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2088, {high, high}; +} +{ +mul.f16x2 r2090, r1656, r2088; +} +{ +fma.rn.f16x2 r2093, r1653, r2086, 
r2090; +} +{ +mul.f16x2 r2097, r1653, r2088; +} +{ +neg.f16x2 r2100, r2097; +} +{ +fma.rn.f16x2 r2102, r1656, r2086, r2100; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2106, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2108, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2110, {low, high}; +} +{ +mul.f16x2 r2111, r2108, r2110; +} +{ +mul.f16x2 r2114, r2082, r2106; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2082; +mov.b32 r2117, {high, low}; +} +{ +fma.rn.f16x2 r2119, r2111, r2117, r2114; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2123, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2125, {high, high}; +} +{ +mul.f16x2 r2127, r1668, r2125; +} +{ +fma.rn.f16x2 r2130, r1665, r2123, r2127; +} +{ +mul.f16x2 r2134, r1665, r2125; +} +{ +neg.f16x2 r2137, r2134; +} +{ +fma.rn.f16x2 r2139, r1668, r2123, r2137; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2143, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2145, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2147, {low, high}; +} +{ +mul.f16x2 r2148, r2145, r2147; +} +{ +mul.f16x2 r2151, r2119, r2143; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2119; +mov.b32 r2154, {high, low}; +} +{ +fma.rn.f16x2 r2156, r2148, r2154, r2151; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2160, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2162, {high, high}; +} +{ +mul.f16x2 r2164, r1680, r2162; +} +{ +fma.rn.f16x2 r2167, r1677, r2160, r2164; +} +{ +mul.f16x2 r2171, r1677, r2162; +} +{ +neg.f16x2 r2174, r2171; +} +{ +fma.rn.f16x2 r2176, r1680, r2160, r2174; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2180, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2182, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2184, {low, high}; +} +{ +mul.f16x2 r2185, r2182, r2184; +} +{ +mul.f16x2 r2188, r2156, r2180; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2156; +mov.b32 r2191, {high, low}; +} +{ +fma.rn.f16x2 r2193, r2185, r2191, r2188; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2197, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2199, {high, high}; +} +{ +mul.f16x2 r2201, r1692, r2199; +} +{ +fma.rn.f16x2 r2204, r1689, r2197, r2201; +} +{ +mul.f16x2 r2208, r1689, r2199; +} +{ +neg.f16x2 r2211, r2208; +} +{ +fma.rn.f16x2 r2213, r1692, r2197, r2211; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2217, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2219, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2221, {low, high}; +} +{ +mul.f16x2 r2222, r2219, r2221; +} +{ +mul.f16x2 r2225, r2193, r2217; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2193; +mov.b32 r2228, {high, low}; +} +{ +fma.rn.f16x2 r2230, r2222, r2228, r2225; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2234, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2230; +mov.b32 r2236, {high, high}; +} +{ +mul.f16x2 r2238, r1704, r2236; +} +{ +fma.rn.f16x2 r2241, r1701, r2234, r2238; +} +{ +mul.f16x2 r2245, r1701, r2236; +} +{ +neg.f16x2 r2248, r2245; +} +{ +fma.rn.f16x2 r2250, r1704, r2234, r2248; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2254, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2256, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2258, {low, high}; +} +{ +mul.f16x2 r2259, r2256, r2258; +} +{ +mul.f16x2 r2262, r2230, r2254; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2230; +mov.b32 r2265, {high, low}; +} +{ +fma.rn.f16x2 r2267, r2259, r2265, r2262; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2271, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2273, {high, high}; +} +{ +mul.f16x2 r2275, r1530, r2273; +} +{ +fma.rn.f16x2 r2278, r1527, r2271, r2275; +} +{ +mul.f16x2 r2282, r1527, r2273; +} +{ +neg.f16x2 r2285, r2282; +} +{ +fma.rn.f16x2 r2287, r1530, r2271, r2285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2291, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2293, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2295, {low, high}; +} +{ +mul.f16x2 r2296, r2293, r2295; +} +{ +mul.f16x2 r2299, r2267, r2291; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2267; +mov.b32 r2302, {high, low}; +} +{ +fma.rn.f16x2 r2304, r2296, r2302, r2299; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2308, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2310, {high, high}; +} +{ +mul.f16x2 r2312, r1542, r2310; +} +{ +fma.rn.f16x2 r2315, r1539, r2308, r2312; +} +{ +mul.f16x2 r2319, r1539, r2310; +} +{ +neg.f16x2 r2322, r2319; +} +{ +fma.rn.f16x2 r2324, r1542, r2308, r2322; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2328, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2330, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2332, {low, high}; +} +{ +mul.f16x2 r2333, r2330, r2332; +} +{ +mul.f16x2 r2336, r2304, r2328; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2304; +mov.b32 r2339, {high, low}; +} +{ +fma.rn.f16x2 r2341, r2333, r2339, r2336; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2345, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2347, {high, 
high}; +} +{ +mul.f16x2 r2349, r1554, r2347; +} +{ +fma.rn.f16x2 r2352, r1551, r2345, r2349; +} +{ +mul.f16x2 r2356, r1551, r2347; +} +{ +neg.f16x2 r2359, r2356; +} +{ +fma.rn.f16x2 r2361, r1554, r2345, r2359; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2365, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2367, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2369, {low, high}; +} +{ +mul.f16x2 r2370, r2367, r2369; +} +{ +mul.f16x2 r2373, r2341, r2365; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2341; +mov.b32 r2376, {high, low}; +} +{ +fma.rn.f16x2 r2378, r2370, r2376, r2373; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2382, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2384, {high, high}; +} +{ +mul.f16x2 r2386, r1566, r2384; +} +{ +fma.rn.f16x2 r2389, r1563, r2382, r2386; +} +{ +mul.f16x2 r2393, r1563, r2384; +} +{ +neg.f16x2 r2396, r2393; +} +{ +fma.rn.f16x2 r2398, r1566, r2382, r2396; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2402, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2404, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2406, {low, high}; +} +{ +mul.f16x2 r2407, r2404, r2406; +} +{ +mul.f16x2 r2410, r2378, r2402; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2378; +mov.b32 r2413, {high, low}; +} +{ +fma.rn.f16x2 r2415, r2407, r2413, r2410; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2419, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2421, {high, high}; +} +{ +mul.f16x2 r2423, r1578, r2421; +} +{ +fma.rn.f16x2 r2426, r1575, r2419, r2423; +} +{ +mul.f16x2 r2430, r1575, r2421; +} +{ +neg.f16x2 r2433, r2430; +} +{ +fma.rn.f16x2 r2435, r1578, r2419, r2433; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r1713; +mov.b32 r2439, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2441, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2443, {low, high}; +} +{ +mul.f16x2 r2444, r2441, r2443; +} +{ +mul.f16x2 r2447, r2415, r2439; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2415; +mov.b32 r2450, {high, low}; +} +{ +fma.rn.f16x2 r2452, r2444, r2450, r2447; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2456, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2458, {high, high}; +} +{ +mul.f16x2 r2460, r1590, r2458; +} +{ +fma.rn.f16x2 r2463, r1587, r2456, r2460; +} +{ +mul.f16x2 r2467, r1587, r2458; +} +{ +neg.f16x2 r2470, r2467; +} +{ +fma.rn.f16x2 r2472, r1590, r2456, r2470; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2476, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2478, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2480, {low, high}; +} +{ +mul.f16x2 r2481, r2478, r2480; +} +{ +mul.f16x2 r2484, r2452, r2476; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2452; +mov.b32 r2487, {high, low}; +} +{ +fma.rn.f16x2 r2489, r2481, r2487, r2484; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2493, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2495, {high, high}; +} +{ +mul.f16x2 r2497, r1602, r2495; +} +{ +fma.rn.f16x2 r2500, r1599, r2493, r2497; +} +{ +mul.f16x2 r2504, r1599, r2495; +} +{ +neg.f16x2 r2507, r2504; +} +{ +fma.rn.f16x2 r2509, r1602, r2493, r2507; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2513, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2515, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2517, {low, high}; +} +{ +mul.f16x2 r2518, 
r2515, r2517; +} +{ +mul.f16x2 r2521, r2489, r2513; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2489; +mov.b32 r2524, {high, low}; +} +{ +fma.rn.f16x2 r2526, r2518, r2524, r2521; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2530, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2532, {high, high}; +} +{ +mul.f16x2 r2534, r1614, r2532; +} +{ +fma.rn.f16x2 r2537, r1611, r2530, r2534; +} +{ +mul.f16x2 r2541, r1611, r2532; +} +{ +neg.f16x2 r2544, r2541; +} +{ +fma.rn.f16x2 r2546, r1614, r2530, r2544; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2550, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2552, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2554, {low, high}; +} +{ +mul.f16x2 r2555, r2552, r2554; +} +{ +mul.f16x2 r2558, r2526, r2550; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2526; +mov.b32 r2561, {high, low}; +} +{ +fma.rn.f16x2 r2563, r2555, r2561, r2558; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2567, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2569, {high, high}; +} +{ +mul.f16x2 r2571, r1626, r2569; +} +{ +fma.rn.f16x2 r2574, r1623, r2567, r2571; +} +{ +mul.f16x2 r2578, r1623, r2569; +} +{ +neg.f16x2 r2581, r2578; +} +{ +fma.rn.f16x2 r2583, r1626, r2567, r2581; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2587, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2589, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2591, {low, high}; +} +{ +mul.f16x2 r2592, r2589, r2591; +} +{ +mul.f16x2 r2595, r2563, r2587; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2563; +mov.b32 r2598, {high, low}; +} +{ +fma.rn.f16x2 r2600, r2592, r2598, r2595; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2604, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2606, {high, high}; +} +{ +mul.f16x2 r2608, r1638, r2606; +} +{ +fma.rn.f16x2 r2611, r1635, r2604, r2608; +} +{ +mul.f16x2 r2615, r1635, r2606; +} +{ +neg.f16x2 r2618, r2615; +} +{ +fma.rn.f16x2 r2620, r1638, r2604, r2618; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2624, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2626, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2628, {low, high}; +} +{ +mul.f16x2 r2629, r2626, r2628; +} +{ +mul.f16x2 r2632, r2600, r2624; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2600; +mov.b32 r2635, {high, low}; +} +{ +fma.rn.f16x2 r2637, r2629, r2635, r2632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2641, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2643, {high, high}; +} +{ +mul.f16x2 r2645, r1650, r2643; +} +{ +fma.rn.f16x2 r2648, r1647, r2641, r2645; +} +{ +mul.f16x2 r2652, r1647, r2643; +} +{ +neg.f16x2 r2655, r2652; +} +{ +fma.rn.f16x2 r2657, r1650, r2641, r2655; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2661, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2663, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2665, {low, high}; +} +{ +mul.f16x2 r2666, r2663, r2665; +} +{ +mul.f16x2 r2669, r2637, r2661; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2637; +mov.b32 r2672, {high, low}; +} +{ +fma.rn.f16x2 r2674, r2666, r2672, r2669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2680, {high, high}; +} +{ +mul.f16x2 r2682, r1662, r2680; +} +{ +fma.rn.f16x2 r2685, r1659, r2678, r2682; +} +{ +mul.f16x2 r2689, r1659, r2680; +} +{ +neg.f16x2 r2692, r2689; +} +{ 
+fma.rn.f16x2 r2694, r1662, r2678, r2692; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2698, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2700, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2702, {low, high}; +} +{ +mul.f16x2 r2703, r2700, r2702; +} +{ +mul.f16x2 r2706, r2674, r2698; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2674; +mov.b32 r2709, {high, low}; +} +{ +fma.rn.f16x2 r2711, r2703, r2709, r2706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2717, {high, high}; +} +{ +mul.f16x2 r2719, r1674, r2717; +} +{ +fma.rn.f16x2 r2722, r1671, r2715, r2719; +} +{ +mul.f16x2 r2726, r1671, r2717; +} +{ +neg.f16x2 r2729, r2726; +} +{ +fma.rn.f16x2 r2731, r1674, r2715, r2729; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2735, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2737, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2739, {low, high}; +} +{ +mul.f16x2 r2740, r2737, r2739; +} +{ +mul.f16x2 r2743, r2711, r2735; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2711; +mov.b32 r2746, {high, low}; +} +{ +fma.rn.f16x2 r2748, r2740, r2746, r2743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2754, {high, high}; +} +{ +mul.f16x2 r2756, r1686, r2754; +} +{ +fma.rn.f16x2 r2759, r1683, r2752, r2756; +} +{ +mul.f16x2 r2763, r1683, r2754; +} +{ +neg.f16x2 r2766, r2763; +} +{ +fma.rn.f16x2 r2768, r1686, r2752, r2766; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2772, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2774, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2776, {low, high}; +} +{ +mul.f16x2 r2777, r2774, r2776; +} +{ +mul.f16x2 r2780, r2748, r2772; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2748; +mov.b32 r2783, {high, low}; +} +{ +fma.rn.f16x2 r2785, r2777, r2783, r2780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2791, {high, high}; +} +{ +mul.f16x2 r2793, r1698, r2791; +} +{ +fma.rn.f16x2 r2796, r1695, r2789, r2793; +} +{ +mul.f16x2 r2800, r1695, r2791; +} +{ +neg.f16x2 r2803, r2800; +} +{ +fma.rn.f16x2 r2805, r1698, r2789, r2803; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2809, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1713; +mov.b32 r2811, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r2813, {low, high}; +} +{ +mul.f16x2 r2814, r2811, r2813; +} +{ +mul.f16x2 r2817, r2785, r2809; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2785; +mov.b32 r2820, {high, low}; +} +{ +fma.rn.f16x2 r2822, r2814, r2820, r2817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2822; +mov.b32 r2828, {high, high}; +} +{ +mul.f16x2 r2830, r1710, r2828; +} +{ +fma.rn.f16x2 r2833, r1707, r2826, r2830; +} +{ +mul.f16x2 r2837, r1707, r2828; +} +{ +neg.f16x2 r2840, r2837; +} +{ +fma.rn.f16x2 r2842, r1710, r2826, r2840; +} +barrier.sync 0; +and.b32 r6518, r6515, 32640; +add.s32 r6519, r6517, r6518; +st.shared.v4.f32 [r6519], {r1521, r1723, r1760, r1797}; +st.shared.v4.f32 [r6519+16], {r1834, r1871, r1908, r1945}; +st.shared.v4.f32 [r6519+32], {r1982, r2019, r2056, r2093}; +st.shared.v4.f32 [r6519+48], {r2130, r2167, r2204, r2241}; +st.shared.v4.f32 [r6519+64], {r2278, r2315, r2352, r2389}; +st.shared.v4.f32 [r6519+80], {r2426, r2463, r2500, r2537}; +st.shared.v4.f32 [r6519+96], {r2574, 
r2611, r2648, r2685}; +st.shared.v4.f32 [r6519+112], {r2722, r2759, r2796, r2833}; +barrier.sync 0; +mad.lo.s32 r6520, r6529, -124, r6519; +ld.shared.u32 r2864, [r6520]; +ld.shared.u32 r3480, [r6520+1024]; +ld.shared.u32 r3060, [r6520+2048]; +ld.shared.u32 r3676, [r6520+3072]; +ld.shared.u32 r2914, [r6520+4096]; +ld.shared.u32 r3530, [r6520+5120]; +ld.shared.u32 r3110, [r6520+6144]; +ld.shared.u32 r3726, [r6520+7168]; +ld.shared.u32 r2876, [r6520+8192]; +ld.shared.u32 r3492, [r6520+9216]; +ld.shared.u32 r3072, [r6520+10240]; +ld.shared.u32 r3688, [r6520+11264]; +ld.shared.u32 r2926, [r6520+12288]; +ld.shared.u32 r3542, [r6520+13312]; +ld.shared.u32 r3122, [r6520+14336]; +ld.shared.u32 r3738, [r6520+15360]; +ld.shared.u32 r2865, [r6520+16384]; +ld.shared.u32 r3481, [r6520+17408]; +ld.shared.u32 r3061, [r6520+18432]; +ld.shared.u32 r3677, [r6520+19456]; +ld.shared.u32 r2915, [r6520+20480]; +ld.shared.u32 r3531, [r6520+21504]; +ld.shared.u32 r3111, [r6520+22528]; +ld.shared.u32 r3727, [r6520+23552]; +ld.shared.u32 r2877, [r6520+24576]; +ld.shared.u32 r3493, [r6520+25600]; +ld.shared.u32 r3073, [r6520+26624]; +ld.shared.u32 r3689, [r6520+27648]; +ld.shared.u32 r2927, [r6520+28672]; +ld.shared.u32 r3543, [r6520+29696]; +ld.shared.u32 r3123, [r6520+30720]; +ld.shared.u32 r3739, [r6520+31744]; +barrier.sync 0; +st.shared.v4.f32 [r6519], {r1524, r1732, r1769, r1806}; +st.shared.v4.f32 [r6519+16], {r1843, r1880, r1917, r1954}; +st.shared.v4.f32 [r6519+32], {r1991, r2028, r2065, r2102}; +st.shared.v4.f32 [r6519+48], {r2139, r2176, r2213, r2250}; +st.shared.v4.f32 [r6519+64], {r2287, r2324, r2361, r2398}; +st.shared.v4.f32 [r6519+80], {r2435, r2472, r2509, r2546}; +st.shared.v4.f32 [r6519+96], {r2583, r2620, r2657, r2694}; +st.shared.v4.f32 [r6519+112], {r2731, r2768, r2805, r2842}; +barrier.sync 0; +ld.shared.u32 r2867, [r6520]; +ld.shared.u32 r3483, [r6520+1024]; +ld.shared.u32 r3063, [r6520+2048]; +ld.shared.u32 r3679, [r6520+3072]; +ld.shared.u32 r2917, [r6520+4096]; 
+ld.shared.u32 r3533, [r6520+5120]; +ld.shared.u32 r3113, [r6520+6144]; +ld.shared.u32 r3729, [r6520+7168]; +ld.shared.u32 r2879, [r6520+8192]; +ld.shared.u32 r3495, [r6520+9216]; +ld.shared.u32 r3075, [r6520+10240]; +ld.shared.u32 r3691, [r6520+11264]; +ld.shared.u32 r2929, [r6520+12288]; +ld.shared.u32 r3545, [r6520+13312]; +ld.shared.u32 r3125, [r6520+14336]; +ld.shared.u32 r3741, [r6520+15360]; +ld.shared.u32 r2868, [r6520+16384]; +ld.shared.u32 r3484, [r6520+17408]; +ld.shared.u32 r3064, [r6520+18432]; +ld.shared.u32 r3680, [r6520+19456]; +ld.shared.u32 r2918, [r6520+20480]; +ld.shared.u32 r3534, [r6520+21504]; +ld.shared.u32 r3114, [r6520+22528]; +ld.shared.u32 r3730, [r6520+23552]; +ld.shared.u32 r2880, [r6520+24576]; +ld.shared.u32 r3496, [r6520+25600]; +ld.shared.u32 r3076, [r6520+26624]; +ld.shared.u32 r3692, [r6520+27648]; +ld.shared.u32 r2930, [r6520+28672]; +ld.shared.u32 r3546, [r6520+29696]; +ld.shared.u32 r3126, [r6520+30720]; +ld.shared.u32 r3742, [r6520+31744]; +{ +add.f16x2 r2863, r2864, r2865; +} +{ +add.f16x2 r2866, r2867, r2868; +} +{ +sub.f16x2 r2869, r2864, r2865; +} +{ +sub.f16x2 r2872, r2867, r2868; +} +{ +add.f16x2 r2875, r2876, r2877; +} +{ +add.f16x2 r2878, r2879, r2880; +} +{ +sub.f16x2 r2881, r2876, r2877; +} +{ +sub.f16x2 r2884, r2879, r2880; +} +{ +neg.f16x2 r2887, r2884; +} +{ +add.f16x2 r2889, r2863, r2875; +} +{ +add.f16x2 r2892, r2866, r2878; +} +{ +sub.f16x2 r2895, r2863, r2875; +} +{ +sub.f16x2 r2898, r2866, r2878; +} +{ +add.f16x2 r2901, r2869, r2887; +} +{ +add.f16x2 r2904, r2872, r2881; +} +{ +sub.f16x2 r2907, r2869, r2887; +} +{ +sub.f16x2 r2910, r2872, r2881; +} +{ +add.f16x2 r2913, r2914, r2915; +} +{ +add.f16x2 r2916, r2917, r2918; +} +{ +sub.f16x2 r2919, r2914, r2915; +} +{ +sub.f16x2 r2922, r2917, r2918; +} +{ +add.f16x2 r2925, r2926, r2927; +} +{ +add.f16x2 r2928, r2929, r2930; +} +{ +sub.f16x2 r2931, r2926, r2927; +} +{ +sub.f16x2 r2934, r2929, r2930; +} +{ +neg.f16x2 r2937, r2934; +} +{ +add.f16x2 r2939, r2913, 
r2925; +} +{ +add.f16x2 r2942, r2916, r2928; +} +{ +sub.f16x2 r2945, r2913, r2925; +} +{ +sub.f16x2 r2948, r2916, r2928; +} +{ +add.f16x2 r2951, r2919, r2937; +} +{ +add.f16x2 r2954, r2922, r2931; +} +{ +sub.f16x2 r2957, r2919, r2937; +} +{ +sub.f16x2 r2960, r2922, r2931; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2963, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2964, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r2967, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r2968, {low, high}; +} +{ +mul.f16x2 r2977, r2951, r2963; +} +{ +mul.f16x2 r2980, r2954, r2964; +} +{ +sub.f16x2 r2983, r2977, r2980; +} +{ +mul.f16x2 r2986, r2951, r2964; +} +{ +fma.rn.f16x2 r2989, r2954, r2963, r2986; +} +{ +neg.f16x2 r2993, r2948; +} +{ +mul.f16x2 r2995, r2957, r2967; +} +{ +mul.f16x2 r2998, r2960, r2968; +} +{ +sub.f16x2 r3001, r2995, r2998; +} +{ +mul.f16x2 r3004, r2957, r2968; +} +{ +fma.rn.f16x2 r3007, r2960, r2967, r3004; +} +{ +add.f16x2 r3011, r2889, r2939; +} +{ +add.f16x2 r3014, r2892, r2942; +} +{ +sub.f16x2 r3017, r2889, r2939; +} +{ +sub.f16x2 r3020, r2892, r2942; +} +{ +add.f16x2 r3023, r2901, r2983; +} +{ +add.f16x2 r3026, r2904, r2989; +} +{ +sub.f16x2 r3029, r2901, r2983; +} +{ +sub.f16x2 r3032, r2904, r2989; +} +{ +add.f16x2 r3035, r2895, r2993; +} +{ +add.f16x2 r3038, r2898, r2945; +} +{ +sub.f16x2 r3041, r2895, r2993; +} +{ +sub.f16x2 r3044, r2898, r2945; +} +{ +add.f16x2 r3047, r2907, r3001; +} +{ +add.f16x2 r3050, r2910, r3007; +} +{ +sub.f16x2 r3053, r2907, r3001; +} +{ +sub.f16x2 r3056, r2910, r3007; +} +{ +add.f16x2 r3059, r3060, r3061; +} +{ +add.f16x2 r3062, r3063, r3064; +} +{ +sub.f16x2 r3065, r3060, r3061; +} +{ +sub.f16x2 r3068, r3063, r3064; +} +{ +add.f16x2 r3071, r3072, r3073; +} +{ +add.f16x2 r3074, r3075, r3076; +} +{ 
+sub.f16x2 r3077, r3072, r3073; +} +{ +sub.f16x2 r3080, r3075, r3076; +} +{ +neg.f16x2 r3083, r3080; +} +{ +add.f16x2 r3085, r3059, r3071; +} +{ +add.f16x2 r3088, r3062, r3074; +} +{ +sub.f16x2 r3091, r3059, r3071; +} +{ +sub.f16x2 r3094, r3062, r3074; +} +{ +add.f16x2 r3097, r3065, r3083; +} +{ +add.f16x2 r3100, r3068, r3077; +} +{ +sub.f16x2 r3103, r3065, r3083; +} +{ +sub.f16x2 r3106, r3068, r3077; +} +{ +add.f16x2 r3109, r3110, r3111; +} +{ +add.f16x2 r3112, r3113, r3114; +} +{ +sub.f16x2 r3115, r3110, r3111; +} +{ +sub.f16x2 r3118, r3113, r3114; +} +{ +add.f16x2 r3121, r3122, r3123; +} +{ +add.f16x2 r3124, r3125, r3126; +} +{ +sub.f16x2 r3127, r3122, r3123; +} +{ +sub.f16x2 r3130, r3125, r3126; +} +{ +neg.f16x2 r3133, r3130; +} +{ +add.f16x2 r3135, r3109, r3121; +} +{ +add.f16x2 r3138, r3112, r3124; +} +{ +sub.f16x2 r3141, r3109, r3121; +} +{ +sub.f16x2 r3144, r3112, r3124; +} +{ +add.f16x2 r3147, r3115, r3133; +} +{ +add.f16x2 r3150, r3118, r3127; +} +{ +sub.f16x2 r3153, r3115, r3133; +} +{ +sub.f16x2 r3156, r3118, r3127; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3159, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3163, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3164, {low, high}; +} +{ +mul.f16x2 r3173, r3147, r3159; +} +{ +mul.f16x2 r3176, r3150, r3160; +} +{ +sub.f16x2 r3179, r3173, r3176; +} +{ +mul.f16x2 r3182, r3147, r3160; +} +{ +fma.rn.f16x2 r3185, r3150, r3159, r3182; +} +{ +neg.f16x2 r3189, r3144; +} +{ +mul.f16x2 r3191, r3153, r3163; +} +{ +mul.f16x2 r3194, r3156, r3164; +} +{ +sub.f16x2 r3197, r3191, r3194; +} +{ +mul.f16x2 r3200, r3153, r3164; +} +{ +fma.rn.f16x2 r3203, r3156, r3163, r3200; +} +{ +add.f16x2 r3207, r3085, r3135; +} +{ +add.f16x2 r3210, r3088, r3138; 
+} +{ +sub.f16x2 r3213, r3085, r3135; +} +{ +sub.f16x2 r3216, r3088, r3138; +} +{ +add.f16x2 r3219, r3097, r3179; +} +{ +add.f16x2 r3222, r3100, r3185; +} +{ +sub.f16x2 r3225, r3097, r3179; +} +{ +sub.f16x2 r3228, r3100, r3185; +} +{ +add.f16x2 r3231, r3091, r3189; +} +{ +add.f16x2 r3234, r3094, r3141; +} +{ +sub.f16x2 r3237, r3091, r3189; +} +{ +sub.f16x2 r3240, r3094, r3141; +} +{ +add.f16x2 r3243, r3103, r3197; +} +{ +add.f16x2 r3246, r3106, r3203; +} +{ +sub.f16x2 r3249, r3103, r3197; +} +{ +sub.f16x2 r3252, r3106, r3203; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3255, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3256, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3257, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3258, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3259, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3260, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3263, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3264, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3265, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3267, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3285, r3219, r3255; +} +{ +mul.f16x2 r3288, r3222, r3256; +} +{ +sub.f16x2 r3291, r3285, r3288; +} +{ 
+mul.f16x2 r3294, r3219, r3256; +} +{ +fma.rn.f16x2 r3297, r3222, r3255, r3294; +} +{ +mul.f16x2 r3301, r3231, r3257; +} +{ +mul.f16x2 r3304, r3234, r3258; +} +{ +sub.f16x2 r3307, r3301, r3304; +} +{ +mul.f16x2 r3310, r3231, r3258; +} +{ +fma.rn.f16x2 r3313, r3234, r3257, r3310; +} +{ +mul.f16x2 r3317, r3243, r3259; +} +{ +mul.f16x2 r3320, r3246, r3260; +} +{ +sub.f16x2 r3323, r3317, r3320; +} +{ +mul.f16x2 r3326, r3243, r3260; +} +{ +fma.rn.f16x2 r3329, r3246, r3259, r3326; +} +{ +neg.f16x2 r3333, r3216; +} +{ +mul.f16x2 r3335, r3225, r3263; +} +{ +mul.f16x2 r3338, r3228, r3264; +} +{ +sub.f16x2 r3341, r3335, r3338; +} +{ +mul.f16x2 r3344, r3225, r3264; +} +{ +fma.rn.f16x2 r3347, r3228, r3263, r3344; +} +{ +mul.f16x2 r3351, r3237, r3265; +} +{ +mul.f16x2 r3354, r3240, r3266; +} +{ +sub.f16x2 r3357, r3351, r3354; +} +{ +mul.f16x2 r3360, r3237, r3266; +} +{ +fma.rn.f16x2 r3363, r3240, r3265, r3360; +} +{ +mul.f16x2 r3367, r3249, r3267; +} +{ +mul.f16x2 r3370, r3252, r3268; +} +{ +sub.f16x2 r3373, r3367, r3370; +} +{ +mul.f16x2 r3376, r3249, r3268; +} +{ +fma.rn.f16x2 r3379, r3252, r3267, r3376; +} +{ +add.f16x2 r3383, r3011, r3207; +} +{ +add.f16x2 r3386, r3014, r3210; +} +{ +sub.f16x2 r3389, r3011, r3207; +} +{ +sub.f16x2 r3392, r3014, r3210; +} +{ +add.f16x2 r3395, r3023, r3291; +} +{ +add.f16x2 r3398, r3026, r3297; +} +{ +sub.f16x2 r3401, r3023, r3291; +} +{ +sub.f16x2 r3404, r3026, r3297; +} +{ +add.f16x2 r3407, r3035, r3307; +} +{ +add.f16x2 r3410, r3038, r3313; +} +{ +sub.f16x2 r3413, r3035, r3307; +} +{ +sub.f16x2 r3416, r3038, r3313; +} +{ +add.f16x2 r3419, r3047, r3323; +} +{ +add.f16x2 r3422, r3050, r3329; +} +{ +sub.f16x2 r3425, r3047, r3323; +} +{ +sub.f16x2 r3428, r3050, r3329; +} +{ +add.f16x2 r3431, r3017, r3333; +} +{ +add.f16x2 r3434, r3020, r3213; +} +{ +sub.f16x2 r3437, r3017, r3333; +} +{ +sub.f16x2 r3440, r3020, r3213; +} +{ +add.f16x2 r3443, r3029, r3341; +} +{ +add.f16x2 r3446, r3032, r3347; +} +{ +sub.f16x2 r3449, r3029, r3341; +} +{ 
+sub.f16x2 r3452, r3032, r3347; +} +{ +add.f16x2 r3455, r3041, r3357; +} +{ +add.f16x2 r3458, r3044, r3363; +} +{ +sub.f16x2 r3461, r3041, r3357; +} +{ +sub.f16x2 r3464, r3044, r3363; +} +{ +add.f16x2 r3467, r3053, r3373; +} +{ +add.f16x2 r3470, r3056, r3379; +} +{ +sub.f16x2 r3473, r3053, r3373; +} +{ +sub.f16x2 r3476, r3056, r3379; +} +{ +add.f16x2 r3479, r3480, r3481; +} +{ +add.f16x2 r3482, r3483, r3484; +} +{ +sub.f16x2 r3485, r3480, r3481; +} +{ +sub.f16x2 r3488, r3483, r3484; +} +{ +add.f16x2 r3491, r3492, r3493; +} +{ +add.f16x2 r3494, r3495, r3496; +} +{ +sub.f16x2 r3497, r3492, r3493; +} +{ +sub.f16x2 r3500, r3495, r3496; +} +{ +neg.f16x2 r3503, r3500; +} +{ +add.f16x2 r3505, r3479, r3491; +} +{ +add.f16x2 r3508, r3482, r3494; +} +{ +sub.f16x2 r3511, r3479, r3491; +} +{ +sub.f16x2 r3514, r3482, r3494; +} +{ +add.f16x2 r3517, r3485, r3503; +} +{ +add.f16x2 r3520, r3488, r3497; +} +{ +sub.f16x2 r3523, r3485, r3503; +} +{ +sub.f16x2 r3526, r3488, r3497; +} +{ +add.f16x2 r3529, r3530, r3531; +} +{ +add.f16x2 r3532, r3533, r3534; +} +{ +sub.f16x2 r3535, r3530, r3531; +} +{ +sub.f16x2 r3538, r3533, r3534; +} +{ +add.f16x2 r3541, r3542, r3543; +} +{ +add.f16x2 r3544, r3545, r3546; +} +{ +sub.f16x2 r3547, r3542, r3543; +} +{ +sub.f16x2 r3550, r3545, r3546; +} +{ +neg.f16x2 r3553, r3550; +} +{ +add.f16x2 r3555, r3529, r3541; +} +{ +add.f16x2 r3558, r3532, r3544; +} +{ +sub.f16x2 r3561, r3529, r3541; +} +{ +sub.f16x2 r3564, r3532, r3544; +} +{ +add.f16x2 r3567, r3535, r3553; +} +{ +add.f16x2 r3570, r3538, r3547; +} +{ +sub.f16x2 r3573, r3535, r3553; +} +{ +sub.f16x2 r3576, r3538, r3547; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3580, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3583, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3584, {low, high}; +} +{ +mul.f16x2 r3593, r3567, r3579; +} +{ +mul.f16x2 r3596, r3570, r3580; +} +{ +sub.f16x2 r3599, r3593, r3596; +} +{ +mul.f16x2 r3602, r3567, r3580; +} +{ +fma.rn.f16x2 r3605, r3570, r3579, r3602; +} +{ +neg.f16x2 r3609, r3564; +} +{ +mul.f16x2 r3611, r3573, r3583; +} +{ +mul.f16x2 r3614, r3576, r3584; +} +{ +sub.f16x2 r3617, r3611, r3614; +} +{ +mul.f16x2 r3620, r3573, r3584; +} +{ +fma.rn.f16x2 r3623, r3576, r3583, r3620; +} +{ +add.f16x2 r3627, r3505, r3555; +} +{ +add.f16x2 r3630, r3508, r3558; +} +{ +sub.f16x2 r3633, r3505, r3555; +} +{ +sub.f16x2 r3636, r3508, r3558; +} +{ +add.f16x2 r3639, r3517, r3599; +} +{ +add.f16x2 r3642, r3520, r3605; +} +{ +sub.f16x2 r3645, r3517, r3599; +} +{ +sub.f16x2 r3648, r3520, r3605; +} +{ +add.f16x2 r3651, r3511, r3609; +} +{ +add.f16x2 r3654, r3514, r3561; +} +{ +sub.f16x2 r3657, r3511, r3609; +} +{ +sub.f16x2 r3660, r3514, r3561; +} +{ +add.f16x2 r3663, r3523, r3617; +} +{ +add.f16x2 r3666, r3526, r3623; +} +{ +sub.f16x2 r3669, r3523, r3617; +} +{ +sub.f16x2 r3672, r3526, r3623; +} +{ +add.f16x2 r3675, r3676, r3677; +} +{ +add.f16x2 r3678, r3679, r3680; +} +{ +sub.f16x2 r3681, r3676, r3677; +} +{ +sub.f16x2 r3684, r3679, r3680; +} +{ +add.f16x2 r3687, r3688, r3689; +} +{ +add.f16x2 r3690, r3691, r3692; +} +{ +sub.f16x2 r3693, r3688, r3689; +} +{ +sub.f16x2 r3696, r3691, r3692; +} +{ +neg.f16x2 r3699, r3696; +} +{ +add.f16x2 r3701, r3675, r3687; +} +{ +add.f16x2 r3704, r3678, r3690; +} +{ +sub.f16x2 r3707, r3675, r3687; +} +{ +sub.f16x2 r3710, r3678, r3690; +} +{ +add.f16x2 r3713, r3681, r3699; +} +{ +add.f16x2 r3716, r3684, r3693; +} +{ +sub.f16x2 r3719, r3681, r3699; +} +{ +sub.f16x2 r3722, r3684, r3693; +} +{ +add.f16x2 r3725, r3726, r3727; +} +{ +add.f16x2 r3728, r3729, r3730; +} +{ +sub.f16x2 r3731, r3726, r3727; +} +{ +sub.f16x2 r3734, r3729, r3730; +} +{ +add.f16x2 r3737, r3738, r3739; +} +{ +add.f16x2 r3740, r3741, r3742; +} +{ +sub.f16x2 r3743, r3738, 
r3739; +} +{ +sub.f16x2 r3746, r3741, r3742; +} +{ +neg.f16x2 r3749, r3746; +} +{ +add.f16x2 r3751, r3725, r3737; +} +{ +add.f16x2 r3754, r3728, r3740; +} +{ +sub.f16x2 r3757, r3725, r3737; +} +{ +sub.f16x2 r3760, r3728, r3740; +} +{ +add.f16x2 r3763, r3731, r3749; +} +{ +add.f16x2 r3766, r3734, r3743; +} +{ +sub.f16x2 r3769, r3731, r3749; +} +{ +sub.f16x2 r3772, r3734, r3743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3775, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3776, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3779, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3780, {low, high}; +} +{ +mul.f16x2 r3789, r3763, r3775; +} +{ +mul.f16x2 r3792, r3766, r3776; +} +{ +sub.f16x2 r3795, r3789, r3792; +} +{ +mul.f16x2 r3798, r3763, r3776; +} +{ +fma.rn.f16x2 r3801, r3766, r3775, r3798; +} +{ +neg.f16x2 r3805, r3760; +} +{ +mul.f16x2 r3807, r3769, r3779; +} +{ +mul.f16x2 r3810, r3772, r3780; +} +{ +sub.f16x2 r3813, r3807, r3810; +} +{ +mul.f16x2 r3816, r3769, r3780; +} +{ +fma.rn.f16x2 r3819, r3772, r3779, r3816; +} +{ +add.f16x2 r3823, r3701, r3751; +} +{ +add.f16x2 r3826, r3704, r3754; +} +{ +sub.f16x2 r3829, r3701, r3751; +} +{ +sub.f16x2 r3832, r3704, r3754; +} +{ +add.f16x2 r3835, r3713, r3795; +} +{ +add.f16x2 r3838, r3716, r3801; +} +{ +sub.f16x2 r3841, r3713, r3795; +} +{ +sub.f16x2 r3844, r3716, r3801; +} +{ +add.f16x2 r3847, r3707, r3805; +} +{ +add.f16x2 r3850, r3710, r3757; +} +{ +sub.f16x2 r3853, r3707, r3805; +} +{ +sub.f16x2 r3856, r3710, r3757; +} +{ +add.f16x2 r3859, r3719, r3813; +} +{ +add.f16x2 r3862, r3722, r3819; +} +{ +sub.f16x2 r3865, r3719, r3813; +} +{ +sub.f16x2 r3868, r3722, r3819; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3871, {low, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3872, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r3879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r3880, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r3881, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r3882, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r3883, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r3884, {low, high}; +} +{ +mul.f16x2 r3901, r3835, r3871; +} +{ +mul.f16x2 r3904, r3838, r3872; +} +{ +sub.f16x2 r3907, r3901, r3904; +} +{ +mul.f16x2 r3910, r3835, r3872; +} +{ +fma.rn.f16x2 r3913, r3838, r3871, r3910; +} +{ +mul.f16x2 r3917, r3847, r3873; +} +{ +mul.f16x2 r3920, r3850, r3874; +} +{ +sub.f16x2 r3923, r3917, r3920; +} +{ +mul.f16x2 r3926, r3847, r3874; +} +{ +fma.rn.f16x2 r3929, r3850, r3873, r3926; +} +{ +mul.f16x2 r3933, r3859, r3875; +} +{ +mul.f16x2 r3936, r3862, r3876; +} +{ +sub.f16x2 r3939, r3933, r3936; +} +{ +mul.f16x2 r3942, r3859, r3876; +} +{ +fma.rn.f16x2 r3945, r3862, r3875, r3942; +} +{ +neg.f16x2 r3949, r3832; +} +{ +mul.f16x2 r3951, r3841, r3879; +} +{ +mul.f16x2 r3954, r3844, r3880; +} +{ +sub.f16x2 r3957, r3951, r3954; +} +{ +mul.f16x2 r3960, r3841, r3880; 
+} +{ +fma.rn.f16x2 r3963, r3844, r3879, r3960; +} +{ +mul.f16x2 r3967, r3853, r3881; +} +{ +mul.f16x2 r3970, r3856, r3882; +} +{ +sub.f16x2 r3973, r3967, r3970; +} +{ +mul.f16x2 r3976, r3853, r3882; +} +{ +fma.rn.f16x2 r3979, r3856, r3881, r3976; +} +{ +mul.f16x2 r3983, r3865, r3883; +} +{ +mul.f16x2 r3986, r3868, r3884; +} +{ +sub.f16x2 r3989, r3983, r3986; +} +{ +mul.f16x2 r3992, r3865, r3884; +} +{ +fma.rn.f16x2 r3995, r3868, r3883, r3992; +} +{ +add.f16x2 r3999, r3627, r3823; +} +{ +add.f16x2 r4002, r3630, r3826; +} +{ +sub.f16x2 r4005, r3627, r3823; +} +{ +sub.f16x2 r4008, r3630, r3826; +} +{ +add.f16x2 r4011, r3639, r3907; +} +{ +add.f16x2 r4014, r3642, r3913; +} +{ +sub.f16x2 r4017, r3639, r3907; +} +{ +sub.f16x2 r4020, r3642, r3913; +} +{ +add.f16x2 r4023, r3651, r3923; +} +{ +add.f16x2 r4026, r3654, r3929; +} +{ +sub.f16x2 r4029, r3651, r3923; +} +{ +sub.f16x2 r4032, r3654, r3929; +} +{ +add.f16x2 r4035, r3663, r3939; +} +{ +add.f16x2 r4038, r3666, r3945; +} +{ +sub.f16x2 r4041, r3663, r3939; +} +{ +sub.f16x2 r4044, r3666, r3945; +} +{ +add.f16x2 r4047, r3633, r3949; +} +{ +add.f16x2 r4050, r3636, r3829; +} +{ +sub.f16x2 r4053, r3633, r3949; +} +{ +sub.f16x2 r4056, r3636, r3829; +} +{ +add.f16x2 r4059, r3645, r3957; +} +{ +add.f16x2 r4062, r3648, r3963; +} +{ +sub.f16x2 r4065, r3645, r3957; +} +{ +sub.f16x2 r4068, r3648, r3963; +} +{ +add.f16x2 r4071, r3657, r3973; +} +{ +add.f16x2 r4074, r3660, r3979; +} +{ +sub.f16x2 r4077, r3657, r3973; +} +{ +sub.f16x2 r4080, r3660, r3979; +} +{ +add.f16x2 r4083, r3669, r3989; +} +{ +add.f16x2 r4086, r3672, r3995; +} +{ +sub.f16x2 r4089, r3669, r3989; +} +{ +sub.f16x2 r4092, r3672, r3995; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4095, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4096, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4097, {low, high}; 
+} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4098, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4099, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4100, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4102, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4103, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4104, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4106, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4108, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f688; +cvt.rn.f16.f32 high, f688; +mov.b32 r4111, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f690; +cvt.rn.f16.f32 high, f690; +mov.b32 r4112, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f692; +cvt.rn.f16.f32 high, f692; +mov.b32 r4113, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f694; +cvt.rn.f16.f32 high, f694; +mov.b32 r4114, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f696; +cvt.rn.f16.f32 high, f696; +mov.b32 r4115, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f698; +cvt.rn.f16.f32 high, f698; +mov.b32 r4116, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 
r4117, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r4118, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f704; +cvt.rn.f16.f32 high, f704; +mov.b32 r4119, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f706; +cvt.rn.f16.f32 high, f706; +mov.b32 r4120, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f708; +cvt.rn.f16.f32 high, f708; +mov.b32 r4121, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f710; +cvt.rn.f16.f32 high, f710; +mov.b32 r4122, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f712; +cvt.rn.f16.f32 high, f712; +mov.b32 r4123, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f714; +cvt.rn.f16.f32 high, f714; +mov.b32 r4124, {low, high}; +} +{ +mul.f16x2 r4157, r4011, r4095; +} +{ +mul.f16x2 r4160, r4014, r4096; +} +{ +sub.f16x2 r4163, r4157, r4160; +} +{ +mul.f16x2 r4166, r4011, r4096; +} +{ +fma.rn.f16x2 r4169, r4014, r4095, r4166; +} +{ +mul.f16x2 r4173, r4023, r4097; +} +{ +mul.f16x2 r4176, r4026, r4098; +} +{ +sub.f16x2 r4179, r4173, r4176; +} +{ +mul.f16x2 r4182, r4023, r4098; +} +{ +fma.rn.f16x2 r4185, r4026, r4097, r4182; +} +{ +mul.f16x2 r4189, r4035, r4099; +} +{ +mul.f16x2 r4192, r4038, r4100; +} +{ +sub.f16x2 r4195, r4189, r4192; +} +{ +mul.f16x2 r4198, r4035, r4100; +} +{ +fma.rn.f16x2 r4201, r4038, r4099, r4198; +} +{ +mul.f16x2 r4205, r4047, r4101; +} +{ +mul.f16x2 r4208, r4050, r4102; +} +{ +sub.f16x2 r4211, r4205, r4208; +} +{ +mul.f16x2 r4214, r4047, r4102; +} +{ +fma.rn.f16x2 r4217, r4050, r4101, r4214; +} +{ +mul.f16x2 r4221, r4059, r4103; +} +{ +mul.f16x2 r4224, r4062, r4104; +} +{ +sub.f16x2 r4227, r4221, r4224; +} +{ +mul.f16x2 r4230, r4059, r4104; +} +{ +fma.rn.f16x2 r4233, r4062, r4103, r4230; +} +{ +mul.f16x2 r4237, r4071, r4105; +} +{ +mul.f16x2 r4240, r4074, r4106; +} +{ +sub.f16x2 r4243, r4237, r4240; +} +{ +mul.f16x2 r4246, r4071, r4106; +} +{ +fma.rn.f16x2 r4249, r4074, r4105, 
r4246; +} +{ +mul.f16x2 r4253, r4083, r4107; +} +{ +mul.f16x2 r4256, r4086, r4108; +} +{ +sub.f16x2 r4259, r4253, r4256; +} +{ +mul.f16x2 r4262, r4083, r4108; +} +{ +fma.rn.f16x2 r4265, r4086, r4107, r4262; +} +{ +neg.f16x2 r4269, r4008; +} +{ +mul.f16x2 r4271, r4017, r4111; +} +{ +mul.f16x2 r4274, r4020, r4112; +} +{ +sub.f16x2 r4277, r4271, r4274; +} +{ +mul.f16x2 r4280, r4017, r4112; +} +{ +fma.rn.f16x2 r4283, r4020, r4111, r4280; +} +{ +mul.f16x2 r4287, r4029, r4113; +} +{ +mul.f16x2 r4290, r4032, r4114; +} +{ +sub.f16x2 r4293, r4287, r4290; +} +{ +mul.f16x2 r4296, r4029, r4114; +} +{ +fma.rn.f16x2 r4299, r4032, r4113, r4296; +} +{ +mul.f16x2 r4303, r4041, r4115; +} +{ +mul.f16x2 r4306, r4044, r4116; +} +{ +sub.f16x2 r4309, r4303, r4306; +} +{ +mul.f16x2 r4312, r4041, r4116; +} +{ +fma.rn.f16x2 r4315, r4044, r4115, r4312; +} +{ +mul.f16x2 r4319, r4053, r4117; +} +{ +mul.f16x2 r4322, r4056, r4118; +} +{ +sub.f16x2 r4325, r4319, r4322; +} +{ +mul.f16x2 r4328, r4053, r4118; +} +{ +fma.rn.f16x2 r4331, r4056, r4117, r4328; +} +{ +mul.f16x2 r4335, r4065, r4119; +} +{ +mul.f16x2 r4338, r4068, r4120; +} +{ +sub.f16x2 r4341, r4335, r4338; +} +{ +mul.f16x2 r4344, r4065, r4120; +} +{ +fma.rn.f16x2 r4347, r4068, r4119, r4344; +} +{ +mul.f16x2 r4351, r4077, r4121; +} +{ +mul.f16x2 r4354, r4080, r4122; +} +{ +sub.f16x2 r4357, r4351, r4354; +} +{ +mul.f16x2 r4360, r4077, r4122; +} +{ +fma.rn.f16x2 r4363, r4080, r4121, r4360; +} +{ +mul.f16x2 r4367, r4089, r4123; +} +{ +mul.f16x2 r4370, r4092, r4124; +} +{ +sub.f16x2 r4373, r4367, r4370; +} +{ +mul.f16x2 r4376, r4089, r4124; +} +{ +fma.rn.f16x2 r4379, r4092, r4123, r4376; +} +{ +add.f16x2 r4383, r3383, r3999; +} +{ +add.f16x2 r4386, r3386, r4002; +} +{ +sub.f16x2 r4389, r3383, r3999; +} +{ +sub.f16x2 r4392, r3386, r4002; +} +{ +add.f16x2 r4395, r3395, r4163; +} +{ +add.f16x2 r4398, r3398, r4169; +} +{ +sub.f16x2 r4401, r3395, r4163; +} +{ +sub.f16x2 r4404, r3398, r4169; +} +{ +add.f16x2 r4407, r3407, r4179; +} +{ +add.f16x2 
r4410, r3410, r4185; +} +{ +sub.f16x2 r4413, r3407, r4179; +} +{ +sub.f16x2 r4416, r3410, r4185; +} +{ +add.f16x2 r4419, r3419, r4195; +} +{ +add.f16x2 r4422, r3422, r4201; +} +{ +sub.f16x2 r4425, r3419, r4195; +} +{ +sub.f16x2 r4428, r3422, r4201; +} +{ +add.f16x2 r4431, r3431, r4211; +} +{ +add.f16x2 r4434, r3434, r4217; +} +{ +sub.f16x2 r4437, r3431, r4211; +} +{ +sub.f16x2 r4440, r3434, r4217; +} +{ +add.f16x2 r4443, r3443, r4227; +} +{ +add.f16x2 r4446, r3446, r4233; +} +{ +sub.f16x2 r4449, r3443, r4227; +} +{ +sub.f16x2 r4452, r3446, r4233; +} +{ +add.f16x2 r4455, r3455, r4243; +} +{ +add.f16x2 r4458, r3458, r4249; +} +{ +sub.f16x2 r4461, r3455, r4243; +} +{ +sub.f16x2 r4464, r3458, r4249; +} +{ +add.f16x2 r4467, r3467, r4259; +} +{ +add.f16x2 r4470, r3470, r4265; +} +{ +sub.f16x2 r4473, r3467, r4259; +} +{ +sub.f16x2 r4476, r3470, r4265; +} +{ +add.f16x2 r4479, r3389, r4269; +} +{ +add.f16x2 r4482, r3392, r4005; +} +{ +sub.f16x2 r4485, r3389, r4269; +} +{ +sub.f16x2 r4488, r3392, r4005; +} +{ +add.f16x2 r4491, r3401, r4277; +} +{ +add.f16x2 r4494, r3404, r4283; +} +{ +sub.f16x2 r4497, r3401, r4277; +} +{ +sub.f16x2 r4500, r3404, r4283; +} +{ +add.f16x2 r4503, r3413, r4293; +} +{ +add.f16x2 r4506, r3416, r4299; +} +{ +sub.f16x2 r4509, r3413, r4293; +} +{ +sub.f16x2 r4512, r3416, r4299; +} +{ +add.f16x2 r4515, r3425, r4309; +} +{ +add.f16x2 r4518, r3428, r4315; +} +{ +sub.f16x2 r4521, r3425, r4309; +} +{ +sub.f16x2 r4524, r3428, r4315; +} +{ +add.f16x2 r4527, r3437, r4325; +} +{ +add.f16x2 r4530, r3440, r4331; +} +{ +sub.f16x2 r4533, r3437, r4325; +} +{ +sub.f16x2 r4536, r3440, r4331; +} +{ +add.f16x2 r4539, r3449, r4341; +} +{ +add.f16x2 r4542, r3452, r4347; +} +{ +sub.f16x2 r4545, r3449, r4341; +} +{ +sub.f16x2 r4548, r3452, r4347; +} +{ +add.f16x2 r4551, r3461, r4357; +} +{ +add.f16x2 r4554, r3464, r4363; +} +{ +sub.f16x2 r4557, r3461, r4357; +} +{ +sub.f16x2 r4560, r3464, r4363; +} +{ +add.f16x2 r4563, r3473, r4373; +} +{ +add.f16x2 r4566, r3476, r4379; +} 
+{ +sub.f16x2 r4569, r3473, r4373; +} +{ +sub.f16x2 r4572, r3476, r4379; +} +and.b32 r6521, r6513, 224; +bfe.u32 r6522, r6513, 5, 3; +shl.b32 r6523, r6513, 2; +and.b32 r6524, r6523, 124; +add.s32 r6525, r6517, r6524; +cvt.rn.f32.u32 f960, r6522; +mul.f32 f961, f960, 0f3CC90FDB; +cos.approx.f32 f779, f961; +sin.approx.f32 f962, f961; +neg.f32 f780, f962; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f779; +cvt.rn.f16.f32 high, f780; +mov.b32 r4575, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4578, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4580, {high, high}; +} +{ +mul.f16x2 r4582, r4398, r4580; +} +{ +fma.rn.f16x2 r4585, r4395, r4578, r4582; +} +{ +mul.f16x2 r4589, r4395, r4580; +} +{ +neg.f16x2 r4592, r4589; +} +{ +fma.rn.f16x2 r4594, r4398, r4578, r4592; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4598, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4600, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4602, {low, high}; +} +{ +mul.f16x2 r4603, r4600, r4602; +} +{ +mul.f16x2 r4606, r4575, r4598; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4609, {high, low}; +} +{ +fma.rn.f16x2 r4611, r4603, r4609, r4606; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4615, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4617, {high, high}; +} +{ +mul.f16x2 r4619, r4410, r4617; +} +{ +fma.rn.f16x2 r4622, r4407, r4615, r4619; +} +{ +mul.f16x2 r4626, r4407, r4617; +} +{ +neg.f16x2 r4629, r4626; +} +{ +fma.rn.f16x2 r4631, r4410, r4615, r4629; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4635, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4637, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4639, {low, high}; +} +{ 
+mul.f16x2 r4640, r4637, r4639; +} +{ +mul.f16x2 r4643, r4611, r4635; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4611; +mov.b32 r4646, {high, low}; +} +{ +fma.rn.f16x2 r4648, r4640, r4646, r4643; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4652, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4654, {high, high}; +} +{ +mul.f16x2 r4656, r4422, r4654; +} +{ +fma.rn.f16x2 r4659, r4419, r4652, r4656; +} +{ +mul.f16x2 r4663, r4419, r4654; +} +{ +neg.f16x2 r4666, r4663; +} +{ +fma.rn.f16x2 r4668, r4422, r4652, r4666; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4672, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4674, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4676, {low, high}; +} +{ +mul.f16x2 r4677, r4674, r4676; +} +{ +mul.f16x2 r4680, r4648, r4672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4648; +mov.b32 r4683, {high, low}; +} +{ +fma.rn.f16x2 r4685, r4677, r4683, r4680; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4689, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4691, {high, high}; +} +{ +mul.f16x2 r4693, r4434, r4691; +} +{ +fma.rn.f16x2 r4696, r4431, r4689, r4693; +} +{ +mul.f16x2 r4700, r4431, r4691; +} +{ +neg.f16x2 r4703, r4700; +} +{ +fma.rn.f16x2 r4705, r4434, r4689, r4703; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4709, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4711, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4713, {low, high}; +} +{ +mul.f16x2 r4714, r4711, r4713; +} +{ +mul.f16x2 r4717, r4685, r4709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4685; +mov.b32 r4720, {high, low}; +} +{ +fma.rn.f16x2 r4722, r4714, r4720, r4717; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; 
+mov.b32 r4726, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4728, {high, high}; +} +{ +mul.f16x2 r4730, r4446, r4728; +} +{ +fma.rn.f16x2 r4733, r4443, r4726, r4730; +} +{ +mul.f16x2 r4737, r4443, r4728; +} +{ +neg.f16x2 r4740, r4737; +} +{ +fma.rn.f16x2 r4742, r4446, r4726, r4740; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4746, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4748, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4750, {low, high}; +} +{ +mul.f16x2 r4751, r4748, r4750; +} +{ +mul.f16x2 r4754, r4722, r4746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4722; +mov.b32 r4757, {high, low}; +} +{ +fma.rn.f16x2 r4759, r4751, r4757, r4754; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4763, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4765, {high, high}; +} +{ +mul.f16x2 r4767, r4458, r4765; +} +{ +fma.rn.f16x2 r4770, r4455, r4763, r4767; +} +{ +mul.f16x2 r4774, r4455, r4765; +} +{ +neg.f16x2 r4777, r4774; +} +{ +fma.rn.f16x2 r4779, r4458, r4763, r4777; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4783, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4785, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4787, {low, high}; +} +{ +mul.f16x2 r4788, r4785, r4787; +} +{ +mul.f16x2 r4791, r4759, r4783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4759; +mov.b32 r4794, {high, low}; +} +{ +fma.rn.f16x2 r4796, r4788, r4794, r4791; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4800, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4802, {high, high}; +} +{ +mul.f16x2 r4804, r4470, r4802; +} +{ +fma.rn.f16x2 r4807, r4467, r4800, r4804; +} +{ +mul.f16x2 r4811, r4467, r4802; +} +{ +neg.f16x2 
r4814, r4811; +} +{ +fma.rn.f16x2 r4816, r4470, r4800, r4814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4820, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4822, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4824, {low, high}; +} +{ +mul.f16x2 r4825, r4822, r4824; +} +{ +mul.f16x2 r4828, r4796, r4820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4796; +mov.b32 r4831, {high, low}; +} +{ +fma.rn.f16x2 r4833, r4825, r4831, r4828; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4837, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4839, {high, high}; +} +{ +mul.f16x2 r4841, r4482, r4839; +} +{ +fma.rn.f16x2 r4844, r4479, r4837, r4841; +} +{ +mul.f16x2 r4848, r4479, r4839; +} +{ +neg.f16x2 r4851, r4848; +} +{ +fma.rn.f16x2 r4853, r4482, r4837, r4851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4857, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4859, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4861, {low, high}; +} +{ +mul.f16x2 r4862, r4859, r4861; +} +{ +mul.f16x2 r4865, r4833, r4857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4833; +mov.b32 r4868, {high, low}; +} +{ +fma.rn.f16x2 r4870, r4862, r4868, r4865; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4876, {high, high}; +} +{ +mul.f16x2 r4878, r4494, r4876; +} +{ +fma.rn.f16x2 r4881, r4491, r4874, r4878; +} +{ +mul.f16x2 r4885, r4491, r4876; +} +{ +neg.f16x2 r4888, r4885; +} +{ +fma.rn.f16x2 r4890, r4494, r4874, r4888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4896, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4898, {low, high}; +} +{ +mul.f16x2 r4899, r4896, r4898; +} +{ +mul.f16x2 r4902, r4870, r4894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4870; +mov.b32 r4905, {high, low}; +} +{ +fma.rn.f16x2 r4907, r4899, r4905, r4902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4913, {high, high}; +} +{ +mul.f16x2 r4915, r4506, r4913; +} +{ +fma.rn.f16x2 r4918, r4503, r4911, r4915; +} +{ +mul.f16x2 r4922, r4503, r4913; +} +{ +neg.f16x2 r4925, r4922; +} +{ +fma.rn.f16x2 r4927, r4506, r4911, r4925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4935, {low, high}; +} +{ +mul.f16x2 r4936, r4933, r4935; +} +{ +mul.f16x2 r4939, r4907, r4931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4907; +mov.b32 r4942, {high, low}; +} +{ +fma.rn.f16x2 r4944, r4936, r4942, r4939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4950, {high, high}; +} +{ +mul.f16x2 r4952, r4518, r4950; +} +{ +fma.rn.f16x2 r4955, r4515, r4948, r4952; +} +{ +mul.f16x2 r4959, r4515, r4950; +} +{ +neg.f16x2 r4962, r4959; +} +{ +fma.rn.f16x2 r4964, r4518, r4948, r4962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r4970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r4972, {low, high}; +} +{ +mul.f16x2 r4973, r4970, r4972; +} +{ +mul.f16x2 r4976, r4944, r4968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4944; +mov.b32 r4979, {high, low}; +} +{ 
+fma.rn.f16x2 r4981, r4973, r4979, r4976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r4987, {high, high}; +} +{ +mul.f16x2 r4989, r4530, r4987; +} +{ +fma.rn.f16x2 r4992, r4527, r4985, r4989; +} +{ +mul.f16x2 r4996, r4527, r4987; +} +{ +neg.f16x2 r4999, r4996; +} +{ +fma.rn.f16x2 r5001, r4530, r4985, r4999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5009, {low, high}; +} +{ +mul.f16x2 r5010, r5007, r5009; +} +{ +mul.f16x2 r5013, r4981, r5005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4981; +mov.b32 r5016, {high, low}; +} +{ +fma.rn.f16x2 r5018, r5010, r5016, r5013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5024, {high, high}; +} +{ +mul.f16x2 r5026, r4542, r5024; +} +{ +fma.rn.f16x2 r5029, r4539, r5022, r5026; +} +{ +mul.f16x2 r5033, r4539, r5024; +} +{ +neg.f16x2 r5036, r5033; +} +{ +fma.rn.f16x2 r5038, r4542, r5022, r5036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5046, {low, high}; +} +{ +mul.f16x2 r5047, r5044, r5046; +} +{ +mul.f16x2 r5050, r5018, r5042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5018; +mov.b32 r5053, {high, low}; +} +{ +fma.rn.f16x2 r5055, r5047, r5053, r5050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5061, {high, high}; +} +{ +mul.f16x2 r5063, r4554, r5061; +} +{ 
+fma.rn.f16x2 r5066, r4551, r5059, r5063; +} +{ +mul.f16x2 r5070, r4551, r5061; +} +{ +neg.f16x2 r5073, r5070; +} +{ +fma.rn.f16x2 r5075, r4554, r5059, r5073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5083, {low, high}; +} +{ +mul.f16x2 r5084, r5081, r5083; +} +{ +mul.f16x2 r5087, r5055, r5079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5055; +mov.b32 r5090, {high, low}; +} +{ +fma.rn.f16x2 r5092, r5084, r5090, r5087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5098, {high, high}; +} +{ +mul.f16x2 r5100, r4566, r5098; +} +{ +fma.rn.f16x2 r5103, r4563, r5096, r5100; +} +{ +mul.f16x2 r5107, r4563, r5098; +} +{ +neg.f16x2 r5110, r5107; +} +{ +fma.rn.f16x2 r5112, r4566, r5096, r5110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5120, {low, high}; +} +{ +mul.f16x2 r5121, r5118, r5120; +} +{ +mul.f16x2 r5124, r5092, r5116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5092; +mov.b32 r5127, {high, low}; +} +{ +fma.rn.f16x2 r5129, r5121, r5127, r5124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5135, {high, high}; +} +{ +mul.f16x2 r5137, r4392, r5135; +} +{ +fma.rn.f16x2 r5140, r4389, r5133, r5137; +} +{ +mul.f16x2 r5144, r4389, r5135; +} +{ +neg.f16x2 r5147, r5144; +} +{ +fma.rn.f16x2 r5149, r4392, r5133, r5147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5153, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5157, {low, high}; +} +{ +mul.f16x2 r5158, r5155, r5157; +} +{ +mul.f16x2 r5161, r5129, r5153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5129; +mov.b32 r5164, {high, low}; +} +{ +fma.rn.f16x2 r5166, r5158, r5164, r5161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5172, {high, high}; +} +{ +mul.f16x2 r5174, r4404, r5172; +} +{ +fma.rn.f16x2 r5177, r4401, r5170, r5174; +} +{ +mul.f16x2 r5181, r4401, r5172; +} +{ +neg.f16x2 r5184, r5181; +} +{ +fma.rn.f16x2 r5186, r4404, r5170, r5184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5194, {low, high}; +} +{ +mul.f16x2 r5195, r5192, r5194; +} +{ +mul.f16x2 r5198, r5166, r5190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5166; +mov.b32 r5201, {high, low}; +} +{ +fma.rn.f16x2 r5203, r5195, r5201, r5198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5209, {high, high}; +} +{ +mul.f16x2 r5211, r4416, r5209; +} +{ +fma.rn.f16x2 r5214, r4413, r5207, r5211; +} +{ +mul.f16x2 r5218, r4413, r5209; +} +{ +neg.f16x2 r5221, r5218; +} +{ +fma.rn.f16x2 r5223, r4416, r5207, r5221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5231, {low, high}; +} +{ +mul.f16x2 r5232, r5229, r5231; +} +{ +mul.f16x2 r5235, r5203, r5227; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r5203; +mov.b32 r5238, {high, low}; +} +{ +fma.rn.f16x2 r5240, r5232, r5238, r5235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5246, {high, high}; +} +{ +mul.f16x2 r5248, r4428, r5246; +} +{ +fma.rn.f16x2 r5251, r4425, r5244, r5248; +} +{ +mul.f16x2 r5255, r4425, r5246; +} +{ +neg.f16x2 r5258, r5255; +} +{ +fma.rn.f16x2 r5260, r4428, r5244, r5258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5268, {low, high}; +} +{ +mul.f16x2 r5269, r5266, r5268; +} +{ +mul.f16x2 r5272, r5240, r5264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5240; +mov.b32 r5275, {high, low}; +} +{ +fma.rn.f16x2 r5277, r5269, r5275, r5272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5283, {high, high}; +} +{ +mul.f16x2 r5285, r4440, r5283; +} +{ +fma.rn.f16x2 r5288, r4437, r5281, r5285; +} +{ +mul.f16x2 r5292, r4437, r5283; +} +{ +neg.f16x2 r5295, r5292; +} +{ +fma.rn.f16x2 r5297, r4440, r5281, r5295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5305, {low, high}; +} +{ +mul.f16x2 r5306, r5303, r5305; +} +{ +mul.f16x2 r5309, r5277, r5301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5277; +mov.b32 r5312, {high, low}; +} +{ +fma.rn.f16x2 r5314, r5306, r5312, r5309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r5314; +mov.b32 r5320, {high, high}; +} +{ +mul.f16x2 r5322, r4452, r5320; +} +{ +fma.rn.f16x2 r5325, r4449, r5318, r5322; +} +{ +mul.f16x2 r5329, r4449, r5320; +} +{ +neg.f16x2 r5332, r5329; +} +{ +fma.rn.f16x2 r5334, r4452, r5318, r5332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5342, {low, high}; +} +{ +mul.f16x2 r5343, r5340, r5342; +} +{ +mul.f16x2 r5346, r5314, r5338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5314; +mov.b32 r5349, {high, low}; +} +{ +fma.rn.f16x2 r5351, r5343, r5349, r5346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5357, {high, high}; +} +{ +mul.f16x2 r5359, r4464, r5357; +} +{ +fma.rn.f16x2 r5362, r4461, r5355, r5359; +} +{ +mul.f16x2 r5366, r4461, r5357; +} +{ +neg.f16x2 r5369, r5366; +} +{ +fma.rn.f16x2 r5371, r4464, r5355, r5369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5379, {low, high}; +} +{ +mul.f16x2 r5380, r5377, r5379; +} +{ +mul.f16x2 r5383, r5351, r5375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5351; +mov.b32 r5386, {high, low}; +} +{ +fma.rn.f16x2 r5388, r5380, r5386, r5383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5394, {high, high}; +} +{ +mul.f16x2 r5396, r4476, r5394; +} +{ +fma.rn.f16x2 r5399, r4473, r5392, r5396; +} +{ +mul.f16x2 r5403, r4473, r5394; +} +{ +neg.f16x2 r5406, r5403; +} +{ +fma.rn.f16x2 r5408, r4476, r5392, r5406; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5416, {low, high}; +} +{ +mul.f16x2 r5417, r5414, r5416; +} +{ +mul.f16x2 r5420, r5388, r5412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5388; +mov.b32 r5423, {high, low}; +} +{ +fma.rn.f16x2 r5425, r5417, r5423, r5420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5431, {high, high}; +} +{ +mul.f16x2 r5433, r4488, r5431; +} +{ +fma.rn.f16x2 r5436, r4485, r5429, r5433; +} +{ +mul.f16x2 r5440, r4485, r5431; +} +{ +neg.f16x2 r5443, r5440; +} +{ +fma.rn.f16x2 r5445, r4488, r5429, r5443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5453, {low, high}; +} +{ +mul.f16x2 r5454, r5451, r5453; +} +{ +mul.f16x2 r5457, r5425, r5449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5425; +mov.b32 r5460, {high, low}; +} +{ +fma.rn.f16x2 r5462, r5454, r5460, r5457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5468, {high, high}; +} +{ +mul.f16x2 r5470, r4500, r5468; +} +{ +fma.rn.f16x2 r5473, r4497, r5466, r5470; +} +{ +mul.f16x2 r5477, r4497, r5468; +} +{ +neg.f16x2 r5480, r5477; +} +{ +fma.rn.f16x2 r5482, r4500, r5466, r5480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5490, 
{low, high}; +} +{ +mul.f16x2 r5491, r5488, r5490; +} +{ +mul.f16x2 r5494, r5462, r5486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5462; +mov.b32 r5497, {high, low}; +} +{ +fma.rn.f16x2 r5499, r5491, r5497, r5494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5505, {high, high}; +} +{ +mul.f16x2 r5507, r4512, r5505; +} +{ +fma.rn.f16x2 r5510, r4509, r5503, r5507; +} +{ +mul.f16x2 r5514, r4509, r5505; +} +{ +neg.f16x2 r5517, r5514; +} +{ +fma.rn.f16x2 r5519, r4512, r5503, r5517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5527, {low, high}; +} +{ +mul.f16x2 r5528, r5525, r5527; +} +{ +mul.f16x2 r5531, r5499, r5523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5499; +mov.b32 r5534, {high, low}; +} +{ +fma.rn.f16x2 r5536, r5528, r5534, r5531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5542, {high, high}; +} +{ +mul.f16x2 r5544, r4524, r5542; +} +{ +fma.rn.f16x2 r5547, r4521, r5540, r5544; +} +{ +mul.f16x2 r5551, r4521, r5542; +} +{ +neg.f16x2 r5554, r5551; +} +{ +fma.rn.f16x2 r5556, r4524, r5540, r5554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5564, {low, high}; +} +{ +mul.f16x2 r5565, r5562, r5564; +} +{ +mul.f16x2 r5568, r5536, r5560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5536; +mov.b32 r5571, {high, low}; +} +{ +fma.rn.f16x2 r5573, r5565, r5571, r5568; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r5573; +mov.b32 r5577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5579, {high, high}; +} +{ +mul.f16x2 r5581, r4536, r5579; +} +{ +fma.rn.f16x2 r5584, r4533, r5577, r5581; +} +{ +mul.f16x2 r5588, r4533, r5579; +} +{ +neg.f16x2 r5591, r5588; +} +{ +fma.rn.f16x2 r5593, r4536, r5577, r5591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5601, {low, high}; +} +{ +mul.f16x2 r5602, r5599, r5601; +} +{ +mul.f16x2 r5605, r5573, r5597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5573; +mov.b32 r5608, {high, low}; +} +{ +fma.rn.f16x2 r5610, r5602, r5608, r5605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5616, {high, high}; +} +{ +mul.f16x2 r5618, r4548, r5616; +} +{ +fma.rn.f16x2 r5621, r4545, r5614, r5618; +} +{ +mul.f16x2 r5625, r4545, r5616; +} +{ +neg.f16x2 r5628, r5625; +} +{ +fma.rn.f16x2 r5630, r4548, r5614, r5628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5638, {low, high}; +} +{ +mul.f16x2 r5639, r5636, r5638; +} +{ +mul.f16x2 r5642, r5610, r5634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5610; +mov.b32 r5645, {high, low}; +} +{ +fma.rn.f16x2 r5647, r5639, r5645, r5642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5653, {high, high}; +} +{ +mul.f16x2 r5655, r4560, r5653; +} +{ +fma.rn.f16x2 r5658, r4557, r5651, r5655; +} +{ +mul.f16x2 r5662, r4557, r5653; +} 
+{ +neg.f16x2 r5665, r5662; +} +{ +fma.rn.f16x2 r5667, r4560, r5651, r5665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r4575; +mov.b32 r5673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f963; +cvt.rn.f16.f32 high, f964; +mov.b32 r5675, {low, high}; +} +{ +mul.f16x2 r5676, r5673, r5675; +} +{ +mul.f16x2 r5679, r5647, r5671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5647; +mov.b32 r5682, {high, low}; +} +{ +fma.rn.f16x2 r5684, r5676, r5682, r5679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r5684; +mov.b32 r5690, {high, high}; +} +{ +mul.f16x2 r5692, r4572, r5690; +} +{ +fma.rn.f16x2 r5695, r4569, r5688, r5692; +} +{ +mul.f16x2 r5699, r4569, r5690; +} +{ +neg.f16x2 r5702, r5699; +} +{ +fma.rn.f16x2 r5704, r4572, r5688, r5702; +} +barrier.sync 0; +and.b32 r6526, r6515, 28672; +add.s32 r6527, r6525, r6526; +st.shared.u32 [r6527], r4383; +st.shared.u32 [r6527+128], r4585; +st.shared.u32 [r6527+256], r4622; +st.shared.u32 [r6527+384], r4659; +st.shared.u32 [r6527+512], r4696; +st.shared.u32 [r6527+640], r4733; +st.shared.u32 [r6527+768], r4770; +st.shared.u32 [r6527+896], r4807; +st.shared.u32 [r6527+1024], r4844; +st.shared.u32 [r6527+1152], r4881; +st.shared.u32 [r6527+1280], r4918; +st.shared.u32 [r6527+1408], r4955; +st.shared.u32 [r6527+1536], r4992; +st.shared.u32 [r6527+1664], r5029; +st.shared.u32 [r6527+1792], r5066; +st.shared.u32 [r6527+1920], r5103; +st.shared.u32 [r6527+2048], r5140; +st.shared.u32 [r6527+2176], r5177; +st.shared.u32 [r6527+2304], r5214; +st.shared.u32 [r6527+2432], r5251; +st.shared.u32 [r6527+2560], r5288; +st.shared.u32 [r6527+2688], r5325; +st.shared.u32 [r6527+2816], r5362; +st.shared.u32 [r6527+2944], r5399; +st.shared.u32 [r6527+3072], r5436; +st.shared.u32 [r6527+3200], r5473; +st.shared.u32 [r6527+3328], r5510; +st.shared.u32 
[r6527+3456], r5547; +st.shared.u32 [r6527+3584], r5584; +st.shared.u32 [r6527+3712], r5621; +st.shared.u32 [r6527+3840], r5658; +st.shared.u32 [r6527+3968], r5695; +barrier.sync 0; +mad.lo.s32 r6528, r6521, -124, r6527; +ld.shared.u32 r5726, [r6528]; +ld.shared.u32 r5922, [r6528+1024]; +ld.shared.u32 r6118, [r6528+2048]; +ld.shared.u32 r6314, [r6528+3072]; +ld.shared.u32 r5776, [r6528+4096]; +ld.shared.u32 r5972, [r6528+5120]; +ld.shared.u32 r6168, [r6528+6144]; +ld.shared.u32 r6364, [r6528+7168]; +ld.shared.u32 r5738, [r6528+8192]; +ld.shared.u32 r5934, [r6528+9216]; +ld.shared.u32 r6130, [r6528+10240]; +ld.shared.u32 r6326, [r6528+11264]; +ld.shared.u32 r5788, [r6528+12288]; +ld.shared.u32 r5984, [r6528+13312]; +ld.shared.u32 r6180, [r6528+14336]; +ld.shared.u32 r6376, [r6528+15360]; +ld.shared.u32 r5727, [r6528+16384]; +ld.shared.u32 r5923, [r6528+17408]; +ld.shared.u32 r6119, [r6528+18432]; +ld.shared.u32 r6315, [r6528+19456]; +ld.shared.u32 r5777, [r6528+20480]; +ld.shared.u32 r5973, [r6528+21504]; +ld.shared.u32 r6169, [r6528+22528]; +ld.shared.u32 r6365, [r6528+23552]; +ld.shared.u32 r5739, [r6528+24576]; +ld.shared.u32 r5935, [r6528+25600]; +ld.shared.u32 r6131, [r6528+26624]; +ld.shared.u32 r6327, [r6528+27648]; +ld.shared.u32 r5789, [r6528+28672]; +ld.shared.u32 r5985, [r6528+29696]; +ld.shared.u32 r6181, [r6528+30720]; +ld.shared.u32 r6377, [r6528+31744]; +barrier.sync 0; +st.shared.u32 [r6527], r4386; +st.shared.u32 [r6527+128], r4594; +st.shared.u32 [r6527+256], r4631; +st.shared.u32 [r6527+384], r4668; +st.shared.u32 [r6527+512], r4705; +st.shared.u32 [r6527+640], r4742; +st.shared.u32 [r6527+768], r4779; +st.shared.u32 [r6527+896], r4816; +st.shared.u32 [r6527+1024], r4853; +st.shared.u32 [r6527+1152], r4890; +st.shared.u32 [r6527+1280], r4927; +st.shared.u32 [r6527+1408], r4964; +st.shared.u32 [r6527+1536], r5001; +st.shared.u32 [r6527+1664], r5038; +st.shared.u32 [r6527+1792], r5075; +st.shared.u32 [r6527+1920], r5112; +st.shared.u32 [r6527+2048], 
r5149; +st.shared.u32 [r6527+2176], r5186; +st.shared.u32 [r6527+2304], r5223; +st.shared.u32 [r6527+2432], r5260; +st.shared.u32 [r6527+2560], r5297; +st.shared.u32 [r6527+2688], r5334; +st.shared.u32 [r6527+2816], r5371; +st.shared.u32 [r6527+2944], r5408; +st.shared.u32 [r6527+3072], r5445; +st.shared.u32 [r6527+3200], r5482; +st.shared.u32 [r6527+3328], r5519; +st.shared.u32 [r6527+3456], r5556; +st.shared.u32 [r6527+3584], r5593; +st.shared.u32 [r6527+3712], r5630; +st.shared.u32 [r6527+3840], r5667; +st.shared.u32 [r6527+3968], r5704; +barrier.sync 0; +ld.shared.u32 r5729, [r6528]; +ld.shared.u32 r5925, [r6528+1024]; +ld.shared.u32 r6121, [r6528+2048]; +ld.shared.u32 r6317, [r6528+3072]; +ld.shared.u32 r5779, [r6528+4096]; +ld.shared.u32 r5975, [r6528+5120]; +ld.shared.u32 r6171, [r6528+6144]; +ld.shared.u32 r6367, [r6528+7168]; +ld.shared.u32 r5741, [r6528+8192]; +ld.shared.u32 r5937, [r6528+9216]; +ld.shared.u32 r6133, [r6528+10240]; +ld.shared.u32 r6329, [r6528+11264]; +ld.shared.u32 r5791, [r6528+12288]; +ld.shared.u32 r5987, [r6528+13312]; +ld.shared.u32 r6183, [r6528+14336]; +ld.shared.u32 r6379, [r6528+15360]; +ld.shared.u32 r5730, [r6528+16384]; +ld.shared.u32 r5926, [r6528+17408]; +ld.shared.u32 r6122, [r6528+18432]; +ld.shared.u32 r6318, [r6528+19456]; +ld.shared.u32 r5780, [r6528+20480]; +ld.shared.u32 r5976, [r6528+21504]; +ld.shared.u32 r6172, [r6528+22528]; +ld.shared.u32 r6368, [r6528+23552]; +ld.shared.u32 r5742, [r6528+24576]; +ld.shared.u32 r5938, [r6528+25600]; +ld.shared.u32 r6134, [r6528+26624]; +ld.shared.u32 r6330, [r6528+27648]; +ld.shared.u32 r5792, [r6528+28672]; +ld.shared.u32 r5988, [r6528+29696]; +ld.shared.u32 r6184, [r6528+30720]; +ld.shared.u32 r6380, [r6528+31744]; +{ +add.f16x2 r5725, r5726, r5727; +} +{ +add.f16x2 r5728, r5729, r5730; +} +{ +sub.f16x2 r5731, r5726, r5727; +} +{ +sub.f16x2 r5734, r5729, r5730; +} +{ +add.f16x2 r5737, r5738, r5739; +} +{ +add.f16x2 r5740, r5741, r5742; +} +{ +sub.f16x2 r5743, r5738, r5739; +} 
+{ +sub.f16x2 r5746, r5741, r5742; +} +{ +neg.f16x2 r5749, r5746; +} +{ +add.f16x2 r5751, r5725, r5737; +} +{ +add.f16x2 r5754, r5728, r5740; +} +{ +sub.f16x2 r5757, r5725, r5737; +} +{ +sub.f16x2 r5760, r5728, r5740; +} +{ +add.f16x2 r5763, r5731, r5749; +} +{ +add.f16x2 r5766, r5734, r5743; +} +{ +sub.f16x2 r5769, r5731, r5749; +} +{ +sub.f16x2 r5772, r5734, r5743; +} +{ +add.f16x2 r5775, r5776, r5777; +} +{ +add.f16x2 r5778, r5779, r5780; +} +{ +sub.f16x2 r5781, r5776, r5777; +} +{ +sub.f16x2 r5784, r5779, r5780; +} +{ +add.f16x2 r5787, r5788, r5789; +} +{ +add.f16x2 r5790, r5791, r5792; +} +{ +sub.f16x2 r5793, r5788, r5789; +} +{ +sub.f16x2 r5796, r5791, r5792; +} +{ +neg.f16x2 r5799, r5796; +} +{ +add.f16x2 r5801, r5775, r5787; +} +{ +add.f16x2 r5804, r5778, r5790; +} +{ +sub.f16x2 r5807, r5775, r5787; +} +{ +sub.f16x2 r5810, r5778, r5790; +} +{ +add.f16x2 r5813, r5781, r5799; +} +{ +add.f16x2 r5816, r5784, r5793; +} +{ +sub.f16x2 r5819, r5781, r5799; +} +{ +sub.f16x2 r5822, r5784, r5793; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5825, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5826, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r5829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r5830, {low, high}; +} +{ +mul.f16x2 r5839, r5813, r5825; +} +{ +mul.f16x2 r5842, r5816, r5826; +} +{ +sub.f16x2 r5845, r5839, r5842; +} +{ +mul.f16x2 r5848, r5813, r5826; +} +{ +fma.rn.f16x2 r5851, r5816, r5825, r5848; +} +{ +neg.f16x2 r5855, r5810; +} +{ +mul.f16x2 r5857, r5819, r5829; +} +{ +mul.f16x2 r5860, r5822, r5830; +} +{ +sub.f16x2 r5863, r5857, r5860; +} +{ +mul.f16x2 r5866, r5819, r5830; +} +{ +fma.rn.f16x2 r5869, r5822, r5829, r5866; +} +{ +add.f16x2 %0, r5751, r5801; +} +{ +add.f16x2 %1, r5754, r5804; +} +{ +sub.f16x2 %32, r5751, r5801; +} +{ 
+sub.f16x2 %33, r5754, r5804; +} +{ +add.f16x2 %8, r5763, r5845; +} +{ +add.f16x2 %9, r5766, r5851; +} +{ +sub.f16x2 %40, r5763, r5845; +} +{ +sub.f16x2 %41, r5766, r5851; +} +{ +add.f16x2 %16, r5757, r5855; +} +{ +add.f16x2 %17, r5760, r5807; +} +{ +sub.f16x2 %48, r5757, r5855; +} +{ +sub.f16x2 %49, r5760, r5807; +} +{ +add.f16x2 %24, r5769, r5863; +} +{ +add.f16x2 %25, r5772, r5869; +} +{ +sub.f16x2 %56, r5769, r5863; +} +{ +sub.f16x2 %57, r5772, r5869; +} +{ +add.f16x2 r5921, r5922, r5923; +} +{ +add.f16x2 r5924, r5925, r5926; +} +{ +sub.f16x2 r5927, r5922, r5923; +} +{ +sub.f16x2 r5930, r5925, r5926; +} +{ +add.f16x2 r5933, r5934, r5935; +} +{ +add.f16x2 r5936, r5937, r5938; +} +{ +sub.f16x2 r5939, r5934, r5935; +} +{ +sub.f16x2 r5942, r5937, r5938; +} +{ +neg.f16x2 r5945, r5942; +} +{ +add.f16x2 r5947, r5921, r5933; +} +{ +add.f16x2 r5950, r5924, r5936; +} +{ +sub.f16x2 r5953, r5921, r5933; +} +{ +sub.f16x2 r5956, r5924, r5936; +} +{ +add.f16x2 r5959, r5927, r5945; +} +{ +add.f16x2 r5962, r5930, r5939; +} +{ +sub.f16x2 r5965, r5927, r5945; +} +{ +sub.f16x2 r5968, r5930, r5939; +} +{ +add.f16x2 r5971, r5972, r5973; +} +{ +add.f16x2 r5974, r5975, r5976; +} +{ +sub.f16x2 r5977, r5972, r5973; +} +{ +sub.f16x2 r5980, r5975, r5976; +} +{ +add.f16x2 r5983, r5984, r5985; +} +{ +add.f16x2 r5986, r5987, r5988; +} +{ +sub.f16x2 r5989, r5984, r5985; +} +{ +sub.f16x2 r5992, r5987, r5988; +} +{ +neg.f16x2 r5995, r5992; +} +{ +add.f16x2 r5997, r5971, r5983; +} +{ +add.f16x2 r6000, r5974, r5986; +} +{ +sub.f16x2 r6003, r5971, r5983; +} +{ +sub.f16x2 r6006, r5974, r5986; +} +{ +add.f16x2 r6009, r5977, r5995; +} +{ +add.f16x2 r6012, r5980, r5989; +} +{ +sub.f16x2 r6015, r5977, r5995; +} +{ +sub.f16x2 r6018, r5980, r5989; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6022, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6025, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6026, {low, high}; +} +{ +mul.f16x2 r6035, r6009, r6021; +} +{ +mul.f16x2 r6038, r6012, r6022; +} +{ +sub.f16x2 r6041, r6035, r6038; +} +{ +mul.f16x2 r6044, r6009, r6022; +} +{ +fma.rn.f16x2 r6047, r6012, r6021, r6044; +} +{ +neg.f16x2 r6051, r6006; +} +{ +mul.f16x2 r6053, r6015, r6025; +} +{ +mul.f16x2 r6056, r6018, r6026; +} +{ +sub.f16x2 r6059, r6053, r6056; +} +{ +mul.f16x2 r6062, r6015, r6026; +} +{ +fma.rn.f16x2 r6065, r6018, r6025, r6062; +} +{ +add.f16x2 %2, r5947, r5997; +} +{ +add.f16x2 %3, r5950, r6000; +} +{ +sub.f16x2 %34, r5947, r5997; +} +{ +sub.f16x2 %35, r5950, r6000; +} +{ +add.f16x2 %10, r5959, r6041; +} +{ +add.f16x2 %11, r5962, r6047; +} +{ +sub.f16x2 %42, r5959, r6041; +} +{ +sub.f16x2 %43, r5962, r6047; +} +{ +add.f16x2 %18, r5953, r6051; +} +{ +add.f16x2 %19, r5956, r6003; +} +{ +sub.f16x2 %50, r5953, r6051; +} +{ +sub.f16x2 %51, r5956, r6003; +} +{ +add.f16x2 %26, r5965, r6059; +} +{ +add.f16x2 %27, r5968, r6065; +} +{ +sub.f16x2 %58, r5965, r6059; +} +{ +sub.f16x2 %59, r5968, r6065; +} +{ +add.f16x2 r6117, r6118, r6119; +} +{ +add.f16x2 r6120, r6121, r6122; +} +{ +sub.f16x2 r6123, r6118, r6119; +} +{ +sub.f16x2 r6126, r6121, r6122; +} +{ +add.f16x2 r6129, r6130, r6131; +} +{ +add.f16x2 r6132, r6133, r6134; +} +{ +sub.f16x2 r6135, r6130, r6131; +} +{ +sub.f16x2 r6138, r6133, r6134; +} +{ +neg.f16x2 r6141, r6138; +} +{ +add.f16x2 r6143, r6117, r6129; +} +{ +add.f16x2 r6146, r6120, r6132; +} +{ +sub.f16x2 r6149, r6117, r6129; +} +{ +sub.f16x2 r6152, r6120, r6132; +} +{ +add.f16x2 r6155, r6123, r6141; +} +{ +add.f16x2 r6158, r6126, r6135; +} +{ +sub.f16x2 r6161, r6123, r6141; +} +{ +sub.f16x2 r6164, r6126, r6135; +} +{ +add.f16x2 r6167, r6168, r6169; +} +{ +add.f16x2 r6170, r6171, r6172; +} +{ +sub.f16x2 r6173, r6168, r6169; +} +{ +sub.f16x2 r6176, r6171, r6172; +} +{ 
+add.f16x2 r6179, r6180, r6181; +} +{ +add.f16x2 r6182, r6183, r6184; +} +{ +sub.f16x2 r6185, r6180, r6181; +} +{ +sub.f16x2 r6188, r6183, r6184; +} +{ +neg.f16x2 r6191, r6188; +} +{ +add.f16x2 r6193, r6167, r6179; +} +{ +add.f16x2 r6196, r6170, r6182; +} +{ +sub.f16x2 r6199, r6167, r6179; +} +{ +sub.f16x2 r6202, r6170, r6182; +} +{ +add.f16x2 r6205, r6173, r6191; +} +{ +add.f16x2 r6208, r6176, r6185; +} +{ +sub.f16x2 r6211, r6173, r6191; +} +{ +sub.f16x2 r6214, r6176, r6185; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6218, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6222, {low, high}; +} +{ +mul.f16x2 r6231, r6205, r6217; +} +{ +mul.f16x2 r6234, r6208, r6218; +} +{ +sub.f16x2 r6237, r6231, r6234; +} +{ +mul.f16x2 r6240, r6205, r6218; +} +{ +fma.rn.f16x2 r6243, r6208, r6217, r6240; +} +{ +neg.f16x2 r6247, r6202; +} +{ +mul.f16x2 r6249, r6211, r6221; +} +{ +mul.f16x2 r6252, r6214, r6222; +} +{ +sub.f16x2 r6255, r6249, r6252; +} +{ +mul.f16x2 r6258, r6211, r6222; +} +{ +fma.rn.f16x2 r6261, r6214, r6221, r6258; +} +{ +add.f16x2 %4, r6143, r6193; +} +{ +add.f16x2 %5, r6146, r6196; +} +{ +sub.f16x2 %36, r6143, r6193; +} +{ +sub.f16x2 %37, r6146, r6196; +} +{ +add.f16x2 %12, r6155, r6237; +} +{ +add.f16x2 %13, r6158, r6243; +} +{ +sub.f16x2 %44, r6155, r6237; +} +{ +sub.f16x2 %45, r6158, r6243; +} +{ +add.f16x2 %20, r6149, r6247; +} +{ +add.f16x2 %21, r6152, r6199; +} +{ +sub.f16x2 %52, r6149, r6247; +} +{ +sub.f16x2 %53, r6152, r6199; +} +{ +add.f16x2 %28, r6161, r6255; +} +{ +add.f16x2 %29, r6164, r6261; +} +{ +sub.f16x2 %60, r6161, r6255; +} +{ +sub.f16x2 %61, r6164, r6261; +} +{ +add.f16x2 r6313, r6314, r6315; +} +{ +add.f16x2 r6316, 
r6317, r6318; +} +{ +sub.f16x2 r6319, r6314, r6315; +} +{ +sub.f16x2 r6322, r6317, r6318; +} +{ +add.f16x2 r6325, r6326, r6327; +} +{ +add.f16x2 r6328, r6329, r6330; +} +{ +sub.f16x2 r6331, r6326, r6327; +} +{ +sub.f16x2 r6334, r6329, r6330; +} +{ +neg.f16x2 r6337, r6334; +} +{ +add.f16x2 r6339, r6313, r6325; +} +{ +add.f16x2 r6342, r6316, r6328; +} +{ +sub.f16x2 r6345, r6313, r6325; +} +{ +sub.f16x2 r6348, r6316, r6328; +} +{ +add.f16x2 r6351, r6319, r6337; +} +{ +add.f16x2 r6354, r6322, r6331; +} +{ +sub.f16x2 r6357, r6319, r6337; +} +{ +sub.f16x2 r6360, r6322, r6331; +} +{ +add.f16x2 r6363, r6364, r6365; +} +{ +add.f16x2 r6366, r6367, r6368; +} +{ +sub.f16x2 r6369, r6364, r6365; +} +{ +sub.f16x2 r6372, r6367, r6368; +} +{ +add.f16x2 r6375, r6376, r6377; +} +{ +add.f16x2 r6378, r6379, r6380; +} +{ +sub.f16x2 r6381, r6376, r6377; +} +{ +sub.f16x2 r6384, r6379, r6380; +} +{ +neg.f16x2 r6387, r6384; +} +{ +add.f16x2 r6389, r6363, r6375; +} +{ +add.f16x2 r6392, r6366, r6378; +} +{ +sub.f16x2 r6395, r6363, r6375; +} +{ +sub.f16x2 r6398, r6366, r6378; +} +{ +add.f16x2 r6401, r6369, r6387; +} +{ +add.f16x2 r6404, r6372, r6381; +} +{ +sub.f16x2 r6407, r6369, r6387; +} +{ +sub.f16x2 r6410, r6372, r6381; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6413, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6414, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f938; +cvt.rn.f16.f32 high, f938; +mov.b32 r6417, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f940; +cvt.rn.f16.f32 high, f940; +mov.b32 r6418, {low, high}; +} +{ +mul.f16x2 r6427, r6401, r6413; +} +{ +mul.f16x2 r6430, r6404, r6414; +} +{ +sub.f16x2 r6433, r6427, r6430; +} +{ +mul.f16x2 r6436, r6401, r6414; +} +{ +fma.rn.f16x2 r6439, r6404, r6413, r6436; +} +{ +neg.f16x2 r6443, r6398; +} +{ +mul.f16x2 r6445, r6407, r6417; +} +{ +mul.f16x2 r6448, r6410, r6418; +} +{ +sub.f16x2 r6451, 
r6445, r6448; +} +{ +mul.f16x2 r6454, r6407, r6418; +} +{ +fma.rn.f16x2 r6457, r6410, r6417, r6454; +} +{ +add.f16x2 %6, r6339, r6389; +} +{ +add.f16x2 %7, r6342, r6392; +} +{ +sub.f16x2 %38, r6339, r6389; +} +{ +sub.f16x2 %39, r6342, r6392; +} +{ +add.f16x2 %14, r6351, r6433; +} +{ +add.f16x2 %15, r6354, r6439; +} +{ +sub.f16x2 %46, r6351, r6433; +} +{ +sub.f16x2 %47, r6354, r6439; +} +{ +add.f16x2 %22, r6345, r6443; +} +{ +add.f16x2 %23, r6348, r6395; +} +{ +sub.f16x2 %54, r6345, r6443; +} +{ +sub.f16x2 %55, r6348, r6395; +} +{ +add.f16x2 %30, r6357, r6451; +} +{ +add.f16x2 %31, r6360, r6457; +} +{ +sub.f16x2 %62, r6357, r6451; +} +{ +sub.f16x2 %63, r6360, r6457; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), 
"=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)), "=r"(__HALF2_TO_UI(rmem[27].x)), "=r"(__HALF2_TO_UI(rmem[27].y)), "=r"(__HALF2_TO_UI(rmem[28].x)), "=r"(__HALF2_TO_UI(rmem[28].y)), "=r"(__HALF2_TO_UI(rmem[29].x)), "=r"(__HALF2_TO_UI(rmem[29].y)), "=r"(__HALF2_TO_UI(rmem[30].x)), "=r"(__HALF2_TO_UI(rmem[30].y)), "=r"(__HALF2_TO_UI(rmem[31].x)), "=r"(__HALF2_TO_UI(rmem[31].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[31].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[28].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[27].y)), "r"(__HALF2_TO_UI(rmem[30].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[29].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), 
"r"(__HALF2_TO_UI(rmem[31].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[28].x)), "r"(__HALF2_TO_UI(rmem[27].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[30].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[29].x)), "r"(__HALF2_TO_UI(rmem[24].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1057, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3646>; +.reg .b64 rd<2>; +mov.u32 r3619, %tid.y; +shl.b32 r3620, r3619, 15; +mov.u32 r3621, %32; +add.s32 r3622, r3621, r3620; +mov.u32 r3623, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ +sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, 
%62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f448, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} 
+{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; 
+} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, 
r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ +sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, 
r194, r517; +} +and.b32 r3624, r3623, 511; +shl.b32 r3625, r3623, 6; +and.b32 r3626, r3625, -32768; +add.s32 r3627, r3622, r3626; +cvt.rn.f32.u32 f451, r3624; +mul.f32 f452, f451, 0f3A490FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; +} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} 
+{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ 
+neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, 
{low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ +fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 
r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 
r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r3628, r3625, 32704; +add.s32 r3629, r3627, r3628; +st.shared.v4.f32 [r3629], {r521, r627, r664, r701}; +st.shared.v4.f32 [r3629+16], {r738, r775, r812, r849}; +st.shared.v4.f32 [r3629+32], {r886, r923, r960, r997}; +st.shared.v4.f32 [r3629+48], {r1034, r1071, r1108, r1145}; +barrier.sync 0; +mad.lo.s32 r3630, r3624, -60, r3629; +ld.shared.u32 r1176, [r3630]; +ld.shared.u32 r1372, [r3630+2048]; +ld.shared.u32 r1226, [r3630+4096]; +ld.shared.u32 r1422, [r3630+6144]; +ld.shared.u32 r1188, [r3630+8192]; +ld.shared.u32 r1384, [r3630+10240]; +ld.shared.u32 r1238, [r3630+12288]; +ld.shared.u32 r1434, [r3630+14336]; +ld.shared.u32 r1177, [r3630+16384]; +ld.shared.u32 r1373, [r3630+18432]; +ld.shared.u32 r1227, [r3630+20480]; +ld.shared.u32 r1423, [r3630+22528]; +ld.shared.u32 r1189, [r3630+24576]; +ld.shared.u32 r1385, [r3630+26624]; +ld.shared.u32 r1239, [r3630+28672]; +ld.shared.u32 r1435, [r3630+30720]; +barrier.sync 0; +st.shared.v4.f32 [r3629], {r524, r636, r673, r710}; +st.shared.v4.f32 [r3629+16], {r747, r784, r821, r858}; +st.shared.v4.f32 [r3629+32], {r895, r932, r969, r1006}; +st.shared.v4.f32 [r3629+48], {r1043, r1080, r1117, r1154}; +barrier.sync 0; +ld.shared.u32 r1179, [r3630]; +ld.shared.u32 r1375, [r3630+2048]; +ld.shared.u32 r1229, [r3630+4096]; +ld.shared.u32 r1425, [r3630+6144]; +ld.shared.u32 r1191, [r3630+8192]; +ld.shared.u32 r1387, [r3630+10240]; +ld.shared.u32 r1241, [r3630+12288]; +ld.shared.u32 r1437, 
[r3630+14336]; +ld.shared.u32 r1180, [r3630+16384]; +ld.shared.u32 r1376, [r3630+18432]; +ld.shared.u32 r1230, [r3630+20480]; +ld.shared.u32 r1426, [r3630+22528]; +ld.shared.u32 r1192, [r3630+24576]; +ld.shared.u32 r1388, [r3630+26624]; +ld.shared.u32 r1242, [r3630+28672]; +ld.shared.u32 r1438, [r3630+30720]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, 
f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; +} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 
r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ +mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, 
r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3631, r3623, 496; +bfe.u32 r3632, r3623, 4, 5; +shl.b32 r3633, r3623, 2; +and.b32 r3634, r3633, 60; +add.s32 r3635, r3627, r3634; +cvt.rn.f32.u32 f454, r3632; +mul.f32 f455, f454, 0f3C490FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; 
+mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ +mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 
r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ +fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; 
+} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +barrier.sync 0; +and.b32 r3636, r3625, 31744; +add.s32 r3637, r3635, r3636; +st.shared.u32 [r3637], r1695; +st.shared.u32 [r3637+64], r1801; +st.shared.u32 [r3637+128], r1838; +st.shared.u32 [r3637+192], r1875; +st.shared.u32 [r3637+256], r1912; +st.shared.u32 [r3637+320], r1949; +st.shared.u32 [r3637+384], r1986; +st.shared.u32 [r3637+448], r2023; +st.shared.u32 [r3637+512], r2060; +st.shared.u32 [r3637+576], r2097; +st.shared.u32 [r3637+640], r2134; +st.shared.u32 [r3637+704], r2171; +st.shared.u32 [r3637+768], r2208; +st.shared.u32 [r3637+832], r2245; +st.shared.u32 [r3637+896], r2282; +st.shared.u32 [r3637+960], r2319; +barrier.sync 0; +mad.lo.s32 r3638, r3631, -60, r3637; +ld.shared.u32 r2350, [r3638]; +ld.shared.u32 r2546, [r3638+2048]; +ld.shared.u32 r2400, [r3638+4096]; +ld.shared.u32 r2596, [r3638+6144]; +ld.shared.u32 r2362, [r3638+8192]; +ld.shared.u32 r2558, [r3638+10240]; +ld.shared.u32 r2412, [r3638+12288]; +ld.shared.u32 r2608, [r3638+14336]; +ld.shared.u32 r2351, [r3638+16384]; +ld.shared.u32 r2547, [r3638+18432]; +ld.shared.u32 r2401, [r3638+20480]; +ld.shared.u32 r2597, [r3638+22528]; +ld.shared.u32 r2363, [r3638+24576]; +ld.shared.u32 r2559, [r3638+26624]; +ld.shared.u32 r2413, [r3638+28672]; +ld.shared.u32 r2609, [r3638+30720]; +barrier.sync 0; +st.shared.u32 [r3637], r1698; +st.shared.u32 [r3637+64], r1810; +st.shared.u32 [r3637+128], r1847; +st.shared.u32 
[r3637+192], r1884; +st.shared.u32 [r3637+256], r1921; +st.shared.u32 [r3637+320], r1958; +st.shared.u32 [r3637+384], r1995; +st.shared.u32 [r3637+448], r2032; +st.shared.u32 [r3637+512], r2069; +st.shared.u32 [r3637+576], r2106; +st.shared.u32 [r3637+640], r2143; +st.shared.u32 [r3637+704], r2180; +st.shared.u32 [r3637+768], r2217; +st.shared.u32 [r3637+832], r2254; +st.shared.u32 [r3637+896], r2291; +st.shared.u32 [r3637+960], r2328; +barrier.sync 0; +ld.shared.u32 r2353, [r3638]; +ld.shared.u32 r2549, [r3638+2048]; +ld.shared.u32 r2403, [r3638+4096]; +ld.shared.u32 r2599, [r3638+6144]; +ld.shared.u32 r2365, [r3638+8192]; +ld.shared.u32 r2561, [r3638+10240]; +ld.shared.u32 r2415, [r3638+12288]; +ld.shared.u32 r2611, [r3638+14336]; +ld.shared.u32 r2354, [r3638+16384]; +ld.shared.u32 r2550, [r3638+18432]; +ld.shared.u32 r2404, [r3638+20480]; +ld.shared.u32 r2600, [r3638+22528]; +ld.shared.u32 r2366, [r3638+24576]; +ld.shared.u32 r2562, [r3638+26624]; +ld.shared.u32 r2416, [r3638+28672]; +ld.shared.u32 r2612, [r3638+30720]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 
r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} 
+{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, 
r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} 
+{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2819; +} +{ +add.f16x2 r2920, r2506, r2699; +} +{ +sub.f16x2 r2923, r2503, r2819; +} +{ +sub.f16x2 r2926, r2506, r2699; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ 
+sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3639, r3623, 256; +bfe.u32 r3640, r3623, 8, 1; +and.b32 r3641, r3633, 1020; +add.s32 r3642, r3627, r3641; +cvt.rn.f32.u32 f457, r3640; +mul.f32 f458, f457, 0f3E490FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +fma.rn.f16x2 r2975, r2881, r2968, r2972; +} +{ +mul.f16x2 r2979, r2881, r2970; +} +{ +neg.f16x2 r2982, r2979; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +fma.rn.f16x2 r3012, r2893, r3005, r3009; +} +{ +mul.f16x2 r3016, r2893, r3007; +} +{ +neg.f16x2 r3019, r3016; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3019; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +fma.rn.f16x2 r3049, r2905, r3042, r3046; +} +{ +mul.f16x2 r3053, r2905, r3044; +} +{ +neg.f16x2 r3056, r3053; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +fma.rn.f16x2 r3086, r2917, r3079, r3083; +} +{ +mul.f16x2 r3090, r2917, r3081; +} +{ +neg.f16x2 r3093, r3090; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, 
{low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +fma.rn.f16x2 r3123, r2929, r3116, r3120; +} +{ +mul.f16x2 r3127, r2929, r3118; +} +{ +neg.f16x2 r3130, r3127; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +fma.rn.f16x2 r3160, r2941, r3153, r3157; +} +{ +mul.f16x2 r3164, r2941, r3155; +} +{ +neg.f16x2 r3167, r3164; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} +{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +fma.rn.f16x2 r3197, r2953, r3190, r3194; +} +{ +mul.f16x2 r3201, r2953, r3192; +} +{ +neg.f16x2 r3204, r3201; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +fma.rn.f16x2 r3234, r2875, r3227, r3231; +} +{ +mul.f16x2 r3238, r2875, r3229; +} +{ +neg.f16x2 r3241, r3238; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ +fma.rn.f16x2 r3271, r2887, r3264, r3268; +} +{ +mul.f16x2 r3275, r2887, r3266; +} 
+{ +neg.f16x2 r3278, r3275; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +fma.rn.f16x2 r3308, r2899, r3301, r3305; +} +{ +mul.f16x2 r3312, r2899, r3303; +} +{ +neg.f16x2 r3315, r3312; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +fma.rn.f16x2 r3345, r2911, r3338, r3342; +} +{ +mul.f16x2 r3349, r2911, r3340; +} +{ +neg.f16x2 r3352, r3349; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +fma.rn.f16x2 r3382, r2923, r3375, r3379; +} +{ +mul.f16x2 r3386, r2923, r3377; +} +{ +neg.f16x2 r3389, r3386; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +fma.rn.f16x2 r3419, r2935, r3412, r3416; +} +{ +mul.f16x2 r3423, r2935, r3414; +} +{ +neg.f16x2 r3426, r3423; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; 
+} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +fma.rn.f16x2 r3456, r2947, r3449, r3453; +} +{ +mul.f16x2 r3460, r2947, r3451; +} +{ +neg.f16x2 r3463, r3460; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +fma.rn.f16x2 r3493, r2959, r3486, r3490; +} +{ +mul.f16x2 r3497, r2959, r3488; +} +{ +neg.f16x2 r3500, r3497; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3500; +} +barrier.sync 0; +and.b32 r3643, r3625, 16384; +add.s32 r3644, r3642, r3643; +st.shared.u32 [r3644], r2869; +st.shared.u32 [r3644+1024], r2975; +st.shared.u32 [r3644+2048], r3012; +st.shared.u32 [r3644+3072], r3049; +st.shared.u32 [r3644+4096], r3086; +st.shared.u32 [r3644+5120], r3123; +st.shared.u32 [r3644+6144], r3160; +st.shared.u32 [r3644+7168], r3197; +st.shared.u32 [r3644+8192], r3234; +st.shared.u32 [r3644+9216], r3271; +st.shared.u32 [r3644+10240], r3308; +st.shared.u32 [r3644+11264], r3345; +st.shared.u32 [r3644+12288], r3382; +st.shared.u32 [r3644+13312], r3419; +st.shared.u32 [r3644+14336], r3456; +st.shared.u32 [r3644+15360], r3493; +barrier.sync 0; +mad.lo.s32 r3645, r3639, -60, 
r3644; +ld.shared.u32 r3524, [r3645]; +ld.shared.u32 r3536, [r3645+2048]; +ld.shared.u32 r3548, [r3645+4096]; +ld.shared.u32 r3560, [r3645+6144]; +ld.shared.u32 r3572, [r3645+8192]; +ld.shared.u32 r3584, [r3645+10240]; +ld.shared.u32 r3596, [r3645+12288]; +ld.shared.u32 r3608, [r3645+14336]; +ld.shared.u32 r3525, [r3645+16384]; +ld.shared.u32 r3537, [r3645+18432]; +ld.shared.u32 r3549, [r3645+20480]; +ld.shared.u32 r3561, [r3645+22528]; +ld.shared.u32 r3573, [r3645+24576]; +ld.shared.u32 r3585, [r3645+26624]; +ld.shared.u32 r3597, [r3645+28672]; +ld.shared.u32 r3609, [r3645+30720]; +barrier.sync 0; +st.shared.u32 [r3644], r2872; +st.shared.u32 [r3644+1024], r2984; +st.shared.u32 [r3644+2048], r3021; +st.shared.u32 [r3644+3072], r3058; +st.shared.u32 [r3644+4096], r3095; +st.shared.u32 [r3644+5120], r3132; +st.shared.u32 [r3644+6144], r3169; +st.shared.u32 [r3644+7168], r3206; +st.shared.u32 [r3644+8192], r3243; +st.shared.u32 [r3644+9216], r3280; +st.shared.u32 [r3644+10240], r3317; +st.shared.u32 [r3644+11264], r3354; +st.shared.u32 [r3644+12288], r3391; +st.shared.u32 [r3644+13312], r3428; +st.shared.u32 [r3644+14336], r3465; +st.shared.u32 [r3644+15360], r3502; +barrier.sync 0; +ld.shared.u32 r3527, [r3645]; +ld.shared.u32 r3539, [r3645+2048]; +ld.shared.u32 r3551, [r3645+4096]; +ld.shared.u32 r3563, [r3645+6144]; +ld.shared.u32 r3575, [r3645+8192]; +ld.shared.u32 r3587, [r3645+10240]; +ld.shared.u32 r3599, [r3645+12288]; +ld.shared.u32 r3611, [r3645+14336]; +ld.shared.u32 r3528, [r3645+16384]; +ld.shared.u32 r3540, [r3645+18432]; +ld.shared.u32 r3552, [r3645+20480]; +ld.shared.u32 r3564, [r3645+22528]; +ld.shared.u32 r3576, [r3645+24576]; +ld.shared.u32 r3588, [r3645+26624]; +ld.shared.u32 r3600, [r3645+28672]; +ld.shared.u32 r3612, [r3645+30720]; +{ +add.f16x2 %0, r3524, r3525; +} +{ +add.f16x2 %1, r3527, r3528; +} +{ +sub.f16x2 %16, r3524, r3525; +} +{ +sub.f16x2 %17, r3527, r3528; +} +{ +add.f16x2 %2, r3536, r3537; +} +{ +add.f16x2 %3, r3539, r3540; +} +{ 
+sub.f16x2 %18, r3536, r3537; +} +{ +sub.f16x2 %19, r3539, r3540; +} +{ +add.f16x2 %4, r3548, r3549; +} +{ +add.f16x2 %5, r3551, r3552; +} +{ +sub.f16x2 %20, r3548, r3549; +} +{ +sub.f16x2 %21, r3551, r3552; +} +{ +add.f16x2 %6, r3560, r3561; +} +{ +add.f16x2 %7, r3563, r3564; +} +{ +sub.f16x2 %22, r3560, r3561; +} +{ +sub.f16x2 %23, r3563, r3564; +} +{ +add.f16x2 %8, r3572, r3573; +} +{ +add.f16x2 %9, r3575, r3576; +} +{ +sub.f16x2 %24, r3572, r3573; +} +{ +sub.f16x2 %25, r3575, r3576; +} +{ +add.f16x2 %10, r3584, r3585; +} +{ +add.f16x2 %11, r3587, r3588; +} +{ +sub.f16x2 %26, r3584, r3585; +} +{ +sub.f16x2 %27, r3587, r3588; +} +{ +add.f16x2 %12, r3596, r3597; +} +{ +add.f16x2 %13, r3599, r3600; +} +{ +sub.f16x2 %28, r3596, r3597; +} +{ +sub.f16x2 %29, r3599, r3600; +} +{ +add.f16x2 %14, r3608, r3609; +} +{ +add.f16x2 %15, r3611, r3612; +} +{ +sub.f16x2 %30, r3608, r3609; +} +{ +sub.f16x2 %31, r3611, r3612; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1061, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<460>; +.reg .b32 r<3646>; +.reg .b64 rd<2>; +mov.u32 r3619, %tid.y; +shl.b32 r3620, r3619, 16; +mov.u32 r3621, %32; +add.s32 r3622, r3621, r3620; +mov.u32 r3623, %tid.x; +{ +add.f16x2 r1, %33, %49; +} +{ +add.f16x2 r4, %34, %50; +} +{ +sub.f16x2 r7, %33, %49; +} +{ +sub.f16x2 r10, %34, %50; +} +{ +add.f16x2 r13, %41, %57; +} +{ +add.f16x2 r16, %42, %58; +} +{ +sub.f16x2 r19, %41, %57; +} +{ +sub.f16x2 r22, %42, %58; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %37, %53; +} +{ +add.f16x2 r54, %38, %54; +} +{ +sub.f16x2 r57, %37, %53; +} +{ 
+sub.f16x2 r60, %38, %54; +} +{ +add.f16x2 r63, %45, %61; +} +{ +add.f16x2 r66, %46, %62; +} +{ +sub.f16x2 r69, %45, %61; +} +{ +sub.f16x2 r72, %46, %62; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f380, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r102, {low, high}; +} +mov.f32 f448, 0f3F800000; +mov.f32 f378, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r106, {low, high}; +} +mov.f32 f447, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +{ +add.f16x2 r197, %35, %51; +} +{ +add.f16x2 r200, %36, %52; +} +{ +sub.f16x2 
r203, %35, %51; +} +{ +sub.f16x2 r206, %36, %52; +} +{ +add.f16x2 r209, %43, %59; +} +{ +add.f16x2 r212, %44, %60; +} +{ +sub.f16x2 r215, %43, %59; +} +{ +sub.f16x2 r218, %44, %60; +} +{ +neg.f16x2 r221, r218; +} +{ +add.f16x2 r223, r197, r209; +} +{ +add.f16x2 r226, r200, r212; +} +{ +sub.f16x2 r229, r197, r209; +} +{ +sub.f16x2 r232, r200, r212; +} +{ +add.f16x2 r235, r203, r221; +} +{ +add.f16x2 r238, r206, r215; +} +{ +sub.f16x2 r241, r203, r221; +} +{ +sub.f16x2 r244, r206, r215; +} +{ +add.f16x2 r247, %39, %55; +} +{ +add.f16x2 r250, %40, %56; +} +{ +sub.f16x2 r253, %39, %55; +} +{ +sub.f16x2 r256, %40, %56; +} +{ +add.f16x2 r259, %47, %63; +} +{ +add.f16x2 r262, %48, %64; +} +{ +sub.f16x2 r265, %47, %63; +} +{ +sub.f16x2 r268, %48, %64; +} +{ +neg.f16x2 r271, r268; +} +{ +add.f16x2 r273, r247, r259; +} +{ +add.f16x2 r276, r250, r262; +} +{ +sub.f16x2 r279, r247, r259; +} +{ +sub.f16x2 r282, r250, r262; +} +{ +add.f16x2 r285, r253, r271; +} +{ +add.f16x2 r288, r256, r265; +} +{ +sub.f16x2 r291, r253, r271; +} +{ +sub.f16x2 r294, r256, r265; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r297, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r298, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r301, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r302, {low, high}; +} +{ +mul.f16x2 r311, r285, r297; +} +{ +mul.f16x2 r314, r288, r298; +} +{ +sub.f16x2 r317, r311, r314; +} +{ +mul.f16x2 r320, r285, r298; +} +{ +fma.rn.f16x2 r323, r288, r297, r320; +} +{ +neg.f16x2 r327, r282; +} +{ +mul.f16x2 r329, r291, r301; +} +{ +mul.f16x2 r332, r294, r302; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r291, r302; +} +{ +fma.rn.f16x2 r341, r294, r301, r338; +} +{ +add.f16x2 r345, r223, r273; +} +{ +add.f16x2 r348, r226, r276; +} +{ +sub.f16x2 r351, 
r223, r273; +} +{ +sub.f16x2 r354, r226, r276; +} +{ +add.f16x2 r357, r235, r317; +} +{ +add.f16x2 r360, r238, r323; +} +{ +sub.f16x2 r363, r235, r317; +} +{ +sub.f16x2 r366, r238, r323; +} +{ +add.f16x2 r369, r229, r327; +} +{ +add.f16x2 r372, r232, r279; +} +{ +sub.f16x2 r375, r229, r327; +} +{ +sub.f16x2 r378, r232, r279; +} +{ +add.f16x2 r381, r241, r335; +} +{ +add.f16x2 r384, r244, r341; +} +{ +sub.f16x2 r387, r241, r335; +} +{ +sub.f16x2 r390, r244, r341; +} +mov.f32 f376, 0f3F6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r393, {low, high}; +} +mov.f32 f384, 0f3EC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r394, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r395, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r396, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r398, {low, high}; +} +mov.f32 f374, 0fBEC3EF15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r401, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r402, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r403, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r404, {low, high}; +} +mov.f32 f382, 0fBF6C835E; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r405, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r406, {low, high}; +} +{ +mul.f16x2 r423, r357, r393; +} +{ +mul.f16x2 r426, r360, r394; +} +{ +sub.f16x2 r429, 
r423, r426; +} +{ +mul.f16x2 r432, r357, r394; +} +{ +fma.rn.f16x2 r435, r360, r393, r432; +} +{ +mul.f16x2 r439, r369, r395; +} +{ +mul.f16x2 r442, r372, r396; +} +{ +sub.f16x2 r445, r439, r442; +} +{ +mul.f16x2 r448, r369, r396; +} +{ +fma.rn.f16x2 r451, r372, r395, r448; +} +{ +mul.f16x2 r455, r381, r397; +} +{ +mul.f16x2 r458, r384, r398; +} +{ +sub.f16x2 r461, r455, r458; +} +{ +mul.f16x2 r464, r381, r398; +} +{ +fma.rn.f16x2 r467, r384, r397, r464; +} +{ +neg.f16x2 r471, r354; +} +{ +mul.f16x2 r473, r363, r401; +} +{ +mul.f16x2 r476, r366, r402; +} +{ +sub.f16x2 r479, r473, r476; +} +{ +mul.f16x2 r482, r363, r402; +} +{ +fma.rn.f16x2 r485, r366, r401, r482; +} +{ +mul.f16x2 r489, r375, r403; +} +{ +mul.f16x2 r492, r378, r404; +} +{ +sub.f16x2 r495, r489, r492; +} +{ +mul.f16x2 r498, r375, r404; +} +{ +fma.rn.f16x2 r501, r378, r403, r498; +} +{ +mul.f16x2 r505, r387, r405; +} +{ +mul.f16x2 r508, r390, r406; +} +{ +sub.f16x2 r511, r505, r508; +} +{ +mul.f16x2 r514, r387, r406; +} +{ +fma.rn.f16x2 r517, r390, r405, r514; +} +{ +add.f16x2 r521, r149, r345; +} +{ +add.f16x2 r524, r152, r348; +} +{ +sub.f16x2 r527, r149, r345; +} +{ +sub.f16x2 r530, r152, r348; +} +{ +add.f16x2 r533, r161, r429; +} +{ +add.f16x2 r536, r164, r435; +} +{ +sub.f16x2 r539, r161, r429; +} +{ +sub.f16x2 r542, r164, r435; +} +{ +add.f16x2 r545, r173, r445; +} +{ +add.f16x2 r548, r176, r451; +} +{ +sub.f16x2 r551, r173, r445; +} +{ +sub.f16x2 r554, r176, r451; +} +{ +add.f16x2 r557, r185, r461; +} +{ +add.f16x2 r560, r188, r467; +} +{ +sub.f16x2 r563, r185, r461; +} +{ +sub.f16x2 r566, r188, r467; +} +{ +add.f16x2 r569, r155, r471; +} +{ +add.f16x2 r572, r158, r351; +} +{ +sub.f16x2 r575, r155, r471; +} +{ +sub.f16x2 r578, r158, r351; +} +{ +add.f16x2 r581, r167, r479; +} +{ +add.f16x2 r584, r170, r485; +} +{ +sub.f16x2 r587, r167, r479; +} +{ +sub.f16x2 r590, r170, r485; +} +{ +add.f16x2 r593, r179, r495; +} +{ +add.f16x2 r596, r182, r501; +} +{ +sub.f16x2 r599, r179, r495; +} +{ 
+sub.f16x2 r602, r182, r501; +} +{ +add.f16x2 r605, r191, r511; +} +{ +add.f16x2 r608, r194, r517; +} +{ +sub.f16x2 r611, r191, r511; +} +{ +sub.f16x2 r614, r194, r517; +} +and.b32 r3624, r3623, 511; +shl.b32 r3625, r3623, 7; +and.b32 r3626, r3625, -65536; +add.s32 r3627, r3622, r3626; +cvt.rn.f32.u32 f451, r3624; +mul.f32 f452, f451, 0f3A490FDB; +cos.approx.f32 f117, f452; +sin.approx.f32 f453, f452; +neg.f32 f118, f453; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f117; +cvt.rn.f16.f32 high, f118; +mov.b32 r617, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r622, {high, high}; +} +{ +mul.f16x2 r624, r536, r622; +} +{ +fma.rn.f16x2 r627, r533, r620, r624; +} +{ +mul.f16x2 r631, r533, r622; +} +{ +neg.f16x2 r634, r631; +} +{ +fma.rn.f16x2 r636, r536, r620, r634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r640, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r642, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r644, {low, high}; +} +{ +mul.f16x2 r645, r642, r644; +} +{ +mul.f16x2 r648, r617, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r651, {high, low}; +} +{ +fma.rn.f16x2 r653, r645, r651, r648; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r659, {high, high}; +} +{ +mul.f16x2 r661, r548, r659; +} +{ +fma.rn.f16x2 r664, r545, r657, r661; +} +{ +mul.f16x2 r668, r545, r659; +} +{ +neg.f16x2 r671, r668; +} +{ +fma.rn.f16x2 r673, r548, r657, r671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r677, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r679, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r681, {low, high}; 
+} +{ +mul.f16x2 r682, r679, r681; +} +{ +mul.f16x2 r685, r653, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r653; +mov.b32 r688, {high, low}; +} +{ +fma.rn.f16x2 r690, r682, r688, r685; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r696, {high, high}; +} +{ +mul.f16x2 r698, r560, r696; +} +{ +fma.rn.f16x2 r701, r557, r694, r698; +} +{ +mul.f16x2 r705, r557, r696; +} +{ +neg.f16x2 r708, r705; +} +{ +fma.rn.f16x2 r710, r560, r694, r708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r714, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r716, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r718, {low, high}; +} +{ +mul.f16x2 r719, r716, r718; +} +{ +mul.f16x2 r722, r690, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r690; +mov.b32 r725, {high, low}; +} +{ +fma.rn.f16x2 r727, r719, r725, r722; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r733, {high, high}; +} +{ +mul.f16x2 r735, r572, r733; +} +{ +fma.rn.f16x2 r738, r569, r731, r735; +} +{ +mul.f16x2 r742, r569, r733; +} +{ +neg.f16x2 r745, r742; +} +{ +fma.rn.f16x2 r747, r572, r731, r745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r751, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r753, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r755, {low, high}; +} +{ +mul.f16x2 r756, r753, r755; +} +{ +mul.f16x2 r759, r727, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r727; +mov.b32 r762, {high, low}; +} +{ +fma.rn.f16x2 r764, r756, r762, r759; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r764; +mov.b32 r770, {high, high}; +} +{ +mul.f16x2 r772, r584, r770; +} +{ +fma.rn.f16x2 r775, r581, r768, r772; +} +{ +mul.f16x2 r779, r581, r770; +} +{ +neg.f16x2 r782, r779; +} +{ +fma.rn.f16x2 r784, r584, r768, r782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r788, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r790, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r792, {low, high}; +} +{ +mul.f16x2 r793, r790, r792; +} +{ +mul.f16x2 r796, r764, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r764; +mov.b32 r799, {high, low}; +} +{ +fma.rn.f16x2 r801, r793, r799, r796; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r807, {high, high}; +} +{ +mul.f16x2 r809, r596, r807; +} +{ +fma.rn.f16x2 r812, r593, r805, r809; +} +{ +mul.f16x2 r816, r593, r807; +} +{ +neg.f16x2 r819, r816; +} +{ +fma.rn.f16x2 r821, r596, r805, r819; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r825, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r827, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r829, {low, high}; +} +{ +mul.f16x2 r830, r827, r829; +} +{ +mul.f16x2 r833, r801, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r801; +mov.b32 r836, {high, low}; +} +{ +fma.rn.f16x2 r838, r830, r836, r833; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r844, {high, high}; +} +{ +mul.f16x2 r846, r608, r844; +} +{ +fma.rn.f16x2 r849, r605, r842, r846; +} +{ +mul.f16x2 r853, r605, r844; +} +{ +neg.f16x2 r856, r853; +} +{ +fma.rn.f16x2 r858, r608, r842, r856; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r862, {low, low}; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r617; +mov.b32 r864, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r866, {low, high}; +} +{ +mul.f16x2 r867, r864, r866; +} +{ +mul.f16x2 r870, r838, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r838; +mov.b32 r873, {high, low}; +} +{ +fma.rn.f16x2 r875, r867, r873, r870; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r879, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r881, {high, high}; +} +{ +mul.f16x2 r883, r530, r881; +} +{ +fma.rn.f16x2 r886, r527, r879, r883; +} +{ +mul.f16x2 r890, r527, r881; +} +{ +neg.f16x2 r893, r890; +} +{ +fma.rn.f16x2 r895, r530, r879, r893; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r899, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r901, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r903, {low, high}; +} +{ +mul.f16x2 r904, r901, r903; +} +{ +mul.f16x2 r907, r875, r899; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r875; +mov.b32 r910, {high, low}; +} +{ +fma.rn.f16x2 r912, r904, r910, r907; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r916, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r918, {high, high}; +} +{ +mul.f16x2 r920, r542, r918; +} +{ +fma.rn.f16x2 r923, r539, r916, r920; +} +{ +mul.f16x2 r927, r539, r918; +} +{ +neg.f16x2 r930, r927; +} +{ +fma.rn.f16x2 r932, r542, r916, r930; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r936, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r938, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r940, {low, high}; +} +{ +mul.f16x2 r941, r938, r940; +} +{ +mul.f16x2 r944, r912, r936; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r912; +mov.b32 r947, {high, low}; +} +{ 
+fma.rn.f16x2 r949, r941, r947, r944; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r953, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r955, {high, high}; +} +{ +mul.f16x2 r957, r554, r955; +} +{ +fma.rn.f16x2 r960, r551, r953, r957; +} +{ +mul.f16x2 r964, r551, r955; +} +{ +neg.f16x2 r967, r964; +} +{ +fma.rn.f16x2 r969, r554, r953, r967; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r973, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r975, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r977, {low, high}; +} +{ +mul.f16x2 r978, r975, r977; +} +{ +mul.f16x2 r981, r949, r973; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r949; +mov.b32 r984, {high, low}; +} +{ +fma.rn.f16x2 r986, r978, r984, r981; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r990, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r992, {high, high}; +} +{ +mul.f16x2 r994, r566, r992; +} +{ +fma.rn.f16x2 r997, r563, r990, r994; +} +{ +mul.f16x2 r1001, r563, r992; +} +{ +neg.f16x2 r1004, r1001; +} +{ +fma.rn.f16x2 r1006, r566, r990, r1004; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1010, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1012, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1014, {low, high}; +} +{ +mul.f16x2 r1015, r1012, r1014; +} +{ +mul.f16x2 r1018, r986, r1010; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r986; +mov.b32 r1021, {high, low}; +} +{ +fma.rn.f16x2 r1023, r1015, r1021, r1018; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1027, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1029, {high, high}; +} +{ +mul.f16x2 r1031, r578, r1029; +} +{ +fma.rn.f16x2 r1034, r575, r1027, r1031; +} +{ +mul.f16x2 r1038, 
r575, r1029; +} +{ +neg.f16x2 r1041, r1038; +} +{ +fma.rn.f16x2 r1043, r578, r1027, r1041; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1047, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1049, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1051, {low, high}; +} +{ +mul.f16x2 r1052, r1049, r1051; +} +{ +mul.f16x2 r1055, r1023, r1047; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1023; +mov.b32 r1058, {high, low}; +} +{ +fma.rn.f16x2 r1060, r1052, r1058, r1055; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1064, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1066, {high, high}; +} +{ +mul.f16x2 r1068, r590, r1066; +} +{ +fma.rn.f16x2 r1071, r587, r1064, r1068; +} +{ +mul.f16x2 r1075, r587, r1066; +} +{ +neg.f16x2 r1078, r1075; +} +{ +fma.rn.f16x2 r1080, r590, r1064, r1078; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1084, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1086, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1088, {low, high}; +} +{ +mul.f16x2 r1089, r1086, r1088; +} +{ +mul.f16x2 r1092, r1060, r1084; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1060; +mov.b32 r1095, {high, low}; +} +{ +fma.rn.f16x2 r1097, r1089, r1095, r1092; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1101, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1103, {high, high}; +} +{ +mul.f16x2 r1105, r602, r1103; +} +{ +fma.rn.f16x2 r1108, r599, r1101, r1105; +} +{ +mul.f16x2 r1112, r599, r1103; +} +{ +neg.f16x2 r1115, r1112; +} +{ +fma.rn.f16x2 r1117, r602, r1101, r1115; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1121, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r617; +mov.b32 r1123, {high, high}; +} +{ +.reg 
.f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1125, {low, high}; +} +{ +mul.f16x2 r1126, r1123, r1125; +} +{ +mul.f16x2 r1129, r1097, r1121; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1097; +mov.b32 r1132, {high, low}; +} +{ +fma.rn.f16x2 r1134, r1126, r1132, r1129; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1138, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1134; +mov.b32 r1140, {high, high}; +} +{ +mul.f16x2 r1142, r614, r1140; +} +{ +fma.rn.f16x2 r1145, r611, r1138, r1142; +} +{ +mul.f16x2 r1149, r611, r1140; +} +{ +neg.f16x2 r1152, r1149; +} +{ +fma.rn.f16x2 r1154, r614, r1138, r1152; +} +barrier.sync 0; +and.b32 r3628, r3625, 65408; +add.s32 r3629, r3627, r3628; +st.shared.v4.f32 [r3629], {r521, r524, r627, r636}; +st.shared.v4.f32 [r3629+16], {r664, r673, r701, r710}; +st.shared.v4.f32 [r3629+32], {r738, r747, r775, r784}; +st.shared.v4.f32 [r3629+48], {r812, r821, r849, r858}; +st.shared.v4.f32 [r3629+64], {r886, r895, r923, r932}; +st.shared.v4.f32 [r3629+80], {r960, r969, r997, r1006}; +st.shared.v4.f32 [r3629+96], {r1034, r1043, r1071, r1080}; +st.shared.v4.f32 [r3629+112], {r1108, r1117, r1145, r1154}; +barrier.sync 0; +mad.lo.s32 r3630, r3624, -120, r3629; +ld.shared.u32 r1176, [r3630]; +ld.shared.u32 r1179, [r3630+4]; +ld.shared.u32 r1372, [r3630+4096]; +ld.shared.u32 r1375, [r3630+4100]; +ld.shared.u32 r1226, [r3630+8192]; +ld.shared.u32 r1229, [r3630+8196]; +ld.shared.u32 r1422, [r3630+12288]; +ld.shared.u32 r1425, [r3630+12292]; +ld.shared.u32 r1188, [r3630+16384]; +ld.shared.u32 r1191, [r3630+16388]; +ld.shared.u32 r1384, [r3630+20480]; +ld.shared.u32 r1387, [r3630+20484]; +ld.shared.u32 r1238, [r3630+24576]; +ld.shared.u32 r1241, [r3630+24580]; +ld.shared.u32 r1434, [r3630+28672]; +ld.shared.u32 r1437, [r3630+28676]; +ld.shared.u32 r1177, [r3630+32768]; +ld.shared.u32 r1180, [r3630+32772]; +ld.shared.u32 r1373, [r3630+36864]; +ld.shared.u32 r1376, [r3630+36868]; 
+ld.shared.u32 r1227, [r3630+40960]; +ld.shared.u32 r1230, [r3630+40964]; +ld.shared.u32 r1423, [r3630+45056]; +ld.shared.u32 r1426, [r3630+45060]; +ld.shared.u32 r1189, [r3630+49152]; +ld.shared.u32 r1192, [r3630+49156]; +ld.shared.u32 r1385, [r3630+53248]; +ld.shared.u32 r1388, [r3630+53252]; +ld.shared.u32 r1239, [r3630+57344]; +ld.shared.u32 r1242, [r3630+57348]; +ld.shared.u32 r1435, [r3630+61440]; +ld.shared.u32 r1438, [r3630+61444]; +{ +add.f16x2 r1175, r1176, r1177; +} +{ +add.f16x2 r1178, r1179, r1180; +} +{ +sub.f16x2 r1181, r1176, r1177; +} +{ +sub.f16x2 r1184, r1179, r1180; +} +{ +add.f16x2 r1187, r1188, r1189; +} +{ +add.f16x2 r1190, r1191, r1192; +} +{ +sub.f16x2 r1193, r1188, r1189; +} +{ +sub.f16x2 r1196, r1191, r1192; +} +{ +neg.f16x2 r1199, r1196; +} +{ +add.f16x2 r1201, r1175, r1187; +} +{ +add.f16x2 r1204, r1178, r1190; +} +{ +sub.f16x2 r1207, r1175, r1187; +} +{ +sub.f16x2 r1210, r1178, r1190; +} +{ +add.f16x2 r1213, r1181, r1199; +} +{ +add.f16x2 r1216, r1184, r1193; +} +{ +sub.f16x2 r1219, r1181, r1199; +} +{ +sub.f16x2 r1222, r1184, r1193; +} +{ +add.f16x2 r1225, r1226, r1227; +} +{ +add.f16x2 r1228, r1229, r1230; +} +{ +sub.f16x2 r1231, r1226, r1227; +} +{ +sub.f16x2 r1234, r1229, r1230; +} +{ +add.f16x2 r1237, r1238, r1239; +} +{ +add.f16x2 r1240, r1241, r1242; +} +{ +sub.f16x2 r1243, r1238, r1239; +} +{ +sub.f16x2 r1246, r1241, r1242; +} +{ +neg.f16x2 r1249, r1246; +} +{ +add.f16x2 r1251, r1225, r1237; +} +{ +add.f16x2 r1254, r1228, r1240; +} +{ +sub.f16x2 r1257, r1225, r1237; +} +{ +sub.f16x2 r1260, r1228, r1240; +} +{ +add.f16x2 r1263, r1231, r1249; +} +{ +add.f16x2 r1266, r1234, r1243; +} +{ +sub.f16x2 r1269, r1231, r1249; +} +{ +sub.f16x2 r1272, r1234, r1243; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1275, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1276, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; 
+cvt.rn.f16.f32 high, f378; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1280, {low, high}; +} +{ +mul.f16x2 r1289, r1263, r1275; +} +{ +mul.f16x2 r1292, r1266, r1276; +} +{ +sub.f16x2 r1295, r1289, r1292; +} +{ +mul.f16x2 r1298, r1263, r1276; +} +{ +fma.rn.f16x2 r1301, r1266, r1275, r1298; +} +{ +neg.f16x2 r1305, r1260; +} +{ +mul.f16x2 r1307, r1269, r1279; +} +{ +mul.f16x2 r1310, r1272, r1280; +} +{ +sub.f16x2 r1313, r1307, r1310; +} +{ +mul.f16x2 r1316, r1269, r1280; +} +{ +fma.rn.f16x2 r1319, r1272, r1279, r1316; +} +{ +add.f16x2 r1323, r1201, r1251; +} +{ +add.f16x2 r1326, r1204, r1254; +} +{ +sub.f16x2 r1329, r1201, r1251; +} +{ +sub.f16x2 r1332, r1204, r1254; +} +{ +add.f16x2 r1335, r1213, r1295; +} +{ +add.f16x2 r1338, r1216, r1301; +} +{ +sub.f16x2 r1341, r1213, r1295; +} +{ +sub.f16x2 r1344, r1216, r1301; +} +{ +add.f16x2 r1347, r1207, r1305; +} +{ +add.f16x2 r1350, r1210, r1257; +} +{ +sub.f16x2 r1353, r1207, r1305; +} +{ +sub.f16x2 r1356, r1210, r1257; +} +{ +add.f16x2 r1359, r1219, r1313; +} +{ +add.f16x2 r1362, r1222, r1319; +} +{ +sub.f16x2 r1365, r1219, r1313; +} +{ +sub.f16x2 r1368, r1222, r1319; +} +{ +add.f16x2 r1371, r1372, r1373; +} +{ +add.f16x2 r1374, r1375, r1376; +} +{ +sub.f16x2 r1377, r1372, r1373; +} +{ +sub.f16x2 r1380, r1375, r1376; +} +{ +add.f16x2 r1383, r1384, r1385; +} +{ +add.f16x2 r1386, r1387, r1388; +} +{ +sub.f16x2 r1389, r1384, r1385; +} +{ +sub.f16x2 r1392, r1387, r1388; +} +{ +neg.f16x2 r1395, r1392; +} +{ +add.f16x2 r1397, r1371, r1383; +} +{ +add.f16x2 r1400, r1374, r1386; +} +{ +sub.f16x2 r1403, r1371, r1383; +} +{ +sub.f16x2 r1406, r1374, r1386; +} +{ +add.f16x2 r1409, r1377, r1395; +} +{ +add.f16x2 r1412, r1380, r1389; +} +{ +sub.f16x2 r1415, r1377, r1395; +} +{ +sub.f16x2 r1418, r1380, r1389; +} +{ +add.f16x2 r1421, r1422, r1423; +} +{ +add.f16x2 r1424, r1425, r1426; +} +{ +sub.f16x2 r1427, r1422, r1423; +} +{ +sub.f16x2 r1430, r1425, r1426; 
+} +{ +add.f16x2 r1433, r1434, r1435; +} +{ +add.f16x2 r1436, r1437, r1438; +} +{ +sub.f16x2 r1439, r1434, r1435; +} +{ +sub.f16x2 r1442, r1437, r1438; +} +{ +neg.f16x2 r1445, r1442; +} +{ +add.f16x2 r1447, r1421, r1433; +} +{ +add.f16x2 r1450, r1424, r1436; +} +{ +sub.f16x2 r1453, r1421, r1433; +} +{ +sub.f16x2 r1456, r1424, r1436; +} +{ +add.f16x2 r1459, r1427, r1445; +} +{ +add.f16x2 r1462, r1430, r1439; +} +{ +sub.f16x2 r1465, r1427, r1445; +} +{ +sub.f16x2 r1468, r1430, r1439; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1471, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1472, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1476, {low, high}; +} +{ +mul.f16x2 r1485, r1459, r1471; +} +{ +mul.f16x2 r1488, r1462, r1472; +} +{ +sub.f16x2 r1491, r1485, r1488; +} +{ +mul.f16x2 r1494, r1459, r1472; +} +{ +fma.rn.f16x2 r1497, r1462, r1471, r1494; +} +{ +neg.f16x2 r1501, r1456; +} +{ +mul.f16x2 r1503, r1465, r1475; +} +{ +mul.f16x2 r1506, r1468, r1476; +} +{ +sub.f16x2 r1509, r1503, r1506; +} +{ +mul.f16x2 r1512, r1465, r1476; +} +{ +fma.rn.f16x2 r1515, r1468, r1475, r1512; +} +{ +add.f16x2 r1519, r1397, r1447; +} +{ +add.f16x2 r1522, r1400, r1450; +} +{ +sub.f16x2 r1525, r1397, r1447; +} +{ +sub.f16x2 r1528, r1400, r1450; +} +{ +add.f16x2 r1531, r1409, r1491; +} +{ +add.f16x2 r1534, r1412, r1497; +} +{ +sub.f16x2 r1537, r1409, r1491; +} +{ +sub.f16x2 r1540, r1412, r1497; +} +{ +add.f16x2 r1543, r1403, r1501; +} +{ +add.f16x2 r1546, r1406, r1453; +} +{ +sub.f16x2 r1549, r1403, r1501; +} +{ +sub.f16x2 r1552, r1406, r1453; +} +{ +add.f16x2 r1555, r1415, r1509; +} +{ +add.f16x2 r1558, r1418, r1515; +} +{ +sub.f16x2 r1561, r1415, r1509; +} +{ +sub.f16x2 r1564, r1418, r1515; +} +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1567, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1568, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1569, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1570, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1572, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r1575, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r1576, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r1577, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r1578, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r1579, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r1580, {low, high}; +} +{ +mul.f16x2 r1597, r1531, r1567; +} +{ +mul.f16x2 r1600, r1534, r1568; +} +{ +sub.f16x2 r1603, r1597, r1600; +} +{ +mul.f16x2 r1606, r1531, r1568; +} +{ +fma.rn.f16x2 r1609, r1534, r1567, r1606; +} +{ +mul.f16x2 r1613, r1543, r1569; +} +{ +mul.f16x2 r1616, r1546, r1570; +} +{ +sub.f16x2 r1619, r1613, r1616; +} +{ +mul.f16x2 r1622, r1543, r1570; +} +{ +fma.rn.f16x2 r1625, r1546, r1569, r1622; +} +{ +mul.f16x2 r1629, r1555, r1571; +} +{ +mul.f16x2 r1632, r1558, r1572; +} +{ +sub.f16x2 r1635, r1629, r1632; +} +{ +mul.f16x2 r1638, r1555, r1572; +} +{ +fma.rn.f16x2 r1641, r1558, r1571, r1638; +} +{ +neg.f16x2 r1645, r1528; +} +{ +mul.f16x2 r1647, r1537, r1575; +} +{ 
+mul.f16x2 r1650, r1540, r1576; +} +{ +sub.f16x2 r1653, r1647, r1650; +} +{ +mul.f16x2 r1656, r1537, r1576; +} +{ +fma.rn.f16x2 r1659, r1540, r1575, r1656; +} +{ +mul.f16x2 r1663, r1549, r1577; +} +{ +mul.f16x2 r1666, r1552, r1578; +} +{ +sub.f16x2 r1669, r1663, r1666; +} +{ +mul.f16x2 r1672, r1549, r1578; +} +{ +fma.rn.f16x2 r1675, r1552, r1577, r1672; +} +{ +mul.f16x2 r1679, r1561, r1579; +} +{ +mul.f16x2 r1682, r1564, r1580; +} +{ +sub.f16x2 r1685, r1679, r1682; +} +{ +mul.f16x2 r1688, r1561, r1580; +} +{ +fma.rn.f16x2 r1691, r1564, r1579, r1688; +} +{ +add.f16x2 r1695, r1323, r1519; +} +{ +add.f16x2 r1698, r1326, r1522; +} +{ +sub.f16x2 r1701, r1323, r1519; +} +{ +sub.f16x2 r1704, r1326, r1522; +} +{ +add.f16x2 r1707, r1335, r1603; +} +{ +add.f16x2 r1710, r1338, r1609; +} +{ +sub.f16x2 r1713, r1335, r1603; +} +{ +sub.f16x2 r1716, r1338, r1609; +} +{ +add.f16x2 r1719, r1347, r1619; +} +{ +add.f16x2 r1722, r1350, r1625; +} +{ +sub.f16x2 r1725, r1347, r1619; +} +{ +sub.f16x2 r1728, r1350, r1625; +} +{ +add.f16x2 r1731, r1359, r1635; +} +{ +add.f16x2 r1734, r1362, r1641; +} +{ +sub.f16x2 r1737, r1359, r1635; +} +{ +sub.f16x2 r1740, r1362, r1641; +} +{ +add.f16x2 r1743, r1329, r1645; +} +{ +add.f16x2 r1746, r1332, r1525; +} +{ +sub.f16x2 r1749, r1329, r1645; +} +{ +sub.f16x2 r1752, r1332, r1525; +} +{ +add.f16x2 r1755, r1341, r1653; +} +{ +add.f16x2 r1758, r1344, r1659; +} +{ +sub.f16x2 r1761, r1341, r1653; +} +{ +sub.f16x2 r1764, r1344, r1659; +} +{ +add.f16x2 r1767, r1353, r1669; +} +{ +add.f16x2 r1770, r1356, r1675; +} +{ +sub.f16x2 r1773, r1353, r1669; +} +{ +sub.f16x2 r1776, r1356, r1675; +} +{ +add.f16x2 r1779, r1365, r1685; +} +{ +add.f16x2 r1782, r1368, r1691; +} +{ +sub.f16x2 r1785, r1365, r1685; +} +{ +sub.f16x2 r1788, r1368, r1691; +} +and.b32 r3631, r3623, 496; +bfe.u32 r3632, r3623, 4, 5; +cvt.rn.f32.u32 f454, r3632; +mul.f32 f455, f454, 0f3C490FDB; +cos.approx.f32 f267, f455; +sin.approx.f32 f456, f455; +neg.f32 f268, f456; +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f267; +cvt.rn.f16.f32 high, f268; +mov.b32 r1791, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1794, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1796, {high, high}; +} +{ +mul.f16x2 r1798, r1710, r1796; +} +{ +fma.rn.f16x2 r1801, r1707, r1794, r1798; +} +{ +mul.f16x2 r1805, r1707, r1796; +} +{ +neg.f16x2 r1808, r1805; +} +{ +fma.rn.f16x2 r1810, r1710, r1794, r1808; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1814, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1816, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1818, {low, high}; +} +{ +mul.f16x2 r1819, r1816, r1818; +} +{ +mul.f16x2 r1822, r1791, r1814; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1825, {high, low}; +} +{ +fma.rn.f16x2 r1827, r1819, r1825, r1822; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1831, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1833, {high, high}; +} +{ +mul.f16x2 r1835, r1722, r1833; +} +{ +fma.rn.f16x2 r1838, r1719, r1831, r1835; +} +{ +mul.f16x2 r1842, r1719, r1833; +} +{ +neg.f16x2 r1845, r1842; +} +{ +fma.rn.f16x2 r1847, r1722, r1831, r1845; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1851, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1853, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1855, {low, high}; +} +{ +mul.f16x2 r1856, r1853, r1855; +} +{ +mul.f16x2 r1859, r1827, r1851; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1827; +mov.b32 r1862, {high, low}; +} +{ +fma.rn.f16x2 r1864, r1856, r1862, r1859; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1868, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1870, {high, high}; +} +{ 
+mul.f16x2 r1872, r1734, r1870; +} +{ +fma.rn.f16x2 r1875, r1731, r1868, r1872; +} +{ +mul.f16x2 r1879, r1731, r1870; +} +{ +neg.f16x2 r1882, r1879; +} +{ +fma.rn.f16x2 r1884, r1734, r1868, r1882; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1888, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1890, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1892, {low, high}; +} +{ +mul.f16x2 r1893, r1890, r1892; +} +{ +mul.f16x2 r1896, r1864, r1888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1864; +mov.b32 r1899, {high, low}; +} +{ +fma.rn.f16x2 r1901, r1893, r1899, r1896; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1905, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1907, {high, high}; +} +{ +mul.f16x2 r1909, r1746, r1907; +} +{ +fma.rn.f16x2 r1912, r1743, r1905, r1909; +} +{ +mul.f16x2 r1916, r1743, r1907; +} +{ +neg.f16x2 r1919, r1916; +} +{ +fma.rn.f16x2 r1921, r1746, r1905, r1919; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1925, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1927, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1929, {low, high}; +} +{ +mul.f16x2 r1930, r1927, r1929; +} +{ +mul.f16x2 r1933, r1901, r1925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1901; +mov.b32 r1936, {high, low}; +} +{ +fma.rn.f16x2 r1938, r1930, r1936, r1933; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1942, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1944, {high, high}; +} +{ +mul.f16x2 r1946, r1758, r1944; +} +{ +fma.rn.f16x2 r1949, r1755, r1942, r1946; +} +{ +mul.f16x2 r1953, r1755, r1944; +} +{ +neg.f16x2 r1956, r1953; +} +{ +fma.rn.f16x2 r1958, r1758, r1942, r1956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; 
+mov.b32 r1962, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1964, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r1966, {low, high}; +} +{ +mul.f16x2 r1967, r1964, r1966; +} +{ +mul.f16x2 r1970, r1938, r1962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1938; +mov.b32 r1973, {high, low}; +} +{ +fma.rn.f16x2 r1975, r1967, r1973, r1970; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1979, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r1981, {high, high}; +} +{ +mul.f16x2 r1983, r1770, r1981; +} +{ +fma.rn.f16x2 r1986, r1767, r1979, r1983; +} +{ +mul.f16x2 r1990, r1767, r1981; +} +{ +neg.f16x2 r1993, r1990; +} +{ +fma.rn.f16x2 r1995, r1770, r1979, r1993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r1999, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2001, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2003, {low, high}; +} +{ +mul.f16x2 r2004, r2001, r2003; +} +{ +mul.f16x2 r2007, r1975, r1999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1975; +mov.b32 r2010, {high, low}; +} +{ +fma.rn.f16x2 r2012, r2004, r2010, r2007; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2016, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2018, {high, high}; +} +{ +mul.f16x2 r2020, r1782, r2018; +} +{ +fma.rn.f16x2 r2023, r1779, r2016, r2020; +} +{ +mul.f16x2 r2027, r1779, r2018; +} +{ +neg.f16x2 r2030, r2027; +} +{ +fma.rn.f16x2 r2032, r1782, r2016, r2030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2036, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2038, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2040, {low, high}; +} +{ +mul.f16x2 r2041, r2038, 
r2040; +} +{ +mul.f16x2 r2044, r2012, r2036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2012; +mov.b32 r2047, {high, low}; +} +{ +fma.rn.f16x2 r2049, r2041, r2047, r2044; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2053, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2055, {high, high}; +} +{ +mul.f16x2 r2057, r1704, r2055; +} +{ +fma.rn.f16x2 r2060, r1701, r2053, r2057; +} +{ +mul.f16x2 r2064, r1701, r2055; +} +{ +neg.f16x2 r2067, r2064; +} +{ +fma.rn.f16x2 r2069, r1704, r2053, r2067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2073, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2075, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2077, {low, high}; +} +{ +mul.f16x2 r2078, r2075, r2077; +} +{ +mul.f16x2 r2081, r2049, r2073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2049; +mov.b32 r2084, {high, low}; +} +{ +fma.rn.f16x2 r2086, r2078, r2084, r2081; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2090, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2092, {high, high}; +} +{ +mul.f16x2 r2094, r1716, r2092; +} +{ +fma.rn.f16x2 r2097, r1713, r2090, r2094; +} +{ +mul.f16x2 r2101, r1713, r2092; +} +{ +neg.f16x2 r2104, r2101; +} +{ +fma.rn.f16x2 r2106, r1716, r2090, r2104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2112, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2114, {low, high}; +} +{ +mul.f16x2 r2115, r2112, r2114; +} +{ +mul.f16x2 r2118, r2086, r2110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2086; +mov.b32 r2121, {high, low}; +} +{ +fma.rn.f16x2 r2123, r2115, r2121, r2118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2127, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2129, {high, high}; +} +{ +mul.f16x2 r2131, r1728, r2129; +} +{ +fma.rn.f16x2 r2134, r1725, r2127, r2131; +} +{ +mul.f16x2 r2138, r1725, r2129; +} +{ +neg.f16x2 r2141, r2138; +} +{ +fma.rn.f16x2 r2143, r1728, r2127, r2141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2147, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2149, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2151, {low, high}; +} +{ +mul.f16x2 r2152, r2149, r2151; +} +{ +mul.f16x2 r2155, r2123, r2147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2123; +mov.b32 r2158, {high, low}; +} +{ +fma.rn.f16x2 r2160, r2152, r2158, r2155; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2164, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2166, {high, high}; +} +{ +mul.f16x2 r2168, r1740, r2166; +} +{ +fma.rn.f16x2 r2171, r1737, r2164, r2168; +} +{ +mul.f16x2 r2175, r1737, r2166; +} +{ +neg.f16x2 r2178, r2175; +} +{ +fma.rn.f16x2 r2180, r1740, r2164, r2178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2184, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2186, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2188, {low, high}; +} +{ +mul.f16x2 r2189, r2186, r2188; +} +{ +mul.f16x2 r2192, r2160, r2184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2160; +mov.b32 r2195, {high, low}; +} +{ +fma.rn.f16x2 r2197, r2189, r2195, r2192; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2201, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2203, {high, high}; +} +{ +mul.f16x2 r2205, r1752, r2203; +} +{ +fma.rn.f16x2 r2208, r1749, r2201, r2205; +} +{ +mul.f16x2 r2212, r1749, r2203; +} +{ +neg.f16x2 r2215, r2212; +} +{ 
+fma.rn.f16x2 r2217, r1752, r2201, r2215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2221, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2223, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2225, {low, high}; +} +{ +mul.f16x2 r2226, r2223, r2225; +} +{ +mul.f16x2 r2229, r2197, r2221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2197; +mov.b32 r2232, {high, low}; +} +{ +fma.rn.f16x2 r2234, r2226, r2232, r2229; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2238, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2240, {high, high}; +} +{ +mul.f16x2 r2242, r1764, r2240; +} +{ +fma.rn.f16x2 r2245, r1761, r2238, r2242; +} +{ +mul.f16x2 r2249, r1761, r2240; +} +{ +neg.f16x2 r2252, r2249; +} +{ +fma.rn.f16x2 r2254, r1764, r2238, r2252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2258, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2260, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2262, {low, high}; +} +{ +mul.f16x2 r2263, r2260, r2262; +} +{ +mul.f16x2 r2266, r2234, r2258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2234; +mov.b32 r2269, {high, low}; +} +{ +fma.rn.f16x2 r2271, r2263, r2269, r2266; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2275, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2277, {high, high}; +} +{ +mul.f16x2 r2279, r1776, r2277; +} +{ +fma.rn.f16x2 r2282, r1773, r2275, r2279; +} +{ +mul.f16x2 r2286, r1773, r2277; +} +{ +neg.f16x2 r2289, r2286; +} +{ +fma.rn.f16x2 r2291, r1776, r2275, r2289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2295, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1791; +mov.b32 r2297, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2299, {low, high}; +} +{ +mul.f16x2 r2300, r2297, r2299; +} +{ +mul.f16x2 r2303, r2271, r2295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2271; +mov.b32 r2306, {high, low}; +} +{ +fma.rn.f16x2 r2308, r2300, r2306, r2303; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2312, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2308; +mov.b32 r2314, {high, high}; +} +{ +mul.f16x2 r2316, r1788, r2314; +} +{ +fma.rn.f16x2 r2319, r1785, r2312, r2316; +} +{ +mul.f16x2 r2323, r1785, r2314; +} +{ +neg.f16x2 r2326, r2323; +} +{ +fma.rn.f16x2 r2328, r1788, r2312, r2326; +} +shl.b32 r3633, r3623, 3; +and.b32 r3634, r3633, 120; +add.s32 r3635, r3627, r3634; +barrier.sync 0; +and.b32 r3636, r3625, 63488; +add.s32 r3637, r3635, r3636; +st.shared.u32 [r3637], r1695; +st.shared.u32 [r3637+4], r1698; +st.shared.u32 [r3637+128], r1801; +st.shared.u32 [r3637+132], r1810; +st.shared.u32 [r3637+256], r1838; +st.shared.u32 [r3637+260], r1847; +st.shared.u32 [r3637+384], r1875; +st.shared.u32 [r3637+388], r1884; +st.shared.u32 [r3637+512], r1912; +st.shared.u32 [r3637+516], r1921; +st.shared.u32 [r3637+640], r1949; +st.shared.u32 [r3637+644], r1958; +st.shared.u32 [r3637+768], r1986; +st.shared.u32 [r3637+772], r1995; +st.shared.u32 [r3637+896], r2023; +st.shared.u32 [r3637+900], r2032; +st.shared.u32 [r3637+1024], r2060; +st.shared.u32 [r3637+1028], r2069; +st.shared.u32 [r3637+1152], r2097; +st.shared.u32 [r3637+1156], r2106; +st.shared.u32 [r3637+1280], r2134; +st.shared.u32 [r3637+1284], r2143; +st.shared.u32 [r3637+1408], r2171; +st.shared.u32 [r3637+1412], r2180; +st.shared.u32 [r3637+1536], r2208; +st.shared.u32 [r3637+1540], r2217; +st.shared.u32 [r3637+1664], r2245; +st.shared.u32 [r3637+1668], r2254; +st.shared.u32 [r3637+1792], r2282; +st.shared.u32 [r3637+1796], r2291; +st.shared.u32 [r3637+1920], r2319; +st.shared.u32 [r3637+1924], r2328; +barrier.sync 0; +mad.lo.s32 r3638, r3631, -120, 
r3637; +ld.shared.u32 r2350, [r3638]; +ld.shared.u32 r2353, [r3638+4]; +ld.shared.u32 r2546, [r3638+4096]; +ld.shared.u32 r2549, [r3638+4100]; +ld.shared.u32 r2400, [r3638+8192]; +ld.shared.u32 r2403, [r3638+8196]; +ld.shared.u32 r2596, [r3638+12288]; +ld.shared.u32 r2599, [r3638+12292]; +ld.shared.u32 r2362, [r3638+16384]; +ld.shared.u32 r2365, [r3638+16388]; +ld.shared.u32 r2558, [r3638+20480]; +ld.shared.u32 r2561, [r3638+20484]; +ld.shared.u32 r2412, [r3638+24576]; +ld.shared.u32 r2415, [r3638+24580]; +ld.shared.u32 r2608, [r3638+28672]; +ld.shared.u32 r2611, [r3638+28676]; +ld.shared.u32 r2351, [r3638+32768]; +ld.shared.u32 r2354, [r3638+32772]; +ld.shared.u32 r2547, [r3638+36864]; +ld.shared.u32 r2550, [r3638+36868]; +ld.shared.u32 r2401, [r3638+40960]; +ld.shared.u32 r2404, [r3638+40964]; +ld.shared.u32 r2597, [r3638+45056]; +ld.shared.u32 r2600, [r3638+45060]; +ld.shared.u32 r2363, [r3638+49152]; +ld.shared.u32 r2366, [r3638+49156]; +ld.shared.u32 r2559, [r3638+53248]; +ld.shared.u32 r2562, [r3638+53252]; +ld.shared.u32 r2413, [r3638+57344]; +ld.shared.u32 r2416, [r3638+57348]; +ld.shared.u32 r2609, [r3638+61440]; +ld.shared.u32 r2612, [r3638+61444]; +{ +add.f16x2 r2349, r2350, r2351; +} +{ +add.f16x2 r2352, r2353, r2354; +} +{ +sub.f16x2 r2355, r2350, r2351; +} +{ +sub.f16x2 r2358, r2353, r2354; +} +{ +add.f16x2 r2361, r2362, r2363; +} +{ +add.f16x2 r2364, r2365, r2366; +} +{ +sub.f16x2 r2367, r2362, r2363; +} +{ +sub.f16x2 r2370, r2365, r2366; +} +{ +neg.f16x2 r2373, r2370; +} +{ +add.f16x2 r2375, r2349, r2361; +} +{ +add.f16x2 r2378, r2352, r2364; +} +{ +sub.f16x2 r2381, r2349, r2361; +} +{ +sub.f16x2 r2384, r2352, r2364; +} +{ +add.f16x2 r2387, r2355, r2373; +} +{ +add.f16x2 r2390, r2358, r2367; +} +{ +sub.f16x2 r2393, r2355, r2373; +} +{ +sub.f16x2 r2396, r2358, r2367; +} +{ +add.f16x2 r2399, r2400, r2401; +} +{ +add.f16x2 r2402, r2403, r2404; +} +{ +sub.f16x2 r2405, r2400, r2401; +} +{ +sub.f16x2 r2408, r2403, r2404; +} +{ +add.f16x2 r2411, r2412, 
r2413; +} +{ +add.f16x2 r2414, r2415, r2416; +} +{ +sub.f16x2 r2417, r2412, r2413; +} +{ +sub.f16x2 r2420, r2415, r2416; +} +{ +neg.f16x2 r2423, r2420; +} +{ +add.f16x2 r2425, r2399, r2411; +} +{ +add.f16x2 r2428, r2402, r2414; +} +{ +sub.f16x2 r2431, r2399, r2411; +} +{ +sub.f16x2 r2434, r2402, r2414; +} +{ +add.f16x2 r2437, r2405, r2423; +} +{ +add.f16x2 r2440, r2408, r2417; +} +{ +sub.f16x2 r2443, r2405, r2423; +} +{ +sub.f16x2 r2446, r2408, r2417; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2449, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2450, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2454, {low, high}; +} +{ +mul.f16x2 r2463, r2437, r2449; +} +{ +mul.f16x2 r2466, r2440, r2450; +} +{ +sub.f16x2 r2469, r2463, r2466; +} +{ +mul.f16x2 r2472, r2437, r2450; +} +{ +fma.rn.f16x2 r2475, r2440, r2449, r2472; +} +{ +neg.f16x2 r2479, r2434; +} +{ +mul.f16x2 r2481, r2443, r2453; +} +{ +mul.f16x2 r2484, r2446, r2454; +} +{ +sub.f16x2 r2487, r2481, r2484; +} +{ +mul.f16x2 r2490, r2443, r2454; +} +{ +fma.rn.f16x2 r2493, r2446, r2453, r2490; +} +{ +add.f16x2 r2497, r2375, r2425; +} +{ +add.f16x2 r2500, r2378, r2428; +} +{ +sub.f16x2 r2503, r2375, r2425; +} +{ +sub.f16x2 r2506, r2378, r2428; +} +{ +add.f16x2 r2509, r2387, r2469; +} +{ +add.f16x2 r2512, r2390, r2475; +} +{ +sub.f16x2 r2515, r2387, r2469; +} +{ +sub.f16x2 r2518, r2390, r2475; +} +{ +add.f16x2 r2521, r2381, r2479; +} +{ +add.f16x2 r2524, r2384, r2431; +} +{ +sub.f16x2 r2527, r2381, r2479; +} +{ +sub.f16x2 r2530, r2384, r2431; +} +{ +add.f16x2 r2533, r2393, r2487; +} +{ +add.f16x2 r2536, r2396, r2493; +} +{ +sub.f16x2 r2539, r2393, r2487; +} +{ +sub.f16x2 r2542, r2396, r2493; +} +{ +add.f16x2 r2545, r2546, r2547; +} +{ +add.f16x2 
r2548, r2549, r2550; +} +{ +sub.f16x2 r2551, r2546, r2547; +} +{ +sub.f16x2 r2554, r2549, r2550; +} +{ +add.f16x2 r2557, r2558, r2559; +} +{ +add.f16x2 r2560, r2561, r2562; +} +{ +sub.f16x2 r2563, r2558, r2559; +} +{ +sub.f16x2 r2566, r2561, r2562; +} +{ +neg.f16x2 r2569, r2566; +} +{ +add.f16x2 r2571, r2545, r2557; +} +{ +add.f16x2 r2574, r2548, r2560; +} +{ +sub.f16x2 r2577, r2545, r2557; +} +{ +sub.f16x2 r2580, r2548, r2560; +} +{ +add.f16x2 r2583, r2551, r2569; +} +{ +add.f16x2 r2586, r2554, r2563; +} +{ +sub.f16x2 r2589, r2551, r2569; +} +{ +sub.f16x2 r2592, r2554, r2563; +} +{ +add.f16x2 r2595, r2596, r2597; +} +{ +add.f16x2 r2598, r2599, r2600; +} +{ +sub.f16x2 r2601, r2596, r2597; +} +{ +sub.f16x2 r2604, r2599, r2600; +} +{ +add.f16x2 r2607, r2608, r2609; +} +{ +add.f16x2 r2610, r2611, r2612; +} +{ +sub.f16x2 r2613, r2608, r2609; +} +{ +sub.f16x2 r2616, r2611, r2612; +} +{ +neg.f16x2 r2619, r2616; +} +{ +add.f16x2 r2621, r2595, r2607; +} +{ +add.f16x2 r2624, r2598, r2610; +} +{ +sub.f16x2 r2627, r2595, r2607; +} +{ +sub.f16x2 r2630, r2598, r2610; +} +{ +add.f16x2 r2633, r2601, r2619; +} +{ +add.f16x2 r2636, r2604, r2613; +} +{ +sub.f16x2 r2639, r2601, r2619; +} +{ +sub.f16x2 r2642, r2604, r2613; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2645, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2646, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2650, {low, high}; +} +{ +mul.f16x2 r2659, r2633, r2645; +} +{ +mul.f16x2 r2662, r2636, r2646; +} +{ +sub.f16x2 r2665, r2659, r2662; +} +{ +mul.f16x2 r2668, r2633, r2646; +} +{ +fma.rn.f16x2 r2671, r2636, r2645, r2668; +} +{ +neg.f16x2 r2675, r2630; +} +{ +mul.f16x2 r2677, r2639, r2649; +} +{ +mul.f16x2 r2680, r2642, r2650; +} +{ +sub.f16x2 
r2683, r2677, r2680; +} +{ +mul.f16x2 r2686, r2639, r2650; +} +{ +fma.rn.f16x2 r2689, r2642, r2649, r2686; +} +{ +add.f16x2 r2693, r2571, r2621; +} +{ +add.f16x2 r2696, r2574, r2624; +} +{ +sub.f16x2 r2699, r2571, r2621; +} +{ +sub.f16x2 r2702, r2574, r2624; +} +{ +add.f16x2 r2705, r2583, r2665; +} +{ +add.f16x2 r2708, r2586, r2671; +} +{ +sub.f16x2 r2711, r2583, r2665; +} +{ +sub.f16x2 r2714, r2586, r2671; +} +{ +add.f16x2 r2717, r2577, r2675; +} +{ +add.f16x2 r2720, r2580, r2627; +} +{ +sub.f16x2 r2723, r2577, r2675; +} +{ +sub.f16x2 r2726, r2580, r2627; +} +{ +add.f16x2 r2729, r2589, r2683; +} +{ +add.f16x2 r2732, r2592, r2689; +} +{ +sub.f16x2 r2735, r2589, r2683; +} +{ +sub.f16x2 r2738, r2592, r2689; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2741, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2742, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2743, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2744, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2745, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2746, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f374; +cvt.rn.f16.f32 high, f374; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f376; +cvt.rn.f16.f32 high, f376; +mov.b32 r2750, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f378; +cvt.rn.f16.f32 high, f378; +mov.b32 r2751, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f380; +cvt.rn.f16.f32 high, f380; +mov.b32 r2752, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f382; +cvt.rn.f16.f32 high, f382; +mov.b32 r2753, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f384; +cvt.rn.f16.f32 high, f384; +mov.b32 r2754, {low, high}; +} +{ +mul.f16x2 r2771, r2705, r2741; +} +{ +mul.f16x2 r2774, r2708, r2742; +} +{ +sub.f16x2 r2777, r2771, r2774; +} +{ +mul.f16x2 r2780, r2705, r2742; +} +{ +fma.rn.f16x2 r2783, r2708, r2741, r2780; +} +{ +mul.f16x2 r2787, r2717, r2743; +} +{ +mul.f16x2 r2790, r2720, r2744; +} +{ +sub.f16x2 r2793, r2787, r2790; +} +{ +mul.f16x2 r2796, r2717, r2744; +} +{ +fma.rn.f16x2 r2799, r2720, r2743, r2796; +} +{ +mul.f16x2 r2803, r2729, r2745; +} +{ +mul.f16x2 r2806, r2732, r2746; +} +{ +sub.f16x2 r2809, r2803, r2806; +} +{ +mul.f16x2 r2812, r2729, r2746; +} +{ +fma.rn.f16x2 r2815, r2732, r2745, r2812; +} +{ +neg.f16x2 r2819, r2702; +} +{ +mul.f16x2 r2821, r2711, r2749; +} +{ +mul.f16x2 r2824, r2714, r2750; +} +{ +sub.f16x2 r2827, r2821, r2824; +} +{ +mul.f16x2 r2830, r2711, r2750; +} +{ +fma.rn.f16x2 r2833, r2714, r2749, r2830; +} +{ +mul.f16x2 r2837, r2723, r2751; +} +{ +mul.f16x2 r2840, r2726, r2752; +} +{ +sub.f16x2 r2843, r2837, r2840; +} +{ +mul.f16x2 r2846, r2723, r2752; +} +{ +fma.rn.f16x2 r2849, r2726, r2751, r2846; +} +{ +mul.f16x2 r2853, r2735, r2753; +} +{ +mul.f16x2 r2856, r2738, r2754; +} +{ +sub.f16x2 r2859, r2853, r2856; +} +{ +mul.f16x2 r2862, r2735, r2754; +} +{ +fma.rn.f16x2 r2865, r2738, r2753, r2862; +} +{ +add.f16x2 r2869, r2497, r2693; +} +{ +add.f16x2 r2872, r2500, r2696; +} +{ +sub.f16x2 r2875, r2497, r2693; +} +{ +sub.f16x2 r2878, r2500, r2696; +} +{ +add.f16x2 r2881, r2509, r2777; +} +{ +add.f16x2 r2884, r2512, r2783; +} +{ +sub.f16x2 r2887, r2509, r2777; +} +{ +sub.f16x2 r2890, r2512, r2783; +} +{ +add.f16x2 r2893, r2521, r2793; +} +{ +add.f16x2 r2896, r2524, r2799; +} +{ +sub.f16x2 r2899, r2521, r2793; +} +{ +sub.f16x2 r2902, r2524, r2799; +} +{ +add.f16x2 r2905, r2533, r2809; +} +{ +add.f16x2 r2908, r2536, r2815; +} +{ +sub.f16x2 r2911, r2533, r2809; +} +{ +sub.f16x2 r2914, r2536, r2815; +} +{ +add.f16x2 r2917, r2503, r2819; +} +{ +add.f16x2 r2920, r2506, r2699; +} +{ +sub.f16x2 
r2923, r2503, r2819; +} +{ +sub.f16x2 r2926, r2506, r2699; +} +{ +add.f16x2 r2929, r2515, r2827; +} +{ +add.f16x2 r2932, r2518, r2833; +} +{ +sub.f16x2 r2935, r2515, r2827; +} +{ +sub.f16x2 r2938, r2518, r2833; +} +{ +add.f16x2 r2941, r2527, r2843; +} +{ +add.f16x2 r2944, r2530, r2849; +} +{ +sub.f16x2 r2947, r2527, r2843; +} +{ +sub.f16x2 r2950, r2530, r2849; +} +{ +add.f16x2 r2953, r2539, r2859; +} +{ +add.f16x2 r2956, r2542, r2865; +} +{ +sub.f16x2 r2959, r2539, r2859; +} +{ +sub.f16x2 r2962, r2542, r2865; +} +and.b32 r3639, r3623, 256; +bfe.u32 r3640, r3623, 8, 1; +cvt.rn.f32.u32 f457, r3640; +mul.f32 f458, f457, 0f3E490FDB; +cos.approx.f32 f417, f458; +sin.approx.f32 f459, f458; +neg.f32 f418, f459; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f417; +cvt.rn.f16.f32 high, f418; +mov.b32 r2965, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2970, {high, high}; +} +{ +mul.f16x2 r2972, r2884, r2970; +} +{ +fma.rn.f16x2 r2975, r2881, r2968, r2972; +} +{ +mul.f16x2 r2979, r2881, r2970; +} +{ +neg.f16x2 r2982, r2979; +} +{ +fma.rn.f16x2 r2984, r2884, r2968, r2982; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2988, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2990, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r2992, {low, high}; +} +{ +mul.f16x2 r2993, r2990, r2992; +} +{ +mul.f16x2 r2996, r2965, r2988; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r2999, {high, low}; +} +{ +fma.rn.f16x2 r3001, r2993, r2999, r2996; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3007, {high, high}; +} +{ +mul.f16x2 r3009, r2896, r3007; +} +{ +fma.rn.f16x2 r3012, r2893, r3005, r3009; +} +{ +mul.f16x2 r3016, r2893, r3007; +} +{ 
+neg.f16x2 r3019, r3016; +} +{ +fma.rn.f16x2 r3021, r2896, r3005, r3019; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3025, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3027, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3029, {low, high}; +} +{ +mul.f16x2 r3030, r3027, r3029; +} +{ +mul.f16x2 r3033, r3001, r3025; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3001; +mov.b32 r3036, {high, low}; +} +{ +fma.rn.f16x2 r3038, r3030, r3036, r3033; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3044, {high, high}; +} +{ +mul.f16x2 r3046, r2908, r3044; +} +{ +fma.rn.f16x2 r3049, r2905, r3042, r3046; +} +{ +mul.f16x2 r3053, r2905, r3044; +} +{ +neg.f16x2 r3056, r3053; +} +{ +fma.rn.f16x2 r3058, r2908, r3042, r3056; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3062, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3064, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3066, {low, high}; +} +{ +mul.f16x2 r3067, r3064, r3066; +} +{ +mul.f16x2 r3070, r3038, r3062; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3038; +mov.b32 r3073, {high, low}; +} +{ +fma.rn.f16x2 r3075, r3067, r3073, r3070; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3081, {high, high}; +} +{ +mul.f16x2 r3083, r2920, r3081; +} +{ +fma.rn.f16x2 r3086, r2917, r3079, r3083; +} +{ +mul.f16x2 r3090, r2917, r3081; +} +{ +neg.f16x2 r3093, r3090; +} +{ +fma.rn.f16x2 r3095, r2920, r3079, r3093; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3099, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3101, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3103, {low, high}; +} +{ +mul.f16x2 r3104, r3101, r3103; +} +{ +mul.f16x2 r3107, r3075, r3099; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3075; +mov.b32 r3110, {high, low}; +} +{ +fma.rn.f16x2 r3112, r3104, r3110, r3107; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3118, {high, high}; +} +{ +mul.f16x2 r3120, r2932, r3118; +} +{ +fma.rn.f16x2 r3123, r2929, r3116, r3120; +} +{ +mul.f16x2 r3127, r2929, r3118; +} +{ +neg.f16x2 r3130, r3127; +} +{ +fma.rn.f16x2 r3132, r2932, r3116, r3130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3140, {low, high}; +} +{ +mul.f16x2 r3141, r3138, r3140; +} +{ +mul.f16x2 r3144, r3112, r3136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3112; +mov.b32 r3147, {high, low}; +} +{ +fma.rn.f16x2 r3149, r3141, r3147, r3144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3155, {high, high}; +} +{ +mul.f16x2 r3157, r2944, r3155; +} +{ +fma.rn.f16x2 r3160, r2941, r3153, r3157; +} +{ +mul.f16x2 r3164, r2941, r3155; +} +{ +neg.f16x2 r3167, r3164; +} +{ +fma.rn.f16x2 r3169, r2944, r3153, r3167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3177, {low, high}; +} +{ +mul.f16x2 r3178, r3175, r3177; +} +{ +mul.f16x2 r3181, r3149, r3173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3149; +mov.b32 r3184, {high, low}; +} 
+{ +fma.rn.f16x2 r3186, r3178, r3184, r3181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3192, {high, high}; +} +{ +mul.f16x2 r3194, r2956, r3192; +} +{ +fma.rn.f16x2 r3197, r2953, r3190, r3194; +} +{ +mul.f16x2 r3201, r2953, r3192; +} +{ +neg.f16x2 r3204, r3201; +} +{ +fma.rn.f16x2 r3206, r2956, r3190, r3204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3214, {low, high}; +} +{ +mul.f16x2 r3215, r3212, r3214; +} +{ +mul.f16x2 r3218, r3186, r3210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3186; +mov.b32 r3221, {high, low}; +} +{ +fma.rn.f16x2 r3223, r3215, r3221, r3218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3229, {high, high}; +} +{ +mul.f16x2 r3231, r2878, r3229; +} +{ +fma.rn.f16x2 r3234, r2875, r3227, r3231; +} +{ +mul.f16x2 r3238, r2875, r3229; +} +{ +neg.f16x2 r3241, r3238; +} +{ +fma.rn.f16x2 r3243, r2878, r3227, r3241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3251, {low, high}; +} +{ +mul.f16x2 r3252, r3249, r3251; +} +{ +mul.f16x2 r3255, r3223, r3247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3223; +mov.b32 r3258, {high, low}; +} +{ +fma.rn.f16x2 r3260, r3252, r3258, r3255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3266, {high, high}; +} +{ +mul.f16x2 r3268, r2890, r3266; +} +{ 
+fma.rn.f16x2 r3271, r2887, r3264, r3268; +} +{ +mul.f16x2 r3275, r2887, r3266; +} +{ +neg.f16x2 r3278, r3275; +} +{ +fma.rn.f16x2 r3280, r2890, r3264, r3278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3288, {low, high}; +} +{ +mul.f16x2 r3289, r3286, r3288; +} +{ +mul.f16x2 r3292, r3260, r3284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3260; +mov.b32 r3295, {high, low}; +} +{ +fma.rn.f16x2 r3297, r3289, r3295, r3292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3303, {high, high}; +} +{ +mul.f16x2 r3305, r2902, r3303; +} +{ +fma.rn.f16x2 r3308, r2899, r3301, r3305; +} +{ +mul.f16x2 r3312, r2899, r3303; +} +{ +neg.f16x2 r3315, r3312; +} +{ +fma.rn.f16x2 r3317, r2902, r3301, r3315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3325, {low, high}; +} +{ +mul.f16x2 r3326, r3323, r3325; +} +{ +mul.f16x2 r3329, r3297, r3321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3297; +mov.b32 r3332, {high, low}; +} +{ +fma.rn.f16x2 r3334, r3326, r3332, r3329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3340, {high, high}; +} +{ +mul.f16x2 r3342, r2914, r3340; +} +{ +fma.rn.f16x2 r3345, r2911, r3338, r3342; +} +{ +mul.f16x2 r3349, r2911, r3340; +} +{ +neg.f16x2 r3352, r3349; +} +{ +fma.rn.f16x2 r3354, r2914, r3338, r3352; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3358, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3360, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3362, {low, high}; +} +{ +mul.f16x2 r3363, r3360, r3362; +} +{ +mul.f16x2 r3366, r3334, r3358; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3334; +mov.b32 r3369, {high, low}; +} +{ +fma.rn.f16x2 r3371, r3363, r3369, r3366; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3377, {high, high}; +} +{ +mul.f16x2 r3379, r2926, r3377; +} +{ +fma.rn.f16x2 r3382, r2923, r3375, r3379; +} +{ +mul.f16x2 r3386, r2923, r3377; +} +{ +neg.f16x2 r3389, r3386; +} +{ +fma.rn.f16x2 r3391, r2926, r3375, r3389; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3395, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3397, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3399, {low, high}; +} +{ +mul.f16x2 r3400, r3397, r3399; +} +{ +mul.f16x2 r3403, r3371, r3395; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3371; +mov.b32 r3406, {high, low}; +} +{ +fma.rn.f16x2 r3408, r3400, r3406, r3403; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3414, {high, high}; +} +{ +mul.f16x2 r3416, r2938, r3414; +} +{ +fma.rn.f16x2 r3419, r2935, r3412, r3416; +} +{ +mul.f16x2 r3423, r2935, r3414; +} +{ +neg.f16x2 r3426, r3423; +} +{ +fma.rn.f16x2 r3428, r2938, r3412, r3426; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3432, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3434, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3436, {low, high}; +} +{ +mul.f16x2 r3437, r3434, r3436; +} +{ +mul.f16x2 r3440, r3408, r3432; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3408; +mov.b32 r3443, {high, low}; +} +{ +fma.rn.f16x2 r3445, r3437, r3443, r3440; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3451, {high, high}; +} +{ +mul.f16x2 r3453, r2950, r3451; +} +{ +fma.rn.f16x2 r3456, r2947, r3449, r3453; +} +{ +mul.f16x2 r3460, r2947, r3451; +} +{ +neg.f16x2 r3463, r3460; +} +{ +fma.rn.f16x2 r3465, r2950, r3449, r3463; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3469, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2965; +mov.b32 r3471, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f447; +cvt.rn.f16.f32 high, f448; +mov.b32 r3473, {low, high}; +} +{ +mul.f16x2 r3474, r3471, r3473; +} +{ +mul.f16x2 r3477, r3445, r3469; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3445; +mov.b32 r3480, {high, low}; +} +{ +fma.rn.f16x2 r3482, r3474, r3480, r3477; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3482; +mov.b32 r3488, {high, high}; +} +{ +mul.f16x2 r3490, r2962, r3488; +} +{ +fma.rn.f16x2 r3493, r2959, r3486, r3490; +} +{ +mul.f16x2 r3497, r2959, r3488; +} +{ +neg.f16x2 r3500, r3497; +} +{ +fma.rn.f16x2 r3502, r2962, r3486, r3500; +} +and.b32 r3641, r3633, 2040; +add.s32 r3642, r3627, r3641; +barrier.sync 0; +and.b32 r3643, r3625, 32768; +add.s32 r3644, r3642, r3643; +st.shared.u32 [r3644], r2869; +st.shared.u32 [r3644+4], r2872; +st.shared.u32 [r3644+2048], r2975; +st.shared.u32 [r3644+2052], r2984; +st.shared.u32 [r3644+4096], r3012; +st.shared.u32 [r3644+4100], r3021; +st.shared.u32 [r3644+6144], r3049; +st.shared.u32 [r3644+6148], r3058; +st.shared.u32 [r3644+8192], r3086; +st.shared.u32 [r3644+8196], r3095; +st.shared.u32 [r3644+10240], r3123; +st.shared.u32 [r3644+10244], r3132; +st.shared.u32 [r3644+12288], r3160; +st.shared.u32 
[r3644+12292], r3169; +st.shared.u32 [r3644+14336], r3197; +st.shared.u32 [r3644+14340], r3206; +st.shared.u32 [r3644+16384], r3234; +st.shared.u32 [r3644+16388], r3243; +st.shared.u32 [r3644+18432], r3271; +st.shared.u32 [r3644+18436], r3280; +st.shared.u32 [r3644+20480], r3308; +st.shared.u32 [r3644+20484], r3317; +st.shared.u32 [r3644+22528], r3345; +st.shared.u32 [r3644+22532], r3354; +st.shared.u32 [r3644+24576], r3382; +st.shared.u32 [r3644+24580], r3391; +st.shared.u32 [r3644+26624], r3419; +st.shared.u32 [r3644+26628], r3428; +st.shared.u32 [r3644+28672], r3456; +st.shared.u32 [r3644+28676], r3465; +st.shared.u32 [r3644+30720], r3493; +st.shared.u32 [r3644+30724], r3502; +barrier.sync 0; +mad.lo.s32 r3645, r3639, -120, r3644; +ld.shared.u32 r3524, [r3645]; +ld.shared.u32 r3527, [r3645+4]; +ld.shared.u32 r3536, [r3645+4096]; +ld.shared.u32 r3539, [r3645+4100]; +ld.shared.u32 r3548, [r3645+8192]; +ld.shared.u32 r3551, [r3645+8196]; +ld.shared.u32 r3560, [r3645+12288]; +ld.shared.u32 r3563, [r3645+12292]; +ld.shared.u32 r3572, [r3645+16384]; +ld.shared.u32 r3575, [r3645+16388]; +ld.shared.u32 r3584, [r3645+20480]; +ld.shared.u32 r3587, [r3645+20484]; +ld.shared.u32 r3596, [r3645+24576]; +ld.shared.u32 r3599, [r3645+24580]; +ld.shared.u32 r3608, [r3645+28672]; +ld.shared.u32 r3611, [r3645+28676]; +ld.shared.u32 r3525, [r3645+32768]; +ld.shared.u32 r3528, [r3645+32772]; +ld.shared.u32 r3537, [r3645+36864]; +ld.shared.u32 r3540, [r3645+36868]; +ld.shared.u32 r3549, [r3645+40960]; +ld.shared.u32 r3552, [r3645+40964]; +ld.shared.u32 r3561, [r3645+45056]; +ld.shared.u32 r3564, [r3645+45060]; +ld.shared.u32 r3573, [r3645+49152]; +ld.shared.u32 r3576, [r3645+49156]; +ld.shared.u32 r3585, [r3645+53248]; +ld.shared.u32 r3588, [r3645+53252]; +ld.shared.u32 r3597, [r3645+57344]; +ld.shared.u32 r3600, [r3645+57348]; +ld.shared.u32 r3609, [r3645+61440]; +ld.shared.u32 r3612, [r3645+61444]; +{ +add.f16x2 %0, r3524, r3525; +} +{ +add.f16x2 %1, r3527, r3528; +} +{ +sub.f16x2 
%16, r3524, r3525; +} +{ +sub.f16x2 %17, r3527, r3528; +} +{ +add.f16x2 %2, r3536, r3537; +} +{ +add.f16x2 %3, r3539, r3540; +} +{ +sub.f16x2 %18, r3536, r3537; +} +{ +sub.f16x2 %19, r3539, r3540; +} +{ +add.f16x2 %4, r3548, r3549; +} +{ +add.f16x2 %5, r3551, r3552; +} +{ +sub.f16x2 %20, r3548, r3549; +} +{ +sub.f16x2 %21, r3551, r3552; +} +{ +add.f16x2 %6, r3560, r3561; +} +{ +add.f16x2 %7, r3563, r3564; +} +{ +sub.f16x2 %22, r3560, r3561; +} +{ +sub.f16x2 %23, r3563, r3564; +} +{ +add.f16x2 %8, r3572, r3573; +} +{ +add.f16x2 %9, r3575, r3576; +} +{ +sub.f16x2 %24, r3572, r3573; +} +{ +sub.f16x2 %25, r3575, r3576; +} +{ +add.f16x2 %10, r3584, r3585; +} +{ +add.f16x2 %11, r3587, r3588; +} +{ +sub.f16x2 %26, r3584, r3585; +} +{ +sub.f16x2 %27, r3587, r3588; +} +{ +add.f16x2 %12, r3596, r3597; +} +{ +add.f16x2 %13, r3599, r3600; +} +{ +sub.f16x2 %28, r3596, r3597; +} +{ +sub.f16x2 %29, r3599, r3600; +} +{ +add.f16x2 %14, r3608, r3609; +} +{ +add.f16x2 %15, r3611, r3612; +} +{ +sub.f16x2 %30, r3608, r3609; +} +{ +sub.f16x2 %31, r3611, r3612; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), 
"=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[15].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1062, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<197>; +.reg .b32 r<1915>; +.reg .b64 rd<2>; +mov.u32 r1881, %tid.y; +shl.b32 r1882, r1881, 15; +mov.u32 r1883, %16; +add.s32 r1884, r1883, r1882; +mov.u32 r1885, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, 
r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f150, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f182, 0f3F800000; +mov.f32 f148, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f181, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ +mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ 
+sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1886, r1885, 1023; +shl.b32 r1887, r1885, 5; +and.b32 r1888, r1887, -32768; +add.s32 r1889, r1884, r1888; +cvt.rn.f32.u32 f185, r1886; +mul.f32 f186, f185, 0f3A490FDB; +cos.approx.f32 f29, f186; +sin.approx.f32 f187, f186; +neg.f32 f30, f187; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ +fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, 
r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1890, r1887, 32736; +add.s32 r1891, r1889, r1890; +st.shared.v4.f32 [r1891], {r149, r207, r244, r281}; +st.shared.v4.f32 [r1891+16], {r318, r355, r392, r429}; +barrier.sync 0; +mad.lo.s32 
r1892, r1886, -28, r1891; +ld.shared.u32 r460, [r1892]; +ld.shared.u32 r510, [r1892+4096]; +ld.shared.u32 r472, [r1892+8192]; +ld.shared.u32 r522, [r1892+12288]; +ld.shared.u32 r461, [r1892+16384]; +ld.shared.u32 r511, [r1892+20480]; +ld.shared.u32 r473, [r1892+24576]; +ld.shared.u32 r523, [r1892+28672]; +barrier.sync 0; +st.shared.v4.f32 [r1891], {r152, r216, r253, r290}; +st.shared.v4.f32 [r1891+16], {r327, r364, r401, r438}; +barrier.sync 0; +ld.shared.u32 r463, [r1892]; +ld.shared.u32 r513, [r1892+4096]; +ld.shared.u32 r475, [r1892+8192]; +ld.shared.u32 r525, [r1892+12288]; +ld.shared.u32 r464, [r1892+16384]; +ld.shared.u32 r514, [r1892+20480]; +ld.shared.u32 r476, [r1892+24576]; +ld.shared.u32 r526, [r1892+28672]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 
r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1893, r1885, 1016; +bfe.u32 r1894, r1885, 3, 7; +shl.b32 r1895, r1885, 2; +and.b32 r1896, r1895, 28; +add.s32 r1897, r1889, r1896; +cvt.rn.f32.u32 f188, r1894; +mul.f32 f189, f188, 0f3BC90FDB; +cos.approx.f32 f75, f189; +sin.approx.f32 f190, f189; +neg.f32 f76, f190; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 
r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r756, {low, high}; 
+} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ +fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +barrier.sync 0; +and.b32 r1898, r1887, 32512; +add.s32 r1899, r1897, r1898; +st.shared.u32 [r1899], r607; +st.shared.u32 [r1899+32], r665; +st.shared.u32 [r1899+64], r702; +st.shared.u32 [r1899+96], r739; +st.shared.u32 [r1899+128], r776; +st.shared.u32 [r1899+160], r813; +st.shared.u32 [r1899+192], r850; +st.shared.u32 [r1899+224], r887; +barrier.sync 0; +mad.lo.s32 r1900, r1893, -28, r1899; +ld.shared.u32 r918, [r1900]; +ld.shared.u32 r968, [r1900+4096]; +ld.shared.u32 r930, [r1900+8192]; +ld.shared.u32 r980, [r1900+12288]; +ld.shared.u32 r919, [r1900+16384]; +ld.shared.u32 r969, [r1900+20480]; +ld.shared.u32 r931, [r1900+24576]; +ld.shared.u32 r981, [r1900+28672]; +barrier.sync 0; +st.shared.u32 [r1899], r610; +st.shared.u32 [r1899+32], r674; +st.shared.u32 [r1899+64], r711; +st.shared.u32 [r1899+96], r748; +st.shared.u32 [r1899+128], r785; +st.shared.u32 [r1899+160], r822; +st.shared.u32 [r1899+192], r859; +st.shared.u32 
[r1899+224], r896; +barrier.sync 0; +ld.shared.u32 r921, [r1900]; +ld.shared.u32 r971, [r1900+4096]; +ld.shared.u32 r933, [r1900+8192]; +ld.shared.u32 r983, [r1900+12288]; +ld.shared.u32 r922, [r1900+16384]; +ld.shared.u32 r972, [r1900+20480]; +ld.shared.u32 r934, [r1900+24576]; +ld.shared.u32 r984, [r1900+28672]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ +sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 
r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1901, r1885, 960; +bfe.u32 r1902, r1885, 6, 4; +and.b32 r1903, r1895, 252; +add.s32 r1904, r1889, r1903; +cvt.rn.f32.u32 f191, r1902; +mul.f32 f192, f191, 0f3D490FDB; +cos.approx.f32 f121, f192; +sin.approx.f32 f193, f192; +neg.f32 f122, f193; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1140, {low, high}; +} +{ 
+mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; 
+mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 
r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +barrier.sync 0; +and.b32 r1905, r1887, 30720; +add.s32 r1906, r1904, r1905; +st.shared.u32 [r1906], r1065; +st.shared.u32 [r1906+256], r1123; +st.shared.u32 [r1906+512], r1160; +st.shared.u32 [r1906+768], r1197; +st.shared.u32 [r1906+1024], r1234; +st.shared.u32 [r1906+1280], r1271; +st.shared.u32 [r1906+1536], r1308; +st.shared.u32 [r1906+1792], r1345; +barrier.sync 0; +mad.lo.s32 r1907, r1901, -28, r1906; +ld.shared.u32 r1376, [r1907]; +ld.shared.u32 r1426, [r1907+4096]; +ld.shared.u32 r1388, [r1907+8192]; +ld.shared.u32 r1438, [r1907+12288]; +ld.shared.u32 r1377, [r1907+16384]; +ld.shared.u32 r1427, [r1907+20480]; +ld.shared.u32 r1389, [r1907+24576]; +ld.shared.u32 r1439, [r1907+28672]; +barrier.sync 0; +st.shared.u32 [r1906], r1068; +st.shared.u32 [r1906+256], r1132; +st.shared.u32 [r1906+512], r1169; +st.shared.u32 [r1906+768], r1206; +st.shared.u32 [r1906+1024], r1243; +st.shared.u32 [r1906+1280], r1280; +st.shared.u32 [r1906+1536], r1317; +st.shared.u32 [r1906+1792], r1354; +barrier.sync 0; +ld.shared.u32 r1379, [r1907]; +ld.shared.u32 
r1429, [r1907+4096]; +ld.shared.u32 r1391, [r1907+8192]; +ld.shared.u32 r1441, [r1907+12288]; +ld.shared.u32 r1380, [r1907+16384]; +ld.shared.u32 r1430, [r1907+20480]; +ld.shared.u32 r1392, [r1907+24576]; +ld.shared.u32 r1442, [r1907+28672]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1399; +} +{ +add.f16x2 r1416, r1384, r1393; +} +{ +sub.f16x2 r1419, r1381, r1399; +} +{ +sub.f16x2 r1422, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1449; +} +{ +add.f16x2 r1466, r1434, r1443; +} +{ +sub.f16x2 r1469, r1431, r1449; +} +{ +sub.f16x2 r1472, r1434, r1443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, 
r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1460; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 r1523, r1401, r1451; +} +{ +add.f16x2 r1526, r1404, r1454; +} +{ +sub.f16x2 r1529, r1401, r1451; +} +{ +sub.f16x2 r1532, r1404, r1454; +} +{ +add.f16x2 r1535, r1413, r1495; +} +{ +add.f16x2 r1538, r1416, r1501; +} +{ +sub.f16x2 r1541, r1413, r1495; +} +{ +sub.f16x2 r1544, r1416, r1501; +} +{ +add.f16x2 r1547, r1407, r1505; +} +{ +add.f16x2 r1550, r1410, r1457; +} +{ +sub.f16x2 r1553, r1407, r1505; +} +{ +sub.f16x2 r1556, r1410, r1457; +} +{ +add.f16x2 r1559, r1419, r1513; +} +{ +add.f16x2 r1562, r1422, r1519; +} +{ +sub.f16x2 r1565, r1419, r1513; +} +{ +sub.f16x2 r1568, r1422, r1519; +} +and.b32 r1908, r1885, 512; +bfe.u32 r1909, r1885, 9, 1; +and.b32 r1910, r1895, 2044; +add.s32 r1911, r1889, r1910; +cvt.rn.f32.u32 f194, r1909; +mul.f32 f195, f194, 0f3EC90FDB; +cos.approx.f32 f167, f195; +sin.approx.f32 f196, f195; +neg.f32 f168, f196; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f167; +cvt.rn.f16.f32 high, f168; +mov.b32 r1571, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1576, {high, high}; +} +{ +mul.f16x2 r1578, r1538, r1576; +} +{ +fma.rn.f16x2 r1581, r1535, r1574, r1578; +} +{ +mul.f16x2 r1585, r1535, r1576; +} +{ +neg.f16x2 r1588, r1585; +} +{ +fma.rn.f16x2 r1590, r1538, r1574, r1588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, 
f182; +mov.b32 r1598, {low, high}; +} +{ +mul.f16x2 r1599, r1596, r1598; +} +{ +mul.f16x2 r1602, r1571, r1594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1605, {high, low}; +} +{ +fma.rn.f16x2 r1607, r1599, r1605, r1602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1613, {high, high}; +} +{ +mul.f16x2 r1615, r1550, r1613; +} +{ +fma.rn.f16x2 r1618, r1547, r1611, r1615; +} +{ +mul.f16x2 r1622, r1547, r1613; +} +{ +neg.f16x2 r1625, r1622; +} +{ +fma.rn.f16x2 r1627, r1550, r1611, r1625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1635, {low, high}; +} +{ +mul.f16x2 r1636, r1633, r1635; +} +{ +mul.f16x2 r1639, r1607, r1631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1642, {high, low}; +} +{ +fma.rn.f16x2 r1644, r1636, r1642, r1639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1650, {high, high}; +} +{ +mul.f16x2 r1652, r1562, r1650; +} +{ +fma.rn.f16x2 r1655, r1559, r1648, r1652; +} +{ +mul.f16x2 r1659, r1559, r1650; +} +{ +neg.f16x2 r1662, r1659; +} +{ +fma.rn.f16x2 r1664, r1562, r1648, r1662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1672, {low, high}; +} +{ +mul.f16x2 r1673, r1670, r1672; +} +{ +mul.f16x2 r1676, r1644, r1668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1679, {high, low}; +} +{ +fma.rn.f16x2 r1681, r1673, r1679, r1676; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1687, {high, high}; +} +{ +mul.f16x2 r1689, r1532, r1687; +} +{ +fma.rn.f16x2 r1692, r1529, r1685, r1689; +} +{ +mul.f16x2 r1696, r1529, r1687; +} +{ +neg.f16x2 r1699, r1696; +} +{ +fma.rn.f16x2 r1701, r1532, r1685, r1699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1707, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1709, {low, high}; +} +{ +mul.f16x2 r1710, r1707, r1709; +} +{ +mul.f16x2 r1713, r1681, r1705; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1716, {high, low}; +} +{ +fma.rn.f16x2 r1718, r1710, r1716, r1713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1724, {high, high}; +} +{ +mul.f16x2 r1726, r1544, r1724; +} +{ +fma.rn.f16x2 r1729, r1541, r1722, r1726; +} +{ +mul.f16x2 r1733, r1541, r1724; +} +{ +neg.f16x2 r1736, r1733; +} +{ +fma.rn.f16x2 r1738, r1544, r1722, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1744, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1744, r1746; +} +{ +mul.f16x2 r1750, r1718, r1742; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1753, {high, low}; +} +{ +fma.rn.f16x2 r1755, r1747, r1753, r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1761, {high, high}; +} +{ +mul.f16x2 r1763, r1556, r1761; +} +{ +fma.rn.f16x2 r1766, r1553, r1759, r1763; +} +{ +mul.f16x2 
r1770, r1553, r1761; +} +{ +neg.f16x2 r1773, r1770; +} +{ +fma.rn.f16x2 r1775, r1556, r1759, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1781, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1781, r1783; +} +{ +mul.f16x2 r1787, r1755, r1779; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1790, {high, low}; +} +{ +fma.rn.f16x2 r1792, r1784, r1790, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1798, {high, high}; +} +{ +mul.f16x2 r1800, r1568, r1798; +} +{ +fma.rn.f16x2 r1803, r1565, r1796, r1800; +} +{ +mul.f16x2 r1807, r1565, r1798; +} +{ +neg.f16x2 r1810, r1807; +} +{ +fma.rn.f16x2 r1812, r1568, r1796, r1810; +} +barrier.sync 0; +and.b32 r1912, r1887, 16384; +add.s32 r1913, r1911, r1912; +st.shared.u32 [r1913], r1523; +st.shared.u32 [r1913+2048], r1581; +st.shared.u32 [r1913+4096], r1618; +st.shared.u32 [r1913+6144], r1655; +st.shared.u32 [r1913+8192], r1692; +st.shared.u32 [r1913+10240], r1729; +st.shared.u32 [r1913+12288], r1766; +st.shared.u32 [r1913+14336], r1803; +barrier.sync 0; +mad.lo.s32 r1914, r1908, -28, r1913; +ld.shared.u32 r1834, [r1914]; +ld.shared.u32 r1846, [r1914+4096]; +ld.shared.u32 r1858, [r1914+8192]; +ld.shared.u32 r1870, [r1914+12288]; +ld.shared.u32 r1835, [r1914+16384]; +ld.shared.u32 r1847, [r1914+20480]; +ld.shared.u32 r1859, [r1914+24576]; +ld.shared.u32 r1871, [r1914+28672]; +barrier.sync 0; +st.shared.u32 [r1913], r1526; +st.shared.u32 [r1913+2048], r1590; +st.shared.u32 [r1913+4096], r1627; +st.shared.u32 [r1913+6144], r1664; +st.shared.u32 [r1913+8192], r1701; +st.shared.u32 [r1913+10240], r1738; +st.shared.u32 [r1913+12288], r1775; +st.shared.u32 [r1913+14336], r1812; +barrier.sync 0; 
+ld.shared.u32 r1837, [r1914]; +ld.shared.u32 r1849, [r1914+4096]; +ld.shared.u32 r1861, [r1914+8192]; +ld.shared.u32 r1873, [r1914+12288]; +ld.shared.u32 r1838, [r1914+16384]; +ld.shared.u32 r1850, [r1914+20480]; +ld.shared.u32 r1862, [r1914+24576]; +ld.shared.u32 r1874, [r1914+28672]; +{ +add.f16x2 %0, r1834, r1835; +} +{ +add.f16x2 %1, r1837, r1838; +} +{ +sub.f16x2 %8, r1834, r1835; +} +{ +sub.f16x2 %9, r1837, r1838; +} +{ +add.f16x2 %2, r1846, r1847; +} +{ +add.f16x2 %3, r1849, r1850; +} +{ +sub.f16x2 %10, r1846, r1847; +} +{ +sub.f16x2 %11, r1849, r1850; +} +{ +add.f16x2 %4, r1858, r1859; +} +{ +add.f16x2 %5, r1861, r1862; +} +{ +sub.f16x2 %12, r1858, r1859; +} +{ +sub.f16x2 %13, r1861, r1862; +} +{ +add.f16x2 %6, r1870, r1871; +} +{ +add.f16x2 %7, r1873, r1874; +} +{ +sub.f16x2 %14, r1870, r1871; +} +{ +sub.f16x2 %15, r1873, r1874; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1060, __half2, 
1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<197>; +.reg .b32 r<1915>; +.reg .b64 rd<2>; +mov.u32 r1881, %tid.y; +shl.b32 r1882, r1881, 16; +mov.u32 r1883, %16; +add.s32 r1884, r1883, r1882; +mov.u32 r1885, %tid.x; +{ +add.f16x2 r1, %17, %25; +} +{ +add.f16x2 r4, %18, %26; +} +{ +sub.f16x2 r7, %17, %25; +} +{ +sub.f16x2 r10, %18, %26; +} +{ +add.f16x2 r13, %21, %29; +} +{ +add.f16x2 r16, %22, %30; +} +{ +sub.f16x2 r19, %21, %29; +} +{ +sub.f16x2 r22, %22, %30; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %19, %27; +} +{ +add.f16x2 r54, %20, %28; +} +{ +sub.f16x2 r57, %19, %27; +} +{ +sub.f16x2 r60, %20, %28; +} +{ +add.f16x2 r63, %23, %31; +} +{ +add.f16x2 r66, %24, %32; +} +{ +sub.f16x2 r69, %23, %31; +} +{ +sub.f16x2 r72, %24, %32; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f32 f150, 0f3F3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r101, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r102, {low, high}; +} +mov.f32 f182, 0f3F800000; +mov.f32 f148, 0fBF3504F3; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r105, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r106, {low, high}; +} +mov.f32 f181, 0fBF800000; +{ +mul.f16x2 r115, r89, r101; +} +{ +mul.f16x2 r118, r92, r102; +} +{ +sub.f16x2 r121, r115, r118; +} +{ 
+mul.f16x2 r124, r89, r102; +} +{ +fma.rn.f16x2 r127, r92, r101, r124; +} +{ +neg.f16x2 r131, r86; +} +{ +mul.f16x2 r133, r95, r105; +} +{ +mul.f16x2 r136, r98, r106; +} +{ +sub.f16x2 r139, r133, r136; +} +{ +mul.f16x2 r142, r95, r106; +} +{ +fma.rn.f16x2 r145, r98, r105, r142; +} +{ +add.f16x2 r149, r27, r77; +} +{ +add.f16x2 r152, r30, r80; +} +{ +sub.f16x2 r155, r27, r77; +} +{ +sub.f16x2 r158, r30, r80; +} +{ +add.f16x2 r161, r39, r121; +} +{ +add.f16x2 r164, r42, r127; +} +{ +sub.f16x2 r167, r39, r121; +} +{ +sub.f16x2 r170, r42, r127; +} +{ +add.f16x2 r173, r33, r131; +} +{ +add.f16x2 r176, r36, r83; +} +{ +sub.f16x2 r179, r33, r131; +} +{ +sub.f16x2 r182, r36, r83; +} +{ +add.f16x2 r185, r45, r139; +} +{ +add.f16x2 r188, r48, r145; +} +{ +sub.f16x2 r191, r45, r139; +} +{ +sub.f16x2 r194, r48, r145; +} +and.b32 r1886, r1885, 1023; +shl.b32 r1887, r1885, 6; +and.b32 r1888, r1887, -65536; +add.s32 r1889, r1884, r1888; +cvt.rn.f32.u32 f185, r1886; +mul.f32 f186, f185, 0f3A490FDB; +cos.approx.f32 f29, f186; +sin.approx.f32 f187, f186; +neg.f32 f30, f187; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r197, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r200, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r202, {high, high}; +} +{ +mul.f16x2 r204, r164, r202; +} +{ +fma.rn.f16x2 r207, r161, r200, r204; +} +{ +mul.f16x2 r211, r161, r202; +} +{ +neg.f16x2 r214, r211; +} +{ +fma.rn.f16x2 r216, r164, r200, r214; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r220, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r222, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r224, {low, high}; +} +{ +mul.f16x2 r225, r222, r224; +} +{ +mul.f16x2 r228, r197, r220; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r231, {high, low}; +} +{ +fma.rn.f16x2 r233, r225, 
r231, r228; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r237, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r239, {high, high}; +} +{ +mul.f16x2 r241, r176, r239; +} +{ +fma.rn.f16x2 r244, r173, r237, r241; +} +{ +mul.f16x2 r248, r173, r239; +} +{ +neg.f16x2 r251, r248; +} +{ +fma.rn.f16x2 r253, r176, r237, r251; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r259, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r261, {low, high}; +} +{ +mul.f16x2 r262, r259, r261; +} +{ +mul.f16x2 r265, r233, r257; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r233; +mov.b32 r268, {high, low}; +} +{ +fma.rn.f16x2 r270, r262, r268, r265; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r274, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r276, {high, high}; +} +{ +mul.f16x2 r278, r188, r276; +} +{ +fma.rn.f16x2 r281, r185, r274, r278; +} +{ +mul.f16x2 r285, r185, r276; +} +{ +neg.f16x2 r288, r285; +} +{ +fma.rn.f16x2 r290, r188, r274, r288; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r296, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r298, {low, high}; +} +{ +mul.f16x2 r299, r296, r298; +} +{ +mul.f16x2 r302, r270, r294; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r270; +mov.b32 r305, {high, low}; +} +{ +fma.rn.f16x2 r307, r299, r305, r302; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r311, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r313, {high, high}; +} +{ +mul.f16x2 r315, r158, r313; +} +{ +fma.rn.f16x2 r318, r155, r311, r315; +} +{ +mul.f16x2 r322, r155, r313; +} +{ +neg.f16x2 r325, r322; +} +{ 
+fma.rn.f16x2 r327, r158, r311, r325; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r331, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r333, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r335, {low, high}; +} +{ +mul.f16x2 r336, r333, r335; +} +{ +mul.f16x2 r339, r307, r331; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r307; +mov.b32 r342, {high, low}; +} +{ +fma.rn.f16x2 r344, r336, r342, r339; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r348, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r350, {high, high}; +} +{ +mul.f16x2 r352, r170, r350; +} +{ +fma.rn.f16x2 r355, r167, r348, r352; +} +{ +mul.f16x2 r359, r167, r350; +} +{ +neg.f16x2 r362, r359; +} +{ +fma.rn.f16x2 r364, r170, r348, r362; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r368, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r370, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r372, {low, high}; +} +{ +mul.f16x2 r373, r370, r372; +} +{ +mul.f16x2 r376, r344, r368; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r344; +mov.b32 r379, {high, low}; +} +{ +fma.rn.f16x2 r381, r373, r379, r376; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r385, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r387, {high, high}; +} +{ +mul.f16x2 r389, r182, r387; +} +{ +fma.rn.f16x2 r392, r179, r385, r389; +} +{ +mul.f16x2 r396, r179, r387; +} +{ +neg.f16x2 r399, r396; +} +{ +fma.rn.f16x2 r401, r182, r385, r399; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r405, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r197; +mov.b32 r407, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r409, {low, high}; +} +{ +mul.f16x2 
r410, r407, r409; +} +{ +mul.f16x2 r413, r381, r405; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r381; +mov.b32 r416, {high, low}; +} +{ +fma.rn.f16x2 r418, r410, r416, r413; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r418; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r194, r424; +} +{ +fma.rn.f16x2 r429, r191, r422, r426; +} +{ +mul.f16x2 r433, r191, r424; +} +{ +neg.f16x2 r436, r433; +} +{ +fma.rn.f16x2 r438, r194, r422, r436; +} +barrier.sync 0; +and.b32 r1890, r1887, 65472; +add.s32 r1891, r1889, r1890; +st.shared.v4.f32 [r1891], {r149, r152, r207, r216}; +st.shared.v4.f32 [r1891+16], {r244, r253, r281, r290}; +st.shared.v4.f32 [r1891+32], {r318, r327, r355, r364}; +st.shared.v4.f32 [r1891+48], {r392, r401, r429, r438}; +barrier.sync 0; +mad.lo.s32 r1892, r1886, -56, r1891; +ld.shared.u32 r460, [r1892]; +ld.shared.u32 r463, [r1892+4]; +ld.shared.u32 r510, [r1892+8192]; +ld.shared.u32 r513, [r1892+8196]; +ld.shared.u32 r472, [r1892+16384]; +ld.shared.u32 r475, [r1892+16388]; +ld.shared.u32 r522, [r1892+24576]; +ld.shared.u32 r525, [r1892+24580]; +ld.shared.u32 r461, [r1892+32768]; +ld.shared.u32 r464, [r1892+32772]; +ld.shared.u32 r511, [r1892+40960]; +ld.shared.u32 r514, [r1892+40964]; +ld.shared.u32 r473, [r1892+49152]; +ld.shared.u32 r476, [r1892+49156]; +ld.shared.u32 r523, [r1892+57344]; +ld.shared.u32 r526, [r1892+57348]; +{ +add.f16x2 r459, r460, r461; +} +{ +add.f16x2 r462, r463, r464; +} +{ +sub.f16x2 r465, r460, r461; +} +{ +sub.f16x2 r468, r463, r464; +} +{ +add.f16x2 r471, r472, r473; +} +{ +add.f16x2 r474, r475, r476; +} +{ +sub.f16x2 r477, r472, r473; +} +{ +sub.f16x2 r480, r475, r476; +} +{ +neg.f16x2 r483, r480; +} +{ +add.f16x2 r485, r459, r471; +} +{ +add.f16x2 r488, r462, r474; +} +{ +sub.f16x2 r491, r459, r471; +} +{ +sub.f16x2 r494, r462, r474; +} +{ +add.f16x2 r497, r465, r483; +} +{ +add.f16x2 r500, r468, r477; +} +{ +sub.f16x2 r503, 
r465, r483; +} +{ +sub.f16x2 r506, r468, r477; +} +{ +add.f16x2 r509, r510, r511; +} +{ +add.f16x2 r512, r513, r514; +} +{ +sub.f16x2 r515, r510, r511; +} +{ +sub.f16x2 r518, r513, r514; +} +{ +add.f16x2 r521, r522, r523; +} +{ +add.f16x2 r524, r525, r526; +} +{ +sub.f16x2 r527, r522, r523; +} +{ +sub.f16x2 r530, r525, r526; +} +{ +neg.f16x2 r533, r530; +} +{ +add.f16x2 r535, r509, r521; +} +{ +add.f16x2 r538, r512, r524; +} +{ +sub.f16x2 r541, r509, r521; +} +{ +sub.f16x2 r544, r512, r524; +} +{ +add.f16x2 r547, r515, r533; +} +{ +add.f16x2 r550, r518, r527; +} +{ +sub.f16x2 r553, r515, r533; +} +{ +sub.f16x2 r556, r518, r527; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r559, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r560, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r563, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r564, {low, high}; +} +{ +mul.f16x2 r573, r547, r559; +} +{ +mul.f16x2 r576, r550, r560; +} +{ +sub.f16x2 r579, r573, r576; +} +{ +mul.f16x2 r582, r547, r560; +} +{ +fma.rn.f16x2 r585, r550, r559, r582; +} +{ +neg.f16x2 r589, r544; +} +{ +mul.f16x2 r591, r553, r563; +} +{ +mul.f16x2 r594, r556, r564; +} +{ +sub.f16x2 r597, r591, r594; +} +{ +mul.f16x2 r600, r553, r564; +} +{ +fma.rn.f16x2 r603, r556, r563, r600; +} +{ +add.f16x2 r607, r485, r535; +} +{ +add.f16x2 r610, r488, r538; +} +{ +sub.f16x2 r613, r485, r535; +} +{ +sub.f16x2 r616, r488, r538; +} +{ +add.f16x2 r619, r497, r579; +} +{ +add.f16x2 r622, r500, r585; +} +{ +sub.f16x2 r625, r497, r579; +} +{ +sub.f16x2 r628, r500, r585; +} +{ +add.f16x2 r631, r491, r589; +} +{ +add.f16x2 r634, r494, r541; +} +{ +sub.f16x2 r637, r491, r589; +} +{ +sub.f16x2 r640, r494, r541; +} +{ +add.f16x2 r643, r503, r597; +} +{ +add.f16x2 r646, r506, r603; +} +{ +sub.f16x2 r649, r503, 
r597; +} +{ +sub.f16x2 r652, r506, r603; +} +and.b32 r1893, r1885, 1016; +bfe.u32 r1894, r1885, 3, 7; +cvt.rn.f32.u32 f188, r1894; +mul.f32 f189, f188, 0f3BC90FDB; +cos.approx.f32 f75, f189; +sin.approx.f32 f190, f189; +neg.f32 f76, f190; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f75; +cvt.rn.f16.f32 high, f76; +mov.b32 r655, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r658, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r660, {high, high}; +} +{ +mul.f16x2 r662, r622, r660; +} +{ +fma.rn.f16x2 r665, r619, r658, r662; +} +{ +mul.f16x2 r669, r619, r660; +} +{ +neg.f16x2 r672, r669; +} +{ +fma.rn.f16x2 r674, r622, r658, r672; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r678, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r680, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r682, {low, high}; +} +{ +mul.f16x2 r683, r680, r682; +} +{ +mul.f16x2 r686, r655, r678; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r689, {high, low}; +} +{ +fma.rn.f16x2 r691, r683, r689, r686; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r695, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r697, {high, high}; +} +{ +mul.f16x2 r699, r634, r697; +} +{ +fma.rn.f16x2 r702, r631, r695, r699; +} +{ +mul.f16x2 r706, r631, r697; +} +{ +neg.f16x2 r709, r706; +} +{ +fma.rn.f16x2 r711, r634, r695, r709; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r715, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r717, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r719, {low, high}; +} +{ +mul.f16x2 r720, r717, r719; +} +{ +mul.f16x2 r723, r691, r715; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r691; +mov.b32 r726, {high, low}; +} +{ +fma.rn.f16x2 r728, r720, 
r726, r723; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r732, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r734, {high, high}; +} +{ +mul.f16x2 r736, r646, r734; +} +{ +fma.rn.f16x2 r739, r643, r732, r736; +} +{ +mul.f16x2 r743, r643, r734; +} +{ +neg.f16x2 r746, r743; +} +{ +fma.rn.f16x2 r748, r646, r732, r746; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r752, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r754, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r756, {low, high}; +} +{ +mul.f16x2 r757, r754, r756; +} +{ +mul.f16x2 r760, r728, r752; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r728; +mov.b32 r763, {high, low}; +} +{ +fma.rn.f16x2 r765, r757, r763, r760; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r769, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r771, {high, high}; +} +{ +mul.f16x2 r773, r616, r771; +} +{ +fma.rn.f16x2 r776, r613, r769, r773; +} +{ +mul.f16x2 r780, r613, r771; +} +{ +neg.f16x2 r783, r780; +} +{ +fma.rn.f16x2 r785, r616, r769, r783; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r789, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r791, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r793, {low, high}; +} +{ +mul.f16x2 r794, r791, r793; +} +{ +mul.f16x2 r797, r765, r789; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r765; +mov.b32 r800, {high, low}; +} +{ +fma.rn.f16x2 r802, r794, r800, r797; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r806, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r808, {high, high}; +} +{ +mul.f16x2 r810, r628, r808; +} +{ +fma.rn.f16x2 r813, r625, r806, r810; +} +{ +mul.f16x2 r817, r625, r808; +} +{ +neg.f16x2 r820, r817; +} +{ 
+fma.rn.f16x2 r822, r628, r806, r820; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r826, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r828, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r830, {low, high}; +} +{ +mul.f16x2 r831, r828, r830; +} +{ +mul.f16x2 r834, r802, r826; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r802; +mov.b32 r837, {high, low}; +} +{ +fma.rn.f16x2 r839, r831, r837, r834; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r843, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r845, {high, high}; +} +{ +mul.f16x2 r847, r640, r845; +} +{ +fma.rn.f16x2 r850, r637, r843, r847; +} +{ +mul.f16x2 r854, r637, r845; +} +{ +neg.f16x2 r857, r854; +} +{ +fma.rn.f16x2 r859, r640, r843, r857; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r863, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r655; +mov.b32 r865, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r867, {low, high}; +} +{ +mul.f16x2 r868, r865, r867; +} +{ +mul.f16x2 r871, r839, r863; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r839; +mov.b32 r874, {high, low}; +} +{ +fma.rn.f16x2 r876, r868, r874, r871; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r880, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r876; +mov.b32 r882, {high, high}; +} +{ +mul.f16x2 r884, r652, r882; +} +{ +fma.rn.f16x2 r887, r649, r880, r884; +} +{ +mul.f16x2 r891, r649, r882; +} +{ +neg.f16x2 r894, r891; +} +{ +fma.rn.f16x2 r896, r652, r880, r894; +} +shl.b32 r1895, r1885, 3; +and.b32 r1896, r1895, 56; +add.s32 r1897, r1889, r1896; +barrier.sync 0; +and.b32 r1898, r1887, 65024; +add.s32 r1899, r1897, r1898; +st.shared.u32 [r1899], r607; +st.shared.u32 [r1899+4], r610; +st.shared.u32 [r1899+64], r665; +st.shared.u32 [r1899+68], r674; 
+st.shared.u32 [r1899+128], r702; +st.shared.u32 [r1899+132], r711; +st.shared.u32 [r1899+192], r739; +st.shared.u32 [r1899+196], r748; +st.shared.u32 [r1899+256], r776; +st.shared.u32 [r1899+260], r785; +st.shared.u32 [r1899+320], r813; +st.shared.u32 [r1899+324], r822; +st.shared.u32 [r1899+384], r850; +st.shared.u32 [r1899+388], r859; +st.shared.u32 [r1899+448], r887; +st.shared.u32 [r1899+452], r896; +barrier.sync 0; +mad.lo.s32 r1900, r1893, -56, r1899; +ld.shared.u32 r918, [r1900]; +ld.shared.u32 r921, [r1900+4]; +ld.shared.u32 r968, [r1900+8192]; +ld.shared.u32 r971, [r1900+8196]; +ld.shared.u32 r930, [r1900+16384]; +ld.shared.u32 r933, [r1900+16388]; +ld.shared.u32 r980, [r1900+24576]; +ld.shared.u32 r983, [r1900+24580]; +ld.shared.u32 r919, [r1900+32768]; +ld.shared.u32 r922, [r1900+32772]; +ld.shared.u32 r969, [r1900+40960]; +ld.shared.u32 r972, [r1900+40964]; +ld.shared.u32 r931, [r1900+49152]; +ld.shared.u32 r934, [r1900+49156]; +ld.shared.u32 r981, [r1900+57344]; +ld.shared.u32 r984, [r1900+57348]; +{ +add.f16x2 r917, r918, r919; +} +{ +add.f16x2 r920, r921, r922; +} +{ +sub.f16x2 r923, r918, r919; +} +{ +sub.f16x2 r926, r921, r922; +} +{ +add.f16x2 r929, r930, r931; +} +{ +add.f16x2 r932, r933, r934; +} +{ +sub.f16x2 r935, r930, r931; +} +{ +sub.f16x2 r938, r933, r934; +} +{ +neg.f16x2 r941, r938; +} +{ +add.f16x2 r943, r917, r929; +} +{ +add.f16x2 r946, r920, r932; +} +{ +sub.f16x2 r949, r917, r929; +} +{ +sub.f16x2 r952, r920, r932; +} +{ +add.f16x2 r955, r923, r941; +} +{ +add.f16x2 r958, r926, r935; +} +{ +sub.f16x2 r961, r923, r941; +} +{ +sub.f16x2 r964, r926, r935; +} +{ +add.f16x2 r967, r968, r969; +} +{ +add.f16x2 r970, r971, r972; +} +{ +sub.f16x2 r973, r968, r969; +} +{ +sub.f16x2 r976, r971, r972; +} +{ +add.f16x2 r979, r980, r981; +} +{ +add.f16x2 r982, r983, r984; +} +{ +sub.f16x2 r985, r980, r981; +} +{ +sub.f16x2 r988, r983, r984; +} +{ +neg.f16x2 r991, r988; +} +{ +add.f16x2 r993, r967, r979; +} +{ +add.f16x2 r996, r970, r982; +} +{ 
+sub.f16x2 r999, r967, r979; +} +{ +sub.f16x2 r1002, r970, r982; +} +{ +add.f16x2 r1005, r973, r991; +} +{ +add.f16x2 r1008, r976, r985; +} +{ +sub.f16x2 r1011, r973, r991; +} +{ +sub.f16x2 r1014, r976, r985; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1017, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1018, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1022, {low, high}; +} +{ +mul.f16x2 r1031, r1005, r1017; +} +{ +mul.f16x2 r1034, r1008, r1018; +} +{ +sub.f16x2 r1037, r1031, r1034; +} +{ +mul.f16x2 r1040, r1005, r1018; +} +{ +fma.rn.f16x2 r1043, r1008, r1017, r1040; +} +{ +neg.f16x2 r1047, r1002; +} +{ +mul.f16x2 r1049, r1011, r1021; +} +{ +mul.f16x2 r1052, r1014, r1022; +} +{ +sub.f16x2 r1055, r1049, r1052; +} +{ +mul.f16x2 r1058, r1011, r1022; +} +{ +fma.rn.f16x2 r1061, r1014, r1021, r1058; +} +{ +add.f16x2 r1065, r943, r993; +} +{ +add.f16x2 r1068, r946, r996; +} +{ +sub.f16x2 r1071, r943, r993; +} +{ +sub.f16x2 r1074, r946, r996; +} +{ +add.f16x2 r1077, r955, r1037; +} +{ +add.f16x2 r1080, r958, r1043; +} +{ +sub.f16x2 r1083, r955, r1037; +} +{ +sub.f16x2 r1086, r958, r1043; +} +{ +add.f16x2 r1089, r949, r1047; +} +{ +add.f16x2 r1092, r952, r999; +} +{ +sub.f16x2 r1095, r949, r1047; +} +{ +sub.f16x2 r1098, r952, r999; +} +{ +add.f16x2 r1101, r961, r1055; +} +{ +add.f16x2 r1104, r964, r1061; +} +{ +sub.f16x2 r1107, r961, r1055; +} +{ +sub.f16x2 r1110, r964, r1061; +} +and.b32 r1901, r1885, 960; +bfe.u32 r1902, r1885, 6, 4; +cvt.rn.f32.u32 f191, r1902; +mul.f32 f192, f191, 0f3D490FDB; +cos.approx.f32 f121, f192; +sin.approx.f32 f193, f192; +neg.f32 f122, f193; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f121; +cvt.rn.f16.f32 high, f122; +mov.b32 r1113, {low, high}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1118, {high, high}; +} +{ +mul.f16x2 r1120, r1080, r1118; +} +{ +fma.rn.f16x2 r1123, r1077, r1116, r1120; +} +{ +mul.f16x2 r1127, r1077, r1118; +} +{ +neg.f16x2 r1130, r1127; +} +{ +fma.rn.f16x2 r1132, r1080, r1116, r1130; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1136, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1138, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1140, {low, high}; +} +{ +mul.f16x2 r1141, r1138, r1140; +} +{ +mul.f16x2 r1144, r1113, r1136; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1147, {high, low}; +} +{ +fma.rn.f16x2 r1149, r1141, r1147, r1144; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1155, {high, high}; +} +{ +mul.f16x2 r1157, r1092, r1155; +} +{ +fma.rn.f16x2 r1160, r1089, r1153, r1157; +} +{ +mul.f16x2 r1164, r1089, r1155; +} +{ +neg.f16x2 r1167, r1164; +} +{ +fma.rn.f16x2 r1169, r1092, r1153, r1167; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1173, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1175, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1177, {low, high}; +} +{ +mul.f16x2 r1178, r1175, r1177; +} +{ +mul.f16x2 r1181, r1149, r1173; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1149; +mov.b32 r1184, {high, low}; +} +{ +fma.rn.f16x2 r1186, r1178, r1184, r1181; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1192, {high, high}; +} +{ +mul.f16x2 r1194, r1104, r1192; +} +{ +fma.rn.f16x2 r1197, r1101, r1190, r1194; +} +{ +mul.f16x2 
r1201, r1101, r1192; +} +{ +neg.f16x2 r1204, r1201; +} +{ +fma.rn.f16x2 r1206, r1104, r1190, r1204; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1210, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1212, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1214, {low, high}; +} +{ +mul.f16x2 r1215, r1212, r1214; +} +{ +mul.f16x2 r1218, r1186, r1210; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1186; +mov.b32 r1221, {high, low}; +} +{ +fma.rn.f16x2 r1223, r1215, r1221, r1218; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1229, {high, high}; +} +{ +mul.f16x2 r1231, r1074, r1229; +} +{ +fma.rn.f16x2 r1234, r1071, r1227, r1231; +} +{ +mul.f16x2 r1238, r1071, r1229; +} +{ +neg.f16x2 r1241, r1238; +} +{ +fma.rn.f16x2 r1243, r1074, r1227, r1241; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1247, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1249, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1251, {low, high}; +} +{ +mul.f16x2 r1252, r1249, r1251; +} +{ +mul.f16x2 r1255, r1223, r1247; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1223; +mov.b32 r1258, {high, low}; +} +{ +fma.rn.f16x2 r1260, r1252, r1258, r1255; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1266, {high, high}; +} +{ +mul.f16x2 r1268, r1086, r1266; +} +{ +fma.rn.f16x2 r1271, r1083, r1264, r1268; +} +{ +mul.f16x2 r1275, r1083, r1266; +} +{ +neg.f16x2 r1278, r1275; +} +{ +fma.rn.f16x2 r1280, r1086, r1264, r1278; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1284, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1286, 
{high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1288, {low, high}; +} +{ +mul.f16x2 r1289, r1286, r1288; +} +{ +mul.f16x2 r1292, r1260, r1284; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1260; +mov.b32 r1295, {high, low}; +} +{ +fma.rn.f16x2 r1297, r1289, r1295, r1292; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1303, {high, high}; +} +{ +mul.f16x2 r1305, r1098, r1303; +} +{ +fma.rn.f16x2 r1308, r1095, r1301, r1305; +} +{ +mul.f16x2 r1312, r1095, r1303; +} +{ +neg.f16x2 r1315, r1312; +} +{ +fma.rn.f16x2 r1317, r1098, r1301, r1315; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1321, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1113; +mov.b32 r1323, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1325, {low, high}; +} +{ +mul.f16x2 r1326, r1323, r1325; +} +{ +mul.f16x2 r1329, r1297, r1321; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1297; +mov.b32 r1332, {high, low}; +} +{ +fma.rn.f16x2 r1334, r1326, r1332, r1329; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1334; +mov.b32 r1340, {high, high}; +} +{ +mul.f16x2 r1342, r1110, r1340; +} +{ +fma.rn.f16x2 r1345, r1107, r1338, r1342; +} +{ +mul.f16x2 r1349, r1107, r1340; +} +{ +neg.f16x2 r1352, r1349; +} +{ +fma.rn.f16x2 r1354, r1110, r1338, r1352; +} +and.b32 r1903, r1895, 504; +add.s32 r1904, r1889, r1903; +barrier.sync 0; +and.b32 r1905, r1887, 61440; +add.s32 r1906, r1904, r1905; +st.shared.u32 [r1906], r1065; +st.shared.u32 [r1906+4], r1068; +st.shared.u32 [r1906+512], r1123; +st.shared.u32 [r1906+516], r1132; +st.shared.u32 [r1906+1024], r1160; +st.shared.u32 [r1906+1028], r1169; +st.shared.u32 [r1906+1536], r1197; +st.shared.u32 [r1906+1540], r1206; 
+st.shared.u32 [r1906+2048], r1234; +st.shared.u32 [r1906+2052], r1243; +st.shared.u32 [r1906+2560], r1271; +st.shared.u32 [r1906+2564], r1280; +st.shared.u32 [r1906+3072], r1308; +st.shared.u32 [r1906+3076], r1317; +st.shared.u32 [r1906+3584], r1345; +st.shared.u32 [r1906+3588], r1354; +barrier.sync 0; +mad.lo.s32 r1907, r1901, -56, r1906; +ld.shared.u32 r1376, [r1907]; +ld.shared.u32 r1379, [r1907+4]; +ld.shared.u32 r1426, [r1907+8192]; +ld.shared.u32 r1429, [r1907+8196]; +ld.shared.u32 r1388, [r1907+16384]; +ld.shared.u32 r1391, [r1907+16388]; +ld.shared.u32 r1438, [r1907+24576]; +ld.shared.u32 r1441, [r1907+24580]; +ld.shared.u32 r1377, [r1907+32768]; +ld.shared.u32 r1380, [r1907+32772]; +ld.shared.u32 r1427, [r1907+40960]; +ld.shared.u32 r1430, [r1907+40964]; +ld.shared.u32 r1389, [r1907+49152]; +ld.shared.u32 r1392, [r1907+49156]; +ld.shared.u32 r1439, [r1907+57344]; +ld.shared.u32 r1442, [r1907+57348]; +{ +add.f16x2 r1375, r1376, r1377; +} +{ +add.f16x2 r1378, r1379, r1380; +} +{ +sub.f16x2 r1381, r1376, r1377; +} +{ +sub.f16x2 r1384, r1379, r1380; +} +{ +add.f16x2 r1387, r1388, r1389; +} +{ +add.f16x2 r1390, r1391, r1392; +} +{ +sub.f16x2 r1393, r1388, r1389; +} +{ +sub.f16x2 r1396, r1391, r1392; +} +{ +neg.f16x2 r1399, r1396; +} +{ +add.f16x2 r1401, r1375, r1387; +} +{ +add.f16x2 r1404, r1378, r1390; +} +{ +sub.f16x2 r1407, r1375, r1387; +} +{ +sub.f16x2 r1410, r1378, r1390; +} +{ +add.f16x2 r1413, r1381, r1399; +} +{ +add.f16x2 r1416, r1384, r1393; +} +{ +sub.f16x2 r1419, r1381, r1399; +} +{ +sub.f16x2 r1422, r1384, r1393; +} +{ +add.f16x2 r1425, r1426, r1427; +} +{ +add.f16x2 r1428, r1429, r1430; +} +{ +sub.f16x2 r1431, r1426, r1427; +} +{ +sub.f16x2 r1434, r1429, r1430; +} +{ +add.f16x2 r1437, r1438, r1439; +} +{ +add.f16x2 r1440, r1441, r1442; +} +{ +sub.f16x2 r1443, r1438, r1439; +} +{ +sub.f16x2 r1446, r1441, r1442; +} +{ +neg.f16x2 r1449, r1446; +} +{ +add.f16x2 r1451, r1425, r1437; +} +{ +add.f16x2 r1454, r1428, r1440; +} +{ +sub.f16x2 r1457, 
r1425, r1437; +} +{ +sub.f16x2 r1460, r1428, r1440; +} +{ +add.f16x2 r1463, r1431, r1449; +} +{ +add.f16x2 r1466, r1434, r1443; +} +{ +sub.f16x2 r1469, r1431, r1449; +} +{ +sub.f16x2 r1472, r1434, r1443; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1475, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1476, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f148; +cvt.rn.f16.f32 high, f148; +mov.b32 r1479, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f150; +cvt.rn.f16.f32 high, f150; +mov.b32 r1480, {low, high}; +} +{ +mul.f16x2 r1489, r1463, r1475; +} +{ +mul.f16x2 r1492, r1466, r1476; +} +{ +sub.f16x2 r1495, r1489, r1492; +} +{ +mul.f16x2 r1498, r1463, r1476; +} +{ +fma.rn.f16x2 r1501, r1466, r1475, r1498; +} +{ +neg.f16x2 r1505, r1460; +} +{ +mul.f16x2 r1507, r1469, r1479; +} +{ +mul.f16x2 r1510, r1472, r1480; +} +{ +sub.f16x2 r1513, r1507, r1510; +} +{ +mul.f16x2 r1516, r1469, r1480; +} +{ +fma.rn.f16x2 r1519, r1472, r1479, r1516; +} +{ +add.f16x2 r1523, r1401, r1451; +} +{ +add.f16x2 r1526, r1404, r1454; +} +{ +sub.f16x2 r1529, r1401, r1451; +} +{ +sub.f16x2 r1532, r1404, r1454; +} +{ +add.f16x2 r1535, r1413, r1495; +} +{ +add.f16x2 r1538, r1416, r1501; +} +{ +sub.f16x2 r1541, r1413, r1495; +} +{ +sub.f16x2 r1544, r1416, r1501; +} +{ +add.f16x2 r1547, r1407, r1505; +} +{ +add.f16x2 r1550, r1410, r1457; +} +{ +sub.f16x2 r1553, r1407, r1505; +} +{ +sub.f16x2 r1556, r1410, r1457; +} +{ +add.f16x2 r1559, r1419, r1513; +} +{ +add.f16x2 r1562, r1422, r1519; +} +{ +sub.f16x2 r1565, r1419, r1513; +} +{ +sub.f16x2 r1568, r1422, r1519; +} +and.b32 r1908, r1885, 512; +bfe.u32 r1909, r1885, 9, 1; +cvt.rn.f32.u32 f194, r1909; +mul.f32 f195, f194, 0f3EC90FDB; +cos.approx.f32 f167, f195; +sin.approx.f32 f196, f195; +neg.f32 f168, f196; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f167; +cvt.rn.f16.f32 high, f168; +mov.b32 r1571, {low, high}; 
+} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1574, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1576, {high, high}; +} +{ +mul.f16x2 r1578, r1538, r1576; +} +{ +fma.rn.f16x2 r1581, r1535, r1574, r1578; +} +{ +mul.f16x2 r1585, r1535, r1576; +} +{ +neg.f16x2 r1588, r1585; +} +{ +fma.rn.f16x2 r1590, r1538, r1574, r1588; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1596, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1598, {low, high}; +} +{ +mul.f16x2 r1599, r1596, r1598; +} +{ +mul.f16x2 r1602, r1571, r1594; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1605, {high, low}; +} +{ +fma.rn.f16x2 r1607, r1599, r1605, r1602; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1611, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1613, {high, high}; +} +{ +mul.f16x2 r1615, r1550, r1613; +} +{ +fma.rn.f16x2 r1618, r1547, r1611, r1615; +} +{ +mul.f16x2 r1622, r1547, r1613; +} +{ +neg.f16x2 r1625, r1622; +} +{ +fma.rn.f16x2 r1627, r1550, r1611, r1625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1633, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1635, {low, high}; +} +{ +mul.f16x2 r1636, r1633, r1635; +} +{ +mul.f16x2 r1639, r1607, r1631; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1607; +mov.b32 r1642, {high, low}; +} +{ +fma.rn.f16x2 r1644, r1636, r1642, r1639; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1648, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1650, {high, high}; +} +{ +mul.f16x2 r1652, r1562, r1650; +} +{ +fma.rn.f16x2 r1655, r1559, r1648, r1652; +} 
+{ +mul.f16x2 r1659, r1559, r1650; +} +{ +neg.f16x2 r1662, r1659; +} +{ +fma.rn.f16x2 r1664, r1562, r1648, r1662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1670, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1672, {low, high}; +} +{ +mul.f16x2 r1673, r1670, r1672; +} +{ +mul.f16x2 r1676, r1644, r1668; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1644; +mov.b32 r1679, {high, low}; +} +{ +fma.rn.f16x2 r1681, r1673, r1679, r1676; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1685, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1687, {high, high}; +} +{ +mul.f16x2 r1689, r1532, r1687; +} +{ +fma.rn.f16x2 r1692, r1529, r1685, r1689; +} +{ +mul.f16x2 r1696, r1529, r1687; +} +{ +neg.f16x2 r1699, r1696; +} +{ +fma.rn.f16x2 r1701, r1532, r1685, r1699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1707, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1709, {low, high}; +} +{ +mul.f16x2 r1710, r1707, r1709; +} +{ +mul.f16x2 r1713, r1681, r1705; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1681; +mov.b32 r1716, {high, low}; +} +{ +fma.rn.f16x2 r1718, r1710, r1716, r1713; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1722, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1724, {high, high}; +} +{ +mul.f16x2 r1726, r1544, r1724; +} +{ +fma.rn.f16x2 r1729, r1541, r1722, r1726; +} +{ +mul.f16x2 r1733, r1541, r1724; +} +{ +neg.f16x2 r1736, r1733; +} +{ +fma.rn.f16x2 r1738, r1544, r1722, r1736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; 
+mov.b32 r1744, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1746, {low, high}; +} +{ +mul.f16x2 r1747, r1744, r1746; +} +{ +mul.f16x2 r1750, r1718, r1742; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1718; +mov.b32 r1753, {high, low}; +} +{ +fma.rn.f16x2 r1755, r1747, r1753, r1750; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1759, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1761, {high, high}; +} +{ +mul.f16x2 r1763, r1556, r1761; +} +{ +fma.rn.f16x2 r1766, r1553, r1759, r1763; +} +{ +mul.f16x2 r1770, r1553, r1761; +} +{ +neg.f16x2 r1773, r1770; +} +{ +fma.rn.f16x2 r1775, r1556, r1759, r1773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1571; +mov.b32 r1781, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f181; +cvt.rn.f16.f32 high, f182; +mov.b32 r1783, {low, high}; +} +{ +mul.f16x2 r1784, r1781, r1783; +} +{ +mul.f16x2 r1787, r1755, r1779; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1755; +mov.b32 r1790, {high, low}; +} +{ +fma.rn.f16x2 r1792, r1784, r1790, r1787; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1796, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r1792; +mov.b32 r1798, {high, high}; +} +{ +mul.f16x2 r1800, r1568, r1798; +} +{ +fma.rn.f16x2 r1803, r1565, r1796, r1800; +} +{ +mul.f16x2 r1807, r1565, r1798; +} +{ +neg.f16x2 r1810, r1807; +} +{ +fma.rn.f16x2 r1812, r1568, r1796, r1810; +} +and.b32 r1910, r1895, 4088; +add.s32 r1911, r1889, r1910; +barrier.sync 0; +and.b32 r1912, r1887, 32768; +add.s32 r1913, r1911, r1912; +st.shared.u32 [r1913], r1523; +st.shared.u32 [r1913+4], r1526; +st.shared.u32 [r1913+4096], r1581; +st.shared.u32 [r1913+4100], r1590; +st.shared.u32 [r1913+8192], r1618; +st.shared.u32 [r1913+8196], r1627; +st.shared.u32 [r1913+12288], r1655; +st.shared.u32 
[r1913+12292], r1664; +st.shared.u32 [r1913+16384], r1692; +st.shared.u32 [r1913+16388], r1701; +st.shared.u32 [r1913+20480], r1729; +st.shared.u32 [r1913+20484], r1738; +st.shared.u32 [r1913+24576], r1766; +st.shared.u32 [r1913+24580], r1775; +st.shared.u32 [r1913+28672], r1803; +st.shared.u32 [r1913+28676], r1812; +barrier.sync 0; +mad.lo.s32 r1914, r1908, -56, r1913; +ld.shared.u32 r1834, [r1914]; +ld.shared.u32 r1837, [r1914+4]; +ld.shared.u32 r1846, [r1914+8192]; +ld.shared.u32 r1849, [r1914+8196]; +ld.shared.u32 r1858, [r1914+16384]; +ld.shared.u32 r1861, [r1914+16388]; +ld.shared.u32 r1870, [r1914+24576]; +ld.shared.u32 r1873, [r1914+24580]; +ld.shared.u32 r1835, [r1914+32768]; +ld.shared.u32 r1838, [r1914+32772]; +ld.shared.u32 r1847, [r1914+40960]; +ld.shared.u32 r1850, [r1914+40964]; +ld.shared.u32 r1859, [r1914+49152]; +ld.shared.u32 r1862, [r1914+49156]; +ld.shared.u32 r1871, [r1914+57344]; +ld.shared.u32 r1874, [r1914+57348]; +{ +add.f16x2 %0, r1834, r1835; +} +{ +add.f16x2 %1, r1837, r1838; +} +{ +sub.f16x2 %8, r1834, r1835; +} +{ +sub.f16x2 %9, r1837, r1838; +} +{ +add.f16x2 %2, r1846, r1847; +} +{ +add.f16x2 %3, r1849, r1850; +} +{ +sub.f16x2 %10, r1846, r1847; +} +{ +sub.f16x2 %11, r1849, r1850; +} +{ +add.f16x2 %4, r1858, r1859; +} +{ +add.f16x2 %5, r1861, r1862; +} +{ +sub.f16x2 %12, r1858, r1859; +} +{ +sub.f16x2 %13, r1861, r1862; +} +{ +add.f16x2 %6, r1870, r1871; +} +{ +add.f16x2 %7, r1873, r1874; +} +{ +sub.f16x2 %14, r1870, r1871; +} +{ +sub.f16x2 %15, r1873, r1874; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), 
"=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..625e13269bf72 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_fwd.hpp.inc @@ -0,0 +1,7775 @@ +#ifndef CUFFTDX_FFT_8192_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_8192_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<110, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2669>; +.reg .b32 r<44>; +.reg .b64 rd<13>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2659, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2657, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2656, f2659, f2657; +sub.f32 f140, f2659, f2657; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2655, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2652, %133, %134; +sub.f32 f148, %133, 
%134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2650, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2649, f2652, f2650; +sub.f32 f156, f2652, f2650; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2648, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2648, 0fBF3504F3; +mul.f32 f2647, f157, 0f3F3504F3; +sub.f32 f163, f2647, f162; +mul.f32 f164, f2648, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2646, f2656, f2649; +sub.f32 f173, f2656, f2649; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2645, f2655, f165; +sub.f32 f177, f2655, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2644, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2643, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2641, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2638, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2637, f2641, f2638; +sub.f32 f197, f2641, f2638; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2636, f189, f192; +add.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2634, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2632, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2631, f2634, f2632; +sub.f32 f213, f2634, f2632; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2630, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2630, 0fBF3504F3; +mul.f32 f2629, f214, 
0f3F3504F3; +sub.f32 f220, f2629, f219; +mul.f32 f221, f2630, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2628, f2637, f2631; +sub.f32 f230, f2637, f2631; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2627, f2636, f222; +sub.f32 f234, f2636, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2626, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2625, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2623, f231, 0f3F6C835E; +mul.f32 f2624, f2627, 0fBEC3EF15; +sub.f32 f245, f2623, f2624; +mul.f32 f246, f2627, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2621, f235, 0f3F3504F3; +mul.f32 f2622, f2626, 0fBF3504F3; +sub.f32 f250, f2621, f2622; +mul.f32 f251, f2626, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2619, f239, 0f3EC3EF15; +mul.f32 f2620, f2625, 0fBF6C835E; +sub.f32 f255, f2619, f2620; +mul.f32 f256, f2625, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2617, f233, 0fBEC3EF15; +mul.f32 f2618, f234, 0fBF6C835E; +sub.f32 f260, f2617, f2618; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; +mul.f32 f2615, f241, 0fBF6C835E; +mul.f32 f2616, f242, 0fBEC3EF15; +sub.f32 f269, f2615, f2616; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2614, f2646, f2628; +sub.f32 f275, f2646, f2628; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2613, f2645, f247; +sub.f32 f279, f2645, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2612, f2644, f252; +sub.f32 f283, f2644, f252; 
+add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2611, f2643, f257; +sub.f32 f287, f2643, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2610, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2609, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2608, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2607, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2604, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2602, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2601, f2604, f2602; +sub.f32 f315, f2604, f2602; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2600, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2598, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2595, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2594, f2598, f2595; +sub.f32 f331, f2598, f2595; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2593, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2591, f332, 0f3F3504F3; +mul.f32 f2592, f2593, 0fBF3504F3; +sub.f32 f338, f2591, f2592; +mul.f32 f339, f2593, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2590, f2601, f2594; +sub.f32 f348, f2601, f2594; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2589, f2600, f340; +sub.f32 f352, f2600, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 
f2588, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2587, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2585, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2583, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2582, f2585, f2583; +sub.f32 f372, f2585, f2583; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2581, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2578, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2577, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2576, f2578, f2577; +sub.f32 f388, f2578, f2577; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2575, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2573, f389, 0f3F3504F3; +mul.f32 f2574, f2575, 0fBF3504F3; +sub.f32 f395, f2573, f2574; +mul.f32 f396, f2575, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2572, f2582, f2576; +sub.f32 f405, f2582, f2576; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2571, f2581, f397; +sub.f32 f409, f2581, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2570, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2569, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2571, 0fBEC3EF15; +mul.f32 f2568, f406, 0f3F6C835E; +sub.f32 f420, f2568, f419; +mul.f32 f421, f2571, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2570, 0fBF3504F3; +mul.f32 f2567, f410, 0f3F3504F3; 
+sub.f32 f425, f2567, f424; +mul.f32 f426, f2570, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2565, f414, 0f3EC3EF15; +mul.f32 f2566, f2569, 0fBF6C835E; +sub.f32 f430, f2565, f2566; +mul.f32 f431, f2569, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2563, f408, 0fBEC3EF15; +mul.f32 f2564, f409, 0fBF6C835E; +sub.f32 f435, f2563, f2564; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2562, f416, 0fBF6C835E; +sub.f32 f444, f2562, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2561, f2590, f2572; +sub.f32 f450, f2590, f2572; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2560, f2589, f422; +sub.f32 f454, f2589, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2559, f2588, f427; +sub.f32 f458, f2588, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2558, f2587, f432; +sub.f32 f462, f2587, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2557, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2556, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2555, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2554, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2560, 0fBE47C5C2; +mul.f32 f2553, f451, 0f3F7B14BE; +sub.f32 f481, f2553, f480; +mul.f32 f482, f2560, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2559, 0fBEC3EF15; +mul.f32 f2552, f455, 0f3F6C835E; +sub.f32 f486, f2552, f485; +mul.f32 f487, f2559, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2558, 0fBF0E39DA; 
+mul.f32 f2551, f459, 0f3F54DB31; +sub.f32 f491, f2551, f490; +mul.f32 f492, f2558, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2557, 0fBF3504F3; +mul.f32 f2550, f463, 0f3F3504F3; +sub.f32 f496, f2550, f495; +mul.f32 f497, f2557, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2548, f467, 0f3F0E39DA; +mul.f32 f2549, f2556, 0fBF54DB31; +sub.f32 f501, f2548, f2549; +mul.f32 f502, f2556, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2546, f471, 0f3EC3EF15; +mul.f32 f2547, f2555, 0fBF6C835E; +sub.f32 f506, f2546, f2547; +mul.f32 f507, f2555, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2544, f475, 0f3E47C5C2; +mul.f32 f2545, f2554, 0fBF7B14BE; +sub.f32 f511, f2544, f2545; +mul.f32 f512, f2554, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2542, f453, 0fBE47C5C2; +mul.f32 f2543, f454, 0fBF7B14BE; +sub.f32 f516, f2542, f2543; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2541, f457, 0fBEC3EF15; +sub.f32 f521, f2541, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2540, f461, 0fBF0E39DA; +sub.f32 f526, f2540, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2538, f469, 0fBF54DB31; +mul.f32 f2539, f470, 0fBF0E39DA; +sub.f32 f535, f2538, f2539; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2537, f473, 0fBF6C835E; +sub.f32 f540, f2537, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2536, f477, 0fBF7B14BE; +sub.f32 f545, f2536, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f550, f276, 
f481; +sub.f32 f552, f276, f481; +add.f32 f2535, f2613, f483; +sub.f32 f553, f2613, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2534, f2612, f488; +sub.f32 f557, f2612, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2533, f2611, f493; +sub.f32 f561, f2611, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2532, f2610, f498; +sub.f32 f565, f2610, f498; +add.f32 f566, f292, f501; +sub.f32 f568, f292, f501; +add.f32 f2531, f2609, f503; +sub.f32 f569, f2609, f503; +add.f32 f570, f296, f506; +sub.f32 f572, f296, f506; +add.f32 f2530, f2608, f508; +sub.f32 f573, f2608, f508; +add.f32 f574, f300, f511; +sub.f32 f576, f300, f511; +add.f32 f2529, f2607, f513; +sub.f32 f577, f2607, f513; +add.f32 f578, f274, f450; +sub.f32 f580, f274, f450; +sub.f32 f2528, f275, f449; +add.f32 f581, f275, f449; +add.f32 f582, f278, f516; +sub.f32 f584, f278, f516; +add.f32 f2527, f279, f518; +sub.f32 f585, f279, f518; +add.f32 f586, f282, f521; +sub.f32 f588, f282, f521; +add.f32 f2526, f283, f523; +sub.f32 f589, f283, f523; +add.f32 f590, f286, f526; +sub.f32 f592, f286, f526; +add.f32 f2525, f287, f528; +sub.f32 f593, f287, f528; +add.f32 f594, f290, f531; +sub.f32 f596, f290, f531; +add.f32 f2524, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2523, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2522, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2521, f303, f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f615, f611, f2535; +mul.f32 f616, f610, f2535; +mul.f32 f618, f611, f611; +mul.f32 f2520, f610, f610; +sub.f32 f619, 
f2520, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f623, f621, f2534; +mul.f32 f624, f619, f2534; +mul.f32 f626, f611, f621; +mul.f32 f2519, f610, f619; +sub.f32 f627, f2519, f626; +mul.f32 f2518, f619, f554; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f631, f629, f2533; +mul.f32 f632, f627, f2533; +mul.f32 f2516, f610, f627; +mul.f32 f2517, f611, f629; +sub.f32 f635, f2516, f2517; +mul.f32 f2515, f627, f558; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f639, f637, f2532; +mul.f32 f640, f635, f2532; +mul.f32 f642, f611, f637; +mul.f32 f2514, f610, f635; +sub.f32 f643, f2514, f642; +mul.f32 f2513, f635, f562; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f647, f645, f2531; +mul.f32 f648, f643, f2531; +mul.f32 f2511, f610, f643; +mul.f32 f2512, f611, f645; +sub.f32 f651, f2511, f2512; +mul.f32 f2510, f643, f566; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f655, f653, f2530; +mul.f32 f656, f651, f2530; +mul.f32 f658, f611, f653; +mul.f32 f2509, f610, f651; +sub.f32 f659, f2509, f658; +mul.f32 f2508, f651, f570; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f663, f661, f2529; +mul.f32 f664, f659, f2529; +mul.f32 f666, f611, f661; +mul.f32 f2507, f610, f659; +sub.f32 f667, f2507, f666; +mul.f32 f2506, f659, f574; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f671, f669, f2528; +mul.f32 f672, f667, f2528; +mul.f32 f2504, f610, f667; +mul.f32 f2505, f611, f669; +sub.f32 f675, f2504, f2505; +mul.f32 f2503, f667, f578; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f679, f677, f2527; +mul.f32 f680, f675, f2527; +mul.f32 f682, f611, f677; +mul.f32 f2502, f610, f675; +sub.f32 f683, f2502, f682; +mul.f32 f2501, f675, f582; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f687, f685, f2526; +mul.f32 f688, f683, f2526; +mul.f32 f690, f611, 
f685; +mul.f32 f2500, f610, f683; +sub.f32 f691, f2500, f690; +mul.f32 f2499, f683, f586; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f695, f693, f2525; +mul.f32 f696, f691, f2525; +mul.f32 f2497, f610, f691; +mul.f32 f2498, f611, f693; +sub.f32 f699, f2497, f2498; +mul.f32 f2496, f691, f590; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f703, f701, f2524; +mul.f32 f704, f699, f2524; +mul.f32 f706, f611, f701; +mul.f32 f2495, f610, f699; +sub.f32 f707, f2495, f706; +mul.f32 f2494, f699, f594; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f711, f709, f2523; +mul.f32 f712, f707, f2523; +mul.f32 f2492, f610, f707; +mul.f32 f2493, f611, f709; +sub.f32 f715, f2492, f2493; +mul.f32 f2491, f707, f598; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f719, f717, f2522; +mul.f32 f720, f715, f2522; +mul.f32 f722, f611, f717; +mul.f32 f2490, f610, f715; +sub.f32 f723, f2490, f722; +mul.f32 f2489, f715, f602; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f727, f725, f2521; +mul.f32 f728, f723, f2521; +mul.f32 f730, f611, f725; +mul.f32 f2488, f610, f723; +sub.f32 f731, f2488, f730; +mul.f32 f2487, f723, f606; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2486, f2614, f2561; +mul.f32 f735, f733, f2486; +mul.f32 f736, f731, f2486; +mul.f32 f2484, f610, f731; +mul.f32 f2485, f611, f733; +sub.f32 f739, f2484, f2485; +sub.f32 f2483, f272, f447; +mul.f32 f2482, f731, f2483; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, f740; +mul.f32 f743, f741, f553; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2481, f610, f739; +sub.f32 f747, f2481, f746; +mul.f32 f2480, f739, f552; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f751, f749, f557; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2479, f610, f747; +sub.f32 f755, f2479, f754; +mul.f32 f2478, f747, f556; +mul.f32 
f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f759, f757, f561; +mul.f32 f760, f755, f561; +mul.f32 f2476, f610, f755; +mul.f32 f2477, f611, f757; +sub.f32 f763, f2476, f2477; +mul.f32 f2475, f755, f560; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f767, f765, f565; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2474, f610, f763; +sub.f32 f771, f2474, f770; +mul.f32 f2473, f763, f564; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f775, f773, f569; +mul.f32 f776, f771, f569; +mul.f32 f2471, f610, f771; +mul.f32 f2472, f611, f773; +sub.f32 f779, f2471, f2472; +mul.f32 f2470, f771, f568; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f783, f781, f573; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2469, f610, f779; +sub.f32 f787, f2469, f786; +mul.f32 f2468, f779, f572; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f791, f789, f577; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2467, f610, f787; +sub.f32 f795, f2467, f794; +mul.f32 f2466, f787, f576; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f799, f797, f581; +mul.f32 f800, f795, f581; +mul.f32 f2464, f610, f795; +mul.f32 f2465, f611, f797; +sub.f32 f803, f2464, f2465; +mul.f32 f2463, f795, f580; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f807, f805, f585; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2462, f610, f803; +sub.f32 f811, f2462, f810; +mul.f32 f2461, f803, f584; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f815, f813, f589; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2460, f610, f811; +sub.f32 f819, f2460, f818; +mul.f32 f2459, f811, f588; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f823, f821, f593; +mul.f32 f824, f819, f593; +mul.f32 f2457, f610, f819; +mul.f32 f2458, f611, f821; 
+sub.f32 f827, f2457, f2458; +mul.f32 f2456, f819, f592; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f831, f829, f597; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2455, f610, f827; +sub.f32 f835, f2455, f834; +mul.f32 f2454, f827, f596; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f839, f837, f601; +mul.f32 f840, f835, f601; +mul.f32 f2452, f610, f835; +mul.f32 f2453, f611, f837; +sub.f32 f843, f2452, f2453; +mul.f32 f2451, f835, f600; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f847, f845, f605; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2450, f610, f843; +sub.f32 f851, f2450, f850; +mul.f32 f2449, f610, f550; +mul.f32 f852, f610, f845; +mul.f32 f2448, f843, f604; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f851, f608; +mul.f32 f855, f853, f609; +mul.f32 f856, f851, f609; +mov.u32 r35, %tid.x; +shl.b32 r34, r35, 8; +barrier.sync 0; +and.b32 r11, r34, 65280; +add.s32 r12, r9, r11; +add.f32 f857, f2614, f2561; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 3; +sub.f32 f2666, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r33, %tid.x; +shl.b32 r32, r33, 3; +mov.u32 r31, %tid.x; +fma.rn.f32 f859, f611, f550, f616; +sub.f32 f860, f2449, f615; +st.shared.v4.f32 [r12], {f858, f857, f860, f859}; +fma.rn.f32 f861, f621, f554, f624; +sub.f32 f862, f2518, f623; +fma.rn.f32 f863, f629, f558, f632; +sub.f32 f864, f2515, f631; +st.shared.v4.f32 [r12+16], {f862, f861, f864, f863}; +fma.rn.f32 f865, f637, f562, f640; +sub.f32 f866, f2513, f639; +sub.f32 f867, f2510, f647; +fma.rn.f32 f868, f645, f566, f648; +st.shared.v4.f32 [r12+32], {f866, f865, f867, f868}; +fma.rn.f32 f869, f653, f570, f656; +sub.f32 f870, f2508, f655; +fma.rn.f32 f871, f661, f574, f664; +sub.f32 f872, f2506, f663; +st.shared.v4.f32 [r12+48], {f870, f869, f872, f871}; +fma.rn.f32 f873, f669, f578, f672; +sub.f32 f874, f2503, f671; +fma.rn.f32 f875, f677, f582, f680; +sub.f32 f876, 
f2501, f679; +st.shared.v4.f32 [r12+64], {f874, f873, f876, f875}; +fma.rn.f32 f877, f685, f586, f688; +sub.f32 f878, f2499, f687; +fma.rn.f32 f879, f693, f590, f696; +sub.f32 f880, f2496, f695; +st.shared.v4.f32 [r12+80], {f878, f877, f880, f879}; +fma.rn.f32 f881, f701, f594, f704; +sub.f32 f882, f2494, f703; +fma.rn.f32 f883, f709, f598, f712; +sub.f32 f884, f2491, f711; +st.shared.v4.f32 [r12+96], {f882, f881, f884, f883}; +fma.rn.f32 f885, f717, f602, f720; +sub.f32 f886, f2489, f719; +fma.rn.f32 f887, f725, f606, f728; +sub.f32 f888, f2487, f727; +st.shared.v4.f32 [r12+112], {f886, f885, f888, f887}; +fma.rn.f32 f889, f733, f2666, f736; +sub.f32 f890, f2482, f735; +fma.rn.f32 f891, f741, f552, f744; +sub.f32 f892, f2480, f743; +st.shared.v4.f32 [r12+128], {f890, f889, f892, f891}; +fma.rn.f32 f893, f749, f556, f752; +sub.f32 f894, f2478, f751; +fma.rn.f32 f895, f757, f560, f760; +sub.f32 f896, f2475, f759; +st.shared.v4.f32 [r12+144], {f894, f893, f896, f895}; +fma.rn.f32 f897, f765, f564, f768; +sub.f32 f898, f2473, f767; +fma.rn.f32 f899, f773, f568, f776; +sub.f32 f900, f2470, f775; +st.shared.v4.f32 [r12+160], {f898, f897, f900, f899}; +fma.rn.f32 f901, f781, f572, f784; +sub.f32 f902, f2468, f783; +fma.rn.f32 f903, f789, f576, f792; +sub.f32 f904, f2466, f791; +st.shared.v4.f32 [r12+176], {f902, f901, f904, f903}; +fma.rn.f32 f905, f797, f580, f800; +sub.f32 f906, f2463, f799; +fma.rn.f32 f907, f805, f584, f808; +sub.f32 f908, f2461, f807; +st.shared.v4.f32 [r12+192], {f906, f905, f908, f907}; +fma.rn.f32 f909, f813, f588, f816; +sub.f32 f910, f2459, f815; +fma.rn.f32 f911, f821, f592, f824; +sub.f32 f912, f2456, f823; +st.shared.v4.f32 [r12+208], {f910, f909, f912, f911}; +fma.rn.f32 f913, f829, f596, f832; +sub.f32 f914, f2454, f831; +fma.rn.f32 f915, f837, f600, f840; +sub.f32 f916, f2451, f839; +st.shared.v4.f32 [r12+224], {f914, f913, f916, f915}; +fma.rn.f32 f917, f845, f604, f848; +sub.f32 f918, f2448, f847; +fma.rn.f32 f919, f853, f608, f856; 
+sub.f32 f920, f854, f855; +st.shared.v4.f32 [r12+240], {f918, f917, f920, f919}; +barrier.sync 0; +and.b32 r21, r31, 255; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+2048]; +ld.shared.v2.f32 {f929, f930}, [r13+4096]; +ld.shared.v2.f32 {f933, f934}, [r13+6144]; +ld.shared.v2.f32 {f937, f938}, [r13+8192]; +ld.shared.v2.f32 {f941, f942}, [r13+10240]; +ld.shared.v2.f32 {f945, f946}, [r13+12288]; +ld.shared.v2.f32 {f949, f950}, [r13+14336]; +ld.shared.v2.f32 {f953, f954}, [r13+16384]; +ld.shared.v2.f32 {f957, f958}, [r13+18432]; +ld.shared.v2.f32 {f961, f962}, [r13+20480]; +ld.shared.v2.f32 {f965, f966}, [r13+22528]; +ld.shared.v2.f32 {f969, f970}, [r13+24576]; +ld.shared.v2.f32 {f973, f974}, [r13+26624]; +ld.shared.v2.f32 {f977, f978}, [r13+28672]; +ld.shared.v2.f32 {f981, f982}, [r13+30720]; +ld.shared.v2.f32 {f985, f986}, [r13+32768]; +ld.shared.v2.f32 {f989, f990}, [r13+34816]; +ld.shared.v2.f32 {f993, f994}, [r13+36864]; +ld.shared.v2.f32 {f997, f998}, [r13+38912]; +ld.shared.v2.f32 {f1001, f1002}, [r13+40960]; +ld.shared.v2.f32 {f1005, f1006}, [r13+43008]; +ld.shared.v2.f32 {f1009, f1010}, [r13+45056]; +ld.shared.v2.f32 {f1013, f1014}, [r13+47104]; +ld.shared.v2.f32 {f1017, f1018}, [r13+49152]; +ld.shared.v2.f32 {f1021, f1022}, [r13+51200]; +ld.shared.v2.f32 {f1025, f1026}, [r13+53248]; +ld.shared.v2.f32 {f1029, f1030}, [r13+55296]; +ld.shared.v2.f32 {f1033, f1034}, [r13+57344]; +ld.shared.v2.f32 {f1037, f1038}, [r13+59392]; +ld.shared.v2.f32 {f1041, f1042}, [r13+61440]; +ld.shared.v2.f32 {f1045, f1046}, [r13+63488]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2447, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2446, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2445, f2447, f2446; +sub.f32 f1060, f2447, f2446; +add.f32 f1061, f1051, f1056; +sub.f32 
f1063, f1051, f1056; +sub.f32 f2444, f1052, f1055; +add.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2443, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2442, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2441, f2443, f2442; +sub.f32 f1076, f2443, f2442; +add.f32 f1077, f1067, f1072; +sub.f32 f1079, f1067, f1072; +sub.f32 f2440, f1068, f1071; +add.f32 f1080, f1068, f1071; +mul.f32 f1082, f2440, 0fBF3504F3; +mul.f32 f2439, f1077, 0f3F3504F3; +sub.f32 f1083, f2439, f1082; +mul.f32 f1084, f2440, 0f3F3504F3; +fma.rn.f32 f1085, f1077, 0fBF3504F3, f1084; +mul.f32 f1086, f1079, 0fBF3504F3; +mul.f32 f1087, f1080, 0fBF3504F3; +sub.f32 f1088, f1086, f1087; +add.f32 f1089, f1086, f1087; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2438, f2445, f2441; +sub.f32 f1093, f2445, f2441; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2437, f2444, f1085; +sub.f32 f1097, f2444, f1085; +add.f32 f1098, f1059, f1076; +sub.f32 f1100, f1059, f1076; +sub.f32 f2436, f1060, f1075; +add.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1088; +sub.f32 f1104, f1063, f1088; +add.f32 f2435, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2434, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2433, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2432, f2434, f2433; +sub.f32 f1117, f2434, f2433; +add.f32 f1118, f1108, f1113; +sub.f32 f1120, f1108, f1113; +sub.f32 f2431, f1109, f1112; +add.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2430, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 
f2429, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2428, f2430, f2429; +sub.f32 f1133, f2430, f2429; +add.f32 f1134, f1124, f1129; +sub.f32 f1136, f1124, f1129; +sub.f32 f2427, f1125, f1128; +add.f32 f1137, f1125, f1128; +mul.f32 f1139, f2427, 0fBF3504F3; +mul.f32 f2426, f1134, 0f3F3504F3; +sub.f32 f1140, f2426, f1139; +mul.f32 f1141, f2427, 0f3F3504F3; +fma.rn.f32 f1142, f1134, 0fBF3504F3, f1141; +mul.f32 f1143, f1136, 0fBF3504F3; +mul.f32 f1144, f1137, 0fBF3504F3; +sub.f32 f1145, f1143, f1144; +add.f32 f1146, f1143, f1144; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2425, f2432, f2428; +sub.f32 f1150, f2432, f2428; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2424, f2431, f1142; +sub.f32 f1154, f2431, f1142; +add.f32 f1155, f1116, f1133; +sub.f32 f1157, f1116, f1133; +sub.f32 f2423, f1117, f1132; +add.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1145; +sub.f32 f1161, f1120, f1145; +add.f32 f2422, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2420, f1151, 0f3F6C835E; +mul.f32 f2421, f2424, 0fBEC3EF15; +sub.f32 f1165, f2420, f2421; +mul.f32 f1166, f2424, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0fBEC3EF15, f1166; +mul.f32 f2418, f1155, 0f3F3504F3; +mul.f32 f2419, f2423, 0fBF3504F3; +sub.f32 f1170, f2418, f2419; +mul.f32 f1171, f2423, 0f3F3504F3; +fma.rn.f32 f1172, f1155, 0fBF3504F3, f1171; +mul.f32 f1174, f2422, 0fBF6C835E; +mul.f32 f2417, f1159, 0f3EC3EF15; +sub.f32 f1175, f2417, f1174; +mul.f32 f1176, f2422, 0f3EC3EF15; +fma.rn.f32 f1177, f1159, 0fBF6C835E, f1176; +mul.f32 f1179, f1154, 0fBF6C835E; +mul.f32 f2416, f1153, 0fBEC3EF15; +sub.f32 f1180, f2416, f1179; +mul.f32 f1181, f1154, 0fBEC3EF15; +fma.rn.f32 f1182, f1153, 0fBF6C835E, f1181; +mul.f32 f1183, f1157, 0fBF3504F3; +mul.f32 f1184, f1158, 0fBF3504F3; +sub.f32 f1185, f1183, f1184; +add.f32 f1186, f1183, f1184; +mul.f32 f2414, f1161, 0fBF6C835E; +mul.f32 f2415, f1162, 0fBEC3EF15; 
+sub.f32 f1189, f2414, f2415; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0fBEC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2413, f2438, f2425; +sub.f32 f1195, f2438, f2425; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2412, f2437, f1167; +sub.f32 f1199, f2437, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2411, f2436, f1172; +sub.f32 f1203, f2436, f1172; +add.f32 f1204, f1102, f1175; +sub.f32 f1206, f1102, f1175; +add.f32 f2410, f2435, f1177; +sub.f32 f1207, f2435, f1177; +add.f32 f1208, f1092, f1150; +sub.f32 f1210, f1092, f1150; +sub.f32 f2409, f1093, f1149; +add.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1180; +sub.f32 f1214, f1096, f1180; +add.f32 f2408, f1097, f1182; +sub.f32 f1215, f1097, f1182; +add.f32 f1216, f1100, f1185; +sub.f32 f1218, f1100, f1185; +add.f32 f2407, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2406, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2405, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2404, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2403, f2405, f2404; +sub.f32 f1235, f2405, f2404; +add.f32 f1236, f1226, f1231; +sub.f32 f1238, f1226, f1231; +sub.f32 f2402, f1227, f1230; +add.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, f1005; +add.f32 f2401, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2400, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2399, f2401, f2400; +sub.f32 f1251, f2401, f2400; +add.f32 f1252, f1242, f1247; +sub.f32 f1254, f1242, f1247; +sub.f32 f2398, f1243, f1246; +add.f32 f1255, 
f1243, f1246; +mul.f32 f1257, f2398, 0fBF3504F3; +mul.f32 f2397, f1252, 0f3F3504F3; +sub.f32 f1258, f2397, f1257; +mul.f32 f1259, f2398, 0f3F3504F3; +fma.rn.f32 f1260, f1252, 0fBF3504F3, f1259; +mul.f32 f1261, f1254, 0fBF3504F3; +mul.f32 f1262, f1255, 0fBF3504F3; +sub.f32 f1263, f1261, f1262; +add.f32 f1264, f1261, f1262; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2396, f2403, f2399; +sub.f32 f1268, f2403, f2399; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2395, f2402, f1260; +sub.f32 f1272, f2402, f1260; +add.f32 f1273, f1234, f1251; +sub.f32 f1275, f1234, f1251; +sub.f32 f2394, f1235, f1250; +add.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1263; +sub.f32 f1279, f1238, f1263; +add.f32 f2393, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2392, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2391, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2390, f2392, f2391; +sub.f32 f1292, f2392, f2391; +add.f32 f1293, f1283, f1288; +sub.f32 f1295, f1283, f1288; +sub.f32 f2389, f1284, f1287; +add.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2388, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2387, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2386, f2388, f2387; +sub.f32 f1308, f2388, f2387; +add.f32 f1309, f1299, f1304; +sub.f32 f1311, f1299, f1304; +sub.f32 f2385, f1300, f1303; +add.f32 f1312, f1300, f1303; +mul.f32 f1314, f2385, 0fBF3504F3; +mul.f32 f2384, f1309, 0f3F3504F3; +sub.f32 f1315, f2384, f1314; +mul.f32 f1316, f2385, 0f3F3504F3; +fma.rn.f32 f1317, f1309, 0fBF3504F3, f1316; +mul.f32 f1318, f1311, 0fBF3504F3; +mul.f32 f1319, f1312, 0fBF3504F3; 
+sub.f32 f1320, f1318, f1319; +add.f32 f1321, f1318, f1319; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2383, f2390, f2386; +sub.f32 f1325, f2390, f2386; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2382, f2389, f1317; +sub.f32 f1329, f2389, f1317; +add.f32 f1330, f1291, f1308; +sub.f32 f1332, f1291, f1308; +sub.f32 f2381, f1292, f1307; +add.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1320; +sub.f32 f1336, f1295, f1320; +add.f32 f2380, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2378, f1326, 0f3F6C835E; +mul.f32 f2379, f2382, 0fBEC3EF15; +sub.f32 f1340, f2378, f2379; +mul.f32 f1341, f2382, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0fBEC3EF15, f1341; +mul.f32 f2376, f1330, 0f3F3504F3; +mul.f32 f2377, f2381, 0fBF3504F3; +sub.f32 f1345, f2376, f2377; +mul.f32 f1346, f2381, 0f3F3504F3; +fma.rn.f32 f1347, f1330, 0fBF3504F3, f1346; +mul.f32 f2374, f1334, 0f3EC3EF15; +mul.f32 f2375, f2380, 0fBF6C835E; +sub.f32 f1350, f2374, f2375; +mul.f32 f1351, f2380, 0f3EC3EF15; +fma.rn.f32 f1352, f1334, 0fBF6C835E, f1351; +mul.f32 f2372, f1328, 0fBEC3EF15; +mul.f32 f2373, f1329, 0fBF6C835E; +sub.f32 f1355, f2372, f2373; +mul.f32 f1356, f1329, 0fBEC3EF15; +fma.rn.f32 f1357, f1328, 0fBF6C835E, f1356; +mul.f32 f1358, f1332, 0fBF3504F3; +mul.f32 f1359, f1333, 0fBF3504F3; +sub.f32 f1360, f1358, f1359; +add.f32 f1361, f1358, f1359; +mul.f32 f2370, f1336, 0fBF6C835E; +mul.f32 f2371, f1337, 0fBEC3EF15; +sub.f32 f1364, f2370, f2371; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0fBEC3EF15, f1365; +add.f32 f1367, f1265, f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2369, f2396, f2383; +sub.f32 f1370, f2396, f2383; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2368, f2395, f1342; +sub.f32 f1374, f2395, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2367, f2394, f1347; +sub.f32 f1378, f2394, f1347; +add.f32 f1379, f1277, f1350; +sub.f32 f1381, f1277, f1350; +add.f32 
f2366, f2393, f1352; +sub.f32 f1382, f2393, f1352; +add.f32 f1383, f1267, f1325; +sub.f32 f1385, f1267, f1325; +sub.f32 f2365, f1268, f1324; +add.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1355; +sub.f32 f1389, f1271, f1355; +add.f32 f2364, f1272, f1357; +sub.f32 f1390, f1272, f1357; +add.f32 f1391, f1275, f1360; +sub.f32 f1393, f1275, f1360; +add.f32 f2363, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2362, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2368, 0fBE47C5C2; +mul.f32 f2361, f1371, 0f3F7B14BE; +sub.f32 f1401, f2361, f1400; +mul.f32 f1402, f2368, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0fBE47C5C2, f1402; +mul.f32 f1405, f2367, 0fBEC3EF15; +mul.f32 f2360, f1375, 0f3F6C835E; +sub.f32 f1406, f2360, f1405; +mul.f32 f1407, f2367, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0fBEC3EF15, f1407; +mul.f32 f2358, f1379, 0f3F54DB31; +mul.f32 f2359, f2366, 0fBF0E39DA; +sub.f32 f1411, f2358, f2359; +mul.f32 f1412, f2366, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0fBF0E39DA, f1412; +mul.f32 f2356, f1383, 0f3F3504F3; +mul.f32 f2357, f2365, 0fBF3504F3; +sub.f32 f1416, f2356, f2357; +mul.f32 f1417, f2365, 0f3F3504F3; +fma.rn.f32 f1418, f1383, 0fBF3504F3, f1417; +mul.f32 f2354, f1387, 0f3F0E39DA; +mul.f32 f2355, f2364, 0fBF54DB31; +sub.f32 f1421, f2354, f2355; +mul.f32 f1422, f2364, 0f3F0E39DA; +fma.rn.f32 f1423, f1387, 0fBF54DB31, f1422; +mul.f32 f2352, f1391, 0f3EC3EF15; +mul.f32 f2353, f2363, 0fBF6C835E; +sub.f32 f1426, f2352, f2353; +mul.f32 f1427, f2363, 0f3EC3EF15; +fma.rn.f32 f1428, f1391, 0fBF6C835E, f1427; +mul.f32 f1430, f2362, 0fBF7B14BE; +mul.f32 f2351, f1395, 0f3E47C5C2; +sub.f32 f1431, f2351, f1430; +mul.f32 f1432, f2362, 0f3E47C5C2; +fma.rn.f32 f1433, f1395, 0fBF7B14BE, f1432; +mul.f32 f1435, f1374, 0fBF7B14BE; +mul.f32 f2350, f1373, 0fBE47C5C2; +sub.f32 f1436, f2350, f1435; +mul.f32 f1437, f1374, 0fBE47C5C2; +fma.rn.f32 f1438, f1373, 0fBF7B14BE, f1437; +mul.f32 f1440, f1378, 
0fBF6C835E; +mul.f32 f2349, f1377, 0fBEC3EF15; +sub.f32 f1441, f2349, f1440; +mul.f32 f1442, f1378, 0fBEC3EF15; +fma.rn.f32 f1443, f1377, 0fBF6C835E, f1442; +mul.f32 f1445, f1382, 0fBF54DB31; +mul.f32 f2348, f1381, 0fBF0E39DA; +sub.f32 f1446, f2348, f1445; +mul.f32 f1447, f1382, 0fBF0E39DA; +fma.rn.f32 f1448, f1381, 0fBF54DB31, f1447; +mul.f32 f1449, f1385, 0fBF3504F3; +mul.f32 f1450, f1386, 0fBF3504F3; +sub.f32 f1451, f1449, f1450; +add.f32 f1452, f1449, f1450; +mul.f32 f1454, f1390, 0fBF0E39DA; +mul.f32 f2347, f1389, 0fBF54DB31; +sub.f32 f1455, f2347, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0fBF0E39DA, f1456; +mul.f32 f1459, f1394, 0fBEC3EF15; +mul.f32 f2346, f1393, 0fBF6C835E; +sub.f32 f1460, f2346, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0fBEC3EF15, f1461; +mul.f32 f1464, f1398, 0fBE47C5C2; +mul.f32 f2345, f1397, 0fBF7B14BE; +sub.f32 f1465, f2345, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0fBE47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2344, f2412, f1403; +sub.f32 f1473, f2412, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2343, f2411, f1408; +sub.f32 f1477, f2411, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2342, f2410, f1413; +sub.f32 f1481, f2410, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2341, f2409, f1418; +sub.f32 f1485, f2409, f1418; +add.f32 f1486, f1212, f1421; +sub.f32 f1488, f1212, f1421; +add.f32 f2340, f2408, f1423; +sub.f32 f1489, f2408, f1423; +add.f32 f1490, f1216, f1426; +sub.f32 f1492, f1216, f1426; +add.f32 f2339, f2407, f1428; +sub.f32 f1493, f2407, f1428; +add.f32 f1494, f1220, f1431; +sub.f32 f1496, f1220, f1431; +add.f32 f2338, f2406, f1433; +sub.f32 f1497, f2406, f1433; +add.f32 f1498, f1194, f1370; +sub.f32 f1500, f1194, f1370; +sub.f32 f2337, f1195, f1369; +add.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1436; 
+sub.f32 f1504, f1198, f1436; +add.f32 f2336, f1199, f1438; +sub.f32 f1505, f1199, f1438; +add.f32 f1506, f1202, f1441; +sub.f32 f1508, f1202, f1441; +add.f32 f2335, f1203, f1443; +sub.f32 f1509, f1203, f1443; +add.f32 f1510, f1206, f1446; +sub.f32 f1512, f1206, f1446; +add.f32 f2334, f1207, f1448; +sub.f32 f1513, f1207, f1448; +add.f32 f1514, f1210, f1451; +sub.f32 f1516, f1210, f1451; +add.f32 f2333, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2332, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2331, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2330, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r31, 224; +bfe.u32 r15, r31, 5, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1535, f1531, f2344; +mul.f32 f1536, f1530, f2344; +mul.f32 f2328, f1530, f1530; +mul.f32 f2329, f1531, f1531; +sub.f32 f1539, f2328, f2329; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1543, f1541, f2343; +mul.f32 f1544, f1539, f2343; +mul.f32 f1546, f1531, f1541; +mul.f32 f2327, f1530, f1539; +sub.f32 f1547, f2327, f1546; +mul.f32 f2326, f1539, f1474; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1551, f1549, f2342; +mul.f32 f1552, f1547, f2342; +mul.f32 f1554, f1531, f1549; +mul.f32 f2325, f1530, f1547; +sub.f32 f1555, f2325, f1554; +mul.f32 f2324, f1547, f1478; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1559, f1557, f2341; +mul.f32 f1560, f1555, f2341; +mul.f32 f2322, f1530, f1555; +mul.f32 f2323, f1531, f1557; +sub.f32 f1563, f2322, f2323; +mul.f32 f2321, f1555, f1482; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1567, f1565, f2340; +mul.f32 f1568, f1563, f2340; 
+mul.f32 f1570, f1531, f1565; +mul.f32 f2320, f1530, f1563; +sub.f32 f1571, f2320, f1570; +mul.f32 f2319, f1563, f1486; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1575, f1573, f2339; +mul.f32 f1576, f1571, f2339; +mul.f32 f1578, f1531, f1573; +mul.f32 f2318, f1530, f1571; +sub.f32 f1579, f2318, f1578; +mul.f32 f2317, f1571, f1490; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1583, f1581, f2338; +mul.f32 f1584, f1579, f2338; +mul.f32 f2315, f1530, f1579; +mul.f32 f2316, f1531, f1581; +sub.f32 f1587, f2315, f2316; +mul.f32 f2314, f1579, f1494; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1591, f1589, f2337; +mul.f32 f1592, f1587, f2337; +mul.f32 f1594, f1531, f1589; +mul.f32 f2313, f1530, f1587; +sub.f32 f1595, f2313, f1594; +mul.f32 f2312, f1587, f1498; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1599, f1597, f2336; +mul.f32 f1600, f1595, f2336; +mul.f32 f2310, f1530, f1595; +mul.f32 f2311, f1531, f1597; +sub.f32 f1603, f2310, f2311; +mul.f32 f2309, f1595, f1502; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1607, f1605, f2335; +mul.f32 f1608, f1603, f2335; +mul.f32 f1610, f1531, f1605; +mul.f32 f2308, f1530, f1603; +sub.f32 f1611, f2308, f1610; +mul.f32 f2307, f1603, f1506; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1615, f1613, f2334; +mul.f32 f1616, f1611, f2334; +mul.f32 f1618, f1531, f1613; +mul.f32 f2306, f1530, f1611; +sub.f32 f1619, f2306, f1618; +mul.f32 f2305, f1611, f1510; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1623, f1621, f2333; +mul.f32 f1624, f1619, f2333; +mul.f32 f2303, f1530, f1619; +mul.f32 f2304, f1531, f1621; +sub.f32 f1627, f2303, f2304; +mul.f32 f2302, f1619, f1514; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1631, f1629, f2332; +mul.f32 f1632, f1627, f2332; 
+mul.f32 f1634, f1531, f1629; +mul.f32 f2301, f1530, f1627; +sub.f32 f1635, f2301, f1634; +mul.f32 f2300, f1627, f1518; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1639, f1637, f2331; +mul.f32 f1640, f1635, f2331; +mul.f32 f1642, f1531, f1637; +mul.f32 f2299, f1530, f1635; +sub.f32 f1643, f2299, f1642; +mul.f32 f2298, f1635, f1522; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1647, f1645, f2330; +mul.f32 f1648, f1643, f2330; +mul.f32 f2296, f1530, f1643; +mul.f32 f2297, f1531, f1645; +sub.f32 f1651, f2296, f2297; +mul.f32 f2295, f1643, f1526; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2294, f2413, f2369; +mul.f32 f1655, f1653, f2294; +mul.f32 f1656, f1651, f2294; +mul.f32 f1658, f1531, f1653; +mul.f32 f2293, f1530, f1651; +sub.f32 f1659, f2293, f1658; +sub.f32 f2292, f1192, f1367; +mul.f32 f2291, f1651, f2292; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1663, f1661, f1473; +mul.f32 f1664, f1659, f1473; +mul.f32 f2289, f1530, f1659; +mul.f32 f2290, f1531, f1661; +sub.f32 f1667, f2289, f2290; +mul.f32 f2288, f1659, f1472; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1671, f1669, f1477; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2287, f1530, f1667; +sub.f32 f1675, f2287, f1674; +mul.f32 f2286, f1667, f1476; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1679, f1677, f1481; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, f1677; +mul.f32 f2285, f1530, f1675; +sub.f32 f1683, f2285, f1682; +mul.f32 f2284, f1675, f1480; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1687, f1685, f1485; +mul.f32 f1688, f1683, f1485; +mul.f32 f2282, f1530, f1683; +mul.f32 f2283, f1531, f1685; +sub.f32 f1691, f2282, f2283; +mul.f32 f2281, f1683, f1484; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; 
+mul.f32 f1695, f1693, f1489; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2280, f1530, f1691; +sub.f32 f1699, f2280, f1698; +mul.f32 f2279, f1691, f1488; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1703, f1701, f1493; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2278, f1530, f1699; +sub.f32 f1707, f2278, f1706; +mul.f32 f2277, f1699, f1492; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1711, f1709, f1497; +mul.f32 f1712, f1707, f1497; +mul.f32 f2275, f1530, f1707; +mul.f32 f2276, f1531, f1709; +sub.f32 f1715, f2275, f2276; +mul.f32 f2274, f1707, f1496; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1719, f1717, f1501; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2273, f1530, f1715; +sub.f32 f1723, f2273, f1722; +mul.f32 f2272, f1715, f1500; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1727, f1725, f1505; +mul.f32 f1728, f1723, f1505; +mul.f32 f2270, f1530, f1723; +mul.f32 f2271, f1531, f1725; +sub.f32 f1731, f2270, f2271; +mul.f32 f2269, f1723, f1504; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1735, f1733, f1509; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2268, f1530, f1731; +sub.f32 f1739, f2268, f1738; +mul.f32 f2267, f1731, f1508; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1743, f1741, f1513; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, f1741; +mul.f32 f2266, f1530, f1739; +sub.f32 f1747, f2266, f1746; +mul.f32 f2265, f1739, f1512; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1751, f1749, f1517; +mul.f32 f1752, f1747, f1517; +mul.f32 f2263, f1530, f1747; +mul.f32 f2264, f1531, f1749; +sub.f32 f1755, f2263, f2264; +mul.f32 f2262, f1747, f1516; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; 
+mul.f32 f1759, f1757, f1521; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2261, f1530, f1755; +sub.f32 f1763, f2261, f1762; +mul.f32 f2260, f1755, f1520; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1767, f1765, f1525; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2259, f1530, f1763; +sub.f32 f1771, f2259, f1770; +mul.f32 f2258, f1530, f1470; +mul.f32 f1772, f1530, f1765; +mul.f32 f2257, f1763, f1524; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1771, f1528; +mul.f32 f1775, f1773, f1529; +mul.f32 f1776, f1771, f1529; +mov.u32 r24, %tid.x; +shl.b32 r23, r24, 8; +and.b32 r16, r32, 248; +add.s32 r17, r9, r16; +sub.f32 f2664, f2413, f2369; +mul.f32 f2663, f1653, f2664; +barrier.sync 0; +and.b32 r18, r23, 57344; +add.s32 r19, r17, r18; +mov.u32 r30, %tid.x; +and.b32 r29, r30, 224; +add.f32 f1777, f2413, f2369; +sub.f32 f2665, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r37, %tid.x; +and.b32 r36, r37, 224; +fma.rn.f32 f1779, f1531, f1470, f1536; +sub.f32 f1780, f2258, f1535; +st.shared.v2.f32 [r19+256], {f1780, f1779}; +fma.rn.f32 f1781, f1541, f1474, f1544; +sub.f32 f1782, f2326, f1543; +st.shared.v2.f32 [r19+512], {f1782, f1781}; +fma.rn.f32 f1783, f1549, f1478, f1552; +sub.f32 f1784, f2324, f1551; +st.shared.v2.f32 [r19+768], {f1784, f1783}; +fma.rn.f32 f1785, f1557, f1482, f1560; +sub.f32 f1786, f2321, f1559; +st.shared.v2.f32 [r19+1024], {f1786, f1785}; +fma.rn.f32 f1787, f1565, f1486, f1568; +sub.f32 f1788, f2319, f1567; +st.shared.v2.f32 [r19+1280], {f1788, f1787}; +sub.f32 f1789, f2317, f1575; +fma.rn.f32 f1790, f1573, f1490, f1576; +st.shared.v2.f32 [r19+1536], {f1789, f1790}; +fma.rn.f32 f1791, f1581, f1494, f1584; +sub.f32 f1792, f2314, f1583; +st.shared.v2.f32 [r19+1792], {f1792, f1791}; +fma.rn.f32 f1793, f1589, f1498, f1592; +sub.f32 f1794, f2312, f1591; +st.shared.v2.f32 [r19+2048], {f1794, f1793}; +fma.rn.f32 
f1795, f1597, f1502, f1600; +sub.f32 f1796, f2309, f1599; +st.shared.v2.f32 [r19+2304], {f1796, f1795}; +fma.rn.f32 f1797, f1605, f1506, f1608; +sub.f32 f1798, f2307, f1607; +st.shared.v2.f32 [r19+2560], {f1798, f1797}; +fma.rn.f32 f1799, f1613, f1510, f1616; +sub.f32 f1800, f2305, f1615; +st.shared.v2.f32 [r19+2816], {f1800, f1799}; +fma.rn.f32 f1801, f1621, f1514, f1624; +sub.f32 f1802, f2302, f1623; +st.shared.v2.f32 [r19+3072], {f1802, f1801}; +fma.rn.f32 f1803, f1629, f1518, f1632; +sub.f32 f1804, f2300, f1631; +st.shared.v2.f32 [r19+3328], {f1804, f1803}; +fma.rn.f32 f1805, f1637, f1522, f1640; +sub.f32 f1806, f2298, f1639; +st.shared.v2.f32 [r19+3584], {f1806, f1805}; +fma.rn.f32 f1807, f1645, f1526, f1648; +sub.f32 f1808, f2295, f1647; +st.shared.v2.f32 [r19+3840], {f1808, f1807}; +fma.rn.f32 f1809, f1653, f2665, f1656; +sub.f32 f1810, f2291, f2663; +st.shared.v2.f32 [r19+4096], {f1810, f1809}; +fma.rn.f32 f1811, f1661, f1472, f1664; +sub.f32 f1812, f2288, f1663; +st.shared.v2.f32 [r19+4352], {f1812, f1811}; +fma.rn.f32 f1813, f1669, f1476, f1672; +sub.f32 f1814, f2286, f1671; +st.shared.v2.f32 [r19+4608], {f1814, f1813}; +fma.rn.f32 f1815, f1677, f1480, f1680; +sub.f32 f1816, f2284, f1679; +st.shared.v2.f32 [r19+4864], {f1816, f1815}; +fma.rn.f32 f1817, f1685, f1484, f1688; +sub.f32 f1818, f2281, f1687; +st.shared.v2.f32 [r19+5120], {f1818, f1817}; +fma.rn.f32 f1819, f1693, f1488, f1696; +sub.f32 f1820, f2279, f1695; +st.shared.v2.f32 [r19+5376], {f1820, f1819}; +fma.rn.f32 f1821, f1701, f1492, f1704; +sub.f32 f1822, f2277, f1703; +st.shared.v2.f32 [r19+5632], {f1822, f1821}; +fma.rn.f32 f1823, f1709, f1496, f1712; +sub.f32 f1824, f2274, f1711; +st.shared.v2.f32 [r19+5888], {f1824, f1823}; +fma.rn.f32 f1825, f1717, f1500, f1720; +sub.f32 f1826, f2272, f1719; +st.shared.v2.f32 [r19+6144], {f1826, f1825}; +fma.rn.f32 f1827, f1725, f1504, f1728; +sub.f32 f1828, f2269, f1727; +st.shared.v2.f32 [r19+6400], {f1828, f1827}; +fma.rn.f32 f1829, f1733, f1508, f1736; 
+sub.f32 f1830, f2267, f1735; +st.shared.v2.f32 [r19+6656], {f1830, f1829}; +fma.rn.f32 f1831, f1741, f1512, f1744; +sub.f32 f1832, f2265, f1743; +st.shared.v2.f32 [r19+6912], {f1832, f1831}; +fma.rn.f32 f1833, f1749, f1516, f1752; +sub.f32 f1834, f2262, f1751; +st.shared.v2.f32 [r19+7168], {f1834, f1833}; +fma.rn.f32 f1835, f1757, f1520, f1760; +sub.f32 f1836, f2260, f1759; +st.shared.v2.f32 [r19+7424], {f1836, f1835}; +fma.rn.f32 f1837, f1765, f1524, f1768; +sub.f32 f1838, f2257, f1767; +st.shared.v2.f32 [r19+7680], {f1838, f1837}; +fma.rn.f32 f1839, f1773, f1528, f1776; +sub.f32 f1840, f1774, f1775; +st.shared.v2.f32 [r19+7936], {f1840, f1839}; +barrier.sync 0; +mad.lo.s32 r20, r36, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+2048]; +ld.shared.v2.f32 {f1849, f1850}, [r20+4096]; +ld.shared.v2.f32 {f1853, f1854}, [r20+6144]; +ld.shared.v2.f32 {f1857, f1858}, [r20+8192]; +ld.shared.v2.f32 {f1861, f1862}, [r20+10240]; +ld.shared.v2.f32 {f1865, f1866}, [r20+12288]; +ld.shared.v2.f32 {f1869, f1870}, [r20+14336]; +ld.shared.v2.f32 {f1873, f1874}, [r20+16384]; +ld.shared.v2.f32 {f1877, f1878}, [r20+18432]; +ld.shared.v2.f32 {f1881, f1882}, [r20+20480]; +ld.shared.v2.f32 {f1885, f1886}, [r20+22528]; +ld.shared.v2.f32 {f1889, f1890}, [r20+24576]; +ld.shared.v2.f32 {f1893, f1894}, [r20+26624]; +ld.shared.v2.f32 {f1897, f1898}, [r20+28672]; +ld.shared.v2.f32 {f1901, f1902}, [r20+30720]; +ld.shared.v2.f32 {f1905, f1906}, [r20+32768]; +ld.shared.v2.f32 {f1909, f1910}, [r20+34816]; +ld.shared.v2.f32 {f1913, f1914}, [r20+36864]; +ld.shared.v2.f32 {f1917, f1918}, [r20+38912]; +ld.shared.v2.f32 {f1921, f1922}, [r20+40960]; +ld.shared.v2.f32 {f1925, f1926}, [r20+43008]; +ld.shared.v2.f32 {f1929, f1930}, [r20+45056]; +ld.shared.v2.f32 {f1933, f1934}, [r20+47104]; +ld.shared.v2.f32 {f1937, f1938}, [r20+49152]; +ld.shared.v2.f32 {f1941, f1942}, [r20+51200]; +ld.shared.v2.f32 {f1945, f1946}, [r20+53248]; +ld.shared.v2.f32 {f1949, f1950}, 
[r20+55296]; +ld.shared.v2.f32 {f1953, f1954}, [r20+57344]; +ld.shared.v2.f32 {f1957, f1958}, [r20+59392]; +ld.shared.v2.f32 {f1961, f1962}, [r20+61440]; +ld.shared.v2.f32 {f1965, f1966}, [r20+63488]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2256, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2255, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1969, f1973; +sub.f32 f1979, f1969, f1973; +add.f32 f2254, f2256, f2255; +sub.f32 f1980, f2256, f2255; +add.f32 f1981, f1971, f1976; +sub.f32 f1983, f1971, f1976; +sub.f32 f2253, f1972, f1975; +add.f32 f1984, f1972, f1975; +add.f32 f1985, f1857, f1921; +sub.f32 f1987, f1857, f1921; +add.f32 f2252, f1858, f1922; +sub.f32 f1988, f1858, f1922; +add.f32 f1989, f1889, f1953; +sub.f32 f1991, f1889, f1953; +add.f32 f2251, f1890, f1954; +sub.f32 f1992, f1890, f1954; +add.f32 f1993, f1985, f1989; +sub.f32 f1995, f1985, f1989; +add.f32 f2250, f2252, f2251; +sub.f32 f1996, f2252, f2251; +add.f32 f1997, f1987, f1992; +sub.f32 f1999, f1987, f1992; +sub.f32 f2249, f1988, f1991; +add.f32 f2000, f1988, f1991; +mul.f32 f2002, f2249, 0fBF3504F3; +mul.f32 f2248, f1997, 0f3F3504F3; +sub.f32 f2003, f2248, f2002; +mul.f32 f2004, f2249, 0f3F3504F3; +fma.rn.f32 f2005, f1997, 0fBF3504F3, f2004; +mul.f32 f2006, f1999, 0fBF3504F3; +mul.f32 f2007, f2000, 0fBF3504F3; +sub.f32 f2008, f2006, f2007; +add.f32 f2009, f2006, f2007; +add.f32 f2010, f1845, f1909; +sub.f32 f2012, f1845, f1909; +add.f32 f2247, f1846, f1910; +sub.f32 f2013, f1846, f1910; +add.f32 f2014, f1877, f1941; +sub.f32 f2016, f1877, f1941; +add.f32 f2246, f1878, f1942; +sub.f32 f2017, f1878, f1942; +add.f32 f2018, f2010, f2014; +sub.f32 f2020, f2010, f2014; +add.f32 f2245, f2247, f2246; +sub.f32 f2021, f2247, f2246; +add.f32 f2022, f2012, f2017; +sub.f32 f2024, f2012, f2017; +sub.f32 f2244, f2013, f2016; +add.f32 f2025, f2013, f2016; +add.f32 f2026, f1861, f1925; +sub.f32 f2028, 
f1861, f1925; +add.f32 f2243, f1862, f1926; +sub.f32 f2029, f1862, f1926; +add.f32 f2030, f1893, f1957; +sub.f32 f2032, f1893, f1957; +add.f32 f2242, f1894, f1958; +sub.f32 f2033, f1894, f1958; +add.f32 f2034, f2026, f2030; +sub.f32 f2036, f2026, f2030; +add.f32 f2241, f2243, f2242; +sub.f32 f2037, f2243, f2242; +add.f32 f2038, f2028, f2033; +sub.f32 f2040, f2028, f2033; +sub.f32 f2240, f2029, f2032; +add.f32 f2041, f2029, f2032; +mul.f32 f2238, f2038, 0f3F3504F3; +mul.f32 f2239, f2240, 0fBF3504F3; +sub.f32 f2044, f2238, f2239; +mul.f32 f2045, f2240, 0f3F3504F3; +fma.rn.f32 f2046, f2038, 0fBF3504F3, f2045; +mul.f32 f2047, f2040, 0fBF3504F3; +mul.f32 f2048, f2041, 0fBF3504F3; +sub.f32 f2049, f2047, f2048; +add.f32 f2050, f2047, f2048; +add.f32 f2051, f1849, f1913; +sub.f32 f2053, f1849, f1913; +add.f32 f2237, f1850, f1914; +sub.f32 f2054, f1850, f1914; +add.f32 f2055, f1881, f1945; +sub.f32 f2057, f1881, f1945; +add.f32 f2236, f1882, f1946; +sub.f32 f2058, f1882, f1946; +add.f32 f2059, f2051, f2055; +sub.f32 f2061, f2051, f2055; +add.f32 f2235, f2237, f2236; +sub.f32 f2062, f2237, f2236; +add.f32 f2063, f2053, f2058; +sub.f32 f2065, f2053, f2058; +sub.f32 f2234, f2054, f2057; +add.f32 f2066, f2054, f2057; +add.f32 f2067, f1865, f1929; +sub.f32 f2069, f1865, f1929; +add.f32 f2233, f1866, f1930; +sub.f32 f2070, f1866, f1930; +add.f32 f2071, f1897, f1961; +sub.f32 f2073, f1897, f1961; +add.f32 f2232, f1898, f1962; +sub.f32 f2074, f1898, f1962; +add.f32 f2075, f2067, f2071; +sub.f32 f2077, f2067, f2071; +add.f32 f2231, f2233, f2232; +sub.f32 f2078, f2233, f2232; +add.f32 f2079, f2069, f2074; +sub.f32 f2081, f2069, f2074; +sub.f32 f2230, f2070, f2073; +add.f32 f2082, f2070, f2073; +mul.f32 f2084, f2230, 0fBF3504F3; +mul.f32 f2229, f2079, 0f3F3504F3; +sub.f32 f2085, f2229, f2084; +mul.f32 f2086, f2230, 0f3F3504F3; +fma.rn.f32 f2087, f2079, 0fBF3504F3, f2086; +mul.f32 f2088, f2081, 0fBF3504F3; +mul.f32 f2089, f2082, 0fBF3504F3; +sub.f32 f2090, f2088, f2089; +add.f32 f2091, 
f2088, f2089; +add.f32 f2092, f1853, f1917; +sub.f32 f2094, f1853, f1917; +add.f32 f2228, f1854, f1918; +sub.f32 f2095, f1854, f1918; +add.f32 f2096, f1885, f1949; +sub.f32 f2098, f1885, f1949; +add.f32 f2227, f1886, f1950; +sub.f32 f2099, f1886, f1950; +add.f32 f2100, f2092, f2096; +sub.f32 f2102, f2092, f2096; +add.f32 f2226, f2228, f2227; +sub.f32 f2103, f2228, f2227; +add.f32 f2104, f2094, f2099; +sub.f32 f2106, f2094, f2099; +sub.f32 f2225, f2095, f2098; +add.f32 f2107, f2095, f2098; +add.f32 f2108, f1869, f1933; +sub.f32 f2110, f1869, f1933; +add.f32 f2224, f1870, f1934; +sub.f32 f2111, f1870, f1934; +add.f32 f2112, f1901, f1965; +sub.f32 f2114, f1901, f1965; +add.f32 f2223, f1902, f1966; +sub.f32 f2115, f1902, f1966; +add.f32 f2116, f2108, f2112; +sub.f32 f2118, f2108, f2112; +add.f32 f2222, f2224, f2223; +sub.f32 f2119, f2224, f2223; +add.f32 f2120, f2110, f2115; +sub.f32 f2122, f2110, f2115; +sub.f32 f2221, f2111, f2114; +add.f32 f2123, f2111, f2114; +mul.f32 f2125, f2221, 0fBF3504F3; +mul.f32 f2220, f2120, 0f3F3504F3; +sub.f32 f2126, f2220, f2125; +mul.f32 f2127, f2221, 0f3F3504F3; +fma.rn.f32 f2128, f2120, 0fBF3504F3, f2127; +mul.f32 f2129, f2122, 0fBF3504F3; +mul.f32 f2130, f2123, 0fBF3504F3; +sub.f32 f2131, f2129, f2130; +add.f32 f2132, f2129, f2130; +add.f32 %0, f1977, f1993; +add.f32 %1, f2254, f2250; +add.f32 %2, f2018, f2034; +add.f32 %3, f2245, f2241; +add.f32 %5, f2235, f2231; +add.f32 %4, f2059, f2075; +add.f32 %7, f2226, f2222; +add.f32 %6, f2100, f2116; +add.f32 %9, f2253, f2005; +add.f32 %8, f1981, f2003; +add.f32 %10, f2022, f2044; +add.f32 %11, f2244, f2046; +add.f32 %12, f2063, f2085; +add.f32 %13, f2234, f2087; +add.f32 %14, f2104, f2126; +add.f32 %15, f2225, f2128; +sub.f32 %17, f1980, f1995; +add.f32 %16, f1979, f1996; +sub.f32 %19, f2021, f2036; +add.f32 %18, f2020, f2037; +sub.f32 %21, f2062, f2077; +add.f32 %20, f2061, f2078; +add.f32 %22, f2102, f2119; +sub.f32 %23, f2103, f2118; +add.f32 %24, f1983, f2008; +add.f32 %25, f1984, 
f2009; +add.f32 %26, f2024, f2049; +add.f32 %27, f2025, f2050; +add.f32 %28, f2065, f2090; +add.f32 %29, f2066, f2091; +add.f32 %31, f2107, f2132; +add.f32 %30, f2106, f2131; +sub.f32 %33, f2254, f2250; +sub.f32 %32, f1977, f1993; +sub.f32 %35, f2245, f2241; +sub.f32 %34, f2018, f2034; +sub.f32 %37, f2235, f2231; +sub.f32 %36, f2059, f2075; +sub.f32 %39, f2226, f2222; +sub.f32 %38, f2100, f2116; +sub.f32 %41, f2253, f2005; +sub.f32 %40, f1981, f2003; +sub.f32 %43, f2244, f2046; +sub.f32 %42, f2022, f2044; +sub.f32 %45, f2234, f2087; +sub.f32 %44, f2063, f2085; +sub.f32 %47, f2225, f2128; +sub.f32 %46, f2104, f2126; +add.f32 %49, f1980, f1995; +sub.f32 %48, f1979, f1996; +add.f32 %51, f2021, f2036; +sub.f32 %50, f2020, f2037; +add.f32 %53, f2062, f2077; +sub.f32 %52, f2061, f2078; +add.f32 %55, f2103, f2118; +sub.f32 %54, f2102, f2119; +sub.f32 %57, f1984, f2009; +sub.f32 %56, f1983, f2008; +sub.f32 %59, f2025, f2050; +sub.f32 %58, f2024, f2049; +sub.f32 %61, f2066, f2091; +sub.f32 %60, f2065, f2090; +sub.f32 %63, f2107, f2132; +sub.f32 %62, f2106, f2131; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), 
"=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_8192), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<111, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2564>; +.reg .b32 r<36>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; 
+sub.f32 f131, %67, %99; +add.f32 f2562, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2560, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2559, f2562, f2560; +sub.f32 f140, f2562, f2560; +add.f32 f141, f131, f136; +sub.f32 f143, f131, f136; +sub.f32 f2558, f132, f135; +add.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2555, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2553, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2552, f2555, f2553; +sub.f32 f156, f2555, f2553; +add.f32 f157, f147, f152; +sub.f32 f159, f147, f152; +sub.f32 f2551, f148, f151; +add.f32 f160, f148, f151; +mul.f32 f162, f2551, 0fBF3504F3; +mul.f32 f2550, f157, 0f3F3504F3; +sub.f32 f163, f2550, f162; +mul.f32 f164, f2551, 0f3F3504F3; +fma.rn.f32 f165, f157, 0fBF3504F3, f164; +mul.f32 f166, f159, 0fBF3504F3; +mul.f32 f167, f160, 0fBF3504F3; +sub.f32 f168, f166, f167; +add.f32 f169, f166, f167; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2549, f2559, f2552; +sub.f32 f173, f2559, f2552; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2548, f2558, f165; +sub.f32 f177, f2558, f165; +add.f32 f178, f139, f156; +sub.f32 f180, f139, f156; +sub.f32 f2547, f140, f155; +add.f32 f181, f140, f155; +add.f32 f182, f143, f168; +sub.f32 f184, f143, f168; +add.f32 f2546, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2544, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2541, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2540, f2544, f2541; +sub.f32 f197, f2544, f2541; +add.f32 f198, f188, f193; +sub.f32 f200, f188, f193; +sub.f32 f2539, f189, f192; +add.f32 f201, 
f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2537, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2535, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2534, f2537, f2535; +sub.f32 f213, f2537, f2535; +add.f32 f214, f204, f209; +sub.f32 f216, f204, f209; +sub.f32 f2533, f205, f208; +add.f32 f217, f205, f208; +mul.f32 f219, f2533, 0fBF3504F3; +mul.f32 f2532, f214, 0f3F3504F3; +sub.f32 f220, f2532, f219; +mul.f32 f221, f2533, 0f3F3504F3; +fma.rn.f32 f222, f214, 0fBF3504F3, f221; +mul.f32 f223, f216, 0fBF3504F3; +mul.f32 f224, f217, 0fBF3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2531, f2540, f2534; +sub.f32 f230, f2540, f2534; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2530, f2539, f222; +sub.f32 f234, f2539, f222; +add.f32 f235, f196, f213; +sub.f32 f237, f196, f213; +sub.f32 f2529, f197, f212; +add.f32 f238, f197, f212; +add.f32 f239, f200, f225; +sub.f32 f241, f200, f225; +add.f32 f2528, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2526, f231, 0f3F6C835E; +mul.f32 f2527, f2530, 0fBEC3EF15; +sub.f32 f245, f2526, f2527; +mul.f32 f246, f2530, 0f3F6C835E; +fma.rn.f32 f247, f231, 0fBEC3EF15, f246; +mul.f32 f2524, f235, 0f3F3504F3; +mul.f32 f2525, f2529, 0fBF3504F3; +sub.f32 f250, f2524, f2525; +mul.f32 f251, f2529, 0f3F3504F3; +fma.rn.f32 f252, f235, 0fBF3504F3, f251; +mul.f32 f2522, f239, 0f3EC3EF15; +mul.f32 f2523, f2528, 0fBF6C835E; +sub.f32 f255, f2522, f2523; +mul.f32 f256, f2528, 0f3EC3EF15; +fma.rn.f32 f257, f239, 0fBF6C835E, f256; +mul.f32 f2520, f233, 0fBEC3EF15; +mul.f32 f2521, f234, 0fBF6C835E; +sub.f32 f260, f2520, f2521; +mul.f32 f261, f234, 0fBEC3EF15; +fma.rn.f32 f262, f233, 0fBF6C835E, f261; +mul.f32 f263, f237, 0fBF3504F3; +mul.f32 f264, f238, 0fBF3504F3; +sub.f32 f265, f263, f264; +add.f32 f266, f263, f264; 
+mul.f32 f2518, f241, 0fBF6C835E; +mul.f32 f2519, f242, 0fBEC3EF15; +sub.f32 f269, f2518, f2519; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0fBEC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2517, f2549, f2531; +sub.f32 f275, f2549, f2531; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2516, f2548, f247; +sub.f32 f279, f2548, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2515, f2547, f252; +sub.f32 f283, f2547, f252; +add.f32 f284, f182, f255; +sub.f32 f286, f182, f255; +add.f32 f2514, f2546, f257; +sub.f32 f287, f2546, f257; +add.f32 f288, f172, f230; +sub.f32 f290, f172, f230; +sub.f32 f2513, f173, f229; +add.f32 f291, f173, f229; +add.f32 f292, f176, f260; +sub.f32 f294, f176, f260; +add.f32 f2512, f177, f262; +sub.f32 f295, f177, f262; +add.f32 f296, f180, f265; +sub.f32 f298, f180, f265; +add.f32 f2511, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2510, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2507, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2505, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2504, f2507, f2505; +sub.f32 f315, f2507, f2505; +add.f32 f316, f306, f311; +sub.f32 f318, f306, f311; +sub.f32 f2503, f307, f310; +add.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2501, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2498, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2497, f2501, f2498; +sub.f32 f331, f2501, f2498; +add.f32 f332, f322, f327; +sub.f32 f334, f322, f327; +sub.f32 f2496, f323, f326; +add.f32 f335, f323, f326; +mul.f32 f2494, f332, 0f3F3504F3; +mul.f32 f2495, f2496, 0fBF3504F3; 
+sub.f32 f338, f2494, f2495; +mul.f32 f339, f2496, 0f3F3504F3; +fma.rn.f32 f340, f332, 0fBF3504F3, f339; +mul.f32 f341, f334, 0fBF3504F3; +mul.f32 f342, f335, 0fBF3504F3; +sub.f32 f343, f341, f342; +add.f32 f344, f341, f342; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2493, f2504, f2497; +sub.f32 f348, f2504, f2497; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2492, f2503, f340; +sub.f32 f352, f2503, f340; +add.f32 f353, f314, f331; +sub.f32 f355, f314, f331; +sub.f32 f2491, f315, f330; +add.f32 f356, f315, f330; +add.f32 f357, f318, f343; +sub.f32 f359, f318, f343; +add.f32 f2490, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2488, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2486, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2485, f2488, f2486; +sub.f32 f372, f2488, f2486; +add.f32 f373, f363, f368; +sub.f32 f375, f363, f368; +sub.f32 f2484, f364, f367; +add.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2481, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2480, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2479, f2481, f2480; +sub.f32 f388, f2481, f2480; +add.f32 f389, f379, f384; +sub.f32 f391, f379, f384; +sub.f32 f2478, f380, f383; +add.f32 f392, f380, f383; +mul.f32 f2476, f389, 0f3F3504F3; +mul.f32 f2477, f2478, 0fBF3504F3; +sub.f32 f395, f2476, f2477; +mul.f32 f396, f2478, 0f3F3504F3; +fma.rn.f32 f397, f389, 0fBF3504F3, f396; +mul.f32 f398, f391, 0fBF3504F3; +mul.f32 f399, f392, 0fBF3504F3; +sub.f32 f400, f398, f399; +add.f32 f401, f398, f399; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2475, f2485, f2479; +sub.f32 f405, f2485, f2479; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; 
+add.f32 f2474, f2484, f397; +sub.f32 f409, f2484, f397; +add.f32 f410, f371, f388; +sub.f32 f412, f371, f388; +sub.f32 f2473, f372, f387; +add.f32 f413, f372, f387; +add.f32 f414, f375, f400; +sub.f32 f416, f375, f400; +add.f32 f2472, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2474, 0fBEC3EF15; +mul.f32 f2471, f406, 0f3F6C835E; +sub.f32 f420, f2471, f419; +mul.f32 f421, f2474, 0f3F6C835E; +fma.rn.f32 f422, f406, 0fBEC3EF15, f421; +mul.f32 f424, f2473, 0fBF3504F3; +mul.f32 f2470, f410, 0f3F3504F3; +sub.f32 f425, f2470, f424; +mul.f32 f426, f2473, 0f3F3504F3; +fma.rn.f32 f427, f410, 0fBF3504F3, f426; +mul.f32 f2468, f414, 0f3EC3EF15; +mul.f32 f2469, f2472, 0fBF6C835E; +sub.f32 f430, f2468, f2469; +mul.f32 f431, f2472, 0f3EC3EF15; +fma.rn.f32 f432, f414, 0fBF6C835E, f431; +mul.f32 f2466, f408, 0fBEC3EF15; +mul.f32 f2467, f409, 0fBF6C835E; +sub.f32 f435, f2466, f2467; +mul.f32 f436, f409, 0fBEC3EF15; +fma.rn.f32 f437, f408, 0fBF6C835E, f436; +mul.f32 f438, f412, 0fBF3504F3; +mul.f32 f439, f413, 0fBF3504F3; +sub.f32 f440, f438, f439; +add.f32 f441, f438, f439; +mul.f32 f443, f417, 0fBEC3EF15; +mul.f32 f2465, f416, 0fBF6C835E; +sub.f32 f444, f2465, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0fBEC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2464, f2493, f2475; +sub.f32 f450, f2493, f2475; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2463, f2492, f422; +sub.f32 f454, f2492, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2462, f2491, f427; +sub.f32 f458, f2491, f427; +add.f32 f459, f357, f430; +sub.f32 f461, f357, f430; +add.f32 f2461, f2490, f432; +sub.f32 f462, f2490, f432; +add.f32 f463, f347, f405; +sub.f32 f465, f347, f405; +sub.f32 f2460, f348, f404; +add.f32 f466, f348, f404; +add.f32 f467, f351, f435; +sub.f32 f469, f351, f435; +add.f32 f2459, f352, f437; +sub.f32 f470, f352, f437; +add.f32 f471, f355, f440; +sub.f32 f473, f355, f440; +add.f32 f2458, f356, 
f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2457, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2463, 0fBE47C5C2; +mul.f32 f2456, f451, 0f3F7B14BE; +sub.f32 f481, f2456, f480; +mul.f32 f482, f2463, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0fBE47C5C2, f482; +mul.f32 f485, f2462, 0fBEC3EF15; +mul.f32 f2455, f455, 0f3F6C835E; +sub.f32 f486, f2455, f485; +mul.f32 f487, f2462, 0f3F6C835E; +fma.rn.f32 f488, f455, 0fBEC3EF15, f487; +mul.f32 f490, f2461, 0fBF0E39DA; +mul.f32 f2454, f459, 0f3F54DB31; +sub.f32 f491, f2454, f490; +mul.f32 f492, f2461, 0f3F54DB31; +fma.rn.f32 f493, f459, 0fBF0E39DA, f492; +mul.f32 f495, f2460, 0fBF3504F3; +mul.f32 f2453, f463, 0f3F3504F3; +sub.f32 f496, f2453, f495; +mul.f32 f497, f2460, 0f3F3504F3; +fma.rn.f32 f498, f463, 0fBF3504F3, f497; +mul.f32 f2451, f467, 0f3F0E39DA; +mul.f32 f2452, f2459, 0fBF54DB31; +sub.f32 f501, f2451, f2452; +mul.f32 f502, f2459, 0f3F0E39DA; +fma.rn.f32 f503, f467, 0fBF54DB31, f502; +mul.f32 f2449, f471, 0f3EC3EF15; +mul.f32 f2450, f2458, 0fBF6C835E; +sub.f32 f506, f2449, f2450; +mul.f32 f507, f2458, 0f3EC3EF15; +fma.rn.f32 f508, f471, 0fBF6C835E, f507; +mul.f32 f2447, f475, 0f3E47C5C2; +mul.f32 f2448, f2457, 0fBF7B14BE; +sub.f32 f511, f2447, f2448; +mul.f32 f512, f2457, 0f3E47C5C2; +fma.rn.f32 f513, f475, 0fBF7B14BE, f512; +mul.f32 f2445, f453, 0fBE47C5C2; +mul.f32 f2446, f454, 0fBF7B14BE; +sub.f32 f516, f2445, f2446; +mul.f32 f517, f454, 0fBE47C5C2; +fma.rn.f32 f518, f453, 0fBF7B14BE, f517; +mul.f32 f520, f458, 0fBF6C835E; +mul.f32 f2444, f457, 0fBEC3EF15; +sub.f32 f521, f2444, f520; +mul.f32 f522, f458, 0fBEC3EF15; +fma.rn.f32 f523, f457, 0fBF6C835E, f522; +mul.f32 f525, f462, 0fBF54DB31; +mul.f32 f2443, f461, 0fBF0E39DA; +sub.f32 f526, f2443, f525; +mul.f32 f527, f462, 0fBF0E39DA; +fma.rn.f32 f528, f461, 0fBF54DB31, f527; +mul.f32 f529, f465, 0fBF3504F3; +mul.f32 f530, f466, 0fBF3504F3; +sub.f32 f531, f529, f530; +add.f32 f532, f529, f530; +mul.f32 f2441, 
f469, 0fBF54DB31; +mul.f32 f2442, f470, 0fBF0E39DA; +sub.f32 f535, f2441, f2442; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0fBF0E39DA, f536; +mul.f32 f539, f474, 0fBEC3EF15; +mul.f32 f2440, f473, 0fBF6C835E; +sub.f32 f540, f2440, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0fBEC3EF15, f541; +mul.f32 f544, f478, 0fBE47C5C2; +mul.f32 f2439, f477, 0fBF7B14BE; +sub.f32 f545, f2439, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0fBE47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2438, f2517, f2464; +sub.f32 f551, f2517, f2464; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2437, f2516, f483; +sub.f32 f555, f2516, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2436, f2515, f488; +sub.f32 f559, f2515, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2435, f2514, f493; +sub.f32 f563, f2514, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2434, f2513, f498; +sub.f32 f567, f2513, f498; +add.f32 f568, f292, f501; +sub.f32 f570, f292, f501; +add.f32 f2433, f2512, f503; +sub.f32 f571, f2512, f503; +add.f32 f572, f296, f506; +sub.f32 f574, f296, f506; +add.f32 f2432, f2511, f508; +sub.f32 f575, f2511, f508; +add.f32 f576, f300, f511; +sub.f32 f578, f300, f511; +add.f32 f2431, f2510, f513; +sub.f32 f579, f2510, f513; +add.f32 f580, f274, f450; +sub.f32 f582, f274, f450; +sub.f32 f2430, f275, f449; +add.f32 f583, f275, f449; +add.f32 f584, f278, f516; +sub.f32 f586, f278, f516; +add.f32 f2429, f279, f518; +sub.f32 f587, f279, f518; +add.f32 f588, f282, f521; +sub.f32 f590, f282, f521; +add.f32 f2428, f283, f523; +sub.f32 f591, f283, f523; +add.f32 f592, f286, f526; +sub.f32 f594, f286, f526; +add.f32 f2427, f287, f528; +sub.f32 f595, f287, f528; +add.f32 f596, f290, f531; +sub.f32 f598, f290, f531; +add.f32 f2426, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 
f2425, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2424, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2423, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f617, f613, f2437; +mul.f32 f2422, f612, f552; +sub.f32 f618, f2422, f617; +mul.f32 f619, f612, f2437; +fma.rn.f32 f620, f613, f552, f619; +mul.f32 f2420, f612, f612; +mul.f32 f2421, f613, f613; +sub.f32 f623, f2420, f2421; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f2418, f623, f556; +mul.f32 f2419, f625, f2436; +sub.f32 f628, f2418, f2419; +mul.f32 f629, f623, f2436; +fma.rn.f32 f630, f625, f556, f629; +mul.f32 f2416, f612, f623; +mul.f32 f2417, f613, f625; +sub.f32 f633, f2416, f2417; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f2414, f633, f560; +mul.f32 f2415, f635, f2435; +sub.f32 f638, f2414, f2415; +mul.f32 f639, f633, f2435; +fma.rn.f32 f640, f635, f560, f639; +mul.f32 f642, f613, f635; +mul.f32 f2413, f612, f633; +sub.f32 f643, f2413, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f647, f645, f2434; +mul.f32 f2412, f643, f564; +sub.f32 f648, f2412, f647; +mul.f32 f649, f643, f2434; +fma.rn.f32 f650, f645, f564, f649; +mul.f32 f652, f613, f645; +mul.f32 f2411, f612, f643; +sub.f32 f653, f2411, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f657, f655, f2433; +mul.f32 f2410, f653, f568; +sub.f32 f658, f2410, f657; +mul.f32 f659, f653, f2433; +fma.rn.f32 f660, f655, f568, f659; +mul.f32 f662, f613, f655; +mul.f32 f2409, f612, f653; +sub.f32 f663, f2409, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f2407, f663, f572; +mul.f32 f2408, f665, f2432; +sub.f32 f668, f2407, f2408; 
+mul.f32 f669, f663, f2432; +fma.rn.f32 f670, f665, f572, f669; +mul.f32 f2405, f612, f663; +mul.f32 f2406, f613, f665; +sub.f32 f673, f2405, f2406; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f2403, f673, f576; +mul.f32 f2404, f675, f2431; +sub.f32 f678, f2403, f2404; +mul.f32 f679, f673, f2431; +fma.rn.f32 f680, f675, f576, f679; +mul.f32 f2401, f612, f673; +mul.f32 f2402, f613, f675; +sub.f32 f683, f2401, f2402; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f687, f685, f2430; +mul.f32 f2400, f683, f580; +sub.f32 f688, f2400, f687; +mul.f32 f689, f683, f2430; +fma.rn.f32 f690, f685, f580, f689; +mul.f32 f692, f613, f685; +mul.f32 f2399, f612, f683; +sub.f32 f693, f2399, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f697, f695, f2429; +mul.f32 f2398, f693, f584; +sub.f32 f698, f2398, f697; +mul.f32 f699, f693, f2429; +fma.rn.f32 f700, f695, f584, f699; +mul.f32 f702, f613, f695; +mul.f32 f2397, f612, f693; +sub.f32 f703, f2397, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f707, f705, f2428; +mul.f32 f2396, f703, f588; +sub.f32 f708, f2396, f707; +mul.f32 f709, f703, f2428; +fma.rn.f32 f710, f705, f588, f709; +mul.f32 f2394, f612, f703; +mul.f32 f2395, f613, f705; +sub.f32 f713, f2394, f2395; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f2392, f713, f592; +mul.f32 f2393, f715, f2427; +sub.f32 f718, f2392, f2393; +mul.f32 f719, f713, f2427; +fma.rn.f32 f720, f715, f592, f719; +mul.f32 f2390, f612, f713; +mul.f32 f2391, f613, f715; +sub.f32 f723, f2390, f2391; +mul.f32 f724, f612, f715; +fma.rn.f32 f725, f613, f713, f724; +mul.f32 f2388, f723, f596; +mul.f32 f2389, f725, f2426; +sub.f32 f728, f2388, f2389; +mul.f32 f729, f723, f2426; +fma.rn.f32 f730, f725, f596, f729; +mul.f32 f732, f613, f725; +mul.f32 f2387, f612, f723; +sub.f32 f733, f2387, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 
f737, f735, f2425; +mul.f32 f2386, f733, f600; +sub.f32 f738, f2386, f737; +mul.f32 f739, f733, f2425; +fma.rn.f32 f740, f735, f600, f739; +mul.f32 f742, f613, f735; +mul.f32 f2385, f612, f733; +sub.f32 f743, f2385, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f747, f745, f2424; +mul.f32 f2384, f743, f604; +sub.f32 f748, f2384, f747; +mul.f32 f749, f743, f2424; +fma.rn.f32 f750, f745, f604, f749; +mul.f32 f752, f613, f745; +mul.f32 f2383, f612, f743; +sub.f32 f753, f2383, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f757, f755, f2423; +mul.f32 f2382, f753, f608; +sub.f32 f758, f2382, f757; +mul.f32 f759, f753, f2423; +fma.rn.f32 f760, f755, f608, f759; +mul.f32 f2380, f612, f753; +mul.f32 f2381, f613, f755; +sub.f32 f763, f2380, f2381; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f2378, f763, f550; +mul.f32 f2379, f765, f551; +sub.f32 f768, f2378, f2379; +mul.f32 f769, f763, f551; +fma.rn.f32 f770, f765, f550, f769; +mul.f32 f2376, f612, f763; +mul.f32 f2377, f613, f765; +sub.f32 f773, f2376, f2377; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f777, f775, f555; +mul.f32 f2375, f773, f554; +sub.f32 f778, f2375, f777; +mul.f32 f779, f773, f555; +fma.rn.f32 f780, f775, f554, f779; +mul.f32 f782, f613, f775; +mul.f32 f2374, f612, f773; +sub.f32 f783, f2374, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f787, f785, f559; +mul.f32 f2373, f783, f558; +sub.f32 f788, f2373, f787; +mul.f32 f789, f783, f559; +fma.rn.f32 f790, f785, f558, f789; +mul.f32 f792, f613, f785; +mul.f32 f2372, f612, f783; +sub.f32 f793, f2372, f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f797, f795, f563; +mul.f32 f2371, f793, f562; +sub.f32 f798, f2371, f797; +mul.f32 f799, f793, f563; +fma.rn.f32 f800, f795, f562, f799; +mul.f32 f802, f613, f795; +mul.f32 f2370, f612, f793; +sub.f32 f803, f2370, f802; +mul.f32 f804, 
f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f2368, f803, f566; +mul.f32 f2369, f805, f567; +sub.f32 f808, f2368, f2369; +mul.f32 f809, f803, f567; +fma.rn.f32 f810, f805, f566, f809; +mul.f32 f2366, f612, f803; +mul.f32 f2367, f613, f805; +sub.f32 f813, f2366, f2367; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f2364, f813, f570; +mul.f32 f2365, f815, f571; +sub.f32 f818, f2364, f2365; +mul.f32 f819, f813, f571; +fma.rn.f32 f820, f815, f570, f819; +mul.f32 f2362, f612, f813; +mul.f32 f2363, f613, f815; +sub.f32 f823, f2362, f2363; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f827, f825, f575; +mul.f32 f2361, f823, f574; +sub.f32 f828, f2361, f827; +mul.f32 f829, f823, f575; +fma.rn.f32 f830, f825, f574, f829; +mul.f32 f832, f613, f825; +mul.f32 f2360, f612, f823; +sub.f32 f833, f2360, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f837, f835, f579; +mul.f32 f2359, f833, f578; +sub.f32 f838, f2359, f837; +mul.f32 f839, f833, f579; +fma.rn.f32 f840, f835, f578, f839; +mul.f32 f842, f613, f835; +mul.f32 f2358, f612, f833; +sub.f32 f843, f2358, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f847, f845, f583; +mul.f32 f2357, f843, f582; +sub.f32 f848, f2357, f847; +mul.f32 f849, f843, f583; +fma.rn.f32 f850, f845, f582, f849; +mul.f32 f2355, f612, f843; +mul.f32 f2356, f613, f845; +sub.f32 f853, f2355, f2356; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f2353, f853, f586; +mul.f32 f2354, f855, f587; +sub.f32 f858, f2353, f2354; +mul.f32 f859, f853, f587; +fma.rn.f32 f860, f855, f586, f859; +mul.f32 f2351, f612, f853; +mul.f32 f2352, f613, f855; +sub.f32 f863, f2351, f2352; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f2349, f863, f590; +mul.f32 f2350, f865, f591; +sub.f32 f868, f2349, f2350; +mul.f32 f869, f863, f591; +fma.rn.f32 f870, f865, f590, f869; +mul.f32 f872, f613, f865; +mul.f32 
f2348, f612, f863; +sub.f32 f873, f2348, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f877, f875, f595; +mul.f32 f2347, f873, f594; +sub.f32 f878, f2347, f877; +mul.f32 f879, f873, f595; +fma.rn.f32 f880, f875, f594, f879; +mul.f32 f882, f613, f875; +mul.f32 f2346, f612, f873; +sub.f32 f883, f2346, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f887, f885, f599; +mul.f32 f2345, f883, f598; +sub.f32 f888, f2345, f887; +mul.f32 f889, f883, f599; +fma.rn.f32 f890, f885, f598, f889; +mul.f32 f892, f613, f885; +mul.f32 f2344, f612, f883; +sub.f32 f893, f2344, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f2342, f893, f602; +mul.f32 f2343, f895, f603; +sub.f32 f898, f2342, f2343; +mul.f32 f899, f893, f603; +fma.rn.f32 f900, f895, f602, f899; +mul.f32 f2340, f612, f893; +mul.f32 f2341, f613, f895; +sub.f32 f903, f2340, f2341; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f2338, f903, f606; +mul.f32 f2339, f905, f607; +sub.f32 f908, f2338, f2339; +mul.f32 f909, f903, f607; +fma.rn.f32 f910, f905, f606, f909; +mul.f32 f2336, f612, f903; +mul.f32 f2337, f613, f905; +sub.f32 f913, f2336, f2337; +mov.u32 r32, %tid.x; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f917, f915, f611; +mul.f32 f2335, f913, f610; +sub.f32 f918, f2335, f917; +mul.f32 f919, f913, f611; +fma.rn.f32 f920, f915, f610, f919; +shl.b32 r8, r32, 7; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32640; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f618, f628, f638}; +st.shared.v4.f32 [r12+16], {f648, f658, f668, f678}; +st.shared.v4.f32 [r12+32], {f688, f698, f708, f718}; +st.shared.v4.f32 [r12+48], {f728, f738, f748, f758}; +st.shared.v4.f32 [r12+64], {f768, f778, f788, f798}; +st.shared.v4.f32 [r12+80], {f808, f818, f828, f838}; +st.shared.v4.f32 [r12+96], {f848, f858, f868, f878}; +st.shared.v4.f32 [r12+112], {f888, 
f898, f908, f918}; +barrier.sync 0; +and.b32 r23, r32, 255; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+1024]; +ld.shared.f32 f923, [r13+2048]; +ld.shared.f32 f924, [r13+3072]; +ld.shared.f32 f925, [r13+4096]; +ld.shared.f32 f926, [r13+5120]; +ld.shared.f32 f927, [r13+6144]; +ld.shared.f32 f928, [r13+7168]; +ld.shared.f32 f929, [r13+8192]; +ld.shared.f32 f930, [r13+9216]; +ld.shared.f32 f931, [r13+10240]; +ld.shared.f32 f932, [r13+11264]; +ld.shared.f32 f933, [r13+12288]; +ld.shared.f32 f934, [r13+13312]; +ld.shared.f32 f935, [r13+14336]; +ld.shared.f32 f936, [r13+15360]; +ld.shared.f32 f937, [r13+16384]; +ld.shared.f32 f938, [r13+17408]; +ld.shared.f32 f939, [r13+18432]; +ld.shared.f32 f940, [r13+19456]; +ld.shared.f32 f941, [r13+20480]; +ld.shared.f32 f942, [r13+21504]; +ld.shared.f32 f943, [r13+22528]; +ld.shared.f32 f944, [r13+23552]; +ld.shared.f32 f945, [r13+24576]; +ld.shared.f32 f946, [r13+25600]; +ld.shared.f32 f947, [r13+26624]; +ld.shared.f32 f948, [r13+27648]; +ld.shared.f32 f949, [r13+28672]; +ld.shared.f32 f950, [r13+29696]; +ld.shared.f32 f951, [r13+30720]; +ld.shared.f32 f952, [r13+31744]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2438, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+1024]; +ld.shared.f32 f955, [r13+2048]; +ld.shared.f32 f956, [r13+3072]; +ld.shared.f32 f957, [r13+4096]; +ld.shared.f32 f958, [r13+5120]; +ld.shared.f32 f959, [r13+6144]; +ld.shared.f32 f960, [r13+7168]; +ld.shared.f32 f961, [r13+8192]; +ld.shared.f32 f962, [r13+9216]; +ld.shared.f32 f963, [r13+10240]; 
+ld.shared.f32 f964, [r13+11264]; +ld.shared.f32 f965, [r13+12288]; +ld.shared.f32 f966, [r13+13312]; +ld.shared.f32 f967, [r13+14336]; +ld.shared.f32 f968, [r13+15360]; +ld.shared.f32 f969, [r13+16384]; +ld.shared.f32 f970, [r13+17408]; +ld.shared.f32 f971, [r13+18432]; +ld.shared.f32 f972, [r13+19456]; +ld.shared.f32 f973, [r13+20480]; +ld.shared.f32 f974, [r13+21504]; +ld.shared.f32 f975, [r13+22528]; +ld.shared.f32 f976, [r13+23552]; +ld.shared.f32 f977, [r13+24576]; +ld.shared.f32 f978, [r13+25600]; +ld.shared.f32 f979, [r13+26624]; +ld.shared.f32 f980, [r13+27648]; +ld.shared.f32 f981, [r13+28672]; +ld.shared.f32 f982, [r13+29696]; +ld.shared.f32 f983, [r13+30720]; +ld.shared.f32 f984, [r13+31744]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2334, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2333, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2332, f2334, f2333; +sub.f32 f996, f2334, f2333; +add.f32 f997, f987, f992; +sub.f32 f999, f987, f992; +sub.f32 f2331, f988, f991; +add.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2330, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2329, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2328, f2330, f2329; +sub.f32 f1012, f2330, f2329; +add.f32 f1013, f1003, f1008; +sub.f32 f1015, f1003, f1008; +sub.f32 f2327, f1004, f1007; +add.f32 f1016, f1004, f1007; +mul.f32 f1018, f2327, 0fBF3504F3; +mul.f32 f2326, f1013, 0f3F3504F3; +sub.f32 f1019, f2326, f1018; +mul.f32 f1020, f2327, 0f3F3504F3; +fma.rn.f32 f1021, f1013, 0fBF3504F3, f1020; +mul.f32 f1022, f1015, 0fBF3504F3; +mul.f32 f1023, f1016, 0fBF3504F3; +sub.f32 f1024, f1022, f1023; +add.f32 f1025, f1022, f1023; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 
f2325, f2332, f2328; +sub.f32 f1029, f2332, f2328; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2324, f2331, f1021; +sub.f32 f1033, f2331, f1021; +add.f32 f1034, f995, f1012; +sub.f32 f1036, f995, f1012; +sub.f32 f2323, f996, f1011; +add.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1024; +sub.f32 f1040, f999, f1024; +add.f32 f2322, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2321, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2320, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2319, f2321, f2320; +sub.f32 f1053, f2321, f2320; +add.f32 f1054, f1044, f1049; +sub.f32 f1056, f1044, f1049; +sub.f32 f2318, f1045, f1048; +add.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2317, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2316, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2315, f2317, f2316; +sub.f32 f1069, f2317, f2316; +add.f32 f1070, f1060, f1065; +sub.f32 f1072, f1060, f1065; +sub.f32 f2314, f1061, f1064; +add.f32 f1073, f1061, f1064; +mul.f32 f1075, f2314, 0fBF3504F3; +mul.f32 f2313, f1070, 0f3F3504F3; +sub.f32 f1076, f2313, f1075; +mul.f32 f1077, f2314, 0f3F3504F3; +fma.rn.f32 f1078, f1070, 0fBF3504F3, f1077; +mul.f32 f1079, f1072, 0fBF3504F3; +mul.f32 f1080, f1073, 0fBF3504F3; +sub.f32 f1081, f1079, f1080; +add.f32 f1082, f1079, f1080; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2312, f2319, f2315; +sub.f32 f1086, f2319, f2315; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2311, f2318, f1078; +sub.f32 f1090, f2318, f1078; +add.f32 f1091, f1052, f1069; +sub.f32 f1093, f1052, f1069; +sub.f32 f2310, f1053, f1068; +add.f32 f1094, f1053, 
f1068; +add.f32 f1095, f1056, f1081; +sub.f32 f1097, f1056, f1081; +add.f32 f2309, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2307, f1087, 0f3F6C835E; +mul.f32 f2308, f2311, 0fBEC3EF15; +sub.f32 f1101, f2307, f2308; +mul.f32 f1102, f2311, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0fBEC3EF15, f1102; +mul.f32 f2305, f1091, 0f3F3504F3; +mul.f32 f2306, f2310, 0fBF3504F3; +sub.f32 f1106, f2305, f2306; +mul.f32 f1107, f2310, 0f3F3504F3; +fma.rn.f32 f1108, f1091, 0fBF3504F3, f1107; +mul.f32 f1110, f2309, 0fBF6C835E; +mul.f32 f2304, f1095, 0f3EC3EF15; +sub.f32 f1111, f2304, f1110; +mul.f32 f1112, f2309, 0f3EC3EF15; +fma.rn.f32 f1113, f1095, 0fBF6C835E, f1112; +mul.f32 f1115, f1090, 0fBF6C835E; +mul.f32 f2303, f1089, 0fBEC3EF15; +sub.f32 f1116, f2303, f1115; +mul.f32 f1117, f1090, 0fBEC3EF15; +fma.rn.f32 f1118, f1089, 0fBF6C835E, f1117; +mul.f32 f1119, f1093, 0fBF3504F3; +mul.f32 f1120, f1094, 0fBF3504F3; +sub.f32 f1121, f1119, f1120; +add.f32 f1122, f1119, f1120; +mul.f32 f2301, f1097, 0fBF6C835E; +mul.f32 f2302, f1098, 0fBEC3EF15; +sub.f32 f1125, f2301, f2302; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0fBEC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2300, f2325, f2312; +sub.f32 f1131, f2325, f2312; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2299, f2324, f1103; +sub.f32 f1135, f2324, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2298, f2323, f1108; +sub.f32 f1139, f2323, f1108; +add.f32 f1140, f1038, f1111; +sub.f32 f1142, f1038, f1111; +add.f32 f2297, f2322, f1113; +sub.f32 f1143, f2322, f1113; +add.f32 f1144, f1028, f1086; +sub.f32 f1146, f1028, f1086; +sub.f32 f2296, f1029, f1085; +add.f32 f1147, f1029, f1085; +add.f32 f1148, f1032, f1116; +sub.f32 f1150, f1032, f1116; +add.f32 f2295, f1033, f1118; +sub.f32 f1151, f1033, f1118; +add.f32 f1152, f1036, f1121; +sub.f32 f1154, f1036, f1121; +add.f32 f2294, f1037, f1122; +sub.f32 f1155, f1037, f1122; 
+add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2293, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2292, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2291, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2290, f2292, f2291; +sub.f32 f1171, f2292, f2291; +add.f32 f1172, f1162, f1167; +sub.f32 f1174, f1162, f1167; +sub.f32 f2289, f1163, f1166; +add.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2288, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2287, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2286, f2288, f2287; +sub.f32 f1187, f2288, f2287; +add.f32 f1188, f1178, f1183; +sub.f32 f1190, f1178, f1183; +sub.f32 f2285, f1179, f1182; +add.f32 f1191, f1179, f1182; +mul.f32 f1193, f2285, 0fBF3504F3; +mul.f32 f2284, f1188, 0f3F3504F3; +sub.f32 f1194, f2284, f1193; +mul.f32 f1195, f2285, 0f3F3504F3; +fma.rn.f32 f1196, f1188, 0fBF3504F3, f1195; +mul.f32 f1197, f1190, 0fBF3504F3; +mul.f32 f1198, f1191, 0fBF3504F3; +sub.f32 f1199, f1197, f1198; +add.f32 f1200, f1197, f1198; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2283, f2290, f2286; +sub.f32 f1204, f2290, f2286; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2282, f2289, f1196; +sub.f32 f1208, f2289, f1196; +add.f32 f1209, f1170, f1187; +sub.f32 f1211, f1170, f1187; +sub.f32 f2281, f1171, f1186; +add.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1199; +sub.f32 f1215, f1174, f1199; +add.f32 f2280, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2279, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, 
f948; +add.f32 f2278, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2277, f2279, f2278; +sub.f32 f1228, f2279, f2278; +add.f32 f1229, f1219, f1224; +sub.f32 f1231, f1219, f1224; +sub.f32 f2276, f1220, f1223; +add.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2275, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2274, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2273, f2275, f2274; +sub.f32 f1244, f2275, f2274; +add.f32 f1245, f1235, f1240; +sub.f32 f1247, f1235, f1240; +sub.f32 f2272, f1236, f1239; +add.f32 f1248, f1236, f1239; +mul.f32 f1250, f2272, 0fBF3504F3; +mul.f32 f2271, f1245, 0f3F3504F3; +sub.f32 f1251, f2271, f1250; +mul.f32 f1252, f2272, 0f3F3504F3; +fma.rn.f32 f1253, f1245, 0fBF3504F3, f1252; +mul.f32 f1254, f1247, 0fBF3504F3; +mul.f32 f1255, f1248, 0fBF3504F3; +sub.f32 f1256, f1254, f1255; +add.f32 f1257, f1254, f1255; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2270, f2277, f2273; +sub.f32 f1261, f2277, f2273; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2269, f2276, f1253; +sub.f32 f1265, f2276, f1253; +add.f32 f1266, f1227, f1244; +sub.f32 f1268, f1227, f1244; +sub.f32 f2268, f1228, f1243; +add.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1256; +sub.f32 f1272, f1231, f1256; +add.f32 f2267, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2265, f1262, 0f3F6C835E; +mul.f32 f2266, f2269, 0fBEC3EF15; +sub.f32 f1276, f2265, f2266; +mul.f32 f1277, f2269, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0fBEC3EF15, f1277; +mul.f32 f2263, f1266, 0f3F3504F3; +mul.f32 f2264, f2268, 0fBF3504F3; +sub.f32 f1281, f2263, f2264; +mul.f32 f1282, f2268, 0f3F3504F3; +fma.rn.f32 f1283, f1266, 0fBF3504F3, f1282; +mul.f32 f2261, f1270, 0f3EC3EF15; +mul.f32 f2262, f2267, 0fBF6C835E; +sub.f32 
f1286, f2261, f2262; +mul.f32 f1287, f2267, 0f3EC3EF15; +fma.rn.f32 f1288, f1270, 0fBF6C835E, f1287; +mul.f32 f2259, f1264, 0fBEC3EF15; +mul.f32 f2260, f1265, 0fBF6C835E; +sub.f32 f1291, f2259, f2260; +mul.f32 f1292, f1265, 0fBEC3EF15; +fma.rn.f32 f1293, f1264, 0fBF6C835E, f1292; +mul.f32 f1294, f1268, 0fBF3504F3; +mul.f32 f1295, f1269, 0fBF3504F3; +sub.f32 f1296, f1294, f1295; +add.f32 f1297, f1294, f1295; +mul.f32 f2257, f1272, 0fBF6C835E; +mul.f32 f2258, f1273, 0fBEC3EF15; +sub.f32 f1300, f2257, f2258; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0fBEC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2256, f2283, f2270; +sub.f32 f1306, f2283, f2270; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2255, f2282, f1278; +sub.f32 f1310, f2282, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2254, f2281, f1283; +sub.f32 f1314, f2281, f1283; +add.f32 f1315, f1213, f1286; +sub.f32 f1317, f1213, f1286; +add.f32 f2253, f2280, f1288; +sub.f32 f1318, f2280, f1288; +add.f32 f1319, f1203, f1261; +sub.f32 f1321, f1203, f1261; +sub.f32 f2252, f1204, f1260; +add.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1291; +sub.f32 f1325, f1207, f1291; +add.f32 f2251, f1208, f1293; +sub.f32 f1326, f1208, f1293; +add.f32 f1327, f1211, f1296; +sub.f32 f1329, f1211, f1296; +add.f32 f2250, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2249, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2255, 0fBE47C5C2; +mul.f32 f2248, f1307, 0f3F7B14BE; +sub.f32 f1337, f2248, f1336; +mul.f32 f1338, f2255, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0fBE47C5C2, f1338; +mul.f32 f1341, f2254, 0fBEC3EF15; +mul.f32 f2247, f1311, 0f3F6C835E; +sub.f32 f1342, f2247, f1341; +mul.f32 f1343, f2254, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0fBEC3EF15, f1343; +mul.f32 f2245, f1315, 0f3F54DB31; +mul.f32 f2246, f2253, 0fBF0E39DA; +sub.f32 f1347, 
f2245, f2246; +mul.f32 f1348, f2253, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0fBF0E39DA, f1348; +mul.f32 f2243, f1319, 0f3F3504F3; +mul.f32 f2244, f2252, 0fBF3504F3; +sub.f32 f1352, f2243, f2244; +mul.f32 f1353, f2252, 0f3F3504F3; +fma.rn.f32 f1354, f1319, 0fBF3504F3, f1353; +mul.f32 f2241, f1323, 0f3F0E39DA; +mul.f32 f2242, f2251, 0fBF54DB31; +sub.f32 f1357, f2241, f2242; +mul.f32 f1358, f2251, 0f3F0E39DA; +fma.rn.f32 f1359, f1323, 0fBF54DB31, f1358; +mul.f32 f2239, f1327, 0f3EC3EF15; +mul.f32 f2240, f2250, 0fBF6C835E; +sub.f32 f1362, f2239, f2240; +mul.f32 f1363, f2250, 0f3EC3EF15; +fma.rn.f32 f1364, f1327, 0fBF6C835E, f1363; +mul.f32 f1366, f2249, 0fBF7B14BE; +mul.f32 f2238, f1331, 0f3E47C5C2; +sub.f32 f1367, f2238, f1366; +mul.f32 f1368, f2249, 0f3E47C5C2; +fma.rn.f32 f1369, f1331, 0fBF7B14BE, f1368; +mul.f32 f1371, f1310, 0fBF7B14BE; +mul.f32 f2237, f1309, 0fBE47C5C2; +sub.f32 f1372, f2237, f1371; +mul.f32 f1373, f1310, 0fBE47C5C2; +fma.rn.f32 f1374, f1309, 0fBF7B14BE, f1373; +mul.f32 f1376, f1314, 0fBF6C835E; +mul.f32 f2236, f1313, 0fBEC3EF15; +sub.f32 f1377, f2236, f1376; +mul.f32 f1378, f1314, 0fBEC3EF15; +fma.rn.f32 f1379, f1313, 0fBF6C835E, f1378; +mul.f32 f1381, f1318, 0fBF54DB31; +mul.f32 f2235, f1317, 0fBF0E39DA; +sub.f32 f1382, f2235, f1381; +mul.f32 f1383, f1318, 0fBF0E39DA; +fma.rn.f32 f1384, f1317, 0fBF54DB31, f1383; +mul.f32 f1385, f1321, 0fBF3504F3; +mul.f32 f1386, f1322, 0fBF3504F3; +sub.f32 f1387, f1385, f1386; +add.f32 f1388, f1385, f1386; +mul.f32 f1390, f1326, 0fBF0E39DA; +mul.f32 f2234, f1325, 0fBF54DB31; +sub.f32 f1391, f2234, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0fBF0E39DA, f1392; +mul.f32 f1395, f1330, 0fBEC3EF15; +mul.f32 f2233, f1329, 0fBF6C835E; +sub.f32 f1396, f2233, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0fBEC3EF15, f1397; +mul.f32 f1400, f1334, 0fBE47C5C2; +mul.f32 f2232, f1333, 0fBF7B14BE; +sub.f32 f1401, f2232, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, 
f1333, 0fBE47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2231, f2300, f2256; +sub.f32 f1407, f2300, f2256; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2230, f2299, f1339; +sub.f32 f1411, f2299, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2229, f2298, f1344; +sub.f32 f1415, f2298, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2228, f2297, f1349; +sub.f32 f1419, f2297, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2227, f2296, f1354; +sub.f32 f1423, f2296, f1354; +add.f32 f1424, f1148, f1357; +sub.f32 f1426, f1148, f1357; +add.f32 f2226, f2295, f1359; +sub.f32 f1427, f2295, f1359; +add.f32 f1428, f1152, f1362; +sub.f32 f1430, f1152, f1362; +add.f32 f2225, f2294, f1364; +sub.f32 f1431, f2294, f1364; +add.f32 f1432, f1156, f1367; +sub.f32 f1434, f1156, f1367; +add.f32 f2224, f2293, f1369; +sub.f32 f1435, f2293, f1369; +add.f32 f1436, f1130, f1306; +sub.f32 f1438, f1130, f1306; +sub.f32 f2223, f1131, f1305; +add.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1372; +sub.f32 f1442, f1134, f1372; +add.f32 f2222, f1135, f1374; +sub.f32 f1443, f1135, f1374; +add.f32 f1444, f1138, f1377; +sub.f32 f1446, f1138, f1377; +add.f32 f2221, f1139, f1379; +sub.f32 f1447, f1139, f1379; +add.f32 f1448, f1142, f1382; +sub.f32 f1450, f1142, f1382; +add.f32 f2220, f1143, f1384; +sub.f32 f1451, f1143, f1384; +add.f32 f1452, f1146, f1387; +sub.f32 f1454, f1146, f1387; +add.f32 f2219, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2218, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, f1154, f1396; +add.f32 f2217, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2216, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r32, 5, 3; +mul.wide.u32 rd6, r15, 8; 
+mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1473, f1469, f2230; +mul.f32 f2215, f1468, f1408; +sub.f32 f1474, f2215, f1473; +mul.f32 f1475, f1468, f2230; +fma.rn.f32 f1476, f1469, f1408, f1475; +mul.f32 f1478, f1469, f1469; +mul.f32 f2214, f1468, f1468; +sub.f32 f1479, f2214, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1483, f1481, f2229; +mul.f32 f2213, f1479, f1412; +sub.f32 f1484, f2213, f1483; +mul.f32 f1485, f1479, f2229; +fma.rn.f32 f1486, f1481, f1412, f1485; +mul.f32 f2211, f1468, f1479; +mul.f32 f2212, f1469, f1481; +sub.f32 f1489, f2211, f2212; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f2209, f1489, f1416; +mul.f32 f2210, f1491, f2228; +sub.f32 f1494, f2209, f2210; +mul.f32 f1495, f1489, f2228; +fma.rn.f32 f1496, f1491, f1416, f1495; +mul.f32 f2207, f1468, f1489; +mul.f32 f2208, f1469, f1491; +sub.f32 f1499, f2207, f2208; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f2205, f1499, f1420; +mul.f32 f2206, f1501, f2227; +sub.f32 f1504, f2205, f2206; +mul.f32 f1505, f1499, f2227; +fma.rn.f32 f1506, f1501, f1420, f1505; +mul.f32 f1508, f1469, f1501; +mul.f32 f2204, f1468, f1499; +sub.f32 f1509, f2204, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1513, f1511, f2226; +mul.f32 f2203, f1509, f1424; +sub.f32 f1514, f2203, f1513; +mul.f32 f1515, f1509, f2226; +fma.rn.f32 f1516, f1511, f1424, f1515; +mul.f32 f1518, f1469, f1511; +mul.f32 f2202, f1468, f1509; +sub.f32 f1519, f2202, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1523, f1521, f2225; +mul.f32 f2201, f1519, f1428; +sub.f32 f1524, f2201, f1523; +mul.f32 f1525, f1519, f2225; +fma.rn.f32 f1526, f1521, f1428, f1525; +mul.f32 f1528, f1469, f1521; +mul.f32 f2200, f1468, f1519; +sub.f32 f1529, f2200, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, 
f1530; +mul.f32 f2198, f1529, f1432; +mul.f32 f2199, f1531, f2224; +sub.f32 f1534, f2198, f2199; +mul.f32 f1535, f1529, f2224; +fma.rn.f32 f1536, f1531, f1432, f1535; +mul.f32 f2196, f1468, f1529; +mul.f32 f2197, f1469, f1531; +sub.f32 f1539, f2196, f2197; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f2194, f1539, f1436; +mul.f32 f2195, f1541, f2223; +sub.f32 f1544, f2194, f2195; +mul.f32 f1545, f1539, f2223; +fma.rn.f32 f1546, f1541, f1436, f1545; +mul.f32 f2192, f1468, f1539; +mul.f32 f2193, f1469, f1541; +sub.f32 f1549, f2192, f2193; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1553, f1551, f2222; +mul.f32 f2191, f1549, f1440; +sub.f32 f1554, f2191, f1553; +mul.f32 f1555, f1549, f2222; +fma.rn.f32 f1556, f1551, f1440, f1555; +mul.f32 f1558, f1469, f1551; +mul.f32 f2190, f1468, f1549; +sub.f32 f1559, f2190, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1563, f1561, f2221; +mul.f32 f2189, f1559, f1444; +sub.f32 f1564, f2189, f1563; +mul.f32 f1565, f1559, f2221; +fma.rn.f32 f1566, f1561, f1444, f1565; +mul.f32 f1568, f1469, f1561; +mul.f32 f2188, f1468, f1559; +sub.f32 f1569, f2188, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1573, f1571, f2220; +mul.f32 f2187, f1569, f1448; +sub.f32 f1574, f2187, f1573; +mul.f32 f1575, f1569, f2220; +fma.rn.f32 f1576, f1571, f1448, f1575; +mul.f32 f1578, f1469, f1571; +mul.f32 f2186, f1468, f1569; +sub.f32 f1579, f2186, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f2184, f1579, f1452; +mul.f32 f2185, f1581, f2219; +sub.f32 f1584, f2184, f2185; +mul.f32 f1585, f1579, f2219; +fma.rn.f32 f1586, f1581, f1452, f1585; +mul.f32 f2182, f1468, f1579; +mul.f32 f2183, f1469, f1581; +sub.f32 f1589, f2182, f2183; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f2180, f1589, f1456; +mul.f32 f2181, f1591, f2218; +sub.f32 
f1594, f2180, f2181; +mul.f32 f1595, f1589, f2218; +fma.rn.f32 f1596, f1591, f1456, f1595; +mul.f32 f1598, f1469, f1591; +mul.f32 f2179, f1468, f1589; +sub.f32 f1599, f2179, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1603, f1601, f2217; +mul.f32 f2178, f1599, f1460; +sub.f32 f1604, f2178, f1603; +mul.f32 f1605, f1599, f2217; +fma.rn.f32 f1606, f1601, f1460, f1605; +mul.f32 f1608, f1469, f1601; +mul.f32 f2177, f1468, f1599; +sub.f32 f1609, f2177, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1613, f1611, f2216; +mul.f32 f2176, f1609, f1464; +sub.f32 f1614, f2176, f1613; +mul.f32 f1615, f1609, f2216; +fma.rn.f32 f1616, f1611, f1464, f1615; +mul.f32 f1618, f1469, f1611; +mul.f32 f2175, f1468, f1609; +sub.f32 f1619, f2175, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1623, f1621, f1407; +mul.f32 f2174, f1619, f1406; +sub.f32 f1624, f2174, f1623; +mul.f32 f1625, f1619, f1407; +fma.rn.f32 f1626, f1621, f1406, f1625; +mul.f32 f2172, f1468, f1619; +mul.f32 f2173, f1469, f1621; +sub.f32 f1629, f2172, f2173; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f2170, f1629, f1410; +mul.f32 f2171, f1631, f1411; +sub.f32 f1634, f2170, f2171; +mul.f32 f1635, f1629, f1411; +fma.rn.f32 f1636, f1631, f1410, f1635; +mul.f32 f2168, f1468, f1629; +mul.f32 f2169, f1469, f1631; +sub.f32 f1639, f2168, f2169; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f2166, f1639, f1414; +mul.f32 f2167, f1641, f1415; +sub.f32 f1644, f2166, f2167; +mul.f32 f1645, f1639, f1415; +fma.rn.f32 f1646, f1641, f1414, f1645; +mul.f32 f1648, f1469, f1641; +mul.f32 f2165, f1468, f1639; +sub.f32 f1649, f2165, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1653, f1651, f1419; +mul.f32 f2164, f1649, f1418; +sub.f32 f1654, f2164, f1653; +mul.f32 f1655, f1649, f1419; +fma.rn.f32 f1656, f1651, 
f1418, f1655; +mul.f32 f1658, f1469, f1651; +mul.f32 f2163, f1468, f1649; +sub.f32 f1659, f2163, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1663, f1661, f1423; +mul.f32 f2162, f1659, f1422; +sub.f32 f1664, f2162, f1663; +mul.f32 f1665, f1659, f1423; +fma.rn.f32 f1666, f1661, f1422, f1665; +mul.f32 f1668, f1469, f1661; +mul.f32 f2161, f1468, f1659; +sub.f32 f1669, f2161, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f2159, f1669, f1426; +mul.f32 f2160, f1671, f1427; +sub.f32 f1674, f2159, f2160; +mul.f32 f1675, f1669, f1427; +fma.rn.f32 f1676, f1671, f1426, f1675; +mul.f32 f2157, f1468, f1669; +mul.f32 f2158, f1469, f1671; +sub.f32 f1679, f2157, f2158; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f2155, f1679, f1430; +mul.f32 f2156, f1681, f1431; +sub.f32 f1684, f2155, f2156; +mul.f32 f1685, f1679, f1431; +fma.rn.f32 f1686, f1681, f1430, f1685; +mul.f32 f2153, f1468, f1679; +mul.f32 f2154, f1469, f1681; +sub.f32 f1689, f2153, f2154; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1693, f1691, f1435; +mul.f32 f2152, f1689, f1434; +sub.f32 f1694, f2152, f1693; +mul.f32 f1695, f1689, f1435; +fma.rn.f32 f1696, f1691, f1434, f1695; +mul.f32 f1698, f1469, f1691; +mul.f32 f2151, f1468, f1689; +sub.f32 f1699, f2151, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1703, f1701, f1439; +mul.f32 f2150, f1699, f1438; +sub.f32 f1704, f2150, f1703; +mul.f32 f1705, f1699, f1439; +fma.rn.f32 f1706, f1701, f1438, f1705; +mul.f32 f1708, f1469, f1701; +mul.f32 f2149, f1468, f1699; +sub.f32 f1709, f2149, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, f1469, f1699, f1710; +mul.f32 f1713, f1711, f1443; +mul.f32 f2148, f1709, f1442; +sub.f32 f1714, f2148, f1713; +mul.f32 f1715, f1709, f1443; +fma.rn.f32 f1716, f1711, f1442, f1715; +mul.f32 f2146, f1468, f1709; +mul.f32 f2147, f1469, f1711; 
+sub.f32 f1719, f2146, f2147; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f2144, f1719, f1446; +mul.f32 f2145, f1721, f1447; +sub.f32 f1724, f2144, f2145; +mul.f32 f1725, f1719, f1447; +fma.rn.f32 f1726, f1721, f1446, f1725; +mul.f32 f2142, f1468, f1719; +mul.f32 f2143, f1469, f1721; +sub.f32 f1729, f2142, f2143; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f2140, f1729, f1450; +mul.f32 f2141, f1731, f1451; +sub.f32 f1734, f2140, f2141; +mul.f32 f1735, f1729, f1451; +fma.rn.f32 f1736, f1731, f1450, f1735; +mul.f32 f1738, f1469, f1731; +mul.f32 f2139, f1468, f1729; +sub.f32 f1739, f2139, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1743, f1741, f1455; +mul.f32 f2138, f1739, f1454; +sub.f32 f1744, f2138, f1743; +mul.f32 f1745, f1739, f1455; +fma.rn.f32 f1746, f1741, f1454, f1745; +mul.f32 f1748, f1469, f1741; +mul.f32 f2137, f1468, f1739; +sub.f32 f1749, f2137, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1753, f1751, f1459; +mul.f32 f2136, f1749, f1458; +sub.f32 f1754, f2136, f1753; +mul.f32 f1755, f1749, f1459; +fma.rn.f32 f1756, f1751, f1458, f1755; +mul.f32 f1758, f1469, f1751; +mul.f32 f2135, f1468, f1749; +sub.f32 f1759, f2135, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f2133, f1759, f1462; +mul.f32 f2134, f1761, f1463; +sub.f32 f1764, f2133, f2134; +mul.f32 f1765, f1759, f1463; +fma.rn.f32 f1766, f1761, f1462, f1765; +mul.f32 f2131, f1468, f1759; +mul.f32 f2132, f1469, f1761; +sub.f32 f1769, f2131, f2132; +mov.u32 r28, %tid.x; +shl.b32 r27, r28, 7; +mul.f32 f1770, f1468, f1761; +mov.u32 r31, %tid.x; +shl.b32 r30, r31, 7; +fma.rn.f32 f1771, f1469, f1759, f1770; +mul.f32 f2129, f1769, f1466; +mul.f32 f2130, f1771, f1467; +sub.f32 f1774, f2129, f2130; +mul.f32 f1775, f1769, f1467; +mov.u32 r33, %tid.x; +fma.rn.f32 f1776, f1771, f1466, f1775; +and.b32 r22, r33, 224; 
+shl.b32 r16, r33, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r30, 28672; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1474; +st.shared.f32 [r20+256], f1484; +st.shared.f32 [r20+384], f1494; +st.shared.f32 [r20+512], f1504; +st.shared.f32 [r20+640], f1514; +st.shared.f32 [r20+768], f1524; +st.shared.f32 [r20+896], f1534; +st.shared.f32 [r20+1024], f1544; +st.shared.f32 [r20+1152], f1554; +st.shared.f32 [r20+1280], f1564; +st.shared.f32 [r20+1408], f1574; +st.shared.f32 [r20+1536], f1584; +st.shared.f32 [r20+1664], f1594; +st.shared.f32 [r20+1792], f1604; +st.shared.f32 [r20+1920], f1614; +st.shared.f32 [r20+2048], f1624; +st.shared.f32 [r20+2176], f1634; +st.shared.f32 [r20+2304], f1644; +st.shared.f32 [r20+2432], f1654; +st.shared.f32 [r20+2560], f1664; +st.shared.f32 [r20+2688], f1674; +st.shared.f32 [r20+2816], f1684; +st.shared.f32 [r20+2944], f1694; +st.shared.f32 [r20+3072], f1704; +st.shared.f32 [r20+3200], f1714; +st.shared.f32 [r20+3328], f1724; +st.shared.f32 [r20+3456], f1734; +st.shared.f32 [r20+3584], f1744; +st.shared.f32 [r20+3712], f1754; +st.shared.f32 [r20+3840], f1764; +st.shared.f32 [r20+3968], f1774; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+1024]; +ld.shared.f32 f1779, [r21+2048]; +ld.shared.f32 f1780, [r21+3072]; +ld.shared.f32 f1781, [r21+4096]; +ld.shared.f32 f1782, [r21+5120]; +ld.shared.f32 f1783, [r21+6144]; +ld.shared.f32 f1784, [r21+7168]; +ld.shared.f32 f1785, [r21+8192]; +ld.shared.f32 f1786, [r21+9216]; +ld.shared.f32 f1787, [r21+10240]; +ld.shared.f32 f1788, [r21+11264]; +ld.shared.f32 f1789, [r21+12288]; +ld.shared.f32 f1790, [r21+13312]; +ld.shared.f32 f1791, [r21+14336]; +ld.shared.f32 f1792, [r21+15360]; +ld.shared.f32 f1793, [r21+16384]; +ld.shared.f32 f1794, [r21+17408]; +ld.shared.f32 f1795, [r21+18432]; +ld.shared.f32 f1796, [r21+19456]; +ld.shared.f32 f1797, [r21+20480]; +ld.shared.f32 
f1798, [r21+21504]; +ld.shared.f32 f1799, [r21+22528]; +ld.shared.f32 f1800, [r21+23552]; +ld.shared.f32 f1801, [r21+24576]; +ld.shared.f32 f1802, [r21+25600]; +ld.shared.f32 f1803, [r21+26624]; +ld.shared.f32 f1804, [r21+27648]; +ld.shared.f32 f1805, [r21+28672]; +ld.shared.f32 f1806, [r21+29696]; +ld.shared.f32 f1807, [r21+30720]; +ld.shared.f32 f1808, [r21+31744]; +barrier.sync 0; +st.shared.f32 [r20], f2231; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+1024]; +ld.shared.f32 f1811, [r21+2048]; +ld.shared.f32 f1812, [r21+3072]; +ld.shared.f32 f1813, [r21+4096]; +ld.shared.f32 f1814, [r21+5120]; +ld.shared.f32 f1815, [r21+6144]; +ld.shared.f32 f1816, [r21+7168]; +ld.shared.f32 f1817, [r21+8192]; +ld.shared.f32 f1818, [r21+9216]; +ld.shared.f32 f1819, [r21+10240]; +ld.shared.f32 f1820, [r21+11264]; +ld.shared.f32 f1821, [r21+12288]; +ld.shared.f32 f1822, [r21+13312]; +ld.shared.f32 f1823, [r21+14336]; 
+ld.shared.f32 f1824, [r21+15360]; +ld.shared.f32 f1825, [r21+16384]; +ld.shared.f32 f1826, [r21+17408]; +ld.shared.f32 f1827, [r21+18432]; +ld.shared.f32 f1828, [r21+19456]; +ld.shared.f32 f1829, [r21+20480]; +ld.shared.f32 f1830, [r21+21504]; +ld.shared.f32 f1831, [r21+22528]; +ld.shared.f32 f1832, [r21+23552]; +ld.shared.f32 f1833, [r21+24576]; +ld.shared.f32 f1834, [r21+25600]; +ld.shared.f32 f1835, [r21+26624]; +ld.shared.f32 f1836, [r21+27648]; +ld.shared.f32 f1837, [r21+28672]; +ld.shared.f32 f1838, [r21+29696]; +ld.shared.f32 f1839, [r21+30720]; +ld.shared.f32 f1840, [r21+31744]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2128, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2127, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2126, f2128, f2127; +sub.f32 f1852, f2128, f2127; +add.f32 f1853, f1843, f1848; +sub.f32 f1855, f1843, f1848; +sub.f32 f2125, f1844, f1847; +add.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2124, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2123, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2122, f2124, f2123; +sub.f32 f1868, f2124, f2123; +add.f32 f1869, f1859, f1864; +sub.f32 f1871, f1859, f1864; +sub.f32 f2121, f1860, f1863; +add.f32 f1872, f1860, f1863; +mul.f32 f2119, f1869, 0f3F3504F3; +mul.f32 f2120, f2121, 0fBF3504F3; +sub.f32 f1875, f2119, f2120; +mul.f32 f1876, f2121, 0f3F3504F3; +fma.rn.f32 f1877, f1869, 0fBF3504F3, f1876; +mul.f32 f1878, f1871, 0fBF3504F3; +mul.f32 f1879, f1872, 0fBF3504F3; +sub.f32 f1880, f1878, f1879; +add.f32 f1881, f1878, f1879; +add.f32 f1882, f1778, f1794; +sub.f32 f1884, f1778, f1794; +add.f32 f2118, f1810, f1826; +sub.f32 f1885, f1810, f1826; +add.f32 
f1886, f1786, f1802; +sub.f32 f1888, f1786, f1802; +add.f32 f2117, f1818, f1834; +sub.f32 f1889, f1818, f1834; +add.f32 f1890, f1882, f1886; +sub.f32 f1892, f1882, f1886; +add.f32 f2116, f2118, f2117; +sub.f32 f1893, f2118, f2117; +add.f32 f1894, f1884, f1889; +sub.f32 f1896, f1884, f1889; +sub.f32 f2115, f1885, f1888; +add.f32 f1897, f1885, f1888; +add.f32 f1898, f1782, f1798; +sub.f32 f1900, f1782, f1798; +add.f32 f2114, f1814, f1830; +sub.f32 f1901, f1814, f1830; +add.f32 f1902, f1790, f1806; +sub.f32 f1904, f1790, f1806; +add.f32 f2113, f1822, f1838; +sub.f32 f1905, f1822, f1838; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2112, f2114, f2113; +sub.f32 f1909, f2114, f2113; +add.f32 f1910, f1900, f1905; +sub.f32 f1912, f1900, f1905; +sub.f32 f2111, f1901, f1904; +add.f32 f1913, f1901, f1904; +mul.f32 f1915, f2111, 0fBF3504F3; +mul.f32 f2110, f1910, 0f3F3504F3; +sub.f32 f1916, f2110, f1915; +mul.f32 f1917, f2111, 0f3F3504F3; +fma.rn.f32 f1918, f1910, 0fBF3504F3, f1917; +mul.f32 f1919, f1912, 0fBF3504F3; +mul.f32 f1920, f1913, 0fBF3504F3; +sub.f32 f1921, f1919, f1920; +add.f32 f1922, f1919, f1920; +add.f32 f1923, f1779, f1795; +sub.f32 f1925, f1779, f1795; +add.f32 f2109, f1811, f1827; +sub.f32 f1926, f1811, f1827; +add.f32 f1927, f1787, f1803; +sub.f32 f1929, f1787, f1803; +add.f32 f2108, f1819, f1835; +sub.f32 f1930, f1819, f1835; +add.f32 f1931, f1923, f1927; +sub.f32 f1933, f1923, f1927; +add.f32 f2107, f2109, f2108; +sub.f32 f1934, f2109, f2108; +add.f32 f1935, f1925, f1930; +sub.f32 f1937, f1925, f1930; +sub.f32 f2106, f1926, f1929; +add.f32 f1938, f1926, f1929; +add.f32 f1939, f1783, f1799; +sub.f32 f1941, f1783, f1799; +add.f32 f2105, f1815, f1831; +sub.f32 f1942, f1815, f1831; +add.f32 f1943, f1791, f1807; +sub.f32 f1945, f1791, f1807; +add.f32 f2104, f1823, f1839; +sub.f32 f1946, f1823, f1839; +add.f32 f1947, f1939, f1943; +sub.f32 f1949, f1939, f1943; +add.f32 f2103, f2105, f2104; +sub.f32 f1950, f2105, f2104; +add.f32 f1951, 
f1941, f1946; +sub.f32 f1953, f1941, f1946; +sub.f32 f2102, f1942, f1945; +add.f32 f1954, f1942, f1945; +mul.f32 f1956, f2102, 0fBF3504F3; +mul.f32 f2101, f1951, 0f3F3504F3; +sub.f32 f1957, f2101, f1956; +mul.f32 f1958, f2102, 0f3F3504F3; +fma.rn.f32 f1959, f1951, 0fBF3504F3, f1958; +mul.f32 f1960, f1953, 0fBF3504F3; +mul.f32 f1961, f1954, 0fBF3504F3; +sub.f32 f1962, f1960, f1961; +add.f32 f1963, f1960, f1961; +add.f32 f1964, f1780, f1796; +sub.f32 f1966, f1780, f1796; +add.f32 f2100, f1812, f1828; +sub.f32 f1967, f1812, f1828; +add.f32 f1968, f1788, f1804; +sub.f32 f1970, f1788, f1804; +add.f32 f2099, f1820, f1836; +sub.f32 f1971, f1820, f1836; +add.f32 f1972, f1964, f1968; +sub.f32 f1974, f1964, f1968; +add.f32 f2098, f2100, f2099; +sub.f32 f1975, f2100, f2099; +add.f32 f1976, f1966, f1971; +sub.f32 f1978, f1966, f1971; +sub.f32 f2097, f1967, f1970; +add.f32 f1979, f1967, f1970; +add.f32 f1980, f1784, f1800; +sub.f32 f1982, f1784, f1800; +add.f32 f2096, f1816, f1832; +sub.f32 f1983, f1816, f1832; +add.f32 f1984, f1792, f1808; +sub.f32 f1986, f1792, f1808; +add.f32 f2095, f1824, f1840; +sub.f32 f1987, f1824, f1840; +add.f32 f1988, f1980, f1984; +sub.f32 f1990, f1980, f1984; +add.f32 f2094, f2096, f2095; +sub.f32 f1991, f2096, f2095; +add.f32 f1992, f1982, f1987; +sub.f32 f1994, f1982, f1987; +sub.f32 f2093, f1983, f1986; +add.f32 f1995, f1983, f1986; +mul.f32 f2091, f1992, 0f3F3504F3; +mul.f32 f2092, f2093, 0fBF3504F3; +sub.f32 f1998, f2091, f2092; +mul.f32 f1999, f2093, 0f3F3504F3; +fma.rn.f32 f2000, f1992, 0fBF3504F3, f1999; +mul.f32 f2001, f1994, 0fBF3504F3; +mul.f32 f2002, f1995, 0fBF3504F3; +sub.f32 f2003, f2001, f2002; +add.f32 f2004, f2001, f2002; +add.f32 %0, f1849, f1865; +add.f32 %1, f2126, f2122; +add.f32 %3, f2116, f2112; +add.f32 %2, f1890, f1906; +add.f32 %5, f2107, f2103; +add.f32 %4, f1931, f1947; +add.f32 %7, f2098, f2094; +add.f32 %6, f1972, f1988; +add.f32 %8, f1853, f1875; +add.f32 %9, f2125, f1877; +add.f32 %11, f2115, f1918; +add.f32 %10, 
f1894, f1916; +add.f32 %13, f2106, f1959; +add.f32 %12, f1935, f1957; +add.f32 %14, f1976, f1998; +add.f32 %15, f2097, f2000; +add.f32 %16, f1851, f1868; +sub.f32 %17, f1852, f1867; +add.f32 %18, f1892, f1909; +sub.f32 %19, f1893, f1908; +add.f32 %20, f1933, f1950; +sub.f32 %21, f1934, f1949; +sub.f32 %23, f1975, f1990; +add.f32 %22, f1974, f1991; +add.f32 %25, f1856, f1881; +add.f32 %24, f1855, f1880; +add.f32 %27, f1897, f1922; +add.f32 %26, f1896, f1921; +add.f32 %28, f1937, f1962; +add.f32 %29, f1938, f1963; +add.f32 %30, f1978, f2003; +add.f32 %31, f1979, f2004; +sub.f32 %32, f1849, f1865; +sub.f32 %33, f2126, f2122; +sub.f32 %34, f1890, f1906; +sub.f32 %35, f2116, f2112; +sub.f32 %36, f1931, f1947; +sub.f32 %37, f2107, f2103; +sub.f32 %38, f1972, f1988; +sub.f32 %39, f2098, f2094; +sub.f32 %41, f2125, f1877; +sub.f32 %40, f1853, f1875; +sub.f32 %43, f2115, f1918; +sub.f32 %42, f1894, f1916; +sub.f32 %45, f2106, f1959; +sub.f32 %44, f1935, f1957; +sub.f32 %47, f2097, f2000; +sub.f32 %46, f1976, f1998; +add.f32 %49, f1852, f1867; +sub.f32 %48, f1851, f1868; +add.f32 %51, f1893, f1908; +sub.f32 %50, f1892, f1909; +add.f32 %53, f1934, f1949; +sub.f32 %52, f1933, f1950; +add.f32 %55, f1975, f1990; +sub.f32 %54, f1974, f1991; +sub.f32 %57, f1856, f1881; +sub.f32 %56, f1855, f1880; +sub.f32 %59, f1897, f1922; +sub.f32 %58, f1896, f1921; +sub.f32 %61, f1938, f1963; +sub.f32 %60, f1937, f1962; +sub.f32 %63, f1979, f2004; +sub.f32 %62, f1978, f2003; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), 
"=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_8192), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), 
"f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<109, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1165>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %36, %57; +add.f32 f66, %37, %59; +sub.f32 f67, %36, %57; +sub.f32 f68, %37, %59; +add.f32 f69, %46, %68; +add.f32 f70, %48, %69; +sub.f32 f71, %46, %68; +sub.f32 f72, %48, %69; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +add.f32 f77, f67, f72; +sub.f32 f78, f68, f71; +sub.f32 f79, f67, f72; +add.f32 f80, f68, f71; +add.f32 f81, %41, %62; +add.f32 f82, %43, %64; +sub.f32 f83, %41, %62; +sub.f32 f84, %43, %64; +add.f32 f85, %52, %73; +add.f32 f86, %53, %75; +sub.f32 f87, %52, %73; +sub.f32 f88, %53, %75; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +add.f32 f93, f83, f88; +sub.f32 f94, f84, f87; +sub.f32 f95, f83, f88; +add.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0fBF3504F3; +sub.f32 f99, f97, f98; +mul.f32 f100, f94, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f101; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f101; +add.f32 f114, f75, f92; +sub.f32 f115, f76, f91; +sub.f32 f116, f75, f92; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f104; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %60; +add.f32 f123, %40, %61; +sub.f32 f124, %38, %60; +sub.f32 f125, %40, %61; +add.f32 f126, %49, %70; +add.f32 f127, %51, %72; 
+sub.f32 f128, %49, %70; +sub.f32 f129, %51, %72; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +add.f32 f134, f124, f129; +sub.f32 f135, f125, f128; +sub.f32 f136, f124, f129; +add.f32 f137, f125, f128; +add.f32 f138, %44, %65; +add.f32 f139, %45, %67; +sub.f32 f140, %44, %65; +sub.f32 f141, %45, %67; +add.f32 f142, %54, %76; +add.f32 f143, %56, %77; +sub.f32 f144, %54, %76; +sub.f32 f145, %56, %77; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +add.f32 f150, f140, f145; +sub.f32 f151, f141, f144; +sub.f32 f152, f140, f145; +add.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0fBF3504F3; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; +mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f158; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f158; +add.f32 f171, f132, f149; +sub.f32 f172, f133, f148; +sub.f32 f173, f132, f149; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f161; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0fBEC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0fBF3504F3; +sub.f32 f186, f184, f185; +mul.f32 f187, f172, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f189, f175, 0f3EC3EF15; +mul.f32 f190, f176, 0fBF6C835E; +sub.f32 f191, f189, f190; +mul.f32 f192, f176, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f194, f169, 0fBEC3EF15; +mul.f32 f195, f170, 0fBF6C835E; +sub.f32 f196, f194, f195; 
+mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0fBEC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f188; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f188; +add.f32 f220, f118, f191; +add.f32 f221, f119, f193; +sub.f32 f222, f118, f191; +sub.f32 f223, f119, f193; +add.f32 f224, f108, f166; +sub.f32 f225, f109, f165; +sub.f32 f226, f108, f166; +add.f32 f227, f109, f165; +add.f32 f228, f112, f196; +add.f32 f229, f113, f198; +sub.f32 f230, f112, f196; +sub.f32 f231, f113, f198; +add.f32 f232, f116, f201; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f201; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f240, f212; +mul.f32 f245, f241, f213; +sub.f32 f246, f244, f245; +mul.f32 f247, f240, f213; +fma.rn.f32 f248, f241, f212, f247; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f251, f216; +mul.f32 f255, f253, f217; +sub.f32 f256, f254, f255; +mul.f32 f257, f251, f217; +fma.rn.f32 f258, f253, f216, f257; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f261, 
f220; +mul.f32 f265, f263, f221; +sub.f32 f266, f264, f265; +mul.f32 f267, f261, f221; +fma.rn.f32 f268, f263, f220, f267; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f271, f224; +mul.f32 f275, f273, f225; +sub.f32 f276, f274, f275; +mul.f32 f277, f271, f225; +fma.rn.f32 f278, f273, f224, f277; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f281, f228; +mul.f32 f285, f283, f229; +sub.f32 f286, f284, f285; +mul.f32 f287, f281, f229; +fma.rn.f32 f288, f283, f228, f287; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f291, f232; +mul.f32 f295, f293, f233; +sub.f32 f296, f294, f295; +mul.f32 f297, f291, f233; +fma.rn.f32 f298, f293, f232, f297; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f301, f236; +mul.f32 f305, f303, f237; +sub.f32 f306, f304, f305; +mul.f32 f307, f301, f237; +fma.rn.f32 f308, f303, f236, f307; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f311, f210; +mul.f32 f315, f313, f211; +sub.f32 f316, f314, f315; +mul.f32 f317, f311, f211; +fma.rn.f32 f318, f313, f210, f317; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f321, f214; +mul.f32 f325, f323, f215; +sub.f32 f326, f324, f325; +mul.f32 f327, f321, f215; +fma.rn.f32 f328, f323, f214, f327; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; 
+mul.f32 f334, f331, f218; +mul.f32 f335, f333, f219; +sub.f32 f336, f334, f335; +mul.f32 f337, f331, f219; +fma.rn.f32 f338, f333, f218, f337; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f341, f222; +mul.f32 f345, f343, f223; +sub.f32 f346, f344, f345; +mul.f32 f347, f341, f223; +fma.rn.f32 f348, f343, f222, f347; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f351, f226; +mul.f32 f355, f353, f227; +sub.f32 f356, f354, f355; +mul.f32 f357, f351, f227; +fma.rn.f32 f358, f353, f226, f357; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f361, f230; +mul.f32 f365, f363, f231; +sub.f32 f366, f364, f365; +mul.f32 f367, f361, f231; +fma.rn.f32 f368, f363, f230, f367; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f371, f234; +mul.f32 f375, f373, f235; +sub.f32 f376, f374, f375; +mul.f32 f377, f371, f235; +fma.rn.f32 f378, f373, f234, f377; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f381, f238; +mul.f32 f385, f383, f239; +sub.f32 f386, f384, f385; +mul.f32 f387, f381, f239; +fma.rn.f32 f388, f383, f238, f387; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32704; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f246, f256, f266}; +st.shared.v4.f32 [r12+16], {f276, f286, f296, f306}; +st.shared.v4.f32 [r12+32], {f316, f326, f336, f346}; +st.shared.v4.f32 [r12+48], {f356, f366, f376, f386}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, 
[r13]; +ld.shared.f32 f390, [r13+2048]; +ld.shared.f32 f391, [r13+4096]; +ld.shared.f32 f392, [r13+6144]; +ld.shared.f32 f393, [r13+8192]; +ld.shared.f32 f394, [r13+10240]; +ld.shared.f32 f395, [r13+12288]; +ld.shared.f32 f396, [r13+14336]; +ld.shared.f32 f397, [r13+16384]; +ld.shared.f32 f398, [r13+18432]; +ld.shared.f32 f399, [r13+20480]; +ld.shared.f32 f400, [r13+22528]; +ld.shared.f32 f401, [r13+24576]; +ld.shared.f32 f402, [r13+26624]; +ld.shared.f32 f403, [r13+28672]; +ld.shared.f32 f404, [r13+30720]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+2048]; +ld.shared.f32 f407, [r13+4096]; +ld.shared.f32 f408, [r13+6144]; +ld.shared.f32 f409, [r13+8192]; +ld.shared.f32 f410, [r13+10240]; +ld.shared.f32 f411, [r13+12288]; +ld.shared.f32 f412, [r13+14336]; +ld.shared.f32 f413, [r13+16384]; +ld.shared.f32 f414, [r13+18432]; +ld.shared.f32 f415, [r13+20480]; +ld.shared.f32 f416, [r13+22528]; +ld.shared.f32 f417, [r13+24576]; +ld.shared.f32 f418, [r13+26624]; +ld.shared.f32 f419, [r13+28672]; +ld.shared.f32 f420, [r13+30720]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +add.f32 f433, f423, f428; +sub.f32 f434, f424, f427; +sub.f32 f435, f423, f428; +add.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; 
+sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +add.f32 f449, f439, f444; +sub.f32 f450, f440, f443; +sub.f32 f451, f439, f444; +add.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0fBF3504F3; +sub.f32 f455, f453, f454; +mul.f32 f456, f450, 0f3F3504F3; +fma.rn.f32 f457, f449, 0fBF3504F3, f456; +mul.f32 f458, f451, 0fBF3504F3; +mul.f32 f459, f452, 0fBF3504F3; +sub.f32 f460, f458, f459; +add.f32 f461, f458, f459; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f457; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f457; +add.f32 f470, f431, f448; +sub.f32 f471, f432, f447; +sub.f32 f472, f431, f448; +add.f32 f473, f432, f447; +add.f32 f474, f435, f460; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f460; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +add.f32 f490, f480, f485; +sub.f32 f491, f481, f484; +sub.f32 f492, f480, f485; +add.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +add.f32 f506, f496, f501; +sub.f32 f507, f497, f500; +sub.f32 f508, f496, f501; +add.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0fBF3504F3; +sub.f32 f512, f510, f511; +mul.f32 f513, f507, 0f3F3504F3; +fma.rn.f32 f514, f506, 0fBF3504F3, f513; +mul.f32 f515, f508, 0fBF3504F3; +mul.f32 f516, f509, 0fBF3504F3; +sub.f32 f517, 
f515, f516; +add.f32 f518, f515, f516; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f514; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f514; +add.f32 f527, f488, f505; +sub.f32 f528, f489, f504; +sub.f32 f529, f488, f505; +add.f32 f530, f489, f504; +add.f32 f531, f492, f517; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f517; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0fBEC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0fBEC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0fBF3504F3; +sub.f32 f542, f540, f541; +mul.f32 f543, f528, 0f3F3504F3; +fma.rn.f32 f544, f527, 0fBF3504F3, f543; +mul.f32 f545, f531, 0f3EC3EF15; +mul.f32 f546, f532, 0fBF6C835E; +sub.f32 f547, f545, f546; +mul.f32 f548, f532, 0f3EC3EF15; +fma.rn.f32 f549, f531, 0fBF6C835E, f548; +mul.f32 f550, f525, 0fBEC3EF15; +mul.f32 f551, f526, 0fBF6C835E; +sub.f32 f552, f550, f551; +mul.f32 f553, f526, 0fBEC3EF15; +fma.rn.f32 f554, f525, 0fBF6C835E, f553; +mul.f32 f555, f529, 0fBF3504F3; +mul.f32 f556, f530, 0fBF3504F3; +sub.f32 f557, f555, f556; +add.f32 f558, f555, f556; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0fBEC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0fBEC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f544; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f544; +add.f32 f576, f474, f547; +add.f32 f577, f475, f549; +sub.f32 f578, f474, f547; +sub.f32 f579, f475, f549; +add.f32 f580, f464, f522; +sub.f32 f581, f465, f521; +sub.f32 f582, f464, f522; +add.f32 f583, f465, f521; +add.f32 f584, f468, f552; 
+add.f32 f585, f469, f554; +sub.f32 f586, f468, f552; +sub.f32 f587, f469, f554; +add.f32 f588, f472, f557; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f557; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 496; +bfe.u32 r15, r5, 4, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f596, f568; +mul.f32 f601, f597, f569; +sub.f32 f602, f600, f601; +mul.f32 f603, f596, f569; +fma.rn.f32 f604, f597, f568, f603; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f607, f572; +mul.f32 f611, f609, f573; +sub.f32 f612, f610, f611; +mul.f32 f613, f607, f573; +fma.rn.f32 f614, f609, f572, f613; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f617, f576; +mul.f32 f621, f619, f577; +sub.f32 f622, f620, f621; +mul.f32 f623, f617, f577; +fma.rn.f32 f624, f619, f576, f623; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f627, f580; +mul.f32 f631, f629, f581; +sub.f32 f632, f630, f631; +mul.f32 f633, f627, f581; +fma.rn.f32 f634, f629, f580, f633; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f637, f584; +mul.f32 f641, f639, f585; +sub.f32 f642, f640, f641; +mul.f32 f643, f637, f585; +fma.rn.f32 f644, f639, f584, f643; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; +sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f647, f588; +mul.f32 f651, f649, f589; +sub.f32 f652, f650, f651; +mul.f32 f653, f647, 
f589; +fma.rn.f32 f654, f649, f588, f653; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f657, f592; +mul.f32 f661, f659, f593; +sub.f32 f662, f660, f661; +mul.f32 f663, f657, f593; +fma.rn.f32 f664, f659, f592, f663; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f667, f566; +mul.f32 f671, f669, f567; +sub.f32 f672, f670, f671; +mul.f32 f673, f667, f567; +fma.rn.f32 f674, f669, f566, f673; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f677, f570; +mul.f32 f681, f679, f571; +sub.f32 f682, f680, f681; +mul.f32 f683, f677, f571; +fma.rn.f32 f684, f679, f570, f683; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f687, f574; +mul.f32 f691, f689, f575; +sub.f32 f692, f690, f691; +mul.f32 f693, f687, f575; +fma.rn.f32 f694, f689, f574, f693; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f697, f578; +mul.f32 f701, f699, f579; +sub.f32 f702, f700, f701; +mul.f32 f703, f697, f579; +fma.rn.f32 f704, f699, f578, f703; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f707, f582; +mul.f32 f711, f709, f583; +sub.f32 f712, f710, f711; +mul.f32 f713, f707, f583; +fma.rn.f32 f714, f709, f582, f713; +mul.f32 f715, f596, f707; +mul.f32 f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f717, f586; +mul.f32 f721, f719, f587; +sub.f32 f722, f720, f721; 
+mul.f32 f723, f717, f587; +fma.rn.f32 f724, f719, f586, f723; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f727, f590; +mul.f32 f731, f729, f591; +sub.f32 f732, f730, f731; +mul.f32 f733, f727, f591; +fma.rn.f32 f734, f729, f590, f733; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f737, f594; +mul.f32 f741, f739, f595; +sub.f32 f742, f740, f741; +mul.f32 f743, f737, f595; +fma.rn.f32 f744, f739, f594, f743; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 31744; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f602; +st.shared.f32 [r20+128], f612; +st.shared.f32 [r20+192], f622; +st.shared.f32 [r20+256], f632; +st.shared.f32 [r20+320], f642; +st.shared.f32 [r20+384], f652; +st.shared.f32 [r20+448], f662; +st.shared.f32 [r20+512], f672; +st.shared.f32 [r20+576], f682; +st.shared.f32 [r20+640], f692; +st.shared.f32 [r20+704], f702; +st.shared.f32 [r20+768], f712; +st.shared.f32 [r20+832], f722; +st.shared.f32 [r20+896], f732; +st.shared.f32 [r20+960], f742; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+2048]; +ld.shared.f32 f747, [r21+4096]; +ld.shared.f32 f748, [r21+6144]; +ld.shared.f32 f749, [r21+8192]; +ld.shared.f32 f750, [r21+10240]; +ld.shared.f32 f751, [r21+12288]; +ld.shared.f32 f752, [r21+14336]; +ld.shared.f32 f753, [r21+16384]; +ld.shared.f32 f754, [r21+18432]; +ld.shared.f32 f755, [r21+20480]; +ld.shared.f32 f756, [r21+22528]; +ld.shared.f32 f757, [r21+24576]; +ld.shared.f32 f758, [r21+26624]; +ld.shared.f32 f759, [r21+28672]; +ld.shared.f32 f760, [r21+30720]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], 
f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+2048]; +ld.shared.f32 f763, [r21+4096]; +ld.shared.f32 f764, [r21+6144]; +ld.shared.f32 f765, [r21+8192]; +ld.shared.f32 f766, [r21+10240]; +ld.shared.f32 f767, [r21+12288]; +ld.shared.f32 f768, [r21+14336]; +ld.shared.f32 f769, [r21+16384]; +ld.shared.f32 f770, [r21+18432]; +ld.shared.f32 f771, [r21+20480]; +ld.shared.f32 f772, [r21+22528]; +ld.shared.f32 f773, [r21+24576]; +ld.shared.f32 f774, [r21+26624]; +ld.shared.f32 f775, [r21+28672]; +ld.shared.f32 f776, [r21+30720]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +add.f32 f789, f779, f784; +sub.f32 f790, f780, f783; +sub.f32 f791, f779, f784; +add.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +add.f32 f805, f795, f800; +sub.f32 f806, f796, f799; +sub.f32 f807, f795, f800; +add.f32 f808, f796, f799; +mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0fBF3504F3; +sub.f32 f811, f809, f810; +mul.f32 f812, f806, 0f3F3504F3; +fma.rn.f32 f813, f805, 0fBF3504F3, f812; +mul.f32 f814, f807, 
0fBF3504F3; +mul.f32 f815, f808, 0fBF3504F3; +sub.f32 f816, f814, f815; +add.f32 f817, f814, f815; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; +add.f32 f823, f790, f813; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f813; +add.f32 f826, f787, f804; +sub.f32 f827, f788, f803; +sub.f32 f828, f787, f804; +add.f32 f829, f788, f803; +add.f32 f830, f791, f816; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f816; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; +add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +add.f32 f846, f836, f841; +sub.f32 f847, f837, f840; +sub.f32 f848, f836, f841; +add.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +add.f32 f862, f852, f857; +sub.f32 f863, f853, f856; +sub.f32 f864, f852, f857; +add.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0fBF3504F3; +sub.f32 f868, f866, f867; +mul.f32 f869, f863, 0f3F3504F3; +fma.rn.f32 f870, f862, 0fBF3504F3, f869; +mul.f32 f871, f864, 0fBF3504F3; +mul.f32 f872, f865, 0fBF3504F3; +sub.f32 f873, f871, f872; +add.f32 f874, f871, f872; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, f846, f868; +add.f32 f880, f847, f870; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f870; +add.f32 f883, f844, f861; +sub.f32 f884, f845, f860; +sub.f32 f885, f844, f861; +add.f32 f886, 
f845, f860; +add.f32 f887, f848, f873; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f873; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0fBEC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 0f3F6C835E; +fma.rn.f32 f895, f879, 0fBEC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0fBF3504F3; +sub.f32 f898, f896, f897; +mul.f32 f899, f884, 0f3F3504F3; +fma.rn.f32 f900, f883, 0fBF3504F3, f899; +mul.f32 f901, f887, 0f3EC3EF15; +mul.f32 f902, f888, 0fBF6C835E; +sub.f32 f903, f901, f902; +mul.f32 f904, f888, 0f3EC3EF15; +fma.rn.f32 f905, f887, 0fBF6C835E, f904; +mul.f32 f906, f881, 0fBEC3EF15; +mul.f32 f907, f882, 0fBF6C835E; +sub.f32 f908, f906, f907; +mul.f32 f909, f882, 0fBEC3EF15; +fma.rn.f32 f910, f881, 0fBF6C835E, f909; +mul.f32 f911, f885, 0fBF3504F3; +mul.f32 f912, f886, 0fBF3504F3; +sub.f32 f913, f911, f912; +add.f32 f914, f911, f912; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0fBEC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0fBEC3EF15, f918; +add.f32 f920, f818, f875; +add.f32 f921, f819, f876; +sub.f32 f922, f818, f875; +sub.f32 f923, f819, f876; +add.f32 f924, f822, f893; +add.f32 f925, f823, f895; +sub.f32 f926, f822, f893; +sub.f32 f927, f823, f895; +add.f32 f928, f826, f898; +add.f32 f929, f827, f900; +sub.f32 f930, f826, f898; +sub.f32 f931, f827, f900; +add.f32 f932, f830, f903; +add.f32 f933, f831, f905; +sub.f32 f934, f830, f903; +sub.f32 f935, f831, f905; +add.f32 f936, f820, f878; +sub.f32 f937, f821, f877; +sub.f32 f938, f820, f878; +add.f32 f939, f821, f877; +add.f32 f940, f824, f908; +add.f32 f941, f825, f910; +sub.f32 f942, f824, f908; +sub.f32 f943, f825, f910; +add.f32 f944, f828, f913; +add.f32 f945, f829, f914; +sub.f32 f946, f828, f913; +sub.f32 f947, f829, f914; +add.f32 f948, f832, f917; +add.f32 f949, f833, f919; +sub.f32 f950, f832, f917; +sub.f32 f951, f833, f919; +and.b32 r22, r5, 256; +bfe.u32 r23, r5, 8, 1; 
+mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f952, f953}, [rd11]; +mul.f32 f956, f952, f924; +mul.f32 f957, f953, f925; +sub.f32 f958, f956, f957; +mul.f32 f959, f952, f925; +fma.rn.f32 f960, f953, f924, f959; +mul.f32 f961, f952, f952; +mul.f32 f962, f953, f953; +sub.f32 f963, f961, f962; +mul.f32 f964, f953, f952; +fma.rn.f32 f965, f953, f952, f964; +mul.f32 f966, f963, f928; +mul.f32 f967, f965, f929; +sub.f32 f968, f966, f967; +mul.f32 f969, f963, f929; +fma.rn.f32 f970, f965, f928, f969; +mul.f32 f971, f952, f963; +mul.f32 f972, f953, f965; +sub.f32 f973, f971, f972; +mul.f32 f974, f952, f965; +fma.rn.f32 f975, f953, f963, f974; +mul.f32 f976, f973, f932; +mul.f32 f977, f975, f933; +sub.f32 f978, f976, f977; +mul.f32 f979, f973, f933; +fma.rn.f32 f980, f975, f932, f979; +mul.f32 f981, f952, f973; +mul.f32 f982, f953, f975; +sub.f32 f983, f981, f982; +mul.f32 f984, f952, f975; +fma.rn.f32 f985, f953, f973, f984; +mul.f32 f986, f983, f936; +mul.f32 f987, f985, f937; +sub.f32 f988, f986, f987; +mul.f32 f989, f983, f937; +fma.rn.f32 f990, f985, f936, f989; +mul.f32 f991, f952, f983; +mul.f32 f992, f953, f985; +sub.f32 f993, f991, f992; +mul.f32 f994, f952, f985; +fma.rn.f32 f995, f953, f983, f994; +mul.f32 f996, f993, f940; +mul.f32 f997, f995, f941; +sub.f32 f998, f996, f997; +mul.f32 f999, f993, f941; +fma.rn.f32 f1000, f995, f940, f999; +mul.f32 f1001, f952, f993; +mul.f32 f1002, f953, f995; +sub.f32 f1003, f1001, f1002; +mul.f32 f1004, f952, f995; +fma.rn.f32 f1005, f953, f993, f1004; +mul.f32 f1006, f1003, f944; +mul.f32 f1007, f1005, f945; +sub.f32 f1008, f1006, f1007; +mul.f32 f1009, f1003, f945; +fma.rn.f32 f1010, f1005, f944, f1009; +mul.f32 f1011, f952, f1003; +mul.f32 f1012, f953, f1005; +sub.f32 f1013, f1011, f1012; +mul.f32 f1014, f952, f1005; +fma.rn.f32 f1015, f953, f1003, f1014; +mul.f32 f1016, f1013, f948; +mul.f32 f1017, f1015, f949; +sub.f32 f1018, f1016, f1017; +mul.f32 f1019, f1013, f949; 
+fma.rn.f32 f1020, f1015, f948, f1019; +mul.f32 f1021, f952, f1013; +mul.f32 f1022, f953, f1015; +sub.f32 f1023, f1021, f1022; +mul.f32 f1024, f952, f1015; +fma.rn.f32 f1025, f953, f1013, f1024; +mul.f32 f1026, f1023, f922; +mul.f32 f1027, f1025, f923; +sub.f32 f1028, f1026, f1027; +mul.f32 f1029, f1023, f923; +fma.rn.f32 f1030, f1025, f922, f1029; +mul.f32 f1031, f952, f1023; +mul.f32 f1032, f953, f1025; +sub.f32 f1033, f1031, f1032; +mul.f32 f1034, f952, f1025; +fma.rn.f32 f1035, f953, f1023, f1034; +mul.f32 f1036, f1033, f926; +mul.f32 f1037, f1035, f927; +sub.f32 f1038, f1036, f1037; +mul.f32 f1039, f1033, f927; +fma.rn.f32 f1040, f1035, f926, f1039; +mul.f32 f1041, f952, f1033; +mul.f32 f1042, f953, f1035; +sub.f32 f1043, f1041, f1042; +mul.f32 f1044, f952, f1035; +fma.rn.f32 f1045, f953, f1033, f1044; +mul.f32 f1046, f1043, f930; +mul.f32 f1047, f1045, f931; +sub.f32 f1048, f1046, f1047; +mul.f32 f1049, f1043, f931; +fma.rn.f32 f1050, f1045, f930, f1049; +mul.f32 f1051, f952, f1043; +mul.f32 f1052, f953, f1045; +sub.f32 f1053, f1051, f1052; +mul.f32 f1054, f952, f1045; +fma.rn.f32 f1055, f953, f1043, f1054; +mul.f32 f1056, f1053, f934; +mul.f32 f1057, f1055, f935; +sub.f32 f1058, f1056, f1057; +mul.f32 f1059, f1053, f935; +fma.rn.f32 f1060, f1055, f934, f1059; +mul.f32 f1061, f952, f1053; +mul.f32 f1062, f953, f1055; +sub.f32 f1063, f1061, f1062; +mul.f32 f1064, f952, f1055; +fma.rn.f32 f1065, f953, f1053, f1064; +mul.f32 f1066, f1063, f938; +mul.f32 f1067, f1065, f939; +sub.f32 f1068, f1066, f1067; +mul.f32 f1069, f1063, f939; +fma.rn.f32 f1070, f1065, f938, f1069; +mul.f32 f1071, f952, f1063; +mul.f32 f1072, f953, f1065; +sub.f32 f1073, f1071, f1072; +mul.f32 f1074, f952, f1065; +fma.rn.f32 f1075, f953, f1063, f1074; +mul.f32 f1076, f1073, f942; +mul.f32 f1077, f1075, f943; +sub.f32 f1078, f1076, f1077; +mul.f32 f1079, f1073, f943; +fma.rn.f32 f1080, f1075, f942, f1079; +mul.f32 f1081, f952, f1073; +mul.f32 f1082, f953, f1075; +sub.f32 f1083, f1081, f1082; 
+mul.f32 f1084, f952, f1075; +fma.rn.f32 f1085, f953, f1073, f1084; +mul.f32 f1086, f1083, f946; +mul.f32 f1087, f1085, f947; +sub.f32 f1088, f1086, f1087; +mul.f32 f1089, f1083, f947; +fma.rn.f32 f1090, f1085, f946, f1089; +mul.f32 f1091, f952, f1083; +mul.f32 f1092, f953, f1085; +sub.f32 f1093, f1091, f1092; +mul.f32 f1094, f952, f1085; +fma.rn.f32 f1095, f953, f1083, f1094; +mul.f32 f1096, f1093, f950; +mul.f32 f1097, f1095, f951; +sub.f32 f1098, f1096, f1097; +mul.f32 f1099, f1093, f951; +fma.rn.f32 f1100, f1095, f950, f1099; +and.b32 r24, r16, 1020; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 16384; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f920; +st.shared.f32 [r27+1024], f958; +st.shared.f32 [r27+2048], f968; +st.shared.f32 [r27+3072], f978; +st.shared.f32 [r27+4096], f988; +st.shared.f32 [r27+5120], f998; +st.shared.f32 [r27+6144], f1008; +st.shared.f32 [r27+7168], f1018; +st.shared.f32 [r27+8192], f1028; +st.shared.f32 [r27+9216], f1038; +st.shared.f32 [r27+10240], f1048; +st.shared.f32 [r27+11264], f1058; +st.shared.f32 [r27+12288], f1068; +st.shared.f32 [r27+13312], f1078; +st.shared.f32 [r27+14336], f1088; +st.shared.f32 [r27+15360], f1098; +barrier.sync 0; +mad.lo.s32 r28, r22, -60, r27; +ld.shared.f32 f1101, [r28]; +ld.shared.f32 f1102, [r28+2048]; +ld.shared.f32 f1103, [r28+4096]; +ld.shared.f32 f1104, [r28+6144]; +ld.shared.f32 f1105, [r28+8192]; +ld.shared.f32 f1106, [r28+10240]; +ld.shared.f32 f1107, [r28+12288]; +ld.shared.f32 f1108, [r28+14336]; +ld.shared.f32 f1109, [r28+16384]; +ld.shared.f32 f1110, [r28+18432]; +ld.shared.f32 f1111, [r28+20480]; +ld.shared.f32 f1112, [r28+22528]; +ld.shared.f32 f1113, [r28+24576]; +ld.shared.f32 f1114, [r28+26624]; +ld.shared.f32 f1115, [r28+28672]; +ld.shared.f32 f1116, [r28+30720]; +barrier.sync 0; +st.shared.f32 [r27], f921; +st.shared.f32 [r27+1024], f960; +st.shared.f32 [r27+2048], f970; +st.shared.f32 [r27+3072], f980; +st.shared.f32 [r27+4096], f990; +st.shared.f32 [r27+5120], f1000; 
+st.shared.f32 [r27+6144], f1010; +st.shared.f32 [r27+7168], f1020; +st.shared.f32 [r27+8192], f1030; +st.shared.f32 [r27+9216], f1040; +st.shared.f32 [r27+10240], f1050; +st.shared.f32 [r27+11264], f1060; +st.shared.f32 [r27+12288], f1070; +st.shared.f32 [r27+13312], f1080; +st.shared.f32 [r27+14336], f1090; +st.shared.f32 [r27+15360], f1100; +barrier.sync 0; +ld.shared.f32 f1117, [r28]; +ld.shared.f32 f1118, [r28+2048]; +ld.shared.f32 f1119, [r28+4096]; +ld.shared.f32 f1120, [r28+6144]; +ld.shared.f32 f1121, [r28+8192]; +ld.shared.f32 f1122, [r28+10240]; +ld.shared.f32 f1123, [r28+12288]; +ld.shared.f32 f1124, [r28+14336]; +ld.shared.f32 f1125, [r28+16384]; +ld.shared.f32 f1126, [r28+18432]; +ld.shared.f32 f1127, [r28+20480]; +ld.shared.f32 f1128, [r28+22528]; +ld.shared.f32 f1129, [r28+24576]; +ld.shared.f32 f1130, [r28+26624]; +ld.shared.f32 f1131, [r28+28672]; +ld.shared.f32 f1132, [r28+30720]; +add.f32 %0, f1101, f1109; +add.f32 %1, f1117, f1125; +add.f32 %2, f1102, f1110; +add.f32 %3, f1118, f1126; +add.f32 %4, f1103, f1111; +add.f32 %5, f1119, f1127; +add.f32 %6, f1104, f1112; +add.f32 %7, f1120, f1128; +add.f32 %8, f1105, f1113; +add.f32 %9, f1121, f1129; +add.f32 %10, f1106, f1114; +add.f32 %11, f1122, f1130; +add.f32 %12, f1107, f1115; +add.f32 %13, f1123, f1131; +add.f32 %14, f1108, f1116; +add.f32 %15, f1124, f1132; +sub.f32 %16, f1101, f1109; +sub.f32 %17, f1117, f1125; +sub.f32 %18, f1102, f1110; +sub.f32 %19, f1118, f1126; +sub.f32 %20, f1103, f1111; +sub.f32 %21, f1119, f1127; +sub.f32 %22, f1104, f1112; +sub.f32 %23, f1120, f1128; +sub.f32 %24, f1105, f1113; +sub.f32 %25, f1121, f1129; +sub.f32 %26, f1106, f1114; +sub.f32 %27, f1122, f1130; +sub.f32 %28, f1107, f1115; +sub.f32 %29, f1123, f1131; +sub.f32 %30, f1108, f1116; +sub.f32 %31, f1124, f1132; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), 
"=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_8192), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<113, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1529>; +.reg .b32 r<53>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %36, %52; +sub.f32 f67, %36, %52; +add.f32 f1514, %37, %68; +sub.f32 f68, %37, %68; +add.f32 f69, %44, %60; +sub.f32 f71, %44, %60; +add.f32 f1512, %69, %61; +sub.f32 f72, %69, %61; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1511, f1514, f1512; +sub.f32 f76, f1514, f1512; +add.f32 f77, f67, f72; +sub.f32 f79, f67, f72; +sub.f32 f1510, f68, f71; +add.f32 f80, f68, f71; +add.f32 f81, %40, %56; +sub.f32 f83, %40, %56; +add.f32 f1507, %71, %70; +sub.f32 f84, %71, %70; +add.f32 f85, %48, %64; +sub.f32 f87, %48, %64; +add.f32 f1505, %49, 
%72; +sub.f32 f88, %49, %72; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1504, f1507, f1505; +sub.f32 f92, f1507, f1505; +add.f32 f93, f83, f88; +sub.f32 f95, f83, f88; +sub.f32 f1503, f84, f87; +add.f32 f96, f84, f87; +mul.f32 f98, f1503, 0fBF3504F3; +mul.f32 f1502, f93, 0f3F3504F3; +sub.f32 f99, f1502, f98; +mul.f32 f100, f1503, 0f3F3504F3; +fma.rn.f32 f101, f93, 0fBF3504F3, f100; +mul.f32 f102, f95, 0fBF3504F3; +mul.f32 f103, f96, 0fBF3504F3; +sub.f32 f104, f102, f103; +add.f32 f105, f102, f103; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1501, f1511, f1504; +sub.f32 f109, f1511, f1504; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1500, f1510, f101; +sub.f32 f113, f1510, f101; +add.f32 f114, f75, f92; +sub.f32 f116, f75, f92; +sub.f32 f1499, f76, f91; +add.f32 f117, f76, f91; +add.f32 f118, f79, f104; +sub.f32 f120, f79, f104; +add.f32 f1498, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %54; +sub.f32 f124, %38, %54; +add.f32 f1496, %73, %55; +sub.f32 f125, %73, %55; +add.f32 f126, %46, %62; +sub.f32 f128, %46, %62; +add.f32 f1493, %74, %75; +sub.f32 f129, %74, %75; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1492, f1496, f1493; +sub.f32 f133, f1496, f1493; +add.f32 f134, f124, f129; +sub.f32 f136, f124, f129; +sub.f32 f1491, f125, f128; +add.f32 f137, f125, f128; +add.f32 f138, %42, %58; +sub.f32 f140, %42, %58; +add.f32 f1489, %43, %76; +sub.f32 f141, %43, %76; +add.f32 f142, %50, %66; +sub.f32 f144, %50, %66; +add.f32 f1487, %77, %67; +sub.f32 f145, %77, %67; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1486, f1489, f1487; +sub.f32 f149, f1489, f1487; +add.f32 f150, f140, f145; +sub.f32 f152, f140, f145; +sub.f32 f1485, f141, f144; +add.f32 f153, f141, f144; +mul.f32 f155, f1485, 0fBF3504F3; +mul.f32 f1484, f150, 0f3F3504F3; +sub.f32 f156, f1484, f155; +mul.f32 f157, f1485, 0f3F3504F3; +fma.rn.f32 f158, f150, 0fBF3504F3, f157; +mul.f32 f159, f152, 0fBF3504F3; 
+mul.f32 f160, f153, 0fBF3504F3; +sub.f32 f161, f159, f160; +add.f32 f162, f159, f160; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1483, f1492, f1486; +sub.f32 f166, f1492, f1486; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1482, f1491, f158; +sub.f32 f170, f1491, f158; +add.f32 f171, f132, f149; +sub.f32 f173, f132, f149; +sub.f32 f1481, f133, f148; +add.f32 f174, f133, f148; +add.f32 f175, f136, f161; +sub.f32 f177, f136, f161; +add.f32 f1480, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1478, f167, 0f3F6C835E; +mul.f32 f1479, f1482, 0fBEC3EF15; +sub.f32 f181, f1478, f1479; +mul.f32 f182, f1482, 0f3F6C835E; +fma.rn.f32 f183, f167, 0fBEC3EF15, f182; +mul.f32 f1476, f171, 0f3F3504F3; +mul.f32 f1477, f1481, 0fBF3504F3; +sub.f32 f186, f1476, f1477; +mul.f32 f187, f1481, 0f3F3504F3; +fma.rn.f32 f188, f171, 0fBF3504F3, f187; +mul.f32 f1474, f175, 0f3EC3EF15; +mul.f32 f1475, f1480, 0fBF6C835E; +sub.f32 f191, f1474, f1475; +mul.f32 f192, f1480, 0f3EC3EF15; +fma.rn.f32 f193, f175, 0fBF6C835E, f192; +mul.f32 f1472, f169, 0fBEC3EF15; +mul.f32 f1473, f170, 0fBF6C835E; +sub.f32 f196, f1472, f1473; +mul.f32 f197, f170, 0fBEC3EF15; +fma.rn.f32 f198, f169, 0fBF6C835E, f197; +mul.f32 f199, f173, 0fBF3504F3; +mul.f32 f200, f174, 0fBF3504F3; +sub.f32 f201, f199, f200; +add.f32 f202, f199, f200; +mul.f32 f1470, f177, 0fBF6C835E; +mul.f32 f1471, f178, 0fBEC3EF15; +sub.f32 f205, f1470, f1471; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0fBEC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1469, f1500, f183; +sub.f32 f213, f1500, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1468, f1499, f188; +sub.f32 f217, f1499, f188; +add.f32 f218, f118, f191; +sub.f32 f220, f118, f191; +add.f32 f1467, f1498, f193; +sub.f32 f221, f1498, f193; +add.f32 f222, f108, f166; +sub.f32 f224, f108, f166; +sub.f32 f1466, f109, f165; +add.f32 f225, f109, f165; +add.f32 f226, f112, f196; +sub.f32 f228, 
f112, f196; +add.f32 f1465, f113, f198; +sub.f32 f229, f113, f198; +add.f32 f230, f116, f201; +sub.f32 f232, f116, f201; +add.f32 f1464, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1463, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r30, %tid.x; +shl.b32 r7, r30, 7; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r30, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f243, f239, f1469; +mul.f32 f244, f238, f1469; +mul.f32 f246, f239, f239; +mul.f32 f1462, f238, f238; +sub.f32 f247, f1462, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f251, f249, f1468; +mul.f32 f252, f247, f1468; +mul.f32 f1460, f238, f247; +mul.f32 f1461, f239, f249; +sub.f32 f255, f1460, f1461; +mul.f32 f1459, f247, f214; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f259, f257, f1467; +mul.f32 f260, f255, f1467; +mul.f32 f262, f239, f257; +mul.f32 f1458, f238, f255; +sub.f32 f263, f1458, f262; +mul.f32 f1457, f255, f218; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f267, f265, f1466; +mul.f32 f268, f263, f1466; +mul.f32 f270, f239, f265; +mul.f32 f1456, f238, f263; +sub.f32 f271, f1456, f270; +mul.f32 f1455, f263, f222; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f275, f273, f1465; +mul.f32 f276, f271, f1465; +mul.f32 f1453, f238, f271; +mul.f32 f1454, f239, f273; +sub.f32 f279, f1453, f1454; +mul.f32 f1452, f271, f226; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f283, f281, f1464; +mul.f32 f284, f279, f1464; +mul.f32 f286, f239, f281; +mul.f32 f1451, f238, f279; +sub.f32 f287, f1451, f286; +mul.f32 f1450, f279, f230; +mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f291, f289, f1463; +mul.f32 f292, f287, f1463; +mul.f32 f294, f239, f289; +mul.f32 f1449, f238, f287; 
+sub.f32 f295, f1449, f294; +mul.f32 f1448, f287, f234; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1447, f1501, f1483; +mul.f32 f299, f297, f1447; +mul.f32 f300, f295, f1447; +mul.f32 f1445, f238, f295; +mul.f32 f1446, f239, f297; +sub.f32 f303, f1445, f1446; +sub.f32 f1444, f106, f163; +mul.f32 f1443, f295, f1444; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f307, f305, f213; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1442, f238, f303; +sub.f32 f311, f1442, f310; +mul.f32 f1441, f303, f212; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f315, f313, f217; +mul.f32 f316, f311, f217; +mul.f32 f1439, f238, f311; +mul.f32 f1440, f239, f313; +sub.f32 f319, f1439, f1440; +mul.f32 f1438, f311, f216; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f323, f321, f221; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1437, f238, f319; +sub.f32 f327, f1437, f326; +mul.f32 f1436, f319, f220; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f331, f329, f225; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1435, f238, f327; +sub.f32 f335, f1435, f334; +mul.f32 f1434, f327, f224; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f339, f337, f229; +mul.f32 f340, f335, f229; +mul.f32 f1432, f238, f335; +mul.f32 f1433, f239, f337; +sub.f32 f343, f1432, f1433; +mul.f32 f1431, f335, f228; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f347, f345, f233; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1430, f238, f343; +sub.f32 f351, f1430, f350; +mul.f32 f1429, f238, f210; +mul.f32 f352, f238, f345; +mul.f32 f1428, f343, f232; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, f351, f236; +mul.f32 f355, f353, f237; +mul.f32 f356, f351, f237; +sub.f32 f1525, f1501, f1483; +mul.f32 f1524, f297, f1525; +barrier.sync 0; +and.b32 r11, r7, 65408; 
+add.s32 r12, r9, r11; +add.f32 f357, f1501, f1483; +mov.u32 r37, %tid.x; +shl.b32 r36, r37, 3; +sub.f32 f1526, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r47, %tid.x; +shl.b32 r46, r47, 7; +mov.u32 r45, %tid.x; +fma.rn.f32 f359, f239, f210, f244; +sub.f32 f360, f1429, f243; +st.shared.v4.f32 [r12], {f358, f357, f360, f359}; +fma.rn.f32 f361, f249, f214, f252; +sub.f32 f362, f1459, f251; +fma.rn.f32 f363, f257, f218, f260; +sub.f32 f364, f1457, f259; +st.shared.v4.f32 [r12+16], {f362, f361, f364, f363}; +sub.f32 f365, f1455, f267; +fma.rn.f32 f366, f265, f222, f268; +fma.rn.f32 f367, f273, f226, f276; +sub.f32 f368, f1452, f275; +st.shared.v4.f32 [r12+32], {f365, f366, f368, f367}; +fma.rn.f32 f369, f281, f230, f284; +sub.f32 f370, f1450, f283; +fma.rn.f32 f371, f289, f234, f292; +sub.f32 f372, f1448, f291; +st.shared.v4.f32 [r12+48], {f370, f369, f372, f371}; +fma.rn.f32 f373, f297, f1526, f300; +sub.f32 f374, f1443, f1524; +fma.rn.f32 f375, f305, f212, f308; +sub.f32 f376, f1441, f307; +st.shared.v4.f32 [r12+64], {f374, f373, f376, f375}; +fma.rn.f32 f377, f313, f216, f316; +sub.f32 f378, f1438, f315; +fma.rn.f32 f379, f321, f220, f324; +sub.f32 f380, f1436, f323; +st.shared.v4.f32 [r12+80], {f378, f377, f380, f379}; +fma.rn.f32 f381, f329, f224, f332; +sub.f32 f382, f1434, f331; +fma.rn.f32 f383, f337, f228, f340; +sub.f32 f384, f1431, f339; +st.shared.v4.f32 [r12+96], {f382, f381, f384, f383}; +fma.rn.f32 f385, f345, f232, f348; +sub.f32 f386, f1428, f347; +fma.rn.f32 f387, f353, f236, f356; +sub.f32 f388, f354, f355; +st.shared.v4.f32 [r12+112], {f386, f385, f388, f387}; +barrier.sync 0; +and.b32 r29, r45, 511; +mad.lo.s32 r13, r29, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, [r13+4096]; +ld.shared.v2.f32 {f397, f398}, [r13+8192]; +ld.shared.v2.f32 {f401, f402}, [r13+12288]; +ld.shared.v2.f32 {f405, f406}, [r13+16384]; +ld.shared.v2.f32 {f409, f410}, [r13+20480]; +ld.shared.v2.f32 {f413, f414}, [r13+24576]; 
+ld.shared.v2.f32 {f417, f418}, [r13+28672]; +ld.shared.v2.f32 {f421, f422}, [r13+32768]; +ld.shared.v2.f32 {f425, f426}, [r13+36864]; +ld.shared.v2.f32 {f429, f430}, [r13+40960]; +ld.shared.v2.f32 {f433, f434}, [r13+45056]; +ld.shared.v2.f32 {f437, f438}, [r13+49152]; +ld.shared.v2.f32 {f441, f442}, [r13+53248]; +ld.shared.v2.f32 {f445, f446}, [r13+57344]; +ld.shared.v2.f32 {f449, f450}, [r13+61440]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1427, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1426, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1425, f1427, f1426; +sub.f32 f464, f1427, f1426; +add.f32 f465, f455, f460; +sub.f32 f467, f455, f460; +sub.f32 f1424, f456, f459; +add.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1423, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1422, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1421, f1423, f1422; +sub.f32 f480, f1423, f1422; +add.f32 f481, f471, f476; +sub.f32 f483, f471, f476; +sub.f32 f1420, f472, f475; +add.f32 f484, f472, f475; +mul.f32 f1418, f481, 0f3F3504F3; +mul.f32 f1419, f1420, 0fBF3504F3; +sub.f32 f487, f1418, f1419; +mul.f32 f488, f1420, 0f3F3504F3; +fma.rn.f32 f489, f481, 0fBF3504F3, f488; +mul.f32 f490, f483, 0fBF3504F3; +mul.f32 f491, f484, 0fBF3504F3; +sub.f32 f492, f490, f491; +add.f32 f493, f490, f491; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1417, f1425, f1421; +sub.f32 f497, f1425, f1421; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1416, f1424, f489; +sub.f32 f501, f1424, f489; +add.f32 f502, f463, f480; +sub.f32 f504, f463, f480; +sub.f32 f1415, f464, f479; +add.f32 f505, f464, f479; +add.f32 f506, f467, f492; +sub.f32 f508, f467, f492; +add.f32 f1414, f468, f493; 
+sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1413, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1412, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1411, f1413, f1412; +sub.f32 f521, f1413, f1412; +add.f32 f522, f512, f517; +sub.f32 f524, f512, f517; +sub.f32 f1410, f513, f516; +add.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1409, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1408, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1407, f1409, f1408; +sub.f32 f537, f1409, f1408; +add.f32 f538, f528, f533; +sub.f32 f540, f528, f533; +sub.f32 f1406, f529, f532; +add.f32 f541, f529, f532; +mul.f32 f1404, f538, 0f3F3504F3; +mul.f32 f1405, f1406, 0fBF3504F3; +sub.f32 f544, f1404, f1405; +mul.f32 f545, f1406, 0f3F3504F3; +fma.rn.f32 f546, f538, 0fBF3504F3, f545; +mul.f32 f547, f540, 0fBF3504F3; +mul.f32 f548, f541, 0fBF3504F3; +sub.f32 f549, f547, f548; +add.f32 f550, f547, f548; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1403, f1411, f1407; +sub.f32 f554, f1411, f1407; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1402, f1410, f546; +sub.f32 f558, f1410, f546; +add.f32 f559, f520, f537; +sub.f32 f561, f520, f537; +sub.f32 f1401, f521, f536; +add.f32 f562, f521, f536; +add.f32 f563, f524, f549; +sub.f32 f565, f524, f549; +add.f32 f1400, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1402, 0fBEC3EF15; +mul.f32 f1399, f555, 0f3F6C835E; +sub.f32 f569, f1399, f568; +mul.f32 f570, f1402, 0f3F6C835E; +fma.rn.f32 f571, f555, 0fBEC3EF15, f570; +mul.f32 f573, f1401, 0fBF3504F3; +mul.f32 f1398, f559, 0f3F3504F3; +sub.f32 f574, f1398, f573; +mul.f32 f575, f1401, 0f3F3504F3; +fma.rn.f32 f576, f559, 0fBF3504F3, f575; +mul.f32 
f1396, f563, 0f3EC3EF15; +mul.f32 f1397, f1400, 0fBF6C835E; +sub.f32 f579, f1396, f1397; +mul.f32 f580, f1400, 0f3EC3EF15; +fma.rn.f32 f581, f563, 0fBF6C835E, f580; +mul.f32 f1394, f557, 0fBEC3EF15; +mul.f32 f1395, f558, 0fBF6C835E; +sub.f32 f584, f1394, f1395; +mul.f32 f585, f558, 0fBEC3EF15; +fma.rn.f32 f586, f557, 0fBF6C835E, f585; +mul.f32 f587, f561, 0fBF3504F3; +mul.f32 f588, f562, 0fBF3504F3; +sub.f32 f589, f587, f588; +add.f32 f590, f587, f588; +mul.f32 f592, f566, 0fBEC3EF15; +mul.f32 f1393, f565, 0fBF6C835E; +sub.f32 f593, f1393, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0fBEC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1392, f1416, f571; +sub.f32 f601, f1416, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1391, f1415, f576; +sub.f32 f605, f1415, f576; +add.f32 f606, f506, f579; +sub.f32 f608, f506, f579; +add.f32 f1390, f1414, f581; +sub.f32 f609, f1414, f581; +add.f32 f610, f496, f554; +sub.f32 f612, f496, f554; +sub.f32 f1389, f497, f553; +add.f32 f613, f497, f553; +add.f32 f614, f500, f584; +sub.f32 f616, f500, f584; +add.f32 f1388, f501, f586; +sub.f32 f617, f501, f586; +add.f32 f618, f504, f589; +sub.f32 f620, f504, f589; +add.f32 f1387, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1386, f509, f595; +sub.f32 f625, f509, f595; +bfe.u32 r15, r45, 4, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f631, f627, f1392; +mul.f32 f632, f626, f1392; +mul.f32 f634, f627, f627; +mul.f32 f1385, f626, f626; +sub.f32 f635, f1385, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f639, f637, f1391; +mul.f32 f640, f635, f1391; +mul.f32 f1383, f626, f635; +mul.f32 f1384, f627, f637; +sub.f32 f643, f1383, f1384; +mul.f32 f1382, f635, f602; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f647, f645, f1390; 
+mul.f32 f648, f643, f1390; +mul.f32 f650, f627, f645; +mul.f32 f1381, f626, f643; +sub.f32 f651, f1381, f650; +mul.f32 f1380, f643, f606; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f655, f653, f1389; +mul.f32 f656, f651, f1389; +mul.f32 f658, f627, f653; +mul.f32 f1379, f626, f651; +sub.f32 f659, f1379, f658; +mul.f32 f1378, f651, f610; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f663, f661, f1388; +mul.f32 f664, f659, f1388; +mul.f32 f1376, f626, f659; +mul.f32 f1377, f627, f661; +sub.f32 f667, f1376, f1377; +mul.f32 f1375, f659, f614; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f671, f669, f1387; +mul.f32 f672, f667, f1387; +mul.f32 f674, f627, f669; +mul.f32 f1374, f626, f667; +sub.f32 f675, f1374, f674; +mul.f32 f1373, f667, f618; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f679, f677, f1386; +mul.f32 f680, f675, f1386; +mul.f32 f682, f627, f677; +mul.f32 f1372, f626, f675; +sub.f32 f683, f1372, f682; +mul.f32 f1371, f675, f622; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1370, f1417, f1403; +mul.f32 f687, f685, f1370; +mul.f32 f688, f683, f1370; +mul.f32 f1368, f626, f683; +mul.f32 f1369, f627, f685; +sub.f32 f691, f1368, f1369; +sub.f32 f1367, f494, f551; +mul.f32 f1366, f683, f1367; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f695, f693, f601; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1365, f626, f691; +sub.f32 f699, f1365, f698; +mul.f32 f1364, f691, f600; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f703, f701, f605; +mul.f32 f704, f699, f605; +mul.f32 f1362, f626, f699; +mul.f32 f1363, f627, f701; +sub.f32 f707, f1362, f1363; +mul.f32 f1361, f699, f604; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f711, f709, f609; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1360, f626, f707; +sub.f32 f715, 
f1360, f714; +mul.f32 f1359, f707, f608; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f719, f717, f613; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1358, f626, f715; +sub.f32 f723, f1358, f722; +mul.f32 f1357, f715, f612; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f727, f725, f617; +mul.f32 f728, f723, f617; +mul.f32 f1355, f626, f723; +mul.f32 f1356, f627, f725; +sub.f32 f731, f1355, f1356; +mul.f32 f1354, f723, f616; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f735, f733, f621; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1353, f626, f731; +sub.f32 f739, f1353, f738; +mul.f32 f1352, f626, f598; +mul.f32 f740, f626, f733; +mul.f32 f1351, f731, f620; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f739, f624; +mul.f32 f743, f741, f625; +mul.f32 f744, f739, f625; +mov.u32 r41, %tid.x; +shl.b32 r40, r41, 3; +and.b32 r16, r40, 120; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r46, 63488; +add.s32 r19, r17, r18; +sub.f32 f1523, f1417, f1403; +mul.f32 f1522, f685, f1523; +add.f32 f745, f1417, f1403; +sub.f32 f1518, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r52, %tid.x; +fma.rn.f32 f747, f627, f598, f632; +sub.f32 f748, f1352, f631; +st.shared.v2.f32 [r19+128], {f748, f747}; +fma.rn.f32 f749, f637, f602, f640; +sub.f32 f750, f1382, f639; +st.shared.v2.f32 [r19+256], {f750, f749}; +fma.rn.f32 f751, f645, f606, f648; +sub.f32 f752, f1380, f647; +st.shared.v2.f32 [r19+384], {f752, f751}; +fma.rn.f32 f753, f653, f610, f656; +sub.f32 f754, f1378, f655; +st.shared.v2.f32 [r19+512], {f754, f753}; +sub.f32 f755, f1375, f663; +fma.rn.f32 f756, f661, f614, f664; +st.shared.v2.f32 [r19+640], {f755, f756}; +fma.rn.f32 f757, f669, f618, f672; +sub.f32 f758, f1373, f671; +st.shared.v2.f32 [r19+768], {f758, f757}; +fma.rn.f32 f759, f677, f622, f680; +sub.f32 f760, f1371, f679; +st.shared.v2.f32 [r19+896], 
{f760, f759}; +fma.rn.f32 f761, f685, f1518, f688; +sub.f32 f762, f1366, f1522; +st.shared.v2.f32 [r19+1024], {f762, f761}; +fma.rn.f32 f763, f693, f600, f696; +sub.f32 f764, f1364, f695; +st.shared.v2.f32 [r19+1152], {f764, f763}; +fma.rn.f32 f765, f701, f604, f704; +sub.f32 f766, f1361, f703; +st.shared.v2.f32 [r19+1280], {f766, f765}; +fma.rn.f32 f767, f709, f608, f712; +sub.f32 f768, f1359, f711; +st.shared.v2.f32 [r19+1408], {f768, f767}; +fma.rn.f32 f769, f717, f612, f720; +sub.f32 f770, f1357, f719; +st.shared.v2.f32 [r19+1536], {f770, f769}; +fma.rn.f32 f771, f725, f616, f728; +sub.f32 f772, f1354, f727; +st.shared.v2.f32 [r19+1664], {f772, f771}; +fma.rn.f32 f773, f733, f620, f736; +sub.f32 f774, f1351, f735; +st.shared.v2.f32 [r19+1792], {f774, f773}; +fma.rn.f32 f775, f741, f624, f744; +sub.f32 f776, f742, f743; +st.shared.v2.f32 [r19+1920], {f776, f775}; +barrier.sync 0; +and.b32 r28, r52, 496; +mad.lo.s32 r20, r28, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+4096]; +ld.shared.v2.f32 {f785, f786}, [r20+8192]; +ld.shared.v2.f32 {f789, f790}, [r20+12288]; +ld.shared.v2.f32 {f793, f794}, [r20+16384]; +ld.shared.v2.f32 {f797, f798}, [r20+20480]; +ld.shared.v2.f32 {f801, f802}, [r20+24576]; +ld.shared.v2.f32 {f805, f806}, [r20+28672]; +ld.shared.v2.f32 {f809, f810}, [r20+32768]; +ld.shared.v2.f32 {f813, f814}, [r20+36864]; +ld.shared.v2.f32 {f817, f818}, [r20+40960]; +ld.shared.v2.f32 {f821, f822}, [r20+45056]; +ld.shared.v2.f32 {f825, f826}, [r20+49152]; +ld.shared.v2.f32 {f829, f830}, [r20+53248]; +ld.shared.v2.f32 {f833, f834}, [r20+57344]; +ld.shared.v2.f32 {f837, f838}, [r20+61440]; +add.f32 f841, f777, f809; +sub.f32 f843, f777, f809; +add.f32 f1350, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1349, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1348, f1350, f1349; +sub.f32 f852, f1350, f1349; 
+add.f32 f853, f843, f848; +sub.f32 f855, f843, f848; +sub.f32 f1347, f844, f847; +add.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1346, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1345, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1344, f1346, f1345; +sub.f32 f868, f1346, f1345; +add.f32 f869, f859, f864; +sub.f32 f871, f859, f864; +sub.f32 f1343, f860, f863; +add.f32 f872, f860, f863; +mul.f32 f1341, f869, 0f3F3504F3; +mul.f32 f1342, f1343, 0fBF3504F3; +sub.f32 f875, f1341, f1342; +mul.f32 f876, f1343, 0f3F3504F3; +fma.rn.f32 f877, f869, 0fBF3504F3, f876; +mul.f32 f878, f871, 0fBF3504F3; +mul.f32 f879, f872, 0fBF3504F3; +sub.f32 f880, f878, f879; +add.f32 f881, f878, f879; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1340, f1348, f1344; +sub.f32 f885, f1348, f1344; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1339, f1347, f877; +sub.f32 f889, f1347, f877; +add.f32 f890, f851, f868; +sub.f32 f892, f851, f868; +sub.f32 f1338, f852, f867; +add.f32 f893, f852, f867; +add.f32 f894, f855, f880; +sub.f32 f896, f855, f880; +add.f32 f1337, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1336, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1335, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1334, f1336, f1335; +sub.f32 f909, f1336, f1335; +add.f32 f910, f900, f905; +sub.f32 f912, f900, f905; +sub.f32 f1333, f901, f904; +add.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; +add.f32 f1332, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1331, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; 
+add.f32 f1330, f1332, f1331; +sub.f32 f925, f1332, f1331; +add.f32 f926, f916, f921; +sub.f32 f928, f916, f921; +sub.f32 f1329, f917, f920; +add.f32 f929, f917, f920; +mul.f32 f1327, f926, 0f3F3504F3; +mul.f32 f1328, f1329, 0fBF3504F3; +sub.f32 f932, f1327, f1328; +mul.f32 f933, f1329, 0f3F3504F3; +fma.rn.f32 f934, f926, 0fBF3504F3, f933; +mul.f32 f935, f928, 0fBF3504F3; +mul.f32 f936, f929, 0fBF3504F3; +sub.f32 f937, f935, f936; +add.f32 f938, f935, f936; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1326, f1334, f1330; +sub.f32 f942, f1334, f1330; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1325, f1333, f934; +sub.f32 f946, f1333, f934; +add.f32 f947, f908, f925; +sub.f32 f949, f908, f925; +sub.f32 f1324, f909, f924; +add.f32 f950, f909, f924; +add.f32 f951, f912, f937; +sub.f32 f953, f912, f937; +add.f32 f1323, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1325, 0fBEC3EF15; +mul.f32 f1322, f943, 0f3F6C835E; +sub.f32 f957, f1322, f956; +mul.f32 f958, f1325, 0f3F6C835E; +fma.rn.f32 f959, f943, 0fBEC3EF15, f958; +mul.f32 f961, f1324, 0fBF3504F3; +mul.f32 f1321, f947, 0f3F3504F3; +sub.f32 f962, f1321, f961; +mul.f32 f963, f1324, 0f3F3504F3; +fma.rn.f32 f964, f947, 0fBF3504F3, f963; +mul.f32 f966, f1323, 0fBF6C835E; +mul.f32 f1320, f951, 0f3EC3EF15; +sub.f32 f967, f1320, f966; +mul.f32 f968, f1323, 0f3EC3EF15; +fma.rn.f32 f969, f951, 0fBF6C835E, f968; +mul.f32 f1318, f945, 0fBEC3EF15; +mul.f32 f1319, f946, 0fBF6C835E; +sub.f32 f972, f1318, f1319; +mul.f32 f973, f946, 0fBEC3EF15; +fma.rn.f32 f974, f945, 0fBF6C835E, f973; +mul.f32 f975, f949, 0fBF3504F3; +mul.f32 f976, f950, 0fBF3504F3; +sub.f32 f977, f975, f976; +add.f32 f978, f975, f976; +mul.f32 f980, f954, 0fBEC3EF15; +mul.f32 f1317, f953, 0fBF6C835E; +sub.f32 f981, f1317, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0fBEC3EF15, f982; +add.f32 f986, f886, f957; +sub.f32 f988, f886, f957; +add.f32 f1316, f1339, f959; +sub.f32 f989, f1339, f959; 
+add.f32 f990, f890, f962; +sub.f32 f992, f890, f962; +add.f32 f1315, f1338, f964; +sub.f32 f993, f1338, f964; +add.f32 f994, f894, f967; +sub.f32 f996, f894, f967; +add.f32 f1314, f1337, f969; +sub.f32 f997, f1337, f969; +add.f32 f998, f884, f942; +sub.f32 f1000, f884, f942; +sub.f32 f1313, f885, f941; +add.f32 f1001, f885, f941; +add.f32 f1002, f888, f972; +sub.f32 f1004, f888, f972; +add.f32 f1312, f889, f974; +sub.f32 f1005, f889, f974; +add.f32 f1006, f892, f977; +sub.f32 f1008, f892, f977; +add.f32 f1311, f893, f978; +sub.f32 f1009, f893, f978; +add.f32 f1010, f896, f981; +sub.f32 f1012, f896, f981; +add.f32 f1310, f897, f983; +sub.f32 f1013, f897, f983; +and.b32 r21, r52, 256; +bfe.u32 r22, r52, 8, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1014, f1015}, [rd11]; +mul.f32 f1019, f1015, f1316; +mul.f32 f1020, f1014, f1316; +mul.f32 f1022, f1015, f1015; +mul.f32 f1309, f1014, f1014; +sub.f32 f1023, f1309, f1022; +mul.f32 f1024, f1015, f1014; +fma.rn.f32 f1025, f1015, f1014, f1024; +mul.f32 f1027, f1025, f1315; +mul.f32 f1028, f1023, f1315; +mul.f32 f1307, f1014, f1023; +mul.f32 f1308, f1015, f1025; +sub.f32 f1031, f1307, f1308; +mul.f32 f1306, f1023, f990; +mul.f32 f1032, f1014, f1025; +fma.rn.f32 f1033, f1015, f1023, f1032; +mul.f32 f1035, f1033, f1314; +mul.f32 f1036, f1031, f1314; +mul.f32 f1304, f1014, f1031; +mul.f32 f1305, f1015, f1033; +sub.f32 f1039, f1304, f1305; +mul.f32 f1303, f1031, f994; +mul.f32 f1040, f1014, f1033; +fma.rn.f32 f1041, f1015, f1031, f1040; +mul.f32 f1043, f1041, f1313; +mul.f32 f1044, f1039, f1313; +mul.f32 f1046, f1015, f1041; +mul.f32 f1302, f1014, f1039; +sub.f32 f1047, f1302, f1046; +mul.f32 f1301, f1039, f998; +mul.f32 f1048, f1014, f1041; +fma.rn.f32 f1049, f1015, f1039, f1048; +mul.f32 f1051, f1049, f1312; +mul.f32 f1052, f1047, f1312; +mul.f32 f1299, f1014, f1047; +mul.f32 f1300, f1015, f1049; +sub.f32 f1055, f1299, f1300; +mul.f32 f1298, f1047, f1002; +mul.f32 f1056, 
f1014, f1049; +fma.rn.f32 f1057, f1015, f1047, f1056; +mul.f32 f1059, f1057, f1311; +mul.f32 f1060, f1055, f1311; +mul.f32 f1062, f1015, f1057; +mul.f32 f1297, f1014, f1055; +sub.f32 f1063, f1297, f1062; +mul.f32 f1296, f1055, f1006; +mul.f32 f1064, f1014, f1057; +fma.rn.f32 f1065, f1015, f1055, f1064; +mul.f32 f1067, f1065, f1310; +mul.f32 f1068, f1063, f1310; +mul.f32 f1070, f1015, f1065; +mul.f32 f1295, f1014, f1063; +sub.f32 f1071, f1295, f1070; +mul.f32 f1294, f1063, f1010; +mul.f32 f1072, f1014, f1065; +fma.rn.f32 f1073, f1015, f1063, f1072; +sub.f32 f1293, f1340, f1326; +mul.f32 f1075, f1073, f1293; +mul.f32 f1076, f1071, f1293; +sub.f32 f1292, f882, f939; +mul.f32 f1290, f1014, f1071; +mul.f32 f1291, f1015, f1073; +sub.f32 f1079, f1290, f1291; +mul.f32 f1289, f1071, f1292; +mul.f32 f1080, f1014, f1073; +fma.rn.f32 f1081, f1015, f1071, f1080; +mul.f32 f1083, f1081, f989; +mul.f32 f1084, f1079, f989; +mul.f32 f1086, f1015, f1081; +mul.f32 f1288, f1014, f1079; +sub.f32 f1087, f1288, f1086; +mul.f32 f1287, f1079, f988; +mul.f32 f1088, f1014, f1081; +fma.rn.f32 f1089, f1015, f1079, f1088; +mul.f32 f1091, f1089, f993; +mul.f32 f1092, f1087, f993; +mul.f32 f1285, f1014, f1087; +mul.f32 f1286, f1015, f1089; +sub.f32 f1095, f1285, f1286; +mul.f32 f1284, f1087, f992; +mul.f32 f1096, f1014, f1089; +fma.rn.f32 f1097, f1015, f1087, f1096; +mul.f32 f1099, f1097, f997; +mul.f32 f1100, f1095, f997; +mul.f32 f1282, f1014, f1095; +mul.f32 f1283, f1015, f1097; +sub.f32 f1103, f1282, f1283; +mul.f32 f1281, f1095, f996; +mul.f32 f1104, f1014, f1097; +fma.rn.f32 f1105, f1015, f1095, f1104; +mul.f32 f1107, f1105, f1001; +mul.f32 f1108, f1103, f1001; +mul.f32 f1110, f1015, f1105; +mul.f32 f1280, f1014, f1103; +sub.f32 f1111, f1280, f1110; +mul.f32 f1279, f1103, f1000; +mul.f32 f1112, f1014, f1105; +fma.rn.f32 f1113, f1015, f1103, f1112; +mul.f32 f1115, f1113, f1005; +mul.f32 f1116, f1111, f1005; +mul.f32 f1277, f1014, f1111; +mul.f32 f1278, f1015, f1113; +sub.f32 f1119, f1277, 
f1278; +mul.f32 f1276, f1111, f1004; +mul.f32 f1120, f1014, f1113; +fma.rn.f32 f1121, f1015, f1111, f1120; +mul.f32 f1123, f1121, f1009; +mul.f32 f1124, f1119, f1009; +mul.f32 f1126, f1015, f1121; +mul.f32 f1275, f1014, f1119; +sub.f32 f1127, f1275, f1126; +mul.f32 f1274, f1014, f986; +mul.f32 f1128, f1014, f1121; +mul.f32 f1273, f1119, f1008; +fma.rn.f32 f1129, f1015, f1119, f1128; +mul.f32 f1130, f1127, f1012; +mul.f32 f1131, f1129, f1013; +mul.f32 f1132, f1127, f1013; +mov.u32 r34, %tid.x; +shl.b32 r33, r34, 3; +and.b32 r23, r33, 2040; +add.s32 r24, r9, r23; +mov.u32 r39, %tid.x; +shl.b32 r38, r39, 7; +barrier.sync 0; +and.b32 r25, r38, 32768; +add.s32 r26, r24, r25; +mov.u32 r43, %tid.x; +and.b32 r42, r43, 256; +add.f32 f1133, f1340, f1326; +sub.f32 f1521, f882, f939; +add.f32 f1134, f882, f939; +st.shared.v2.f32 [r26], {f1134, f1133}; +mov.u32 r49, %tid.x; +and.b32 r48, r49, 256; +fma.rn.f32 f1135, f1015, f986, f1020; +sub.f32 f1136, f1274, f1019; +st.shared.v2.f32 [r26+2048], {f1136, f1135}; +fma.rn.f32 f1137, f1025, f990, f1028; +sub.f32 f1138, f1306, f1027; +st.shared.v2.f32 [r26+4096], {f1138, f1137}; +fma.rn.f32 f1139, f1033, f994, f1036; +sub.f32 f1140, f1303, f1035; +st.shared.v2.f32 [r26+6144], {f1140, f1139}; +fma.rn.f32 f1141, f1041, f998, f1044; +sub.f32 f1142, f1301, f1043; +st.shared.v2.f32 [r26+8192], {f1142, f1141}; +sub.f32 f1143, f1298, f1051; +fma.rn.f32 f1144, f1049, f1002, f1052; +st.shared.v2.f32 [r26+10240], {f1143, f1144}; +fma.rn.f32 f1145, f1057, f1006, f1060; +sub.f32 f1146, f1296, f1059; +st.shared.v2.f32 [r26+12288], {f1146, f1145}; +fma.rn.f32 f1147, f1065, f1010, f1068; +sub.f32 f1148, f1294, f1067; +st.shared.v2.f32 [r26+14336], {f1148, f1147}; +fma.rn.f32 f1149, f1073, f1521, f1076; +sub.f32 f1150, f1289, f1075; +st.shared.v2.f32 [r26+16384], {f1150, f1149}; +fma.rn.f32 f1151, f1081, f988, f1084; +sub.f32 f1152, f1287, f1083; +st.shared.v2.f32 [r26+18432], {f1152, f1151}; +fma.rn.f32 f1153, f1089, f992, f1092; +sub.f32 f1154, 
f1284, f1091; +st.shared.v2.f32 [r26+20480], {f1154, f1153}; +fma.rn.f32 f1155, f1097, f996, f1100; +sub.f32 f1156, f1281, f1099; +st.shared.v2.f32 [r26+22528], {f1156, f1155}; +fma.rn.f32 f1157, f1105, f1000, f1108; +sub.f32 f1158, f1279, f1107; +st.shared.v2.f32 [r26+24576], {f1158, f1157}; +fma.rn.f32 f1159, f1113, f1004, f1116; +sub.f32 f1160, f1276, f1115; +st.shared.v2.f32 [r26+26624], {f1160, f1159}; +fma.rn.f32 f1161, f1121, f1008, f1124; +sub.f32 f1162, f1273, f1123; +st.shared.v2.f32 [r26+28672], {f1162, f1161}; +fma.rn.f32 f1163, f1129, f1012, f1132; +sub.f32 f1164, f1130, f1131; +st.shared.v2.f32 [r26+30720], {f1164, f1163}; +barrier.sync 0; +mad.lo.s32 r27, r48, -120, r26; +ld.shared.v2.f32 {f1165, f1166}, [r27]; +ld.shared.v2.f32 {f1169, f1170}, [r27+4096]; +ld.shared.v2.f32 {f1173, f1174}, [r27+8192]; +ld.shared.v2.f32 {f1177, f1178}, [r27+12288]; +ld.shared.v2.f32 {f1181, f1182}, [r27+16384]; +ld.shared.v2.f32 {f1185, f1186}, [r27+20480]; +ld.shared.v2.f32 {f1189, f1190}, [r27+24576]; +ld.shared.v2.f32 {f1193, f1194}, [r27+28672]; +ld.shared.v2.f32 {f1197, f1198}, [r27+32768]; +ld.shared.v2.f32 {f1201, f1202}, [r27+36864]; +ld.shared.v2.f32 {f1205, f1206}, [r27+40960]; +ld.shared.v2.f32 {f1209, f1210}, [r27+45056]; +ld.shared.v2.f32 {f1213, f1214}, [r27+49152]; +ld.shared.v2.f32 {f1217, f1218}, [r27+53248]; +ld.shared.v2.f32 {f1221, f1222}, [r27+57344]; +ld.shared.v2.f32 {f1225, f1226}, [r27+61440]; +add.f32 %0, f1165, f1197; +add.f32 %1, f1166, f1198; +add.f32 %3, f1170, f1202; +add.f32 %2, f1169, f1201; +add.f32 %5, f1174, f1206; +add.f32 %4, f1173, f1205; +add.f32 %7, f1178, f1210; +add.f32 %6, f1177, f1209; +add.f32 %8, f1181, f1213; +add.f32 %9, f1182, f1214; +add.f32 %10, f1185, f1217; +add.f32 %11, f1186, f1218; +add.f32 %12, f1189, f1221; +add.f32 %13, f1190, f1222; +add.f32 %15, f1194, f1226; +add.f32 %14, f1193, f1225; +sub.f32 %17, f1166, f1198; +sub.f32 %16, f1165, f1197; +sub.f32 %19, f1170, f1202; +sub.f32 %18, f1169, f1201; +sub.f32 
%21, f1174, f1206; +sub.f32 %20, f1173, f1205; +sub.f32 %23, f1178, f1210; +sub.f32 %22, f1177, f1209; +sub.f32 %25, f1182, f1214; +sub.f32 %24, f1181, f1213; +sub.f32 %27, f1186, f1218; +sub.f32 %26, f1185, f1217; +sub.f32 %29, f1190, f1222; +sub.f32 %28, f1189, f1221; +sub.f32 %31, f1194, f1226; +sub.f32 %30, f1193, f1225; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_8192), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<114, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<617>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %21, %31; 
+add.f32 f34, %22, %33; +sub.f32 f35, %21, %31; +sub.f32 f36, %22, %33; +add.f32 f37, %26, %37; +add.f32 f38, %28, %38; +sub.f32 f39, %26, %37; +sub.f32 f40, %28, %38; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %23, %34; +add.f32 f50, %25, %36; +sub.f32 f51, %23, %34; +sub.f32 f52, %25, %36; +add.f32 f53, %29, %39; +add.f32 f54, %30, %40; +sub.f32 f55, %29, %39; +sub.f32 f56, %30, %40; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f69; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f69; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +sub.f32 f84, f43, f60; +add.f32 f85, f44, f59; +add.f32 f86, f47, f72; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f72; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f90, f78; +mul.f32 f95, f91, f79; +sub.f32 f96, f94, f95; +mul.f32 f97, f90, f79; +fma.rn.f32 f98, f91, f78, f97; +mul.f32 f99, f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f101, f82; +mul.f32 f105, f103, f83; +sub.f32 f106, f104, f105; +mul.f32 f107, f101, f83; +fma.rn.f32 f108, f103, f82, f107; +mul.f32 f109, f90, f101; 
+mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f111, f86; +mul.f32 f115, f113, f87; +sub.f32 f116, f114, f115; +mul.f32 f117, f111, f87; +fma.rn.f32 f118, f113, f86, f117; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f121, f76; +mul.f32 f125, f123, f77; +sub.f32 f126, f124, f125; +mul.f32 f127, f121, f77; +fma.rn.f32 f128, f123, f76, f127; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f131, f80; +mul.f32 f135, f133, f81; +sub.f32 f136, f134, f135; +mul.f32 f137, f131, f81; +fma.rn.f32 f138, f133, f80, f137; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f141, f84; +mul.f32 f145, f143, f85; +sub.f32 f146, f144, f145; +mul.f32 f147, f141, f85; +fma.rn.f32 f148, f143, f84, f147; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f151, f88; +mul.f32 f155, f153, f89; +sub.f32 f156, f154, f155; +mul.f32 f157, f151, f89; +fma.rn.f32 f158, f153, f88, f157; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32736; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f96, f106, f116}; +st.shared.v4.f32 [r12+16], {f126, f136, f146, f156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, [r13+4096]; +ld.shared.f32 f161, [r13+8192]; +ld.shared.f32 f162, [r13+12288]; +ld.shared.f32 f163, [r13+16384]; +ld.shared.f32 f164, [r13+20480]; +ld.shared.f32 f165, [r13+24576]; +ld.shared.f32 f166, [r13+28672]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; 
+st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+4096]; +ld.shared.f32 f169, [r13+8192]; +ld.shared.f32 f170, [r13+12288]; +ld.shared.f32 f171, [r13+16384]; +ld.shared.f32 f172, [r13+20480]; +ld.shared.f32 f173, [r13+24576]; +ld.shared.f32 f174, [r13+28672]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +add.f32 f187, f177, f182; +sub.f32 f188, f178, f181; +sub.f32 f189, f177, f182; +add.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0fBF3504F3; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, 0f3F3504F3; +fma.rn.f32 f211, f203, 0fBF3504F3, f210; +mul.f32 f212, f205, 0fBF3504F3; +mul.f32 f213, f206, 0fBF3504F3; +sub.f32 f214, f212, f213; +add.f32 f215, f212, f213; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f211; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f211; +add.f32 f224, f185, f202; +sub.f32 f225, f186, f201; +sub.f32 f226, f185, f202; +add.f32 f227, f186, f201; +add.f32 f228, f189, f214; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f214; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 1016; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, 
rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f232, f220; +mul.f32 f237, f233, f221; +sub.f32 f238, f236, f237; +mul.f32 f239, f232, f221; +fma.rn.f32 f240, f233, f220, f239; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f243, f224; +mul.f32 f247, f245, f225; +sub.f32 f248, f246, f247; +mul.f32 f249, f243, f225; +fma.rn.f32 f250, f245, f224, f249; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f253, f228; +mul.f32 f257, f255, f229; +sub.f32 f258, f256, f257; +mul.f32 f259, f253, f229; +fma.rn.f32 f260, f255, f228, f259; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f263, f218; +mul.f32 f267, f265, f219; +sub.f32 f268, f266, f267; +mul.f32 f269, f263, f219; +fma.rn.f32 f270, f265, f218, f269; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f273, f222; +mul.f32 f277, f275, f223; +sub.f32 f278, f276, f277; +mul.f32 f279, f273, f223; +fma.rn.f32 f280, f275, f222, f279; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f283, f226; +mul.f32 f287, f285, f227; +sub.f32 f288, f286, f287; +mul.f32 f289, f283, f227; +fma.rn.f32 f290, f285, f226, f289; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; +mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f293, f230; +mul.f32 f297, f295, f231; +sub.f32 f298, f296, f297; +mul.f32 f299, f293, f231; +fma.rn.f32 f300, f295, f230, f299; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; 
+barrier.sync 0; +and.b32 r18, r8, 32512; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f238; +st.shared.f32 [r19+64], f248; +st.shared.f32 [r19+96], f258; +st.shared.f32 [r19+128], f268; +st.shared.f32 [r19+160], f278; +st.shared.f32 [r19+192], f288; +st.shared.f32 [r19+224], f298; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+4096]; +ld.shared.f32 f303, [r20+8192]; +ld.shared.f32 f304, [r20+12288]; +ld.shared.f32 f305, [r20+16384]; +ld.shared.f32 f306, [r20+20480]; +ld.shared.f32 f307, [r20+24576]; +ld.shared.f32 f308, [r20+28672]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+4096]; +ld.shared.f32 f311, [r20+8192]; +ld.shared.f32 f312, [r20+12288]; +ld.shared.f32 f313, [r20+16384]; +ld.shared.f32 f314, [r20+20480]; +ld.shared.f32 f315, [r20+24576]; +ld.shared.f32 f316, [r20+28672]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +add.f32 f329, f319, f324; +sub.f32 f330, f320, f323; +sub.f32 f331, f319, f324; +add.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +add.f32 f345, f335, f340; +sub.f32 f346, f336, f339; +sub.f32 f347, 
f335, f340; +add.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0fBF3504F3; +sub.f32 f351, f349, f350; +mul.f32 f352, f346, 0f3F3504F3; +fma.rn.f32 f353, f345, 0fBF3504F3, f352; +mul.f32 f354, f347, 0fBF3504F3; +mul.f32 f355, f348, 0fBF3504F3; +sub.f32 f356, f354, f355; +add.f32 f357, f354, f355; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f353; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f353; +add.f32 f366, f327, f344; +sub.f32 f367, f328, f343; +sub.f32 f368, f327, f344; +add.f32 f369, f328, f343; +add.f32 f370, f331, f356; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f356; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 960; +bfe.u32 r22, r5, 6, 4; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f374, f362; +mul.f32 f379, f375, f363; +sub.f32 f380, f378, f379; +mul.f32 f381, f374, f363; +fma.rn.f32 f382, f375, f362, f381; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f385, f366; +mul.f32 f389, f387, f367; +sub.f32 f390, f388, f389; +mul.f32 f391, f385, f367; +fma.rn.f32 f392, f387, f366, f391; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f395, f370; +mul.f32 f399, f397, f371; +sub.f32 f400, f398, f399; +mul.f32 f401, f395, f371; +fma.rn.f32 f402, f397, f370, f401; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; +mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f405, f360; +mul.f32 f409, f407, f361; +sub.f32 f410, f408, f409; +mul.f32 f411, f405, f361; +fma.rn.f32 f412, f407, f360, f411; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, 
f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f415, f364; +mul.f32 f419, f417, f365; +sub.f32 f420, f418, f419; +mul.f32 f421, f415, f365; +fma.rn.f32 f422, f417, f364, f421; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f425, f368; +mul.f32 f429, f427, f369; +sub.f32 f430, f428, f429; +mul.f32 f431, f425, f369; +fma.rn.f32 f432, f427, f368, f431; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f435, f372; +mul.f32 f439, f437, f373; +sub.f32 f440, f438, f439; +mul.f32 f441, f435, f373; +fma.rn.f32 f442, f437, f372, f441; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 30720; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f380; +st.shared.f32 [r26+512], f390; +st.shared.f32 [r26+768], f400; +st.shared.f32 [r26+1024], f410; +st.shared.f32 [r26+1280], f420; +st.shared.f32 [r26+1536], f430; +st.shared.f32 [r26+1792], f440; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+4096]; +ld.shared.f32 f445, [r27+8192]; +ld.shared.f32 f446, [r27+12288]; +ld.shared.f32 f447, [r27+16384]; +ld.shared.f32 f448, [r27+20480]; +ld.shared.f32 f449, [r27+24576]; +ld.shared.f32 f450, [r27+28672]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; +st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+4096]; +ld.shared.f32 f453, [r27+8192]; +ld.shared.f32 f454, [r27+12288]; +ld.shared.f32 f455, [r27+16384]; +ld.shared.f32 f456, [r27+20480]; +ld.shared.f32 f457, [r27+24576]; 
+ld.shared.f32 f458, [r27+28672]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; +sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f459, f463; +add.f32 f468, f460, f464; +sub.f32 f469, f459, f463; +sub.f32 f470, f460, f464; +add.f32 f471, f461, f466; +sub.f32 f472, f462, f465; +sub.f32 f473, f461, f466; +add.f32 f474, f462, f465; +add.f32 f475, f444, f448; +add.f32 f476, f452, f456; +sub.f32 f477, f444, f448; +sub.f32 f478, f452, f456; +add.f32 f479, f446, f450; +add.f32 f480, f454, f458; +sub.f32 f481, f446, f450; +sub.f32 f482, f454, f458; +add.f32 f483, f475, f479; +add.f32 f484, f476, f480; +sub.f32 f485, f475, f479; +sub.f32 f486, f476, f480; +add.f32 f487, f477, f482; +sub.f32 f488, f478, f481; +sub.f32 f489, f477, f482; +add.f32 f490, f478, f481; +mul.f32 f491, f487, 0f3F3504F3; +mul.f32 f492, f488, 0fBF3504F3; +sub.f32 f493, f491, f492; +mul.f32 f494, f488, 0f3F3504F3; +fma.rn.f32 f495, f487, 0fBF3504F3, f494; +mul.f32 f496, f489, 0fBF3504F3; +mul.f32 f497, f490, 0fBF3504F3; +sub.f32 f498, f496, f497; +add.f32 f499, f496, f497; +add.f32 f500, f467, f483; +add.f32 f501, f468, f484; +sub.f32 f502, f467, f483; +sub.f32 f503, f468, f484; +add.f32 f504, f471, f493; +add.f32 f505, f472, f495; +sub.f32 f506, f471, f493; +sub.f32 f507, f472, f495; +add.f32 f508, f469, f486; +sub.f32 f509, f470, f485; +sub.f32 f510, f469, f486; +add.f32 f511, f470, f485; +add.f32 f512, f473, f498; +add.f32 f513, f474, f499; +sub.f32 f514, f473, f498; +sub.f32 f515, f474, f499; +and.b32 r28, r5, 512; +bfe.u32 r29, r5, 9, 1; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f516, f517}, [rd14]; +mul.f32 f520, f516, f504; +mul.f32 f521, f517, f505; +sub.f32 f522, f520, f521; +mul.f32 f523, f516, f505; +fma.rn.f32 f524, f517, f504, f523; +mul.f32 f525, f516, f516; +mul.f32 f526, f517, f517; +sub.f32 f527, f525, 
f526; +mul.f32 f528, f517, f516; +fma.rn.f32 f529, f517, f516, f528; +mul.f32 f530, f527, f508; +mul.f32 f531, f529, f509; +sub.f32 f532, f530, f531; +mul.f32 f533, f527, f509; +fma.rn.f32 f534, f529, f508, f533; +mul.f32 f535, f516, f527; +mul.f32 f536, f517, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f516, f529; +fma.rn.f32 f539, f517, f527, f538; +mul.f32 f540, f537, f512; +mul.f32 f541, f539, f513; +sub.f32 f542, f540, f541; +mul.f32 f543, f537, f513; +fma.rn.f32 f544, f539, f512, f543; +mul.f32 f545, f516, f537; +mul.f32 f546, f517, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f516, f539; +fma.rn.f32 f549, f517, f537, f548; +mul.f32 f550, f547, f502; +mul.f32 f551, f549, f503; +sub.f32 f552, f550, f551; +mul.f32 f553, f547, f503; +fma.rn.f32 f554, f549, f502, f553; +mul.f32 f555, f516, f547; +mul.f32 f556, f517, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f516, f549; +fma.rn.f32 f559, f517, f547, f558; +mul.f32 f560, f557, f506; +mul.f32 f561, f559, f507; +sub.f32 f562, f560, f561; +mul.f32 f563, f557, f507; +fma.rn.f32 f564, f559, f506, f563; +mul.f32 f565, f516, f557; +mul.f32 f566, f517, f559; +sub.f32 f567, f565, f566; +mul.f32 f568, f516, f559; +fma.rn.f32 f569, f517, f557, f568; +mul.f32 f570, f567, f510; +mul.f32 f571, f569, f511; +sub.f32 f572, f570, f571; +mul.f32 f573, f567, f511; +fma.rn.f32 f574, f569, f510, f573; +mul.f32 f575, f516, f567; +mul.f32 f576, f517, f569; +sub.f32 f577, f575, f576; +mul.f32 f578, f516, f569; +fma.rn.f32 f579, f517, f567, f578; +mul.f32 f580, f577, f514; +mul.f32 f581, f579, f515; +sub.f32 f582, f580, f581; +mul.f32 f583, f577, f515; +fma.rn.f32 f584, f579, f514, f583; +and.b32 r30, r15, 2044; +add.s32 r31, r10, r30; +barrier.sync 0; +and.b32 r32, r8, 16384; +add.s32 r33, r31, r32; +st.shared.f32 [r33], f500; +st.shared.f32 [r33+2048], f522; +st.shared.f32 [r33+4096], f532; +st.shared.f32 [r33+6144], f542; +st.shared.f32 [r33+8192], f552; +st.shared.f32 [r33+10240], f562; +st.shared.f32 [r33+12288], f572; 
+st.shared.f32 [r33+14336], f582; +barrier.sync 0; +mad.lo.s32 r34, r28, -28, r33; +ld.shared.f32 f585, [r34]; +ld.shared.f32 f586, [r34+4096]; +ld.shared.f32 f587, [r34+8192]; +ld.shared.f32 f588, [r34+12288]; +ld.shared.f32 f589, [r34+16384]; +ld.shared.f32 f590, [r34+20480]; +ld.shared.f32 f591, [r34+24576]; +ld.shared.f32 f592, [r34+28672]; +barrier.sync 0; +st.shared.f32 [r33], f501; +st.shared.f32 [r33+2048], f524; +st.shared.f32 [r33+4096], f534; +st.shared.f32 [r33+6144], f544; +st.shared.f32 [r33+8192], f554; +st.shared.f32 [r33+10240], f564; +st.shared.f32 [r33+12288], f574; +st.shared.f32 [r33+14336], f584; +barrier.sync 0; +ld.shared.f32 f593, [r34]; +ld.shared.f32 f594, [r34+4096]; +ld.shared.f32 f595, [r34+8192]; +ld.shared.f32 f596, [r34+12288]; +ld.shared.f32 f597, [r34+16384]; +ld.shared.f32 f598, [r34+20480]; +ld.shared.f32 f599, [r34+24576]; +ld.shared.f32 f600, [r34+28672]; +add.f32 %0, f585, f589; +add.f32 %1, f593, f597; +add.f32 %2, f586, f590; +add.f32 %3, f594, f598; +add.f32 %4, f587, f591; +add.f32 %5, f595, f599; +add.f32 %6, f588, f592; +add.f32 %7, f596, f600; +sub.f32 %8, f585, f589; +sub.f32 %9, f593, f597; +sub.f32 %10, f586, f590; +sub.f32 %11, f594, f598; +sub.f32 %12, f587, f591; +sub.f32 %13, f595, f599; +sub.f32 %14, f588, f592; +sub.f32 %15, f596, f600; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_8192), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), 
"f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<112, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<681>; +.reg .b32 r<34>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %21, %31; +add.f32 f34, %22, %33; +sub.f32 f35, %21, %31; +sub.f32 f36, %22, %33; +add.f32 f37, %26, %37; +add.f32 f38, %28, %38; +sub.f32 f39, %26, %37; +sub.f32 f40, %28, %38; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %23, %34; +add.f32 f50, %25, %36; +sub.f32 f51, %23, %34; +sub.f32 f52, %25, %36; +add.f32 f53, %29, %39; +add.f32 f54, %30, %40; +sub.f32 f55, %29, %39; +sub.f32 f56, %30, %40; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f69; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f69; +add.f32 f80, f43, f60; +sub.f32 f81, f44, f59; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f47, f72; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f72; +sub.f32 f87, f48, f73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f88, f76; +mul.f32 f93, f89, f77; 
+mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f97, f80; +mul.f32 f101, f99, f81; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f105, f84; +mul.f32 f109, f107, f85; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f113, f74; +mul.f32 f117, f115, f75; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f121, f78; +mul.f32 f125, f123, f79; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f129, f82; +mul.f32 f133, f131, f83; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f137, f86; +mul.f32 f141, f139, f87; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 65472; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f89, f76, f94; +sub.f32 f146, f92, f93; +st.shared.v4.f32 [r12], {f144, f143, f146, f145}; +fma.rn.f32 f147, f99, f80, f102; +sub.f32 f148, f100, f101; +sub.f32 f149, f108, f109; +fma.rn.f32 f150, f107, f84, f110; +st.shared.v4.f32 [r12+16], {f148, f147, f149, f150}; +fma.rn.f32 f151, f115, f74, f118; +sub.f32 f152, f116, f117; +fma.rn.f32 f153, f123, f78, f126; +sub.f32 f154, f124, f125; +st.shared.v4.f32 [r12+32], {f152, f151, f154, f153}; +fma.rn.f32 f155, f131, f82, f134; +sub.f32 f156, f132, f133; +fma.rn.f32 f157, f139, f86, f142; +sub.f32 
f158, f140, f141; +st.shared.v4.f32 [r12+48], {f156, f155, f158, f157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+8192]; +ld.shared.v2.f32 {f167, f168}, [r13+16384]; +ld.shared.v2.f32 {f171, f172}, [r13+24576]; +ld.shared.v2.f32 {f175, f176}, [r13+32768]; +ld.shared.v2.f32 {f179, f180}, [r13+40960]; +ld.shared.v2.f32 {f183, f184}, [r13+49152]; +ld.shared.v2.f32 {f187, f188}, [r13+57344]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +add.f32 f203, f193, f198; +sub.f32 f204, f194, f197; +sub.f32 f205, f193, f198; +add.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +add.f32 f219, f209, f214; +sub.f32 f220, f210, f213; +sub.f32 f221, f209, f214; +add.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0fBF3504F3; +sub.f32 f225, f223, f224; +mul.f32 f226, f220, 0f3F3504F3; +fma.rn.f32 f227, f219, 0fBF3504F3, f226; +mul.f32 f228, f221, 0fBF3504F3; +mul.f32 f229, f222, 0fBF3504F3; +sub.f32 f230, f228, f229; +add.f32 f231, f228, f229; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 f235, f204, f227; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f227; +add.f32 f238, f201, f218; +sub.f32 f239, f202, f217; +sub.f32 f240, f201, f218; +add.f32 f241, f202, f217; +add.f32 f242, f205, f230; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f230; +sub.f32 f245, f206, 
f231; +and.b32 r14, r5, 1016; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f246, f234; +mul.f32 f251, f247, f235; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f255, f238; +mul.f32 f259, f257, f239; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f263, f242; +mul.f32 f267, f265, f243; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f271, f232; +mul.f32 f275, f273, f233; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f279, f236; +mul.f32 f283, f281, f237; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f287, f240; +mul.f32 f291, f289, f241; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f295, f244; +mul.f32 f299, f297, f245; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 65024; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f247, f234, f252; +sub.f32 f304, f250, f251; +st.shared.v2.f32 [r18+64], {f304, f303}; +fma.rn.f32 f305, f257, f238, f260; +sub.f32 f306, f258, f259; +st.shared.v2.f32 [r18+128], {f306, f305}; +fma.rn.f32 f307, 
f265, f242, f268; +sub.f32 f308, f266, f267; +st.shared.v2.f32 [r18+192], {f308, f307}; +sub.f32 f309, f274, f275; +fma.rn.f32 f310, f273, f232, f276; +st.shared.v2.f32 [r18+256], {f309, f310}; +fma.rn.f32 f311, f281, f236, f284; +sub.f32 f312, f282, f283; +st.shared.v2.f32 [r18+320], {f312, f311}; +fma.rn.f32 f313, f289, f240, f292; +sub.f32 f314, f290, f291; +st.shared.v2.f32 [r18+384], {f314, f313}; +fma.rn.f32 f315, f297, f244, f300; +sub.f32 f316, f298, f299; +st.shared.v2.f32 [r18+448], {f316, f315}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+8192]; +ld.shared.v2.f32 {f325, f326}, [r19+16384]; +ld.shared.v2.f32 {f329, f330}, [r19+24576]; +ld.shared.v2.f32 {f333, f334}, [r19+32768]; +ld.shared.v2.f32 {f337, f338}, [r19+40960]; +ld.shared.v2.f32 {f341, f342}, [r19+49152]; +ld.shared.v2.f32 {f345, f346}, [r19+57344]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +add.f32 f361, f351, f356; +sub.f32 f362, f352, f355; +sub.f32 f363, f351, f356; +add.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +add.f32 f377, f367, f372; +sub.f32 f378, f368, f371; +sub.f32 f379, f367, f372; +add.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0fBF3504F3; +sub.f32 f383, f381, f382; +mul.f32 f384, f378, 0f3F3504F3; +fma.rn.f32 f385, f377, 0fBF3504F3, f384; +mul.f32 f386, f379, 0fBF3504F3; +mul.f32 f387, f380, 
0fBF3504F3; +sub.f32 f388, f386, f387; +add.f32 f389, f386, f387; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f385; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f385; +add.f32 f396, f359, f376; +sub.f32 f397, f360, f375; +sub.f32 f398, f359, f376; +add.f32 f399, f360, f375; +add.f32 f400, f363, f388; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f388; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 960; +bfe.u32 r21, r5, 6, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f404, f392; +mul.f32 f409, f405, f393; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f413, f396; +mul.f32 f417, f415, f397; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f421, f400; +mul.f32 f425, f423, f401; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f429, f390; +mul.f32 f433, f431, f391; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f437, f394; +mul.f32 f441, f439, f395; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, f405, f437, f446; +mul.f32 f448, f445, f398; +mul.f32 f449, f447, f399; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f453, f402; +mul.f32 f457, 
f455, f403; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 61440; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f405, f392, f410; +sub.f32 f462, f408, f409; +st.shared.v2.f32 [r25+512], {f462, f461}; +fma.rn.f32 f463, f415, f396, f418; +sub.f32 f464, f416, f417; +st.shared.v2.f32 [r25+1024], {f464, f463}; +fma.rn.f32 f465, f423, f400, f426; +sub.f32 f466, f424, f425; +st.shared.v2.f32 [r25+1536], {f466, f465}; +sub.f32 f467, f432, f433; +fma.rn.f32 f468, f431, f390, f434; +st.shared.v2.f32 [r25+2048], {f467, f468}; +fma.rn.f32 f469, f439, f394, f442; +sub.f32 f470, f440, f441; +st.shared.v2.f32 [r25+2560], {f470, f469}; +fma.rn.f32 f471, f447, f398, f450; +sub.f32 f472, f448, f449; +st.shared.v2.f32 [r25+3072], {f472, f471}; +fma.rn.f32 f473, f455, f402, f458; +sub.f32 f474, f456, f457; +st.shared.v2.f32 [r25+3584], {f474, f473}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+8192]; +ld.shared.v2.f32 {f483, f484}, [r26+16384]; +ld.shared.v2.f32 {f487, f488}, [r26+24576]; +ld.shared.v2.f32 {f491, f492}, [r26+32768]; +ld.shared.v2.f32 {f495, f496}, [r26+40960]; +ld.shared.v2.f32 {f499, f500}, [r26+49152]; +ld.shared.v2.f32 {f503, f504}, [r26+57344]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f507, f511; +add.f32 f516, f508, f512; +sub.f32 f517, f507, f511; +sub.f32 f518, f508, f512; +add.f32 f519, f509, f514; +sub.f32 f520, f510, f513; +sub.f32 f521, f509, f514; +add.f32 f522, f510, f513; +add.f32 f523, f479, f495; +add.f32 f524, f480, f496; +sub.f32 f525, f479, f495; +sub.f32 f526, f480, f496; +add.f32 f527, f487, f503; +add.f32 f528, f488, f504; +sub.f32 
f529, f487, f503; +sub.f32 f530, f488, f504; +add.f32 f531, f523, f527; +add.f32 f532, f524, f528; +sub.f32 f533, f523, f527; +sub.f32 f534, f524, f528; +add.f32 f535, f525, f530; +sub.f32 f536, f526, f529; +sub.f32 f537, f525, f530; +add.f32 f538, f526, f529; +mul.f32 f539, f535, 0f3F3504F3; +mul.f32 f540, f536, 0fBF3504F3; +sub.f32 f541, f539, f540; +mul.f32 f542, f536, 0f3F3504F3; +fma.rn.f32 f543, f535, 0fBF3504F3, f542; +mul.f32 f544, f537, 0fBF3504F3; +mul.f32 f545, f538, 0fBF3504F3; +sub.f32 f546, f544, f545; +add.f32 f547, f544, f545; +sub.f32 f548, f515, f531; +sub.f32 f549, f516, f532; +add.f32 f550, f519, f541; +add.f32 f551, f520, f543; +sub.f32 f552, f519, f541; +sub.f32 f553, f520, f543; +add.f32 f554, f517, f534; +sub.f32 f555, f518, f533; +sub.f32 f556, f517, f534; +add.f32 f557, f518, f533; +add.f32 f558, f521, f546; +add.f32 f559, f522, f547; +sub.f32 f560, f521, f546; +sub.f32 f561, f522, f547; +and.b32 r27, r5, 512; +bfe.u32 r28, r5, 9, 1; +mul.wide.u32 rd12, r28, 8; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f562, f563}, [rd14]; +mul.f32 f566, f562, f550; +mul.f32 f567, f563, f551; +mul.f32 f568, f562, f551; +mul.f32 f569, f562, f562; +mul.f32 f570, f563, f563; +sub.f32 f571, f569, f570; +mul.f32 f572, f563, f562; +fma.rn.f32 f573, f563, f562, f572; +mul.f32 f574, f571, f554; +mul.f32 f575, f573, f555; +mul.f32 f576, f571, f555; +mul.f32 f577, f562, f571; +mul.f32 f578, f563, f573; +sub.f32 f579, f577, f578; +mul.f32 f580, f562, f573; +fma.rn.f32 f581, f563, f571, f580; +mul.f32 f582, f579, f558; +mul.f32 f583, f581, f559; +mul.f32 f584, f579, f559; +mul.f32 f585, f562, f579; +mul.f32 f586, f563, f581; +sub.f32 f587, f585, f586; +mul.f32 f588, f562, f581; +fma.rn.f32 f589, f563, f579, f588; +mul.f32 f590, f587, f548; +mul.f32 f591, f589, f549; +mul.f32 f592, f587, f549; +mul.f32 f593, f562, f587; +mul.f32 f594, f563, f589; +sub.f32 f595, f593, f594; +mul.f32 f596, f562, f589; +fma.rn.f32 f597, f563, f587, f596; +mul.f32 
f598, f595, f552; +mul.f32 f599, f597, f553; +mul.f32 f600, f595, f553; +mul.f32 f601, f562, f595; +mul.f32 f602, f563, f597; +sub.f32 f603, f601, f602; +mul.f32 f604, f562, f597; +fma.rn.f32 f605, f563, f595, f604; +mul.f32 f606, f603, f556; +mul.f32 f607, f605, f557; +mul.f32 f608, f603, f557; +mul.f32 f609, f562, f603; +mul.f32 f610, f563, f605; +sub.f32 f611, f609, f610; +mul.f32 f612, f562, f605; +fma.rn.f32 f613, f563, f603, f612; +mul.f32 f614, f611, f560; +mul.f32 f615, f613, f561; +mul.f32 f616, f611, f561; +and.b32 r29, r10, 4088; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 32768; +add.s32 r32, r30, r31; +add.f32 f617, f516, f532; +add.f32 f618, f515, f531; +st.shared.v2.f32 [r32], {f618, f617}; +fma.rn.f32 f619, f563, f550, f568; +sub.f32 f620, f566, f567; +st.shared.v2.f32 [r32+4096], {f620, f619}; +fma.rn.f32 f621, f573, f554, f576; +sub.f32 f622, f574, f575; +st.shared.v2.f32 [r32+8192], {f622, f621}; +fma.rn.f32 f623, f581, f558, f584; +sub.f32 f624, f582, f583; +st.shared.v2.f32 [r32+12288], {f624, f623}; +sub.f32 f625, f590, f591; +fma.rn.f32 f626, f589, f548, f592; +st.shared.v2.f32 [r32+16384], {f625, f626}; +fma.rn.f32 f627, f597, f552, f600; +sub.f32 f628, f598, f599; +st.shared.v2.f32 [r32+20480], {f628, f627}; +fma.rn.f32 f629, f605, f556, f608; +sub.f32 f630, f606, f607; +st.shared.v2.f32 [r32+24576], {f630, f629}; +fma.rn.f32 f631, f613, f560, f616; +sub.f32 f632, f614, f615; +st.shared.v2.f32 [r32+28672], {f632, f631}; +barrier.sync 0; +mad.lo.s32 r33, r27, -56, r32; +ld.shared.v2.f32 {f633, f634}, [r33]; +ld.shared.v2.f32 {f637, f638}, [r33+8192]; +ld.shared.v2.f32 {f641, f642}, [r33+16384]; +ld.shared.v2.f32 {f645, f646}, [r33+24576]; +ld.shared.v2.f32 {f649, f650}, [r33+32768]; +ld.shared.v2.f32 {f653, f654}, [r33+40960]; +ld.shared.v2.f32 {f657, f658}, [r33+49152]; +ld.shared.v2.f32 {f661, f662}, [r33+57344]; +add.f32 %1, f634, f650; +add.f32 %0, f633, f649; +add.f32 %3, f638, f654; +add.f32 %2, f637, f653; +add.f32 %5, 
f642, f658; +add.f32 %4, f641, f657; +add.f32 %7, f646, f662; +add.f32 %6, f645, f661; +sub.f32 %9, f634, f650; +sub.f32 %8, f633, f649; +sub.f32 %11, f638, f654; +sub.f32 %10, f637, f653; +sub.f32 %13, f642, f658; +sub.f32 %12, f641, f657; +sub.f32 %15, f646, f662; +sub.f32 %14, f645, f661; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_8192), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..a5058d367f041 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp32_inv.hpp.inc @@ -0,0 +1,7775 @@ +#ifndef CUFFTDX_FFT_8192_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_8192_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<312, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2668>; +.reg .b32 r<38>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2660, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, 
%115; +sub.f32 f135, %83, %115; +add.f32 f2658, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2657, f2660, f2658; +sub.f32 f140, f2660, f2658; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2656, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2653, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2651, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2650, f2653, f2651; +sub.f32 f156, f2653, f2651; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2649, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2649, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2647, f159, 0fBF3504F3; +mul.f32 f2648, f160, 0f3F3504F3; +sub.f32 f167, f2647, f2648; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2646, f2657, f2650; +sub.f32 f173, f2657, f2650; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2645, f2656, f164; +sub.f32 f177, f2656, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2644, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2643, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, %71, %103; +add.f32 f2641, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2638, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2637, f2641, f2638; +sub.f32 f197, f2641, f2638; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2636, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2634, %80, %139; 
+sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2632, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2631, f2634, f2632; +sub.f32 f213, f2634, f2632; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2630, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2630, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2628, f216, 0fBF3504F3; +mul.f32 f2629, f217, 0f3F3504F3; +sub.f32 f224, f2628, f2629; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2627, f2637, f2631; +sub.f32 f230, f2637, f2631; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2626, f2636, f221; +sub.f32 f234, f2636, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2625, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2624, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2622, f231, 0f3F6C835E; +mul.f32 f2623, f2626, 0f3EC3EF15; +sub.f32 f245, f2622, f2623; +mul.f32 f246, f2626, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2625, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2624, 0f3F6C835E; +mul.f32 f2621, f239, 0f3EC3EF15; +sub.f32 f254, f2621, f253; +mul.f32 f255, f2624, 0f3EC3EF15; +fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2620, f233, 0fBEC3EF15; +sub.f32 f259, f2620, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2618, f237, 0fBF3504F3; +mul.f32 f2619, f238, 0f3F3504F3; +sub.f32 f264, f2618, f2619; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2616, f241, 0fBF6C835E; +mul.f32 f2617, f242, 0f3EC3EF15; +sub.f32 f269, f2616, f2617; 
+mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2615, f2646, f2627; +sub.f32 f275, f2646, f2627; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2614, f2645, f247; +sub.f32 f279, f2645, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2613, f2644, f251; +sub.f32 f283, f2644, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2612, f2643, f256; +sub.f32 f287, f2643, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2611, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2610, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2609, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2608, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2605, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2603, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2602, f2605, f2603; +sub.f32 f315, f2605, f2603; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2601, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2599, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, %125; +sub.f32 f326, %93, %125; +add.f32 f2596, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2595, f2599, f2596; +sub.f32 f331, f2599, f2596; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2594, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2594, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2593, 
f334, 0fBF3504F3; +sub.f32 f342, f2593, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2592, f2602, f2595; +sub.f32 f348, f2602, f2595; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2591, f2601, f339; +sub.f32 f352, f2601, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2590, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2589, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2587, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2585, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2584, f2587, f2585; +sub.f32 f372, f2587, f2585; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2583, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2580, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2579, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2578, f2580, f2579; +sub.f32 f388, f2580, f2579; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2577, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2577, 0f3F3504F3; +sub.f32 f395, f393, f394; +add.f32 f396, f393, f394; +mul.f32 f2575, f391, 0fBF3504F3; +mul.f32 f2576, f392, 0f3F3504F3; +sub.f32 f399, f2575, f2576; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2574, f2584, f2578; +sub.f32 f405, f2584, f2578; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2573, f2583, f396; +sub.f32 f409, f2583, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, 
f388; +add.f32 f2572, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2571, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2573, 0f3EC3EF15; +mul.f32 f2570, f406, 0f3F6C835E; +sub.f32 f420, f2570, f419; +mul.f32 f421, f2573, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2572, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2571, 0f3F6C835E; +mul.f32 f2569, f414, 0f3EC3EF15; +sub.f32 f429, f2569, f428; +mul.f32 f430, f2571, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2568, f408, 0fBEC3EF15; +sub.f32 f434, f2568, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2567, f412, 0fBF3504F3; +sub.f32 f439, f2567, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2566, f416, 0fBF6C835E; +sub.f32 f444, f2566, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2565, f2592, f2574; +sub.f32 f450, f2592, f2574; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2564, f2591, f422; +sub.f32 f454, f2591, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2563, f2590, f426; +sub.f32 f458, f2590, f426; +add.f32 f459, f357, f429; +sub.f32 f461, f357, f429; +add.f32 f2562, f2589, f431; +sub.f32 f462, f2589, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2561, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2560, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2559, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2558, f360, f446; 
+sub.f32 f478, f360, f446; +mul.f32 f480, f2564, 0f3E47C5C2; +mul.f32 f2557, f451, 0f3F7B14BE; +sub.f32 f481, f2557, f480; +mul.f32 f482, f2564, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2563, 0f3EC3EF15; +mul.f32 f2556, f455, 0f3F6C835E; +sub.f32 f486, f2556, f485; +mul.f32 f487, f2563, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2562, 0f3F0E39DA; +mul.f32 f2555, f459, 0f3F54DB31; +sub.f32 f491, f2555, f490; +mul.f32 f492, f2562, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2561, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2560, 0f3F54DB31; +mul.f32 f2554, f467, 0f3F0E39DA; +sub.f32 f500, f2554, f499; +mul.f32 f501, f2560, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2559, 0f3F6C835E; +mul.f32 f2553, f471, 0f3EC3EF15; +sub.f32 f505, f2553, f504; +mul.f32 f506, f2559, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2558, 0f3F7B14BE; +mul.f32 f2552, f475, 0f3E47C5C2; +sub.f32 f510, f2552, f509; +mul.f32 f511, f2558, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2551, f453, 0fBE47C5C2; +sub.f32 f515, f2551, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2549, f457, 0fBEC3EF15; +mul.f32 f2550, f458, 0f3F6C835E; +sub.f32 f520, f2549, f2550; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; +mul.f32 f2547, f461, 0fBF0E39DA; +mul.f32 f2548, f462, 0f3F54DB31; +sub.f32 f525, f2547, f2548; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2545, f465, 0fBF3504F3; +mul.f32 f2546, f466, 0f3F3504F3; +sub.f32 f530, f2545, f2546; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2543, f469, 0fBF54DB31; +mul.f32 f2544, f470, 0f3F0E39DA; +sub.f32 f535, f2543, f2544; +mul.f32 f536, f470, 0fBF54DB31; 
+fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2542, f473, 0fBF6C835E; +sub.f32 f540, f2542, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2541, f477, 0fBF7B14BE; +sub.f32 f545, f2541, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f550, f276, f481; +sub.f32 f552, f276, f481; +add.f32 f2540, f2614, f483; +sub.f32 f553, f2614, f483; +add.f32 f554, f280, f486; +sub.f32 f556, f280, f486; +add.f32 f2539, f2613, f488; +sub.f32 f557, f2613, f488; +add.f32 f558, f284, f491; +sub.f32 f560, f284, f491; +add.f32 f2538, f2612, f493; +sub.f32 f561, f2612, f493; +add.f32 f562, f288, f496; +sub.f32 f564, f288, f496; +add.f32 f2537, f2611, f497; +sub.f32 f565, f2611, f497; +add.f32 f566, f292, f500; +sub.f32 f568, f292, f500; +add.f32 f2536, f2610, f502; +sub.f32 f569, f2610, f502; +add.f32 f570, f296, f505; +sub.f32 f572, f296, f505; +add.f32 f2535, f2609, f507; +sub.f32 f573, f2609, f507; +add.f32 f574, f300, f510; +sub.f32 f576, f300, f510; +add.f32 f2534, f2608, f512; +sub.f32 f577, f2608, f512; +sub.f32 f578, f274, f450; +add.f32 f580, f274, f450; +add.f32 f2533, f275, f449; +sub.f32 f581, f275, f449; +add.f32 f582, f278, f515; +sub.f32 f584, f278, f515; +add.f32 f2532, f279, f517; +sub.f32 f585, f279, f517; +add.f32 f586, f282, f520; +sub.f32 f588, f282, f520; +add.f32 f2531, f283, f522; +sub.f32 f589, f283, f522; +add.f32 f590, f286, f525; +sub.f32 f592, f286, f525; +add.f32 f2530, f287, f527; +sub.f32 f593, f287, f527; +add.f32 f594, f290, f530; +sub.f32 f596, f290, f530; +add.f32 f2529, f291, f532; +sub.f32 f597, f291, f532; +add.f32 f598, f294, f535; +sub.f32 f600, f294, f535; +add.f32 f2528, f295, f537; +sub.f32 f601, f295, f537; +add.f32 f602, f298, f540; +sub.f32 f604, f298, f540; +add.f32 f2527, f299, f542; +sub.f32 f605, f299, f542; +add.f32 f606, f302, f545; +sub.f32 f608, f302, f545; +add.f32 f2526, f303, 
f547; +sub.f32 f609, f303, f547; +mov.u32 r22, %tid.x; +shl.b32 r7, r22, 8; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r22, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f610, f611}, [rd5]; +mul.f32 f614, f2540, f611; +mul.f32 f616, f610, f2540; +mul.f32 f618, f611, f611; +mul.f32 f2525, f610, f610; +sub.f32 f619, f2525, f618; +mul.f32 f620, f611, f610; +fma.rn.f32 f621, f611, f610, f620; +mul.f32 f622, f2539, f621; +mul.f32 f624, f619, f2539; +mul.f32 f626, f611, f621; +mul.f32 f2524, f610, f619; +sub.f32 f627, f2524, f626; +mul.f32 f2523, f554, f621; +mul.f32 f628, f610, f621; +fma.rn.f32 f629, f611, f619, f628; +mul.f32 f630, f2538, f629; +mul.f32 f632, f627, f2538; +mul.f32 f2521, f610, f627; +mul.f32 f2522, f611, f629; +sub.f32 f635, f2521, f2522; +mul.f32 f2520, f558, f629; +mul.f32 f636, f610, f629; +fma.rn.f32 f637, f611, f627, f636; +mul.f32 f638, f2537, f637; +mul.f32 f640, f635, f2537; +mul.f32 f642, f611, f637; +mul.f32 f2519, f610, f635; +sub.f32 f643, f2519, f642; +mul.f32 f2518, f562, f637; +mul.f32 f644, f610, f637; +fma.rn.f32 f645, f611, f635, f644; +mul.f32 f646, f2536, f645; +mul.f32 f648, f643, f2536; +mul.f32 f2516, f610, f643; +mul.f32 f2517, f611, f645; +sub.f32 f651, f2516, f2517; +mul.f32 f2515, f566, f645; +mul.f32 f652, f610, f645; +fma.rn.f32 f653, f611, f643, f652; +mul.f32 f654, f2535, f653; +mul.f32 f656, f651, f2535; +mul.f32 f658, f611, f653; +mul.f32 f2514, f610, f651; +sub.f32 f659, f2514, f658; +mul.f32 f2513, f570, f653; +mul.f32 f660, f610, f653; +fma.rn.f32 f661, f611, f651, f660; +mul.f32 f662, f2534, f661; +mul.f32 f664, f659, f2534; +mul.f32 f666, f611, f661; +mul.f32 f2512, f610, f659; +sub.f32 f667, f2512, f666; +mul.f32 f2511, f574, f661; +mul.f32 f668, f610, f661; +fma.rn.f32 f669, f611, f659, f668; +mul.f32 f670, f2533, f669; +mul.f32 f672, f667, f2533; +mul.f32 f2509, f610, f667; +mul.f32 f2510, f611, f669; +sub.f32 f675, f2509, 
f2510; +mul.f32 f2508, f578, f669; +mul.f32 f676, f610, f669; +fma.rn.f32 f677, f611, f667, f676; +mul.f32 f678, f2532, f677; +mul.f32 f680, f675, f2532; +mul.f32 f682, f611, f677; +mul.f32 f2507, f610, f675; +sub.f32 f683, f2507, f682; +mul.f32 f2506, f582, f677; +mul.f32 f684, f610, f677; +fma.rn.f32 f685, f611, f675, f684; +mul.f32 f686, f2531, f685; +mul.f32 f688, f683, f2531; +mul.f32 f690, f611, f685; +mul.f32 f2505, f610, f683; +sub.f32 f691, f2505, f690; +mul.f32 f2504, f586, f685; +mul.f32 f692, f610, f685; +fma.rn.f32 f693, f611, f683, f692; +mul.f32 f694, f2530, f693; +mul.f32 f696, f691, f2530; +mul.f32 f2502, f610, f691; +mul.f32 f2503, f611, f693; +sub.f32 f699, f2502, f2503; +mul.f32 f2501, f590, f693; +mul.f32 f700, f610, f693; +fma.rn.f32 f701, f611, f691, f700; +mul.f32 f702, f2529, f701; +mul.f32 f704, f699, f2529; +mul.f32 f706, f611, f701; +mul.f32 f2500, f610, f699; +sub.f32 f707, f2500, f706; +mul.f32 f2499, f594, f701; +mul.f32 f708, f610, f701; +fma.rn.f32 f709, f611, f699, f708; +mul.f32 f710, f2528, f709; +mul.f32 f712, f707, f2528; +mul.f32 f2497, f610, f707; +mul.f32 f2498, f611, f709; +sub.f32 f715, f2497, f2498; +mul.f32 f2496, f598, f709; +mul.f32 f716, f610, f709; +fma.rn.f32 f717, f611, f707, f716; +mul.f32 f718, f2527, f717; +mul.f32 f720, f715, f2527; +mul.f32 f722, f611, f717; +mul.f32 f2495, f610, f715; +sub.f32 f723, f2495, f722; +mul.f32 f2494, f602, f717; +mul.f32 f724, f610, f717; +fma.rn.f32 f725, f611, f715, f724; +mul.f32 f726, f2526, f725; +mul.f32 f728, f723, f2526; +mul.f32 f730, f611, f725; +mul.f32 f2493, f610, f723; +sub.f32 f731, f2493, f730; +mul.f32 f2492, f606, f725; +mul.f32 f732, f610, f725; +fma.rn.f32 f733, f611, f723, f732; +sub.f32 f2491, f2615, f2565; +mul.f32 f734, f2491, f733; +mul.f32 f736, f731, f2491; +mul.f32 f2489, f610, f731; +mul.f32 f2490, f611, f733; +sub.f32 f739, f2489, f2490; +sub.f32 f2488, f272, f447; +mul.f32 f2487, f2488, f733; +mul.f32 f740, f610, f733; +fma.rn.f32 f741, f611, f731, 
f740; +mul.f32 f742, f553, f741; +mul.f32 f744, f739, f553; +mul.f32 f746, f611, f741; +mul.f32 f2486, f610, f739; +sub.f32 f747, f2486, f746; +mul.f32 f2485, f552, f741; +mul.f32 f748, f610, f741; +fma.rn.f32 f749, f611, f739, f748; +mul.f32 f750, f557, f749; +mul.f32 f752, f747, f557; +mul.f32 f754, f611, f749; +mul.f32 f2484, f610, f747; +sub.f32 f755, f2484, f754; +mul.f32 f2483, f556, f749; +mul.f32 f756, f610, f749; +fma.rn.f32 f757, f611, f747, f756; +mul.f32 f758, f561, f757; +mul.f32 f760, f755, f561; +mul.f32 f2481, f610, f755; +mul.f32 f2482, f611, f757; +sub.f32 f763, f2481, f2482; +mul.f32 f2480, f560, f757; +mul.f32 f764, f610, f757; +fma.rn.f32 f765, f611, f755, f764; +mul.f32 f766, f565, f765; +mul.f32 f768, f763, f565; +mul.f32 f770, f611, f765; +mul.f32 f2479, f610, f763; +sub.f32 f771, f2479, f770; +mul.f32 f2478, f564, f765; +mul.f32 f772, f610, f765; +fma.rn.f32 f773, f611, f763, f772; +mul.f32 f774, f569, f773; +mul.f32 f776, f771, f569; +mul.f32 f2476, f610, f771; +mul.f32 f2477, f611, f773; +sub.f32 f779, f2476, f2477; +mul.f32 f2475, f568, f773; +mul.f32 f780, f610, f773; +fma.rn.f32 f781, f611, f771, f780; +mul.f32 f782, f573, f781; +mul.f32 f784, f779, f573; +mul.f32 f786, f611, f781; +mul.f32 f2474, f610, f779; +sub.f32 f787, f2474, f786; +mul.f32 f2473, f572, f781; +mul.f32 f788, f610, f781; +fma.rn.f32 f789, f611, f779, f788; +mul.f32 f790, f577, f789; +mul.f32 f792, f787, f577; +mul.f32 f794, f611, f789; +mul.f32 f2472, f610, f787; +sub.f32 f795, f2472, f794; +mul.f32 f2471, f576, f789; +mul.f32 f796, f610, f789; +fma.rn.f32 f797, f611, f787, f796; +mul.f32 f798, f581, f797; +mul.f32 f800, f795, f581; +mul.f32 f2469, f610, f795; +mul.f32 f2470, f611, f797; +sub.f32 f803, f2469, f2470; +mul.f32 f2468, f580, f797; +mul.f32 f804, f610, f797; +fma.rn.f32 f805, f611, f795, f804; +mul.f32 f806, f585, f805; +mul.f32 f808, f803, f585; +mul.f32 f810, f611, f805; +mul.f32 f2467, f610, f803; +sub.f32 f811, f2467, f810; +mul.f32 f2466, f584, 
f805; +mul.f32 f812, f610, f805; +fma.rn.f32 f813, f611, f803, f812; +mul.f32 f814, f589, f813; +mul.f32 f816, f811, f589; +mul.f32 f818, f611, f813; +mul.f32 f2465, f610, f811; +sub.f32 f819, f2465, f818; +mul.f32 f2464, f588, f813; +mul.f32 f820, f610, f813; +fma.rn.f32 f821, f611, f811, f820; +mul.f32 f822, f593, f821; +mul.f32 f824, f819, f593; +mul.f32 f2462, f610, f819; +mul.f32 f2463, f611, f821; +sub.f32 f827, f2462, f2463; +mul.f32 f2461, f592, f821; +mul.f32 f828, f610, f821; +fma.rn.f32 f829, f611, f819, f828; +mul.f32 f830, f597, f829; +mul.f32 f832, f827, f597; +mul.f32 f834, f611, f829; +mul.f32 f2460, f610, f827; +sub.f32 f835, f2460, f834; +mul.f32 f2459, f596, f829; +mul.f32 f836, f610, f829; +fma.rn.f32 f837, f611, f827, f836; +mul.f32 f838, f601, f837; +mul.f32 f840, f835, f601; +mul.f32 f2457, f610, f835; +mul.f32 f2458, f611, f837; +sub.f32 f843, f2457, f2458; +mul.f32 f2456, f600, f837; +mul.f32 f844, f610, f837; +fma.rn.f32 f845, f611, f835, f844; +mul.f32 f846, f605, f845; +mul.f32 f848, f843, f605; +mul.f32 f850, f611, f845; +mul.f32 f2455, f610, f843; +sub.f32 f851, f2455, f850; +mul.f32 f2454, f604, f845; +mul.f32 f852, f610, f845; +mul.f32 f2453, f550, f611; +fma.rn.f32 f853, f611, f843, f852; +mul.f32 f854, f609, f853; +mul.f32 f855, f608, f853; +mul.f32 f856, f851, f609; +barrier.sync 0; +and.b32 r11, r7, 65280; +add.s32 r12, r9, r11; +add.f32 f857, f2615, f2565; +mov.u32 r30, %tid.x; +shl.b32 r29, r30, 3; +sub.f32 f2667, f272, f447; +add.f32 f858, f272, f447; +mov.u32 r28, %tid.x; +shl.b32 r24, r28, 8; +mov.u32 r35, %tid.x; +mov.u32 r32, %tid.x; +shl.b32 r31, r32, 3; +fma.rn.f32 f859, f610, f550, f614; +sub.f32 f860, f616, f2453; +st.shared.v4.f32 [r12], {f858, f857, f859, f860}; +fma.rn.f32 f861, f619, f554, f622; +sub.f32 f862, f624, f2523; +fma.rn.f32 f863, f627, f558, f630; +sub.f32 f864, f632, f2520; +st.shared.v4.f32 [r12+16], {f861, f862, f863, f864}; +fma.rn.f32 f865, f635, f562, f638; +sub.f32 f866, f640, f2518; +sub.f32 
f867, f648, f2515; +fma.rn.f32 f868, f643, f566, f646; +st.shared.v4.f32 [r12+32], {f865, f866, f868, f867}; +fma.rn.f32 f869, f651, f570, f654; +sub.f32 f870, f656, f2513; +fma.rn.f32 f871, f659, f574, f662; +sub.f32 f872, f664, f2511; +st.shared.v4.f32 [r12+48], {f869, f870, f871, f872}; +fma.rn.f32 f873, f667, f578, f670; +sub.f32 f874, f672, f2508; +fma.rn.f32 f875, f675, f582, f678; +sub.f32 f876, f680, f2506; +st.shared.v4.f32 [r12+64], {f873, f874, f875, f876}; +fma.rn.f32 f877, f683, f586, f686; +sub.f32 f878, f688, f2504; +fma.rn.f32 f879, f691, f590, f694; +sub.f32 f880, f696, f2501; +st.shared.v4.f32 [r12+80], {f877, f878, f879, f880}; +fma.rn.f32 f881, f699, f594, f702; +sub.f32 f882, f704, f2499; +fma.rn.f32 f883, f707, f598, f710; +sub.f32 f884, f712, f2496; +st.shared.v4.f32 [r12+96], {f881, f882, f883, f884}; +fma.rn.f32 f885, f715, f602, f718; +sub.f32 f886, f720, f2494; +fma.rn.f32 f887, f723, f606, f726; +sub.f32 f888, f728, f2492; +st.shared.v4.f32 [r12+112], {f885, f886, f887, f888}; +fma.rn.f32 f889, f731, f2667, f734; +sub.f32 f890, f736, f2487; +fma.rn.f32 f891, f739, f552, f742; +sub.f32 f892, f744, f2485; +st.shared.v4.f32 [r12+128], {f889, f890, f891, f892}; +fma.rn.f32 f893, f747, f556, f750; +sub.f32 f894, f752, f2483; +fma.rn.f32 f895, f755, f560, f758; +sub.f32 f896, f760, f2480; +st.shared.v4.f32 [r12+144], {f893, f894, f895, f896}; +fma.rn.f32 f897, f763, f564, f766; +sub.f32 f898, f768, f2478; +fma.rn.f32 f899, f771, f568, f774; +sub.f32 f900, f776, f2475; +st.shared.v4.f32 [r12+160], {f897, f898, f899, f900}; +fma.rn.f32 f901, f779, f572, f782; +sub.f32 f902, f784, f2473; +fma.rn.f32 f903, f787, f576, f790; +sub.f32 f904, f792, f2471; +st.shared.v4.f32 [r12+176], {f901, f902, f903, f904}; +fma.rn.f32 f905, f795, f580, f798; +sub.f32 f906, f800, f2468; +fma.rn.f32 f907, f803, f584, f806; +sub.f32 f908, f808, f2466; +st.shared.v4.f32 [r12+192], {f905, f906, f907, f908}; +fma.rn.f32 f909, f811, f588, f814; +sub.f32 f910, f816, f2464; 
+fma.rn.f32 f911, f819, f592, f822; +sub.f32 f912, f824, f2461; +st.shared.v4.f32 [r12+208], {f909, f910, f911, f912}; +fma.rn.f32 f913, f827, f596, f830; +sub.f32 f914, f832, f2459; +fma.rn.f32 f915, f835, f600, f838; +sub.f32 f916, f840, f2456; +st.shared.v4.f32 [r12+224], {f913, f914, f915, f916}; +fma.rn.f32 f917, f843, f604, f846; +sub.f32 f918, f848, f2454; +fma.rn.f32 f919, f851, f608, f854; +sub.f32 f920, f856, f855; +st.shared.v4.f32 [r12+240], {f917, f918, f919, f920}; +barrier.sync 0; +and.b32 r21, r35, 255; +mad.lo.s32 r13, r21, -248, r12; +ld.shared.v2.f32 {f921, f922}, [r13]; +ld.shared.v2.f32 {f925, f926}, [r13+2048]; +ld.shared.v2.f32 {f929, f930}, [r13+4096]; +ld.shared.v2.f32 {f933, f934}, [r13+6144]; +ld.shared.v2.f32 {f937, f938}, [r13+8192]; +ld.shared.v2.f32 {f941, f942}, [r13+10240]; +ld.shared.v2.f32 {f945, f946}, [r13+12288]; +ld.shared.v2.f32 {f949, f950}, [r13+14336]; +ld.shared.v2.f32 {f953, f954}, [r13+16384]; +ld.shared.v2.f32 {f957, f958}, [r13+18432]; +ld.shared.v2.f32 {f961, f962}, [r13+20480]; +ld.shared.v2.f32 {f965, f966}, [r13+22528]; +ld.shared.v2.f32 {f969, f970}, [r13+24576]; +ld.shared.v2.f32 {f973, f974}, [r13+26624]; +ld.shared.v2.f32 {f977, f978}, [r13+28672]; +ld.shared.v2.f32 {f981, f982}, [r13+30720]; +ld.shared.v2.f32 {f985, f986}, [r13+32768]; +ld.shared.v2.f32 {f989, f990}, [r13+34816]; +ld.shared.v2.f32 {f993, f994}, [r13+36864]; +ld.shared.v2.f32 {f997, f998}, [r13+38912]; +ld.shared.v2.f32 {f1001, f1002}, [r13+40960]; +ld.shared.v2.f32 {f1005, f1006}, [r13+43008]; +ld.shared.v2.f32 {f1009, f1010}, [r13+45056]; +ld.shared.v2.f32 {f1013, f1014}, [r13+47104]; +ld.shared.v2.f32 {f1017, f1018}, [r13+49152]; +ld.shared.v2.f32 {f1021, f1022}, [r13+51200]; +ld.shared.v2.f32 {f1025, f1026}, [r13+53248]; +ld.shared.v2.f32 {f1029, f1030}, [r13+55296]; +ld.shared.v2.f32 {f1033, f1034}, [r13+57344]; +ld.shared.v2.f32 {f1037, f1038}, [r13+59392]; +ld.shared.v2.f32 {f1041, f1042}, [r13+61440]; +ld.shared.v2.f32 {f1045, f1046}, 
[r13+63488]; +add.f32 f1049, f921, f985; +sub.f32 f1051, f921, f985; +add.f32 f2452, f922, f986; +sub.f32 f1052, f922, f986; +add.f32 f1053, f953, f1017; +sub.f32 f1055, f953, f1017; +add.f32 f2451, f954, f1018; +sub.f32 f1056, f954, f1018; +add.f32 f1057, f1049, f1053; +sub.f32 f1059, f1049, f1053; +add.f32 f2450, f2452, f2451; +sub.f32 f1060, f2452, f2451; +sub.f32 f1061, f1051, f1056; +add.f32 f1063, f1051, f1056; +add.f32 f2449, f1052, f1055; +sub.f32 f1064, f1052, f1055; +add.f32 f1065, f937, f1001; +sub.f32 f1067, f937, f1001; +add.f32 f2448, f938, f1002; +sub.f32 f1068, f938, f1002; +add.f32 f1069, f969, f1033; +sub.f32 f1071, f969, f1033; +add.f32 f2447, f970, f1034; +sub.f32 f1072, f970, f1034; +add.f32 f1073, f1065, f1069; +sub.f32 f1075, f1065, f1069; +add.f32 f2446, f2448, f2447; +sub.f32 f1076, f2448, f2447; +sub.f32 f1077, f1067, f1072; +add.f32 f1079, f1067, f1072; +add.f32 f2445, f1068, f1071; +sub.f32 f1080, f1068, f1071; +mul.f32 f1081, f1077, 0f3F3504F3; +mul.f32 f1082, f2445, 0f3F3504F3; +sub.f32 f1083, f1081, f1082; +add.f32 f1084, f1081, f1082; +mul.f32 f2443, f1079, 0fBF3504F3; +mul.f32 f2444, f1080, 0f3F3504F3; +sub.f32 f1087, f2443, f2444; +mul.f32 f1088, f1080, 0fBF3504F3; +fma.rn.f32 f1089, f1079, 0f3F3504F3, f1088; +add.f32 f1090, f1057, f1073; +sub.f32 f1092, f1057, f1073; +add.f32 f2442, f2450, f2446; +sub.f32 f1093, f2450, f2446; +add.f32 f1094, f1061, f1083; +sub.f32 f1096, f1061, f1083; +add.f32 f2441, f2449, f1084; +sub.f32 f1097, f2449, f1084; +sub.f32 f1098, f1059, f1076; +add.f32 f1100, f1059, f1076; +add.f32 f2440, f1060, f1075; +sub.f32 f1101, f1060, f1075; +add.f32 f1102, f1063, f1087; +sub.f32 f1104, f1063, f1087; +add.f32 f2439, f1064, f1089; +sub.f32 f1105, f1064, f1089; +add.f32 f1106, f929, f993; +sub.f32 f1108, f929, f993; +add.f32 f2438, f930, f994; +sub.f32 f1109, f930, f994; +add.f32 f1110, f961, f1025; +sub.f32 f1112, f961, f1025; +add.f32 f2437, f962, f1026; +sub.f32 f1113, f962, f1026; +add.f32 f1114, f1106, 
f1110; +sub.f32 f1116, f1106, f1110; +add.f32 f2436, f2438, f2437; +sub.f32 f1117, f2438, f2437; +sub.f32 f1118, f1108, f1113; +add.f32 f1120, f1108, f1113; +add.f32 f2435, f1109, f1112; +sub.f32 f1121, f1109, f1112; +add.f32 f1122, f945, f1009; +sub.f32 f1124, f945, f1009; +add.f32 f2434, f946, f1010; +sub.f32 f1125, f946, f1010; +add.f32 f1126, f977, f1041; +sub.f32 f1128, f977, f1041; +add.f32 f2433, f978, f1042; +sub.f32 f1129, f978, f1042; +add.f32 f1130, f1122, f1126; +sub.f32 f1132, f1122, f1126; +add.f32 f2432, f2434, f2433; +sub.f32 f1133, f2434, f2433; +sub.f32 f1134, f1124, f1129; +add.f32 f1136, f1124, f1129; +add.f32 f2431, f1125, f1128; +sub.f32 f1137, f1125, f1128; +mul.f32 f1138, f1134, 0f3F3504F3; +mul.f32 f1139, f2431, 0f3F3504F3; +sub.f32 f1140, f1138, f1139; +add.f32 f1141, f1138, f1139; +mul.f32 f2429, f1136, 0fBF3504F3; +mul.f32 f2430, f1137, 0f3F3504F3; +sub.f32 f1144, f2429, f2430; +mul.f32 f1145, f1137, 0fBF3504F3; +fma.rn.f32 f1146, f1136, 0f3F3504F3, f1145; +add.f32 f1147, f1114, f1130; +sub.f32 f1149, f1114, f1130; +add.f32 f2428, f2436, f2432; +sub.f32 f1150, f2436, f2432; +add.f32 f1151, f1118, f1140; +sub.f32 f1153, f1118, f1140; +add.f32 f2427, f2435, f1141; +sub.f32 f1154, f2435, f1141; +sub.f32 f1155, f1116, f1133; +add.f32 f1157, f1116, f1133; +add.f32 f2426, f1117, f1132; +sub.f32 f1158, f1117, f1132; +add.f32 f1159, f1120, f1144; +sub.f32 f1161, f1120, f1144; +add.f32 f2425, f1121, f1146; +sub.f32 f1162, f1121, f1146; +mul.f32 f2423, f1151, 0f3F6C835E; +mul.f32 f2424, f2427, 0f3EC3EF15; +sub.f32 f1165, f2423, f2424; +mul.f32 f1166, f2427, 0f3F6C835E; +fma.rn.f32 f1167, f1151, 0f3EC3EF15, f1166; +mul.f32 f1168, f1155, 0f3F3504F3; +mul.f32 f1169, f2426, 0f3F3504F3; +sub.f32 f1170, f1168, f1169; +add.f32 f1171, f1168, f1169; +mul.f32 f2421, f1159, 0f3EC3EF15; +mul.f32 f2422, f2425, 0f3F6C835E; +sub.f32 f1174, f2421, f2422; +mul.f32 f1175, f2425, 0f3EC3EF15; +fma.rn.f32 f1176, f1159, 0f3F6C835E, f1175; +mul.f32 f2419, f1153, 
0fBEC3EF15; +mul.f32 f2420, f1154, 0f3F6C835E; +sub.f32 f1179, f2419, f2420; +mul.f32 f1180, f1154, 0fBEC3EF15; +fma.rn.f32 f1181, f1153, 0f3F6C835E, f1180; +mul.f32 f2417, f1157, 0fBF3504F3; +mul.f32 f2418, f1158, 0f3F3504F3; +sub.f32 f1184, f2417, f2418; +mul.f32 f1185, f1158, 0fBF3504F3; +fma.rn.f32 f1186, f1157, 0f3F3504F3, f1185; +mul.f32 f2415, f1161, 0fBF6C835E; +mul.f32 f2416, f1162, 0f3EC3EF15; +sub.f32 f1189, f2415, f2416; +mul.f32 f1190, f1162, 0fBF6C835E; +fma.rn.f32 f1191, f1161, 0f3EC3EF15, f1190; +add.f32 f1192, f1090, f1147; +sub.f32 f1194, f1090, f1147; +add.f32 f2414, f2442, f2428; +sub.f32 f1195, f2442, f2428; +add.f32 f1196, f1094, f1165; +sub.f32 f1198, f1094, f1165; +add.f32 f2413, f2441, f1167; +sub.f32 f1199, f2441, f1167; +add.f32 f1200, f1098, f1170; +sub.f32 f1202, f1098, f1170; +add.f32 f2412, f2440, f1171; +sub.f32 f1203, f2440, f1171; +add.f32 f1204, f1102, f1174; +sub.f32 f1206, f1102, f1174; +add.f32 f2411, f2439, f1176; +sub.f32 f1207, f2439, f1176; +sub.f32 f1208, f1092, f1150; +add.f32 f1210, f1092, f1150; +add.f32 f2410, f1093, f1149; +sub.f32 f1211, f1093, f1149; +add.f32 f1212, f1096, f1179; +sub.f32 f1214, f1096, f1179; +add.f32 f2409, f1097, f1181; +sub.f32 f1215, f1097, f1181; +add.f32 f1216, f1100, f1184; +sub.f32 f1218, f1100, f1184; +add.f32 f2408, f1101, f1186; +sub.f32 f1219, f1101, f1186; +add.f32 f1220, f1104, f1189; +sub.f32 f1222, f1104, f1189; +add.f32 f2407, f1105, f1191; +sub.f32 f1223, f1105, f1191; +add.f32 f1224, f925, f989; +sub.f32 f1226, f925, f989; +add.f32 f2406, f926, f990; +sub.f32 f1227, f926, f990; +add.f32 f1228, f957, f1021; +sub.f32 f1230, f957, f1021; +add.f32 f2405, f958, f1022; +sub.f32 f1231, f958, f1022; +add.f32 f1232, f1224, f1228; +sub.f32 f1234, f1224, f1228; +add.f32 f2404, f2406, f2405; +sub.f32 f1235, f2406, f2405; +sub.f32 f1236, f1226, f1231; +add.f32 f1238, f1226, f1231; +add.f32 f2403, f1227, f1230; +sub.f32 f1239, f1227, f1230; +add.f32 f1240, f941, f1005; +sub.f32 f1242, f941, 
f1005; +add.f32 f2402, f942, f1006; +sub.f32 f1243, f942, f1006; +add.f32 f1244, f973, f1037; +sub.f32 f1246, f973, f1037; +add.f32 f2401, f974, f1038; +sub.f32 f1247, f974, f1038; +add.f32 f1248, f1240, f1244; +sub.f32 f1250, f1240, f1244; +add.f32 f2400, f2402, f2401; +sub.f32 f1251, f2402, f2401; +sub.f32 f1252, f1242, f1247; +add.f32 f1254, f1242, f1247; +add.f32 f2399, f1243, f1246; +sub.f32 f1255, f1243, f1246; +mul.f32 f1256, f1252, 0f3F3504F3; +mul.f32 f1257, f2399, 0f3F3504F3; +sub.f32 f1258, f1256, f1257; +add.f32 f1259, f1256, f1257; +mul.f32 f2397, f1254, 0fBF3504F3; +mul.f32 f2398, f1255, 0f3F3504F3; +sub.f32 f1262, f2397, f2398; +mul.f32 f1263, f1255, 0fBF3504F3; +fma.rn.f32 f1264, f1254, 0f3F3504F3, f1263; +add.f32 f1265, f1232, f1248; +sub.f32 f1267, f1232, f1248; +add.f32 f2396, f2404, f2400; +sub.f32 f1268, f2404, f2400; +add.f32 f1269, f1236, f1258; +sub.f32 f1271, f1236, f1258; +add.f32 f2395, f2403, f1259; +sub.f32 f1272, f2403, f1259; +sub.f32 f1273, f1234, f1251; +add.f32 f1275, f1234, f1251; +add.f32 f2394, f1235, f1250; +sub.f32 f1276, f1235, f1250; +add.f32 f1277, f1238, f1262; +sub.f32 f1279, f1238, f1262; +add.f32 f2393, f1239, f1264; +sub.f32 f1280, f1239, f1264; +add.f32 f1281, f933, f997; +sub.f32 f1283, f933, f997; +add.f32 f2392, f934, f998; +sub.f32 f1284, f934, f998; +add.f32 f1285, f965, f1029; +sub.f32 f1287, f965, f1029; +add.f32 f2391, f966, f1030; +sub.f32 f1288, f966, f1030; +add.f32 f1289, f1281, f1285; +sub.f32 f1291, f1281, f1285; +add.f32 f2390, f2392, f2391; +sub.f32 f1292, f2392, f2391; +sub.f32 f1293, f1283, f1288; +add.f32 f1295, f1283, f1288; +add.f32 f2389, f1284, f1287; +sub.f32 f1296, f1284, f1287; +add.f32 f1297, f949, f1013; +sub.f32 f1299, f949, f1013; +add.f32 f2388, f950, f1014; +sub.f32 f1300, f950, f1014; +add.f32 f1301, f981, f1045; +sub.f32 f1303, f981, f1045; +add.f32 f2387, f982, f1046; +sub.f32 f1304, f982, f1046; +add.f32 f1305, f1297, f1301; +sub.f32 f1307, f1297, f1301; +add.f32 f2386, f2388, 
f2387; +sub.f32 f1308, f2388, f2387; +sub.f32 f1309, f1299, f1304; +add.f32 f1311, f1299, f1304; +add.f32 f2385, f1300, f1303; +sub.f32 f1312, f1300, f1303; +mul.f32 f1313, f1309, 0f3F3504F3; +mul.f32 f1314, f2385, 0f3F3504F3; +sub.f32 f1315, f1313, f1314; +add.f32 f1316, f1313, f1314; +mul.f32 f2383, f1311, 0fBF3504F3; +mul.f32 f2384, f1312, 0f3F3504F3; +sub.f32 f1319, f2383, f2384; +mul.f32 f1320, f1312, 0fBF3504F3; +fma.rn.f32 f1321, f1311, 0f3F3504F3, f1320; +add.f32 f1322, f1289, f1305; +sub.f32 f1324, f1289, f1305; +add.f32 f2382, f2390, f2386; +sub.f32 f1325, f2390, f2386; +add.f32 f1326, f1293, f1315; +sub.f32 f1328, f1293, f1315; +add.f32 f2381, f2389, f1316; +sub.f32 f1329, f2389, f1316; +sub.f32 f1330, f1291, f1308; +add.f32 f1332, f1291, f1308; +add.f32 f2380, f1292, f1307; +sub.f32 f1333, f1292, f1307; +add.f32 f1334, f1295, f1319; +sub.f32 f1336, f1295, f1319; +add.f32 f2379, f1296, f1321; +sub.f32 f1337, f1296, f1321; +mul.f32 f2377, f1326, 0f3F6C835E; +mul.f32 f2378, f2381, 0f3EC3EF15; +sub.f32 f1340, f2377, f2378; +mul.f32 f1341, f2381, 0f3F6C835E; +fma.rn.f32 f1342, f1326, 0f3EC3EF15, f1341; +mul.f32 f1343, f1330, 0f3F3504F3; +mul.f32 f1344, f2380, 0f3F3504F3; +sub.f32 f1345, f1343, f1344; +add.f32 f1346, f1343, f1344; +mul.f32 f1348, f2379, 0f3F6C835E; +mul.f32 f2376, f1334, 0f3EC3EF15; +sub.f32 f1349, f2376, f1348; +mul.f32 f1350, f2379, 0f3EC3EF15; +fma.rn.f32 f1351, f1334, 0f3F6C835E, f1350; +mul.f32 f1353, f1329, 0f3F6C835E; +mul.f32 f2375, f1328, 0fBEC3EF15; +sub.f32 f1354, f2375, f1353; +mul.f32 f1355, f1329, 0fBEC3EF15; +fma.rn.f32 f1356, f1328, 0f3F6C835E, f1355; +mul.f32 f2373, f1332, 0fBF3504F3; +mul.f32 f2374, f1333, 0f3F3504F3; +sub.f32 f1359, f2373, f2374; +mul.f32 f1360, f1333, 0fBF3504F3; +fma.rn.f32 f1361, f1332, 0f3F3504F3, f1360; +mul.f32 f2371, f1336, 0fBF6C835E; +mul.f32 f2372, f1337, 0f3EC3EF15; +sub.f32 f1364, f2371, f2372; +mul.f32 f1365, f1337, 0fBF6C835E; +fma.rn.f32 f1366, f1336, 0f3EC3EF15, f1365; +add.f32 f1367, f1265, 
f1322; +sub.f32 f1369, f1265, f1322; +add.f32 f2370, f2396, f2382; +sub.f32 f1370, f2396, f2382; +add.f32 f1371, f1269, f1340; +sub.f32 f1373, f1269, f1340; +add.f32 f2369, f2395, f1342; +sub.f32 f1374, f2395, f1342; +add.f32 f1375, f1273, f1345; +sub.f32 f1377, f1273, f1345; +add.f32 f2368, f2394, f1346; +sub.f32 f1378, f2394, f1346; +add.f32 f1379, f1277, f1349; +sub.f32 f1381, f1277, f1349; +add.f32 f2367, f2393, f1351; +sub.f32 f1382, f2393, f1351; +sub.f32 f1383, f1267, f1325; +add.f32 f1385, f1267, f1325; +add.f32 f2366, f1268, f1324; +sub.f32 f1386, f1268, f1324; +add.f32 f1387, f1271, f1354; +sub.f32 f1389, f1271, f1354; +add.f32 f2365, f1272, f1356; +sub.f32 f1390, f1272, f1356; +add.f32 f1391, f1275, f1359; +sub.f32 f1393, f1275, f1359; +add.f32 f2364, f1276, f1361; +sub.f32 f1394, f1276, f1361; +add.f32 f1395, f1279, f1364; +sub.f32 f1397, f1279, f1364; +add.f32 f2363, f1280, f1366; +sub.f32 f1398, f1280, f1366; +mul.f32 f1400, f2369, 0f3E47C5C2; +mul.f32 f2362, f1371, 0f3F7B14BE; +sub.f32 f1401, f2362, f1400; +mul.f32 f1402, f2369, 0f3F7B14BE; +fma.rn.f32 f1403, f1371, 0f3E47C5C2, f1402; +mul.f32 f1405, f2368, 0f3EC3EF15; +mul.f32 f2361, f1375, 0f3F6C835E; +sub.f32 f1406, f2361, f1405; +mul.f32 f1407, f2368, 0f3F6C835E; +fma.rn.f32 f1408, f1375, 0f3EC3EF15, f1407; +mul.f32 f2359, f1379, 0f3F54DB31; +mul.f32 f2360, f2367, 0f3F0E39DA; +sub.f32 f1411, f2359, f2360; +mul.f32 f1412, f2367, 0f3F54DB31; +fma.rn.f32 f1413, f1379, 0f3F0E39DA, f1412; +mul.f32 f1414, f1383, 0f3F3504F3; +mul.f32 f1415, f2366, 0f3F3504F3; +sub.f32 f1416, f1414, f1415; +add.f32 f1417, f1414, f1415; +mul.f32 f1419, f2365, 0f3F54DB31; +mul.f32 f2358, f1387, 0f3F0E39DA; +sub.f32 f1420, f2358, f1419; +mul.f32 f1421, f2365, 0f3F0E39DA; +fma.rn.f32 f1422, f1387, 0f3F54DB31, f1421; +mul.f32 f1424, f2364, 0f3F6C835E; +mul.f32 f2357, f1391, 0f3EC3EF15; +sub.f32 f1425, f2357, f1424; +mul.f32 f1426, f2364, 0f3EC3EF15; +fma.rn.f32 f1427, f1391, 0f3F6C835E, f1426; +mul.f32 f2355, f1395, 
0f3E47C5C2; +mul.f32 f2356, f2363, 0f3F7B14BE; +sub.f32 f1430, f2355, f2356; +mul.f32 f1431, f2363, 0f3E47C5C2; +fma.rn.f32 f1432, f1395, 0f3F7B14BE, f1431; +mul.f32 f2353, f1373, 0fBE47C5C2; +mul.f32 f2354, f1374, 0f3F7B14BE; +sub.f32 f1435, f2353, f2354; +mul.f32 f1436, f1374, 0fBE47C5C2; +fma.rn.f32 f1437, f1373, 0f3F7B14BE, f1436; +mul.f32 f2351, f1377, 0fBEC3EF15; +mul.f32 f2352, f1378, 0f3F6C835E; +sub.f32 f1440, f2351, f2352; +mul.f32 f1441, f1378, 0fBEC3EF15; +fma.rn.f32 f1442, f1377, 0f3F6C835E, f1441; +mul.f32 f2349, f1381, 0fBF0E39DA; +mul.f32 f2350, f1382, 0f3F54DB31; +sub.f32 f1445, f2349, f2350; +mul.f32 f1446, f1382, 0fBF0E39DA; +fma.rn.f32 f1447, f1381, 0f3F54DB31, f1446; +mul.f32 f1449, f1386, 0f3F3504F3; +mul.f32 f2348, f1385, 0fBF3504F3; +sub.f32 f1450, f2348, f1449; +mul.f32 f1451, f1386, 0fBF3504F3; +fma.rn.f32 f1452, f1385, 0f3F3504F3, f1451; +mul.f32 f1454, f1390, 0f3F0E39DA; +mul.f32 f2347, f1389, 0fBF54DB31; +sub.f32 f1455, f2347, f1454; +mul.f32 f1456, f1390, 0fBF54DB31; +fma.rn.f32 f1457, f1389, 0f3F0E39DA, f1456; +mul.f32 f1459, f1394, 0f3EC3EF15; +mul.f32 f2346, f1393, 0fBF6C835E; +sub.f32 f1460, f2346, f1459; +mul.f32 f1461, f1394, 0fBF6C835E; +fma.rn.f32 f1462, f1393, 0f3EC3EF15, f1461; +mul.f32 f1464, f1398, 0f3E47C5C2; +mul.f32 f2345, f1397, 0fBF7B14BE; +sub.f32 f1465, f2345, f1464; +mul.f32 f1466, f1398, 0fBF7B14BE; +fma.rn.f32 f1467, f1397, 0f3E47C5C2, f1466; +add.f32 f1470, f1196, f1401; +sub.f32 f1472, f1196, f1401; +add.f32 f2344, f2413, f1403; +sub.f32 f1473, f2413, f1403; +add.f32 f1474, f1200, f1406; +sub.f32 f1476, f1200, f1406; +add.f32 f2343, f2412, f1408; +sub.f32 f1477, f2412, f1408; +add.f32 f1478, f1204, f1411; +sub.f32 f1480, f1204, f1411; +add.f32 f2342, f2411, f1413; +sub.f32 f1481, f2411, f1413; +add.f32 f1482, f1208, f1416; +sub.f32 f1484, f1208, f1416; +add.f32 f2341, f2410, f1417; +sub.f32 f1485, f2410, f1417; +add.f32 f1486, f1212, f1420; +sub.f32 f1488, f1212, f1420; +add.f32 f2340, f2409, f1422; +sub.f32 
f1489, f2409, f1422; +add.f32 f1490, f1216, f1425; +sub.f32 f1492, f1216, f1425; +add.f32 f2339, f2408, f1427; +sub.f32 f1493, f2408, f1427; +add.f32 f1494, f1220, f1430; +sub.f32 f1496, f1220, f1430; +add.f32 f2338, f2407, f1432; +sub.f32 f1497, f2407, f1432; +sub.f32 f1498, f1194, f1370; +add.f32 f1500, f1194, f1370; +add.f32 f2337, f1195, f1369; +sub.f32 f1501, f1195, f1369; +add.f32 f1502, f1198, f1435; +sub.f32 f1504, f1198, f1435; +add.f32 f2336, f1199, f1437; +sub.f32 f1505, f1199, f1437; +add.f32 f1506, f1202, f1440; +sub.f32 f1508, f1202, f1440; +add.f32 f2335, f1203, f1442; +sub.f32 f1509, f1203, f1442; +add.f32 f1510, f1206, f1445; +sub.f32 f1512, f1206, f1445; +add.f32 f2334, f1207, f1447; +sub.f32 f1513, f1207, f1447; +add.f32 f1514, f1210, f1450; +sub.f32 f1516, f1210, f1450; +add.f32 f2333, f1211, f1452; +sub.f32 f1517, f1211, f1452; +add.f32 f1518, f1214, f1455; +sub.f32 f1520, f1214, f1455; +add.f32 f2332, f1215, f1457; +sub.f32 f1521, f1215, f1457; +add.f32 f1522, f1218, f1460; +sub.f32 f1524, f1218, f1460; +add.f32 f2331, f1219, f1462; +sub.f32 f1525, f1219, f1462; +add.f32 f1526, f1222, f1465; +sub.f32 f1528, f1222, f1465; +add.f32 f2330, f1223, f1467; +sub.f32 f1529, f1223, f1467; +and.b32 r14, r35, 224; +bfe.u32 r15, r35, 5, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1530, f1531}, [rd8]; +mul.f32 f1534, f2344, f1531; +mul.f32 f1536, f1530, f2344; +mul.f32 f2328, f1530, f1530; +mul.f32 f2329, f1531, f1531; +sub.f32 f1539, f2328, f2329; +mul.f32 f1540, f1531, f1530; +fma.rn.f32 f1541, f1531, f1530, f1540; +mul.f32 f1542, f2343, f1541; +mul.f32 f1544, f1539, f2343; +mul.f32 f1546, f1531, f1541; +mul.f32 f2327, f1530, f1539; +sub.f32 f1547, f2327, f1546; +mul.f32 f2326, f1474, f1541; +mul.f32 f1548, f1530, f1541; +fma.rn.f32 f1549, f1531, f1539, f1548; +mul.f32 f1550, f2342, f1549; +mul.f32 f1552, f1547, f2342; +mul.f32 f1554, f1531, f1549; +mul.f32 f2325, f1530, f1547; +sub.f32 f1555, f2325, 
f1554; +mul.f32 f2324, f1478, f1549; +mul.f32 f1556, f1530, f1549; +fma.rn.f32 f1557, f1531, f1547, f1556; +mul.f32 f1558, f2341, f1557; +mul.f32 f1560, f1555, f2341; +mul.f32 f2322, f1530, f1555; +mul.f32 f2323, f1531, f1557; +sub.f32 f1563, f2322, f2323; +mul.f32 f2321, f1482, f1557; +mul.f32 f1564, f1530, f1557; +fma.rn.f32 f1565, f1531, f1555, f1564; +mul.f32 f1566, f2340, f1565; +mul.f32 f1568, f1563, f2340; +mul.f32 f1570, f1531, f1565; +mul.f32 f2320, f1530, f1563; +sub.f32 f1571, f2320, f1570; +mul.f32 f2319, f1486, f1565; +mul.f32 f1572, f1530, f1565; +fma.rn.f32 f1573, f1531, f1563, f1572; +mul.f32 f1574, f2339, f1573; +mul.f32 f1576, f1571, f2339; +mul.f32 f1578, f1531, f1573; +mul.f32 f2318, f1530, f1571; +sub.f32 f1579, f2318, f1578; +mul.f32 f2317, f1490, f1573; +mul.f32 f1580, f1530, f1573; +fma.rn.f32 f1581, f1531, f1571, f1580; +mul.f32 f1582, f2338, f1581; +mul.f32 f1584, f1579, f2338; +mul.f32 f2315, f1530, f1579; +mul.f32 f2316, f1531, f1581; +sub.f32 f1587, f2315, f2316; +mul.f32 f2314, f1494, f1581; +mul.f32 f1588, f1530, f1581; +fma.rn.f32 f1589, f1531, f1579, f1588; +mul.f32 f1590, f2337, f1589; +mul.f32 f1592, f1587, f2337; +mul.f32 f1594, f1531, f1589; +mul.f32 f2313, f1530, f1587; +sub.f32 f1595, f2313, f1594; +mul.f32 f2312, f1498, f1589; +mul.f32 f1596, f1530, f1589; +fma.rn.f32 f1597, f1531, f1587, f1596; +mul.f32 f1598, f2336, f1597; +mul.f32 f1600, f1595, f2336; +mul.f32 f2310, f1530, f1595; +mul.f32 f2311, f1531, f1597; +sub.f32 f1603, f2310, f2311; +mul.f32 f2309, f1502, f1597; +mul.f32 f1604, f1530, f1597; +fma.rn.f32 f1605, f1531, f1595, f1604; +mul.f32 f1606, f2335, f1605; +mul.f32 f1608, f1603, f2335; +mul.f32 f1610, f1531, f1605; +mul.f32 f2308, f1530, f1603; +sub.f32 f1611, f2308, f1610; +mul.f32 f2307, f1506, f1605; +mul.f32 f1612, f1530, f1605; +fma.rn.f32 f1613, f1531, f1603, f1612; +mul.f32 f1614, f2334, f1613; +mul.f32 f1616, f1611, f2334; +mul.f32 f1618, f1531, f1613; +mul.f32 f2306, f1530, f1611; +sub.f32 f1619, f2306, 
f1618; +mul.f32 f2305, f1510, f1613; +mul.f32 f1620, f1530, f1613; +fma.rn.f32 f1621, f1531, f1611, f1620; +mul.f32 f1622, f2333, f1621; +mul.f32 f1624, f1619, f2333; +mul.f32 f2303, f1530, f1619; +mul.f32 f2304, f1531, f1621; +sub.f32 f1627, f2303, f2304; +mul.f32 f2302, f1514, f1621; +mul.f32 f1628, f1530, f1621; +fma.rn.f32 f1629, f1531, f1619, f1628; +mul.f32 f1630, f2332, f1629; +mul.f32 f1632, f1627, f2332; +mul.f32 f1634, f1531, f1629; +mul.f32 f2301, f1530, f1627; +sub.f32 f1635, f2301, f1634; +mul.f32 f2300, f1518, f1629; +mul.f32 f1636, f1530, f1629; +fma.rn.f32 f1637, f1531, f1627, f1636; +mul.f32 f1638, f2331, f1637; +mul.f32 f1640, f1635, f2331; +mul.f32 f1642, f1531, f1637; +mul.f32 f2299, f1530, f1635; +sub.f32 f1643, f2299, f1642; +mul.f32 f2298, f1522, f1637; +mul.f32 f1644, f1530, f1637; +fma.rn.f32 f1645, f1531, f1635, f1644; +mul.f32 f1646, f2330, f1645; +mul.f32 f1648, f1643, f2330; +mul.f32 f2296, f1530, f1643; +mul.f32 f2297, f1531, f1645; +sub.f32 f1651, f2296, f2297; +mul.f32 f2295, f1526, f1645; +mul.f32 f1652, f1530, f1645; +fma.rn.f32 f1653, f1531, f1643, f1652; +sub.f32 f2294, f2414, f2370; +mul.f32 f1654, f2294, f1653; +mul.f32 f1656, f1651, f2294; +mul.f32 f1658, f1531, f1653; +mul.f32 f2293, f1530, f1651; +sub.f32 f1659, f2293, f1658; +sub.f32 f2292, f1192, f1367; +mul.f32 f2291, f2292, f1653; +mul.f32 f1660, f1530, f1653; +fma.rn.f32 f1661, f1531, f1651, f1660; +mul.f32 f1662, f1473, f1661; +mul.f32 f1664, f1659, f1473; +mul.f32 f2289, f1530, f1659; +mul.f32 f2290, f1531, f1661; +sub.f32 f1667, f2289, f2290; +mul.f32 f2288, f1472, f1661; +mul.f32 f1668, f1530, f1661; +fma.rn.f32 f1669, f1531, f1659, f1668; +mul.f32 f1670, f1477, f1669; +mul.f32 f1672, f1667, f1477; +mul.f32 f1674, f1531, f1669; +mul.f32 f2287, f1530, f1667; +sub.f32 f1675, f2287, f1674; +mul.f32 f2286, f1476, f1669; +mul.f32 f1676, f1530, f1669; +fma.rn.f32 f1677, f1531, f1667, f1676; +mul.f32 f1678, f1481, f1677; +mul.f32 f1680, f1675, f1481; +mul.f32 f1682, f1531, 
f1677; +mul.f32 f2285, f1530, f1675; +sub.f32 f1683, f2285, f1682; +mul.f32 f2284, f1480, f1677; +mul.f32 f1684, f1530, f1677; +fma.rn.f32 f1685, f1531, f1675, f1684; +mul.f32 f1686, f1485, f1685; +mul.f32 f1688, f1683, f1485; +mul.f32 f2282, f1530, f1683; +mul.f32 f2283, f1531, f1685; +sub.f32 f1691, f2282, f2283; +mul.f32 f2281, f1484, f1685; +mul.f32 f1692, f1530, f1685; +fma.rn.f32 f1693, f1531, f1683, f1692; +mul.f32 f1694, f1489, f1693; +mul.f32 f1696, f1691, f1489; +mul.f32 f1698, f1531, f1693; +mul.f32 f2280, f1530, f1691; +sub.f32 f1699, f2280, f1698; +mul.f32 f2279, f1488, f1693; +mul.f32 f1700, f1530, f1693; +fma.rn.f32 f1701, f1531, f1691, f1700; +mul.f32 f1702, f1493, f1701; +mul.f32 f1704, f1699, f1493; +mul.f32 f1706, f1531, f1701; +mul.f32 f2278, f1530, f1699; +sub.f32 f1707, f2278, f1706; +mul.f32 f2277, f1492, f1701; +mul.f32 f1708, f1530, f1701; +fma.rn.f32 f1709, f1531, f1699, f1708; +mul.f32 f1710, f1497, f1709; +mul.f32 f1712, f1707, f1497; +mul.f32 f2275, f1530, f1707; +mul.f32 f2276, f1531, f1709; +sub.f32 f1715, f2275, f2276; +mul.f32 f2274, f1496, f1709; +mul.f32 f1716, f1530, f1709; +fma.rn.f32 f1717, f1531, f1707, f1716; +mul.f32 f1718, f1501, f1717; +mul.f32 f1720, f1715, f1501; +mul.f32 f1722, f1531, f1717; +mul.f32 f2273, f1530, f1715; +sub.f32 f1723, f2273, f1722; +mul.f32 f2272, f1500, f1717; +mul.f32 f1724, f1530, f1717; +fma.rn.f32 f1725, f1531, f1715, f1724; +mul.f32 f1726, f1505, f1725; +mul.f32 f1728, f1723, f1505; +mul.f32 f2270, f1530, f1723; +mul.f32 f2271, f1531, f1725; +sub.f32 f1731, f2270, f2271; +mul.f32 f2269, f1504, f1725; +mul.f32 f1732, f1530, f1725; +fma.rn.f32 f1733, f1531, f1723, f1732; +mul.f32 f1734, f1509, f1733; +mul.f32 f1736, f1731, f1509; +mul.f32 f1738, f1531, f1733; +mul.f32 f2268, f1530, f1731; +sub.f32 f1739, f2268, f1738; +mul.f32 f2267, f1508, f1733; +mul.f32 f1740, f1530, f1733; +fma.rn.f32 f1741, f1531, f1731, f1740; +mul.f32 f1742, f1513, f1741; +mul.f32 f1744, f1739, f1513; +mul.f32 f1746, f1531, 
f1741; +mul.f32 f2266, f1530, f1739; +sub.f32 f1747, f2266, f1746; +mul.f32 f2265, f1512, f1741; +mul.f32 f1748, f1530, f1741; +fma.rn.f32 f1749, f1531, f1739, f1748; +mul.f32 f1750, f1517, f1749; +mul.f32 f1752, f1747, f1517; +mul.f32 f2263, f1530, f1747; +mul.f32 f2264, f1531, f1749; +sub.f32 f1755, f2263, f2264; +mul.f32 f2262, f1516, f1749; +mul.f32 f1756, f1530, f1749; +fma.rn.f32 f1757, f1531, f1747, f1756; +mul.f32 f1758, f1521, f1757; +mul.f32 f1760, f1755, f1521; +mul.f32 f1762, f1531, f1757; +mul.f32 f2261, f1530, f1755; +sub.f32 f1763, f2261, f1762; +mul.f32 f2260, f1520, f1757; +mul.f32 f1764, f1530, f1757; +fma.rn.f32 f1765, f1531, f1755, f1764; +mul.f32 f1766, f1525, f1765; +mul.f32 f1768, f1763, f1525; +mul.f32 f1770, f1531, f1765; +mul.f32 f2259, f1530, f1763; +sub.f32 f1771, f2259, f1770; +mul.f32 f2258, f1524, f1765; +mul.f32 f1772, f1530, f1765; +mul.f32 f2257, f1470, f1531; +fma.rn.f32 f1773, f1531, f1763, f1772; +mul.f32 f1774, f1529, f1773; +mul.f32 f1775, f1528, f1773; +mul.f32 f1776, f1771, f1529; +and.b32 r16, r31, 248; +add.s32 r17, r9, r16; +mov.u32 r27, %tid.x; +shl.b32 r26, r27, 8; +barrier.sync 0; +and.b32 r18, r26, 57344; +add.s32 r19, r17, r18; +sub.f32 f2666, f2414, f2370; +mul.f32 f2665, f1651, f2666; +add.f32 f1777, f2414, f2370; +sub.f32 f2664, f1192, f1367; +add.f32 f1778, f1192, f1367; +st.shared.v2.f32 [r19], {f1778, f1777}; +mov.u32 r34, %tid.x; +and.b32 r33, r34, 224; +fma.rn.f32 f1779, f1530, f1470, f1534; +sub.f32 f1780, f1536, f2257; +st.shared.v2.f32 [r19+256], {f1779, f1780}; +fma.rn.f32 f1781, f1539, f1474, f1542; +sub.f32 f1782, f1544, f2326; +st.shared.v2.f32 [r19+512], {f1781, f1782}; +fma.rn.f32 f1783, f1547, f1478, f1550; +sub.f32 f1784, f1552, f2324; +st.shared.v2.f32 [r19+768], {f1783, f1784}; +fma.rn.f32 f1785, f1555, f1482, f1558; +sub.f32 f1786, f1560, f2321; +st.shared.v2.f32 [r19+1024], {f1785, f1786}; +fma.rn.f32 f1787, f1563, f1486, f1566; +sub.f32 f1788, f1568, f2319; +st.shared.v2.f32 [r19+1280], 
{f1787, f1788}; +sub.f32 f1789, f1576, f2317; +fma.rn.f32 f1790, f1571, f1490, f1574; +st.shared.v2.f32 [r19+1536], {f1790, f1789}; +fma.rn.f32 f1791, f1579, f1494, f1582; +sub.f32 f1792, f1584, f2314; +st.shared.v2.f32 [r19+1792], {f1791, f1792}; +fma.rn.f32 f1793, f1587, f1498, f1590; +sub.f32 f1794, f1592, f2312; +st.shared.v2.f32 [r19+2048], {f1793, f1794}; +fma.rn.f32 f1795, f1595, f1502, f1598; +sub.f32 f1796, f1600, f2309; +st.shared.v2.f32 [r19+2304], {f1795, f1796}; +fma.rn.f32 f1797, f1603, f1506, f1606; +sub.f32 f1798, f1608, f2307; +st.shared.v2.f32 [r19+2560], {f1797, f1798}; +fma.rn.f32 f1799, f1611, f1510, f1614; +sub.f32 f1800, f1616, f2305; +st.shared.v2.f32 [r19+2816], {f1799, f1800}; +fma.rn.f32 f1801, f1619, f1514, f1622; +sub.f32 f1802, f1624, f2302; +st.shared.v2.f32 [r19+3072], {f1801, f1802}; +fma.rn.f32 f1803, f1627, f1518, f1630; +sub.f32 f1804, f1632, f2300; +st.shared.v2.f32 [r19+3328], {f1803, f1804}; +fma.rn.f32 f1805, f1635, f1522, f1638; +sub.f32 f1806, f1640, f2298; +st.shared.v2.f32 [r19+3584], {f1805, f1806}; +fma.rn.f32 f1807, f1643, f1526, f1646; +sub.f32 f1808, f1648, f2295; +st.shared.v2.f32 [r19+3840], {f1807, f1808}; +fma.rn.f32 f1809, f1651, f2664, f1654; +sub.f32 f1810, f2665, f2291; +st.shared.v2.f32 [r19+4096], {f1809, f1810}; +fma.rn.f32 f1811, f1659, f1472, f1662; +sub.f32 f1812, f1664, f2288; +st.shared.v2.f32 [r19+4352], {f1811, f1812}; +fma.rn.f32 f1813, f1667, f1476, f1670; +sub.f32 f1814, f1672, f2286; +st.shared.v2.f32 [r19+4608], {f1813, f1814}; +fma.rn.f32 f1815, f1675, f1480, f1678; +sub.f32 f1816, f1680, f2284; +st.shared.v2.f32 [r19+4864], {f1815, f1816}; +fma.rn.f32 f1817, f1683, f1484, f1686; +sub.f32 f1818, f1688, f2281; +st.shared.v2.f32 [r19+5120], {f1817, f1818}; +fma.rn.f32 f1819, f1691, f1488, f1694; +sub.f32 f1820, f1696, f2279; +st.shared.v2.f32 [r19+5376], {f1819, f1820}; +fma.rn.f32 f1821, f1699, f1492, f1702; +sub.f32 f1822, f1704, f2277; +st.shared.v2.f32 [r19+5632], {f1821, f1822}; +fma.rn.f32 
f1823, f1707, f1496, f1710; +sub.f32 f1824, f1712, f2274; +st.shared.v2.f32 [r19+5888], {f1823, f1824}; +fma.rn.f32 f1825, f1715, f1500, f1718; +sub.f32 f1826, f1720, f2272; +st.shared.v2.f32 [r19+6144], {f1825, f1826}; +fma.rn.f32 f1827, f1723, f1504, f1726; +sub.f32 f1828, f1728, f2269; +st.shared.v2.f32 [r19+6400], {f1827, f1828}; +fma.rn.f32 f1829, f1731, f1508, f1734; +sub.f32 f1830, f1736, f2267; +st.shared.v2.f32 [r19+6656], {f1829, f1830}; +fma.rn.f32 f1831, f1739, f1512, f1742; +sub.f32 f1832, f1744, f2265; +st.shared.v2.f32 [r19+6912], {f1831, f1832}; +fma.rn.f32 f1833, f1747, f1516, f1750; +sub.f32 f1834, f1752, f2262; +st.shared.v2.f32 [r19+7168], {f1833, f1834}; +fma.rn.f32 f1835, f1755, f1520, f1758; +sub.f32 f1836, f1760, f2260; +st.shared.v2.f32 [r19+7424], {f1835, f1836}; +fma.rn.f32 f1837, f1763, f1524, f1766; +sub.f32 f1838, f1768, f2258; +st.shared.v2.f32 [r19+7680], {f1837, f1838}; +fma.rn.f32 f1839, f1771, f1528, f1774; +sub.f32 f1840, f1776, f1775; +st.shared.v2.f32 [r19+7936], {f1839, f1840}; +barrier.sync 0; +mad.lo.s32 r20, r33, -248, r19; +ld.shared.v2.f32 {f1841, f1842}, [r20]; +ld.shared.v2.f32 {f1845, f1846}, [r20+2048]; +ld.shared.v2.f32 {f1849, f1850}, [r20+4096]; +ld.shared.v2.f32 {f1853, f1854}, [r20+6144]; +ld.shared.v2.f32 {f1857, f1858}, [r20+8192]; +ld.shared.v2.f32 {f1861, f1862}, [r20+10240]; +ld.shared.v2.f32 {f1865, f1866}, [r20+12288]; +ld.shared.v2.f32 {f1869, f1870}, [r20+14336]; +ld.shared.v2.f32 {f1873, f1874}, [r20+16384]; +ld.shared.v2.f32 {f1877, f1878}, [r20+18432]; +ld.shared.v2.f32 {f1881, f1882}, [r20+20480]; +ld.shared.v2.f32 {f1885, f1886}, [r20+22528]; +ld.shared.v2.f32 {f1889, f1890}, [r20+24576]; +ld.shared.v2.f32 {f1893, f1894}, [r20+26624]; +ld.shared.v2.f32 {f1897, f1898}, [r20+28672]; +ld.shared.v2.f32 {f1901, f1902}, [r20+30720]; +ld.shared.v2.f32 {f1905, f1906}, [r20+32768]; +ld.shared.v2.f32 {f1909, f1910}, [r20+34816]; +ld.shared.v2.f32 {f1913, f1914}, [r20+36864]; +ld.shared.v2.f32 {f1917, f1918}, 
[r20+38912]; +ld.shared.v2.f32 {f1921, f1922}, [r20+40960]; +ld.shared.v2.f32 {f1925, f1926}, [r20+43008]; +ld.shared.v2.f32 {f1929, f1930}, [r20+45056]; +ld.shared.v2.f32 {f1933, f1934}, [r20+47104]; +ld.shared.v2.f32 {f1937, f1938}, [r20+49152]; +ld.shared.v2.f32 {f1941, f1942}, [r20+51200]; +ld.shared.v2.f32 {f1945, f1946}, [r20+53248]; +ld.shared.v2.f32 {f1949, f1950}, [r20+55296]; +ld.shared.v2.f32 {f1953, f1954}, [r20+57344]; +ld.shared.v2.f32 {f1957, f1958}, [r20+59392]; +ld.shared.v2.f32 {f1961, f1962}, [r20+61440]; +ld.shared.v2.f32 {f1965, f1966}, [r20+63488]; +add.f32 f1969, f1841, f1905; +sub.f32 f1971, f1841, f1905; +add.f32 f2256, f1842, f1906; +sub.f32 f1972, f1842, f1906; +add.f32 f1973, f1873, f1937; +sub.f32 f1975, f1873, f1937; +add.f32 f2255, f1874, f1938; +sub.f32 f1976, f1874, f1938; +add.f32 f1977, f1969, f1973; +sub.f32 f1979, f1969, f1973; +add.f32 f2254, f2256, f2255; +sub.f32 f1980, f2256, f2255; +sub.f32 f1981, f1971, f1976; +add.f32 f1983, f1971, f1976; +add.f32 f2253, f1972, f1975; +sub.f32 f1984, f1972, f1975; +add.f32 f1985, f1857, f1921; +sub.f32 f1987, f1857, f1921; +add.f32 f2252, f1858, f1922; +sub.f32 f1988, f1858, f1922; +add.f32 f1989, f1889, f1953; +sub.f32 f1991, f1889, f1953; +add.f32 f2251, f1890, f1954; +sub.f32 f1992, f1890, f1954; +add.f32 f1993, f1985, f1989; +sub.f32 f1995, f1985, f1989; +add.f32 f2250, f2252, f2251; +sub.f32 f1996, f2252, f2251; +sub.f32 f1997, f1987, f1992; +add.f32 f1999, f1987, f1992; +add.f32 f2249, f1988, f1991; +sub.f32 f2000, f1988, f1991; +mul.f32 f2001, f1997, 0f3F3504F3; +mul.f32 f2002, f2249, 0f3F3504F3; +sub.f32 f2003, f2001, f2002; +add.f32 f2004, f2001, f2002; +mul.f32 f2006, f2000, 0f3F3504F3; +mul.f32 f2248, f1999, 0fBF3504F3; +sub.f32 f2007, f2248, f2006; +mul.f32 f2008, f2000, 0fBF3504F3; +fma.rn.f32 f2009, f1999, 0f3F3504F3, f2008; +add.f32 f2010, f1845, f1909; +sub.f32 f2012, f1845, f1909; +add.f32 f2247, f1846, f1910; +sub.f32 f2013, f1846, f1910; +add.f32 f2014, f1877, f1941; 
+sub.f32 f2016, f1877, f1941; +add.f32 f2246, f1878, f1942; +sub.f32 f2017, f1878, f1942; +add.f32 f2018, f2010, f2014; +sub.f32 f2020, f2010, f2014; +add.f32 f2245, f2247, f2246; +sub.f32 f2021, f2247, f2246; +sub.f32 f2022, f2012, f2017; +add.f32 f2024, f2012, f2017; +add.f32 f2244, f2013, f2016; +sub.f32 f2025, f2013, f2016; +add.f32 f2026, f1861, f1925; +sub.f32 f2028, f1861, f1925; +add.f32 f2243, f1862, f1926; +sub.f32 f2029, f1862, f1926; +add.f32 f2030, f1893, f1957; +sub.f32 f2032, f1893, f1957; +add.f32 f2242, f1894, f1958; +sub.f32 f2033, f1894, f1958; +add.f32 f2034, f2026, f2030; +sub.f32 f2036, f2026, f2030; +add.f32 f2241, f2243, f2242; +sub.f32 f2037, f2243, f2242; +sub.f32 f2038, f2028, f2033; +add.f32 f2040, f2028, f2033; +add.f32 f2240, f2029, f2032; +sub.f32 f2041, f2029, f2032; +mul.f32 f2042, f2038, 0f3F3504F3; +mul.f32 f2043, f2240, 0f3F3504F3; +sub.f32 f2044, f2042, f2043; +add.f32 f2045, f2042, f2043; +mul.f32 f2047, f2041, 0f3F3504F3; +mul.f32 f2239, f2040, 0fBF3504F3; +sub.f32 f2048, f2239, f2047; +mul.f32 f2049, f2041, 0fBF3504F3; +fma.rn.f32 f2050, f2040, 0f3F3504F3, f2049; +add.f32 f2051, f1849, f1913; +sub.f32 f2053, f1849, f1913; +add.f32 f2238, f1850, f1914; +sub.f32 f2054, f1850, f1914; +add.f32 f2055, f1881, f1945; +sub.f32 f2057, f1881, f1945; +add.f32 f2237, f1882, f1946; +sub.f32 f2058, f1882, f1946; +add.f32 f2059, f2051, f2055; +sub.f32 f2061, f2051, f2055; +add.f32 f2236, f2238, f2237; +sub.f32 f2062, f2238, f2237; +sub.f32 f2063, f2053, f2058; +add.f32 f2065, f2053, f2058; +add.f32 f2235, f2054, f2057; +sub.f32 f2066, f2054, f2057; +add.f32 f2067, f1865, f1929; +sub.f32 f2069, f1865, f1929; +add.f32 f2234, f1866, f1930; +sub.f32 f2070, f1866, f1930; +add.f32 f2071, f1897, f1961; +sub.f32 f2073, f1897, f1961; +add.f32 f2233, f1898, f1962; +sub.f32 f2074, f1898, f1962; +add.f32 f2075, f2067, f2071; +sub.f32 f2077, f2067, f2071; +add.f32 f2232, f2234, f2233; +sub.f32 f2078, f2234, f2233; +sub.f32 f2079, f2069, f2074; +add.f32 
f2081, f2069, f2074; +add.f32 f2231, f2070, f2073; +sub.f32 f2082, f2070, f2073; +mul.f32 f2083, f2079, 0f3F3504F3; +mul.f32 f2084, f2231, 0f3F3504F3; +sub.f32 f2085, f2083, f2084; +add.f32 f2086, f2083, f2084; +mul.f32 f2229, f2081, 0fBF3504F3; +mul.f32 f2230, f2082, 0f3F3504F3; +sub.f32 f2089, f2229, f2230; +mul.f32 f2090, f2082, 0fBF3504F3; +fma.rn.f32 f2091, f2081, 0f3F3504F3, f2090; +add.f32 f2092, f1853, f1917; +sub.f32 f2094, f1853, f1917; +add.f32 f2228, f1854, f1918; +sub.f32 f2095, f1854, f1918; +add.f32 f2096, f1885, f1949; +sub.f32 f2098, f1885, f1949; +add.f32 f2227, f1886, f1950; +sub.f32 f2099, f1886, f1950; +add.f32 f2100, f2092, f2096; +sub.f32 f2102, f2092, f2096; +add.f32 f2226, f2228, f2227; +sub.f32 f2103, f2228, f2227; +sub.f32 f2104, f2094, f2099; +add.f32 f2106, f2094, f2099; +add.f32 f2225, f2095, f2098; +sub.f32 f2107, f2095, f2098; +add.f32 f2108, f1869, f1933; +sub.f32 f2110, f1869, f1933; +add.f32 f2224, f1870, f1934; +sub.f32 f2111, f1870, f1934; +add.f32 f2112, f1901, f1965; +sub.f32 f2114, f1901, f1965; +add.f32 f2223, f1902, f1966; +sub.f32 f2115, f1902, f1966; +add.f32 f2116, f2108, f2112; +sub.f32 f2118, f2108, f2112; +add.f32 f2222, f2224, f2223; +sub.f32 f2119, f2224, f2223; +sub.f32 f2120, f2110, f2115; +add.f32 f2122, f2110, f2115; +add.f32 f2221, f2111, f2114; +sub.f32 f2123, f2111, f2114; +mul.f32 f2124, f2120, 0f3F3504F3; +mul.f32 f2125, f2221, 0f3F3504F3; +sub.f32 f2126, f2124, f2125; +add.f32 f2127, f2124, f2125; +mul.f32 f2129, f2123, 0f3F3504F3; +mul.f32 f2220, f2122, 0fBF3504F3; +sub.f32 f2130, f2220, f2129; +mul.f32 f2131, f2123, 0fBF3504F3; +fma.rn.f32 f2132, f2122, 0f3F3504F3, f2131; +add.f32 %0, f1977, f1993; +add.f32 %1, f2254, f2250; +add.f32 %2, f2018, f2034; +add.f32 %3, f2245, f2241; +add.f32 %5, f2236, f2232; +add.f32 %4, f2059, f2075; +add.f32 %7, f2226, f2222; +add.f32 %6, f2100, f2116; +add.f32 %9, f2253, f2004; +add.f32 %8, f1981, f2003; +add.f32 %10, f2022, f2044; +add.f32 %11, f2244, f2045; +add.f32 
%12, f2063, f2085; +add.f32 %13, f2235, f2086; +add.f32 %14, f2104, f2126; +add.f32 %15, f2225, f2127; +add.f32 %17, f1980, f1995; +sub.f32 %16, f1979, f1996; +add.f32 %19, f2021, f2036; +sub.f32 %18, f2020, f2037; +add.f32 %21, f2062, f2077; +sub.f32 %20, f2061, f2078; +sub.f32 %22, f2102, f2119; +add.f32 %23, f2103, f2118; +add.f32 %24, f1983, f2007; +add.f32 %25, f1984, f2009; +add.f32 %26, f2024, f2048; +add.f32 %27, f2025, f2050; +add.f32 %28, f2065, f2089; +add.f32 %29, f2066, f2091; +add.f32 %31, f2107, f2132; +add.f32 %30, f2106, f2130; +sub.f32 %33, f2254, f2250; +sub.f32 %32, f1977, f1993; +sub.f32 %35, f2245, f2241; +sub.f32 %34, f2018, f2034; +sub.f32 %37, f2236, f2232; +sub.f32 %36, f2059, f2075; +sub.f32 %39, f2226, f2222; +sub.f32 %38, f2100, f2116; +sub.f32 %41, f2253, f2004; +sub.f32 %40, f1981, f2003; +sub.f32 %43, f2244, f2045; +sub.f32 %42, f2022, f2044; +sub.f32 %45, f2235, f2086; +sub.f32 %44, f2063, f2085; +sub.f32 %47, f2225, f2127; +sub.f32 %46, f2104, f2126; +sub.f32 %49, f1980, f1995; +add.f32 %48, f1979, f1996; +sub.f32 %51, f2021, f2036; +add.f32 %50, f2020, f2037; +sub.f32 %53, f2062, f2077; +add.f32 %52, f2061, f2078; +sub.f32 %55, f2103, f2118; +add.f32 %54, f2102, f2119; +sub.f32 %57, f1984, f2009; +sub.f32 %56, f1983, f2007; +sub.f32 %59, f2025, f2050; +sub.f32 %58, f2024, f2048; +sub.f32 %61, f2066, f2091; +sub.f32 %60, f2065, f2089; +sub.f32 %63, f2107, f2132; +sub.f32 %62, f2106, f2130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), 
"=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_8192), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), "f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), 
"f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<313, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<2476>; +.reg .b32 r<30>; +.reg .b64 rd<10>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %64; +add.s32 r4, r3, r2; +add.f32 f129, %67, %99; +sub.f32 f131, %67, %99; +add.f32 f2474, %68, %131; +sub.f32 f132, %68, %131; +add.f32 f133, %83, %115; +sub.f32 f135, %83, %115; +add.f32 f2472, %132, %116; +sub.f32 f136, %132, %116; +add.f32 f137, f129, f133; +sub.f32 f139, f129, f133; +add.f32 f2471, f2474, f2472; +sub.f32 f140, f2474, f2472; +sub.f32 f141, f131, f136; +add.f32 f143, f131, f136; +add.f32 f2470, f132, f135; +sub.f32 f144, f132, f135; +add.f32 f145, %75, %107; +sub.f32 f147, %75, %107; +add.f32 f2467, %133, %134; +sub.f32 f148, %133, %134; +add.f32 f149, %91, %123; +sub.f32 f151, %91, %123; +add.f32 f2465, %92, %135; +sub.f32 f152, %92, %135; +add.f32 f153, f145, f149; +sub.f32 f155, f145, f149; +add.f32 f2464, f2467, f2465; +sub.f32 f156, f2467, f2465; +sub.f32 f157, f147, f152; +add.f32 f159, f147, f152; +add.f32 f2463, f148, f151; +sub.f32 f160, f148, f151; +mul.f32 f161, f157, 0f3F3504F3; +mul.f32 f162, f2463, 0f3F3504F3; +sub.f32 f163, f161, f162; +add.f32 f164, f161, f162; +mul.f32 f2461, f159, 0fBF3504F3; +mul.f32 f2462, f160, 0f3F3504F3; +sub.f32 f167, f2461, f2462; +mul.f32 f168, f160, 0fBF3504F3; +fma.rn.f32 f169, f159, 0f3F3504F3, f168; +add.f32 f170, f137, f153; +sub.f32 f172, f137, f153; +add.f32 f2460, f2471, f2464; +sub.f32 f173, f2471, f2464; +add.f32 f174, f141, f163; +sub.f32 f176, f141, f163; +add.f32 f2459, f2470, f164; +sub.f32 f177, f2470, f164; +sub.f32 f178, f139, f156; +add.f32 f180, f139, f156; +add.f32 f2458, f140, f155; +sub.f32 f181, f140, f155; +add.f32 f182, f143, f167; +sub.f32 f184, f143, f167; +add.f32 f2457, f144, f169; +sub.f32 f185, f144, f169; +add.f32 f186, %71, %103; +sub.f32 f188, 
%71, %103; +add.f32 f2455, %136, %104; +sub.f32 f189, %136, %104; +add.f32 f190, %87, %119; +sub.f32 f192, %87, %119; +add.f32 f2452, %138, %137; +sub.f32 f193, %138, %137; +add.f32 f194, f186, f190; +sub.f32 f196, f186, f190; +add.f32 f2451, f2455, f2452; +sub.f32 f197, f2455, f2452; +sub.f32 f198, f188, f193; +add.f32 f200, f188, f193; +add.f32 f2450, f189, f192; +sub.f32 f201, f189, f192; +add.f32 f202, %79, %111; +sub.f32 f204, %79, %111; +add.f32 f2448, %80, %139; +sub.f32 f205, %80, %139; +add.f32 f206, %95, %127; +sub.f32 f208, %95, %127; +add.f32 f2446, %140, %128; +sub.f32 f209, %140, %128; +add.f32 f210, f202, f206; +sub.f32 f212, f202, f206; +add.f32 f2445, f2448, f2446; +sub.f32 f213, f2448, f2446; +sub.f32 f214, f204, f209; +add.f32 f216, f204, f209; +add.f32 f2444, f205, f208; +sub.f32 f217, f205, f208; +mul.f32 f218, f214, 0f3F3504F3; +mul.f32 f219, f2444, 0f3F3504F3; +sub.f32 f220, f218, f219; +add.f32 f221, f218, f219; +mul.f32 f2442, f216, 0fBF3504F3; +mul.f32 f2443, f217, 0f3F3504F3; +sub.f32 f224, f2442, f2443; +mul.f32 f225, f217, 0fBF3504F3; +fma.rn.f32 f226, f216, 0f3F3504F3, f225; +add.f32 f227, f194, f210; +sub.f32 f229, f194, f210; +add.f32 f2441, f2451, f2445; +sub.f32 f230, f2451, f2445; +add.f32 f231, f198, f220; +sub.f32 f233, f198, f220; +add.f32 f2440, f2450, f221; +sub.f32 f234, f2450, f221; +sub.f32 f235, f196, f213; +add.f32 f237, f196, f213; +add.f32 f2439, f197, f212; +sub.f32 f238, f197, f212; +add.f32 f239, f200, f224; +sub.f32 f241, f200, f224; +add.f32 f2438, f201, f226; +sub.f32 f242, f201, f226; +mul.f32 f2436, f231, 0f3F6C835E; +mul.f32 f2437, f2440, 0f3EC3EF15; +sub.f32 f245, f2436, f2437; +mul.f32 f246, f2440, 0f3F6C835E; +fma.rn.f32 f247, f231, 0f3EC3EF15, f246; +mul.f32 f248, f235, 0f3F3504F3; +mul.f32 f249, f2439, 0f3F3504F3; +sub.f32 f250, f248, f249; +add.f32 f251, f248, f249; +mul.f32 f253, f2438, 0f3F6C835E; +mul.f32 f2435, f239, 0f3EC3EF15; +sub.f32 f254, f2435, f253; +mul.f32 f255, f2438, 0f3EC3EF15; 
+fma.rn.f32 f256, f239, 0f3F6C835E, f255; +mul.f32 f258, f234, 0f3F6C835E; +mul.f32 f2434, f233, 0fBEC3EF15; +sub.f32 f259, f2434, f258; +mul.f32 f260, f234, 0fBEC3EF15; +fma.rn.f32 f261, f233, 0f3F6C835E, f260; +mul.f32 f2432, f237, 0fBF3504F3; +mul.f32 f2433, f238, 0f3F3504F3; +sub.f32 f264, f2432, f2433; +mul.f32 f265, f238, 0fBF3504F3; +fma.rn.f32 f266, f237, 0f3F3504F3, f265; +mul.f32 f2430, f241, 0fBF6C835E; +mul.f32 f2431, f242, 0f3EC3EF15; +sub.f32 f269, f2430, f2431; +mul.f32 f270, f242, 0fBF6C835E; +fma.rn.f32 f271, f241, 0f3EC3EF15, f270; +add.f32 f272, f170, f227; +sub.f32 f274, f170, f227; +add.f32 f2429, f2460, f2441; +sub.f32 f275, f2460, f2441; +add.f32 f276, f174, f245; +sub.f32 f278, f174, f245; +add.f32 f2428, f2459, f247; +sub.f32 f279, f2459, f247; +add.f32 f280, f178, f250; +sub.f32 f282, f178, f250; +add.f32 f2427, f2458, f251; +sub.f32 f283, f2458, f251; +add.f32 f284, f182, f254; +sub.f32 f286, f182, f254; +add.f32 f2426, f2457, f256; +sub.f32 f287, f2457, f256; +sub.f32 f288, f172, f230; +add.f32 f290, f172, f230; +add.f32 f2425, f173, f229; +sub.f32 f291, f173, f229; +add.f32 f292, f176, f259; +sub.f32 f294, f176, f259; +add.f32 f2424, f177, f261; +sub.f32 f295, f177, f261; +add.f32 f296, f180, f264; +sub.f32 f298, f180, f264; +add.f32 f2423, f181, f266; +sub.f32 f299, f181, f266; +add.f32 f300, f184, f269; +sub.f32 f302, f184, f269; +add.f32 f2422, f185, f271; +sub.f32 f303, f185, f271; +add.f32 f304, %69, %101; +sub.f32 f306, %69, %101; +add.f32 f2419, %142, %141; +sub.f32 f307, %142, %141; +add.f32 f308, %85, %117; +sub.f32 f310, %85, %117; +add.f32 f2417, %86, %143; +sub.f32 f311, %86, %143; +add.f32 f312, f304, f308; +sub.f32 f314, f304, f308; +add.f32 f2416, f2419, f2417; +sub.f32 f315, f2419, f2417; +sub.f32 f316, f306, f311; +add.f32 f318, f306, f311; +add.f32 f2415, f307, f310; +sub.f32 f319, f307, f310; +add.f32 f320, %77, %109; +sub.f32 f322, %77, %109; +add.f32 f2413, %144, %110; +sub.f32 f323, %144, %110; +add.f32 f324, %93, 
%125; +sub.f32 f326, %93, %125; +add.f32 f2410, %146, %145; +sub.f32 f327, %146, %145; +add.f32 f328, f320, f324; +sub.f32 f330, f320, f324; +add.f32 f2409, f2413, f2410; +sub.f32 f331, f2413, f2410; +sub.f32 f332, f322, f327; +add.f32 f334, f322, f327; +add.f32 f2408, f323, f326; +sub.f32 f335, f323, f326; +mul.f32 f336, f332, 0f3F3504F3; +mul.f32 f337, f2408, 0f3F3504F3; +sub.f32 f338, f336, f337; +add.f32 f339, f336, f337; +mul.f32 f341, f335, 0f3F3504F3; +mul.f32 f2407, f334, 0fBF3504F3; +sub.f32 f342, f2407, f341; +mul.f32 f343, f335, 0fBF3504F3; +fma.rn.f32 f344, f334, 0f3F3504F3, f343; +add.f32 f345, f312, f328; +sub.f32 f347, f312, f328; +add.f32 f2406, f2416, f2409; +sub.f32 f348, f2416, f2409; +add.f32 f349, f316, f338; +sub.f32 f351, f316, f338; +add.f32 f2405, f2415, f339; +sub.f32 f352, f2415, f339; +sub.f32 f353, f314, f331; +add.f32 f355, f314, f331; +add.f32 f2404, f315, f330; +sub.f32 f356, f315, f330; +add.f32 f357, f318, f342; +sub.f32 f359, f318, f342; +add.f32 f2403, f319, f344; +sub.f32 f360, f319, f344; +add.f32 f361, %73, %105; +sub.f32 f363, %73, %105; +add.f32 f2401, %74, %147; +sub.f32 f364, %74, %147; +add.f32 f365, %89, %121; +sub.f32 f367, %89, %121; +add.f32 f2399, %148, %122; +sub.f32 f368, %148, %122; +add.f32 f369, f361, f365; +sub.f32 f371, f361, f365; +add.f32 f2398, f2401, f2399; +sub.f32 f372, f2401, f2399; +sub.f32 f373, f363, f368; +add.f32 f375, f363, f368; +add.f32 f2397, f364, f367; +sub.f32 f376, f364, f367; +add.f32 f377, %81, %113; +sub.f32 f379, %81, %113; +add.f32 f2394, %149, %150; +sub.f32 f380, %149, %150; +add.f32 f381, %97, %129; +sub.f32 f383, %97, %129; +add.f32 f2393, %98, %130; +sub.f32 f384, %98, %130; +add.f32 f385, f377, f381; +sub.f32 f387, f377, f381; +add.f32 f2392, f2394, f2393; +sub.f32 f388, f2394, f2393; +sub.f32 f389, f379, f384; +add.f32 f391, f379, f384; +add.f32 f2391, f380, f383; +sub.f32 f392, f380, f383; +mul.f32 f393, f389, 0f3F3504F3; +mul.f32 f394, f2391, 0f3F3504F3; +sub.f32 f395, f393, 
f394; +add.f32 f396, f393, f394; +mul.f32 f2389, f391, 0fBF3504F3; +mul.f32 f2390, f392, 0f3F3504F3; +sub.f32 f399, f2389, f2390; +mul.f32 f400, f392, 0fBF3504F3; +fma.rn.f32 f401, f391, 0f3F3504F3, f400; +add.f32 f402, f369, f385; +sub.f32 f404, f369, f385; +add.f32 f2388, f2398, f2392; +sub.f32 f405, f2398, f2392; +add.f32 f406, f373, f395; +sub.f32 f408, f373, f395; +add.f32 f2387, f2397, f396; +sub.f32 f409, f2397, f396; +sub.f32 f410, f371, f388; +add.f32 f412, f371, f388; +add.f32 f2386, f372, f387; +sub.f32 f413, f372, f387; +add.f32 f414, f375, f399; +sub.f32 f416, f375, f399; +add.f32 f2385, f376, f401; +sub.f32 f417, f376, f401; +mul.f32 f419, f2387, 0f3EC3EF15; +mul.f32 f2384, f406, 0f3F6C835E; +sub.f32 f420, f2384, f419; +mul.f32 f421, f2387, 0f3F6C835E; +fma.rn.f32 f422, f406, 0f3EC3EF15, f421; +mul.f32 f423, f410, 0f3F3504F3; +mul.f32 f424, f2386, 0f3F3504F3; +sub.f32 f425, f423, f424; +add.f32 f426, f423, f424; +mul.f32 f428, f2385, 0f3F6C835E; +mul.f32 f2383, f414, 0f3EC3EF15; +sub.f32 f429, f2383, f428; +mul.f32 f430, f2385, 0f3EC3EF15; +fma.rn.f32 f431, f414, 0f3F6C835E, f430; +mul.f32 f433, f409, 0f3F6C835E; +mul.f32 f2382, f408, 0fBEC3EF15; +sub.f32 f434, f2382, f433; +mul.f32 f435, f409, 0fBEC3EF15; +fma.rn.f32 f436, f408, 0f3F6C835E, f435; +mul.f32 f438, f413, 0f3F3504F3; +mul.f32 f2381, f412, 0fBF3504F3; +sub.f32 f439, f2381, f438; +mul.f32 f440, f413, 0fBF3504F3; +fma.rn.f32 f441, f412, 0f3F3504F3, f440; +mul.f32 f443, f417, 0f3EC3EF15; +mul.f32 f2380, f416, 0fBF6C835E; +sub.f32 f444, f2380, f443; +mul.f32 f445, f417, 0fBF6C835E; +fma.rn.f32 f446, f416, 0f3EC3EF15, f445; +add.f32 f447, f345, f402; +sub.f32 f449, f345, f402; +add.f32 f2379, f2406, f2388; +sub.f32 f450, f2406, f2388; +add.f32 f451, f349, f420; +sub.f32 f453, f349, f420; +add.f32 f2378, f2405, f422; +sub.f32 f454, f2405, f422; +add.f32 f455, f353, f425; +sub.f32 f457, f353, f425; +add.f32 f2377, f2404, f426; +sub.f32 f458, f2404, f426; +add.f32 f459, f357, f429; +sub.f32 f461, 
f357, f429; +add.f32 f2376, f2403, f431; +sub.f32 f462, f2403, f431; +sub.f32 f463, f347, f405; +add.f32 f465, f347, f405; +add.f32 f2375, f348, f404; +sub.f32 f466, f348, f404; +add.f32 f467, f351, f434; +sub.f32 f469, f351, f434; +add.f32 f2374, f352, f436; +sub.f32 f470, f352, f436; +add.f32 f471, f355, f439; +sub.f32 f473, f355, f439; +add.f32 f2373, f356, f441; +sub.f32 f474, f356, f441; +add.f32 f475, f359, f444; +sub.f32 f477, f359, f444; +add.f32 f2372, f360, f446; +sub.f32 f478, f360, f446; +mul.f32 f480, f2378, 0f3E47C5C2; +mul.f32 f2371, f451, 0f3F7B14BE; +sub.f32 f481, f2371, f480; +mul.f32 f482, f2378, 0f3F7B14BE; +fma.rn.f32 f483, f451, 0f3E47C5C2, f482; +mul.f32 f485, f2377, 0f3EC3EF15; +mul.f32 f2370, f455, 0f3F6C835E; +sub.f32 f486, f2370, f485; +mul.f32 f487, f2377, 0f3F6C835E; +fma.rn.f32 f488, f455, 0f3EC3EF15, f487; +mul.f32 f490, f2376, 0f3F0E39DA; +mul.f32 f2369, f459, 0f3F54DB31; +sub.f32 f491, f2369, f490; +mul.f32 f492, f2376, 0f3F54DB31; +fma.rn.f32 f493, f459, 0f3F0E39DA, f492; +mul.f32 f494, f463, 0f3F3504F3; +mul.f32 f495, f2375, 0f3F3504F3; +sub.f32 f496, f494, f495; +add.f32 f497, f494, f495; +mul.f32 f499, f2374, 0f3F54DB31; +mul.f32 f2368, f467, 0f3F0E39DA; +sub.f32 f500, f2368, f499; +mul.f32 f501, f2374, 0f3F0E39DA; +fma.rn.f32 f502, f467, 0f3F54DB31, f501; +mul.f32 f504, f2373, 0f3F6C835E; +mul.f32 f2367, f471, 0f3EC3EF15; +sub.f32 f505, f2367, f504; +mul.f32 f506, f2373, 0f3EC3EF15; +fma.rn.f32 f507, f471, 0f3F6C835E, f506; +mul.f32 f509, f2372, 0f3F7B14BE; +mul.f32 f2366, f475, 0f3E47C5C2; +sub.f32 f510, f2366, f509; +mul.f32 f511, f2372, 0f3E47C5C2; +fma.rn.f32 f512, f475, 0f3F7B14BE, f511; +mul.f32 f514, f454, 0f3F7B14BE; +mul.f32 f2365, f453, 0fBE47C5C2; +sub.f32 f515, f2365, f514; +mul.f32 f516, f454, 0fBE47C5C2; +fma.rn.f32 f517, f453, 0f3F7B14BE, f516; +mul.f32 f2363, f457, 0fBEC3EF15; +mul.f32 f2364, f458, 0f3F6C835E; +sub.f32 f520, f2363, f2364; +mul.f32 f521, f458, 0fBEC3EF15; +fma.rn.f32 f522, f457, 0f3F6C835E, f521; 
+mul.f32 f2361, f461, 0fBF0E39DA; +mul.f32 f2362, f462, 0f3F54DB31; +sub.f32 f525, f2361, f2362; +mul.f32 f526, f462, 0fBF0E39DA; +fma.rn.f32 f527, f461, 0f3F54DB31, f526; +mul.f32 f2359, f465, 0fBF3504F3; +mul.f32 f2360, f466, 0f3F3504F3; +sub.f32 f530, f2359, f2360; +mul.f32 f531, f466, 0fBF3504F3; +fma.rn.f32 f532, f465, 0f3F3504F3, f531; +mul.f32 f2357, f469, 0fBF54DB31; +mul.f32 f2358, f470, 0f3F0E39DA; +sub.f32 f535, f2357, f2358; +mul.f32 f536, f470, 0fBF54DB31; +fma.rn.f32 f537, f469, 0f3F0E39DA, f536; +mul.f32 f539, f474, 0f3EC3EF15; +mul.f32 f2356, f473, 0fBF6C835E; +sub.f32 f540, f2356, f539; +mul.f32 f541, f474, 0fBF6C835E; +fma.rn.f32 f542, f473, 0f3EC3EF15, f541; +mul.f32 f544, f478, 0f3E47C5C2; +mul.f32 f2355, f477, 0fBF7B14BE; +sub.f32 f545, f2355, f544; +mul.f32 f546, f478, 0fBF7B14BE; +fma.rn.f32 f547, f477, 0f3E47C5C2, f546; +add.f32 f548, f272, f447; +sub.f32 f550, f272, f447; +add.f32 f2354, f2429, f2379; +sub.f32 f551, f2429, f2379; +add.f32 f552, f276, f481; +sub.f32 f554, f276, f481; +add.f32 f2353, f2428, f483; +sub.f32 f555, f2428, f483; +add.f32 f556, f280, f486; +sub.f32 f558, f280, f486; +add.f32 f2352, f2427, f488; +sub.f32 f559, f2427, f488; +add.f32 f560, f284, f491; +sub.f32 f562, f284, f491; +add.f32 f2351, f2426, f493; +sub.f32 f563, f2426, f493; +add.f32 f564, f288, f496; +sub.f32 f566, f288, f496; +add.f32 f2350, f2425, f497; +sub.f32 f567, f2425, f497; +add.f32 f568, f292, f500; +sub.f32 f570, f292, f500; +add.f32 f2349, f2424, f502; +sub.f32 f571, f2424, f502; +add.f32 f572, f296, f505; +sub.f32 f574, f296, f505; +add.f32 f2348, f2423, f507; +sub.f32 f575, f2423, f507; +add.f32 f576, f300, f510; +sub.f32 f578, f300, f510; +add.f32 f2347, f2422, f512; +sub.f32 f579, f2422, f512; +sub.f32 f580, f274, f450; +add.f32 f582, f274, f450; +add.f32 f2346, f275, f449; +sub.f32 f583, f275, f449; +add.f32 f584, f278, f515; +sub.f32 f586, f278, f515; +add.f32 f2345, f279, f517; +sub.f32 f587, f279, f517; +add.f32 f588, f282, f520; +sub.f32 
f590, f282, f520; +add.f32 f2344, f283, f522; +sub.f32 f591, f283, f522; +add.f32 f592, f286, f525; +sub.f32 f594, f286, f525; +add.f32 f2343, f287, f527; +sub.f32 f595, f287, f527; +add.f32 f596, f290, f530; +sub.f32 f598, f290, f530; +add.f32 f2342, f291, f532; +sub.f32 f599, f291, f532; +add.f32 f600, f294, f535; +sub.f32 f602, f294, f535; +add.f32 f2341, f295, f537; +sub.f32 f603, f295, f537; +add.f32 f604, f298, f540; +sub.f32 f606, f298, f540; +add.f32 f2340, f299, f542; +sub.f32 f607, f299, f542; +add.f32 f608, f302, f545; +sub.f32 f610, f302, f545; +add.f32 f2339, f303, f547; +sub.f32 f611, f303, f547; +mov.u32 r24, %tid.x; +shl.b32 r7, r24, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 2040; +mov.u64 rd4, %65; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f612, f613}, [rd5]; +mul.f32 f616, f2353, f613; +fma.rn.f32 f617, f612, f552, f616; +mul.f32 f618, f552, f613; +mul.f32 f619, f612, f2353; +sub.f32 f620, f619, f618; +mul.f32 f2337, f612, f612; +mul.f32 f2338, f613, f613; +sub.f32 f623, f2337, f2338; +mul.f32 f624, f613, f612; +fma.rn.f32 f625, f613, f612, f624; +mul.f32 f626, f2352, f625; +fma.rn.f32 f627, f623, f556, f626; +mul.f32 f628, f556, f625; +mul.f32 f629, f623, f2352; +sub.f32 f630, f629, f628; +mul.f32 f2335, f612, f623; +mul.f32 f2336, f613, f625; +sub.f32 f633, f2335, f2336; +mul.f32 f634, f612, f625; +fma.rn.f32 f635, f613, f623, f634; +mul.f32 f636, f2351, f635; +fma.rn.f32 f637, f633, f560, f636; +mul.f32 f638, f560, f635; +mul.f32 f639, f633, f2351; +sub.f32 f640, f639, f638; +mul.f32 f642, f613, f635; +mul.f32 f2334, f612, f633; +sub.f32 f643, f2334, f642; +mul.f32 f644, f612, f635; +fma.rn.f32 f645, f613, f633, f644; +mul.f32 f646, f2350, f645; +fma.rn.f32 f647, f643, f564, f646; +mul.f32 f648, f564, f645; +mul.f32 f649, f643, f2350; +sub.f32 f650, f649, f648; +mul.f32 f652, f613, f645; +mul.f32 f2333, f612, f643; +sub.f32 f653, f2333, f652; +mul.f32 f654, f612, f645; +fma.rn.f32 f655, f613, f643, f654; +mul.f32 f656, f2349, f655; 
+fma.rn.f32 f657, f653, f568, f656; +mul.f32 f658, f568, f655; +mul.f32 f659, f653, f2349; +sub.f32 f660, f659, f658; +mul.f32 f662, f613, f655; +mul.f32 f2332, f612, f653; +sub.f32 f663, f2332, f662; +mul.f32 f664, f612, f655; +fma.rn.f32 f665, f613, f653, f664; +mul.f32 f666, f2348, f665; +fma.rn.f32 f667, f663, f572, f666; +mul.f32 f668, f572, f665; +mul.f32 f669, f663, f2348; +sub.f32 f670, f669, f668; +mul.f32 f2330, f612, f663; +mul.f32 f2331, f613, f665; +sub.f32 f673, f2330, f2331; +mul.f32 f674, f612, f665; +fma.rn.f32 f675, f613, f663, f674; +mul.f32 f676, f2347, f675; +fma.rn.f32 f677, f673, f576, f676; +mul.f32 f678, f576, f675; +mul.f32 f679, f673, f2347; +sub.f32 f680, f679, f678; +mul.f32 f2328, f612, f673; +mul.f32 f2329, f613, f675; +sub.f32 f683, f2328, f2329; +mul.f32 f684, f612, f675; +fma.rn.f32 f685, f613, f673, f684; +mul.f32 f686, f2346, f685; +fma.rn.f32 f687, f683, f580, f686; +mul.f32 f688, f580, f685; +mul.f32 f689, f683, f2346; +sub.f32 f690, f689, f688; +mul.f32 f692, f613, f685; +mul.f32 f2327, f612, f683; +sub.f32 f693, f2327, f692; +mul.f32 f694, f612, f685; +fma.rn.f32 f695, f613, f683, f694; +mul.f32 f696, f2345, f695; +fma.rn.f32 f697, f693, f584, f696; +mul.f32 f698, f584, f695; +mul.f32 f699, f693, f2345; +sub.f32 f700, f699, f698; +mul.f32 f702, f613, f695; +mul.f32 f2326, f612, f693; +sub.f32 f703, f2326, f702; +mul.f32 f704, f612, f695; +fma.rn.f32 f705, f613, f693, f704; +mul.f32 f706, f2344, f705; +fma.rn.f32 f707, f703, f588, f706; +mul.f32 f708, f588, f705; +mul.f32 f709, f703, f2344; +sub.f32 f710, f709, f708; +mul.f32 f2324, f612, f703; +mul.f32 f2325, f613, f705; +sub.f32 f713, f2324, f2325; +mul.f32 f714, f612, f705; +fma.rn.f32 f715, f613, f703, f714; +mul.f32 f716, f2343, f715; +fma.rn.f32 f717, f713, f592, f716; +mul.f32 f718, f592, f715; +mul.f32 f719, f713, f2343; +sub.f32 f720, f719, f718; +mul.f32 f2322, f612, f713; +mul.f32 f2323, f613, f715; +sub.f32 f723, f2322, f2323; +mul.f32 f724, f612, f715; +fma.rn.f32 
f725, f613, f713, f724; +mul.f32 f726, f2342, f725; +fma.rn.f32 f727, f723, f596, f726; +mul.f32 f728, f596, f725; +mul.f32 f729, f723, f2342; +sub.f32 f730, f729, f728; +mul.f32 f732, f613, f725; +mul.f32 f2321, f612, f723; +sub.f32 f733, f2321, f732; +mul.f32 f734, f612, f725; +fma.rn.f32 f735, f613, f723, f734; +mul.f32 f736, f2341, f735; +fma.rn.f32 f737, f733, f600, f736; +mul.f32 f738, f600, f735; +mul.f32 f739, f733, f2341; +sub.f32 f740, f739, f738; +mul.f32 f742, f613, f735; +mul.f32 f2320, f612, f733; +sub.f32 f743, f2320, f742; +mul.f32 f744, f612, f735; +fma.rn.f32 f745, f613, f733, f744; +mul.f32 f746, f2340, f745; +fma.rn.f32 f747, f743, f604, f746; +mul.f32 f748, f604, f745; +mul.f32 f749, f743, f2340; +sub.f32 f750, f749, f748; +mul.f32 f752, f613, f745; +mul.f32 f2319, f612, f743; +sub.f32 f753, f2319, f752; +mul.f32 f754, f612, f745; +fma.rn.f32 f755, f613, f743, f754; +mul.f32 f756, f2339, f755; +fma.rn.f32 f757, f753, f608, f756; +mul.f32 f758, f608, f755; +mul.f32 f759, f753, f2339; +sub.f32 f760, f759, f758; +mul.f32 f2317, f612, f753; +mul.f32 f2318, f613, f755; +sub.f32 f763, f2317, f2318; +mul.f32 f764, f612, f755; +fma.rn.f32 f765, f613, f753, f764; +mul.f32 f766, f551, f765; +fma.rn.f32 f767, f763, f550, f766; +mul.f32 f768, f550, f765; +mul.f32 f769, f763, f551; +sub.f32 f770, f769, f768; +mul.f32 f2315, f612, f763; +mul.f32 f2316, f613, f765; +sub.f32 f773, f2315, f2316; +mul.f32 f774, f612, f765; +fma.rn.f32 f775, f613, f763, f774; +mul.f32 f776, f555, f775; +fma.rn.f32 f777, f773, f554, f776; +mul.f32 f778, f554, f775; +mul.f32 f779, f773, f555; +sub.f32 f780, f779, f778; +mul.f32 f782, f613, f775; +mul.f32 f2314, f612, f773; +sub.f32 f783, f2314, f782; +mul.f32 f784, f612, f775; +fma.rn.f32 f785, f613, f773, f784; +mul.f32 f786, f559, f785; +fma.rn.f32 f787, f783, f558, f786; +mul.f32 f788, f558, f785; +mul.f32 f789, f783, f559; +sub.f32 f790, f789, f788; +mul.f32 f792, f613, f785; +mul.f32 f2313, f612, f783; +sub.f32 f793, f2313, 
f792; +mul.f32 f794, f612, f785; +fma.rn.f32 f795, f613, f783, f794; +mul.f32 f796, f563, f795; +fma.rn.f32 f797, f793, f562, f796; +mul.f32 f798, f562, f795; +mul.f32 f799, f793, f563; +sub.f32 f800, f799, f798; +mul.f32 f802, f613, f795; +mul.f32 f2312, f612, f793; +sub.f32 f803, f2312, f802; +mul.f32 f804, f612, f795; +fma.rn.f32 f805, f613, f793, f804; +mul.f32 f806, f567, f805; +fma.rn.f32 f807, f803, f566, f806; +mul.f32 f808, f566, f805; +mul.f32 f809, f803, f567; +sub.f32 f810, f809, f808; +mul.f32 f2310, f612, f803; +mul.f32 f2311, f613, f805; +sub.f32 f813, f2310, f2311; +mul.f32 f814, f612, f805; +fma.rn.f32 f815, f613, f803, f814; +mul.f32 f816, f571, f815; +fma.rn.f32 f817, f813, f570, f816; +mul.f32 f818, f570, f815; +mul.f32 f819, f813, f571; +sub.f32 f820, f819, f818; +mul.f32 f2308, f612, f813; +mul.f32 f2309, f613, f815; +sub.f32 f823, f2308, f2309; +mul.f32 f824, f612, f815; +fma.rn.f32 f825, f613, f813, f824; +mul.f32 f826, f575, f825; +fma.rn.f32 f827, f823, f574, f826; +mul.f32 f828, f574, f825; +mul.f32 f829, f823, f575; +sub.f32 f830, f829, f828; +mul.f32 f832, f613, f825; +mul.f32 f2307, f612, f823; +sub.f32 f833, f2307, f832; +mul.f32 f834, f612, f825; +fma.rn.f32 f835, f613, f823, f834; +mul.f32 f836, f579, f835; +fma.rn.f32 f837, f833, f578, f836; +mul.f32 f838, f578, f835; +mul.f32 f839, f833, f579; +sub.f32 f840, f839, f838; +mul.f32 f842, f613, f835; +mul.f32 f2306, f612, f833; +sub.f32 f843, f2306, f842; +mul.f32 f844, f612, f835; +fma.rn.f32 f845, f613, f833, f844; +mul.f32 f846, f583, f845; +fma.rn.f32 f847, f843, f582, f846; +mul.f32 f848, f582, f845; +mul.f32 f849, f843, f583; +sub.f32 f850, f849, f848; +mul.f32 f2304, f612, f843; +mul.f32 f2305, f613, f845; +sub.f32 f853, f2304, f2305; +mul.f32 f854, f612, f845; +fma.rn.f32 f855, f613, f843, f854; +mul.f32 f856, f587, f855; +fma.rn.f32 f857, f853, f586, f856; +mul.f32 f858, f586, f855; +mul.f32 f859, f853, f587; +sub.f32 f860, f859, f858; +mul.f32 f2302, f612, f853; +mul.f32 
f2303, f613, f855; +sub.f32 f863, f2302, f2303; +mul.f32 f864, f612, f855; +fma.rn.f32 f865, f613, f853, f864; +mul.f32 f866, f591, f865; +fma.rn.f32 f867, f863, f590, f866; +mul.f32 f868, f590, f865; +mul.f32 f869, f863, f591; +sub.f32 f870, f869, f868; +mul.f32 f872, f613, f865; +mul.f32 f2301, f612, f863; +sub.f32 f873, f2301, f872; +mul.f32 f874, f612, f865; +fma.rn.f32 f875, f613, f863, f874; +mul.f32 f876, f595, f875; +fma.rn.f32 f877, f873, f594, f876; +mul.f32 f878, f594, f875; +mul.f32 f879, f873, f595; +sub.f32 f880, f879, f878; +mul.f32 f882, f613, f875; +mul.f32 f2300, f612, f873; +sub.f32 f883, f2300, f882; +mul.f32 f884, f612, f875; +fma.rn.f32 f885, f613, f873, f884; +mul.f32 f886, f599, f885; +fma.rn.f32 f887, f883, f598, f886; +mul.f32 f888, f598, f885; +mul.f32 f889, f883, f599; +sub.f32 f890, f889, f888; +mul.f32 f892, f613, f885; +mul.f32 f2299, f612, f883; +sub.f32 f893, f2299, f892; +mul.f32 f894, f612, f885; +fma.rn.f32 f895, f613, f883, f894; +mul.f32 f896, f603, f895; +fma.rn.f32 f897, f893, f602, f896; +mul.f32 f898, f602, f895; +mul.f32 f899, f893, f603; +sub.f32 f900, f899, f898; +mul.f32 f2297, f612, f893; +mul.f32 f2298, f613, f895; +sub.f32 f903, f2297, f2298; +mul.f32 f904, f612, f895; +fma.rn.f32 f905, f613, f893, f904; +mul.f32 f906, f607, f905; +fma.rn.f32 f907, f903, f606, f906; +mul.f32 f908, f606, f905; +mul.f32 f909, f903, f607; +sub.f32 f910, f909, f908; +mul.f32 f2295, f612, f903; +mul.f32 f2296, f613, f905; +sub.f32 f913, f2295, f2296; +mul.f32 f914, f612, f905; +fma.rn.f32 f915, f613, f903, f914; +mul.f32 f916, f611, f915; +fma.rn.f32 f917, f913, f610, f916; +mul.f32 f918, f610, f915; +mul.f32 f919, f913, f611; +sub.f32 f920, f919, f918; +shl.b32 r8, r24, 7; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32640; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f548, f617, f627, f637}; +st.shared.v4.f32 [r12+16], {f647, f657, f667, f677}; +st.shared.v4.f32 [r12+32], {f687, f697, f707, 
f717}; +st.shared.v4.f32 [r12+48], {f727, f737, f747, f757}; +st.shared.v4.f32 [r12+64], {f767, f777, f787, f797}; +st.shared.v4.f32 [r12+80], {f807, f817, f827, f837}; +st.shared.v4.f32 [r12+96], {f847, f857, f867, f877}; +st.shared.v4.f32 [r12+112], {f887, f897, f907, f917}; +barrier.sync 0; +and.b32 r23, r24, 255; +mad.lo.s32 r13, r23, -124, r12; +ld.shared.f32 f921, [r13]; +ld.shared.f32 f922, [r13+1024]; +ld.shared.f32 f923, [r13+2048]; +ld.shared.f32 f924, [r13+3072]; +ld.shared.f32 f925, [r13+4096]; +ld.shared.f32 f926, [r13+5120]; +ld.shared.f32 f927, [r13+6144]; +ld.shared.f32 f928, [r13+7168]; +ld.shared.f32 f929, [r13+8192]; +ld.shared.f32 f930, [r13+9216]; +ld.shared.f32 f931, [r13+10240]; +ld.shared.f32 f932, [r13+11264]; +ld.shared.f32 f933, [r13+12288]; +ld.shared.f32 f934, [r13+13312]; +ld.shared.f32 f935, [r13+14336]; +ld.shared.f32 f936, [r13+15360]; +ld.shared.f32 f937, [r13+16384]; +ld.shared.f32 f938, [r13+17408]; +ld.shared.f32 f939, [r13+18432]; +ld.shared.f32 f940, [r13+19456]; +ld.shared.f32 f941, [r13+20480]; +ld.shared.f32 f942, [r13+21504]; +ld.shared.f32 f943, [r13+22528]; +ld.shared.f32 f944, [r13+23552]; +ld.shared.f32 f945, [r13+24576]; +ld.shared.f32 f946, [r13+25600]; +ld.shared.f32 f947, [r13+26624]; +ld.shared.f32 f948, [r13+27648]; +ld.shared.f32 f949, [r13+28672]; +ld.shared.f32 f950, [r13+29696]; +ld.shared.f32 f951, [r13+30720]; +ld.shared.f32 f952, [r13+31744]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f2354, f620, f630, f640}; +st.shared.v4.f32 [r12+16], {f650, f660, f670, f680}; +st.shared.v4.f32 [r12+32], {f690, f700, f710, f720}; +st.shared.v4.f32 [r12+48], {f730, f740, f750, f760}; +st.shared.v4.f32 [r12+64], {f770, f780, f790, f800}; +st.shared.v4.f32 [r12+80], {f810, f820, f830, f840}; +st.shared.v4.f32 [r12+96], {f850, f860, f870, f880}; +st.shared.v4.f32 [r12+112], {f890, f900, f910, f920}; +barrier.sync 0; +ld.shared.f32 f953, [r13]; +ld.shared.f32 f954, [r13+1024]; +ld.shared.f32 f955, [r13+2048]; +ld.shared.f32 
f956, [r13+3072]; +ld.shared.f32 f957, [r13+4096]; +ld.shared.f32 f958, [r13+5120]; +ld.shared.f32 f959, [r13+6144]; +ld.shared.f32 f960, [r13+7168]; +ld.shared.f32 f961, [r13+8192]; +ld.shared.f32 f962, [r13+9216]; +ld.shared.f32 f963, [r13+10240]; +ld.shared.f32 f964, [r13+11264]; +ld.shared.f32 f965, [r13+12288]; +ld.shared.f32 f966, [r13+13312]; +ld.shared.f32 f967, [r13+14336]; +ld.shared.f32 f968, [r13+15360]; +ld.shared.f32 f969, [r13+16384]; +ld.shared.f32 f970, [r13+17408]; +ld.shared.f32 f971, [r13+18432]; +ld.shared.f32 f972, [r13+19456]; +ld.shared.f32 f973, [r13+20480]; +ld.shared.f32 f974, [r13+21504]; +ld.shared.f32 f975, [r13+22528]; +ld.shared.f32 f976, [r13+23552]; +ld.shared.f32 f977, [r13+24576]; +ld.shared.f32 f978, [r13+25600]; +ld.shared.f32 f979, [r13+26624]; +ld.shared.f32 f980, [r13+27648]; +ld.shared.f32 f981, [r13+28672]; +ld.shared.f32 f982, [r13+29696]; +ld.shared.f32 f983, [r13+30720]; +ld.shared.f32 f984, [r13+31744]; +add.f32 f985, f921, f937; +sub.f32 f987, f921, f937; +add.f32 f2294, f953, f969; +sub.f32 f988, f953, f969; +add.f32 f989, f929, f945; +sub.f32 f991, f929, f945; +add.f32 f2293, f961, f977; +sub.f32 f992, f961, f977; +add.f32 f993, f985, f989; +sub.f32 f995, f985, f989; +add.f32 f2292, f2294, f2293; +sub.f32 f996, f2294, f2293; +sub.f32 f997, f987, f992; +add.f32 f999, f987, f992; +add.f32 f2291, f988, f991; +sub.f32 f1000, f988, f991; +add.f32 f1001, f925, f941; +sub.f32 f1003, f925, f941; +add.f32 f2290, f957, f973; +sub.f32 f1004, f957, f973; +add.f32 f1005, f933, f949; +sub.f32 f1007, f933, f949; +add.f32 f2289, f965, f981; +sub.f32 f1008, f965, f981; +add.f32 f1009, f1001, f1005; +sub.f32 f1011, f1001, f1005; +add.f32 f2288, f2290, f2289; +sub.f32 f1012, f2290, f2289; +sub.f32 f1013, f1003, f1008; +add.f32 f1015, f1003, f1008; +add.f32 f2287, f1004, f1007; +sub.f32 f1016, f1004, f1007; +mul.f32 f1017, f1013, 0f3F3504F3; +mul.f32 f1018, f2287, 0f3F3504F3; +sub.f32 f1019, f1017, f1018; +add.f32 f1020, f1017, f1018; 
+mul.f32 f2285, f1015, 0fBF3504F3; +mul.f32 f2286, f1016, 0f3F3504F3; +sub.f32 f1023, f2285, f2286; +mul.f32 f1024, f1016, 0fBF3504F3; +fma.rn.f32 f1025, f1015, 0f3F3504F3, f1024; +add.f32 f1026, f993, f1009; +sub.f32 f1028, f993, f1009; +add.f32 f2284, f2292, f2288; +sub.f32 f1029, f2292, f2288; +add.f32 f1030, f997, f1019; +sub.f32 f1032, f997, f1019; +add.f32 f2283, f2291, f1020; +sub.f32 f1033, f2291, f1020; +sub.f32 f1034, f995, f1012; +add.f32 f1036, f995, f1012; +add.f32 f2282, f996, f1011; +sub.f32 f1037, f996, f1011; +add.f32 f1038, f999, f1023; +sub.f32 f1040, f999, f1023; +add.f32 f2281, f1000, f1025; +sub.f32 f1041, f1000, f1025; +add.f32 f1042, f923, f939; +sub.f32 f1044, f923, f939; +add.f32 f2280, f955, f971; +sub.f32 f1045, f955, f971; +add.f32 f1046, f931, f947; +sub.f32 f1048, f931, f947; +add.f32 f2279, f963, f979; +sub.f32 f1049, f963, f979; +add.f32 f1050, f1042, f1046; +sub.f32 f1052, f1042, f1046; +add.f32 f2278, f2280, f2279; +sub.f32 f1053, f2280, f2279; +sub.f32 f1054, f1044, f1049; +add.f32 f1056, f1044, f1049; +add.f32 f2277, f1045, f1048; +sub.f32 f1057, f1045, f1048; +add.f32 f1058, f927, f943; +sub.f32 f1060, f927, f943; +add.f32 f2276, f959, f975; +sub.f32 f1061, f959, f975; +add.f32 f1062, f935, f951; +sub.f32 f1064, f935, f951; +add.f32 f2275, f967, f983; +sub.f32 f1065, f967, f983; +add.f32 f1066, f1058, f1062; +sub.f32 f1068, f1058, f1062; +add.f32 f2274, f2276, f2275; +sub.f32 f1069, f2276, f2275; +sub.f32 f1070, f1060, f1065; +add.f32 f1072, f1060, f1065; +add.f32 f2273, f1061, f1064; +sub.f32 f1073, f1061, f1064; +mul.f32 f1074, f1070, 0f3F3504F3; +mul.f32 f1075, f2273, 0f3F3504F3; +sub.f32 f1076, f1074, f1075; +add.f32 f1077, f1074, f1075; +mul.f32 f2271, f1072, 0fBF3504F3; +mul.f32 f2272, f1073, 0f3F3504F3; +sub.f32 f1080, f2271, f2272; +mul.f32 f1081, f1073, 0fBF3504F3; +fma.rn.f32 f1082, f1072, 0f3F3504F3, f1081; +add.f32 f1083, f1050, f1066; +sub.f32 f1085, f1050, f1066; +add.f32 f2270, f2278, f2274; +sub.f32 f1086, 
f2278, f2274; +add.f32 f1087, f1054, f1076; +sub.f32 f1089, f1054, f1076; +add.f32 f2269, f2277, f1077; +sub.f32 f1090, f2277, f1077; +sub.f32 f1091, f1052, f1069; +add.f32 f1093, f1052, f1069; +add.f32 f2268, f1053, f1068; +sub.f32 f1094, f1053, f1068; +add.f32 f1095, f1056, f1080; +sub.f32 f1097, f1056, f1080; +add.f32 f2267, f1057, f1082; +sub.f32 f1098, f1057, f1082; +mul.f32 f2265, f1087, 0f3F6C835E; +mul.f32 f2266, f2269, 0f3EC3EF15; +sub.f32 f1101, f2265, f2266; +mul.f32 f1102, f2269, 0f3F6C835E; +fma.rn.f32 f1103, f1087, 0f3EC3EF15, f1102; +mul.f32 f1104, f1091, 0f3F3504F3; +mul.f32 f1105, f2268, 0f3F3504F3; +sub.f32 f1106, f1104, f1105; +add.f32 f1107, f1104, f1105; +mul.f32 f2263, f1095, 0f3EC3EF15; +mul.f32 f2264, f2267, 0f3F6C835E; +sub.f32 f1110, f2263, f2264; +mul.f32 f1111, f2267, 0f3EC3EF15; +fma.rn.f32 f1112, f1095, 0f3F6C835E, f1111; +mul.f32 f2261, f1089, 0fBEC3EF15; +mul.f32 f2262, f1090, 0f3F6C835E; +sub.f32 f1115, f2261, f2262; +mul.f32 f1116, f1090, 0fBEC3EF15; +fma.rn.f32 f1117, f1089, 0f3F6C835E, f1116; +mul.f32 f2259, f1093, 0fBF3504F3; +mul.f32 f2260, f1094, 0f3F3504F3; +sub.f32 f1120, f2259, f2260; +mul.f32 f1121, f1094, 0fBF3504F3; +fma.rn.f32 f1122, f1093, 0f3F3504F3, f1121; +mul.f32 f2257, f1097, 0fBF6C835E; +mul.f32 f2258, f1098, 0f3EC3EF15; +sub.f32 f1125, f2257, f2258; +mul.f32 f1126, f1098, 0fBF6C835E; +fma.rn.f32 f1127, f1097, 0f3EC3EF15, f1126; +add.f32 f1128, f1026, f1083; +sub.f32 f1130, f1026, f1083; +add.f32 f2256, f2284, f2270; +sub.f32 f1131, f2284, f2270; +add.f32 f1132, f1030, f1101; +sub.f32 f1134, f1030, f1101; +add.f32 f2255, f2283, f1103; +sub.f32 f1135, f2283, f1103; +add.f32 f1136, f1034, f1106; +sub.f32 f1138, f1034, f1106; +add.f32 f2254, f2282, f1107; +sub.f32 f1139, f2282, f1107; +add.f32 f1140, f1038, f1110; +sub.f32 f1142, f1038, f1110; +add.f32 f2253, f2281, f1112; +sub.f32 f1143, f2281, f1112; +sub.f32 f1144, f1028, f1086; +add.f32 f1146, f1028, f1086; +add.f32 f2252, f1029, f1085; +sub.f32 f1147, f1029, 
f1085; +add.f32 f1148, f1032, f1115; +sub.f32 f1150, f1032, f1115; +add.f32 f2251, f1033, f1117; +sub.f32 f1151, f1033, f1117; +add.f32 f1152, f1036, f1120; +sub.f32 f1154, f1036, f1120; +add.f32 f2250, f1037, f1122; +sub.f32 f1155, f1037, f1122; +add.f32 f1156, f1040, f1125; +sub.f32 f1158, f1040, f1125; +add.f32 f2249, f1041, f1127; +sub.f32 f1159, f1041, f1127; +add.f32 f1160, f922, f938; +sub.f32 f1162, f922, f938; +add.f32 f2248, f954, f970; +sub.f32 f1163, f954, f970; +add.f32 f1164, f930, f946; +sub.f32 f1166, f930, f946; +add.f32 f2247, f962, f978; +sub.f32 f1167, f962, f978; +add.f32 f1168, f1160, f1164; +sub.f32 f1170, f1160, f1164; +add.f32 f2246, f2248, f2247; +sub.f32 f1171, f2248, f2247; +sub.f32 f1172, f1162, f1167; +add.f32 f1174, f1162, f1167; +add.f32 f2245, f1163, f1166; +sub.f32 f1175, f1163, f1166; +add.f32 f1176, f926, f942; +sub.f32 f1178, f926, f942; +add.f32 f2244, f958, f974; +sub.f32 f1179, f958, f974; +add.f32 f1180, f934, f950; +sub.f32 f1182, f934, f950; +add.f32 f2243, f966, f982; +sub.f32 f1183, f966, f982; +add.f32 f1184, f1176, f1180; +sub.f32 f1186, f1176, f1180; +add.f32 f2242, f2244, f2243; +sub.f32 f1187, f2244, f2243; +sub.f32 f1188, f1178, f1183; +add.f32 f1190, f1178, f1183; +add.f32 f2241, f1179, f1182; +sub.f32 f1191, f1179, f1182; +mul.f32 f1192, f1188, 0f3F3504F3; +mul.f32 f1193, f2241, 0f3F3504F3; +sub.f32 f1194, f1192, f1193; +add.f32 f1195, f1192, f1193; +mul.f32 f2239, f1190, 0fBF3504F3; +mul.f32 f2240, f1191, 0f3F3504F3; +sub.f32 f1198, f2239, f2240; +mul.f32 f1199, f1191, 0fBF3504F3; +fma.rn.f32 f1200, f1190, 0f3F3504F3, f1199; +add.f32 f1201, f1168, f1184; +sub.f32 f1203, f1168, f1184; +add.f32 f2238, f2246, f2242; +sub.f32 f1204, f2246, f2242; +add.f32 f1205, f1172, f1194; +sub.f32 f1207, f1172, f1194; +add.f32 f2237, f2245, f1195; +sub.f32 f1208, f2245, f1195; +sub.f32 f1209, f1170, f1187; +add.f32 f1211, f1170, f1187; +add.f32 f2236, f1171, f1186; +sub.f32 f1212, f1171, f1186; +add.f32 f1213, f1174, f1198; 
+sub.f32 f1215, f1174, f1198; +add.f32 f2235, f1175, f1200; +sub.f32 f1216, f1175, f1200; +add.f32 f1217, f924, f940; +sub.f32 f1219, f924, f940; +add.f32 f2234, f956, f972; +sub.f32 f1220, f956, f972; +add.f32 f1221, f932, f948; +sub.f32 f1223, f932, f948; +add.f32 f2233, f964, f980; +sub.f32 f1224, f964, f980; +add.f32 f1225, f1217, f1221; +sub.f32 f1227, f1217, f1221; +add.f32 f2232, f2234, f2233; +sub.f32 f1228, f2234, f2233; +sub.f32 f1229, f1219, f1224; +add.f32 f1231, f1219, f1224; +add.f32 f2231, f1220, f1223; +sub.f32 f1232, f1220, f1223; +add.f32 f1233, f928, f944; +sub.f32 f1235, f928, f944; +add.f32 f2230, f960, f976; +sub.f32 f1236, f960, f976; +add.f32 f1237, f936, f952; +sub.f32 f1239, f936, f952; +add.f32 f2229, f968, f984; +sub.f32 f1240, f968, f984; +add.f32 f1241, f1233, f1237; +sub.f32 f1243, f1233, f1237; +add.f32 f2228, f2230, f2229; +sub.f32 f1244, f2230, f2229; +sub.f32 f1245, f1235, f1240; +add.f32 f1247, f1235, f1240; +add.f32 f2227, f1236, f1239; +sub.f32 f1248, f1236, f1239; +mul.f32 f1249, f1245, 0f3F3504F3; +mul.f32 f1250, f2227, 0f3F3504F3; +sub.f32 f1251, f1249, f1250; +add.f32 f1252, f1249, f1250; +mul.f32 f2225, f1247, 0fBF3504F3; +mul.f32 f2226, f1248, 0f3F3504F3; +sub.f32 f1255, f2225, f2226; +mul.f32 f1256, f1248, 0fBF3504F3; +fma.rn.f32 f1257, f1247, 0f3F3504F3, f1256; +add.f32 f1258, f1225, f1241; +sub.f32 f1260, f1225, f1241; +add.f32 f2224, f2232, f2228; +sub.f32 f1261, f2232, f2228; +add.f32 f1262, f1229, f1251; +sub.f32 f1264, f1229, f1251; +add.f32 f2223, f2231, f1252; +sub.f32 f1265, f2231, f1252; +sub.f32 f1266, f1227, f1244; +add.f32 f1268, f1227, f1244; +add.f32 f2222, f1228, f1243; +sub.f32 f1269, f1228, f1243; +add.f32 f1270, f1231, f1255; +sub.f32 f1272, f1231, f1255; +add.f32 f2221, f1232, f1257; +sub.f32 f1273, f1232, f1257; +mul.f32 f2219, f1262, 0f3F6C835E; +mul.f32 f2220, f2223, 0f3EC3EF15; +sub.f32 f1276, f2219, f2220; +mul.f32 f1277, f2223, 0f3F6C835E; +fma.rn.f32 f1278, f1262, 0f3EC3EF15, f1277; +mul.f32 
f1279, f1266, 0f3F3504F3; +mul.f32 f1280, f2222, 0f3F3504F3; +sub.f32 f1281, f1279, f1280; +add.f32 f1282, f1279, f1280; +mul.f32 f1284, f2221, 0f3F6C835E; +mul.f32 f2218, f1270, 0f3EC3EF15; +sub.f32 f1285, f2218, f1284; +mul.f32 f1286, f2221, 0f3EC3EF15; +fma.rn.f32 f1287, f1270, 0f3F6C835E, f1286; +mul.f32 f1289, f1265, 0f3F6C835E; +mul.f32 f2217, f1264, 0fBEC3EF15; +sub.f32 f1290, f2217, f1289; +mul.f32 f1291, f1265, 0fBEC3EF15; +fma.rn.f32 f1292, f1264, 0f3F6C835E, f1291; +mul.f32 f2215, f1268, 0fBF3504F3; +mul.f32 f2216, f1269, 0f3F3504F3; +sub.f32 f1295, f2215, f2216; +mul.f32 f1296, f1269, 0fBF3504F3; +fma.rn.f32 f1297, f1268, 0f3F3504F3, f1296; +mul.f32 f2213, f1272, 0fBF6C835E; +mul.f32 f2214, f1273, 0f3EC3EF15; +sub.f32 f1300, f2213, f2214; +mul.f32 f1301, f1273, 0fBF6C835E; +fma.rn.f32 f1302, f1272, 0f3EC3EF15, f1301; +add.f32 f1303, f1201, f1258; +sub.f32 f1305, f1201, f1258; +add.f32 f2212, f2238, f2224; +sub.f32 f1306, f2238, f2224; +add.f32 f1307, f1205, f1276; +sub.f32 f1309, f1205, f1276; +add.f32 f2211, f2237, f1278; +sub.f32 f1310, f2237, f1278; +add.f32 f1311, f1209, f1281; +sub.f32 f1313, f1209, f1281; +add.f32 f2210, f2236, f1282; +sub.f32 f1314, f2236, f1282; +add.f32 f1315, f1213, f1285; +sub.f32 f1317, f1213, f1285; +add.f32 f2209, f2235, f1287; +sub.f32 f1318, f2235, f1287; +sub.f32 f1319, f1203, f1261; +add.f32 f1321, f1203, f1261; +add.f32 f2208, f1204, f1260; +sub.f32 f1322, f1204, f1260; +add.f32 f1323, f1207, f1290; +sub.f32 f1325, f1207, f1290; +add.f32 f2207, f1208, f1292; +sub.f32 f1326, f1208, f1292; +add.f32 f1327, f1211, f1295; +sub.f32 f1329, f1211, f1295; +add.f32 f2206, f1212, f1297; +sub.f32 f1330, f1212, f1297; +add.f32 f1331, f1215, f1300; +sub.f32 f1333, f1215, f1300; +add.f32 f2205, f1216, f1302; +sub.f32 f1334, f1216, f1302; +mul.f32 f1336, f2211, 0f3E47C5C2; +mul.f32 f2204, f1307, 0f3F7B14BE; +sub.f32 f1337, f2204, f1336; +mul.f32 f1338, f2211, 0f3F7B14BE; +fma.rn.f32 f1339, f1307, 0f3E47C5C2, f1338; +mul.f32 f1341, 
f2210, 0f3EC3EF15; +mul.f32 f2203, f1311, 0f3F6C835E; +sub.f32 f1342, f2203, f1341; +mul.f32 f1343, f2210, 0f3F6C835E; +fma.rn.f32 f1344, f1311, 0f3EC3EF15, f1343; +mul.f32 f2201, f1315, 0f3F54DB31; +mul.f32 f2202, f2209, 0f3F0E39DA; +sub.f32 f1347, f2201, f2202; +mul.f32 f1348, f2209, 0f3F54DB31; +fma.rn.f32 f1349, f1315, 0f3F0E39DA, f1348; +mul.f32 f1350, f1319, 0f3F3504F3; +mul.f32 f1351, f2208, 0f3F3504F3; +sub.f32 f1352, f1350, f1351; +add.f32 f1353, f1350, f1351; +mul.f32 f1355, f2207, 0f3F54DB31; +mul.f32 f2200, f1323, 0f3F0E39DA; +sub.f32 f1356, f2200, f1355; +mul.f32 f1357, f2207, 0f3F0E39DA; +fma.rn.f32 f1358, f1323, 0f3F54DB31, f1357; +mul.f32 f1360, f2206, 0f3F6C835E; +mul.f32 f2199, f1327, 0f3EC3EF15; +sub.f32 f1361, f2199, f1360; +mul.f32 f1362, f2206, 0f3EC3EF15; +fma.rn.f32 f1363, f1327, 0f3F6C835E, f1362; +mul.f32 f2197, f1331, 0f3E47C5C2; +mul.f32 f2198, f2205, 0f3F7B14BE; +sub.f32 f1366, f2197, f2198; +mul.f32 f1367, f2205, 0f3E47C5C2; +fma.rn.f32 f1368, f1331, 0f3F7B14BE, f1367; +mul.f32 f2195, f1309, 0fBE47C5C2; +mul.f32 f2196, f1310, 0f3F7B14BE; +sub.f32 f1371, f2195, f2196; +mul.f32 f1372, f1310, 0fBE47C5C2; +fma.rn.f32 f1373, f1309, 0f3F7B14BE, f1372; +mul.f32 f2193, f1313, 0fBEC3EF15; +mul.f32 f2194, f1314, 0f3F6C835E; +sub.f32 f1376, f2193, f2194; +mul.f32 f1377, f1314, 0fBEC3EF15; +fma.rn.f32 f1378, f1313, 0f3F6C835E, f1377; +mul.f32 f2191, f1317, 0fBF0E39DA; +mul.f32 f2192, f1318, 0f3F54DB31; +sub.f32 f1381, f2191, f2192; +mul.f32 f1382, f1318, 0fBF0E39DA; +fma.rn.f32 f1383, f1317, 0f3F54DB31, f1382; +mul.f32 f1385, f1322, 0f3F3504F3; +mul.f32 f2190, f1321, 0fBF3504F3; +sub.f32 f1386, f2190, f1385; +mul.f32 f1387, f1322, 0fBF3504F3; +fma.rn.f32 f1388, f1321, 0f3F3504F3, f1387; +mul.f32 f1390, f1326, 0f3F0E39DA; +mul.f32 f2189, f1325, 0fBF54DB31; +sub.f32 f1391, f2189, f1390; +mul.f32 f1392, f1326, 0fBF54DB31; +fma.rn.f32 f1393, f1325, 0f3F0E39DA, f1392; +mul.f32 f1395, f1330, 0f3EC3EF15; +mul.f32 f2188, f1329, 0fBF6C835E; +sub.f32 f1396, 
f2188, f1395; +mul.f32 f1397, f1330, 0fBF6C835E; +fma.rn.f32 f1398, f1329, 0f3EC3EF15, f1397; +mul.f32 f1400, f1334, 0f3E47C5C2; +mul.f32 f2187, f1333, 0fBF7B14BE; +sub.f32 f1401, f2187, f1400; +mul.f32 f1402, f1334, 0fBF7B14BE; +fma.rn.f32 f1403, f1333, 0f3E47C5C2, f1402; +add.f32 f1404, f1128, f1303; +sub.f32 f1406, f1128, f1303; +add.f32 f2186, f2256, f2212; +sub.f32 f1407, f2256, f2212; +add.f32 f1408, f1132, f1337; +sub.f32 f1410, f1132, f1337; +add.f32 f2185, f2255, f1339; +sub.f32 f1411, f2255, f1339; +add.f32 f1412, f1136, f1342; +sub.f32 f1414, f1136, f1342; +add.f32 f2184, f2254, f1344; +sub.f32 f1415, f2254, f1344; +add.f32 f1416, f1140, f1347; +sub.f32 f1418, f1140, f1347; +add.f32 f2183, f2253, f1349; +sub.f32 f1419, f2253, f1349; +add.f32 f1420, f1144, f1352; +sub.f32 f1422, f1144, f1352; +add.f32 f2182, f2252, f1353; +sub.f32 f1423, f2252, f1353; +add.f32 f1424, f1148, f1356; +sub.f32 f1426, f1148, f1356; +add.f32 f2181, f2251, f1358; +sub.f32 f1427, f2251, f1358; +add.f32 f1428, f1152, f1361; +sub.f32 f1430, f1152, f1361; +add.f32 f2180, f2250, f1363; +sub.f32 f1431, f2250, f1363; +add.f32 f1432, f1156, f1366; +sub.f32 f1434, f1156, f1366; +add.f32 f2179, f2249, f1368; +sub.f32 f1435, f2249, f1368; +sub.f32 f1436, f1130, f1306; +add.f32 f1438, f1130, f1306; +add.f32 f2178, f1131, f1305; +sub.f32 f1439, f1131, f1305; +add.f32 f1440, f1134, f1371; +sub.f32 f1442, f1134, f1371; +add.f32 f2177, f1135, f1373; +sub.f32 f1443, f1135, f1373; +add.f32 f1444, f1138, f1376; +sub.f32 f1446, f1138, f1376; +add.f32 f2176, f1139, f1378; +sub.f32 f1447, f1139, f1378; +add.f32 f1448, f1142, f1381; +sub.f32 f1450, f1142, f1381; +add.f32 f2175, f1143, f1383; +sub.f32 f1451, f1143, f1383; +add.f32 f1452, f1146, f1386; +sub.f32 f1454, f1146, f1386; +add.f32 f2174, f1147, f1388; +sub.f32 f1455, f1147, f1388; +add.f32 f1456, f1150, f1391; +sub.f32 f1458, f1150, f1391; +add.f32 f2173, f1151, f1393; +sub.f32 f1459, f1151, f1393; +add.f32 f1460, f1154, f1396; +sub.f32 f1462, 
f1154, f1396; +add.f32 f2172, f1155, f1398; +sub.f32 f1463, f1155, f1398; +add.f32 f1464, f1158, f1401; +sub.f32 f1466, f1158, f1401; +add.f32 f2171, f1159, f1403; +sub.f32 f1467, f1159, f1403; +bfe.u32 r15, r24, 5, 3; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %66; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f1468, f1469}, [rd8]; +mul.f32 f1472, f2185, f1469; +fma.rn.f32 f1473, f1468, f1408, f1472; +mul.f32 f1474, f1408, f1469; +mul.f32 f1475, f1468, f2185; +sub.f32 f1476, f1475, f1474; +mul.f32 f1478, f1469, f1469; +mul.f32 f2170, f1468, f1468; +sub.f32 f1479, f2170, f1478; +mul.f32 f1480, f1469, f1468; +fma.rn.f32 f1481, f1469, f1468, f1480; +mul.f32 f1482, f2184, f1481; +fma.rn.f32 f1483, f1479, f1412, f1482; +mul.f32 f1484, f1412, f1481; +mul.f32 f1485, f1479, f2184; +sub.f32 f1486, f1485, f1484; +mul.f32 f2168, f1468, f1479; +mul.f32 f2169, f1469, f1481; +sub.f32 f1489, f2168, f2169; +mul.f32 f1490, f1468, f1481; +fma.rn.f32 f1491, f1469, f1479, f1490; +mul.f32 f1492, f2183, f1491; +fma.rn.f32 f1493, f1489, f1416, f1492; +mul.f32 f1494, f1416, f1491; +mul.f32 f1495, f1489, f2183; +sub.f32 f1496, f1495, f1494; +mul.f32 f2166, f1468, f1489; +mul.f32 f2167, f1469, f1491; +sub.f32 f1499, f2166, f2167; +mul.f32 f1500, f1468, f1491; +fma.rn.f32 f1501, f1469, f1489, f1500; +mul.f32 f1502, f2182, f1501; +fma.rn.f32 f1503, f1499, f1420, f1502; +mul.f32 f1504, f1420, f1501; +mul.f32 f1505, f1499, f2182; +sub.f32 f1506, f1505, f1504; +mul.f32 f1508, f1469, f1501; +mul.f32 f2165, f1468, f1499; +sub.f32 f1509, f2165, f1508; +mul.f32 f1510, f1468, f1501; +fma.rn.f32 f1511, f1469, f1499, f1510; +mul.f32 f1512, f2181, f1511; +fma.rn.f32 f1513, f1509, f1424, f1512; +mul.f32 f1514, f1424, f1511; +mul.f32 f1515, f1509, f2181; +sub.f32 f1516, f1515, f1514; +mul.f32 f1518, f1469, f1511; +mul.f32 f2164, f1468, f1509; +sub.f32 f1519, f2164, f1518; +mul.f32 f1520, f1468, f1511; +fma.rn.f32 f1521, f1469, f1509, f1520; +mul.f32 f1522, f2180, f1521; +fma.rn.f32 f1523, f1519, f1428, f1522; 
+mul.f32 f1524, f1428, f1521; +mul.f32 f1525, f1519, f2180; +sub.f32 f1526, f1525, f1524; +mul.f32 f1528, f1469, f1521; +mul.f32 f2163, f1468, f1519; +sub.f32 f1529, f2163, f1528; +mul.f32 f1530, f1468, f1521; +fma.rn.f32 f1531, f1469, f1519, f1530; +mul.f32 f1532, f2179, f1531; +fma.rn.f32 f1533, f1529, f1432, f1532; +mul.f32 f1534, f1432, f1531; +mul.f32 f1535, f1529, f2179; +sub.f32 f1536, f1535, f1534; +mul.f32 f2161, f1468, f1529; +mul.f32 f2162, f1469, f1531; +sub.f32 f1539, f2161, f2162; +mul.f32 f1540, f1468, f1531; +fma.rn.f32 f1541, f1469, f1529, f1540; +mul.f32 f1542, f2178, f1541; +fma.rn.f32 f1543, f1539, f1436, f1542; +mul.f32 f1544, f1436, f1541; +mul.f32 f1545, f1539, f2178; +sub.f32 f1546, f1545, f1544; +mul.f32 f2159, f1468, f1539; +mul.f32 f2160, f1469, f1541; +sub.f32 f1549, f2159, f2160; +mul.f32 f1550, f1468, f1541; +fma.rn.f32 f1551, f1469, f1539, f1550; +mul.f32 f1552, f2177, f1551; +fma.rn.f32 f1553, f1549, f1440, f1552; +mul.f32 f1554, f1440, f1551; +mul.f32 f1555, f1549, f2177; +sub.f32 f1556, f1555, f1554; +mul.f32 f1558, f1469, f1551; +mul.f32 f2158, f1468, f1549; +sub.f32 f1559, f2158, f1558; +mul.f32 f1560, f1468, f1551; +fma.rn.f32 f1561, f1469, f1549, f1560; +mul.f32 f1562, f2176, f1561; +fma.rn.f32 f1563, f1559, f1444, f1562; +mul.f32 f1564, f1444, f1561; +mul.f32 f1565, f1559, f2176; +sub.f32 f1566, f1565, f1564; +mul.f32 f1568, f1469, f1561; +mul.f32 f2157, f1468, f1559; +sub.f32 f1569, f2157, f1568; +mul.f32 f1570, f1468, f1561; +fma.rn.f32 f1571, f1469, f1559, f1570; +mul.f32 f1572, f2175, f1571; +fma.rn.f32 f1573, f1569, f1448, f1572; +mul.f32 f1574, f1448, f1571; +mul.f32 f1575, f1569, f2175; +sub.f32 f1576, f1575, f1574; +mul.f32 f1578, f1469, f1571; +mul.f32 f2156, f1468, f1569; +sub.f32 f1579, f2156, f1578; +mul.f32 f1580, f1468, f1571; +fma.rn.f32 f1581, f1469, f1569, f1580; +mul.f32 f1582, f2174, f1581; +fma.rn.f32 f1583, f1579, f1452, f1582; +mul.f32 f1584, f1452, f1581; +mul.f32 f1585, f1579, f2174; +sub.f32 f1586, 
f1585, f1584; +mul.f32 f2154, f1468, f1579; +mul.f32 f2155, f1469, f1581; +sub.f32 f1589, f2154, f2155; +mul.f32 f1590, f1468, f1581; +fma.rn.f32 f1591, f1469, f1579, f1590; +mul.f32 f1592, f2173, f1591; +fma.rn.f32 f1593, f1589, f1456, f1592; +mul.f32 f1594, f1456, f1591; +mul.f32 f1595, f1589, f2173; +sub.f32 f1596, f1595, f1594; +mul.f32 f1598, f1469, f1591; +mul.f32 f2153, f1468, f1589; +sub.f32 f1599, f2153, f1598; +mul.f32 f1600, f1468, f1591; +fma.rn.f32 f1601, f1469, f1589, f1600; +mul.f32 f1602, f2172, f1601; +fma.rn.f32 f1603, f1599, f1460, f1602; +mul.f32 f1604, f1460, f1601; +mul.f32 f1605, f1599, f2172; +sub.f32 f1606, f1605, f1604; +mul.f32 f1608, f1469, f1601; +mul.f32 f2152, f1468, f1599; +sub.f32 f1609, f2152, f1608; +mul.f32 f1610, f1468, f1601; +fma.rn.f32 f1611, f1469, f1599, f1610; +mul.f32 f1612, f2171, f1611; +fma.rn.f32 f1613, f1609, f1464, f1612; +mul.f32 f1614, f1464, f1611; +mul.f32 f1615, f1609, f2171; +sub.f32 f1616, f1615, f1614; +mul.f32 f1618, f1469, f1611; +mul.f32 f2151, f1468, f1609; +sub.f32 f1619, f2151, f1618; +mul.f32 f1620, f1468, f1611; +fma.rn.f32 f1621, f1469, f1609, f1620; +mul.f32 f1622, f1407, f1621; +fma.rn.f32 f1623, f1619, f1406, f1622; +mul.f32 f1624, f1406, f1621; +mul.f32 f1625, f1619, f1407; +sub.f32 f1626, f1625, f1624; +mul.f32 f2149, f1468, f1619; +mul.f32 f2150, f1469, f1621; +sub.f32 f1629, f2149, f2150; +mul.f32 f1630, f1468, f1621; +fma.rn.f32 f1631, f1469, f1619, f1630; +mul.f32 f1632, f1411, f1631; +fma.rn.f32 f1633, f1629, f1410, f1632; +mul.f32 f1634, f1410, f1631; +mul.f32 f1635, f1629, f1411; +sub.f32 f1636, f1635, f1634; +mul.f32 f2147, f1468, f1629; +mul.f32 f2148, f1469, f1631; +sub.f32 f1639, f2147, f2148; +mul.f32 f1640, f1468, f1631; +fma.rn.f32 f1641, f1469, f1629, f1640; +mul.f32 f1642, f1415, f1641; +fma.rn.f32 f1643, f1639, f1414, f1642; +mul.f32 f1644, f1414, f1641; +mul.f32 f1645, f1639, f1415; +sub.f32 f1646, f1645, f1644; +mul.f32 f1648, f1469, f1641; +mul.f32 f2146, f1468, f1639; 
+sub.f32 f1649, f2146, f1648; +mul.f32 f1650, f1468, f1641; +fma.rn.f32 f1651, f1469, f1639, f1650; +mul.f32 f1652, f1419, f1651; +fma.rn.f32 f1653, f1649, f1418, f1652; +mul.f32 f1654, f1418, f1651; +mul.f32 f1655, f1649, f1419; +sub.f32 f1656, f1655, f1654; +mul.f32 f1658, f1469, f1651; +mul.f32 f2145, f1468, f1649; +sub.f32 f1659, f2145, f1658; +mul.f32 f1660, f1468, f1651; +fma.rn.f32 f1661, f1469, f1649, f1660; +mul.f32 f1662, f1423, f1661; +fma.rn.f32 f1663, f1659, f1422, f1662; +mul.f32 f1664, f1422, f1661; +mul.f32 f1665, f1659, f1423; +sub.f32 f1666, f1665, f1664; +mul.f32 f1668, f1469, f1661; +mul.f32 f2144, f1468, f1659; +sub.f32 f1669, f2144, f1668; +mul.f32 f1670, f1468, f1661; +fma.rn.f32 f1671, f1469, f1659, f1670; +mul.f32 f1672, f1427, f1671; +fma.rn.f32 f1673, f1669, f1426, f1672; +mul.f32 f1674, f1426, f1671; +mul.f32 f1675, f1669, f1427; +sub.f32 f1676, f1675, f1674; +mul.f32 f2142, f1468, f1669; +mul.f32 f2143, f1469, f1671; +sub.f32 f1679, f2142, f2143; +mul.f32 f1680, f1468, f1671; +fma.rn.f32 f1681, f1469, f1669, f1680; +mul.f32 f1682, f1431, f1681; +fma.rn.f32 f1683, f1679, f1430, f1682; +mul.f32 f1684, f1430, f1681; +mul.f32 f1685, f1679, f1431; +sub.f32 f1686, f1685, f1684; +mul.f32 f2140, f1468, f1679; +mul.f32 f2141, f1469, f1681; +sub.f32 f1689, f2140, f2141; +mul.f32 f1690, f1468, f1681; +fma.rn.f32 f1691, f1469, f1679, f1690; +mul.f32 f1692, f1435, f1691; +fma.rn.f32 f1693, f1689, f1434, f1692; +mul.f32 f1694, f1434, f1691; +mul.f32 f1695, f1689, f1435; +sub.f32 f1696, f1695, f1694; +mul.f32 f1698, f1469, f1691; +mul.f32 f2139, f1468, f1689; +sub.f32 f1699, f2139, f1698; +mul.f32 f1700, f1468, f1691; +fma.rn.f32 f1701, f1469, f1689, f1700; +mul.f32 f1702, f1439, f1701; +fma.rn.f32 f1703, f1699, f1438, f1702; +mul.f32 f1704, f1438, f1701; +mul.f32 f1705, f1699, f1439; +sub.f32 f1706, f1705, f1704; +mul.f32 f1708, f1469, f1701; +mul.f32 f2138, f1468, f1699; +sub.f32 f1709, f2138, f1708; +mul.f32 f1710, f1468, f1701; +fma.rn.f32 f1711, 
f1469, f1699, f1710; +mul.f32 f1712, f1443, f1711; +fma.rn.f32 f1713, f1709, f1442, f1712; +mul.f32 f1714, f1442, f1711; +mul.f32 f1715, f1709, f1443; +sub.f32 f1716, f1715, f1714; +mul.f32 f2136, f1468, f1709; +mul.f32 f2137, f1469, f1711; +sub.f32 f1719, f2136, f2137; +mul.f32 f1720, f1468, f1711; +fma.rn.f32 f1721, f1469, f1709, f1720; +mul.f32 f1722, f1447, f1721; +fma.rn.f32 f1723, f1719, f1446, f1722; +mul.f32 f1724, f1446, f1721; +mul.f32 f1725, f1719, f1447; +sub.f32 f1726, f1725, f1724; +mul.f32 f2134, f1468, f1719; +mul.f32 f2135, f1469, f1721; +sub.f32 f1729, f2134, f2135; +mul.f32 f1730, f1468, f1721; +fma.rn.f32 f1731, f1469, f1719, f1730; +mul.f32 f1732, f1451, f1731; +fma.rn.f32 f1733, f1729, f1450, f1732; +mul.f32 f1734, f1450, f1731; +mul.f32 f1735, f1729, f1451; +sub.f32 f1736, f1735, f1734; +mul.f32 f1738, f1469, f1731; +mul.f32 f2133, f1468, f1729; +sub.f32 f1739, f2133, f1738; +mul.f32 f1740, f1468, f1731; +fma.rn.f32 f1741, f1469, f1729, f1740; +mul.f32 f1742, f1455, f1741; +fma.rn.f32 f1743, f1739, f1454, f1742; +mul.f32 f1744, f1454, f1741; +mul.f32 f1745, f1739, f1455; +sub.f32 f1746, f1745, f1744; +mul.f32 f1748, f1469, f1741; +mul.f32 f2132, f1468, f1739; +sub.f32 f1749, f2132, f1748; +mul.f32 f1750, f1468, f1741; +fma.rn.f32 f1751, f1469, f1739, f1750; +mul.f32 f1752, f1459, f1751; +fma.rn.f32 f1753, f1749, f1458, f1752; +mul.f32 f1754, f1458, f1751; +mul.f32 f1755, f1749, f1459; +sub.f32 f1756, f1755, f1754; +mul.f32 f1758, f1469, f1751; +mul.f32 f2131, f1468, f1749; +sub.f32 f1759, f2131, f1758; +mul.f32 f1760, f1468, f1751; +fma.rn.f32 f1761, f1469, f1749, f1760; +mul.f32 f1762, f1463, f1761; +fma.rn.f32 f1763, f1759, f1462, f1762; +mul.f32 f1764, f1462, f1761; +mul.f32 f1765, f1759, f1463; +sub.f32 f1766, f1765, f1764; +mul.f32 f2129, f1468, f1759; +mul.f32 f2130, f1469, f1761; +sub.f32 f1769, f2129, f2130; +mul.f32 f1770, f1468, f1761; +mov.u32 r29, %tid.x; +fma.rn.f32 f1771, f1469, f1759, f1770; +shl.b32 r28, r29, 7; +mul.f32 
f1772, f1467, f1771; +fma.rn.f32 f1773, f1769, f1466, f1772; +mul.f32 f1774, f1466, f1771; +mul.f32 f1775, f1769, f1467; +sub.f32 f1776, f1775, f1774; +and.b32 r22, r29, 224; +shl.b32 r16, r29, 2; +and.b32 r17, r16, 124; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r28, 28672; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f1404; +st.shared.f32 [r20+128], f1473; +st.shared.f32 [r20+256], f1483; +st.shared.f32 [r20+384], f1493; +st.shared.f32 [r20+512], f1503; +st.shared.f32 [r20+640], f1513; +st.shared.f32 [r20+768], f1523; +st.shared.f32 [r20+896], f1533; +st.shared.f32 [r20+1024], f1543; +st.shared.f32 [r20+1152], f1553; +st.shared.f32 [r20+1280], f1563; +st.shared.f32 [r20+1408], f1573; +st.shared.f32 [r20+1536], f1583; +st.shared.f32 [r20+1664], f1593; +st.shared.f32 [r20+1792], f1603; +st.shared.f32 [r20+1920], f1613; +st.shared.f32 [r20+2048], f1623; +st.shared.f32 [r20+2176], f1633; +st.shared.f32 [r20+2304], f1643; +st.shared.f32 [r20+2432], f1653; +st.shared.f32 [r20+2560], f1663; +st.shared.f32 [r20+2688], f1673; +st.shared.f32 [r20+2816], f1683; +st.shared.f32 [r20+2944], f1693; +st.shared.f32 [r20+3072], f1703; +st.shared.f32 [r20+3200], f1713; +st.shared.f32 [r20+3328], f1723; +st.shared.f32 [r20+3456], f1733; +st.shared.f32 [r20+3584], f1743; +st.shared.f32 [r20+3712], f1753; +st.shared.f32 [r20+3840], f1763; +st.shared.f32 [r20+3968], f1773; +barrier.sync 0; +mad.lo.s32 r21, r22, -124, r20; +ld.shared.f32 f1777, [r21]; +ld.shared.f32 f1778, [r21+1024]; +ld.shared.f32 f1779, [r21+2048]; +ld.shared.f32 f1780, [r21+3072]; +ld.shared.f32 f1781, [r21+4096]; +ld.shared.f32 f1782, [r21+5120]; +ld.shared.f32 f1783, [r21+6144]; +ld.shared.f32 f1784, [r21+7168]; +ld.shared.f32 f1785, [r21+8192]; +ld.shared.f32 f1786, [r21+9216]; +ld.shared.f32 f1787, [r21+10240]; +ld.shared.f32 f1788, [r21+11264]; +ld.shared.f32 f1789, [r21+12288]; +ld.shared.f32 f1790, [r21+13312]; +ld.shared.f32 f1791, [r21+14336]; +ld.shared.f32 f1792, [r21+15360]; +ld.shared.f32 
f1793, [r21+16384]; +ld.shared.f32 f1794, [r21+17408]; +ld.shared.f32 f1795, [r21+18432]; +ld.shared.f32 f1796, [r21+19456]; +ld.shared.f32 f1797, [r21+20480]; +ld.shared.f32 f1798, [r21+21504]; +ld.shared.f32 f1799, [r21+22528]; +ld.shared.f32 f1800, [r21+23552]; +ld.shared.f32 f1801, [r21+24576]; +ld.shared.f32 f1802, [r21+25600]; +ld.shared.f32 f1803, [r21+26624]; +ld.shared.f32 f1804, [r21+27648]; +ld.shared.f32 f1805, [r21+28672]; +ld.shared.f32 f1806, [r21+29696]; +ld.shared.f32 f1807, [r21+30720]; +ld.shared.f32 f1808, [r21+31744]; +barrier.sync 0; +st.shared.f32 [r20], f2186; +st.shared.f32 [r20+128], f1476; +st.shared.f32 [r20+256], f1486; +st.shared.f32 [r20+384], f1496; +st.shared.f32 [r20+512], f1506; +st.shared.f32 [r20+640], f1516; +st.shared.f32 [r20+768], f1526; +st.shared.f32 [r20+896], f1536; +st.shared.f32 [r20+1024], f1546; +st.shared.f32 [r20+1152], f1556; +st.shared.f32 [r20+1280], f1566; +st.shared.f32 [r20+1408], f1576; +st.shared.f32 [r20+1536], f1586; +st.shared.f32 [r20+1664], f1596; +st.shared.f32 [r20+1792], f1606; +st.shared.f32 [r20+1920], f1616; +st.shared.f32 [r20+2048], f1626; +st.shared.f32 [r20+2176], f1636; +st.shared.f32 [r20+2304], f1646; +st.shared.f32 [r20+2432], f1656; +st.shared.f32 [r20+2560], f1666; +st.shared.f32 [r20+2688], f1676; +st.shared.f32 [r20+2816], f1686; +st.shared.f32 [r20+2944], f1696; +st.shared.f32 [r20+3072], f1706; +st.shared.f32 [r20+3200], f1716; +st.shared.f32 [r20+3328], f1726; +st.shared.f32 [r20+3456], f1736; +st.shared.f32 [r20+3584], f1746; +st.shared.f32 [r20+3712], f1756; +st.shared.f32 [r20+3840], f1766; +st.shared.f32 [r20+3968], f1776; +barrier.sync 0; +ld.shared.f32 f1809, [r21]; +ld.shared.f32 f1810, [r21+1024]; +ld.shared.f32 f1811, [r21+2048]; +ld.shared.f32 f1812, [r21+3072]; +ld.shared.f32 f1813, [r21+4096]; +ld.shared.f32 f1814, [r21+5120]; +ld.shared.f32 f1815, [r21+6144]; +ld.shared.f32 f1816, [r21+7168]; +ld.shared.f32 f1817, [r21+8192]; +ld.shared.f32 f1818, [r21+9216]; 
+ld.shared.f32 f1819, [r21+10240]; +ld.shared.f32 f1820, [r21+11264]; +ld.shared.f32 f1821, [r21+12288]; +ld.shared.f32 f1822, [r21+13312]; +ld.shared.f32 f1823, [r21+14336]; +ld.shared.f32 f1824, [r21+15360]; +ld.shared.f32 f1825, [r21+16384]; +ld.shared.f32 f1826, [r21+17408]; +ld.shared.f32 f1827, [r21+18432]; +ld.shared.f32 f1828, [r21+19456]; +ld.shared.f32 f1829, [r21+20480]; +ld.shared.f32 f1830, [r21+21504]; +ld.shared.f32 f1831, [r21+22528]; +ld.shared.f32 f1832, [r21+23552]; +ld.shared.f32 f1833, [r21+24576]; +ld.shared.f32 f1834, [r21+25600]; +ld.shared.f32 f1835, [r21+26624]; +ld.shared.f32 f1836, [r21+27648]; +ld.shared.f32 f1837, [r21+28672]; +ld.shared.f32 f1838, [r21+29696]; +ld.shared.f32 f1839, [r21+30720]; +ld.shared.f32 f1840, [r21+31744]; +add.f32 f1841, f1777, f1793; +sub.f32 f1843, f1777, f1793; +add.f32 f2128, f1809, f1825; +sub.f32 f1844, f1809, f1825; +add.f32 f1845, f1785, f1801; +sub.f32 f1847, f1785, f1801; +add.f32 f2127, f1817, f1833; +sub.f32 f1848, f1817, f1833; +add.f32 f1849, f1841, f1845; +sub.f32 f1851, f1841, f1845; +add.f32 f2126, f2128, f2127; +sub.f32 f1852, f2128, f2127; +sub.f32 f1853, f1843, f1848; +add.f32 f1855, f1843, f1848; +add.f32 f2125, f1844, f1847; +sub.f32 f1856, f1844, f1847; +add.f32 f1857, f1781, f1797; +sub.f32 f1859, f1781, f1797; +add.f32 f2124, f1813, f1829; +sub.f32 f1860, f1813, f1829; +add.f32 f1861, f1789, f1805; +sub.f32 f1863, f1789, f1805; +add.f32 f2123, f1821, f1837; +sub.f32 f1864, f1821, f1837; +add.f32 f1865, f1857, f1861; +sub.f32 f1867, f1857, f1861; +add.f32 f2122, f2124, f2123; +sub.f32 f1868, f2124, f2123; +sub.f32 f1869, f1859, f1864; +add.f32 f1871, f1859, f1864; +add.f32 f2121, f1860, f1863; +sub.f32 f1872, f1860, f1863; +mul.f32 f1873, f1869, 0f3F3504F3; +mul.f32 f1874, f2121, 0f3F3504F3; +sub.f32 f1875, f1873, f1874; +add.f32 f1876, f1873, f1874; +mul.f32 f2119, f1871, 0fBF3504F3; +mul.f32 f2120, f1872, 0f3F3504F3; +sub.f32 f1879, f2119, f2120; +mul.f32 f1880, f1872, 0fBF3504F3; 
+fma.rn.f32 f1881, f1871, 0f3F3504F3, f1880; +add.f32 f1882, f1778, f1794; +sub.f32 f1884, f1778, f1794; +add.f32 f2118, f1810, f1826; +sub.f32 f1885, f1810, f1826; +add.f32 f1886, f1786, f1802; +sub.f32 f1888, f1786, f1802; +add.f32 f2117, f1818, f1834; +sub.f32 f1889, f1818, f1834; +add.f32 f1890, f1882, f1886; +sub.f32 f1892, f1882, f1886; +add.f32 f2116, f2118, f2117; +sub.f32 f1893, f2118, f2117; +sub.f32 f1894, f1884, f1889; +add.f32 f1896, f1884, f1889; +add.f32 f2115, f1885, f1888; +sub.f32 f1897, f1885, f1888; +add.f32 f1898, f1782, f1798; +sub.f32 f1900, f1782, f1798; +add.f32 f2114, f1814, f1830; +sub.f32 f1901, f1814, f1830; +add.f32 f1902, f1790, f1806; +sub.f32 f1904, f1790, f1806; +add.f32 f2113, f1822, f1838; +sub.f32 f1905, f1822, f1838; +add.f32 f1906, f1898, f1902; +sub.f32 f1908, f1898, f1902; +add.f32 f2112, f2114, f2113; +sub.f32 f1909, f2114, f2113; +sub.f32 f1910, f1900, f1905; +add.f32 f1912, f1900, f1905; +add.f32 f2111, f1901, f1904; +sub.f32 f1913, f1901, f1904; +mul.f32 f1914, f1910, 0f3F3504F3; +mul.f32 f1915, f2111, 0f3F3504F3; +sub.f32 f1916, f1914, f1915; +add.f32 f1917, f1914, f1915; +mul.f32 f2109, f1912, 0fBF3504F3; +mul.f32 f2110, f1913, 0f3F3504F3; +sub.f32 f1920, f2109, f2110; +mul.f32 f1921, f1913, 0fBF3504F3; +fma.rn.f32 f1922, f1912, 0f3F3504F3, f1921; +add.f32 f1923, f1779, f1795; +sub.f32 f1925, f1779, f1795; +add.f32 f2108, f1811, f1827; +sub.f32 f1926, f1811, f1827; +add.f32 f1927, f1787, f1803; +sub.f32 f1929, f1787, f1803; +add.f32 f2107, f1819, f1835; +sub.f32 f1930, f1819, f1835; +add.f32 f1931, f1923, f1927; +sub.f32 f1933, f1923, f1927; +add.f32 f2106, f2108, f2107; +sub.f32 f1934, f2108, f2107; +sub.f32 f1935, f1925, f1930; +add.f32 f1937, f1925, f1930; +add.f32 f2105, f1926, f1929; +sub.f32 f1938, f1926, f1929; +add.f32 f1939, f1783, f1799; +sub.f32 f1941, f1783, f1799; +add.f32 f2104, f1815, f1831; +sub.f32 f1942, f1815, f1831; +add.f32 f1943, f1791, f1807; +sub.f32 f1945, f1791, f1807; +add.f32 f2103, f1823, 
f1839; +sub.f32 f1946, f1823, f1839; +add.f32 f1947, f1939, f1943; +sub.f32 f1949, f1939, f1943; +add.f32 f2102, f2104, f2103; +sub.f32 f1950, f2104, f2103; +sub.f32 f1951, f1941, f1946; +add.f32 f1953, f1941, f1946; +add.f32 f2101, f1942, f1945; +sub.f32 f1954, f1942, f1945; +mul.f32 f1955, f1951, 0f3F3504F3; +mul.f32 f1956, f2101, 0f3F3504F3; +sub.f32 f1957, f1955, f1956; +add.f32 f1958, f1955, f1956; +mul.f32 f1960, f1954, 0f3F3504F3; +mul.f32 f2100, f1953, 0fBF3504F3; +sub.f32 f1961, f2100, f1960; +mul.f32 f1962, f1954, 0fBF3504F3; +fma.rn.f32 f1963, f1953, 0f3F3504F3, f1962; +add.f32 f1964, f1780, f1796; +sub.f32 f1966, f1780, f1796; +add.f32 f2099, f1812, f1828; +sub.f32 f1967, f1812, f1828; +add.f32 f1968, f1788, f1804; +sub.f32 f1970, f1788, f1804; +add.f32 f2098, f1820, f1836; +sub.f32 f1971, f1820, f1836; +add.f32 f1972, f1964, f1968; +sub.f32 f1974, f1964, f1968; +add.f32 f2097, f2099, f2098; +sub.f32 f1975, f2099, f2098; +sub.f32 f1976, f1966, f1971; +add.f32 f1978, f1966, f1971; +add.f32 f2096, f1967, f1970; +sub.f32 f1979, f1967, f1970; +add.f32 f1980, f1784, f1800; +sub.f32 f1982, f1784, f1800; +add.f32 f2095, f1816, f1832; +sub.f32 f1983, f1816, f1832; +add.f32 f1984, f1792, f1808; +sub.f32 f1986, f1792, f1808; +add.f32 f2094, f1824, f1840; +sub.f32 f1987, f1824, f1840; +add.f32 f1988, f1980, f1984; +sub.f32 f1990, f1980, f1984; +add.f32 f2093, f2095, f2094; +sub.f32 f1991, f2095, f2094; +sub.f32 f1992, f1982, f1987; +add.f32 f1994, f1982, f1987; +add.f32 f2092, f1983, f1986; +sub.f32 f1995, f1983, f1986; +mul.f32 f1996, f1992, 0f3F3504F3; +mul.f32 f1997, f2092, 0f3F3504F3; +sub.f32 f1998, f1996, f1997; +add.f32 f1999, f1996, f1997; +mul.f32 f2001, f1995, 0f3F3504F3; +mul.f32 f2091, f1994, 0fBF3504F3; +sub.f32 f2002, f2091, f2001; +mul.f32 f2003, f1995, 0fBF3504F3; +fma.rn.f32 f2004, f1994, 0f3F3504F3, f2003; +add.f32 %0, f1849, f1865; +add.f32 %1, f2126, f2122; +add.f32 %3, f2116, f2112; +add.f32 %2, f1890, f1906; +add.f32 %5, f2106, f2102; 
+add.f32 %4, f1931, f1947; +add.f32 %7, f2097, f2093; +add.f32 %6, f1972, f1988; +add.f32 %8, f1853, f1875; +add.f32 %9, f2125, f1876; +add.f32 %11, f2115, f1917; +add.f32 %10, f1894, f1916; +add.f32 %13, f2105, f1958; +add.f32 %12, f1935, f1957; +add.f32 %14, f1976, f1998; +add.f32 %15, f2096, f1999; +sub.f32 %16, f1851, f1868; +add.f32 %17, f1852, f1867; +sub.f32 %18, f1892, f1909; +add.f32 %19, f1893, f1908; +sub.f32 %20, f1933, f1950; +add.f32 %21, f1934, f1949; +add.f32 %23, f1975, f1990; +sub.f32 %22, f1974, f1991; +add.f32 %25, f1856, f1881; +add.f32 %24, f1855, f1879; +add.f32 %27, f1897, f1922; +add.f32 %26, f1896, f1920; +add.f32 %28, f1937, f1961; +add.f32 %29, f1938, f1963; +add.f32 %30, f1978, f2002; +add.f32 %31, f1979, f2004; +sub.f32 %32, f1849, f1865; +sub.f32 %33, f2126, f2122; +sub.f32 %34, f1890, f1906; +sub.f32 %35, f2116, f2112; +sub.f32 %36, f1931, f1947; +sub.f32 %37, f2106, f2102; +sub.f32 %38, f1972, f1988; +sub.f32 %39, f2097, f2093; +sub.f32 %41, f2125, f1876; +sub.f32 %40, f1853, f1875; +sub.f32 %43, f2115, f1917; +sub.f32 %42, f1894, f1916; +sub.f32 %45, f2105, f1958; +sub.f32 %44, f1935, f1957; +sub.f32 %47, f2096, f1999; +sub.f32 %46, f1976, f1998; +sub.f32 %49, f1852, f1867; +add.f32 %48, f1851, f1868; +sub.f32 %51, f1893, f1908; +add.f32 %50, f1892, f1909; +sub.f32 %53, f1934, f1949; +add.f32 %52, f1933, f1950; +sub.f32 %55, f1975, f1990; +add.f32 %54, f1974, f1991; +sub.f32 %57, f1856, f1881; +sub.f32 %56, f1855, f1879; +sub.f32 %59, f1897, f1922; +sub.f32 %58, f1896, f1920; +sub.f32 %61, f1938, f1963; +sub.f32 %60, f1937, f1961; +sub.f32 %63, f1979, f2004; +sub.f32 %62, f1978, f2002; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), 
"=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y), "=f"(rmem[27].x), "=f"(rmem[27].y), "=f"(rmem[28].x), "=f"(rmem[28].y), "=f"(rmem[29].x), "=f"(rmem[29].y), "=f"(rmem[30].x), "=f"(rmem[30].y), "=f"(rmem[31].x), "=f"(rmem[31].y): "r"(smem), "l"(lut_sp_32_8192), "l"(lut_sp_32_256), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[27].x), "f"(rmem[27].y), "f"(rmem[28].x), "f"(rmem[28].y), "f"(rmem[29].x), "f"(rmem[29].y), "f"(rmem[30].x), "f"(rmem[30].y), "f"(rmem[31].x), "f"(rmem[31].y), "f"(rmem[16].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[20].y), "f"(rmem[28].y), 
"f"(rmem[2].y), "f"(rmem[26].y), "f"(rmem[10].y), "f"(rmem[22].y), "f"(rmem[14].y), "f"(rmem[17].y), "f"(rmem[1].y), "f"(rmem[25].y), "f"(rmem[5].y), "f"(rmem[29].y), "f"(rmem[13].y), "f"(rmem[19].y), "f"(rmem[11].y), "f"(rmem[7].y), "f"(rmem[23].y)); +}; + + + + +/*
 * cufftdx_private_function<311, float, 1>: explicit specialization whose whole body is a single
 * `asm volatile` PTX block. The PTX looks machine-generated.
 * NOTE(review): presumably one FFT kernel variant emitted by a generator (16 complex values per
 * thread, LUT names ending in 8192/512/32) - confirm before hand-editing anything below.
 *
 * Operand map (positions matter, the PTX refers to them by number):
 *   %0..%31   outputs: rmem[0].x, rmem[0].y, ..., rmem[15].x, rmem[15].y, in that order.
 *   %32       smem: 32-bit base address for st.shared/ld.shared; the PTX adds (%tid.y << 15) to it.
 *   %33..%35  64-bit addresses of lut_sp_16_8192, lut_sp_16_512 and lut_sp_16_32; each is read once
 *             with ld.global.v2.f32 at an offset derived from %tid.x.
 *   %36..%77  inputs: components of rmem[0..15]. Several .y components are listed twice and the PTX
 *             reads only one copy of each (e.g. %40, not %39), so the list must not be de-duplicated
 *             or reordered.
 *
 * Flow: three rounds of
 *   (add/sub/mul/fma on 16 complex values -> combine with successive powers of the loaded LUT
 *    value -> exchange through shared memory),
 * followed by a final add/sub pass that writes %0..%31. Within a round the first set of scalars is
 * stored and reloaded, then the second set reuses the same shared addresses; `barrier.sync 0`
 * separates every store phase from the following load phase, so all threads of the block have to
 * reach this function together.
 *
 * Immediate constants: 0f3F3504F3 ~ 0.70710678, 0f3F6C835E ~ 0.92387953, 0f3EC3EF15 ~ 0.38268343;
 * the 0fBF.. / 0fBE.. forms are the same values negated.
 */template<> __forceinline__ __device__ void cufftdx_private_function<311, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1165>; +.reg .b32 r<29>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f65, %36, %57; +add.f32 f66, %37, %59; +sub.f32 f67, %36, %57; +sub.f32 f68, %37, %59; +add.f32 f69, %46, %68; +add.f32 f70, %48, %69; +sub.f32 f71, %46, %68; +sub.f32 f72, %48, %69; +add.f32 f73, f65, f69; +add.f32 f74, f66, f70; +sub.f32 f75, f65, f69; +sub.f32 f76, f66, f70; +sub.f32 f77, f67, f72; +add.f32 f78, f68, f71; +add.f32 f79, f67, f72; +sub.f32 f80, f68, f71; +add.f32 f81, %41, %62; +add.f32 f82, %43, %64; +sub.f32 f83, %41, %62; +sub.f32 f84, %43, %64; +add.f32 f85, %52, %73; +add.f32 f86, %53, %75; +sub.f32 f87, %52, %73; +sub.f32 f88, %53, %75; +add.f32 f89, f81, f85; +add.f32 f90, f82, f86; +sub.f32 f91, f81, f85; +sub.f32 f92, f82, f86; +sub.f32 f93, f83, f88; +add.f32 f94, f84, f87; +add.f32 f95, f83, f88; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f94, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f101, f95, 0fBF3504F3; +mul.f32 f102, f96, 0f3F3504F3; +sub.f32 f103, f101, f102; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +add.f32 f107, f74, f90; +sub.f32 f108, f73, f89; +sub.f32 f109, f74, f90; +add.f32 f110, f77, f99; +add.f32 f111, f78, f100; +sub.f32 f112, f77, f99; +sub.f32 f113, f78, f100; +sub.f32 f114, f75, f92; +add.f32 f115, f76, f91; +add.f32 f116, f75, f92; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +add.f32 f119, f80, f105; +sub.f32 f120, f79, f103; 
+sub.f32 f121, f80, f105; +add.f32 f122, %38, %60; +add.f32 f123, %40, %61; +sub.f32 f124, %38, %60; +sub.f32 f125, %40, %61; +add.f32 f126, %49, %70; +add.f32 f127, %51, %72; +sub.f32 f128, %49, %70; +sub.f32 f129, %51, %72; +add.f32 f130, f122, f126; +add.f32 f131, f123, f127; +sub.f32 f132, f122, f126; +sub.f32 f133, f123, f127; +sub.f32 f134, f124, f129; +add.f32 f135, f125, f128; +add.f32 f136, f124, f129; +sub.f32 f137, f125, f128; +add.f32 f138, %44, %65; +add.f32 f139, %45, %67; +sub.f32 f140, %44, %65; +sub.f32 f141, %45, %67; +add.f32 f142, %54, %76; +add.f32 f143, %56, %77; +sub.f32 f144, %54, %76; +sub.f32 f145, %56, %77; +add.f32 f146, f138, f142; +add.f32 f147, f139, f143; +sub.f32 f148, f138, f142; +sub.f32 f149, f139, f143; +sub.f32 f150, f140, f145; +add.f32 f151, f141, f144; +add.f32 f152, f140, f145; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 0f3F3504F3; +mul.f32 f155, f151, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f158, f152, 0fBF3504F3; +mul.f32 f159, f153, 0f3F3504F3; +sub.f32 f160, f158, f159; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +add.f32 f164, f131, f147; +sub.f32 f165, f130, f146; +sub.f32 f166, f131, f147; +add.f32 f167, f134, f156; +add.f32 f168, f135, f157; +sub.f32 f169, f134, f156; +sub.f32 f170, f135, f157; +sub.f32 f171, f132, f149; +add.f32 f172, f133, f148; +add.f32 f173, f132, f149; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +add.f32 f176, f137, f162; +sub.f32 f177, f136, f160; +sub.f32 f178, f137, f162; +mul.f32 f179, f167, 0f3F6C835E; +mul.f32 f180, f168, 0f3EC3EF15; +sub.f32 f181, f179, f180; +mul.f32 f182, f168, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f172, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f188, f175, 0f3EC3EF15; +mul.f32 f189, f176, 0f3F6C835E; +sub.f32 f190, f188, f189; +mul.f32 f191, f176, 0f3EC3EF15; 
+fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f193, f169, 0fBEC3EF15; +mul.f32 f194, f170, 0f3F6C835E; +sub.f32 f195, f193, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f198, f173, 0fBF3504F3; +mul.f32 f199, f174, 0f3F3504F3; +sub.f32 f200, f198, f199; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f203, f177, 0fBF6C835E; +mul.f32 f204, f178, 0f3EC3EF15; +sub.f32 f205, f203, f204; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f208, f106, f163; +add.f32 f209, f107, f164; +sub.f32 f210, f106, f163; +sub.f32 f211, f107, f164; +add.f32 f212, f110, f181; +add.f32 f213, f111, f183; +sub.f32 f214, f110, f181; +sub.f32 f215, f111, f183; +add.f32 f216, f114, f186; +add.f32 f217, f115, f187; +sub.f32 f218, f114, f186; +sub.f32 f219, f115, f187; +add.f32 f220, f118, f190; +add.f32 f221, f119, f192; +sub.f32 f222, f118, f190; +sub.f32 f223, f119, f192; +sub.f32 f224, f108, f166; +add.f32 f225, f109, f165; +add.f32 f226, f108, f166; +sub.f32 f227, f109, f165; +add.f32 f228, f112, f195; +add.f32 f229, f113, f197; +sub.f32 f230, f112, f195; +sub.f32 f231, f113, f197; +add.f32 f232, f116, f200; +add.f32 f233, f117, f202; +sub.f32 f234, f116, f200; +sub.f32 f235, f117, f202; +add.f32 f236, f120, f205; +add.f32 f237, f121, f207; +sub.f32 f238, f120, f205; +sub.f32 f239, f121, f207; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f240, f241}, [rd5]; +mul.f32 f244, f213, f241; +fma.rn.f32 f245, f240, f212, f244; +mul.f32 f246, f212, f241; +mul.f32 f247, f240, f213; +sub.f32 f248, f247, f246; +mul.f32 f249, f240, f240; +mul.f32 f250, f241, f241; +sub.f32 f251, f249, f250; +mul.f32 f252, f241, f240; +fma.rn.f32 f253, f241, f240, f252; +mul.f32 f254, f217, f253; +fma.rn.f32 f255, f251, f216, f254; +mul.f32 f256, f216, f253; +mul.f32 f257, f251, f217; +sub.f32 
f258, f257, f256; +mul.f32 f259, f240, f251; +mul.f32 f260, f241, f253; +sub.f32 f261, f259, f260; +mul.f32 f262, f240, f253; +fma.rn.f32 f263, f241, f251, f262; +mul.f32 f264, f221, f263; +fma.rn.f32 f265, f261, f220, f264; +mul.f32 f266, f220, f263; +mul.f32 f267, f261, f221; +sub.f32 f268, f267, f266; +mul.f32 f269, f240, f261; +mul.f32 f270, f241, f263; +sub.f32 f271, f269, f270; +mul.f32 f272, f240, f263; +fma.rn.f32 f273, f241, f261, f272; +mul.f32 f274, f225, f273; +fma.rn.f32 f275, f271, f224, f274; +mul.f32 f276, f224, f273; +mul.f32 f277, f271, f225; +sub.f32 f278, f277, f276; +mul.f32 f279, f240, f271; +mul.f32 f280, f241, f273; +sub.f32 f281, f279, f280; +mul.f32 f282, f240, f273; +fma.rn.f32 f283, f241, f271, f282; +mul.f32 f284, f229, f283; +fma.rn.f32 f285, f281, f228, f284; +mul.f32 f286, f228, f283; +mul.f32 f287, f281, f229; +sub.f32 f288, f287, f286; +mul.f32 f289, f240, f281; +mul.f32 f290, f241, f283; +sub.f32 f291, f289, f290; +mul.f32 f292, f240, f283; +fma.rn.f32 f293, f241, f281, f292; +mul.f32 f294, f233, f293; +fma.rn.f32 f295, f291, f232, f294; +mul.f32 f296, f232, f293; +mul.f32 f297, f291, f233; +sub.f32 f298, f297, f296; +mul.f32 f299, f240, f291; +mul.f32 f300, f241, f293; +sub.f32 f301, f299, f300; +mul.f32 f302, f240, f293; +fma.rn.f32 f303, f241, f291, f302; +mul.f32 f304, f237, f303; +fma.rn.f32 f305, f301, f236, f304; +mul.f32 f306, f236, f303; +mul.f32 f307, f301, f237; +sub.f32 f308, f307, f306; +mul.f32 f309, f240, f301; +mul.f32 f310, f241, f303; +sub.f32 f311, f309, f310; +mul.f32 f312, f240, f303; +fma.rn.f32 f313, f241, f301, f312; +mul.f32 f314, f211, f313; +fma.rn.f32 f315, f311, f210, f314; +mul.f32 f316, f210, f313; +mul.f32 f317, f311, f211; +sub.f32 f318, f317, f316; +mul.f32 f319, f240, f311; +mul.f32 f320, f241, f313; +sub.f32 f321, f319, f320; +mul.f32 f322, f240, f313; +fma.rn.f32 f323, f241, f311, f322; +mul.f32 f324, f215, f323; +fma.rn.f32 f325, f321, f214, f324; +mul.f32 f326, f214, f323; +mul.f32 f327, 
f321, f215; +sub.f32 f328, f327, f326; +mul.f32 f329, f240, f321; +mul.f32 f330, f241, f323; +sub.f32 f331, f329, f330; +mul.f32 f332, f240, f323; +fma.rn.f32 f333, f241, f321, f332; +mul.f32 f334, f219, f333; +fma.rn.f32 f335, f331, f218, f334; +mul.f32 f336, f218, f333; +mul.f32 f337, f331, f219; +sub.f32 f338, f337, f336; +mul.f32 f339, f240, f331; +mul.f32 f340, f241, f333; +sub.f32 f341, f339, f340; +mul.f32 f342, f240, f333; +fma.rn.f32 f343, f241, f331, f342; +mul.f32 f344, f223, f343; +fma.rn.f32 f345, f341, f222, f344; +mul.f32 f346, f222, f343; +mul.f32 f347, f341, f223; +sub.f32 f348, f347, f346; +mul.f32 f349, f240, f341; +mul.f32 f350, f241, f343; +sub.f32 f351, f349, f350; +mul.f32 f352, f240, f343; +fma.rn.f32 f353, f241, f341, f352; +mul.f32 f354, f227, f353; +fma.rn.f32 f355, f351, f226, f354; +mul.f32 f356, f226, f353; +mul.f32 f357, f351, f227; +sub.f32 f358, f357, f356; +mul.f32 f359, f240, f351; +mul.f32 f360, f241, f353; +sub.f32 f361, f359, f360; +mul.f32 f362, f240, f353; +fma.rn.f32 f363, f241, f351, f362; +mul.f32 f364, f231, f363; +fma.rn.f32 f365, f361, f230, f364; +mul.f32 f366, f230, f363; +mul.f32 f367, f361, f231; +sub.f32 f368, f367, f366; +mul.f32 f369, f240, f361; +mul.f32 f370, f241, f363; +sub.f32 f371, f369, f370; +mul.f32 f372, f240, f363; +fma.rn.f32 f373, f241, f361, f372; +mul.f32 f374, f235, f373; +fma.rn.f32 f375, f371, f234, f374; +mul.f32 f376, f234, f373; +mul.f32 f377, f371, f235; +sub.f32 f378, f377, f376; +mul.f32 f379, f240, f371; +mul.f32 f380, f241, f373; +sub.f32 f381, f379, f380; +mul.f32 f382, f240, f373; +fma.rn.f32 f383, f241, f371, f382; +mul.f32 f384, f239, f383; +fma.rn.f32 f385, f381, f238, f384; +mul.f32 f386, f238, f383; +mul.f32 f387, f381, f239; +sub.f32 f388, f387, f386; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32704; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f208, f245, f255, f265}; +st.shared.v4.f32 [r12+16], {f275, f285, f295, 
f305}; +st.shared.v4.f32 [r12+32], {f315, f325, f335, f345}; +st.shared.v4.f32 [r12+48], {f355, f365, f375, f385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -60, r12; +ld.shared.f32 f389, [r13]; +ld.shared.f32 f390, [r13+2048]; +ld.shared.f32 f391, [r13+4096]; +ld.shared.f32 f392, [r13+6144]; +ld.shared.f32 f393, [r13+8192]; +ld.shared.f32 f394, [r13+10240]; +ld.shared.f32 f395, [r13+12288]; +ld.shared.f32 f396, [r13+14336]; +ld.shared.f32 f397, [r13+16384]; +ld.shared.f32 f398, [r13+18432]; +ld.shared.f32 f399, [r13+20480]; +ld.shared.f32 f400, [r13+22528]; +ld.shared.f32 f401, [r13+24576]; +ld.shared.f32 f402, [r13+26624]; +ld.shared.f32 f403, [r13+28672]; +ld.shared.f32 f404, [r13+30720]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f209, f248, f258, f268}; +st.shared.v4.f32 [r12+16], {f278, f288, f298, f308}; +st.shared.v4.f32 [r12+32], {f318, f328, f338, f348}; +st.shared.v4.f32 [r12+48], {f358, f368, f378, f388}; +barrier.sync 0; +ld.shared.f32 f405, [r13]; +ld.shared.f32 f406, [r13+2048]; +ld.shared.f32 f407, [r13+4096]; +ld.shared.f32 f408, [r13+6144]; +ld.shared.f32 f409, [r13+8192]; +ld.shared.f32 f410, [r13+10240]; +ld.shared.f32 f411, [r13+12288]; +ld.shared.f32 f412, [r13+14336]; +ld.shared.f32 f413, [r13+16384]; +ld.shared.f32 f414, [r13+18432]; +ld.shared.f32 f415, [r13+20480]; +ld.shared.f32 f416, [r13+22528]; +ld.shared.f32 f417, [r13+24576]; +ld.shared.f32 f418, [r13+26624]; +ld.shared.f32 f419, [r13+28672]; +ld.shared.f32 f420, [r13+30720]; +add.f32 f421, f389, f397; +add.f32 f422, f405, f413; +sub.f32 f423, f389, f397; +sub.f32 f424, f405, f413; +add.f32 f425, f393, f401; +add.f32 f426, f409, f417; +sub.f32 f427, f393, f401; +sub.f32 f428, f409, f417; +add.f32 f429, f421, f425; +add.f32 f430, f422, f426; +sub.f32 f431, f421, f425; +sub.f32 f432, f422, f426; +sub.f32 f433, f423, f428; +add.f32 f434, f424, f427; +add.f32 f435, f423, f428; +sub.f32 f436, f424, f427; +add.f32 f437, f391, f399; +add.f32 f438, f407, f415; +sub.f32 f439, f391, f399; +sub.f32 
f440, f407, f415; +add.f32 f441, f395, f403; +add.f32 f442, f411, f419; +sub.f32 f443, f395, f403; +sub.f32 f444, f411, f419; +add.f32 f445, f437, f441; +add.f32 f446, f438, f442; +sub.f32 f447, f437, f441; +sub.f32 f448, f438, f442; +sub.f32 f449, f439, f444; +add.f32 f450, f440, f443; +add.f32 f451, f439, f444; +sub.f32 f452, f440, f443; +mul.f32 f453, f449, 0f3F3504F3; +mul.f32 f454, f450, 0f3F3504F3; +sub.f32 f455, f453, f454; +add.f32 f456, f453, f454; +mul.f32 f457, f451, 0fBF3504F3; +mul.f32 f458, f452, 0f3F3504F3; +sub.f32 f459, f457, f458; +mul.f32 f460, f452, 0fBF3504F3; +fma.rn.f32 f461, f451, 0f3F3504F3, f460; +add.f32 f462, f429, f445; +add.f32 f463, f430, f446; +sub.f32 f464, f429, f445; +sub.f32 f465, f430, f446; +add.f32 f466, f433, f455; +add.f32 f467, f434, f456; +sub.f32 f468, f433, f455; +sub.f32 f469, f434, f456; +sub.f32 f470, f431, f448; +add.f32 f471, f432, f447; +add.f32 f472, f431, f448; +sub.f32 f473, f432, f447; +add.f32 f474, f435, f459; +add.f32 f475, f436, f461; +sub.f32 f476, f435, f459; +sub.f32 f477, f436, f461; +add.f32 f478, f390, f398; +add.f32 f479, f406, f414; +sub.f32 f480, f390, f398; +sub.f32 f481, f406, f414; +add.f32 f482, f394, f402; +add.f32 f483, f410, f418; +sub.f32 f484, f394, f402; +sub.f32 f485, f410, f418; +add.f32 f486, f478, f482; +add.f32 f487, f479, f483; +sub.f32 f488, f478, f482; +sub.f32 f489, f479, f483; +sub.f32 f490, f480, f485; +add.f32 f491, f481, f484; +add.f32 f492, f480, f485; +sub.f32 f493, f481, f484; +add.f32 f494, f392, f400; +add.f32 f495, f408, f416; +sub.f32 f496, f392, f400; +sub.f32 f497, f408, f416; +add.f32 f498, f396, f404; +add.f32 f499, f412, f420; +sub.f32 f500, f396, f404; +sub.f32 f501, f412, f420; +add.f32 f502, f494, f498; +add.f32 f503, f495, f499; +sub.f32 f504, f494, f498; +sub.f32 f505, f495, f499; +sub.f32 f506, f496, f501; +add.f32 f507, f497, f500; +add.f32 f508, f496, f501; +sub.f32 f509, f497, f500; +mul.f32 f510, f506, 0f3F3504F3; +mul.f32 f511, f507, 0f3F3504F3; 
+sub.f32 f512, f510, f511; +add.f32 f513, f510, f511; +mul.f32 f514, f508, 0fBF3504F3; +mul.f32 f515, f509, 0f3F3504F3; +sub.f32 f516, f514, f515; +mul.f32 f517, f509, 0fBF3504F3; +fma.rn.f32 f518, f508, 0f3F3504F3, f517; +add.f32 f519, f486, f502; +add.f32 f520, f487, f503; +sub.f32 f521, f486, f502; +sub.f32 f522, f487, f503; +add.f32 f523, f490, f512; +add.f32 f524, f491, f513; +sub.f32 f525, f490, f512; +sub.f32 f526, f491, f513; +sub.f32 f527, f488, f505; +add.f32 f528, f489, f504; +add.f32 f529, f488, f505; +sub.f32 f530, f489, f504; +add.f32 f531, f492, f516; +add.f32 f532, f493, f518; +sub.f32 f533, f492, f516; +sub.f32 f534, f493, f518; +mul.f32 f535, f523, 0f3F6C835E; +mul.f32 f536, f524, 0f3EC3EF15; +sub.f32 f537, f535, f536; +mul.f32 f538, f524, 0f3F6C835E; +fma.rn.f32 f539, f523, 0f3EC3EF15, f538; +mul.f32 f540, f527, 0f3F3504F3; +mul.f32 f541, f528, 0f3F3504F3; +sub.f32 f542, f540, f541; +add.f32 f543, f540, f541; +mul.f32 f544, f531, 0f3EC3EF15; +mul.f32 f545, f532, 0f3F6C835E; +sub.f32 f546, f544, f545; +mul.f32 f547, f532, 0f3EC3EF15; +fma.rn.f32 f548, f531, 0f3F6C835E, f547; +mul.f32 f549, f525, 0fBEC3EF15; +mul.f32 f550, f526, 0f3F6C835E; +sub.f32 f551, f549, f550; +mul.f32 f552, f526, 0fBEC3EF15; +fma.rn.f32 f553, f525, 0f3F6C835E, f552; +mul.f32 f554, f529, 0fBF3504F3; +mul.f32 f555, f530, 0f3F3504F3; +sub.f32 f556, f554, f555; +mul.f32 f557, f530, 0fBF3504F3; +fma.rn.f32 f558, f529, 0f3F3504F3, f557; +mul.f32 f559, f533, 0fBF6C835E; +mul.f32 f560, f534, 0f3EC3EF15; +sub.f32 f561, f559, f560; +mul.f32 f562, f534, 0fBF6C835E; +fma.rn.f32 f563, f533, 0f3EC3EF15, f562; +add.f32 f564, f462, f519; +add.f32 f565, f463, f520; +sub.f32 f566, f462, f519; +sub.f32 f567, f463, f520; +add.f32 f568, f466, f537; +add.f32 f569, f467, f539; +sub.f32 f570, f466, f537; +sub.f32 f571, f467, f539; +add.f32 f572, f470, f542; +add.f32 f573, f471, f543; +sub.f32 f574, f470, f542; +sub.f32 f575, f471, f543; +add.f32 f576, f474, f546; +add.f32 f577, f475, f548; 
+sub.f32 f578, f474, f546; +sub.f32 f579, f475, f548; +sub.f32 f580, f464, f522; +add.f32 f581, f465, f521; +add.f32 f582, f464, f522; +sub.f32 f583, f465, f521; +add.f32 f584, f468, f551; +add.f32 f585, f469, f553; +sub.f32 f586, f468, f551; +sub.f32 f587, f469, f553; +add.f32 f588, f472, f556; +add.f32 f589, f473, f558; +sub.f32 f590, f472, f556; +sub.f32 f591, f473, f558; +add.f32 f592, f476, f561; +add.f32 f593, f477, f563; +sub.f32 f594, f476, f561; +sub.f32 f595, f477, f563; +and.b32 r14, r5, 496; +bfe.u32 r15, r5, 4, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f596, f597}, [rd8]; +mul.f32 f600, f569, f597; +fma.rn.f32 f601, f596, f568, f600; +mul.f32 f602, f568, f597; +mul.f32 f603, f596, f569; +sub.f32 f604, f603, f602; +mul.f32 f605, f596, f596; +mul.f32 f606, f597, f597; +sub.f32 f607, f605, f606; +mul.f32 f608, f597, f596; +fma.rn.f32 f609, f597, f596, f608; +mul.f32 f610, f573, f609; +fma.rn.f32 f611, f607, f572, f610; +mul.f32 f612, f572, f609; +mul.f32 f613, f607, f573; +sub.f32 f614, f613, f612; +mul.f32 f615, f596, f607; +mul.f32 f616, f597, f609; +sub.f32 f617, f615, f616; +mul.f32 f618, f596, f609; +fma.rn.f32 f619, f597, f607, f618; +mul.f32 f620, f577, f619; +fma.rn.f32 f621, f617, f576, f620; +mul.f32 f622, f576, f619; +mul.f32 f623, f617, f577; +sub.f32 f624, f623, f622; +mul.f32 f625, f596, f617; +mul.f32 f626, f597, f619; +sub.f32 f627, f625, f626; +mul.f32 f628, f596, f619; +fma.rn.f32 f629, f597, f617, f628; +mul.f32 f630, f581, f629; +fma.rn.f32 f631, f627, f580, f630; +mul.f32 f632, f580, f629; +mul.f32 f633, f627, f581; +sub.f32 f634, f633, f632; +mul.f32 f635, f596, f627; +mul.f32 f636, f597, f629; +sub.f32 f637, f635, f636; +mul.f32 f638, f596, f629; +fma.rn.f32 f639, f597, f627, f638; +mul.f32 f640, f585, f639; +fma.rn.f32 f641, f637, f584, f640; +mul.f32 f642, f584, f639; +mul.f32 f643, f637, f585; +sub.f32 f644, f643, f642; +mul.f32 f645, f596, f637; +mul.f32 f646, f597, f639; 
+sub.f32 f647, f645, f646; +mul.f32 f648, f596, f639; +fma.rn.f32 f649, f597, f637, f648; +mul.f32 f650, f589, f649; +fma.rn.f32 f651, f647, f588, f650; +mul.f32 f652, f588, f649; +mul.f32 f653, f647, f589; +sub.f32 f654, f653, f652; +mul.f32 f655, f596, f647; +mul.f32 f656, f597, f649; +sub.f32 f657, f655, f656; +mul.f32 f658, f596, f649; +fma.rn.f32 f659, f597, f647, f658; +mul.f32 f660, f593, f659; +fma.rn.f32 f661, f657, f592, f660; +mul.f32 f662, f592, f659; +mul.f32 f663, f657, f593; +sub.f32 f664, f663, f662; +mul.f32 f665, f596, f657; +mul.f32 f666, f597, f659; +sub.f32 f667, f665, f666; +mul.f32 f668, f596, f659; +fma.rn.f32 f669, f597, f657, f668; +mul.f32 f670, f567, f669; +fma.rn.f32 f671, f667, f566, f670; +mul.f32 f672, f566, f669; +mul.f32 f673, f667, f567; +sub.f32 f674, f673, f672; +mul.f32 f675, f596, f667; +mul.f32 f676, f597, f669; +sub.f32 f677, f675, f676; +mul.f32 f678, f596, f669; +fma.rn.f32 f679, f597, f667, f678; +mul.f32 f680, f571, f679; +fma.rn.f32 f681, f677, f570, f680; +mul.f32 f682, f570, f679; +mul.f32 f683, f677, f571; +sub.f32 f684, f683, f682; +mul.f32 f685, f596, f677; +mul.f32 f686, f597, f679; +sub.f32 f687, f685, f686; +mul.f32 f688, f596, f679; +fma.rn.f32 f689, f597, f677, f688; +mul.f32 f690, f575, f689; +fma.rn.f32 f691, f687, f574, f690; +mul.f32 f692, f574, f689; +mul.f32 f693, f687, f575; +sub.f32 f694, f693, f692; +mul.f32 f695, f596, f687; +mul.f32 f696, f597, f689; +sub.f32 f697, f695, f696; +mul.f32 f698, f596, f689; +fma.rn.f32 f699, f597, f687, f698; +mul.f32 f700, f579, f699; +fma.rn.f32 f701, f697, f578, f700; +mul.f32 f702, f578, f699; +mul.f32 f703, f697, f579; +sub.f32 f704, f703, f702; +mul.f32 f705, f596, f697; +mul.f32 f706, f597, f699; +sub.f32 f707, f705, f706; +mul.f32 f708, f596, f699; +fma.rn.f32 f709, f597, f697, f708; +mul.f32 f710, f583, f709; +fma.rn.f32 f711, f707, f582, f710; +mul.f32 f712, f582, f709; +mul.f32 f713, f707, f583; +sub.f32 f714, f713, f712; +mul.f32 f715, f596, f707; +mul.f32 
f716, f597, f709; +sub.f32 f717, f715, f716; +mul.f32 f718, f596, f709; +fma.rn.f32 f719, f597, f707, f718; +mul.f32 f720, f587, f719; +fma.rn.f32 f721, f717, f586, f720; +mul.f32 f722, f586, f719; +mul.f32 f723, f717, f587; +sub.f32 f724, f723, f722; +mul.f32 f725, f596, f717; +mul.f32 f726, f597, f719; +sub.f32 f727, f725, f726; +mul.f32 f728, f596, f719; +fma.rn.f32 f729, f597, f717, f728; +mul.f32 f730, f591, f729; +fma.rn.f32 f731, f727, f590, f730; +mul.f32 f732, f590, f729; +mul.f32 f733, f727, f591; +sub.f32 f734, f733, f732; +mul.f32 f735, f596, f727; +mul.f32 f736, f597, f729; +sub.f32 f737, f735, f736; +mul.f32 f738, f596, f729; +fma.rn.f32 f739, f597, f727, f738; +mul.f32 f740, f595, f739; +fma.rn.f32 f741, f737, f594, f740; +mul.f32 f742, f594, f739; +mul.f32 f743, f737, f595; +sub.f32 f744, f743, f742; +shl.b32 r16, r5, 2; +and.b32 r17, r16, 60; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 31744; +add.s32 r20, r18, r19; +st.shared.f32 [r20], f564; +st.shared.f32 [r20+64], f601; +st.shared.f32 [r20+128], f611; +st.shared.f32 [r20+192], f621; +st.shared.f32 [r20+256], f631; +st.shared.f32 [r20+320], f641; +st.shared.f32 [r20+384], f651; +st.shared.f32 [r20+448], f661; +st.shared.f32 [r20+512], f671; +st.shared.f32 [r20+576], f681; +st.shared.f32 [r20+640], f691; +st.shared.f32 [r20+704], f701; +st.shared.f32 [r20+768], f711; +st.shared.f32 [r20+832], f721; +st.shared.f32 [r20+896], f731; +st.shared.f32 [r20+960], f741; +barrier.sync 0; +mad.lo.s32 r21, r14, -60, r20; +ld.shared.f32 f745, [r21]; +ld.shared.f32 f746, [r21+2048]; +ld.shared.f32 f747, [r21+4096]; +ld.shared.f32 f748, [r21+6144]; +ld.shared.f32 f749, [r21+8192]; +ld.shared.f32 f750, [r21+10240]; +ld.shared.f32 f751, [r21+12288]; +ld.shared.f32 f752, [r21+14336]; +ld.shared.f32 f753, [r21+16384]; +ld.shared.f32 f754, [r21+18432]; +ld.shared.f32 f755, [r21+20480]; +ld.shared.f32 f756, [r21+22528]; +ld.shared.f32 f757, [r21+24576]; +ld.shared.f32 f758, [r21+26624]; +ld.shared.f32 
f759, [r21+28672]; +ld.shared.f32 f760, [r21+30720]; +barrier.sync 0; +st.shared.f32 [r20], f565; +st.shared.f32 [r20+64], f604; +st.shared.f32 [r20+128], f614; +st.shared.f32 [r20+192], f624; +st.shared.f32 [r20+256], f634; +st.shared.f32 [r20+320], f644; +st.shared.f32 [r20+384], f654; +st.shared.f32 [r20+448], f664; +st.shared.f32 [r20+512], f674; +st.shared.f32 [r20+576], f684; +st.shared.f32 [r20+640], f694; +st.shared.f32 [r20+704], f704; +st.shared.f32 [r20+768], f714; +st.shared.f32 [r20+832], f724; +st.shared.f32 [r20+896], f734; +st.shared.f32 [r20+960], f744; +barrier.sync 0; +ld.shared.f32 f761, [r21]; +ld.shared.f32 f762, [r21+2048]; +ld.shared.f32 f763, [r21+4096]; +ld.shared.f32 f764, [r21+6144]; +ld.shared.f32 f765, [r21+8192]; +ld.shared.f32 f766, [r21+10240]; +ld.shared.f32 f767, [r21+12288]; +ld.shared.f32 f768, [r21+14336]; +ld.shared.f32 f769, [r21+16384]; +ld.shared.f32 f770, [r21+18432]; +ld.shared.f32 f771, [r21+20480]; +ld.shared.f32 f772, [r21+22528]; +ld.shared.f32 f773, [r21+24576]; +ld.shared.f32 f774, [r21+26624]; +ld.shared.f32 f775, [r21+28672]; +ld.shared.f32 f776, [r21+30720]; +add.f32 f777, f745, f753; +add.f32 f778, f761, f769; +sub.f32 f779, f745, f753; +sub.f32 f780, f761, f769; +add.f32 f781, f749, f757; +add.f32 f782, f765, f773; +sub.f32 f783, f749, f757; +sub.f32 f784, f765, f773; +add.f32 f785, f777, f781; +add.f32 f786, f778, f782; +sub.f32 f787, f777, f781; +sub.f32 f788, f778, f782; +sub.f32 f789, f779, f784; +add.f32 f790, f780, f783; +add.f32 f791, f779, f784; +sub.f32 f792, f780, f783; +add.f32 f793, f747, f755; +add.f32 f794, f763, f771; +sub.f32 f795, f747, f755; +sub.f32 f796, f763, f771; +add.f32 f797, f751, f759; +add.f32 f798, f767, f775; +sub.f32 f799, f751, f759; +sub.f32 f800, f767, f775; +add.f32 f801, f793, f797; +add.f32 f802, f794, f798; +sub.f32 f803, f793, f797; +sub.f32 f804, f794, f798; +sub.f32 f805, f795, f800; +add.f32 f806, f796, f799; +add.f32 f807, f795, f800; +sub.f32 f808, f796, f799; 
+mul.f32 f809, f805, 0f3F3504F3; +mul.f32 f810, f806, 0f3F3504F3; +sub.f32 f811, f809, f810; +add.f32 f812, f809, f810; +mul.f32 f813, f807, 0fBF3504F3; +mul.f32 f814, f808, 0f3F3504F3; +sub.f32 f815, f813, f814; +mul.f32 f816, f808, 0fBF3504F3; +fma.rn.f32 f817, f807, 0f3F3504F3, f816; +add.f32 f818, f785, f801; +add.f32 f819, f786, f802; +sub.f32 f820, f785, f801; +sub.f32 f821, f786, f802; +add.f32 f822, f789, f811; +add.f32 f823, f790, f812; +sub.f32 f824, f789, f811; +sub.f32 f825, f790, f812; +sub.f32 f826, f787, f804; +add.f32 f827, f788, f803; +add.f32 f828, f787, f804; +sub.f32 f829, f788, f803; +add.f32 f830, f791, f815; +add.f32 f831, f792, f817; +sub.f32 f832, f791, f815; +sub.f32 f833, f792, f817; +add.f32 f834, f746, f754; +add.f32 f835, f762, f770; +sub.f32 f836, f746, f754; +sub.f32 f837, f762, f770; +add.f32 f838, f750, f758; +add.f32 f839, f766, f774; +sub.f32 f840, f750, f758; +sub.f32 f841, f766, f774; +add.f32 f842, f834, f838; +add.f32 f843, f835, f839; +sub.f32 f844, f834, f838; +sub.f32 f845, f835, f839; +sub.f32 f846, f836, f841; +add.f32 f847, f837, f840; +add.f32 f848, f836, f841; +sub.f32 f849, f837, f840; +add.f32 f850, f748, f756; +add.f32 f851, f764, f772; +sub.f32 f852, f748, f756; +sub.f32 f853, f764, f772; +add.f32 f854, f752, f760; +add.f32 f855, f768, f776; +sub.f32 f856, f752, f760; +sub.f32 f857, f768, f776; +add.f32 f858, f850, f854; +add.f32 f859, f851, f855; +sub.f32 f860, f850, f854; +sub.f32 f861, f851, f855; +sub.f32 f862, f852, f857; +add.f32 f863, f853, f856; +add.f32 f864, f852, f857; +sub.f32 f865, f853, f856; +mul.f32 f866, f862, 0f3F3504F3; +mul.f32 f867, f863, 0f3F3504F3; +sub.f32 f868, f866, f867; +add.f32 f869, f866, f867; +mul.f32 f870, f864, 0fBF3504F3; +mul.f32 f871, f865, 0f3F3504F3; +sub.f32 f872, f870, f871; +mul.f32 f873, f865, 0fBF3504F3; +fma.rn.f32 f874, f864, 0f3F3504F3, f873; +add.f32 f875, f842, f858; +add.f32 f876, f843, f859; +sub.f32 f877, f842, f858; +sub.f32 f878, f843, f859; +add.f32 f879, 
f846, f868; +add.f32 f880, f847, f869; +sub.f32 f881, f846, f868; +sub.f32 f882, f847, f869; +sub.f32 f883, f844, f861; +add.f32 f884, f845, f860; +add.f32 f885, f844, f861; +sub.f32 f886, f845, f860; +add.f32 f887, f848, f872; +add.f32 f888, f849, f874; +sub.f32 f889, f848, f872; +sub.f32 f890, f849, f874; +mul.f32 f891, f879, 0f3F6C835E; +mul.f32 f892, f880, 0f3EC3EF15; +sub.f32 f893, f891, f892; +mul.f32 f894, f880, 0f3F6C835E; +fma.rn.f32 f895, f879, 0f3EC3EF15, f894; +mul.f32 f896, f883, 0f3F3504F3; +mul.f32 f897, f884, 0f3F3504F3; +sub.f32 f898, f896, f897; +add.f32 f899, f896, f897; +mul.f32 f900, f887, 0f3EC3EF15; +mul.f32 f901, f888, 0f3F6C835E; +sub.f32 f902, f900, f901; +mul.f32 f903, f888, 0f3EC3EF15; +fma.rn.f32 f904, f887, 0f3F6C835E, f903; +mul.f32 f905, f881, 0fBEC3EF15; +mul.f32 f906, f882, 0f3F6C835E; +sub.f32 f907, f905, f906; +mul.f32 f908, f882, 0fBEC3EF15; +fma.rn.f32 f909, f881, 0f3F6C835E, f908; +mul.f32 f910, f885, 0fBF3504F3; +mul.f32 f911, f886, 0f3F3504F3; +sub.f32 f912, f910, f911; +mul.f32 f913, f886, 0fBF3504F3; +fma.rn.f32 f914, f885, 0f3F3504F3, f913; +mul.f32 f915, f889, 0fBF6C835E; +mul.f32 f916, f890, 0f3EC3EF15; +sub.f32 f917, f915, f916; +mul.f32 f918, f890, 0fBF6C835E; +fma.rn.f32 f919, f889, 0f3EC3EF15, f918; +add.f32 f920, f818, f875; +add.f32 f921, f819, f876; +sub.f32 f922, f818, f875; +sub.f32 f923, f819, f876; +add.f32 f924, f822, f893; +add.f32 f925, f823, f895; +sub.f32 f926, f822, f893; +sub.f32 f927, f823, f895; +add.f32 f928, f826, f898; +add.f32 f929, f827, f899; +sub.f32 f930, f826, f898; +sub.f32 f931, f827, f899; +add.f32 f932, f830, f902; +add.f32 f933, f831, f904; +sub.f32 f934, f830, f902; +sub.f32 f935, f831, f904; +sub.f32 f936, f820, f878; +add.f32 f937, f821, f877; +add.f32 f938, f820, f878; +sub.f32 f939, f821, f877; +add.f32 f940, f824, f907; +add.f32 f941, f825, f909; +sub.f32 f942, f824, f907; +sub.f32 f943, f825, f909; +add.f32 f944, f828, f912; +add.f32 f945, f829, f914; +sub.f32 f946, f828, f912; 
+sub.f32 f947, f829, f914; +add.f32 f948, f832, f917; +add.f32 f949, f833, f919; +sub.f32 f950, f832, f917; +sub.f32 f951, f833, f919; +and.b32 r22, r5, 256; +bfe.u32 r23, r5, 8, 1; +mul.wide.u32 rd9, r23, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f952, f953}, [rd11]; +mul.f32 f956, f925, f953; +fma.rn.f32 f957, f952, f924, f956; +mul.f32 f958, f924, f953; +mul.f32 f959, f952, f925; +sub.f32 f960, f959, f958; +mul.f32 f961, f952, f952; +mul.f32 f962, f953, f953; +sub.f32 f963, f961, f962; +mul.f32 f964, f953, f952; +fma.rn.f32 f965, f953, f952, f964; +mul.f32 f966, f929, f965; +fma.rn.f32 f967, f963, f928, f966; +mul.f32 f968, f928, f965; +mul.f32 f969, f963, f929; +sub.f32 f970, f969, f968; +mul.f32 f971, f952, f963; +mul.f32 f972, f953, f965; +sub.f32 f973, f971, f972; +mul.f32 f974, f952, f965; +fma.rn.f32 f975, f953, f963, f974; +mul.f32 f976, f933, f975; +fma.rn.f32 f977, f973, f932, f976; +mul.f32 f978, f932, f975; +mul.f32 f979, f973, f933; +sub.f32 f980, f979, f978; +mul.f32 f981, f952, f973; +mul.f32 f982, f953, f975; +sub.f32 f983, f981, f982; +mul.f32 f984, f952, f975; +fma.rn.f32 f985, f953, f973, f984; +mul.f32 f986, f937, f985; +fma.rn.f32 f987, f983, f936, f986; +mul.f32 f988, f936, f985; +mul.f32 f989, f983, f937; +sub.f32 f990, f989, f988; +mul.f32 f991, f952, f983; +mul.f32 f992, f953, f985; +sub.f32 f993, f991, f992; +mul.f32 f994, f952, f985; +fma.rn.f32 f995, f953, f983, f994; +mul.f32 f996, f941, f995; +fma.rn.f32 f997, f993, f940, f996; +mul.f32 f998, f940, f995; +mul.f32 f999, f993, f941; +sub.f32 f1000, f999, f998; +mul.f32 f1001, f952, f993; +mul.f32 f1002, f953, f995; +sub.f32 f1003, f1001, f1002; +mul.f32 f1004, f952, f995; +fma.rn.f32 f1005, f953, f993, f1004; +mul.f32 f1006, f945, f1005; +fma.rn.f32 f1007, f1003, f944, f1006; +mul.f32 f1008, f944, f1005; +mul.f32 f1009, f1003, f945; +sub.f32 f1010, f1009, f1008; +mul.f32 f1011, f952, f1003; +mul.f32 f1012, f953, f1005; +sub.f32 f1013, f1011, f1012; +mul.f32 
f1014, f952, f1005; +fma.rn.f32 f1015, f953, f1003, f1014; +mul.f32 f1016, f949, f1015; +fma.rn.f32 f1017, f1013, f948, f1016; +mul.f32 f1018, f948, f1015; +mul.f32 f1019, f1013, f949; +sub.f32 f1020, f1019, f1018; +mul.f32 f1021, f952, f1013; +mul.f32 f1022, f953, f1015; +sub.f32 f1023, f1021, f1022; +mul.f32 f1024, f952, f1015; +fma.rn.f32 f1025, f953, f1013, f1024; +mul.f32 f1026, f923, f1025; +fma.rn.f32 f1027, f1023, f922, f1026; +mul.f32 f1028, f922, f1025; +mul.f32 f1029, f1023, f923; +sub.f32 f1030, f1029, f1028; +mul.f32 f1031, f952, f1023; +mul.f32 f1032, f953, f1025; +sub.f32 f1033, f1031, f1032; +mul.f32 f1034, f952, f1025; +fma.rn.f32 f1035, f953, f1023, f1034; +mul.f32 f1036, f927, f1035; +fma.rn.f32 f1037, f1033, f926, f1036; +mul.f32 f1038, f926, f1035; +mul.f32 f1039, f1033, f927; +sub.f32 f1040, f1039, f1038; +mul.f32 f1041, f952, f1033; +mul.f32 f1042, f953, f1035; +sub.f32 f1043, f1041, f1042; +mul.f32 f1044, f952, f1035; +fma.rn.f32 f1045, f953, f1033, f1044; +mul.f32 f1046, f931, f1045; +fma.rn.f32 f1047, f1043, f930, f1046; +mul.f32 f1048, f930, f1045; +mul.f32 f1049, f1043, f931; +sub.f32 f1050, f1049, f1048; +mul.f32 f1051, f952, f1043; +mul.f32 f1052, f953, f1045; +sub.f32 f1053, f1051, f1052; +mul.f32 f1054, f952, f1045; +fma.rn.f32 f1055, f953, f1043, f1054; +mul.f32 f1056, f935, f1055; +fma.rn.f32 f1057, f1053, f934, f1056; +mul.f32 f1058, f934, f1055; +mul.f32 f1059, f1053, f935; +sub.f32 f1060, f1059, f1058; +mul.f32 f1061, f952, f1053; +mul.f32 f1062, f953, f1055; +sub.f32 f1063, f1061, f1062; +mul.f32 f1064, f952, f1055; +fma.rn.f32 f1065, f953, f1053, f1064; +mul.f32 f1066, f939, f1065; +fma.rn.f32 f1067, f1063, f938, f1066; +mul.f32 f1068, f938, f1065; +mul.f32 f1069, f1063, f939; +sub.f32 f1070, f1069, f1068; +mul.f32 f1071, f952, f1063; +mul.f32 f1072, f953, f1065; +sub.f32 f1073, f1071, f1072; +mul.f32 f1074, f952, f1065; +fma.rn.f32 f1075, f953, f1063, f1074; +mul.f32 f1076, f943, f1075; +fma.rn.f32 f1077, f1073, f942, f1076; 
+mul.f32 f1078, f942, f1075; +mul.f32 f1079, f1073, f943; +sub.f32 f1080, f1079, f1078; +mul.f32 f1081, f952, f1073; +mul.f32 f1082, f953, f1075; +sub.f32 f1083, f1081, f1082; +mul.f32 f1084, f952, f1075; +fma.rn.f32 f1085, f953, f1073, f1084; +mul.f32 f1086, f947, f1085; +fma.rn.f32 f1087, f1083, f946, f1086; +mul.f32 f1088, f946, f1085; +mul.f32 f1089, f1083, f947; +sub.f32 f1090, f1089, f1088; +mul.f32 f1091, f952, f1083; +mul.f32 f1092, f953, f1085; +sub.f32 f1093, f1091, f1092; +mul.f32 f1094, f952, f1085; +fma.rn.f32 f1095, f953, f1083, f1094; +mul.f32 f1096, f951, f1095; +fma.rn.f32 f1097, f1093, f950, f1096; +mul.f32 f1098, f950, f1095; +mul.f32 f1099, f1093, f951; +sub.f32 f1100, f1099, f1098; +and.b32 r24, r16, 1020; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 16384; +add.s32 r27, r25, r26; +st.shared.f32 [r27], f920; +st.shared.f32 [r27+1024], f957; +st.shared.f32 [r27+2048], f967; +st.shared.f32 [r27+3072], f977; +st.shared.f32 [r27+4096], f987; +st.shared.f32 [r27+5120], f997; +st.shared.f32 [r27+6144], f1007; +st.shared.f32 [r27+7168], f1017; +st.shared.f32 [r27+8192], f1027; +st.shared.f32 [r27+9216], f1037; +st.shared.f32 [r27+10240], f1047; +st.shared.f32 [r27+11264], f1057; +st.shared.f32 [r27+12288], f1067; +st.shared.f32 [r27+13312], f1077; +st.shared.f32 [r27+14336], f1087; +st.shared.f32 [r27+15360], f1097; +barrier.sync 0; +mad.lo.s32 r28, r22, -60, r27; +ld.shared.f32 f1101, [r28]; +ld.shared.f32 f1102, [r28+2048]; +ld.shared.f32 f1103, [r28+4096]; +ld.shared.f32 f1104, [r28+6144]; +ld.shared.f32 f1105, [r28+8192]; +ld.shared.f32 f1106, [r28+10240]; +ld.shared.f32 f1107, [r28+12288]; +ld.shared.f32 f1108, [r28+14336]; +ld.shared.f32 f1109, [r28+16384]; +ld.shared.f32 f1110, [r28+18432]; +ld.shared.f32 f1111, [r28+20480]; +ld.shared.f32 f1112, [r28+22528]; +ld.shared.f32 f1113, [r28+24576]; +ld.shared.f32 f1114, [r28+26624]; +ld.shared.f32 f1115, [r28+28672]; +ld.shared.f32 f1116, [r28+30720]; +barrier.sync 0; +st.shared.f32 
[r27], f921; +st.shared.f32 [r27+1024], f960; +st.shared.f32 [r27+2048], f970; +st.shared.f32 [r27+3072], f980; +st.shared.f32 [r27+4096], f990; +st.shared.f32 [r27+5120], f1000; +st.shared.f32 [r27+6144], f1010; +st.shared.f32 [r27+7168], f1020; +st.shared.f32 [r27+8192], f1030; +st.shared.f32 [r27+9216], f1040; +st.shared.f32 [r27+10240], f1050; +st.shared.f32 [r27+11264], f1060; +st.shared.f32 [r27+12288], f1070; +st.shared.f32 [r27+13312], f1080; +st.shared.f32 [r27+14336], f1090; +st.shared.f32 [r27+15360], f1100; +barrier.sync 0; +ld.shared.f32 f1117, [r28]; +ld.shared.f32 f1118, [r28+2048]; +ld.shared.f32 f1119, [r28+4096]; +ld.shared.f32 f1120, [r28+6144]; +ld.shared.f32 f1121, [r28+8192]; +ld.shared.f32 f1122, [r28+10240]; +ld.shared.f32 f1123, [r28+12288]; +ld.shared.f32 f1124, [r28+14336]; +ld.shared.f32 f1125, [r28+16384]; +ld.shared.f32 f1126, [r28+18432]; +ld.shared.f32 f1127, [r28+20480]; +ld.shared.f32 f1128, [r28+22528]; +ld.shared.f32 f1129, [r28+24576]; +ld.shared.f32 f1130, [r28+26624]; +ld.shared.f32 f1131, [r28+28672]; +ld.shared.f32 f1132, [r28+30720]; +add.f32 %0, f1101, f1109; +add.f32 %1, f1117, f1125; +add.f32 %2, f1102, f1110; +add.f32 %3, f1118, f1126; +add.f32 %4, f1103, f1111; +add.f32 %5, f1119, f1127; +add.f32 %6, f1104, f1112; +add.f32 %7, f1120, f1128; +add.f32 %8, f1105, f1113; +add.f32 %9, f1121, f1129; +add.f32 %10, f1106, f1114; +add.f32 %11, f1122, f1130; +add.f32 %12, f1107, f1115; +add.f32 %13, f1123, f1131; +add.f32 %14, f1108, f1116; +add.f32 %15, f1124, f1132; +sub.f32 %16, f1101, f1109; +sub.f32 %17, f1117, f1125; +sub.f32 %18, f1102, f1110; +sub.f32 %19, f1118, f1126; +sub.f32 %20, f1103, f1111; +sub.f32 %21, f1119, f1127; +sub.f32 %22, f1104, f1112; +sub.f32 %23, f1120, f1128; +sub.f32 %24, f1105, f1113; +sub.f32 %25, f1121, f1129; +sub.f32 %26, f1106, f1114; +sub.f32 %27, f1122, f1130; +sub.f32 %28, f1107, f1115; +sub.f32 %29, f1123, f1131; +sub.f32 %30, f1108, f1116; +sub.f32 %31, f1124, f1132; +})" + : 
"=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_8192), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<315, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1520>; +.reg .b32 r<60>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f32 f65, %36, %52; +sub.f32 f67, %36, %52; +add.f32 f1507, %37, %68; +sub.f32 f68, %37, %68; +add.f32 f69, %44, %60; +sub.f32 f71, %44, %60; +add.f32 f1505, %69, %61; +sub.f32 f72, %69, %61; +add.f32 f73, f65, f69; +sub.f32 f75, f65, f69; +add.f32 f1504, f1507, f1505; +sub.f32 f76, f1507, f1505; +sub.f32 f77, f67, f72; +add.f32 f79, f67, f72; +add.f32 f1503, f68, f71; 
+sub.f32 f80, f68, f71; +add.f32 f81, %40, %56; +sub.f32 f83, %40, %56; +add.f32 f1500, %71, %70; +sub.f32 f84, %71, %70; +add.f32 f85, %48, %64; +sub.f32 f87, %48, %64; +add.f32 f1498, %49, %72; +sub.f32 f88, %49, %72; +add.f32 f89, f81, f85; +sub.f32 f91, f81, f85; +add.f32 f1497, f1500, f1498; +sub.f32 f92, f1500, f1498; +sub.f32 f93, f83, f88; +add.f32 f95, f83, f88; +add.f32 f1496, f84, f87; +sub.f32 f96, f84, f87; +mul.f32 f97, f93, 0f3F3504F3; +mul.f32 f98, f1496, 0f3F3504F3; +sub.f32 f99, f97, f98; +add.f32 f100, f97, f98; +mul.f32 f1494, f95, 0fBF3504F3; +mul.f32 f1495, f96, 0f3F3504F3; +sub.f32 f103, f1494, f1495; +mul.f32 f104, f96, 0fBF3504F3; +fma.rn.f32 f105, f95, 0f3F3504F3, f104; +add.f32 f106, f73, f89; +sub.f32 f108, f73, f89; +add.f32 f1493, f1504, f1497; +sub.f32 f109, f1504, f1497; +add.f32 f110, f77, f99; +sub.f32 f112, f77, f99; +add.f32 f1492, f1503, f100; +sub.f32 f113, f1503, f100; +sub.f32 f114, f75, f92; +add.f32 f116, f75, f92; +add.f32 f1491, f76, f91; +sub.f32 f117, f76, f91; +add.f32 f118, f79, f103; +sub.f32 f120, f79, f103; +add.f32 f1490, f80, f105; +sub.f32 f121, f80, f105; +add.f32 f122, %38, %54; +sub.f32 f124, %38, %54; +add.f32 f1488, %73, %55; +sub.f32 f125, %73, %55; +add.f32 f126, %46, %62; +sub.f32 f128, %46, %62; +add.f32 f1485, %74, %75; +sub.f32 f129, %74, %75; +add.f32 f130, f122, f126; +sub.f32 f132, f122, f126; +add.f32 f1484, f1488, f1485; +sub.f32 f133, f1488, f1485; +sub.f32 f134, f124, f129; +add.f32 f136, f124, f129; +add.f32 f1483, f125, f128; +sub.f32 f137, f125, f128; +add.f32 f138, %42, %58; +sub.f32 f140, %42, %58; +add.f32 f1481, %43, %76; +sub.f32 f141, %43, %76; +add.f32 f142, %50, %66; +sub.f32 f144, %50, %66; +add.f32 f1479, %77, %67; +sub.f32 f145, %77, %67; +add.f32 f146, f138, f142; +sub.f32 f148, f138, f142; +add.f32 f1478, f1481, f1479; +sub.f32 f149, f1481, f1479; +sub.f32 f150, f140, f145; +add.f32 f152, f140, f145; +add.f32 f1477, f141, f144; +sub.f32 f153, f141, f144; +mul.f32 f154, f150, 
0f3F3504F3; +mul.f32 f155, f1477, 0f3F3504F3; +sub.f32 f156, f154, f155; +add.f32 f157, f154, f155; +mul.f32 f1475, f152, 0fBF3504F3; +mul.f32 f1476, f153, 0f3F3504F3; +sub.f32 f160, f1475, f1476; +mul.f32 f161, f153, 0fBF3504F3; +fma.rn.f32 f162, f152, 0f3F3504F3, f161; +add.f32 f163, f130, f146; +sub.f32 f165, f130, f146; +add.f32 f1474, f1484, f1478; +sub.f32 f166, f1484, f1478; +add.f32 f167, f134, f156; +sub.f32 f169, f134, f156; +add.f32 f1473, f1483, f157; +sub.f32 f170, f1483, f157; +sub.f32 f171, f132, f149; +add.f32 f173, f132, f149; +add.f32 f1472, f133, f148; +sub.f32 f174, f133, f148; +add.f32 f175, f136, f160; +sub.f32 f177, f136, f160; +add.f32 f1471, f137, f162; +sub.f32 f178, f137, f162; +mul.f32 f1469, f167, 0f3F6C835E; +mul.f32 f1470, f1473, 0f3EC3EF15; +sub.f32 f181, f1469, f1470; +mul.f32 f182, f1473, 0f3F6C835E; +fma.rn.f32 f183, f167, 0f3EC3EF15, f182; +mul.f32 f184, f171, 0f3F3504F3; +mul.f32 f185, f1472, 0f3F3504F3; +sub.f32 f186, f184, f185; +add.f32 f187, f184, f185; +mul.f32 f189, f1471, 0f3F6C835E; +mul.f32 f1468, f175, 0f3EC3EF15; +sub.f32 f190, f1468, f189; +mul.f32 f191, f1471, 0f3EC3EF15; +fma.rn.f32 f192, f175, 0f3F6C835E, f191; +mul.f32 f194, f170, 0f3F6C835E; +mul.f32 f1467, f169, 0fBEC3EF15; +sub.f32 f195, f1467, f194; +mul.f32 f196, f170, 0fBEC3EF15; +fma.rn.f32 f197, f169, 0f3F6C835E, f196; +mul.f32 f1465, f173, 0fBF3504F3; +mul.f32 f1466, f174, 0f3F3504F3; +sub.f32 f200, f1465, f1466; +mul.f32 f201, f174, 0fBF3504F3; +fma.rn.f32 f202, f173, 0f3F3504F3, f201; +mul.f32 f1463, f177, 0fBF6C835E; +mul.f32 f1464, f178, 0f3EC3EF15; +sub.f32 f205, f1463, f1464; +mul.f32 f206, f178, 0fBF6C835E; +fma.rn.f32 f207, f177, 0f3EC3EF15, f206; +add.f32 f210, f110, f181; +sub.f32 f212, f110, f181; +add.f32 f1462, f1492, f183; +sub.f32 f213, f1492, f183; +add.f32 f214, f114, f186; +sub.f32 f216, f114, f186; +add.f32 f1461, f1491, f187; +sub.f32 f217, f1491, f187; +add.f32 f218, f118, f190; +sub.f32 f220, f118, f190; +add.f32 f1460, f1490, f192; 
+sub.f32 f221, f1490, f192; +sub.f32 f222, f108, f166; +add.f32 f224, f108, f166; +add.f32 f1459, f109, f165; +sub.f32 f225, f109, f165; +add.f32 f226, f112, f195; +sub.f32 f228, f112, f195; +add.f32 f1458, f113, f197; +sub.f32 f229, f113, f197; +add.f32 f230, f116, f200; +sub.f32 f232, f116, f200; +add.f32 f1457, f117, f202; +sub.f32 f233, f117, f202; +add.f32 f234, f120, f205; +sub.f32 f236, f120, f205; +add.f32 f1456, f121, f207; +sub.f32 f237, f121, f207; +mov.u32 r30, %tid.x; +shl.b32 r7, r30, 7; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r30, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 4088; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f238, f239}, [rd5]; +mul.f32 f242, f1462, f239; +mul.f32 f244, f238, f1462; +mul.f32 f246, f239, f239; +mul.f32 f1455, f238, f238; +sub.f32 f247, f1455, f246; +mul.f32 f248, f239, f238; +fma.rn.f32 f249, f239, f238, f248; +mul.f32 f250, f1461, f249; +mul.f32 f252, f247, f1461; +mul.f32 f1453, f238, f247; +mul.f32 f1454, f239, f249; +sub.f32 f255, f1453, f1454; +mul.f32 f1452, f214, f249; +mul.f32 f256, f238, f249; +fma.rn.f32 f257, f239, f247, f256; +mul.f32 f258, f1460, f257; +mul.f32 f260, f255, f1460; +mul.f32 f262, f239, f257; +mul.f32 f1451, f238, f255; +sub.f32 f263, f1451, f262; +mul.f32 f1450, f218, f257; +mul.f32 f264, f238, f257; +fma.rn.f32 f265, f239, f255, f264; +mul.f32 f266, f1459, f265; +mul.f32 f268, f263, f1459; +mul.f32 f270, f239, f265; +mul.f32 f1449, f238, f263; +sub.f32 f271, f1449, f270; +mul.f32 f1448, f222, f265; +mul.f32 f272, f238, f265; +fma.rn.f32 f273, f239, f263, f272; +mul.f32 f274, f1458, f273; +mul.f32 f276, f271, f1458; +mul.f32 f1446, f238, f271; +mul.f32 f1447, f239, f273; +sub.f32 f279, f1446, f1447; +mul.f32 f1445, f226, f273; +mul.f32 f280, f238, f273; +fma.rn.f32 f281, f239, f271, f280; +mul.f32 f282, f1457, f281; +mul.f32 f284, f279, f1457; +mul.f32 f286, f239, f281; +mul.f32 f1444, f238, f279; +sub.f32 f287, f1444, f286; +mul.f32 f1443, f230, f281; 
+mul.f32 f288, f238, f281; +fma.rn.f32 f289, f239, f279, f288; +mul.f32 f290, f1456, f289; +mul.f32 f292, f287, f1456; +mul.f32 f294, f239, f289; +mul.f32 f1442, f238, f287; +sub.f32 f295, f1442, f294; +mul.f32 f1441, f234, f289; +mul.f32 f296, f238, f289; +fma.rn.f32 f297, f239, f287, f296; +sub.f32 f1440, f1493, f1474; +mul.f32 f298, f1440, f297; +mul.f32 f300, f295, f1440; +mul.f32 f1438, f238, f295; +mul.f32 f1439, f239, f297; +sub.f32 f303, f1438, f1439; +sub.f32 f1437, f106, f163; +mul.f32 f1436, f1437, f297; +mul.f32 f304, f238, f297; +fma.rn.f32 f305, f239, f295, f304; +mul.f32 f306, f213, f305; +mul.f32 f308, f303, f213; +mul.f32 f310, f239, f305; +mul.f32 f1435, f238, f303; +sub.f32 f311, f1435, f310; +mul.f32 f1434, f212, f305; +mul.f32 f312, f238, f305; +fma.rn.f32 f313, f239, f303, f312; +mul.f32 f314, f217, f313; +mul.f32 f316, f311, f217; +mul.f32 f1432, f238, f311; +mul.f32 f1433, f239, f313; +sub.f32 f319, f1432, f1433; +mul.f32 f1431, f216, f313; +mul.f32 f320, f238, f313; +fma.rn.f32 f321, f239, f311, f320; +mul.f32 f322, f221, f321; +mul.f32 f324, f319, f221; +mul.f32 f326, f239, f321; +mul.f32 f1430, f238, f319; +sub.f32 f327, f1430, f326; +mul.f32 f1429, f220, f321; +mul.f32 f328, f238, f321; +fma.rn.f32 f329, f239, f319, f328; +mul.f32 f330, f225, f329; +mul.f32 f332, f327, f225; +mul.f32 f334, f239, f329; +mul.f32 f1428, f238, f327; +sub.f32 f335, f1428, f334; +mul.f32 f1427, f224, f329; +mul.f32 f336, f238, f329; +fma.rn.f32 f337, f239, f327, f336; +mul.f32 f338, f229, f337; +mul.f32 f340, f335, f229; +mul.f32 f1425, f238, f335; +mul.f32 f1426, f239, f337; +sub.f32 f343, f1425, f1426; +mul.f32 f1424, f228, f337; +mul.f32 f344, f238, f337; +fma.rn.f32 f345, f239, f335, f344; +mul.f32 f346, f233, f345; +mul.f32 f348, f343, f233; +mul.f32 f350, f239, f345; +mul.f32 f1423, f238, f343; +sub.f32 f351, f1423, f350; +mul.f32 f1422, f232, f345; +mul.f32 f352, f238, f345; +mul.f32 f1421, f210, f239; +fma.rn.f32 f353, f239, f343, f352; +mul.f32 f354, 
f237, f353; +mul.f32 f355, f236, f353; +mul.f32 f356, f351, f237; +barrier.sync 0; +and.b32 r11, r7, 65408; +add.s32 r12, r9, r11; +add.f32 f357, f1493, f1474; +mov.u32 r37, %tid.x; +shl.b32 r36, r37, 3; +mov.u32 r45, %tid.x; +shl.b32 r44, r45, 7; +sub.f32 f1519, f106, f163; +add.f32 f358, f106, f163; +mov.u32 r43, %tid.x; +shl.b32 r42, r43, 3; +mov.u32 r55, %tid.x; +shl.b32 r54, r55, 7; +mov.u32 r53, %tid.x; +shl.b32 r52, r53, 3; +mov.u32 r41, %tid.x; +fma.rn.f32 f359, f238, f210, f242; +sub.f32 f360, f244, f1421; +st.shared.v4.f32 [r12], {f358, f357, f359, f360}; +fma.rn.f32 f361, f247, f214, f250; +sub.f32 f362, f252, f1452; +fma.rn.f32 f363, f255, f218, f258; +sub.f32 f364, f260, f1450; +st.shared.v4.f32 [r12+16], {f361, f362, f363, f364}; +sub.f32 f365, f268, f1448; +fma.rn.f32 f366, f263, f222, f266; +fma.rn.f32 f367, f271, f226, f274; +sub.f32 f368, f276, f1445; +st.shared.v4.f32 [r12+32], {f366, f365, f367, f368}; +fma.rn.f32 f369, f279, f230, f282; +sub.f32 f370, f284, f1443; +fma.rn.f32 f371, f287, f234, f290; +sub.f32 f372, f292, f1441; +st.shared.v4.f32 [r12+48], {f369, f370, f371, f372}; +fma.rn.f32 f373, f295, f1519, f298; +sub.f32 f374, f300, f1436; +fma.rn.f32 f375, f303, f212, f306; +sub.f32 f376, f308, f1434; +st.shared.v4.f32 [r12+64], {f373, f374, f375, f376}; +fma.rn.f32 f377, f311, f216, f314; +sub.f32 f378, f316, f1431; +fma.rn.f32 f379, f319, f220, f322; +sub.f32 f380, f324, f1429; +st.shared.v4.f32 [r12+80], {f377, f378, f379, f380}; +fma.rn.f32 f381, f327, f224, f330; +sub.f32 f382, f332, f1427; +fma.rn.f32 f383, f335, f228, f338; +sub.f32 f384, f340, f1424; +st.shared.v4.f32 [r12+96], {f381, f382, f383, f384}; +fma.rn.f32 f385, f343, f232, f346; +sub.f32 f386, f348, f1422; +fma.rn.f32 f387, f351, f236, f354; +sub.f32 f388, f356, f355; +st.shared.v4.f32 [r12+112], {f385, f386, f387, f388}; +barrier.sync 0; +and.b32 r29, r41, 511; +mad.lo.s32 r13, r29, -120, r12; +ld.shared.v2.f32 {f389, f390}, [r13]; +ld.shared.v2.f32 {f393, f394}, 
[r13+4096]; +ld.shared.v2.f32 {f397, f398}, [r13+8192]; +ld.shared.v2.f32 {f401, f402}, [r13+12288]; +ld.shared.v2.f32 {f405, f406}, [r13+16384]; +ld.shared.v2.f32 {f409, f410}, [r13+20480]; +ld.shared.v2.f32 {f413, f414}, [r13+24576]; +ld.shared.v2.f32 {f417, f418}, [r13+28672]; +ld.shared.v2.f32 {f421, f422}, [r13+32768]; +ld.shared.v2.f32 {f425, f426}, [r13+36864]; +ld.shared.v2.f32 {f429, f430}, [r13+40960]; +ld.shared.v2.f32 {f433, f434}, [r13+45056]; +ld.shared.v2.f32 {f437, f438}, [r13+49152]; +ld.shared.v2.f32 {f441, f442}, [r13+53248]; +ld.shared.v2.f32 {f445, f446}, [r13+57344]; +ld.shared.v2.f32 {f449, f450}, [r13+61440]; +add.f32 f453, f389, f421; +sub.f32 f455, f389, f421; +add.f32 f1420, f390, f422; +sub.f32 f456, f390, f422; +add.f32 f457, f405, f437; +sub.f32 f459, f405, f437; +add.f32 f1419, f406, f438; +sub.f32 f460, f406, f438; +add.f32 f461, f453, f457; +sub.f32 f463, f453, f457; +add.f32 f1418, f1420, f1419; +sub.f32 f464, f1420, f1419; +sub.f32 f465, f455, f460; +add.f32 f467, f455, f460; +add.f32 f1417, f456, f459; +sub.f32 f468, f456, f459; +add.f32 f469, f397, f429; +sub.f32 f471, f397, f429; +add.f32 f1416, f398, f430; +sub.f32 f472, f398, f430; +add.f32 f473, f413, f445; +sub.f32 f475, f413, f445; +add.f32 f1415, f414, f446; +sub.f32 f476, f414, f446; +add.f32 f477, f469, f473; +sub.f32 f479, f469, f473; +add.f32 f1414, f1416, f1415; +sub.f32 f480, f1416, f1415; +sub.f32 f481, f471, f476; +add.f32 f483, f471, f476; +add.f32 f1413, f472, f475; +sub.f32 f484, f472, f475; +mul.f32 f485, f481, 0f3F3504F3; +mul.f32 f486, f1413, 0f3F3504F3; +sub.f32 f487, f485, f486; +add.f32 f488, f485, f486; +mul.f32 f490, f484, 0f3F3504F3; +mul.f32 f1412, f483, 0fBF3504F3; +sub.f32 f491, f1412, f490; +mul.f32 f492, f484, 0fBF3504F3; +fma.rn.f32 f493, f483, 0f3F3504F3, f492; +add.f32 f494, f461, f477; +sub.f32 f496, f461, f477; +add.f32 f1411, f1418, f1414; +sub.f32 f497, f1418, f1414; +add.f32 f498, f465, f487; +sub.f32 f500, f465, f487; +add.f32 f1410, 
f1417, f488; +sub.f32 f501, f1417, f488; +sub.f32 f502, f463, f480; +add.f32 f504, f463, f480; +add.f32 f1409, f464, f479; +sub.f32 f505, f464, f479; +add.f32 f506, f467, f491; +sub.f32 f508, f467, f491; +add.f32 f1408, f468, f493; +sub.f32 f509, f468, f493; +add.f32 f510, f393, f425; +sub.f32 f512, f393, f425; +add.f32 f1407, f394, f426; +sub.f32 f513, f394, f426; +add.f32 f514, f409, f441; +sub.f32 f516, f409, f441; +add.f32 f1406, f410, f442; +sub.f32 f517, f410, f442; +add.f32 f518, f510, f514; +sub.f32 f520, f510, f514; +add.f32 f1405, f1407, f1406; +sub.f32 f521, f1407, f1406; +sub.f32 f522, f512, f517; +add.f32 f524, f512, f517; +add.f32 f1404, f513, f516; +sub.f32 f525, f513, f516; +add.f32 f526, f401, f433; +sub.f32 f528, f401, f433; +add.f32 f1403, f402, f434; +sub.f32 f529, f402, f434; +add.f32 f530, f417, f449; +sub.f32 f532, f417, f449; +add.f32 f1402, f418, f450; +sub.f32 f533, f418, f450; +add.f32 f534, f526, f530; +sub.f32 f536, f526, f530; +add.f32 f1401, f1403, f1402; +sub.f32 f537, f1403, f1402; +sub.f32 f538, f528, f533; +add.f32 f540, f528, f533; +add.f32 f1400, f529, f532; +sub.f32 f541, f529, f532; +mul.f32 f542, f538, 0f3F3504F3; +mul.f32 f543, f1400, 0f3F3504F3; +sub.f32 f544, f542, f543; +add.f32 f545, f542, f543; +mul.f32 f547, f541, 0f3F3504F3; +mul.f32 f1399, f540, 0fBF3504F3; +sub.f32 f548, f1399, f547; +mul.f32 f549, f541, 0fBF3504F3; +fma.rn.f32 f550, f540, 0f3F3504F3, f549; +add.f32 f551, f518, f534; +sub.f32 f553, f518, f534; +add.f32 f1398, f1405, f1401; +sub.f32 f554, f1405, f1401; +add.f32 f555, f522, f544; +sub.f32 f557, f522, f544; +add.f32 f1397, f1404, f545; +sub.f32 f558, f1404, f545; +sub.f32 f559, f520, f537; +add.f32 f561, f520, f537; +add.f32 f1396, f521, f536; +sub.f32 f562, f521, f536; +add.f32 f563, f524, f548; +sub.f32 f565, f524, f548; +add.f32 f1395, f525, f550; +sub.f32 f566, f525, f550; +mul.f32 f568, f1397, 0f3EC3EF15; +mul.f32 f1394, f555, 0f3F6C835E; +sub.f32 f569, f1394, f568; +mul.f32 f570, f1397, 
0f3F6C835E; +fma.rn.f32 f571, f555, 0f3EC3EF15, f570; +mul.f32 f572, f559, 0f3F3504F3; +mul.f32 f573, f1396, 0f3F3504F3; +sub.f32 f574, f572, f573; +add.f32 f575, f572, f573; +mul.f32 f577, f1395, 0f3F6C835E; +mul.f32 f1393, f563, 0f3EC3EF15; +sub.f32 f578, f1393, f577; +mul.f32 f579, f1395, 0f3EC3EF15; +fma.rn.f32 f580, f563, 0f3F6C835E, f579; +mul.f32 f582, f558, 0f3F6C835E; +mul.f32 f1392, f557, 0fBEC3EF15; +sub.f32 f583, f1392, f582; +mul.f32 f584, f558, 0fBEC3EF15; +fma.rn.f32 f585, f557, 0f3F6C835E, f584; +mul.f32 f587, f562, 0f3F3504F3; +mul.f32 f1391, f561, 0fBF3504F3; +sub.f32 f588, f1391, f587; +mul.f32 f589, f562, 0fBF3504F3; +fma.rn.f32 f590, f561, 0f3F3504F3, f589; +mul.f32 f592, f566, 0f3EC3EF15; +mul.f32 f1390, f565, 0fBF6C835E; +sub.f32 f593, f1390, f592; +mul.f32 f594, f566, 0fBF6C835E; +fma.rn.f32 f595, f565, 0f3EC3EF15, f594; +add.f32 f598, f498, f569; +sub.f32 f600, f498, f569; +add.f32 f1389, f1410, f571; +sub.f32 f601, f1410, f571; +add.f32 f602, f502, f574; +sub.f32 f604, f502, f574; +add.f32 f1388, f1409, f575; +sub.f32 f605, f1409, f575; +add.f32 f606, f506, f578; +sub.f32 f608, f506, f578; +add.f32 f1387, f1408, f580; +sub.f32 f609, f1408, f580; +sub.f32 f610, f496, f554; +add.f32 f612, f496, f554; +add.f32 f1386, f497, f553; +sub.f32 f613, f497, f553; +add.f32 f614, f500, f583; +sub.f32 f616, f500, f583; +add.f32 f1385, f501, f585; +sub.f32 f617, f501, f585; +add.f32 f618, f504, f588; +sub.f32 f620, f504, f588; +add.f32 f1384, f505, f590; +sub.f32 f621, f505, f590; +add.f32 f622, f508, f593; +sub.f32 f624, f508, f593; +add.f32 f1383, f509, f595; +sub.f32 f625, f509, f595; +bfe.u32 r15, r41, 4, 5; +mul.wide.u32 rd6, r15, 8; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f626, f627}, [rd8]; +mul.f32 f630, f1389, f627; +mul.f32 f632, f626, f1389; +mul.f32 f634, f627, f627; +mul.f32 f1382, f626, f626; +sub.f32 f635, f1382, f634; +mul.f32 f636, f627, f626; +fma.rn.f32 f637, f627, f626, f636; +mul.f32 f638, f1388, f637; +mul.f32 
f640, f635, f1388; +mul.f32 f1380, f626, f635; +mul.f32 f1381, f627, f637; +sub.f32 f643, f1380, f1381; +mul.f32 f1379, f602, f637; +mul.f32 f644, f626, f637; +fma.rn.f32 f645, f627, f635, f644; +mul.f32 f646, f1387, f645; +mul.f32 f648, f643, f1387; +mul.f32 f650, f627, f645; +mul.f32 f1378, f626, f643; +sub.f32 f651, f1378, f650; +mul.f32 f1377, f606, f645; +mul.f32 f652, f626, f645; +fma.rn.f32 f653, f627, f643, f652; +mul.f32 f654, f1386, f653; +mul.f32 f656, f651, f1386; +mul.f32 f658, f627, f653; +mul.f32 f1376, f626, f651; +sub.f32 f659, f1376, f658; +mul.f32 f1375, f610, f653; +mul.f32 f660, f626, f653; +fma.rn.f32 f661, f627, f651, f660; +mul.f32 f662, f1385, f661; +mul.f32 f664, f659, f1385; +mul.f32 f1373, f626, f659; +mul.f32 f1374, f627, f661; +sub.f32 f667, f1373, f1374; +mul.f32 f1372, f614, f661; +mul.f32 f668, f626, f661; +fma.rn.f32 f669, f627, f659, f668; +mul.f32 f670, f1384, f669; +mul.f32 f672, f667, f1384; +mul.f32 f674, f627, f669; +mul.f32 f1371, f626, f667; +sub.f32 f675, f1371, f674; +mul.f32 f1370, f618, f669; +mul.f32 f676, f626, f669; +fma.rn.f32 f677, f627, f667, f676; +mul.f32 f678, f1383, f677; +mul.f32 f680, f675, f1383; +mul.f32 f682, f627, f677; +mul.f32 f1369, f626, f675; +sub.f32 f683, f1369, f682; +mul.f32 f1368, f622, f677; +mul.f32 f684, f626, f677; +fma.rn.f32 f685, f627, f675, f684; +sub.f32 f1367, f1411, f1398; +mul.f32 f686, f1367, f685; +mul.f32 f688, f683, f1367; +mul.f32 f1365, f626, f683; +mul.f32 f1366, f627, f685; +sub.f32 f691, f1365, f1366; +sub.f32 f1364, f494, f551; +mul.f32 f1363, f1364, f685; +mul.f32 f692, f626, f685; +fma.rn.f32 f693, f627, f683, f692; +mul.f32 f694, f601, f693; +mul.f32 f696, f691, f601; +mul.f32 f698, f627, f693; +mul.f32 f1362, f626, f691; +sub.f32 f699, f1362, f698; +mul.f32 f1361, f600, f693; +mul.f32 f700, f626, f693; +fma.rn.f32 f701, f627, f691, f700; +mul.f32 f702, f605, f701; +mul.f32 f704, f699, f605; +mul.f32 f1359, f626, f699; +mul.f32 f1360, f627, f701; +sub.f32 f707, f1359, 
f1360; +mul.f32 f1358, f604, f701; +mul.f32 f708, f626, f701; +fma.rn.f32 f709, f627, f699, f708; +mul.f32 f710, f609, f709; +mul.f32 f712, f707, f609; +mul.f32 f714, f627, f709; +mul.f32 f1357, f626, f707; +sub.f32 f715, f1357, f714; +mul.f32 f1356, f608, f709; +mul.f32 f716, f626, f709; +fma.rn.f32 f717, f627, f707, f716; +mul.f32 f718, f613, f717; +mul.f32 f720, f715, f613; +mul.f32 f722, f627, f717; +mul.f32 f1355, f626, f715; +sub.f32 f723, f1355, f722; +mul.f32 f1354, f612, f717; +mul.f32 f724, f626, f717; +fma.rn.f32 f725, f627, f715, f724; +mul.f32 f726, f617, f725; +mul.f32 f728, f723, f617; +mul.f32 f1352, f626, f723; +mul.f32 f1353, f627, f725; +sub.f32 f731, f1352, f1353; +mul.f32 f1351, f616, f725; +mul.f32 f732, f626, f725; +fma.rn.f32 f733, f627, f723, f732; +mul.f32 f734, f621, f733; +mul.f32 f736, f731, f621; +mul.f32 f738, f627, f733; +mul.f32 f1350, f626, f731; +sub.f32 f739, f1350, f738; +mul.f32 f1349, f620, f733; +mul.f32 f740, f626, f733; +mul.f32 f1348, f598, f627; +fma.rn.f32 f741, f627, f731, f740; +mul.f32 f742, f625, f741; +mul.f32 f743, f624, f741; +mul.f32 f744, f739, f625; +and.b32 r16, r52, 120; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r54, 63488; +add.s32 r19, r17, r18; +sub.f32 f1510, f1411, f1398; +mul.f32 f1509, f683, f1510; +add.f32 f745, f1411, f1398; +sub.f32 f1512, f494, f551; +add.f32 f746, f494, f551; +st.shared.v2.f32 [r19], {f746, f745}; +mov.u32 r40, %tid.x; +shl.b32 r39, r40, 3; +mov.u32 r51, %tid.x; +shl.b32 r50, r51, 7; +mov.u32 r49, %tid.x; +shl.b32 r48, r49, 3; +mov.u32 r38, %tid.x; +fma.rn.f32 f747, f626, f598, f630; +sub.f32 f748, f632, f1348; +st.shared.v2.f32 [r19+128], {f747, f748}; +fma.rn.f32 f749, f635, f602, f638; +sub.f32 f750, f640, f1379; +st.shared.v2.f32 [r19+256], {f749, f750}; +fma.rn.f32 f751, f643, f606, f646; +sub.f32 f752, f648, f1377; +st.shared.v2.f32 [r19+384], {f751, f752}; +fma.rn.f32 f753, f651, f610, f654; +sub.f32 f754, f656, f1375; +st.shared.v2.f32 [r19+512], {f753, f754}; 
+sub.f32 f755, f664, f1372; +fma.rn.f32 f756, f659, f614, f662; +st.shared.v2.f32 [r19+640], {f756, f755}; +fma.rn.f32 f757, f667, f618, f670; +sub.f32 f758, f672, f1370; +st.shared.v2.f32 [r19+768], {f757, f758}; +fma.rn.f32 f759, f675, f622, f678; +sub.f32 f760, f680, f1368; +st.shared.v2.f32 [r19+896], {f759, f760}; +fma.rn.f32 f761, f683, f1512, f686; +sub.f32 f762, f1509, f1363; +st.shared.v2.f32 [r19+1024], {f761, f762}; +fma.rn.f32 f763, f691, f600, f694; +sub.f32 f764, f696, f1361; +st.shared.v2.f32 [r19+1152], {f763, f764}; +fma.rn.f32 f765, f699, f604, f702; +sub.f32 f766, f704, f1358; +st.shared.v2.f32 [r19+1280], {f765, f766}; +fma.rn.f32 f767, f707, f608, f710; +sub.f32 f768, f712, f1356; +st.shared.v2.f32 [r19+1408], {f767, f768}; +fma.rn.f32 f769, f715, f612, f718; +sub.f32 f770, f720, f1354; +st.shared.v2.f32 [r19+1536], {f769, f770}; +fma.rn.f32 f771, f723, f616, f726; +sub.f32 f772, f728, f1351; +st.shared.v2.f32 [r19+1664], {f771, f772}; +fma.rn.f32 f773, f731, f620, f734; +sub.f32 f774, f736, f1349; +st.shared.v2.f32 [r19+1792], {f773, f774}; +fma.rn.f32 f775, f739, f624, f742; +sub.f32 f776, f744, f743; +st.shared.v2.f32 [r19+1920], {f775, f776}; +barrier.sync 0; +and.b32 r28, r38, 496; +mad.lo.s32 r20, r28, -120, r19; +ld.shared.v2.f32 {f777, f778}, [r20]; +ld.shared.v2.f32 {f781, f782}, [r20+4096]; +ld.shared.v2.f32 {f785, f786}, [r20+8192]; +ld.shared.v2.f32 {f789, f790}, [r20+12288]; +ld.shared.v2.f32 {f793, f794}, [r20+16384]; +ld.shared.v2.f32 {f797, f798}, [r20+20480]; +ld.shared.v2.f32 {f801, f802}, [r20+24576]; +ld.shared.v2.f32 {f805, f806}, [r20+28672]; +ld.shared.v2.f32 {f809, f810}, [r20+32768]; +ld.shared.v2.f32 {f813, f814}, [r20+36864]; +ld.shared.v2.f32 {f817, f818}, [r20+40960]; +ld.shared.v2.f32 {f821, f822}, [r20+45056]; +ld.shared.v2.f32 {f825, f826}, [r20+49152]; +ld.shared.v2.f32 {f829, f830}, [r20+53248]; +ld.shared.v2.f32 {f833, f834}, [r20+57344]; +ld.shared.v2.f32 {f837, f838}, [r20+61440]; +add.f32 f841, f777, f809; 
+sub.f32 f843, f777, f809; +add.f32 f1347, f778, f810; +sub.f32 f844, f778, f810; +add.f32 f845, f793, f825; +sub.f32 f847, f793, f825; +add.f32 f1346, f794, f826; +sub.f32 f848, f794, f826; +add.f32 f849, f841, f845; +sub.f32 f851, f841, f845; +add.f32 f1345, f1347, f1346; +sub.f32 f852, f1347, f1346; +sub.f32 f853, f843, f848; +add.f32 f855, f843, f848; +add.f32 f1344, f844, f847; +sub.f32 f856, f844, f847; +add.f32 f857, f785, f817; +sub.f32 f859, f785, f817; +add.f32 f1343, f786, f818; +sub.f32 f860, f786, f818; +add.f32 f861, f801, f833; +sub.f32 f863, f801, f833; +add.f32 f1342, f802, f834; +sub.f32 f864, f802, f834; +add.f32 f865, f857, f861; +sub.f32 f867, f857, f861; +add.f32 f1341, f1343, f1342; +sub.f32 f868, f1343, f1342; +sub.f32 f869, f859, f864; +add.f32 f871, f859, f864; +add.f32 f1340, f860, f863; +sub.f32 f872, f860, f863; +mul.f32 f873, f869, 0f3F3504F3; +mul.f32 f874, f1340, 0f3F3504F3; +sub.f32 f875, f873, f874; +add.f32 f876, f873, f874; +mul.f32 f878, f872, 0f3F3504F3; +mul.f32 f1339, f871, 0fBF3504F3; +sub.f32 f879, f1339, f878; +mul.f32 f880, f872, 0fBF3504F3; +fma.rn.f32 f881, f871, 0f3F3504F3, f880; +add.f32 f882, f849, f865; +sub.f32 f884, f849, f865; +add.f32 f1338, f1345, f1341; +sub.f32 f885, f1345, f1341; +add.f32 f886, f853, f875; +sub.f32 f888, f853, f875; +add.f32 f1337, f1344, f876; +sub.f32 f889, f1344, f876; +sub.f32 f890, f851, f868; +add.f32 f892, f851, f868; +add.f32 f1336, f852, f867; +sub.f32 f893, f852, f867; +add.f32 f894, f855, f879; +sub.f32 f896, f855, f879; +add.f32 f1335, f856, f881; +sub.f32 f897, f856, f881; +add.f32 f898, f781, f813; +sub.f32 f900, f781, f813; +add.f32 f1334, f782, f814; +sub.f32 f901, f782, f814; +add.f32 f902, f797, f829; +sub.f32 f904, f797, f829; +add.f32 f1333, f798, f830; +sub.f32 f905, f798, f830; +add.f32 f906, f898, f902; +sub.f32 f908, f898, f902; +add.f32 f1332, f1334, f1333; +sub.f32 f909, f1334, f1333; +sub.f32 f910, f900, f905; +add.f32 f912, f900, f905; +add.f32 f1331, f901, f904; 
+sub.f32 f913, f901, f904; +add.f32 f914, f789, f821; +sub.f32 f916, f789, f821; +add.f32 f1330, f790, f822; +sub.f32 f917, f790, f822; +add.f32 f918, f805, f837; +sub.f32 f920, f805, f837; +add.f32 f1329, f806, f838; +sub.f32 f921, f806, f838; +add.f32 f922, f914, f918; +sub.f32 f924, f914, f918; +add.f32 f1328, f1330, f1329; +sub.f32 f925, f1330, f1329; +sub.f32 f926, f916, f921; +add.f32 f928, f916, f921; +add.f32 f1327, f917, f920; +sub.f32 f929, f917, f920; +mul.f32 f930, f926, 0f3F3504F3; +mul.f32 f931, f1327, 0f3F3504F3; +sub.f32 f932, f930, f931; +add.f32 f933, f930, f931; +mul.f32 f935, f929, 0f3F3504F3; +mul.f32 f1326, f928, 0fBF3504F3; +sub.f32 f936, f1326, f935; +mul.f32 f937, f929, 0fBF3504F3; +fma.rn.f32 f938, f928, 0f3F3504F3, f937; +add.f32 f939, f906, f922; +sub.f32 f941, f906, f922; +add.f32 f1325, f1332, f1328; +sub.f32 f942, f1332, f1328; +add.f32 f943, f910, f932; +sub.f32 f945, f910, f932; +add.f32 f1324, f1331, f933; +sub.f32 f946, f1331, f933; +sub.f32 f947, f908, f925; +add.f32 f949, f908, f925; +add.f32 f1323, f909, f924; +sub.f32 f950, f909, f924; +add.f32 f951, f912, f936; +sub.f32 f953, f912, f936; +add.f32 f1322, f913, f938; +sub.f32 f954, f913, f938; +mul.f32 f956, f1324, 0f3EC3EF15; +mul.f32 f1321, f943, 0f3F6C835E; +sub.f32 f957, f1321, f956; +mul.f32 f958, f1324, 0f3F6C835E; +fma.rn.f32 f959, f943, 0f3EC3EF15, f958; +mul.f32 f960, f947, 0f3F3504F3; +mul.f32 f961, f1323, 0f3F3504F3; +sub.f32 f962, f960, f961; +add.f32 f963, f960, f961; +mul.f32 f965, f1322, 0f3F6C835E; +mul.f32 f1320, f951, 0f3EC3EF15; +sub.f32 f966, f1320, f965; +mul.f32 f967, f1322, 0f3EC3EF15; +fma.rn.f32 f968, f951, 0f3F6C835E, f967; +mul.f32 f970, f946, 0f3F6C835E; +mul.f32 f1319, f945, 0fBEC3EF15; +sub.f32 f971, f1319, f970; +mul.f32 f972, f946, 0fBEC3EF15; +fma.rn.f32 f973, f945, 0f3F6C835E, f972; +mul.f32 f975, f950, 0f3F3504F3; +mul.f32 f1318, f949, 0fBF3504F3; +sub.f32 f976, f1318, f975; +mul.f32 f977, f950, 0fBF3504F3; +fma.rn.f32 f978, f949, 0f3F3504F3, 
f977; +mul.f32 f980, f954, 0f3EC3EF15; +mul.f32 f1317, f953, 0fBF6C835E; +sub.f32 f981, f1317, f980; +mul.f32 f982, f954, 0fBF6C835E; +fma.rn.f32 f983, f953, 0f3EC3EF15, f982; +add.f32 f986, f886, f957; +sub.f32 f988, f886, f957; +add.f32 f1316, f1337, f959; +sub.f32 f989, f1337, f959; +add.f32 f990, f890, f962; +sub.f32 f992, f890, f962; +add.f32 f1315, f1336, f963; +sub.f32 f993, f1336, f963; +add.f32 f994, f894, f966; +sub.f32 f996, f894, f966; +add.f32 f1314, f1335, f968; +sub.f32 f997, f1335, f968; +sub.f32 f998, f884, f942; +add.f32 f1000, f884, f942; +add.f32 f1313, f885, f941; +sub.f32 f1001, f885, f941; +add.f32 f1002, f888, f971; +sub.f32 f1004, f888, f971; +add.f32 f1312, f889, f973; +sub.f32 f1005, f889, f973; +add.f32 f1006, f892, f976; +sub.f32 f1008, f892, f976; +add.f32 f1311, f893, f978; +sub.f32 f1009, f893, f978; +add.f32 f1010, f896, f981; +sub.f32 f1012, f896, f981; +add.f32 f1310, f897, f983; +sub.f32 f1013, f897, f983; +and.b32 r21, r38, 256; +bfe.u32 r22, r38, 8, 1; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f1014, f1015}, [rd11]; +mul.f32 f1018, f1316, f1015; +mul.f32 f1020, f1014, f1316; +mul.f32 f1022, f1015, f1015; +mul.f32 f1309, f1014, f1014; +sub.f32 f1023, f1309, f1022; +mul.f32 f1024, f1015, f1014; +fma.rn.f32 f1025, f1015, f1014, f1024; +mul.f32 f1026, f1315, f1025; +mul.f32 f1028, f1023, f1315; +mul.f32 f1307, f1014, f1023; +mul.f32 f1308, f1015, f1025; +sub.f32 f1031, f1307, f1308; +mul.f32 f1306, f990, f1025; +mul.f32 f1032, f1014, f1025; +fma.rn.f32 f1033, f1015, f1023, f1032; +mul.f32 f1034, f1314, f1033; +mul.f32 f1036, f1031, f1314; +mul.f32 f1304, f1014, f1031; +mul.f32 f1305, f1015, f1033; +sub.f32 f1039, f1304, f1305; +mul.f32 f1303, f994, f1033; +mul.f32 f1040, f1014, f1033; +fma.rn.f32 f1041, f1015, f1031, f1040; +mul.f32 f1042, f1313, f1041; +mul.f32 f1044, f1039, f1313; +mul.f32 f1046, f1015, f1041; +mul.f32 f1302, f1014, f1039; +sub.f32 f1047, f1302, f1046; +mul.f32 
f1301, f998, f1041; +mul.f32 f1048, f1014, f1041; +fma.rn.f32 f1049, f1015, f1039, f1048; +mul.f32 f1050, f1312, f1049; +mul.f32 f1052, f1047, f1312; +mul.f32 f1299, f1014, f1047; +mul.f32 f1300, f1015, f1049; +sub.f32 f1055, f1299, f1300; +mul.f32 f1298, f1002, f1049; +mul.f32 f1056, f1014, f1049; +fma.rn.f32 f1057, f1015, f1047, f1056; +mul.f32 f1058, f1311, f1057; +mul.f32 f1060, f1055, f1311; +mul.f32 f1062, f1015, f1057; +mul.f32 f1297, f1014, f1055; +sub.f32 f1063, f1297, f1062; +mul.f32 f1296, f1006, f1057; +mul.f32 f1064, f1014, f1057; +fma.rn.f32 f1065, f1015, f1055, f1064; +mul.f32 f1066, f1310, f1065; +mul.f32 f1068, f1063, f1310; +mul.f32 f1070, f1015, f1065; +mul.f32 f1295, f1014, f1063; +sub.f32 f1071, f1295, f1070; +mul.f32 f1294, f1010, f1065; +mul.f32 f1072, f1014, f1065; +fma.rn.f32 f1073, f1015, f1063, f1072; +sub.f32 f1293, f1338, f1325; +mul.f32 f1074, f1293, f1073; +mul.f32 f1076, f1071, f1293; +sub.f32 f1292, f882, f939; +mul.f32 f1290, f1014, f1071; +mul.f32 f1291, f1015, f1073; +sub.f32 f1079, f1290, f1291; +mul.f32 f1289, f1292, f1073; +mul.f32 f1080, f1014, f1073; +fma.rn.f32 f1081, f1015, f1071, f1080; +mul.f32 f1082, f989, f1081; +mul.f32 f1084, f1079, f989; +mul.f32 f1086, f1015, f1081; +mul.f32 f1288, f1014, f1079; +sub.f32 f1087, f1288, f1086; +mul.f32 f1287, f988, f1081; +mul.f32 f1088, f1014, f1081; +fma.rn.f32 f1089, f1015, f1079, f1088; +mul.f32 f1090, f993, f1089; +mul.f32 f1092, f1087, f993; +mul.f32 f1285, f1014, f1087; +mul.f32 f1286, f1015, f1089; +sub.f32 f1095, f1285, f1286; +mul.f32 f1284, f992, f1089; +mul.f32 f1096, f1014, f1089; +fma.rn.f32 f1097, f1015, f1087, f1096; +mul.f32 f1098, f997, f1097; +mul.f32 f1100, f1095, f997; +mul.f32 f1282, f1014, f1095; +mul.f32 f1283, f1015, f1097; +sub.f32 f1103, f1282, f1283; +mul.f32 f1281, f996, f1097; +mul.f32 f1104, f1014, f1097; +fma.rn.f32 f1105, f1015, f1095, f1104; +mul.f32 f1106, f1001, f1105; +mul.f32 f1108, f1103, f1001; +mul.f32 f1110, f1015, f1105; +mul.f32 f1280, 
f1014, f1103; +sub.f32 f1111, f1280, f1110; +mul.f32 f1279, f1000, f1105; +mul.f32 f1112, f1014, f1105; +fma.rn.f32 f1113, f1015, f1103, f1112; +mul.f32 f1114, f1005, f1113; +mul.f32 f1116, f1111, f1005; +mul.f32 f1277, f1014, f1111; +mul.f32 f1278, f1015, f1113; +sub.f32 f1119, f1277, f1278; +mul.f32 f1276, f1004, f1113; +mul.f32 f1120, f1014, f1113; +fma.rn.f32 f1121, f1015, f1111, f1120; +mul.f32 f1122, f1009, f1121; +mul.f32 f1124, f1119, f1009; +mul.f32 f1126, f1015, f1121; +mul.f32 f1275, f1014, f1119; +sub.f32 f1127, f1275, f1126; +mul.f32 f1274, f1008, f1121; +mul.f32 f1128, f1014, f1121; +mul.f32 f1273, f986, f1015; +fma.rn.f32 f1129, f1015, f1119, f1128; +mul.f32 f1130, f1013, f1129; +mul.f32 f1131, f1012, f1129; +mul.f32 f1132, f1127, f1013; +and.b32 r23, r48, 2040; +add.s32 r24, r9, r23; +sub.f32 f1515, f1338, f1325; +mul.f32 f1514, f1071, f1515; +barrier.sync 0; +and.b32 r25, r50, 32768; +add.s32 r26, r24, r25; +sub.f32 f1518, f1338, f1325; +mul.f32 f1517, f1071, f1518; +add.f32 f1133, f1338, f1325; +sub.f32 f1516, f882, f939; +add.f32 f1134, f882, f939; +st.shared.v2.f32 [r26], {f1134, f1133}; +mov.u32 r47, %tid.x; +and.b32 r46, r47, 256; +fma.rn.f32 f1135, f1014, f986, f1018; +sub.f32 f1136, f1020, f1273; +st.shared.v2.f32 [r26+2048], {f1135, f1136}; +fma.rn.f32 f1137, f1023, f990, f1026; +sub.f32 f1138, f1028, f1306; +st.shared.v2.f32 [r26+4096], {f1137, f1138}; +fma.rn.f32 f1139, f1031, f994, f1034; +sub.f32 f1140, f1036, f1303; +st.shared.v2.f32 [r26+6144], {f1139, f1140}; +fma.rn.f32 f1141, f1039, f998, f1042; +sub.f32 f1142, f1044, f1301; +st.shared.v2.f32 [r26+8192], {f1141, f1142}; +sub.f32 f1143, f1052, f1298; +fma.rn.f32 f1144, f1047, f1002, f1050; +st.shared.v2.f32 [r26+10240], {f1144, f1143}; +fma.rn.f32 f1145, f1055, f1006, f1058; +sub.f32 f1146, f1060, f1296; +st.shared.v2.f32 [r26+12288], {f1145, f1146}; +fma.rn.f32 f1147, f1063, f1010, f1066; +sub.f32 f1148, f1068, f1294; +st.shared.v2.f32 [r26+14336], {f1147, f1148}; +fma.rn.f32 
f1149, f1071, f1516, f1074; +sub.f32 f1150, f1517, f1289; +st.shared.v2.f32 [r26+16384], {f1149, f1150}; +fma.rn.f32 f1151, f1079, f988, f1082; +sub.f32 f1152, f1084, f1287; +st.shared.v2.f32 [r26+18432], {f1151, f1152}; +fma.rn.f32 f1153, f1087, f992, f1090; +sub.f32 f1154, f1092, f1284; +st.shared.v2.f32 [r26+20480], {f1153, f1154}; +fma.rn.f32 f1155, f1095, f996, f1098; +sub.f32 f1156, f1100, f1281; +st.shared.v2.f32 [r26+22528], {f1155, f1156}; +fma.rn.f32 f1157, f1103, f1000, f1106; +sub.f32 f1158, f1108, f1279; +st.shared.v2.f32 [r26+24576], {f1157, f1158}; +fma.rn.f32 f1159, f1111, f1004, f1114; +sub.f32 f1160, f1116, f1276; +st.shared.v2.f32 [r26+26624], {f1159, f1160}; +fma.rn.f32 f1161, f1119, f1008, f1122; +sub.f32 f1162, f1124, f1274; +st.shared.v2.f32 [r26+28672], {f1161, f1162}; +fma.rn.f32 f1163, f1127, f1012, f1130; +sub.f32 f1164, f1132, f1131; +st.shared.v2.f32 [r26+30720], {f1163, f1164}; +barrier.sync 0; +mad.lo.s32 r27, r46, -120, r26; +ld.shared.v2.f32 {f1165, f1166}, [r27]; +ld.shared.v2.f32 {f1169, f1170}, [r27+4096]; +ld.shared.v2.f32 {f1173, f1174}, [r27+8192]; +ld.shared.v2.f32 {f1177, f1178}, [r27+12288]; +ld.shared.v2.f32 {f1181, f1182}, [r27+16384]; +ld.shared.v2.f32 {f1185, f1186}, [r27+20480]; +ld.shared.v2.f32 {f1189, f1190}, [r27+24576]; +ld.shared.v2.f32 {f1193, f1194}, [r27+28672]; +ld.shared.v2.f32 {f1197, f1198}, [r27+32768]; +ld.shared.v2.f32 {f1201, f1202}, [r27+36864]; +ld.shared.v2.f32 {f1205, f1206}, [r27+40960]; +ld.shared.v2.f32 {f1209, f1210}, [r27+45056]; +ld.shared.v2.f32 {f1213, f1214}, [r27+49152]; +ld.shared.v2.f32 {f1217, f1218}, [r27+53248]; +ld.shared.v2.f32 {f1221, f1222}, [r27+57344]; +ld.shared.v2.f32 {f1225, f1226}, [r27+61440]; +add.f32 %0, f1165, f1197; +add.f32 %1, f1166, f1198; +add.f32 %3, f1170, f1202; +add.f32 %2, f1169, f1201; +add.f32 %5, f1174, f1206; +add.f32 %4, f1173, f1205; +add.f32 %7, f1178, f1210; +add.f32 %6, f1177, f1209; +add.f32 %8, f1181, f1213; +add.f32 %9, f1182, f1214; +add.f32 %10, 
f1185, f1217; +add.f32 %11, f1186, f1218; +add.f32 %12, f1189, f1221; +add.f32 %13, f1190, f1222; +add.f32 %15, f1194, f1226; +add.f32 %14, f1193, f1225; +sub.f32 %17, f1166, f1198; +sub.f32 %16, f1165, f1197; +sub.f32 %19, f1170, f1202; +sub.f32 %18, f1169, f1201; +sub.f32 %21, f1174, f1206; +sub.f32 %20, f1173, f1205; +sub.f32 %23, f1178, f1210; +sub.f32 %22, f1177, f1209; +sub.f32 %25, f1182, f1214; +sub.f32 %24, f1181, f1213; +sub.f32 %27, f1186, f1218; +sub.f32 %26, f1185, f1217; +sub.f32 %29, f1190, f1222; +sub.f32 %28, f1189, f1221; +sub.f32 %31, f1194, f1226; +sub.f32 %30, f1193, f1225; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y): "r"(smem), "l"(lut_sp_16_8192), "l"(lut_sp_16_512), "l"(lut_sp_16_32), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[8].y), "f"(rmem[4].y), "f"(rmem[10].y), "f"(rmem[2].y), "f"(rmem[14].y), "f"(rmem[1].y), "f"(rmem[5].y), "f"(rmem[13].y), "f"(rmem[11].y), "f"(rmem[7].y)); +}; + + + + +/* Explicit specialization of cufftdx_private_function for <316, float, 1>. The whole body is a single inline-PTX asm volatile block; the numbered register names suggest it is machine-generated, so presumably it should be regenerated rather than hand-edited (TODO confirm provenance). Operands: %0-%15 are write-only outputs rmem[0..7].x and .y; %16 is smem passed as a 32-bit register and added to (%tid.y << 15) as the start of the shared-memory address computation; %17-%20 are the 64-bit addresses of lut_sp_8_8192, lut_sp_8_1024, lut_sp_8_128 and lut_sp_8_16, each read once through ld.global.v2.f32 at an offset derived from %tid.x; %21-%40 are float inputs taken from rmem[0..7], where the .y of rmem[1], rmem[2], rmem[4] and rmem[5] is listed twice and %24, %27, %32 and %35 are never referenced in the asm text. The code alternates register arithmetic with st.shared and ld.shared exchanges separated by barrier.sync 0, so presumably all threads of the block must reach this call together - verify against the caller. The immediates 0f3F3504F3 and 0fBF3504F3 are the IEEE-754 single-precision bit patterns of plus and minus 0.70710677 (sqrt(2)/2). */template<> __forceinline__ __device__ void 
cufftdx_private_function<316, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<617>; +.reg .b32 r<35>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 15; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %21, %31; +add.f32 f34, %22, %33; +sub.f32 f35, %21, %31; +sub.f32 f36, %22, %33; +add.f32 f37, %26, %37; +add.f32 f38, %28, %38; +sub.f32 f39, %26, %37; +sub.f32 f40, %28, %38; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %23, %34; +add.f32 f50, %25, %36; +sub.f32 f51, %23, %34; +sub.f32 f52, %25, %36; +add.f32 f53, %29, %39; +add.f32 f54, %30, %40; +sub.f32 f55, %29, %39; +sub.f32 f56, %30, %40; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 f74, f41, f57; +add.f32 f75, f42, f58; +sub.f32 f76, f41, f57; +sub.f32 f77, f42, f58; +add.f32 f78, f45, f67; +add.f32 f79, f46, f68; +sub.f32 f80, f45, f67; +sub.f32 f81, f46, f68; +sub.f32 f82, f43, f60; +add.f32 f83, f44, f59; +add.f32 f84, f43, f60; +sub.f32 f85, f44, f59; +add.f32 f86, f47, f71; +add.f32 f87, f48, f73; +sub.f32 f88, f47, f71; +sub.f32 f89, f48, f73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f90, f91}, [rd5]; +mul.f32 f94, f79, f91; +fma.rn.f32 f95, f90, f78, f94; +mul.f32 f96, f78, f91; +mul.f32 f97, f90, f79; +sub.f32 f98, f97, f96; +mul.f32 f99, 
f90, f90; +mul.f32 f100, f91, f91; +sub.f32 f101, f99, f100; +mul.f32 f102, f91, f90; +fma.rn.f32 f103, f91, f90, f102; +mul.f32 f104, f83, f103; +fma.rn.f32 f105, f101, f82, f104; +mul.f32 f106, f82, f103; +mul.f32 f107, f101, f83; +sub.f32 f108, f107, f106; +mul.f32 f109, f90, f101; +mul.f32 f110, f91, f103; +sub.f32 f111, f109, f110; +mul.f32 f112, f90, f103; +fma.rn.f32 f113, f91, f101, f112; +mul.f32 f114, f87, f113; +fma.rn.f32 f115, f111, f86, f114; +mul.f32 f116, f86, f113; +mul.f32 f117, f111, f87; +sub.f32 f118, f117, f116; +mul.f32 f119, f90, f111; +mul.f32 f120, f91, f113; +sub.f32 f121, f119, f120; +mul.f32 f122, f90, f113; +fma.rn.f32 f123, f91, f111, f122; +mul.f32 f124, f77, f123; +fma.rn.f32 f125, f121, f76, f124; +mul.f32 f126, f76, f123; +mul.f32 f127, f121, f77; +sub.f32 f128, f127, f126; +mul.f32 f129, f90, f121; +mul.f32 f130, f91, f123; +sub.f32 f131, f129, f130; +mul.f32 f132, f90, f123; +fma.rn.f32 f133, f91, f121, f132; +mul.f32 f134, f81, f133; +fma.rn.f32 f135, f131, f80, f134; +mul.f32 f136, f80, f133; +mul.f32 f137, f131, f81; +sub.f32 f138, f137, f136; +mul.f32 f139, f90, f131; +mul.f32 f140, f91, f133; +sub.f32 f141, f139, f140; +mul.f32 f142, f90, f133; +fma.rn.f32 f143, f91, f131, f142; +mul.f32 f144, f85, f143; +fma.rn.f32 f145, f141, f84, f144; +mul.f32 f146, f84, f143; +mul.f32 f147, f141, f85; +sub.f32 f148, f147, f146; +mul.f32 f149, f90, f141; +mul.f32 f150, f91, f143; +sub.f32 f151, f149, f150; +mul.f32 f152, f90, f143; +fma.rn.f32 f153, f91, f141, f152; +mul.f32 f154, f89, f153; +fma.rn.f32 f155, f151, f88, f154; +mul.f32 f156, f88, f153; +mul.f32 f157, f151, f89; +sub.f32 f158, f157, f156; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -32768; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32736; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f74, f95, f105, f115}; +st.shared.v4.f32 [r12+16], {f125, f135, f145, f155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -28, r12; +ld.shared.f32 f159, [r13]; +ld.shared.f32 f160, 
[r13+4096]; +ld.shared.f32 f161, [r13+8192]; +ld.shared.f32 f162, [r13+12288]; +ld.shared.f32 f163, [r13+16384]; +ld.shared.f32 f164, [r13+20480]; +ld.shared.f32 f165, [r13+24576]; +ld.shared.f32 f166, [r13+28672]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f75, f98, f108, f118}; +st.shared.v4.f32 [r12+16], {f128, f138, f148, f158}; +barrier.sync 0; +ld.shared.f32 f167, [r13]; +ld.shared.f32 f168, [r13+4096]; +ld.shared.f32 f169, [r13+8192]; +ld.shared.f32 f170, [r13+12288]; +ld.shared.f32 f171, [r13+16384]; +ld.shared.f32 f172, [r13+20480]; +ld.shared.f32 f173, [r13+24576]; +ld.shared.f32 f174, [r13+28672]; +add.f32 f175, f159, f163; +add.f32 f176, f167, f171; +sub.f32 f177, f159, f163; +sub.f32 f178, f167, f171; +add.f32 f179, f161, f165; +add.f32 f180, f169, f173; +sub.f32 f181, f161, f165; +sub.f32 f182, f169, f173; +add.f32 f183, f175, f179; +add.f32 f184, f176, f180; +sub.f32 f185, f175, f179; +sub.f32 f186, f176, f180; +sub.f32 f187, f177, f182; +add.f32 f188, f178, f181; +add.f32 f189, f177, f182; +sub.f32 f190, f178, f181; +add.f32 f191, f160, f164; +add.f32 f192, f168, f172; +sub.f32 f193, f160, f164; +sub.f32 f194, f168, f172; +add.f32 f195, f162, f166; +add.f32 f196, f170, f174; +sub.f32 f197, f162, f166; +sub.f32 f198, f170, f174; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +mul.f32 f207, f203, 0f3F3504F3; +mul.f32 f208, f204, 0f3F3504F3; +sub.f32 f209, f207, f208; +add.f32 f210, f207, f208; +mul.f32 f211, f205, 0fBF3504F3; +mul.f32 f212, f206, 0f3F3504F3; +sub.f32 f213, f211, f212; +mul.f32 f214, f206, 0fBF3504F3; +fma.rn.f32 f215, f205, 0f3F3504F3, f214; +add.f32 f216, f183, f199; +add.f32 f217, f184, f200; +sub.f32 f218, f183, f199; +sub.f32 f219, f184, f200; +add.f32 f220, f187, f209; +add.f32 f221, f188, f210; +sub.f32 f222, f187, f209; +sub.f32 f223, f188, f210; +sub.f32 f224, 
f185, f202; +add.f32 f225, f186, f201; +add.f32 f226, f185, f202; +sub.f32 f227, f186, f201; +add.f32 f228, f189, f213; +add.f32 f229, f190, f215; +sub.f32 f230, f189, f213; +sub.f32 f231, f190, f215; +and.b32 r14, r5, 1016; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f232, f233}, [rd8]; +mul.f32 f236, f221, f233; +fma.rn.f32 f237, f232, f220, f236; +mul.f32 f238, f220, f233; +mul.f32 f239, f232, f221; +sub.f32 f240, f239, f238; +mul.f32 f241, f232, f232; +mul.f32 f242, f233, f233; +sub.f32 f243, f241, f242; +mul.f32 f244, f233, f232; +fma.rn.f32 f245, f233, f232, f244; +mul.f32 f246, f225, f245; +fma.rn.f32 f247, f243, f224, f246; +mul.f32 f248, f224, f245; +mul.f32 f249, f243, f225; +sub.f32 f250, f249, f248; +mul.f32 f251, f232, f243; +mul.f32 f252, f233, f245; +sub.f32 f253, f251, f252; +mul.f32 f254, f232, f245; +fma.rn.f32 f255, f233, f243, f254; +mul.f32 f256, f229, f255; +fma.rn.f32 f257, f253, f228, f256; +mul.f32 f258, f228, f255; +mul.f32 f259, f253, f229; +sub.f32 f260, f259, f258; +mul.f32 f261, f232, f253; +mul.f32 f262, f233, f255; +sub.f32 f263, f261, f262; +mul.f32 f264, f232, f255; +fma.rn.f32 f265, f233, f253, f264; +mul.f32 f266, f219, f265; +fma.rn.f32 f267, f263, f218, f266; +mul.f32 f268, f218, f265; +mul.f32 f269, f263, f219; +sub.f32 f270, f269, f268; +mul.f32 f271, f232, f263; +mul.f32 f272, f233, f265; +sub.f32 f273, f271, f272; +mul.f32 f274, f232, f265; +fma.rn.f32 f275, f233, f263, f274; +mul.f32 f276, f223, f275; +fma.rn.f32 f277, f273, f222, f276; +mul.f32 f278, f222, f275; +mul.f32 f279, f273, f223; +sub.f32 f280, f279, f278; +mul.f32 f281, f232, f273; +mul.f32 f282, f233, f275; +sub.f32 f283, f281, f282; +mul.f32 f284, f232, f275; +fma.rn.f32 f285, f233, f273, f284; +mul.f32 f286, f227, f285; +fma.rn.f32 f287, f283, f226, f286; +mul.f32 f288, f226, f285; +mul.f32 f289, f283, f227; +sub.f32 f290, f289, f288; +mul.f32 f291, f232, f283; +mul.f32 f292, f233, f285; +sub.f32 f293, f291, f292; 
+mul.f32 f294, f232, f285; +fma.rn.f32 f295, f233, f283, f294; +mul.f32 f296, f231, f295; +fma.rn.f32 f297, f293, f230, f296; +mul.f32 f298, f230, f295; +mul.f32 f299, f293, f231; +sub.f32 f300, f299, f298; +shl.b32 r15, r5, 2; +and.b32 r16, r15, 28; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 32512; +add.s32 r19, r17, r18; +st.shared.f32 [r19], f216; +st.shared.f32 [r19+32], f237; +st.shared.f32 [r19+64], f247; +st.shared.f32 [r19+96], f257; +st.shared.f32 [r19+128], f267; +st.shared.f32 [r19+160], f277; +st.shared.f32 [r19+192], f287; +st.shared.f32 [r19+224], f297; +barrier.sync 0; +mad.lo.s32 r20, r14, -28, r19; +ld.shared.f32 f301, [r20]; +ld.shared.f32 f302, [r20+4096]; +ld.shared.f32 f303, [r20+8192]; +ld.shared.f32 f304, [r20+12288]; +ld.shared.f32 f305, [r20+16384]; +ld.shared.f32 f306, [r20+20480]; +ld.shared.f32 f307, [r20+24576]; +ld.shared.f32 f308, [r20+28672]; +barrier.sync 0; +st.shared.f32 [r19], f217; +st.shared.f32 [r19+32], f240; +st.shared.f32 [r19+64], f250; +st.shared.f32 [r19+96], f260; +st.shared.f32 [r19+128], f270; +st.shared.f32 [r19+160], f280; +st.shared.f32 [r19+192], f290; +st.shared.f32 [r19+224], f300; +barrier.sync 0; +ld.shared.f32 f309, [r20]; +ld.shared.f32 f310, [r20+4096]; +ld.shared.f32 f311, [r20+8192]; +ld.shared.f32 f312, [r20+12288]; +ld.shared.f32 f313, [r20+16384]; +ld.shared.f32 f314, [r20+20480]; +ld.shared.f32 f315, [r20+24576]; +ld.shared.f32 f316, [r20+28672]; +add.f32 f317, f301, f305; +add.f32 f318, f309, f313; +sub.f32 f319, f301, f305; +sub.f32 f320, f309, f313; +add.f32 f321, f303, f307; +add.f32 f322, f311, f315; +sub.f32 f323, f303, f307; +sub.f32 f324, f311, f315; +add.f32 f325, f317, f321; +add.f32 f326, f318, f322; +sub.f32 f327, f317, f321; +sub.f32 f328, f318, f322; +sub.f32 f329, f319, f324; +add.f32 f330, f320, f323; +add.f32 f331, f319, f324; +sub.f32 f332, f320, f323; +add.f32 f333, f302, f306; +add.f32 f334, f310, f314; +sub.f32 f335, f302, f306; +sub.f32 f336, f310, f314; +add.f32 
f337, f304, f308; +add.f32 f338, f312, f316; +sub.f32 f339, f304, f308; +sub.f32 f340, f312, f316; +add.f32 f341, f333, f337; +add.f32 f342, f334, f338; +sub.f32 f343, f333, f337; +sub.f32 f344, f334, f338; +sub.f32 f345, f335, f340; +add.f32 f346, f336, f339; +add.f32 f347, f335, f340; +sub.f32 f348, f336, f339; +mul.f32 f349, f345, 0f3F3504F3; +mul.f32 f350, f346, 0f3F3504F3; +sub.f32 f351, f349, f350; +add.f32 f352, f349, f350; +mul.f32 f353, f347, 0fBF3504F3; +mul.f32 f354, f348, 0f3F3504F3; +sub.f32 f355, f353, f354; +mul.f32 f356, f348, 0fBF3504F3; +fma.rn.f32 f357, f347, 0f3F3504F3, f356; +add.f32 f358, f325, f341; +add.f32 f359, f326, f342; +sub.f32 f360, f325, f341; +sub.f32 f361, f326, f342; +add.f32 f362, f329, f351; +add.f32 f363, f330, f352; +sub.f32 f364, f329, f351; +sub.f32 f365, f330, f352; +sub.f32 f366, f327, f344; +add.f32 f367, f328, f343; +add.f32 f368, f327, f344; +sub.f32 f369, f328, f343; +add.f32 f370, f331, f355; +add.f32 f371, f332, f357; +sub.f32 f372, f331, f355; +sub.f32 f373, f332, f357; +and.b32 r21, r5, 960; +bfe.u32 r22, r5, 6, 4; +mul.wide.u32 rd9, r22, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f374, f375}, [rd11]; +mul.f32 f378, f363, f375; +fma.rn.f32 f379, f374, f362, f378; +mul.f32 f380, f362, f375; +mul.f32 f381, f374, f363; +sub.f32 f382, f381, f380; +mul.f32 f383, f374, f374; +mul.f32 f384, f375, f375; +sub.f32 f385, f383, f384; +mul.f32 f386, f375, f374; +fma.rn.f32 f387, f375, f374, f386; +mul.f32 f388, f367, f387; +fma.rn.f32 f389, f385, f366, f388; +mul.f32 f390, f366, f387; +mul.f32 f391, f385, f367; +sub.f32 f392, f391, f390; +mul.f32 f393, f374, f385; +mul.f32 f394, f375, f387; +sub.f32 f395, f393, f394; +mul.f32 f396, f374, f387; +fma.rn.f32 f397, f375, f385, f396; +mul.f32 f398, f371, f397; +fma.rn.f32 f399, f395, f370, f398; +mul.f32 f400, f370, f397; +mul.f32 f401, f395, f371; +sub.f32 f402, f401, f400; +mul.f32 f403, f374, f395; +mul.f32 f404, f375, f397; +sub.f32 f405, f403, f404; 
+mul.f32 f406, f374, f397; +fma.rn.f32 f407, f375, f395, f406; +mul.f32 f408, f361, f407; +fma.rn.f32 f409, f405, f360, f408; +mul.f32 f410, f360, f407; +mul.f32 f411, f405, f361; +sub.f32 f412, f411, f410; +mul.f32 f413, f374, f405; +mul.f32 f414, f375, f407; +sub.f32 f415, f413, f414; +mul.f32 f416, f374, f407; +fma.rn.f32 f417, f375, f405, f416; +mul.f32 f418, f365, f417; +fma.rn.f32 f419, f415, f364, f418; +mul.f32 f420, f364, f417; +mul.f32 f421, f415, f365; +sub.f32 f422, f421, f420; +mul.f32 f423, f374, f415; +mul.f32 f424, f375, f417; +sub.f32 f425, f423, f424; +mul.f32 f426, f374, f417; +fma.rn.f32 f427, f375, f415, f426; +mul.f32 f428, f369, f427; +fma.rn.f32 f429, f425, f368, f428; +mul.f32 f430, f368, f427; +mul.f32 f431, f425, f369; +sub.f32 f432, f431, f430; +mul.f32 f433, f374, f425; +mul.f32 f434, f375, f427; +sub.f32 f435, f433, f434; +mul.f32 f436, f374, f427; +fma.rn.f32 f437, f375, f425, f436; +mul.f32 f438, f373, f437; +fma.rn.f32 f439, f435, f372, f438; +mul.f32 f440, f372, f437; +mul.f32 f441, f435, f373; +sub.f32 f442, f441, f440; +and.b32 r23, r15, 252; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 30720; +add.s32 r26, r24, r25; +st.shared.f32 [r26], f358; +st.shared.f32 [r26+256], f379; +st.shared.f32 [r26+512], f389; +st.shared.f32 [r26+768], f399; +st.shared.f32 [r26+1024], f409; +st.shared.f32 [r26+1280], f419; +st.shared.f32 [r26+1536], f429; +st.shared.f32 [r26+1792], f439; +barrier.sync 0; +mad.lo.s32 r27, r21, -28, r26; +ld.shared.f32 f443, [r27]; +ld.shared.f32 f444, [r27+4096]; +ld.shared.f32 f445, [r27+8192]; +ld.shared.f32 f446, [r27+12288]; +ld.shared.f32 f447, [r27+16384]; +ld.shared.f32 f448, [r27+20480]; +ld.shared.f32 f449, [r27+24576]; +ld.shared.f32 f450, [r27+28672]; +barrier.sync 0; +st.shared.f32 [r26], f359; +st.shared.f32 [r26+256], f382; +st.shared.f32 [r26+512], f392; +st.shared.f32 [r26+768], f402; +st.shared.f32 [r26+1024], f412; +st.shared.f32 [r26+1280], f422; +st.shared.f32 [r26+1536], f432; 
+st.shared.f32 [r26+1792], f442; +barrier.sync 0; +ld.shared.f32 f451, [r27]; +ld.shared.f32 f452, [r27+4096]; +ld.shared.f32 f453, [r27+8192]; +ld.shared.f32 f454, [r27+12288]; +ld.shared.f32 f455, [r27+16384]; +ld.shared.f32 f456, [r27+20480]; +ld.shared.f32 f457, [r27+24576]; +ld.shared.f32 f458, [r27+28672]; +add.f32 f459, f443, f447; +add.f32 f460, f451, f455; +sub.f32 f461, f443, f447; +sub.f32 f462, f451, f455; +add.f32 f463, f445, f449; +add.f32 f464, f453, f457; +sub.f32 f465, f445, f449; +sub.f32 f466, f453, f457; +add.f32 f467, f459, f463; +add.f32 f468, f460, f464; +sub.f32 f469, f459, f463; +sub.f32 f470, f460, f464; +sub.f32 f471, f461, f466; +add.f32 f472, f462, f465; +add.f32 f473, f461, f466; +sub.f32 f474, f462, f465; +add.f32 f475, f444, f448; +add.f32 f476, f452, f456; +sub.f32 f477, f444, f448; +sub.f32 f478, f452, f456; +add.f32 f479, f446, f450; +add.f32 f480, f454, f458; +sub.f32 f481, f446, f450; +sub.f32 f482, f454, f458; +add.f32 f483, f475, f479; +add.f32 f484, f476, f480; +sub.f32 f485, f475, f479; +sub.f32 f486, f476, f480; +sub.f32 f487, f477, f482; +add.f32 f488, f478, f481; +add.f32 f489, f477, f482; +sub.f32 f490, f478, f481; +mul.f32 f491, f487, 0f3F3504F3; +mul.f32 f492, f488, 0f3F3504F3; +sub.f32 f493, f491, f492; +add.f32 f494, f491, f492; +mul.f32 f495, f489, 0fBF3504F3; +mul.f32 f496, f490, 0f3F3504F3; +sub.f32 f497, f495, f496; +mul.f32 f498, f490, 0fBF3504F3; +fma.rn.f32 f499, f489, 0f3F3504F3, f498; +add.f32 f500, f467, f483; +add.f32 f501, f468, f484; +sub.f32 f502, f467, f483; +sub.f32 f503, f468, f484; +add.f32 f504, f471, f493; +add.f32 f505, f472, f494; +sub.f32 f506, f471, f493; +sub.f32 f507, f472, f494; +sub.f32 f508, f469, f486; +add.f32 f509, f470, f485; +add.f32 f510, f469, f486; +sub.f32 f511, f470, f485; +add.f32 f512, f473, f497; +add.f32 f513, f474, f499; +sub.f32 f514, f473, f497; +sub.f32 f515, f474, f499; +and.b32 r28, r5, 512; +bfe.u32 r29, r5, 9, 1; +mul.wide.u32 rd12, r29, 8; +mov.u64 rd13, %20; 
+add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f516, f517}, [rd14]; +mul.f32 f520, f505, f517; +fma.rn.f32 f521, f516, f504, f520; +mul.f32 f522, f504, f517; +mul.f32 f523, f516, f505; +sub.f32 f524, f523, f522; +mul.f32 f525, f516, f516; +mul.f32 f526, f517, f517; +sub.f32 f527, f525, f526; +mul.f32 f528, f517, f516; +fma.rn.f32 f529, f517, f516, f528; +mul.f32 f530, f509, f529; +fma.rn.f32 f531, f527, f508, f530; +mul.f32 f532, f508, f529; +mul.f32 f533, f527, f509; +sub.f32 f534, f533, f532; +mul.f32 f535, f516, f527; +mul.f32 f536, f517, f529; +sub.f32 f537, f535, f536; +mul.f32 f538, f516, f529; +fma.rn.f32 f539, f517, f527, f538; +mul.f32 f540, f513, f539; +fma.rn.f32 f541, f537, f512, f540; +mul.f32 f542, f512, f539; +mul.f32 f543, f537, f513; +sub.f32 f544, f543, f542; +mul.f32 f545, f516, f537; +mul.f32 f546, f517, f539; +sub.f32 f547, f545, f546; +mul.f32 f548, f516, f539; +fma.rn.f32 f549, f517, f537, f548; +mul.f32 f550, f503, f549; +fma.rn.f32 f551, f547, f502, f550; +mul.f32 f552, f502, f549; +mul.f32 f553, f547, f503; +sub.f32 f554, f553, f552; +mul.f32 f555, f516, f547; +mul.f32 f556, f517, f549; +sub.f32 f557, f555, f556; +mul.f32 f558, f516, f549; +fma.rn.f32 f559, f517, f547, f558; +mul.f32 f560, f507, f559; +fma.rn.f32 f561, f557, f506, f560; +mul.f32 f562, f506, f559; +mul.f32 f563, f557, f507; +sub.f32 f564, f563, f562; +mul.f32 f565, f516, f557; +mul.f32 f566, f517, f559; +sub.f32 f567, f565, f566; +mul.f32 f568, f516, f559; +fma.rn.f32 f569, f517, f557, f568; +mul.f32 f570, f511, f569; +fma.rn.f32 f571, f567, f510, f570; +mul.f32 f572, f510, f569; +mul.f32 f573, f567, f511; +sub.f32 f574, f573, f572; +mul.f32 f575, f516, f567; +mul.f32 f576, f517, f569; +sub.f32 f577, f575, f576; +mul.f32 f578, f516, f569; +fma.rn.f32 f579, f517, f567, f578; +mul.f32 f580, f515, f579; +fma.rn.f32 f581, f577, f514, f580; +mul.f32 f582, f514, f579; +mul.f32 f583, f577, f515; +sub.f32 f584, f583, f582; +and.b32 r30, r15, 2044; +add.s32 r31, r10, r30; 
+barrier.sync 0; +and.b32 r32, r8, 16384; +add.s32 r33, r31, r32; +st.shared.f32 [r33], f500; +st.shared.f32 [r33+2048], f521; +st.shared.f32 [r33+4096], f531; +st.shared.f32 [r33+6144], f541; +st.shared.f32 [r33+8192], f551; +st.shared.f32 [r33+10240], f561; +st.shared.f32 [r33+12288], f571; +st.shared.f32 [r33+14336], f581; +barrier.sync 0; +mad.lo.s32 r34, r28, -28, r33; +ld.shared.f32 f585, [r34]; +ld.shared.f32 f586, [r34+4096]; +ld.shared.f32 f587, [r34+8192]; +ld.shared.f32 f588, [r34+12288]; +ld.shared.f32 f589, [r34+16384]; +ld.shared.f32 f590, [r34+20480]; +ld.shared.f32 f591, [r34+24576]; +ld.shared.f32 f592, [r34+28672]; +barrier.sync 0; +st.shared.f32 [r33], f501; +st.shared.f32 [r33+2048], f524; +st.shared.f32 [r33+4096], f534; +st.shared.f32 [r33+6144], f544; +st.shared.f32 [r33+8192], f554; +st.shared.f32 [r33+10240], f564; +st.shared.f32 [r33+12288], f574; +st.shared.f32 [r33+14336], f584; +barrier.sync 0; +ld.shared.f32 f593, [r34]; +ld.shared.f32 f594, [r34+4096]; +ld.shared.f32 f595, [r34+8192]; +ld.shared.f32 f596, [r34+12288]; +ld.shared.f32 f597, [r34+16384]; +ld.shared.f32 f598, [r34+20480]; +ld.shared.f32 f599, [r34+24576]; +ld.shared.f32 f600, [r34+28672]; +add.f32 %0, f585, f589; +add.f32 %1, f593, f597; +add.f32 %2, f586, f590; +add.f32 %3, f594, f598; +add.f32 %4, f587, f591; +add.f32 %5, f595, f599; +add.f32 %6, f588, f592; +add.f32 %7, f596, f600; +sub.f32 %8, f585, f589; +sub.f32 %9, f593, f597; +sub.f32 %10, f586, f590; +sub.f32 %11, f594, f598; +sub.f32 %12, f587, f591; +sub.f32 %13, f595, f599; +sub.f32 %14, f588, f592; +sub.f32 %15, f596, f600; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_8192), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), 
"f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<314, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<681>; +.reg .b32 r<34>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f33, %21, %31; +add.f32 f34, %22, %33; +sub.f32 f35, %21, %31; +sub.f32 f36, %22, %33; +add.f32 f37, %26, %37; +add.f32 f38, %28, %38; +sub.f32 f39, %26, %37; +sub.f32 f40, %28, %38; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %23, %34; +add.f32 f50, %25, %36; +sub.f32 f51, %23, %34; +sub.f32 f52, %25, %36; +add.f32 f53, %29, %39; +add.f32 f54, %30, %40; +sub.f32 f55, %29, %39; +sub.f32 f56, %30, %40; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +sub.f32 f74, f41, f57; +sub.f32 f75, f42, f58; +add.f32 f76, f45, f67; +add.f32 f77, f46, f68; +sub.f32 f78, f45, f67; +sub.f32 f79, f46, f68; +sub.f32 f80, f43, f60; +add.f32 f81, f44, f59; +add.f32 f82, f43, f60; +sub.f32 f83, f44, f59; +add.f32 f84, f47, f71; +add.f32 f85, f48, f73; +sub.f32 f86, f47, f71; +sub.f32 f87, f48, f73; 
+and.b32 r6, r5, 1023; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -65536; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8184; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f88, f89}, [rd5]; +mul.f32 f92, f77, f89; +mul.f32 f93, f76, f89; +mul.f32 f94, f88, f77; +mul.f32 f95, f88, f88; +mul.f32 f96, f89, f89; +sub.f32 f97, f95, f96; +mul.f32 f98, f89, f88; +fma.rn.f32 f99, f89, f88, f98; +mul.f32 f100, f81, f99; +mul.f32 f101, f80, f99; +mul.f32 f102, f97, f81; +mul.f32 f103, f88, f97; +mul.f32 f104, f89, f99; +sub.f32 f105, f103, f104; +mul.f32 f106, f88, f99; +fma.rn.f32 f107, f89, f97, f106; +mul.f32 f108, f85, f107; +mul.f32 f109, f84, f107; +mul.f32 f110, f105, f85; +mul.f32 f111, f88, f105; +mul.f32 f112, f89, f107; +sub.f32 f113, f111, f112; +mul.f32 f114, f88, f107; +fma.rn.f32 f115, f89, f105, f114; +mul.f32 f116, f75, f115; +mul.f32 f117, f74, f115; +mul.f32 f118, f113, f75; +mul.f32 f119, f88, f113; +mul.f32 f120, f89, f115; +sub.f32 f121, f119, f120; +mul.f32 f122, f88, f115; +fma.rn.f32 f123, f89, f113, f122; +mul.f32 f124, f79, f123; +mul.f32 f125, f78, f123; +mul.f32 f126, f121, f79; +mul.f32 f127, f88, f121; +mul.f32 f128, f89, f123; +sub.f32 f129, f127, f128; +mul.f32 f130, f88, f123; +fma.rn.f32 f131, f89, f121, f130; +mul.f32 f132, f83, f131; +mul.f32 f133, f82, f131; +mul.f32 f134, f129, f83; +mul.f32 f135, f88, f129; +mul.f32 f136, f89, f131; +sub.f32 f137, f135, f136; +mul.f32 f138, f88, f131; +fma.rn.f32 f139, f89, f129, f138; +mul.f32 f140, f87, f139; +mul.f32 f141, f86, f139; +mul.f32 f142, f137, f87; +barrier.sync 0; +and.b32 r11, r7, 65472; +add.s32 r12, r9, r11; +add.f32 f143, f42, f58; +add.f32 f144, f41, f57; +fma.rn.f32 f145, f88, f76, f92; +sub.f32 f146, f94, f93; +st.shared.v4.f32 [r12], {f144, f143, f145, f146}; +fma.rn.f32 f147, f97, f80, f100; +sub.f32 f148, f102, f101; +sub.f32 f149, f110, f109; +fma.rn.f32 f150, f105, f84, f108; +st.shared.v4.f32 [r12+16], {f147, f148, f150, f149}; 
+fma.rn.f32 f151, f113, f74, f116; +sub.f32 f152, f118, f117; +fma.rn.f32 f153, f121, f78, f124; +sub.f32 f154, f126, f125; +st.shared.v4.f32 [r12+32], {f151, f152, f153, f154}; +fma.rn.f32 f155, f129, f82, f132; +sub.f32 f156, f134, f133; +fma.rn.f32 f157, f137, f86, f140; +sub.f32 f158, f142, f141; +st.shared.v4.f32 [r12+48], {f155, f156, f157, f158}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.v2.f32 {f159, f160}, [r13]; +ld.shared.v2.f32 {f163, f164}, [r13+8192]; +ld.shared.v2.f32 {f167, f168}, [r13+16384]; +ld.shared.v2.f32 {f171, f172}, [r13+24576]; +ld.shared.v2.f32 {f175, f176}, [r13+32768]; +ld.shared.v2.f32 {f179, f180}, [r13+40960]; +ld.shared.v2.f32 {f183, f184}, [r13+49152]; +ld.shared.v2.f32 {f187, f188}, [r13+57344]; +add.f32 f191, f159, f175; +add.f32 f192, f160, f176; +sub.f32 f193, f159, f175; +sub.f32 f194, f160, f176; +add.f32 f195, f167, f183; +add.f32 f196, f168, f184; +sub.f32 f197, f167, f183; +sub.f32 f198, f168, f184; +add.f32 f199, f191, f195; +add.f32 f200, f192, f196; +sub.f32 f201, f191, f195; +sub.f32 f202, f192, f196; +sub.f32 f203, f193, f198; +add.f32 f204, f194, f197; +add.f32 f205, f193, f198; +sub.f32 f206, f194, f197; +add.f32 f207, f163, f179; +add.f32 f208, f164, f180; +sub.f32 f209, f163, f179; +sub.f32 f210, f164, f180; +add.f32 f211, f171, f187; +add.f32 f212, f172, f188; +sub.f32 f213, f171, f187; +sub.f32 f214, f172, f188; +add.f32 f215, f207, f211; +add.f32 f216, f208, f212; +sub.f32 f217, f207, f211; +sub.f32 f218, f208, f212; +sub.f32 f219, f209, f214; +add.f32 f220, f210, f213; +add.f32 f221, f209, f214; +sub.f32 f222, f210, f213; +mul.f32 f223, f219, 0f3F3504F3; +mul.f32 f224, f220, 0f3F3504F3; +sub.f32 f225, f223, f224; +add.f32 f226, f223, f224; +mul.f32 f227, f221, 0fBF3504F3; +mul.f32 f228, f222, 0f3F3504F3; +sub.f32 f229, f227, f228; +mul.f32 f230, f222, 0fBF3504F3; +fma.rn.f32 f231, f221, 0f3F3504F3, f230; +sub.f32 f232, f199, f215; +sub.f32 f233, f200, f216; +add.f32 f234, f203, f225; +add.f32 
f235, f204, f226; +sub.f32 f236, f203, f225; +sub.f32 f237, f204, f226; +sub.f32 f238, f201, f218; +add.f32 f239, f202, f217; +add.f32 f240, f201, f218; +sub.f32 f241, f202, f217; +add.f32 f242, f205, f229; +add.f32 f243, f206, f231; +sub.f32 f244, f205, f229; +sub.f32 f245, f206, f231; +and.b32 r14, r5, 1016; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f246, f247}, [rd8]; +mul.f32 f250, f235, f247; +mul.f32 f251, f234, f247; +mul.f32 f252, f246, f235; +mul.f32 f253, f246, f246; +mul.f32 f254, f247, f247; +sub.f32 f255, f253, f254; +mul.f32 f256, f247, f246; +fma.rn.f32 f257, f247, f246, f256; +mul.f32 f258, f239, f257; +mul.f32 f259, f238, f257; +mul.f32 f260, f255, f239; +mul.f32 f261, f246, f255; +mul.f32 f262, f247, f257; +sub.f32 f263, f261, f262; +mul.f32 f264, f246, f257; +fma.rn.f32 f265, f247, f255, f264; +mul.f32 f266, f243, f265; +mul.f32 f267, f242, f265; +mul.f32 f268, f263, f243; +mul.f32 f269, f246, f263; +mul.f32 f270, f247, f265; +sub.f32 f271, f269, f270; +mul.f32 f272, f246, f265; +fma.rn.f32 f273, f247, f263, f272; +mul.f32 f274, f233, f273; +mul.f32 f275, f232, f273; +mul.f32 f276, f271, f233; +mul.f32 f277, f246, f271; +mul.f32 f278, f247, f273; +sub.f32 f279, f277, f278; +mul.f32 f280, f246, f273; +fma.rn.f32 f281, f247, f271, f280; +mul.f32 f282, f237, f281; +mul.f32 f283, f236, f281; +mul.f32 f284, f279, f237; +mul.f32 f285, f246, f279; +mul.f32 f286, f247, f281; +sub.f32 f287, f285, f286; +mul.f32 f288, f246, f281; +fma.rn.f32 f289, f247, f279, f288; +mul.f32 f290, f241, f289; +mul.f32 f291, f240, f289; +mul.f32 f292, f287, f241; +mul.f32 f293, f246, f287; +mul.f32 f294, f247, f289; +sub.f32 f295, f293, f294; +mul.f32 f296, f246, f289; +fma.rn.f32 f297, f247, f287, f296; +mul.f32 f298, f245, f297; +mul.f32 f299, f244, f297; +mul.f32 f300, f295, f245; +and.b32 r15, r10, 56; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 65024; +add.s32 r18, r16, r17; +add.f32 f301, f200, f216; +add.f32 
f302, f199, f215; +st.shared.v2.f32 [r18], {f302, f301}; +fma.rn.f32 f303, f246, f234, f250; +sub.f32 f304, f252, f251; +st.shared.v2.f32 [r18+64], {f303, f304}; +fma.rn.f32 f305, f255, f238, f258; +sub.f32 f306, f260, f259; +st.shared.v2.f32 [r18+128], {f305, f306}; +fma.rn.f32 f307, f263, f242, f266; +sub.f32 f308, f268, f267; +st.shared.v2.f32 [r18+192], {f307, f308}; +sub.f32 f309, f276, f275; +fma.rn.f32 f310, f271, f232, f274; +st.shared.v2.f32 [r18+256], {f310, f309}; +fma.rn.f32 f311, f279, f236, f282; +sub.f32 f312, f284, f283; +st.shared.v2.f32 [r18+320], {f311, f312}; +fma.rn.f32 f313, f287, f240, f290; +sub.f32 f314, f292, f291; +st.shared.v2.f32 [r18+384], {f313, f314}; +fma.rn.f32 f315, f295, f244, f298; +sub.f32 f316, f300, f299; +st.shared.v2.f32 [r18+448], {f315, f316}; +barrier.sync 0; +mad.lo.s32 r19, r14, -56, r18; +ld.shared.v2.f32 {f317, f318}, [r19]; +ld.shared.v2.f32 {f321, f322}, [r19+8192]; +ld.shared.v2.f32 {f325, f326}, [r19+16384]; +ld.shared.v2.f32 {f329, f330}, [r19+24576]; +ld.shared.v2.f32 {f333, f334}, [r19+32768]; +ld.shared.v2.f32 {f337, f338}, [r19+40960]; +ld.shared.v2.f32 {f341, f342}, [r19+49152]; +ld.shared.v2.f32 {f345, f346}, [r19+57344]; +add.f32 f349, f317, f333; +add.f32 f350, f318, f334; +sub.f32 f351, f317, f333; +sub.f32 f352, f318, f334; +add.f32 f353, f325, f341; +add.f32 f354, f326, f342; +sub.f32 f355, f325, f341; +sub.f32 f356, f326, f342; +add.f32 f357, f349, f353; +add.f32 f358, f350, f354; +sub.f32 f359, f349, f353; +sub.f32 f360, f350, f354; +sub.f32 f361, f351, f356; +add.f32 f362, f352, f355; +add.f32 f363, f351, f356; +sub.f32 f364, f352, f355; +add.f32 f365, f321, f337; +add.f32 f366, f322, f338; +sub.f32 f367, f321, f337; +sub.f32 f368, f322, f338; +add.f32 f369, f329, f345; +add.f32 f370, f330, f346; +sub.f32 f371, f329, f345; +sub.f32 f372, f330, f346; +add.f32 f373, f365, f369; +add.f32 f374, f366, f370; +sub.f32 f375, f365, f369; +sub.f32 f376, f366, f370; +sub.f32 f377, f367, f372; +add.f32 f378, 
f368, f371; +add.f32 f379, f367, f372; +sub.f32 f380, f368, f371; +mul.f32 f381, f377, 0f3F3504F3; +mul.f32 f382, f378, 0f3F3504F3; +sub.f32 f383, f381, f382; +add.f32 f384, f381, f382; +mul.f32 f385, f379, 0fBF3504F3; +mul.f32 f386, f380, 0f3F3504F3; +sub.f32 f387, f385, f386; +mul.f32 f388, f380, 0fBF3504F3; +fma.rn.f32 f389, f379, 0f3F3504F3, f388; +sub.f32 f390, f357, f373; +sub.f32 f391, f358, f374; +add.f32 f392, f361, f383; +add.f32 f393, f362, f384; +sub.f32 f394, f361, f383; +sub.f32 f395, f362, f384; +sub.f32 f396, f359, f376; +add.f32 f397, f360, f375; +add.f32 f398, f359, f376; +sub.f32 f399, f360, f375; +add.f32 f400, f363, f387; +add.f32 f401, f364, f389; +sub.f32 f402, f363, f387; +sub.f32 f403, f364, f389; +and.b32 r20, r5, 960; +bfe.u32 r21, r5, 6, 4; +mul.wide.u32 rd9, r21, 8; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f404, f405}, [rd11]; +mul.f32 f408, f393, f405; +mul.f32 f409, f392, f405; +mul.f32 f410, f404, f393; +mul.f32 f411, f404, f404; +mul.f32 f412, f405, f405; +sub.f32 f413, f411, f412; +mul.f32 f414, f405, f404; +fma.rn.f32 f415, f405, f404, f414; +mul.f32 f416, f397, f415; +mul.f32 f417, f396, f415; +mul.f32 f418, f413, f397; +mul.f32 f419, f404, f413; +mul.f32 f420, f405, f415; +sub.f32 f421, f419, f420; +mul.f32 f422, f404, f415; +fma.rn.f32 f423, f405, f413, f422; +mul.f32 f424, f401, f423; +mul.f32 f425, f400, f423; +mul.f32 f426, f421, f401; +mul.f32 f427, f404, f421; +mul.f32 f428, f405, f423; +sub.f32 f429, f427, f428; +mul.f32 f430, f404, f423; +fma.rn.f32 f431, f405, f421, f430; +mul.f32 f432, f391, f431; +mul.f32 f433, f390, f431; +mul.f32 f434, f429, f391; +mul.f32 f435, f404, f429; +mul.f32 f436, f405, f431; +sub.f32 f437, f435, f436; +mul.f32 f438, f404, f431; +fma.rn.f32 f439, f405, f429, f438; +mul.f32 f440, f395, f439; +mul.f32 f441, f394, f439; +mul.f32 f442, f437, f395; +mul.f32 f443, f404, f437; +mul.f32 f444, f405, f439; +sub.f32 f445, f443, f444; +mul.f32 f446, f404, f439; +fma.rn.f32 f447, 
f405, f437, f446; +mul.f32 f448, f399, f447; +mul.f32 f449, f398, f447; +mul.f32 f450, f445, f399; +mul.f32 f451, f404, f445; +mul.f32 f452, f405, f447; +sub.f32 f453, f451, f452; +mul.f32 f454, f404, f447; +fma.rn.f32 f455, f405, f445, f454; +mul.f32 f456, f403, f455; +mul.f32 f457, f402, f455; +mul.f32 f458, f453, f403; +and.b32 r22, r10, 504; +add.s32 r23, r9, r22; +barrier.sync 0; +and.b32 r24, r7, 61440; +add.s32 r25, r23, r24; +add.f32 f459, f358, f374; +add.f32 f460, f357, f373; +st.shared.v2.f32 [r25], {f460, f459}; +fma.rn.f32 f461, f404, f392, f408; +sub.f32 f462, f410, f409; +st.shared.v2.f32 [r25+512], {f461, f462}; +fma.rn.f32 f463, f413, f396, f416; +sub.f32 f464, f418, f417; +st.shared.v2.f32 [r25+1024], {f463, f464}; +fma.rn.f32 f465, f421, f400, f424; +sub.f32 f466, f426, f425; +st.shared.v2.f32 [r25+1536], {f465, f466}; +sub.f32 f467, f434, f433; +fma.rn.f32 f468, f429, f390, f432; +st.shared.v2.f32 [r25+2048], {f468, f467}; +fma.rn.f32 f469, f437, f394, f440; +sub.f32 f470, f442, f441; +st.shared.v2.f32 [r25+2560], {f469, f470}; +fma.rn.f32 f471, f445, f398, f448; +sub.f32 f472, f450, f449; +st.shared.v2.f32 [r25+3072], {f471, f472}; +fma.rn.f32 f473, f453, f402, f456; +sub.f32 f474, f458, f457; +st.shared.v2.f32 [r25+3584], {f473, f474}; +barrier.sync 0; +mad.lo.s32 r26, r20, -56, r25; +ld.shared.v2.f32 {f475, f476}, [r26]; +ld.shared.v2.f32 {f479, f480}, [r26+8192]; +ld.shared.v2.f32 {f483, f484}, [r26+16384]; +ld.shared.v2.f32 {f487, f488}, [r26+24576]; +ld.shared.v2.f32 {f491, f492}, [r26+32768]; +ld.shared.v2.f32 {f495, f496}, [r26+40960]; +ld.shared.v2.f32 {f499, f500}, [r26+49152]; +ld.shared.v2.f32 {f503, f504}, [r26+57344]; +add.f32 f507, f475, f491; +add.f32 f508, f476, f492; +sub.f32 f509, f475, f491; +sub.f32 f510, f476, f492; +add.f32 f511, f483, f499; +add.f32 f512, f484, f500; +sub.f32 f513, f483, f499; +sub.f32 f514, f484, f500; +add.f32 f515, f507, f511; +add.f32 f516, f508, f512; +sub.f32 f517, f507, f511; +sub.f32 f518, f508, 
f512; +sub.f32 f519, f509, f514; +add.f32 f520, f510, f513; +add.f32 f521, f509, f514; +sub.f32 f522, f510, f513; +add.f32 f523, f479, f495; +add.f32 f524, f480, f496; +sub.f32 f525, f479, f495; +sub.f32 f526, f480, f496; +add.f32 f527, f487, f503; +add.f32 f528, f488, f504; +sub.f32 f529, f487, f503; +sub.f32 f530, f488, f504; +add.f32 f531, f523, f527; +add.f32 f532, f524, f528; +sub.f32 f533, f523, f527; +sub.f32 f534, f524, f528; +sub.f32 f535, f525, f530; +add.f32 f536, f526, f529; +add.f32 f537, f525, f530; +sub.f32 f538, f526, f529; +mul.f32 f539, f535, 0f3F3504F3; +mul.f32 f540, f536, 0f3F3504F3; +sub.f32 f541, f539, f540; +add.f32 f542, f539, f540; +mul.f32 f543, f537, 0fBF3504F3; +mul.f32 f544, f538, 0f3F3504F3; +sub.f32 f545, f543, f544; +mul.f32 f546, f538, 0fBF3504F3; +fma.rn.f32 f547, f537, 0f3F3504F3, f546; +sub.f32 f548, f515, f531; +sub.f32 f549, f516, f532; +add.f32 f550, f519, f541; +add.f32 f551, f520, f542; +sub.f32 f552, f519, f541; +sub.f32 f553, f520, f542; +sub.f32 f554, f517, f534; +add.f32 f555, f518, f533; +add.f32 f556, f517, f534; +sub.f32 f557, f518, f533; +add.f32 f558, f521, f545; +add.f32 f559, f522, f547; +sub.f32 f560, f521, f545; +sub.f32 f561, f522, f547; +and.b32 r27, r5, 512; +bfe.u32 r28, r5, 9, 1; +mul.wide.u32 rd12, r28, 8; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f32 {f562, f563}, [rd14]; +mul.f32 f566, f551, f563; +mul.f32 f567, f550, f563; +mul.f32 f568, f562, f551; +mul.f32 f569, f562, f562; +mul.f32 f570, f563, f563; +sub.f32 f571, f569, f570; +mul.f32 f572, f563, f562; +fma.rn.f32 f573, f563, f562, f572; +mul.f32 f574, f555, f573; +mul.f32 f575, f554, f573; +mul.f32 f576, f571, f555; +mul.f32 f577, f562, f571; +mul.f32 f578, f563, f573; +sub.f32 f579, f577, f578; +mul.f32 f580, f562, f573; +fma.rn.f32 f581, f563, f571, f580; +mul.f32 f582, f559, f581; +mul.f32 f583, f558, f581; +mul.f32 f584, f579, f559; +mul.f32 f585, f562, f579; +mul.f32 f586, f563, f581; +sub.f32 f587, f585, f586; +mul.f32 
f588, f562, f581; +fma.rn.f32 f589, f563, f579, f588; +mul.f32 f590, f549, f589; +mul.f32 f591, f548, f589; +mul.f32 f592, f587, f549; +mul.f32 f593, f562, f587; +mul.f32 f594, f563, f589; +sub.f32 f595, f593, f594; +mul.f32 f596, f562, f589; +fma.rn.f32 f597, f563, f587, f596; +mul.f32 f598, f553, f597; +mul.f32 f599, f552, f597; +mul.f32 f600, f595, f553; +mul.f32 f601, f562, f595; +mul.f32 f602, f563, f597; +sub.f32 f603, f601, f602; +mul.f32 f604, f562, f597; +fma.rn.f32 f605, f563, f595, f604; +mul.f32 f606, f557, f605; +mul.f32 f607, f556, f605; +mul.f32 f608, f603, f557; +mul.f32 f609, f562, f603; +mul.f32 f610, f563, f605; +sub.f32 f611, f609, f610; +mul.f32 f612, f562, f605; +fma.rn.f32 f613, f563, f603, f612; +mul.f32 f614, f561, f613; +mul.f32 f615, f560, f613; +mul.f32 f616, f611, f561; +and.b32 r29, r10, 4088; +add.s32 r30, r9, r29; +barrier.sync 0; +and.b32 r31, r7, 32768; +add.s32 r32, r30, r31; +add.f32 f617, f516, f532; +add.f32 f618, f515, f531; +st.shared.v2.f32 [r32], {f618, f617}; +fma.rn.f32 f619, f562, f550, f566; +sub.f32 f620, f568, f567; +st.shared.v2.f32 [r32+4096], {f619, f620}; +fma.rn.f32 f621, f571, f554, f574; +sub.f32 f622, f576, f575; +st.shared.v2.f32 [r32+8192], {f621, f622}; +fma.rn.f32 f623, f579, f558, f582; +sub.f32 f624, f584, f583; +st.shared.v2.f32 [r32+12288], {f623, f624}; +sub.f32 f625, f592, f591; +fma.rn.f32 f626, f587, f548, f590; +st.shared.v2.f32 [r32+16384], {f626, f625}; +fma.rn.f32 f627, f595, f552, f598; +sub.f32 f628, f600, f599; +st.shared.v2.f32 [r32+20480], {f627, f628}; +fma.rn.f32 f629, f603, f556, f606; +sub.f32 f630, f608, f607; +st.shared.v2.f32 [r32+24576], {f629, f630}; +fma.rn.f32 f631, f611, f560, f614; +sub.f32 f632, f616, f615; +st.shared.v2.f32 [r32+28672], {f631, f632}; +barrier.sync 0; +mad.lo.s32 r33, r27, -56, r32; +ld.shared.v2.f32 {f633, f634}, [r33]; +ld.shared.v2.f32 {f637, f638}, [r33+8192]; +ld.shared.v2.f32 {f641, f642}, [r33+16384]; +ld.shared.v2.f32 {f645, f646}, [r33+24576]; 
+ld.shared.v2.f32 {f649, f650}, [r33+32768]; +ld.shared.v2.f32 {f653, f654}, [r33+40960]; +ld.shared.v2.f32 {f657, f658}, [r33+49152]; +ld.shared.v2.f32 {f661, f662}, [r33+57344]; +add.f32 %1, f634, f650; +add.f32 %0, f633, f649; +add.f32 %3, f638, f654; +add.f32 %2, f637, f653; +add.f32 %5, f642, f658; +add.f32 %4, f641, f657; +add.f32 %7, f646, f662; +add.f32 %6, f645, f661; +sub.f32 %9, f634, f650; +sub.f32 %8, f633, f649; +sub.f32 %11, f638, f654; +sub.f32 %10, f637, f653; +sub.f32 %13, f642, f658; +sub.f32 %12, f641, f657; +sub.f32 %15, f646, f662; +sub.f32 %14, f645, f661; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "r"(smem), "l"(lut_sp_8_8192), "l"(lut_sp_8_1024), "l"(lut_sp_8_128), "l"(lut_sp_8_16), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..0559e0c40fa20 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_fwd.hpp.inc @@ -0,0 +1,3703 @@ +#ifndef CUFFTDX_FFT_8192_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_8192_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1162, double, 1>(cufftdx::detail::complex *rmem, 
unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<59>; +.reg .f64 fd<1513>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %36, %52; +sub.f64 fd67, %36, %52; +add.f64 fd1498, %37, %68; +sub.f64 fd68, %37, %68; +add.f64 fd69, %44, %60; +sub.f64 fd71, %44, %60; +add.f64 fd1496, %69, %61; +sub.f64 fd72, %69, %61; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1495, fd1498, fd1496; +sub.f64 fd76, fd1498, fd1496; +add.f64 fd77, fd67, fd72; +sub.f64 fd79, fd67, fd72; +sub.f64 fd1494, fd68, fd71; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %56; +sub.f64 fd83, %40, %56; +add.f64 fd1491, %71, %70; +sub.f64 fd84, %71, %70; +add.f64 fd85, %48, %64; +sub.f64 fd87, %48, %64; +add.f64 fd1489, %49, %72; +sub.f64 fd88, %49, %72; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1488, fd1491, fd1489; +sub.f64 fd92, fd1491, fd1489; +add.f64 fd93, fd83, fd88; +sub.f64 fd95, fd83, fd88; +sub.f64 fd1487, fd84, fd87; +add.f64 fd96, fd84, fd87; +mul.f64 fd98, fd1487, 0dBFE6A09E667F3BCD; +mul.f64 fd1486, fd93, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd1486, fd98; +mul.f64 fd100, fd1487, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, fd73, fd89; +add.f64 fd1485, fd1495, fd1488; +sub.f64 fd109, fd1495, fd1488; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1484, fd1494, fd101; +sub.f64 fd113, fd1494, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd116, fd75, fd92; +sub.f64 fd1483, fd76, fd91; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +sub.f64 fd120, fd79, fd104; +add.f64 fd1482, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %54; +sub.f64 fd124, %38, %54; +add.f64 fd1480, %73, %55; +sub.f64 fd125, %73, %55; +add.f64 fd126, %46, %62; 
+sub.f64 fd128, %46, %62; +add.f64 fd1477, %74, %75; +sub.f64 fd129, %74, %75; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1476, fd1480, fd1477; +sub.f64 fd133, fd1480, fd1477; +add.f64 fd134, fd124, fd129; +sub.f64 fd136, fd124, fd129; +sub.f64 fd1475, fd125, fd128; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %58; +sub.f64 fd140, %42, %58; +add.f64 fd1473, %43, %76; +sub.f64 fd141, %43, %76; +add.f64 fd142, %50, %66; +sub.f64 fd144, %50, %66; +add.f64 fd1471, %77, %67; +sub.f64 fd145, %77, %67; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1470, fd1473, fd1471; +sub.f64 fd149, fd1473, fd1471; +add.f64 fd150, fd140, fd145; +sub.f64 fd152, fd140, fd145; +sub.f64 fd1469, fd141, fd144; +add.f64 fd153, fd141, fd144; +mul.f64 fd155, fd1469, 0dBFE6A09E667F3BCD; +mul.f64 fd1468, fd150, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd1468, fd155; +mul.f64 fd157, fd1469, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1467, fd1476, fd1470; +sub.f64 fd166, fd1476, fd1470; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1466, fd1475, fd158; +sub.f64 fd170, fd1475, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd173, fd132, fd149; +sub.f64 fd1465, fd133, fd148; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +sub.f64 fd177, fd136, fd161; +add.f64 fd1464, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1462, fd167, 0d3FED906BCF328D46; +mul.f64 fd1463, fd1466, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd1462, fd1463; +mul.f64 fd182, fd1466, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd1460, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd1461, fd1465, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd1460, fd1461; +mul.f64 fd187, fd1465, 
0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd1458, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd1459, fd1464, 0dBFED906BCF328D46; +sub.f64 fd191, fd1458, fd1459; +mul.f64 fd192, fd1464, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd1456, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd1457, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd1456, fd1457; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd1454, fd177, 0dBFED906BCF328D46; +mul.f64 fd1455, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd1454, fd1455; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1453, fd1484, fd183; +sub.f64 fd213, fd1484, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1452, fd1483, fd188; +sub.f64 fd217, fd1483, fd188; +add.f64 fd218, fd118, fd191; +sub.f64 fd220, fd118, fd191; +add.f64 fd1451, fd1482, fd193; +sub.f64 fd221, fd1482, fd193; +add.f64 fd222, fd108, fd166; +sub.f64 fd224, fd108, fd166; +sub.f64 fd1450, fd109, fd165; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd196; +sub.f64 fd228, fd112, fd196; +add.f64 fd1449, fd113, fd198; +sub.f64 fd229, fd113, fd198; +add.f64 fd230, fd116, fd201; +sub.f64 fd232, fd116, fd201; +add.f64 fd1448, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1447, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r28, %tid.x; +shl.b32 r7, r28, 8; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r28, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd243, fd239, fd1453; 
+mul.f64 fd244, fd238, fd1453; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1446, fd238, fd238; +sub.f64 fd247, fd1446, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd251, fd249, fd1452; +mul.f64 fd252, fd247, fd1452; +mul.f64 fd1444, fd238, fd247; +mul.f64 fd1445, fd239, fd249; +sub.f64 fd255, fd1444, fd1445; +mul.f64 fd1443, fd247, fd214; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd259, fd257, fd1451; +mul.f64 fd260, fd255, fd1451; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1442, fd238, fd255; +sub.f64 fd263, fd1442, fd262; +mul.f64 fd1441, fd255, fd218; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd267, fd265, fd1450; +mul.f64 fd268, fd263, fd1450; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1440, fd238, fd263; +sub.f64 fd271, fd1440, fd270; +mul.f64 fd1439, fd263, fd222; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd275, fd273, fd1449; +mul.f64 fd276, fd271, fd1449; +mul.f64 fd1437, fd238, fd271; +mul.f64 fd1438, fd239, fd273; +sub.f64 fd279, fd1437, fd1438; +mul.f64 fd1436, fd271, fd226; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd283, fd281, fd1448; +mul.f64 fd284, fd279, fd1448; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1435, fd238, fd279; +sub.f64 fd287, fd1435, fd286; +mul.f64 fd1434, fd279, fd230; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 fd289, fd239, fd279, fd288; +mul.f64 fd291, fd289, fd1447; +mul.f64 fd292, fd287, fd1447; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1433, fd238, fd287; +sub.f64 fd295, fd1433, fd294; +mul.f64 fd1432, fd287, fd234; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1431, fd1485, fd1467; +sub.f64 fd1430, fd106, fd163; +mul.f64 fd298, fd295, fd1430; +mul.f64 fd299, fd297, fd1431; +mul.f64 fd300, fd295, fd1431; +ld.global.v2.f64 {fd301, fd302}, [rd5+8192]; +mul.f64 fd306, fd302, fd213; +mul.f64 fd307, fd301, fd213; +mul.f64 
fd1428, fd238, fd301; +mul.f64 fd1429, fd239, fd302; +sub.f64 fd310, fd1428, fd1429; +mul.f64 fd1427, fd301, fd212; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd314, fd312, fd217; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1426, fd238, fd310; +sub.f64 fd318, fd1426, fd317; +mul.f64 fd1425, fd310, fd216; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd322, fd320, fd221; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1423, fd238, fd318; +mul.f64 fd1424, fd239, fd320; +sub.f64 fd326, fd1423, fd1424; +mul.f64 fd1422, fd318, fd220; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd330, fd328, fd225; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1420, fd238, fd326; +mul.f64 fd1421, fd239, fd328; +sub.f64 fd334, fd1420, fd1421; +mul.f64 fd1419, fd326, fd224; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd338, fd336, fd229; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1418, fd238, fd334; +sub.f64 fd342, fd1418, fd341; +mul.f64 fd1417, fd334, fd228; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd346, fd344, fd233; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1415, fd238, fd342; +mul.f64 fd1416, fd239, fd344; +sub.f64 fd350, fd1415, fd1416; +mul.f64 fd1414, fd238, fd210; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1413, fd342, fd232; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd350, fd236; +mul.f64 fd354, fd352, fd237; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 130816; +add.s32 r12, r9, r11; +sub.f64 fd1512, fd1485, fd1467; +mul.f64 fd1511, fd297, fd1512; +add.f64 fd356, fd1485, fd1467; +and.b32 r34, r7, 130816; +add.s32 r33, r9, r34; +sub.f64 fd1510, fd106, fd163; +add.f64 fd357, fd106, fd163; +and.b32 r52, r7, 130816; +add.s32 r51, r9, r52; +st.shared.v2.f64 [r51], {fd357, fd356}; +mov.u32 r50, %tid.x; +fma.rn.f64 fd358, fd239, fd210, 
fd244; +sub.f64 fd359, fd1414, fd243; +st.shared.v2.f64 [r51+16], {fd359, fd358}; +fma.rn.f64 fd360, fd249, fd214, fd252; +sub.f64 fd361, fd1443, fd251; +st.shared.v2.f64 [r51+32], {fd361, fd360}; +fma.rn.f64 fd362, fd257, fd218, fd260; +sub.f64 fd363, fd1441, fd259; +st.shared.v2.f64 [r51+48], {fd363, fd362}; +sub.f64 fd364, fd1439, fd267; +fma.rn.f64 fd365, fd265, fd222, fd268; +st.shared.v2.f64 [r51+64], {fd364, fd365}; +fma.rn.f64 fd366, fd273, fd226, fd276; +sub.f64 fd367, fd1436, fd275; +st.shared.v2.f64 [r51+80], {fd367, fd366}; +fma.rn.f64 fd368, fd281, fd230, fd284; +sub.f64 fd369, fd1434, fd283; +st.shared.v2.f64 [r51+96], {fd369, fd368}; +fma.rn.f64 fd370, fd289, fd234, fd292; +sub.f64 fd371, fd1432, fd291; +st.shared.v2.f64 [r51+112], {fd371, fd370}; +fma.rn.f64 fd372, fd297, fd1510, fd300; +sub.f64 fd373, fd298, fd1511; +st.shared.v2.f64 [r51+128], {fd373, fd372}; +fma.rn.f64 fd374, fd302, fd212, fd307; +sub.f64 fd375, fd1427, fd306; +st.shared.v2.f64 [r51+144], {fd375, fd374}; +fma.rn.f64 fd376, fd312, fd216, fd315; +sub.f64 fd377, fd1425, fd314; +st.shared.v2.f64 [r51+160], {fd377, fd376}; +fma.rn.f64 fd378, fd320, fd220, fd323; +sub.f64 fd379, fd1422, fd322; +st.shared.v2.f64 [r51+176], {fd379, fd378}; +sub.f64 fd380, fd1419, fd330; +fma.rn.f64 fd381, fd328, fd224, fd331; +st.shared.v2.f64 [r51+192], {fd380, fd381}; +fma.rn.f64 fd382, fd336, fd228, fd339; +sub.f64 fd383, fd1417, fd338; +st.shared.v2.f64 [r51+208], {fd383, fd382}; +fma.rn.f64 fd384, fd344, fd232, fd347; +sub.f64 fd385, fd1413, fd346; +st.shared.v2.f64 [r51+224], {fd385, fd384}; +fma.rn.f64 fd386, fd352, fd236, fd355; +sub.f64 fd387, fd353, fd354; +st.shared.v2.f64 [r51+240], {fd387, fd386}; +barrier.sync 0; +and.b32 r27, r50, 511; +mad.lo.s32 r13, r27, -240, r51; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+8192]; +ld.shared.v2.f64 {fd396, fd397}, [r13+16384]; +ld.shared.v2.f64 {fd400, fd401}, [r13+24576]; +ld.shared.v2.f64 {fd404, fd405}, 
[r13+32768]; +ld.shared.v2.f64 {fd408, fd409}, [r13+40960]; +ld.shared.v2.f64 {fd412, fd413}, [r13+49152]; +ld.shared.v2.f64 {fd416, fd417}, [r13+57344]; +ld.shared.v2.f64 {fd420, fd421}, [r13+65536]; +ld.shared.v2.f64 {fd424, fd425}, [r13+73728]; +ld.shared.v2.f64 {fd428, fd429}, [r13+81920]; +ld.shared.v2.f64 {fd432, fd433}, [r13+90112]; +ld.shared.v2.f64 {fd436, fd437}, [r13+98304]; +ld.shared.v2.f64 {fd440, fd441}, [r13+106496]; +ld.shared.v2.f64 {fd444, fd445}, [r13+114688]; +ld.shared.v2.f64 {fd448, fd449}, [r13+122880]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1412, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1411, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd1410, fd1412, fd1411; +sub.f64 fd463, fd1412, fd1411; +add.f64 fd464, fd454, fd459; +sub.f64 fd466, fd454, fd459; +sub.f64 fd1409, fd455, fd458; +add.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1408, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1407, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1406, fd1408, fd1407; +sub.f64 fd479, fd1408, fd1407; +add.f64 fd480, fd470, fd475; +sub.f64 fd482, fd470, fd475; +sub.f64 fd1405, fd471, fd474; +add.f64 fd483, fd471, fd474; +mul.f64 fd485, fd1405, 0dBFE6A09E667F3BCD; +mul.f64 fd1404, fd480, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd1404, fd485; +mul.f64 fd487, fd1405, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd488, fd480, 0dBFE6A09E667F3BCD, fd487; +mul.f64 fd489, fd482, 0dBFE6A09E667F3BCD; +mul.f64 fd490, fd483, 0dBFE6A09E667F3BCD; +sub.f64 fd491, fd489, fd490; +add.f64 fd492, fd489, fd490; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1403, fd1410, fd1406; +sub.f64 fd496, fd1410, fd1406; 
+add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1402, fd1409, fd488; +sub.f64 fd500, fd1409, fd488; +add.f64 fd501, fd462, fd479; +sub.f64 fd503, fd462, fd479; +sub.f64 fd1401, fd463, fd478; +add.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd491; +sub.f64 fd507, fd466, fd491; +add.f64 fd1400, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1399, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1398, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1397, fd1399, fd1398; +sub.f64 fd520, fd1399, fd1398; +add.f64 fd521, fd511, fd516; +sub.f64 fd523, fd511, fd516; +sub.f64 fd1396, fd512, fd515; +add.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1395, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1394, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1393, fd1395, fd1394; +sub.f64 fd536, fd1395, fd1394; +add.f64 fd537, fd527, fd532; +sub.f64 fd539, fd527, fd532; +sub.f64 fd1392, fd528, fd531; +add.f64 fd540, fd528, fd531; +mul.f64 fd542, fd1392, 0dBFE6A09E667F3BCD; +mul.f64 fd1391, fd537, 0d3FE6A09E667F3BCD; +sub.f64 fd543, fd1391, fd542; +mul.f64 fd544, fd1392, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd545, fd537, 0dBFE6A09E667F3BCD, fd544; +mul.f64 fd546, fd539, 0dBFE6A09E667F3BCD; +mul.f64 fd547, fd540, 0dBFE6A09E667F3BCD; +sub.f64 fd548, fd546, fd547; +add.f64 fd549, fd546, fd547; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1390, fd1397, fd1393; +sub.f64 fd553, fd1397, fd1393; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1389, fd1396, fd545; +sub.f64 fd557, fd1396, fd545; +add.f64 fd558, fd519, fd536; +sub.f64 fd560, 
fd519, fd536; +sub.f64 fd1388, fd520, fd535; +add.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd548; +sub.f64 fd564, fd523, fd548; +add.f64 fd1387, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1385, fd554, 0d3FED906BCF328D46; +mul.f64 fd1386, fd1389, 0dBFD87DE2A6AEA963; +sub.f64 fd568, fd1385, fd1386; +mul.f64 fd569, fd1389, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0dBFD87DE2A6AEA963, fd569; +mul.f64 fd572, fd1388, 0dBFE6A09E667F3BCD; +mul.f64 fd1384, fd558, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd1384, fd572; +mul.f64 fd574, fd1388, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd575, fd558, 0dBFE6A09E667F3BCD, fd574; +mul.f64 fd577, fd1387, 0dBFED906BCF328D46; +mul.f64 fd1383, fd562, 0d3FD87DE2A6AEA963; +sub.f64 fd578, fd1383, fd577; +mul.f64 fd579, fd1387, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd580, fd562, 0dBFED906BCF328D46, fd579; +mul.f64 fd582, fd557, 0dBFED906BCF328D46; +mul.f64 fd1382, fd556, 0dBFD87DE2A6AEA963; +sub.f64 fd583, fd1382, fd582; +mul.f64 fd584, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd585, fd556, 0dBFED906BCF328D46, fd584; +mul.f64 fd586, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd587, fd561, 0dBFE6A09E667F3BCD; +sub.f64 fd588, fd586, fd587; +add.f64 fd589, fd586, fd587; +mul.f64 fd591, fd565, 0dBFD87DE2A6AEA963; +mul.f64 fd1381, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1381, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0dBFD87DE2A6AEA963, fd593; +add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1380, fd1402, fd570; +sub.f64 fd600, fd1402, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1379, fd1401, fd575; +sub.f64 fd604, fd1401, fd575; +add.f64 fd605, fd505, fd578; +sub.f64 fd607, fd505, fd578; +add.f64 fd1378, fd1400, fd580; +sub.f64 fd608, fd1400, fd580; +add.f64 fd609, fd495, fd553; +sub.f64 fd611, fd495, fd553; +sub.f64 fd1377, fd496, fd552; +add.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd583; +sub.f64 fd615, fd499, fd583; +add.f64 fd1376, fd500, 
fd585; +sub.f64 fd616, fd500, fd585; +add.f64 fd617, fd503, fd588; +sub.f64 fd619, fd503, fd588; +add.f64 fd1375, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1374, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r50, 496; +mov.u64 rd7, %34; +cvt.u64.u32 rd12, r14; +add.s64 rd8, rd7, rd12; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd630, fd626, fd1380; +mul.f64 fd631, fd625, fd1380; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1373, fd625, fd625; +sub.f64 fd634, fd1373, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd638, fd636, fd1379; +mul.f64 fd639, fd634, fd1379; +mul.f64 fd1371, fd625, fd634; +mul.f64 fd1372, fd626, fd636; +sub.f64 fd642, fd1371, fd1372; +mul.f64 fd1370, fd634, fd601; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd646, fd644, fd1378; +mul.f64 fd647, fd642, fd1378; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1369, fd625, fd642; +sub.f64 fd650, fd1369, fd649; +mul.f64 fd1368, fd642, fd605; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd654, fd652, fd1377; +mul.f64 fd655, fd650, fd1377; +mul.f64 fd1366, fd625, fd650; +mul.f64 fd1367, fd626, fd652; +sub.f64 fd658, fd1366, fd1367; +mul.f64 fd1365, fd650, fd609; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd662, fd660, fd1376; +mul.f64 fd663, fd658, fd1376; +mul.f64 fd1363, fd625, fd658; +mul.f64 fd1364, fd626, fd660; +sub.f64 fd666, fd1363, fd1364; +mul.f64 fd1362, fd658, fd613; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd670, fd668, fd1375; +mul.f64 fd671, fd666, fd1375; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1361, fd625, fd666; +sub.f64 fd674, fd1361, fd673; +mul.f64 fd1360, fd666, fd617; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd678, fd676, fd1374; +mul.f64 fd679, fd674, fd1374; +mul.f64 fd1358, 
fd625, fd674; +mul.f64 fd1359, fd626, fd676; +sub.f64 fd682, fd1358, fd1359; +mul.f64 fd1357, fd674, fd621; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd1356, fd1403, fd1390; +sub.f64 fd1355, fd493, fd550; +mul.f64 fd685, fd682, fd1355; +mul.f64 fd686, fd684, fd1356; +mul.f64 fd687, fd682, fd1356; +ld.global.v2.f64 {fd688, fd689}, [rd8+512]; +mul.f64 fd693, fd689, fd600; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd1354, fd625, fd688; +sub.f64 fd697, fd1354, fd696; +mul.f64 fd1353, fd688, fd599; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd701, fd699, fd604; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd1352, fd625, fd697; +sub.f64 fd705, fd1352, fd704; +mul.f64 fd1351, fd697, fd603; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd709, fd707, fd608; +mul.f64 fd710, fd705, fd608; +mul.f64 fd1349, fd625, fd705; +mul.f64 fd1350, fd626, fd707; +sub.f64 fd713, fd1349, fd1350; +mul.f64 fd1348, fd705, fd607; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd717, fd715, fd612; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd1347, fd625, fd713; +sub.f64 fd721, fd1347, fd720; +mul.f64 fd1346, fd713, fd611; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd725, fd723, fd616; +mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd1345, fd625, fd721; +sub.f64 fd729, fd1345, fd728; +mul.f64 fd1344, fd721, fd615; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd733, fd731, fd620; +mul.f64 fd734, fd729, fd620; +mul.f64 fd1342, fd625, fd729; +mul.f64 fd1343, fd626, fd731; +sub.f64 fd737, fd1342, fd1343; +mul.f64 fd1341, fd625, fd597; +mul.f64 fd738, fd625, fd731; +mul.f64 fd1340, fd729, fd619; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd737, fd623; +mul.f64 fd741, fd739, fd624; 
+mul.f64 fd742, fd737, fd624; +mov.u32 r43, %tid.x; +shl.b32 r42, r43, 4; +and.b32 r15, r42, 240; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r7, 126976; +add.s32 r18, r16, r17; +sub.f64 fd1503, fd1403, fd1390; +mul.f64 fd1502, fd684, fd1503; +add.f64 fd743, fd1403, fd1390; +sub.f64 fd1501, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r38, %tid.x; +shl.b32 r37, r38, 4; +mov.u32 r49, %tid.x; +and.b32 r48, r49, 496; +mov.u32 r41, %tid.x; +shl.b32 r40, r41, 8; +mov.u32 r36, %tid.x; +fma.rn.f64 fd745, fd626, fd597, fd631; +sub.f64 fd746, fd1341, fd630; +st.shared.v2.f64 [r18+256], {fd746, fd745}; +fma.rn.f64 fd747, fd636, fd601, fd639; +sub.f64 fd748, fd1370, fd638; +st.shared.v2.f64 [r18+512], {fd748, fd747}; +fma.rn.f64 fd749, fd644, fd605, fd647; +sub.f64 fd750, fd1368, fd646; +st.shared.v2.f64 [r18+768], {fd750, fd749}; +fma.rn.f64 fd751, fd652, fd609, fd655; +sub.f64 fd752, fd1365, fd654; +st.shared.v2.f64 [r18+1024], {fd752, fd751}; +sub.f64 fd753, fd1362, fd662; +fma.rn.f64 fd754, fd660, fd613, fd663; +st.shared.v2.f64 [r18+1280], {fd753, fd754}; +fma.rn.f64 fd755, fd668, fd617, fd671; +sub.f64 fd756, fd1360, fd670; +st.shared.v2.f64 [r18+1536], {fd756, fd755}; +fma.rn.f64 fd757, fd676, fd621, fd679; +sub.f64 fd758, fd1357, fd678; +st.shared.v2.f64 [r18+1792], {fd758, fd757}; +fma.rn.f64 fd759, fd684, fd1501, fd687; +sub.f64 fd760, fd685, fd1502; +st.shared.v2.f64 [r18+2048], {fd760, fd759}; +fma.rn.f64 fd761, fd689, fd599, fd694; +sub.f64 fd762, fd1353, fd693; +st.shared.v2.f64 [r18+2304], {fd762, fd761}; +fma.rn.f64 fd763, fd699, fd603, fd702; +sub.f64 fd764, fd1351, fd701; +st.shared.v2.f64 [r18+2560], {fd764, fd763}; +fma.rn.f64 fd765, fd707, fd607, fd710; +sub.f64 fd766, fd1348, fd709; +st.shared.v2.f64 [r18+2816], {fd766, fd765}; +fma.rn.f64 fd767, fd715, fd611, fd718; +sub.f64 fd768, fd1346, fd717; +st.shared.v2.f64 [r18+3072], {fd768, fd767}; +sub.f64 fd769, fd1344, fd725; +fma.rn.f64 fd770, 
fd723, fd615, fd726; +st.shared.v2.f64 [r18+3328], {fd769, fd770}; +fma.rn.f64 fd771, fd731, fd619, fd734; +sub.f64 fd772, fd1340, fd733; +st.shared.v2.f64 [r18+3584], {fd772, fd771}; +fma.rn.f64 fd773, fd739, fd623, fd742; +sub.f64 fd774, fd740, fd741; +st.shared.v2.f64 [r18+3840], {fd774, fd773}; +barrier.sync 0; +mad.lo.s32 r19, r48, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+8192]; +ld.shared.v2.f64 {fd783, fd784}, [r19+16384]; +ld.shared.v2.f64 {fd787, fd788}, [r19+24576]; +ld.shared.v2.f64 {fd791, fd792}, [r19+32768]; +ld.shared.v2.f64 {fd795, fd796}, [r19+40960]; +ld.shared.v2.f64 {fd799, fd800}, [r19+49152]; +ld.shared.v2.f64 {fd803, fd804}, [r19+57344]; +ld.shared.v2.f64 {fd807, fd808}, [r19+65536]; +ld.shared.v2.f64 {fd811, fd812}, [r19+73728]; +ld.shared.v2.f64 {fd815, fd816}, [r19+81920]; +ld.shared.v2.f64 {fd819, fd820}, [r19+90112]; +ld.shared.v2.f64 {fd823, fd824}, [r19+98304]; +ld.shared.v2.f64 {fd827, fd828}, [r19+106496]; +ld.shared.v2.f64 {fd831, fd832}, [r19+114688]; +ld.shared.v2.f64 {fd835, fd836}, [r19+122880]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd1339, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd1338, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd1337, fd1339, fd1338; +sub.f64 fd850, fd1339, fd1338; +add.f64 fd851, fd841, fd846; +sub.f64 fd853, fd841, fd846; +sub.f64 fd1336, fd842, fd845; +add.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd1335, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd1334, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd1333, fd1335, fd1334; +sub.f64 fd866, fd1335, fd1334; +add.f64 fd867, fd857, fd862; +sub.f64 fd869, 
fd857, fd862; +sub.f64 fd1332, fd858, fd861; +add.f64 fd870, fd858, fd861; +mul.f64 fd872, fd1332, 0dBFE6A09E667F3BCD; +mul.f64 fd1331, fd867, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd1331, fd872; +mul.f64 fd874, fd1332, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd875, fd867, 0dBFE6A09E667F3BCD, fd874; +mul.f64 fd876, fd869, 0dBFE6A09E667F3BCD; +mul.f64 fd877, fd870, 0dBFE6A09E667F3BCD; +sub.f64 fd878, fd876, fd877; +add.f64 fd879, fd876, fd877; +add.f64 fd880, fd847, fd863; +sub.f64 fd882, fd847, fd863; +add.f64 fd1330, fd1337, fd1333; +sub.f64 fd883, fd1337, fd1333; +add.f64 fd884, fd851, fd873; +sub.f64 fd886, fd851, fd873; +add.f64 fd1329, fd1336, fd875; +sub.f64 fd887, fd1336, fd875; +add.f64 fd888, fd849, fd866; +sub.f64 fd890, fd849, fd866; +sub.f64 fd1328, fd850, fd865; +add.f64 fd891, fd850, fd865; +add.f64 fd892, fd853, fd878; +sub.f64 fd894, fd853, fd878; +add.f64 fd1327, fd854, fd879; +sub.f64 fd895, fd854, fd879; +add.f64 fd896, fd779, fd811; +sub.f64 fd898, fd779, fd811; +add.f64 fd1326, fd780, fd812; +sub.f64 fd899, fd780, fd812; +add.f64 fd900, fd795, fd827; +sub.f64 fd902, fd795, fd827; +add.f64 fd1325, fd796, fd828; +sub.f64 fd903, fd796, fd828; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd1324, fd1326, fd1325; +sub.f64 fd907, fd1326, fd1325; +add.f64 fd908, fd898, fd903; +sub.f64 fd910, fd898, fd903; +sub.f64 fd1323, fd899, fd902; +add.f64 fd911, fd899, fd902; +add.f64 fd912, fd787, fd819; +sub.f64 fd914, fd787, fd819; +add.f64 fd1322, fd788, fd820; +sub.f64 fd915, fd788, fd820; +add.f64 fd916, fd803, fd835; +sub.f64 fd918, fd803, fd835; +add.f64 fd1321, fd804, fd836; +sub.f64 fd919, fd804, fd836; +add.f64 fd920, fd912, fd916; +sub.f64 fd922, fd912, fd916; +add.f64 fd1320, fd1322, fd1321; +sub.f64 fd923, fd1322, fd1321; +add.f64 fd924, fd914, fd919; +sub.f64 fd926, fd914, fd919; +sub.f64 fd1319, fd915, fd918; +add.f64 fd927, fd915, fd918; +mul.f64 fd929, fd1319, 0dBFE6A09E667F3BCD; +mul.f64 fd1318, fd924, 0d3FE6A09E667F3BCD; +sub.f64 
fd930, fd1318, fd929; +mul.f64 fd931, fd1319, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd932, fd924, 0dBFE6A09E667F3BCD, fd931; +mul.f64 fd933, fd926, 0dBFE6A09E667F3BCD; +mul.f64 fd934, fd927, 0dBFE6A09E667F3BCD; +sub.f64 fd935, fd933, fd934; +add.f64 fd936, fd933, fd934; +add.f64 fd937, fd904, fd920; +sub.f64 fd939, fd904, fd920; +add.f64 fd1317, fd1324, fd1320; +sub.f64 fd940, fd1324, fd1320; +add.f64 fd941, fd908, fd930; +sub.f64 fd943, fd908, fd930; +add.f64 fd1316, fd1323, fd932; +sub.f64 fd944, fd1323, fd932; +add.f64 fd945, fd906, fd923; +sub.f64 fd947, fd906, fd923; +sub.f64 fd1315, fd907, fd922; +add.f64 fd948, fd907, fd922; +add.f64 fd949, fd910, fd935; +sub.f64 fd951, fd910, fd935; +add.f64 fd1314, fd911, fd936; +sub.f64 fd952, fd911, fd936; +mul.f64 fd954, fd1316, 0dBFD87DE2A6AEA963; +mul.f64 fd1313, fd941, 0d3FED906BCF328D46; +sub.f64 fd955, fd1313, fd954; +mul.f64 fd956, fd1316, 0d3FED906BCF328D46; +fma.rn.f64 fd957, fd941, 0dBFD87DE2A6AEA963, fd956; +mul.f64 fd959, fd1315, 0dBFE6A09E667F3BCD; +mul.f64 fd1312, fd945, 0d3FE6A09E667F3BCD; +sub.f64 fd960, fd1312, fd959; +mul.f64 fd961, fd1315, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd962, fd945, 0dBFE6A09E667F3BCD, fd961; +mul.f64 fd964, fd1314, 0dBFED906BCF328D46; +mul.f64 fd1311, fd949, 0d3FD87DE2A6AEA963; +sub.f64 fd965, fd1311, fd964; +mul.f64 fd966, fd1314, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd967, fd949, 0dBFED906BCF328D46, fd966; +mul.f64 fd969, fd944, 0dBFED906BCF328D46; +mul.f64 fd1310, fd943, 0dBFD87DE2A6AEA963; +sub.f64 fd970, fd1310, fd969; +mul.f64 fd971, fd944, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd972, fd943, 0dBFED906BCF328D46, fd971; +mul.f64 fd973, fd947, 0dBFE6A09E667F3BCD; +mul.f64 fd974, fd948, 0dBFE6A09E667F3BCD; +sub.f64 fd975, fd973, fd974; +add.f64 fd976, fd973, fd974; +mul.f64 fd978, fd952, 0dBFD87DE2A6AEA963; +mul.f64 fd1309, fd951, 0dBFED906BCF328D46; +sub.f64 fd979, fd1309, fd978; +mul.f64 fd980, fd952, 0dBFED906BCF328D46; +fma.rn.f64 fd981, fd951, 0dBFD87DE2A6AEA963, fd980; +add.f64 fd984, fd884, 
fd955; +sub.f64 fd986, fd884, fd955; +add.f64 fd1308, fd1329, fd957; +sub.f64 fd987, fd1329, fd957; +add.f64 fd988, fd888, fd960; +sub.f64 fd990, fd888, fd960; +add.f64 fd1307, fd1328, fd962; +sub.f64 fd991, fd1328, fd962; +add.f64 fd992, fd892, fd965; +sub.f64 fd994, fd892, fd965; +add.f64 fd1306, fd1327, fd967; +sub.f64 fd995, fd1327, fd967; +add.f64 fd996, fd882, fd940; +sub.f64 fd998, fd882, fd940; +sub.f64 fd1305, fd883, fd939; +add.f64 fd999, fd883, fd939; +add.f64 fd1000, fd886, fd970; +sub.f64 fd1002, fd886, fd970; +add.f64 fd1304, fd887, fd972; +sub.f64 fd1003, fd887, fd972; +add.f64 fd1004, fd890, fd975; +sub.f64 fd1006, fd890, fd975; +add.f64 fd1303, fd891, fd976; +sub.f64 fd1007, fd891, fd976; +add.f64 fd1008, fd894, fd979; +sub.f64 fd1010, fd894, fd979; +add.f64 fd1302, fd895, fd981; +sub.f64 fd1011, fd895, fd981; +and.b32 r20, r36, 256; +bfe.u32 r21, r36, 8, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd1012, fd1013}, [rd11]; +mul.f64 fd1017, fd1013, fd1308; +mul.f64 fd1018, fd1012, fd1308; +mul.f64 fd1300, fd1012, fd1012; +mul.f64 fd1301, fd1013, fd1013; +sub.f64 fd1021, fd1300, fd1301; +mul.f64 fd1022, fd1013, fd1012; +fma.rn.f64 fd1023, fd1013, fd1012, fd1022; +mul.f64 fd1025, fd1023, fd1307; +mul.f64 fd1026, fd1021, fd1307; +mul.f64 fd1028, fd1013, fd1023; +mul.f64 fd1299, fd1012, fd1021; +sub.f64 fd1029, fd1299, fd1028; +mul.f64 fd1298, fd1021, fd988; +mul.f64 fd1030, fd1012, fd1023; +fma.rn.f64 fd1031, fd1013, fd1021, fd1030; +mul.f64 fd1033, fd1031, fd1306; +mul.f64 fd1034, fd1029, fd1306; +mul.f64 fd1296, fd1012, fd1029; +mul.f64 fd1297, fd1013, fd1031; +sub.f64 fd1037, fd1296, fd1297; +mul.f64 fd1295, fd1029, fd992; +mul.f64 fd1038, fd1012, fd1031; +fma.rn.f64 fd1039, fd1013, fd1029, fd1038; +mul.f64 fd1041, fd1039, fd1305; +mul.f64 fd1042, fd1037, fd1305; +mul.f64 fd1044, fd1013, fd1039; +mul.f64 fd1294, fd1012, fd1037; +sub.f64 fd1045, fd1294, fd1044; +mul.f64 fd1293, fd1037, fd996; 
+mul.f64 fd1046, fd1012, fd1039; +fma.rn.f64 fd1047, fd1013, fd1037, fd1046; +mul.f64 fd1049, fd1047, fd1304; +mul.f64 fd1050, fd1045, fd1304; +mul.f64 fd1052, fd1013, fd1047; +mul.f64 fd1292, fd1012, fd1045; +sub.f64 fd1053, fd1292, fd1052; +mul.f64 fd1291, fd1045, fd1000; +mul.f64 fd1054, fd1012, fd1047; +fma.rn.f64 fd1055, fd1013, fd1045, fd1054; +mul.f64 fd1057, fd1055, fd1303; +mul.f64 fd1058, fd1053, fd1303; +mul.f64 fd1289, fd1012, fd1053; +mul.f64 fd1290, fd1013, fd1055; +sub.f64 fd1061, fd1289, fd1290; +mul.f64 fd1288, fd1053, fd1004; +mul.f64 fd1062, fd1012, fd1055; +fma.rn.f64 fd1063, fd1013, fd1053, fd1062; +mul.f64 fd1065, fd1063, fd1302; +mul.f64 fd1066, fd1061, fd1302; +mul.f64 fd1068, fd1013, fd1063; +mul.f64 fd1287, fd1012, fd1061; +sub.f64 fd1069, fd1287, fd1068; +mul.f64 fd1286, fd1061, fd1008; +mul.f64 fd1070, fd1012, fd1063; +fma.rn.f64 fd1071, fd1013, fd1061, fd1070; +sub.f64 fd1285, fd1330, fd1317; +sub.f64 fd1284, fd880, fd937; +mul.f64 fd1072, fd1069, fd1284; +mul.f64 fd1073, fd1071, fd1285; +mul.f64 fd1074, fd1069, fd1285; +ld.global.v2.f64 {fd1075, fd1076}, [rd11+32]; +mul.f64 fd1080, fd1076, fd987; +mul.f64 fd1081, fd1075, fd987; +mul.f64 fd1083, fd1013, fd1076; +mul.f64 fd1283, fd1012, fd1075; +sub.f64 fd1084, fd1283, fd1083; +mul.f64 fd1282, fd1075, fd986; +mul.f64 fd1085, fd1012, fd1076; +fma.rn.f64 fd1086, fd1013, fd1075, fd1085; +mul.f64 fd1088, fd1086, fd991; +mul.f64 fd1089, fd1084, fd991; +mul.f64 fd1280, fd1012, fd1084; +mul.f64 fd1281, fd1013, fd1086; +sub.f64 fd1092, fd1280, fd1281; +mul.f64 fd1279, fd1084, fd990; +mul.f64 fd1093, fd1012, fd1086; +fma.rn.f64 fd1094, fd1013, fd1084, fd1093; +mul.f64 fd1096, fd1094, fd995; +mul.f64 fd1097, fd1092, fd995; +mul.f64 fd1099, fd1013, fd1094; +mul.f64 fd1278, fd1012, fd1092; +sub.f64 fd1100, fd1278, fd1099; +mul.f64 fd1277, fd1092, fd994; +mul.f64 fd1101, fd1012, fd1094; +fma.rn.f64 fd1102, fd1013, fd1092, fd1101; +mul.f64 fd1104, fd1102, fd999; +mul.f64 fd1105, fd1100, fd999; 
+mul.f64 fd1275, fd1012, fd1100; +mul.f64 fd1276, fd1013, fd1102; +sub.f64 fd1108, fd1275, fd1276; +mul.f64 fd1274, fd1100, fd998; +mul.f64 fd1109, fd1012, fd1102; +fma.rn.f64 fd1110, fd1013, fd1100, fd1109; +mul.f64 fd1112, fd1110, fd1003; +mul.f64 fd1113, fd1108, fd1003; +mul.f64 fd1115, fd1013, fd1110; +mul.f64 fd1273, fd1012, fd1108; +sub.f64 fd1116, fd1273, fd1115; +mul.f64 fd1272, fd1108, fd1002; +mul.f64 fd1117, fd1012, fd1110; +fma.rn.f64 fd1118, fd1013, fd1108, fd1117; +mul.f64 fd1120, fd1118, fd1007; +mul.f64 fd1121, fd1116, fd1007; +mul.f64 fd1123, fd1013, fd1118; +mul.f64 fd1271, fd1012, fd1116; +sub.f64 fd1124, fd1271, fd1123; +mul.f64 fd1270, fd1012, fd984; +mul.f64 fd1125, fd1012, fd1118; +mul.f64 fd1269, fd1116, fd1006; +fma.rn.f64 fd1126, fd1013, fd1116, fd1125; +mul.f64 fd1127, fd1124, fd1010; +mul.f64 fd1128, fd1126, fd1011; +mul.f64 fd1129, fd1124, fd1011; +mov.u32 r31, %tid.x; +and.b32 r30, r31, 256; +mov.u32 r47, %tid.x; +shl.b32 r46, r47, 8; +and.b32 r22, r37, 4080; +add.s32 r23, r9, r22; +sub.f64 fd1505, fd1330, fd1317; +mul.f64 fd1504, fd1071, fd1505; +barrier.sync 0; +and.b32 r24, r46, 65536; +add.s32 r25, r23, r24; +sub.f64 fd1508, fd1330, fd1317; +mul.f64 fd1507, fd1071, fd1508; +add.f64 fd1130, fd1330, fd1317; +sub.f64 fd1509, fd880, fd937; +add.f64 fd1131, fd880, fd937; +st.shared.v2.f64 [r25], {fd1131, fd1130}; +mov.u32 r45, %tid.x; +and.b32 r44, r45, 256; +fma.rn.f64 fd1132, fd1013, fd984, fd1018; +sub.f64 fd1133, fd1270, fd1017; +st.shared.v2.f64 [r25+4096], {fd1133, fd1132}; +fma.rn.f64 fd1134, fd1023, fd988, fd1026; +sub.f64 fd1135, fd1298, fd1025; +st.shared.v2.f64 [r25+8192], {fd1135, fd1134}; +fma.rn.f64 fd1136, fd1031, fd992, fd1034; +sub.f64 fd1137, fd1295, fd1033; +st.shared.v2.f64 [r25+12288], {fd1137, fd1136}; +fma.rn.f64 fd1138, fd1039, fd996, fd1042; +sub.f64 fd1139, fd1293, fd1041; +st.shared.v2.f64 [r25+16384], {fd1139, fd1138}; +sub.f64 fd1140, fd1291, fd1049; +fma.rn.f64 fd1141, fd1047, fd1000, fd1050; 
+st.shared.v2.f64 [r25+20480], {fd1140, fd1141}; +fma.rn.f64 fd1142, fd1055, fd1004, fd1058; +sub.f64 fd1143, fd1288, fd1057; +st.shared.v2.f64 [r25+24576], {fd1143, fd1142}; +fma.rn.f64 fd1144, fd1063, fd1008, fd1066; +sub.f64 fd1145, fd1286, fd1065; +st.shared.v2.f64 [r25+28672], {fd1145, fd1144}; +fma.rn.f64 fd1146, fd1071, fd1509, fd1074; +sub.f64 fd1147, fd1072, fd1507; +st.shared.v2.f64 [r25+32768], {fd1147, fd1146}; +fma.rn.f64 fd1148, fd1076, fd986, fd1081; +sub.f64 fd1149, fd1282, fd1080; +st.shared.v2.f64 [r25+36864], {fd1149, fd1148}; +fma.rn.f64 fd1150, fd1086, fd990, fd1089; +sub.f64 fd1151, fd1279, fd1088; +st.shared.v2.f64 [r25+40960], {fd1151, fd1150}; +fma.rn.f64 fd1152, fd1094, fd994, fd1097; +sub.f64 fd1153, fd1277, fd1096; +st.shared.v2.f64 [r25+45056], {fd1153, fd1152}; +fma.rn.f64 fd1154, fd1102, fd998, fd1105; +sub.f64 fd1155, fd1274, fd1104; +st.shared.v2.f64 [r25+49152], {fd1155, fd1154}; +sub.f64 fd1156, fd1272, fd1112; +fma.rn.f64 fd1157, fd1110, fd1002, fd1113; +st.shared.v2.f64 [r25+53248], {fd1156, fd1157}; +fma.rn.f64 fd1158, fd1118, fd1006, fd1121; +sub.f64 fd1159, fd1269, fd1120; +st.shared.v2.f64 [r25+57344], {fd1159, fd1158}; +fma.rn.f64 fd1160, fd1126, fd1010, fd1129; +sub.f64 fd1161, fd1127, fd1128; +st.shared.v2.f64 [r25+61440], {fd1161, fd1160}; +barrier.sync 0; +mad.lo.s32 r26, r44, -240, r25; +ld.shared.v2.f64 {fd1162, fd1163}, [r26]; +ld.shared.v2.f64 {fd1166, fd1167}, [r26+8192]; +ld.shared.v2.f64 {fd1170, fd1171}, [r26+16384]; +ld.shared.v2.f64 {fd1174, fd1175}, [r26+24576]; +ld.shared.v2.f64 {fd1178, fd1179}, [r26+32768]; +ld.shared.v2.f64 {fd1182, fd1183}, [r26+40960]; +ld.shared.v2.f64 {fd1186, fd1187}, [r26+49152]; +ld.shared.v2.f64 {fd1190, fd1191}, [r26+57344]; +ld.shared.v2.f64 {fd1194, fd1195}, [r26+65536]; +ld.shared.v2.f64 {fd1198, fd1199}, [r26+73728]; +ld.shared.v2.f64 {fd1202, fd1203}, [r26+81920]; +ld.shared.v2.f64 {fd1206, fd1207}, [r26+90112]; +ld.shared.v2.f64 {fd1210, fd1211}, [r26+98304]; 
+ld.shared.v2.f64 {fd1214, fd1215}, [r26+106496]; +ld.shared.v2.f64 {fd1218, fd1219}, [r26+114688]; +ld.shared.v2.f64 {fd1222, fd1223}, [r26+122880]; +add.f64 %0, fd1162, fd1194; +add.f64 %1, fd1163, fd1195; +add.f64 %2, fd1166, fd1198; +add.f64 %3, fd1167, fd1199; +add.f64 %4, fd1170, fd1202; +add.f64 %5, fd1171, fd1203; +add.f64 %7, fd1175, fd1207; +add.f64 %6, fd1174, fd1206; +add.f64 %9, fd1179, fd1211; +add.f64 %8, fd1178, fd1210; +add.f64 %11, fd1183, fd1215; +add.f64 %10, fd1182, fd1214; +add.f64 %12, fd1186, fd1218; +add.f64 %13, fd1187, fd1219; +add.f64 %14, fd1190, fd1222; +add.f64 %15, fd1191, fd1223; +sub.f64 %17, fd1163, fd1195; +sub.f64 %16, fd1162, fd1194; +sub.f64 %19, fd1167, fd1199; +sub.f64 %18, fd1166, fd1198; +sub.f64 %21, fd1171, fd1203; +sub.f64 %20, fd1170, fd1202; +sub.f64 %23, fd1175, fd1207; +sub.f64 %22, fd1174, fd1206; +sub.f64 %25, fd1179, fd1211; +sub.f64 %24, fd1178, fd1210; +sub.f64 %27, fd1183, fd1215; +sub.f64 %26, fd1182, fd1214; +sub.f64 %29, fd1187, fd1219; +sub.f64 %28, fd1186, fd1218; +sub.f64 %31, fd1191, fd1223; +sub.f64 %30, fd1190, fd1222; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_8192), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), 
"d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<499, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<36>; +.reg .f64 fd<613>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %21, %31; +add.f64 fd34, %22, %33; +sub.f64 fd35, %21, %31; +sub.f64 fd36, %22, %33; +add.f64 fd37, %26, %37; +add.f64 fd38, %28, %38; +sub.f64 fd39, %26, %37; +sub.f64 fd40, %28, %38; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %23, %34; +add.f64 fd50, %25, %36; +sub.f64 fd51, %23, %34; +sub.f64 fd52, %25, %36; +add.f64 fd53, %29, %39; +add.f64 fd54, %30, %40; +sub.f64 fd55, %29, %39; +sub.f64 fd56, %30, %40; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, 
fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd69; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd69; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +sub.f64 fd84, fd43, fd60; +add.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd72; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd72; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd90, fd78; +mul.f64 fd95, fd91, fd79; +sub.f64 fd96, fd94, fd95; +mul.f64 fd97, fd90, fd79; +fma.rn.f64 fd98, fd91, fd78, fd97; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd101, fd82; +mul.f64 fd105, fd103, fd83; +sub.f64 fd106, fd104, fd105; +mul.f64 fd107, fd101, fd83; +fma.rn.f64 fd108, fd103, fd82, fd107; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd111, fd86; +mul.f64 fd115, fd113, fd87; +sub.f64 fd116, fd114, fd115; +mul.f64 fd117, fd111, fd87; +fma.rn.f64 fd118, fd113, fd86, fd117; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd121, fd76; +mul.f64 fd125, fd123, fd77; +sub.f64 fd126, fd124, fd125; +mul.f64 fd127, fd121, fd77; +fma.rn.f64 fd128, fd123, fd76, fd127; +ld.global.v2.f64 {fd129, fd130}, [rd5+16384]; +mul.f64 fd133, fd129, fd80; +mul.f64 fd134, fd130, fd81; +sub.f64 fd135, fd133, fd134; +mul.f64 fd136, fd129, fd81; +fma.rn.f64 fd137, fd130, fd80, fd136; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd140, fd84; 
+mul.f64 fd144, fd142, fd85; +sub.f64 fd145, fd143, fd144; +mul.f64 fd146, fd140, fd85; +fma.rn.f64 fd147, fd142, fd84, fd146; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd150, fd88; +mul.f64 fd154, fd152, fd89; +sub.f64 fd155, fd153, fd154; +mul.f64 fd156, fd150, fd89; +fma.rn.f64 fd157, fd152, fd88, fd156; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65472; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd96}; +st.shared.v2.f64 [r12+16], {fd106, fd116}; +st.shared.v2.f64 [r12+32], {fd126, fd135}; +st.shared.v2.f64 [r12+48], {fd145, fd155}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+8192]; +ld.shared.f64 fd160, [r13+16384]; +ld.shared.f64 fd161, [r13+24576]; +ld.shared.f64 fd162, [r13+32768]; +ld.shared.f64 fd163, [r13+40960]; +ld.shared.f64 fd164, [r13+49152]; +ld.shared.f64 fd165, [r13+57344]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+8192]; +ld.shared.f64 fd168, [r13+16384]; +ld.shared.f64 fd169, [r13+24576]; +ld.shared.f64 fd170, [r13+32768]; +ld.shared.f64 fd171, [r13+40960]; +ld.shared.f64 fd172, [r13+49152]; +ld.shared.f64 fd173, [r13+57344]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +add.f64 fd186, fd176, fd181; +sub.f64 fd187, fd177, fd180; +sub.f64 fd188, fd176, fd181; 
+add.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0dBFE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd210, fd202, 0dBFE6A09E667F3BCD, fd209; +mul.f64 fd211, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd212, fd205, 0dBFE6A09E667F3BCD; +sub.f64 fd213, fd211, fd212; +add.f64 fd214, fd211, fd212; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd210; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd210; +add.f64 fd223, fd184, fd201; +sub.f64 fd224, fd185, fd200; +sub.f64 fd225, fd184, fd201; +add.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd213; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd213; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 1016; +bfe.u32 r15, r5, 3, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd231, fd219; +mul.f64 fd236, fd232, fd220; +sub.f64 fd237, fd235, fd236; +mul.f64 fd238, fd231, fd220; +fma.rn.f64 fd239, fd232, fd219, fd238; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd242, fd223; +mul.f64 fd246, fd244, fd224; +sub.f64 fd247, fd245, fd246; +mul.f64 fd248, fd242, fd224; +fma.rn.f64 fd249, fd244, fd223, fd248; +mul.f64 
fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd252, fd227; +mul.f64 fd256, fd254, fd228; +sub.f64 fd257, fd255, fd256; +mul.f64 fd258, fd252, fd228; +fma.rn.f64 fd259, fd254, fd227, fd258; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd262, fd217; +mul.f64 fd266, fd264, fd218; +sub.f64 fd267, fd265, fd266; +mul.f64 fd268, fd262, fd218; +fma.rn.f64 fd269, fd264, fd217, fd268; +ld.global.v2.f64 {fd270, fd271}, [rd8+2048]; +mul.f64 fd274, fd270, fd221; +mul.f64 fd275, fd271, fd222; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd270, fd222; +fma.rn.f64 fd278, fd271, fd221, fd277; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd281, fd225; +mul.f64 fd285, fd283, fd226; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd226; +fma.rn.f64 fd288, fd283, fd225, fd287; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd291, fd229; +mul.f64 fd295, fd293, fd230; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd230; +fma.rn.f64 fd298, fd293, fd229, fd297; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 65024; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 [r20+64], fd237; +st.shared.f64 [r20+128], fd247; +st.shared.f64 [r20+192], fd257; +st.shared.f64 [r20+256], fd267; +st.shared.f64 [r20+320], fd276; +st.shared.f64 [r20+384], fd286; +st.shared.f64 [r20+448], fd296; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+8192]; +ld.shared.f64 
fd301, [r21+16384]; +ld.shared.f64 fd302, [r21+24576]; +ld.shared.f64 fd303, [r21+32768]; +ld.shared.f64 fd304, [r21+40960]; +ld.shared.f64 fd305, [r21+49152]; +ld.shared.f64 fd306, [r21+57344]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+8192]; +ld.shared.f64 fd309, [r21+16384]; +ld.shared.f64 fd310, [r21+24576]; +ld.shared.f64 fd311, [r21+32768]; +ld.shared.f64 fd312, [r21+40960]; +ld.shared.f64 fd313, [r21+49152]; +ld.shared.f64 fd314, [r21+57344]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +add.f64 fd327, fd317, fd322; +sub.f64 fd328, fd318, fd321; +sub.f64 fd329, fd317, fd322; +add.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +add.f64 fd343, fd333, fd338; +sub.f64 fd344, fd334, fd337; +sub.f64 fd345, fd333, fd338; +add.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0dBFE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +mul.f64 fd350, fd344, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd351, fd343, 0dBFE6A09E667F3BCD, fd350; +mul.f64 fd352, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd353, fd346, 
0dBFE6A09E667F3BCD; +sub.f64 fd354, fd352, fd353; +add.f64 fd355, fd352, fd353; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd351; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd351; +add.f64 fd364, fd325, fd342; +sub.f64 fd365, fd326, fd341; +sub.f64 fd366, fd325, fd342; +add.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd354; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd354; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 960; +bfe.u32 r23, r5, 6, 4; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd372, fd360; +mul.f64 fd377, fd373, fd361; +sub.f64 fd378, fd376, fd377; +mul.f64 fd379, fd372, fd361; +fma.rn.f64 fd380, fd373, fd360, fd379; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd383, fd364; +mul.f64 fd387, fd385, fd365; +sub.f64 fd388, fd386, fd387; +mul.f64 fd389, fd383, fd365; +fma.rn.f64 fd390, fd385, fd364, fd389; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd393, fd368; +mul.f64 fd397, fd395, fd369; +sub.f64 fd398, fd396, fd397; +mul.f64 fd399, fd393, fd369; +fma.rn.f64 fd400, fd395, fd368, fd399; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 fd405, fd373, fd393, fd404; +mul.f64 fd406, fd403, fd358; +mul.f64 fd407, fd405, fd359; +sub.f64 fd408, fd406, fd407; +mul.f64 fd409, fd403, fd359; +fma.rn.f64 fd410, fd405, fd358, fd409; +ld.global.v2.f64 {fd411, fd412}, [rd11+256]; +mul.f64 fd415, fd411, fd362; +mul.f64 fd416, fd412, fd363; +sub.f64 fd417, fd415, fd416; +mul.f64 fd418, 
fd411, fd363; +fma.rn.f64 fd419, fd412, fd362, fd418; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd422, fd366; +mul.f64 fd426, fd424, fd367; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd422, fd367; +fma.rn.f64 fd429, fd424, fd366, fd428; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd432, fd370; +mul.f64 fd436, fd434, fd371; +sub.f64 fd437, fd435, fd436; +mul.f64 fd438, fd432, fd371; +fma.rn.f64 fd439, fd434, fd370, fd438; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 61440; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd378; +st.shared.f64 [r27+1024], fd388; +st.shared.f64 [r27+1536], fd398; +st.shared.f64 [r27+2048], fd408; +st.shared.f64 [r27+2560], fd417; +st.shared.f64 [r27+3072], fd427; +st.shared.f64 [r27+3584], fd437; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+8192]; +ld.shared.f64 fd442, [r28+16384]; +ld.shared.f64 fd443, [r28+24576]; +ld.shared.f64 fd444, [r28+32768]; +ld.shared.f64 fd445, [r28+40960]; +ld.shared.f64 fd446, [r28+49152]; +ld.shared.f64 fd447, [r28+57344]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 [r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+8192]; +ld.shared.f64 fd450, [r28+16384]; +ld.shared.f64 fd451, [r28+24576]; +ld.shared.f64 fd452, [r28+32768]; +ld.shared.f64 fd453, [r28+40960]; +ld.shared.f64 fd454, [r28+49152]; +ld.shared.f64 fd455, [r28+57344]; +add.f64 fd456, fd440, fd444; 
+add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd456, fd460; +add.f64 fd465, fd457, fd461; +sub.f64 fd466, fd456, fd460; +sub.f64 fd467, fd457, fd461; +add.f64 fd468, fd458, fd463; +sub.f64 fd469, fd459, fd462; +sub.f64 fd470, fd458, fd463; +add.f64 fd471, fd459, fd462; +add.f64 fd472, fd441, fd445; +add.f64 fd473, fd449, fd453; +sub.f64 fd474, fd441, fd445; +sub.f64 fd475, fd449, fd453; +add.f64 fd476, fd443, fd447; +add.f64 fd477, fd451, fd455; +sub.f64 fd478, fd443, fd447; +sub.f64 fd479, fd451, fd455; +add.f64 fd480, fd472, fd476; +add.f64 fd481, fd473, fd477; +sub.f64 fd482, fd472, fd476; +sub.f64 fd483, fd473, fd477; +add.f64 fd484, fd474, fd479; +sub.f64 fd485, fd475, fd478; +sub.f64 fd486, fd474, fd479; +add.f64 fd487, fd475, fd478; +mul.f64 fd488, fd484, 0d3FE6A09E667F3BCD; +mul.f64 fd489, fd485, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd488, fd489; +mul.f64 fd491, fd485, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd492, fd484, 0dBFE6A09E667F3BCD, fd491; +mul.f64 fd493, fd486, 0dBFE6A09E667F3BCD; +mul.f64 fd494, fd487, 0dBFE6A09E667F3BCD; +sub.f64 fd495, fd493, fd494; +add.f64 fd496, fd493, fd494; +add.f64 fd497, fd464, fd480; +add.f64 fd498, fd465, fd481; +sub.f64 fd499, fd464, fd480; +sub.f64 fd500, fd465, fd481; +add.f64 fd501, fd468, fd490; +add.f64 fd502, fd469, fd492; +sub.f64 fd503, fd468, fd490; +sub.f64 fd504, fd469, fd492; +add.f64 fd505, fd466, fd483; +sub.f64 fd506, fd467, fd482; +sub.f64 fd507, fd466, fd483; +add.f64 fd508, fd467, fd482; +add.f64 fd509, fd470, fd495; +add.f64 fd510, fd471, fd496; +sub.f64 fd511, fd470, fd495; +sub.f64 fd512, fd471, fd496; +and.b32 r29, r5, 512; +bfe.u32 r30, r5, 9, 1; +mul.wide.u32 rd12, r30, 16; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd513, fd514}, [rd14]; +mul.f64 fd517, fd513, fd501; +mul.f64 fd518, fd514, fd502; 
+sub.f64 fd519, fd517, fd518; +mul.f64 fd520, fd513, fd502; +fma.rn.f64 fd521, fd514, fd501, fd520; +mul.f64 fd522, fd513, fd513; +mul.f64 fd523, fd514, fd514; +sub.f64 fd524, fd522, fd523; +mul.f64 fd525, fd514, fd513; +fma.rn.f64 fd526, fd514, fd513, fd525; +mul.f64 fd527, fd524, fd505; +mul.f64 fd528, fd526, fd506; +sub.f64 fd529, fd527, fd528; +mul.f64 fd530, fd524, fd506; +fma.rn.f64 fd531, fd526, fd505, fd530; +mul.f64 fd532, fd513, fd524; +mul.f64 fd533, fd514, fd526; +sub.f64 fd534, fd532, fd533; +mul.f64 fd535, fd513, fd526; +fma.rn.f64 fd536, fd514, fd524, fd535; +mul.f64 fd537, fd534, fd509; +mul.f64 fd538, fd536, fd510; +sub.f64 fd539, fd537, fd538; +mul.f64 fd540, fd534, fd510; +fma.rn.f64 fd541, fd536, fd509, fd540; +mul.f64 fd542, fd513, fd534; +mul.f64 fd543, fd514, fd536; +sub.f64 fd544, fd542, fd543; +mul.f64 fd545, fd513, fd536; +fma.rn.f64 fd546, fd514, fd534, fd545; +mul.f64 fd547, fd544, fd499; +mul.f64 fd548, fd546, fd500; +sub.f64 fd549, fd547, fd548; +mul.f64 fd550, fd544, fd500; +fma.rn.f64 fd551, fd546, fd499, fd550; +ld.global.v2.f64 {fd552, fd553}, [rd14+32]; +mul.f64 fd556, fd552, fd503; +mul.f64 fd557, fd553, fd504; +sub.f64 fd558, fd556, fd557; +mul.f64 fd559, fd552, fd504; +fma.rn.f64 fd560, fd553, fd503, fd559; +mul.f64 fd561, fd513, fd552; +mul.f64 fd562, fd514, fd553; +sub.f64 fd563, fd561, fd562; +mul.f64 fd564, fd513, fd553; +fma.rn.f64 fd565, fd514, fd552, fd564; +mul.f64 fd566, fd563, fd507; +mul.f64 fd567, fd565, fd508; +sub.f64 fd568, fd566, fd567; +mul.f64 fd569, fd563, fd508; +fma.rn.f64 fd570, fd565, fd507, fd569; +mul.f64 fd571, fd513, fd563; +mul.f64 fd572, fd514, fd565; +sub.f64 fd573, fd571, fd572; +mul.f64 fd574, fd513, fd565; +fma.rn.f64 fd575, fd514, fd563, fd574; +mul.f64 fd576, fd573, fd511; +mul.f64 fd577, fd575, fd512; +sub.f64 fd578, fd576, fd577; +mul.f64 fd579, fd573, fd512; +fma.rn.f64 fd580, fd575, fd511, fd579; +and.b32 r31, r16, 4088; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 32768; 
+add.s32 r34, r32, r33; +st.shared.f64 [r34], fd497; +st.shared.f64 [r34+4096], fd519; +st.shared.f64 [r34+8192], fd529; +st.shared.f64 [r34+12288], fd539; +st.shared.f64 [r34+16384], fd549; +st.shared.f64 [r34+20480], fd558; +st.shared.f64 [r34+24576], fd568; +st.shared.f64 [r34+28672], fd578; +barrier.sync 0; +mad.lo.s32 r35, r29, -56, r34; +ld.shared.f64 fd581, [r35]; +ld.shared.f64 fd582, [r35+8192]; +ld.shared.f64 fd583, [r35+16384]; +ld.shared.f64 fd584, [r35+24576]; +ld.shared.f64 fd585, [r35+32768]; +ld.shared.f64 fd586, [r35+40960]; +ld.shared.f64 fd587, [r35+49152]; +ld.shared.f64 fd588, [r35+57344]; +barrier.sync 0; +st.shared.f64 [r34], fd498; +st.shared.f64 [r34+4096], fd521; +st.shared.f64 [r34+8192], fd531; +st.shared.f64 [r34+12288], fd541; +st.shared.f64 [r34+16384], fd551; +st.shared.f64 [r34+20480], fd560; +st.shared.f64 [r34+24576], fd570; +st.shared.f64 [r34+28672], fd580; +barrier.sync 0; +ld.shared.f64 fd589, [r35]; +ld.shared.f64 fd590, [r35+8192]; +ld.shared.f64 fd591, [r35+16384]; +ld.shared.f64 fd592, [r35+24576]; +ld.shared.f64 fd593, [r35+32768]; +ld.shared.f64 fd594, [r35+40960]; +ld.shared.f64 fd595, [r35+49152]; +ld.shared.f64 fd596, [r35+57344]; +add.f64 %0, fd581, fd585; +add.f64 %1, fd589, fd593; +add.f64 %2, fd582, fd586; +add.f64 %3, fd590, fd594; +add.f64 %4, fd583, fd587; +add.f64 %5, fd591, fd595; +add.f64 %6, fd584, fd588; +add.f64 %7, fd592, fd596; +sub.f64 %8, fd581, fd585; +sub.f64 %9, fd589, fd593; +sub.f64 %10, fd582, fd586; +sub.f64 %11, fd590, fd594; +sub.f64 %12, fd583, fd587; +sub.f64 %13, fd591, fd595; +sub.f64 %14, fd584, fd588; +sub.f64 %15, fd592, fd596; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_8192), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), 
"l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<498, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<1162>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %36, %57; +add.f64 fd66, %37, %59; +sub.f64 fd67, %36, %57; +sub.f64 fd68, %37, %59; +add.f64 fd69, %46, %68; +add.f64 fd70, %48, %69; +sub.f64 fd71, %46, %68; +sub.f64 fd72, %48, %69; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +add.f64 fd77, fd67, fd72; +sub.f64 fd78, fd68, fd71; +sub.f64 fd79, fd67, fd72; +add.f64 fd80, fd68, fd71; +add.f64 fd81, %41, %62; +add.f64 fd82, %43, %64; +sub.f64 fd83, %41, %62; +sub.f64 fd84, %43, %64; +add.f64 fd85, %52, %73; +add.f64 fd86, %53, %75; +sub.f64 fd87, %52, %73; +sub.f64 fd88, %53, %75; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +add.f64 fd93, fd83, fd88; +sub.f64 fd94, fd84, fd87; +sub.f64 fd95, fd83, fd88; +add.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0dBFE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +mul.f64 fd100, fd94, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd101, fd93, 0dBFE6A09E667F3BCD, fd100; +mul.f64 fd102, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd103, fd96, 0dBFE6A09E667F3BCD; +sub.f64 fd104, fd102, fd103; +add.f64 fd105, fd102, fd103; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 
fd111, fd78, fd101; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd101; +add.f64 fd114, fd75, fd92; +sub.f64 fd115, fd76, fd91; +sub.f64 fd116, fd75, fd92; +add.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd104; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd104; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %60; +add.f64 fd123, %40, %61; +sub.f64 fd124, %38, %60; +sub.f64 fd125, %40, %61; +add.f64 fd126, %49, %70; +add.f64 fd127, %51, %72; +sub.f64 fd128, %49, %70; +sub.f64 fd129, %51, %72; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +add.f64 fd134, fd124, fd129; +sub.f64 fd135, fd125, fd128; +sub.f64 fd136, fd124, fd129; +add.f64 fd137, fd125, fd128; +add.f64 fd138, %44, %65; +add.f64 fd139, %45, %67; +sub.f64 fd140, %44, %65; +sub.f64 fd141, %45, %67; +add.f64 fd142, %54, %76; +add.f64 fd143, %56, %77; +sub.f64 fd144, %54, %76; +sub.f64 fd145, %56, %77; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +add.f64 fd150, fd140, fd145; +sub.f64 fd151, fd141, fd144; +sub.f64 fd152, fd140, fd145; +add.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0dBFE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +mul.f64 fd157, fd151, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd158, fd150, 0dBFE6A09E667F3BCD, fd157; +mul.f64 fd159, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd160, fd153, 0dBFE6A09E667F3BCD; +sub.f64 fd161, fd159, fd160; +add.f64 fd162, fd159, fd160; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd158; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd158; +add.f64 fd171, fd132, fd149; +sub.f64 fd172, fd133, fd148; +sub.f64 fd173, fd132, fd149; +add.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd161; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, 
fd136, fd161; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0dBFD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0dBFD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0dBFE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +mul.f64 fd187, fd172, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd188, fd171, 0dBFE6A09E667F3BCD, fd187; +mul.f64 fd189, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd190, fd176, 0dBFED906BCF328D46; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd193, fd175, 0dBFED906BCF328D46, fd192; +mul.f64 fd194, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd195, fd170, 0dBFED906BCF328D46; +sub.f64 fd196, fd194, fd195; +mul.f64 fd197, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd198, fd169, 0dBFED906BCF328D46, fd197; +mul.f64 fd199, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd200, fd174, 0dBFE6A09E667F3BCD; +sub.f64 fd201, fd199, fd200; +add.f64 fd202, fd199, fd200; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0dBFD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0dBFD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd188; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd188; +add.f64 fd220, fd118, fd191; +add.f64 fd221, fd119, fd193; +sub.f64 fd222, fd118, fd191; +sub.f64 fd223, fd119, fd193; +add.f64 fd224, fd108, fd166; +sub.f64 fd225, fd109, fd165; +sub.f64 fd226, fd108, fd166; +add.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd196; +add.f64 fd229, fd113, fd198; +sub.f64 fd230, fd112, fd196; +sub.f64 fd231, fd113, fd198; +add.f64 fd232, fd116, fd201; 
+add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd201; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd240, fd212; +mul.f64 fd245, fd241, fd213; +sub.f64 fd246, fd244, fd245; +mul.f64 fd247, fd240, fd213; +fma.rn.f64 fd248, fd241, fd212, fd247; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd251, fd216; +mul.f64 fd255, fd253, fd217; +sub.f64 fd256, fd254, fd255; +mul.f64 fd257, fd251, fd217; +fma.rn.f64 fd258, fd253, fd216, fd257; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd261, fd220; +mul.f64 fd265, fd263, fd221; +sub.f64 fd266, fd264, fd265; +mul.f64 fd267, fd261, fd221; +fma.rn.f64 fd268, fd263, fd220, fd267; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd271, fd224; +mul.f64 fd275, fd273, fd225; +sub.f64 fd276, fd274, fd275; +mul.f64 fd277, fd271, fd225; +fma.rn.f64 fd278, fd273, fd224, fd277; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; +mul.f64 fd284, fd281, fd228; +mul.f64 fd285, fd283, fd229; +sub.f64 fd286, fd284, fd285; +mul.f64 fd287, fd281, fd229; +fma.rn.f64 fd288, fd283, fd228, fd287; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, 
fd291, fd232; +mul.f64 fd295, fd293, fd233; +sub.f64 fd296, fd294, fd295; +mul.f64 fd297, fd291, fd233; +fma.rn.f64 fd298, fd293, fd232, fd297; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd301, fd236; +mul.f64 fd305, fd303, fd237; +sub.f64 fd306, fd304, fd305; +mul.f64 fd307, fd301, fd237; +fma.rn.f64 fd308, fd303, fd236, fd307; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd311, fd210; +mul.f64 fd315, fd313, fd211; +sub.f64 fd316, fd314, fd315; +mul.f64 fd317, fd311, fd211; +fma.rn.f64 fd318, fd313, fd210, fd317; +ld.global.v2.f64 {fd319, fd320}, [rd5+8192]; +mul.f64 fd323, fd319, fd214; +mul.f64 fd324, fd320, fd215; +sub.f64 fd325, fd323, fd324; +mul.f64 fd326, fd319, fd215; +fma.rn.f64 fd327, fd320, fd214, fd326; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd330, fd218; +mul.f64 fd334, fd332, fd219; +sub.f64 fd335, fd333, fd334; +mul.f64 fd336, fd330, fd219; +fma.rn.f64 fd337, fd332, fd218, fd336; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd340, fd222; +mul.f64 fd344, fd342, fd223; +sub.f64 fd345, fd343, fd344; +mul.f64 fd346, fd340, fd223; +fma.rn.f64 fd347, fd342, fd222, fd346; +mul.f64 fd348, fd240, fd340; +mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd350, fd226; +mul.f64 fd354, fd352, fd227; +sub.f64 fd355, fd353, fd354; +mul.f64 fd356, fd350, fd227; +fma.rn.f64 fd357, fd352, fd226, fd356; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, 
fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd360, fd230; +mul.f64 fd364, fd362, fd231; +sub.f64 fd365, fd363, fd364; +mul.f64 fd366, fd360, fd231; +fma.rn.f64 fd367, fd362, fd230, fd366; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd370, fd234; +mul.f64 fd374, fd372, fd235; +sub.f64 fd375, fd373, fd374; +mul.f64 fd376, fd370, fd235; +fma.rn.f64 fd377, fd372, fd234, fd376; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd380, fd238; +mul.f64 fd384, fd382, fd239; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd380, fd239; +fma.rn.f64 fd387, fd382, fd238, fd386; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65408; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd246}; +st.shared.v2.f64 [r12+16], {fd256, fd266}; +st.shared.v2.f64 [r12+32], {fd276, fd286}; +st.shared.v2.f64 [r12+48], {fd296, fd306}; +st.shared.v2.f64 [r12+64], {fd316, fd325}; +st.shared.v2.f64 [r12+80], {fd335, fd345}; +st.shared.v2.f64 [r12+96], {fd355, fd365}; +st.shared.v2.f64 [r12+112], {fd375, fd385}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+4096]; +ld.shared.f64 fd390, [r13+8192]; +ld.shared.f64 fd391, [r13+12288]; +ld.shared.f64 fd392, [r13+16384]; +ld.shared.f64 fd393, [r13+20480]; +ld.shared.f64 fd394, [r13+24576]; +ld.shared.f64 fd395, [r13+28672]; +ld.shared.f64 fd396, [r13+32768]; +ld.shared.f64 fd397, [r13+36864]; +ld.shared.f64 fd398, [r13+40960]; +ld.shared.f64 fd399, [r13+45056]; +ld.shared.f64 fd400, [r13+49152]; +ld.shared.f64 fd401, [r13+53248]; +ld.shared.f64 fd402, [r13+57344]; +ld.shared.f64 fd403, [r13+61440]; 
+barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+4096]; +ld.shared.f64 fd406, [r13+8192]; +ld.shared.f64 fd407, [r13+12288]; +ld.shared.f64 fd408, [r13+16384]; +ld.shared.f64 fd409, [r13+20480]; +ld.shared.f64 fd410, [r13+24576]; +ld.shared.f64 fd411, [r13+28672]; +ld.shared.f64 fd412, [r13+32768]; +ld.shared.f64 fd413, [r13+36864]; +ld.shared.f64 fd414, [r13+40960]; +ld.shared.f64 fd415, [r13+45056]; +ld.shared.f64 fd416, [r13+49152]; +ld.shared.f64 fd417, [r13+53248]; +ld.shared.f64 fd418, [r13+57344]; +ld.shared.f64 fd419, [r13+61440]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +add.f64 fd432, fd422, fd427; +sub.f64 fd433, fd423, fd426; +sub.f64 fd434, fd422, fd427; +add.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +add.f64 fd448, fd438, fd443; +sub.f64 fd449, fd439, fd442; +sub.f64 fd450, fd438, fd443; +add.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0dBFE6A09E667F3BCD; +sub.f64 fd454, 
fd452, fd453; +mul.f64 fd455, fd449, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd456, fd448, 0dBFE6A09E667F3BCD, fd455; +mul.f64 fd457, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd458, fd451, 0dBFE6A09E667F3BCD; +sub.f64 fd459, fd457, fd458; +add.f64 fd460, fd457, fd458; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd456; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd456; +add.f64 fd469, fd430, fd447; +sub.f64 fd470, fd431, fd446; +sub.f64 fd471, fd430, fd447; +add.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd459; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd459; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +add.f64 fd489, fd479, fd484; +sub.f64 fd490, fd480, fd483; +sub.f64 fd491, fd479, fd484; +add.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +add.f64 fd505, fd495, fd500; +sub.f64 fd506, fd496, fd499; +sub.f64 fd507, fd495, fd500; +add.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0dBFE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +mul.f64 fd512, fd506, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd513, fd505, 0dBFE6A09E667F3BCD, fd512; +mul.f64 fd514, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd515, fd508, 0dBFE6A09E667F3BCD; 
+sub.f64 fd516, fd514, fd515; +add.f64 fd517, fd514, fd515; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd513; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd513; +add.f64 fd526, fd487, fd504; +sub.f64 fd527, fd488, fd503; +sub.f64 fd528, fd487, fd504; +add.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd516; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd516; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0dBFD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0dBFD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0dBFE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +mul.f64 fd542, fd527, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd543, fd526, 0dBFE6A09E667F3BCD, fd542; +mul.f64 fd544, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd545, fd531, 0dBFED906BCF328D46; +sub.f64 fd546, fd544, fd545; +mul.f64 fd547, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd548, fd530, 0dBFED906BCF328D46, fd547; +mul.f64 fd549, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd550, fd525, 0dBFED906BCF328D46; +sub.f64 fd551, fd549, fd550; +mul.f64 fd552, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd553, fd524, 0dBFED906BCF328D46, fd552; +mul.f64 fd554, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd555, fd529, 0dBFE6A09E667F3BCD; +sub.f64 fd556, fd554, fd555; +add.f64 fd557, fd554, fd555; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0dBFD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0dBFD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 
fd571, fd469, fd541; +add.f64 fd572, fd470, fd543; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd543; +add.f64 fd575, fd473, fd546; +add.f64 fd576, fd474, fd548; +sub.f64 fd577, fd473, fd546; +sub.f64 fd578, fd474, fd548; +add.f64 fd579, fd463, fd521; +sub.f64 fd580, fd464, fd520; +sub.f64 fd581, fd463, fd521; +add.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd551; +add.f64 fd584, fd468, fd553; +sub.f64 fd585, fd467, fd551; +sub.f64 fd586, fd468, fd553; +add.f64 fd587, fd471, fd556; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd556; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 496; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd595, fd567; +mul.f64 fd600, fd596, fd568; +sub.f64 fd601, fd599, fd600; +mul.f64 fd602, fd595, fd568; +fma.rn.f64 fd603, fd596, fd567, fd602; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd606, fd571; +mul.f64 fd610, fd608, fd572; +sub.f64 fd611, fd609, fd610; +mul.f64 fd612, fd606, fd572; +fma.rn.f64 fd613, fd608, fd571, fd612; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd616, fd575; +mul.f64 fd620, fd618, fd576; +sub.f64 fd621, fd619, fd620; +mul.f64 fd622, fd616, fd576; +fma.rn.f64 fd623, fd618, fd575, fd622; +mul.f64 fd624, fd595, fd616; +mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd626, fd579; +mul.f64 fd630, fd628, fd580; +sub.f64 fd631, fd629, fd630; +mul.f64 fd632, fd626, fd580; +fma.rn.f64 fd633, fd628, fd579, fd632; +mul.f64 fd634, fd595, fd626; +mul.f64 
fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd636, fd583; +mul.f64 fd640, fd638, fd584; +sub.f64 fd641, fd639, fd640; +mul.f64 fd642, fd636, fd584; +fma.rn.f64 fd643, fd638, fd583, fd642; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd646, fd587; +mul.f64 fd650, fd648, fd588; +sub.f64 fd651, fd649, fd650; +mul.f64 fd652, fd646, fd588; +fma.rn.f64 fd653, fd648, fd587, fd652; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd656, fd591; +mul.f64 fd660, fd658, fd592; +sub.f64 fd661, fd659, fd660; +mul.f64 fd662, fd656, fd592; +fma.rn.f64 fd663, fd658, fd591, fd662; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd666, fd565; +mul.f64 fd670, fd668, fd566; +sub.f64 fd671, fd669, fd670; +mul.f64 fd672, fd666, fd566; +fma.rn.f64 fd673, fd668, fd565, fd672; +ld.global.v2.f64 {fd674, fd675}, [rd8+512]; +mul.f64 fd678, fd674, fd569; +mul.f64 fd679, fd675, fd570; +sub.f64 fd680, fd678, fd679; +mul.f64 fd681, fd674, fd570; +fma.rn.f64 fd682, fd675, fd569, fd681; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd685, fd573; +mul.f64 fd689, fd687, fd574; +sub.f64 fd690, fd688, fd689; +mul.f64 fd691, fd685, fd574; +fma.rn.f64 fd692, fd687, fd573, fd691; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd695, fd577; +mul.f64 fd699, fd697, fd578; +sub.f64 
fd700, fd698, fd699; +mul.f64 fd701, fd695, fd578; +fma.rn.f64 fd702, fd697, fd577, fd701; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd705, fd581; +mul.f64 fd709, fd707, fd582; +sub.f64 fd710, fd708, fd709; +mul.f64 fd711, fd705, fd582; +fma.rn.f64 fd712, fd707, fd581, fd711; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd715, fd585; +mul.f64 fd719, fd717, fd586; +sub.f64 fd720, fd718, fd719; +mul.f64 fd721, fd715, fd586; +fma.rn.f64 fd722, fd717, fd585, fd721; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd725, fd589; +mul.f64 fd729, fd727, fd590; +sub.f64 fd730, fd728, fd729; +mul.f64 fd731, fd725, fd590; +fma.rn.f64 fd732, fd727, fd589, fd731; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd735, fd593; +mul.f64 fd739, fd737, fd594; +sub.f64 fd740, fd738, fd739; +mul.f64 fd741, fd735, fd594; +fma.rn.f64 fd742, fd737, fd593, fd741; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 63488; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd601; +st.shared.f64 [r19+256], fd611; +st.shared.f64 [r19+384], fd621; +st.shared.f64 [r19+512], fd631; +st.shared.f64 [r19+640], fd641; +st.shared.f64 [r19+768], fd651; +st.shared.f64 [r19+896], fd661; +st.shared.f64 [r19+1024], fd671; +st.shared.f64 [r19+1152], fd680; +st.shared.f64 [r19+1280], fd690; +st.shared.f64 [r19+1408], fd700; +st.shared.f64 [r19+1536], fd710; +st.shared.f64 [r19+1664], fd720; +st.shared.f64 [r19+1792], 
fd730; +st.shared.f64 [r19+1920], fd740; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+4096]; +ld.shared.f64 fd745, [r20+8192]; +ld.shared.f64 fd746, [r20+12288]; +ld.shared.f64 fd747, [r20+16384]; +ld.shared.f64 fd748, [r20+20480]; +ld.shared.f64 fd749, [r20+24576]; +ld.shared.f64 fd750, [r20+28672]; +ld.shared.f64 fd751, [r20+32768]; +ld.shared.f64 fd752, [r20+36864]; +ld.shared.f64 fd753, [r20+40960]; +ld.shared.f64 fd754, [r20+45056]; +ld.shared.f64 fd755, [r20+49152]; +ld.shared.f64 fd756, [r20+53248]; +ld.shared.f64 fd757, [r20+57344]; +ld.shared.f64 fd758, [r20+61440]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+4096]; +ld.shared.f64 fd761, [r20+8192]; +ld.shared.f64 fd762, [r20+12288]; +ld.shared.f64 fd763, [r20+16384]; +ld.shared.f64 fd764, [r20+20480]; +ld.shared.f64 fd765, [r20+24576]; +ld.shared.f64 fd766, [r20+28672]; +ld.shared.f64 fd767, [r20+32768]; +ld.shared.f64 fd768, [r20+36864]; +ld.shared.f64 fd769, [r20+40960]; +ld.shared.f64 fd770, [r20+45056]; +ld.shared.f64 fd771, [r20+49152]; +ld.shared.f64 fd772, [r20+53248]; +ld.shared.f64 fd773, [r20+57344]; +ld.shared.f64 fd774, [r20+61440]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, 
fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +add.f64 fd787, fd777, fd782; +sub.f64 fd788, fd778, fd781; +sub.f64 fd789, fd777, fd782; +add.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +add.f64 fd803, fd793, fd798; +sub.f64 fd804, fd794, fd797; +sub.f64 fd805, fd793, fd798; +add.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0dBFE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +mul.f64 fd810, fd804, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd811, fd803, 0dBFE6A09E667F3BCD, fd810; +mul.f64 fd812, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd813, fd806, 0dBFE6A09E667F3BCD; +sub.f64 fd814, fd812, fd813; +add.f64 fd815, fd812, fd813; +add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd811; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd811; +add.f64 fd824, fd785, fd802; +sub.f64 fd825, fd786, fd801; +sub.f64 fd826, fd785, fd802; +add.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd814; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd814; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; +sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +add.f64 fd844, fd834, fd839; +sub.f64 fd845, fd835, fd838; +sub.f64 
fd846, fd834, fd839; +add.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +add.f64 fd860, fd850, fd855; +sub.f64 fd861, fd851, fd854; +sub.f64 fd862, fd850, fd855; +add.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0dBFE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +mul.f64 fd867, fd861, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd868, fd860, 0dBFE6A09E667F3BCD, fd867; +mul.f64 fd869, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd870, fd863, 0dBFE6A09E667F3BCD; +sub.f64 fd871, fd869, fd870; +add.f64 fd872, fd869, fd870; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, fd868; +sub.f64 fd879, fd844, fd866; +sub.f64 fd880, fd845, fd868; +add.f64 fd881, fd842, fd859; +sub.f64 fd882, fd843, fd858; +sub.f64 fd883, fd842, fd859; +add.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd871; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd871; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0dBFD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0dBFD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 0dBFE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +mul.f64 fd897, fd882, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd898, fd881, 0dBFE6A09E667F3BCD, fd897; +mul.f64 fd899, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd900, fd886, 0dBFED906BCF328D46; +sub.f64 fd901, fd899, fd900; +mul.f64 fd902, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd903, fd885, 0dBFED906BCF328D46, 
fd902; +mul.f64 fd904, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd905, fd880, 0dBFED906BCF328D46; +sub.f64 fd906, fd904, fd905; +mul.f64 fd907, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd908, fd879, 0dBFED906BCF328D46, fd907; +mul.f64 fd909, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd910, fd884, 0dBFE6A09E667F3BCD; +sub.f64 fd911, fd909, fd910; +add.f64 fd912, fd909, fd910; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0dBFD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0dBFD87DE2A6AEA963, fd916; +add.f64 fd918, fd816, fd873; +add.f64 fd919, fd817, fd874; +sub.f64 fd920, fd816, fd873; +sub.f64 fd921, fd817, fd874; +add.f64 fd922, fd820, fd891; +add.f64 fd923, fd821, fd893; +sub.f64 fd924, fd820, fd891; +sub.f64 fd925, fd821, fd893; +add.f64 fd926, fd824, fd896; +add.f64 fd927, fd825, fd898; +sub.f64 fd928, fd824, fd896; +sub.f64 fd929, fd825, fd898; +add.f64 fd930, fd828, fd901; +add.f64 fd931, fd829, fd903; +sub.f64 fd932, fd828, fd901; +sub.f64 fd933, fd829, fd903; +add.f64 fd934, fd818, fd876; +sub.f64 fd935, fd819, fd875; +sub.f64 fd936, fd818, fd876; +add.f64 fd937, fd819, fd875; +add.f64 fd938, fd822, fd906; +add.f64 fd939, fd823, fd908; +sub.f64 fd940, fd822, fd906; +sub.f64 fd941, fd823, fd908; +add.f64 fd942, fd826, fd911; +add.f64 fd943, fd827, fd912; +sub.f64 fd944, fd826, fd911; +sub.f64 fd945, fd827, fd912; +add.f64 fd946, fd830, fd915; +add.f64 fd947, fd831, fd917; +sub.f64 fd948, fd830, fd915; +sub.f64 fd949, fd831, fd917; +and.b32 r21, r5, 256; +bfe.u32 r22, r5, 8, 1; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd950, fd951}, [rd11]; +mul.f64 fd954, fd950, fd922; +mul.f64 fd955, fd951, fd923; +sub.f64 fd956, fd954, fd955; +mul.f64 fd957, fd950, fd923; +fma.rn.f64 fd958, fd951, fd922, fd957; +mul.f64 fd959, fd950, fd950; +mul.f64 fd960, fd951, fd951; +sub.f64 fd961, fd959, fd960; +mul.f64 fd962, fd951, fd950; +fma.rn.f64 fd963, 
fd951, fd950, fd962; +mul.f64 fd964, fd961, fd926; +mul.f64 fd965, fd963, fd927; +sub.f64 fd966, fd964, fd965; +mul.f64 fd967, fd961, fd927; +fma.rn.f64 fd968, fd963, fd926, fd967; +mul.f64 fd969, fd950, fd961; +mul.f64 fd970, fd951, fd963; +sub.f64 fd971, fd969, fd970; +mul.f64 fd972, fd950, fd963; +fma.rn.f64 fd973, fd951, fd961, fd972; +mul.f64 fd974, fd971, fd930; +mul.f64 fd975, fd973, fd931; +sub.f64 fd976, fd974, fd975; +mul.f64 fd977, fd971, fd931; +fma.rn.f64 fd978, fd973, fd930, fd977; +mul.f64 fd979, fd950, fd971; +mul.f64 fd980, fd951, fd973; +sub.f64 fd981, fd979, fd980; +mul.f64 fd982, fd950, fd973; +fma.rn.f64 fd983, fd951, fd971, fd982; +mul.f64 fd984, fd981, fd934; +mul.f64 fd985, fd983, fd935; +sub.f64 fd986, fd984, fd985; +mul.f64 fd987, fd981, fd935; +fma.rn.f64 fd988, fd983, fd934, fd987; +mul.f64 fd989, fd950, fd981; +mul.f64 fd990, fd951, fd983; +sub.f64 fd991, fd989, fd990; +mul.f64 fd992, fd950, fd983; +fma.rn.f64 fd993, fd951, fd981, fd992; +mul.f64 fd994, fd991, fd938; +mul.f64 fd995, fd993, fd939; +sub.f64 fd996, fd994, fd995; +mul.f64 fd997, fd991, fd939; +fma.rn.f64 fd998, fd993, fd938, fd997; +mul.f64 fd999, fd950, fd991; +mul.f64 fd1000, fd951, fd993; +sub.f64 fd1001, fd999, fd1000; +mul.f64 fd1002, fd950, fd993; +fma.rn.f64 fd1003, fd951, fd991, fd1002; +mul.f64 fd1004, fd1001, fd942; +mul.f64 fd1005, fd1003, fd943; +sub.f64 fd1006, fd1004, fd1005; +mul.f64 fd1007, fd1001, fd943; +fma.rn.f64 fd1008, fd1003, fd942, fd1007; +mul.f64 fd1009, fd950, fd1001; +mul.f64 fd1010, fd951, fd1003; +sub.f64 fd1011, fd1009, fd1010; +mul.f64 fd1012, fd950, fd1003; +fma.rn.f64 fd1013, fd951, fd1001, fd1012; +mul.f64 fd1014, fd1011, fd946; +mul.f64 fd1015, fd1013, fd947; +sub.f64 fd1016, fd1014, fd1015; +mul.f64 fd1017, fd1011, fd947; +fma.rn.f64 fd1018, fd1013, fd946, fd1017; +mul.f64 fd1019, fd950, fd1011; +mul.f64 fd1020, fd951, fd1013; +sub.f64 fd1021, fd1019, fd1020; +mul.f64 fd1022, fd950, fd1013; +fma.rn.f64 fd1023, fd951, fd1011, fd1022; 
+mul.f64 fd1024, fd1021, fd920; +mul.f64 fd1025, fd1023, fd921; +sub.f64 fd1026, fd1024, fd1025; +mul.f64 fd1027, fd1021, fd921; +fma.rn.f64 fd1028, fd1023, fd920, fd1027; +ld.global.v2.f64 {fd1029, fd1030}, [rd11+32]; +mul.f64 fd1033, fd1029, fd924; +mul.f64 fd1034, fd1030, fd925; +sub.f64 fd1035, fd1033, fd1034; +mul.f64 fd1036, fd1029, fd925; +fma.rn.f64 fd1037, fd1030, fd924, fd1036; +mul.f64 fd1038, fd950, fd1029; +mul.f64 fd1039, fd951, fd1030; +sub.f64 fd1040, fd1038, fd1039; +mul.f64 fd1041, fd950, fd1030; +fma.rn.f64 fd1042, fd951, fd1029, fd1041; +mul.f64 fd1043, fd1040, fd928; +mul.f64 fd1044, fd1042, fd929; +sub.f64 fd1045, fd1043, fd1044; +mul.f64 fd1046, fd1040, fd929; +fma.rn.f64 fd1047, fd1042, fd928, fd1046; +mul.f64 fd1048, fd950, fd1040; +mul.f64 fd1049, fd951, fd1042; +sub.f64 fd1050, fd1048, fd1049; +mul.f64 fd1051, fd950, fd1042; +fma.rn.f64 fd1052, fd951, fd1040, fd1051; +mul.f64 fd1053, fd1050, fd932; +mul.f64 fd1054, fd1052, fd933; +sub.f64 fd1055, fd1053, fd1054; +mul.f64 fd1056, fd1050, fd933; +fma.rn.f64 fd1057, fd1052, fd932, fd1056; +mul.f64 fd1058, fd950, fd1050; +mul.f64 fd1059, fd951, fd1052; +sub.f64 fd1060, fd1058, fd1059; +mul.f64 fd1061, fd950, fd1052; +fma.rn.f64 fd1062, fd951, fd1050, fd1061; +mul.f64 fd1063, fd1060, fd936; +mul.f64 fd1064, fd1062, fd937; +sub.f64 fd1065, fd1063, fd1064; +mul.f64 fd1066, fd1060, fd937; +fma.rn.f64 fd1067, fd1062, fd936, fd1066; +mul.f64 fd1068, fd950, fd1060; +mul.f64 fd1069, fd951, fd1062; +sub.f64 fd1070, fd1068, fd1069; +mul.f64 fd1071, fd950, fd1062; +fma.rn.f64 fd1072, fd951, fd1060, fd1071; +mul.f64 fd1073, fd1070, fd940; +mul.f64 fd1074, fd1072, fd941; +sub.f64 fd1075, fd1073, fd1074; +mul.f64 fd1076, fd1070, fd941; +fma.rn.f64 fd1077, fd1072, fd940, fd1076; +mul.f64 fd1078, fd950, fd1070; +mul.f64 fd1079, fd951, fd1072; +sub.f64 fd1080, fd1078, fd1079; +mul.f64 fd1081, fd950, fd1072; +fma.rn.f64 fd1082, fd951, fd1070, fd1081; +mul.f64 fd1083, fd1080, fd944; +mul.f64 fd1084, fd1082, 
fd945; +sub.f64 fd1085, fd1083, fd1084; +mul.f64 fd1086, fd1080, fd945; +fma.rn.f64 fd1087, fd1082, fd944, fd1086; +mul.f64 fd1088, fd950, fd1080; +mul.f64 fd1089, fd951, fd1082; +sub.f64 fd1090, fd1088, fd1089; +mul.f64 fd1091, fd950, fd1082; +fma.rn.f64 fd1092, fd951, fd1080, fd1091; +mul.f64 fd1093, fd1090, fd948; +mul.f64 fd1094, fd1092, fd949; +sub.f64 fd1095, fd1093, fd1094; +mul.f64 fd1096, fd1090, fd949; +fma.rn.f64 fd1097, fd1092, fd948, fd1096; +and.b32 r23, r15, 2040; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 32768; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd918; +st.shared.f64 [r26+2048], fd956; +st.shared.f64 [r26+4096], fd966; +st.shared.f64 [r26+6144], fd976; +st.shared.f64 [r26+8192], fd986; +st.shared.f64 [r26+10240], fd996; +st.shared.f64 [r26+12288], fd1006; +st.shared.f64 [r26+14336], fd1016; +st.shared.f64 [r26+16384], fd1026; +st.shared.f64 [r26+18432], fd1035; +st.shared.f64 [r26+20480], fd1045; +st.shared.f64 [r26+22528], fd1055; +st.shared.f64 [r26+24576], fd1065; +st.shared.f64 [r26+26624], fd1075; +st.shared.f64 [r26+28672], fd1085; +st.shared.f64 [r26+30720], fd1095; +barrier.sync 0; +mad.lo.s32 r27, r21, -120, r26; +ld.shared.f64 fd1098, [r27]; +ld.shared.f64 fd1099, [r27+4096]; +ld.shared.f64 fd1100, [r27+8192]; +ld.shared.f64 fd1101, [r27+12288]; +ld.shared.f64 fd1102, [r27+16384]; +ld.shared.f64 fd1103, [r27+20480]; +ld.shared.f64 fd1104, [r27+24576]; +ld.shared.f64 fd1105, [r27+28672]; +ld.shared.f64 fd1106, [r27+32768]; +ld.shared.f64 fd1107, [r27+36864]; +ld.shared.f64 fd1108, [r27+40960]; +ld.shared.f64 fd1109, [r27+45056]; +ld.shared.f64 fd1110, [r27+49152]; +ld.shared.f64 fd1111, [r27+53248]; +ld.shared.f64 fd1112, [r27+57344]; +ld.shared.f64 fd1113, [r27+61440]; +barrier.sync 0; +st.shared.f64 [r26], fd919; +st.shared.f64 [r26+2048], fd958; +st.shared.f64 [r26+4096], fd968; +st.shared.f64 [r26+6144], fd978; +st.shared.f64 [r26+8192], fd988; +st.shared.f64 [r26+10240], fd998; +st.shared.f64 [r26+12288], 
fd1008; +st.shared.f64 [r26+14336], fd1018; +st.shared.f64 [r26+16384], fd1028; +st.shared.f64 [r26+18432], fd1037; +st.shared.f64 [r26+20480], fd1047; +st.shared.f64 [r26+22528], fd1057; +st.shared.f64 [r26+24576], fd1067; +st.shared.f64 [r26+26624], fd1077; +st.shared.f64 [r26+28672], fd1087; +st.shared.f64 [r26+30720], fd1097; +barrier.sync 0; +ld.shared.f64 fd1114, [r27]; +ld.shared.f64 fd1115, [r27+4096]; +ld.shared.f64 fd1116, [r27+8192]; +ld.shared.f64 fd1117, [r27+12288]; +ld.shared.f64 fd1118, [r27+16384]; +ld.shared.f64 fd1119, [r27+20480]; +ld.shared.f64 fd1120, [r27+24576]; +ld.shared.f64 fd1121, [r27+28672]; +ld.shared.f64 fd1122, [r27+32768]; +ld.shared.f64 fd1123, [r27+36864]; +ld.shared.f64 fd1124, [r27+40960]; +ld.shared.f64 fd1125, [r27+45056]; +ld.shared.f64 fd1126, [r27+49152]; +ld.shared.f64 fd1127, [r27+53248]; +ld.shared.f64 fd1128, [r27+57344]; +ld.shared.f64 fd1129, [r27+61440]; +add.f64 %0, fd1098, fd1106; +add.f64 %1, fd1114, fd1122; +add.f64 %2, fd1099, fd1107; +add.f64 %3, fd1115, fd1123; +add.f64 %4, fd1100, fd1108; +add.f64 %5, fd1116, fd1124; +add.f64 %6, fd1101, fd1109; +add.f64 %7, fd1117, fd1125; +add.f64 %8, fd1102, fd1110; +add.f64 %9, fd1118, fd1126; +add.f64 %10, fd1103, fd1111; +add.f64 %11, fd1119, fd1127; +add.f64 %12, fd1104, fd1112; +add.f64 %13, fd1120, fd1128; +add.f64 %14, fd1105, fd1113; +add.f64 %15, fd1121, fd1129; +sub.f64 %16, fd1098, fd1106; +sub.f64 %17, fd1114, fd1122; +sub.f64 %18, fd1099, fd1107; +sub.f64 %19, fd1115, fd1123; +sub.f64 %20, fd1100, fd1108; +sub.f64 %21, fd1116, fd1124; +sub.f64 %22, fd1101, fd1109; +sub.f64 %23, fd1117, fd1125; +sub.f64 %24, fd1102, fd1110; +sub.f64 %25, fd1118, fd1126; +sub.f64 %26, fd1103, fd1111; +sub.f64 %27, fd1119, fd1127; +sub.f64 %28, fd1104, fd1112; +sub.f64 %29, fd1120, fd1128; +sub.f64 %30, fd1105, fd1113; +sub.f64 %31, fd1121, fd1129; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), 
"=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_8192), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1163, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<677>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %21, %31; +add.f64 fd34, %22, %33; +sub.f64 fd35, %21, %31; +sub.f64 fd36, %22, %33; +add.f64 fd37, %26, %37; +add.f64 fd38, %28, %38; +sub.f64 fd39, %26, %37; +sub.f64 fd40, %28, %38; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %23, %34; +add.f64 fd50, %25, %36; +sub.f64 
fd51, %23, %34; +sub.f64 fd52, %25, %36; +add.f64 fd53, %29, %39; +add.f64 fd54, %30, %40; +sub.f64 fd55, %29, %39; +sub.f64 fd56, %30, %40; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd69; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd69; +add.f64 fd80, fd43, fd60; +sub.f64 fd81, fd44, fd59; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd72; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd72; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd88, fd76; +mul.f64 fd93, fd89, fd77; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd97, fd80; +mul.f64 fd101, fd99, fd81; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd105, fd84; +mul.f64 fd109, fd107, fd85; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, 
fd89, fd105, fd114; +mul.f64 fd116, fd113, fd74; +mul.f64 fd117, fd115, fd75; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+16384]; +mul.f64 fd123, fd119, fd78; +mul.f64 fd124, fd120, fd79; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd128, fd82; +mul.f64 fd132, fd130, fd83; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd136, fd86; +mul.f64 fd140, fd138, fd87; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 130944; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd89, fd76, fd94; +sub.f64 fd145, fd92, fd93; +st.shared.v2.f64 [r12+16], {fd145, fd144}; +fma.rn.f64 fd146, fd99, fd80, fd102; +sub.f64 fd147, fd100, fd101; +st.shared.v2.f64 [r12+32], {fd147, fd146}; +sub.f64 fd148, fd108, fd109; +fma.rn.f64 fd149, fd107, fd84, fd110; +st.shared.v2.f64 [r12+48], {fd148, fd149}; +fma.rn.f64 fd150, fd115, fd74, fd118; +sub.f64 fd151, fd116, fd117; +st.shared.v2.f64 [r12+64], {fd151, fd150}; +fma.rn.f64 fd152, fd120, fd78, fd125; +sub.f64 fd153, fd123, fd124; +st.shared.v2.f64 [r12+80], {fd153, fd152}; +fma.rn.f64 fd154, fd130, fd82, fd133; +sub.f64 fd155, fd131, fd132; +st.shared.v2.f64 [r12+96], {fd155, fd154}; +sub.f64 fd156, fd139, fd140; +fma.rn.f64 fd157, fd138, fd86, fd141; +st.shared.v2.f64 [r12+112], {fd156, fd157}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+16384]; +ld.shared.v2.f64 {fd166, fd167}, [r13+32768]; +ld.shared.v2.f64 {fd170, fd171}, [r13+49152]; +ld.shared.v2.f64 {fd174, fd175}, [r13+65536]; +ld.shared.v2.f64 {fd178, fd179}, 
[r13+81920]; +ld.shared.v2.f64 {fd182, fd183}, [r13+98304]; +ld.shared.v2.f64 {fd186, fd187}, [r13+114688]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +add.f64 fd202, fd192, fd197; +sub.f64 fd203, fd193, fd196; +sub.f64 fd204, fd192, fd197; +add.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +add.f64 fd218, fd208, fd213; +sub.f64 fd219, fd209, fd212; +sub.f64 fd220, fd208, fd213; +add.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0dBFE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +mul.f64 fd225, fd219, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd226, fd218, 0dBFE6A09E667F3BCD, fd225; +mul.f64 fd227, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd228, fd221, 0dBFE6A09E667F3BCD; +sub.f64 fd229, fd227, fd228; +add.f64 fd230, fd227, fd228; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd226; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd226; +add.f64 fd237, fd200, fd217; +sub.f64 fd238, fd201, fd216; +sub.f64 fd239, fd200, fd217; +add.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd229; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd229; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 1016; +bfe.u32 r15, r5, 3, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, 
[rd8]; +mul.f64 fd249, fd245, fd233; +mul.f64 fd250, fd246, fd234; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd254, fd237; +mul.f64 fd258, fd256, fd238; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd262, fd241; +mul.f64 fd266, fd264, fd242; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd270, fd231; +mul.f64 fd274, fd272, fd232; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+2048]; +mul.f64 fd280, fd276, fd235; +mul.f64 fd281, fd277, fd236; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd285, fd239; +mul.f64 fd289, fd287, fd240; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd293, fd243; +mul.f64 fd297, fd295, fd244; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 130048; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd246, fd233, fd251; +sub.f64 fd302, fd249, fd250; +st.shared.v2.f64 [r19+128], {fd302, fd301}; +fma.rn.f64 fd303, fd256, fd237, fd259; +sub.f64 fd304, fd257, fd258; +st.shared.v2.f64 [r19+256], {fd304, fd303}; +fma.rn.f64 fd305, fd264, fd241, fd267; +sub.f64 fd306, fd265, fd266; 
+st.shared.v2.f64 [r19+384], {fd306, fd305}; +sub.f64 fd307, fd273, fd274; +fma.rn.f64 fd308, fd272, fd231, fd275; +st.shared.v2.f64 [r19+512], {fd307, fd308}; +fma.rn.f64 fd309, fd277, fd235, fd282; +sub.f64 fd310, fd280, fd281; +st.shared.v2.f64 [r19+640], {fd310, fd309}; +fma.rn.f64 fd311, fd287, fd239, fd290; +sub.f64 fd312, fd288, fd289; +st.shared.v2.f64 [r19+768], {fd312, fd311}; +fma.rn.f64 fd313, fd295, fd243, fd298; +sub.f64 fd314, fd296, fd297; +st.shared.v2.f64 [r19+896], {fd314, fd313}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+16384]; +ld.shared.v2.f64 {fd323, fd324}, [r20+32768]; +ld.shared.v2.f64 {fd327, fd328}, [r20+49152]; +ld.shared.v2.f64 {fd331, fd332}, [r20+65536]; +ld.shared.v2.f64 {fd335, fd336}, [r20+81920]; +ld.shared.v2.f64 {fd339, fd340}, [r20+98304]; +ld.shared.v2.f64 {fd343, fd344}, [r20+114688]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +add.f64 fd359, fd349, fd354; +sub.f64 fd360, fd350, fd353; +sub.f64 fd361, fd349, fd354; +add.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +add.f64 fd375, fd365, fd370; +sub.f64 fd376, fd366, fd369; +sub.f64 fd377, fd365, fd370; +add.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0dBFE6A09E667F3BCD; +sub.f64 fd381, fd379, 
fd380; +mul.f64 fd382, fd376, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd383, fd375, 0dBFE6A09E667F3BCD, fd382; +mul.f64 fd384, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd385, fd378, 0dBFE6A09E667F3BCD; +sub.f64 fd386, fd384, fd385; +add.f64 fd387, fd384, fd385; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd383; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd383; +add.f64 fd394, fd357, fd374; +sub.f64 fd395, fd358, fd373; +sub.f64 fd396, fd357, fd374; +add.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd386; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd386; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 960; +bfe.u32 r22, r5, 6, 4; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd402, fd390; +mul.f64 fd407, fd403, fd391; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd411, fd394; +mul.f64 fd415, fd413, fd395; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd419, fd398; +mul.f64 fd423, fd421, fd399; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd427, fd388; +mul.f64 fd431, fd429, fd389; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+256]; +mul.f64 fd437, fd433, fd392; +mul.f64 fd438, fd434, fd393; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd442, fd396; 
+mul.f64 fd446, fd444, fd397; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd450, fd400; +mul.f64 fd454, fd452, fd401; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 122880; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd403, fd390, fd408; +sub.f64 fd459, fd406, fd407; +st.shared.v2.f64 [r26+1024], {fd459, fd458}; +fma.rn.f64 fd460, fd413, fd394, fd416; +sub.f64 fd461, fd414, fd415; +st.shared.v2.f64 [r26+2048], {fd461, fd460}; +fma.rn.f64 fd462, fd421, fd398, fd424; +sub.f64 fd463, fd422, fd423; +st.shared.v2.f64 [r26+3072], {fd463, fd462}; +sub.f64 fd464, fd430, fd431; +fma.rn.f64 fd465, fd429, fd388, fd432; +st.shared.v2.f64 [r26+4096], {fd464, fd465}; +fma.rn.f64 fd466, fd434, fd392, fd439; +sub.f64 fd467, fd437, fd438; +st.shared.v2.f64 [r26+5120], {fd467, fd466}; +fma.rn.f64 fd468, fd444, fd396, fd447; +sub.f64 fd469, fd445, fd446; +st.shared.v2.f64 [r26+6144], {fd469, fd468}; +fma.rn.f64 fd470, fd452, fd400, fd455; +sub.f64 fd471, fd453, fd454; +st.shared.v2.f64 [r26+7168], {fd471, fd470}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+16384]; +ld.shared.v2.f64 {fd480, fd481}, [r27+32768]; +ld.shared.v2.f64 {fd484, fd485}, [r27+49152]; +ld.shared.v2.f64 {fd488, fd489}, [r27+65536]; +ld.shared.v2.f64 {fd492, fd493}, [r27+81920]; +ld.shared.v2.f64 {fd496, fd497}, [r27+98304]; +ld.shared.v2.f64 {fd500, fd501}, [r27+114688]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; 
+add.f64 fd512, fd504, fd508; +add.f64 fd513, fd505, fd509; +sub.f64 fd514, fd504, fd508; +sub.f64 fd515, fd505, fd509; +add.f64 fd516, fd506, fd511; +sub.f64 fd517, fd507, fd510; +sub.f64 fd518, fd506, fd511; +add.f64 fd519, fd507, fd510; +add.f64 fd520, fd476, fd492; +add.f64 fd521, fd477, fd493; +sub.f64 fd522, fd476, fd492; +sub.f64 fd523, fd477, fd493; +add.f64 fd524, fd484, fd500; +add.f64 fd525, fd485, fd501; +sub.f64 fd526, fd484, fd500; +sub.f64 fd527, fd485, fd501; +add.f64 fd528, fd520, fd524; +add.f64 fd529, fd521, fd525; +sub.f64 fd530, fd520, fd524; +sub.f64 fd531, fd521, fd525; +add.f64 fd532, fd522, fd527; +sub.f64 fd533, fd523, fd526; +sub.f64 fd534, fd522, fd527; +add.f64 fd535, fd523, fd526; +mul.f64 fd536, fd532, 0d3FE6A09E667F3BCD; +mul.f64 fd537, fd533, 0dBFE6A09E667F3BCD; +sub.f64 fd538, fd536, fd537; +mul.f64 fd539, fd533, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd540, fd532, 0dBFE6A09E667F3BCD, fd539; +mul.f64 fd541, fd534, 0dBFE6A09E667F3BCD; +mul.f64 fd542, fd535, 0dBFE6A09E667F3BCD; +sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +sub.f64 fd545, fd512, fd528; +sub.f64 fd546, fd513, fd529; +add.f64 fd547, fd516, fd538; +add.f64 fd548, fd517, fd540; +sub.f64 fd549, fd516, fd538; +sub.f64 fd550, fd517, fd540; +add.f64 fd551, fd514, fd531; +sub.f64 fd552, fd515, fd530; +sub.f64 fd553, fd514, fd531; +add.f64 fd554, fd515, fd530; +add.f64 fd555, fd518, fd543; +add.f64 fd556, fd519, fd544; +sub.f64 fd557, fd518, fd543; +sub.f64 fd558, fd519, fd544; +and.b32 r28, r5, 512; +bfe.u32 r29, r5, 9, 1; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd559, fd560}, [rd14]; +mul.f64 fd563, fd559, fd547; +mul.f64 fd564, fd560, fd548; +mul.f64 fd565, fd559, fd548; +mul.f64 fd566, fd559, fd559; +mul.f64 fd567, fd560, fd560; +sub.f64 fd568, fd566, fd567; +mul.f64 fd569, fd560, fd559; +fma.rn.f64 fd570, fd560, fd559, fd569; +mul.f64 fd571, fd568, fd551; +mul.f64 fd572, fd570, fd552; +mul.f64 fd573, fd568, 
fd552; +mul.f64 fd574, fd559, fd568; +mul.f64 fd575, fd560, fd570; +sub.f64 fd576, fd574, fd575; +mul.f64 fd577, fd559, fd570; +fma.rn.f64 fd578, fd560, fd568, fd577; +mul.f64 fd579, fd576, fd555; +mul.f64 fd580, fd578, fd556; +mul.f64 fd581, fd576, fd556; +mul.f64 fd582, fd559, fd576; +mul.f64 fd583, fd560, fd578; +sub.f64 fd584, fd582, fd583; +mul.f64 fd585, fd559, fd578; +fma.rn.f64 fd586, fd560, fd576, fd585; +mul.f64 fd587, fd584, fd545; +mul.f64 fd588, fd586, fd546; +mul.f64 fd589, fd584, fd546; +ld.global.v2.f64 {fd590, fd591}, [rd14+32]; +mul.f64 fd594, fd590, fd549; +mul.f64 fd595, fd591, fd550; +mul.f64 fd596, fd590, fd550; +mul.f64 fd597, fd559, fd590; +mul.f64 fd598, fd560, fd591; +sub.f64 fd599, fd597, fd598; +mul.f64 fd600, fd559, fd591; +fma.rn.f64 fd601, fd560, fd590, fd600; +mul.f64 fd602, fd599, fd553; +mul.f64 fd603, fd601, fd554; +mul.f64 fd604, fd599, fd554; +mul.f64 fd605, fd559, fd599; +mul.f64 fd606, fd560, fd601; +sub.f64 fd607, fd605, fd606; +mul.f64 fd608, fd559, fd601; +fma.rn.f64 fd609, fd560, fd599, fd608; +mul.f64 fd610, fd607, fd557; +mul.f64 fd611, fd609, fd558; +mul.f64 fd612, fd607, fd558; +and.b32 r30, r10, 8176; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 65536; +add.s32 r33, r31, r32; +add.f64 fd613, fd513, fd529; +add.f64 fd614, fd512, fd528; +st.shared.v2.f64 [r33], {fd614, fd613}; +fma.rn.f64 fd615, fd560, fd547, fd565; +sub.f64 fd616, fd563, fd564; +st.shared.v2.f64 [r33+8192], {fd616, fd615}; +fma.rn.f64 fd617, fd570, fd551, fd573; +sub.f64 fd618, fd571, fd572; +st.shared.v2.f64 [r33+16384], {fd618, fd617}; +fma.rn.f64 fd619, fd578, fd555, fd581; +sub.f64 fd620, fd579, fd580; +st.shared.v2.f64 [r33+24576], {fd620, fd619}; +sub.f64 fd621, fd587, fd588; +fma.rn.f64 fd622, fd586, fd545, fd589; +st.shared.v2.f64 [r33+32768], {fd621, fd622}; +fma.rn.f64 fd623, fd591, fd549, fd596; +sub.f64 fd624, fd594, fd595; +st.shared.v2.f64 [r33+40960], {fd624, fd623}; +fma.rn.f64 fd625, fd601, fd553, fd604; +sub.f64 fd626, 
fd602, fd603; +st.shared.v2.f64 [r33+49152], {fd626, fd625}; +fma.rn.f64 fd627, fd609, fd557, fd612; +sub.f64 fd628, fd610, fd611; +st.shared.v2.f64 [r33+57344], {fd628, fd627}; +barrier.sync 0; +mad.lo.s32 r34, r28, -112, r33; +ld.shared.v2.f64 {fd629, fd630}, [r34]; +ld.shared.v2.f64 {fd633, fd634}, [r34+16384]; +ld.shared.v2.f64 {fd637, fd638}, [r34+32768]; +ld.shared.v2.f64 {fd641, fd642}, [r34+49152]; +ld.shared.v2.f64 {fd645, fd646}, [r34+65536]; +ld.shared.v2.f64 {fd649, fd650}, [r34+81920]; +ld.shared.v2.f64 {fd653, fd654}, [r34+98304]; +ld.shared.v2.f64 {fd657, fd658}, [r34+114688]; +add.f64 %1, fd630, fd646; +add.f64 %0, fd629, fd645; +add.f64 %3, fd634, fd650; +add.f64 %2, fd633, fd649; +add.f64 %5, fd638, fd654; +add.f64 %4, fd637, fd653; +add.f64 %7, fd642, fd658; +add.f64 %6, fd641, fd657; +sub.f64 %9, fd630, fd646; +sub.f64 %8, fd629, fd645; +sub.f64 %11, fd634, fd650; +sub.f64 %10, fd633, fd649; +sub.f64 %13, fd638, fd654; +sub.f64 %12, fd637, fd653; +sub.f64 %15, fd642, fd658; +sub.f64 %14, fd641, fd657; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_8192), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_inv.hpp.inc 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..e890d63d14ac5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8192_fp64_inv.hpp.inc @@ -0,0 +1,3696 @@ +#ifndef CUFFTDX_FFT_8192_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_8192_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1168, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<63>; +.reg .f64 fd<1512>; +.reg .b64 rd<16>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +add.f64 fd65, %36, %52; +sub.f64 fd67, %36, %52; +add.f64 fd1503, %37, %68; +sub.f64 fd68, %37, %68; +add.f64 fd69, %44, %60; +sub.f64 fd71, %44, %60; +add.f64 fd1501, %69, %61; +sub.f64 fd72, %69, %61; +add.f64 fd73, fd65, fd69; +sub.f64 fd75, fd65, fd69; +add.f64 fd1500, fd1503, fd1501; +sub.f64 fd76, fd1503, fd1501; +sub.f64 fd77, fd67, fd72; +add.f64 fd79, fd67, fd72; +add.f64 fd1499, fd68, fd71; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %40, %56; +sub.f64 fd83, %40, %56; +add.f64 fd1496, %71, %70; +sub.f64 fd84, %71, %70; +add.f64 fd85, %48, %64; +sub.f64 fd87, %48, %64; +add.f64 fd1494, %49, %72; +sub.f64 fd88, %49, %72; +add.f64 fd89, fd81, fd85; +sub.f64 fd91, fd81, fd85; +add.f64 fd1493, fd1496, fd1494; +sub.f64 fd92, fd1496, fd1494; +sub.f64 fd93, fd83, fd88; +add.f64 fd95, fd83, fd88; +add.f64 fd1492, fd84, fd87; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd1492, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd1490, fd95, 0dBFE6A09E667F3BCD; +mul.f64 fd1491, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd1490, fd1491; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +sub.f64 fd108, 
fd73, fd89; +add.f64 fd1489, fd1500, fd1493; +sub.f64 fd109, fd1500, fd1493; +add.f64 fd110, fd77, fd99; +sub.f64 fd112, fd77, fd99; +add.f64 fd1488, fd1499, fd100; +sub.f64 fd113, fd1499, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd116, fd75, fd92; +add.f64 fd1487, fd76, fd91; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +sub.f64 fd120, fd79, fd103; +add.f64 fd1486, fd80, fd105; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %54; +sub.f64 fd124, %38, %54; +add.f64 fd1484, %73, %55; +sub.f64 fd125, %73, %55; +add.f64 fd126, %46, %62; +sub.f64 fd128, %46, %62; +add.f64 fd1481, %74, %75; +sub.f64 fd129, %74, %75; +add.f64 fd130, fd122, fd126; +sub.f64 fd132, fd122, fd126; +add.f64 fd1480, fd1484, fd1481; +sub.f64 fd133, fd1484, fd1481; +sub.f64 fd134, fd124, fd129; +add.f64 fd136, fd124, fd129; +add.f64 fd1479, fd125, fd128; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %42, %58; +sub.f64 fd140, %42, %58; +add.f64 fd1477, %43, %76; +sub.f64 fd141, %43, %76; +add.f64 fd142, %50, %66; +sub.f64 fd144, %50, %66; +add.f64 fd1475, %77, %67; +sub.f64 fd145, %77, %67; +add.f64 fd146, fd138, fd142; +sub.f64 fd148, fd138, fd142; +add.f64 fd1474, fd1477, fd1475; +sub.f64 fd149, fd1477, fd1475; +sub.f64 fd150, fd140, fd145; +add.f64 fd152, fd140, fd145; +add.f64 fd1473, fd141, fd144; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd1473, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd1471, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd1472, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd1471, fd1472; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +sub.f64 fd165, fd130, fd146; +add.f64 fd1470, fd1480, fd1474; +sub.f64 fd166, fd1480, fd1474; +add.f64 fd167, fd134, fd156; +sub.f64 fd169, fd134, fd156; +add.f64 fd1469, fd1479, fd157; +sub.f64 fd170, fd1479, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd173, 
fd132, fd149; +add.f64 fd1468, fd133, fd148; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +sub.f64 fd177, fd136, fd160; +add.f64 fd1467, fd137, fd162; +sub.f64 fd178, fd137, fd162; +mul.f64 fd1465, fd167, 0d3FED906BCF328D46; +mul.f64 fd1466, fd1469, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd1465, fd1466; +mul.f64 fd182, fd1469, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd1468, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd189, fd1467, 0d3FED906BCF328D46; +mul.f64 fd1464, fd175, 0d3FD87DE2A6AEA963; +sub.f64 fd190, fd1464, fd189; +mul.f64 fd191, fd1467, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +mul.f64 fd1463, fd169, 0dBFD87DE2A6AEA963; +sub.f64 fd195, fd1463, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd1461, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd1462, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd1461, fd1462; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd1459, fd177, 0dBFED906BCF328D46; +mul.f64 fd1460, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd1459, fd1460; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd210, fd110, fd181; +sub.f64 fd212, fd110, fd181; +add.f64 fd1458, fd1488, fd183; +sub.f64 fd213, fd1488, fd183; +add.f64 fd214, fd114, fd186; +sub.f64 fd216, fd114, fd186; +add.f64 fd1457, fd1487, fd187; +sub.f64 fd217, fd1487, fd187; +add.f64 fd218, fd118, fd190; +sub.f64 fd220, fd118, fd190; +add.f64 fd1456, fd1486, fd192; +sub.f64 fd221, fd1486, fd192; +sub.f64 fd222, fd108, fd166; +add.f64 fd224, fd108, fd166; +add.f64 fd1455, fd109, fd165; +sub.f64 fd225, fd109, fd165; +add.f64 fd226, fd112, fd195; +sub.f64 fd228, fd112, fd195; +add.f64 fd1454, fd113, 
fd197; +sub.f64 fd229, fd113, fd197; +add.f64 fd230, fd116, fd200; +sub.f64 fd232, fd116, fd200; +add.f64 fd1453, fd117, fd202; +sub.f64 fd233, fd117, fd202; +add.f64 fd234, fd120, fd205; +sub.f64 fd236, fd120, fd205; +add.f64 fd1452, fd121, fd207; +sub.f64 fd237, fd121, fd207; +mov.u32 r28, %tid.x; +shl.b32 r7, r28, 8; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r28, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd238, fd239}, [rd5]; +mul.f64 fd242, fd1458, fd239; +mul.f64 fd244, fd238, fd1458; +mul.f64 fd246, fd239, fd239; +mul.f64 fd1451, fd238, fd238; +sub.f64 fd247, fd1451, fd246; +mul.f64 fd248, fd239, fd238; +fma.rn.f64 fd249, fd239, fd238, fd248; +mul.f64 fd250, fd1457, fd249; +mul.f64 fd252, fd247, fd1457; +mul.f64 fd1449, fd238, fd247; +mul.f64 fd1450, fd239, fd249; +sub.f64 fd255, fd1449, fd1450; +mul.f64 fd1448, fd214, fd249; +mul.f64 fd256, fd238, fd249; +fma.rn.f64 fd257, fd239, fd247, fd256; +mul.f64 fd258, fd1456, fd257; +mul.f64 fd260, fd255, fd1456; +mul.f64 fd262, fd239, fd257; +mul.f64 fd1447, fd238, fd255; +sub.f64 fd263, fd1447, fd262; +mul.f64 fd1446, fd218, fd257; +mul.f64 fd264, fd238, fd257; +fma.rn.f64 fd265, fd239, fd255, fd264; +mul.f64 fd266, fd1455, fd265; +mul.f64 fd268, fd263, fd1455; +mul.f64 fd270, fd239, fd265; +mul.f64 fd1445, fd238, fd263; +sub.f64 fd271, fd1445, fd270; +mul.f64 fd1444, fd222, fd265; +mul.f64 fd272, fd238, fd265; +fma.rn.f64 fd273, fd239, fd263, fd272; +mul.f64 fd274, fd1454, fd273; +mul.f64 fd276, fd271, fd1454; +mul.f64 fd1442, fd238, fd271; +mul.f64 fd1443, fd239, fd273; +sub.f64 fd279, fd1442, fd1443; +mul.f64 fd1441, fd226, fd273; +mul.f64 fd280, fd238, fd273; +fma.rn.f64 fd281, fd239, fd271, fd280; +mul.f64 fd282, fd1453, fd281; +mul.f64 fd284, fd279, fd1453; +mul.f64 fd286, fd239, fd281; +mul.f64 fd1440, fd238, fd279; +sub.f64 fd287, fd1440, fd286; +mul.f64 fd1439, fd230, fd281; +mul.f64 fd288, fd238, fd281; +fma.rn.f64 
fd289, fd239, fd279, fd288; +mul.f64 fd290, fd1452, fd289; +mul.f64 fd292, fd287, fd1452; +mul.f64 fd294, fd239, fd289; +mul.f64 fd1438, fd238, fd287; +sub.f64 fd295, fd1438, fd294; +mul.f64 fd1437, fd234, fd289; +mul.f64 fd296, fd238, fd289; +fma.rn.f64 fd297, fd239, fd287, fd296; +sub.f64 fd1436, fd1489, fd1470; +mul.f64 fd298, fd1436, fd297; +sub.f64 fd1435, fd106, fd163; +mul.f64 fd299, fd1435, fd297; +mul.f64 fd300, fd295, fd1436; +ld.global.v2.f64 {fd301, fd302}, [rd5+8192]; +mul.f64 fd305, fd213, fd302; +mul.f64 fd307, fd301, fd213; +mul.f64 fd1433, fd238, fd301; +mul.f64 fd1434, fd239, fd302; +sub.f64 fd310, fd1433, fd1434; +mul.f64 fd1432, fd212, fd302; +mul.f64 fd311, fd238, fd302; +fma.rn.f64 fd312, fd239, fd301, fd311; +mul.f64 fd313, fd217, fd312; +mul.f64 fd315, fd310, fd217; +mul.f64 fd317, fd239, fd312; +mul.f64 fd1431, fd238, fd310; +sub.f64 fd318, fd1431, fd317; +mul.f64 fd1430, fd216, fd312; +mul.f64 fd319, fd238, fd312; +fma.rn.f64 fd320, fd239, fd310, fd319; +mul.f64 fd321, fd221, fd320; +mul.f64 fd323, fd318, fd221; +mul.f64 fd1428, fd238, fd318; +mul.f64 fd1429, fd239, fd320; +sub.f64 fd326, fd1428, fd1429; +mul.f64 fd1427, fd220, fd320; +mul.f64 fd327, fd238, fd320; +fma.rn.f64 fd328, fd239, fd318, fd327; +mul.f64 fd329, fd225, fd328; +mul.f64 fd331, fd326, fd225; +mul.f64 fd1425, fd238, fd326; +mul.f64 fd1426, fd239, fd328; +sub.f64 fd334, fd1425, fd1426; +mul.f64 fd1424, fd224, fd328; +mul.f64 fd335, fd238, fd328; +fma.rn.f64 fd336, fd239, fd326, fd335; +mul.f64 fd337, fd229, fd336; +mul.f64 fd339, fd334, fd229; +mul.f64 fd341, fd239, fd336; +mul.f64 fd1423, fd238, fd334; +sub.f64 fd342, fd1423, fd341; +mul.f64 fd1422, fd228, fd336; +mul.f64 fd343, fd238, fd336; +fma.rn.f64 fd344, fd239, fd334, fd343; +mul.f64 fd345, fd233, fd344; +mul.f64 fd347, fd342, fd233; +mul.f64 fd1420, fd238, fd342; +mul.f64 fd1421, fd239, fd344; +sub.f64 fd350, fd1420, fd1421; +mul.f64 fd1419, fd232, fd344; +mul.f64 fd351, fd238, fd344; +mul.f64 fd1418, fd210, 
fd239; +fma.rn.f64 fd352, fd239, fd342, fd351; +mul.f64 fd353, fd237, fd352; +mul.f64 fd354, fd236, fd352; +mul.f64 fd355, fd350, fd237; +barrier.sync 0; +and.b32 r11, r7, 130816; +add.s32 r12, r9, r11; +mov.u32 r55, %tid.x; +add.f64 fd356, fd1489, fd1470; +shl.b32 r54, r55, 8; +and.b32 r36, r54, 130816; +add.s32 r35, r9, r36; +sub.f64 fd1511, fd106, fd163; +add.f64 fd357, fd106, fd163; +shl.b32 r53, r55, 8; +and.b32 r47, r53, 130816; +add.s32 r46, r9, r47; +st.shared.v2.f64 [r46], {fd357, fd356}; +shl.b32 r52, r55, 8; +shl.b32 r45, r55, 4; +fma.rn.f64 fd358, fd238, fd210, fd242; +sub.f64 fd359, fd244, fd1418; +st.shared.v2.f64 [r46+16], {fd358, fd359}; +fma.rn.f64 fd360, fd247, fd214, fd250; +sub.f64 fd361, fd252, fd1448; +st.shared.v2.f64 [r46+32], {fd360, fd361}; +fma.rn.f64 fd362, fd255, fd218, fd258; +sub.f64 fd363, fd260, fd1446; +st.shared.v2.f64 [r46+48], {fd362, fd363}; +sub.f64 fd364, fd268, fd1444; +fma.rn.f64 fd365, fd263, fd222, fd266; +st.shared.v2.f64 [r46+64], {fd365, fd364}; +fma.rn.f64 fd366, fd271, fd226, fd274; +sub.f64 fd367, fd276, fd1441; +st.shared.v2.f64 [r46+80], {fd366, fd367}; +fma.rn.f64 fd368, fd279, fd230, fd282; +sub.f64 fd369, fd284, fd1439; +st.shared.v2.f64 [r46+96], {fd368, fd369}; +fma.rn.f64 fd370, fd287, fd234, fd290; +sub.f64 fd371, fd292, fd1437; +st.shared.v2.f64 [r46+112], {fd370, fd371}; +fma.rn.f64 fd372, fd295, fd1511, fd298; +sub.f64 fd373, fd300, fd299; +st.shared.v2.f64 [r46+128], {fd372, fd373}; +fma.rn.f64 fd374, fd301, fd212, fd305; +sub.f64 fd375, fd307, fd1432; +st.shared.v2.f64 [r46+144], {fd374, fd375}; +fma.rn.f64 fd376, fd310, fd216, fd313; +sub.f64 fd377, fd315, fd1430; +st.shared.v2.f64 [r46+160], {fd376, fd377}; +fma.rn.f64 fd378, fd318, fd220, fd321; +sub.f64 fd379, fd323, fd1427; +st.shared.v2.f64 [r46+176], {fd378, fd379}; +sub.f64 fd380, fd331, fd1424; +fma.rn.f64 fd381, fd326, fd224, fd329; +st.shared.v2.f64 [r46+192], {fd381, fd380}; +fma.rn.f64 fd382, fd334, fd228, fd337; +sub.f64 fd383, fd339, 
fd1422; +st.shared.v2.f64 [r46+208], {fd382, fd383}; +fma.rn.f64 fd384, fd342, fd232, fd345; +sub.f64 fd385, fd347, fd1419; +st.shared.v2.f64 [r46+224], {fd384, fd385}; +fma.rn.f64 fd386, fd350, fd236, fd353; +sub.f64 fd387, fd355, fd354; +st.shared.v2.f64 [r46+240], {fd386, fd387}; +barrier.sync 0; +and.b32 r27, r55, 511; +mad.lo.s32 r13, r27, -240, r46; +ld.shared.v2.f64 {fd388, fd389}, [r13]; +ld.shared.v2.f64 {fd392, fd393}, [r13+8192]; +ld.shared.v2.f64 {fd396, fd397}, [r13+16384]; +ld.shared.v2.f64 {fd400, fd401}, [r13+24576]; +ld.shared.v2.f64 {fd404, fd405}, [r13+32768]; +ld.shared.v2.f64 {fd408, fd409}, [r13+40960]; +ld.shared.v2.f64 {fd412, fd413}, [r13+49152]; +ld.shared.v2.f64 {fd416, fd417}, [r13+57344]; +ld.shared.v2.f64 {fd420, fd421}, [r13+65536]; +ld.shared.v2.f64 {fd424, fd425}, [r13+73728]; +ld.shared.v2.f64 {fd428, fd429}, [r13+81920]; +ld.shared.v2.f64 {fd432, fd433}, [r13+90112]; +ld.shared.v2.f64 {fd436, fd437}, [r13+98304]; +ld.shared.v2.f64 {fd440, fd441}, [r13+106496]; +ld.shared.v2.f64 {fd444, fd445}, [r13+114688]; +ld.shared.v2.f64 {fd448, fd449}, [r13+122880]; +add.f64 fd452, fd388, fd420; +sub.f64 fd454, fd388, fd420; +add.f64 fd1417, fd389, fd421; +sub.f64 fd455, fd389, fd421; +add.f64 fd456, fd404, fd436; +sub.f64 fd458, fd404, fd436; +add.f64 fd1416, fd405, fd437; +sub.f64 fd459, fd405, fd437; +add.f64 fd460, fd452, fd456; +sub.f64 fd462, fd452, fd456; +add.f64 fd1415, fd1417, fd1416; +sub.f64 fd463, fd1417, fd1416; +sub.f64 fd464, fd454, fd459; +add.f64 fd466, fd454, fd459; +add.f64 fd1414, fd455, fd458; +sub.f64 fd467, fd455, fd458; +add.f64 fd468, fd396, fd428; +sub.f64 fd470, fd396, fd428; +add.f64 fd1413, fd397, fd429; +sub.f64 fd471, fd397, fd429; +add.f64 fd472, fd412, fd444; +sub.f64 fd474, fd412, fd444; +add.f64 fd1412, fd413, fd445; +sub.f64 fd475, fd413, fd445; +add.f64 fd476, fd468, fd472; +sub.f64 fd478, fd468, fd472; +add.f64 fd1411, fd1413, fd1412; +sub.f64 fd479, fd1413, fd1412; +sub.f64 fd480, fd470, fd475; +add.f64 
fd482, fd470, fd475; +add.f64 fd1410, fd471, fd474; +sub.f64 fd483, fd471, fd474; +mul.f64 fd484, fd480, 0d3FE6A09E667F3BCD; +mul.f64 fd485, fd1410, 0d3FE6A09E667F3BCD; +sub.f64 fd486, fd484, fd485; +add.f64 fd487, fd484, fd485; +mul.f64 fd489, fd483, 0d3FE6A09E667F3BCD; +mul.f64 fd1409, fd482, 0dBFE6A09E667F3BCD; +sub.f64 fd490, fd1409, fd489; +mul.f64 fd491, fd483, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd492, fd482, 0d3FE6A09E667F3BCD, fd491; +add.f64 fd493, fd460, fd476; +sub.f64 fd495, fd460, fd476; +add.f64 fd1408, fd1415, fd1411; +sub.f64 fd496, fd1415, fd1411; +add.f64 fd497, fd464, fd486; +sub.f64 fd499, fd464, fd486; +add.f64 fd1407, fd1414, fd487; +sub.f64 fd500, fd1414, fd487; +sub.f64 fd501, fd462, fd479; +add.f64 fd503, fd462, fd479; +add.f64 fd1406, fd463, fd478; +sub.f64 fd504, fd463, fd478; +add.f64 fd505, fd466, fd490; +sub.f64 fd507, fd466, fd490; +add.f64 fd1405, fd467, fd492; +sub.f64 fd508, fd467, fd492; +add.f64 fd509, fd392, fd424; +sub.f64 fd511, fd392, fd424; +add.f64 fd1404, fd393, fd425; +sub.f64 fd512, fd393, fd425; +add.f64 fd513, fd408, fd440; +sub.f64 fd515, fd408, fd440; +add.f64 fd1403, fd409, fd441; +sub.f64 fd516, fd409, fd441; +add.f64 fd517, fd509, fd513; +sub.f64 fd519, fd509, fd513; +add.f64 fd1402, fd1404, fd1403; +sub.f64 fd520, fd1404, fd1403; +sub.f64 fd521, fd511, fd516; +add.f64 fd523, fd511, fd516; +add.f64 fd1401, fd512, fd515; +sub.f64 fd524, fd512, fd515; +add.f64 fd525, fd400, fd432; +sub.f64 fd527, fd400, fd432; +add.f64 fd1400, fd401, fd433; +sub.f64 fd528, fd401, fd433; +add.f64 fd529, fd416, fd448; +sub.f64 fd531, fd416, fd448; +add.f64 fd1399, fd417, fd449; +sub.f64 fd532, fd417, fd449; +add.f64 fd533, fd525, fd529; +sub.f64 fd535, fd525, fd529; +add.f64 fd1398, fd1400, fd1399; +sub.f64 fd536, fd1400, fd1399; +sub.f64 fd537, fd527, fd532; +add.f64 fd539, fd527, fd532; +add.f64 fd1397, fd528, fd531; +sub.f64 fd540, fd528, fd531; +mul.f64 fd541, fd537, 0d3FE6A09E667F3BCD; +mul.f64 fd542, fd1397, 0d3FE6A09E667F3BCD; 
+sub.f64 fd543, fd541, fd542; +add.f64 fd544, fd541, fd542; +mul.f64 fd546, fd540, 0d3FE6A09E667F3BCD; +mul.f64 fd1396, fd539, 0dBFE6A09E667F3BCD; +sub.f64 fd547, fd1396, fd546; +mul.f64 fd548, fd540, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd549, fd539, 0d3FE6A09E667F3BCD, fd548; +add.f64 fd550, fd517, fd533; +sub.f64 fd552, fd517, fd533; +add.f64 fd1395, fd1402, fd1398; +sub.f64 fd553, fd1402, fd1398; +add.f64 fd554, fd521, fd543; +sub.f64 fd556, fd521, fd543; +add.f64 fd1394, fd1401, fd544; +sub.f64 fd557, fd1401, fd544; +sub.f64 fd558, fd519, fd536; +add.f64 fd560, fd519, fd536; +add.f64 fd1393, fd520, fd535; +sub.f64 fd561, fd520, fd535; +add.f64 fd562, fd523, fd547; +sub.f64 fd564, fd523, fd547; +add.f64 fd1392, fd524, fd549; +sub.f64 fd565, fd524, fd549; +mul.f64 fd1390, fd554, 0d3FED906BCF328D46; +mul.f64 fd1391, fd1394, 0d3FD87DE2A6AEA963; +sub.f64 fd568, fd1390, fd1391; +mul.f64 fd569, fd1394, 0d3FED906BCF328D46; +fma.rn.f64 fd570, fd554, 0d3FD87DE2A6AEA963, fd569; +mul.f64 fd571, fd558, 0d3FE6A09E667F3BCD; +mul.f64 fd572, fd1393, 0d3FE6A09E667F3BCD; +sub.f64 fd573, fd571, fd572; +add.f64 fd574, fd571, fd572; +mul.f64 fd1388, fd562, 0d3FD87DE2A6AEA963; +mul.f64 fd1389, fd1392, 0d3FED906BCF328D46; +sub.f64 fd577, fd1388, fd1389; +mul.f64 fd578, fd1392, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd579, fd562, 0d3FED906BCF328D46, fd578; +mul.f64 fd1386, fd556, 0dBFD87DE2A6AEA963; +mul.f64 fd1387, fd557, 0d3FED906BCF328D46; +sub.f64 fd582, fd1386, fd1387; +mul.f64 fd583, fd557, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd584, fd556, 0d3FED906BCF328D46, fd583; +mul.f64 fd1384, fd560, 0dBFE6A09E667F3BCD; +mul.f64 fd1385, fd561, 0d3FE6A09E667F3BCD; +sub.f64 fd587, fd1384, fd1385; +mul.f64 fd588, fd561, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd589, fd560, 0d3FE6A09E667F3BCD, fd588; +mul.f64 fd591, fd565, 0d3FD87DE2A6AEA963; +mul.f64 fd1383, fd564, 0dBFED906BCF328D46; +sub.f64 fd592, fd1383, fd591; +mul.f64 fd593, fd565, 0dBFED906BCF328D46; +fma.rn.f64 fd594, fd564, 0d3FD87DE2A6AEA963, fd593; 
+add.f64 fd597, fd497, fd568; +sub.f64 fd599, fd497, fd568; +add.f64 fd1382, fd1407, fd570; +sub.f64 fd600, fd1407, fd570; +add.f64 fd601, fd501, fd573; +sub.f64 fd603, fd501, fd573; +add.f64 fd1381, fd1406, fd574; +sub.f64 fd604, fd1406, fd574; +add.f64 fd605, fd505, fd577; +sub.f64 fd607, fd505, fd577; +add.f64 fd1380, fd1405, fd579; +sub.f64 fd608, fd1405, fd579; +sub.f64 fd609, fd495, fd553; +add.f64 fd611, fd495, fd553; +add.f64 fd1379, fd496, fd552; +sub.f64 fd612, fd496, fd552; +add.f64 fd613, fd499, fd582; +sub.f64 fd615, fd499, fd582; +add.f64 fd1378, fd500, fd584; +sub.f64 fd616, fd500, fd584; +add.f64 fd617, fd503, fd587; +sub.f64 fd619, fd503, fd587; +add.f64 fd1377, fd504, fd589; +sub.f64 fd620, fd504, fd589; +add.f64 fd621, fd507, fd592; +sub.f64 fd623, fd507, fd592; +add.f64 fd1376, fd508, fd594; +sub.f64 fd624, fd508, fd594; +and.b32 r14, r55, 496; +mov.u64 rd7, %34; +cvt.u64.u32 rd12, r14; +add.s64 rd8, rd7, rd12; +ld.global.v2.f64 {fd625, fd626}, [rd8]; +mul.f64 fd629, fd1382, fd626; +mul.f64 fd631, fd625, fd1382; +mul.f64 fd633, fd626, fd626; +mul.f64 fd1375, fd625, fd625; +sub.f64 fd634, fd1375, fd633; +mul.f64 fd635, fd626, fd625; +fma.rn.f64 fd636, fd626, fd625, fd635; +mul.f64 fd637, fd1381, fd636; +mul.f64 fd639, fd634, fd1381; +mul.f64 fd1373, fd625, fd634; +mul.f64 fd1374, fd626, fd636; +sub.f64 fd642, fd1373, fd1374; +mul.f64 fd1372, fd601, fd636; +mul.f64 fd643, fd625, fd636; +fma.rn.f64 fd644, fd626, fd634, fd643; +mul.f64 fd645, fd1380, fd644; +mul.f64 fd647, fd642, fd1380; +mul.f64 fd649, fd626, fd644; +mul.f64 fd1371, fd625, fd642; +sub.f64 fd650, fd1371, fd649; +mul.f64 fd1370, fd605, fd644; +mul.f64 fd651, fd625, fd644; +fma.rn.f64 fd652, fd626, fd642, fd651; +mul.f64 fd653, fd1379, fd652; +mul.f64 fd655, fd650, fd1379; +mul.f64 fd1368, fd625, fd650; +mul.f64 fd1369, fd626, fd652; +sub.f64 fd658, fd1368, fd1369; +mul.f64 fd1367, fd609, fd652; +mul.f64 fd659, fd625, fd652; +fma.rn.f64 fd660, fd626, fd650, fd659; +mul.f64 fd661, 
fd1378, fd660; +mul.f64 fd663, fd658, fd1378; +mul.f64 fd1365, fd625, fd658; +mul.f64 fd1366, fd626, fd660; +sub.f64 fd666, fd1365, fd1366; +mul.f64 fd1364, fd613, fd660; +mul.f64 fd667, fd625, fd660; +fma.rn.f64 fd668, fd626, fd658, fd667; +mul.f64 fd669, fd1377, fd668; +mul.f64 fd671, fd666, fd1377; +mul.f64 fd673, fd626, fd668; +mul.f64 fd1363, fd625, fd666; +sub.f64 fd674, fd1363, fd673; +mul.f64 fd1362, fd617, fd668; +mul.f64 fd675, fd625, fd668; +fma.rn.f64 fd676, fd626, fd666, fd675; +mul.f64 fd677, fd1376, fd676; +mul.f64 fd679, fd674, fd1376; +mul.f64 fd1360, fd625, fd674; +mul.f64 fd1361, fd626, fd676; +sub.f64 fd682, fd1360, fd1361; +mul.f64 fd1359, fd621, fd676; +mul.f64 fd683, fd625, fd676; +fma.rn.f64 fd684, fd626, fd674, fd683; +sub.f64 fd1358, fd1408, fd1395; +mul.f64 fd685, fd1358, fd684; +sub.f64 fd1357, fd493, fd550; +mul.f64 fd686, fd1357, fd684; +mul.f64 fd687, fd682, fd1358; +ld.global.v2.f64 {fd688, fd689}, [rd8+512]; +mul.f64 fd692, fd600, fd689; +mul.f64 fd694, fd688, fd600; +mul.f64 fd696, fd626, fd689; +mul.f64 fd1356, fd625, fd688; +sub.f64 fd697, fd1356, fd696; +mul.f64 fd1355, fd599, fd689; +mul.f64 fd698, fd625, fd689; +fma.rn.f64 fd699, fd626, fd688, fd698; +mul.f64 fd700, fd604, fd699; +mul.f64 fd702, fd697, fd604; +mul.f64 fd704, fd626, fd699; +mul.f64 fd1354, fd625, fd697; +sub.f64 fd705, fd1354, fd704; +mul.f64 fd1353, fd603, fd699; +mul.f64 fd706, fd625, fd699; +fma.rn.f64 fd707, fd626, fd697, fd706; +mul.f64 fd708, fd608, fd707; +mul.f64 fd710, fd705, fd608; +mul.f64 fd1351, fd625, fd705; +mul.f64 fd1352, fd626, fd707; +sub.f64 fd713, fd1351, fd1352; +mul.f64 fd1350, fd607, fd707; +mul.f64 fd714, fd625, fd707; +fma.rn.f64 fd715, fd626, fd705, fd714; +mul.f64 fd716, fd612, fd715; +mul.f64 fd718, fd713, fd612; +mul.f64 fd720, fd626, fd715; +mul.f64 fd1349, fd625, fd713; +sub.f64 fd721, fd1349, fd720; +mul.f64 fd1348, fd611, fd715; +mul.f64 fd722, fd625, fd715; +fma.rn.f64 fd723, fd626, fd713, fd722; +mul.f64 fd724, fd616, fd723; 
+mul.f64 fd726, fd721, fd616; +mul.f64 fd728, fd626, fd723; +mul.f64 fd1347, fd625, fd721; +sub.f64 fd729, fd1347, fd728; +mul.f64 fd1346, fd615, fd723; +mul.f64 fd730, fd625, fd723; +fma.rn.f64 fd731, fd626, fd721, fd730; +mul.f64 fd732, fd620, fd731; +mul.f64 fd734, fd729, fd620; +mul.f64 fd1344, fd625, fd729; +mul.f64 fd1345, fd626, fd731; +sub.f64 fd737, fd1344, fd1345; +mul.f64 fd1343, fd619, fd731; +mul.f64 fd738, fd625, fd731; +mul.f64 fd1342, fd597, fd626; +fma.rn.f64 fd739, fd626, fd729, fd738; +mul.f64 fd740, fd624, fd739; +mul.f64 fd741, fd623, fd739; +mul.f64 fd742, fd737, fd624; +and.b32 r15, r45, 240; +add.s32 r16, r9, r15; +barrier.sync 0; +and.b32 r17, r52, 126976; +add.s32 r18, r16, r17; +sub.f64 fd1506, fd1408, fd1395; +mul.f64 fd1505, fd682, fd1506; +add.f64 fd743, fd1408, fd1395; +sub.f64 fd1509, fd493, fd550; +add.f64 fd744, fd493, fd550; +st.shared.v2.f64 [r18], {fd744, fd743}; +mov.u32 r62, %tid.x; +fma.rn.f64 fd745, fd625, fd597, fd629; +shl.b32 r50, r62, 8; +shl.b32 r49, r62, 4; +and.b32 r48, r62, 496; +sub.f64 fd746, fd631, fd1342; +st.shared.v2.f64 [r18+256], {fd745, fd746}; +fma.rn.f64 fd747, fd634, fd601, fd637; +sub.f64 fd748, fd639, fd1372; +st.shared.v2.f64 [r18+512], {fd747, fd748}; +fma.rn.f64 fd749, fd642, fd605, fd645; +sub.f64 fd750, fd647, fd1370; +st.shared.v2.f64 [r18+768], {fd749, fd750}; +fma.rn.f64 fd751, fd650, fd609, fd653; +sub.f64 fd752, fd655, fd1367; +st.shared.v2.f64 [r18+1024], {fd751, fd752}; +sub.f64 fd753, fd663, fd1364; +fma.rn.f64 fd754, fd658, fd613, fd661; +st.shared.v2.f64 [r18+1280], {fd754, fd753}; +fma.rn.f64 fd755, fd666, fd617, fd669; +sub.f64 fd756, fd671, fd1362; +st.shared.v2.f64 [r18+1536], {fd755, fd756}; +fma.rn.f64 fd757, fd674, fd621, fd677; +sub.f64 fd758, fd679, fd1359; +st.shared.v2.f64 [r18+1792], {fd757, fd758}; +fma.rn.f64 fd759, fd682, fd1509, fd685; +sub.f64 fd760, fd1505, fd686; +st.shared.v2.f64 [r18+2048], {fd759, fd760}; +fma.rn.f64 fd761, fd688, fd599, fd692; +sub.f64 fd762, fd694, 
fd1355; +st.shared.v2.f64 [r18+2304], {fd761, fd762}; +fma.rn.f64 fd763, fd697, fd603, fd700; +sub.f64 fd764, fd702, fd1353; +st.shared.v2.f64 [r18+2560], {fd763, fd764}; +fma.rn.f64 fd765, fd705, fd607, fd708; +sub.f64 fd766, fd710, fd1350; +st.shared.v2.f64 [r18+2816], {fd765, fd766}; +fma.rn.f64 fd767, fd713, fd611, fd716; +sub.f64 fd768, fd718, fd1348; +st.shared.v2.f64 [r18+3072], {fd767, fd768}; +sub.f64 fd769, fd726, fd1346; +fma.rn.f64 fd770, fd721, fd615, fd724; +st.shared.v2.f64 [r18+3328], {fd770, fd769}; +fma.rn.f64 fd771, fd729, fd619, fd732; +sub.f64 fd772, fd734, fd1343; +st.shared.v2.f64 [r18+3584], {fd771, fd772}; +fma.rn.f64 fd773, fd737, fd623, fd740; +sub.f64 fd774, fd742, fd741; +st.shared.v2.f64 [r18+3840], {fd773, fd774}; +barrier.sync 0; +mad.lo.s32 r19, r48, -240, r18; +ld.shared.v2.f64 {fd775, fd776}, [r19]; +ld.shared.v2.f64 {fd779, fd780}, [r19+8192]; +ld.shared.v2.f64 {fd783, fd784}, [r19+16384]; +ld.shared.v2.f64 {fd787, fd788}, [r19+24576]; +ld.shared.v2.f64 {fd791, fd792}, [r19+32768]; +ld.shared.v2.f64 {fd795, fd796}, [r19+40960]; +ld.shared.v2.f64 {fd799, fd800}, [r19+49152]; +ld.shared.v2.f64 {fd803, fd804}, [r19+57344]; +ld.shared.v2.f64 {fd807, fd808}, [r19+65536]; +ld.shared.v2.f64 {fd811, fd812}, [r19+73728]; +ld.shared.v2.f64 {fd815, fd816}, [r19+81920]; +ld.shared.v2.f64 {fd819, fd820}, [r19+90112]; +ld.shared.v2.f64 {fd823, fd824}, [r19+98304]; +ld.shared.v2.f64 {fd827, fd828}, [r19+106496]; +ld.shared.v2.f64 {fd831, fd832}, [r19+114688]; +ld.shared.v2.f64 {fd835, fd836}, [r19+122880]; +add.f64 fd839, fd775, fd807; +sub.f64 fd841, fd775, fd807; +add.f64 fd1341, fd776, fd808; +sub.f64 fd842, fd776, fd808; +add.f64 fd843, fd791, fd823; +sub.f64 fd845, fd791, fd823; +add.f64 fd1340, fd792, fd824; +sub.f64 fd846, fd792, fd824; +add.f64 fd847, fd839, fd843; +sub.f64 fd849, fd839, fd843; +add.f64 fd1339, fd1341, fd1340; +sub.f64 fd850, fd1341, fd1340; +sub.f64 fd851, fd841, fd846; +add.f64 fd853, fd841, fd846; +add.f64 fd1338, 
fd842, fd845; +sub.f64 fd854, fd842, fd845; +add.f64 fd855, fd783, fd815; +sub.f64 fd857, fd783, fd815; +add.f64 fd1337, fd784, fd816; +sub.f64 fd858, fd784, fd816; +add.f64 fd859, fd799, fd831; +sub.f64 fd861, fd799, fd831; +add.f64 fd1336, fd800, fd832; +sub.f64 fd862, fd800, fd832; +add.f64 fd863, fd855, fd859; +sub.f64 fd865, fd855, fd859; +add.f64 fd1335, fd1337, fd1336; +sub.f64 fd866, fd1337, fd1336; +sub.f64 fd867, fd857, fd862; +add.f64 fd869, fd857, fd862; +add.f64 fd1334, fd858, fd861; +sub.f64 fd870, fd858, fd861; +mul.f64 fd871, fd867, 0d3FE6A09E667F3BCD; +mul.f64 fd872, fd1334, 0d3FE6A09E667F3BCD; +sub.f64 fd873, fd871, fd872; +add.f64 fd874, fd871, fd872; +mul.f64 fd876, fd870, 0d3FE6A09E667F3BCD; +mul.f64 fd1333, fd869, 0dBFE6A09E667F3BCD; +sub.f64 fd877, fd1333, fd876; +mul.f64 fd878, fd870, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd879, fd869, 0d3FE6A09E667F3BCD, fd878; +add.f64 fd880, fd847, fd863; +sub.f64 fd882, fd847, fd863; +add.f64 fd1332, fd1339, fd1335; +sub.f64 fd883, fd1339, fd1335; +add.f64 fd884, fd851, fd873; +sub.f64 fd886, fd851, fd873; +add.f64 fd1331, fd1338, fd874; +sub.f64 fd887, fd1338, fd874; +sub.f64 fd888, fd849, fd866; +add.f64 fd890, fd849, fd866; +add.f64 fd1330, fd850, fd865; +sub.f64 fd891, fd850, fd865; +add.f64 fd892, fd853, fd877; +sub.f64 fd894, fd853, fd877; +add.f64 fd1329, fd854, fd879; +sub.f64 fd895, fd854, fd879; +add.f64 fd896, fd779, fd811; +sub.f64 fd898, fd779, fd811; +add.f64 fd1328, fd780, fd812; +sub.f64 fd899, fd780, fd812; +add.f64 fd900, fd795, fd827; +sub.f64 fd902, fd795, fd827; +add.f64 fd1327, fd796, fd828; +sub.f64 fd903, fd796, fd828; +add.f64 fd904, fd896, fd900; +sub.f64 fd906, fd896, fd900; +add.f64 fd1326, fd1328, fd1327; +sub.f64 fd907, fd1328, fd1327; +sub.f64 fd908, fd898, fd903; +add.f64 fd910, fd898, fd903; +add.f64 fd1325, fd899, fd902; +sub.f64 fd911, fd899, fd902; +add.f64 fd912, fd787, fd819; +sub.f64 fd914, fd787, fd819; +add.f64 fd1324, fd788, fd820; +sub.f64 fd915, fd788, fd820; 
+add.f64 fd916, fd803, fd835; +sub.f64 fd918, fd803, fd835; +add.f64 fd1323, fd804, fd836; +sub.f64 fd919, fd804, fd836; +add.f64 fd920, fd912, fd916; +sub.f64 fd922, fd912, fd916; +add.f64 fd1322, fd1324, fd1323; +sub.f64 fd923, fd1324, fd1323; +sub.f64 fd924, fd914, fd919; +add.f64 fd926, fd914, fd919; +add.f64 fd1321, fd915, fd918; +sub.f64 fd927, fd915, fd918; +mul.f64 fd928, fd924, 0d3FE6A09E667F3BCD; +mul.f64 fd929, fd1321, 0d3FE6A09E667F3BCD; +sub.f64 fd930, fd928, fd929; +add.f64 fd931, fd928, fd929; +mul.f64 fd933, fd927, 0d3FE6A09E667F3BCD; +mul.f64 fd1320, fd926, 0dBFE6A09E667F3BCD; +sub.f64 fd934, fd1320, fd933; +mul.f64 fd935, fd927, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd936, fd926, 0d3FE6A09E667F3BCD, fd935; +add.f64 fd937, fd904, fd920; +sub.f64 fd939, fd904, fd920; +add.f64 fd1319, fd1326, fd1322; +sub.f64 fd940, fd1326, fd1322; +add.f64 fd941, fd908, fd930; +sub.f64 fd943, fd908, fd930; +add.f64 fd1318, fd1325, fd931; +sub.f64 fd944, fd1325, fd931; +sub.f64 fd945, fd906, fd923; +add.f64 fd947, fd906, fd923; +add.f64 fd1317, fd907, fd922; +sub.f64 fd948, fd907, fd922; +add.f64 fd949, fd910, fd934; +sub.f64 fd951, fd910, fd934; +add.f64 fd1316, fd911, fd936; +sub.f64 fd952, fd911, fd936; +mul.f64 fd954, fd1318, 0d3FD87DE2A6AEA963; +mul.f64 fd1315, fd941, 0d3FED906BCF328D46; +sub.f64 fd955, fd1315, fd954; +mul.f64 fd956, fd1318, 0d3FED906BCF328D46; +fma.rn.f64 fd957, fd941, 0d3FD87DE2A6AEA963, fd956; +mul.f64 fd958, fd945, 0d3FE6A09E667F3BCD; +mul.f64 fd959, fd1317, 0d3FE6A09E667F3BCD; +sub.f64 fd960, fd958, fd959; +add.f64 fd961, fd958, fd959; +mul.f64 fd1313, fd949, 0d3FD87DE2A6AEA963; +mul.f64 fd1314, fd1316, 0d3FED906BCF328D46; +sub.f64 fd964, fd1313, fd1314; +mul.f64 fd965, fd1316, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd966, fd949, 0d3FED906BCF328D46, fd965; +mul.f64 fd1311, fd943, 0dBFD87DE2A6AEA963; +mul.f64 fd1312, fd944, 0d3FED906BCF328D46; +sub.f64 fd969, fd1311, fd1312; +mul.f64 fd970, fd944, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd971, fd943, 
0d3FED906BCF328D46, fd970; +mul.f64 fd973, fd948, 0d3FE6A09E667F3BCD; +mul.f64 fd1310, fd947, 0dBFE6A09E667F3BCD; +sub.f64 fd974, fd1310, fd973; +mul.f64 fd975, fd948, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd976, fd947, 0d3FE6A09E667F3BCD, fd975; +mul.f64 fd978, fd952, 0d3FD87DE2A6AEA963; +mul.f64 fd1309, fd951, 0dBFED906BCF328D46; +sub.f64 fd979, fd1309, fd978; +mul.f64 fd980, fd952, 0dBFED906BCF328D46; +fma.rn.f64 fd981, fd951, 0d3FD87DE2A6AEA963, fd980; +add.f64 fd984, fd884, fd955; +sub.f64 fd986, fd884, fd955; +add.f64 fd1308, fd1331, fd957; +sub.f64 fd987, fd1331, fd957; +add.f64 fd988, fd888, fd960; +sub.f64 fd990, fd888, fd960; +add.f64 fd1307, fd1330, fd961; +sub.f64 fd991, fd1330, fd961; +add.f64 fd992, fd892, fd964; +sub.f64 fd994, fd892, fd964; +add.f64 fd1306, fd1329, fd966; +sub.f64 fd995, fd1329, fd966; +sub.f64 fd996, fd882, fd940; +add.f64 fd998, fd882, fd940; +add.f64 fd1305, fd883, fd939; +sub.f64 fd999, fd883, fd939; +add.f64 fd1000, fd886, fd969; +sub.f64 fd1002, fd886, fd969; +add.f64 fd1304, fd887, fd971; +sub.f64 fd1003, fd887, fd971; +add.f64 fd1004, fd890, fd974; +sub.f64 fd1006, fd890, fd974; +add.f64 fd1303, fd891, fd976; +sub.f64 fd1007, fd891, fd976; +add.f64 fd1008, fd894, fd979; +sub.f64 fd1010, fd894, fd979; +add.f64 fd1302, fd895, fd981; +sub.f64 fd1011, fd895, fd981; +and.b32 r20, r62, 256; +bfe.u32 r21, r62, 8, 1; +mul.wide.u32 rd9, r21, 16; +mov.u64 rd10, %35; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd1012, fd1013}, [rd11]; +mul.f64 fd1016, fd1308, fd1013; +mul.f64 fd1018, fd1012, fd1308; +mul.f64 fd1300, fd1012, fd1012; +mul.f64 fd1301, fd1013, fd1013; +sub.f64 fd1021, fd1300, fd1301; +mul.f64 fd1022, fd1013, fd1012; +fma.rn.f64 fd1023, fd1013, fd1012, fd1022; +mul.f64 fd1024, fd1307, fd1023; +mul.f64 fd1026, fd1021, fd1307; +mul.f64 fd1028, fd1013, fd1023; +mul.f64 fd1299, fd1012, fd1021; +sub.f64 fd1029, fd1299, fd1028; +mul.f64 fd1298, fd988, fd1023; +mul.f64 fd1030, fd1012, fd1023; +fma.rn.f64 fd1031, fd1013, fd1021, fd1030; 
+mul.f64 fd1032, fd1306, fd1031; +mul.f64 fd1034, fd1029, fd1306; +mul.f64 fd1296, fd1012, fd1029; +mul.f64 fd1297, fd1013, fd1031; +sub.f64 fd1037, fd1296, fd1297; +mul.f64 fd1295, fd992, fd1031; +mul.f64 fd1038, fd1012, fd1031; +fma.rn.f64 fd1039, fd1013, fd1029, fd1038; +mul.f64 fd1040, fd1305, fd1039; +mul.f64 fd1042, fd1037, fd1305; +mul.f64 fd1044, fd1013, fd1039; +mul.f64 fd1294, fd1012, fd1037; +sub.f64 fd1045, fd1294, fd1044; +mul.f64 fd1293, fd996, fd1039; +mul.f64 fd1046, fd1012, fd1039; +fma.rn.f64 fd1047, fd1013, fd1037, fd1046; +mul.f64 fd1048, fd1304, fd1047; +mul.f64 fd1050, fd1045, fd1304; +mul.f64 fd1052, fd1013, fd1047; +mul.f64 fd1292, fd1012, fd1045; +sub.f64 fd1053, fd1292, fd1052; +mul.f64 fd1291, fd1000, fd1047; +mul.f64 fd1054, fd1012, fd1047; +fma.rn.f64 fd1055, fd1013, fd1045, fd1054; +mul.f64 fd1056, fd1303, fd1055; +mul.f64 fd1058, fd1053, fd1303; +mul.f64 fd1289, fd1012, fd1053; +mul.f64 fd1290, fd1013, fd1055; +sub.f64 fd1061, fd1289, fd1290; +mul.f64 fd1288, fd1004, fd1055; +mul.f64 fd1062, fd1012, fd1055; +fma.rn.f64 fd1063, fd1013, fd1053, fd1062; +mul.f64 fd1064, fd1302, fd1063; +mul.f64 fd1066, fd1061, fd1302; +mul.f64 fd1068, fd1013, fd1063; +mul.f64 fd1287, fd1012, fd1061; +sub.f64 fd1069, fd1287, fd1068; +mul.f64 fd1286, fd1008, fd1063; +mul.f64 fd1070, fd1012, fd1063; +fma.rn.f64 fd1071, fd1013, fd1061, fd1070; +sub.f64 fd1285, fd1332, fd1319; +mul.f64 fd1072, fd1285, fd1071; +sub.f64 fd1284, fd880, fd937; +mul.f64 fd1073, fd1284, fd1071; +mul.f64 fd1074, fd1069, fd1285; +ld.global.v2.f64 {fd1075, fd1076}, [rd11+32]; +mul.f64 fd1079, fd987, fd1076; +mul.f64 fd1081, fd1075, fd987; +mul.f64 fd1083, fd1013, fd1076; +mul.f64 fd1283, fd1012, fd1075; +sub.f64 fd1084, fd1283, fd1083; +mul.f64 fd1282, fd986, fd1076; +mul.f64 fd1085, fd1012, fd1076; +fma.rn.f64 fd1086, fd1013, fd1075, fd1085; +mul.f64 fd1087, fd991, fd1086; +mul.f64 fd1089, fd1084, fd991; +mul.f64 fd1280, fd1012, fd1084; +mul.f64 fd1281, fd1013, fd1086; +sub.f64 
fd1092, fd1280, fd1281; +mul.f64 fd1279, fd990, fd1086; +mul.f64 fd1093, fd1012, fd1086; +fma.rn.f64 fd1094, fd1013, fd1084, fd1093; +mul.f64 fd1095, fd995, fd1094; +mul.f64 fd1097, fd1092, fd995; +mul.f64 fd1099, fd1013, fd1094; +mul.f64 fd1278, fd1012, fd1092; +sub.f64 fd1100, fd1278, fd1099; +mul.f64 fd1277, fd994, fd1094; +mul.f64 fd1101, fd1012, fd1094; +fma.rn.f64 fd1102, fd1013, fd1092, fd1101; +mul.f64 fd1103, fd999, fd1102; +mul.f64 fd1105, fd1100, fd999; +mul.f64 fd1275, fd1012, fd1100; +mul.f64 fd1276, fd1013, fd1102; +sub.f64 fd1108, fd1275, fd1276; +mul.f64 fd1274, fd998, fd1102; +mul.f64 fd1109, fd1012, fd1102; +fma.rn.f64 fd1110, fd1013, fd1100, fd1109; +mul.f64 fd1111, fd1003, fd1110; +mul.f64 fd1113, fd1108, fd1003; +mul.f64 fd1115, fd1013, fd1110; +mul.f64 fd1273, fd1012, fd1108; +sub.f64 fd1116, fd1273, fd1115; +mul.f64 fd1272, fd1002, fd1110; +mul.f64 fd1117, fd1012, fd1110; +fma.rn.f64 fd1118, fd1013, fd1108, fd1117; +mul.f64 fd1119, fd1007, fd1118; +mul.f64 fd1121, fd1116, fd1007; +mul.f64 fd1123, fd1013, fd1118; +mul.f64 fd1271, fd1012, fd1116; +sub.f64 fd1124, fd1271, fd1123; +mul.f64 fd1270, fd1006, fd1118; +mul.f64 fd1125, fd1012, fd1118; +mul.f64 fd1269, fd984, fd1013; +fma.rn.f64 fd1126, fd1013, fd1116, fd1125; +mul.f64 fd1127, fd1011, fd1126; +mul.f64 fd1128, fd1010, fd1126; +mul.f64 fd1129, fd1124, fd1011; +and.b32 r22, r49, 4080; +add.s32 r23, r9, r22; +mov.u32 r39, %tid.x; +and.b32 r38, r39, 256; +barrier.sync 0; +and.b32 r24, r50, 65536; +add.s32 r25, r23, r24; +sub.f64 fd1508, fd1332, fd1319; +mul.f64 fd1507, fd1069, fd1508; +add.f64 fd1130, fd1332, fd1319; +sub.f64 fd1510, fd880, fd937; +add.f64 fd1131, fd880, fd937; +st.shared.v2.f64 [r25], {fd1131, fd1130}; +mov.u32 r57, %tid.x; +and.b32 r56, r57, 256; +fma.rn.f64 fd1132, fd1012, fd984, fd1016; +sub.f64 fd1133, fd1018, fd1269; +st.shared.v2.f64 [r25+4096], {fd1132, fd1133}; +fma.rn.f64 fd1134, fd1021, fd988, fd1024; +sub.f64 fd1135, fd1026, fd1298; +st.shared.v2.f64 [r25+8192], 
{fd1134, fd1135}; +fma.rn.f64 fd1136, fd1029, fd992, fd1032; +sub.f64 fd1137, fd1034, fd1295; +st.shared.v2.f64 [r25+12288], {fd1136, fd1137}; +fma.rn.f64 fd1138, fd1037, fd996, fd1040; +sub.f64 fd1139, fd1042, fd1293; +st.shared.v2.f64 [r25+16384], {fd1138, fd1139}; +sub.f64 fd1140, fd1050, fd1291; +fma.rn.f64 fd1141, fd1045, fd1000, fd1048; +st.shared.v2.f64 [r25+20480], {fd1141, fd1140}; +fma.rn.f64 fd1142, fd1053, fd1004, fd1056; +sub.f64 fd1143, fd1058, fd1288; +st.shared.v2.f64 [r25+24576], {fd1142, fd1143}; +fma.rn.f64 fd1144, fd1061, fd1008, fd1064; +sub.f64 fd1145, fd1066, fd1286; +st.shared.v2.f64 [r25+28672], {fd1144, fd1145}; +fma.rn.f64 fd1146, fd1069, fd1510, fd1072; +sub.f64 fd1147, fd1507, fd1073; +st.shared.v2.f64 [r25+32768], {fd1146, fd1147}; +fma.rn.f64 fd1148, fd1075, fd986, fd1079; +sub.f64 fd1149, fd1081, fd1282; +st.shared.v2.f64 [r25+36864], {fd1148, fd1149}; +fma.rn.f64 fd1150, fd1084, fd990, fd1087; +sub.f64 fd1151, fd1089, fd1279; +st.shared.v2.f64 [r25+40960], {fd1150, fd1151}; +fma.rn.f64 fd1152, fd1092, fd994, fd1095; +sub.f64 fd1153, fd1097, fd1277; +st.shared.v2.f64 [r25+45056], {fd1152, fd1153}; +fma.rn.f64 fd1154, fd1100, fd998, fd1103; +sub.f64 fd1155, fd1105, fd1274; +st.shared.v2.f64 [r25+49152], {fd1154, fd1155}; +sub.f64 fd1156, fd1113, fd1272; +fma.rn.f64 fd1157, fd1108, fd1002, fd1111; +st.shared.v2.f64 [r25+53248], {fd1157, fd1156}; +fma.rn.f64 fd1158, fd1116, fd1006, fd1119; +sub.f64 fd1159, fd1121, fd1270; +st.shared.v2.f64 [r25+57344], {fd1158, fd1159}; +fma.rn.f64 fd1160, fd1124, fd1010, fd1127; +sub.f64 fd1161, fd1129, fd1128; +st.shared.v2.f64 [r25+61440], {fd1160, fd1161}; +barrier.sync 0; +mad.lo.s32 r26, r56, -240, r25; +ld.shared.v2.f64 {fd1162, fd1163}, [r26]; +ld.shared.v2.f64 {fd1166, fd1167}, [r26+8192]; +ld.shared.v2.f64 {fd1170, fd1171}, [r26+16384]; +ld.shared.v2.f64 {fd1174, fd1175}, [r26+24576]; +ld.shared.v2.f64 {fd1178, fd1179}, [r26+32768]; +ld.shared.v2.f64 {fd1182, fd1183}, [r26+40960]; 
+ld.shared.v2.f64 {fd1186, fd1187}, [r26+49152]; +ld.shared.v2.f64 {fd1190, fd1191}, [r26+57344]; +ld.shared.v2.f64 {fd1194, fd1195}, [r26+65536]; +ld.shared.v2.f64 {fd1198, fd1199}, [r26+73728]; +ld.shared.v2.f64 {fd1202, fd1203}, [r26+81920]; +ld.shared.v2.f64 {fd1206, fd1207}, [r26+90112]; +ld.shared.v2.f64 {fd1210, fd1211}, [r26+98304]; +ld.shared.v2.f64 {fd1214, fd1215}, [r26+106496]; +ld.shared.v2.f64 {fd1218, fd1219}, [r26+114688]; +ld.shared.v2.f64 {fd1222, fd1223}, [r26+122880]; +add.f64 %0, fd1162, fd1194; +add.f64 %1, fd1163, fd1195; +add.f64 %2, fd1166, fd1198; +add.f64 %3, fd1167, fd1199; +add.f64 %4, fd1170, fd1202; +add.f64 %5, fd1171, fd1203; +add.f64 %7, fd1175, fd1207; +add.f64 %6, fd1174, fd1206; +add.f64 %9, fd1179, fd1211; +add.f64 %8, fd1178, fd1210; +add.f64 %11, fd1183, fd1215; +add.f64 %10, fd1182, fd1214; +add.f64 %12, fd1186, fd1218; +add.f64 %13, fd1187, fd1219; +add.f64 %14, fd1190, fd1222; +add.f64 %15, fd1191, fd1223; +sub.f64 %17, fd1163, fd1195; +sub.f64 %16, fd1162, fd1194; +sub.f64 %19, fd1167, fd1199; +sub.f64 %18, fd1166, fd1198; +sub.f64 %21, fd1171, fd1203; +sub.f64 %20, fd1170, fd1202; +sub.f64 %23, fd1175, fd1207; +sub.f64 %22, fd1174, fd1206; +sub.f64 %25, fd1179, fd1211; +sub.f64 %24, fd1178, fd1210; +sub.f64 %27, fd1183, fd1215; +sub.f64 %26, fd1182, fd1214; +sub.f64 %29, fd1187, fd1219; +sub.f64 %28, fd1186, fd1218; +sub.f64 %31, fd1191, fd1223; +sub.f64 %30, fd1190, fd1222; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), 
"=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_8192), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[8].y), "d"(rmem[4].y), "d"(rmem[10].y), "d"(rmem[2].y), "d"(rmem[14].y), "d"(rmem[1].y), "d"(rmem[5].y), "d"(rmem[13].y), "d"(rmem[11].y), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<670, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<36>; +.reg .f64 fd<613>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %21, %31; +add.f64 fd34, %22, %33; +sub.f64 fd35, %21, %31; +sub.f64 fd36, %22, %33; +add.f64 fd37, %26, %37; +add.f64 fd38, %28, %38; +sub.f64 fd39, %26, %37; +sub.f64 fd40, %28, %38; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %23, %34; +add.f64 fd50, %25, %36; +sub.f64 fd51, %23, %34; +sub.f64 fd52, %25, %36; +add.f64 fd53, %29, %39; +add.f64 fd54, %30, %40; +sub.f64 fd55, %29, %39; +sub.f64 fd56, %30, %40; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 
0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 fd74, fd41, fd57; +add.f64 fd75, fd42, fd58; +sub.f64 fd76, fd41, fd57; +sub.f64 fd77, fd42, fd58; +add.f64 fd78, fd45, fd67; +add.f64 fd79, fd46, fd68; +sub.f64 fd80, fd45, fd67; +sub.f64 fd81, fd46, fd68; +sub.f64 fd82, fd43, fd60; +add.f64 fd83, fd44, fd59; +add.f64 fd84, fd43, fd60; +sub.f64 fd85, fd44, fd59; +add.f64 fd86, fd47, fd71; +add.f64 fd87, fd48, fd73; +sub.f64 fd88, fd47, fd71; +sub.f64 fd89, fd48, fd73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd90, fd91}, [rd5]; +mul.f64 fd94, fd79, fd91; +fma.rn.f64 fd95, fd90, fd78, fd94; +mul.f64 fd96, fd78, fd91; +mul.f64 fd97, fd90, fd79; +sub.f64 fd98, fd97, fd96; +mul.f64 fd99, fd90, fd90; +mul.f64 fd100, fd91, fd91; +sub.f64 fd101, fd99, fd100; +mul.f64 fd102, fd91, fd90; +fma.rn.f64 fd103, fd91, fd90, fd102; +mul.f64 fd104, fd83, fd103; +fma.rn.f64 fd105, fd101, fd82, fd104; +mul.f64 fd106, fd82, fd103; +mul.f64 fd107, fd101, fd83; +sub.f64 fd108, fd107, fd106; +mul.f64 fd109, fd90, fd101; +mul.f64 fd110, fd91, fd103; +sub.f64 fd111, fd109, fd110; +mul.f64 fd112, fd90, fd103; +fma.rn.f64 fd113, fd91, fd101, fd112; +mul.f64 fd114, fd87, fd113; +fma.rn.f64 fd115, fd111, fd86, fd114; +mul.f64 fd116, fd86, fd113; +mul.f64 fd117, fd111, fd87; +sub.f64 fd118, fd117, fd116; +mul.f64 fd119, fd90, fd111; +mul.f64 fd120, fd91, fd113; +sub.f64 fd121, fd119, fd120; +mul.f64 fd122, fd90, fd113; +fma.rn.f64 fd123, fd91, fd111, fd122; +mul.f64 fd124, fd77, fd123; +fma.rn.f64 fd125, fd121, fd76, fd124; +mul.f64 fd126, fd76, fd123; +mul.f64 fd127, fd121, fd77; +sub.f64 fd128, fd127, fd126; +ld.global.v2.f64 {fd129, fd130}, [rd5+16384]; 
+mul.f64 fd133, fd81, fd130; +fma.rn.f64 fd134, fd129, fd80, fd133; +mul.f64 fd135, fd80, fd130; +mul.f64 fd136, fd129, fd81; +sub.f64 fd137, fd136, fd135; +mul.f64 fd138, fd90, fd129; +mul.f64 fd139, fd91, fd130; +sub.f64 fd140, fd138, fd139; +mul.f64 fd141, fd90, fd130; +fma.rn.f64 fd142, fd91, fd129, fd141; +mul.f64 fd143, fd85, fd142; +fma.rn.f64 fd144, fd140, fd84, fd143; +mul.f64 fd145, fd84, fd142; +mul.f64 fd146, fd140, fd85; +sub.f64 fd147, fd146, fd145; +mul.f64 fd148, fd90, fd140; +mul.f64 fd149, fd91, fd142; +sub.f64 fd150, fd148, fd149; +mul.f64 fd151, fd90, fd142; +fma.rn.f64 fd152, fd91, fd140, fd151; +mul.f64 fd153, fd89, fd152; +fma.rn.f64 fd154, fd150, fd88, fd153; +mul.f64 fd155, fd88, fd152; +mul.f64 fd156, fd150, fd89; +sub.f64 fd157, fd156, fd155; +shl.b32 r8, r5, 6; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65472; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd74, fd95}; +st.shared.v2.f64 [r12+16], {fd105, fd115}; +st.shared.v2.f64 [r12+32], {fd125, fd134}; +st.shared.v2.f64 [r12+48], {fd144, fd154}; +barrier.sync 0; +mad.lo.s32 r13, r6, -56, r12; +ld.shared.f64 fd158, [r13]; +ld.shared.f64 fd159, [r13+8192]; +ld.shared.f64 fd160, [r13+16384]; +ld.shared.f64 fd161, [r13+24576]; +ld.shared.f64 fd162, [r13+32768]; +ld.shared.f64 fd163, [r13+40960]; +ld.shared.f64 fd164, [r13+49152]; +ld.shared.f64 fd165, [r13+57344]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd75, fd98}; +st.shared.v2.f64 [r12+16], {fd108, fd118}; +st.shared.v2.f64 [r12+32], {fd128, fd137}; +st.shared.v2.f64 [r12+48], {fd147, fd157}; +barrier.sync 0; +ld.shared.f64 fd166, [r13]; +ld.shared.f64 fd167, [r13+8192]; +ld.shared.f64 fd168, [r13+16384]; +ld.shared.f64 fd169, [r13+24576]; +ld.shared.f64 fd170, [r13+32768]; +ld.shared.f64 fd171, [r13+40960]; +ld.shared.f64 fd172, [r13+49152]; +ld.shared.f64 fd173, [r13+57344]; +add.f64 fd174, fd158, fd162; +add.f64 fd175, fd166, fd170; +sub.f64 fd176, fd158, fd162; +sub.f64 fd177, fd166, 
fd170; +add.f64 fd178, fd160, fd164; +add.f64 fd179, fd168, fd172; +sub.f64 fd180, fd160, fd164; +sub.f64 fd181, fd168, fd172; +add.f64 fd182, fd174, fd178; +add.f64 fd183, fd175, fd179; +sub.f64 fd184, fd174, fd178; +sub.f64 fd185, fd175, fd179; +sub.f64 fd186, fd176, fd181; +add.f64 fd187, fd177, fd180; +add.f64 fd188, fd176, fd181; +sub.f64 fd189, fd177, fd180; +add.f64 fd190, fd159, fd163; +add.f64 fd191, fd167, fd171; +sub.f64 fd192, fd159, fd163; +sub.f64 fd193, fd167, fd171; +add.f64 fd194, fd161, fd165; +add.f64 fd195, fd169, fd173; +sub.f64 fd196, fd161, fd165; +sub.f64 fd197, fd169, fd173; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +mul.f64 fd206, fd202, 0d3FE6A09E667F3BCD; +mul.f64 fd207, fd203, 0d3FE6A09E667F3BCD; +sub.f64 fd208, fd206, fd207; +add.f64 fd209, fd206, fd207; +mul.f64 fd210, fd204, 0dBFE6A09E667F3BCD; +mul.f64 fd211, fd205, 0d3FE6A09E667F3BCD; +sub.f64 fd212, fd210, fd211; +mul.f64 fd213, fd205, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd214, fd204, 0d3FE6A09E667F3BCD, fd213; +add.f64 fd215, fd182, fd198; +add.f64 fd216, fd183, fd199; +sub.f64 fd217, fd182, fd198; +sub.f64 fd218, fd183, fd199; +add.f64 fd219, fd186, fd208; +add.f64 fd220, fd187, fd209; +sub.f64 fd221, fd186, fd208; +sub.f64 fd222, fd187, fd209; +sub.f64 fd223, fd184, fd201; +add.f64 fd224, fd185, fd200; +add.f64 fd225, fd184, fd201; +sub.f64 fd226, fd185, fd200; +add.f64 fd227, fd188, fd212; +add.f64 fd228, fd189, fd214; +sub.f64 fd229, fd188, fd212; +sub.f64 fd230, fd189, fd214; +and.b32 r14, r5, 1016; +bfe.u32 r15, r5, 3, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd231, fd232}, [rd8]; +mul.f64 fd235, fd220, fd232; +fma.rn.f64 fd236, fd231, fd219, fd235; +mul.f64 fd237, fd219, fd232; +mul.f64 fd238, fd231, fd220; +sub.f64 fd239, fd238, 
fd237; +mul.f64 fd240, fd231, fd231; +mul.f64 fd241, fd232, fd232; +sub.f64 fd242, fd240, fd241; +mul.f64 fd243, fd232, fd231; +fma.rn.f64 fd244, fd232, fd231, fd243; +mul.f64 fd245, fd224, fd244; +fma.rn.f64 fd246, fd242, fd223, fd245; +mul.f64 fd247, fd223, fd244; +mul.f64 fd248, fd242, fd224; +sub.f64 fd249, fd248, fd247; +mul.f64 fd250, fd231, fd242; +mul.f64 fd251, fd232, fd244; +sub.f64 fd252, fd250, fd251; +mul.f64 fd253, fd231, fd244; +fma.rn.f64 fd254, fd232, fd242, fd253; +mul.f64 fd255, fd228, fd254; +fma.rn.f64 fd256, fd252, fd227, fd255; +mul.f64 fd257, fd227, fd254; +mul.f64 fd258, fd252, fd228; +sub.f64 fd259, fd258, fd257; +mul.f64 fd260, fd231, fd252; +mul.f64 fd261, fd232, fd254; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd231, fd254; +fma.rn.f64 fd264, fd232, fd252, fd263; +mul.f64 fd265, fd218, fd264; +fma.rn.f64 fd266, fd262, fd217, fd265; +mul.f64 fd267, fd217, fd264; +mul.f64 fd268, fd262, fd218; +sub.f64 fd269, fd268, fd267; +ld.global.v2.f64 {fd270, fd271}, [rd8+2048]; +mul.f64 fd274, fd222, fd271; +fma.rn.f64 fd275, fd270, fd221, fd274; +mul.f64 fd276, fd221, fd271; +mul.f64 fd277, fd270, fd222; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd231, fd270; +mul.f64 fd280, fd232, fd271; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd231, fd271; +fma.rn.f64 fd283, fd232, fd270, fd282; +mul.f64 fd284, fd226, fd283; +fma.rn.f64 fd285, fd281, fd225, fd284; +mul.f64 fd286, fd225, fd283; +mul.f64 fd287, fd281, fd226; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd231, fd281; +mul.f64 fd290, fd232, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd231, fd283; +fma.rn.f64 fd293, fd232, fd281, fd292; +mul.f64 fd294, fd230, fd293; +fma.rn.f64 fd295, fd291, fd229, fd294; +mul.f64 fd296, fd229, fd293; +mul.f64 fd297, fd291, fd230; +sub.f64 fd298, fd297, fd296; +shl.b32 r16, r5, 3; +and.b32 r17, r16, 56; +add.s32 r18, r10, r17; +barrier.sync 0; +and.b32 r19, r8, 65024; +add.s32 r20, r18, r19; +st.shared.f64 [r20], fd215; +st.shared.f64 
[r20+64], fd236; +st.shared.f64 [r20+128], fd246; +st.shared.f64 [r20+192], fd256; +st.shared.f64 [r20+256], fd266; +st.shared.f64 [r20+320], fd275; +st.shared.f64 [r20+384], fd285; +st.shared.f64 [r20+448], fd295; +barrier.sync 0; +mad.lo.s32 r21, r14, -56, r20; +ld.shared.f64 fd299, [r21]; +ld.shared.f64 fd300, [r21+8192]; +ld.shared.f64 fd301, [r21+16384]; +ld.shared.f64 fd302, [r21+24576]; +ld.shared.f64 fd303, [r21+32768]; +ld.shared.f64 fd304, [r21+40960]; +ld.shared.f64 fd305, [r21+49152]; +ld.shared.f64 fd306, [r21+57344]; +barrier.sync 0; +st.shared.f64 [r20], fd216; +st.shared.f64 [r20+64], fd239; +st.shared.f64 [r20+128], fd249; +st.shared.f64 [r20+192], fd259; +st.shared.f64 [r20+256], fd269; +st.shared.f64 [r20+320], fd278; +st.shared.f64 [r20+384], fd288; +st.shared.f64 [r20+448], fd298; +barrier.sync 0; +ld.shared.f64 fd307, [r21]; +ld.shared.f64 fd308, [r21+8192]; +ld.shared.f64 fd309, [r21+16384]; +ld.shared.f64 fd310, [r21+24576]; +ld.shared.f64 fd311, [r21+32768]; +ld.shared.f64 fd312, [r21+40960]; +ld.shared.f64 fd313, [r21+49152]; +ld.shared.f64 fd314, [r21+57344]; +add.f64 fd315, fd299, fd303; +add.f64 fd316, fd307, fd311; +sub.f64 fd317, fd299, fd303; +sub.f64 fd318, fd307, fd311; +add.f64 fd319, fd301, fd305; +add.f64 fd320, fd309, fd313; +sub.f64 fd321, fd301, fd305; +sub.f64 fd322, fd309, fd313; +add.f64 fd323, fd315, fd319; +add.f64 fd324, fd316, fd320; +sub.f64 fd325, fd315, fd319; +sub.f64 fd326, fd316, fd320; +sub.f64 fd327, fd317, fd322; +add.f64 fd328, fd318, fd321; +add.f64 fd329, fd317, fd322; +sub.f64 fd330, fd318, fd321; +add.f64 fd331, fd300, fd304; +add.f64 fd332, fd308, fd312; +sub.f64 fd333, fd300, fd304; +sub.f64 fd334, fd308, fd312; +add.f64 fd335, fd302, fd306; +add.f64 fd336, fd310, fd314; +sub.f64 fd337, fd302, fd306; +sub.f64 fd338, fd310, fd314; +add.f64 fd339, fd331, fd335; +add.f64 fd340, fd332, fd336; +sub.f64 fd341, fd331, fd335; +sub.f64 fd342, fd332, fd336; +sub.f64 fd343, fd333, fd338; +add.f64 fd344, fd334, 
fd337; +add.f64 fd345, fd333, fd338; +sub.f64 fd346, fd334, fd337; +mul.f64 fd347, fd343, 0d3FE6A09E667F3BCD; +mul.f64 fd348, fd344, 0d3FE6A09E667F3BCD; +sub.f64 fd349, fd347, fd348; +add.f64 fd350, fd347, fd348; +mul.f64 fd351, fd345, 0dBFE6A09E667F3BCD; +mul.f64 fd352, fd346, 0d3FE6A09E667F3BCD; +sub.f64 fd353, fd351, fd352; +mul.f64 fd354, fd346, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd355, fd345, 0d3FE6A09E667F3BCD, fd354; +add.f64 fd356, fd323, fd339; +add.f64 fd357, fd324, fd340; +sub.f64 fd358, fd323, fd339; +sub.f64 fd359, fd324, fd340; +add.f64 fd360, fd327, fd349; +add.f64 fd361, fd328, fd350; +sub.f64 fd362, fd327, fd349; +sub.f64 fd363, fd328, fd350; +sub.f64 fd364, fd325, fd342; +add.f64 fd365, fd326, fd341; +add.f64 fd366, fd325, fd342; +sub.f64 fd367, fd326, fd341; +add.f64 fd368, fd329, fd353; +add.f64 fd369, fd330, fd355; +sub.f64 fd370, fd329, fd353; +sub.f64 fd371, fd330, fd355; +and.b32 r22, r5, 960; +bfe.u32 r23, r5, 6, 4; +mul.wide.u32 rd9, r23, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd372, fd373}, [rd11]; +mul.f64 fd376, fd361, fd373; +fma.rn.f64 fd377, fd372, fd360, fd376; +mul.f64 fd378, fd360, fd373; +mul.f64 fd379, fd372, fd361; +sub.f64 fd380, fd379, fd378; +mul.f64 fd381, fd372, fd372; +mul.f64 fd382, fd373, fd373; +sub.f64 fd383, fd381, fd382; +mul.f64 fd384, fd373, fd372; +fma.rn.f64 fd385, fd373, fd372, fd384; +mul.f64 fd386, fd365, fd385; +fma.rn.f64 fd387, fd383, fd364, fd386; +mul.f64 fd388, fd364, fd385; +mul.f64 fd389, fd383, fd365; +sub.f64 fd390, fd389, fd388; +mul.f64 fd391, fd372, fd383; +mul.f64 fd392, fd373, fd385; +sub.f64 fd393, fd391, fd392; +mul.f64 fd394, fd372, fd385; +fma.rn.f64 fd395, fd373, fd383, fd394; +mul.f64 fd396, fd369, fd395; +fma.rn.f64 fd397, fd393, fd368, fd396; +mul.f64 fd398, fd368, fd395; +mul.f64 fd399, fd393, fd369; +sub.f64 fd400, fd399, fd398; +mul.f64 fd401, fd372, fd393; +mul.f64 fd402, fd373, fd395; +sub.f64 fd403, fd401, fd402; +mul.f64 fd404, fd372, fd395; +fma.rn.f64 
fd405, fd373, fd393, fd404; +mul.f64 fd406, fd359, fd405; +fma.rn.f64 fd407, fd403, fd358, fd406; +mul.f64 fd408, fd358, fd405; +mul.f64 fd409, fd403, fd359; +sub.f64 fd410, fd409, fd408; +ld.global.v2.f64 {fd411, fd412}, [rd11+256]; +mul.f64 fd415, fd363, fd412; +fma.rn.f64 fd416, fd411, fd362, fd415; +mul.f64 fd417, fd362, fd412; +mul.f64 fd418, fd411, fd363; +sub.f64 fd419, fd418, fd417; +mul.f64 fd420, fd372, fd411; +mul.f64 fd421, fd373, fd412; +sub.f64 fd422, fd420, fd421; +mul.f64 fd423, fd372, fd412; +fma.rn.f64 fd424, fd373, fd411, fd423; +mul.f64 fd425, fd367, fd424; +fma.rn.f64 fd426, fd422, fd366, fd425; +mul.f64 fd427, fd366, fd424; +mul.f64 fd428, fd422, fd367; +sub.f64 fd429, fd428, fd427; +mul.f64 fd430, fd372, fd422; +mul.f64 fd431, fd373, fd424; +sub.f64 fd432, fd430, fd431; +mul.f64 fd433, fd372, fd424; +fma.rn.f64 fd434, fd373, fd422, fd433; +mul.f64 fd435, fd371, fd434; +fma.rn.f64 fd436, fd432, fd370, fd435; +mul.f64 fd437, fd370, fd434; +mul.f64 fd438, fd432, fd371; +sub.f64 fd439, fd438, fd437; +and.b32 r24, r16, 504; +add.s32 r25, r10, r24; +barrier.sync 0; +and.b32 r26, r8, 61440; +add.s32 r27, r25, r26; +st.shared.f64 [r27], fd356; +st.shared.f64 [r27+512], fd377; +st.shared.f64 [r27+1024], fd387; +st.shared.f64 [r27+1536], fd397; +st.shared.f64 [r27+2048], fd407; +st.shared.f64 [r27+2560], fd416; +st.shared.f64 [r27+3072], fd426; +st.shared.f64 [r27+3584], fd436; +barrier.sync 0; +mad.lo.s32 r28, r22, -56, r27; +ld.shared.f64 fd440, [r28]; +ld.shared.f64 fd441, [r28+8192]; +ld.shared.f64 fd442, [r28+16384]; +ld.shared.f64 fd443, [r28+24576]; +ld.shared.f64 fd444, [r28+32768]; +ld.shared.f64 fd445, [r28+40960]; +ld.shared.f64 fd446, [r28+49152]; +ld.shared.f64 fd447, [r28+57344]; +barrier.sync 0; +st.shared.f64 [r27], fd357; +st.shared.f64 [r27+512], fd380; +st.shared.f64 [r27+1024], fd390; +st.shared.f64 [r27+1536], fd400; +st.shared.f64 [r27+2048], fd410; +st.shared.f64 [r27+2560], fd419; +st.shared.f64 [r27+3072], fd429; +st.shared.f64 
[r27+3584], fd439; +barrier.sync 0; +ld.shared.f64 fd448, [r28]; +ld.shared.f64 fd449, [r28+8192]; +ld.shared.f64 fd450, [r28+16384]; +ld.shared.f64 fd451, [r28+24576]; +ld.shared.f64 fd452, [r28+32768]; +ld.shared.f64 fd453, [r28+40960]; +ld.shared.f64 fd454, [r28+49152]; +ld.shared.f64 fd455, [r28+57344]; +add.f64 fd456, fd440, fd444; +add.f64 fd457, fd448, fd452; +sub.f64 fd458, fd440, fd444; +sub.f64 fd459, fd448, fd452; +add.f64 fd460, fd442, fd446; +add.f64 fd461, fd450, fd454; +sub.f64 fd462, fd442, fd446; +sub.f64 fd463, fd450, fd454; +add.f64 fd464, fd456, fd460; +add.f64 fd465, fd457, fd461; +sub.f64 fd466, fd456, fd460; +sub.f64 fd467, fd457, fd461; +sub.f64 fd468, fd458, fd463; +add.f64 fd469, fd459, fd462; +add.f64 fd470, fd458, fd463; +sub.f64 fd471, fd459, fd462; +add.f64 fd472, fd441, fd445; +add.f64 fd473, fd449, fd453; +sub.f64 fd474, fd441, fd445; +sub.f64 fd475, fd449, fd453; +add.f64 fd476, fd443, fd447; +add.f64 fd477, fd451, fd455; +sub.f64 fd478, fd443, fd447; +sub.f64 fd479, fd451, fd455; +add.f64 fd480, fd472, fd476; +add.f64 fd481, fd473, fd477; +sub.f64 fd482, fd472, fd476; +sub.f64 fd483, fd473, fd477; +sub.f64 fd484, fd474, fd479; +add.f64 fd485, fd475, fd478; +add.f64 fd486, fd474, fd479; +sub.f64 fd487, fd475, fd478; +mul.f64 fd488, fd484, 0d3FE6A09E667F3BCD; +mul.f64 fd489, fd485, 0d3FE6A09E667F3BCD; +sub.f64 fd490, fd488, fd489; +add.f64 fd491, fd488, fd489; +mul.f64 fd492, fd486, 0dBFE6A09E667F3BCD; +mul.f64 fd493, fd487, 0d3FE6A09E667F3BCD; +sub.f64 fd494, fd492, fd493; +mul.f64 fd495, fd487, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd496, fd486, 0d3FE6A09E667F3BCD, fd495; +add.f64 fd497, fd464, fd480; +add.f64 fd498, fd465, fd481; +sub.f64 fd499, fd464, fd480; +sub.f64 fd500, fd465, fd481; +add.f64 fd501, fd468, fd490; +add.f64 fd502, fd469, fd491; +sub.f64 fd503, fd468, fd490; +sub.f64 fd504, fd469, fd491; +sub.f64 fd505, fd466, fd483; +add.f64 fd506, fd467, fd482; +add.f64 fd507, fd466, fd483; +sub.f64 fd508, fd467, fd482; +add.f64 
fd509, fd470, fd494; +add.f64 fd510, fd471, fd496; +sub.f64 fd511, fd470, fd494; +sub.f64 fd512, fd471, fd496; +and.b32 r29, r5, 512; +bfe.u32 r30, r5, 9, 1; +mul.wide.u32 rd12, r30, 16; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; +ld.global.v2.f64 {fd513, fd514}, [rd14]; +mul.f64 fd517, fd502, fd514; +fma.rn.f64 fd518, fd513, fd501, fd517; +mul.f64 fd519, fd501, fd514; +mul.f64 fd520, fd513, fd502; +sub.f64 fd521, fd520, fd519; +mul.f64 fd522, fd513, fd513; +mul.f64 fd523, fd514, fd514; +sub.f64 fd524, fd522, fd523; +mul.f64 fd525, fd514, fd513; +fma.rn.f64 fd526, fd514, fd513, fd525; +mul.f64 fd527, fd506, fd526; +fma.rn.f64 fd528, fd524, fd505, fd527; +mul.f64 fd529, fd505, fd526; +mul.f64 fd530, fd524, fd506; +sub.f64 fd531, fd530, fd529; +mul.f64 fd532, fd513, fd524; +mul.f64 fd533, fd514, fd526; +sub.f64 fd534, fd532, fd533; +mul.f64 fd535, fd513, fd526; +fma.rn.f64 fd536, fd514, fd524, fd535; +mul.f64 fd537, fd510, fd536; +fma.rn.f64 fd538, fd534, fd509, fd537; +mul.f64 fd539, fd509, fd536; +mul.f64 fd540, fd534, fd510; +sub.f64 fd541, fd540, fd539; +mul.f64 fd542, fd513, fd534; +mul.f64 fd543, fd514, fd536; +sub.f64 fd544, fd542, fd543; +mul.f64 fd545, fd513, fd536; +fma.rn.f64 fd546, fd514, fd534, fd545; +mul.f64 fd547, fd500, fd546; +fma.rn.f64 fd548, fd544, fd499, fd547; +mul.f64 fd549, fd499, fd546; +mul.f64 fd550, fd544, fd500; +sub.f64 fd551, fd550, fd549; +ld.global.v2.f64 {fd552, fd553}, [rd14+32]; +mul.f64 fd556, fd504, fd553; +fma.rn.f64 fd557, fd552, fd503, fd556; +mul.f64 fd558, fd503, fd553; +mul.f64 fd559, fd552, fd504; +sub.f64 fd560, fd559, fd558; +mul.f64 fd561, fd513, fd552; +mul.f64 fd562, fd514, fd553; +sub.f64 fd563, fd561, fd562; +mul.f64 fd564, fd513, fd553; +fma.rn.f64 fd565, fd514, fd552, fd564; +mul.f64 fd566, fd508, fd565; +fma.rn.f64 fd567, fd563, fd507, fd566; +mul.f64 fd568, fd507, fd565; +mul.f64 fd569, fd563, fd508; +sub.f64 fd570, fd569, fd568; +mul.f64 fd571, fd513, fd563; +mul.f64 fd572, fd514, fd565; +sub.f64 fd573, 
fd571, fd572; +mul.f64 fd574, fd513, fd565; +fma.rn.f64 fd575, fd514, fd563, fd574; +mul.f64 fd576, fd512, fd575; +fma.rn.f64 fd577, fd573, fd511, fd576; +mul.f64 fd578, fd511, fd575; +mul.f64 fd579, fd573, fd512; +sub.f64 fd580, fd579, fd578; +and.b32 r31, r16, 4088; +add.s32 r32, r10, r31; +barrier.sync 0; +and.b32 r33, r8, 32768; +add.s32 r34, r32, r33; +st.shared.f64 [r34], fd497; +st.shared.f64 [r34+4096], fd518; +st.shared.f64 [r34+8192], fd528; +st.shared.f64 [r34+12288], fd538; +st.shared.f64 [r34+16384], fd548; +st.shared.f64 [r34+20480], fd557; +st.shared.f64 [r34+24576], fd567; +st.shared.f64 [r34+28672], fd577; +barrier.sync 0; +mad.lo.s32 r35, r29, -56, r34; +ld.shared.f64 fd581, [r35]; +ld.shared.f64 fd582, [r35+8192]; +ld.shared.f64 fd583, [r35+16384]; +ld.shared.f64 fd584, [r35+24576]; +ld.shared.f64 fd585, [r35+32768]; +ld.shared.f64 fd586, [r35+40960]; +ld.shared.f64 fd587, [r35+49152]; +ld.shared.f64 fd588, [r35+57344]; +barrier.sync 0; +st.shared.f64 [r34], fd498; +st.shared.f64 [r34+4096], fd521; +st.shared.f64 [r34+8192], fd531; +st.shared.f64 [r34+12288], fd541; +st.shared.f64 [r34+16384], fd551; +st.shared.f64 [r34+20480], fd560; +st.shared.f64 [r34+24576], fd570; +st.shared.f64 [r34+28672], fd580; +barrier.sync 0; +ld.shared.f64 fd589, [r35]; +ld.shared.f64 fd590, [r35+8192]; +ld.shared.f64 fd591, [r35+16384]; +ld.shared.f64 fd592, [r35+24576]; +ld.shared.f64 fd593, [r35+32768]; +ld.shared.f64 fd594, [r35+40960]; +ld.shared.f64 fd595, [r35+49152]; +ld.shared.f64 fd596, [r35+57344]; +add.f64 %0, fd581, fd585; +add.f64 %1, fd589, fd593; +add.f64 %2, fd582, fd586; +add.f64 %3, fd590, fd594; +add.f64 %4, fd583, fd587; +add.f64 %5, fd591, fd595; +add.f64 %6, fd584, fd588; +add.f64 %7, fd592, fd596; +sub.f64 %8, fd581, fd585; +sub.f64 %9, fd589, fd593; +sub.f64 %10, fd582, fd586; +sub.f64 %11, fd590, fd594; +sub.f64 %12, fd583, fd587; +sub.f64 %13, fd591, fd595; +sub.f64 %14, fd584, fd588; +sub.f64 %15, fd592, fd596; +})" + : "=d"(rmem[0].x), 
"=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_8192), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<669, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<1162>; +.reg .b64 rd<12>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 16; +mov.u32 r3, %32; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd65, %36, %57; +add.f64 fd66, %37, %59; +sub.f64 fd67, %36, %57; +sub.f64 fd68, %37, %59; +add.f64 fd69, %46, %68; +add.f64 fd70, %48, %69; +sub.f64 fd71, %46, %68; +sub.f64 fd72, %48, %69; +add.f64 fd73, fd65, fd69; +add.f64 fd74, fd66, fd70; +sub.f64 fd75, fd65, fd69; +sub.f64 fd76, fd66, fd70; +sub.f64 fd77, fd67, fd72; +add.f64 fd78, fd68, fd71; +add.f64 fd79, fd67, fd72; +sub.f64 fd80, fd68, fd71; +add.f64 fd81, %41, %62; +add.f64 fd82, %43, %64; +sub.f64 fd83, %41, %62; +sub.f64 fd84, %43, %64; +add.f64 fd85, %52, %73; +add.f64 fd86, %53, %75; +sub.f64 fd87, %52, %73; +sub.f64 fd88, %53, %75; +add.f64 fd89, fd81, fd85; +add.f64 fd90, fd82, fd86; +sub.f64 fd91, fd81, fd85; +sub.f64 fd92, fd82, fd86; +sub.f64 fd93, fd83, fd88; +add.f64 fd94, fd84, fd87; +add.f64 fd95, fd83, fd88; +sub.f64 fd96, fd84, fd87; +mul.f64 fd97, fd93, 0d3FE6A09E667F3BCD; +mul.f64 fd98, fd94, 0d3FE6A09E667F3BCD; +sub.f64 fd99, fd97, fd98; +add.f64 fd100, fd97, fd98; +mul.f64 fd101, fd95, 
0dBFE6A09E667F3BCD; +mul.f64 fd102, fd96, 0d3FE6A09E667F3BCD; +sub.f64 fd103, fd101, fd102; +mul.f64 fd104, fd96, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd105, fd95, 0d3FE6A09E667F3BCD, fd104; +add.f64 fd106, fd73, fd89; +add.f64 fd107, fd74, fd90; +sub.f64 fd108, fd73, fd89; +sub.f64 fd109, fd74, fd90; +add.f64 fd110, fd77, fd99; +add.f64 fd111, fd78, fd100; +sub.f64 fd112, fd77, fd99; +sub.f64 fd113, fd78, fd100; +sub.f64 fd114, fd75, fd92; +add.f64 fd115, fd76, fd91; +add.f64 fd116, fd75, fd92; +sub.f64 fd117, fd76, fd91; +add.f64 fd118, fd79, fd103; +add.f64 fd119, fd80, fd105; +sub.f64 fd120, fd79, fd103; +sub.f64 fd121, fd80, fd105; +add.f64 fd122, %38, %60; +add.f64 fd123, %40, %61; +sub.f64 fd124, %38, %60; +sub.f64 fd125, %40, %61; +add.f64 fd126, %49, %70; +add.f64 fd127, %51, %72; +sub.f64 fd128, %49, %70; +sub.f64 fd129, %51, %72; +add.f64 fd130, fd122, fd126; +add.f64 fd131, fd123, fd127; +sub.f64 fd132, fd122, fd126; +sub.f64 fd133, fd123, fd127; +sub.f64 fd134, fd124, fd129; +add.f64 fd135, fd125, fd128; +add.f64 fd136, fd124, fd129; +sub.f64 fd137, fd125, fd128; +add.f64 fd138, %44, %65; +add.f64 fd139, %45, %67; +sub.f64 fd140, %44, %65; +sub.f64 fd141, %45, %67; +add.f64 fd142, %54, %76; +add.f64 fd143, %56, %77; +sub.f64 fd144, %54, %76; +sub.f64 fd145, %56, %77; +add.f64 fd146, fd138, fd142; +add.f64 fd147, fd139, fd143; +sub.f64 fd148, fd138, fd142; +sub.f64 fd149, fd139, fd143; +sub.f64 fd150, fd140, fd145; +add.f64 fd151, fd141, fd144; +add.f64 fd152, fd140, fd145; +sub.f64 fd153, fd141, fd144; +mul.f64 fd154, fd150, 0d3FE6A09E667F3BCD; +mul.f64 fd155, fd151, 0d3FE6A09E667F3BCD; +sub.f64 fd156, fd154, fd155; +add.f64 fd157, fd154, fd155; +mul.f64 fd158, fd152, 0dBFE6A09E667F3BCD; +mul.f64 fd159, fd153, 0d3FE6A09E667F3BCD; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd153, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd162, fd152, 0d3FE6A09E667F3BCD, fd161; +add.f64 fd163, fd130, fd146; +add.f64 fd164, fd131, fd147; +sub.f64 fd165, fd130, fd146; +sub.f64 fd166, 
fd131, fd147; +add.f64 fd167, fd134, fd156; +add.f64 fd168, fd135, fd157; +sub.f64 fd169, fd134, fd156; +sub.f64 fd170, fd135, fd157; +sub.f64 fd171, fd132, fd149; +add.f64 fd172, fd133, fd148; +add.f64 fd173, fd132, fd149; +sub.f64 fd174, fd133, fd148; +add.f64 fd175, fd136, fd160; +add.f64 fd176, fd137, fd162; +sub.f64 fd177, fd136, fd160; +sub.f64 fd178, fd137, fd162; +mul.f64 fd179, fd167, 0d3FED906BCF328D46; +mul.f64 fd180, fd168, 0d3FD87DE2A6AEA963; +sub.f64 fd181, fd179, fd180; +mul.f64 fd182, fd168, 0d3FED906BCF328D46; +fma.rn.f64 fd183, fd167, 0d3FD87DE2A6AEA963, fd182; +mul.f64 fd184, fd171, 0d3FE6A09E667F3BCD; +mul.f64 fd185, fd172, 0d3FE6A09E667F3BCD; +sub.f64 fd186, fd184, fd185; +add.f64 fd187, fd184, fd185; +mul.f64 fd188, fd175, 0d3FD87DE2A6AEA963; +mul.f64 fd189, fd176, 0d3FED906BCF328D46; +sub.f64 fd190, fd188, fd189; +mul.f64 fd191, fd176, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd192, fd175, 0d3FED906BCF328D46, fd191; +mul.f64 fd193, fd169, 0dBFD87DE2A6AEA963; +mul.f64 fd194, fd170, 0d3FED906BCF328D46; +sub.f64 fd195, fd193, fd194; +mul.f64 fd196, fd170, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd197, fd169, 0d3FED906BCF328D46, fd196; +mul.f64 fd198, fd173, 0dBFE6A09E667F3BCD; +mul.f64 fd199, fd174, 0d3FE6A09E667F3BCD; +sub.f64 fd200, fd198, fd199; +mul.f64 fd201, fd174, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd202, fd173, 0d3FE6A09E667F3BCD, fd201; +mul.f64 fd203, fd177, 0dBFED906BCF328D46; +mul.f64 fd204, fd178, 0d3FD87DE2A6AEA963; +sub.f64 fd205, fd203, fd204; +mul.f64 fd206, fd178, 0dBFED906BCF328D46; +fma.rn.f64 fd207, fd177, 0d3FD87DE2A6AEA963, fd206; +add.f64 fd208, fd106, fd163; +add.f64 fd209, fd107, fd164; +sub.f64 fd210, fd106, fd163; +sub.f64 fd211, fd107, fd164; +add.f64 fd212, fd110, fd181; +add.f64 fd213, fd111, fd183; +sub.f64 fd214, fd110, fd181; +sub.f64 fd215, fd111, fd183; +add.f64 fd216, fd114, fd186; +add.f64 fd217, fd115, fd187; +sub.f64 fd218, fd114, fd186; +sub.f64 fd219, fd115, fd187; +add.f64 fd220, fd118, fd190; +add.f64 fd221, fd119, fd192; 
+sub.f64 fd222, fd118, fd190; +sub.f64 fd223, fd119, fd192; +sub.f64 fd224, fd108, fd166; +add.f64 fd225, fd109, fd165; +add.f64 fd226, fd108, fd166; +sub.f64 fd227, fd109, fd165; +add.f64 fd228, fd112, fd195; +add.f64 fd229, fd113, fd197; +sub.f64 fd230, fd112, fd195; +sub.f64 fd231, fd113, fd197; +add.f64 fd232, fd116, fd200; +add.f64 fd233, fd117, fd202; +sub.f64 fd234, fd116, fd200; +sub.f64 fd235, fd117, fd202; +add.f64 fd236, fd120, fd205; +add.f64 fd237, fd121, fd207; +sub.f64 fd238, fd120, fd205; +sub.f64 fd239, fd121, fd207; +and.b32 r6, r5, 511; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8176; +mov.u64 rd4, %33; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd240, fd241}, [rd5]; +mul.f64 fd244, fd213, fd241; +fma.rn.f64 fd245, fd240, fd212, fd244; +mul.f64 fd246, fd212, fd241; +mul.f64 fd247, fd240, fd213; +sub.f64 fd248, fd247, fd246; +mul.f64 fd249, fd240, fd240; +mul.f64 fd250, fd241, fd241; +sub.f64 fd251, fd249, fd250; +mul.f64 fd252, fd241, fd240; +fma.rn.f64 fd253, fd241, fd240, fd252; +mul.f64 fd254, fd217, fd253; +fma.rn.f64 fd255, fd251, fd216, fd254; +mul.f64 fd256, fd216, fd253; +mul.f64 fd257, fd251, fd217; +sub.f64 fd258, fd257, fd256; +mul.f64 fd259, fd240, fd251; +mul.f64 fd260, fd241, fd253; +sub.f64 fd261, fd259, fd260; +mul.f64 fd262, fd240, fd253; +fma.rn.f64 fd263, fd241, fd251, fd262; +mul.f64 fd264, fd221, fd263; +fma.rn.f64 fd265, fd261, fd220, fd264; +mul.f64 fd266, fd220, fd263; +mul.f64 fd267, fd261, fd221; +sub.f64 fd268, fd267, fd266; +mul.f64 fd269, fd240, fd261; +mul.f64 fd270, fd241, fd263; +sub.f64 fd271, fd269, fd270; +mul.f64 fd272, fd240, fd263; +fma.rn.f64 fd273, fd241, fd261, fd272; +mul.f64 fd274, fd225, fd273; +fma.rn.f64 fd275, fd271, fd224, fd274; +mul.f64 fd276, fd224, fd273; +mul.f64 fd277, fd271, fd225; +sub.f64 fd278, fd277, fd276; +mul.f64 fd279, fd240, fd271; +mul.f64 fd280, fd241, fd273; +sub.f64 fd281, fd279, fd280; +mul.f64 fd282, fd240, fd273; +fma.rn.f64 fd283, fd241, fd271, fd282; 
+mul.f64 fd284, fd229, fd283; +fma.rn.f64 fd285, fd281, fd228, fd284; +mul.f64 fd286, fd228, fd283; +mul.f64 fd287, fd281, fd229; +sub.f64 fd288, fd287, fd286; +mul.f64 fd289, fd240, fd281; +mul.f64 fd290, fd241, fd283; +sub.f64 fd291, fd289, fd290; +mul.f64 fd292, fd240, fd283; +fma.rn.f64 fd293, fd241, fd281, fd292; +mul.f64 fd294, fd233, fd293; +fma.rn.f64 fd295, fd291, fd232, fd294; +mul.f64 fd296, fd232, fd293; +mul.f64 fd297, fd291, fd233; +sub.f64 fd298, fd297, fd296; +mul.f64 fd299, fd240, fd291; +mul.f64 fd300, fd241, fd293; +sub.f64 fd301, fd299, fd300; +mul.f64 fd302, fd240, fd293; +fma.rn.f64 fd303, fd241, fd291, fd302; +mul.f64 fd304, fd237, fd303; +fma.rn.f64 fd305, fd301, fd236, fd304; +mul.f64 fd306, fd236, fd303; +mul.f64 fd307, fd301, fd237; +sub.f64 fd308, fd307, fd306; +mul.f64 fd309, fd240, fd301; +mul.f64 fd310, fd241, fd303; +sub.f64 fd311, fd309, fd310; +mul.f64 fd312, fd240, fd303; +fma.rn.f64 fd313, fd241, fd301, fd312; +mul.f64 fd314, fd211, fd313; +fma.rn.f64 fd315, fd311, fd210, fd314; +mul.f64 fd316, fd210, fd313; +mul.f64 fd317, fd311, fd211; +sub.f64 fd318, fd317, fd316; +ld.global.v2.f64 {fd319, fd320}, [rd5+8192]; +mul.f64 fd323, fd215, fd320; +fma.rn.f64 fd324, fd319, fd214, fd323; +mul.f64 fd325, fd214, fd320; +mul.f64 fd326, fd319, fd215; +sub.f64 fd327, fd326, fd325; +mul.f64 fd328, fd240, fd319; +mul.f64 fd329, fd241, fd320; +sub.f64 fd330, fd328, fd329; +mul.f64 fd331, fd240, fd320; +fma.rn.f64 fd332, fd241, fd319, fd331; +mul.f64 fd333, fd219, fd332; +fma.rn.f64 fd334, fd330, fd218, fd333; +mul.f64 fd335, fd218, fd332; +mul.f64 fd336, fd330, fd219; +sub.f64 fd337, fd336, fd335; +mul.f64 fd338, fd240, fd330; +mul.f64 fd339, fd241, fd332; +sub.f64 fd340, fd338, fd339; +mul.f64 fd341, fd240, fd332; +fma.rn.f64 fd342, fd241, fd330, fd341; +mul.f64 fd343, fd223, fd342; +fma.rn.f64 fd344, fd340, fd222, fd343; +mul.f64 fd345, fd222, fd342; +mul.f64 fd346, fd340, fd223; +sub.f64 fd347, fd346, fd345; +mul.f64 fd348, fd240, fd340; 
+mul.f64 fd349, fd241, fd342; +sub.f64 fd350, fd348, fd349; +mul.f64 fd351, fd240, fd342; +fma.rn.f64 fd352, fd241, fd340, fd351; +mul.f64 fd353, fd227, fd352; +fma.rn.f64 fd354, fd350, fd226, fd353; +mul.f64 fd355, fd226, fd352; +mul.f64 fd356, fd350, fd227; +sub.f64 fd357, fd356, fd355; +mul.f64 fd358, fd240, fd350; +mul.f64 fd359, fd241, fd352; +sub.f64 fd360, fd358, fd359; +mul.f64 fd361, fd240, fd352; +fma.rn.f64 fd362, fd241, fd350, fd361; +mul.f64 fd363, fd231, fd362; +fma.rn.f64 fd364, fd360, fd230, fd363; +mul.f64 fd365, fd230, fd362; +mul.f64 fd366, fd360, fd231; +sub.f64 fd367, fd366, fd365; +mul.f64 fd368, fd240, fd360; +mul.f64 fd369, fd241, fd362; +sub.f64 fd370, fd368, fd369; +mul.f64 fd371, fd240, fd362; +fma.rn.f64 fd372, fd241, fd360, fd371; +mul.f64 fd373, fd235, fd372; +fma.rn.f64 fd374, fd370, fd234, fd373; +mul.f64 fd375, fd234, fd372; +mul.f64 fd376, fd370, fd235; +sub.f64 fd377, fd376, fd375; +mul.f64 fd378, fd240, fd370; +mul.f64 fd379, fd241, fd372; +sub.f64 fd380, fd378, fd379; +mul.f64 fd381, fd240, fd372; +fma.rn.f64 fd382, fd241, fd370, fd381; +mul.f64 fd383, fd239, fd382; +fma.rn.f64 fd384, fd380, fd238, fd383; +mul.f64 fd385, fd238, fd382; +mul.f64 fd386, fd380, fd239; +sub.f64 fd387, fd386, fd385; +shl.b32 r8, r5, 7; +and.b32 r9, r8, -65536; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 65408; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd208, fd245}; +st.shared.v2.f64 [r12+16], {fd255, fd265}; +st.shared.v2.f64 [r12+32], {fd275, fd285}; +st.shared.v2.f64 [r12+48], {fd295, fd305}; +st.shared.v2.f64 [r12+64], {fd315, fd324}; +st.shared.v2.f64 [r12+80], {fd334, fd344}; +st.shared.v2.f64 [r12+96], {fd354, fd364}; +st.shared.v2.f64 [r12+112], {fd374, fd384}; +barrier.sync 0; +mad.lo.s32 r13, r6, -120, r12; +ld.shared.f64 fd388, [r13]; +ld.shared.f64 fd389, [r13+4096]; +ld.shared.f64 fd390, [r13+8192]; +ld.shared.f64 fd391, [r13+12288]; +ld.shared.f64 fd392, [r13+16384]; +ld.shared.f64 fd393, [r13+20480]; +ld.shared.f64 
fd394, [r13+24576]; +ld.shared.f64 fd395, [r13+28672]; +ld.shared.f64 fd396, [r13+32768]; +ld.shared.f64 fd397, [r13+36864]; +ld.shared.f64 fd398, [r13+40960]; +ld.shared.f64 fd399, [r13+45056]; +ld.shared.f64 fd400, [r13+49152]; +ld.shared.f64 fd401, [r13+53248]; +ld.shared.f64 fd402, [r13+57344]; +ld.shared.f64 fd403, [r13+61440]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd209, fd248}; +st.shared.v2.f64 [r12+16], {fd258, fd268}; +st.shared.v2.f64 [r12+32], {fd278, fd288}; +st.shared.v2.f64 [r12+48], {fd298, fd308}; +st.shared.v2.f64 [r12+64], {fd318, fd327}; +st.shared.v2.f64 [r12+80], {fd337, fd347}; +st.shared.v2.f64 [r12+96], {fd357, fd367}; +st.shared.v2.f64 [r12+112], {fd377, fd387}; +barrier.sync 0; +ld.shared.f64 fd404, [r13]; +ld.shared.f64 fd405, [r13+4096]; +ld.shared.f64 fd406, [r13+8192]; +ld.shared.f64 fd407, [r13+12288]; +ld.shared.f64 fd408, [r13+16384]; +ld.shared.f64 fd409, [r13+20480]; +ld.shared.f64 fd410, [r13+24576]; +ld.shared.f64 fd411, [r13+28672]; +ld.shared.f64 fd412, [r13+32768]; +ld.shared.f64 fd413, [r13+36864]; +ld.shared.f64 fd414, [r13+40960]; +ld.shared.f64 fd415, [r13+45056]; +ld.shared.f64 fd416, [r13+49152]; +ld.shared.f64 fd417, [r13+53248]; +ld.shared.f64 fd418, [r13+57344]; +ld.shared.f64 fd419, [r13+61440]; +add.f64 fd420, fd388, fd396; +add.f64 fd421, fd404, fd412; +sub.f64 fd422, fd388, fd396; +sub.f64 fd423, fd404, fd412; +add.f64 fd424, fd392, fd400; +add.f64 fd425, fd408, fd416; +sub.f64 fd426, fd392, fd400; +sub.f64 fd427, fd408, fd416; +add.f64 fd428, fd420, fd424; +add.f64 fd429, fd421, fd425; +sub.f64 fd430, fd420, fd424; +sub.f64 fd431, fd421, fd425; +sub.f64 fd432, fd422, fd427; +add.f64 fd433, fd423, fd426; +add.f64 fd434, fd422, fd427; +sub.f64 fd435, fd423, fd426; +add.f64 fd436, fd390, fd398; +add.f64 fd437, fd406, fd414; +sub.f64 fd438, fd390, fd398; +sub.f64 fd439, fd406, fd414; +add.f64 fd440, fd394, fd402; +add.f64 fd441, fd410, fd418; +sub.f64 fd442, fd394, fd402; +sub.f64 fd443, fd410, fd418; +add.f64 
fd444, fd436, fd440; +add.f64 fd445, fd437, fd441; +sub.f64 fd446, fd436, fd440; +sub.f64 fd447, fd437, fd441; +sub.f64 fd448, fd438, fd443; +add.f64 fd449, fd439, fd442; +add.f64 fd450, fd438, fd443; +sub.f64 fd451, fd439, fd442; +mul.f64 fd452, fd448, 0d3FE6A09E667F3BCD; +mul.f64 fd453, fd449, 0d3FE6A09E667F3BCD; +sub.f64 fd454, fd452, fd453; +add.f64 fd455, fd452, fd453; +mul.f64 fd456, fd450, 0dBFE6A09E667F3BCD; +mul.f64 fd457, fd451, 0d3FE6A09E667F3BCD; +sub.f64 fd458, fd456, fd457; +mul.f64 fd459, fd451, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd460, fd450, 0d3FE6A09E667F3BCD, fd459; +add.f64 fd461, fd428, fd444; +add.f64 fd462, fd429, fd445; +sub.f64 fd463, fd428, fd444; +sub.f64 fd464, fd429, fd445; +add.f64 fd465, fd432, fd454; +add.f64 fd466, fd433, fd455; +sub.f64 fd467, fd432, fd454; +sub.f64 fd468, fd433, fd455; +sub.f64 fd469, fd430, fd447; +add.f64 fd470, fd431, fd446; +add.f64 fd471, fd430, fd447; +sub.f64 fd472, fd431, fd446; +add.f64 fd473, fd434, fd458; +add.f64 fd474, fd435, fd460; +sub.f64 fd475, fd434, fd458; +sub.f64 fd476, fd435, fd460; +add.f64 fd477, fd389, fd397; +add.f64 fd478, fd405, fd413; +sub.f64 fd479, fd389, fd397; +sub.f64 fd480, fd405, fd413; +add.f64 fd481, fd393, fd401; +add.f64 fd482, fd409, fd417; +sub.f64 fd483, fd393, fd401; +sub.f64 fd484, fd409, fd417; +add.f64 fd485, fd477, fd481; +add.f64 fd486, fd478, fd482; +sub.f64 fd487, fd477, fd481; +sub.f64 fd488, fd478, fd482; +sub.f64 fd489, fd479, fd484; +add.f64 fd490, fd480, fd483; +add.f64 fd491, fd479, fd484; +sub.f64 fd492, fd480, fd483; +add.f64 fd493, fd391, fd399; +add.f64 fd494, fd407, fd415; +sub.f64 fd495, fd391, fd399; +sub.f64 fd496, fd407, fd415; +add.f64 fd497, fd395, fd403; +add.f64 fd498, fd411, fd419; +sub.f64 fd499, fd395, fd403; +sub.f64 fd500, fd411, fd419; +add.f64 fd501, fd493, fd497; +add.f64 fd502, fd494, fd498; +sub.f64 fd503, fd493, fd497; +sub.f64 fd504, fd494, fd498; +sub.f64 fd505, fd495, fd500; +add.f64 fd506, fd496, fd499; +add.f64 fd507, fd495, fd500; 
+sub.f64 fd508, fd496, fd499; +mul.f64 fd509, fd505, 0d3FE6A09E667F3BCD; +mul.f64 fd510, fd506, 0d3FE6A09E667F3BCD; +sub.f64 fd511, fd509, fd510; +add.f64 fd512, fd509, fd510; +mul.f64 fd513, fd507, 0dBFE6A09E667F3BCD; +mul.f64 fd514, fd508, 0d3FE6A09E667F3BCD; +sub.f64 fd515, fd513, fd514; +mul.f64 fd516, fd508, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd517, fd507, 0d3FE6A09E667F3BCD, fd516; +add.f64 fd518, fd485, fd501; +add.f64 fd519, fd486, fd502; +sub.f64 fd520, fd485, fd501; +sub.f64 fd521, fd486, fd502; +add.f64 fd522, fd489, fd511; +add.f64 fd523, fd490, fd512; +sub.f64 fd524, fd489, fd511; +sub.f64 fd525, fd490, fd512; +sub.f64 fd526, fd487, fd504; +add.f64 fd527, fd488, fd503; +add.f64 fd528, fd487, fd504; +sub.f64 fd529, fd488, fd503; +add.f64 fd530, fd491, fd515; +add.f64 fd531, fd492, fd517; +sub.f64 fd532, fd491, fd515; +sub.f64 fd533, fd492, fd517; +mul.f64 fd534, fd522, 0d3FED906BCF328D46; +mul.f64 fd535, fd523, 0d3FD87DE2A6AEA963; +sub.f64 fd536, fd534, fd535; +mul.f64 fd537, fd523, 0d3FED906BCF328D46; +fma.rn.f64 fd538, fd522, 0d3FD87DE2A6AEA963, fd537; +mul.f64 fd539, fd526, 0d3FE6A09E667F3BCD; +mul.f64 fd540, fd527, 0d3FE6A09E667F3BCD; +sub.f64 fd541, fd539, fd540; +add.f64 fd542, fd539, fd540; +mul.f64 fd543, fd530, 0d3FD87DE2A6AEA963; +mul.f64 fd544, fd531, 0d3FED906BCF328D46; +sub.f64 fd545, fd543, fd544; +mul.f64 fd546, fd531, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd547, fd530, 0d3FED906BCF328D46, fd546; +mul.f64 fd548, fd524, 0dBFD87DE2A6AEA963; +mul.f64 fd549, fd525, 0d3FED906BCF328D46; +sub.f64 fd550, fd548, fd549; +mul.f64 fd551, fd525, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd552, fd524, 0d3FED906BCF328D46, fd551; +mul.f64 fd553, fd528, 0dBFE6A09E667F3BCD; +mul.f64 fd554, fd529, 0d3FE6A09E667F3BCD; +sub.f64 fd555, fd553, fd554; +mul.f64 fd556, fd529, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd557, fd528, 0d3FE6A09E667F3BCD, fd556; +mul.f64 fd558, fd532, 0dBFED906BCF328D46; +mul.f64 fd559, fd533, 0d3FD87DE2A6AEA963; +sub.f64 fd560, fd558, fd559; +mul.f64 fd561, 
fd533, 0dBFED906BCF328D46; +fma.rn.f64 fd562, fd532, 0d3FD87DE2A6AEA963, fd561; +add.f64 fd563, fd461, fd518; +add.f64 fd564, fd462, fd519; +sub.f64 fd565, fd461, fd518; +sub.f64 fd566, fd462, fd519; +add.f64 fd567, fd465, fd536; +add.f64 fd568, fd466, fd538; +sub.f64 fd569, fd465, fd536; +sub.f64 fd570, fd466, fd538; +add.f64 fd571, fd469, fd541; +add.f64 fd572, fd470, fd542; +sub.f64 fd573, fd469, fd541; +sub.f64 fd574, fd470, fd542; +add.f64 fd575, fd473, fd545; +add.f64 fd576, fd474, fd547; +sub.f64 fd577, fd473, fd545; +sub.f64 fd578, fd474, fd547; +sub.f64 fd579, fd463, fd521; +add.f64 fd580, fd464, fd520; +add.f64 fd581, fd463, fd521; +sub.f64 fd582, fd464, fd520; +add.f64 fd583, fd467, fd550; +add.f64 fd584, fd468, fd552; +sub.f64 fd585, fd467, fd550; +sub.f64 fd586, fd468, fd552; +add.f64 fd587, fd471, fd555; +add.f64 fd588, fd472, fd557; +sub.f64 fd589, fd471, fd555; +sub.f64 fd590, fd472, fd557; +add.f64 fd591, fd475, fd560; +add.f64 fd592, fd476, fd562; +sub.f64 fd593, fd475, fd560; +sub.f64 fd594, fd476, fd562; +and.b32 r14, r5, 496; +cvt.u64.u32 rd6, r14; +mov.u64 rd7, %34; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd595, fd596}, [rd8]; +mul.f64 fd599, fd568, fd596; +fma.rn.f64 fd600, fd595, fd567, fd599; +mul.f64 fd601, fd567, fd596; +mul.f64 fd602, fd595, fd568; +sub.f64 fd603, fd602, fd601; +mul.f64 fd604, fd595, fd595; +mul.f64 fd605, fd596, fd596; +sub.f64 fd606, fd604, fd605; +mul.f64 fd607, fd596, fd595; +fma.rn.f64 fd608, fd596, fd595, fd607; +mul.f64 fd609, fd572, fd608; +fma.rn.f64 fd610, fd606, fd571, fd609; +mul.f64 fd611, fd571, fd608; +mul.f64 fd612, fd606, fd572; +sub.f64 fd613, fd612, fd611; +mul.f64 fd614, fd595, fd606; +mul.f64 fd615, fd596, fd608; +sub.f64 fd616, fd614, fd615; +mul.f64 fd617, fd595, fd608; +fma.rn.f64 fd618, fd596, fd606, fd617; +mul.f64 fd619, fd576, fd618; +fma.rn.f64 fd620, fd616, fd575, fd619; +mul.f64 fd621, fd575, fd618; +mul.f64 fd622, fd616, fd576; +sub.f64 fd623, fd622, fd621; +mul.f64 fd624, fd595, fd616; 
+mul.f64 fd625, fd596, fd618; +sub.f64 fd626, fd624, fd625; +mul.f64 fd627, fd595, fd618; +fma.rn.f64 fd628, fd596, fd616, fd627; +mul.f64 fd629, fd580, fd628; +fma.rn.f64 fd630, fd626, fd579, fd629; +mul.f64 fd631, fd579, fd628; +mul.f64 fd632, fd626, fd580; +sub.f64 fd633, fd632, fd631; +mul.f64 fd634, fd595, fd626; +mul.f64 fd635, fd596, fd628; +sub.f64 fd636, fd634, fd635; +mul.f64 fd637, fd595, fd628; +fma.rn.f64 fd638, fd596, fd626, fd637; +mul.f64 fd639, fd584, fd638; +fma.rn.f64 fd640, fd636, fd583, fd639; +mul.f64 fd641, fd583, fd638; +mul.f64 fd642, fd636, fd584; +sub.f64 fd643, fd642, fd641; +mul.f64 fd644, fd595, fd636; +mul.f64 fd645, fd596, fd638; +sub.f64 fd646, fd644, fd645; +mul.f64 fd647, fd595, fd638; +fma.rn.f64 fd648, fd596, fd636, fd647; +mul.f64 fd649, fd588, fd648; +fma.rn.f64 fd650, fd646, fd587, fd649; +mul.f64 fd651, fd587, fd648; +mul.f64 fd652, fd646, fd588; +sub.f64 fd653, fd652, fd651; +mul.f64 fd654, fd595, fd646; +mul.f64 fd655, fd596, fd648; +sub.f64 fd656, fd654, fd655; +mul.f64 fd657, fd595, fd648; +fma.rn.f64 fd658, fd596, fd646, fd657; +mul.f64 fd659, fd592, fd658; +fma.rn.f64 fd660, fd656, fd591, fd659; +mul.f64 fd661, fd591, fd658; +mul.f64 fd662, fd656, fd592; +sub.f64 fd663, fd662, fd661; +mul.f64 fd664, fd595, fd656; +mul.f64 fd665, fd596, fd658; +sub.f64 fd666, fd664, fd665; +mul.f64 fd667, fd595, fd658; +fma.rn.f64 fd668, fd596, fd656, fd667; +mul.f64 fd669, fd566, fd668; +fma.rn.f64 fd670, fd666, fd565, fd669; +mul.f64 fd671, fd565, fd668; +mul.f64 fd672, fd666, fd566; +sub.f64 fd673, fd672, fd671; +ld.global.v2.f64 {fd674, fd675}, [rd8+512]; +mul.f64 fd678, fd570, fd675; +fma.rn.f64 fd679, fd674, fd569, fd678; +mul.f64 fd680, fd569, fd675; +mul.f64 fd681, fd674, fd570; +sub.f64 fd682, fd681, fd680; +mul.f64 fd683, fd595, fd674; +mul.f64 fd684, fd596, fd675; +sub.f64 fd685, fd683, fd684; +mul.f64 fd686, fd595, fd675; +fma.rn.f64 fd687, fd596, fd674, fd686; +mul.f64 fd688, fd574, fd687; +fma.rn.f64 fd689, fd685, fd573, 
fd688; +mul.f64 fd690, fd573, fd687; +mul.f64 fd691, fd685, fd574; +sub.f64 fd692, fd691, fd690; +mul.f64 fd693, fd595, fd685; +mul.f64 fd694, fd596, fd687; +sub.f64 fd695, fd693, fd694; +mul.f64 fd696, fd595, fd687; +fma.rn.f64 fd697, fd596, fd685, fd696; +mul.f64 fd698, fd578, fd697; +fma.rn.f64 fd699, fd695, fd577, fd698; +mul.f64 fd700, fd577, fd697; +mul.f64 fd701, fd695, fd578; +sub.f64 fd702, fd701, fd700; +mul.f64 fd703, fd595, fd695; +mul.f64 fd704, fd596, fd697; +sub.f64 fd705, fd703, fd704; +mul.f64 fd706, fd595, fd697; +fma.rn.f64 fd707, fd596, fd695, fd706; +mul.f64 fd708, fd582, fd707; +fma.rn.f64 fd709, fd705, fd581, fd708; +mul.f64 fd710, fd581, fd707; +mul.f64 fd711, fd705, fd582; +sub.f64 fd712, fd711, fd710; +mul.f64 fd713, fd595, fd705; +mul.f64 fd714, fd596, fd707; +sub.f64 fd715, fd713, fd714; +mul.f64 fd716, fd595, fd707; +fma.rn.f64 fd717, fd596, fd705, fd716; +mul.f64 fd718, fd586, fd717; +fma.rn.f64 fd719, fd715, fd585, fd718; +mul.f64 fd720, fd585, fd717; +mul.f64 fd721, fd715, fd586; +sub.f64 fd722, fd721, fd720; +mul.f64 fd723, fd595, fd715; +mul.f64 fd724, fd596, fd717; +sub.f64 fd725, fd723, fd724; +mul.f64 fd726, fd595, fd717; +fma.rn.f64 fd727, fd596, fd715, fd726; +mul.f64 fd728, fd590, fd727; +fma.rn.f64 fd729, fd725, fd589, fd728; +mul.f64 fd730, fd589, fd727; +mul.f64 fd731, fd725, fd590; +sub.f64 fd732, fd731, fd730; +mul.f64 fd733, fd595, fd725; +mul.f64 fd734, fd596, fd727; +sub.f64 fd735, fd733, fd734; +mul.f64 fd736, fd595, fd727; +fma.rn.f64 fd737, fd596, fd725, fd736; +mul.f64 fd738, fd594, fd737; +fma.rn.f64 fd739, fd735, fd593, fd738; +mul.f64 fd740, fd593, fd737; +mul.f64 fd741, fd735, fd594; +sub.f64 fd742, fd741, fd740; +shl.b32 r15, r5, 3; +and.b32 r16, r15, 120; +add.s32 r17, r10, r16; +barrier.sync 0; +and.b32 r18, r8, 63488; +add.s32 r19, r17, r18; +st.shared.f64 [r19], fd563; +st.shared.f64 [r19+128], fd600; +st.shared.f64 [r19+256], fd610; +st.shared.f64 [r19+384], fd620; +st.shared.f64 [r19+512], fd630; 
+st.shared.f64 [r19+640], fd640; +st.shared.f64 [r19+768], fd650; +st.shared.f64 [r19+896], fd660; +st.shared.f64 [r19+1024], fd670; +st.shared.f64 [r19+1152], fd679; +st.shared.f64 [r19+1280], fd689; +st.shared.f64 [r19+1408], fd699; +st.shared.f64 [r19+1536], fd709; +st.shared.f64 [r19+1664], fd719; +st.shared.f64 [r19+1792], fd729; +st.shared.f64 [r19+1920], fd739; +barrier.sync 0; +mad.lo.s32 r20, r14, -120, r19; +ld.shared.f64 fd743, [r20]; +ld.shared.f64 fd744, [r20+4096]; +ld.shared.f64 fd745, [r20+8192]; +ld.shared.f64 fd746, [r20+12288]; +ld.shared.f64 fd747, [r20+16384]; +ld.shared.f64 fd748, [r20+20480]; +ld.shared.f64 fd749, [r20+24576]; +ld.shared.f64 fd750, [r20+28672]; +ld.shared.f64 fd751, [r20+32768]; +ld.shared.f64 fd752, [r20+36864]; +ld.shared.f64 fd753, [r20+40960]; +ld.shared.f64 fd754, [r20+45056]; +ld.shared.f64 fd755, [r20+49152]; +ld.shared.f64 fd756, [r20+53248]; +ld.shared.f64 fd757, [r20+57344]; +ld.shared.f64 fd758, [r20+61440]; +barrier.sync 0; +st.shared.f64 [r19], fd564; +st.shared.f64 [r19+128], fd603; +st.shared.f64 [r19+256], fd613; +st.shared.f64 [r19+384], fd623; +st.shared.f64 [r19+512], fd633; +st.shared.f64 [r19+640], fd643; +st.shared.f64 [r19+768], fd653; +st.shared.f64 [r19+896], fd663; +st.shared.f64 [r19+1024], fd673; +st.shared.f64 [r19+1152], fd682; +st.shared.f64 [r19+1280], fd692; +st.shared.f64 [r19+1408], fd702; +st.shared.f64 [r19+1536], fd712; +st.shared.f64 [r19+1664], fd722; +st.shared.f64 [r19+1792], fd732; +st.shared.f64 [r19+1920], fd742; +barrier.sync 0; +ld.shared.f64 fd759, [r20]; +ld.shared.f64 fd760, [r20+4096]; +ld.shared.f64 fd761, [r20+8192]; +ld.shared.f64 fd762, [r20+12288]; +ld.shared.f64 fd763, [r20+16384]; +ld.shared.f64 fd764, [r20+20480]; +ld.shared.f64 fd765, [r20+24576]; +ld.shared.f64 fd766, [r20+28672]; +ld.shared.f64 fd767, [r20+32768]; +ld.shared.f64 fd768, [r20+36864]; +ld.shared.f64 fd769, [r20+40960]; +ld.shared.f64 fd770, [r20+45056]; +ld.shared.f64 fd771, [r20+49152]; 
+ld.shared.f64 fd772, [r20+53248]; +ld.shared.f64 fd773, [r20+57344]; +ld.shared.f64 fd774, [r20+61440]; +add.f64 fd775, fd743, fd751; +add.f64 fd776, fd759, fd767; +sub.f64 fd777, fd743, fd751; +sub.f64 fd778, fd759, fd767; +add.f64 fd779, fd747, fd755; +add.f64 fd780, fd763, fd771; +sub.f64 fd781, fd747, fd755; +sub.f64 fd782, fd763, fd771; +add.f64 fd783, fd775, fd779; +add.f64 fd784, fd776, fd780; +sub.f64 fd785, fd775, fd779; +sub.f64 fd786, fd776, fd780; +sub.f64 fd787, fd777, fd782; +add.f64 fd788, fd778, fd781; +add.f64 fd789, fd777, fd782; +sub.f64 fd790, fd778, fd781; +add.f64 fd791, fd745, fd753; +add.f64 fd792, fd761, fd769; +sub.f64 fd793, fd745, fd753; +sub.f64 fd794, fd761, fd769; +add.f64 fd795, fd749, fd757; +add.f64 fd796, fd765, fd773; +sub.f64 fd797, fd749, fd757; +sub.f64 fd798, fd765, fd773; +add.f64 fd799, fd791, fd795; +add.f64 fd800, fd792, fd796; +sub.f64 fd801, fd791, fd795; +sub.f64 fd802, fd792, fd796; +sub.f64 fd803, fd793, fd798; +add.f64 fd804, fd794, fd797; +add.f64 fd805, fd793, fd798; +sub.f64 fd806, fd794, fd797; +mul.f64 fd807, fd803, 0d3FE6A09E667F3BCD; +mul.f64 fd808, fd804, 0d3FE6A09E667F3BCD; +sub.f64 fd809, fd807, fd808; +add.f64 fd810, fd807, fd808; +mul.f64 fd811, fd805, 0dBFE6A09E667F3BCD; +mul.f64 fd812, fd806, 0d3FE6A09E667F3BCD; +sub.f64 fd813, fd811, fd812; +mul.f64 fd814, fd806, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd815, fd805, 0d3FE6A09E667F3BCD, fd814; +add.f64 fd816, fd783, fd799; +add.f64 fd817, fd784, fd800; +sub.f64 fd818, fd783, fd799; +sub.f64 fd819, fd784, fd800; +add.f64 fd820, fd787, fd809; +add.f64 fd821, fd788, fd810; +sub.f64 fd822, fd787, fd809; +sub.f64 fd823, fd788, fd810; +sub.f64 fd824, fd785, fd802; +add.f64 fd825, fd786, fd801; +add.f64 fd826, fd785, fd802; +sub.f64 fd827, fd786, fd801; +add.f64 fd828, fd789, fd813; +add.f64 fd829, fd790, fd815; +sub.f64 fd830, fd789, fd813; +sub.f64 fd831, fd790, fd815; +add.f64 fd832, fd744, fd752; +add.f64 fd833, fd760, fd768; +sub.f64 fd834, fd744, fd752; 
+sub.f64 fd835, fd760, fd768; +add.f64 fd836, fd748, fd756; +add.f64 fd837, fd764, fd772; +sub.f64 fd838, fd748, fd756; +sub.f64 fd839, fd764, fd772; +add.f64 fd840, fd832, fd836; +add.f64 fd841, fd833, fd837; +sub.f64 fd842, fd832, fd836; +sub.f64 fd843, fd833, fd837; +sub.f64 fd844, fd834, fd839; +add.f64 fd845, fd835, fd838; +add.f64 fd846, fd834, fd839; +sub.f64 fd847, fd835, fd838; +add.f64 fd848, fd746, fd754; +add.f64 fd849, fd762, fd770; +sub.f64 fd850, fd746, fd754; +sub.f64 fd851, fd762, fd770; +add.f64 fd852, fd750, fd758; +add.f64 fd853, fd766, fd774; +sub.f64 fd854, fd750, fd758; +sub.f64 fd855, fd766, fd774; +add.f64 fd856, fd848, fd852; +add.f64 fd857, fd849, fd853; +sub.f64 fd858, fd848, fd852; +sub.f64 fd859, fd849, fd853; +sub.f64 fd860, fd850, fd855; +add.f64 fd861, fd851, fd854; +add.f64 fd862, fd850, fd855; +sub.f64 fd863, fd851, fd854; +mul.f64 fd864, fd860, 0d3FE6A09E667F3BCD; +mul.f64 fd865, fd861, 0d3FE6A09E667F3BCD; +sub.f64 fd866, fd864, fd865; +add.f64 fd867, fd864, fd865; +mul.f64 fd868, fd862, 0dBFE6A09E667F3BCD; +mul.f64 fd869, fd863, 0d3FE6A09E667F3BCD; +sub.f64 fd870, fd868, fd869; +mul.f64 fd871, fd863, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd872, fd862, 0d3FE6A09E667F3BCD, fd871; +add.f64 fd873, fd840, fd856; +add.f64 fd874, fd841, fd857; +sub.f64 fd875, fd840, fd856; +sub.f64 fd876, fd841, fd857; +add.f64 fd877, fd844, fd866; +add.f64 fd878, fd845, fd867; +sub.f64 fd879, fd844, fd866; +sub.f64 fd880, fd845, fd867; +sub.f64 fd881, fd842, fd859; +add.f64 fd882, fd843, fd858; +add.f64 fd883, fd842, fd859; +sub.f64 fd884, fd843, fd858; +add.f64 fd885, fd846, fd870; +add.f64 fd886, fd847, fd872; +sub.f64 fd887, fd846, fd870; +sub.f64 fd888, fd847, fd872; +mul.f64 fd889, fd877, 0d3FED906BCF328D46; +mul.f64 fd890, fd878, 0d3FD87DE2A6AEA963; +sub.f64 fd891, fd889, fd890; +mul.f64 fd892, fd878, 0d3FED906BCF328D46; +fma.rn.f64 fd893, fd877, 0d3FD87DE2A6AEA963, fd892; +mul.f64 fd894, fd881, 0d3FE6A09E667F3BCD; +mul.f64 fd895, fd882, 
0d3FE6A09E667F3BCD; +sub.f64 fd896, fd894, fd895; +add.f64 fd897, fd894, fd895; +mul.f64 fd898, fd885, 0d3FD87DE2A6AEA963; +mul.f64 fd899, fd886, 0d3FED906BCF328D46; +sub.f64 fd900, fd898, fd899; +mul.f64 fd901, fd886, 0d3FD87DE2A6AEA963; +fma.rn.f64 fd902, fd885, 0d3FED906BCF328D46, fd901; +mul.f64 fd903, fd879, 0dBFD87DE2A6AEA963; +mul.f64 fd904, fd880, 0d3FED906BCF328D46; +sub.f64 fd905, fd903, fd904; +mul.f64 fd906, fd880, 0dBFD87DE2A6AEA963; +fma.rn.f64 fd907, fd879, 0d3FED906BCF328D46, fd906; +mul.f64 fd908, fd883, 0dBFE6A09E667F3BCD; +mul.f64 fd909, fd884, 0d3FE6A09E667F3BCD; +sub.f64 fd910, fd908, fd909; +mul.f64 fd911, fd884, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd912, fd883, 0d3FE6A09E667F3BCD, fd911; +mul.f64 fd913, fd887, 0dBFED906BCF328D46; +mul.f64 fd914, fd888, 0d3FD87DE2A6AEA963; +sub.f64 fd915, fd913, fd914; +mul.f64 fd916, fd888, 0dBFED906BCF328D46; +fma.rn.f64 fd917, fd887, 0d3FD87DE2A6AEA963, fd916; +add.f64 fd918, fd816, fd873; +add.f64 fd919, fd817, fd874; +sub.f64 fd920, fd816, fd873; +sub.f64 fd921, fd817, fd874; +add.f64 fd922, fd820, fd891; +add.f64 fd923, fd821, fd893; +sub.f64 fd924, fd820, fd891; +sub.f64 fd925, fd821, fd893; +add.f64 fd926, fd824, fd896; +add.f64 fd927, fd825, fd897; +sub.f64 fd928, fd824, fd896; +sub.f64 fd929, fd825, fd897; +add.f64 fd930, fd828, fd900; +add.f64 fd931, fd829, fd902; +sub.f64 fd932, fd828, fd900; +sub.f64 fd933, fd829, fd902; +sub.f64 fd934, fd818, fd876; +add.f64 fd935, fd819, fd875; +add.f64 fd936, fd818, fd876; +sub.f64 fd937, fd819, fd875; +add.f64 fd938, fd822, fd905; +add.f64 fd939, fd823, fd907; +sub.f64 fd940, fd822, fd905; +sub.f64 fd941, fd823, fd907; +add.f64 fd942, fd826, fd910; +add.f64 fd943, fd827, fd912; +sub.f64 fd944, fd826, fd910; +sub.f64 fd945, fd827, fd912; +add.f64 fd946, fd830, fd915; +add.f64 fd947, fd831, fd917; +sub.f64 fd948, fd830, fd915; +sub.f64 fd949, fd831, fd917; +and.b32 r21, r5, 256; +bfe.u32 r22, r5, 8, 1; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %35; +add.s64 rd11, 
rd10, rd9; +ld.global.v2.f64 {fd950, fd951}, [rd11]; +mul.f64 fd954, fd923, fd951; +fma.rn.f64 fd955, fd950, fd922, fd954; +mul.f64 fd956, fd922, fd951; +mul.f64 fd957, fd950, fd923; +sub.f64 fd958, fd957, fd956; +mul.f64 fd959, fd950, fd950; +mul.f64 fd960, fd951, fd951; +sub.f64 fd961, fd959, fd960; +mul.f64 fd962, fd951, fd950; +fma.rn.f64 fd963, fd951, fd950, fd962; +mul.f64 fd964, fd927, fd963; +fma.rn.f64 fd965, fd961, fd926, fd964; +mul.f64 fd966, fd926, fd963; +mul.f64 fd967, fd961, fd927; +sub.f64 fd968, fd967, fd966; +mul.f64 fd969, fd950, fd961; +mul.f64 fd970, fd951, fd963; +sub.f64 fd971, fd969, fd970; +mul.f64 fd972, fd950, fd963; +fma.rn.f64 fd973, fd951, fd961, fd972; +mul.f64 fd974, fd931, fd973; +fma.rn.f64 fd975, fd971, fd930, fd974; +mul.f64 fd976, fd930, fd973; +mul.f64 fd977, fd971, fd931; +sub.f64 fd978, fd977, fd976; +mul.f64 fd979, fd950, fd971; +mul.f64 fd980, fd951, fd973; +sub.f64 fd981, fd979, fd980; +mul.f64 fd982, fd950, fd973; +fma.rn.f64 fd983, fd951, fd971, fd982; +mul.f64 fd984, fd935, fd983; +fma.rn.f64 fd985, fd981, fd934, fd984; +mul.f64 fd986, fd934, fd983; +mul.f64 fd987, fd981, fd935; +sub.f64 fd988, fd987, fd986; +mul.f64 fd989, fd950, fd981; +mul.f64 fd990, fd951, fd983; +sub.f64 fd991, fd989, fd990; +mul.f64 fd992, fd950, fd983; +fma.rn.f64 fd993, fd951, fd981, fd992; +mul.f64 fd994, fd939, fd993; +fma.rn.f64 fd995, fd991, fd938, fd994; +mul.f64 fd996, fd938, fd993; +mul.f64 fd997, fd991, fd939; +sub.f64 fd998, fd997, fd996; +mul.f64 fd999, fd950, fd991; +mul.f64 fd1000, fd951, fd993; +sub.f64 fd1001, fd999, fd1000; +mul.f64 fd1002, fd950, fd993; +fma.rn.f64 fd1003, fd951, fd991, fd1002; +mul.f64 fd1004, fd943, fd1003; +fma.rn.f64 fd1005, fd1001, fd942, fd1004; +mul.f64 fd1006, fd942, fd1003; +mul.f64 fd1007, fd1001, fd943; +sub.f64 fd1008, fd1007, fd1006; +mul.f64 fd1009, fd950, fd1001; +mul.f64 fd1010, fd951, fd1003; +sub.f64 fd1011, fd1009, fd1010; +mul.f64 fd1012, fd950, fd1003; +fma.rn.f64 fd1013, fd951, fd1001, 
fd1012; +mul.f64 fd1014, fd947, fd1013; +fma.rn.f64 fd1015, fd1011, fd946, fd1014; +mul.f64 fd1016, fd946, fd1013; +mul.f64 fd1017, fd1011, fd947; +sub.f64 fd1018, fd1017, fd1016; +mul.f64 fd1019, fd950, fd1011; +mul.f64 fd1020, fd951, fd1013; +sub.f64 fd1021, fd1019, fd1020; +mul.f64 fd1022, fd950, fd1013; +fma.rn.f64 fd1023, fd951, fd1011, fd1022; +mul.f64 fd1024, fd921, fd1023; +fma.rn.f64 fd1025, fd1021, fd920, fd1024; +mul.f64 fd1026, fd920, fd1023; +mul.f64 fd1027, fd1021, fd921; +sub.f64 fd1028, fd1027, fd1026; +ld.global.v2.f64 {fd1029, fd1030}, [rd11+32]; +mul.f64 fd1033, fd925, fd1030; +fma.rn.f64 fd1034, fd1029, fd924, fd1033; +mul.f64 fd1035, fd924, fd1030; +mul.f64 fd1036, fd1029, fd925; +sub.f64 fd1037, fd1036, fd1035; +mul.f64 fd1038, fd950, fd1029; +mul.f64 fd1039, fd951, fd1030; +sub.f64 fd1040, fd1038, fd1039; +mul.f64 fd1041, fd950, fd1030; +fma.rn.f64 fd1042, fd951, fd1029, fd1041; +mul.f64 fd1043, fd929, fd1042; +fma.rn.f64 fd1044, fd1040, fd928, fd1043; +mul.f64 fd1045, fd928, fd1042; +mul.f64 fd1046, fd1040, fd929; +sub.f64 fd1047, fd1046, fd1045; +mul.f64 fd1048, fd950, fd1040; +mul.f64 fd1049, fd951, fd1042; +sub.f64 fd1050, fd1048, fd1049; +mul.f64 fd1051, fd950, fd1042; +fma.rn.f64 fd1052, fd951, fd1040, fd1051; +mul.f64 fd1053, fd933, fd1052; +fma.rn.f64 fd1054, fd1050, fd932, fd1053; +mul.f64 fd1055, fd932, fd1052; +mul.f64 fd1056, fd1050, fd933; +sub.f64 fd1057, fd1056, fd1055; +mul.f64 fd1058, fd950, fd1050; +mul.f64 fd1059, fd951, fd1052; +sub.f64 fd1060, fd1058, fd1059; +mul.f64 fd1061, fd950, fd1052; +fma.rn.f64 fd1062, fd951, fd1050, fd1061; +mul.f64 fd1063, fd937, fd1062; +fma.rn.f64 fd1064, fd1060, fd936, fd1063; +mul.f64 fd1065, fd936, fd1062; +mul.f64 fd1066, fd1060, fd937; +sub.f64 fd1067, fd1066, fd1065; +mul.f64 fd1068, fd950, fd1060; +mul.f64 fd1069, fd951, fd1062; +sub.f64 fd1070, fd1068, fd1069; +mul.f64 fd1071, fd950, fd1062; +fma.rn.f64 fd1072, fd951, fd1060, fd1071; +mul.f64 fd1073, fd941, fd1072; +fma.rn.f64 fd1074, 
fd1070, fd940, fd1073; +mul.f64 fd1075, fd940, fd1072; +mul.f64 fd1076, fd1070, fd941; +sub.f64 fd1077, fd1076, fd1075; +mul.f64 fd1078, fd950, fd1070; +mul.f64 fd1079, fd951, fd1072; +sub.f64 fd1080, fd1078, fd1079; +mul.f64 fd1081, fd950, fd1072; +fma.rn.f64 fd1082, fd951, fd1070, fd1081; +mul.f64 fd1083, fd945, fd1082; +fma.rn.f64 fd1084, fd1080, fd944, fd1083; +mul.f64 fd1085, fd944, fd1082; +mul.f64 fd1086, fd1080, fd945; +sub.f64 fd1087, fd1086, fd1085; +mul.f64 fd1088, fd950, fd1080; +mul.f64 fd1089, fd951, fd1082; +sub.f64 fd1090, fd1088, fd1089; +mul.f64 fd1091, fd950, fd1082; +fma.rn.f64 fd1092, fd951, fd1080, fd1091; +mul.f64 fd1093, fd949, fd1092; +fma.rn.f64 fd1094, fd1090, fd948, fd1093; +mul.f64 fd1095, fd948, fd1092; +mul.f64 fd1096, fd1090, fd949; +sub.f64 fd1097, fd1096, fd1095; +and.b32 r23, r15, 2040; +add.s32 r24, r10, r23; +barrier.sync 0; +and.b32 r25, r8, 32768; +add.s32 r26, r24, r25; +st.shared.f64 [r26], fd918; +st.shared.f64 [r26+2048], fd955; +st.shared.f64 [r26+4096], fd965; +st.shared.f64 [r26+6144], fd975; +st.shared.f64 [r26+8192], fd985; +st.shared.f64 [r26+10240], fd995; +st.shared.f64 [r26+12288], fd1005; +st.shared.f64 [r26+14336], fd1015; +st.shared.f64 [r26+16384], fd1025; +st.shared.f64 [r26+18432], fd1034; +st.shared.f64 [r26+20480], fd1044; +st.shared.f64 [r26+22528], fd1054; +st.shared.f64 [r26+24576], fd1064; +st.shared.f64 [r26+26624], fd1074; +st.shared.f64 [r26+28672], fd1084; +st.shared.f64 [r26+30720], fd1094; +barrier.sync 0; +mad.lo.s32 r27, r21, -120, r26; +ld.shared.f64 fd1098, [r27]; +ld.shared.f64 fd1099, [r27+4096]; +ld.shared.f64 fd1100, [r27+8192]; +ld.shared.f64 fd1101, [r27+12288]; +ld.shared.f64 fd1102, [r27+16384]; +ld.shared.f64 fd1103, [r27+20480]; +ld.shared.f64 fd1104, [r27+24576]; +ld.shared.f64 fd1105, [r27+28672]; +ld.shared.f64 fd1106, [r27+32768]; +ld.shared.f64 fd1107, [r27+36864]; +ld.shared.f64 fd1108, [r27+40960]; +ld.shared.f64 fd1109, [r27+45056]; +ld.shared.f64 fd1110, [r27+49152]; 
+ld.shared.f64 fd1111, [r27+53248]; +ld.shared.f64 fd1112, [r27+57344]; +ld.shared.f64 fd1113, [r27+61440]; +barrier.sync 0; +st.shared.f64 [r26], fd919; +st.shared.f64 [r26+2048], fd958; +st.shared.f64 [r26+4096], fd968; +st.shared.f64 [r26+6144], fd978; +st.shared.f64 [r26+8192], fd988; +st.shared.f64 [r26+10240], fd998; +st.shared.f64 [r26+12288], fd1008; +st.shared.f64 [r26+14336], fd1018; +st.shared.f64 [r26+16384], fd1028; +st.shared.f64 [r26+18432], fd1037; +st.shared.f64 [r26+20480], fd1047; +st.shared.f64 [r26+22528], fd1057; +st.shared.f64 [r26+24576], fd1067; +st.shared.f64 [r26+26624], fd1077; +st.shared.f64 [r26+28672], fd1087; +st.shared.f64 [r26+30720], fd1097; +barrier.sync 0; +ld.shared.f64 fd1114, [r27]; +ld.shared.f64 fd1115, [r27+4096]; +ld.shared.f64 fd1116, [r27+8192]; +ld.shared.f64 fd1117, [r27+12288]; +ld.shared.f64 fd1118, [r27+16384]; +ld.shared.f64 fd1119, [r27+20480]; +ld.shared.f64 fd1120, [r27+24576]; +ld.shared.f64 fd1121, [r27+28672]; +ld.shared.f64 fd1122, [r27+32768]; +ld.shared.f64 fd1123, [r27+36864]; +ld.shared.f64 fd1124, [r27+40960]; +ld.shared.f64 fd1125, [r27+45056]; +ld.shared.f64 fd1126, [r27+49152]; +ld.shared.f64 fd1127, [r27+53248]; +ld.shared.f64 fd1128, [r27+57344]; +ld.shared.f64 fd1129, [r27+61440]; +add.f64 %0, fd1098, fd1106; +add.f64 %1, fd1114, fd1122; +add.f64 %2, fd1099, fd1107; +add.f64 %3, fd1115, fd1123; +add.f64 %4, fd1100, fd1108; +add.f64 %5, fd1116, fd1124; +add.f64 %6, fd1101, fd1109; +add.f64 %7, fd1117, fd1125; +add.f64 %8, fd1102, fd1110; +add.f64 %9, fd1118, fd1126; +add.f64 %10, fd1103, fd1111; +add.f64 %11, fd1119, fd1127; +add.f64 %12, fd1104, fd1112; +add.f64 %13, fd1120, fd1128; +add.f64 %14, fd1105, fd1113; +add.f64 %15, fd1121, fd1129; +sub.f64 %16, fd1098, fd1106; +sub.f64 %17, fd1114, fd1122; +sub.f64 %18, fd1099, fd1107; +sub.f64 %19, fd1115, fd1123; +sub.f64 %20, fd1100, fd1108; +sub.f64 %21, fd1116, fd1124; +sub.f64 %22, fd1101, fd1109; +sub.f64 %23, fd1117, fd1125; +sub.f64 %24, 
fd1102, fd1110; +sub.f64 %25, fd1118, fd1126; +sub.f64 %26, fd1103, fd1111; +sub.f64 %27, fd1119, fd1127; +sub.f64 %28, fd1104, fd1112; +sub.f64 %29, fd1120, fd1128; +sub.f64 %30, fd1105, fd1113; +sub.f64 %31, fd1121, fd1129; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y): "r"(smem), "l"(lut_dp_16_8192), "l"(lut_dp_16_512), "l"(lut_dp_16_32), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1169, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<35>; +.reg .f64 fd<677>; +.reg .b64 rd<15>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 17; +mov.u32 r3, %16; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd33, %21, %31; +add.f64 fd34, %22, %33; +sub.f64 fd35, %21, %31; +sub.f64 fd36, %22, %33; +add.f64 fd37, %26, 
%37; +add.f64 fd38, %28, %38; +sub.f64 fd39, %26, %37; +sub.f64 fd40, %28, %38; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %23, %34; +add.f64 fd50, %25, %36; +sub.f64 fd51, %23, %34; +sub.f64 fd52, %25, %36; +add.f64 fd53, %29, %39; +add.f64 fd54, %30, %40; +sub.f64 fd55, %29, %39; +sub.f64 fd56, %30, %40; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; +mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +sub.f64 fd74, fd41, fd57; +sub.f64 fd75, fd42, fd58; +add.f64 fd76, fd45, fd67; +add.f64 fd77, fd46, fd68; +sub.f64 fd78, fd45, fd67; +sub.f64 fd79, fd46, fd68; +sub.f64 fd80, fd43, fd60; +add.f64 fd81, fd44, fd59; +add.f64 fd82, fd43, fd60; +sub.f64 fd83, fd44, fd59; +add.f64 fd84, fd47, fd71; +add.f64 fd85, fd48, fd73; +sub.f64 fd86, fd47, fd71; +sub.f64 fd87, fd48, fd73; +and.b32 r6, r5, 1023; +shl.b32 r7, r5, 7; +and.b32 r8, r7, -131072; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16368; +mov.u64 rd4, %17; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd88, fd89}, [rd5]; +mul.f64 fd92, fd77, fd89; +mul.f64 fd93, fd76, fd89; +mul.f64 fd94, fd88, fd77; +mul.f64 fd95, fd88, fd88; +mul.f64 fd96, fd89, fd89; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd89, fd88; +fma.rn.f64 fd99, fd89, fd88, fd98; +mul.f64 fd100, fd81, fd99; +mul.f64 fd101, fd80, fd99; +mul.f64 fd102, fd97, fd81; +mul.f64 fd103, fd88, 
fd97; +mul.f64 fd104, fd89, fd99; +sub.f64 fd105, fd103, fd104; +mul.f64 fd106, fd88, fd99; +fma.rn.f64 fd107, fd89, fd97, fd106; +mul.f64 fd108, fd85, fd107; +mul.f64 fd109, fd84, fd107; +mul.f64 fd110, fd105, fd85; +mul.f64 fd111, fd88, fd105; +mul.f64 fd112, fd89, fd107; +sub.f64 fd113, fd111, fd112; +mul.f64 fd114, fd88, fd107; +fma.rn.f64 fd115, fd89, fd105, fd114; +mul.f64 fd116, fd75, fd115; +mul.f64 fd117, fd74, fd115; +mul.f64 fd118, fd113, fd75; +ld.global.v2.f64 {fd119, fd120}, [rd5+16384]; +mul.f64 fd123, fd79, fd120; +mul.f64 fd124, fd78, fd120; +mul.f64 fd125, fd119, fd79; +mul.f64 fd126, fd88, fd119; +mul.f64 fd127, fd89, fd120; +sub.f64 fd128, fd126, fd127; +mul.f64 fd129, fd88, fd120; +fma.rn.f64 fd130, fd89, fd119, fd129; +mul.f64 fd131, fd83, fd130; +mul.f64 fd132, fd82, fd130; +mul.f64 fd133, fd128, fd83; +mul.f64 fd134, fd88, fd128; +mul.f64 fd135, fd89, fd130; +sub.f64 fd136, fd134, fd135; +mul.f64 fd137, fd88, fd130; +fma.rn.f64 fd138, fd89, fd128, fd137; +mul.f64 fd139, fd87, fd138; +mul.f64 fd140, fd86, fd138; +mul.f64 fd141, fd136, fd87; +barrier.sync 0; +and.b32 r11, r7, 130944; +add.s32 r12, r9, r11; +add.f64 fd142, fd42, fd58; +add.f64 fd143, fd41, fd57; +st.shared.v2.f64 [r12], {fd143, fd142}; +fma.rn.f64 fd144, fd88, fd76, fd92; +sub.f64 fd145, fd94, fd93; +st.shared.v2.f64 [r12+16], {fd144, fd145}; +fma.rn.f64 fd146, fd97, fd80, fd100; +sub.f64 fd147, fd102, fd101; +st.shared.v2.f64 [r12+32], {fd146, fd147}; +sub.f64 fd148, fd110, fd109; +fma.rn.f64 fd149, fd105, fd84, fd108; +st.shared.v2.f64 [r12+48], {fd149, fd148}; +fma.rn.f64 fd150, fd113, fd74, fd116; +sub.f64 fd151, fd118, fd117; +st.shared.v2.f64 [r12+64], {fd150, fd151}; +fma.rn.f64 fd152, fd119, fd78, fd123; +sub.f64 fd153, fd125, fd124; +st.shared.v2.f64 [r12+80], {fd152, fd153}; +fma.rn.f64 fd154, fd128, fd82, fd131; +sub.f64 fd155, fd133, fd132; +st.shared.v2.f64 [r12+96], {fd154, fd155}; +sub.f64 fd156, fd141, fd140; +fma.rn.f64 fd157, fd136, fd86, fd139; 
+st.shared.v2.f64 [r12+112], {fd157, fd156}; +barrier.sync 0; +mad.lo.s32 r13, r6, -112, r12; +ld.shared.v2.f64 {fd158, fd159}, [r13]; +ld.shared.v2.f64 {fd162, fd163}, [r13+16384]; +ld.shared.v2.f64 {fd166, fd167}, [r13+32768]; +ld.shared.v2.f64 {fd170, fd171}, [r13+49152]; +ld.shared.v2.f64 {fd174, fd175}, [r13+65536]; +ld.shared.v2.f64 {fd178, fd179}, [r13+81920]; +ld.shared.v2.f64 {fd182, fd183}, [r13+98304]; +ld.shared.v2.f64 {fd186, fd187}, [r13+114688]; +add.f64 fd190, fd158, fd174; +add.f64 fd191, fd159, fd175; +sub.f64 fd192, fd158, fd174; +sub.f64 fd193, fd159, fd175; +add.f64 fd194, fd166, fd182; +add.f64 fd195, fd167, fd183; +sub.f64 fd196, fd166, fd182; +sub.f64 fd197, fd167, fd183; +add.f64 fd198, fd190, fd194; +add.f64 fd199, fd191, fd195; +sub.f64 fd200, fd190, fd194; +sub.f64 fd201, fd191, fd195; +sub.f64 fd202, fd192, fd197; +add.f64 fd203, fd193, fd196; +add.f64 fd204, fd192, fd197; +sub.f64 fd205, fd193, fd196; +add.f64 fd206, fd162, fd178; +add.f64 fd207, fd163, fd179; +sub.f64 fd208, fd162, fd178; +sub.f64 fd209, fd163, fd179; +add.f64 fd210, fd170, fd186; +add.f64 fd211, fd171, fd187; +sub.f64 fd212, fd170, fd186; +sub.f64 fd213, fd171, fd187; +add.f64 fd214, fd206, fd210; +add.f64 fd215, fd207, fd211; +sub.f64 fd216, fd206, fd210; +sub.f64 fd217, fd207, fd211; +sub.f64 fd218, fd208, fd213; +add.f64 fd219, fd209, fd212; +add.f64 fd220, fd208, fd213; +sub.f64 fd221, fd209, fd212; +mul.f64 fd222, fd218, 0d3FE6A09E667F3BCD; +mul.f64 fd223, fd219, 0d3FE6A09E667F3BCD; +sub.f64 fd224, fd222, fd223; +add.f64 fd225, fd222, fd223; +mul.f64 fd226, fd220, 0dBFE6A09E667F3BCD; +mul.f64 fd227, fd221, 0d3FE6A09E667F3BCD; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd221, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd230, fd220, 0d3FE6A09E667F3BCD, fd229; +sub.f64 fd231, fd198, fd214; +sub.f64 fd232, fd199, fd215; +add.f64 fd233, fd202, fd224; +add.f64 fd234, fd203, fd225; +sub.f64 fd235, fd202, fd224; +sub.f64 fd236, fd203, fd225; +sub.f64 fd237, fd200, fd217; 
+add.f64 fd238, fd201, fd216; +add.f64 fd239, fd200, fd217; +sub.f64 fd240, fd201, fd216; +add.f64 fd241, fd204, fd228; +add.f64 fd242, fd205, fd230; +sub.f64 fd243, fd204, fd228; +sub.f64 fd244, fd205, fd230; +and.b32 r14, r5, 1016; +bfe.u32 r15, r5, 3, 7; +mul.wide.u32 rd6, r15, 16; +mov.u64 rd7, %18; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd245, fd246}, [rd8]; +mul.f64 fd249, fd234, fd246; +mul.f64 fd250, fd233, fd246; +mul.f64 fd251, fd245, fd234; +mul.f64 fd252, fd245, fd245; +mul.f64 fd253, fd246, fd246; +sub.f64 fd254, fd252, fd253; +mul.f64 fd255, fd246, fd245; +fma.rn.f64 fd256, fd246, fd245, fd255; +mul.f64 fd257, fd238, fd256; +mul.f64 fd258, fd237, fd256; +mul.f64 fd259, fd254, fd238; +mul.f64 fd260, fd245, fd254; +mul.f64 fd261, fd246, fd256; +sub.f64 fd262, fd260, fd261; +mul.f64 fd263, fd245, fd256; +fma.rn.f64 fd264, fd246, fd254, fd263; +mul.f64 fd265, fd242, fd264; +mul.f64 fd266, fd241, fd264; +mul.f64 fd267, fd262, fd242; +mul.f64 fd268, fd245, fd262; +mul.f64 fd269, fd246, fd264; +sub.f64 fd270, fd268, fd269; +mul.f64 fd271, fd245, fd264; +fma.rn.f64 fd272, fd246, fd262, fd271; +mul.f64 fd273, fd232, fd272; +mul.f64 fd274, fd231, fd272; +mul.f64 fd275, fd270, fd232; +ld.global.v2.f64 {fd276, fd277}, [rd8+2048]; +mul.f64 fd280, fd236, fd277; +mul.f64 fd281, fd235, fd277; +mul.f64 fd282, fd276, fd236; +mul.f64 fd283, fd245, fd276; +mul.f64 fd284, fd246, fd277; +sub.f64 fd285, fd283, fd284; +mul.f64 fd286, fd245, fd277; +fma.rn.f64 fd287, fd246, fd276, fd286; +mul.f64 fd288, fd240, fd287; +mul.f64 fd289, fd239, fd287; +mul.f64 fd290, fd285, fd240; +mul.f64 fd291, fd245, fd285; +mul.f64 fd292, fd246, fd287; +sub.f64 fd293, fd291, fd292; +mul.f64 fd294, fd245, fd287; +fma.rn.f64 fd295, fd246, fd285, fd294; +mul.f64 fd296, fd244, fd295; +mul.f64 fd297, fd243, fd295; +mul.f64 fd298, fd293, fd244; +and.b32 r16, r10, 112; +add.s32 r17, r9, r16; +barrier.sync 0; +and.b32 r18, r7, 130048; +add.s32 r19, r17, r18; +add.f64 fd299, fd199, fd215; +add.f64 
fd300, fd198, fd214; +st.shared.v2.f64 [r19], {fd300, fd299}; +fma.rn.f64 fd301, fd245, fd233, fd249; +sub.f64 fd302, fd251, fd250; +st.shared.v2.f64 [r19+128], {fd301, fd302}; +fma.rn.f64 fd303, fd254, fd237, fd257; +sub.f64 fd304, fd259, fd258; +st.shared.v2.f64 [r19+256], {fd303, fd304}; +fma.rn.f64 fd305, fd262, fd241, fd265; +sub.f64 fd306, fd267, fd266; +st.shared.v2.f64 [r19+384], {fd305, fd306}; +sub.f64 fd307, fd275, fd274; +fma.rn.f64 fd308, fd270, fd231, fd273; +st.shared.v2.f64 [r19+512], {fd308, fd307}; +fma.rn.f64 fd309, fd276, fd235, fd280; +sub.f64 fd310, fd282, fd281; +st.shared.v2.f64 [r19+640], {fd309, fd310}; +fma.rn.f64 fd311, fd285, fd239, fd288; +sub.f64 fd312, fd290, fd289; +st.shared.v2.f64 [r19+768], {fd311, fd312}; +fma.rn.f64 fd313, fd293, fd243, fd296; +sub.f64 fd314, fd298, fd297; +st.shared.v2.f64 [r19+896], {fd313, fd314}; +barrier.sync 0; +mad.lo.s32 r20, r14, -112, r19; +ld.shared.v2.f64 {fd315, fd316}, [r20]; +ld.shared.v2.f64 {fd319, fd320}, [r20+16384]; +ld.shared.v2.f64 {fd323, fd324}, [r20+32768]; +ld.shared.v2.f64 {fd327, fd328}, [r20+49152]; +ld.shared.v2.f64 {fd331, fd332}, [r20+65536]; +ld.shared.v2.f64 {fd335, fd336}, [r20+81920]; +ld.shared.v2.f64 {fd339, fd340}, [r20+98304]; +ld.shared.v2.f64 {fd343, fd344}, [r20+114688]; +add.f64 fd347, fd315, fd331; +add.f64 fd348, fd316, fd332; +sub.f64 fd349, fd315, fd331; +sub.f64 fd350, fd316, fd332; +add.f64 fd351, fd323, fd339; +add.f64 fd352, fd324, fd340; +sub.f64 fd353, fd323, fd339; +sub.f64 fd354, fd324, fd340; +add.f64 fd355, fd347, fd351; +add.f64 fd356, fd348, fd352; +sub.f64 fd357, fd347, fd351; +sub.f64 fd358, fd348, fd352; +sub.f64 fd359, fd349, fd354; +add.f64 fd360, fd350, fd353; +add.f64 fd361, fd349, fd354; +sub.f64 fd362, fd350, fd353; +add.f64 fd363, fd319, fd335; +add.f64 fd364, fd320, fd336; +sub.f64 fd365, fd319, fd335; +sub.f64 fd366, fd320, fd336; +add.f64 fd367, fd327, fd343; +add.f64 fd368, fd328, fd344; +sub.f64 fd369, fd327, fd343; +sub.f64 fd370, 
fd328, fd344; +add.f64 fd371, fd363, fd367; +add.f64 fd372, fd364, fd368; +sub.f64 fd373, fd363, fd367; +sub.f64 fd374, fd364, fd368; +sub.f64 fd375, fd365, fd370; +add.f64 fd376, fd366, fd369; +add.f64 fd377, fd365, fd370; +sub.f64 fd378, fd366, fd369; +mul.f64 fd379, fd375, 0d3FE6A09E667F3BCD; +mul.f64 fd380, fd376, 0d3FE6A09E667F3BCD; +sub.f64 fd381, fd379, fd380; +add.f64 fd382, fd379, fd380; +mul.f64 fd383, fd377, 0dBFE6A09E667F3BCD; +mul.f64 fd384, fd378, 0d3FE6A09E667F3BCD; +sub.f64 fd385, fd383, fd384; +mul.f64 fd386, fd378, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd387, fd377, 0d3FE6A09E667F3BCD, fd386; +sub.f64 fd388, fd355, fd371; +sub.f64 fd389, fd356, fd372; +add.f64 fd390, fd359, fd381; +add.f64 fd391, fd360, fd382; +sub.f64 fd392, fd359, fd381; +sub.f64 fd393, fd360, fd382; +sub.f64 fd394, fd357, fd374; +add.f64 fd395, fd358, fd373; +add.f64 fd396, fd357, fd374; +sub.f64 fd397, fd358, fd373; +add.f64 fd398, fd361, fd385; +add.f64 fd399, fd362, fd387; +sub.f64 fd400, fd361, fd385; +sub.f64 fd401, fd362, fd387; +and.b32 r21, r5, 960; +bfe.u32 r22, r5, 6, 4; +mul.wide.u32 rd9, r22, 16; +mov.u64 rd10, %19; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd402, fd403}, [rd11]; +mul.f64 fd406, fd391, fd403; +mul.f64 fd407, fd390, fd403; +mul.f64 fd408, fd402, fd391; +mul.f64 fd409, fd402, fd402; +mul.f64 fd410, fd403, fd403; +sub.f64 fd411, fd409, fd410; +mul.f64 fd412, fd403, fd402; +fma.rn.f64 fd413, fd403, fd402, fd412; +mul.f64 fd414, fd395, fd413; +mul.f64 fd415, fd394, fd413; +mul.f64 fd416, fd411, fd395; +mul.f64 fd417, fd402, fd411; +mul.f64 fd418, fd403, fd413; +sub.f64 fd419, fd417, fd418; +mul.f64 fd420, fd402, fd413; +fma.rn.f64 fd421, fd403, fd411, fd420; +mul.f64 fd422, fd399, fd421; +mul.f64 fd423, fd398, fd421; +mul.f64 fd424, fd419, fd399; +mul.f64 fd425, fd402, fd419; +mul.f64 fd426, fd403, fd421; +sub.f64 fd427, fd425, fd426; +mul.f64 fd428, fd402, fd421; +fma.rn.f64 fd429, fd403, fd419, fd428; +mul.f64 fd430, fd389, fd429; +mul.f64 fd431, fd388, 
fd429; +mul.f64 fd432, fd427, fd389; +ld.global.v2.f64 {fd433, fd434}, [rd11+256]; +mul.f64 fd437, fd393, fd434; +mul.f64 fd438, fd392, fd434; +mul.f64 fd439, fd433, fd393; +mul.f64 fd440, fd402, fd433; +mul.f64 fd441, fd403, fd434; +sub.f64 fd442, fd440, fd441; +mul.f64 fd443, fd402, fd434; +fma.rn.f64 fd444, fd403, fd433, fd443; +mul.f64 fd445, fd397, fd444; +mul.f64 fd446, fd396, fd444; +mul.f64 fd447, fd442, fd397; +mul.f64 fd448, fd402, fd442; +mul.f64 fd449, fd403, fd444; +sub.f64 fd450, fd448, fd449; +mul.f64 fd451, fd402, fd444; +fma.rn.f64 fd452, fd403, fd442, fd451; +mul.f64 fd453, fd401, fd452; +mul.f64 fd454, fd400, fd452; +mul.f64 fd455, fd450, fd401; +and.b32 r23, r10, 1008; +add.s32 r24, r9, r23; +barrier.sync 0; +and.b32 r25, r7, 122880; +add.s32 r26, r24, r25; +add.f64 fd456, fd356, fd372; +add.f64 fd457, fd355, fd371; +st.shared.v2.f64 [r26], {fd457, fd456}; +fma.rn.f64 fd458, fd402, fd390, fd406; +sub.f64 fd459, fd408, fd407; +st.shared.v2.f64 [r26+1024], {fd458, fd459}; +fma.rn.f64 fd460, fd411, fd394, fd414; +sub.f64 fd461, fd416, fd415; +st.shared.v2.f64 [r26+2048], {fd460, fd461}; +fma.rn.f64 fd462, fd419, fd398, fd422; +sub.f64 fd463, fd424, fd423; +st.shared.v2.f64 [r26+3072], {fd462, fd463}; +sub.f64 fd464, fd432, fd431; +fma.rn.f64 fd465, fd427, fd388, fd430; +st.shared.v2.f64 [r26+4096], {fd465, fd464}; +fma.rn.f64 fd466, fd433, fd392, fd437; +sub.f64 fd467, fd439, fd438; +st.shared.v2.f64 [r26+5120], {fd466, fd467}; +fma.rn.f64 fd468, fd442, fd396, fd445; +sub.f64 fd469, fd447, fd446; +st.shared.v2.f64 [r26+6144], {fd468, fd469}; +fma.rn.f64 fd470, fd450, fd400, fd453; +sub.f64 fd471, fd455, fd454; +st.shared.v2.f64 [r26+7168], {fd470, fd471}; +barrier.sync 0; +mad.lo.s32 r27, r21, -112, r26; +ld.shared.v2.f64 {fd472, fd473}, [r27]; +ld.shared.v2.f64 {fd476, fd477}, [r27+16384]; +ld.shared.v2.f64 {fd480, fd481}, [r27+32768]; +ld.shared.v2.f64 {fd484, fd485}, [r27+49152]; +ld.shared.v2.f64 {fd488, fd489}, [r27+65536]; +ld.shared.v2.f64 
{fd492, fd493}, [r27+81920]; +ld.shared.v2.f64 {fd496, fd497}, [r27+98304]; +ld.shared.v2.f64 {fd500, fd501}, [r27+114688]; +add.f64 fd504, fd472, fd488; +add.f64 fd505, fd473, fd489; +sub.f64 fd506, fd472, fd488; +sub.f64 fd507, fd473, fd489; +add.f64 fd508, fd480, fd496; +add.f64 fd509, fd481, fd497; +sub.f64 fd510, fd480, fd496; +sub.f64 fd511, fd481, fd497; +add.f64 fd512, fd504, fd508; +add.f64 fd513, fd505, fd509; +sub.f64 fd514, fd504, fd508; +sub.f64 fd515, fd505, fd509; +sub.f64 fd516, fd506, fd511; +add.f64 fd517, fd507, fd510; +add.f64 fd518, fd506, fd511; +sub.f64 fd519, fd507, fd510; +add.f64 fd520, fd476, fd492; +add.f64 fd521, fd477, fd493; +sub.f64 fd522, fd476, fd492; +sub.f64 fd523, fd477, fd493; +add.f64 fd524, fd484, fd500; +add.f64 fd525, fd485, fd501; +sub.f64 fd526, fd484, fd500; +sub.f64 fd527, fd485, fd501; +add.f64 fd528, fd520, fd524; +add.f64 fd529, fd521, fd525; +sub.f64 fd530, fd520, fd524; +sub.f64 fd531, fd521, fd525; +sub.f64 fd532, fd522, fd527; +add.f64 fd533, fd523, fd526; +add.f64 fd534, fd522, fd527; +sub.f64 fd535, fd523, fd526; +mul.f64 fd536, fd532, 0d3FE6A09E667F3BCD; +mul.f64 fd537, fd533, 0d3FE6A09E667F3BCD; +sub.f64 fd538, fd536, fd537; +add.f64 fd539, fd536, fd537; +mul.f64 fd540, fd534, 0dBFE6A09E667F3BCD; +mul.f64 fd541, fd535, 0d3FE6A09E667F3BCD; +sub.f64 fd542, fd540, fd541; +mul.f64 fd543, fd535, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd544, fd534, 0d3FE6A09E667F3BCD, fd543; +sub.f64 fd545, fd512, fd528; +sub.f64 fd546, fd513, fd529; +add.f64 fd547, fd516, fd538; +add.f64 fd548, fd517, fd539; +sub.f64 fd549, fd516, fd538; +sub.f64 fd550, fd517, fd539; +sub.f64 fd551, fd514, fd531; +add.f64 fd552, fd515, fd530; +add.f64 fd553, fd514, fd531; +sub.f64 fd554, fd515, fd530; +add.f64 fd555, fd518, fd542; +add.f64 fd556, fd519, fd544; +sub.f64 fd557, fd518, fd542; +sub.f64 fd558, fd519, fd544; +and.b32 r28, r5, 512; +bfe.u32 r29, r5, 9, 1; +mul.wide.u32 rd12, r29, 16; +mov.u64 rd13, %20; +add.s64 rd14, rd13, rd12; 
+ld.global.v2.f64 {fd559, fd560}, [rd14]; +mul.f64 fd563, fd548, fd560; +mul.f64 fd564, fd547, fd560; +mul.f64 fd565, fd559, fd548; +mul.f64 fd566, fd559, fd559; +mul.f64 fd567, fd560, fd560; +sub.f64 fd568, fd566, fd567; +mul.f64 fd569, fd560, fd559; +fma.rn.f64 fd570, fd560, fd559, fd569; +mul.f64 fd571, fd552, fd570; +mul.f64 fd572, fd551, fd570; +mul.f64 fd573, fd568, fd552; +mul.f64 fd574, fd559, fd568; +mul.f64 fd575, fd560, fd570; +sub.f64 fd576, fd574, fd575; +mul.f64 fd577, fd559, fd570; +fma.rn.f64 fd578, fd560, fd568, fd577; +mul.f64 fd579, fd556, fd578; +mul.f64 fd580, fd555, fd578; +mul.f64 fd581, fd576, fd556; +mul.f64 fd582, fd559, fd576; +mul.f64 fd583, fd560, fd578; +sub.f64 fd584, fd582, fd583; +mul.f64 fd585, fd559, fd578; +fma.rn.f64 fd586, fd560, fd576, fd585; +mul.f64 fd587, fd546, fd586; +mul.f64 fd588, fd545, fd586; +mul.f64 fd589, fd584, fd546; +ld.global.v2.f64 {fd590, fd591}, [rd14+32]; +mul.f64 fd594, fd550, fd591; +mul.f64 fd595, fd549, fd591; +mul.f64 fd596, fd590, fd550; +mul.f64 fd597, fd559, fd590; +mul.f64 fd598, fd560, fd591; +sub.f64 fd599, fd597, fd598; +mul.f64 fd600, fd559, fd591; +fma.rn.f64 fd601, fd560, fd590, fd600; +mul.f64 fd602, fd554, fd601; +mul.f64 fd603, fd553, fd601; +mul.f64 fd604, fd599, fd554; +mul.f64 fd605, fd559, fd599; +mul.f64 fd606, fd560, fd601; +sub.f64 fd607, fd605, fd606; +mul.f64 fd608, fd559, fd601; +fma.rn.f64 fd609, fd560, fd599, fd608; +mul.f64 fd610, fd558, fd609; +mul.f64 fd611, fd557, fd609; +mul.f64 fd612, fd607, fd558; +and.b32 r30, r10, 8176; +add.s32 r31, r9, r30; +barrier.sync 0; +and.b32 r32, r7, 65536; +add.s32 r33, r31, r32; +add.f64 fd613, fd513, fd529; +add.f64 fd614, fd512, fd528; +st.shared.v2.f64 [r33], {fd614, fd613}; +fma.rn.f64 fd615, fd559, fd547, fd563; +sub.f64 fd616, fd565, fd564; +st.shared.v2.f64 [r33+8192], {fd615, fd616}; +fma.rn.f64 fd617, fd568, fd551, fd571; +sub.f64 fd618, fd573, fd572; +st.shared.v2.f64 [r33+16384], {fd617, fd618}; +fma.rn.f64 fd619, fd576, fd555, 
fd579; +sub.f64 fd620, fd581, fd580; +st.shared.v2.f64 [r33+24576], {fd619, fd620}; +sub.f64 fd621, fd589, fd588; +fma.rn.f64 fd622, fd584, fd545, fd587; +st.shared.v2.f64 [r33+32768], {fd622, fd621}; +fma.rn.f64 fd623, fd590, fd549, fd594; +sub.f64 fd624, fd596, fd595; +st.shared.v2.f64 [r33+40960], {fd623, fd624}; +fma.rn.f64 fd625, fd599, fd553, fd602; +sub.f64 fd626, fd604, fd603; +st.shared.v2.f64 [r33+49152], {fd625, fd626}; +fma.rn.f64 fd627, fd607, fd557, fd610; +sub.f64 fd628, fd612, fd611; +st.shared.v2.f64 [r33+57344], {fd627, fd628}; +barrier.sync 0; +mad.lo.s32 r34, r28, -112, r33; +ld.shared.v2.f64 {fd629, fd630}, [r34]; +ld.shared.v2.f64 {fd633, fd634}, [r34+16384]; +ld.shared.v2.f64 {fd637, fd638}, [r34+32768]; +ld.shared.v2.f64 {fd641, fd642}, [r34+49152]; +ld.shared.v2.f64 {fd645, fd646}, [r34+65536]; +ld.shared.v2.f64 {fd649, fd650}, [r34+81920]; +ld.shared.v2.f64 {fd653, fd654}, [r34+98304]; +ld.shared.v2.f64 {fd657, fd658}, [r34+114688]; +add.f64 %1, fd630, fd646; +add.f64 %0, fd629, fd645; +add.f64 %3, fd634, fd650; +add.f64 %2, fd633, fd649; +add.f64 %5, fd638, fd654; +add.f64 %4, fd637, fd653; +add.f64 %7, fd642, fd658; +add.f64 %6, fd641, fd657; +sub.f64 %9, fd630, fd646; +sub.f64 %8, fd629, fd645; +sub.f64 %11, fd634, fd650; +sub.f64 %10, fd633, fd649; +sub.f64 %13, fd638, fd654; +sub.f64 %12, fd637, fd653; +sub.f64 %15, fd642, fd658; +sub.f64 %14, fd641, fd657; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "r"(smem), "l"(lut_dp_8_8192), "l"(lut_dp_8_1024), "l"(lut_dp_8_128), "l"(lut_dp_8_16), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), 
"d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..f1343f36a1e51 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_fwd.hpp.inc @@ -0,0 +1,16867 @@ +#ifndef CUFFTDX_FFT_81_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_81_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<872, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<136>; +.reg .b32 r<1527>; +.reg .b64 rd<4>; +mov.u32 r1516, %tid.y; +mov.u32 r1517, %18; +mad.lo.s32 r1518, r1516, 648, r1517; +mov.u32 r1519, %tid.x; +mov.f32 f130, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1, {low, high}; +} +mov.f32 f132, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ 
+sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ +sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ 
+add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r265, {low, high}; +} +mov.f32 f92, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r266, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r267, {low, high}; +} +mov.f32 f96, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r268, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r271, {low, high}; +} +mov.f32 f104, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, 
r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, 
r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r1519, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1520, rd3; +mul.lo.s32 r1521, r1520, 9; +sub.s32 r1522, r1519, r1521; +cvt.rn.f32.u32 f133, 
r1522; +mul.f32 f134, f133, 0f3D9EDD1F; +cos.approx.f32 f57, f134; +sin.approx.f32 f135, f134; +neg.f32 f58, f135; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 
r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +mad.lo.s32 r1523, r1520, 648, r1518; +barrier.sync 0; +mad.lo.s32 r1524, r1522, 72, r1523; +st.shared.v2.f32 [r1524], {r352, r358}; +st.shared.v2.f32 [r1524+8], {r621, r628}; +st.shared.v2.f32 [r1524+16], {r658, r665}; +st.shared.v2.f32 [r1524+24], {r695, r702}; +st.shared.v2.f32 [r1524+32], {r732, r739}; +st.shared.v2.f32 [r1524+40], {r769, r776}; +st.shared.v2.f32 [r1524+48], {r806, r813}; +st.shared.v2.f32 [r1524+56], {r843, r850}; +st.shared.v2.f32 [r1524+64], {r880, r887}; +barrier.sync 0; +shl.b32 r1525, r1522, 6; +sub.s32 r1526, r1524, r1525; +ld.shared.u32 r916, [r1526]; +ld.shared.u32 r922, [r1526+4]; +ld.shared.u32 r1004, [r1526+72]; +ld.shared.u32 r1010, [r1526+76]; +ld.shared.u32 r1092, [r1526+144]; +ld.shared.u32 r1098, [r1526+148]; +ld.shared.u32 r913, [r1526+216]; +ld.shared.u32 r919, [r1526+220]; +ld.shared.u32 r1001, [r1526+288]; +ld.shared.u32 r1007, [r1526+292]; +ld.shared.u32 r1089, [r1526+360]; +ld.shared.u32 r1095, [r1526+364]; +ld.shared.u32 r914, [r1526+432]; +ld.shared.u32 r920, [r1526+436]; +ld.shared.u32 r1002, [r1526+504]; +ld.shared.u32 r1008, [r1526+508]; +ld.shared.u32 r1090, [r1526+576]; +ld.shared.u32 r1096, [r1526+580]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, 
r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, 
r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ +add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; 
+mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; +} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 %0, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 %1, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 %6, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 %12, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 %7, 
r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 %13, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1341, {low, high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 %2, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 %3, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 %8, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 %14, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 %9, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 %15, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 %4, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 %5, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, 
r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 %10, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 %16, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 %11, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 %17, r1504, r1510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; 
+ + + + +template<> __forceinline__ __device__ void cufftdx_private_function<873, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<136>; +.reg .b32 r<1527>; +.reg .b64 rd<4>; +mov.u32 r1516, %tid.y; +mov.u32 r1517, %18; +mad.lo.s32 r1518, r1516, 324, r1517; +mov.u32 r1519, %tid.x; +mov.f32 f130, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1, {low, high}; +} +mov.f32 f132, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %25, %31; +} +{ +add.f16x2 r8, %19, r5; +} +{ +add.f16x2 r11, %26, %32; +} +{ +add.f16x2 r14, %20, r11; +} +{ +add.f16x2 r17, %25, %31; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %19, r20; +} +{ +sub.f16x2 r26, %26, %32; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %25, %31; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %19, r38; +} +{ +sub.f16x2 r44, %26, %32; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %26, %32; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %20, r56; +} +{ +sub.f16x2 r62, %25, %31; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %26, %32; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %20, r74; +} +{ +sub.f16x2 r80, %25, %31; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %27, %33; +} +{ +add.f16x2 r96, %21, r93; +} +{ +add.f16x2 r99, %28, %34; +} +{ +add.f16x2 r102, %22, r99; +} +{ +add.f16x2 r105, %27, %33; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %21, r108; +} +{ 
+sub.f16x2 r114, %28, %34; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %27, %33; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %21, r126; +} +{ +sub.f16x2 r132, %28, %34; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %28, %34; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %22, r144; +} +{ +sub.f16x2 r150, %27, %33; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %28, %34; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %22, r162; +} +{ +sub.f16x2 r168, %27, %33; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %29, %35; +} +{ +add.f16x2 r184, %23, r181; +} +{ +add.f16x2 r187, %30, %36; +} +{ +add.f16x2 r190, %24, r187; +} +{ +add.f16x2 r193, %29, %35; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %23, r196; +} +{ +sub.f16x2 r202, %30, %36; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %29, %35; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %23, r214; +} +{ +sub.f16x2 r220, %30, %36; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %30, %36; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %24, r232; +} +{ +sub.f16x2 r238, %29, %35; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %30, %36; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %24, r250; +} +{ +sub.f16x2 r256, %29, %35; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; 
+cvt.rn.f16.f32 high, f90; +mov.b32 r265, {low, high}; +} +mov.f32 f92, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r266, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r267, {low, high}; +} +mov.f32 f96, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r268, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r271, {low, high}; +} +mov.f32 f104, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} 
+{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; 
+mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +mul.wide.u32 rd2, r1519, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1520, rd3; +mul.lo.s32 r1521, r1520, 9; +sub.s32 r1522, r1519, r1521; +mad.lo.s32 r1523, r1520, 324, r1518; +cvt.rn.f32.u32 f133, r1522; +mul.f32 f134, f133, 0f3D9EDD1F; +cos.approx.f32 f57, f134; +sin.approx.f32 f135, f134; +neg.f32 f58, f135; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r612, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r614, {high, high}; +} +{ +mul.f16x2 r616, r446, r614; +} +{ +neg.f16x2 r619, r616; +} +{ +fma.rn.f16x2 r621, r440, r612, r619; +} +{ +mul.f16x2 r625, r440, r614; +} +{ +fma.rn.f16x2 r628, r446, r612, r625; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r632, 
{low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r634, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r636, {low, high}; +} +{ +mul.f16x2 r637, r634, r636; +} +{ +mul.f16x2 r640, r609, r632; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r643, {high, low}; +} +{ +fma.rn.f16x2 r645, r637, r643, r640; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r649, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r651, {high, high}; +} +{ +mul.f16x2 r653, r534, r651; +} +{ +neg.f16x2 r656, r653; +} +{ +fma.rn.f16x2 r658, r528, r649, r656; +} +{ +mul.f16x2 r662, r528, r651; +} +{ +fma.rn.f16x2 r665, r534, r649, r662; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r669, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r671, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r673, {low, high}; +} +{ +mul.f16x2 r674, r671, r673; +} +{ +mul.f16x2 r677, r645, r669; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r645; +mov.b32 r680, {high, low}; +} +{ +fma.rn.f16x2 r682, r674, r680, r677; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r686, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r688, {high, high}; +} +{ +mul.f16x2 r690, r412, r688; +} +{ +neg.f16x2 r693, r690; +} +{ +fma.rn.f16x2 r695, r376, r686, r693; +} +{ +mul.f16x2 r699, r376, r688; +} +{ +fma.rn.f16x2 r702, r412, r686, r699; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r706, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r708, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r710, {low, high}; +} +{ +mul.f16x2 r711, r708, r710; +} +{ +mul.f16x2 r714, r682, r706; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r682; +mov.b32 r717, {high, low}; +} +{ +fma.rn.f16x2 r719, r711, r717, r714; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r723, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r725, {high, high}; +} +{ +mul.f16x2 r727, r500, r725; +} +{ +neg.f16x2 r730, r727; +} +{ +fma.rn.f16x2 r732, r464, r723, r730; +} +{ +mul.f16x2 r736, r464, r725; +} +{ +fma.rn.f16x2 r739, r500, r723, r736; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r743, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r745, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r747, {low, high}; +} +{ +mul.f16x2 r748, r745, r747; +} +{ +mul.f16x2 r751, r719, r743; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r719; +mov.b32 r754, {high, low}; +} +{ +fma.rn.f16x2 r756, r748, r754, r751; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r760, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r762, {high, high}; +} +{ +mul.f16x2 r764, r588, r762; +} +{ +neg.f16x2 r767, r764; +} +{ +fma.rn.f16x2 r769, r552, r760, r767; +} +{ +mul.f16x2 r773, r552, r762; +} +{ +fma.rn.f16x2 r776, r588, r760, r773; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r780, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r782, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r784, {low, high}; +} +{ +mul.f16x2 r785, r782, r784; +} +{ +mul.f16x2 r788, r756, r780; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r756; +mov.b32 r791, {high, low}; +} +{ +fma.rn.f16x2 r793, r785, r791, r788; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r797, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r799, {high, high}; +} +{ +mul.f16x2 r801, r430, r799; +} +{ +neg.f16x2 
r804, r801; +} +{ +fma.rn.f16x2 r806, r394, r797, r804; +} +{ +mul.f16x2 r810, r394, r799; +} +{ +fma.rn.f16x2 r813, r430, r797, r810; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r817, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r819, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r821, {low, high}; +} +{ +mul.f16x2 r822, r819, r821; +} +{ +mul.f16x2 r825, r793, r817; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r793; +mov.b32 r828, {high, low}; +} +{ +fma.rn.f16x2 r830, r822, r828, r825; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r834, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r836, {high, high}; +} +{ +mul.f16x2 r838, r518, r836; +} +{ +neg.f16x2 r841, r838; +} +{ +fma.rn.f16x2 r843, r482, r834, r841; +} +{ +mul.f16x2 r847, r482, r836; +} +{ +fma.rn.f16x2 r850, r518, r834, r847; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r854, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r609; +mov.b32 r856, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r858, {low, high}; +} +{ +mul.f16x2 r859, r856, r858; +} +{ +mul.f16x2 r862, r830, r854; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r830; +mov.b32 r865, {high, low}; +} +{ +fma.rn.f16x2 r867, r859, r865, r862; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r871, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r867; +mov.b32 r873, {high, high}; +} +{ +mul.f16x2 r875, r606, r873; +} +{ +neg.f16x2 r878, r875; +} +{ +fma.rn.f16x2 r880, r570, r871, r878; +} +{ +mul.f16x2 r884, r570, r873; +} +{ +fma.rn.f16x2 r887, r606, r871, r884; +} +barrier.sync 0; +mad.lo.s32 r1524, r1522, 36, r1523; +st.shared.u32 [r1524], r352; +st.shared.u32 [r1524+4], r621; +st.shared.u32 [r1524+8], r658; +st.shared.u32 [r1524+12], r695; +st.shared.u32 
[r1524+16], r732; +st.shared.u32 [r1524+20], r769; +st.shared.u32 [r1524+24], r806; +st.shared.u32 [r1524+28], r843; +st.shared.u32 [r1524+32], r880; +barrier.sync 0; +shl.b32 r1525, r1522, 5; +sub.s32 r1526, r1524, r1525; +ld.shared.u32 r916, [r1526]; +ld.shared.u32 r1004, [r1526+36]; +ld.shared.u32 r1092, [r1526+72]; +ld.shared.u32 r913, [r1526+108]; +ld.shared.u32 r1001, [r1526+144]; +ld.shared.u32 r1089, [r1526+180]; +ld.shared.u32 r914, [r1526+216]; +ld.shared.u32 r1002, [r1526+252]; +ld.shared.u32 r1090, [r1526+288]; +barrier.sync 0; +st.shared.u32 [r1524], r358; +st.shared.u32 [r1524+4], r628; +st.shared.u32 [r1524+8], r665; +st.shared.u32 [r1524+12], r702; +st.shared.u32 [r1524+16], r739; +st.shared.u32 [r1524+20], r776; +st.shared.u32 [r1524+24], r813; +st.shared.u32 [r1524+28], r850; +st.shared.u32 [r1524+32], r887; +barrier.sync 0; +ld.shared.u32 r922, [r1526]; +ld.shared.u32 r1010, [r1526+36]; +ld.shared.u32 r1098, [r1526+72]; +ld.shared.u32 r919, [r1526+108]; +ld.shared.u32 r1007, [r1526+144]; +ld.shared.u32 r1095, [r1526+180]; +ld.shared.u32 r920, [r1526+216]; +ld.shared.u32 r1008, [r1526+252]; +ld.shared.u32 r1096, [r1526+288]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r908, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r909, {low, high}; +} +{ +neg.f16x2 r910, r909; +} +{ +add.f16x2 r912, r913, r914; +} +{ +add.f16x2 r915, r916, r912; +} +{ +add.f16x2 r918, r919, r920; +} +{ +add.f16x2 r921, r922, r918; +} +{ +add.f16x2 r924, r913, r914; +} +{ +mul.f16x2 r927, r924, r908; +} +{ +add.f16x2 r930, r916, r927; +} +{ +sub.f16x2 r933, r919, r920; +} +{ +mul.f16x2 r936, r933, r910; +} +{ +add.f16x2 r939, r930, r936; +} +{ +add.f16x2 r942, r913, r914; +} +{ +mul.f16x2 r945, r942, r908; +} +{ +add.f16x2 r948, r916, r945; +} +{ +sub.f16x2 r951, r919, r920; +} +{ +mul.f16x2 r954, r951, r910; +} +{ +sub.f16x2 r957, r948, r954; +} +{ +add.f16x2 r960, r919, 
r920; +} +{ +mul.f16x2 r963, r960, r908; +} +{ +add.f16x2 r966, r922, r963; +} +{ +sub.f16x2 r969, r913, r914; +} +{ +mul.f16x2 r972, r969, r910; +} +{ +sub.f16x2 r975, r966, r972; +} +{ +add.f16x2 r978, r919, r920; +} +{ +mul.f16x2 r981, r978, r908; +} +{ +add.f16x2 r984, r922, r981; +} +{ +sub.f16x2 r987, r913, r914; +} +{ +mul.f16x2 r990, r987, r910; +} +{ +add.f16x2 r993, r984, r990; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r996, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r997, {low, high}; +} +{ +neg.f16x2 r998, r997; +} +{ +add.f16x2 r1000, r1001, r1002; +} +{ +add.f16x2 r1003, r1004, r1000; +} +{ +add.f16x2 r1006, r1007, r1008; +} +{ +add.f16x2 r1009, r1010, r1006; +} +{ +add.f16x2 r1012, r1001, r1002; +} +{ +mul.f16x2 r1015, r1012, r996; +} +{ +add.f16x2 r1018, r1004, r1015; +} +{ +sub.f16x2 r1021, r1007, r1008; +} +{ +mul.f16x2 r1024, r1021, r998; +} +{ +add.f16x2 r1027, r1018, r1024; +} +{ +add.f16x2 r1030, r1001, r1002; +} +{ +mul.f16x2 r1033, r1030, r996; +} +{ +add.f16x2 r1036, r1004, r1033; +} +{ +sub.f16x2 r1039, r1007, r1008; +} +{ +mul.f16x2 r1042, r1039, r998; +} +{ +sub.f16x2 r1045, r1036, r1042; +} +{ +add.f16x2 r1048, r1007, r1008; +} +{ +mul.f16x2 r1051, r1048, r996; +} +{ +add.f16x2 r1054, r1010, r1051; +} +{ +sub.f16x2 r1057, r1001, r1002; +} +{ +mul.f16x2 r1060, r1057, r998; +} +{ +sub.f16x2 r1063, r1054, r1060; +} +{ +add.f16x2 r1066, r1007, r1008; +} +{ +mul.f16x2 r1069, r1066, r996; +} +{ +add.f16x2 r1072, r1010, r1069; +} +{ +sub.f16x2 r1075, r1001, r1002; +} +{ +mul.f16x2 r1078, r1075, r998; +} +{ +add.f16x2 r1081, r1072, r1078; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1084, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1085, {low, high}; +} +{ +neg.f16x2 r1086, r1085; +} +{ +add.f16x2 r1088, r1089, r1090; +} +{ 
+add.f16x2 r1091, r1092, r1088; +} +{ +add.f16x2 r1094, r1095, r1096; +} +{ +add.f16x2 r1097, r1098, r1094; +} +{ +add.f16x2 r1100, r1089, r1090; +} +{ +mul.f16x2 r1103, r1100, r1084; +} +{ +add.f16x2 r1106, r1092, r1103; +} +{ +sub.f16x2 r1109, r1095, r1096; +} +{ +mul.f16x2 r1112, r1109, r1086; +} +{ +add.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1089, r1090; +} +{ +mul.f16x2 r1121, r1118, r1084; +} +{ +add.f16x2 r1124, r1092, r1121; +} +{ +sub.f16x2 r1127, r1095, r1096; +} +{ +mul.f16x2 r1130, r1127, r1086; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1095, r1096; +} +{ +mul.f16x2 r1139, r1136, r1084; +} +{ +add.f16x2 r1142, r1098, r1139; +} +{ +sub.f16x2 r1145, r1089, r1090; +} +{ +mul.f16x2 r1148, r1145, r1086; +} +{ +sub.f16x2 r1151, r1142, r1148; +} +{ +add.f16x2 r1154, r1095, r1096; +} +{ +mul.f16x2 r1157, r1154, r1084; +} +{ +add.f16x2 r1160, r1098, r1157; +} +{ +sub.f16x2 r1163, r1089, r1090; +} +{ +mul.f16x2 r1166, r1163, r1086; +} +{ +add.f16x2 r1169, r1160, r1166; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1172, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1174, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1175, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1178, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1179, {low, high}; +} +{ +mul.f16x2 r1188, r1027, r1172; +} +{ +mul.f16x2 r1191, r1063, r1173; +} +{ +sub.f16x2 r1194, r1188, r1191; +} +{ +mul.f16x2 r1197, r1027, r1173; +} +{ +fma.rn.f16x2 r1200, r1063, r1172, r1197; +} +{ +mul.f16x2 r1204, r1115, r1174; +} +{ +mul.f16x2 r1207, r1151, r1175; +} +{ +sub.f16x2 r1210, r1204, r1207; 
+} +{ +mul.f16x2 r1213, r1115, r1175; +} +{ +fma.rn.f16x2 r1216, r1151, r1174, r1213; +} +{ +mul.f16x2 r1220, r1045, r1174; +} +{ +mul.f16x2 r1223, r1081, r1175; +} +{ +sub.f16x2 r1226, r1220, r1223; +} +{ +mul.f16x2 r1229, r1045, r1175; +} +{ +fma.rn.f16x2 r1232, r1081, r1174, r1229; +} +{ +mul.f16x2 r1236, r1133, r1178; +} +{ +mul.f16x2 r1239, r1169, r1179; +} +{ +sub.f16x2 r1242, r1236, r1239; +} +{ +mul.f16x2 r1245, r1133, r1179; +} +{ +fma.rn.f16x2 r1248, r1169, r1178, r1245; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1252, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1253, {low, high}; +} +{ +neg.f16x2 r1254, r1253; +} +{ +add.f16x2 r1256, r1003, r1091; +} +{ +add.f16x2 %0, r915, r1256; +} +{ +add.f16x2 r1262, r1009, r1097; +} +{ +add.f16x2 %1, r921, r1262; +} +{ +add.f16x2 r1268, r1003, r1091; +} +{ +mul.f16x2 r1271, r1268, r1252; +} +{ +add.f16x2 r1274, r915, r1271; +} +{ +sub.f16x2 r1277, r1009, r1097; +} +{ +mul.f16x2 r1280, r1277, r1254; +} +{ +add.f16x2 %6, r1274, r1280; +} +{ +add.f16x2 r1286, r1003, r1091; +} +{ +mul.f16x2 r1289, r1286, r1252; +} +{ +add.f16x2 r1292, r915, r1289; +} +{ +sub.f16x2 r1295, r1009, r1097; +} +{ +mul.f16x2 r1298, r1295, r1254; +} +{ +sub.f16x2 %12, r1292, r1298; +} +{ +add.f16x2 r1304, r1009, r1097; +} +{ +mul.f16x2 r1307, r1304, r1252; +} +{ +add.f16x2 r1310, r921, r1307; +} +{ +sub.f16x2 r1313, r1003, r1091; +} +{ +mul.f16x2 r1316, r1313, r1254; +} +{ +sub.f16x2 %7, r1310, r1316; +} +{ +add.f16x2 r1322, r1009, r1097; +} +{ +mul.f16x2 r1325, r1322, r1252; +} +{ +add.f16x2 r1328, r921, r1325; +} +{ +sub.f16x2 r1331, r1003, r1091; +} +{ +mul.f16x2 r1334, r1331, r1254; +} +{ +add.f16x2 %13, r1328, r1334; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1340, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1341, {low, 
high}; +} +{ +neg.f16x2 r1342, r1341; +} +{ +add.f16x2 r1344, r1194, r1210; +} +{ +add.f16x2 %2, r939, r1344; +} +{ +add.f16x2 r1350, r1200, r1216; +} +{ +add.f16x2 %3, r975, r1350; +} +{ +add.f16x2 r1356, r1194, r1210; +} +{ +mul.f16x2 r1359, r1356, r1340; +} +{ +add.f16x2 r1362, r939, r1359; +} +{ +sub.f16x2 r1365, r1200, r1216; +} +{ +mul.f16x2 r1368, r1365, r1342; +} +{ +add.f16x2 %8, r1362, r1368; +} +{ +add.f16x2 r1374, r1194, r1210; +} +{ +mul.f16x2 r1377, r1374, r1340; +} +{ +add.f16x2 r1380, r939, r1377; +} +{ +sub.f16x2 r1383, r1200, r1216; +} +{ +mul.f16x2 r1386, r1383, r1342; +} +{ +sub.f16x2 %14, r1380, r1386; +} +{ +add.f16x2 r1392, r1200, r1216; +} +{ +mul.f16x2 r1395, r1392, r1340; +} +{ +add.f16x2 r1398, r975, r1395; +} +{ +sub.f16x2 r1401, r1194, r1210; +} +{ +mul.f16x2 r1404, r1401, r1342; +} +{ +sub.f16x2 %9, r1398, r1404; +} +{ +add.f16x2 r1410, r1200, r1216; +} +{ +mul.f16x2 r1413, r1410, r1340; +} +{ +add.f16x2 r1416, r975, r1413; +} +{ +sub.f16x2 r1419, r1194, r1210; +} +{ +mul.f16x2 r1422, r1419, r1342; +} +{ +add.f16x2 %15, r1416, r1422; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1428, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1429, {low, high}; +} +{ +neg.f16x2 r1430, r1429; +} +{ +add.f16x2 r1432, r1226, r1242; +} +{ +add.f16x2 %4, r957, r1432; +} +{ +add.f16x2 r1438, r1232, r1248; +} +{ +add.f16x2 %5, r993, r1438; +} +{ +add.f16x2 r1444, r1226, r1242; +} +{ +mul.f16x2 r1447, r1444, r1428; +} +{ +add.f16x2 r1450, r957, r1447; +} +{ +sub.f16x2 r1453, r1232, r1248; +} +{ +mul.f16x2 r1456, r1453, r1430; +} +{ +add.f16x2 %10, r1450, r1456; +} +{ +add.f16x2 r1462, r1226, r1242; +} +{ +mul.f16x2 r1465, r1462, r1428; +} +{ +add.f16x2 r1468, r957, r1465; +} +{ +sub.f16x2 r1471, r1232, r1248; +} +{ +mul.f16x2 r1474, r1471, r1430; +} +{ +sub.f16x2 %16, r1468, r1474; +} +{ +add.f16x2 r1480, r1232, r1248; +} +{ +mul.f16x2 r1483, r1480, 
r1428; +} +{ +add.f16x2 r1486, r993, r1483; +} +{ +sub.f16x2 r1489, r1226, r1242; +} +{ +mul.f16x2 r1492, r1489, r1430; +} +{ +sub.f16x2 %11, r1486, r1492; +} +{ +add.f16x2 r1498, r1232, r1248; +} +{ +mul.f16x2 r1501, r1498, r1428; +} +{ +add.f16x2 r1504, r993, r1501; +} +{ +sub.f16x2 r1507, r1226, r1242; +} +{ +mul.f16x2 r1510, r1507, r1430; +} +{ +add.f16x2 %17, r1504, r1510; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<875, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<404>; +.reg .b32 r<4748>; +.reg .b64 rd<4>; +mov.u32 r4746, %tid.y; +mov.u32 r4747, %54; +mad.lo.s32 r4684, r4746, 648, r4747; +mov.u32 r4685, %tid.x; +mov.f32 f398, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1, 
{low, high}; +} +mov.f32 f400, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %74, %58; +} +{ +add.f16x2 r8, %83, r5; +} +{ +add.f16x2 r11, %91, %73; +} +{ +add.f16x2 r14, %97, r11; +} +{ +add.f16x2 r17, %74, %58; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %83, r20; +} +{ +sub.f16x2 r26, %91, %73; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %74, %58; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %83, r38; +} +{ +sub.f16x2 r44, %91, %73; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %91, %73; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %97, r56; +} +{ +sub.f16x2 r62, %74, %58; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %91, %73; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %97, r74; +} +{ +sub.f16x2 r80, %74, %58; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %98, %81; +} +{ +add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %57, %95; +} +{ +add.f16x2 r102, %63, r99; +} +{ +add.f16x2 r105, %98, %81; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %57, %95; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %98, %81; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %57, %95; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %57, %95; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %63, r144; +} +{ +sub.f16x2 r150, %98, %81; +} +{ +mul.f16x2 
r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %57, %95; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %63, r162; +} +{ +sub.f16x2 r168, %98, %81; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %64, %103; +} +{ +add.f16x2 r184, %72, r181; +} +{ +add.f16x2 r187, %77, %61; +} +{ +add.f16x2 r190, %88, r187; +} +{ +add.f16x2 r193, %64, %103; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %72, r196; +} +{ +sub.f16x2 r202, %77, %61; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %64, %103; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %72, r214; +} +{ +sub.f16x2 r220, %77, %61; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %77, %61; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %88, r232; +} +{ +sub.f16x2 r238, %64, %103; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %77, %61; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 r253, %88, r250; +} +{ +sub.f16x2 r256, %64, %103; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f178, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +mov.f32 f190, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r267, {low, high}; +} +mov.f32 f192, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 
r268, {low, high}; +} +mov.f32 f214, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r271, {low, high}; +} +mov.f32 f216, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, 
r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ +add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, 
r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %78, %60; +} +{ +add.f16x2 r616, %86, r613; +} +{ +add.f16x2 r619, %93, %76; +} +{ +add.f16x2 r622, %100, r619; +} +{ +add.f16x2 r625, %78, %60; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %86, r628; +} +{ +sub.f16x2 r634, %93, %76; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %78, %60; +} +{ +mul.f16x2 r646, r643, r609; +} +{ +add.f16x2 r649, %86, r646; +} +{ +sub.f16x2 r652, %93, %76; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %93, %76; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %100, r664; +} +{ +sub.f16x2 r670, %78, %60; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %93, %76; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %100, r682; +} +{ +sub.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; 
+mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %101, %85; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %59, %99; +} +{ +add.f16x2 r710, %68, r707; +} +{ +add.f16x2 r713, %101, %85; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %59, %99; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %101, %85; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %59, %99; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %59, %99; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %68, r752; +} +{ +sub.f16x2 r758, %101, %85; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %59, %99; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %68, r770; +} +{ +sub.f16x2 r776, %101, %85; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r786, {low, high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %67, %107; +} +{ +add.f16x2 r792, %75, r789; +} +{ +add.f16x2 r795, %84, %66; +} +{ +add.f16x2 r798, %92, r795; +} +{ +add.f16x2 r801, %67, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %75, r804; +} +{ +sub.f16x2 r810, %84, %66; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %67, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %75, r822; +} +{ +sub.f16x2 r828, %84, %66; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %84, %66; +} +{ +mul.f16x2 r840, r837, r785; +} 
+{ +add.f16x2 r843, %92, r840; +} +{ +sub.f16x2 r846, %67, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %84, %66; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %92, r858; +} +{ +sub.f16x2 r864, %67, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ +fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ 
+add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ +add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ 
+mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ +sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %82, %65; +} +{ +add.f16x2 r1224, %90, r1221; +} +{ +add.f16x2 r1227, %96, %80; +} +{ +add.f16x2 r1230, %104, r1227; +} +{ +add.f16x2 
r1233, %82, %65; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %90, r1236; +} +{ +sub.f16x2 r1242, %96, %80; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %82, %65; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %90, r1254; +} +{ +sub.f16x2 r1260, %96, %80; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %96, %80; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %104, r1272; +} +{ +sub.f16x2 r1278, %82, %65; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %96, %80; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %104, r1290; +} +{ +sub.f16x2 r1296, %82, %65; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %105, %89; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %62, %102; +} +{ +add.f16x2 r1318, %70, r1315; +} +{ +add.f16x2 r1321, %105, %89; +} +{ +mul.f16x2 r1324, r1321, r1305; +} +{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %62, %102; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %105, %89; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %62, %102; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %62, %102; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %70, r1360; +} +{ +sub.f16x2 r1366, %105, %89; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %62, %102; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ 
+add.f16x2 r1381, %70, r1378; +} +{ +sub.f16x2 r1384, %105, %89; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %71, %55; +} +{ +add.f16x2 r1400, %79, r1397; +} +{ +add.f16x2 r1403, %87, %69; +} +{ +add.f16x2 r1406, %94, r1403; +} +{ +add.f16x2 r1409, %71, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %79, r1412; +} +{ +sub.f16x2 r1418, %87, %69; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %71, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %79, r1430; +} +{ +sub.f16x2 r1436, %87, %69; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %87, %69; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %94, r1448; +} +{ +sub.f16x2 r1454, %71, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %87, %69; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %94, r1466; +} +{ +sub.f16x2 r1472, %71, %55; +} +{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ +mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, 
r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} +{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, 
r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1825, {low, high}; +} +mov.f32 f172, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1826, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1827, {low, high}; +} +mov.f32 f176, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1830, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1831, {low, high}; +} +mov.f32 f184, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1832, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 
high, f186; +mov.b32 r1833, {low, high}; +} +mov.f32 f188, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1836, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1837, {low, high}; +} +mov.f32 f196, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1838, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1839, {low, high}; +} +mov.f32 f200, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1840, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1843, {low, high}; +} +mov.f32 f208, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1844, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1848, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1851, {low, high}; +} +mov.f32 f224, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1855, {low, high}; +} +mov.f32 f232, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 
r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ +mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, 
r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2133, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; 
+} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, 
r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ +add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2574, {low, 
high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2662, {low, high}; +} +{ +neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ 
+mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 
r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r4685, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r4686, rd3; +mul.lo.s32 r4687, r4686, 3; +sub.s32 r4688, r4685, r4687; +cvt.rn.f32.u32 f401, r4688; +mul.f32 f402, f401, 0f3D9EDD1F; +cos.approx.f32 f309, f402; +sin.approx.f32 f403, f402; +neg.f32 f310, f403; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 
f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 
r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} +{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ 
+mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ +neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; 
+mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, 
r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ 
+fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ 
+neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, {low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +mad.lo.s32 r4689, r4686, 648, r4684; +barrier.sync 0; +mad.lo.s32 r4690, r4688, 216, r4689; +st.shared.v2.f32 [r4690], {r2140, r2146}; +st.shared.v2.f32 [r4690+8], {r2937, r2944}; +st.shared.v2.f32 [r4690+16], {r2974, r2981}; +st.shared.v2.f32 [r4690+24], {r3011, r3018}; +st.shared.v2.f32 [r4690+32], {r3048, r3055}; +st.shared.v2.f32 [r4690+40], {r3085, 
r3092}; +st.shared.v2.f32 [r4690+48], {r3122, r3129}; +st.shared.v2.f32 [r4690+56], {r3159, r3166}; +st.shared.v2.f32 [r4690+64], {r3196, r3203}; +st.shared.v2.f32 [r4690+72], {r3233, r3240}; +st.shared.v2.f32 [r4690+80], {r3270, r3277}; +st.shared.v2.f32 [r4690+88], {r3307, r3314}; +st.shared.v2.f32 [r4690+96], {r3344, r3351}; +st.shared.v2.f32 [r4690+104], {r3381, r3388}; +st.shared.v2.f32 [r4690+112], {r3418, r3425}; +st.shared.v2.f32 [r4690+120], {r3455, r3462}; +st.shared.v2.f32 [r4690+128], {r3492, r3499}; +st.shared.v2.f32 [r4690+136], {r3529, r3536}; +st.shared.v2.f32 [r4690+144], {r3566, r3573}; +st.shared.v2.f32 [r4690+152], {r3603, r3610}; +st.shared.v2.f32 [r4690+160], {r3640, r3647}; +st.shared.v2.f32 [r4690+168], {r3677, r3684}; +st.shared.v2.f32 [r4690+176], {r3714, r3721}; +st.shared.v2.f32 [r4690+184], {r3751, r3758}; +st.shared.v2.f32 [r4690+192], {r3788, r3795}; +st.shared.v2.f32 [r4690+200], {r3825, r3832}; +st.shared.v2.f32 [r4690+208], {r3862, r3869}; +barrier.sync 0; +mad.lo.s32 r4691, r4688, -208, r4690; +ld.shared.u32 r3898, [r4691]; +ld.shared.u32 r3904, [r4691+4]; +ld.shared.u32 r3986, [r4691+24]; +ld.shared.u32 r3992, [r4691+28]; +ld.shared.u32 r4074, [r4691+48]; +ld.shared.u32 r4080, [r4691+52]; +ld.shared.u32 r4162, [r4691+72]; +ld.shared.u32 r4168, [r4691+76]; +ld.shared.u32 r4250, [r4691+96]; +ld.shared.u32 r4256, [r4691+100]; +ld.shared.u32 r4338, [r4691+120]; +ld.shared.u32 r4344, [r4691+124]; +ld.shared.u32 r4426, [r4691+144]; +ld.shared.u32 r4432, [r4691+148]; +ld.shared.u32 r4514, [r4691+168]; +ld.shared.u32 r4520, [r4691+172]; +ld.shared.u32 r4602, [r4691+192]; +ld.shared.u32 r4608, [r4691+196]; +ld.shared.u32 r3895, [r4691+216]; +ld.shared.u32 r3901, [r4691+220]; +ld.shared.u32 r3983, [r4691+240]; +ld.shared.u32 r3989, [r4691+244]; +ld.shared.u32 r4071, [r4691+264]; +ld.shared.u32 r4077, [r4691+268]; +ld.shared.u32 r4159, [r4691+288]; +ld.shared.u32 r4165, [r4691+292]; +ld.shared.u32 r4247, [r4691+312]; +ld.shared.u32 r4253, 
[r4691+316]; +ld.shared.u32 r4335, [r4691+336]; +ld.shared.u32 r4341, [r4691+340]; +ld.shared.u32 r4423, [r4691+360]; +ld.shared.u32 r4429, [r4691+364]; +ld.shared.u32 r4511, [r4691+384]; +ld.shared.u32 r4517, [r4691+388]; +ld.shared.u32 r4599, [r4691+408]; +ld.shared.u32 r4605, [r4691+412]; +ld.shared.u32 r3896, [r4691+432]; +ld.shared.u32 r3902, [r4691+436]; +ld.shared.u32 r3984, [r4691+456]; +ld.shared.u32 r3990, [r4691+460]; +ld.shared.u32 r4072, [r4691+480]; +ld.shared.u32 r4078, [r4691+484]; +ld.shared.u32 r4160, [r4691+504]; +ld.shared.u32 r4166, [r4691+508]; +ld.shared.u32 r4248, [r4691+528]; +ld.shared.u32 r4254, [r4691+532]; +ld.shared.u32 r4336, [r4691+552]; +ld.shared.u32 r4342, [r4691+556]; +ld.shared.u32 r4424, [r4691+576]; +ld.shared.u32 r4430, [r4691+580]; +ld.shared.u32 r4512, [r4691+600]; +ld.shared.u32 r4518, [r4691+604]; +ld.shared.u32 r4600, [r4691+624]; +ld.shared.u32 r4606, [r4691+628]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 %0, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 %1, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 %18, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 %36, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 %19, r3948, r3954; 
+} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 %37, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 %2, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 %3, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 %20, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 %38, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 %21, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 %39, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 %4, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 %5, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, 
r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 %22, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 %40, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 %23, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ +sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 %41, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4155, {low, high}; +} +{ +neg.f16x2 r4156, r4155; +} +{ +add.f16x2 r4158, r4159, r4160; +} +{ +add.f16x2 %6, r4162, r4158; +} +{ +add.f16x2 r4164, r4165, r4166; +} +{ +add.f16x2 %7, r4168, r4164; +} +{ +add.f16x2 r4170, r4159, r4160; +} +{ +mul.f16x2 r4173, r4170, r4154; +} +{ +add.f16x2 r4176, r4162, r4173; +} +{ +sub.f16x2 r4179, r4165, r4166; +} +{ +mul.f16x2 r4182, r4179, r4156; +} +{ +add.f16x2 %24, r4176, r4182; +} +{ +add.f16x2 r4188, r4159, r4160; +} +{ +mul.f16x2 r4191, r4188, r4154; +} +{ +add.f16x2 r4194, r4162, r4191; +} +{ +sub.f16x2 r4197, r4165, r4166; +} +{ +mul.f16x2 r4200, r4197, r4156; +} +{ +sub.f16x2 %42, r4194, r4200; +} +{ +add.f16x2 r4206, r4165, r4166; +} +{ +mul.f16x2 r4209, r4206, r4154; +} +{ +add.f16x2 r4212, r4168, r4209; +} +{ +sub.f16x2 r4215, r4159, r4160; +} +{ +mul.f16x2 r4218, r4215, r4156; +} +{ +sub.f16x2 %25, r4212, r4218; +} +{ +add.f16x2 r4224, r4165, r4166; +} +{ +mul.f16x2 r4227, r4224, r4154; +} 
+{ +add.f16x2 r4230, r4168, r4227; +} +{ +sub.f16x2 r4233, r4159, r4160; +} +{ +mul.f16x2 r4236, r4233, r4156; +} +{ +add.f16x2 %43, r4230, r4236; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4243, {low, high}; +} +{ +neg.f16x2 r4244, r4243; +} +{ +add.f16x2 r4246, r4247, r4248; +} +{ +add.f16x2 %8, r4250, r4246; +} +{ +add.f16x2 r4252, r4253, r4254; +} +{ +add.f16x2 %9, r4256, r4252; +} +{ +add.f16x2 r4258, r4247, r4248; +} +{ +mul.f16x2 r4261, r4258, r4242; +} +{ +add.f16x2 r4264, r4250, r4261; +} +{ +sub.f16x2 r4267, r4253, r4254; +} +{ +mul.f16x2 r4270, r4267, r4244; +} +{ +add.f16x2 %26, r4264, r4270; +} +{ +add.f16x2 r4276, r4247, r4248; +} +{ +mul.f16x2 r4279, r4276, r4242; +} +{ +add.f16x2 r4282, r4250, r4279; +} +{ +sub.f16x2 r4285, r4253, r4254; +} +{ +mul.f16x2 r4288, r4285, r4244; +} +{ +sub.f16x2 %44, r4282, r4288; +} +{ +add.f16x2 r4294, r4253, r4254; +} +{ +mul.f16x2 r4297, r4294, r4242; +} +{ +add.f16x2 r4300, r4256, r4297; +} +{ +sub.f16x2 r4303, r4247, r4248; +} +{ +mul.f16x2 r4306, r4303, r4244; +} +{ +sub.f16x2 %27, r4300, r4306; +} +{ +add.f16x2 r4312, r4253, r4254; +} +{ +mul.f16x2 r4315, r4312, r4242; +} +{ +add.f16x2 r4318, r4256, r4315; +} +{ +sub.f16x2 r4321, r4247, r4248; +} +{ +mul.f16x2 r4324, r4321, r4244; +} +{ +add.f16x2 %45, r4318, r4324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4331, {low, high}; +} +{ +neg.f16x2 r4332, r4331; +} +{ +add.f16x2 r4334, r4335, r4336; +} +{ +add.f16x2 %10, r4338, r4334; +} +{ +add.f16x2 r4340, r4341, r4342; +} +{ +add.f16x2 %11, r4344, r4340; +} +{ +add.f16x2 r4346, r4335, r4336; +} +{ +mul.f16x2 r4349, r4346, r4330; +} +{ +add.f16x2 r4352, r4338, r4349; +} +{ +sub.f16x2 r4355, r4341, 
r4342; +} +{ +mul.f16x2 r4358, r4355, r4332; +} +{ +add.f16x2 %28, r4352, r4358; +} +{ +add.f16x2 r4364, r4335, r4336; +} +{ +mul.f16x2 r4367, r4364, r4330; +} +{ +add.f16x2 r4370, r4338, r4367; +} +{ +sub.f16x2 r4373, r4341, r4342; +} +{ +mul.f16x2 r4376, r4373, r4332; +} +{ +sub.f16x2 %46, r4370, r4376; +} +{ +add.f16x2 r4382, r4341, r4342; +} +{ +mul.f16x2 r4385, r4382, r4330; +} +{ +add.f16x2 r4388, r4344, r4385; +} +{ +sub.f16x2 r4391, r4335, r4336; +} +{ +mul.f16x2 r4394, r4391, r4332; +} +{ +sub.f16x2 %29, r4388, r4394; +} +{ +add.f16x2 r4400, r4341, r4342; +} +{ +mul.f16x2 r4403, r4400, r4330; +} +{ +add.f16x2 r4406, r4344, r4403; +} +{ +sub.f16x2 r4409, r4335, r4336; +} +{ +mul.f16x2 r4412, r4409, r4332; +} +{ +add.f16x2 %47, r4406, r4412; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4419, {low, high}; +} +{ +neg.f16x2 r4420, r4419; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 %12, r4426, r4422; +} +{ +add.f16x2 r4428, r4429, r4430; +} +{ +add.f16x2 %13, r4432, r4428; +} +{ +add.f16x2 r4434, r4423, r4424; +} +{ +mul.f16x2 r4437, r4434, r4418; +} +{ +add.f16x2 r4440, r4426, r4437; +} +{ +sub.f16x2 r4443, r4429, r4430; +} +{ +mul.f16x2 r4446, r4443, r4420; +} +{ +add.f16x2 %30, r4440, r4446; +} +{ +add.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4418; +} +{ +add.f16x2 r4458, r4426, r4455; +} +{ +sub.f16x2 r4461, r4429, r4430; +} +{ +mul.f16x2 r4464, r4461, r4420; +} +{ +sub.f16x2 %48, r4458, r4464; +} +{ +add.f16x2 r4470, r4429, r4430; +} +{ +mul.f16x2 r4473, r4470, r4418; +} +{ +add.f16x2 r4476, r4432, r4473; +} +{ +sub.f16x2 r4479, r4423, r4424; +} +{ +mul.f16x2 r4482, r4479, r4420; +} +{ +sub.f16x2 %31, r4476, r4482; +} +{ +add.f16x2 r4488, r4429, r4430; +} +{ +mul.f16x2 r4491, r4488, r4418; +} +{ +add.f16x2 r4494, r4432, r4491; +} +{ +sub.f16x2 r4497, r4423, r4424; +} +{ 
+mul.f16x2 r4500, r4497, r4420; +} +{ +add.f16x2 %49, r4494, r4500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4506, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4507, {low, high}; +} +{ +neg.f16x2 r4508, r4507; +} +{ +add.f16x2 r4510, r4511, r4512; +} +{ +add.f16x2 %14, r4514, r4510; +} +{ +add.f16x2 r4516, r4517, r4518; +} +{ +add.f16x2 %15, r4520, r4516; +} +{ +add.f16x2 r4522, r4511, r4512; +} +{ +mul.f16x2 r4525, r4522, r4506; +} +{ +add.f16x2 r4528, r4514, r4525; +} +{ +sub.f16x2 r4531, r4517, r4518; +} +{ +mul.f16x2 r4534, r4531, r4508; +} +{ +add.f16x2 %32, r4528, r4534; +} +{ +add.f16x2 r4540, r4511, r4512; +} +{ +mul.f16x2 r4543, r4540, r4506; +} +{ +add.f16x2 r4546, r4514, r4543; +} +{ +sub.f16x2 r4549, r4517, r4518; +} +{ +mul.f16x2 r4552, r4549, r4508; +} +{ +sub.f16x2 %50, r4546, r4552; +} +{ +add.f16x2 r4558, r4517, r4518; +} +{ +mul.f16x2 r4561, r4558, r4506; +} +{ +add.f16x2 r4564, r4520, r4561; +} +{ +sub.f16x2 r4567, r4511, r4512; +} +{ +mul.f16x2 r4570, r4567, r4508; +} +{ +sub.f16x2 %33, r4564, r4570; +} +{ +add.f16x2 r4576, r4517, r4518; +} +{ +mul.f16x2 r4579, r4576, r4506; +} +{ +add.f16x2 r4582, r4520, r4579; +} +{ +sub.f16x2 r4585, r4511, r4512; +} +{ +mul.f16x2 r4588, r4585, r4508; +} +{ +add.f16x2 %51, r4582, r4588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4595, {low, high}; +} +{ +neg.f16x2 r4596, r4595; +} +{ +add.f16x2 r4598, r4599, r4600; +} +{ +add.f16x2 %16, r4602, r4598; +} +{ +add.f16x2 r4604, r4605, r4606; +} +{ +add.f16x2 %17, r4608, r4604; +} +{ +add.f16x2 r4610, r4599, r4600; +} +{ +mul.f16x2 r4613, r4610, r4594; +} +{ +add.f16x2 r4616, r4602, r4613; +} +{ +sub.f16x2 r4619, r4605, r4606; +} +{ +mul.f16x2 r4622, r4619, r4596; +} +{ +add.f16x2 %34, r4616, 
r4622; +} +{ +add.f16x2 r4628, r4599, r4600; +} +{ +mul.f16x2 r4631, r4628, r4594; +} +{ +add.f16x2 r4634, r4602, r4631; +} +{ +sub.f16x2 r4637, r4605, r4606; +} +{ +mul.f16x2 r4640, r4637, r4596; +} +{ +sub.f16x2 %52, r4634, r4640; +} +{ +add.f16x2 r4646, r4605, r4606; +} +{ +mul.f16x2 r4649, r4646, r4594; +} +{ +add.f16x2 r4652, r4608, r4649; +} +{ +sub.f16x2 r4655, r4599, r4600; +} +{ +mul.f16x2 r4658, r4655, r4596; +} +{ +sub.f16x2 %35, r4652, r4658; +} +{ +add.f16x2 r4664, r4605, r4606; +} +{ +mul.f16x2 r4667, r4664, r4594; +} +{ +add.f16x2 r4670, r4608, r4667; +} +{ +sub.f16x2 r4673, r4599, r4600; +} +{ +mul.f16x2 r4676, r4673, r4596; +} +{ +add.f16x2 %53, r4670, r4676; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), 
"=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), 
"r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<874, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<404>; +.reg .b32 r<4748>; +.reg .b64 rd<4>; +mov.u32 r4746, %tid.y; +mov.u32 r4747, %54; +mad.lo.s32 r4684, r4746, 324, r4747; +mov.u32 r4685, %tid.x; +mov.f32 f398, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1, {low, high}; +} +mov.f32 f400, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %74, %58; +} +{ +add.f16x2 r8, %83, r5; +} +{ +add.f16x2 r11, %91, %73; +} +{ +add.f16x2 r14, %97, r11; +} +{ +add.f16x2 r17, %74, %58; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %83, r20; +} +{ +sub.f16x2 r26, %91, %73; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %74, %58; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %83, r38; +} +{ +sub.f16x2 r44, %91, %73; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %91, %73; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %97, r56; +} +{ +sub.f16x2 r62, %74, %58; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %91, %73; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %97, r74; +} +{ +sub.f16x2 r80, %74, %58; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r90, {low, high}; +} +{ +neg.f16x2 r91, r90; +} +{ +add.f16x2 r93, %98, %81; +} +{ 
+add.f16x2 r96, %106, r93; +} +{ +add.f16x2 r99, %57, %95; +} +{ +add.f16x2 r102, %63, r99; +} +{ +add.f16x2 r105, %98, %81; +} +{ +mul.f16x2 r108, r105, r89; +} +{ +add.f16x2 r111, %106, r108; +} +{ +sub.f16x2 r114, %57, %95; +} +{ +mul.f16x2 r117, r114, r91; +} +{ +add.f16x2 r120, r111, r117; +} +{ +add.f16x2 r123, %98, %81; +} +{ +mul.f16x2 r126, r123, r89; +} +{ +add.f16x2 r129, %106, r126; +} +{ +sub.f16x2 r132, %57, %95; +} +{ +mul.f16x2 r135, r132, r91; +} +{ +sub.f16x2 r138, r129, r135; +} +{ +add.f16x2 r141, %57, %95; +} +{ +mul.f16x2 r144, r141, r89; +} +{ +add.f16x2 r147, %63, r144; +} +{ +sub.f16x2 r150, %98, %81; +} +{ +mul.f16x2 r153, r150, r91; +} +{ +sub.f16x2 r156, r147, r153; +} +{ +add.f16x2 r159, %57, %95; +} +{ +mul.f16x2 r162, r159, r89; +} +{ +add.f16x2 r165, %63, r162; +} +{ +sub.f16x2 r168, %98, %81; +} +{ +mul.f16x2 r171, r168, r91; +} +{ +add.f16x2 r174, r165, r171; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r177, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r178, {low, high}; +} +{ +neg.f16x2 r179, r178; +} +{ +add.f16x2 r181, %64, %103; +} +{ +add.f16x2 r184, %72, r181; +} +{ +add.f16x2 r187, %77, %61; +} +{ +add.f16x2 r190, %88, r187; +} +{ +add.f16x2 r193, %64, %103; +} +{ +mul.f16x2 r196, r193, r177; +} +{ +add.f16x2 r199, %72, r196; +} +{ +sub.f16x2 r202, %77, %61; +} +{ +mul.f16x2 r205, r202, r179; +} +{ +add.f16x2 r208, r199, r205; +} +{ +add.f16x2 r211, %64, %103; +} +{ +mul.f16x2 r214, r211, r177; +} +{ +add.f16x2 r217, %72, r214; +} +{ +sub.f16x2 r220, %77, %61; +} +{ +mul.f16x2 r223, r220, r179; +} +{ +sub.f16x2 r226, r217, r223; +} +{ +add.f16x2 r229, %77, %61; +} +{ +mul.f16x2 r232, r229, r177; +} +{ +add.f16x2 r235, %88, r232; +} +{ +sub.f16x2 r238, %64, %103; +} +{ +mul.f16x2 r241, r238, r179; +} +{ +sub.f16x2 r244, r235, r241; +} +{ +add.f16x2 r247, %77, %61; +} +{ +mul.f16x2 r250, r247, r177; +} +{ +add.f16x2 
r253, %88, r250; +} +{ +sub.f16x2 r256, %64, %103; +} +{ +mul.f16x2 r259, r256, r179; +} +{ +add.f16x2 r262, r253, r259; +} +mov.f32 f178, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r265, {low, high}; +} +mov.f32 f180, 0fBF248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r266, {low, high}; +} +mov.f32 f190, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r267, {low, high}; +} +mov.f32 f192, 0fBF7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r268, {low, high}; +} +mov.f32 f214, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r271, {low, high}; +} +mov.f32 f216, 0fBEAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r272, {low, high}; +} +{ +mul.f16x2 r281, r120, r265; +} +{ +mul.f16x2 r284, r156, r266; +} +{ +sub.f16x2 r287, r281, r284; +} +{ +mul.f16x2 r290, r120, r266; +} +{ +fma.rn.f16x2 r293, r156, r265, r290; +} +{ +mul.f16x2 r297, r208, r267; +} +{ +mul.f16x2 r300, r244, r268; +} +{ +sub.f16x2 r303, r297, r300; +} +{ +mul.f16x2 r306, r208, r268; +} +{ +fma.rn.f16x2 r309, r244, r267, r306; +} +{ +mul.f16x2 r313, r138, r267; +} +{ +mul.f16x2 r316, r174, r268; +} +{ +sub.f16x2 r319, r313, r316; +} +{ +mul.f16x2 r322, r138, r268; +} +{ +fma.rn.f16x2 r325, r174, r267, r322; +} +{ +mul.f16x2 r329, r226, r271; +} +{ +mul.f16x2 r332, r262, r272; +} +{ +sub.f16x2 r335, r329, r332; +} +{ +mul.f16x2 r338, r226, r272; +} +{ +fma.rn.f16x2 r341, r262, r271, r338; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r345, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r346, {low, high}; +} +{ +neg.f16x2 r347, r346; +} +{ +add.f16x2 r349, r96, r184; +} +{ +add.f16x2 r352, r8, 
r349; +} +{ +add.f16x2 r355, r102, r190; +} +{ +add.f16x2 r358, r14, r355; +} +{ +add.f16x2 r361, r96, r184; +} +{ +mul.f16x2 r364, r361, r345; +} +{ +add.f16x2 r367, r8, r364; +} +{ +sub.f16x2 r370, r102, r190; +} +{ +mul.f16x2 r373, r370, r347; +} +{ +add.f16x2 r376, r367, r373; +} +{ +add.f16x2 r379, r96, r184; +} +{ +mul.f16x2 r382, r379, r345; +} +{ +add.f16x2 r385, r8, r382; +} +{ +sub.f16x2 r388, r102, r190; +} +{ +mul.f16x2 r391, r388, r347; +} +{ +sub.f16x2 r394, r385, r391; +} +{ +add.f16x2 r397, r102, r190; +} +{ +mul.f16x2 r400, r397, r345; +} +{ +add.f16x2 r403, r14, r400; +} +{ +sub.f16x2 r406, r96, r184; +} +{ +mul.f16x2 r409, r406, r347; +} +{ +sub.f16x2 r412, r403, r409; +} +{ +add.f16x2 r415, r102, r190; +} +{ +mul.f16x2 r418, r415, r345; +} +{ +add.f16x2 r421, r14, r418; +} +{ +sub.f16x2 r424, r96, r184; +} +{ +mul.f16x2 r427, r424, r347; +} +{ +add.f16x2 r430, r421, r427; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r433, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r434, {low, high}; +} +{ +neg.f16x2 r435, r434; +} +{ +add.f16x2 r437, r287, r303; +} +{ +add.f16x2 r440, r32, r437; +} +{ +add.f16x2 r443, r293, r309; +} +{ +add.f16x2 r446, r68, r443; +} +{ +add.f16x2 r449, r287, r303; +} +{ +mul.f16x2 r452, r449, r433; +} +{ +add.f16x2 r455, r32, r452; +} +{ +sub.f16x2 r458, r293, r309; +} +{ +mul.f16x2 r461, r458, r435; +} +{ +add.f16x2 r464, r455, r461; +} +{ +add.f16x2 r467, r287, r303; +} +{ +mul.f16x2 r470, r467, r433; +} +{ +add.f16x2 r473, r32, r470; +} +{ +sub.f16x2 r476, r293, r309; +} +{ +mul.f16x2 r479, r476, r435; +} +{ +sub.f16x2 r482, r473, r479; +} +{ +add.f16x2 r485, r293, r309; +} +{ +mul.f16x2 r488, r485, r433; +} +{ +add.f16x2 r491, r68, r488; +} +{ +sub.f16x2 r494, r287, r303; +} +{ +mul.f16x2 r497, r494, r435; +} +{ +sub.f16x2 r500, r491, r497; +} +{ +add.f16x2 r503, r293, r309; +} +{ +mul.f16x2 r506, r503, r433; +} +{ 
+add.f16x2 r509, r68, r506; +} +{ +sub.f16x2 r512, r287, r303; +} +{ +mul.f16x2 r515, r512, r435; +} +{ +add.f16x2 r518, r509, r515; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r521, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r522, {low, high}; +} +{ +neg.f16x2 r523, r522; +} +{ +add.f16x2 r525, r319, r335; +} +{ +add.f16x2 r528, r50, r525; +} +{ +add.f16x2 r531, r325, r341; +} +{ +add.f16x2 r534, r86, r531; +} +{ +add.f16x2 r537, r319, r335; +} +{ +mul.f16x2 r540, r537, r521; +} +{ +add.f16x2 r543, r50, r540; +} +{ +sub.f16x2 r546, r325, r341; +} +{ +mul.f16x2 r549, r546, r523; +} +{ +add.f16x2 r552, r543, r549; +} +{ +add.f16x2 r555, r319, r335; +} +{ +mul.f16x2 r558, r555, r521; +} +{ +add.f16x2 r561, r50, r558; +} +{ +sub.f16x2 r564, r325, r341; +} +{ +mul.f16x2 r567, r564, r523; +} +{ +sub.f16x2 r570, r561, r567; +} +{ +add.f16x2 r573, r325, r341; +} +{ +mul.f16x2 r576, r573, r521; +} +{ +add.f16x2 r579, r86, r576; +} +{ +sub.f16x2 r582, r319, r335; +} +{ +mul.f16x2 r585, r582, r523; +} +{ +sub.f16x2 r588, r579, r585; +} +{ +add.f16x2 r591, r325, r341; +} +{ +mul.f16x2 r594, r591, r521; +} +{ +add.f16x2 r597, r86, r594; +} +{ +sub.f16x2 r600, r319, r335; +} +{ +mul.f16x2 r603, r600, r523; +} +{ +add.f16x2 r606, r597, r603; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r609, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r610, {low, high}; +} +{ +neg.f16x2 r611, r610; +} +{ +add.f16x2 r613, %78, %60; +} +{ +add.f16x2 r616, %86, r613; +} +{ +add.f16x2 r619, %93, %76; +} +{ +add.f16x2 r622, %100, r619; +} +{ +add.f16x2 r625, %78, %60; +} +{ +mul.f16x2 r628, r625, r609; +} +{ +add.f16x2 r631, %86, r628; +} +{ +sub.f16x2 r634, %93, %76; +} +{ +mul.f16x2 r637, r634, r611; +} +{ +add.f16x2 r640, r631, r637; +} +{ +add.f16x2 r643, %78, %60; +} +{ +mul.f16x2 
r646, r643, r609; +} +{ +add.f16x2 r649, %86, r646; +} +{ +sub.f16x2 r652, %93, %76; +} +{ +mul.f16x2 r655, r652, r611; +} +{ +sub.f16x2 r658, r649, r655; +} +{ +add.f16x2 r661, %93, %76; +} +{ +mul.f16x2 r664, r661, r609; +} +{ +add.f16x2 r667, %100, r664; +} +{ +sub.f16x2 r670, %78, %60; +} +{ +mul.f16x2 r673, r670, r611; +} +{ +sub.f16x2 r676, r667, r673; +} +{ +add.f16x2 r679, %93, %76; +} +{ +mul.f16x2 r682, r679, r609; +} +{ +add.f16x2 r685, %100, r682; +} +{ +sub.f16x2 r688, %78, %60; +} +{ +mul.f16x2 r691, r688, r611; +} +{ +add.f16x2 r694, r685, r691; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r697, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r698, {low, high}; +} +{ +neg.f16x2 r699, r698; +} +{ +add.f16x2 r701, %101, %85; +} +{ +add.f16x2 r704, %108, r701; +} +{ +add.f16x2 r707, %59, %99; +} +{ +add.f16x2 r710, %68, r707; +} +{ +add.f16x2 r713, %101, %85; +} +{ +mul.f16x2 r716, r713, r697; +} +{ +add.f16x2 r719, %108, r716; +} +{ +sub.f16x2 r722, %59, %99; +} +{ +mul.f16x2 r725, r722, r699; +} +{ +add.f16x2 r728, r719, r725; +} +{ +add.f16x2 r731, %101, %85; +} +{ +mul.f16x2 r734, r731, r697; +} +{ +add.f16x2 r737, %108, r734; +} +{ +sub.f16x2 r740, %59, %99; +} +{ +mul.f16x2 r743, r740, r699; +} +{ +sub.f16x2 r746, r737, r743; +} +{ +add.f16x2 r749, %59, %99; +} +{ +mul.f16x2 r752, r749, r697; +} +{ +add.f16x2 r755, %68, r752; +} +{ +sub.f16x2 r758, %101, %85; +} +{ +mul.f16x2 r761, r758, r699; +} +{ +sub.f16x2 r764, r755, r761; +} +{ +add.f16x2 r767, %59, %99; +} +{ +mul.f16x2 r770, r767, r697; +} +{ +add.f16x2 r773, %68, r770; +} +{ +sub.f16x2 r776, %101, %85; +} +{ +mul.f16x2 r779, r776, r699; +} +{ +add.f16x2 r782, r773, r779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r786, {low, 
high}; +} +{ +neg.f16x2 r787, r786; +} +{ +add.f16x2 r789, %67, %107; +} +{ +add.f16x2 r792, %75, r789; +} +{ +add.f16x2 r795, %84, %66; +} +{ +add.f16x2 r798, %92, r795; +} +{ +add.f16x2 r801, %67, %107; +} +{ +mul.f16x2 r804, r801, r785; +} +{ +add.f16x2 r807, %75, r804; +} +{ +sub.f16x2 r810, %84, %66; +} +{ +mul.f16x2 r813, r810, r787; +} +{ +add.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %67, %107; +} +{ +mul.f16x2 r822, r819, r785; +} +{ +add.f16x2 r825, %75, r822; +} +{ +sub.f16x2 r828, %84, %66; +} +{ +mul.f16x2 r831, r828, r787; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %84, %66; +} +{ +mul.f16x2 r840, r837, r785; +} +{ +add.f16x2 r843, %92, r840; +} +{ +sub.f16x2 r846, %67, %107; +} +{ +mul.f16x2 r849, r846, r787; +} +{ +sub.f16x2 r852, r843, r849; +} +{ +add.f16x2 r855, %84, %66; +} +{ +mul.f16x2 r858, r855, r785; +} +{ +add.f16x2 r861, %92, r858; +} +{ +sub.f16x2 r864, %67, %107; +} +{ +mul.f16x2 r867, r864, r787; +} +{ +add.f16x2 r870, r861, r867; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r873, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r874, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r875, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r876, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r879, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r880, {low, high}; +} +{ +mul.f16x2 r889, r728, r873; +} +{ +mul.f16x2 r892, r764, r874; +} +{ +sub.f16x2 r895, r889, r892; +} +{ +mul.f16x2 r898, r728, r874; +} +{ +fma.rn.f16x2 r901, r764, r873, r898; +} +{ +mul.f16x2 r905, r816, r875; +} +{ +mul.f16x2 r908, r852, r876; +} +{ +sub.f16x2 r911, r905, r908; +} +{ +mul.f16x2 r914, r816, r876; +} +{ 
+fma.rn.f16x2 r917, r852, r875, r914; +} +{ +mul.f16x2 r921, r746, r875; +} +{ +mul.f16x2 r924, r782, r876; +} +{ +sub.f16x2 r927, r921, r924; +} +{ +mul.f16x2 r930, r746, r876; +} +{ +fma.rn.f16x2 r933, r782, r875, r930; +} +{ +mul.f16x2 r937, r834, r879; +} +{ +mul.f16x2 r940, r870, r880; +} +{ +sub.f16x2 r943, r937, r940; +} +{ +mul.f16x2 r946, r834, r880; +} +{ +fma.rn.f16x2 r949, r870, r879, r946; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r953, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r954, {low, high}; +} +{ +neg.f16x2 r955, r954; +} +{ +add.f16x2 r957, r704, r792; +} +{ +add.f16x2 r960, r616, r957; +} +{ +add.f16x2 r963, r710, r798; +} +{ +add.f16x2 r966, r622, r963; +} +{ +add.f16x2 r969, r704, r792; +} +{ +mul.f16x2 r972, r969, r953; +} +{ +add.f16x2 r975, r616, r972; +} +{ +sub.f16x2 r978, r710, r798; +} +{ +mul.f16x2 r981, r978, r955; +} +{ +add.f16x2 r984, r975, r981; +} +{ +add.f16x2 r987, r704, r792; +} +{ +mul.f16x2 r990, r987, r953; +} +{ +add.f16x2 r993, r616, r990; +} +{ +sub.f16x2 r996, r710, r798; +} +{ +mul.f16x2 r999, r996, r955; +} +{ +sub.f16x2 r1002, r993, r999; +} +{ +add.f16x2 r1005, r710, r798; +} +{ +mul.f16x2 r1008, r1005, r953; +} +{ +add.f16x2 r1011, r622, r1008; +} +{ +sub.f16x2 r1014, r704, r792; +} +{ +mul.f16x2 r1017, r1014, r955; +} +{ +sub.f16x2 r1020, r1011, r1017; +} +{ +add.f16x2 r1023, r710, r798; +} +{ +mul.f16x2 r1026, r1023, r953; +} +{ +add.f16x2 r1029, r622, r1026; +} +{ +sub.f16x2 r1032, r704, r792; +} +{ +mul.f16x2 r1035, r1032, r955; +} +{ +add.f16x2 r1038, r1029, r1035; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1041, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1042, {low, high}; +} +{ +neg.f16x2 r1043, r1042; +} +{ +add.f16x2 r1045, r895, r911; +} +{ +add.f16x2 r1048, r640, r1045; +} +{ 
+add.f16x2 r1051, r901, r917; +} +{ +add.f16x2 r1054, r676, r1051; +} +{ +add.f16x2 r1057, r895, r911; +} +{ +mul.f16x2 r1060, r1057, r1041; +} +{ +add.f16x2 r1063, r640, r1060; +} +{ +sub.f16x2 r1066, r901, r917; +} +{ +mul.f16x2 r1069, r1066, r1043; +} +{ +add.f16x2 r1072, r1063, r1069; +} +{ +add.f16x2 r1075, r895, r911; +} +{ +mul.f16x2 r1078, r1075, r1041; +} +{ +add.f16x2 r1081, r640, r1078; +} +{ +sub.f16x2 r1084, r901, r917; +} +{ +mul.f16x2 r1087, r1084, r1043; +} +{ +sub.f16x2 r1090, r1081, r1087; +} +{ +add.f16x2 r1093, r901, r917; +} +{ +mul.f16x2 r1096, r1093, r1041; +} +{ +add.f16x2 r1099, r676, r1096; +} +{ +sub.f16x2 r1102, r895, r911; +} +{ +mul.f16x2 r1105, r1102, r1043; +} +{ +sub.f16x2 r1108, r1099, r1105; +} +{ +add.f16x2 r1111, r901, r917; +} +{ +mul.f16x2 r1114, r1111, r1041; +} +{ +add.f16x2 r1117, r676, r1114; +} +{ +sub.f16x2 r1120, r895, r911; +} +{ +mul.f16x2 r1123, r1120, r1043; +} +{ +add.f16x2 r1126, r1117, r1123; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1129, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1130, {low, high}; +} +{ +neg.f16x2 r1131, r1130; +} +{ +add.f16x2 r1133, r927, r943; +} +{ +add.f16x2 r1136, r658, r1133; +} +{ +add.f16x2 r1139, r933, r949; +} +{ +add.f16x2 r1142, r694, r1139; +} +{ +add.f16x2 r1145, r927, r943; +} +{ +mul.f16x2 r1148, r1145, r1129; +} +{ +add.f16x2 r1151, r658, r1148; +} +{ +sub.f16x2 r1154, r933, r949; +} +{ +mul.f16x2 r1157, r1154, r1131; +} +{ +add.f16x2 r1160, r1151, r1157; +} +{ +add.f16x2 r1163, r927, r943; +} +{ +mul.f16x2 r1166, r1163, r1129; +} +{ +add.f16x2 r1169, r658, r1166; +} +{ +sub.f16x2 r1172, r933, r949; +} +{ +mul.f16x2 r1175, r1172, r1131; +} +{ +sub.f16x2 r1178, r1169, r1175; +} +{ +add.f16x2 r1181, r933, r949; +} +{ +mul.f16x2 r1184, r1181, r1129; +} +{ +add.f16x2 r1187, r694, r1184; +} +{ +sub.f16x2 r1190, r927, r943; +} +{ +mul.f16x2 r1193, r1190, r1131; +} +{ 
+sub.f16x2 r1196, r1187, r1193; +} +{ +add.f16x2 r1199, r933, r949; +} +{ +mul.f16x2 r1202, r1199, r1129; +} +{ +add.f16x2 r1205, r694, r1202; +} +{ +sub.f16x2 r1208, r927, r943; +} +{ +mul.f16x2 r1211, r1208, r1131; +} +{ +add.f16x2 r1214, r1205, r1211; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1217, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1218, {low, high}; +} +{ +neg.f16x2 r1219, r1218; +} +{ +add.f16x2 r1221, %82, %65; +} +{ +add.f16x2 r1224, %90, r1221; +} +{ +add.f16x2 r1227, %96, %80; +} +{ +add.f16x2 r1230, %104, r1227; +} +{ +add.f16x2 r1233, %82, %65; +} +{ +mul.f16x2 r1236, r1233, r1217; +} +{ +add.f16x2 r1239, %90, r1236; +} +{ +sub.f16x2 r1242, %96, %80; +} +{ +mul.f16x2 r1245, r1242, r1219; +} +{ +add.f16x2 r1248, r1239, r1245; +} +{ +add.f16x2 r1251, %82, %65; +} +{ +mul.f16x2 r1254, r1251, r1217; +} +{ +add.f16x2 r1257, %90, r1254; +} +{ +sub.f16x2 r1260, %96, %80; +} +{ +mul.f16x2 r1263, r1260, r1219; +} +{ +sub.f16x2 r1266, r1257, r1263; +} +{ +add.f16x2 r1269, %96, %80; +} +{ +mul.f16x2 r1272, r1269, r1217; +} +{ +add.f16x2 r1275, %104, r1272; +} +{ +sub.f16x2 r1278, %82, %65; +} +{ +mul.f16x2 r1281, r1278, r1219; +} +{ +sub.f16x2 r1284, r1275, r1281; +} +{ +add.f16x2 r1287, %96, %80; +} +{ +mul.f16x2 r1290, r1287, r1217; +} +{ +add.f16x2 r1293, %104, r1290; +} +{ +sub.f16x2 r1296, %82, %65; +} +{ +mul.f16x2 r1299, r1296, r1219; +} +{ +add.f16x2 r1302, r1293, r1299; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1305, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1306, {low, high}; +} +{ +neg.f16x2 r1307, r1306; +} +{ +add.f16x2 r1309, %105, %89; +} +{ +add.f16x2 r1312, %56, r1309; +} +{ +add.f16x2 r1315, %62, %102; +} +{ +add.f16x2 r1318, %70, r1315; +} +{ +add.f16x2 r1321, %105, %89; +} +{ +mul.f16x2 r1324, r1321, r1305; +} 
+{ +add.f16x2 r1327, %56, r1324; +} +{ +sub.f16x2 r1330, %62, %102; +} +{ +mul.f16x2 r1333, r1330, r1307; +} +{ +add.f16x2 r1336, r1327, r1333; +} +{ +add.f16x2 r1339, %105, %89; +} +{ +mul.f16x2 r1342, r1339, r1305; +} +{ +add.f16x2 r1345, %56, r1342; +} +{ +sub.f16x2 r1348, %62, %102; +} +{ +mul.f16x2 r1351, r1348, r1307; +} +{ +sub.f16x2 r1354, r1345, r1351; +} +{ +add.f16x2 r1357, %62, %102; +} +{ +mul.f16x2 r1360, r1357, r1305; +} +{ +add.f16x2 r1363, %70, r1360; +} +{ +sub.f16x2 r1366, %105, %89; +} +{ +mul.f16x2 r1369, r1366, r1307; +} +{ +sub.f16x2 r1372, r1363, r1369; +} +{ +add.f16x2 r1375, %62, %102; +} +{ +mul.f16x2 r1378, r1375, r1305; +} +{ +add.f16x2 r1381, %70, r1378; +} +{ +sub.f16x2 r1384, %105, %89; +} +{ +mul.f16x2 r1387, r1384, r1307; +} +{ +add.f16x2 r1390, r1381, r1387; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1393, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1394, {low, high}; +} +{ +neg.f16x2 r1395, r1394; +} +{ +add.f16x2 r1397, %71, %55; +} +{ +add.f16x2 r1400, %79, r1397; +} +{ +add.f16x2 r1403, %87, %69; +} +{ +add.f16x2 r1406, %94, r1403; +} +{ +add.f16x2 r1409, %71, %55; +} +{ +mul.f16x2 r1412, r1409, r1393; +} +{ +add.f16x2 r1415, %79, r1412; +} +{ +sub.f16x2 r1418, %87, %69; +} +{ +mul.f16x2 r1421, r1418, r1395; +} +{ +add.f16x2 r1424, r1415, r1421; +} +{ +add.f16x2 r1427, %71, %55; +} +{ +mul.f16x2 r1430, r1427, r1393; +} +{ +add.f16x2 r1433, %79, r1430; +} +{ +sub.f16x2 r1436, %87, %69; +} +{ +mul.f16x2 r1439, r1436, r1395; +} +{ +sub.f16x2 r1442, r1433, r1439; +} +{ +add.f16x2 r1445, %87, %69; +} +{ +mul.f16x2 r1448, r1445, r1393; +} +{ +add.f16x2 r1451, %94, r1448; +} +{ +sub.f16x2 r1454, %71, %55; +} +{ +mul.f16x2 r1457, r1454, r1395; +} +{ +sub.f16x2 r1460, r1451, r1457; +} +{ +add.f16x2 r1463, %87, %69; +} +{ +mul.f16x2 r1466, r1463, r1393; +} +{ +add.f16x2 r1469, %94, r1466; +} +{ +sub.f16x2 r1472, %71, %55; +} 
+{ +mul.f16x2 r1475, r1472, r1395; +} +{ +add.f16x2 r1478, r1469, r1475; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1481, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1482, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1483, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1484, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1487, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1488, {low, high}; +} +{ +mul.f16x2 r1497, r1336, r1481; +} +{ +mul.f16x2 r1500, r1372, r1482; +} +{ +sub.f16x2 r1503, r1497, r1500; +} +{ +mul.f16x2 r1506, r1336, r1482; +} +{ +fma.rn.f16x2 r1509, r1372, r1481, r1506; +} +{ +mul.f16x2 r1513, r1424, r1483; +} +{ +mul.f16x2 r1516, r1460, r1484; +} +{ +sub.f16x2 r1519, r1513, r1516; +} +{ +mul.f16x2 r1522, r1424, r1484; +} +{ +fma.rn.f16x2 r1525, r1460, r1483, r1522; +} +{ +mul.f16x2 r1529, r1354, r1483; +} +{ +mul.f16x2 r1532, r1390, r1484; +} +{ +sub.f16x2 r1535, r1529, r1532; +} +{ +mul.f16x2 r1538, r1354, r1484; +} +{ +fma.rn.f16x2 r1541, r1390, r1483, r1538; +} +{ +mul.f16x2 r1545, r1442, r1487; +} +{ +mul.f16x2 r1548, r1478, r1488; +} +{ +sub.f16x2 r1551, r1545, r1548; +} +{ +mul.f16x2 r1554, r1442, r1488; +} +{ +fma.rn.f16x2 r1557, r1478, r1487, r1554; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1561, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1562, {low, high}; +} +{ +neg.f16x2 r1563, r1562; +} +{ +add.f16x2 r1565, r1312, r1400; +} +{ +add.f16x2 r1568, r1224, r1565; +} +{ +add.f16x2 r1571, r1318, r1406; +} +{ +add.f16x2 r1574, r1230, r1571; +} +{ +add.f16x2 r1577, r1312, r1400; +} +{ 
+mul.f16x2 r1580, r1577, r1561; +} +{ +add.f16x2 r1583, r1224, r1580; +} +{ +sub.f16x2 r1586, r1318, r1406; +} +{ +mul.f16x2 r1589, r1586, r1563; +} +{ +add.f16x2 r1592, r1583, r1589; +} +{ +add.f16x2 r1595, r1312, r1400; +} +{ +mul.f16x2 r1598, r1595, r1561; +} +{ +add.f16x2 r1601, r1224, r1598; +} +{ +sub.f16x2 r1604, r1318, r1406; +} +{ +mul.f16x2 r1607, r1604, r1563; +} +{ +sub.f16x2 r1610, r1601, r1607; +} +{ +add.f16x2 r1613, r1318, r1406; +} +{ +mul.f16x2 r1616, r1613, r1561; +} +{ +add.f16x2 r1619, r1230, r1616; +} +{ +sub.f16x2 r1622, r1312, r1400; +} +{ +mul.f16x2 r1625, r1622, r1563; +} +{ +sub.f16x2 r1628, r1619, r1625; +} +{ +add.f16x2 r1631, r1318, r1406; +} +{ +mul.f16x2 r1634, r1631, r1561; +} +{ +add.f16x2 r1637, r1230, r1634; +} +{ +sub.f16x2 r1640, r1312, r1400; +} +{ +mul.f16x2 r1643, r1640, r1563; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1649, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1650, {low, high}; +} +{ +neg.f16x2 r1651, r1650; +} +{ +add.f16x2 r1653, r1503, r1519; +} +{ +add.f16x2 r1656, r1248, r1653; +} +{ +add.f16x2 r1659, r1509, r1525; +} +{ +add.f16x2 r1662, r1284, r1659; +} +{ +add.f16x2 r1665, r1503, r1519; +} +{ +mul.f16x2 r1668, r1665, r1649; +} +{ +add.f16x2 r1671, r1248, r1668; +} +{ +sub.f16x2 r1674, r1509, r1525; +} +{ +mul.f16x2 r1677, r1674, r1651; +} +{ +add.f16x2 r1680, r1671, r1677; +} +{ +add.f16x2 r1683, r1503, r1519; +} +{ +mul.f16x2 r1686, r1683, r1649; +} +{ +add.f16x2 r1689, r1248, r1686; +} +{ +sub.f16x2 r1692, r1509, r1525; +} +{ +mul.f16x2 r1695, r1692, r1651; +} +{ +sub.f16x2 r1698, r1689, r1695; +} +{ +add.f16x2 r1701, r1509, r1525; +} +{ +mul.f16x2 r1704, r1701, r1649; +} +{ +add.f16x2 r1707, r1284, r1704; +} +{ +sub.f16x2 r1710, r1503, r1519; +} +{ +mul.f16x2 r1713, r1710, r1651; +} +{ +sub.f16x2 r1716, r1707, r1713; +} +{ +add.f16x2 r1719, r1509, r1525; +} 
+{ +mul.f16x2 r1722, r1719, r1649; +} +{ +add.f16x2 r1725, r1284, r1722; +} +{ +sub.f16x2 r1728, r1503, r1519; +} +{ +mul.f16x2 r1731, r1728, r1651; +} +{ +add.f16x2 r1734, r1725, r1731; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1737, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1738, {low, high}; +} +{ +neg.f16x2 r1739, r1738; +} +{ +add.f16x2 r1741, r1535, r1551; +} +{ +add.f16x2 r1744, r1266, r1741; +} +{ +add.f16x2 r1747, r1541, r1557; +} +{ +add.f16x2 r1750, r1302, r1747; +} +{ +add.f16x2 r1753, r1535, r1551; +} +{ +mul.f16x2 r1756, r1753, r1737; +} +{ +add.f16x2 r1759, r1266, r1756; +} +{ +sub.f16x2 r1762, r1541, r1557; +} +{ +mul.f16x2 r1765, r1762, r1739; +} +{ +add.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1535, r1551; +} +{ +mul.f16x2 r1774, r1771, r1737; +} +{ +add.f16x2 r1777, r1266, r1774; +} +{ +sub.f16x2 r1780, r1541, r1557; +} +{ +mul.f16x2 r1783, r1780, r1739; +} +{ +sub.f16x2 r1786, r1777, r1783; +} +{ +add.f16x2 r1789, r1541, r1557; +} +{ +mul.f16x2 r1792, r1789, r1737; +} +{ +add.f16x2 r1795, r1302, r1792; +} +{ +sub.f16x2 r1798, r1535, r1551; +} +{ +mul.f16x2 r1801, r1798, r1739; +} +{ +sub.f16x2 r1804, r1795, r1801; +} +{ +add.f16x2 r1807, r1541, r1557; +} +{ +mul.f16x2 r1810, r1807, r1737; +} +{ +add.f16x2 r1813, r1302, r1810; +} +{ +sub.f16x2 r1816, r1535, r1551; +} +{ +mul.f16x2 r1819, r1816, r1739; +} +{ +add.f16x2 r1822, r1813, r1819; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1825, {low, high}; +} +mov.f32 f172, 0fBE6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1826, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1827, {low, high}; +} +mov.f32 f176, 0fBEE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, 
f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1828, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1829, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1830, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1831, {low, high}; +} +mov.f32 f184, 0fBF4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1832, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1833, {low, high}; +} +mov.f32 f188, 0fBF6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1834, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1835, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1836, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1837, {low, high}; +} +mov.f32 f196, 0fBF7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1838, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1839, {low, high}; +} +mov.f32 f200, 0fBF753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1840, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1843, {low, high}; +} +mov.f32 f208, 0fBF3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1844, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 
high, f214; +mov.b32 r1847, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1848, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1851, {low, high}; +} +mov.f32 f224, 0f3DEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1852, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1855, {low, high}; +} +mov.f32 f232, 0f3F0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1856, {low, high}; +} +{ +mul.f16x2 r1877, r1048, r1825; +} +{ +mul.f16x2 r1880, r1054, r1826; +} +{ +sub.f16x2 r1883, r1877, r1880; +} +{ +mul.f16x2 r1886, r1048, r1826; +} +{ +fma.rn.f16x2 r1889, r1054, r1825, r1886; +} +{ +mul.f16x2 r1893, r1656, r1827; +} +{ +mul.f16x2 r1896, r1662, r1828; +} +{ +sub.f16x2 r1899, r1893, r1896; +} +{ +mul.f16x2 r1902, r1656, r1828; +} +{ +fma.rn.f16x2 r1905, r1662, r1827, r1902; +} +{ +mul.f16x2 r1909, r1136, r1827; +} +{ +mul.f16x2 r1912, r1142, r1828; +} +{ +sub.f16x2 r1915, r1909, r1912; +} +{ +mul.f16x2 r1918, r1136, r1828; +} +{ +fma.rn.f16x2 r1921, r1142, r1827, r1918; +} +{ +mul.f16x2 r1925, r1744, r1831; +} +{ +mul.f16x2 r1928, r1750, r1832; +} +{ +sub.f16x2 r1931, r1925, r1928; +} +{ +mul.f16x2 r1934, r1744, r1832; +} +{ +fma.rn.f16x2 r1937, r1750, r1831, r1934; +} +{ +mul.f16x2 r1941, r984, r1829; +} +{ +mul.f16x2 r1944, r1020, r1830; +} +{ +sub.f16x2 r1947, r1941, r1944; +} +{ +mul.f16x2 r1950, r984, r1830; +} +{ +fma.rn.f16x2 r1953, r1020, r1829, r1950; +} +{ +mul.f16x2 r1957, r1592, r1835; +} +{ +mul.f16x2 r1960, r1628, r1836; +} +{ +sub.f16x2 r1963, r1957, r1960; +} +{ +mul.f16x2 r1966, r1592, r1836; +} +{ +fma.rn.f16x2 r1969, r1628, r1835, r1966; +} +{ +mul.f16x2 r1973, r1072, r1831; +} +{ +mul.f16x2 r1976, r1108, r1832; +} +{ +sub.f16x2 r1979, r1973, r1976; +} +{ 
+mul.f16x2 r1982, r1072, r1832; +} +{ +fma.rn.f16x2 r1985, r1108, r1831, r1982; +} +{ +mul.f16x2 r1989, r1680, r1839; +} +{ +mul.f16x2 r1992, r1716, r1840; +} +{ +sub.f16x2 r1995, r1989, r1992; +} +{ +mul.f16x2 r1998, r1680, r1840; +} +{ +fma.rn.f16x2 r2001, r1716, r1839, r1998; +} +{ +mul.f16x2 r2005, r1160, r1833; +} +{ +mul.f16x2 r2008, r1196, r1834; +} +{ +sub.f16x2 r2011, r2005, r2008; +} +{ +mul.f16x2 r2014, r1160, r1834; +} +{ +fma.rn.f16x2 r2017, r1196, r1833, r2014; +} +{ +mul.f16x2 r2021, r1768, r1843; +} +{ +mul.f16x2 r2024, r1804, r1844; +} +{ +sub.f16x2 r2027, r2021, r2024; +} +{ +mul.f16x2 r2030, r1768, r1844; +} +{ +fma.rn.f16x2 r2033, r1804, r1843, r2030; +} +{ +mul.f16x2 r2037, r1002, r1835; +} +{ +mul.f16x2 r2040, r1038, r1836; +} +{ +sub.f16x2 r2043, r2037, r2040; +} +{ +mul.f16x2 r2046, r1002, r1836; +} +{ +fma.rn.f16x2 r2049, r1038, r1835, r2046; +} +{ +mul.f16x2 r2053, r1610, r1847; +} +{ +mul.f16x2 r2056, r1646, r1848; +} +{ +sub.f16x2 r2059, r2053, r2056; +} +{ +mul.f16x2 r2062, r1610, r1848; +} +{ +fma.rn.f16x2 r2065, r1646, r1847, r2062; +} +{ +mul.f16x2 r2069, r1090, r1837; +} +{ +mul.f16x2 r2072, r1126, r1838; +} +{ +sub.f16x2 r2075, r2069, r2072; +} +{ +mul.f16x2 r2078, r1090, r1838; +} +{ +fma.rn.f16x2 r2081, r1126, r1837, r2078; +} +{ +mul.f16x2 r2085, r1698, r1851; +} +{ +mul.f16x2 r2088, r1734, r1852; +} +{ +sub.f16x2 r2091, r2085, r2088; +} +{ +mul.f16x2 r2094, r1698, r1852; +} +{ +fma.rn.f16x2 r2097, r1734, r1851, r2094; +} +{ +mul.f16x2 r2101, r1178, r1839; +} +{ +mul.f16x2 r2104, r1214, r1840; +} +{ +sub.f16x2 r2107, r2101, r2104; +} +{ +mul.f16x2 r2110, r1178, r1840; +} +{ +fma.rn.f16x2 r2113, r1214, r1839, r2110; +} +{ +mul.f16x2 r2117, r1786, r1855; +} +{ +mul.f16x2 r2120, r1822, r1856; +} +{ +sub.f16x2 r2123, r2117, r2120; +} +{ +mul.f16x2 r2126, r1786, r1856; +} +{ +fma.rn.f16x2 r2129, r1822, r1855, r2126; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2133, {low, high}; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2134, {low, high}; +} +{ +neg.f16x2 r2135, r2134; +} +{ +add.f16x2 r2137, r960, r1568; +} +{ +add.f16x2 r2140, r352, r2137; +} +{ +add.f16x2 r2143, r966, r1574; +} +{ +add.f16x2 r2146, r358, r2143; +} +{ +add.f16x2 r2149, r960, r1568; +} +{ +mul.f16x2 r2152, r2149, r2133; +} +{ +add.f16x2 r2155, r352, r2152; +} +{ +sub.f16x2 r2158, r966, r1574; +} +{ +mul.f16x2 r2161, r2158, r2135; +} +{ +add.f16x2 r2164, r2155, r2161; +} +{ +add.f16x2 r2167, r960, r1568; +} +{ +mul.f16x2 r2170, r2167, r2133; +} +{ +add.f16x2 r2173, r352, r2170; +} +{ +sub.f16x2 r2176, r966, r1574; +} +{ +mul.f16x2 r2179, r2176, r2135; +} +{ +sub.f16x2 r2182, r2173, r2179; +} +{ +add.f16x2 r2185, r966, r1574; +} +{ +mul.f16x2 r2188, r2185, r2133; +} +{ +add.f16x2 r2191, r358, r2188; +} +{ +sub.f16x2 r2194, r960, r1568; +} +{ +mul.f16x2 r2197, r2194, r2135; +} +{ +sub.f16x2 r2200, r2191, r2197; +} +{ +add.f16x2 r2203, r966, r1574; +} +{ +mul.f16x2 r2206, r2203, r2133; +} +{ +add.f16x2 r2209, r358, r2206; +} +{ +sub.f16x2 r2212, r960, r1568; +} +{ +mul.f16x2 r2215, r2212, r2135; +} +{ +add.f16x2 r2218, r2209, r2215; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2221, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2222, {low, high}; +} +{ +neg.f16x2 r2223, r2222; +} +{ +add.f16x2 r2225, r1883, r1899; +} +{ +add.f16x2 r2228, r440, r2225; +} +{ +add.f16x2 r2231, r1889, r1905; +} +{ +add.f16x2 r2234, r446, r2231; +} +{ +add.f16x2 r2237, r1883, r1899; +} +{ +mul.f16x2 r2240, r2237, r2221; +} +{ +add.f16x2 r2243, r440, r2240; +} +{ +sub.f16x2 r2246, r1889, r1905; +} +{ +mul.f16x2 r2249, r2246, r2223; +} +{ +add.f16x2 r2252, r2243, r2249; +} +{ +add.f16x2 r2255, r1883, r1899; +} +{ +mul.f16x2 r2258, r2255, r2221; +} +{ +add.f16x2 r2261, r440, r2258; +} +{ +sub.f16x2 r2264, r1889, r1905; +} +{ +mul.f16x2 r2267, r2264, 
r2223; +} +{ +sub.f16x2 r2270, r2261, r2267; +} +{ +add.f16x2 r2273, r1889, r1905; +} +{ +mul.f16x2 r2276, r2273, r2221; +} +{ +add.f16x2 r2279, r446, r2276; +} +{ +sub.f16x2 r2282, r1883, r1899; +} +{ +mul.f16x2 r2285, r2282, r2223; +} +{ +sub.f16x2 r2288, r2279, r2285; +} +{ +add.f16x2 r2291, r1889, r1905; +} +{ +mul.f16x2 r2294, r2291, r2221; +} +{ +add.f16x2 r2297, r446, r2294; +} +{ +sub.f16x2 r2300, r1883, r1899; +} +{ +mul.f16x2 r2303, r2300, r2223; +} +{ +add.f16x2 r2306, r2297, r2303; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2309, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2310, {low, high}; +} +{ +neg.f16x2 r2311, r2310; +} +{ +add.f16x2 r2313, r1915, r1931; +} +{ +add.f16x2 r2316, r528, r2313; +} +{ +add.f16x2 r2319, r1921, r1937; +} +{ +add.f16x2 r2322, r534, r2319; +} +{ +add.f16x2 r2325, r1915, r1931; +} +{ +mul.f16x2 r2328, r2325, r2309; +} +{ +add.f16x2 r2331, r528, r2328; +} +{ +sub.f16x2 r2334, r1921, r1937; +} +{ +mul.f16x2 r2337, r2334, r2311; +} +{ +add.f16x2 r2340, r2331, r2337; +} +{ +add.f16x2 r2343, r1915, r1931; +} +{ +mul.f16x2 r2346, r2343, r2309; +} +{ +add.f16x2 r2349, r528, r2346; +} +{ +sub.f16x2 r2352, r1921, r1937; +} +{ +mul.f16x2 r2355, r2352, r2311; +} +{ +sub.f16x2 r2358, r2349, r2355; +} +{ +add.f16x2 r2361, r1921, r1937; +} +{ +mul.f16x2 r2364, r2361, r2309; +} +{ +add.f16x2 r2367, r534, r2364; +} +{ +sub.f16x2 r2370, r1915, r1931; +} +{ +mul.f16x2 r2373, r2370, r2311; +} +{ +sub.f16x2 r2376, r2367, r2373; +} +{ +add.f16x2 r2379, r1921, r1937; +} +{ +mul.f16x2 r2382, r2379, r2309; +} +{ +add.f16x2 r2385, r534, r2382; +} +{ +sub.f16x2 r2388, r1915, r1931; +} +{ +mul.f16x2 r2391, r2388, r2311; +} +{ +add.f16x2 r2394, r2385, r2391; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2397, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 
high, f400; +mov.b32 r2398, {low, high}; +} +{ +neg.f16x2 r2399, r2398; +} +{ +add.f16x2 r2401, r1947, r1963; +} +{ +add.f16x2 r2404, r376, r2401; +} +{ +add.f16x2 r2407, r1953, r1969; +} +{ +add.f16x2 r2410, r412, r2407; +} +{ +add.f16x2 r2413, r1947, r1963; +} +{ +mul.f16x2 r2416, r2413, r2397; +} +{ +add.f16x2 r2419, r376, r2416; +} +{ +sub.f16x2 r2422, r1953, r1969; +} +{ +mul.f16x2 r2425, r2422, r2399; +} +{ +add.f16x2 r2428, r2419, r2425; +} +{ +add.f16x2 r2431, r1947, r1963; +} +{ +mul.f16x2 r2434, r2431, r2397; +} +{ +add.f16x2 r2437, r376, r2434; +} +{ +sub.f16x2 r2440, r1953, r1969; +} +{ +mul.f16x2 r2443, r2440, r2399; +} +{ +sub.f16x2 r2446, r2437, r2443; +} +{ +add.f16x2 r2449, r1953, r1969; +} +{ +mul.f16x2 r2452, r2449, r2397; +} +{ +add.f16x2 r2455, r412, r2452; +} +{ +sub.f16x2 r2458, r1947, r1963; +} +{ +mul.f16x2 r2461, r2458, r2399; +} +{ +sub.f16x2 r2464, r2455, r2461; +} +{ +add.f16x2 r2467, r1953, r1969; +} +{ +mul.f16x2 r2470, r2467, r2397; +} +{ +add.f16x2 r2473, r412, r2470; +} +{ +sub.f16x2 r2476, r1947, r1963; +} +{ +mul.f16x2 r2479, r2476, r2399; +} +{ +add.f16x2 r2482, r2473, r2479; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2485, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2486, {low, high}; +} +{ +neg.f16x2 r2487, r2486; +} +{ +add.f16x2 r2489, r1979, r1995; +} +{ +add.f16x2 r2492, r464, r2489; +} +{ +add.f16x2 r2495, r1985, r2001; +} +{ +add.f16x2 r2498, r500, r2495; +} +{ +add.f16x2 r2501, r1979, r1995; +} +{ +mul.f16x2 r2504, r2501, r2485; +} +{ +add.f16x2 r2507, r464, r2504; +} +{ +sub.f16x2 r2510, r1985, r2001; +} +{ +mul.f16x2 r2513, r2510, r2487; +} +{ +add.f16x2 r2516, r2507, r2513; +} +{ +add.f16x2 r2519, r1979, r1995; +} +{ +mul.f16x2 r2522, r2519, r2485; +} +{ +add.f16x2 r2525, r464, r2522; +} +{ +sub.f16x2 r2528, r1985, r2001; +} +{ +mul.f16x2 r2531, r2528, r2487; +} +{ +sub.f16x2 r2534, r2525, r2531; +} +{ 
+add.f16x2 r2537, r1985, r2001; +} +{ +mul.f16x2 r2540, r2537, r2485; +} +{ +add.f16x2 r2543, r500, r2540; +} +{ +sub.f16x2 r2546, r1979, r1995; +} +{ +mul.f16x2 r2549, r2546, r2487; +} +{ +sub.f16x2 r2552, r2543, r2549; +} +{ +add.f16x2 r2555, r1985, r2001; +} +{ +mul.f16x2 r2558, r2555, r2485; +} +{ +add.f16x2 r2561, r500, r2558; +} +{ +sub.f16x2 r2564, r1979, r1995; +} +{ +mul.f16x2 r2567, r2564, r2487; +} +{ +add.f16x2 r2570, r2561, r2567; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2573, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2574, {low, high}; +} +{ +neg.f16x2 r2575, r2574; +} +{ +add.f16x2 r2577, r2011, r2027; +} +{ +add.f16x2 r2580, r552, r2577; +} +{ +add.f16x2 r2583, r2017, r2033; +} +{ +add.f16x2 r2586, r588, r2583; +} +{ +add.f16x2 r2589, r2011, r2027; +} +{ +mul.f16x2 r2592, r2589, r2573; +} +{ +add.f16x2 r2595, r552, r2592; +} +{ +sub.f16x2 r2598, r2017, r2033; +} +{ +mul.f16x2 r2601, r2598, r2575; +} +{ +add.f16x2 r2604, r2595, r2601; +} +{ +add.f16x2 r2607, r2011, r2027; +} +{ +mul.f16x2 r2610, r2607, r2573; +} +{ +add.f16x2 r2613, r552, r2610; +} +{ +sub.f16x2 r2616, r2017, r2033; +} +{ +mul.f16x2 r2619, r2616, r2575; +} +{ +sub.f16x2 r2622, r2613, r2619; +} +{ +add.f16x2 r2625, r2017, r2033; +} +{ +mul.f16x2 r2628, r2625, r2573; +} +{ +add.f16x2 r2631, r588, r2628; +} +{ +sub.f16x2 r2634, r2011, r2027; +} +{ +mul.f16x2 r2637, r2634, r2575; +} +{ +sub.f16x2 r2640, r2631, r2637; +} +{ +add.f16x2 r2643, r2017, r2033; +} +{ +mul.f16x2 r2646, r2643, r2573; +} +{ +add.f16x2 r2649, r588, r2646; +} +{ +sub.f16x2 r2652, r2011, r2027; +} +{ +mul.f16x2 r2655, r2652, r2575; +} +{ +add.f16x2 r2658, r2649, r2655; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2661, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2662, {low, high}; +} +{ 
+neg.f16x2 r2663, r2662; +} +{ +add.f16x2 r2665, r2043, r2059; +} +{ +add.f16x2 r2668, r394, r2665; +} +{ +add.f16x2 r2671, r2049, r2065; +} +{ +add.f16x2 r2674, r430, r2671; +} +{ +add.f16x2 r2677, r2043, r2059; +} +{ +mul.f16x2 r2680, r2677, r2661; +} +{ +add.f16x2 r2683, r394, r2680; +} +{ +sub.f16x2 r2686, r2049, r2065; +} +{ +mul.f16x2 r2689, r2686, r2663; +} +{ +add.f16x2 r2692, r2683, r2689; +} +{ +add.f16x2 r2695, r2043, r2059; +} +{ +mul.f16x2 r2698, r2695, r2661; +} +{ +add.f16x2 r2701, r394, r2698; +} +{ +sub.f16x2 r2704, r2049, r2065; +} +{ +mul.f16x2 r2707, r2704, r2663; +} +{ +sub.f16x2 r2710, r2701, r2707; +} +{ +add.f16x2 r2713, r2049, r2065; +} +{ +mul.f16x2 r2716, r2713, r2661; +} +{ +add.f16x2 r2719, r430, r2716; +} +{ +sub.f16x2 r2722, r2043, r2059; +} +{ +mul.f16x2 r2725, r2722, r2663; +} +{ +sub.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2049, r2065; +} +{ +mul.f16x2 r2734, r2731, r2661; +} +{ +add.f16x2 r2737, r430, r2734; +} +{ +sub.f16x2 r2740, r2043, r2059; +} +{ +mul.f16x2 r2743, r2740, r2663; +} +{ +add.f16x2 r2746, r2737, r2743; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2749, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2750, {low, high}; +} +{ +neg.f16x2 r2751, r2750; +} +{ +add.f16x2 r2753, r2075, r2091; +} +{ +add.f16x2 r2756, r482, r2753; +} +{ +add.f16x2 r2759, r2081, r2097; +} +{ +add.f16x2 r2762, r518, r2759; +} +{ +add.f16x2 r2765, r2075, r2091; +} +{ +mul.f16x2 r2768, r2765, r2749; +} +{ +add.f16x2 r2771, r482, r2768; +} +{ +sub.f16x2 r2774, r2081, r2097; +} +{ +mul.f16x2 r2777, r2774, r2751; +} +{ +add.f16x2 r2780, r2771, r2777; +} +{ +add.f16x2 r2783, r2075, r2091; +} +{ +mul.f16x2 r2786, r2783, r2749; +} +{ +add.f16x2 r2789, r482, r2786; +} +{ +sub.f16x2 r2792, r2081, r2097; +} +{ +mul.f16x2 r2795, r2792, r2751; +} +{ +sub.f16x2 r2798, r2789, r2795; +} +{ +add.f16x2 r2801, r2081, r2097; +} +{ +mul.f16x2 
r2804, r2801, r2749; +} +{ +add.f16x2 r2807, r518, r2804; +} +{ +sub.f16x2 r2810, r2075, r2091; +} +{ +mul.f16x2 r2813, r2810, r2751; +} +{ +sub.f16x2 r2816, r2807, r2813; +} +{ +add.f16x2 r2819, r2081, r2097; +} +{ +mul.f16x2 r2822, r2819, r2749; +} +{ +add.f16x2 r2825, r518, r2822; +} +{ +sub.f16x2 r2828, r2075, r2091; +} +{ +mul.f16x2 r2831, r2828, r2751; +} +{ +add.f16x2 r2834, r2825, r2831; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2837, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2838, {low, high}; +} +{ +neg.f16x2 r2839, r2838; +} +{ +add.f16x2 r2841, r2107, r2123; +} +{ +add.f16x2 r2844, r570, r2841; +} +{ +add.f16x2 r2847, r2113, r2129; +} +{ +add.f16x2 r2850, r606, r2847; +} +{ +add.f16x2 r2853, r2107, r2123; +} +{ +mul.f16x2 r2856, r2853, r2837; +} +{ +add.f16x2 r2859, r570, r2856; +} +{ +sub.f16x2 r2862, r2113, r2129; +} +{ +mul.f16x2 r2865, r2862, r2839; +} +{ +add.f16x2 r2868, r2859, r2865; +} +{ +add.f16x2 r2871, r2107, r2123; +} +{ +mul.f16x2 r2874, r2871, r2837; +} +{ +add.f16x2 r2877, r570, r2874; +} +{ +sub.f16x2 r2880, r2113, r2129; +} +{ +mul.f16x2 r2883, r2880, r2839; +} +{ +sub.f16x2 r2886, r2877, r2883; +} +{ +add.f16x2 r2889, r2113, r2129; +} +{ +mul.f16x2 r2892, r2889, r2837; +} +{ +add.f16x2 r2895, r606, r2892; +} +{ +sub.f16x2 r2898, r2107, r2123; +} +{ +mul.f16x2 r2901, r2898, r2839; +} +{ +sub.f16x2 r2904, r2895, r2901; +} +{ +add.f16x2 r2907, r2113, r2129; +} +{ +mul.f16x2 r2910, r2907, r2837; +} +{ +add.f16x2 r2913, r606, r2910; +} +{ +sub.f16x2 r2916, r2107, r2123; +} +{ +mul.f16x2 r2919, r2916, r2839; +} +{ +add.f16x2 r2922, r2913, r2919; +} +mul.wide.u32 rd2, r4685, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r4686, rd3; +mul.lo.s32 r4687, r4686, 3; +sub.s32 r4688, r4685, r4687; +mad.lo.s32 r4689, r4686, 324, r4684; +cvt.rn.f32.u32 f401, r4688; +mul.f32 f402, f401, 0f3D9EDD1F; +cos.approx.f32 f309, f402; 
+sin.approx.f32 f403, f402; +neg.f32 f310, f403; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2925, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2928, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2930, {high, high}; +} +{ +mul.f16x2 r2932, r2234, r2930; +} +{ +neg.f16x2 r2935, r2932; +} +{ +fma.rn.f16x2 r2937, r2228, r2928, r2935; +} +{ +mul.f16x2 r2941, r2228, r2930; +} +{ +fma.rn.f16x2 r2944, r2234, r2928, r2941; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2950, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2952, {low, high}; +} +{ +mul.f16x2 r2953, r2950, r2952; +} +{ +mul.f16x2 r2956, r2925, r2948; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2959, {high, low}; +} +{ +fma.rn.f16x2 r2961, r2953, r2959, r2956; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2965, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2967, {high, high}; +} +{ +mul.f16x2 r2969, r2322, r2967; +} +{ +neg.f16x2 r2972, r2969; +} +{ +fma.rn.f16x2 r2974, r2316, r2965, r2972; +} +{ +mul.f16x2 r2978, r2316, r2967; +} +{ +fma.rn.f16x2 r2981, r2322, r2965, r2978; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r2987, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2989, {low, high}; +} +{ +mul.f16x2 r2990, r2987, r2989; +} +{ +mul.f16x2 r2993, r2961, r2985; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2961; +mov.b32 r2996, {high, low}; +} +{ +fma.rn.f16x2 r2998, r2990, r2996, r2993; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r2998; +mov.b32 r3002, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3004, {high, high}; +} +{ +mul.f16x2 r3006, r2410, r3004; +} +{ +neg.f16x2 r3009, r3006; +} +{ +fma.rn.f16x2 r3011, r2404, r3002, r3009; +} +{ +mul.f16x2 r3015, r2404, r3004; +} +{ +fma.rn.f16x2 r3018, r2410, r3002, r3015; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3024, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3026, {low, high}; +} +{ +mul.f16x2 r3027, r3024, r3026; +} +{ +mul.f16x2 r3030, r2998, r3022; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2998; +mov.b32 r3033, {high, low}; +} +{ +fma.rn.f16x2 r3035, r3027, r3033, r3030; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3039, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3041, {high, high}; +} +{ +mul.f16x2 r3043, r2498, r3041; +} +{ +neg.f16x2 r3046, r3043; +} +{ +fma.rn.f16x2 r3048, r2492, r3039, r3046; +} +{ +mul.f16x2 r3052, r2492, r3041; +} +{ +fma.rn.f16x2 r3055, r2498, r3039, r3052; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3061, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3063, {low, high}; +} +{ +mul.f16x2 r3064, r3061, r3063; +} +{ +mul.f16x2 r3067, r3035, r3059; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3035; +mov.b32 r3070, {high, low}; +} +{ +fma.rn.f16x2 r3072, r3064, r3070, r3067; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3076, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3078, {high, high}; +} +{ +mul.f16x2 r3080, r2586, r3078; +} +{ +neg.f16x2 r3083, r3080; +} +{ +fma.rn.f16x2 r3085, r2580, r3076, r3083; +} +{ +mul.f16x2 
r3089, r2580, r3078; +} +{ +fma.rn.f16x2 r3092, r2586, r3076, r3089; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3098, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3100, {low, high}; +} +{ +mul.f16x2 r3101, r3098, r3100; +} +{ +mul.f16x2 r3104, r3072, r3096; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3072; +mov.b32 r3107, {high, low}; +} +{ +fma.rn.f16x2 r3109, r3101, r3107, r3104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3113, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3115, {high, high}; +} +{ +mul.f16x2 r3117, r2674, r3115; +} +{ +neg.f16x2 r3120, r3117; +} +{ +fma.rn.f16x2 r3122, r2668, r3113, r3120; +} +{ +mul.f16x2 r3126, r2668, r3115; +} +{ +fma.rn.f16x2 r3129, r2674, r3113, r3126; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3135, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3137, {low, high}; +} +{ +mul.f16x2 r3138, r3135, r3137; +} +{ +mul.f16x2 r3141, r3109, r3133; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3109; +mov.b32 r3144, {high, low}; +} +{ +fma.rn.f16x2 r3146, r3138, r3144, r3141; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3150, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3152, {high, high}; +} +{ +mul.f16x2 r3154, r2762, r3152; +} +{ +neg.f16x2 r3157, r3154; +} +{ +fma.rn.f16x2 r3159, r2756, r3150, r3157; +} +{ +mul.f16x2 r3163, r2756, r3152; +} +{ +fma.rn.f16x2 r3166, r2762, r3150, r3163; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3172, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3174, {low, high}; +} +{ +mul.f16x2 r3175, r3172, r3174; +} +{ +mul.f16x2 r3178, r3146, r3170; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3146; +mov.b32 r3181, {high, low}; +} +{ +fma.rn.f16x2 r3183, r3175, r3181, r3178; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3187, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3189, {high, high}; +} +{ +mul.f16x2 r3191, r2850, r3189; +} +{ +neg.f16x2 r3194, r3191; +} +{ +fma.rn.f16x2 r3196, r2844, r3187, r3194; +} +{ +mul.f16x2 r3200, r2844, r3189; +} +{ +fma.rn.f16x2 r3203, r2850, r3187, r3200; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3209, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3211, {low, high}; +} +{ +mul.f16x2 r3212, r3209, r3211; +} +{ +mul.f16x2 r3215, r3183, r3207; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3183; +mov.b32 r3218, {high, low}; +} +{ +fma.rn.f16x2 r3220, r3212, r3218, r3215; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3224, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3226, {high, high}; +} +{ +mul.f16x2 r3228, r2200, r3226; +} +{ +neg.f16x2 r3231, r3228; +} +{ +fma.rn.f16x2 r3233, r2164, r3224, r3231; +} +{ +mul.f16x2 r3237, r2164, r3226; +} +{ +fma.rn.f16x2 r3240, r2200, r3224, r3237; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3246, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3248, {low, high}; +} +{ +mul.f16x2 r3249, r3246, r3248; +} +{ +mul.f16x2 r3252, r3220, r3244; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3220; +mov.b32 r3255, {high, low}; +} 
+{ +fma.rn.f16x2 r3257, r3249, r3255, r3252; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3261, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3263, {high, high}; +} +{ +mul.f16x2 r3265, r2288, r3263; +} +{ +neg.f16x2 r3268, r3265; +} +{ +fma.rn.f16x2 r3270, r2252, r3261, r3268; +} +{ +mul.f16x2 r3274, r2252, r3263; +} +{ +fma.rn.f16x2 r3277, r2288, r3261, r3274; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3283, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3285, {low, high}; +} +{ +mul.f16x2 r3286, r3283, r3285; +} +{ +mul.f16x2 r3289, r3257, r3281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3257; +mov.b32 r3292, {high, low}; +} +{ +fma.rn.f16x2 r3294, r3286, r3292, r3289; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3298, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3300, {high, high}; +} +{ +mul.f16x2 r3302, r2376, r3300; +} +{ +neg.f16x2 r3305, r3302; +} +{ +fma.rn.f16x2 r3307, r2340, r3298, r3305; +} +{ +mul.f16x2 r3311, r2340, r3300; +} +{ +fma.rn.f16x2 r3314, r2376, r3298, r3311; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3320, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3322, {low, high}; +} +{ +mul.f16x2 r3323, r3320, r3322; +} +{ +mul.f16x2 r3326, r3294, r3318; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3294; +mov.b32 r3329, {high, low}; +} +{ +fma.rn.f16x2 r3331, r3323, r3329, r3326; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3335, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3337, {high, high}; +} +{ +mul.f16x2 r3339, r2464, r3337; +} +{ 
+neg.f16x2 r3342, r3339; +} +{ +fma.rn.f16x2 r3344, r2428, r3335, r3342; +} +{ +mul.f16x2 r3348, r2428, r3337; +} +{ +fma.rn.f16x2 r3351, r2464, r3335, r3348; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3357, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3359, {low, high}; +} +{ +mul.f16x2 r3360, r3357, r3359; +} +{ +mul.f16x2 r3363, r3331, r3355; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3331; +mov.b32 r3366, {high, low}; +} +{ +fma.rn.f16x2 r3368, r3360, r3366, r3363; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3372, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3374, {high, high}; +} +{ +mul.f16x2 r3376, r2552, r3374; +} +{ +neg.f16x2 r3379, r3376; +} +{ +fma.rn.f16x2 r3381, r2516, r3372, r3379; +} +{ +mul.f16x2 r3385, r2516, r3374; +} +{ +fma.rn.f16x2 r3388, r2552, r3372, r3385; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3394, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3396, {low, high}; +} +{ +mul.f16x2 r3397, r3394, r3396; +} +{ +mul.f16x2 r3400, r3368, r3392; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3368; +mov.b32 r3403, {high, low}; +} +{ +fma.rn.f16x2 r3405, r3397, r3403, r3400; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3409, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3411, {high, high}; +} +{ +mul.f16x2 r3413, r2640, r3411; +} +{ +neg.f16x2 r3416, r3413; +} +{ +fma.rn.f16x2 r3418, r2604, r3409, r3416; +} +{ +mul.f16x2 r3422, r2604, r3411; +} +{ +fma.rn.f16x2 r3425, r2640, r3409, r3422; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3431, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3433, {low, high}; +} +{ +mul.f16x2 r3434, r3431, r3433; +} +{ +mul.f16x2 r3437, r3405, r3429; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3405; +mov.b32 r3440, {high, low}; +} +{ +fma.rn.f16x2 r3442, r3434, r3440, r3437; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3446, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3448, {high, high}; +} +{ +mul.f16x2 r3450, r2728, r3448; +} +{ +neg.f16x2 r3453, r3450; +} +{ +fma.rn.f16x2 r3455, r2692, r3446, r3453; +} +{ +mul.f16x2 r3459, r2692, r3448; +} +{ +fma.rn.f16x2 r3462, r2728, r3446, r3459; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3468, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3470, {low, high}; +} +{ +mul.f16x2 r3471, r3468, r3470; +} +{ +mul.f16x2 r3474, r3442, r3466; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3442; +mov.b32 r3477, {high, low}; +} +{ +fma.rn.f16x2 r3479, r3471, r3477, r3474; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3483, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3485, {high, high}; +} +{ +mul.f16x2 r3487, r2816, r3485; +} +{ +neg.f16x2 r3490, r3487; +} +{ +fma.rn.f16x2 r3492, r2780, r3483, r3490; +} +{ +mul.f16x2 r3496, r2780, r3485; +} +{ +fma.rn.f16x2 r3499, r2816, r3483, r3496; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3505, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3507, {low, high}; +} +{ +mul.f16x2 r3508, r3505, r3507; +} +{ +mul.f16x2 r3511, r3479, r3503; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3479; +mov.b32 r3514, {high, low}; +} +{ +fma.rn.f16x2 r3516, r3508, r3514, r3511; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3520, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3522, {high, high}; +} +{ +mul.f16x2 r3524, r2904, r3522; +} +{ +neg.f16x2 r3527, r3524; +} +{ +fma.rn.f16x2 r3529, r2868, r3520, r3527; +} +{ +mul.f16x2 r3533, r2868, r3522; +} +{ +fma.rn.f16x2 r3536, r2904, r3520, r3533; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3542, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3544, {low, high}; +} +{ +mul.f16x2 r3545, r3542, r3544; +} +{ +mul.f16x2 r3548, r3516, r3540; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3516; +mov.b32 r3551, {high, low}; +} +{ +fma.rn.f16x2 r3553, r3545, r3551, r3548; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3557, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3559, {high, high}; +} +{ +mul.f16x2 r3561, r2218, r3559; +} +{ +neg.f16x2 r3564, r3561; +} +{ +fma.rn.f16x2 r3566, r2182, r3557, r3564; +} +{ +mul.f16x2 r3570, r2182, r3559; +} +{ +fma.rn.f16x2 r3573, r2218, r3557, r3570; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3579, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3581, {low, high}; +} +{ +mul.f16x2 r3582, r3579, r3581; +} +{ +mul.f16x2 r3585, r3553, r3577; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3553; +mov.b32 r3588, {high, low}; +} +{ +fma.rn.f16x2 r3590, r3582, r3588, r3585; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3594, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3590; +mov.b32 r3596, {high, high}; +} +{ +mul.f16x2 r3598, r2306, r3596; +} +{ +neg.f16x2 r3601, r3598; +} +{ +fma.rn.f16x2 r3603, r2270, r3594, r3601; +} +{ +mul.f16x2 r3607, r2270, r3596; +} +{ +fma.rn.f16x2 r3610, r2306, r3594, r3607; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3616, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3618, {low, high}; +} +{ +mul.f16x2 r3619, r3616, r3618; +} +{ +mul.f16x2 r3622, r3590, r3614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3590; +mov.b32 r3625, {high, low}; +} +{ +fma.rn.f16x2 r3627, r3619, r3625, r3622; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3631, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3633, {high, high}; +} +{ +mul.f16x2 r3635, r2394, r3633; +} +{ +neg.f16x2 r3638, r3635; +} +{ +fma.rn.f16x2 r3640, r2358, r3631, r3638; +} +{ +mul.f16x2 r3644, r2358, r3633; +} +{ +fma.rn.f16x2 r3647, r2394, r3631, r3644; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3653, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3655, {low, high}; +} +{ +mul.f16x2 r3656, r3653, r3655; +} +{ +mul.f16x2 r3659, r3627, r3651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3627; +mov.b32 r3662, {high, low}; +} +{ +fma.rn.f16x2 r3664, r3656, r3662, r3659; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3668, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3670, {high, high}; +} +{ +mul.f16x2 r3672, r2482, r3670; +} +{ +neg.f16x2 r3675, r3672; +} +{ +fma.rn.f16x2 r3677, r2446, r3668, r3675; +} +{ +mul.f16x2 r3681, r2446, r3670; +} +{ +fma.rn.f16x2 r3684, r2482, r3668, r3681; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3690, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3692, {low, high}; +} +{ +mul.f16x2 r3693, r3690, r3692; +} +{ +mul.f16x2 r3696, r3664, r3688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3664; +mov.b32 r3699, {high, low}; +} +{ +fma.rn.f16x2 r3701, r3693, r3699, r3696; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3705, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3707, {high, high}; +} +{ +mul.f16x2 r3709, r2570, r3707; +} +{ +neg.f16x2 r3712, r3709; +} +{ +fma.rn.f16x2 r3714, r2534, r3705, r3712; +} +{ +mul.f16x2 r3718, r2534, r3707; +} +{ +fma.rn.f16x2 r3721, r2570, r3705, r3718; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3727, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3729, {low, high}; +} +{ +mul.f16x2 r3730, r3727, r3729; +} +{ +mul.f16x2 r3733, r3701, r3725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3701; +mov.b32 r3736, {high, low}; +} +{ +fma.rn.f16x2 r3738, r3730, r3736, r3733; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3742, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3744, {high, high}; +} +{ +mul.f16x2 r3746, r2658, r3744; +} +{ +neg.f16x2 r3749, r3746; +} +{ +fma.rn.f16x2 r3751, r2622, r3742, r3749; +} +{ +mul.f16x2 r3755, r2622, r3744; +} +{ +fma.rn.f16x2 r3758, r2658, r3742, r3755; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3764, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3766, 
{low, high}; +} +{ +mul.f16x2 r3767, r3764, r3766; +} +{ +mul.f16x2 r3770, r3738, r3762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3738; +mov.b32 r3773, {high, low}; +} +{ +fma.rn.f16x2 r3775, r3767, r3773, r3770; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3779, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3781, {high, high}; +} +{ +mul.f16x2 r3783, r2746, r3781; +} +{ +neg.f16x2 r3786, r3783; +} +{ +fma.rn.f16x2 r3788, r2710, r3779, r3786; +} +{ +mul.f16x2 r3792, r2710, r3781; +} +{ +fma.rn.f16x2 r3795, r2746, r3779, r3792; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3801, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3803, {low, high}; +} +{ +mul.f16x2 r3804, r3801, r3803; +} +{ +mul.f16x2 r3807, r3775, r3799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3775; +mov.b32 r3810, {high, low}; +} +{ +fma.rn.f16x2 r3812, r3804, r3810, r3807; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3816, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3818, {high, high}; +} +{ +mul.f16x2 r3820, r2834, r3818; +} +{ +neg.f16x2 r3823, r3820; +} +{ +fma.rn.f16x2 r3825, r2798, r3816, r3823; +} +{ +mul.f16x2 r3829, r2798, r3818; +} +{ +fma.rn.f16x2 r3832, r2834, r3816, r3829; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3836, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2925; +mov.b32 r3838, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3840, {low, high}; +} +{ +mul.f16x2 r3841, r3838, r3840; +} +{ +mul.f16x2 r3844, r3812, r3836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3812; +mov.b32 r3847, {high, low}; +} +{ +fma.rn.f16x2 r3849, r3841, r3847, r3844; +} +{ +.reg .f16 low, high; +mov.b32 
{low, high}, r3849; +mov.b32 r3853, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3849; +mov.b32 r3855, {high, high}; +} +{ +mul.f16x2 r3857, r2922, r3855; +} +{ +neg.f16x2 r3860, r3857; +} +{ +fma.rn.f16x2 r3862, r2886, r3853, r3860; +} +{ +mul.f16x2 r3866, r2886, r3855; +} +{ +fma.rn.f16x2 r3869, r2922, r3853, r3866; +} +barrier.sync 0; +mad.lo.s32 r4690, r4688, 108, r4689; +st.shared.u32 [r4690], r2140; +st.shared.u32 [r4690+4], r2937; +st.shared.u32 [r4690+8], r2974; +st.shared.u32 [r4690+12], r3011; +st.shared.u32 [r4690+16], r3048; +st.shared.u32 [r4690+20], r3085; +st.shared.u32 [r4690+24], r3122; +st.shared.u32 [r4690+28], r3159; +st.shared.u32 [r4690+32], r3196; +st.shared.u32 [r4690+36], r3233; +st.shared.u32 [r4690+40], r3270; +st.shared.u32 [r4690+44], r3307; +st.shared.u32 [r4690+48], r3344; +st.shared.u32 [r4690+52], r3381; +st.shared.u32 [r4690+56], r3418; +st.shared.u32 [r4690+60], r3455; +st.shared.u32 [r4690+64], r3492; +st.shared.u32 [r4690+68], r3529; +st.shared.u32 [r4690+72], r3566; +st.shared.u32 [r4690+76], r3603; +st.shared.u32 [r4690+80], r3640; +st.shared.u32 [r4690+84], r3677; +st.shared.u32 [r4690+88], r3714; +st.shared.u32 [r4690+92], r3751; +st.shared.u32 [r4690+96], r3788; +st.shared.u32 [r4690+100], r3825; +st.shared.u32 [r4690+104], r3862; +barrier.sync 0; +mad.lo.s32 r4691, r4688, -104, r4690; +ld.shared.u32 r3898, [r4691]; +ld.shared.u32 r3986, [r4691+12]; +ld.shared.u32 r4074, [r4691+24]; +ld.shared.u32 r4162, [r4691+36]; +ld.shared.u32 r4250, [r4691+48]; +ld.shared.u32 r4338, [r4691+60]; +ld.shared.u32 r4426, [r4691+72]; +ld.shared.u32 r4514, [r4691+84]; +ld.shared.u32 r4602, [r4691+96]; +ld.shared.u32 r3895, [r4691+108]; +ld.shared.u32 r3983, [r4691+120]; +ld.shared.u32 r4071, [r4691+132]; +ld.shared.u32 r4159, [r4691+144]; +ld.shared.u32 r4247, [r4691+156]; +ld.shared.u32 r4335, [r4691+168]; +ld.shared.u32 r4423, [r4691+180]; +ld.shared.u32 r4511, [r4691+192]; +ld.shared.u32 r4599, [r4691+204]; +ld.shared.u32 
r3896, [r4691+216]; +ld.shared.u32 r3984, [r4691+228]; +ld.shared.u32 r4072, [r4691+240]; +ld.shared.u32 r4160, [r4691+252]; +ld.shared.u32 r4248, [r4691+264]; +ld.shared.u32 r4336, [r4691+276]; +ld.shared.u32 r4424, [r4691+288]; +ld.shared.u32 r4512, [r4691+300]; +ld.shared.u32 r4600, [r4691+312]; +barrier.sync 0; +st.shared.u32 [r4690], r2146; +st.shared.u32 [r4690+4], r2944; +st.shared.u32 [r4690+8], r2981; +st.shared.u32 [r4690+12], r3018; +st.shared.u32 [r4690+16], r3055; +st.shared.u32 [r4690+20], r3092; +st.shared.u32 [r4690+24], r3129; +st.shared.u32 [r4690+28], r3166; +st.shared.u32 [r4690+32], r3203; +st.shared.u32 [r4690+36], r3240; +st.shared.u32 [r4690+40], r3277; +st.shared.u32 [r4690+44], r3314; +st.shared.u32 [r4690+48], r3351; +st.shared.u32 [r4690+52], r3388; +st.shared.u32 [r4690+56], r3425; +st.shared.u32 [r4690+60], r3462; +st.shared.u32 [r4690+64], r3499; +st.shared.u32 [r4690+68], r3536; +st.shared.u32 [r4690+72], r3573; +st.shared.u32 [r4690+76], r3610; +st.shared.u32 [r4690+80], r3647; +st.shared.u32 [r4690+84], r3684; +st.shared.u32 [r4690+88], r3721; +st.shared.u32 [r4690+92], r3758; +st.shared.u32 [r4690+96], r3795; +st.shared.u32 [r4690+100], r3832; +st.shared.u32 [r4690+104], r3869; +barrier.sync 0; +ld.shared.u32 r3904, [r4691]; +ld.shared.u32 r3992, [r4691+12]; +ld.shared.u32 r4080, [r4691+24]; +ld.shared.u32 r4168, [r4691+36]; +ld.shared.u32 r4256, [r4691+48]; +ld.shared.u32 r4344, [r4691+60]; +ld.shared.u32 r4432, [r4691+72]; +ld.shared.u32 r4520, [r4691+84]; +ld.shared.u32 r4608, [r4691+96]; +ld.shared.u32 r3901, [r4691+108]; +ld.shared.u32 r3989, [r4691+120]; +ld.shared.u32 r4077, [r4691+132]; +ld.shared.u32 r4165, [r4691+144]; +ld.shared.u32 r4253, [r4691+156]; +ld.shared.u32 r4341, [r4691+168]; +ld.shared.u32 r4429, [r4691+180]; +ld.shared.u32 r4517, [r4691+192]; +ld.shared.u32 r4605, [r4691+204]; +ld.shared.u32 r3902, [r4691+216]; +ld.shared.u32 r3990, [r4691+228]; +ld.shared.u32 r4078, [r4691+240]; +ld.shared.u32 r4166, 
[r4691+252]; +ld.shared.u32 r4254, [r4691+264]; +ld.shared.u32 r4342, [r4691+276]; +ld.shared.u32 r4430, [r4691+288]; +ld.shared.u32 r4518, [r4691+300]; +ld.shared.u32 r4606, [r4691+312]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3890, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3891, {low, high}; +} +{ +neg.f16x2 r3892, r3891; +} +{ +add.f16x2 r3894, r3895, r3896; +} +{ +add.f16x2 %0, r3898, r3894; +} +{ +add.f16x2 r3900, r3901, r3902; +} +{ +add.f16x2 %1, r3904, r3900; +} +{ +add.f16x2 r3906, r3895, r3896; +} +{ +mul.f16x2 r3909, r3906, r3890; +} +{ +add.f16x2 r3912, r3898, r3909; +} +{ +sub.f16x2 r3915, r3901, r3902; +} +{ +mul.f16x2 r3918, r3915, r3892; +} +{ +add.f16x2 %18, r3912, r3918; +} +{ +add.f16x2 r3924, r3895, r3896; +} +{ +mul.f16x2 r3927, r3924, r3890; +} +{ +add.f16x2 r3930, r3898, r3927; +} +{ +sub.f16x2 r3933, r3901, r3902; +} +{ +mul.f16x2 r3936, r3933, r3892; +} +{ +sub.f16x2 %36, r3930, r3936; +} +{ +add.f16x2 r3942, r3901, r3902; +} +{ +mul.f16x2 r3945, r3942, r3890; +} +{ +add.f16x2 r3948, r3904, r3945; +} +{ +sub.f16x2 r3951, r3895, r3896; +} +{ +mul.f16x2 r3954, r3951, r3892; +} +{ +sub.f16x2 %19, r3948, r3954; +} +{ +add.f16x2 r3960, r3901, r3902; +} +{ +mul.f16x2 r3963, r3960, r3890; +} +{ +add.f16x2 r3966, r3904, r3963; +} +{ +sub.f16x2 r3969, r3895, r3896; +} +{ +mul.f16x2 r3972, r3969, r3892; +} +{ +add.f16x2 %37, r3966, r3972; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3978, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3979, {low, high}; +} +{ +neg.f16x2 r3980, r3979; +} +{ +add.f16x2 r3982, r3983, r3984; +} +{ +add.f16x2 %2, r3986, r3982; +} +{ +add.f16x2 r3988, r3989, r3990; +} +{ +add.f16x2 %3, r3992, r3988; +} +{ +add.f16x2 r3994, r3983, r3984; +} +{ +mul.f16x2 r3997, r3994, r3978; +} +{ +add.f16x2 r4000, r3986, 
r3997; +} +{ +sub.f16x2 r4003, r3989, r3990; +} +{ +mul.f16x2 r4006, r4003, r3980; +} +{ +add.f16x2 %20, r4000, r4006; +} +{ +add.f16x2 r4012, r3983, r3984; +} +{ +mul.f16x2 r4015, r4012, r3978; +} +{ +add.f16x2 r4018, r3986, r4015; +} +{ +sub.f16x2 r4021, r3989, r3990; +} +{ +mul.f16x2 r4024, r4021, r3980; +} +{ +sub.f16x2 %38, r4018, r4024; +} +{ +add.f16x2 r4030, r3989, r3990; +} +{ +mul.f16x2 r4033, r4030, r3978; +} +{ +add.f16x2 r4036, r3992, r4033; +} +{ +sub.f16x2 r4039, r3983, r3984; +} +{ +mul.f16x2 r4042, r4039, r3980; +} +{ +sub.f16x2 %21, r4036, r4042; +} +{ +add.f16x2 r4048, r3989, r3990; +} +{ +mul.f16x2 r4051, r4048, r3978; +} +{ +add.f16x2 r4054, r3992, r4051; +} +{ +sub.f16x2 r4057, r3983, r3984; +} +{ +mul.f16x2 r4060, r4057, r3980; +} +{ +add.f16x2 %39, r4054, r4060; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4066, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4067, {low, high}; +} +{ +neg.f16x2 r4068, r4067; +} +{ +add.f16x2 r4070, r4071, r4072; +} +{ +add.f16x2 %4, r4074, r4070; +} +{ +add.f16x2 r4076, r4077, r4078; +} +{ +add.f16x2 %5, r4080, r4076; +} +{ +add.f16x2 r4082, r4071, r4072; +} +{ +mul.f16x2 r4085, r4082, r4066; +} +{ +add.f16x2 r4088, r4074, r4085; +} +{ +sub.f16x2 r4091, r4077, r4078; +} +{ +mul.f16x2 r4094, r4091, r4068; +} +{ +add.f16x2 %22, r4088, r4094; +} +{ +add.f16x2 r4100, r4071, r4072; +} +{ +mul.f16x2 r4103, r4100, r4066; +} +{ +add.f16x2 r4106, r4074, r4103; +} +{ +sub.f16x2 r4109, r4077, r4078; +} +{ +mul.f16x2 r4112, r4109, r4068; +} +{ +sub.f16x2 %40, r4106, r4112; +} +{ +add.f16x2 r4118, r4077, r4078; +} +{ +mul.f16x2 r4121, r4118, r4066; +} +{ +add.f16x2 r4124, r4080, r4121; +} +{ +sub.f16x2 r4127, r4071, r4072; +} +{ +mul.f16x2 r4130, r4127, r4068; +} +{ +sub.f16x2 %23, r4124, r4130; +} +{ +add.f16x2 r4136, r4077, r4078; +} +{ +mul.f16x2 r4139, r4136, r4066; +} +{ +add.f16x2 r4142, r4080, r4139; +} +{ 
+sub.f16x2 r4145, r4071, r4072; +} +{ +mul.f16x2 r4148, r4145, r4068; +} +{ +add.f16x2 %41, r4142, r4148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4155, {low, high}; +} +{ +neg.f16x2 r4156, r4155; +} +{ +add.f16x2 r4158, r4159, r4160; +} +{ +add.f16x2 %6, r4162, r4158; +} +{ +add.f16x2 r4164, r4165, r4166; +} +{ +add.f16x2 %7, r4168, r4164; +} +{ +add.f16x2 r4170, r4159, r4160; +} +{ +mul.f16x2 r4173, r4170, r4154; +} +{ +add.f16x2 r4176, r4162, r4173; +} +{ +sub.f16x2 r4179, r4165, r4166; +} +{ +mul.f16x2 r4182, r4179, r4156; +} +{ +add.f16x2 %24, r4176, r4182; +} +{ +add.f16x2 r4188, r4159, r4160; +} +{ +mul.f16x2 r4191, r4188, r4154; +} +{ +add.f16x2 r4194, r4162, r4191; +} +{ +sub.f16x2 r4197, r4165, r4166; +} +{ +mul.f16x2 r4200, r4197, r4156; +} +{ +sub.f16x2 %42, r4194, r4200; +} +{ +add.f16x2 r4206, r4165, r4166; +} +{ +mul.f16x2 r4209, r4206, r4154; +} +{ +add.f16x2 r4212, r4168, r4209; +} +{ +sub.f16x2 r4215, r4159, r4160; +} +{ +mul.f16x2 r4218, r4215, r4156; +} +{ +sub.f16x2 %25, r4212, r4218; +} +{ +add.f16x2 r4224, r4165, r4166; +} +{ +mul.f16x2 r4227, r4224, r4154; +} +{ +add.f16x2 r4230, r4168, r4227; +} +{ +sub.f16x2 r4233, r4159, r4160; +} +{ +mul.f16x2 r4236, r4233, r4156; +} +{ +add.f16x2 %43, r4230, r4236; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4242, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4243, {low, high}; +} +{ +neg.f16x2 r4244, r4243; +} +{ +add.f16x2 r4246, r4247, r4248; +} +{ +add.f16x2 %8, r4250, r4246; +} +{ +add.f16x2 r4252, r4253, r4254; +} +{ +add.f16x2 %9, r4256, r4252; +} +{ +add.f16x2 r4258, r4247, r4248; +} +{ +mul.f16x2 r4261, r4258, r4242; +} +{ +add.f16x2 r4264, r4250, r4261; +} +{ +sub.f16x2 r4267, r4253, r4254; +} +{ +mul.f16x2 r4270, r4267, 
r4244; +} +{ +add.f16x2 %26, r4264, r4270; +} +{ +add.f16x2 r4276, r4247, r4248; +} +{ +mul.f16x2 r4279, r4276, r4242; +} +{ +add.f16x2 r4282, r4250, r4279; +} +{ +sub.f16x2 r4285, r4253, r4254; +} +{ +mul.f16x2 r4288, r4285, r4244; +} +{ +sub.f16x2 %44, r4282, r4288; +} +{ +add.f16x2 r4294, r4253, r4254; +} +{ +mul.f16x2 r4297, r4294, r4242; +} +{ +add.f16x2 r4300, r4256, r4297; +} +{ +sub.f16x2 r4303, r4247, r4248; +} +{ +mul.f16x2 r4306, r4303, r4244; +} +{ +sub.f16x2 %27, r4300, r4306; +} +{ +add.f16x2 r4312, r4253, r4254; +} +{ +mul.f16x2 r4315, r4312, r4242; +} +{ +add.f16x2 r4318, r4256, r4315; +} +{ +sub.f16x2 r4321, r4247, r4248; +} +{ +mul.f16x2 r4324, r4321, r4244; +} +{ +add.f16x2 %45, r4318, r4324; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4330, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4331, {low, high}; +} +{ +neg.f16x2 r4332, r4331; +} +{ +add.f16x2 r4334, r4335, r4336; +} +{ +add.f16x2 %10, r4338, r4334; +} +{ +add.f16x2 r4340, r4341, r4342; +} +{ +add.f16x2 %11, r4344, r4340; +} +{ +add.f16x2 r4346, r4335, r4336; +} +{ +mul.f16x2 r4349, r4346, r4330; +} +{ +add.f16x2 r4352, r4338, r4349; +} +{ +sub.f16x2 r4355, r4341, r4342; +} +{ +mul.f16x2 r4358, r4355, r4332; +} +{ +add.f16x2 %28, r4352, r4358; +} +{ +add.f16x2 r4364, r4335, r4336; +} +{ +mul.f16x2 r4367, r4364, r4330; +} +{ +add.f16x2 r4370, r4338, r4367; +} +{ +sub.f16x2 r4373, r4341, r4342; +} +{ +mul.f16x2 r4376, r4373, r4332; +} +{ +sub.f16x2 %46, r4370, r4376; +} +{ +add.f16x2 r4382, r4341, r4342; +} +{ +mul.f16x2 r4385, r4382, r4330; +} +{ +add.f16x2 r4388, r4344, r4385; +} +{ +sub.f16x2 r4391, r4335, r4336; +} +{ +mul.f16x2 r4394, r4391, r4332; +} +{ +sub.f16x2 %29, r4388, r4394; +} +{ +add.f16x2 r4400, r4341, r4342; +} +{ +mul.f16x2 r4403, r4400, r4330; +} +{ +add.f16x2 r4406, r4344, r4403; +} +{ +sub.f16x2 r4409, r4335, r4336; +} +{ +mul.f16x2 r4412, r4409, r4332; +} +{ 
+add.f16x2 %47, r4406, r4412; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4418, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4419, {low, high}; +} +{ +neg.f16x2 r4420, r4419; +} +{ +add.f16x2 r4422, r4423, r4424; +} +{ +add.f16x2 %12, r4426, r4422; +} +{ +add.f16x2 r4428, r4429, r4430; +} +{ +add.f16x2 %13, r4432, r4428; +} +{ +add.f16x2 r4434, r4423, r4424; +} +{ +mul.f16x2 r4437, r4434, r4418; +} +{ +add.f16x2 r4440, r4426, r4437; +} +{ +sub.f16x2 r4443, r4429, r4430; +} +{ +mul.f16x2 r4446, r4443, r4420; +} +{ +add.f16x2 %30, r4440, r4446; +} +{ +add.f16x2 r4452, r4423, r4424; +} +{ +mul.f16x2 r4455, r4452, r4418; +} +{ +add.f16x2 r4458, r4426, r4455; +} +{ +sub.f16x2 r4461, r4429, r4430; +} +{ +mul.f16x2 r4464, r4461, r4420; +} +{ +sub.f16x2 %48, r4458, r4464; +} +{ +add.f16x2 r4470, r4429, r4430; +} +{ +mul.f16x2 r4473, r4470, r4418; +} +{ +add.f16x2 r4476, r4432, r4473; +} +{ +sub.f16x2 r4479, r4423, r4424; +} +{ +mul.f16x2 r4482, r4479, r4420; +} +{ +sub.f16x2 %31, r4476, r4482; +} +{ +add.f16x2 r4488, r4429, r4430; +} +{ +mul.f16x2 r4491, r4488, r4418; +} +{ +add.f16x2 r4494, r4432, r4491; +} +{ +sub.f16x2 r4497, r4423, r4424; +} +{ +mul.f16x2 r4500, r4497, r4420; +} +{ +add.f16x2 %49, r4494, r4500; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4506, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4507, {low, high}; +} +{ +neg.f16x2 r4508, r4507; +} +{ +add.f16x2 r4510, r4511, r4512; +} +{ +add.f16x2 %14, r4514, r4510; +} +{ +add.f16x2 r4516, r4517, r4518; +} +{ +add.f16x2 %15, r4520, r4516; +} +{ +add.f16x2 r4522, r4511, r4512; +} +{ +mul.f16x2 r4525, r4522, r4506; +} +{ +add.f16x2 r4528, r4514, r4525; +} +{ +sub.f16x2 r4531, r4517, r4518; +} +{ +mul.f16x2 r4534, r4531, r4508; +} +{ +add.f16x2 %32, r4528, r4534; +} +{ +add.f16x2 r4540, r4511, 
r4512; +} +{ +mul.f16x2 r4543, r4540, r4506; +} +{ +add.f16x2 r4546, r4514, r4543; +} +{ +sub.f16x2 r4549, r4517, r4518; +} +{ +mul.f16x2 r4552, r4549, r4508; +} +{ +sub.f16x2 %50, r4546, r4552; +} +{ +add.f16x2 r4558, r4517, r4518; +} +{ +mul.f16x2 r4561, r4558, r4506; +} +{ +add.f16x2 r4564, r4520, r4561; +} +{ +sub.f16x2 r4567, r4511, r4512; +} +{ +mul.f16x2 r4570, r4567, r4508; +} +{ +sub.f16x2 %33, r4564, r4570; +} +{ +add.f16x2 r4576, r4517, r4518; +} +{ +mul.f16x2 r4579, r4576, r4506; +} +{ +add.f16x2 r4582, r4520, r4579; +} +{ +sub.f16x2 r4585, r4511, r4512; +} +{ +mul.f16x2 r4588, r4585, r4508; +} +{ +add.f16x2 %51, r4582, r4588; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4594, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4595, {low, high}; +} +{ +neg.f16x2 r4596, r4595; +} +{ +add.f16x2 r4598, r4599, r4600; +} +{ +add.f16x2 %16, r4602, r4598; +} +{ +add.f16x2 r4604, r4605, r4606; +} +{ +add.f16x2 %17, r4608, r4604; +} +{ +add.f16x2 r4610, r4599, r4600; +} +{ +mul.f16x2 r4613, r4610, r4594; +} +{ +add.f16x2 r4616, r4602, r4613; +} +{ +sub.f16x2 r4619, r4605, r4606; +} +{ +mul.f16x2 r4622, r4619, r4596; +} +{ +add.f16x2 %34, r4616, r4622; +} +{ +add.f16x2 r4628, r4599, r4600; +} +{ +mul.f16x2 r4631, r4628, r4594; +} +{ +add.f16x2 r4634, r4602, r4631; +} +{ +sub.f16x2 r4637, r4605, r4606; +} +{ +mul.f16x2 r4640, r4637, r4596; +} +{ +sub.f16x2 %52, r4634, r4640; +} +{ +add.f16x2 r4646, r4605, r4606; +} +{ +mul.f16x2 r4649, r4646, r4594; +} +{ +add.f16x2 r4652, r4608, r4649; +} +{ +sub.f16x2 r4655, r4599, r4600; +} +{ +mul.f16x2 r4658, r4655, r4596; +} +{ +sub.f16x2 %35, r4652, r4658; +} +{ +add.f16x2 r4664, r4605, r4606; +} +{ +mul.f16x2 r4667, r4664, r4594; +} +{ +add.f16x2 r4670, r4608, r4667; +} +{ +sub.f16x2 r4673, r4599, r4600; +} +{ +mul.f16x2 r4676, r4673, r4596; +} +{ +add.f16x2 %53, r4670, r4676; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), 
"=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[14].y)), 
"r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[23].y)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[4].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<876, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<611>; +.reg .b64 rd<8>; +mov.u32 r584, %tid.y; +mov.u32 r585, %6; +mad.lo.s32 r586, r584, 648, r585; +mov.u32 r587, %tid.x; +mov.f32 f38, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r1, {low, high}; +} +mov.f32 f40, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; 
+cvt.rn.f16.f32 high, f40; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r587, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r588, rd3; +sub.s32 r589, r587, r588; +shr.u32 r590, r589, 1; +add.s32 r591, r590, r588; +shr.u32 r592, r591, 4; +mul.lo.s32 r593, r592, 27; +sub.s32 r594, r587, r593; +mad.lo.s32 r595, r592, 648, r586; +cvt.rn.f32.u32 f41, r594; +mul.f32 f42, f41, 0f3D9EDD1F; +cos.approx.f32 f5, f42; +sin.approx.f32 f43, f42; +neg.f32 f6, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f33, 0fBF800000; 
+mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r596, r594, 24, r595; +st.shared.v2.f32 [r596], {r8, r14}; +st.shared.v2.f32 [r596+8], {r101, r108}; +st.shared.v2.f32 [r596+16], {r138, r145}; +barrier.sync 0; +shl.b32 r597, r594, 4; +sub.s32 r598, r596, r597; +ld.shared.u32 r174, [r598]; +ld.shared.u32 r180, [r598+4]; +ld.shared.u32 r171, [r598+216]; +ld.shared.u32 r177, [r598+220]; +ld.shared.u32 r172, [r598+432]; +ld.shared.u32 r178, [r598+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ 
+add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r594, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r599, rd5; +mul.lo.s32 r600, r599, 3; +sub.s32 r601, r594, r600; +shl.b32 r602, r601, 3; +add.s32 r603, r595, r602; +cvt.rn.f32.u32 f44, r599; +mul.f32 f45, f44, 0f3E6E4BAE; +cos.approx.f32 f17, f45; +sin.approx.f32 f46, f45; +neg.f32 f18, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r604, 
r599, 72, r603; +st.shared.u32 [r604], r173; +st.shared.u32 [r604+4], r179; +st.shared.u32 [r604+24], r266; +st.shared.u32 [r604+28], r273; +st.shared.u32 [r604+48], r303; +st.shared.u32 [r604+52], r310; +barrier.sync 0; +ld.shared.u32 r339, [r598]; +ld.shared.u32 r345, [r598+4]; +ld.shared.u32 r336, [r598+216]; +ld.shared.u32 r342, [r598+220]; +ld.shared.u32 r337, [r598+432]; +ld.shared.u32 r343, [r598+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r594, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r605, rd7; +mul.lo.s32 r606, r605, 9; +sub.s32 r607, r594, r606; +shl.b32 r608, r607, 3; +add.s32 r609, r595, r608; +cvt.rn.f32.u32 f47, r605; +mul.f32 f48, f47, 0f3F32B8C2; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r610, r605, 216, r609; +st.shared.u32 [r610], r338; +st.shared.u32 [r610+4], r344; +st.shared.u32 [r610+72], r431; +st.shared.u32 [r610+76], r438; +st.shared.u32 [r610+144], r468; +st.shared.u32 [r610+148], r475; +barrier.sync 0; +ld.shared.u32 r504, [r598]; +ld.shared.u32 r510, [r598+4]; +ld.shared.u32 r501, [r598+216]; +ld.shared.u32 r507, [r598+220]; +ld.shared.u32 r502, [r598+432]; +ld.shared.u32 r508, [r598+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, r502; +} +{ +add.f16x2 
%0, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 %1, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 %2, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 %4, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 %3, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 %5, r572, r578; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<877, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<611>; +.reg .b64 rd<8>; +mov.u32 r584, %tid.y; +mov.u32 r585, %6; +mad.lo.s32 r586, r584, 324, r585; +mov.u32 r587, %tid.x; +mov.f32 f38, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r1, {low, high}; +} +mov.f32 f40, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} 
+{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r587, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r588, rd3; +sub.s32 r589, r587, r588; +shr.u32 r590, r589, 1; +add.s32 r591, r590, r588; +shr.u32 r592, r591, 4; +mul.lo.s32 r593, r592, 27; +sub.s32 r594, r587, r593; +mad.lo.s32 r595, r592, 324, r586; +cvt.rn.f32.u32 f41, r594; +mul.f32 f42, f41, 0f3D9EDD1F; +cos.approx.f32 f5, f42; +sin.approx.f32 f43, f42; +neg.f32 f6, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r116, {low, high}; +} +{ 
+mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r596, r594, 12, r595; +st.shared.u32 [r596], r8; +st.shared.u32 [r596+4], r101; +st.shared.u32 [r596+8], r138; +barrier.sync 0; +shl.b32 r597, r594, 3; +sub.s32 r598, r596, r597; +ld.shared.u32 r174, [r598]; +ld.shared.u32 r171, [r598+108]; +ld.shared.u32 r172, [r598+216]; +barrier.sync 0; +st.shared.u32 [r596], r14; +st.shared.u32 [r596+4], r108; +st.shared.u32 [r596+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r598]; +ld.shared.u32 r177, [r598+108]; +ld.shared.u32 r178, [r598+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 r173, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 r179, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 r197, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 r215, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, 
r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 r233, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 r251, r242, r248; +} +mul.wide.u32 rd4, r594, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r599, rd5; +mul.lo.s32 r600, r599, 3; +sub.s32 r601, r594, r600; +shl.b32 r602, r601, 2; +add.s32 r603, r595, r602; +cvt.rn.f32.u32 f44, r599; +mul.f32 f45, f44, 0f3E6E4BAE; +cos.approx.f32 f17, f45; +sin.approx.f32 f46, f45; +neg.f32 f18, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r254, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r257, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r259, {high, high}; +} +{ +mul.f16x2 r261, r233, r259; +} +{ +neg.f16x2 r264, r261; +} +{ +fma.rn.f16x2 r266, r197, r257, r264; +} +{ +mul.f16x2 r270, r197, r259; +} +{ +fma.rn.f16x2 r273, r233, r257, r270; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r277, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r279, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r281, {low, high}; +} +{ +mul.f16x2 r282, r279, r281; +} +{ +mul.f16x2 r285, r254, r277; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r254; +mov.b32 r288, {high, low}; +} +{ +fma.rn.f16x2 r290, r282, r288, r285; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r294, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r290; +mov.b32 r296, {high, high}; +} +{ +mul.f16x2 r298, r251, r296; +} +{ +neg.f16x2 r301, r298; +} +{ +fma.rn.f16x2 r303, r215, r294, r301; +} +{ +mul.f16x2 r307, r215, r296; +} +{ +fma.rn.f16x2 r310, r251, r294, r307; +} +barrier.sync 0; +mad.lo.s32 r604, r599, 36, r603; +st.shared.u32 [r604], r173; 
+st.shared.u32 [r604+12], r266; +st.shared.u32 [r604+24], r303; +barrier.sync 0; +ld.shared.u32 r339, [r598]; +ld.shared.u32 r336, [r598+108]; +ld.shared.u32 r337, [r598+216]; +barrier.sync 0; +st.shared.u32 [r604], r179; +st.shared.u32 [r604+12], r273; +st.shared.u32 [r604+24], r310; +barrier.sync 0; +ld.shared.u32 r345, [r598]; +ld.shared.u32 r342, [r598+108]; +ld.shared.u32 r343, [r598+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r331, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r332, {low, high}; +} +{ +neg.f16x2 r333, r332; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r342, r343; +} +{ +add.f16x2 r344, r345, r341; +} +{ +add.f16x2 r347, r336, r337; +} +{ +mul.f16x2 r350, r347, r331; +} +{ +add.f16x2 r353, r339, r350; +} +{ +sub.f16x2 r356, r342, r343; +} +{ +mul.f16x2 r359, r356, r333; +} +{ +add.f16x2 r362, r353, r359; +} +{ +add.f16x2 r365, r336, r337; +} +{ +mul.f16x2 r368, r365, r331; +} +{ +add.f16x2 r371, r339, r368; +} +{ +sub.f16x2 r374, r342, r343; +} +{ +mul.f16x2 r377, r374, r333; +} +{ +sub.f16x2 r380, r371, r377; +} +{ +add.f16x2 r383, r342, r343; +} +{ +mul.f16x2 r386, r383, r331; +} +{ +add.f16x2 r389, r345, r386; +} +{ +sub.f16x2 r392, r336, r337; +} +{ +mul.f16x2 r395, r392, r333; +} +{ +sub.f16x2 r398, r389, r395; +} +{ +add.f16x2 r401, r342, r343; +} +{ +mul.f16x2 r404, r401, r331; +} +{ +add.f16x2 r407, r345, r404; +} +{ +sub.f16x2 r410, r336, r337; +} +{ +mul.f16x2 r413, r410, r333; +} +{ +add.f16x2 r416, r407, r413; +} +mul.wide.u32 rd6, r594, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r605, rd7; +mul.lo.s32 r606, r605, 9; +sub.s32 r607, r594, r606; +shl.b32 r608, r607, 2; +add.s32 r609, r595, r608; +cvt.rn.f32.u32 f47, r605; +mul.f32 f48, f47, 0f3F32B8C2; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; 
+cvt.rn.f16.f32 high, f30; +mov.b32 r419, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r422, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r424, {high, high}; +} +{ +mul.f16x2 r426, r398, r424; +} +{ +neg.f16x2 r429, r426; +} +{ +fma.rn.f16x2 r431, r362, r422, r429; +} +{ +mul.f16x2 r435, r362, r424; +} +{ +fma.rn.f16x2 r438, r398, r422, r435; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r442, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r444, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r446, {low, high}; +} +{ +mul.f16x2 r447, r444, r446; +} +{ +mul.f16x2 r450, r419, r442; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r419; +mov.b32 r453, {high, low}; +} +{ +fma.rn.f16x2 r455, r447, r453, r450; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r459, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r455; +mov.b32 r461, {high, high}; +} +{ +mul.f16x2 r463, r416, r461; +} +{ +neg.f16x2 r466, r463; +} +{ +fma.rn.f16x2 r468, r380, r459, r466; +} +{ +mul.f16x2 r472, r380, r461; +} +{ +fma.rn.f16x2 r475, r416, r459, r472; +} +barrier.sync 0; +mad.lo.s32 r610, r605, 108, r609; +st.shared.u32 [r610], r338; +st.shared.u32 [r610+36], r431; +st.shared.u32 [r610+72], r468; +barrier.sync 0; +ld.shared.u32 r504, [r598]; +ld.shared.u32 r501, [r598+108]; +ld.shared.u32 r502, [r598+216]; +barrier.sync 0; +st.shared.u32 [r610], r344; +st.shared.u32 [r610+36], r438; +st.shared.u32 [r610+72], r475; +barrier.sync 0; +ld.shared.u32 r510, [r598]; +ld.shared.u32 r507, [r598+108]; +ld.shared.u32 r508, [r598+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r496, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r497, {low, high}; +} +{ +neg.f16x2 r498, r497; +} +{ +add.f16x2 r500, r501, 
r502; +} +{ +add.f16x2 %0, r504, r500; +} +{ +add.f16x2 r506, r507, r508; +} +{ +add.f16x2 %1, r510, r506; +} +{ +add.f16x2 r512, r501, r502; +} +{ +mul.f16x2 r515, r512, r496; +} +{ +add.f16x2 r518, r504, r515; +} +{ +sub.f16x2 r521, r507, r508; +} +{ +mul.f16x2 r524, r521, r498; +} +{ +add.f16x2 %2, r518, r524; +} +{ +add.f16x2 r530, r501, r502; +} +{ +mul.f16x2 r533, r530, r496; +} +{ +add.f16x2 r536, r504, r533; +} +{ +sub.f16x2 r539, r507, r508; +} +{ +mul.f16x2 r542, r539, r498; +} +{ +sub.f16x2 %4, r536, r542; +} +{ +add.f16x2 r548, r507, r508; +} +{ +mul.f16x2 r551, r548, r496; +} +{ +add.f16x2 r554, r510, r551; +} +{ +sub.f16x2 r557, r501, r502; +} +{ +mul.f16x2 r560, r557, r498; +} +{ +sub.f16x2 %3, r554, r560; +} +{ +add.f16x2 r566, r507, r508; +} +{ +mul.f16x2 r569, r566, r496; +} +{ +add.f16x2 r572, r510, r569; +} +{ +sub.f16x2 r575, r501, r502; +} +{ +mul.f16x2 r578, r575, r498; +} +{ +add.f16x2 %5, r572, r578; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..5047f5c08fed4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp16_inv.hpp.inc @@ -0,0 +1,16555 @@ +#ifndef CUFFTDX_FFT_81_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_81_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<1074, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<136>; +.reg .b32 r<1503>; +.reg .b64 rd<4>; +mov.u32 r1492, %tid.y; +mov.u32 r1493, %18; +mad.lo.s32 r1494, r1492, 648, r1493; +mov.u32 r1495, %tid.x; +mov.f32 f130, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1, {low, high}; +} +mov.f32 f132, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, %32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ 
+add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, %36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r259, {low, high}; +} +mov.f32 f92, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; 
+cvt.rn.f16.f32 high, f92; +mov.b32 r260, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r261, {low, high}; +} +mov.f32 f96, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r262, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r265, {low, high}; +} +mov.f32 f104, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, 
r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, 
r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r1495, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1496, rd3; +mul.lo.s32 r1497, r1496, 9; +sub.s32 r1498, r1495, r1497; +cvt.rn.f32.u32 f133, r1498; +mul.f32 f134, f133, 0f3D9EDD1F; +cos.approx.f32 f57, f134; +sin.approx.f32 f135, f134; +neg.f32 f58, f135; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r624, {low, high}; +} +{ 
+mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 
r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +mad.lo.s32 r1499, r1496, 648, r1494; +barrier.sync 0; +mad.lo.s32 r1500, r1498, 72, r1499; +st.shared.v2.f32 [r1500], {r344, r350}; +st.shared.v2.f32 [r1500+8], {r607, r616}; +st.shared.v2.f32 [r1500+16], {r644, r653}; +st.shared.v2.f32 [r1500+24], {r681, r690}; +st.shared.v2.f32 [r1500+32], {r718, r727}; +st.shared.v2.f32 [r1500+40], {r755, r764}; +st.shared.v2.f32 [r1500+48], {r792, r801}; +st.shared.v2.f32 [r1500+56], {r829, r838}; +st.shared.v2.f32 [r1500+64], 
{r866, r875}; +barrier.sync 0; +shl.b32 r1501, r1498, 6; +sub.s32 r1502, r1500, r1501; +ld.shared.u32 r902, [r1502]; +ld.shared.u32 r908, [r1502+4]; +ld.shared.u32 r988, [r1502+72]; +ld.shared.u32 r994, [r1502+76]; +ld.shared.u32 r1074, [r1502+144]; +ld.shared.u32 r1080, [r1502+148]; +ld.shared.u32 r899, [r1502+216]; +ld.shared.u32 r905, [r1502+220]; +ld.shared.u32 r985, [r1502+288]; +ld.shared.u32 r991, [r1502+292]; +ld.shared.u32 r1071, [r1502+360]; +ld.shared.u32 r1077, [r1502+364]; +ld.shared.u32 r900, [r1502+432]; +ld.shared.u32 r906, [r1502+436]; +ld.shared.u32 r986, [r1502+504]; +ld.shared.u32 r992, [r1502+508]; +ld.shared.u32 r1072, [r1502+576]; +ld.shared.u32 r1078, [r1502+580]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r982, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, 
r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1157, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1234, 
{low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 %0, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 %1, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 %6, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 %12, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ +mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 %7, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 %13, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 %2, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 %3, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 %8, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 %14, r1358, r1364; +} +{ +add.f16x2 r1370, 
r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 %9, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 %15, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 %4, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 %5, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; +} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 %10, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 %16, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 %11, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 %17, r1480, r1486; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), 
"=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1075, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<136>; +.reg .b32 r<1503>; +.reg .b64 rd<4>; +mov.u32 r1492, %tid.y; +mov.u32 r1493, %18; +mad.lo.s32 r1494, r1492, 324, r1493; +mov.u32 r1495, %tid.x; +mov.f32 f130, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1, {low, high}; +} +mov.f32 f132, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %25, %31; +} +{ +add.f16x2 r6, %19, r3; +} +{ +add.f16x2 r9, %26, %32; +} +{ +add.f16x2 r12, %20, r9; +} +{ +add.f16x2 r15, %25, %31; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %19, r18; +} +{ +sub.f16x2 r24, %26, %32; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %25, %31; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %19, r36; +} +{ +sub.f16x2 r42, %26, %32; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %26, 
%32; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %20, r54; +} +{ +sub.f16x2 r60, %25, %31; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %26, %32; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %20, r72; +} +{ +sub.f16x2 r78, %25, %31; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %27, %33; +} +{ +add.f16x2 r92, %21, r89; +} +{ +add.f16x2 r95, %28, %34; +} +{ +add.f16x2 r98, %22, r95; +} +{ +add.f16x2 r101, %27, %33; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %21, r104; +} +{ +sub.f16x2 r110, %28, %34; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %27, %33; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %21, r122; +} +{ +sub.f16x2 r128, %28, %34; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %28, %34; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %22, r140; +} +{ +sub.f16x2 r146, %27, %33; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %28, %34; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %22, r158; +} +{ +sub.f16x2 r164, %27, %33; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %30, %36; +} +{ +add.f16x2 r184, %24, r181; +} +{ +add.f16x2 r187, %29, %35; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %23, r190; +} +{ +sub.f16x2 r196, %30, 
%36; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %29, %35; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %23, r208; +} +{ +sub.f16x2 r214, %30, %36; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %30, %36; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %24, r226; +} +{ +sub.f16x2 r232, %29, %35; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %30, %36; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %24, r244; +} +{ +sub.f16x2 r250, %29, %35; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f90, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r259, {low, high}; +} +mov.f32 f92, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r260, {low, high}; +} +mov.f32 f94, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r261, {low, high}; +} +mov.f32 f96, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r262, {low, high}; +} +mov.f32 f102, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r265, {low, high}; +} +mov.f32 f104, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 
r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; 
+} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +mul.wide.u32 rd2, r1495, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r1496, rd3; +mul.lo.s32 r1497, r1496, 9; +sub.s32 r1498, r1495, r1497; +mad.lo.s32 r1499, r1496, 
324, r1494; +cvt.rn.f32.u32 f133, r1498; +mul.f32 f134, f133, 0f3D9EDD1F; +cos.approx.f32 f57, f134; +sin.approx.f32 f135, f134; +neg.f32 f58, f135; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f57; +cvt.rn.f16.f32 high, f58; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r600, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r602, {high, high}; +} +{ +mul.f16x2 r604, r436, r602; +} +{ +fma.rn.f16x2 r607, r430, r600, r604; +} +{ +mul.f16x2 r611, r430, r602; +} +{ +neg.f16x2 r614, r611; +} +{ +fma.rn.f16x2 r616, r436, r600, r614; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r620, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r622, {high, high}; +} +mov.f32 f73, 0fBF800000; +mov.f32 f74, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r624, {low, high}; +} +{ +mul.f16x2 r625, r622, r624; +} +{ +mul.f16x2 r628, r597, r620; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r631, {high, low}; +} +{ +fma.rn.f16x2 r633, r625, r631, r628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r637, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r639, {high, high}; +} +{ +mul.f16x2 r641, r522, r639; +} +{ +fma.rn.f16x2 r644, r516, r637, r641; +} +{ +mul.f16x2 r648, r516, r639; +} +{ +neg.f16x2 r651, r648; +} +{ +fma.rn.f16x2 r653, r522, r637, r651; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r657, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r659, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r661, {low, high}; +} +{ +mul.f16x2 r662, r659, r661; +} +{ +mul.f16x2 r665, r633, r657; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r633; +mov.b32 r668, {high, low}; +} +{ +fma.rn.f16x2 r670, r662, r668, r665; +} +{ +.reg .f16 low, high; 
+mov.b32 {low, high}, r670; +mov.b32 r674, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r676, {high, high}; +} +{ +mul.f16x2 r678, r404, r676; +} +{ +fma.rn.f16x2 r681, r368, r674, r678; +} +{ +mul.f16x2 r685, r368, r676; +} +{ +neg.f16x2 r688, r685; +} +{ +fma.rn.f16x2 r690, r404, r674, r688; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r694, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r696, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r698, {low, high}; +} +{ +mul.f16x2 r699, r696, r698; +} +{ +mul.f16x2 r702, r670, r694; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r670; +mov.b32 r705, {high, low}; +} +{ +fma.rn.f16x2 r707, r699, r705, r702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r711, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r713, {high, high}; +} +{ +mul.f16x2 r715, r490, r713; +} +{ +fma.rn.f16x2 r718, r454, r711, r715; +} +{ +mul.f16x2 r722, r454, r713; +} +{ +neg.f16x2 r725, r722; +} +{ +fma.rn.f16x2 r727, r490, r711, r725; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r731, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r733, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r735, {low, high}; +} +{ +mul.f16x2 r736, r733, r735; +} +{ +mul.f16x2 r739, r707, r731; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r707; +mov.b32 r742, {high, low}; +} +{ +fma.rn.f16x2 r744, r736, r742, r739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r748, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r750, {high, high}; +} +{ +mul.f16x2 r752, r576, r750; +} +{ +fma.rn.f16x2 r755, r540, r748, r752; +} +{ +mul.f16x2 r759, r540, r750; +} +{ +neg.f16x2 r762, r759; +} +{ +fma.rn.f16x2 r764, r576, r748, r762; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r768, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r770, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r772, {low, high}; +} +{ +mul.f16x2 r773, r770, r772; +} +{ +mul.f16x2 r776, r744, r768; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r744; +mov.b32 r779, {high, low}; +} +{ +fma.rn.f16x2 r781, r773, r779, r776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r785, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r787, {high, high}; +} +{ +mul.f16x2 r789, r422, r787; +} +{ +fma.rn.f16x2 r792, r386, r785, r789; +} +{ +mul.f16x2 r796, r386, r787; +} +{ +neg.f16x2 r799, r796; +} +{ +fma.rn.f16x2 r801, r422, r785, r799; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r805, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r807, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r809, {low, high}; +} +{ +mul.f16x2 r810, r807, r809; +} +{ +mul.f16x2 r813, r781, r805; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r781; +mov.b32 r816, {high, low}; +} +{ +fma.rn.f16x2 r818, r810, r816, r813; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r822, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r824, {high, high}; +} +{ +mul.f16x2 r826, r508, r824; +} +{ +fma.rn.f16x2 r829, r472, r822, r826; +} +{ +mul.f16x2 r833, r472, r824; +} +{ +neg.f16x2 r836, r833; +} +{ +fma.rn.f16x2 r838, r508, r822, r836; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r842, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r597; +mov.b32 r844, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f73; +cvt.rn.f16.f32 high, f74; +mov.b32 r846, {low, high}; +} +{ +mul.f16x2 r847, r844, r846; +} +{ +mul.f16x2 r850, r818, r842; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r818; +mov.b32 r853, {high, low}; +} +{ +fma.rn.f16x2 r855, r847, r853, r850; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r859, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r855; +mov.b32 r861, {high, high}; +} +{ +mul.f16x2 r863, r594, r861; +} +{ +fma.rn.f16x2 r866, r558, r859, r863; +} +{ +mul.f16x2 r870, r558, r861; +} +{ +neg.f16x2 r873, r870; +} +{ +fma.rn.f16x2 r875, r594, r859, r873; +} +barrier.sync 0; +mad.lo.s32 r1500, r1498, 36, r1499; +st.shared.u32 [r1500], r344; +st.shared.u32 [r1500+4], r607; +st.shared.u32 [r1500+8], r644; +st.shared.u32 [r1500+12], r681; +st.shared.u32 [r1500+16], r718; +st.shared.u32 [r1500+20], r755; +st.shared.u32 [r1500+24], r792; +st.shared.u32 [r1500+28], r829; +st.shared.u32 [r1500+32], r866; +barrier.sync 0; +shl.b32 r1501, r1498, 5; +sub.s32 r1502, r1500, r1501; +ld.shared.u32 r902, [r1502]; +ld.shared.u32 r988, [r1502+36]; +ld.shared.u32 r1074, [r1502+72]; +ld.shared.u32 r899, [r1502+108]; +ld.shared.u32 r985, [r1502+144]; +ld.shared.u32 r1071, [r1502+180]; +ld.shared.u32 r900, [r1502+216]; +ld.shared.u32 r986, [r1502+252]; +ld.shared.u32 r1072, [r1502+288]; +barrier.sync 0; +st.shared.u32 [r1500], r350; +st.shared.u32 [r1500+4], r616; +st.shared.u32 [r1500+8], r653; +st.shared.u32 [r1500+12], r690; +st.shared.u32 [r1500+16], r727; +st.shared.u32 [r1500+20], r764; +st.shared.u32 [r1500+24], r801; +st.shared.u32 [r1500+28], r838; +st.shared.u32 [r1500+32], r875; +barrier.sync 0; +ld.shared.u32 r908, [r1502]; +ld.shared.u32 r994, [r1502+36]; +ld.shared.u32 r1080, [r1502+72]; +ld.shared.u32 r905, [r1502+108]; +ld.shared.u32 r991, [r1502+144]; +ld.shared.u32 r1077, [r1502+180]; +ld.shared.u32 r906, [r1502+216]; +ld.shared.u32 r992, [r1502+252]; +ld.shared.u32 r1078, [r1502+288]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r896, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; 
+cvt.rn.f16.f32 high, f132; +mov.b32 r897, {low, high}; +} +{ +add.f16x2 r898, r899, r900; +} +{ +add.f16x2 r901, r902, r898; +} +{ +add.f16x2 r904, r905, r906; +} +{ +add.f16x2 r907, r908, r904; +} +{ +add.f16x2 r910, r899, r900; +} +{ +mul.f16x2 r913, r910, r896; +} +{ +add.f16x2 r916, r902, r913; +} +{ +sub.f16x2 r919, r905, r906; +} +{ +mul.f16x2 r922, r919, r897; +} +{ +add.f16x2 r925, r916, r922; +} +{ +add.f16x2 r928, r899, r900; +} +{ +mul.f16x2 r931, r928, r896; +} +{ +add.f16x2 r934, r902, r931; +} +{ +sub.f16x2 r937, r905, r906; +} +{ +mul.f16x2 r940, r937, r897; +} +{ +sub.f16x2 r943, r934, r940; +} +{ +add.f16x2 r946, r905, r906; +} +{ +mul.f16x2 r949, r946, r896; +} +{ +add.f16x2 r952, r908, r949; +} +{ +sub.f16x2 r955, r899, r900; +} +{ +mul.f16x2 r958, r955, r897; +} +{ +sub.f16x2 r961, r952, r958; +} +{ +add.f16x2 r964, r905, r906; +} +{ +mul.f16x2 r967, r964, r896; +} +{ +add.f16x2 r970, r908, r967; +} +{ +sub.f16x2 r973, r899, r900; +} +{ +mul.f16x2 r976, r973, r897; +} +{ +add.f16x2 r979, r970, r976; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r982, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r983, {low, high}; +} +{ +add.f16x2 r984, r985, r986; +} +{ +add.f16x2 r987, r988, r984; +} +{ +add.f16x2 r990, r991, r992; +} +{ +add.f16x2 r993, r994, r990; +} +{ +add.f16x2 r996, r985, r986; +} +{ +mul.f16x2 r999, r996, r982; +} +{ +add.f16x2 r1002, r988, r999; +} +{ +sub.f16x2 r1005, r991, r992; +} +{ +mul.f16x2 r1008, r1005, r983; +} +{ +add.f16x2 r1011, r1002, r1008; +} +{ +add.f16x2 r1014, r985, r986; +} +{ +mul.f16x2 r1017, r1014, r982; +} +{ +add.f16x2 r1020, r988, r1017; +} +{ +sub.f16x2 r1023, r991, r992; +} +{ +mul.f16x2 r1026, r1023, r983; +} +{ +sub.f16x2 r1029, r1020, r1026; +} +{ +add.f16x2 r1032, r991, r992; +} +{ +mul.f16x2 r1035, r1032, r982; +} +{ +add.f16x2 r1038, r994, r1035; +} +{ +sub.f16x2 r1041, r985, r986; +} +{ +mul.f16x2 
r1044, r1041, r983; +} +{ +sub.f16x2 r1047, r1038, r1044; +} +{ +add.f16x2 r1050, r991, r992; +} +{ +mul.f16x2 r1053, r1050, r982; +} +{ +add.f16x2 r1056, r994, r1053; +} +{ +sub.f16x2 r1059, r985, r986; +} +{ +mul.f16x2 r1062, r1059, r983; +} +{ +add.f16x2 r1065, r1056, r1062; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1068, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1069, {low, high}; +} +{ +add.f16x2 r1070, r1071, r1072; +} +{ +add.f16x2 r1073, r1074, r1070; +} +{ +add.f16x2 r1076, r1077, r1078; +} +{ +add.f16x2 r1079, r1080, r1076; +} +{ +add.f16x2 r1082, r1071, r1072; +} +{ +mul.f16x2 r1085, r1082, r1068; +} +{ +add.f16x2 r1088, r1074, r1085; +} +{ +sub.f16x2 r1091, r1077, r1078; +} +{ +mul.f16x2 r1094, r1091, r1069; +} +{ +add.f16x2 r1097, r1088, r1094; +} +{ +add.f16x2 r1100, r1071, r1072; +} +{ +mul.f16x2 r1103, r1100, r1068; +} +{ +add.f16x2 r1106, r1074, r1103; +} +{ +sub.f16x2 r1109, r1077, r1078; +} +{ +mul.f16x2 r1112, r1109, r1069; +} +{ +sub.f16x2 r1115, r1106, r1112; +} +{ +add.f16x2 r1118, r1077, r1078; +} +{ +mul.f16x2 r1121, r1118, r1068; +} +{ +add.f16x2 r1124, r1080, r1121; +} +{ +sub.f16x2 r1127, r1071, r1072; +} +{ +mul.f16x2 r1130, r1127, r1069; +} +{ +sub.f16x2 r1133, r1124, r1130; +} +{ +add.f16x2 r1136, r1077, r1078; +} +{ +mul.f16x2 r1139, r1136, r1068; +} +{ +add.f16x2 r1142, r1080, r1139; +} +{ +sub.f16x2 r1145, r1071, r1072; +} +{ +mul.f16x2 r1148, r1145, r1069; +} +{ +add.f16x2 r1151, r1142, r1148; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f90; +cvt.rn.f16.f32 high, f90; +mov.b32 r1154, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f92; +cvt.rn.f16.f32 high, f92; +mov.b32 r1155, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f94; +cvt.rn.f16.f32 high, f94; +mov.b32 r1156, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f96; +cvt.rn.f16.f32 high, f96; +mov.b32 r1157, {low, 
high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f102; +cvt.rn.f16.f32 high, f102; +mov.b32 r1160, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f104; +cvt.rn.f16.f32 high, f104; +mov.b32 r1161, {low, high}; +} +{ +mul.f16x2 r1170, r1011, r1154; +} +{ +mul.f16x2 r1173, r1047, r1155; +} +{ +sub.f16x2 r1176, r1170, r1173; +} +{ +mul.f16x2 r1179, r1011, r1155; +} +{ +fma.rn.f16x2 r1182, r1047, r1154, r1179; +} +{ +mul.f16x2 r1186, r1097, r1156; +} +{ +mul.f16x2 r1189, r1133, r1157; +} +{ +sub.f16x2 r1192, r1186, r1189; +} +{ +mul.f16x2 r1195, r1097, r1157; +} +{ +fma.rn.f16x2 r1198, r1133, r1156, r1195; +} +{ +mul.f16x2 r1202, r1029, r1156; +} +{ +mul.f16x2 r1205, r1065, r1157; +} +{ +sub.f16x2 r1208, r1202, r1205; +} +{ +mul.f16x2 r1211, r1029, r1157; +} +{ +fma.rn.f16x2 r1214, r1065, r1156, r1211; +} +{ +mul.f16x2 r1218, r1115, r1160; +} +{ +mul.f16x2 r1221, r1151, r1161; +} +{ +sub.f16x2 r1224, r1218, r1221; +} +{ +mul.f16x2 r1227, r1115, r1161; +} +{ +fma.rn.f16x2 r1230, r1151, r1160, r1227; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1234, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1235, {low, high}; +} +{ +add.f16x2 r1236, r987, r1073; +} +{ +add.f16x2 %0, r901, r1236; +} +{ +add.f16x2 r1242, r993, r1079; +} +{ +add.f16x2 %1, r907, r1242; +} +{ +add.f16x2 r1248, r987, r1073; +} +{ +mul.f16x2 r1251, r1248, r1234; +} +{ +add.f16x2 r1254, r901, r1251; +} +{ +sub.f16x2 r1257, r993, r1079; +} +{ +mul.f16x2 r1260, r1257, r1235; +} +{ +add.f16x2 %6, r1254, r1260; +} +{ +add.f16x2 r1266, r987, r1073; +} +{ +mul.f16x2 r1269, r1266, r1234; +} +{ +add.f16x2 r1272, r901, r1269; +} +{ +sub.f16x2 r1275, r993, r1079; +} +{ +mul.f16x2 r1278, r1275, r1235; +} +{ +sub.f16x2 %12, r1272, r1278; +} +{ +add.f16x2 r1284, r993, r1079; +} +{ +mul.f16x2 r1287, r1284, r1234; +} +{ +add.f16x2 r1290, r907, r1287; +} +{ +sub.f16x2 r1293, r987, r1073; +} +{ 
+mul.f16x2 r1296, r1293, r1235; +} +{ +sub.f16x2 %7, r1290, r1296; +} +{ +add.f16x2 r1302, r993, r1079; +} +{ +mul.f16x2 r1305, r1302, r1234; +} +{ +add.f16x2 r1308, r907, r1305; +} +{ +sub.f16x2 r1311, r987, r1073; +} +{ +mul.f16x2 r1314, r1311, r1235; +} +{ +add.f16x2 %13, r1308, r1314; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1320, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1321, {low, high}; +} +{ +add.f16x2 r1322, r1176, r1192; +} +{ +add.f16x2 %2, r925, r1322; +} +{ +add.f16x2 r1328, r1182, r1198; +} +{ +add.f16x2 %3, r961, r1328; +} +{ +add.f16x2 r1334, r1176, r1192; +} +{ +mul.f16x2 r1337, r1334, r1320; +} +{ +add.f16x2 r1340, r925, r1337; +} +{ +sub.f16x2 r1343, r1182, r1198; +} +{ +mul.f16x2 r1346, r1343, r1321; +} +{ +add.f16x2 %8, r1340, r1346; +} +{ +add.f16x2 r1352, r1176, r1192; +} +{ +mul.f16x2 r1355, r1352, r1320; +} +{ +add.f16x2 r1358, r925, r1355; +} +{ +sub.f16x2 r1361, r1182, r1198; +} +{ +mul.f16x2 r1364, r1361, r1321; +} +{ +sub.f16x2 %14, r1358, r1364; +} +{ +add.f16x2 r1370, r1182, r1198; +} +{ +mul.f16x2 r1373, r1370, r1320; +} +{ +add.f16x2 r1376, r961, r1373; +} +{ +sub.f16x2 r1379, r1176, r1192; +} +{ +mul.f16x2 r1382, r1379, r1321; +} +{ +sub.f16x2 %9, r1376, r1382; +} +{ +add.f16x2 r1388, r1182, r1198; +} +{ +mul.f16x2 r1391, r1388, r1320; +} +{ +add.f16x2 r1394, r961, r1391; +} +{ +sub.f16x2 r1397, r1176, r1192; +} +{ +mul.f16x2 r1400, r1397, r1321; +} +{ +add.f16x2 %15, r1394, r1400; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f130; +cvt.rn.f16.f32 high, f130; +mov.b32 r1406, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f132; +cvt.rn.f16.f32 high, f132; +mov.b32 r1407, {low, high}; +} +{ +add.f16x2 r1408, r1208, r1224; +} +{ +add.f16x2 %4, r943, r1408; +} +{ +add.f16x2 r1414, r1214, r1230; +} +{ +add.f16x2 %5, r979, r1414; +} +{ +add.f16x2 r1420, r1208, r1224; +} +{ +mul.f16x2 r1423, r1420, r1406; 
+} +{ +add.f16x2 r1426, r943, r1423; +} +{ +sub.f16x2 r1429, r1214, r1230; +} +{ +mul.f16x2 r1432, r1429, r1407; +} +{ +add.f16x2 %10, r1426, r1432; +} +{ +add.f16x2 r1438, r1208, r1224; +} +{ +mul.f16x2 r1441, r1438, r1406; +} +{ +add.f16x2 r1444, r943, r1441; +} +{ +sub.f16x2 r1447, r1214, r1230; +} +{ +mul.f16x2 r1450, r1447, r1407; +} +{ +sub.f16x2 %16, r1444, r1450; +} +{ +add.f16x2 r1456, r1214, r1230; +} +{ +mul.f16x2 r1459, r1456, r1406; +} +{ +add.f16x2 r1462, r979, r1459; +} +{ +sub.f16x2 r1465, r1208, r1224; +} +{ +mul.f16x2 r1468, r1465, r1407; +} +{ +sub.f16x2 %11, r1462, r1468; +} +{ +add.f16x2 r1474, r1214, r1230; +} +{ +mul.f16x2 r1477, r1474, r1406; +} +{ +add.f16x2 r1480, r979, r1477; +} +{ +sub.f16x2 r1483, r1208, r1224; +} +{ +mul.f16x2 r1486, r1483, r1407; +} +{ +add.f16x2 %17, r1480, r1486; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + 
+template<> __forceinline__ __device__ void cufftdx_private_function<1077, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<404>; +.reg .b32 r<4676>; +.reg .b64 rd<4>; +mov.u32 r4674, %tid.y; +mov.u32 r4675, %54; +mad.lo.s32 r4612, r4674, 648, r4675; +mov.u32 r4613, %tid.x; +mov.f32 f398, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1, {low, high}; +} +mov.f32 f400, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %106, %91; +} +{ +add.f16x2 r6, %61, r3; +} +{ +add.f16x2 r9, %70, %104; +} +{ +add.f16x2 r12, %76, r9; +} +{ +add.f16x2 r15, %106, %91; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %61, r18; +} +{ +sub.f16x2 r24, %70, %104; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %106, %91; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %61, r36; +} +{ +sub.f16x2 r42, %70, %104; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %70, %104; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %76, r54; +} +{ +sub.f16x2 r60, %106, %91; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %70, %104; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %76, r72; +} +{ +sub.f16x2 r78, %106, %91; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %105, %90; +} +{ +add.f16x2 r92, %60, r89; +} +{ +add.f16x2 r95, %69, %102; +} +{ +add.f16x2 r98, %75, r95; +} +{ +add.f16x2 r101, %105, %90; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %60, r104; +} +{ +sub.f16x2 r110, %69, %102; +} +{ +mul.f16x2 
r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %105, %90; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %60, r122; +} +{ +sub.f16x2 r128, %69, %102; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %69, %102; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %75, r140; +} +{ +sub.f16x2 r146, %105, %90; +} +{ +mul.f16x2 r149, r146, r88; +} +{ +sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %69, %102; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %75, r158; +} +{ +sub.f16x2 r164, %105, %90; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %103, %88; +} +{ +add.f16x2 r178, %59, r175; +} +{ +add.f16x2 r181, %64, %101; +} +{ +add.f16x2 r184, %74, r181; +} +{ +add.f16x2 r187, %103, %88; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %59, r190; +} +{ +sub.f16x2 r196, %64, %101; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %103, %88; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %59, r208; +} +{ +sub.f16x2 r214, %64, %101; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %64, %101; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %74, r226; +} +{ +sub.f16x2 r232, %103, %88; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %64, %101; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %74, r244; +} +{ +sub.f16x2 r250, %103, %88; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f178, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r259, {low, high}; +} 
+mov.f32 f180, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r260, {low, high}; +} +mov.f32 f190, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r261, {low, high}; +} +mov.f32 f192, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r262, {low, high}; +} +mov.f32 f214, 0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r265, {low, high}; +} +mov.f32 f216, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ 
+add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, 
f400; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %85, %68; +} +{ +add.f16x2 r602, %89, r599; +} +{ +add.f16x2 r605, %97, %83; +} +{ +add.f16x2 r608, %100, r605; +} +{ +add.f16x2 r611, %85, %68; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %89, r614; +} +{ +sub.f16x2 r620, %97, %83; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %85, %68; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %89, r632; +} +{ +sub.f16x2 r638, %97, %83; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %97, %83; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %100, r650; +} +{ +sub.f16x2 r656, %85, %68; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 
r665, %97, %83; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %100, r668; +} +{ +sub.f16x2 r674, %85, %68; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, %84, %67; +} +{ +add.f16x2 r688, %87, r685; +} +{ +add.f16x2 r691, %96, %81; +} +{ +add.f16x2 r694, %99, r691; +} +{ +add.f16x2 r697, %84, %67; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %87, r700; +} +{ +sub.f16x2 r706, %96, %81; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %84, %67; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %87, r718; +} +{ +sub.f16x2 r724, %96, %81; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %96, %81; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %99, r736; +} +{ +sub.f16x2 r742, %84, %67; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %96, %81; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %99, r754; +} +{ +sub.f16x2 r760, %84, %67; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %82, %66; +} +{ +add.f16x2 r774, %86, r771; +} +{ +add.f16x2 r777, %94, %80; +} +{ +add.f16x2 r780, %98, r777; +} +{ +add.f16x2 r783, %82, %66; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %86, r786; +} +{ +sub.f16x2 r792, %94, %80; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %82, %66; +} +{ +mul.f16x2 r804, r801, r769; +} +{ 
+add.f16x2 r807, %86, r804; +} +{ +sub.f16x2 r810, %94, %80; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %94, %80; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %98, r822; +} +{ +sub.f16x2 r828, %82, %66; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %94, %80; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %98, r840; +} +{ +sub.f16x2 r846, %82, %66; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; 
+cvt.rn.f16.f32 high, f398; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, 
r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %58, %95; +} +{ +add.f16x2 r1198, %65, r1195; +} +{ +add.f16x2 
r1201, %73, %56; +} +{ +add.f16x2 r1204, %79, r1201; +} +{ +add.f16x2 r1207, %58, %95; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %65, r1210; +} +{ +sub.f16x2 r1216, %73, %56; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %58, %95; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %65, r1228; +} +{ +sub.f16x2 r1234, %73, %56; +} +{ +mul.f16x2 r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %73, %56; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %79, r1246; +} +{ +sub.f16x2 r1252, %58, %95; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %73, %56; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %79, r1264; +} +{ +sub.f16x2 r1270, %58, %95; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %57, %93; +} +{ +add.f16x2 r1284, %63, r1281; +} +{ +add.f16x2 r1287, %72, %108; +} +{ +add.f16x2 r1290, %78, r1287; +} +{ +add.f16x2 r1293, %57, %93; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %63, r1296; +} +{ +sub.f16x2 r1302, %72, %108; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %57, %93; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %63, r1314; +} +{ +sub.f16x2 r1320, %72, %108; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %72, %108; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %78, r1332; +} +{ +sub.f16x2 r1338, %57, %93; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %72, %108; +} +{ +mul.f16x2 
r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %78, r1350; +} +{ +sub.f16x2 r1356, %57, %93; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %55, %92; +} +{ +add.f16x2 r1370, %62, r1367; +} +{ +add.f16x2 r1373, %71, %107; +} +{ +add.f16x2 r1376, %77, r1373; +} +{ +add.f16x2 r1379, %55, %92; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %62, r1382; +} +{ +sub.f16x2 r1388, %71, %107; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %55, %92; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %62, r1400; +} +{ +sub.f16x2 r1406, %71, %107; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %71, %107; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %77, r1418; +} +{ +sub.f16x2 r1424, %55, %92; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %71, %107; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %77, r1436; +} +{ +sub.f16x2 r1442, %55, %92; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; 
+cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 
r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ 
+mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1789, {low, high}; +} +mov.f32 f172, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1790, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1791, {low, high}; +} +mov.f32 f176, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1794, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1795, {low, high}; +} +mov.f32 f184, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1796, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1797, {low, high}; +} +mov.f32 f188, 0f3F6B1036; +{ +.reg .f16 low, 
high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1800, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; +mov.b32 r1801, {low, high}; +} +mov.f32 f196, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1802, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1803, {low, high}; +} +mov.f32 f200, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1804, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1807, {low, high}; +} +mov.f32 f208, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1808, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1812, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1815, {low, high}; +} +mov.f32 f224, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1819, {low, high}; +} +mov.f32 f232, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, 
r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, 
r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ 
+mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ 
+add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ 
+sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ 
+add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ 
+sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ 
+add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r4613, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r4614, rd3; +mul.lo.s32 r4615, r4614, 3; +sub.s32 r4616, r4613, r4615; +cvt.rn.f32.u32 f401, r4616; +mul.f32 f402, f401, 0f3D9EDD1F; +cos.approx.f32 f309, f402; +sin.approx.f32 f403, f402; +neg.f32 f310, f403; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ 
+mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 
r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ +neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ 
+mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; 
+mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} +{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, 
r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ +fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, 
low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ 
+fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 
low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, 
r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +mad.lo.s32 r4617, r4614, 648, r4612; +barrier.sync 0; +mad.lo.s32 r4618, r4616, 216, r4617; +st.shared.v2.f32 [r4618], {r2102, r2108}; +st.shared.v2.f32 [r4618+8], {r2881, r2890}; +st.shared.v2.f32 [r4618+16], {r2918, r2927}; +st.shared.v2.f32 [r4618+24], {r2955, r2964}; +st.shared.v2.f32 [r4618+32], {r2992, r3001}; +st.shared.v2.f32 [r4618+40], {r3029, r3038}; +st.shared.v2.f32 [r4618+48], {r3066, r3075}; +st.shared.v2.f32 [r4618+56], {r3103, r3112}; +st.shared.v2.f32 [r4618+64], {r3140, r3149}; +st.shared.v2.f32 [r4618+72], {r3177, r3186}; +st.shared.v2.f32 [r4618+80], {r3214, r3223}; +st.shared.v2.f32 [r4618+88], {r3251, r3260}; +st.shared.v2.f32 [r4618+96], {r3288, r3297}; +st.shared.v2.f32 [r4618+104], {r3325, 
r3334}; +st.shared.v2.f32 [r4618+112], {r3362, r3371}; +st.shared.v2.f32 [r4618+120], {r3399, r3408}; +st.shared.v2.f32 [r4618+128], {r3436, r3445}; +st.shared.v2.f32 [r4618+136], {r3473, r3482}; +st.shared.v2.f32 [r4618+144], {r3510, r3519}; +st.shared.v2.f32 [r4618+152], {r3547, r3556}; +st.shared.v2.f32 [r4618+160], {r3584, r3593}; +st.shared.v2.f32 [r4618+168], {r3621, r3630}; +st.shared.v2.f32 [r4618+176], {r3658, r3667}; +st.shared.v2.f32 [r4618+184], {r3695, r3704}; +st.shared.v2.f32 [r4618+192], {r3732, r3741}; +st.shared.v2.f32 [r4618+200], {r3769, r3778}; +st.shared.v2.f32 [r4618+208], {r3806, r3815}; +barrier.sync 0; +mad.lo.s32 r4619, r4616, -208, r4618; +ld.shared.u32 r3842, [r4619]; +ld.shared.u32 r3848, [r4619+4]; +ld.shared.u32 r3928, [r4619+24]; +ld.shared.u32 r3934, [r4619+28]; +ld.shared.u32 r4014, [r4619+48]; +ld.shared.u32 r4020, [r4619+52]; +ld.shared.u32 r4100, [r4619+72]; +ld.shared.u32 r4106, [r4619+76]; +ld.shared.u32 r4186, [r4619+96]; +ld.shared.u32 r4192, [r4619+100]; +ld.shared.u32 r4272, [r4619+120]; +ld.shared.u32 r4278, [r4619+124]; +ld.shared.u32 r4358, [r4619+144]; +ld.shared.u32 r4364, [r4619+148]; +ld.shared.u32 r4444, [r4619+168]; +ld.shared.u32 r4450, [r4619+172]; +ld.shared.u32 r4530, [r4619+192]; +ld.shared.u32 r4536, [r4619+196]; +ld.shared.u32 r3839, [r4619+216]; +ld.shared.u32 r3845, [r4619+220]; +ld.shared.u32 r3925, [r4619+240]; +ld.shared.u32 r3931, [r4619+244]; +ld.shared.u32 r4011, [r4619+264]; +ld.shared.u32 r4017, [r4619+268]; +ld.shared.u32 r4097, [r4619+288]; +ld.shared.u32 r4103, [r4619+292]; +ld.shared.u32 r4183, [r4619+312]; +ld.shared.u32 r4189, [r4619+316]; +ld.shared.u32 r4269, [r4619+336]; +ld.shared.u32 r4275, [r4619+340]; +ld.shared.u32 r4355, [r4619+360]; +ld.shared.u32 r4361, [r4619+364]; +ld.shared.u32 r4441, [r4619+384]; +ld.shared.u32 r4447, [r4619+388]; +ld.shared.u32 r4527, [r4619+408]; +ld.shared.u32 r4533, [r4619+412]; +ld.shared.u32 r3840, [r4619+432]; +ld.shared.u32 r3846, [r4619+436]; 
+ld.shared.u32 r3926, [r4619+456]; +ld.shared.u32 r3932, [r4619+460]; +ld.shared.u32 r4012, [r4619+480]; +ld.shared.u32 r4018, [r4619+484]; +ld.shared.u32 r4098, [r4619+504]; +ld.shared.u32 r4104, [r4619+508]; +ld.shared.u32 r4184, [r4619+528]; +ld.shared.u32 r4190, [r4619+532]; +ld.shared.u32 r4270, [r4619+552]; +ld.shared.u32 r4276, [r4619+556]; +ld.shared.u32 r4356, [r4619+576]; +ld.shared.u32 r4362, [r4619+580]; +ld.shared.u32 r4442, [r4619+600]; +ld.shared.u32 r4448, [r4619+604]; +ld.shared.u32 r4528, [r4619+624]; +ld.shared.u32 r4534, [r4619+628]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 %0, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 %1, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 %18, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 %36, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 %19, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 %37, r3910, r3916; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; 
+cvt.rn.f16.f32 high, f400; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 %2, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 %3, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 %20, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 %38, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 %21, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 %39, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 %4, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 %5, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 %22, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 %40, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, 
r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 %23, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 %41, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4095, {low, high}; +} +{ +add.f16x2 r4096, r4097, r4098; +} +{ +add.f16x2 %6, r4100, r4096; +} +{ +add.f16x2 r4102, r4103, r4104; +} +{ +add.f16x2 %7, r4106, r4102; +} +{ +add.f16x2 r4108, r4097, r4098; +} +{ +mul.f16x2 r4111, r4108, r4094; +} +{ +add.f16x2 r4114, r4100, r4111; +} +{ +sub.f16x2 r4117, r4103, r4104; +} +{ +mul.f16x2 r4120, r4117, r4095; +} +{ +add.f16x2 %24, r4114, r4120; +} +{ +add.f16x2 r4126, r4097, r4098; +} +{ +mul.f16x2 r4129, r4126, r4094; +} +{ +add.f16x2 r4132, r4100, r4129; +} +{ +sub.f16x2 r4135, r4103, r4104; +} +{ +mul.f16x2 r4138, r4135, r4095; +} +{ +sub.f16x2 %42, r4132, r4138; +} +{ +add.f16x2 r4144, r4103, r4104; +} +{ +mul.f16x2 r4147, r4144, r4094; +} +{ +add.f16x2 r4150, r4106, r4147; +} +{ +sub.f16x2 r4153, r4097, r4098; +} +{ +mul.f16x2 r4156, r4153, r4095; +} +{ +sub.f16x2 %25, r4150, r4156; +} +{ +add.f16x2 r4162, r4103, r4104; +} +{ +mul.f16x2 r4165, r4162, r4094; +} +{ +add.f16x2 r4168, r4106, r4165; +} +{ +sub.f16x2 r4171, r4097, r4098; +} +{ +mul.f16x2 r4174, r4171, r4095; +} +{ +add.f16x2 %43, r4168, r4174; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4180, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4181, {low, high}; +} +{ +add.f16x2 r4182, r4183, r4184; +} +{ +add.f16x2 %8, r4186, r4182; +} +{ +add.f16x2 r4188, r4189, r4190; +} +{ 
+add.f16x2 %9, r4192, r4188; +} +{ +add.f16x2 r4194, r4183, r4184; +} +{ +mul.f16x2 r4197, r4194, r4180; +} +{ +add.f16x2 r4200, r4186, r4197; +} +{ +sub.f16x2 r4203, r4189, r4190; +} +{ +mul.f16x2 r4206, r4203, r4181; +} +{ +add.f16x2 %26, r4200, r4206; +} +{ +add.f16x2 r4212, r4183, r4184; +} +{ +mul.f16x2 r4215, r4212, r4180; +} +{ +add.f16x2 r4218, r4186, r4215; +} +{ +sub.f16x2 r4221, r4189, r4190; +} +{ +mul.f16x2 r4224, r4221, r4181; +} +{ +sub.f16x2 %44, r4218, r4224; +} +{ +add.f16x2 r4230, r4189, r4190; +} +{ +mul.f16x2 r4233, r4230, r4180; +} +{ +add.f16x2 r4236, r4192, r4233; +} +{ +sub.f16x2 r4239, r4183, r4184; +} +{ +mul.f16x2 r4242, r4239, r4181; +} +{ +sub.f16x2 %27, r4236, r4242; +} +{ +add.f16x2 r4248, r4189, r4190; +} +{ +mul.f16x2 r4251, r4248, r4180; +} +{ +add.f16x2 r4254, r4192, r4251; +} +{ +sub.f16x2 r4257, r4183, r4184; +} +{ +mul.f16x2 r4260, r4257, r4181; +} +{ +add.f16x2 %45, r4254, r4260; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4267, {low, high}; +} +{ +add.f16x2 r4268, r4269, r4270; +} +{ +add.f16x2 %10, r4272, r4268; +} +{ +add.f16x2 r4274, r4275, r4276; +} +{ +add.f16x2 %11, r4278, r4274; +} +{ +add.f16x2 r4280, r4269, r4270; +} +{ +mul.f16x2 r4283, r4280, r4266; +} +{ +add.f16x2 r4286, r4272, r4283; +} +{ +sub.f16x2 r4289, r4275, r4276; +} +{ +mul.f16x2 r4292, r4289, r4267; +} +{ +add.f16x2 %28, r4286, r4292; +} +{ +add.f16x2 r4298, r4269, r4270; +} +{ +mul.f16x2 r4301, r4298, r4266; +} +{ +add.f16x2 r4304, r4272, r4301; +} +{ +sub.f16x2 r4307, r4275, r4276; +} +{ +mul.f16x2 r4310, r4307, r4267; +} +{ +sub.f16x2 %46, r4304, r4310; +} +{ +add.f16x2 r4316, r4275, r4276; +} +{ +mul.f16x2 r4319, r4316, r4266; +} +{ +add.f16x2 r4322, r4278, r4319; +} +{ +sub.f16x2 r4325, r4269, r4270; +} +{ +mul.f16x2 r4328, r4325, r4267; +} +{ +sub.f16x2 %29, r4322, r4328; +} +{ +add.f16x2 
r4334, r4275, r4276; +} +{ +mul.f16x2 r4337, r4334, r4266; +} +{ +add.f16x2 r4340, r4278, r4337; +} +{ +sub.f16x2 r4343, r4269, r4270; +} +{ +mul.f16x2 r4346, r4343, r4267; +} +{ +add.f16x2 %47, r4340, r4346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4352, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4353, {low, high}; +} +{ +add.f16x2 r4354, r4355, r4356; +} +{ +add.f16x2 %12, r4358, r4354; +} +{ +add.f16x2 r4360, r4361, r4362; +} +{ +add.f16x2 %13, r4364, r4360; +} +{ +add.f16x2 r4366, r4355, r4356; +} +{ +mul.f16x2 r4369, r4366, r4352; +} +{ +add.f16x2 r4372, r4358, r4369; +} +{ +sub.f16x2 r4375, r4361, r4362; +} +{ +mul.f16x2 r4378, r4375, r4353; +} +{ +add.f16x2 %30, r4372, r4378; +} +{ +add.f16x2 r4384, r4355, r4356; +} +{ +mul.f16x2 r4387, r4384, r4352; +} +{ +add.f16x2 r4390, r4358, r4387; +} +{ +sub.f16x2 r4393, r4361, r4362; +} +{ +mul.f16x2 r4396, r4393, r4353; +} +{ +sub.f16x2 %48, r4390, r4396; +} +{ +add.f16x2 r4402, r4361, r4362; +} +{ +mul.f16x2 r4405, r4402, r4352; +} +{ +add.f16x2 r4408, r4364, r4405; +} +{ +sub.f16x2 r4411, r4355, r4356; +} +{ +mul.f16x2 r4414, r4411, r4353; +} +{ +sub.f16x2 %31, r4408, r4414; +} +{ +add.f16x2 r4420, r4361, r4362; +} +{ +mul.f16x2 r4423, r4420, r4352; +} +{ +add.f16x2 r4426, r4364, r4423; +} +{ +sub.f16x2 r4429, r4355, r4356; +} +{ +mul.f16x2 r4432, r4429, r4353; +} +{ +add.f16x2 %49, r4426, r4432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4438, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4439, {low, high}; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 %14, r4444, r4440; +} +{ +add.f16x2 r4446, r4447, r4448; +} +{ +add.f16x2 %15, r4450, r4446; +} +{ +add.f16x2 r4452, r4441, r4442; +} +{ +mul.f16x2 r4455, r4452, r4438; +} +{ +add.f16x2 r4458, r4444, r4455; +} +{ +sub.f16x2 r4461, 
r4447, r4448; +} +{ +mul.f16x2 r4464, r4461, r4439; +} +{ +add.f16x2 %32, r4458, r4464; +} +{ +add.f16x2 r4470, r4441, r4442; +} +{ +mul.f16x2 r4473, r4470, r4438; +} +{ +add.f16x2 r4476, r4444, r4473; +} +{ +sub.f16x2 r4479, r4447, r4448; +} +{ +mul.f16x2 r4482, r4479, r4439; +} +{ +sub.f16x2 %50, r4476, r4482; +} +{ +add.f16x2 r4488, r4447, r4448; +} +{ +mul.f16x2 r4491, r4488, r4438; +} +{ +add.f16x2 r4494, r4450, r4491; +} +{ +sub.f16x2 r4497, r4441, r4442; +} +{ +mul.f16x2 r4500, r4497, r4439; +} +{ +sub.f16x2 %33, r4494, r4500; +} +{ +add.f16x2 r4506, r4447, r4448; +} +{ +mul.f16x2 r4509, r4506, r4438; +} +{ +add.f16x2 r4512, r4450, r4509; +} +{ +sub.f16x2 r4515, r4441, r4442; +} +{ +mul.f16x2 r4518, r4515, r4439; +} +{ +add.f16x2 %51, r4512, r4518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4524, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4525, {low, high}; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 %16, r4530, r4526; +} +{ +add.f16x2 r4532, r4533, r4534; +} +{ +add.f16x2 %17, r4536, r4532; +} +{ +add.f16x2 r4538, r4527, r4528; +} +{ +mul.f16x2 r4541, r4538, r4524; +} +{ +add.f16x2 r4544, r4530, r4541; +} +{ +sub.f16x2 r4547, r4533, r4534; +} +{ +mul.f16x2 r4550, r4547, r4525; +} +{ +add.f16x2 %34, r4544, r4550; +} +{ +add.f16x2 r4556, r4527, r4528; +} +{ +mul.f16x2 r4559, r4556, r4524; +} +{ +add.f16x2 r4562, r4530, r4559; +} +{ +sub.f16x2 r4565, r4533, r4534; +} +{ +mul.f16x2 r4568, r4565, r4525; +} +{ +sub.f16x2 %52, r4562, r4568; +} +{ +add.f16x2 r4574, r4533, r4534; +} +{ +mul.f16x2 r4577, r4574, r4524; +} +{ +add.f16x2 r4580, r4536, r4577; +} +{ +sub.f16x2 r4583, r4527, r4528; +} +{ +mul.f16x2 r4586, r4583, r4525; +} +{ +sub.f16x2 %35, r4580, r4586; +} +{ +add.f16x2 r4592, r4533, r4534; +} +{ +mul.f16x2 r4595, r4592, r4524; +} +{ +add.f16x2 r4598, r4536, r4595; +} +{ +sub.f16x2 r4601, r4527, r4528; +} +{ +mul.f16x2 r4604, r4601, 
r4525; +} +{ +add.f16x2 %53, r4598, r4604; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), 
"r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1076, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<404>; +.reg .b32 r<4676>; +.reg .b64 rd<4>; +mov.u32 r4674, %tid.y; +mov.u32 r4675, %54; +mad.lo.s32 r4612, r4674, 324, r4675; +mov.u32 r4613, %tid.x; +mov.f32 f398, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; 
+mov.b32 r1, {low, high}; +} +mov.f32 f400, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %106, %91; +} +{ +add.f16x2 r6, %61, r3; +} +{ +add.f16x2 r9, %70, %104; +} +{ +add.f16x2 r12, %76, r9; +} +{ +add.f16x2 r15, %106, %91; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %61, r18; +} +{ +sub.f16x2 r24, %70, %104; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %106, %91; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %61, r36; +} +{ +sub.f16x2 r42, %70, %104; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %70, %104; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %76, r54; +} +{ +sub.f16x2 r60, %106, %91; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %70, %104; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %76, r72; +} +{ +sub.f16x2 r78, %106, %91; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r88, {low, high}; +} +{ +add.f16x2 r89, %105, %90; +} +{ +add.f16x2 r92, %60, r89; +} +{ +add.f16x2 r95, %69, %102; +} +{ +add.f16x2 r98, %75, r95; +} +{ +add.f16x2 r101, %105, %90; +} +{ +mul.f16x2 r104, r101, r87; +} +{ +add.f16x2 r107, %60, r104; +} +{ +sub.f16x2 r110, %69, %102; +} +{ +mul.f16x2 r113, r110, r88; +} +{ +add.f16x2 r116, r107, r113; +} +{ +add.f16x2 r119, %105, %90; +} +{ +mul.f16x2 r122, r119, r87; +} +{ +add.f16x2 r125, %60, r122; +} +{ +sub.f16x2 r128, %69, %102; +} +{ +mul.f16x2 r131, r128, r88; +} +{ +sub.f16x2 r134, r125, r131; +} +{ +add.f16x2 r137, %69, %102; +} +{ +mul.f16x2 r140, r137, r87; +} +{ +add.f16x2 r143, %75, r140; +} +{ +sub.f16x2 r146, %105, %90; +} +{ +mul.f16x2 r149, r146, r88; +} +{ 
+sub.f16x2 r152, r143, r149; +} +{ +add.f16x2 r155, %69, %102; +} +{ +mul.f16x2 r158, r155, r87; +} +{ +add.f16x2 r161, %75, r158; +} +{ +sub.f16x2 r164, %105, %90; +} +{ +mul.f16x2 r167, r164, r88; +} +{ +add.f16x2 r170, r161, r167; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r173, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r174, {low, high}; +} +{ +add.f16x2 r175, %103, %88; +} +{ +add.f16x2 r178, %59, r175; +} +{ +add.f16x2 r181, %64, %101; +} +{ +add.f16x2 r184, %74, r181; +} +{ +add.f16x2 r187, %103, %88; +} +{ +mul.f16x2 r190, r187, r173; +} +{ +add.f16x2 r193, %59, r190; +} +{ +sub.f16x2 r196, %64, %101; +} +{ +mul.f16x2 r199, r196, r174; +} +{ +add.f16x2 r202, r193, r199; +} +{ +add.f16x2 r205, %103, %88; +} +{ +mul.f16x2 r208, r205, r173; +} +{ +add.f16x2 r211, %59, r208; +} +{ +sub.f16x2 r214, %64, %101; +} +{ +mul.f16x2 r217, r214, r174; +} +{ +sub.f16x2 r220, r211, r217; +} +{ +add.f16x2 r223, %64, %101; +} +{ +mul.f16x2 r226, r223, r173; +} +{ +add.f16x2 r229, %74, r226; +} +{ +sub.f16x2 r232, %103, %88; +} +{ +mul.f16x2 r235, r232, r174; +} +{ +sub.f16x2 r238, r229, r235; +} +{ +add.f16x2 r241, %64, %101; +} +{ +mul.f16x2 r244, r241, r173; +} +{ +add.f16x2 r247, %74, r244; +} +{ +sub.f16x2 r250, %103, %88; +} +{ +mul.f16x2 r253, r250, r174; +} +{ +add.f16x2 r256, r247, r253; +} +mov.f32 f178, 0f3F441B7D; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r259, {low, high}; +} +mov.f32 f180, 0f3F248DBB; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r260, {low, high}; +} +mov.f32 f190, 0f3E31D0D4; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r261, {low, high}; +} +mov.f32 f192, 0f3F7C1C5C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r262, {low, high}; +} +mov.f32 f214, 
0fBF708FB2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r265, {low, high}; +} +mov.f32 f216, 0f3EAF1D44; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r266, {low, high}; +} +{ +mul.f16x2 r275, r116, r259; +} +{ +mul.f16x2 r278, r152, r260; +} +{ +sub.f16x2 r281, r275, r278; +} +{ +mul.f16x2 r284, r116, r260; +} +{ +fma.rn.f16x2 r287, r152, r259, r284; +} +{ +mul.f16x2 r291, r202, r261; +} +{ +mul.f16x2 r294, r238, r262; +} +{ +sub.f16x2 r297, r291, r294; +} +{ +mul.f16x2 r300, r202, r262; +} +{ +fma.rn.f16x2 r303, r238, r261, r300; +} +{ +mul.f16x2 r307, r134, r261; +} +{ +mul.f16x2 r310, r170, r262; +} +{ +sub.f16x2 r313, r307, r310; +} +{ +mul.f16x2 r316, r134, r262; +} +{ +fma.rn.f16x2 r319, r170, r261, r316; +} +{ +mul.f16x2 r323, r220, r265; +} +{ +mul.f16x2 r326, r256, r266; +} +{ +sub.f16x2 r329, r323, r326; +} +{ +mul.f16x2 r332, r220, r266; +} +{ +fma.rn.f16x2 r335, r256, r265, r332; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r339, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r340, {low, high}; +} +{ +add.f16x2 r341, r92, r178; +} +{ +add.f16x2 r344, r6, r341; +} +{ +add.f16x2 r347, r98, r184; +} +{ +add.f16x2 r350, r12, r347; +} +{ +add.f16x2 r353, r92, r178; +} +{ +mul.f16x2 r356, r353, r339; +} +{ +add.f16x2 r359, r6, r356; +} +{ +sub.f16x2 r362, r98, r184; +} +{ +mul.f16x2 r365, r362, r340; +} +{ +add.f16x2 r368, r359, r365; +} +{ +add.f16x2 r371, r92, r178; +} +{ +mul.f16x2 r374, r371, r339; +} +{ +add.f16x2 r377, r6, r374; +} +{ +sub.f16x2 r380, r98, r184; +} +{ +mul.f16x2 r383, r380, r340; +} +{ +sub.f16x2 r386, r377, r383; +} +{ +add.f16x2 r389, r98, r184; +} +{ +mul.f16x2 r392, r389, r339; +} +{ +add.f16x2 r395, r12, r392; +} +{ +sub.f16x2 r398, r92, r178; +} +{ +mul.f16x2 r401, r398, r340; +} +{ +sub.f16x2 r404, r395, r401; +} +{ +add.f16x2 r407, r98, 
r184; +} +{ +mul.f16x2 r410, r407, r339; +} +{ +add.f16x2 r413, r12, r410; +} +{ +sub.f16x2 r416, r92, r178; +} +{ +mul.f16x2 r419, r416, r340; +} +{ +add.f16x2 r422, r413, r419; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r425, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r426, {low, high}; +} +{ +add.f16x2 r427, r281, r297; +} +{ +add.f16x2 r430, r30, r427; +} +{ +add.f16x2 r433, r287, r303; +} +{ +add.f16x2 r436, r66, r433; +} +{ +add.f16x2 r439, r281, r297; +} +{ +mul.f16x2 r442, r439, r425; +} +{ +add.f16x2 r445, r30, r442; +} +{ +sub.f16x2 r448, r287, r303; +} +{ +mul.f16x2 r451, r448, r426; +} +{ +add.f16x2 r454, r445, r451; +} +{ +add.f16x2 r457, r281, r297; +} +{ +mul.f16x2 r460, r457, r425; +} +{ +add.f16x2 r463, r30, r460; +} +{ +sub.f16x2 r466, r287, r303; +} +{ +mul.f16x2 r469, r466, r426; +} +{ +sub.f16x2 r472, r463, r469; +} +{ +add.f16x2 r475, r287, r303; +} +{ +mul.f16x2 r478, r475, r425; +} +{ +add.f16x2 r481, r66, r478; +} +{ +sub.f16x2 r484, r281, r297; +} +{ +mul.f16x2 r487, r484, r426; +} +{ +sub.f16x2 r490, r481, r487; +} +{ +add.f16x2 r493, r287, r303; +} +{ +mul.f16x2 r496, r493, r425; +} +{ +add.f16x2 r499, r66, r496; +} +{ +sub.f16x2 r502, r281, r297; +} +{ +mul.f16x2 r505, r502, r426; +} +{ +add.f16x2 r508, r499, r505; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r511, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r512, {low, high}; +} +{ +add.f16x2 r513, r313, r329; +} +{ +add.f16x2 r516, r48, r513; +} +{ +add.f16x2 r519, r319, r335; +} +{ +add.f16x2 r522, r84, r519; +} +{ +add.f16x2 r525, r313, r329; +} +{ +mul.f16x2 r528, r525, r511; +} +{ +add.f16x2 r531, r48, r528; +} +{ +sub.f16x2 r534, r319, r335; +} +{ +mul.f16x2 r537, r534, r512; +} +{ +add.f16x2 r540, r531, r537; +} +{ +add.f16x2 r543, r313, r329; +} +{ +mul.f16x2 
r546, r543, r511; +} +{ +add.f16x2 r549, r48, r546; +} +{ +sub.f16x2 r552, r319, r335; +} +{ +mul.f16x2 r555, r552, r512; +} +{ +sub.f16x2 r558, r549, r555; +} +{ +add.f16x2 r561, r319, r335; +} +{ +mul.f16x2 r564, r561, r511; +} +{ +add.f16x2 r567, r84, r564; +} +{ +sub.f16x2 r570, r313, r329; +} +{ +mul.f16x2 r573, r570, r512; +} +{ +sub.f16x2 r576, r567, r573; +} +{ +add.f16x2 r579, r319, r335; +} +{ +mul.f16x2 r582, r579, r511; +} +{ +add.f16x2 r585, r84, r582; +} +{ +sub.f16x2 r588, r313, r329; +} +{ +mul.f16x2 r591, r588, r512; +} +{ +add.f16x2 r594, r585, r591; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r597, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r598, {low, high}; +} +{ +add.f16x2 r599, %85, %68; +} +{ +add.f16x2 r602, %89, r599; +} +{ +add.f16x2 r605, %97, %83; +} +{ +add.f16x2 r608, %100, r605; +} +{ +add.f16x2 r611, %85, %68; +} +{ +mul.f16x2 r614, r611, r597; +} +{ +add.f16x2 r617, %89, r614; +} +{ +sub.f16x2 r620, %97, %83; +} +{ +mul.f16x2 r623, r620, r598; +} +{ +add.f16x2 r626, r617, r623; +} +{ +add.f16x2 r629, %85, %68; +} +{ +mul.f16x2 r632, r629, r597; +} +{ +add.f16x2 r635, %89, r632; +} +{ +sub.f16x2 r638, %97, %83; +} +{ +mul.f16x2 r641, r638, r598; +} +{ +sub.f16x2 r644, r635, r641; +} +{ +add.f16x2 r647, %97, %83; +} +{ +mul.f16x2 r650, r647, r597; +} +{ +add.f16x2 r653, %100, r650; +} +{ +sub.f16x2 r656, %85, %68; +} +{ +mul.f16x2 r659, r656, r598; +} +{ +sub.f16x2 r662, r653, r659; +} +{ +add.f16x2 r665, %97, %83; +} +{ +mul.f16x2 r668, r665, r597; +} +{ +add.f16x2 r671, %100, r668; +} +{ +sub.f16x2 r674, %85, %68; +} +{ +mul.f16x2 r677, r674, r598; +} +{ +add.f16x2 r680, r671, r677; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r683, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r684, {low, high}; +} +{ +add.f16x2 r685, 
%84, %67; +} +{ +add.f16x2 r688, %87, r685; +} +{ +add.f16x2 r691, %96, %81; +} +{ +add.f16x2 r694, %99, r691; +} +{ +add.f16x2 r697, %84, %67; +} +{ +mul.f16x2 r700, r697, r683; +} +{ +add.f16x2 r703, %87, r700; +} +{ +sub.f16x2 r706, %96, %81; +} +{ +mul.f16x2 r709, r706, r684; +} +{ +add.f16x2 r712, r703, r709; +} +{ +add.f16x2 r715, %84, %67; +} +{ +mul.f16x2 r718, r715, r683; +} +{ +add.f16x2 r721, %87, r718; +} +{ +sub.f16x2 r724, %96, %81; +} +{ +mul.f16x2 r727, r724, r684; +} +{ +sub.f16x2 r730, r721, r727; +} +{ +add.f16x2 r733, %96, %81; +} +{ +mul.f16x2 r736, r733, r683; +} +{ +add.f16x2 r739, %99, r736; +} +{ +sub.f16x2 r742, %84, %67; +} +{ +mul.f16x2 r745, r742, r684; +} +{ +sub.f16x2 r748, r739, r745; +} +{ +add.f16x2 r751, %96, %81; +} +{ +mul.f16x2 r754, r751, r683; +} +{ +add.f16x2 r757, %99, r754; +} +{ +sub.f16x2 r760, %84, %67; +} +{ +mul.f16x2 r763, r760, r684; +} +{ +add.f16x2 r766, r757, r763; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r769, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r770, {low, high}; +} +{ +add.f16x2 r771, %82, %66; +} +{ +add.f16x2 r774, %86, r771; +} +{ +add.f16x2 r777, %94, %80; +} +{ +add.f16x2 r780, %98, r777; +} +{ +add.f16x2 r783, %82, %66; +} +{ +mul.f16x2 r786, r783, r769; +} +{ +add.f16x2 r789, %86, r786; +} +{ +sub.f16x2 r792, %94, %80; +} +{ +mul.f16x2 r795, r792, r770; +} +{ +add.f16x2 r798, r789, r795; +} +{ +add.f16x2 r801, %82, %66; +} +{ +mul.f16x2 r804, r801, r769; +} +{ +add.f16x2 r807, %86, r804; +} +{ +sub.f16x2 r810, %94, %80; +} +{ +mul.f16x2 r813, r810, r770; +} +{ +sub.f16x2 r816, r807, r813; +} +{ +add.f16x2 r819, %94, %80; +} +{ +mul.f16x2 r822, r819, r769; +} +{ +add.f16x2 r825, %98, r822; +} +{ +sub.f16x2 r828, %82, %66; +} +{ +mul.f16x2 r831, r828, r770; +} +{ +sub.f16x2 r834, r825, r831; +} +{ +add.f16x2 r837, %94, %80; +} +{ +mul.f16x2 r840, r837, r769; +} +{ +add.f16x2 r843, %98, 
r840; +} +{ +sub.f16x2 r846, %82, %66; +} +{ +mul.f16x2 r849, r846, r770; +} +{ +add.f16x2 r852, r843, r849; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r855, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r856, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r857, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r858, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r861, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r862, {low, high}; +} +{ +mul.f16x2 r871, r712, r855; +} +{ +mul.f16x2 r874, r748, r856; +} +{ +sub.f16x2 r877, r871, r874; +} +{ +mul.f16x2 r880, r712, r856; +} +{ +fma.rn.f16x2 r883, r748, r855, r880; +} +{ +mul.f16x2 r887, r798, r857; +} +{ +mul.f16x2 r890, r834, r858; +} +{ +sub.f16x2 r893, r887, r890; +} +{ +mul.f16x2 r896, r798, r858; +} +{ +fma.rn.f16x2 r899, r834, r857, r896; +} +{ +mul.f16x2 r903, r730, r857; +} +{ +mul.f16x2 r906, r766, r858; +} +{ +sub.f16x2 r909, r903, r906; +} +{ +mul.f16x2 r912, r730, r858; +} +{ +fma.rn.f16x2 r915, r766, r857, r912; +} +{ +mul.f16x2 r919, r816, r861; +} +{ +mul.f16x2 r922, r852, r862; +} +{ +sub.f16x2 r925, r919, r922; +} +{ +mul.f16x2 r928, r816, r862; +} +{ +fma.rn.f16x2 r931, r852, r861, r928; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r935, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r936, {low, high}; +} +{ +add.f16x2 r937, r688, r774; +} +{ +add.f16x2 r940, r602, r937; +} +{ +add.f16x2 r943, r694, r780; +} +{ +add.f16x2 r946, r608, r943; +} +{ +add.f16x2 r949, r688, r774; +} +{ +mul.f16x2 r952, r949, r935; +} +{ +add.f16x2 r955, r602, r952; +} +{ +sub.f16x2 
r958, r694, r780; +} +{ +mul.f16x2 r961, r958, r936; +} +{ +add.f16x2 r964, r955, r961; +} +{ +add.f16x2 r967, r688, r774; +} +{ +mul.f16x2 r970, r967, r935; +} +{ +add.f16x2 r973, r602, r970; +} +{ +sub.f16x2 r976, r694, r780; +} +{ +mul.f16x2 r979, r976, r936; +} +{ +sub.f16x2 r982, r973, r979; +} +{ +add.f16x2 r985, r694, r780; +} +{ +mul.f16x2 r988, r985, r935; +} +{ +add.f16x2 r991, r608, r988; +} +{ +sub.f16x2 r994, r688, r774; +} +{ +mul.f16x2 r997, r994, r936; +} +{ +sub.f16x2 r1000, r991, r997; +} +{ +add.f16x2 r1003, r694, r780; +} +{ +mul.f16x2 r1006, r1003, r935; +} +{ +add.f16x2 r1009, r608, r1006; +} +{ +sub.f16x2 r1012, r688, r774; +} +{ +mul.f16x2 r1015, r1012, r936; +} +{ +add.f16x2 r1018, r1009, r1015; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1021, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1022, {low, high}; +} +{ +add.f16x2 r1023, r877, r893; +} +{ +add.f16x2 r1026, r626, r1023; +} +{ +add.f16x2 r1029, r883, r899; +} +{ +add.f16x2 r1032, r662, r1029; +} +{ +add.f16x2 r1035, r877, r893; +} +{ +mul.f16x2 r1038, r1035, r1021; +} +{ +add.f16x2 r1041, r626, r1038; +} +{ +sub.f16x2 r1044, r883, r899; +} +{ +mul.f16x2 r1047, r1044, r1022; +} +{ +add.f16x2 r1050, r1041, r1047; +} +{ +add.f16x2 r1053, r877, r893; +} +{ +mul.f16x2 r1056, r1053, r1021; +} +{ +add.f16x2 r1059, r626, r1056; +} +{ +sub.f16x2 r1062, r883, r899; +} +{ +mul.f16x2 r1065, r1062, r1022; +} +{ +sub.f16x2 r1068, r1059, r1065; +} +{ +add.f16x2 r1071, r883, r899; +} +{ +mul.f16x2 r1074, r1071, r1021; +} +{ +add.f16x2 r1077, r662, r1074; +} +{ +sub.f16x2 r1080, r877, r893; +} +{ +mul.f16x2 r1083, r1080, r1022; +} +{ +sub.f16x2 r1086, r1077, r1083; +} +{ +add.f16x2 r1089, r883, r899; +} +{ +mul.f16x2 r1092, r1089, r1021; +} +{ +add.f16x2 r1095, r662, r1092; +} +{ +sub.f16x2 r1098, r877, r893; +} +{ +mul.f16x2 r1101, r1098, r1022; +} +{ +add.f16x2 r1104, r1095, r1101; +} +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1107, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1108, {low, high}; +} +{ +add.f16x2 r1109, r909, r925; +} +{ +add.f16x2 r1112, r644, r1109; +} +{ +add.f16x2 r1115, r915, r931; +} +{ +add.f16x2 r1118, r680, r1115; +} +{ +add.f16x2 r1121, r909, r925; +} +{ +mul.f16x2 r1124, r1121, r1107; +} +{ +add.f16x2 r1127, r644, r1124; +} +{ +sub.f16x2 r1130, r915, r931; +} +{ +mul.f16x2 r1133, r1130, r1108; +} +{ +add.f16x2 r1136, r1127, r1133; +} +{ +add.f16x2 r1139, r909, r925; +} +{ +mul.f16x2 r1142, r1139, r1107; +} +{ +add.f16x2 r1145, r644, r1142; +} +{ +sub.f16x2 r1148, r915, r931; +} +{ +mul.f16x2 r1151, r1148, r1108; +} +{ +sub.f16x2 r1154, r1145, r1151; +} +{ +add.f16x2 r1157, r915, r931; +} +{ +mul.f16x2 r1160, r1157, r1107; +} +{ +add.f16x2 r1163, r680, r1160; +} +{ +sub.f16x2 r1166, r909, r925; +} +{ +mul.f16x2 r1169, r1166, r1108; +} +{ +sub.f16x2 r1172, r1163, r1169; +} +{ +add.f16x2 r1175, r915, r931; +} +{ +mul.f16x2 r1178, r1175, r1107; +} +{ +add.f16x2 r1181, r680, r1178; +} +{ +sub.f16x2 r1184, r909, r925; +} +{ +mul.f16x2 r1187, r1184, r1108; +} +{ +add.f16x2 r1190, r1181, r1187; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1193, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1194, {low, high}; +} +{ +add.f16x2 r1195, %58, %95; +} +{ +add.f16x2 r1198, %65, r1195; +} +{ +add.f16x2 r1201, %73, %56; +} +{ +add.f16x2 r1204, %79, r1201; +} +{ +add.f16x2 r1207, %58, %95; +} +{ +mul.f16x2 r1210, r1207, r1193; +} +{ +add.f16x2 r1213, %65, r1210; +} +{ +sub.f16x2 r1216, %73, %56; +} +{ +mul.f16x2 r1219, r1216, r1194; +} +{ +add.f16x2 r1222, r1213, r1219; +} +{ +add.f16x2 r1225, %58, %95; +} +{ +mul.f16x2 r1228, r1225, r1193; +} +{ +add.f16x2 r1231, %65, r1228; +} +{ +sub.f16x2 r1234, %73, %56; +} +{ +mul.f16x2 
r1237, r1234, r1194; +} +{ +sub.f16x2 r1240, r1231, r1237; +} +{ +add.f16x2 r1243, %73, %56; +} +{ +mul.f16x2 r1246, r1243, r1193; +} +{ +add.f16x2 r1249, %79, r1246; +} +{ +sub.f16x2 r1252, %58, %95; +} +{ +mul.f16x2 r1255, r1252, r1194; +} +{ +sub.f16x2 r1258, r1249, r1255; +} +{ +add.f16x2 r1261, %73, %56; +} +{ +mul.f16x2 r1264, r1261, r1193; +} +{ +add.f16x2 r1267, %79, r1264; +} +{ +sub.f16x2 r1270, %58, %95; +} +{ +mul.f16x2 r1273, r1270, r1194; +} +{ +add.f16x2 r1276, r1267, r1273; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1279, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1280, {low, high}; +} +{ +add.f16x2 r1281, %57, %93; +} +{ +add.f16x2 r1284, %63, r1281; +} +{ +add.f16x2 r1287, %72, %108; +} +{ +add.f16x2 r1290, %78, r1287; +} +{ +add.f16x2 r1293, %57, %93; +} +{ +mul.f16x2 r1296, r1293, r1279; +} +{ +add.f16x2 r1299, %63, r1296; +} +{ +sub.f16x2 r1302, %72, %108; +} +{ +mul.f16x2 r1305, r1302, r1280; +} +{ +add.f16x2 r1308, r1299, r1305; +} +{ +add.f16x2 r1311, %57, %93; +} +{ +mul.f16x2 r1314, r1311, r1279; +} +{ +add.f16x2 r1317, %63, r1314; +} +{ +sub.f16x2 r1320, %72, %108; +} +{ +mul.f16x2 r1323, r1320, r1280; +} +{ +sub.f16x2 r1326, r1317, r1323; +} +{ +add.f16x2 r1329, %72, %108; +} +{ +mul.f16x2 r1332, r1329, r1279; +} +{ +add.f16x2 r1335, %78, r1332; +} +{ +sub.f16x2 r1338, %57, %93; +} +{ +mul.f16x2 r1341, r1338, r1280; +} +{ +sub.f16x2 r1344, r1335, r1341; +} +{ +add.f16x2 r1347, %72, %108; +} +{ +mul.f16x2 r1350, r1347, r1279; +} +{ +add.f16x2 r1353, %78, r1350; +} +{ +sub.f16x2 r1356, %57, %93; +} +{ +mul.f16x2 r1359, r1356, r1280; +} +{ +add.f16x2 r1362, r1353, r1359; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1365, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1366, {low, high}; +} +{ +add.f16x2 r1367, %55, %92; +} +{ 
+add.f16x2 r1370, %62, r1367; +} +{ +add.f16x2 r1373, %71, %107; +} +{ +add.f16x2 r1376, %77, r1373; +} +{ +add.f16x2 r1379, %55, %92; +} +{ +mul.f16x2 r1382, r1379, r1365; +} +{ +add.f16x2 r1385, %62, r1382; +} +{ +sub.f16x2 r1388, %71, %107; +} +{ +mul.f16x2 r1391, r1388, r1366; +} +{ +add.f16x2 r1394, r1385, r1391; +} +{ +add.f16x2 r1397, %55, %92; +} +{ +mul.f16x2 r1400, r1397, r1365; +} +{ +add.f16x2 r1403, %62, r1400; +} +{ +sub.f16x2 r1406, %71, %107; +} +{ +mul.f16x2 r1409, r1406, r1366; +} +{ +sub.f16x2 r1412, r1403, r1409; +} +{ +add.f16x2 r1415, %71, %107; +} +{ +mul.f16x2 r1418, r1415, r1365; +} +{ +add.f16x2 r1421, %77, r1418; +} +{ +sub.f16x2 r1424, %55, %92; +} +{ +mul.f16x2 r1427, r1424, r1366; +} +{ +sub.f16x2 r1430, r1421, r1427; +} +{ +add.f16x2 r1433, %71, %107; +} +{ +mul.f16x2 r1436, r1433, r1365; +} +{ +add.f16x2 r1439, %77, r1436; +} +{ +sub.f16x2 r1442, %55, %92; +} +{ +mul.f16x2 r1445, r1442, r1366; +} +{ +add.f16x2 r1448, r1439, r1445; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1451, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1452, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1453, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1454, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1457, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1458, {low, high}; +} +{ +mul.f16x2 r1467, r1308, r1451; +} +{ +mul.f16x2 r1470, r1344, r1452; +} +{ +sub.f16x2 r1473, r1467, r1470; +} +{ +mul.f16x2 r1476, r1308, r1452; +} +{ +fma.rn.f16x2 r1479, r1344, r1451, r1476; +} +{ +mul.f16x2 r1483, r1394, r1453; +} +{ +mul.f16x2 r1486, r1430, r1454; +} +{ +sub.f16x2 r1489, r1483, r1486; +} +{ +mul.f16x2 r1492, r1394, 
r1454; +} +{ +fma.rn.f16x2 r1495, r1430, r1453, r1492; +} +{ +mul.f16x2 r1499, r1326, r1453; +} +{ +mul.f16x2 r1502, r1362, r1454; +} +{ +sub.f16x2 r1505, r1499, r1502; +} +{ +mul.f16x2 r1508, r1326, r1454; +} +{ +fma.rn.f16x2 r1511, r1362, r1453, r1508; +} +{ +mul.f16x2 r1515, r1412, r1457; +} +{ +mul.f16x2 r1518, r1448, r1458; +} +{ +sub.f16x2 r1521, r1515, r1518; +} +{ +mul.f16x2 r1524, r1412, r1458; +} +{ +fma.rn.f16x2 r1527, r1448, r1457, r1524; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1531, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1532, {low, high}; +} +{ +add.f16x2 r1533, r1284, r1370; +} +{ +add.f16x2 r1536, r1198, r1533; +} +{ +add.f16x2 r1539, r1290, r1376; +} +{ +add.f16x2 r1542, r1204, r1539; +} +{ +add.f16x2 r1545, r1284, r1370; +} +{ +mul.f16x2 r1548, r1545, r1531; +} +{ +add.f16x2 r1551, r1198, r1548; +} +{ +sub.f16x2 r1554, r1290, r1376; +} +{ +mul.f16x2 r1557, r1554, r1532; +} +{ +add.f16x2 r1560, r1551, r1557; +} +{ +add.f16x2 r1563, r1284, r1370; +} +{ +mul.f16x2 r1566, r1563, r1531; +} +{ +add.f16x2 r1569, r1198, r1566; +} +{ +sub.f16x2 r1572, r1290, r1376; +} +{ +mul.f16x2 r1575, r1572, r1532; +} +{ +sub.f16x2 r1578, r1569, r1575; +} +{ +add.f16x2 r1581, r1290, r1376; +} +{ +mul.f16x2 r1584, r1581, r1531; +} +{ +add.f16x2 r1587, r1204, r1584; +} +{ +sub.f16x2 r1590, r1284, r1370; +} +{ +mul.f16x2 r1593, r1590, r1532; +} +{ +sub.f16x2 r1596, r1587, r1593; +} +{ +add.f16x2 r1599, r1290, r1376; +} +{ +mul.f16x2 r1602, r1599, r1531; +} +{ +add.f16x2 r1605, r1204, r1602; +} +{ +sub.f16x2 r1608, r1284, r1370; +} +{ +mul.f16x2 r1611, r1608, r1532; +} +{ +add.f16x2 r1614, r1605, r1611; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1617, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1618, {low, high}; +} +{ +add.f16x2 r1619, r1473, 
r1489; +} +{ +add.f16x2 r1622, r1222, r1619; +} +{ +add.f16x2 r1625, r1479, r1495; +} +{ +add.f16x2 r1628, r1258, r1625; +} +{ +add.f16x2 r1631, r1473, r1489; +} +{ +mul.f16x2 r1634, r1631, r1617; +} +{ +add.f16x2 r1637, r1222, r1634; +} +{ +sub.f16x2 r1640, r1479, r1495; +} +{ +mul.f16x2 r1643, r1640, r1618; +} +{ +add.f16x2 r1646, r1637, r1643; +} +{ +add.f16x2 r1649, r1473, r1489; +} +{ +mul.f16x2 r1652, r1649, r1617; +} +{ +add.f16x2 r1655, r1222, r1652; +} +{ +sub.f16x2 r1658, r1479, r1495; +} +{ +mul.f16x2 r1661, r1658, r1618; +} +{ +sub.f16x2 r1664, r1655, r1661; +} +{ +add.f16x2 r1667, r1479, r1495; +} +{ +mul.f16x2 r1670, r1667, r1617; +} +{ +add.f16x2 r1673, r1258, r1670; +} +{ +sub.f16x2 r1676, r1473, r1489; +} +{ +mul.f16x2 r1679, r1676, r1618; +} +{ +sub.f16x2 r1682, r1673, r1679; +} +{ +add.f16x2 r1685, r1479, r1495; +} +{ +mul.f16x2 r1688, r1685, r1617; +} +{ +add.f16x2 r1691, r1258, r1688; +} +{ +sub.f16x2 r1694, r1473, r1489; +} +{ +mul.f16x2 r1697, r1694, r1618; +} +{ +add.f16x2 r1700, r1691, r1697; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r1703, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r1704, {low, high}; +} +{ +add.f16x2 r1705, r1505, r1521; +} +{ +add.f16x2 r1708, r1240, r1705; +} +{ +add.f16x2 r1711, r1511, r1527; +} +{ +add.f16x2 r1714, r1276, r1711; +} +{ +add.f16x2 r1717, r1505, r1521; +} +{ +mul.f16x2 r1720, r1717, r1703; +} +{ +add.f16x2 r1723, r1240, r1720; +} +{ +sub.f16x2 r1726, r1511, r1527; +} +{ +mul.f16x2 r1729, r1726, r1704; +} +{ +add.f16x2 r1732, r1723, r1729; +} +{ +add.f16x2 r1735, r1505, r1521; +} +{ +mul.f16x2 r1738, r1735, r1703; +} +{ +add.f16x2 r1741, r1240, r1738; +} +{ +sub.f16x2 r1744, r1511, r1527; +} +{ +mul.f16x2 r1747, r1744, r1704; +} +{ +sub.f16x2 r1750, r1741, r1747; +} +{ +add.f16x2 r1753, r1511, r1527; +} +{ +mul.f16x2 r1756, r1753, r1703; +} +{ +add.f16x2 r1759, r1276, r1756; +} +{ +sub.f16x2 
r1762, r1505, r1521; +} +{ +mul.f16x2 r1765, r1762, r1704; +} +{ +sub.f16x2 r1768, r1759, r1765; +} +{ +add.f16x2 r1771, r1511, r1527; +} +{ +mul.f16x2 r1774, r1771, r1703; +} +{ +add.f16x2 r1777, r1276, r1774; +} +{ +sub.f16x2 r1780, r1505, r1521; +} +{ +mul.f16x2 r1783, r1780, r1704; +} +{ +add.f16x2 r1786, r1777, r1783; +} +mov.f32 f170, 0f3F791978; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f170; +cvt.rn.f16.f32 high, f170; +mov.b32 r1789, {low, high}; +} +mov.f32 f172, 0f3E6C2691; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f172; +cvt.rn.f16.f32 high, f172; +mov.b32 r1790, {low, high}; +} +mov.f32 f174, 0f3F64C51C; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f174; +cvt.rn.f16.f32 high, f174; +mov.b32 r1791, {low, high}; +} +mov.f32 f176, 0f3EE5C902; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f176; +cvt.rn.f16.f32 high, f176; +mov.b32 r1792, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f178; +cvt.rn.f16.f32 high, f178; +mov.b32 r1793, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f180; +cvt.rn.f16.f32 high, f180; +mov.b32 r1794, {low, high}; +} +mov.f32 f182, 0f3F18DF63; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f182; +cvt.rn.f16.f32 high, f182; +mov.b32 r1795, {low, high}; +} +mov.f32 f184, 0f3F4D57F2; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f184; +cvt.rn.f16.f32 high, f184; +mov.b32 r1796, {low, high}; +} +mov.f32 f186, 0f3ECACAF8; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f186; +cvt.rn.f16.f32 high, f186; +mov.b32 r1797, {low, high}; +} +mov.f32 f188, 0f3F6B1036; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f188; +cvt.rn.f16.f32 high, f188; +mov.b32 r1798, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f190; +cvt.rn.f16.f32 high, f190; +mov.b32 r1799, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f192; +cvt.rn.f16.f32 high, f192; +mov.b32 r1800, {low, high}; +} +mov.f32 f194, 0fBD6E2946; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f194; +cvt.rn.f16.f32 high, f194; 
+mov.b32 r1801, {low, high}; +} +mov.f32 f196, 0f3F7F9120; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f196; +cvt.rn.f16.f32 high, f196; +mov.b32 r1802, {low, high}; +} +mov.f32 f198, 0fBE92D7E0; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f198; +cvt.rn.f16.f32 high, f198; +mov.b32 r1803, {low, high}; +} +mov.f32 f200, 0f3F753ECD; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f200; +cvt.rn.f16.f32 high, f200; +mov.b32 r1804, {low, high}; +} +mov.f32 f206, 0fBF2FAD88; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f206; +cvt.rn.f16.f32 high, f206; +mov.b32 r1807, {low, high}; +} +mov.f32 f208, 0f3F3A3529; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f208; +cvt.rn.f16.f32 high, f208; +mov.b32 r1808, {low, high}; +} +mov.f32 f230, 0fBF55E287; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f214; +cvt.rn.f16.f32 high, f214; +mov.b32 r1811, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f216; +cvt.rn.f16.f32 high, f216; +mov.b32 r1812, {low, high}; +} +mov.f32 f222, 0fBF7E44DE; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f222; +cvt.rn.f16.f32 high, f222; +mov.b32 r1815, {low, high}; +} +mov.f32 f224, 0fBDEDC21F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f224; +cvt.rn.f16.f32 high, f224; +mov.b32 r1816, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f230; +cvt.rn.f16.f32 high, f230; +mov.b32 r1819, {low, high}; +} +mov.f32 f232, 0fBF0CAC9F; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f232; +cvt.rn.f16.f32 high, f232; +mov.b32 r1820, {low, high}; +} +{ +mul.f16x2 r1841, r1026, r1789; +} +{ +mul.f16x2 r1844, r1032, r1790; +} +{ +sub.f16x2 r1847, r1841, r1844; +} +{ +mul.f16x2 r1850, r1026, r1790; +} +{ +fma.rn.f16x2 r1853, r1032, r1789, r1850; +} +{ +mul.f16x2 r1857, r1622, r1791; +} +{ +mul.f16x2 r1860, r1628, r1792; +} +{ +sub.f16x2 r1863, r1857, r1860; +} +{ +mul.f16x2 r1866, r1622, r1792; +} +{ +fma.rn.f16x2 r1869, r1628, r1791, r1866; +} +{ +mul.f16x2 r1873, r1112, r1791; +} +{ +mul.f16x2 r1876, r1118, r1792; +} +{ +sub.f16x2 
r1879, r1873, r1876; +} +{ +mul.f16x2 r1882, r1112, r1792; +} +{ +fma.rn.f16x2 r1885, r1118, r1791, r1882; +} +{ +mul.f16x2 r1889, r1708, r1795; +} +{ +mul.f16x2 r1892, r1714, r1796; +} +{ +sub.f16x2 r1895, r1889, r1892; +} +{ +mul.f16x2 r1898, r1708, r1796; +} +{ +fma.rn.f16x2 r1901, r1714, r1795, r1898; +} +{ +mul.f16x2 r1905, r964, r1793; +} +{ +mul.f16x2 r1908, r1000, r1794; +} +{ +sub.f16x2 r1911, r1905, r1908; +} +{ +mul.f16x2 r1914, r964, r1794; +} +{ +fma.rn.f16x2 r1917, r1000, r1793, r1914; +} +{ +mul.f16x2 r1921, r1560, r1799; +} +{ +mul.f16x2 r1924, r1596, r1800; +} +{ +sub.f16x2 r1927, r1921, r1924; +} +{ +mul.f16x2 r1930, r1560, r1800; +} +{ +fma.rn.f16x2 r1933, r1596, r1799, r1930; +} +{ +mul.f16x2 r1937, r1050, r1795; +} +{ +mul.f16x2 r1940, r1086, r1796; +} +{ +sub.f16x2 r1943, r1937, r1940; +} +{ +mul.f16x2 r1946, r1050, r1796; +} +{ +fma.rn.f16x2 r1949, r1086, r1795, r1946; +} +{ +mul.f16x2 r1953, r1646, r1803; +} +{ +mul.f16x2 r1956, r1682, r1804; +} +{ +sub.f16x2 r1959, r1953, r1956; +} +{ +mul.f16x2 r1962, r1646, r1804; +} +{ +fma.rn.f16x2 r1965, r1682, r1803, r1962; +} +{ +mul.f16x2 r1969, r1136, r1797; +} +{ +mul.f16x2 r1972, r1172, r1798; +} +{ +sub.f16x2 r1975, r1969, r1972; +} +{ +mul.f16x2 r1978, r1136, r1798; +} +{ +fma.rn.f16x2 r1981, r1172, r1797, r1978; +} +{ +mul.f16x2 r1985, r1732, r1807; +} +{ +mul.f16x2 r1988, r1768, r1808; +} +{ +sub.f16x2 r1991, r1985, r1988; +} +{ +mul.f16x2 r1994, r1732, r1808; +} +{ +fma.rn.f16x2 r1997, r1768, r1807, r1994; +} +{ +mul.f16x2 r2001, r982, r1799; +} +{ +mul.f16x2 r2004, r1018, r1800; +} +{ +sub.f16x2 r2007, r2001, r2004; +} +{ +mul.f16x2 r2010, r982, r1800; +} +{ +fma.rn.f16x2 r2013, r1018, r1799, r2010; +} +{ +mul.f16x2 r2017, r1578, r1811; +} +{ +mul.f16x2 r2020, r1614, r1812; +} +{ +sub.f16x2 r2023, r2017, r2020; +} +{ +mul.f16x2 r2026, r1578, r1812; +} +{ +fma.rn.f16x2 r2029, r1614, r1811, r2026; +} +{ +mul.f16x2 r2033, r1068, r1801; +} +{ +mul.f16x2 r2036, r1104, r1802; +} +{ +sub.f16x2 
r2039, r2033, r2036; +} +{ +mul.f16x2 r2042, r1068, r1802; +} +{ +fma.rn.f16x2 r2045, r1104, r1801, r2042; +} +{ +mul.f16x2 r2049, r1664, r1815; +} +{ +mul.f16x2 r2052, r1700, r1816; +} +{ +sub.f16x2 r2055, r2049, r2052; +} +{ +mul.f16x2 r2058, r1664, r1816; +} +{ +fma.rn.f16x2 r2061, r1700, r1815, r2058; +} +{ +mul.f16x2 r2065, r1154, r1803; +} +{ +mul.f16x2 r2068, r1190, r1804; +} +{ +sub.f16x2 r2071, r2065, r2068; +} +{ +mul.f16x2 r2074, r1154, r1804; +} +{ +fma.rn.f16x2 r2077, r1190, r1803, r2074; +} +{ +mul.f16x2 r2081, r1750, r1819; +} +{ +mul.f16x2 r2084, r1786, r1820; +} +{ +sub.f16x2 r2087, r2081, r2084; +} +{ +mul.f16x2 r2090, r1750, r1820; +} +{ +fma.rn.f16x2 r2093, r1786, r1819, r2090; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2097, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2098, {low, high}; +} +{ +add.f16x2 r2099, r940, r1536; +} +{ +add.f16x2 r2102, r344, r2099; +} +{ +add.f16x2 r2105, r946, r1542; +} +{ +add.f16x2 r2108, r350, r2105; +} +{ +add.f16x2 r2111, r940, r1536; +} +{ +mul.f16x2 r2114, r2111, r2097; +} +{ +add.f16x2 r2117, r344, r2114; +} +{ +sub.f16x2 r2120, r946, r1542; +} +{ +mul.f16x2 r2123, r2120, r2098; +} +{ +add.f16x2 r2126, r2117, r2123; +} +{ +add.f16x2 r2129, r940, r1536; +} +{ +mul.f16x2 r2132, r2129, r2097; +} +{ +add.f16x2 r2135, r344, r2132; +} +{ +sub.f16x2 r2138, r946, r1542; +} +{ +mul.f16x2 r2141, r2138, r2098; +} +{ +sub.f16x2 r2144, r2135, r2141; +} +{ +add.f16x2 r2147, r946, r1542; +} +{ +mul.f16x2 r2150, r2147, r2097; +} +{ +add.f16x2 r2153, r350, r2150; +} +{ +sub.f16x2 r2156, r940, r1536; +} +{ +mul.f16x2 r2159, r2156, r2098; +} +{ +sub.f16x2 r2162, r2153, r2159; +} +{ +add.f16x2 r2165, r946, r1542; +} +{ +mul.f16x2 r2168, r2165, r2097; +} +{ +add.f16x2 r2171, r350, r2168; +} +{ +sub.f16x2 r2174, r940, r1536; +} +{ +mul.f16x2 r2177, r2174, r2098; +} +{ +add.f16x2 r2180, r2171, r2177; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2183, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2184, {low, high}; +} +{ +add.f16x2 r2185, r1847, r1863; +} +{ +add.f16x2 r2188, r430, r2185; +} +{ +add.f16x2 r2191, r1853, r1869; +} +{ +add.f16x2 r2194, r436, r2191; +} +{ +add.f16x2 r2197, r1847, r1863; +} +{ +mul.f16x2 r2200, r2197, r2183; +} +{ +add.f16x2 r2203, r430, r2200; +} +{ +sub.f16x2 r2206, r1853, r1869; +} +{ +mul.f16x2 r2209, r2206, r2184; +} +{ +add.f16x2 r2212, r2203, r2209; +} +{ +add.f16x2 r2215, r1847, r1863; +} +{ +mul.f16x2 r2218, r2215, r2183; +} +{ +add.f16x2 r2221, r430, r2218; +} +{ +sub.f16x2 r2224, r1853, r1869; +} +{ +mul.f16x2 r2227, r2224, r2184; +} +{ +sub.f16x2 r2230, r2221, r2227; +} +{ +add.f16x2 r2233, r1853, r1869; +} +{ +mul.f16x2 r2236, r2233, r2183; +} +{ +add.f16x2 r2239, r436, r2236; +} +{ +sub.f16x2 r2242, r1847, r1863; +} +{ +mul.f16x2 r2245, r2242, r2184; +} +{ +sub.f16x2 r2248, r2239, r2245; +} +{ +add.f16x2 r2251, r1853, r1869; +} +{ +mul.f16x2 r2254, r2251, r2183; +} +{ +add.f16x2 r2257, r436, r2254; +} +{ +sub.f16x2 r2260, r1847, r1863; +} +{ +mul.f16x2 r2263, r2260, r2184; +} +{ +add.f16x2 r2266, r2257, r2263; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2269, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2270, {low, high}; +} +{ +add.f16x2 r2271, r1879, r1895; +} +{ +add.f16x2 r2274, r516, r2271; +} +{ +add.f16x2 r2277, r1885, r1901; +} +{ +add.f16x2 r2280, r522, r2277; +} +{ +add.f16x2 r2283, r1879, r1895; +} +{ +mul.f16x2 r2286, r2283, r2269; +} +{ +add.f16x2 r2289, r516, r2286; +} +{ +sub.f16x2 r2292, r1885, r1901; +} +{ +mul.f16x2 r2295, r2292, r2270; +} +{ +add.f16x2 r2298, r2289, r2295; +} +{ +add.f16x2 r2301, r1879, r1895; +} +{ +mul.f16x2 r2304, r2301, r2269; +} +{ +add.f16x2 r2307, r516, r2304; +} +{ +sub.f16x2 
r2310, r1885, r1901; +} +{ +mul.f16x2 r2313, r2310, r2270; +} +{ +sub.f16x2 r2316, r2307, r2313; +} +{ +add.f16x2 r2319, r1885, r1901; +} +{ +mul.f16x2 r2322, r2319, r2269; +} +{ +add.f16x2 r2325, r522, r2322; +} +{ +sub.f16x2 r2328, r1879, r1895; +} +{ +mul.f16x2 r2331, r2328, r2270; +} +{ +sub.f16x2 r2334, r2325, r2331; +} +{ +add.f16x2 r2337, r1885, r1901; +} +{ +mul.f16x2 r2340, r2337, r2269; +} +{ +add.f16x2 r2343, r522, r2340; +} +{ +sub.f16x2 r2346, r1879, r1895; +} +{ +mul.f16x2 r2349, r2346, r2270; +} +{ +add.f16x2 r2352, r2343, r2349; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2355, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2356, {low, high}; +} +{ +add.f16x2 r2357, r1911, r1927; +} +{ +add.f16x2 r2360, r368, r2357; +} +{ +add.f16x2 r2363, r1917, r1933; +} +{ +add.f16x2 r2366, r404, r2363; +} +{ +add.f16x2 r2369, r1911, r1927; +} +{ +mul.f16x2 r2372, r2369, r2355; +} +{ +add.f16x2 r2375, r368, r2372; +} +{ +sub.f16x2 r2378, r1917, r1933; +} +{ +mul.f16x2 r2381, r2378, r2356; +} +{ +add.f16x2 r2384, r2375, r2381; +} +{ +add.f16x2 r2387, r1911, r1927; +} +{ +mul.f16x2 r2390, r2387, r2355; +} +{ +add.f16x2 r2393, r368, r2390; +} +{ +sub.f16x2 r2396, r1917, r1933; +} +{ +mul.f16x2 r2399, r2396, r2356; +} +{ +sub.f16x2 r2402, r2393, r2399; +} +{ +add.f16x2 r2405, r1917, r1933; +} +{ +mul.f16x2 r2408, r2405, r2355; +} +{ +add.f16x2 r2411, r404, r2408; +} +{ +sub.f16x2 r2414, r1911, r1927; +} +{ +mul.f16x2 r2417, r2414, r2356; +} +{ +sub.f16x2 r2420, r2411, r2417; +} +{ +add.f16x2 r2423, r1917, r1933; +} +{ +mul.f16x2 r2426, r2423, r2355; +} +{ +add.f16x2 r2429, r404, r2426; +} +{ +sub.f16x2 r2432, r1911, r1927; +} +{ +mul.f16x2 r2435, r2432, r2356; +} +{ +add.f16x2 r2438, r2429, r2435; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2441, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; 
+cvt.rn.f16.f32 high, f400; +mov.b32 r2442, {low, high}; +} +{ +add.f16x2 r2443, r1943, r1959; +} +{ +add.f16x2 r2446, r454, r2443; +} +{ +add.f16x2 r2449, r1949, r1965; +} +{ +add.f16x2 r2452, r490, r2449; +} +{ +add.f16x2 r2455, r1943, r1959; +} +{ +mul.f16x2 r2458, r2455, r2441; +} +{ +add.f16x2 r2461, r454, r2458; +} +{ +sub.f16x2 r2464, r1949, r1965; +} +{ +mul.f16x2 r2467, r2464, r2442; +} +{ +add.f16x2 r2470, r2461, r2467; +} +{ +add.f16x2 r2473, r1943, r1959; +} +{ +mul.f16x2 r2476, r2473, r2441; +} +{ +add.f16x2 r2479, r454, r2476; +} +{ +sub.f16x2 r2482, r1949, r1965; +} +{ +mul.f16x2 r2485, r2482, r2442; +} +{ +sub.f16x2 r2488, r2479, r2485; +} +{ +add.f16x2 r2491, r1949, r1965; +} +{ +mul.f16x2 r2494, r2491, r2441; +} +{ +add.f16x2 r2497, r490, r2494; +} +{ +sub.f16x2 r2500, r1943, r1959; +} +{ +mul.f16x2 r2503, r2500, r2442; +} +{ +sub.f16x2 r2506, r2497, r2503; +} +{ +add.f16x2 r2509, r1949, r1965; +} +{ +mul.f16x2 r2512, r2509, r2441; +} +{ +add.f16x2 r2515, r490, r2512; +} +{ +sub.f16x2 r2518, r1943, r1959; +} +{ +mul.f16x2 r2521, r2518, r2442; +} +{ +add.f16x2 r2524, r2515, r2521; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2527, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2528, {low, high}; +} +{ +add.f16x2 r2529, r1975, r1991; +} +{ +add.f16x2 r2532, r540, r2529; +} +{ +add.f16x2 r2535, r1981, r1997; +} +{ +add.f16x2 r2538, r576, r2535; +} +{ +add.f16x2 r2541, r1975, r1991; +} +{ +mul.f16x2 r2544, r2541, r2527; +} +{ +add.f16x2 r2547, r540, r2544; +} +{ +sub.f16x2 r2550, r1981, r1997; +} +{ +mul.f16x2 r2553, r2550, r2528; +} +{ +add.f16x2 r2556, r2547, r2553; +} +{ +add.f16x2 r2559, r1975, r1991; +} +{ +mul.f16x2 r2562, r2559, r2527; +} +{ +add.f16x2 r2565, r540, r2562; +} +{ +sub.f16x2 r2568, r1981, r1997; +} +{ +mul.f16x2 r2571, r2568, r2528; +} +{ +sub.f16x2 r2574, r2565, r2571; +} +{ +add.f16x2 r2577, r1981, r1997; +} +{ +mul.f16x2 
r2580, r2577, r2527; +} +{ +add.f16x2 r2583, r576, r2580; +} +{ +sub.f16x2 r2586, r1975, r1991; +} +{ +mul.f16x2 r2589, r2586, r2528; +} +{ +sub.f16x2 r2592, r2583, r2589; +} +{ +add.f16x2 r2595, r1981, r1997; +} +{ +mul.f16x2 r2598, r2595, r2527; +} +{ +add.f16x2 r2601, r576, r2598; +} +{ +sub.f16x2 r2604, r1975, r1991; +} +{ +mul.f16x2 r2607, r2604, r2528; +} +{ +add.f16x2 r2610, r2601, r2607; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2613, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2614, {low, high}; +} +{ +add.f16x2 r2615, r2007, r2023; +} +{ +add.f16x2 r2618, r386, r2615; +} +{ +add.f16x2 r2621, r2013, r2029; +} +{ +add.f16x2 r2624, r422, r2621; +} +{ +add.f16x2 r2627, r2007, r2023; +} +{ +mul.f16x2 r2630, r2627, r2613; +} +{ +add.f16x2 r2633, r386, r2630; +} +{ +sub.f16x2 r2636, r2013, r2029; +} +{ +mul.f16x2 r2639, r2636, r2614; +} +{ +add.f16x2 r2642, r2633, r2639; +} +{ +add.f16x2 r2645, r2007, r2023; +} +{ +mul.f16x2 r2648, r2645, r2613; +} +{ +add.f16x2 r2651, r386, r2648; +} +{ +sub.f16x2 r2654, r2013, r2029; +} +{ +mul.f16x2 r2657, r2654, r2614; +} +{ +sub.f16x2 r2660, r2651, r2657; +} +{ +add.f16x2 r2663, r2013, r2029; +} +{ +mul.f16x2 r2666, r2663, r2613; +} +{ +add.f16x2 r2669, r422, r2666; +} +{ +sub.f16x2 r2672, r2007, r2023; +} +{ +mul.f16x2 r2675, r2672, r2614; +} +{ +sub.f16x2 r2678, r2669, r2675; +} +{ +add.f16x2 r2681, r2013, r2029; +} +{ +mul.f16x2 r2684, r2681, r2613; +} +{ +add.f16x2 r2687, r422, r2684; +} +{ +sub.f16x2 r2690, r2007, r2023; +} +{ +mul.f16x2 r2693, r2690, r2614; +} +{ +add.f16x2 r2696, r2687, r2693; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2699, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2700, {low, high}; +} +{ +add.f16x2 r2701, r2039, r2055; +} +{ +add.f16x2 r2704, r472, r2701; +} +{ +add.f16x2 
r2707, r2045, r2061; +} +{ +add.f16x2 r2710, r508, r2707; +} +{ +add.f16x2 r2713, r2039, r2055; +} +{ +mul.f16x2 r2716, r2713, r2699; +} +{ +add.f16x2 r2719, r472, r2716; +} +{ +sub.f16x2 r2722, r2045, r2061; +} +{ +mul.f16x2 r2725, r2722, r2700; +} +{ +add.f16x2 r2728, r2719, r2725; +} +{ +add.f16x2 r2731, r2039, r2055; +} +{ +mul.f16x2 r2734, r2731, r2699; +} +{ +add.f16x2 r2737, r472, r2734; +} +{ +sub.f16x2 r2740, r2045, r2061; +} +{ +mul.f16x2 r2743, r2740, r2700; +} +{ +sub.f16x2 r2746, r2737, r2743; +} +{ +add.f16x2 r2749, r2045, r2061; +} +{ +mul.f16x2 r2752, r2749, r2699; +} +{ +add.f16x2 r2755, r508, r2752; +} +{ +sub.f16x2 r2758, r2039, r2055; +} +{ +mul.f16x2 r2761, r2758, r2700; +} +{ +sub.f16x2 r2764, r2755, r2761; +} +{ +add.f16x2 r2767, r2045, r2061; +} +{ +mul.f16x2 r2770, r2767, r2699; +} +{ +add.f16x2 r2773, r508, r2770; +} +{ +sub.f16x2 r2776, r2039, r2055; +} +{ +mul.f16x2 r2779, r2776, r2700; +} +{ +add.f16x2 r2782, r2773, r2779; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r2785, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r2786, {low, high}; +} +{ +add.f16x2 r2787, r2071, r2087; +} +{ +add.f16x2 r2790, r558, r2787; +} +{ +add.f16x2 r2793, r2077, r2093; +} +{ +add.f16x2 r2796, r594, r2793; +} +{ +add.f16x2 r2799, r2071, r2087; +} +{ +mul.f16x2 r2802, r2799, r2785; +} +{ +add.f16x2 r2805, r558, r2802; +} +{ +sub.f16x2 r2808, r2077, r2093; +} +{ +mul.f16x2 r2811, r2808, r2786; +} +{ +add.f16x2 r2814, r2805, r2811; +} +{ +add.f16x2 r2817, r2071, r2087; +} +{ +mul.f16x2 r2820, r2817, r2785; +} +{ +add.f16x2 r2823, r558, r2820; +} +{ +sub.f16x2 r2826, r2077, r2093; +} +{ +mul.f16x2 r2829, r2826, r2786; +} +{ +sub.f16x2 r2832, r2823, r2829; +} +{ +add.f16x2 r2835, r2077, r2093; +} +{ +mul.f16x2 r2838, r2835, r2785; +} +{ +add.f16x2 r2841, r594, r2838; +} +{ +sub.f16x2 r2844, r2071, r2087; +} +{ +mul.f16x2 r2847, r2844, r2786; +} +{ +sub.f16x2 
r2850, r2841, r2847; +} +{ +add.f16x2 r2853, r2077, r2093; +} +{ +mul.f16x2 r2856, r2853, r2785; +} +{ +add.f16x2 r2859, r594, r2856; +} +{ +sub.f16x2 r2862, r2071, r2087; +} +{ +mul.f16x2 r2865, r2862, r2786; +} +{ +add.f16x2 r2868, r2859, r2865; +} +mul.wide.u32 rd2, r4613, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r4614, rd3; +mul.lo.s32 r4615, r4614, 3; +sub.s32 r4616, r4613, r4615; +mad.lo.s32 r4617, r4614, 324, r4612; +cvt.rn.f32.u32 f401, r4616; +mul.f32 f402, f401, 0f3D9EDD1F; +cos.approx.f32 f309, f402; +sin.approx.f32 f403, f402; +neg.f32 f310, f403; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f309; +cvt.rn.f16.f32 high, f310; +mov.b32 r2871, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2874, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2876, {high, high}; +} +{ +mul.f16x2 r2878, r2194, r2876; +} +{ +fma.rn.f16x2 r2881, r2188, r2874, r2878; +} +{ +mul.f16x2 r2885, r2188, r2876; +} +{ +neg.f16x2 r2888, r2885; +} +{ +fma.rn.f16x2 r2890, r2194, r2874, r2888; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2894, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2896, {high, high}; +} +mov.f32 f361, 0fBF800000; +mov.f32 f362, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2898, {low, high}; +} +{ +mul.f16x2 r2899, r2896, r2898; +} +{ +mul.f16x2 r2902, r2871, r2894; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2905, {high, low}; +} +{ +fma.rn.f16x2 r2907, r2899, r2905, r2902; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2911, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2913, {high, high}; +} +{ +mul.f16x2 r2915, r2280, r2913; +} +{ +fma.rn.f16x2 r2918, r2274, r2911, r2915; +} +{ +mul.f16x2 r2922, r2274, r2913; +} +{ +neg.f16x2 r2925, r2922; +} +{ +fma.rn.f16x2 r2927, r2280, r2911, r2925; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2931, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2933, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2935, {low, high}; +} +{ +mul.f16x2 r2936, r2933, r2935; +} +{ +mul.f16x2 r2939, r2907, r2931; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2907; +mov.b32 r2942, {high, low}; +} +{ +fma.rn.f16x2 r2944, r2936, r2942, r2939; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2948, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2950, {high, high}; +} +{ +mul.f16x2 r2952, r2366, r2950; +} +{ +fma.rn.f16x2 r2955, r2360, r2948, r2952; +} +{ +mul.f16x2 r2959, r2360, r2950; +} +{ +neg.f16x2 r2962, r2959; +} +{ +fma.rn.f16x2 r2964, r2366, r2948, r2962; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2968, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r2970, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r2972, {low, high}; +} +{ +mul.f16x2 r2973, r2970, r2972; +} +{ +mul.f16x2 r2976, r2944, r2968; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2944; +mov.b32 r2979, {high, low}; +} +{ +fma.rn.f16x2 r2981, r2973, r2979, r2976; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2985, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r2987, {high, high}; +} +{ +mul.f16x2 r2989, r2452, r2987; +} +{ +fma.rn.f16x2 r2992, r2446, r2985, r2989; +} +{ +mul.f16x2 r2996, r2446, r2987; +} +{ +neg.f16x2 r2999, r2996; +} +{ +fma.rn.f16x2 r3001, r2452, r2985, r2999; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3005, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3007, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3009, {low, 
high}; +} +{ +mul.f16x2 r3010, r3007, r3009; +} +{ +mul.f16x2 r3013, r2981, r3005; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2981; +mov.b32 r3016, {high, low}; +} +{ +fma.rn.f16x2 r3018, r3010, r3016, r3013; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3022, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3024, {high, high}; +} +{ +mul.f16x2 r3026, r2538, r3024; +} +{ +fma.rn.f16x2 r3029, r2532, r3022, r3026; +} +{ +mul.f16x2 r3033, r2532, r3024; +} +{ +neg.f16x2 r3036, r3033; +} +{ +fma.rn.f16x2 r3038, r2538, r3022, r3036; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3042, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3044, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3046, {low, high}; +} +{ +mul.f16x2 r3047, r3044, r3046; +} +{ +mul.f16x2 r3050, r3018, r3042; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3018; +mov.b32 r3053, {high, low}; +} +{ +fma.rn.f16x2 r3055, r3047, r3053, r3050; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3059, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3061, {high, high}; +} +{ +mul.f16x2 r3063, r2624, r3061; +} +{ +fma.rn.f16x2 r3066, r2618, r3059, r3063; +} +{ +mul.f16x2 r3070, r2618, r3061; +} +{ +neg.f16x2 r3073, r3070; +} +{ +fma.rn.f16x2 r3075, r2624, r3059, r3073; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3079, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3081, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3083, {low, high}; +} +{ +mul.f16x2 r3084, r3081, r3083; +} +{ +mul.f16x2 r3087, r3055, r3079; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3055; +mov.b32 r3090, {high, low}; +} +{ +fma.rn.f16x2 r3092, r3084, r3090, r3087; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3092; +mov.b32 r3096, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3098, {high, high}; +} +{ +mul.f16x2 r3100, r2710, r3098; +} +{ +fma.rn.f16x2 r3103, r2704, r3096, r3100; +} +{ +mul.f16x2 r3107, r2704, r3098; +} +{ +neg.f16x2 r3110, r3107; +} +{ +fma.rn.f16x2 r3112, r2710, r3096, r3110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3116, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3118, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3120, {low, high}; +} +{ +mul.f16x2 r3121, r3118, r3120; +} +{ +mul.f16x2 r3124, r3092, r3116; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3092; +mov.b32 r3127, {high, low}; +} +{ +fma.rn.f16x2 r3129, r3121, r3127, r3124; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3133, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3135, {high, high}; +} +{ +mul.f16x2 r3137, r2796, r3135; +} +{ +fma.rn.f16x2 r3140, r2790, r3133, r3137; +} +{ +mul.f16x2 r3144, r2790, r3135; +} +{ +neg.f16x2 r3147, r3144; +} +{ +fma.rn.f16x2 r3149, r2796, r3133, r3147; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3153, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3155, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3157, {low, high}; +} +{ +mul.f16x2 r3158, r3155, r3157; +} +{ +mul.f16x2 r3161, r3129, r3153; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3129; +mov.b32 r3164, {high, low}; +} +{ +fma.rn.f16x2 r3166, r3158, r3164, r3161; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3170, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3172, {high, high}; +} +{ +mul.f16x2 r3174, r2162, r3172; +} +{ +fma.rn.f16x2 r3177, r2126, r3170, r3174; +} +{ +mul.f16x2 r3181, r2126, r3172; +} +{ 
+neg.f16x2 r3184, r3181; +} +{ +fma.rn.f16x2 r3186, r2162, r3170, r3184; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3190, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3192, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3194, {low, high}; +} +{ +mul.f16x2 r3195, r3192, r3194; +} +{ +mul.f16x2 r3198, r3166, r3190; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3166; +mov.b32 r3201, {high, low}; +} +{ +fma.rn.f16x2 r3203, r3195, r3201, r3198; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3207, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3209, {high, high}; +} +{ +mul.f16x2 r3211, r2248, r3209; +} +{ +fma.rn.f16x2 r3214, r2212, r3207, r3211; +} +{ +mul.f16x2 r3218, r2212, r3209; +} +{ +neg.f16x2 r3221, r3218; +} +{ +fma.rn.f16x2 r3223, r2248, r3207, r3221; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3227, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3229, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3231, {low, high}; +} +{ +mul.f16x2 r3232, r3229, r3231; +} +{ +mul.f16x2 r3235, r3203, r3227; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3203; +mov.b32 r3238, {high, low}; +} +{ +fma.rn.f16x2 r3240, r3232, r3238, r3235; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3244, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3246, {high, high}; +} +{ +mul.f16x2 r3248, r2334, r3246; +} +{ +fma.rn.f16x2 r3251, r2298, r3244, r3248; +} +{ +mul.f16x2 r3255, r2298, r3246; +} +{ +neg.f16x2 r3258, r3255; +} +{ +fma.rn.f16x2 r3260, r2334, r3244, r3258; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3264, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3266, {high, high}; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3268, {low, high}; +} +{ +mul.f16x2 r3269, r3266, r3268; +} +{ +mul.f16x2 r3272, r3240, r3264; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3240; +mov.b32 r3275, {high, low}; +} +{ +fma.rn.f16x2 r3277, r3269, r3275, r3272; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3281, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3283, {high, high}; +} +{ +mul.f16x2 r3285, r2420, r3283; +} +{ +fma.rn.f16x2 r3288, r2384, r3281, r3285; +} +{ +mul.f16x2 r3292, r2384, r3283; +} +{ +neg.f16x2 r3295, r3292; +} +{ +fma.rn.f16x2 r3297, r2420, r3281, r3295; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3301, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3303, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3305, {low, high}; +} +{ +mul.f16x2 r3306, r3303, r3305; +} +{ +mul.f16x2 r3309, r3277, r3301; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3277; +mov.b32 r3312, {high, low}; +} +{ +fma.rn.f16x2 r3314, r3306, r3312, r3309; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3318, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3320, {high, high}; +} +{ +mul.f16x2 r3322, r2506, r3320; +} +{ +fma.rn.f16x2 r3325, r2470, r3318, r3322; +} +{ +mul.f16x2 r3329, r2470, r3320; +} +{ +neg.f16x2 r3332, r3329; +} +{ +fma.rn.f16x2 r3334, r2506, r3318, r3332; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3338, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3340, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3342, {low, high}; +} +{ +mul.f16x2 r3343, r3340, r3342; +} +{ +mul.f16x2 r3346, r3314, r3338; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3314; +mov.b32 r3349, {high, low}; +} 
+{ +fma.rn.f16x2 r3351, r3343, r3349, r3346; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3355, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3357, {high, high}; +} +{ +mul.f16x2 r3359, r2592, r3357; +} +{ +fma.rn.f16x2 r3362, r2556, r3355, r3359; +} +{ +mul.f16x2 r3366, r2556, r3357; +} +{ +neg.f16x2 r3369, r3366; +} +{ +fma.rn.f16x2 r3371, r2592, r3355, r3369; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3375, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3377, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3379, {low, high}; +} +{ +mul.f16x2 r3380, r3377, r3379; +} +{ +mul.f16x2 r3383, r3351, r3375; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3351; +mov.b32 r3386, {high, low}; +} +{ +fma.rn.f16x2 r3388, r3380, r3386, r3383; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3392, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3394, {high, high}; +} +{ +mul.f16x2 r3396, r2678, r3394; +} +{ +fma.rn.f16x2 r3399, r2642, r3392, r3396; +} +{ +mul.f16x2 r3403, r2642, r3394; +} +{ +neg.f16x2 r3406, r3403; +} +{ +fma.rn.f16x2 r3408, r2678, r3392, r3406; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3412, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3414, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3416, {low, high}; +} +{ +mul.f16x2 r3417, r3414, r3416; +} +{ +mul.f16x2 r3420, r3388, r3412; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3388; +mov.b32 r3423, {high, low}; +} +{ +fma.rn.f16x2 r3425, r3417, r3423, r3420; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3429, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3431, {high, high}; +} +{ +mul.f16x2 r3433, r2764, r3431; +} +{ 
+fma.rn.f16x2 r3436, r2728, r3429, r3433; +} +{ +mul.f16x2 r3440, r2728, r3431; +} +{ +neg.f16x2 r3443, r3440; +} +{ +fma.rn.f16x2 r3445, r2764, r3429, r3443; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3449, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3451, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3453, {low, high}; +} +{ +mul.f16x2 r3454, r3451, r3453; +} +{ +mul.f16x2 r3457, r3425, r3449; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3425; +mov.b32 r3460, {high, low}; +} +{ +fma.rn.f16x2 r3462, r3454, r3460, r3457; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3466, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3468, {high, high}; +} +{ +mul.f16x2 r3470, r2850, r3468; +} +{ +fma.rn.f16x2 r3473, r2814, r3466, r3470; +} +{ +mul.f16x2 r3477, r2814, r3468; +} +{ +neg.f16x2 r3480, r3477; +} +{ +fma.rn.f16x2 r3482, r2850, r3466, r3480; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3486, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3488, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3490, {low, high}; +} +{ +mul.f16x2 r3491, r3488, r3490; +} +{ +mul.f16x2 r3494, r3462, r3486; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3462; +mov.b32 r3497, {high, low}; +} +{ +fma.rn.f16x2 r3499, r3491, r3497, r3494; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3503, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3505, {high, high}; +} +{ +mul.f16x2 r3507, r2180, r3505; +} +{ +fma.rn.f16x2 r3510, r2144, r3503, r3507; +} +{ +mul.f16x2 r3514, r2144, r3505; +} +{ +neg.f16x2 r3517, r3514; +} +{ +fma.rn.f16x2 r3519, r2180, r3503, r3517; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3523, {low, low}; +} +{ +.reg .f16 
low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3525, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3527, {low, high}; +} +{ +mul.f16x2 r3528, r3525, r3527; +} +{ +mul.f16x2 r3531, r3499, r3523; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3499; +mov.b32 r3534, {high, low}; +} +{ +fma.rn.f16x2 r3536, r3528, r3534, r3531; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3540, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3542, {high, high}; +} +{ +mul.f16x2 r3544, r2266, r3542; +} +{ +fma.rn.f16x2 r3547, r2230, r3540, r3544; +} +{ +mul.f16x2 r3551, r2230, r3542; +} +{ +neg.f16x2 r3554, r3551; +} +{ +fma.rn.f16x2 r3556, r2266, r3540, r3554; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3560, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3562, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3564, {low, high}; +} +{ +mul.f16x2 r3565, r3562, r3564; +} +{ +mul.f16x2 r3568, r3536, r3560; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3536; +mov.b32 r3571, {high, low}; +} +{ +fma.rn.f16x2 r3573, r3565, r3571, r3568; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3577, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3579, {high, high}; +} +{ +mul.f16x2 r3581, r2352, r3579; +} +{ +fma.rn.f16x2 r3584, r2316, r3577, r3581; +} +{ +mul.f16x2 r3588, r2316, r3579; +} +{ +neg.f16x2 r3591, r3588; +} +{ +fma.rn.f16x2 r3593, r2352, r3577, r3591; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3597, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3599, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3601, {low, high}; +} +{ +mul.f16x2 r3602, r3599, r3601; +} +{ +mul.f16x2 r3605, r3573, r3597; +} 
+{ +.reg .f16 low, high; +mov.b32 {low, high}, r3573; +mov.b32 r3608, {high, low}; +} +{ +fma.rn.f16x2 r3610, r3602, r3608, r3605; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3614, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3616, {high, high}; +} +{ +mul.f16x2 r3618, r2438, r3616; +} +{ +fma.rn.f16x2 r3621, r2402, r3614, r3618; +} +{ +mul.f16x2 r3625, r2402, r3616; +} +{ +neg.f16x2 r3628, r3625; +} +{ +fma.rn.f16x2 r3630, r2438, r3614, r3628; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3634, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3636, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3638, {low, high}; +} +{ +mul.f16x2 r3639, r3636, r3638; +} +{ +mul.f16x2 r3642, r3610, r3634; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3610; +mov.b32 r3645, {high, low}; +} +{ +fma.rn.f16x2 r3647, r3639, r3645, r3642; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3651, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3653, {high, high}; +} +{ +mul.f16x2 r3655, r2524, r3653; +} +{ +fma.rn.f16x2 r3658, r2488, r3651, r3655; +} +{ +mul.f16x2 r3662, r2488, r3653; +} +{ +neg.f16x2 r3665, r3662; +} +{ +fma.rn.f16x2 r3667, r2524, r3651, r3665; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3671, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3673, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3675, {low, high}; +} +{ +mul.f16x2 r3676, r3673, r3675; +} +{ +mul.f16x2 r3679, r3647, r3671; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3647; +mov.b32 r3682, {high, low}; +} +{ +fma.rn.f16x2 r3684, r3676, r3682, r3679; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3688, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, 
high}, r3684; +mov.b32 r3690, {high, high}; +} +{ +mul.f16x2 r3692, r2610, r3690; +} +{ +fma.rn.f16x2 r3695, r2574, r3688, r3692; +} +{ +mul.f16x2 r3699, r2574, r3690; +} +{ +neg.f16x2 r3702, r3699; +} +{ +fma.rn.f16x2 r3704, r2610, r3688, r3702; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3708, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3710, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3712, {low, high}; +} +{ +mul.f16x2 r3713, r3710, r3712; +} +{ +mul.f16x2 r3716, r3684, r3708; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3684; +mov.b32 r3719, {high, low}; +} +{ +fma.rn.f16x2 r3721, r3713, r3719, r3716; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3725, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3727, {high, high}; +} +{ +mul.f16x2 r3729, r2696, r3727; +} +{ +fma.rn.f16x2 r3732, r2660, r3725, r3729; +} +{ +mul.f16x2 r3736, r2660, r3727; +} +{ +neg.f16x2 r3739, r3736; +} +{ +fma.rn.f16x2 r3741, r2696, r3725, r3739; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3745, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3747, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3749, {low, high}; +} +{ +mul.f16x2 r3750, r3747, r3749; +} +{ +mul.f16x2 r3753, r3721, r3745; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3721; +mov.b32 r3756, {high, low}; +} +{ +fma.rn.f16x2 r3758, r3750, r3756, r3753; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3762, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3764, {high, high}; +} +{ +mul.f16x2 r3766, r2782, r3764; +} +{ +fma.rn.f16x2 r3769, r2746, r3762, r3766; +} +{ +mul.f16x2 r3773, r2746, r3764; +} +{ +neg.f16x2 r3776, r3773; +} +{ +fma.rn.f16x2 r3778, r2782, r3762, r3776; +} +{ +.reg 
.f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3782, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r2871; +mov.b32 r3784, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f361; +cvt.rn.f16.f32 high, f362; +mov.b32 r3786, {low, high}; +} +{ +mul.f16x2 r3787, r3784, r3786; +} +{ +mul.f16x2 r3790, r3758, r3782; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3758; +mov.b32 r3793, {high, low}; +} +{ +fma.rn.f16x2 r3795, r3787, r3793, r3790; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3799, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r3795; +mov.b32 r3801, {high, high}; +} +{ +mul.f16x2 r3803, r2868, r3801; +} +{ +fma.rn.f16x2 r3806, r2832, r3799, r3803; +} +{ +mul.f16x2 r3810, r2832, r3801; +} +{ +neg.f16x2 r3813, r3810; +} +{ +fma.rn.f16x2 r3815, r2868, r3799, r3813; +} +barrier.sync 0; +mad.lo.s32 r4618, r4616, 108, r4617; +st.shared.u32 [r4618], r2102; +st.shared.u32 [r4618+4], r2881; +st.shared.u32 [r4618+8], r2918; +st.shared.u32 [r4618+12], r2955; +st.shared.u32 [r4618+16], r2992; +st.shared.u32 [r4618+20], r3029; +st.shared.u32 [r4618+24], r3066; +st.shared.u32 [r4618+28], r3103; +st.shared.u32 [r4618+32], r3140; +st.shared.u32 [r4618+36], r3177; +st.shared.u32 [r4618+40], r3214; +st.shared.u32 [r4618+44], r3251; +st.shared.u32 [r4618+48], r3288; +st.shared.u32 [r4618+52], r3325; +st.shared.u32 [r4618+56], r3362; +st.shared.u32 [r4618+60], r3399; +st.shared.u32 [r4618+64], r3436; +st.shared.u32 [r4618+68], r3473; +st.shared.u32 [r4618+72], r3510; +st.shared.u32 [r4618+76], r3547; +st.shared.u32 [r4618+80], r3584; +st.shared.u32 [r4618+84], r3621; +st.shared.u32 [r4618+88], r3658; +st.shared.u32 [r4618+92], r3695; +st.shared.u32 [r4618+96], r3732; +st.shared.u32 [r4618+100], r3769; +st.shared.u32 [r4618+104], r3806; +barrier.sync 0; +mad.lo.s32 r4619, r4616, -104, r4618; +ld.shared.u32 r3842, [r4619]; +ld.shared.u32 r3928, [r4619+12]; +ld.shared.u32 r4014, [r4619+24]; +ld.shared.u32 
r4100, [r4619+36]; +ld.shared.u32 r4186, [r4619+48]; +ld.shared.u32 r4272, [r4619+60]; +ld.shared.u32 r4358, [r4619+72]; +ld.shared.u32 r4444, [r4619+84]; +ld.shared.u32 r4530, [r4619+96]; +ld.shared.u32 r3839, [r4619+108]; +ld.shared.u32 r3925, [r4619+120]; +ld.shared.u32 r4011, [r4619+132]; +ld.shared.u32 r4097, [r4619+144]; +ld.shared.u32 r4183, [r4619+156]; +ld.shared.u32 r4269, [r4619+168]; +ld.shared.u32 r4355, [r4619+180]; +ld.shared.u32 r4441, [r4619+192]; +ld.shared.u32 r4527, [r4619+204]; +ld.shared.u32 r3840, [r4619+216]; +ld.shared.u32 r3926, [r4619+228]; +ld.shared.u32 r4012, [r4619+240]; +ld.shared.u32 r4098, [r4619+252]; +ld.shared.u32 r4184, [r4619+264]; +ld.shared.u32 r4270, [r4619+276]; +ld.shared.u32 r4356, [r4619+288]; +ld.shared.u32 r4442, [r4619+300]; +ld.shared.u32 r4528, [r4619+312]; +barrier.sync 0; +st.shared.u32 [r4618], r2108; +st.shared.u32 [r4618+4], r2890; +st.shared.u32 [r4618+8], r2927; +st.shared.u32 [r4618+12], r2964; +st.shared.u32 [r4618+16], r3001; +st.shared.u32 [r4618+20], r3038; +st.shared.u32 [r4618+24], r3075; +st.shared.u32 [r4618+28], r3112; +st.shared.u32 [r4618+32], r3149; +st.shared.u32 [r4618+36], r3186; +st.shared.u32 [r4618+40], r3223; +st.shared.u32 [r4618+44], r3260; +st.shared.u32 [r4618+48], r3297; +st.shared.u32 [r4618+52], r3334; +st.shared.u32 [r4618+56], r3371; +st.shared.u32 [r4618+60], r3408; +st.shared.u32 [r4618+64], r3445; +st.shared.u32 [r4618+68], r3482; +st.shared.u32 [r4618+72], r3519; +st.shared.u32 [r4618+76], r3556; +st.shared.u32 [r4618+80], r3593; +st.shared.u32 [r4618+84], r3630; +st.shared.u32 [r4618+88], r3667; +st.shared.u32 [r4618+92], r3704; +st.shared.u32 [r4618+96], r3741; +st.shared.u32 [r4618+100], r3778; +st.shared.u32 [r4618+104], r3815; +barrier.sync 0; +ld.shared.u32 r3848, [r4619]; +ld.shared.u32 r3934, [r4619+12]; +ld.shared.u32 r4020, [r4619+24]; +ld.shared.u32 r4106, [r4619+36]; +ld.shared.u32 r4192, [r4619+48]; +ld.shared.u32 r4278, [r4619+60]; +ld.shared.u32 r4364, 
[r4619+72]; +ld.shared.u32 r4450, [r4619+84]; +ld.shared.u32 r4536, [r4619+96]; +ld.shared.u32 r3845, [r4619+108]; +ld.shared.u32 r3931, [r4619+120]; +ld.shared.u32 r4017, [r4619+132]; +ld.shared.u32 r4103, [r4619+144]; +ld.shared.u32 r4189, [r4619+156]; +ld.shared.u32 r4275, [r4619+168]; +ld.shared.u32 r4361, [r4619+180]; +ld.shared.u32 r4447, [r4619+192]; +ld.shared.u32 r4533, [r4619+204]; +ld.shared.u32 r3846, [r4619+216]; +ld.shared.u32 r3932, [r4619+228]; +ld.shared.u32 r4018, [r4619+240]; +ld.shared.u32 r4104, [r4619+252]; +ld.shared.u32 r4190, [r4619+264]; +ld.shared.u32 r4276, [r4619+276]; +ld.shared.u32 r4362, [r4619+288]; +ld.shared.u32 r4448, [r4619+300]; +ld.shared.u32 r4534, [r4619+312]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3836, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3837, {low, high}; +} +{ +add.f16x2 r3838, r3839, r3840; +} +{ +add.f16x2 %0, r3842, r3838; +} +{ +add.f16x2 r3844, r3845, r3846; +} +{ +add.f16x2 %1, r3848, r3844; +} +{ +add.f16x2 r3850, r3839, r3840; +} +{ +mul.f16x2 r3853, r3850, r3836; +} +{ +add.f16x2 r3856, r3842, r3853; +} +{ +sub.f16x2 r3859, r3845, r3846; +} +{ +mul.f16x2 r3862, r3859, r3837; +} +{ +add.f16x2 %18, r3856, r3862; +} +{ +add.f16x2 r3868, r3839, r3840; +} +{ +mul.f16x2 r3871, r3868, r3836; +} +{ +add.f16x2 r3874, r3842, r3871; +} +{ +sub.f16x2 r3877, r3845, r3846; +} +{ +mul.f16x2 r3880, r3877, r3837; +} +{ +sub.f16x2 %36, r3874, r3880; +} +{ +add.f16x2 r3886, r3845, r3846; +} +{ +mul.f16x2 r3889, r3886, r3836; +} +{ +add.f16x2 r3892, r3848, r3889; +} +{ +sub.f16x2 r3895, r3839, r3840; +} +{ +mul.f16x2 r3898, r3895, r3837; +} +{ +sub.f16x2 %19, r3892, r3898; +} +{ +add.f16x2 r3904, r3845, r3846; +} +{ +mul.f16x2 r3907, r3904, r3836; +} +{ +add.f16x2 r3910, r3848, r3907; +} +{ +sub.f16x2 r3913, r3839, r3840; +} +{ +mul.f16x2 r3916, r3913, r3837; +} +{ +add.f16x2 %37, r3910, r3916; +} +{ +.reg .f16 
low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r3922, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r3923, {low, high}; +} +{ +add.f16x2 r3924, r3925, r3926; +} +{ +add.f16x2 %2, r3928, r3924; +} +{ +add.f16x2 r3930, r3931, r3932; +} +{ +add.f16x2 %3, r3934, r3930; +} +{ +add.f16x2 r3936, r3925, r3926; +} +{ +mul.f16x2 r3939, r3936, r3922; +} +{ +add.f16x2 r3942, r3928, r3939; +} +{ +sub.f16x2 r3945, r3931, r3932; +} +{ +mul.f16x2 r3948, r3945, r3923; +} +{ +add.f16x2 %20, r3942, r3948; +} +{ +add.f16x2 r3954, r3925, r3926; +} +{ +mul.f16x2 r3957, r3954, r3922; +} +{ +add.f16x2 r3960, r3928, r3957; +} +{ +sub.f16x2 r3963, r3931, r3932; +} +{ +mul.f16x2 r3966, r3963, r3923; +} +{ +sub.f16x2 %38, r3960, r3966; +} +{ +add.f16x2 r3972, r3931, r3932; +} +{ +mul.f16x2 r3975, r3972, r3922; +} +{ +add.f16x2 r3978, r3934, r3975; +} +{ +sub.f16x2 r3981, r3925, r3926; +} +{ +mul.f16x2 r3984, r3981, r3923; +} +{ +sub.f16x2 %21, r3978, r3984; +} +{ +add.f16x2 r3990, r3931, r3932; +} +{ +mul.f16x2 r3993, r3990, r3922; +} +{ +add.f16x2 r3996, r3934, r3993; +} +{ +sub.f16x2 r3999, r3925, r3926; +} +{ +mul.f16x2 r4002, r3999, r3923; +} +{ +add.f16x2 %39, r3996, r4002; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4008, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4009, {low, high}; +} +{ +add.f16x2 r4010, r4011, r4012; +} +{ +add.f16x2 %4, r4014, r4010; +} +{ +add.f16x2 r4016, r4017, r4018; +} +{ +add.f16x2 %5, r4020, r4016; +} +{ +add.f16x2 r4022, r4011, r4012; +} +{ +mul.f16x2 r4025, r4022, r4008; +} +{ +add.f16x2 r4028, r4014, r4025; +} +{ +sub.f16x2 r4031, r4017, r4018; +} +{ +mul.f16x2 r4034, r4031, r4009; +} +{ +add.f16x2 %22, r4028, r4034; +} +{ +add.f16x2 r4040, r4011, r4012; +} +{ +mul.f16x2 r4043, r4040, r4008; +} +{ +add.f16x2 r4046, r4014, r4043; +} +{ +sub.f16x2 r4049, r4017, 
r4018; +} +{ +mul.f16x2 r4052, r4049, r4009; +} +{ +sub.f16x2 %40, r4046, r4052; +} +{ +add.f16x2 r4058, r4017, r4018; +} +{ +mul.f16x2 r4061, r4058, r4008; +} +{ +add.f16x2 r4064, r4020, r4061; +} +{ +sub.f16x2 r4067, r4011, r4012; +} +{ +mul.f16x2 r4070, r4067, r4009; +} +{ +sub.f16x2 %23, r4064, r4070; +} +{ +add.f16x2 r4076, r4017, r4018; +} +{ +mul.f16x2 r4079, r4076, r4008; +} +{ +add.f16x2 r4082, r4020, r4079; +} +{ +sub.f16x2 r4085, r4011, r4012; +} +{ +mul.f16x2 r4088, r4085, r4009; +} +{ +add.f16x2 %41, r4082, r4088; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4094, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4095, {low, high}; +} +{ +add.f16x2 r4096, r4097, r4098; +} +{ +add.f16x2 %6, r4100, r4096; +} +{ +add.f16x2 r4102, r4103, r4104; +} +{ +add.f16x2 %7, r4106, r4102; +} +{ +add.f16x2 r4108, r4097, r4098; +} +{ +mul.f16x2 r4111, r4108, r4094; +} +{ +add.f16x2 r4114, r4100, r4111; +} +{ +sub.f16x2 r4117, r4103, r4104; +} +{ +mul.f16x2 r4120, r4117, r4095; +} +{ +add.f16x2 %24, r4114, r4120; +} +{ +add.f16x2 r4126, r4097, r4098; +} +{ +mul.f16x2 r4129, r4126, r4094; +} +{ +add.f16x2 r4132, r4100, r4129; +} +{ +sub.f16x2 r4135, r4103, r4104; +} +{ +mul.f16x2 r4138, r4135, r4095; +} +{ +sub.f16x2 %42, r4132, r4138; +} +{ +add.f16x2 r4144, r4103, r4104; +} +{ +mul.f16x2 r4147, r4144, r4094; +} +{ +add.f16x2 r4150, r4106, r4147; +} +{ +sub.f16x2 r4153, r4097, r4098; +} +{ +mul.f16x2 r4156, r4153, r4095; +} +{ +sub.f16x2 %25, r4150, r4156; +} +{ +add.f16x2 r4162, r4103, r4104; +} +{ +mul.f16x2 r4165, r4162, r4094; +} +{ +add.f16x2 r4168, r4106, r4165; +} +{ +sub.f16x2 r4171, r4097, r4098; +} +{ +mul.f16x2 r4174, r4171, r4095; +} +{ +add.f16x2 %43, r4168, r4174; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4180, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, 
f400; +mov.b32 r4181, {low, high}; +} +{ +add.f16x2 r4182, r4183, r4184; +} +{ +add.f16x2 %8, r4186, r4182; +} +{ +add.f16x2 r4188, r4189, r4190; +} +{ +add.f16x2 %9, r4192, r4188; +} +{ +add.f16x2 r4194, r4183, r4184; +} +{ +mul.f16x2 r4197, r4194, r4180; +} +{ +add.f16x2 r4200, r4186, r4197; +} +{ +sub.f16x2 r4203, r4189, r4190; +} +{ +mul.f16x2 r4206, r4203, r4181; +} +{ +add.f16x2 %26, r4200, r4206; +} +{ +add.f16x2 r4212, r4183, r4184; +} +{ +mul.f16x2 r4215, r4212, r4180; +} +{ +add.f16x2 r4218, r4186, r4215; +} +{ +sub.f16x2 r4221, r4189, r4190; +} +{ +mul.f16x2 r4224, r4221, r4181; +} +{ +sub.f16x2 %44, r4218, r4224; +} +{ +add.f16x2 r4230, r4189, r4190; +} +{ +mul.f16x2 r4233, r4230, r4180; +} +{ +add.f16x2 r4236, r4192, r4233; +} +{ +sub.f16x2 r4239, r4183, r4184; +} +{ +mul.f16x2 r4242, r4239, r4181; +} +{ +sub.f16x2 %27, r4236, r4242; +} +{ +add.f16x2 r4248, r4189, r4190; +} +{ +mul.f16x2 r4251, r4248, r4180; +} +{ +add.f16x2 r4254, r4192, r4251; +} +{ +sub.f16x2 r4257, r4183, r4184; +} +{ +mul.f16x2 r4260, r4257, r4181; +} +{ +add.f16x2 %45, r4254, r4260; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4266, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4267, {low, high}; +} +{ +add.f16x2 r4268, r4269, r4270; +} +{ +add.f16x2 %10, r4272, r4268; +} +{ +add.f16x2 r4274, r4275, r4276; +} +{ +add.f16x2 %11, r4278, r4274; +} +{ +add.f16x2 r4280, r4269, r4270; +} +{ +mul.f16x2 r4283, r4280, r4266; +} +{ +add.f16x2 r4286, r4272, r4283; +} +{ +sub.f16x2 r4289, r4275, r4276; +} +{ +mul.f16x2 r4292, r4289, r4267; +} +{ +add.f16x2 %28, r4286, r4292; +} +{ +add.f16x2 r4298, r4269, r4270; +} +{ +mul.f16x2 r4301, r4298, r4266; +} +{ +add.f16x2 r4304, r4272, r4301; +} +{ +sub.f16x2 r4307, r4275, r4276; +} +{ +mul.f16x2 r4310, r4307, r4267; +} +{ +sub.f16x2 %46, r4304, r4310; +} +{ +add.f16x2 r4316, r4275, r4276; +} +{ +mul.f16x2 r4319, r4316, r4266; +} +{ 
+add.f16x2 r4322, r4278, r4319; +} +{ +sub.f16x2 r4325, r4269, r4270; +} +{ +mul.f16x2 r4328, r4325, r4267; +} +{ +sub.f16x2 %29, r4322, r4328; +} +{ +add.f16x2 r4334, r4275, r4276; +} +{ +mul.f16x2 r4337, r4334, r4266; +} +{ +add.f16x2 r4340, r4278, r4337; +} +{ +sub.f16x2 r4343, r4269, r4270; +} +{ +mul.f16x2 r4346, r4343, r4267; +} +{ +add.f16x2 %47, r4340, r4346; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4352, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4353, {low, high}; +} +{ +add.f16x2 r4354, r4355, r4356; +} +{ +add.f16x2 %12, r4358, r4354; +} +{ +add.f16x2 r4360, r4361, r4362; +} +{ +add.f16x2 %13, r4364, r4360; +} +{ +add.f16x2 r4366, r4355, r4356; +} +{ +mul.f16x2 r4369, r4366, r4352; +} +{ +add.f16x2 r4372, r4358, r4369; +} +{ +sub.f16x2 r4375, r4361, r4362; +} +{ +mul.f16x2 r4378, r4375, r4353; +} +{ +add.f16x2 %30, r4372, r4378; +} +{ +add.f16x2 r4384, r4355, r4356; +} +{ +mul.f16x2 r4387, r4384, r4352; +} +{ +add.f16x2 r4390, r4358, r4387; +} +{ +sub.f16x2 r4393, r4361, r4362; +} +{ +mul.f16x2 r4396, r4393, r4353; +} +{ +sub.f16x2 %48, r4390, r4396; +} +{ +add.f16x2 r4402, r4361, r4362; +} +{ +mul.f16x2 r4405, r4402, r4352; +} +{ +add.f16x2 r4408, r4364, r4405; +} +{ +sub.f16x2 r4411, r4355, r4356; +} +{ +mul.f16x2 r4414, r4411, r4353; +} +{ +sub.f16x2 %31, r4408, r4414; +} +{ +add.f16x2 r4420, r4361, r4362; +} +{ +mul.f16x2 r4423, r4420, r4352; +} +{ +add.f16x2 r4426, r4364, r4423; +} +{ +sub.f16x2 r4429, r4355, r4356; +} +{ +mul.f16x2 r4432, r4429, r4353; +} +{ +add.f16x2 %49, r4426, r4432; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4438, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4439, {low, high}; +} +{ +add.f16x2 r4440, r4441, r4442; +} +{ +add.f16x2 %14, r4444, r4440; +} +{ +add.f16x2 r4446, r4447, r4448; +} +{ +add.f16x2 
%15, r4450, r4446; +} +{ +add.f16x2 r4452, r4441, r4442; +} +{ +mul.f16x2 r4455, r4452, r4438; +} +{ +add.f16x2 r4458, r4444, r4455; +} +{ +sub.f16x2 r4461, r4447, r4448; +} +{ +mul.f16x2 r4464, r4461, r4439; +} +{ +add.f16x2 %32, r4458, r4464; +} +{ +add.f16x2 r4470, r4441, r4442; +} +{ +mul.f16x2 r4473, r4470, r4438; +} +{ +add.f16x2 r4476, r4444, r4473; +} +{ +sub.f16x2 r4479, r4447, r4448; +} +{ +mul.f16x2 r4482, r4479, r4439; +} +{ +sub.f16x2 %50, r4476, r4482; +} +{ +add.f16x2 r4488, r4447, r4448; +} +{ +mul.f16x2 r4491, r4488, r4438; +} +{ +add.f16x2 r4494, r4450, r4491; +} +{ +sub.f16x2 r4497, r4441, r4442; +} +{ +mul.f16x2 r4500, r4497, r4439; +} +{ +sub.f16x2 %33, r4494, r4500; +} +{ +add.f16x2 r4506, r4447, r4448; +} +{ +mul.f16x2 r4509, r4506, r4438; +} +{ +add.f16x2 r4512, r4450, r4509; +} +{ +sub.f16x2 r4515, r4441, r4442; +} +{ +mul.f16x2 r4518, r4515, r4439; +} +{ +add.f16x2 %51, r4512, r4518; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f398; +cvt.rn.f16.f32 high, f398; +mov.b32 r4524, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f400; +cvt.rn.f16.f32 high, f400; +mov.b32 r4525, {low, high}; +} +{ +add.f16x2 r4526, r4527, r4528; +} +{ +add.f16x2 %16, r4530, r4526; +} +{ +add.f16x2 r4532, r4533, r4534; +} +{ +add.f16x2 %17, r4536, r4532; +} +{ +add.f16x2 r4538, r4527, r4528; +} +{ +mul.f16x2 r4541, r4538, r4524; +} +{ +add.f16x2 r4544, r4530, r4541; +} +{ +sub.f16x2 r4547, r4533, r4534; +} +{ +mul.f16x2 r4550, r4547, r4525; +} +{ +add.f16x2 %34, r4544, r4550; +} +{ +add.f16x2 r4556, r4527, r4528; +} +{ +mul.f16x2 r4559, r4556, r4524; +} +{ +add.f16x2 r4562, r4530, r4559; +} +{ +sub.f16x2 r4565, r4533, r4534; +} +{ +mul.f16x2 r4568, r4565, r4525; +} +{ +sub.f16x2 %52, r4562, r4568; +} +{ +add.f16x2 r4574, r4533, r4534; +} +{ +mul.f16x2 r4577, r4574, r4524; +} +{ +add.f16x2 r4580, r4536, r4577; +} +{ +sub.f16x2 r4583, r4527, r4528; +} +{ +mul.f16x2 r4586, r4583, r4525; +} +{ +sub.f16x2 %35, r4580, r4586; +} +{ +add.f16x2 r4592, 
r4533, r4534; +} +{ +mul.f16x2 r4595, r4592, r4524; +} +{ +add.f16x2 r4598, r4536, r4595; +} +{ +sub.f16x2 r4601, r4527, r4528; +} +{ +mul.f16x2 r4604, r4601, r4525; +} +{ +add.f16x2 %53, r4598, r4604; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)), "=r"(__HALF2_TO_UI(rmem[9].x)), "=r"(__HALF2_TO_UI(rmem[9].y)), "=r"(__HALF2_TO_UI(rmem[10].x)), "=r"(__HALF2_TO_UI(rmem[10].y)), "=r"(__HALF2_TO_UI(rmem[11].x)), "=r"(__HALF2_TO_UI(rmem[11].y)), "=r"(__HALF2_TO_UI(rmem[12].x)), "=r"(__HALF2_TO_UI(rmem[12].y)), "=r"(__HALF2_TO_UI(rmem[13].x)), "=r"(__HALF2_TO_UI(rmem[13].y)), "=r"(__HALF2_TO_UI(rmem[14].x)), "=r"(__HALF2_TO_UI(rmem[14].y)), "=r"(__HALF2_TO_UI(rmem[15].x)), "=r"(__HALF2_TO_UI(rmem[15].y)), "=r"(__HALF2_TO_UI(rmem[16].x)), "=r"(__HALF2_TO_UI(rmem[16].y)), "=r"(__HALF2_TO_UI(rmem[17].x)), "=r"(__HALF2_TO_UI(rmem[17].y)), "=r"(__HALF2_TO_UI(rmem[18].x)), "=r"(__HALF2_TO_UI(rmem[18].y)), "=r"(__HALF2_TO_UI(rmem[19].x)), "=r"(__HALF2_TO_UI(rmem[19].y)), "=r"(__HALF2_TO_UI(rmem[20].x)), "=r"(__HALF2_TO_UI(rmem[20].y)), "=r"(__HALF2_TO_UI(rmem[21].x)), "=r"(__HALF2_TO_UI(rmem[21].y)), "=r"(__HALF2_TO_UI(rmem[22].x)), "=r"(__HALF2_TO_UI(rmem[22].y)), "=r"(__HALF2_TO_UI(rmem[23].x)), "=r"(__HALF2_TO_UI(rmem[23].y)), "=r"(__HALF2_TO_UI(rmem[24].x)), "=r"(__HALF2_TO_UI(rmem[24].y)), "=r"(__HALF2_TO_UI(rmem[25].x)), "=r"(__HALF2_TO_UI(rmem[25].y)), "=r"(__HALF2_TO_UI(rmem[26].x)), "=r"(__HALF2_TO_UI(rmem[26].y)): "r"(smem), 
"r"(__HALF2_TO_UI(rmem[17].x)), "r"(__HALF2_TO_UI(rmem[20].y)), "r"(__HALF2_TO_UI(rmem[14].x)), "r"(__HALF2_TO_UI(rmem[11].x)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[15].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[25].x)), "r"(__HALF2_TO_UI(rmem[22].x)), "r"(__HALF2_TO_UI(rmem[19].x)), "r"(__HALF2_TO_UI(rmem[12].y)), "r"(__HALF2_TO_UI(rmem[9].y)), "r"(__HALF2_TO_UI(rmem[17].y)), "r"(__HALF2_TO_UI(rmem[14].y)), "r"(__HALF2_TO_UI(rmem[11].y)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[8].y)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[25].y)), "r"(__HALF2_TO_UI(rmem[22].y)), "r"(__HALF2_TO_UI(rmem[16].x)), "r"(__HALF2_TO_UI(rmem[19].y)), "r"(__HALF2_TO_UI(rmem[13].x)), "r"(__HALF2_TO_UI(rmem[10].x)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[24].x)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[21].x)), "r"(__HALF2_TO_UI(rmem[18].x)), "r"(__HALF2_TO_UI(rmem[26].x)), "r"(__HALF2_TO_UI(rmem[23].x)), "r"(__HALF2_TO_UI(rmem[16].y)), "r"(__HALF2_TO_UI(rmem[20].x)), "r"(__HALF2_TO_UI(rmem[13].y)), "r"(__HALF2_TO_UI(rmem[10].y)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[24].y)), "r"(__HALF2_TO_UI(rmem[21].y)), "r"(__HALF2_TO_UI(rmem[15].x)), "r"(__HALF2_TO_UI(rmem[18].y)), "r"(__HALF2_TO_UI(rmem[12].x)), "r"(__HALF2_TO_UI(rmem[9].x)), "r"(__HALF2_TO_UI(rmem[26].y)), "r"(__HALF2_TO_UI(rmem[23].y))); +}; + + + +/* Specialization cufftdx_private_function<1078, __half2, 1>: a single inline-PTX statement (asm volatile) that transforms the three complex<__half2> values rmem[0..2] in place. + * Operands: %0-%5 are the outputs rmem[0..2].x/.y, %6 is smem (base of every st.shared/ld.shared address), %7-%12 are the incoming rmem[0..2].x/.y. + * Thread mapping: lane = %tid.x % 27 and group = %tid.x / 27 (division done with a multiply-by-constant sequence); the shared-memory window starts at smem + (%tid.y + group) * 648 bytes. + * Flow: four 3-point butterfly passes on f16x2 data (constants 0fBF000000 = -0.5 and 0fBF5DB3D7 ~= -0.8660254), separated by three exchanges through shared memory, each bracketed by barrier.sync 0. + * After passes 1-3 the second and third outputs are rotated by a twiddle and its square; the twiddle is cos.approx/sin.approx of index * step, rounded to f16, with index = lane, lane / 3, lane / 9 and step = 0f3D9EDD1F, 0f3E6E4BAE, 0f3F32B8C2. + * NOTE(review): those steps are ~0.07757, ~0.23271 and ~0.69813, i.e. they look like 2*pi/81, 2*pi/27 and 2*pi/9, which would make this an 81-point radix-3 transform over 27 threads - confirm against the cuFFTDx code generator. + * NOTE(review): barrier.sync 0 presumably requires every thread of the block to reach this call; also no "memory" clobber is declared although shared memory is read and written - confirm both with the PTX ISA / CUDA inline-PTX documentation. + */ +template<> __forceinline__ __device__ void cufftdx_private_function<1078, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<603>; +.reg .b64 rd<8>; +mov.u32 r576, %tid.y; +mov.u32 r577, %6; +mad.lo.s32
r578, r576, 648, r577; +mov.u32 r579, %tid.x; +mov.f32 f38, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r1, {low, high}; +} +mov.f32 f40, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r579, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r580, rd3; +sub.s32 r581, r579, r580; +shr.u32 r582, r581, 1; +add.s32 r583, r582, r580; +shr.u32 r584, r583, 4; +mul.lo.s32 r585, r584, 27; +sub.s32 r586, r579, r585; +mad.lo.s32 r587, r584, 648, r578; +cvt.rn.f32.u32 f41, r586; +mul.f32 f42, f41, 0f3D9EDD1F; +cos.approx.f32 f5, f42; +sin.approx.f32 f43, f42; +neg.f32 f6, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{
+fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r588, r586, 24, r587; +st.shared.v2.f32 [r588], {r6, r12}; +st.shared.v2.f32 [r588+8], {r97, r106}; +st.shared.v2.f32 [r588+16], {r134, r143}; +barrier.sync 0; +shl.b32 r589, r586, 4; +sub.s32 r590, r588, r589; +ld.shared.u32 r170, [r590]; +ld.shared.u32 r176, [r590+4]; +ld.shared.u32 r167, [r590+216]; +ld.shared.u32 r173, [r590+220]; +ld.shared.u32 r168, [r590+432]; +ld.shared.u32 r174, [r590+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2
r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r586, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r591, rd5; +mul.lo.s32 r592, r591, 3; +sub.s32 r593, r586, r592; +shl.b32 r594, r593, 3; +add.s32 r595, r587, r594; +cvt.rn.f32.u32 f44, r591; +mul.f32 f45, f44, 0f3E6E4BAE; +cos.approx.f32 f17, f45; +sin.approx.f32 f46, f45; +neg.f32 f18, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247,
r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} +barrier.sync 0; +mad.lo.s32 r596, r591, 72, r595; +st.shared.u32 [r596], r169; +st.shared.u32 [r596+4], r175; +st.shared.u32 [r596+24], r260; +st.shared.u32 [r596+28], r269; +st.shared.u32 [r596+48], r297; +st.shared.u32 [r596+52], r306; +barrier.sync 0; +ld.shared.u32 r333, [r590]; +ld.shared.u32 r339, [r590+4]; +ld.shared.u32 r330, [r590+216]; +ld.shared.u32 r336, [r590+220]; +ld.shared.u32 r331, [r590+432]; +ld.shared.u32 r337, [r590+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r586, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r597, rd7; +mul.lo.s32 r598, r597, 9; +sub.s32 r599, r586, r598; +shl.b32 r600, r599, 3; +add.s32 r601, r587, r600;
+cvt.rn.f32.u32 f47, r597; +mul.f32 f48, f47, 0f3F32B8C2; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r602, r597, 216, r601; +st.shared.u32 [r602], r332; +st.shared.u32 [r602+4], r338; +st.shared.u32 [r602+72], r423; +st.shared.u32 [r602+76], r432; +st.shared.u32 [r602+144], r460; +st.shared.u32 [r602+148], r469; +barrier.sync 0; +ld.shared.u32 r496, [r590]; +ld.shared.u32 r502, [r590+4]; +ld.shared.u32 r493, [r590+216]; +ld.shared.u32 r499, [r590+220]; +ld.shared.u32 r494, [r590+432]; +ld.shared.u32 r500, [r590+436]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r490, {low, high}; +} +{ +.reg .f16
low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r491, {low, high}; +} +{ +add.f16x2 r492, r493, r494; +} +{ +add.f16x2 %0, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 %1, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 %2, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 %4, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 %3, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 %5, r564, r570; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1079, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<50>; +.reg .b32 r<603>; +.reg .b64 rd<8>; +mov.u32 r576, %tid.y; +mov.u32 r577, %6; +mad.lo.s32 r578, r576, 324, r577; +mov.u32 r579, %tid.x; +mov.f32 f38, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r1, {low, high}; +} +mov.f32 f40, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low,
f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r579, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r580, rd3; +sub.s32 r581, r579, r580; +shr.u32 r582, r581, 1; +add.s32 r583, r582, r580; +shr.u32 r584, r583, 4; +mul.lo.s32 r585, r584, 27; +sub.s32 r586, r579, r585; +mad.lo.s32 r587, r584, 324, r578; +cvt.rn.f32.u32 f41, r586; +mul.f32 f42, f41, 0f3D9EDD1F; +cos.approx.f32 f5, f42; +sin.approx.f32 f43, f42; +neg.f32 f6, f43; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f33, 0fBF800000; +mov.f32 f34, 0f3F800000; 
+{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r588, r586, 12, r587; +st.shared.u32 [r588], r6; +st.shared.u32 [r588+4], r97; +st.shared.u32 [r588+8], r134; +barrier.sync 0; +shl.b32 r589, r586, 3; +sub.s32 r590, r588, r589; +ld.shared.u32 r170, [r590]; +ld.shared.u32 r167, [r590+108]; +ld.shared.u32 r168, [r590+216]; +barrier.sync 0; +st.shared.u32 [r588], r12; +st.shared.u32 [r588+4], r106; +st.shared.u32 [r588+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r590]; +ld.shared.u32 r173, [r590+108]; +ld.shared.u32 r174, [r590+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 r169, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 r175, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 r193, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 r211, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ 
+mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 r229, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 r247, r238, r244; +} +mul.wide.u32 rd4, r586, -1431655765; +shr.u64 rd5, rd4, 33; +cvt.u32.u64 r591, rd5; +mul.lo.s32 r592, r591, 3; +sub.s32 r593, r586, r592; +shl.b32 r594, r593, 2; +add.s32 r595, r587, r594; +cvt.rn.f32.u32 f44, r591; +mul.f32 f45, f44, 0f3E6E4BAE; +cos.approx.f32 f17, f45; +sin.approx.f32 f46, f45; +neg.f32 f18, f46; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f17; +cvt.rn.f16.f32 high, f18; +mov.b32 r250, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r253, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r255, {high, high}; +} +{ +mul.f16x2 r257, r229, r255; +} +{ +fma.rn.f16x2 r260, r193, r253, r257; +} +{ +mul.f16x2 r264, r193, r255; +} +{ +neg.f16x2 r267, r264; +} +{ +fma.rn.f16x2 r269, r229, r253, r267; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r273, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r275, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r277, {low, high}; +} +{ +mul.f16x2 r278, r275, r277; +} +{ +mul.f16x2 r281, r250, r273; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r250; +mov.b32 r284, {high, low}; +} +{ +fma.rn.f16x2 r286, r278, r284, r281; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r290, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r286; +mov.b32 r292, {high, high}; +} +{ +mul.f16x2 r294, r247, r292; +} +{ +fma.rn.f16x2 r297, r211, r290, r294; +} +{ +mul.f16x2 r301, r211, r292; +} +{ +neg.f16x2 r304, r301; +} +{ +fma.rn.f16x2 r306, r247, r290, r304; +} 
+barrier.sync 0; +mad.lo.s32 r596, r591, 36, r595; +st.shared.u32 [r596], r169; +st.shared.u32 [r596+12], r260; +st.shared.u32 [r596+24], r297; +barrier.sync 0; +ld.shared.u32 r333, [r590]; +ld.shared.u32 r330, [r590+108]; +ld.shared.u32 r331, [r590+216]; +barrier.sync 0; +st.shared.u32 [r596], r175; +st.shared.u32 [r596+12], r269; +st.shared.u32 [r596+24], r306; +barrier.sync 0; +ld.shared.u32 r339, [r590]; +ld.shared.u32 r336, [r590+108]; +ld.shared.u32 r337, [r590+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r327, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r328, {low, high}; +} +{ +add.f16x2 r329, r330, r331; +} +{ +add.f16x2 r332, r333, r329; +} +{ +add.f16x2 r335, r336, r337; +} +{ +add.f16x2 r338, r339, r335; +} +{ +add.f16x2 r341, r330, r331; +} +{ +mul.f16x2 r344, r341, r327; +} +{ +add.f16x2 r347, r333, r344; +} +{ +sub.f16x2 r350, r336, r337; +} +{ +mul.f16x2 r353, r350, r328; +} +{ +add.f16x2 r356, r347, r353; +} +{ +add.f16x2 r359, r330, r331; +} +{ +mul.f16x2 r362, r359, r327; +} +{ +add.f16x2 r365, r333, r362; +} +{ +sub.f16x2 r368, r336, r337; +} +{ +mul.f16x2 r371, r368, r328; +} +{ +sub.f16x2 r374, r365, r371; +} +{ +add.f16x2 r377, r336, r337; +} +{ +mul.f16x2 r380, r377, r327; +} +{ +add.f16x2 r383, r339, r380; +} +{ +sub.f16x2 r386, r330, r331; +} +{ +mul.f16x2 r389, r386, r328; +} +{ +sub.f16x2 r392, r383, r389; +} +{ +add.f16x2 r395, r336, r337; +} +{ +mul.f16x2 r398, r395, r327; +} +{ +add.f16x2 r401, r339, r398; +} +{ +sub.f16x2 r404, r330, r331; +} +{ +mul.f16x2 r407, r404, r328; +} +{ +add.f16x2 r410, r401, r407; +} +mul.wide.u32 rd6, r586, 954437177; +shr.u64 rd7, rd6, 33; +cvt.u32.u64 r597, rd7; +mul.lo.s32 r598, r597, 9; +sub.s32 r599, r586, r598; +shl.b32 r600, r599, 2; +add.s32 r601, r587, r600; +cvt.rn.f32.u32 f47, r597; +mul.f32 f48, f47, 0f3F32B8C2; +cos.approx.f32 f29, f48; +sin.approx.f32 f49, f48; +neg.f32 f30, f49; +{ 
+.reg .f16 low, high; +cvt.rn.f16.f32 low, f29; +cvt.rn.f16.f32 high, f30; +mov.b32 r413, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r416, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r418, {high, high}; +} +{ +mul.f16x2 r420, r392, r418; +} +{ +fma.rn.f16x2 r423, r356, r416, r420; +} +{ +mul.f16x2 r427, r356, r418; +} +{ +neg.f16x2 r430, r427; +} +{ +fma.rn.f16x2 r432, r392, r416, r430; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r436, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r438, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f33; +cvt.rn.f16.f32 high, f34; +mov.b32 r440, {low, high}; +} +{ +mul.f16x2 r441, r438, r440; +} +{ +mul.f16x2 r444, r413, r436; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r413; +mov.b32 r447, {high, low}; +} +{ +fma.rn.f16x2 r449, r441, r447, r444; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r453, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r449; +mov.b32 r455, {high, high}; +} +{ +mul.f16x2 r457, r410, r455; +} +{ +fma.rn.f16x2 r460, r374, r453, r457; +} +{ +mul.f16x2 r464, r374, r455; +} +{ +neg.f16x2 r467, r464; +} +{ +fma.rn.f16x2 r469, r410, r453, r467; +} +barrier.sync 0; +mad.lo.s32 r602, r597, 108, r601; +st.shared.u32 [r602], r332; +st.shared.u32 [r602+36], r423; +st.shared.u32 [r602+72], r460; +barrier.sync 0; +ld.shared.u32 r496, [r590]; +ld.shared.u32 r493, [r590+108]; +ld.shared.u32 r494, [r590+216]; +barrier.sync 0; +st.shared.u32 [r602], r338; +st.shared.u32 [r602+36], r432; +st.shared.u32 [r602+72], r469; +barrier.sync 0; +ld.shared.u32 r502, [r590]; +ld.shared.u32 r499, [r590+108]; +ld.shared.u32 r500, [r590+216]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f38; +cvt.rn.f16.f32 high, f38; +mov.b32 r490, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f40; +cvt.rn.f16.f32 high, f40; +mov.b32 r491, {low, high}; +} +{ 
+add.f16x2 r492, r493, r494; +} +{ +add.f16x2 %0, r496, r492; +} +{ +add.f16x2 r498, r499, r500; +} +{ +add.f16x2 %1, r502, r498; +} +{ +add.f16x2 r504, r493, r494; +} +{ +mul.f16x2 r507, r504, r490; +} +{ +add.f16x2 r510, r496, r507; +} +{ +sub.f16x2 r513, r499, r500; +} +{ +mul.f16x2 r516, r513, r491; +} +{ +add.f16x2 %2, r510, r516; +} +{ +add.f16x2 r522, r493, r494; +} +{ +mul.f16x2 r525, r522, r490; +} +{ +add.f16x2 r528, r496, r525; +} +{ +sub.f16x2 r531, r499, r500; +} +{ +mul.f16x2 r534, r531, r491; +} +{ +sub.f16x2 %4, r528, r534; +} +{ +add.f16x2 r540, r499, r500; +} +{ +mul.f16x2 r543, r540, r490; +} +{ +add.f16x2 r546, r502, r543; +} +{ +sub.f16x2 r549, r493, r494; +} +{ +mul.f16x2 r552, r549, r491; +} +{ +sub.f16x2 %3, r546, r552; +} +{ +add.f16x2 r558, r499, r500; +} +{ +mul.f16x2 r561, r558, r490; +} +{ +add.f16x2 r564, r502, r561; +} +{ +sub.f16x2 r567, r493, r494; +} +{ +mul.f16x2 r570, r567, r491; +} +{ +add.f16x2 %5, r564, r570; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..65d0549cd35a9 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_fwd.hpp.inc @@ -0,0 +1,3320 @@ +#ifndef CUFFTDX_FFT_81_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_81_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void 
cufftdx_private_function<126, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<384>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 648, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 
f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +add.f32 f111, f110, f108; +sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0f3F5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0f3F5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0f3F5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0f3F5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0f3F5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 648, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f151, f120; +mul.f32 f156, f152, f122; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f160, f136; +mul.f32 f164, f162, f138; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; 
+mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f168, f111; +mul.f32 f172, f170, f117; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; +fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f176, f127; +mul.f32 f180, f178, f133; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f184, f143; +mul.f32 f188, f186, f149; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f192, f112; +mul.f32 f196, f194, f118; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f200, f128; +mul.f32 f204, f202, f134; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f208, f144; +mul.f32 f212, f210, f150; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f152, f120, f157; +sub.f32 f217, f155, f156; +st.shared.v2.f32 [r9+8], {f217, f216}; +fma.rn.f32 f218, f162, f136, f165; +sub.f32 f219, f163, f164; +st.shared.v2.f32 [r9+16], {f219, f218}; +sub.f32 f220, f171, f172; +fma.rn.f32 f221, f170, f111, f173; +st.shared.v2.f32 [r9+24], {f220, f221}; +fma.rn.f32 f222, f178, f127, f181; +sub.f32 f223, f179, f180; +st.shared.v2.f32 [r9+32], {f223, f222}; +sub.f32 f224, f187, f188; +fma.rn.f32 f225, f186, f143, f189; +st.shared.v2.f32 [r9+40], {f224, f225}; +fma.rn.f32 f226, 
f194, f112, f197; +sub.f32 f227, f195, f196; +st.shared.v2.f32 [r9+48], {f227, f226}; +fma.rn.f32 f228, f202, f128, f205; +sub.f32 f229, f203, f204; +st.shared.v2.f32 [r9+56], {f229, f228}; +fma.rn.f32 f230, f210, f144, f213; +sub.f32 f231, f211, f212; +st.shared.v2.f32 [r9+64], {f231, f230}; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+72]; +ld.shared.v2.f32 {f240, f241}, [r11+144]; +ld.shared.v2.f32 {f244, f245}, [r11+216]; +ld.shared.v2.f32 {f248, f249}, [r11+288]; +ld.shared.v2.f32 {f252, f253}, [r11+360]; +ld.shared.v2.f32 {f256, f257}, [r11+432]; +ld.shared.v2.f32 {f260, f261}, [r11+504]; +ld.shared.v2.f32 {f264, f265}, [r11+576]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0f3F5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0f3F5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0f3F5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0f3F5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0f3F5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 
f312, f252, f264; +mul.f32 f313, f312, 0f3F5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0fBF248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0fBF248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; +mul.f32 f322, f314, 0fBF7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0fBF7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0fBF7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0fBF7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0fBEAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0fBEAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0f3F5DB3D7; +mul.f32 f342, f337, 0f3F000000; +sub.f32 f343, f271, f342; +sub.f32 f344, f285, f301; +mul.f32 f345, f344, 0f3F5DB3D7; +add.f32 f346, f318, f323; +add.f32 f347, f320, f325; +mul.f32 f348, f346, 0f3F000000; +sub.f32 f349, f276, f348; +sub.f32 f350, f320, f325; +mul.f32 f351, f350, 0f3F5DB3D7; +mul.f32 f352, f347, 0f3F000000; +sub.f32 f353, f282, f352; +sub.f32 f354, f318, f323; +mul.f32 f355, f354, 0f3F5DB3D7; +add.f32 f356, f328, f333; +add.f32 f357, f330, f335; +mul.f32 f358, f356, 0f3F000000; +sub.f32 f359, f277, f358; +sub.f32 f360, f330, f335; +mul.f32 f361, f360, 0f3F5DB3D7; +mul.f32 f362, f357, 0f3F000000; +sub.f32 f363, f283, f362; +sub.f32 f364, f328, f333; +mul.f32 f365, f364, 0f3F5DB3D7; +add.f32 %1, f271, f337; +add.f32 %0, f269, f336; +add.f32 %3, f282, f347; +add.f32 %2, f276, f346; +add.f32 %5, f283, f357; +add.f32 %4, f277, f356; +sub.f32 %7, f343, f345; +add.f32 %6, f341, f339; +sub.f32 %9, f353, f355; +add.f32 %8, f351, f349; +sub.f32 %11, f363, f365; +add.f32 %10, f361, f359; +add.f32 %13, f345, f343; +sub.f32 
%12, f339, f341; +add.f32 %15, f355, f353; +sub.f32 %14, f349, f351; +add.f32 %17, f365, f363; +sub.f32 %16, f359, f361; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<127, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<366>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 324, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; 
+add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0f3F5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0f3F5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, 
f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0f3F5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0f3F5DB3D7; +sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f153, f122; +mul.f32 f158, f154, f124; +sub.f32 f159, f157, f158; +mul.f32 f160, f153, f124; +fma.rn.f32 f161, f154, f122, f160; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f164, f138; +mul.f32 f168, f166, f140; +sub.f32 f169, f167, f168; +mul.f32 f170, f164, f140; +fma.rn.f32 f171, f166, f138, f170; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f174, f113; +mul.f32 f178, f176, f119; +sub.f32 f179, f177, f178; +mul.f32 f180, f174, f119; +fma.rn.f32 f181, f176, f113, f180; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f184, f129; +mul.f32 f188, f186, f135; +sub.f32 f189, f187, f188; +mul.f32 f190, f184, f135; +fma.rn.f32 f191, f186, f129, f190; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f194, f145; +mul.f32 f198, f196, f151; +sub.f32 f199, f197, f198; +mul.f32 f200, f194, f151; +fma.rn.f32 f201, f196, f145, f200; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; 
+fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f204, f114; +mul.f32 f208, f206, f120; +sub.f32 f209, f207, f208; +mul.f32 f210, f204, f120; +fma.rn.f32 f211, f206, f114, f210; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, f154, f204, f215; +mul.f32 f217, f214, f130; +mul.f32 f218, f216, f136; +sub.f32 f219, f217, f218; +mul.f32 f220, f214, f136; +fma.rn.f32 f221, f216, f130, f220; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f224, f146; +mul.f32 f228, f226, f152; +sub.f32 f229, f227, f228; +mul.f32 f230, f224, f152; +fma.rn.f32 f231, f226, f146, f230; +mad.lo.s32 r8, r5, 324, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f159; +st.shared.f32 [r9+8], f169; +st.shared.f32 [r9+12], f179; +st.shared.f32 [r9+16], f189; +st.shared.f32 [r9+20], f199; +st.shared.f32 [r9+24], f209; +st.shared.f32 [r9+28], f219; +st.shared.f32 [r9+32], f229; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+36]; +ld.shared.f32 f234, [r11+72]; +ld.shared.f32 f235, [r11+108]; +ld.shared.f32 f236, [r11+144]; +ld.shared.f32 f237, [r11+180]; +ld.shared.f32 f238, [r11+216]; +ld.shared.f32 f239, [r11+252]; +ld.shared.f32 f240, [r11+288]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+36]; +ld.shared.f32 f243, [r11+72]; +ld.shared.f32 f244, [r11+108]; +ld.shared.f32 f245, [r11+144]; +ld.shared.f32 f246, [r11+180]; +ld.shared.f32 f247, [r11+216]; +ld.shared.f32 f248, [r11+252]; 
+ld.shared.f32 f249, [r11+288]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0f3F5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; +mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0f3F5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0f3F5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0f3F5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0f3F5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0f3F5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0fBF248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0fBF248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0fBF7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0fBF7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0fBF7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0fBF7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0fBEAF1D44; +sub.f32 f315, f313, f314; 
+mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0fBEAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f269, f285; +mul.f32 f320, f318, 0f3F000000; +sub.f32 f321, f251, f320; +sub.f32 f322, f269, f285; +mul.f32 f323, f322, 0f3F5DB3D7; +mul.f32 f324, f319, 0f3F000000; +sub.f32 f325, f253, f324; +sub.f32 f326, f267, f283; +mul.f32 f327, f326, 0f3F5DB3D7; +add.f32 f328, f300, f305; +add.f32 f329, f302, f307; +mul.f32 f330, f328, 0f3F000000; +sub.f32 f331, f258, f330; +sub.f32 f332, f302, f307; +mul.f32 f333, f332, 0f3F5DB3D7; +mul.f32 f334, f329, 0f3F000000; +sub.f32 f335, f264, f334; +sub.f32 f336, f300, f305; +mul.f32 f337, f336, 0f3F5DB3D7; +add.f32 f338, f310, f315; +add.f32 f339, f312, f317; +mul.f32 f340, f338, 0f3F000000; +sub.f32 f341, f259, f340; +sub.f32 f342, f312, f317; +mul.f32 f343, f342, 0f3F5DB3D7; +mul.f32 f344, f339, 0f3F000000; +sub.f32 f345, f265, f344; +sub.f32 f346, f310, f315; +mul.f32 f347, f346, 0f3F5DB3D7; +add.f32 %0, f251, f318; +add.f32 %1, f253, f319; +add.f32 %3, f264, f329; +add.f32 %2, f258, f328; +add.f32 %5, f265, f339; +add.f32 %4, f259, f338; +add.f32 %6, f323, f321; +sub.f32 %7, f325, f327; +sub.f32 %9, f335, f337; +add.f32 %8, f333, f331; +sub.f32 %11, f345, f347; +add.f32 %10, f343, f341; +sub.f32 %12, f321, f323; +add.f32 %13, f327, f325; +add.f32 %15, f337, f335; +sub.f32 %14, f331, f333; +add.f32 %17, f347, f345; +sub.f32 %16, f341, f343; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), 
"f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<129, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1386>; +.reg .b32 r<14>; +.reg .b64 rd<11>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 648, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1385, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1384, %57, f1385; +mul.f32 f119, f1385, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1383, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1382, %63, f1383; +mul.f32 f135, f1383, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1381, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1380, %69, f1381; +mul.f32 f151, f1381, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; +mul.f32 f1379, f133, 0f3F441B7D; +sub.f32 f159, f1379, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1377, f149, 0f3E31D0D4; +mul.f32 f1378, f155, 
0fBF7C1C5C; +sub.f32 f164, f1377, f1378; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1375, f134, 0f3E31D0D4; +mul.f32 f1376, f140, 0fBF7C1C5C; +sub.f32 f169, f1375, f1376; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1373, f150, 0fBF708FB2; +mul.f32 f1374, f156, 0fBEAF1D44; +sub.f32 f174, f1373, f1374; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1372, f1382, f1380; +sub.f32 f183, f1382, f1380; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1371, f1384, f1372; +mul.f32 f187, f1372, 0f3F000000; +sub.f32 f188, f1384, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1370, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1369, f123, f1370; +mul.f32 f203, f1370, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1368, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1367, f124, f1368; +mul.f32 f219, f1368, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1364, %111, %110; +sub.f32 f231, 
%111, %110; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1362, %112, f1364; +mul.f32 f235, f1364, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1359, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1357, %115, f1359; +mul.f32 f251, f1359, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1354, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1352, %118, f1354; +mul.f32 f267, f1354, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1351, f249, 0f3F441B7D; +sub.f32 f275, f1351, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1350, f265, 0f3E31D0D4; +sub.f32 f280, f1350, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1348, f250, 0f3E31D0D4; +mul.f32 f1349, f256, 0fBF7C1C5C; +sub.f32 f285, f1348, f1349; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1346, f266, 0fBF708FB2; +mul.f32 f1347, f272, 0fBEAF1D44; +sub.f32 f290, f1346, f1347; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, 
f226, f297; +add.f32 f1345, f1357, f1352; +sub.f32 f299, f1357, f1352; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1344, f1362, f1345; +mul.f32 f303, f1345, 0f3F000000; +sub.f32 f304, f1362, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1343, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1342, f239, f1343; +mul.f32 f319, f1343, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1341, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1340, f240, f1341; +mul.f32 f335, f1341, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1337, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1335, %121, f1337; +mul.f32 f351, f1337, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1332, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 
f1330, %124, f1332; +mul.f32 f367, f1332, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1328, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1326, %126, f1328; +mul.f32 f383, f1328, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1325, f365, 0f3F441B7D; +sub.f32 f391, f1325, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1324, f381, 0f3E31D0D4; +sub.f32 f396, f1324, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1322, f366, 0f3E31D0D4; +mul.f32 f1323, f372, 0fBF7C1C5C; +sub.f32 f401, f1322, f1323; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1320, f382, 0fBF708FB2; +mul.f32 f1321, f388, 0fBEAF1D44; +sub.f32 f406, f1320, f1321; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1319, f1330, f1326; +sub.f32 f415, f1330, f1326; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1318, f1335, f1319; +mul.f32 f419, f1319, 0f3F000000; +sub.f32 f420, f1335, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1317, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 
0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1316, f355, f1317; +mul.f32 f435, f1317, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1315, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1314, f356, f1315; +mul.f32 f451, f1315, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1342, 0fBE6C2691; +mul.f32 f1313, f310, 0f3F791978; +sub.f32 f459, f1313, f458; +mul.f32 f460, f1342, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1311, f426, 0f3F64C51C; +mul.f32 f1312, f1316, 0fBEE5C902; +sub.f32 f464, f1311, f1312; +mul.f32 f465, f1316, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1309, f326, 0f3F64C51C; +mul.f32 f1310, f1340, 0fBEE5C902; +sub.f32 f469, f1309, f1310; +mul.f32 f470, f1340, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1307, f442, 0f3F18DF63; +mul.f32 f1308, f1314, 0fBF4D57F2; +sub.f32 f474, f1307, f1308; +mul.f32 f475, f1314, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1305, f301, 0f3F441B7D; +mul.f32 f1306, f307, 0fBF248DBB; +sub.f32 f479, f1305, f1306; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1304, f417, 0f3E31D0D4; +sub.f32 f484, f1304, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 f1303, f317, 0f3F18DF63; +sub.f32 f489, f1303, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1302, f433, 
0fBE92D7E0; +sub.f32 f494, f1302, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1301, f333, 0f3ECACAF8; +sub.f32 f499, f1301, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1300, f449, 0fBF2FAD88; +sub.f32 f504, f1300, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1299, f302, 0f3E31D0D4; +sub.f32 f509, f1299, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1297, f418, 0fBF708FB2; +mul.f32 f1298, f424, 0fBEAF1D44; +sub.f32 f514, f1297, f1298; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1295, f318, 0fBD6E2946; +mul.f32 f1296, f324, 0fBF7F9120; +sub.f32 f519, f1295, f1296; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1293, f434, 0fBF7E44DE; +mul.f32 f1294, f440, 0f3DEDC21F; +sub.f32 f524, f1293, f1294; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1292, f334, 0fBE92D7E0; +sub.f32 f529, f1292, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f1291, f450, 0fBF55E287; +sub.f32 f534, f1291, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1290, f1344, f1318; +sub.f32 f541, f1344, f1318; +mul.f32 f542, f541, 0f3F5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1290, 0f3F000000; +sub.f32 f546, f1371, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0f3F5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, 
f555; +add.f32 f1289, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0f3F5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1288, f1369, f1289; +mul.f32 f561, f1289, 0f3F000000; +sub.f32 f562, f1369, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0f3F5DB3D7; +sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1287, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0f3F5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1286, f1367, f1287; +mul.f32 f577, f1287, 0f3F000000; +sub.f32 f578, f1367, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0f3F5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1285, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0f3F5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1284, f191, f1285; +mul.f32 f593, f1285, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0f3F5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1283, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0f3F5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1282, f207, f1283; +mul.f32 f609, f1283, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0f3F5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1281, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0f3F5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; 
+add.f32 f1280, f223, f1281; +mul.f32 f625, f1281, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0f3F5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, f186, f635; +add.f32 f1279, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0f3F5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1278, f192, f1279; +mul.f32 f641, f1279, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0f3F5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1277, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0f3F5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1276, f208, f1277; +mul.f32 f657, f1277, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0f3F5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1275, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0f3F5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1274, f224, f1275; +mul.f32 f673, f1275, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0f3F5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 648, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f684, f680, f1288; +mul.f32 f685, f679, f1288; +mul.f32 f1272, f679, f679; +mul.f32 f1273, f680, f680; +sub.f32 f688, 
f1272, f1273; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f692, f690, f1286; +mul.f32 f693, f688, f1286; +mul.f32 f695, f680, f690; +mul.f32 f1271, f679, f688; +sub.f32 f696, f1271, f695; +mul.f32 f1270, f688, f568; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; +mul.f32 f700, f698, f1284; +mul.f32 f701, f696, f1284; +mul.f32 f1268, f679, f696; +mul.f32 f1269, f680, f698; +sub.f32 f704, f1268, f1269; +mul.f32 f1267, f696, f584; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f708, f706, f1282; +mul.f32 f709, f704, f1282; +mul.f32 f711, f680, f706; +mul.f32 f1266, f679, f704; +sub.f32 f712, f1266, f711; +mul.f32 f1265, f704, f600; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f716, f714, f1280; +mul.f32 f717, f712, f1280; +mul.f32 f719, f680, f714; +mul.f32 f1264, f679, f712; +sub.f32 f720, f1264, f719; +mul.f32 f1263, f712, f616; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f724, f722, f1278; +mul.f32 f725, f720, f1278; +mul.f32 f1261, f679, f720; +mul.f32 f1262, f680, f722; +sub.f32 f728, f1261, f1262; +mul.f32 f1260, f720, f632; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f732, f730, f1276; +mul.f32 f733, f728, f1276; +mul.f32 f735, f680, f730; +mul.f32 f1259, f679, f728; +sub.f32 f736, f1259, f735; +mul.f32 f1258, f728, f648; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f740, f738, f1274; +mul.f32 f741, f736, f1274; +mul.f32 f743, f680, f738; +mul.f32 f1257, f679, f736; +sub.f32 f744, f1257, f743; +mul.f32 f1256, f736, f664; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f748, f746, f549; +mul.f32 f749, f744, f549; +mul.f32 f1254, f679, f744; +mul.f32 f1255, f680, f746; +sub.f32 f752, f1254, f1255; +mul.f32 f1253, f744, f543; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f756, f754, f565; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, 
f754; +mul.f32 f1252, f679, f752; +sub.f32 f760, f1252, f759; +mul.f32 f1251, f752, f559; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f764, f762, f581; +mul.f32 f765, f760, f581; +mul.f32 f1249, f679, f760; +mul.f32 f1250, f680, f762; +sub.f32 f768, f1249, f1250; +mul.f32 f1248, f760, f575; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f772, f770, f597; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f1247, f679, f768; +sub.f32 f776, f1247, f775; +mul.f32 f1246, f768, f591; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f780, f778, f613; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1245, f679, f776; +sub.f32 f784, f1245, f783; +mul.f32 f1244, f776, f607; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f788, f786, f629; +mul.f32 f789, f784, f629; +mul.f32 f1242, f679, f784; +mul.f32 f1243, f680, f786; +sub.f32 f792, f1242, f1243; +mul.f32 f1241, f784, f623; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f796, f794, f645; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f1240, f679, f792; +sub.f32 f800, f1240, f799; +mul.f32 f1239, f792, f639; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f804, f802, f661; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f1238, f679, f800; +sub.f32 f808, f1238, f807; +mul.f32 f1237, f800, f655; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f812, f810, f677; +mul.f32 f813, f808, f677; +mul.f32 f1235, f679, f808; +mul.f32 f1236, f680, f810; +sub.f32 f816, f1235, f1236; +mul.f32 f1234, f808, f671; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f820, f818, f550; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f1233, f679, f816; +sub.f32 f824, f1233, f823; +mul.f32 f1232, f816, f544; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f828, 
f826, f566; +mul.f32 f829, f824, f566; +mul.f32 f1230, f679, f824; +mul.f32 f1231, f680, f826; +sub.f32 f832, f1230, f1231; +mul.f32 f1229, f824, f560; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f836, f834, f582; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 f1228, f679, f832; +sub.f32 f840, f1228, f839; +mul.f32 f1227, f832, f576; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f844, f842, f598; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1226, f679, f840; +sub.f32 f848, f1226, f847; +mul.f32 f1225, f840, f592; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f852, f850, f614; +mul.f32 f853, f848, f614; +mul.f32 f1223, f679, f848; +mul.f32 f1224, f680, f850; +sub.f32 f856, f1223, f1224; +mul.f32 f1222, f848, f608; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f860, f858, f630; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f1221, f679, f856; +sub.f32 f864, f1221, f863; +mul.f32 f1220, f856, f624; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f868, f866, f646; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1219, f679, f864; +sub.f32 f872, f1219, f871; +mul.f32 f1218, f864, f640; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f876, f874, f662; +mul.f32 f877, f872, f662; +mul.f32 f1216, f679, f872; +mul.f32 f1217, f680, f874; +sub.f32 f880, f1216, f1217; +mul.f32 f1215, f679, f552; +mul.f32 f881, f679, f874; +mul.f32 f1214, f872, f656; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f880, f672; +mul.f32 f884, f882, f678; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +add.f32 f886, f1371, f1290; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r9], {f887, f886}; +fma.rn.f32 f888, f680, f552, f685; +sub.f32 f889, f1215, f684; +st.shared.v2.f32 [r9+8], {f889, f888}; +fma.rn.f32 f890, f690, f568, f693; +sub.f32 f891, f1270, 
f692; +st.shared.v2.f32 [r9+16], {f891, f890}; +fma.rn.f32 f892, f698, f584, f701; +sub.f32 f893, f1267, f700; +st.shared.v2.f32 [r9+24], {f893, f892}; +fma.rn.f32 f894, f706, f600, f709; +sub.f32 f895, f1265, f708; +st.shared.v2.f32 [r9+32], {f895, f894}; +fma.rn.f32 f896, f714, f616, f717; +sub.f32 f897, f1263, f716; +st.shared.v2.f32 [r9+40], {f897, f896}; +fma.rn.f32 f898, f722, f632, f725; +sub.f32 f899, f1260, f724; +st.shared.v2.f32 [r9+48], {f899, f898}; +sub.f32 f900, f1258, f732; +fma.rn.f32 f901, f730, f648, f733; +st.shared.v2.f32 [r9+56], {f900, f901}; +fma.rn.f32 f902, f738, f664, f741; +sub.f32 f903, f1256, f740; +st.shared.v2.f32 [r9+64], {f903, f902}; +fma.rn.f32 f904, f746, f543, f749; +sub.f32 f905, f1253, f748; +st.shared.v2.f32 [r9+72], {f905, f904}; +fma.rn.f32 f906, f754, f559, f757; +sub.f32 f907, f1251, f756; +st.shared.v2.f32 [r9+80], {f907, f906}; +fma.rn.f32 f908, f762, f575, f765; +sub.f32 f909, f1248, f764; +st.shared.v2.f32 [r9+88], {f909, f908}; +fma.rn.f32 f910, f770, f591, f773; +sub.f32 f911, f1246, f772; +st.shared.v2.f32 [r9+96], {f911, f910}; +fma.rn.f32 f912, f778, f607, f781; +sub.f32 f913, f1244, f780; +st.shared.v2.f32 [r9+104], {f913, f912}; +fma.rn.f32 f914, f786, f623, f789; +sub.f32 f915, f1241, f788; +st.shared.v2.f32 [r9+112], {f915, f914}; +fma.rn.f32 f916, f794, f639, f797; +sub.f32 f917, f1239, f796; +st.shared.v2.f32 [r9+120], {f917, f916}; +fma.rn.f32 f918, f802, f655, f805; +sub.f32 f919, f1237, f804; +st.shared.v2.f32 [r9+128], {f919, f918}; +fma.rn.f32 f920, f810, f671, f813; +sub.f32 f921, f1234, f812; +st.shared.v2.f32 [r9+136], {f921, f920}; +fma.rn.f32 f922, f818, f544, f821; +sub.f32 f923, f1232, f820; +st.shared.v2.f32 [r9+144], {f923, f922}; +fma.rn.f32 f924, f826, f560, f829; +sub.f32 f925, f1229, f828; +st.shared.v2.f32 [r9+152], {f925, f924}; +fma.rn.f32 f926, f834, f576, f837; +sub.f32 f927, f1227, f836; +st.shared.v2.f32 [r9+160], {f927, f926}; +fma.rn.f32 f928, f842, f592, f845; +sub.f32 f929, 
f1225, f844; +st.shared.v2.f32 [r9+168], {f929, f928}; +fma.rn.f32 f930, f850, f608, f853; +sub.f32 f931, f1222, f852; +st.shared.v2.f32 [r9+176], {f931, f930}; +fma.rn.f32 f932, f858, f624, f861; +sub.f32 f933, f1220, f860; +st.shared.v2.f32 [r9+184], {f933, f932}; +fma.rn.f32 f934, f866, f640, f869; +sub.f32 f935, f1218, f868; +st.shared.v2.f32 [r9+192], {f935, f934}; +fma.rn.f32 f936, f874, f656, f877; +sub.f32 f937, f1214, f876; +st.shared.v2.f32 [r9+200], {f937, f936}; +fma.rn.f32 f938, f882, f672, f885; +sub.f32 f939, f883, f884; +st.shared.v2.f32 [r9+208], {f939, f938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+24]; +ld.shared.v2.f32 {f948, f949}, [r10+48]; +ld.shared.v2.f32 {f952, f953}, [r10+72]; +ld.shared.v2.f32 {f956, f957}, [r10+96]; +ld.shared.v2.f32 {f960, f961}, [r10+120]; +ld.shared.v2.f32 {f964, f965}, [r10+144]; +ld.shared.v2.f32 {f968, f969}, [r10+168]; +ld.shared.v2.f32 {f972, f973}, [r10+192]; +ld.shared.v2.f32 {f976, f977}, [r10+216]; +ld.shared.v2.f32 {f980, f981}, [r10+240]; +ld.shared.v2.f32 {f984, f985}, [r10+264]; +ld.shared.v2.f32 {f988, f989}, [r10+288]; +ld.shared.v2.f32 {f992, f993}, [r10+312]; +ld.shared.v2.f32 {f996, f997}, [r10+336]; +ld.shared.v2.f32 {f1000, f1001}, [r10+360]; +ld.shared.v2.f32 {f1004, f1005}, [r10+384]; +ld.shared.v2.f32 {f1008, f1009}, [r10+408]; +ld.shared.v2.f32 {f1012, f1013}, [r10+432]; +ld.shared.v2.f32 {f1016, f1017}, [r10+456]; +ld.shared.v2.f32 {f1020, f1021}, [r10+480]; +ld.shared.v2.f32 {f1024, f1025}, [r10+504]; +ld.shared.v2.f32 {f1028, f1029}, [r10+528]; +ld.shared.v2.f32 {f1032, f1033}, [r10+552]; +ld.shared.v2.f32 {f1036, f1037}, [r10+576]; +ld.shared.v2.f32 {f1040, f1041}, [r10+600]; +ld.shared.v2.f32 {f1044, f1045}, [r10+624]; +add.f32 f1048, f976, f1012; +mul.f32 f1050, f1048, 0f3F000000; +sub.f32 f1051, f940, f1050; +add.f32 f1213, f977, f1013; +sub.f32 f1052, f977, f1013; +mul.f32 f1053, f1052, 0f3F5DB3D7; 
+mul.f32 f1054, f1213, 0f3F000000; +sub.f32 f1055, f941, f1054; +sub.f32 f1056, f976, f1012; +mul.f32 f1057, f1056, 0f3F5DB3D7; +add.f32 f1058, f980, f1016; +mul.f32 f1060, f1058, 0f3F000000; +sub.f32 f1061, f944, f1060; +add.f32 f1212, f981, f1017; +sub.f32 f1062, f981, f1017; +mul.f32 f1063, f1062, 0f3F5DB3D7; +mul.f32 f1064, f1212, 0f3F000000; +sub.f32 f1065, f945, f1064; +sub.f32 f1066, f980, f1016; +mul.f32 f1067, f1066, 0f3F5DB3D7; +add.f32 f1068, f984, f1020; +mul.f32 f1070, f1068, 0f3F000000; +sub.f32 f1071, f948, f1070; +add.f32 f1211, f985, f1021; +sub.f32 f1072, f985, f1021; +mul.f32 f1073, f1072, 0f3F5DB3D7; +mul.f32 f1074, f1211, 0f3F000000; +sub.f32 f1075, f949, f1074; +sub.f32 f1076, f984, f1020; +mul.f32 f1077, f1076, 0f3F5DB3D7; +add.f32 f1078, f988, f1024; +mul.f32 f1080, f1078, 0f3F000000; +sub.f32 f1081, f952, f1080; +add.f32 f1210, f989, f1025; +sub.f32 f1082, f989, f1025; +mul.f32 f1083, f1082, 0f3F5DB3D7; +mul.f32 f1084, f1210, 0f3F000000; +sub.f32 f1085, f953, f1084; +sub.f32 f1086, f988, f1024; +mul.f32 f1087, f1086, 0f3F5DB3D7; +add.f32 f1088, f992, f1028; +mul.f32 f1090, f1088, 0f3F000000; +sub.f32 f1091, f956, f1090; +add.f32 f1209, f993, f1029; +sub.f32 f1092, f993, f1029; +mul.f32 f1093, f1092, 0f3F5DB3D7; +mul.f32 f1094, f1209, 0f3F000000; +sub.f32 f1095, f957, f1094; +sub.f32 f1096, f992, f1028; +mul.f32 f1097, f1096, 0f3F5DB3D7; +add.f32 f1098, f996, f1032; +mul.f32 f1100, f1098, 0f3F000000; +sub.f32 f1101, f960, f1100; +add.f32 f1208, f997, f1033; +sub.f32 f1102, f997, f1033; +mul.f32 f1103, f1102, 0f3F5DB3D7; +mul.f32 f1104, f1208, 0f3F000000; +sub.f32 f1105, f961, f1104; +sub.f32 f1106, f996, f1032; +mul.f32 f1107, f1106, 0f3F5DB3D7; +add.f32 f1108, f1000, f1036; +mul.f32 f1110, f1108, 0f3F000000; +sub.f32 f1111, f964, f1110; +add.f32 f1207, f1001, f1037; +sub.f32 f1112, f1001, f1037; +mul.f32 f1113, f1112, 0f3F5DB3D7; +mul.f32 f1114, f1207, 0f3F000000; +sub.f32 f1115, f965, f1114; +sub.f32 f1116, f1000, f1036; +mul.f32 f1117, 
f1116, 0f3F5DB3D7; +add.f32 f1118, f1004, f1040; +mul.f32 f1120, f1118, 0f3F000000; +sub.f32 f1121, f968, f1120; +add.f32 f1206, f1005, f1041; +sub.f32 f1122, f1005, f1041; +mul.f32 f1123, f1122, 0f3F5DB3D7; +mul.f32 f1124, f1206, 0f3F000000; +sub.f32 f1125, f969, f1124; +sub.f32 f1126, f1004, f1040; +mul.f32 f1127, f1126, 0f3F5DB3D7; +add.f32 f1128, f1008, f1044; +mul.f32 f1130, f1128, 0f3F000000; +sub.f32 f1131, f972, f1130; +add.f32 f1205, f1009, f1045; +sub.f32 f1132, f1009, f1045; +mul.f32 f1133, f1132, 0f3F5DB3D7; +mul.f32 f1134, f1205, 0f3F000000; +sub.f32 f1135, f973, f1134; +sub.f32 f1136, f1008, f1044; +mul.f32 f1137, f1136, 0f3F5DB3D7; +add.f32 %1, f941, f1213; +add.f32 %0, f940, f1048; +add.f32 %3, f945, f1212; +add.f32 %2, f944, f1058; +add.f32 %5, f949, f1211; +add.f32 %4, f948, f1068; +add.f32 %7, f953, f1210; +add.f32 %6, f952, f1078; +add.f32 %9, f957, f1209; +add.f32 %8, f956, f1088; +add.f32 %11, f961, f1208; +add.f32 %10, f960, f1098; +add.f32 %13, f965, f1207; +add.f32 %12, f964, f1108; +add.f32 %15, f969, f1206; +add.f32 %14, f968, f1118; +add.f32 %17, f973, f1205; +add.f32 %16, f972, f1128; +add.f32 %18, f1053, f1051; +sub.f32 %19, f1055, f1057; +sub.f32 %21, f1065, f1067; +add.f32 %20, f1063, f1061; +sub.f32 %23, f1075, f1077; +add.f32 %22, f1073, f1071; +add.f32 %24, f1083, f1081; +sub.f32 %25, f1085, f1087; +add.f32 %26, f1093, f1091; +sub.f32 %27, f1095, f1097; +add.f32 %28, f1103, f1101; +sub.f32 %29, f1105, f1107; +add.f32 %30, f1113, f1111; +sub.f32 %31, f1115, f1117; +sub.f32 %33, f1125, f1127; +add.f32 %32, f1123, f1121; +sub.f32 %35, f1135, f1137; +add.f32 %34, f1133, f1131; +add.f32 %37, f1057, f1055; +sub.f32 %36, f1051, f1053; +add.f32 %39, f1067, f1065; +sub.f32 %38, f1061, f1063; +add.f32 %41, f1077, f1075; +sub.f32 %40, f1071, f1073; +add.f32 %43, f1087, f1085; +sub.f32 %42, f1081, f1083; +add.f32 %45, f1097, f1095; +sub.f32 %44, f1091, f1093; +add.f32 %47, f1107, f1105; +sub.f32 %46, f1101, f1103; +add.f32 %49, f1117, f1115; 
+sub.f32 %48, f1111, f1113; +add.f32 %51, f1127, f1125; +sub.f32 %50, f1121, f1123; +add.f32 %53, f1137, f1135; +sub.f32 %52, f1131, f1133; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), 
"f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<128, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1351>; +.reg .b32 r<14>; +.reg .b64 rd<8>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 324, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1342, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0f3F5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1341, %57, f1342; +mul.f32 f119, f1342, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0f3F5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1340, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0f3F5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1339, %63, f1340; +mul.f32 f135, f1340, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0f3F5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1338, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0f3F5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1337, %69, f1338; +mul.f32 f151, f1338, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0f3F5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0fBF248DBB; 
+mul.f32 f1336, f133, 0f3F441B7D; +sub.f32 f159, f1336, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0fBF248DBB, f160; +mul.f32 f1334, f149, 0f3E31D0D4; +mul.f32 f1335, f155, 0fBF7C1C5C; +sub.f32 f164, f1334, f1335; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0fBF7C1C5C, f165; +mul.f32 f1332, f134, 0f3E31D0D4; +mul.f32 f1333, f140, 0fBF7C1C5C; +sub.f32 f169, f1332, f1333; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0fBF7C1C5C, f170; +mul.f32 f1330, f150, 0fBF708FB2; +mul.f32 f1331, f156, 0fBEAF1D44; +sub.f32 f174, f1330, f1331; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0fBEAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1329, f1339, f1337; +sub.f32 f183, f1339, f1337; +mul.f32 f184, f183, 0f3F5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1328, f1341, f1329; +mul.f32 f187, f1329, 0f3F000000; +sub.f32 f188, f1341, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0f3F5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1327, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0f3F5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1326, f123, f1327; +mul.f32 f203, f1327, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0f3F5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1325, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0f3F5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1324, f124, f1325; +mul.f32 f219, f1325, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0f3F5DB3D7; +sub.f32 f223, 
f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1321, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0f3F5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1319, %112, f1321; +mul.f32 f235, f1321, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0f3F5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1316, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0f3F5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1314, %115, f1316; +mul.f32 f251, f1316, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0f3F5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1311, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0f3F5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1309, %118, f1311; +mul.f32 f267, f1311, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0f3F5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0fBF248DBB; +mul.f32 f1308, f249, 0f3F441B7D; +sub.f32 f275, f1308, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0fBF248DBB, f276; +mul.f32 f279, f271, 0fBF7C1C5C; +mul.f32 f1307, f265, 0f3E31D0D4; +sub.f32 f280, f1307, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0fBF7C1C5C, f281; +mul.f32 f1305, f250, 0f3E31D0D4; +mul.f32 f1306, f256, 0fBF7C1C5C; +sub.f32 f285, f1305, f1306; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0fBF7C1C5C, f286; +mul.f32 f1303, f266, 0fBF708FB2; +mul.f32 f1304, f272, 0fBEAF1D44; +sub.f32 f290, 
f1303, f1304; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0fBEAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1302, f1314, f1309; +sub.f32 f299, f1314, f1309; +mul.f32 f300, f299, 0f3F5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1301, f1319, f1302; +mul.f32 f303, f1302, 0f3F000000; +sub.f32 f304, f1319, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0f3F5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1300, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0f3F5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1299, f239, f1300; +mul.f32 f319, f1300, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0f3F5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1298, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0f3F5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1297, f240, f1298; +mul.f32 f335, f1298, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0f3F5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1294, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0f3F5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1292, %121, f1294; +mul.f32 f351, f1294, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0f3F5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, 
f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1289, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0f3F5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1287, %124, f1289; +mul.f32 f367, f1289, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0f3F5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1285, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0f3F5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1283, %126, f1285; +mul.f32 f383, f1285, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0f3F5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0fBF248DBB; +mul.f32 f1282, f365, 0f3F441B7D; +sub.f32 f391, f1282, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0fBF248DBB, f392; +mul.f32 f395, f387, 0fBF7C1C5C; +mul.f32 f1281, f381, 0f3E31D0D4; +sub.f32 f396, f1281, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0fBF7C1C5C, f397; +mul.f32 f1279, f366, 0f3E31D0D4; +mul.f32 f1280, f372, 0fBF7C1C5C; +sub.f32 f401, f1279, f1280; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0fBF7C1C5C, f402; +mul.f32 f1277, f382, 0fBF708FB2; +mul.f32 f1278, f388, 0fBEAF1D44; +sub.f32 f406, f1277, f1278; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0fBEAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1276, f1287, f1283; +sub.f32 f415, f1287, f1283; +mul.f32 f416, f415, 0f3F5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1275, f1292, f1276; +mul.f32 f419, f1276, 0f3F000000; +sub.f32 f420, f1292, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0f3F5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; 
+add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1274, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0f3F5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1273, f355, f1274; +mul.f32 f435, f1274, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0f3F5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1272, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0f3F5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1271, f356, f1272; +mul.f32 f451, f1272, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0f3F5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1299, 0fBE6C2691; +mul.f32 f1270, f310, 0f3F791978; +sub.f32 f459, f1270, f458; +mul.f32 f460, f1299, 0f3F791978; +fma.rn.f32 f461, f310, 0fBE6C2691, f460; +mul.f32 f1268, f426, 0f3F64C51C; +mul.f32 f1269, f1273, 0fBEE5C902; +sub.f32 f464, f1268, f1269; +mul.f32 f465, f1273, 0f3F64C51C; +fma.rn.f32 f466, f426, 0fBEE5C902, f465; +mul.f32 f1266, f326, 0f3F64C51C; +mul.f32 f1267, f1297, 0fBEE5C902; +sub.f32 f469, f1266, f1267; +mul.f32 f470, f1297, 0f3F64C51C; +fma.rn.f32 f471, f326, 0fBEE5C902, f470; +mul.f32 f1264, f442, 0f3F18DF63; +mul.f32 f1265, f1271, 0fBF4D57F2; +sub.f32 f474, f1264, f1265; +mul.f32 f475, f1271, 0f3F18DF63; +fma.rn.f32 f476, f442, 0fBF4D57F2, f475; +mul.f32 f1262, f301, 0f3F441B7D; +mul.f32 f1263, f307, 0fBF248DBB; +sub.f32 f479, f1262, f1263; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0fBF248DBB, f480; +mul.f32 f483, f423, 0fBF7C1C5C; +mul.f32 f1261, f417, 0f3E31D0D4; +sub.f32 f484, f1261, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0fBF7C1C5C, f485; +mul.f32 f488, f323, 0fBF4D57F2; +mul.f32 
f1260, f317, 0f3F18DF63; +sub.f32 f489, f1260, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0fBF4D57F2, f490; +mul.f32 f493, f439, 0fBF753ECD; +mul.f32 f1259, f433, 0fBE92D7E0; +sub.f32 f494, f1259, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0fBF753ECD, f495; +mul.f32 f498, f339, 0fBF6B1036; +mul.f32 f1258, f333, 0f3ECACAF8; +sub.f32 f499, f1258, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0fBF6B1036, f500; +mul.f32 f503, f455, 0fBF3A3529; +mul.f32 f1257, f449, 0fBF2FAD88; +sub.f32 f504, f1257, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0fBF3A3529, f505; +mul.f32 f508, f308, 0fBF7C1C5C; +mul.f32 f1256, f302, 0f3E31D0D4; +sub.f32 f509, f1256, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0fBF7C1C5C, f510; +mul.f32 f1254, f418, 0fBF708FB2; +mul.f32 f1255, f424, 0fBEAF1D44; +sub.f32 f514, f1254, f1255; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0fBEAF1D44, f515; +mul.f32 f1252, f318, 0fBD6E2946; +mul.f32 f1253, f324, 0fBF7F9120; +sub.f32 f519, f1252, f1253; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0fBF7F9120, f520; +mul.f32 f1250, f434, 0fBF7E44DE; +mul.f32 f1251, f440, 0f3DEDC21F; +sub.f32 f524, f1250, f1251; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0f3DEDC21F, f525; +mul.f32 f528, f340, 0fBF753ECD; +mul.f32 f1249, f334, 0fBE92D7E0; +sub.f32 f529, f1249, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0fBF753ECD, f530; +mul.f32 f533, f456, 0f3F0CAC9F; +mul.f32 f1248, f450, 0fBF55E287; +sub.f32 f534, f1248, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0f3F0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1247, f1301, f1275; +sub.f32 f543, f1301, f1275; +mul.f32 f544, f543, 0f3F5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1246, f1328, f1247; +mul.f32 f547, f1247, 0f3F000000; +sub.f32 
f548, f1328, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0f3F5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1245, f461, f466; +sub.f32 f559, f461, f466; +mul.f32 f560, f559, 0f3F5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1244, f1326, f1245; +mul.f32 f563, f1245, 0f3F000000; +sub.f32 f564, f1326, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0f3F5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1243, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0f3F5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1242, f1324, f1243; +mul.f32 f579, f1243, 0f3F000000; +sub.f32 f580, f1324, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0f3F5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1241, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0f3F5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1240, f191, f1241; +mul.f32 f595, f1241, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0f3F5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1239, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0f3F5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1238, f207, f1239; +mul.f32 f611, f1239, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0f3F5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, 
f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1237, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0f3F5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1236, f223, f1237; +mul.f32 f627, f1237, 0f3F000000; +sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0f3F5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1235, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0f3F5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1234, f192, f1235; +mul.f32 f643, f1235, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0f3F5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1233, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0f3F5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1232, f208, f1233; +mul.f32 f659, f1233, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0f3F5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1231, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0f3F5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1230, f224, f1231; +mul.f32 f675, f1231, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0f3F5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; 
+mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f686, f682, f1244; +mul.f32 f1229, f681, f554; +sub.f32 f687, f1229, f686; +mul.f32 f688, f681, f1244; +fma.rn.f32 f689, f682, f554, f688; +mul.f32 f691, f682, f682; +mul.f32 f1228, f681, f681; +sub.f32 f692, f1228, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f696, f694, f1242; +mul.f32 f1227, f692, f570; +sub.f32 f697, f1227, f696; +mul.f32 f698, f692, f1242; +fma.rn.f32 f699, f694, f570, f698; +mul.f32 f701, f682, f694; +mul.f32 f1226, f681, f692; +sub.f32 f702, f1226, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f706, f704, f1240; +mul.f32 f1225, f702, f586; +sub.f32 f707, f1225, f706; +mul.f32 f708, f702, f1240; +fma.rn.f32 f709, f704, f586, f708; +mul.f32 f1223, f681, f702; +mul.f32 f1224, f682, f704; +sub.f32 f712, f1223, f1224; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f1221, f712, f602; +mul.f32 f1222, f714, f1238; +sub.f32 f717, f1221, f1222; +mul.f32 f718, f712, f1238; +fma.rn.f32 f719, f714, f602, f718; +mul.f32 f1219, f681, f712; +mul.f32 f1220, f682, f714; +sub.f32 f722, f1219, f1220; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f1217, f722, f618; +mul.f32 f1218, f724, f1236; +sub.f32 f727, f1217, f1218; +mul.f32 f728, f722, f1236; +fma.rn.f32 f729, f724, f618, f728; +mul.f32 f731, f682, f724; +mul.f32 f1216, f681, f722; +sub.f32 f732, f1216, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f736, f734, f1234; +mul.f32 f1215, f732, f634; +sub.f32 f737, f1215, f736; +mul.f32 f738, f732, f1234; +fma.rn.f32 f739, f734, f634, f738; +mul.f32 f741, f682, f734; +mul.f32 f1214, f681, f732; +sub.f32 f742, f1214, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f746, f744, f1232; +mul.f32 f1213, f742, f650; +sub.f32 f747, f1213, f746; +mul.f32 f748, f742, f1232; +fma.rn.f32 f749, f744, 
f650, f748; +mul.f32 f751, f682, f744; +mul.f32 f1212, f681, f742; +sub.f32 f752, f1212, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f756, f754, f1230; +mul.f32 f1211, f752, f666; +sub.f32 f757, f1211, f756; +mul.f32 f758, f752, f1230; +fma.rn.f32 f759, f754, f666, f758; +mul.f32 f1209, f681, f752; +mul.f32 f1210, f682, f754; +sub.f32 f762, f1209, f1210; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f1207, f762, f545; +mul.f32 f1208, f764, f551; +sub.f32 f767, f1207, f1208; +mul.f32 f768, f762, f551; +fma.rn.f32 f769, f764, f545, f768; +mul.f32 f1205, f681, f762; +mul.f32 f1206, f682, f764; +sub.f32 f772, f1205, f1206; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f776, f774, f567; +mul.f32 f1204, f772, f561; +sub.f32 f777, f1204, f776; +mul.f32 f778, f772, f567; +fma.rn.f32 f779, f774, f561, f778; +mul.f32 f781, f682, f774; +mul.f32 f1203, f681, f772; +sub.f32 f782, f1203, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f786, f784, f583; +mul.f32 f1202, f782, f577; +sub.f32 f787, f1202, f786; +mul.f32 f788, f782, f583; +fma.rn.f32 f789, f784, f577, f788; +mul.f32 f791, f682, f784; +mul.f32 f1201, f681, f782; +sub.f32 f792, f1201, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f796, f794, f599; +mul.f32 f1200, f792, f593; +sub.f32 f797, f1200, f796; +mul.f32 f798, f792, f599; +fma.rn.f32 f799, f794, f593, f798; +mul.f32 f801, f682, f794; +mul.f32 f1199, f681, f792; +sub.f32 f802, f1199, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f1197, f802, f609; +mul.f32 f1198, f804, f615; +sub.f32 f807, f1197, f1198; +mul.f32 f808, f802, f615; +fma.rn.f32 f809, f804, f609, f808; +mul.f32 f1195, f681, f802; +mul.f32 f1196, f682, f804; +sub.f32 f812, f1195, f1196; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f1193, f812, f625; +mul.f32 f1194, f814, f631; +sub.f32 f817, f1193, 
f1194; +mul.f32 f818, f812, f631; +fma.rn.f32 f819, f814, f625, f818; +mul.f32 f1191, f681, f812; +mul.f32 f1192, f682, f814; +sub.f32 f822, f1191, f1192; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f826, f824, f647; +mul.f32 f1190, f822, f641; +sub.f32 f827, f1190, f826; +mul.f32 f828, f822, f647; +fma.rn.f32 f829, f824, f641, f828; +mul.f32 f831, f682, f824; +mul.f32 f1189, f681, f822; +sub.f32 f832, f1189, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f836, f834, f663; +mul.f32 f1188, f832, f657; +sub.f32 f837, f1188, f836; +mul.f32 f838, f832, f663; +fma.rn.f32 f839, f834, f657, f838; +mul.f32 f841, f682, f834; +mul.f32 f1187, f681, f832; +sub.f32 f842, f1187, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f846, f844, f679; +mul.f32 f1186, f842, f673; +sub.f32 f847, f1186, f846; +mul.f32 f848, f842, f679; +fma.rn.f32 f849, f844, f673, f848; +mul.f32 f1184, f681, f842; +mul.f32 f1185, f682, f844; +sub.f32 f852, f1184, f1185; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f1182, f852, f546; +mul.f32 f1183, f854, f552; +sub.f32 f857, f1182, f1183; +mul.f32 f858, f852, f552; +fma.rn.f32 f859, f854, f546, f858; +mul.f32 f1180, f681, f852; +mul.f32 f1181, f682, f854; +sub.f32 f862, f1180, f1181; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f1178, f862, f562; +mul.f32 f1179, f864, f568; +sub.f32 f867, f1178, f1179; +mul.f32 f868, f862, f568; +fma.rn.f32 f869, f864, f562, f868; +mul.f32 f871, f682, f864; +mul.f32 f1177, f681, f862; +sub.f32 f872, f1177, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f876, f874, f584; +mul.f32 f1176, f872, f578; +sub.f32 f877, f1176, f876; +mul.f32 f878, f872, f584; +fma.rn.f32 f879, f874, f578, f878; +mul.f32 f881, f682, f874; +mul.f32 f1175, f681, f872; +sub.f32 f882, f1175, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f886, f884, 
f600; +mul.f32 f1174, f882, f594; +sub.f32 f887, f1174, f886; +mul.f32 f888, f882, f600; +fma.rn.f32 f889, f884, f594, f888; +mul.f32 f891, f682, f884; +mul.f32 f1173, f681, f882; +sub.f32 f892, f1173, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f1171, f892, f610; +mul.f32 f1172, f894, f616; +sub.f32 f897, f1171, f1172; +mul.f32 f898, f892, f616; +fma.rn.f32 f899, f894, f610, f898; +mul.f32 f1169, f681, f892; +mul.f32 f1170, f682, f894; +sub.f32 f902, f1169, f1170; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f1167, f902, f626; +mul.f32 f1168, f904, f632; +sub.f32 f907, f1167, f1168; +mul.f32 f908, f902, f632; +fma.rn.f32 f909, f904, f626, f908; +mul.f32 f1165, f681, f902; +mul.f32 f1166, f682, f904; +sub.f32 f912, f1165, f1166; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f916, f914, f648; +mul.f32 f1164, f912, f642; +sub.f32 f917, f1164, f916; +mul.f32 f918, f912, f648; +fma.rn.f32 f919, f914, f642, f918; +mul.f32 f921, f682, f914; +mul.f32 f1163, f681, f912; +sub.f32 f922, f1163, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f926, f924, f664; +mul.f32 f1162, f922, f658; +sub.f32 f927, f1162, f926; +mul.f32 f928, f922, f664; +fma.rn.f32 f929, f924, f658, f928; +mul.f32 f931, f682, f924; +mul.f32 f1161, f681, f922; +sub.f32 f932, f1161, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f936, f934, f680; +mul.f32 f1160, f932, f674; +sub.f32 f937, f1160, f936; +mul.f32 f938, f932, f680; +fma.rn.f32 f939, f934, f674, f938; +mad.lo.s32 r8, r5, 324, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f687; +st.shared.f32 [r9+8], f697; +st.shared.f32 [r9+12], f707; +st.shared.f32 [r9+16], f717; +st.shared.f32 [r9+20], f727; +st.shared.f32 [r9+24], f737; +st.shared.f32 [r9+28], f747; +st.shared.f32 [r9+32], f757; +st.shared.f32 [r9+36], f767; +st.shared.f32 [r9+40], f777; 
+st.shared.f32 [r9+44], f787; +st.shared.f32 [r9+48], f797; +st.shared.f32 [r9+52], f807; +st.shared.f32 [r9+56], f817; +st.shared.f32 [r9+60], f827; +st.shared.f32 [r9+64], f837; +st.shared.f32 [r9+68], f847; +st.shared.f32 [r9+72], f857; +st.shared.f32 [r9+76], f867; +st.shared.f32 [r9+80], f877; +st.shared.f32 [r9+84], f887; +st.shared.f32 [r9+88], f897; +st.shared.f32 [r9+92], f907; +st.shared.f32 [r9+96], f917; +st.shared.f32 [r9+100], f927; +st.shared.f32 [r9+104], f937; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+12]; +ld.shared.f32 f942, [r10+24]; +ld.shared.f32 f943, [r10+36]; +ld.shared.f32 f944, [r10+48]; +ld.shared.f32 f945, [r10+60]; +ld.shared.f32 f946, [r10+72]; +ld.shared.f32 f947, [r10+84]; +ld.shared.f32 f948, [r10+96]; +ld.shared.f32 f949, [r10+108]; +ld.shared.f32 f950, [r10+120]; +ld.shared.f32 f951, [r10+132]; +ld.shared.f32 f952, [r10+144]; +ld.shared.f32 f953, [r10+156]; +ld.shared.f32 f954, [r10+168]; +ld.shared.f32 f955, [r10+180]; +ld.shared.f32 f956, [r10+192]; +ld.shared.f32 f957, [r10+204]; +ld.shared.f32 f958, [r10+216]; +ld.shared.f32 f959, [r10+228]; +ld.shared.f32 f960, [r10+240]; +ld.shared.f32 f961, [r10+252]; +ld.shared.f32 f962, [r10+264]; +ld.shared.f32 f963, [r10+276]; +ld.shared.f32 f964, [r10+288]; +ld.shared.f32 f965, [r10+300]; +ld.shared.f32 f966, [r10+312]; +barrier.sync 0; +st.shared.f32 [r9], f1246; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; 
+st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r10]; +ld.shared.f32 f968, [r10+12]; +ld.shared.f32 f969, [r10+24]; +ld.shared.f32 f970, [r10+36]; +ld.shared.f32 f971, [r10+48]; +ld.shared.f32 f972, [r10+60]; +ld.shared.f32 f973, [r10+72]; +ld.shared.f32 f974, [r10+84]; +ld.shared.f32 f975, [r10+96]; +ld.shared.f32 f976, [r10+108]; +ld.shared.f32 f977, [r10+120]; +ld.shared.f32 f978, [r10+132]; +ld.shared.f32 f979, [r10+144]; +ld.shared.f32 f980, [r10+156]; +ld.shared.f32 f981, [r10+168]; +ld.shared.f32 f982, [r10+180]; +ld.shared.f32 f983, [r10+192]; +ld.shared.f32 f984, [r10+204]; +ld.shared.f32 f985, [r10+216]; +ld.shared.f32 f986, [r10+228]; +ld.shared.f32 f987, [r10+240]; +ld.shared.f32 f988, [r10+252]; +ld.shared.f32 f989, [r10+264]; +ld.shared.f32 f990, [r10+276]; +ld.shared.f32 f991, [r10+288]; +ld.shared.f32 f992, [r10+300]; +ld.shared.f32 f993, [r10+312]; +add.f32 f994, f949, f958; +mul.f32 f996, f994, 0f3F000000; +sub.f32 f997, f940, f996; +add.f32 f1159, f976, f985; +sub.f32 f998, f976, f985; +mul.f32 f999, f998, 0f3F5DB3D7; +mul.f32 f1000, f1159, 0f3F000000; +sub.f32 f1001, f967, f1000; +sub.f32 f1002, f949, f958; +mul.f32 f1003, f1002, 0f3F5DB3D7; +add.f32 f1004, f950, f959; +mul.f32 f1006, f1004, 0f3F000000; +sub.f32 f1007, f941, f1006; +add.f32 f1158, f977, f986; +sub.f32 f1008, f977, f986; +mul.f32 f1009, f1008, 0f3F5DB3D7; +mul.f32 f1010, f1158, 0f3F000000; +sub.f32 f1011, f968, f1010; +sub.f32 f1012, f950, f959; +mul.f32 f1013, f1012, 0f3F5DB3D7; +add.f32 f1014, f951, f960; +mul.f32 f1016, f1014, 0f3F000000; +sub.f32 f1017, f942, f1016; +add.f32 f1157, f978, f987; +sub.f32 f1018, f978, f987; +mul.f32 f1019, f1018, 0f3F5DB3D7; +mul.f32 f1020, f1157, 0f3F000000; +sub.f32 f1021, f969, f1020; +sub.f32 f1022, f951, f960; +mul.f32 f1023, f1022, 
0f3F5DB3D7; +add.f32 f1024, f952, f961; +mul.f32 f1026, f1024, 0f3F000000; +sub.f32 f1027, f943, f1026; +add.f32 f1156, f979, f988; +sub.f32 f1028, f979, f988; +mul.f32 f1029, f1028, 0f3F5DB3D7; +mul.f32 f1030, f1156, 0f3F000000; +sub.f32 f1031, f970, f1030; +sub.f32 f1032, f952, f961; +mul.f32 f1033, f1032, 0f3F5DB3D7; +add.f32 f1034, f953, f962; +mul.f32 f1036, f1034, 0f3F000000; +sub.f32 f1037, f944, f1036; +add.f32 f1155, f980, f989; +sub.f32 f1038, f980, f989; +mul.f32 f1039, f1038, 0f3F5DB3D7; +mul.f32 f1040, f1155, 0f3F000000; +sub.f32 f1041, f971, f1040; +sub.f32 f1042, f953, f962; +mul.f32 f1043, f1042, 0f3F5DB3D7; +add.f32 f1044, f954, f963; +mul.f32 f1046, f1044, 0f3F000000; +sub.f32 f1047, f945, f1046; +add.f32 f1154, f981, f990; +sub.f32 f1048, f981, f990; +mul.f32 f1049, f1048, 0f3F5DB3D7; +mul.f32 f1050, f1154, 0f3F000000; +sub.f32 f1051, f972, f1050; +sub.f32 f1052, f954, f963; +mul.f32 f1053, f1052, 0f3F5DB3D7; +add.f32 f1054, f955, f964; +mul.f32 f1056, f1054, 0f3F000000; +sub.f32 f1057, f946, f1056; +add.f32 f1153, f982, f991; +sub.f32 f1058, f982, f991; +mul.f32 f1059, f1058, 0f3F5DB3D7; +mul.f32 f1060, f1153, 0f3F000000; +sub.f32 f1061, f973, f1060; +sub.f32 f1062, f955, f964; +mul.f32 f1063, f1062, 0f3F5DB3D7; +add.f32 f1064, f956, f965; +mul.f32 f1066, f1064, 0f3F000000; +sub.f32 f1067, f947, f1066; +add.f32 f1152, f983, f992; +sub.f32 f1068, f983, f992; +mul.f32 f1069, f1068, 0f3F5DB3D7; +mul.f32 f1070, f1152, 0f3F000000; +sub.f32 f1071, f974, f1070; +sub.f32 f1072, f956, f965; +mul.f32 f1073, f1072, 0f3F5DB3D7; +add.f32 f1074, f957, f966; +mul.f32 f1076, f1074, 0f3F000000; +sub.f32 f1077, f948, f1076; +add.f32 f1151, f984, f993; +sub.f32 f1078, f984, f993; +mul.f32 f1079, f1078, 0f3F5DB3D7; +mul.f32 f1080, f1151, 0f3F000000; +sub.f32 f1081, f975, f1080; +sub.f32 f1082, f957, f966; +mul.f32 f1344, f1153, 0f3F000000; +sub.f32 f1343, f973, f1344; +mul.f32 f1083, f1082, 0f3F5DB3D7; +add.f32 %0, f940, f994; +mul.f32 f1346, f1034, 0f3F000000; 
+sub.f32 f1345, f944, f1346; +add.f32 %1, f967, f1159; +mul.f32 f1348, f1154, 0f3F000000; +sub.f32 f1347, f972, f1348; +mul.f32 f1350, f1024, 0f3F000000; +sub.f32 f1349, f943, f1350; +add.f32 %2, f941, f1004; +add.f32 %3, f968, f1158; +add.f32 %4, f942, f1014; +add.f32 %5, f969, f1157; +add.f32 %6, f943, f1024; +add.f32 %7, f970, f1156; +add.f32 %8, f944, f1034; +add.f32 %9, f971, f1155; +add.f32 %10, f945, f1044; +add.f32 %11, f972, f1154; +add.f32 %12, f946, f1054; +add.f32 %13, f973, f1153; +add.f32 %14, f947, f1064; +add.f32 %15, f974, f1152; +add.f32 %16, f948, f1074; +add.f32 %17, f975, f1151; +sub.f32 %19, f1001, f1003; +add.f32 %18, f999, f997; +sub.f32 %21, f1011, f1013; +add.f32 %20, f1009, f1007; +add.f32 %22, f1019, f1017; +sub.f32 %23, f1021, f1023; +add.f32 %24, f1029, f1349; +sub.f32 %25, f1031, f1033; +add.f32 %26, f1039, f1345; +sub.f32 %27, f1041, f1043; +sub.f32 %29, f1347, f1053; +add.f32 %28, f1049, f1047; +sub.f32 %31, f1343, f1063; +add.f32 %30, f1059, f1057; +sub.f32 %33, f1071, f1073; +add.f32 %32, f1069, f1067; +add.f32 %34, f1079, f1077; +sub.f32 %35, f1081, f1083; +sub.f32 %36, f997, f999; +add.f32 %37, f1003, f1001; +sub.f32 %38, f1007, f1009; +add.f32 %39, f1013, f1011; +sub.f32 %40, f1017, f1019; +add.f32 %41, f1023, f1021; +sub.f32 %42, f1349, f1029; +add.f32 %43, f1033, f1031; +sub.f32 %44, f1345, f1039; +add.f32 %45, f1043, f1041; +sub.f32 %46, f1047, f1049; +add.f32 %47, f1053, f1347; +sub.f32 %48, f1057, f1059; +add.f32 %49, f1063, f1343; +sub.f32 %50, f1067, f1069; +add.f32 %51, f1073, f1071; +sub.f32 %52, f1077, f1079; +add.f32 %53, f1083, f1081; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), 
"=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<130, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<170>; +.reg .b32 r<28>; +.reg 
.b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 648, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %12, %15; +add.f32 f14, %14, %16; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %10, f15; +sub.f32 f17, %14, %16; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %11, f21; +sub.f32 f23, %12, %15; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 648, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r13, r11, 24, r12; +add.f32 f42, %11, f14; +add.f32 f43, %10, f13; +st.shared.v2.f32 [r13], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r13+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r13+16], {f46, f47}; +barrier.sync 0; +shl.b32 r14, r11, 4; +sub.s32 r15, r13, r14; +ld.shared.v2.f32 {f48, f49}, [r15]; +ld.shared.v2.f32 {f52, f53}, [r15+216]; +ld.shared.v2.f32 {f56, f57}, [r15+432]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0f3F5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 
rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f74, f66; +mul.f32 f79, f75, f72; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f83, f67; +mul.f32 f87, f85, f73; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r21, r16, 72, r20; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r21], {f90, f89}; +fma.rn.f32 f91, f75, f66, f80; +sub.f32 f92, f78, f79; +st.shared.v2.f32 [r21+24], {f92, f91}; +fma.rn.f32 f93, f85, f67, f88; +sub.f32 f94, f86, f87; +st.shared.v2.f32 [r21+48], {f94, f93}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r15]; +ld.shared.v2.f32 {f99, f100}, [r15+216]; +ld.shared.v2.f32 {f103, f104}, [r15+432]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0f3F5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0f3F5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 3; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f121, f113; +mul.f32 f126, f122, f119; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f130, f114; +mul.f32 f134, f132, f120; +mul.f32 f135, f130, f120; +barrier.sync 0; 
+mad.lo.s32 r27, r22, 216, r26; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r27], {f137, f136}; +fma.rn.f32 f138, f122, f113, f127; +sub.f32 f139, f125, f126; +st.shared.v2.f32 [r27+72], {f139, f138}; +fma.rn.f32 f140, f132, f114, f135; +sub.f32 f141, f133, f134; +st.shared.v2.f32 [r27+144], {f141, f140}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, [r15]; +ld.shared.v2.f32 {f146, f147}, [r15+216]; +ld.shared.v2.f32 {f150, f151}, [r15+432]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0f3F5DB3D7; +mul.f32 f160, f155, 0f3F000000; +sub.f32 f161, f143, f160; +sub.f32 f162, f146, f150; +mul.f32 f163, f162, 0f3F5DB3D7; +add.f32 %1, f143, f155; +add.f32 %0, f142, f154; +sub.f32 %3, f161, f163; +add.f32 %2, f159, f157; +add.f32 %5, f163, f161; +sub.f32 %4, f157, f159; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<131, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<152>; +.reg .b32 r<28>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 324, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %12, %15; +add.f32 f14, %10, f13; +add.f32 f15, %14, %16; +add.f32 f16, %11, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %10, f17; +sub.f32 f19, %14, %16; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %11, f23; +sub.f32 f25, %12, %15; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 
r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 324, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r13, r11, 12, r12; +st.shared.f32 [r13], f14; +st.shared.f32 [r13+4], f35; +st.shared.f32 [r13+8], f45; +barrier.sync 0; +shl.b32 r14, r11, 3; +sub.s32 r15, r13, r14; +ld.shared.f32 f48, [r15]; +ld.shared.f32 f49, [r15+108]; +ld.shared.f32 f50, [r15+216]; +barrier.sync 0; +st.shared.f32 [r13], f16; +st.shared.f32 [r13+4], f37; +st.shared.f32 [r13+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r15]; +ld.shared.f32 f52, [r15+108]; +ld.shared.f32 f53, [r15+216]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0f3F5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0f3F5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 2; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f70, f62; +mul.f32 f75, f71, f68; +sub.f32 f76, f74, f75; +mul.f32 f77, f70, f68; +fma.rn.f32 f78, f71, f62, f77; +mul.f32 f79, f70, 
f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f81, f63; +mul.f32 f85, f83, f69; +sub.f32 f86, f84, f85; +mul.f32 f87, f81, f69; +fma.rn.f32 f88, f83, f63, f87; +barrier.sync 0; +mad.lo.s32 r21, r16, 36, r20; +st.shared.f32 [r21], f55; +st.shared.f32 [r21+12], f76; +st.shared.f32 [r21+24], f86; +barrier.sync 0; +ld.shared.f32 f89, [r15]; +ld.shared.f32 f90, [r15+108]; +ld.shared.f32 f91, [r15+216]; +barrier.sync 0; +st.shared.f32 [r21], f57; +st.shared.f32 [r21+12], f78; +st.shared.f32 [r21+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r15]; +ld.shared.f32 f93, [r15+108]; +ld.shared.f32 f94, [r15+216]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0f3F5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0f3F5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 2; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f111, f103; +mul.f32 f116, f112, f109; +sub.f32 f117, f115, f116; +mul.f32 f118, f111, f109; +fma.rn.f32 f119, f112, f103, f118; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f122, f104; +mul.f32 f126, f124, f110; +sub.f32 f127, f125, f126; +mul.f32 f128, f122, f110; +fma.rn.f32 f129, f124, f104, f128; +barrier.sync 0; +mad.lo.s32 r27, r22, 108, r26; +st.shared.f32 [r27], f96; +st.shared.f32 [r27+36], f117; +st.shared.f32 [r27+72], f127; +barrier.sync 
0; +ld.shared.f32 f130, [r15]; +ld.shared.f32 f131, [r15+108]; +ld.shared.f32 f132, [r15+216]; +barrier.sync 0; +st.shared.f32 [r27], f98; +st.shared.f32 [r27+36], f119; +st.shared.f32 [r27+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r15]; +ld.shared.f32 f134, [r15+108]; +ld.shared.f32 f135, [r15+216]; +add.f32 f136, f131, f132; +add.f32 f137, f134, f135; +mul.f32 f138, f136, 0f3F000000; +sub.f32 f139, f130, f138; +sub.f32 f140, f134, f135; +mul.f32 f141, f140, 0f3F5DB3D7; +mul.f32 f142, f137, 0f3F000000; +sub.f32 f143, f133, f142; +sub.f32 f144, f131, f132; +mul.f32 f145, f144, 0f3F5DB3D7; +add.f32 %0, f130, f136; +add.f32 %1, f133, f137; +add.f32 %2, f141, f139; +sub.f32 %3, f143, f145; +sub.f32 %4, f139, f141; +add.f32 %5, f145, f143; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..8414a09dea69e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp32_inv.hpp.inc @@ -0,0 +1,3320 @@ +#ifndef CUFFTDX_FFT_81_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_81_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<328, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<384>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 648, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, 
f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +add.f32 f111, f110, f108; 
+sub.f32 f112, f108, f110; +mul.f32 f113, f106, 0f3F000000; +sub.f32 f114, f40, f113; +sub.f32 f115, f54, f70; +mul.f32 f116, f115, 0fBF5DB3D7; +sub.f32 f117, f114, f116; +add.f32 f118, f116, f114; +add.f32 f119, f87, f92; +add.f32 f120, f45, f119; +add.f32 f121, f89, f94; +add.f32 f122, f51, f121; +mul.f32 f123, f119, 0f3F000000; +sub.f32 f124, f45, f123; +sub.f32 f125, f89, f94; +mul.f32 f126, f125, 0fBF5DB3D7; +add.f32 f127, f126, f124; +sub.f32 f128, f124, f126; +mul.f32 f129, f121, 0f3F000000; +sub.f32 f130, f51, f129; +sub.f32 f131, f87, f92; +mul.f32 f132, f131, 0fBF5DB3D7; +sub.f32 f133, f130, f132; +add.f32 f134, f132, f130; +add.f32 f135, f97, f102; +add.f32 f136, f46, f135; +add.f32 f137, f99, f104; +add.f32 f138, f52, f137; +mul.f32 f139, f135, 0f3F000000; +sub.f32 f140, f46, f139; +sub.f32 f141, f99, f104; +mul.f32 f142, f141, 0fBF5DB3D7; +add.f32 f143, f142, f140; +sub.f32 f144, f140, f142; +mul.f32 f145, f137, 0f3F000000; +sub.f32 f146, f52, f145; +sub.f32 f147, f97, f102; +mul.f32 f148, f147, 0fBF5DB3D7; +sub.f32 f149, f146, f148; +add.f32 f150, f148, f146; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 648, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f151, f152}, [rd6]; +mul.f32 f155, f122, f152; +mul.f32 f156, f120, f152; +mul.f32 f157, f151, f122; +mul.f32 f158, f151, f151; +mul.f32 f159, f152, f152; +sub.f32 f160, f158, f159; +mul.f32 f161, f152, f151; +fma.rn.f32 f162, f152, f151, f161; +mul.f32 f163, f138, f162; +mul.f32 f164, f136, f162; +mul.f32 f165, f160, f138; +mul.f32 f166, f151, f160; +mul.f32 f167, f152, f162; +sub.f32 f168, f166, f167; +mul.f32 f169, f151, f162; +fma.rn.f32 f170, f152, f160, f169; +mul.f32 f171, f117, f170; +mul.f32 f172, f111, f170; +mul.f32 f173, f168, f117; +mul.f32 f174, f151, f168; +mul.f32 f175, f152, f170; +sub.f32 f176, f174, f175; +mul.f32 f177, f151, f170; 
+fma.rn.f32 f178, f152, f168, f177; +mul.f32 f179, f133, f178; +mul.f32 f180, f127, f178; +mul.f32 f181, f176, f133; +mul.f32 f182, f151, f176; +mul.f32 f183, f152, f178; +sub.f32 f184, f182, f183; +mul.f32 f185, f151, f178; +fma.rn.f32 f186, f152, f176, f185; +mul.f32 f187, f149, f186; +mul.f32 f188, f143, f186; +mul.f32 f189, f184, f149; +mul.f32 f190, f151, f184; +mul.f32 f191, f152, f186; +sub.f32 f192, f190, f191; +mul.f32 f193, f151, f186; +fma.rn.f32 f194, f152, f184, f193; +mul.f32 f195, f118, f194; +mul.f32 f196, f112, f194; +mul.f32 f197, f192, f118; +mul.f32 f198, f151, f192; +mul.f32 f199, f152, f194; +sub.f32 f200, f198, f199; +mul.f32 f201, f151, f194; +fma.rn.f32 f202, f152, f192, f201; +mul.f32 f203, f134, f202; +mul.f32 f204, f128, f202; +mul.f32 f205, f200, f134; +mul.f32 f206, f151, f200; +mul.f32 f207, f152, f202; +sub.f32 f208, f206, f207; +mul.f32 f209, f151, f202; +fma.rn.f32 f210, f152, f200, f209; +mul.f32 f211, f150, f210; +mul.f32 f212, f144, f210; +mul.f32 f213, f208, f150; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +add.f32 f214, f40, f106; +add.f32 f215, f38, f105; +st.shared.v2.f32 [r9], {f215, f214}; +fma.rn.f32 f216, f151, f120, f155; +sub.f32 f217, f157, f156; +st.shared.v2.f32 [r9+8], {f216, f217}; +fma.rn.f32 f218, f160, f136, f163; +sub.f32 f219, f165, f164; +st.shared.v2.f32 [r9+16], {f218, f219}; +sub.f32 f220, f173, f172; +fma.rn.f32 f221, f168, f111, f171; +st.shared.v2.f32 [r9+24], {f221, f220}; +fma.rn.f32 f222, f176, f127, f179; +sub.f32 f223, f181, f180; +st.shared.v2.f32 [r9+32], {f222, f223}; +sub.f32 f224, f189, f188; +fma.rn.f32 f225, f184, f143, f187; +st.shared.v2.f32 [r9+40], {f225, f224}; +fma.rn.f32 f226, f192, f112, f195; +sub.f32 f227, f197, f196; +st.shared.v2.f32 [r9+48], {f226, f227}; +fma.rn.f32 f228, f200, f128, f203; +sub.f32 f229, f205, f204; +st.shared.v2.f32 [r9+56], {f228, f229}; +fma.rn.f32 f230, f208, f144, f211; +sub.f32 f231, f213, f212; +st.shared.v2.f32 [r9+64], {f230, f231}; +barrier.sync 0; 
+shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f232, f233}, [r11]; +ld.shared.v2.f32 {f236, f237}, [r11+72]; +ld.shared.v2.f32 {f240, f241}, [r11+144]; +ld.shared.v2.f32 {f244, f245}, [r11+216]; +ld.shared.v2.f32 {f248, f249}, [r11+288]; +ld.shared.v2.f32 {f252, f253}, [r11+360]; +ld.shared.v2.f32 {f256, f257}, [r11+432]; +ld.shared.v2.f32 {f260, f261}, [r11+504]; +ld.shared.v2.f32 {f264, f265}, [r11+576]; +add.f32 f268, f244, f256; +add.f32 f269, f232, f268; +add.f32 f270, f245, f257; +add.f32 f271, f233, f270; +mul.f32 f272, f268, 0f3F000000; +sub.f32 f273, f232, f272; +sub.f32 f274, f245, f257; +mul.f32 f275, f274, 0fBF5DB3D7; +add.f32 f276, f275, f273; +sub.f32 f277, f273, f275; +mul.f32 f278, f270, 0f3F000000; +sub.f32 f279, f233, f278; +sub.f32 f280, f244, f256; +mul.f32 f281, f280, 0fBF5DB3D7; +sub.f32 f282, f279, f281; +add.f32 f283, f281, f279; +add.f32 f284, f248, f260; +add.f32 f285, f236, f284; +add.f32 f286, f249, f261; +add.f32 f287, f237, f286; +mul.f32 f288, f284, 0f3F000000; +sub.f32 f289, f236, f288; +sub.f32 f290, f249, f261; +mul.f32 f291, f290, 0fBF5DB3D7; +add.f32 f292, f291, f289; +sub.f32 f293, f289, f291; +mul.f32 f294, f286, 0f3F000000; +sub.f32 f295, f237, f294; +sub.f32 f296, f248, f260; +mul.f32 f297, f296, 0fBF5DB3D7; +sub.f32 f298, f295, f297; +add.f32 f299, f297, f295; +add.f32 f300, f252, f264; +add.f32 f301, f240, f300; +add.f32 f302, f253, f265; +add.f32 f303, f241, f302; +mul.f32 f304, f300, 0f3F000000; +sub.f32 f305, f240, f304; +sub.f32 f306, f253, f265; +mul.f32 f307, f306, 0fBF5DB3D7; +add.f32 f308, f307, f305; +sub.f32 f309, f305, f307; +mul.f32 f310, f302, 0f3F000000; +sub.f32 f311, f241, f310; +sub.f32 f312, f252, f264; +mul.f32 f313, f312, 0fBF5DB3D7; +sub.f32 f314, f311, f313; +add.f32 f315, f313, f311; +mul.f32 f316, f292, 0f3F441B7D; +mul.f32 f317, f298, 0f3F248DBB; +sub.f32 f318, f316, f317; +mul.f32 f319, f298, 0f3F441B7D; +fma.rn.f32 f320, f292, 0f3F248DBB, f319; +mul.f32 f321, f308, 0f3E31D0D4; 
+mul.f32 f322, f314, 0f3F7C1C5C; +sub.f32 f323, f321, f322; +mul.f32 f324, f314, 0f3E31D0D4; +fma.rn.f32 f325, f308, 0f3F7C1C5C, f324; +mul.f32 f326, f293, 0f3E31D0D4; +mul.f32 f327, f299, 0f3F7C1C5C; +sub.f32 f328, f326, f327; +mul.f32 f329, f299, 0f3E31D0D4; +fma.rn.f32 f330, f293, 0f3F7C1C5C, f329; +mul.f32 f331, f309, 0fBF708FB2; +mul.f32 f332, f315, 0f3EAF1D44; +sub.f32 f333, f331, f332; +mul.f32 f334, f315, 0fBF708FB2; +fma.rn.f32 f335, f309, 0f3EAF1D44, f334; +add.f32 f336, f285, f301; +add.f32 f337, f287, f303; +mul.f32 f338, f336, 0f3F000000; +sub.f32 f339, f269, f338; +sub.f32 f340, f287, f303; +mul.f32 f341, f340, 0fBF5DB3D7; +mul.f32 f342, f337, 0f3F000000; +sub.f32 f343, f271, f342; +sub.f32 f344, f285, f301; +mul.f32 f345, f344, 0fBF5DB3D7; +add.f32 f346, f318, f323; +add.f32 f347, f320, f325; +mul.f32 f348, f346, 0f3F000000; +sub.f32 f349, f276, f348; +sub.f32 f350, f320, f325; +mul.f32 f351, f350, 0fBF5DB3D7; +mul.f32 f352, f347, 0f3F000000; +sub.f32 f353, f282, f352; +sub.f32 f354, f318, f323; +mul.f32 f355, f354, 0fBF5DB3D7; +add.f32 f356, f328, f333; +add.f32 f357, f330, f335; +mul.f32 f358, f356, 0f3F000000; +sub.f32 f359, f277, f358; +sub.f32 f360, f330, f335; +mul.f32 f361, f360, 0fBF5DB3D7; +mul.f32 f362, f357, 0f3F000000; +sub.f32 f363, f283, f362; +sub.f32 f364, f328, f333; +mul.f32 f365, f364, 0fBF5DB3D7; +add.f32 %1, f271, f337; +add.f32 %0, f269, f336; +add.f32 %3, f282, f347; +add.f32 %2, f276, f346; +add.f32 %5, f283, f357; +add.f32 %4, f277, f356; +sub.f32 %7, f343, f345; +add.f32 %6, f341, f339; +sub.f32 %9, f353, f355; +add.f32 %8, f351, f349; +sub.f32 %11, f363, f365; +add.f32 %10, f361, f359; +add.f32 %13, f345, f343; +sub.f32 %12, f339, f341; +add.f32 %15, f355, f353; +sub.f32 %14, f349, f351; +add.f32 %17, f365, f363; +sub.f32 %16, f359, f361; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), 
"=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<329, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<366>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 324, r2; +add.f32 f37, %28, %36; +add.f32 f38, %20, f37; +add.f32 f39, %29, %37; +add.f32 f40, %21, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %20, f41; +sub.f32 f43, %29, %37; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %21, f47; +sub.f32 f49, %28, %36; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %30, %38; +add.f32 f54, %22, f53; +add.f32 f55, %32, %40; +add.f32 f56, %24, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %22, f57; +sub.f32 f59, %32, %40; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %24, f63; +sub.f32 f65, %30, %38; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %33, %41; +add.f32 f70, %25, f69; +add.f32 f71, %35, %42; +add.f32 f72, %27, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %25, f73; +sub.f32 f75, %35, %42; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %27, f79; +sub.f32 f81, %33, %41; +mul.f32 f82, f81, 
0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mov.u32 r4, %tid.x; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f38, f105; +add.f32 f107, f56, f72; +add.f32 f108, f40, f107; +mul.f32 f109, f105, 0f3F000000; +sub.f32 f110, f38, f109; +sub.f32 f111, f56, f72; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f107, 0f3F000000; +sub.f32 f116, f40, f115; +sub.f32 f117, f54, f70; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +add.f32 f121, f87, f92; +add.f32 f122, f45, f121; +add.f32 f123, f89, f94; +add.f32 f124, f51, f123; +mul.f32 f125, f121, 0f3F000000; +sub.f32 f126, f45, f125; +sub.f32 f127, f89, f94; +mul.f32 f128, f127, 0fBF5DB3D7; +add.f32 f129, f128, f126; +sub.f32 f130, f126, f128; +mul.f32 f131, f123, 0f3F000000; +sub.f32 f132, f51, f131; +sub.f32 f133, f87, f92; +mul.f32 f134, f133, 0fBF5DB3D7; +sub.f32 f135, f132, f134; +add.f32 f136, f134, f132; +add.f32 f137, f97, f102; +add.f32 f138, f46, f137; +add.f32 f139, f99, f104; +add.f32 f140, f52, f139; +mul.f32 f141, f137, 0f3F000000; +sub.f32 f142, f46, f141; +sub.f32 f143, f99, f104; +mul.f32 f144, f143, 0fBF5DB3D7; +add.f32 f145, f144, f142; +sub.f32 f146, f142, f144; +mul.f32 f147, f139, 0f3F000000; +sub.f32 f148, f52, f147; +sub.f32 f149, f97, f102; +mul.f32 f150, f149, 0fBF5DB3D7; 
+sub.f32 f151, f148, f150; +add.f32 f152, f150, f148; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f153, f154}, [rd6]; +mul.f32 f157, f124, f154; +fma.rn.f32 f158, f153, f122, f157; +mul.f32 f159, f122, f154; +mul.f32 f160, f153, f124; +sub.f32 f161, f160, f159; +mul.f32 f162, f153, f153; +mul.f32 f163, f154, f154; +sub.f32 f164, f162, f163; +mul.f32 f165, f154, f153; +fma.rn.f32 f166, f154, f153, f165; +mul.f32 f167, f140, f166; +fma.rn.f32 f168, f164, f138, f167; +mul.f32 f169, f138, f166; +mul.f32 f170, f164, f140; +sub.f32 f171, f170, f169; +mul.f32 f172, f153, f164; +mul.f32 f173, f154, f166; +sub.f32 f174, f172, f173; +mul.f32 f175, f153, f166; +fma.rn.f32 f176, f154, f164, f175; +mul.f32 f177, f119, f176; +fma.rn.f32 f178, f174, f113, f177; +mul.f32 f179, f113, f176; +mul.f32 f180, f174, f119; +sub.f32 f181, f180, f179; +mul.f32 f182, f153, f174; +mul.f32 f183, f154, f176; +sub.f32 f184, f182, f183; +mul.f32 f185, f153, f176; +fma.rn.f32 f186, f154, f174, f185; +mul.f32 f187, f135, f186; +fma.rn.f32 f188, f184, f129, f187; +mul.f32 f189, f129, f186; +mul.f32 f190, f184, f135; +sub.f32 f191, f190, f189; +mul.f32 f192, f153, f184; +mul.f32 f193, f154, f186; +sub.f32 f194, f192, f193; +mul.f32 f195, f153, f186; +fma.rn.f32 f196, f154, f184, f195; +mul.f32 f197, f151, f196; +fma.rn.f32 f198, f194, f145, f197; +mul.f32 f199, f145, f196; +mul.f32 f200, f194, f151; +sub.f32 f201, f200, f199; +mul.f32 f202, f153, f194; +mul.f32 f203, f154, f196; +sub.f32 f204, f202, f203; +mul.f32 f205, f153, f196; +fma.rn.f32 f206, f154, f194, f205; +mul.f32 f207, f120, f206; +fma.rn.f32 f208, f204, f114, f207; +mul.f32 f209, f114, f206; +mul.f32 f210, f204, f120; +sub.f32 f211, f210, f209; +mul.f32 f212, f153, f204; +mul.f32 f213, f154, f206; +sub.f32 f214, f212, f213; +mul.f32 f215, f153, f206; +fma.rn.f32 f216, 
f154, f204, f215; +mul.f32 f217, f136, f216; +fma.rn.f32 f218, f214, f130, f217; +mul.f32 f219, f130, f216; +mul.f32 f220, f214, f136; +sub.f32 f221, f220, f219; +mul.f32 f222, f153, f214; +mul.f32 f223, f154, f216; +sub.f32 f224, f222, f223; +mul.f32 f225, f153, f216; +fma.rn.f32 f226, f154, f214, f225; +mul.f32 f227, f152, f226; +fma.rn.f32 f228, f224, f146, f227; +mul.f32 f229, f146, f226; +mul.f32 f230, f224, f152; +sub.f32 f231, f230, f229; +mad.lo.s32 r8, r5, 324, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 36, r8; +st.shared.f32 [r9], f106; +st.shared.f32 [r9+4], f158; +st.shared.f32 [r9+8], f168; +st.shared.f32 [r9+12], f178; +st.shared.f32 [r9+16], f188; +st.shared.f32 [r9+20], f198; +st.shared.f32 [r9+24], f208; +st.shared.f32 [r9+28], f218; +st.shared.f32 [r9+32], f228; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.f32 f232, [r11]; +ld.shared.f32 f233, [r11+36]; +ld.shared.f32 f234, [r11+72]; +ld.shared.f32 f235, [r11+108]; +ld.shared.f32 f236, [r11+144]; +ld.shared.f32 f237, [r11+180]; +ld.shared.f32 f238, [r11+216]; +ld.shared.f32 f239, [r11+252]; +ld.shared.f32 f240, [r11+288]; +barrier.sync 0; +st.shared.f32 [r9], f108; +st.shared.f32 [r9+4], f161; +st.shared.f32 [r9+8], f171; +st.shared.f32 [r9+12], f181; +st.shared.f32 [r9+16], f191; +st.shared.f32 [r9+20], f201; +st.shared.f32 [r9+24], f211; +st.shared.f32 [r9+28], f221; +st.shared.f32 [r9+32], f231; +barrier.sync 0; +ld.shared.f32 f241, [r11]; +ld.shared.f32 f242, [r11+36]; +ld.shared.f32 f243, [r11+72]; +ld.shared.f32 f244, [r11+108]; +ld.shared.f32 f245, [r11+144]; +ld.shared.f32 f246, [r11+180]; +ld.shared.f32 f247, [r11+216]; +ld.shared.f32 f248, [r11+252]; +ld.shared.f32 f249, [r11+288]; +add.f32 f250, f235, f238; +add.f32 f251, f232, f250; +add.f32 f252, f244, f247; +add.f32 f253, f241, f252; +mul.f32 f254, f250, 0f3F000000; +sub.f32 f255, f232, f254; +sub.f32 f256, f244, f247; +mul.f32 f257, f256, 0fBF5DB3D7; +add.f32 f258, f257, f255; +sub.f32 f259, f255, f257; 
+mul.f32 f260, f252, 0f3F000000; +sub.f32 f261, f241, f260; +sub.f32 f262, f235, f238; +mul.f32 f263, f262, 0fBF5DB3D7; +sub.f32 f264, f261, f263; +add.f32 f265, f263, f261; +add.f32 f266, f236, f239; +add.f32 f267, f233, f266; +add.f32 f268, f245, f248; +add.f32 f269, f242, f268; +mul.f32 f270, f266, 0f3F000000; +sub.f32 f271, f233, f270; +sub.f32 f272, f245, f248; +mul.f32 f273, f272, 0fBF5DB3D7; +add.f32 f274, f273, f271; +sub.f32 f275, f271, f273; +mul.f32 f276, f268, 0f3F000000; +sub.f32 f277, f242, f276; +sub.f32 f278, f236, f239; +mul.f32 f279, f278, 0fBF5DB3D7; +sub.f32 f280, f277, f279; +add.f32 f281, f279, f277; +add.f32 f282, f237, f240; +add.f32 f283, f234, f282; +add.f32 f284, f246, f249; +add.f32 f285, f243, f284; +mul.f32 f286, f282, 0f3F000000; +sub.f32 f287, f234, f286; +sub.f32 f288, f246, f249; +mul.f32 f289, f288, 0fBF5DB3D7; +add.f32 f290, f289, f287; +sub.f32 f291, f287, f289; +mul.f32 f292, f284, 0f3F000000; +sub.f32 f293, f243, f292; +sub.f32 f294, f237, f240; +mul.f32 f295, f294, 0fBF5DB3D7; +sub.f32 f296, f293, f295; +add.f32 f297, f295, f293; +mul.f32 f298, f274, 0f3F441B7D; +mul.f32 f299, f280, 0f3F248DBB; +sub.f32 f300, f298, f299; +mul.f32 f301, f280, 0f3F441B7D; +fma.rn.f32 f302, f274, 0f3F248DBB, f301; +mul.f32 f303, f290, 0f3E31D0D4; +mul.f32 f304, f296, 0f3F7C1C5C; +sub.f32 f305, f303, f304; +mul.f32 f306, f296, 0f3E31D0D4; +fma.rn.f32 f307, f290, 0f3F7C1C5C, f306; +mul.f32 f308, f275, 0f3E31D0D4; +mul.f32 f309, f281, 0f3F7C1C5C; +sub.f32 f310, f308, f309; +mul.f32 f311, f281, 0f3E31D0D4; +fma.rn.f32 f312, f275, 0f3F7C1C5C, f311; +mul.f32 f313, f291, 0fBF708FB2; +mul.f32 f314, f297, 0f3EAF1D44; +sub.f32 f315, f313, f314; +mul.f32 f316, f297, 0fBF708FB2; +fma.rn.f32 f317, f291, 0f3EAF1D44, f316; +add.f32 f318, f267, f283; +add.f32 f319, f269, f285; +mul.f32 f320, f318, 0f3F000000; +sub.f32 f321, f251, f320; +sub.f32 f322, f269, f285; +mul.f32 f323, f322, 0fBF5DB3D7; +mul.f32 f324, f319, 0f3F000000; +sub.f32 f325, f253, f324; 
+sub.f32 f326, f267, f283; +mul.f32 f327, f326, 0fBF5DB3D7; +add.f32 f328, f300, f305; +add.f32 f329, f302, f307; +mul.f32 f330, f328, 0f3F000000; +sub.f32 f331, f258, f330; +sub.f32 f332, f302, f307; +mul.f32 f333, f332, 0fBF5DB3D7; +mul.f32 f334, f329, 0f3F000000; +sub.f32 f335, f264, f334; +sub.f32 f336, f300, f305; +mul.f32 f337, f336, 0fBF5DB3D7; +add.f32 f338, f310, f315; +add.f32 f339, f312, f317; +mul.f32 f340, f338, 0f3F000000; +sub.f32 f341, f259, f340; +sub.f32 f342, f312, f317; +mul.f32 f343, f342, 0fBF5DB3D7; +mul.f32 f344, f339, 0f3F000000; +sub.f32 f345, f265, f344; +sub.f32 f346, f310, f315; +mul.f32 f347, f346, 0fBF5DB3D7; +add.f32 %0, f251, f318; +add.f32 %1, f253, f319; +add.f32 %3, f264, f329; +add.f32 %2, f258, f328; +add.f32 %5, f265, f339; +add.f32 %4, f259, f338; +add.f32 %6, f323, f321; +sub.f32 %7, f325, f327; +sub.f32 %9, f335, f337; +add.f32 %8, f333, f331; +sub.f32 %11, f345, f347; +add.f32 %10, f343, f341; +sub.f32 %12, f321, f323; +add.f32 %13, f327, f325; +add.f32 %15, f337, f335; +sub.f32 %14, f331, f333; +add.f32 %17, f347, f345; +sub.f32 %16, f341, f343; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "r"(smem), "l"(lut_sp_9_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<331, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 
f<1386>; +.reg .b32 r<14>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 648, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1385, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1384, %57, f1385; +mul.f32 f119, f1385, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1383, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1382, %63, f1383; +mul.f32 f135, f1383, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1381, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1380, %69, f1381; +mul.f32 f151, f1381, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1379, f133, 0f3F441B7D; +sub.f32 f159, f1379, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1377, f149, 0f3E31D0D4; +mul.f32 f1378, f155, 0f3F7C1C5C; +sub.f32 f164, f1377, f1378; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, f165; +mul.f32 f1375, f134, 0f3E31D0D4; +mul.f32 f1376, f140, 0f3F7C1C5C; +sub.f32 f169, f1375, f1376; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1373, 
f150, 0fBF708FB2; +mul.f32 f1374, f156, 0f3EAF1D44; +sub.f32 f174, f1373, f1374; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1372, f1382, f1380; +sub.f32 f183, f1382, f1380; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1371, f1384, f1372; +mul.f32 f187, f1372, 0f3F000000; +sub.f32 f188, f1384, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1370, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1369, f123, f1370; +mul.f32 f203, f1370, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1368, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1367, f124, f1368; +mul.f32 f219, f1368, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1364, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 f1362, %112, f1364; +mul.f32 f235, f1364, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; 
+add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1359, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1357, %115, f1359; +mul.f32 f251, f1359, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1354, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1352, %118, f1354; +mul.f32 f267, f1354, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1351, f249, 0f3F441B7D; +sub.f32 f275, f1351, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1350, f265, 0f3E31D0D4; +sub.f32 f280, f1350, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1348, f250, 0f3E31D0D4; +mul.f32 f1349, f256, 0f3F7C1C5C; +sub.f32 f285, f1348, f1349; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1346, f266, 0fBF708FB2; +mul.f32 f1347, f272, 0f3EAF1D44; +sub.f32 f290, f1346, f1347; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1345, f1357, f1352; +sub.f32 f299, f1357, f1352; +mul.f32 f300, f299, 0fBF5DB3D7; +add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1344, f1362, f1345; +mul.f32 f303, f1345, 0f3F000000; +sub.f32 f304, f1362, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 
0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1343, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1342, f239, f1343; +mul.f32 f319, f1343, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1341, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1340, f240, f1341; +mul.f32 f335, f1341, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1337, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1335, %121, f1337; +mul.f32 f351, f1337, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1332, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1330, %124, f1332; +mul.f32 f367, f1332, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, %102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, 
%72, f377; +add.f32 f1328, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1326, %126, f1328; +mul.f32 f383, f1328, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1325, f365, 0f3F441B7D; +sub.f32 f391, f1325, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1324, f381, 0f3E31D0D4; +sub.f32 f396, f1324, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1322, f366, 0f3E31D0D4; +mul.f32 f1323, f372, 0f3F7C1C5C; +sub.f32 f401, f1322, f1323; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1320, f382, 0fBF708FB2; +mul.f32 f1321, f388, 0f3EAF1D44; +sub.f32 f406, f1320, f1321; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1319, f1330, f1326; +sub.f32 f415, f1330, f1326; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1318, f1335, f1319; +mul.f32 f419, f1319, 0f3F000000; +sub.f32 f420, f1335, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1317, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1316, f355, f1317; +mul.f32 f435, f1317, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; 
+add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1315, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1314, f356, f1315; +mul.f32 f451, f1315, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1342, 0f3E6C2691; +mul.f32 f1313, f310, 0f3F791978; +sub.f32 f459, f1313, f458; +mul.f32 f460, f1342, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1311, f426, 0f3F64C51C; +mul.f32 f1312, f1316, 0f3EE5C902; +sub.f32 f464, f1311, f1312; +mul.f32 f465, f1316, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f1309, f326, 0f3F64C51C; +mul.f32 f1310, f1340, 0f3EE5C902; +sub.f32 f469, f1309, f1310; +mul.f32 f470, f1340, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1307, f442, 0f3F18DF63; +mul.f32 f1308, f1314, 0f3F4D57F2; +sub.f32 f474, f1307, f1308; +mul.f32 f475, f1314, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1305, f301, 0f3F441B7D; +mul.f32 f1306, f307, 0f3F248DBB; +sub.f32 f479, f1305, f1306; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1304, f417, 0f3E31D0D4; +sub.f32 f484, f1304, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1303, f317, 0f3F18DF63; +sub.f32 f489, f1303, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1302, f433, 0fBE92D7E0; +sub.f32 f494, f1302, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; +mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1301, f333, 0f3ECACAF8; +sub.f32 f499, f1301, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 
0f3F3A3529; +mul.f32 f1300, f449, 0fBF2FAD88; +sub.f32 f504, f1300, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1299, f302, 0f3E31D0D4; +sub.f32 f509, f1299, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1297, f418, 0fBF708FB2; +mul.f32 f1298, f424, 0f3EAF1D44; +sub.f32 f514, f1297, f1298; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1295, f318, 0fBD6E2946; +mul.f32 f1296, f324, 0f3F7F9120; +sub.f32 f519, f1295, f1296; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1293, f434, 0fBF7E44DE; +mul.f32 f1294, f440, 0fBDEDC21F; +sub.f32 f524, f1293, f1294; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1292, f334, 0fBE92D7E0; +sub.f32 f529, f1292, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1291, f450, 0fBF55E287; +sub.f32 f534, f1291, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +mul.f32 f539, f537, 0f3F000000; +sub.f32 f540, f178, f539; +add.f32 f1290, f1344, f1318; +sub.f32 f541, f1344, f1318; +mul.f32 f542, f541, 0fBF5DB3D7; +add.f32 f543, f542, f540; +sub.f32 f544, f540, f542; +mul.f32 f545, f1290, 0f3F000000; +sub.f32 f546, f1371, f545; +sub.f32 f547, f294, f410; +mul.f32 f548, f547, 0fBF5DB3D7; +sub.f32 f549, f546, f548; +add.f32 f550, f548, f546; +add.f32 f551, f459, f464; +add.f32 f552, f194, f551; +mul.f32 f555, f551, 0f3F000000; +sub.f32 f556, f194, f555; +add.f32 f1289, f461, f466; +sub.f32 f557, f461, f466; +mul.f32 f558, f557, 0fBF5DB3D7; +add.f32 f559, f558, f556; +sub.f32 f560, f556, f558; +add.f32 f1288, f1369, f1289; +mul.f32 f561, f1289, 0f3F000000; +sub.f32 f562, f1369, f561; +sub.f32 f563, f459, f464; +mul.f32 f564, f563, 0fBF5DB3D7; 
+sub.f32 f565, f562, f564; +add.f32 f566, f564, f562; +add.f32 f567, f469, f474; +add.f32 f568, f210, f567; +mul.f32 f571, f567, 0f3F000000; +sub.f32 f572, f210, f571; +add.f32 f1287, f471, f476; +sub.f32 f573, f471, f476; +mul.f32 f574, f573, 0fBF5DB3D7; +add.f32 f575, f574, f572; +sub.f32 f576, f572, f574; +add.f32 f1286, f1367, f1287; +mul.f32 f577, f1287, 0f3F000000; +sub.f32 f578, f1367, f577; +sub.f32 f579, f469, f474; +mul.f32 f580, f579, 0fBF5DB3D7; +sub.f32 f581, f578, f580; +add.f32 f582, f580, f578; +add.f32 f583, f479, f484; +add.f32 f584, f185, f583; +mul.f32 f587, f583, 0f3F000000; +sub.f32 f588, f185, f587; +add.f32 f1285, f481, f486; +sub.f32 f589, f481, f486; +mul.f32 f590, f589, 0fBF5DB3D7; +add.f32 f591, f590, f588; +sub.f32 f592, f588, f590; +add.f32 f1284, f191, f1285; +mul.f32 f593, f1285, 0f3F000000; +sub.f32 f594, f191, f593; +sub.f32 f595, f479, f484; +mul.f32 f596, f595, 0fBF5DB3D7; +sub.f32 f597, f594, f596; +add.f32 f598, f596, f594; +add.f32 f599, f489, f494; +add.f32 f600, f201, f599; +mul.f32 f603, f599, 0f3F000000; +sub.f32 f604, f201, f603; +add.f32 f1283, f491, f496; +sub.f32 f605, f491, f496; +mul.f32 f606, f605, 0fBF5DB3D7; +add.f32 f607, f606, f604; +sub.f32 f608, f604, f606; +add.f32 f1282, f207, f1283; +mul.f32 f609, f1283, 0f3F000000; +sub.f32 f610, f207, f609; +sub.f32 f611, f489, f494; +mul.f32 f612, f611, 0fBF5DB3D7; +sub.f32 f613, f610, f612; +add.f32 f614, f612, f610; +add.f32 f615, f499, f504; +add.f32 f616, f217, f615; +mul.f32 f619, f615, 0f3F000000; +sub.f32 f620, f217, f619; +add.f32 f1281, f501, f506; +sub.f32 f621, f501, f506; +mul.f32 f622, f621, 0fBF5DB3D7; +add.f32 f623, f622, f620; +sub.f32 f624, f620, f622; +add.f32 f1280, f223, f1281; +mul.f32 f625, f1281, 0f3F000000; +sub.f32 f626, f223, f625; +sub.f32 f627, f499, f504; +mul.f32 f628, f627, 0fBF5DB3D7; +sub.f32 f629, f626, f628; +add.f32 f630, f628, f626; +add.f32 f631, f509, f514; +add.f32 f632, f186, f631; +mul.f32 f635, f631, 0f3F000000; +sub.f32 f636, 
f186, f635; +add.f32 f1279, f511, f516; +sub.f32 f637, f511, f516; +mul.f32 f638, f637, 0fBF5DB3D7; +add.f32 f639, f638, f636; +sub.f32 f640, f636, f638; +add.f32 f1278, f192, f1279; +mul.f32 f641, f1279, 0f3F000000; +sub.f32 f642, f192, f641; +sub.f32 f643, f509, f514; +mul.f32 f644, f643, 0fBF5DB3D7; +sub.f32 f645, f642, f644; +add.f32 f646, f644, f642; +add.f32 f647, f519, f524; +add.f32 f648, f202, f647; +mul.f32 f651, f647, 0f3F000000; +sub.f32 f652, f202, f651; +add.f32 f1277, f521, f526; +sub.f32 f653, f521, f526; +mul.f32 f654, f653, 0fBF5DB3D7; +add.f32 f655, f654, f652; +sub.f32 f656, f652, f654; +add.f32 f1276, f208, f1277; +mul.f32 f657, f1277, 0f3F000000; +sub.f32 f658, f208, f657; +sub.f32 f659, f519, f524; +mul.f32 f660, f659, 0fBF5DB3D7; +sub.f32 f661, f658, f660; +add.f32 f662, f660, f658; +add.f32 f663, f529, f534; +add.f32 f664, f218, f663; +mul.f32 f667, f663, 0f3F000000; +sub.f32 f668, f218, f667; +add.f32 f1275, f531, f536; +sub.f32 f669, f531, f536; +mul.f32 f670, f669, 0fBF5DB3D7; +add.f32 f671, f670, f668; +sub.f32 f672, f668, f670; +add.f32 f1274, f224, f1275; +mul.f32 f673, f1275, 0f3F000000; +sub.f32 f674, f224, f673; +sub.f32 f675, f529, f534; +mul.f32 f676, f675, 0fBF5DB3D7; +sub.f32 f677, f674, f676; +add.f32 f678, f676, f674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 648, r3; +mul.wide.u32 rd7, r7, 8; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f32 {f679, f680}, [rd6]; +mul.f32 f683, f1288, f680; +mul.f32 f685, f679, f1288; +mul.f32 f1272, f679, f679; +mul.f32 f1273, f680, f680; +sub.f32 f688, f1272, f1273; +mul.f32 f689, f680, f679; +fma.rn.f32 f690, f680, f679, f689; +mul.f32 f691, f1286, f690; +mul.f32 f693, f688, f1286; +mul.f32 f695, f680, f690; +mul.f32 f1271, f679, f688; +sub.f32 f696, f1271, f695; +mul.f32 f1270, f568, f690; +mul.f32 f697, f679, f690; +fma.rn.f32 f698, f680, f688, f697; 
+mul.f32 f699, f1284, f698; +mul.f32 f701, f696, f1284; +mul.f32 f1268, f679, f696; +mul.f32 f1269, f680, f698; +sub.f32 f704, f1268, f1269; +mul.f32 f1267, f584, f698; +mul.f32 f705, f679, f698; +fma.rn.f32 f706, f680, f696, f705; +mul.f32 f707, f1282, f706; +mul.f32 f709, f704, f1282; +mul.f32 f711, f680, f706; +mul.f32 f1266, f679, f704; +sub.f32 f712, f1266, f711; +mul.f32 f1265, f600, f706; +mul.f32 f713, f679, f706; +fma.rn.f32 f714, f680, f704, f713; +mul.f32 f715, f1280, f714; +mul.f32 f717, f712, f1280; +mul.f32 f719, f680, f714; +mul.f32 f1264, f679, f712; +sub.f32 f720, f1264, f719; +mul.f32 f1263, f616, f714; +mul.f32 f721, f679, f714; +fma.rn.f32 f722, f680, f712, f721; +mul.f32 f723, f1278, f722; +mul.f32 f725, f720, f1278; +mul.f32 f1261, f679, f720; +mul.f32 f1262, f680, f722; +sub.f32 f728, f1261, f1262; +mul.f32 f1260, f632, f722; +mul.f32 f729, f679, f722; +fma.rn.f32 f730, f680, f720, f729; +mul.f32 f731, f1276, f730; +mul.f32 f733, f728, f1276; +mul.f32 f735, f680, f730; +mul.f32 f1259, f679, f728; +sub.f32 f736, f1259, f735; +mul.f32 f1258, f648, f730; +mul.f32 f737, f679, f730; +fma.rn.f32 f738, f680, f728, f737; +mul.f32 f739, f1274, f738; +mul.f32 f741, f736, f1274; +mul.f32 f743, f680, f738; +mul.f32 f1257, f679, f736; +sub.f32 f744, f1257, f743; +mul.f32 f1256, f664, f738; +mul.f32 f745, f679, f738; +fma.rn.f32 f746, f680, f736, f745; +mul.f32 f747, f549, f746; +mul.f32 f749, f744, f549; +mul.f32 f1254, f679, f744; +mul.f32 f1255, f680, f746; +sub.f32 f752, f1254, f1255; +mul.f32 f1253, f543, f746; +mul.f32 f753, f679, f746; +fma.rn.f32 f754, f680, f744, f753; +mul.f32 f755, f565, f754; +mul.f32 f757, f752, f565; +mul.f32 f759, f680, f754; +mul.f32 f1252, f679, f752; +sub.f32 f760, f1252, f759; +mul.f32 f1251, f559, f754; +mul.f32 f761, f679, f754; +fma.rn.f32 f762, f680, f752, f761; +mul.f32 f763, f581, f762; +mul.f32 f765, f760, f581; +mul.f32 f1249, f679, f760; +mul.f32 f1250, f680, f762; +sub.f32 f768, f1249, f1250; +mul.f32 f1248, 
f575, f762; +mul.f32 f769, f679, f762; +fma.rn.f32 f770, f680, f760, f769; +mul.f32 f771, f597, f770; +mul.f32 f773, f768, f597; +mul.f32 f775, f680, f770; +mul.f32 f1247, f679, f768; +sub.f32 f776, f1247, f775; +mul.f32 f1246, f591, f770; +mul.f32 f777, f679, f770; +fma.rn.f32 f778, f680, f768, f777; +mul.f32 f779, f613, f778; +mul.f32 f781, f776, f613; +mul.f32 f783, f680, f778; +mul.f32 f1245, f679, f776; +sub.f32 f784, f1245, f783; +mul.f32 f1244, f607, f778; +mul.f32 f785, f679, f778; +fma.rn.f32 f786, f680, f776, f785; +mul.f32 f787, f629, f786; +mul.f32 f789, f784, f629; +mul.f32 f1242, f679, f784; +mul.f32 f1243, f680, f786; +sub.f32 f792, f1242, f1243; +mul.f32 f1241, f623, f786; +mul.f32 f793, f679, f786; +fma.rn.f32 f794, f680, f784, f793; +mul.f32 f795, f645, f794; +mul.f32 f797, f792, f645; +mul.f32 f799, f680, f794; +mul.f32 f1240, f679, f792; +sub.f32 f800, f1240, f799; +mul.f32 f1239, f639, f794; +mul.f32 f801, f679, f794; +fma.rn.f32 f802, f680, f792, f801; +mul.f32 f803, f661, f802; +mul.f32 f805, f800, f661; +mul.f32 f807, f680, f802; +mul.f32 f1238, f679, f800; +sub.f32 f808, f1238, f807; +mul.f32 f1237, f655, f802; +mul.f32 f809, f679, f802; +fma.rn.f32 f810, f680, f800, f809; +mul.f32 f811, f677, f810; +mul.f32 f813, f808, f677; +mul.f32 f1235, f679, f808; +mul.f32 f1236, f680, f810; +sub.f32 f816, f1235, f1236; +mul.f32 f1234, f671, f810; +mul.f32 f817, f679, f810; +fma.rn.f32 f818, f680, f808, f817; +mul.f32 f819, f550, f818; +mul.f32 f821, f816, f550; +mul.f32 f823, f680, f818; +mul.f32 f1233, f679, f816; +sub.f32 f824, f1233, f823; +mul.f32 f1232, f544, f818; +mul.f32 f825, f679, f818; +fma.rn.f32 f826, f680, f816, f825; +mul.f32 f827, f566, f826; +mul.f32 f829, f824, f566; +mul.f32 f1230, f679, f824; +mul.f32 f1231, f680, f826; +sub.f32 f832, f1230, f1231; +mul.f32 f1229, f560, f826; +mul.f32 f833, f679, f826; +fma.rn.f32 f834, f680, f824, f833; +mul.f32 f835, f582, f834; +mul.f32 f837, f832, f582; +mul.f32 f839, f680, f834; +mul.f32 
f1228, f679, f832; +sub.f32 f840, f1228, f839; +mul.f32 f1227, f576, f834; +mul.f32 f841, f679, f834; +fma.rn.f32 f842, f680, f832, f841; +mul.f32 f843, f598, f842; +mul.f32 f845, f840, f598; +mul.f32 f847, f680, f842; +mul.f32 f1226, f679, f840; +sub.f32 f848, f1226, f847; +mul.f32 f1225, f592, f842; +mul.f32 f849, f679, f842; +fma.rn.f32 f850, f680, f840, f849; +mul.f32 f851, f614, f850; +mul.f32 f853, f848, f614; +mul.f32 f1223, f679, f848; +mul.f32 f1224, f680, f850; +sub.f32 f856, f1223, f1224; +mul.f32 f1222, f608, f850; +mul.f32 f857, f679, f850; +fma.rn.f32 f858, f680, f848, f857; +mul.f32 f859, f630, f858; +mul.f32 f861, f856, f630; +mul.f32 f863, f680, f858; +mul.f32 f1221, f679, f856; +sub.f32 f864, f1221, f863; +mul.f32 f1220, f624, f858; +mul.f32 f865, f679, f858; +fma.rn.f32 f866, f680, f856, f865; +mul.f32 f867, f646, f866; +mul.f32 f869, f864, f646; +mul.f32 f871, f680, f866; +mul.f32 f1219, f679, f864; +sub.f32 f872, f1219, f871; +mul.f32 f1218, f640, f866; +mul.f32 f873, f679, f866; +fma.rn.f32 f874, f680, f864, f873; +mul.f32 f875, f662, f874; +mul.f32 f877, f872, f662; +mul.f32 f1216, f679, f872; +mul.f32 f1217, f680, f874; +sub.f32 f880, f1216, f1217; +mul.f32 f1215, f656, f874; +mul.f32 f881, f679, f874; +mul.f32 f1214, f552, f680; +fma.rn.f32 f882, f680, f872, f881; +mul.f32 f883, f678, f882; +mul.f32 f884, f672, f882; +mul.f32 f885, f880, f678; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +add.f32 f886, f1371, f1290; +add.f32 f887, f178, f537; +st.shared.v2.f32 [r9], {f887, f886}; +fma.rn.f32 f888, f679, f552, f683; +sub.f32 f889, f685, f1214; +st.shared.v2.f32 [r9+8], {f888, f889}; +fma.rn.f32 f890, f688, f568, f691; +sub.f32 f891, f693, f1270; +st.shared.v2.f32 [r9+16], {f890, f891}; +fma.rn.f32 f892, f696, f584, f699; +sub.f32 f893, f701, f1267; +st.shared.v2.f32 [r9+24], {f892, f893}; +fma.rn.f32 f894, f704, f600, f707; +sub.f32 f895, f709, f1265; +st.shared.v2.f32 [r9+32], {f894, f895}; +fma.rn.f32 f896, f712, f616, f715; +sub.f32 
f897, f717, f1263; +st.shared.v2.f32 [r9+40], {f896, f897}; +fma.rn.f32 f898, f720, f632, f723; +sub.f32 f899, f725, f1260; +st.shared.v2.f32 [r9+48], {f898, f899}; +sub.f32 f900, f733, f1258; +fma.rn.f32 f901, f728, f648, f731; +st.shared.v2.f32 [r9+56], {f901, f900}; +fma.rn.f32 f902, f736, f664, f739; +sub.f32 f903, f741, f1256; +st.shared.v2.f32 [r9+64], {f902, f903}; +fma.rn.f32 f904, f744, f543, f747; +sub.f32 f905, f749, f1253; +st.shared.v2.f32 [r9+72], {f904, f905}; +fma.rn.f32 f906, f752, f559, f755; +sub.f32 f907, f757, f1251; +st.shared.v2.f32 [r9+80], {f906, f907}; +fma.rn.f32 f908, f760, f575, f763; +sub.f32 f909, f765, f1248; +st.shared.v2.f32 [r9+88], {f908, f909}; +fma.rn.f32 f910, f768, f591, f771; +sub.f32 f911, f773, f1246; +st.shared.v2.f32 [r9+96], {f910, f911}; +fma.rn.f32 f912, f776, f607, f779; +sub.f32 f913, f781, f1244; +st.shared.v2.f32 [r9+104], {f912, f913}; +fma.rn.f32 f914, f784, f623, f787; +sub.f32 f915, f789, f1241; +st.shared.v2.f32 [r9+112], {f914, f915}; +fma.rn.f32 f916, f792, f639, f795; +sub.f32 f917, f797, f1239; +st.shared.v2.f32 [r9+120], {f916, f917}; +fma.rn.f32 f918, f800, f655, f803; +sub.f32 f919, f805, f1237; +st.shared.v2.f32 [r9+128], {f918, f919}; +fma.rn.f32 f920, f808, f671, f811; +sub.f32 f921, f813, f1234; +st.shared.v2.f32 [r9+136], {f920, f921}; +fma.rn.f32 f922, f816, f544, f819; +sub.f32 f923, f821, f1232; +st.shared.v2.f32 [r9+144], {f922, f923}; +fma.rn.f32 f924, f824, f560, f827; +sub.f32 f925, f829, f1229; +st.shared.v2.f32 [r9+152], {f924, f925}; +fma.rn.f32 f926, f832, f576, f835; +sub.f32 f927, f837, f1227; +st.shared.v2.f32 [r9+160], {f926, f927}; +fma.rn.f32 f928, f840, f592, f843; +sub.f32 f929, f845, f1225; +st.shared.v2.f32 [r9+168], {f928, f929}; +fma.rn.f32 f930, f848, f608, f851; +sub.f32 f931, f853, f1222; +st.shared.v2.f32 [r9+176], {f930, f931}; +fma.rn.f32 f932, f856, f624, f859; +sub.f32 f933, f861, f1220; +st.shared.v2.f32 [r9+184], {f932, f933}; +fma.rn.f32 f934, f864, f640, f867; 
+sub.f32 f935, f869, f1218; +st.shared.v2.f32 [r9+192], {f934, f935}; +fma.rn.f32 f936, f872, f656, f875; +sub.f32 f937, f877, f1215; +st.shared.v2.f32 [r9+200], {f936, f937}; +fma.rn.f32 f938, f880, f672, f883; +sub.f32 f939, f885, f884; +st.shared.v2.f32 [r9+208], {f938, f939}; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.v2.f32 {f940, f941}, [r10]; +ld.shared.v2.f32 {f944, f945}, [r10+24]; +ld.shared.v2.f32 {f948, f949}, [r10+48]; +ld.shared.v2.f32 {f952, f953}, [r10+72]; +ld.shared.v2.f32 {f956, f957}, [r10+96]; +ld.shared.v2.f32 {f960, f961}, [r10+120]; +ld.shared.v2.f32 {f964, f965}, [r10+144]; +ld.shared.v2.f32 {f968, f969}, [r10+168]; +ld.shared.v2.f32 {f972, f973}, [r10+192]; +ld.shared.v2.f32 {f976, f977}, [r10+216]; +ld.shared.v2.f32 {f980, f981}, [r10+240]; +ld.shared.v2.f32 {f984, f985}, [r10+264]; +ld.shared.v2.f32 {f988, f989}, [r10+288]; +ld.shared.v2.f32 {f992, f993}, [r10+312]; +ld.shared.v2.f32 {f996, f997}, [r10+336]; +ld.shared.v2.f32 {f1000, f1001}, [r10+360]; +ld.shared.v2.f32 {f1004, f1005}, [r10+384]; +ld.shared.v2.f32 {f1008, f1009}, [r10+408]; +ld.shared.v2.f32 {f1012, f1013}, [r10+432]; +ld.shared.v2.f32 {f1016, f1017}, [r10+456]; +ld.shared.v2.f32 {f1020, f1021}, [r10+480]; +ld.shared.v2.f32 {f1024, f1025}, [r10+504]; +ld.shared.v2.f32 {f1028, f1029}, [r10+528]; +ld.shared.v2.f32 {f1032, f1033}, [r10+552]; +ld.shared.v2.f32 {f1036, f1037}, [r10+576]; +ld.shared.v2.f32 {f1040, f1041}, [r10+600]; +ld.shared.v2.f32 {f1044, f1045}, [r10+624]; +add.f32 f1048, f976, f1012; +mul.f32 f1050, f1048, 0f3F000000; +sub.f32 f1051, f940, f1050; +add.f32 f1213, f977, f1013; +sub.f32 f1052, f977, f1013; +mul.f32 f1053, f1052, 0fBF5DB3D7; +mul.f32 f1054, f1213, 0f3F000000; +sub.f32 f1055, f941, f1054; +sub.f32 f1056, f976, f1012; +mul.f32 f1057, f1056, 0fBF5DB3D7; +add.f32 f1058, f980, f1016; +mul.f32 f1060, f1058, 0f3F000000; +sub.f32 f1061, f944, f1060; +add.f32 f1212, f981, f1017; +sub.f32 f1062, f981, f1017; +mul.f32 f1063, f1062, 
0fBF5DB3D7; +mul.f32 f1064, f1212, 0f3F000000; +sub.f32 f1065, f945, f1064; +sub.f32 f1066, f980, f1016; +mul.f32 f1067, f1066, 0fBF5DB3D7; +add.f32 f1068, f984, f1020; +mul.f32 f1070, f1068, 0f3F000000; +sub.f32 f1071, f948, f1070; +add.f32 f1211, f985, f1021; +sub.f32 f1072, f985, f1021; +mul.f32 f1073, f1072, 0fBF5DB3D7; +mul.f32 f1074, f1211, 0f3F000000; +sub.f32 f1075, f949, f1074; +sub.f32 f1076, f984, f1020; +mul.f32 f1077, f1076, 0fBF5DB3D7; +add.f32 f1078, f988, f1024; +mul.f32 f1080, f1078, 0f3F000000; +sub.f32 f1081, f952, f1080; +add.f32 f1210, f989, f1025; +sub.f32 f1082, f989, f1025; +mul.f32 f1083, f1082, 0fBF5DB3D7; +mul.f32 f1084, f1210, 0f3F000000; +sub.f32 f1085, f953, f1084; +sub.f32 f1086, f988, f1024; +mul.f32 f1087, f1086, 0fBF5DB3D7; +add.f32 f1088, f992, f1028; +mul.f32 f1090, f1088, 0f3F000000; +sub.f32 f1091, f956, f1090; +add.f32 f1209, f993, f1029; +sub.f32 f1092, f993, f1029; +mul.f32 f1093, f1092, 0fBF5DB3D7; +mul.f32 f1094, f1209, 0f3F000000; +sub.f32 f1095, f957, f1094; +sub.f32 f1096, f992, f1028; +mul.f32 f1097, f1096, 0fBF5DB3D7; +add.f32 f1098, f996, f1032; +mul.f32 f1100, f1098, 0f3F000000; +sub.f32 f1101, f960, f1100; +add.f32 f1208, f997, f1033; +sub.f32 f1102, f997, f1033; +mul.f32 f1103, f1102, 0fBF5DB3D7; +mul.f32 f1104, f1208, 0f3F000000; +sub.f32 f1105, f961, f1104; +sub.f32 f1106, f996, f1032; +mul.f32 f1107, f1106, 0fBF5DB3D7; +add.f32 f1108, f1000, f1036; +mul.f32 f1110, f1108, 0f3F000000; +sub.f32 f1111, f964, f1110; +add.f32 f1207, f1001, f1037; +sub.f32 f1112, f1001, f1037; +mul.f32 f1113, f1112, 0fBF5DB3D7; +mul.f32 f1114, f1207, 0f3F000000; +sub.f32 f1115, f965, f1114; +sub.f32 f1116, f1000, f1036; +mul.f32 f1117, f1116, 0fBF5DB3D7; +add.f32 f1118, f1004, f1040; +mul.f32 f1120, f1118, 0f3F000000; +sub.f32 f1121, f968, f1120; +add.f32 f1206, f1005, f1041; +sub.f32 f1122, f1005, f1041; +mul.f32 f1123, f1122, 0fBF5DB3D7; +mul.f32 f1124, f1206, 0f3F000000; +sub.f32 f1125, f969, f1124; +sub.f32 f1126, f1004, f1040; 
+mul.f32 f1127, f1126, 0fBF5DB3D7; +add.f32 f1128, f1008, f1044; +mul.f32 f1130, f1128, 0f3F000000; +sub.f32 f1131, f972, f1130; +add.f32 f1205, f1009, f1045; +sub.f32 f1132, f1009, f1045; +mul.f32 f1133, f1132, 0fBF5DB3D7; +mul.f32 f1134, f1205, 0f3F000000; +sub.f32 f1135, f973, f1134; +sub.f32 f1136, f1008, f1044; +mul.f32 f1137, f1136, 0fBF5DB3D7; +add.f32 %1, f941, f1213; +add.f32 %0, f940, f1048; +add.f32 %3, f945, f1212; +add.f32 %2, f944, f1058; +add.f32 %5, f949, f1211; +add.f32 %4, f948, f1068; +add.f32 %7, f953, f1210; +add.f32 %6, f952, f1078; +add.f32 %9, f957, f1209; +add.f32 %8, f956, f1088; +add.f32 %11, f961, f1208; +add.f32 %10, f960, f1098; +add.f32 %13, f965, f1207; +add.f32 %12, f964, f1108; +add.f32 %15, f969, f1206; +add.f32 %14, f968, f1118; +add.f32 %17, f973, f1205; +add.f32 %16, f972, f1128; +add.f32 %18, f1053, f1051; +sub.f32 %19, f1055, f1057; +sub.f32 %21, f1065, f1067; +add.f32 %20, f1063, f1061; +sub.f32 %23, f1075, f1077; +add.f32 %22, f1073, f1071; +add.f32 %24, f1083, f1081; +sub.f32 %25, f1085, f1087; +add.f32 %26, f1093, f1091; +sub.f32 %27, f1095, f1097; +add.f32 %28, f1103, f1101; +sub.f32 %29, f1105, f1107; +add.f32 %30, f1113, f1111; +sub.f32 %31, f1115, f1117; +sub.f32 %33, f1125, f1127; +add.f32 %32, f1123, f1121; +sub.f32 %35, f1135, f1137; +add.f32 %34, f1133, f1131; +add.f32 %37, f1057, f1055; +sub.f32 %36, f1051, f1053; +add.f32 %39, f1067, f1065; +sub.f32 %38, f1061, f1063; +add.f32 %41, f1077, f1075; +sub.f32 %40, f1071, f1073; +add.f32 %43, f1087, f1085; +sub.f32 %42, f1081, f1083; +add.f32 %45, f1097, f1095; +sub.f32 %44, f1091, f1093; +add.f32 %47, f1107, f1105; +sub.f32 %46, f1101, f1103; +add.f32 %49, f1117, f1115; +sub.f32 %48, f1111, f1113; +add.f32 %51, f1127, f1125; +sub.f32 %50, f1121, f1123; +add.f32 %53, f1137, f1135; +sub.f32 %52, f1131, f1133; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), "=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), 
"f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<330, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<1316>; +.reg .b32 r<14>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 324, r13; +add.f32 f109, %74, %92; +add.f32 f110, %56, f109; +mul.f32 f113, f109, 0f3F000000; +sub.f32 f114, %56, f113; +add.f32 f1307, %75, %93; +sub.f32 f115, %75, %93; +mul.f32 f116, f115, 0fBF5DB3D7; +add.f32 f117, f116, f114; +sub.f32 f118, f114, f116; +add.f32 f1306, %57, f1307; +mul.f32 f119, f1307, 0f3F000000; +sub.f32 f120, %57, f119; +sub.f32 f121, %74, %92; +mul.f32 f122, f121, 0fBF5DB3D7; +sub.f32 f123, f120, f122; +add.f32 f124, f122, f120; +add.f32 f125, %80, %98; +add.f32 f126, %62, f125; +mul.f32 f129, f125, 0f3F000000; +sub.f32 f130, %62, f129; +add.f32 f1305, %81, %99; +sub.f32 f131, %81, %99; +mul.f32 f132, f131, 0fBF5DB3D7; +add.f32 f133, f132, f130; +sub.f32 f134, f130, f132; +add.f32 f1304, %63, f1305; +mul.f32 f135, f1305, 0f3F000000; +sub.f32 f136, %63, f135; +sub.f32 f137, %80, %98; +mul.f32 f138, f137, 0fBF5DB3D7; +sub.f32 f139, f136, f138; +add.f32 f140, f138, f136; +add.f32 f141, %86, %104; +add.f32 f142, %68, f141; +mul.f32 f145, f141, 0f3F000000; +sub.f32 f146, %68, f145; +add.f32 f1303, %87, %105; +sub.f32 f147, %87, %105; +mul.f32 f148, f147, 0fBF5DB3D7; +add.f32 f149, f148, f146; +sub.f32 f150, f146, f148; +add.f32 f1302, %69, f1303; +mul.f32 f151, f1303, 0f3F000000; +sub.f32 f152, %69, f151; +sub.f32 f153, %86, %104; +mul.f32 f154, f153, 0fBF5DB3D7; +sub.f32 f155, f152, f154; +add.f32 f156, f154, f152; +mul.f32 f158, f139, 0f3F248DBB; +mul.f32 f1301, f133, 0f3F441B7D; +sub.f32 f159, f1301, f158; +mul.f32 f160, f139, 0f3F441B7D; +fma.rn.f32 f161, f133, 0f3F248DBB, f160; +mul.f32 f1299, f149, 0f3E31D0D4; +mul.f32 f1300, f155, 0f3F7C1C5C; +sub.f32 f164, f1299, f1300; +mul.f32 f165, f155, 0f3E31D0D4; +fma.rn.f32 f166, f149, 0f3F7C1C5C, 
f165; +mul.f32 f1297, f134, 0f3E31D0D4; +mul.f32 f1298, f140, 0f3F7C1C5C; +sub.f32 f169, f1297, f1298; +mul.f32 f170, f140, 0f3E31D0D4; +fma.rn.f32 f171, f134, 0f3F7C1C5C, f170; +mul.f32 f1295, f150, 0fBF708FB2; +mul.f32 f1296, f156, 0f3EAF1D44; +sub.f32 f174, f1295, f1296; +mul.f32 f175, f156, 0fBF708FB2; +fma.rn.f32 f176, f150, 0f3EAF1D44, f175; +add.f32 f177, f126, f142; +add.f32 f178, f110, f177; +mul.f32 f181, f177, 0f3F000000; +sub.f32 f182, f110, f181; +add.f32 f1294, f1304, f1302; +sub.f32 f183, f1304, f1302; +mul.f32 f184, f183, 0fBF5DB3D7; +add.f32 f185, f184, f182; +sub.f32 f186, f182, f184; +add.f32 f1293, f1306, f1294; +mul.f32 f187, f1294, 0f3F000000; +sub.f32 f188, f1306, f187; +sub.f32 f189, f126, f142; +mul.f32 f190, f189, 0fBF5DB3D7; +sub.f32 f191, f188, f190; +add.f32 f192, f190, f188; +add.f32 f193, f159, f164; +add.f32 f194, f117, f193; +mul.f32 f197, f193, 0f3F000000; +sub.f32 f198, f117, f197; +add.f32 f1292, f161, f166; +sub.f32 f199, f161, f166; +mul.f32 f200, f199, 0fBF5DB3D7; +add.f32 f201, f200, f198; +sub.f32 f202, f198, f200; +add.f32 f1291, f123, f1292; +mul.f32 f203, f1292, 0f3F000000; +sub.f32 f204, f123, f203; +sub.f32 f205, f159, f164; +mul.f32 f206, f205, 0fBF5DB3D7; +sub.f32 f207, f204, f206; +add.f32 f208, f206, f204; +add.f32 f209, f169, f174; +add.f32 f210, f118, f209; +mul.f32 f213, f209, 0f3F000000; +sub.f32 f214, f118, f213; +add.f32 f1290, f171, f176; +sub.f32 f215, f171, f176; +mul.f32 f216, f215, 0fBF5DB3D7; +add.f32 f217, f216, f214; +sub.f32 f218, f214, f216; +add.f32 f1289, f124, f1290; +mul.f32 f219, f1290, 0f3F000000; +sub.f32 f220, f124, f219; +sub.f32 f221, f169, f174; +mul.f32 f222, f221, 0fBF5DB3D7; +sub.f32 f223, f220, f222; +add.f32 f224, f222, f220; +add.f32 f225, %76, %94; +add.f32 f226, %58, f225; +mul.f32 f229, f225, 0f3F000000; +sub.f32 f230, %58, f229; +add.f32 f1286, %111, %110; +sub.f32 f231, %111, %110; +mul.f32 f232, f231, 0fBF5DB3D7; +add.f32 f233, f232, f230; +sub.f32 f234, f230, f232; +add.f32 
f1284, %112, f1286; +mul.f32 f235, f1286, 0f3F000000; +sub.f32 f236, %112, f235; +sub.f32 f237, %76, %94; +mul.f32 f238, f237, 0fBF5DB3D7; +sub.f32 f239, f236, f238; +add.f32 f240, f238, f236; +add.f32 f241, %82, %100; +add.f32 f242, %64, f241; +mul.f32 f245, f241, 0f3F000000; +sub.f32 f246, %64, f245; +add.f32 f1281, %114, %113; +sub.f32 f247, %114, %113; +mul.f32 f248, f247, 0fBF5DB3D7; +add.f32 f249, f248, f246; +sub.f32 f250, f246, f248; +add.f32 f1279, %115, f1281; +mul.f32 f251, f1281, 0f3F000000; +sub.f32 f252, %115, f251; +sub.f32 f253, %82, %100; +mul.f32 f254, f253, 0fBF5DB3D7; +sub.f32 f255, f252, f254; +add.f32 f256, f254, f252; +add.f32 f257, %88, %106; +add.f32 f258, %70, f257; +mul.f32 f261, f257, 0f3F000000; +sub.f32 f262, %70, f261; +add.f32 f1276, %116, %117; +sub.f32 f263, %116, %117; +mul.f32 f264, f263, 0fBF5DB3D7; +add.f32 f265, f264, f262; +sub.f32 f266, f262, f264; +add.f32 f1274, %118, f1276; +mul.f32 f267, f1276, 0f3F000000; +sub.f32 f268, %118, f267; +sub.f32 f269, %88, %106; +mul.f32 f270, f269, 0fBF5DB3D7; +sub.f32 f271, f268, f270; +add.f32 f272, f270, f268; +mul.f32 f274, f255, 0f3F248DBB; +mul.f32 f1273, f249, 0f3F441B7D; +sub.f32 f275, f1273, f274; +mul.f32 f276, f255, 0f3F441B7D; +fma.rn.f32 f277, f249, 0f3F248DBB, f276; +mul.f32 f279, f271, 0f3F7C1C5C; +mul.f32 f1272, f265, 0f3E31D0D4; +sub.f32 f280, f1272, f279; +mul.f32 f281, f271, 0f3E31D0D4; +fma.rn.f32 f282, f265, 0f3F7C1C5C, f281; +mul.f32 f1270, f250, 0f3E31D0D4; +mul.f32 f1271, f256, 0f3F7C1C5C; +sub.f32 f285, f1270, f1271; +mul.f32 f286, f256, 0f3E31D0D4; +fma.rn.f32 f287, f250, 0f3F7C1C5C, f286; +mul.f32 f1268, f266, 0fBF708FB2; +mul.f32 f1269, f272, 0f3EAF1D44; +sub.f32 f290, f1268, f1269; +mul.f32 f291, f272, 0fBF708FB2; +fma.rn.f32 f292, f266, 0f3EAF1D44, f291; +add.f32 f293, f242, f258; +add.f32 f294, f226, f293; +mul.f32 f297, f293, 0f3F000000; +sub.f32 f298, f226, f297; +add.f32 f1267, f1279, f1274; +sub.f32 f299, f1279, f1274; +mul.f32 f300, f299, 0fBF5DB3D7; 
+add.f32 f301, f300, f298; +sub.f32 f302, f298, f300; +add.f32 f1266, f1284, f1267; +mul.f32 f303, f1267, 0f3F000000; +sub.f32 f304, f1284, f303; +sub.f32 f305, f242, f258; +mul.f32 f306, f305, 0fBF5DB3D7; +sub.f32 f307, f304, f306; +add.f32 f308, f306, f304; +add.f32 f309, f275, f280; +add.f32 f310, f233, f309; +mul.f32 f313, f309, 0f3F000000; +sub.f32 f314, f233, f313; +add.f32 f1265, f277, f282; +sub.f32 f315, f277, f282; +mul.f32 f316, f315, 0fBF5DB3D7; +add.f32 f317, f316, f314; +sub.f32 f318, f314, f316; +add.f32 f1264, f239, f1265; +mul.f32 f319, f1265, 0f3F000000; +sub.f32 f320, f239, f319; +sub.f32 f321, f275, f280; +mul.f32 f322, f321, 0fBF5DB3D7; +sub.f32 f323, f320, f322; +add.f32 f324, f322, f320; +add.f32 f325, f285, f290; +add.f32 f326, f234, f325; +mul.f32 f329, f325, 0f3F000000; +sub.f32 f330, f234, f329; +add.f32 f1263, f287, f292; +sub.f32 f331, f287, f292; +mul.f32 f332, f331, 0fBF5DB3D7; +add.f32 f333, f332, f330; +sub.f32 f334, f330, f332; +add.f32 f1262, f240, f1263; +mul.f32 f335, f1263, 0f3F000000; +sub.f32 f336, f240, f335; +sub.f32 f337, f285, f290; +mul.f32 f338, f337, 0fBF5DB3D7; +sub.f32 f339, f336, f338; +add.f32 f340, f338, f336; +add.f32 f341, %78, %96; +add.f32 f342, %60, f341; +mul.f32 f345, f341, 0f3F000000; +sub.f32 f346, %60, f345; +add.f32 f1259, %119, %120; +sub.f32 f347, %119, %120; +mul.f32 f348, f347, 0fBF5DB3D7; +add.f32 f349, f348, f346; +sub.f32 f350, f346, f348; +add.f32 f1257, %121, f1259; +mul.f32 f351, f1259, 0f3F000000; +sub.f32 f352, %121, f351; +sub.f32 f353, %78, %96; +mul.f32 f354, f353, 0fBF5DB3D7; +sub.f32 f355, f352, f354; +add.f32 f356, f354, f352; +add.f32 f357, %84, %102; +add.f32 f358, %66, f357; +mul.f32 f361, f357, 0f3F000000; +sub.f32 f362, %66, f361; +add.f32 f1254, %123, %122; +sub.f32 f363, %123, %122; +mul.f32 f364, f363, 0fBF5DB3D7; +add.f32 f365, f364, f362; +sub.f32 f366, f362, f364; +add.f32 f1252, %124, f1254; +mul.f32 f367, f1254, 0f3F000000; +sub.f32 f368, %124, f367; +sub.f32 f369, %84, 
%102; +mul.f32 f370, f369, 0fBF5DB3D7; +sub.f32 f371, f368, f370; +add.f32 f372, f370, f368; +add.f32 f373, %90, %108; +add.f32 f374, %72, f373; +mul.f32 f377, f373, 0f3F000000; +sub.f32 f378, %72, f377; +add.f32 f1250, %125, %109; +sub.f32 f379, %125, %109; +mul.f32 f380, f379, 0fBF5DB3D7; +add.f32 f381, f380, f378; +sub.f32 f382, f378, f380; +add.f32 f1248, %126, f1250; +mul.f32 f383, f1250, 0f3F000000; +sub.f32 f384, %126, f383; +sub.f32 f385, %90, %108; +mul.f32 f386, f385, 0fBF5DB3D7; +sub.f32 f387, f384, f386; +add.f32 f388, f386, f384; +mul.f32 f390, f371, 0f3F248DBB; +mul.f32 f1247, f365, 0f3F441B7D; +sub.f32 f391, f1247, f390; +mul.f32 f392, f371, 0f3F441B7D; +fma.rn.f32 f393, f365, 0f3F248DBB, f392; +mul.f32 f395, f387, 0f3F7C1C5C; +mul.f32 f1246, f381, 0f3E31D0D4; +sub.f32 f396, f1246, f395; +mul.f32 f397, f387, 0f3E31D0D4; +fma.rn.f32 f398, f381, 0f3F7C1C5C, f397; +mul.f32 f1244, f366, 0f3E31D0D4; +mul.f32 f1245, f372, 0f3F7C1C5C; +sub.f32 f401, f1244, f1245; +mul.f32 f402, f372, 0f3E31D0D4; +fma.rn.f32 f403, f366, 0f3F7C1C5C, f402; +mul.f32 f1242, f382, 0fBF708FB2; +mul.f32 f1243, f388, 0f3EAF1D44; +sub.f32 f406, f1242, f1243; +mul.f32 f407, f388, 0fBF708FB2; +fma.rn.f32 f408, f382, 0f3EAF1D44, f407; +add.f32 f409, f358, f374; +add.f32 f410, f342, f409; +mul.f32 f413, f409, 0f3F000000; +sub.f32 f414, f342, f413; +add.f32 f1241, f1252, f1248; +sub.f32 f415, f1252, f1248; +mul.f32 f416, f415, 0fBF5DB3D7; +add.f32 f417, f416, f414; +sub.f32 f418, f414, f416; +add.f32 f1240, f1257, f1241; +mul.f32 f419, f1241, 0f3F000000; +sub.f32 f420, f1257, f419; +sub.f32 f421, f358, f374; +mul.f32 f422, f421, 0fBF5DB3D7; +sub.f32 f423, f420, f422; +add.f32 f424, f422, f420; +add.f32 f425, f391, f396; +add.f32 f426, f349, f425; +mul.f32 f429, f425, 0f3F000000; +sub.f32 f430, f349, f429; +add.f32 f1239, f393, f398; +sub.f32 f431, f393, f398; +mul.f32 f432, f431, 0fBF5DB3D7; +add.f32 f433, f432, f430; +sub.f32 f434, f430, f432; +add.f32 f1238, f355, f1239; +mul.f32 f435, 
f1239, 0f3F000000; +sub.f32 f436, f355, f435; +sub.f32 f437, f391, f396; +mul.f32 f438, f437, 0fBF5DB3D7; +sub.f32 f439, f436, f438; +add.f32 f440, f438, f436; +add.f32 f441, f401, f406; +add.f32 f442, f350, f441; +mul.f32 f445, f441, 0f3F000000; +sub.f32 f446, f350, f445; +add.f32 f1237, f403, f408; +sub.f32 f447, f403, f408; +mul.f32 f448, f447, 0fBF5DB3D7; +add.f32 f449, f448, f446; +sub.f32 f450, f446, f448; +add.f32 f1236, f356, f1237; +mul.f32 f451, f1237, 0f3F000000; +sub.f32 f452, f356, f451; +sub.f32 f453, f401, f406; +mul.f32 f454, f453, 0fBF5DB3D7; +sub.f32 f455, f452, f454; +add.f32 f456, f454, f452; +mul.f32 f458, f1264, 0f3E6C2691; +mul.f32 f1235, f310, 0f3F791978; +sub.f32 f459, f1235, f458; +mul.f32 f460, f1264, 0f3F791978; +fma.rn.f32 f461, f310, 0f3E6C2691, f460; +mul.f32 f1233, f426, 0f3F64C51C; +mul.f32 f1234, f1238, 0f3EE5C902; +sub.f32 f464, f1233, f1234; +mul.f32 f465, f1238, 0f3F64C51C; +fma.rn.f32 f466, f426, 0f3EE5C902, f465; +mul.f32 f1231, f326, 0f3F64C51C; +mul.f32 f1232, f1262, 0f3EE5C902; +sub.f32 f469, f1231, f1232; +mul.f32 f470, f1262, 0f3F64C51C; +fma.rn.f32 f471, f326, 0f3EE5C902, f470; +mul.f32 f1229, f442, 0f3F18DF63; +mul.f32 f1230, f1236, 0f3F4D57F2; +sub.f32 f474, f1229, f1230; +mul.f32 f475, f1236, 0f3F18DF63; +fma.rn.f32 f476, f442, 0f3F4D57F2, f475; +mul.f32 f1227, f301, 0f3F441B7D; +mul.f32 f1228, f307, 0f3F248DBB; +sub.f32 f479, f1227, f1228; +mul.f32 f480, f307, 0f3F441B7D; +fma.rn.f32 f481, f301, 0f3F248DBB, f480; +mul.f32 f483, f423, 0f3F7C1C5C; +mul.f32 f1226, f417, 0f3E31D0D4; +sub.f32 f484, f1226, f483; +mul.f32 f485, f423, 0f3E31D0D4; +fma.rn.f32 f486, f417, 0f3F7C1C5C, f485; +mul.f32 f488, f323, 0f3F4D57F2; +mul.f32 f1225, f317, 0f3F18DF63; +sub.f32 f489, f1225, f488; +mul.f32 f490, f323, 0f3F18DF63; +fma.rn.f32 f491, f317, 0f3F4D57F2, f490; +mul.f32 f493, f439, 0f3F753ECD; +mul.f32 f1224, f433, 0fBE92D7E0; +sub.f32 f494, f1224, f493; +mul.f32 f495, f439, 0fBE92D7E0; +fma.rn.f32 f496, f433, 0f3F753ECD, f495; 
+mul.f32 f498, f339, 0f3F6B1036; +mul.f32 f1223, f333, 0f3ECACAF8; +sub.f32 f499, f1223, f498; +mul.f32 f500, f339, 0f3ECACAF8; +fma.rn.f32 f501, f333, 0f3F6B1036, f500; +mul.f32 f503, f455, 0f3F3A3529; +mul.f32 f1222, f449, 0fBF2FAD88; +sub.f32 f504, f1222, f503; +mul.f32 f505, f455, 0fBF2FAD88; +fma.rn.f32 f506, f449, 0f3F3A3529, f505; +mul.f32 f508, f308, 0f3F7C1C5C; +mul.f32 f1221, f302, 0f3E31D0D4; +sub.f32 f509, f1221, f508; +mul.f32 f510, f308, 0f3E31D0D4; +fma.rn.f32 f511, f302, 0f3F7C1C5C, f510; +mul.f32 f1219, f418, 0fBF708FB2; +mul.f32 f1220, f424, 0f3EAF1D44; +sub.f32 f514, f1219, f1220; +mul.f32 f515, f424, 0fBF708FB2; +fma.rn.f32 f516, f418, 0f3EAF1D44, f515; +mul.f32 f1217, f318, 0fBD6E2946; +mul.f32 f1218, f324, 0f3F7F9120; +sub.f32 f519, f1217, f1218; +mul.f32 f520, f324, 0fBD6E2946; +fma.rn.f32 f521, f318, 0f3F7F9120, f520; +mul.f32 f1215, f434, 0fBF7E44DE; +mul.f32 f1216, f440, 0fBDEDC21F; +sub.f32 f524, f1215, f1216; +mul.f32 f525, f440, 0fBF7E44DE; +fma.rn.f32 f526, f434, 0fBDEDC21F, f525; +mul.f32 f528, f340, 0f3F753ECD; +mul.f32 f1214, f334, 0fBE92D7E0; +sub.f32 f529, f1214, f528; +mul.f32 f530, f340, 0fBE92D7E0; +fma.rn.f32 f531, f334, 0f3F753ECD, f530; +mul.f32 f533, f456, 0fBF0CAC9F; +mul.f32 f1213, f450, 0fBF55E287; +sub.f32 f534, f1213, f533; +mul.f32 f535, f456, 0fBF55E287; +fma.rn.f32 f536, f450, 0fBF0CAC9F, f535; +add.f32 f537, f294, f410; +add.f32 f538, f178, f537; +mul.f32 f541, f537, 0f3F000000; +sub.f32 f542, f178, f541; +add.f32 f1212, f1266, f1240; +sub.f32 f543, f1266, f1240; +mul.f32 f544, f543, 0fBF5DB3D7; +add.f32 f545, f544, f542; +sub.f32 f546, f542, f544; +add.f32 f1211, f1293, f1212; +mul.f32 f547, f1212, 0f3F000000; +sub.f32 f548, f1293, f547; +sub.f32 f549, f294, f410; +mul.f32 f550, f549, 0fBF5DB3D7; +sub.f32 f551, f548, f550; +add.f32 f552, f550, f548; +add.f32 f553, f459, f464; +add.f32 f554, f194, f553; +mul.f32 f557, f553, 0f3F000000; +sub.f32 f558, f194, f557; +add.f32 f1210, f461, f466; +sub.f32 f559, f461, 
f466; +mul.f32 f560, f559, 0fBF5DB3D7; +add.f32 f561, f560, f558; +sub.f32 f562, f558, f560; +add.f32 f1209, f1291, f1210; +mul.f32 f563, f1210, 0f3F000000; +sub.f32 f564, f1291, f563; +sub.f32 f565, f459, f464; +mul.f32 f566, f565, 0fBF5DB3D7; +sub.f32 f567, f564, f566; +add.f32 f568, f566, f564; +add.f32 f569, f469, f474; +add.f32 f570, f210, f569; +mul.f32 f573, f569, 0f3F000000; +sub.f32 f574, f210, f573; +add.f32 f1208, f471, f476; +sub.f32 f575, f471, f476; +mul.f32 f576, f575, 0fBF5DB3D7; +add.f32 f577, f576, f574; +sub.f32 f578, f574, f576; +add.f32 f1207, f1289, f1208; +mul.f32 f579, f1208, 0f3F000000; +sub.f32 f580, f1289, f579; +sub.f32 f581, f469, f474; +mul.f32 f582, f581, 0fBF5DB3D7; +sub.f32 f583, f580, f582; +add.f32 f584, f582, f580; +add.f32 f585, f479, f484; +add.f32 f586, f185, f585; +mul.f32 f589, f585, 0f3F000000; +sub.f32 f590, f185, f589; +add.f32 f1206, f481, f486; +sub.f32 f591, f481, f486; +mul.f32 f592, f591, 0fBF5DB3D7; +add.f32 f593, f592, f590; +sub.f32 f594, f590, f592; +add.f32 f1205, f191, f1206; +mul.f32 f595, f1206, 0f3F000000; +sub.f32 f596, f191, f595; +sub.f32 f597, f479, f484; +mul.f32 f598, f597, 0fBF5DB3D7; +sub.f32 f599, f596, f598; +add.f32 f600, f598, f596; +add.f32 f601, f489, f494; +add.f32 f602, f201, f601; +mul.f32 f605, f601, 0f3F000000; +sub.f32 f606, f201, f605; +add.f32 f1204, f491, f496; +sub.f32 f607, f491, f496; +mul.f32 f608, f607, 0fBF5DB3D7; +add.f32 f609, f608, f606; +sub.f32 f610, f606, f608; +add.f32 f1203, f207, f1204; +mul.f32 f611, f1204, 0f3F000000; +sub.f32 f612, f207, f611; +sub.f32 f613, f489, f494; +mul.f32 f614, f613, 0fBF5DB3D7; +sub.f32 f615, f612, f614; +add.f32 f616, f614, f612; +add.f32 f617, f499, f504; +add.f32 f618, f217, f617; +mul.f32 f621, f617, 0f3F000000; +sub.f32 f622, f217, f621; +add.f32 f1202, f501, f506; +sub.f32 f623, f501, f506; +mul.f32 f624, f623, 0fBF5DB3D7; +add.f32 f625, f624, f622; +sub.f32 f626, f622, f624; +add.f32 f1201, f223, f1202; +mul.f32 f627, f1202, 0f3F000000; 
+sub.f32 f628, f223, f627; +sub.f32 f629, f499, f504; +mul.f32 f630, f629, 0fBF5DB3D7; +sub.f32 f631, f628, f630; +add.f32 f632, f630, f628; +add.f32 f633, f509, f514; +add.f32 f634, f186, f633; +mul.f32 f637, f633, 0f3F000000; +sub.f32 f638, f186, f637; +add.f32 f1200, f511, f516; +sub.f32 f639, f511, f516; +mul.f32 f640, f639, 0fBF5DB3D7; +add.f32 f641, f640, f638; +sub.f32 f642, f638, f640; +add.f32 f1199, f192, f1200; +mul.f32 f643, f1200, 0f3F000000; +sub.f32 f644, f192, f643; +sub.f32 f645, f509, f514; +mul.f32 f646, f645, 0fBF5DB3D7; +sub.f32 f647, f644, f646; +add.f32 f648, f646, f644; +add.f32 f649, f519, f524; +add.f32 f650, f202, f649; +mul.f32 f653, f649, 0f3F000000; +sub.f32 f654, f202, f653; +add.f32 f1198, f521, f526; +sub.f32 f655, f521, f526; +mul.f32 f656, f655, 0fBF5DB3D7; +add.f32 f657, f656, f654; +sub.f32 f658, f654, f656; +add.f32 f1197, f208, f1198; +mul.f32 f659, f1198, 0f3F000000; +sub.f32 f660, f208, f659; +sub.f32 f661, f519, f524; +mul.f32 f662, f661, 0fBF5DB3D7; +sub.f32 f663, f660, f662; +add.f32 f664, f662, f660; +add.f32 f665, f529, f534; +add.f32 f666, f218, f665; +mul.f32 f669, f665, 0f3F000000; +sub.f32 f670, f218, f669; +add.f32 f1196, f531, f536; +sub.f32 f671, f531, f536; +mul.f32 f672, f671, 0fBF5DB3D7; +add.f32 f673, f672, f670; +sub.f32 f674, f670, f672; +add.f32 f1195, f224, f1196; +mul.f32 f675, f1196, 0f3F000000; +sub.f32 f676, f224, f675; +sub.f32 f677, f529, f534; +mul.f32 f678, f677, 0fBF5DB3D7; +sub.f32 f679, f676, f678; +add.f32 f680, f678, f676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 8; +add.s64 rd6, rd5, rd7; +ld.global.v2.f32 {f681, f682}, [rd6]; +mul.f32 f685, f1209, f682; +fma.rn.f32 f686, f681, f554, f685; +mul.f32 f687, f554, f682; +mul.f32 f688, f681, f1209; +sub.f32 f689, f688, f687; +mul.f32 f691, f682, f682; +mul.f32 f1194, f681, f681; +sub.f32 f692, 
f1194, f691; +mul.f32 f693, f682, f681; +fma.rn.f32 f694, f682, f681, f693; +mul.f32 f695, f1207, f694; +fma.rn.f32 f696, f692, f570, f695; +mul.f32 f697, f570, f694; +mul.f32 f698, f692, f1207; +sub.f32 f699, f698, f697; +mul.f32 f701, f682, f694; +mul.f32 f1193, f681, f692; +sub.f32 f702, f1193, f701; +mul.f32 f703, f681, f694; +fma.rn.f32 f704, f682, f692, f703; +mul.f32 f705, f1205, f704; +fma.rn.f32 f706, f702, f586, f705; +mul.f32 f707, f586, f704; +mul.f32 f708, f702, f1205; +sub.f32 f709, f708, f707; +mul.f32 f1191, f681, f702; +mul.f32 f1192, f682, f704; +sub.f32 f712, f1191, f1192; +mul.f32 f713, f681, f704; +fma.rn.f32 f714, f682, f702, f713; +mul.f32 f715, f1203, f714; +fma.rn.f32 f716, f712, f602, f715; +mul.f32 f717, f602, f714; +mul.f32 f718, f712, f1203; +sub.f32 f719, f718, f717; +mul.f32 f1189, f681, f712; +mul.f32 f1190, f682, f714; +sub.f32 f722, f1189, f1190; +mul.f32 f723, f681, f714; +fma.rn.f32 f724, f682, f712, f723; +mul.f32 f725, f1201, f724; +fma.rn.f32 f726, f722, f618, f725; +mul.f32 f727, f618, f724; +mul.f32 f728, f722, f1201; +sub.f32 f729, f728, f727; +mul.f32 f731, f682, f724; +mul.f32 f1188, f681, f722; +sub.f32 f732, f1188, f731; +mul.f32 f733, f681, f724; +fma.rn.f32 f734, f682, f722, f733; +mul.f32 f735, f1199, f734; +fma.rn.f32 f736, f732, f634, f735; +mul.f32 f737, f634, f734; +mul.f32 f738, f732, f1199; +sub.f32 f739, f738, f737; +mul.f32 f741, f682, f734; +mul.f32 f1187, f681, f732; +sub.f32 f742, f1187, f741; +mul.f32 f743, f681, f734; +fma.rn.f32 f744, f682, f732, f743; +mul.f32 f745, f1197, f744; +fma.rn.f32 f746, f742, f650, f745; +mul.f32 f747, f650, f744; +mul.f32 f748, f742, f1197; +sub.f32 f749, f748, f747; +mul.f32 f751, f682, f744; +mul.f32 f1186, f681, f742; +sub.f32 f752, f1186, f751; +mul.f32 f753, f681, f744; +fma.rn.f32 f754, f682, f742, f753; +mul.f32 f755, f1195, f754; +fma.rn.f32 f756, f752, f666, f755; +mul.f32 f757, f666, f754; +mul.f32 f758, f752, f1195; +sub.f32 f759, f758, f757; +mul.f32 f1184, f681, 
f752; +mul.f32 f1185, f682, f754; +sub.f32 f762, f1184, f1185; +mul.f32 f763, f681, f754; +fma.rn.f32 f764, f682, f752, f763; +mul.f32 f765, f551, f764; +fma.rn.f32 f766, f762, f545, f765; +mul.f32 f767, f545, f764; +mul.f32 f768, f762, f551; +sub.f32 f769, f768, f767; +mul.f32 f1182, f681, f762; +mul.f32 f1183, f682, f764; +sub.f32 f772, f1182, f1183; +mul.f32 f773, f681, f764; +fma.rn.f32 f774, f682, f762, f773; +mul.f32 f775, f567, f774; +fma.rn.f32 f776, f772, f561, f775; +mul.f32 f777, f561, f774; +mul.f32 f778, f772, f567; +sub.f32 f779, f778, f777; +mul.f32 f781, f682, f774; +mul.f32 f1181, f681, f772; +sub.f32 f782, f1181, f781; +mul.f32 f783, f681, f774; +fma.rn.f32 f784, f682, f772, f783; +mul.f32 f785, f583, f784; +fma.rn.f32 f786, f782, f577, f785; +mul.f32 f787, f577, f784; +mul.f32 f788, f782, f583; +sub.f32 f789, f788, f787; +mul.f32 f791, f682, f784; +mul.f32 f1180, f681, f782; +sub.f32 f792, f1180, f791; +mul.f32 f793, f681, f784; +fma.rn.f32 f794, f682, f782, f793; +mul.f32 f795, f599, f794; +fma.rn.f32 f796, f792, f593, f795; +mul.f32 f797, f593, f794; +mul.f32 f798, f792, f599; +sub.f32 f799, f798, f797; +mul.f32 f801, f682, f794; +mul.f32 f1179, f681, f792; +sub.f32 f802, f1179, f801; +mul.f32 f803, f681, f794; +fma.rn.f32 f804, f682, f792, f803; +mul.f32 f805, f615, f804; +fma.rn.f32 f806, f802, f609, f805; +mul.f32 f807, f609, f804; +mul.f32 f808, f802, f615; +sub.f32 f809, f808, f807; +mul.f32 f1177, f681, f802; +mul.f32 f1178, f682, f804; +sub.f32 f812, f1177, f1178; +mul.f32 f813, f681, f804; +fma.rn.f32 f814, f682, f802, f813; +mul.f32 f815, f631, f814; +fma.rn.f32 f816, f812, f625, f815; +mul.f32 f817, f625, f814; +mul.f32 f818, f812, f631; +sub.f32 f819, f818, f817; +mul.f32 f1175, f681, f812; +mul.f32 f1176, f682, f814; +sub.f32 f822, f1175, f1176; +mul.f32 f823, f681, f814; +fma.rn.f32 f824, f682, f812, f823; +mul.f32 f825, f647, f824; +fma.rn.f32 f826, f822, f641, f825; +mul.f32 f827, f641, f824; +mul.f32 f828, f822, f647; +sub.f32 
f829, f828, f827; +mul.f32 f831, f682, f824; +mul.f32 f1174, f681, f822; +sub.f32 f832, f1174, f831; +mul.f32 f833, f681, f824; +fma.rn.f32 f834, f682, f822, f833; +mul.f32 f835, f663, f834; +fma.rn.f32 f836, f832, f657, f835; +mul.f32 f837, f657, f834; +mul.f32 f838, f832, f663; +sub.f32 f839, f838, f837; +mul.f32 f841, f682, f834; +mul.f32 f1173, f681, f832; +sub.f32 f842, f1173, f841; +mul.f32 f843, f681, f834; +fma.rn.f32 f844, f682, f832, f843; +mul.f32 f845, f679, f844; +fma.rn.f32 f846, f842, f673, f845; +mul.f32 f847, f673, f844; +mul.f32 f848, f842, f679; +sub.f32 f849, f848, f847; +mul.f32 f1171, f681, f842; +mul.f32 f1172, f682, f844; +sub.f32 f852, f1171, f1172; +mul.f32 f853, f681, f844; +fma.rn.f32 f854, f682, f842, f853; +mul.f32 f855, f552, f854; +fma.rn.f32 f856, f852, f546, f855; +mul.f32 f857, f546, f854; +mul.f32 f858, f852, f552; +sub.f32 f859, f858, f857; +mul.f32 f1169, f681, f852; +mul.f32 f1170, f682, f854; +sub.f32 f862, f1169, f1170; +mul.f32 f863, f681, f854; +fma.rn.f32 f864, f682, f852, f863; +mul.f32 f865, f568, f864; +fma.rn.f32 f866, f862, f562, f865; +mul.f32 f867, f562, f864; +mul.f32 f868, f862, f568; +sub.f32 f869, f868, f867; +mul.f32 f871, f682, f864; +mul.f32 f1168, f681, f862; +sub.f32 f872, f1168, f871; +mul.f32 f873, f681, f864; +fma.rn.f32 f874, f682, f862, f873; +mul.f32 f875, f584, f874; +fma.rn.f32 f876, f872, f578, f875; +mul.f32 f877, f578, f874; +mul.f32 f878, f872, f584; +sub.f32 f879, f878, f877; +mul.f32 f881, f682, f874; +mul.f32 f1167, f681, f872; +sub.f32 f882, f1167, f881; +mul.f32 f883, f681, f874; +fma.rn.f32 f884, f682, f872, f883; +mul.f32 f885, f600, f884; +fma.rn.f32 f886, f882, f594, f885; +mul.f32 f887, f594, f884; +mul.f32 f888, f882, f600; +sub.f32 f889, f888, f887; +mul.f32 f891, f682, f884; +mul.f32 f1166, f681, f882; +sub.f32 f892, f1166, f891; +mul.f32 f893, f681, f884; +fma.rn.f32 f894, f682, f882, f893; +mul.f32 f895, f616, f894; +fma.rn.f32 f896, f892, f610, f895; +mul.f32 f897, f610, f894; 
+mul.f32 f898, f892, f616; +sub.f32 f899, f898, f897; +mul.f32 f1164, f681, f892; +mul.f32 f1165, f682, f894; +sub.f32 f902, f1164, f1165; +mul.f32 f903, f681, f894; +fma.rn.f32 f904, f682, f892, f903; +mul.f32 f905, f632, f904; +fma.rn.f32 f906, f902, f626, f905; +mul.f32 f907, f626, f904; +mul.f32 f908, f902, f632; +sub.f32 f909, f908, f907; +mul.f32 f1162, f681, f902; +mul.f32 f1163, f682, f904; +sub.f32 f912, f1162, f1163; +mul.f32 f913, f681, f904; +fma.rn.f32 f914, f682, f902, f913; +mul.f32 f915, f648, f914; +fma.rn.f32 f916, f912, f642, f915; +mul.f32 f917, f642, f914; +mul.f32 f918, f912, f648; +sub.f32 f919, f918, f917; +mul.f32 f921, f682, f914; +mul.f32 f1161, f681, f912; +sub.f32 f922, f1161, f921; +mul.f32 f923, f681, f914; +fma.rn.f32 f924, f682, f912, f923; +mul.f32 f925, f664, f924; +fma.rn.f32 f926, f922, f658, f925; +mul.f32 f927, f658, f924; +mul.f32 f928, f922, f664; +sub.f32 f929, f928, f927; +mul.f32 f931, f682, f924; +mul.f32 f1160, f681, f922; +sub.f32 f932, f1160, f931; +mul.f32 f933, f681, f924; +fma.rn.f32 f934, f682, f922, f933; +mul.f32 f935, f680, f934; +fma.rn.f32 f936, f932, f674, f935; +mul.f32 f937, f674, f934; +mul.f32 f938, f932, f680; +sub.f32 f939, f938, f937; +mad.lo.s32 r8, r5, 324, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 108, r8; +st.shared.f32 [r9], f538; +st.shared.f32 [r9+4], f686; +st.shared.f32 [r9+8], f696; +st.shared.f32 [r9+12], f706; +st.shared.f32 [r9+16], f716; +st.shared.f32 [r9+20], f726; +st.shared.f32 [r9+24], f736; +st.shared.f32 [r9+28], f746; +st.shared.f32 [r9+32], f756; +st.shared.f32 [r9+36], f766; +st.shared.f32 [r9+40], f776; +st.shared.f32 [r9+44], f786; +st.shared.f32 [r9+48], f796; +st.shared.f32 [r9+52], f806; +st.shared.f32 [r9+56], f816; +st.shared.f32 [r9+60], f826; +st.shared.f32 [r9+64], f836; +st.shared.f32 [r9+68], f846; +st.shared.f32 [r9+72], f856; +st.shared.f32 [r9+76], f866; +st.shared.f32 [r9+80], f876; +st.shared.f32 [r9+84], f886; +st.shared.f32 [r9+88], f896; +st.shared.f32 
[r9+92], f906; +st.shared.f32 [r9+96], f916; +st.shared.f32 [r9+100], f926; +st.shared.f32 [r9+104], f936; +barrier.sync 0; +mad.lo.s32 r10, r7, -104, r9; +ld.shared.f32 f940, [r10]; +ld.shared.f32 f941, [r10+12]; +ld.shared.f32 f942, [r10+24]; +ld.shared.f32 f943, [r10+36]; +ld.shared.f32 f944, [r10+48]; +ld.shared.f32 f945, [r10+60]; +ld.shared.f32 f946, [r10+72]; +ld.shared.f32 f947, [r10+84]; +ld.shared.f32 f948, [r10+96]; +ld.shared.f32 f949, [r10+108]; +ld.shared.f32 f950, [r10+120]; +ld.shared.f32 f951, [r10+132]; +ld.shared.f32 f952, [r10+144]; +ld.shared.f32 f953, [r10+156]; +ld.shared.f32 f954, [r10+168]; +ld.shared.f32 f955, [r10+180]; +ld.shared.f32 f956, [r10+192]; +ld.shared.f32 f957, [r10+204]; +ld.shared.f32 f958, [r10+216]; +ld.shared.f32 f959, [r10+228]; +ld.shared.f32 f960, [r10+240]; +ld.shared.f32 f961, [r10+252]; +ld.shared.f32 f962, [r10+264]; +ld.shared.f32 f963, [r10+276]; +ld.shared.f32 f964, [r10+288]; +ld.shared.f32 f965, [r10+300]; +ld.shared.f32 f966, [r10+312]; +barrier.sync 0; +st.shared.f32 [r9], f1211; +st.shared.f32 [r9+4], f689; +st.shared.f32 [r9+8], f699; +st.shared.f32 [r9+12], f709; +st.shared.f32 [r9+16], f719; +st.shared.f32 [r9+20], f729; +st.shared.f32 [r9+24], f739; +st.shared.f32 [r9+28], f749; +st.shared.f32 [r9+32], f759; +st.shared.f32 [r9+36], f769; +st.shared.f32 [r9+40], f779; +st.shared.f32 [r9+44], f789; +st.shared.f32 [r9+48], f799; +st.shared.f32 [r9+52], f809; +st.shared.f32 [r9+56], f819; +st.shared.f32 [r9+60], f829; +st.shared.f32 [r9+64], f839; +st.shared.f32 [r9+68], f849; +st.shared.f32 [r9+72], f859; +st.shared.f32 [r9+76], f869; +st.shared.f32 [r9+80], f879; +st.shared.f32 [r9+84], f889; +st.shared.f32 [r9+88], f899; +st.shared.f32 [r9+92], f909; +st.shared.f32 [r9+96], f919; +st.shared.f32 [r9+100], f929; +st.shared.f32 [r9+104], f939; +barrier.sync 0; +ld.shared.f32 f967, [r10]; +ld.shared.f32 f968, [r10+12]; +ld.shared.f32 f969, [r10+24]; +ld.shared.f32 f970, [r10+36]; +ld.shared.f32 f971, 
[r10+48]; +ld.shared.f32 f972, [r10+60]; +ld.shared.f32 f973, [r10+72]; +ld.shared.f32 f974, [r10+84]; +ld.shared.f32 f975, [r10+96]; +ld.shared.f32 f976, [r10+108]; +ld.shared.f32 f977, [r10+120]; +ld.shared.f32 f978, [r10+132]; +ld.shared.f32 f979, [r10+144]; +ld.shared.f32 f980, [r10+156]; +ld.shared.f32 f981, [r10+168]; +ld.shared.f32 f982, [r10+180]; +ld.shared.f32 f983, [r10+192]; +ld.shared.f32 f984, [r10+204]; +ld.shared.f32 f985, [r10+216]; +ld.shared.f32 f986, [r10+228]; +ld.shared.f32 f987, [r10+240]; +ld.shared.f32 f988, [r10+252]; +ld.shared.f32 f989, [r10+264]; +ld.shared.f32 f990, [r10+276]; +ld.shared.f32 f991, [r10+288]; +ld.shared.f32 f992, [r10+300]; +ld.shared.f32 f993, [r10+312]; +add.f32 f994, f949, f958; +mul.f32 f996, f994, 0f3F000000; +sub.f32 f997, f940, f996; +add.f32 f1159, f976, f985; +sub.f32 f998, f976, f985; +mul.f32 f999, f998, 0fBF5DB3D7; +mul.f32 f1000, f1159, 0f3F000000; +sub.f32 f1001, f967, f1000; +sub.f32 f1002, f949, f958; +mul.f32 f1003, f1002, 0fBF5DB3D7; +add.f32 f1004, f950, f959; +mul.f32 f1006, f1004, 0f3F000000; +sub.f32 f1007, f941, f1006; +add.f32 f1158, f977, f986; +sub.f32 f1008, f977, f986; +mul.f32 f1009, f1008, 0fBF5DB3D7; +mul.f32 f1010, f1158, 0f3F000000; +sub.f32 f1011, f968, f1010; +sub.f32 f1012, f950, f959; +mul.f32 f1013, f1012, 0fBF5DB3D7; +add.f32 f1014, f951, f960; +mul.f32 f1016, f1014, 0f3F000000; +sub.f32 f1017, f942, f1016; +add.f32 f1157, f978, f987; +sub.f32 f1018, f978, f987; +mul.f32 f1019, f1018, 0fBF5DB3D7; +mul.f32 f1020, f1157, 0f3F000000; +sub.f32 f1021, f969, f1020; +sub.f32 f1022, f951, f960; +mul.f32 f1023, f1022, 0fBF5DB3D7; +add.f32 f1024, f952, f961; +mul.f32 f1026, f1024, 0f3F000000; +sub.f32 f1027, f943, f1026; +add.f32 f1156, f979, f988; +sub.f32 f1028, f979, f988; +mul.f32 f1029, f1028, 0fBF5DB3D7; +mul.f32 f1030, f1156, 0f3F000000; +sub.f32 f1031, f970, f1030; +sub.f32 f1032, f952, f961; +mul.f32 f1033, f1032, 0fBF5DB3D7; +add.f32 f1034, f953, f962; +mul.f32 f1036, f1034, 
0f3F000000; +sub.f32 f1037, f944, f1036; +add.f32 f1155, f980, f989; +sub.f32 f1038, f980, f989; +mul.f32 f1039, f1038, 0fBF5DB3D7; +mul.f32 f1040, f1155, 0f3F000000; +sub.f32 f1041, f971, f1040; +sub.f32 f1042, f953, f962; +mul.f32 f1043, f1042, 0fBF5DB3D7; +add.f32 f1044, f954, f963; +mul.f32 f1046, f1044, 0f3F000000; +sub.f32 f1047, f945, f1046; +add.f32 f1154, f981, f990; +sub.f32 f1048, f981, f990; +mul.f32 f1049, f1048, 0fBF5DB3D7; +mul.f32 f1050, f1154, 0f3F000000; +sub.f32 f1051, f972, f1050; +sub.f32 f1052, f954, f963; +mul.f32 f1053, f1052, 0fBF5DB3D7; +add.f32 f1054, f955, f964; +mul.f32 f1056, f1054, 0f3F000000; +sub.f32 f1057, f946, f1056; +add.f32 f1153, f982, f991; +sub.f32 f1058, f982, f991; +mul.f32 f1059, f1058, 0fBF5DB3D7; +mul.f32 f1060, f1153, 0f3F000000; +sub.f32 f1061, f973, f1060; +sub.f32 f1062, f955, f964; +mul.f32 f1063, f1062, 0fBF5DB3D7; +add.f32 f1064, f956, f965; +mul.f32 f1066, f1064, 0f3F000000; +sub.f32 f1067, f947, f1066; +add.f32 f1152, f983, f992; +sub.f32 f1068, f983, f992; +mul.f32 f1069, f1068, 0fBF5DB3D7; +mul.f32 f1070, f1152, 0f3F000000; +sub.f32 f1071, f974, f1070; +sub.f32 f1072, f956, f965; +mul.f32 f1073, f1072, 0fBF5DB3D7; +add.f32 f1074, f957, f966; +mul.f32 f1076, f1074, 0f3F000000; +sub.f32 f1077, f948, f1076; +add.f32 f1151, f984, f993; +sub.f32 f1078, f984, f993; +mul.f32 f1079, f1078, 0fBF5DB3D7; +mul.f32 f1080, f1151, 0f3F000000; +sub.f32 f1081, f975, f1080; +sub.f32 f1082, f957, f966; +mul.f32 f1309, f1153, 0f3F000000; +sub.f32 f1308, f973, f1309; +mul.f32 f1083, f1082, 0fBF5DB3D7; +add.f32 %0, f940, f994; +mul.f32 f1311, f1034, 0f3F000000; +sub.f32 f1310, f944, f1311; +add.f32 %1, f967, f1159; +mul.f32 f1313, f1154, 0f3F000000; +sub.f32 f1312, f972, f1313; +mul.f32 f1315, f1034, 0f3F000000; +sub.f32 f1314, f944, f1315; +add.f32 %2, f941, f1004; +add.f32 %3, f968, f1158; +add.f32 %4, f942, f1014; +add.f32 %5, f969, f1157; +add.f32 %6, f943, f1024; +add.f32 %7, f970, f1156; +add.f32 %8, f944, f1034; +add.f32 
%9, f971, f1155; +add.f32 %10, f945, f1044; +add.f32 %11, f972, f1154; +add.f32 %12, f946, f1054; +add.f32 %13, f973, f1153; +add.f32 %14, f947, f1064; +add.f32 %15, f974, f1152; +add.f32 %16, f948, f1074; +add.f32 %17, f975, f1151; +sub.f32 %19, f1001, f1003; +add.f32 %18, f999, f997; +sub.f32 %21, f1011, f1013; +add.f32 %20, f1009, f1007; +add.f32 %22, f1019, f1017; +sub.f32 %23, f1021, f1023; +add.f32 %24, f1029, f1027; +sub.f32 %25, f1031, f1033; +add.f32 %26, f1039, f1314; +sub.f32 %27, f1041, f1043; +sub.f32 %29, f1312, f1053; +add.f32 %28, f1049, f1047; +sub.f32 %31, f1308, f1063; +add.f32 %30, f1059, f1057; +sub.f32 %33, f1071, f1073; +add.f32 %32, f1069, f1067; +add.f32 %34, f1079, f1077; +sub.f32 %35, f1081, f1083; +sub.f32 %36, f997, f999; +add.f32 %37, f1003, f1001; +sub.f32 %38, f1007, f1009; +add.f32 %39, f1013, f1011; +sub.f32 %40, f1017, f1019; +add.f32 %41, f1023, f1021; +sub.f32 %42, f1027, f1029; +add.f32 %43, f1033, f1031; +sub.f32 %44, f1314, f1039; +add.f32 %45, f1043, f1041; +sub.f32 %46, f1047, f1049; +add.f32 %47, f1053, f1312; +sub.f32 %48, f1057, f1059; +add.f32 %49, f1063, f1308; +sub.f32 %50, f1067, f1069; +add.f32 %51, f1073, f1071; +sub.f32 %52, f1077, f1079; +add.f32 %53, f1083, f1081; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y), "=f"(rmem[9].x), "=f"(rmem[9].y), "=f"(rmem[10].x), "=f"(rmem[10].y), "=f"(rmem[11].x), "=f"(rmem[11].y), "=f"(rmem[12].x), "=f"(rmem[12].y), "=f"(rmem[13].x), "=f"(rmem[13].y), "=f"(rmem[14].x), "=f"(rmem[14].y), "=f"(rmem[15].x), "=f"(rmem[15].y), "=f"(rmem[16].x), "=f"(rmem[16].y), "=f"(rmem[17].x), "=f"(rmem[17].y), "=f"(rmem[18].x), "=f"(rmem[18].y), "=f"(rmem[19].x), "=f"(rmem[19].y), "=f"(rmem[20].x), "=f"(rmem[20].y), "=f"(rmem[21].x), 
"=f"(rmem[21].y), "=f"(rmem[22].x), "=f"(rmem[22].y), "=f"(rmem[23].x), "=f"(rmem[23].y), "=f"(rmem[24].x), "=f"(rmem[24].y), "=f"(rmem[25].x), "=f"(rmem[25].y), "=f"(rmem[26].x), "=f"(rmem[26].y): "r"(smem), "l"(lut_sp_27_81), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y), "f"(rmem[9].x), "f"(rmem[9].y), "f"(rmem[10].x), "f"(rmem[10].y), "f"(rmem[11].x), "f"(rmem[11].y), "f"(rmem[12].x), "f"(rmem[12].y), "f"(rmem[13].x), "f"(rmem[13].y), "f"(rmem[14].x), "f"(rmem[14].y), "f"(rmem[15].x), "f"(rmem[15].y), "f"(rmem[16].x), "f"(rmem[16].y), "f"(rmem[17].x), "f"(rmem[17].y), "f"(rmem[18].x), "f"(rmem[18].y), "f"(rmem[19].x), "f"(rmem[19].y), "f"(rmem[20].x), "f"(rmem[20].y), "f"(rmem[21].x), "f"(rmem[21].y), "f"(rmem[22].x), "f"(rmem[22].y), "f"(rmem[23].x), "f"(rmem[23].y), "f"(rmem[24].x), "f"(rmem[24].y), "f"(rmem[25].x), "f"(rmem[25].y), "f"(rmem[26].x), "f"(rmem[26].y), "f"(rmem[19].y), "f"(rmem[10].y), "f"(rmem[1].y), "f"(rmem[22].y), "f"(rmem[13].y), "f"(rmem[4].y), "f"(rmem[16].y), "f"(rmem[25].y), "f"(rmem[7].y), "f"(rmem[11].y), "f"(rmem[20].y), "f"(rmem[2].y), "f"(rmem[23].y), "f"(rmem[14].y), "f"(rmem[5].y), "f"(rmem[17].y), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<332, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<170>; +.reg .b32 r<28>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 648, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %12, %15; +add.f32 f14, %14, %16; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %10, f15; +sub.f32 f17, %14, %16; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %11, f21; +sub.f32 f23, %12, 
%15; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 648, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r13, r11, 24, r12; +add.f32 f42, %11, f14; +add.f32 f43, %10, f13; +st.shared.v2.f32 [r13], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r13+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r13+16], {f47, f46}; +barrier.sync 0; +shl.b32 r14, r11, 4; +sub.s32 r15, r13, r14; +ld.shared.v2.f32 {f48, f49}, [r15]; +ld.shared.v2.f32 {f52, f53}, [r15+216]; +ld.shared.v2.f32 {f56, f57}, [r15+432]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +add.f32 f66, f65, f63; +sub.f32 f67, f63, f65; +mul.f32 f68, f61, 0f3F000000; +sub.f32 f69, f49, f68; +sub.f32 f70, f52, f56; +mul.f32 f71, f70, 0fBF5DB3D7; +sub.f32 f72, f69, f71; +add.f32 f73, f71, f69; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f74, f75}, [rd11]; +mul.f32 f78, f72, f75; +mul.f32 f79, f66, f75; +mul.f32 f80, f74, f72; +mul.f32 f81, f74, f74; +mul.f32 f82, f75, f75; +sub.f32 f83, f81, 
f82; +mul.f32 f84, f75, f74; +fma.rn.f32 f85, f75, f74, f84; +mul.f32 f86, f73, f85; +mul.f32 f87, f67, f85; +mul.f32 f88, f83, f73; +barrier.sync 0; +mad.lo.s32 r21, r16, 72, r20; +add.f32 f89, f49, f61; +add.f32 f90, f48, f60; +st.shared.v2.f32 [r21], {f90, f89}; +fma.rn.f32 f91, f74, f66, f78; +sub.f32 f92, f80, f79; +st.shared.v2.f32 [r21+24], {f91, f92}; +fma.rn.f32 f93, f83, f67, f86; +sub.f32 f94, f88, f87; +st.shared.v2.f32 [r21+48], {f93, f94}; +barrier.sync 0; +ld.shared.v2.f32 {f95, f96}, [r15]; +ld.shared.v2.f32 {f99, f100}, [r15+216]; +ld.shared.v2.f32 {f103, f104}, [r15+432]; +add.f32 f107, f99, f103; +add.f32 f108, f100, f104; +mul.f32 f109, f107, 0f3F000000; +sub.f32 f110, f95, f109; +sub.f32 f111, f100, f104; +mul.f32 f112, f111, 0fBF5DB3D7; +add.f32 f113, f112, f110; +sub.f32 f114, f110, f112; +mul.f32 f115, f108, 0f3F000000; +sub.f32 f116, f96, f115; +sub.f32 f117, f99, f103; +mul.f32 f118, f117, 0fBF5DB3D7; +sub.f32 f119, f116, f118; +add.f32 f120, f118, f116; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 3; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f121, f122}, [rd16]; +mul.f32 f125, f119, f122; +mul.f32 f126, f113, f122; +mul.f32 f127, f121, f119; +mul.f32 f128, f121, f121; +mul.f32 f129, f122, f122; +sub.f32 f130, f128, f129; +mul.f32 f131, f122, f121; +fma.rn.f32 f132, f122, f121, f131; +mul.f32 f133, f120, f132; +mul.f32 f134, f114, f132; +mul.f32 f135, f130, f120; +barrier.sync 0; +mad.lo.s32 r27, r22, 216, r26; +add.f32 f136, f96, f108; +add.f32 f137, f95, f107; +st.shared.v2.f32 [r27], {f137, f136}; +fma.rn.f32 f138, f121, f113, f125; +sub.f32 f139, f127, f126; +st.shared.v2.f32 [r27+72], {f138, f139}; +fma.rn.f32 f140, f130, f114, f133; +sub.f32 f141, f135, f134; +st.shared.v2.f32 [r27+144], {f140, f141}; +barrier.sync 0; +ld.shared.v2.f32 {f142, f143}, 
[r15]; +ld.shared.v2.f32 {f146, f147}, [r15+216]; +ld.shared.v2.f32 {f150, f151}, [r15+432]; +add.f32 f154, f146, f150; +add.f32 f155, f147, f151; +mul.f32 f156, f154, 0f3F000000; +sub.f32 f157, f142, f156; +sub.f32 f158, f147, f151; +mul.f32 f159, f158, 0fBF5DB3D7; +mul.f32 f160, f155, 0f3F000000; +sub.f32 f161, f143, f160; +sub.f32 f162, f146, f150; +mul.f32 f163, f162, 0fBF5DB3D7; +add.f32 %1, f143, f155; +add.f32 %0, f142, f154; +sub.f32 %3, f161, f163; +add.f32 %2, f159, f157; +add.f32 %5, f163, f161; +sub.f32 %4, f157, f159; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<333, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<152>; +.reg .b32 r<28>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 324, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %12, %15; +add.f32 f14, %10, f13; +add.f32 f15, %14, %16; +add.f32 f16, %11, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %10, f17; +sub.f32 f19, %14, %16; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %11, f23; +sub.f32 f25, %12, %15; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 324, r3; +mul.wide.u32 rd4, r11, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; 
+sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r13, r11, 12, r12; +st.shared.f32 [r13], f14; +st.shared.f32 [r13+4], f34; +st.shared.f32 [r13+8], f44; +barrier.sync 0; +shl.b32 r14, r11, 3; +sub.s32 r15, r13, r14; +ld.shared.f32 f48, [r15]; +ld.shared.f32 f49, [r15+108]; +ld.shared.f32 f50, [r15+216]; +barrier.sync 0; +st.shared.f32 [r13], f16; +st.shared.f32 [r13+4], f37; +st.shared.f32 [r13+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r15]; +ld.shared.f32 f52, [r15+108]; +ld.shared.f32 f53, [r15+216]; +add.f32 f54, f49, f50; +add.f32 f55, f48, f54; +add.f32 f56, f52, f53; +add.f32 f57, f51, f56; +mul.f32 f58, f54, 0f3F000000; +sub.f32 f59, f48, f58; +sub.f32 f60, f52, f53; +mul.f32 f61, f60, 0fBF5DB3D7; +add.f32 f62, f61, f59; +sub.f32 f63, f59, f61; +mul.f32 f64, f56, 0f3F000000; +sub.f32 f65, f51, f64; +sub.f32 f66, f49, f50; +mul.f32 f67, f66, 0fBF5DB3D7; +sub.f32 f68, f65, f67; +add.f32 f69, f67, f65; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 2; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 8; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f32 {f70, f71}, [rd11]; +mul.f32 f74, f68, f71; +fma.rn.f32 f75, f70, f62, f74; +mul.f32 f76, f62, f71; +mul.f32 f77, f70, f68; +sub.f32 f78, f77, f76; +mul.f32 f79, f70, f70; +mul.f32 f80, f71, f71; +sub.f32 f81, f79, f80; +mul.f32 f82, f71, f70; +fma.rn.f32 f83, f71, f70, f82; +mul.f32 f84, f69, f83; +fma.rn.f32 f85, f81, f63, f84; +mul.f32 f86, f63, f83; +mul.f32 f87, f81, f69; +sub.f32 f88, f87, f86; +barrier.sync 0; +mad.lo.s32 r21, r16, 36, r20; +st.shared.f32 [r21], f55; +st.shared.f32 [r21+12], f75; +st.shared.f32 [r21+24], f85; 
+barrier.sync 0; +ld.shared.f32 f89, [r15]; +ld.shared.f32 f90, [r15+108]; +ld.shared.f32 f91, [r15+216]; +barrier.sync 0; +st.shared.f32 [r21], f57; +st.shared.f32 [r21+12], f78; +st.shared.f32 [r21+24], f88; +barrier.sync 0; +ld.shared.f32 f92, [r15]; +ld.shared.f32 f93, [r15+108]; +ld.shared.f32 f94, [r15+216]; +add.f32 f95, f90, f91; +add.f32 f96, f89, f95; +add.f32 f97, f93, f94; +add.f32 f98, f92, f97; +mul.f32 f99, f95, 0f3F000000; +sub.f32 f100, f89, f99; +sub.f32 f101, f93, f94; +mul.f32 f102, f101, 0fBF5DB3D7; +add.f32 f103, f102, f100; +sub.f32 f104, f100, f102; +mul.f32 f105, f97, 0f3F000000; +sub.f32 f106, f92, f105; +sub.f32 f107, f90, f91; +mul.f32 f108, f107, 0fBF5DB3D7; +sub.f32 f109, f106, f108; +add.f32 f110, f108, f106; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 2; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 8; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f32 {f111, f112}, [rd16]; +mul.f32 f115, f109, f112; +fma.rn.f32 f116, f111, f103, f115; +mul.f32 f117, f103, f112; +mul.f32 f118, f111, f109; +sub.f32 f119, f118, f117; +mul.f32 f120, f111, f111; +mul.f32 f121, f112, f112; +sub.f32 f122, f120, f121; +mul.f32 f123, f112, f111; +fma.rn.f32 f124, f112, f111, f123; +mul.f32 f125, f110, f124; +fma.rn.f32 f126, f122, f104, f125; +mul.f32 f127, f104, f124; +mul.f32 f128, f122, f110; +sub.f32 f129, f128, f127; +barrier.sync 0; +mad.lo.s32 r27, r22, 108, r26; +st.shared.f32 [r27], f96; +st.shared.f32 [r27+36], f116; +st.shared.f32 [r27+72], f126; +barrier.sync 0; +ld.shared.f32 f130, [r15]; +ld.shared.f32 f131, [r15+108]; +ld.shared.f32 f132, [r15+216]; +barrier.sync 0; +st.shared.f32 [r27], f98; +st.shared.f32 [r27+36], f119; +st.shared.f32 [r27+72], f129; +barrier.sync 0; +ld.shared.f32 f133, [r15]; +ld.shared.f32 f134, [r15+108]; +ld.shared.f32 f135, [r15+216]; +add.f32 f136, f131, f132; +add.f32 f137, f134, f135; 
+mul.f32 f138, f136, 0f3F000000; +sub.f32 f139, f130, f138; +sub.f32 f140, f134, f135; +mul.f32 f141, f140, 0fBF5DB3D7; +mul.f32 f142, f137, 0f3F000000; +sub.f32 f143, f133, f142; +sub.f32 f144, f131, f132; +mul.f32 f145, f144, 0fBF5DB3D7; +add.f32 %0, f130, f136; +add.f32 %1, f133, f137; +add.f32 %2, f141, f139; +sub.f32 %3, f143, f145; +sub.f32 %4, f139, f141; +add.f32 %5, f145, f143; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_81), "l"(lut_sp_3_27), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..2237aeb4deed7 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_fwd.hpp.inc @@ -0,0 +1,3280 @@ +#ifndef CUFFTDX_FFT_81_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_81_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<509, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<383>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1296, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; 
+add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 
0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0d3FEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1296, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd151, fd120; +mul.f64 fd156, fd152, fd122; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; +mul.f64 fd163, fd160, fd136; +mul.f64 fd164, fd162, fd138; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 
fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd168, fd111; +mul.f64 fd172, fd170, fd117; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd176, fd127; +mul.f64 fd180, fd178, fd133; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+144]; +mul.f64 fd186, fd182, fd143; +mul.f64 fd187, fd183, fd149; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd191, fd112; +mul.f64 fd195, fd193, fd118; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd199, fd128; +mul.f64 fd203, fd201, fd134; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd207, fd144; +mul.f64 fd211, fd209, fd150; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd152, fd120, fd157; +sub.f64 fd216, fd155, fd156; +st.shared.v2.f64 [r9+16], {fd216, fd215}; +fma.rn.f64 fd217, fd162, fd136, fd165; +sub.f64 fd218, fd163, fd164; +st.shared.v2.f64 [r9+32], {fd218, fd217}; +sub.f64 fd219, fd171, fd172; +fma.rn.f64 fd220, fd170, fd111, fd173; +st.shared.v2.f64 [r9+48], {fd219, fd220}; +fma.rn.f64 fd221, fd178, fd127, fd181; +sub.f64 fd222, fd179, fd180; +st.shared.v2.f64 [r9+64], {fd222, fd221}; +fma.rn.f64 fd223, fd183, fd143, fd188; +sub.f64 fd224, fd186, fd187; +st.shared.v2.f64 
[r9+80], {fd224, fd223}; +fma.rn.f64 fd225, fd193, fd112, fd196; +sub.f64 fd226, fd194, fd195; +st.shared.v2.f64 [r9+96], {fd226, fd225}; +fma.rn.f64 fd227, fd201, fd128, fd204; +sub.f64 fd228, fd202, fd203; +st.shared.v2.f64 [r9+112], {fd228, fd227}; +fma.rn.f64 fd229, fd209, fd144, fd212; +sub.f64 fd230, fd210, fd211; +st.shared.v2.f64 [r9+128], {fd230, fd229}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+144]; +ld.shared.v2.f64 {fd239, fd240}, [r11+288]; +ld.shared.v2.f64 {fd243, fd244}, [r11+432]; +ld.shared.v2.f64 {fd247, fd248}, [r11+576]; +ld.shared.v2.f64 {fd251, fd252}, [r11+720]; +ld.shared.v2.f64 {fd255, fd256}, [r11+864]; +ld.shared.v2.f64 {fd259, fd260}, [r11+1008]; +ld.shared.v2.f64 {fd263, fd264}, [r11+1152]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0d3FEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0d3FEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0d3FEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0d3FEBB67AE8584CAA; +sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, 
fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0d3FEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0dBFE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0dBFE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0dBFEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0dBFEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0dBFEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0dBFEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0dBFD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0dBFD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0d3FEBB67AE8584CAA; +mul.f64 fd341, fd336, 0d3FE0000000000000; +sub.f64 fd342, fd270, fd341; +sub.f64 fd343, fd284, fd300; +mul.f64 fd344, fd343, 0d3FEBB67AE8584CAA; +add.f64 fd345, fd317, fd322; +add.f64 fd346, fd319, fd324; +mul.f64 fd347, fd345, 0d3FE0000000000000; +sub.f64 fd348, fd275, fd347; +sub.f64 fd349, fd319, fd324; +mul.f64 fd350, fd349, 0d3FEBB67AE8584CAA; +mul.f64 fd351, fd346, 0d3FE0000000000000; +sub.f64 fd352, fd281, fd351; +sub.f64 fd353, fd317, fd322; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +add.f64 fd355, 
fd327, fd332; +add.f64 fd356, fd329, fd334; +mul.f64 fd357, fd355, 0d3FE0000000000000; +sub.f64 fd358, fd276, fd357; +sub.f64 fd359, fd329, fd334; +mul.f64 fd360, fd359, 0d3FEBB67AE8584CAA; +mul.f64 fd361, fd356, 0d3FE0000000000000; +sub.f64 fd362, fd282, fd361; +sub.f64 fd363, fd327, fd332; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 %1, fd270, fd336; +add.f64 %0, fd268, fd335; +add.f64 %3, fd281, fd346; +add.f64 %2, fd275, fd345; +add.f64 %5, fd282, fd356; +add.f64 %4, fd276, fd355; +sub.f64 %7, fd342, fd344; +add.f64 %6, fd340, fd338; +sub.f64 %9, fd352, fd354; +add.f64 %8, fd350, fd348; +sub.f64 %11, fd362, fd364; +add.f64 %10, fd360, fd358; +add.f64 %13, fd344, fd342; +sub.f64 %12, fd338, fd340; +add.f64 %15, fd354, fd352; +sub.f64 %14, fd348, fd350; +add.f64 %17, fd364, fd362; +sub.f64 %16, fd358, fd360; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<510, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<365>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 648, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, 
fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; 
+mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0d3FEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0d3FEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0d3FEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0d3FEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0d3FEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd153, fd122; +mul.f64 fd158, fd154, fd124; +sub.f64 fd159, fd157, fd158; +mul.f64 fd160, 
fd153, fd124; +fma.rn.f64 fd161, fd154, fd122, fd160; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd164, fd138; +mul.f64 fd168, fd166, fd140; +sub.f64 fd169, fd167, fd168; +mul.f64 fd170, fd164, fd140; +fma.rn.f64 fd171, fd166, fd138, fd170; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd174, fd113; +mul.f64 fd178, fd176, fd119; +sub.f64 fd179, fd177, fd178; +mul.f64 fd180, fd174, fd119; +fma.rn.f64 fd181, fd176, fd113, fd180; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd184, fd129; +mul.f64 fd188, fd186, fd135; +sub.f64 fd189, fd187, fd188; +mul.f64 fd190, fd184, fd135; +fma.rn.f64 fd191, fd186, fd129, fd190; +ld.global.v2.f64 {fd192, fd193}, [rd6+144]; +mul.f64 fd196, fd192, fd145; +mul.f64 fd197, fd193, fd151; +sub.f64 fd198, fd196, fd197; +mul.f64 fd199, fd192, fd151; +fma.rn.f64 fd200, fd193, fd145, fd199; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd203, fd114; +mul.f64 fd207, fd205, fd120; +sub.f64 fd208, fd206, fd207; +mul.f64 fd209, fd203, fd120; +fma.rn.f64 fd210, fd205, fd114, fd209; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd213, fd130; +mul.f64 fd217, fd215, fd136; +sub.f64 fd218, fd216, fd217; +mul.f64 fd219, fd213, fd136; +fma.rn.f64 fd220, fd215, fd130, fd219; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, 
fd154, fd213, fd224; +mul.f64 fd226, fd223, fd146; +mul.f64 fd227, fd225, fd152; +sub.f64 fd228, fd226, fd227; +mul.f64 fd229, fd223, fd152; +fma.rn.f64 fd230, fd225, fd146, fd229; +mad.lo.s32 r8, r5, 648, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd159; +st.shared.f64 [r9+16], fd169; +st.shared.f64 [r9+24], fd179; +st.shared.f64 [r9+32], fd189; +st.shared.f64 [r9+40], fd198; +st.shared.f64 [r9+48], fd208; +st.shared.f64 [r9+56], fd218; +st.shared.f64 [r9+64], fd228; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+72]; +ld.shared.f64 fd233, [r11+144]; +ld.shared.f64 fd234, [r11+216]; +ld.shared.f64 fd235, [r11+288]; +ld.shared.f64 fd236, [r11+360]; +ld.shared.f64 fd237, [r11+432]; +ld.shared.f64 fd238, [r11+504]; +ld.shared.f64 fd239, [r11+576]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+72]; +ld.shared.f64 fd242, [r11+144]; +ld.shared.f64 fd243, [r11+216]; +ld.shared.f64 fd244, [r11+288]; +ld.shared.f64 fd245, [r11+360]; +ld.shared.f64 fd246, [r11+432]; +ld.shared.f64 fd247, [r11+504]; +ld.shared.f64 fd248, [r11+576]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0d3FEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0d3FEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 
fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0d3FEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0d3FEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0d3FEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0d3FEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0dBFE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0dBFE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0dBFEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0dBFEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0dBFEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0dBFEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0dBFD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; +mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0dBFD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd268, fd284; 
+mul.f64 fd319, fd317, 0d3FE0000000000000; +sub.f64 fd320, fd250, fd319; +sub.f64 fd321, fd268, fd284; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +mul.f64 fd323, fd318, 0d3FE0000000000000; +sub.f64 fd324, fd252, fd323; +sub.f64 fd325, fd266, fd282; +mul.f64 fd326, fd325, 0d3FEBB67AE8584CAA; +add.f64 fd327, fd299, fd304; +add.f64 fd328, fd301, fd306; +mul.f64 fd329, fd327, 0d3FE0000000000000; +sub.f64 fd330, fd257, fd329; +sub.f64 fd331, fd301, fd306; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +mul.f64 fd333, fd328, 0d3FE0000000000000; +sub.f64 fd334, fd263, fd333; +sub.f64 fd335, fd299, fd304; +mul.f64 fd336, fd335, 0d3FEBB67AE8584CAA; +add.f64 fd337, fd309, fd314; +add.f64 fd338, fd311, fd316; +mul.f64 fd339, fd337, 0d3FE0000000000000; +sub.f64 fd340, fd258, fd339; +sub.f64 fd341, fd311, fd316; +mul.f64 fd342, fd341, 0d3FEBB67AE8584CAA; +mul.f64 fd343, fd338, 0d3FE0000000000000; +sub.f64 fd344, fd264, fd343; +sub.f64 fd345, fd309, fd314; +mul.f64 fd346, fd345, 0d3FEBB67AE8584CAA; +add.f64 %0, fd250, fd317; +add.f64 %1, fd252, fd318; +add.f64 %3, fd263, fd328; +add.f64 %2, fd257, fd327; +add.f64 %5, fd264, fd338; +add.f64 %4, fd258, fd337; +add.f64 %6, fd322, fd320; +sub.f64 %7, fd324, fd326; +sub.f64 %9, fd334, fd336; +add.f64 %8, fd332, fd330; +sub.f64 %11, fd344, fd346; +add.f64 %10, fd342, fd340; +sub.f64 %12, fd320, fd322; +add.f64 %13, fd326, fd324; +add.f64 %15, fd336, fd334; +sub.f64 %14, fd330, fd332; +add.f64 %17, fd346, fd344; +sub.f64 %16, fd340, fd342; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), 
"d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<512, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1383>; +.reg .b64 rd<11>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1296, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1382, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1381, %57, fd1382; +mul.f64 fd119, fd1382, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1380, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1379, %63, fd1380; +mul.f64 fd135, fd1380, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1378, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1377, %69, fd1378; +mul.f64 fd151, fd1378, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 
fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd1376, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1376, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1374, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1375, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1374, fd1375; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1372, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1373, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1372, fd1373; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1370, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1371, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1370, fd1371; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1369, fd1379, fd1377; +sub.f64 fd183, fd1379, fd1377; +mul.f64 fd184, fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1368, fd1381, fd1369; +mul.f64 fd187, fd1369, 0d3FE0000000000000; +sub.f64 fd188, fd1381, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1367, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1366, fd123, fd1367; +mul.f64 fd203, fd1367, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 
fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1365, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1364, fd124, fd1365; +mul.f64 fd219, fd1365, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1361, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1359, %112, fd1361; +mul.f64 fd235, fd1361, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1356, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1354, %115, fd1356; +mul.f64 fd251, fd1356, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1351, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1349, %118, fd1351; +mul.f64 fd267, fd1351, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, 
fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1348, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1348, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1347, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1347, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1345, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1346, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1345, fd1346; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1343, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1344, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1343, fd1344; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1342, fd1354, fd1349; +sub.f64 fd299, fd1354, fd1349; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1341, fd1359, fd1342; +mul.f64 fd303, fd1342, 0d3FE0000000000000; +sub.f64 fd304, fd1359, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1340, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1339, fd239, fd1340; +mul.f64 fd319, fd1340, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; 
+add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1338, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1337, fd240, fd1338; +mul.f64 fd335, fd1338, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1334, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1332, %121, fd1334; +mul.f64 fd351, fd1334, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1329, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1327, %124, fd1329; +mul.f64 fd367, fd1329, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1325, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1323, %126, fd1325; +mul.f64 fd383, fd1325, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; 
+sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1322, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1322, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1321, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1321, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1319, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1320, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1319, fd1320; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1317, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1318, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1317, fd1318; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1316, fd1327, fd1323; +sub.f64 fd415, fd1327, fd1323; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1315, fd1332, fd1316; +mul.f64 fd419, fd1316, 0d3FE0000000000000; +sub.f64 fd420, fd1332, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1314, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1313, fd355, fd1314; +mul.f64 fd435, fd1314, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 
0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1312, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1311, fd356, fd1312; +mul.f64 fd451, fd1312, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1339, 0dBFCD84D223638000; +mul.f64 fd1310, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1310, fd458; +mul.f64 fd460, fd1339, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1308, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1309, fd1313, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1308, fd1309; +mul.f64 fd465, fd1313, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1306, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1307, fd1337, 0dBFDCB920325BAFA6; +sub.f64 fd469, fd1306, fd1307; +mul.f64 fd470, fd1337, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1304, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1305, fd1311, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1304, fd1305; +mul.f64 fd475, fd1311, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1302, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1303, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1302, fd1303; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1301, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1301, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1300, fd317, 
0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1300, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1299, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1299, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1298, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1298, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1297, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1297, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1296, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1296, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1294, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1295, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1294, fd1295; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1292, fd318, 0dBFADC528B5343A86; +mul.f64 fd1293, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1292, fd1293; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1290, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1291, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1290, fd1291; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1289, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1289, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1288, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1288, fd533; +mul.f64 fd535, 
fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1287, fd1341, fd1315; +sub.f64 fd541, fd1341, fd1315; +mul.f64 fd542, fd541, 0d3FEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1287, 0d3FE0000000000000; +sub.f64 fd546, fd1368, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0d3FEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1286, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0d3FEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1285, fd1366, fd1286; +mul.f64 fd561, fd1286, 0d3FE0000000000000; +sub.f64 fd562, fd1366, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0d3FEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1284, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0d3FEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1283, fd1364, fd1284; +mul.f64 fd577, fd1284, 0d3FE0000000000000; +sub.f64 fd578, fd1364, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0d3FEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1282, fd481, fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 0d3FEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1281, fd191, fd1282; +mul.f64 fd593, fd1282, 
0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 0d3FEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1280, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0d3FEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1279, fd207, fd1280; +mul.f64 fd609, fd1280, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0d3FEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1278, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0d3FEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1277, fd223, fd1278; +mul.f64 fd625, fd1278, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0d3FEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1276, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0d3FEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1275, fd192, fd1276; +mul.f64 fd641, fd1276, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 fd643, fd509, fd514; +mul.f64 fd644, fd643, 0d3FEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1274, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 
fd654, fd653, 0d3FEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 fd1273, fd208, fd1274; +mul.f64 fd657, fd1274, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 0d3FEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1272, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0d3FEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1271, fd224, fd1272; +mul.f64 fd673, fd1272, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0d3FEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1296, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd684, fd680, fd1285; +mul.f64 fd685, fd679, fd1285; +mul.f64 fd1269, fd679, fd679; +mul.f64 fd1270, fd680, fd680; +sub.f64 fd688, fd1269, fd1270; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd692, fd690, fd1283; +mul.f64 fd693, fd688, fd1283; +mul.f64 fd695, fd680, fd690; +mul.f64 fd1268, fd679, fd688; +sub.f64 fd696, fd1268, fd695; +mul.f64 fd1267, fd688, fd568; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd700, fd698, fd1281; +mul.f64 fd701, fd696, fd1281; +mul.f64 fd1265, fd679, fd696; +mul.f64 fd1266, fd680, fd698; +sub.f64 fd704, fd1265, fd1266; +mul.f64 fd1264, fd696, fd584; +mul.f64 fd705, fd679, fd698; +fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd708, fd706, fd1279; +mul.f64 fd709, fd704, fd1279; +mul.f64 fd711, fd680, fd706; +mul.f64 
fd1263, fd679, fd704; +sub.f64 fd712, fd1263, fd711; +mul.f64 fd1262, fd704, fd600; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd716, fd714, fd1277; +mul.f64 fd717, fd712, fd1277; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1261, fd679, fd712; +sub.f64 fd720, fd1261, fd719; +mul.f64 fd1260, fd712, fd616; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd724, fd722, fd1275; +mul.f64 fd725, fd720, fd1275; +mul.f64 fd1258, fd679, fd720; +mul.f64 fd1259, fd680, fd722; +sub.f64 fd728, fd1258, fd1259; +mul.f64 fd1257, fd720, fd632; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd732, fd730, fd1273; +mul.f64 fd733, fd728, fd1273; +mul.f64 fd735, fd680, fd730; +mul.f64 fd1256, fd679, fd728; +sub.f64 fd736, fd1256, fd735; +mul.f64 fd1255, fd728, fd648; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd740, fd738, fd1271; +mul.f64 fd741, fd736, fd1271; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1254, fd679, fd736; +sub.f64 fd744, fd1254, fd743; +mul.f64 fd1253, fd736, fd664; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd748, fd746, fd549; +mul.f64 fd749, fd744, fd549; +mul.f64 fd1251, fd679, fd744; +mul.f64 fd1252, fd680, fd746; +sub.f64 fd752, fd1251, fd1252; +mul.f64 fd1250, fd744, fd543; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd756, fd754, fd565; +mul.f64 fd757, fd752, fd565; +mul.f64 fd759, fd680, fd754; +mul.f64 fd1249, fd679, fd752; +sub.f64 fd760, fd1249, fd759; +mul.f64 fd1248, fd752, fd559; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, fd752, fd761; +mul.f64 fd764, fd762, fd581; +mul.f64 fd765, fd760, fd581; +mul.f64 fd1246, fd679, fd760; +mul.f64 fd1247, fd680, fd762; +sub.f64 fd768, fd1246, fd1247; +mul.f64 fd1245, fd760, fd575; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd772, fd770, fd597; +mul.f64 fd773, fd768, fd597; 
+mul.f64 fd775, fd680, fd770; +mul.f64 fd1244, fd679, fd768; +sub.f64 fd776, fd1244, fd775; +mul.f64 fd1243, fd768, fd591; +mul.f64 fd777, fd679, fd770; +fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd776, fd607; +mul.f64 fd780, fd778, fd613; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+48]; +mul.f64 fd787, fd783, fd629; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1242, fd679, fd782; +sub.f64 fd791, fd1242, fd790; +mul.f64 fd1241, fd782, fd623; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd795, fd793, fd645; +mul.f64 fd796, fd791, fd645; +mul.f64 fd1239, fd679, fd791; +mul.f64 fd1240, fd680, fd793; +sub.f64 fd799, fd1239, fd1240; +mul.f64 fd1238, fd791, fd639; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd803, fd801, fd661; +mul.f64 fd804, fd799, fd661; +mul.f64 fd806, fd680, fd801; +mul.f64 fd1237, fd679, fd799; +sub.f64 fd807, fd1237, fd806; +mul.f64 fd1236, fd799, fd655; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd811, fd809, fd677; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1235, fd679, fd807; +sub.f64 fd815, fd1235, fd814; +mul.f64 fd1234, fd807, fd671; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd819, fd817, fd550; +mul.f64 fd820, fd815, fd550; +mul.f64 fd1232, fd679, fd815; +mul.f64 fd1233, fd680, fd817; +sub.f64 fd823, fd1232, fd1233; +mul.f64 fd1231, fd815, fd544; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd827, fd825, fd566; +mul.f64 fd828, fd823, fd566; +mul.f64 fd830, fd680, fd825; +mul.f64 fd1230, fd679, fd823; +sub.f64 fd831, fd1230, fd830; +mul.f64 fd1229, fd823, fd560; +mul.f64 fd832, fd679, fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; +mul.f64 fd835, fd833, fd582; +mul.f64 fd836, fd831, fd582; +mul.f64 fd1227, fd679, fd831; +mul.f64 fd1228, fd680, fd833; +sub.f64 fd839, fd1227, 
fd1228; +mul.f64 fd1226, fd831, fd576; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd843, fd841, fd598; +mul.f64 fd844, fd839, fd598; +mul.f64 fd1224, fd679, fd839; +mul.f64 fd1225, fd680, fd841; +sub.f64 fd847, fd1224, fd1225; +mul.f64 fd1223, fd839, fd592; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd851, fd849, fd614; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1222, fd679, fd847; +sub.f64 fd855, fd1222, fd854; +mul.f64 fd1221, fd847, fd608; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd859, fd857, fd630; +mul.f64 fd860, fd855, fd630; +mul.f64 fd1219, fd679, fd855; +mul.f64 fd1220, fd680, fd857; +sub.f64 fd863, fd1219, fd1220; +mul.f64 fd1218, fd855, fd624; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd867, fd865, fd646; +mul.f64 fd868, fd863, fd646; +mul.f64 fd870, fd680, fd865; +mul.f64 fd1217, fd679, fd863; +sub.f64 fd871, fd1217, fd870; +mul.f64 fd1216, fd863, fd640; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd875, fd873, fd662; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1215, fd679, fd871; +sub.f64 fd879, fd1215, fd878; +mul.f64 fd1214, fd679, fd552; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1213, fd871, fd656; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd879, fd672; +mul.f64 fd883, fd881, fd678; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r9, r7, 432, r8; +add.f64 fd885, fd1368, fd1287; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r9], {fd886, fd885}; +fma.rn.f64 fd887, fd680, fd552, fd685; +sub.f64 fd888, fd1214, fd684; +st.shared.v2.f64 [r9+16], {fd888, fd887}; +fma.rn.f64 fd889, fd690, fd568, fd693; +sub.f64 fd890, fd1267, fd692; +st.shared.v2.f64 [r9+32], {fd890, fd889}; +fma.rn.f64 fd891, fd698, fd584, fd701; +sub.f64 fd892, fd1264, fd700; +st.shared.v2.f64 [r9+48], {fd892, fd891}; 
+fma.rn.f64 fd893, fd706, fd600, fd709; +sub.f64 fd894, fd1262, fd708; +st.shared.v2.f64 [r9+64], {fd894, fd893}; +fma.rn.f64 fd895, fd714, fd616, fd717; +sub.f64 fd896, fd1260, fd716; +st.shared.v2.f64 [r9+80], {fd896, fd895}; +fma.rn.f64 fd897, fd722, fd632, fd725; +sub.f64 fd898, fd1257, fd724; +st.shared.v2.f64 [r9+96], {fd898, fd897}; +sub.f64 fd899, fd1255, fd732; +fma.rn.f64 fd900, fd730, fd648, fd733; +st.shared.v2.f64 [r9+112], {fd899, fd900}; +fma.rn.f64 fd901, fd738, fd664, fd741; +sub.f64 fd902, fd1253, fd740; +st.shared.v2.f64 [r9+128], {fd902, fd901}; +fma.rn.f64 fd903, fd746, fd543, fd749; +sub.f64 fd904, fd1250, fd748; +st.shared.v2.f64 [r9+144], {fd904, fd903}; +fma.rn.f64 fd905, fd754, fd559, fd757; +sub.f64 fd906, fd1248, fd756; +st.shared.v2.f64 [r9+160], {fd906, fd905}; +fma.rn.f64 fd907, fd762, fd575, fd765; +sub.f64 fd908, fd1245, fd764; +st.shared.v2.f64 [r9+176], {fd908, fd907}; +fma.rn.f64 fd909, fd770, fd591, fd773; +sub.f64 fd910, fd1243, fd772; +st.shared.v2.f64 [r9+192], {fd910, fd909}; +fma.rn.f64 fd911, fd778, fd607, fd781; +sub.f64 fd912, fd779, fd780; +st.shared.v2.f64 [r9+208], {fd912, fd911}; +fma.rn.f64 fd913, fd783, fd623, fd788; +sub.f64 fd914, fd1241, fd787; +st.shared.v2.f64 [r9+224], {fd914, fd913}; +fma.rn.f64 fd915, fd793, fd639, fd796; +sub.f64 fd916, fd1238, fd795; +st.shared.v2.f64 [r9+240], {fd916, fd915}; +fma.rn.f64 fd917, fd801, fd655, fd804; +sub.f64 fd918, fd1236, fd803; +st.shared.v2.f64 [r9+256], {fd918, fd917}; +fma.rn.f64 fd919, fd809, fd671, fd812; +sub.f64 fd920, fd1234, fd811; +st.shared.v2.f64 [r9+272], {fd920, fd919}; +fma.rn.f64 fd921, fd817, fd544, fd820; +sub.f64 fd922, fd1231, fd819; +st.shared.v2.f64 [r9+288], {fd922, fd921}; +fma.rn.f64 fd923, fd825, fd560, fd828; +sub.f64 fd924, fd1229, fd827; +st.shared.v2.f64 [r9+304], {fd924, fd923}; +sub.f64 fd925, fd1226, fd835; +fma.rn.f64 fd926, fd833, fd576, fd836; +st.shared.v2.f64 [r9+320], {fd925, fd926}; +fma.rn.f64 fd927, fd841, fd592, fd844; +sub.f64 
fd928, fd1223, fd843; +st.shared.v2.f64 [r9+336], {fd928, fd927}; +fma.rn.f64 fd929, fd849, fd608, fd852; +sub.f64 fd930, fd1221, fd851; +st.shared.v2.f64 [r9+352], {fd930, fd929}; +fma.rn.f64 fd931, fd857, fd624, fd860; +sub.f64 fd932, fd1218, fd859; +st.shared.v2.f64 [r9+368], {fd932, fd931}; +fma.rn.f64 fd933, fd865, fd640, fd868; +sub.f64 fd934, fd1216, fd867; +st.shared.v2.f64 [r9+384], {fd934, fd933}; +fma.rn.f64 fd935, fd873, fd656, fd876; +sub.f64 fd936, fd1213, fd875; +st.shared.v2.f64 [r9+400], {fd936, fd935}; +fma.rn.f64 fd937, fd881, fd672, fd884; +sub.f64 fd938, fd882, fd883; +st.shared.v2.f64 [r9+416], {fd938, fd937}; +barrier.sync 0; +mad.lo.s32 r10, r7, -416, r9; +ld.shared.v2.f64 {fd939, fd940}, [r10]; +ld.shared.v2.f64 {fd943, fd944}, [r10+48]; +ld.shared.v2.f64 {fd947, fd948}, [r10+96]; +ld.shared.v2.f64 {fd951, fd952}, [r10+144]; +ld.shared.v2.f64 {fd955, fd956}, [r10+192]; +ld.shared.v2.f64 {fd959, fd960}, [r10+240]; +ld.shared.v2.f64 {fd963, fd964}, [r10+288]; +ld.shared.v2.f64 {fd967, fd968}, [r10+336]; +ld.shared.v2.f64 {fd971, fd972}, [r10+384]; +ld.shared.v2.f64 {fd975, fd976}, [r10+432]; +ld.shared.v2.f64 {fd979, fd980}, [r10+480]; +ld.shared.v2.f64 {fd983, fd984}, [r10+528]; +ld.shared.v2.f64 {fd987, fd988}, [r10+576]; +ld.shared.v2.f64 {fd991, fd992}, [r10+624]; +ld.shared.v2.f64 {fd995, fd996}, [r10+672]; +ld.shared.v2.f64 {fd999, fd1000}, [r10+720]; +ld.shared.v2.f64 {fd1003, fd1004}, [r10+768]; +ld.shared.v2.f64 {fd1007, fd1008}, [r10+816]; +ld.shared.v2.f64 {fd1011, fd1012}, [r10+864]; +ld.shared.v2.f64 {fd1015, fd1016}, [r10+912]; +ld.shared.v2.f64 {fd1019, fd1020}, [r10+960]; +ld.shared.v2.f64 {fd1023, fd1024}, [r10+1008]; +ld.shared.v2.f64 {fd1027, fd1028}, [r10+1056]; +ld.shared.v2.f64 {fd1031, fd1032}, [r10+1104]; +ld.shared.v2.f64 {fd1035, fd1036}, [r10+1152]; +ld.shared.v2.f64 {fd1039, fd1040}, [r10+1200]; +ld.shared.v2.f64 {fd1043, fd1044}, [r10+1248]; +add.f64 fd1047, fd975, fd1011; +mul.f64 fd1049, fd1047, 
0d3FE0000000000000; +sub.f64 fd1050, fd939, fd1049; +add.f64 fd1212, fd976, fd1012; +sub.f64 fd1051, fd976, fd1012; +mul.f64 fd1052, fd1051, 0d3FEBB67AE8584CAA; +mul.f64 fd1053, fd1212, 0d3FE0000000000000; +sub.f64 fd1054, fd940, fd1053; +sub.f64 fd1055, fd975, fd1011; +mul.f64 fd1056, fd1055, 0d3FEBB67AE8584CAA; +add.f64 fd1057, fd979, fd1015; +mul.f64 fd1059, fd1057, 0d3FE0000000000000; +sub.f64 fd1060, fd943, fd1059; +add.f64 fd1211, fd980, fd1016; +sub.f64 fd1061, fd980, fd1016; +mul.f64 fd1062, fd1061, 0d3FEBB67AE8584CAA; +mul.f64 fd1063, fd1211, 0d3FE0000000000000; +sub.f64 fd1064, fd944, fd1063; +sub.f64 fd1065, fd979, fd1015; +mul.f64 fd1066, fd1065, 0d3FEBB67AE8584CAA; +add.f64 fd1067, fd983, fd1019; +mul.f64 fd1069, fd1067, 0d3FE0000000000000; +sub.f64 fd1070, fd947, fd1069; +add.f64 fd1210, fd984, fd1020; +sub.f64 fd1071, fd984, fd1020; +mul.f64 fd1072, fd1071, 0d3FEBB67AE8584CAA; +mul.f64 fd1073, fd1210, 0d3FE0000000000000; +sub.f64 fd1074, fd948, fd1073; +sub.f64 fd1075, fd983, fd1019; +mul.f64 fd1076, fd1075, 0d3FEBB67AE8584CAA; +add.f64 fd1077, fd987, fd1023; +mul.f64 fd1079, fd1077, 0d3FE0000000000000; +sub.f64 fd1080, fd951, fd1079; +add.f64 fd1209, fd988, fd1024; +sub.f64 fd1081, fd988, fd1024; +mul.f64 fd1082, fd1081, 0d3FEBB67AE8584CAA; +mul.f64 fd1083, fd1209, 0d3FE0000000000000; +sub.f64 fd1084, fd952, fd1083; +sub.f64 fd1085, fd987, fd1023; +mul.f64 fd1086, fd1085, 0d3FEBB67AE8584CAA; +add.f64 fd1087, fd991, fd1027; +mul.f64 fd1089, fd1087, 0d3FE0000000000000; +sub.f64 fd1090, fd955, fd1089; +add.f64 fd1208, fd992, fd1028; +sub.f64 fd1091, fd992, fd1028; +mul.f64 fd1092, fd1091, 0d3FEBB67AE8584CAA; +mul.f64 fd1093, fd1208, 0d3FE0000000000000; +sub.f64 fd1094, fd956, fd1093; +sub.f64 fd1095, fd991, fd1027; +mul.f64 fd1096, fd1095, 0d3FEBB67AE8584CAA; +add.f64 fd1097, fd995, fd1031; +mul.f64 fd1099, fd1097, 0d3FE0000000000000; +sub.f64 fd1100, fd959, fd1099; +add.f64 fd1207, fd996, fd1032; +sub.f64 fd1101, fd996, fd1032; +mul.f64 fd1102, 
fd1101, 0d3FEBB67AE8584CAA; +mul.f64 fd1103, fd1207, 0d3FE0000000000000; +sub.f64 fd1104, fd960, fd1103; +sub.f64 fd1105, fd995, fd1031; +mul.f64 fd1106, fd1105, 0d3FEBB67AE8584CAA; +add.f64 fd1107, fd999, fd1035; +mul.f64 fd1109, fd1107, 0d3FE0000000000000; +sub.f64 fd1110, fd963, fd1109; +add.f64 fd1206, fd1000, fd1036; +sub.f64 fd1111, fd1000, fd1036; +mul.f64 fd1112, fd1111, 0d3FEBB67AE8584CAA; +mul.f64 fd1113, fd1206, 0d3FE0000000000000; +sub.f64 fd1114, fd964, fd1113; +sub.f64 fd1115, fd999, fd1035; +mul.f64 fd1116, fd1115, 0d3FEBB67AE8584CAA; +add.f64 fd1117, fd1003, fd1039; +mul.f64 fd1119, fd1117, 0d3FE0000000000000; +sub.f64 fd1120, fd967, fd1119; +add.f64 fd1205, fd1004, fd1040; +sub.f64 fd1121, fd1004, fd1040; +mul.f64 fd1122, fd1121, 0d3FEBB67AE8584CAA; +mul.f64 fd1123, fd1205, 0d3FE0000000000000; +sub.f64 fd1124, fd968, fd1123; +sub.f64 fd1125, fd1003, fd1039; +mul.f64 fd1126, fd1125, 0d3FEBB67AE8584CAA; +add.f64 fd1127, fd1007, fd1043; +mul.f64 fd1129, fd1127, 0d3FE0000000000000; +sub.f64 fd1130, fd971, fd1129; +add.f64 fd1204, fd1008, fd1044; +sub.f64 fd1131, fd1008, fd1044; +mul.f64 fd1132, fd1131, 0d3FEBB67AE8584CAA; +mul.f64 fd1133, fd1204, 0d3FE0000000000000; +sub.f64 fd1134, fd972, fd1133; +sub.f64 fd1135, fd1007, fd1043; +mul.f64 fd1136, fd1135, 0d3FEBB67AE8584CAA; +add.f64 %1, fd940, fd1212; +add.f64 %0, fd939, fd1047; +add.f64 %3, fd944, fd1211; +add.f64 %2, fd943, fd1057; +add.f64 %5, fd948, fd1210; +add.f64 %4, fd947, fd1067; +add.f64 %7, fd952, fd1209; +add.f64 %6, fd951, fd1077; +add.f64 %9, fd956, fd1208; +add.f64 %8, fd955, fd1087; +add.f64 %11, fd960, fd1207; +add.f64 %10, fd959, fd1097; +add.f64 %13, fd964, fd1206; +add.f64 %12, fd963, fd1107; +add.f64 %15, fd968, fd1205; +add.f64 %14, fd967, fd1117; +add.f64 %17, fd972, fd1204; +add.f64 %16, fd971, fd1127; +sub.f64 %19, fd1054, fd1056; +add.f64 %18, fd1052, fd1050; +add.f64 %20, fd1062, fd1060; +sub.f64 %21, fd1064, fd1066; +add.f64 %22, fd1072, fd1070; +sub.f64 %23, fd1074, fd1076; 
+add.f64 %24, fd1082, fd1080; +sub.f64 %25, fd1084, fd1086; +sub.f64 %27, fd1094, fd1096; +add.f64 %26, fd1092, fd1090; +sub.f64 %29, fd1104, fd1106; +add.f64 %28, fd1102, fd1100; +sub.f64 %31, fd1114, fd1116; +add.f64 %30, fd1112, fd1110; +add.f64 %32, fd1122, fd1120; +sub.f64 %33, fd1124, fd1126; +add.f64 %34, fd1132, fd1130; +sub.f64 %35, fd1134, fd1136; +add.f64 %37, fd1056, fd1054; +sub.f64 %36, fd1050, fd1052; +add.f64 %39, fd1066, fd1064; +sub.f64 %38, fd1060, fd1062; +add.f64 %41, fd1076, fd1074; +sub.f64 %40, fd1070, fd1072; +add.f64 %43, fd1086, fd1084; +sub.f64 %42, fd1080, fd1082; +add.f64 %45, fd1096, fd1094; +sub.f64 %44, fd1090, fd1092; +add.f64 %47, fd1106, fd1104; +sub.f64 %46, fd1100, fd1102; +add.f64 %49, fd1116, fd1114; +sub.f64 %48, fd1110, fd1112; +add.f64 %51, fd1126, fd1124; +sub.f64 %50, fd1120, fd1122; +add.f64 %53, fd1136, fd1134; +sub.f64 %52, fd1130, fd1132; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), 
"d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<511, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1349>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 648, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1340, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1339, %57, fd1340; +mul.f64 fd119, fd1340, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0d3FEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 
fd130, %62, fd129; +add.f64 fd1338, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0d3FEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1337, %63, fd1338; +mul.f64 fd135, fd1338, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1336, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0d3FEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1335, %69, fd1336; +mul.f64 fd151, fd1336, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0d3FEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0dBFE491B7523C161D; +mul.f64 fd1334, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1334, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0dBFE491B7523C161D, fd160; +mul.f64 fd1332, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1333, fd155, 0dBFEF838B8C811C17; +sub.f64 fd164, fd1332, fd1333; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0dBFEF838B8C811C17, fd165; +mul.f64 fd1330, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1331, fd140, 0dBFEF838B8C811C17; +sub.f64 fd169, fd1330, fd1331; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0dBFEF838B8C811C17, fd170; +mul.f64 fd1328, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1329, fd156, 0dBFD5E3A8748A0BF5; +sub.f64 fd174, fd1328, fd1329; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0dBFD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1327, fd1337, fd1335; +sub.f64 fd183, fd1337, fd1335; +mul.f64 fd184, 
fd183, 0d3FEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1326, fd1339, fd1327; +mul.f64 fd187, fd1327, 0d3FE0000000000000; +sub.f64 fd188, fd1339, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0d3FEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1325, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0d3FEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1324, fd123, fd1325; +mul.f64 fd203, fd1325, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0d3FEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1323, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0d3FEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1322, fd124, fd1323; +mul.f64 fd219, fd1323, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0d3FEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1319, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0d3FEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1317, %112, fd1319; +mul.f64 fd235, fd1319, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0d3FEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 
0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1314, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0d3FEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1312, %115, fd1314; +mul.f64 fd251, fd1314, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0d3FEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1309, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0d3FEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1307, %118, fd1309; +mul.f64 fd267, fd1309, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0d3FEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0dBFE491B7523C161D; +mul.f64 fd1306, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1306, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0dBFE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0dBFEF838B8C811C17; +mul.f64 fd1305, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1305, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0dBFEF838B8C811C17, fd281; +mul.f64 fd1303, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1304, fd256, 0dBFEF838B8C811C17; +sub.f64 fd285, fd1303, fd1304; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0dBFEF838B8C811C17, fd286; +mul.f64 fd1301, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1302, fd272, 0dBFD5E3A8748A0BF5; +sub.f64 fd290, fd1301, fd1302; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0dBFD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1300, fd1312, fd1307; +sub.f64 
fd299, fd1312, fd1307; +mul.f64 fd300, fd299, 0d3FEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1299, fd1317, fd1300; +mul.f64 fd303, fd1300, 0d3FE0000000000000; +sub.f64 fd304, fd1317, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0d3FEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1298, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0d3FEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1297, fd239, fd1298; +mul.f64 fd319, fd1298, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0d3FEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1296, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0d3FEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1295, fd240, fd1296; +mul.f64 fd335, fd1296, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0d3FEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1292, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0d3FEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1290, %121, fd1292; +mul.f64 fd351, fd1292, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0d3FEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, 
%66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1287, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0d3FEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1285, %124, fd1287; +mul.f64 fd367, fd1287, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0d3FEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1283, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0d3FEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1281, %126, fd1283; +mul.f64 fd383, fd1283, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0d3FEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0dBFE491B7523C161D; +mul.f64 fd1280, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1280, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0dBFE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0dBFEF838B8C811C17; +mul.f64 fd1279, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1279, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0dBFEF838B8C811C17, fd397; +mul.f64 fd1277, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1278, fd372, 0dBFEF838B8C811C17; +sub.f64 fd401, fd1277, fd1278; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0dBFEF838B8C811C17, fd402; +mul.f64 fd1275, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1276, fd388, 0dBFD5E3A8748A0BF5; +sub.f64 fd406, fd1275, fd1276; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0dBFD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 
fd1274, fd1285, fd1281; +sub.f64 fd415, fd1285, fd1281; +mul.f64 fd416, fd415, 0d3FEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1273, fd1290, fd1274; +mul.f64 fd419, fd1274, 0d3FE0000000000000; +sub.f64 fd420, fd1290, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0d3FEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1272, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0d3FEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1271, fd355, fd1272; +mul.f64 fd435, fd1272, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0d3FEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1270, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0d3FEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1269, fd356, fd1270; +mul.f64 fd451, fd1270, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0d3FEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1297, 0dBFCD84D223638000; +mul.f64 fd1268, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1268, fd458; +mul.f64 fd460, fd1297, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0dBFCD84D223638000, fd460; +mul.f64 fd1266, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1267, fd1271, 0dBFDCB920325BAFA6; +sub.f64 fd464, fd1266, fd1267; +mul.f64 fd465, fd1271, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0dBFDCB920325BAFA6, fd465; +mul.f64 fd1264, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1265, fd1295, 0dBFDCB920325BAFA6; 
+sub.f64 fd469, fd1264, fd1265; +mul.f64 fd470, fd1295, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0dBFDCB920325BAFA6, fd470; +mul.f64 fd1262, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1263, fd1269, 0dBFE9AAFE4207DF5F; +sub.f64 fd474, fd1262, fd1263; +mul.f64 fd475, fd1269, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0dBFE9AAFE4207DF5F, fd475; +mul.f64 fd1260, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1261, fd307, 0dBFE491B7523C161D; +sub.f64 fd479, fd1260, fd1261; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0dBFE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0dBFEF838B8C811C17; +mul.f64 fd1259, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1259, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0dBFEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0dBFE9AAFE4207DF5F; +mul.f64 fd1258, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1258, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0dBFE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0dBFEEA7D99F29CADE; +mul.f64 fd1257, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1257, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0dBFEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0dBFED6206BEB6C24B; +mul.f64 fd1256, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1256, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0dBFED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0dBFE746A51650EADE; +mul.f64 fd1255, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1255, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0dBFE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0dBFEF838B8C811C17; +mul.f64 fd1254, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1254, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0dBFEF838B8C811C17, fd510; +mul.f64 fd1252, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1253, fd424, 0dBFD5E3A8748A0BF5; +sub.f64 fd514, fd1252, fd1253; +mul.f64 fd515, fd424, 
0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0dBFD5E3A8748A0BF5, fd515; +mul.f64 fd1250, fd318, 0dBFADC528B5343A86; +mul.f64 fd1251, fd324, 0dBFEFF223F3635CE3; +sub.f64 fd519, fd1250, fd1251; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0dBFEFF223F3635CE3, fd520; +mul.f64 fd1248, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1249, fd440, 0d3FBDB843E577175E; +sub.f64 fd524, fd1248, fd1249; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0d3FBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0dBFEEA7D99F29CADE; +mul.f64 fd1247, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1247, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0dBFEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0d3FE19593DA358510; +mul.f64 fd1246, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1246, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0d3FE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1245, fd1299, fd1273; +sub.f64 fd543, fd1299, fd1273; +mul.f64 fd544, fd543, 0d3FEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1244, fd1326, fd1245; +mul.f64 fd547, fd1245, 0d3FE0000000000000; +sub.f64 fd548, fd1326, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0d3FEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1243, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0d3FEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, fd560; +add.f64 fd1242, fd1324, fd1243; +mul.f64 fd563, fd1243, 0d3FE0000000000000; +sub.f64 fd564, fd1324, fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 fd566, fd565, 0d3FEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, 
fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, fd573; +add.f64 fd1241, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0d3FEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1240, fd1322, fd1241; +mul.f64 fd579, fd1241, 0d3FE0000000000000; +sub.f64 fd580, fd1322, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0d3FEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1239, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0d3FEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1238, fd191, fd1239; +mul.f64 fd595, fd1239, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0d3FEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1237, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0d3FEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1236, fd207, fd1237; +mul.f64 fd611, fd1237, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0d3FEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 fd1235, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0d3FEBB67AE8584CAA; +add.f64 fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +add.f64 fd1234, fd223, fd1235; +mul.f64 fd627, fd1235, 0d3FE0000000000000; 
+sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0d3FEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1233, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0d3FEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1232, fd192, fd1233; +mul.f64 fd643, fd1233, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0d3FEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1231, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0d3FEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1230, fd208, fd1231; +mul.f64 fd659, fd1231, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0d3FEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1229, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0d3FEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1228, fd224, fd1229; +mul.f64 fd675, fd1229, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0d3FEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; 
+ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd686, fd682, fd1242; +mul.f64 fd1227, fd681, fd554; +sub.f64 fd687, fd1227, fd686; +mul.f64 fd688, fd681, fd1242; +fma.rn.f64 fd689, fd682, fd554, fd688; +mul.f64 fd691, fd682, fd682; +mul.f64 fd1226, fd681, fd681; +sub.f64 fd692, fd1226, fd691; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd696, fd694, fd1240; +mul.f64 fd1225, fd692, fd570; +sub.f64 fd697, fd1225, fd696; +mul.f64 fd698, fd692, fd1240; +fma.rn.f64 fd699, fd694, fd570, fd698; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1224, fd681, fd692; +sub.f64 fd702, fd1224, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd706, fd704, fd1238; +mul.f64 fd1223, fd702, fd586; +sub.f64 fd707, fd1223, fd706; +mul.f64 fd708, fd702, fd1238; +fma.rn.f64 fd709, fd704, fd586, fd708; +mul.f64 fd1221, fd681, fd702; +mul.f64 fd1222, fd682, fd704; +sub.f64 fd712, fd1221, fd1222; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd1219, fd712, fd602; +mul.f64 fd1220, fd714, fd1236; +sub.f64 fd717, fd1219, fd1220; +mul.f64 fd718, fd712, fd1236; +fma.rn.f64 fd719, fd714, fd602, fd718; +mul.f64 fd1217, fd681, fd712; +mul.f64 fd1218, fd682, fd714; +sub.f64 fd722, fd1217, fd1218; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd1215, fd722, fd618; +mul.f64 fd1216, fd724, fd1234; +sub.f64 fd727, fd1215, fd1216; +mul.f64 fd728, fd722, fd1234; +fma.rn.f64 fd729, fd724, fd618, fd728; +mul.f64 fd731, fd682, fd724; +mul.f64 fd1214, fd681, fd722; +sub.f64 fd732, fd1214, fd731; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd736, fd734, fd1232; +mul.f64 fd1213, fd732, fd634; +sub.f64 fd737, fd1213, fd736; +mul.f64 fd738, fd732, fd1232; +fma.rn.f64 fd739, fd734, fd634, fd738; +mul.f64 fd741, fd682, fd734; +mul.f64 fd1212, fd681, fd732; +sub.f64 fd742, fd1212, fd741; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, 
fd743; +mul.f64 fd746, fd744, fd1230; +mul.f64 fd1211, fd742, fd650; +sub.f64 fd747, fd1211, fd746; +mul.f64 fd748, fd742, fd1230; +fma.rn.f64 fd749, fd744, fd650, fd748; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1210, fd681, fd742; +sub.f64 fd752, fd1210, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd756, fd754, fd1228; +mul.f64 fd1209, fd752, fd666; +sub.f64 fd757, fd1209, fd756; +mul.f64 fd758, fd752, fd1228; +fma.rn.f64 fd759, fd754, fd666, fd758; +mul.f64 fd1207, fd681, fd752; +mul.f64 fd1208, fd682, fd754; +sub.f64 fd762, fd1207, fd1208; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd1205, fd762, fd545; +mul.f64 fd1206, fd764, fd551; +sub.f64 fd767, fd1205, fd1206; +mul.f64 fd768, fd762, fd551; +fma.rn.f64 fd769, fd764, fd545, fd768; +mul.f64 fd1203, fd681, fd762; +mul.f64 fd1204, fd682, fd764; +sub.f64 fd772, fd1203, fd1204; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd776, fd774, fd567; +mul.f64 fd1202, fd772, fd561; +sub.f64 fd777, fd1202, fd776; +mul.f64 fd778, fd772, fd567; +fma.rn.f64 fd779, fd774, fd561, fd778; +mul.f64 fd781, fd682, fd774; +mul.f64 fd1201, fd681, fd772; +sub.f64 fd782, fd1201, fd781; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd786, fd784, fd583; +mul.f64 fd1200, fd782, fd577; +sub.f64 fd787, fd1200, fd786; +mul.f64 fd788, fd782, fd583; +fma.rn.f64 fd789, fd784, fd577, fd788; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1199, fd681, fd782; +sub.f64 fd792, fd1199, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd796, fd794, fd599; +mul.f64 fd1198, fd792, fd593; +sub.f64 fd797, fd1198, fd796; +mul.f64 fd798, fd792, fd599; +fma.rn.f64 fd799, fd794, fd593, fd798; +mul.f64 fd801, fd682, fd794; +mul.f64 fd1197, fd681, fd792; +sub.f64 fd802, fd1197, fd801; +mul.f64 fd803, fd681, fd794; +fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd1195, fd802, fd609; 
+mul.f64 fd1196, fd804, fd615; +sub.f64 fd807, fd1195, fd1196; +mul.f64 fd808, fd802, fd615; +fma.rn.f64 fd809, fd804, fd609, fd808; +ld.global.v2.f64 {fd810, fd811}, [rd6+48]; +mul.f64 fd815, fd811, fd631; +mul.f64 fd1194, fd810, fd625; +sub.f64 fd816, fd1194, fd815; +mul.f64 fd817, fd810, fd631; +fma.rn.f64 fd818, fd811, fd625, fd817; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1193, fd681, fd810; +sub.f64 fd821, fd1193, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd1191, fd821, fd641; +mul.f64 fd1192, fd823, fd647; +sub.f64 fd826, fd1191, fd1192; +mul.f64 fd827, fd821, fd647; +fma.rn.f64 fd828, fd823, fd641, fd827; +mul.f64 fd1189, fd681, fd821; +mul.f64 fd1190, fd682, fd823; +sub.f64 fd831, fd1189, fd1190; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd1187, fd831, fd657; +mul.f64 fd1188, fd833, fd663; +sub.f64 fd836, fd1187, fd1188; +mul.f64 fd837, fd831, fd663; +fma.rn.f64 fd838, fd833, fd657, fd837; +mul.f64 fd840, fd682, fd833; +mul.f64 fd1186, fd681, fd831; +sub.f64 fd841, fd1186, fd840; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd845, fd843, fd679; +mul.f64 fd1185, fd841, fd673; +sub.f64 fd846, fd1185, fd845; +mul.f64 fd847, fd841, fd679; +fma.rn.f64 fd848, fd843, fd673, fd847; +mul.f64 fd850, fd682, fd843; +mul.f64 fd1184, fd681, fd841; +sub.f64 fd851, fd1184, fd850; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd855, fd853, fd552; +mul.f64 fd1183, fd851, fd546; +sub.f64 fd856, fd1183, fd855; +mul.f64 fd857, fd851, fd552; +fma.rn.f64 fd858, fd853, fd546, fd857; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1182, fd681, fd851; +sub.f64 fd861, fd1182, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd865, fd863, fd568; +mul.f64 fd1181, fd861, fd562; +sub.f64 fd866, fd1181, fd865; +mul.f64 fd867, fd861, fd568; +fma.rn.f64 fd868, fd863, fd562, fd867; +mul.f64 fd1179, fd681, fd861; 
+mul.f64 fd1180, fd682, fd863; +sub.f64 fd871, fd1179, fd1180; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, fd861, fd872; +mul.f64 fd1177, fd871, fd578; +mul.f64 fd1178, fd873, fd584; +sub.f64 fd876, fd1177, fd1178; +mul.f64 fd877, fd871, fd584; +fma.rn.f64 fd878, fd873, fd578, fd877; +mul.f64 fd1175, fd681, fd871; +mul.f64 fd1176, fd682, fd873; +sub.f64 fd881, fd1175, fd1176; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd1173, fd881, fd594; +mul.f64 fd1174, fd883, fd600; +sub.f64 fd886, fd1173, fd1174; +mul.f64 fd887, fd881, fd600; +fma.rn.f64 fd888, fd883, fd594, fd887; +mul.f64 fd890, fd682, fd883; +mul.f64 fd1172, fd681, fd881; +sub.f64 fd891, fd1172, fd890; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd895, fd893, fd616; +mul.f64 fd1171, fd891, fd610; +sub.f64 fd896, fd1171, fd895; +mul.f64 fd897, fd891, fd616; +fma.rn.f64 fd898, fd893, fd610, fd897; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1170, fd681, fd891; +sub.f64 fd901, fd1170, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd905, fd903, fd632; +mul.f64 fd1169, fd901, fd626; +sub.f64 fd906, fd1169, fd905; +mul.f64 fd907, fd901, fd632; +fma.rn.f64 fd908, fd903, fd626, fd907; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1168, fd681, fd901; +sub.f64 fd911, fd1168, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd1166, fd911, fd642; +mul.f64 fd1167, fd913, fd648; +sub.f64 fd916, fd1166, fd1167; +mul.f64 fd917, fd911, fd648; +fma.rn.f64 fd918, fd913, fd642, fd917; +mul.f64 fd1164, fd681, fd911; +mul.f64 fd1165, fd682, fd913; +sub.f64 fd921, fd1164, fd1165; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd1162, fd921, fd658; +mul.f64 fd1163, fd923, fd664; +sub.f64 fd926, fd1162, fd1163; +mul.f64 fd927, fd921, fd664; +fma.rn.f64 fd928, fd923, fd658, fd927; +mul.f64 fd1160, fd681, fd921; +mul.f64 fd1161, fd682, fd923; +sub.f64 
fd931, fd1160, fd1161; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd935, fd933, fd680; +mul.f64 fd1159, fd931, fd674; +sub.f64 fd936, fd1159, fd935; +mul.f64 fd937, fd931, fd680; +fma.rn.f64 fd938, fd933, fd674, fd937; +mad.lo.s32 r8, r5, 648, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +st.shared.f64 [r9], fd538; +st.shared.f64 [r9+8], fd687; +st.shared.f64 [r9+16], fd697; +st.shared.f64 [r9+24], fd707; +st.shared.f64 [r9+32], fd717; +st.shared.f64 [r9+40], fd727; +st.shared.f64 [r9+48], fd737; +st.shared.f64 [r9+56], fd747; +st.shared.f64 [r9+64], fd757; +st.shared.f64 [r9+72], fd767; +st.shared.f64 [r9+80], fd777; +st.shared.f64 [r9+88], fd787; +st.shared.f64 [r9+96], fd797; +st.shared.f64 [r9+104], fd807; +st.shared.f64 [r9+112], fd816; +st.shared.f64 [r9+120], fd826; +st.shared.f64 [r9+128], fd836; +st.shared.f64 [r9+136], fd846; +st.shared.f64 [r9+144], fd856; +st.shared.f64 [r9+152], fd866; +st.shared.f64 [r9+160], fd876; +st.shared.f64 [r9+168], fd886; +st.shared.f64 [r9+176], fd896; +st.shared.f64 [r9+184], fd906; +st.shared.f64 [r9+192], fd916; +st.shared.f64 [r9+200], fd926; +st.shared.f64 [r9+208], fd936; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.f64 fd939, [r10]; +ld.shared.f64 fd940, [r10+24]; +ld.shared.f64 fd941, [r10+48]; +ld.shared.f64 fd942, [r10+72]; +ld.shared.f64 fd943, [r10+96]; +ld.shared.f64 fd944, [r10+120]; +ld.shared.f64 fd945, [r10+144]; +ld.shared.f64 fd946, [r10+168]; +ld.shared.f64 fd947, [r10+192]; +ld.shared.f64 fd948, [r10+216]; +ld.shared.f64 fd949, [r10+240]; +ld.shared.f64 fd950, [r10+264]; +ld.shared.f64 fd951, [r10+288]; +ld.shared.f64 fd952, [r10+312]; +ld.shared.f64 fd953, [r10+336]; +ld.shared.f64 fd954, [r10+360]; +ld.shared.f64 fd955, [r10+384]; +ld.shared.f64 fd956, [r10+408]; +ld.shared.f64 fd957, [r10+432]; +ld.shared.f64 fd958, [r10+456]; +ld.shared.f64 fd959, [r10+480]; +ld.shared.f64 fd960, [r10+504]; +ld.shared.f64 fd961, [r10+528]; +ld.shared.f64 fd962, 
[r10+552]; +ld.shared.f64 fd963, [r10+576]; +ld.shared.f64 fd964, [r10+600]; +ld.shared.f64 fd965, [r10+624]; +barrier.sync 0; +st.shared.f64 [r9], fd1244; +st.shared.f64 [r9+8], fd689; +st.shared.f64 [r9+16], fd699; +st.shared.f64 [r9+24], fd709; +st.shared.f64 [r9+32], fd719; +st.shared.f64 [r9+40], fd729; +st.shared.f64 [r9+48], fd739; +st.shared.f64 [r9+56], fd749; +st.shared.f64 [r9+64], fd759; +st.shared.f64 [r9+72], fd769; +st.shared.f64 [r9+80], fd779; +st.shared.f64 [r9+88], fd789; +st.shared.f64 [r9+96], fd799; +st.shared.f64 [r9+104], fd809; +st.shared.f64 [r9+112], fd818; +st.shared.f64 [r9+120], fd828; +st.shared.f64 [r9+128], fd838; +st.shared.f64 [r9+136], fd848; +st.shared.f64 [r9+144], fd858; +st.shared.f64 [r9+152], fd868; +st.shared.f64 [r9+160], fd878; +st.shared.f64 [r9+168], fd888; +st.shared.f64 [r9+176], fd898; +st.shared.f64 [r9+184], fd908; +st.shared.f64 [r9+192], fd918; +st.shared.f64 [r9+200], fd928; +st.shared.f64 [r9+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r10]; +ld.shared.f64 fd967, [r10+24]; +ld.shared.f64 fd968, [r10+48]; +ld.shared.f64 fd969, [r10+72]; +ld.shared.f64 fd970, [r10+96]; +ld.shared.f64 fd971, [r10+120]; +ld.shared.f64 fd972, [r10+144]; +ld.shared.f64 fd973, [r10+168]; +ld.shared.f64 fd974, [r10+192]; +ld.shared.f64 fd975, [r10+216]; +ld.shared.f64 fd976, [r10+240]; +ld.shared.f64 fd977, [r10+264]; +ld.shared.f64 fd978, [r10+288]; +ld.shared.f64 fd979, [r10+312]; +ld.shared.f64 fd980, [r10+336]; +ld.shared.f64 fd981, [r10+360]; +ld.shared.f64 fd982, [r10+384]; +ld.shared.f64 fd983, [r10+408]; +ld.shared.f64 fd984, [r10+432]; +ld.shared.f64 fd985, [r10+456]; +ld.shared.f64 fd986, [r10+480]; +ld.shared.f64 fd987, [r10+504]; +ld.shared.f64 fd988, [r10+528]; +ld.shared.f64 fd989, [r10+552]; +ld.shared.f64 fd990, [r10+576]; +ld.shared.f64 fd991, [r10+600]; +ld.shared.f64 fd992, [r10+624]; +add.f64 fd993, fd948, fd957; +mul.f64 fd995, fd993, 0d3FE0000000000000; +sub.f64 fd996, fd939, fd995; +add.f64 fd1158, 
fd975, fd984; +sub.f64 fd997, fd975, fd984; +mul.f64 fd998, fd997, 0d3FEBB67AE8584CAA; +mul.f64 fd999, fd1158, 0d3FE0000000000000; +sub.f64 fd1000, fd966, fd999; +sub.f64 fd1001, fd948, fd957; +mul.f64 fd1002, fd1001, 0d3FEBB67AE8584CAA; +add.f64 fd1003, fd949, fd958; +mul.f64 fd1005, fd1003, 0d3FE0000000000000; +sub.f64 fd1006, fd940, fd1005; +add.f64 fd1157, fd976, fd985; +sub.f64 fd1007, fd976, fd985; +mul.f64 fd1008, fd1007, 0d3FEBB67AE8584CAA; +mul.f64 fd1009, fd1157, 0d3FE0000000000000; +sub.f64 fd1010, fd967, fd1009; +sub.f64 fd1011, fd949, fd958; +mul.f64 fd1012, fd1011, 0d3FEBB67AE8584CAA; +add.f64 fd1013, fd950, fd959; +mul.f64 fd1015, fd1013, 0d3FE0000000000000; +sub.f64 fd1016, fd941, fd1015; +add.f64 fd1156, fd977, fd986; +sub.f64 fd1017, fd977, fd986; +mul.f64 fd1018, fd1017, 0d3FEBB67AE8584CAA; +mul.f64 fd1019, fd1156, 0d3FE0000000000000; +sub.f64 fd1020, fd968, fd1019; +sub.f64 fd1021, fd950, fd959; +mul.f64 fd1022, fd1021, 0d3FEBB67AE8584CAA; +add.f64 fd1023, fd951, fd960; +mul.f64 fd1025, fd1023, 0d3FE0000000000000; +sub.f64 fd1026, fd942, fd1025; +add.f64 fd1155, fd978, fd987; +sub.f64 fd1027, fd978, fd987; +mul.f64 fd1028, fd1027, 0d3FEBB67AE8584CAA; +mul.f64 fd1029, fd1155, 0d3FE0000000000000; +sub.f64 fd1030, fd969, fd1029; +sub.f64 fd1031, fd951, fd960; +mul.f64 fd1032, fd1031, 0d3FEBB67AE8584CAA; +add.f64 fd1033, fd952, fd961; +mul.f64 fd1035, fd1033, 0d3FE0000000000000; +sub.f64 fd1036, fd943, fd1035; +add.f64 fd1154, fd979, fd988; +sub.f64 fd1037, fd979, fd988; +mul.f64 fd1038, fd1037, 0d3FEBB67AE8584CAA; +mul.f64 fd1039, fd1154, 0d3FE0000000000000; +sub.f64 fd1040, fd970, fd1039; +sub.f64 fd1041, fd952, fd961; +mul.f64 fd1042, fd1041, 0d3FEBB67AE8584CAA; +add.f64 fd1043, fd953, fd962; +mul.f64 fd1045, fd1043, 0d3FE0000000000000; +sub.f64 fd1046, fd944, fd1045; +add.f64 fd1153, fd980, fd989; +sub.f64 fd1047, fd980, fd989; +mul.f64 fd1048, fd1047, 0d3FEBB67AE8584CAA; +mul.f64 fd1049, fd1153, 0d3FE0000000000000; +sub.f64 fd1050, fd971, 
fd1049; +sub.f64 fd1051, fd953, fd962; +mul.f64 fd1052, fd1051, 0d3FEBB67AE8584CAA; +add.f64 fd1053, fd954, fd963; +mul.f64 fd1055, fd1053, 0d3FE0000000000000; +sub.f64 fd1056, fd945, fd1055; +add.f64 fd1152, fd981, fd990; +sub.f64 fd1057, fd981, fd990; +mul.f64 fd1058, fd1057, 0d3FEBB67AE8584CAA; +mul.f64 fd1059, fd1152, 0d3FE0000000000000; +sub.f64 fd1060, fd972, fd1059; +sub.f64 fd1061, fd954, fd963; +mul.f64 fd1062, fd1061, 0d3FEBB67AE8584CAA; +add.f64 fd1063, fd955, fd964; +mul.f64 fd1065, fd1063, 0d3FE0000000000000; +sub.f64 fd1066, fd946, fd1065; +add.f64 fd1151, fd982, fd991; +sub.f64 fd1067, fd982, fd991; +mul.f64 fd1068, fd1067, 0d3FEBB67AE8584CAA; +mul.f64 fd1069, fd1151, 0d3FE0000000000000; +sub.f64 fd1070, fd973, fd1069; +sub.f64 fd1071, fd955, fd964; +mul.f64 fd1072, fd1071, 0d3FEBB67AE8584CAA; +add.f64 fd1073, fd956, fd965; +mul.f64 fd1075, fd1073, 0d3FE0000000000000; +sub.f64 fd1076, fd947, fd1075; +add.f64 fd1150, fd983, fd992; +sub.f64 fd1077, fd983, fd992; +mul.f64 fd1078, fd1077, 0d3FEBB67AE8584CAA; +mul.f64 fd1079, fd1150, 0d3FE0000000000000; +sub.f64 fd1080, fd974, fd1079; +sub.f64 fd1081, fd956, fd965; +mul.f64 fd1342, fd1013, 0d3FE0000000000000; +sub.f64 fd1341, fd941, fd1342; +mul.f64 fd1082, fd1081, 0d3FEBB67AE8584CAA; +add.f64 %0, fd939, fd993; +mul.f64 fd1344, fd1155, 0d3FE0000000000000; +sub.f64 fd1343, fd969, fd1344; +add.f64 %1, fd966, fd1158; +mul.f64 fd1346, fd1013, 0d3FE0000000000000; +sub.f64 fd1345, fd941, fd1346; +mul.f64 fd1348, fd1003, 0d3FE0000000000000; +sub.f64 fd1347, fd940, fd1348; +add.f64 %2, fd940, fd1003; +add.f64 %3, fd967, fd1157; +add.f64 %4, fd941, fd1013; +add.f64 %5, fd968, fd1156; +add.f64 %6, fd942, fd1023; +add.f64 %7, fd969, fd1155; +add.f64 %8, fd943, fd1033; +add.f64 %9, fd970, fd1154; +add.f64 %10, fd944, fd1043; +add.f64 %11, fd971, fd1153; +add.f64 %12, fd945, fd1053; +add.f64 %13, fd972, fd1152; +add.f64 %14, fd946, fd1063; +add.f64 %15, fd973, fd1151; +add.f64 %16, fd947, fd1073; +add.f64 %17, fd974, 
fd1150; +add.f64 %18, fd998, fd996; +sub.f64 %19, fd1000, fd1002; +add.f64 %20, fd1008, fd1347; +sub.f64 %21, fd1010, fd1012; +sub.f64 %23, fd1020, fd1022; +add.f64 %22, fd1018, fd1345; +sub.f64 %25, fd1343, fd1032; +add.f64 %24, fd1028, fd1026; +sub.f64 %27, fd1040, fd1042; +add.f64 %26, fd1038, fd1036; +sub.f64 %29, fd1050, fd1052; +add.f64 %28, fd1048, fd1046; +add.f64 %30, fd1058, fd1056; +sub.f64 %31, fd1060, fd1062; +add.f64 %32, fd1068, fd1066; +sub.f64 %33, fd1070, fd1072; +sub.f64 %35, fd1080, fd1082; +add.f64 %34, fd1078, fd1076; +sub.f64 %36, fd996, fd998; +add.f64 %37, fd1002, fd1000; +sub.f64 %38, fd1347, fd1008; +add.f64 %39, fd1012, fd1010; +sub.f64 %40, fd1345, fd1018; +add.f64 %41, fd1022, fd1020; +sub.f64 %42, fd1026, fd1028; +add.f64 %43, fd1032, fd1343; +sub.f64 %44, fd1036, fd1038; +add.f64 %45, fd1042, fd1040; +sub.f64 %46, fd1046, fd1048; +add.f64 %47, fd1052, fd1050; +sub.f64 %48, fd1056, fd1058; +add.f64 %49, fd1062, fd1060; +sub.f64 %50, fd1066, fd1068; +add.f64 %51, fd1072, fd1070; +sub.f64 %52, fd1076, fd1078; +add.f64 %53, fd1082, fd1080; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), 
"=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<513, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<167>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1296, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %12, %15; +add.f64 fd14, %14, %16; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %10, fd15; +sub.f64 fd17, %14, %16; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %11, fd21; +sub.f64 fd23, %12, %15; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, 
fd22; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 1296, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+432]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r13, r11, 48, r12; +add.f64 fd41, %11, fd14; +add.f64 fd42, %10, fd13; +st.shared.v2.f64 [r13], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r13+16], {fd44, fd43}; +fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r13+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r14, r11, 5; +sub.s32 r15, r13, r14; +ld.shared.v2.f64 {fd47, fd48}, [r15]; +ld.shared.v2.f64 {fd51, fd52}, [r15+432]; +ld.shared.v2.f64 {fd55, fd56}, [r15+864]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0d3FEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 4; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd73, fd65; +mul.f64 fd78, fd74, fd71; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 {fd80, fd81}, [rd11+144]; +mul.f64 fd84, fd80, fd66; +mul.f64 fd85, fd81, 
fd72; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r21, r16, 144, r20; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r21], {fd88, fd87}; +fma.rn.f64 fd89, fd74, fd65, fd79; +sub.f64 fd90, fd77, fd78; +st.shared.v2.f64 [r21+48], {fd90, fd89}; +fma.rn.f64 fd91, fd81, fd66, fd86; +sub.f64 fd92, fd84, fd85; +st.shared.v2.f64 [r21+96], {fd92, fd91}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r15]; +ld.shared.v2.f64 {fd97, fd98}, [r15+432]; +ld.shared.v2.f64 {fd101, fd102}, [r15+864]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0d3FEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 4; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd119, fd111; +mul.f64 fd124, fd120, fd117; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+48]; +mul.f64 fd130, fd126, fd112; +mul.f64 fd131, fd127, fd118; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r27, r22, 432, r26; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r27], {fd134, fd133}; +fma.rn.f64 fd135, fd120, fd111, fd125; +sub.f64 fd136, fd123, fd124; +st.shared.v2.f64 [r27+144], {fd136, fd135}; +fma.rn.f64 fd137, fd127, fd112, fd132; +sub.f64 fd138, fd130, fd131; +st.shared.v2.f64 [r27+288], {fd138, fd137}; +barrier.sync 0; +ld.shared.v2.f64 {fd139, fd140}, [r15]; +ld.shared.v2.f64 {fd143, fd144}, [r15+432]; 
+ld.shared.v2.f64 {fd147, fd148}, [r15+864]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0d3FEBB67AE8584CAA; +mul.f64 fd157, fd152, 0d3FE0000000000000; +sub.f64 fd158, fd140, fd157; +sub.f64 fd159, fd143, fd147; +mul.f64 fd160, fd159, 0d3FEBB67AE8584CAA; +add.f64 %1, fd140, fd152; +add.f64 %0, fd139, fd151; +sub.f64 %3, fd158, fd160; +add.f64 %2, fd156, fd154; +add.f64 %5, fd160, fd158; +sub.f64 %4, fd154, fd156; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<514, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<149>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 648, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %12, %15; +add.f64 fd14, %10, fd13; +add.f64 fd15, %14, %16; +add.f64 fd16, %11, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %10, fd17; +sub.f64 fd19, %14, %16; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %11, fd23; +sub.f64 fd25, %12, %15; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 648, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, 
fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+432]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r13, r11, 24, r12; +st.shared.f64 [r13], fd14; +st.shared.f64 [r13+8], fd35; +st.shared.f64 [r13+16], fd44; +barrier.sync 0; +shl.b32 r14, r11, 4; +sub.s32 r15, r13, r14; +ld.shared.f64 fd47, [r15]; +ld.shared.f64 fd48, [r15+216]; +ld.shared.f64 fd49, [r15+432]; +barrier.sync 0; +st.shared.f64 [r13], fd16; +st.shared.f64 [r13+8], fd37; +st.shared.f64 [r13+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r15]; +ld.shared.f64 fd51, [r15+216]; +ld.shared.f64 fd52, [r15+432]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd69, fd61; +mul.f64 fd74, fd70, fd67; +sub.f64 fd75, fd73, fd74; +mul.f64 fd76, fd69, fd67; +fma.rn.f64 fd77, fd70, fd61, fd76; +ld.global.v2.f64 {fd78, fd79}, [rd11+144]; +mul.f64 fd82, fd78, fd62; +mul.f64 fd83, fd79, fd68; +sub.f64 fd84, fd82, fd83; +mul.f64 fd85, fd78, fd68; +fma.rn.f64 fd86, fd79, fd62, fd85; +barrier.sync 0; +mad.lo.s32 r21, r16, 72, r20; +st.shared.f64 
[r21], fd54; +st.shared.f64 [r21+24], fd75; +st.shared.f64 [r21+48], fd84; +barrier.sync 0; +ld.shared.f64 fd87, [r15]; +ld.shared.f64 fd88, [r15+216]; +ld.shared.f64 fd89, [r15+432]; +barrier.sync 0; +st.shared.f64 [r21], fd56; +st.shared.f64 [r21+24], fd77; +st.shared.f64 [r21+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r15]; +ld.shared.f64 fd91, [r15+216]; +ld.shared.f64 fd92, [r15+432]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0d3FEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0d3FEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 3; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd109, fd101; +mul.f64 fd114, fd110, fd107; +sub.f64 fd115, fd113, fd114; +mul.f64 fd116, fd109, fd107; +fma.rn.f64 fd117, fd110, fd101, fd116; +ld.global.v2.f64 {fd118, fd119}, [rd16+48]; +mul.f64 fd122, fd118, fd102; +mul.f64 fd123, fd119, fd108; +sub.f64 fd124, fd122, fd123; +mul.f64 fd125, fd118, fd108; +fma.rn.f64 fd126, fd119, fd102, fd125; +barrier.sync 0; +mad.lo.s32 r27, r22, 216, r26; +st.shared.f64 [r27], fd94; +st.shared.f64 [r27+72], fd115; +st.shared.f64 [r27+144], fd124; +barrier.sync 0; +ld.shared.f64 fd127, [r15]; +ld.shared.f64 fd128, [r15+216]; +ld.shared.f64 fd129, [r15+432]; +barrier.sync 0; +st.shared.f64 [r27], fd96; +st.shared.f64 [r27+72], fd117; +st.shared.f64 [r27+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r15]; +ld.shared.f64 fd131, 
[r15+216]; +ld.shared.f64 fd132, [r15+432]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd131, fd132; +mul.f64 fd135, fd133, 0d3FE0000000000000; +sub.f64 fd136, fd127, fd135; +sub.f64 fd137, fd131, fd132; +mul.f64 fd138, fd137, 0d3FEBB67AE8584CAA; +mul.f64 fd139, fd134, 0d3FE0000000000000; +sub.f64 fd140, fd130, fd139; +sub.f64 fd141, fd128, fd129; +mul.f64 fd142, fd141, 0d3FEBB67AE8584CAA; +add.f64 %0, fd127, fd133; +add.f64 %1, fd130, fd134; +add.f64 %2, fd138, fd136; +sub.f64 %3, fd140, fd142; +sub.f64 %4, fd136, fd138; +add.f64 %5, fd142, fd140; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..0dd3f4c8d2e91 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_81_fp64_inv.hpp.inc @@ -0,0 +1,3280 @@ +#ifndef CUFFTDX_FFT_81_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_81_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<680, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<383>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 1296, r2; +add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, 
fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, 
fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd40, fd113; +sub.f64 fd115, fd54, fd70; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +add.f64 fd119, fd87, fd92; +add.f64 fd120, fd45, fd119; +add.f64 fd121, fd89, fd94; +add.f64 fd122, fd51, fd121; +mul.f64 fd123, fd119, 0d3FE0000000000000; +sub.f64 fd124, fd45, fd123; +sub.f64 fd125, fd89, fd94; +mul.f64 fd126, fd125, 0dBFEBB67AE8584CAA; +add.f64 fd127, fd126, fd124; +sub.f64 fd128, fd124, fd126; +mul.f64 fd129, fd121, 0d3FE0000000000000; +sub.f64 fd130, fd51, fd129; +sub.f64 fd131, fd87, fd92; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +sub.f64 fd133, fd130, fd132; +add.f64 fd134, fd132, fd130; +add.f64 fd135, fd97, fd102; +add.f64 fd136, fd46, fd135; +add.f64 fd137, fd99, fd104; +add.f64 fd138, fd52, fd137; +mul.f64 fd139, fd135, 0d3FE0000000000000; +sub.f64 fd140, fd46, fd139; +sub.f64 fd141, fd99, fd104; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 fd143, fd142, fd140; +sub.f64 fd144, fd140, fd142; +mul.f64 fd145, fd137, 0d3FE0000000000000; +sub.f64 fd146, fd52, fd145; +sub.f64 fd147, fd97, fd102; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +sub.f64 fd149, fd146, fd148; +add.f64 fd150, fd148, fd146; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 1296, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd151, fd152}, [rd6]; +mul.f64 fd155, fd122, fd152; +mul.f64 fd156, fd120, fd152; +mul.f64 fd157, fd151, fd122; +mul.f64 fd158, fd151, fd151; +mul.f64 fd159, fd152, fd152; +sub.f64 fd160, fd158, fd159; +mul.f64 fd161, fd152, fd151; +fma.rn.f64 fd162, fd152, fd151, fd161; 
+mul.f64 fd163, fd138, fd162; +mul.f64 fd164, fd136, fd162; +mul.f64 fd165, fd160, fd138; +mul.f64 fd166, fd151, fd160; +mul.f64 fd167, fd152, fd162; +sub.f64 fd168, fd166, fd167; +mul.f64 fd169, fd151, fd162; +fma.rn.f64 fd170, fd152, fd160, fd169; +mul.f64 fd171, fd117, fd170; +mul.f64 fd172, fd111, fd170; +mul.f64 fd173, fd168, fd117; +mul.f64 fd174, fd151, fd168; +mul.f64 fd175, fd152, fd170; +sub.f64 fd176, fd174, fd175; +mul.f64 fd177, fd151, fd170; +fma.rn.f64 fd178, fd152, fd168, fd177; +mul.f64 fd179, fd133, fd178; +mul.f64 fd180, fd127, fd178; +mul.f64 fd181, fd176, fd133; +ld.global.v2.f64 {fd182, fd183}, [rd6+144]; +mul.f64 fd186, fd149, fd183; +mul.f64 fd187, fd143, fd183; +mul.f64 fd188, fd182, fd149; +mul.f64 fd189, fd151, fd182; +mul.f64 fd190, fd152, fd183; +sub.f64 fd191, fd189, fd190; +mul.f64 fd192, fd151, fd183; +fma.rn.f64 fd193, fd152, fd182, fd192; +mul.f64 fd194, fd118, fd193; +mul.f64 fd195, fd112, fd193; +mul.f64 fd196, fd191, fd118; +mul.f64 fd197, fd151, fd191; +mul.f64 fd198, fd152, fd193; +sub.f64 fd199, fd197, fd198; +mul.f64 fd200, fd151, fd193; +fma.rn.f64 fd201, fd152, fd191, fd200; +mul.f64 fd202, fd134, fd201; +mul.f64 fd203, fd128, fd201; +mul.f64 fd204, fd199, fd134; +mul.f64 fd205, fd151, fd199; +mul.f64 fd206, fd152, fd201; +sub.f64 fd207, fd205, fd206; +mul.f64 fd208, fd151, fd201; +fma.rn.f64 fd209, fd152, fd199, fd208; +mul.f64 fd210, fd150, fd209; +mul.f64 fd211, fd144, fd209; +mul.f64 fd212, fd207, fd150; +barrier.sync 0; +mad.lo.s32 r9, r7, 144, r8; +add.f64 fd213, fd40, fd106; +add.f64 fd214, fd38, fd105; +st.shared.v2.f64 [r9], {fd214, fd213}; +fma.rn.f64 fd215, fd151, fd120, fd155; +sub.f64 fd216, fd157, fd156; +st.shared.v2.f64 [r9+16], {fd215, fd216}; +fma.rn.f64 fd217, fd160, fd136, fd163; +sub.f64 fd218, fd165, fd164; +st.shared.v2.f64 [r9+32], {fd217, fd218}; +sub.f64 fd219, fd173, fd172; +fma.rn.f64 fd220, fd168, fd111, fd171; +st.shared.v2.f64 [r9+48], {fd220, fd219}; +fma.rn.f64 fd221, fd176, fd127, fd179; 
+sub.f64 fd222, fd181, fd180; +st.shared.v2.f64 [r9+64], {fd221, fd222}; +fma.rn.f64 fd223, fd182, fd143, fd186; +sub.f64 fd224, fd188, fd187; +st.shared.v2.f64 [r9+80], {fd223, fd224}; +fma.rn.f64 fd225, fd191, fd112, fd194; +sub.f64 fd226, fd196, fd195; +st.shared.v2.f64 [r9+96], {fd225, fd226}; +fma.rn.f64 fd227, fd199, fd128, fd202; +sub.f64 fd228, fd204, fd203; +st.shared.v2.f64 [r9+112], {fd227, fd228}; +fma.rn.f64 fd229, fd207, fd144, fd210; +sub.f64 fd230, fd212, fd211; +st.shared.v2.f64 [r9+128], {fd229, fd230}; +barrier.sync 0; +shl.b32 r10, r7, 7; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd231, fd232}, [r11]; +ld.shared.v2.f64 {fd235, fd236}, [r11+144]; +ld.shared.v2.f64 {fd239, fd240}, [r11+288]; +ld.shared.v2.f64 {fd243, fd244}, [r11+432]; +ld.shared.v2.f64 {fd247, fd248}, [r11+576]; +ld.shared.v2.f64 {fd251, fd252}, [r11+720]; +ld.shared.v2.f64 {fd255, fd256}, [r11+864]; +ld.shared.v2.f64 {fd259, fd260}, [r11+1008]; +ld.shared.v2.f64 {fd263, fd264}, [r11+1152]; +add.f64 fd267, fd243, fd255; +add.f64 fd268, fd231, fd267; +add.f64 fd269, fd244, fd256; +add.f64 fd270, fd232, fd269; +mul.f64 fd271, fd267, 0d3FE0000000000000; +sub.f64 fd272, fd231, fd271; +sub.f64 fd273, fd244, fd256; +mul.f64 fd274, fd273, 0dBFEBB67AE8584CAA; +add.f64 fd275, fd274, fd272; +sub.f64 fd276, fd272, fd274; +mul.f64 fd277, fd269, 0d3FE0000000000000; +sub.f64 fd278, fd232, fd277; +sub.f64 fd279, fd243, fd255; +mul.f64 fd280, fd279, 0dBFEBB67AE8584CAA; +sub.f64 fd281, fd278, fd280; +add.f64 fd282, fd280, fd278; +add.f64 fd283, fd247, fd259; +add.f64 fd284, fd235, fd283; +add.f64 fd285, fd248, fd260; +add.f64 fd286, fd236, fd285; +mul.f64 fd287, fd283, 0d3FE0000000000000; +sub.f64 fd288, fd235, fd287; +sub.f64 fd289, fd248, fd260; +mul.f64 fd290, fd289, 0dBFEBB67AE8584CAA; +add.f64 fd291, fd290, fd288; +sub.f64 fd292, fd288, fd290; +mul.f64 fd293, fd285, 0d3FE0000000000000; +sub.f64 fd294, fd236, fd293; +sub.f64 fd295, fd247, fd259; +mul.f64 fd296, fd295, 0dBFEBB67AE8584CAA; 
+sub.f64 fd297, fd294, fd296; +add.f64 fd298, fd296, fd294; +add.f64 fd299, fd251, fd263; +add.f64 fd300, fd239, fd299; +add.f64 fd301, fd252, fd264; +add.f64 fd302, fd240, fd301; +mul.f64 fd303, fd299, 0d3FE0000000000000; +sub.f64 fd304, fd239, fd303; +sub.f64 fd305, fd252, fd264; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +add.f64 fd307, fd306, fd304; +sub.f64 fd308, fd304, fd306; +mul.f64 fd309, fd301, 0d3FE0000000000000; +sub.f64 fd310, fd240, fd309; +sub.f64 fd311, fd251, fd263; +mul.f64 fd312, fd311, 0dBFEBB67AE8584CAA; +sub.f64 fd313, fd310, fd312; +add.f64 fd314, fd312, fd310; +mul.f64 fd315, fd291, 0d3FE8836FA2CF5039; +mul.f64 fd316, fd297, 0d3FE491B7523C161D; +sub.f64 fd317, fd315, fd316; +mul.f64 fd318, fd297, 0d3FE8836FA2CF5039; +fma.rn.f64 fd319, fd291, 0d3FE491B7523C161D, fd318; +mul.f64 fd320, fd307, 0d3FC63A1A7E0B738A; +mul.f64 fd321, fd313, 0d3FEF838B8C811C17; +sub.f64 fd322, fd320, fd321; +mul.f64 fd323, fd313, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd324, fd307, 0d3FEF838B8C811C17, fd323; +mul.f64 fd325, fd292, 0d3FC63A1A7E0B738A; +mul.f64 fd326, fd298, 0d3FEF838B8C811C17; +sub.f64 fd327, fd325, fd326; +mul.f64 fd328, fd298, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd329, fd292, 0d3FEF838B8C811C17, fd328; +mul.f64 fd330, fd308, 0dBFEE11F642522D1C; +mul.f64 fd331, fd314, 0d3FD5E3A8748A0BF5; +sub.f64 fd332, fd330, fd331; +mul.f64 fd333, fd314, 0dBFEE11F642522D1C; +fma.rn.f64 fd334, fd308, 0d3FD5E3A8748A0BF5, fd333; +add.f64 fd335, fd284, fd300; +add.f64 fd336, fd286, fd302; +mul.f64 fd337, fd335, 0d3FE0000000000000; +sub.f64 fd338, fd268, fd337; +sub.f64 fd339, fd286, fd302; +mul.f64 fd340, fd339, 0dBFEBB67AE8584CAA; +mul.f64 fd341, fd336, 0d3FE0000000000000; +sub.f64 fd342, fd270, fd341; +sub.f64 fd343, fd284, fd300; +mul.f64 fd344, fd343, 0dBFEBB67AE8584CAA; +add.f64 fd345, fd317, fd322; +add.f64 fd346, fd319, fd324; +mul.f64 fd347, fd345, 0d3FE0000000000000; +sub.f64 fd348, fd275, fd347; +sub.f64 fd349, fd319, fd324; +mul.f64 fd350, fd349, 0dBFEBB67AE8584CAA; 
+mul.f64 fd351, fd346, 0d3FE0000000000000; +sub.f64 fd352, fd281, fd351; +sub.f64 fd353, fd317, fd322; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +add.f64 fd355, fd327, fd332; +add.f64 fd356, fd329, fd334; +mul.f64 fd357, fd355, 0d3FE0000000000000; +sub.f64 fd358, fd276, fd357; +sub.f64 fd359, fd329, fd334; +mul.f64 fd360, fd359, 0dBFEBB67AE8584CAA; +mul.f64 fd361, fd356, 0d3FE0000000000000; +sub.f64 fd362, fd282, fd361; +sub.f64 fd363, fd327, fd332; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 %1, fd270, fd336; +add.f64 %0, fd268, fd335; +add.f64 %3, fd281, fd346; +add.f64 %2, fd275, fd345; +add.f64 %5, fd282, fd356; +add.f64 %4, fd276, fd355; +sub.f64 %7, fd342, fd344; +add.f64 %6, fd340, fd338; +sub.f64 %9, fd352, fd354; +add.f64 %8, fd350, fd348; +sub.f64 %11, fd362, fd364; +add.f64 %10, fd360, fd358; +add.f64 %13, fd344, fd342; +sub.f64 %12, fd338, fd340; +add.f64 %15, fd354, fd352; +sub.f64 %14, fd348, fd350; +add.f64 %17, fd364, fd362; +sub.f64 %16, fd358, fd360; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<681, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<365>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %18; +mad.lo.s32 r3, r1, 648, r2; 
+add.f64 fd37, %28, %36; +add.f64 fd38, %20, fd37; +add.f64 fd39, %29, %37; +add.f64 fd40, %21, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %20, fd41; +sub.f64 fd43, %29, %37; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %21, fd47; +sub.f64 fd49, %28, %36; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %30, %38; +add.f64 fd54, %22, fd53; +add.f64 fd55, %32, %40; +add.f64 fd56, %24, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %22, fd57; +sub.f64 fd59, %32, %40; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %24, fd63; +sub.f64 fd65, %30, %38; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %33, %41; +add.f64 fd70, %25, fd69; +add.f64 fd71, %35, %42; +add.f64 fd72, %27, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %25, fd73; +sub.f64 fd75, %35, %42; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %27, fd79; +sub.f64 fd81, %33, %41; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mov.u32 r4, %tid.x; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; 
+fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd38, fd105; +add.f64 fd107, fd56, fd72; +add.f64 fd108, fd40, fd107; +mul.f64 fd109, fd105, 0d3FE0000000000000; +sub.f64 fd110, fd38, fd109; +sub.f64 fd111, fd56, fd72; +mul.f64 fd112, fd111, 0dBFEBB67AE8584CAA; +add.f64 fd113, fd112, fd110; +sub.f64 fd114, fd110, fd112; +mul.f64 fd115, fd107, 0d3FE0000000000000; +sub.f64 fd116, fd40, fd115; +sub.f64 fd117, fd54, fd70; +mul.f64 fd118, fd117, 0dBFEBB67AE8584CAA; +sub.f64 fd119, fd116, fd118; +add.f64 fd120, fd118, fd116; +add.f64 fd121, fd87, fd92; +add.f64 fd122, fd45, fd121; +add.f64 fd123, fd89, fd94; +add.f64 fd124, fd51, fd123; +mul.f64 fd125, fd121, 0d3FE0000000000000; +sub.f64 fd126, fd45, fd125; +sub.f64 fd127, fd89, fd94; +mul.f64 fd128, fd127, 0dBFEBB67AE8584CAA; +add.f64 fd129, fd128, fd126; +sub.f64 fd130, fd126, fd128; +mul.f64 fd131, fd123, 0d3FE0000000000000; +sub.f64 fd132, fd51, fd131; +sub.f64 fd133, fd87, fd92; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +sub.f64 fd135, fd132, fd134; +add.f64 fd136, fd134, fd132; +add.f64 fd137, fd97, fd102; +add.f64 fd138, fd46, fd137; +add.f64 fd139, fd99, fd104; +add.f64 fd140, fd52, fd139; +mul.f64 fd141, fd137, 0d3FE0000000000000; +sub.f64 fd142, fd46, fd141; +sub.f64 fd143, fd99, fd104; +mul.f64 fd144, fd143, 0dBFEBB67AE8584CAA; +add.f64 fd145, fd144, fd142; +sub.f64 fd146, fd142, fd144; +mul.f64 fd147, fd139, 0d3FE0000000000000; +sub.f64 fd148, fd52, fd147; +sub.f64 fd149, fd97, fd102; +mul.f64 fd150, fd149, 0dBFEBB67AE8584CAA; +sub.f64 fd151, fd148, fd150; +add.f64 fd152, fd150, fd148; +mul.wide.u32 rd2, r4, 954437177; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 9; +sub.s32 r7, r4, r6; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %19; +add.s64 
rd6, rd5, rd4; +ld.global.v2.f64 {fd153, fd154}, [rd6]; +mul.f64 fd157, fd124, fd154; +fma.rn.f64 fd158, fd153, fd122, fd157; +mul.f64 fd159, fd122, fd154; +mul.f64 fd160, fd153, fd124; +sub.f64 fd161, fd160, fd159; +mul.f64 fd162, fd153, fd153; +mul.f64 fd163, fd154, fd154; +sub.f64 fd164, fd162, fd163; +mul.f64 fd165, fd154, fd153; +fma.rn.f64 fd166, fd154, fd153, fd165; +mul.f64 fd167, fd140, fd166; +fma.rn.f64 fd168, fd164, fd138, fd167; +mul.f64 fd169, fd138, fd166; +mul.f64 fd170, fd164, fd140; +sub.f64 fd171, fd170, fd169; +mul.f64 fd172, fd153, fd164; +mul.f64 fd173, fd154, fd166; +sub.f64 fd174, fd172, fd173; +mul.f64 fd175, fd153, fd166; +fma.rn.f64 fd176, fd154, fd164, fd175; +mul.f64 fd177, fd119, fd176; +fma.rn.f64 fd178, fd174, fd113, fd177; +mul.f64 fd179, fd113, fd176; +mul.f64 fd180, fd174, fd119; +sub.f64 fd181, fd180, fd179; +mul.f64 fd182, fd153, fd174; +mul.f64 fd183, fd154, fd176; +sub.f64 fd184, fd182, fd183; +mul.f64 fd185, fd153, fd176; +fma.rn.f64 fd186, fd154, fd174, fd185; +mul.f64 fd187, fd135, fd186; +fma.rn.f64 fd188, fd184, fd129, fd187; +mul.f64 fd189, fd129, fd186; +mul.f64 fd190, fd184, fd135; +sub.f64 fd191, fd190, fd189; +ld.global.v2.f64 {fd192, fd193}, [rd6+144]; +mul.f64 fd196, fd151, fd193; +fma.rn.f64 fd197, fd192, fd145, fd196; +mul.f64 fd198, fd145, fd193; +mul.f64 fd199, fd192, fd151; +sub.f64 fd200, fd199, fd198; +mul.f64 fd201, fd153, fd192; +mul.f64 fd202, fd154, fd193; +sub.f64 fd203, fd201, fd202; +mul.f64 fd204, fd153, fd193; +fma.rn.f64 fd205, fd154, fd192, fd204; +mul.f64 fd206, fd120, fd205; +fma.rn.f64 fd207, fd203, fd114, fd206; +mul.f64 fd208, fd114, fd205; +mul.f64 fd209, fd203, fd120; +sub.f64 fd210, fd209, fd208; +mul.f64 fd211, fd153, fd203; +mul.f64 fd212, fd154, fd205; +sub.f64 fd213, fd211, fd212; +mul.f64 fd214, fd153, fd205; +fma.rn.f64 fd215, fd154, fd203, fd214; +mul.f64 fd216, fd136, fd215; +fma.rn.f64 fd217, fd213, fd130, fd216; +mul.f64 fd218, fd130, fd215; +mul.f64 fd219, fd213, fd136; +sub.f64 
fd220, fd219, fd218; +mul.f64 fd221, fd153, fd213; +mul.f64 fd222, fd154, fd215; +sub.f64 fd223, fd221, fd222; +mul.f64 fd224, fd153, fd215; +fma.rn.f64 fd225, fd154, fd213, fd224; +mul.f64 fd226, fd152, fd225; +fma.rn.f64 fd227, fd223, fd146, fd226; +mul.f64 fd228, fd146, fd225; +mul.f64 fd229, fd223, fd152; +sub.f64 fd230, fd229, fd228; +mad.lo.s32 r8, r5, 648, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 72, r8; +st.shared.f64 [r9], fd106; +st.shared.f64 [r9+8], fd158; +st.shared.f64 [r9+16], fd168; +st.shared.f64 [r9+24], fd178; +st.shared.f64 [r9+32], fd188; +st.shared.f64 [r9+40], fd197; +st.shared.f64 [r9+48], fd207; +st.shared.f64 [r9+56], fd217; +st.shared.f64 [r9+64], fd227; +barrier.sync 0; +shl.b32 r10, r7, 6; +sub.s32 r11, r9, r10; +ld.shared.f64 fd231, [r11]; +ld.shared.f64 fd232, [r11+72]; +ld.shared.f64 fd233, [r11+144]; +ld.shared.f64 fd234, [r11+216]; +ld.shared.f64 fd235, [r11+288]; +ld.shared.f64 fd236, [r11+360]; +ld.shared.f64 fd237, [r11+432]; +ld.shared.f64 fd238, [r11+504]; +ld.shared.f64 fd239, [r11+576]; +barrier.sync 0; +st.shared.f64 [r9], fd108; +st.shared.f64 [r9+8], fd161; +st.shared.f64 [r9+16], fd171; +st.shared.f64 [r9+24], fd181; +st.shared.f64 [r9+32], fd191; +st.shared.f64 [r9+40], fd200; +st.shared.f64 [r9+48], fd210; +st.shared.f64 [r9+56], fd220; +st.shared.f64 [r9+64], fd230; +barrier.sync 0; +ld.shared.f64 fd240, [r11]; +ld.shared.f64 fd241, [r11+72]; +ld.shared.f64 fd242, [r11+144]; +ld.shared.f64 fd243, [r11+216]; +ld.shared.f64 fd244, [r11+288]; +ld.shared.f64 fd245, [r11+360]; +ld.shared.f64 fd246, [r11+432]; +ld.shared.f64 fd247, [r11+504]; +ld.shared.f64 fd248, [r11+576]; +add.f64 fd249, fd234, fd237; +add.f64 fd250, fd231, fd249; +add.f64 fd251, fd243, fd246; +add.f64 fd252, fd240, fd251; +mul.f64 fd253, fd249, 0d3FE0000000000000; +sub.f64 fd254, fd231, fd253; +sub.f64 fd255, fd243, fd246; +mul.f64 fd256, fd255, 0dBFEBB67AE8584CAA; +add.f64 fd257, fd256, fd254; +sub.f64 fd258, fd254, fd256; +mul.f64 fd259, fd251, 
0d3FE0000000000000; +sub.f64 fd260, fd240, fd259; +sub.f64 fd261, fd234, fd237; +mul.f64 fd262, fd261, 0dBFEBB67AE8584CAA; +sub.f64 fd263, fd260, fd262; +add.f64 fd264, fd262, fd260; +add.f64 fd265, fd235, fd238; +add.f64 fd266, fd232, fd265; +add.f64 fd267, fd244, fd247; +add.f64 fd268, fd241, fd267; +mul.f64 fd269, fd265, 0d3FE0000000000000; +sub.f64 fd270, fd232, fd269; +sub.f64 fd271, fd244, fd247; +mul.f64 fd272, fd271, 0dBFEBB67AE8584CAA; +add.f64 fd273, fd272, fd270; +sub.f64 fd274, fd270, fd272; +mul.f64 fd275, fd267, 0d3FE0000000000000; +sub.f64 fd276, fd241, fd275; +sub.f64 fd277, fd235, fd238; +mul.f64 fd278, fd277, 0dBFEBB67AE8584CAA; +sub.f64 fd279, fd276, fd278; +add.f64 fd280, fd278, fd276; +add.f64 fd281, fd236, fd239; +add.f64 fd282, fd233, fd281; +add.f64 fd283, fd245, fd248; +add.f64 fd284, fd242, fd283; +mul.f64 fd285, fd281, 0d3FE0000000000000; +sub.f64 fd286, fd233, fd285; +sub.f64 fd287, fd245, fd248; +mul.f64 fd288, fd287, 0dBFEBB67AE8584CAA; +add.f64 fd289, fd288, fd286; +sub.f64 fd290, fd286, fd288; +mul.f64 fd291, fd283, 0d3FE0000000000000; +sub.f64 fd292, fd242, fd291; +sub.f64 fd293, fd236, fd239; +mul.f64 fd294, fd293, 0dBFEBB67AE8584CAA; +sub.f64 fd295, fd292, fd294; +add.f64 fd296, fd294, fd292; +mul.f64 fd297, fd273, 0d3FE8836FA2CF5039; +mul.f64 fd298, fd279, 0d3FE491B7523C161D; +sub.f64 fd299, fd297, fd298; +mul.f64 fd300, fd279, 0d3FE8836FA2CF5039; +fma.rn.f64 fd301, fd273, 0d3FE491B7523C161D, fd300; +mul.f64 fd302, fd289, 0d3FC63A1A7E0B738A; +mul.f64 fd303, fd295, 0d3FEF838B8C811C17; +sub.f64 fd304, fd302, fd303; +mul.f64 fd305, fd295, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd306, fd289, 0d3FEF838B8C811C17, fd305; +mul.f64 fd307, fd274, 0d3FC63A1A7E0B738A; +mul.f64 fd308, fd280, 0d3FEF838B8C811C17; +sub.f64 fd309, fd307, fd308; +mul.f64 fd310, fd280, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd311, fd274, 0d3FEF838B8C811C17, fd310; +mul.f64 fd312, fd290, 0dBFEE11F642522D1C; +mul.f64 fd313, fd296, 0d3FD5E3A8748A0BF5; +sub.f64 fd314, fd312, fd313; 
+mul.f64 fd315, fd296, 0dBFEE11F642522D1C; +fma.rn.f64 fd316, fd290, 0d3FD5E3A8748A0BF5, fd315; +add.f64 fd317, fd266, fd282; +add.f64 fd318, fd268, fd284; +mul.f64 fd319, fd317, 0d3FE0000000000000; +sub.f64 fd320, fd250, fd319; +sub.f64 fd321, fd268, fd284; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +mul.f64 fd323, fd318, 0d3FE0000000000000; +sub.f64 fd324, fd252, fd323; +sub.f64 fd325, fd266, fd282; +mul.f64 fd326, fd325, 0dBFEBB67AE8584CAA; +add.f64 fd327, fd299, fd304; +add.f64 fd328, fd301, fd306; +mul.f64 fd329, fd327, 0d3FE0000000000000; +sub.f64 fd330, fd257, fd329; +sub.f64 fd331, fd301, fd306; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +mul.f64 fd333, fd328, 0d3FE0000000000000; +sub.f64 fd334, fd263, fd333; +sub.f64 fd335, fd299, fd304; +mul.f64 fd336, fd335, 0dBFEBB67AE8584CAA; +add.f64 fd337, fd309, fd314; +add.f64 fd338, fd311, fd316; +mul.f64 fd339, fd337, 0d3FE0000000000000; +sub.f64 fd340, fd258, fd339; +sub.f64 fd341, fd311, fd316; +mul.f64 fd342, fd341, 0dBFEBB67AE8584CAA; +mul.f64 fd343, fd338, 0d3FE0000000000000; +sub.f64 fd344, fd264, fd343; +sub.f64 fd345, fd309, fd314; +mul.f64 fd346, fd345, 0dBFEBB67AE8584CAA; +add.f64 %0, fd250, fd317; +add.f64 %1, fd252, fd318; +add.f64 %3, fd263, fd328; +add.f64 %2, fd257, fd327; +add.f64 %5, fd264, fd338; +add.f64 %4, fd258, fd337; +add.f64 %6, fd322, fd320; +sub.f64 %7, fd324, fd326; +sub.f64 %9, fd334, fd336; +add.f64 %8, fd332, fd330; +sub.f64 %11, fd344, fd346; +add.f64 %10, fd342, fd340; +sub.f64 %12, fd320, fd322; +add.f64 %13, fd326, fd324; +add.f64 %15, fd336, fd334; +sub.f64 %14, fd330, fd332; +add.f64 %17, fd346, fd344; +sub.f64 %16, fd340, fd342; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "r"(smem), "l"(lut_dp_9_81), 
"d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<683, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1383>; +.reg .b64 rd<9>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 1296, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1382, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1381, %57, fd1382; +mul.f64 fd119, fd1382, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1380, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1379, %63, fd1380; +mul.f64 fd135, fd1380, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1378, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1377, %69, 
fd1378; +mul.f64 fd151, fd1378, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1376, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1376, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd1374, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1375, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1374, fd1375; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1372, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1373, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1372, fd1373; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1370, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1371, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1370, fd1371; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1369, fd1379, fd1377; +sub.f64 fd183, fd1379, fd1377; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1368, fd1381, fd1369; +mul.f64 fd187, fd1369, 0d3FE0000000000000; +sub.f64 fd188, fd1381, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1367, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1366, fd123, fd1367; +mul.f64 fd203, fd1367, 0d3FE0000000000000; +sub.f64 
fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1365, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1364, fd124, fd1365; +mul.f64 fd219, fd1365, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1361, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1359, %112, fd1361; +mul.f64 fd235, fd1361, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1356, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1354, %115, fd1356; +mul.f64 fd251, fd1356, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1351, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, 
fd262, fd264; +add.f64 fd1349, %118, fd1351; +mul.f64 fd267, fd1351, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1348, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1348, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1347, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1347, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1345, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1346, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1345, fd1346; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1343, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1344, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1343, fd1344; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1342, fd1354, fd1349; +sub.f64 fd299, fd1354, fd1349; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1341, fd1359, fd1342; +mul.f64 fd303, fd1342, 0d3FE0000000000000; +sub.f64 fd304, fd1359, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1340, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1339, fd239, fd1340; +mul.f64 fd319, 
fd1340, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1338, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1337, fd240, fd1338; +mul.f64 fd335, fd1338, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1334, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1332, %121, fd1334; +mul.f64 fd351, fd1334, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1329, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1327, %124, fd1329; +mul.f64 fd367, fd1329, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1325, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 
fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1323, %126, fd1325; +mul.f64 fd383, fd1325, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1322, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1322, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1321, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1321, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1319, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1320, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1319, fd1320; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1317, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1318, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1317, fd1318; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1316, fd1327, fd1323; +sub.f64 fd415, fd1327, fd1323; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1315, fd1332, fd1316; +mul.f64 fd419, fd1316, 0d3FE0000000000000; +sub.f64 fd420, fd1332, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1314, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 
fd1313, fd355, fd1314; +mul.f64 fd435, fd1314, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1312, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1311, fd356, fd1312; +mul.f64 fd451, fd1312, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1339, 0d3FCD84D223638000; +mul.f64 fd1310, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1310, fd458; +mul.f64 fd460, fd1339, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1308, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1309, fd1313, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd1308, fd1309; +mul.f64 fd465, fd1313, 0d3FEC98A37A9A7850; +fma.rn.f64 fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1306, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1307, fd1337, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1306, fd1307; +mul.f64 fd470, fd1337, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1304, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1305, fd1311, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1304, fd1305; +mul.f64 fd475, fd1311, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1302, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1303, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1302, fd1303; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1301, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1301, fd483; +mul.f64 fd485, fd423, 
0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1300, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1300, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1299, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1299, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1298, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1298, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1297, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1297, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1296, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1296, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1294, fd418, 0dBFEE11F642522D1C; +mul.f64 fd1295, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1294, fd1295; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1292, fd318, 0dBFADC528B5343A86; +mul.f64 fd1293, fd324, 0d3FEFF223F3635CE3; +sub.f64 fd519, fd1292, fd1293; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1290, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1291, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1290, fd1291; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1289, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1289, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, 
fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1288, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1288, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +mul.f64 fd539, fd537, 0d3FE0000000000000; +sub.f64 fd540, fd178, fd539; +add.f64 fd1287, fd1341, fd1315; +sub.f64 fd541, fd1341, fd1315; +mul.f64 fd542, fd541, 0dBFEBB67AE8584CAA; +add.f64 fd543, fd542, fd540; +sub.f64 fd544, fd540, fd542; +mul.f64 fd545, fd1287, 0d3FE0000000000000; +sub.f64 fd546, fd1368, fd545; +sub.f64 fd547, fd294, fd410; +mul.f64 fd548, fd547, 0dBFEBB67AE8584CAA; +sub.f64 fd549, fd546, fd548; +add.f64 fd550, fd548, fd546; +add.f64 fd551, fd459, fd464; +add.f64 fd552, fd194, fd551; +mul.f64 fd555, fd551, 0d3FE0000000000000; +sub.f64 fd556, fd194, fd555; +add.f64 fd1286, fd461, fd466; +sub.f64 fd557, fd461, fd466; +mul.f64 fd558, fd557, 0dBFEBB67AE8584CAA; +add.f64 fd559, fd558, fd556; +sub.f64 fd560, fd556, fd558; +add.f64 fd1285, fd1366, fd1286; +mul.f64 fd561, fd1286, 0d3FE0000000000000; +sub.f64 fd562, fd1366, fd561; +sub.f64 fd563, fd459, fd464; +mul.f64 fd564, fd563, 0dBFEBB67AE8584CAA; +sub.f64 fd565, fd562, fd564; +add.f64 fd566, fd564, fd562; +add.f64 fd567, fd469, fd474; +add.f64 fd568, fd210, fd567; +mul.f64 fd571, fd567, 0d3FE0000000000000; +sub.f64 fd572, fd210, fd571; +add.f64 fd1284, fd471, fd476; +sub.f64 fd573, fd471, fd476; +mul.f64 fd574, fd573, 0dBFEBB67AE8584CAA; +add.f64 fd575, fd574, fd572; +sub.f64 fd576, fd572, fd574; +add.f64 fd1283, fd1364, fd1284; +mul.f64 fd577, fd1284, 0d3FE0000000000000; +sub.f64 fd578, fd1364, fd577; +sub.f64 fd579, fd469, fd474; +mul.f64 fd580, fd579, 0dBFEBB67AE8584CAA; +sub.f64 fd581, fd578, fd580; +add.f64 fd582, fd580, fd578; +add.f64 fd583, fd479, fd484; +add.f64 fd584, fd185, fd583; +mul.f64 fd587, fd583, 0d3FE0000000000000; +sub.f64 fd588, fd185, fd587; +add.f64 fd1282, fd481, fd486; +sub.f64 fd589, fd481, fd486; +mul.f64 fd590, fd589, 
0dBFEBB67AE8584CAA; +add.f64 fd591, fd590, fd588; +sub.f64 fd592, fd588, fd590; +add.f64 fd1281, fd191, fd1282; +mul.f64 fd593, fd1282, 0d3FE0000000000000; +sub.f64 fd594, fd191, fd593; +sub.f64 fd595, fd479, fd484; +mul.f64 fd596, fd595, 0dBFEBB67AE8584CAA; +sub.f64 fd597, fd594, fd596; +add.f64 fd598, fd596, fd594; +add.f64 fd599, fd489, fd494; +add.f64 fd600, fd201, fd599; +mul.f64 fd603, fd599, 0d3FE0000000000000; +sub.f64 fd604, fd201, fd603; +add.f64 fd1280, fd491, fd496; +sub.f64 fd605, fd491, fd496; +mul.f64 fd606, fd605, 0dBFEBB67AE8584CAA; +add.f64 fd607, fd606, fd604; +sub.f64 fd608, fd604, fd606; +add.f64 fd1279, fd207, fd1280; +mul.f64 fd609, fd1280, 0d3FE0000000000000; +sub.f64 fd610, fd207, fd609; +sub.f64 fd611, fd489, fd494; +mul.f64 fd612, fd611, 0dBFEBB67AE8584CAA; +sub.f64 fd613, fd610, fd612; +add.f64 fd614, fd612, fd610; +add.f64 fd615, fd499, fd504; +add.f64 fd616, fd217, fd615; +mul.f64 fd619, fd615, 0d3FE0000000000000; +sub.f64 fd620, fd217, fd619; +add.f64 fd1278, fd501, fd506; +sub.f64 fd621, fd501, fd506; +mul.f64 fd622, fd621, 0dBFEBB67AE8584CAA; +add.f64 fd623, fd622, fd620; +sub.f64 fd624, fd620, fd622; +add.f64 fd1277, fd223, fd1278; +mul.f64 fd625, fd1278, 0d3FE0000000000000; +sub.f64 fd626, fd223, fd625; +sub.f64 fd627, fd499, fd504; +mul.f64 fd628, fd627, 0dBFEBB67AE8584CAA; +sub.f64 fd629, fd626, fd628; +add.f64 fd630, fd628, fd626; +add.f64 fd631, fd509, fd514; +add.f64 fd632, fd186, fd631; +mul.f64 fd635, fd631, 0d3FE0000000000000; +sub.f64 fd636, fd186, fd635; +add.f64 fd1276, fd511, fd516; +sub.f64 fd637, fd511, fd516; +mul.f64 fd638, fd637, 0dBFEBB67AE8584CAA; +add.f64 fd639, fd638, fd636; +sub.f64 fd640, fd636, fd638; +add.f64 fd1275, fd192, fd1276; +mul.f64 fd641, fd1276, 0d3FE0000000000000; +sub.f64 fd642, fd192, fd641; +sub.f64 fd643, fd509, fd514; +mul.f64 fd644, fd643, 0dBFEBB67AE8584CAA; +sub.f64 fd645, fd642, fd644; +add.f64 fd646, fd644, fd642; +add.f64 fd647, fd519, fd524; +add.f64 fd648, fd202, fd647; +mul.f64 
fd651, fd647, 0d3FE0000000000000; +sub.f64 fd652, fd202, fd651; +add.f64 fd1274, fd521, fd526; +sub.f64 fd653, fd521, fd526; +mul.f64 fd654, fd653, 0dBFEBB67AE8584CAA; +add.f64 fd655, fd654, fd652; +sub.f64 fd656, fd652, fd654; +add.f64 fd1273, fd208, fd1274; +mul.f64 fd657, fd1274, 0d3FE0000000000000; +sub.f64 fd658, fd208, fd657; +sub.f64 fd659, fd519, fd524; +mul.f64 fd660, fd659, 0dBFEBB67AE8584CAA; +sub.f64 fd661, fd658, fd660; +add.f64 fd662, fd660, fd658; +add.f64 fd663, fd529, fd534; +add.f64 fd664, fd218, fd663; +mul.f64 fd667, fd663, 0d3FE0000000000000; +sub.f64 fd668, fd218, fd667; +add.f64 fd1272, fd531, fd536; +sub.f64 fd669, fd531, fd536; +mul.f64 fd670, fd669, 0dBFEBB67AE8584CAA; +add.f64 fd671, fd670, fd668; +sub.f64 fd672, fd668, fd670; +add.f64 fd1271, fd224, fd1272; +mul.f64 fd673, fd1272, 0d3FE0000000000000; +sub.f64 fd674, fd224, fd673; +sub.f64 fd675, fd529, fd534; +mul.f64 fd676, fd675, 0dBFEBB67AE8584CAA; +sub.f64 fd677, fd674, fd676; +add.f64 fd678, fd676, fd674; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mad.lo.s32 r8, r5, 1296, r3; +mul.wide.u32 rd7, r7, 16; +mov.u64 rd8, %55; +add.s64 rd6, rd8, rd7; +ld.global.v2.f64 {fd679, fd680}, [rd6]; +mul.f64 fd683, fd1285, fd680; +mul.f64 fd685, fd679, fd1285; +mul.f64 fd1269, fd679, fd679; +mul.f64 fd1270, fd680, fd680; +sub.f64 fd688, fd1269, fd1270; +mul.f64 fd689, fd680, fd679; +fma.rn.f64 fd690, fd680, fd679, fd689; +mul.f64 fd691, fd1283, fd690; +mul.f64 fd693, fd688, fd1283; +mul.f64 fd695, fd680, fd690; +mul.f64 fd1268, fd679, fd688; +sub.f64 fd696, fd1268, fd695; +mul.f64 fd1267, fd568, fd690; +mul.f64 fd697, fd679, fd690; +fma.rn.f64 fd698, fd680, fd688, fd697; +mul.f64 fd699, fd1281, fd698; +mul.f64 fd701, fd696, fd1281; +mul.f64 fd1265, fd679, fd696; +mul.f64 fd1266, fd680, fd698; +sub.f64 fd704, fd1265, fd1266; +mul.f64 fd1264, fd584, fd698; +mul.f64 fd705, fd679, fd698; 
+fma.rn.f64 fd706, fd680, fd696, fd705; +mul.f64 fd707, fd1279, fd706; +mul.f64 fd709, fd704, fd1279; +mul.f64 fd711, fd680, fd706; +mul.f64 fd1263, fd679, fd704; +sub.f64 fd712, fd1263, fd711; +mul.f64 fd1262, fd600, fd706; +mul.f64 fd713, fd679, fd706; +fma.rn.f64 fd714, fd680, fd704, fd713; +mul.f64 fd715, fd1277, fd714; +mul.f64 fd717, fd712, fd1277; +mul.f64 fd719, fd680, fd714; +mul.f64 fd1261, fd679, fd712; +sub.f64 fd720, fd1261, fd719; +mul.f64 fd1260, fd616, fd714; +mul.f64 fd721, fd679, fd714; +fma.rn.f64 fd722, fd680, fd712, fd721; +mul.f64 fd723, fd1275, fd722; +mul.f64 fd725, fd720, fd1275; +mul.f64 fd1258, fd679, fd720; +mul.f64 fd1259, fd680, fd722; +sub.f64 fd728, fd1258, fd1259; +mul.f64 fd1257, fd632, fd722; +mul.f64 fd729, fd679, fd722; +fma.rn.f64 fd730, fd680, fd720, fd729; +mul.f64 fd731, fd1273, fd730; +mul.f64 fd733, fd728, fd1273; +mul.f64 fd735, fd680, fd730; +mul.f64 fd1256, fd679, fd728; +sub.f64 fd736, fd1256, fd735; +mul.f64 fd1255, fd648, fd730; +mul.f64 fd737, fd679, fd730; +fma.rn.f64 fd738, fd680, fd728, fd737; +mul.f64 fd739, fd1271, fd738; +mul.f64 fd741, fd736, fd1271; +mul.f64 fd743, fd680, fd738; +mul.f64 fd1254, fd679, fd736; +sub.f64 fd744, fd1254, fd743; +mul.f64 fd1253, fd664, fd738; +mul.f64 fd745, fd679, fd738; +fma.rn.f64 fd746, fd680, fd736, fd745; +mul.f64 fd747, fd549, fd746; +mul.f64 fd749, fd744, fd549; +mul.f64 fd1251, fd679, fd744; +mul.f64 fd1252, fd680, fd746; +sub.f64 fd752, fd1251, fd1252; +mul.f64 fd1250, fd543, fd746; +mul.f64 fd753, fd679, fd746; +fma.rn.f64 fd754, fd680, fd744, fd753; +mul.f64 fd755, fd565, fd754; +mul.f64 fd757, fd752, fd565; +mul.f64 fd759, fd680, fd754; +mul.f64 fd1249, fd679, fd752; +sub.f64 fd760, fd1249, fd759; +mul.f64 fd1248, fd559, fd754; +mul.f64 fd761, fd679, fd754; +fma.rn.f64 fd762, fd680, fd752, fd761; +mul.f64 fd763, fd581, fd762; +mul.f64 fd765, fd760, fd581; +mul.f64 fd1246, fd679, fd760; +mul.f64 fd1247, fd680, fd762; +sub.f64 fd768, fd1246, fd1247; +mul.f64 fd1245, 
fd575, fd762; +mul.f64 fd769, fd679, fd762; +fma.rn.f64 fd770, fd680, fd760, fd769; +mul.f64 fd771, fd597, fd770; +mul.f64 fd773, fd768, fd597; +mul.f64 fd775, fd680, fd770; +mul.f64 fd1244, fd679, fd768; +sub.f64 fd776, fd1244, fd775; +mul.f64 fd1243, fd591, fd770; +mul.f64 fd777, fd679, fd770; +fma.rn.f64 fd778, fd680, fd768, fd777; +mul.f64 fd779, fd613, fd778; +mul.f64 fd780, fd607, fd778; +mul.f64 fd781, fd776, fd613; +ld.global.v2.f64 {fd782, fd783}, [rd6+48]; +mul.f64 fd786, fd629, fd783; +mul.f64 fd788, fd782, fd629; +mul.f64 fd790, fd680, fd783; +mul.f64 fd1242, fd679, fd782; +sub.f64 fd791, fd1242, fd790; +mul.f64 fd1241, fd623, fd783; +mul.f64 fd792, fd679, fd783; +fma.rn.f64 fd793, fd680, fd782, fd792; +mul.f64 fd794, fd645, fd793; +mul.f64 fd796, fd791, fd645; +mul.f64 fd1239, fd679, fd791; +mul.f64 fd1240, fd680, fd793; +sub.f64 fd799, fd1239, fd1240; +mul.f64 fd1238, fd639, fd793; +mul.f64 fd800, fd679, fd793; +fma.rn.f64 fd801, fd680, fd791, fd800; +mul.f64 fd802, fd661, fd801; +mul.f64 fd804, fd799, fd661; +mul.f64 fd806, fd680, fd801; +mul.f64 fd1237, fd679, fd799; +sub.f64 fd807, fd1237, fd806; +mul.f64 fd1236, fd655, fd801; +mul.f64 fd808, fd679, fd801; +fma.rn.f64 fd809, fd680, fd799, fd808; +mul.f64 fd810, fd677, fd809; +mul.f64 fd812, fd807, fd677; +mul.f64 fd814, fd680, fd809; +mul.f64 fd1235, fd679, fd807; +sub.f64 fd815, fd1235, fd814; +mul.f64 fd1234, fd671, fd809; +mul.f64 fd816, fd679, fd809; +fma.rn.f64 fd817, fd680, fd807, fd816; +mul.f64 fd818, fd550, fd817; +mul.f64 fd820, fd815, fd550; +mul.f64 fd1232, fd679, fd815; +mul.f64 fd1233, fd680, fd817; +sub.f64 fd823, fd1232, fd1233; +mul.f64 fd1231, fd544, fd817; +mul.f64 fd824, fd679, fd817; +fma.rn.f64 fd825, fd680, fd815, fd824; +mul.f64 fd826, fd566, fd825; +mul.f64 fd828, fd823, fd566; +mul.f64 fd830, fd680, fd825; +mul.f64 fd1230, fd679, fd823; +sub.f64 fd831, fd1230, fd830; +mul.f64 fd1229, fd560, fd825; +mul.f64 fd832, fd679, fd825; +fma.rn.f64 fd833, fd680, fd823, fd832; 
+mul.f64 fd834, fd582, fd833; +mul.f64 fd836, fd831, fd582; +mul.f64 fd1227, fd679, fd831; +mul.f64 fd1228, fd680, fd833; +sub.f64 fd839, fd1227, fd1228; +mul.f64 fd1226, fd576, fd833; +mul.f64 fd840, fd679, fd833; +fma.rn.f64 fd841, fd680, fd831, fd840; +mul.f64 fd842, fd598, fd841; +mul.f64 fd844, fd839, fd598; +mul.f64 fd1224, fd679, fd839; +mul.f64 fd1225, fd680, fd841; +sub.f64 fd847, fd1224, fd1225; +mul.f64 fd1223, fd592, fd841; +mul.f64 fd848, fd679, fd841; +fma.rn.f64 fd849, fd680, fd839, fd848; +mul.f64 fd850, fd614, fd849; +mul.f64 fd852, fd847, fd614; +mul.f64 fd854, fd680, fd849; +mul.f64 fd1222, fd679, fd847; +sub.f64 fd855, fd1222, fd854; +mul.f64 fd1221, fd608, fd849; +mul.f64 fd856, fd679, fd849; +fma.rn.f64 fd857, fd680, fd847, fd856; +mul.f64 fd858, fd630, fd857; +mul.f64 fd860, fd855, fd630; +mul.f64 fd1219, fd679, fd855; +mul.f64 fd1220, fd680, fd857; +sub.f64 fd863, fd1219, fd1220; +mul.f64 fd1218, fd624, fd857; +mul.f64 fd864, fd679, fd857; +fma.rn.f64 fd865, fd680, fd855, fd864; +mul.f64 fd866, fd646, fd865; +mul.f64 fd868, fd863, fd646; +mul.f64 fd870, fd680, fd865; +mul.f64 fd1217, fd679, fd863; +sub.f64 fd871, fd1217, fd870; +mul.f64 fd1216, fd640, fd865; +mul.f64 fd872, fd679, fd865; +fma.rn.f64 fd873, fd680, fd863, fd872; +mul.f64 fd874, fd662, fd873; +mul.f64 fd876, fd871, fd662; +mul.f64 fd878, fd680, fd873; +mul.f64 fd1215, fd679, fd871; +sub.f64 fd879, fd1215, fd878; +mul.f64 fd1214, fd656, fd873; +mul.f64 fd880, fd679, fd873; +mul.f64 fd1213, fd552, fd680; +fma.rn.f64 fd881, fd680, fd871, fd880; +mul.f64 fd882, fd678, fd881; +mul.f64 fd883, fd672, fd881; +mul.f64 fd884, fd879, fd678; +barrier.sync 0; +mad.lo.s32 r9, r7, 432, r8; +add.f64 fd885, fd1368, fd1287; +add.f64 fd886, fd178, fd537; +st.shared.v2.f64 [r9], {fd886, fd885}; +fma.rn.f64 fd887, fd679, fd552, fd683; +sub.f64 fd888, fd685, fd1213; +st.shared.v2.f64 [r9+16], {fd887, fd888}; +fma.rn.f64 fd889, fd688, fd568, fd691; +sub.f64 fd890, fd693, fd1267; +st.shared.v2.f64 
[r9+32], {fd889, fd890}; +fma.rn.f64 fd891, fd696, fd584, fd699; +sub.f64 fd892, fd701, fd1264; +st.shared.v2.f64 [r9+48], {fd891, fd892}; +fma.rn.f64 fd893, fd704, fd600, fd707; +sub.f64 fd894, fd709, fd1262; +st.shared.v2.f64 [r9+64], {fd893, fd894}; +fma.rn.f64 fd895, fd712, fd616, fd715; +sub.f64 fd896, fd717, fd1260; +st.shared.v2.f64 [r9+80], {fd895, fd896}; +fma.rn.f64 fd897, fd720, fd632, fd723; +sub.f64 fd898, fd725, fd1257; +st.shared.v2.f64 [r9+96], {fd897, fd898}; +sub.f64 fd899, fd733, fd1255; +fma.rn.f64 fd900, fd728, fd648, fd731; +st.shared.v2.f64 [r9+112], {fd900, fd899}; +fma.rn.f64 fd901, fd736, fd664, fd739; +sub.f64 fd902, fd741, fd1253; +st.shared.v2.f64 [r9+128], {fd901, fd902}; +fma.rn.f64 fd903, fd744, fd543, fd747; +sub.f64 fd904, fd749, fd1250; +st.shared.v2.f64 [r9+144], {fd903, fd904}; +fma.rn.f64 fd905, fd752, fd559, fd755; +sub.f64 fd906, fd757, fd1248; +st.shared.v2.f64 [r9+160], {fd905, fd906}; +fma.rn.f64 fd907, fd760, fd575, fd763; +sub.f64 fd908, fd765, fd1245; +st.shared.v2.f64 [r9+176], {fd907, fd908}; +fma.rn.f64 fd909, fd768, fd591, fd771; +sub.f64 fd910, fd773, fd1243; +st.shared.v2.f64 [r9+192], {fd909, fd910}; +fma.rn.f64 fd911, fd776, fd607, fd779; +sub.f64 fd912, fd781, fd780; +st.shared.v2.f64 [r9+208], {fd911, fd912}; +fma.rn.f64 fd913, fd782, fd623, fd786; +sub.f64 fd914, fd788, fd1241; +st.shared.v2.f64 [r9+224], {fd913, fd914}; +fma.rn.f64 fd915, fd791, fd639, fd794; +sub.f64 fd916, fd796, fd1238; +st.shared.v2.f64 [r9+240], {fd915, fd916}; +fma.rn.f64 fd917, fd799, fd655, fd802; +sub.f64 fd918, fd804, fd1236; +st.shared.v2.f64 [r9+256], {fd917, fd918}; +fma.rn.f64 fd919, fd807, fd671, fd810; +sub.f64 fd920, fd812, fd1234; +st.shared.v2.f64 [r9+272], {fd919, fd920}; +fma.rn.f64 fd921, fd815, fd544, fd818; +sub.f64 fd922, fd820, fd1231; +st.shared.v2.f64 [r9+288], {fd921, fd922}; +fma.rn.f64 fd923, fd823, fd560, fd826; +sub.f64 fd924, fd828, fd1229; +st.shared.v2.f64 [r9+304], {fd923, fd924}; +sub.f64 fd925, fd836, 
fd1226; +fma.rn.f64 fd926, fd831, fd576, fd834; +st.shared.v2.f64 [r9+320], {fd926, fd925}; +fma.rn.f64 fd927, fd839, fd592, fd842; +sub.f64 fd928, fd844, fd1223; +st.shared.v2.f64 [r9+336], {fd927, fd928}; +fma.rn.f64 fd929, fd847, fd608, fd850; +sub.f64 fd930, fd852, fd1221; +st.shared.v2.f64 [r9+352], {fd929, fd930}; +fma.rn.f64 fd931, fd855, fd624, fd858; +sub.f64 fd932, fd860, fd1218; +st.shared.v2.f64 [r9+368], {fd931, fd932}; +fma.rn.f64 fd933, fd863, fd640, fd866; +sub.f64 fd934, fd868, fd1216; +st.shared.v2.f64 [r9+384], {fd933, fd934}; +fma.rn.f64 fd935, fd871, fd656, fd874; +sub.f64 fd936, fd876, fd1214; +st.shared.v2.f64 [r9+400], {fd935, fd936}; +fma.rn.f64 fd937, fd879, fd672, fd882; +sub.f64 fd938, fd884, fd883; +st.shared.v2.f64 [r9+416], {fd937, fd938}; +barrier.sync 0; +mad.lo.s32 r10, r7, -416, r9; +ld.shared.v2.f64 {fd939, fd940}, [r10]; +ld.shared.v2.f64 {fd943, fd944}, [r10+48]; +ld.shared.v2.f64 {fd947, fd948}, [r10+96]; +ld.shared.v2.f64 {fd951, fd952}, [r10+144]; +ld.shared.v2.f64 {fd955, fd956}, [r10+192]; +ld.shared.v2.f64 {fd959, fd960}, [r10+240]; +ld.shared.v2.f64 {fd963, fd964}, [r10+288]; +ld.shared.v2.f64 {fd967, fd968}, [r10+336]; +ld.shared.v2.f64 {fd971, fd972}, [r10+384]; +ld.shared.v2.f64 {fd975, fd976}, [r10+432]; +ld.shared.v2.f64 {fd979, fd980}, [r10+480]; +ld.shared.v2.f64 {fd983, fd984}, [r10+528]; +ld.shared.v2.f64 {fd987, fd988}, [r10+576]; +ld.shared.v2.f64 {fd991, fd992}, [r10+624]; +ld.shared.v2.f64 {fd995, fd996}, [r10+672]; +ld.shared.v2.f64 {fd999, fd1000}, [r10+720]; +ld.shared.v2.f64 {fd1003, fd1004}, [r10+768]; +ld.shared.v2.f64 {fd1007, fd1008}, [r10+816]; +ld.shared.v2.f64 {fd1011, fd1012}, [r10+864]; +ld.shared.v2.f64 {fd1015, fd1016}, [r10+912]; +ld.shared.v2.f64 {fd1019, fd1020}, [r10+960]; +ld.shared.v2.f64 {fd1023, fd1024}, [r10+1008]; +ld.shared.v2.f64 {fd1027, fd1028}, [r10+1056]; +ld.shared.v2.f64 {fd1031, fd1032}, [r10+1104]; +ld.shared.v2.f64 {fd1035, fd1036}, [r10+1152]; +ld.shared.v2.f64 {fd1039, 
fd1040}, [r10+1200]; +ld.shared.v2.f64 {fd1043, fd1044}, [r10+1248]; +add.f64 fd1047, fd975, fd1011; +mul.f64 fd1049, fd1047, 0d3FE0000000000000; +sub.f64 fd1050, fd939, fd1049; +add.f64 fd1212, fd976, fd1012; +sub.f64 fd1051, fd976, fd1012; +mul.f64 fd1052, fd1051, 0dBFEBB67AE8584CAA; +mul.f64 fd1053, fd1212, 0d3FE0000000000000; +sub.f64 fd1054, fd940, fd1053; +sub.f64 fd1055, fd975, fd1011; +mul.f64 fd1056, fd1055, 0dBFEBB67AE8584CAA; +add.f64 fd1057, fd979, fd1015; +mul.f64 fd1059, fd1057, 0d3FE0000000000000; +sub.f64 fd1060, fd943, fd1059; +add.f64 fd1211, fd980, fd1016; +sub.f64 fd1061, fd980, fd1016; +mul.f64 fd1062, fd1061, 0dBFEBB67AE8584CAA; +mul.f64 fd1063, fd1211, 0d3FE0000000000000; +sub.f64 fd1064, fd944, fd1063; +sub.f64 fd1065, fd979, fd1015; +mul.f64 fd1066, fd1065, 0dBFEBB67AE8584CAA; +add.f64 fd1067, fd983, fd1019; +mul.f64 fd1069, fd1067, 0d3FE0000000000000; +sub.f64 fd1070, fd947, fd1069; +add.f64 fd1210, fd984, fd1020; +sub.f64 fd1071, fd984, fd1020; +mul.f64 fd1072, fd1071, 0dBFEBB67AE8584CAA; +mul.f64 fd1073, fd1210, 0d3FE0000000000000; +sub.f64 fd1074, fd948, fd1073; +sub.f64 fd1075, fd983, fd1019; +mul.f64 fd1076, fd1075, 0dBFEBB67AE8584CAA; +add.f64 fd1077, fd987, fd1023; +mul.f64 fd1079, fd1077, 0d3FE0000000000000; +sub.f64 fd1080, fd951, fd1079; +add.f64 fd1209, fd988, fd1024; +sub.f64 fd1081, fd988, fd1024; +mul.f64 fd1082, fd1081, 0dBFEBB67AE8584CAA; +mul.f64 fd1083, fd1209, 0d3FE0000000000000; +sub.f64 fd1084, fd952, fd1083; +sub.f64 fd1085, fd987, fd1023; +mul.f64 fd1086, fd1085, 0dBFEBB67AE8584CAA; +add.f64 fd1087, fd991, fd1027; +mul.f64 fd1089, fd1087, 0d3FE0000000000000; +sub.f64 fd1090, fd955, fd1089; +add.f64 fd1208, fd992, fd1028; +sub.f64 fd1091, fd992, fd1028; +mul.f64 fd1092, fd1091, 0dBFEBB67AE8584CAA; +mul.f64 fd1093, fd1208, 0d3FE0000000000000; +sub.f64 fd1094, fd956, fd1093; +sub.f64 fd1095, fd991, fd1027; +mul.f64 fd1096, fd1095, 0dBFEBB67AE8584CAA; +add.f64 fd1097, fd995, fd1031; +mul.f64 fd1099, fd1097, 
0d3FE0000000000000; +sub.f64 fd1100, fd959, fd1099; +add.f64 fd1207, fd996, fd1032; +sub.f64 fd1101, fd996, fd1032; +mul.f64 fd1102, fd1101, 0dBFEBB67AE8584CAA; +mul.f64 fd1103, fd1207, 0d3FE0000000000000; +sub.f64 fd1104, fd960, fd1103; +sub.f64 fd1105, fd995, fd1031; +mul.f64 fd1106, fd1105, 0dBFEBB67AE8584CAA; +add.f64 fd1107, fd999, fd1035; +mul.f64 fd1109, fd1107, 0d3FE0000000000000; +sub.f64 fd1110, fd963, fd1109; +add.f64 fd1206, fd1000, fd1036; +sub.f64 fd1111, fd1000, fd1036; +mul.f64 fd1112, fd1111, 0dBFEBB67AE8584CAA; +mul.f64 fd1113, fd1206, 0d3FE0000000000000; +sub.f64 fd1114, fd964, fd1113; +sub.f64 fd1115, fd999, fd1035; +mul.f64 fd1116, fd1115, 0dBFEBB67AE8584CAA; +add.f64 fd1117, fd1003, fd1039; +mul.f64 fd1119, fd1117, 0d3FE0000000000000; +sub.f64 fd1120, fd967, fd1119; +add.f64 fd1205, fd1004, fd1040; +sub.f64 fd1121, fd1004, fd1040; +mul.f64 fd1122, fd1121, 0dBFEBB67AE8584CAA; +mul.f64 fd1123, fd1205, 0d3FE0000000000000; +sub.f64 fd1124, fd968, fd1123; +sub.f64 fd1125, fd1003, fd1039; +mul.f64 fd1126, fd1125, 0dBFEBB67AE8584CAA; +add.f64 fd1127, fd1007, fd1043; +mul.f64 fd1129, fd1127, 0d3FE0000000000000; +sub.f64 fd1130, fd971, fd1129; +add.f64 fd1204, fd1008, fd1044; +sub.f64 fd1131, fd1008, fd1044; +mul.f64 fd1132, fd1131, 0dBFEBB67AE8584CAA; +mul.f64 fd1133, fd1204, 0d3FE0000000000000; +sub.f64 fd1134, fd972, fd1133; +sub.f64 fd1135, fd1007, fd1043; +mul.f64 fd1136, fd1135, 0dBFEBB67AE8584CAA; +add.f64 %1, fd940, fd1212; +add.f64 %0, fd939, fd1047; +add.f64 %3, fd944, fd1211; +add.f64 %2, fd943, fd1057; +add.f64 %5, fd948, fd1210; +add.f64 %4, fd947, fd1067; +add.f64 %7, fd952, fd1209; +add.f64 %6, fd951, fd1077; +add.f64 %9, fd956, fd1208; +add.f64 %8, fd955, fd1087; +add.f64 %11, fd960, fd1207; +add.f64 %10, fd959, fd1097; +add.f64 %13, fd964, fd1206; +add.f64 %12, fd963, fd1107; +add.f64 %15, fd968, fd1205; +add.f64 %14, fd967, fd1117; +add.f64 %17, fd972, fd1204; +add.f64 %16, fd971, fd1127; +sub.f64 %19, fd1054, fd1056; +add.f64 %18, 
fd1052, fd1050; +add.f64 %20, fd1062, fd1060; +sub.f64 %21, fd1064, fd1066; +add.f64 %22, fd1072, fd1070; +sub.f64 %23, fd1074, fd1076; +add.f64 %24, fd1082, fd1080; +sub.f64 %25, fd1084, fd1086; +sub.f64 %27, fd1094, fd1096; +add.f64 %26, fd1092, fd1090; +sub.f64 %29, fd1104, fd1106; +add.f64 %28, fd1102, fd1100; +sub.f64 %31, fd1114, fd1116; +add.f64 %30, fd1112, fd1110; +add.f64 %32, fd1122, fd1120; +sub.f64 %33, fd1124, fd1126; +add.f64 %34, fd1132, fd1130; +sub.f64 %35, fd1134, fd1136; +add.f64 %37, fd1056, fd1054; +sub.f64 %36, fd1050, fd1052; +add.f64 %39, fd1066, fd1064; +sub.f64 %38, fd1060, fd1062; +add.f64 %41, fd1076, fd1074; +sub.f64 %40, fd1070, fd1072; +add.f64 %43, fd1086, fd1084; +sub.f64 %42, fd1080, fd1082; +add.f64 %45, fd1096, fd1094; +sub.f64 %44, fd1090, fd1092; +add.f64 %47, fd1106, fd1104; +sub.f64 %46, fd1100, fd1102; +add.f64 %49, fd1116, fd1114; +sub.f64 %48, fd1110, fd1112; +add.f64 %51, fd1126, fd1124; +sub.f64 %50, fd1120, fd1122; +add.f64 %53, fd1136, fd1134; +sub.f64 %52, fd1130, fd1132; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), "=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): 
"r"(smem), "l"(lut_dp_27_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<682, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<1313>; +.reg .b64 rd<8>; +mov.u32 r12, %tid.y; +mov.u32 r13, %54; +mad.lo.s32 r3, r12, 648, r13; +add.f64 fd109, %74, %92; +add.f64 fd110, %56, fd109; +mul.f64 fd113, fd109, 0d3FE0000000000000; +sub.f64 fd114, %56, fd113; +add.f64 fd1304, %75, %93; +sub.f64 fd115, %75, %93; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +add.f64 fd117, fd116, fd114; +sub.f64 fd118, fd114, fd116; +add.f64 fd1303, %57, fd1304; +mul.f64 fd119, fd1304, 0d3FE0000000000000; +sub.f64 fd120, %57, fd119; +sub.f64 fd121, %74, %92; +mul.f64 fd122, fd121, 0dBFEBB67AE8584CAA; +sub.f64 fd123, fd120, fd122; +add.f64 
fd124, fd122, fd120; +add.f64 fd125, %80, %98; +add.f64 fd126, %62, fd125; +mul.f64 fd129, fd125, 0d3FE0000000000000; +sub.f64 fd130, %62, fd129; +add.f64 fd1302, %81, %99; +sub.f64 fd131, %81, %99; +mul.f64 fd132, fd131, 0dBFEBB67AE8584CAA; +add.f64 fd133, fd132, fd130; +sub.f64 fd134, fd130, fd132; +add.f64 fd1301, %63, fd1302; +mul.f64 fd135, fd1302, 0d3FE0000000000000; +sub.f64 fd136, %63, fd135; +sub.f64 fd137, %80, %98; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +sub.f64 fd139, fd136, fd138; +add.f64 fd140, fd138, fd136; +add.f64 fd141, %86, %104; +add.f64 fd142, %68, fd141; +mul.f64 fd145, fd141, 0d3FE0000000000000; +sub.f64 fd146, %68, fd145; +add.f64 fd1300, %87, %105; +sub.f64 fd147, %87, %105; +mul.f64 fd148, fd147, 0dBFEBB67AE8584CAA; +add.f64 fd149, fd148, fd146; +sub.f64 fd150, fd146, fd148; +add.f64 fd1299, %69, fd1300; +mul.f64 fd151, fd1300, 0d3FE0000000000000; +sub.f64 fd152, %69, fd151; +sub.f64 fd153, %86, %104; +mul.f64 fd154, fd153, 0dBFEBB67AE8584CAA; +sub.f64 fd155, fd152, fd154; +add.f64 fd156, fd154, fd152; +mul.f64 fd158, fd139, 0d3FE491B7523C161D; +mul.f64 fd1298, fd133, 0d3FE8836FA2CF5039; +sub.f64 fd159, fd1298, fd158; +mul.f64 fd160, fd139, 0d3FE8836FA2CF5039; +fma.rn.f64 fd161, fd133, 0d3FE491B7523C161D, fd160; +mul.f64 fd1296, fd149, 0d3FC63A1A7E0B738A; +mul.f64 fd1297, fd155, 0d3FEF838B8C811C17; +sub.f64 fd164, fd1296, fd1297; +mul.f64 fd165, fd155, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd166, fd149, 0d3FEF838B8C811C17, fd165; +mul.f64 fd1294, fd134, 0d3FC63A1A7E0B738A; +mul.f64 fd1295, fd140, 0d3FEF838B8C811C17; +sub.f64 fd169, fd1294, fd1295; +mul.f64 fd170, fd140, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd171, fd134, 0d3FEF838B8C811C17, fd170; +mul.f64 fd1292, fd150, 0dBFEE11F642522D1C; +mul.f64 fd1293, fd156, 0d3FD5E3A8748A0BF5; +sub.f64 fd174, fd1292, fd1293; +mul.f64 fd175, fd156, 0dBFEE11F642522D1C; +fma.rn.f64 fd176, fd150, 0d3FD5E3A8748A0BF5, fd175; +add.f64 fd177, fd126, fd142; +add.f64 fd178, fd110, fd177; +mul.f64 fd181, fd177, 
0d3FE0000000000000; +sub.f64 fd182, fd110, fd181; +add.f64 fd1291, fd1301, fd1299; +sub.f64 fd183, fd1301, fd1299; +mul.f64 fd184, fd183, 0dBFEBB67AE8584CAA; +add.f64 fd185, fd184, fd182; +sub.f64 fd186, fd182, fd184; +add.f64 fd1290, fd1303, fd1291; +mul.f64 fd187, fd1291, 0d3FE0000000000000; +sub.f64 fd188, fd1303, fd187; +sub.f64 fd189, fd126, fd142; +mul.f64 fd190, fd189, 0dBFEBB67AE8584CAA; +sub.f64 fd191, fd188, fd190; +add.f64 fd192, fd190, fd188; +add.f64 fd193, fd159, fd164; +add.f64 fd194, fd117, fd193; +mul.f64 fd197, fd193, 0d3FE0000000000000; +sub.f64 fd198, fd117, fd197; +add.f64 fd1289, fd161, fd166; +sub.f64 fd199, fd161, fd166; +mul.f64 fd200, fd199, 0dBFEBB67AE8584CAA; +add.f64 fd201, fd200, fd198; +sub.f64 fd202, fd198, fd200; +add.f64 fd1288, fd123, fd1289; +mul.f64 fd203, fd1289, 0d3FE0000000000000; +sub.f64 fd204, fd123, fd203; +sub.f64 fd205, fd159, fd164; +mul.f64 fd206, fd205, 0dBFEBB67AE8584CAA; +sub.f64 fd207, fd204, fd206; +add.f64 fd208, fd206, fd204; +add.f64 fd209, fd169, fd174; +add.f64 fd210, fd118, fd209; +mul.f64 fd213, fd209, 0d3FE0000000000000; +sub.f64 fd214, fd118, fd213; +add.f64 fd1287, fd171, fd176; +sub.f64 fd215, fd171, fd176; +mul.f64 fd216, fd215, 0dBFEBB67AE8584CAA; +add.f64 fd217, fd216, fd214; +sub.f64 fd218, fd214, fd216; +add.f64 fd1286, fd124, fd1287; +mul.f64 fd219, fd1287, 0d3FE0000000000000; +sub.f64 fd220, fd124, fd219; +sub.f64 fd221, fd169, fd174; +mul.f64 fd222, fd221, 0dBFEBB67AE8584CAA; +sub.f64 fd223, fd220, fd222; +add.f64 fd224, fd222, fd220; +add.f64 fd225, %76, %94; +add.f64 fd226, %58, fd225; +mul.f64 fd229, fd225, 0d3FE0000000000000; +sub.f64 fd230, %58, fd229; +add.f64 fd1283, %111, %110; +sub.f64 fd231, %111, %110; +mul.f64 fd232, fd231, 0dBFEBB67AE8584CAA; +add.f64 fd233, fd232, fd230; +sub.f64 fd234, fd230, fd232; +add.f64 fd1281, %112, fd1283; +mul.f64 fd235, fd1283, 0d3FE0000000000000; +sub.f64 fd236, %112, fd235; +sub.f64 fd237, %76, %94; +mul.f64 fd238, fd237, 0dBFEBB67AE8584CAA; +sub.f64 
fd239, fd236, fd238; +add.f64 fd240, fd238, fd236; +add.f64 fd241, %82, %100; +add.f64 fd242, %64, fd241; +mul.f64 fd245, fd241, 0d3FE0000000000000; +sub.f64 fd246, %64, fd245; +add.f64 fd1278, %114, %113; +sub.f64 fd247, %114, %113; +mul.f64 fd248, fd247, 0dBFEBB67AE8584CAA; +add.f64 fd249, fd248, fd246; +sub.f64 fd250, fd246, fd248; +add.f64 fd1276, %115, fd1278; +mul.f64 fd251, fd1278, 0d3FE0000000000000; +sub.f64 fd252, %115, fd251; +sub.f64 fd253, %82, %100; +mul.f64 fd254, fd253, 0dBFEBB67AE8584CAA; +sub.f64 fd255, fd252, fd254; +add.f64 fd256, fd254, fd252; +add.f64 fd257, %88, %106; +add.f64 fd258, %70, fd257; +mul.f64 fd261, fd257, 0d3FE0000000000000; +sub.f64 fd262, %70, fd261; +add.f64 fd1273, %116, %117; +sub.f64 fd263, %116, %117; +mul.f64 fd264, fd263, 0dBFEBB67AE8584CAA; +add.f64 fd265, fd264, fd262; +sub.f64 fd266, fd262, fd264; +add.f64 fd1271, %118, fd1273; +mul.f64 fd267, fd1273, 0d3FE0000000000000; +sub.f64 fd268, %118, fd267; +sub.f64 fd269, %88, %106; +mul.f64 fd270, fd269, 0dBFEBB67AE8584CAA; +sub.f64 fd271, fd268, fd270; +add.f64 fd272, fd270, fd268; +mul.f64 fd274, fd255, 0d3FE491B7523C161D; +mul.f64 fd1270, fd249, 0d3FE8836FA2CF5039; +sub.f64 fd275, fd1270, fd274; +mul.f64 fd276, fd255, 0d3FE8836FA2CF5039; +fma.rn.f64 fd277, fd249, 0d3FE491B7523C161D, fd276; +mul.f64 fd279, fd271, 0d3FEF838B8C811C17; +mul.f64 fd1269, fd265, 0d3FC63A1A7E0B738A; +sub.f64 fd280, fd1269, fd279; +mul.f64 fd281, fd271, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd282, fd265, 0d3FEF838B8C811C17, fd281; +mul.f64 fd1267, fd250, 0d3FC63A1A7E0B738A; +mul.f64 fd1268, fd256, 0d3FEF838B8C811C17; +sub.f64 fd285, fd1267, fd1268; +mul.f64 fd286, fd256, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd287, fd250, 0d3FEF838B8C811C17, fd286; +mul.f64 fd1265, fd266, 0dBFEE11F642522D1C; +mul.f64 fd1266, fd272, 0d3FD5E3A8748A0BF5; +sub.f64 fd290, fd1265, fd1266; +mul.f64 fd291, fd272, 0dBFEE11F642522D1C; +fma.rn.f64 fd292, fd266, 0d3FD5E3A8748A0BF5, fd291; +add.f64 fd293, fd242, fd258; +add.f64 fd294, 
fd226, fd293; +mul.f64 fd297, fd293, 0d3FE0000000000000; +sub.f64 fd298, fd226, fd297; +add.f64 fd1264, fd1276, fd1271; +sub.f64 fd299, fd1276, fd1271; +mul.f64 fd300, fd299, 0dBFEBB67AE8584CAA; +add.f64 fd301, fd300, fd298; +sub.f64 fd302, fd298, fd300; +add.f64 fd1263, fd1281, fd1264; +mul.f64 fd303, fd1264, 0d3FE0000000000000; +sub.f64 fd304, fd1281, fd303; +sub.f64 fd305, fd242, fd258; +mul.f64 fd306, fd305, 0dBFEBB67AE8584CAA; +sub.f64 fd307, fd304, fd306; +add.f64 fd308, fd306, fd304; +add.f64 fd309, fd275, fd280; +add.f64 fd310, fd233, fd309; +mul.f64 fd313, fd309, 0d3FE0000000000000; +sub.f64 fd314, fd233, fd313; +add.f64 fd1262, fd277, fd282; +sub.f64 fd315, fd277, fd282; +mul.f64 fd316, fd315, 0dBFEBB67AE8584CAA; +add.f64 fd317, fd316, fd314; +sub.f64 fd318, fd314, fd316; +add.f64 fd1261, fd239, fd1262; +mul.f64 fd319, fd1262, 0d3FE0000000000000; +sub.f64 fd320, fd239, fd319; +sub.f64 fd321, fd275, fd280; +mul.f64 fd322, fd321, 0dBFEBB67AE8584CAA; +sub.f64 fd323, fd320, fd322; +add.f64 fd324, fd322, fd320; +add.f64 fd325, fd285, fd290; +add.f64 fd326, fd234, fd325; +mul.f64 fd329, fd325, 0d3FE0000000000000; +sub.f64 fd330, fd234, fd329; +add.f64 fd1260, fd287, fd292; +sub.f64 fd331, fd287, fd292; +mul.f64 fd332, fd331, 0dBFEBB67AE8584CAA; +add.f64 fd333, fd332, fd330; +sub.f64 fd334, fd330, fd332; +add.f64 fd1259, fd240, fd1260; +mul.f64 fd335, fd1260, 0d3FE0000000000000; +sub.f64 fd336, fd240, fd335; +sub.f64 fd337, fd285, fd290; +mul.f64 fd338, fd337, 0dBFEBB67AE8584CAA; +sub.f64 fd339, fd336, fd338; +add.f64 fd340, fd338, fd336; +add.f64 fd341, %78, %96; +add.f64 fd342, %60, fd341; +mul.f64 fd345, fd341, 0d3FE0000000000000; +sub.f64 fd346, %60, fd345; +add.f64 fd1256, %119, %120; +sub.f64 fd347, %119, %120; +mul.f64 fd348, fd347, 0dBFEBB67AE8584CAA; +add.f64 fd349, fd348, fd346; +sub.f64 fd350, fd346, fd348; +add.f64 fd1254, %121, fd1256; +mul.f64 fd351, fd1256, 0d3FE0000000000000; +sub.f64 fd352, %121, fd351; +sub.f64 fd353, %78, %96; +mul.f64 fd354, 
fd353, 0dBFEBB67AE8584CAA; +sub.f64 fd355, fd352, fd354; +add.f64 fd356, fd354, fd352; +add.f64 fd357, %84, %102; +add.f64 fd358, %66, fd357; +mul.f64 fd361, fd357, 0d3FE0000000000000; +sub.f64 fd362, %66, fd361; +add.f64 fd1251, %123, %122; +sub.f64 fd363, %123, %122; +mul.f64 fd364, fd363, 0dBFEBB67AE8584CAA; +add.f64 fd365, fd364, fd362; +sub.f64 fd366, fd362, fd364; +add.f64 fd1249, %124, fd1251; +mul.f64 fd367, fd1251, 0d3FE0000000000000; +sub.f64 fd368, %124, fd367; +sub.f64 fd369, %84, %102; +mul.f64 fd370, fd369, 0dBFEBB67AE8584CAA; +sub.f64 fd371, fd368, fd370; +add.f64 fd372, fd370, fd368; +add.f64 fd373, %90, %108; +add.f64 fd374, %72, fd373; +mul.f64 fd377, fd373, 0d3FE0000000000000; +sub.f64 fd378, %72, fd377; +add.f64 fd1247, %125, %109; +sub.f64 fd379, %125, %109; +mul.f64 fd380, fd379, 0dBFEBB67AE8584CAA; +add.f64 fd381, fd380, fd378; +sub.f64 fd382, fd378, fd380; +add.f64 fd1245, %126, fd1247; +mul.f64 fd383, fd1247, 0d3FE0000000000000; +sub.f64 fd384, %126, fd383; +sub.f64 fd385, %90, %108; +mul.f64 fd386, fd385, 0dBFEBB67AE8584CAA; +sub.f64 fd387, fd384, fd386; +add.f64 fd388, fd386, fd384; +mul.f64 fd390, fd371, 0d3FE491B7523C161D; +mul.f64 fd1244, fd365, 0d3FE8836FA2CF5039; +sub.f64 fd391, fd1244, fd390; +mul.f64 fd392, fd371, 0d3FE8836FA2CF5039; +fma.rn.f64 fd393, fd365, 0d3FE491B7523C161D, fd392; +mul.f64 fd395, fd387, 0d3FEF838B8C811C17; +mul.f64 fd1243, fd381, 0d3FC63A1A7E0B738A; +sub.f64 fd396, fd1243, fd395; +mul.f64 fd397, fd387, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd398, fd381, 0d3FEF838B8C811C17, fd397; +mul.f64 fd1241, fd366, 0d3FC63A1A7E0B738A; +mul.f64 fd1242, fd372, 0d3FEF838B8C811C17; +sub.f64 fd401, fd1241, fd1242; +mul.f64 fd402, fd372, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd403, fd366, 0d3FEF838B8C811C17, fd402; +mul.f64 fd1239, fd382, 0dBFEE11F642522D1C; +mul.f64 fd1240, fd388, 0d3FD5E3A8748A0BF5; +sub.f64 fd406, fd1239, fd1240; +mul.f64 fd407, fd388, 0dBFEE11F642522D1C; +fma.rn.f64 fd408, fd382, 0d3FD5E3A8748A0BF5, fd407; +add.f64 
fd409, fd358, fd374; +add.f64 fd410, fd342, fd409; +mul.f64 fd413, fd409, 0d3FE0000000000000; +sub.f64 fd414, fd342, fd413; +add.f64 fd1238, fd1249, fd1245; +sub.f64 fd415, fd1249, fd1245; +mul.f64 fd416, fd415, 0dBFEBB67AE8584CAA; +add.f64 fd417, fd416, fd414; +sub.f64 fd418, fd414, fd416; +add.f64 fd1237, fd1254, fd1238; +mul.f64 fd419, fd1238, 0d3FE0000000000000; +sub.f64 fd420, fd1254, fd419; +sub.f64 fd421, fd358, fd374; +mul.f64 fd422, fd421, 0dBFEBB67AE8584CAA; +sub.f64 fd423, fd420, fd422; +add.f64 fd424, fd422, fd420; +add.f64 fd425, fd391, fd396; +add.f64 fd426, fd349, fd425; +mul.f64 fd429, fd425, 0d3FE0000000000000; +sub.f64 fd430, fd349, fd429; +add.f64 fd1236, fd393, fd398; +sub.f64 fd431, fd393, fd398; +mul.f64 fd432, fd431, 0dBFEBB67AE8584CAA; +add.f64 fd433, fd432, fd430; +sub.f64 fd434, fd430, fd432; +add.f64 fd1235, fd355, fd1236; +mul.f64 fd435, fd1236, 0d3FE0000000000000; +sub.f64 fd436, fd355, fd435; +sub.f64 fd437, fd391, fd396; +mul.f64 fd438, fd437, 0dBFEBB67AE8584CAA; +sub.f64 fd439, fd436, fd438; +add.f64 fd440, fd438, fd436; +add.f64 fd441, fd401, fd406; +add.f64 fd442, fd350, fd441; +mul.f64 fd445, fd441, 0d3FE0000000000000; +sub.f64 fd446, fd350, fd445; +add.f64 fd1234, fd403, fd408; +sub.f64 fd447, fd403, fd408; +mul.f64 fd448, fd447, 0dBFEBB67AE8584CAA; +add.f64 fd449, fd448, fd446; +sub.f64 fd450, fd446, fd448; +add.f64 fd1233, fd356, fd1234; +mul.f64 fd451, fd1234, 0d3FE0000000000000; +sub.f64 fd452, fd356, fd451; +sub.f64 fd453, fd401, fd406; +mul.f64 fd454, fd453, 0dBFEBB67AE8584CAA; +sub.f64 fd455, fd452, fd454; +add.f64 fd456, fd454, fd452; +mul.f64 fd458, fd1261, 0d3FCD84D223638000; +mul.f64 fd1232, fd310, 0d3FEF232EFF15C9E6; +sub.f64 fd459, fd1232, fd458; +mul.f64 fd460, fd1261, 0d3FEF232EFF15C9E6; +fma.rn.f64 fd461, fd310, 0d3FCD84D223638000, fd460; +mul.f64 fd1230, fd426, 0d3FEC98A37A9A7850; +mul.f64 fd1231, fd1235, 0d3FDCB920325BAFA6; +sub.f64 fd464, fd1230, fd1231; +mul.f64 fd465, fd1235, 0d3FEC98A37A9A7850; +fma.rn.f64 
fd466, fd426, 0d3FDCB920325BAFA6, fd465; +mul.f64 fd1228, fd326, 0d3FEC98A37A9A7850; +mul.f64 fd1229, fd1259, 0d3FDCB920325BAFA6; +sub.f64 fd469, fd1228, fd1229; +mul.f64 fd470, fd1259, 0d3FEC98A37A9A7850; +fma.rn.f64 fd471, fd326, 0d3FDCB920325BAFA6, fd470; +mul.f64 fd1226, fd442, 0d3FE31BEC55BC71BC; +mul.f64 fd1227, fd1233, 0d3FE9AAFE4207DF5F; +sub.f64 fd474, fd1226, fd1227; +mul.f64 fd475, fd1233, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd476, fd442, 0d3FE9AAFE4207DF5F, fd475; +mul.f64 fd1224, fd301, 0d3FE8836FA2CF5039; +mul.f64 fd1225, fd307, 0d3FE491B7523C161D; +sub.f64 fd479, fd1224, fd1225; +mul.f64 fd480, fd307, 0d3FE8836FA2CF5039; +fma.rn.f64 fd481, fd301, 0d3FE491B7523C161D, fd480; +mul.f64 fd483, fd423, 0d3FEF838B8C811C17; +mul.f64 fd1223, fd417, 0d3FC63A1A7E0B738A; +sub.f64 fd484, fd1223, fd483; +mul.f64 fd485, fd423, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd486, fd417, 0d3FEF838B8C811C17, fd485; +mul.f64 fd488, fd323, 0d3FE9AAFE4207DF5F; +mul.f64 fd1222, fd317, 0d3FE31BEC55BC71BC; +sub.f64 fd489, fd1222, fd488; +mul.f64 fd490, fd323, 0d3FE31BEC55BC71BC; +fma.rn.f64 fd491, fd317, 0d3FE9AAFE4207DF5F, fd490; +mul.f64 fd493, fd439, 0d3FEEA7D99F29CADE; +mul.f64 fd1221, fd433, 0dBFD25AFBF23865BF; +sub.f64 fd494, fd1221, fd493; +mul.f64 fd495, fd439, 0dBFD25AFBF23865BF; +fma.rn.f64 fd496, fd433, 0d3FEEA7D99F29CADE, fd495; +mul.f64 fd498, fd339, 0d3FED6206BEB6C24B; +mul.f64 fd1220, fd333, 0d3FD9595EF26FB670; +sub.f64 fd499, fd1220, fd498; +mul.f64 fd500, fd339, 0d3FD9595EF26FB670; +fma.rn.f64 fd501, fd333, 0d3FED6206BEB6C24B, fd500; +mul.f64 fd503, fd455, 0d3FE746A51650EADE; +mul.f64 fd1219, fd449, 0dBFE5F5B105F99707; +sub.f64 fd504, fd1219, fd503; +mul.f64 fd505, fd455, 0dBFE5F5B105F99707; +fma.rn.f64 fd506, fd449, 0d3FE746A51650EADE, fd505; +mul.f64 fd508, fd308, 0d3FEF838B8C811C17; +mul.f64 fd1218, fd302, 0d3FC63A1A7E0B738A; +sub.f64 fd509, fd1218, fd508; +mul.f64 fd510, fd308, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd511, fd302, 0d3FEF838B8C811C17, fd510; +mul.f64 fd1216, 
fd418, 0dBFEE11F642522D1C; +mul.f64 fd1217, fd424, 0d3FD5E3A8748A0BF5; +sub.f64 fd514, fd1216, fd1217; +mul.f64 fd515, fd424, 0dBFEE11F642522D1C; +fma.rn.f64 fd516, fd418, 0d3FD5E3A8748A0BF5, fd515; +mul.f64 fd1214, fd318, 0dBFADC528B5343A86; +mul.f64 fd1215, fd324, 0d3FEFF223F3635CE3; +sub.f64 fd519, fd1214, fd1215; +mul.f64 fd520, fd324, 0dBFADC528B5343A86; +fma.rn.f64 fd521, fd318, 0d3FEFF223F3635CE3, fd520; +mul.f64 fd1212, fd434, 0dBFEFC89BCEF44CF4; +mul.f64 fd1213, fd440, 0dBFBDB843E577175E; +sub.f64 fd524, fd1212, fd1213; +mul.f64 fd525, fd440, 0dBFEFC89BCEF44CF4; +fma.rn.f64 fd526, fd434, 0dBFBDB843E577175E, fd525; +mul.f64 fd528, fd340, 0d3FEEA7D99F29CADE; +mul.f64 fd1211, fd334, 0dBFD25AFBF23865BF; +sub.f64 fd529, fd1211, fd528; +mul.f64 fd530, fd340, 0dBFD25AFBF23865BF; +fma.rn.f64 fd531, fd334, 0d3FEEA7D99F29CADE, fd530; +mul.f64 fd533, fd456, 0dBFE19593DA358510; +mul.f64 fd1210, fd450, 0dBFEABC50EF4734A7; +sub.f64 fd534, fd1210, fd533; +mul.f64 fd535, fd456, 0dBFEABC50EF4734A7; +fma.rn.f64 fd536, fd450, 0dBFE19593DA358510, fd535; +add.f64 fd537, fd294, fd410; +add.f64 fd538, fd178, fd537; +mul.f64 fd541, fd537, 0d3FE0000000000000; +sub.f64 fd542, fd178, fd541; +add.f64 fd1209, fd1263, fd1237; +sub.f64 fd543, fd1263, fd1237; +mul.f64 fd544, fd543, 0dBFEBB67AE8584CAA; +add.f64 fd545, fd544, fd542; +sub.f64 fd546, fd542, fd544; +add.f64 fd1208, fd1290, fd1209; +mul.f64 fd547, fd1209, 0d3FE0000000000000; +sub.f64 fd548, fd1290, fd547; +sub.f64 fd549, fd294, fd410; +mul.f64 fd550, fd549, 0dBFEBB67AE8584CAA; +sub.f64 fd551, fd548, fd550; +add.f64 fd552, fd550, fd548; +add.f64 fd553, fd459, fd464; +add.f64 fd554, fd194, fd553; +mul.f64 fd557, fd553, 0d3FE0000000000000; +sub.f64 fd558, fd194, fd557; +add.f64 fd1207, fd461, fd466; +sub.f64 fd559, fd461, fd466; +mul.f64 fd560, fd559, 0dBFEBB67AE8584CAA; +add.f64 fd561, fd560, fd558; +sub.f64 fd562, fd558, fd560; +add.f64 fd1206, fd1288, fd1207; +mul.f64 fd563, fd1207, 0d3FE0000000000000; +sub.f64 fd564, fd1288, 
fd563; +sub.f64 fd565, fd459, fd464; +mul.f64 fd566, fd565, 0dBFEBB67AE8584CAA; +sub.f64 fd567, fd564, fd566; +add.f64 fd568, fd566, fd564; +add.f64 fd569, fd469, fd474; +add.f64 fd570, fd210, fd569; +mul.f64 fd573, fd569, 0d3FE0000000000000; +sub.f64 fd574, fd210, fd573; +add.f64 fd1205, fd471, fd476; +sub.f64 fd575, fd471, fd476; +mul.f64 fd576, fd575, 0dBFEBB67AE8584CAA; +add.f64 fd577, fd576, fd574; +sub.f64 fd578, fd574, fd576; +add.f64 fd1204, fd1286, fd1205; +mul.f64 fd579, fd1205, 0d3FE0000000000000; +sub.f64 fd580, fd1286, fd579; +sub.f64 fd581, fd469, fd474; +mul.f64 fd582, fd581, 0dBFEBB67AE8584CAA; +sub.f64 fd583, fd580, fd582; +add.f64 fd584, fd582, fd580; +add.f64 fd585, fd479, fd484; +add.f64 fd586, fd185, fd585; +mul.f64 fd589, fd585, 0d3FE0000000000000; +sub.f64 fd590, fd185, fd589; +add.f64 fd1203, fd481, fd486; +sub.f64 fd591, fd481, fd486; +mul.f64 fd592, fd591, 0dBFEBB67AE8584CAA; +add.f64 fd593, fd592, fd590; +sub.f64 fd594, fd590, fd592; +add.f64 fd1202, fd191, fd1203; +mul.f64 fd595, fd1203, 0d3FE0000000000000; +sub.f64 fd596, fd191, fd595; +sub.f64 fd597, fd479, fd484; +mul.f64 fd598, fd597, 0dBFEBB67AE8584CAA; +sub.f64 fd599, fd596, fd598; +add.f64 fd600, fd598, fd596; +add.f64 fd601, fd489, fd494; +add.f64 fd602, fd201, fd601; +mul.f64 fd605, fd601, 0d3FE0000000000000; +sub.f64 fd606, fd201, fd605; +add.f64 fd1201, fd491, fd496; +sub.f64 fd607, fd491, fd496; +mul.f64 fd608, fd607, 0dBFEBB67AE8584CAA; +add.f64 fd609, fd608, fd606; +sub.f64 fd610, fd606, fd608; +add.f64 fd1200, fd207, fd1201; +mul.f64 fd611, fd1201, 0d3FE0000000000000; +sub.f64 fd612, fd207, fd611; +sub.f64 fd613, fd489, fd494; +mul.f64 fd614, fd613, 0dBFEBB67AE8584CAA; +sub.f64 fd615, fd612, fd614; +add.f64 fd616, fd614, fd612; +add.f64 fd617, fd499, fd504; +add.f64 fd618, fd217, fd617; +mul.f64 fd621, fd617, 0d3FE0000000000000; +sub.f64 fd622, fd217, fd621; +add.f64 fd1199, fd501, fd506; +sub.f64 fd623, fd501, fd506; +mul.f64 fd624, fd623, 0dBFEBB67AE8584CAA; +add.f64 
fd625, fd624, fd622; +sub.f64 fd626, fd622, fd624; +add.f64 fd1198, fd223, fd1199; +mul.f64 fd627, fd1199, 0d3FE0000000000000; +sub.f64 fd628, fd223, fd627; +sub.f64 fd629, fd499, fd504; +mul.f64 fd630, fd629, 0dBFEBB67AE8584CAA; +sub.f64 fd631, fd628, fd630; +add.f64 fd632, fd630, fd628; +add.f64 fd633, fd509, fd514; +add.f64 fd634, fd186, fd633; +mul.f64 fd637, fd633, 0d3FE0000000000000; +sub.f64 fd638, fd186, fd637; +add.f64 fd1197, fd511, fd516; +sub.f64 fd639, fd511, fd516; +mul.f64 fd640, fd639, 0dBFEBB67AE8584CAA; +add.f64 fd641, fd640, fd638; +sub.f64 fd642, fd638, fd640; +add.f64 fd1196, fd192, fd1197; +mul.f64 fd643, fd1197, 0d3FE0000000000000; +sub.f64 fd644, fd192, fd643; +sub.f64 fd645, fd509, fd514; +mul.f64 fd646, fd645, 0dBFEBB67AE8584CAA; +sub.f64 fd647, fd644, fd646; +add.f64 fd648, fd646, fd644; +add.f64 fd649, fd519, fd524; +add.f64 fd650, fd202, fd649; +mul.f64 fd653, fd649, 0d3FE0000000000000; +sub.f64 fd654, fd202, fd653; +add.f64 fd1195, fd521, fd526; +sub.f64 fd655, fd521, fd526; +mul.f64 fd656, fd655, 0dBFEBB67AE8584CAA; +add.f64 fd657, fd656, fd654; +sub.f64 fd658, fd654, fd656; +add.f64 fd1194, fd208, fd1195; +mul.f64 fd659, fd1195, 0d3FE0000000000000; +sub.f64 fd660, fd208, fd659; +sub.f64 fd661, fd519, fd524; +mul.f64 fd662, fd661, 0dBFEBB67AE8584CAA; +sub.f64 fd663, fd660, fd662; +add.f64 fd664, fd662, fd660; +add.f64 fd665, fd529, fd534; +add.f64 fd666, fd218, fd665; +mul.f64 fd669, fd665, 0d3FE0000000000000; +sub.f64 fd670, fd218, fd669; +add.f64 fd1193, fd531, fd536; +sub.f64 fd671, fd531, fd536; +mul.f64 fd672, fd671, 0dBFEBB67AE8584CAA; +add.f64 fd673, fd672, fd670; +sub.f64 fd674, fd670, fd672; +add.f64 fd1192, fd224, fd1193; +mul.f64 fd675, fd1193, 0d3FE0000000000000; +sub.f64 fd676, fd224, fd675; +sub.f64 fd677, fd529, fd534; +mul.f64 fd678, fd677, 0dBFEBB67AE8584CAA; +sub.f64 fd679, fd676, fd678; +add.f64 fd680, fd678, fd676; +mov.u32 r11, %tid.x; +mul.wide.u32 rd2, r11, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, 
rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r11, r6; +mov.u64 rd5, %55; +mul.wide.u32 rd7, r7, 16; +add.s64 rd6, rd5, rd7; +ld.global.v2.f64 {fd681, fd682}, [rd6]; +mul.f64 fd685, fd1206, fd682; +fma.rn.f64 fd686, fd681, fd554, fd685; +mul.f64 fd687, fd554, fd682; +mul.f64 fd688, fd681, fd1206; +sub.f64 fd689, fd688, fd687; +mul.f64 fd691, fd682, fd682; +mul.f64 fd1191, fd681, fd681; +sub.f64 fd692, fd1191, fd691; +mul.f64 fd693, fd682, fd681; +fma.rn.f64 fd694, fd682, fd681, fd693; +mul.f64 fd695, fd1204, fd694; +fma.rn.f64 fd696, fd692, fd570, fd695; +mul.f64 fd697, fd570, fd694; +mul.f64 fd698, fd692, fd1204; +sub.f64 fd699, fd698, fd697; +mul.f64 fd701, fd682, fd694; +mul.f64 fd1190, fd681, fd692; +sub.f64 fd702, fd1190, fd701; +mul.f64 fd703, fd681, fd694; +fma.rn.f64 fd704, fd682, fd692, fd703; +mul.f64 fd705, fd1202, fd704; +fma.rn.f64 fd706, fd702, fd586, fd705; +mul.f64 fd707, fd586, fd704; +mul.f64 fd708, fd702, fd1202; +sub.f64 fd709, fd708, fd707; +mul.f64 fd1188, fd681, fd702; +mul.f64 fd1189, fd682, fd704; +sub.f64 fd712, fd1188, fd1189; +mul.f64 fd713, fd681, fd704; +fma.rn.f64 fd714, fd682, fd702, fd713; +mul.f64 fd715, fd1200, fd714; +fma.rn.f64 fd716, fd712, fd602, fd715; +mul.f64 fd717, fd602, fd714; +mul.f64 fd718, fd712, fd1200; +sub.f64 fd719, fd718, fd717; +mul.f64 fd1186, fd681, fd712; +mul.f64 fd1187, fd682, fd714; +sub.f64 fd722, fd1186, fd1187; +mul.f64 fd723, fd681, fd714; +fma.rn.f64 fd724, fd682, fd712, fd723; +mul.f64 fd725, fd1198, fd724; +fma.rn.f64 fd726, fd722, fd618, fd725; +mul.f64 fd727, fd618, fd724; +mul.f64 fd728, fd722, fd1198; +sub.f64 fd729, fd728, fd727; +mul.f64 fd731, fd682, fd724; +mul.f64 fd1185, fd681, fd722; +sub.f64 fd732, fd1185, fd731; +mul.f64 fd733, fd681, fd724; +fma.rn.f64 fd734, fd682, fd722, fd733; +mul.f64 fd735, fd1196, fd734; +fma.rn.f64 fd736, fd732, fd634, fd735; +mul.f64 fd737, fd634, fd734; +mul.f64 fd738, fd732, fd1196; +sub.f64 fd739, fd738, fd737; +mul.f64 fd741, fd682, fd734; +mul.f64 fd1184, 
fd681, fd732; +sub.f64 fd742, fd1184, fd741; +mul.f64 fd743, fd681, fd734; +fma.rn.f64 fd744, fd682, fd732, fd743; +mul.f64 fd745, fd1194, fd744; +fma.rn.f64 fd746, fd742, fd650, fd745; +mul.f64 fd747, fd650, fd744; +mul.f64 fd748, fd742, fd1194; +sub.f64 fd749, fd748, fd747; +mul.f64 fd751, fd682, fd744; +mul.f64 fd1183, fd681, fd742; +sub.f64 fd752, fd1183, fd751; +mul.f64 fd753, fd681, fd744; +fma.rn.f64 fd754, fd682, fd742, fd753; +mul.f64 fd755, fd1192, fd754; +fma.rn.f64 fd756, fd752, fd666, fd755; +mul.f64 fd757, fd666, fd754; +mul.f64 fd758, fd752, fd1192; +sub.f64 fd759, fd758, fd757; +mul.f64 fd1181, fd681, fd752; +mul.f64 fd1182, fd682, fd754; +sub.f64 fd762, fd1181, fd1182; +mul.f64 fd763, fd681, fd754; +fma.rn.f64 fd764, fd682, fd752, fd763; +mul.f64 fd765, fd551, fd764; +fma.rn.f64 fd766, fd762, fd545, fd765; +mul.f64 fd767, fd545, fd764; +mul.f64 fd768, fd762, fd551; +sub.f64 fd769, fd768, fd767; +mul.f64 fd1179, fd681, fd762; +mul.f64 fd1180, fd682, fd764; +sub.f64 fd772, fd1179, fd1180; +mul.f64 fd773, fd681, fd764; +fma.rn.f64 fd774, fd682, fd762, fd773; +mul.f64 fd775, fd567, fd774; +fma.rn.f64 fd776, fd772, fd561, fd775; +mul.f64 fd777, fd561, fd774; +mul.f64 fd778, fd772, fd567; +sub.f64 fd779, fd778, fd777; +mul.f64 fd781, fd682, fd774; +mul.f64 fd1178, fd681, fd772; +sub.f64 fd782, fd1178, fd781; +mul.f64 fd783, fd681, fd774; +fma.rn.f64 fd784, fd682, fd772, fd783; +mul.f64 fd785, fd583, fd784; +fma.rn.f64 fd786, fd782, fd577, fd785; +mul.f64 fd787, fd577, fd784; +mul.f64 fd788, fd782, fd583; +sub.f64 fd789, fd788, fd787; +mul.f64 fd791, fd682, fd784; +mul.f64 fd1177, fd681, fd782; +sub.f64 fd792, fd1177, fd791; +mul.f64 fd793, fd681, fd784; +fma.rn.f64 fd794, fd682, fd782, fd793; +mul.f64 fd795, fd599, fd794; +fma.rn.f64 fd796, fd792, fd593, fd795; +mul.f64 fd797, fd593, fd794; +mul.f64 fd798, fd792, fd599; +sub.f64 fd799, fd798, fd797; +mul.f64 fd801, fd682, fd794; +mul.f64 fd1176, fd681, fd792; +sub.f64 fd802, fd1176, fd801; +mul.f64 
fd803, fd681, fd794; +fma.rn.f64 fd804, fd682, fd792, fd803; +mul.f64 fd805, fd615, fd804; +fma.rn.f64 fd806, fd802, fd609, fd805; +mul.f64 fd807, fd609, fd804; +mul.f64 fd808, fd802, fd615; +sub.f64 fd809, fd808, fd807; +ld.global.v2.f64 {fd810, fd811}, [rd6+48]; +mul.f64 fd814, fd631, fd811; +fma.rn.f64 fd815, fd810, fd625, fd814; +mul.f64 fd816, fd625, fd811; +mul.f64 fd817, fd810, fd631; +sub.f64 fd818, fd817, fd816; +mul.f64 fd820, fd682, fd811; +mul.f64 fd1175, fd681, fd810; +sub.f64 fd821, fd1175, fd820; +mul.f64 fd822, fd681, fd811; +fma.rn.f64 fd823, fd682, fd810, fd822; +mul.f64 fd824, fd647, fd823; +fma.rn.f64 fd825, fd821, fd641, fd824; +mul.f64 fd826, fd641, fd823; +mul.f64 fd827, fd821, fd647; +sub.f64 fd828, fd827, fd826; +mul.f64 fd1173, fd681, fd821; +mul.f64 fd1174, fd682, fd823; +sub.f64 fd831, fd1173, fd1174; +mul.f64 fd832, fd681, fd823; +fma.rn.f64 fd833, fd682, fd821, fd832; +mul.f64 fd834, fd663, fd833; +fma.rn.f64 fd835, fd831, fd657, fd834; +mul.f64 fd836, fd657, fd833; +mul.f64 fd837, fd831, fd663; +sub.f64 fd838, fd837, fd836; +mul.f64 fd840, fd682, fd833; +mul.f64 fd1172, fd681, fd831; +sub.f64 fd841, fd1172, fd840; +mul.f64 fd842, fd681, fd833; +fma.rn.f64 fd843, fd682, fd831, fd842; +mul.f64 fd844, fd679, fd843; +fma.rn.f64 fd845, fd841, fd673, fd844; +mul.f64 fd846, fd673, fd843; +mul.f64 fd847, fd841, fd679; +sub.f64 fd848, fd847, fd846; +mul.f64 fd850, fd682, fd843; +mul.f64 fd1171, fd681, fd841; +sub.f64 fd851, fd1171, fd850; +mul.f64 fd852, fd681, fd843; +fma.rn.f64 fd853, fd682, fd841, fd852; +mul.f64 fd854, fd552, fd853; +fma.rn.f64 fd855, fd851, fd546, fd854; +mul.f64 fd856, fd546, fd853; +mul.f64 fd857, fd851, fd552; +sub.f64 fd858, fd857, fd856; +mul.f64 fd860, fd682, fd853; +mul.f64 fd1170, fd681, fd851; +sub.f64 fd861, fd1170, fd860; +mul.f64 fd862, fd681, fd853; +fma.rn.f64 fd863, fd682, fd851, fd862; +mul.f64 fd864, fd568, fd863; +fma.rn.f64 fd865, fd861, fd562, fd864; +mul.f64 fd866, fd562, fd863; +mul.f64 fd867, fd861, 
fd568; +sub.f64 fd868, fd867, fd866; +mul.f64 fd1168, fd681, fd861; +mul.f64 fd1169, fd682, fd863; +sub.f64 fd871, fd1168, fd1169; +mul.f64 fd872, fd681, fd863; +fma.rn.f64 fd873, fd682, fd861, fd872; +mul.f64 fd874, fd584, fd873; +fma.rn.f64 fd875, fd871, fd578, fd874; +mul.f64 fd876, fd578, fd873; +mul.f64 fd877, fd871, fd584; +sub.f64 fd878, fd877, fd876; +mul.f64 fd1166, fd681, fd871; +mul.f64 fd1167, fd682, fd873; +sub.f64 fd881, fd1166, fd1167; +mul.f64 fd882, fd681, fd873; +fma.rn.f64 fd883, fd682, fd871, fd882; +mul.f64 fd884, fd600, fd883; +fma.rn.f64 fd885, fd881, fd594, fd884; +mul.f64 fd886, fd594, fd883; +mul.f64 fd887, fd881, fd600; +sub.f64 fd888, fd887, fd886; +mul.f64 fd890, fd682, fd883; +mul.f64 fd1165, fd681, fd881; +sub.f64 fd891, fd1165, fd890; +mul.f64 fd892, fd681, fd883; +fma.rn.f64 fd893, fd682, fd881, fd892; +mul.f64 fd894, fd616, fd893; +fma.rn.f64 fd895, fd891, fd610, fd894; +mul.f64 fd896, fd610, fd893; +mul.f64 fd897, fd891, fd616; +sub.f64 fd898, fd897, fd896; +mul.f64 fd900, fd682, fd893; +mul.f64 fd1164, fd681, fd891; +sub.f64 fd901, fd1164, fd900; +mul.f64 fd902, fd681, fd893; +fma.rn.f64 fd903, fd682, fd891, fd902; +mul.f64 fd904, fd632, fd903; +fma.rn.f64 fd905, fd901, fd626, fd904; +mul.f64 fd906, fd626, fd903; +mul.f64 fd907, fd901, fd632; +sub.f64 fd908, fd907, fd906; +mul.f64 fd910, fd682, fd903; +mul.f64 fd1163, fd681, fd901; +sub.f64 fd911, fd1163, fd910; +mul.f64 fd912, fd681, fd903; +fma.rn.f64 fd913, fd682, fd901, fd912; +mul.f64 fd914, fd648, fd913; +fma.rn.f64 fd915, fd911, fd642, fd914; +mul.f64 fd916, fd642, fd913; +mul.f64 fd917, fd911, fd648; +sub.f64 fd918, fd917, fd916; +mul.f64 fd1161, fd681, fd911; +mul.f64 fd1162, fd682, fd913; +sub.f64 fd921, fd1161, fd1162; +mul.f64 fd922, fd681, fd913; +fma.rn.f64 fd923, fd682, fd911, fd922; +mul.f64 fd924, fd664, fd923; +fma.rn.f64 fd925, fd921, fd658, fd924; +mul.f64 fd926, fd658, fd923; +mul.f64 fd927, fd921, fd664; +sub.f64 fd928, fd927, fd926; +mul.f64 fd1159, fd681, 
fd921; +mul.f64 fd1160, fd682, fd923; +sub.f64 fd931, fd1159, fd1160; +mul.f64 fd932, fd681, fd923; +fma.rn.f64 fd933, fd682, fd921, fd932; +mul.f64 fd934, fd680, fd933; +fma.rn.f64 fd935, fd931, fd674, fd934; +mul.f64 fd936, fd674, fd933; +mul.f64 fd937, fd931, fd680; +sub.f64 fd938, fd937, fd936; +mad.lo.s32 r8, r5, 648, r3; +barrier.sync 0; +mad.lo.s32 r9, r7, 216, r8; +st.shared.f64 [r9], fd538; +st.shared.f64 [r9+8], fd686; +st.shared.f64 [r9+16], fd696; +st.shared.f64 [r9+24], fd706; +st.shared.f64 [r9+32], fd716; +st.shared.f64 [r9+40], fd726; +st.shared.f64 [r9+48], fd736; +st.shared.f64 [r9+56], fd746; +st.shared.f64 [r9+64], fd756; +st.shared.f64 [r9+72], fd766; +st.shared.f64 [r9+80], fd776; +st.shared.f64 [r9+88], fd786; +st.shared.f64 [r9+96], fd796; +st.shared.f64 [r9+104], fd806; +st.shared.f64 [r9+112], fd815; +st.shared.f64 [r9+120], fd825; +st.shared.f64 [r9+128], fd835; +st.shared.f64 [r9+136], fd845; +st.shared.f64 [r9+144], fd855; +st.shared.f64 [r9+152], fd865; +st.shared.f64 [r9+160], fd875; +st.shared.f64 [r9+168], fd885; +st.shared.f64 [r9+176], fd895; +st.shared.f64 [r9+184], fd905; +st.shared.f64 [r9+192], fd915; +st.shared.f64 [r9+200], fd925; +st.shared.f64 [r9+208], fd935; +barrier.sync 0; +mad.lo.s32 r10, r7, -208, r9; +ld.shared.f64 fd939, [r10]; +ld.shared.f64 fd940, [r10+24]; +ld.shared.f64 fd941, [r10+48]; +ld.shared.f64 fd942, [r10+72]; +ld.shared.f64 fd943, [r10+96]; +ld.shared.f64 fd944, [r10+120]; +ld.shared.f64 fd945, [r10+144]; +ld.shared.f64 fd946, [r10+168]; +ld.shared.f64 fd947, [r10+192]; +ld.shared.f64 fd948, [r10+216]; +ld.shared.f64 fd949, [r10+240]; +ld.shared.f64 fd950, [r10+264]; +ld.shared.f64 fd951, [r10+288]; +ld.shared.f64 fd952, [r10+312]; +ld.shared.f64 fd953, [r10+336]; +ld.shared.f64 fd954, [r10+360]; +ld.shared.f64 fd955, [r10+384]; +ld.shared.f64 fd956, [r10+408]; +ld.shared.f64 fd957, [r10+432]; +ld.shared.f64 fd958, [r10+456]; +ld.shared.f64 fd959, [r10+480]; +ld.shared.f64 fd960, [r10+504]; 
+ld.shared.f64 fd961, [r10+528]; +ld.shared.f64 fd962, [r10+552]; +ld.shared.f64 fd963, [r10+576]; +ld.shared.f64 fd964, [r10+600]; +ld.shared.f64 fd965, [r10+624]; +barrier.sync 0; +st.shared.f64 [r9], fd1208; +st.shared.f64 [r9+8], fd689; +st.shared.f64 [r9+16], fd699; +st.shared.f64 [r9+24], fd709; +st.shared.f64 [r9+32], fd719; +st.shared.f64 [r9+40], fd729; +st.shared.f64 [r9+48], fd739; +st.shared.f64 [r9+56], fd749; +st.shared.f64 [r9+64], fd759; +st.shared.f64 [r9+72], fd769; +st.shared.f64 [r9+80], fd779; +st.shared.f64 [r9+88], fd789; +st.shared.f64 [r9+96], fd799; +st.shared.f64 [r9+104], fd809; +st.shared.f64 [r9+112], fd818; +st.shared.f64 [r9+120], fd828; +st.shared.f64 [r9+128], fd838; +st.shared.f64 [r9+136], fd848; +st.shared.f64 [r9+144], fd858; +st.shared.f64 [r9+152], fd868; +st.shared.f64 [r9+160], fd878; +st.shared.f64 [r9+168], fd888; +st.shared.f64 [r9+176], fd898; +st.shared.f64 [r9+184], fd908; +st.shared.f64 [r9+192], fd918; +st.shared.f64 [r9+200], fd928; +st.shared.f64 [r9+208], fd938; +barrier.sync 0; +ld.shared.f64 fd966, [r10]; +ld.shared.f64 fd967, [r10+24]; +ld.shared.f64 fd968, [r10+48]; +ld.shared.f64 fd969, [r10+72]; +ld.shared.f64 fd970, [r10+96]; +ld.shared.f64 fd971, [r10+120]; +ld.shared.f64 fd972, [r10+144]; +ld.shared.f64 fd973, [r10+168]; +ld.shared.f64 fd974, [r10+192]; +ld.shared.f64 fd975, [r10+216]; +ld.shared.f64 fd976, [r10+240]; +ld.shared.f64 fd977, [r10+264]; +ld.shared.f64 fd978, [r10+288]; +ld.shared.f64 fd979, [r10+312]; +ld.shared.f64 fd980, [r10+336]; +ld.shared.f64 fd981, [r10+360]; +ld.shared.f64 fd982, [r10+384]; +ld.shared.f64 fd983, [r10+408]; +ld.shared.f64 fd984, [r10+432]; +ld.shared.f64 fd985, [r10+456]; +ld.shared.f64 fd986, [r10+480]; +ld.shared.f64 fd987, [r10+504]; +ld.shared.f64 fd988, [r10+528]; +ld.shared.f64 fd989, [r10+552]; +ld.shared.f64 fd990, [r10+576]; +ld.shared.f64 fd991, [r10+600]; +ld.shared.f64 fd992, [r10+624]; +add.f64 fd993, fd948, fd957; +mul.f64 fd995, fd993, 
0d3FE0000000000000; +sub.f64 fd996, fd939, fd995; +add.f64 fd1158, fd975, fd984; +sub.f64 fd997, fd975, fd984; +mul.f64 fd998, fd997, 0dBFEBB67AE8584CAA; +mul.f64 fd999, fd1158, 0d3FE0000000000000; +sub.f64 fd1000, fd966, fd999; +sub.f64 fd1001, fd948, fd957; +mul.f64 fd1002, fd1001, 0dBFEBB67AE8584CAA; +add.f64 fd1003, fd949, fd958; +mul.f64 fd1005, fd1003, 0d3FE0000000000000; +sub.f64 fd1006, fd940, fd1005; +add.f64 fd1157, fd976, fd985; +sub.f64 fd1007, fd976, fd985; +mul.f64 fd1008, fd1007, 0dBFEBB67AE8584CAA; +mul.f64 fd1009, fd1157, 0d3FE0000000000000; +sub.f64 fd1010, fd967, fd1009; +sub.f64 fd1011, fd949, fd958; +mul.f64 fd1012, fd1011, 0dBFEBB67AE8584CAA; +add.f64 fd1013, fd950, fd959; +mul.f64 fd1015, fd1013, 0d3FE0000000000000; +sub.f64 fd1016, fd941, fd1015; +add.f64 fd1156, fd977, fd986; +sub.f64 fd1017, fd977, fd986; +mul.f64 fd1018, fd1017, 0dBFEBB67AE8584CAA; +mul.f64 fd1019, fd1156, 0d3FE0000000000000; +sub.f64 fd1020, fd968, fd1019; +sub.f64 fd1021, fd950, fd959; +mul.f64 fd1022, fd1021, 0dBFEBB67AE8584CAA; +add.f64 fd1023, fd951, fd960; +mul.f64 fd1025, fd1023, 0d3FE0000000000000; +sub.f64 fd1026, fd942, fd1025; +add.f64 fd1155, fd978, fd987; +sub.f64 fd1027, fd978, fd987; +mul.f64 fd1028, fd1027, 0dBFEBB67AE8584CAA; +mul.f64 fd1029, fd1155, 0d3FE0000000000000; +sub.f64 fd1030, fd969, fd1029; +sub.f64 fd1031, fd951, fd960; +mul.f64 fd1032, fd1031, 0dBFEBB67AE8584CAA; +add.f64 fd1033, fd952, fd961; +mul.f64 fd1035, fd1033, 0d3FE0000000000000; +sub.f64 fd1036, fd943, fd1035; +add.f64 fd1154, fd979, fd988; +sub.f64 fd1037, fd979, fd988; +mul.f64 fd1038, fd1037, 0dBFEBB67AE8584CAA; +mul.f64 fd1039, fd1154, 0d3FE0000000000000; +sub.f64 fd1040, fd970, fd1039; +sub.f64 fd1041, fd952, fd961; +mul.f64 fd1042, fd1041, 0dBFEBB67AE8584CAA; +add.f64 fd1043, fd953, fd962; +mul.f64 fd1045, fd1043, 0d3FE0000000000000; +sub.f64 fd1046, fd944, fd1045; +add.f64 fd1153, fd980, fd989; +sub.f64 fd1047, fd980, fd989; +mul.f64 fd1048, fd1047, 0dBFEBB67AE8584CAA; 
+mul.f64 fd1049, fd1153, 0d3FE0000000000000; +sub.f64 fd1050, fd971, fd1049; +sub.f64 fd1051, fd953, fd962; +mul.f64 fd1052, fd1051, 0dBFEBB67AE8584CAA; +add.f64 fd1053, fd954, fd963; +mul.f64 fd1055, fd1053, 0d3FE0000000000000; +sub.f64 fd1056, fd945, fd1055; +add.f64 fd1152, fd981, fd990; +sub.f64 fd1057, fd981, fd990; +mul.f64 fd1058, fd1057, 0dBFEBB67AE8584CAA; +mul.f64 fd1059, fd1152, 0d3FE0000000000000; +sub.f64 fd1060, fd972, fd1059; +sub.f64 fd1061, fd954, fd963; +mul.f64 fd1062, fd1061, 0dBFEBB67AE8584CAA; +add.f64 fd1063, fd955, fd964; +mul.f64 fd1065, fd1063, 0d3FE0000000000000; +sub.f64 fd1066, fd946, fd1065; +add.f64 fd1151, fd982, fd991; +sub.f64 fd1067, fd982, fd991; +mul.f64 fd1068, fd1067, 0dBFEBB67AE8584CAA; +mul.f64 fd1069, fd1151, 0d3FE0000000000000; +sub.f64 fd1070, fd973, fd1069; +sub.f64 fd1071, fd955, fd964; +mul.f64 fd1072, fd1071, 0dBFEBB67AE8584CAA; +add.f64 fd1073, fd956, fd965; +mul.f64 fd1075, fd1073, 0d3FE0000000000000; +sub.f64 fd1076, fd947, fd1075; +add.f64 fd1150, fd983, fd992; +sub.f64 fd1077, fd983, fd992; +mul.f64 fd1078, fd1077, 0dBFEBB67AE8584CAA; +mul.f64 fd1079, fd1150, 0d3FE0000000000000; +sub.f64 fd1080, fd974, fd1079; +sub.f64 fd1081, fd956, fd965; +mul.f64 fd1306, fd1013, 0d3FE0000000000000; +sub.f64 fd1305, fd941, fd1306; +mul.f64 fd1082, fd1081, 0dBFEBB67AE8584CAA; +add.f64 %0, fd939, fd993; +mul.f64 fd1308, fd1155, 0d3FE0000000000000; +sub.f64 fd1307, fd969, fd1308; +add.f64 %1, fd966, fd1158; +mul.f64 fd1310, fd1003, 0d3FE0000000000000; +sub.f64 fd1309, fd940, fd1310; +mul.f64 fd1312, fd1156, 0d3FE0000000000000; +sub.f64 fd1311, fd968, fd1312; +add.f64 %2, fd940, fd1003; +add.f64 %3, fd967, fd1157; +add.f64 %4, fd941, fd1013; +add.f64 %5, fd968, fd1156; +add.f64 %6, fd942, fd1023; +add.f64 %7, fd969, fd1155; +add.f64 %8, fd943, fd1033; +add.f64 %9, fd970, fd1154; +add.f64 %10, fd944, fd1043; +add.f64 %11, fd971, fd1153; +add.f64 %12, fd945, fd1053; +add.f64 %13, fd972, fd1152; +add.f64 %14, fd946, fd1063; +add.f64 
%15, fd973, fd1151; +add.f64 %16, fd947, fd1073; +add.f64 %17, fd974, fd1150; +add.f64 %18, fd998, fd996; +sub.f64 %19, fd1000, fd1002; +add.f64 %20, fd1008, fd1309; +sub.f64 %21, fd1010, fd1012; +sub.f64 %23, fd1311, fd1022; +add.f64 %22, fd1018, fd1305; +sub.f64 %25, fd1307, fd1032; +add.f64 %24, fd1028, fd1026; +sub.f64 %27, fd1040, fd1042; +add.f64 %26, fd1038, fd1036; +sub.f64 %29, fd1050, fd1052; +add.f64 %28, fd1048, fd1046; +add.f64 %30, fd1058, fd1056; +sub.f64 %31, fd1060, fd1062; +add.f64 %32, fd1068, fd1066; +sub.f64 %33, fd1070, fd1072; +sub.f64 %35, fd1080, fd1082; +add.f64 %34, fd1078, fd1076; +sub.f64 %36, fd996, fd998; +add.f64 %37, fd1002, fd1000; +sub.f64 %38, fd1309, fd1008; +add.f64 %39, fd1012, fd1010; +sub.f64 %40, fd1305, fd1018; +add.f64 %41, fd1022, fd1311; +sub.f64 %42, fd1026, fd1028; +add.f64 %43, fd1032, fd1307; +sub.f64 %44, fd1036, fd1038; +add.f64 %45, fd1042, fd1040; +sub.f64 %46, fd1046, fd1048; +add.f64 %47, fd1052, fd1050; +sub.f64 %48, fd1056, fd1058; +add.f64 %49, fd1062, fd1060; +sub.f64 %50, fd1066, fd1068; +add.f64 %51, fd1072, fd1070; +sub.f64 %52, fd1076, fd1078; +add.f64 %53, fd1082, fd1080; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y), "=d"(rmem[9].x), "=d"(rmem[9].y), "=d"(rmem[10].x), "=d"(rmem[10].y), "=d"(rmem[11].x), "=d"(rmem[11].y), "=d"(rmem[12].x), "=d"(rmem[12].y), "=d"(rmem[13].x), "=d"(rmem[13].y), "=d"(rmem[14].x), "=d"(rmem[14].y), "=d"(rmem[15].x), "=d"(rmem[15].y), "=d"(rmem[16].x), "=d"(rmem[16].y), "=d"(rmem[17].x), "=d"(rmem[17].y), "=d"(rmem[18].x), "=d"(rmem[18].y), "=d"(rmem[19].x), "=d"(rmem[19].y), "=d"(rmem[20].x), "=d"(rmem[20].y), "=d"(rmem[21].x), "=d"(rmem[21].y), "=d"(rmem[22].x), "=d"(rmem[22].y), "=d"(rmem[23].x), 
"=d"(rmem[23].y), "=d"(rmem[24].x), "=d"(rmem[24].y), "=d"(rmem[25].x), "=d"(rmem[25].y), "=d"(rmem[26].x), "=d"(rmem[26].y): "r"(smem), "l"(lut_dp_27_81), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y), "d"(rmem[9].x), "d"(rmem[9].y), "d"(rmem[10].x), "d"(rmem[10].y), "d"(rmem[11].x), "d"(rmem[11].y), "d"(rmem[12].x), "d"(rmem[12].y), "d"(rmem[13].x), "d"(rmem[13].y), "d"(rmem[14].x), "d"(rmem[14].y), "d"(rmem[15].x), "d"(rmem[15].y), "d"(rmem[16].x), "d"(rmem[16].y), "d"(rmem[17].x), "d"(rmem[17].y), "d"(rmem[18].x), "d"(rmem[18].y), "d"(rmem[19].x), "d"(rmem[19].y), "d"(rmem[20].x), "d"(rmem[20].y), "d"(rmem[21].x), "d"(rmem[21].y), "d"(rmem[22].x), "d"(rmem[22].y), "d"(rmem[23].x), "d"(rmem[23].y), "d"(rmem[24].x), "d"(rmem[24].y), "d"(rmem[25].x), "d"(rmem[25].y), "d"(rmem[26].x), "d"(rmem[26].y), "d"(rmem[19].y), "d"(rmem[10].y), "d"(rmem[1].y), "d"(rmem[22].y), "d"(rmem[13].y), "d"(rmem[4].y), "d"(rmem[16].y), "d"(rmem[25].y), "d"(rmem[7].y), "d"(rmem[11].y), "d"(rmem[20].y), "d"(rmem[2].y), "d"(rmem[23].y), "d"(rmem[14].y), "d"(rmem[5].y), "d"(rmem[17].y), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<684, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<167>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 1296, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %12, %15; +add.f64 fd14, %14, %16; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %10, fd15; +sub.f64 fd17, %14, %16; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %11, fd21; +sub.f64 fd23, %12, %15; +mul.f64 fd24, 
fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 1296, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+432]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, fd20, fd35; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r13, r11, 48, r12; +add.f64 fd41, %11, fd14; +add.f64 fd42, %10, fd13; +st.shared.v2.f64 [r13], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r13+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r13+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r14, r11, 5; +sub.s32 r15, r13, r14; +ld.shared.v2.f64 {fd47, fd48}, [r15]; +ld.shared.v2.f64 {fd51, fd52}, [r15+432]; +ld.shared.v2.f64 {fd55, fd56}, [r15+864]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +add.f64 fd65, fd64, fd62; +sub.f64 fd66, fd62, fd64; +mul.f64 fd67, fd60, 0d3FE0000000000000; +sub.f64 fd68, fd48, fd67; +sub.f64 fd69, fd51, fd55; +mul.f64 fd70, fd69, 0dBFEBB67AE8584CAA; +sub.f64 fd71, fd68, fd70; +add.f64 fd72, fd70, fd68; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 4; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd73, fd74}, [rd11]; +mul.f64 fd77, fd71, fd74; +mul.f64 fd78, fd65, fd74; +mul.f64 fd79, fd73, fd71; +ld.global.v2.f64 
{fd80, fd81}, [rd11+144]; +mul.f64 fd84, fd72, fd81; +mul.f64 fd85, fd66, fd81; +mul.f64 fd86, fd80, fd72; +barrier.sync 0; +mad.lo.s32 r21, r16, 144, r20; +add.f64 fd87, fd48, fd60; +add.f64 fd88, fd47, fd59; +st.shared.v2.f64 [r21], {fd88, fd87}; +fma.rn.f64 fd89, fd73, fd65, fd77; +sub.f64 fd90, fd79, fd78; +st.shared.v2.f64 [r21+48], {fd89, fd90}; +fma.rn.f64 fd91, fd80, fd66, fd84; +sub.f64 fd92, fd86, fd85; +st.shared.v2.f64 [r21+96], {fd91, fd92}; +barrier.sync 0; +ld.shared.v2.f64 {fd93, fd94}, [r15]; +ld.shared.v2.f64 {fd97, fd98}, [r15+432]; +ld.shared.v2.f64 {fd101, fd102}, [r15+864]; +add.f64 fd105, fd97, fd101; +add.f64 fd106, fd98, fd102; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd93, fd107; +sub.f64 fd109, fd98, fd102; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +add.f64 fd111, fd110, fd108; +sub.f64 fd112, fd108, fd110; +mul.f64 fd113, fd106, 0d3FE0000000000000; +sub.f64 fd114, fd94, fd113; +sub.f64 fd115, fd97, fd101; +mul.f64 fd116, fd115, 0dBFEBB67AE8584CAA; +sub.f64 fd117, fd114, fd116; +add.f64 fd118, fd116, fd114; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 4; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd119, fd120}, [rd16]; +mul.f64 fd123, fd117, fd120; +mul.f64 fd124, fd111, fd120; +mul.f64 fd125, fd119, fd117; +ld.global.v2.f64 {fd126, fd127}, [rd16+48]; +mul.f64 fd130, fd118, fd127; +mul.f64 fd131, fd112, fd127; +mul.f64 fd132, fd126, fd118; +barrier.sync 0; +mad.lo.s32 r27, r22, 432, r26; +add.f64 fd133, fd94, fd106; +add.f64 fd134, fd93, fd105; +st.shared.v2.f64 [r27], {fd134, fd133}; +fma.rn.f64 fd135, fd119, fd111, fd123; +sub.f64 fd136, fd125, fd124; +st.shared.v2.f64 [r27+144], {fd135, fd136}; +fma.rn.f64 fd137, fd126, fd112, fd130; +sub.f64 fd138, fd132, fd131; +st.shared.v2.f64 [r27+288], {fd137, fd138}; +barrier.sync 0; 
+ld.shared.v2.f64 {fd139, fd140}, [r15]; +ld.shared.v2.f64 {fd143, fd144}, [r15+432]; +ld.shared.v2.f64 {fd147, fd148}, [r15+864]; +add.f64 fd151, fd143, fd147; +add.f64 fd152, fd144, fd148; +mul.f64 fd153, fd151, 0d3FE0000000000000; +sub.f64 fd154, fd139, fd153; +sub.f64 fd155, fd144, fd148; +mul.f64 fd156, fd155, 0dBFEBB67AE8584CAA; +mul.f64 fd157, fd152, 0d3FE0000000000000; +sub.f64 fd158, fd140, fd157; +sub.f64 fd159, fd143, fd147; +mul.f64 fd160, fd159, 0dBFEBB67AE8584CAA; +add.f64 %1, fd140, fd152; +add.f64 %0, fd139, fd151; +sub.f64 %3, fd158, fd160; +add.f64 %2, fd156, fd154; +add.f64 %5, fd160, fd158; +sub.f64 %4, fd154, fd156; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<685, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<28>; +.reg .f64 fd<149>; +.reg .b64 rd<17>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 648, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %12, %15; +add.f64 fd14, %10, fd13; +add.f64 fd15, %14, %16; +add.f64 fd16, %11, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %10, fd17; +sub.f64 fd19, %14, %16; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %11, fd23; +sub.f64 fd25, %12, %15; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, 795364315; +shr.u64 rd3, rd2, 32; +cvt.u32.u64 r5, rd3; +sub.s32 r6, r4, r5; +shr.u32 r7, r6, 1; +add.s32 r8, r7, r5; +shr.u32 r9, r8, 4; +mul.lo.s32 r10, r9, 27; +sub.s32 r11, r4, r10; +mad.lo.s32 r12, r9, 648, r3; +mul.wide.u32 rd4, r11, 16; +mov.u64 
rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+432]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r13, r11, 24, r12; +st.shared.f64 [r13], fd14; +st.shared.f64 [r13+8], fd34; +st.shared.f64 [r13+16], fd43; +barrier.sync 0; +shl.b32 r14, r11, 4; +sub.s32 r15, r13, r14; +ld.shared.f64 fd47, [r15]; +ld.shared.f64 fd48, [r15+216]; +ld.shared.f64 fd49, [r15+432]; +barrier.sync 0; +st.shared.f64 [r13], fd16; +st.shared.f64 [r13+8], fd37; +st.shared.f64 [r13+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r15]; +ld.shared.f64 fd51, [r15+216]; +ld.shared.f64 fd52, [r15+432]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd47, fd53; +add.f64 fd55, fd51, fd52; +add.f64 fd56, fd50, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, fd47, fd57; +sub.f64 fd59, fd51, fd52; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, fd50, fd63; +sub.f64 fd65, fd48, fd49; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +mul.wide.u32 rd7, r11, -1431655765; +shr.u64 rd8, rd7, 33; +cvt.u32.u64 r16, rd8; +mul.lo.s32 r17, r16, 3; +sub.s32 r18, r11, r17; +shl.b32 r19, r18, 3; +add.s32 r20, r12, r19; +mul.wide.u32 rd9, r16, 16; +mov.u64 rd10, %8; +add.s64 rd11, rd10, rd9; +ld.global.v2.f64 {fd69, fd70}, [rd11]; +mul.f64 fd73, fd67, fd70; +fma.rn.f64 fd74, fd69, fd61, fd73; +mul.f64 fd75, fd61, fd70; +mul.f64 fd76, fd69, fd67; +sub.f64 fd77, fd76, fd75; +ld.global.v2.f64 {fd78, fd79}, [rd11+144]; +mul.f64 fd82, fd68, fd79; +fma.rn.f64 fd83, fd78, fd62, fd82; +mul.f64 fd84, fd62, fd79; +mul.f64 fd85, fd78, fd68; 
+sub.f64 fd86, fd85, fd84; +barrier.sync 0; +mad.lo.s32 r21, r16, 72, r20; +st.shared.f64 [r21], fd54; +st.shared.f64 [r21+24], fd74; +st.shared.f64 [r21+48], fd83; +barrier.sync 0; +ld.shared.f64 fd87, [r15]; +ld.shared.f64 fd88, [r15+216]; +ld.shared.f64 fd89, [r15+432]; +barrier.sync 0; +st.shared.f64 [r21], fd56; +st.shared.f64 [r21+24], fd77; +st.shared.f64 [r21+48], fd86; +barrier.sync 0; +ld.shared.f64 fd90, [r15]; +ld.shared.f64 fd91, [r15+216]; +ld.shared.f64 fd92, [r15+432]; +add.f64 fd93, fd88, fd89; +add.f64 fd94, fd87, fd93; +add.f64 fd95, fd91, fd92; +add.f64 fd96, fd90, fd95; +mul.f64 fd97, fd93, 0d3FE0000000000000; +sub.f64 fd98, fd87, fd97; +sub.f64 fd99, fd91, fd92; +mul.f64 fd100, fd99, 0dBFEBB67AE8584CAA; +add.f64 fd101, fd100, fd98; +sub.f64 fd102, fd98, fd100; +mul.f64 fd103, fd95, 0d3FE0000000000000; +sub.f64 fd104, fd90, fd103; +sub.f64 fd105, fd88, fd89; +mul.f64 fd106, fd105, 0dBFEBB67AE8584CAA; +sub.f64 fd107, fd104, fd106; +add.f64 fd108, fd106, fd104; +mul.wide.u32 rd12, r11, 954437177; +shr.u64 rd13, rd12, 33; +cvt.u32.u64 r22, rd13; +mul.lo.s32 r23, r22, 9; +sub.s32 r24, r11, r23; +shl.b32 r25, r24, 3; +add.s32 r26, r12, r25; +mul.wide.u32 rd14, r22, 16; +mov.u64 rd15, %9; +add.s64 rd16, rd15, rd14; +ld.global.v2.f64 {fd109, fd110}, [rd16]; +mul.f64 fd113, fd107, fd110; +fma.rn.f64 fd114, fd109, fd101, fd113; +mul.f64 fd115, fd101, fd110; +mul.f64 fd116, fd109, fd107; +sub.f64 fd117, fd116, fd115; +ld.global.v2.f64 {fd118, fd119}, [rd16+48]; +mul.f64 fd122, fd108, fd119; +fma.rn.f64 fd123, fd118, fd102, fd122; +mul.f64 fd124, fd102, fd119; +mul.f64 fd125, fd118, fd108; +sub.f64 fd126, fd125, fd124; +barrier.sync 0; +mad.lo.s32 r27, r22, 216, r26; +st.shared.f64 [r27], fd94; +st.shared.f64 [r27+72], fd114; +st.shared.f64 [r27+144], fd123; +barrier.sync 0; +ld.shared.f64 fd127, [r15]; +ld.shared.f64 fd128, [r15+216]; +ld.shared.f64 fd129, [r15+432]; +barrier.sync 0; +st.shared.f64 [r27], fd96; +st.shared.f64 [r27+72], fd117; 
+st.shared.f64 [r27+144], fd126; +barrier.sync 0; +ld.shared.f64 fd130, [r15]; +ld.shared.f64 fd131, [r15+216]; +ld.shared.f64 fd132, [r15+432]; +add.f64 fd133, fd128, fd129; +add.f64 fd134, fd131, fd132; +mul.f64 fd135, fd133, 0d3FE0000000000000; +sub.f64 fd136, fd127, fd135; +sub.f64 fd137, fd131, fd132; +mul.f64 fd138, fd137, 0dBFEBB67AE8584CAA; +mul.f64 fd139, fd134, 0d3FE0000000000000; +sub.f64 fd140, fd130, fd139; +sub.f64 fd141, fd128, fd129; +mul.f64 fd142, fd141, 0dBFEBB67AE8584CAA; +add.f64 %0, fd127, fd133; +add.f64 %1, fd130, fd134; +add.f64 %2, fd138, fd136; +sub.f64 %3, fd140, fd142; +sub.f64 %4, fd136, fd138; +add.f64 %5, fd142, fd140; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_81), "l"(lut_dp_3_27), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..a2a38852feee2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_fwd.hpp.inc @@ -0,0 +1,1065 @@ +#ifndef CUFFTDX_FFT_8_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_8_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<766, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<15>; +.reg .b32 r<183>; +.reg .f64 fd<15>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %29, %21; +} +{ +add.f16x2 r4, %30, %22; +} +{ +sub.f16x2 r7, %29, %21; +} +{ +sub.f16x2 r10, %30, %22; +} +{ +add.f16x2 r13, %17, %25; +} +{ +add.f16x2 r16, %18, %26; +} +{ +sub.f16x2 r19, %17, %25; +} +{ 
+sub.f16x2 r22, %18, %26; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +{ +add.f16x2 r51, %31, %23; +} +{ +add.f16x2 r54, %16, %24; +} +{ +sub.f16x2 r57, %31, %23; +} +{ +sub.f16x2 r60, %16, %24; +} +{ +add.f16x2 r63, %19, %27; +} +{ +add.f16x2 r66, %20, %28; +} +{ +sub.f16x2 r69, %19, %27; +} +{ +sub.f16x2 r72, %20, %28; +} +{ +neg.f16x2 r75, r69; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ +sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r72; +} +{ +add.f16x2 r92, r60, r75; +} +{ +sub.f16x2 r95, r57, r72; +} +{ +sub.f16x2 r98, r60, r75; +} +mov.f64 fd1, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd1; +} +mov.f64 fd6, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs2, fd6; +} +{ +cvt.rn.f16.f64 rs5, fd6; +} +{ +cvt.rn.f16.f64 rs6, fd6; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r83; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 %0, r27, r77; +} +{ +add.f16x2 %1, r30, r80; +} +{ +sub.f16x2 %8, r27, r77; +} +{ +sub.f16x2 %9, r30, r80; +} +{ +add.f16x2 %2, r39, r107; +} +{ +add.f16x2 %3, r42, r113; +} +{ +sub.f16x2 %10, r39, r107; +} +{ +sub.f16x2 %11, r42, r113; +} +{ +add.f16x2 %4, r33, r86; +} +{ +add.f16x2 %5, r36, r117; +} +{ +sub.f16x2 %12, r33, r86; +} +{ +sub.f16x2 %13, r36, r117; +} +{ +add.f16x2 %6, r45, r125; +} +{ +add.f16x2 %7, r48, r131; +} +{ +sub.f16x2 
%14, r45, r125; +} +{ +sub.f16x2 %15, r48, r131; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<767, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<201>; +.reg .b64 rd<2>; +mov.u32 r189, %tid.y; +shl.b32 r190, r189, 5; +mov.u32 r191, %8; +add.s32 r192, r191, r190; +mov.u32 r193, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r194, r193, 1; +shl.b32 r195, r193, 4; 
+and.b32 r196, r195, -32; +add.s32 r197, r192, r196; +cvt.rn.f32.u32 f11, r194; +mul.f32 f12, f11, 0f3F490FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 
r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r198, r195, 16; +add.s32 r199, r197, r198; +st.shared.v4.f32 [r199], {r27, r63, r100, r137}; +barrier.sync 0; +mad.lo.s32 r200, r194, -12, r199; +ld.shared.u32 r166, [r200]; +ld.shared.u32 r178, [r200+8]; +ld.shared.u32 r167, [r200+16]; +ld.shared.u32 r179, [r200+24]; +barrier.sync 0; +st.shared.v4.f32 [r199], {r30, r70, r107, r144}; +barrier.sync 0; +ld.shared.u32 r169, [r200]; +ld.shared.u32 r181, [r200+8]; +ld.shared.u32 r170, [r200+16]; +ld.shared.u32 r182, [r200+24]; +{ +add.f16x2 %0, r166, r167; +} +{ +add.f16x2 %1, r169, r170; +} +{ +sub.f16x2 %4, r166, r167; +} +{ +sub.f16x2 %5, r169, r170; +} +{ +add.f16x2 %2, r178, r179; +} +{ +add.f16x2 %3, r181, r182; +} +{ +sub.f16x2 %6, r178, r179; +} +{ +sub.f16x2 %7, r181, r182; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<768, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<201>; +.reg .b64 rd<2>; +mov.u32 r189, %tid.y; +shl.b32 r190, r189, 6; +mov.u32 r191, %8; +add.s32 r192, r191, r190; +mov.u32 r193, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ 
+sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r19; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r22; +} +{ +add.f16x2 r42, r10, r25; +} +{ +sub.f16x2 r45, r7, r22; +} +{ +sub.f16x2 r48, r10, r25; +} +and.b32 r194, r193, 1; +shl.b32 r195, r193, 5; +and.b32 r196, r195, -64; +add.s32 r197, r192, r196; +cvt.rn.f32.u32 f11, r194; +mul.f32 f12, f11, 0f3F490FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +neg.f16x2 r61, r58; +} +{ +fma.rn.f16x2 r63, r39, r54, r61; +} +{ +mul.f16x2 r67, r39, r56; +} +{ +fma.rn.f16x2 r70, r42, r54, r67; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +neg.f16x2 r98, r95; +} +{ +fma.rn.f16x2 r100, r33, r91, r98; +} +{ +mul.f16x2 r104, r33, r93; +} +{ +fma.rn.f16x2 r107, r36, r91, r104; +} +{ +.reg .f16 low, 
high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +neg.f16x2 r135, r132; +} +{ +fma.rn.f16x2 r137, r45, r128, r135; +} +{ +mul.f16x2 r141, r45, r130; +} +{ +fma.rn.f16x2 r144, r48, r128, r141; +} +barrier.sync 0; +and.b32 r198, r195, 32; +add.s32 r199, r197, r198; +st.shared.v4.f32 [r199], {r27, r30, r63, r70}; +st.shared.v4.f32 [r199+16], {r100, r107, r137, r144}; +barrier.sync 0; +mad.lo.s32 r200, r194, -24, r199; +ld.shared.u32 r166, [r200]; +ld.shared.u32 r169, [r200+4]; +ld.shared.u32 r178, [r200+16]; +ld.shared.u32 r181, [r200+20]; +ld.shared.u32 r167, [r200+32]; +ld.shared.u32 r170, [r200+36]; +ld.shared.u32 r179, [r200+48]; +ld.shared.u32 r182, [r200+52]; +{ +add.f16x2 %0, r166, r167; +} +{ +add.f16x2 %1, r169, r170; +} +{ +sub.f16x2 %4, r166, r167; +} +{ +sub.f16x2 %5, r169, r170; +} +{ +add.f16x2 %2, r178, r179; +} +{ +add.f16x2 %3, r181, r182; +} +{ +sub.f16x2 %6, r178, r179; +} +{ +sub.f16x2 %7, r181, r182; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), 
"r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<769, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<19>; +.reg .b32 r<138>; +.reg .b64 rd<2>; +mov.u32 r117, %tid.y; +shl.b32 r118, r117, 5; +mov.u32 r119, %4; +add.s32 r120, r119, r118; +mov.u32 r121, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r122, r121, 3; +shl.b32 r123, r121, 3; +and.b32 r124, r123, -32; +add.s32 r125, r120, r124; +cvt.rn.f32.u32 f13, r122; +mul.f32 f14, f13, 0f3F490FDB; +cos.approx.f32 f1, f14; +sin.approx.f32 f15, f14; +neg.f32 f2, f15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r126, r123, 24; +add.s32 r127, r125, r126; +st.shared.v2.f32 [r127], {r1, r25}; +barrier.sync 0; +shl.b32 r128, r121, 2; +and.b32 r129, r128, 12; +sub.s32 r130, r127, r129; +ld.shared.u32 r54, [r130]; +ld.shared.u32 r55, [r130+16]; +barrier.sync 0; +st.shared.v2.f32 [r127], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r130]; +ld.shared.u32 r58, [r130+16]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r131, r121, 1, 1; +and.b32 r132, r128, 4; +add.s32 r133, r125, r132; +cvt.rn.f32.u32 f16, r131; +mul.f32 f17, f16, 0f3FC90FDB; +cos.approx.f32 f7, f17; +sin.approx.f32 f18, f17; +neg.f32 f8, f18; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; 
+cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +barrier.sync 0; +and.b32 r134, r123, 16; +add.s32 r135, r133, r134; +st.shared.u32 [r135], r53; +st.shared.u32 [r135+8], r77; +barrier.sync 0; +and.b32 r136, r128, 8; +sub.s32 r137, r135, r136; +ld.shared.u32 r106, [r137]; +ld.shared.u32 r107, [r137+16]; +barrier.sync 0; +st.shared.u32 [r135], r56; +st.shared.u32 [r135+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r137]; +ld.shared.u32 r110, [r137+16]; +{ +add.f16x2 %0, r106, r107; +} +{ +add.f16x2 %1, r109, r110; +} +{ +sub.f16x2 %2, r106, r107; +} +{ +sub.f16x2 %3, r109, r110; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<770, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<19>; +.reg .b32 r<138>; +.reg .b64 rd<2>; +mov.u32 r117, %tid.y; +shl.b32 r118, r117, 6; +mov.u32 r119, %4; +add.s32 r120, r119, r118; +mov.u32 r121, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r122, r121, 3; +shl.b32 r123, r121, 4; +and.b32 r124, r123, -64; +add.s32 r125, r120, r124; +cvt.rn.f32.u32 f13, r122; +mul.f32 f14, f13, 0f3F490FDB; +cos.approx.f32 f1, f14; +sin.approx.f32 f15, f14; +neg.f32 f2, f15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, 
high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +neg.f16x2 r23, r20; +} +{ +fma.rn.f16x2 r25, r7, r16, r23; +} +{ +mul.f16x2 r29, r7, r18; +} +{ +fma.rn.f16x2 r32, r10, r16, r29; +} +barrier.sync 0; +and.b32 r126, r123, 48; +add.s32 r127, r125, r126; +st.shared.v2.f32 [r127], {r1, r4}; +st.shared.v2.f32 [r127+8], {r25, r32}; +barrier.sync 0; +shl.b32 r128, r121, 3; +and.b32 r129, r128, 24; +sub.s32 r130, r127, r129; +ld.shared.u32 r54, [r130]; +ld.shared.u32 r57, [r130+4]; +ld.shared.u32 r55, [r130+32]; +ld.shared.u32 r58, [r130+36]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r131, r121, 1, 1; +cvt.rn.f32.u32 f16, r131; +mul.f32 f17, f16, 0f3FC90FDB; +cos.approx.f32 f7, f17; +sin.approx.f32 f18, f17; +neg.f32 f8, f18; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +neg.f16x2 r75, r72; +} +{ +fma.rn.f16x2 r77, r59, r68, r75; +} +{ +mul.f16x2 r81, r59, r70; +} +{ +fma.rn.f16x2 r84, r62, r68, r81; +} +and.b32 r132, r128, 8; +add.s32 r133, r125, r132; +barrier.sync 0; +and.b32 r134, r123, 32; +add.s32 r135, r133, r134; +st.shared.u32 [r135], r53; +st.shared.u32 [r135+4], r56; +st.shared.u32 [r135+16], r77; +st.shared.u32 [r135+20], r84; +barrier.sync 0; +and.b32 r136, r128, 16; +sub.s32 r137, r135, r136; +ld.shared.u32 r106, [r137]; +ld.shared.u32 r109, [r137+4]; +ld.shared.u32 r107, [r137+32]; +ld.shared.u32 r110, [r137+36]; +{ +add.f16x2 %0, r106, r107; +} +{ +add.f16x2 %1, r109, r110; +} +{ +sub.f16x2 %2, r106, r107; +} +{ +sub.f16x2 %3, r109, r110; +} +})" + : 
"=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..e0db138f48f27 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp16_inv.hpp.inc @@ -0,0 +1,1065 @@ +#ifndef CUFFTDX_FFT_8_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_8_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<968, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<15>; +.reg .b32 r<183>; +.reg .f64 fd<15>; +.reg .b64 rd<2>; +{ +add.f16x2 r1, %29, %21; +} +{ +add.f16x2 r4, %30, %22; +} +{ +sub.f16x2 r7, %29, %21; +} +{ +sub.f16x2 r10, %30, %22; +} +{ +add.f16x2 r13, %17, %25; +} +{ +add.f16x2 r16, %18, %26; +} +{ +sub.f16x2 r19, %17, %25; +} +{ +sub.f16x2 r22, %18, %26; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +{ +add.f16x2 r51, %31, %23; +} +{ +add.f16x2 r54, %16, %24; +} +{ +sub.f16x2 r57, %31, %23; +} +{ +sub.f16x2 r60, %16, %24; +} +{ +add.f16x2 r63, %19, %27; +} +{ +add.f16x2 r66, %20, %28; +} +{ +sub.f16x2 r69, %19, %27; +} +{ +sub.f16x2 r72, %20, %28; +} +{ +neg.f16x2 r75, r72; +} +{ +add.f16x2 r77, r51, r63; +} +{ +add.f16x2 r80, r54, r66; +} +{ +sub.f16x2 r83, r51, r63; +} +{ 
+sub.f16x2 r86, r54, r66; +} +{ +add.f16x2 r89, r57, r75; +} +{ +add.f16x2 r92, r60, r69; +} +{ +sub.f16x2 r95, r57, r75; +} +{ +sub.f16x2 r98, r60, r69; +} +mov.f64 fd6, 0d3FE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs1, fd6; +} +{ +cvt.rn.f16.f64 rs2, fd6; +} +mov.f64 fd5, 0dBFE6A09E667F3BCD; +{ +cvt.rn.f16.f64 rs5, fd5; +} +{ +cvt.rn.f16.f64 rs6, fd6; +} +mov.b32 r115, {rs1, rs1}; +{ +mul.f16x2 r101, r89, r115; +} +mov.b32 r112, {rs2, rs2}; +{ +mul.f16x2 r104, r92, r112; +} +{ +sub.f16x2 r107, r101, r104; +} +{ +mul.f16x2 r110, r89, r112; +} +{ +fma.rn.f16x2 r113, r92, r115, r110; +} +{ +neg.f16x2 r117, r86; +} +mov.b32 r133, {rs5, rs5}; +{ +mul.f16x2 r119, r95, r133; +} +mov.b32 r130, {rs6, rs6}; +{ +mul.f16x2 r122, r98, r130; +} +{ +sub.f16x2 r125, r119, r122; +} +{ +mul.f16x2 r128, r95, r130; +} +{ +fma.rn.f16x2 r131, r98, r133, r128; +} +{ +add.f16x2 %0, r27, r77; +} +{ +add.f16x2 %1, r30, r80; +} +{ +sub.f16x2 %8, r27, r77; +} +{ +sub.f16x2 %9, r30, r80; +} +{ +add.f16x2 %2, r39, r107; +} +{ +add.f16x2 %3, r42, r113; +} +{ +sub.f16x2 %10, r39, r107; +} +{ +sub.f16x2 %11, r42, r113; +} +{ +add.f16x2 %4, r33, r117; +} +{ +add.f16x2 %5, r36, r83; +} +{ +sub.f16x2 %12, r33, r117; +} +{ +sub.f16x2 %13, r36, r83; +} +{ +add.f16x2 %6, r45, r125; +} +{ +add.f16x2 %7, r48, r131; +} +{ +sub.f16x2 %14, r45, r125; +} +{ +sub.f16x2 %15, r48, r131; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)): "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), 
"r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<969, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<201>; +.reg .b64 rd<2>; +mov.u32 r189, %tid.y; +shl.b32 r190, r189, 5; +mov.u32 r191, %8; +add.s32 r192, r191, r190; +mov.u32 r193, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r194, r193, 1; +shl.b32 r195, r193, 4; +and.b32 r196, r195, -32; +add.s32 r197, r192, r196; +cvt.rn.f32.u32 f11, r194; +mul.f32 f12, f11, 0f3F490FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; +neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, 
r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r198, r195, 16; +add.s32 r199, r197, r198; +st.shared.v4.f32 [r199], {r27, r61, r98, r135}; +barrier.sync 0; +mad.lo.s32 r200, r194, -12, r199; +ld.shared.u32 r166, [r200]; +ld.shared.u32 r178, [r200+8]; +ld.shared.u32 r167, [r200+16]; +ld.shared.u32 r179, [r200+24]; +barrier.sync 0; +st.shared.v4.f32 [r199], {r30, r70, r107, r144}; +barrier.sync 0; 
+ld.shared.u32 r169, [r200]; +ld.shared.u32 r181, [r200+8]; +ld.shared.u32 r170, [r200+16]; +ld.shared.u32 r182, [r200+24]; +{ +add.f16x2 %0, r166, r167; +} +{ +add.f16x2 %1, r169, r170; +} +{ +sub.f16x2 %4, r166, r167; +} +{ +sub.f16x2 %5, r169, r170; +} +{ +add.f16x2 %2, r178, r179; +} +{ +add.f16x2 %3, r181, r182; +} +{ +sub.f16x2 %6, r178, r179; +} +{ +sub.f16x2 %7, r181, r182; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<970, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<14>; +.reg .b32 r<201>; +.reg .b64 rd<2>; +mov.u32 r189, %tid.y; +shl.b32 r190, r189, 6; +mov.u32 r191, %8; +add.s32 r192, r191, r190; +mov.u32 r193, %tid.x; +{ +add.f16x2 r1, %9, %13; +} +{ +add.f16x2 r4, %10, %14; +} +{ +sub.f16x2 r7, %9, %13; +} +{ +sub.f16x2 r10, %10, %14; +} +{ +add.f16x2 r13, %11, %15; +} +{ +add.f16x2 r16, %12, %16; +} +{ +sub.f16x2 r19, %11, %15; +} +{ +sub.f16x2 r22, %12, %16; +} +{ +neg.f16x2 r25, r22; +} +{ +add.f16x2 r27, r1, r13; +} +{ +add.f16x2 r30, r4, r16; +} +{ +sub.f16x2 r33, r1, r13; +} +{ +sub.f16x2 r36, r4, r16; +} +{ +add.f16x2 r39, r7, r25; +} +{ +add.f16x2 r42, r10, r19; +} +{ +sub.f16x2 r45, r7, r25; +} +{ +sub.f16x2 r48, r10, r19; +} +and.b32 r194, r193, 1; +shl.b32 r195, r193, 5; +and.b32 r196, r195, -64; +add.s32 r197, r192, r196; +cvt.rn.f32.u32 f11, r194; +mul.f32 f12, f11, 0f3F490FDB; +cos.approx.f32 f1, f12; +sin.approx.f32 f13, f12; 
+neg.f32 f2, f13; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r51, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r54, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r56, {high, high}; +} +{ +mul.f16x2 r58, r42, r56; +} +{ +fma.rn.f16x2 r61, r39, r54, r58; +} +{ +mul.f16x2 r65, r39, r56; +} +{ +neg.f16x2 r68, r65; +} +{ +fma.rn.f16x2 r70, r42, r54, r68; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r74, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r76, {high, high}; +} +mov.f32 f7, 0fBF800000; +mov.f32 f8, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r78, {low, high}; +} +{ +mul.f16x2 r79, r76, r78; +} +{ +mul.f16x2 r82, r51, r74; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r85, {high, low}; +} +{ +fma.rn.f16x2 r87, r79, r85, r82; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r91, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r93, {high, high}; +} +{ +mul.f16x2 r95, r36, r93; +} +{ +fma.rn.f16x2 r98, r33, r91, r95; +} +{ +mul.f16x2 r102, r33, r93; +} +{ +neg.f16x2 r105, r102; +} +{ +fma.rn.f16x2 r107, r36, r91, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r111, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r51; +mov.b32 r113, {high, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r115, {low, high}; +} +{ +mul.f16x2 r116, r113, r115; +} +{ +mul.f16x2 r119, r87, r111; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r122, {high, low}; +} +{ +fma.rn.f16x2 r124, r116, r122, r119; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r128, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r124; +mov.b32 r130, {high, high}; +} +{ +mul.f16x2 r132, r48, r130; +} +{ +fma.rn.f16x2 r135, 
r45, r128, r132; +} +{ +mul.f16x2 r139, r45, r130; +} +{ +neg.f16x2 r142, r139; +} +{ +fma.rn.f16x2 r144, r48, r128, r142; +} +barrier.sync 0; +and.b32 r198, r195, 32; +add.s32 r199, r197, r198; +st.shared.v4.f32 [r199], {r27, r30, r61, r70}; +st.shared.v4.f32 [r199+16], {r98, r107, r135, r144}; +barrier.sync 0; +mad.lo.s32 r200, r194, -24, r199; +ld.shared.u32 r166, [r200]; +ld.shared.u32 r169, [r200+4]; +ld.shared.u32 r178, [r200+16]; +ld.shared.u32 r181, [r200+20]; +ld.shared.u32 r167, [r200+32]; +ld.shared.u32 r170, [r200+36]; +ld.shared.u32 r179, [r200+48]; +ld.shared.u32 r182, [r200+52]; +{ +add.f16x2 %0, r166, r167; +} +{ +add.f16x2 %1, r169, r170; +} +{ +sub.f16x2 %4, r166, r167; +} +{ +sub.f16x2 %5, r169, r170; +} +{ +add.f16x2 %2, r178, r179; +} +{ +add.f16x2 %3, r181, r182; +} +{ +sub.f16x2 %6, r178, r179; +} +{ +sub.f16x2 %7, r181, r182; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<971, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<19>; +.reg .b32 r<138>; +.reg .b64 rd<2>; +mov.u32 r117, %tid.y; +shl.b32 r118, r117, 5; +mov.u32 r119, %4; +add.s32 r120, r119, r118; +mov.u32 r121, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r122, r121, 3; +shl.b32 r123, r121, 3; +and.b32 r124, r123, -32; +add.s32 r125, r120, r124; +cvt.rn.f32.u32 f13, r122; +mul.f32 f14, 
f13, 0f3F490FDB; +cos.approx.f32 f1, f14; +sin.approx.f32 f15, f14; +neg.f32 f2, f15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r126, r123, 24; +add.s32 r127, r125, r126; +st.shared.v2.f32 [r127], {r1, r23}; +barrier.sync 0; +shl.b32 r128, r121, 2; +and.b32 r129, r128, 12; +sub.s32 r130, r127, r129; +ld.shared.u32 r54, [r130]; +ld.shared.u32 r55, [r130+16]; +barrier.sync 0; +st.shared.v2.f32 [r127], {r4, r32}; +barrier.sync 0; +ld.shared.u32 r57, [r130]; +ld.shared.u32 r58, [r130+16]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r131, r121, 1, 1; +and.b32 r132, r128, 4; +add.s32 r133, r125, r132; +cvt.rn.f32.u32 f16, r131; +mul.f32 f17, f16, 0f3FC90FDB; +cos.approx.f32 f7, f17; +sin.approx.f32 f18, f17; +neg.f32 f8, f18; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +barrier.sync 0; +and.b32 r134, r123, 16; +add.s32 r135, r133, r134; +st.shared.u32 [r135], r53; +st.shared.u32 [r135+8], r75; +barrier.sync 0; +and.b32 r136, r128, 8; +sub.s32 r137, r135, r136; +ld.shared.u32 r106, [r137]; +ld.shared.u32 r107, [r137+16]; +barrier.sync 0; +st.shared.u32 [r135], r56; 
+st.shared.u32 [r135+8], r84; +barrier.sync 0; +ld.shared.u32 r109, [r137]; +ld.shared.u32 r110, [r137+16]; +{ +add.f16x2 %0, r106, r107; +} +{ +add.f16x2 %1, r109, r110; +} +{ +sub.f16x2 %2, r106, r107; +} +{ +sub.f16x2 %3, r109, r110; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<972, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<19>; +.reg .b32 r<138>; +.reg .b64 rd<2>; +mov.u32 r117, %tid.y; +shl.b32 r118, r117, 6; +mov.u32 r119, %4; +add.s32 r120, r119, r118; +mov.u32 r121, %tid.x; +{ +add.f16x2 r1, %5, %7; +} +{ +add.f16x2 r4, %6, %8; +} +{ +sub.f16x2 r7, %5, %7; +} +{ +sub.f16x2 r10, %6, %8; +} +and.b32 r122, r121, 3; +shl.b32 r123, r121, 4; +and.b32 r124, r123, -64; +add.s32 r125, r120, r124; +cvt.rn.f32.u32 f13, r122; +mul.f32 f14, f13, 0f3F490FDB; +cos.approx.f32 f1, f14; +sin.approx.f32 f15, f14; +neg.f32 f2, f15; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f1; +cvt.rn.f16.f32 high, f2; +mov.b32 r13, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r16, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r13; +mov.b32 r18, {high, high}; +} +{ +mul.f16x2 r20, r10, r18; +} +{ +fma.rn.f16x2 r23, r7, r16, r20; +} +{ +mul.f16x2 r27, r7, r18; +} +{ +neg.f16x2 r30, r27; +} +{ +fma.rn.f16x2 r32, r10, r16, r30; +} +barrier.sync 0; +and.b32 r126, r123, 48; +add.s32 r127, r125, r126; +st.shared.v2.f32 [r127], {r1, r4}; +st.shared.v2.f32 [r127+8], {r23, r32}; +barrier.sync 0; +shl.b32 r128, r121, 3; +and.b32 r129, r128, 24; +sub.s32 r130, r127, r129; +ld.shared.u32 r54, [r130]; +ld.shared.u32 r57, [r130+4]; +ld.shared.u32 r55, [r130+32]; +ld.shared.u32 r58, 
[r130+36]; +{ +add.f16x2 r53, r54, r55; +} +{ +add.f16x2 r56, r57, r58; +} +{ +sub.f16x2 r59, r54, r55; +} +{ +sub.f16x2 r62, r57, r58; +} +bfe.u32 r131, r121, 1, 1; +cvt.rn.f32.u32 f16, r131; +mul.f32 f17, f16, 0f3FC90FDB; +cos.approx.f32 f7, f17; +sin.approx.f32 f18, f17; +neg.f32 f8, f18; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f7; +cvt.rn.f16.f32 high, f8; +mov.b32 r65, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r68, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r65; +mov.b32 r70, {high, high}; +} +{ +mul.f16x2 r72, r62, r70; +} +{ +fma.rn.f16x2 r75, r59, r68, r72; +} +{ +mul.f16x2 r79, r59, r70; +} +{ +neg.f16x2 r82, r79; +} +{ +fma.rn.f16x2 r84, r62, r68, r82; +} +and.b32 r132, r128, 8; +add.s32 r133, r125, r132; +barrier.sync 0; +and.b32 r134, r123, 32; +add.s32 r135, r133, r134; +st.shared.u32 [r135], r53; +st.shared.u32 [r135+4], r56; +st.shared.u32 [r135+16], r75; +st.shared.u32 [r135+20], r84; +barrier.sync 0; +and.b32 r136, r128, 16; +sub.s32 r137, r135, r136; +ld.shared.u32 r106, [r137]; +ld.shared.u32 r109, [r137+4]; +ld.shared.u32 r107, [r137+32]; +ld.shared.u32 r110, [r137+36]; +{ +add.f16x2 %0, r106, r107; +} +{ +add.f16x2 %1, r109, r110; +} +{ +sub.f16x2 %2, r106, r107; +} +{ +sub.f16x2 %3, r109, r110; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..48432f8223232 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_fwd.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_8_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_8_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<20, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<90>; +.reg .b64 rd<2>; +add.f32 f33, %16, %26; +add.f32 f34, %17, %28; +sub.f32 f35, %16, %26; +sub.f32 f36, %17, %28; +add.f32 f37, %21, %32; +add.f32 f38, %23, %33; +sub.f32 f39, %21, %32; +sub.f32 f40, %23, %33; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +add.f32 f45, f35, f40; +sub.f32 f46, f36, f39; +sub.f32 f47, f35, f40; +add.f32 f48, f36, f39; +add.f32 f49, %18, %29; +add.f32 f50, %20, %31; +sub.f32 f51, %18, %29; +sub.f32 f52, %20, %31; +add.f32 f53, %24, %34; +add.f32 f54, %25, %35; +sub.f32 f55, %24, %34; +sub.f32 f56, %25, %35; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +add.f32 f61, f51, f56; +sub.f32 f62, f52, f55; +sub.f32 f63, f51, f56; +add.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0fBF3504F3; +sub.f32 f67, f65, f66; +mul.f32 f68, f62, 0f3F3504F3; +fma.rn.f32 f69, f61, 0fBF3504F3, f68; +mul.f32 f70, f63, 0fBF3504F3; +mul.f32 f71, f64, 0fBF3504F3; +sub.f32 f72, f70, f71; +add.f32 f73, f70, f71; +add.f32 %1, f42, f58; +add.f32 %0, f41, f57; +add.f32 %3, f46, f69; +add.f32 %2, f45, f67; +sub.f32 %5, f44, f59; +add.f32 %4, f43, f60; +add.f32 %7, f48, f73; +add.f32 %6, f47, f72; +sub.f32 %9, f42, f58; +sub.f32 %8, f41, f57; +sub.f32 %11, f46, f69; +sub.f32 %10, f45, f67; +add.f32 %13, f44, f59; +sub.f32 %12, f43, f60; +sub.f32 %15, f48, f73; +sub.f32 %14, f47, f72; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<21, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +sub.f32 f31, f19, f24; +add.f32 f32, f20, f23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f33, f29; +mul.f32 f38, f34, f30; +sub.f32 f39, f37, f38; +mul.f32 f40, f33, f30; +fma.rn.f32 f41, f34, f29, f40; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f44, f27; +mul.f32 f48, f46, f28; +sub.f32 f49, f47, f48; +mul.f32 f50, f44, f28; +fma.rn.f32 f51, f46, f27, f50; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f54, f31; +mul.f32 f58, f56, f32; +sub.f32 f59, f57, f58; +mul.f32 f60, f54, f32; +fma.rn.f32 f61, f56, f31, f60; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -32; +add.s32 
r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f39, f49, f59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+8]; +ld.shared.f32 f64, [r13+16]; +ld.shared.f32 f65, [r13+24]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+8]; +ld.shared.f32 f68, [r13+16]; +ld.shared.f32 f69, [r13+24]; +add.f32 %0, f62, f64; +add.f32 %1, f66, f68; +add.f32 %2, f63, f65; +add.f32 %3, f67, f69; +sub.f32 %4, f62, f64; +sub.f32 %5, f66, f68; +sub.f32 %6, f63, f65; +sub.f32 %7, f67, f69; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<22, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<86>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +add.f32 f27, f19, f24; +sub.f32 f28, f20, f23; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -64; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f31, f27; +mul.f32 f36, f32, f28; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, f32, 
f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f40, f25; +mul.f32 f44, f42, f26; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f48, f29; +mul.f32 f52, f50, f30; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 32; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f32, f27, f37; +sub.f32 f57, f35, f36; +st.shared.v4.f32 [r12], {f55, f54, f57, f56}; +sub.f32 f58, f43, f44; +fma.rn.f32 f59, f42, f25, f45; +fma.rn.f32 f60, f50, f29, f53; +sub.f32 f61, f51, f52; +st.shared.v4.f32 [r12+16], {f58, f59, f61, f60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+16]; +ld.shared.v2.f32 {f70, f71}, [r13+32]; +ld.shared.v2.f32 {f74, f75}, [r13+48]; +add.f32 %1, f63, f71; +add.f32 %0, f62, f70; +add.f32 %3, f67, f75; +add.f32 %2, f66, f74; +sub.f32 %5, f63, f71; +sub.f32 %4, f62, f70; +sub.f32 %7, f67, f75; +sub.f32 %6, f66, f74; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<23, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<47>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %7, %9; +add.f32 f10, %8, %10; +sub.f32 f11, %7, %9; +sub.f32 f12, %8, %10; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, f14}, 
[rd5]; +mul.f32 f17, f13, f11; +mul.f32 f18, f14, f12; +sub.f32 f19, f17, f18; +mul.f32 f20, f13, f12; +fma.rn.f32 f21, f14, f11, f20; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 24; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f19}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 12; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+16]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+16]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f30, f28; +mul.f32 f35, f31, f29; +sub.f32 f36, f34, f35; +mul.f32 f37, f30, f29; +fma.rn.f32 f38, f31, f28, f37; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f36; +barrier.sync 0; +and.b32 r19, r11, 8; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+16]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+16]; +add.f32 %0, f39, f40; +add.f32 %1, f41, f42; +sub.f32 %2, f39, f40; +sub.f32 %3, f41, f42; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<24, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %7, %9; +sub.f32 f10, %8, %10; +shl.b32 r6, r5, 4; +and.b32 
r7, r6, -64; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f11, f9; +mul.f32 f16, f12, f10; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 48; +add.s32 r11, r8, r10; +add.f32 f18, %8, %10; +add.f32 f19, %7, %9; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f15, f16; +fma.rn.f32 f21, f12, f9, f17; +st.shared.v2.f32 [r11+8], {f20, f21}; +barrier.sync 0; +and.b32 r12, r9, 24; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+32]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f32, f30; +mul.f32 f37, f33, f31; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f33, f30, f38; +sub.f32 f42, f36, f37; +st.shared.v2.f32 [r18+16], {f42, f41}; +barrier.sync 0; +and.b32 r19, r9, 16; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+32]; +add.f32 %1, f44, f48; +add.f32 %0, f43, f47; +sub.f32 %3, f44, f48; +sub.f32 %2, f43, f47; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..63a206ba325db --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp32_inv.hpp.inc @@ -0,0 +1,417 @@ +#ifndef CUFFTDX_FFT_8_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_8_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<222, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<90>; +.reg .b64 rd<2>; +add.f32 f33, %16, %26; +add.f32 f34, %17, %28; +sub.f32 f35, %16, %26; +sub.f32 f36, %17, %28; +add.f32 f37, %21, %32; +add.f32 f38, %23, %33; +sub.f32 f39, %21, %32; +sub.f32 f40, %23, %33; +add.f32 f41, f33, f37; +add.f32 f42, f34, f38; +sub.f32 f43, f33, f37; +sub.f32 f44, f34, f38; +sub.f32 f45, f35, f40; +add.f32 f46, f36, f39; +add.f32 f47, f35, f40; +sub.f32 f48, f36, f39; +add.f32 f49, %18, %29; +add.f32 f50, %20, %31; +sub.f32 f51, %18, %29; +sub.f32 f52, %20, %31; +add.f32 f53, %24, %34; +add.f32 f54, %25, %35; +sub.f32 f55, %24, %34; +sub.f32 f56, %25, %35; +add.f32 f57, f49, f53; +add.f32 f58, f50, f54; +sub.f32 f59, f49, f53; +sub.f32 f60, f50, f54; +sub.f32 f61, f51, f56; +add.f32 f62, f52, f55; +add.f32 f63, f51, f56; +sub.f32 f64, f52, f55; +mul.f32 f65, f61, 0f3F3504F3; +mul.f32 f66, f62, 0f3F3504F3; +sub.f32 f67, f65, f66; +add.f32 f68, f65, f66; +mul.f32 f69, f63, 0fBF3504F3; +mul.f32 f70, f64, 0f3F3504F3; +sub.f32 f71, f69, f70; +mul.f32 f72, f64, 0fBF3504F3; +fma.rn.f32 f73, f63, 0f3F3504F3, f72; +add.f32 %1, f42, f58; +add.f32 %0, f41, f57; +add.f32 %3, f46, f68; +add.f32 %2, f45, f67; +add.f32 %5, f44, f59; +sub.f32 %4, f43, f60; +add.f32 %7, f48, f73; +add.f32 %6, f47, f71; +sub.f32 %9, f42, f58; +sub.f32 %8, f41, f57; +sub.f32 %11, f46, f68; +sub.f32 %10, f45, f67; +sub.f32 %13, f44, f59; +add.f32 %12, f43, f60; +sub.f32 %15, f48, f73; +sub.f32 %14, f47, f71; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), 
"=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<223, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<78>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +add.f32 f25, f17, f21; +add.f32 f26, f18, f22; +sub.f32 f27, f17, f21; +sub.f32 f28, f18, f22; +sub.f32 f29, f19, f24; +add.f32 f30, f20, f23; +add.f32 f31, f19, f24; +sub.f32 f32, f20, f23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 3; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f33, f34}, [rd5]; +mul.f32 f37, f30, f34; +fma.rn.f32 f38, f33, f29, f37; +mul.f32 f39, f29, f34; +mul.f32 f40, f33, f30; +sub.f32 f41, f40, f39; +mul.f32 f42, f33, f33; +mul.f32 f43, f34, f34; +sub.f32 f44, f42, f43; +mul.f32 f45, f34, f33; +fma.rn.f32 f46, f34, f33, f45; +mul.f32 f47, f28, f46; +fma.rn.f32 f48, f44, f27, f47; +mul.f32 f49, f27, f46; +mul.f32 f50, f44, f28; +sub.f32 f51, f50, f49; +mul.f32 f52, f33, f44; +mul.f32 f53, f34, f46; +sub.f32 f54, f52, f53; +mul.f32 f55, f33, f46; +fma.rn.f32 f56, f34, f44, f55; +mul.f32 f57, f32, f56; +fma.rn.f32 f58, f54, f31, f57; +mul.f32 f59, f31, f56; +mul.f32 f60, f54, f32; +sub.f32 f61, f60, f59; +shl.b32 r8, r5, 4; +and.b32 r9, r8, -32; +add.s32 
r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 16; +add.s32 r12, r10, r11; +st.shared.v4.f32 [r12], {f25, f38, f48, f58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -12, r12; +ld.shared.f32 f62, [r13]; +ld.shared.f32 f63, [r13+8]; +ld.shared.f32 f64, [r13+16]; +ld.shared.f32 f65, [r13+24]; +barrier.sync 0; +st.shared.v4.f32 [r12], {f26, f41, f51, f61}; +barrier.sync 0; +ld.shared.f32 f66, [r13]; +ld.shared.f32 f67, [r13+8]; +ld.shared.f32 f68, [r13+16]; +ld.shared.f32 f69, [r13+24]; +add.f32 %0, f62, f64; +add.f32 %1, f66, f68; +add.f32 %2, f63, f65; +add.f32 %3, f67, f69; +sub.f32 %4, f62, f64; +sub.f32 %5, f66, f68; +sub.f32 %6, f63, f65; +sub.f32 %7, f67, f69; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<224, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<86>; +.reg .b32 r<14>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f17, %10, %15; +add.f32 f18, %11, %17; +sub.f32 f19, %10, %15; +sub.f32 f20, %11, %17; +add.f32 f21, %12, %18; +add.f32 f22, %14, %19; +sub.f32 f23, %12, %18; +sub.f32 f24, %14, %19; +sub.f32 f25, f17, f21; +sub.f32 f26, f18, f22; +sub.f32 f27, f19, f24; +add.f32 f28, f20, f23; +add.f32 f29, f19, f24; +sub.f32 f30, f20, f23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 5; +and.b32 r8, r7, -64; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 3; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 8; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f31, f32}, [rd5]; +mul.f32 f35, f28, f32; +mul.f32 f36, f27, f32; +mul.f32 f37, f31, f28; +mul.f32 f38, f31, f31; +mul.f32 f39, 
f32, f32; +sub.f32 f40, f38, f39; +mul.f32 f41, f32, f31; +fma.rn.f32 f42, f32, f31, f41; +mul.f32 f43, f26, f42; +mul.f32 f44, f25, f42; +mul.f32 f45, f40, f26; +mul.f32 f46, f31, f40; +mul.f32 f47, f32, f42; +sub.f32 f48, f46, f47; +mul.f32 f49, f31, f42; +fma.rn.f32 f50, f32, f40, f49; +mul.f32 f51, f30, f50; +mul.f32 f52, f29, f50; +mul.f32 f53, f48, f30; +barrier.sync 0; +and.b32 r11, r7, 32; +add.s32 r12, r9, r11; +add.f32 f54, f18, f22; +add.f32 f55, f17, f21; +fma.rn.f32 f56, f31, f27, f35; +sub.f32 f57, f37, f36; +st.shared.v4.f32 [r12], {f55, f54, f56, f57}; +sub.f32 f58, f45, f44; +fma.rn.f32 f59, f40, f25, f43; +fma.rn.f32 f60, f48, f29, f51; +sub.f32 f61, f53, f52; +st.shared.v4.f32 [r12+16], {f59, f58, f60, f61}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.v2.f32 {f62, f63}, [r13]; +ld.shared.v2.f32 {f66, f67}, [r13+16]; +ld.shared.v2.f32 {f70, f71}, [r13+32]; +ld.shared.v2.f32 {f74, f75}, [r13+48]; +add.f32 %1, f63, f71; +add.f32 %0, f62, f70; +add.f32 %3, f67, f75; +add.f32 %2, f66, f74; +sub.f32 %5, f63, f71; +sub.f32 %4, f62, f70; +sub.f32 %7, f67, f75; +sub.f32 %6, f66, f74; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y): "r"(smem), "l"(lut_sp_4_8), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<225, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<47>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 5; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f32 f9, %7, %9; +add.f32 f10, %8, %10; +sub.f32 f11, %7, %9; +sub.f32 f12, %8, %10; +shl.b32 r6, r5, 3; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f13, 
f14}, [rd5]; +mul.f32 f17, f12, f14; +fma.rn.f32 f18, f13, f11, f17; +mul.f32 f19, f11, f14; +mul.f32 f20, f13, f12; +sub.f32 f21, f20, f19; +and.b32 r7, r6, -32; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 24; +add.s32 r10, r8, r9; +st.shared.v2.f32 [r10], {f9, f18}; +barrier.sync 0; +shl.b32 r11, r5, 2; +and.b32 r12, r11, 12; +sub.s32 r13, r10, r12; +ld.shared.f32 f22, [r13]; +ld.shared.f32 f23, [r13+16]; +barrier.sync 0; +st.shared.v2.f32 [r10], {f10, f21}; +barrier.sync 0; +ld.shared.f32 f24, [r13]; +ld.shared.f32 f25, [r13+16]; +add.f32 f26, f22, f23; +add.f32 f27, f24, f25; +sub.f32 f28, f22, f23; +sub.f32 f29, f24, f25; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f30, f31}, [rd8]; +mul.f32 f34, f29, f31; +fma.rn.f32 f35, f30, f28, f34; +mul.f32 f36, f28, f31; +mul.f32 f37, f30, f29; +sub.f32 f38, f37, f36; +and.b32 r15, r11, 4; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 16; +add.s32 r18, r16, r17; +st.shared.f32 [r18], f26; +st.shared.f32 [r18+8], f35; +barrier.sync 0; +and.b32 r19, r11, 8; +sub.s32 r20, r18, r19; +ld.shared.f32 f39, [r20]; +ld.shared.f32 f40, [r20+16]; +barrier.sync 0; +st.shared.f32 [r18], f27; +st.shared.f32 [r18+8], f38; +barrier.sync 0; +ld.shared.f32 f41, [r20]; +ld.shared.f32 f42, [r20+16]; +add.f32 %0, f39, f40; +add.f32 %1, f41, f42; +sub.f32 %2, f39, f40; +sub.f32 %3, f41, f42; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<226, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<55>; +.reg .b32 r<21>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f32 f9, %7, %9; +sub.f32 f10, %8, %10; +shl.b32 r6, r5, 4; 
+and.b32 r7, r6, -64; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 3; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 24; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f32 {f11, f12}, [rd5]; +mul.f32 f15, f10, f12; +mul.f32 f16, f9, f12; +mul.f32 f17, f11, f10; +barrier.sync 0; +and.b32 r10, r6, 48; +add.s32 r11, r8, r10; +add.f32 f18, %8, %10; +add.f32 f19, %7, %9; +st.shared.v2.f32 [r11], {f19, f18}; +sub.f32 f20, f17, f16; +fma.rn.f32 f21, f11, f9, f15; +st.shared.v2.f32 [r11+8], {f21, f20}; +barrier.sync 0; +and.b32 r12, r9, 24; +sub.s32 r13, r11, r12; +ld.shared.v2.f32 {f22, f23}, [r13]; +ld.shared.v2.f32 {f26, f27}, [r13+32]; +sub.f32 f30, f22, f26; +sub.f32 f31, f23, f27; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 8; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f32 {f32, f33}, [rd8]; +mul.f32 f36, f31, f33; +mul.f32 f37, f30, f33; +mul.f32 f38, f32, f31; +and.b32 r15, r9, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32; +add.s32 r18, r16, r17; +add.f32 f39, f23, f27; +add.f32 f40, f22, f26; +st.shared.v2.f32 [r18], {f40, f39}; +fma.rn.f32 f41, f32, f30, f36; +sub.f32 f42, f38, f37; +st.shared.v2.f32 [r18+16], {f41, f42}; +barrier.sync 0; +and.b32 r19, r9, 16; +sub.s32 r20, r18, r19; +ld.shared.v2.f32 {f43, f44}, [r20]; +ld.shared.v2.f32 {f47, f48}, [r20+32]; +add.f32 %1, f44, f48; +add.f32 %0, f43, f47; +sub.f32 %3, f44, f48; +sub.f32 %2, f43, f47; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y): "r"(smem), "l"(lut_sp_2_8), "l"(lut_sp_2_4), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..2c2e3866b92c2 --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_fwd.hpp.inc @@ -0,0 +1,413 @@ +#ifndef CUFFTDX_FFT_8_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_8_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<424, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<90>; +.reg .b64 rd<2>; +add.f64 fd33, %16, %26; +add.f64 fd34, %17, %28; +sub.f64 fd35, %16, %26; +sub.f64 fd36, %17, %28; +add.f64 fd37, %21, %32; +add.f64 fd38, %23, %33; +sub.f64 fd39, %21, %32; +sub.f64 fd40, %23, %33; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +add.f64 fd45, fd35, fd40; +sub.f64 fd46, fd36, fd39; +sub.f64 fd47, fd35, fd40; +add.f64 fd48, fd36, fd39; +add.f64 fd49, %18, %29; +add.f64 fd50, %20, %31; +sub.f64 fd51, %18, %29; +sub.f64 fd52, %20, %31; +add.f64 fd53, %24, %34; +add.f64 fd54, %25, %35; +sub.f64 fd55, %24, %34; +sub.f64 fd56, %25, %35; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +add.f64 fd61, fd51, fd56; +sub.f64 fd62, fd52, fd55; +sub.f64 fd63, fd51, fd56; +add.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0dBFE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +mul.f64 fd68, fd62, 0d3FE6A09E667F3BCD; +fma.rn.f64 fd69, fd61, 0dBFE6A09E667F3BCD, fd68; +mul.f64 fd70, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd71, fd64, 0dBFE6A09E667F3BCD; +sub.f64 fd72, fd70, fd71; +add.f64 fd73, fd70, fd71; +add.f64 %1, fd42, fd58; +add.f64 %0, fd41, fd57; +add.f64 %3, fd46, fd69; +add.f64 %2, fd45, fd67; +sub.f64 %5, fd44, fd59; +add.f64 %4, fd43, fd60; +add.f64 %7, fd48, fd73; +add.f64 %6, fd47, fd72; +sub.f64 %9, fd42, fd58; +sub.f64 %8, fd41, fd57; +sub.f64 %11, fd46, fd69; +sub.f64 %10, fd45, fd67; +add.f64 %13, fd44, fd59; +sub.f64 %12, fd43, fd60; +sub.f64 %15, fd48, fd73; +sub.f64 %14, fd47, fd72; 
+})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<425, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<77>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +sub.f64 fd31, fd19, fd24; +add.f64 fd32, fd20, fd23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, [rd5]; +mul.f64 fd37, fd33, fd29; +mul.f64 fd38, fd34, fd30; +sub.f64 fd39, fd37, fd38; +mul.f64 fd40, fd33, fd30; +fma.rn.f64 fd41, fd34, fd29, fd40; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd44, fd27; +mul.f64 fd48, fd46, fd28; +sub.f64 fd49, fd47, fd48; +mul.f64 fd50, fd44, fd28; +fma.rn.f64 fd51, fd46, fd27, fd50; +ld.global.v2.f64 {fd52, fd53}, [rd5+32]; +mul.f64 fd56, 
fd52, fd31; +mul.f64 fd57, fd53, fd32; +sub.f64 fd58, fd56, fd57; +mul.f64 fd59, fd52, fd32; +fma.rn.f64 fd60, fd53, fd31, fd59; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd39}; +st.shared.v2.f64 [r12+16], {fd49, fd58}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+16]; +ld.shared.f64 fd63, [r13+32]; +ld.shared.f64 fd64, [r13+48]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+16]; +ld.shared.f64 fd67, [r13+32]; +ld.shared.f64 fd68, [r13+48]; +add.f64 %0, fd61, fd63; +add.f64 %1, fd65, fd67; +add.f64 %2, fd62, fd64; +add.f64 %3, fd66, fd68; +sub.f64 %4, fd61, fd63; +sub.f64 %5, fd65, fd67; +sub.f64 %6, fd62, fd64; +sub.f64 %7, fd66, fd68; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<426, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<47>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %7, %9; +add.f64 fd10, %8, %10; +sub.f64 fd11, %7, %9; +sub.f64 fd12, %8, %10; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd13, fd11; +mul.f64 fd18, fd14, fd12; +sub.f64 fd19, fd17, fd18; +mul.f64 fd20, fd13, fd12; +fma.rn.f64 fd21, fd14, fd11, fd20; +and.b32 r7, r6, 
-64; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 48; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd19}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 24; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+32]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+32]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd30, fd28; +mul.f64 fd35, fd31, fd29; +sub.f64 fd36, fd34, fd35; +mul.f64 fd37, fd30, fd29; +fma.rn.f64 fd38, fd31, fd28, fd37; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd36; +barrier.sync 0; +and.b32 r19, r11, 16; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+32]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+32]; +add.f64 %0, fd39, fd40; +add.f64 %1, fd41, fd42; +sub.f64 %2, fd39, fd40; +sub.f64 %3, fd41, fd42; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<427, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<85>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 
fd23, %12, %18; +sub.f64 fd24, %14, %19; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +add.f64 fd27, fd19, fd24; +sub.f64 fd28, fd20, fd23; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd31, fd27; +mul.f64 fd36, fd32, fd28; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd40, fd25; +mul.f64 fd44, fd42, fd26; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+32]; +mul.f64 fd50, fd46, fd29; +mul.f64 fd51, fd47, fd30; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 64; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd32, fd27, fd37; +sub.f64 fd56, fd35, fd36; +st.shared.v2.f64 [r12+16], {fd56, fd55}; +sub.f64 fd57, fd43, fd44; +fma.rn.f64 fd58, fd42, fd25, fd45; +st.shared.v2.f64 [r12+32], {fd57, fd58}; +fma.rn.f64 fd59, fd47, fd29, fd52; +sub.f64 fd60, fd50, fd51; +st.shared.v2.f64 [r12+48], {fd60, fd59}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+32]; +ld.shared.v2.f64 {fd69, fd70}, [r13+64]; +ld.shared.v2.f64 {fd73, fd74}, [r13+96]; +add.f64 %1, fd62, fd70; +add.f64 %0, fd61, fd69; +add.f64 %3, fd66, fd74; +add.f64 %2, fd65, fd73; +sub.f64 %5, fd62, fd70; +sub.f64 %4, fd61, fd69; +sub.f64 %7, fd66, fd74; +sub.f64 %6, fd65, fd73; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), 
"d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<428, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<55>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %7, %9; +sub.f64 fd10, %8, %10; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd11, fd9; +mul.f64 fd16, fd12, fd10; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 96; +add.s32 r11, r8, r10; +add.f64 fd18, %8, %10; +add.f64 fd19, %7, %9; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd15, fd16; +fma.rn.f64 fd21, fd12, fd9, fd17; +st.shared.v2.f64 [r11+16], {fd20, fd21}; +barrier.sync 0; +and.b32 r12, r9, 48; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+64]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd32, fd30; +mul.f64 fd37, fd33, fd31; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 64; +add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd33, fd30, fd38; +sub.f64 fd42, fd36, fd37; +st.shared.v2.f64 [r18+32], {fd42, fd41}; +barrier.sync 0; +and.b32 r19, r9, 32; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+64]; +add.f64 %1, fd44, fd48; +add.f64 %0, fd43, fd47; +sub.f64 %3, fd44, fd48; +sub.f64 %2, fd43, fd47; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), 
"=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f5a27dbcef523 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_8_fp64_inv.hpp.inc @@ -0,0 +1,413 @@ +#ifndef CUFFTDX_FFT_8_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_8_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<595, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<90>; +.reg .b64 rd<2>; +add.f64 fd33, %16, %26; +add.f64 fd34, %17, %28; +sub.f64 fd35, %16, %26; +sub.f64 fd36, %17, %28; +add.f64 fd37, %21, %32; +add.f64 fd38, %23, %33; +sub.f64 fd39, %21, %32; +sub.f64 fd40, %23, %33; +add.f64 fd41, fd33, fd37; +add.f64 fd42, fd34, fd38; +sub.f64 fd43, fd33, fd37; +sub.f64 fd44, fd34, fd38; +sub.f64 fd45, fd35, fd40; +add.f64 fd46, fd36, fd39; +add.f64 fd47, fd35, fd40; +sub.f64 fd48, fd36, fd39; +add.f64 fd49, %18, %29; +add.f64 fd50, %20, %31; +sub.f64 fd51, %18, %29; +sub.f64 fd52, %20, %31; +add.f64 fd53, %24, %34; +add.f64 fd54, %25, %35; +sub.f64 fd55, %24, %34; +sub.f64 fd56, %25, %35; +add.f64 fd57, fd49, fd53; +add.f64 fd58, fd50, fd54; +sub.f64 fd59, fd49, fd53; +sub.f64 fd60, fd50, fd54; +sub.f64 fd61, fd51, fd56; +add.f64 fd62, fd52, fd55; +add.f64 fd63, fd51, fd56; +sub.f64 fd64, fd52, fd55; +mul.f64 fd65, fd61, 0d3FE6A09E667F3BCD; +mul.f64 fd66, fd62, 0d3FE6A09E667F3BCD; +sub.f64 fd67, fd65, fd66; +add.f64 fd68, fd65, fd66; +mul.f64 fd69, fd63, 0dBFE6A09E667F3BCD; +mul.f64 fd70, fd64, 0d3FE6A09E667F3BCD; +sub.f64 fd71, fd69, fd70; 
+mul.f64 fd72, fd64, 0dBFE6A09E667F3BCD; +fma.rn.f64 fd73, fd63, 0d3FE6A09E667F3BCD, fd72; +add.f64 %1, fd42, fd58; +add.f64 %0, fd41, fd57; +add.f64 %3, fd46, fd68; +add.f64 %2, fd45, fd67; +add.f64 %5, fd44, fd59; +sub.f64 %4, fd43, fd60; +add.f64 %7, fd48, fd73; +add.f64 %6, fd47, fd71; +sub.f64 %9, fd42, fd58; +sub.f64 %8, fd41, fd57; +sub.f64 %11, fd46, fd68; +sub.f64 %10, fd45, fd67; +sub.f64 %13, fd44, fd59; +add.f64 %12, fd43, fd60; +sub.f64 %15, fd48, fd73; +sub.f64 %14, fd47, fd71; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<596, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<77>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +add.f64 fd25, fd17, fd21; +add.f64 fd26, fd18, fd22; +sub.f64 fd27, fd17, fd21; +sub.f64 fd28, fd18, fd22; +sub.f64 fd29, fd19, fd24; +add.f64 fd30, fd20, fd23; +add.f64 fd31, fd19, fd24; +sub.f64 fd32, fd20, fd23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 4; +cvt.u64.u32 rd2, r7; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd33, fd34}, 
[rd5]; +mul.f64 fd37, fd30, fd34; +fma.rn.f64 fd38, fd33, fd29, fd37; +mul.f64 fd39, fd29, fd34; +mul.f64 fd40, fd33, fd30; +sub.f64 fd41, fd40, fd39; +mul.f64 fd42, fd33, fd33; +mul.f64 fd43, fd34, fd34; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd34, fd33; +fma.rn.f64 fd46, fd34, fd33, fd45; +mul.f64 fd47, fd28, fd46; +fma.rn.f64 fd48, fd44, fd27, fd47; +mul.f64 fd49, fd27, fd46; +mul.f64 fd50, fd44, fd28; +sub.f64 fd51, fd50, fd49; +ld.global.v2.f64 {fd52, fd53}, [rd5+32]; +mul.f64 fd56, fd32, fd53; +fma.rn.f64 fd57, fd52, fd31, fd56; +mul.f64 fd58, fd31, fd53; +mul.f64 fd59, fd52, fd32; +sub.f64 fd60, fd59, fd58; +shl.b32 r8, r5, 5; +and.b32 r9, r8, -64; +add.s32 r10, r4, r9; +barrier.sync 0; +and.b32 r11, r8, 32; +add.s32 r12, r10, r11; +st.shared.v2.f64 [r12], {fd25, fd38}; +st.shared.v2.f64 [r12+16], {fd48, fd57}; +barrier.sync 0; +mad.lo.s32 r13, r6, -24, r12; +ld.shared.f64 fd61, [r13]; +ld.shared.f64 fd62, [r13+16]; +ld.shared.f64 fd63, [r13+32]; +ld.shared.f64 fd64, [r13+48]; +barrier.sync 0; +st.shared.v2.f64 [r12], {fd26, fd41}; +st.shared.v2.f64 [r12+16], {fd51, fd60}; +barrier.sync 0; +ld.shared.f64 fd65, [r13]; +ld.shared.f64 fd66, [r13+16]; +ld.shared.f64 fd67, [r13+32]; +ld.shared.f64 fd68, [r13+48]; +add.f64 %0, fd61, fd63; +add.f64 %1, fd65, fd67; +add.f64 %2, fd62, fd64; +add.f64 %3, fd66, fd68; +sub.f64 %4, fd61, fd63; +sub.f64 %5, fd65, fd67; +sub.f64 %6, fd62, fd64; +sub.f64 %7, fd66, fd68; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<597, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<47>; +.reg .b64 rd<9>; 
+mov.u32 r1, %tid.y; +shl.b32 r2, r1, 6; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd9, %7, %9; +add.f64 fd10, %8, %10; +sub.f64 fd11, %7, %9; +sub.f64 fd12, %8, %10; +shl.b32 r6, r5, 4; +cvt.u64.u32 rd2, r6; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd13, fd14}, [rd5]; +mul.f64 fd17, fd12, fd14; +fma.rn.f64 fd18, fd13, fd11, fd17; +mul.f64 fd19, fd11, fd14; +mul.f64 fd20, fd13, fd12; +sub.f64 fd21, fd20, fd19; +and.b32 r7, r6, -64; +add.s32 r8, r4, r7; +barrier.sync 0; +and.b32 r9, r6, 48; +add.s32 r10, r8, r9; +st.shared.v2.f64 [r10], {fd9, fd18}; +barrier.sync 0; +shl.b32 r11, r5, 3; +and.b32 r12, r11, 24; +sub.s32 r13, r10, r12; +ld.shared.f64 fd22, [r13]; +ld.shared.f64 fd23, [r13+32]; +barrier.sync 0; +st.shared.v2.f64 [r10], {fd10, fd21}; +barrier.sync 0; +ld.shared.f64 fd24, [r13]; +ld.shared.f64 fd25, [r13+32]; +add.f64 fd26, fd22, fd23; +add.f64 fd27, fd24, fd25; +sub.f64 fd28, fd22, fd23; +sub.f64 fd29, fd24, fd25; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd30, fd31}, [rd8]; +mul.f64 fd34, fd29, fd31; +fma.rn.f64 fd35, fd30, fd28, fd34; +mul.f64 fd36, fd28, fd31; +mul.f64 fd37, fd30, fd29; +sub.f64 fd38, fd37, fd36; +and.b32 r15, r11, 8; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 32; +add.s32 r18, r16, r17; +st.shared.f64 [r18], fd26; +st.shared.f64 [r18+16], fd35; +barrier.sync 0; +and.b32 r19, r11, 16; +sub.s32 r20, r18, r19; +ld.shared.f64 fd39, [r20]; +ld.shared.f64 fd40, [r20+32]; +barrier.sync 0; +st.shared.f64 [r18], fd27; +st.shared.f64 [r18+16], fd38; +barrier.sync 0; +ld.shared.f64 fd41, [r20]; +ld.shared.f64 fd42, [r20+32]; +add.f64 %0, fd39, fd40; +add.f64 %1, fd41, fd42; +sub.f64 %2, fd39, fd40; +sub.f64 %3, fd41, fd42; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), 
"d"(rmem[1].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<598, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<14>; +.reg .f64 fd<85>; +.reg .b64 rd<6>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %8; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +add.f64 fd17, %10, %15; +add.f64 fd18, %11, %17; +sub.f64 fd19, %10, %15; +sub.f64 fd20, %11, %17; +add.f64 fd21, %12, %18; +add.f64 fd22, %14, %19; +sub.f64 fd23, %12, %18; +sub.f64 fd24, %14, %19; +sub.f64 fd25, fd17, fd21; +sub.f64 fd26, fd18, fd22; +sub.f64 fd27, fd19, fd24; +add.f64 fd28, fd20, fd23; +add.f64 fd29, fd19, fd24; +sub.f64 fd30, fd20, fd23; +and.b32 r6, r5, 1; +shl.b32 r7, r5, 6; +and.b32 r8, r7, -128; +add.s32 r9, r4, r8; +shl.b32 r10, r5, 4; +cvt.u64.u32 rd2, r10; +and.b64 rd3, rd2, 16; +mov.u64 rd4, %9; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd31, fd32}, [rd5]; +mul.f64 fd35, fd28, fd32; +mul.f64 fd36, fd27, fd32; +mul.f64 fd37, fd31, fd28; +mul.f64 fd38, fd31, fd31; +mul.f64 fd39, fd32, fd32; +sub.f64 fd40, fd38, fd39; +mul.f64 fd41, fd32, fd31; +fma.rn.f64 fd42, fd32, fd31, fd41; +mul.f64 fd43, fd26, fd42; +mul.f64 fd44, fd25, fd42; +mul.f64 fd45, fd40, fd26; +ld.global.v2.f64 {fd46, fd47}, [rd5+32]; +mul.f64 fd50, fd30, fd47; +mul.f64 fd51, fd29, fd47; +mul.f64 fd52, fd46, fd30; +barrier.sync 0; +and.b32 r11, r7, 64; +add.s32 r12, r9, r11; +add.f64 fd53, fd18, fd22; +add.f64 fd54, fd17, fd21; +st.shared.v2.f64 [r12], {fd54, fd53}; +fma.rn.f64 fd55, fd31, fd27, fd35; +sub.f64 fd56, fd37, fd36; +st.shared.v2.f64 [r12+16], {fd55, fd56}; +sub.f64 fd57, fd45, fd44; +fma.rn.f64 fd58, fd40, fd25, fd43; +st.shared.v2.f64 [r12+32], {fd58, fd57}; +fma.rn.f64 fd59, fd46, fd29, fd50; +sub.f64 fd60, fd52, fd51; +st.shared.v2.f64 [r12+48], {fd59, fd60}; +barrier.sync 0; +mad.lo.s32 r13, r6, -48, r12; +ld.shared.v2.f64 {fd61, fd62}, [r13]; +ld.shared.v2.f64 {fd65, fd66}, [r13+32]; +ld.shared.v2.f64 {fd69, fd70}, 
[r13+64]; +ld.shared.v2.f64 {fd73, fd74}, [r13+96]; +add.f64 %1, fd62, fd70; +add.f64 %0, fd61, fd69; +add.f64 %3, fd66, fd74; +add.f64 %2, fd65, fd73; +sub.f64 %5, fd62, fd70; +sub.f64 %4, fd61, fd69; +sub.f64 %7, fd66, fd74; +sub.f64 %6, fd65, fd73; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y): "r"(smem), "l"(lut_dp_4_8), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<599, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<21>; +.reg .f64 fd<55>; +.reg .b64 rd<9>; +mov.u32 r1, %tid.y; +shl.b32 r2, r1, 7; +mov.u32 r3, %4; +add.s32 r4, r3, r2; +mov.u32 r5, %tid.x; +sub.f64 fd9, %7, %9; +sub.f64 fd10, %8, %10; +shl.b32 r6, r5, 5; +and.b32 r7, r6, -128; +add.s32 r8, r4, r7; +shl.b32 r9, r5, 4; +cvt.u64.u32 rd2, r9; +and.b64 rd3, rd2, 48; +mov.u64 rd4, %5; +add.s64 rd5, rd4, rd3; +ld.global.v2.f64 {fd11, fd12}, [rd5]; +mul.f64 fd15, fd10, fd12; +mul.f64 fd16, fd9, fd12; +mul.f64 fd17, fd11, fd10; +barrier.sync 0; +and.b32 r10, r6, 96; +add.s32 r11, r8, r10; +add.f64 fd18, %8, %10; +add.f64 fd19, %7, %9; +st.shared.v2.f64 [r11], {fd19, fd18}; +sub.f64 fd20, fd17, fd16; +fma.rn.f64 fd21, fd11, fd9, fd15; +st.shared.v2.f64 [r11+16], {fd21, fd20}; +barrier.sync 0; +and.b32 r12, r9, 48; +sub.s32 r13, r11, r12; +ld.shared.v2.f64 {fd22, fd23}, [r13]; +ld.shared.v2.f64 {fd26, fd27}, [r13+64]; +sub.f64 fd30, fd22, fd26; +sub.f64 fd31, fd23, fd27; +bfe.u32 r14, r5, 1, 1; +mul.wide.u32 rd6, r14, 16; +mov.u64 rd7, %6; +add.s64 rd8, rd7, rd6; +ld.global.v2.f64 {fd32, fd33}, [rd8]; +mul.f64 fd36, fd31, fd33; +mul.f64 fd37, fd30, fd33; +mul.f64 fd38, fd32, fd31; +and.b32 r15, r9, 16; +add.s32 r16, r8, r15; +barrier.sync 0; +and.b32 r17, r6, 64; 
+add.s32 r18, r16, r17; +add.f64 fd39, fd23, fd27; +add.f64 fd40, fd22, fd26; +st.shared.v2.f64 [r18], {fd40, fd39}; +fma.rn.f64 fd41, fd32, fd30, fd36; +sub.f64 fd42, fd38, fd37; +st.shared.v2.f64 [r18+32], {fd41, fd42}; +barrier.sync 0; +and.b32 r19, r9, 32; +sub.s32 r20, r18, r19; +ld.shared.v2.f64 {fd43, fd44}, [r20]; +ld.shared.v2.f64 {fd47, fd48}, [r20+64]; +add.f64 %1, fd44, fd48; +add.f64 %0, fd43, fd47; +sub.f64 %3, fd44, fd48; +sub.f64 %2, fd43, fd47; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y): "r"(smem), "l"(lut_dp_2_8), "l"(lut_dp_2_4), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..e7884aaf99c37 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_fwd.hpp.inc @@ -0,0 +1,1345 @@ +#ifndef CUFFTDX_FFT_9_FP16_FWD_PTX_HPP +#define CUFFTDX_FFT_9_FP16_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<864, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<41>; +.reg .b32 r<569>; +.reg .f64 fd<29>; +.reg .b64 rd<2>; +mov.f64 fd27, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd27; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd28, 0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd28; +} +{ +neg.f16 rs3, rs2; +} +mov.b32 r81, {rs3, rs3}; +{ +add.f16x2 r1, %24, %30; +} +{ +add.f16x2 r4, %18, r1; +} +{ +add.f16x2 r7, %25, %31; +} +{ +add.f16x2 r10, %19, r7; +} +{ +add.f16x2 r13, %24, %30; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %18, r16; +} +{ +sub.f16x2 r22, %25, %31; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, 
r25; +} +{ +add.f16x2 r31, %24, %30; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %18, r34; +} +{ +sub.f16x2 r40, %25, %31; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %25, %31; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %19, r52; +} +{ +sub.f16x2 r58, %24, %30; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %25, %31; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %19, r70; +} +{ +sub.f16x2 r76, %24, %30; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs5, fd27; +} +mov.b32 r156, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd28; +} +{ +neg.f16 rs7, rs6; +} +mov.b32 r165, {rs7, rs7}; +{ +add.f16x2 r85, %26, %32; +} +{ +add.f16x2 r88, %20, r85; +} +{ +add.f16x2 r91, %27, %33; +} +{ +add.f16x2 r94, %21, r91; +} +{ +add.f16x2 r97, %26, %32; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %20, r100; +} +{ +sub.f16x2 r106, %27, %33; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %26, %32; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %20, r118; +} +{ +sub.f16x2 r124, %27, %33; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %27, %33; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %21, r136; +} +{ +sub.f16x2 r142, %26, %32; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %27, %33; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %21, r154; +} +{ +sub.f16x2 r160, %26, %32; +} +{ +mul.f16x2 r163, r160, r165; +} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs9, fd27; +} +mov.b32 r240, {rs9, rs9}; +{ +cvt.rn.f16.f64 rs10, fd28; +} +{ +neg.f16 rs11, rs10; +} +mov.b32 r249, {rs11, rs11}; +{ +add.f16x2 r169, %28, %34; +} +{ +add.f16x2 r172, %22, r169; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %28, %34; +} +{ 
+mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %22, r184; +} +{ +sub.f16x2 r190, %29, %35; +} +{ +mul.f16x2 r193, r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %28, %34; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %22, r202; +} +{ +sub.f16x2 r208, %29, %35; +} +{ +mul.f16x2 r211, r208, r249; +} +{ +sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %29, %35; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %23, r220; +} +{ +sub.f16x2 r226, %28, %34; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %29, %35; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %23, r238; +} +{ +sub.f16x2 r244, %28, %34; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd7, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs13, fd7; +} +mov.f64 fd8, 0dBFE491B7523C161D; +{ +cvt.rn.f16.f64 rs14, fd8; +} +mov.f64 fd9, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs15, fd9; +} +mov.f64 fd10, 0dBFEF838B8C811C17; +{ +cvt.rn.f16.f64 rs16, fd10; +} +mov.f64 fd13, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs19, fd13; +} +mov.f64 fd14, 0dBFD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs20, fd14; +} +mov.b32 r267, {rs13, rs13}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs14, rs14}; +{ +mul.f16x2 r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs15, rs15}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs16, rs16}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ +mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs19, rs19}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs20, rs20}; +{ +mul.f16x2 r304, r250, r312; +} +{ 
+sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, r315, r310; +} +{ +cvt.rn.f16.f64 rs29, fd27; +} +mov.b32 r388, {rs29, rs29}; +{ +cvt.rn.f16.f64 rs30, fd28; +} +{ +neg.f16 rs31, rs30; +} +mov.b32 r397, {rs31, rs31}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 %0, r4, r317; +} +{ +add.f16x2 r323, r94, r178; +} +{ +add.f16x2 %1, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 %6, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 %12, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 %7, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 %13, r389, r395; +} +{ +cvt.rn.f16.f64 rs33, fd27; +} +mov.b32 r472, {rs33, rs33}; +{ +cvt.rn.f16.f64 rs34, fd28; +} +{ +neg.f16 rs35, rs34; +} +mov.b32 r481, {rs35, rs35}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 %2, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 %3, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 %8, r419, r425; +} +{ +add.f16x2 r431, r259, r275; +} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 %14, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ 
+add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 r461, r458, r481; +} +{ +sub.f16x2 %9, r455, r461; +} +{ +add.f16x2 r467, r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 %15, r473, r479; +} +{ +cvt.rn.f16.f64 rs37, fd27; +} +mov.b32 r556, {rs37, rs37}; +{ +cvt.rn.f16.f64 rs38, fd28; +} +{ +neg.f16 rs39, rs38; +} +mov.b32 r565, {rs39, rs39}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 %4, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 %5, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 %10, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 %16, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 %11, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 %17, r557, r563; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), "=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), 
"=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<865, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<20>; +.reg .b32 r<265>; +.reg .b64 rd<4>; +mov.u32 r254, %tid.y; +mov.u32 r255, %6; +mad.lo.s32 r256, r254, 72, r255; +mov.u32 r257, %tid.x; +mov.f32 f14, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r1, {low, high}; +} +mov.f32 f16, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ 
+add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r257, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r258, rd3; +mul.lo.s32 r259, r258, 3; +sub.s32 r260, r257, r259; +mad.lo.s32 r261, r258, 72, r256; +cvt.rn.f32.u32 f17, r260; +mul.f32 f18, f17, 0f3F32B8C2; +cos.approx.f32 f5, f18; +sin.approx.f32 f19, f18; +neg.f32 f6, f19; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ +fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f9, 0fBF800000; +mov.f32 f10, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f9; +cvt.rn.f16.f32 high, f10; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r262, r260, 24, r261; +st.shared.v2.f32 [r262], {r8, r14}; +st.shared.v2.f32 [r262+8], {r101, r108}; +st.shared.v2.f32 [r262+16], {r138, r145}; +barrier.sync 0; +shl.b32 r263, r260, 4; +sub.s32 r264, r262, r263; +ld.shared.u32 
r174, [r264]; +ld.shared.u32 r180, [r264+4]; +ld.shared.u32 r171, [r264+24]; +ld.shared.u32 r177, [r264+28]; +ld.shared.u32 r172, [r264+48]; +ld.shared.u32 r178, [r264+52]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 %0, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 %1, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ +add.f16x2 %2, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 %4, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 %3, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 %5, r242, r248; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<866, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<20>; +.reg .b32 
r<265>; +.reg .b64 rd<4>; +mov.u32 r254, %tid.y; +mov.u32 r255, %6; +mad.lo.s32 r256, r254, 36, r255; +mov.u32 r257, %tid.x; +mov.f32 f14, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r1, {low, high}; +} +mov.f32 f16, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r2, {low, high}; +} +{ +neg.f16x2 r3, r2; +} +{ +add.f16x2 r5, %9, %11; +} +{ +add.f16x2 r8, %7, r5; +} +{ +add.f16x2 r11, %10, %12; +} +{ +add.f16x2 r14, %8, r11; +} +{ +add.f16x2 r17, %9, %11; +} +{ +mul.f16x2 r20, r17, r1; +} +{ +add.f16x2 r23, %7, r20; +} +{ +sub.f16x2 r26, %10, %12; +} +{ +mul.f16x2 r29, r26, r3; +} +{ +add.f16x2 r32, r23, r29; +} +{ +add.f16x2 r35, %9, %11; +} +{ +mul.f16x2 r38, r35, r1; +} +{ +add.f16x2 r41, %7, r38; +} +{ +sub.f16x2 r44, %10, %12; +} +{ +mul.f16x2 r47, r44, r3; +} +{ +sub.f16x2 r50, r41, r47; +} +{ +add.f16x2 r53, %10, %12; +} +{ +mul.f16x2 r56, r53, r1; +} +{ +add.f16x2 r59, %8, r56; +} +{ +sub.f16x2 r62, %9, %11; +} +{ +mul.f16x2 r65, r62, r3; +} +{ +sub.f16x2 r68, r59, r65; +} +{ +add.f16x2 r71, %10, %12; +} +{ +mul.f16x2 r74, r71, r1; +} +{ +add.f16x2 r77, %8, r74; +} +{ +sub.f16x2 r80, %9, %11; +} +{ +mul.f16x2 r83, r80, r3; +} +{ +add.f16x2 r86, r77, r83; +} +mul.wide.u32 rd2, r257, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r258, rd3; +mul.lo.s32 r259, r258, 3; +sub.s32 r260, r257, r259; +mad.lo.s32 r261, r258, 36, r256; +cvt.rn.f32.u32 f17, r260; +mul.f32 f18, f17, 0f3F32B8C2; +cos.approx.f32 f5, f18; +sin.approx.f32 f19, f18; +neg.f32 f6, f19; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r89, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r92, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r94, {high, high}; +} +{ +mul.f16x2 r96, r68, r94; +} +{ +neg.f16x2 r99, r96; +} +{ +fma.rn.f16x2 r101, r32, r92, r99; +} +{ +mul.f16x2 r105, r32, r94; +} +{ 
+fma.rn.f16x2 r108, r68, r92, r105; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r112, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r114, {high, high}; +} +mov.f32 f9, 0fBF800000; +mov.f32 f10, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f9; +cvt.rn.f16.f32 high, f10; +mov.b32 r116, {low, high}; +} +{ +mul.f16x2 r117, r114, r116; +} +{ +mul.f16x2 r120, r89, r112; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r89; +mov.b32 r123, {high, low}; +} +{ +fma.rn.f16x2 r125, r117, r123, r120; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r129, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r125; +mov.b32 r131, {high, high}; +} +{ +mul.f16x2 r133, r86, r131; +} +{ +neg.f16x2 r136, r133; +} +{ +fma.rn.f16x2 r138, r50, r129, r136; +} +{ +mul.f16x2 r142, r50, r131; +} +{ +fma.rn.f16x2 r145, r86, r129, r142; +} +barrier.sync 0; +mad.lo.s32 r262, r260, 12, r261; +st.shared.u32 [r262], r8; +st.shared.u32 [r262+4], r101; +st.shared.u32 [r262+8], r138; +barrier.sync 0; +shl.b32 r263, r260, 3; +sub.s32 r264, r262, r263; +ld.shared.u32 r174, [r264]; +ld.shared.u32 r171, [r264+12]; +ld.shared.u32 r172, [r264+24]; +barrier.sync 0; +st.shared.u32 [r262], r14; +st.shared.u32 [r262+4], r108; +st.shared.u32 [r262+8], r145; +barrier.sync 0; +ld.shared.u32 r180, [r264]; +ld.shared.u32 r177, [r264+12]; +ld.shared.u32 r178, [r264+24]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r166, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r167, {low, high}; +} +{ +neg.f16x2 r168, r167; +} +{ +add.f16x2 r170, r171, r172; +} +{ +add.f16x2 %0, r174, r170; +} +{ +add.f16x2 r176, r177, r178; +} +{ +add.f16x2 %1, r180, r176; +} +{ +add.f16x2 r182, r171, r172; +} +{ +mul.f16x2 r185, r182, r166; +} +{ +add.f16x2 r188, r174, r185; +} +{ +sub.f16x2 r191, r177, r178; +} +{ +mul.f16x2 r194, r191, r168; +} +{ 
+add.f16x2 %2, r188, r194; +} +{ +add.f16x2 r200, r171, r172; +} +{ +mul.f16x2 r203, r200, r166; +} +{ +add.f16x2 r206, r174, r203; +} +{ +sub.f16x2 r209, r177, r178; +} +{ +mul.f16x2 r212, r209, r168; +} +{ +sub.f16x2 %4, r206, r212; +} +{ +add.f16x2 r218, r177, r178; +} +{ +mul.f16x2 r221, r218, r166; +} +{ +add.f16x2 r224, r180, r221; +} +{ +sub.f16x2 r227, r171, r172; +} +{ +mul.f16x2 r230, r227, r168; +} +{ +sub.f16x2 %3, r224, r230; +} +{ +add.f16x2 r236, r177, r178; +} +{ +mul.f16x2 r239, r236, r166; +} +{ +add.f16x2 r242, r180, r239; +} +{ +sub.f16x2 r245, r171, r172; +} +{ +mul.f16x2 r248, r245, r168; +} +{ +add.f16x2 %5, r242, r248; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..ac433cce6c604 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp16_inv.hpp.inc @@ -0,0 +1,1315 @@ +#ifndef CUFFTDX_FFT_9_FP16_INV_PTX_HPP +#define CUFFTDX_FFT_9_FP16_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1066, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b16 rs<29>; +.reg .b32 r<569>; +.reg .f64 fd<29>; +.reg .b64 rd<2>; +mov.f64 fd27, 0dBFE0000000000000; +{ +cvt.rn.f16.f64 rs1, fd27; +} +mov.b32 r72, {rs1, rs1}; +mov.f64 fd28, 
0dBFEBB67AE8584CAA; +{ +cvt.rn.f16.f64 rs2, fd28; +} +mov.b32 r81, {rs2, rs2}; +{ +add.f16x2 r1, %24, %30; +} +{ +add.f16x2 r4, %18, r1; +} +{ +add.f16x2 r7, %25, %31; +} +{ +add.f16x2 r10, %19, r7; +} +{ +add.f16x2 r13, %24, %30; +} +{ +mul.f16x2 r16, r13, r72; +} +{ +add.f16x2 r19, %18, r16; +} +{ +sub.f16x2 r22, %25, %31; +} +{ +mul.f16x2 r25, r22, r81; +} +{ +add.f16x2 r28, r19, r25; +} +{ +add.f16x2 r31, %24, %30; +} +{ +mul.f16x2 r34, r31, r72; +} +{ +add.f16x2 r37, %18, r34; +} +{ +sub.f16x2 r40, %25, %31; +} +{ +mul.f16x2 r43, r40, r81; +} +{ +sub.f16x2 r46, r37, r43; +} +{ +add.f16x2 r49, %25, %31; +} +{ +mul.f16x2 r52, r49, r72; +} +{ +add.f16x2 r55, %19, r52; +} +{ +sub.f16x2 r58, %24, %30; +} +{ +mul.f16x2 r61, r58, r81; +} +{ +sub.f16x2 r64, r55, r61; +} +{ +add.f16x2 r67, %25, %31; +} +{ +mul.f16x2 r70, r67, r72; +} +{ +add.f16x2 r73, %19, r70; +} +{ +sub.f16x2 r76, %24, %30; +} +{ +mul.f16x2 r79, r76, r81; +} +{ +add.f16x2 r82, r73, r79; +} +{ +cvt.rn.f16.f64 rs3, fd27; +} +mov.b32 r156, {rs3, rs3}; +{ +cvt.rn.f16.f64 rs4, fd28; +} +mov.b32 r165, {rs4, rs4}; +{ +add.f16x2 r85, %26, %32; +} +{ +add.f16x2 r88, %20, r85; +} +{ +add.f16x2 r91, %27, %33; +} +{ +add.f16x2 r94, %21, r91; +} +{ +add.f16x2 r97, %26, %32; +} +{ +mul.f16x2 r100, r97, r156; +} +{ +add.f16x2 r103, %20, r100; +} +{ +sub.f16x2 r106, %27, %33; +} +{ +mul.f16x2 r109, r106, r165; +} +{ +add.f16x2 r112, r103, r109; +} +{ +add.f16x2 r115, %26, %32; +} +{ +mul.f16x2 r118, r115, r156; +} +{ +add.f16x2 r121, %20, r118; +} +{ +sub.f16x2 r124, %27, %33; +} +{ +mul.f16x2 r127, r124, r165; +} +{ +sub.f16x2 r130, r121, r127; +} +{ +add.f16x2 r133, %27, %33; +} +{ +mul.f16x2 r136, r133, r156; +} +{ +add.f16x2 r139, %21, r136; +} +{ +sub.f16x2 r142, %26, %32; +} +{ +mul.f16x2 r145, r142, r165; +} +{ +sub.f16x2 r148, r139, r145; +} +{ +add.f16x2 r151, %27, %33; +} +{ +mul.f16x2 r154, r151, r156; +} +{ +add.f16x2 r157, %21, r154; +} +{ +sub.f16x2 r160, %26, %32; +} +{ +mul.f16x2 r163, r160, r165; 
+} +{ +add.f16x2 r166, r157, r163; +} +{ +cvt.rn.f16.f64 rs5, fd27; +} +mov.b32 r240, {rs5, rs5}; +{ +cvt.rn.f16.f64 rs6, fd28; +} +mov.b32 r249, {rs6, rs6}; +{ +add.f16x2 r169, %28, %34; +} +{ +add.f16x2 r172, %22, r169; +} +{ +add.f16x2 r175, %29, %35; +} +{ +add.f16x2 r178, %23, r175; +} +{ +add.f16x2 r181, %28, %34; +} +{ +mul.f16x2 r184, r181, r240; +} +{ +add.f16x2 r187, %22, r184; +} +{ +sub.f16x2 r190, %29, %35; +} +{ +mul.f16x2 r193, r190, r249; +} +{ +add.f16x2 r196, r187, r193; +} +{ +add.f16x2 r199, %28, %34; +} +{ +mul.f16x2 r202, r199, r240; +} +{ +add.f16x2 r205, %22, r202; +} +{ +sub.f16x2 r208, %29, %35; +} +{ +mul.f16x2 r211, r208, r249; +} +{ +sub.f16x2 r214, r205, r211; +} +{ +add.f16x2 r217, %29, %35; +} +{ +mul.f16x2 r220, r217, r240; +} +{ +add.f16x2 r223, %23, r220; +} +{ +sub.f16x2 r226, %28, %34; +} +{ +mul.f16x2 r229, r226, r249; +} +{ +sub.f16x2 r232, r223, r229; +} +{ +add.f16x2 r235, %29, %35; +} +{ +mul.f16x2 r238, r235, r240; +} +{ +add.f16x2 r241, %23, r238; +} +{ +sub.f16x2 r244, %28, %34; +} +{ +mul.f16x2 r247, r244, r249; +} +{ +add.f16x2 r250, r241, r247; +} +mov.f64 fd7, 0d3FE8836FA2CF5039; +{ +cvt.rn.f16.f64 rs7, fd7; +} +mov.f64 fd8, 0d3FE491B7523C161D; +{ +cvt.rn.f16.f64 rs8, fd8; +} +mov.f64 fd9, 0d3FC63A1A7E0B738A; +{ +cvt.rn.f16.f64 rs9, fd9; +} +mov.f64 fd10, 0d3FEF838B8C811C17; +{ +cvt.rn.f16.f64 rs10, fd10; +} +mov.f64 fd13, 0dBFEE11F642522D1C; +{ +cvt.rn.f16.f64 rs13, fd13; +} +mov.f64 fd14, 0d3FD5E3A8748A0BF5; +{ +cvt.rn.f16.f64 rs14, fd14; +} +mov.b32 r267, {rs7, rs7}; +{ +mul.f16x2 r253, r112, r267; +} +mov.b32 r264, {rs8, rs8}; +{ +mul.f16x2 r256, r148, r264; +} +{ +sub.f16x2 r259, r253, r256; +} +{ +mul.f16x2 r262, r112, r264; +} +{ +fma.rn.f16x2 r265, r148, r267, r262; +} +mov.b32 r299, {rs9, rs9}; +{ +mul.f16x2 r269, r196, r299; +} +mov.b32 r296, {rs10, rs10}; +{ +mul.f16x2 r272, r232, r296; +} +{ +sub.f16x2 r275, r269, r272; +} +{ +mul.f16x2 r278, r196, r296; +} +{ +fma.rn.f16x2 r281, r232, r299, r278; +} +{ 
+mul.f16x2 r285, r130, r299; +} +{ +mul.f16x2 r288, r166, r296; +} +{ +sub.f16x2 r291, r285, r288; +} +{ +mul.f16x2 r294, r130, r296; +} +{ +fma.rn.f16x2 r297, r166, r299, r294; +} +mov.b32 r315, {rs13, rs13}; +{ +mul.f16x2 r301, r214, r315; +} +mov.b32 r312, {rs14, rs14}; +{ +mul.f16x2 r304, r250, r312; +} +{ +sub.f16x2 r307, r301, r304; +} +{ +mul.f16x2 r310, r214, r312; +} +{ +fma.rn.f16x2 r313, r250, r315, r310; +} +{ +cvt.rn.f16.f64 rs23, fd27; +} +mov.b32 r388, {rs23, rs23}; +{ +cvt.rn.f16.f64 rs24, fd28; +} +mov.b32 r397, {rs24, rs24}; +{ +add.f16x2 r317, r88, r172; +} +{ +add.f16x2 %0, r4, r317; +} +{ +add.f16x2 r323, r94, r178; +} +{ +add.f16x2 %1, r10, r323; +} +{ +add.f16x2 r329, r88, r172; +} +{ +mul.f16x2 r332, r329, r388; +} +{ +add.f16x2 r335, r4, r332; +} +{ +sub.f16x2 r338, r94, r178; +} +{ +mul.f16x2 r341, r338, r397; +} +{ +add.f16x2 %6, r335, r341; +} +{ +add.f16x2 r347, r88, r172; +} +{ +mul.f16x2 r350, r347, r388; +} +{ +add.f16x2 r353, r4, r350; +} +{ +sub.f16x2 r356, r94, r178; +} +{ +mul.f16x2 r359, r356, r397; +} +{ +sub.f16x2 %12, r353, r359; +} +{ +add.f16x2 r365, r94, r178; +} +{ +mul.f16x2 r368, r365, r388; +} +{ +add.f16x2 r371, r10, r368; +} +{ +sub.f16x2 r374, r88, r172; +} +{ +mul.f16x2 r377, r374, r397; +} +{ +sub.f16x2 %7, r371, r377; +} +{ +add.f16x2 r383, r94, r178; +} +{ +mul.f16x2 r386, r383, r388; +} +{ +add.f16x2 r389, r10, r386; +} +{ +sub.f16x2 r392, r88, r172; +} +{ +mul.f16x2 r395, r392, r397; +} +{ +add.f16x2 %13, r389, r395; +} +{ +cvt.rn.f16.f64 rs25, fd27; +} +mov.b32 r472, {rs25, rs25}; +{ +cvt.rn.f16.f64 rs26, fd28; +} +mov.b32 r481, {rs26, rs26}; +{ +add.f16x2 r401, r259, r275; +} +{ +add.f16x2 %2, r28, r401; +} +{ +add.f16x2 r407, r265, r281; +} +{ +add.f16x2 %3, r64, r407; +} +{ +add.f16x2 r413, r259, r275; +} +{ +mul.f16x2 r416, r413, r472; +} +{ +add.f16x2 r419, r28, r416; +} +{ +sub.f16x2 r422, r265, r281; +} +{ +mul.f16x2 r425, r422, r481; +} +{ +add.f16x2 %8, r419, r425; +} +{ +add.f16x2 r431, r259, r275; 
+} +{ +mul.f16x2 r434, r431, r472; +} +{ +add.f16x2 r437, r28, r434; +} +{ +sub.f16x2 r440, r265, r281; +} +{ +mul.f16x2 r443, r440, r481; +} +{ +sub.f16x2 %14, r437, r443; +} +{ +add.f16x2 r449, r265, r281; +} +{ +mul.f16x2 r452, r449, r472; +} +{ +add.f16x2 r455, r64, r452; +} +{ +sub.f16x2 r458, r259, r275; +} +{ +mul.f16x2 r461, r458, r481; +} +{ +sub.f16x2 %9, r455, r461; +} +{ +add.f16x2 r467, r265, r281; +} +{ +mul.f16x2 r470, r467, r472; +} +{ +add.f16x2 r473, r64, r470; +} +{ +sub.f16x2 r476, r259, r275; +} +{ +mul.f16x2 r479, r476, r481; +} +{ +add.f16x2 %15, r473, r479; +} +{ +cvt.rn.f16.f64 rs27, fd27; +} +mov.b32 r556, {rs27, rs27}; +{ +cvt.rn.f16.f64 rs28, fd28; +} +mov.b32 r565, {rs28, rs28}; +{ +add.f16x2 r485, r291, r307; +} +{ +add.f16x2 %4, r46, r485; +} +{ +add.f16x2 r491, r297, r313; +} +{ +add.f16x2 %5, r82, r491; +} +{ +add.f16x2 r497, r291, r307; +} +{ +mul.f16x2 r500, r497, r556; +} +{ +add.f16x2 r503, r46, r500; +} +{ +sub.f16x2 r506, r297, r313; +} +{ +mul.f16x2 r509, r506, r565; +} +{ +add.f16x2 %10, r503, r509; +} +{ +add.f16x2 r515, r291, r307; +} +{ +mul.f16x2 r518, r515, r556; +} +{ +add.f16x2 r521, r46, r518; +} +{ +sub.f16x2 r524, r297, r313; +} +{ +mul.f16x2 r527, r524, r565; +} +{ +sub.f16x2 %16, r521, r527; +} +{ +add.f16x2 r533, r297, r313; +} +{ +mul.f16x2 r536, r533, r556; +} +{ +add.f16x2 r539, r82, r536; +} +{ +sub.f16x2 r542, r291, r307; +} +{ +mul.f16x2 r545, r542, r565; +} +{ +sub.f16x2 %11, r539, r545; +} +{ +add.f16x2 r551, r297, r313; +} +{ +mul.f16x2 r554, r551, r556; +} +{ +add.f16x2 r557, r82, r554; +} +{ +sub.f16x2 r560, r291, r307; +} +{ +mul.f16x2 r563, r560, r565; +} +{ +add.f16x2 %17, r557, r563; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)), "=r"(__HALF2_TO_UI(rmem[3].x)), "=r"(__HALF2_TO_UI(rmem[3].y)), "=r"(__HALF2_TO_UI(rmem[4].x)), 
"=r"(__HALF2_TO_UI(rmem[4].y)), "=r"(__HALF2_TO_UI(rmem[5].x)), "=r"(__HALF2_TO_UI(rmem[5].y)), "=r"(__HALF2_TO_UI(rmem[6].x)), "=r"(__HALF2_TO_UI(rmem[6].y)), "=r"(__HALF2_TO_UI(rmem[7].x)), "=r"(__HALF2_TO_UI(rmem[7].y)), "=r"(__HALF2_TO_UI(rmem[8].x)), "=r"(__HALF2_TO_UI(rmem[8].y)): "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y)), "r"(__HALF2_TO_UI(rmem[3].x)), "r"(__HALF2_TO_UI(rmem[3].y)), "r"(__HALF2_TO_UI(rmem[4].x)), "r"(__HALF2_TO_UI(rmem[4].y)), "r"(__HALF2_TO_UI(rmem[5].x)), "r"(__HALF2_TO_UI(rmem[5].y)), "r"(__HALF2_TO_UI(rmem[6].x)), "r"(__HALF2_TO_UI(rmem[6].y)), "r"(__HALF2_TO_UI(rmem[7].x)), "r"(__HALF2_TO_UI(rmem[7].y)), "r"(__HALF2_TO_UI(rmem[8].x)), "r"(__HALF2_TO_UI(rmem[8].y))); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<1067, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<20>; +.reg .b32 r<261>; +.reg .b64 rd<4>; +mov.u32 r250, %tid.y; +mov.u32 r251, %6; +mad.lo.s32 r252, r250, 72, r251; +mov.u32 r253, %tid.x; +mov.f32 f14, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r1, {low, high}; +} +mov.f32 f16, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, 
r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r253, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r254, rd3; +mul.lo.s32 r255, r254, 3; +sub.s32 r256, r253, r255; +mad.lo.s32 r257, r254, 72, r252; +cvt.rn.f32.u32 f17, r256; +mul.f32 f18, f17, 0f3F32B8C2; +cos.approx.f32 f5, f18; +sin.approx.f32 f19, f18; +neg.f32 f6, f19; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, {high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f9, 0fBF800000; +mov.f32 f10, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f9; +cvt.rn.f16.f32 high, f10; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r258, r256, 24, r257; 
+st.shared.v2.f32 [r258], {r6, r12}; +st.shared.v2.f32 [r258+8], {r97, r106}; +st.shared.v2.f32 [r258+16], {r134, r143}; +barrier.sync 0; +shl.b32 r259, r256, 4; +sub.s32 r260, r258, r259; +ld.shared.u32 r170, [r260]; +ld.shared.u32 r176, [r260+4]; +ld.shared.u32 r167, [r260+24]; +ld.shared.u32 r173, [r260+28]; +ld.shared.u32 r168, [r260+48]; +ld.shared.u32 r174, [r260+52]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 %0, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 %1, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 %2, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 %4, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 %3, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 %5, r238, r244; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<1068, __half2, 1>(cufftdx::detail::complex<__half2> *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<20>; +.reg .b32 r<261>; +.reg .b64 rd<4>; +mov.u32 r250, %tid.y; +mov.u32 r251, %6; +mad.lo.s32 r252, r250, 36, r251; +mov.u32 r253, %tid.x; +mov.f32 f14, 0fBF000000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r1, {low, high}; +} +mov.f32 f16, 0fBF5DB3D7; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r2, {low, high}; +} +{ +add.f16x2 r3, %9, %11; +} +{ +add.f16x2 r6, %7, r3; +} +{ +add.f16x2 r9, %10, %12; +} +{ +add.f16x2 r12, %8, r9; +} +{ +add.f16x2 r15, %9, %11; +} +{ +mul.f16x2 r18, r15, r1; +} +{ +add.f16x2 r21, %7, r18; +} +{ +sub.f16x2 r24, %10, %12; +} +{ +mul.f16x2 r27, r24, r2; +} +{ +add.f16x2 r30, r21, r27; +} +{ +add.f16x2 r33, %9, %11; +} +{ +mul.f16x2 r36, r33, r1; +} +{ +add.f16x2 r39, %7, r36; +} +{ +sub.f16x2 r42, %10, %12; +} +{ +mul.f16x2 r45, r42, r2; +} +{ +sub.f16x2 r48, r39, r45; +} +{ +add.f16x2 r51, %10, %12; +} +{ +mul.f16x2 r54, r51, r1; +} +{ +add.f16x2 r57, %8, r54; +} +{ +sub.f16x2 r60, %9, %11; +} +{ +mul.f16x2 r63, r60, r2; +} +{ +sub.f16x2 r66, r57, r63; +} +{ +add.f16x2 r69, %10, %12; +} +{ +mul.f16x2 r72, r69, r1; +} +{ +add.f16x2 r75, %8, r72; +} +{ +sub.f16x2 r78, %9, %11; +} +{ +mul.f16x2 r81, r78, r2; +} +{ +add.f16x2 r84, r75, r81; +} +mul.wide.u32 rd2, r253, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r254, rd3; +mul.lo.s32 r255, r254, 3; +sub.s32 r256, r253, r255; +mad.lo.s32 r257, r254, 36, r252; +cvt.rn.f32.u32 f17, r256; +mul.f32 f18, f17, 0f3F32B8C2; +cos.approx.f32 f5, f18; +sin.approx.f32 f19, f18; +neg.f32 f6, f19; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f5; +cvt.rn.f16.f32 high, f6; +mov.b32 r87, {low, high}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r90, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r92, 
{high, high}; +} +{ +mul.f16x2 r94, r66, r92; +} +{ +fma.rn.f16x2 r97, r30, r90, r94; +} +{ +mul.f16x2 r101, r30, r92; +} +{ +neg.f16x2 r104, r101; +} +{ +fma.rn.f16x2 r106, r66, r90, r104; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r110, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r112, {high, high}; +} +mov.f32 f9, 0fBF800000; +mov.f32 f10, 0f3F800000; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f9; +cvt.rn.f16.f32 high, f10; +mov.b32 r114, {low, high}; +} +{ +mul.f16x2 r115, r112, r114; +} +{ +mul.f16x2 r118, r87, r110; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r87; +mov.b32 r121, {high, low}; +} +{ +fma.rn.f16x2 r123, r115, r121, r118; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r127, {low, low}; +} +{ +.reg .f16 low, high; +mov.b32 {low, high}, r123; +mov.b32 r129, {high, high}; +} +{ +mul.f16x2 r131, r84, r129; +} +{ +fma.rn.f16x2 r134, r48, r127, r131; +} +{ +mul.f16x2 r138, r48, r129; +} +{ +neg.f16x2 r141, r138; +} +{ +fma.rn.f16x2 r143, r84, r127, r141; +} +barrier.sync 0; +mad.lo.s32 r258, r256, 12, r257; +st.shared.u32 [r258], r6; +st.shared.u32 [r258+4], r97; +st.shared.u32 [r258+8], r134; +barrier.sync 0; +shl.b32 r259, r256, 3; +sub.s32 r260, r258, r259; +ld.shared.u32 r170, [r260]; +ld.shared.u32 r167, [r260+12]; +ld.shared.u32 r168, [r260+24]; +barrier.sync 0; +st.shared.u32 [r258], r12; +st.shared.u32 [r258+4], r106; +st.shared.u32 [r258+8], r143; +barrier.sync 0; +ld.shared.u32 r176, [r260]; +ld.shared.u32 r173, [r260+12]; +ld.shared.u32 r174, [r260+24]; +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f14; +cvt.rn.f16.f32 high, f14; +mov.b32 r164, {low, high}; +} +{ +.reg .f16 low, high; +cvt.rn.f16.f32 low, f16; +cvt.rn.f16.f32 high, f16; +mov.b32 r165, {low, high}; +} +{ +add.f16x2 r166, r167, r168; +} +{ +add.f16x2 %0, r170, r166; +} +{ +add.f16x2 r172, r173, r174; +} +{ +add.f16x2 %1, r176, r172; +} +{ +add.f16x2 r178, r167, r168; +} +{ +mul.f16x2 r181, r178, 
r164; +} +{ +add.f16x2 r184, r170, r181; +} +{ +sub.f16x2 r187, r173, r174; +} +{ +mul.f16x2 r190, r187, r165; +} +{ +add.f16x2 %2, r184, r190; +} +{ +add.f16x2 r196, r167, r168; +} +{ +mul.f16x2 r199, r196, r164; +} +{ +add.f16x2 r202, r170, r199; +} +{ +sub.f16x2 r205, r173, r174; +} +{ +mul.f16x2 r208, r205, r165; +} +{ +sub.f16x2 %4, r202, r208; +} +{ +add.f16x2 r214, r173, r174; +} +{ +mul.f16x2 r217, r214, r164; +} +{ +add.f16x2 r220, r176, r217; +} +{ +sub.f16x2 r223, r167, r168; +} +{ +mul.f16x2 r226, r223, r165; +} +{ +sub.f16x2 %3, r220, r226; +} +{ +add.f16x2 r232, r173, r174; +} +{ +mul.f16x2 r235, r232, r164; +} +{ +add.f16x2 r238, r176, r235; +} +{ +sub.f16x2 r241, r167, r168; +} +{ +mul.f16x2 r244, r241, r165; +} +{ +add.f16x2 %5, r238, r244; +} +})" + : "=r"(__HALF2_TO_UI(rmem[0].x)), "=r"(__HALF2_TO_UI(rmem[0].y)), "=r"(__HALF2_TO_UI(rmem[1].x)), "=r"(__HALF2_TO_UI(rmem[1].y)), "=r"(__HALF2_TO_UI(rmem[2].x)), "=r"(__HALF2_TO_UI(rmem[2].y)): "r"(smem), "r"(__HALF2_TO_UI(rmem[0].x)), "r"(__HALF2_TO_UI(rmem[0].y)), "r"(__HALF2_TO_UI(rmem[1].x)), "r"(__HALF2_TO_UI(rmem[1].y)), "r"(__HALF2_TO_UI(rmem[2].x)), "r"(__HALF2_TO_UI(rmem[2].y))); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..f6915f7936e7e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_fwd.hpp.inc @@ -0,0 +1,310 @@ +#ifndef CUFFTDX_FFT_9_FP32_FWD_PTX_HPP +#define CUFFTDX_FFT_9_FP32_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<118, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<153>; +.reg .b64 rd<2>; +add.f32 f37, %26, %34; +add.f32 f38, %18, f37; 
+add.f32 f39, %27, %35; +add.f32 f40, %19, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %18, f41; +sub.f32 f43, %27, %35; +mul.f32 f44, f43, 0f3F5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %19, f47; +sub.f32 f49, %26, %34; +mul.f32 f50, f49, 0f3F5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %28, %36; +add.f32 f54, %20, f53; +add.f32 f55, %30, %38; +add.f32 f56, %22, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %20, f57; +sub.f32 f59, %30, %38; +mul.f32 f60, f59, 0f3F5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %22, f63; +sub.f32 f65, %28, %36; +mul.f32 f66, f65, 0f3F5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %31, %39; +add.f32 f70, %23, f69; +add.f32 f71, %33, %40; +add.f32 f72, %25, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %23, f73; +sub.f32 f75, %33, %40; +mul.f32 f76, f75, 0f3F5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %25, f79; +sub.f32 f81, %31, %39; +mul.f32 f82, f81, 0f3F5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0fBF248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0fBF248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0fBF7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0fBF7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0fBF7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0fBF7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0fBEAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0fBEAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0f3F5DB3D7; +mul.f32 
f111, f106, 0f3F000000; +sub.f32 f112, f40, f111; +sub.f32 f113, f54, f70; +mul.f32 f114, f113, 0f3F5DB3D7; +add.f32 f115, f87, f92; +add.f32 f116, f89, f94; +mul.f32 f117, f115, 0f3F000000; +sub.f32 f118, f45, f117; +sub.f32 f119, f89, f94; +mul.f32 f120, f119, 0f3F5DB3D7; +mul.f32 f121, f116, 0f3F000000; +sub.f32 f122, f51, f121; +sub.f32 f123, f87, f92; +mul.f32 f124, f123, 0f3F5DB3D7; +add.f32 f125, f97, f102; +add.f32 f126, f99, f104; +mul.f32 f127, f125, 0f3F000000; +sub.f32 f128, f46, f127; +sub.f32 f129, f99, f104; +mul.f32 f130, f129, 0f3F5DB3D7; +mul.f32 f131, f126, 0f3F000000; +sub.f32 f132, f52, f131; +sub.f32 f133, f97, f102; +mul.f32 f134, f133, 0f3F5DB3D7; +add.f32 %1, f40, f106; +add.f32 %0, f38, f105; +add.f32 %3, f51, f116; +add.f32 %2, f45, f115; +add.f32 %5, f52, f126; +add.f32 %4, f46, f125; +sub.f32 %7, f112, f114; +add.f32 %6, f110, f108; +sub.f32 %9, f122, f124; +add.f32 %8, f120, f118; +sub.f32 %11, f132, f134; +add.f32 %10, f130, f128; +add.f32 %13, f114, f112; +sub.f32 %12, f108, f110; +add.f32 %15, f124, f122; +sub.f32 %14, f118, f120; +add.f32 %17, f134, f132; +sub.f32 %16, f128, f130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<119, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<76>; 
+.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 72, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %10, %13; +add.f32 f14, %12, %14; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %8, f15; +sub.f32 f17, %12, %14; +mul.f32 f18, f17, 0f3F5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %9, f21; +sub.f32 f23, %10, %13; +mul.f32 f24, f23, 0f3F5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 72, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f27, f19; +mul.f32 f32, f28, f25; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f36, f20; +mul.f32 f40, f38, f26; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %9, f14; +add.f32 f43, %8, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f28, f19, f33; +sub.f32 f45, f31, f32; +st.shared.v2.f32 [r9+8], {f45, f44}; +sub.f32 f46, f39, f40; +fma.rn.f32 f47, f38, f20, f41; +st.shared.v2.f32 [r9+16], {f46, f47}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+24]; +ld.shared.v2.f32 {f56, f57}, [r11+48]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0f3F5DB3D7; +mul.f32 f66, f61, 0f3F000000; +sub.f32 f67, f49, f66; +sub.f32 f68, f52, f56; +mul.f32 f69, f68, 0f3F5DB3D7; +add.f32 %1, f49, f61; +add.f32 %0, f48, f60; +sub.f32 %3, f67, f69; +add.f32 %2, f65, f63; +add.f32 %5, f69, f67; +sub.f32 %4, f63, f65; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), 
"=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<120, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<70>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 36, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %10, %13; +add.f32 f14, %8, f13; +add.f32 f15, %12, %14; +add.f32 f16, %9, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %8, f17; +sub.f32 f19, %12, %14; +mul.f32 f20, f19, 0f3F5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %9, f23; +sub.f32 f25, %10, %13; +mul.f32 f26, f25, 0f3F5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 36, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f29, f21; +mul.f32 f34, f30, f27; +sub.f32 f35, f33, f34; +mul.f32 f36, f29, f27; +fma.rn.f32 f37, f30, f21, f36; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f40, f22; +mul.f32 f44, f42, f28; +sub.f32 f45, f43, f44; +mul.f32 f46, f40, f28; +fma.rn.f32 f47, f42, f22, f46; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f35; +st.shared.f32 [r9+8], f45; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+12]; +ld.shared.f32 f50, [r11+24]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+12]; +ld.shared.f32 f53, 
[r11+24]; +add.f32 f54, f49, f50; +add.f32 f55, f52, f53; +mul.f32 f56, f54, 0f3F000000; +sub.f32 f57, f48, f56; +sub.f32 f58, f52, f53; +mul.f32 f59, f58, 0f3F5DB3D7; +mul.f32 f60, f55, 0f3F000000; +sub.f32 f61, f51, f60; +sub.f32 f62, f49, f50; +mul.f32 f63, f62, 0f3F5DB3D7; +add.f32 %0, f48, f54; +add.f32 %1, f51, f55; +add.f32 %2, f59, f57; +sub.f32 %3, f61, f63; +sub.f32 %4, f57, f59; +add.f32 %5, f63, f61; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..b4e846e29f702 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp32_inv.hpp.inc @@ -0,0 +1,310 @@ +#ifndef CUFFTDX_FFT_9_FP32_INV_PTX_HPP +#define CUFFTDX_FFT_9_FP32_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<320, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<153>; +.reg .b64 rd<2>; +add.f32 f37, %26, %34; +add.f32 f38, %18, f37; +add.f32 f39, %27, %35; +add.f32 f40, %19, f39; +mul.f32 f41, f37, 0f3F000000; +sub.f32 f42, %18, f41; +sub.f32 f43, %27, %35; +mul.f32 f44, f43, 0fBF5DB3D7; +add.f32 f45, f44, f42; +sub.f32 f46, f42, f44; +mul.f32 f47, f39, 0f3F000000; +sub.f32 f48, %19, f47; +sub.f32 f49, %26, %34; +mul.f32 f50, f49, 0fBF5DB3D7; +sub.f32 f51, f48, f50; +add.f32 f52, f50, f48; +add.f32 f53, %28, %36; +add.f32 f54, %20, f53; +add.f32 f55, %30, %38; +add.f32 f56, %22, f55; +mul.f32 f57, f53, 0f3F000000; +sub.f32 f58, %20, 
f57; +sub.f32 f59, %30, %38; +mul.f32 f60, f59, 0fBF5DB3D7; +add.f32 f61, f60, f58; +sub.f32 f62, f58, f60; +mul.f32 f63, f55, 0f3F000000; +sub.f32 f64, %22, f63; +sub.f32 f65, %28, %36; +mul.f32 f66, f65, 0fBF5DB3D7; +sub.f32 f67, f64, f66; +add.f32 f68, f66, f64; +add.f32 f69, %31, %39; +add.f32 f70, %23, f69; +add.f32 f71, %33, %40; +add.f32 f72, %25, f71; +mul.f32 f73, f69, 0f3F000000; +sub.f32 f74, %23, f73; +sub.f32 f75, %33, %40; +mul.f32 f76, f75, 0fBF5DB3D7; +add.f32 f77, f76, f74; +sub.f32 f78, f74, f76; +mul.f32 f79, f71, 0f3F000000; +sub.f32 f80, %25, f79; +sub.f32 f81, %31, %39; +mul.f32 f82, f81, 0fBF5DB3D7; +sub.f32 f83, f80, f82; +add.f32 f84, f82, f80; +mul.f32 f85, f61, 0f3F441B7D; +mul.f32 f86, f67, 0f3F248DBB; +sub.f32 f87, f85, f86; +mul.f32 f88, f67, 0f3F441B7D; +fma.rn.f32 f89, f61, 0f3F248DBB, f88; +mul.f32 f90, f77, 0f3E31D0D4; +mul.f32 f91, f83, 0f3F7C1C5C; +sub.f32 f92, f90, f91; +mul.f32 f93, f83, 0f3E31D0D4; +fma.rn.f32 f94, f77, 0f3F7C1C5C, f93; +mul.f32 f95, f62, 0f3E31D0D4; +mul.f32 f96, f68, 0f3F7C1C5C; +sub.f32 f97, f95, f96; +mul.f32 f98, f68, 0f3E31D0D4; +fma.rn.f32 f99, f62, 0f3F7C1C5C, f98; +mul.f32 f100, f78, 0fBF708FB2; +mul.f32 f101, f84, 0f3EAF1D44; +sub.f32 f102, f100, f101; +mul.f32 f103, f84, 0fBF708FB2; +fma.rn.f32 f104, f78, 0f3EAF1D44, f103; +add.f32 f105, f54, f70; +add.f32 f106, f56, f72; +mul.f32 f107, f105, 0f3F000000; +sub.f32 f108, f38, f107; +sub.f32 f109, f56, f72; +mul.f32 f110, f109, 0fBF5DB3D7; +mul.f32 f111, f106, 0f3F000000; +sub.f32 f112, f40, f111; +sub.f32 f113, f54, f70; +mul.f32 f114, f113, 0fBF5DB3D7; +add.f32 f115, f87, f92; +add.f32 f116, f89, f94; +mul.f32 f117, f115, 0f3F000000; +sub.f32 f118, f45, f117; +sub.f32 f119, f89, f94; +mul.f32 f120, f119, 0fBF5DB3D7; +mul.f32 f121, f116, 0f3F000000; +sub.f32 f122, f51, f121; +sub.f32 f123, f87, f92; +mul.f32 f124, f123, 0fBF5DB3D7; +add.f32 f125, f97, f102; +add.f32 f126, f99, f104; +mul.f32 f127, f125, 0f3F000000; +sub.f32 f128, f46, f127; +sub.f32 
f129, f99, f104; +mul.f32 f130, f129, 0fBF5DB3D7; +mul.f32 f131, f126, 0f3F000000; +sub.f32 f132, f52, f131; +sub.f32 f133, f97, f102; +mul.f32 f134, f133, 0fBF5DB3D7; +add.f32 %1, f40, f106; +add.f32 %0, f38, f105; +add.f32 %3, f51, f116; +add.f32 %2, f45, f115; +add.f32 %5, f52, f126; +add.f32 %4, f46, f125; +sub.f32 %7, f112, f114; +add.f32 %6, f110, f108; +sub.f32 %9, f122, f124; +add.f32 %8, f120, f118; +sub.f32 %11, f132, f134; +add.f32 %10, f130, f128; +add.f32 %13, f114, f112; +sub.f32 %12, f108, f110; +add.f32 %15, f124, f122; +sub.f32 %14, f118, f120; +add.f32 %17, f134, f132; +sub.f32 %16, f128, f130; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y), "=f"(rmem[3].x), "=f"(rmem[3].y), "=f"(rmem[4].x), "=f"(rmem[4].y), "=f"(rmem[5].x), "=f"(rmem[5].y), "=f"(rmem[6].x), "=f"(rmem[6].y), "=f"(rmem[7].x), "=f"(rmem[7].y), "=f"(rmem[8].x), "=f"(rmem[8].y): "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y), "f"(rmem[2].y), "f"(rmem[3].x), "f"(rmem[3].y), "f"(rmem[4].x), "f"(rmem[4].y), "f"(rmem[4].y), "f"(rmem[5].x), "f"(rmem[5].y), "f"(rmem[5].y), "f"(rmem[6].x), "f"(rmem[6].y), "f"(rmem[7].x), "f"(rmem[7].y), "f"(rmem[7].y), "f"(rmem[8].x), "f"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<321, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<76>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 72, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %10, %13; +add.f32 f14, %12, %14; +mul.f32 f15, f13, 0f3F000000; +sub.f32 f16, %8, f15; +sub.f32 f17, %12, %14; +mul.f32 f18, f17, 0fBF5DB3D7; +add.f32 f19, f18, f16; +sub.f32 f20, f16, f18; +mul.f32 f21, f14, 0f3F000000; +sub.f32 f22, %9, f21; +sub.f32 f23, %10, %13; +mul.f32 f24, f23, 0fBF5DB3D7; +sub.f32 f25, f22, f24; +add.f32 f26, f24, f22; +mul.wide.u32 rd2, r4, 
-1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 72, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f27, f28}, [rd6]; +mul.f32 f31, f25, f28; +mul.f32 f32, f19, f28; +mul.f32 f33, f27, f25; +mul.f32 f34, f27, f27; +mul.f32 f35, f28, f28; +sub.f32 f36, f34, f35; +mul.f32 f37, f28, f27; +fma.rn.f32 f38, f28, f27, f37; +mul.f32 f39, f26, f38; +mul.f32 f40, f20, f38; +mul.f32 f41, f36, f26; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +add.f32 f42, %9, f14; +add.f32 f43, %8, f13; +st.shared.v2.f32 [r9], {f43, f42}; +fma.rn.f32 f44, f27, f19, f31; +sub.f32 f45, f33, f32; +st.shared.v2.f32 [r9+8], {f44, f45}; +sub.f32 f46, f41, f40; +fma.rn.f32 f47, f36, f20, f39; +st.shared.v2.f32 [r9+16], {f47, f46}; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.v2.f32 {f48, f49}, [r11]; +ld.shared.v2.f32 {f52, f53}, [r11+24]; +ld.shared.v2.f32 {f56, f57}, [r11+48]; +add.f32 f60, f52, f56; +add.f32 f61, f53, f57; +mul.f32 f62, f60, 0f3F000000; +sub.f32 f63, f48, f62; +sub.f32 f64, f53, f57; +mul.f32 f65, f64, 0fBF5DB3D7; +mul.f32 f66, f61, 0f3F000000; +sub.f32 f67, f49, f66; +sub.f32 f68, f52, f56; +mul.f32 f69, f68, 0fBF5DB3D7; +add.f32 %1, f49, f61; +add.f32 %0, f48, f60; +sub.f32 %3, f67, f69; +add.f32 %2, f65, f63; +add.f32 %5, f69, f67; +sub.f32 %4, f63, f65; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), "=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<322, float, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f32 f<70>; +.reg .b32 r<12>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 36, r2; +mov.u32 r4, %tid.x; +add.f32 f13, %10, %13; +add.f32 
f14, %8, f13; +add.f32 f15, %12, %14; +add.f32 f16, %9, f15; +mul.f32 f17, f13, 0f3F000000; +sub.f32 f18, %8, f17; +sub.f32 f19, %12, %14; +mul.f32 f20, f19, 0fBF5DB3D7; +add.f32 f21, f20, f18; +sub.f32 f22, f18, f20; +mul.f32 f23, f15, 0f3F000000; +sub.f32 f24, %9, f23; +sub.f32 f25, %10, %13; +mul.f32 f26, f25, 0fBF5DB3D7; +sub.f32 f27, f24, f26; +add.f32 f28, f26, f24; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 36, r3; +mul.wide.u32 rd4, r7, 8; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f32 {f29, f30}, [rd6]; +mul.f32 f33, f27, f30; +fma.rn.f32 f34, f29, f21, f33; +mul.f32 f35, f21, f30; +mul.f32 f36, f29, f27; +sub.f32 f37, f36, f35; +mul.f32 f38, f29, f29; +mul.f32 f39, f30, f30; +sub.f32 f40, f38, f39; +mul.f32 f41, f30, f29; +fma.rn.f32 f42, f30, f29, f41; +mul.f32 f43, f28, f42; +fma.rn.f32 f44, f40, f22, f43; +mul.f32 f45, f22, f42; +mul.f32 f46, f40, f28; +sub.f32 f47, f46, f45; +barrier.sync 0; +mad.lo.s32 r9, r7, 12, r8; +st.shared.f32 [r9], f14; +st.shared.f32 [r9+4], f34; +st.shared.f32 [r9+8], f44; +barrier.sync 0; +shl.b32 r10, r7, 3; +sub.s32 r11, r9, r10; +ld.shared.f32 f48, [r11]; +ld.shared.f32 f49, [r11+12]; +ld.shared.f32 f50, [r11+24]; +barrier.sync 0; +st.shared.f32 [r9], f16; +st.shared.f32 [r9+4], f37; +st.shared.f32 [r9+8], f47; +barrier.sync 0; +ld.shared.f32 f51, [r11]; +ld.shared.f32 f52, [r11+12]; +ld.shared.f32 f53, [r11+24]; +add.f32 f54, f49, f50; +add.f32 f55, f52, f53; +mul.f32 f56, f54, 0f3F000000; +sub.f32 f57, f48, f56; +sub.f32 f58, f52, f53; +mul.f32 f59, f58, 0fBF5DB3D7; +mul.f32 f60, f55, 0f3F000000; +sub.f32 f61, f51, f60; +sub.f32 f62, f49, f50; +mul.f32 f63, f62, 0fBF5DB3D7; +add.f32 %0, f48, f54; +add.f32 %1, f51, f55; +add.f32 %2, f59, f57; +sub.f32 %3, f61, f63; +sub.f32 %4, f57, f59; +add.f32 %5, f63, f61; +})" + : "=f"(rmem[0].x), "=f"(rmem[0].y), "=f"(rmem[1].x), "=f"(rmem[1].y), "=f"(rmem[2].x), 
"=f"(rmem[2].y): "r"(smem), "l"(lut_sp_3_9), "f"(rmem[0].x), "f"(rmem[0].y), "f"(rmem[1].x), "f"(rmem[1].y), "f"(rmem[1].y), "f"(rmem[2].x), "f"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..6f250bd32a298 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_fwd.hpp.inc @@ -0,0 +1,302 @@ +#ifndef CUFFTDX_FFT_9_FP64_FWD_PTX_HPP +#define CUFFTDX_FFT_9_FP64_FWD_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<501, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<153>; +.reg .b64 rd<2>; +add.f64 fd37, %26, %34; +add.f64 fd38, %18, fd37; +add.f64 fd39, %27, %35; +add.f64 fd40, %19, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %18, fd41; +sub.f64 fd43, %27, %35; +mul.f64 fd44, fd43, 0d3FEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %19, fd47; +sub.f64 fd49, %26, %34; +mul.f64 fd50, fd49, 0d3FEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %28, %36; +add.f64 fd54, %20, fd53; +add.f64 fd55, %30, %38; +add.f64 fd56, %22, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %20, fd57; +sub.f64 fd59, %30, %38; +mul.f64 fd60, fd59, 0d3FEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 0d3FE0000000000000; +sub.f64 fd64, %22, fd63; +sub.f64 fd65, %28, %36; +mul.f64 fd66, fd65, 0d3FEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %31, %39; +add.f64 fd70, %23, fd69; +add.f64 fd71, %33, %40; +add.f64 fd72, %25, fd71; +mul.f64 
fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %23, fd73; +sub.f64 fd75, %33, %40; +mul.f64 fd76, fd75, 0d3FEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %25, fd79; +sub.f64 fd81, %31, %39; +mul.f64 fd82, fd81, 0d3FEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0dBFE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0dBFE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0dBFEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0dBFEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0dBFEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0dBFEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0dBFD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0dBFD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0d3FEBB67AE8584CAA; +mul.f64 fd111, fd106, 0d3FE0000000000000; +sub.f64 fd112, fd40, fd111; +sub.f64 fd113, fd54, fd70; +mul.f64 fd114, fd113, 0d3FEBB67AE8584CAA; +add.f64 fd115, fd87, fd92; +add.f64 fd116, fd89, fd94; +mul.f64 fd117, fd115, 0d3FE0000000000000; +sub.f64 fd118, fd45, fd117; +sub.f64 fd119, fd89, fd94; +mul.f64 fd120, fd119, 0d3FEBB67AE8584CAA; +mul.f64 fd121, fd116, 0d3FE0000000000000; +sub.f64 fd122, fd51, fd121; +sub.f64 fd123, fd87, fd92; +mul.f64 fd124, fd123, 0d3FEBB67AE8584CAA; +add.f64 fd125, fd97, fd102; +add.f64 fd126, fd99, fd104; +mul.f64 fd127, fd125, 0d3FE0000000000000; +sub.f64 fd128, fd46, fd127; +sub.f64 
fd129, fd99, fd104; +mul.f64 fd130, fd129, 0d3FEBB67AE8584CAA; +mul.f64 fd131, fd126, 0d3FE0000000000000; +sub.f64 fd132, fd52, fd131; +sub.f64 fd133, fd97, fd102; +mul.f64 fd134, fd133, 0d3FEBB67AE8584CAA; +add.f64 %1, fd40, fd106; +add.f64 %0, fd38, fd105; +add.f64 %3, fd51, fd116; +add.f64 %2, fd45, fd115; +add.f64 %5, fd52, fd126; +add.f64 %4, fd46, fd125; +sub.f64 %7, fd112, fd114; +add.f64 %6, fd110, fd108; +sub.f64 %9, fd122, fd124; +add.f64 %8, fd120, fd118; +sub.f64 %11, fd132, fd134; +add.f64 %10, fd130, fd128; +add.f64 %13, fd114, fd112; +sub.f64 %12, fd108, fd110; +add.f64 %15, fd124, fd122; +sub.f64 %14, fd118, fd120; +add.f64 %17, fd134, fd132; +sub.f64 %16, fd128, fd130; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<502, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<69>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 72, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %10, %13; +add.f64 fd14, %8, fd13; +add.f64 fd15, %12, %14; +add.f64 fd16, %9, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %8, fd17; +sub.f64 fd19, %12, %14; +mul.f64 fd20, fd19, 0d3FEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 
0d3FE0000000000000; +sub.f64 fd24, %9, fd23; +sub.f64 fd25, %10, %13; +mul.f64 fd26, fd25, 0d3FEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 72, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd29, fd21; +mul.f64 fd34, fd30, fd27; +sub.f64 fd35, fd33, fd34; +mul.f64 fd36, fd29, fd27; +fma.rn.f64 fd37, fd30, fd21, fd36; +ld.global.v2.f64 {fd38, fd39}, [rd6+48]; +mul.f64 fd42, fd38, fd22; +mul.f64 fd43, fd39, fd28; +sub.f64 fd44, fd42, fd43; +mul.f64 fd45, fd38, fd28; +fma.rn.f64 fd46, fd39, fd22, fd45; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd35; +st.shared.f64 [r9+16], fd44; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+24]; +ld.shared.f64 fd49, [r11+48]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+24]; +ld.shared.f64 fd52, [r11+48]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd51, fd52; +mul.f64 fd55, fd53, 0d3FE0000000000000; +sub.f64 fd56, fd47, fd55; +sub.f64 fd57, fd51, fd52; +mul.f64 fd58, fd57, 0d3FEBB67AE8584CAA; +mul.f64 fd59, fd54, 0d3FE0000000000000; +sub.f64 fd60, fd50, fd59; +sub.f64 fd61, fd48, fd49; +mul.f64 fd62, fd61, 0d3FEBB67AE8584CAA; +add.f64 %0, fd47, fd53; +add.f64 %1, fd50, fd54; +add.f64 %2, fd58, fd56; +sub.f64 %3, fd60, fd62; +sub.f64 %4, fd56, fd58; +add.f64 %5, fd62, fd60; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> 
__forceinline__ __device__ void cufftdx_private_function<503, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<75>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 144, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %10, %13; +add.f64 fd14, %12, %14; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %8, fd15; +sub.f64 fd17, %12, %14; +mul.f64 fd18, fd17, 0d3FEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %9, fd21; +sub.f64 fd23, %10, %13; +mul.f64 fd24, fd23, 0d3FEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 144, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd27, fd19; +mul.f64 fd32, fd28, fd25; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+48]; +mul.f64 fd38, fd34, fd20; +mul.f64 fd39, fd35, fd26; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %9, fd14; +add.f64 fd42, %8, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd28, fd19, fd33; +sub.f64 fd44, fd31, fd32; +st.shared.v2.f64 [r9+16], {fd44, fd43}; +fma.rn.f64 fd45, fd35, fd20, fd40; +sub.f64 fd46, fd38, fd39; +st.shared.v2.f64 [r9+32], {fd46, fd45}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+48]; +ld.shared.v2.f64 {fd55, fd56}, [r11+96]; +add.f64 fd59, fd51, fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0d3FEBB67AE8584CAA; +mul.f64 fd65, fd60, 0d3FE0000000000000; +sub.f64 fd66, fd48, fd65; +sub.f64 fd67, fd51, fd55; +mul.f64 fd68, 
fd67, 0d3FEBB67AE8584CAA; +add.f64 %1, fd48, fd60; +add.f64 %0, fd47, fd59; +sub.f64 %3, fd66, fd68; +add.f64 %2, fd64, fd62; +add.f64 %5, fd68, fd66; +sub.f64 %4, fd62, fd64; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..f87a02bcbfcd2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/fft_9_fp64_inv.hpp.inc @@ -0,0 +1,302 @@ +#ifndef CUFFTDX_FFT_9_FP64_INV_PTX_HPP +#define CUFFTDX_FFT_9_FP64_INV_PTX_HPP + + + +template<> __forceinline__ __device__ void cufftdx_private_function<672, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .f64 fd<153>; +.reg .b64 rd<2>; +add.f64 fd37, %26, %34; +add.f64 fd38, %18, fd37; +add.f64 fd39, %27, %35; +add.f64 fd40, %19, fd39; +mul.f64 fd41, fd37, 0d3FE0000000000000; +sub.f64 fd42, %18, fd41; +sub.f64 fd43, %27, %35; +mul.f64 fd44, fd43, 0dBFEBB67AE8584CAA; +add.f64 fd45, fd44, fd42; +sub.f64 fd46, fd42, fd44; +mul.f64 fd47, fd39, 0d3FE0000000000000; +sub.f64 fd48, %19, fd47; +sub.f64 fd49, %26, %34; +mul.f64 fd50, fd49, 0dBFEBB67AE8584CAA; +sub.f64 fd51, fd48, fd50; +add.f64 fd52, fd50, fd48; +add.f64 fd53, %28, %36; +add.f64 fd54, %20, fd53; +add.f64 fd55, %30, %38; +add.f64 fd56, %22, fd55; +mul.f64 fd57, fd53, 0d3FE0000000000000; +sub.f64 fd58, %20, fd57; +sub.f64 fd59, %30, %38; +mul.f64 fd60, fd59, 0dBFEBB67AE8584CAA; +add.f64 fd61, fd60, fd58; +sub.f64 fd62, fd58, fd60; +mul.f64 fd63, fd55, 
0d3FE0000000000000; +sub.f64 fd64, %22, fd63; +sub.f64 fd65, %28, %36; +mul.f64 fd66, fd65, 0dBFEBB67AE8584CAA; +sub.f64 fd67, fd64, fd66; +add.f64 fd68, fd66, fd64; +add.f64 fd69, %31, %39; +add.f64 fd70, %23, fd69; +add.f64 fd71, %33, %40; +add.f64 fd72, %25, fd71; +mul.f64 fd73, fd69, 0d3FE0000000000000; +sub.f64 fd74, %23, fd73; +sub.f64 fd75, %33, %40; +mul.f64 fd76, fd75, 0dBFEBB67AE8584CAA; +add.f64 fd77, fd76, fd74; +sub.f64 fd78, fd74, fd76; +mul.f64 fd79, fd71, 0d3FE0000000000000; +sub.f64 fd80, %25, fd79; +sub.f64 fd81, %31, %39; +mul.f64 fd82, fd81, 0dBFEBB67AE8584CAA; +sub.f64 fd83, fd80, fd82; +add.f64 fd84, fd82, fd80; +mul.f64 fd85, fd61, 0d3FE8836FA2CF5039; +mul.f64 fd86, fd67, 0d3FE491B7523C161D; +sub.f64 fd87, fd85, fd86; +mul.f64 fd88, fd67, 0d3FE8836FA2CF5039; +fma.rn.f64 fd89, fd61, 0d3FE491B7523C161D, fd88; +mul.f64 fd90, fd77, 0d3FC63A1A7E0B738A; +mul.f64 fd91, fd83, 0d3FEF838B8C811C17; +sub.f64 fd92, fd90, fd91; +mul.f64 fd93, fd83, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd94, fd77, 0d3FEF838B8C811C17, fd93; +mul.f64 fd95, fd62, 0d3FC63A1A7E0B738A; +mul.f64 fd96, fd68, 0d3FEF838B8C811C17; +sub.f64 fd97, fd95, fd96; +mul.f64 fd98, fd68, 0d3FC63A1A7E0B738A; +fma.rn.f64 fd99, fd62, 0d3FEF838B8C811C17, fd98; +mul.f64 fd100, fd78, 0dBFEE11F642522D1C; +mul.f64 fd101, fd84, 0d3FD5E3A8748A0BF5; +sub.f64 fd102, fd100, fd101; +mul.f64 fd103, fd84, 0dBFEE11F642522D1C; +fma.rn.f64 fd104, fd78, 0d3FD5E3A8748A0BF5, fd103; +add.f64 fd105, fd54, fd70; +add.f64 fd106, fd56, fd72; +mul.f64 fd107, fd105, 0d3FE0000000000000; +sub.f64 fd108, fd38, fd107; +sub.f64 fd109, fd56, fd72; +mul.f64 fd110, fd109, 0dBFEBB67AE8584CAA; +mul.f64 fd111, fd106, 0d3FE0000000000000; +sub.f64 fd112, fd40, fd111; +sub.f64 fd113, fd54, fd70; +mul.f64 fd114, fd113, 0dBFEBB67AE8584CAA; +add.f64 fd115, fd87, fd92; +add.f64 fd116, fd89, fd94; +mul.f64 fd117, fd115, 0d3FE0000000000000; +sub.f64 fd118, fd45, fd117; +sub.f64 fd119, fd89, fd94; +mul.f64 fd120, fd119, 0dBFEBB67AE8584CAA; +mul.f64 
fd121, fd116, 0d3FE0000000000000; +sub.f64 fd122, fd51, fd121; +sub.f64 fd123, fd87, fd92; +mul.f64 fd124, fd123, 0dBFEBB67AE8584CAA; +add.f64 fd125, fd97, fd102; +add.f64 fd126, fd99, fd104; +mul.f64 fd127, fd125, 0d3FE0000000000000; +sub.f64 fd128, fd46, fd127; +sub.f64 fd129, fd99, fd104; +mul.f64 fd130, fd129, 0dBFEBB67AE8584CAA; +mul.f64 fd131, fd126, 0d3FE0000000000000; +sub.f64 fd132, fd52, fd131; +sub.f64 fd133, fd97, fd102; +mul.f64 fd134, fd133, 0dBFEBB67AE8584CAA; +add.f64 %1, fd40, fd106; +add.f64 %0, fd38, fd105; +add.f64 %3, fd51, fd116; +add.f64 %2, fd45, fd115; +add.f64 %5, fd52, fd126; +add.f64 %4, fd46, fd125; +sub.f64 %7, fd112, fd114; +add.f64 %6, fd110, fd108; +sub.f64 %9, fd122, fd124; +add.f64 %8, fd120, fd118; +sub.f64 %11, fd132, fd134; +add.f64 %10, fd130, fd128; +add.f64 %13, fd114, fd112; +sub.f64 %12, fd108, fd110; +add.f64 %15, fd124, fd122; +sub.f64 %14, fd118, fd120; +add.f64 %17, fd134, fd132; +sub.f64 %16, fd128, fd130; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y), "=d"(rmem[3].x), "=d"(rmem[3].y), "=d"(rmem[4].x), "=d"(rmem[4].y), "=d"(rmem[5].x), "=d"(rmem[5].y), "=d"(rmem[6].x), "=d"(rmem[6].y), "=d"(rmem[7].x), "=d"(rmem[7].y), "=d"(rmem[8].x), "=d"(rmem[8].y): "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y), "d"(rmem[2].y), "d"(rmem[3].x), "d"(rmem[3].y), "d"(rmem[4].x), "d"(rmem[4].y), "d"(rmem[4].y), "d"(rmem[5].x), "d"(rmem[5].y), "d"(rmem[5].y), "d"(rmem[6].x), "d"(rmem[6].y), "d"(rmem[7].x), "d"(rmem[7].y), "d"(rmem[7].y), "d"(rmem[8].x), "d"(rmem[8].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<673, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<69>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 72, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %10, %13; +add.f64 
fd14, %8, fd13; +add.f64 fd15, %12, %14; +add.f64 fd16, %9, fd15; +mul.f64 fd17, fd13, 0d3FE0000000000000; +sub.f64 fd18, %8, fd17; +sub.f64 fd19, %12, %14; +mul.f64 fd20, fd19, 0dBFEBB67AE8584CAA; +add.f64 fd21, fd20, fd18; +sub.f64 fd22, fd18, fd20; +mul.f64 fd23, fd15, 0d3FE0000000000000; +sub.f64 fd24, %9, fd23; +sub.f64 fd25, %10, %13; +mul.f64 fd26, fd25, 0dBFEBB67AE8584CAA; +sub.f64 fd27, fd24, fd26; +add.f64 fd28, fd26, fd24; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 72, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd29, fd30}, [rd6]; +mul.f64 fd33, fd27, fd30; +fma.rn.f64 fd34, fd29, fd21, fd33; +mul.f64 fd35, fd21, fd30; +mul.f64 fd36, fd29, fd27; +sub.f64 fd37, fd36, fd35; +ld.global.v2.f64 {fd38, fd39}, [rd6+48]; +mul.f64 fd42, fd28, fd39; +fma.rn.f64 fd43, fd38, fd22, fd42; +mul.f64 fd44, fd22, fd39; +mul.f64 fd45, fd38, fd28; +sub.f64 fd46, fd45, fd44; +barrier.sync 0; +mad.lo.s32 r9, r7, 24, r8; +st.shared.f64 [r9], fd14; +st.shared.f64 [r9+8], fd34; +st.shared.f64 [r9+16], fd43; +barrier.sync 0; +shl.b32 r10, r7, 4; +sub.s32 r11, r9, r10; +ld.shared.f64 fd47, [r11]; +ld.shared.f64 fd48, [r11+24]; +ld.shared.f64 fd49, [r11+48]; +barrier.sync 0; +st.shared.f64 [r9], fd16; +st.shared.f64 [r9+8], fd37; +st.shared.f64 [r9+16], fd46; +barrier.sync 0; +ld.shared.f64 fd50, [r11]; +ld.shared.f64 fd51, [r11+24]; +ld.shared.f64 fd52, [r11+48]; +add.f64 fd53, fd48, fd49; +add.f64 fd54, fd51, fd52; +mul.f64 fd55, fd53, 0d3FE0000000000000; +sub.f64 fd56, fd47, fd55; +sub.f64 fd57, fd51, fd52; +mul.f64 fd58, fd57, 0dBFEBB67AE8584CAA; +mul.f64 fd59, fd54, 0d3FE0000000000000; +sub.f64 fd60, fd50, fd59; +sub.f64 fd61, fd48, fd49; +mul.f64 fd62, fd61, 0dBFEBB67AE8584CAA; +add.f64 %0, fd47, fd53; +add.f64 %1, fd50, fd54; +add.f64 %2, fd58, fd56; +sub.f64 %3, fd60, fd62; +sub.f64 %4, fd56, fd58; +add.f64 %5, fd62, fd60; 
+})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + + + +template<> __forceinline__ __device__ void cufftdx_private_function<674, double, 1>(cufftdx::detail::complex *rmem, unsigned smem){ + +asm volatile (R"({ +.reg .b32 r<12>; +.reg .f64 fd<75>; +.reg .b64 rd<7>; +mov.u32 r1, %tid.y; +mov.u32 r2, %6; +mad.lo.s32 r3, r1, 144, r2; +mov.u32 r4, %tid.x; +add.f64 fd13, %10, %13; +add.f64 fd14, %12, %14; +mul.f64 fd15, fd13, 0d3FE0000000000000; +sub.f64 fd16, %8, fd15; +sub.f64 fd17, %12, %14; +mul.f64 fd18, fd17, 0dBFEBB67AE8584CAA; +add.f64 fd19, fd18, fd16; +sub.f64 fd20, fd16, fd18; +mul.f64 fd21, fd14, 0d3FE0000000000000; +sub.f64 fd22, %9, fd21; +sub.f64 fd23, %10, %13; +mul.f64 fd24, fd23, 0dBFEBB67AE8584CAA; +sub.f64 fd25, fd22, fd24; +add.f64 fd26, fd24, fd22; +mul.wide.u32 rd2, r4, -1431655765; +shr.u64 rd3, rd2, 33; +cvt.u32.u64 r5, rd3; +mul.lo.s32 r6, r5, 3; +sub.s32 r7, r4, r6; +mad.lo.s32 r8, r5, 144, r3; +mul.wide.u32 rd4, r7, 16; +mov.u64 rd5, %7; +add.s64 rd6, rd5, rd4; +ld.global.v2.f64 {fd27, fd28}, [rd6]; +mul.f64 fd31, fd25, fd28; +mul.f64 fd32, fd19, fd28; +mul.f64 fd33, fd27, fd25; +ld.global.v2.f64 {fd34, fd35}, [rd6+48]; +mul.f64 fd38, fd26, fd35; +mul.f64 fd39, fd20, fd35; +mul.f64 fd40, fd34, fd26; +barrier.sync 0; +mad.lo.s32 r9, r7, 48, r8; +add.f64 fd41, %9, fd14; +add.f64 fd42, %8, fd13; +st.shared.v2.f64 [r9], {fd42, fd41}; +fma.rn.f64 fd43, fd27, fd19, fd31; +sub.f64 fd44, fd33, fd32; +st.shared.v2.f64 [r9+16], {fd43, fd44}; +fma.rn.f64 fd45, fd34, fd20, fd38; +sub.f64 fd46, fd40, fd39; +st.shared.v2.f64 [r9+32], {fd45, fd46}; +barrier.sync 0; +shl.b32 r10, r7, 5; +sub.s32 r11, r9, r10; +ld.shared.v2.f64 {fd47, fd48}, [r11]; +ld.shared.v2.f64 {fd51, fd52}, [r11+48]; +ld.shared.v2.f64 {fd55, fd56}, [r11+96]; +add.f64 fd59, fd51, 
fd55; +add.f64 fd60, fd52, fd56; +mul.f64 fd61, fd59, 0d3FE0000000000000; +sub.f64 fd62, fd47, fd61; +sub.f64 fd63, fd52, fd56; +mul.f64 fd64, fd63, 0dBFEBB67AE8584CAA; +mul.f64 fd65, fd60, 0d3FE0000000000000; +sub.f64 fd66, fd48, fd65; +sub.f64 fd67, fd51, fd55; +mul.f64 fd68, fd67, 0dBFEBB67AE8584CAA; +add.f64 %1, fd48, fd60; +add.f64 %0, fd47, fd59; +sub.f64 %3, fd66, fd68; +add.f64 %2, fd64, fd62; +add.f64 %5, fd68, fd66; +sub.f64 %4, fd62, fd64; +})" + : "=d"(rmem[0].x), "=d"(rmem[0].y), "=d"(rmem[1].x), "=d"(rmem[1].y), "=d"(rmem[2].x), "=d"(rmem[2].y): "r"(smem), "l"(lut_dp_3_9), "d"(rmem[0].x), "d"(rmem[0].y), "d"(rmem[1].x), "d"(rmem[1].y), "d"(rmem[1].y), "d"(rmem[2].x), "d"(rmem[2].y)); +}; + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines.hpp.inc new file mode 100644 index 0000000000000..aaf04eef851d8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines.hpp.inc @@ -0,0 +1,23 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#ifndef CUFFTDX_DATABASE_LUT_DEFINES_INC_HPP + +#define CUFFTDX_DATABASE_LUT_DEFINES_INC_HPP + + + + + +#include "lut_defines_0.hpp.inc" + + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines_0.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines_0.hpp.inc new file mode 100644 index 0000000000000..fedffa1c41297 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_defines_0.hpp.inc @@ -0,0 +1,16311 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +// Pre-computed twiddles for N=2 +#define T_2_0 {1.0,0.0} +#define T_2_1 {-1.0,0.0} +// Pre-computed twiddles for N=3 +#define T_3_1 {-0.5,-0.8660254037844385965883020617184229195118} +#define T_3_2 {-0.5,0.8660254037844385965883020617184229195118} +// Pre-computed twiddles for N=4 +#define T_4_1 {0.0,-1.0} +#define T_4_3 {-0.0,1.0} +// Pre-computed twiddles for N=5 +#define T_5_1 {0.3090169943749474512628694355953484773636,-0.9510565162951535311819384332920890301466} +#define T_5_2 {-0.8090169943749474512628694355953484773636,-0.5877852522924731371034567928290925920010} +#define T_5_3 {-0.8090169943749474512628694355953484773636,0.5877852522924731371034567928290925920010} +#define T_5_4 {0.3090169943749474512628694355953484773636,0.9510565162951535311819384332920890301466} +// Pre-computed twiddles for N=6 +#define T_6_1 {0.5,-0.8660254037844385965883020617184229195118} +#define T_6_5 {0.5,0.8660254037844385965883020617184229195118} +// Pre-computed twiddles for N=7 +#define T_7_1 {0.6234898018587334833640056785952765494585,-0.7818314824680298036341241640911903232336} +#define T_7_2 {-0.2225209339563143928764077372761676087976,-0.9749279121818236193419693336181808263063} +#define T_7_3 {-0.9009688679024191459987491725769359618425,-0.4338837391175581204016964420588919892907} +#define T_7_4 {-0.9009688679024191459987491725769359618425,0.4338837391175581204016964420588919892907} +#define T_7_5 {-0.2225209339563143928764077372761676087976,0.9749279121818236193419693336181808263063} +#define T_7_6 {0.6234898018587334833640056785952765494585,0.7818314824680298036341241640911903232336} +// Pre-computed twiddles for N=8 +#define T_8_1 {0.7071067811865475727373109293694142252207,-0.7071067811865475727373109293694142252207} +#define T_8_3 {-0.7071067811865475727373109293694142252207,-0.7071067811865475727373109293694142252207} +#define T_8_5 {-0.7071067811865475727373109293694142252207,0.7071067811865475727373109293694142252207} +#define T_8_7 
{0.7071067811865475727373109293694142252207,0.7071067811865475727373109293694142252207} +// Pre-computed twiddles for N=9 +#define T_9_1 {0.7660444431189780134516809084743726998568,-0.6427876096865393629187224178167525678873} +#define T_9_2 {0.1736481776669303589422099776129471138120,-0.9848077530122080203156542665965389460325} +#define T_9_4 {-0.9396926207859084279050421173451468348503,-0.3420201433256687129080830800376133993268} +#define T_9_5 {-0.9396926207859084279050421173451468348503,0.3420201433256687129080830800376133993268} +#define T_9_7 {0.1736481776669303589422099776129471138120,0.9848077530122080203156542665965389460325} +#define T_9_8 {0.7660444431189780134516809084743726998568,0.6427876096865393629187224178167525678873} +// Pre-computed twiddles for N=10 +#define T_10_1 {0.8090169943749474512628694355953484773636,-0.5877852522924731371034567928290925920010} +#define T_10_3 {-0.3090169943749474512628694355953484773636,-0.9510565162951535311819384332920890301466} +#define T_10_7 {-0.3090169943749474512628694355953484773636,0.9510565162951535311819384332920890301466} +#define T_10_9 {0.8090169943749474512628694355953484773636,0.5877852522924731371034567928290925920010} +// Pre-computed twiddles for N=11 +#define T_11_1 {0.8412535328311812055090967987780459225178,-0.5406408174555975554298470342473592609167} +#define T_11_2 {0.4154150130018864350844864929968025535345,-0.9096319953545183301102383666147943586111} +#define T_11_3 {-0.1423148382732851435772403192459023557603,-0.9898214418809326842207951813179533928633} +#define T_11_4 {-0.6548607339452851006100786435126792639494,-0.7557495743542582689045161714602727442980} +#define T_11_5 {-0.9594929736144973686506887133873533457518,-0.2817325568414296710351152341900160536170} +#define T_11_6 {-0.9594929736144973686506887133873533457518,0.2817325568414296710351152341900160536170} +#define T_11_7 {-0.6548607339452851006100786435126792639494,0.7557495743542582689045161714602727442980} +#define T_11_8 
{-0.1423148382732851435772403192459023557603,0.9898214418809326842207951813179533928633} +#define T_11_9 {0.4154150130018864350844864929968025535345,0.9096319953545183301102383666147943586111} +#define T_11_10 {0.8412535328311812055090967987780459225178,0.5406408174555975554298470342473592609167} +// Pre-computed twiddles for N=12 +#define T_12_1 {0.8660254037844385965883020617184229195118,-0.5} +#define T_12_5 {-0.8660254037844385965883020617184229195118,-0.5} +#define T_12_7 {-0.8660254037844385965883020617184229195118,0.5} +#define T_12_11 {0.8660254037844385965883020617184229195118,0.5} +// Pre-computed twiddles for N=13 +#define T_13_1 {0.8854560256532099105086786039464641362429,-0.4647231720437685620339607339701615273952} +#define T_13_2 {0.5680647467311558118652214943722356110811,-0.8229838658936563522416918203816749155521} +#define T_13_3 {0.1205366802553230476391377123945858329535,-0.9927088740980539727232212499075103551149} +#define T_13_4 {-0.3546048870425356214219902994955191388726,-0.9350162426854148334243177487223874777555} +#define T_13_5 {-0.7485107481711010812830409122398123145103,-0.6631226582407951930520084715681150555611} +#define T_13_6 {-0.9709418174260520117968553677201271057129,-0.2393156642875577666540465315847541205585} +#define T_13_7 {-0.9709418174260520117968553677201271057129,0.2393156642875577666540465315847541205585} +#define T_13_8 {-0.7485107481711010812830409122398123145103,0.6631226582407951930520084715681150555611} +#define T_13_9 {-0.3546048870425356214219902994955191388726,0.9350162426854148334243177487223874777555} +#define T_13_10 {0.1205366802553230476391377123945858329535,0.9927088740980539727232212499075103551149} +#define T_13_11 {0.5680647467311558118652214943722356110811,0.8229838658936563522416918203816749155521} +#define T_13_12 {0.8854560256532099105086786039464641362429,0.4647231720437685620339607339701615273952} +// Pre-computed twiddles for N=14 +#define T_14_1 
{0.9009688679024191459987491725769359618425,-0.4338837391175581204016964420588919892907} +#define T_14_3 {0.2225209339563143928764077372761676087976,-0.9749279121818236193419693336181808263063} +#define T_14_5 {-0.6234898018587334833640056785952765494585,-0.7818314824680298036341241640911903232336} +#define T_14_9 {-0.6234898018587334833640056785952765494585,0.7818314824680298036341241640911903232336} +#define T_14_11 {0.2225209339563143928764077372761676087976,0.9749279121818236193419693336181808263063} +#define T_14_13 {0.9009688679024191459987491725769359618425,0.4338837391175581204016964420588919892907} +// Pre-computed twiddles for N=15 +#define T_15_1 {0.9135454576426008665990252666233573108912,-0.4067366430758002082690438783174613490701} +#define T_15_2 {0.6691306063588582375700752891134470701218,-0.7431448254773942441175904605188407003880} +#define T_15_4 {-0.1045284632676534708473070622858358547091,-0.9945218953682732898613494398887269198895} +#define T_15_7 {-0.9781476007338056888329447247087955474854,-0.2079116908177593425754992040310753509402} +#define T_15_8 {-0.9781476007338056888329447247087955474854,0.2079116908177593425754992040310753509402} +#define T_15_11 {-0.1045284632676534708473070622858358547091,0.9945218953682732898613494398887269198895} +#define T_15_13 {0.6691306063588582375700752891134470701218,0.7431448254773942441175904605188407003880} +#define T_15_14 {0.9135454576426008665990252666233573108912,0.4067366430758002082690438783174613490701} +// Pre-computed twiddles for N=16 +#define T_16_1 {0.9238795325112867384831361050601117312908,-0.3826834323650897817792326804919866845012} +#define T_16_3 {0.3826834323650897817792326804919866845012,-0.9238795325112867384831361050601117312908} +#define T_16_5 {-0.3826834323650897817792326804919866845012,-0.9238795325112867384831361050601117312908} +#define T_16_7 {-0.9238795325112867384831361050601117312908,-0.3826834323650897817792326804919866845012} +#define T_16_9 
{-0.9238795325112867384831361050601117312908,0.3826834323650897817792326804919866845012} +#define T_16_11 {-0.3826834323650897817792326804919866845012,0.9238795325112867384831361050601117312908} +#define T_16_13 {0.3826834323650897817792326804919866845012,0.9238795325112867384831361050601117312908} +#define T_16_15 {0.9238795325112867384831361050601117312908,0.3826834323650897817792326804919866845012} +// Pre-computed twiddles for N=17 +#define T_17_1 {0.9324722294043558123988191255193669348955,-0.3612416661871529210969811174436472356319} +#define T_17_2 {0.7390089172206590895086719683604314923286,-0.6736956436465572073402086061832960695028} +#define T_17_3 {0.4457383557765382531279385602829279378057,-0.8951632913550623404930206561402883380651} +#define T_17_4 {0.0922683594633020021102964847159455530345,-0.9957341762950344676852409975253976881504} +#define T_17_5 {-0.2736629900720828900695380525576183572412,-0.9618256431728190447572046650748234242201} +#define T_17_6 {-0.6026346363792564053696310111263301223516,-0.7980172272802394939361647629993967711926} +#define T_17_7 {-0.8502171357296141041004489125043619424105,-0.5264321628773558359171147458255290985107} +#define T_17_8 {-0.9829730996839017853616837783192750066519,-0.1837495178165703402228814411500934511423} +#define T_17_9 {-0.9829730996839017853616837783192750066519,0.1837495178165703402228814411500934511423} +#define T_17_10 {-0.8502171357296141041004489125043619424105,0.5264321628773558359171147458255290985107} +#define T_17_11 {-0.6026346363792564053696310111263301223516,0.7980172272802394939361647629993967711926} +#define T_17_12 {-0.2736629900720828900695380525576183572412,0.9618256431728190447572046650748234242201} +#define T_17_13 {0.0922683594633020021102964847159455530345,0.9957341762950344676852409975253976881504} +#define T_17_14 {0.4457383557765382531279385602829279378057,0.8951632913550623404930206561402883380651} +#define T_17_15 
{0.7390089172206590895086719683604314923286,0.6736956436465572073402086061832960695028} +#define T_17_16 {0.9324722294043558123988191255193669348955,0.3612416661871529210969811174436472356319} +// Pre-computed twiddles for N=18 +#define T_18_1 {0.9396926207859084279050421173451468348503,-0.3420201433256687129080830800376133993268} +#define T_18_5 {-0.1736481776669303589422099776129471138120,-0.9848077530122080203156542665965389460325} +#define T_18_7 {-0.7660444431189780134516809084743726998568,-0.6427876096865393629187224178167525678873} +#define T_18_11 {-0.7660444431189780134516809084743726998568,0.6427876096865393629187224178167525678873} +#define T_18_13 {-0.1736481776669303589422099776129471138120,0.9848077530122080203156542665965389460325} +#define T_18_17 {0.9396926207859084279050421173451468348503,0.3420201433256687129080830800376133993268} +// Pre-computed twiddles for N=19 +#define T_19_1 {0.9458172417006346366363800370891112834215,-0.3246994692046835107035462897329125553370} +#define T_19_2 {0.7891405093963935746259608094987925142050,-0.6142127126896678168677112807927187532187} +#define T_19_3 {0.5469481581224269195473652871442027390003,-0.8371664782625285461392650177003815770149} +#define T_19_4 {0.2454854871407991523390990096231689676642,-0.9694002659393303744650438602548092603683} +#define T_19_5 {-0.0825793454723323244737187565078784245998,-0.9965844930066698470483288474497385323048} +#define T_19_6 {-0.4016954246529694794176634786708746105433,-0.9157733266550573958397762908134609460831} +#define T_19_7 {-0.6772815716257410967315877314831595867872,-0.7357239106731315869680543073627632111311} +#define T_19_8 {-0.8794737512064890738727740426838863641024,-0.4759473930370735628159195584885310381651} +#define T_19_9 {-0.9863613034027223225308489418239332735538,-0.1645945902807338934170644506593816913664} +#define T_19_10 {-0.9863613034027223225308489418239332735538,0.1645945902807338934170644506593816913664} +#define T_19_11 
{-0.8794737512064890738727740426838863641024,0.4759473930370735628159195584885310381651} +#define T_19_12 {-0.6772815716257410967315877314831595867872,0.7357239106731315869680543073627632111311} +#define T_19_13 {-0.4016954246529694794176634786708746105433,0.9157733266550573958397762908134609460831} +#define T_19_14 {-0.0825793454723323244737187565078784245998,0.9965844930066698470483288474497385323048} +#define T_19_15 {0.2454854871407991523390990096231689676642,0.9694002659393303744650438602548092603683} +#define T_19_16 {0.5469481581224269195473652871442027390003,0.8371664782625285461392650177003815770149} +#define T_19_17 {0.7891405093963935746259608094987925142050,0.6142127126896678168677112807927187532187} +#define T_19_18 {0.9458172417006346366363800370891112834215,0.3246994692046835107035462897329125553370} +// Pre-computed twiddles for N=20 +#define T_20_1 {0.9510565162951535311819384332920890301466,-0.3090169943749474512628694355953484773636} +#define T_20_3 {0.5877852522924731371034567928290925920010,-0.8090169943749474512628694355953484773636} +#define T_20_7 {-0.5877852522924731371034567928290925920010,-0.8090169943749474512628694355953484773636} +#define T_20_9 {-0.9510565162951535311819384332920890301466,-0.3090169943749474512628694355953484773636} +#define T_20_11 {-0.9510565162951535311819384332920890301466,0.3090169943749474512628694355953484773636} +#define T_20_13 {-0.5877852522924731371034567928290925920010,0.8090169943749474512628694355953484773636} +#define T_20_17 {0.5877852522924731371034567928290925920010,0.8090169943749474512628694355953484773636} +#define T_20_19 {0.9510565162951535311819384332920890301466,0.3090169943749474512628694355953484773636} +// Pre-computed twiddles for N=21 +#define T_21_1 {0.9555728057861406776751778124889824539423,-0.2947551744109042082264693362958496436477} +#define T_21_2 {0.8262387743159949060611779714236035943031,-0.5633200580636220644592526696214918047190} +#define T_21_4 
{0.3653410243663950396353357064072042703629,-0.9308737486442042463608004254638217389584} +#define T_21_5 {0.0747300935864242538153590089677891228348,-0.9972037971811801293497978804225567728281} +#define T_21_8 {-0.7330518718298263403099213064706418663263,-0.6801727377709193556043487660645041614771} +#define T_21_10 {-0.9888308262251285229993413850024808198214,-0.1490422661761744427266762613726314157248} +#define T_21_11 {-0.9888308262251285229993413850024808198214,0.1490422661761744427266762613726314157248} +#define T_21_13 {-0.7330518718298263403099213064706418663263,0.6801727377709193556043487660645041614771} +#define T_21_16 {0.0747300935864242538153590089677891228348,0.9972037971811801293497978804225567728281} +#define T_21_17 {0.3653410243663950396353357064072042703629,0.9308737486442042463608004254638217389584} +#define T_21_19 {0.8262387743159949060611779714236035943031,0.5633200580636220644592526696214918047190} +#define T_21_20 {0.9555728057861406776751778124889824539423,0.2947551744109042082264693362958496436477} +// Pre-computed twiddles for N=22 +#define T_22_1 {0.9594929736144973686506887133873533457518,-0.2817325568414296710351152341900160536170} +#define T_22_3 {0.6548607339452851006100786435126792639494,-0.7557495743542582689045161714602727442980} +#define T_22_5 {0.1423148382732851435772403192459023557603,-0.9898214418809326842207951813179533928633} +#define T_22_7 {-0.4154150130018864350844864929968025535345,-0.9096319953545183301102383666147943586111} +#define T_22_9 {-0.8412535328311812055090967987780459225178,-0.5406408174555975554298470342473592609167} +#define T_22_13 {-0.8412535328311812055090967987780459225178,0.5406408174555975554298470342473592609167} +#define T_22_15 {-0.4154150130018864350844864929968025535345,0.9096319953545183301102383666147943586111} +#define T_22_17 {0.1423148382732851435772403192459023557603,0.9898214418809326842207951813179533928633} +#define T_22_19 
{0.6548607339452851006100786435126792639494,0.7557495743542582689045161714602727442980} +#define T_22_21 {0.9594929736144973686506887133873533457518,0.2817325568414296710351152341900160536170} +// Pre-computed twiddles for N=23 +#define T_23_1 {0.9629172873477992444790629633644130080938,-0.2697967711570242732577185051923152059317} +#define T_23_2 {0.8544194045464885700624790842994116246700,-0.5195839500354335616094658689689822494984} +#define T_23_3 {0.6825531432186541236362131712667178362608,-0.7308359642781241394260405286331661045551} +#define T_23_4 {0.4600650377311521488365997356595471501350,-0.8878852184023752203145818384655285626650} +#define T_23_5 {0.2034560130526337773382294926705071702600,-0.9790840876823229121939107244543265551329} +#define T_23_6 {-0.0682424133646709757394788198325841221958,-0.9976687691905391508484512996801640838385} +#define T_23_7 {-0.3348796121709861628445992209890391677618,-0.9422609221188205097519130504224449396133} +#define T_23_8 {-0.5766803221148671942941632551082875579596,-0.8169698930104419831721429545723367482424} +#define T_23_9 {-0.7757112907044197980255262336868327111006,-0.6310879443260527521530889316636603325605} +#define T_23_10 {-0.9172113015054530471559246507240459322929,-0.3984010898462414518306218269572127610445} +#define T_23_11 {-0.9906859460363307556818313059920910745859,-0.1361666490962465947411885736073600128293} +#define T_23_12 {-0.9906859460363307556818313059920910745859,0.1361666490962465947411885736073600128293} +#define T_23_13 {-0.9172113015054530471559246507240459322929,0.3984010898462414518306218269572127610445} +#define T_23_14 {-0.7757112907044197980255262336868327111006,0.6310879443260527521530889316636603325605} +#define T_23_15 {-0.5766803221148671942941632551082875579596,0.8169698930104419831721429545723367482424} +#define T_23_16 {-0.3348796121709861628445992209890391677618,0.9422609221188205097519130504224449396133} +#define T_23_17 
{-0.0682424133646709757394788198325841221958,0.9976687691905391508484512996801640838385} +#define T_23_18 {0.2034560130526337773382294926705071702600,0.9790840876823229121939107244543265551329} +#define T_23_19 {0.4600650377311521488365997356595471501350,0.8878852184023752203145818384655285626650} +#define T_23_20 {0.6825531432186541236362131712667178362608,0.7308359642781241394260405286331661045551} +#define T_23_21 {0.8544194045464885700624790842994116246700,0.5195839500354335616094658689689822494984} +#define T_23_22 {0.9629172873477992444790629633644130080938,0.2697967711570242732577185051923152059317} +// Pre-computed twiddles for N=24 +#define T_24_1 {0.9659258262890683122137147620378527790308,-0.2588190451025207394764038326684385538101} +#define T_24_5 {0.2588190451025207394764038326684385538101,-0.9659258262890683122137147620378527790308} +#define T_24_7 {-0.2588190451025207394764038326684385538101,-0.9659258262890683122137147620378527790308} +#define T_24_11 {-0.9659258262890683122137147620378527790308,-0.2588190451025207394764038326684385538101} +#define T_24_13 {-0.9659258262890683122137147620378527790308,0.2588190451025207394764038326684385538101} +#define T_24_17 {-0.2588190451025207394764038326684385538101,0.9659258262890683122137147620378527790308} +#define T_24_19 {0.2588190451025207394764038326684385538101,0.9659258262890683122137147620378527790308} +#define T_24_23 {0.9659258262890683122137147620378527790308,0.2588190451025207394764038326684385538101} +// Pre-computed twiddles for N=25 +#define T_25_1 {0.9685831611286310760533524444326758384705,-0.2486898871648547948431939857982797548175} +#define T_25_2 {0.8763066800438635839398671123490203171968,-0.4817536741017152679411594817793229594827} +#define T_25_3 {0.7289686274214115524472390461596660315990,-0.6845471059286887260952880751574411988258} +#define T_25_4 {0.5358267949789966566598309327673632651567,-0.8443279255020150753097141205216757953167} +#define T_25_6 
{0.0627905195293133738809743249476014170796,-0.9980267284282715589682766221812926232815} +#define T_25_7 {-0.1873813145857246287295794218152877874672,-0.9822872507286887211463977109815459698439} +#define T_25_8 {-0.4257792915650726595089281545369885861874,-0.9048270524660195768262838100781664252281} +#define T_25_9 {-0.6374239897486897454825793829513713717461,-0.7705132427757892532582673084107227623463} +#define T_25_11 {-0.9297764858882514582560929738974664360285,-0.3681245526846779747565108209528261795640} +#define T_25_12 {-0.9921147013144778759041741977853234857321,-0.1253332335643042583228634612169116735458} +#define T_25_13 {-0.9921147013144778759041741977853234857321,0.1253332335643042583228634612169116735458} +#define T_25_14 {-0.9297764858882514582560929738974664360285,0.3681245526846779747565108209528261795640} +#define T_25_16 {-0.6374239897486897454825793829513713717461,0.7705132427757892532582673084107227623463} +#define T_25_17 {-0.4257792915650726595089281545369885861874,0.9048270524660195768262838100781664252281} +#define T_25_18 {-0.1873813145857246287295794218152877874672,0.9822872507286887211463977109815459698439} +#define T_25_19 {0.0627905195293133738809743249476014170796,0.9980267284282715589682766221812926232815} +#define T_25_21 {0.5358267949789966566598309327673632651567,0.8443279255020150753097141205216757953167} +#define T_25_22 {0.7289686274214115524472390461596660315990,0.6845471059286887260952880751574411988258} +#define T_25_23 {0.8763066800438635839398671123490203171968,0.4817536741017152679411594817793229594827} +#define T_25_24 {0.9685831611286310760533524444326758384705,0.2486898871648547948431939857982797548175} +// Pre-computed twiddles for N=26 +#define T_26_1 {0.9709418174260520117968553677201271057129,-0.2393156642875577666540465315847541205585} +#define T_26_3 {0.7485107481711010812830409122398123145103,-0.6631226582407951930520084715681150555611} +#define T_26_5 
{0.3546048870425356214219902994955191388726,-0.9350162426854148334243177487223874777555} +#define T_26_7 {-0.1205366802553230476391377123945858329535,-0.9927088740980539727232212499075103551149} +#define T_26_9 {-0.5680647467311558118652214943722356110811,-0.8229838658936563522416918203816749155521} +#define T_26_11 {-0.8854560256532099105086786039464641362429,-0.4647231720437685620339607339701615273952} +#define T_26_15 {-0.8854560256532099105086786039464641362429,0.4647231720437685620339607339701615273952} +#define T_26_17 {-0.5680647467311558118652214943722356110811,0.8229838658936563522416918203816749155521} +#define T_26_19 {-0.1205366802553230476391377123945858329535,0.9927088740980539727232212499075103551149} +#define T_26_21 {0.3546048870425356214219902994955191388726,0.9350162426854148334243177487223874777555} +#define T_26_23 {0.7485107481711010812830409122398123145103,0.6631226582407951930520084715681150555611} +#define T_26_25 {0.9709418174260520117968553677201271057129,0.2393156642875577666540465315847541205585} +// Pre-computed twiddles for N=27 +#define T_27_1 {0.9730448705798238062669724968145601451397,-0.2306158707424401654861867427825927734375} +#define T_27_2 {0.8936326403234122750518508837558329105377,-0.4487991802004621666455363993009086698294} +#define T_27_4 {0.5971585917027861789563303318573161959648,-0.8021231927550437346141620764683466404676} +#define T_27_5 {0.3960797660391568442150855844374746084213,-0.9182161068802739967154025180207099765539} +#define T_27_7 {-0.0581448289104758292422658882969699334353,-0.9983081582712681756319739179161842912436} +#define T_27_8 {-0.2868032327110902612865572791633894667029,-0.9579895123154889002847767187631689012051} +#define T_27_10 {-0.6862416378687336004915664489089976996183,-0.7273736415730487347985899759805761277676} +#define T_27_11 {-0.8354878114129363764206459563865792006254,-0.5495089780708060089864375186152756214142} +#define T_27_13 
{-0.9932383577419430231714159162947908043861,-0.1160929141252302343456648259234498254955} +#define T_27_14 {-0.9932383577419430231714159162947908043861,0.1160929141252302343456648259234498254955} +#define T_27_16 {-0.8354878114129363764206459563865792006254,0.5495089780708060089864375186152756214142} +#define T_27_17 {-0.6862416378687336004915664489089976996183,0.7273736415730487347985899759805761277676} +#define T_27_19 {-0.2868032327110902612865572791633894667029,0.9579895123154889002847767187631689012051} +#define T_27_20 {-0.0581448289104758292422658882969699334353,0.9983081582712681756319739179161842912436} +#define T_27_22 {0.3960797660391568442150855844374746084213,0.9182161068802739967154025180207099765539} +#define T_27_23 {0.5971585917027861789563303318573161959648,0.8021231927550437346141620764683466404676} +#define T_27_25 {0.8936326403234122750518508837558329105377,0.4487991802004621666455363993009086698294} +#define T_27_26 {0.9730448705798238062669724968145601451397,0.2306158707424401654861867427825927734375} +// Pre-computed twiddles for N=28 +#define T_28_1 {0.9749279121818236193419693336181808263063,-0.2225209339563143928764077372761676087976} +#define T_28_3 {0.7818314824680298036341241640911903232336,-0.6234898018587334833640056785952765494585} +#define T_28_5 {0.4338837391175581204016964420588919892907,-0.9009688679024191459987491725769359618425} +#define T_28_9 {-0.4338837391175581204016964420588919892907,-0.9009688679024191459987491725769359618425} +#define T_28_11 {-0.7818314824680298036341241640911903232336,-0.6234898018587334833640056785952765494585} +#define T_28_13 {-0.9749279121818236193419693336181808263063,-0.2225209339563143928764077372761676087976} +#define T_28_15 {-0.9749279121818236193419693336181808263063,0.2225209339563143928764077372761676087976} +#define T_28_17 {-0.7818314824680298036341241640911903232336,0.6234898018587334833640056785952765494585} +#define T_28_19 
{-0.4338837391175581204016964420588919892907,0.9009688679024191459987491725769359618425} +#define T_28_23 {0.4338837391175581204016964420588919892907,0.9009688679024191459987491725769359618425} +#define T_28_25 {0.7818314824680298036341241640911903232336,0.6234898018587334833640056785952765494585} +#define T_28_27 {0.9749279121818236193419693336181808263063,0.2225209339563143928764077372761676087976} +// Pre-computed twiddles for N=29 +#define T_29_1 {0.9766205557100866574415931609109975397587,-0.2149704402110240719636635731148999184370} +#define T_29_2 {0.9075754196709570065237926428380887955427,-0.4198891015602645992998986912425607442856} +#define T_29_3 {0.7960930657056437542706817112048156559467,-0.6051742151937651303938991986797191202641} +#define T_29_4 {0.6473862847818276922140512397163547575474,-0.7621620551276364619042169579188339412212} +#define T_29_5 {0.4684084406997901517399895965354517102242,-0.8835120444460229371941295539727434515953} +#define T_29_6 {0.2675283385292208104822009318013442680240,-0.9635499925192229087045348023821134120226} +#define T_29_7 {0.0541389085854175264311827220353734446689,-0.9985334138511238188939955762180034071207} +#define T_29_8 {-0.1617819965527647341207995168588240630925,-0.9868265225415261410191192226193379610777} +#define T_29_9 {-0.3701381553399143431448692354024387896061,-0.9289767198167914186157645417551975697279} +#define T_29_10 {-0.5611870653623823868016984306450467556715,-0.8276889981568905652054013444285374134779} +#define T_29_11 {-0.7259954919231308423377413419075310230255,-0.6876994588534233177057330976822413504124} +#define T_29_12 {-0.8568571761675892739518189955560956150293,-0.5155538571770217348699816284351982176304} +#define T_29_13 {-0.9476531711828024562294103816384449601173,-0.3193015301359799495983793349296320229769} +#define T_29_14 {-0.9941379571543595972116236225701868534088,-0.1081190184239417678702110947597248014063} +#define T_29_15 
{-0.9941379571543595972116236225701868534088,0.1081190184239417678702110947597248014063} +#define T_29_16 {-0.9476531711828024562294103816384449601173,0.3193015301359799495983793349296320229769} +#define T_29_17 {-0.8568571761675892739518189955560956150293,0.5155538571770217348699816284351982176304} +#define T_29_18 {-0.7259954919231308423377413419075310230255,0.6876994588534233177057330976822413504124} +#define T_29_19 {-0.5611870653623823868016984306450467556715,0.8276889981568905652054013444285374134779} +#define T_29_20 {-0.3701381553399143431448692354024387896061,0.9289767198167914186157645417551975697279} +#define T_29_21 {-0.1617819965527647341207995168588240630925,0.9868265225415261410191192226193379610777} +#define T_29_22 {0.0541389085854175264311827220353734446689,0.9985334138511238188939955762180034071207} +#define T_29_23 {0.2675283385292208104822009318013442680240,0.9635499925192229087045348023821134120226} +#define T_29_24 {0.4684084406997901517399895965354517102242,0.8835120444460229371941295539727434515953} +#define T_29_25 {0.6473862847818276922140512397163547575474,0.7621620551276364619042169579188339412212} +#define T_29_26 {0.7960930657056437542706817112048156559467,0.6051742151937651303938991986797191202641} +#define T_29_27 {0.9075754196709570065237926428380887955427,0.4198891015602645992998986912425607442856} +#define T_29_28 {0.9766205557100866574415931609109975397587,0.2149704402110240719636635731148999184370} +// Pre-computed twiddles for N=30 +#define T_30_1 {0.9781476007338056888329447247087955474854,-0.2079116908177593425754992040310753509402} +#define T_30_7 {0.1045284632676534708473070622858358547091,-0.9945218953682732898613494398887269198895} +#define T_30_11 {-0.6691306063588582375700752891134470701218,-0.7431448254773942441175904605188407003880} +#define T_30_13 {-0.9135454576426008665990252666233573108912,-0.4067366430758002082690438783174613490701} +#define T_30_17 
{-0.9135454576426008665990252666233573108912,0.4067366430758002082690438783174613490701} +#define T_30_19 {-0.6691306063588582375700752891134470701218,0.7431448254773942441175904605188407003880} +#define T_30_23 {0.1045284632676534708473070622858358547091,0.9945218953682732898613494398887269198895} +#define T_30_29 {0.9781476007338056888329447247087955474854,0.2079116908177593425754992040310753509402} +// Pre-computed twiddles for N=31 +#define T_31_1 {0.9795299412524944848712493694620206952095,-0.2012985200886600878344978582390467636287} +#define T_31_2 {0.9189578116202306024007384621654637157917,-0.3943558551133185541281989117123885080218} +#define T_31_3 {0.8207634412072762897238931145693641155958,-0.5712682150947923087613844472798518836498} +#define T_31_4 {0.6889669190756865235769623723172117024660,-0.7247927872291199946630513295531272888184} +#define T_31_5 {0.5289640103269624971105145050387363880873,-0.8486442574947509198679540531884413212538} +#define T_31_6 {0.3473052528448202780353426533110905438662,-0.9377521321470804194220249883073847740889} +#define T_31_7 {0.1514277775045766716299766585507313720882,-0.9884683243281113806943949384731240570545} +#define T_31_8 {-0.0506491688387127117487729321965161943808,-0.9987165071710527586290595536411274224520} +#define T_31_9 {-0.2506525322587205284463607313227839767933,-0.9680771188662042892048020803485997021198} +#define T_31_10 {-0.4403941515576342835558421029418241232634,-0.8978045395707416886921237164642661809921} +#define T_31_11 {-0.6121059825476627969109699733962770551443,-0.7907757369376985367992460851382929831743} +#define T_31_12 {-0.7587581226927908595314420381328091025352,-0.6513724827222222568678944298881106078625} +#define T_31_13 {-0.8743466161445820894471125939162448048592,-0.4853019625310810436502606535213999450207} +#define T_31_14 {-0.9541392564000488185627091297646984457970,-0.2993631229733579313112556974374456331134} +#define T_31_15 
{-0.9948693233918951550620590751350391656160,-0.1011683219874321831843744234902260359377} +#define T_31_16 {-0.9948693233918951550620590751350391656160,0.1011683219874321831843744234902260359377} +#define T_31_17 {-0.9541392564000488185627091297646984457970,0.2993631229733579313112556974374456331134} +#define T_31_18 {-0.8743466161445820894471125939162448048592,0.4853019625310810436502606535213999450207} +#define T_31_19 {-0.7587581226927908595314420381328091025352,0.6513724827222222568678944298881106078625} +#define T_31_20 {-0.6121059825476627969109699733962770551443,0.7907757369376985367992460851382929831743} +#define T_31_21 {-0.4403941515576342835558421029418241232634,0.8978045395707416886921237164642661809921} +#define T_31_22 {-0.2506525322587205284463607313227839767933,0.9680771188662042892048020803485997021198} +#define T_31_23 {-0.0506491688387127117487729321965161943808,0.9987165071710527586290595536411274224520} +#define T_31_24 {0.1514277775045766716299766585507313720882,0.9884683243281113806943949384731240570545} +#define T_31_25 {0.3473052528448202780353426533110905438662,0.9377521321470804194220249883073847740889} +#define T_31_26 {0.5289640103269624971105145050387363880873,0.8486442574947509198679540531884413212538} +#define T_31_27 {0.6889669190756865235769623723172117024660,0.7247927872291199946630513295531272888184} +#define T_31_28 {0.8207634412072762897238931145693641155958,0.5712682150947923087613844472798518836498} +#define T_31_29 {0.9189578116202306024007384621654637157917,0.3943558551133185541281989117123885080218} +#define T_31_30 {0.9795299412524944848712493694620206952095,0.2012985200886600878344978582390467636287} +// Pre-computed twiddles for N=32 +#define T_32_1 {0.9807852804032304305792422383092343807220,-0.1950903220161282758393639369387528859079} +#define T_32_3 {0.8314696123025452356714026791451033204794,-0.5555702330196021776487214083317667245865} +#define T_32_5 
{0.5555702330196021776487214083317667245865,-0.8314696123025452356714026791451033204794} +#define T_32_7 {0.1950903220161282758393639369387528859079,-0.9807852804032304305792422383092343807220} +#define T_32_9 {-0.1950903220161282758393639369387528859079,-0.9807852804032304305792422383092343807220} +#define T_32_11 {-0.5555702330196021776487214083317667245865,-0.8314696123025452356714026791451033204794} +#define T_32_13 {-0.8314696123025452356714026791451033204794,-0.5555702330196021776487214083317667245865} +#define T_32_15 {-0.9807852804032304305792422383092343807220,-0.1950903220161282758393639369387528859079} +#define T_32_17 {-0.9807852804032304305792422383092343807220,0.1950903220161282758393639369387528859079} +#define T_32_19 {-0.8314696123025452356714026791451033204794,0.5555702330196021776487214083317667245865} +#define T_32_21 {-0.5555702330196021776487214083317667245865,0.8314696123025452356714026791451033204794} +#define T_32_23 {-0.1950903220161282758393639369387528859079,0.9807852804032304305792422383092343807220} +#define T_32_25 {0.1950903220161282758393639369387528859079,0.9807852804032304305792422383092343807220} +#define T_32_27 {0.5555702330196021776487214083317667245865,0.8314696123025452356714026791451033204794} +#define T_32_29 {0.8314696123025452356714026791451033204794,0.5555702330196021776487214083317667245865} +#define T_32_31 {0.9807852804032304305792422383092343807220,0.1950903220161282758393639369387528859079} +// Pre-computed twiddles for N=33 +#define T_33_1 {0.9819286972627067067165285152441356331110,-0.1892512443604102145844336746449698694050} +#define T_33_2 {0.9283679330160725662324239237932488322258,-0.3716624556603275175703515742497984319925} +#define T_33_4 {0.7237340381050701987319939689768943935633,-0.6900790114821120369015261530876159667969} +#define T_33_5 {0.5800569095711981537633050720614846795797,-0.8145759520503357276055567126604728400707} +#define T_33_7 
{0.2357589355094272254298459756682859733701,-0.9718115683235416524254901560198049992323} +#define T_33_8 {0.0475819158237422989987663868305389769375,-0.9988673391830079628661565038783010095358} +#define T_33_10 {-0.3270679633174216616176011029892833903432,-0.9450008187146684557333742304763291031122} +#define T_33_13 {-0.7860530947427875059219104514340870082378,-0.6181589862206051666504436070681549608707} +#define T_33_14 {-0.8888354486549234767522875699796713888645,-0.4582265217274104074363094696309417486191} +#define T_33_16 {-0.9954719225730845888477915650582872331142,-0.0950560433041826580158328852121485397220} +#define T_33_17 {-0.9954719225730845888477915650582872331142,0.0950560433041826580158328852121485397220} +#define T_33_19 {-0.8888354486549234767522875699796713888645,0.4582265217274104074363094696309417486191} +#define T_33_20 {-0.7860530947427875059219104514340870082378,0.6181589862206051666504436070681549608707} +#define T_33_23 {-0.3270679633174216616176011029892833903432,0.9450008187146684557333742304763291031122} +#define T_33_25 {0.0475819158237422989987663868305389769375,0.9988673391830079628661565038783010095358} +#define T_33_26 {0.2357589355094272254298459756682859733701,0.9718115683235416524254901560198049992323} +#define T_33_28 {0.5800569095711981537633050720614846795797,0.8145759520503357276055567126604728400707} +#define T_33_29 {0.7237340381050701987319939689768943935633,0.6900790114821120369015261530876159667969} +#define T_33_31 {0.9283679330160725662324239237932488322258,0.3716624556603275175703515742497984319925} +#define T_33_32 {0.9819286972627067067165285152441356331110,0.1892512443604102145844336746449698694050} +// Pre-computed twiddles for N=34 +#define T_34_1 {0.9829730996839017853616837783192750066519,-0.1837495178165703402228814411500934511423} +#define T_34_3 {0.8502171357296141041004489125043619424105,-0.5264321628773558359171147458255290985107} +#define T_34_5 
{0.6026346363792564053696310111263301223516,-0.7980172272802394939361647629993967711926} +#define T_34_7 {0.2736629900720828900695380525576183572412,-0.9618256431728190447572046650748234242201} +#define T_34_9 {-0.0922683594633020021102964847159455530345,-0.9957341762950344676852409975253976881504} +#define T_34_11 {-0.4457383557765382531279385602829279378057,-0.8951632913550623404930206561402883380651} +#define T_34_13 {-0.7390089172206590895086719683604314923286,-0.6736956436465572073402086061832960695028} +#define T_34_15 {-0.9324722294043558123988191255193669348955,-0.3612416661871529210969811174436472356319} +#define T_34_19 {-0.9324722294043558123988191255193669348955,0.3612416661871529210969811174436472356319} +#define T_34_21 {-0.7390089172206590895086719683604314923286,0.6736956436465572073402086061832960695028} +#define T_34_23 {-0.4457383557765382531279385602829279378057,0.8951632913550623404930206561402883380651} +#define T_34_25 {-0.0922683594633020021102964847159455530345,0.9957341762950344676852409975253976881504} +#define T_34_27 {0.2736629900720828900695380525576183572412,0.9618256431728190447572046650748234242201} +#define T_34_29 {0.6026346363792564053696310111263301223516,0.7980172272802394939361647629993967711926} +#define T_34_31 {0.8502171357296141041004489125043619424105,0.5264321628773558359171147458255290985107} +#define T_34_33 {0.9829730996839017853616837783192750066519,0.1837495178165703402228814411500934511423} +// Pre-computed twiddles for N=35 +#define T_35_1 {0.9839295885986296674730056111002340912819,-0.1785568947986366505897848355743917636573} +#define T_35_2 {0.9362348706397372044563098825165070593357,-0.3513748240813426848561107362911570817232} +#define T_35_3 {0.8584487936018661002890439704060554504395,-0.5128992774059061288127736588648986071348} +#define T_35_4 {0.7530714660036109675544935271318536251783,-0.6579387259397125875537426509254146367311} +#define T_35_6 
{0.4738686624729986918147517371835419908166,-0.8805955318567380007976908018463291227818} +#define T_35_8 {0.1342332658176554893714182981057092547417,-0.9909497617679347003516454606142360717058} +#define T_35_9 {-0.0448648303505149240177907188353856327012,-0.9989930665413147004016991559183225035667} +#define T_35_11 {-0.3930250316539236066226692400960018858314,-0.9195277725514506839132877757947426289320} +#define T_35_12 {-0.5508969814521025298859058239031583070755,-0.8345732537213026436262452989467419683933} +#define T_35_13 {-0.6910626489868646471492752425547223538160,-0.7227948638273915538121627832879312336445} +#define T_35_16 {-0.9639628606958532408555129222804680466652,-0.2660368455666751175137108020862797275186} +#define T_35_17 {-0.9959742939952390683444605201657395809889,-0.0896393089034334955567118186081643216312} +#define T_35_18 {-0.9959742939952390683444605201657395809889,0.0896393089034334955567118186081643216312} +#define T_35_19 {-0.9639628606958532408555129222804680466652,0.2660368455666751175137108020862797275186} +#define T_35_22 {-0.6910626489868646471492752425547223538160,0.7227948638273915538121627832879312336445} +#define T_35_23 {-0.5508969814521025298859058239031583070755,0.8345732537213026436262452989467419683933} +#define T_35_24 {-0.3930250316539236066226692400960018858314,0.9195277725514506839132877757947426289320} +#define T_35_26 {-0.0448648303505149240177907188353856327012,0.9989930665413147004016991559183225035667} +#define T_35_27 {0.1342332658176554893714182981057092547417,0.9909497617679347003516454606142360717058} +#define T_35_29 {0.4738686624729986918147517371835419908166,0.8805955318567380007976908018463291227818} +#define T_35_31 {0.7530714660036109675544935271318536251783,0.6579387259397125875537426509254146367311} +#define T_35_32 {0.8584487936018661002890439704060554504395,0.5128992774059061288127736588648986071348} +#define T_35_33 {0.9362348706397372044563098825165070593357,0.3513748240813426848561107362911570817232} 
+#define T_35_34 {0.9839295885986296674730056111002340912819,0.1785568947986366505897848355743917636573} +// Pre-computed twiddles for N=36 +#define T_36_1 {0.9848077530122080203156542665965389460325,-0.1736481776669303589422099776129471138120} +#define T_36_5 {0.6427876096865393629187224178167525678873,-0.7660444431189780134516809084743726998568} +#define T_36_7 {0.3420201433256687129080830800376133993268,-0.9396926207859084279050421173451468348503} +#define T_36_11 {-0.3420201433256687129080830800376133993268,-0.9396926207859084279050421173451468348503} +#define T_36_13 {-0.6427876096865393629187224178167525678873,-0.7660444431189780134516809084743726998568} +#define T_36_17 {-0.9848077530122080203156542665965389460325,-0.1736481776669303589422099776129471138120} +#define T_36_19 {-0.9848077530122080203156542665965389460325,0.1736481776669303589422099776129471138120} +#define T_36_23 {-0.6427876096865393629187224178167525678873,0.7660444431189780134516809084743726998568} +#define T_36_25 {-0.3420201433256687129080830800376133993268,0.9396926207859084279050421173451468348503} +#define T_36_29 {0.3420201433256687129080830800376133993268,0.9396926207859084279050421173451468348503} +#define T_36_31 {0.6427876096865393629187224178167525678873,0.7660444431189780134516809084743726998568} +#define T_36_35 {0.9848077530122080203156542665965389460325,0.1736481776669303589422099776129471138120} +// Pre-computed twiddles for N=37 +#define T_37_1 {0.9856159103477084570954502851236611604691,-0.1690008203218490712149701948874280788004} +#define T_37_2 {0.9428774454610842070678700110875070095062,-0.3331397947420575711419132858281955122948} +#define T_37_3 {0.8730141131611881544216657857759855687618,-0.4876949438136345404970484196383040398359} +#define T_37_4 {0.7780357543184395208157866363762877881527,-0.6282199972956422628911354877345729619265} +#define T_37_5 {0.6606747233900814864071548981883097440004,-0.7506723052527243256193401066411752253771} +#define T_37_6 
{0.5243072835572316625629696318355854600668,-0.8515291377333112921022006958082783967257} +#define T_37_7 {0.3728564777803086149887690226023551076651,-0.9278890272965093499735189652710687369108} +#define T_37_8 {0.2106792699957263093057946434782934375107,-0.9775552389476861447903388580016326159239} +#define T_37_9 {0.0424412031961483027231452069827355444431,-0.9990989662046815267260058135434519499540} +#define T_37_10 {-0.1270178197468787573143345071002840995789,-0.9919004352588768336218549848126713186502} +#define T_37_11 {-0.2928227712765503754077656139997998252511,-0.9561667347392509563874796185700688511133} +#define T_37_12 {-0.4502037448176732792326504295488120988011,-0.8929258581495684854445471501094289124012} +#define T_37_13 {-0.5946331763042865814483661779377143830061,-0.8039971303669405378045098586881067603827} +#define T_37_14 {-0.7219560939545244249160305116674862802029,-0.6919388689775461953601620734843891113997} +#define T_37_15 {-0.8285096492438420723303238446533214300871,-0.5599747861375954371965235623065382242203} +#define T_37_16 {-0.9112284903881356923349699172831606119871,-0.4119012482439926570165766861464362591505} +#define T_37_17 {-0.9677329469334988720774504145083483308554,-0.2519780613851251849233392476890003308654} +#define T_37_18 {-0.9963974885425265215488366266072262078524,-0.0848059244755091923284595623044879175723} +#define T_37_19 {-0.9963974885425265215488366266072262078524,0.0848059244755091923284595623044879175723} +#define T_37_20 {-0.9677329469334988720774504145083483308554,0.2519780613851251849233392476890003308654} +#define T_37_21 {-0.9112284903881356923349699172831606119871,0.4119012482439926570165766861464362591505} +#define T_37_22 {-0.8285096492438420723303238446533214300871,0.5599747861375954371965235623065382242203} +#define T_37_23 {-0.7219560939545244249160305116674862802029,0.6919388689775461953601620734843891113997} +#define T_37_24 
{-0.5946331763042865814483661779377143830061,0.8039971303669405378045098586881067603827} +#define T_37_25 {-0.4502037448176732792326504295488120988011,0.8929258581495684854445471501094289124012} +#define T_37_26 {-0.2928227712765503754077656139997998252511,0.9561667347392509563874796185700688511133} +#define T_37_27 {-0.1270178197468787573143345071002840995789,0.9919004352588768336218549848126713186502} +#define T_37_28 {0.0424412031961483027231452069827355444431,0.9990989662046815267260058135434519499540} +#define T_37_29 {0.2106792699957263093057946434782934375107,0.9775552389476861447903388580016326159239} +#define T_37_30 {0.3728564777803086149887690226023551076651,0.9278890272965093499735189652710687369108} +#define T_37_31 {0.5243072835572316625629696318355854600668,0.8515291377333112921022006958082783967257} +#define T_37_32 {0.6606747233900814864071548981883097440004,0.7506723052527243256193401066411752253771} +#define T_37_33 {0.7780357543184395208157866363762877881527,0.6282199972956422628911354877345729619265} +#define T_37_34 {0.8730141131611881544216657857759855687618,0.4876949438136345404970484196383040398359} +#define T_37_35 {0.9428774454610842070678700110875070095062,0.3331397947420575711419132858281955122948} +#define T_37_36 {0.9856159103477084570954502851236611604691,0.1690008203218490712149701948874280788004} +// Pre-computed twiddles for N=38 +#define T_38_1 {0.9863613034027223225308489418239332735538,-0.1645945902807338934170644506593816913664} +#define T_38_3 {0.8794737512064890738727740426838863641024,-0.4759473930370735628159195584885310381651} +#define T_38_5 {0.6772815716257410967315877314831595867872,-0.7357239106731315869680543073627632111311} +#define T_38_7 {0.4016954246529694794176634786708746105433,-0.9157733266550573958397762908134609460831} +#define T_38_9 {0.0825793454723323244737187565078784245998,-0.9965844930066698470483288474497385323048} +#define T_38_11 
{-0.2454854871407991523390990096231689676642,-0.9694002659393303744650438602548092603683} +#define T_38_13 {-0.5469481581224269195473652871442027390003,-0.8371664782625285461392650177003815770149} +#define T_38_15 {-0.7891405093963935746259608094987925142050,-0.6142127126896678168677112807927187532187} +#define T_38_17 {-0.9458172417006346366363800370891112834215,-0.3246994692046835107035462897329125553370} +#define T_38_21 {-0.9458172417006346366363800370891112834215,0.3246994692046835107035462897329125553370} +#define T_38_23 {-0.7891405093963935746259608094987925142050,0.6142127126896678168677112807927187532187} +#define T_38_25 {-0.5469481581224269195473652871442027390003,0.8371664782625285461392650177003815770149} +#define T_38_27 {-0.2454854871407991523390990096231689676642,0.9694002659393303744650438602548092603683} +#define T_38_29 {0.0825793454723323244737187565078784245998,0.9965844930066698470483288474497385323048} +#define T_38_31 {0.4016954246529694794176634786708746105433,0.9157733266550573958397762908134609460831} +#define T_38_33 {0.6772815716257410967315877314831595867872,0.7357239106731315869680543073627632111311} +#define T_38_35 {0.8794737512064890738727740426838863641024,0.4759473930370735628159195584885310381651} +#define T_38_37 {0.9863613034027223225308489418239332735538,0.1645945902807338934170644506593816913664} +// Pre-computed twiddles for N=39 +#define T_39_1 {0.9870502626379128452427380580047611147165,-0.1604112808577602411475027111009694635868} +#define T_39_2 {0.9485364419471454811016997155093122273684,-0.3166679938014724804418165149400010704994} +#define T_39_4 {0.7994427634035011775992529692302923649549,-0.6007422642379789401445577823324128985405} +#define T_39_5 {0.6927243535095993776096179317391943186522,-0.7212024473438145566461798807722516357899} +#define T_39_7 {0.4286925614030541731303003416542196646333,-0.9034504346103823246849628958443645387888} +#define T_39_8 
{0.2782174639164526341872374359809327870607,-0.9605181116313723510558020279859192669392} +#define T_39_10 {-0.0402659401094151436772961005772231146693,-0.9991889981715695867592330614570528268814} +#define T_39_11 {-0.2000256937760444275742344188984134234488,-0.9797906520422676734938249865081161260605} +#define T_39_14 {-0.6324453755953772793318989897670689970255,-0.7746049618276545922768150376214180141687} +#define T_39_16 {-0.8451900855437947113202312721114140003920,-0.5344658261278011357475747900025453418493} +#define T_39_17 {-0.9199794436588242252383906816248781979084,-0.3919666098600750880898146988329244777560} +#define T_39_19 {-0.9967573081342100405066730672842822968960,-0.0804665687167258752987564207614923361689} +#define T_39_20 {-0.9967573081342100405066730672842822968960,0.0804665687167258752987564207614923361689} +#define T_39_22 {-0.9199794436588242252383906816248781979084,0.3919666098600750880898146988329244777560} +#define T_39_23 {-0.8451900855437947113202312721114140003920,0.5344658261278011357475747900025453418493} +#define T_39_25 {-0.6324453755953772793318989897670689970255,0.7746049618276545922768150376214180141687} +#define T_39_28 {-0.2000256937760444275742344188984134234488,0.9797906520422676734938249865081161260605} +#define T_39_29 {-0.0402659401094151436772961005772231146693,0.9991889981715695867592330614570528268814} +#define T_39_31 {0.2782174639164526341872374359809327870607,0.9605181116313723510558020279859192669392} +#define T_39_32 {0.4286925614030541731303003416542196646333,0.9034504346103823246849628958443645387888} +#define T_39_34 {0.6927243535095993776096179317391943186522,0.7212024473438145566461798807722516357899} +#define T_39_35 {0.7994427634035011775992529692302923649549,0.6007422642379789401445577823324128985405} +#define T_39_37 {0.9485364419471454811016997155093122273684,0.3166679938014724804418165149400010704994} +#define T_39_38 {0.9870502626379128452427380580047611147165,0.1604112808577602411475027111009694635868} 
+// Pre-computed twiddles for N=40 +#define T_40_1 {0.9876883405951377703502203075913712382317,-0.1564344650402308689596253543641068972647} +#define T_40_3 {0.8910065241883678988088490768859628587961,-0.4539904997395468044807387286709854379296} +#define T_40_7 {0.4539904997395468044807387286709854379296,-0.8910065241883678988088490768859628587961} +#define T_40_9 {0.1564344650402308689596253543641068972647,-0.9876883405951377703502203075913712382317} +#define T_40_11 {-0.1564344650402308689596253543641068972647,-0.9876883405951377703502203075913712382317} +#define T_40_13 {-0.4539904997395468044807387286709854379296,-0.8910065241883678988088490768859628587961} +#define T_40_17 {-0.8910065241883678988088490768859628587961,-0.4539904997395468044807387286709854379296} +#define T_40_19 {-0.9876883405951377703502203075913712382317,-0.1564344650402308689596253543641068972647} +#define T_40_21 {-0.9876883405951377703502203075913712382317,0.1564344650402308689596253543641068972647} +#define T_40_23 {-0.8910065241883678988088490768859628587961,0.4539904997395468044807387286709854379296} +#define T_40_27 {-0.4539904997395468044807387286709854379296,0.8910065241883678988088490768859628587961} +#define T_40_29 {-0.1564344650402308689596253543641068972647,0.9876883405951377703502203075913712382317} +#define T_40_31 {0.1564344650402308689596253543641068972647,0.9876883405951377703502203075913712382317} +#define T_40_33 {0.4539904997395468044807387286709854379296,0.8910065241883678988088490768859628587961} +#define T_40_37 {0.8910065241883678988088490768859628587961,0.4539904997395468044807387286709854379296} +#define T_40_39 {0.9876883405951377703502203075913712382317,0.1564344650402308689596253543641068972647} +// Pre-computed twiddles for N=41 +#define T_41_1 {0.9882804237803485269964198778325226157904,-0.1526492842188744980536085904532228596509} +#define T_41_2 {0.9533963920549305415264029761601705104113,-0.3017205985951922908583355820155702531338} +#define T_41_3 
{0.8961655569610555982507094086031429469585,-0.4437198378669596765533356119703967124224} +#define T_41_4 {0.8179293607667176280173748637025710195303,-0.5753186602186205478304259486321825534105} +#define T_41_5 {0.7205215936007870469737213170446921139956,-0.6934325007922417238859225108171813189983} +#define T_41_6 {0.6062254109666380674781294146669097244740,-0.7952928712734264049544208319275639951229} +#define T_41_7 {0.4777198185122629170429320311086485162377,-0.8785122509109424271400712314061820507050} +#define T_41_8 {0.3380168784085027522401389887818368151784,-0.9411400479795615225953042681794613599777} +#define T_41_9 {0.1903911091646683650946414445570553652942,-0.9817083199968549278580098871316295117140} +#define T_41_10 {0.0383027336900353468363000786212069215253,-0.9992661810508099984318164388241712003946} +#define T_41_11 {-0.1146834253984004292625797916116425767541,-0.9934020897596750332425585838791448622942} +#define T_41_12 {-0.2649815021966616668258609479380538687110,-0.9642534954531409852052092901431024074554} +#define T_41_13 {-0.4090686371713398994387489437940530478954,-0.9125036164765499835382911442138720303774} +#define T_41_14 {-0.5435675500012211447398158270516432821751,-0.8393654261319499765292562187823932617903} +#define T_41_15 {-0.6653257001655653590788119799981359392405,-0.7465532216119626918171547913516405969858} +#define T_41_16 {-0.7714891798219428720173596047970931977034,-0.6362424423265597983601082887616939842701} +#define T_41_17 {-0.8595696069872011957357926803524605929852,-0.5110186794471103244319465375156141817570} +#define T_41_18 {-0.9275024511020946160755329401581548154354,-0.3738170718407687886930546028452226892114} +#define T_41_19 {-0.9736954238777790759939989584381692111492,-0.2278535089031375748813701420658617280424} +#define T_41_20 {-0.9970658011837404410826479761453811079264,-0.0765492528364956487108017313403252046555} +#define T_41_21 
{-0.9970658011837404410826479761453811079264,0.0765492528364956487108017313403252046555} +#define T_41_22 {-0.9736954238777790759939989584381692111492,0.2278535089031375748813701420658617280424} +#define T_41_23 {-0.9275024511020946160755329401581548154354,0.3738170718407687886930546028452226892114} +#define T_41_24 {-0.8595696069872011957357926803524605929852,0.5110186794471103244319465375156141817570} +#define T_41_25 {-0.7714891798219428720173596047970931977034,0.6362424423265597983601082887616939842701} +#define T_41_26 {-0.6653257001655653590788119799981359392405,0.7465532216119626918171547913516405969858} +#define T_41_27 {-0.5435675500012211447398158270516432821751,0.8393654261319499765292562187823932617903} +#define T_41_28 {-0.4090686371713398994387489437940530478954,0.9125036164765499835382911442138720303774} +#define T_41_29 {-0.2649815021966616668258609479380538687110,0.9642534954531409852052092901431024074554} +#define T_41_30 {-0.1146834253984004292625797916116425767541,0.9934020897596750332425585838791448622942} +#define T_41_31 {0.0383027336900353468363000786212069215253,0.9992661810508099984318164388241712003946} +#define T_41_32 {0.1903911091646683650946414445570553652942,0.9817083199968549278580098871316295117140} +#define T_41_33 {0.3380168784085027522401389887818368151784,0.9411400479795615225953042681794613599777} +#define T_41_34 {0.4777198185122629170429320311086485162377,0.8785122509109424271400712314061820507050} +#define T_41_35 {0.6062254109666380674781294146669097244740,0.7952928712734264049544208319275639951229} +#define T_41_36 {0.7205215936007870469737213170446921139956,0.6934325007922417238859225108171813189983} +#define T_41_37 {0.8179293607667176280173748637025710195303,0.5753186602186205478304259486321825534105} +#define T_41_38 {0.8961655569610555982507094086031429469585,0.4437198378669596765533356119703967124224} +#define T_41_39 {0.9533963920549305415264029761601705104113,0.3017205985951922908583355820155702531338} +#define 
T_41_40 {0.9882804237803485269964198778325226157904,0.1526492842188744980536085904532228596509} +// Pre-computed twiddles for N=42 +#define T_42_1 {0.9888308262251285229993413850024808198214,-0.1490422661761744427266762613726314157248} +#define T_42_5 {0.7330518718298263403099213064706418663263,-0.6801727377709193556043487660645041614771} +#define T_42_11 {-0.0747300935864242538153590089677891228348,-0.9972037971811801293497978804225567728281} +#define T_42_13 {-0.3653410243663950396353357064072042703629,-0.9308737486442042463608004254638217389584} +#define T_42_17 {-0.8262387743159949060611779714236035943031,-0.5633200580636220644592526696214918047190} +#define T_42_19 {-0.9555728057861406776751778124889824539423,-0.2947551744109042082264693362958496436477} +#define T_42_23 {-0.9555728057861406776751778124889824539423,0.2947551744109042082264693362958496436477} +#define T_42_25 {-0.8262387743159949060611779714236035943031,0.5633200580636220644592526696214918047190} +#define T_42_29 {-0.3653410243663950396353357064072042703629,0.9308737486442042463608004254638217389584} +#define T_42_31 {-0.0747300935864242538153590089677891228348,0.9972037971811801293497978804225567728281} +#define T_42_37 {0.7330518718298263403099213064706418663263,0.6801727377709193556043487660645041614771} +#define T_42_41 {0.9888308262251285229993413850024808198214,0.1490422661761744427266762613726314157248} +// Pre-computed twiddles for N=43 +#define T_43_1 {0.9893433680751102521355733188102021813393,-0.1456011677350048660883885531802661716938} +#define T_43_2 {0.9576005999084059583381645097688306123018,-0.2880990993652375875555549100681673735380} +#define T_43_3 {0.9054482374931466237555355291988234966993,-0.4244566988758151082627989580942085012794} +#define T_43_4 {0.8339978178898779237471217129495926201344,-0.5517677407704458802939484485250432044268} +#define T_43_5 {0.7447721827437818742367880986421369016171,-0.6673188112222394607897513196803629398346} +#define T_43_6 
{0.6396730215588912749069550045533105731010,-0.7686471397785320514017826099006924778223} +#define T_43_7 {0.5209403404879302534880025632446631789207,-0.8535930890373464308140682987868785858154} +#define T_43_8 {0.3911047204901559881662365114607382565737,-0.9203461835691594261632531015493441373110} +#define T_43_9 {0.2529333823916807277853990854055155068636,-0.9674836970574252070775855827378109097481} +#define T_43_10 {0.1093712083778743821005363656695408280939,-0.9940009752399459053151531406911090016365} +#define T_43_11 {-0.0365220230576588367199519780115224421024,-0.9993328483702393905119265582470688968897} +#define T_43_12 {-0.1816368509794364372034181087656179443002,-0.9833656768294660732721013118862174451351} +#define T_43_13 {-0.3228804047714461944451613817363977432251,-0.9464397731576092986927051242673769593239} +#define T_43_14 {-0.4572423233046385426803226437186822295189,-0.8893421488825189458182762791693676263094} +#define T_43_15 {-0.5818589155579528826578439293371047824621,-0.8132897407355653696114927697635721415281} +#define T_43_16 {-0.6940741952206338494235637881502043455839,-0.7199034737579957932140928278386127203703} +#define T_43_17 {-0.7914964884292541302812651338172145187855,-0.6111737140978492632825691543985158205032} +#define T_43_18 {-0.8720494081438076028334194234048482030630,-0.4894178478110854935145823674247367307544} +#define T_43_19 {-0.9340161087325480337995031732134521007538,-0.3572308898011327671362380442587891593575} +#define T_43_20 {-0.9760758775559271738231359449855517596006,-0.2174301755815569725527325317671056836843} +#define T_43_21 {-0.9973322836635516441816662336350418627262,-0.0729953146609075287365442363807233050466} +#define T_43_22 {-0.9973322836635516441816662336350418627262,0.0729953146609075287365442363807233050466} +#define T_43_23 {-0.9760758775559271738231359449855517596006,0.2174301755815569725527325317671056836843} +#define T_43_24 
{-0.9340161087325480337995031732134521007538,0.3572308898011327671362380442587891593575} +#define T_43_25 {-0.8720494081438076028334194234048482030630,0.4894178478110854935145823674247367307544} +#define T_43_26 {-0.7914964884292541302812651338172145187855,0.6111737140978492632825691543985158205032} +#define T_43_27 {-0.6940741952206338494235637881502043455839,0.7199034737579957932140928278386127203703} +#define T_43_28 {-0.5818589155579528826578439293371047824621,0.8132897407355653696114927697635721415281} +#define T_43_29 {-0.4572423233046385426803226437186822295189,0.8893421488825189458182762791693676263094} +#define T_43_30 {-0.3228804047714461944451613817363977432251,0.9464397731576092986927051242673769593239} +#define T_43_31 {-0.1816368509794364372034181087656179443002,0.9833656768294660732721013118862174451351} +#define T_43_32 {-0.0365220230576588367199519780115224421024,0.9993328483702393905119265582470688968897} +#define T_43_33 {0.1093712083778743821005363656695408280939,0.9940009752399459053151531406911090016365} +#define T_43_34 {0.2529333823916807277853990854055155068636,0.9674836970574252070775855827378109097481} +#define T_43_35 {0.3911047204901559881662365114607382565737,0.9203461835691594261632531015493441373110} +#define T_43_36 {0.5209403404879302534880025632446631789207,0.8535930890373464308140682987868785858154} +#define T_43_37 {0.6396730215588912749069550045533105731010,0.7686471397785320514017826099006924778223} +#define T_43_38 {0.7447721827437818742367880986421369016171,0.6673188112222394607897513196803629398346} +#define T_43_39 {0.8339978178898779237471217129495926201344,0.5517677407704458802939484485250432044268} +#define T_43_40 {0.9054482374931466237555355291988234966993,0.4244566988758151082627989580942085012794} +#define T_43_41 {0.9576005999084059583381645097688306123018,0.2880990993652375875555549100681673735380} +#define T_43_42 {0.9893433680751102521355733188102021813393,0.1456011677350048660883885531802661716938} +// 
Pre-computed twiddles for N=44 +#define T_44_1 {0.9898214418809326842207951813179533928633,-0.1423148382732851435772403192459023557603} +#define T_44_3 {0.9096319953545183301102383666147943586111,-0.4154150130018864350844864929968025535345} +#define T_44_5 {0.7557495743542582689045161714602727442980,-0.6548607339452851006100786435126792639494} +#define T_44_7 {0.5406408174555975554298470342473592609167,-0.8412535328311812055090967987780459225178} +#define T_44_9 {0.2817325568414296710351152341900160536170,-0.9594929736144973686506887133873533457518} +#define T_44_13 {-0.2817325568414296710351152341900160536170,-0.9594929736144973686506887133873533457518} +#define T_44_15 {-0.5406408174555975554298470342473592609167,-0.8412535328311812055090967987780459225178} +#define T_44_17 {-0.7557495743542582689045161714602727442980,-0.6548607339452851006100786435126792639494} +#define T_44_19 {-0.9096319953545183301102383666147943586111,-0.4154150130018864350844864929968025535345} +#define T_44_21 {-0.9898214418809326842207951813179533928633,-0.1423148382732851435772403192459023557603} +#define T_44_23 {-0.9898214418809326842207951813179533928633,0.1423148382732851435772403192459023557603} +#define T_44_25 {-0.9096319953545183301102383666147943586111,0.4154150130018864350844864929968025535345} +#define T_44_27 {-0.7557495743542582689045161714602727442980,0.6548607339452851006100786435126792639494} +#define T_44_29 {-0.5406408174555975554298470342473592609167,0.8412535328311812055090967987780459225178} +#define T_44_31 {-0.2817325568414296710351152341900160536170,0.9594929736144973686506887133873533457518} +#define T_44_35 {0.2817325568414296710351152341900160536170,0.9594929736144973686506887133873533457518} +#define T_44_37 {0.5406408174555975554298470342473592609167,0.8412535328311812055090967987780459225178} +#define T_44_39 {0.7557495743542582689045161714602727442980,0.6548607339452851006100786435126792639494} +#define T_44_41 
{0.9096319953545183301102383666147943586111,0.4154150130018864350844864929968025535345} +#define T_44_43 {0.9898214418809326842207951813179533928633,0.1423148382732851435772403192459023557603} +// Pre-computed twiddles for N=45 +#define T_45_1 {0.9902680687415703619791429446195252239704,-0.1391731009600654378477457839835551567376} +#define T_45_2 {0.9612616959383188941501430235803127288818,-0.2756373558169991633270967668067896738648} +#define T_45_4 {0.8480480961564259567708745635172817856073,-0.5299192642332049008047079041716642677784} +#define T_45_7 {0.5591929034707467938147829045192338526249,-0.8290375725550417351783494268602225929499} +#define T_45_8 {0.4383711467890774038380641286494210362434,-0.8987940462991670376169395240140147507191} +#define T_45_11 {0.0348994967025009691918846499447681708261,-0.9993908270190957621181837566837202757597} +#define T_45_13 {-0.2419218955996677300479547056966111995280,-0.9702957262759964729426087615138385444880} +#define T_45_14 {-0.3746065934159120147661781174974748864770,-0.9271838545667874242894868075381964445114} +#define T_45_16 {-0.6156614753256582917018135958642233163118,-0.7880107536067219031750141766679007560015} +#define T_45_17 {-0.7193398003386511918577639335126150399446,-0.6946583704589972541043607634492218494415} +#define T_45_19 {-0.8829475928589269884128043486271053552628,-0.4694715627858907502911733899964019656181} +#define T_45_22 {-0.9975640502598241976528470331686548888683,-0.0697564737441253024385900971537921577692} +#define T_45_23 {-0.9975640502598241976528470331686548888683,0.0697564737441253024385900971537921577692} +#define T_45_26 {-0.8829475928589269884128043486271053552628,0.4694715627858907502911733899964019656181} +#define T_45_28 {-0.7193398003386511918577639335126150399446,0.6946583704589972541043607634492218494415} +#define T_45_29 {-0.6156614753256582917018135958642233163118,0.7880107536067219031750141766679007560015} +#define T_45_31 
{-0.3746065934159120147661781174974748864770,0.9271838545667874242894868075381964445114} +#define T_45_32 {-0.2419218955996677300479547056966111995280,0.9702957262759964729426087615138385444880} +#define T_45_34 {0.0348994967025009691918846499447681708261,0.9993908270190957621181837566837202757597} +#define T_45_37 {0.4383711467890774038380641286494210362434,0.8987940462991670376169395240140147507191} +#define T_45_38 {0.5591929034707467938147829045192338526249,0.8290375725550417351783494268602225929499} +#define T_45_41 {0.8480480961564259567708745635172817856073,0.5299192642332049008047079041716642677784} +#define T_45_43 {0.9612616959383188941501430235803127288818,0.2756373558169991633270967668067896738648} +#define T_45_44 {0.9902680687415703619791429446195252239704,0.1391731009600654378477457839835551567376} +// Pre-computed twiddles for N=46 +#define T_46_1 {0.9906859460363307556818313059920910745859,-0.1361666490962465947411885736073600128293} +#define T_46_3 {0.9172113015054530471559246507240459322929,-0.3984010898462414518306218269572127610445} +#define T_46_5 {0.7757112907044197980255262336868327111006,-0.6310879443260527521530889316636603325605} +#define T_46_7 {0.5766803221148671942941632551082875579596,-0.8169698930104419831721429545723367482424} +#define T_46_9 {0.3348796121709861628445992209890391677618,-0.9422609221188205097519130504224449396133} +#define T_46_11 {0.0682424133646709757394788198325841221958,-0.9976687691905391508484512996801640838385} +#define T_46_13 {-0.2034560130526337773382294926705071702600,-0.9790840876823229121939107244543265551329} +#define T_46_15 {-0.4600650377311521488365997356595471501350,-0.8878852184023752203145818384655285626650} +#define T_46_17 {-0.6825531432186541236362131712667178362608,-0.7308359642781241394260405286331661045551} +#define T_46_19 {-0.8544194045464885700624790842994116246700,-0.5195839500354335616094658689689822494984} +#define T_46_21 
{-0.9629172873477992444790629633644130080938,-0.2697967711570242732577185051923152059317} +#define T_46_25 {-0.9629172873477992444790629633644130080938,0.2697967711570242732577185051923152059317} +#define T_46_27 {-0.8544194045464885700624790842994116246700,0.5195839500354335616094658689689822494984} +#define T_46_29 {-0.6825531432186541236362131712667178362608,0.7308359642781241394260405286331661045551} +#define T_46_31 {-0.4600650377311521488365997356595471501350,0.8878852184023752203145818384655285626650} +#define T_46_33 {-0.2034560130526337773382294926705071702600,0.9790840876823229121939107244543265551329} +#define T_46_35 {0.0682424133646709757394788198325841221958,0.9976687691905391508484512996801640838385} +#define T_46_37 {0.3348796121709861628445992209890391677618,0.9422609221188205097519130504224449396133} +#define T_46_39 {0.5766803221148671942941632551082875579596,0.8169698930104419831721429545723367482424} +#define T_46_41 {0.7757112907044197980255262336868327111006,0.6310879443260527521530889316636603325605} +#define T_46_43 {0.9172113015054530471559246507240459322929,0.3984010898462414518306218269572127610445} +#define T_46_45 {0.9906859460363307556818313059920910745859,0.1361666490962465947411885736073600128293} +// Pre-computed twiddles for N=47 +#define T_47_1 {0.9910774881547801395953456449205987155437,-0.1332869553737788537173258873735903762281} +#define T_47_2 {0.9644691750543765706993326602969318628311,-0.2641954018712859841855333797866478562355} +#define T_47_3 {0.9206498866764287747344042145414277911186,-0.3903892751634948088401699806126998737454} +#define T_47_4 {0.8604015792601393819438726495718583464622,-0.5096166425919174125525046292750630527735} +#define T_47_5 {0.7847993852786609991767363680992275476456,-0.6197498889602448546298774090246297419071} +#define T_47_6 {0.6951924276746422881601006338314618915319,-0.7188236838779293869094999536173418164253} +#define T_47_7 
{0.5931797447293551961422508611576631665230,-0.8050700531275629678873428929364308714867} +#define T_47_8 {0.4805817551866837789020792115479707717896,-0.8769499282066715029060333108645863831043} +#define T_47_9 {0.3594077728375128510940328396827680990100,-0.9331806110416025479281643129070289433002} +#define T_47_10 {0.2318201502675282799081912799010751768947,-0.9727586637650371681473870921763591468334} +#define T_47_11 {0.1000956916240983429755573297370574437082,-0.9949778150885040250983593068667687475681} +#define T_47_12 {-0.0334149770076745677993557137597235850990,-0.9994415637302546118192481117148417979479} +#define T_47_13 {-0.1663293545831300290771537220280151814222,-0.9860702539900285668039714437327347695827} +#define T_47_14 {-0.2962755808856339934109769274073187261820,-0.9551024972069124263640560457133688032627} +#define T_47_15 {-0.4209347624283349964535716480895644053817,-0.9070909137343406980491522517695557326078} +#define T_47_16 {-0.5380823531633727174749992627766914665699,-0.8428922714167970120513473375467583537102} +#define T_47_17 {-0.6456278515588024236038222625211346894503,-0.7636521965473320738837514909391757100821} +#define T_47_18 {-0.7416521056479575824127437044808175414801,-0.6707847301392234617978260757809039205313} +#define T_47_19 {-0.8244415603417603044533734646392986178398,-0.5659470943305952017610138682357501238585} +#define T_47_20 {-0.8925188358598812321531568159116432070732,-0.4510101192161018457404964010493131354451} +#define T_47_21 {-0.9446690916079187871901012840680778026581,-0.3280248578395690839393239457422168925405} +#define T_47_22 {-0.9799617050365868609063113581214565783739,-0.1991859851038360884611932988264015875757} +#define T_47_23 {-0.9977668786231531639074887607421260327101,-0.0667926337451215518248304192638897802681} +#define T_47_24 {-0.9977668786231531639074887607421260327101,0.0667926337451215518248304192638897802681} +#define T_47_25 
{-0.9799617050365868609063113581214565783739,0.1991859851038360884611932988264015875757} +#define T_47_26 {-0.9446690916079187871901012840680778026581,0.3280248578395690839393239457422168925405} +#define T_47_27 {-0.8925188358598812321531568159116432070732,0.4510101192161018457404964010493131354451} +#define T_47_28 {-0.8244415603417603044533734646392986178398,0.5659470943305952017610138682357501238585} +#define T_47_29 {-0.7416521056479575824127437044808175414801,0.6707847301392234617978260757809039205313} +#define T_47_30 {-0.6456278515588024236038222625211346894503,0.7636521965473320738837514909391757100821} +#define T_47_31 {-0.5380823531633727174749992627766914665699,0.8428922714167970120513473375467583537102} +#define T_47_32 {-0.4209347624283349964535716480895644053817,0.9070909137343406980491522517695557326078} +#define T_47_33 {-0.2962755808856339934109769274073187261820,0.9551024972069124263640560457133688032627} +#define T_47_34 {-0.1663293545831300290771537220280151814222,0.9860702539900285668039714437327347695827} +#define T_47_35 {-0.0334149770076745677993557137597235850990,0.9994415637302546118192481117148417979479} +#define T_47_36 {0.1000956916240983429755573297370574437082,0.9949778150885040250983593068667687475681} +#define T_47_37 {0.2318201502675282799081912799010751768947,0.9727586637650371681473870921763591468334} +#define T_47_38 {0.3594077728375128510940328396827680990100,0.9331806110416025479281643129070289433002} +#define T_47_39 {0.4805817551866837789020792115479707717896,0.8769499282066715029060333108645863831043} +#define T_47_40 {0.5931797447293551961422508611576631665230,0.8050700531275629678873428929364308714867} +#define T_47_41 {0.6951924276746422881601006338314618915319,0.7188236838779293869094999536173418164253} +#define T_47_42 {0.7847993852786609991767363680992275476456,0.6197498889602448546298774090246297419071} +#define T_47_43 {0.8604015792601393819438726495718583464622,0.5096166425919174125525046292750630527735} +#define 
T_47_44 {0.9206498866764287747344042145414277911186,0.3903892751634948088401699806126998737454} +#define T_47_45 {0.9644691750543765706993326602969318628311,0.2641954018712859841855333797866478562355} +#define T_47_46 {0.9910774881547801395953456449205987155437,0.1332869553737788537173258873735903762281} +// Pre-computed twiddles for N=48 +#define T_48_1 {0.9914448613738103821546587823831941932440,-0.1305261922200516011560722517970134504139} +#define T_48_5 {0.7933533402912351650826394688920117914677,-0.6087614290087206558865773331490345299244} +#define T_48_7 {0.6087614290087206558865773331490345299244,-0.7933533402912351650826394688920117914677} +#define T_48_11 {0.1305261922200516011560722517970134504139,-0.9914448613738103821546587823831941932440} +#define T_48_13 {-0.1305261922200516011560722517970134504139,-0.9914448613738103821546587823831941932440} +#define T_48_17 {-0.6087614290087206558865773331490345299244,-0.7933533402912351650826394688920117914677} +#define T_48_19 {-0.7933533402912351650826394688920117914677,-0.6087614290087206558865773331490345299244} +#define T_48_23 {-0.9914448613738103821546587823831941932440,-0.1305261922200516011560722517970134504139} +#define T_48_25 {-0.9914448613738103821546587823831941932440,0.1305261922200516011560722517970134504139} +#define T_48_29 {-0.7933533402912351650826394688920117914677,0.6087614290087206558865773331490345299244} +#define T_48_31 {-0.6087614290087206558865773331490345299244,0.7933533402912351650826394688920117914677} +#define T_48_35 {-0.1305261922200516011560722517970134504139,0.9914448613738103821546587823831941932440} +#define T_48_37 {0.1305261922200516011560722517970134504139,0.9914448613738103821546587823831941932440} +#define T_48_41 {0.6087614290087206558865773331490345299244,0.7933533402912351650826394688920117914677} +#define T_48_43 {0.7933533402912351650826394688920117914677,0.6087614290087206558865773331490345299244} +#define T_48_47 
{0.9914448613738103821546587823831941932440,0.1305261922200516011560722517970134504139} +// Pre-computed twiddles for N=49 +#define T_49_1 {0.9917900138232461637599612913618329912424,-0.1278771616845060243861098570050671696663} +#define T_49_2 {0.9672948630390294511016691103577613830566,-0.2536545839095074028612941674509784206748} +#define T_49_3 {0.9269167573460217468550581543240696191788,-0.3752670048793741441883753395813982933760} +#define T_49_4 {0.8713187041233892982106112867768388241529,-0.4907175520039379068570895015000132843852} +#define T_49_5 {0.8014136218679566159295291072339750826359,-0.5981105304912159859043185861082747578621} +#define T_49_6 {0.7183493500977276013941263954620808362961,-0.6956825506034863826343439541233237832785} +#define T_49_8 {0.5183925683105250481474968182737939059734,-0.8551427630053461959036553707846906036139} +#define T_49_9 {0.4047833431223938349674540404521394520998,-0.9144126230158124979396916387486271560192} +#define T_49_10 {0.2845275866310324475172421898605534806848,-0.9586678530366605777146560285473242402077} +#define T_49_11 {0.1595998950333792354427941972971893846989,-0.9871817834144501757620560056238900870085} +#define T_49_12 {0.0320515775716551723495406633901438908651,-0.9994862162006878936182374673080630600452} +#define T_49_13 {-0.0960230259076817610486287435378471855074,-0.9953791129491982303534314269199967384338} +#define T_49_15 {-0.3453650544213076045707566663622856140137,-0.9384684220497604423272264284605626016855} +#define T_49_16 {-0.4625382902408352592260598612483590841293,-0.8865993063730001066957697730686049908400} +#define T_49_17 {-0.5721166601221696623724710661917924880981,-0.8201722545969558630929441278567537665367} +#define T_49_18 {-0.6723008902613167858675069510354660451412,-0.7402779970753154969997922307811677455902} +#define T_49_19 {-0.7614459583691344235489850689191371202469,-0.6482283953077884186555479573144111782312} +#define T_49_20 
{-0.8380881048918407127956697877380065619946,-0.5455349012105487060964037482335697859526} +#define T_49_22 {-0.9490557470106686377420146527583710849285,-0.3151082180236207119783387042843969538808} +#define T_49_23 {-0.9815591569910653291231028561014682054520,-0.1911586287013722906813484314625384286046} +#define T_49_24 {-0.9979453927503363352968790422892197966576,-0.0640702199807129252340232028473110403866} +#define T_49_25 {-0.9979453927503363352968790422892197966576,0.0640702199807129252340232028473110403866} +#define T_49_26 {-0.9815591569910653291231028561014682054520,0.1911586287013722906813484314625384286046} +#define T_49_27 {-0.9490557470106686377420146527583710849285,0.3151082180236207119783387042843969538808} +#define T_49_29 {-0.8380881048918407127956697877380065619946,0.5455349012105487060964037482335697859526} +#define T_49_30 {-0.7614459583691344235489850689191371202469,0.6482283953077884186555479573144111782312} +#define T_49_31 {-0.6723008902613167858675069510354660451412,0.7402779970753154969997922307811677455902} +#define T_49_32 {-0.5721166601221696623724710661917924880981,0.8201722545969558630929441278567537665367} +#define T_49_33 {-0.4625382902408352592260598612483590841293,0.8865993063730001066957697730686049908400} +#define T_49_34 {-0.3453650544213076045707566663622856140137,0.9384684220497604423272264284605626016855} +#define T_49_36 {-0.0960230259076817610486287435378471855074,0.9953791129491982303534314269199967384338} +#define T_49_37 {0.0320515775716551723495406633901438908651,0.9994862162006878936182374673080630600452} +#define T_49_38 {0.1595998950333792354427941972971893846989,0.9871817834144501757620560056238900870085} +#define T_49_39 {0.2845275866310324475172421898605534806848,0.9586678530366605777146560285473242402077} +#define T_49_40 {0.4047833431223938349674540404521394520998,0.9144126230158124979396916387486271560192} +#define T_49_41 {0.5183925683105250481474968182737939059734,0.8551427630053461959036553707846906036139} 
+#define T_49_43 {0.7183493500977276013941263954620808362961,0.6956825506034863826343439541233237832785} +#define T_49_44 {0.8014136218679566159295291072339750826359,0.5981105304912159859043185861082747578621} +#define T_49_45 {0.8713187041233892982106112867768388241529,0.4907175520039379068570895015000132843852} +#define T_49_46 {0.9269167573460217468550581543240696191788,0.3752670048793741441883753395813982933760} +#define T_49_47 {0.9672948630390294511016691103577613830566,0.2536545839095074028612941674509784206748} +#define T_49_48 {0.9917900138232461637599612913618329912424,0.1278771616845060243861098570050671696663} +// Pre-computed twiddles for N=50 +#define T_50_1 {0.9921147013144778759041741977853234857321,-0.1253332335643042583228634612169116735458} +#define T_50_3 {0.9297764858882514582560929738974664360285,-0.3681245526846779747565108209528261795640} +#define T_50_7 {0.6374239897486897454825793829513713717461,-0.7705132427757892532582673084107227623463} +#define T_50_9 {0.4257792915650726595089281545369885861874,-0.9048270524660195768262838100781664252281} +#define T_50_11 {0.1873813145857246287295794218152877874672,-0.9822872507286887211463977109815459698439} +#define T_50_13 {-0.0627905195293133738809743249476014170796,-0.9980267284282715589682766221812926232815} +#define T_50_17 {-0.5358267949789966566598309327673632651567,-0.8443279255020150753097141205216757953167} +#define T_50_19 {-0.7289686274214115524472390461596660315990,-0.6845471059286887260952880751574411988258} +#define T_50_21 {-0.8763066800438635839398671123490203171968,-0.4817536741017152679411594817793229594827} +#define T_50_23 {-0.9685831611286310760533524444326758384705,-0.2486898871648547948431939857982797548175} +#define T_50_27 {-0.9685831611286310760533524444326758384705,0.2486898871648547948431939857982797548175} +#define T_50_29 {-0.8763066800438635839398671123490203171968,0.4817536741017152679411594817793229594827} +#define T_50_31 
{-0.7289686274214115524472390461596660315990,0.6845471059286887260952880751574411988258} +#define T_50_33 {-0.5358267949789966566598309327673632651567,0.8443279255020150753097141205216757953167} +#define T_50_37 {-0.0627905195293133738809743249476014170796,0.9980267284282715589682766221812926232815} +#define T_50_39 {0.1873813145857246287295794218152877874672,0.9822872507286887211463977109815459698439} +#define T_50_41 {0.4257792915650726595089281545369885861874,0.9048270524660195768262838100781664252281} +#define T_50_43 {0.6374239897486897454825793829513713717461,0.7705132427757892532582673084107227623463} +#define T_50_47 {0.9297764858882514582560929738974664360285,0.3681245526846779747565108209528261795640} +#define T_50_49 {0.9921147013144778759041741977853234857321,0.1253332335643042583228634612169116735458} +// Pre-computed twiddles for N=51 +#define T_51_1 {0.9924205096719357355183888103056233376265,-0.1228882906647141282086010960483690723777} +#define T_51_2 {0.9697969360350094936151776892074849456549,-0.2439137201083771411269651707698358222842} +#define T_51_4 {0.8810121942857844778984599543036893010139,-0.4730935568360100651474908772797789424658} +#define T_51_5 {0.8161969123562217154344011760258581489325,-0.5777738314082511150715504300023894757032} +#define T_51_7 {0.6506183002042421614774525551183614879847,-0.7594049166547071072486119192035403102636} +#define T_51_8 {0.5523649729605057956405289587564766407013,-0.8336023852211195128703025147842708975077} +#define T_51_10 {0.3323547994796596793953824544587405398488,-0.9431544344712774474714933603536337614059} +#define T_51_11 {0.2139330832064974263762024975221720524132,-0.9768483177596006816756357693520840257406} +#define T_51_13 {-0.0307950585561703529813293300776422256604,-0.9995257197133659010646056231053080409765} +#define T_51_14 {-0.1533916548786853628616455580413457937539,-0.9881654720812593772905074729351326823235} +#define T_51_16 
{-0.3897858732926793856599090304371202364564,-0.9209055179449536776559170903055928647518} +#define T_51_19 {-0.6961339459629266590567908679076936095953,-0.7179119230644419591413907255628146231174} +#define T_51_20 {-0.7790805745256704772927491831069346517324,-0.6269238058941064561935263554914854466915} +#define T_51_22 {-0.9084652718195236342779708138550631701946,-0.4179603448867834081248417987808352336287} +#define T_51_23 {-0.9529420004271565991516013127693440765142,-0.3031526741130434743354271631687879562378} +#define T_51_25 {-0.9981033287370441042796187502972315996885,-0.0615609061339428345616120452632458182052} +#define T_51_26 {-0.9981033287370441042796187502972315996885,0.0615609061339428345616120452632458182052} +#define T_51_28 {-0.9529420004271565991516013127693440765142,0.3031526741130434743354271631687879562378} +#define T_51_29 {-0.9084652718195236342779708138550631701946,0.4179603448867834081248417987808352336287} +#define T_51_31 {-0.7790805745256704772927491831069346517324,0.6269238058941064561935263554914854466915} +#define T_51_32 {-0.6961339459629266590567908679076936095953,0.7179119230644419591413907255628146231174} +#define T_51_35 {-0.3897858732926793856599090304371202364564,0.9209055179449536776559170903055928647518} +#define T_51_37 {-0.1533916548786853628616455580413457937539,0.9881654720812593772905074729351326823235} +#define T_51_38 {-0.0307950585561703529813293300776422256604,0.9995257197133659010646056231053080409765} +#define T_51_40 {0.2139330832064974263762024975221720524132,0.9768483177596006816756357693520840257406} +#define T_51_41 {0.3323547994796596793953824544587405398488,0.9431544344712774474714933603536337614059} +#define T_51_43 {0.5523649729605057956405289587564766407013,0.8336023852211195128703025147842708975077} +#define T_51_44 {0.6506183002042421614774525551183614879847,0.7594049166547071072486119192035403102636} +#define T_51_46 {0.8161969123562217154344011760258581489325,0.5777738314082511150715504300023894757032} 
+#define T_51_47 {0.8810121942857844778984599543036893010139,0.4730935568360100651474908772797789424658} +#define T_51_49 {0.9697969360350094936151776892074849456549,0.2439137201083771411269651707698358222842} +#define T_51_50 {0.9924205096719357355183888103056233376265,0.1228882906647141282086010960483690723777} +// Pre-computed twiddles for N=52 +#define T_52_1 {0.9927088740980539727232212499075103551149,-0.1205366802553230476391377123945858329535} +#define T_52_3 {0.9350162426854148334243177487223874777555,-0.3546048870425356214219902994955191388726} +#define T_52_5 {0.8229838658936563522416918203816749155521,-0.5680647467311558118652214943722356110811} +#define T_52_7 {0.6631226582407951930520084715681150555611,-0.7485107481711010812830409122398123145103} +#define T_52_9 {0.4647231720437685620339607339701615273952,-0.8854560256532099105086786039464641362429} +#define T_52_11 {0.2393156642875577666540465315847541205585,-0.9709418174260520117968553677201271057129} +#define T_52_15 {-0.2393156642875577666540465315847541205585,-0.9709418174260520117968553677201271057129} +#define T_52_17 {-0.4647231720437685620339607339701615273952,-0.8854560256532099105086786039464641362429} +#define T_52_19 {-0.6631226582407951930520084715681150555611,-0.7485107481711010812830409122398123145103} +#define T_52_21 {-0.8229838658936563522416918203816749155521,-0.5680647467311558118652214943722356110811} +#define T_52_23 {-0.9350162426854148334243177487223874777555,-0.3546048870425356214219902994955191388726} +#define T_52_25 {-0.9927088740980539727232212499075103551149,-0.1205366802553230476391377123945858329535} +#define T_52_27 {-0.9927088740980539727232212499075103551149,0.1205366802553230476391377123945858329535} +#define T_52_29 {-0.9350162426854148334243177487223874777555,0.3546048870425356214219902994955191388726} +#define T_52_31 {-0.8229838658936563522416918203816749155521,0.5680647467311558118652214943722356110811} +#define T_52_33 
{-0.6631226582407951930520084715681150555611,0.7485107481711010812830409122398123145103} +#define T_52_35 {-0.4647231720437685620339607339701615273952,0.8854560256532099105086786039464641362429} +#define T_52_37 {-0.2393156642875577666540465315847541205585,0.9709418174260520117968553677201271057129} +#define T_52_41 {0.2393156642875577666540465315847541205585,0.9709418174260520117968553677201271057129} +#define T_52_43 {0.4647231720437685620339607339701615273952,0.8854560256532099105086786039464641362429} +#define T_52_45 {0.6631226582407951930520084715681150555611,0.7485107481711010812830409122398123145103} +#define T_52_47 {0.8229838658936563522416918203816749155521,0.5680647467311558118652214943722356110811} +#define T_52_49 {0.9350162426854148334243177487223874777555,0.3546048870425356214219902994955191388726} +#define T_52_51 {0.9927088740980539727232212499075103551149,0.1205366802553230476391377123945858329535} +// Pre-computed twiddles for N=53 +#define T_53_1 {0.9929810960135169173312874590919818729162,-0.1182731709213658000123814417747780680656} +#define T_53_2 {0.9720229140804107492712660132383462041616,-0.2348860457809836821407856177756912074983} +#define T_53_3 {0.9374196611341208962286941641650628298521,-0.3482016354343988040476176593074342235923} +#define T_53_4 {0.8896570909947473237977533244702499359846,-0.4566292373937130832928232848644256591797} +#define T_53_5 {0.8294056854502017683827830296650063246489,-0.5586467658036524674969314219197258353233} +#define T_53_6 {0.7575112421616200242269201226008590310812,-0.6528221181905216052854257213766686618328} +#define T_53_7 {0.6749830015182105569238046882674098014832,-0.7378332790417272413208138459594920277596} +#define T_53_8 {0.5829794791144721211040291564131621271372,-0.8124868780056813122314451902639120817184} +#define T_53_9 {0.4827922027307448571420422922528814524412,-0.8757349421956368562547368128434754908085} +#define T_53_10 
{0.3758275821142381678541255496384110301733,-0.9266896074318334841635191878594923764467} +#define T_53_11 {0.2635871660690676243632424302631989121437,-0.9646355819083586302653543498308863490820} +#define T_53_12 {0.1476465640024812453479086116203689016402,-0.9890401873221640327571435591380577534437} +#define T_53_13 {0.0296333278225597400445945339697573217563,-0.9995608365087943480986609756655525416136} +#define T_53_14 {-0.0887958953229348035840473585267318412662,-0.9960498426152168960356902971398085355759} +#define T_53_15 {-0.2059786187410983793188989920963649637997,-0.9785564922995040326725302293198183178902} +#define T_53_16 {-0.3202698538628376279113751934346510097384,-0.9473263538541913941770644669304601848125} +#define T_53_17 {-0.4300652022765204529619609274959657341242,-0.9027978299657435368175129042356275022030} +#define T_53_18 {-0.5338233779647907351062485759030096232891,-0.8455960035018260656514144102402497082949} +#define T_53_19 {-0.6300878435817111045125216151063796132803,-0.7765238627180424124674118502298370003700} +#define T_53_20 {-0.7175072570443311592569557433307636529207,-0.6965510290629970402420667596743442118168} +#define T_53_21 {-0.7948544414133532631794309963879641145468,-0.6068001458185933172373438537761103361845} +#define T_53_22 {-0.8610436117673555544982377796259243041277,-0.5085311186492205148823586569051258265972} +#define T_53_23 {-0.9151456172430184610178116599854547530413,-0.4031234292879722347358040224207798019052} +#define T_53_24 {-0.9564009842765224256311284989351406693459,-0.2920567706369758265161351573624415323138} +#define T_53_25 {-0.9842305779475968297020926911500282585621,-0.1768902751225729652606588615526561625302} +#define T_53_26 {-0.9982437317643214624851566441066097468138,-0.0592406278937142871665599841435323469341} +#define T_53_27 {-0.9982437317643214624851566441066097468138,0.0592406278937142871665599841435323469341} +#define T_53_28 
{-0.9842305779475968297020926911500282585621,0.1768902751225729652606588615526561625302} +#define T_53_29 {-0.9564009842765224256311284989351406693459,0.2920567706369758265161351573624415323138} +#define T_53_30 {-0.9151456172430184610178116599854547530413,0.4031234292879722347358040224207798019052} +#define T_53_31 {-0.8610436117673555544982377796259243041277,0.5085311186492205148823586569051258265972} +#define T_53_32 {-0.7948544414133532631794309963879641145468,0.6068001458185933172373438537761103361845} +#define T_53_33 {-0.7175072570443311592569557433307636529207,0.6965510290629970402420667596743442118168} +#define T_53_34 {-0.6300878435817111045125216151063796132803,0.7765238627180424124674118502298370003700} +#define T_53_35 {-0.5338233779647907351062485759030096232891,0.8455960035018260656514144102402497082949} +#define T_53_36 {-0.4300652022765204529619609274959657341242,0.9027978299657435368175129042356275022030} +#define T_53_37 {-0.3202698538628376279113751934346510097384,0.9473263538541913941770644669304601848125} +#define T_53_38 {-0.2059786187410983793188989920963649637997,0.9785564922995040326725302293198183178902} +#define T_53_39 {-0.0887958953229348035840473585267318412662,0.9960498426152168960356902971398085355759} +#define T_53_40 {0.0296333278225597400445945339697573217563,0.9995608365087943480986609756655525416136} +#define T_53_41 {0.1476465640024812453479086116203689016402,0.9890401873221640327571435591380577534437} +#define T_53_42 {0.2635871660690676243632424302631989121437,0.9646355819083586302653543498308863490820} +#define T_53_43 {0.3758275821142381678541255496384110301733,0.9266896074318334841635191878594923764467} +#define T_53_44 {0.4827922027307448571420422922528814524412,0.8757349421956368562547368128434754908085} +#define T_53_45 {0.5829794791144721211040291564131621271372,0.8124868780056813122314451902639120817184} +#define T_53_46 {0.6749830015182105569238046882674098014832,0.7378332790417272413208138459594920277596} +#define 
T_53_47 {0.7575112421616200242269201226008590310812,0.6528221181905216052854257213766686618328} +#define T_53_48 {0.8294056854502017683827830296650063246489,0.5586467658036524674969314219197258353233} +#define T_53_49 {0.8896570909947473237977533244702499359846,0.4566292373937130832928232848644256591797} +#define T_53_50 {0.9374196611341208962286941641650628298521,0.3482016354343988040476176593074342235923} +#define T_53_51 {0.9720229140804107492712660132383462041616,0.2348860457809836821407856177756912074983} +#define T_53_52 {0.9929810960135169173312874590919818729162,0.1182731709213658000123814417747780680656} +// Pre-computed twiddles for N=54 +#define T_54_1 {0.9932383577419430231714159162947908043861,-0.1160929141252302343456648259234498254955} +#define T_54_5 {0.8354878114129363764206459563865792006254,-0.5495089780708060089864375186152756214142} +#define T_54_7 {0.6862416378687336004915664489089976996183,-0.7273736415730487347985899759805761277676} +#define T_54_11 {0.2868032327110902612865572791633894667029,-0.9579895123154889002847767187631689012051} +#define T_54_13 {0.0581448289104758292422658882969699334353,-0.9983081582712681756319739179161842912436} +#define T_54_17 {-0.3960797660391568442150855844374746084213,-0.9182161068802739967154025180207099765539} +#define T_54_19 {-0.5971585917027861789563303318573161959648,-0.8021231927550437346141620764683466404676} +#define T_54_23 {-0.8936326403234122750518508837558329105377,-0.4487991802004621666455363993009086698294} +#define T_54_25 {-0.9730448705798238062669724968145601451397,-0.2306158707424401654861867427825927734375} +#define T_54_29 {-0.9730448705798238062669724968145601451397,0.2306158707424401654861867427825927734375} +#define T_54_31 {-0.8936326403234122750518508837558329105377,0.4487991802004621666455363993009086698294} +#define T_54_35 {-0.5971585917027861789563303318573161959648,0.8021231927550437346141620764683466404676} +#define T_54_37 
{-0.3960797660391568442150855844374746084213,0.9182161068802739967154025180207099765539} +#define T_54_41 {0.0581448289104758292422658882969699334353,0.9983081582712681756319739179161842912436} +#define T_54_43 {0.2868032327110902612865572791633894667029,0.9579895123154889002847767187631689012051} +#define T_54_47 {0.6862416378687336004915664489089976996183,0.7273736415730487347985899759805761277676} +#define T_54_49 {0.8354878114129363764206459563865792006254,0.5495089780708060089864375186152756214142} +#define T_54_53 {0.9932383577419430231714159162947908043861,0.1160929141252302343456648259234498254955} +// Pre-computed twiddles for N=55 +#define T_55_1 {0.9934817353485502211540847383730579167604,-0.1139914098905406292239206322847167029977} +#define T_55_2 {0.9740119169423334666291225403256248682737,-0.2264967674257643837076869886004715226591} +#define T_55_3 {0.9418443636395247287040888295450713485479,-0.3360493932154301477410740517370868474245} +#define T_55_4 {0.8973984286913584407230359829554799944162,-0.4412211012432212808853648766671540215611} +#define T_55_6 {0.7741416106390824891647639560687821358442,-0.6330124538088703900484688347205519676208} +#define T_55_7 {0.6969375686552934379136559073231182992458,-0.7171318047589635424898801829840522259474} +#define T_55_8 {0.6106478796354380955690999144280795007944,-0.7919022459222750942586799283162690699100} +#define T_55_9 {0.5163974616389619409773104052874259650707,-0.8563490302515889274204141656809952110052} +#define T_55_12 {0.1985904666457454781358649142930516973138,-0.9800825610923934450013916830357629805803} +#define T_55_13 {0.0855750084788397397117520881693053524941,-0.9963317308626913337832320394227281212807} +#define T_55_14 {-0.0285560507936962540420644529604032868519,-0.9995921928281892299139599344925954937935} +#define T_55_16 {-0.2542183341934869389433515607379376888275,-0.9671468547019571593281739296799059957266} +#define T_55_17 
{-0.3628077053506409965777379511564504355192,-0.9318640292114522827304767815803643316031} +#define T_55_18 {-0.4666673232256737247070077501120977103710,-0.8844329309978142816817126004025340080261} +#define T_55_19 {-0.5644432188667691807637538659037090837955,-0.8254718969627740099426205233612563461065} +#define T_55_21 {-0.7367411378764049389999968298070598393679,-0.6761749002740193859040118695702403783798} +#define T_55_23 {-0.8707460771197771398632880845980253070593,-0.4917329246456037528112403833802090957761} +#define T_55_24 {-0.9211236531148501160970454293419606983662,-0.3892701063173915065895869247469818219543} +#define T_55_26 {-0.9853538358476929825968682052916847169399,-0.1705221926326237757987058785147382877767} +#define T_55_27 {-0.9983691039261356481304687804367858916521,-0.0570888108627679857765002680025645531714} +#define T_55_28 {-0.9983691039261356481304687804367858916521,0.0570888108627679857765002680025645531714} +#define T_55_29 {-0.9853538358476929825968682052916847169399,0.1705221926326237757987058785147382877767} +#define T_55_31 {-0.9211236531148501160970454293419606983662,0.3892701063173915065895869247469818219543} +#define T_55_32 {-0.8707460771197771398632880845980253070593,0.4917329246456037528112403833802090957761} +#define T_55_34 {-0.7367411378764049389999968298070598393679,0.6761749002740193859040118695702403783798} +#define T_55_36 {-0.5644432188667691807637538659037090837955,0.8254718969627740099426205233612563461065} +#define T_55_37 {-0.4666673232256737247070077501120977103710,0.8844329309978142816817126004025340080261} +#define T_55_38 {-0.3628077053506409965777379511564504355192,0.9318640292114522827304767815803643316031} +#define T_55_39 {-0.2542183341934869389433515607379376888275,0.9671468547019571593281739296799059957266} +#define T_55_41 {-0.0285560507936962540420644529604032868519,0.9995921928281892299139599344925954937935} +#define T_55_42 
{0.0855750084788397397117520881693053524941,0.9963317308626913337832320394227281212807} +#define T_55_43 {0.1985904666457454781358649142930516973138,0.9800825610923934450013916830357629805803} +#define T_55_46 {0.5163974616389619409773104052874259650707,0.8563490302515889274204141656809952110052} +#define T_55_47 {0.6106478796354380955690999144280795007944,0.7919022459222750942586799283162690699100} +#define T_55_48 {0.6969375686552934379136559073231182992458,0.7171318047589635424898801829840522259474} +#define T_55_49 {0.7741416106390824891647639560687821358442,0.6330124538088703900484688347205519676208} +#define T_55_51 {0.8973984286913584407230359829554799944162,0.4412211012432212808853648766671540215611} +#define T_55_52 {0.9418443636395247287040888295450713485479,0.3360493932154301477410740517370868474245} +#define T_55_53 {0.9740119169423334666291225403256248682737,0.2264967674257643837076869886004715226591} +#define T_55_54 {0.9934817353485502211540847383730579167604,0.1139914098905406292239206322847167029977} +// Pre-computed twiddles for N=56 +#define T_56_1 {0.9937122098932426039752385804604273289442,-0.1119644761033078556033970585303904954344} +#define T_56_3 {0.9438833303083675740907665385748259723186,-0.3302790619551670969755718942906241863966} +#define T_56_5 {0.8467241992282841245298641297267749905586,-0.5320320765153365716315647659939713776112} +#define T_56_9 {0.5320320765153365716315647659939713776112,-0.8467241992282841245298641297267749905586} +#define T_56_11 {0.3302790619551670969755718942906241863966,-0.9438833303083675740907665385748259723186} +#define T_56_13 {0.1119644761033078556033970585303904954344,-0.9937122098932426039752385804604273289442} +#define T_56_15 {-0.1119644761033078556033970585303904954344,-0.9937122098932426039752385804604273289442} +#define T_56_17 {-0.3302790619551670969755718942906241863966,-0.9438833303083675740907665385748259723186} +#define T_56_19 
{-0.5320320765153365716315647659939713776112,-0.8467241992282841245298641297267749905586} +#define T_56_23 {-0.8467241992282841245298641297267749905586,-0.5320320765153365716315647659939713776112} +#define T_56_25 {-0.9438833303083675740907665385748259723186,-0.3302790619551670969755718942906241863966} +#define T_56_27 {-0.9937122098932426039752385804604273289442,-0.1119644761033078556033970585303904954344} +#define T_56_29 {-0.9937122098932426039752385804604273289442,0.1119644761033078556033970585303904954344} +#define T_56_31 {-0.9438833303083675740907665385748259723186,0.3302790619551670969755718942906241863966} +#define T_56_33 {-0.8467241992282841245298641297267749905586,0.5320320765153365716315647659939713776112} +#define T_56_37 {-0.5320320765153365716315647659939713776112,0.8467241992282841245298641297267749905586} +#define T_56_39 {-0.3302790619551670969755718942906241863966,0.9438833303083675740907665385748259723186} +#define T_56_41 {-0.1119644761033078556033970585303904954344,0.9937122098932426039752385804604273289442} +#define T_56_43 {0.1119644761033078556033970585303904954344,0.9937122098932426039752385804604273289442} +#define T_56_45 {0.3302790619551670969755718942906241863966,0.9438833303083675740907665385748259723186} +#define T_56_47 {0.5320320765153365716315647659939713776112,0.8467241992282841245298641297267749905586} +#define T_56_51 {0.8467241992282841245298641297267749905586,0.5320320765153365716315647659939713776112} +#define T_56_53 {0.9438833303083675740907665385748259723186,0.3302790619551670969755718942906241863966} +#define T_56_55 {0.9937122098932426039752385804604273289442,0.1119644761033078556033970585303904954344} +// Pre-computed twiddles for N=57 +#define T_57_1 {0.9939306773179494847170190041651949286461,-0.1100082209940792948854593191754247527570} +#define T_57_2 {0.9757963826274356211598615118418820202351,-0.2186810912063757994516777216631453484297} +#define T_57_4 
{0.9043571606975775445391718676546588540077,-0.4267764354964036499495705356821417808533} +#define T_57_5 {0.8519194088383270635489452615729533135891,-0.5236729139878778527261715680651832371950} +#define T_57_7 {0.7167825131684512074414783455722499638796,-0.6972968010939953886762054935388732701540} +#define T_57_8 {0.6357237482099680203617708684760145843029,-0.7719166509163208900190511485561728477478} +#define T_57_10 {0.4515333583108893655833071534289047122002,-0.8922542386183940354982269127503968775272} +#define T_57_11 {0.3506375551927543576802293046057457104325,-0.9365112411970547556805399835866410285234} +#define T_57_13 {0.1373535578184081618058343110533314757049,-0.9905220846375032417441275356395635753870} +#define T_57_14 {0.0275543423681619964460409732964762952179,-0.9996203070249514155420911265537142753601} +#define T_57_16 {-0.1917106319237384115528044503662385977805,-0.9814514932524178725259389466373249888420} +#define T_57_17 {-0.2985148110016945244282737803587224334478,-0.9544050018795073864197320290259085595608} +#define T_57_20 {-0.5922352526649800052993555254943203181028,-0.8057651056609781425876803950814064592123} +#define T_57_22 {-0.7541066097768962528391512023517861962318,-0.6567520240477343618223926569044124335051} +#define T_57_23 {-0.8217778152252451784320896877034101635218,-0.5698080575102661970987583117675967514515} +#define T_57_25 {-0.9264940672148017641873707361810375005007,-0.3763093719478354803875674861046718433499} +#define T_57_26 {-0.9622680003092504152917285864532459527254,-0.2721034648453349857888383667159359902143} +#define T_57_28 {-0.9984815164333161741083699780574534088373,-0.0550877603558654407867045676994166569784} +#define T_57_29 {-0.9984815164333161741083699780574534088373,0.0550877603558654407867045676994166569784} +#define T_57_31 {-0.9622680003092504152917285864532459527254,0.2721034648453349857888383667159359902143} +#define T_57_32 
{-0.9264940672148017641873707361810375005007,0.3763093719478354803875674861046718433499} +#define T_57_34 {-0.8217778152252451784320896877034101635218,0.5698080575102661970987583117675967514515} +#define T_57_35 {-0.7541066097768962528391512023517861962318,0.6567520240477343618223926569044124335051} +#define T_57_37 {-0.5922352526649800052993555254943203181028,0.8057651056609781425876803950814064592123} +#define T_57_40 {-0.2985148110016945244282737803587224334478,0.9544050018795073864197320290259085595608} +#define T_57_41 {-0.1917106319237384115528044503662385977805,0.9814514932524178725259389466373249888420} +#define T_57_43 {0.0275543423681619964460409732964762952179,0.9996203070249514155420911265537142753601} +#define T_57_44 {0.1373535578184081618058343110533314757049,0.9905220846375032417441275356395635753870} +#define T_57_46 {0.3506375551927543576802293046057457104325,0.9365112411970547556805399835866410285234} +#define T_57_47 {0.4515333583108893655833071534289047122002,0.8922542386183940354982269127503968775272} +#define T_57_49 {0.6357237482099680203617708684760145843029,0.7719166509163208900190511485561728477478} +#define T_57_50 {0.7167825131684512074414783455722499638796,0.6972968010939953886762054935388732701540} +#define T_57_52 {0.8519194088383270635489452615729533135891,0.5236729139878778527261715680651832371950} +#define T_57_53 {0.9043571606975775445391718676546588540077,0.4267764354964036499495705356821417808533} +#define T_57_55 {0.9757963826274356211598615118418820202351,0.2186810912063757994516777216631453484297} +#define T_57_56 {0.9939306773179494847170190041651949286461,0.1100082209940792948854593191754247527570} +// Pre-computed twiddles for N=58 +#define T_58_1 {0.9941379571543595972116236225701868534088,-0.1081190184239417678702110947597248014063} +#define T_58_3 {0.9476531711828024562294103816384449601173,-0.3193015301359799495983793349296320229769} +#define T_58_5 
{0.8568571761675892739518189955560956150293,-0.5155538571770217348699816284351982176304} +#define T_58_7 {0.7259954919231308423377413419075310230255,-0.6876994588534233177057330976822413504124} +#define T_58_9 {0.5611870653623823868016984306450467556715,-0.8276889981568905652054013444285374134779} +#define T_58_11 {0.3701381553399143431448692354024387896061,-0.9289767198167914186157645417551975697279} +#define T_58_13 {0.1617819965527647341207995168588240630925,-0.9868265225415261410191192226193379610777} +#define T_58_15 {-0.0541389085854175264311827220353734446689,-0.9985334138511238188939955762180034071207} +#define T_58_17 {-0.2675283385292208104822009318013442680240,-0.9635499925192229087045348023821134120226} +#define T_58_19 {-0.4684084406997901517399895965354517102242,-0.8835120444460229371941295539727434515953} +#define T_58_21 {-0.6473862847818276922140512397163547575474,-0.7621620551276364619042169579188339412212} +#define T_58_23 {-0.7960930657056437542706817112048156559467,-0.6051742151937651303938991986797191202641} +#define T_58_25 {-0.9075754196709570065237926428380887955427,-0.4198891015602645992998986912425607442856} +#define T_58_27 {-0.9766205557100866574415931609109975397587,-0.2149704402110240719636635731148999184370} +#define T_58_31 {-0.9766205557100866574415931609109975397587,0.2149704402110240719636635731148999184370} +#define T_58_33 {-0.9075754196709570065237926428380887955427,0.4198891015602645992998986912425607442856} +#define T_58_35 {-0.7960930657056437542706817112048156559467,0.6051742151937651303938991986797191202641} +#define T_58_37 {-0.6473862847818276922140512397163547575474,0.7621620551276364619042169579188339412212} +#define T_58_39 {-0.4684084406997901517399895965354517102242,0.8835120444460229371941295539727434515953} +#define T_58_41 {-0.2675283385292208104822009318013442680240,0.9635499925192229087045348023821134120226} +#define T_58_43 
{-0.0541389085854175264311827220353734446689,0.9985334138511238188939955762180034071207} +#define T_58_45 {0.1617819965527647341207995168588240630925,0.9868265225415261410191192226193379610777} +#define T_58_47 {0.3701381553399143431448692354024387896061,0.9289767198167914186157645417551975697279} +#define T_58_49 {0.5611870653623823868016984306450467556715,0.8276889981568905652054013444285374134779} +#define T_58_51 {0.7259954919231308423377413419075310230255,0.6876994588534233177057330976822413504124} +#define T_58_53 {0.8568571761675892739518189955560956150293,0.5155538571770217348699816284351982176304} +#define T_58_55 {0.9476531711828024562294103816384449601173,0.3193015301359799495983793349296320229769} +#define T_58_57 {0.9941379571543595972116236225701868534088,0.1081190184239417678702110947597248014063} +// Pre-computed twiddles for N=59 +#define T_59_1 {0.9943348002101370930461143871070817112923,-0.1062934856473654043895749055081978440285} +#define T_59_2 {0.9774033898178666746048293134663254022598,-0.2113826236296243166190578222085605375469} +#define T_59_3 {0.9493976084683812821651827107416465878487,-0.3140767120219488117704997875989647582173} +#define T_59_4 {0.9106347728549132325426285206049215048552,-0.4132121857683782040204789609560975804925} +#define T_59_5 {0.8615540813938060704657573296572081744671,-0.5076658003388400119959555922832805663347} +#define T_59_6 {0.8027116379309636728933696758758742362261,-0.5963673585385014241921908251242712140083} +#define T_59_7 {0.7347741508630672990776133701729122549295,-0.6783118362696161085523272049613296985626} +#define T_59_8 {0.6585113790650386489389234156988095492125,-0.7525707698561384662028217462648171931505} +#define T_59_9 {0.5747874102144069352604560663166921585798,-0.8183027759081690222231486586679238826036} +#define T_59_10 {0.4845508703326501520614044693502364680171,-0.8747630845319612769017680875549558550119} +#define T_59_11 
{0.3888241754733206545502355311327846720815,-0.9213119778704129680946266489627305418253} +#define T_59_12 {0.2886919473396210022997365740593522787094,-0.9574220383620054564133283747651148587465} +#define T_59_13 {0.1852887240871143315068536594480974599719,-0.9826841245925209689104917742952238768339} +#define T_59_14 {0.0797861055530830753923510201275348663330,-0.9968120070307501245920889232365880161524} +#define T_59_15 {-0.0266205214377747655107864233059444813989,-0.9996456111234526265008071277406997978687} +#define T_59_16 {-0.1327255272837219723935930915104108862579,-0.9911528310040071954034601731109432876110} +#define T_59_17 {-0.2373266998711148212386490286007756367326,-0.9714298932647099649173583202355075627565} +#define T_59_18 {-0.3392388661180302800168817611847771331668,-0.9407002666710332361432733705441933125257} +#define T_59_19 {-0.4373073204588553974936360191350104287267,-0.8993121301712192483535091014346107840538} +#define T_59_20 {-0.5304209081197425179610149825748521834612,-0.8477344278896709051096536313707474619150} +#define T_59_21 {-0.6175246149461919298317980064894072711468,-0.7865515558026424658422115498979110270739} +#define T_59_22 {-0.6976315211349847222876974228711333125830,-0.7164567402983151866990851885930169373751} +#define T_59_23 {-0.7698339834299062767186683231557253748178,-0.6382441836448200511711092985933646559715} +#define T_59_24 {-0.8333139190825149533736748708179220557213,-0.5528000653611934156472784707148093730211} +#define T_59_25 {-0.8873520750565715431434909987729042768478,-0.4610925014493258422554333719745045527816} +#define T_59_26 {-0.9313361774523384317703289525525178760290,-0.3641605752528221939812169694050680845976} +#define T_59_27 {-0.9647678688145159275890705430356319993734,-0.2631025642275212628184988261637045070529} +#define T_59_28 {-0.9872683547213445542922727327095344662666,-0.1590634960190720481598702917835908010602} +#define T_59_29 
{-0.9985826956767619444477190882025752216578,-0.0532221748421786572391667391457303892821} +#define T_59_30 {-0.9985826956767619444477190882025752216578,0.0532221748421786572391667391457303892821} +#define T_59_31 {-0.9872683547213445542922727327095344662666,0.1590634960190720481598702917835908010602} +#define T_59_32 {-0.9647678688145159275890705430356319993734,0.2631025642275212628184988261637045070529} +#define T_59_33 {-0.9313361774523384317703289525525178760290,0.3641605752528221939812169694050680845976} +#define T_59_34 {-0.8873520750565715431434909987729042768478,0.4610925014493258422554333719745045527816} +#define T_59_35 {-0.8333139190825149533736748708179220557213,0.5528000653611934156472784707148093730211} +#define T_59_36 {-0.7698339834299062767186683231557253748178,0.6382441836448200511711092985933646559715} +#define T_59_37 {-0.6976315211349847222876974228711333125830,0.7164567402983151866990851885930169373751} +#define T_59_38 {-0.6175246149461919298317980064894072711468,0.7865515558026424658422115498979110270739} +#define T_59_39 {-0.5304209081197425179610149825748521834612,0.8477344278896709051096536313707474619150} +#define T_59_40 {-0.4373073204588553974936360191350104287267,0.8993121301712192483535091014346107840538} +#define T_59_41 {-0.3392388661180302800168817611847771331668,0.9407002666710332361432733705441933125257} +#define T_59_42 {-0.2373266998711148212386490286007756367326,0.9714298932647099649173583202355075627565} +#define T_59_43 {-0.1327255272837219723935930915104108862579,0.9911528310040071954034601731109432876110} +#define T_59_44 {-0.0266205214377747655107864233059444813989,0.9996456111234526265008071277406997978687} +#define T_59_45 {0.0797861055530830753923510201275348663330,0.9968120070307501245920889232365880161524} +#define T_59_46 {0.1852887240871143315068536594480974599719,0.9826841245925209689104917742952238768339} +#define T_59_47 {0.2886919473396210022997365740593522787094,0.9574220383620054564133283747651148587465} 
+#define T_59_48 {0.3888241754733206545502355311327846720815,0.9213119778704129680946266489627305418253} +#define T_59_49 {0.4845508703326501520614044693502364680171,0.8747630845319612769017680875549558550119} +#define T_59_50 {0.5747874102144069352604560663166921585798,0.8183027759081690222231486586679238826036} +#define T_59_51 {0.6585113790650386489389234156988095492125,0.7525707698561384662028217462648171931505} +#define T_59_52 {0.7347741508630672990776133701729122549295,0.6783118362696161085523272049613296985626} +#define T_59_53 {0.8027116379309636728933696758758742362261,0.5963673585385014241921908251242712140083} +#define T_59_54 {0.8615540813938060704657573296572081744671,0.5076658003388400119959555922832805663347} +#define T_59_55 {0.9106347728549132325426285206049215048552,0.4132121857683782040204789609560975804925} +#define T_59_56 {0.9493976084683812821651827107416465878487,0.3140767120219488117704997875989647582173} +#define T_59_57 {0.9774033898178666746048293134663254022598,0.2113826236296243166190578222085605375469} +#define T_59_58 {0.9943348002101370930461143871070817112923,0.1062934856473654043895749055081978440285} +// Pre-computed twiddles for N=60 +#define T_60_1 {0.9945218953682732898613494398887269198895,-0.1045284632676534708473070622858358547091} +#define T_60_7 {0.7431448254773942441175904605188407003880,-0.6691306063588582375700752891134470701218} +#define T_60_11 {0.4067366430758002082690438783174613490701,-0.9135454576426008665990252666233573108912} +#define T_60_13 {0.2079116908177593425754992040310753509402,-0.9781476007338056888329447247087955474854} +#define T_60_17 {-0.2079116908177593425754992040310753509402,-0.9781476007338056888329447247087955474854} +#define T_60_19 {-0.4067366430758002082690438783174613490701,-0.9135454576426008665990252666233573108912} +#define T_60_23 {-0.7431448254773942441175904605188407003880,-0.6691306063588582375700752891134470701218} +#define T_60_29 
{-0.9945218953682732898613494398887269198895,-0.1045284632676534708473070622858358547091} +#define T_60_31 {-0.9945218953682732898613494398887269198895,0.1045284632676534708473070622858358547091} +#define T_60_37 {-0.7431448254773942441175904605188407003880,0.6691306063588582375700752891134470701218} +#define T_60_41 {-0.4067366430758002082690438783174613490701,0.9135454576426008665990252666233573108912} +#define T_60_43 {-0.2079116908177593425754992040310753509402,0.9781476007338056888329447247087955474854} +#define T_60_47 {0.2079116908177593425754992040310753509402,0.9781476007338056888329447247087955474854} +#define T_60_49 {0.4067366430758002082690438783174613490701,0.9135454576426008665990252666233573108912} +#define T_60_53 {0.7431448254773942441175904605188407003880,0.6691306063588582375700752891134470701218} +#define T_60_59 {0.9945218953682732898613494398887269198895,0.1045284632676534708473070622858358547091} +// Pre-computed twiddles for N=61 +#define T_61_1 {0.9946998756145890574487111734924837946892,-0.1028209971373603981437128140896675176919} +#define T_61_2 {0.9788556850953578081231398755335249006748,-0.2045520661262008321745753391951438970864} +#define T_61_3 {0.9526353808033825032453023595735430717468,-0.3041148323275179032876280871278140693903} +#define T_61_4 {0.9163169044870047574846694260486401617527,-0.4004539056512664840781212660658638924360} +#define T_61_5 {0.8702852410301552543714365128835197538137,-0.4925480679538644102066768937220331281424} +#define T_61_6 {0.8150283375168113630948596437519881874323,-0.5794210982045636670889621200331021100283} +#define T_61_7 {0.7511319308705198816156212160422001034021,-0.6601521206712317457743210979970172047615} +#define T_61_8 {0.6792733388972931329874427319737151265144,-0.7338853664321991487895502359606325626373} +#define T_61_9 {0.6002142805483682286293856122938450425863,-0.7998392447397193461000597380916588008404} +#define T_61_10 
{0.5147928015098307552577239221136551350355,-0.8573146280763322968354600561724510043859} +#define T_61_11 {0.4239143907098607000349943518813233822584,-0.9057022630804715257468728850653860718012} +#define T_61_12 {0.3285423819108347354145394092483911663294,-0.9444892287836612609908115700818598270416} +#define T_61_13 {0.2296877421317955469959315450978465378284,-0.9732643737003825457065886439522728323936} +#define T_61_14 {0.1283983551465509576061663210566621273756,-0.9917226741361014896725123435317073017359} +#define T_61_15 {0.0257479136549885571916274074055763776414,-0.9996684675143130416330450316308997571468} +#define T_61_16 {-0.0771754621266463464168694486033928114921,-0.9970175264485267030423187861742917448282} +#define T_61_17 {-0.1792807588107356575601869508318486623466,-0.9837979515735163493772574838658329099417} +#define T_61_18 {-0.2794856348516094324274661175877554342151,-0.9601498736716017612380369428137782961130} +#define T_61_19 {-0.3767278936351851070796215026348363608122,-0.9263239682514949890901334583759307861328} +#define T_61_20 {-0.4699767430273200652379728126106783747673,-0.8826787983255474090427128430746961385012} +#define T_61_21 {-0.5582437220268647859455768411862663924694,-0.8296770135526189138275299228553194552660} +#define T_61_22 {-0.6405931786981751141496488344273529946804,-0.7678804460365999950965942844050005078316} +#define T_61_23 {-0.7161521883143933120408064496587030589581,-0.6979441547663435274628795923490542918444} +#define T_61_24 {-0.7841198065767104319334634965343866497278,-0.6206094818274228153143212693976238369942} +#define T_61_25 {-0.8437755598231856390967209335940424352884,-0.5366961939916005119854958138603251427412} +#define T_61_26 {-0.8944870822287955824947403016267344355583,-0.4470937929851139247539038024115143343806} +#define T_61_27 {-0.9357168190404936414239500663825310766697,-0.3527520865490947898912565960927167907357} +#define T_61_28 
{-0.9670277247913203755302902209223248064518,-0.2546711202412287322616180063050705939531} +#define T_61_29 {-0.9880878960910771668935126399446744471788,-0.1538905767040617822427606142809963785112} +#define T_61_30 {-0.9986740898848305292290206125471740961075,-0.0514787547703465311887960353942617075518} +#define T_61_31 {-0.9986740898848305292290206125471740961075,0.0514787547703465311887960353942617075518} +#define T_61_32 {-0.9880878960910771668935126399446744471788,0.1538905767040617822427606142809963785112} +#define T_61_33 {-0.9670277247913203755302902209223248064518,0.2546711202412287322616180063050705939531} +#define T_61_34 {-0.9357168190404936414239500663825310766697,0.3527520865490947898912565960927167907357} +#define T_61_35 {-0.8944870822287955824947403016267344355583,0.4470937929851139247539038024115143343806} +#define T_61_36 {-0.8437755598231856390967209335940424352884,0.5366961939916005119854958138603251427412} +#define T_61_37 {-0.7841198065767104319334634965343866497278,0.6206094818274228153143212693976238369942} +#define T_61_38 {-0.7161521883143933120408064496587030589581,0.6979441547663435274628795923490542918444} +#define T_61_39 {-0.6405931786981751141496488344273529946804,0.7678804460365999950965942844050005078316} +#define T_61_40 {-0.5582437220268647859455768411862663924694,0.8296770135526189138275299228553194552660} +#define T_61_41 {-0.4699767430273200652379728126106783747673,0.8826787983255474090427128430746961385012} +#define T_61_42 {-0.3767278936351851070796215026348363608122,0.9263239682514949890901334583759307861328} +#define T_61_43 {-0.2794856348516094324274661175877554342151,0.9601498736716017612380369428137782961130} +#define T_61_44 {-0.1792807588107356575601869508318486623466,0.9837979515735163493772574838658329099417} +#define T_61_45 {-0.0771754621266463464168694486033928114921,0.9970175264485267030423187861742917448282} +#define T_61_46 {0.0257479136549885571916274074055763776414,0.9996684675143130416330450316308997571468} 
+#define T_61_47 {0.1283983551465509576061663210566621273756,0.9917226741361014896725123435317073017359} +#define T_61_48 {0.2296877421317955469959315450978465378284,0.9732643737003825457065886439522728323936} +#define T_61_49 {0.3285423819108347354145394092483911663294,0.9444892287836612609908115700818598270416} +#define T_61_50 {0.4239143907098607000349943518813233822584,0.9057022630804715257468728850653860718012} +#define T_61_51 {0.5147928015098307552577239221136551350355,0.8573146280763322968354600561724510043859} +#define T_61_52 {0.6002142805483682286293856122938450425863,0.7998392447397193461000597380916588008404} +#define T_61_53 {0.6792733388972931329874427319737151265144,0.7338853664321991487895502359606325626373} +#define T_61_54 {0.7511319308705198816156212160422001034021,0.6601521206712317457743210979970172047615} +#define T_61_55 {0.8150283375168113630948596437519881874323,0.5794210982045636670889621200331021100283} +#define T_61_56 {0.8702852410301552543714365128835197538137,0.4925480679538644102066768937220331281424} +#define T_61_57 {0.9163169044870047574846694260486401617527,0.4004539056512664840781212660658638924360} +#define T_61_58 {0.9526353808033825032453023595735430717468,0.3041148323275179032876280871278140693903} +#define T_61_59 {0.9788556850953578081231398755335249006748,0.2045520661262008321745753391951438970864} +#define T_61_60 {0.9946998756145890574487111734924837946892,0.1028209971373603981437128140896675176919} +// Pre-computed twiddles for N=62 +#define T_62_1 {0.9948693233918951550620590751350391656160,-0.1011683219874321831843744234902260359377} +#define T_62_3 {0.9541392564000488185627091297646984457970,-0.2993631229733579313112556974374456331134} +#define T_62_5 {0.8743466161445820894471125939162448048592,-0.4853019625310810436502606535213999450207} +#define T_62_7 {0.7587581226927908595314420381328091025352,-0.6513724827222222568678944298881106078625} +#define T_62_9 
{0.6121059825476627969109699733962770551443,-0.7907757369376985367992460851382929831743} +#define T_62_11 {0.4403941515576342835558421029418241232634,-0.8978045395707416886921237164642661809921} +#define T_62_13 {0.2506525322587205284463607313227839767933,-0.9680771188662042892048020803485997021198} +#define T_62_15 {0.0506491688387127117487729321965161943808,-0.9987165071710527586290595536411274224520} +#define T_62_17 {-0.1514277775045766716299766585507313720882,-0.9884683243281113806943949384731240570545} +#define T_62_19 {-0.3473052528448202780353426533110905438662,-0.9377521321470804194220249883073847740889} +#define T_62_21 {-0.5289640103269624971105145050387363880873,-0.8486442574947509198679540531884413212538} +#define T_62_23 {-0.6889669190756865235769623723172117024660,-0.7247927872291199946630513295531272888184} +#define T_62_25 {-0.8207634412072762897238931145693641155958,-0.5712682150947923087613844472798518836498} +#define T_62_27 {-0.9189578116202306024007384621654637157917,-0.3943558551133185541281989117123885080218} +#define T_62_29 {-0.9795299412524944848712493694620206952095,-0.2012985200886600878344978582390467636287} +#define T_62_33 {-0.9795299412524944848712493694620206952095,0.2012985200886600878344978582390467636287} +#define T_62_35 {-0.9189578116202306024007384621654637157917,0.3943558551133185541281989117123885080218} +#define T_62_37 {-0.8207634412072762897238931145693641155958,0.5712682150947923087613844472798518836498} +#define T_62_39 {-0.6889669190756865235769623723172117024660,0.7247927872291199946630513295531272888184} +#define T_62_41 {-0.5289640103269624971105145050387363880873,0.8486442574947509198679540531884413212538} +#define T_62_43 {-0.3473052528448202780353426533110905438662,0.9377521321470804194220249883073847740889} +#define T_62_45 {-0.1514277775045766716299766585507313720882,0.9884683243281113806943949384731240570545} +#define T_62_47 
{0.0506491688387127117487729321965161943808,0.9987165071710527586290595536411274224520} +#define T_62_49 {0.2506525322587205284463607313227839767933,0.9680771188662042892048020803485997021198} +#define T_62_51 {0.4403941515576342835558421029418241232634,0.8978045395707416886921237164642661809921} +#define T_62_53 {0.6121059825476627969109699733962770551443,0.7907757369376985367992460851382929831743} +#define T_62_55 {0.7587581226927908595314420381328091025352,0.6513724827222222568678944298881106078625} +#define T_62_57 {0.8743466161445820894471125939162448048592,0.4853019625310810436502606535213999450207} +#define T_62_59 {0.9541392564000488185627091297646984457970,0.2993631229733579313112556974374456331134} +#define T_62_61 {0.9948693233918951550620590751350391656160,0.1011683219874321831843744234902260359377} +// Pre-computed twiddles for N=63 +#define T_63_1 {0.9950307753654014097932645199762191623449,-0.0995678465958166614147017980940290726721} +#define T_63_2 {0.9801724878485438274822172388667240738869,-0.1981461431993975785825767843562061898410} +#define T_63_4 {0.9214762118704076243602685281075537204742,-0.3884347962746947380274775696307187899947} +#define T_63_5 {0.8782215733702285476169890898745507001877,-0.4782539786213181876028954775392776355147} +#define T_63_8 {0.6982368180860728523740021955745760351419,-0.7158668492597184407344457213184796273708} +#define T_63_10 {0.5425462638657594371949244305142201483250,-0.8400259231507714030584565989556722342968} +#define T_63_11 {0.4562106573531629627460404208250110968947,-0.8898718088114686564082944641995709389448} +#define T_63_13 {0.2708404681430051086366006529715377837420,-0.9626242469500120302328127763757947832346} +#define T_63_16 {-0.0249306917380728750022189643686942872591,-0.9996891820008162454769262694753706455231} +#define T_63_17 {-0.1243437046474851759292334918427513912320,-0.9922392066001720634105254248424898833036} +#define T_63_19 
{-0.3184866502516844333214862672321032732725,-0.9479273461671317013710336141230072826147} +#define T_63_20 {-0.4112871031306115643388920943834818899632,-0.9115058523116731370450338545197155326605} +#define T_63_22 {-0.5837436722347898454543724255927372723818,-0.8119380057158565033859076720545999705791} +#define T_63_23 {-0.6616858375968593941607309716346208006144,-0.7497812029677342060551836766535416245461} +#define T_63_25 {-0.7971325072229225039421862675226293504238,-0.6038044103254773808941990864695981144905} +#define T_63_26 {-0.8532908816321556066952780383871868252754,-0.5214352033794981133851820231939200311899} +#define T_63_29 {-0.9690772862290779610106028485461138188839,-0.2467573976902936450095182863151421770453} +#define T_63_31 {-0.9987569212189223444298136200814042240381,-0.0498458856606971631442171144499297952279} +#define T_63_32 {-0.9987569212189223444298136200814042240381,0.0498458856606971631442171144499297952279} +#define T_63_34 {-0.9690772862290779610106028485461138188839,0.2467573976902936450095182863151421770453} +#define T_63_37 {-0.8532908816321556066952780383871868252754,0.5214352033794981133851820231939200311899} +#define T_63_38 {-0.7971325072229225039421862675226293504238,0.6038044103254773808941990864695981144905} +#define T_63_40 {-0.6616858375968593941607309716346208006144,0.7497812029677342060551836766535416245461} +#define T_63_41 {-0.5837436722347898454543724255927372723818,0.8119380057158565033859076720545999705791} +#define T_63_43 {-0.4112871031306115643388920943834818899632,0.9115058523116731370450338545197155326605} +#define T_63_44 {-0.3184866502516844333214862672321032732725,0.9479273461671317013710336141230072826147} +#define T_63_46 {-0.1243437046474851759292334918427513912320,0.9922392066001720634105254248424898833036} +#define T_63_47 {-0.0249306917380728750022189643686942872591,0.9996891820008162454769262694753706455231} +#define T_63_50 
{0.2708404681430051086366006529715377837420,0.9626242469500120302328127763757947832346} +#define T_63_52 {0.4562106573531629627460404208250110968947,0.8898718088114686564082944641995709389448} +#define T_63_53 {0.5425462638657594371949244305142201483250,0.8400259231507714030584565989556722342968} +#define T_63_55 {0.6982368180860728523740021955745760351419,0.7158668492597184407344457213184796273708} +#define T_63_58 {0.8782215733702285476169890898745507001877,0.4782539786213181876028954775392776355147} +#define T_63_59 {0.9214762118704076243602685281075537204742,0.3884347962746947380274775696307187899947} +#define T_63_61 {0.9801724878485438274822172388667240738869,0.1981461431993975785825767843562061898410} +#define T_63_62 {0.9950307753654014097932645199762191623449,0.0995678465958166614147017980940290726721} +// Pre-computed twiddles for N=64 +#define T_64_1 {0.9951847266721969287317506314138881862164,-0.0980171403295606036287779261328978464007} +#define T_64_3 {0.9569403357322088243819280251045711338520,-0.2902846772544623865641710835916455835104} +#define T_64_5 {0.8819212643483550495560052695509511977434,-0.4713967368259976420397094898362411186099} +#define T_64_7 {0.7730104533627369933768136434082407504320,-0.6343932841636454877942696839454583823681} +#define T_64_9 {0.6343932841636454877942696839454583823681,-0.7730104533627369933768136434082407504320} +#define T_64_11 {0.4713967368259976420397094898362411186099,-0.8819212643483550495560052695509511977434} +#define T_64_13 {0.2902846772544623865641710835916455835104,-0.9569403357322088243819280251045711338520} +#define T_64_15 {0.0980171403295606036287779261328978464007,-0.9951847266721969287317506314138881862164} +#define T_64_17 {-0.0980171403295606036287779261328978464007,-0.9951847266721969287317506314138881862164} +#define T_64_19 {-0.2902846772544623865641710835916455835104,-0.9569403357322088243819280251045711338520} +#define T_64_21 
{-0.4713967368259976420397094898362411186099,-0.8819212643483550495560052695509511977434} +#define T_64_23 {-0.6343932841636454877942696839454583823681,-0.7730104533627369933768136434082407504320} +#define T_64_25 {-0.7730104533627369933768136434082407504320,-0.6343932841636454877942696839454583823681} +#define T_64_27 {-0.8819212643483550495560052695509511977434,-0.4713967368259976420397094898362411186099} +#define T_64_29 {-0.9569403357322088243819280251045711338520,-0.2902846772544623865641710835916455835104} +#define T_64_31 {-0.9951847266721969287317506314138881862164,-0.0980171403295606036287779261328978464007} +#define T_64_33 {-0.9951847266721969287317506314138881862164,0.0980171403295606036287779261328978464007} +#define T_64_35 {-0.9569403357322088243819280251045711338520,0.2902846772544623865641710835916455835104} +#define T_64_37 {-0.8819212643483550495560052695509511977434,0.4713967368259976420397094898362411186099} +#define T_64_39 {-0.7730104533627369933768136434082407504320,0.6343932841636454877942696839454583823681} +#define T_64_41 {-0.6343932841636454877942696839454583823681,0.7730104533627369933768136434082407504320} +#define T_64_43 {-0.4713967368259976420397094898362411186099,0.8819212643483550495560052695509511977434} +#define T_64_45 {-0.2902846772544623865641710835916455835104,0.9569403357322088243819280251045711338520} +#define T_64_47 {-0.0980171403295606036287779261328978464007,0.9951847266721969287317506314138881862164} +#define T_64_49 {0.0980171403295606036287779261328978464007,0.9951847266721969287317506314138881862164} +#define T_64_51 {0.2902846772544623865641710835916455835104,0.9569403357322088243819280251045711338520} +#define T_64_53 {0.4713967368259976420397094898362411186099,0.8819212643483550495560052695509511977434} +#define T_64_55 {0.6343932841636454877942696839454583823681,0.7730104533627369933768136434082407504320} +#define T_64_57 {0.7730104533627369933768136434082407504320,0.6343932841636454877942696839454583823681} 
+#define T_64_59 {0.8819212643483550495560052695509511977434,0.4713967368259976420397094898362411186099} +#define T_64_61 {0.9569403357322088243819280251045711338520,0.2902846772544623865641710835916455835104} +#define T_64_63 {0.9951847266721969287317506314138881862164,0.0980171403295606036287779261328978464007} +// Pre-computed twiddles for N=72 +#define T_72_1 {0.9961946980917455451987052583717741072178,-0.0871557427476581797476384849687747191638} +#define T_72_5 {0.9063077870366499366738821663602720946074,-0.4226182617406994412867504706809995695949} +#define T_72_7 {0.8191520442889917985596071048348676413298,-0.5735764363510460484008035564329475164413} +#define T_72_11 {0.5735764363510460484008035564329475164413,-0.8191520442889917985596071048348676413298} +#define T_72_35 {-0.9961946980917455451987052583717741072178,-0.0871557427476581797476384849687747191638} +// Pre-computed twiddles for N=80 +#define T_80_1 {0.9969173337331279638462433467793744057417,-0.0784590957278449435685629964609688613564} +#define T_80_3 {0.9723699203976765570445195407955907285213,-0.2334453638559054189371977372502442449331} +#define T_80_7 {0.8526401643540921782005170825868844985962,-0.5224985647159489099422557956131640821695} +// Pre-computed twiddles for N=81 +#define T_81_1 {0.9969929411677920239043260153266601264477,-0.0774924206719309477398027752315101679415} +#define T_81_2 {0.9879898494768090122519765827746596187353,-0.1545187928078404770992904104787157848477} +#define T_81_4 {0.9522478853384153119066013459814712405205,-0.3053259976951130938438438988669076934457} +#define T_81_5 {0.9257239692688904453277132233779411762953,-0.3781998581716424778775831327948253601789} +#define T_81_7 {0.8561668995302664786350987924379296600819,-0.5166993711518629295298410397663246840239} +#define T_81_8 {0.8135520702629676081940601761743891984224,-0.5814920712880267128497280282317660748959} +#define T_81_10 {0.7139297345578989872905140146031044423580,-0.7002173477671684853618216948234476149082} 
+#define T_81_11 {0.6575213685690636244984830227622296661139,-0.7534358963276607257242289961141068488359} +#define T_81_13 {0.5332044328016912748680056211014743894339,-0.8459864259198410785245414444943889975548} +#define T_81_14 {0.4660435197025388220914976500353077426553,-0.8847617971766578159886762477981392294168} +#define T_81_16 {0.3237339420583210114479300045786658301950,-0.9461481568757503790578766711405478417873} +#define T_81_17 {0.2494411440579812899720479890675051137805,-0.9683899605278059041779670224059373140335} +#define T_81_19 {0.0968108707031790916008873182363458909094,-0.9953027957931658287193954492977354675531} +#define T_81_20 {0.0193913317718243730092275711740512633696,-0.9998119704485014525019437314767856150866} +#define T_81_22 {-0.1353312997501311121251177382873720489442,-0.9908004033648453168225955778325442224741} +#define T_81_23 {-0.2117038722294107555566000655744574032724,-0.9773338582506355676216003303125035017729} +#define T_81_25 {-0.3601777248047104418304797945893369615078,-0.9328837047320005515516072591708507388830} +#define T_81_26 {-0.4313860656812534277726456366508500650525,-0.9021674247810377300638151609746273607016} +#define T_81_28 {-0.5656068754865386516428316099336370825768,-0.8246750041091067684462245779286604374647} +#define T_81_29 {-0.6278121246720985704214967881853226572275,-0.7783649119241600189411656174343079328537} +#define T_81_31 {-0.7405440131090045285944256647781003266573,-0.6720078605555224182666052001877687871456} +#define T_81_32 {-0.7903926695187593054470198694616556167603,-0.6126005451932028389450124450377188622952} +#define T_81_34 {-0.8755582313020908724610080753336660563946,-0.4831125992966384674609514604526339098811} +#define T_81_35 {-0.9103629409661466720393718787818215787411,-0.4138107245051391713808186523237964138389} +#define T_81_37 {-0.9633708786158803327737132349284365773201,-0.2681726127606373633049940963246626779437} +#define T_81_38 
{-0.9812553106273846914575642585987225174904,-0.1927122605480896810892232906553545035422} +#define T_81_40 {-0.9992479525042300414483520398789551109076,-0.0387753712568167097085591876748367212713} +#define T_81_41 {-0.9992479525042300414483520398789551109076,0.0387753712568167097085591876748367212713} +#define T_81_43 {-0.9812553106273846914575642585987225174904,0.1927122605480896810892232906553545035422} +#define T_81_44 {-0.9633708786158803327737132349284365773201,0.2681726127606373633049940963246626779437} +#define T_81_46 {-0.9103629409661466720393718787818215787411,0.4138107245051391713808186523237964138389} +#define T_81_47 {-0.8755582313020908724610080753336660563946,0.4831125992966384674609514604526339098811} +#define T_81_49 {-0.7903926695187593054470198694616556167603,0.6126005451932028389450124450377188622952} +#define T_81_50 {-0.7405440131090045285944256647781003266573,0.6720078605555224182666052001877687871456} +#define T_81_52 {-0.6278121246720985704214967881853226572275,0.7783649119241600189411656174343079328537} +#define T_81_53 {-0.5656068754865386516428316099336370825768,0.8246750041091067684462245779286604374647} +// Pre-computed twiddles for N=96 +#define T_96_1 {0.9978589232386034790778239766950719058514,-0.0654031292301430616786817040519963484257} +#define T_96_5 {0.9469301294951056879867223869950976222754,-0.3214394653031615867178061307640746235847} +#define T_96_7 {0.8968727415326882557167209597537294030190,-0.4422886902190013014291025683633051812649} +#define T_96_11 {0.7518398074789773843917828344274312257767,-0.6593458151000688438614361075451597571373} +#define T_96_35 {-0.6593458151000688438614361075451597571373,-0.7518398074789773843917828344274312257767} +#define T_96_49 {-0.9978589232386034790778239766950719058514,0.0654031292301430616786817040519963484257} +// Pre-computed twiddles for N=100 +#define T_100_1 {0.9980267284282715589682766221812926232815,-0.0627905195293133738809743249476014170796} +#define T_100_3 
{0.9822872507286887211463977109815459698439,-0.1873813145857246287295794218152877874672} +#define T_100_7 {0.9048270524660195768262838100781664252281,-0.4257792915650726595089281545369885861874} +#define T_100_9 {0.8443279255020150753097141205216757953167,-0.5358267949789966566598309327673632651567} +#define T_100_21 {0.2486898871648547948431939857982797548175,-0.9685831611286310760533524444326758384705} +#define T_100_27 {-0.1253332335643042583228634612169116735458,-0.9921147013144778759041741977853234857321} +#define T_100_33 {-0.4817536741017152679411594817793229594827,-0.8763066800438635839398671123490203171968} +#define T_100_39 {-0.7705132427757892532582673084107227623463,-0.6374239897486897454825793829513713717461} +#define T_100_51 {-0.9980267284282715589682766221812926232815,0.0627905195293133738809743249476014170796} +#define T_100_57 {-0.9048270524660195768262838100781664252281,0.4257792915650726595089281545369885861874} +// Pre-computed twiddles for N=108 +#define T_108_1 {0.9983081582712681756319739179161842912436,-0.0581448289104758292422658882969699334353} +#define T_108_5 {0.9579895123154889002847767187631689012051,-0.2868032327110902612865572791633894667029} +#define T_108_7 {0.9182161068802739967154025180207099765539,-0.3960797660391568442150855844374746084213} +#define T_108_11 {0.8021231927550437346141620764683466404676,-0.5971585917027861789563303318573161959648} +#define T_108_13 {0.7273736415730487347985899759805761277676,-0.6862416378687336004915664489089976996183} +#define T_108_17 {0.5495089780708060089864375186152756214142,-0.8354878114129363764206459563865792006254} +#define T_108_19 {0.4487991802004621666455363993009086698294,-0.8936326403234122750518508837558329105377} +#define T_108_23 {0.2306158707424401654861867427825927734375,-0.9730448705798238062669724968145601451397} +#define T_108_25 {0.1160929141252302343456648259234498254955,-0.9932383577419430231714159162947908043861} +#define T_108_29 
{-0.1160929141252302343456648259234498254955,-0.9932383577419430231714159162947908043861} +#define T_108_31 {-0.2306158707424401654861867427825927734375,-0.9730448705798238062669724968145601451397} +#define T_108_35 {-0.4487991802004621666455363993009086698294,-0.8936326403234122750518508837558329105377} +#define T_108_37 {-0.5495089780708060089864375186152756214142,-0.8354878114129363764206459563865792006254} +#define T_108_41 {-0.7273736415730487347985899759805761277676,-0.6862416378687336004915664489089976996183} +#define T_108_43 {-0.8021231927550437346141620764683466404676,-0.5971585917027861789563303318573161959648} +#define T_108_47 {-0.9182161068802739967154025180207099765539,-0.3960797660391568442150855844374746084213} +#define T_108_49 {-0.9579895123154889002847767187631689012051,-0.2868032327110902612865572791633894667029} +#define T_108_53 {-0.9983081582712681756319739179161842912436,-0.0581448289104758292422658882969699334353} +#define T_108_55 {-0.9983081582712681756319739179161842912436,0.0581448289104758292422658882969699334353} +#define T_108_59 {-0.9579895123154889002847767187631689012051,0.2868032327110902612865572791633894667029} +#define T_108_61 {-0.9182161068802739967154025180207099765539,0.3960797660391568442150855844374746084213} +#define T_108_65 {-0.8021231927550437346141620764683466404676,0.5971585917027861789563303318573161959648} +#define T_108_67 {-0.7273736415730487347985899759805761277676,0.6862416378687336004915664489089976996183} +#define T_108_71 {-0.5495089780708060089864375186152756214142,0.8354878114129363764206459563865792006254} +// Pre-computed twiddles for N=121 +#define T_121_1 {0.9986520883988230234962202302995137870312,-0.0519038181318997385305102909569541225210} +#define T_121_2 {0.9946119873266613398143931590311694890261,-0.1036677127465887443547032376045535784215} +#define T_121_3 {0.9878905881817251488641318246664013713598,-0.1551521375359205257016270707026706077158} +#define T_121_4 
{0.9785060106677815339182302523113321512938,-0.2062182995929881801355776360651361756027} +#define T_121_5 {0.9664835539466369862537931112456135451794,-0.2567285335732630846550250680593308061361} +#define T_121_6 {0.9518556284360696340129948112007696181536,-0.3065466728160248788270791919785551726818} +#define T_121_7 {0.9346616684370731009678934242401737719774,-0.3555384164256048529217935083579504862428} +#define T_121_8 {0.9149480258259531373354889183246996253729,-0.4035716913228565294602390167710836976767} +#define T_121_9 {0.8927678450978636659485232485167216509581,-0.4505170082908268036625543118134373798966} +#define T_121_10 {0.8681809200986438934677380530047230422497,-0.4962478110547915988171041590248933061957} +#define T_121_12 {0.8120582747708500193084546481259167194366,-0.5835763517965671942633321123139467090368} +#define T_121_18 {0.5940688504813661507952815554745029658079,-0.8044141973434756343053209093341138213873} +#define T_121_24 {0.3188772832476185481453967440756969153881,-0.9477960108739739153449477271351497620344} +#define T_121_30 {0.0129814231979309014503032315701602783520,-0.9999157377758168818715489578607957810163} +#define T_121_36 {-0.2941644017754964091082570121216122061014,-0.9557548350534588177396244645933620631695} +#define T_121_42 {-0.5729855062290020173421112303913105279207,-0.8195655005254273151038546529889572411776} +#define T_121_48 {-0.7966345564572361315214266141992993652821,-0.6044612340408462847207715640251990407705} +#define T_121_54 {-0.9435766665119819096574360628437716513872,-0.3311541550609564121820937998563749715686} +#define T_121_60 {-0.9996629653035123963533692403871100395918,-0.0259606587086783635598408181976992636919} +// Pre-computed twiddles for N=125 +#define T_125_1 {0.9987369566060174674859695187478791922331,-0.0502443181797695564982930704900354612619} +#define T_125_2 {0.9949510169813001736471846925269346684217,-0.1003617148512148948835687178871012292802} +#define T_125_3 
{0.9886517447379140621777082742482889443636,-0.1502255891207570637213564168632728978992} +#define T_125_4 {0.9798550523842468606972033740021288394928,-0.1997099805144070261420097267546225339174} +#define T_125_6 {0.9548645447466429514449259841057937592268,-0.2970415815770349188262855477660195901990} +#define T_125_7 {0.9387338576538740708699037895712535828352,-0.3446429231745170573830705507134553045034} +#define T_125_8 {0.9202318473658703812390058374148793518543,-0.3913736668372024274376030916755553334951} +#define T_125_9 {0.8994052515663710778426320757716894149780,-0.4371157666509328798554179229540750384331} +#define T_125_11 {0.8509944817946918371731612751318607479334,-0.5251746299612957136204727248696144670248} +#define T_125_12 {0.8235325976284274540617502680106554180384,-0.5672689491267565164989150616747792810202} +#define T_125_13 {0.7939903986478353825617659822455607354641,-0.6079302976946053815865411706909071654081} +#define T_125_14 {0.7624425110114478876255361683433875441551,-0.6470559615694443378330902305606286972761} +#define T_125_16 {0.6936533058128049322732522341539151966572,-0.7203090248879069346088499514735303819180} +#define T_125_17 {0.6565857557529564125786691874964162707329,-0.7542513807361037603271824991679750382900} +#define T_125_18 {0.6178596130903343253137904866889584809542,-0.7862884321366189199764562545169610530138} +#define T_125_19 {0.5775727034222676303798493790964130312204,-0.8163392507171839396207246863923501223326} +#define T_125_21 {0.4927273415482915641661065819789655506611,-0.8701837546695256886408742502680979669094} +#define T_125_22 {0.4483832160900322327812261846702313050628,-0.8938414241512637747888447847799398005009} +#define T_125_23 {0.4029064357136626361999276468850439414382,-0.9152411726209175313329069467727094888687} +#define T_125_24 {0.3564118787132506960269040519051486626267,-0.9343289424566120215942532922781538218260} +#define T_125_26 
{0.2608415062898969405402738175325794145465,-0.9653816388332738807065425135078839957714} +#define T_125_27 {0.2120071099220546539942944264112156815827,-0.9772681235681934808212645293679088354111} +#define T_125_28 {0.1626371651948836094536687824074761010706,-0.9866859442078680375587396156333852559328} +#define T_125_29 {0.1128563848734816937868785657883563544601,-0.9936113105200083950307998748030513525009} +#define T_125_31 {0.0125660398833526074091615498673490947112,-0.9999210442038161117395134169782977551222} +#define T_125_32 {-0.0376901826699345410265351574707892723382,-0.9992894726405892047438328518182970583439} +#define T_125_33 {-0.0878511965507431796806869783722504507750,-0.9961336091431725048295220403815619647503} +#define T_125_34 {-0.1377902906846380504113369624974438920617,-0.9904614256966511876001391101453918963671} +#define T_125_36 {-0.2364989970237246774686923345143441110849,-0.9716317329146739734113680242444388568401} +#define T_125_37 {-0.2850192624699761090312222222564741969109,-0.9585217890173758359395606021280400454998} +#define T_125_38 {-0.3328195445229866211711566847952781245112,-0.9429905358928645231486598277115263044834} +#define T_125_39 {-0.3797790955218011132998867651622276753187,-0.9250772068344580434384738509834278374910} +#define T_125_41 {-0.4707039321653325725236527432571165263653,-0.8822912264349532796003927614947315305471} +#define T_125_42 {-0.5144395337815064195297054538968950510025,-0.8575266561936522036191377083014231175184} +#define T_125_43 {-0.5568756164881879522354779510351363569498,-0.8305958991958126702570552879478782415390} +#define T_125_44 {-0.5979049830575188240189277166791725903749,-0.8015669848708765199418735392100643366575} +#define T_125_46 {-0.6753328081210244748788795732252765446901,-0.7375131173581739307820726025965996086597} +#define T_125_47 {-0.7115356772092853443467674878775142133236,-0.7026499697988491943689837171405088156462} +#define T_125_48 
{-0.7459411454241821060051620406738948076963,-0.6660118674342516698061444913037121295929} +#define T_125_49 {-0.7784623015670234513763148243015166372061,-0.6276913612907004669239086069865152239799} +#define T_125_51 {-0.8375280400421417636991350263997446745634,-0.5463943467342691207022653543390333652496} +#define T_125_52 {-0.8639234171928352745695178782625589519739,-0.5036232016357607976786425751924980431795} +#define T_125_53 {-0.8881364488135444767991089065617416054010,-0.4595798606214878678777324694237904623151} +#define T_125_54 {-0.9101059706849956576490967563586309552193,-0.4143755809932841427389860200491966679692} +#define T_125_56 {-0.9470983049947442333760250221530441194773,-0.3209436098072094845079504921159241348505} +#define T_125_57 {-0.9620276715860859306772567833831999450922,-0.2729519355173252170843056774174328893423} +#define T_125_58 {-0.9745268727865771252183435535698663443327,-0.2242707609493811982126487691857619211078} +#define T_125_59 {-0.9845643345292053316697433729132171720266,-0.1750230589752760357313832173531409353018} +#define T_125_61 {-0.9971589002606139340301183437986765056849,-0.0753268055279327219952989480589167214930} +#define T_125_62 {-0.9996841892832999398876836494309827685356,-0.0251300954433374794383393435737161780708} +#define T_125_63 {-0.9996841892832999398876836494309827685356,0.0251300954433374794383393435737161780708} +#define T_125_66 {-0.9845643345292053316697433729132171720266,0.1750230589752760357313832173531409353018} +#define T_125_69 {-0.9470983049947442333760250221530441194773,0.3209436098072094845079504921159241348505} +#define T_125_72 {-0.8881364488135444767991089065617416054010,0.4595798606214878678777324694237904623151} +// Pre-computed twiddles for N=128 +#define T_128_1 {0.9987954562051724050064649418345652520657,-0.0490676743274180149345653489945107139647} +#define T_128_3 {0.9891765099647810144389836750633548945189,-0.1467304744553617479319029826001496985555} +#define T_128_5 
{0.9700312531945439742386838588572572916746,-0.2429801799032638986997056917971349321306} +#define T_128_7 {0.9415440651830208063088889502978418022394,-0.3368898533922200511092626129538984969258} +#define T_128_9 {0.9039892931234433381959547659789677709341,-0.4275550934302820849097770405933260917664} +#define T_128_11 {0.8577286100002721180857179206213913857937,-0.5141027441932217723064013625844381749630} +#define T_128_13 {0.8032075314806449428672863177780527621508,-0.5956993044924333569056784654094371944666} +#define T_128_15 {0.7409511253549591058842338497925084084272,-0.6715589548470184411144145997241139411926} +#define T_128_17 {0.6715589548470184411144145997241139411926,-0.7409511253549591058842338497925084084272} +#define T_128_19 {0.5956993044924333569056784654094371944666,-0.8032075314806449428672863177780527621508} +#define T_128_21 {0.5141027441932217723064013625844381749630,-0.8577286100002721180857179206213913857937} +#define T_128_23 {0.4275550934302820849097770405933260917664,-0.9039892931234433381959547659789677709341} +#define T_128_25 {0.3368898533922200511092626129538984969258,-0.9415440651830208063088889502978418022394} +#define T_128_27 {0.2429801799032638986997056917971349321306,-0.9700312531945439742386838588572572916746} +#define T_128_29 {0.1467304744553617479319029826001496985555,-0.9891765099647810144389836750633548945189} +#define T_128_31 {0.0490676743274180149345653489945107139647,-0.9987954562051724050064649418345652520657} +#define T_128_33 {-0.0490676743274180149345653489945107139647,-0.9987954562051724050064649418345652520657} +#define T_128_35 {-0.1467304744553617479319029826001496985555,-0.9891765099647810144389836750633548945189} +#define T_128_37 {-0.2429801799032638986997056917971349321306,-0.9700312531945439742386838588572572916746} +#define T_128_39 {-0.3368898533922200511092626129538984969258,-0.9415440651830208063088889502978418022394} +#define T_128_41 
{-0.4275550934302820849097770405933260917664,-0.9039892931234433381959547659789677709341} +#define T_128_43 {-0.5141027441932217723064013625844381749630,-0.8577286100002721180857179206213913857937} +#define T_128_45 {-0.5956993044924333569056784654094371944666,-0.8032075314806449428672863177780527621508} +#define T_128_47 {-0.6715589548470184411144145997241139411926,-0.7409511253549591058842338497925084084272} +#define T_128_49 {-0.7409511253549591058842338497925084084272,-0.6715589548470184411144145997241139411926} +#define T_128_51 {-0.8032075314806449428672863177780527621508,-0.5956993044924333569056784654094371944666} +#define T_128_53 {-0.8577286100002721180857179206213913857937,-0.5141027441932217723064013625844381749630} +#define T_128_55 {-0.9039892931234433381959547659789677709341,-0.4275550934302820849097770405933260917664} +#define T_128_57 {-0.9415440651830208063088889502978418022394,-0.3368898533922200511092626129538984969258} +#define T_128_59 {-0.9700312531945439742386838588572572916746,-0.2429801799032638986997056917971349321306} +#define T_128_61 {-0.9891765099647810144389836750633548945189,-0.1467304744553617479319029826001496985555} +#define T_128_63 {-0.9987954562051724050064649418345652520657,-0.0490676743274180149345653489945107139647} +#define T_128_65 {-0.9987954562051724050064649418345652520657,0.0490676743274180149345653489945107139647} +#define T_128_67 {-0.9891765099647810144389836750633548945189,0.1467304744553617479319029826001496985555} +#define T_128_69 {-0.9700312531945439742386838588572572916746,0.2429801799032638986997056917971349321306} +#define T_128_71 {-0.9415440651830208063088889502978418022394,0.3368898533922200511092626129538984969258} +#define T_128_73 {-0.9039892931234433381959547659789677709341,0.4275550934302820849097770405933260917664} +#define T_128_75 {-0.8577286100002721180857179206213913857937,0.5141027441932217723064013625844381749630} +#define T_128_77 
{-0.8032075314806449428672863177780527621508,0.5956993044924333569056784654094371944666} +#define T_128_79 {-0.7409511253549591058842338497925084084272,0.6715589548470184411144145997241139411926} +#define T_128_81 {-0.6715589548470184411144145997241139411926,0.7409511253549591058842338497925084084272} +#define T_128_83 {-0.5956993044924333569056784654094371944666,0.8032075314806449428672863177780527621508} +#define T_128_85 {-0.5141027441932217723064013625844381749630,0.8577286100002721180857179206213913857937} +#define T_128_87 {-0.4275550934302820849097770405933260917664,0.9039892931234433381959547659789677709341} +#define T_128_89 {-0.3368898533922200511092626129538984969258,0.9415440651830208063088889502978418022394} +#define T_128_91 {-0.2429801799032638986997056917971349321306,0.9700312531945439742386838588572572916746} +#define T_128_93 {-0.1467304744553617479319029826001496985555,0.9891765099647810144389836750633548945189} +#define T_128_95 {-0.0490676743274180149345653489945107139647,0.9987954562051724050064649418345652520657} +#define T_128_97 {0.0490676743274180149345653489945107139647,0.9987954562051724050064649418345652520657} +#define T_128_99 {0.1467304744553617479319029826001496985555,0.9891765099647810144389836750633548945189} +#define T_128_101 {0.2429801799032638986997056917971349321306,0.9700312531945439742386838588572572916746} +#define T_128_103 {0.3368898533922200511092626129538984969258,0.9415440651830208063088889502978418022394} +#define T_128_105 {0.4275550934302820849097770405933260917664,0.9039892931234433381959547659789677709341} +#define T_128_107 {0.5141027441932217723064013625844381749630,0.8577286100002721180857179206213913857937} +#define T_128_109 {0.5956993044924333569056784654094371944666,0.8032075314806449428672863177780527621508} +#define T_128_111 {0.6715589548470184411144145997241139411926,0.7409511253549591058842338497925084084272} +#define T_128_113 
{0.7409511253549591058842338497925084084272,0.6715589548470184411144145997241139411926} +#define T_128_115 {0.8032075314806449428672863177780527621508,0.5956993044924333569056784654094371944666} +#define T_128_117 {0.8577286100002721180857179206213913857937,0.5141027441932217723064013625844381749630} +#define T_128_119 {0.9039892931234433381959547659789677709341,0.4275550934302820849097770405933260917664} +#define T_128_121 {0.9415440651830208063088889502978418022394,0.3368898533922200511092626129538984969258} +#define T_128_123 {0.9700312531945439742386838588572572916746,0.2429801799032638986997056917971349321306} +#define T_128_125 {0.9891765099647810144389836750633548945189,0.1467304744553617479319029826001496985555} +#define T_128_127 {0.9987954562051724050064649418345652520657,0.0490676743274180149345653489945107139647} +// Pre-computed twiddles for N=144 +#define T_144_1 {0.9990482215818577982346937460533808916807,-0.0436193873653360000841594512621668400243} +#define T_144_5 {0.9762960071199333622615768035757355391979,-0.2164396139381028760784175801745732314885} +#define T_144_7 {0.9537169507482269326104074025352019816637,-0.3007057995042731191048801520082633942366} +#define T_144_11 {0.8870108331782217137018164976325351744890,-0.4617486132350339111951598169980570673943} +#define T_144_13 {0.8433914458128857205565509502775967121124,-0.5372996083468238870395339290553238242865} +#define T_144_17 {0.7372773368101240842875654379895422607660,-0.6755902076156602431566966515674721449614} +#define T_144_19 {0.6755902076156602431566966515674721449614,-0.7372773368101240842875654379895422607660} +#define T_144_23 {0.5372996083468238870395339290553238242865,-0.8433914458128857205565509502775967121124} +#define T_144_35 {0.0436193873653360000841594512621668400243,-0.9990482215818577982346937460533808916807} +#define T_144_49 {-0.5372996083468238870395339290553238242865,-0.8433914458128857205565509502775967121124} +#define T_144_77 
{-0.9762960071199333622615768035757355391979,0.2164396139381028760784175801745732314885} +// Pre-computed twiddles for N=162 +#define T_162_1 {0.9992479525042300414483520398789551109076,-0.0387753712568167097085591876748367212713} +#define T_162_5 {0.9812553106273846914575642585987225174904,-0.1927122605480896810892232906553545035422} +#define T_162_7 {0.9633708786158803327737132349284365773201,-0.2681726127606373633049940963246626779437} +#define T_162_11 {0.9103629409661466720393718787818215787411,-0.4138107245051391713808186523237964138389} +#define T_162_13 {0.8755582313020908724610080753336660563946,-0.4831125992966384674609514604526339098811} +#define T_162_17 {0.7903926695187593054470198694616556167603,-0.6126005451932028389450124450377188622952} +#define T_162_19 {0.7405440131090045285944256647781003266573,-0.6720078605555224182666052001877687871456} +#define T_162_23 {0.6278121246720985704214967881853226572275,-0.7783649119241600189411656174343079328537} +#define T_162_25 {0.5656068754865386516428316099336370825768,-0.8246750041091067684462245779286604374647} +#define T_162_29 {0.4313860656812534277726456366508500650525,-0.9021674247810377300638151609746273607016} +#define T_162_31 {0.3601777248047104418304797945893369615078,-0.9328837047320005515516072591708507388830} +#define T_162_35 {0.2117038722294107555566000655744574032724,-0.9773338582506355676216003303125035017729} +#define T_162_37 {0.1353312997501311121251177382873720489442,-0.9908004033648453168225955778325442224741} +#define T_162_41 {-0.0193913317718243730092275711740512633696,-0.9998119704485014525019437314767856150866} +#define T_162_43 {-0.0968108707031790916008873182363458909094,-0.9953027957931658287193954492977354675531} +#define T_162_47 {-0.2494411440579812899720479890675051137805,-0.9683899605278059041779670224059373140335} +#define T_162_49 {-0.3237339420583210114479300045786658301950,-0.9461481568757503790578766711405478417873} +#define T_162_53 
{-0.4660435197025388220914976500353077426553,-0.8847617971766578159886762477981392294168} +#define T_162_55 {-0.5332044328016912748680056211014743894339,-0.8459864259198410785245414444943889975548} +#define T_162_59 {-0.6575213685690636244984830227622296661139,-0.7534358963276607257242289961141068488359} +#define T_162_61 {-0.7139297345578989872905140146031044423580,-0.7002173477671684853618216948234476149082} +#define T_162_65 {-0.8135520702629676081940601761743891984224,-0.5814920712880267128497280282317660748959} +#define T_162_67 {-0.8561668995302664786350987924379296600819,-0.5166993711518629295298410397663246840239} +#define T_162_71 {-0.9257239692688904453277132233779411762953,-0.3781998581716424778775831327948253601789} +#define T_162_73 {-0.9522478853384153119066013459814712405205,-0.3053259976951130938438438988669076934457} +#define T_162_77 {-0.9879898494768090122519765827746596187353,-0.1545187928078404770992904104787157848477} +#define T_162_79 {-0.9969929411677920239043260153266601264477,-0.0774924206719309477398027752315101679415} +#define T_162_83 {-0.9969929411677920239043260153266601264477,0.0774924206719309477398027752315101679415} +#define T_162_85 {-0.9879898494768090122519765827746596187353,0.1545187928078404770992904104787157848477} +#define T_162_89 {-0.9522478853384153119066013459814712405205,0.3053259976951130938438438988669076934457} +#define T_162_91 {-0.9257239692688904453277132233779411762953,0.3781998581716424778775831327948253601789} +#define T_162_95 {-0.8561668995302664786350987924379296600819,0.5166993711518629295298410397663246840239} +#define T_162_97 {-0.8135520702629676081940601761743891984224,0.5814920712880267128497280282317660748959} +#define T_162_101 {-0.7139297345578989872905140146031044423580,0.7002173477671684853618216948234476149082} +#define T_162_103 {-0.6575213685690636244984830227622296661139,0.7534358963276607257242289961141068488359} +#define T_162_107 
{-0.5332044328016912748680056211014743894339,0.8459864259198410785245414444943889975548} +// Pre-computed twiddles for N=192 +#define T_192_1 {0.9994645874763656845374271142645739018917,-0.0327190828217761436880195446974539663643} +#define T_192_5 {0.9866433320848789767509856574179138988256,-0.1628954733945887356494353070957004092634} +#define T_192_7 {0.9738769792773336320124144549481570720673,-0.2270762630343731991455769048116053454578} +#define T_192_11 {0.9359059267573256457595221036172006279230,-0.3522500479212334889567159734724555164576} +#define T_192_13 {0.9108638249211757909051812021061778068542,-0.4127070298043947205179904358374187722802} +#define T_192_19 {0.8128466845916152427875545072311069816351,-0.5824776968678021527026089643186423927546} +#define T_192_35 {0.4127070298043947205179904358374187722802,-0.9108638249211757909051812021061778068542} +#define T_192_49 {-0.0327190828217761436880195446974539663643,-0.9994645874763656845374271142645739018917} +#define T_192_77 {-0.8128466845916152427875545072311069816351,-0.5824776968678021527026089643186423927546} +#define T_192_91 {-0.9866433320848789767509856574179138988256,-0.1628954733945887356494353070957004092634} +// Pre-computed twiddles for N=200 +#define T_200_1 {0.9995065603657316000152377455378882586956,-0.0314107590781282919323302849079482257366} +#define T_200_3 {0.9955619646030799962943547143368050456047,-0.0941083133185143250187820740393362939358} +#define T_200_7 {0.9759167619387474346126509772147983312607,-0.2181432413965425642921047710842685773969} +#define T_200_9 {0.9602936856769430651326047154725529253483,-0.2789911060392292752041498715698253363371} +#define T_200_11 {0.9408807689542254548697997051931452006102,-0.3387379202452913684595614540739916265011} +#define T_200_13 {0.9177546256839811400496387250314000993967,-0.3971478906347806203669392743904609233141} +#define T_200_17 {0.8607420270039436438480606739176437258720,-0.5090414157503713177277404611231759190559} +#define T_200_19 
{0.8270805742745618305278298976190853863955,-0.5620833778521305834630084063974209129810} +#define T_200_21 {0.7901550123756904131511191735626198351383,-0.6129070536529764856581437015847768634558} +#define T_200_23 {0.7501110696304595926164893171517178416252,-0.6613118653236518262872323248302564024925} +#define T_200_27 {0.6613118653236518262872323248302564024925,-0.7501110696304595926164893171517178416252} +#define T_200_33 {0.5090414157503713177277404611231759190559,-0.8607420270039436438480606739176437258720} +#define T_200_39 {0.3387379202452913684595614540739916265011,-0.9408807689542254548697997051931452006102} +#define T_200_51 {-0.0314107590781282919323302849079482257366,-0.9995065603657316000152377455378882586956} +#define T_200_57 {-0.2181432413965425642921047710842685773969,-0.9759167619387474346126509772147983312607} +#define T_200_63 {-0.3971478906347806203669392743904609233141,-0.9177546256839811400496387250314000993967} +#define T_200_69 {-0.5620833778521305834630084063974209129810,-0.8270805742745618305278298976190853863955} +#define T_200_81 {-0.8270805742745618305278298976190853863955,-0.5620833778521305834630084063974209129810} +#define T_200_87 {-0.9177546256839811400496387250314000993967,-0.3971478906347806203669392743904609233141} +#define T_200_93 {-0.9759167619387474346126509772147983312607,-0.2181432413965425642921047710842685773969} +#define T_200_99 {-0.9995065603657316000152377455378882586956,-0.0314107590781282919323302849079482257366} +#define T_200_111 {-0.9408807689542254548697997051931452006102,0.3387379202452913684595614540739916265011} +#define T_200_117 {-0.8607420270039436438480606739176437258720,0.5090414157503713177277404611231759190559} +// Pre-computed twiddles for N=216 +#define T_216_1 {0.9995769500822005992901608806278090924025,-0.0290847187431114082012406640842527849600} +#define T_216_5 {0.9894416385809444713927973680256400257349,-0.1449318593072467242777179308177437633276} +#define T_216_7 
{0.9793406217655514778286374166782479733229,-0.2022175723320379370218091708011343143880} +#define T_216_11 {0.9492426435730338951657358848024159669876,-0.3145447561516136469883520021539879962802} +#define T_216_13 {0.9293475242268224301156465116946492344141,-0.3692061473126844739489627045259112492204} +#define T_216_17 {0.8802013911801110923605051539198029786348,-0.4746003697476404159516505387728102505207} +#define T_216_19 {0.8511166724369997327315218171861488372087,-0.5249765803345601833385103418549988418818} +#define T_216_23 {0.7844156649195757058379285808769054710865,-0.6202354912682600529549858947575557976961} +#define T_216_25 {0.7470250712409959303883510983723681420088,-0.6647958656139377753291341832664329558611} +#define T_216_29 {0.6647958656139377753291341832664329558611,-0.7470250712409959303883510983723681420088} +#define T_216_31 {0.6202354912682600529549858947575557976961,-0.7844156649195757058379285808769054710865} +#define T_216_35 {0.5249765803345601833385103418549988418818,-0.8511166724369997327315218171861488372087} +#define T_216_49 {0.1449318593072467242777179308177437633276,-0.9894416385809444713927973680256400257349} +#define T_216_77 {-0.6202354912682600529549858947575557976961,-0.7844156649195757058379285808769054710865} +#define T_216_91 {-0.8802013911801110923605051539198029786348,-0.4746003697476404159516505387728102505207} +#define T_216_119 {-0.9492426435730338951657358848024159669876,0.3145447561516136469883520021539879962802} +// Pre-computed twiddles for N=243 +#define T_243_1 {0.9996657333896874497725093533517792820930,-0.0258538485811760469779141402568711782806} +#define T_243_2 {0.9986631570270831925739685175358317792416,-0.0516904130056945673721280343215767061338} +#define T_243_4 {0.9946562024066014640055755080538801848888,-0.1032426220806014760533741991821443662047} +#define T_243_5 {0.9916545029310011027945392925175838172436,-0.1289238023666266774736044453675276599824} +#define T_243_7 
{0.9836646919866011140598516249156091362238,-0.1800104823028513545413176188958459533751} +#define T_243_8 {0.9786819219718442264266400343331042677164,-0.2053818288103820000056742856031632982194} +#define T_243_10 {0.9667573063666620170053533911413978785276,-0.2556957383037036235506889170210342854261} +#define T_243_11 {0.9598234327779119334778101801930461078882,-0.2806046647695387696863633664179360494018} +#define T_243_13 {0.9440357285532999798149944581382442265749,-0.3298432100481077378084648898948216810822} +#define T_243_14 {0.9351924525221896322690895431151147931814,-0.3541399112575875918196288694161921739578} +#define T_243_16 {0.9156366087890058791387559722352307289839,-0.4020069659164738906653724370698910206556} +#define T_243_17 {0.9049371148181253321496342323371209204197,-0.4255453186496674367056414212129311636090} +#define T_243_19 {0.8817307427216080073506532244209665805101,-0.4717530045899035462397819173929747194052} +#define T_243_20 {0.8692393788266478127368941386521328240633,-0.4943914464238468586820829386851983144879} +#define T_243_22 {0.8425220442191495573425186194072011858225,-0.5386618651851877714875627134460955858231} +#define T_243_23 {0.8283139349323630495902648362971376627684,-0.5602642458669525593606408619962166994810} +#define T_243_25 {0.7982463190078928905180077890690881758928,-0.6023311499419146120004597833030857145786} +#define T_243_26 {0.7824069135703197730435931589454412460327,-0.6227675502122490547307620545325335115194} +#define T_243_28 {0.7491698465089360325919187744148075580597,-0.6623779442899478953066250142001081258059} +#define T_243_29 {0.7317944049686120333575445329188369214535,-0.6815254572329891846749205797095783054829} +#define T_243_31 {0.6955877784024419030473040947981644421816,-0.7184411197427074124988166659022681415081} +#define T_243_32 {0.6767807987092619992353093039127998054028,-0.7361845899626352496269987568666692823172} +#define T_243_34 
{0.6378223635507059841032173608255106955767,-0.7701835057663797279303707910003140568733} +#define T_243_35 {0.6176969530934570684621576219797134399414,-0.7864162219455163027603816772170830518007} +#define T_243_37 {0.5762210099555802322868203191319480538368,-0.8172939175631805941435459317290224134922} +#define T_243_38 {0.5548982053207968112573666985554154962301,-0.8319182542364116361710557612241245806217} +#define T_243_40 {0.5111541954058731462851028481964021921158,-0.8594890275733452211781582263938616961241} +#define T_243_41 {0.4887622344495642368045196235470939427614,-0.8724170322591536796963396227511111646891} +#define T_243_43 {0.4430132393803352108818671695189550518990,-0.8965150694404092623912561066390480846167} +#define T_243_44 {0.4196867899904290766244230326265096664429,-0.9076689915974487377070545335300266742706} +#define T_243_46 {0.3722079496062699610625656987394904717803,-0.9281493641919367210135760615230537950993} +#define T_243_47 {0.3480872997940905322344917749433079734445,-0.9374621228199350175813719943107571452856} +#define T_243_49 {0.2991641574276416304378756194637389853597,-0.9542016594570611465186971145158167928457} +#define T_243_50 {0.2743943716193014692450447000737767666578,-0.9616172465298491456309193381457589566708} +#define T_243_52 {0.2243211568052675963791386948287254199386,-0.9745152736666299331247387272014748305082} +#define T_243_53 {0.1990512034071405256252518256587791256607,-0.9799890909710011754896186175756156444550} +#define T_243_55 {0.1481290623512488136004350280927610583603,-0.9889680383545971231740168150281533598900} +#define T_243_56 {0.1225109178364454237675218450931424740702,-0.9924671657092095644614460070442873984575} +#define T_243_58 {0.0710461022867300423522252117436437401921,-0.9974730328935532686074338926118798553944} +#define T_243_59 {0.0452338371907063732657583443597104633227,-0.9989764261347735541107795143034309148788} +#define T_243_61 
{-0.0064641373965392589021061731102690828266,-0.9999791072456057916539862162608187645674} +#define T_243_62 {-0.0323152850743106193065479203596623847261,-0.9994777247895852978487596374179702252150} +#define T_243_64 {-0.0839355009969068172059891708158829715103,-0.9964711895847256917235768014506902545691} +#define T_243_65 {-0.1096700594125315647575646949007932562381,-0.9939680468045498651363800490798894315958} +#define T_243_67 {-0.1609020666180572822767658180964644998312,-0.9869703769404826187994217434606980532408} +#define T_243_68 {-0.1863652651091820544593247177544981241226,-0.9824805280313622102283943604561500251293} +#define T_243_70 {-0.2369009482781190834010942580789560452104,-0.9715338083180264927563030141755007207394} +#define T_243_71 {-0.2619396481729058812071286865830188617110,-0.9650842557595965542915905643894802778959} +#define T_243_73 {-0.3114750797604245446947857089980971068144,-0.9502543210573878296187899650249164551497} +#define T_243_74 {-0.3359386953715421397781426549045136198401,-0.9418838532176173616861092341423500329256} +#define T_243_76 {-0.3841759634635175113537286506470991298556,-0.9232598924987903687977564004540909081697} +#define T_243_77 {-0.4079173677281835996843994962546275928617,-0.9130188503561741386604921899561304599047} +#define T_243_79 {-0.4545663677185005635550396618782542645931,-0.8907128703118692980567061567853670567274} +#define T_243_80 {-0.4774427770379489621710433766565984115005,-0.8786628446988590246391481741738971322775} +#define T_243_82 {-0.5222229563517384320903147454373538494110,-0.8528089961176830158251505054067820310593} +#define T_243_83 {-0.5440967893085826290189288556575775146484,-0.8390224573061747515012598341854754835367} +#define T_243_85 {-0.5867388346784178088100247805414255708456,-0.8097762282755726070959667595161590725183} +#define T_243_86 {-0.6074785394674836469519618731283117085695,-0.7943360901321637745908788019733037799597} +#define T_243_88 
{-0.6477259966150589187705577387532684952021,-0.7618733709147660349003672308754175901413} +#define T_243_89 {-0.6672068422114196817318543253350071609020,-0.7448724922470058018575400637928396463394} +#define T_243_91 {-0.7048176581937560802870734733005519956350,-0.7093885174558928197185991848527919501066} +#define T_243_92 {-0.7229224844997927945655646908562630414963,-0.6909291435484877785810908790153916925192} +#define T_243_94 {-0.7576704634441179253556697403837461024523,-0.6526373179832544169087782393035013228655} +#define T_243_95 {-0.7742903859041323499923237250186502933502,-0.6328304656828950269797928740445058792830} +#define T_243_97 {-0.8059665493764742727478278538910672068596,-0.5919610808880759744710076120099984109402} +#define T_243_98 {-0.8210016138212185010658572537067811936140,-0.5709258709350581995067841489799320697784} +#define T_243_100 {-0.8494154576472973117162723610817920416594,-0.5277247201996817516089777200249955058098} +#define T_243_101 {-0.8627752414301085304160210398549679666758,-0.5055876608217588774607520463177934288979} +#define T_243_103 {-0.8877558814098559514249586754885967820883,-0.4603145609495857271120655695995083078742} +#define T_243_104 {-0.8993600372190930780647022402263246476650,-0.4372087870266007092467930306156631559134} +#define T_243_106 {-0.9207572368443383004077418263477738946676,-0.3901360157672949524609862237412016838789} +#define T_243_107 {-0.9305359759215685588884525714092887938023,-0.3662004881423480129321035292377928271890} +#define T_243_109 {-0.9482210499160764749504437531868461519480,-0.3176111466810532801829936033755075186491} +#define T_243_110 {-0.9561155617738796852478344590053893625736,-0.2929898164336406929386669162340695038438} +#define T_243_112 {-0.9699821500217434833146512573875952512026,-0.2431761267871417053765270566145773045719} +#define T_243_113 {-0.9759449561369035741620336921187117695808,-0.2180170694944258691361227420202339999378} +#define T_243_115 
{-0.9859096633447964608265579045109916478395,-0.1672786170535553174065768189393565990031} +#define T_243_116 {-0.9899049026997269740135720894613768905401,-0.1417331422464204460087699999348842538893} +#define T_243_118 {-0.9959077999460093089112433517584577202797,-0.0903750740342681158079329861720907501876} +#define T_243_119 {-0.9979114447011320221392338680743705481291,-0.0645968152039976400979881532293802592903} +#define T_243_121 {-0.9999164298554372720673200092278420925140,-0.0129280046858085244376734834759190562181} +#define T_243_122 {-0.9999164298554372720673200092278420925140,0.0129280046858085244376734834759190562181} +#define T_243_124 {-0.9979114447011320221392338680743705481291,0.0645968152039976400979881532293802592903} +#define T_243_125 {-0.9959077999460093089112433517584577202797,0.0903750740342681158079329861720907501876} +#define T_243_128 {-0.9859096633447964608265579045109916478395,0.1672786170535553174065768189393565990031} +#define T_243_130 {-0.9759449561369035741620336921187117695808,0.2180170694944258691361227420202339999378} +#define T_243_134 {-0.9482210499160764749504437531868461519480,0.3176111466810532801829936033755075186491} +#define T_243_136 {-0.9305359759215685588884525714092887938023,0.3662004881423480129321035292377928271890} +#define T_243_140 {-0.8877558814098559514249586754885967820883,0.4603145609495857271120655695995083078742} +#define T_243_142 {-0.8627752414301085304160210398549679666758,0.5055876608217588774607520463177934288979} +#define T_243_146 {-0.8059665493764742727478278538910672068596,0.5919610808880759744710076120099984109402} +#define T_243_148 {-0.7742903859041323499923237250186502933502,0.6328304656828950269797928740445058792830} +#define T_243_152 {-0.7048176581937560802870734733005519956350,0.7093885174558928197185991848527919501066} +#define T_243_154 {-0.6672068422114196817318543253350071609020,0.7448724922470058018575400637928396463394} +#define T_243_158 
{-0.5867388346784178088100247805414255708456,0.8097762282755726070959667595161590725183} +#define T_243_160 {-0.5440967893085826290189288556575775146484,0.8390224573061747515012598341854754835367} +// Pre-computed twiddles for N=250 +#define T_250_1 {0.9996841892832999398876836494309827685356,-0.0251300954433374794383393435737161780708} +#define T_250_3 {0.9971589002606139340301183437986765056849,-0.0753268055279327219952989480589167214930} +#define T_250_7 {0.9845643345292053316697433729132171720266,-0.1750230589752760357313832173531409353018} +#define T_250_9 {0.9745268727865771252183435535698663443327,-0.2242707609493811982126487691857619211078} +#define T_250_11 {0.9620276715860859306772567833831999450922,-0.2729519355173252170843056774174328893423} +#define T_250_13 {0.9470983049947442333760250221530441194773,-0.3209436098072094845079504921159241348505} +#define T_250_17 {0.9101059706849956576490967563586309552193,-0.4143755809932841427389860200491966679692} +#define T_250_19 {0.8881364488135444767991089065617416054010,-0.4595798606214878678777324694237904623151} +#define T_250_21 {0.8639234171928352745695178782625589519739,-0.5036232016357607976786425751924980431795} +#define T_250_23 {0.8375280400421417636991350263997446745634,-0.5463943467342691207022653543390333652496} +#define T_250_27 {0.7784623015670234513763148243015166372061,-0.6276913612907004669239086069865152239799} +#define T_250_33 {0.6753328081210244748788795732252765446901,-0.7375131173581739307820726025965996086597} +#define T_250_39 {0.5568756164881879522354779510351363569498,-0.8305958991958126702570552879478782415390} +#define T_250_51 {0.2850192624699761090312222222564741969109,-0.9585217890173758359395606021280400454998} +#define T_250_57 {0.1377902906846380504113369624974438920617,-0.9904614256966511876001391101453918963671} +#define T_250_63 {-0.0125660398833526074091615498673490947112,-0.9999210442038161117395134169782977551222} +#define T_250_69 
{-0.1626371651948836094536687824074761010706,-0.9866859442078680375587396156333852559328} +#define T_250_81 {-0.4483832160900322327812261846702313050628,-0.8938414241512637747888447847799398005009} +#define T_250_87 {-0.5775727034222676303798493790964130312204,-0.8163392507171839396207246863923501223326} +#define T_250_93 {-0.6936533058128049322732522341539151966572,-0.7203090248879069346088499514735303819180} +#define T_250_99 {-0.7939903986478353825617659822455607354641,-0.6079302976946053815865411706909071654081} +#define T_250_111 {-0.9387338576538740708699037895712535828352,-0.3446429231745170573830705507134553045034} +#define T_250_117 {-0.9798550523842468606972033740021288394928,-0.1997099805144070261420097267546225339174} +#define T_250_123 {-0.9987369566060174674859695187478791922331,-0.0502443181797695564982930704900354612619} +#define T_250_129 {-0.9949510169813001736471846925269346684217,0.1003617148512148948835687178871012292802} +#define T_250_141 {-0.9202318473658703812390058374148793518543,0.3913736668372024274376030916755553334951} +#define T_250_147 {-0.8509944817946918371731612751318607479334,0.5251746299612957136204727248696144670248} +// Pre-computed twiddles for N=256 +#define T_256_1 {0.9996988186962042499672520534659270197153,-0.0245412285229122881236030195850617019460} +#define T_256_3 {0.9972904566786902069708276030723936855793,-0.0735645635996674263079597722025937400758} +#define T_256_5 {0.9924795345987099670637121562322136014700,-0.1224106751992161956632543251544120721519} +#define T_256_7 {0.9852776423889412216183814052783418446779,-0.1709618887603012171716443390323547646403} +#define T_256_9 {0.9757021300385285700329518476792145520449,-0.2191012401568697975928756704888655804098} +#define T_256_11 {0.9637760657954398402225137942878063768148,-0.2667127574748983653840639362897491082549} +#define T_256_13 {0.9495281805930366747503512669936753809452,-0.3136817403988914620960315460251877084374} +#define T_256_15 
{0.9329927988347388456702447001589462161064,-0.3598950365349881663767916961660375818610} +#define T_256_17 {0.9142097557035306909511973572080023586750,-0.4052413140049898609973411112150643020868} +#define T_256_19 {0.8932243011955153244585403626842889934778,-0.4496113296546065951631021562207024544477} +#define T_256_21 {0.8700869911087114605408032730338163673878,-0.4928981922297840378988098564150277525187} +#define T_256_23 {0.8448535652497071168909315019845962524414,-0.5349976198870972643462096129951532930136} +#define T_256_25 {0.8175848131515837113880706965574063360691,-0.5758081914178453386554679127584677189589} +#define T_256_27 {0.7883464276266062276121715512999799102545,-0.6152315905806268192534957961470354348421} +#define T_256_29 {0.7572088465064845674845628309412859380245,-0.6531728429537767555146388076536823064089} +#define T_256_31 {0.7242470829514668917425979088875465095043,-0.6895405447370669405060539247642736881971} +#define T_256_33 {0.6895405447370669405060539247642736881971,-0.7242470829514668917425979088875465095043} +#define T_256_35 {0.6531728429537767555146388076536823064089,-0.7572088465064845674845628309412859380245} +#define T_256_37 {0.6152315905806268192534957961470354348421,-0.7883464276266062276121715512999799102545} +#define T_256_39 {0.5758081914178453386554679127584677189589,-0.8175848131515837113880706965574063360691} +#define T_256_41 {0.5349976198870972643462096129951532930136,-0.8448535652497071168909315019845962524414} +#define T_256_43 {0.4928981922297840378988098564150277525187,-0.8700869911087114605408032730338163673878} +#define T_256_45 {0.4496113296546065951631021562207024544477,-0.8932243011955153244585403626842889934778} +#define T_256_47 {0.4052413140049898609973411112150643020868,-0.9142097557035306909511973572080023586750} +#define T_256_49 {0.3598950365349881663767916961660375818610,-0.9329927988347388456702447001589462161064} +#define T_256_51 
{0.3136817403988914620960315460251877084374,-0.9495281805930366747503512669936753809452} +#define T_256_53 {0.2667127574748983653840639362897491082549,-0.9637760657954398402225137942878063768148} +#define T_256_55 {0.2191012401568697975928756704888655804098,-0.9757021300385285700329518476792145520449} +#define T_256_57 {0.1709618887603012171716443390323547646403,-0.9852776423889412216183814052783418446779} +#define T_256_59 {0.1224106751992161956632543251544120721519,-0.9924795345987099670637121562322136014700} +#define T_256_61 {0.0735645635996674263079597722025937400758,-0.9972904566786902069708276030723936855793} +#define T_256_63 {0.0245412285229122881236030195850617019460,-0.9996988186962042499672520534659270197153} +#define T_256_65 {-0.0245412285229122881236030195850617019460,-0.9996988186962042499672520534659270197153} +#define T_256_67 {-0.0735645635996674263079597722025937400758,-0.9972904566786902069708276030723936855793} +#define T_256_69 {-0.1224106751992161956632543251544120721519,-0.9924795345987099670637121562322136014700} +#define T_256_71 {-0.1709618887603012171716443390323547646403,-0.9852776423889412216183814052783418446779} +#define T_256_73 {-0.2191012401568697975928756704888655804098,-0.9757021300385285700329518476792145520449} +#define T_256_75 {-0.2667127574748983653840639362897491082549,-0.9637760657954398402225137942878063768148} +#define T_256_77 {-0.3136817403988914620960315460251877084374,-0.9495281805930366747503512669936753809452} +#define T_256_79 {-0.3598950365349881663767916961660375818610,-0.9329927988347388456702447001589462161064} +#define T_256_81 {-0.4052413140049898609973411112150643020868,-0.9142097557035306909511973572080023586750} +#define T_256_83 {-0.4496113296546065951631021562207024544477,-0.8932243011955153244585403626842889934778} +#define T_256_85 {-0.4928981922297840378988098564150277525187,-0.8700869911087114605408032730338163673878} +#define T_256_87 
{-0.5349976198870972643462096129951532930136,-0.8448535652497071168909315019845962524414} +#define T_256_89 {-0.5758081914178453386554679127584677189589,-0.8175848131515837113880706965574063360691} +#define T_256_91 {-0.6152315905806268192534957961470354348421,-0.7883464276266062276121715512999799102545} +#define T_256_93 {-0.6531728429537767555146388076536823064089,-0.7572088465064845674845628309412859380245} +#define T_256_95 {-0.6895405447370669405060539247642736881971,-0.7242470829514668917425979088875465095043} +#define T_256_97 {-0.7242470829514668917425979088875465095043,-0.6895405447370669405060539247642736881971} +#define T_256_99 {-0.7572088465064845674845628309412859380245,-0.6531728429537767555146388076536823064089} +#define T_256_101 {-0.7883464276266062276121715512999799102545,-0.6152315905806268192534957961470354348421} +#define T_256_103 {-0.8175848131515837113880706965574063360691,-0.5758081914178453386554679127584677189589} +#define T_256_105 {-0.8448535652497071168909315019845962524414,-0.5349976198870972643462096129951532930136} +#define T_256_107 {-0.8700869911087114605408032730338163673878,-0.4928981922297840378988098564150277525187} +#define T_256_109 {-0.8932243011955153244585403626842889934778,-0.4496113296546065951631021562207024544477} +#define T_256_111 {-0.9142097557035306909511973572080023586750,-0.4052413140049898609973411112150643020868} +#define T_256_113 {-0.9329927988347388456702447001589462161064,-0.3598950365349881663767916961660375818610} +#define T_256_115 {-0.9495281805930366747503512669936753809452,-0.3136817403988914620960315460251877084374} +#define T_256_117 {-0.9637760657954398402225137942878063768148,-0.2667127574748983653840639362897491082549} +#define T_256_119 {-0.9757021300385285700329518476792145520449,-0.2191012401568697975928756704888655804098} +#define T_256_121 {-0.9852776423889412216183814052783418446779,-0.1709618887603012171716443390323547646403} +#define T_256_123 
{-0.9924795345987099670637121562322136014700,-0.1224106751992161956632543251544120721519} +#define T_256_125 {-0.9972904566786902069708276030723936855793,-0.0735645635996674263079597722025937400758} +#define T_256_127 {-0.9996988186962042499672520534659270197153,-0.0245412285229122881236030195850617019460} +#define T_256_129 {-0.9996988186962042499672520534659270197153,0.0245412285229122881236030195850617019460} +#define T_256_131 {-0.9972904566786902069708276030723936855793,0.0735645635996674263079597722025937400758} +#define T_256_133 {-0.9924795345987099670637121562322136014700,0.1224106751992161956632543251544120721519} +#define T_256_135 {-0.9852776423889412216183814052783418446779,0.1709618887603012171716443390323547646403} +#define T_256_137 {-0.9757021300385285700329518476792145520449,0.2191012401568697975928756704888655804098} +#define T_256_139 {-0.9637760657954398402225137942878063768148,0.2667127574748983653840639362897491082549} +#define T_256_141 {-0.9495281805930366747503512669936753809452,0.3136817403988914620960315460251877084374} +#define T_256_143 {-0.9329927988347388456702447001589462161064,0.3598950365349881663767916961660375818610} +#define T_256_145 {-0.9142097557035306909511973572080023586750,0.4052413140049898609973411112150643020868} +#define T_256_147 {-0.8932243011955153244585403626842889934778,0.4496113296546065951631021562207024544477} +#define T_256_149 {-0.8700869911087114605408032730338163673878,0.4928981922297840378988098564150277525187} +#define T_256_151 {-0.8448535652497071168909315019845962524414,0.5349976198870972643462096129951532930136} +#define T_256_153 {-0.8175848131515837113880706965574063360691,0.5758081914178453386554679127584677189589} +#define T_256_155 {-0.7883464276266062276121715512999799102545,0.6152315905806268192534957961470354348421} +#define T_256_157 {-0.7572088465064845674845628309412859380245,0.6531728429537767555146388076536823064089} +#define T_256_159 
{-0.7242470829514668917425979088875465095043,0.6895405447370669405060539247642736881971} +#define T_256_161 {-0.6895405447370669405060539247642736881971,0.7242470829514668917425979088875465095043} +#define T_256_163 {-0.6531728429537767555146388076536823064089,0.7572088465064845674845628309412859380245} +#define T_256_165 {-0.6152315905806268192534957961470354348421,0.7883464276266062276121715512999799102545} +#define T_256_167 {-0.5758081914178453386554679127584677189589,0.8175848131515837113880706965574063360691} +#define T_256_169 {-0.5349976198870972643462096129951532930136,0.8448535652497071168909315019845962524414} +#define T_256_171 {-0.4928981922297840378988098564150277525187,0.8700869911087114605408032730338163673878} +#define T_256_173 {-0.4496113296546065951631021562207024544477,0.8932243011955153244585403626842889934778} +#define T_256_175 {-0.4052413140049898609973411112150643020868,0.9142097557035306909511973572080023586750} +#define T_256_177 {-0.3598950365349881663767916961660375818610,0.9329927988347388456702447001589462161064} +#define T_256_179 {-0.3136817403988914620960315460251877084374,0.9495281805930366747503512669936753809452} +#define T_256_181 {-0.2667127574748983653840639362897491082549,0.9637760657954398402225137942878063768148} +#define T_256_183 {-0.2191012401568697975928756704888655804098,0.9757021300385285700329518476792145520449} +#define T_256_185 {-0.1709618887603012171716443390323547646403,0.9852776423889412216183814052783418446779} +#define T_256_187 {-0.1224106751992161956632543251544120721519,0.9924795345987099670637121562322136014700} +#define T_256_189 {-0.0735645635996674263079597722025937400758,0.9972904566786902069708276030723936855793} +#define T_256_191 {-0.0245412285229122881236030195850617019460,0.9996988186962042499672520534659270197153} +#define T_256_193 {0.0245412285229122881236030195850617019460,0.9996988186962042499672520534659270197153} +#define T_256_195 
{0.0735645635996674263079597722025937400758,0.9972904566786902069708276030723936855793} +#define T_256_197 {0.1224106751992161956632543251544120721519,0.9924795345987099670637121562322136014700} +#define T_256_199 {0.1709618887603012171716443390323547646403,0.9852776423889412216183814052783418446779} +#define T_256_201 {0.2191012401568697975928756704888655804098,0.9757021300385285700329518476792145520449} +#define T_256_203 {0.2667127574748983653840639362897491082549,0.9637760657954398402225137942878063768148} +#define T_256_205 {0.3136817403988914620960315460251877084374,0.9495281805930366747503512669936753809452} +#define T_256_207 {0.3598950365349881663767916961660375818610,0.9329927988347388456702447001589462161064} +#define T_256_209 {0.4052413140049898609973411112150643020868,0.9142097557035306909511973572080023586750} +#define T_256_211 {0.4496113296546065951631021562207024544477,0.8932243011955153244585403626842889934778} +#define T_256_213 {0.4928981922297840378988098564150277525187,0.8700869911087114605408032730338163673878} +#define T_256_215 {0.5349976198870972643462096129951532930136,0.8448535652497071168909315019845962524414} +#define T_256_217 {0.5758081914178453386554679127584677189589,0.8175848131515837113880706965574063360691} +#define T_256_219 {0.6152315905806268192534957961470354348421,0.7883464276266062276121715512999799102545} +#define T_256_221 {0.6531728429537767555146388076536823064089,0.7572088465064845674845628309412859380245} +#define T_256_223 {0.6895405447370669405060539247642736881971,0.7242470829514668917425979088875465095043} +#define T_256_225 {0.7242470829514668917425979088875465095043,0.6895405447370669405060539247642736881971} +#define T_256_227 {0.7572088465064845674845628309412859380245,0.6531728429537767555146388076536823064089} +#define T_256_229 {0.7883464276266062276121715512999799102545,0.6152315905806268192534957961470354348421} +#define T_256_231 
{0.8175848131515837113880706965574063360691,0.5758081914178453386554679127584677189589} +#define T_256_233 {0.8448535652497071168909315019845962524414,0.5349976198870972643462096129951532930136} +#define T_256_235 {0.8700869911087114605408032730338163673878,0.4928981922297840378988098564150277525187} +#define T_256_237 {0.8932243011955153244585403626842889934778,0.4496113296546065951631021562207024544477} +#define T_256_239 {0.9142097557035306909511973572080023586750,0.4052413140049898609973411112150643020868} +#define T_256_241 {0.9329927988347388456702447001589462161064,0.3598950365349881663767916961660375818610} +#define T_256_243 {0.9495281805930366747503512669936753809452,0.3136817403988914620960315460251877084374} +#define T_256_245 {0.9637760657954398402225137942878063768148,0.2667127574748983653840639362897491082549} +#define T_256_247 {0.9757021300385285700329518476792145520449,0.2191012401568697975928756704888655804098} +#define T_256_249 {0.9852776423889412216183814052783418446779,0.1709618887603012171716443390323547646403} +#define T_256_251 {0.9924795345987099670637121562322136014700,0.1224106751992161956632543251544120721519} +#define T_256_253 {0.9972904566786902069708276030723936855793,0.0735645635996674263079597722025937400758} +#define T_256_255 {0.9996988186962042499672520534659270197153,0.0245412285229122881236030195850617019460} +// Pre-computed twiddles for N=288 +#define T_288_1 {0.9997620270799091324320784224255476146936,-0.0218148850345611204637474855871914769523} +#define T_288_5 {0.9940563382223196464693160123715642839670,-0.1088668748519645707029113168573530856520} +#define T_288_7 {0.9883615104677606622018970483622979372740,-0.1521233861899166928122184572202968411148} +#define T_288_11 {0.9713420698132614328201839271059725433588,-0.2376858923261730982545714141451753675938} +#define T_288_13 {0.9600498543859287137181013349618297070265,-0.2798290140309920293049117390182800590992} +#define T_288_17 
{0.9320078692827985511470956225821282714605,-0.3624380382837016356667447780637303367257} +#define T_288_19 {0.9153114791194471022706125040713232010603,-0.4027466898587372390316829751100158318877} +#define T_288_23 {0.8767267557075076878092545484832953661680,-0.4809887689193876325610688127198955044150} +#define T_288_25 {0.8549118706729466055094235343858599662781,-0.5187732581605214443598583784478250890970} +#define T_288_35 {0.7223639620597556154635299208166543394327,-0.6915130557822694035152721880876924842596} +#define T_288_49 {0.4809887689193876325610688127198955044150,-0.8767267557075076878092545484832953661680} +#define T_288_77 {-0.1088668748519645707029113168573530856520,-0.9940563382223196464693160123715642839670} +#define T_288_91 {-0.4027466898587372390316829751100158318877,-0.9153114791194471022706125040713232010603} +#define T_288_119 {-0.8549118706729466055094235343858599662781,-0.5187732581605214443598583784478250890970} +#define T_288_133 {-0.9713420698132614328201839271059725433588,-0.2376858923261730982545714141451753675938} +#define T_288_161 {-0.9320078692827985511470956225821282714605,0.3624380382837016356667447780637303367257} +// Pre-computed twiddles for N=324 +#define T_324_1 {0.9998119704485014525019437314767856150866,-0.0193913317718243730092275711740512633696} +#define T_324_5 {0.9953027957931658287193954492977354675531,-0.0968108707031790916008873182363458909094} +#define T_324_7 {0.9908004033648453168225955778325442224741,-0.1353312997501311121251177382873720489442} +#define T_324_11 {0.9773338582506355676216003303125035017729,-0.2117038722294107555566000655744574032724} +#define T_324_13 {0.9683899605278059041779670224059373140335,-0.2494411440579812899720479890675051137805} +#define T_324_17 {0.9461481568757503790578766711405478417873,-0.3237339420583210114479300045786658301950} +#define T_324_19 {0.9328837047320005515516072591708507388830,-0.3601777248047104418304797945893369615078} +#define T_324_23 
{0.9021674247810377300638151609746273607016,-0.4313860656812534277726456366508500650525} +#define T_324_25 {0.8847617971766578159886762477981392294168,-0.4660435197025388220914976500353077426553} +#define T_324_29 {0.8459864259198410785245414444943889975548,-0.5332044328016912748680056211014743894339} +#define T_324_31 {0.8246750041091067684462245779286604374647,-0.5656068754865386516428316099336370825768} +#define T_324_35 {0.7783649119241600189411656174343079328537,-0.6278121246720985704214967881853226572275} +#define T_324_37 {0.7534358963276607257242289961141068488359,-0.6575213685690636244984830227622296661139} +#define T_324_41 {0.7002173477671684853618216948234476149082,-0.7139297345578989872905140146031044423580} +#define T_324_43 {0.6720078605555224182666052001877687871456,-0.7405440131090045285944256647781003266573} +#define T_324_47 {0.6126005451932028389450124450377188622952,-0.7903926695187593054470198694616556167603} +#define T_324_49 {0.5814920712880267128497280282317660748959,-0.8135520702629676081940601761743891984224} +#define T_324_53 {0.5166993711518629295298410397663246840239,-0.8561668995302664786350987924379296600819} +#define T_324_55 {0.4831125992966384674609514604526339098811,-0.8755582313020908724610080753336660563946} +#define T_324_59 {0.4138107245051391713808186523237964138389,-0.9103629409661466720393718787818215787411} +#define T_324_61 {0.3781998581716424778775831327948253601789,-0.9257239692688904453277132233779411762953} +#define T_324_65 {0.3053259976951130938438438988669076934457,-0.9522478853384153119066013459814712405205} +#define T_324_67 {0.2681726127606373633049940963246626779437,-0.9633708786158803327737132349284365773201} +#define T_324_71 {0.1927122605480896810892232906553545035422,-0.9812553106273846914575642585987225174904} +#define T_324_73 {0.1545187928078404770992904104787157848477,-0.9879898494768090122519765827746596187353} +#define T_324_77 
{0.0774924206719309477398027752315101679415,-0.9969929411677920239043260153266601264477} +#define T_324_79 {0.0387753712568167097085591876748367212713,-0.9992479525042300414483520398789551109076} +#define T_324_83 {-0.0387753712568167097085591876748367212713,-0.9992479525042300414483520398789551109076} +#define T_324_85 {-0.0774924206719309477398027752315101679415,-0.9969929411677920239043260153266601264477} +#define T_324_89 {-0.1545187928078404770992904104787157848477,-0.9879898494768090122519765827746596187353} +#define T_324_91 {-0.1927122605480896810892232906553545035422,-0.9812553106273846914575642585987225174904} +#define T_324_95 {-0.2681726127606373633049940963246626779437,-0.9633708786158803327737132349284365773201} +#define T_324_97 {-0.3053259976951130938438438988669076934457,-0.9522478853384153119066013459814712405205} +#define T_324_101 {-0.3781998581716424778775831327948253601789,-0.9257239692688904453277132233779411762953} +#define T_324_103 {-0.4138107245051391713808186523237964138389,-0.9103629409661466720393718787818215787411} +#define T_324_107 {-0.4831125992966384674609514604526339098811,-0.8755582313020908724610080753336660563946} +#define T_324_109 {-0.5166993711518629295298410397663246840239,-0.8561668995302664786350987924379296600819} +#define T_324_113 {-0.5814920712880267128497280282317660748959,-0.8135520702629676081940601761743891984224} +#define T_324_115 {-0.6126005451932028389450124450377188622952,-0.7903926695187593054470198694616556167603} +#define T_324_119 {-0.6720078605555224182666052001877687871456,-0.7405440131090045285944256647781003266573} +#define T_324_121 {-0.7002173477671684853618216948234476149082,-0.7139297345578989872905140146031044423580} +#define T_324_125 {-0.7534358963276607257242289961141068488359,-0.6575213685690636244984830227622296661139} +#define T_324_127 {-0.7783649119241600189411656174343079328537,-0.6278121246720985704214967881853226572275} +#define T_324_131 
{-0.8246750041091067684462245779286604374647,-0.5656068754865386516428316099336370825768} +#define T_324_133 {-0.8459864259198410785245414444943889975548,-0.5332044328016912748680056211014743894339} +#define T_324_137 {-0.8847617971766578159886762477981392294168,-0.4660435197025388220914976500353077426553} +#define T_324_139 {-0.9021674247810377300638151609746273607016,-0.4313860656812534277726456366508500650525} +#define T_324_143 {-0.9328837047320005515516072591708507388830,-0.3601777248047104418304797945893369615078} +#define T_324_145 {-0.9461481568757503790578766711405478417873,-0.3237339420583210114479300045786658301950} +#define T_324_149 {-0.9683899605278059041779670224059373140335,-0.2494411440579812899720479890675051137805} +#define T_324_151 {-0.9773338582506355676216003303125035017729,-0.2117038722294107555566000655744574032724} +#define T_324_155 {-0.9908004033648453168225955778325442224741,-0.1353312997501311121251177382873720489442} +#define T_324_157 {-0.9953027957931658287193954492977354675531,-0.0968108707031790916008873182363458909094} +#define T_324_161 {-0.9998119704485014525019437314767856150866,-0.0193913317718243730092275711740512633696} +#define T_324_163 {-0.9998119704485014525019437314767856150866,0.0193913317718243730092275711740512633696} +#define T_324_167 {-0.9953027957931658287193954492977354675531,0.0968108707031790916008873182363458909094} +#define T_324_169 {-0.9908004033648453168225955778325442224741,0.1353312997501311121251177382873720489442} +#define T_324_173 {-0.9773338582506355676216003303125035017729,0.2117038722294107555566000655744574032724} +#define T_324_175 {-0.9683899605278059041779670224059373140335,0.2494411440579812899720479890675051137805} +#define T_324_179 {-0.9461481568757503790578766711405478417873,0.3237339420583210114479300045786658301950} +#define T_324_181 {-0.9328837047320005515516072591708507388830,0.3601777248047104418304797945893369615078} +#define T_324_185 
{-0.9021674247810377300638151609746273607016,0.4313860656812534277726456366508500650525} +#define T_324_187 {-0.8847617971766578159886762477981392294168,0.4660435197025388220914976500353077426553} +#define T_324_191 {-0.8459864259198410785245414444943889975548,0.5332044328016912748680056211014743894339} +#define T_324_193 {-0.8246750041091067684462245779286604374647,0.5656068754865386516428316099336370825768} +#define T_324_197 {-0.7783649119241600189411656174343079328537,0.6278121246720985704214967881853226572275} +#define T_324_199 {-0.7534358963276607257242289961141068488359,0.6575213685690636244984830227622296661139} +#define T_324_203 {-0.7002173477671684853618216948234476149082,0.7139297345578989872905140146031044423580} +#define T_324_205 {-0.6720078605555224182666052001877687871456,0.7405440131090045285944256647781003266573} +#define T_324_209 {-0.6126005451932028389450124450377188622952,0.7903926695187593054470198694616556167603} +#define T_324_211 {-0.5814920712880267128497280282317660748959,0.8135520702629676081940601761743891984224} +#define T_324_215 {-0.5166993711518629295298410397663246840239,0.8561668995302664786350987924379296600819} +// Pre-computed twiddles for N=343 +#define T_343_1 {0.9998322241852237723946927872020751237869,-0.0183173000419952144990354270248644752428} +#define T_343_2 {0.9993289530383430152582491245993878692389,-0.0366284536841123345429771518411143915728} +#define T_343_3 {0.9984903554328112829097108260612003505230,-0.0549273165889077666368400798546645091847} +#define T_343_4 {0.9973167127614216420639081661647651344538,-0.0732077485431148339278806247421016450971} +#define T_343_5 {0.9958084188418849791446518793236464262009,-0.0914636155180024029709429100876150187105} +#define T_343_6 {0.9939659797846838173995820397976785898209,-0.1096887917276581358994036463627708144486} +#define T_343_8 {0.9892812511064961178419707721332088112831,-0.1460226222513681115788841680114273913205} +#define T_343_9 
{0.9864405334538516312292699694808106869459,-0.1641190846893822163909959499505930580199} +#define T_343_10 {0.9832688140727500192284082913829479366541,-0.1821604767010881475997052802995312958956} +#define T_343_11 {0.9797671572387982585894405929138883948326,-0.2001407444679969560574761544557986781001} +#define T_343_12 {0.9759367379386528762452712726371828466654,-0.2180538546819596745773850443583796732128} +#define T_343_13 {0.9717788414757521087139480187033768743277,-0.2358937965696536742665756491987849585712} +#define T_343_15 {0.9624863072347562553687794206780381500721,-0.2713302570403868241299960573087446391582} +#define T_343_16 {0.9573547875816684760152952549105975776911,-0.2889148848613694031683962748502381145954} +#define T_343_17 {0.9519020259695478713979355234187096357346,-0.3064025668219348408349844703479902818799} +#define T_343_18 {0.9461298520814387380539756122743710875511,-0.3237874349019040920438783359713852405548} +#define T_343_19 {0.9400402027796954662619555165292695164680,-0.3410636555804633651689528051065281033516} +#define T_343_20 {0.9336351214560646427642609523900318890810,-0.3582254317936114929032953568821540102363} +#define T_343_22 {0.9198873648075919229327723769529256969690,-0.3921826565101322326079014146671397611499} +#define T_343_23 {0.9125493025648960809803611482493579387665,-0.4089667106114160954888347987434826791286} +#define T_343_24 {0.9049050329166776762690460600424557924271,-0.4256135352665215698664269439177587628365} +#define T_343_25 {0.8969571209100737352315491079934872686863,-0.4421175446063087544068537226849002763629} +#define T_343_26 {0.8887082334799096816624341954593546688557,-0.4584732006835495754515363842074293643236} +#define T_343_27 {0.8801611385538047338528144791780505329370,-0.4746750153311948272616405120061244815588} +#define T_343_29 {0.8621838972819457769247719625127501785755,-0.5065954276024560165581078763352707028389} +#define T_343_30 
{0.8527597832287951984397977867047302424908,-0.5223033142797182115657506074057891964912} +#define T_343_31 {0.8430495242407655132055310787109192460775,-0.5378359412287532226315533989691175520420} +#define T_343_32 {0.8330563786110832813491811066342052072287,-0.5531880964512770892937965072633232921362} +#define T_343_33 {0.8227836995560490196410796670534182339907,-0.5683546285065876890740810267743654549122} +#define T_343_34 {0.8122349340898594727278236860001925379038,-0.5833304482401391322099470926332287490368} +#define T_343_36 {0.7903233939992904133475803973851725459099,-0.6126899157791340089218579123553354293108} +#define T_343_37 {0.7789679718278943454379259492270648479462,-0.6270637119674019599457892581995110958815} +#define T_343_38 {0.7673511656841821571006789781677071005106,-0.6412270959052860375138038762088399380445} +#define T_343_39 {0.7554768736063856016116346836497541517019,-0.6551753150462258368236234673531726002693} +#define T_343_40 {0.7433490800325611713716966733045410364866,-0.6689036890425593684383898107626009732485} +#define T_343_41 {0.7309718544636055748142666743660811334848,-0.6824076113160209011709866899764165282249} +#define T_343_43 {0.7054858024368363400569137411366682499647,-0.7087240524774456318368720530997961759567} +#define T_343_44 {0.6923855278653112677034187072422355413437,-0.7215277408406929327000511875667143613100} +#define T_343_45 {0.6790529222016324917277074746380094438791,-0.7340893193947338746241371154610533267260} +#define T_343_46 {0.6654924592233565361354408196348231285810,-0.7464045730814148349452352704247459769249} +#define T_343_47 {0.6517086891659332748005795110657345503569,-0.7584693694972929334241484866652172058821} +#define T_343_48 {0.6377062371958668318683294273796491324902,-0.7702796602802703951340390631230548024178} +#define T_343_52 {0.5796046485099634049120709278213325887918,-0.8148978165547149377090363486786372959614} +#define T_343_60 
{0.4543984957848437034044764004647731781006,-0.8907985221297076705582185240928083658218} +#define T_343_64 {0.3879658598892251064604863586282590404153,-0.9216737446409191125695770097081549465656} +#define T_343_68 {0.3194511763119168290536720178351970389485,-0.9476027363578751749884077071328647434711} +#define T_343_72 {0.2492221342051153532004548196709947660565,-0.9684463474154092477164113006438128650188} +#define T_343_76 {0.1776556229537462061607300256582675501704,-0.9840927190224081355651719604793470352888} +#define T_343_80 {0.1051357095705101191152053274890931788832,-0.9944578837603457177607424455345608294010} +#define T_343_88 {-0.0412045616053484248242355647562362719327,-0.9991507314228974534486837910662870854139} +#define T_343_92 {-0.1142395734336983242895868784216872882098,-0.9934532298310205788993698661215603351593} +#define T_343_96 {-0.1866615100829776829893802414517267607152,-0.9824242874916837564569505047984421253204} +#define T_343_100 {-0.2580817138363782348164932045619934797287,-0.9661230920453551807014491714653559029102} +#define T_343_104 {-0.3281169028512835872390951408306136727333,-0.9446371250714643563739514320332091301680} +#define T_343_108 {-0.3963912280698235757192549044702900573611,-0.9180816926119903520486786874243989586830} +#define T_343_116 {-0.5262031062287327864979147307167295366526,-0.8503589189249636248746355704497545957565} +#define T_343_120 {-0.5870440140569427311945105429913382977247,-0.8095550170062020267636171411140821874142} +#define T_343_124 {-0.6447345064623469790276999447087291628122,-0.7644065777953207296846471763274166733027} +#define T_343_128 {-0.6989649831208283314865070678933989256620,-0.7151558937538725624349922327382955700159} +#define T_343_132 {-0.7494444121404670866581909649539738893509,-0.6620672723458165265242314490024000406265} +#define T_343_136 {-0.7959018919058652441478329819801729172468,-0.6054256176118288923504451304324902594090} +#define T_343_144 
{-0.8757766556444949168636071590299252420664,-0.4827165311320337770517596709396457299590} +#define T_343_148 {-0.9087652857492778402104249835247173905373,-0.4173076268378441699979930490371771156788} +#define T_343_151 {-0.9303149613070847978590904858720023185015,-0.3667616020907821794416747707145987078547} +#define T_343_152 {-0.9368769592658326983425354228529613465071,-0.3496592100843436967672062110068509355187} +#define T_343_155 {-0.9546684501846829373761238457518629729748,-0.2976712116110247774436459167191060259938} +#define T_343_156 {-0.9599608128045554744645073697029147297144,-0.2801343211383018894089502737188013270497} +#define T_343_159 {-0.9738986397233733161016289159306325018406,-0.2269833464044513360313715111260535195470} +#define T_343_160 {-0.9778929651462107131010270677506923675537,-0.2091060704942636350178730708648799918592} +#define T_343_163 {-0.9879023296787865504597903054673224687576,-0.1550773581643241361138763068083790130913} +#define T_343_164 {-0.9905771820597212284909005575173068791628,-0.1369556365492923810656122896034503355622} +#define T_343_167 {-0.9966043682058220998243314170395024120808,-0.0823391357318874334003666604075988288969} +#define T_343_171 {-0.9999580551666213956707451870897784829140,-0.0091590341951608318649125095589624834247} +#define T_343_172 {-0.9999580551666213956707451870897784829140,0.0091590341951608318649125095589624834247} +#define T_343_176 {-0.9966043682058220998243314170395024120808,0.0823391357318874334003666604075988288969} +#define T_343_180 {-0.9879023296787865504597903054673224687576,0.1550773581643241361138763068083790130913} +#define T_343_184 {-0.9738986397233733161016289159306325018406,0.2269833464044513360313715111260535195470} +#define T_343_188 {-0.9546684501846829373761238457518629729748,0.2976712116110247774436459167191060259938} +#define T_343_192 {-0.9303149613070847978590904858720023185015,0.3667616020907821794416747707145987078547} +// Pre-computed twiddles for N=400 +#define T_400_1 
{0.9998766324816605877146002967492677271366,-0.0157073173118206753373637951654018252157} +#define T_400_3 {0.9988898749619700145885303754766937345266,-0.0471064507096426582366532898049626965076} +#define T_400_7 {0.9939609554551797065968798960966523736715,-0.1097343110910452695927475019743724260479} +#define T_400_9 {0.9900236577165575369008365669287741184235,-0.1409012319375826660650119492856902070343} +#define T_400_11 {0.9851093261547738677563756937161087989807,-0.1719291002794095524208728420489933341742} +#define T_400_13 {0.9792228106217657490617511939490213990211,-0.2027872953565124924946161399930133484304} +#define T_400_17 {0.9645574184577980769716987197170965373516,-0.2638730499653729189013517952844267711043} +#define T_400_19 {0.9557930147983301205627526542230043560266,-0.2940403252323039451354702578100841492414} +#define T_400_21 {0.9460853588275452974443169296137057244778,-0.3239174181981493982895869976346148177981} +#define T_400_23 {0.9354440308298673789622057483938988298178,-0.3534748437792571418292197904520435258746} +#define T_400_27 {0.9114032766354452919088657836255151778460,-0.4115143586051087654098523671564180403948} +#define T_400_29 {0.8980275757606156483703330195567104965448,-0.4399391698559151353542517881578532978892} +#define T_400_31 {0.8837656300886934657157212313904892653227,-0.4679298142605733956678193408151855692267} +#define T_400_33 {0.8686315144381911990123512623540591448545,-0.4954586684324075451790747592895058915019} +#define T_400_37 {0.8358073613682702740490526593930553644896,-0.5490228179981317957469855173258110880852} +#define T_400_39 {0.8181497174250234039050155843142420053482,-0.5750052520432785696513633411086630076170} +// Pre-computed twiddles for N=432 +#define T_432_1 {0.9998942319271074685715916530170943588018,-0.0145438976515826559610111701204004930332} +#define T_432_5 {0.9973569167005722491126107343006879091263,-0.0726579707226106125128595181195123586804} +#define T_432_7 
{0.9948217482960330926289316266775131225586,-0.1016350781828018745578390280570602044463} +#define T_432_11 {0.9872291131173740463822241508751176297665,-0.1593068680675225889942225876438897103071} +#define T_432_13 {0.9821780704706306597273623992805369198322,-0.1879527544001118599670263620282639749348} +#define T_432_17 {0.9695878998781159996411815882311202585697,-0.2447433439543237887470183977711712941527} +#define T_432_19 {0.9620594244736132560902319710294250398874,-0.2728399966674610044314874812698690220714} +#define T_432_23 {0.9445675372676047842901425610762089490891,-0.3283171752135610188538805687130661681294} +#define T_432_25 {0.9346189253489883830283702081942465156317,-0.3556507618148764815479978551593376323581} +#define T_432_29 {0.9123584453530141358257310457702260464430,-0.4093923145260925378075000935496063902974} +#define T_432_31 {0.9000654118641211454132644576020538806915,-0.4357548099170793776124810392502695322037} +#define T_432_35 {0.8732057547721957524444746923109050840139,-0.4873517311272423380152929439645959064364} +#define T_432_37 {0.8586618571206131589335086573555599898100,-0.5125425007998651860674499403103254735470} +#define T_432_41 {0.8274074411415105467781927472969982773066,-0.5616021067834928715001296950504183769226} +#define T_432_43 {0.8107233671702123167790432489709928631783,-0.5854294337699404993102803018700797110796} +#define T_432_47 {0.7753120572814657940341476205503568053246,-0.6315783513024976203453775269736070185900} +#define T_432_49 {0.7566147828674928410563893521612044423819,-0.6538608952570696963846330618252977728844} +#define T_432_53 {0.7173160805192894118320623420004267245531,-0.6967479032106550507208453382190782576799} +#define T_432_55 {0.6967479032106550507208453382190782576799,-0.7173160805192894118320623420004267245531} +#define T_432_59 {0.6538608952570696963846330618252977728844,-0.7566147828674928410563893521612044423819} +#define T_432_61 
{0.6315783513024976203453775269736070185900,-0.7753120572814657940341476205503568053246} +#define T_432_65 {0.5854294337699404993102803018700797110796,-0.8107233671702123167790432489709928631783} +#define T_432_67 {0.5616021067834928715001296950504183769226,-0.8274074411415105467781927472969982773066} +#define T_432_71 {0.5125425007998651860674499403103254735470,-0.8586618571206131589335086573555599898100} +#define T_432_77 {0.4357548099170793776124810392502695322037,-0.9000654118641211454132644576020538806915} +#define T_432_91 {0.2447433439543237887470183977711712941527,-0.9695878998781159996411815882311202585697} +#define T_432_119 {-0.1593068680675225889942225876438897103071,-0.9872291131173740463822241508751176297665} +#define T_432_133 {-0.3556507618148764815479978551593376323581,-0.9346189253489883830283702081942465156317} +#define T_432_161 {-0.6967479032106550507208453382190782576799,-0.7173160805192894118320623420004267245531} +#define T_432_175 {-0.8274074411415105467781927472969982773066,-0.5616021067834928715001296950504183769226} +#define T_432_203 {-0.9821780704706306597273623992805369198322,-0.1879527544001118599670263620282639749348} +#define T_432_217 {-0.9998942319271074685715916530170943588018,0.0145438976515826559610111701204004930332} +#define T_432_245 {-0.9123584453530141358257310457702260464430,0.4093923145260925378075000935496063902974} +// Pre-computed twiddles for N=500 +#define T_500_1 {0.9999210442038161117395134169782977551222,-0.0125660398833526074091615498673490947112} +#define T_500_3 {0.9992894726405892047438328518182970583439,-0.0376901826699345410265351574707892723382} +#define T_500_7 {0.9961336091431725048295220403815619647503,-0.0878511965507431796806869783722504507750} +#define T_500_9 {0.9936113105200083950307998748030513525009,-0.1128563848734816937868785657883563544601} +#define T_500_11 {0.9904614256966511876001391101453918963671,-0.1377902906846380504113369624974438920617} +#define T_500_13 
{0.9866859442078680375587396156333852559328,-0.1626371651948836094536687824074761010706} +#define T_500_17 {0.9772681235681934808212645293679088354111,-0.2120071099220546539942944264112156815827} +#define T_500_19 {0.9716317329146739734113680242444388568401,-0.2364989970237246774686923345143441110849} +#define T_500_21 {0.9653816388332738807065425135078839957714,-0.2608415062898969405402738175325794145465} +#define T_500_23 {0.9585217890173758359395606021280400454998,-0.2850192624699761090312222222564741969109} +#define T_500_27 {0.9429905358928645231486598277115263044834,-0.3328195445229866211711566847952781245112} +#define T_500_29 {0.9343289424566120215942532922781538218260,-0.3564118787132506960269040519051486626267} +#define T_500_31 {0.9250772068344580434384738509834278374910,-0.3797790955218011132998867651622276753187} +#define T_500_33 {0.9152411726209175313329069467727094888687,-0.4029064357136626361999276468850439414382} +#define T_500_37 {0.8938414241512637747888447847799398005009,-0.4483832160900322327812261846702313050628} +#define T_500_39 {0.8822912264349532796003927614947315305471,-0.4707039321653325725236527432571165263653} +#define T_500_41 {0.8701837546695256886408742502680979669094,-0.4927273415482915641661065819789655506611} +#define T_500_43 {0.8575266561936522036191377083014231175184,-0.5144395337815064195297054538968950510025} +#define T_500_47 {0.8305958991958126702570552879478782415390,-0.5568756164881879522354779510351363569498} +#define T_500_49 {0.8163392507171839396207246863923501223326,-0.5775727034222676303798493790964130312204} +#define T_500_51 {0.8015669848708765199418735392100643366575,-0.5979049830575188240189277166791725903749} +#define T_500_57 {0.7542513807361037603271824991679750382900,-0.6565857557529564125786691874964162707329} +#define T_500_63 {0.7026499697988491943689837171405088156462,-0.7115356772092853443467674878775142133236} +#define T_500_69 
{0.6470559615694443378330902305606286972761,-0.7624425110114478876255361683433875441551} +#define T_500_81 {0.5251746299612957136204727248696144670248,-0.8509944817946918371731612751318607479334} +#define T_500_87 {0.4595798606214878678777324694237904623151,-0.8881364488135444767991089065617416054010} +#define T_500_93 {0.3913736668372024274376030916755553334951,-0.9202318473658703812390058374148793518543} +#define T_500_99 {0.3209436098072094845079504921159241348505,-0.9470983049947442333760250221530441194773} +#define T_500_111 {0.1750230589752760357313832173531409353018,-0.9845643345292053316697433729132171720266} +#define T_500_117 {0.1003617148512148948835687178871012292802,-0.9949510169813001736471846925269346684217} +#define T_500_123 {0.0251300954433374794383393435737161780708,-0.9996841892832999398876836494309827685356} +#define T_500_129 {-0.0502443181797695564982930704900354612619,-0.9987369566060174674859695187478791922331} +#define T_500_141 {-0.1997099805144070261420097267546225339174,-0.9798550523842468606972033740021288394928} +#define T_500_147 {-0.2729519355173252170843056774174328893423,-0.9620276715860859306772567833831999450922} +#define T_500_153 {-0.3446429231745170573830705507134553045034,-0.9387338576538740708699037895712535828352} +#define T_500_159 {-0.4143755809932841427389860200491966679692,-0.9101059706849956576490967563586309552193} +#define T_500_171 {-0.5463943467342691207022653543390333652496,-0.8375280400421417636991350263997446745634} +#define T_500_177 {-0.6079302976946053815865411706909071654081,-0.7939903986478353825617659822455607354641} +#define T_500_183 {-0.6660118674342516698061444913037121295929,-0.7459411454241821060051620406738948076963} +#define T_500_189 {-0.7203090248879069346088499514735303819180,-0.6936533058128049322732522341539151966572} +#define T_500_201 {-0.8163392507171839396207246863923501223326,-0.5775727034222676303798493790964130312204} +#define T_500_207 
{-0.8575266561936522036191377083014231175184,-0.5144395337815064195297054538968950510025} +#define T_500_213 {-0.8938414241512637747888447847799398005009,-0.4483832160900322327812261846702313050628} +#define T_500_219 {-0.9250772068344580434384738509834278374910,-0.3797790955218011132998867651622276753187} +#define T_500_231 {-0.9716317329146739734113680242444388568401,-0.2364989970237246774686923345143441110849} +#define T_500_237 {-0.9866859442078680375587396156333852559328,-0.1626371651948836094536687824074761010706} +#define T_500_243 {-0.9961336091431725048295220403815619647503,-0.0878511965507431796806869783722504507750} +#define T_500_249 {-0.9999210442038161117395134169782977551222,-0.0125660398833526074091615498673490947112} +#define T_500_261 {-0.9904614256966511876001391101453918963671,0.1377902906846380504113369624974438920617} +#define T_500_267 {-0.9772681235681934808212645293679088354111,0.2120071099220546539942944264112156815827} +#define T_500_273 {-0.9585217890173758359395606021280400454998,0.2850192624699761090312222222564741969109} +#define T_500_279 {-0.9343289424566120215942532922781538218260,0.3564118787132506960269040519051486626267} +#define T_500_291 {-0.8701837546695256886408742502680979669094,0.4927273415482915641661065819789655506611} +#define T_500_297 {-0.8305958991958126702570552879478782415390,0.5568756164881879522354779510351363569498} +// Pre-computed twiddles for N=512 +#define T_512_1 {0.9999247018391445029905639785283710807562,-0.0122715382857199253874291855481715174392} +#define T_512_3 {0.9993223845883495437547594519855920225382,-0.0368072229413588317137318028926529223099} +#define T_512_5 {0.9981181129001491791896683025697711855173,-0.0613207363022085782944259335636161267757} +#define T_512_7 {0.9963126121827780012907282980449963361025,-0.0857973123444398938497457152152492199093} +#define T_512_9 {0.9939069700023560605117722843715455383062,-0.1102222072938830593757941755939100403339} +#define T_512_11 
{0.9909026354277800097136719159607309848070,-0.1345807085071261954833943264020490460098} +#define T_512_13 {0.9873014181578584347320770575606729835272,-0.1588581433338614457007054170389892533422} +#define T_512_15 {0.9831054874312162850102936317853163927794,-0.1830398879551409507815407096131821162999} +#define T_512_17 {0.9783173707196276547293223302403930574656,-0.2071113761922185603214785487580229528248} +#define T_512_19 {0.9729399522055601767789312361855991184711,-0.2310581082806711095134488687108387239277} +#define T_512_21 {0.9669764710448520705909913885989226400852,-0.2548656596045145716900037768937181681395} +#define T_512_23 {0.9604305194155657865451303223380818963051,-0.2785196893850531152381222455005627125502} +#define T_512_25 {0.9533060403541938621074791626597288995981,-0.3020059493192280841711294669948983937502} +#define T_512_27 {0.9456073253805212797118429080001078546047,-0.3253102921622629262188297616376075893641} +#define T_512_29 {0.9373390119125749597728258777351584285498,-0.3484186802494345647218665362743195146322} +#define T_512_31 {0.9285060804732155892438072442018892616034,-0.3713171939518375430644425705395406112075} +#define T_512_33 {0.9191138516900577704049624117033090442419,-0.3939920400610480988312644967663800343871} +#define T_512_35 {0.9091679830905223802517411968437954783440,-0.4164295600976372080381793239212129265070} +#define T_512_37 {0.8986744656939538167250702827004715800285,-0.4386162385385276585303415686212247237563} +#define T_512_39 {0.8876396204028539349550896986329462379217,-0.4605387109582400051444039945636177435517} +#define T_512_41 {0.8760700941954066012229418447532225400209,-0.4821837720791227743788454063178505748510} +#define T_512_43 {0.8639728561215866964317910969839431345463,-0.5035383837257175754231752762279938906431} +#define T_512_45 {0.8513551931052651955411647577420808374882,-0.5245896826784689492839675040158908814192} +#define T_512_47 
{0.8382247055548380787470819086593110114336,-0.5453249884220464638318048855580855160952} +#define T_512_49 {0.8245893027850252909871642259531654417515,-0.5657318107836132314858446079597342759371} +#define T_512_51 {0.8104571982525947682063360844040289521217,-0.5857978574564388640766310345497913658619} +#define T_512_53 {0.7958369046088835663255167673924006521702,-0.6055110414043255451232994346355553716421} +#define T_512_55 {0.7807372285720944882214666904474142938852,-0.6248594881423863434122267790371552109718} +#define T_512_57 {0.7651672656224589585960416115995030850172,-0.6438315428897914971528848582238424569368} +#define T_512_59 {0.7491363945234593701982817037787754088640,-0.6624157775901717837285787027212791144848} +#define T_512_61 {0.7326542716724128156968731673259753733873,-0.6806009977954530221211371099343523383141} +#define T_512_63 {0.7157308252838187057065510998654644936323,-0.6983762494089728045665310673939529806376} +#define T_512_65 {0.6983762494089728045665310673939529806376,-0.7157308252838187057065510998654644936323} +#define T_512_67 {0.6806009977954530221211371099343523383141,-0.7326542716724128156968731673259753733873} +#define T_512_69 {0.6624157775901717837285787027212791144848,-0.7491363945234593701982817037787754088640} +#define T_512_71 {0.6438315428897914971528848582238424569368,-0.7651672656224589585960416115995030850172} +#define T_512_73 {0.6248594881423863434122267790371552109718,-0.7807372285720944882214666904474142938852} +#define T_512_75 {0.6055110414043255451232994346355553716421,-0.7958369046088835663255167673924006521702} +#define T_512_77 {0.5857978574564388640766310345497913658619,-0.8104571982525947682063360844040289521217} +#define T_512_79 {0.5657318107836132314858446079597342759371,-0.8245893027850252909871642259531654417515} +#define T_512_81 {0.5453249884220464638318048855580855160952,-0.8382247055548380787470819086593110114336} +#define T_512_83 
{0.5245896826784689492839675040158908814192,-0.8513551931052651955411647577420808374882} +#define T_512_85 {0.5035383837257175754231752762279938906431,-0.8639728561215866964317910969839431345463} +#define T_512_87 {0.4821837720791227743788454063178505748510,-0.8760700941954066012229418447532225400209} +#define T_512_89 {0.4605387109582400051444039945636177435517,-0.8876396204028539349550896986329462379217} +#define T_512_91 {0.4386162385385276585303415686212247237563,-0.8986744656939538167250702827004715800285} +#define T_512_93 {0.4164295600976372080381793239212129265070,-0.9091679830905223802517411968437954783440} +#define T_512_95 {0.3939920400610480988312644967663800343871,-0.9191138516900577704049624117033090442419} +#define T_512_97 {0.3713171939518375430644425705395406112075,-0.9285060804732155892438072442018892616034} +#define T_512_99 {0.3484186802494345647218665362743195146322,-0.9373390119125749597728258777351584285498} +#define T_512_101 {0.3253102921622629262188297616376075893641,-0.9456073253805212797118429080001078546047} +#define T_512_103 {0.3020059493192280841711294669948983937502,-0.9533060403541938621074791626597288995981} +#define T_512_105 {0.2785196893850531152381222455005627125502,-0.9604305194155657865451303223380818963051} +#define T_512_107 {0.2548656596045145716900037768937181681395,-0.9669764710448520705909913885989226400852} +#define T_512_109 {0.2310581082806711095134488687108387239277,-0.9729399522055601767789312361855991184711} +#define T_512_111 {0.2071113761922185603214785487580229528248,-0.9783173707196276547293223302403930574656} +#define T_512_113 {0.1830398879551409507815407096131821162999,-0.9831054874312162850102936317853163927794} +#define T_512_115 {0.1588581433338614457007054170389892533422,-0.9873014181578584347320770575606729835272} +#define T_512_117 {0.1345807085071261954833943264020490460098,-0.9909026354277800097136719159607309848070} +#define T_512_119 
{0.1102222072938830593757941755939100403339,-0.9939069700023560605117722843715455383062} +#define T_512_121 {0.0857973123444398938497457152152492199093,-0.9963126121827780012907282980449963361025} +#define T_512_123 {0.0613207363022085782944259335636161267757,-0.9981181129001491791896683025697711855173} +#define T_512_125 {0.0368072229413588317137318028926529223099,-0.9993223845883495437547594519855920225382} +#define T_512_127 {0.0122715382857199253874291855481715174392,-0.9999247018391445029905639785283710807562} +#define T_512_129 {-0.0122715382857199253874291855481715174392,-0.9999247018391445029905639785283710807562} +#define T_512_131 {-0.0368072229413588317137318028926529223099,-0.9993223845883495437547594519855920225382} +#define T_512_133 {-0.0613207363022085782944259335636161267757,-0.9981181129001491791896683025697711855173} +#define T_512_135 {-0.0857973123444398938497457152152492199093,-0.9963126121827780012907282980449963361025} +#define T_512_137 {-0.1102222072938830593757941755939100403339,-0.9939069700023560605117722843715455383062} +#define T_512_139 {-0.1345807085071261954833943264020490460098,-0.9909026354277800097136719159607309848070} +#define T_512_141 {-0.1588581433338614457007054170389892533422,-0.9873014181578584347320770575606729835272} +#define T_512_143 {-0.1830398879551409507815407096131821162999,-0.9831054874312162850102936317853163927794} +#define T_512_145 {-0.2071113761922185603214785487580229528248,-0.9783173707196276547293223302403930574656} +#define T_512_147 {-0.2310581082806711095134488687108387239277,-0.9729399522055601767789312361855991184711} +#define T_512_149 {-0.2548656596045145716900037768937181681395,-0.9669764710448520705909913885989226400852} +#define T_512_151 {-0.2785196893850531152381222455005627125502,-0.9604305194155657865451303223380818963051} +#define T_512_153 {-0.3020059493192280841711294669948983937502,-0.9533060403541938621074791626597288995981} +#define T_512_155 
{-0.3253102921622629262188297616376075893641,-0.9456073253805212797118429080001078546047} +#define T_512_157 {-0.3484186802494345647218665362743195146322,-0.9373390119125749597728258777351584285498} +#define T_512_159 {-0.3713171939518375430644425705395406112075,-0.9285060804732155892438072442018892616034} +#define T_512_161 {-0.3939920400610480988312644967663800343871,-0.9191138516900577704049624117033090442419} +#define T_512_163 {-0.4164295600976372080381793239212129265070,-0.9091679830905223802517411968437954783440} +#define T_512_165 {-0.4386162385385276585303415686212247237563,-0.8986744656939538167250702827004715800285} +#define T_512_167 {-0.4605387109582400051444039945636177435517,-0.8876396204028539349550896986329462379217} +#define T_512_169 {-0.4821837720791227743788454063178505748510,-0.8760700941954066012229418447532225400209} +#define T_512_171 {-0.5035383837257175754231752762279938906431,-0.8639728561215866964317910969839431345463} +#define T_512_173 {-0.5245896826784689492839675040158908814192,-0.8513551931052651955411647577420808374882} +#define T_512_175 {-0.5453249884220464638318048855580855160952,-0.8382247055548380787470819086593110114336} +#define T_512_177 {-0.5657318107836132314858446079597342759371,-0.8245893027850252909871642259531654417515} +#define T_512_179 {-0.5857978574564388640766310345497913658619,-0.8104571982525947682063360844040289521217} +#define T_512_181 {-0.6055110414043255451232994346355553716421,-0.7958369046088835663255167673924006521702} +#define T_512_183 {-0.6248594881423863434122267790371552109718,-0.7807372285720944882214666904474142938852} +#define T_512_185 {-0.6438315428897914971528848582238424569368,-0.7651672656224589585960416115995030850172} +#define T_512_187 {-0.6624157775901717837285787027212791144848,-0.7491363945234593701982817037787754088640} +#define T_512_189 {-0.6806009977954530221211371099343523383141,-0.7326542716724128156968731673259753733873} +#define T_512_191 
{-0.6983762494089728045665310673939529806376,-0.7157308252838187057065510998654644936323} +#define T_512_193 {-0.7157308252838187057065510998654644936323,-0.6983762494089728045665310673939529806376} +#define T_512_195 {-0.7326542716724128156968731673259753733873,-0.6806009977954530221211371099343523383141} +#define T_512_197 {-0.7491363945234593701982817037787754088640,-0.6624157775901717837285787027212791144848} +#define T_512_199 {-0.7651672656224589585960416115995030850172,-0.6438315428897914971528848582238424569368} +#define T_512_201 {-0.7807372285720944882214666904474142938852,-0.6248594881423863434122267790371552109718} +#define T_512_203 {-0.7958369046088835663255167673924006521702,-0.6055110414043255451232994346355553716421} +#define T_512_205 {-0.8104571982525947682063360844040289521217,-0.5857978574564388640766310345497913658619} +#define T_512_207 {-0.8245893027850252909871642259531654417515,-0.5657318107836132314858446079597342759371} +#define T_512_209 {-0.8382247055548380787470819086593110114336,-0.5453249884220464638318048855580855160952} +#define T_512_211 {-0.8513551931052651955411647577420808374882,-0.5245896826784689492839675040158908814192} +#define T_512_213 {-0.8639728561215866964317910969839431345463,-0.5035383837257175754231752762279938906431} +#define T_512_215 {-0.8760700941954066012229418447532225400209,-0.4821837720791227743788454063178505748510} +#define T_512_217 {-0.8876396204028539349550896986329462379217,-0.4605387109582400051444039945636177435517} +#define T_512_219 {-0.8986744656939538167250702827004715800285,-0.4386162385385276585303415686212247237563} +#define T_512_221 {-0.9091679830905223802517411968437954783440,-0.4164295600976372080381793239212129265070} +#define T_512_223 {-0.9191138516900577704049624117033090442419,-0.3939920400610480988312644967663800343871} +#define T_512_225 {-0.9285060804732155892438072442018892616034,-0.3713171939518375430644425705395406112075} +#define T_512_227 
{-0.9373390119125749597728258777351584285498,-0.3484186802494345647218665362743195146322} +#define T_512_229 {-0.9456073253805212797118429080001078546047,-0.3253102921622629262188297616376075893641} +#define T_512_231 {-0.9533060403541938621074791626597288995981,-0.3020059493192280841711294669948983937502} +#define T_512_233 {-0.9604305194155657865451303223380818963051,-0.2785196893850531152381222455005627125502} +#define T_512_235 {-0.9669764710448520705909913885989226400852,-0.2548656596045145716900037768937181681395} +#define T_512_237 {-0.9729399522055601767789312361855991184711,-0.2310581082806711095134488687108387239277} +#define T_512_239 {-0.9783173707196276547293223302403930574656,-0.2071113761922185603214785487580229528248} +#define T_512_241 {-0.9831054874312162850102936317853163927794,-0.1830398879551409507815407096131821162999} +#define T_512_243 {-0.9873014181578584347320770575606729835272,-0.1588581433338614457007054170389892533422} +#define T_512_245 {-0.9909026354277800097136719159607309848070,-0.1345807085071261954833943264020490460098} +#define T_512_247 {-0.9939069700023560605117722843715455383062,-0.1102222072938830593757941755939100403339} +#define T_512_249 {-0.9963126121827780012907282980449963361025,-0.0857973123444398938497457152152492199093} +#define T_512_251 {-0.9981181129001491791896683025697711855173,-0.0613207363022085782944259335636161267757} +#define T_512_253 {-0.9993223845883495437547594519855920225382,-0.0368072229413588317137318028926529223099} +#define T_512_255 {-0.9999247018391445029905639785283710807562,-0.0122715382857199253874291855481715174392} +#define T_512_257 {-0.9999247018391445029905639785283710807562,0.0122715382857199253874291855481715174392} +#define T_512_259 {-0.9993223845883495437547594519855920225382,0.0368072229413588317137318028926529223099} +#define T_512_261 {-0.9981181129001491791896683025697711855173,0.0613207363022085782944259335636161267757} +#define T_512_263 
{-0.9963126121827780012907282980449963361025,0.0857973123444398938497457152152492199093} +#define T_512_265 {-0.9939069700023560605117722843715455383062,0.1102222072938830593757941755939100403339} +#define T_512_267 {-0.9909026354277800097136719159607309848070,0.1345807085071261954833943264020490460098} +#define T_512_269 {-0.9873014181578584347320770575606729835272,0.1588581433338614457007054170389892533422} +#define T_512_271 {-0.9831054874312162850102936317853163927794,0.1830398879551409507815407096131821162999} +#define T_512_273 {-0.9783173707196276547293223302403930574656,0.2071113761922185603214785487580229528248} +#define T_512_275 {-0.9729399522055601767789312361855991184711,0.2310581082806711095134488687108387239277} +#define T_512_277 {-0.9669764710448520705909913885989226400852,0.2548656596045145716900037768937181681395} +#define T_512_279 {-0.9604305194155657865451303223380818963051,0.2785196893850531152381222455005627125502} +#define T_512_281 {-0.9533060403541938621074791626597288995981,0.3020059493192280841711294669948983937502} +#define T_512_283 {-0.9456073253805212797118429080001078546047,0.3253102921622629262188297616376075893641} +#define T_512_285 {-0.9373390119125749597728258777351584285498,0.3484186802494345647218665362743195146322} +#define T_512_287 {-0.9285060804732155892438072442018892616034,0.3713171939518375430644425705395406112075} +#define T_512_289 {-0.9191138516900577704049624117033090442419,0.3939920400610480988312644967663800343871} +#define T_512_291 {-0.9091679830905223802517411968437954783440,0.4164295600976372080381793239212129265070} +#define T_512_293 {-0.8986744656939538167250702827004715800285,0.4386162385385276585303415686212247237563} +#define T_512_295 {-0.8876396204028539349550896986329462379217,0.4605387109582400051444039945636177435517} +#define T_512_297 {-0.8760700941954066012229418447532225400209,0.4821837720791227743788454063178505748510} +#define T_512_299 
{-0.8639728561215866964317910969839431345463,0.5035383837257175754231752762279938906431} +#define T_512_301 {-0.8513551931052651955411647577420808374882,0.5245896826784689492839675040158908814192} +#define T_512_303 {-0.8382247055548380787470819086593110114336,0.5453249884220464638318048855580855160952} +#define T_512_305 {-0.8245893027850252909871642259531654417515,0.5657318107836132314858446079597342759371} +#define T_512_307 {-0.8104571982525947682063360844040289521217,0.5857978574564388640766310345497913658619} +#define T_512_309 {-0.7958369046088835663255167673924006521702,0.6055110414043255451232994346355553716421} +#define T_512_311 {-0.7807372285720944882214666904474142938852,0.6248594881423863434122267790371552109718} +#define T_512_313 {-0.7651672656224589585960416115995030850172,0.6438315428897914971528848582238424569368} +#define T_512_315 {-0.7491363945234593701982817037787754088640,0.6624157775901717837285787027212791144848} +#define T_512_317 {-0.7326542716724128156968731673259753733873,0.6806009977954530221211371099343523383141} +#define T_512_319 {-0.7157308252838187057065510998654644936323,0.6983762494089728045665310673939529806376} +#define T_512_321 {-0.6983762494089728045665310673939529806376,0.7157308252838187057065510998654644936323} +#define T_512_323 {-0.6806009977954530221211371099343523383141,0.7326542716724128156968731673259753733873} +#define T_512_325 {-0.6624157775901717837285787027212791144848,0.7491363945234593701982817037787754088640} +#define T_512_327 {-0.6438315428897914971528848582238424569368,0.7651672656224589585960416115995030850172} +#define T_512_329 {-0.6248594881423863434122267790371552109718,0.7807372285720944882214666904474142938852} +#define T_512_331 {-0.6055110414043255451232994346355553716421,0.7958369046088835663255167673924006521702} +#define T_512_333 {-0.5857978574564388640766310345497913658619,0.8104571982525947682063360844040289521217} +#define T_512_335 
{-0.5657318107836132314858446079597342759371,0.8245893027850252909871642259531654417515} +#define T_512_337 {-0.5453249884220464638318048855580855160952,0.8382247055548380787470819086593110114336} +#define T_512_339 {-0.5245896826784689492839675040158908814192,0.8513551931052651955411647577420808374882} +#define T_512_341 {-0.5035383837257175754231752762279938906431,0.8639728561215866964317910969839431345463} +#define T_512_343 {-0.4821837720791227743788454063178505748510,0.8760700941954066012229418447532225400209} +#define T_512_345 {-0.4605387109582400051444039945636177435517,0.8876396204028539349550896986329462379217} +#define T_512_347 {-0.4386162385385276585303415686212247237563,0.8986744656939538167250702827004715800285} +#define T_512_349 {-0.4164295600976372080381793239212129265070,0.9091679830905223802517411968437954783440} +#define T_512_351 {-0.3939920400610480988312644967663800343871,0.9191138516900577704049624117033090442419} +#define T_512_353 {-0.3713171939518375430644425705395406112075,0.9285060804732155892438072442018892616034} +#define T_512_355 {-0.3484186802494345647218665362743195146322,0.9373390119125749597728258777351584285498} +#define T_512_357 {-0.3253102921622629262188297616376075893641,0.9456073253805212797118429080001078546047} +#define T_512_359 {-0.3020059493192280841711294669948983937502,0.9533060403541938621074791626597288995981} +#define T_512_361 {-0.2785196893850531152381222455005627125502,0.9604305194155657865451303223380818963051} +#define T_512_363 {-0.2548656596045145716900037768937181681395,0.9669764710448520705909913885989226400852} +#define T_512_365 {-0.2310581082806711095134488687108387239277,0.9729399522055601767789312361855991184711} +#define T_512_367 {-0.2071113761922185603214785487580229528248,0.9783173707196276547293223302403930574656} +#define T_512_369 {-0.1830398879551409507815407096131821162999,0.9831054874312162850102936317853163927794} +#define T_512_371 
{-0.1588581433338614457007054170389892533422,0.9873014181578584347320770575606729835272} +#define T_512_373 {-0.1345807085071261954833943264020490460098,0.9909026354277800097136719159607309848070} +#define T_512_375 {-0.1102222072938830593757941755939100403339,0.9939069700023560605117722843715455383062} +#define T_512_377 {-0.0857973123444398938497457152152492199093,0.9963126121827780012907282980449963361025} +#define T_512_379 {-0.0613207363022085782944259335636161267757,0.9981181129001491791896683025697711855173} +#define T_512_381 {-0.0368072229413588317137318028926529223099,0.9993223845883495437547594519855920225382} +#define T_512_383 {-0.0122715382857199253874291855481715174392,0.9999247018391445029905639785283710807562} +#define T_512_385 {0.0122715382857199253874291855481715174392,0.9999247018391445029905639785283710807562} +#define T_512_387 {0.0368072229413588317137318028926529223099,0.9993223845883495437547594519855920225382} +#define T_512_389 {0.0613207363022085782944259335636161267757,0.9981181129001491791896683025697711855173} +#define T_512_391 {0.0857973123444398938497457152152492199093,0.9963126121827780012907282980449963361025} +#define T_512_393 {0.1102222072938830593757941755939100403339,0.9939069700023560605117722843715455383062} +#define T_512_395 {0.1345807085071261954833943264020490460098,0.9909026354277800097136719159607309848070} +#define T_512_397 {0.1588581433338614457007054170389892533422,0.9873014181578584347320770575606729835272} +#define T_512_399 {0.1830398879551409507815407096131821162999,0.9831054874312162850102936317853163927794} +#define T_512_401 {0.2071113761922185603214785487580229528248,0.9783173707196276547293223302403930574656} +#define T_512_403 {0.2310581082806711095134488687108387239277,0.9729399522055601767789312361855991184711} +#define T_512_405 {0.2548656596045145716900037768937181681395,0.9669764710448520705909913885989226400852} +#define T_512_407 
{0.2785196893850531152381222455005627125502,0.9604305194155657865451303223380818963051} +#define T_512_409 {0.3020059493192280841711294669948983937502,0.9533060403541938621074791626597288995981} +#define T_512_411 {0.3253102921622629262188297616376075893641,0.9456073253805212797118429080001078546047} +#define T_512_413 {0.3484186802494345647218665362743195146322,0.9373390119125749597728258777351584285498} +#define T_512_415 {0.3713171939518375430644425705395406112075,0.9285060804732155892438072442018892616034} +#define T_512_417 {0.3939920400610480988312644967663800343871,0.9191138516900577704049624117033090442419} +#define T_512_419 {0.4164295600976372080381793239212129265070,0.9091679830905223802517411968437954783440} +#define T_512_421 {0.4386162385385276585303415686212247237563,0.8986744656939538167250702827004715800285} +#define T_512_423 {0.4605387109582400051444039945636177435517,0.8876396204028539349550896986329462379217} +#define T_512_425 {0.4821837720791227743788454063178505748510,0.8760700941954066012229418447532225400209} +#define T_512_427 {0.5035383837257175754231752762279938906431,0.8639728561215866964317910969839431345463} +#define T_512_429 {0.5245896826784689492839675040158908814192,0.8513551931052651955411647577420808374882} +#define T_512_431 {0.5453249884220464638318048855580855160952,0.8382247055548380787470819086593110114336} +#define T_512_433 {0.5657318107836132314858446079597342759371,0.8245893027850252909871642259531654417515} +#define T_512_435 {0.5857978574564388640766310345497913658619,0.8104571982525947682063360844040289521217} +#define T_512_437 {0.6055110414043255451232994346355553716421,0.7958369046088835663255167673924006521702} +#define T_512_439 {0.6248594881423863434122267790371552109718,0.7807372285720944882214666904474142938852} +#define T_512_441 {0.6438315428897914971528848582238424569368,0.7651672656224589585960416115995030850172} +#define T_512_443 
{0.6624157775901717837285787027212791144848,0.7491363945234593701982817037787754088640} +#define T_512_445 {0.6806009977954530221211371099343523383141,0.7326542716724128156968731673259753733873} +#define T_512_447 {0.6983762494089728045665310673939529806376,0.7157308252838187057065510998654644936323} +#define T_512_449 {0.7157308252838187057065510998654644936323,0.6983762494089728045665310673939529806376} +#define T_512_451 {0.7326542716724128156968731673259753733873,0.6806009977954530221211371099343523383141} +#define T_512_453 {0.7491363945234593701982817037787754088640,0.6624157775901717837285787027212791144848} +#define T_512_455 {0.7651672656224589585960416115995030850172,0.6438315428897914971528848582238424569368} +#define T_512_457 {0.7807372285720944882214666904474142938852,0.6248594881423863434122267790371552109718} +#define T_512_459 {0.7958369046088835663255167673924006521702,0.6055110414043255451232994346355553716421} +#define T_512_461 {0.8104571982525947682063360844040289521217,0.5857978574564388640766310345497913658619} +#define T_512_463 {0.8245893027850252909871642259531654417515,0.5657318107836132314858446079597342759371} +#define T_512_465 {0.8382247055548380787470819086593110114336,0.5453249884220464638318048855580855160952} +#define T_512_467 {0.8513551931052651955411647577420808374882,0.5245896826784689492839675040158908814192} +#define T_512_469 {0.8639728561215866964317910969839431345463,0.5035383837257175754231752762279938906431} +#define T_512_471 {0.8760700941954066012229418447532225400209,0.4821837720791227743788454063178505748510} +#define T_512_473 {0.8876396204028539349550896986329462379217,0.4605387109582400051444039945636177435517} +#define T_512_475 {0.8986744656939538167250702827004715800285,0.4386162385385276585303415686212247237563} +#define T_512_477 {0.9091679830905223802517411968437954783440,0.4164295600976372080381793239212129265070} +#define T_512_479 
{0.9191138516900577704049624117033090442419,0.3939920400610480988312644967663800343871} +#define T_512_481 {0.9285060804732155892438072442018892616034,0.3713171939518375430644425705395406112075} +#define T_512_483 {0.9373390119125749597728258777351584285498,0.3484186802494345647218665362743195146322} +#define T_512_485 {0.9456073253805212797118429080001078546047,0.3253102921622629262188297616376075893641} +#define T_512_487 {0.9533060403541938621074791626597288995981,0.3020059493192280841711294669948983937502} +#define T_512_489 {0.9604305194155657865451303223380818963051,0.2785196893850531152381222455005627125502} +#define T_512_491 {0.9669764710448520705909913885989226400852,0.2548656596045145716900037768937181681395} +#define T_512_493 {0.9729399522055601767789312361855991184711,0.2310581082806711095134488687108387239277} +#define T_512_495 {0.9783173707196276547293223302403930574656,0.2071113761922185603214785487580229528248} +#define T_512_497 {0.9831054874312162850102936317853163927794,0.1830398879551409507815407096131821162999} +#define T_512_499 {0.9873014181578584347320770575606729835272,0.1588581433338614457007054170389892533422} +#define T_512_501 {0.9909026354277800097136719159607309848070,0.1345807085071261954833943264020490460098} +#define T_512_503 {0.9939069700023560605117722843715455383062,0.1102222072938830593757941755939100403339} +#define T_512_505 {0.9963126121827780012907282980449963361025,0.0857973123444398938497457152152492199093} +#define T_512_507 {0.9981181129001491791896683025697711855173,0.0613207363022085782944259335636161267757} +#define T_512_509 {0.9993223845883495437547594519855920225382,0.0368072229413588317137318028926529223099} +#define T_512_511 {0.9999247018391445029905639785283710807562,0.0122715382857199253874291855481715174392} +// Pre-computed twiddles for N=576 +#define T_576_1 {0.9999405050001497485823165334295481443405,-0.0109080914941823738095738605125006870367} +#define T_576_5 
{0.9985129789397630961644836133928038179874,-0.0545145016380063213157214363491220865399} +#define T_576_7 {0.9970861323044666324477702801232226192951,-0.0762839745039523009495496808085590600967} +#define T_576_11 {0.9928096670090550412979268912749830633402,-0.1197036553049625196010197214491199702024} +#define T_576_13 {0.9899620837148078722478317104105371981859,-0.1413331978235674857291570560846594162285} +#define T_576_17 {0.9828549916653012141765088927058968693018,-0.1843802195426633983021957874370855279267} +#define T_576_19 {0.9785988655009383307969983434304594993591,-0.2057772106922348964719304831305635161698} +#define T_576_23 {0.9686915803565930849217124887218233197927,-0.2482672393737162808147189707597135566175} +#define T_576_25 {0.9630451367077627056190181065176147967577,-0.2693400539532260218322790024103596806526} +#define T_576_29 {0.9503800829845611142232542079000268131495,-0.3110911407710909570845103644387563690543} +#define T_576_31 {0.9433675007898252307114717041258700191975,-0.3317495417533520840613903146731900051236} +#define T_576_35 {0.9279989121921845551810292818117886781693,-0.3725829021441029453853843733668327331543} +#define T_576_37 {0.9196502204050922557598823914304375648499,-0.3927384270845740710242921522876713424921} +#define T_576_41 {0.9016439075888160736838017328409478068352,-0.4324792063301661770857720057392725721002} +#define T_576_43 {0.8919948565893142111704605667910072952509,-0.4520455461767197147260333167650969699025} +#define T_576_47 {0.8714279253502613675763655010086949914694,-0.4905235681593079766216192183492239564657} +#define T_576_49 {0.8605198338560789972362385924498084932566,-0.5094169368408417719606973150803241878748} +#define T_576_53 {0.8374803549513079037325269382563419640064,-0.5464674327630434369496015278855338692665} +#define T_576_59 {0.7999465651001297361588626699813175946474,-0.6000712399244809702736347389873117208481} +#define T_576_67 
{0.7446028722923262455068993403983768075705,-0.6675077247298474292236392102495301514864} +#define T_576_71 {0.7147778973340463970487235201289877295494,-0.6993515264033670630894334863114636391401} +#define T_576_77 {0.6675077247298474292236392102495301514864,-0.7446028722923262455068993403983768075705} +#define T_576_91 {0.5464674327630434369496015278855338692665,-0.8374803549513079037325269382563419640064} +#define T_576_119 {0.2693400539532260218322790024103596806526,-0.9630451367077627056190181065176147967577} +#define T_576_133 {0.1197036553049625196010197214491199702024,-0.9928096670090550412979268912749830633402} +#define T_576_161 {-0.1843802195426633983021957874370855279267,-0.9828549916653012141765088927058968693018} +#define T_576_175 {-0.3317495417533520840613903146731900051236,-0.9433675007898252307114717041258700191975} +#define T_576_203 {-0.6000712399244809702736347389873117208481,-0.7999465651001297361588626699813175946474} +#define T_576_217 {-0.7147778973340463970487235201289877295494,-0.6993515264033670630894334863114636391401} +#define T_576_245 {-0.8919948565893142111704605667910072952509,-0.4520455461767197147260333167650969699025} +#define T_576_259 {-0.9503800829845611142232542079000268131495,-0.3110911407710909570845103644387563690543} +#define T_576_287 {-0.9999405050001497485823165334295481443405,-0.0109080914941823738095738605125006870367} +#define T_576_301 {-0.9899620837148078722478317104105371981859,0.1413331978235674857291570560846594162285} +#define T_576_329 {-0.9016439075888160736838017328409478068352,0.4324792063301661770857720057392725721002} +// Pre-computed twiddles for N=625 +#define T_625_1 {0.9999494680510517818916582655219826847315,-0.0100529271567306524581830728948261821643} +#define T_625_2 {0.9997978773111629857694993006589356809855,-0.0201048383254575774303773982865095604211} +#define T_625_3 {0.9995452431006844884464612732699606567621,-0.0301547176208567559163586935255807475187} +#define T_625_4 
{0.9991915909518144234269243497692514210939,-0.0402015493629532053021513604562642285600} +#define T_625_6 {0.9981813860104127300587606441695243120193,-0.0602820091099435170778342296671326039359} +#define T_625_7 {0.9975249353131302454400497481401544064283,-0.0703136077053038016648756070026138331741} +#define T_625_8 {0.9967676708576360677938055232516489923000,-0.0803381001333942218467498719292052555829} +#define T_625_9 {0.9959096691760279673744093997811432927847,-0.0903544732799355793106954592985857743770} +#define T_625_11 {0.9938918111585802739327277777192648500204,-0.1103588134763917733005555987801926676184} +#define T_625_12 {0.9927321587553373793966215998807456344366,-0.1203447588097115023009209267002006527036} +#define T_625_13 {0.9914721769705635567859758339182008057833,-0.1303185416326145262821967207855777814984} +#define T_625_14 {0.9901119931429293030689109400555025786161,-0.1402791539557319833164683586801402270794} +#define T_625_16 {0.9870915793339134491901631918153725564480,-0.1601568419021817957226261341929784975946} +#define T_625_17 {0.9854316546073247362613756195059977471828,-0.1700719086088891551789004097372526302934} +#define T_625_18 {0.9836721383166108223505830210342537611723,-0.1799697871855897857162887021331698633730} +#define T_625_19 {0.9818132082853465725236219441285356879234,-0.1898494773140938784994347088286303915083} +#define T_625_21 {0.9777978685121797575163782312301918864250,-0.2095503002456405583497911493395804427564} +#define T_625_22 {0.9756418645761659735882176391896791756153,-0.2193694420067258954443190077654435299337} +#define T_625_23 {0.9733872584703672226069670614378992468119,-0.2291664134369226779597283893963322043419} +#define T_625_24 {0.9710342780540647256160013967019040137529,-0.2389402244161105970388092600842355750501} +#define T_625_26 {0.9660341554134970198930432161432690918446,-0.2584144163442346764369972333952318876982} +#define T_625_27 
{0.9633875185211158154174881929066032171249,-0.2681128291554261688922622397512895986438} +#define T_625_28 {0.9606435179309280592008235544199123978615,-0.2777841454390272413199625134438974782825} +#define T_625_29 {0.9578024309623294207938215549802407622337,-0.2874273877741164162635811862855916842818} +#define T_625_31 {0.9518301561981014069502293750701937824488,-0.3066257551998819441685384390439139679074} +#define T_625_32 {0.9486995719838390295208796487713698297739,-0.3161789400287130513689248800801578909159} +#define T_625_33 {0.9454731084928994500060639438743237406015,-0.3257001705814321357301821535656927153468} +#define T_625_34 {0.9421510918042592663823597831651568412781,-0.3351884846053667099674555629462702199817} +#define T_625_36 {0.9352217514007470366976804143632762134075,-0.3540625307864689874648433942638803273439} +#define T_625_37 {0.9316151279920259486999611908686347305775,-0.3634463554589605904787674717226764187217} +#define T_625_38 {0.9279143519271306761453388389782048761845,-0.3727934488260932766046096276113530620933} +#define T_625_39 {0.9241197972209157507350596461037639528513,-0.3821028662341773296517999369825702160597} +#define T_625_41 {0.9162508952933617889513584486849140375853,-0.4006049136919229147757448572519933804870} +#define T_625_42 {0.9121773433339237513450825645122677087784,-0.4097956738525492248470527556492015719414} +#define T_625_43 {0.9080116031765956829957531226682476699352,-0.4189450184650349573445282658212818205357} +#define T_625_44 {0.9037540958273154734214926975255366414785,-0.4280520228609502875194436910533113405108} +#define T_625_46 {0.8949655099049148576639822749712038785219,-0.4461353338177057592517371631402056664228} +#define T_625_47 {0.8904353195405447785049091180553659796715,-0.4551098128086537575320846826798515394330} +#define T_625_48 {0.8858151383119573507585187144286464899778,-0.4640382966279483212268530678556999191642} +#define T_625_49 
{0.8811054331526765315274474232865031808615,-0.4729198829282125204898079573467839509249} +#define T_625_51 {0.8714193639662127788270140626991633325815,-0.4905387773710874199650788796134293079376} +#define T_625_52 {0.8664439788509370465874326328048482537270,-0.4992743048795490468094726566050667315722} +#define T_625_53 {0.8613810275298499075091740451171062886715,-0.5079593737806400444156906814896501600742} +#define T_625_54 {0.8562310216845464561075118581356946378946,-0.5165931063274438184151904351892881095409} +#define T_625_56 {0.8456719370854187678432367647474166005850,-0.5337030773999673627372430928517132997513} +#define T_625_57 {0.8402639254738420993007252945972140878439,-0.5421775867253177949578457628376781940460} +#define T_625_58 {0.8347709935146954141060859910794533789158,-0.5505973014704015344733534220722503960133} +#define T_625_59 {0.8291936963450933228969574884104076772928,-0.5589613707060275826776774010795634239912} +#define T_625_61 {0.8177882694974004662569200263533275574446,-0.5755191971363310399212309675931464880705} +#define T_625_62 {0.8119612924962041899945575096353422850370,-0.5837112809325288864670255861710757017136} +#define T_625_63 {0.8060522555218471163485105535073671489954,-0.5918443725914295239221019073738716542721} +#define T_625_64 {0.8000617557646388933179082414426375180483,-0.5999176501510879999656822292308788746595} +#define T_625_66 {0.7878387977664520480658438827958889305592,-0.6158815054325884608132923858647700399160} +#define T_625_67 {0.7816075748252524491022086294833570718765,-0.6237704697849901780415393659495748579502} +#define T_625_68 {0.7752973595759157232620850663806777447462,-0.6315963934623227693521130277076736092567} +#define T_625_69 {0.7689087897533913862346821588289458304644,-0.6393584855462347560361990872479509562254} +#define T_625_71 {0.7558991768574195857510744644969236105680,-0.6546880435950205301409710045845713466406} +#define T_625_72 
{0.7492794485861613562605043625808320939541,-0.6622539602950049397733778278052341192961} +#define T_625_73 {0.7425839952132153864994279501843266189098,-0.6697529470283645203210198815213516354561} +#define T_625_74 {0.7358134934071977051317503537575248628855,-0.6771842459182698670261402185133192688227} +#define T_625_76 {0.7220500890246939196259745585848577320576,-0.6918407829402886122238669486250728368759} +#define T_625_77 {0.7150585774315030285208649729611352086067,-0.6990645398256409848158909881021827459335} +#define T_625_78 {0.7079947992312527471625571706681512296200,-0.7062176465237173239586354611674323678017} +#define T_625_79 {0.7008594683169017125479172136692795902491,-0.7132993801136727762823852572182659059763} +#define T_625_81 {0.6863770400018340822612117335665971040726,-0.7272458724243958672062149162229616194963} +#define T_625_82 {0.6790314062517740500979357420874293893576,-0.7341092216582886242903782658686395734549} +#define T_625_83 {0.6716171469410040506531345272378530353308,-0.7408983789527587848766643219278194010258} +#define T_625_84 {0.6641350113834701973658525275823194533587,-0.7476126581691069361568224849179387092590} +#define T_625_86 {0.6489701430066630027937435443163849413395,-0.7608138757185697320650774599926080554724} +#define T_625_87 {0.6412889428080990050773380062310025095940,-0.7672994798851818787355227868829388171434} +#define T_625_88 {0.6335429314492969643524133971368428319693,-0.7737075377755030514137501995719503611326} +#define T_625_89 {0.6257328917723579131404676445527002215385,-0.7800374017662247139170972332067321985960} +#define T_625_91 {0.6099238911074591484862139623146504163742,-0.7924599971331904857407835152116604149342} +#define T_625_92 {0.6019265278387284645589261344866827130318,-0.7985514730335249167225697419780772179365} +#define T_625_93 {0.5938683315288471042947548994561657309532,-0.8045622442093236514892851118929684162140} +#define T_625_94 
{0.5857501165705443213482794817537069320679,-0.8104917031886222922310025751357898116112} +#define T_625_96 {0.5693369185252644681938249959785025566816,-0.8221042958190623028968957441975362598896} +#define T_625_97 {0.5610435942200587833283975669473875313997,-0.8277862558563279238299514872778672724962} +#define T_625_98 {0.5526935686623315469034878333332017064095,-0.8333845565879515193685733720485586673021} +#define T_625_99 {0.5442876857382129562878958495275583118200,-0.8388986322278397800289440056076273322105} +#define T_625_101 {0.5273117514752824197543645823316182941198,-0.8496718877049364015974219910276588052511} +#define T_625_102 {0.5187434157905574938496329195913858711720,-0.8549299787549532458896806019765790551901} +#define T_625_103 {0.5101226538742247296198684125556610524654,-0.8601016672488885905778488449868746101856} +#define T_625_104 {0.5014503369740861415948529611341655254364,-0.8651864305157446199956439158995635807514} +#define T_625_106 {0.4839545491767599516563791439693886786699,-0.8750931346611736660889846461941488087177} +#define T_625_107 {0.4751328464720838140600278620695462450385,-0.8799140743296104405501978362735826522112} +#define T_625_108 {0.4662631249899246155266041569120716303587,-0.8846460864518815858659195328073110431433} +#define T_625_109 {0.4573462811389086279945104251964949071407,-0.8892886927923970841192158331978134810925} +#define T_625_111 {0.4393748356855865511150227575853932648897,-0.8983038204117045344432312958815600723028} +#define T_625_112 {0.4303220503476088421024314811802469193935,-0.9026754305865591909707745799096301198006} +#define T_625_113 {0.4212257749858722144509215468133334070444,-0.9069558128638635086105068694450892508030} +#define T_625_114 {0.4120869289054210904410524562990758568048,-0.9111445346514999155118630369543097913265} +#define T_625_116 {0.3936852232270234286382049049279885366559,-0.9192453127499148335388667874212842434645} +#define T_625_117 
{0.3844242233771806849240704195835860446095,-0.9231565503644826309326276714273262768984} +#define T_625_118 {0.3751243721168776579233394841139670461416,-0.9269744901797019043243608393822796642780} +#define T_625_119 {0.3657866093253325234790906961279688403010,-0.9306987463396931836712155927671119570732} +#define T_625_121 {0.3470011277274495098410511673137079924345,-0.9378647116486887869157840214029420167208} +#define T_625_122 {0.3375553074551057575192203330516349524260,-0.9413056965773068940350754019164014607668} +#define T_625_123 {0.3280753725276349674899734054633881896734,-0.9446515494831168036071744609216693788767} +#define T_625_124 {0.3185622810242127456170635468879481777549,-0.9479019322211821307888612864189781248569} +#define T_625_126 {0.2994404772637143663516212654940318316221,-0.9541149828904684770947142169461585581303} +#define T_625_127 {0.2898336975306610163727327744709327816963,-0.9570770229065711509619518437830265611410} +#define T_625_128 {0.2801976260743935798203096965153235942125,-0.9599423369881517897894696034200023859739} +#define T_625_129 {0.2705332367538536564310902576835360378027,-0.9627106355554005956420837719633709639311} +#define T_625_131 {0.2511234141665813490718051070871297270060,-0.9679550768797691162603769043926149606705} +#define T_625_132 {0.2413799425321770664165654807220562361181,-0.9704306896132062210114099798374809324741} +#define T_625_133 {0.2316120760999064442930972518297494389117,-0.9728082268385126463172696276160422712564} +#define T_625_134 {0.2218208020484253029103172139002708718181,-0.9750874482725092606827388408419210463762} +#define T_625_136 {0.2021719915307735038201286670300760306418,-0.9793500323380199823475322773447260260582} +#define T_625_137 {0.1923164408499827837673024077957961708307,-0.9813329641761735500082863836723845452070} +#define T_625_138 {0.1824414539200502061166986322859884239733,-0.9832167186798331792729754852189216762781} +#define T_625_139 
{0.1725480287456465922080184327569440938532,-0.9850011054694260970165942126186564564705} +#define T_625_141 {0.1527098648982631867720982654645922593772,-0.9882710646187887748581601954356301575899} +#define T_625_142 {0.1427671311474489335147097790468251332641,-0.9897563065037409124613532185321673750877} +#define T_625_143 {0.1328099687938694151156937550695147365332,-0.9911415197583902658706733745930250734091} +#define T_625_144 {0.1228393841471641101392719974683132022619,-0.9924265643876858389305084529041778296232} +#define T_625_146 {0.1028619798936414586609799926009145565331,-0.9946956384202959577933711443620268255472} +#define T_625_147 {0.0928571792811678370371808455274731386453,-0.9956794385021441318173174295225180685520} +#define T_625_148 {0.0828429941602083913521070712704386096448,-0.9965626113388820206395735112891998142004} +#define T_625_149 {0.0728204366033457090034985981219506356865,-0.9973450676736201891969813004834577441216} +#define T_625_151 {0.0527542566006264715405826848382275784388,-0.9986075247115431263722484800382517278194} +#define T_625_152 {0.0427126621211368767694338544060883577913,-0.9990873978258987264666757255326956510544} +#define T_625_153 {0.0326667509335237943313146047330519650131,-0.9994662992734908435465968068456277251244} +#define T_625_154 {0.0226175383167297565367714895501194405369,-0.9997441907610622457980298349866643548012} +#define T_625_156 {0.0025132714770037269634561649667148230947,-0.9999968417282540933399559435201808810234} +#define T_625_157 {-0.0075397509303570921582182684517192683415,-0.9999715756739829819466081062273588031530} +#define T_625_158 {-0.0175920113410997225322773829248035326600,-0.9998452485944885337332266317389439791441} +#define T_625_159 {-0.0276424938346043792825046381267384276725,-0.9996178732568780089806637079163920134306} +#define T_625_161 {-0.0477340623884916423480540004220529226586,-0.9988600799350685344180078573117498308420} +#define T_625_162 
{-0.0577731179166413380543509958897629985586,-0.9983297385364165998922203471011016517878} +#define T_625_163 {-0.0678063346683007578397095471700595226139,-0.9976985020430021444681756292993668466806} +#define T_625_164 {-0.0778326986474765225576533111961907707155,-0.9969664342500459408569213337614201009274} +#define T_625_166 {-0.0978608158696515489927136854930722620338,-0.9952001108909334092800236248876899480820} +#define T_625_167 {-0.1078605449930568199734537415679369587451,-0.9941660338363007554107753094285726547241} +#define T_625_168 {-0.1178493733093558537561307275609578937292,-0.9930314824871324663746463556890375912189} +#define T_625_169 {-0.1278262913086236074455825928453123196959,-0.9917965715056101805302546381426509469748} +#define T_625_171 {-0.1477403644367836499373680680946563370526,-0.9890261799952952959102958629955537617207} +#define T_625_172 {-0.1576755069718226431874086301831994205713,-0.9874909794530675188894974780851043760777} +#define T_625_173 {-0.1675947142055242422564731441525509580970,-0.9858559792233186902521424599399324506521} +#define T_625_174 {-0.1774969836641414056011001321166986599565,-0.9841213445455451180876593753055203706026} +#define T_625_176 {-0.1972467080212628731672452886414248496294,-0.9803538831334196457234497756871860474348} +#define T_625_177 {-0.2070921669356412253559795999535708688200,-0.9783214371534033615418479712388943880796} +#define T_625_178 {-0.2169166963084052934807033352626604028046,-0.9761901181955526807243472831032704561949} +#define T_625_179 {-0.2267193032343216008150932339049177244306,-0.9739601416592692517681939534668345004320} +#define T_625_181 {-0.2462547893026399703497730797607800923288,-0.9692051272798304362154908631055150181055} +#define T_625_182 {-0.2559856941126726903590338224603328853846,-0.9666805699969629461776321477373130619526} +#define T_625_183 {-0.2656907280106526325624827222782187163830,-0.9640583162076707957055532460799440741539} +#define T_625_184 
{-0.2753689101680248940695605597284156829119,-0.9613386309271433338707879556750413030386} +#define T_625_186 {-0.2946408096142865695732382391724968329072,-0.9556080751593917232966646224667783826590} +#define T_625_187 {-0.3042325792098980463684654296230291947722,-0.9525977838244708006953942458494566380978} +#define T_625_188 {-0.3137936018751875111298943465953925624490,-0.9494912192443889509263499348890036344528} +#define T_625_189 {-0.3233229113359364914614957342564594000578,-0.9462886953806719514403766879695467650890} +#define T_625_191 {-0.3422825416695712275050311745872022584081,-0.9395970741058202735374038638838101178408} +#define T_625_192 {-0.3517109464083127967626296594971790909767,-0.9361086529760148655654461435915436595678} +#define T_625_193 {-0.3611038058678774920196019593277014791965,-0.9325256250568850768800643891154322773218} +#define T_625_194 {-0.3704601707692759537948745673929806798697,-0.9288483524631988563768913991225417703390} +#define T_625_196 {-0.3890596383185932483073088405944872647524,-0.9212125692973394119889007924939505755901} +#define T_625_197 {-0.3983008612318227892323818650766042992473,-0.9172548304271764818551559983461629599333} +#define T_625_198 {-0.4075018303074807168862037087819771841168,-0.9132043902084862052603853044274728745222} +#define T_625_199 {-0.4166616156597678921613692182290833443403,-0.9090616579945454134303872706368565559387} +#define T_625_201 {-0.4348539365555282865116737411881331354380,-0.9005010015886493190961914478975813835859} +#define T_625_202 {-0.4438846335121400255019352698582224547863,-0.8960839425699989035933867853600531816483} +#define T_625_203 {-0.4528704697574726933950728380295913666487,-0.8915763218152700231300400446343701332808} +#define T_625_204 {-0.4618105371478895548875698295887559652328,-0.8869785948821864440816398200695402920246} +#define T_625_206 {-0.4795497560086358079800561426964122802019,-0.8775146901972968871419311653880868107080} +#define T_625_207 
{-0.4883471146843615384725012518174480646849,-0.8726494689045877217736801867431495338678} +#define T_625_208 {-0.4970951190971508171223547378758667036891,-0.8676960542550539168260570477286819368601} +#define T_625_209 {-0.5057928851395788738543046747508924454451,-0.8626549468600877013102490309393033385277} +#define T_625_211 {-0.5230341911589180270425458729732781648636,-0.8523117005407917501358383560727816075087} +#define T_625_212 {-0.5315759886622380969711798570642713457346,-0.8470106069452519870210949193278793245554} +#define T_625_213 {-0.5400640630241156925350765050097834318876,-0.8416239111562147101608388766180723905563} +#define T_625_214 {-0.5484975564066704167842658534937072545290,-0.8361521575741531764336400556203443557024} +#define T_625_216 {-0.5651973965492598228621545786154456436634,-0.8249556975583226536841152665147092193365} +#define T_625_217 {-0.5734620555583554990519701277662534266710,-0.8192321226824456070758628811745438724756} +#define T_625_218 {-0.5816687582568207393052261977572925388813,-0.8134257530149683335096710834477562457323} +#define T_625_219 {-0.5898166752432919546222933604440186172724,-0.8075371753702420551945806437288410961628} +#define T_625_221 {-0.6059328642635862793852652430359739810228,-0.7955157848875950010736346484918612986803} +#define T_625_222 {-0.6138995075325276440381117026845458894968,-0.7893841869782546316614002535061445087194} +#define T_625_223 {-0.6218041077243211534053557443257886916399,-0.7831728108260403997675780374265741556883} +#define T_625_224 {-0.6296458659692600878798884878051467239857,-0.7768822841768375786486444667389150708914} +#define T_625_226 {-0.6451376929751027056170187279349192976952,-0.7640663303030451602992911830369848757982} +#define T_625_227 {-0.6527861960715837241764347709249705076218,-0.7575421983087092803188511425105389207602} +#define T_625_228 {-0.6603687260505968215085204064962454140186,-0.7509415061469912888725275479373522102833} +#define T_625_229 
{-0.6678845165921063475877872406272217631340,-0.7442649209095699713856220114394091069698} +#define T_625_231 {-0.6827128478839764591512562219577375799417,-0.7306867778563879145536930082016624510288} +#define T_625_232 {-0.6900238900253774509963022865122184157372,-0.7237865923006904234071612336265388876200} +#define T_625_233 {-0.6972651956628107550173467643617186695337,-0.7168132580507298179384179093176499009132} +#define T_625_234 {-0.7044360329617029847781850548926740884781,-0.7097674798588468059179490410315338522196} +#define T_625_236 {-0.7185634108878365955419553756655659526587,-0.6954614471940469000443840741354506462812} +#define T_625_237 {-0.7255185237471978298984254251990932971239,-0.6882026385445545457741900463588535785675} +#define T_625_238 {-0.7324003128765530501453895340091548860073,-0.6808742774538681929996641883917618542910} +#define T_625_239 {-0.7392080827754685667940748317050747573376,-0.6734771045547250922780335713468957692385} +#define T_625_241 {-0.7525988203531378539778984304575715214014,-0.6584793205584099640148565413255710154772} +#define T_625_242 {-0.7591804347117565576752440392738208174706,-0.6508802251957486184608114854199811816216} +#define T_625_243 {-0.7656853233364364630375575870857574045658,-0.6432153493404652033760271478968206793070} +#define T_625_244 {-0.7721128288177779852574644792184699326754,-0.6354854676347906350386551821429748088121} +#define T_625_246 {-0.7847330998817070302209231158485636115074,-0.6198338180109623785085659619653597474098} +#define T_625_247 {-0.7909245900105082993647442890505772083998,-0.6119136319095279663571318451431579887867} +#define T_625_248 {-0.7970361462173010114184990015928633511066,-0.6039316034312769421177335971151478588581} +#define T_625_249 {-0.8030671508443926187581496378697920590639,-0.5958885392711202877435994196275714784861} +#define T_625_251 {-0.8148850754945862195910422087763436138630,-0.5796225614451032681628817044838797301054} +#define T_625_252 
{-0.8206708011521579582137064790003933012486,-0.5714012916823650867215178550395648926497} +#define T_625_253 {-0.8263735866196754153989445512706879526377,-0.5631222738778264291781283645832445472479} +#define T_625_254 {-0.8319928555514105594781426589179318398237,-0.5547863447412973991745843704848084598780} +#define T_625_256 {-0.8429785806845486728633431994239799678326,-0.5379471279847712894550681994587648659945} +#define T_625_257 {-0.8483439266257484234756702790036797523499,-0.5294455422016571821686170551402028650045} +#define T_625_258 {-0.8536235356229666670913047710200771689415,-0.5208904485883242285737537713430356234312} +#define T_625_259 {-0.8588168740983388449450330881518311798573,-0.5122827117558800269492280676786322146654} +#define T_625_261 {-0.8689426488193063846665609162300825119019,-0.4949127933918132660195965399907436221838} +#define T_625_262 {-0.8738740617146393274694560204807203263044,-0.4861523673318466909343271709076361730695} +#define T_625_263 {-0.8787171574910247473511049065564293414354,-0.4773428088186660311897924202639842405915} +#define T_625_264 {-0.8834714466863258230944211391033604741096,-0.4684850081805932986434015674603870138526} +#define T_625_266 {-0.8927116924093823069341624432126991450787,-0.4506282661302732917008029289718251675367} +#define T_625_267 {-0.8971967150818874747741915598453488200903,-0.4416311293899813361640838138555409386754} +#define T_625_268 {-0.9015910635571866560411535829189233481884,-0.4325893596863208401259726088028401136398} +#define T_625_269 {-0.9058942937252941751680168636084999889135,-0.4235038708157818909683101082919165492058} +#define T_625_271 {-0.9142256687878009824288483287091366946697,-0.4052054127593781029936792492662789300084} +#define T_625_272 {-0.9182529716809616493478074517042841762304,-0.3959942928870098799443155712651787325740} +#define T_625_273 {-0.9221874723495493419989088579313829541206,-0.3867431522878577720980786125437589362264} +#define T_625_274 
{-0.9260287731575901348790580414060968905687,-0.3774529259182506923941957666102098301053} +#define T_625_276 {-0.9334302317830763806583149744255933910608,-0.3587589753488995003571915276552317664027} +#define T_625_277 {-0.9369896415802629885405394816189073026180,-0.3493571404326670237772134441911475732923} +#define T_625_278 {-0.9404543555519829789446362156013492494822,-0.3399199981220644151136411892366595566273} +#define T_625_279 {-0.9438240235407373557308119416120462119579,-0.3304485021714786974200706026749685406685} +#define T_625_281 {-0.9502768690023574160719022074772510677576,-0.3114062816307281988947863737848820164800} +#define T_625_282 {-0.9533593943255084246501951383834239095449,-0.3018374815215959516834232090332079678774} +#define T_625_283 {-0.9563455694321727484918937989277765154839,-0.2922381765400498543350238378479843959212} +#define T_625_284 {-0.9592350925278542161933614806912373751402,-0.2826093368292684759879307421215344220400} +#define T_625_286 {-0.9647230243779428837669343010929878801107,-0.2632669486188402574455835747357923537493} +#define T_625_287 {-0.9673208785005658061351141441264189779758,-0.2535553549363405156924500261084176599979} +#define T_625_288 {-0.9698209714046909235563020956760738044977,-0.2438181359613384724394080649290117435157} +#define T_625_289 {-0.9722230504211841761730283906217664480209,-0.2340562757751382949589213922081398777664} +#define T_625_291 {-0.9767322056676015495924048082088120281696,-0.2144625804463383411668786493464722298086} +#define T_625_292 {-0.9788388261847201787801964201207738369703,-0.2046327255189626659337420733209000900388} +#define T_625_293 {-0.9808465214346523408650568853772711008787,-0.1947821896107088668337325998436426743865} +#define T_625_294 {-0.9827550885118900847459144642925821244717,-0.1849119682551322307872965211572591215372} +#define T_625_296 {-0.9862740766371433931425372065859846770763,-0.1651164611828580541796895886363927274942} +#define T_625_297 
{-0.9878841420425021890849848205107264220715,-0.1551931760772660706670933450368465855718} +#define T_625_298 {-0.9893943680257961670321265046368353068829,-0.1452542065443727936635553987798630259931} +#define T_625_299 {-0.9908046019577008145517993398243561387062,-0.1353005570551802927425910638703498989344} +#define T_625_301 {-0.9933245336923797186301499095861800014973,-0.1153532434083082286457866416640172246844} +#define T_625_302 {-0.9944339768210305186002528898825403302908,-0.1053615952038983727412357893626904115081} +#define T_625_303 {-0.9954429185757831533010175917297601699829,-0.0953592987459886270995568224861926864833} +#define T_625_304 {-0.9963512569890512482473354793910402804613,-0.0853473649056469552132142553091398440301} +#define T_625_306 {-0.9978657667668942021776956607936881482601,-0.0652986333296357579492052991554373875260} +#define T_625_307 {-0.9984717850692077512064770417055115103722,-0.0552638617969271883634263531348551623523} +#define T_625_308 {-0.9989768939209825449054847013030666857958,-0.0452235050829326162835641866877267602831} +#define T_625_309 {-0.9993810422739491938770584056328516453505,-0.0351785779052378247411247969012038083747} +#define T_625_311 {-0.9998863043118163540512455256248358637094,-0.0150790732360371205339477640450240869541} +#define T_625_312 {-0.9999873669329657488447082869242876768112,-0.0050265270788188622791414772450480086263} +#define T_625_318 {-0.9984717850692077512064770417055115103722,0.0552638617969271883634263531348551623523} +#define T_625_321 {-0.9963512569890512482473354793910402804613,0.0853473649056469552132142553091398440301} +#define T_625_324 {-0.9933245336923797186301499095861800014973,0.1153532434083082286457866416640172246844} +#define T_625_327 {-0.9893943680257961670321265046368353068829,0.1452542065443727936635553987798630259931} +#define T_625_333 {-0.9788388261847201787801964201207738369703,0.2046327255189626659337420733209000900388} +#define T_625_336 
{-0.9722230504211841761730283906217664480209,0.2340562757751382949589213922081398777664} +#define T_625_339 {-0.9647230243779428837669343010929878801107,0.2632669486188402574455835747357923537493} +#define T_625_342 {-0.9563455694321727484918937989277765154839,0.2922381765400498543350238378479843959212} +#define T_625_348 {-0.9369896415802629885405394816189073026180,0.3493571404326670237772134441911475732923} +#define T_625_351 {-0.9260287731575901348790580414060968905687,0.3774529259182506923941957666102098301053} +#define T_625_354 {-0.9142256687878009824288483287091366946697,0.4052054127593781029936792492662789300084} +#define T_625_357 {-0.9015910635571866560411535829189233481884,0.4325893596863208401259726088028401136398} +#define T_625_363 {-0.8738740617146393274694560204807203263044,0.4861523673318466909343271709076361730695} +#define T_625_366 {-0.8588168740983388449450330881518311798573,0.5122827117558800269492280676786322146654} +#define T_625_369 {-0.8429785806845486728633431994239799678326,0.5379471279847712894550681994587648659945} +#define T_625_372 {-0.8263735866196754153989445512706879526377,0.5631222738778264291781283645832445472479} +// Pre-computed twiddles for N=648 +#define T_648_1 {0.9999529915072261632147387899749446660280,-0.0096961216859783958682550419894141668919} +#define T_648_5 {0.9988250086459503629399137025757227092981,-0.0484623782270029584973158875982335302979} +#define T_648_7 {0.9976974499728976653045720013324171304703,-0.0678218129924093443561261551622010301799} +#define T_648_11 {0.9943173181260184234631083199928980320692,-0.1064568967924682529879731873734272085130} +#define T_648_13 {0.9920660160815423234126342322269920259714,-0.1257180167521627089932678700279211625457} +#define T_648_17 {0.9864451725452738628519000485539436340332,-0.1640911989173239882155996838264400139451} +#define T_648_19 {0.9830777448228600334800830751191824674606,-0.1831888305383266302150957471894798800349} +#define T_648_23 
{0.9752352087524931434359132254030555486679,-0.2211702683668877733591529022305621765554} +#define T_648_25 {0.9707630496616199700099514302564784884453,-0.2400397913090059065499559665113338269293} +#define T_648_29 {0.9607253577167205182973930277512408792973,-0.2775009676380957857055875592777738347650} +#define T_648_31 {0.9551635996281230278626139806874562054873,-0.2960785334086999398550688056275248527527} +#define T_648_35 {0.9429647161808759614487485123390797525644,-0.3328926914756765720326825430674944072962} +#define T_648_37 {0.9363321783233931050460796541301533579826,-0.3511154394727888372607083056209376081824} +#define T_648_41 {0.9220133805339184585037060060130897909403,-0.3871580118200006470630114563391543924809} +#define T_648_43 {0.9143325053161793780631683148385491222143,-0.4049642820326735948022189859329955652356} +#define T_648_47 {0.8979422434636881167335786813055165112019,-0.4401133120043048685410269627027446404099} +#define T_648_49 {0.8892390205361061505939801463682670146227,-0.4574428536505807940670820244122296571732} +#define T_648_53 {0.8708327540784920328675866585399489849806,-0.4915794080553705924785390379838645458221} +#define T_648_55 {0.8611366323925137011841002276923973113298,-0.5083735834518555707361997519910801202059} +#define T_648_59 {0.8407766423091030949521496040688361972570,-0.5413821549953696798951341406791470944881} +#define T_648_61 {0.8301204304712788140108159495866857469082,-0.5575841379685928522746962698874995112419} +#define T_648_65 {0.8078756085237110973196195118362084031105,-0.5893530360933448841720405653177294880152} +#define T_648_67 {0.7962953637817556939992869047273416072130,-0.6049080042615416763496227758878376334906} +#define T_648_71 {0.7722409794060690613193287390458863228559,-0.6353297330724850811023429741908330470324} +#define T_648_73 {0.7597758856425493867448039964074268937111,-0.6501850533471834614474005320516880601645} +#define T_648_77 
{0.7339933312612353377701879253436345607042,-0.6791566753437932035808444197755306959152} +#define T_648_79 {0.7206855664077145284807102143531665205956,-0.6932620820235242398155151022365316748619} +#define T_648_83 {0.6932620820235242398155151022365316748619,-0.7206855664077145284807102143531665205956} +#define T_648_85 {0.6791566753437932035808444197755306959152,-0.7339933312612353377701879253436345607042} +#define T_648_89 {0.6501850533471834614474005320516880601645,-0.7597758856425493867448039964074268937111} +#define T_648_91 {0.6353297330724850811023429741908330470324,-0.7722409794060690613193287390458863228559} +#define T_648_95 {0.6049080042615416763496227758878376334906,-0.7962953637817556939992869047273416072130} +#define T_648_97 {0.5893530360933448841720405653177294880152,-0.8078756085237110973196195118362084031105} +#define T_648_101 {0.5575841379685928522746962698874995112419,-0.8301204304712788140108159495866857469082} +#define T_648_103 {0.5413821549953696798951341406791470944881,-0.8407766423091030949521496040688361972570} +#define T_648_107 {0.5083735834518555707361997519910801202059,-0.8611366323925137011841002276923973113298} +// Pre-computed twiddles for N=729 +#define T_729_1 {0.9999628574261159030811541015282273292542,-0.0086188031766284319001858449382780236192} +#define T_729_2 {0.9998514324636050831784928050183225423098,-0.0172369661041893024178150994885072577745} +#define T_729_4 {0.9994057739990460653700665716314688324928,-0.0344688105012005627236781890587735688314} +#define T_729_5 {0.9990715736028027427195752352417912334204,-0.0430812119005435992247221577144955517724} +#define T_729_7 {0.9981805546111732763137069923686794936657,-0.0602957742808772068077161065957625396550} +#define T_729_8 {0.9976238022052646980952772537420969456434,-0.0688966564755575938683307413157308474183} +#define T_729_10 {0.9962880183623609164911272273457143455744,-0.0860824283323837757819774196832440793514} +#define T_729_11 
{0.9955090861542660318761477356019895523787,-0.0946660413469276734721447041920328047127} +#define T_729_13 {0.9937294304759625163114833412691950798035,-0.1118115334208380495795154274674132466316} +#define T_729_14 {0.9927288392077390222212329717876855283976,-0.1203721388247922519099120108876377344131} +#define T_729_16 {0.9905065014529780276220094492600765079260,-0.1374658887847514665558890101237921044230} +#define T_729_17 {0.9892849200531290687621321922051720321178,-0.1459977635290147790492198964784620329738} +#define T_729_19 {0.9866213859285269949239705056243110448122,-0.1630283436353094916881900644511915743351} +#define T_729_20 {0.9851796310647999987963885359931737184525,-0.1715257838781811539075050632163765840232} +#define T_729_22 {0.9820766812314035698605607649369630962610,-0.1884818086222438404675472156668547540903} +#define T_729_23 {0.9804157167648209769694744863954838365316,-0.1969391335426312061862574864790076389909} +#define T_729_25 {0.9768754256476755992721905386133585125208,-0.2138092672586307252391435440586064942181} +#define T_729_26 {0.9749963619881616416762426524655893445015,-0.2222208228538668783880183354995097033679} +#define T_729_28 {0.9710210963894898572590363983181305229664,-0.2389937872969776500031713339922134764493} +#define T_729_29 {0.9689251897535244850701019458938390016556,-0.2473539501627100845038143006604514084756} +#define T_729_31 {0.9645176072704398917423418424732517451048,-0.2640185320489935594245878291985718533397} +#define T_729_32 {0.9622062588412366723744639784854371100664,-0.2723217131386166944828630676056491211057} +#define T_729_34 {0.9573693060890517969596658076625317335129,-0.2888667716414740072039535334624815732241} +#define T_729_35 {0.9548440610798200056663631585252005606890,-0.2971074200025925482471222949243383482099} +#define T_729_37 {0.9495809717221362911132587214524392038584,-0.3135218942007774178826196020963834598660} +#define T_729_38 
{0.9468435183430549129113273920665960758924,-0.3216945006862024225569030022597871720791} +#define T_729_40 {0.9411578109299509886653822832158766686916,-0.3379674169584142573441454260319005697966} +#define T_729_41 {0.9382099792595437470765773468883708119392,-0.3460665179092111687353394700039643794298} +#define T_729_43 {0.9321054548753081592238345365331042557955,-0.3621869972703255502466390680638141930103} +#define T_729_44 {0.9289492156369756337142007396323606371880,-0.3702071781684515561217097001645015552640} +#define T_729_46 {0.9224299553589553335442019488255027681589,-0.3861644435424831378611543186707422137260} +#define T_729_47 {0.9190674186034806236733629702939651906490,-0.3941003426305724199885105463181389495730} +#define T_729_49 {0.9121377807737452991787563405523542314768,-0.4098837260555083794599795510293915867805} +#define T_729_50 {0.9085711944686548635630174430843908339739,-0.4177300379213850223258930327574489638209} +#define T_729_52 {0.9012358117803006551937983203970361500978,-0.4333289876810718643795894422510173171759} +#define T_729_53 {0.8974675603070237928093888513103593140841,-0.4410804668045941734355608332407427951694} +#define T_729_55 {0.8897313367070638356892686715582385659218,-0.4564845544829106982476218945521395653486} +#define T_729_56 {0.8857639392668940070407757048087660223246,-0.4641360187427756023303970778215443715453} +#define T_729_58 {0.8776320466778070317204196726379450410604,-0.4793349461953754775400682319741463288665} +#define T_729_59 {0.8734681556077324993125898799917194992304,-0.4868812803335387240366571859340183436871} +#define T_729_61 {0.8649460304698609602880310376349370926619,-0.5018648865725020558059554787178058177233} +#define T_729_62 {0.8605884294693890712579786850255914032459,-0.5093010456138984398677393983234651386738} +#define T_729_64 {0.8516817691064979545245705594425089657307,-0.5240593136006895802481153623375575989485} +#define T_729_65 
{0.8471333713766597828254134583403356373310,-0.5313803262259660931121629801054950803518} +#define T_729_67 {0.8378481301870863706682257543434388935566,-0.5459033895681570935920490228454582393169} +#define T_729_68 {0.8331119764828649465471244184300303459167,-0.5531043614371651262473505994421429932117} +#define T_729_70 {0.8234543619588061691416669418686069548130,-0.5673825109844463643327117097214795649052} +#define T_729_71 {0.8185336185562906985424547201546374708414,-0.5744586280082707441252409807930234819651} +#define T_729_73 {0.8085100871338877226435215561650693416595,-0.5884823183433409710119121882598847150803} +#define T_729_74 {0.8034080437135130425829743217036593705416,-0.5954288499026780812073411652818322181702} +#define T_729_76 {0.7930252964565097650861957845336291939020,-0.6091887057226723634073550783796235918999} +#define T_729_77 {0.7877453639037954591373136281617917120457,-0.6160010078304067882015715440502390265465} +#define T_729_79 {0.7770103420236545987975773641665000468493,-0.6294878302145986426197055152442771941423} +#define T_729_80 {0.7715560501489147071652041631750762462616,-0.6361613486204623635345001275709364563227} +#define T_729_82 {0.7604759303643872092592914668784942477942,-0.6493661211800471066979412171349395066500} +#define T_729_83 {0.7548509255429350917054875935718882828951,-0.6558963944152874647741668923117686063051} +#define T_729_85 {0.7434331152821854749035423992609139531851,-0.6688102893211382848903667763806879520416} +#define T_729_86 {0.7376411580166103432176782916940283030272,-0.6751929516811575604506856507214251905680} +#define T_729_88 {0.7258932904651047568478361426969058811665,-0.6878073355655224263571767551184166222811} +#define T_729_89 {0.7199382528712511275514884800941217690706,-0.6940381200284969631653098076640162616968} +#define T_729_91 {0.7078681818687191373840050800936296582222,-0.7063445597566911926534771737351547926664} +#define T_729_92 
{0.7017540450870486390400060372485313564539,-0.7124193008362171841696408591815270483494} +#define T_729_94 {0.6893698398769307900124658772256225347519,-0.7244095691444548679172044103324878960848} +#define T_729_95 {0.6831006914109973848781010019592940807343,-0.7303242056743136645380332083732355386019} +#define T_729_97 {0.6704106312458877336979412575601600110531,-0.7419902866699067400801936855714302510023} +#define T_729_98 {0.6639906622297059302084676346566993743181,-0.7477408645190902758415063544816803187132} +#define T_729_100 {0.6510032308363964403952195425517857074738,-0.7590749590393385259901037898089271038771} +#define T_729_101 {0.6444367332325289243755150891956873238087,-0.7646576337555169855519920929509680718184} +#define T_729_103 {0.6311606131403567632176532242738176137209,-0.7756521645817079368967483787855599075556} +#define T_729_104 {0.6244519768705949491760520686511881649494,-0.7810632039613734090011121224961243569851} +#define T_729_106 {0.6108960436068826549416144189308397471905,-0.7917108208844045869057026720838621258736} +#define T_729_107 {0.6040497536174378456763633948867209255695,-0.7969466074679737443986482503532897680998} +#define T_729_109 {0.5902230697739088149944564065663143992424,-0.8072401922022116593780083348974585533142} +#define T_729_110 {0.5832437030370770658649348661128897219896,-0.8122972256924172462788646953413262963295} +#define T_729_112 {0.5691555122112117448551771303755231201649,-0.8222298966345076287609572318615391850471} +#define T_729_113 {0.5620477346655154970989087814814411103725,-0.8271047962364638239662895102810580283403} +#define T_729_115 {0.5477074552808995910879730217857286334038,-0.8366699130659123184727832267526537179947} +#define T_729_116 {0.5404760187117529923739311925601214170456,-0.8413594197472877844745653419522568583488} +#define T_729_118 {0.5258932377215480569176975222944747656584,-0.8505505878657348972637919359840452671051} +#define T_729_119 
{0.5185429765845305238869400454859714955091,-0.8550515665355248318135750196233857423067} +#define T_729_121 {0.5037274430622776799637563271971885114908,-0.8638626413417470617517324171785730868578} +#define T_729_122 {0.4962632712511393373588930444384459406137,-0.8681720829461853083941491604491602629423} +#define T_729_124 {0.4812248898731794621852486670832149684429,-0.8765971739439651910785755717370193451643} +#define T_729_125 {0.4736517974347407289315015077590942382812,-0.8807121974781769724316404790442902594805} +#define T_729_127 {0.4584006218586094139055830964935012161732,-0.8887456722142955678478415393328759819269} +#define T_729_128 {0.4507236716567481482620394217519788071513,-0.8926635266483445851903866241627838462591} +#define T_729_130 {0.4352698977999716056785928230965510010719,-0.9003000144780639590891269108396954834461} +#define T_729_131 {0.4274942221309309120513830748677719384432,-0.9040180805961075538590421274420805275440} +#define T_729_133 {0.4118481813547164582800519383454229682684,-0.9112524762736244898775339606800116598606} +#define T_729_134 {0.4039789785159937918201933371165068820119,-0.9147682684249460738357129230280406773090} +#define T_729_136 {0.3881511307183715953073033233522437512875,-0.9215957355164191566743170369591098278761} +#define T_729_137 {0.3801936615334845503966221258451696485281,-0.9249069032771688458893777351477183401585} +#define T_729_139 {0.3641945881565183396055829234683187678456,-0.9313228773940346316706495599646586924791} +#define T_729_140 {0.3561541724579690981755675238673575222492,-0.9344272071385653077868482796475291252136} +#define T_729_142 {0.3399945694137104790222281280875904485583,-0.9404273989889839757694289801293052732944} +#define T_729_143 {0.3318765824865012037392375532363075762987,-0.9433228153697336137994966520636808127165} +#define T_729_145 {0.3155672530064174696740053605026332661510,-0.9489032136261230654383780347416177392006} +#define T_729_146 
{0.3073771219944935184642531567078549414873,-0.9515877809610536752771281499008182436228} +#define T_729_148 {0.2909289694071482412773832493257941678166,-0.9567446549417951695559736435825470834970} +#define T_729_149 {0.2826721696851723919508003746159374713898,-0.9592165785084603157883975654840469360352} +#define T_729_151 {0.2660961901269880969245207325002411380410,-0.9639464806719830747283594973850995302200} +#define T_729_152 {0.2577782416398705089832787962222937494516,-0.9662041079073595550852360247517935931683} +#define T_729_154 {0.2410855167038462310014068634700379334390,-0.9705038761569371175141895946580916643143} +#define T_729_155 {0.2327119802764780476245221052522538229823,-0.9725456977622183307730097112653311342001} +#define T_729_157 {0.2159136696037763092359540451070643030107,-0.9764124575599347766186042463232297450304} +#define T_729_158 {0.2074901432234333154003280696997535414994,-0.9782371085095469265269230163539759814739} +#define T_729_160 {0.1905974770427891762381022999761626124382,-0.9816682747980213230576396199467126280069} +#define T_729_161 {0.1821295921166914411060133716091513633728,-0.9832745352521886639252102213504258543253} +#define T_729_163 {0.1651538637366315731558330526240752078593,-0.9862678141827716515877000347245484590530} +#define T_729_164 {0.1566472813271606312479633515977184288204,-0.9876546103030194867500313193886540830135} +#define T_729_166 {0.1395998395860514607047520030391751788557,-0.9902080007693073726571242332283873111010} +#define T_729_167 {0.1310602466261420184778785369417164474726,-0.9913744054363592406176053373201284557581} +#define T_729_169 {0.1139524883051141190826172078232048079371,-0.9934862004120000866791428961732890456915} +#define T_729_170 {0.1053855937963505129406627247590222395957,-0.9944314338455872981370475827134214341640} +#define T_729_172 {0.0882289560001719153081722879505832679570,-0.9961002215254847191872045186755713075399} +#define T_729_173 
{0.0796404871961308297168002923172025475651,-0.9968236518056555750533220816578250378370} +#define T_729_175 {0.0624464397071225266189209435196971753612,-0.9980483165498075237564989947713911533356} +#define T_729_176 {0.0538421382845137644346245053839083993807,-0.9985494600393869379928446505800820887089} +#define T_729_178 {0.0366221758946191494610289396405278239399,-0.9993291831187276486048176593612879514694} +#define T_729_179 {0.0280077941147840007996006050916548701935,-0.9996077047866447351509577856631949543953} +#define T_729_181 {0.0107734289409186501096371557650854811072,-0.9999419649303928903094629276893101632595} +#define T_729_182 {0.0021547258042520799938002173945506001473,-0.9999976785756596875387458567274734377861} +#define T_729_184 {-0.0150825204079289010294928630173671990633,-0.9998862523198048801731374624068848788738} +#define T_729_185 {-0.0236997830120613227344872342428061529063,-0.9997191206959988862834620704234112054110} +#define T_729_187 {-0.0409283865908329977378876662896800553426,-0.9991620825326946730982058397785294800997} +#define T_729_188 {-0.0495384477361095545977498488809942500666,-0.9987722173728585950414071703562512993813} +#define T_729_190 {-0.0667468907876345302065956843762251082808,-0.9977699397006223325234941512462683022022} +#define T_729_191 {-0.0753439943621479896185277880249486770481,-0.9971576016425671706855382581125013530254} +#define T_729_193 {-0.0925207724705710832235894258701591752470,-0.9957107545173190521481387804669793695211} +#define T_729_194 {-0.1010991710249807412624889479957346338779,-0.9948763529293787044593955215532332658768} +#define T_729_196 {-0.1182328009435131693782849993112904485315,-0.9929859036164871977092616361915133893490} +#define T_729_197 {-0.1267867595334056241007658627495402470231,-0.9919299963238425998923730730894021689892} +#define T_729_199 {-0.1438657868612569501465259236283600330353,-0.9895972086514752685815210497821681201458} +#define T_729_200 
{-0.1523895868811470377668371156687499023974,-0.9883205015632293699567867406585719436407} +#define T_729_202 {-0.1694025937211726118292176579416263848543,-0.9855469350774418435534585114510264247656} +#define T_729_203 {-0.1778905367275810389671164557512383908033,-0.9840502817146963288053029828006401658058} +#define T_729_205 {-0.1948261493195256044419494401154224760830,-0.9808377906368238585699259601824451237917} +#define T_729_206 {-0.2032725608405778205955982684827176854014,-0.9791221915620713645012074266560375690460} +#define T_729_208 {-0.2201194571648120157369987737183691933751,-0.9754729235491206296160271449480205774307} +#define T_729_209 {-0.2285186904938111274177003906515892595053,-0.9735395256973359279228930063254665583372} +#define T_729_211 {-0.2452656078404777673096504031491349451244,-0.9694559204062044299732292529370170086622} +#define T_729_212 {-0.2536120478109155640389360542030772194266,-0.9673060163180801085403004435647744685411} +#define T_729_214 {-0.2702477903094253619542541855480521917343,-0.9627908037745649405536596532328985631466} +#define T_729_215 {-0.2785358570489075979992321663303300738335,-0.9604258307324051724407354413415305316448} +#define T_729_217 {-0.2950493031527505194411276079335948452353,-0.9554820295060898471817267818551044911146} +#define T_729_218 {-0.3032734558133272084745613028644584119320,-0.9529035685729390081633027875795960426331} +#define T_729_220 {-0.3196535657351958215066645152546698227525,-0.9475344837591794799891431466676294803619} +#define T_729_221 {-0.3278083061976017620331447233184007927775,-0.9447442587218296550233276320795994251966} +#define T_729_223 {-0.3440441292898558378610118779761251062155,-0.9389534797321883452525526081444695591927} +#define T_729_224 {-0.3521240058391863492204265639884397387505,-0.9359533559487698273215983135742135345936} +#define T_729_226 {-0.3682046879147244933250249232514761388302,-0.9297447541113745828056380560155957937241} +#define T_729_227 
{-0.3762042988850878844075964479998219758272,-0.9265367372643026344647410041943658143282} +#define T_729_229 {-0.3921190894737323073826473773806355893612,-0.9199144632357353357221541045873891562223} +#define T_729_230 {-0.4000330868594427213835729162383358925581,-0.9165006979908446593796611523430328816175} +#define T_729_232 {-0.4157713463949855858814430575876031070948,-0.9094691789812895388322999679076019674540} +#define T_729_233 {-0.4235944394258828182842080423142760992050,-0.9058519475540537380098271569295320659876} +#define T_729_235 {-0.4391456463589897829535857454175129532814,-0.8984158843675600358835708902915939688683} +#define T_729_236 {-0.4468726050374945968357565106998663395643,-0.8945976049973548560600988821533974260092} +#define T_729_238 {-0.4622263628697101389697365902975434437394,-0.8867619688891935636476659965410362929106} +#define T_729_239 {-0.4698520214672515238341077292716363444924,-0.8827451942226235415134283357474487274885} +#define T_729_241 {-0.4849980657014033025298260781710268929601,-0.8745152235758376635388344766397494822741} +#define T_729_242 {-0.4925173262118797135045156210253480821848,-0.8703026389602073198403786591370590031147} +#define T_729_244 {-0.5074455312142361895766384805028792470694,-0.8616838357835788775318519583379384130239} +#define T_729_245 {-0.5148533667622017251375154955894686281681,-0.8572782574716483194876559537078719586134} +#define T_729_247 {-0.5295537525317945970471100736176595091820,-0.8482763837214229996064318584103602916002} +#define T_729_248 {-0.5368452107330925482386874136864207684994,-0.8436807569886499713618377427337691187859} +#define T_729_250 {-0.5513079495736786794779504816688131541014,-0.8343018307164776770079583911865483969450} +#define T_729_251 {-0.5584781558462748041193890458089299499989,-0.8295192278920024975263913802336901426315} +#define T_729_253 {-0.5726935789364780982069191850314382463694,-0.8197695192216699622278497372462879866362} +#define T_729_254 
{-0.5797377397592804459947046780143864452839,-0.8148031376343618514823674559011124074459} +#define T_729_256 {-0.5936963436165197949279104250308591872454,-0.8046891645700066098001457248756196349859} +#define T_729_257 {-0.6006097497340067148385855944070499390364,-0.7995423244109430838122420936997514218092} +#define T_729_259 {-0.6143022025678901432144130012602545320988,-0.7890708484795512234200032253284007310867} +#define T_729_260 {-0.6210802321384044644148048064380418509245,-0.7837469905823597482452669282793067395687} +#define T_729_262 {-0.6344973800893406457035439416358713060617,-0.7729250123134603356334082491230219602585} +#define T_729_263 {-0.6411355017749441609353766580170486122370,-0.7674276958540071635894719292991794645786} +#define T_729_265 {-0.6542683750338017523162648103607352823019,-0.7562624500995858145557804164127446711063} +#define T_729_266 {-0.6607621510296252109739612023986410349607,-0.7505953502165483293140368914464488625526} +#define T_729_268 {-0.6736019698343483907976292357489001005888,-0.7390943013143083106797348591499030590057} +#define T_729_269 {-0.6799470588354110667239638132741674780846,-0.7332612066522229410381328307266812771559} +#define T_729_271 {-0.6924852393405822592598042319878004491329,-0.7214320434354274391708372604625765234232} +#define T_729_272 {-0.6986773994440992341381502228614408522844,-0.7154368536118548282942697369435336440802} +#define T_729_274 {-0.7109055594595243832145570195280015468597,-0.7032874842690866046268638456240296363831} +#define T_729_275 {-0.7169406510007588773092379597073886543512,-0.6971342072675877910015174165891949087381} +#define T_729_277 {-0.7288506155952406695419654170109424740076,-0.6846727540558619207189394728629849851131} +#define T_729_278 {-0.7347246039150080454405156160646583884954,-0.6783655035465280258577536187658552080393} +#define T_729_280 {-0.7463084108815584150065092217118944972754,-0.6656002973612940021297390558174811303616} +#define T_729_281 
{-0.7520173690235292252026511050644330680370,-0.6591432899506214360130229579226579517126} +#define T_729_283 {-0.7632672742023699496982658274646382778883,-0.6460828647562820714611575567687395960093} +#define T_729_284 {-0.7688073855383711352473596889467444270849,-0.6394804171682306748181190414470620453358} +#define T_729_286 {-0.7797158679941611492125730364932678639889,-0.6261335042929038197101476725947577506304} +#define T_729_287 {-0.7850834287757186835676748160040006041527,-0.6193900304830236569486601183598395437002} +#define T_729_289 {-0.7956431958255496539322848548181354999542,-0.6057655527813594620312187544186599552631} +#define T_729_290 {-0.8008346176599674404172901631682179868221,-0.5988855609859147222095998586155474185944} +#define T_729_292 {-0.8110386097487645162829039691132493317127,-0.5849926268738703249994159705238416790962} +#define T_729_293 {-0.8160504219980837525838524015853181481361,-0.5779807165959340853333969789673574268818} +#define T_729_295 {-0.8258918174181526516974827245576307177544,-0.5638286139614953063059488158614840358496} +#define T_729_296 {-0.8307206695193891654582785122329369187355,-0.5566894728960282145990845492633525282145} +#define T_729_298 {-0.8401928889709542325192614953266456723213,-0.5422876628899478967937852758041117340326} +#define T_729_299 {-0.8448355526760610301195697502407710999250,-0.5350260637899191262789599932148121297359} +#define T_729_301 {-0.8539322636657457055164854864415246993303,-0.5203841745006234642545450697070918977261} +#define T_729_302 {-0.8583856351998035982830970169743523001671,-0.5130049719862660451141778139572124928236} +#define T_729_304 {-0.8671007562741130936245781413163058459759,-0.4981327920031576872439416092674946412444} +#define T_729_305 {-0.8713618584103077768787670720485039055347,-0.4906409193164943949305722981080180034041} +#define T_729_307 {-0.8796895632212818894046790774154942482710,-0.4755483911859550993916911920678103342652} +#define T_729_308 
{-0.8837555472712789184086545901664067059755,-0.4679488568927615554926546792557928711176} +#define T_729_310 {-0.8916902684716001559195319714490324258804,-0.4526460704712298999119468589924508705735} +#define T_729_311 {-0.8955584161899875494938783049292396754026,-0.4449439551126423419979971640714211389422} +#define T_729_313 {-0.9030948491549369849806794263713527470827,-0.4294411408212092107206103719363454729319} +#define T_729_314 {-0.9067625745564626971173538549919612705708,-0.4216415935172139750619635378825478255749} +#define T_729_316 {-0.9138956809302382078286086652951780706644,-0.4059491155022463271251353944535367190838} +#define T_729_317 {-0.9173605320186272171767427607846911996603,-0.3980573505093220609829529621492838487029} +#define T_729_319 {-0.9240855430826517835640743214753456413746,-0.3821856997136868772102502589405048638582} +#define T_729_320 {-0.9273452034898469475976412468298804014921,-0.3742069929389006399667039204359753057361} +#define T_729_322 {-0.9336576233508152578011163313931319862604,-0.3581667800884208996414770354022039100528} +#define T_729_323 {-0.9367099138855462525654616001702379435301,-0.3501064655623092880531999071536120027304} +#define T_729_325 {-0.9426055224810786503653048384876456111670,-0.3339084140721415017161177729576593264937} +#define T_729_326 {-0.9454484025857242679435898935480508953333,-0.3257718803827338582834727276349440217018} +#define T_729_328 {-0.9509232585056187625482948533317539840937,-0.3094268191884086416365562399732880294323} +#define T_729_329 {-0.9535548276203866802092079524300061166286,-0.3012195058787771051456161330861505120993} +#define T_729_331 {-0.9586052707415827489612070166913326829672,-0.2847383621966959044158329561469145119190} +#define T_729_332 {-0.9610237695750973863084709591930732131004,-0.2764657561284400988377285557362483814359} +#define T_729_334 {-0.9656464235085896463672838763159234076738,-0.2598595481506683069206076197588117793202} +#define T_729_335 
{-0.9678502352140367914756780010065995156765,-0.2515271798357659460698698694613995030522} +#define T_729_337 {-0.9720420095621031819632662518415600061417,-0.2348070093640053646044663082648185081780} +#define T_729_338 {-0.9740296608181456816666354825429152697325,-0.2264204492674814217512135883225710131228} +#define T_729_340 {-0.9777877532403812521621944142680149525404,-0.2095974942911469351969344643293879926205} +#define T_729_341 {-0.9795579152361233443713217639015056192875,-0.2011623491070327340857204490021103993058} +#define T_729_343 {-0.9828798133228976441344570957880932837725,-0.1842478563303950878093928622547537088394} +#define T_729_344 {-0.9844313026462394589088944485411047935486,-0.1757697652334669902174368871783372014761} +#define T_729_346 {-0.9873147855983258613932207481411751359701,-0.1587750425568573708900999008619692176580} +#define T_729_347 {-0.9886465650271132332704837608616799116135,-0.1502596734326611949139618218396208249032} +#define T_729_349 {-0.9910897051403672053382365447760093957186,-0.1331960823927642589836750630638562142849} +#define T_729_350 {-0.9922008843358094409836667182389646768570,-0.1246491280484453706423053631624497938901} +#define T_729_352 {-0.9942020482899025513034985124249942600727,-0.1075280762227344422132802037594956345856} +#define T_729_353 {-0.9950918843917935241449868044583126902580,-0.0989552505812072158430936497097718529403} +#define T_729_355 {-0.9966497343421426569065602052432950586081,-0.0817881839615998140757824330648873001337} +#define T_729_356 {-0.9973176324654871027775016045779921114445,-0.0731952182416002322673520552598347421736} +#define T_729_358 {-0.9984311269376477948611636747955344617367,-0.0559936135824322667176033974101301282644} +#define T_729_359 {-0.9988766405703624062795142890536226332188,-0.0473862524670078388844274286384461447597} +#define T_729_361 {-0.9995450351562887858491990300535690039396,-0.0301616096124420057789095039879612158984} +#define T_729_362 
{-0.9997678664577099860721887125691864639521,-0.0215456074084403106927432247630349593237} +#define T_729_364 {-0.9999907143134170173226493716356344521046,-0.0043094416044383030209297125168177444721} +#define T_729_365 {-0.9999907143134170173226493716356344521046,0.0043094416044383030209297125168177444721} +#define T_729_368 {-0.9995450351562887858491990300535690039396,0.0301616096124420057789095039879612158984} +#define T_729_370 {-0.9988766405703624062795142890536226332188,0.0473862524670078388844274286384461447597} +#define T_729_374 {-0.9966497343421426569065602052432950586081,0.0817881839615998140757824330648873001337} +#define T_729_376 {-0.9950918843917935241449868044583126902580,0.0989552505812072158430936497097718529403} +#define T_729_380 {-0.9910897051403672053382365447760093957186,0.1331960823927642589836750630638562142849} +#define T_729_382 {-0.9886465650271132332704837608616799116135,0.1502596734326611949139618218396208249032} +#define T_729_385 {-0.9844313026462394589088944485411047935486,0.1757697652334669902174368871783372014761} +#define T_729_386 {-0.9828798133228976441344570957880932837725,0.1842478563303950878093928622547537088394} +#define T_729_388 {-0.9795579152361233443713217639015056192875,0.2011623491070327340857204490021103993058} +#define T_729_392 {-0.9720420095621031819632662518415600061417,0.2348070093640053646044663082648185081780} +#define T_729_394 {-0.9678502352140367914756780010065995156765,0.2515271798357659460698698694613995030522} +#define T_729_395 {-0.9656464235085896463672838763159234076738,0.2598595481506683069206076197588117793202} +#define T_729_398 {-0.9586052707415827489612070166913326829672,0.2847383621966959044158329561469145119190} +#define T_729_400 {-0.9535548276203866802092079524300061166286,0.3012195058787771051456161330861505120993} +#define T_729_404 {-0.9426055224810786503653048384876456111670,0.3339084140721415017161177729576593264937} +#define T_729_406 
{-0.9367099138855462525654616001702379435301,0.3501064655623092880531999071536120027304} +#define T_729_410 {-0.9240855430826517835640743214753456413746,0.3821856997136868772102502589405048638582} +#define T_729_412 {-0.9173605320186272171767427607846911996603,0.3980573505093220609829529621492838487029} +#define T_729_416 {-0.9030948491549369849806794263713527470827,0.4294411408212092107206103719363454729319} +#define T_729_418 {-0.8955584161899875494938783049292396754026,0.4449439551126423419979971640714211389422} +#define T_729_422 {-0.8796895632212818894046790774154942482710,0.4755483911859550993916911920678103342652} +#define T_729_424 {-0.8713618584103077768787670720485039055347,0.4906409193164943949305722981080180034041} +#define T_729_428 {-0.8539322636657457055164854864415246993303,0.5203841745006234642545450697070918977261} +#define T_729_430 {-0.8448355526760610301195697502407710999250,0.5350260637899191262789599932148121297359} +#define T_729_434 {-0.8258918174181526516974827245576307177544,0.5638286139614953063059488158614840358496} +#define T_729_436 {-0.8160504219980837525838524015853181481361,0.5779807165959340853333969789673574268818} +#define T_729_440 {-0.7956431958255496539322848548181354999542,0.6057655527813594620312187544186599552631} +#define T_729_442 {-0.7850834287757186835676748160040006041527,0.6193900304830236569486601183598395437002} +#define T_729_446 {-0.7632672742023699496982658274646382778883,0.6460828647562820714611575567687395960093} +#define T_729_448 {-0.7520173690235292252026511050644330680370,0.6591432899506214360130229579226579517126} +#define T_729_452 {-0.7288506155952406695419654170109424740076,0.6846727540558619207189394728629849851131} +#define T_729_454 {-0.7169406510007588773092379597073886543512,0.6971342072675877910015174165891949087381} +#define T_729_458 {-0.6924852393405822592598042319878004491329,0.7214320434354274391708372604625765234232} +#define T_729_460 
{-0.6799470588354110667239638132741674780846,0.7332612066522229410381328307266812771559} +#define T_729_464 {-0.6542683750338017523162648103607352823019,0.7562624500995858145557804164127446711063} +#define T_729_466 {-0.6411355017749441609353766580170486122370,0.7674276958540071635894719292991794645786} +#define T_729_470 {-0.6143022025678901432144130012602545320988,0.7890708484795512234200032253284007310867} +#define T_729_472 {-0.6006097497340067148385855944070499390364,0.7995423244109430838122420936997514218092} +#define T_729_476 {-0.5726935789364780982069191850314382463694,0.8197695192216699622278497372462879866362} +#define T_729_478 {-0.5584781558462748041193890458089299499989,0.8295192278920024975263913802336901426315} +#define T_729_482 {-0.5295537525317945970471100736176595091820,0.8482763837214229996064318584103602916002} +#define T_729_484 {-0.5148533667622017251375154955894686281681,0.8572782574716483194876559537078719586134} +// Pre-computed twiddles for N=864 +#define T_864_1 {0.9999735576321774344066284356813412159681,-0.0072721411184216390097279081317083182512} +#define T_864_5 {0.9993390107217300366571066660981159657240,-0.0363530143139996486656428942296770401299} +#define T_864_7 {0.9987045980408904011227377850445918738842,-0.0508834536169025311580504933317570248619} +#define T_864_11 {0.9968021652056575643285896148881874978542,-0.0799089697175039831433807080429687630385} +#define T_864_13 {0.9955345474845739284219803266751114279032,-0.0943979065693972135830946967871568631381} +#define T_864_17 {0.9923678501135846374836546601727604866028,-0.1233128138554221014944189960260700900108} +#define T_864_19 {0.9904694403346357978179526071471627801657,-0.1377326677415107725188647691538790240884} +#define T_864_23 {0.9860445064163191331019220342568587511778,-0.1664819250435242703467508817993802949786} +#define T_864_25 {0.9835189183104177113037280832941178232431,-0.1808052469523652727723117550340248271823} +#define T_864_29 
{0.9778441709579840379618076440237928181887,-0.2093341284250920253384720126632601022720} +#define T_864_31 {0.9746962121256348421383108870941214263439,-0.2235336530993475012696336534645524807274} +#define T_864_35 {0.9677824535432009156465937849134206771851,-0.2517878523954285574504297073872294276953} +#define T_864_37 {0.9640181163029596378422070301894564181566,-0.2658365502328328133785362297203391790390} +#define T_864_41 {0.9558785072229395440857047105964738875628,-0.2937622838780443723649682397081051021814} +#define T_864_43 {0.9515049572046937464975258080812636762857,-0.3076334123837881628915624787623528391123} +#define T_864_47 {0.9421549918355964781113698336412198841572,-0.3351775221569416318700973533850628882647} +#define T_864_49 {0.9371805543403823879344827219028957188129,-0.3488446768495307992452580947428941726685} +#define T_864_53 {0.9266380308727052428352521928900387138128,-0.3759547309721680830030265951791079714894} +#define T_864_55 {0.9210721750250234762802392651792615652084,-0.3893918956458550795218798157293349504471} +#define T_864_59 {0.9093571617513851457204054895555600523949,-0.4160162885891190431841835106752114370465} +#define T_864_61 {0.9032104824741843396296303581038955599070,-0.4291978848372289268020551844529109075665} +#define T_864_65 {0.8903452795881888759410571765329223126173,-0.4552859355559204512786664054146967828274} +#define T_864_67 {0.8836294774348275593567336727574001997709,-0.4681868714608020121126230606023455038667} +#define T_864_71 {0.8696385745813789425184836545668076723814,-0.4936889198676301937496191385434940457344} +#define T_864_77 {0.8472764631208279384466663941566366702318,-0.5311521392609280800556348367535974830389} +#define T_864_83 {0.8233015127566802870973106109886430203915,-0.5676042803684286930376856616931036114693} +#define T_864_85 {0.8149592551819879737706742162117734551430,-0.5795182588954549407489480472577270120382} +#define T_864_89 
{0.7977593611696013331524568457098212093115,-0.6029759544677295579617748444434255361557} +#define T_864_91 {0.7889053631311945258275386549939867109060,-0.6145147093624676593037747807102277874947} +#define T_864_97 {0.7613497448832312608502093098650220781565,-0.6483413961534761904559331924247089773417} +#define T_864_103 {0.7323448541235920705716466727608349174261,-0.6809339282475904697733426473860163241625} +#define T_864_119 {0.6483413961534761904559331924247089773417,-0.7613497448832312608502093098650220781565} +#define T_864_133 {0.5676042803684286930376856616931036114693,-0.8233015127566802870973106109886430203915} +#define T_864_161 {0.3893918956458550795218798157293349504471,-0.9210721750250234762802392651792615652084} +#define T_864_175 {0.2937622838780443723649682397081051021814,-0.9558785072229395440857047105964738875628} +#define T_864_203 {0.0943979065693972135830946967871568631381,-0.9955345474845739284219803266751114279032} +#define T_864_217 {-0.0072721411184216390097279081317083182512,-0.9999735576321774344066284356813412159681} +#define T_864_245 {-0.2093341284250920253384720126632601022720,-0.9778441709579840379618076440237928181887} +#define T_864_259 {-0.3076334123837881628915624787623528391123,-0.9515049572046937464975258080812636762857} +#define T_864_287 {-0.4936889198676301937496191385434940457344,-0.8696385745813789425184836545668076723814} +#define T_864_301 {-0.5795182588954549407489480472577270120382,-0.8149592551819879737706742162117734551430} +#define T_864_329 {-0.7323448541235920705716466727608349174261,-0.6809339282475904697733426473860163241625} +#define T_864_343 {-0.7977593611696013331524568457098212093115,-0.6029759544677295579617748444434255361557} +#define T_864_371 {-0.9032104824741843396296303581038955599070,-0.4291978848372289268020551844529109075665} +#define T_864_385 {-0.9421549918355964781113698336412198841572,-0.3351775221569416318700973533850628882647} +#define T_864_413 
{-0.9904694403346357978179526071471627801657,-0.1377326677415107725188647691538790240884} +#define T_864_427 {-0.9993390107217300366571066660981159657240,-0.0363530143139996486656428942296770401299} +#define T_864_455 {-0.9860445064163191331019220342568587511778,0.1664819250435242703467508817993802949786} +#define T_864_469 {-0.9640181163029596378422070301894564181566,0.2658365502328328133785362297203391790390} +#define T_864_497 {-0.8903452795881888759410571765329223126173,0.4552859355559204512786664054146967828274} +// Pre-computed twiddles for N=1000 +#define T_1000_1 {0.9999802608561371153328423133643809705973,-0.0062831439655589510842603750973012211034} +#define T_1000_3 {0.9998223523808089652220587595365941524506,-0.0188484397154081752268073302047923789360} +#define T_1000_7 {0.9990329346781247066289211034018080681562,-0.0439681183178649015452776893653208389878} +#define T_1000_9 {0.9984015501089750221908047933538910001516,-0.0565185344820245266772573700109205674380} +#define T_1000_11 {0.9976125063612252263922641759563703089952,-0.0690600257144057955160931783211708534509} +#define T_1000_13 {0.9966659280340298687761446672084275633097,-0.0815906115681575561104921234800713136792} +#define T_1000_17 {0.9943007903969989236259152676211670041084,-0.1066111542752599117278577978140674531460} +#define T_1000_19 {0.9928826045698136981698667113960254937410,-0.1190971600948697445288004814756277482957} +#define T_1000_21 {0.9913076310695065895828292923397384583950,-0.1315643590922825068023627181901247240603} +#define T_1000_23 {0.9895761186026509337310130831610877066851,-0.1440107825522521645122964173424406908453} +#define T_1000_27 {0.9856445951489980483728459148551337420940,-0.1688334447127338711069910459627863019705} +#define T_1000_29 {0.9834452049953297247952832549344748258591,-0.1812057636271373617375957110198214650154} +#define T_1000_31 {0.9810905174433340869555308927374426275492,-0.1935494680508602627355685399379581212997} +#define T_1000_33 
{0.9785809043254720673843394251889549195766,-0.2058626087698813278414888827683171257377} +#define T_1000_37 {0.9730985109821265188756456154806073755026,-0.2303894266765905973759487324059591628611} +#define T_1000_39 {0.9701265964901057969882458564825356006622,-0.2425992307954074334030991622057626955211} +#define T_1000_41 {0.9670014877624351079532516450854018330574,-0.2547707256833821598540623654116643592715} +#define T_1000_43 {0.9637236782900097109560988428711425513029,-0.2669019893203755655619602293882053345442} +#define T_1000_47 {0.9567120515588304341392245078168343752623,-0.2910361668282718250466700737888459116220} +#define T_1000_49 {0.9529793415172188586481638594705145806074,-0.3030352696327739958626068528246833011508} +#define T_1000_51 {0.9490961449902945989975933116511441767216,-0.3149865196553047752558995853178203105927} +#define T_1000_53 {0.9450630751798048123646367457695305347443,-0.3268880296549424580021536712592933326960} +#define T_1000_57 {0.9365498867481923950251143651257734745741,-0.3505343201912590189728291534265736117959} +#define T_1000_59 {0.9320711124582109485814385152480099350214,-0.3622753667045456871242947727296268567443} +#define T_1000_61 {0.9274451533346612652763951700762845575809,-0.3739592057378004485990174998732982203364} +#define T_1000_63 {0.9226727398701148885606926342006772756577,-0.3855839922773965366964432632812531664968} +#define T_1000_67 {0.9126915874035027576027800932934042066336,-0.4086490747363490383392559124331455677748} +#define T_1000_69 {0.9074844245411168852655237060389481484890,-0.4200857284118062473510235577123239636421} +#define T_1000_71 {0.9021339593682028423771157576993573457003,-0.4314560456809589661375525793118868023157} +#define T_1000_73 {0.8966410367852358787033040243841242045164,-0.4427582310389015507468002397217787802219} +#define T_1000_77 {0.8852313113324552507066300677252002060413,-0.4651510780774583686536516324849799275398} +#define T_1000_79 
{0.8793163101905562273330474454269278794527,-0.4762382036679391195121979762916453182697} +#define T_1000_81 {0.8732624548099201522433077116147615015507,-0.4872501257253323281304346892284229397774} +#define T_1000_83 {0.8670707011644900319424777990207076072693,-0.4981851053394908412919050988421076908708} +#define T_1000_87 {0.8542774316992951533222822035895660519600,-0.5198173426207094438566969074599910527468} +#define T_1000_89 {0.8476779360850832389928655175026506185532,-0.5305111843067340515744945150800049304962} +#define T_1000_91 {0.8409445822981690410458099904644768685102,-0.5411212521268758957049271884898189455271} +#define T_1000_93 {0.8340784336131711329542781641066540032625,-0.5516458706284302904165883774112444370985} +#define T_1000_97 {0.8199521093254523718130144516180735081434,-0.5724321255945908903939312040165532380342} +#define T_1000_99 {0.8126941644330939462648188964521978050470,-0.5826904796685760112850971381703857332468} +#define T_1000_101 {0.8053078857111219956976810863125137984753,-0.5928568201610592280914602270058821886778} +#define T_1000_103 {0.7977944395385709874801705154823139309883,-0.6029295416890246839258793443150352686644} +#define T_1000_107 {0.7823908105765881026982810908521059900522,-0.6227877804881124523816993132641073316336} +#define T_1000_109 {0.7745030601987338281233519410307053476572,-0.6325701619131244290983318023791071027517} +#define T_1000_111 {0.7664930068093498416459397049038670957088,-0.6422526531765844204358018032507970929146} +#define T_1000_113 {0.7583619152887218772818300749349873512983,-0.6518337253008787968155957059934735298157} +#define T_1000_117 {0.7417417727387392023530310325440950691700,-0.6706855765367200072901709972938988357782} +#define T_1000_119 {0.7332553462225600560131510974315460771322,-0.6799533787224192327158789339591749012470} +#define T_1000_121 {0.7246531301870466901249301372445188462734,-0.6891138083873484809771525760879740118980} +#define T_1000_123 
{0.7159364830218312159004767636361066251993,-0.6981654189934727305910655559273436665535} +#define T_1000_129 {0.6891138083873484809771525760879740118980,-0.7246531301870466901249301372445188462734} +#define T_1000_141 {0.6325701619131244290983318023791071027517,-0.7745030601987338281233519410307053476572} +#define T_1000_147 {0.6029295416890246839258793443150352686644,-0.7977944395385709874801705154823139309883} +#define T_1000_153 {0.5724321255945908903939312040165532380342,-0.8199521093254523718130144516180735081434} +#define T_1000_159 {0.5411212521268758957049271884898189455271,-0.8409445822981690410458099904644768685102} +#define T_1000_171 {0.4762382036679391195121979762916453182697,-0.8793163101905562273330474454269278794527} +#define T_1000_177 {0.4427582310389015507468002397217787802219,-0.8966410367852358787033040243841242045164} +#define T_1000_183 {0.4086490747363490383392559124331455677748,-0.9126915874035027576027800932934042066336} +#define T_1000_189 {0.3739592057378004485990174998732982203364,-0.9274451533346612652763951700762845575809} +#define T_1000_201 {0.3030352696327739958626068528246833011508,-0.9529793415172188586481638594705145806074} +#define T_1000_207 {0.2669019893203755655619602293882053345442,-0.9637236782900097109560988428711425513029} +#define T_1000_213 {0.2303894266765905973759487324059591628611,-0.9730985109821265188756456154806073755026} +#define T_1000_219 {0.1935494680508602627355685399379581212997,-0.9810905174433340869555308927374426275492} +#define T_1000_231 {0.1190971600948697445288004814756277482957,-0.9928826045698136981698667113960254937410} +#define T_1000_237 {0.0815906115681575561104921234800713136792,-0.9966659280340298687761446672084275633097} +#define T_1000_243 {0.0439681183178649015452776893653208389878,-0.9990329346781247066289211034018080681562} +#define T_1000_249 {0.0062831439655589510842603750973012211034,-0.9999802608561371153328423133643809705973} +#define T_1000_261 
{-0.0690600257144057955160931783211708534509,-0.9976125063612252263922641759563703089952} +#define T_1000_267 {-0.1066111542752599117278577978140674531460,-0.9943007903969989236259152676211670041084} +#define T_1000_273 {-0.1440107825522521645122964173424406908453,-0.9895761186026509337310130831610877066851} +#define T_1000_279 {-0.1812057636271373617375957110198214650154,-0.9834452049953297247952832549344748258591} +#define T_1000_291 {-0.2547707256833821598540623654116643592715,-0.9670014877624351079532516450854018330574} +#define T_1000_297 {-0.2910361668282718250466700737888459116220,-0.9567120515588304341392245078168343752623} +#define T_1000_303 {-0.3268880296549424580021536712592933326960,-0.9450630751798048123646367457695305347443} +#define T_1000_309 {-0.3622753667045456871242947727296268567443,-0.9320711124582109485814385152480099350214} +#define T_1000_321 {-0.4314560456809589661375525793118868023157,-0.9021339593682028423771157576993573457003} +#define T_1000_327 {-0.4651510780774583686536516324849799275398,-0.8852313113324552507066300677252002060413} +#define T_1000_333 {-0.4981851053394908412919050988421076908708,-0.8670707011644900319424777990207076072693} +#define T_1000_339 {-0.5305111843067340515744945150800049304962,-0.8476779360850832389928655175026506185532} +#define T_1000_351 {-0.5928568201610592280914602270058821886778,-0.8053078857111219956976810863125137984753} +#define T_1000_357 {-0.6227877804881124523816993132641073316336,-0.7823908105765881026982810908521059900522} +#define T_1000_363 {-0.6518337253008787968155957059934735298157,-0.7583619152887218772818300749349873512983} +#define T_1000_369 {-0.6799533787224192327158789339591749012470,-0.7332553462225600560131510974315460771322} +#define T_1000_381 {-0.7332553462225600560131510974315460771322,-0.6799533787224192327158789339591749012470} +#define T_1000_387 {-0.7583619152887218772818300749349873512983,-0.6518337253008787968155957059934735298157} +#define T_1000_393 
{-0.7823908105765881026982810908521059900522,-0.6227877804881124523816993132641073316336} +#define T_1000_399 {-0.8053078857111219956976810863125137984753,-0.5928568201610592280914602270058821886778} +#define T_1000_411 {-0.8476779360850832389928655175026506185532,-0.5305111843067340515744945150800049304962} +#define T_1000_417 {-0.8670707011644900319424777990207076072693,-0.4981851053394908412919050988421076908708} +#define T_1000_423 {-0.8852313113324552507066300677252002060413,-0.4651510780774583686536516324849799275398} +#define T_1000_429 {-0.9021339593682028423771157576993573457003,-0.4314560456809589661375525793118868023157} +#define T_1000_441 {-0.9320711124582109485814385152480099350214,-0.3622753667045456871242947727296268567443} +#define T_1000_447 {-0.9450630751798048123646367457695305347443,-0.3268880296549424580021536712592933326960} +#define T_1000_453 {-0.9567120515588304341392245078168343752623,-0.2910361668282718250466700737888459116220} +#define T_1000_459 {-0.9670014877624351079532516450854018330574,-0.2547707256833821598540623654116643592715} +#define T_1000_471 {-0.9834452049953297247952832549344748258591,-0.1812057636271373617375957110198214650154} +#define T_1000_477 {-0.9895761186026509337310130831610877066851,-0.1440107825522521645122964173424406908453} +#define T_1000_483 {-0.9943007903969989236259152676211670041084,-0.1066111542752599117278577978140674531460} +#define T_1000_489 {-0.9976125063612252263922641759563703089952,-0.0690600257144057955160931783211708534509} +#define T_1000_501 {-0.9999802608561371153328423133643809705973,0.0062831439655589510842603750973012211034} +#define T_1000_507 {-0.9990329346781247066289211034018080681562,0.0439681183178649015452776893653208389878} +#define T_1000_513 {-0.9966659280340298687761446672084275633097,0.0815906115681575561104921234800713136792} +#define T_1000_519 {-0.9928826045698136981698667113960254937410,0.1190971600948697445288004814756277482957} +#define T_1000_531 
{-0.9810905174433340869555308927374426275492,0.1935494680508602627355685399379581212997} +#define T_1000_537 {-0.9730985109821265188756456154806073755026,0.2303894266765905973759487324059591628611} +#define T_1000_543 {-0.9637236782900097109560988428711425513029,0.2669019893203755655619602293882053345442} +#define T_1000_549 {-0.9529793415172188586481638594705145806074,0.3030352696327739958626068528246833011508} +#define T_1000_561 {-0.9274451533346612652763951700762845575809,0.3739592057378004485990174998732982203364} +#define T_1000_567 {-0.9126915874035027576027800932934042066336,0.4086490747363490383392559124331455677748} +#define T_1000_573 {-0.8966410367852358787033040243841242045164,0.4427582310389015507468002397217787802219} +#define T_1000_579 {-0.8793163101905562273330474454269278794527,0.4762382036679391195121979762916453182697} +#define T_1000_591 {-0.8409445822981690410458099904644768685102,0.5411212521268758957049271884898189455271} +#define T_1000_597 {-0.8199521093254523718130144516180735081434,0.5724321255945908903939312040165532380342} +// Pre-computed twiddles for N=1024 +#define T_1024_1 {0.9999811752826011090888869148329831659794,-0.0061358846491544752690949771078976482386} +#define T_1024_3 {0.9998305817958234031905817573715467005968,-0.0184067299058048201854109748865084839053} +#define T_1024_5 {0.9995294175010931425617854984011501073837,-0.0306748031766366259509570824093316332437} +#define T_1024_7 {0.9990777277526453614697743432770948857069,-0.0429382569349408202419304814156930660829} +#define T_1024_9 {0.9984755805732947742114902212051674723625,-0.0551952443496899411434775117868412053213} +#define T_1024_11 {0.9977230666441916362430220033274963498116,-0.0674439195636640648201520775728567969054} +#define T_1024_13 {0.9968202992911656679098086897283792495728,-0.0796824379714301256338870871331891976297} +#define T_1024_15 {0.9957674144676598171344039656105451285839,-0.0919089564971327238618314936502429191023} +#define T_1024_17 
{0.9945645707342554153740366018610075116158,-0.1041216338720545725449184715216688346118} +#define T_1024_19 {0.9932119492347945000076947508205194026232,-0.1163186309119047662230528317195421550423} +#define T_1024_21 {0.9917097536690995251973390622879378497601,-0.1284981107937931688045551936738775111735} +#define T_1024_23 {0.9900582102622971225613923706987407058477,-0.1406582393328492386341821429596166126430} +#define T_1024_25 {0.9882575677307494643741847539786249399185,-0.1527971852584434353516940063855145126581} +#define T_1024_27 {0.9863080972445986693841746273392345756292,-0.1649131204899699221222419964760774746537} +#define T_1024_29 {0.9842100923869290252099517601891420781612,-0.1770042204121487494639097803883487358689} +#define T_1024_31 {0.9819638691095552429644044423184823244810,-0.1890686641498062203758934174402384087443} +#define T_1024_33 {0.9795697656854405188653345248894765973091,-0.2011046348420919005484108765813289210200} +#define T_1024_35 {0.9770281426577543948397419626417104154825,-0.2131103199160913619358836967876413837075} +#define T_1024_37 {0.9743393827855758582145995205792132765055,-0.2250839113597928320409380376077024266124} +#define T_1024_39 {0.9715038909862517835236417340638581663370,-0.2370236059943671980132506860172725282609} +#define T_1024_41 {0.9685220942744172667460134107386693358421,-0.2489276057457201762890264262750861234963} +#define T_1024_43 {0.9653944416976893982962337759090587496758,-0.2607941179152755140080444107297807931900} +#define T_1024_45 {0.9621214042690415801928338623838499188423,-0.2726213554499489766236308696534251794219} +#define T_1024_47 {0.9587034748958715990596601841389201581478,-0.2844075372112718214090421042783418670297} +#define T_1024_49 {0.9551411683057706714095047573209740221500,-0.2961508882436238443425224886595970019698} +#define T_1024_51 {0.9514350209690083381985914456890895962715,-0.3078496400415348666079751183133339509368} +#define T_1024_53 
{0.9475855910177410912353934691054746508598,-0.3195020308160156918830807626363821327686} +#define T_1024_55 {0.9435934581619603855884292897826526314020,-0.3311063057598764292066562120453454554081} +#define T_1024_57 {0.9394592236021899189779560401802882552147,-0.3426607173119943783312635332549689337611} +#define T_1024_59 {0.9351835099389476102516027822275646030903,-0.3541635254204903993091591019037878140807} +#define T_1024_61 {0.9307669610789837122410972369834780693054,-0.3656129978047738537938471381494309753180} +#define T_1024_63 {0.9262102421383113792785479745361953973770,-0.3770074102164182594520980273955501616001} +#define T_1024_65 {0.9215140393420419018255529408634174615145,-0.3883450466988263016787641390692442655563} +#define T_1024_67 {0.9166790599210427048504357117053586989641,-0.3996241998456468436096145069313934072852} +#define T_1024_69 {0.9117060320054298783176705001096706837416,-0.4108431710579039664033018652844475582242} +#define T_1024_71 {0.9065957045149153348262416329816915094852,-0.4220002707997996815869612419191980734468} +#define T_1024_73 {0.9013488470460220280955354610341601073742,-0.4330938188531519572599393086420604959130} +#define T_1024_75 {0.8959662497561851068894611671566963195801,-0.4441221445704292558609438401617808267474} +#define T_1024_77 {0.8904487232447578781702191008662339299917,-0.4550835871263438359157760260131908580661} +#define T_1024_79 {0.8847970984309377895371540034830104559660,-0.4659764957679661812051108427112922072411} +#define T_1024_81 {0.8790122264286335251881610020063817501068,-0.4767992300633221436356734557193703949451} +#define T_1024_83 {0.8730949784182900907936186740698758512735,-0.4875501601484359404103940960339969024062} +#define T_1024_85 {0.8670462455156926484534096744027920067310,-0.4982276669727818685373676999006420373917} +#define T_1024_87 {0.8608669386377673093946327753656078130007,-0.5088301425431069890947810563375242054462} +#define T_1024_89 
{0.8545579883654005337589865121117327362299,-0.5193559901655896426930780762631911784410} +#define T_1024_91 {0.8481203448032972325165701477089896798134,-0.5298036246862947162838963777176104485989} +#define T_1024_93 {0.8415549774368984436989649111637845635414,-0.5401714727298928542253975138009991496801} +#define T_1024_95 {0.8348628749863800102559707738691940903664,-0.5504579729366048113092801941093057394028} +#define T_1024_97 {0.8280450452577557962641208177956286817789,-0.5606615761973360312353520384931471198797} +#define T_1024_99 {0.8211025149911046483452992106322199106216,-0.5707807458869672556645014083187561482191} +#define T_1024_101 {0.8140363297059484137818685667298268526793,-0.5808139580957645264902566850651055574417} +#define T_1024_103 {0.8068475535437992229859105464129243046045,-0.5907597018588742754374720789201091974974} +#define T_1024_105 {0.7995372691079050131435224102460779249668,-0.6006164793838689730520741250074934214354} +#define T_1024_107 {0.7921065773002123888701930809475015848875,-0.6103828062763094752796177999698556959629} +#define T_1024_109 {0.7845565971555752415866891169571317732334,-0.6200572117632892066296790289925411343575} +#define T_1024_111 {0.7768884656732324422989677259465679526329,-0.6296382389149269842576472910877782851458} +#define T_1024_113 {0.7691033376455795878001708842930383980274,-0.6391244448637757313846918805211316794157} +#define T_1024_115 {0.7612023854842617787141989538213238120079,-0.6485144010221124410975335194962099194527} +#define T_1024_117 {0.7531867990436125204212203243514522910118,-0.6578066932970786373502392052614595741034} +#define T_1024_119 {0.7450577854414659473292203983874060213566,-0.6669999223036374713657892243645619601011} +#define T_1024_121 {0.7368165688773699040226006218290422111750,-0.6760927035753159231035169796086847782135} +#define T_1024_123 {0.7284643904482251963727890142763499170542,-0.6850836677727003554139173502335324883461} +#define T_1024_125 
{0.7200025079613816547663418532465584576130,-0.6939714608896540015692266933911014348269} +#define T_1024_127 {0.7114321957452164335578004283888731151819,-0.7027547444572252999250849825330078601837} +#define T_1024_129 {0.7027547444572252999250849825330078601837,-0.7114321957452164335578004283888731151819} +#define T_1024_131 {0.6939714608896540015692266933911014348269,-0.7200025079613816547663418532465584576130} +#define T_1024_133 {0.6850836677727003554139173502335324883461,-0.7284643904482251963727890142763499170542} +#define T_1024_135 {0.6760927035753159231035169796086847782135,-0.7368165688773699040226006218290422111750} +#define T_1024_137 {0.6669999223036374713657892243645619601011,-0.7450577854414659473292203983874060213566} +#define T_1024_139 {0.6578066932970786373502392052614595741034,-0.7531867990436125204212203243514522910118} +#define T_1024_141 {0.6485144010221124410975335194962099194527,-0.7612023854842617787141989538213238120079} +#define T_1024_143 {0.6391244448637757313846918805211316794157,-0.7691033376455795878001708842930383980274} +#define T_1024_145 {0.6296382389149269842576472910877782851458,-0.7768884656732324422989677259465679526329} +#define T_1024_147 {0.6200572117632892066296790289925411343575,-0.7845565971555752415866891169571317732334} +#define T_1024_149 {0.6103828062763094752796177999698556959629,-0.7921065773002123888701930809475015848875} +#define T_1024_151 {0.6006164793838689730520741250074934214354,-0.7995372691079050131435224102460779249668} +#define T_1024_153 {0.5907597018588742754374720789201091974974,-0.8068475535437992229859105464129243046045} +#define T_1024_155 {0.5808139580957645264902566850651055574417,-0.8140363297059484137818685667298268526793} +#define T_1024_157 {0.5707807458869672556645014083187561482191,-0.8211025149911046483452992106322199106216} +#define T_1024_159 {0.5606615761973360312353520384931471198797,-0.8280450452577557962641208177956286817789} +#define T_1024_161 
{0.5504579729366048113092801941093057394028,-0.8348628749863800102559707738691940903664} +#define T_1024_163 {0.5401714727298928542253975138009991496801,-0.8415549774368984436989649111637845635414} +#define T_1024_165 {0.5298036246862947162838963777176104485989,-0.8481203448032972325165701477089896798134} +#define T_1024_167 {0.5193559901655896426930780762631911784410,-0.8545579883654005337589865121117327362299} +#define T_1024_169 {0.5088301425431069890947810563375242054462,-0.8608669386377673093946327753656078130007} +#define T_1024_171 {0.4982276669727818685373676999006420373917,-0.8670462455156926484534096744027920067310} +#define T_1024_173 {0.4875501601484359404103940960339969024062,-0.8730949784182900907936186740698758512735} +#define T_1024_175 {0.4767992300633221436356734557193703949451,-0.8790122264286335251881610020063817501068} +#define T_1024_177 {0.4659764957679661812051108427112922072411,-0.8847970984309377895371540034830104559660} +#define T_1024_179 {0.4550835871263438359157760260131908580661,-0.8904487232447578781702191008662339299917} +#define T_1024_181 {0.4441221445704292558609438401617808267474,-0.8959662497561851068894611671566963195801} +#define T_1024_183 {0.4330938188531519572599393086420604959130,-0.9013488470460220280955354610341601073742} +#define T_1024_185 {0.4220002707997996815869612419191980734468,-0.9065957045149153348262416329816915094852} +#define T_1024_187 {0.4108431710579039664033018652844475582242,-0.9117060320054298783176705001096706837416} +#define T_1024_189 {0.3996241998456468436096145069313934072852,-0.9166790599210427048504357117053586989641} +#define T_1024_191 {0.3883450466988263016787641390692442655563,-0.9215140393420419018255529408634174615145} +#define T_1024_193 {0.3770074102164182594520980273955501616001,-0.9262102421383113792785479745361953973770} +#define T_1024_195 {0.3656129978047738537938471381494309753180,-0.9307669610789837122410972369834780693054} +#define T_1024_197 
{0.3541635254204903993091591019037878140807,-0.9351835099389476102516027822275646030903} +#define T_1024_199 {0.3426607173119943783312635332549689337611,-0.9394592236021899189779560401802882552147} +#define T_1024_201 {0.3311063057598764292066562120453454554081,-0.9435934581619603855884292897826526314020} +#define T_1024_203 {0.3195020308160156918830807626363821327686,-0.9475855910177410912353934691054746508598} +#define T_1024_205 {0.3078496400415348666079751183133339509368,-0.9514350209690083381985914456890895962715} +#define T_1024_207 {0.2961508882436238443425224886595970019698,-0.9551411683057706714095047573209740221500} +#define T_1024_209 {0.2844075372112718214090421042783418670297,-0.9587034748958715990596601841389201581478} +#define T_1024_211 {0.2726213554499489766236308696534251794219,-0.9621214042690415801928338623838499188423} +#define T_1024_213 {0.2607941179152755140080444107297807931900,-0.9653944416976893982962337759090587496758} +#define T_1024_215 {0.2489276057457201762890264262750861234963,-0.9685220942744172667460134107386693358421} +#define T_1024_217 {0.2370236059943671980132506860172725282609,-0.9715038909862517835236417340638581663370} +#define T_1024_219 {0.2250839113597928320409380376077024266124,-0.9743393827855758582145995205792132765055} +#define T_1024_221 {0.2131103199160913619358836967876413837075,-0.9770281426577543948397419626417104154825} +#define T_1024_223 {0.2011046348420919005484108765813289210200,-0.9795697656854405188653345248894765973091} +#define T_1024_225 {0.1890686641498062203758934174402384087443,-0.9819638691095552429644044423184823244810} +#define T_1024_227 {0.1770042204121487494639097803883487358689,-0.9842100923869290252099517601891420781612} +#define T_1024_229 {0.1649131204899699221222419964760774746537,-0.9863080972445986693841746273392345756292} +#define T_1024_231 {0.1527971852584434353516940063855145126581,-0.9882575677307494643741847539786249399185} +#define T_1024_233 
{0.1406582393328492386341821429596166126430,-0.9900582102622971225613923706987407058477} +#define T_1024_235 {0.1284981107937931688045551936738775111735,-0.9917097536690995251973390622879378497601} +#define T_1024_237 {0.1163186309119047662230528317195421550423,-0.9932119492347945000076947508205194026232} +#define T_1024_239 {0.1041216338720545725449184715216688346118,-0.9945645707342554153740366018610075116158} +#define T_1024_241 {0.0919089564971327238618314936502429191023,-0.9957674144676598171344039656105451285839} +#define T_1024_243 {0.0796824379714301256338870871331891976297,-0.9968202992911656679098086897283792495728} +#define T_1024_245 {0.0674439195636640648201520775728567969054,-0.9977230666441916362430220033274963498116} +#define T_1024_247 {0.0551952443496899411434775117868412053213,-0.9984755805732947742114902212051674723625} +#define T_1024_249 {0.0429382569349408202419304814156930660829,-0.9990777277526453614697743432770948857069} +#define T_1024_251 {0.0306748031766366259509570824093316332437,-0.9995294175010931425617854984011501073837} +#define T_1024_253 {0.0184067299058048201854109748865084839053,-0.9998305817958234031905817573715467005968} +#define T_1024_255 {0.0061358846491544752690949771078976482386,-0.9999811752826011090888869148329831659794} +#define T_1024_257 {-0.0061358846491544752690949771078976482386,-0.9999811752826011090888869148329831659794} +#define T_1024_259 {-0.0184067299058048201854109748865084839053,-0.9998305817958234031905817573715467005968} +#define T_1024_261 {-0.0306748031766366259509570824093316332437,-0.9995294175010931425617854984011501073837} +#define T_1024_263 {-0.0429382569349408202419304814156930660829,-0.9990777277526453614697743432770948857069} +#define T_1024_265 {-0.0551952443496899411434775117868412053213,-0.9984755805732947742114902212051674723625} +#define T_1024_267 {-0.0674439195636640648201520775728567969054,-0.9977230666441916362430220033274963498116} +#define T_1024_269 
{-0.0796824379714301256338870871331891976297,-0.9968202992911656679098086897283792495728} +#define T_1024_271 {-0.0919089564971327238618314936502429191023,-0.9957674144676598171344039656105451285839} +#define T_1024_273 {-0.1041216338720545725449184715216688346118,-0.9945645707342554153740366018610075116158} +#define T_1024_275 {-0.1163186309119047662230528317195421550423,-0.9932119492347945000076947508205194026232} +#define T_1024_277 {-0.1284981107937931688045551936738775111735,-0.9917097536690995251973390622879378497601} +#define T_1024_279 {-0.1406582393328492386341821429596166126430,-0.9900582102622971225613923706987407058477} +#define T_1024_281 {-0.1527971852584434353516940063855145126581,-0.9882575677307494643741847539786249399185} +#define T_1024_283 {-0.1649131204899699221222419964760774746537,-0.9863080972445986693841746273392345756292} +#define T_1024_285 {-0.1770042204121487494639097803883487358689,-0.9842100923869290252099517601891420781612} +#define T_1024_287 {-0.1890686641498062203758934174402384087443,-0.9819638691095552429644044423184823244810} +#define T_1024_289 {-0.2011046348420919005484108765813289210200,-0.9795697656854405188653345248894765973091} +#define T_1024_291 {-0.2131103199160913619358836967876413837075,-0.9770281426577543948397419626417104154825} +#define T_1024_293 {-0.2250839113597928320409380376077024266124,-0.9743393827855758582145995205792132765055} +#define T_1024_295 {-0.2370236059943671980132506860172725282609,-0.9715038909862517835236417340638581663370} +#define T_1024_297 {-0.2489276057457201762890264262750861234963,-0.9685220942744172667460134107386693358421} +#define T_1024_299 {-0.2607941179152755140080444107297807931900,-0.9653944416976893982962337759090587496758} +#define T_1024_301 {-0.2726213554499489766236308696534251794219,-0.9621214042690415801928338623838499188423} +#define T_1024_303 {-0.2844075372112718214090421042783418670297,-0.9587034748958715990596601841389201581478} +#define T_1024_305 
{-0.2961508882436238443425224886595970019698,-0.9551411683057706714095047573209740221500} +#define T_1024_307 {-0.3078496400415348666079751183133339509368,-0.9514350209690083381985914456890895962715} +#define T_1024_309 {-0.3195020308160156918830807626363821327686,-0.9475855910177410912353934691054746508598} +#define T_1024_311 {-0.3311063057598764292066562120453454554081,-0.9435934581619603855884292897826526314020} +#define T_1024_313 {-0.3426607173119943783312635332549689337611,-0.9394592236021899189779560401802882552147} +#define T_1024_315 {-0.3541635254204903993091591019037878140807,-0.9351835099389476102516027822275646030903} +#define T_1024_317 {-0.3656129978047738537938471381494309753180,-0.9307669610789837122410972369834780693054} +#define T_1024_319 {-0.3770074102164182594520980273955501616001,-0.9262102421383113792785479745361953973770} +#define T_1024_321 {-0.3883450466988263016787641390692442655563,-0.9215140393420419018255529408634174615145} +#define T_1024_323 {-0.3996241998456468436096145069313934072852,-0.9166790599210427048504357117053586989641} +#define T_1024_325 {-0.4108431710579039664033018652844475582242,-0.9117060320054298783176705001096706837416} +#define T_1024_327 {-0.4220002707997996815869612419191980734468,-0.9065957045149153348262416329816915094852} +#define T_1024_329 {-0.4330938188531519572599393086420604959130,-0.9013488470460220280955354610341601073742} +#define T_1024_331 {-0.4441221445704292558609438401617808267474,-0.8959662497561851068894611671566963195801} +#define T_1024_333 {-0.4550835871263438359157760260131908580661,-0.8904487232447578781702191008662339299917} +#define T_1024_335 {-0.4659764957679661812051108427112922072411,-0.8847970984309377895371540034830104559660} +#define T_1024_337 {-0.4767992300633221436356734557193703949451,-0.8790122264286335251881610020063817501068} +#define T_1024_339 {-0.4875501601484359404103940960339969024062,-0.8730949784182900907936186740698758512735} +#define T_1024_341 
{-0.4982276669727818685373676999006420373917,-0.8670462455156926484534096744027920067310} +#define T_1024_343 {-0.5088301425431069890947810563375242054462,-0.8608669386377673093946327753656078130007} +#define T_1024_345 {-0.5193559901655896426930780762631911784410,-0.8545579883654005337589865121117327362299} +#define T_1024_347 {-0.5298036246862947162838963777176104485989,-0.8481203448032972325165701477089896798134} +#define T_1024_349 {-0.5401714727298928542253975138009991496801,-0.8415549774368984436989649111637845635414} +#define T_1024_351 {-0.5504579729366048113092801941093057394028,-0.8348628749863800102559707738691940903664} +#define T_1024_353 {-0.5606615761973360312353520384931471198797,-0.8280450452577557962641208177956286817789} +#define T_1024_355 {-0.5707807458869672556645014083187561482191,-0.8211025149911046483452992106322199106216} +#define T_1024_357 {-0.5808139580957645264902566850651055574417,-0.8140363297059484137818685667298268526793} +#define T_1024_359 {-0.5907597018588742754374720789201091974974,-0.8068475535437992229859105464129243046045} +#define T_1024_361 {-0.6006164793838689730520741250074934214354,-0.7995372691079050131435224102460779249668} +#define T_1024_363 {-0.6103828062763094752796177999698556959629,-0.7921065773002123888701930809475015848875} +#define T_1024_365 {-0.6200572117632892066296790289925411343575,-0.7845565971555752415866891169571317732334} +#define T_1024_367 {-0.6296382389149269842576472910877782851458,-0.7768884656732324422989677259465679526329} +#define T_1024_369 {-0.6391244448637757313846918805211316794157,-0.7691033376455795878001708842930383980274} +#define T_1024_371 {-0.6485144010221124410975335194962099194527,-0.7612023854842617787141989538213238120079} +#define T_1024_373 {-0.6578066932970786373502392052614595741034,-0.7531867990436125204212203243514522910118} +#define T_1024_375 {-0.6669999223036374713657892243645619601011,-0.7450577854414659473292203983874060213566} +#define T_1024_377 
{-0.6760927035753159231035169796086847782135,-0.7368165688773699040226006218290422111750} +#define T_1024_379 {-0.6850836677727003554139173502335324883461,-0.7284643904482251963727890142763499170542} +#define T_1024_381 {-0.6939714608896540015692266933911014348269,-0.7200025079613816547663418532465584576130} +#define T_1024_383 {-0.7027547444572252999250849825330078601837,-0.7114321957452164335578004283888731151819} +#define T_1024_385 {-0.7114321957452164335578004283888731151819,-0.7027547444572252999250849825330078601837} +#define T_1024_387 {-0.7200025079613816547663418532465584576130,-0.6939714608896540015692266933911014348269} +#define T_1024_389 {-0.7284643904482251963727890142763499170542,-0.6850836677727003554139173502335324883461} +#define T_1024_391 {-0.7368165688773699040226006218290422111750,-0.6760927035753159231035169796086847782135} +#define T_1024_393 {-0.7450577854414659473292203983874060213566,-0.6669999223036374713657892243645619601011} +#define T_1024_395 {-0.7531867990436125204212203243514522910118,-0.6578066932970786373502392052614595741034} +#define T_1024_397 {-0.7612023854842617787141989538213238120079,-0.6485144010221124410975335194962099194527} +#define T_1024_399 {-0.7691033376455795878001708842930383980274,-0.6391244448637757313846918805211316794157} +#define T_1024_401 {-0.7768884656732324422989677259465679526329,-0.6296382389149269842576472910877782851458} +#define T_1024_403 {-0.7845565971555752415866891169571317732334,-0.6200572117632892066296790289925411343575} +#define T_1024_405 {-0.7921065773002123888701930809475015848875,-0.6103828062763094752796177999698556959629} +#define T_1024_407 {-0.7995372691079050131435224102460779249668,-0.6006164793838689730520741250074934214354} +#define T_1024_409 {-0.8068475535437992229859105464129243046045,-0.5907597018588742754374720789201091974974} +#define T_1024_411 {-0.8140363297059484137818685667298268526793,-0.5808139580957645264902566850651055574417} +#define T_1024_413 
{-0.8211025149911046483452992106322199106216,-0.5707807458869672556645014083187561482191} +#define T_1024_415 {-0.8280450452577557962641208177956286817789,-0.5606615761973360312353520384931471198797} +#define T_1024_417 {-0.8348628749863800102559707738691940903664,-0.5504579729366048113092801941093057394028} +#define T_1024_419 {-0.8415549774368984436989649111637845635414,-0.5401714727298928542253975138009991496801} +#define T_1024_421 {-0.8481203448032972325165701477089896798134,-0.5298036246862947162838963777176104485989} +#define T_1024_423 {-0.8545579883654005337589865121117327362299,-0.5193559901655896426930780762631911784410} +#define T_1024_425 {-0.8608669386377673093946327753656078130007,-0.5088301425431069890947810563375242054462} +#define T_1024_427 {-0.8670462455156926484534096744027920067310,-0.4982276669727818685373676999006420373917} +#define T_1024_429 {-0.8730949784182900907936186740698758512735,-0.4875501601484359404103940960339969024062} +#define T_1024_431 {-0.8790122264286335251881610020063817501068,-0.4767992300633221436356734557193703949451} +#define T_1024_433 {-0.8847970984309377895371540034830104559660,-0.4659764957679661812051108427112922072411} +#define T_1024_435 {-0.8904487232447578781702191008662339299917,-0.4550835871263438359157760260131908580661} +#define T_1024_437 {-0.8959662497561851068894611671566963195801,-0.4441221445704292558609438401617808267474} +#define T_1024_439 {-0.9013488470460220280955354610341601073742,-0.4330938188531519572599393086420604959130} +#define T_1024_441 {-0.9065957045149153348262416329816915094852,-0.4220002707997996815869612419191980734468} +#define T_1024_443 {-0.9117060320054298783176705001096706837416,-0.4108431710579039664033018652844475582242} +#define T_1024_445 {-0.9166790599210427048504357117053586989641,-0.3996241998456468436096145069313934072852} +#define T_1024_447 {-0.9215140393420419018255529408634174615145,-0.3883450466988263016787641390692442655563} +#define T_1024_449 
{-0.9262102421383113792785479745361953973770,-0.3770074102164182594520980273955501616001} +#define T_1024_451 {-0.9307669610789837122410972369834780693054,-0.3656129978047738537938471381494309753180} +#define T_1024_453 {-0.9351835099389476102516027822275646030903,-0.3541635254204903993091591019037878140807} +#define T_1024_455 {-0.9394592236021899189779560401802882552147,-0.3426607173119943783312635332549689337611} +#define T_1024_457 {-0.9435934581619603855884292897826526314020,-0.3311063057598764292066562120453454554081} +#define T_1024_459 {-0.9475855910177410912353934691054746508598,-0.3195020308160156918830807626363821327686} +#define T_1024_461 {-0.9514350209690083381985914456890895962715,-0.3078496400415348666079751183133339509368} +#define T_1024_463 {-0.9551411683057706714095047573209740221500,-0.2961508882436238443425224886595970019698} +#define T_1024_465 {-0.9587034748958715990596601841389201581478,-0.2844075372112718214090421042783418670297} +#define T_1024_467 {-0.9621214042690415801928338623838499188423,-0.2726213554499489766236308696534251794219} +#define T_1024_469 {-0.9653944416976893982962337759090587496758,-0.2607941179152755140080444107297807931900} +#define T_1024_471 {-0.9685220942744172667460134107386693358421,-0.2489276057457201762890264262750861234963} +#define T_1024_473 {-0.9715038909862517835236417340638581663370,-0.2370236059943671980132506860172725282609} +#define T_1024_475 {-0.9743393827855758582145995205792132765055,-0.2250839113597928320409380376077024266124} +#define T_1024_477 {-0.9770281426577543948397419626417104154825,-0.2131103199160913619358836967876413837075} +#define T_1024_479 {-0.9795697656854405188653345248894765973091,-0.2011046348420919005484108765813289210200} +#define T_1024_481 {-0.9819638691095552429644044423184823244810,-0.1890686641498062203758934174402384087443} +#define T_1024_483 {-0.9842100923869290252099517601891420781612,-0.1770042204121487494639097803883487358689} +#define T_1024_485 
{-0.9863080972445986693841746273392345756292,-0.1649131204899699221222419964760774746537} +#define T_1024_487 {-0.9882575677307494643741847539786249399185,-0.1527971852584434353516940063855145126581} +#define T_1024_489 {-0.9900582102622971225613923706987407058477,-0.1406582393328492386341821429596166126430} +#define T_1024_491 {-0.9917097536690995251973390622879378497601,-0.1284981107937931688045551936738775111735} +#define T_1024_493 {-0.9932119492347945000076947508205194026232,-0.1163186309119047662230528317195421550423} +#define T_1024_495 {-0.9945645707342554153740366018610075116158,-0.1041216338720545725449184715216688346118} +#define T_1024_497 {-0.9957674144676598171344039656105451285839,-0.0919089564971327238618314936502429191023} +#define T_1024_499 {-0.9968202992911656679098086897283792495728,-0.0796824379714301256338870871331891976297} +#define T_1024_501 {-0.9977230666441916362430220033274963498116,-0.0674439195636640648201520775728567969054} +#define T_1024_503 {-0.9984755805732947742114902212051674723625,-0.0551952443496899411434775117868412053213} +#define T_1024_505 {-0.9990777277526453614697743432770948857069,-0.0429382569349408202419304814156930660829} +#define T_1024_507 {-0.9995294175010931425617854984011501073837,-0.0306748031766366259509570824093316332437} +#define T_1024_509 {-0.9998305817958234031905817573715467005968,-0.0184067299058048201854109748865084839053} +#define T_1024_511 {-0.9999811752826011090888869148329831659794,-0.0061358846491544752690949771078976482386} +#define T_1024_513 {-0.9999811752826011090888869148329831659794,0.0061358846491544752690949771078976482386} +#define T_1024_515 {-0.9998305817958234031905817573715467005968,0.0184067299058048201854109748865084839053} +#define T_1024_517 {-0.9995294175010931425617854984011501073837,0.0306748031766366259509570824093316332437} +#define T_1024_519 {-0.9990777277526453614697743432770948857069,0.0429382569349408202419304814156930660829} +#define T_1024_521 
{-0.9984755805732947742114902212051674723625,0.0551952443496899411434775117868412053213} +#define T_1024_523 {-0.9977230666441916362430220033274963498116,0.0674439195636640648201520775728567969054} +#define T_1024_525 {-0.9968202992911656679098086897283792495728,0.0796824379714301256338870871331891976297} +#define T_1024_527 {-0.9957674144676598171344039656105451285839,0.0919089564971327238618314936502429191023} +#define T_1024_529 {-0.9945645707342554153740366018610075116158,0.1041216338720545725449184715216688346118} +#define T_1024_531 {-0.9932119492347945000076947508205194026232,0.1163186309119047662230528317195421550423} +#define T_1024_533 {-0.9917097536690995251973390622879378497601,0.1284981107937931688045551936738775111735} +#define T_1024_535 {-0.9900582102622971225613923706987407058477,0.1406582393328492386341821429596166126430} +#define T_1024_537 {-0.9882575677307494643741847539786249399185,0.1527971852584434353516940063855145126581} +#define T_1024_539 {-0.9863080972445986693841746273392345756292,0.1649131204899699221222419964760774746537} +#define T_1024_541 {-0.9842100923869290252099517601891420781612,0.1770042204121487494639097803883487358689} +#define T_1024_543 {-0.9819638691095552429644044423184823244810,0.1890686641498062203758934174402384087443} +#define T_1024_545 {-0.9795697656854405188653345248894765973091,0.2011046348420919005484108765813289210200} +#define T_1024_547 {-0.9770281426577543948397419626417104154825,0.2131103199160913619358836967876413837075} +#define T_1024_549 {-0.9743393827855758582145995205792132765055,0.2250839113597928320409380376077024266124} +#define T_1024_551 {-0.9715038909862517835236417340638581663370,0.2370236059943671980132506860172725282609} +#define T_1024_553 {-0.9685220942744172667460134107386693358421,0.2489276057457201762890264262750861234963} +#define T_1024_555 {-0.9653944416976893982962337759090587496758,0.2607941179152755140080444107297807931900} +#define T_1024_557 
{-0.9621214042690415801928338623838499188423,0.2726213554499489766236308696534251794219} +#define T_1024_559 {-0.9587034748958715990596601841389201581478,0.2844075372112718214090421042783418670297} +#define T_1024_561 {-0.9551411683057706714095047573209740221500,0.2961508882436238443425224886595970019698} +#define T_1024_563 {-0.9514350209690083381985914456890895962715,0.3078496400415348666079751183133339509368} +#define T_1024_565 {-0.9475855910177410912353934691054746508598,0.3195020308160156918830807626363821327686} +#define T_1024_567 {-0.9435934581619603855884292897826526314020,0.3311063057598764292066562120453454554081} +#define T_1024_569 {-0.9394592236021899189779560401802882552147,0.3426607173119943783312635332549689337611} +#define T_1024_571 {-0.9351835099389476102516027822275646030903,0.3541635254204903993091591019037878140807} +#define T_1024_573 {-0.9307669610789837122410972369834780693054,0.3656129978047738537938471381494309753180} +#define T_1024_575 {-0.9262102421383113792785479745361953973770,0.3770074102164182594520980273955501616001} +#define T_1024_577 {-0.9215140393420419018255529408634174615145,0.3883450466988263016787641390692442655563} +#define T_1024_579 {-0.9166790599210427048504357117053586989641,0.3996241998456468436096145069313934072852} +#define T_1024_581 {-0.9117060320054298783176705001096706837416,0.4108431710579039664033018652844475582242} +#define T_1024_583 {-0.9065957045149153348262416329816915094852,0.4220002707997996815869612419191980734468} +#define T_1024_585 {-0.9013488470460220280955354610341601073742,0.4330938188531519572599393086420604959130} +#define T_1024_587 {-0.8959662497561851068894611671566963195801,0.4441221445704292558609438401617808267474} +#define T_1024_589 {-0.8904487232447578781702191008662339299917,0.4550835871263438359157760260131908580661} +#define T_1024_591 {-0.8847970984309377895371540034830104559660,0.4659764957679661812051108427112922072411} +#define T_1024_593 
{-0.8790122264286335251881610020063817501068,0.4767992300633221436356734557193703949451} +#define T_1024_595 {-0.8730949784182900907936186740698758512735,0.4875501601484359404103940960339969024062} +#define T_1024_597 {-0.8670462455156926484534096744027920067310,0.4982276669727818685373676999006420373917} +#define T_1024_599 {-0.8608669386377673093946327753656078130007,0.5088301425431069890947810563375242054462} +#define T_1024_601 {-0.8545579883654005337589865121117327362299,0.5193559901655896426930780762631911784410} +#define T_1024_603 {-0.8481203448032972325165701477089896798134,0.5298036246862947162838963777176104485989} +#define T_1024_605 {-0.8415549774368984436989649111637845635414,0.5401714727298928542253975138009991496801} +#define T_1024_607 {-0.8348628749863800102559707738691940903664,0.5504579729366048113092801941093057394028} +#define T_1024_609 {-0.8280450452577557962641208177956286817789,0.5606615761973360312353520384931471198797} +#define T_1024_611 {-0.8211025149911046483452992106322199106216,0.5707807458869672556645014083187561482191} +#define T_1024_613 {-0.8140363297059484137818685667298268526793,0.5808139580957645264902566850651055574417} +#define T_1024_615 {-0.8068475535437992229859105464129243046045,0.5907597018588742754374720789201091974974} +#define T_1024_617 {-0.7995372691079050131435224102460779249668,0.6006164793838689730520741250074934214354} +#define T_1024_619 {-0.7921065773002123888701930809475015848875,0.6103828062763094752796177999698556959629} +#define T_1024_621 {-0.7845565971555752415866891169571317732334,0.6200572117632892066296790289925411343575} +#define T_1024_623 {-0.7768884656732324422989677259465679526329,0.6296382389149269842576472910877782851458} +#define T_1024_625 {-0.7691033376455795878001708842930383980274,0.6391244448637757313846918805211316794157} +#define T_1024_627 {-0.7612023854842617787141989538213238120079,0.6485144010221124410975335194962099194527} +#define T_1024_629 
{-0.7531867990436125204212203243514522910118,0.6578066932970786373502392052614595741034} +#define T_1024_631 {-0.7450577854414659473292203983874060213566,0.6669999223036374713657892243645619601011} +#define T_1024_633 {-0.7368165688773699040226006218290422111750,0.6760927035753159231035169796086847782135} +#define T_1024_635 {-0.7284643904482251963727890142763499170542,0.6850836677727003554139173502335324883461} +#define T_1024_637 {-0.7200025079613816547663418532465584576130,0.6939714608896540015692266933911014348269} +#define T_1024_639 {-0.7114321957452164335578004283888731151819,0.7027547444572252999250849825330078601837} +#define T_1024_641 {-0.7027547444572252999250849825330078601837,0.7114321957452164335578004283888731151819} +#define T_1024_643 {-0.6939714608896540015692266933911014348269,0.7200025079613816547663418532465584576130} +#define T_1024_645 {-0.6850836677727003554139173502335324883461,0.7284643904482251963727890142763499170542} +#define T_1024_647 {-0.6760927035753159231035169796086847782135,0.7368165688773699040226006218290422111750} +#define T_1024_649 {-0.6669999223036374713657892243645619601011,0.7450577854414659473292203983874060213566} +#define T_1024_651 {-0.6578066932970786373502392052614595741034,0.7531867990436125204212203243514522910118} +#define T_1024_653 {-0.6485144010221124410975335194962099194527,0.7612023854842617787141989538213238120079} +#define T_1024_655 {-0.6391244448637757313846918805211316794157,0.7691033376455795878001708842930383980274} +#define T_1024_657 {-0.6296382389149269842576472910877782851458,0.7768884656732324422989677259465679526329} +#define T_1024_659 {-0.6200572117632892066296790289925411343575,0.7845565971555752415866891169571317732334} +#define T_1024_661 {-0.6103828062763094752796177999698556959629,0.7921065773002123888701930809475015848875} +#define T_1024_663 {-0.6006164793838689730520741250074934214354,0.7995372691079050131435224102460779249668} +#define T_1024_665 
{-0.5907597018588742754374720789201091974974,0.8068475535437992229859105464129243046045} +#define T_1024_667 {-0.5808139580957645264902566850651055574417,0.8140363297059484137818685667298268526793} +#define T_1024_669 {-0.5707807458869672556645014083187561482191,0.8211025149911046483452992106322199106216} +#define T_1024_671 {-0.5606615761973360312353520384931471198797,0.8280450452577557962641208177956286817789} +#define T_1024_673 {-0.5504579729366048113092801941093057394028,0.8348628749863800102559707738691940903664} +#define T_1024_675 {-0.5401714727298928542253975138009991496801,0.8415549774368984436989649111637845635414} +#define T_1024_677 {-0.5298036246862947162838963777176104485989,0.8481203448032972325165701477089896798134} +#define T_1024_679 {-0.5193559901655896426930780762631911784410,0.8545579883654005337589865121117327362299} +#define T_1024_681 {-0.5088301425431069890947810563375242054462,0.8608669386377673093946327753656078130007} +#define T_1024_683 {-0.4982276669727818685373676999006420373917,0.8670462455156926484534096744027920067310} +#define T_1024_685 {-0.4875501601484359404103940960339969024062,0.8730949784182900907936186740698758512735} +#define T_1024_687 {-0.4767992300633221436356734557193703949451,0.8790122264286335251881610020063817501068} +#define T_1024_689 {-0.4659764957679661812051108427112922072411,0.8847970984309377895371540034830104559660} +#define T_1024_691 {-0.4550835871263438359157760260131908580661,0.8904487232447578781702191008662339299917} +#define T_1024_693 {-0.4441221445704292558609438401617808267474,0.8959662497561851068894611671566963195801} +#define T_1024_695 {-0.4330938188531519572599393086420604959130,0.9013488470460220280955354610341601073742} +#define T_1024_697 {-0.4220002707997996815869612419191980734468,0.9065957045149153348262416329816915094852} +#define T_1024_699 {-0.4108431710579039664033018652844475582242,0.9117060320054298783176705001096706837416} +#define T_1024_701 
{-0.3996241998456468436096145069313934072852,0.9166790599210427048504357117053586989641} +#define T_1024_703 {-0.3883450466988263016787641390692442655563,0.9215140393420419018255529408634174615145} +#define T_1024_705 {-0.3770074102164182594520980273955501616001,0.9262102421383113792785479745361953973770} +#define T_1024_707 {-0.3656129978047738537938471381494309753180,0.9307669610789837122410972369834780693054} +#define T_1024_709 {-0.3541635254204903993091591019037878140807,0.9351835099389476102516027822275646030903} +#define T_1024_711 {-0.3426607173119943783312635332549689337611,0.9394592236021899189779560401802882552147} +#define T_1024_713 {-0.3311063057598764292066562120453454554081,0.9435934581619603855884292897826526314020} +#define T_1024_715 {-0.3195020308160156918830807626363821327686,0.9475855910177410912353934691054746508598} +#define T_1024_717 {-0.3078496400415348666079751183133339509368,0.9514350209690083381985914456890895962715} +#define T_1024_719 {-0.2961508882436238443425224886595970019698,0.9551411683057706714095047573209740221500} +#define T_1024_721 {-0.2844075372112718214090421042783418670297,0.9587034748958715990596601841389201581478} +#define T_1024_723 {-0.2726213554499489766236308696534251794219,0.9621214042690415801928338623838499188423} +#define T_1024_725 {-0.2607941179152755140080444107297807931900,0.9653944416976893982962337759090587496758} +#define T_1024_727 {-0.2489276057457201762890264262750861234963,0.9685220942744172667460134107386693358421} +#define T_1024_729 {-0.2370236059943671980132506860172725282609,0.9715038909862517835236417340638581663370} +#define T_1024_731 {-0.2250839113597928320409380376077024266124,0.9743393827855758582145995205792132765055} +#define T_1024_733 {-0.2131103199160913619358836967876413837075,0.9770281426577543948397419626417104154825} +#define T_1024_735 {-0.2011046348420919005484108765813289210200,0.9795697656854405188653345248894765973091} +#define T_1024_737 
{-0.1890686641498062203758934174402384087443,0.9819638691095552429644044423184823244810} +#define T_1024_739 {-0.1770042204121487494639097803883487358689,0.9842100923869290252099517601891420781612} +#define T_1024_741 {-0.1649131204899699221222419964760774746537,0.9863080972445986693841746273392345756292} +#define T_1024_743 {-0.1527971852584434353516940063855145126581,0.9882575677307494643741847539786249399185} +#define T_1024_745 {-0.1406582393328492386341821429596166126430,0.9900582102622971225613923706987407058477} +#define T_1024_747 {-0.1284981107937931688045551936738775111735,0.9917097536690995251973390622879378497601} +#define T_1024_749 {-0.1163186309119047662230528317195421550423,0.9932119492347945000076947508205194026232} +#define T_1024_751 {-0.1041216338720545725449184715216688346118,0.9945645707342554153740366018610075116158} +#define T_1024_753 {-0.0919089564971327238618314936502429191023,0.9957674144676598171344039656105451285839} +#define T_1024_755 {-0.0796824379714301256338870871331891976297,0.9968202992911656679098086897283792495728} +#define T_1024_757 {-0.0674439195636640648201520775728567969054,0.9977230666441916362430220033274963498116} +#define T_1024_759 {-0.0551952443496899411434775117868412053213,0.9984755805732947742114902212051674723625} +#define T_1024_761 {-0.0429382569349408202419304814156930660829,0.9990777277526453614697743432770948857069} +#define T_1024_763 {-0.0306748031766366259509570824093316332437,0.9995294175010931425617854984011501073837} +#define T_1024_765 {-0.0184067299058048201854109748865084839053,0.9998305817958234031905817573715467005968} +#define T_1024_767 {-0.0061358846491544752690949771078976482386,0.9999811752826011090888869148329831659794} +#define T_1024_769 {0.0061358846491544752690949771078976482386,0.9999811752826011090888869148329831659794} +#define T_1024_771 {0.0184067299058048201854109748865084839053,0.9998305817958234031905817573715467005968} +#define T_1024_773 
{0.0306748031766366259509570824093316332437,0.9995294175010931425617854984011501073837} +#define T_1024_775 {0.0429382569349408202419304814156930660829,0.9990777277526453614697743432770948857069} +#define T_1024_777 {0.0551952443496899411434775117868412053213,0.9984755805732947742114902212051674723625} +#define T_1024_779 {0.0674439195636640648201520775728567969054,0.9977230666441916362430220033274963498116} +#define T_1024_781 {0.0796824379714301256338870871331891976297,0.9968202992911656679098086897283792495728} +#define T_1024_783 {0.0919089564971327238618314936502429191023,0.9957674144676598171344039656105451285839} +#define T_1024_785 {0.1041216338720545725449184715216688346118,0.9945645707342554153740366018610075116158} +#define T_1024_787 {0.1163186309119047662230528317195421550423,0.9932119492347945000076947508205194026232} +#define T_1024_789 {0.1284981107937931688045551936738775111735,0.9917097536690995251973390622879378497601} +#define T_1024_791 {0.1406582393328492386341821429596166126430,0.9900582102622971225613923706987407058477} +#define T_1024_793 {0.1527971852584434353516940063855145126581,0.9882575677307494643741847539786249399185} +#define T_1024_795 {0.1649131204899699221222419964760774746537,0.9863080972445986693841746273392345756292} +#define T_1024_797 {0.1770042204121487494639097803883487358689,0.9842100923869290252099517601891420781612} +#define T_1024_799 {0.1890686641498062203758934174402384087443,0.9819638691095552429644044423184823244810} +#define T_1024_801 {0.2011046348420919005484108765813289210200,0.9795697656854405188653345248894765973091} +#define T_1024_803 {0.2131103199160913619358836967876413837075,0.9770281426577543948397419626417104154825} +#define T_1024_805 {0.2250839113597928320409380376077024266124,0.9743393827855758582145995205792132765055} +#define T_1024_807 {0.2370236059943671980132506860172725282609,0.9715038909862517835236417340638581663370} +#define T_1024_809 
{0.2489276057457201762890264262750861234963,0.9685220942744172667460134107386693358421} +#define T_1024_811 {0.2607941179152755140080444107297807931900,0.9653944416976893982962337759090587496758} +#define T_1024_813 {0.2726213554499489766236308696534251794219,0.9621214042690415801928338623838499188423} +#define T_1024_815 {0.2844075372112718214090421042783418670297,0.9587034748958715990596601841389201581478} +#define T_1024_817 {0.2961508882436238443425224886595970019698,0.9551411683057706714095047573209740221500} +#define T_1024_819 {0.3078496400415348666079751183133339509368,0.9514350209690083381985914456890895962715} +#define T_1024_821 {0.3195020308160156918830807626363821327686,0.9475855910177410912353934691054746508598} +#define T_1024_823 {0.3311063057598764292066562120453454554081,0.9435934581619603855884292897826526314020} +#define T_1024_825 {0.3426607173119943783312635332549689337611,0.9394592236021899189779560401802882552147} +#define T_1024_827 {0.3541635254204903993091591019037878140807,0.9351835099389476102516027822275646030903} +#define T_1024_829 {0.3656129978047738537938471381494309753180,0.9307669610789837122410972369834780693054} +#define T_1024_831 {0.3770074102164182594520980273955501616001,0.9262102421383113792785479745361953973770} +#define T_1024_833 {0.3883450466988263016787641390692442655563,0.9215140393420419018255529408634174615145} +#define T_1024_835 {0.3996241998456468436096145069313934072852,0.9166790599210427048504357117053586989641} +#define T_1024_837 {0.4108431710579039664033018652844475582242,0.9117060320054298783176705001096706837416} +#define T_1024_839 {0.4220002707997996815869612419191980734468,0.9065957045149153348262416329816915094852} +#define T_1024_841 {0.4330938188531519572599393086420604959130,0.9013488470460220280955354610341601073742} +#define T_1024_843 {0.4441221445704292558609438401617808267474,0.8959662497561851068894611671566963195801} +#define T_1024_845 
{0.4550835871263438359157760260131908580661,0.8904487232447578781702191008662339299917} +#define T_1024_847 {0.4659764957679661812051108427112922072411,0.8847970984309377895371540034830104559660} +#define T_1024_849 {0.4767992300633221436356734557193703949451,0.8790122264286335251881610020063817501068} +#define T_1024_851 {0.4875501601484359404103940960339969024062,0.8730949784182900907936186740698758512735} +#define T_1024_853 {0.4982276669727818685373676999006420373917,0.8670462455156926484534096744027920067310} +#define T_1024_855 {0.5088301425431069890947810563375242054462,0.8608669386377673093946327753656078130007} +#define T_1024_857 {0.5193559901655896426930780762631911784410,0.8545579883654005337589865121117327362299} +#define T_1024_859 {0.5298036246862947162838963777176104485989,0.8481203448032972325165701477089896798134} +#define T_1024_861 {0.5401714727298928542253975138009991496801,0.8415549774368984436989649111637845635414} +#define T_1024_863 {0.5504579729366048113092801941093057394028,0.8348628749863800102559707738691940903664} +#define T_1024_865 {0.5606615761973360312353520384931471198797,0.8280450452577557962641208177956286817789} +#define T_1024_867 {0.5707807458869672556645014083187561482191,0.8211025149911046483452992106322199106216} +#define T_1024_869 {0.5808139580957645264902566850651055574417,0.8140363297059484137818685667298268526793} +#define T_1024_871 {0.5907597018588742754374720789201091974974,0.8068475535437992229859105464129243046045} +#define T_1024_873 {0.6006164793838689730520741250074934214354,0.7995372691079050131435224102460779249668} +#define T_1024_875 {0.6103828062763094752796177999698556959629,0.7921065773002123888701930809475015848875} +#define T_1024_877 {0.6200572117632892066296790289925411343575,0.7845565971555752415866891169571317732334} +#define T_1024_879 {0.6296382389149269842576472910877782851458,0.7768884656732324422989677259465679526329} +#define T_1024_881 
{0.6391244448637757313846918805211316794157,0.7691033376455795878001708842930383980274} +#define T_1024_883 {0.6485144010221124410975335194962099194527,0.7612023854842617787141989538213238120079} +#define T_1024_885 {0.6578066932970786373502392052614595741034,0.7531867990436125204212203243514522910118} +#define T_1024_887 {0.6669999223036374713657892243645619601011,0.7450577854414659473292203983874060213566} +#define T_1024_889 {0.6760927035753159231035169796086847782135,0.7368165688773699040226006218290422111750} +#define T_1024_891 {0.6850836677727003554139173502335324883461,0.7284643904482251963727890142763499170542} +#define T_1024_893 {0.6939714608896540015692266933911014348269,0.7200025079613816547663418532465584576130} +#define T_1024_895 {0.7027547444572252999250849825330078601837,0.7114321957452164335578004283888731151819} +#define T_1024_897 {0.7114321957452164335578004283888731151819,0.7027547444572252999250849825330078601837} +#define T_1024_899 {0.7200025079613816547663418532465584576130,0.6939714608896540015692266933911014348269} +#define T_1024_901 {0.7284643904482251963727890142763499170542,0.6850836677727003554139173502335324883461} +#define T_1024_903 {0.7368165688773699040226006218290422111750,0.6760927035753159231035169796086847782135} +#define T_1024_905 {0.7450577854414659473292203983874060213566,0.6669999223036374713657892243645619601011} +#define T_1024_907 {0.7531867990436125204212203243514522910118,0.6578066932970786373502392052614595741034} +#define T_1024_909 {0.7612023854842617787141989538213238120079,0.6485144010221124410975335194962099194527} +#define T_1024_911 {0.7691033376455795878001708842930383980274,0.6391244448637757313846918805211316794157} +#define T_1024_913 {0.7768884656732324422989677259465679526329,0.6296382389149269842576472910877782851458} +#define T_1024_915 {0.7845565971555752415866891169571317732334,0.6200572117632892066296790289925411343575} +#define T_1024_917 
{0.7921065773002123888701930809475015848875,0.6103828062763094752796177999698556959629} +#define T_1024_919 {0.7995372691079050131435224102460779249668,0.6006164793838689730520741250074934214354} +#define T_1024_921 {0.8068475535437992229859105464129243046045,0.5907597018588742754374720789201091974974} +#define T_1024_923 {0.8140363297059484137818685667298268526793,0.5808139580957645264902566850651055574417} +#define T_1024_925 {0.8211025149911046483452992106322199106216,0.5707807458869672556645014083187561482191} +#define T_1024_927 {0.8280450452577557962641208177956286817789,0.5606615761973360312353520384931471198797} +#define T_1024_929 {0.8348628749863800102559707738691940903664,0.5504579729366048113092801941093057394028} +#define T_1024_931 {0.8415549774368984436989649111637845635414,0.5401714727298928542253975138009991496801} +#define T_1024_933 {0.8481203448032972325165701477089896798134,0.5298036246862947162838963777176104485989} +#define T_1024_935 {0.8545579883654005337589865121117327362299,0.5193559901655896426930780762631911784410} +#define T_1024_937 {0.8608669386377673093946327753656078130007,0.5088301425431069890947810563375242054462} +#define T_1024_939 {0.8670462455156926484534096744027920067310,0.4982276669727818685373676999006420373917} +#define T_1024_941 {0.8730949784182900907936186740698758512735,0.4875501601484359404103940960339969024062} +#define T_1024_943 {0.8790122264286335251881610020063817501068,0.4767992300633221436356734557193703949451} +#define T_1024_945 {0.8847970984309377895371540034830104559660,0.4659764957679661812051108427112922072411} +#define T_1024_947 {0.8904487232447578781702191008662339299917,0.4550835871263438359157760260131908580661} +#define T_1024_949 {0.8959662497561851068894611671566963195801,0.4441221445704292558609438401617808267474} +#define T_1024_951 {0.9013488470460220280955354610341601073742,0.4330938188531519572599393086420604959130} +#define T_1024_953 
{0.9065957045149153348262416329816915094852,0.4220002707997996815869612419191980734468} +#define T_1024_955 {0.9117060320054298783176705001096706837416,0.4108431710579039664033018652844475582242} +#define T_1024_957 {0.9166790599210427048504357117053586989641,0.3996241998456468436096145069313934072852} +#define T_1024_959 {0.9215140393420419018255529408634174615145,0.3883450466988263016787641390692442655563} +#define T_1024_961 {0.9262102421383113792785479745361953973770,0.3770074102164182594520980273955501616001} +#define T_1024_963 {0.9307669610789837122410972369834780693054,0.3656129978047738537938471381494309753180} +#define T_1024_965 {0.9351835099389476102516027822275646030903,0.3541635254204903993091591019037878140807} +#define T_1024_967 {0.9394592236021899189779560401802882552147,0.3426607173119943783312635332549689337611} +#define T_1024_969 {0.9435934581619603855884292897826526314020,0.3311063057598764292066562120453454554081} +#define T_1024_971 {0.9475855910177410912353934691054746508598,0.3195020308160156918830807626363821327686} +#define T_1024_973 {0.9514350209690083381985914456890895962715,0.3078496400415348666079751183133339509368} +#define T_1024_975 {0.9551411683057706714095047573209740221500,0.2961508882436238443425224886595970019698} +#define T_1024_977 {0.9587034748958715990596601841389201581478,0.2844075372112718214090421042783418670297} +#define T_1024_979 {0.9621214042690415801928338623838499188423,0.2726213554499489766236308696534251794219} +#define T_1024_981 {0.9653944416976893982962337759090587496758,0.2607941179152755140080444107297807931900} +#define T_1024_983 {0.9685220942744172667460134107386693358421,0.2489276057457201762890264262750861234963} +#define T_1024_985 {0.9715038909862517835236417340638581663370,0.2370236059943671980132506860172725282609} +#define T_1024_987 {0.9743393827855758582145995205792132765055,0.2250839113597928320409380376077024266124} +#define T_1024_989 
{0.9770281426577543948397419626417104154825,0.2131103199160913619358836967876413837075} +#define T_1024_991 {0.9795697656854405188653345248894765973091,0.2011046348420919005484108765813289210200} +#define T_1024_993 {0.9819638691095552429644044423184823244810,0.1890686641498062203758934174402384087443} +#define T_1024_995 {0.9842100923869290252099517601891420781612,0.1770042204121487494639097803883487358689} +#define T_1024_997 {0.9863080972445986693841746273392345756292,0.1649131204899699221222419964760774746537} +#define T_1024_999 {0.9882575677307494643741847539786249399185,0.1527971852584434353516940063855145126581} +#define T_1024_1001 {0.9900582102622971225613923706987407058477,0.1406582393328492386341821429596166126430} +#define T_1024_1003 {0.9917097536690995251973390622879378497601,0.1284981107937931688045551936738775111735} +#define T_1024_1005 {0.9932119492347945000076947508205194026232,0.1163186309119047662230528317195421550423} +#define T_1024_1007 {0.9945645707342554153740366018610075116158,0.1041216338720545725449184715216688346118} +#define T_1024_1009 {0.9957674144676598171344039656105451285839,0.0919089564971327238618314936502429191023} +#define T_1024_1011 {0.9968202992911656679098086897283792495728,0.0796824379714301256338870871331891976297} +#define T_1024_1013 {0.9977230666441916362430220033274963498116,0.0674439195636640648201520775728567969054} +#define T_1024_1015 {0.9984755805732947742114902212051674723625,0.0551952443496899411434775117868412053213} +#define T_1024_1017 {0.9990777277526453614697743432770948857069,0.0429382569349408202419304814156930660829} +#define T_1024_1019 {0.9995294175010931425617854984011501073837,0.0306748031766366259509570824093316332437} +#define T_1024_1021 {0.9998305817958234031905817573715467005968,0.0184067299058048201854109748865084839053} +#define T_1024_1023 {0.9999811752826011090888869148329831659794,0.0061358846491544752690949771078976482386} +// Pre-computed twiddles for N=1250 +#define T_1250_1 
{0.9999873669329657488447082869242876768112,-0.0050265270788188622791414772450480086263} +#define T_1250_3 {0.9998863043118163540512455256248358637094,-0.0150790732360371205339477640450240869541} +#define T_1250_7 {0.9993810422739491938770584056328516453505,-0.0351785779052378247411247969012038083747} +#define T_1250_9 {0.9989768939209825449054847013030666857958,-0.0452235050829326162835641866877267602831} +#define T_1250_11 {0.9984717850692077512064770417055115103722,-0.0552638617969271883634263531348551623523} +#define T_1250_13 {0.9978657667668942021776956607936881482601,-0.0652986333296357579492052991554373875260} +#define T_1250_17 {0.9963512569890512482473354793910402804613,-0.0853473649056469552132142553091398440301} +#define T_1250_19 {0.9954429185757831533010175917297601699829,-0.0953592987459886270995568224861926864833} +#define T_1250_21 {0.9944339768210305186002528898825403302908,-0.1053615952038983727412357893626904115081} +#define T_1250_23 {0.9933245336923797186301499095861800014973,-0.1153532434083082286457866416640172246844} +#define T_1250_27 {0.9908046019577008145517993398243561387062,-0.1353005570551802927425910638703498989344} +#define T_1250_29 {0.9893943680257961670321265046368353068829,-0.1452542065443727936635553987798630259931} +#define T_1250_31 {0.9878841420425021890849848205107264220715,-0.1551931760772660706670933450368465855718} +#define T_1250_33 {0.9862740766371433931425372065859846770763,-0.1651164611828580541796895886363927274942} +#define T_1250_37 {0.9827550885118900847459144642925821244717,-0.1849119682551322307872965211572591215372} +#define T_1250_39 {0.9808465214346523408650568853772711008787,-0.1947821896107088668337325998436426743865} +#define T_1250_41 {0.9788388261847201787801964201207738369703,-0.2046327255189626659337420733209000900388} +#define T_1250_43 {0.9767322056676015495924048082088120281696,-0.2144625804463383411668786493464722298086} +#define T_1250_47 
{0.9722230504211841761730283906217664480209,-0.2340562757751382949589213922081398777664} +#define T_1250_49 {0.9698209714046909235563020956760738044977,-0.2438181359613384724394080649290117435157} +#define T_1250_51 {0.9673208785005658061351141441264189779758,-0.2535553549363405156924500261084176599979} +#define T_1250_53 {0.9647230243779428837669343010929878801107,-0.2632669486188402574455835747357923537493} +#define T_1250_57 {0.9592350925278542161933614806912373751402,-0.2826093368292684759879307421215344220400} +#define T_1250_59 {0.9563455694321727484918937989277765154839,-0.2922381765400498543350238378479843959212} +#define T_1250_61 {0.9533593943255084246501951383834239095449,-0.3018374815215959516834232090332079678774} +#define T_1250_63 {0.9502768690023574160719022074772510677576,-0.3114062816307281988947863737848820164800} +#define T_1250_67 {0.9438240235407373557308119416120462119579,-0.3304485021714786974200706026749685406685} +#define T_1250_69 {0.9404543555519829789446362156013492494822,-0.3399199981220644151136411892366595566273} +#define T_1250_71 {0.9369896415802629885405394816189073026180,-0.3493571404326670237772134441911475732923} +#define T_1250_73 {0.9334302317830763806583149744255933910608,-0.3587589753488995003571915276552317664027} +#define T_1250_77 {0.9260287731575901348790580414060968905687,-0.3774529259182506923941957666102098301053} +#define T_1250_79 {0.9221874723495493419989088579313829541206,-0.3867431522878577720980786125437589362264} +#define T_1250_81 {0.9182529716809616493478074517042841762304,-0.3959942928870098799443155712651787325740} +#define T_1250_83 {0.9142256687878009824288483287091366946697,-0.4052054127593781029936792492662789300084} +#define T_1250_87 {0.9058942937252941751680168636084999889135,-0.4235038708157818909683101082919165492058} +#define T_1250_89 {0.9015910635571866560411535829189233481884,-0.4325893596863208401259726088028401136398} +#define T_1250_91 
{0.8971967150818874747741915598453488200903,-0.4416311293899813361640838138555409386754} +#define T_1250_93 {0.8927116924093823069341624432126991450787,-0.4506282661302732917008029289718251675367} +#define T_1250_97 {0.8834714466863258230944211391033604741096,-0.4684850081805932986434015674603870138526} +#define T_1250_99 {0.8787171574910247473511049065564293414354,-0.4773428088186660311897924202639842405915} +#define T_1250_101 {0.8738740617146393274694560204807203263044,-0.4861523673318466909343271709076361730695} +#define T_1250_103 {0.8689426488193063846665609162300825119019,-0.4949127933918132660195965399907436221838} +#define T_1250_107 {0.8588168740983388449450330881518311798573,-0.5122827117558800269492280676786322146654} +#define T_1250_109 {0.8536235356229666670913047710200771689415,-0.5208904485883242285737537713430356234312} +#define T_1250_111 {0.8483439266257484234756702790036797523499,-0.5294455422016571821686170551402028650045} +#define T_1250_113 {0.8429785806845486728633431994239799678326,-0.5379471279847712894550681994587648659945} +#define T_1250_117 {0.8319928555514105594781426589179318398237,-0.5547863447412973991745843704848084598780} +#define T_1250_119 {0.8263735866196754153989445512706879526377,-0.5631222738778264291781283645832445472479} +#define T_1250_121 {0.8206708011521579582137064790003933012486,-0.5714012916823650867215178550395648926497} +#define T_1250_123 {0.8148850754945862195910422087763436138630,-0.5796225614451032681628817044838797301054} +#define T_1250_129 {0.7970361462173010114184990015928633511066,-0.6039316034312769421177335971151478588581} +#define T_1250_141 {0.7591804347117565576752440392738208174706,-0.6508802251957486184608114854199811816216} +#define T_1250_147 {0.7392080827754685667940748317050747573376,-0.6734771045547250922780335713468957692385} +#define T_1250_153 {0.7185634108878365955419553756655659526587,-0.6954614471940469000443840741354506462812} +#define T_1250_159 
{0.6972651956628107550173467643617186695337,-0.7168132580507298179384179093176499009132} +#define T_1250_171 {0.6527861960715837241764347709249705076218,-0.7575421983087092803188511425105389207602} +#define T_1250_177 {0.6296458659692600878798884878051467239857,-0.7768822841768375786486444667389150708914} +#define T_1250_183 {0.6059328642635862793852652430359739810228,-0.7955157848875950010736346484918612986803} +#define T_1250_189 {0.5816687582568207393052261977572925388813,-0.8134257530149683335096710834477562457323} +#define T_1250_201 {0.5315759886622380969711798570642713457346,-0.8470106069452519870210949193278793245554} +#define T_1250_207 {0.5057928851395788738543046747508924454451,-0.8626549468600877013102490309393033385277} +#define T_1250_213 {0.4795497560086358079800561426964122802019,-0.8775146901972968871419311653880868107080} +#define T_1250_219 {0.4528704697574726933950728380295913666487,-0.8915763218152700231300400446343701332808} +#define T_1250_231 {0.3983008612318227892323818650766042992473,-0.9172548304271764818551559983461629599333} +#define T_1250_237 {0.3704601707692759537948745673929806798697,-0.9288483524631988563768913991225417703390} +#define T_1250_243 {0.3422825416695712275050311745872022584081,-0.9395970741058202735374038638838101178408} +#define T_1250_249 {0.3137936018751875111298943465953925624490,-0.9494912192443889509263499348890036344528} +#define T_1250_261 {0.2559856941126726903590338224603328853846,-0.9666805699969629461776321477373130619526} +#define T_1250_267 {0.2267193032343216008150932339049177244306,-0.9739601416592692517681939534668345004320} +#define T_1250_273 {0.1972467080212628731672452886414248496294,-0.9803538831334196457234497756871860474348} +#define T_1250_279 {0.1675947142055242422564731441525509580970,-0.9858559792233186902521424599399324506521} +#define T_1250_291 {0.1078605449930568199734537415679369587451,-0.9941660338363007554107753094285726547241} +#define T_1250_297 
{0.0778326986474765225576533111961907707155,-0.9969664342500459408569213337614201009274} +#define T_1250_303 {0.0477340623884916423480540004220529226586,-0.9988600799350685344180078573117498308420} +#define T_1250_309 {0.0175920113410997225322773829248035326600,-0.9998452485944885337332266317389439791441} +#define T_1250_321 {-0.0427126621211368767694338544060883577913,-0.9990873978258987264666757255326956510544} +#define T_1250_327 {-0.0728204366033457090034985981219506356865,-0.9973450676736201891969813004834577441216} +#define T_1250_333 {-0.1028619798936414586609799926009145565331,-0.9946956384202959577933711443620268255472} +#define T_1250_339 {-0.1328099687938694151156937550695147365332,-0.9911415197583902658706733745930250734091} +#define T_1250_351 {-0.1923164408499827837673024077957961708307,-0.9813329641761735500082863836723845452070} +#define T_1250_357 {-0.2218208020484253029103172139002708718181,-0.9750874482725092606827388408419210463762} +#define T_1250_363 {-0.2511234141665813490718051070871297270060,-0.9679550768797691162603769043926149606705} +#define T_1250_369 {-0.2801976260743935798203096965153235942125,-0.9599423369881517897894696034200023859739} +#define T_1250_381 {-0.3375553074551057575192203330516349524260,-0.9413056965773068940350754019164014607668} +#define T_1250_387 {-0.3657866093253325234790906961279688403010,-0.9306987463396931836712155927671119570732} +#define T_1250_393 {-0.3936852232270234286382049049279885366559,-0.9192453127499148335388667874212842434645} +#define T_1250_399 {-0.4212257749858722144509215468133334070444,-0.9069558128638635086105068694450892508030} +#define T_1250_411 {-0.4751328464720838140600278620695462450385,-0.8799140743296104405501978362735826522112} +#define T_1250_417 {-0.5014503369740861415948529611341655254364,-0.8651864305157446199956439158995635807514} +#define T_1250_423 {-0.5273117514752824197543645823316182941198,-0.8496718877049364015974219910276588052511} +#define T_1250_429 
{-0.5526935686623315469034878333332017064095,-0.8333845565879515193685733720485586673021} +#define T_1250_441 {-0.6019265278387284645589261344866827130318,-0.7985514730335249167225697419780772179365} +#define T_1250_447 {-0.6257328917723579131404676445527002215385,-0.7800374017662247139170972332067321985960} +#define T_1250_453 {-0.6489701430066630027937435443163849413395,-0.7608138757185697320650774599926080554724} +#define T_1250_459 {-0.6716171469410040506531345272378530353308,-0.7408983789527587848766643219278194010258} +#define T_1250_471 {-0.7150585774315030285208649729611352086067,-0.6990645398256409848158909881021827459335} +#define T_1250_477 {-0.7358134934071977051317503537575248628855,-0.6771842459182698670261402185133192688227} +#define T_1250_483 {-0.7558991768574195857510744644969236105680,-0.6546880435950205301409710045845713466406} +#define T_1250_489 {-0.7752973595759157232620850663806777447462,-0.6315963934623227693521130277076736092567} +#define T_1250_501 {-0.8119612924962041899945575096353422850370,-0.5837112809325288864670255861710757017136} +#define T_1250_507 {-0.8291936963450933228969574884104076772928,-0.5589613707060275826776774010795634239912} +#define T_1250_513 {-0.8456719370854187678432367647474166005850,-0.5337030773999673627372430928517132997513} +#define T_1250_519 {-0.8613810275298499075091740451171062886715,-0.5079593737806400444156906814896501600742} +#define T_1250_531 {-0.8904353195405447785049091180553659796715,-0.4551098128086537575320846826798515394330} +#define T_1250_537 {-0.9037540958273154734214926975255366414785,-0.4280520228609502875194436910533113405108} +#define T_1250_543 {-0.9162508952933617889513584486849140375853,-0.4006049136919229147757448572519933804870} +#define T_1250_549 {-0.9279143519271306761453388389782048761845,-0.3727934488260932766046096276113530620933} +#define T_1250_561 {-0.9486995719838390295208796487713698297739,-0.3161789400287130513689248800801578909159} +#define T_1250_567 
{-0.9578024309623294207938215549802407622337,-0.2874273877741164162635811862855916842818} +#define T_1250_573 {-0.9660341554134970198930432161432690918446,-0.2584144163442346764369972333952318876982} +#define T_1250_579 {-0.9733872584703672226069670614378992468119,-0.2291664134369226779597283893963322043419} +#define T_1250_591 {-0.9854316546073247362613756195059977471828,-0.1700719086088891551789004097372526302934} +#define T_1250_597 {-0.9901119931429293030689109400555025786161,-0.1402791539557319833164683586801402270794} +#define T_1250_603 {-0.9938918111585802739327277777192648500204,-0.1103588134763917733005555987801926676184} +#define T_1250_609 {-0.9967676708576360677938055232516489923000,-0.0803381001333942218467498719292052555829} +#define T_1250_621 {-0.9997978773111629857694993006589356809855,-0.0201048383254575774303773982865095604211} +#define T_1250_627 {-0.9999494680510517818916582655219826847315,0.0100529271567306524581830728948261821643} +#define T_1250_633 {-0.9991915909518144234269243497692514210939,0.0402015493629532053021513604562642285600} +#define T_1250_639 {-0.9975249353131302454400497481401544064283,0.0703136077053038016648756070026138331741} +#define T_1250_651 {-0.9914721769705635567859758339182008057833,0.1303185416326145262821967207855777814984} +#define T_1250_657 {-0.9870915793339134491901631918153725564480,0.1601568419021817957226261341929784975946} +#define T_1250_663 {-0.9818132082853465725236219441285356879234,0.1898494773140938784994347088286303915083} +#define T_1250_669 {-0.9756418645761659735882176391896791756153,0.2193694420067258954443190077654435299337} +#define T_1250_681 {-0.9606435179309280592008235544199123978615,0.2777841454390272413199625134438974782825} +#define T_1250_687 {-0.9518301561981014069502293750701937824488,0.3066257551998819441685384390439139679074} +#define T_1250_693 {-0.9421510918042592663823597831651568412781,0.3351884846053667099674555629462702199817} +#define T_1250_699 
{-0.9316151279920259486999611908686347305775,0.3634463554589605904787674717226764187217} +#define T_1250_711 {-0.9080116031765956829957531226682476699352,0.4189450184650349573445282658212818205357} +#define T_1250_717 {-0.8949655099049148576639822749712038785219,0.4461353338177057592517371631402056664228} +#define T_1250_723 {-0.8811054331526765315274474232865031808615,0.4729198829282125204898079573467839509249} +#define T_1250_729 {-0.8664439788509370465874326328048482537270,0.4992743048795490468094726566050667315722} +#define T_1250_741 {-0.8347709935146954141060859910794533789158,0.5505973014704015344733534220722503960133} +#define T_1250_747 {-0.8177882694974004662569200263533275574446,0.5755191971363310399212309675931464880705} +// Pre-computed twiddles for N=1296 +#define T_1296_1 {0.9999882478077495306933997198939323425293,-0.0048481178190018601320554481048930028919} +#define T_1296_5 {0.9997062090049132487834526727965567260981,-0.0242383101107481388480913153671281179413} +#define T_1296_7 {0.9994241967185149011498879190185107290745,-0.0339304437570622369202411050537193659693} +#define T_1296_11 {0.9985783189429906503420397712034173309803,-0.0533042300102982355425318417019298067316} +#define T_1296_13 {0.9980145329807432741375805562711320817471,-0.0629840611522379661968429331864172127098} +#define T_1296_17 {0.9966055319295779035826399194775149226189,-0.0823250492095999042430065628650481812656} +#define T_1296_19 {0.9957604493106914089750603125139605253935,-0.0919843877436274381942382660781731829047} +#define T_1296_23 {0.9937895171394426352406981095555238425732,-0.1112762131982996532952867596577561926097} +#define T_1296_25 {0.9926638528881819301119548981660045683384,-0.1209068863596693688133854038824210874736} +#define T_1296_29 {0.9901326572022358663005547896318603307009,-0.1401332264013062656182739829091588035226} +#define T_1296_31 {0.9887273637429387918018619529902935028076,-0.1497270856790396120761243992092204280198} +#define T_1296_35 
{0.9856380461865491549033890805731061846018,-0.1688716729044927866709713271120563149452} +#define T_1296_37 {0.9839543125377806509490596909017767757177,-0.1784206009358321187718843248148914426565} +#define T_1296_41 {0.9803094869820240253588394807593431323767,-0.1974672371129975179115945138619281351566} +#define T_1296_43 {0.9783487377505475368622001042240299284458,-0.2069631545515053794304094481049105525017} +#define T_1296_47 {0.9741514880817274679714046214940026402473,-0.2258957243246448032270023986711748875678} +#define T_1296_49 {0.9719153822571453549272746386122889816761,-0.2353305966761375933593569698132341727614} +#define T_1296_53 {0.9671692597675166647164246569445822387934,-0.2541330812010785256127576303697424009442} +#define T_1296_55 {0.9646596893185994670005811713053844869137,-0.2634989256216107578190133153839269652963} +#define T_1296_59 {0.9593687097016201370536236936459317803383,-0.2821554161192877385744282037194352596998} +#define T_1296_61 {0.9565877979755121884863910963758826255798,-0.2914443081694350068389098851184826344252} +#define T_1296_65 {0.9507564379281665534193734856671653687954,-0.3099390193863051368161620757746277377009} +#define T_1296_67 {0.9477065378538220841164729790762066841125,-0.3191430997360308330357270278909709304571} +#define T_1296_71 {0.9413397312888872603409140538133215159178,-0.3374603832999741315212816061830380931497} +#define T_1296_73 {0.9380234233862577708507046736485790461302,-0.3465718643784075347902273733780020847917} +#define T_1296_77 {0.9311265572577218652128294706926681101322,-0.3646962220388118569580626626702724024653} +#define T_1296_79 {0.9275466474543785366790871194098144769669,-0.3737073946233104848602124548051506280899} +#define T_1296_83 {0.9201255571995390347694865340599790215492,-0.3916234913641390424920984969503479078412} +#define T_1296_85 {0.9162850744565779192996046731423120945692,-0.4005267311030605847932406504696700721979} +#define T_1296_89 
{0.9083460390586792776446145580848678946495,-0.4182194081178063904680186624318594112992} +#define T_1296_91 {0.9042482328079178843793783926230389624834,-0.4270071819814714175755909764120588079095} +#define T_1296_95 {0.8957979694835052075774228796944953501225,-0.4444614694990209891045651602325960993767} +#define T_1296_97 {0.8914463068781385279848450409190263599157,-0.4531263421534081903274682190385647118092} +#define T_1296_101 {0.8824919653936212915468217943271156400442,-0.4703274721039624739482576387672452256083} +#define T_1296_103 {0.8778901283746644024219563107180874794722,-0.4788621122017435172146804234216688200831} +#define T_1296_107 {0.8684392849969004846144571274635381996632,-0.4957955307120791577091267754440195858479} +#define T_1296_109 {0.8635911671778986331560190592426806688309,-0.5041927170956703729842729444499127566814} +#define T_1296_113 {0.8536518182639163399016979383304715156555,-0.5208440968031696760576210181170608848333} +#define T_1296_115 {0.8485615216365590685043684970878530293703,-0.5290967246145524827127815115090925246477} +#define T_1296_119 {0.8381420768678403687701461421966087073088,-0.5454519767895824600145715521648526191711} +#define T_1296_121 {0.8328139083312672275027921386936213821173,-0.5535530634817222850330153960385359823704} +#define T_1296_125 {0.8219231835983180634030986766447313129902,-0.5695983499481065415182001743232831358910} +#define T_1296_127 {0.8163616513150518949615275232645217329264,-0.5775410411928850740181928813399281352758} +#define T_1296_131 {0.8050088612582783076376813369279261678457,-0.5932627860363820504474574590858537703753} +#define T_1296_133 {0.7992186708398696382005255145486444234848,-0.6010403615240428321087051699578296393156} +#define T_1296_137 {0.7874134210530723265719643677584826946259,-0.6164252625789253814403423348267097026110} +#define T_1296_139 {0.7813994715786822808922806871123611927032,-0.6240311417041268793326480590621940791607} +#define T_1296_143 
{0.7691517504817649841797333465365227311850,-0.6390661818081415646020104759372770786285} +#define T_1296_145 {0.7629191303530551415690297289984300732613,-0.6464939292378065749389293159765657037497} +#define T_1296_149 {0.7502393007408245662048784652142785489559,-0.6611663872459931923231124528683722019196} +#define T_1296_151 {0.7437932833766611739889640375622548162937,-0.6684097183642424555571892597072292119265} +#define T_1296_155 {0.7306920736508674130149643133336212486029,-0.6827071799122924611324947363755200058222} +#define T_1296_157 {0.7240381130254825992054179550905246287584,-0.6897599661378576163528464348928537219763} +#define T_1296_161 {0.7105266081175209968989747721934691071510,-0.7036703341459058513862601103028282523155} +#define T_1296_163 {0.7036703341459058513862601103028282523155,-0.7105266081175209968989747721934691071510} +#define T_1296_167 {0.6897599661378576163528464348928537219763,-0.7240381130254825992054179550905246287584} +#define T_1296_169 {0.6827071799122924611324947363755200058222,-0.7306920736508674130149643133336212486029} +#define T_1296_173 {0.6684097183642424555571892597072292119265,-0.7437932833766611739889640375622548162937} +#define T_1296_175 {0.6611663872459931923231124528683722019196,-0.7502393007408245662048784652142785489559} +#define T_1296_179 {0.6464939292378065749389293159765657037497,-0.7629191303530551415690297289984300732613} +#define T_1296_181 {0.6390661818081415646020104759372770786285,-0.7691517504817649841797333465365227311850} +#define T_1296_185 {0.6240311417041268793326480590621940791607,-0.7813994715786822808922806871123611927032} +#define T_1296_187 {0.6164252625789253814403423348267097026110,-0.7874134210530723265719643677584826946259} +#define T_1296_191 {0.6010403615240428321087051699578296393156,-0.7992186708398696382005255145486444234848} +#define T_1296_193 {0.5932627860363820504474574590858537703753,-0.8050088612582783076376813369279261678457} +#define T_1296_197 
{0.5775410411928850740181928813399281352758,-0.8163616513150518949615275232645217329264} +#define T_1296_199 {0.5695983499481065415182001743232831358910,-0.8219231835983180634030986766447313129902} +#define T_1296_203 {0.5535530634817222850330153960385359823704,-0.8328139083312672275027921386936213821173} +#define T_1296_205 {0.5454519767895824600145715521648526191711,-0.8381420768678403687701461421966087073088} +#define T_1296_209 {0.5290967246145524827127815115090925246477,-0.8485615216365590685043684970878530293703} +#define T_1296_211 {0.5208440968031696760576210181170608848333,-0.8536518182639163399016979383304715156555} +#define T_1296_215 {0.5041927170956703729842729444499127566814,-0.8635911671778986331560190592426806688309} +// Pre-computed twiddles for N=1331 +#define T_1331_1 {0.9999888577519229215795348864048719406128,-0.0047206325851934256149222690623901144136} +#define T_1331_2 {0.9999554312559911783964139431191142648458,-0.0094411599734681612983155929441636544652} +#define T_1331_3 {0.9998997212570973625034298493119422346354,-0.0141614769702497790559991841519149602391} +#define T_1331_4 {0.9998217289967107346271291135053616017103,-0.0188814783856523157878903873552189907059} +#define T_1331_5 {0.9997214562128495796144989071763120591640,-0.0236010590368223860391339030684321187437} +#define T_1331_6 {0.9995989051400421265825002592464443296194,-0.0283201137502831178094364616981692961417} +#define T_1331_7 {0.9994540785092775880826820866786874830723,-0.0330385373642778992775426161188079277053} +#define T_1331_8 {0.9992869795479443206787095732579473406076,-0.0377562247311138479699543779588566394523} +#define T_1331_9 {0.9990976119797584376058807720255572348833,-0.0424730707195049780877660339228896191344} +#define T_1331_10 {0.9988859800246808751111871060857083648443,-0.0471889702169150174193568147984478855506} +#define T_1331_12 {0.9983959423143420330859498790232464671135,-0.0566175093964489117848160049106809310615} +#define T_1331_13 
{0.9981175474793243518334406871872488409281,-0.0613299389683276963136648873842204920948} +#define T_1331_14 {0.9978169100976583827389276848407462239265,-0.0660410018334174064857933217354002408683} +#define T_1331_15 {0.9974940368688970382393677027721423655748,-0.0707505930080557510208905114268418401480} +#define T_1331_16 {0.9971489349881073049886026637977920472622,-0.0754586075413763018859825137951702345163} +#define T_1331_17 {0.9967816121457107048087209477671422064304,-0.0801649405176473039968954026335268281400} +#define T_1331_18 {0.9963920765273118762550552673928905278444,-0.0848694870586096522524499619066773448139} +#define T_1331_19 {0.9959803368135157208840269049687776714563,-0.0895721423258140497791757184131711255759} +#define T_1331_20 {0.9955464021797342244468609351315535604954,-0.0942728015229572641198174665078113321215} +#define T_1331_21 {0.9950902822959821758530551960575394332409,-0.0989713598982175091212099005133495666087} +#define T_1331_23 {0.9941115279303341933925253215420525521040,-0.1083617554122139614580788702369318343699} +#define T_1331_24 {0.9935889152594860052047920362383592873812,-0.1130533832907174573989550481201149523258} +#define T_1331_25 {0.9930441609602771446674296385026536881924,-0.1177424918315358032261386256323021370918} +#define T_1331_26 {0.9924772771722824016293884596962016075850,-0.1224289765402477581091034153359942138195} +#define T_1331_27 {0.9918882765282215352442563016666099429131,-0.1271127329809028649876267991203349083662} +#define T_1331_28 {0.9912771721536771662997011844709049910307,-0.1317936567783487000760089813411468639970} +#define T_1331_29 {0.9906439776668022334504826176271308213472,-0.1364716436205569427553285777321434579790} +#define T_1331_30 {0.9899887071780169023327289323788136243820,-0.1411465892609477801311612665813299827278} +#define T_1331_31 {0.9893113752896939283587585123314056545496,-0.1458183895207130487126079287918400950730} +#define T_1331_32 
{0.9886119970958331393262596975546330213547,-0.1504869402911378772902395439814426936209} +#define T_1331_34 {0.9871471646236041896926849403826054185629,-0.1598138772934909745959686233618413098156} +#define T_1331_35 {0.9863817429882896226089883384702261537313,-0.1644720556793274890416967082273913547397} +#define T_1331_36 {0.9855943403328170093757876202289480715990,-0.1691265688882717943553046779925352893770} +#define T_1331_37 {0.9847849742040577503132681158604100346565,-0.1737773131968421147508507829115842469037} +#define T_1331_38 {0.9839536626383281747720843668503221124411,-0.1784241849655448797218326717484160326421} +#define T_1331_39 {0.9831004241609878624430507443321403115988,-0.1830670806411839601768321017516427673399} +#define T_1331_40 {0.9822252777860261963027710407914128154516,-0.1877058967591685445519544828130165114999} +#define T_1331_41 {0.9813282430156392566189538229082245379686,-0.1923405299458187112104923244260135106742} +#define T_1331_42 {0.9804093398397950576139692202559672296047,-0.1969708769206690301967199729915591888130} +#define T_1331_43 {0.9794685887357876818981594624347053468227,-0.2015968344987701388326684082130668684840} +#define T_1331_45 {0.9775216270863439360638835751160513609648,-0.2108351692163020485448754470780841074884} +#define T_1331_46 {0.9765154599279670666334141060360707342625,-0.2154473404840983596919556930515682324767} +#define T_1331_47 {0.9754875316145789065913618287595454603434,-0.2200547106164642552350585447129560634494} +#define T_1331_48 {0.9744378650530443053412454901263117790222,-0.2246571769404777374479209584023919887841} +#define T_1331_49 {0.9733664836346533899558153279940597712994,-0.2292546368924956723400754299291293136775} +#define T_1331_50 {0.9722734112346014256900161853991448879242,-0.2338469880204395168199482668569544330239} +#define T_1331_51 {0.9711586722114561309737723604484926909208,-0.2384341279860781315225182197536923922598} +#define T_1331_52 
{0.9700222914066148893752483672869857400656,-0.2430159545673086785022576350456802174449} +#define T_1331_53 {0.9688642941437515254676782205933704972267,-0.2475923656604342715237265792893595062196} +#define T_1331_54 {0.9676847062282514233544361559324897825718,-0.2521632592824394891728445600165287032723} +#define T_1331_56 {0.9652608640659816607865195692284032702446,-0.2612880867980675869688411694369278848171} +#define T_1331_57 {0.9640166638333133031224519982060883194208,-0.2658418173495067371447930781869217753410} +#define T_1331_58 {0.9627509809750074420975352040841244161129,-0.2703896237499895938150018537271535024047} +#define T_1331_59 {0.9614638436961685519221987306082155555487,-0.2749314046539418354697659196972381323576} +#define T_1331_60 {0.9601552806800025541278387208876665681601,-0.2794670588500645092011609449400566518307} +#define T_1331_61 {0.9588253210871769960377264396811369806528,-0.2839964852635890046883559989510104060173} +#define T_1331_62 {0.9574739945551712372306951692735310643911,-0.2885195829585298632480316882720217108727} +#define T_1331_63 {0.9561013311976160888860931663657538592815,-0.2930362511399338676376657986111240461469} +#define T_1331_64 {0.9547073616036230170323051424929872155190,-0.2975463891561264118124086053285282105207} +#define T_1331_65 {0.9532921168371020215204225678462535142899,-0.3020498965009540959236744583904510363936} +#define T_1331_67 {0.9503979284119460357871389533102046698332,-0.3110366178929442337874888835358433425426} +#define T_1331_68 {0.9489190492488420725081255113764200359583,-0.3155196316755484176042045874055474996567} +#define T_1331_69 {0.9474190239028343896166006743442267179489,-0.3199956142621339649778633429377805441618} +#define T_1331_70 {0.9458978858012323520654263120377436280251,-0.3244644659076841297640214634157018736005} +#define T_1331_71 {0.9443556688418320854694343324808869510889,-0.3289260870260916624907565619650995358825} +#define T_1331_72 
{0.9427924073921613024040766504185739904642,-0.3333803781923779241402883144473889842629} +#define T_1331_73 {0.9412081362887139146522486043977551162243,-0.3378272401449084472169204218516824766994} +#define T_1331_74 {0.9396028908361733211762611972517333924770,-0.3422665737876052771682111597328912466764} +#define T_1331_75 {0.9379767068066253710156843226286582648754,-0.3466982801921548174028941957658389583230} +#define T_1331_76 {0.9363296204387621113340856027207337319851,-0.3511222606002128432400866131501970812678} +#define T_1331_78 {0.9329728879710285083248777482367586344481,-0.3599466492565234210232461009582038968801} +#define T_1331_79 {0.9312633166742499968293600431934464722872,-0.3643468608577209444376876490423455834389} +#define T_1331_80 {0.9295329926436727285832262168696615844965,-0.3687389531726989466697830266639357432723} +#define T_1331_81 {0.9277819544386957106851809840009082108736,-0.3731228283258931077703834944259142503142} +#define T_1331_82 {0.9260102410803232597658052327460609376431,-0.3774983886248544640018565132777439430356} +#define T_1331_83 {0.9242178920502946981585523644753266125917,-0.3818655365624263331447707514598732814193} +#define T_1331_84 {0.9224049472902052793088500948215369135141,-0.3862241748189172430016924408846534788609} +#define T_1331_85 {0.9205714472006154558414436905877664685249,-0.3905742062642698630980930829537101089954} +#define T_1331_86 {0.9187174326401509327766348178556654602289,-0.3949155339602250514019488036865368485451} +#define T_1331_87 {0.9168429449245920626054839885910041630268,-0.3992480611624825148631146021216409280896} +#define T_1331_89 {0.9130327175715514664844363323936704546213,-0.4078863280914273303778827539645135402679} +#define T_1331_90 {0.9110970628430665740893346082884818315506,-0.4121918753186884498695974343718262389302} +#define T_1331_91 {0.9091411047755887375743100164982024580240,-0.4164882370576893366731496826105285435915} +#define T_1331_92 
{0.9071648869566579787004911850090138614178,-0.4207753175661732569778905599378049373627} +#define T_1331_93 {0.9051684534252927294417645498469937592745,-0.4250530213087111430780851151212118566036} +#define T_1331_94 {0.9031518486710083948310057166963815689087,-0.4293212529588303905114798908471129834652} +#define T_1331_95 {0.9011151176328257017544842710776720196009,-0.4335799174011392143057719295029528439045} +#define T_1331_96 {0.8990583056982697218728617372107692062855,-0.4378289197334462312660718907864065840840} +#define T_1331_97 {0.8969814587023582364011531353753525763750,-0.4420681652688753238145125123992329463363} +#define T_1331_98 {0.8948846229265798868368619878310710191727,-0.4462975595379756188485487200523493811488} +#define T_1331_100 {0.8906311723875368624447901311214081943035,-0.4547264174991393903546565979922888800502} +#define T_1331_101 {0.8884746524102742348816263984190300107002,-0.4589256933583500397411114590795477852225} +#define T_1331_102 {0.8862983332230369715176721001625992357731,-0.4631147422897120558005212842545006424189} +#define T_1331_103 {0.8841022633240016581623876845696941018105,-0.4672934709423806620698371716571273282170} +#define T_1331_104 {0.8818864916514795870838838709460105746984,-0.4714617861954931710144478529400657862425} +#define T_1331_105 {0.8796510675828258518649249708687420934439,-0.4756195951602444349504139609052799642086} +#define T_1331_106 {0.8773960409333396714970376706332899630070,-0.4797668051819565238069742463267175480723} +#define T_1331_107 {0.8751214619551536122443735621345695108175,-0.4839033238421436844411971378576708957553} +#define T_1331_108 {0.8728273813361143718125845225586090236902,-0.4880290589605717488375091761554358527064} +#define T_1331_109 {0.8705138501986525723097543050243984907866,-0.4921439185973122687478564785124035552144} +#define T_1331_111 {0.8658286430242602138562801883381325751543,-0.5003406448798339845041027729166671633720} +#define T_1331_112 
{0.8634570713948109821345155978633556514978,-0.5044223288656999315549001039471477270126} +#define T_1331_113 {0.8610662580595750403844590437074657529593,-0.5084927720541181361824101259117014706135} +#define T_1331_114 {0.8586562562966230105487852597434539347887,-0.5125518837373130853762859260314144194126} +#define T_1331_115 {0.8562271198116297998836898841545917093754,-0.5165995734600261624080985711771063506603} +#define T_1331_116 {0.8537789027366781136052509282308164983988,-0.5206357510215312567325440795684698969126} +#define T_1331_117 {0.8513116596290519755285686187562532722950,-0.5246603264776450448181321917218156158924} +#define T_1331_118 {0.8488254454700209228334983890817966312170,-0.5286732101427312757735421655524987727404} +#define T_1331_119 {0.8463203156636148749569770188827533274889,-0.5326743125916992838142505206633359193802} +#define T_1331_120 {0.8437963260353892325227320725389290601015,-0.5366635446619969496140356568503193557262} +#define T_1331_126 {0.8282595110357071632734005106613039970398,-0.5603446996080995168298954922647681087255} +#define T_1331_138 {0.7952056137059991813842430019576568156481,-0.6063398650348377216801054601091891527176} +#define T_1331_144 {0.7777150468726141685493757904623635113239,-0.6286169786666023773236133820319082587957} +#define T_1331_150 {0.7596006050236041584611257349024526774883,-0.6503898222203161605392551791737787425518} +#define T_1331_156 {0.7408768193780026489747569939936511218548,-0.6716409297447070159137183509301394224167} +#define T_1331_162 {0.7215587099641721513521019915060605853796,-0.6923532538197821883940719089878257364035} +#define T_1331_168 {0.7016617735708928771387604683695826679468,-0.7125101792320928106860833395330701023340} +#define T_1331_174 {0.6812019713159972456040236465923953801394,-0.7320955363032882745599749796383548527956} +#define T_1331_180 {0.6601957158425255656197805365081876516342,-0.7510936138612651635781958248117007315159} +#define T_1331_186 
{0.6386598581526726770007940103823784738779,-0.7694891718435081795490759759559296071529} +#define T_1331_192 {0.6166116740900869919173032940307166427374,-0.7872674535225118175318925750616472214460} +#define T_1331_204 {0.5710494709478671770952473707438912242651,-0.8209156483647762625466270947072189301252} +#define T_1331_210 {0.5475720013992103618605256087903399020433,-0.8367585692920408435568901950318831950426} +#define T_1331_216 {0.5236552752201175664481525018345564603806,-0.8519302510969679609331706160446628928185} +#define T_1331_222 {0.4993184781624636081609480697807157412171,-0.8664185232123798874326325858419295400381} +#define T_1331_228 {0.4745811329546638090981502955401083454490,-0.8802117632953263548500899560167454183102} +#define T_1331_234 {0.4494630836407417784350570855167461559176,-0.8932989065504085557378743942535948008299} +#define T_1331_240 {0.4239844796616416000567539867915911599994,-0.9056694546058440264957312137994449585676} +#define T_1331_246 {0.3981657596915530450409903551189927384257,-0.9173134839351531066853340234956704080105} +#define T_1331_252 {0.3720276352422174914735819584166165441275,-0.9282216538177093800854322580562438815832} +#define T_1331_258 {0.3455910740483661425237471576110692694783,-0.9383852138317699820291295509377960115671} +#define T_1331_270 {0.2919076923683351365568228175106924027205,-0.9564464956996776567876850094762630760670} +#define T_1331_276 {0.2647039361390695066944545033038593828678,-0.9643297289788816950562022611848078668118} +#define T_1331_282 {0.2372878371334118663416035133195691742003,-0.9714393868628898864869825047207996249199} +#define T_1331_288 {0.2096813882641448412158524661208502948284,-0.9777697660572354232400016371684614568949} +#define T_1331_294 {0.1819067351407546440089646466731210239232,-0.9833157883968055568146837686072103679180} +#define T_1331_300 {0.1539861583044511150752953199116745963693,-0.9880730049194930142419934782083146274090} +#define T_1331_306 
{0.1259420553549464583120709448849083855748,-0.9920375994351079418365202400309499353170} +#define T_1331_312 {0.0977969229833310083055408767904737032950,-0.9952063915866861121273245771590154618025} +#define T_1331_318 {0.0695733389254589440486853391121258027852,-0.9975768394017391349493095731304492801428} +#define T_1331_324 {0.0412939438503206746045037789372145198286,-0.9991470413313980891700794018106535077095} +#define T_1331_336 {-0.0153415110186981250295357170898569165729,-0.9998823120946100928563282650429755449295} +#define T_1331_342 {-0.0436521384328999798030146450855681905523,-0.9990467911015153967824176106660161167383} +#define T_1331_348 {-0.0719277485503986291348610393470153212547,-0.9974098450428843243287246878026053309441} +#define T_1331_354 {-0.1001456589674334651807185991856385953724,-0.9949727870600172385806558850163128226995} +#define T_1331_360 {-0.1282832335663504630396403172198915854096,-0.9917375721357748474815707595553249120712} +#define T_1331_366 {-0.1563179006740629817251431177282938733697,-0.9877067955263109100272345131088513880968} +#define T_1331_372 {-0.1842271711688159563369282523126457817852,-0.9828836906791850180908909351273905485868} +#define T_1331_378 {-0.2119886565207281803324690372392069548368,-0.9772721266395234529511526488931849598885} +#define T_1331_384 {-0.2395800867516407473001294192727073095739,-0.9708766049463115610507202291046269237995} +#define T_1331_390 {-0.2669793282998646488657357167539885267615,-0.9637022560213051036726028542034327983856} +#define T_1331_402 {-0.3211134995920587043016780626203399151564,-0.9470407173821729918117284796608146280050} +#define T_1331_408 {-0.3478050034603221152273988536762772127986,-0.9375668933830616502689281333005055785179} +#define T_1331_414 {-0.3742175017302744599589914287207648158073,-0.9273409628603451748674046939413528889418} +#define T_1331_420 {-0.4003298065673261962871265495778061449528,-0.9163711289503654322530223907961044460535} +#define T_1331_426 
{-0.4261209709489738317600426853459794074297,-0.9046661915411141619003387859265785664320} +#define T_1331_432 {-0.4515703054682857486845648509188322350383,-0.8922355402130532908699933614116162061691} +#define T_1331_438 {-0.4766573949307317814572115821647457778454,-0.8790891467068902009884823200991377234459} +#define T_1331_444 {-0.5013621147310426406917827080178540199995,-0.8652375569243494490834223142883274704218} +#define T_1331_450 {-0.5256646469969610269146187420119531452656,-0.8506918824683590285573586697864811867476} +#define T_1331_456 {-0.5495454964869354030909676112059969455004,-0.8354637917294367444043245995999313890934} +#define T_1331_468 {-0.5959658728883110434537684341194108128548,-0.8030097623020989727748997211165260523558} +#define T_1331_474 {-0.6184681618509685341322779095207806676626,-0.7858098579024598295816872450814116746187} +#define T_1331_480 {-0.6404743220120940883077764738118276000023,-0.7679795849130029994356050337955821305513} +#define T_1331_486 {-0.6619667002562314994662528988556005060673,-0.7495332465954239742345066588313784450293} +#define T_1331_492 {-0.6829280556184968986599415075033903121948,-0.7304856404126909286134150534053333103657} +#define T_1331_498 {-0.7033415731151031469181589272920973598957,-0.7108520461586729233260939508909359574318} +#define T_1331_504 {-0.7231908772321671285965294373454526066780,-0.6906482137008453925375306425848975777626} +#define T_1331_510 {-0.7424600450619787128658799701952375471592,-0.6698903503459089359139966290968004614115} +#define T_1331_516 {-0.7611336190761933684356677076721098273993,-0.6485951078384541990118350440752692520618} +#define T_1331_522 {-0.7791966195257030713960944012796971946955,-0.6267795690031039423928405085462145507336} +#define T_1331_534 {-0.8134334413370495342476829137012828141451,-0.5816580064905534364072536845924332737923} +#define T_1331_540 {-0.8295797982723868146948120738670695573092,-0.5583881788669472490127532182668801397085} +#define T_1331_546 
{-0.8450606748217003305612138319702353328466,-0.5346704179865315875730402694898657500744} +#define T_1331_552 {-0.8598636523849668700236748009047005325556,-0.5105237499932642197819632201571948826313} +#define T_1331_558 {-0.8739768561657610401738338623545132577419,-0.4859675450959793363558958390058251097798} +#define T_1331_564 {-0.8873889646970948286153202388959471136332,-0.4610215020298057964076576809020480141044} +#define T_1331_570 {-0.9000892189233820062099766801111400127411,-0.4357056322540439063928374707757029682398} +#define T_1331_576 {-0.9120674308312418654409725604637060314417,-0.4100402438991784803690165972511749714613} +#define T_1331_582 {-0.9233139916222190546335468752658925950527,-0.3840459254759054363859149816562421619892} +#define T_1331_588 {-0.9338198794208626418722474227251950651407,-0.3577435293592400866735658837569644674659} +#define T_1331_600 {-0.9525765261012730800160852595581673085690,-0.3042991323037755324598663264623610302806} +#define T_1331_606 {-0.9608122385938927534354547788097988814116,-0.2772000039108812319277319602406350895762} +#define T_1331_611 {-0.9670868239715223468877525192510802298784,-0.2544466052056378324586205508239800110459} +#define T_1331_612 {-0.9682771973859432002029734576353803277016,-0.2498785085244891179812043446872849017382} +#define T_1331_617 {-0.9739048872200914974683882974204607307911,-0.2269565391188826108948006776699912734330} +#define T_1331_618 {-0.9749654141642219995489426764834206551313,-0.2223565631673307674276429679594002664089} +#define T_1331_623 {-0.9799416939799570069880019218544475734234,-0.1992844108295785676698841371035086922348} +#define T_1331_624 {-0.9808715237099848716084693478478584438562,-0.1946562456610438085480296876994543708861} +#define T_1331_629 {-0.9851924015867951078817554844135884195566,-0.1714524186345675882403583045743289403617} +#define T_1331_630 {-0.9859907882028695613740865155705250799656,-0.1667997769155701581755835150033817626536} +#define T_1331_635 
{-0.9896527979769412786126281389442738145590,-0.1434828890718732874454133252584142610431} +#define T_1331_636 {-0.9903191010215264267202428527525626122952,-0.1388095031037706594823077921319054439664} +#define T_1331_641 {-0.9933193050662645351422952444409020245075,-0.1153982590105817329639847912403638474643} +#define T_1331_642 {-0.9938529900379077242078551535087171941996,-0.1107078777355546828076171550492290407419} +#define T_1331_647 {-0.9961889816204698711032961000455543398857,-0.0872210576522157166046156362426700070500} +#define T_1331_648 {-0.9965896204025731508480134834826458245516,-0.0825174436459055610226798194162256550044} +#define T_1331_653 {-0.9982595256145260220748127721890341490507,-0.0589738884580409197799788501015427755192} +#define T_1331_654 {-0.9985267968187769849208734740386717021465,-0.0542608149112499896227213014299195492640} +#define T_1331_659 {-0.9995292760793262898388888970657717436552,-0.0306794110168016366524401661308729671873} +#define T_1331_665 {-0.9999972144341010560353311120707076042891,-0.0023603228674316028765090003105342475465} +#define T_1331_666 {-0.9999972144341010560353311120707076042891,0.0023603228674316028765090003105342475465} +#define T_1331_672 {-0.9995292760793262898388888970657717436552,0.0306794110168016366524401661308729671873} +#define T_1331_678 {-0.9982595256145260220748127721890341490507,0.0589738884580409197799788501015427755192} +#define T_1331_684 {-0.9961889816204698711032961000455543398857,0.0872210576522157166046156362426700070500} +#define T_1331_690 {-0.9933193050662645351422952444409020245075,0.1153982590105817329639847912403638474643} +#define T_1331_696 {-0.9896527979769412786126281389442738145590,0.1434828890718732874454133252584142610431} +#define T_1331_702 {-0.9851924015867951078817554844135884195566,0.1714524186345675882403583045743289403617} +#define T_1331_708 {-0.9799416939799570069880019218544475734234,0.1992844108295785676698841371035086922348} +#define T_1331_714 
{-0.9739048872200914974683882974204607307911,0.2269565391188826108948006776699912734330} +#define T_1331_720 {-0.9670868239715223468877525192510802298784,0.2544466052056378324586205508239800110459} +// Pre-computed twiddles for N=1728 +#define T_1728_1 {0.9999933893861943090541899437084794044495,-0.0036360945960279551607274850510975738871} +#define T_1728_5 {0.9998347390248375488042142933409195393324,-0.0181795115208017304953003190348681528121} +#define T_1728_7 {0.9996760970536633372773849259829148650169,-0.0254499701287601541588312414887695922516} +#define T_1728_11 {0.9992002214785726810220012339414097368717,-0.0399864651747465493780708811755175702274} +#define T_1728_13 {0.9988830130412104102433090702106710523367,-0.0472517328540768138944727638772747013718} +#define T_1728_17 {0.9980901387433863680698209464026149362326,-0.0617743874369278758718060373666958184913} +#define T_1728_19 {0.9976145148138723106612246738222893327475,-0.0690310063137001966859784829466661904007} +#define T_1728_23 {0.9965050191585387295845066546462476253510,-0.0835329084363788249012472419963160064071} +#define T_1728_25 {0.9958712061081035349374701581837143748999,-0.0907774247530250127935502746368001680821} +#define T_1728_29 {0.9944456171551021572696527073276229202747,-0.1052516722955411793138935649949416983873} +#define T_1728_31 {0.9936539166444308124326312281482387334108,-0.1124806380546562040168012686081056017429} +#define T_1728_35 {0.9919129128968936504051612246257718652487,-0.1269203420591022979380113611114211380482} +#define T_1728_37 {0.9909637017325507724052613411913625895977,-0.1341303166645042843629909157243673689663} +#define T_1728_41 {0.9889081118139692039292754088819492608309,-0.1485286046138259152815663810542901046574} +#define T_1728_43 {0.9878018417690599406455476128030568361282,-0.1557161565080936171945324986154446378350} +#define T_1728_47 {0.9854326440289048472109811882546637207270,-0.1700661755970356014966426982937264256179} +#define T_1728_49 
{0.9841698416280552086377042542153503745794,-0.1772278838947438373008225198645959608257} +#define T_1728_53 {0.9814881636761355609976931191340554505587,-0.1915228042914142447816772119040251709521} +#define T_1728_55 {0.9800694299448950985720330209005624055862,-0.1986552604072901184739663449363433755934} +#define T_1728_59 {0.9770765481146769237952298681193497031927,-0.2128882785037904123992547056332114152610} +#define T_1728_61 {0.9755025582934635508536302950233221054077,-0.2199880877750153285177248108084313571453} +#define T_1728_65 {0.9721998970346039659062853388604708015919,-0.2341524294255892535421281763774459250271} +#define T_1728_67 {0.9704714002573253672778719192137941718102,-0.2412162127274744549687568451190600171685} +#define T_1728_71 {0.9668605314577121134789194911718368530273,-0.2553051364726344751865383386757457628846} +#define T_1728_73 {0.9649783503952192864616677070443984121084,-0.2623295318269015519163644967193249613047} +#define T_1728_77 {0.9610609926328369523318428946367930620909,-0.2763363321019985385795791898999596014619} +#define T_1728_79 {0.9590260231013771896613206990878097712994,-0.2833179962768987425469902063923655077815} +#define T_1728_83 {0.9548040408263572809133279406523797661066,-0.2972360066036075498807633721298770979047} +#define T_1728_85 {0.9526172513612137793259648788080085068941,-0.3041716167051193298043187951407162472606} +#define T_1728_89 {0.9480926540084575471212247066432610154152,-0.3179942128643213350436269593046745285392} +#define T_1728_91 {0.9457550854029795894817311818769667297602,-0.3248804679176677256968730489461449906230} +#define T_1728_95 {0.9409300264357753906807602106709964573383,-0.3386010711022204300313376279518706724048} +#define T_1728_97 {0.9384427912460171494757332766312174499035,-0.3454346936229544251517609154689125716686} +#define T_1728_101 {0.9333195671311090846344882265839260071516,-0.3590467735688473438493417688732733950019} +#define T_1728_103 
{0.9306838491463124141134244382556062191725,-0.3658245111227567680778349767933832481503} +#define T_1728_107 {0.9252648982609064098880935489432886242867,-0.3793215892171634129326207585108932107687} +#define T_1728_109 {0.9224819519400822009913554211379960179329,-0.3860402159682536638030114772845990955830} +#define T_1728_113 {0.9167698534113102315501464545377530157566,-0.3994158683330004677713986893650144338608} +#define T_1728_115 {0.9138410032861832288375580901629291474819,-0.4060721865788176065947823190072085708380} +#define T_1728_119 {0.9078384757635787893903511758253443986177,-0.4193200471278017982257324547390453517437} +#define T_1728_121 {0.9047651158081826405776837418670766055584,-0.4259108888213658516974646772723644971848} +#define T_1728_125 {0.8984750161697507842006871214834973216057,-0.4390246522904678316834292672865558415651} +#define T_1728_127 {0.8952586091369715282084484897495713084936,-0.4455468805480913840888490540237398818135} +#define T_1728_131 {0.8886839311294693066756167354469653218985,-0.4585203054961389246280134557309793308377} +#define T_1728_133 {0.8853260078548547129528856203251052647829,-0.4649708160904140741287449145602295175195} +#define T_1728_137 {0.8784698806689305028427838806237559765577,-0.4777977278697702057108642748062266036868} +#define T_1728_139 {0.8749720393420946651374947578005958348513,-0.4841734507070126203309712309419410303235} +#define T_1728_143 {0.8678377261229622829930008265364449471235,-0.4968477444023736144806946413154946640134} +#define T_1728_145 {0.8642016315269342996430168568622320890427,-0.5031456449838206390623440711351577192545} +#define T_1728_149 {0.8567925278212928930088310153223574161530,-0.5156612883178249839843942936568055301905} +#define T_1728_157 {0.8414321982753925066589317793841473758221,-0.5403627075450715411264468457375187426805} +#define T_1728_161 {0.8334842217000436592755363562901038676500,-0.5525432581952950394921231236367020756006} +#define T_1728_163 
{0.8294440098560504903701939838356338441372,-0.5585898625234044789777954065357334911823} +#define T_1728_173 {0.8085893309906420078903011017246171832085,-0.5883734305762845506748703883204143494368} +#define T_1728_175 {0.8042892153566539858999817624862771481276,-0.5942380483114303446612325387832242995501} +#define T_1728_185 {0.7821552445324865265874336728302296251059,-0.6230836006912123803402892008307389914989} +#define T_1728_187 {0.7776034106230155229511069592263083904982,-0.6287550682002125412850546126719564199448} +#define T_1728_191 {0.7683766156489815202235149627085775136948,-0.6399979504059504220592202727857511490583} +#define T_1728_199 {0.7494373936011673453805315148201771080494,-0.6620752170805738190395572928537148982286} +#define T_1728_203 {0.7397289728574233391711345575458835810423,-0.6729049314095575384797598417208064347506} +#define T_1728_205 {0.7348159530513419523956031298439484089613,-0.6782665516898559454617156916356179863214} +#define T_1728_215 {0.7096732139225847557639781371108256280422,-0.7045309996308105615625549944525118917227} +#define T_1728_217 {0.7045309996308105615625549944525118917227,-0.7096732139225847557639781371108256280422} +#define T_1728_245 {0.6287550682002125412850546126719564199448,-0.7776034106230155229511069592263083904982} +#define T_1728_259 {0.5883734305762845506748703883204143494368,-0.8085893309906420078903011017246171832085} +#define T_1728_287 {0.5031456449838206390623440711351577192545,-0.8642016315269342996430168568622320890427} +#define T_1728_301 {0.4585203054961389246280134557309793308377,-0.8886839311294693066756167354469653218985} +#define T_1728_329 {0.3658245111227567680778349767933832481503,-0.9306838491463124141134244382556062191725} +#define T_1728_343 {0.3179942128643213350436269593046745285392,-0.9480926540084575471212247066432610154152} +#define T_1728_371 {0.2199880877750153285177248108084313571453,-0.9755025582934635508536302950233221054077} +#define T_1728_385 
{0.1700661755970356014966426982937264256179,-0.9854326440289048472109811882546637207270} +#define T_1728_413 {0.0690310063137001966859784829466661904007,-0.9976145148138723106612246738222893327475} +#define T_1728_427 {0.0181795115208017304953003190348681528121,-0.9998347390248375488042142933409195393324} +#define T_1728_455 {-0.0835329084363788249012472419963160064071,-0.9965050191585387295845066546462476253510} +#define T_1728_469 {-0.1341303166645042843629909157243673689663,-0.9909637017325507724052613411913625895977} +#define T_1728_497 {-0.2341524294255892535421281763774459250271,-0.9721998970346039659062853388604708015919} +#define T_1728_511 {-0.2833179962768987425469902063923655077815,-0.9590260231013771896613206990878097712994} +#define T_1728_539 {-0.3793215892171634129326207585108932107687,-0.9252648982609064098880935489432886242867} +#define T_1728_553 {-0.4259108888213658516974646772723644971848,-0.9047651158081826405776837418670766055584} +#define T_1728_581 {-0.5156612883178249839843942936568055301905,-0.8567925278212928930088310153223574161530} +#define T_1728_595 {-0.5585898625234044789777954065357334911823,-0.8294440098560504903701939838356338441372} +#define T_1728_623 {-0.6399979504059504220592202727857511490583,-0.7683766156489815202235149627085775136948} +#define T_1728_637 {-0.6782665516898559454617156916356179863214,-0.7348159530513419523956031298439484089613} +#define T_1728_665 {-0.7494373936011673453805315148201771080494,-0.6620752170805738190395572928537148982286} +#define T_1728_679 {-0.7821552445324865265874336728302296251059,-0.6230836006912123803402892008307389914989} +#define T_1728_707 {-0.8414321982753925066589317793841473758221,-0.5403627075450715411264468457375187426805} +#define T_1728_721 {-0.8678377261229622829930008265364449471235,-0.4968477444023736144806946413154946640134} +#define T_1728_749 {-0.9138410032861832288375580901629291474819,-0.4060721865788176065947823190072085708380} +#define T_1728_763 
{-0.9333195671311090846344882265839260071516,-0.3590467735688473438493417688732733950019} +#define T_1728_791 {-0.9649783503952192864616677070443984121084,-0.2623295318269015519163644967193249613047} +#define T_1728_805 {-0.9770765481146769237952298681193497031927,-0.2128882785037904123992547056332114152610} +#define T_1728_833 {-0.9936539166444308124326312281482387334108,-0.1124806380546562040168012686081056017429} +#define T_1728_847 {-0.9980901387433863680698209464026149362326,-0.0617743874369278758718060373666958184913} +#define T_1728_875 {-0.9992002214785726810220012339414097368717,0.0399864651747465493780708811755175702274} +#define T_1728_889 {-0.9958712061081035349374701581837143748999,0.0907774247530250127935502746368001680821} +#define T_1728_917 {-0.9814881636761355609976931191340554505587,0.1915228042914142447816772119040251709521} +#define T_1728_931 {-0.9704714002573253672778719192137941718102,0.2412162127274744549687568451190600171685} +#define T_1728_959 {-0.9409300264357753906807602106709964573383,0.3386010711022204300313376279518706724048} +#define T_1728_973 {-0.9224819519400822009913554211379960179329,0.3860402159682536638030114772845990955830} +#define T_1728_1001 {-0.8784698806689305028427838806237559765577,0.4777977278697702057108642748062266036868} +// Pre-computed twiddles for N=2000 +#define T_2000_1 {0.9999950652018582131219659459020476788282,-0.0031415874858795635182762140402701334096} +#define T_2000_3 {0.9999555871089498282344720792025327682495,-0.0094246384331440075793340227505723305512} +#define T_2000_7 {0.9997582044369840437170182667614426463842,-0.0219893760925051091892346022405035910197} +#define T_2000_9 {0.9996003076502565365402119823556859046221,-0.0282705667702732518775654568798927357420} +#define T_2000_11 {0.9994029483549728976043979855603538453579,-0.0345506413744722726266722645505069522187} +#define T_2000_13 {0.9991661343425400909623590450792107731104,-0.0408293519785099950913931365903408732265} +#define T_2000_17 
{0.9985741811195097961117994600499514490366,-0.0533816897587604741182154555190209066495} +#define T_2000_19 {0.9982190652782118034380687277007382363081,-0.0596548213901706980699657378863776102662} +#define T_2000_21 {0.9978245414574414828834392210410442203283,-0.0659255979513778539446988702366070356220} +#define T_2000_23 {0.9973906252323236909518300308263860642910,-0.0721937718828605939913600764157308731228} +#define T_2000_27 {0.9964046856445923916467677372565958648920,-0.0847213221420734380329164991962898056954} +#define T_2000_29 {0.9958527012051856530661098076961934566498,-0.0909802039035699089408737449957698117942} +#define T_2000_31 {0.9952614022063083210056788630026858299971,-0.0972354939223993297581216666003456339240} +#define T_2000_33 {0.9946308119914323286891999487124849110842,-0.1034869452504225267874815585855685640126} +#define T_2000_37 {0.9932518590423393689547992835287004709244,-0.1159773448089613662181918130045232828707} +#define T_2000_39 {0.9925035507468237261363697143679019063711,-0.1222157999398894440146179363182454835624} +#define T_2000_41 {0.9917160601105629025298071610450278967619,-0.1284494302003028332048728543668403290212} +#define T_2000_43 {0.9908894182223386692953681631479412317276,-0.1346779894971525948221824364736676216125} +#define T_2000_47 {0.9891188127719617861899337185604963451624,-0.1471189118386373728686322692738031037152} +#define T_2000_49 {0.9881749191102805474073988989403005689383,-0.1533307837369606296817892143735662102699} +#define T_2000_51 {0.9871920139948191508239005997893400490284,-0.1595366023984862990925392978169838897884} +#define T_2000_53 {0.9861701362289888583845254288462456315756,-0.1657361228281196974254640963408746756613} +#define T_2000_57 {0.9840096256511396966004667774541303515434,-0.1781152902642101354402370816387701779604} +#define T_2000_59 {0.9828710781323791723096405803516972810030,-0.1842944485623333183976768623324460349977} +#define T_2000_61 
{0.9816937285463989137923590533318929374218,-0.1904663312311898892836836694186786189675} +#define T_2000_63 {0.9804776233729444179232359601883217692375,-0.1966306946154200430143532685178797692060} +#define T_2000_67 {0.9779293398307218332377033220836892724037,-0.2089358904024117002951044241854106076062} +#define T_2000_69 {0.9765972620638245782487274482264183461666,-0.2150762370171133697205334556201705709100} +#define T_2000_71 {0.9752266299092233747813907029922120273113,-0.2212080927902471405577244922824320383370} +#define T_2000_73 {0.9738174974771288683328407387307379394770,-0.2273312156466464339921174087066901847720} +#define T_2000_77 {0.9708839558187309926040597929386422038078,-0.2395502960419218507759353542496683076024} +#define T_2000_79 {0.9693596624036292519477342466416303068399,-0.2456457711924263376257471236385754309595} +#define T_2000_81 {0.9677971003288654561202974946354515850544,-0.2517315486684971181929881822725292295218} +#define T_2000_83 {0.9661963312817146709576832108723465353251,-0.2578073882140598516166107856406597420573} +#define T_2000_87 {0.9628804265585876320798774941067676991224,-0.2699282944604963208057313295284984633327} +#define T_2000_89 {0.9611654217888518925150265204138122498989,-0.2759728826487457542526726683718152344227} +#define T_2000_91 {0.9594124718540428808921660674968734383583,-0.2820065759001294525276648528233636170626} +#define T_2000_93 {0.9576216459576222250760224596888292580843,-0.2880291360147692047988243757572490721941} +#define T_2000_97 {0.9539266505673935636622218225966207683086,-0.3000399062412762440921198958676541224122} +#define T_2000_99 {0.9520226269456766310383954987628385424614,-0.3060276421885007036571835214999737218022} +#define T_2000_101 {0.9500810191007716953848216689948458224535,-0.3120032966884148661890208131808321923018} +#define T_2000_103 {0.9481019036840320213244126534846145659685,-0.3179666338324109720048227245570160448551} +#define T_2000_107 
{0.9440314641410497698004178346309345215559,-0.3298554148588528911467676607571775093675} +#define T_2000_109 {0.9419403007087906454231074349081609398127,-0.3357803893925806471010275799926603212953} +#define T_2000_111 {0.9398119510863196524041995871812105178833,-0.3416921078914832987649674578278791159391} +#define T_2000_113 {0.9376464992972356471412354039784986525774,-0.3475903369710370283129918789200019091368} +#define T_2000_117 {0.9332046326338986341752956832351628690958,-0.3593453960058906604402295670297462493181} +#define T_2000_119 {0.9309283931169357595436508745478931814432,-0.3652017618915878172103361976041924208403} +#define T_2000_121 {0.9286154021410173431405610244837589561939,-0.3710437102370510165094685817166464403272} +#define T_2000_123 {0.9262657510190666076965726460912264883518,-0.3768710104121626436324277165113016963005} +#define T_2000_127 {0.9214568408214984795989721533260308206081,-0.3884807466313661139523105703119654208422} +#define T_2000_129 {0.9189977715934213309267875047225970774889,-0.3942627243429510097705303905968321487308} +#define T_2000_131 {0.9165024219068980126934320651344023644924,-0.4000291372372647424882075029017869383097} +#define T_2000_133 {0.9139708902740611984327756545098964124918,-0.4057797576662000027525323275767732411623} +#define T_2000_137 {0.9087996823560401393748975351627450436354,-0.4172327136617652865169247888843528926373} +#define T_2000_139 {0.9061602102212897857924644995364360511303,-0.4229345970853032876668464723479701206088} +#define T_2000_141 {0.9034849644330348317211587527708616107702,-0.4286197837751283268303836848645005375147} +#define T_2000_143 {0.9007740506053980578116124888765625655651,-0.4342880492898045785565841470088344067335} +#define T_2000_147 {0.8952456483248116825279794284142553806305,-0.4455729223768962743434940421138890087605} +#define T_2000_149 {0.8924283781237178914125252049416303634644,-0.4511890844418450230435269077133852988482} +#define T_2000_151 
{0.8895758763783380151934920831990893930197,-0.4567874343342994691852254618424922227859} +#define T_2000_153 {0.8866882557005565379526501601503696292639,-0.4623677510409917590372685936017660424113} +#define T_2000_157 {0.8808081149230035933328508690465241670609,-0.4734734044123121154967748225317336618900} +#define T_2000_159 {0.8778158269611217034977812545548658818007,-0.4789983026447610092724005426134681329131} +#define T_2000_161 {0.8747888843334528097273050661897286772728,-0.4845042908443979445820559703861363232136} +#define T_2000_163 {0.8717274065385088865909324340464081615210,-0.4899911516442365666534897172823548316956} +#define T_2000_167 {0.8655013302530190077987981567275710403919,-0.5009066253607098451183787801710423082113} +#define T_2000_169 {0.8623369775573039852645251812646165490150,-0.5063348073531325210794307167816441506147} +#define T_2000_171 {0.8591385812742724503721092332853004336357,-0.5117430001143449258549367186788003891706} +#define T_2000_173 {0.8559062676711329809009498603700194507837,-0.5171309901381572204925873847969342023134} +#define T_2000_177 {0.8493404002633165861269048946269322186708,-0.5278455119450663479696572721877600997686} +#define T_2000_179 {0.8460071056678422074526224605506286025047,-0.5331716207371886007138073182431980967522} +#define T_2000_181 {0.8426404121604322838834377762395888566971,-0.5384766808266602255983457325783092528582} +#define T_2000_183 {0.8392404526523816743477368618187028914690,-0.5437604827787925199089613670366816222668} +#define T_2000_187 {0.8323412738406634758803193108178675174713,-0.5542634787366941262476416341087315231562} +#define T_2000_189 {0.8288423269047618946459010658145416527987,-0.5594822581021670071521612044307403266430} +#define T_2000_191 {0.8253106586929995769352785828232299536467,-0.5646789500660770455553461033559869974852} +#define T_2000_193 {0.8217464086295902525591827725293114781380,-0.5698533494719237868508798783295787870884} +#define T_2000_197 
{0.8145207270705093760554404980211984366179,-0.5801344543918494078837966299033723771572} +#define T_2000_199 {0.8108595808323734166478402585198637098074,-0.5852407540255101237036683414771687239408} +// Pre-computed twiddles for N=2048 +#define T_2048_1 {0.9999952938095761911796444110223092138767,-0.0030679567629659761432425746363605867373} +#define T_2048_3 {0.9999576445519638978609577861789148300886,-0.0092037547820598194364682953505507612135} +#define T_2048_5 {0.9998823474542125611108644989144522696733,-0.0153392062849881018887776562564795312937} +#define T_2048_7 {0.9997694053512152789764400040439795702696,-0.0214740802754695078724544998749479418620} +#define T_2048_9 {0.9996188224951786382987961587787140160799,-0.0276081457789657432055907548829054576345} +#define T_2048_11 {0.9994306045554617323745105750276707112789,-0.0337411718513775868433235416432580677792} +#define T_2048_13 {0.9992047586183638863133182894671335816383,-0.0398729275877398106620042028680472867563} +#define T_2048_15 {0.9989412931868568712445721757831051945686,-0.0460031821309146299325831819260201882571} +#define T_2048_17 {0.9986402181802652711084533621033187955618,-0.0521317046802833236607277456187148345634} +#define T_2048_19 {0.9983015449338928926081848658213857561350,-0.0582582645004357593809807269735756563023} +#define T_2048_21 {0.9979252861985959954793656834226567298174,-0.0643826309298574650519242368318373337388} +#define T_2048_23 {0.9975114561403034540987277978274505585432,-0.0705045733896138698826305812872305978090} +#define T_2048_25 {0.9970600703394829622538964031264185905457,-0.0766238613920314920457954599442018661648} +#define T_2048_27 {0.9965711457905548353863878219272010028362,-0.0827402645493756916383887300980859436095} +#define T_2048_29 {0.9960447009012519670179131026088725775480,-0.0888535525825246003117641180324426386505} +#define T_2048_31 {0.9954807554919269385607094591250643134117,-0.0949634953296390055266229524022492114455} +#define T_2048_33 
{0.9948793307948056163780847782618366181850,-0.1010698627548278216714550126198446378112} +#define T_2048_35 {0.9942404494531879022289899694442283362150,-0.1071724249568088427331247203255770727992} +#define T_2048_37 {0.9935641355205953040297117695445194840431,-0.1132709521775643463081806316949950996786} +#define T_2048_39 {0.9928504144598651048880810776608996093273,-0.1193652148109913685436467289946449454874} +#define T_2048_41 {0.9920993131421917965440115949604660272598,-0.1254549834115462336736612769527710042894} +#define T_2048_43 {0.9913108598461154441494613820395898073912,-0.1315400287028831161073583189136115834117} +#define T_2048_45 {0.9904850842564570934101197963173035532236,-0.1376201215864860383231160767536493949592} +#define T_2048_47 {0.9896220174632008870219124219147488474846,-0.1436950331502944433470503327043843455613} +#define T_2048_49 {0.9887216919603237785807436921459157019854,-0.1497645346773215091484843242142233066261} +#define T_2048_51 {0.9877841416445721778316624295257497578859,-0.1558283976542652327079707674784003756940} +#define T_2048_53 {0.9868094018141855272574503032956272363663,-0.1618863937801118257908683517598547041416} +#define T_2048_55 {0.9857975091675674761404479795601218938828,-0.1679382949747311726262921638408442959189} +#define T_2048_57 {0.9847485018019042080084091139724478125572,-0.1739838733874638221355013456559390760958} +#define T_2048_59 {0.9836624192117302545312895745155401527882,-0.1800229014056995147097239851063932292163} +#define T_2048_61 {0.9825393022874412407574595818005036562681,-0.1860551516634466606703313118487130850554} +#define T_2048_63 {0.9813791933137545608900609295233152806759,-0.1920803970498924373444538105104584246874} +#define T_2048_65 {0.9801821359681174294919969725015107542276,-0.1980984107179535880227660982200177386403} +#define T_2048_67 {0.9789481753190621970972529197752010077238,-0.2041089660928168680875671725516440346837} +#define T_2048_69 
{0.9776773578245099294292685954133048653603,-0.2101118368804696101559414955772808752954} +#define T_2048_71 {0.9763697313300211400033390418684575706720,-0.2161067970762195200595812138999463059008} +#define T_2048_73 {0.9750253450669941202022528159432113170624,-0.2220936209732035371278158208951936103404} +#define T_2048_75 {0.9736442496508119770481926025240682065487,-0.2280720831708857310182736455317353829741} +#define T_2048_77 {0.9722264970789362692471513582859188318253,-0.2340419585835434301834823145327391102910} +#define T_2048_79 {0.9707721407289503501303329358051996678114,-0.2400030224487414987066813409910537302494} +#define T_2048_81 {0.9692812353565485317119510000338777899742,-0.2459550503357946227289687612937996163964} +#define T_2048_83 {0.9677538370934755107555247377604246139526,-0.2518978181542169680895426608913112431765} +#define T_2048_85 {0.9661900034454125041349925595568493008614,-0.2578311021621589871344326638791244477034} +#define T_2048_87 {0.9645897932898127580259028945874888449907,-0.2637546789748314024492970020219217985868} +#define T_2048_89 {0.9629532668736838774137254404195118695498,-0.2696683255729150907598068442894145846367} +#define T_2048_91 {0.9612804858113206396552641308517195284367,-0.2755718193109581437560962058341829106212} +#define T_2048_93 {0.9595715130819845173348880962294060736895,-0.2814649379257579964175306486140470951796} +#define T_2048_95 {0.9578264130275329080177471041679382324219,-0.2873474595447295110162144737842027097940} +#define T_2048_97 {0.9560452513499964055654345429502427577972,-0.2932191626942586282211777870543301105499} +#define T_2048_99 {0.9542280951091056673263324228173587471247,-0.2990798263080404750802188118541380390525} +#define T_2048_101 {0.9523750127197658787991940698702819645405,-0.3049292297354024294797625316277844831347} +#define T_2048_103 {0.9504860739494817023498285379901062697172,-0.3107671527496114749489208861632505431771} +#define T_2048_105 
{0.9485613499157302674902325634320732206106,-0.3165933755561658458077545219566673040390} +#define T_2048_107 {0.9466009130832835349877996122813783586025,-0.3224076788010698524367114714550552889705} +#define T_2048_109 {0.9446048372614802568492109458020422607660,-0.3282098435790925528010575362714007496834} +#define T_2048_111 {0.9425731976014468660451939285849221050739,-0.3339996514420093820518786742468364536762} +#define T_2048_113 {0.9405060705932682951768697421357501298189,-0.3397768844068268512259578528755810111761} +#define T_2048_115 {0.9384035340631080579498757288092747330666,-0.3455413249639890382880480501626152545214} +#define T_2048_117 {0.9362656671702782595900771411834284663200,-0.3512927560855671482720197218441171571612} +#define T_2048_119 {0.9340925504042588700670535217795986682177,-0.3570309612334300330971359471732284873724} +#define T_2048_121 {0.9318842655816681475045015758951194584370,-0.3627557243673972253716897284903097897768} +#define T_2048_123 {0.9296408958431812141753880496253259480000,-0.3684668299533723212491054255224298685789} +#define T_2048_125 {0.9273625256504011149516486511856783181429,-0.3741640629714579890929826433421112596989} +#define T_2048_127 {0.9250492407826775842494271273608319461346,-0.3798472089240511606611505612818291410804} +#define T_2048_129 {0.9227011283338785174734653082850854843855,-0.3855160538439188488979425528668798506260} +#define T_2048_131 {0.9203182767091105942469653200532775372267,-0.3911703843022538706897250904148677363992} +#define T_2048_133 {0.9179007756213904967168559778656344860792,-0.3968099874167103080502272405283292755485} +#define T_2048_135 {0.9154487160882678331574879848631098866463,-0.4024346508594184301799145941913593560457} +#define T_2048_137 {0.9129621904283982125605234614340588450432,-0.4080441628649786878213490126654505729675} +#define T_2048_139 {0.9104412922580672473671370426018256694078,-0.4136383122384345578659292641532374545932} +#define T_2048_141 
{0.9078861164876662614986457811028230935335,-0.4192168883632239606562563949410105124116} +#define T_2048_143 {0.9052967593181188155071481560298707336187,-0.4247796812091088058949139849573839455843} +#define T_2048_145 {0.9026733182372588260022894246503710746765,-0.4303264813400826116485120564902899786830} +#define T_2048_147 {0.9000158920161602793541533173993229866028,-0.4358570799222554748020286297105485573411} +#define T_2048_149 {0.8973245807054183176276751510158646851778,-0.4413712687317166705192050812911475077271} +#define T_2048_151 {0.8945994856313826959492985224642325192690,-0.4468688401623742145751805310283089056611} +#define T_2048_153 {0.8918407093923427231274558835139032453299,-0.4523495872337708889610041751438984647393} +#define T_2048_155 {0.8890483558546645737052926961041521281004,-0.4578133035988772303603866475896211341023} +#define T_2048_157 {0.8862225301488806383787277809460647404194,-0.4632597835518602047422120904229814186692} +#define T_2048_159 {0.8833633386657315789136646344559267163277,-0.4686888220358279566468695520597975701094} +#define T_2048_161 {0.8804708890521607544954463264730293303728,-0.4741002146505500225437401695671724155545} +#define T_2048_163 {0.8775452902072612415551589037931989878416,-0.4794937576601530082598401349969208240509} +#define T_2048_165 {0.8745866522781761132065980746119748800993,-0.4848692480007911198569559019233565777540} +#define T_2048_167 {0.8715950866559510901154794737522024661303,-0.4902264832882911593792130133806494995952} +#define T_2048_169 {0.8685707059713408950685220588638912886381,-0.4955652618257725405825908637780230492353} +#define T_2048_171 {0.8655136240905690891977997125650290399790,-0.5008853826112408258453001508314628154039} +#define T_2048_173 {0.8624239561110405016819413503981195390224,-0.5061866453451553393705353300902061164379} +#define T_2048_175 {0.8593018183570083623479263223998714238405,-0.5114688504379704125923922219953965395689} +#define T_2048_177 
{0.8561473283751944718389381705492269247770,-0.5167317990176498732068921526661142706871} +#define T_2048_179 {0.8529606049303636305936038297659251838923,-0.5219752929371543892500540096079930663109} +#define T_2048_181 {0.8497417680008524376589207349752541631460,-0.5271991347819013906672580560552887618542} +#define T_2048_183 {0.8464909387740521262699644466920290142298,-0.5324031278771980124631113540090154856443} +#define T_2048_185 {0.8432082396418454361963767951237969100475,-0.5375870762956455051195803207519929856062} +#define T_2048_187 {0.8398937941959995212570788680750411003828,-0.5427507848645158894385076564503833651543} +#define T_2048_189 {0.8365477272235120054233448172453790903091,-0.5478940591731001896746988677477929741144} +#define T_2048_191 {0.8331701647019131851124029708444140851498,-0.5530167055800275788257636122580152004957} +#define T_2048_193 {0.8297612337945230454039347023353911936283,-0.5581185312205561022125266390503384172916} +#define T_2048_195 {0.8263210628456635342686809053702745586634,-0.5631993440138340911715886250021867454052} +#define T_2048_197 {0.8228497813758263168537609999475535005331,-0.5682589526701316007262221319251693785191} +#define T_2048_199 {0.8193475200767970090254266324336640536785,-0.5732971666980422043025100720115005970001} +#define T_2048_201 {0.8158144108067337807455032816505990922451,-0.5783137964116555895799365316634066402912} +#define T_2048_203 {0.8122505865852038819951985715306363999844,-0.5833086529376982909411708533298224210739} +#define T_2048_205 {0.8086561815881749826218083398998714983463,-0.5882815482226453340786065382417291402817} +#define T_2048_207 {0.8050313311429635465543697137036360800266,-0.5932322950397997951554884821234736591578} +#define T_2048_209 {0.8013761717231402403882611906738020479679,-0.5981607069963422729230728691618423908949} +#define T_2048_211 {0.7976908409433911550934226397657766938210,-0.6030665985403481643700729364354629069567} +#define T_2048_213 
{0.7939754775543371723145469331939239054918,-0.6079497849677736320828103089297655969858} +#define T_2048_215 {0.7902302214373100319733111973619088530540,-0.6128100824294097082045595925592351704836} +#define T_2048_217 {0.7864552135990857673064624577818904072046,-0.6176473079378039798825739126186817884445} +#define T_2048_219 {0.7826505961665757293843626030138693749905,-0.6224612793741499672250938601791858673096} +#define T_2048_221 {0.7788165123814759782661099052347708493471,-0.6272518154951440827460373839130625128746} +#define T_2048_223 {0.7749531065948739305682124722807202488184,-0.6320187359398090620743460021913051605225} +#define T_2048_225 {0.7710605242618138177590481063816696405411,-0.6367618612362841989948947230004705488682} +#define T_2048_227 {0.7671389119358204000675982570101041346788,-0.6414810128085831619770829092885833233595} +#define T_2048_229 {0.7631884172633812690733634553907904773951,-0.6461760129833163945889396018174011260271} +#define T_2048_231 {0.7592091889783880720443676182185299694538,-0.6508466849963808753543048624123912304640} +#define T_2048_233 {0.7552013768965365470009487580682616680861,-0.6554928529996153496739452748443000018597} +#define T_2048_235 {0.7511651319096863677060582631384022533894,-0.6601143420674204786990912907640449702740} +#define T_2048_237 {0.7471006059801801324482539712334983050823,-0.6647109782033449043581185833318158984184} +#define T_2048_239 {0.7430079521351217186619919630174990743399,-0.6692825883466361203133487833838444203138} +#define T_2048_241 {0.7388873244606151136082416996941901743412,-0.6738290003787560378256671356211882084608} +#define T_2048_243 {0.7347388780959634990708195800834801048040,-0.6783500431298614685715619998518377542496} +#define T_2048_245 {0.7305627692278275908677187544526532292366,-0.6828455463852480811226541845826432108879} +#define T_2048_247 {0.7263591550843460087349967579939402639866,-0.6873153408917591633553456631489098072052} +#define T_2048_249 
{0.7221281939292153451148692511196713894606,-0.6917592583641577475006556596781592816114} +#define T_2048_251 {0.7178700450557317092048492668254766613245,-0.6961771314914629860126638050132896751165} +#define T_2048_253 {0.7135848687807936352456295026058796793222,-0.7005687939432483357649061872507445514202} +#define T_2048_255 {0.7092728264388656889138928818283602595329,-0.7049340803759048812437981723633129149675} +#define T_2048_257 {0.7049340803759048812437981723633129149675,-0.7092728264388656889138928818283602595329} +#define T_2048_259 {0.7005687939432483357649061872507445514202,-0.7135848687807936352456295026058796793222} +#define T_2048_261 {0.6961771314914629860126638050132896751165,-0.7178700450557317092048492668254766613245} +#define T_2048_263 {0.6917592583641577475006556596781592816114,-0.7221281939292153451148692511196713894606} +#define T_2048_265 {0.6873153408917591633553456631489098072052,-0.7263591550843460087349967579939402639866} +#define T_2048_267 {0.6828455463852480811226541845826432108879,-0.7305627692278275908677187544526532292366} +#define T_2048_269 {0.6783500431298614685715619998518377542496,-0.7347388780959634990708195800834801048040} +#define T_2048_271 {0.6738290003787560378256671356211882084608,-0.7388873244606151136082416996941901743412} +#define T_2048_273 {0.6692825883466361203133487833838444203138,-0.7430079521351217186619919630174990743399} +#define T_2048_275 {0.6647109782033449043581185833318158984184,-0.7471006059801801324482539712334983050823} +#define T_2048_277 {0.6601143420674204786990912907640449702740,-0.7511651319096863677060582631384022533894} +#define T_2048_279 {0.6554928529996153496739452748443000018597,-0.7552013768965365470009487580682616680861} +#define T_2048_281 {0.6508466849963808753543048624123912304640,-0.7592091889783880720443676182185299694538} +#define T_2048_283 {0.6461760129833163945889396018174011260271,-0.7631884172633812690733634553907904773951} +#define T_2048_285 
{0.6414810128085831619770829092885833233595,-0.7671389119358204000675982570101041346788} +#define T_2048_287 {0.6367618612362841989948947230004705488682,-0.7710605242618138177590481063816696405411} +#define T_2048_289 {0.6320187359398090620743460021913051605225,-0.7749531065948739305682124722807202488184} +#define T_2048_291 {0.6272518154951440827460373839130625128746,-0.7788165123814759782661099052347708493471} +#define T_2048_293 {0.6224612793741499672250938601791858673096,-0.7826505961665757293843626030138693749905} +#define T_2048_295 {0.6176473079378039798825739126186817884445,-0.7864552135990857673064624577818904072046} +#define T_2048_297 {0.6128100824294097082045595925592351704836,-0.7902302214373100319733111973619088530540} +#define T_2048_299 {0.6079497849677736320828103089297655969858,-0.7939754775543371723145469331939239054918} +#define T_2048_301 {0.6030665985403481643700729364354629069567,-0.7976908409433911550934226397657766938210} +#define T_2048_303 {0.5981607069963422729230728691618423908949,-0.8013761717231402403882611906738020479679} +#define T_2048_305 {0.5932322950397997951554884821234736591578,-0.8050313311429635465543697137036360800266} +#define T_2048_307 {0.5882815482226453340786065382417291402817,-0.8086561815881749826218083398998714983463} +#define T_2048_309 {0.5833086529376982909411708533298224210739,-0.8122505865852038819951985715306363999844} +#define T_2048_311 {0.5783137964116555895799365316634066402912,-0.8158144108067337807455032816505990922451} +#define T_2048_313 {0.5732971666980422043025100720115005970001,-0.8193475200767970090254266324336640536785} +#define T_2048_315 {0.5682589526701316007262221319251693785191,-0.8228497813758263168537609999475535005331} +#define T_2048_317 {0.5631993440138340911715886250021867454052,-0.8263210628456635342686809053702745586634} +#define T_2048_319 {0.5581185312205561022125266390503384172916,-0.8297612337945230454039347023353911936283} +#define T_2048_321 
{0.5530167055800275788257636122580152004957,-0.8331701647019131851124029708444140851498} +#define T_2048_323 {0.5478940591731001896746988677477929741144,-0.8365477272235120054233448172453790903091} +#define T_2048_325 {0.5427507848645158894385076564503833651543,-0.8398937941959995212570788680750411003828} +#define T_2048_327 {0.5375870762956455051195803207519929856062,-0.8432082396418454361963767951237969100475} +#define T_2048_329 {0.5324031278771980124631113540090154856443,-0.8464909387740521262699644466920290142298} +#define T_2048_331 {0.5271991347819013906672580560552887618542,-0.8497417680008524376589207349752541631460} +#define T_2048_333 {0.5219752929371543892500540096079930663109,-0.8529606049303636305936038297659251838923} +#define T_2048_335 {0.5167317990176498732068921526661142706871,-0.8561473283751944718389381705492269247770} +#define T_2048_337 {0.5114688504379704125923922219953965395689,-0.8593018183570083623479263223998714238405} +#define T_2048_339 {0.5061866453451553393705353300902061164379,-0.8624239561110405016819413503981195390224} +#define T_2048_341 {0.5008853826112408258453001508314628154039,-0.8655136240905690891977997125650290399790} +#define T_2048_343 {0.4955652618257725405825908637780230492353,-0.8685707059713408950685220588638912886381} +#define T_2048_345 {0.4902264832882911593792130133806494995952,-0.8715950866559510901154794737522024661303} +#define T_2048_347 {0.4848692480007911198569559019233565777540,-0.8745866522781761132065980746119748800993} +#define T_2048_349 {0.4794937576601530082598401349969208240509,-0.8775452902072612415551589037931989878416} +#define T_2048_351 {0.4741002146505500225437401695671724155545,-0.8804708890521607544954463264730293303728} +#define T_2048_353 {0.4686888220358279566468695520597975701094,-0.8833633386657315789136646344559267163277} +#define T_2048_355 {0.4632597835518602047422120904229814186692,-0.8862225301488806383787277809460647404194} +#define T_2048_357 
{0.4578133035988772303603866475896211341023,-0.8890483558546645737052926961041521281004} +#define T_2048_359 {0.4523495872337708889610041751438984647393,-0.8918407093923427231274558835139032453299} +#define T_2048_361 {0.4468688401623742145751805310283089056611,-0.8945994856313826959492985224642325192690} +#define T_2048_363 {0.4413712687317166705192050812911475077271,-0.8973245807054183176276751510158646851778} +#define T_2048_365 {0.4358570799222554748020286297105485573411,-0.9000158920161602793541533173993229866028} +#define T_2048_367 {0.4303264813400826116485120564902899786830,-0.9026733182372588260022894246503710746765} +#define T_2048_369 {0.4247796812091088058949139849573839455843,-0.9052967593181188155071481560298707336187} +#define T_2048_371 {0.4192168883632239606562563949410105124116,-0.9078861164876662614986457811028230935335} +#define T_2048_373 {0.4136383122384345578659292641532374545932,-0.9104412922580672473671370426018256694078} +#define T_2048_375 {0.4080441628649786878213490126654505729675,-0.9129621904283982125605234614340588450432} +#define T_2048_377 {0.4024346508594184301799145941913593560457,-0.9154487160882678331574879848631098866463} +#define T_2048_379 {0.3968099874167103080502272405283292755485,-0.9179007756213904967168559778656344860792} +#define T_2048_381 {0.3911703843022538706897250904148677363992,-0.9203182767091105942469653200532775372267} +#define T_2048_383 {0.3855160538439188488979425528668798506260,-0.9227011283338785174734653082850854843855} +#define T_2048_385 {0.3798472089240511606611505612818291410804,-0.9250492407826775842494271273608319461346} +#define T_2048_387 {0.3741640629714579890929826433421112596989,-0.9273625256504011149516486511856783181429} +#define T_2048_389 {0.3684668299533723212491054255224298685789,-0.9296408958431812141753880496253259480000} +#define T_2048_391 {0.3627557243673972253716897284903097897768,-0.9318842655816681475045015758951194584370} +#define T_2048_393 
{0.3570309612334300330971359471732284873724,-0.9340925504042588700670535217795986682177} +#define T_2048_395 {0.3512927560855671482720197218441171571612,-0.9362656671702782595900771411834284663200} +#define T_2048_397 {0.3455413249639890382880480501626152545214,-0.9384035340631080579498757288092747330666} +#define T_2048_399 {0.3397768844068268512259578528755810111761,-0.9405060705932682951768697421357501298189} +#define T_2048_401 {0.3339996514420093820518786742468364536762,-0.9425731976014468660451939285849221050739} +#define T_2048_403 {0.3282098435790925528010575362714007496834,-0.9446048372614802568492109458020422607660} +#define T_2048_405 {0.3224076788010698524367114714550552889705,-0.9466009130832835349877996122813783586025} +#define T_2048_407 {0.3165933755561658458077545219566673040390,-0.9485613499157302674902325634320732206106} +#define T_2048_409 {0.3107671527496114749489208861632505431771,-0.9504860739494817023498285379901062697172} +#define T_2048_411 {0.3049292297354024294797625316277844831347,-0.9523750127197658787991940698702819645405} +#define T_2048_413 {0.2990798263080404750802188118541380390525,-0.9542280951091056673263324228173587471247} +#define T_2048_415 {0.2932191626942586282211777870543301105499,-0.9560452513499964055654345429502427577972} +#define T_2048_417 {0.2873474595447295110162144737842027097940,-0.9578264130275329080177471041679382324219} +#define T_2048_419 {0.2814649379257579964175306486140470951796,-0.9595715130819845173348880962294060736895} +#define T_2048_421 {0.2755718193109581437560962058341829106212,-0.9612804858113206396552641308517195284367} +#define T_2048_423 {0.2696683255729150907598068442894145846367,-0.9629532668736838774137254404195118695498} +#define T_2048_425 {0.2637546789748314024492970020219217985868,-0.9645897932898127580259028945874888449907} +#define T_2048_427 {0.2578311021621589871344326638791244477034,-0.9661900034454125041349925595568493008614} +#define T_2048_429 
{0.2518978181542169680895426608913112431765,-0.9677538370934755107555247377604246139526} +#define T_2048_431 {0.2459550503357946227289687612937996163964,-0.9692812353565485317119510000338777899742} +#define T_2048_433 {0.2400030224487414987066813409910537302494,-0.9707721407289503501303329358051996678114} +#define T_2048_435 {0.2340419585835434301834823145327391102910,-0.9722264970789362692471513582859188318253} +#define T_2048_437 {0.2280720831708857310182736455317353829741,-0.9736442496508119770481926025240682065487} +#define T_2048_439 {0.2220936209732035371278158208951936103404,-0.9750253450669941202022528159432113170624} +#define T_2048_441 {0.2161067970762195200595812138999463059008,-0.9763697313300211400033390418684575706720} +#define T_2048_443 {0.2101118368804696101559414955772808752954,-0.9776773578245099294292685954133048653603} +#define T_2048_445 {0.2041089660928168680875671725516440346837,-0.9789481753190621970972529197752010077238} +#define T_2048_447 {0.1980984107179535880227660982200177386403,-0.9801821359681174294919969725015107542276} +#define T_2048_449 {0.1920803970498924373444538105104584246874,-0.9813791933137545608900609295233152806759} +#define T_2048_451 {0.1860551516634466606703313118487130850554,-0.9825393022874412407574595818005036562681} +#define T_2048_453 {0.1800229014056995147097239851063932292163,-0.9836624192117302545312895745155401527882} +#define T_2048_455 {0.1739838733874638221355013456559390760958,-0.9847485018019042080084091139724478125572} +#define T_2048_457 {0.1679382949747311726262921638408442959189,-0.9857975091675674761404479795601218938828} +#define T_2048_459 {0.1618863937801118257908683517598547041416,-0.9868094018141855272574503032956272363663} +#define T_2048_461 {0.1558283976542652327079707674784003756940,-0.9877841416445721778316624295257497578859} +#define T_2048_463 {0.1497645346773215091484843242142233066261,-0.9887216919603237785807436921459157019854} +#define T_2048_465 
{0.1436950331502944433470503327043843455613,-0.9896220174632008870219124219147488474846} +#define T_2048_467 {0.1376201215864860383231160767536493949592,-0.9904850842564570934101197963173035532236} +#define T_2048_469 {0.1315400287028831161073583189136115834117,-0.9913108598461154441494613820395898073912} +#define T_2048_471 {0.1254549834115462336736612769527710042894,-0.9920993131421917965440115949604660272598} +#define T_2048_473 {0.1193652148109913685436467289946449454874,-0.9928504144598651048880810776608996093273} +#define T_2048_475 {0.1132709521775643463081806316949950996786,-0.9935641355205953040297117695445194840431} +#define T_2048_477 {0.1071724249568088427331247203255770727992,-0.9942404494531879022289899694442283362150} +#define T_2048_479 {0.1010698627548278216714550126198446378112,-0.9948793307948056163780847782618366181850} +#define T_2048_481 {0.0949634953296390055266229524022492114455,-0.9954807554919269385607094591250643134117} +#define T_2048_483 {0.0888535525825246003117641180324426386505,-0.9960447009012519670179131026088725775480} +#define T_2048_485 {0.0827402645493756916383887300980859436095,-0.9965711457905548353863878219272010028362} +#define T_2048_487 {0.0766238613920314920457954599442018661648,-0.9970600703394829622538964031264185905457} +#define T_2048_489 {0.0705045733896138698826305812872305978090,-0.9975114561403034540987277978274505585432} +#define T_2048_491 {0.0643826309298574650519242368318373337388,-0.9979252861985959954793656834226567298174} +#define T_2048_493 {0.0582582645004357593809807269735756563023,-0.9983015449338928926081848658213857561350} +#define T_2048_495 {0.0521317046802833236607277456187148345634,-0.9986402181802652711084533621033187955618} +#define T_2048_497 {0.0460031821309146299325831819260201882571,-0.9989412931868568712445721757831051945686} +#define T_2048_499 {0.0398729275877398106620042028680472867563,-0.9992047586183638863133182894671335816383} +#define T_2048_501 
{0.0337411718513775868433235416432580677792,-0.9994306045554617323745105750276707112789} +#define T_2048_503 {0.0276081457789657432055907548829054576345,-0.9996188224951786382987961587787140160799} +#define T_2048_505 {0.0214740802754695078724544998749479418620,-0.9997694053512152789764400040439795702696} +#define T_2048_507 {0.0153392062849881018887776562564795312937,-0.9998823474542125611108644989144522696733} +#define T_2048_509 {0.0092037547820598194364682953505507612135,-0.9999576445519638978609577861789148300886} +#define T_2048_511 {0.0030679567629659761432425746363605867373,-0.9999952938095761911796444110223092138767} +#define T_2048_513 {-0.0030679567629659761432425746363605867373,-0.9999952938095761911796444110223092138767} +#define T_2048_515 {-0.0092037547820598194364682953505507612135,-0.9999576445519638978609577861789148300886} +#define T_2048_517 {-0.0153392062849881018887776562564795312937,-0.9998823474542125611108644989144522696733} +#define T_2048_519 {-0.0214740802754695078724544998749479418620,-0.9997694053512152789764400040439795702696} +#define T_2048_521 {-0.0276081457789657432055907548829054576345,-0.9996188224951786382987961587787140160799} +#define T_2048_523 {-0.0337411718513775868433235416432580677792,-0.9994306045554617323745105750276707112789} +#define T_2048_525 {-0.0398729275877398106620042028680472867563,-0.9992047586183638863133182894671335816383} +#define T_2048_527 {-0.0460031821309146299325831819260201882571,-0.9989412931868568712445721757831051945686} +#define T_2048_529 {-0.0521317046802833236607277456187148345634,-0.9986402181802652711084533621033187955618} +#define T_2048_531 {-0.0582582645004357593809807269735756563023,-0.9983015449338928926081848658213857561350} +#define T_2048_533 {-0.0643826309298574650519242368318373337388,-0.9979252861985959954793656834226567298174} +#define T_2048_535 {-0.0705045733896138698826305812872305978090,-0.9975114561403034540987277978274505585432} +#define T_2048_537 
{-0.0766238613920314920457954599442018661648,-0.9970600703394829622538964031264185905457} +#define T_2048_539 {-0.0827402645493756916383887300980859436095,-0.9965711457905548353863878219272010028362} +#define T_2048_541 {-0.0888535525825246003117641180324426386505,-0.9960447009012519670179131026088725775480} +#define T_2048_543 {-0.0949634953296390055266229524022492114455,-0.9954807554919269385607094591250643134117} +#define T_2048_545 {-0.1010698627548278216714550126198446378112,-0.9948793307948056163780847782618366181850} +#define T_2048_547 {-0.1071724249568088427331247203255770727992,-0.9942404494531879022289899694442283362150} +#define T_2048_549 {-0.1132709521775643463081806316949950996786,-0.9935641355205953040297117695445194840431} +#define T_2048_551 {-0.1193652148109913685436467289946449454874,-0.9928504144598651048880810776608996093273} +#define T_2048_553 {-0.1254549834115462336736612769527710042894,-0.9920993131421917965440115949604660272598} +#define T_2048_555 {-0.1315400287028831161073583189136115834117,-0.9913108598461154441494613820395898073912} +#define T_2048_557 {-0.1376201215864860383231160767536493949592,-0.9904850842564570934101197963173035532236} +#define T_2048_559 {-0.1436950331502944433470503327043843455613,-0.9896220174632008870219124219147488474846} +#define T_2048_561 {-0.1497645346773215091484843242142233066261,-0.9887216919603237785807436921459157019854} +#define T_2048_563 {-0.1558283976542652327079707674784003756940,-0.9877841416445721778316624295257497578859} +#define T_2048_565 {-0.1618863937801118257908683517598547041416,-0.9868094018141855272574503032956272363663} +#define T_2048_567 {-0.1679382949747311726262921638408442959189,-0.9857975091675674761404479795601218938828} +#define T_2048_569 {-0.1739838733874638221355013456559390760958,-0.9847485018019042080084091139724478125572} +#define T_2048_571 {-0.1800229014056995147097239851063932292163,-0.9836624192117302545312895745155401527882} +#define T_2048_573 
{-0.1860551516634466606703313118487130850554,-0.9825393022874412407574595818005036562681} +#define T_2048_575 {-0.1920803970498924373444538105104584246874,-0.9813791933137545608900609295233152806759} +#define T_2048_577 {-0.1980984107179535880227660982200177386403,-0.9801821359681174294919969725015107542276} +#define T_2048_579 {-0.2041089660928168680875671725516440346837,-0.9789481753190621970972529197752010077238} +#define T_2048_581 {-0.2101118368804696101559414955772808752954,-0.9776773578245099294292685954133048653603} +#define T_2048_583 {-0.2161067970762195200595812138999463059008,-0.9763697313300211400033390418684575706720} +#define T_2048_585 {-0.2220936209732035371278158208951936103404,-0.9750253450669941202022528159432113170624} +#define T_2048_587 {-0.2280720831708857310182736455317353829741,-0.9736442496508119770481926025240682065487} +#define T_2048_589 {-0.2340419585835434301834823145327391102910,-0.9722264970789362692471513582859188318253} +#define T_2048_591 {-0.2400030224487414987066813409910537302494,-0.9707721407289503501303329358051996678114} +#define T_2048_593 {-0.2459550503357946227289687612937996163964,-0.9692812353565485317119510000338777899742} +#define T_2048_595 {-0.2518978181542169680895426608913112431765,-0.9677538370934755107555247377604246139526} +#define T_2048_597 {-0.2578311021621589871344326638791244477034,-0.9661900034454125041349925595568493008614} +#define T_2048_599 {-0.2637546789748314024492970020219217985868,-0.9645897932898127580259028945874888449907} +#define T_2048_601 {-0.2696683255729150907598068442894145846367,-0.9629532668736838774137254404195118695498} +#define T_2048_603 {-0.2755718193109581437560962058341829106212,-0.9612804858113206396552641308517195284367} +#define T_2048_605 {-0.2814649379257579964175306486140470951796,-0.9595715130819845173348880962294060736895} +#define T_2048_607 {-0.2873474595447295110162144737842027097940,-0.9578264130275329080177471041679382324219} +#define T_2048_609 
{-0.2932191626942586282211777870543301105499,-0.9560452513499964055654345429502427577972} +#define T_2048_611 {-0.2990798263080404750802188118541380390525,-0.9542280951091056673263324228173587471247} +#define T_2048_613 {-0.3049292297354024294797625316277844831347,-0.9523750127197658787991940698702819645405} +#define T_2048_615 {-0.3107671527496114749489208861632505431771,-0.9504860739494817023498285379901062697172} +#define T_2048_617 {-0.3165933755561658458077545219566673040390,-0.9485613499157302674902325634320732206106} +#define T_2048_619 {-0.3224076788010698524367114714550552889705,-0.9466009130832835349877996122813783586025} +#define T_2048_621 {-0.3282098435790925528010575362714007496834,-0.9446048372614802568492109458020422607660} +#define T_2048_623 {-0.3339996514420093820518786742468364536762,-0.9425731976014468660451939285849221050739} +#define T_2048_625 {-0.3397768844068268512259578528755810111761,-0.9405060705932682951768697421357501298189} +#define T_2048_627 {-0.3455413249639890382880480501626152545214,-0.9384035340631080579498757288092747330666} +#define T_2048_629 {-0.3512927560855671482720197218441171571612,-0.9362656671702782595900771411834284663200} +#define T_2048_631 {-0.3570309612334300330971359471732284873724,-0.9340925504042588700670535217795986682177} +#define T_2048_633 {-0.3627557243673972253716897284903097897768,-0.9318842655816681475045015758951194584370} +#define T_2048_635 {-0.3684668299533723212491054255224298685789,-0.9296408958431812141753880496253259480000} +#define T_2048_637 {-0.3741640629714579890929826433421112596989,-0.9273625256504011149516486511856783181429} +#define T_2048_639 {-0.3798472089240511606611505612818291410804,-0.9250492407826775842494271273608319461346} +#define T_2048_641 {-0.3855160538439188488979425528668798506260,-0.9227011283338785174734653082850854843855} +#define T_2048_643 {-0.3911703843022538706897250904148677363992,-0.9203182767091105942469653200532775372267} +#define T_2048_645 
{-0.3968099874167103080502272405283292755485,-0.9179007756213904967168559778656344860792} +#define T_2048_647 {-0.4024346508594184301799145941913593560457,-0.9154487160882678331574879848631098866463} +#define T_2048_649 {-0.4080441628649786878213490126654505729675,-0.9129621904283982125605234614340588450432} +#define T_2048_651 {-0.4136383122384345578659292641532374545932,-0.9104412922580672473671370426018256694078} +#define T_2048_653 {-0.4192168883632239606562563949410105124116,-0.9078861164876662614986457811028230935335} +#define T_2048_655 {-0.4247796812091088058949139849573839455843,-0.9052967593181188155071481560298707336187} +#define T_2048_657 {-0.4303264813400826116485120564902899786830,-0.9026733182372588260022894246503710746765} +#define T_2048_659 {-0.4358570799222554748020286297105485573411,-0.9000158920161602793541533173993229866028} +#define T_2048_661 {-0.4413712687317166705192050812911475077271,-0.8973245807054183176276751510158646851778} +#define T_2048_663 {-0.4468688401623742145751805310283089056611,-0.8945994856313826959492985224642325192690} +#define T_2048_665 {-0.4523495872337708889610041751438984647393,-0.8918407093923427231274558835139032453299} +#define T_2048_667 {-0.4578133035988772303603866475896211341023,-0.8890483558546645737052926961041521281004} +#define T_2048_669 {-0.4632597835518602047422120904229814186692,-0.8862225301488806383787277809460647404194} +#define T_2048_671 {-0.4686888220358279566468695520597975701094,-0.8833633386657315789136646344559267163277} +#define T_2048_673 {-0.4741002146505500225437401695671724155545,-0.8804708890521607544954463264730293303728} +#define T_2048_675 {-0.4794937576601530082598401349969208240509,-0.8775452902072612415551589037931989878416} +#define T_2048_677 {-0.4848692480007911198569559019233565777540,-0.8745866522781761132065980746119748800993} +#define T_2048_679 {-0.4902264832882911593792130133806494995952,-0.8715950866559510901154794737522024661303} +#define T_2048_681 
{-0.4955652618257725405825908637780230492353,-0.8685707059713408950685220588638912886381} +#define T_2048_683 {-0.5008853826112408258453001508314628154039,-0.8655136240905690891977997125650290399790} +#define T_2048_685 {-0.5061866453451553393705353300902061164379,-0.8624239561110405016819413503981195390224} +#define T_2048_687 {-0.5114688504379704125923922219953965395689,-0.8593018183570083623479263223998714238405} +#define T_2048_689 {-0.5167317990176498732068921526661142706871,-0.8561473283751944718389381705492269247770} +#define T_2048_691 {-0.5219752929371543892500540096079930663109,-0.8529606049303636305936038297659251838923} +#define T_2048_693 {-0.5271991347819013906672580560552887618542,-0.8497417680008524376589207349752541631460} +#define T_2048_695 {-0.5324031278771980124631113540090154856443,-0.8464909387740521262699644466920290142298} +#define T_2048_697 {-0.5375870762956455051195803207519929856062,-0.8432082396418454361963767951237969100475} +#define T_2048_699 {-0.5427507848645158894385076564503833651543,-0.8398937941959995212570788680750411003828} +#define T_2048_701 {-0.5478940591731001896746988677477929741144,-0.8365477272235120054233448172453790903091} +#define T_2048_703 {-0.5530167055800275788257636122580152004957,-0.8331701647019131851124029708444140851498} +#define T_2048_705 {-0.5581185312205561022125266390503384172916,-0.8297612337945230454039347023353911936283} +#define T_2048_707 {-0.5631993440138340911715886250021867454052,-0.8263210628456635342686809053702745586634} +#define T_2048_709 {-0.5682589526701316007262221319251693785191,-0.8228497813758263168537609999475535005331} +#define T_2048_711 {-0.5732971666980422043025100720115005970001,-0.8193475200767970090254266324336640536785} +#define T_2048_713 {-0.5783137964116555895799365316634066402912,-0.8158144108067337807455032816505990922451} +#define T_2048_715 {-0.5833086529376982909411708533298224210739,-0.8122505865852038819951985715306363999844} +#define T_2048_717 
{-0.5882815482226453340786065382417291402817,-0.8086561815881749826218083398998714983463} +#define T_2048_719 {-0.5932322950397997951554884821234736591578,-0.8050313311429635465543697137036360800266} +#define T_2048_721 {-0.5981607069963422729230728691618423908949,-0.8013761717231402403882611906738020479679} +#define T_2048_723 {-0.6030665985403481643700729364354629069567,-0.7976908409433911550934226397657766938210} +#define T_2048_725 {-0.6079497849677736320828103089297655969858,-0.7939754775543371723145469331939239054918} +#define T_2048_727 {-0.6128100824294097082045595925592351704836,-0.7902302214373100319733111973619088530540} +#define T_2048_729 {-0.6176473079378039798825739126186817884445,-0.7864552135990857673064624577818904072046} +#define T_2048_731 {-0.6224612793741499672250938601791858673096,-0.7826505961665757293843626030138693749905} +#define T_2048_733 {-0.6272518154951440827460373839130625128746,-0.7788165123814759782661099052347708493471} +#define T_2048_735 {-0.6320187359398090620743460021913051605225,-0.7749531065948739305682124722807202488184} +#define T_2048_737 {-0.6367618612362841989948947230004705488682,-0.7710605242618138177590481063816696405411} +#define T_2048_739 {-0.6414810128085831619770829092885833233595,-0.7671389119358204000675982570101041346788} +#define T_2048_741 {-0.6461760129833163945889396018174011260271,-0.7631884172633812690733634553907904773951} +#define T_2048_743 {-0.6508466849963808753543048624123912304640,-0.7592091889783880720443676182185299694538} +#define T_2048_745 {-0.6554928529996153496739452748443000018597,-0.7552013768965365470009487580682616680861} +#define T_2048_747 {-0.6601143420674204786990912907640449702740,-0.7511651319096863677060582631384022533894} +#define T_2048_749 {-0.6647109782033449043581185833318158984184,-0.7471006059801801324482539712334983050823} +#define T_2048_751 {-0.6692825883466361203133487833838444203138,-0.7430079521351217186619919630174990743399} +#define T_2048_753 
{-0.6738290003787560378256671356211882084608,-0.7388873244606151136082416996941901743412} +#define T_2048_755 {-0.6783500431298614685715619998518377542496,-0.7347388780959634990708195800834801048040} +#define T_2048_757 {-0.6828455463852480811226541845826432108879,-0.7305627692278275908677187544526532292366} +#define T_2048_759 {-0.6873153408917591633553456631489098072052,-0.7263591550843460087349967579939402639866} +#define T_2048_761 {-0.6917592583641577475006556596781592816114,-0.7221281939292153451148692511196713894606} +#define T_2048_763 {-0.6961771314914629860126638050132896751165,-0.7178700450557317092048492668254766613245} +#define T_2048_765 {-0.7005687939432483357649061872507445514202,-0.7135848687807936352456295026058796793222} +#define T_2048_767 {-0.7049340803759048812437981723633129149675,-0.7092728264388656889138928818283602595329} +#define T_2048_769 {-0.7092728264388656889138928818283602595329,-0.7049340803759048812437981723633129149675} +#define T_2048_771 {-0.7135848687807936352456295026058796793222,-0.7005687939432483357649061872507445514202} +#define T_2048_773 {-0.7178700450557317092048492668254766613245,-0.6961771314914629860126638050132896751165} +#define T_2048_775 {-0.7221281939292153451148692511196713894606,-0.6917592583641577475006556596781592816114} +#define T_2048_777 {-0.7263591550843460087349967579939402639866,-0.6873153408917591633553456631489098072052} +#define T_2048_779 {-0.7305627692278275908677187544526532292366,-0.6828455463852480811226541845826432108879} +#define T_2048_781 {-0.7347388780959634990708195800834801048040,-0.6783500431298614685715619998518377542496} +#define T_2048_783 {-0.7388873244606151136082416996941901743412,-0.6738290003787560378256671356211882084608} +#define T_2048_785 {-0.7430079521351217186619919630174990743399,-0.6692825883466361203133487833838444203138} +#define T_2048_787 {-0.7471006059801801324482539712334983050823,-0.6647109782033449043581185833318158984184} +#define T_2048_789 
{-0.7511651319096863677060582631384022533894,-0.6601143420674204786990912907640449702740} +#define T_2048_791 {-0.7552013768965365470009487580682616680861,-0.6554928529996153496739452748443000018597} +#define T_2048_793 {-0.7592091889783880720443676182185299694538,-0.6508466849963808753543048624123912304640} +#define T_2048_795 {-0.7631884172633812690733634553907904773951,-0.6461760129833163945889396018174011260271} +#define T_2048_797 {-0.7671389119358204000675982570101041346788,-0.6414810128085831619770829092885833233595} +#define T_2048_799 {-0.7710605242618138177590481063816696405411,-0.6367618612362841989948947230004705488682} +#define T_2048_801 {-0.7749531065948739305682124722807202488184,-0.6320187359398090620743460021913051605225} +#define T_2048_803 {-0.7788165123814759782661099052347708493471,-0.6272518154951440827460373839130625128746} +#define T_2048_805 {-0.7826505961665757293843626030138693749905,-0.6224612793741499672250938601791858673096} +#define T_2048_807 {-0.7864552135990857673064624577818904072046,-0.6176473079378039798825739126186817884445} +#define T_2048_809 {-0.7902302214373100319733111973619088530540,-0.6128100824294097082045595925592351704836} +#define T_2048_811 {-0.7939754775543371723145469331939239054918,-0.6079497849677736320828103089297655969858} +#define T_2048_813 {-0.7976908409433911550934226397657766938210,-0.6030665985403481643700729364354629069567} +#define T_2048_815 {-0.8013761717231402403882611906738020479679,-0.5981607069963422729230728691618423908949} +#define T_2048_817 {-0.8050313311429635465543697137036360800266,-0.5932322950397997951554884821234736591578} +#define T_2048_819 {-0.8086561815881749826218083398998714983463,-0.5882815482226453340786065382417291402817} +#define T_2048_821 {-0.8122505865852038819951985715306363999844,-0.5833086529376982909411708533298224210739} +#define T_2048_823 {-0.8158144108067337807455032816505990922451,-0.5783137964116555895799365316634066402912} +#define T_2048_825 
{-0.8193475200767970090254266324336640536785,-0.5732971666980422043025100720115005970001} +#define T_2048_827 {-0.8228497813758263168537609999475535005331,-0.5682589526701316007262221319251693785191} +#define T_2048_829 {-0.8263210628456635342686809053702745586634,-0.5631993440138340911715886250021867454052} +#define T_2048_831 {-0.8297612337945230454039347023353911936283,-0.5581185312205561022125266390503384172916} +#define T_2048_833 {-0.8331701647019131851124029708444140851498,-0.5530167055800275788257636122580152004957} +#define T_2048_835 {-0.8365477272235120054233448172453790903091,-0.5478940591731001896746988677477929741144} +#define T_2048_837 {-0.8398937941959995212570788680750411003828,-0.5427507848645158894385076564503833651543} +#define T_2048_839 {-0.8432082396418454361963767951237969100475,-0.5375870762956455051195803207519929856062} +#define T_2048_841 {-0.8464909387740521262699644466920290142298,-0.5324031278771980124631113540090154856443} +#define T_2048_843 {-0.8497417680008524376589207349752541631460,-0.5271991347819013906672580560552887618542} +#define T_2048_845 {-0.8529606049303636305936038297659251838923,-0.5219752929371543892500540096079930663109} +#define T_2048_847 {-0.8561473283751944718389381705492269247770,-0.5167317990176498732068921526661142706871} +#define T_2048_849 {-0.8593018183570083623479263223998714238405,-0.5114688504379704125923922219953965395689} +#define T_2048_851 {-0.8624239561110405016819413503981195390224,-0.5061866453451553393705353300902061164379} +#define T_2048_853 {-0.8655136240905690891977997125650290399790,-0.5008853826112408258453001508314628154039} +#define T_2048_855 {-0.8685707059713408950685220588638912886381,-0.4955652618257725405825908637780230492353} +#define T_2048_857 {-0.8715950866559510901154794737522024661303,-0.4902264832882911593792130133806494995952} +#define T_2048_859 {-0.8745866522781761132065980746119748800993,-0.4848692480007911198569559019233565777540} +#define T_2048_861 
{-0.8775452902072612415551589037931989878416,-0.4794937576601530082598401349969208240509} +#define T_2048_863 {-0.8804708890521607544954463264730293303728,-0.4741002146505500225437401695671724155545} +#define T_2048_865 {-0.8833633386657315789136646344559267163277,-0.4686888220358279566468695520597975701094} +#define T_2048_867 {-0.8862225301488806383787277809460647404194,-0.4632597835518602047422120904229814186692} +#define T_2048_869 {-0.8890483558546645737052926961041521281004,-0.4578133035988772303603866475896211341023} +#define T_2048_871 {-0.8918407093923427231274558835139032453299,-0.4523495872337708889610041751438984647393} +#define T_2048_873 {-0.8945994856313826959492985224642325192690,-0.4468688401623742145751805310283089056611} +#define T_2048_875 {-0.8973245807054183176276751510158646851778,-0.4413712687317166705192050812911475077271} +#define T_2048_877 {-0.9000158920161602793541533173993229866028,-0.4358570799222554748020286297105485573411} +#define T_2048_879 {-0.9026733182372588260022894246503710746765,-0.4303264813400826116485120564902899786830} +#define T_2048_881 {-0.9052967593181188155071481560298707336187,-0.4247796812091088058949139849573839455843} +#define T_2048_883 {-0.9078861164876662614986457811028230935335,-0.4192168883632239606562563949410105124116} +#define T_2048_885 {-0.9104412922580672473671370426018256694078,-0.4136383122384345578659292641532374545932} +#define T_2048_887 {-0.9129621904283982125605234614340588450432,-0.4080441628649786878213490126654505729675} +#define T_2048_889 {-0.9154487160882678331574879848631098866463,-0.4024346508594184301799145941913593560457} +#define T_2048_891 {-0.9179007756213904967168559778656344860792,-0.3968099874167103080502272405283292755485} +#define T_2048_893 {-0.9203182767091105942469653200532775372267,-0.3911703843022538706897250904148677363992} +#define T_2048_895 {-0.9227011283338785174734653082850854843855,-0.3855160538439188488979425528668798506260} +#define T_2048_897 
{-0.9250492407826775842494271273608319461346,-0.3798472089240511606611505612818291410804} +#define T_2048_899 {-0.9273625256504011149516486511856783181429,-0.3741640629714579890929826433421112596989} +#define T_2048_901 {-0.9296408958431812141753880496253259480000,-0.3684668299533723212491054255224298685789} +#define T_2048_903 {-0.9318842655816681475045015758951194584370,-0.3627557243673972253716897284903097897768} +#define T_2048_905 {-0.9340925504042588700670535217795986682177,-0.3570309612334300330971359471732284873724} +#define T_2048_907 {-0.9362656671702782595900771411834284663200,-0.3512927560855671482720197218441171571612} +#define T_2048_909 {-0.9384035340631080579498757288092747330666,-0.3455413249639890382880480501626152545214} +#define T_2048_911 {-0.9405060705932682951768697421357501298189,-0.3397768844068268512259578528755810111761} +#define T_2048_913 {-0.9425731976014468660451939285849221050739,-0.3339996514420093820518786742468364536762} +#define T_2048_915 {-0.9446048372614802568492109458020422607660,-0.3282098435790925528010575362714007496834} +#define T_2048_917 {-0.9466009130832835349877996122813783586025,-0.3224076788010698524367114714550552889705} +#define T_2048_919 {-0.9485613499157302674902325634320732206106,-0.3165933755561658458077545219566673040390} +#define T_2048_921 {-0.9504860739494817023498285379901062697172,-0.3107671527496114749489208861632505431771} +#define T_2048_923 {-0.9523750127197658787991940698702819645405,-0.3049292297354024294797625316277844831347} +#define T_2048_925 {-0.9542280951091056673263324228173587471247,-0.2990798263080404750802188118541380390525} +#define T_2048_927 {-0.9560452513499964055654345429502427577972,-0.2932191626942586282211777870543301105499} +#define T_2048_929 {-0.9578264130275329080177471041679382324219,-0.2873474595447295110162144737842027097940} +#define T_2048_931 {-0.9595715130819845173348880962294060736895,-0.2814649379257579964175306486140470951796} +#define T_2048_933 
{-0.9612804858113206396552641308517195284367,-0.2755718193109581437560962058341829106212} +#define T_2048_935 {-0.9629532668736838774137254404195118695498,-0.2696683255729150907598068442894145846367} +#define T_2048_937 {-0.9645897932898127580259028945874888449907,-0.2637546789748314024492970020219217985868} +#define T_2048_939 {-0.9661900034454125041349925595568493008614,-0.2578311021621589871344326638791244477034} +#define T_2048_941 {-0.9677538370934755107555247377604246139526,-0.2518978181542169680895426608913112431765} +#define T_2048_943 {-0.9692812353565485317119510000338777899742,-0.2459550503357946227289687612937996163964} +#define T_2048_945 {-0.9707721407289503501303329358051996678114,-0.2400030224487414987066813409910537302494} +#define T_2048_947 {-0.9722264970789362692471513582859188318253,-0.2340419585835434301834823145327391102910} +#define T_2048_949 {-0.9736442496508119770481926025240682065487,-0.2280720831708857310182736455317353829741} +#define T_2048_951 {-0.9750253450669941202022528159432113170624,-0.2220936209732035371278158208951936103404} +#define T_2048_953 {-0.9763697313300211400033390418684575706720,-0.2161067970762195200595812138999463059008} +#define T_2048_955 {-0.9776773578245099294292685954133048653603,-0.2101118368804696101559414955772808752954} +#define T_2048_957 {-0.9789481753190621970972529197752010077238,-0.2041089660928168680875671725516440346837} +#define T_2048_959 {-0.9801821359681174294919969725015107542276,-0.1980984107179535880227660982200177386403} +#define T_2048_961 {-0.9813791933137545608900609295233152806759,-0.1920803970498924373444538105104584246874} +#define T_2048_963 {-0.9825393022874412407574595818005036562681,-0.1860551516634466606703313118487130850554} +#define T_2048_965 {-0.9836624192117302545312895745155401527882,-0.1800229014056995147097239851063932292163} +#define T_2048_967 {-0.9847485018019042080084091139724478125572,-0.1739838733874638221355013456559390760958} +#define T_2048_969 
{-0.9857975091675674761404479795601218938828,-0.1679382949747311726262921638408442959189} +#define T_2048_971 {-0.9868094018141855272574503032956272363663,-0.1618863937801118257908683517598547041416} +#define T_2048_973 {-0.9877841416445721778316624295257497578859,-0.1558283976542652327079707674784003756940} +#define T_2048_975 {-0.9887216919603237785807436921459157019854,-0.1497645346773215091484843242142233066261} +#define T_2048_977 {-0.9896220174632008870219124219147488474846,-0.1436950331502944433470503327043843455613} +#define T_2048_979 {-0.9904850842564570934101197963173035532236,-0.1376201215864860383231160767536493949592} +#define T_2048_981 {-0.9913108598461154441494613820395898073912,-0.1315400287028831161073583189136115834117} +#define T_2048_983 {-0.9920993131421917965440115949604660272598,-0.1254549834115462336736612769527710042894} +#define T_2048_985 {-0.9928504144598651048880810776608996093273,-0.1193652148109913685436467289946449454874} +#define T_2048_987 {-0.9935641355205953040297117695445194840431,-0.1132709521775643463081806316949950996786} +#define T_2048_989 {-0.9942404494531879022289899694442283362150,-0.1071724249568088427331247203255770727992} +#define T_2048_991 {-0.9948793307948056163780847782618366181850,-0.1010698627548278216714550126198446378112} +#define T_2048_993 {-0.9954807554919269385607094591250643134117,-0.0949634953296390055266229524022492114455} +#define T_2048_995 {-0.9960447009012519670179131026088725775480,-0.0888535525825246003117641180324426386505} +#define T_2048_997 {-0.9965711457905548353863878219272010028362,-0.0827402645493756916383887300980859436095} +#define T_2048_999 {-0.9970600703394829622538964031264185905457,-0.0766238613920314920457954599442018661648} +#define T_2048_1001 {-0.9975114561403034540987277978274505585432,-0.0705045733896138698826305812872305978090} +#define T_2048_1003 {-0.9979252861985959954793656834226567298174,-0.0643826309298574650519242368318373337388} +#define T_2048_1005 
{-0.9983015449338928926081848658213857561350,-0.0582582645004357593809807269735756563023} +#define T_2048_1007 {-0.9986402181802652711084533621033187955618,-0.0521317046802833236607277456187148345634} +#define T_2048_1009 {-0.9989412931868568712445721757831051945686,-0.0460031821309146299325831819260201882571} +#define T_2048_1011 {-0.9992047586183638863133182894671335816383,-0.0398729275877398106620042028680472867563} +#define T_2048_1013 {-0.9994306045554617323745105750276707112789,-0.0337411718513775868433235416432580677792} +#define T_2048_1015 {-0.9996188224951786382987961587787140160799,-0.0276081457789657432055907548829054576345} +#define T_2048_1017 {-0.9997694053512152789764400040439795702696,-0.0214740802754695078724544998749479418620} +#define T_2048_1019 {-0.9998823474542125611108644989144522696733,-0.0153392062849881018887776562564795312937} +#define T_2048_1021 {-0.9999576445519638978609577861789148300886,-0.0092037547820598194364682953505507612135} +#define T_2048_1023 {-0.9999952938095761911796444110223092138767,-0.0030679567629659761432425746363605867373} +#define T_2048_1025 {-0.9999952938095761911796444110223092138767,0.0030679567629659761432425746363605867373} +#define T_2048_1029 {-0.9998823474542125611108644989144522696733,0.0153392062849881018887776562564795312937} +#define T_2048_1035 {-0.9994306045554617323745105750276707112789,0.0337411718513775868433235416432580677792} +#define T_2048_1037 {-0.9992047586183638863133182894671335816383,0.0398729275877398106620042028680472867563} +#define T_2048_1041 {-0.9986402181802652711084533621033187955618,0.0521317046802833236607277456187148345634} +#define T_2048_1045 {-0.9979252861985959954793656834226567298174,0.0643826309298574650519242368318373337388} +#define T_2048_1047 {-0.9975114561403034540987277978274505585432,0.0705045733896138698826305812872305978090} +#define T_2048_1053 {-0.9960447009012519670179131026088725775480,0.0888535525825246003117641180324426386505} +#define T_2048_1055 
{-0.9954807554919269385607094591250643134117,0.0949634953296390055266229524022492114455} +#define T_2048_1059 {-0.9942404494531879022289899694442283362150,0.1071724249568088427331247203255770727992} +#define T_2048_1065 {-0.9920993131421917965440115949604660272598,0.1254549834115462336736612769527710042894} +#define T_2048_1071 {-0.9896220174632008870219124219147488474846,0.1436950331502944433470503327043843455613} +#define T_2048_1075 {-0.9877841416445721778316624295257497578859,0.1558283976542652327079707674784003756940} +#define T_2048_1077 {-0.9868094018141855272574503032956272363663,0.1618863937801118257908683517598547041416} +#define T_2048_1083 {-0.9836624192117302545312895745155401527882,0.1800229014056995147097239851063932292163} +#define T_2048_1085 {-0.9825393022874412407574595818005036562681,0.1860551516634466606703313118487130850554} +#define T_2048_1089 {-0.9801821359681174294919969725015107542276,0.1980984107179535880227660982200177386403} +#define T_2048_1095 {-0.9763697313300211400033390418684575706720,0.2161067970762195200595812138999463059008} +#define T_2048_1101 {-0.9722264970789362692471513582859188318253,0.2340419585835434301834823145327391102910} +#define T_2048_1105 {-0.9692812353565485317119510000338777899742,0.2459550503357946227289687612937996163964} +#define T_2048_1107 {-0.9677538370934755107555247377604246139526,0.2518978181542169680895426608913112431765} +#define T_2048_1113 {-0.9629532668736838774137254404195118695498,0.2696683255729150907598068442894145846367} +#define T_2048_1115 {-0.9612804858113206396552641308517195284367,0.2755718193109581437560962058341829106212} +#define T_2048_1119 {-0.9578264130275329080177471041679382324219,0.2873474595447295110162144737842027097940} +#define T_2048_1125 {-0.9523750127197658787991940698702819645405,0.3049292297354024294797625316277844831347} +#define T_2048_1131 {-0.9466009130832835349877996122813783586025,0.3224076788010698524367114714550552889705} +#define T_2048_1135 
{-0.9425731976014468660451939285849221050739,0.3339996514420093820518786742468364536762} +#define T_2048_1137 {-0.9405060705932682951768697421357501298189,0.3397768844068268512259578528755810111761} +#define T_2048_1143 {-0.9340925504042588700670535217795986682177,0.3570309612334300330971359471732284873724} +#define T_2048_1145 {-0.9318842655816681475045015758951194584370,0.3627557243673972253716897284903097897768} +#define T_2048_1149 {-0.9273625256504011149516486511856783181429,0.3741640629714579890929826433421112596989} +#define T_2048_1155 {-0.9203182767091105942469653200532775372267,0.3911703843022538706897250904148677363992} +#define T_2048_1161 {-0.9129621904283982125605234614340588450432,0.4080441628649786878213490126654505729675} +#define T_2048_1165 {-0.9078861164876662614986457811028230935335,0.4192168883632239606562563949410105124116} +#define T_2048_1167 {-0.9052967593181188155071481560298707336187,0.4247796812091088058949139849573839455843} +#define T_2048_1173 {-0.8973245807054183176276751510158646851778,0.4413712687317166705192050812911475077271} +#define T_2048_1175 {-0.8945994856313826959492985224642325192690,0.4468688401623742145751805310283089056611} +#define T_2048_1179 {-0.8890483558546645737052926961041521281004,0.4578133035988772303603866475896211341023} +#define T_2048_1185 {-0.8804708890521607544954463264730293303728,0.4741002146505500225437401695671724155545} +#define T_2048_1191 {-0.8715950866559510901154794737522024661303,0.4902264832882911593792130133806494995952} +#define T_2048_1195 {-0.8655136240905690891977997125650290399790,0.5008853826112408258453001508314628154039} +#define T_2048_1197 {-0.8624239561110405016819413503981195390224,0.5061866453451553393705353300902061164379} +#define T_2048_1203 {-0.8529606049303636305936038297659251838923,0.5219752929371543892500540096079930663109} +#define T_2048_1205 {-0.8497417680008524376589207349752541631460,0.5271991347819013906672580560552887618542} +#define T_2048_1209 
{-0.8432082396418454361963767951237969100475,0.5375870762956455051195803207519929856062} +#define T_2048_1215 {-0.8331701647019131851124029708444140851498,0.5530167055800275788257636122580152004957} +#define T_2048_1221 {-0.8228497813758263168537609999475535005331,0.5682589526701316007262221319251693785191} +#define T_2048_1225 {-0.8158144108067337807455032816505990922451,0.5783137964116555895799365316634066402912} +#define T_2048_1227 {-0.8122505865852038819951985715306363999844,0.5833086529376982909411708533298224210739} +#define T_2048_1233 {-0.8013761717231402403882611906738020479679,0.5981607069963422729230728691618423908949} +#define T_2048_1235 {-0.7976908409433911550934226397657766938210,0.6030665985403481643700729364354629069567} +#define T_2048_1239 {-0.7902302214373100319733111973619088530540,0.6128100824294097082045595925592351704836} +#define T_2048_1245 {-0.7788165123814759782661099052347708493471,0.6272518154951440827460373839130625128746} +#define T_2048_1251 {-0.7671389119358204000675982570101041346788,0.6414810128085831619770829092885833233595} +#define T_2048_1255 {-0.7592091889783880720443676182185299694538,0.6508466849963808753543048624123912304640} +#define T_2048_1257 {-0.7552013768965365470009487580682616680861,0.6554928529996153496739452748443000018597} +#define T_2048_1263 {-0.7430079521351217186619919630174990743399,0.6692825883466361203133487833838444203138} +#define T_2048_1265 {-0.7388873244606151136082416996941901743412,0.6738290003787560378256671356211882084608} +#define T_2048_1269 {-0.7305627692278275908677187544526532292366,0.6828455463852480811226541845826432108879} +#define T_2048_1275 {-0.7178700450557317092048492668254766613245,0.6961771314914629860126638050132896751165} +#define T_2048_1281 {-0.7049340803759048812437981723633129149675,0.7092728264388656889138928818283602595329} +#define T_2048_1287 {-0.6917592583641577475006556596781592816114,0.7221281939292153451148692511196713894606} +#define T_2048_1293 
{-0.6783500431298614685715619998518377542496,0.7347388780959634990708195800834801048040} +#define T_2048_1299 {-0.6647109782033449043581185833318158984184,0.7471006059801801324482539712334983050823} +#define T_2048_1305 {-0.6508466849963808753543048624123912304640,0.7592091889783880720443676182185299694538} +#define T_2048_1311 {-0.6367618612362841989948947230004705488682,0.7710605242618138177590481063816696405411} +#define T_2048_1317 {-0.6224612793741499672250938601791858673096,0.7826505961665757293843626030138693749905} +#define T_2048_1323 {-0.6079497849677736320828103089297655969858,0.7939754775543371723145469331939239054918} +#define T_2048_1329 {-0.5932322950397997951554884821234736591578,0.8050313311429635465543697137036360800266} +#define T_2048_1335 {-0.5783137964116555895799365316634066402912,0.8158144108067337807455032816505990922451} +#define T_2048_1341 {-0.5631993440138340911715886250021867454052,0.8263210628456635342686809053702745586634} +#define T_2048_1347 {-0.5478940591731001896746988677477929741144,0.8365477272235120054233448172453790903091} +#define T_2048_1353 {-0.5324031278771980124631113540090154856443,0.8464909387740521262699644466920290142298} +#define T_2048_1359 {-0.5167317990176498732068921526661142706871,0.8561473283751944718389381705492269247770} +#define T_2048_1365 {-0.5008853826112408258453001508314628154039,0.8655136240905690891977997125650290399790} +#define T_2048_1371 {-0.4848692480007911198569559019233565777540,0.8745866522781761132065980746119748800993} +#define T_2048_1377 {-0.4686888220358279566468695520597975701094,0.8833633386657315789136646344559267163277} +#define T_2048_1383 {-0.4523495872337708889610041751438984647393,0.8918407093923427231274558835139032453299} +#define T_2048_1389 {-0.4358570799222554748020286297105485573411,0.9000158920161602793541533173993229866028} +#define T_2048_1395 {-0.4192168883632239606562563949410105124116,0.9078861164876662614986457811028230935335} +#define T_2048_1401 
{-0.4024346508594184301799145941913593560457,0.9154487160882678331574879848631098866463} +#define T_2048_1407 {-0.3855160538439188488979425528668798506260,0.9227011283338785174734653082850854843855} +#define T_2048_1413 {-0.3684668299533723212491054255224298685789,0.9296408958431812141753880496253259480000} +#define T_2048_1419 {-0.3512927560855671482720197218441171571612,0.9362656671702782595900771411834284663200} +#define T_2048_1425 {-0.3339996514420093820518786742468364536762,0.9425731976014468660451939285849221050739} +#define T_2048_1431 {-0.3165933755561658458077545219566673040390,0.9485613499157302674902325634320732206106} +#define T_2048_1437 {-0.2990798263080404750802188118541380390525,0.9542280951091056673263324228173587471247} +#define T_2048_1443 {-0.2814649379257579964175306486140470951796,0.9595715130819845173348880962294060736895} +#define T_2048_1449 {-0.2637546789748314024492970020219217985868,0.9645897932898127580259028945874888449907} +#define T_2048_1455 {-0.2459550503357946227289687612937996163964,0.9692812353565485317119510000338777899742} +#define T_2048_1461 {-0.2280720831708857310182736455317353829741,0.9736442496508119770481926025240682065487} +#define T_2048_1467 {-0.2101118368804696101559414955772808752954,0.9776773578245099294292685954133048653603} +#define T_2048_1473 {-0.1920803970498924373444538105104584246874,0.9813791933137545608900609295233152806759} +#define T_2048_1479 {-0.1739838733874638221355013456559390760958,0.9847485018019042080084091139724478125572} +#define T_2048_1485 {-0.1558283976542652327079707674784003756940,0.9877841416445721778316624295257497578859} +#define T_2048_1491 {-0.1376201215864860383231160767536493949592,0.9904850842564570934101197963173035532236} +#define T_2048_1497 {-0.1193652148109913685436467289946449454874,0.9928504144598651048880810776608996093273} +#define T_2048_1503 {-0.1010698627548278216714550126198446378112,0.9948793307948056163780847782618366181850} +#define T_2048_1509 
{-0.0827402645493756916383887300980859436095,0.9965711457905548353863878219272010028362} +#define T_2048_1515 {-0.0643826309298574650519242368318373337388,0.9979252861985959954793656834226567298174} +#define T_2048_1521 {-0.0460031821309146299325831819260201882571,0.9989412931868568712445721757831051945686} +#define T_2048_1527 {-0.0276081457789657432055907548829054576345,0.9996188224951786382987961587787140160799} +#define T_2048_1533 {-0.0092037547820598194364682953505507612135,0.9999576445519638978609577861789148300886} +// Pre-computed twiddles for N=2187 +#define T_2187_1 {0.9999958730246369942307183009688742458820,-0.0028729660099040786602753172473967424594} +#define T_2187_2 {0.9999834921326118397644222568487748503685,-0.0057459083064882740879419920076998096192} +#define T_2187_4 {0.9999339690754668330541221621388103812933,-0.0114916269075918536790581470086181070656} +#define T_2187_5 {0.9998968273191078948158860839612316340208,-0.0143643557872330171476527027607517084107} +#define T_2187_7 {0.9997977848836454572989396183402277529240,-0.0201094341480766991558937206718837842345} +#define T_2187_8 {0.9997358850220334769431929089478217065334,-0.0229817362096855111464055454462140914984} +#define T_2187_10 {0.9995873305656355345405472689890302717686,-0.0287257475562743211372485063748172251508} +#define T_2187_11 {0.9995006771970108561120582635339815169573,-0.0315974094304676961231237442007113713771} +#define T_2187_13 {0.9993026217550673440115360790514387190342,-0.0373399270680699682523062676864356035367} +#define T_2187_14 {0.9991912213164882983207348843279760330915,-0.0402107354330212413140444027703779283911} +#define T_2187_16 {0.9989436796015768171841386902087833732367,-0.0459513327778657137967499579644936602563} +#define T_2187_17 {0.9988075403684415487504111297312192618847,-0.0488210743751512771249068123324832413346} +#define T_2187_19 {0.9985105307692350162795946744154207408428,-0.0545593249861159035329905009348294697702} +#define T_2187_20 
{0.9983496628546664197756399516947567462921,-0.0574277866365571557083491427420085528865} +#define T_2187_22 {0.9980032074345668302584044795366935431957,-0.0631632642468473587404886870899645145983} +#define T_2187_23 {0.9978176227886617066786811847123317420483,-0.0660302328663467247293894502035982441157} +#define T_2187_25 {0.9974217472841611087375213173800148069859,-0.0717625114151607124135523463337449356914} +#define T_2187_26 {0.9972114596931024310677571520500350743532,-0.0746277740305306552803799036155396606773} +#define T_2187_28 {0.9967661935118711236114563689625356346369,-0.0803564276947093814040812276289216242731} +#define T_2187_29 {0.9965312185969036340438265142438467592001,-0.0832197714594930942899608794505184050649} +#define T_2187_31 {0.9960365948156058024665071570780128240585,-0.0889443746851526079177929773322830442339} +#define T_2187_32 {0.9957769500318757183165985225059557706118,-0.0918055868954354425959252239408669993281} +#define T_2187_34 {0.9952330053937118448104115486785303801298,-0.0975257144295791911225990133971208706498} +#define T_2187_35 {0.9949487100289704510913679769146256148815,-0.1003845825397893104513613593553600367159} +#define T_2187_37 {0.9943554849409484974742667873215395957232,-0.1060998094618981757442099933541612699628} +#define T_2187_38 {0.9940465601141187379496955145441461354494,-0.1089561211005955487785001878364710137248} +#define T_2187_40 {0.9934040986440521026068495302752126008272,-0.1146660228541930975909579615290567744523} +#define T_2187_41 {0.9930705673036605007908406150818336755037,-0.1175195658398455744730881633586250245571} +#define T_2187_43 {0.9923789171768944150642255408456549048424,-0.1232237182640363026830954140677931718528} +#define T_2187_44 {0.9920208040993658826778300863225013017654,-0.1260742806207815203123345781932584941387} +#define T_2187_46 {0.9912800166952320246593899355502799153328,-0.1317722599817597040061656343823415227234} +#define T_2187_47 
{0.9908973484830494449226989672752097249031,-0.1346196299551517816528445337098673917353} +#define T_2187_49 {0.9901074788310497698518020115443505346775,-0.1403110129776786174637948079180205240846} +#define T_2187_50 {0.9897002839107778005001136989449150860310,-0.1431549790504183239381319481253740377724} +#define T_2187_52 {0.9888613906864961444753703290189150720835,-0.1488393429492649577827734219681588001549} +#define T_2187_53 {0.9884296993066696845886554001481272280216,-0.1516796938569123232021240710309939458966} +#define T_2187_55 {0.9875418448274129179509372988832183182240,-0.1573566163682665608458677297676331363618} +#define T_2187_56 {0.9870856890562899055296952610660810023546,-0.1601931411149347395106445901546976529062} +#define T_2187_58 {0.9861489392764595240947755883098579943180,-0.1658622005277689548385922080342425033450} +#define T_2187_59 {0.9856683529996386194937940672389231622219,-0.1686946884017979653158647579402895644307} +#define T_2187_61 {0.9846827775058303311439544813765678554773,-0.1743554635891959858629007840136182494462} +#define T_2187_62 {0.9841777964237343745423913787817582488060,-0.1771837041788055511215560500204446725547} +#define T_2187_64 {0.9831434684295693449129771579464431852102,-0.1828357746292460783710254190737032331526} +#define T_2187_65 {0.9826141300547924783970188400417100638151,-0.1856595578381660671674069362779846414924} +#define T_2187_67 {0.9815311263954785703234051652543712407351,-0.1913025036867605221946320170900435186923} +#define T_2187_68 {0.9809774700500003552505745574308093637228,-0.1941216197498379925079348140570800751448} +#define T_2187_70 {0.9798458711766244721985685828258283436298,-0.1997550218095201501888880102342227473855} +#define T_2187_71 {0.9792679379888882262861216076998971402645,-0.2025692613083018012165581467343145050108} +#define T_2187_73 {0.9780878279624398707881027803523465991020,-0.2081927011009672146002458248403854668140} +#define T_2187_74 
{0.9774856608642977784384697770292405039072,-0.2110018549792559983124817790667293593287} +#define T_2187_76 {0.9762571273494247137136881065089255571365,-0.2166149147668487151552341174465254880488} +#define T_2187_77 {0.9756307710729488213985405309358611702919,-0.2194187743462336914745947069604881107807} +#define T_2187_79 {0.9743539053314446141129678835568483918905,-0.2250210371617779592234853680565720424056} +#define T_2187_80 {0.9737034064056033777490029024193063378334,-0.2278193941571359515396721917568356730044} +#define T_2187_82 {0.9723783032896283762269717954040970653296,-0.2334104438357106625634429519777768291533} +#define T_2187_83 {0.9717037100368306479225566363311372697353,-0.2362030903706787698936864217102993279696} +#define T_2187_85 {0.9703304679818656186540692942799068987370,-0.2417825115803323710039762772794347256422} +#define T_2187_86 {0.9696318305143708515814182646863628178835,-0.2445692402027500877981225357871153391898} +#define T_2187_88 {0.9682105515319048283373604135704226791859,-0.2501366184753535670814983404852682724595} +#define T_2187_89 {0.9674879217481004989309667507768608629704,-0.2529172221726733726931968249118654057384} +#define T_2187_91 {0.9660187114180527334639236869406886398792,-0.2584721439347090754523605937720276415348} +#define T_2187_92 {0.9652721429985990919675487020867876708508,-0.2612464161493744385644788508216151967645} +#define T_2187_94 {0.9637551104614759944766433363838586956263,-0.2667884687526583809003000169468577951193} +#define T_2187_95 {0.9629846588653184769057702396821696311235,-0.2695562033974479021480874507687985897064} +#define T_2187_97 {0.9614199168141062123993378918385133147240,-0.2750849751497835837810157499916385859251} +#define T_2187_98 {0.9606256392743554028967878366529475897551,-0.2778459666231201663499916776345344260335} +#define T_2187_100 {0.9590133039461485875420976299210451543331,-0.2833610468188809128342597887240117415786} +#define T_2187_101 
{0.9581952594658289523721350633422844111919,-0.2861150900201049895876792561466572806239} +#define T_2187_103 {0.9565354506331955608544603819609619677067,-0.2916160689707435471618168776331003755331} +#define T_2187_104 {0.9556936999808621768792704642692115157843,-0.2943629593153489487633578391978517174721} +#define T_2187_106 {0.9539865409429466591717527990113012492657,-0.2998494283798311399458214054902782663703} +#define T_2187_107 {0.9531211466481711580556179796985816210508,-0.3025889618146626602879223355557769536972} +#define T_2187_109 {0.9513667642215349884438069238967727869749,-0.3080605134298232683498497408436378464103} +#define T_2187_110 {0.9504777905702603835180752867017872631550,-0.3107924864482349280869755148160038515925} +#define T_2187_112 {0.9486763150794612631244717704248614609241,-0.3162487141590537009783190569578437134624} +#define T_2187_113 {0.9477638281092265470917368475056719034910,-0.3189729238160260993417693953233538195491} +#define T_2187_115 {0.9459153933771375921679691600729711353779,-0.3244134223058212640466990706045180559158} +#define T_2187_116 {0.9449794608721723276900661403487902134657,-0.3271296662330375748517496958811534568667} +#define T_2187_118 {0.9430842042100405775428839660889934748411,-0.3325540313535748637718825193587690591812} +#define T_2187_119 {0.9421248956962291476457949102041311562061,-0.3352621077744535327269659319426864385605} +#define T_2187_121 {0.9401829578934759457098380153183825314045,-0.3406699365759684461352208018070086836815} +#define T_2187_122 {0.9392003446331931293400430149631574749947,-0.3433696443206523674085417496826266869903} +#define T_2187_124 {0.9372118699469550451297550353046972304583,-0.3487605350817832849941169115481898188591} +#define T_2187_125 {0.9362060249337719186613071542524266988039,-0.3514516736020836806808631536114262416959} +#define T_2187_127 {0.9341711610781849861595560469140764325857,-0.3568252258597138237838919394562253728509} +#define T_2187_128 
{0.9331421590314470382310219065402634441853,-0.3595075952440081046290742960991337895393} +#define T_2187_130 {0.9310610571666733115137049026088789105415,-0.3648634098230139621854561937652761116624} +#define T_2187_131 {0.9300089745259497719942487492517102509737,-0.3675368108110959597389921782450983300805} +#define T_2187_133 {0.9278817892469488626261409081052988767624,-0.3728744898540001240228036749613238498569} +#define T_2187_134 {0.9268067041663539118445669373613782227039,-0.3755387238518824721822397805226501077414} +#define T_2187_136 {0.9246335934913986198679936023836489766836,-0.3808578708484084973662220363621599972248} +#define T_2187_137 {0.9235355858337865875284933281363919377327,-0.3835127399430749983721966600569430738688} +#define T_2187_139 {0.9213167111927244024016658840992022305727,-0.3888129597596022279937244547909358516335} +#define T_2187_140 {0.9201958625237562916510114519041962921619,-0.3914582667337097587889616079337429255247} +#define T_2187_142 {0.9179313887460178733590510091744363307953,-0.3967391656426259571865955422254046425223} +#define T_2187_143 {0.9167877823281026516966107919870410114527,-0.3993747139891551389823121098743285983801} +#define T_2187_145 {0.9144778776304572920352597975579556077719,-0.4046358996981044842122798854688880965114} +#define T_2187_146 {0.9133115984165666167982067236152943223715,-0.4072614936349569503271084158768644556403} +#define T_2187_148 {0.9109564343906260130978580491500906646252,-0.4125025753159815566917245632794220000505} +#define T_2187_149 {0.9097675690179838348115026747109368443489,-0.4151180198005237631520003560581244528294} +#define T_2187_151 {0.9073673206174556193914781943021807819605,-0.4203386081190965128939751593861728906631} +#define T_2187_152 {0.9061559574011017748063068211195059120655,-0.4229437088626484264608507146476767957211} +#define T_2187_154 {0.9037108029287931332262928663112688809633,-0.4281434160065946681328341583139263093472} +#define T_2187_155 
{0.9024770318550230374654574916348792612553,-0.4307379794888625546001037491805618628860} +#define T_2187_157 {0.8999871529495953037525168838328681886196,-0.4359164191971687807303226236399495974183} +#define T_2187_158 {0.8987310656692755195251720579108223319054,-0.4385002526806209277587811357079772278666} +#define T_2187_160 {0.8961966472917510806439622683683410286903,-0.4436570402721281558555688206979539245367} +#define T_2187_161 {0.8949183371135107645244488594471476972103,-0.4462299518163125311431826958141755312681} +#define T_2187_163 {0.8923395675335332732913684594677761197090,-0.4513647042182925561704109895799774676561} +#define T_2187_164 {0.8910391294168331643987812640261836349964,-0.4539265026940948466460667987121269106865} +#define T_2187_166 {0.8884162001986819490184643655084073543549,-0.4590388384707067004342206928413361310959} +#define T_2187_167 {0.8870937307467594568066715510212816298008,-0.4615893335745486214527488755265949293971} +#define T_2187_169 {0.8844268367351190152092499374703038483858,-0.4666788729551736847334098001738311722875} +#define T_2187_170 {0.8830824341878128480587406556878704577684,-0.4692178752231495608704392452636966481805} +#define T_2187_172 {0.8803717734932990923724105414294172078371,-0.4742842401306028299323713781632250174880} +#define T_2187_173 {0.8790055377197512065379214618587866425514,-0.4768115609525540032898049958021147176623} +#define T_2187_175 {0.8762513117041941246299074919079430401325,-0.4818543750311697904109564660757314413786} +#define T_2187_176 {0.8748633441954307699006676557473838329315,-0.4843698266646954686542869694676483049989} +#define T_2187_178 {0.8720657574569171677225654093490447849035,-0.4893887153082847607521443933364935219288} +#define T_2187_179 {0.8706561613183099179735791040002368390560,-0.4918921108926886942569467464636545628309} +#define T_2187_181 {0.8678154216759838002204219264967832714319,-0.4968867012723668930007647759339306503534} +#define T_2187_182 
{0.8663843016195901247655797305924352258444,-0.4993778548425382157738283694925485178828} +#define T_2187_184 {0.8635006200982157098522407068230677396059,-0.5043477759344209276903825411864090710878} +#define T_2187_185 {0.8620480824350003068445857934420928359032,-0.5068265024346485514428195529035292565823} +#define T_2187_187 {0.8591216732492856777980705373920500278473,-0.5117713850474131520584819554642308503389} +#define T_2187_188 {0.8576478258812235688779423981031868606806,-0.5142375003451327142300897321547381579876} +#define T_2187_190 {0.8546789064199074026362268341472372412682,-0.5191569771474434658031782419129740446806} +#define T_2187_191 {0.8531838588319681226934676487871911376715,-0.5216102980469157213150310781202279031277} +#define T_2187_193 {0.8501726496416712741677201847778633236885,-0.5265040035947116114911636941542383283377} +#define T_2187_194 {0.8486565128936859325747832372144330292940,-0.5289443478506308249365019946708343923092} +#define T_2187_196 {0.8456032376625279844972737919306382536888,-0.5338119186142720185017651601810939610004} +#define T_2187_197 {0.8440661243809388647463265442638657987118,-0.5362391049453046898420893739967141300440} +#define T_2187_199 {0.8409710099219216417054667545016855001450,-0.5410801793365777045963227465108502656221} +#define T_2187_200 {0.8394130342914158937617230549221858382225,-0.5434940274388285752493743530067149549723} +#define T_2187_202 {0.8362763105255744955357499748060945421457,-0.5483082458378070178639518417185172438622} +#define T_2187_203 {0.8346975882806021429516363241418730467558,-0.5507085763982130233173961642023641616106} +#define T_2187_205 {0.8315194882199247183862667043285910040140,-0.5554955811799715537091515216161496937275} +#define T_2187_206 {0.8299201366361010911987250437960028648376,-0.5578822158896225014146352805255446583033} +#define T_2187_208 {0.8267008963662204612532491410092916339636,-0.5626416514508036925690248608589172363281} +#define T_2187_209 
{0.8250810342516141648872007863246835768223,-0.5650144130181873336482567538041621446609} +#define T_2187_211 {0.8218208929142694074698738404549658298492,-0.5697459258034170970219634000386577099562} +#define T_2187_212 {0.8201806406005770488931716499791946262121,-0.5721046379675898130301447963574901223183} +#define T_2187_214 {0.8168798403758492643333966043428517878056,-0.5768078764957418336223327059997245669365} +#define T_2187_215 {0.8152193197094562693294506061647552996874,-0.5791523640394220517890744304168038070202} +#define T_2187_217 {0.8118781057977778603529372958291787654161,-0.5838269789297272360784063494065776467323} +#define T_2187_218 {0.8101974401307073803124580990697722882032,-0.5861570676923113509815266297664493322372} +#define T_2187_220 {0.8068160607346474000323155451042111963034,-0.5908027116903118436397335244691930711269} +#define T_2187_221 {0.8051153749153968641749656853789929300547,-0.5931182285808116461112149409018456935883} +#define T_2187_223 {0.8016940812212235423217521201877389103174,-0.5977345565841565289133541227784007787704} +#define T_2187_224 {0.7999735015854902986376373519306071102619,-0.6000353295940576980882497082347981631756} +#define T_2187_226 {0.7965125477445111901175778257311321794987,-0.6046219986781391497743243235163390636444} +#define T_2187_227 {0.7947722021058077901400906739581841975451,-0.6069078568941788098811684903921559453011} +#define T_2187_229 {0.7912718452154904325013262678112369030714,-0.6114645263376061734561517369002103805542} +#define T_2187_230 {0.7895118628556496709336443018401041626930,-0.6137352999544687381927587921381928026676} +#define T_2187_232 {0.7859723629405228617628154097474180161953,-0.6182616312643790523750908505462575703859} +#define T_2187_233 {0.7841928746000946803818010266695637255907,-0.6205171515973110230035558743111323565245} +#define T_2187_235 {0.7806144945924325950770139570522587746382,-0.6250128085345140194206692285661119967699} +#define T_2187_236 
{0.7788156324609710745576762747077737003565,-0.6272529080318540728455900534754619002342} +#define T_2187_238 {0.7751986381812614457231802589376457035542,-0.6317175566358099736419262626441195607185} +#define T_2187_239 {0.7733805358875058821865877689560875296593,-0.6339420688914360058063834912900347262621} +#define T_2187_241 {0.7697251960247032398498845395806711167097,-0.6383753775050633461063398499391041696072} +#define T_2187_242 {0.7678879886266511967107817326905205845833,-0.6405841372707538061703758103249128907919} +#define T_2187_244 {0.7641945747182173898082169216650072485209,-0.6449857765650672813961818974348716437817} +#define T_2187_245 {0.7623383986930919453683941355848219245672,-0.6471786197627764636308711487799882888794} +#define T_2187_247 {0.7586071851048249436999526551517192274332,-0.6515482627613505828279016895976383239031} +#define T_2187_248 {0.7567321783389363565319740700942929834127,-0.6537250264953975431581056909635663032532} +#define T_2187_250 {0.7529634422445888874975139515299815684557,-0.6580623485986564213945371193403843790293} +#define T_2187_251 {0.7510697440230919008641308209917042404413,-0.6602228711678255201888987357961013913155} +#define T_2187_253 {0.7472637653837815863155924489547032862902,-0.6645275501771560344721478941210079938173} +#define T_2187_254 {0.7453515163803283716248415657901205122471,-0.6666716710867089945580232779320795089006} +#define T_2187_256 {0.7415085779237409191466667834902182221413,-0.6709433872283946387327091542829293757677} +#define T_2187_257 {0.7395779201900313237771911190066020935774,-0.6730709472019927863684074509365018457174} +#define T_2187_259 {0.7356983073894177715956743668357376009226,-0.6773093831509687801073482660285662859678} +#define T_2187_260 {0.7337493843446467600699634203920140862465,-0.6794202241425051358447717575472779572010} +#define T_2187_262 {0.7298333853976173291044915458769537508488,-0.6836250650459307909301287509151734411716} +#define T_2187_263 
{0.7278663418178212829445783427217975258827,-0.6857190302512703450332764987251721322536} +#define T_2187_265 {0.7239142476249363911122713943768758326769,-0.6898899637519176897271222514973487704992} +#define T_2187_266 {0.7219292296322382673778861317259725183249,-0.6919668976205458621464572388504166156054} +#define T_2187_268 {0.7179413337753990376199908496346324682236,-0.6961036138800035244500463704753201454878} +#define T_2187_269 {0.7159384888271538294191032036906108260155,-0.6981633621265808109512818191433325409889} +#define T_2187_271 {0.7119150875477933126944662944879382848740,-0.7022655538482702741731600326602347195148} +#define T_2187_272 {0.7098945644256339226885188509186264127493,-0.7043079634640938557765821315115317702293} +#define T_2187_274 {0.7058359566027107012686769849096890538931,-0.7083753259160967541419040571781806647778} +#define T_2187_275 {0.7037979054014957824847442680038511753082,-0.7104002451804666273815769272914621978998} +#define T_2187_277 {0.6997043925292916188851677361526526510715,-0.7144324762181621935042130644433200359344} +#define T_2187_278 {0.6976489646459559379465531492314767092466,-0.7164397547096515994624610357277560979128} +#define T_2187_280 {0.6935208508116794678954875053022988140583,-0.7204365547981613771000297674618195742369} +#define T_2187_281 {0.6914481989339869016930606449022889137268,-0.7224260434057908630833821916894521564245} +#define T_2187_283 {0.6872857907951843703386884953943081200123,-0.7263871156422297969967871722474228590727} +#define T_2187_284 {0.6851960688903859786336170145659707486629,-0.7283586665765441336972685348882805556059} +#define T_2187_286 {0.6809996756521610183909842817229218780994,-0.7322837167120757051463897369103506207466} +#define T_2187_287 {0.6788930389555574151927430648356676101685,-0.7342371835161224380428279800980817526579} +#define T_2187_289 {0.6746629723476017526095915854966733604670,-0.7381259199778172908068540891690645366907} +#define T_2187_290 
{0.6725395773510111085968787847377825528383,-0.7400611575380262596723923707031644880772} +#define T_2187_292 {0.6682761516044478655729221827641595155001,-0.7439132914505218741041403518465813249350} +#define T_2187_293 {0.6661361560445813179143215165822766721249,-0.7458301560074849234638350026216357946396} +#define T_2187_295 {0.6618396878686220174969889740168582648039,-0.7496454012144442291543100509443320333958} +#define T_2187_296 {0.6596832507153659319598659749317448586226,-0.7515437503735944435589999557123519480228} +#define T_2187_298 {0.6553540592737839842740754647820722311735,-0.7553218234589634816344982937152963131666} +#define T_2187_299 {0.6531813407183909570008495393267367035151,-0.7572015162011532796171309200872201472521} +#define T_2187_301 {0.6488197476058121804243228325503878295422,-0.7609421365102146950221140286885201931000} +#define T_2187_302 {0.6466309090490012234653249834082089364529,-0.7628030332021906723127813165774568915367} +#define T_2187_304 {0.6422372382670146206962158430542331188917,-0.7665059228624120368777994372067041695118} +#define T_2187_305 {0.6400324423069811974329468284850008785725,-0.7683478852671885572789278739946894347668} +#define T_2187_307 {0.6356070202400700974720848535071127116680,-0.7720127692088643023282656940864399075508} +#define T_2187_308 {0.6333864306604080063323181093437597155571,-0.7738356604959919504693743874668143689632} +#define T_2187_310 {0.6289295860517044589599322534922976046801,-0.7774622664726769105669745840714313089848} +#define T_2187_311 {0.6266933678092387882685443400987423956394,-0.7792659512284071388066308827546890825033} +#define T_2187_313 {0.6222054317361032094169104311731643974781,-0.7828540098371403743726659740786999464035} +#define T_2187_314 {0.6199537509486364728061857931606937199831,-0.7846383540744844564684967735956888645887} +#define T_2187_316 {0.6154350567980632069620128277165349572897,-0.7881875987758021340212621907994616776705} +#define T_2187_317 
{0.6131680807320354364975401040283031761646,-0.7899524699444847586349283119488973170519} +#define T_2187_319 {0.6086189641758871227139593429455999284983,-0.7934626370822196461674025158572476357222} +#define T_2187_320 {0.6063368612339504748476315398875158280134,-0.7952079040785252628253942930314224213362} +#define T_2187_322 {0.6017576602040227706780228800198528915644,-0.7986787328993929513387683982728049159050} +#define T_2187_323 {0.5994605999125314221842586448474321514368,-0.8004042660759048688490224776614923030138} +#define T_2187_325 {0.5948516545754494178055438169394619762897,-0.8038354987488735003964279712818097323179} +#define T_2187_326 {0.5925398075718665280575692122511100023985,-0.8055411699241047385200431563134770840406} +#define T_2187_328 {0.5879014603038157371628358305315487086773,-0.8089325515595481297381752483488526195288} +#define T_2187_329 {0.5855749983240376987936315344995819032192,-0.8106182340274633579824126172752585262060} +#define T_2187_331 {0.5809075936853302923879027730436064302921,-0.8139695126960955207096048980019986629486} +#define T_2187_332 {0.5785666895509289364696314805769361555576,-0.8156350812355235291306598810479044914246} +#define T_2187_334 {0.5738705742604082171709478643606416881084,-0.8189460079871134778883856597531121224165} +#define T_2187_335 {0.5715154018657930823366086769965477287769,-0.8205913388710491807032099131902214139700} +#define T_2187_337 {0.5667909247750773094054466128000058233738,-0.8238616677529139176172634506656322628260} +#define T_2187_338 {0.5644216590745777528681514922936912626028,-0.8254866387577094455352266777481418102980} +#define T_2187_340 {0.5596691711421457604558327147969976067543,-0.8287161268329846786073744624445680528879} +#define T_2187_341 {0.5572859881370144652379394756280817091465,-0.8303206172474288937479514061124064028263} +#define T_2187_343 {0.5525058424021346281662658839195501059294,-0.8335090246131157121212140737043228000402} +#define T_2187_344 
{0.5501089191274732836944849623250775039196,-0.8350929152474011463169745184131897985935} +#define T_2187_346 {0.5453014706839777181457407095876988023520,-0.8382400050521873202669098645856138318777} +#define T_2187_347 {0.5428909851955860954575427967938594520092,-0.8398031782467638706179968721698969602585} +#define T_2187_349 {0.5380565911654929811547276585770305246115,-0.8429087167086182219577494834084063768387} +#define T_2187_350 {0.5356327225266416247606571232608985155821,-0.8444510563429343807939631005865521728992} +#define T_2187_352 {0.5307717420336256486379511443374212831259,-0.8475148127664733355146609028452076017857} +#define T_2187_353 {0.5283346703017546275304994196631014347076,-0.8490362042676014020514685398666188120842} +#define T_2187_355 {0.5234474644444703228529647276445757597685,-0.8520579510612262819080342524102889001369} +#define T_2187_356 {0.5209973706578132635058864252641797065735,-0.8535582814123737760425569831568282097578} +#define T_2187_358 {0.5160843024830696901261717357556335628033,-0.8565377941051774968173049273900687694550} +#define T_2187_359 {0.5136213686472057560195025871507823467255,-0.8580169518540826656405329231347423046827} +#define T_2187_361 {0.5086828031229982949312784512585494667292,-0.8609540091125245098169216362293809652328} +#define T_2187_362 {0.5062072121973312244236353762971702963114,-0.8624118843797353717306464204739313572645} +#define T_2187_364 {0.5012435161857295984333404703647829592228,-0.8653062680240830584210698361857794225216} +#define T_2187_365 {0.4987554520698973536951825735741294920444,-0.8667427525111198738372308980615343898535} +#define T_2187_367 {0.4937669942997933159034573691315017640591,-0.8695942475316564834741939193918369710445} +#define T_2187_368 {0.4912666418200062334875610758899711072445,-0.8710092345290569859628249105298891663551} +#define T_2187_370 {0.4862537928597223668703009025193750858307,-0.8738176291020536279319230743567459285259} +#define T_2187_371 
{0.4837413377550338622334891169884940609336,-0.8752110134972995725277655765239614993334} +#define T_2187_373 {0.4787044699847959328131707934517180547118,-0.8779760990007504650733949347340967506170} +#define T_2187_374 {0.4761800988933048150997251468652393668890,-0.8793477772860760488526921108132228255272} +#define T_2187_376 {0.4711195864775793440415441182267386466265,-0.8820693483151960112564893279341049492359} +#define T_2187_377 {0.4685834869225651289070810889825224876404,-0.8834192185952772780055397561227437108755} +#define T_2187_379 {0.4634997057822647370528557075886055827141,-0.8860970729777601917476204107515513896942} +#define T_2187_380 {0.4609520661582576228632035508780973032117,-0.8874250349772837553885551642451900988817} +#define T_2187_382 {0.4558453939428155909929785138956503942609,-0.8900589737883211061131305541493929922581} +#define T_2187_383 {0.4532864035016014869761136196757433936000,-0.8913649288594337471991480015276465564966} +#define T_2187_385 {0.4481572195609179187769655072770547121763,-0.8939547564364916931722859771980438381433} +#define T_2187_386 {0.4455870683974799129067889680300140753388,-0.8952386075661279418724802781071048229933} +#define T_2187_388 {0.4404357537537417210948831325367791578174,-0.8977841315234823538204977921850513666868} +#define T_2187_389 {0.4378546327921387648629547584278043359518,-0.8990457833405711696173057134728878736496} +#define T_2187_391 {0.4326815701115155343714491209539119154215,-0.9015468145835986435443487607699353247881} +#define T_2187_392 {0.4300896710906996767143084525741869583726,-0.9027861733661486365321025004959665238857} +#define T_2187_394 {0.4248952446549170147704899136442691087723,-0.9052425261053737015615183736372273415327} +#define T_2187_395 {0.4222927601144900178198327012069057673216,-0.9064594997874344528554502176120877265930} +#define T_2187_397 {0.4170773557922832219801989594998303800821,-0.9088709915523308646712052905058953911066} +#define T_2187_398 
{0.4144644790581937798812361961608985438943,-0.9100654897308325663729533516743686050177} +#define T_2187_400 {0.4092284842766436558925136068864958360791,-0.9124319413833783531941890032612718641758} +#define T_2187_401 {0.4066054094468259938466303538007196038961,-0.9136038753248472144008474060683511197567} +#define T_2187_403 {0.4013492131625785996895672269602073356509,-0.9159251110728316991327346840989775955677} +#define T_2187_404 {0.3987161350925340630446669365483103320003,-0.9170743937199817841232629689329769462347} +#define T_2187_406 {0.3934401277629070992070126067119417712092,-0.9193502411300642496172486062278039753437} +#define T_2187_407 {0.3907972420512292321959080254600849002600,-0.9204767871082643049263083412370178848505} +#define T_2187_409 {0.3855018156052067990202658620546571910381,-0.9227070771187837472382398118497803807259} +#define T_2187_410 {0.3828493185790511899035948317759903147817,-0.9238108027423992396620633371639996767044} +#define T_2187_412 {0.3775348663881692989896521339687751606107,-0.9259953696759326557952363145886920392513} +#define T_2187_413 {0.3748729550886695238709478417149512097239,-0.9270761929545425772403177688829600811005} +#define T_2187_415 {0.3695398719377944729558294056914746761322,-0.9292148745302124535072607613983564078808} +#define T_2187_416 {0.3668687441054248044025598574080504477024,-0.9302727151746998934811472281580790877342} +#define T_2187_418 {0.3615174261634265251430520038411486893892,-0.9323653525202293401719089160906150937080} +#define T_2187_419 {0.3588372802233126268589558094390667974949,-0.9334001319487457148937892270623706281185} +#define T_2187_421 {0.3534681250136352259616501214622985571623,-0.9354465696122601370277038768108468502760} +#define T_2187_422 {0.3507791600608142768003006040089530870318,-0.9364582109560627420918876850919332355261} +#define T_2187_424 {0.3453925664319458799234041634917957708240,-0.9384582969176381572751211024296935647726} +#define T_2187_425 
{0.3426949822165768488879677988734329119325,-0.9394467250268000446666860625555273145437} +#define T_2187_427 {0.3372913503124211342942828650848241522908,-0.9414003107097551614756980598031077533960} +#define T_2187_428 {0.3345853472249459836795892897498561069369,-0.9423654521587484511613297399890143424273} +#define T_2187_430 {0.3291650784550974595532579769496805965900,-0.9442723924406820623644875922764185816050} +#define T_2187_431 {0.3264508575113553856539283515303395688534,-0.9452141755338314688117407058598473668098} +#define T_2187_433 {0.3210143545212805760158403245441149920225,-0.9470743287574030500053368086810223758221} +#define T_2187_434 {0.3182921173475755649562302096455823630095,-0.9479926835342122881655768651398830115795} +#define T_2187_436 {0.3128397839887023246241426477354252710938,-0.9498059115176648026235284305585082620382} +#define T_2187_437 {0.3101097328068250225108215545333223417401,-0.9507007697580135419102020932768937200308} +#define T_2187_439 {0.3046419741065427566617529464565450325608,-0.9524669378054382296028279597521759569645} +#define T_2187_440 {0.3019043117187485969488136561267310753465,-0.9533382330346500399542719605960883200169} +#define T_2187_442 {0.2964215338503202179509798952494747936726,-0.9550572099459924135800292788189835846424} +#define T_2187_443 {0.2936764636242643056185386285505956038833,-0.9559048774397723713391883393342141062021} +#define T_2187_445 {0.2881790738766539239357200585800455883145,-0.9575765355205786422132518964644987136126} +#define T_2187_446 {0.2854267997302835646600271957140648737550,-0.9584005123098215950250278183375485241413} +#define T_2187_448 {0.2799152064779009685402400009479606524110,-0.9600247273807243075793849129695445299149} +#define T_2187_449 {0.2771559328643078967679969082382740452886,-0.9608249522561927991048946751106996089220} +#define T_2187_451 {0.2716305455366722632071230236761039122939,-0.9624016036621353409330481554206926375628} +#define T_2187_452 
{0.2688644774289045136228537558054085820913,-0.9631780171790063072023713175440207123756} +#define T_2187_454 {0.2633257064802299596273371662391582503915,-0.9647069877982058505594409325567539781332} +#define T_2187_455 {0.2605530493560656579710155256179859861732,-0.9654595322804864210297637328039854764938} +#define T_2187_457 {0.2550013062347706305210692789842141792178,-0.9669407085331348516987759467156138271093} +#define T_2187_458 {0.2522222660614539813117573885392630472779,-0.9676693280779468109287222432612907141447} +#define T_2187_460 {0.2466579631795965121821012644431903026998,-0.9691025999346472019624343374744057655334} +#define T_2187_461 {0.2438727463985374543931072821578709408641,-0.9698072404163802229248858566279523074627} +#define T_2187_463 {0.2382962971011793884557050660077948123217,-0.9711925014063208516645886447804514318705} +#define T_2187_464 {0.2355051106126180848754358976293588057160,-0.9718731104806525022965502103033941239119} +#define T_2187_466 {0.2299169291471194753739837324246764183044,-0.9732102576995160791994976534624584019184} +#define T_2187_467 {0.2271199802947566626087905206077266484499,-0.9738667848073007116127541848982218652964} +#define T_2187_469 {0.2215204817800034697850009024477913044393,-0.9751557189249084878213125193724408745766} +#define T_2187_470 {0.2187179783355979179049199956352822482586,-0.9757881152959329007501310115912929177284} +#define T_2187_472 {0.2131075787311647040667139663128182291985,-0.9770287405636235433803449268452823162079} +#define T_2187_473 {0.2102997288790989516282792237689136527479,-0.9776369592202298619554312608670443296432} +#define T_2187_475 {0.2046788449543494037285995545971672981977,-0.9788291834779719868819825023820158094168} +#define T_2187_476 {0.2018658572761645730864188408304471522570,-0.9794131792385475376860881624452304095030} +#define T_2187_478 {0.1962349065792916014139279923256253823638,-0.9805569139217858998236465595255140215158} +#define T_2187_479 
{0.1934169900381930429222876455241930671036,-0.9811166434041191930504055562778376042843} +#define T_2187_481 {0.1877763908652012592170876814634539186954,-0.9822118035503539790198601622250862419605} +#define T_2187_482 {0.1849537547905356626998241154069546610117,-0.9827472251748576859142758621601387858391} +#define T_2187_484 {0.1793039261541681250733404340280685573816,-0.9837937294299555768262166566273663192987} +#define T_2187_485 {0.1764767802258733753184571924066403880715,-0.9843048034227551701391689675801899284124} +#define T_2187_487 {0.1708181418244852090015939438671921379864,-0.9853025740469929516507363587152212858200} +#define T_2187_488 {0.1679866960575143453038293728241114877164,-0.9857892624428814531967191214789636433125} +#define T_2187_490 {0.1623196682438953486471433507176698185503,-0.9867382253167208405741916976694483309984} +#define T_2187_491 {0.1594841329726155720880598209987510927022,-0.9872004919619788987361630461236927658319} +#define T_2187_493 {0.1538091367227643058157582345302216708660,-0.9881005765915724659009811148280277848244} +#define T_2187_494 {0.1509697225853322277711043852832517586648,-0.9885383871466538741046292670944239944220} +#define T_2187_496 {0.1452871794671836691570376842719269916415,-0.9893895266690820866628541807585861533880} +#define T_2187_497 {0.1424440973898981055434376230550697073340,-0.9898028486111651869094885114463977515697} +#define T_2187_499 {0.1367544295320075320443464761410723440349,-0.9906049797994027628078583802562206983566} +#define T_2187_500 {0.1339078907136406482170087883787346072495,-0.9909937824248059579090863735473249107599} +#define T_2187_502 {0.1282115207738256101865914615700603462756,-0.9917468456924191100299026402353774756193} +#define T_2187_503 {0.1253617366699342206004530453355982899666,-0.9921111001188819278340247365122195333242} +#define T_2187_505 {0.1196590878038774202751781672304787207395,-0.9928150395244543791051228254218585789204} +#define T_2187_506 
{0.1168062701110947620986024730882490985096,-0.9931547186932829784922205362818203866482} +#define T_2187_508 {0.1110977659409099066456505511268915142864,-0.9938094819445720817796541268762666732073} +#define T_2187_509 {0.1082421265812198440947611288720509037375,-0.9941245606226490894030689560167957097292} +#define T_2187_511 {0.1025281911639827486792952981886628549546,-0.9947300990804700537850635555514600127935} +#define T_2187_512 {0.0996699422689771019623350412075524218380,-0.9950205538621293976930815006198827177286} +#define T_2187_514 {0.0939510000652245685914820683137804735452,-0.9955768225439682872490720910718664526939} +#define T_2187_515 {0.0910903539603448442196764744949177838862,-0.9958426318527335840968817137763835489750} +#define T_2187_517 {0.0853668298025435645648428817366948351264,-0.9963495894360892002339369355468079447746} +#define T_2187_518 {0.0825039989913083776640334576768509577960,-0.9965907335262766952865831626695580780506} +#define T_2187_520 {0.0767763180522961635743328656644735019654,-0.9970483423517300103355864848708733916283} +#define T_2187_521 {0.0739115152005153791536784524396352935582,-0.9972648033099152931058029025734867900610} +#define T_2187_523 {0.0681801029619171494733365079810027964413,-0.9976730293839271013212055549956858158112} +#define T_2187_524 {0.0653135408818939222630461927110445685685,-0.9978647911302759299090325839642900973558} +#define T_2187_526 {0.0595788231025147635433469872623390983790,-0.9982236041277120497383634756261017173529} +#define T_2187_527 {0.0567107147372368017301802467500237980857,-0.9983906524171747287610401144775096327066} +#define T_2187_529 {0.0509731174214343371597912835113675100729,-0.9987000256835586453618702762469183653593} +#define T_2187_530 {0.0481036758287553753432597147821070393547,-0.9988423481069282905409068007429596036673} +#define T_2187_532 {0.0423636251947940578599371974632958881557,-0.9991022586604211275229658895113971084356} +#define T_2187_533 
{0.0394930635316068090467922502284636721015,-0.9992198446452554838614901200344320386648} +#define T_2187_535 {0.0337509859799961786652744422099203802645,-0.9994302731783634152762374469602946192026} +#define T_2187_536 {0.0308795174863979796087143370186822721735,-0.9995231139897702288266145842499099671841} +#define T_2187_538 {0.0251358395682174558249943885357424733229,-0.9996840448707785542481474294618237763643} +#define T_2187_539 {0.0222636775516696604204636145141193992458,-0.9997521336120644974698734586127102375031} +#define T_2187_541 {0.0165188259368821283024075796674878802150,-0.9998635548861989352786849849508143961430} +#define T_2187_542 {0.0136461837563645500825959544499710318632,-0.9999068864993817529196462601248640567064} +#define T_2187_544 {0.0079005852021211200875150382216816069558,-0.9999687898896965077000231758574955165386} +#define T_2187_545 {0.0050276762522826262333253133363086817553,-0.9999873611558809383126344982883892953396} +#define T_2187_547 {-0.0007182424287790828898100925492542501161,-0.9999997420638734313413920062885154038668} +#define T_2187_548 {-0.0035912047334726291715478740940170609974,-0.9999935516034902382997984204848762601614} +#define T_2187_550 {-0.0093370167049343314596976384223125933204,-0.9999564091094430562378647664445452392101} +#define T_2187_551 {-0.0122098189460535938577478987099311780185,-0.9999254573823513903008119996229652315378} +#define T_2187_553 {-0.0179550973794239399072836249615647830069,-0.9998387942454000087977306066022720187902} +#define T_2187_554 {-0.0208275261504299244452731443288939772174,-0.9997830835508536573286164639284834265709} +#define T_2187_556 {-0.0265718442568515234025738891432411037385,-0.9996469062087821599860149035521317273378} +#define T_2187_557 {-0.0294436861789485262996812764413334662095,-0.9995664406852581285178871439711656421423} +#define T_2187_559 {-0.0351866172409020100397825103755167219788,-0.9993807592540203676989563064125832170248} +#define T_2187_560 
{-0.0380576589788884783738254213858454022557,-0.9992755448789121253128087118966504931450} +#define T_2187_562 {-0.0437987763818912939628447134055022615939,-0.9990403731518807672884463499940466135740} +#define T_2187_563 {-0.0466688046600074735170693429608945734799,-0.9989104177410534912695538878324441611767} +#define T_2187_565 {-0.0524076819243049774055265288552618585527,-0.9986257731879950583220306725706905126572} +#define T_2187_566 {-0.0552764835420761352358098861259350087494,-0.9984710863952058756964902386243920773268} +#define T_2187_568 {-0.0610126943543227232003012261429830687121,-0.9981369901609828954036629511392675340176} +#define T_2187_569 {-0.0638800562023967566371851489748223684728,-0.9979575834771629017083682811062317341566} +#define T_2187_571 {-0.0696131744473246216742268188681919127703,-0.9975740603801636074976499912736471742392} +#define T_2187_572 {-0.0724788835233029504756530059239594265819,-0.9973699471325649934527746154344640672207} +#define T_2187_574 {-0.0782084833153760816681909773251391015947,-0.9969370256628592441572322968568187206984} +#define T_2187_575 {-0.0810723267396365188108831034696777351201,-0.9967082210140646436613565128936897963285} +#define T_2187_577 {-0.0867979824546877276603140671795699745417,-0.9962259333312879494570779570494778454304} +#define T_2187_578 {-0.0896597474861983295690848194681166205555,-0.9959724542780845624179164587985724210739} +#define T_2187_580 {-0.0953810337930467433409376099007204174995,-0.9954408362090484407858070881047751754522} +#define T_2187_581 {-0.0982405078451692725938215744463377632201,-0.9951627015811651544296978499914985150099} +#define T_2187_583 {-0.1039569997372161436199888839837512932718,-0.9945817926171967027215714551857672631741} +#define T_2187_584 {-0.1068139703934980894173278898051648866385,-0.9942790230759054326270529600151348859072} +#define T_2187_586 {-0.1125252432202985264364514250701176933944,-0.9936488663699126755446400238724891096354} +#define T_2187_587 
{-0.1153794982502525240386503924128192011267,-0.9933214844064935933332094464276451617479} +#define T_2187_589 {-0.1210851277490606336950662580420612357557,-0.9926421267697607131452741668908856809139} +#define T_2187_590 {-0.1239364551239300060725412322426564060152,-0.9922901567038312498070240508241113275290} +#define T_2187_592 {-0.1296360174512153629056143699926906265318,-0.9915616486025405906801211131096351891756} +#define T_2187_593 {-0.1324842053597247437668471548022353090346,-0.9911851165802492147349767037667334079742} +#define T_2187_595 {-0.1381772771226576490555260079418076202273,-0.9904075121317327257131069018214475363493} +#define T_2187_596 {-0.1410221139867473549855958481202833354473,-0.9900064461238162749623370473273098468781} +#define T_2187_598 {-0.1467082722746506362465623851676355116069,-0.9891798030925352813724771294801030308008} +#define T_2187_599 {-0.1495495467651938858999471904098754748702,-0.9887542328922416245973181503359228372574} +#define T_2187_601 {-0.1552283691809588916932227675715694203973,-0.9878786126854958160592445892689283937216} +#define T_2187_602 {-0.1580658702334602760952719791021081618965,-0.9874285699063699572874952536949422210455} +#define T_2187_604 {-0.1637369349249247207911395207702298648655,-0.9865040375697359253948093282815534621477} +#define T_2187_605 {-0.1665704517551992724921916533276089467108,-0.9860295556432722152706560336810071021318} +#define T_2187_607 {-0.1722333374464845301421433987343334592879,-0.9850561798557712078761028351436834782362} +#define T_2187_608 {-0.1750626595663157814009025514678796753287,-0.9845572940289296637317306704062502831221} +#define T_2187_610 {-0.1807169455891214915332909640710568055511,-0.9835351470979257770821391204663086682558} +#define T_2187_611 {-0.1835418628218977166177694471116410568357,-0.9830118944305138439787583592988085001707} +#define T_2187_613 {-0.1891871291467511484452046488513587974012,-0.9819410522863429857665096278651617467403} +#define T_2187_614 
{-0.1920074316430782912501484815948060713708,-0.9813934716482622944155878030869644135237} +#define T_2187_616 {-0.1976432589105363013537441929656779393554,-0.9802740138385913626350998129055369645357} +#define T_2187_617 {-0.2004587371638266168893949270568555220962,-0.9797021459069505944228239968651905655861} +#define T_2187_619 {-0.2060847067156278689115112001672969199717,-0.9785341555908683153219840278325136750937} +#define T_2187_620 {-0.2088951515776633072185575201729079708457,-0.9779380428469612862585336188203655183315} +#define T_2187_622 {-0.2145108454878282833178104738180991262197,-0.9767216067888009334296839369926601648331} +#define T_2187_623 {-0.2173160481842969782295682534822844900191,-0.9761012935149508962240361142903566360474} +#define T_2187_625 {-0.2229210492901737561410868693201337009668,-0.9748365020778447798122101630724500864744} +#define T_2187_626 {-0.2257208014361788694923660614222171716392,-0.9741920343541149440724780106393154710531} +#define T_2187_628 {-0.2313146933694320561691881721344543620944,-0.9728789814932818913462142518255859613419} +#define T_2187_629 {-0.2341087869849717006953682130188099108636,-0.9722104071940523839501224756531883031130} +#define T_2187_631 {-0.2396911542025124131072288946597836911678,-0.9708491904498182112348558803205378353596} +#define T_2187_632 {-0.2424793817279295160549423826523707248271,-0.9701565592402295878926565819710958749056} +#define T_2187_634 {-0.2480498095427841054316786539857275784016,-0.9687472797307812300005025463178753852844} +#define T_2187_635 {-0.2508319638541849361246249827672727406025,-0.9680306430630452041441458277404308319092} +#define T_2187_637 {-0.2563900384663000409091182518750429153442,-0.9665734054769193894784962139965500682592} +#define T_2187_638 {-0.2591659128909403753127094205410685390234,-0.9658328165864967784770556136209052056074} +#define T_2187_640 {-0.2647112214179221933996188909077318385243,-0.9643277291748028057227770659665111452341} +#define T_2187_641 
{-0.2674806097495602830171890218480257317424,-0.9635632430764486944241298260749317705631} +#define T_2187_643 {-0.2730127402573453987422169575438601896167,-0.9620104176448274202471111493650823831558} +#define T_2187_644 {-0.2757754367715598009525024281174410134554,-0.9612220911285047630912004024139605462551} +#define T_2187_646 {-0.2812939783050159570088055716041708365083,-0.9596216430288226906242243785527534782887} +#define T_2187_647 {-0.2840497777744878660222127564338734373450,-0.9588095346554831310825761647720355540514} +#define T_2187_649 {-0.2895543203879418214796714892145246267319,-0.9571615827772639306658675195649266242981} +#define T_2187_650 {-0.2923030180977003733566732535109622403979,-0.9563257528744980584534118861483875662088} +#define T_2187_652 {-0.2977931528853907106046960961975855752826,-0.9546304196360905214291392439918126910925} +#define T_2187_653 {-0.3005345446480204296690885712450835853815,-0.9537709302936460131761009506590198725462} +#define T_2187_655 {-0.3060098637744731453480540039890911430120,-0.9520283416331303261159746398334391415119} +#define T_2187_656 {-0.3087437459452818666605367070587817579508,-0.9511452566983000789235802585608325898647} +#define T_2187_658 {-0.3142038426756065816469742912886431440711,-0.9493555420641324182895459671271964907646} +#define T_2187_659 {-0.3169300121677531834052388148847967386246,-0.9484489271370113439019178258604370057583} +#define T_2187_661 {-0.3223744808978574738489442097488790750504,-0.9466122194784075682960633457696530967951} +#define T_2187_662 {-0.3250927351974384760247005488054128363729,-0.9456821419070194911782323288207408040762} +#define T_2187_664 {-0.3305211714841581605028864032647106796503,-0.9437985776640795965164443259709514677525} +#define T_2187_665 {-0.3332313086652511913143825950101017951965,-0.9428451065393742558384815311001148074865} +#define T_2187_667 {-0.3386433092563945757014209902990842238069,-0.9409148256329463722025252536695916205645} +#define T_2187_668 
{-0.3413451279960587059214560667896876111627,-0.9399380317836673048859097434615250676870} +#define T_2187_670 {-0.3467402908603621769501046401273924857378,-0.9379611776049536775445858438615687191486} +#define T_2187_671 {-0.3494335904535930681369393369095632806420,-0.9369611335923762052146912537864409387112} +#define T_2187_673 {-0.3548115148105860372496067611791659146547,-0.9349378529922817149255820368125569075346} +#define T_2187_674 {-0.3574960951852254043004109007597435265779,-0.9339146331048230331717263652535621076822} +#define T_2187_676 {-0.3628563815350022703221100073278648778796,-0.9318450763830460337189265374036040157080} +#define T_2187_677 {-0.3655320432666003260813170072651701048017,-0.9307987566307468485504728050727862864733} +#define T_2187_679 {-0.3708742934194975693351636891748057678342,-0.9286830775246140978751441252825316041708} +#define T_2187_680 {-0.3735408377461271745012538758601294830441,-0.9276137356334916983513494415092281997204} +#define T_2187_682 {-0.3788646548523026957866477459901943802834,-0.9254520913065383824758214359462726861238} +#define T_2187_683 {-0.3815218836893245479835456990258535370231,-0.9243598067128132589331812596356030553579} +#define T_2187_685 {-0.3868268722682379756605541842873208224773,-0.9221523577431075535670856879733037203550} +#define T_2187_686 {-0.3894745882230152278502544049842981621623,-0.9210372115873021181542412705312017351389} +#define T_2187_688 {-0.3947603541928060844057313261146191507578,-0.9187841219555169525179394440783653408289} +#define T_2187_689 {-0.3973983605793678375306399175315164029598,-0.9176461970764281383949878545536193996668} +#define T_2187_691 {-0.4026645112861299558026928480103379115462,-0.9153476341536597171710809561773203313351} +#define T_2187_692 {-0.4052926121397819048119970375410048291087,-0.9141870150822053453509852261049672961235} +#define T_2187_694 {-0.4105387563867318734267541913141030818224,-0.9118431496175400940984445696813054382801} +#define T_2187_695 
{-0.4131567564786137736199123082769801840186,-0.9106599225704793409974513451743405312300} +#define T_2187_697 {-0.4183825045551509691499347809440223500133,-0.9082709286783096080952759621141012758017} +#define T_2187_698 {-0.4209902094067392019915985201805597171187,-0.9070651815518385729930628258443903177977} +#define T_2187_700 {-0.4261951731173955759679472521384013816714,-0.9046312366989287534480013164284173399210} +#define T_2187_701 {-0.4287923890149498151735940609796671196818,-0.9034030590621507927906463919498492032290} +#define T_2187_703 {-0.4339761817082270489720485784346237778664,-0.9009243440544538739089830414741300046444} +#define T_2187_704 {-0.4365627157171805827751143169734859839082,-0.8996738271427262567669913551071658730507} +#define T_2187_706 {-0.4417249523142724454416452317673247307539,-0.8971505261119527840918408401194028556347} +#define T_2187_707 {-0.4443006122935642121518640124122612178326,-0.8958777628201092246840175903344061225653} +#define T_2187_709 {-0.4494409093169621782770661866379668936133,-0.8933100632100492433096405875403434038162} +#define T_2187_710 {-0.4520055039333100710408075428858865052462,-0.8920151480854988657043236344179604202509} +#define T_2187_712 {-0.4571234795352902557930008242692565545440,-0.8894032406380971700343707198044285178185} +#define T_2187_713 {-0.4596768182774041422433697334781754761934,-0.8880862698738013483179543072765227407217} +#define T_2187_715 {-0.4647720922683927224916544673760654404759,-0.8854303486149888158251997083425521850586} +#define T_2187_716 {-0.4673139854611269017325980712485034018755,-0.8840914200423153346264371066354215145111} +#define T_2187_718 {-0.4723861793379424134364796827867394313216,-0.8813916822675951223686752200592309236526} +#define T_2187_719 {-0.4749164381563854564483051490242360159755,-0.8800308953490508789840873760113026946783} +#define T_2187_721 {-0.4799651751303563584905020888982107862830,-0.8772875416088423694560560761601664125919} +#define T_2187_722 
{-0.4824836116138579988898982264799997210503,-0.8759049974306847285987487339298240840435} +#define T_2187_724 {-0.4875085166388121171721081736905034631491,-0.8731182315154260020761967098223976790905} +#define T_2187_725 {-0.4900149437049460821036461766198044642806,-0.8717140327801534693819007770798634737730} +#define T_2187_727 {-0.4950156435050710457268507980188587680459,-0.8688840617051624137801013603166211396456} +#define T_2187_728 {-0.4975098749635323280848808735754573717713,-0.8674583127238855162488562200451269745827} +#define T_2187_730 {-0.5024859980611046106346861961355898529291,-0.8645853467139814618747095664730295538902} +#define T_2187_731 {-0.5049678486275408495487226900877431035042,-0.8631381533986740572927942594105843454599} +#define T_2187_733 {-0.5099190253705208064616272167768329381943,-0.8602224058725616018250548222567886114120} +#define T_2187_734 {-0.5123883106802957776437779102707281708717,-0.8587538757281929502340744875255040824413} +#define T_2187_736 {-0.5173141732697874584090413918602280318737,-0.8557955632826079739317037820001132786274} +#define T_2187_737 {-0.5197707098916770629415395887917838990688,-0.8543058053991569034124609061109367758036} +#define T_2187_739 {-0.5246708924092501336033933512226212769747,-0.8513051477927765509079449657292570918798} +#define T_2187_740 {-0.5271144978590684981867298120050691068172,-0.8497942728371273846121880524151492863894} +#define T_2187_742 {-0.5319886362939404422789380078029353171587,-0.8467514929742453455574491272273007780313} +#define T_2187_743 {-0.5344191290480956313402316482097376137972,-0.8452196131819675883889431133866310119629} +#define T_2187_745 {-0.5392668613241726749407689567306078970432,-0.8421349370959356761545677727553993463516} +#define T_2187_746 {-0.5416840608331512374462590742041356861591,-0.8405821662629459067872517152864020317793} +#define T_2187_748 {-0.5465050268359249452387871315295342355967,-0.8374558230993829344157575178542174398899} +#define T_2187_749 
{-0.5489087535377041859874225337989628314972,-0.8358822765734921222957609643344767391682} +#define T_2187_751 {-0.5537025951410026181065404671244323253632,-0.8327144985732618520657410954299848526716} +#define T_2187_752 {-0.5560926704743892612370359529450070112944,-0.8311202932456061009958148133591748774052} +#define T_2187_754 {-0.5608590315669804704512557691487018018961,-0.8279113157275655998645902400312479585409} +#define T_2187_755 {-0.5632352779848753820957085736154112964869,-0.8262965700239232047508153300441335886717} +#define T_2187_757 {-0.5679738044969212529267110767250414937735,-0.8230466313674413836309895486920140683651} +#define T_2187_758 {-0.5703360454795080025647280308476183563471,-0.8214114652394356452802526291634421795607} +#define T_2187_760 {-0.5750463854088665449637574056396260857582,-0.8181208068666859789530576563265640288591} +#define T_2187_761 {-0.5773944454767246936555125103041063994169,-0.8164653417828742210105019694310612976551} +#define T_2187_763 {-0.5820762489150981267016504716593772172928,-0.8131342081409002053860035630350466817617} +#define T_2187_764 {-0.5844099536422385776646137856005225330591,-0.8114585670777507697692954025114886462688} +#define T_2187_766 {-0.5890628728011666481734209810383617877960,-0.8080872056203071140956240014929790049791} +#define T_2187_767 {-0.5913820488279888376581538977916352450848,-0.8063915130530642239037319995986763387918} +#define T_2187_769 {-0.5960057380646842650762096127436961978674,-0.8029801742222349991706664695811923593283} +#define T_2187_770 {-0.5983102131108544163851092889672145247459,-0.8012645561156713780448512807197403162718} +#define T_2187_772 {-0.6029043289538791317028199046035297214985,-0.7978134933232657877155702408344950526953} +#define T_2187_773 {-0.6051939318311279070172759020351804792881,-0.7960780771223260332547511097800452262163} +#define T_2187_775 {-0.6097581330059075321869954677822533994913,-0.7925875467310535826825912408821750432253} +#define T_2187_776 
{-0.6120326936307467491360512212850153446198,-0.7908324613513866285785525178653188049793} +#define T_2187_778 {-0.6165666410849226508616993669420480728149,-0.7873027226558131363987058648490346968174} +#define T_2187_779 {-0.6188259904912802866761012410279363393784,-0.7855280984741958016925877927860710769892} +#define T_2187_781 {-0.6233293474198957628829020904959179461002,-0.7819594136814818075009725362178869545460} +#define T_2187_782 {-0.6255733177716673587553941615624353289604,-0.7801653825261336550056512351147830486298} +#define T_2187_784 {-0.6300457496421875136505263981234747916460,-0.7765580167365564445702830198570154607296} +#define T_2187_785 {-0.6322741742457038682800884998869150876999,-0.7747447118773469476593618310289457440376} +#define T_2187_787 {-0.6367153488228664004466850201424676924944,-0.7710989330646071948649478144943714141846} +#define T_2187_788 {-0.6389280621392764425436894271115306764841,-0.7692664892031556567175698546634521335363} +#define T_2187_790 {-0.6433376495097715697113471833290532231331,-0.7655825681944712357562821125611662864685} +#define T_2187_791 {-0.6455344871673397433298191572248470038176,-0.7637311214541407933253935880202334374189} +#define T_2187_793 {-0.6499121597643181535985945629363413900137,-0.7600093319101282052230317276553250849247} +#define T_2187_794 {-0.6520929585706343178941324367769993841648,-0.7581390198259135848601886209507938474417} +#define T_2187_796 {-0.6564383911980401498098558477067854255438,-0.7543796382202589967391759273596107959747} +#define T_2187_797 {-0.6586029891521428814016303476819302886724,-0.7524905997285695757881285317125730216503} +#define T_2187_799 {-0.6629158590088712887933297679410316050053,-0.7486939053274911382018785843683872371912} +#define T_2187_800 {-0.6650640953132815891279960851534269750118,-0.7467862807558306448285634360217954963446} +#define T_2187_802 {-0.6693440820171580041275660732935648411512,-0.7429525555973329753456368962361011654139} +#define T_2187_803 
{-0.6714757970898246330904157730401493608952,-0.7410264866538743833146440920245368033648} +#define T_2187_805 {-0.6757225827014042840445995352638419717550,-0.7371560155267973257764424488414078950882} +#define T_2187_806 {-0.6778376181875580552826932034804485738277,-0.7352116452898568299545445370313245803118} +#define T_2187_808 {-0.6820508872337441852451433987880591303110,-0.7313047157127200437187752868339885026217} +#define T_2187_809 {-0.6841490860176615562693314132047817111015,-0.7293421886201280068817709434370044618845} +#define T_2187_811 {-0.6883285255151405657159102702280506491661,-0.7253990908197724962747088284231722354889} +#define T_2187_812 {-0.6904097317318146354025998334691394120455,-0.7234185526581439207305379568424541503191} +#define T_2187_814 {-0.6945550312103059287238693286781199276447,-0.7194395795481723920872241251345258206129} +#define T_2187_815 {-0.6966190902570253973280500758846756070852,-0.7174411774420773602045642292068805545568} +#define T_2187_817 {-0.7007299417823446008313226229802239686251,-0.7134266246010951828537827168474905192852} +#define T_2187_818 {-0.7027767003301782500201966286113020032644,-0.7114105070021293775184290097968187183142} +#define T_2187_820 {-0.7068527985271123581512142663996201008558,-0.7073606726517877030246950198488775640726} +#define T_2187_821 {-0.7088821045322990510584304502117447555065,-0.7053269893275448954028661319171078503132} +#define T_2187_823 {-0.7129231466072910583520183536165859550238,-0.7012421743103868232438458107935730367899} +#define T_2187_824 {-0.7149348493225342604517891231807880103588,-0.6991910763333333278524150955490767955780} +#define T_2187_826 {-0.7189405350861767240999711248150561004877,-0.6950715840904460041116408319794572889805} +#define T_2187_827 {-0.7209344850718429897895589419931638985872,-0.6930032238266981003960154339438304305077} +#define T_2187_829 {-0.7249045169611768590911538012733217328787,-0.6888493603751714156047114556713495403528} +#define T_2187_830 
{-0.7268805660963968406917956599500030279160,-0.6867638914731769572696862269367557018995} +#define T_2187_832 {-0.7308146491970166636065187049098312854767,-0.6825759653833713969106611330062150955200} +#define T_2187_833 {-0.7327726506906884207381835949490778148174,-0.6804735427624958310488523238745983690023} +#define T_2187_835 {-0.7366704927586497086977601611579302698374,-0.6762518651351198117893659400579053908587} +#define T_2187_836 {-0.7386103011603428747378075058804824948311,-0.6741326449741382731417616014368832111359} +#define T_2187_838 {-0.7424716126438716257140981724660377949476,-0.6698775294171380734198351092345546931028} +#define T_2187_839 {-0.7443930838546325423621397021634038537741,-0.6677416691426334427461597442743368446827} +#define T_2187_841 {-0.7482175779156345907239256121101789176464,-0.6634534317478969489556561711651738733053} +#define T_2187_842 {-0.7501205691986903012491438857978209853172,-0.6613010900225652077821791863243561238050} +#define T_2187_844 {-0.7539079617340592731622450628492515534163,-0.6569800493424410303688887324824463576078} +#define T_2187_845 {-0.7557923317254211514892858758685179054737,-0.6548113860533053554036087007261812686920} +#define T_2187_847 {-0.7595423413881428054139632877195253968239,-0.6504578630769389802068758399400394409895} +#define T_2187_848 {-0.7614079501071077116236551773909013718367,-0.6482730393234722443551731885236222296953} +#define T_2187_850 {-0.7651202983271602198200866951083298772573,-0.6438873574529612175965098685992415994406} +#define T_2187_851 {-0.7669670071867086269534752318577375262976,-0.6416865355351187849564098542032297700644} +#define T_2187_853 {-0.7706414181917558003931389976060017943382,-0.6372690205614889302765391221328172832727} +#define T_2187_854 {-0.7724690900088472256257432491111103445292,-0.6350523639676523002250974059279542416334} +#define T_2187_856 {-0.7761052908447246823087084521830547600985,-0.6306033440466560779924520829808898270130} +#define T_2187_857 
{-0.7779137898504882020489503702265210449696,-0.6283710174414878224524727556854486465454} +#define T_2187_859 {-0.7815115104014794811249089434568304568529,-0.6238908230692276069007107253128197044134} +#define T_2187_860 {-0.7833007022512991079921107484551612287760,-0.6216429922814393771446361824928317219019} +#define T_2187_862 {-0.7868596752602015076405450599850155413151,-0.6171319562698162064506846036238130182028} +#define T_2187_863 {-0.7886294270436963183001921606773976236582,-0.6148687882798502535308671212987974286079} +#define T_2187_865 {-0.7921493881316739038567220632103271782398,-0.6103272457318407173687546674045734107494} +#define T_2187_866 {-0.7938995683825712523784545737726148217916,-0.6080489086594655923079244530526921153069} +#define T_2187_868 {-0.7973802560687943685735490362276323139668,-0.6034771969442290773244508272910024970770} +#define T_2187_869 {-0.7991107347746956302003695782332215458155,-0.6011838600360509543563125589571427553892} +#define T_2187_871 {-0.8025518904957653631981884245760738849640,-0.5965823187638676916577651354600675404072} +#define T_2187_872 {-0.8042625391078040975045837512880098074675,-0.5942741523807575365623279139981605112553} +#define T_2187_874 {-0.8076639072369591332289928686805069446564,-0.5896431233778010039259243058040738105774} +#define T_2187_875 {-0.8093545986793505564449446865182835608721,-0.5873202989822396968833118080510757863522} +#define T_2187_877 {-0.8127159265454567682596120903326664119959,-0.5826601262651835977379732867120765149593} +#define T_2187_878 {-0.8143865352249370914705650648102164268494,-0.5803228164085246776338067320466507226229} +#define T_2187_880 {-0.8177075731312573037001811826485209167004,-0.5756338461589879385016388368967454880476} +#define T_2187_881 {-0.8193579749464140471459927539399359375238,-0.5732822244686395229962272424018010497093} +#define T_2187_883 {-0.8226384761891566421709853784705046564341,-0.5685648050074699755285223545797634869814} +#define T_2187_884 
{-0.8242685485396469280416908986808266490698,-0.5661990461739967450682797789340838789940} +#define T_2187_886 {-0.8275082694262922977657126466510817408562,-0.5614535279353962682336032230523414909840} +#define T_2187_887 {-0.8291178912219508978509452390426304191351,-0.5590738076995427352500200868234969675541} +#define T_2187_889 {-0.8323165910893542962512015037646051496267,-0.5543005432050347458527994604082778096199} +#define T_2187_890 {-0.8339056427591881037741927684692200273275,-0.5519070383446702532381777928094379603863} +#define T_2187_892 {-0.8370630839914572352000732280430383980274,-0.5471063821769133195260792490444146096706} +#define T_2187_893 {-0.8386314474925281592376791195420082658529,-0.5446992704939001006536614113429095596075} +#define T_2187_895 {-0.8417473955386743922346681756607722491026,-0.5398715792703467908353331949911080300808} +#define T_2187_896 {-0.8432949543648681212104634141724091023207,-0.5374510395773320903245462432096246629953} +#define T_2187_898 {-0.8463691777562305507132123238989152014256,-0.5325966719237383850682476804649922996759} +#define T_2187_899 {-0.8478958169469108518967459531268104910851,-0.5301628840308708623396682924067135900259} +#define T_2187_901 {-0.8509280873143505452560475532663986086845,-0.5252822005546552430743645345501136034727} +#define T_2187_902 {-0.8524336934628996553797719570866320282221,-0.5228353452562281011850586764921899884939} +#define T_2187_904 {-0.8554237855537641932457404436718206852674,-0.5179287085196838669176599978527519851923} +#define T_2187_905 {-0.8569082468160068577489596464147325605154,-0.5154689675807045956545948683924507349730} +#define T_2187_907 {-0.8598559385108635044758784715668298304081,-0.5105367420740666295486676062864717096090} +#define T_2187_908 {-0.8613191446133755535541354220185894519091,-0.5080642982167543619809180199808906763792} +#define T_2187_910 {-0.8642242169425111697478314454201608896255,-0.5031068503311233452990336445509456098080} +#define T_2187_911 
{-0.8656660591908109658731973468093201518059,-0.5006218872213352710787148680537939071655} +#define T_2187_913 {-0.8685282963504983300140338542405515909195,-0.4956395852214599551111007258441532030702} +#define T_2187_914 {-0.8699486676371215310155093902722001075745,-0.4931422874550479562572036229539662599564} +#define T_2187_916 {-0.8727678570056501827778561164450366050005,-0.4881355014519686008611643046606332063675} +#define T_2187_917 {-0.8741666518181057110581377855851314961910,-0.4856260545410662765597464840539032593369} +#define T_2187_919 {-0.8769425839715770942817130162438843399286,-0.4805951564646209761555439854419091716409} +#define T_2187_920 {-0.8783196984001856444379541244416031986475,-0.4780737468238628884442675825994228944182} +#define T_2187_922 {-0.8810521671280692190819650022604037076235,-0.4730191103950586728466021213534986600280} +#define T_2187_923 {-0.8824074988736820834844820637954398989677,-0.4704859253277325903397354522894602268934} +#define T_2187_925 {-0.8850963011941348490552172734169289469719,-0.4654079260309834653597249598533380776644} +#define T_2187_926 {-0.8864297495757328393395368948404211550951,-0.4628631537151164376808765155146829783916} +#define T_2187_928 {-0.8890746857506770517431959888199344277382,-0.4577621687703505304334328229742823168635} +#define T_2187_929 {-0.8903861517128502933715594735986087471247,-0.4552059982447300146013446919823763892055} +#define T_2187_931 {-0.8929870252628112625714607020199764519930,-0.4500824065793677108970882727589923888445} +#define T_2187_932 {-0.8942764113831168648616198879608418792486,-0.4475150277294990819321185426815645769238} +#define T_2187_934 {-0.8968330291018187239160397439263761043549,-0.4423692099503043206887298310903133824468} +#define T_2187_935 {-0.8981002395980182129164859361480921506882,-0.4397908134943046554177215057279681786895} +#define T_2187_937 {-0.9006124115667358820402910168922971934080,-0.4346231518591116005367780417145695537329} +#define T_2187_938 
{-0.9018573523039102868281702285457868129015,-0.4320339293335427321984809623245382681489} +#define T_2187_940 {-0.9043248919055778545228463372041005641222,-0.4268448077228595982646197626309003680944} +#define T_2187_941 {-0.9055474704031206689691657629737164825201,-0.4242449514684993872037921391893178224564} +#define T_2187_943 {-0.9079701943361940807974974632088560611010,-0.4190347553569916949633977765188319608569} +#define T_2187_944 {-0.9091703197746808795542960979219060391188,-0.4164244585045462354599976606550626456738} +#define T_2187_946 {-0.9115480480667549345596967214078176766634,-0.4111935749324021061035239199554780498147} +#define T_2187_947 {-0.9127256312946900873583899738150648772717,-0.4085730313881587583146881570428377017379} +#define T_2187_949 {-0.9150581873158672996382279052340891212225,-0.4033218489323373012744866628054296597838} +#define T_2187_950 {-0.9162131408563067846984040443203411996365,-0.4006912533637604911795904172322480008006} +#define T_2187_952 {-0.9185003513323183321759302089049015194178,-0.3954201621091263940677151822455925866961} +#define T_2187_953 {-0.9196325893893677605461789426044560968876,-0.3927797099303966810168731171870604157448} +#define T_2187_955 {-0.9218742844144454107180308710667304694653,-0.3874891014407426670373979504802264273167} +#define T_2187_956 {-0.9229837228796333725711065198993310332298,-0.3848389887982402446375829185853945091367} +#define T_2187_958 {-0.9251797359291310529627594405610579997301,-0.3795292560871998399640858679049415513873} +#define T_2187_959 {-0.9262662923876572307335663936100900173187,-0.3768696798449340246150995881180278956890} +#define T_2187_961 {-0.9284164603304210228174042640603147447109,-0.3715412173467861900455488921579672023654} +#define T_2187_962 {-0.9294800540672782940276874796836636960506,-0.3688723750717711191704495377052808180451} +#define T_2187_964 {-0.9315842171777648506036939579644240438938,-0.3635255786121399101951112697861390188336} +#define T_2187_965 
{-0.9326247691837353803734345092379953712225,-0.3608476685597183375442398300947388634086} +#define T_2187_967 {-0.9346827711538769900556644643074832856655,-0.3554829353261689250942367834795732051134} +#define T_2187_968 {-0.9357002041314012030781555040448438376188,-0.3527961564252848347678082063794136047363} +#define T_2187_970 {-0.9377118920822176129092895280336961150169,-0.3474138849378181070903792715398594737053} +#define T_2187_971 {-0.9387061304511364889791025234444532543421,-0.3447184367762392009915117796481354162097} +#define T_2187_973 {-0.9406713549440908206378253453294746577740,-0.3393190268576879997652895326609723269939} +#define T_2187_974 {-0.9416423248472600704417345696128904819489,-0.3366151096671791687064967391052050516009} +#define T_2187_976 {-0.9435609398953606063997767705586738884449,-0.3311989624135071030863741725625004619360} +#define T_2187_977 {-0.9445085692041376157490617515577469021082,-0.3284867770549564913729057025193469598889} +#define T_2187_979 {-0.9463804322827819026642259814252611249685,-0.3230542948054627161447172056796262040734} +#define T_2187_980 {-0.9473046506023835577892100445751566439867,-0.3203340427539603796347478237294126302004} +#define T_2187_982 {-0.9491296226599461594020112897851504385471,-0.3148856290613914477027890370663953945041} +#define T_2187_983 {-0.9500303613346777753534411203872878104448,-0.3121575123912630478351104557077633216977} +#define T_2187_985 {-0.9518083068028398985305216228880453854799,-0.3066935719918353342450245690997689962387} +#define T_2187_986 {-0.9526854989211960278439050853194203227758,-0.3039577933616308125230887071666074916720} +#define T_2187_988 {-0.9544162857250156895005943624710198491812,-0.2984787321449641761539339768205536529422} +#define T_2187_989 {-0.9552698661246501465882374759530648589134,-0.2957354947824033519765407618251629173756} +#define T_2187_991 {-0.9569533656923739917132820664846803992987,-0.2902417197613699761937766652408754453063} +#define T_2187_992 
{-0.9577832709649407583185620751464739441872,-0.2874912274482456786550699234794592484832} +#define T_2187_994 {-0.9594193582375544204765560607484076172113,-0.2819831467287347570582767275482183322310} +#define T_2187_995 {-0.9602255267334176558335911977337673306465,-0.2792256037857749895181314059300348162651} +#define T_2187_997 {-0.9618140801739365475242493630503304302692,-0.2737036265363766984748394861526321619749} +#define T_2187_998 {-0.9625964520067503693567800837627146393061,-0.2709392378080663355000012870732462033629} +#define T_2187_1000 {-0.9641373536092473495173749142850283533335,-0.2654037742296765922667134418588830158114} +#define T_2187_1001 {-0.9648958706604046087207393611606676131487,-0.2626327450690408848998913526884280145168} +#define T_2187_1003 {-0.9663890059587764147508437417855020612478,-0.2570842063643896668878596756258048117161} +#define T_2187_1004 {-0.9671236118817257976232326655008364468813,-0.2543067426177392231778640052652917802334} +#define T_2187_1006 {-0.9685688699581961325080214919580612331629,-0.2487455409608440304314314062139601446688} +#define T_2187_1007 {-0.9692795101826281456425249416497536003590,-0.2459618489524837137150115040640230290592} +#define T_2187_1009 {-0.9706767836759873091523331822827458381653,-0.2403883974580314220048649076488800346851} +#define T_2187_1010 {-0.9713634054118879257444518771080765873194,-0.2375986839749334444960027212800923734903} +#define T_2187_1012 {-0.9727125905254679905098669223662000149488,-0.2320133966675918812949674929768661968410} +#define T_2187_1013 {-0.9733751427670399580804883044038433581591,-0.2292178689440345362715589772051316685975} +#define T_2187_1015 {-0.9746761392764252684983716790156904608011,-0.2236211607276966939483742180527769960463} +#define T_2187_1016 {-0.9753145728058776331437229600851424038410,-0.2208200264298700032927769143498153425753} +#define T_2187_1018 {-0.9765672840663504050695564728812314569950,-0.2152123130568326936362666401691967621446} +#define T_2187_1019 
{-0.9771815514575538097474804999365005642176,-0.2124057802674119421748599734200979582965} +#define T_2187_1021 {-0.9783858844112728325725925060396548360586,-0.2067874783074917510727885883170529268682} +#define T_2187_1022 {-0.9789759400332835870273129330598749220371,-0.2039757555101798791596934279368724673986} +#define T_2187_1024 {-0.9801318052161975824532191836624406278133,-0.1983472823197681145224180454533779993653} +#define T_2187_1025 {-0.9806976052366465079757062994758598506451,-0.1955305783838086064463368529686704277992} +#define T_2187_1027 {-0.9818049167851393699280038163124118000269,-0.1898923520748679039105155652578105218709} +#define T_2187_1028 {-0.9823464191734883055318050537607632577419,-0.1870708762395288660140835190759389661252} +#define T_2187_1030 {-0.9834050948307582196150633535580709576607,-0.1814233156485337561392157113004941493273} +#define T_2187_1031 {-0.9839222593614223022484566172352060675621,-0.1785972775075645446740679744834778830409} +#define T_2187_1033 {-0.9849322204835914140730324106698390096426,-0.1729408021643880633000378566066501662135} +#define T_2187_1034 {-0.9854250087389271328675022232346236705780,-0.1701104116504495722406176128060906194150} +#define T_2187_1036 {-0.9863861803008843187612342262582387775183,-0.1644454417471984120080463753765798173845} +#define T_2187_1037 {-0.9868545556740432322939682308060582727194,-0.1616109091162682143139051049729459919035} +#define T_2187_1039 {-0.9877668662750170858188880629313644021749,-0.1559378654760687765712390273620258085430} +#define T_2187_1040 {-0.9882107939726653134115963439398910850286,-0.1530994012918222013652780333359260112047} +#define T_2187_1042 {-0.9890741758415280138194702885812148451805,-0.1474187053375595746196324853372061625123} +#define T_2187_1043 {-0.9894936228864309457620151988521683961153,-0.1445765204557279970387639878026675432920} +#define T_2187_1045 {-0.9903080118867326753218094381736591458321,-0.1388885941787405542413580405991524457932} +#define T_2187_1046 
{-0.9907029471202047909983434692549053579569,-0.1360428997314478971603080026397947221994} +#define T_2187_1048 {-0.9914682827549383681287054059794172644615,-0.1303481656601796490058120525645790621638} +#define T_2187_1049 {-0.9918386768391572738678974019421730190516,-0.1274991730402584566572699031894444487989} +#define T_2187_1051 {-0.9925549022552520028739309054799377918243,-0.1217980542088713535875399429642129689455} +#define T_2187_1052 {-0.9929007276754384658801200203015469014645,-0.1189459750541593668904383207518549170345} +#define T_2187_1054 {-0.9935677896679835363613619847455993294716,-0.1132388949711082698490471898367104586214} +#define T_2187_1055 {-0.9938890207344451832582876704691443592310,-0.1103839411487269467349037199710437562317} +#define T_2187_1057 {-0.9945068697506417310094661843322683125734,-0.1046713237652991401738233889773255214095} +#define T_2187_1058 {-0.9948034826006812991749939101282507181168,-0.1018137073559150507229276172438403591514} +#define T_2187_1060 {-0.9953720727435240167579877379466779530048,-0.0960959770347370040299850302289996761829} +#define T_2187_1061 {-0.9956440453432121584498304400767665356398,-0.0932359103168073910516966407158179208636} +#define T_2187_1063 {-0.9961633343748981239684781030518934130669,-0.0875134918003209194559133266011485829949} +#define T_2187_1064 {-0.9964106465207099860847961281251627951860,-0.0846511872343245347360962682614626828581} +#define T_2187_1066 {-0.9968805958657769306086038341163657605648,-0.0789245056132347189148390498303342610598} +#define T_2187_1067 {-0.9971032291860932872396006132476031780243,-0.0760601758258892118869098908362502697855} +#define T_2187_1069 {-0.9975238039342845253187874732248019427061,-0.0703296565075865048877190588427765760571} +#define T_2187_1070 {-0.9977217418907562418439738394226878881454,-0.0674635142760533906836073470003611873835} +#define T_2187_1072 {-0.9980929107996144855619036206917371600866,-0.0617295829530121395456454536088131135330} +#define T_2187_1073 
{-0.9982661386883912024714504696021322160959,-0.0588618411890906093053565939499094383791} +#define T_2187_1075 {-0.9985878741855792606330055605212692171335,-0.0531249238072464408100259447564894799143} +#define T_2187_1076 {-0.9987363791384016309393700794316828250885,-0.0502557955415571869228763546288973884657} +#define T_2187_1078 {-0.9990086573237505485067799781973008066416,-0.0445163182686655334308056808367837220430} +#define T_2187_1079 {-0.9991324283089063618135128308495040982962,-0.0416460166348257415647182710927154403180} +#define T_2187_1081 {-0.9993552289561907775450322333199437707663,-0.0359044058288039424908788532775361090899} +#define T_2187_1082 {-0.9994542567793339715720435378898400813341,-0.0330331440475946230828085958819428924471} +#define T_2187_1084 {-0.9996275633377750269303874119941610842943,-0.0272898262248498328641499455216035130434} +#define T_2187_1085 {-0.9997018406426091408079059874580707401037,-0.0244178175883766751530501437628117855638} +#define T_2187_1087 {-0.9998256402381036078708120840019546449184,-0.0186732193921220340770972256905224639922} +#define T_2187_1088 {-0.9998751615069280118675010271545033901930,-0.0158006772479710004553066227117597009055} +#define T_2187_1090 {-0.9999494449430044173965370646328665316105,-0.0100552254165323859402825235065392917022} +#define T_2187_1091 {-0.9999742064971245403270927454286720603704,-0.0071823631519211204171648788019410858396} +#define T_2187_1093 {-0.9999989682556269521285230439389124512672,-0.0014364844870368258245019177365975338034} +#define T_2187_1094 {-0.9999989682556269521285230439389124512672,0.0014364844870368258245019177365975338034} +#define T_2187_1096 {-0.9999742064971245403270927454286720603704,0.0071823631519211204171648788019410858396} +#define T_2187_1100 {-0.9998256402381036078708120840019546449184,0.0186732193921220340770972256905224639922} +#define T_2187_1102 {-0.9997018406426091408079059874580707401037,0.0244178175883766751530501437628117855638} +#define T_2187_1105 
{-0.9994542567793339715720435378898400813341,0.0330331440475946230828085958819428924471} +#define T_2187_1106 {-0.9993552289561907775450322333199437707663,0.0359044058288039424908788532775361090899} +#define T_2187_1108 {-0.9991324283089063618135128308495040982962,0.0416460166348257415647182710927154403180} +#define T_2187_1112 {-0.9985878741855792606330055605212692171335,0.0531249238072464408100259447564894799143} +#define T_2187_1114 {-0.9982661386883912024714504696021322160959,0.0588618411890906093053565939499094383791} +#define T_2187_1115 {-0.9980929107996144855619036206917371600866,0.0617295829530121395456454536088131135330} +#define T_2187_1118 {-0.9975238039342845253187874732248019427061,0.0703296565075865048877190588427765760571} +#define T_2187_1120 {-0.9971032291860932872396006132476031780243,0.0760601758258892118869098908362502697855} +#define T_2187_1124 {-0.9961633343748981239684781030518934130669,0.0875134918003209194559133266011485829949} +#define T_2187_1126 {-0.9956440453432121584498304400767665356398,0.0932359103168073910516966407158179208636} +#define T_2187_1130 {-0.9945068697506417310094661843322683125734,0.1046713237652991401738233889773255214095} +#define T_2187_1132 {-0.9938890207344451832582876704691443592310,0.1103839411487269467349037199710437562317} +#define T_2187_1135 {-0.9929007276754384658801200203015469014645,0.1189459750541593668904383207518549170345} +#define T_2187_1136 {-0.9925549022552520028739309054799377918243,0.1217980542088713535875399429642129689455} +#define T_2187_1138 {-0.9918386768391572738678974019421730190516,0.1274991730402584566572699031894444487989} +#define T_2187_1142 {-0.9903080118867326753218094381736591458321,0.1388885941787405542413580405991524457932} +#define T_2187_1144 {-0.9894936228864309457620151988521683961153,0.1445765204557279970387639878026675432920} +#define T_2187_1145 {-0.9890741758415280138194702885812148451805,0.1474187053375595746196324853372061625123} +#define T_2187_1148 
{-0.9877668662750170858188880629313644021749,0.1559378654760687765712390273620258085430} +#define T_2187_1150 {-0.9868545556740432322939682308060582727194,0.1616109091162682143139051049729459919035} +#define T_2187_1154 {-0.9849322204835914140730324106698390096426,0.1729408021643880633000378566066501662135} +#define T_2187_1156 {-0.9839222593614223022484566172352060675621,0.1785972775075645446740679744834778830409} +#define T_2187_1160 {-0.9818049167851393699280038163124118000269,0.1898923520748679039105155652578105218709} +#define T_2187_1162 {-0.9806976052366465079757062994758598506451,0.1955305783838086064463368529686704277992} +#define T_2187_1165 {-0.9789759400332835870273129330598749220371,0.2039757555101798791596934279368724673986} +#define T_2187_1166 {-0.9783858844112728325725925060396548360586,0.2067874783074917510727885883170529268682} +#define T_2187_1168 {-0.9771815514575538097474804999365005642176,0.2124057802674119421748599734200979582965} +#define T_2187_1172 {-0.9746761392764252684983716790156904608011,0.2236211607276966939483742180527769960463} +#define T_2187_1174 {-0.9733751427670399580804883044038433581591,0.2292178689440345362715589772051316685975} +#define T_2187_1175 {-0.9727125905254679905098669223662000149488,0.2320133966675918812949674929768661968410} +#define T_2187_1178 {-0.9706767836759873091523331822827458381653,0.2403883974580314220048649076488800346851} +#define T_2187_1180 {-0.9692795101826281456425249416497536003590,0.2459618489524837137150115040640230290592} +#define T_2187_1184 {-0.9663890059587764147508437417855020612478,0.2570842063643896668878596756258048117161} +#define T_2187_1186 {-0.9648958706604046087207393611606676131487,0.2626327450690408848998913526884280145168} +#define T_2187_1190 {-0.9618140801739365475242493630503304302692,0.2737036265363766984748394861526321619749} +#define T_2187_1192 {-0.9602255267334176558335911977337673306465,0.2792256037857749895181314059300348162651} +#define T_2187_1195 
{-0.9577832709649407583185620751464739441872,0.2874912274482456786550699234794592484832} +#define T_2187_1196 {-0.9569533656923739917132820664846803992987,0.2902417197613699761937766652408754453063} +#define T_2187_1198 {-0.9552698661246501465882374759530648589134,0.2957354947824033519765407618251629173756} +#define T_2187_1202 {-0.9518083068028398985305216228880453854799,0.3066935719918353342450245690997689962387} +#define T_2187_1204 {-0.9500303613346777753534411203872878104448,0.3121575123912630478351104557077633216977} +#define T_2187_1205 {-0.9491296226599461594020112897851504385471,0.3148856290613914477027890370663953945041} +#define T_2187_1208 {-0.9463804322827819026642259814252611249685,0.3230542948054627161447172056796262040734} +#define T_2187_1210 {-0.9445085692041376157490617515577469021082,0.3284867770549564913729057025193469598889} +#define T_2187_1214 {-0.9406713549440908206378253453294746577740,0.3393190268576879997652895326609723269939} +#define T_2187_1216 {-0.9387061304511364889791025234444532543421,0.3447184367762392009915117796481354162097} +#define T_2187_1220 {-0.9346827711538769900556644643074832856655,0.3554829353261689250942367834795732051134} +#define T_2187_1222 {-0.9326247691837353803734345092379953712225,0.3608476685597183375442398300947388634086} +#define T_2187_1226 {-0.9284164603304210228174042640603147447109,0.3715412173467861900455488921579672023654} +#define T_2187_1228 {-0.9262662923876572307335663936100900173187,0.3768696798449340246150995881180278956890} +#define T_2187_1232 {-0.9218742844144454107180308710667304694653,0.3874891014407426670373979504802264273167} +#define T_2187_1234 {-0.9196325893893677605461789426044560968876,0.3927797099303966810168731171870604157448} +#define T_2187_1238 {-0.9150581873158672996382279052340891212225,0.4033218489323373012744866628054296597838} +#define T_2187_1240 {-0.9127256312946900873583899738150648772717,0.4085730313881587583146881570428377017379} +#define T_2187_1244 
{-0.9079701943361940807974974632088560611010,0.4190347553569916949633977765188319608569} +#define T_2187_1246 {-0.9055474704031206689691657629737164825201,0.4242449514684993872037921391893178224564} +#define T_2187_1250 {-0.9006124115667358820402910168922971934080,0.4346231518591116005367780417145695537329} +#define T_2187_1252 {-0.8981002395980182129164859361480921506882,0.4397908134943046554177215057279681786895} +#define T_2187_1256 {-0.8929870252628112625714607020199764519930,0.4500824065793677108970882727589923888445} +#define T_2187_1258 {-0.8903861517128502933715594735986087471247,0.4552059982447300146013446919823763892055} +#define T_2187_1262 {-0.8850963011941348490552172734169289469719,0.4654079260309834653597249598533380776644} +#define T_2187_1264 {-0.8824074988736820834844820637954398989677,0.4704859253277325903397354522894602268934} +#define T_2187_1268 {-0.8769425839715770942817130162438843399286,0.4805951564646209761555439854419091716409} +#define T_2187_1270 {-0.8741666518181057110581377855851314961910,0.4856260545410662765597464840539032593369} +#define T_2187_1274 {-0.8685282963504983300140338542405515909195,0.4956395852214599551111007258441532030702} +#define T_2187_1276 {-0.8656660591908109658731973468093201518059,0.5006218872213352710787148680537939071655} +#define T_2187_1280 {-0.8598559385108635044758784715668298304081,0.5105367420740666295486676062864717096090} +#define T_2187_1282 {-0.8569082468160068577489596464147325605154,0.5154689675807045956545948683924507349730} +#define T_2187_1286 {-0.8509280873143505452560475532663986086845,0.5252822005546552430743645345501136034727} +#define T_2187_1288 {-0.8478958169469108518967459531268104910851,0.5301628840308708623396682924067135900259} +#define T_2187_1292 {-0.8417473955386743922346681756607722491026,0.5398715792703467908353331949911080300808} +#define T_2187_1294 {-0.8386314474925281592376791195420082658529,0.5446992704939001006536614113429095596075} +#define T_2187_1298 
{-0.8323165910893542962512015037646051496267,0.5543005432050347458527994604082778096199} +#define T_2187_1300 {-0.8291178912219508978509452390426304191351,0.5590738076995427352500200868234969675541} +#define T_2187_1304 {-0.8226384761891566421709853784705046564341,0.5685648050074699755285223545797634869814} +#define T_2187_1306 {-0.8193579749464140471459927539399359375238,0.5732822244686395229962272424018010497093} +#define T_2187_1310 {-0.8127159265454567682596120903326664119959,0.5826601262651835977379732867120765149593} +#define T_2187_1312 {-0.8093545986793505564449446865182835608721,0.5873202989822396968833118080510757863522} +#define T_2187_1316 {-0.8025518904957653631981884245760738849640,0.5965823187638676916577651354600675404072} +#define T_2187_1318 {-0.7991107347746956302003695782332215458155,0.6011838600360509543563125589571427553892} +#define T_2187_1322 {-0.7921493881316739038567220632103271782398,0.6103272457318407173687546674045734107494} +#define T_2187_1324 {-0.7886294270436963183001921606773976236582,0.6148687882798502535308671212987974286079} +#define T_2187_1328 {-0.7815115104014794811249089434568304568529,0.6238908230692276069007107253128197044134} +#define T_2187_1330 {-0.7779137898504882020489503702265210449696,0.6283710174414878224524727556854486465454} +#define T_2187_1334 {-0.7706414181917558003931389976060017943382,0.6372690205614889302765391221328172832727} +#define T_2187_1336 {-0.7669670071867086269534752318577375262976,0.6416865355351187849564098542032297700644} +#define T_2187_1340 {-0.7595423413881428054139632877195253968239,0.6504578630769389802068758399400394409895} +#define T_2187_1342 {-0.7557923317254211514892858758685179054737,0.6548113860533053554036087007261812686920} +#define T_2187_1346 {-0.7482175779156345907239256121101789176464,0.6634534317478969489556561711651738733053} +#define T_2187_1348 {-0.7443930838546325423621397021634038537741,0.6677416691426334427461597442743368446827} +#define T_2187_1352 
{-0.7366704927586497086977601611579302698374,0.6762518651351198117893659400579053908587} +#define T_2187_1354 {-0.7327726506906884207381835949490778148174,0.6804735427624958310488523238745983690023} +#define T_2187_1358 {-0.7249045169611768590911538012733217328787,0.6888493603751714156047114556713495403528} +#define T_2187_1360 {-0.7209344850718429897895589419931638985872,0.6930032238266981003960154339438304305077} +#define T_2187_1364 {-0.7129231466072910583520183536165859550238,0.7012421743103868232438458107935730367899} +#define T_2187_1366 {-0.7088821045322990510584304502117447555065,0.7053269893275448954028661319171078503132} +#define T_2187_1370 {-0.7007299417823446008313226229802239686251,0.7134266246010951828537827168474905192852} +#define T_2187_1372 {-0.6966190902570253973280500758846756070852,0.7174411774420773602045642292068805545568} +#define T_2187_1376 {-0.6883285255151405657159102702280506491661,0.7253990908197724962747088284231722354889} +#define T_2187_1378 {-0.6841490860176615562693314132047817111015,0.7293421886201280068817709434370044618845} +#define T_2187_1382 {-0.6757225827014042840445995352638419717550,0.7371560155267973257764424488414078950882} +#define T_2187_1384 {-0.6714757970898246330904157730401493608952,0.7410264866538743833146440920245368033648} +#define T_2187_1388 {-0.6629158590088712887933297679410316050053,0.7486939053274911382018785843683872371912} +#define T_2187_1390 {-0.6586029891521428814016303476819302886724,0.7524905997285695757881285317125730216503} +#define T_2187_1394 {-0.6499121597643181535985945629363413900137,0.7600093319101282052230317276553250849247} +#define T_2187_1396 {-0.6455344871673397433298191572248470038176,0.7637311214541407933253935880202334374189} +#define T_2187_1400 {-0.6367153488228664004466850201424676924944,0.7710989330646071948649478144943714141846} +#define T_2187_1402 {-0.6322741742457038682800884998869150876999,0.7747447118773469476593618310289457440376} +#define T_2187_1406 
{-0.6233293474198957628829020904959179461002,0.7819594136814818075009725362178869545460} +#define T_2187_1408 {-0.6188259904912802866761012410279363393784,0.7855280984741958016925877927860710769892} +#define T_2187_1412 {-0.6097581330059075321869954677822533994913,0.7925875467310535826825912408821750432253} +#define T_2187_1414 {-0.6051939318311279070172759020351804792881,0.7960780771223260332547511097800452262163} +#define T_2187_1418 {-0.5960057380646842650762096127436961978674,0.8029801742222349991706664695811923593283} +#define T_2187_1420 {-0.5913820488279888376581538977916352450848,0.8063915130530642239037319995986763387918} +#define T_2187_1424 {-0.5820762489150981267016504716593772172928,0.8131342081409002053860035630350466817617} +#define T_2187_1426 {-0.5773944454767246936555125103041063994169,0.8164653417828742210105019694310612976551} +#define T_2187_1430 {-0.5679738044969212529267110767250414937735,0.8230466313674413836309895486920140683651} +#define T_2187_1432 {-0.5632352779848753820957085736154112964869,0.8262965700239232047508153300441335886717} +#define T_2187_1436 {-0.5537025951410026181065404671244323253632,0.8327144985732618520657410954299848526716} +#define T_2187_1438 {-0.5489087535377041859874225337989628314972,0.8358822765734921222957609643344767391682} +#define T_2187_1442 {-0.5392668613241726749407689567306078970432,0.8421349370959356761545677727553993463516} +#define T_2187_1444 {-0.5344191290480956313402316482097376137972,0.8452196131819675883889431133866310119629} +#define T_2187_1448 {-0.5246708924092501336033933512226212769747,0.8513051477927765509079449657292570918798} +#define T_2187_1450 {-0.5197707098916770629415395887917838990688,0.8543058053991569034124609061109367758036} +#define T_2187_1454 {-0.5099190253705208064616272167768329381943,0.8602224058725616018250548222567886114120} +#define T_2187_1456 {-0.5049678486275408495487226900877431035042,0.8631381533986740572927942594105843454599} +// Pre-computed twiddles for N=2401 
+#define T_2401_1 {0.9999965759099859186065373251040000468493,-0.0026169005146941246887914989116552533233} +#define T_2401_2 {0.9999863036633923618623498441593255847692,-0.0052337831083824094824397121783476904966} +#define T_2401_3 {0.9999691833305656141206441134272608906031,-0.0078506298601817379317902023672104405705} +#define T_2401_4 {0.9999452150287487794955154640774708241224,-0.0104674228494544517553688223188146366738} +#define T_2401_5 {0.9999143989220811157281332270940765738487,-0.0130841441559310586728592795680015115067} +#define T_2401_6 {0.9998767352215968129414136456034611910582,-0.0157007758598329723309650063356457394548} +#define T_2401_8 {0.9997808661177813860376772936433553695679,-0.0209336987839891380813739374389115255326} +#define T_2401_9 {0.9997226613709792042428148306498769670725,-0.0235499541682451345170346002078076708131} +#define T_2401_10 {0.9996576103434136051362202124437317252159,-0.0261660482781753307712779843541284208186} +#define T_2401_11 {0.9995857134805659072185335389804095029831,-0.0287819631982962897964739568124059587717} +#define T_2401_12 {0.9995069712747985946066364704165607690811,-0.0313976810143517040541105700413027079776} +#define T_2401_13 {0.9994213842653525414760906642186455428600,-0.0340131838134350647506742859604855766520} +#define T_2401_15 {0.9992296782267557908596700144698843359947,-0.0392434727165446237129309281499445205554} +#define T_2401_16 {0.9991235605104423767031107672664802521467,-0.0418582230026108212395108409964450402185} +#define T_2401_17 {0.9990106006161163509915468239341862499714,-0.0444726866360302328096132384871452813968} +#define T_2401_18 {0.9988907993173471444592337320500519126654,-0.0470868457124852257678071509872097522020} +#define T_2401_19 {0.9987641574345557105019111077126581221819,-0.0497006823297438185549346201241860399023} +#define T_2401_20 {0.9986306758350084189501671971811447292566,-0.0523141785877822909633927395134378457442} +#define T_2401_22 
{0.9983431971889038525347359609440900385380,-0.0575400784378807747532214023067353991792} +#define T_2401_23 {0.9981892021110521096005641084047965705395,-0.0601524462420378047688984679552959278226} +#define T_2401_24 {0.9980283712538420282101014890940859913826,-0.0627644021114138334072052316514600533992} +#define T_2401_25 {0.9978607057186723583441789742209948599339,-0.0653759281588648377692152280360460281372} +#define T_2401_26 {0.9976862066537469653226821719727013260126,-0.0679870065001903084445089575638121459633} +#define T_2401_27 {0.9975048752540667251764716638717800378799,-0.0705976192542556862941083295481803361326} +#define T_2401_29 {0.9971217204643825215981678411480970680714,-0.0758173764921325210597302657333784736693} +#define T_2401_30 {0.9969198996982916449738354458531830459833,-0.0784264852301067016959734701231354847550} +#define T_2401_31 {0.9967112518452538871827073307940736413002,-0.0810350568893910688794690599934256169945} +#define T_2401_32 {0.9964957783341273911403845886525232344866,-0.0836430736060172486112662681989604607224} +#define T_2401_33 {0.9962734806405135756790514278691262006760,-0.0862505175198172713280087009479757398367} +#define T_2401_34 {0.9960443602867469214956486212031450122595,-0.0888573707745457797013699519084184430540} +#define T_2401_36 {0.9955656579216972668078255992440972477198,-0.0940692339021539231103119504950882401317} +#define T_2401_37 {0.9953160791886541680639766127569600939751,-0.0966742080832565658932153951354848686606} +#define T_2401_38 {0.9950596843519158296587079348682891577482,-0.0992785202219781809374055114858492743224} +#define T_2401_39 {0.9947964751673201710957528121070936322212,-0.1018821524835203801151450875295267906040} +#define T_2401_40 {0.9945264534373712272952161583816632628441,-0.1044850870377407037192796224189805798233} +#define T_2401_41 {0.9942496210112263810287913656793534755707,-0.1070873060592747311181582858807814773172} +#define T_2401_43 
{0.9936755317001697562773188110440969467163,-0.1122895262274487132581057835523097310215} +#define T_2401_44 {0.9933782787467250185997613698418717831373,-0.1148894917483484073850164008945284876972} +#define T_2401_45 {0.9930742229599913706294955773046240210533,-0.1174886704853252555880160912238352466375} +#define T_2401_46 {0.9927633664221975351438231882639229297638,-0.1200870446387353462913694102098816074431} +#define T_2401_47 {0.9924457112621449805800466492655687034130,-0.1226845964144446937682531029167876113206} +#define T_2401_48 {0.9921212596551934881361489715345669537783,-0.1252813080239510989954965225479099899530} +#define T_2401_50 {0.9914519760347338950268181179126258939505,-0.1304721396192362048438440069730859249830} +#define T_2401_51 {0.9911071486046004741510273561289068311453,-0.1330662240572656196402334671802236698568} +#define T_2401_52 {0.9907555338942861666851058544125407934189,-0.1356593972338370202024293575959745794535} +#define T_2401_53 {0.9903971343117118353305272648867685347795,-0.1382516413904336383122739562168135307729} +#define T_2401_54 {0.9900319523112622865923526660481002181768,-0.1408429387749008110386483849651995114982} +#define T_2401_55 {0.9896599903937697284561636479338631033897,-0.1434332716415675779142446799596655182540} +#define T_2401_57 {0.9888957370431165072588441944390069693327,-0.1486109728719632594451383056366466917098} +#define T_2401_58 {0.9885034508437005040804024247336201369762,-0.1511983057778620020972226711819530464709} +#define T_2401_59 {0.9881043951946945069764183244842570275068,-0.1537846032505428561787397256921394728124} +#define T_2401_60 {0.9876985728289037202998201792070176452398,-0.1563698475785750552358166487465496174991} +#define T_2401_61 {0.9872859865254725031391558331961277872324,-0.1589540210577400636271505618424271233380} +#define T_2401_62 {0.9868666391098662726832912994723301380873,-0.1615371059911526741004195173445623368025} +#define T_2401_64 
{0.9860076724754767152347767478204332292080,-0.1666999394705737380650845125273917801678} +#define T_2401_65 {0.9855680591390515443706021869729738682508,-0.1692796526605690743405574494317988865077} +#define T_2401_66 {0.9851216964551273713723844593914691358805,-0.1718582065930278901522143542024423368275} +#define T_2401_67 {0.9846685874804761429146537921042181551456,-0.1744355836095486111947394647359033115208} +#define T_2401_68 {0.9842087353180697384402719762874767184258,-0.1770117660597894659879614209785358980298} +#define T_2401_69 {0.9837421431170586538783595642598811537027,-0.1795867363015892503863568663291516713798} +#define T_2401_71 {0.9827887514265866153451156606024596840143,-0.1847329696328485759693194268038496375084} +#define T_2401_72 {0.9823019584661236702771702766767702996731,-0.1873041974799757869440952617878792807460} +#define T_2401_73 {0.9818084385250072099182716556242667138577,-0.1898741426342386573455911502605886198580} +#define T_2401_74 {0.9813081949829504102567057088890578597784,-0.1924427874961900375438972332631237804890} +#define T_2401_75 {0.9808012312657112818925497776945121586323,-0.1950101144752875437227146449004067108035} +#define T_2401_76 {0.9802875508450685781980382671463303267956,-0.1975761059900137950329224167944630607963} +#define T_2401_78 {0.9792400540106493966874268153333105146885,-0.2027040123461310017205505573656409978867} +#define T_2401_79 {0.9787062447703198664328283484792336821556,-0.2052658920706960932189844015738344751298} +#define T_2401_80 {0.9781657331734314730908863566583022475243,-0.2078263660974786208868181347497738897800} +#define T_2401_81 {0.9776185229215048622108952258713543415070,-0.2103854168918915246155876275224727578461} +#define T_2401_82 {0.9770646177619344285858460352756083011627,-0.2129430269290942256965593060158425942063} +#define T_2401_83 {0.9765040214879623370336503285216167569160,-0.2154991786941127529519945937863667495549} +#define T_2401_85 
{0.9753627709988658134321326542703900486231,-0.2206070373977538912058093956147786229849} +#define T_2401_86 {0.9747821245992301930627377259952481836081,-0.2231587093568405122390174710744759067893} +#define T_2401_87 {0.9741948027161170253762634274607989937067,-0.2257088530849106211562116186541970819235} +#define T_2401_88 {0.9736008093716124189853644566028378903866,-0.2282574511181208187782232243989710696042} +#define T_2401_89 {0.9730001486334894922336502531834412366152,-0.2308044860032129885762941512439283542335} +#define T_2401_90 {0.9723928246151812837538841449713800102472,-0.2333499402976337011583751746002235449851} +#define T_2401_92 {0.9711582034198688040405045285297092050314,-0.2384360373984872050190375603051506914198} +#define T_2401_93 {0.9705309146977726397764740795537363737822,-0.2409766453744114356183558811608236283064} +#define T_2401_94 {0.9698969796052496761262773361522704362869,-0.2435156030988855357932720835378859192133} +#define T_2401_95 {0.9692564024836015645902875803585629910231,-0.2460528931846699407515188795514404773712} +#define T_2401_96 {0.9686091877196157939877707576670218259096,-0.2485884982559453393324844228118308819830} +#define T_2401_97 {0.9679553397455354923906156727753113955259,-0.2511224009484318009377545877214288339019} +#define T_2401_99 {0.9666277621231609273877438681665807962418,-0.2561850297983274127311403844942105934024} +#define T_2401_100 {0.9659540415663571133464415652269963175058,-0.2587137212859428325728572417574469000101} +#define T_2401_101 {0.9652737059823776055722532873915042728186,-0.2612406410554191649175947986805113032460} +#define T_2401_102 {0.9645867600302832078895676204410847276449,-0.2637657718019548180876654441817663609982} +#define T_2401_103 {0.9638932084144031797734442079672589898109,-0.2662890962329999000601787884079385548830} +#define T_2401_104 {0.9631930558843040390826217844733037054539,-0.2688105970683744572191642419056734070182} +#define T_2401_106 
{0.9617729673057018313642174689448438584805,-0.2738480588939518622915159085096092894673} +#define T_2401_107 {0.9610530409822211073134212711011059582233,-0.2763639853867091988881554698309628292918} +#define T_2401_108 {0.9603265331944991212154150161950383335352,-0.2788780192891412990086053014238132163882} +#define T_2401_109 {0.9595934489177918580438131357368547469378,-0.2813901433846914268421812721499009057879} +#define T_2401_110 {0.9588537931723926099891741614555940032005,-0.2839003404698814403417372886906377971172} +#define T_2401_111 {0.9581075710235968934114225703524425625801,-0.2864085933544299189534854122030083090067} +#define T_2401_113 {0.9565954480018038497135535180859733372927,-0.2914191978271645777631704277155222371221} +#define T_2401_114 {0.9558295574840972586017073808761779218912,-0.2939215151018294003293362948170397430658} +#define T_2401_115 {0.9550571212735046167097152647329494357109,-0.2964218195490445628692555146699305623770} +#define T_2401_116 {0.9542781446598083139321033740998245775700,-0.2989200940462752309478844381374074146152} +#define T_2401_117 {0.9534926329775803566235481412149965763092,-0.3014163214848877836438134636409813538194} +#define T_2401_118 {0.9527005916061462853505759085237514227629,-0.3039104847702673861675748412380926311016} +#define T_2401_120 {0.9510969415365063683864832455583382397890,-0.3088925505736636045739373912510927766562} +#define T_2401_121 {0.9502853438203850977572528790915384888649,-0.3113804189735966398977495828148676082492} +#define T_2401_122 {0.9494672383791510350548037422413472086191,-0.3138661549843632658784997602197108790278} +#define T_2401_123 {0.9486426308153377284781981870764866471291,-0.3163497415831958425869174789113458245993} +#define T_2401_124 {0.9478115267760062190305347940011415630579,-0.3188311617620461779765150822640862315893} +#define T_2401_125 {0.9469739319527065157799938788230065256357,-0.3213103985277021568123245742754079401493} +#define T_2401_127 
{0.9452792929426139245663307519862428307533,-0.3262622539214609651558873792964732274413} +#define T_2401_128 {0.9444222603610140653884741368528921157122,-0.3287348386383665554966171384876361116767} +#define T_2401_129 {0.9435587602057526890320104939746670424938,-0.3312051721199155696062632614484755322337} +#define T_2401_130 {0.9426887983902342282860331579286139458418,-0.3336732374488197150341761698655318468809} +#define T_2401_131 {0.9418123808721137191213301775860600173473,-0.3361390177233231635334220754884881898761} +#define T_2401_132 {0.9409295136532562775499854978988878428936,-0.3386024960573187914114612340199528262019} +#define T_2401_134 {0.9391444543415923273954604155733250081539,-0.3435224794382935042946769499394576996565} +#define T_2401_135 {0.9382422744731934161066533306438941508532,-0.3459789507923406448597347662143874913454} +#define T_2401_136 {0.9373336693527887231169870574376545846462,-0.3484330528202468357790166919585317373276} +#define T_2401_137 {0.9364186452026698193051856833335477858782,-0.3508847687158795358719487467169528827071} +#define T_2401_138 {0.9354972082890868900761915938346646726131,-0.3533340816894469105235998540592845529318} +#define T_2401_139 {0.9345693649222056587078100164944771677256,-0.3557809749676127952788817765394924208522} +#define T_2401_141 {0.9326944842885311892644040199229493737221,-0.3606674354273625704436767591687384992838} +#define T_2401_142 {0.9317474598612580649614756111986935138702,-0.3631069691455855452666412475082324817777} +#define T_2401_143 {0.9307940546596390474221038857649546116590,-0.3655440162419143490524220396764576435089} +#define T_2401_144 {0.9298342752127644050830213018343783915043,-0.3679785600270116252552554669819073751569} +#define T_2401_145 {0.9288681280933768213969869975699111819267,-0.3704105838286833596306735216785455122590} +#define T_2401_146 {0.9278956199178255426218697721196804195642,-0.3728400709919926780955279355111997574568} +#define T_2401_148 
{0.9259315470813924697779384587192907929420,-0.3776913688707475014183501116349361836910} +#define T_2401_149 {0.9249399958708349744895826916035730391741,-0.3801131463636318041920958421542309224606} +#define T_2401_150 {0.9239421105046704552066216820094268769026,-0.3825323207732586605622771003254456445575} +#define T_2401_151 {0.9229378978165976299408157501602545380592,-0.3849488755326862521322084376151906326413} +#define T_2401_152 {0.9219273646836456670428106008330360054970,-0.3873627940929125768043661537376465275884} +#define T_2401_153 {0.9209105180261274448128006042679771780968,-0.3897740599229887470400512938795145601034} +#define T_2401_155 {0.9188579120347765138987483624077867716551,-0.3945885673598399367989486563601531088352} +#define T_2401_156 {0.9178221667575591213861230244219768792391,-0.3969917759960013836639802775607677176595} +#define T_2401_157 {0.9167801360689099565703941152605693787336,-0.3993922659610110170724794897978426888585} +#define T_2401_158 {0.9157318271048426883140791687765158712864,-0.4017900208158814989722884547518333420157} +#define T_2401_159 {0.9146772470443658153982369185541756451130,-0.4041850241403559529373978875810280442238} +#define T_2401_160 {0.9136164031094335946647788659902289509773,-0.4065772595330203187380391227634390816092} +#define T_2401_162 {0.9114759527184500553786961063451599329710,-0.4113533610121520300850761486799456179142} +#define T_2401_163 {0.9103963609205883988906293780019041150808,-0.4137371943910165272839662975457031279802} +#define T_2401_164 {0.9093105345645501325080317656102124601603,-0.4161181944230895046743512466491665691137} +#define T_2401_165 {0.9082184810862695689692714040575083345175,-0.4184963448028540811129971643822500482202} +#define T_2401_166 {0.9071202079643256865892908535897731781006,-0.4208716292443082651608676769683370366693} +#define T_2401_167 {0.9060157227198905038889620300324168056250,-0.4232440314810766435194011592102469876409} +#define T_2401_169 
{0.9037881461608909816618506738450378179550,-0.4279801243738545202255352251086151227355} +#define T_2401_170 {0.9026650701011720290267703603603877127171,-0.4303437825962473373131444986938731744885} +#define T_2401_171 {0.9015358124285478558235240598150994628668,-0.4327044937469428642629054593271575868130} +#define T_2401_172 {0.9004003808763784144630903938377741724253,-0.4350622416593662444839196723478380590677} +#define T_2401_173 {0.8992587832203031705802231954294256865978,-0.4374170101872352223182360830833204090595} +#define T_2401_174 {0.8981110272781884784620842765434645116329,-0.4397687832046710543210110699874348938465} +#define T_2401_176 {0.8957970720181174240082100368454121053219,-0.4444632783074073389961711200157878920436} +#define T_2401_177 {0.8946308885465431570693795038096141070127,-0.4468059682439601831838160705956397578120} +#define T_2401_178 {0.8934585784815852749218834105704445391893,-0.4491455983728047662850713095394894480705} +#define T_2401_179 {0.8922801498514343343515520246000960469246,-0.4514821526717328614708435452484991401434} +#define T_2401_180 {0.8910956107261815972719887213315814733505,-0.4538156151395998927355890373291913419962} +#define T_2401_181 {0.8899049692177641857071535014256369322538,-0.4561459697964349024879027183487778529525} +#define T_2401_183 {0.8875054117080797277949955059739295393229,-0.4607972918636478731890804283466422930360} +#define T_2401_184 {0.8862965121394144052402452871319837868214,-0.4631182274209349469806795696058543398976} +#define T_2401_185 {0.8850815430526756122375786617340054363012,-0.4654359914612263282585047363681951537728} +#define T_2401_186 {0.8838605127681903317338196757191326469183,-0.4677505681120565639297126381279667839408} +#define T_2401_187 {0.8826334296477937879643604901502840220928,-0.4700619415227881292551614933472592383623} +#define T_2401_188 {0.8814003020947721589450907231366727501154,-0.4723700958647196745943119822186417877674} +#define T_2401_190 
{0.8789159475109068475617846161185298115015,-0.4769766841377101052046327822608873248100} +#define T_2401_191 {0.8776647374933705947341877617873251438141,-0.4792750865220231637486847375839715823531} +#define T_2401_192 {0.8764075170697077643566785809525754302740,-0.4815702067442607092750961328420089557767} +#define T_2401_193 {0.8751442948495897766747475543525069952011,-0.4838620290870261908544591733516426756978} +#define T_2401_194 {0.8738750794837901736400453955866396427155,-0.4861505378555077139246520800952566787601} +#define T_2401_195 {0.8725998796641240007332385175686795264482,-0.4884357173775852323238666485849535092711} +#define T_2401_197 {0.8700315616353069270161313397693447768688,-0.4929960261081514638625833413243526592851} +#define T_2401_198 {0.8687384610144601726489099746686406433582,-0.4952711240868249986668558904057135805488} +#define T_2401_199 {0.8674394111162350151644773177395109087229,-0.4975428303596781121065362185618141666055} +#define T_2401_200 {0.8661344208367589558861254772637039422989,-0.4998111293696571033606801393034402281046} +#define T_2401_201 {0.8648234991128403992277640099928248673677,-0.5020760055830421064726465374405961483717} +#define T_2401_202 {0.8635066549219072573606581499916501343250,-0.5043374434895530056266466090164612978697} +#define T_2401_204 {0.8608552352514381444947844101989176124334,-0.5088499424586692132521648090914823114872} +#define T_2401_205 {0.8595206779293014243137349694734439253807,-0.5111009726188692203763253019133117049932} +#define T_2401_206 {0.8581802344548241645583175341016612946987,-0.5133485026675961293562977516558021306992} +#define T_2401_207 {0.8568339140076047799610137190029490739107,-0.5155925172133595246748427598504349589348} +#define T_2401_208 {0.8554817258074880470530843012966215610504,-0.5178330008887438440368100600608158856630} +#define T_2401_209 {0.8541236791145021545190729739260859787464,-0.5200699383505136275118729827227070927620} +#define T_2401_211 
{0.8513900474905720106022499749087728559971,-0.5245331133817974222921520777163095772266} +#define T_2401_212 {0.8500144812800293214749558501353021711111,-0.5267593203866852702432765909179579466581} +#define T_2401_213 {0.8486330940172923664732707038638181984425,-0.5289819200489157546840601753501687198877} +#define T_2401_214 {0.8472458951623496048455308482516556978226,-0.5312008971477260033111633674707263708115} +#define T_2401_215 {0.8458528942149886598045327446016017347574,-0.5334162364871614103734032141801435500383} +#define T_2401_216 {0.8444541007147307043467776566103566437960,-0.5356279228961794425245557249581906944513} +#define T_2401_218 {0.8416391744118856266254624642897397279739,-0.5400402763639760017611024522921070456505} +#define T_2401_219 {0.8402230608864206251595874164195265620947,-0.5422409132061636327648557198699563741684} +#define T_2401_220 {0.8388011933621707383323951034981291741133,-0.5444378366849590422660298827395308762789} +#define T_2401_221 {0.8373735815763410084144879874656908214092,-0.5466310317554347042445783699804451316595} +#define T_2401_222 {0.8359402353054736778403821517713367938995,-0.5488204833981958907784814982733223587275} +#define T_2401_223 {0.8345011643653821309385421045590192079544,-0.5510061766194835897181292239110916852951} +#define T_2401_225 {0.8316058879367300704643639619462192058563,-0.5553662279513966737454211397562175989151} +#define T_2401_226 {0.8301497022755438548458073455549310892820,-0.5575405562036056528185667957586701959372} +#define T_2401_227 {0.8286878315997460164865628939878661185503,-0.5597110663177127243272934720152989029884} +#define T_2401_228 {0.8272202859204901281842126081755850464106,-0.5618777434296738926278180770168546587229} +#define T_2401_229 {0.8257470752877933417579470187774859368801,-0.5640405727016942760698725578549783676863} +#define T_2401_230 {0.8242682097904673321764335014449898153543,-0.5661995393223295813811546395299956202507} +#define T_2401_232 
{0.8212935547507319578031115270277950912714,-0.5705058254960299057856332183291669934988} +#define T_2401_233 {0.8197977855792959456238122584181837737560,-0.5726531155588719945725983961892779916525} +#define T_2401_234 {0.8182964022850375274131806691002566367388,-0.5747964839900849831622053898172453045845} +#define T_2401_235 {0.8167894151496998267347748878819402307272,-0.5769359161114960832250631028728093951941} +#define T_2401_236 {0.8152768344934021582659511295787524431944,-0.5790713972718889435142841648485045880079} +#define T_2401_237 {0.8137586706745689735242876849952153861523,-0.5812029128471040140269110452209133654833} +#define T_2401_239 {0.8107056351740962174545757079613395035267,-0.5854539888812486658764555613743141293526} +#define T_2401_240 {0.8091707844001934102351469846325926482677,-0.5875735202280440994115906505612656474113} +#define T_2401_241 {0.8076303922790853961899415480729658156633,-0.5896890277655932877109989931341260671616} +#define T_2401_242 {0.8060844693596548315994709810183849185705,-0.5918004970065195990969186823349446058273} +#define T_2401_243 {0.8045330262286601863408463941595982760191,-0.5939079134911017243680930732807610183954} +#define T_2401_244 {0.8029760735106632463242704034200869500637,-0.5960112627873720425597525718330871313810} +#define T_2401_246 {0.7998456820004904432863668262143619358540,-0.6002057022264702945690828528313431888819} +#define T_2401_247 {0.7982722646457990345325583803059998899698,-0.6022967636450218265053990762680768966675} +#define T_2401_248 {0.7966933805789279121611912160005886107683,-0.6043837004269055901417573295475449413061} +#define T_2401_249 {0.7951090406123592080334105958172585815191,-0.6064664982804026660545559934689663350582} +#define T_2401_250 {0.7935192555959383886587943379709031432867,-0.6085451429421384617057810828555375337601} +#define T_2401_251 {0.7919240364167994261634930808213539421558,-0.6106196201771800780022658727830275893211} +#define T_2401_253 
{0.7887173393048988456754955223004799336195,-0.6147560155702431750057712633861228823662} +#define T_2401_254 {0.7871058833321762371326713036978617310524,-0.6168179054014842899178461266274098306894} +#define T_2401_255 {0.7854890371166634022159769301651977002621,-0.6188755711526646718922961554198991507292} +#define T_2401_256 {0.7838668117308141836119261824933346360922,-0.6209289987325187221145483817963395267725} +#define T_2401_257 {0.7822392182839201790756078480626456439495,-0.6229781740788047361689905301318503916264} +#define T_2401_258 {0.7806062679220341360419865850417409092188,-0.6250230831584009383306010931846685707569} +#define T_2401_260 {0.7773243412208471481150695581163745373487,-0.6291000465312143186125126703700516372919} +#define T_2401_261 {0.7756753873567709955949567302013747394085,-0.6311320729046522304273025838483590632677} +#define T_2401_262 {0.7740211215279988454796011865255422890186,-0.6331597771720333112099865502386819571257} +#define T_2401_263 {0.7723615550632407789066746772732585668564,-0.6351831454472737226168987945129629224539} +#define T_2401_264 {0.7706966993275068400492955333902500569820,-0.6372021638739832072317881284106988459826} +#define T_2401_265 {0.7690265657220287653927925930474884808064,-0.6392168186255601236567258638388011604548} +#define T_2401_267 {0.7656705106874079902468110958579927682877,-0.6432329819464202014245302052586339414120} +#define T_2401_268 {0.7639846122411344531144550273893401026726,-0.6452344630122940349892246558738406747580} +#define T_2401_269 {0.7622934818906974552277233669883571565151,-0.6472315253964047165879946987843140959740} +#define T_2401_270 {0.7605971312172622456060366857855115085840,-0.6492241554225095523378286088700406253338} +#define T_2401_271 {0.7588955718377434767063505205442197620869,-0.6512123394447193458489664408261887729168} +#define T_2401_272 {0.7571888154047261565438020625151693820953,-0.6531960638475921010481783923751208931208} +#define T_2401_274 
{0.7537597581664073898011224628135096281767,-0.6571500794863521788258253764070104807615} +#define T_2401_275 {0.7520374808439073133214947119995485991240,-0.6591203436444286190010188875021412968636} +#define T_2401_276 {0.7503100534333504434414408024167641997337,-0.6610860940277315522806134140409994870424} +#define T_2401_277 {0.7485774877644707503421273031563032418489,-0.6630473171744484250922369028558023273945} +#define T_2401_278 {0.7468397957021897237694929572171531617641,-0.6650039996537701059153846472327131778002} +#define T_2401_279 {0.7450969891465355487980559701099991798401,-0.6669561280659827007255557873577345162630} +#define T_2401_281 {0.7415960803302630299782549627707339823246,-0.6708466692462519143091981277393642812967} +#define T_2401_282 {0.7398380020444984550564981873321812599897,-0.6727850553711821612878907217236701399088} +#define T_2401_283 {0.7380748572149042896484161246917210519314,-0.6747188341429329883069954121310729533434} +#define T_2401_284 {0.7363066579158135116500716321752406656742,-0.6766479923186391465250721921620424836874} +#define T_2401_285 {0.7345334162561735213969882352103013545275,-0.6785725166870781865924300291226245462894} +#define T_2401_286 {0.7327551443794620977811860029760282486677,-0.6804923940687607197830288896511774510145} +#define T_2401_288 {0.7291835587208940205883322960289660841227,-0.6843181553131062466732714710815344005823} +#define T_2401_289 {0.7273902693978987477052555732370819896460,-0.6862240129762673390700911113526672124863} +#define T_2401_290 {0.7255919987753878253045058954739943146706,-0.6881251712538479781144928892899770289660} +#define T_2401_291 {0.7237887591682423682470925996312871575356,-0.6900216171263738873520310335152316838503} +#define T_2401_292 {0.7219805629253717160764836080488748848438,-0.6919133376066423091188539729046169668436} +#define T_2401_293 {0.7201674224296292781133388416492380201817,-0.6938003197398109334059768116276245564222} +#define T_2401_295 
{0.7165263583801533275874362516333349049091,-0.6975600173078128074521941925922874361277} +#define T_2401_296 {0.7146984597610818168433866048872005194426,-0.6994327069955602604878208694572094827890} +#define T_2401_297 {0.7128656667582921047099375755351502448320,-0.7013006068422126260131221897609066218138} +#define T_2401_298 {0.7110279919230805267815753722970839589834,-0.7031637040560553275625466085330117493868} +#define T_2401_299 {0.7091854478401751205041136927320621907711,-0.7050219858782634796412480682192835956812} +#define T_2401_300 {0.7073380471276494718679828110907692462206,-0.7068754395829883740987042983761057257652} +#define T_2401_302 {0.7036287264522408380074125489045400172472,-0.7105678119020011340722930981428362429142} +#define T_2401_303 {0.7017668318914535019459322029433678835630,-0.7124067052302585079104346732492558658123} +#define T_2401_304 {0.6999001315050635829706493495905306190252,-0.7142407198691451686300979417865164577961} +#define T_2401_305 {0.6980286380765713394325189256051089614630,-0.7160698432589986950347338279243558645248} +#define T_2401_306 {0.6961523644223007734055386208638083189726,-0.7178940628736527607145490037510171532631} +#define T_2401_307 {0.6942713233913115900008961034473031759262,-0.7197133662205231763309143389051314443350} +#define T_2401_309 {0.6904949907585673507526280445745214819908,-0.7233371743089981320551373755733948200941} +#define T_2401_310 {0.6885997250178180761892576811078470200300,-0.7251416542341125870407836373487953096628} +#define T_2401_311 {0.6866997436221845552140052859613206237555,-0.7269411682586328637967199028935283422470} +#define T_2401_312 {0.6847950595830814002340503066079691052437,-0.7287357040591629919745741972292307764292} +#define T_2401_313 {0.6828856859441276849764790313201956450939,-0.7305252493463988416877441522956360131502} +#define T_2401_314 {0.6809716357810580156240121141308918595314,-0.7323097918652119453497562062693759799004} +#define T_2401_316 
{0.6771295583455470001865705853560939431190,-0.7358638197484399512404706911183893680573} +#define T_2401_317 {0.6752015573843435092271647590678185224533,-0.7376332807742320696320348361041396856308} +#define T_2401_318 {0.6732689325213198072717091235972475260496,-0.7393976903545226297964632067305501550436} +#define T_2401_319 {0.6713316969914386866946642840048298239708,-0.7411570364063172489110797869216185063124} +#define T_2401_320 {0.6693898640612380157577376849076244980097,-0.7429113068812973619259309998597018420696} +#define T_2401_321 {0.6674434470287392562326544975803699344397,-0.7446604897659030442014227446634322404861} +#define T_2401_323 {0.6635369140058056114384044121834449470043,-0.7481435448840363378053552878554910421371} +#define T_2401_324 {0.6615768247680122726706031244248151779175,-0.7498773932649754891954785307461861521006} +#define T_2401_325 {0.6596122049330204184514059306820854544640,-0.7516061063505266037765295550343580543995} +#define T_2401_326 {0.6576430679549003510686588924727402627468,-0.7533296723021510876705519876850303262472} +#define T_2401_327 {0.6556694273186567389899437330313958227634,-0.7550480793165589288307160131807904690504} +#define T_2401_328 {0.6536912965401358022177191742230206727982,-0.7567613156257898543444184724648948758841} +#define T_2401_330 {0.6497216187733013681437910236127208918333,-0.7601722292340077347105875560373533517122} +#define T_2401_331 {0.6477300989700559563644333138654474169016,-0.7618698831744443733526850337511859834194} +#define T_2401_332 {0.6457341433944830955127258675929624587297,-0.7635623196927630029762212870991788804531} +#define T_2401_333 {0.6437337657152458758602620036981534212828,-0.7652495271988535296969757837359793484211} +#define T_2401_334 {0.6417289796312908300990329735213890671730,-0.7669314941384154371561976404336746782064} +#define T_2401_335 {0.6397197988717540084735446725971996784210,-0.7686082089930360572438416966178920120001} +#define T_2401_337 
{0.6356883083928622779623651695146691054106,-0.7719458365537191779637282706971745938063} +#define T_2401_338 {0.6336660262818799571604699849558528512716,-0.7736067264031071255558913435379508882761} +#define T_2401_339 {0.6316394047118719656808139006898272782564,-0.7752623184543615542096972603758331388235} +#define T_2401_340 {0.6296084575615076284194060463050846010447,-0.7769126013696902033345281779475044459105} +#define T_2401_341 {0.6275731987390785748814892031077761203051,-0.7785575638476586179947958044067490845919} +#define T_2401_342 {0.6255336421824037040906318907218519598246,-0.7801971946232673094101528477040119469166} +#define T_2401_344 {0.6214416917646544691322674225375521928072,-0.7834604161900486918312935813446529209614} +#define T_2401_348 {0.6132068346373660272519146019476465880871,-0.7899223872976521843014552359818480908871} +#define T_2401_352 {0.6049047884724643164844337661634199321270,-0.7962978066546982924833741890324745327234} +#define T_2401_356 {0.5965364629246703298548482052865438163280,-0.8025859757068544153568723231728654354811} +#define T_2401_360 {0.5881027749109331015375801143818534910679,-0.8087862054597990679738472863391507416964} +#define T_2401_368 {0.5710430148609823008953867429227102547884,-0.8209201393427255633028494230529759079218} +#define T_2401_372 {0.5624188120616966379827772470889613032341,-0.8268525139582693084250308857008349150419} +#define T_2401_376 {0.5537329850655110519141999247949570417404,-0.8326942903914007487742310331668704748154} +#define T_2401_380 {0.5449864855779902317323148963623680174351,-0.8384448285590120342192221869481727480888} +#define T_2401_384 {0.5361802719525800009492400022281799465418,-0.8441034983749666764651919947937130928040} +#define T_2401_388 {0.5273153090856010916098739471635781228542,-0.8496696798191383237508489401079714298248} +#define T_2401_396 {0.5094130272915455837079434786573983728886,-0.8605221482481801853481329089845530688763} +#define T_2401_400 
{0.5003776699164557140164788506808690726757,-0.8658072461287087406844875658862292766571} +#define T_2401_404 {0.4912874861888437694190656657156068831682,-0.8709974775590607620756600226741284132004} +#define T_2401_408 {0.4821434721196179440383389191993046551943,-0.8760922738458770675507025771366897970438} +#define T_2401_412 {0.4729466296178740924993633143458282575011,-0.8810910767526212161726562044350430369377} +#define T_2401_416 {0.4636979663811163221431854708498576655984,-0.8859933385607462463795513940567616373301} +#define T_2401_424 {0.4450492367715148112061740448552882298827,-0.8955061009558182849588092722115106880665} +#define T_2401_428 {0.4356512137389021432909430586732923984528,-0.9001155592299365792996468371711671352386} +#define T_2401_432 {0.4262054564278490165207813333836384117603,-0.9046263918939845272149113952764309942722} +#define T_2401_436 {0.4167129998094408760422879822726827114820,-0.9090381046962866529170810281357262283564} +#define T_2401_440 {0.4071748839716036183489222821663133800030,-0.9133502142457246009144000709056854248047} +#define T_2401_444 {0.3975921540051412517158269110950641334057,-0.9175622480647032119804862304590642452240} +#define T_2401_452 {0.3782970563763480287278184732713270932436,-0.9256842534779287090529464876453857868910} +#define T_2401_456 {0.3685868028767549930968527860386529937387,-0.9295933351445093695275545542244799435139} +#define T_2401_460 {0.3588361633423636232187448058539303019643,-0.9334005613228076869702931617212016135454} +#define T_2401_464 {0.3490462061501870549840020885312696918845,-0.9371055148552702718234286294318735599518} +#define T_2401_468 {0.3392180039852719652948564998951042070985,-0.9407077897903514118738144134113099426031} +#define T_2401_472 {0.3293526337231644229852633998234523460269,-0.9442069914269938246675906157179269939661} +#define T_2401_480 {0.3095147166536484362175940532324602827430,-0.9508946525114186920646375256183091551065} +#define T_2401_484 
{0.2995443434856727149551147704187314957380,-0.9540823791925607810782139495131559669971} +#define T_2401_488 {0.2895411492612042780869785474351374432445,-0.9571655671222721872837269074807409197092} +#define T_2401_492 {0.2795062300296592971982079234294360503554,-0.9601438784758288269571835371607448905706} +#define T_2401_496 {0.2694406853165609017075610154279274865985,-0.9630169869198268450816158292582258582115} +#define T_2401_500 {0.2593456180030645485956597440235782414675,-0.9657845776479393462565781192097347229719} +#define T_2401_508 {0.2390713431522510368232303790136938914657,-0.9710020045722761450335269728384446352720} +#define T_2401_512 {0.2288943570660635362656876168330200016499,-0.9734512690953325275700080965179949998856} +#define T_2401_516 {0.2186922910383331808414908437043777666986,-0.9757938726188051248300325823947787284851} +#define T_2401_520 {0.2084662629088480656491810805164277553558,-0.9780295584637608286016074998769909143448} +#define T_2401_524 {0.1982173931429223601163869261654326692224,-0.9801580816662300854602563049411401152611} +#define T_2401_528 {0.1879468047086269855139306628188933245838,-0.9821792090040478706569615496846381574869} +#define T_2401_536 {0.1673449754824731794400349826901219785213,-0.9858984020581281759731950842251535505056} +#define T_2401_540 {0.1570159920318584534371098015981260687113,-0.9875960602626213669097410274844150990248} +#define T_2401_544 {0.1466698043480248458969583680300274863839,-0.9891855076235761545788705006998497992754} +#define T_2401_548 {0.1363075460621619994228126415691804140806,-0.9906665699853365314453412793227471411228} +#define T_2401_552 {0.1259303525663144907031920638473820872605,-0.9920390850679844385595629319141153246164} +#define T_2401_556 {0.1155393608889769829772831144509837031364,-0.9933029024851214305869007148430682718754} +#define T_2401_564 {0.0947205385383906722784885801047494169325,-0.9955039023424253619154455918760504573584} +#define T_2401_568 
{0.0842949889823097625596304283135395962745,-0.9964408436191645179746956273447722196579} +#define T_2401_572 {0.0738602032291328369772998030384769663215,-0.9972686049299612776053436391521245241165} +#define T_2401_576 {0.0634173246177348881591484541786485351622,-0.9979870955770564489029084143112413585186} +#define T_2401_580 {0.0529674973737249144001282274984987452626,-0.9985962368354711626849962158303242176771} +#define T_2401_584 {0.0425118664840732074505780246909125708044,-0.9990959619616327502811259364534635096788} +#define T_2401_592 {0.0215877767697255050494131722871316014789,-0.9997669567925019373788586563023272901773} +#define T_2401_596 {0.0111216105963364217390498822624067543074,-0.9999381529763445852765357813041191548109} +#define T_2401_600 {0.0006542258287137657198420659376836283627,-0.9999997859942596489801758252724539488554} +#define T_2401_604 {-0.0098132306223953271601123660161647421774,-0.9999518490931209591110473411390557885170} +#define T_2401_608 {-0.0202796118383893608783097306513809598982,-0.9997943475253719958217857310955878347158} +#define T_2401_612 {-0.0307437710184802932833481037278033909388,-0.9995272985484504602027300279587507247925} +#define T_2401_620 {-0.0516608374107706200262235540776600828394,-0.9986646874091513748084025792195461690426} +#define T_2401_624 {-0.0621114527412080993928888972277491120622,-0.9980692197630265383168080006726086139679} +#define T_2401_628 {-0.0725552625233399778315046546595112886280,-0.9973643937298790618228849780280143022537} +#define T_2401_632 {-0.0829911224295289406960307587723946198821,-0.9965502865374566976441883525694720447063} +#define T_2401_636 {-0.0934178890032050929237428249507502187043,-0.9956269873874376008515696412359829992056} +#define T_2401_640 {-0.1038344197841544608307273733771580737084,-0.9945945974456567029164943960495293140411} +#define T_2401_648 {-0.1246322098597495708904858702226192690432,-0.9922030096031131396472346750670112669468} +#define T_2401_652 
{-0.1350111903417325343035315654560690745711,-0.9908440737484927041123228264041244983673} +#define T_2401_656 {-0.1453753776553525778325592909823171794415,-0.9893765711656828942821562122844625264406} +#define T_2401_660 {-0.1557236361972016702726051562422071583569,-0.9878006626488573527211656255531124770641} +#define T_2401_664 {-0.1660548321091863677168021240504458546638,-0.9861165208702215023706116880930494517088} +#define T_2401_668 {-0.1763678334027646288362234372470993548632,-0.9843243303610932359859475582197774201632} +#define T_2401_676 {-0.1969347342722635452183510551549261435866,-0.9804166004495807884566715983964968472719} +#define T_2401_680 {-0.2071863803340384668771889664640184491873,-0.9783014892169382692443946325511205941439} +#define T_2401_684 {-0.2174153249960329437051598233665572479367,-0.9760791855463722965779993501200806349516} +#define T_2401_688 {-0.2276204474733684590681548343127360567451,-0.9737499329355682187170373254048172384501} +#define T_2401_692 {-0.2378006295913619450033138491562567651272,-0.9713139866006006428733599022962152957916} +#define T_2401_696 {-0.2479547559080441110346271216258173808455,-0.9687716134479695817560696013970300555229} +#define T_2401_704 {-0.2681803937661663694846936323301633819938,-0.9633687125910951332130593982583377510309} +#define T_2401_708 {-0.2782496891856293119715815009840298444033,-0.9605087768823877825141721586987841874361} +#define T_2401_712 {-0.2882884968026468985868859817855991423130,-0.9575435982822245728485199833812657743692} +#define T_2401_716 {-0.2982957166656456937658958850079216063023,-0.9544735016850540709754113777307793498039} +#define T_2401_720 {-0.3082702522841207493087267721421085298061,-0.9512988234811837751081498026906047016382} +#define T_2401_724 {-0.3182110107487778338786199583410052582622,-0.9480199115199222648087129527993965893984} +#define T_2401_732 {-0.3379868432036098879756025326059898361564,-0.9411508347875267022075718159612733870745} +#define T_2401_736 
{-0.3478197503569597781591937746270559728146,-0.9375614226607353884901385754346847534180} +#define T_2401_740 {-0.3576145469202618998316722809249768033624,-0.9338692819827707491953106000437401235104} +#define T_2401_744 {-0.3673701596782198985913225897093070670962,-0.9300748173012745034782255970640107989311} +#define T_2401_748 {-0.3770855197089049704040064625587547197938,-0.9261784443755236573281308665173128247261} +#define T_2401_752 {-0.3867595625008770077180031421448802575469,-0.9221805901308757213996614154893904924393} +#define T_2401_760 {-0.4059794610747021192942440848128171637654,-0.9138822009348329666877930321788880974054} +#define T_2401_764 {-0.4155232109333735768252893194585340097547,-0.9095825752374652184428782675240654498339} +#define T_2401_768 {-0.4250214319377146332534778139233822003007,-0.9051832866296276591810965328477323055267} +#define T_2401_772 {-0.4344730833681960668890553733945125713944,-0.9006848171411199199809516358072869479656} +#define T_2401_776 {-0.4438771296079139783152811560285044834018,-0.8960876596689852924271235679043456912041} +#define T_2401_780 {-0.4532325402560626326220472037675790488720,-0.8913923179235035965106703770288731902838} +#define T_2401_788 {-0.4717933599317410964246732874016743153334,-0.8817091501874745329558891171473078429699} +#define T_2401_792 {-0.4809967352513261351631967954745050519705,-0.8767223851810592716304881832911632955074} +#define T_2401_796 {-0.4901474077862859046739174573303898796439,-0.8716395577533091465483039428363554179668} +#define T_2401_800 {-0.4992443748979568662882400076341582462192,-0.8664612248293331875359513105649966746569} +#define T_2401_804 {-0.5082866398321755907474539526447188109159,-0.8611879537987722210701235781016293913126} +#define T_2401_808 {-0.5172732118284932845142520818626508116722,-0.8558203224536297115676575231191236525774} +#define T_2401_816 {-0.5350753445848783762528455554274842143059,-0.8448043416184446430250432058528531342745} +#define T_2401_820 
{-0.5438889547662832946173239179188385605812,-0.8391571991487887238747589435661211609840} +#define T_2401_824 {-0.5526429710661868721643941171350888907909,-0.8334181102730716661497467612207401543856} +#define T_2401_828 {-0.5613364343075263329652102584077510982752,-0.8275877038229315640904815154499374330044} +#define T_2401_832 {-0.5699683919480341698360348345886450260878,-0.8216666186356678114321994144120253622532} +#define T_2401_836 {-0.5785378981846082124818053671333473175764,-0.8156555034842442042375409982923883944750} +#define T_2401_844 {-0.5954858075504105840991542208939790725708,-0.8033658276314940005491394003911409527063} +#define T_2401_848 {-0.6038623536981843065163388928340282291174,-0.7970886135092439817029230653133708983660} +#define T_2401_852 {-0.6121727346825839299881977240147534757853,-0.7907240624334425094232869923871476203203} +#define T_2401_856 {-0.6204160399356428712280830950476229190826,-0.7842728717675849781088004419871140271425} +#define T_2401_860 {-0.6285913662388784572954136820044368505478,-0.7777357483682617589337837671337183564901} +#define T_2401_864 {-0.6366978178222578721445756855246145278215,-0.7711134085077078204051304055610671639442} +#define T_2401_872 {-0.6527005515796339141232351721555460244417,-0.7576159910981563605503197322832420468330} +#define T_2401_876 {-0.6605950803350129074686947205918841063976,-0.7507423924604084142231386067578569054604} +#define T_2401_880 {-0.6684172277254222249709414427343290299177,-0.7437865350218845206242690437647979706526} +#define T_2401_884 {-0.6761661366786221316615979048947338014841,-0.7367491809354842491330828124773688614368} +#define T_2401_888 {-0.6838409581471041054356874155928380787373,-0.7296311012836902420986007200554013252258} +#define T_2401_892 {-0.6914408512011211982439817802514880895615,-0.7224330759940805757324255864659789949656} +#define T_2401_900 {-0.7064125294875237548453128511027898639441,-0.7078003519235056550940043962327763438225} +#define T_2401_904 
{-0.7137826742739803442816537426551803946495,-0.7003672564492752217191195995837915688753} +#define T_2401_908 {-0.7210746099338575243464788400160614401102,-0.6928574217750252950409617369587067514658} +#define T_2401_912 {-0.7282875374901842668862173013621941208839,-0.6852716707529090678363559163699392229319} +#define T_2401_916 {-0.7354206666229031741366384267166722565889,-0.6776108345532299015445687473402358591557} +#define T_2401_920 {-0.7424732157554650990860523052106145769358,-0.6698757525733698425796092124073766171932} +#define T_2401_928 {-0.7563334919443222004176163864030968397856,-0.6541862494452996834581881557824090123177} +#define T_2401_932 {-0.7631397003309523530134583779727108776569,-0.6462335473950452380265119245450478047132} +#define T_2401_936 {-0.7698622915444960357334025502495933324099,-0.6382100375721596607903052245092112571001} +#define T_2401_940 {-0.7765005289910204000491944498207885771990,-0.6301165991121527865104212651203852146864} +#define T_2401_944 {-0.7830536853192302526593948641675524413586,-0.6219541188126114228396090766182169318199} +#define T_2401_948 {-0.7895210425001633058883498961222358047962,-0.6137234910360327422296222721342928707600} +#define T_2401_956 {-0.8021955343870335042311126017011702060699,-0.5970614077375138162295797883416526019573} +#define T_2401_960 {-0.8084012803496234411682053178083151578903,-0.5886317778790824961632210943207610398531} +#define T_2401_964 {-0.8145184498304065545681851290282793343067,-0.5801376516705940566964727622689679265022} +#define T_2401_968 {-0.8205463725714746692219137003121431916952,-0.5715799598129684966352215269580483436584} +#define T_2401_972 {-0.8264843880936796338687599927652627229691,-0.5629596399720108168551746530283708125353} +#define T_2401_976 {-0.8323318457690019878114640050625894218683,-0.5542776366756702044469307111285161226988} +#define T_2401_984 {-0.8437525347492144112138134914857801049948,-0.5367323915176684945294027784257195889950} +#define T_2401_988 
{-0.8493245146898693587189654863323085010052,-0.5278710720875106066074522459530271589756} +#define T_2401_992 {-0.8548034341922839951521950752066913992167,-0.5189519138543353671266800120065454393625} +#define T_2401_996 {-0.8601886929315634144899149760021828114986,-0.5099758940897977099737659045786131173372} +#define T_2401_1000 {-0.8654797008452170814507553586736321449280,-0.5009439962958669445569626077485736459494} +#define T_2401_1004 {-0.8706758781978121142230975237907841801643,-0.4918572100970652338780553236574633046985} +#define T_2401_1012 {-0.8807814742933738338948046475707087665796,-0.4735229609444412401941804091620724648237} +#define T_2401_1016 {-0.8856897857667572759865493026154581457376,-0.4642775068732444831987038469378603622317} +#define T_2401_1020 {-0.8905010522612393231156602269038558006287,-0.4549811819423145164975608167878817766905} +#define T_2401_1024 {-0.8952147466066268099993408213777001947165,-0.4456350047494394561731212434096960350871} +#define T_2401_1028 {-0.8998303523237013434865616545721422880888,-0.4362399993547169674279473383649019524455} +#define T_2401_1032 {-0.9043473636808099236361613293411210179329,-0.4267971951683479092665152165864128619432} +#define T_2401_1033 {-0.9054611529137387426757754838035907596350,-0.4244291466948553193105908576399087905884} +#define T_2401_1037 {-0.9098542265985954946572178414498921483755,-0.4149280495949528302368491949891904368997} +#define T_2401_1040 {-0.9130836344576380714244123737444169819355,-0.4077723341346619623237756968592293560505} +#define T_2401_1041 {-0.9141476076081582746724052412901073694229,-0.4053814888525136272967586137383477762341} +#define T_2401_1044 {-0.9173019366446709677020976414496544748545,-0.3981923618402745668198861039854818955064} +#define T_2401_1045 {-0.9183408255169168699083570572838652879000,-0.3957905104836493270781261344382073730230} +#define T_2401_1048 {-0.9214197301112483362572902478859759867191,-0.3885687596316954595110360060061793774366} +#define T_2401_1049 
{-0.9224334208742258622848453342157881706953,-0.3861561653713083686056961596477776765823} +#define T_2401_1052 {-0.9254365636709769304957262647803872823715,-0.3789025819664652572349439196841558441520} +#define T_2401_1053 {-0.9264249452546474694614175859896931797266,-0.3764795091501307311432356073055416345596} +#define T_2401_1056 {-0.9293519971996344608555773447733372449875,-0.3691948879671150818282399086456280201674} +#define T_2401_1060 {-0.9331656016833943523280936460650991648436,-0.3594467413051181114980181519058533012867} +#define T_2401_1061 {-0.9341030428027026211168504232773557305336,-0.3570035089837804709311797068949090316892} +#define T_2401_1065 {-0.9377887746816293557117205637041479349136,-0.3472062990228262924219393426028545945883} +#define T_2401_1068 {-0.9404856632937125260340849308704491704702,-0.3398333667240249211793923223012825474143} +#define T_2401_1069 {-0.9413717530984347936495737485529389232397,-0.3373710456876517493896017185761593282223} +#define T_2401_1072 {-0.9439913183615409320026401474024169147015,-0.3299702878412538731289771476440364494920} +#define T_2401_1073 {-0.9448515854663798174684075092955026775599,-0.3274988266263991754101425613043829798698} +#define T_2401_1076 {-0.9473935403548937594919721050246153026819,-0.3200710541330165082030134726664982736111} +#define T_2401_1077 {-0.9482278905004321023497482201491948217154,-0.3175907235375436399316129154613008722663} +#define T_2401_1080 {-0.9506919564925027099633325633476488292217,-0.3101367502577811463559953608637442812324} +#define T_2401_1081 {-0.9515002982590431424014809635991696268320,-0.3076478220513707562133731698850169777870} +#define T_2401_1084 {-0.9538862053671014473721356807800475507975,-0.3001684647166521502370528651226777583361} +#define T_2401_1088 {-0.9569759369850242558541708604025188833475,-0.2901672897341031065465699612104799598455} +#define T_2401_1089 {-0.9577319991431272327986334857996553182602,-0.2876619853531379167321802015067078173161} +#define T_2401_1093 
{-0.9606906094614927615893407164548989385366,-0.2776212399880558234777083725930424407125} +#define T_2401_1096 {-0.9628405057730231586532454457483254373074,-0.2700706582410405953709187087952159345150} +#define T_2401_1097 {-0.9635439569650169344683376948523800820112,-0.2675500756796708179408028627221938222647} +#define T_2401_1100 {-0.9656147003626344149651572479342576116323,-0.2599774037172842278131668081186944618821} +#define T_2401_1101 {-0.9662917290125779823384277733566705137491,-0.2574495959248770948768481048318790271878} +#define T_2401_1104 {-0.9682830926050474129596068451064638793468,-0.2498556634843506762955200883880024775863} +#define T_2401_1105 {-0.9689336245309506656653297795855905860662,-0.2473209074326606082827595400885911658406} +#define T_2401_1108 {-0.9708453901246777428823975242266897112131,-0.2397065465807352580007716369436820968986} +#define T_2401_1109 {-0.9714693540477948863198776052740868180990,-0.2371651200028370987116232981861685402691} +#define T_2401_1112 {-0.9733013121707334569521208322839811444283,-0.2295311650445935325315360842068912461400} +#define T_2401_1116 {-0.9756505896479771289264704137167427688837,-0.2193306337918953796695120672666234895587} +#define T_2401_1117 {-0.9762212153809940451409943307226058095694,-0.2167767022538510113260912248733802698553} +#define T_2401_1121 {-0.9784368265361760297338378222775645554066,-0.2065463058924489059275941826854250393808} +#define T_2401_1124 {-0.9800281929684797610491386876674368977547,-0.1988585954565109670255651508341543376446} +#define T_2401_1125 {-0.9805452304245321215603325981646776199341,-0.1962932782641860773864550537837203592062} +#define T_2401_1128 {-0.9820560391579944425899384441436268389225,-0.1885893314938880127584752699476666748524} +#define T_2401_1129 {-0.9825461960283696782880724640563130378723,-0.1860187427927102454816576937446370720863} +#define T_2401_1132 {-0.9839762815237645954269396497693378478289,-0.1782994038090567057164292918969294987619} +#define T_2401_1133 
{-0.9844395041020025383105007676931563764811,-0.1757238252582822568559350884243031032383} +#define T_2401_1136 {-0.9857887096649445846097137291508261114359,-0.1679899398688018408165589789859950542450} +#define T_2401_1137 {-0.9862249471957739155314470735902432352304,-0.1654096536744244227978839489878737367690} +#define T_2401_1140 {-0.9874931249938869726179291319567710161209,-0.1576620692804945877973921142256585881114} +#define T_2401_1144 {-0.9890893407579023355324920885323081165552,-0.1473169236683214700089905591084971092641} +#define T_2401_1145 {-0.9894714677603384522797114186687394976616,-0.1447280708370063950773953820316819474101} +#define T_2401_1149 {-0.9909321895100600130490420269779860973358,-0.1343629256632885571320912276860326528549} +#define T_2401_1152 {-0.9919564858766576342929965903749689459801,-0.1265793432090412118373734529086505062878} +#define T_2401_1153 {-0.9922843348767533822041286839521490037441,-0.1239830583515313250364542341230844613165} +#define T_2401_1156 {-0.9932271010784717857333703250333201140165,-0.1161891805774326325018108718722942285240} +#define T_2401_1157 {-0.9935277557059282171891823054465930908918,-0.1135896062231992742486141878544003702700} +#define T_2401_1160 {-0.9943888884439289199690392706543207168579,-0.1057862871039888102986381568371143657714} +#define T_2401_1161 {-0.9946623157560358086115570586116518825293,-0.1031837080882444662721653116932429838926} +#define T_2401_1164 {-0.9954417206760541869670078085619024932384,-0.0953718026331494550529299658592208288610} +#define T_2401_1165 {-0.9956878907133968059639528291882015764713,-0.0927665041203272477954300256897113285959} +#define T_2401_1168 {-0.9963854824160807144295404214062727987766,-0.0849468682793792367879959215315466281027} +#define T_2401_1172 {-0.9972200702560886087510994002514053136110,-0.0745126263021358842264518784759275149554} +#define T_2401_1173 {-0.9974116478149254172436144472158048301935,-0.0719027454490797773845045526286412496120} +#define T_2401_1177 
{-0.9981096410871265200981383713951800018549,-0.0614584767865875014614118754252558574080} +#define T_2401_1180 {-0.9985613704252799260885353760386351495981,-0.0536207934889705584158292595020611770451} +#define T_2401_1181 {-0.9986982715433425639162123843561857938766,-0.0510074741223274369383844373260217253119} +#define T_2401_1184 {-0.9990679357882810140978335766703821718693,-0.0431654917699686724308349994316813535988} +#define T_2401_1185 {-0.9991774746873685053571989556076005101204,-0.0405508825720605967091181298656010767445} +#define T_2401_1188 {-0.9994650333350018067690712086914572864771,-0.0327054604105154675397493235777801601216} +#define T_2401_1189 {-0.9995471980129432276740431007056031376123,-0.0300898478639219230101886637385177891701} +#define T_2401_1192 {-0.9997526195554868877479748334735631942749,-0.0222418455156455675825188933458775863983} +#define T_2401_1193 {-0.9998074010095033825606947175401728600264,-0.0196255162128828909706701466575395897962} +#define T_2401_1196 {-0.9999306629389307365940453564689960330725,-0.0117757935830413735384114559678891964722} +#define T_2401_1200 {-0.9999991439771300782979324139887467026711,-0.0013084513774113655862341154545447352575} +#define T_2401_1208 {-0.9998074010095033825606947175401728600264,0.0196255162128828909706701466575395897962} +#define T_2401_1212 {-0.9995471980129432276740431007056031376123,0.0300898478639219230101886637385177891701} +#define T_2401_1216 {-0.9991774746873685053571989556076005101204,0.0405508825720605967091181298656010767445} +#define T_2401_1220 {-0.9986982715433425639162123843561857938766,0.0510074741223274369383844373260217253119} +#define T_2401_1224 {-0.9981096410871265200981383713951800018549,0.0614584767865875014614118754252558574080} +#define T_2401_1228 {-0.9974116478149254172436144472158048301935,0.0719027454490797773845045526286412496120} +#define T_2401_1236 {-0.9956878907133968059639528291882015764713,0.0927665041203272477954300256897113285959} +#define T_2401_1240 
{-0.9946623157560358086115570586116518825293,0.1031837080882444662721653116932429838926} +#define T_2401_1244 {-0.9935277557059282171891823054465930908918,0.1135896062231992742486141878544003702700} +#define T_2401_1248 {-0.9922843348767533822041286839521490037441,0.1239830583515313250364542341230844613165} +#define T_2401_1252 {-0.9909321895100600130490420269779860973358,0.1343629256632885571320912276860326528549} +#define T_2401_1256 {-0.9894714677603384522797114186687394976616,0.1447280708370063950773953820316819474101} +#define T_2401_1264 {-0.9862249471957739155314470735902432352304,0.1654096536744244227978839489878737367690} +#define T_2401_1268 {-0.9844395041020025383105007676931563764811,0.1757238252582822568559350884243031032383} +#define T_2401_1272 {-0.9825461960283696782880724640563130378723,0.1860187427927102454816576937446370720863} +#define T_2401_1276 {-0.9805452304245321215603325981646776199341,0.1962932782641860773864550537837203592062} +#define T_2401_1280 {-0.9784368265361760297338378222775645554066,0.2065463058924489059275941826854250393808} +#define T_2401_1284 {-0.9762212153809940451409943307226058095694,0.2167767022538510113260912248733802698553} +#define T_2401_1292 {-0.9714693540477948863198776052740868180990,0.2371651200028370987116232981861685402691} +#define T_2401_1296 {-0.9689336245309506656653297795855905860662,0.2473209074326606082827595400885911658406} +#define T_2401_1300 {-0.9662917290125779823384277733566705137491,0.2574495959248770948768481048318790271878} +#define T_2401_1304 {-0.9635439569650169344683376948523800820112,0.2675500756796708179408028627221938222647} +#define T_2401_1308 {-0.9606906094614927615893407164548989385366,0.2776212399880558234777083725930424407125} +#define T_2401_1312 {-0.9577319991431272327986334857996553182602,0.2876619853531379167321802015067078173161} +#define T_2401_1320 {-0.9515002982590431424014809635991696268320,0.3076478220513707562133731698850169777870} +#define T_2401_1324 
{-0.9482278905004321023497482201491948217154,0.3175907235375436399316129154613008722663} +#define T_2401_1328 {-0.9448515854663798174684075092955026775599,0.3274988266263991754101425613043829798698} +#define T_2401_1332 {-0.9413717530984347936495737485529389232397,0.3373710456876517493896017185761593282223} +#define T_2401_1336 {-0.9377887746816293557117205637041479349136,0.3472062990228262924219393426028545945883} +#define T_2401_1340 {-0.9341030428027026211168504232773557305336,0.3570035089837804709311797068949090316892} +#define T_2401_1348 {-0.9264249452546474694614175859896931797266,0.3764795091501307311432356073055416345596} +#define T_2401_1352 {-0.9224334208742258622848453342157881706953,0.3861561653713083686056961596477776765823} +#define T_2401_1356 {-0.9183408255169168699083570572838652879000,0.3957905104836493270781261344382073730230} +#define T_2401_1360 {-0.9141476076081582746724052412901073694229,0.4053814888525136272967586137383477762341} +#define T_2401_1364 {-0.9098542265985954946572178414498921483755,0.4149280495949528302368491949891904368997} +#define T_2401_1368 {-0.9054611529137387426757754838035907596350,0.4244291466948553193105908576399087905884} +// Pre-computed twiddles for N=2500 +#define T_2500_1 {0.9999968417282540933399559435201808810234,-0.0025132714770037269634561649667148230947} +#define T_2500_3 {0.9999715756739829819466081062273588031530,-0.0075397509303570921582182684517192683415} +#define T_2500_7 {0.9998452485944885337332266317389439791441,-0.0175920113410997225322773829248035326600} +#define T_2500_9 {0.9997441907610622457980298349866643548012,-0.0226175383167297565367714895501194405369} +#define T_2500_11 {0.9996178732568780089806637079163920134306,-0.0276424938346043792825046381267384276725} +#define T_2500_13 {0.9994662992734908435465968068456277251244,-0.0326667509335237943313146047330519650131} +#define T_2500_17 {0.9990873978258987264666757255326956510544,-0.0427126621211368767694338544060883577913} +#define T_2500_19 
{0.9988600799350685344180078573117498308420,-0.0477340623884916423480540004220529226586} +#define T_2500_21 {0.9986075247115431263722484800382517278194,-0.0527542566006264715405826848382275784388} +#define T_2500_23 {0.9983297385364165998922203471011016517878,-0.0577731179166413380543509958897629985586} +#define T_2500_27 {0.9976985020430021444681756292993668466806,-0.0678063346683007578397095471700595226139} +#define T_2500_29 {0.9973450676736201891969813004834577441216,-0.0728204366033457090034985981219506356865} +#define T_2500_31 {0.9969664342500459408569213337614201009274,-0.0778326986474765225576533111961907707155} +#define T_2500_33 {0.9965626113388820206395735112891998142004,-0.0828429941602083913521070712704386096448} +#define T_2500_37 {0.9956794385021441318173174295225180685520,-0.0928571792811678370371808455274731386453} +#define T_2500_39 {0.9952001108909334092800236248876899480820,-0.0978608158696515489927136854930722620338} +#define T_2500_41 {0.9946956384202959577933711443620268255472,-0.1028619798936414586609799926009145565331} +#define T_2500_43 {0.9941660338363007554107753094285726547241,-0.1078605449930568199734537415679369587451} +#define T_2500_47 {0.9930314824871324663746463556890375912189,-0.1178493733093558537561307275609578937292} +#define T_2500_49 {0.9924265643876858389305084529041778296232,-0.1228393841471641101392719974683132022619} +#define T_2500_51 {0.9917965715056101805302546381426509469748,-0.1278262913086236074455825928453123196959} +#define T_2500_53 {0.9911415197583902658706733745930250734091,-0.1328099687938694151156937550695147365332} +#define T_2500_57 {0.9897563065037409124613532185321673750877,-0.1427671311474489335147097790468251332641} +#define T_2500_59 {0.9890261799952952959102958629955537617207,-0.1477403644367836499373680680946563370526} +#define T_2500_61 {0.9882710646187887748581601954356301575899,-0.1527098648982631867720982654645922593772} +#define T_2500_63 
{0.9874909794530675188894974780851043760777,-0.1576755069718226431874086301831994205713} +#define T_2500_67 {0.9858559792233186902521424599399324506521,-0.1675947142055242422564731441525509580970} +#define T_2500_69 {0.9850011054694260970165942126186564564705,-0.1725480287456465922080184327569440938532} +#define T_2500_71 {0.9841213445455451180876593753055203706026,-0.1774969836641414056011001321166986599565} +#define T_2500_73 {0.9832167186798331792729754852189216762781,-0.1824414539200502061166986322859884239733} +#define T_2500_77 {0.9813329641761735500082863836723845452070,-0.1923164408499827837673024077957961708307} +#define T_2500_79 {0.9803538831334196457234497756871860474348,-0.1972467080212628731672452886414248496294} +#define T_2500_81 {0.9793500323380199823475322773447260260582,-0.2021719915307735038201286670300760306418} +#define T_2500_83 {0.9783214371534033615418479712388943880796,-0.2070921669356412253559795999535708688200} +#define T_2500_87 {0.9761901181955526807243472831032704561949,-0.2169166963084052934807033352626604028046} +#define T_2500_89 {0.9750874482725092606827388408419210463762,-0.2218208020484253029103172139002708718181} +#define T_2500_91 {0.9739601416592692517681939534668345004320,-0.2267193032343216008150932339049177244306} +#define T_2500_93 {0.9728082268385126463172696276160422712564,-0.2316120760999064442930972518297494389117} +#define T_2500_97 {0.9704306896132062210114099798374809324741,-0.2413799425321770664165654807220562361181} +#define T_2500_99 {0.9692051272798304362154908631055150181055,-0.2462547893026399703497730797607800923288} +#define T_2500_101 {0.9679550768797691162603769043926149606705,-0.2511234141665813490718051070871297270060} +#define T_2500_103 {0.9666805699969629461776321477373130619526,-0.2559856941126726903590338224603328853846} +#define T_2500_107 {0.9640583162076707957055532460799440741539,-0.2656907280106526325624827222782187163830} +#define T_2500_109 
{0.9627106355554005956420837719633709639311,-0.2705332367538536564310902576835360378027} +#define T_2500_111 {0.9613386309271433338707879556750413030386,-0.2753689101680248940695605597284156829119} +#define T_2500_113 {0.9599423369881517897894696034200023859739,-0.2801976260743935798203096965153235942125} +#define T_2500_117 {0.9570770229065711509619518437830265611410,-0.2898336975306610163727327744709327816963} +#define T_2500_119 {0.9556080751593917232966646224667783826590,-0.2946408096142865695732382391724968329072} +#define T_2500_121 {0.9541149828904684770947142169461585581303,-0.2994404772637143663516212654940318316221} +#define T_2500_123 {0.9525977838244708006953942458494566380978,-0.3042325792098980463684654296230291947722} +#define T_2500_127 {0.9494912192443889509263499348890036344528,-0.3137936018751875111298943465953925624490} +#define T_2500_129 {0.9479019322211821307888612864189781248569,-0.3185622810242127456170635468879481777549} +#define T_2500_131 {0.9462886953806719514403766879695467650890,-0.3233229113359364914614957342564594000578} +#define T_2500_133 {0.9446515494831168036071744609216693788767,-0.3280753725276349674899734054633881896734} +#define T_2500_137 {0.9413056965773068940350754019164014607668,-0.3375553074551057575192203330516349524260} +#define T_2500_139 {0.9395970741058202735374038638838101178408,-0.3422825416695712275050311745872022584081} +#define T_2500_141 {0.9378647116486887869157840214029420167208,-0.3470011277274495098410511673137079924345} +#define T_2500_143 {0.9361086529760148655654461435915436595678,-0.3517109464083127967626296594971790909767} +#define T_2500_147 {0.9325256250568850768800643891154322773218,-0.3611038058678774920196019593277014791965} +#define T_2500_149 {0.9306987463396931836712155927671119570732,-0.3657866093253325234790906961279688403010} +#define T_2500_151 {0.9288483524631988563768913991225417703390,-0.3704601707692759537948745673929806798697} +#define T_2500_153 
{0.9269744901797019043243608393822796642780,-0.3751243721168776579233394841139670461416} +#define T_2500_157 {0.9231565503644826309326276714273262768984,-0.3844242233771806849240704195835860446095} +#define T_2500_159 {0.9212125692973394119889007924939505755901,-0.3890596383185932483073088405944872647524} +#define T_2500_161 {0.9192453127499148335388667874212842434645,-0.3936852232270234286382049049279885366559} +#define T_2500_163 {0.9172548304271764818551559983461629599333,-0.3983008612318227892323818650766042992473} +#define T_2500_167 {0.9132043902084862052603853044274728745222,-0.4075018303074807168862037087819771841168} +#define T_2500_169 {0.9111445346514999155118630369543097913265,-0.4120869289054210904410524562990758568048} +#define T_2500_171 {0.9090616579945454134303872706368565559387,-0.4166616156597678921613692182290833443403} +#define T_2500_173 {0.9069558128638635086105068694450892508030,-0.4212257749858722144509215468133334070444} +#define T_2500_177 {0.9026754305865591909707745799096301198006,-0.4303220503476088421024314811802469193935} +#define T_2500_179 {0.9005010015886493190961914478975813835859,-0.4348539365555282865116737411881331354380} +#define T_2500_181 {0.8983038204117045344432312958815600723028,-0.4393748356855865511150227575853932648897} +#define T_2500_183 {0.8960839425699989035933867853600531816483,-0.4438846335121400255019352698582224547863} +#define T_2500_187 {0.8915763218152700231300400446343701332808,-0.4528704697574726933950728380295913666487} +#define T_2500_189 {0.8892886927923970841192158331978134810925,-0.4573462811389086279945104251964949071407} +#define T_2500_191 {0.8869785948821864440816398200695402920246,-0.4618105371478895548875698295887559652328} +#define T_2500_193 {0.8846460864518815858659195328073110431433,-0.4662631249899246155266041569120716303587} +#define T_2500_197 {0.8799140743296104405501978362735826522112,-0.4751328464720838140600278620695462450385} +#define T_2500_199 
{0.8775146901972968871419311653880868107080,-0.4795497560086358079800561426964122802019} +#define T_2500_201 {0.8750931346611736660889846461941488087177,-0.4839545491767599516563791439693886786699} +#define T_2500_203 {0.8726494689045877217736801867431495338678,-0.4883471146843615384725012518174480646849} +#define T_2500_207 {0.8676960542550539168260570477286819368601,-0.4970951190971508171223547378758667036891} +#define T_2500_209 {0.8651864305157446199956439158995635807514,-0.5014503369740861415948529611341655254364} +#define T_2500_211 {0.8626549468600877013102490309393033385277,-0.5057928851395788738543046747508924454451} +#define T_2500_213 {0.8601016672488885905778488449868746101856,-0.5101226538742247296198684125556610524654} +#define T_2500_217 {0.8549299787549532458896806019765790551901,-0.5187434157905574938496329195913858711720} +#define T_2500_219 {0.8523117005407917501358383560727816075087,-0.5230341911589180270425458729732781648636} +#define T_2500_221 {0.8496718877049364015974219910276588052511,-0.5273117514752824197543645823316182941198} +#define T_2500_223 {0.8470106069452519870210949193278793245554,-0.5315759886622380969711798570642713457346} +#define T_2500_227 {0.8416239111562147101608388766180723905563,-0.5400640630241156925350765050097834318876} +#define T_2500_229 {0.8388986322278397800289440056076273322105,-0.5442876857382129562878958495275583118200} +#define T_2500_231 {0.8361521575741531764336400556203443557024,-0.5484975564066704167842658534937072545290} +#define T_2500_233 {0.8333845565879515193685733720485586673021,-0.5526935686623315469034878333332017064095} +#define T_2500_237 {0.8277862558563279238299514872778672724962,-0.5610435942200587833283975669473875313997} +#define T_2500_239 {0.8249556975583226536841152665147092193365,-0.5651973965492598228621545786154456436634} +#define T_2500_241 {0.8221042958190623028968957441975362598896,-0.5693369185252644681938249959785025566816} +#define T_2500_243 
{0.8192321226824456070758628811745438724756,-0.5734620555583554990519701277662534266710} +#define T_2500_247 {0.8134257530149683335096710834477562457323,-0.5816687582568207393052261977572925388813} +#define T_2500_249 {0.8104917031886222922310025751357898116112,-0.5857501165705443213482794817537069320679} +#define T_2500_261 {0.7924599971331904857407835152116604149342,-0.6099238911074591484862139623146504163742} +#define T_2500_267 {0.7831728108260403997675780374265741556883,-0.6218041077243211534053557443257886916399} +#define T_2500_273 {0.7737075377755030514137501995719503611326,-0.6335429314492969643524133971368428319693} +#define T_2500_279 {0.7640663303030451602992911830369848757982,-0.6451376929751027056170187279349192976952} +#define T_2500_291 {0.7442649209095699713856220114394091069698,-0.6678845165921063475877872406272217631340} +#define T_2500_297 {0.7341092216582886242903782658686395734549,-0.6790314062517740500979357420874293893576} +#define T_2500_303 {0.7237865923006904234071612336265388876200,-0.6900238900253774509963022865122184157372} +#define T_2500_309 {0.7132993801136727762823852572182659059763,-0.7008594683169017125479172136692795902491} +#define T_2500_321 {0.6918407829402886122238669486250728368759,-0.7220500890246939196259745585848577320576} +#define T_2500_327 {0.6808742774538681929996641883917618542910,-0.7324003128765530501453895340091548860073} +#define T_2500_333 {0.6697529470283645203210198815213516354561,-0.7425839952132153864994279501843266189098} +#define T_2500_339 {0.6584793205584099640148565413255710154772,-0.7525988203531378539778984304575715214014} +#define T_2500_351 {0.6354854676347906350386551821429748088121,-0.7721128288177779852574644792184699326754} +#define T_2500_357 {0.6237704697849901780415393659495748579502,-0.7816075748252524491022086294833570718765} +#define T_2500_363 {0.6119136319095279663571318451431579887867,-0.7909245900105082993647442890505772083998} +#define T_2500_369 
{0.5999176501510879999656822292308788746595,-0.8000617557646388933179082414426375180483} +#define T_2500_381 {0.5755191971363310399212309675931464880705,-0.8177882694974004662569200263533275574446} +#define T_2500_387 {0.5631222738778264291781283645832445472479,-0.8263735866196754153989445512706879526377} +#define T_2500_393 {0.5505973014704015344733534220722503960133,-0.8347709935146954141060859910794533789158} +#define T_2500_399 {0.5379471279847712894550681994587648659945,-0.8429785806845486728633431994239799678326} +#define T_2500_411 {0.5122827117558800269492280676786322146654,-0.8588168740983388449450330881518311798573} +#define T_2500_417 {0.4992743048795490468094726566050667315722,-0.8664439788509370465874326328048482537270} +#define T_2500_423 {0.4861523673318466909343271709076361730695,-0.8738740617146393274694560204807203263044} +#define T_2500_429 {0.4729198829282125204898079573467839509249,-0.8811054331526765315274474232865031808615} +#define T_2500_441 {0.4461353338177057592517371631402056664228,-0.8949655099049148576639822749712038785219} +#define T_2500_447 {0.4325893596863208401259726088028401136398,-0.9015910635571866560411535829189233481884} +#define T_2500_453 {0.4189450184650349573445282658212818205357,-0.9080116031765956829957531226682476699352} +#define T_2500_459 {0.4052054127593781029936792492662789300084,-0.9142256687878009824288483287091366946697} +#define T_2500_471 {0.3774529259182506923941957666102098301053,-0.9260287731575901348790580414060968905687} +#define T_2500_477 {0.3634463554589605904787674717226764187217,-0.9316151279920259486999611908686347305775} +#define T_2500_483 {0.3493571404326670237772134441911475732923,-0.9369896415802629885405394816189073026180} +#define T_2500_489 {0.3351884846053667099674555629462702199817,-0.9421510918042592663823597831651568412781} +#define T_2500_501 {0.3066257551998819441685384390439139679074,-0.9518301561981014069502293750701937824488} +#define T_2500_507 
{0.2922381765400498543350238378479843959212,-0.9563455694321727484918937989277765154839} +#define T_2500_513 {0.2777841454390272413199625134438974782825,-0.9606435179309280592008235544199123978615} +#define T_2500_519 {0.2632669486188402574455835747357923537493,-0.9647230243779428837669343010929878801107} +#define T_2500_531 {0.2340562757751382949589213922081398777664,-0.9722230504211841761730283906217664480209} +#define T_2500_537 {0.2193694420067258954443190077654435299337,-0.9756418645761659735882176391896791756153} +#define T_2500_543 {0.2046327255189626659337420733209000900388,-0.9788388261847201787801964201207738369703} +#define T_2500_549 {0.1898494773140938784994347088286303915083,-0.9818132082853465725236219441285356879234} +#define T_2500_561 {0.1601568419021817957226261341929784975946,-0.9870915793339134491901631918153725564480} +#define T_2500_567 {0.1452542065443727936635553987798630259931,-0.9893943680257961670321265046368353068829} +#define T_2500_573 {0.1303185416326145262821967207855777814984,-0.9914721769705635567859758339182008057833} +#define T_2500_579 {0.1153532434083082286457866416640172246844,-0.9933245336923797186301499095861800014973} +#define T_2500_591 {0.0853473649056469552132142553091398440301,-0.9963512569890512482473354793910402804613} +#define T_2500_597 {0.0703136077053038016648756070026138331741,-0.9975249353131302454400497481401544064283} +#define T_2500_603 {0.0552638617969271883634263531348551623523,-0.9984717850692077512064770417055115103722} +#define T_2500_609 {0.0402015493629532053021513604562642285600,-0.9991915909518144234269243497692514210939} +#define T_2500_621 {0.0100529271567306524581830728948261821643,-0.9999494680510517818916582655219826847315} +#define T_2500_627 {-0.0050265270788188622791414772450480086263,-0.9999873669329657488447082869242876768112} +#define T_2500_633 {-0.0201048383254575774303773982865095604211,-0.9997978773111629857694993006589356809855} +#define T_2500_639 
{-0.0351785779052378247411247969012038083747,-0.9993810422739491938770584056328516453505} +#define T_2500_651 {-0.0652986333296357579492052991554373875260,-0.9978657667668942021776956607936881482601} +#define T_2500_657 {-0.0803381001333942218467498719292052555829,-0.9967676708576360677938055232516489923000} +#define T_2500_663 {-0.0953592987459886270995568224861926864833,-0.9954429185757831533010175917297601699829} +#define T_2500_669 {-0.1103588134763917733005555987801926676184,-0.9938918111585802739327277777192648500204} +#define T_2500_681 {-0.1402791539557319833164683586801402270794,-0.9901119931429293030689109400555025786161} +#define T_2500_687 {-0.1551931760772660706670933450368465855718,-0.9878841420425021890849848205107264220715} +#define T_2500_693 {-0.1700719086088891551789004097372526302934,-0.9854316546073247362613756195059977471828} +#define T_2500_699 {-0.1849119682551322307872965211572591215372,-0.9827550885118900847459144642925821244717} +#define T_2500_711 {-0.2144625804463383411668786493464722298086,-0.9767322056676015495924048082088120281696} +#define T_2500_717 {-0.2291664134369226779597283893963322043419,-0.9733872584703672226069670614378992468119} +#define T_2500_723 {-0.2438181359613384724394080649290117435157,-0.9698209714046909235563020956760738044977} +#define T_2500_729 {-0.2584144163442346764369972333952318876982,-0.9660341554134970198930432161432690918446} +#define T_2500_741 {-0.2874273877741164162635811862855916842818,-0.9578024309623294207938215549802407622337} +#define T_2500_747 {-0.3018374815215959516834232090332079678774,-0.9533593943255084246501951383834239095449} +#define T_2500_753 {-0.3161789400287130513689248800801578909159,-0.9486995719838390295208796487713698297739} +#define T_2500_759 {-0.3304485021714786974200706026749685406685,-0.9438240235407373557308119416120462119579} +#define T_2500_771 {-0.3587589753488995003571915276552317664027,-0.9334302317830763806583149744255933910608} +#define T_2500_777 
{-0.3727934488260932766046096276113530620933,-0.9279143519271306761453388389782048761845} +#define T_2500_783 {-0.3867431522878577720980786125437589362264,-0.9221874723495493419989088579313829541206} +#define T_2500_789 {-0.4006049136919229147757448572519933804870,-0.9162508952933617889513584486849140375853} +#define T_2500_801 {-0.4280520228609502875194436910533113405108,-0.9037540958273154734214926975255366414785} +#define T_2500_807 {-0.4416311293899813361640838138555409386754,-0.8971967150818874747741915598453488200903} +#define T_2500_813 {-0.4551098128086537575320846826798515394330,-0.8904353195405447785049091180553659796715} +#define T_2500_819 {-0.4684850081805932986434015674603870138526,-0.8834714466863258230944211391033604741096} +#define T_2500_831 {-0.4949127933918132660195965399907436221838,-0.8689426488193063846665609162300825119019} +#define T_2500_837 {-0.5079593737806400444156906814896501600742,-0.8613810275298499075091740451171062886715} +#define T_2500_843 {-0.5208904485883242285737537713430356234312,-0.8536235356229666670913047710200771689415} +#define T_2500_849 {-0.5337030773999673627372430928517132997513,-0.8456719370854187678432367647474166005850} +#define T_2500_861 {-0.5589613707060275826776774010795634239912,-0.8291936963450933228969574884104076772928} +#define T_2500_867 {-0.5714012916823650867215178550395648926497,-0.8206708011521579582137064790003933012486} +#define T_2500_873 {-0.5837112809325288864670255861710757017136,-0.8119612924962041899945575096353422850370} +#define T_2500_879 {-0.5958885392711202877435994196275714784861,-0.8030671508443926187581496378697920590639} +#define T_2500_891 {-0.6198338180109623785085659619653597474098,-0.7847330998817070302209231158485636115074} +#define T_2500_897 {-0.6315963934623227693521130277076736092567,-0.7752973595759157232620850663806777447462} +#define T_2500_903 {-0.6432153493404652033760271478968206793070,-0.7656853233364364630375575870857574045658} +#define T_2500_909 
{-0.6546880435950205301409710045845713466406,-0.7558991768574195857510744644969236105680} +#define T_2500_921 {-0.6771842459182698670261402185133192688227,-0.7358134934071977051317503537575248628855} +#define T_2500_927 {-0.6882026385445545457741900463588535785675,-0.7255185237471978298984254251990932971239} +#define T_2500_933 {-0.6990645398256409848158909881021827459335,-0.7150585774315030285208649729611352086067} +#define T_2500_939 {-0.7097674798588468059179490410315338522196,-0.7044360329617029847781850548926740884781} +#define T_2500_951 {-0.7306867778563879145536930082016624510288,-0.6827128478839764591512562219577375799417} +#define T_2500_957 {-0.7408983789527587848766643219278194010258,-0.6716171469410040506531345272378530353308} +#define T_2500_963 {-0.7509415061469912888725275479373522102833,-0.6603687260505968215085204064962454140186} +#define T_2500_969 {-0.7608138757185697320650774599926080554724,-0.6489701430066630027937435443163849413395} +#define T_2500_981 {-0.7800374017662247139170972332067321985960,-0.6257328917723579131404676445527002215385} +#define T_2500_987 {-0.7893841869782546316614002535061445087194,-0.6138995075325276440381117026845458894968} +#define T_2500_993 {-0.7985514730335249167225697419780772179365,-0.6019265278387284645589261344866827130318} +#define T_2500_999 {-0.8075371753702420551945806437288410961628,-0.5898166752432919546222933604440186172724} +#define T_2500_1011 {-0.8249556975583226536841152665147092193365,-0.5651973965492598228621545786154456436634} +#define T_2500_1017 {-0.8333845565879515193685733720485586673021,-0.5526935686623315469034878333332017064095} +#define T_2500_1023 {-0.8416239111562147101608388766180723905563,-0.5400640630241156925350765050097834318876} +#define T_2500_1029 {-0.8496718877049364015974219910276588052511,-0.5273117514752824197543645823316182941198} +#define T_2500_1041 {-0.8651864305157446199956439158995635807514,-0.5014503369740861415948529611341655254364} +#define T_2500_1047 
{-0.8726494689045877217736801867431495338678,-0.4883471146843615384725012518174480646849} +#define T_2500_1053 {-0.8799140743296104405501978362735826522112,-0.4751328464720838140600278620695462450385} +#define T_2500_1059 {-0.8869785948821864440816398200695402920246,-0.4618105371478895548875698295887559652328} +#define T_2500_1071 {-0.9005010015886493190961914478975813835859,-0.4348539365555282865116737411881331354380} +#define T_2500_1077 {-0.9069558128638635086105068694450892508030,-0.4212257749858722144509215468133334070444} +#define T_2500_1083 {-0.9132043902084862052603853044274728745222,-0.4075018303074807168862037087819771841168} +#define T_2500_1089 {-0.9192453127499148335388667874212842434645,-0.3936852232270234286382049049279885366559} +#define T_2500_1101 {-0.9306987463396931836712155927671119570732,-0.3657866093253325234790906961279688403010} +#define T_2500_1107 {-0.9361086529760148655654461435915436595678,-0.3517109464083127967626296594971790909767} +#define T_2500_1113 {-0.9413056965773068940350754019164014607668,-0.3375553074551057575192203330516349524260} +#define T_2500_1119 {-0.9462886953806719514403766879695467650890,-0.3233229113359364914614957342564594000578} +#define T_2500_1131 {-0.9556080751593917232966646224667783826590,-0.2946408096142865695732382391724968329072} +#define T_2500_1137 {-0.9599423369881517897894696034200023859739,-0.2801976260743935798203096965153235942125} +#define T_2500_1143 {-0.9640583162076707957055532460799440741539,-0.2656907280106526325624827222782187163830} +#define T_2500_1149 {-0.9679550768797691162603769043926149606705,-0.2511234141665813490718051070871297270060} +#define T_2500_1161 {-0.9750874482725092606827388408419210463762,-0.2218208020484253029103172139002708718181} +#define T_2500_1167 {-0.9783214371534033615418479712388943880796,-0.2070921669356412253559795999535708688200} +#define T_2500_1173 {-0.9813329641761735500082863836723845452070,-0.1923164408499827837673024077957961708307} +#define T_2500_1179 
{-0.9841213445455451180876593753055203706026,-0.1774969836641414056011001321166986599565} +#define T_2500_1191 {-0.9890261799952952959102958629955537617207,-0.1477403644367836499373680680946563370526} +#define T_2500_1197 {-0.9911415197583902658706733745930250734091,-0.1328099687938694151156937550695147365332} +#define T_2500_1203 {-0.9930314824871324663746463556890375912189,-0.1178493733093558537561307275609578937292} +#define T_2500_1209 {-0.9946956384202959577933711443620268255472,-0.1028619798936414586609799926009145565331} +#define T_2500_1221 {-0.9973450676736201891969813004834577441216,-0.0728204366033457090034985981219506356865} +#define T_2500_1227 {-0.9983297385364165998922203471011016517878,-0.0577731179166413380543509958897629985586} +#define T_2500_1233 {-0.9990873978258987264666757255326956510544,-0.0427126621211368767694338544060883577913} +#define T_2500_1239 {-0.9996178732568780089806637079163920134306,-0.0276424938346043792825046381267384276725} +#define T_2500_1251 {-0.9999968417282540933399559435201808810234,0.0025132714770037269634561649667148230947} +#define T_2500_1257 {-0.9998452485944885337332266317389439791441,0.0175920113410997225322773829248035326600} +#define T_2500_1263 {-0.9994662992734908435465968068456277251244,0.0326667509335237943313146047330519650131} +#define T_2500_1269 {-0.9988600799350685344180078573117498308420,0.0477340623884916423480540004220529226586} +#define T_2500_1281 {-0.9969664342500459408569213337614201009274,0.0778326986474765225576533111961907707155} +#define T_2500_1287 {-0.9956794385021441318173174295225180685520,0.0928571792811678370371808455274731386453} +#define T_2500_1293 {-0.9941660338363007554107753094285726547241,0.1078605449930568199734537415679369587451} +#define T_2500_1299 {-0.9924265643876858389305084529041778296232,0.1228393841471641101392719974683132022619} +#define T_2500_1311 {-0.9882710646187887748581601954356301575899,0.1527098648982631867720982654645922593772} +#define T_2500_1317 
{-0.9858559792233186902521424599399324506521,0.1675947142055242422564731441525509580970} +#define T_2500_1323 {-0.9832167186798331792729754852189216762781,0.1824414539200502061166986322859884239733} +#define T_2500_1329 {-0.9803538831334196457234497756871860474348,0.1972467080212628731672452886414248496294} +#define T_2500_1341 {-0.9739601416592692517681939534668345004320,0.2267193032343216008150932339049177244306} +#define T_2500_1347 {-0.9704306896132062210114099798374809324741,0.2413799425321770664165654807220562361181} +#define T_2500_1353 {-0.9666805699969629461776321477373130619526,0.2559856941126726903590338224603328853846} +#define T_2500_1359 {-0.9627106355554005956420837719633709639311,0.2705332367538536564310902576835360378027} +#define T_2500_1371 {-0.9541149828904684770947142169461585581303,0.2994404772637143663516212654940318316221} +#define T_2500_1377 {-0.9494912192443889509263499348890036344528,0.3137936018751875111298943465953925624490} +#define T_2500_1383 {-0.9446515494831168036071744609216693788767,0.3280753725276349674899734054633881896734} +#define T_2500_1389 {-0.9395970741058202735374038638838101178408,0.3422825416695712275050311745872022584081} +#define T_2500_1401 {-0.9288483524631988563768913991225417703390,0.3704601707692759537948745673929806798697} +#define T_2500_1407 {-0.9231565503644826309326276714273262768984,0.3844242233771806849240704195835860446095} +#define T_2500_1413 {-0.9172548304271764818551559983461629599333,0.3983008612318227892323818650766042992473} +#define T_2500_1419 {-0.9111445346514999155118630369543097913265,0.4120869289054210904410524562990758568048} +#define T_2500_1431 {-0.8983038204117045344432312958815600723028,0.4393748356855865511150227575853932648897} +#define T_2500_1437 {-0.8915763218152700231300400446343701332808,0.4528704697574726933950728380295913666487} +#define T_2500_1443 {-0.8846460864518815858659195328073110431433,0.4662631249899246155266041569120716303587} +#define T_2500_1449 
{-0.8775146901972968871419311653880868107080,0.4795497560086358079800561426964122802019} +#define T_2500_1461 {-0.8626549468600877013102490309393033385277,0.5057928851395788738543046747508924454451} +#define T_2500_1467 {-0.8549299787549532458896806019765790551901,0.5187434157905574938496329195913858711720} +#define T_2500_1473 {-0.8470106069452519870210949193278793245554,0.5315759886622380969711798570642713457346} +#define T_2500_1479 {-0.8388986322278397800289440056076273322105,0.5442876857382129562878958495275583118200} +#define T_2500_1491 {-0.8221042958190623028968957441975362598896,0.5693369185252644681938249959785025566816} +#define T_2500_1497 {-0.8134257530149683335096710834477562457323,0.5816687582568207393052261977572925388813} +// Pre-computed twiddles for N=3125 +#define T_3125_1 {0.9999979787056996194394287158502265810966,-0.0020106179436128425105922978843864257215} +#define T_3125_2 {0.9999919148309696081966535530227702111006,-0.0040212277591245059252256233151001652004} +#define T_3125_3 {0.9999818084003239127000028929614927619696,-0.0060318213184666685452994805416437884560} +#define T_3125_4 {0.9999676594546185182110775713226757943630,-0.0080423904936367269363772791734845668543} +#define T_3125_6 {0.9999272342631642107591005697031505405903,-0.0120634231799758503966835121445910772309} +#define T_3125_7 {0.9999009581808376845302177571284119039774,-0.0140738704357640138919993688659815234132} +#define T_3125_8 {0.9998706399102956776658857052098028361797,-0.0160842607966839822031257511980584240519} +#define T_3125_9 {0.9998362795741025932372281204152386635542,-0.0180945861355546050441844840861449483782} +#define T_3125_11 {0.9997554332767213391974792102701030671597,-0.0221150092397703161573030428144193137996} +#define T_3125_12 {0.9997089476423616227762636299303267151117,-0.0241250907521987946469099739488228806295} +#define T_3125_13 {0.9996584205960060698359370690013747662306,-0.0261350747368104020795787079123329021968} +#define T_3125_14 
{0.9996038523419146226700604529469273984432,-0.0281449530680667990567567215975941508077} +#define T_3125_16 {0.9994825931092484738726966497779358178377,-0.0321643602705289979870606487111217575148} +#define T_3125_17 {0.9994159026208749851960533305827993899584,-0.0341738728929250543120410554820409743115} +#define T_3125_18 {0.9993451719051660298021033668192103505135,-0.0361832473644121063727219222982967039570} +#define T_3125_19 {0.9992704012480567721254942625819239765406,-0.0381924755619158298736159906638931715861} +#define T_3125_21 {0.9991087413350364654363033878325950354338,-0.0422104606456653874690942984670982696116} +#define T_3125_22 {0.9990218527326497621743328636512160301208,-0.0442192012888505334156086234997928841040} +#define T_3125_23 {0.9989309254959092276848764413443859666586,-0.0462277631719966164425272836524527519941} +#define T_3125_24 {0.9988359599923963827450279495678842067719,-0.0482361381753142540174117414153442950919} +#define T_3125_26 {0.9986339157370026642368543434713501483202,-0.0522522950671169342240496291651652427390} +#define T_3125_27 {0.9985268378019036550341525071416981518269,-0.0542600607199319046047847336922131944448} +#define T_3125_28 {0.9984157232335926224209288193378597497940,-0.0562676070216439269677799472901824628934} +#define T_3125_29 {0.9983005724812599179784911029855720698833,-0.0582749258565692096478727535213693045080} +#define T_3125_31 {0.9980581643028729743605254043359309434891,-0.0622888486679549560620650083819782594219} +#define T_3125_32 {0.9979309078567751845056932324951048940420,-0.0642954364177768245003363745126989670098} +#define T_3125_33 {0.9977996171865649577270573900023009628057,-0.0663017642476003282725471876801748294383} +#define T_3125_34 {0.9976642928229964013553399126976728439331,-0.0683078240466674785347223064491117838770} +#define T_3125_36 {0.9973815452203315112811310427787248045206,-0.0723191071149511255367059447962674312294} +#define T_3125_37 
{0.9972341231242675130275188166706357151270,-0.0743243141682004004877626357483677566051} +#define T_3125_38 {0.9970826696209049710972749380744062364101,-0.0763292207588244647764597061723179649562} +#define T_3125_39 {0.9969271853225081247984462606837041676044,-0.0783338187818107822835145270801149308681} +#define T_3125_41 {0.9966041268711401945878947117307689040899,-0.0823420567110898643514715899982547853142} +#define T_3125_42 {0.9964365540241614249694634963816497474909,-0.0843456804137256710429682016183505766094} +#define T_3125_43 {0.9962649529941279835654199814598541706800,-0.0863489631414753183724641871776839252561} +#define T_3125_44 {0.9960893244747522912163617547776084393263,-0.0883518967958908663096195823527523316443} +#define T_3125_46 {0.9957259878242276096571572452376130968332,-0.0923566844980165807532657140654919203371} +#define T_3125_47 {0.9955382811618993521562970272498205304146,-0.0943585223560176045154435087169986218214} +#define T_3125_48 {0.9953465499478639788577538638492114841938,-0.0963599787613317743106833290767099242657} +#define T_3125_49 {0.9951507949572119260395197670732159167528,-0.0983610456228942303669171565161377657205} +#define T_3125_51 {0.9947472168277486925802577388822101056576,-0.1023619783584111264662652729384717531502} +#define T_3125_52 {0.9945393953204378911792105100175831466913,-0.1043618280582404994616396720630291383713} +#define T_3125_53 {0.9943275532995045074002860019390936940908,-0.1063612558661334028808909124563797377050} +#define T_3125_54 {0.9941116916213386112133321148576214909554,-0.1083602536992257781012227724204421974719} +#define T_3125_56 {0.9936679128001156824367967601574491709471,-0.1123569271182763978922380943004100117832} +#define T_3125_57 {0.9934399974510738084987337970233056694269,-0.1143545865473282324709103363602480385453} +#define T_3125_58 {0.9932080660328227450506233253690879791975,-0.1163517836878320416138521409266104456037} +#define T_3125_59 
{0.9929721194829655983582483713689725846052,-0.1183485104659414421934826577853527851403} +#define T_3125_61 {0.9924881848200004519355843513039872050285,-0.1223405206491314234362732804584084078670} +#define T_3125_62 {0.9922401986632412018707327661104500293732,-0.1243357879161571671877695166585908737034} +#define T_3125_63 {0.9919882012875655963313192842178978025913,-0.1263305525447439847663844147973577491939} +#define T_3125_64 {0.9917321937116951868773639944265596568584,-0.1283248064708791824628519862017128616571} +#define T_3125_66 {0.9912081521153112140254393125360365957022,-0.1323117499700989796451722213532775640488} +#define T_3125_67 {0.9909401202132821495638381747994571924210,-0.1343044234256112745562461441295454278588} +#define T_3125_68 {0.9906680823480190634100495117309037595987,-0.1362965539435923578714948689594166353345} +#define T_3125_69 {0.9903920396192590347439477227453608065844,-0.1382881334706781706334055570550845004618} +#define T_3125_71 {0.9898279440511426052751176030142232775688,-0.1422696073498770641396760083807748742402} +#define T_3125_72 {0.9895398934921924016805405699415132403374,-0.1442594856065292363567209577013272792101} +#define T_3125_73 {0.9892478426305488792280584675609134137630,-0.1462487806814293522794656610130914486945} +#define T_3125_74 {0.9889517926468532893480301027011591941118,-0.1482374845326757950569884769720374606550} +#define T_3125_76 {0.9883477001167012554461166473629418760538,-0.1522130864085841916999441991720232181251} +#define T_3125_77 {0.9880396600123424466488586404011584818363,-0.1541999683615233041056313822991796769202} +#define T_3125_78 {0.9877276256701168488660869115847162902355,-0.1561862269474280917602726503901067189872} +#define T_3125_79 {0.9874115983514510919150097834062762558460,-0.1581718541366722019514412522767088375986} +#define T_3125_81 {0.9867675699112092857490097230765968561172,-0.1621411822194679941411266099748900160193} +#define T_3125_82 
{0.9864395713931752851522105629555881023407,-0.1641248670666592412992201843735529109836} +#define T_3125_83 {0.9861075851057746755046196085459087044001,-0.1661078884245337783376328388840192928910} +#define T_3125_84 {0.9857716123910913452732529549393802881241,-0.1680902382765520342022824706873507238925} +#define T_3125_86 {0.9850877131287842924933784161112271249294,-0.1720528914104672013607455482997465878725} +#define T_3125_87 {0.9847397893458840201219572918489575386047,-0.1740331786729877039388014736687182448804} +#define T_3125_88 {0.9843878846651364922237803511961828917265,-0.1760127623909639171806418289634166285396} +#define T_3125_89 {0.9840320005091477417735745802929159253836,-0.1779916345617533202094762145861750468612} +#define T_3125_91 {0.9833082995423005900903490328346379101276,-0.1819472122656161094944593514810549095273} +#define T_3125_92 {0.9829404856570675974580808542668819427490,-0.1839239018079161513963271090688067488372} +#define T_3125_93 {0.9825686981478320980443186272168532013893,-0.1858998478215473648678113249843590892851} +#define T_3125_94 {0.9821929385175778293159964960068464279175,-0.1878750423185728546382478043597075156868} +#define T_3125_96 {0.9814295089862313803408255807880777865648,-0.1918231448262820715289223016952746547759} +#define T_3125_97 {0.9810418421713706926823306275764480233192,-0.1937960368764116148021514618449145928025} +#define T_3125_98 {0.9806502094079417863881076300458516925573,-0.1957681454888915706380458914281916804612} +#define T_3125_99 {0.9802546122791551130859488694113679230213,-0.1977394626912981900890287079164409078658} +#define T_3125_101 {0.9794515313384734023216537934786174446344,-0.2016796909922251301860995909009943716228} +#define T_3125_102 {0.9790440507731043018324612603464629501104,-0.2036485861620234427693532097691786475480} +#define T_3125_103 {0.9786326123354158745826225640485063195229,-0.2056166580643687402663033481076126918197} +#define T_3125_104 
{0.9782172176886845260668224000255577266216,-0.2075838987431560256347040649416157975793} +#define T_3125_106 {0.9773745665011578376279999247344676405191,-0.2115158546224699953608450186948175542057} +#define T_3125_107 {0.9769473133668545861141296882124152034521,-0.2134805539277166430700560795230558142066} +#define T_3125_108 {0.9765161108364788233870967815164476633072,-0.2154443902189095150223607788575463928282} +#define T_3125_109 {0.9760809606532049320648525281285401433706,-0.2174073555570663618397020400152541697025} +#define T_3125_111 {0.9751988243804468048381295375293120741844,-0.2213306416359798189930074840958695858717} +#define T_3125_112 {0.9747518418570766396413773691165260970592,-0.2232909465165048568113803639789693988860} +#define T_3125_113 {0.9743009188130220543655468645738437771797,-0.2252503487235948398392793023958802223206} +#define T_3125_114 {0.9738460570711792163578479630814399570227,-0.2272088403361927355650351501026307232678} +#define T_3125_116 {0.9729245248653198840926847879018168896437,-0.2311230601121219974114495698813698254526} +#define T_3125_117 {0.9724578581266789534254257887369021773338,-0.2330787724518731118816816660910262726247} +#define T_3125_118 {0.9719872601409859091958765020535793155432,-0.2350335425500356123507117445115000009537} +#define T_3125_119 {0.9715127328106749615699300193227827548981,-0.2369873625042782094851645524613559246063} +#define T_3125_121 {0.9705518978053511158776700540329329669476,-0.2408921203909155095423955117439618334174} +#define T_3125_122 {0.9700655940145989086786926236527506262064,-0.2428430425379804746821577055015950463712} +#define T_3125_123 {0.9695753686477343036287379618443083018064,-0.2447929829705298987452266601394512690604} +#define T_3125_124 {0.9690812236865368189953073851938825100660,-0.2467419338057568189981338946381583809853} +#define T_3125_126 {0.9680811829874793605910099358879961073399,-0.2506368351730497434637356946041109040380} +#define T_3125_127 
{0.9675752912923726301031024377152789384127,-0.2525827699596319142472111707320436835289} +#define T_3125_128 {0.9670654880884228532522683963179588317871,-0.2545276836579875023858221538830548524857} +#define T_3125_129 {0.9665517754365546831962774376734159886837,-0.2564715684056305122773267157754162326455} +#define T_3125_131 {0.9655126301117747944502411883149761706591,-0.2603562196196650413426709747000131756067} +#define T_3125_132 {0.9649872016396999763188091492338571697474,-0.2622969703820098308355568406113889068365} +#define T_3125_133 {0.9644578721213640237763797813386190682650,-0.2642366607856120874764371819765074178576} +#define T_3125_134 {0.9639246436966282249869664155994541943073,-0.2661752829891015359464745415607467293739} +#define T_3125_136 {0.9628464987662028740089681377867236733437,-0.2700492914518839437931774227763526141644} +#define T_3125_137 {0.9623015866190096634369410821818746626377,-0.2719846620501544798287341109244152903557} +#define T_3125_138 {0.9617527842823917483272566641971934586763,-0.2739189331263305327013313217321410775185} +#define T_3125_139 {0.9612000939749314465387897143955342471600,-0.2758520968609500245705135057505685836077} +#define T_3125_141 {0.9600830584003897660849702333507593721151,-0.2797150710500845849360018746665446087718} +#define T_3125_142 {0.9595187176490236335979489012970589101315,-0.2816448658881842703749498468823730945587} +#define T_3125_143 {0.9589504979582272881089011207222938537598,-0.2835735221519596338524138445791322737932} +#define T_3125_144 {0.9583784016250791459867741650668904185295,-0.2855010320446468297106434874876867979765} +#define T_3125_146 {0.9572225882983903533229863569431472569704,-0.2893525815529045530638541094958782196045} +#define T_3125_147 {0.9566388759773274408360066445311531424522,-0.2912766055982449198147321567375911399722} +#define T_3125_148 {0.9560512963588494450561938720056787133217,-0.2931994521320998581259686943667475134134} +#define T_3125_149 
{0.9554598518182989552727235604834277182817,-0.2951211133811917908253974474064307287335} +#define T_3125_151 {0.9542653775504630342396694686613045632839,-0.2989608489559664183943255011399742215872} +#define T_3125_152 {0.9536623526519457660910461527237202972174,-0.3008789077591778049480808476801030337811} +#define T_3125_153 {0.9530554724888725681708478987275157123804,-0.3027957502327464633928855164413107559085} +#define T_3125_154 {0.9524447395146103945151594416529405862093,-0.3047113686276669008989870235382113605738} +#define T_3125_156 {0.9512117250238529830852485247305594384670,-0.3085389022103143252806489726935978978872} +#define T_3125_157 {0.9505894484919280573720357097045052796602,-0.3104508019248977124249222470098175108433} +#define T_3125_158 {0.9499633291179345739863038033945485949516,-0.3123614466146081292130531892325961962342} +#define T_3125_159 {0.9493333694330156058427405696420464664698,-0.3142708285554951519458199982182122766972} +#define T_3125_161 {0.9480619393325872001909715436340775340796,-0.3180857733205522119845909401192329823971} +#define T_3125_162 {0.9474204740569465155175521431374363601208,-0.3199913207224700517627979934331960976124} +#define T_3125_163 {0.9467751787500973126299186333199031651020,-0.3218955745311223859417282255890313535929} +#define T_3125_164 {0.9461260560207028769141857083013746887445,-0.3237985270483944577613044657482532784343} +#define T_3125_166 {0.9448163388062852385829160084540490061045,-0.3276004974426728888481363810569746419787} +#define T_3125_167 {0.9441557496159099782673251866071950644255,-0.3294994999498771504065075532707851380110} +#define T_3125_168 {0.9434913435922639424191515900020021945238,-0.3313971704261588491213785800937330350280} +#define T_3125_169 {0.9428231234212672839944957559055183082819,-0.3332935012000171060009279244695790112019} +#define T_3125_171 {0.9414752514579873832190060056746006011963,-0.3370821129815692596132237213168991729617} +#define T_3125_172 
{0.9407956051145959230908033532614354044199,-0.3389743786734643604674488415184896439314} +#define T_3125_173 {0.9401121555216156444600983377313241362572,-0.3408652740314002671517812359525123611093} +#define T_3125_174 {0.9394249054419518962788515636930242180824,-0.3427547914112649696249945918680168688297} +#define T_3125_176 {0.9380390149510041686298222884943243116140,-0.3465296616882165281481320562306791543961} +#define T_3125_177 {0.9373403801423052517094447466661222279072,-0.3484149993250558741131328588380711153150} +#define T_3125_178 {0.9366379560520705638282379368320107460022,-0.3502989284633905575638834761775797232985} +#define T_3125_179 {0.9359317455199115398656317665881942957640,-0.3521814414872701526348919287556782364845} +#define T_3125_181 {0.9345079765647911207437914526963140815496,-0.3559421887565168973743823244149098172784} +#define T_3125_182 {0.9337904238975416326695722091244533658028,-0.3578204077987299780438945617788704112172} +#define T_3125_183 {0.9330690962997690851565835146175231784582,-0.3596971803202413942024406878772424533963} +#define T_3125_184 {0.9323439966875040063598589767934754490852,-0.3615724987340318552497819837299175560474} +#define T_3125_186 {0.9308824931598510543651059379044454544783,-0.3653187429197957691862086448963964357972} +#define T_3125_187 {0.9301460951527206200140085456951055675745,-0.3671896535472451428283591212675673887134} +#define T_3125_188 {0.9294059369475888843226130120456218719482,-0.3690590797779866871763942981488071382046} +#define T_3125_189 {0.9286620215366109265531235905655194073915,-0.3709270140546993554941934689850313588977} +#define T_3125_191 {0.9271629311416688112856832049146760255098,-0.3746583765469405080672515850892523303628} +#define T_3125_192 {0.9264077622179104354316336866759229451418,-0.3765217896781055118005099302536109462380} +#define T_3125_193 {0.9256488482086927538361464939953293651342,-0.3783836806865756297035829902597470209002} +#define T_3125_194 
{0.9248861921819930831389910963480360805988,-0.3802440420454914482562003286147955805063} +#define T_3125_196 {0.9233496664236803264103059518674854189157,-0.3839601457381718874195541957305977120996} +#define T_3125_197 {0.9225758029036089657992647516948636621237,-0.3858158730492581289794884469301905483007} +#define T_3125_198 {0.9217982097891134207756635987607296556234,-0.3876700406654939312645069549034815281630} +#define T_3125_199 {0.9210168902236826049545470596058294177055,-0.3895226410912425163424188667704584077001} +#define T_3125_201 {0.9194430843892817950546714200754649937153,-0.3932231104204381155753367238503415137529} +#define T_3125_202 {0.9186506044825611949633525910030584782362,-0.3950709643644100821369136156135937198997} +#define T_3125_203 {0.9178544108493787989644374647468794137239,-0.3969172211990049659036117191135417670012} +#define T_3125_204 {0.9170545067084178159078078351740259677172,-0.3987618734605659631675678156170761212707} +#define T_3125_206 {0.9154435798528809398177941147878300398588,-0.4024463344424223376627480774914147332311} +#define T_3125_207 {0.9146325636506195122521489793143700808287,-0.4042861282679576784282460266695125028491} +#define T_3125_208 {0.9138178499651823383231885600253008306026,-0.4061242877309994003809379137237556278706} +#define T_3125_209 {0.9129994420901216267694167072477284818888,-0.4079608054006250705825209479371551424265} +#define T_3125_211 {0.9113515570199959281438850666745565831661,-0.4116288856691536213183724157715914770961} +#define T_3125_212 {0.9105220864866522267888626629428472369909,-0.4134604334395170499583116452413378283381} +#define T_3125_213 {0.9096889350871010249122150526090990751982,-0.4152903097594453640795109095051884651184} +#define T_3125_214 {0.9088521061894305752559830580139532685280,-0.4171185072315014008559330704883905127645} +#define T_3125_216 {0.9071674294464044940866642718901857733727,-0.4207698360762124889511426317767472937703} +#define T_3125_217 
{0.9063195884115039513417855232546571642160,-0.4225929526880470854521831824968103319407} +#define T_3125_218 {0.9054680834993666938004253097460605204105,-0.4244143609304284470162826892192242667079} +#define T_3125_219 {0.9046129181522766238288113527232781052589,-0.4262340534401522496033010156679665669799} +#define T_3125_221 {0.9028916199963485933466245114686898887157,-0.4298682618435202273943218642671126872301} +#define T_3125_222 {0.9020254941460109643358578068728093057871,-0.4316827630455549735799536392732989042997} +#define T_3125_223 {0.9011557217776930972519267015741206705570,-0.4334955191317726441724289543344639241695} +#define T_3125_224 {0.9002823064075267112116307544056326150894,-0.4353065227739462139666670736914966255426} +#define T_3125_226 {0.8985245607997978112280179630033671855927,-0.4389232434487041478732294308429118245840} +#define T_3125_227 {0.8976402376680776562523078609956428408623,-0.4407289458603747545062390145176323130727} +#define T_3125_228 {0.8967522857461649454791086100158281624317,-0.4425328665862328092472921525768470019102} +#define T_3125_229 {0.8958607086236841654525164813094306737185,-0.4443349983337688824391875641595106571913} +#define T_3125_231 {0.8940666932087770746306887303944677114487,-0.4479338657600277495696161622618092224002} +#define T_3125_232 {0.8931642621688168359739279367204289883375,-0.4497305868900101089025156397838145494461} +#define T_3125_233 {0.8922582204331918065420836683188099414110,-0.4515254899442484592952951061306521296501} +#define T_3125_234 {0.8913485716646558643105890951119363307953,-0.4533185676666882102381350705400109291077} +#define T_3125_236 {0.8895184677527612215897079295245930552483,-0.4568992181288779041992142992967274039984} +#define T_3125_237 {0.8885980200077596702357141111860983073711,-0.4586867763935311148593143570906249806285} +#define T_3125_238 {0.8876739800265316393534931194153614342213,-0.4604724803762507701065942455898039042950} +#define T_3125_239 
{0.8867463515445908051049173081992194056511,-0.4622563228581703098640787175099831074476} +#define T_3125_241 {0.8848803440931434227678664683480747044086,-0.4658183944817981259056693943421123549342} +#define T_3125_242 {0.8839419726671373656046171163325197994709,-0.4675966092235163129586794639180880039930} +#define T_3125_243 {0.8830000278273887337476821812742855399847,-0.4693729336645122707238897419301792979240} +#define T_3125_244 {0.8820545133817931926500932604540139436722,-0.4711473606238370526178016461926745250821} +#define T_3125_246 {0.8801527909767796753470747717074118554592,-0.4746904934120601549096818416728638112545} +#define T_3125_247 {0.8791965907052431417056936879816930741072,-0.4764591849175301985219732614496024325490} +#define T_3125_248 {0.8782368362035910536178562324494123458862,-0.4782259502945305218446492290240712463856} +#define T_3125_249 {0.8772735313517161515051157039124518632889,-0.4799907824007556000900365233974298462272} +#define T_3125_251 {0.8753362861886154755453048892377410084009,-0.4835146182707635853503802536579314619303} +#define T_3125_252 {0.8743623537088749397838682853034697473049,-0.4852736077891278143603415173856774345040} +#define T_3125_253 {0.8733848865418503137902916932944208383560,-0.4870306355459368408133968841866590082645} +#define T_3125_254 {0.8724038886390392821823525082436390221119,-0.4887856944382504287283097710314905270934} +#define T_3125_256 {0.8704313165033988886065685619541909545660,-0.4922898772574547110458809129340806975961} +#define T_3125_257 {0.8694397502448671932739898693398572504520,-0.4940389870183759524202571355999680235982} +#define T_3125_258 {0.8684446691991120070852616663614753633738,-0.4957860995829199146633925465721404179931} +#define T_3125_259 {0.8674460773888366116324277754756622016430,-0.4975312078882292432524536707205697894096} +#define T_3125_261 {0.8654383776364853453344494482735171914101,-0.5010153835102553188107776804827153682709} +#define T_3125_262 
{0.8644292778107135477583256033540237694979,-0.5027544367418832482741208877996541559696} +#define T_3125_263 {0.8634166834529969358769108112028334289789,-0.5044914575441562520907723410346079617739} +#define T_3125_264 {0.8624005986568381576518049769219942390919,-0.5062264388950138416234381111280526965857} +#define T_3125_266 {0.8603579741937387170835904726118315011263,-0.5096902551954921589683067395526450127363} +#define T_3125_267 {0.8593314427842885239172687761310953646898,-0.5114190761423287323594877307186834514141} +#define T_3125_268 {0.8583014374513434630031838423747103661299,-0.5131458296322377599096853373339399695396} +#define T_3125_269 {0.8572679623587912134397015506692696362734,-0.5148705086846654399579392702435143291950} +#define T_3125_271 {0.8551906196205338872573520347941666841507,-0.5183136155968193214960137993330135941505} +#define T_3125_272 {0.8541467603726708990308225111220963299274,-0.5200320295374807333388389452011324465275} +#define T_3125_273 {0.8530994481608509261150175007060170173645,-0.5217483412025873956707755496609024703503} +#define T_3125_274 {0.8520486872189265703525506978621706366539,-0.5234625436537975184947413254121784120798} +#define T_3125_276 {0.8499368361498657042929494309646543115377,-0.5268845932038214163739553441700991243124} +#define T_3125_277 {0.8488757545600743581459823872137349098921,-0.5285924264686966411730395520862657576799} +#define T_3125_278 {0.8478112413148340964497151617251802235842,-0.5302981228518540701699635064869653433561} +#define T_3125_279 {0.8467433007175340087258064158959314227104,-0.5320016754578651418938761707977391779423} +#define T_3125_281 {0.8445971547495705333830073868739418685436,-0.5354023218000927286652768088970333337784} +#define T_3125_282 {0.8435189580548921872704681845789309591055,-0.5370994017888950367023426224477589130402} +#define T_3125_283 {0.8424373513600895702069237813702784478664,-0.5387943105057783066413890082912985235453} +#define T_3125_284 
{0.8413523390376533850343321319087408483028,-0.5404870410989236484411435412766877561808} +#define T_3125_286 {0.8391721150686640706695129665604326874018,-0.5438659405507801913870480348123237490654} +#define T_3125_287 {0.8380769122358595613064835561090148985386,-0.5455520957499909728838360933877993375063} +#define T_3125_288 {0.8369783214028830853337126427504699677229,-0.5472360455065182760492348279512953013182} +#define T_3125_289 {0.8358763470108853121942615871375892311335,-0.5489177830128461055281263725191820412874} +#define T_3125_291 {0.8336622653828029694267343074898235499859,-0.5522745940895824601213348614692222326994} +#define T_3125_292 {0.8325501670973394219288366002729162573814,-0.5539496540897848042561690817819908261299} +#define T_3125_293 {0.8314347031540608723432228543970268219709,-0.5556224746994300467406446841778233647346} +#define T_3125_294 {0.8303158780623289825228994232020340859890,-0.5572930491559927590472511838015634566545} +#define T_3125_296 {0.8280681625388729427683642825286369770765,-0.5606274326051968737161246281175408512354} +#define T_3125_297 {0.8269392811937380516340567737643141299486,-0.5622912281182979032934099450358189642429} +#define T_3125_298 {0.8258070568732914784604304259119089692831,-0.5639527505192898360064646112732589244843} +#define T_3125_299 {0.8246714941546503530034328832698520272970,-0.5656119930913210414047398444381542503834} +#define T_3125_301 {0.8223903718987130018192033276136498898268,-0.5689236119272046421357913459360133856535} +#define T_3125_302 {0.8212448215830556730310263446881435811520,-0.5705759748035444944846972248342353850603} +#define T_3125_303 {0.8200959513124441713216583593748509883881,-0.5722260310759527124346845994296018034220} +#define T_3125_304 {0.8189437657312883533933245416847057640553,-0.5738737740739305870363295980496332049370} +#define T_3125_306 {0.8166294672819763844628937476954888552427,-0.5771622936113857127438109273498412221670} +#define T_3125_307 
{0.8154673637695768473321322744595818221569,-0.5788030568567313904893012477259617298841} +#define T_3125_308 {0.8143019636581082520976337946194689720869,-0.5804414802394372019378465665795374661684} +#define T_3125_309 {0.8131332716588036690907870251976419240236,-0.5820775571360317091063052430399693548679} +#define T_3125_311 {0.8107860309081392768959517525217961519957,-0.5853426450244555434565540963376406580210} +#define T_3125_312 {0.8096074916457082215615059794799890369177,-0.5869716428168779698282264689623843878508} +#define T_3125_313 {0.8084256794732602724451453468645922839642,-0.5885982677244280258932462857046630233526} +#define T_3125_314 {0.8072405991683758719190677766164299100637,-0.5902225131713306760516957183426711708307} +#define T_3125_316 {0.8048606533376586602557267724478151649237,-0.5934638394282142348501452033815439790487} +#define T_3125_317 {0.8036657974329676212832396231533493846655,-0.5950809071348468481943427832447923719883} +#define T_3125_318 {0.8024676926380851504916336125461384654045,-0.5966955691741875345002199537702836096287} +#define T_3125_319 {0.8012663437964558932335989993589464575052,-0.5983078190188222400536233180901035666466} +#define T_3125_321 {0.7988539334122881641064850555267184972763,-0.6015250560630998055700047189020551741123} +#define T_3125_322 {0.7976428816221325934776587018859572708607,-0.6031300302567769167083611137059051543474} +#define T_3125_323 {0.7964286052899561818563256565539631992579,-0.6047325662438688409494602638005744665861} +#define T_3125_324 {0.7952111093245786133110186710837297141552,-0.6063326575459819789415405466570518910885} +#define T_3125_326 {0.7927664781945572558541357466310728341341,-0.6095254802311372843703907165036071091890} +#define T_3125_327 {0.7915393529125513971678174129920080304146,-0.6111181987069108645371784405142534524202} +#define T_3125_328 {0.7903090277625803850014563067816197872162,-0.6127084466832206643971403536852449178696} +#define T_3125_329 
{0.7890755077183425614251177648839075118303,-0.6142962177313482374785280626383610069752} +#define T_3125_331 {0.7865989029064184290263028742629103362560,-0.6174643033782754031335571198724210262299} +#define T_3125_332 {0.7853558281506265448257408934296108782291,-0.6190446051698080820457903428177814930677} +#define T_3125_333 {0.7841095785243161753186313944752328097820,-0.6206224044186765542718831056845374405384} +#define T_3125_334 {0.7828601590655618336356269537645857781172,-0.6221976947464874507787158108840230852365} +#define T_3125_336 {0.7803518308670708281127303962421137839556,-0.6253407231761022311289366371056530624628} +#define T_3125_337 {0.7790929322674731150044635796803049743176,-0.6269084485719350618992962154152337461710} +#define T_3125_338 {0.7778308841156683639539437535859178751707,-0.6284736396348200582195886454428546130657} +#define T_3125_339 {0.7765656915135982218956200995307881385088,-0.6300362900373335239834204912767745554447} +#define T_3125_341 {0.7740258934299651949473286549618933349848,-0.6331539436029315348619661563134286552668} +#define T_3125_342 {0.7727512982157611620692705400870181620121,-0.6347089341626253045092198590282350778580} +#define T_3125_343 {0.7714735790859676978215020426432602107525,-0.6362613588552169519019230392586905509233} +#define T_3125_344 {0.7701927412058775512804231766494922339916,-0.6378112114048921643671974379685707390308} +#define T_3125_346 {0.7676217299189967979700099931505974382162,-0.6409031750242518699067773013666737824678} +#define T_3125_347 {0.7663315669057469969871476678235922008753,-0.6424452735943992909284361303434707224369} +#define T_3125_348 {0.7650383059292404919204955149325542151928,-0.6439847750226070921542032010620459914207} +#define T_3125_349 {0.7637419522175991071222256323380861431360,-0.6455216730853042816917763957462739199400} +#define T_3125_351 {0.7611399875638927836618563560477923601866,-0.6485876342725299670277649966010358184576} +#define T_3125_352 
{0.7598343871405003335794958729820791631937,-0.6501166850026387500349756010109558701515} +#define T_3125_353 {0.7585257150192760144946646505559328943491,-0.6516431075784475668299933204252738505602} +#define T_3125_354 {0.7572139764906425929424926835054066032171,-0.6531668958292577986668447920237667858601} +#define T_3125_356 {0.7545813214348007225140690934495069086552,-0.6562065447263611961403739769593812525272} +#define T_3125_357 {0.7532604155503335174870471746544353663921,-0.6577223930846043398190658990642987191677} +#define T_3125_358 {0.7519364645438969541046958511287812143564,-0.6592355825417987036374256604176480323076} +#define T_3125_359 {0.7506094737676803907433509266411419957876,-0.6607461069807418763133455286151729524136} +#define T_3125_361 {0.7479463943760847888242437875305768102407,-0.6637591363889572271261840796796604990959} +#define T_3125_362 {0.7466103165264401653189452190417796373367,-0.6652616291777910806359841444646008312702} +#define T_3125_363 {0.7452712204384407401747125732072163373232,-0.6667614325875462766290979743644129484892} +#define T_3125_364 {0.7439291115255011188622802364989183843136,-0.6682585405551346724450922920368611812592} +#define T_3125_366 {0.7412358769393354585020006197737529873848,-0.6712446459659654474094736542610917240381} +#define T_3125_367 {0.7398847621537489871457182744052261114120,-0.6727336313376122145513136274530552327633} +#define T_3125_368 {0.7385306563184570460833811011980287730694,-0.6742198971239494742491160650388337671757} +#define T_3125_369 {0.7371735649075524809958892547001596540213,-0.6757034373166163065960176936641801148653} +#define T_3125_371 {0.7344504473156021617796795908361673355103,-0.6786623169426102553103419268154539167881} +#define T_3125_372 {0.7330844321430003418527121539227664470673,-0.6801376444144042743289446661947295069695} +#define T_3125_373 {0.7317154534116298014723156484251376241446,-0.6816102223695100770228805231454316526651} +#define T_3125_374 
{0.7303435166557081803517803564318455755711,-0.6830800448549008141441163388662971556187} +#define T_3125_376 {0.7275907912668513333187547686975449323654,-0.6860113996601494568139401053485926240683} +#define T_3125_377 {0.7262100137620521866921308173914439976215,-0.6874729201297459235675546551647130399942} +#define T_3125_378 {0.7248263004889297089761157621978782117367,-0.6889316614291520757262787810759618878365} +#define T_3125_379 {0.7234396570412673366234912464278750121593,-0.6903876176612769866025587361946236342192} +#define T_3125_381 {0.7206576020566615170537261292338371276855,-0.6932911513916375501054289998137392103672} +#define T_3125_382 {0.7192622017664218603272274776827543973923,-0.6947387171520809090452530654147267341614} +#define T_3125_383 {0.7178638937950043708013936338829807937145,-0.6961834743697056238076470435771625488997} +#define T_3125_384 {0.7164626837951929561043584726576227694750,-0.6976254172039528800297603083890862762928} +#define T_3125_386 {0.7136515803801591895094702522328589111567,-0.7005008364169890144879104809660930186510} +#define T_3125_387 {0.7122416983290714709298185880470555275679,-0.7019343011716409064248978211253415793180} +#define T_3125_388 {0.7108289369778131305821489149820990860462,-0.7033649282946884406797494193597231060266} +#define T_3125_389 {0.7094133020375968934345678462705109268427,-0.7047927120026946656849986538873054087162} +#define T_3125_391 {0.7065734342932037392870370240416377782822,-0.7076397260973324998545308517350349575281} +#define T_3125_392 {0.7051492129694435506692684612062294036150,-0.7090589449746574901567441884253639727831} +#define T_3125_393 {0.7037221410175132918496387901541311293840,-0.7104752974183741187630403146613389253616} +#define T_3125_394 {0.7022922242064775222303296686732210218906,-0.7118887777027523844353140702878590673208} +#define T_3125_396 {0.6994238791408284861006450228160247206688,-0.7147070989486502545773305428156163543463} +#define T_3125_397 
{0.6979854624817544150872095087834168225527,-0.7161119285168564552179759630234912037849} +#define T_3125_398 {0.6965442241546061508117304583720397204161,-0.7175138631391436705442288257472682744265} +#define T_3125_399 {0.6951001699857173310448388292570598423481,-0.7189128971480667207316628264379687607288} +#define T_3125_401 {0.6922036374849456219493504249840043485165,-0.7217022407146939100286431312269996851683} +#define T_3125_402 {0.6907511708625518886961458520090673118830,-0.7230925389962294946855081434478051960468} +#define T_3125_403 {0.6892959118173488386460689980594906955957,-0.7244799141121096575091087288456037640572} +#define T_3125_404 {0.6878378662323501036013340126373805105686,-0.7258643604537473592586138693150132894516} +#define T_3125_406 {0.6849134390313202924716051711584441363811,-0.7286244444391705155084082434768788516521} +#define T_3125_407 {0.6834470692375453904077176048303954303265,-0.7300000709250717978449074507807381451130} +#define T_3125_408 {0.6819779365484390787344182172091677784920,-0.7313727463210076829014383292815182358027} +#define T_3125_409 {0.6805060469031005698781200408120639622211,-0.7327424650778164849640461397939361631870} +#define T_3125_411 {0.6775540205558249207129506430646870285273,-0.7354730105371893866816890295012854039669} +#define T_3125_412 {0.6760738957877159283071932804887183010578,-0.7368338262012816830548445068416185677052} +#define T_3125_413 {0.6745910379309825177784887273446656763554,-0.7381916631493474767466977937147021293640} +#define T_3125_414 {0.6731054529802089625434291519923135638237,-0.7395465158922105430860938213299959897995} +#define T_3125_416 {0.6701261258299767709090133394056465476751,-0.7422472468659658817458080193318892270327} +#define T_3125_417 {0.6686323956747121100008257599256467074156,-0.7435931141789138276720905196270905435085} +#define T_3125_418 {0.6671359625137466276001418918895069509745,-0.7449359754508146913565269642276689410210} +#define T_3125_419 
{0.6656368323965439204314975540910381823778,-0.7462758252530329317053769955236930400133} +#define T_3125_421 {0.6626305055457700765586537272611167281866,-0.7489464687947712251414600359566975384951} +#define T_3125_422 {0.6611233209655417164540835983643773943186,-0.7502772517379781014312811748823150992393} +#define T_3125_423 {0.6596134637357122798562159005086869001389,-0.7516050016189196325555599287326913326979} +#define T_3125_424 {0.6581009399600133980001714917307253926992,-0.7529297130700494111721354784094728529453} +#define T_3125_426 {0.6550679172398078398131815447413828223944,-0.7555699992741243820049135138106066733599} +#define T_3125_427 {0.6535474305565642794135783333331346511841,-0.7568855633534786742089295330515597015619} +#define T_3125_428 {0.6520243018499277676269798575958702713251,-0.7581980676558824905342248712258879095316} +#define T_3125_429 {0.6504985372772812413089127403509337455034,-0.7595075068754209013377476367168128490448} +#define T_3125_431 {0.6474391252167425170327419436944182962179,-0.7621171689042173236217081466747913509607} +#define T_3125_432 {0.6459054900967945433976069580239709466696,-0.7634173811636852935436081679654307663441} +#define T_3125_433 {0.6443692438466751548631350487994495779276,-0.7647145072407506294354107012622989714146} +#define T_3125_434 {0.6428303926767959808330488158389925956726,-0.7660085418916660060517642705235630273819} +#define T_3125_436 {0.6397449004720320298034152983746025711298,-0.7685873160025670225437011140456888824701} +#define T_3125_437 {0.6381982719105230295042474608635529875755,-0.7698720450376297375072454087785445153713} +#define T_3125_438 {0.6366490633759549488956963614327833056450,-0.7711536617967390538908034613996278494596} +#define T_3125_439 {0.6350972811311405008538599759049247950315,-0.7724321610988457154078901112370658665895} +#define T_3125_441 {0.6319860206140206493330424564192071557045,-0.7749797866708877380759190600656438618898} +#define T_3125_442 
{0.6304265549192614725981798073917161673307,-0.7762489026418211146562953217653557658195} +#define T_3125_443 {0.6288645406692977557483459349896293133497,-0.7775148805577891675255841619218699634075} +#define T_3125_444 {0.6272999841787104680079778518120292574167,-0.7787777153009641795122774965420830994844} +#define T_3125_446 {0.6241632697853498612516887078527361154556,-0.7812939348611767087504631490446627140045} +#define T_3125_447 {0.6225911245630223467273367532470729202032,-0.7825473095061736827915410685818642377853} +#define T_3125_448 {0.6210164624609116890141535805014427751303,-0.7837975206343377188034082792000845074654} +#define T_3125_449 {0.6194392898447290685126631615275982767344,-0.7850445631915794475474967839545570313931} +#define T_3125_451 {0.6162774385837108681585050362627953290939,-0.7875291224410057022708997465088032186031} +#define T_3125_452 {0.6146927727209392511298347017145715653896,-0.7887666290891391929207543398661073297262} +#define T_3125_453 {0.6131056219081716385232994070975109934807,-0.7900009470782893838958216292667202651501} +#define T_3125_454 {0.6115159925616057146058324178738985210657,-0.7912320714186161785264062018541153520346} +#define T_3125_456 {0.6083293239819429487624802277423441410065,-0.7936847192580138710127357626333832740784} +#define T_3125_457 {0.6067322976312361504369619069620966911316,-0.7949062328420384293536926634260453283787} +#define T_3125_458 {0.6051328185114591695636931945045944303274,-0.7961245329471874354609894908207934349775} +#define T_3125_459 {0.6035308930886481570965429455100093036890,-0.7973396146483747726207980122126173228025} +#define T_3125_461 {0.6003197292474887758473300891637336462736,-0.7997601032035930312957816568086855113506} +#define T_3125_462 {0.5987105038105547949456308742810506373644,-0.8009655002725845074351695984660182148218} +#define T_3125_463 {0.5970988580333630446617121378949377685785,-0.8021676593675749478151715265994425863028} +#define T_3125_464 
{0.5954847984311341102525716451054904609919,-0.8033665756287297066151609215012285858393} +#define T_3125_466 {0.5922494638612127992871592141455039381981,-0.8057546602757603704247912901337258517742} +#define T_3125_467 {0.5906282019726468712761402457545045763254,-0.8069438190075924888233771525847259908915} +#define T_3125_468 {0.5890045524172444757482480781618505716324,-0.8081297155975404322703070647548884153366} +#define T_3125_469 {0.5873785217587526030769140561460517346859,-0.8093123452515119664596454640559386461973} +#define T_3125_471 {0.5841193434355919089639996855112258344889,-0.8116677846412121422048357999301515519619} +#define T_3125_472 {0.5824862089464400982663505601522047072649,-0.8128405848548682088861028205428738147020} +#define T_3125_473 {0.5808507197051799852971498694387264549732,-0.8140100990884415743664703768445178866386} +#define T_3125_474 {0.5792128823234214962667465442791581153870,-0.8151763226140674722941525942587759345770} +#define T_3125_476 {0.5759301896322867042599114029144402593374,-0.8174988786965510234594489702431019395590} +#define T_3125_477 {0.5742853475934862617080511881795246154070,-0.8186552018642699879436008814082015305758} +#define T_3125_478 {0.5726381839552860952835544594563543796539,-0.8198082155458018549509802141983527690172} +#define T_3125_479 {0.5709887053764910458397707770927809178829,-0.8209579150799867219845395993615966290236} +#define T_3125_481 {0.5676828300791010306625139492098242044449,-0.8232473531286830237263529852498322725296} +#define T_3125_482 {0.5660264467247998476651105193013790994883,-0.8243870823879382303545071408734656870365} +#define T_3125_483 {0.5643677751584373902815627843665424734354,-0.8255234789893716174091764514741953462362} +#define T_3125_484 {0.5627068220853403968106931642978452146053,-0.8266565383389991028550980445288587361574} +#define T_3125_486 {0.5593780982863387762904494593385607004166,-0.8289126269743748443019626392924692481756} +#define T_3125_487 
{0.5577103410170949349122793137212283909321,-0.8300356471396850288613222801359370350838} +#define T_3125_488 {0.5560403291543840609278959163930267095566,-0.8311553118123496952662776493525598198175} +#define T_3125_489 {0.5543680694493768879027584262075833976269,-0.8322716164660254323237609241914469748735} +#define T_3125_491 {0.5510168335625656998999488678236957639456,-0.8344941276789691331217113656748551875353} +#define T_3125_492 {0.5493378709284297833903565333457663655281,-0.8356003252535387781918529981339816004038} +#define T_3125_493 {0.5476566875472788087364506282028742134571,-0.8367031448397587167775668604008387774229} +#define T_3125_494 {0.5459732902154454947662998165469616651535,-0.8378025819793830653026134314131923019886} +#define T_3125_496 {0.5425998809297863934020256238000001758337,-0.8399912911542486426696996204555034637451} +#define T_3125_497 {0.5409098826132664461496801777684595435858,-0.8410805543414390239576050589676015079021} +#define T_3125_498 {0.5392176976206208838249267500941641628742,-0.8421664173859680913025727022613864392042} +#define T_3125_499 {0.5375233327926577375777128509071189910173,-0.8432488758981382392931891445186920464039} +#define T_3125_501 {0.5341280910380421520500249243923462927341,-0.8454035618354449610478695831261575222015} +#define T_3125_502 {0.5324272278369555078114672141964547336102,-0.8464757805500725540426287807349581271410} +#define T_3125_503 {0.5307242122516268034715380963461939245462,-0.8475445773113590153258201098651625216007} +#define T_3125_504 {0.5290190511666476025354199919092934578657,-0.8486099477985984407268915674649178981781} +#define T_3125_506 {0.5256023200794416316838919556175824254751,-0.8507303927373867091787928984558675438166} +#define T_3125_507 {0.5238907638896531659966626648383680731058,-0.8517854586168489561259775655344128608704} +#define T_3125_508 {0.5221770898250346348845596367027610540390,-0.8528370810781258359867251783725805580616} +#define T_3125_509 
{0.5204613048132650243715602300653699785471,-0.8538852558699403516584425233304500579834} +#define T_3125_511 {0.5170234297016305102800970416865311563015,-0.8559712455097794325808990834048017859459} +#define T_3125_512 {0.5153013534996803146626120906148571521044,-0.8570090519250058713751627692545298486948} +#define T_3125_513 {0.5135771941463525003257473144913092255592,-0.8580433938052083320258134335745126008987} +#define T_3125_514 {0.5118509586117139242844586988212540745735,-0.8590742669689679367195367376552894711494} +#define T_3125_516 {0.5083922869207100347921368665993213653564,-0.8611255904916137460247682611225172877312} +#define T_3125_517 {0.5066598647463314009087298472877591848373,-0.8621460325578429451098827485111542046070} +#define T_3125_518 {0.5049253943545591871711053499893750995398,-0.8631629893223485838760211663611698895693} +#define T_3125_519 {0.5031888827571434630314684000040870159864,-0.8641764566739930097938326980511192232370} +#define T_3125_521 {0.4997097640336128354654476879659341648221,-0.8661929067646946256431306210288312286139} +#define T_3125_522 {0.4979671709721436578810482842527562752366,-0.8671958813520737052726872207131236791611} +#define T_3125_523 {0.4962225648342655781597443365171784535050,-0.8681953502232681030292837931483518332243} +#define T_3125_524 {0.4944759526727033893678253662073984742165,-0.8691913093378363930341379273158963769674} +#define T_3125_526 {0.4909767385299455000335910881403833627701,-0.8711726822062877406693814918980933725834} +#define T_3125_527 {0.4892241506946329865357370181300211697817,-0.8721580879502952576842744747409597039223} +#define T_3125_528 {0.4874695851273456170815734367351979017258,-0.8731399679179584483179610288061667233706} +#define T_3125_529 {0.4857130489210702006808162423112662509084,-0.8741183181399404533706842812534887343645} +#define T_3125_531 {0.4821940930033059014014895637956215068698,-0.8760644135408761634664642770076170563698} +#define T_3125_532 
{0.4804316875175080325810483827808639034629,-0.8770321508525668052769219684705603867769} +#define T_3125_533 {0.4786673398440468019110483055555960163474,-0.8779963426840818874197225341049488633871} +#define T_3125_534 {0.4769010571154539412930262187728658318520,-0.8789569851375905740198390958539675921202} +#define T_3125_536 {0.4733627150620847157114212677697651088238,-0.8808676063910237941101399883336853235960} +#define T_3125_537 {0.4715906700413696750473491192678920924664,-0.8818175774670926614362542750313878059387} +#define T_3125_538 {0.4698167185735876438812397282163146883249,-0.8827639837174748871362339741608593612909} +#define T_3125_539 {0.4680408678300946312411667804553871974349,-0.8837068213162394547666167454735841602087} +#define T_3125_541 {0.4644834972397605676874832170142326503992,-0.8855817753273388381884956288558896631002} +#define T_3125_542 {0.4627019917739053078697963883314514532685,-0.8865138841600062047376695772982202470303} +#define T_3125_543 {0.4609186157942525285946544499893207103014,-0.8874424091817508797319646873802412301302} +#define T_3125_544 {0.4591333765102576514038901223102584481239,-0.8883673466389284678612625612004194408655} +#define T_3125_546 {0.4555573369046968523754514990287134423852,-0.8902064439175334520371052349219098687172} +#define T_3125_547 {0.4537665510395878509797285005333833396435,-0.8911205963042472255608572595519945025444} +#define T_3125_548 {0.4519739307829922503323416549392277374864,-0.8920311462569964211510864515730645507574} +#define T_3125_549 {0.4501794833817361896066699955554213374853,-0.8929380900948021837493229213578160852194} +#define T_3125_551 {0.4465851361694501697741088719340041279793,-0.8947411447745731161163007527648005634546} +#define T_3125_552 {0.4447852508888873734882452026795363053679,-0.8956372483275298890248450334183871746063} +#define T_3125_553 {0.4429835675245394899235407137894071638584,-0.8965297311875561891980623840936459600925} +#define T_3125_554 
{0.4411800933598711282890292295633116737008,-0.8974185897467108485869857759098522365093} +#define T_3125_556 {0.4375678017995996982314466094976523891091,-0.8991854196039136271068059613753575831652} +#define T_3125_557 {0.4357589990070051544357454531564144417644,-0.9000633837593955410838475472701247781515} +#define T_3125_558 {0.4339484346200486175604282834683544933796,-0.9009377093289021587452225503511726856232} +#define T_3125_559 {0.4321361159580969224514035431639058515429,-0.9018083927778950403819635539548471570015} +#define T_3125_561 {0.4285062451221053336780641984660178422928,-0.9035388192498173820510487530555110424757} +#define T_3125_562 {0.4266887076221399510700393875595182180405,-0.9043985552773444736018859657633583992720} +#define T_3125_563 {0.4248694451952689798979179158777697011828,-0.9052546351935815138745056174229830503464} +#define T_3125_564 {0.4230484651960221276212337215838488191366,-0.9061070555377493951354495038685854524374} +#define T_3125_566 {0.4194013819332059744837692960572894662619,-0.9078009037407470671610099088866263628006} +#define T_3125_567 {0.4175752934132940241696019256778527051210,-0.9086423247520453161385489693202544003725} +#define T_3125_568 {0.4157475168082608862896165646816371008754,-0.9094800724962394111727803647227119654417} +#define T_3125_569 {0.4139180595070555135350787168135866522789,-0.9103141435866596298254194152832496911287} +#define T_3125_571 {0.4102541324058653349204917049064533784986,-0.9119712423338307560882753932673949748278} +#define T_3125_572 {0.4084196774176305222425753527204506099224,-0.9127942632916130616393957097898237407207} +#define T_3125_573 {0.4065835713566633979887399163999361917377,-0.9136135941977117091994387010345235466957} +#define T_3125_574 {0.4047458216455854240756195849826326593757,-0.9144292317399088654283900723385158926249} +#define T_3125_576 {0.4010654209967756678167916106758639216423,-0.9160494135583948649426133670203853398561} +#define T_3125_577 
{0.3992227849373896075846346320759039372206,-0.9168539512849550598971859471930656582117} +#define T_3125_578 {0.3973785349845239678323594034736743196845,-0.9176547825481832232341616872872691601515} +#define T_3125_579 {0.3955326785937225420042295809253118932247,-0.9184519041106480319314186999690718948841} +#define T_3125_581 {0.3918361763529286112550664711307035759091,-0.9200350052585589688547429432219360023737} +#define T_3125_582 {0.3899855454463738713144493885920383036137,-0.9208209784441785394903945416444912552834} +#define T_3125_583 {0.3881333379886985901485729755222564563155,-0.9216032291294071754350625269580632448196} +#define T_3125_584 {0.3862795614676156064071221862832317128778,-0.9223817541519273577321769153058994561434} +#define T_3125_586 {0.3825673312177624851848634079942712560296,-0.9239276146348905927041528229892719537020} +#define T_3125_587 {0.3807088924960120301932420261437073349953,-0.9246949438460556613961216498864814639091} +#define T_3125_588 {0.3788489147248325639161237177177099511027,-0.9254585348959815105374104859947692602873} +#define T_3125_589 {0.3769874054233490201504253036546288058162,-0.9262183846977836143565809834399260580540} +#define T_3125_591 {0.3732598223368956968926113404450006783009,-0.9277268482851129816069146727386396378279} +#define T_3125_592 {0.3713937636210107862133611433819169178605,-0.9284754559725425515637198259355500340462} +#define T_3125_593 {0.3695262035129306399383608550124336034060,-0.9292203102156776362008372416312340646982} +#define T_3125_594 {0.3676571495624325058848569369729375466704,-0.9299614080033790086687872644688468426466} +#define T_3125_596 {0.3639145903634553591565747865388402715325,-0.9314323222438646299181641552422661334276} +#define T_3125_597 {0.3620411002446035642066135551431216299534,-0.9321621327503477605347370626986958086491} +#define T_3125_598 {0.3601661465425268771056721561762969940901,-0.9328881749088189234697665597195737063885} +#define T_3125_599 
{0.3582897368368917478953505906247301027179,-0.9336104457841882808821765138418413698673} +#define T_3125_601 {0.3545325797630116126946120402863016352057,-0.9350436620215035743441944759979378432035} +#define T_3125_602 {0.3526518475834068966356937835371354594827,-0.9357546015895458202393797364493366330862} +#define T_3125_603 {0.3507696897774631450417359701532404869795,-0.9364617582867024170312220121559221297503} +#define T_3125_604 {0.3488861139539699562917007824580650776625,-0.9371651292542299005461359229229856282473} +#define T_3125_606 {0.3451147387181257020216662567690946161747,-0.9385605026419553409411378197546582669020} +#define T_3125_607 {0.3432269545518931153083030949346721172333,-0.9392524994212326783227240412088576704264} +#define T_3125_608 {0.3413377828602865982077219086932018399239,-0.9399406991890625340246856467274483293295} +#define T_3125_609 {0.3394472312804500679916941408009734004736,-0.9406250991633364755273305490845814347267} +#define T_3125_611 {0.3356620190325234065475967781821964308619,-0.9419824886795984397735992388334125280380} +#define T_3125_612 {0.3337673736664892309988772467477247118950,-0.9426554727342191242200897249858826398849} +#define T_3125_613 {0.3318713790162748922263347139960387721658,-0.9433246460205712002888844835979398339987} +#define T_3125_614 {0.3299740427466067993478304742893669754267,-0.9439900058334624866063222725642845034599} +#define T_3125_616 {0.3261753760349019870545816957019269466400,-0.9453092742951855198541011304769199341536} +#define T_3125_617 {0.3242740609493119374739933391538215801120,-0.9459631776107577660894776272471062839031} +#define T_3125_618 {0.3223714349570995230109815565811004489660,-0.9466132567863713154210358879936393350363} +#define T_3125_619 {0.3204675057497989865140652909758500754833,-0.9472595091940236855521106917876750230789} +#define T_3125_621 {0.3166557684823805840501620423310669139028,-0.9485405232707946332482151774456724524498} +#define T_3125_622 
{0.3147479758315483988617700106260599568486,-0.9491752797613004499766020671813748776913} +#define T_3125_623 {0.3128389107841369476403770022443495690823,-0.9498061991266401049571754811040591448545} +#define T_3125_624 {0.3109285810577109288033170741982758045197,-0.9504332788162662692954540943901520222425} +#define T_3125_627 {0.3051900810564923127721215223573381081223,-0.9522914545582836787218639074126258492470} +#define T_3125_633 {0.2936799788683579492953867884352803230286,-0.9559037974670259174558850645553320646286} +#define T_3125_636 {0.2879087954557846273928589653223752975464,-0.9576578332051585151774020232551265507936} +#define T_3125_637 {0.2859827294840936451336688151059206575155,-0.9582347720871058349700888356892392039299} +#define T_3125_639 {0.2821271370001109946024087093974230811000,-0.9593770262874343757175665814429521560669} +#define T_3125_642 {0.2763352138565692284188912708486896008253,-0.9610613141641090040323547327716369181871} +#define T_3125_648 {0.2647214167864538891805636922072153538465,-0.9643249304537203636655817717837635427713} +#define T_3125_651 {0.2588999654069744082995896405918756499887,-0.9659041401258550152419957157690078020096} +#define T_3125_654 {0.2530690944184412205508749593718675896525,-0.9674482071151044859647072371444664895535} +#define T_3125_657 {0.2472290159665959108359345464123180136085,-0.9689570752433714906359796259494032710791} +#define T_3125_663 {0.2355220869231894331718990542867686599493,-0.9718689966098031929320200106303673237562} +#define T_3125_666 {0.2296556622671614145136942397584789432585,-0.9732719439029522412454298319062218070030} +#define T_3125_669 {0.2237808820033907741819945158567861653864,-0.9746394804489424412707876399508677423000} +#define T_3125_672 {0.2178979598751789870103579005444771610200,-0.9759715564924189568074552880716510117054} +#define T_3125_676 {0.2100417685683246549199765240700799040496,-0.9776924135210881550150929797382559627295} +#define T_3125_678 
{0.2061085464719858983961842113785678520799,-0.9785291345030076026745291528641246259212} +#define T_3125_681 {0.2002024841335826865762470561094232834876,-0.9797545434172492129221154755214229226112} +#define T_3125_684 {0.1942891377882884074157487930278875865042,-0.9809443057266215015843613400647882372141} +#define T_3125_687 {0.1883687225825620426000739371374947950244,-0.9820983781437651050083559312042780220509} +#define T_3125_689 {0.1844179583292170632180528855315060354769,-0.9828479112485731805648470071901101619005} +#define T_3125_693 {0.1765075474537503019689665961777791380882,-0.9842992866460190670707675053563434630632} +#define T_3125_696 {0.1705672190781642449941557515558088198304,-0.9853460426550367801112884080794174224138} +#define T_3125_699 {0.1646206849214434664308015499045723117888,-0.9863569486225536309120798250660300254822} +#define T_3125_702 {0.1586681613375257315912136846236535347998,-0.9873319677685762352226106486341450363398} +#define T_3125_708 {0.1467460123855430231820662356767570599914,-0.9891742050058432988990375633875373750925} +#define T_3125_711 {0.1407768207834000639966376411393866874278,-0.9900413560706029514335568819660693407059} +#define T_3125_714 {0.1348025072701223847637663766363402828574,-0.9908724862633377927778610683162696659565} +#define T_3125_717 {0.1288232892103495230706755592109402641654,-0.9916675653448723259941743890522047877312} +#define T_3125_723 {0.1168510097941769659835031802685989532620,-0.9931494557769646736034019340877421200275} +#define T_3125_726 {0.1108583840276059773177763645435334183276,-0.9938362132116075331467186515510547906160} +#define T_3125_728 {0.1068610459502429832490122407762100920081,-0.9942739646890187410477324192470405250788} +#define T_3125_729 {0.1048617248783490496499837263399967923760,-0.9944868117051816058804547537874896079302} +#define T_3125_732 {0.0988612505240514527482531548230326734483,-0.9951012275868323930794190346205141395330} +#define T_3125_738 
{0.0868497295970191429681506178894778713584,-0.9962214234139540325330131054215598851442} +#define T_3125_741 {0.0808391200418448335751619993061467539519,-0.9967271626031168896275858060107566416264} +#define T_3125_744 {0.0748255693008505889940451538677734788507,-0.9971966376692229516720544779673218727112} +#define T_3125_747 {0.0688092961662518376453334667530725710094,-0.9976298315312673858912262403464410454035} +#define T_3125_753 {0.0567694583723854506729367130901664495468,-0.9983873139198564672369684558361768722534} +#define T_3125_754 {0.0547619681765601148959454747000563656911,-0.9984994375769219443483848408504854887724} +#define T_3125_756 {0.0507463317609364514271241830556391505525,-0.9987115748867683162615094261127524077892} +#define T_3125_759 {0.0447213588355825919506969512440264225006,-0.9989994995313556946925359625311102718115} +#define T_3125_762 {0.0386947588041149231896653759577020537108,-0.9992510773779987864884333248483017086983} +#define T_3125_767 {0.0286474051903728224588441264586435863748,-0.9995895788651752988940302202536258846521} +#define T_3125_768 {0.0266375545420212619940336651325196726248,-0.9996451573873705864770045081968419253826} +#define T_3125_771 {0.0206073889910615738740862212807769537903,-0.9997876452122076429418484622146934270859} +#define T_3125_774 {0.0145764736773600963454233792049308249261,-0.9998937575638390251597797941940370947123} +#define T_3125_777 {0.0085450280249109618335712212910948437639,-0.9999634905815579788423974605393595993519} +#define T_3125_783 {-0.0035185765117606818304374360906194851850,-0.9999938097905061162862239143578335642815} +#define T_3125_786 {-0.0095502964834544273181604623346174776088,-0.9999543948786254743410495393618475645781} +#define T_3125_789 {-0.0155816689848073432200781240908327163197,-0.9998785984266529425212866044603288173676} +#define T_3125_792 {-0.0216124745751913420466205906222967314534,-0.9997664231923057975848223577486351132393} +#define T_3125_793 
{-0.0236225791999117480657321976877938141115,-0.9997209479409461252430446620564907789230} +#define T_3125_798 {-0.0336715073716536417158806671068305149674,-0.9994329540250914689636374532710760831833} +#define T_3125_801 {-0.0396992958315377092448628104648378212005,-0.9992116722248994831190316290303599089384} +#define T_3125_804 {-0.0457256399040274000289407752006809459999,-0.9989540359072419661856656603049486875534} +#define T_3125_806 {-0.0497422919037794133978458432920888299122,-0.9987620859825222519035037294088397175074} +#define T_3125_807 {-0.0517503203314449503058547463751892792061,-0.9986600544457522321550868582562543451786} +#define T_3125_813 {-0.0637938135309713688636534811848832760006,-0.9979631001971844828091207091347314417362} +#define T_3125_816 {-0.0698121881222662776789178451508632861078,-0.9975601527675317381493869106634519994259} +#define T_3125_819 {-0.0758280227228035064168665257966495119035,-0.9971209109079750554371912585338577628136} +#define T_3125_822 {-0.0818410984572735256126208014393341727555,-0.9966453905995386053362494749308098107576} +#define T_3125_828 {-0.0938580983366153892566430272381694521755,-0.9955855851591234317865541925129946321249} +#define T_3125_831 {-0.0998615852645850093916024547979759518057,-0.9950013385862573889184545805619563907385} +#define T_3125_832 {-0.1018619509602122080726971375952416565269,-0.9947985439005122554334548112819902598858} +#define T_3125_834 {-0.1058614389085903023657309063310094643384,-0.9943808906813338399643953380291350185871} +#define T_3125_837 {-0.1118574409747600945097545377393544185907,-0.9937242640182326613995655861799605190754} +#define T_3125_843 {-0.1238370179067089793534250929951667785645,-0.9923025712936419662213438641629181802273} +#define T_3125_846 {-0.1298201569171524294432629176299087703228,-0.9915375569578823711225368242594413459301} +#define T_3125_849 {-0.1357985726549467975221574533861712552607,-0.9907364673135228283484821076854132115841} +#define T_3125_852 
{-0.1417720476062005463546711325761862099171,-0.9898993315067675657914492148847784847021} +#define T_3125_858 {-0.1537033060002351014805554996200953610241,-0.9881170445471518792857068547164089977741} +#define T_3125_861 {-0.1596606553456632326959407919275690801442,-0.9871719582395932501839297401602379977703} +#define T_3125_864 {-0.1656121957256392329682626041176263242960,-0.9861909554578831560434082348365336656570} +#define T_3125_867 {-0.1715577106040829802324765296361874789000,-0.9851740718940413810145173556520603597164} +#define T_3125_871 {-0.1794753169245356660965740047686267644167,-0.9837624767263881686574222840135917067528} +#define T_3125_873 {-0.1834297988160587811190538332084543071687,-0.9830328117139834898097205950762145221233} +#define T_3125_876 {-0.1893559402050386808458171117308665998280,-0.9819085130036635522543519982718862593174} +#define T_3125_879 {-0.1952751922190975875093243985247681848705,-0.9807484893201694475806107220705598592758} +#define T_3125_882 {-0.2011873394969094508422102762779104523361,-0.9795527828688739280238451101467944681644} +#define T_3125_884 {-0.2051247177052628045679227852815529331565,-0.9787358429046809638052195623458828777075} +#define T_3125_888 {-0.2129894596987791377529219971620477735996,-0.9770544969740542207148337183753028512001} +#define T_3125_891 {-0.2188790032239448790640778952365508303046,-0.9757520084261637194344984891358762979507} +#define T_3125_894 {-0.2247605832307023321359196188495843671262,-0.9744140188984323769361139966349583119154} +#define T_3125_897 {-0.2306339857283535854648448548687156289816,-0.9730405770711998414412846614141017198563} +#define T_3125_903 {-0.2423554037289404605726161889833747409284,-0.9701875376871124112554412022291216999292} +#define T_3125_906 {-0.2482029927691883075535628222496598027647,-0.9687080439329582848273503259406425058842} +#define T_3125_909 {-0.2540415513904704103431697603809880092740,-0.9671933054809276342211887822486460208893} +#define T_3125_912 
{-0.2598708671673445325112083992280531674623,-0.9656433774420515314673707507608924061060} +#define T_3125_918 {-0.2715009221752375245806376824475592002273,-0.9624381794473844120574312910321168601513} +#define T_3125_921 {-0.2773012382676466325293063164281193166971,-0.9607830261069508770432889832591172307730} +#define T_3125_923 {-0.2811625236153397766614148167718667536974,-0.9596601665768218492047481049667112529278} +#define T_3125_924 {-0.2830914652538231712775029791373526677489,-0.9590929164061443579214483179384842514992} +#define T_3125_927 {-0.2888713924667845045490821576095186173916,-0.9573679118365628903930542037414852529764} +#define T_3125_933 {-0.3003995067864751455921634715195978060365,-0.9538134704031194788242942195211071521044} +#define T_3125_936 {-0.3061472744635230225362931832933099940419,-0.9519841628612086825711458004661835730076} +#define T_3125_939 {-0.3118839035232529033336845714075025171041,-0.9501202190897204413744248086004517972469} +#define T_3125_942 {-0.3176091852487462063869827488815644755960,-0.9482217069048923807628170834504999220371} +#define T_3125_948 {-0.3290248739011885081851005452335812151432,-0.9443212548462027733364720916142687201500} +#define T_3125_949 {-0.3309228781046149592270921857561916112900,-0.9436578027796719281283799318771343678236} +#define T_3125_951 {-0.3347148654888615348923508463485632091761,-0.9423194568832660156232350345817394554615} +#define T_3125_954 {-0.3403926790788573408796935382270021364093,-0.9402833743236759245220923730812501162291} +#define T_3125_957 {-0.3460581080941521614491307445860002189875,-0.9382130812466302760554981432505883276463} +#define T_3125_962 {-0.3554724088667650105932693804788868874311,-0.9346867745584396258706760818313341587782} +#define T_3125_963 {-0.3573509883529958353065580922702793031931,-0.9339701660776628111548802735342178493738} +#define T_3125_966 {-0.3629780287254309456557166413404047489166,-0.9317976983565693371858174032240640372038} +#define T_3125_969 
{-0.3685918627958865156912793281662743538618,-0.9295913288540607055665532243438065052032} +#define T_3125_972 {-0.3741922863151183609176086974912323057652,-0.9273511378449181830063707820954732596874} +#define T_3125_978 {-0.3853520871499416799998982696706661954522,-0.9227696185555653851650959040853194892406} +#define T_3125_981 {-0.3909110584362747164988149961573071777821,-0.9204284569656847603269511637336108833551} +#define T_3125_984 {-0.3964558071276397255289225540764164179564,-0.9180538072437648411394661707163322716951} +#define T_3125_987 {-0.4019861314883396152808359147456940263510,-0.9156457557871598984533534348884131759405} +#define T_3125_988 {-0.4038263327426417914622902571863960474730,-0.9148356644685585736809230184007901698351} +#define T_3125_993 {-0.4130027029062933729974815832974854856730,-0.9107297993324342533227877538593020290136} +#define T_3125_996 {-0.4184885491454332084870770813722629100084,-0.9082220731925371159931614783999975770712} +#define T_3125_999 {-0.4239591694322628612212611187715083360672,-0.9056813030278950682472327571304049342871} +#define T_3125_1001 {-0.4275976924473718909780473040882498025894,-0.9039691440606160144355385455128271132708} +#define T_3125_1002 {-0.4294143647281138953530899016186594963074,-0.9031075812798552426841069973306730389595} +#define T_3125_1008 {-0.4402776870054798674658513846225105226040,-0.8978616587899856948951082813437096774578} +#define T_3125_1011 {-0.4456854187445748460838501614489359781146,-0.8951896489115995780849743823637254536152} +#define T_3125_1014 {-0.4510769350222312290554782521212473511696,-0.8924850691697591154039059802016708999872} +#define T_3125_1017 {-0.4564520396778375399549076973926275968552,-0.8897480179657282217675628999131731688976} +#define T_3125_1023 {-0.4671522324731054998103729758440749719739,-0.8841769006796058594233045369037427008152} +#define T_3125_1026 {-0.4724769313055195985029399707855191081762,-0.8813430372925852296361881599295884370804} +#define T_3125_1027 
{-0.4742480204158500134425935357285197824240,-0.8803912852429012225741189467953518033028} +#define T_3125_1029 {-0.4777844399155526944333871597336838021874,-0.8784771078261411902943223140027839690447} +#define T_3125_1032 {-0.4830745651990608902082158238044939935207,-0.8755792165519566960796282728551886975765} +#define T_3125_1038 {-0.4936018965392355295485060651117237284780,-0.8696879714776269976184153165377210825682} +#define T_3125_1041 {-0.4988387195779070970580448829423403367400,-0.8666948320198258892688158994133118540049} +#define T_3125_1044 {-0.5040573932679996982386683157528750598431,-0.8636701594311737562037478710408322513103} +#define T_3125_1047 {-0.5092577277374680244648175175825599581003,-0.8606140637589365649517958445358090102673} +#define T_3125_1053 {-0.5196026228694325954649002596852369606495,-0.8544080490650858283885327182360924780369} +#define T_3125_1056 {-0.5247468071515470899868205378879792988300,-0.8512583558381421333649541338672861456871} +#define T_3125_1059 {-0.5298718994659674663694204355124384164810,-0.8480776911087377634146378113655373454094} +#define T_3125_1062 {-0.5349777133454384214772403538518119603395,-0.8448661705996316495514975031255744397640} +#define T_3125_1066 {-0.5417551555332476009851916387560777366161,-0.8405363474907832088689474403508938848972} +#define T_3125_1068 {-0.5451307634443250949018988649186212569475,-0.8383510307422584739711624024494085460901} +#define T_3125_1071 {-0.5501776302632950255144805851159617304802,-0.8350476484356237349970797367859631776810} +#define T_3125_1074 {-0.5552044798598639863129733384994324296713,-0.8317138844239272410518992728611920028925} +#define T_3125_1077 {-0.5602111293411610182246818112616892904043,-0.8283498600001698575212571995507460087538} +#define T_3125_1079 {-0.5635375833907871889749685578863136470318,-0.8260904261072583576464012367068789899349} +#define T_3125_1083 {-0.5701631000678066829578938268241472542286,-0.8215315205888744642948040564078837633133} +#define T_3125_1086 
{-0.5751080592286204051788445212878286838531,-0.8180774536743385949577600513293873518705} +#define T_3125_1089 {-0.5800320941182662837221073459659237414598,-0.8145936224847201145848885062150657176971} +#define T_3125_1092 {-0.5849350255846010870897089262143708765507,-0.8110801537729437349355521291727200150490} +#define T_3125_1098 {-0.5946768654843063162118710351933259516954,-0.8039648161815045046552086205338127911091} +#define T_3125_1101 {-0.5995154194783735013274394987092819064856,-0.8003632061805876496407563536195084452629} +#define T_3125_1104 {-0.6043321611834190365541985556774307042360,-0.7967324764055861985312390061153564602137} +#define T_3125_1107 {-0.6091269153509694112003103327879216521978,-0.7930727589540652600064163380011450499296} +#define T_3125_1113 {-0.6186497640859212054920135415159165859222,-0.7856668946802035646825856929353903979063} +#define T_3125_1116 {-0.6233775121816188535817104821035172790289,-0.7819210173068988911637688943301327526569} +#define T_3125_1118 {-0.6265167545693004491624833462992683053017,-0.7794079523869068548691529940697364509106} +#define T_3125_1119 {-0.6280825798090190481559602631023153662682,-0.7781466911453439605494963871024083346128} +#define T_3125_1122 {-0.6327647957827083891046981989347841590643,-0.7743440535175997174732742678315844386816} +#define T_3125_1128 {-0.6420599921905801865662510863330680876970,-0.7666543982970632775675312586827203631401} +#define T_3125_1131 {-0.6466726344357786038230528902204241603613,-0.7627676604785298408728522190358489751816} +#define T_3125_1134 {-0.6512617486616028017465396260377019643784,-0.7588531707321458652515389076143037527800} +#define T_3125_1137 {-0.6558271679013949473358024988556280732155,-0.7549110714795720378234022973629180341959} +#define T_3125_1143 {-0.6648862578727927630239946665824390947819,-0.7469446191598906326092333074484486132860} +#define T_3125_1144 {-0.6663867341961572376973776954400818794966,-0.7456062771244486020449926400033291429281} +#define T_3125_1146 
{-0.6693795990057219702507040892669465392828,-0.7429205559378060375053109964937902987003} +#define T_3125_1149 {-0.6738485859672582733992385328747332096100,-0.7388694628890318805147785496956203132868} +#define T_3125_1152 {-0.6782930561613581543767281800683122128248,-0.7347914874052942080240313771355431526899} +#define T_3125_1157 {-0.6856455859895435045459066714101936668158,-0.7279355262748586907761705333541613072157} +#define T_3125_1158 {-0.6871078003289499136485574126709252595901,-0.7265554835847788384128875804890412837267} +#define T_3125_1161 {-0.6914777535938475550025827942590694874525,-0.7223977549001701081721193986595608294010} +#define T_3125_1164 {-0.6958225486857885222491404419997707009315,-0.7182137430740332950662718758394476026297} +#define T_3125_1167 {-0.7001420275272269844180073050665669143200,-0.7140036003341045933723307825857773423195} +#define T_3125_1173 {-0.7087044087595608665708368789637461304665,-0.7055055357718756914309210515057202428579} +#define T_3125_1176 {-0.7129469996236330597838559697265736758709,-0.7012179231363523390641034893633332103491} +#define T_3125_1179 {-0.7171636511948903391910903337702620774508,-0.6969047979493422362295973471191246062517} +#define T_3125_1182 {-0.7213542100580581095670140712172724306583,-0.6925663171361389203184444340877234935760} +#define T_3125_1183 {-0.7227452382532805197357106408162508159876,-0.6911145495373459679555594448174815624952} +#define T_3125_1188 {-0.7296564407512542427269863765104673802853,-0.6838139209391772554980093445919919759035} +#define T_3125_1191 {-0.7337678105195681865779988584108650684357,-0.6794003239955946904515826645365450531244} +#define T_3125_1194 {-0.7378524834673538812523929664166644215584,-0.6749620082945841925692320728558115661144} +#define T_3125_1196 {-0.7405606937694141356232080397603567689657,-0.6719894782240001251949479410541243851185} +#define T_3125_1197 {-0.7419103109811409080265320881153456866741,-0.6704991353162707001800413308956194669008} +#define T_3125_1203 
{-0.7499448401418243870253377281187567859888,-0.6615003679096886912702757399529218673706} +#define T_3125_1206 {-0.7539212494668445785350741061847656965256,-0.6569648008853685716701420460594817996025} +#define T_3125_1209 {-0.7578702287247495172906042171234730631113,-0.6524053313797304465282422825112007558346} +#define T_3125_1212 {-0.7617916342390397277384295193769503384829,-0.6478221252808622487862066918751224875450} +#define T_3125_1218 {-0.7695511543520733299672542671032715588808,-0.6385851711677866804350856000382918864489} +#define T_3125_1221 {-0.7733889866346496111404462681093718856573,-0.6339317592235222953434004011796787381172} +#define T_3125_1222 {-0.7746620179580198106705779537151101976633,-0.6323754880869503214313454009243287146091} +#define T_3125_1224 {-0.7771986805515482821249406697461381554604,-0.6292552828136862563823683558439370244741} +#define T_3125_1227 {-0.7809800974939162809818071764311753213406,-0.6245559120834522914833542017731815576553} +#define T_3125_1233 {-0.7884575511686865434413107323052827268839,-0.6150891724011064987109875801252201199532} +#define T_3125_1236 {-0.7921533158474011182548224496713373810053,-0.6103221478792717480388319017947651445866} +#define T_3125_1239 {-0.7958202594541076146938962665444705635309,-0.6055329178850615834406312387727666646242} +#define T_3125_1242 {-0.7994582485736659860364738960925024002790,-0.6007216566659857592824778294016141444445} +#define T_3125_1248 {-0.8066468349628769241022041569522116333246,-0.5910337415447387554578995150222908705473} +#define T_3125_1251 {-0.8101971706887579616207517574366647750139,-0.5861574401199147743568573787342756986618} +#define T_3125_1253 {-0.8125476926771126873205730589688755571842,-0.5828947135847952765175250533502548933029} +#define T_3125_1254 {-0.8137180288494633151330504006182309240103,-0.5812598124120951226245779253076761960983} +#define T_3125_1256 {-0.8160488278945919882190196403826121240854,-0.5779829673025517200457557009940501302481} +#define T_3125_1257 
{-0.8172092813449084447441350675944704562426,-0.5763410366126450190549235230719204992056} +#define T_3125_1259 {-0.8195202726448849173124244771315716207027,-0.5730501921507865636939982323383446782827} +#define T_3125_1261 {-0.8218180120250051512442723833373747766018,-0.5697500812735952768761649167572613805532} +#define T_3125_1262 {-0.8229619006257250513058920660114381462336,-0.5680965675996416663906529720406979322433} +#define T_3125_1263 {-0.8241024623300465812292259215610101819038,-0.5664407573449796728937144507654011249542} +#define T_3125_1266 {-0.8275041400237616029400555817119311541319,-0.5614596140805985235999742144485935568810} +#define T_3125_1268 {-0.8297552064986845055116759795055259019136,-0.5581274919660610400740097247762605547905} +#define T_3125_1269 {-0.8308757104693854644850148361001629382372,-0.5564580431191501475041150115430355072021} +#define T_3125_1271 {-0.8331066372286018983928101988567505031824,-0.5531124035904914082451000467699486762285} +#define T_3125_1272 {-0.8342170509983982684687475739337969571352,-0.5514362264337877572728530140011571347713} +#define T_3125_1274 {-0.8364277568736554480111067277903202921152,-0.5480771912160778969180796593718696385622} +#define T_3125_1277 {-0.8397184446008868485833431805076543241739,-0.5430220380399558566253404023882467299700} +#define T_3125_1278 {-0.8408085571362411414497728401329368352890,-0.5413325874603082121794273007253650575876} +#define T_3125_1281 {-0.8440584829249893061842158203944563865662,-0.5362511327749018841259953660483006387949} +#define T_3125_1283 {-0.8462080465104597770675809442764148116112,-0.5328526456919882026141976893995888531208} +#define T_3125_1284 {-0.8472776991656882916714721432072110474110,-0.5311501675576290049107797131000552326441} +#define T_3125_1286 {-0.8494067245803212928834113881748635321856,-0.5277387765151716481071275666181463748217} +#define T_3125_1287 {-0.8504660887329518192956356870126910507679,-0.5260298773979239506104477186454460024834} +#define T_3125_1289 
{-0.8525744985159912081584820953139569610357,-0.5226057065132432288834252176457084715366} +#define T_3125_1292 {-0.8557112530637190905835609555651899427176,-0.5174536224437120690566871417104266583920} +#define T_3125_1293 {-0.8567499249577132935939971503103151917458,-0.5157320681176928989231100786128081381321} +#define T_3125_1296 {-0.8598451429891451613940489551168866455555,-0.5105549236644150168373812448407988995314} +#define T_3125_1298 {-0.8618912486274211692816038521414157003164,-0.5070931625840214840295061549113597720861} +#define T_3125_1299 {-0.8629090771033275375501148118928540498018,-0.5053592035895690948166247835615649819374} +#define T_3125_1301 {-0.8649342647953848217667882636305876076221,-0.5018851637405385979207039781613275408745} +#define T_3125_1302 {-0.8659416158245348826838494460389483720064,-0.5001450969300744731071972637437283992767} +#define T_3125_1304 {-0.8679458118875658101387671194970607757568,-0.4966589047090913533466505214164499193430} +#define T_3125_1307 {-0.8709257803342459469675418404222000390291,-0.4914145756377040208384698871668661013246} +#define T_3125_1308 {-0.8719120669004601920093477929185610264540,-0.4896624833427281053843671543290838599205} +#define T_3125_1311 {-0.8748497620310663469567202810139860957861,-0.4843943578058961807464299909042892977595} +#define T_3125_1313 {-0.8767905487608365078244787582661956548691,-0.4808724712474931606287498198071261867881} +#define T_3125_1314 {-0.8777556273283773080606806615833193063736,-0.4791086084525791322086263335222611203790} +#define T_3125_1316 {-0.8796751353617080493307867072871886193752,-0.4755750794841552830938269380567362532020} +#define T_3125_1317 {-0.8806295570677167283690778276650235056877,-0.4738054275952494065471398698718985542655} +#define T_3125_1319 {-0.8825277165667645418167808202269952744246,-0.4702603847778935586276816138706635683775} +#define T_3125_1322 {-0.8853481885899753267565870373800862580538,-0.4649285804943050215420896620344137772918} +#define T_3125_1323 
{-0.8862811927871681039192708340124227106571,-0.4631475437824910668105360400659264996648} +#define T_3125_1326 {-0.8890586931426912586928779091977048665285,-0.4577932285949738999697444796765921637416} +#define T_3125_1328 {-0.8908923957916445868221444470691494643688,-0.4542144197629834345697474873304599896073} +#define T_3125_1329 {-0.8918038466985458434521660819882526993752,-0.4524222574251586803661950852983864024282} +#define T_3125_1331 {-0.8936159292541071552662401700217742472887,-0.4488324531306960851750886831723619252443} +#define T_3125_1332 {-0.8945165535772630072131050837924703955650,-0.4470348256861601532463623698276933282614} +#define T_3125_1334 {-0.8963069501100714431274241178471129387617,-0.4434341565377907801170920265576569363475} +#define T_3125_1337 {-0.8989653604515890350512563600204885005951,-0.4380197263915685534918509347335202619433} +#define T_3125_1338 {-0.8998442336995690071788089881010819226503,-0.4362113651403817793372752475988818332553} +#define T_3125_1339 {-0.9007194692475072317350281991821248084307,-0.4344012404661027337837708728329744189978} +#define T_3125_1341 {-0.9024590131051098973458124419266823679209,-0.4307757301257246207448758923419518396258} +#define T_3125_1343 {-0.9041839638953846902680311359290499240160,-0.4271432539961619867519004856148967519403} +#define T_3125_1344 {-0.9050409581644697221136652842687908560038,-0.4253244221118022139194181363563984632492} +#define T_3125_1346 {-0.9067439671281731783381019340595230460167,-0.4216816074678410575238274304865626618266} +#define T_3125_1347 {-0.9075899749382267955866154807154089212418,-0.4198576394346407170132806641049683094025} +#define T_3125_1349 {-0.9092709801144442938181100544170476496220,-0.4162046188135324897849898206914076581597} +#define T_3125_1352 {-0.9117649109133807439064867139677517116070,-0.4107124873036063217313085260684601962566} +#define T_3125_1353 {-0.9125888538648014858267742965836077928543,-0.4088784462425575916277864507719641551375} +#define T_3125_1356 
{-0.9150385341424104312935128291428554803133,-0.4033664351858106122961089567979797720909} +#define T_3125_1358 {-0.9166531642074609020909292667056433856487,-0.3996835955458388767169708444271236658096} +#define T_3125_1359 {-0.9174549223906167672026867876411415636539,-0.3978397483676402446306497040495742112398} +#define T_3125_1361 {-0.9190473088523105626990172822843305766582,-0.3941472365643651221134291517955716699362} +#define T_3125_1362 {-0.9198379306934851840082956186961382627487,-0.3922985868665948006039911888365168124437} +#define T_3125_1364 {-0.9214080156157081757939408817037474364042,-0.3885965372454093236243011233455035835505} +#define T_3125_1367 {-0.9237351986075890142302569074672646820545,-0.3830316995411713332231329331989400088787} +#define T_3125_1368 {-0.9245034618749669030890458998328540474176,-0.3811736467559129759585800911736441776156} +#define T_3125_1371 {-0.9267858150066293765334535237343516200781,-0.3755902729071904366797696184221422299743} +#define T_3125_1373 {-0.9282886558181315805882150016259402036667,-0.3718604193502803578752491375780664384365} +#define T_3125_1374 {-0.9290344487052275557203984135412611067295,-0.3699932338826940059206549449299927800894} +#define T_3125_1376 {-0.9305147643674520629986091080354526638985,-0.3662543833105415536088855787966167554259} +#define T_3125_1377 {-0.9312492811582734342223943713179323822260,-0.3643827333206102192519892923883162438869} +#define T_3125_1378 {-0.9319800332913663609701870882418006658554,-0.3625096102811947718080887170799542218447} +#define T_3125_1379 {-0.9327070178126004584839847666444256901741,-0.3606350217645611921923887166485656052828} +#define T_3125_1382 {-0.9348653363923824510806070975377224385738,-0.3550025391626905335584751810529269278049} +#define T_3125_1383 {-0.9355772212296750156568236889143008738756,-0.3531221645778693241091161780786933377385} +#define T_3125_1386 {-0.9376901713837241247873066640750039368868,-0.3474725060927296160784294443146791309118} +#define T_3125_1388 
{-0.9390798560872231082186090134200640022755,-0.3436990309721579817114900379237951710820} +#define T_3125_1389 {-0.9397690053693372735921229832456447184086,-0.3418102054461312766520109107659664005041} +#define T_3125_1391 {-0.9411359038645714658599672475247643887997,-0.3380284166412879653940137814061017706990} +#define T_3125_1392 {-0.9418136475518830152964255830738693475723,-0.3361354686506876077167760286101838573813} +#define T_3125_1394 {-0.9431577101067120105781782513076905161142,-0.3323455037551485413693796999723417684436} +#define T_3125_1397 {-0.9451452012538651414885748636152129620314,-0.3266504990762922333580320355395087972283} +#define T_3125_1398 {-0.9458000601919896999092429723532404750586,-0.3247495129185395956739057510276325047016} +#define T_3125_1401 {-0.9477416856111047227528842995525337755680,-0.3190387082393325024476382623106474056840} +#define T_3125_1403 {-0.9490169502691866432186884594557341188192,-0.3152250435827903540797478854074142873287} +#define T_3125_1404 {-0.9496488291555378324915182020049542188644,-0.3133162959112024137375840382446767762303} +#define T_3125_1406 {-0.9509010672707387534785539173753932118416,-0.3094950084627696718087008775910362601280} +#define T_3125_1407 {-0.9515214214373051149209459254052489995956,-0.3075824841338179460592527902917936444283} +#define T_3125_1409 {-0.9527505874491961623817815052461810410023,-0.3037537129241576483451581225381232798100} +#define T_3125_1412 {-0.9545654435130974668766157265054062008858,-0.2980013658536542764387888837518403306603} +#define T_3125_1413 {-0.9551626809488136249370882069342769682407,-0.2960814970961117564840492377697955816984} +#define T_3125_1416 {-0.9569312156978840855359180750383529812098,-0.2903147402784942987530314439936773851514} +#define T_3125_1417 {-0.9575129934843711332348448195261880755424,-0.2883901303938790072756148674670839682221} +#define T_3125_1418 {-0.9580909004397456518731246433162596076727,-0.2864643546666100215603023571020457893610} +#define T_3125_1419 
{-0.9586649342277675511780898887082003057003,-0.2845374208818063355153071825043298304081} +#define T_3125_1421 {-0.9598013730350903882637680908374022692442,-0.2806801103034470834529656713129952549934} +#define T_3125_1422 {-0.9603637734602369357972406760382000356913,-0.2787497491034112706564940253883833065629} +#define T_3125_1424 {-0.9614769249857414434856650586880277842283,-0.2748856538998767362613762088585644960403} +#define T_3125_1427 {-0.9631174953297583218869704069220460951328,-0.2690811962767615250768926671298686414957} +#define T_3125_1428 {-0.9636565680673772060771398173528723418713,-0.2671441910665556807025211583095369860530} +#define T_3125_1431 {-0.9652504036396454267787703429348766803741,-0.2613267270554267551752047893387498334050} +#define T_3125_1433 {-0.9662934537160118297904887185723055154085,-0.2574431224670444828817039706336800009012} +#define T_3125_1434 {-0.9668091203140533052007299374963622540236,-0.2554997551418913270815380656131310388446} +#define T_3125_1436 {-0.9678287262067215568350775356520898640156,-0.2516099297108022048163888939598109573126} +#define T_3125_1437 {-0.9683326613795012471896939132420811802149,-0.2496634873298301526833853358766646124423} +#define T_3125_1439 {-0.9693287859919468285596622081357054412365,-0.2457675825803282143589001407235627993941} +#define T_3125_1442 {-0.9707935784947135937628104329633060842752,-0.2399162936389029054939925345024676062167} +#define T_3125_1443 {-0.9712739962401419901993904204573482275009,-0.2379639137090004130659082193233189173043} +#define T_3125_1446 {-0.9726916830201622898854907361965160816908,-0.2321010335681512604111276232288219034672} +#define T_3125_1448 {-0.9736171497625963899835710435581859201193,-0.2281877421952325235565695038530975580215} +#define T_3125_1449 {-0.9740739801647710915943889631307683885098,-0.2262297088491279362099106720052077434957} +#define T_3125_1451 {-0.9749758257971561370425206405343487858772,-0.2223109064152127278823911638028221204877} +#define T_3125_1452 
{-0.9754208373815757004976489952241536229849,-0.2203501531695081505635158691802644170821} +#define T_3125_1454 {-0.9762990290918822644528063392499461770058,-0.2164259822531666710432318723178468644619} +#define T_3125_1456 {-0.9771614337168401664968087061424739658833,-0.2124983116555269890834978241400676779449} +#define T_3125_1457 {-0.9775867115044055699613068100006785243750,-0.2105331838214632711459017855304409749806} +#define T_3125_1458 {-0.9780080373110748848120010734419338405132,-0.2085672048883505380612035651211044751108} +#define T_3125_1461 {-0.9792482858931585987249945901567116379738,-0.2026642409881689588235786914083291776478} +#define T_3125_1463 {-0.9800553275768880689966522368195001035929,-0.1987248220232155082598524131753947585821} +#define T_3125_1464 {-0.9804529062896414348671214611385948956013,-0.1967539035145059456155536281585227698088} +#define T_3125_1466 {-0.9812361714206964125040144608647096902132,-0.1928096882826524360865505514084361493587} +#define T_3125_1467 {-0.9816218546725793414609029241546522825956,-0.1908364075043480490023739548632875084877} +#define T_3125_1469 {-0.9823813147532685086105175287229940295219,-0.1868875395087635782331858536053914576769} +#define T_3125_1472 {-0.9834907159106259788572401703277137130499,-0.1809585911682682901613361536874435842037} +#define T_3125_1473 {-0.9838525665769012329420206697250250726938,-0.1789808013169127964836491173628019168973} +#define T_3125_1476 {-0.9849142489378492903995265805860981345177,-0.1730431224845188353800295999462832696736} +#define T_3125_1478 {-0.9856021315473265653750445380865130573511,-0.1690811588834378598189545073182671330869} +#define T_3125_1479 {-0.9859400969673335346499243314610794186592,-0.1670991477897030008659129407533328048885} +#define T_3125_1481 {-0.9866040692066136985616253696207422763109,-0.1631331070780893399696509504792629741132} +#define T_3125_1482 {-0.9869300733417204485320439744100440293550,-0.1611490934932815199953637375074322335422} +#define T_3125_1484 
{-0.9875701110533692261839178172522224485874,-0.1571791199683850448476363226291141472757} +#define T_3125_1487 {-0.9885002219398999478983114386210218071938,-0.1512194141794249158117224851594073697925} +#define T_3125_1488 {-0.9888022683576072058286854371544905006886,-0.1492316122369873043762567021985887549818} +#define T_3125_1491 {-0.9896844188826626753652249135484453290701,-0.1432646188732108472052573233668226748705} +#define T_3125_1493 {-0.9902525167791625948510159105353523045778,-0.1392837140965672293457799923999118618667} +#define T_3125_1494 {-0.9905305615222100845684849446115549653769,-0.1372924130842458068091360701146186329424} +#define T_3125_1496 {-0.9910746369778019593255180552660021930933,-0.1333081540616251337016961997505859471858} +#define T_3125_1497 {-0.9913406654908731852060554956551641225815,-0.1313152121580459796845730124914553016424} +#define T_3125_1499 {-0.9918606987103513183967606892110779881477,-0.1273277438495385716077379356647725217044} +#define T_3125_1502 {-0.9926106733773698564959886425640434026718,-0.1213427010467644645563822791700658854097} +#define T_3125_1503 {-0.9928526408311242335003043990582227706909,-0.1193466949465402798802315942339191678911} +#define T_3125_1506 {-0.9935544571922120260509814215765800327063,-0.1133558141141820380104476839733251836151} +#define T_3125_1508 {-0.9940022536828588828328179261006880551577,-0.1093595888498011398093368029549310449511} +#define T_3125_1509 {-0.9942201248634163768258531490573659539223,-0.1073608090346411525484171534117194823921} +#define T_3125_1511 {-0.9946438086911856268912401901616249233484,-0.1033619554395718703743156652308243792504} +#define T_3125_1512 {-0.9948496196256179091577109829813707619905,-0.1013618978253825325674952750887314323336} +#define T_3125_1513 {-0.9950514087923183481620981183368712663651,-0.0993614304467405062837315199431031942368} +#define T_3125_1514 {-0.9952491753755364767286550886637996882200,-0.0973605613907123912387220343589433468878} +#define T_3125_1517 
{-0.9958183317107344167951055169396568089724,-0.0913556250531387947777517410941072739661} +#define T_3125_1518 {-0.9960000001277981018787954781146254390478,-0.0893532301902183606623353284703625831753} +#define T_3125_1521 {-0.9965208440132535816502468151156790554523,-0.0833439106780613225833320711899432353675} +#define T_3125_1523 {-0.9968479318209600981148810205922927707434,-0.0793359995479635538018214901967439800501} +#define T_3125_1524 {-0.9970054312821827746660119373700581490993,-0.0773315588477872795092338265021680854261} +#define T_3125_1526 {-0.9973083381358417431883367498812731355429,-0.0733217476927928019092561839897825848311} +#define T_3125_1527 {-0.9974537443037503336995541758369654417038,-0.0713163934479914690678370448040368501097} +#define T_3125_1529 {-0.9977324591826413069384216214530169963837,-0.0673048281578588897255954748288786504418} +#define T_3125_1532 {-0.9981202795304783270680104578786995261908,-0.0612854598579447365502836930772900814191} +#define T_3125_1533 {-0.9982414836809191394095819305221084505320,-0.0592784974380861848231027977362828096375} +#define T_3125_1534 {-0.9983586523517171684716231538914144039154,-0.0572712953796496171410268516410724259913} +#define T_3125_1536 {-0.9985808813760417290694704206543974578381,-0.0532562048051470268705820387822313932702} +#define T_3125_1538 {-0.9987869630099333040362807878409512341022,-0.0492402530598089055446919815040018875152} +#define T_3125_1539 {-0.9988839475038877591117625343031249940395,-0.0472319745410916952410573799170379061252} +#define T_3125_1541 {-0.9990658018854734478964019217528402805328,-0.0432148528047473987068194389848940772936} +#define T_3125_1542 {-0.9991506710379421951628842180070932954550,-0.0412060258266908410740292367790971184149} +#define T_3125_1544 {-0.9993082915507774943719709881406743079424,-0.0371878802550813167271392956081399461254} +#define T_3125_1547 {-0.9995144231832957526506788781262002885342,-0.0311595546913544275968721564140651025809} +#define T_3125_1548 
{-0.9995750528302664861257653683423995971680,-0.0291498500745719542337486274163893540390} +#define T_3125_1551 {-0.9997326956481690229949776949069928377867,-0.0231200616790997320881828613892139401287} +#define T_3125_1552 {-0.9997771605050388421531692983990069478750,-0.0211099344499641877748175033957522828132} +#define T_3125_1553 {-0.9998175836741561850473658523696940392256,-0.0190997218820482747569933223985572112724} +#define T_3125_1554 {-0.9998539649921069916160831780871376395226,-0.0170894321018144022816809268761062412523} +#define T_3125_1556 {-0.9999146015025498490658151240495499223471,-0.0130686534117702632395996431569074047729} +#define T_3125_1557 {-0.9999388564499135378937921814213041216135,-0.0110581807563140972067161627023779146839} +#define T_3125_1559 {-0.9999752392386617172448382007132750004530,-0.0070371094620699081029213672877631324809} +#define T_3125_1562 {-0.9999994946762972292120252859604079276323,-0.0010053094798131300031951429474474934977} +#define T_3125_1563 {-0.9999994946762972292120252859604079276323,0.0010053094798131300031951429474474934977} +#define T_3125_1566 {-0.9999752392386617172448382007132750004530,0.0070371094620699081029213672877631324809} +#define T_3125_1569 {-0.9999146015025498490658151240495499223471,0.0130686534117702632395996431569074047729} +#define T_3125_1572 {-0.9998175836741561850473658523696940392256,0.0190997218820482747569933223985572112724} +#define T_3125_1573 {-0.9997771605050388421531692983990069478750,0.0211099344499641877748175033957522828132} +#define T_3125_1578 {-0.9995144231832957526506788781262002885342,0.0311595546913544275968721564140651025809} +#define T_3125_1581 {-0.9993082915507774943719709881406743079424,0.0371878802550813167271392956081399461254} +#define T_3125_1584 {-0.9990658018854734478964019217528402805328,0.0432148528047473987068194389848940772936} +#define T_3125_1586 {-0.9988839475038877591117625343031249940395,0.0472319745410916952410573799170379061252} +#define T_3125_1587 
{-0.9987869630099333040362807878409512341022,0.0492402530598089055446919815040018875152} +#define T_3125_1593 {-0.9981202795304783270680104578786995261908,0.0612854598579447365502836930772900814191} +#define T_3125_1596 {-0.9977324591826413069384216214530169963837,0.0673048281578588897255954748288786504418} +#define T_3125_1599 {-0.9973083381358417431883367498812731355429,0.0733217476927928019092561839897825848311} +#define T_3125_1602 {-0.9968479318209600981148810205922927707434,0.0793359995479635538018214901967439800501} +#define T_3125_1608 {-0.9958183317107344167951055169396568089724,0.0913556250531387947777517410941072739661} +#define T_3125_1611 {-0.9952491753755364767286550886637996882200,0.0973605613907123912387220343589433468878} +#define T_3125_1612 {-0.9950514087923183481620981183368712663651,0.0993614304467405062837315199431031942368} +#define T_3125_1614 {-0.9946438086911856268912401901616249233484,0.1033619554395718703743156652308243792504} +#define T_3125_1617 {-0.9940022536828588828328179261006880551577,0.1093595888498011398093368029549310449511} +#define T_3125_1623 {-0.9926106733773698564959886425640434026718,0.1213427010467644645563822791700658854097} +#define T_3125_1626 {-0.9918606987103513183967606892110779881477,0.1273277438495385716077379356647725217044} +#define T_3125_1629 {-0.9910746369778019593255180552660021930933,0.1333081540616251337016961997505859471858} +#define T_3125_1632 {-0.9902525167791625948510159105353523045778,0.1392837140965672293457799923999118618667} +#define T_3125_1638 {-0.9885002219398999478983114386210218071938,0.1512194141794249158117224851594073697925} +#define T_3125_1641 {-0.9875701110533692261839178172522224485874,0.1571791199683850448476363226291141472757} +#define T_3125_1644 {-0.9866040692066136985616253696207422763109,0.1631331070780893399696509504792629741132} +#define T_3125_1647 {-0.9856021315473265653750445380865130573511,0.1690811588834378598189545073182671330869} +#define T_3125_1653 
{-0.9834907159106259788572401703277137130499,0.1809585911682682901613361536874435842037} +#define T_3125_1656 {-0.9823813147532685086105175287229940295219,0.1868875395087635782331858536053914576769} +#define T_3125_1659 {-0.9812361714206964125040144608647096902132,0.1928096882826524360865505514084361493587} +#define T_3125_1662 {-0.9800553275768880689966522368195001035929,0.1987248220232155082598524131753947585821} +#define T_3125_1668 {-0.9775867115044055699613068100006785243750,0.2105331838214632711459017855304409749806} +#define T_3125_1671 {-0.9762990290918822644528063392499461770058,0.2164259822531666710432318723178468644619} +#define T_3125_1674 {-0.9749758257971561370425206405343487858772,0.2223109064152127278823911638028221204877} +#define T_3125_1677 {-0.9736171497625963899835710435581859201193,0.2281877421952325235565695038530975580215} +#define T_3125_1683 {-0.9707935784947135937628104329633060842752,0.2399162936389029054939925345024676062167} +#define T_3125_1686 {-0.9693287859919468285596622081357054412365,0.2457675825803282143589001407235627993941} +#define T_3125_1689 {-0.9678287262067215568350775356520898640156,0.2516099297108022048163888939598109573126} +#define T_3125_1692 {-0.9662934537160118297904887185723055154085,0.2574431224670444828817039706336800009012} +#define T_3125_1698 {-0.9631174953297583218869704069220460951328,0.2690811962767615250768926671298686414957} +#define T_3125_1701 {-0.9614769249857414434856650586880277842283,0.2748856538998767362613762088585644960403} +#define T_3125_1704 {-0.9598013730350903882637680908374022692442,0.2806801103034470834529656713129952549934} +#define T_3125_1707 {-0.9580909004397456518731246433162596076727,0.2864643546666100215603023571020457893610} +#define T_3125_1713 {-0.9545654435130974668766157265054062008858,0.2980013658536542764387888837518403306603} +#define T_3125_1716 {-0.9527505874491961623817815052461810410023,0.3037537129241576483451581225381232798100} +#define T_3125_1719 
{-0.9509010672707387534785539173753932118416,0.3094950084627696718087008775910362601280} +#define T_3125_1722 {-0.9490169502691866432186884594557341188192,0.3152250435827903540797478854074142873287} +#define T_3125_1728 {-0.9451452012538651414885748636152129620314,0.3266504990762922333580320355395087972283} +#define T_3125_1731 {-0.9431577101067120105781782513076905161142,0.3323455037551485413693796999723417684436} +#define T_3125_1734 {-0.9411359038645714658599672475247643887997,0.3380284166412879653940137814061017706990} +#define T_3125_1737 {-0.9390798560872231082186090134200640022755,0.3436990309721579817114900379237951710820} +#define T_3125_1743 {-0.9348653363923824510806070975377224385738,0.3550025391626905335584751810529269278049} +#define T_3125_1746 {-0.9327070178126004584839847666444256901741,0.3606350217645611921923887166485656052828} +#define T_3125_1749 {-0.9305147643674520629986091080354526638985,0.3662543833105415536088855787966167554259} +#define T_3125_1752 {-0.9282886558181315805882150016259402036667,0.3718604193502803578752491375780664384365} +#define T_3125_1758 {-0.9237351986075890142302569074672646820545,0.3830316995411713332231329331989400088787} +#define T_3125_1761 {-0.9214080156157081757939408817037474364042,0.3885965372454093236243011233455035835505} +#define T_3125_1764 {-0.9190473088523105626990172822843305766582,0.3941472365643651221134291517955716699362} +#define T_3125_1767 {-0.9166531642074609020909292667056433856487,0.3996835955458388767169708444271236658096} +#define T_3125_1773 {-0.9117649109133807439064867139677517116070,0.4107124873036063217313085260684601962566} +#define T_3125_1776 {-0.9092709801144442938181100544170476496220,0.4162046188135324897849898206914076581597} +#define T_3125_1779 {-0.9067439671281731783381019340595230460167,0.4216816074678410575238274304865626618266} +#define T_3125_1782 {-0.9041839638953846902680311359290499240160,0.4271432539961619867519004856148967519403} +#define T_3125_1788 
{-0.8989653604515890350512563600204885005951,0.4380197263915685534918509347335202619433} +#define T_3125_1791 {-0.8963069501100714431274241178471129387617,0.4434341565377907801170920265576569363475} +#define T_3125_1794 {-0.8936159292541071552662401700217742472887,0.4488324531306960851750886831723619252443} +#define T_3125_1797 {-0.8908923957916445868221444470691494643688,0.4542144197629834345697474873304599896073} +#define T_3125_1803 {-0.8853481885899753267565870373800862580538,0.4649285804943050215420896620344137772918} +#define T_3125_1806 {-0.8825277165667645418167808202269952744246,0.4702603847778935586276816138706635683775} +#define T_3125_1809 {-0.8796751353617080493307867072871886193752,0.4755750794841552830938269380567362532020} +#define T_3125_1812 {-0.8767905487608365078244787582661956548691,0.4808724712474931606287498198071261867881} +#define T_3125_1818 {-0.8709257803342459469675418404222000390291,0.4914145756377040208384698871668661013246} +#define T_3125_1821 {-0.8679458118875658101387671194970607757568,0.4966589047090913533466505214164499193430} +#define T_3125_1824 {-0.8649342647953848217667882636305876076221,0.5018851637405385979207039781613275408745} +#define T_3125_1827 {-0.8618912486274211692816038521414157003164,0.5070931625840214840295061549113597720861} +#define T_3125_1833 {-0.8557112530637190905835609555651899427176,0.5174536224437120690566871417104266583920} +#define T_3125_1836 {-0.8525744985159912081584820953139569610357,0.5226057065132432288834252176457084715366} +#define T_3125_1839 {-0.8494067245803212928834113881748635321856,0.5277387765151716481071275666181463748217} +#define T_3125_1842 {-0.8462080465104597770675809442764148116112,0.5328526456919882026141976893995888531208} +#define T_3125_1848 {-0.8397184446008868485833431805076543241739,0.5430220380399558566253404023882467299700} +#define T_3125_1851 {-0.8364277568736554480111067277903202921152,0.5480771912160778969180796593718696385622} +#define T_3125_1854 
{-0.8331066372286018983928101988567505031824,0.5531124035904914082451000467699486762285} +#define T_3125_1857 {-0.8297552064986845055116759795055259019136,0.5581274919660610400740097247762605547905} +#define T_3125_1863 {-0.8229619006257250513058920660114381462336,0.5680965675996416663906529720406979322433} +#define T_3125_1866 {-0.8195202726448849173124244771315716207027,0.5730501921507865636939982323383446782827} +#define T_3125_1869 {-0.8160488278945919882190196403826121240854,0.5779829673025517200457557009940501302481} +#define T_3125_1872 {-0.8125476926771126873205730589688755571842,0.5828947135847952765175250533502548933029} +// Pre-computed twiddles for N=4096 +#define T_4096_1 {0.9999988234517018792502085489104501903057,-0.0015339801862847657169808268662336558918} +#define T_4096_3 {0.9999894110819284032132259198988322168589,-0.0046019261204485704952471536444136290811} +#define T_4096_5 {0.9999705864309741398798792033630888909101,-0.0076698287395310978803442125695255526807} +#define T_4096_7 {0.9999423496760239116198931696999352425337,-0.0107376591672644922798030009403191797901} +#define T_4096_9 {0.9999047010828528980752594179648440331221,-0.0138053885280603905877372383770307351369} +#define T_4096_11 {0.9998576410058238606026748129806946963072,-0.0168729879472817138885698540207158657722} +#define T_4096_13 {0.9998011698878842556936774599307682365179,-0.0199404285515144379103968930166956852190} +#define T_4096_15 {0.9997352882605616830602457412169314920902,-0.0230076814688393721519688028820382896811} +#define T_4096_17 {0.9996599967439592226980948908021673560143,-0.0260747178291039008457552483832841971889} +#define T_4096_19 {0.9995752960467492176377390933339484035969,-0.0291415087641937256557334023909788811579} +#define T_4096_21 {0.9994811869661669456732511207519564777613,-0.0322080254083045858237710490357130765915} +#define T_4096_23 {0.9993776703880028478010899561922997236252,-0.0352742388982139540298810231888637645170} +#define T_4096_25 
{0.9992647472865944235920210303447674959898,-0.0383401203735526940885591784535790793598} +#define T_4096_27 {0.9991424187248169053177093701378908008337,-0.0414056409770767394618040668774483492598} +#define T_4096_29 {0.9990106858540733769658004348457325249910,-0.0444707718549386676887280600567464716733} +#define T_4096_31 {0.9988695499142835609873714020068291574717,-0.0475354841569593025707440858695917995647} +#define T_4096_33 {0.9987190122338729381112898408900946378708,-0.0505997490368992816622828456729621393606} +#define T_4096_35 {0.9985590742297593136456157481006812304258,-0.0536635376527305266169953767985134618357} +#define T_4096_37 {0.9983897374073401609351208207954186946154,-0.0567268211669077482284251345845405012369} +#define T_4096_39 {0.9982110033604781884619683296477887779474,-0.0597895707466398751428471314284252002835} +#define T_4096_41 {0.9980228737714862408125782167189754545689,-0.0628517575641614201220619406740297563374} +#define T_4096_43 {0.9978253504111116445329798807506449520588,-0.0659133527970038185506140848701761569828} +#define T_4096_45 {0.9976184351385195547834427998168393969536,-0.0689743276282667461263287123074405826628} +#define T_4096_47 {0.9974021299012753027923849913349840790033,-0.0720346532468893185896519071320653893054} +#define T_4096_49 {0.9971764367353261881987691594986245036125,-0.0750943008479213192085666150887846015394} +#define T_4096_51 {0.9969413577649821611714742175536230206490,-0.0781532416327942458522670676757115870714} +#define T_4096_53 {0.9966968952028960604394569600117392838001,-0.0812114468095924413315600531859672628343} +#define T_4096_55 {0.9964430513500426300765866471920162439346,-0.0842688875933240710836003017902839928865} +#define T_4096_57 {0.9961798285956969811749672771838959306479,-0.0873255352061920731010502549906959757209} +#define T_4096_59 {0.9959072294174117212506303076224867254496,-0.0903813608778649829611495647441188339144} +#define T_4096_61 
{0.9956252563809943056938323024951387196779,-0.0934363358457477866103602082148427143693} +#define T_4096_63 {0.9953339121404822797956057911505922675133,-0.0964904313552525927377701009390875697136} +#define T_4096_65 {0.9950331994381186317966125898237805813551,-0.0995436186600693329040723256184719502926} +#define T_4096_67 {0.9947231211043257026460651104571297764778,-0.1025958690224362951370906671400007326156} +#define T_4096_69 {0.9944036800576790957606476695218589156866,-0.1056471537134106158939417241526825819165} +#define T_4096_71 {0.9940748793048793663373885465261992067099,-0.1086974440131387165120457893863203935325} +#define T_4096_73 {0.9937367219407245988449517426488455384970,-0.1117467112111266008822596518257341813296} +#define T_4096_75 {0.9933892111480806530465770265436731278896,-0.1147949266065100837330703598127001896501} +#define T_4096_77 {0.9930323501978514100230199801444541662931,-0.1178420615083249772814255607045197393745} +#define T_4096_79 {0.9926661424489480189947698818286880850792,-0.1208880872357770835945345311301934998482} +#define T_4096_81 {0.9922905913482573669881503519718535244465,-0.1239329751185121730738813994321390055120} +#define T_4096_83 {0.9919057004306093272560929108294658362865,-0.1269766964968858657947237134067108854651} +#define T_4096_85 {0.9915114733187438966766080739034805446863,-0.1300192227222333463121373142712400294840} +#define T_4096_87 {0.9911079137232768898613244346051942557096,-0.1330605251571390645892734028166159987450} +#define T_4096_89 {0.9906950254426646340633055842772591859102,-0.1361005751757062010032228727141045965254} +#define T_4096_91 {0.9902728123631691081740768822783138602972,-0.1391393441638262007398196828944492153823} +#define T_4096_93 {0.9898412784588205282076955882075708359480,-0.1421768035194480583882636892667505890131} +#define T_4096_95 {0.9894004277913803768740308441920205950737,-0.1452129246528474637578653982927789911628} +#define T_4096_97 
{0.9889502645103029898621116444701328873634,-0.1482476789868960309615175674480269663036} +#define T_4096_99 {0.9884907928526965870119624923972878605127,-0.1512810379573302221878350337647134438157} +#define T_4096_101 {0.9880220171432835263303218198416288942099,-0.1543129730130201049398408486013067886233} +#define T_4096_103 {0.9875439417943592257387308563920669257641,-0.1573434556162382480515304905566154047847} +#define T_4096_105 {0.9870565713057509737993200360506307333708,-0.1603724572429282568819530752080027014017} +#define T_4096_107 {0.9865599102647754081729658537369687110186,-0.1633999493829732252425657179628615267575} +#define T_4096_109 {0.9860539633461954389659354092145804315805,-0.1664259035404641318134366656522615812719} +#define T_4096_111 {0.9855387353121760618535063258605077862740,-0.1694502912339679590036922718354617245495} +#define T_4096_113 {0.9850142310122398381366792818880639970303,-0.1724730839967959783454176658779033459723} +#define T_4096_115 {0.9844804553832209315089585288660600781441,-0.1754942533772714252648938781931065022945} +#define T_4096_117 {0.9839374134492189227785274852067232131958,-0.1785137709389975346763179686604416929185} +#define T_4096_119 {0.9833851103215511813004923169501125812531,-0.1815316082611249937084352268357179127634} +#define T_4096_121 {0.9828235511987052364091255185485351830721,-0.1845477369386196164757762971930787898600} +#define T_4096_123 {0.9822527413662893724932700934004969894886,-0.1875621285825296025162600699331960640848} +#define T_4096_125 {0.9816726861969831130494412718690000474453,-0.1905747548202527674732209561625495553017} +#define T_4096_127 {0.9810833911504867055342060666589532047510,-0.1935855872958036349995580849281395785511} +#define T_4096_129 {0.9804848617734693849712357405223883688450,-0.1965945976700802233505527283341507427394} +#define T_4096_131 {0.9798771036995176375583582739636767655611,-0.1996017576211309707545638048031833022833} +#define T_4096_133 
{0.9792601226490820209846788202412426471710,-0.2026070388444211334277866853881278075278} +#define T_4096_135 {0.9786339244294232075915829227596987038851,-0.2056104130530992668557388469707802869380} +#define T_4096_137 {0.9779985149345571393553200323367491364479,-0.2086118519782634850301406004291493445635} +#define T_4096_139 {0.9773539001451999608249820994387846440077,-0.2116113273692275809079177406601957045496} +#define T_4096_141 {0.9767000861287118418374575412599369883537,-0.2146088109937867582921455777977826073766} +#define T_4096_143 {0.9760370790390390238755458085506688803434,-0.2176042746384836412687491247197613120079} +#define T_4096_145 {0.9753648851166569766490965776029042899609,-0.2205976901088735342426616625743918120861} +#define T_4096_147 {0.9746835106885106680962849168281536549330,-0.2235890292297899872853150782248121686280} +#define T_4096_149 {0.9739929621679558335856086159765254706144,-0.2265782638456100006596471985176322050393} +#define T_4096_151 {0.9732932460546982467164411900739651173353,-0.2295653658205188962782017370045650750399} +#define T_4096_153 {0.9725843689347322129634676457499153912067,-0.2325503070387752446723794719218858517706} +#define T_4096_155 {0.9718663374802793963880276351119391620159,-0.2355330594049755144059332678807550109923} +#define T_4096_157 {0.9711391584497250928365019717602990567684,-0.2385135948443184439327069412684068083763} +#define T_4096_159 {0.9704028386875555023394213094434235244989,-0.2414918853028693301876472787625971250236} +#define T_4096_161 {0.9696573851242924479976181828533299267292,-0.2444679027478241783999379777014837600291} +#define T_4096_163 {0.9689028047764288720244962860306259244680,-0.2474416191677732967946212738752365112305} +#define T_4096_165 {0.9681391047463624444091578880033921450377,-0.2504130065729652798722781881224364042282} +#define T_4096_167 {0.9673662922223285054457164733321405947208,-0.2533820369955701590214403040590696036816} +#define T_4096_169 
{0.9665843744783331192849118451704271137714,-0.2563486824899429139534845489833969622850} +#define T_4096_171 {0.9657933588740836849950710529810748994350,-0.2593129151328862347369863528001587837934} +#define T_4096_173 {0.9649932528549203247791865578619763255119,-0.2622747070239136446545558101206552237272} +#define T_4096_175 {0.9641840639517458289020623851683922111988,-0.2652340302855117903924053734954213723540} +#define T_4096_177 {0.9633657997809540463052258019160944968462,-0.2681908570634031763191273967095185071230} +#define T_4096_179 {0.9625384680443591634002586943097412586212,-0.2711451595268080105860519779525930061936} +#define T_4096_181 {0.9617020765291225403714747699268627911806,-0.2740969098687063842945121905358973890543} +#define T_4096_183 {0.9608566331076796585008992224175017327070,-0.2770460803060998955515970010310411453247} +#define T_4096_185 {0.9600021457376658462479213085316587239504,-0.2799926430802732180147529561509145423770} +#define T_4096_187 {0.9591386224618418943066444626310840249062,-0.2829365704570553363694784820836503058672} +#define T_4096_189 {0.9582660714080176722617920859192963689566,-0.2858778347270806152735644900531042367220} +#define T_4096_191 {0.9573845007889758562669157981872558593750,-0.2888164082060494797232763630745466798544} +#define T_4096_193 {0.9564939189023951016110913769807666540146,-0.2917522632349892619529896364838350564241} +#define T_4096_195 {0.9555943341307711058618679089704528450966,-0.2946853721805143266898596721148351207376} +#define T_4096_197 {0.9546857549413383381420317164156585931778,-0.2976157074350861964084913324768422171474} +#define T_4096_199 {0.9537681898859903251164382709248457103968,-0.3005432414172734545410037299006944522262} +#define T_4096_201 {0.9528416476011987157335170195437967777252,-0.3034679465720113156201875881379237398505} +#define T_4096_203 {0.9519061368079323459667762108438182622194,-0.3063897953708609733780576789285987615585} +#define T_4096_205 
{0.9509616663115750823109806333377491682768,-0.3093087603122687267998003335378598421812} +#define T_4096_207 {0.9500082450018429991445145788020454347134,-0.3122248139218249396442672605189727619290} +#define T_4096_209 {0.9490458818527005568910226429579779505730,-0.3151379287525223893418058196402853354812} +#define T_4096_211 {0.9480745859222762250695382135745603591204,-0.3180480773850149489589966833591461181641} +#define T_4096_213 {0.9470943663527772171661922584462445229292,-0.3209552324278752144515181043971097096801} +#define T_4096_215 {0.9461052323704034483498048757610376924276,-0.3238593665178529090731274209247203543782} +#define T_4096_217 {0.9451071932852606050090571443433873355389,-0.3267604523201317889835593177849659696221} +#define T_4096_219 {0.9441002584912726591781506613187957555056,-0.3296584625285874925459950190997915342450} +#define T_4096_221 {0.9430844374660934947840473796532023698092,-0.3325533698660442238903556244622450321913} +#define T_4096_223 {0.9420597397710173126483823580201715230942,-0.3354451470845316585212003701599314808846} +#define T_4096_225 {0.9410261750508892575339814356993883848190,-0.3383337669655411827918101153045427054167} +#define T_4096_227 {0.9399837530340139357676321196777280420065,-0.3412192023202824109340269842505222186446} +#define T_4096_229 {0.9389324835320644879743667843285948038101,-0.3441014259899388694208255401463247835636} +#define T_4096_231 {0.9378723764399898854549064708407968282700,-0.3469804108459236813288839584856759756804} +#define T_4096_233 {0.9368034417359215604292899115534964948893,-0.3498561297901349176342478131118696182966} +#define T_4096_235 {0.9357256894810803693474099418381229043007,-0.3527285557552107264633889371907571330667} +#define T_4096_237 {0.9346391298196807806419883490889333188534,-0.3555976617047839072327519716054666787386} +#define T_4096_239 {0.9335437729788361727045753468701150268316,-0.3584634206337365402994521446089493110776} +#define T_4096_241 
{0.9324396292684623555047096488124225288630,-0.3613258055684542835450656639295630156994} +#define T_4096_243 {0.9313267090811804260752637674158904701471,-0.3641847895670798918033028712670784443617} +#define T_4096_245 {0.9302050228922190688862770002742763608694,-0.3670403457197671803768912468513008207083} +#define T_4096_247 {0.9290745812593157459957637911429628729820,-0.3698924471489341003760387138754595071077} +#define T_4096_249 {0.9279353948226178872005220910068601369858,-0.3727410670095158140568969429295975714922} +#define T_4096_251 {0.9267874743045817487185900063195731490850,-0.3755861784892172150485123438556911423802} +#define T_4096_253 {0.9256308305098727151616344599460717290640,-0.3784277548087655596020795201184228062630} +#define T_4096_255 {0.9244654743252626039051733641827013343573,-0.3812657692221623761952287168242037296295} +#define T_4096_257 {0.9232914167195276355926125688711181282997,-0.3841001950169350420694058811932336539030} +#define T_4096_259 {0.9221086687433450723716532593243755400181,-0.3869310055143885818118576480628689751029} +#define T_4096_261 {0.9209172415291894120414895041903946548700,-0.3897581740698564667368941627501044422388} +#define T_4096_263 {0.9197171462912273609546787156432401388884,-0.3925816740729514697783031351718818768859} +#define T_4096_265 {0.9185083943252122518074997969961259514093,-0.3954014789478163538483102001919178292155} +#define T_4096_267 {0.9172909970083779063187989777361508458853,-0.3982175621533736165069683465844718739390} +#define T_4096_269 {0.9160649657993317207527184109494555741549,-0.4010298971835756232096059648029040545225} +#define T_4096_271 {0.9148303122379461971291902955272234976292,-0.4038384575676541299316113509121350944042} +#define T_4096_273 {0.9135870479452508075013383859186433255672,-0.4066432168703690286370999729115283116698} +#define T_4096_275 {0.9123351846233227480098548767273314297199,-0.4094441486922575923479428183782147243619} +#define T_4096_277 
{0.9110747340551763606697477371199056506157,-0.4122412266698828875455262732430128380656} +#define T_4096_279 {0.9098057081046522220901806576875969767570,-0.4150344244760816314609996879880782216787} +#define T_4096_281 {0.9085281187163061211720105347922071814537,-0.4178237158202123269212791001336881890893} +#define T_4096_283 {0.9072419779152958163592757045989856123924,-0.4206090744484025090166312565997941419482} +#define T_4096_285 {0.9059472978072684590244989522034302353859,-0.4233904741437960472794088673254009336233} +#define T_4096_287 {0.9046440905782462404971511205076240003109,-0.4261678887267996707066686212783679366112} +#define T_4096_289 {0.9033323684945118170475097940652631223202,-0.4289412920553294927827892024652101099491} +#define T_4096_291 {0.9020121439024931797590056703484151512384,-0.4317106580250572589463331496517639607191} +#define T_4096_293 {0.9006834292286468590660319932794664055109,-0.4344759605696557058784890159586211666465} +#define T_4096_295 {0.8993462369793415733809638368256855756044,-0.4372371736610440873249672222300432622433} +#define T_4096_297 {0.8980005797407398793197330633120145648718,-0.4399942713096332558286860603402601554990} +#define T_4096_299 {0.8966464701786801549943106692808214575052,-0.4427472275645700228174916901480173692107} +#define T_4096_301 {0.8952839210385574730821645061951130628586,-0.4454960165139817407364830614824313670397} +#define T_4096_303 {0.8939129451452032526503899134695529937744,-0.4482406122852199414907659047457855194807} +#define T_4096_305 {0.8925335554027645779129329639545176178217,-0.4509809890451038638659042590006720274687} +#define T_4096_307 {0.8911457647945831839209063218731898814440,-0.4537171210001638699260695375414798036218} +#define T_4096_309 {0.8897495863830727769183681630238424986601,-0.4564489823968839177226186620828229933977} +#define T_4096_311 {0.8883450333095963546981010949821211397648,-0.4591765475219441450249746594636235386133} +#define T_4096_313 
{0.8869321187943421946897615271154791116714,-0.4618997907024627314065412519994424656034} +#define T_4096_315 {0.8855108561361999530703315031132660806179,-0.4646186863062378158417686790926381945610} +#define T_4096_317 {0.8840812587126349875177311332663521170616,-0.4673332087419884151024973562016384676099} +#define T_4096_319 {0.8826433399795627909867334892624057829380,-0.4700433324595956197100576900993473827839} +#define T_4096_321 {0.8811971134712219821949474862776696681976,-0.4727490319503427906866477314906660467386} +#define T_4096_323 {0.8797425928000474071311032275843899697065,-0.4754502817471558673290132901456672698259} +#define T_4096_325 {0.8782797916565414642064979489077813923359,-0.4781470564248430643594645061966730281711} +#define T_4096_327 {0.8768087238091456514510468878143001347780,-0.4808393306003339584542288775992346927524} +#define T_4096_329 {0.8753294031041108924640070654277224093676,-0.4835270789329187413052579813665943220258} +#define T_4096_331 {0.8738418434653668631639789055043365806341,-0.4862102761244864179701608009054325520992} +#define T_4096_333 {0.8723460588943915405835127785394433885813,-0.4888888969197631717555907471250975504518} +#define T_4096_335 {0.8708420634700788642845736831077374517918,-0.4915629161065499519445154419372556731105} +#define T_4096_337 {0.8693298713486067308409133147506508976221,-0.4942323085159597284565791142085799947381} +#define T_4096_339 {0.8678094967633032119636027346132323145866,-0.4968970490226545244638600706821307539940} +#define T_4096_341 {0.8662809540245129946711699631123337894678,-0.4995571125450818938951158543204655870795} +#define T_4096_343 {0.8647442575194623781698055609012953937054,-0.5022124740457107883173648588126525282860} +#define T_4096_345 {0.8631994217121241597112657473189756274223,-0.5048631085312675903509216368547640740871} +#define T_4096_347 {0.8616464611430812992054484311665873974562,-0.5075089910529708703279538895003497600555} +#define T_4096_349 
{0.8600853904293901397437593914219178259373,-0.5101500967067668090848542306048329919577} +#define T_4096_351 {0.8585162242644427399440587578283157199621,-0.5127864006335630664423774760507512837648} +#define T_4096_353 {0.8569389774178287622063976414210628718138,-0.5154178780194630382638365517777856439352} +#define T_4096_355 {0.8553536647351960287011252148658968508244,-0.5180445040959993363571811642032116651535} +#define T_4096_357 {0.8537603011381114104239031803444959223270,-0.5206662541403671573547740081266965717077} +#define T_4096_359 {0.8521589016239198288715783746738452464342,-0.5232831034756564303478398869629018008709} +#define T_4096_361 {0.8505494812656034797626602994569111615419,-0.5258950274710846306547296080680098384619} +#define T_4096_363 {0.8489320552116396134678666385298129171133,-0.5285020015422284833661592529097106307745} +#define T_4096_365 {0.8473066386858584264629712379246484488249,-0.5311040011512550007566346721432637423277} +#define T_4096_367 {0.8456732469872990654025102230662014335394,-0.5337010018071529637850858307501766830683} +#define T_4096_369 {0.8440318954900664083496053535782266408205,-0.5362929790659631823501740655046887695789} +#define T_4096_371 {0.8423825996431858476043430528079625219107,-0.5388799085310084224786919548932928591967} +#define T_4096_373 {0.8407253749704580725321534373506437987089,-0.5414617658531234445362656515499111264944} +#define T_4096_375 {0.8390602370703127421691647214174736291170,-0.5440385267308839312150325895345304161310} +#define T_4096_377 {0.8373872016156619357829526961722876876593,-0.5466101669108348604098068790335673838854} +#define T_4096_379 {0.8357062843537526042325680464273318648338,-0.5491766621877197662726644011854659765959} +#define T_4096_381 {0.8340175011060181331501439672138076275587,-0.5517379884047074467545712650462519377470} +#define T_4096_383 {0.8323208677679296840778988553211092948914,-0.5542941214536201144369442772585898637772} +#define T_4096_385 
{0.8306164003088463143598119131638668477535,-0.5568450372751601040732793990173377096653} +#define T_4096_387 {0.8289041147718648749886938276176806539297,-0.5593907118591360250192678904568310827017} +#define T_4096_389 {0.8271840272736691312971402112452778965235,-0.5619311212446894687744247676164377480745} +#define T_4096_391 {0.8254561540043775513808554933348204940557,-0.5644662415205194960776680090930312871933} +#define T_4096_393 {0.8237205112273914275888841984851751476526,-0.5669960488251086783151322379126213490963} +#define T_4096_395 {0.8219771152792415547239102124876808375120,-0.5695205193469471405265380781202111393213} +#define T_4096_397 {0.8202259825694346861979511231766082346439,-0.5720396293247570485007713614322710782290} +#define T_4096_399 {0.8184671295802986579204230110917706042528,-0.5745533550477157636038327837013639509678} +#define T_4096_401 {0.8167005728668278452531126276880968362093,-0.5770616728556795527182998739590402692556} +#define T_4096_403 {0.8149263290565266215637052482634317129850,-0.5795645591394057438705544882395770400763} +#define T_4096_405 {0.8131444148492534829131272999802604317665,-0.5820619903407755479918250784976407885551} +#define T_4096_407 {0.8113548470170637294529569771839305758476,-0.5845539429530153263669944863067939877510} +#define T_4096_409 {0.8095576424040512586444151565956417471170,-0.5870403935209179691057101990736555308104} +#define T_4096_411 {0.8077528179261903584773563125054351985455,-0.5895213186410639405465872187050990760326} +#define T_4096_413 {0.8059403905711762794439323442929890006781,-0.5919966949620409923937813800876028835773} +#define T_4096_415 {0.8041203773982656954899539414327591657639,-0.5944664991846644319650749821448698639870} +#define T_4096_417 {0.8022927955381157216763199357956182211637,-0.5969307080621965022615427187702152878046} +#define T_4096_419 {0.8004576621926228208181441914348397403955,-0.5993892984005645407918905220867600291967} +#define T_4096_421 
{0.7986149946347608219454627942468505352736,-0.6018422470585800265752141058328561484814} +#define T_4096_423 {0.7967648102084187167193363166006747633219,-0.6042895309481560728315230335283558815718} +#define T_4096_425 {0.7949071263282370125580200692638754844666,-0.6067311270345244755830549365782644599676} +#define T_4096_427 {0.7930419604794436416739245032658800482750,-0.6091670123364532063447995824390091001987} +#define T_4096_429 {0.7911693302176902031774829993082676082850,-0.6115971639264619064135786175029352307320} +#define T_4096_431 {0.7892892531688856516680630193150136619806,-0.6140215589310383803578474726236891001463} +#define T_4096_433 {0.7874017470290314291148092706862371414900,-0.6164401745308536462175652559380978345871} +#define T_4096_435 {0.7855068295640539322022277701762504875660,-0.6188529879609763195702498705941252410412} +#define T_4096_437 {0.7836045186096382009210969954438041895628,-0.6212599765110876637308479075727518647909} +#define T_4096_439 {0.7816948320710593867133297862892504781485,-0.6236611175256945305278577507124282419682} +#define T_4096_441 {0.7797777879230144426614401709230151027441,-0.6260563884043435223247797694057226181030} +#define T_4096_443 {0.7778534042094530365218929546244908124208,-0.6284457666018327115509123359515797346830} +#define T_4096_445 {0.7759216990434075755800336082756984978914,-0.6308292296284245814774749305797740817070} +#define T_4096_447 {0.7739826906068227874158083068323321640491,-0.6332067550500573016591943087405525147915} +#define T_4096_449 {0.7720363971503845235133667301852256059647,-0.6355783204885561143981931309099309146404} +#define T_4096_451 {0.7700828369933480077591525514435488730669,-0.6379439036218440550740638173010665923357} +#define T_4096_453 {0.7681220285233654188061791501240804791451,-0.6403034821841516732732202399347443133593} +#define T_4096_455 {0.7661539901963129173267930127622094005346,-0.6426570339662268649405518772255163639784} +#define T_4096_457 
{0.7641787405361166740647149708820506930351,-0.6450045368155439273749607309582643210888} +#define T_4096_459 {0.7621962981345788978870814389665611088276,-0.6473459686365120591133859306864906102419} +#define T_4096_461 {0.7602066816512024205465536397241521626711,-0.6496813073906831936810135630366858094931} +#define T_4096_463 {0.7582099098130152814434268293553031980991,-0.6520105310969595002745791134657338261604} +#define T_4096_465 {0.7562060014143945352316222852095961570740,-0.6543336178318005513787625204713549464941} +#define T_4096_467 {0.7541949753168891712462595933175180107355,-0.6566505457294290470926512170990463346243} +#define T_4096_469 {0.7521768504490426998643215483753010630608,-0.6589612929820373166123204100586008280516} +#define T_4096_471 {0.7501516458062150727315042786358390003443,-0.6612658378399922654011788836214691400528} +#define T_4096_473 {0.7481193804504036037883452081587165594101,-0.6635641586120397672488024909398518502712} +#define T_4096_475 {0.7460800735100637792740485565445851534605,-0.6658562336655097224635824204597156494856} +#define T_4096_477 {0.7440337441799292905741936010599602013826,-0.6681420414265185625524168244737666100264} +#define T_4096_479 {0.7419804117208310678677207761211320757866,-0.6704215603801730871680319978622719645500} +#define T_4096_481 {0.7399200954595162027516153102624230086803,-0.6726947690707729687886740066460333764553} +#define T_4096_483 {0.7378528147884659826871711629792116582394,-0.6749616461020120361524732288671657443047} +#define T_4096_485 {0.7357785891657134813570451115083415061235,-0.6772221701371804458702285955951083451509} +#define T_4096_487 {0.7336974381146602608438911374832969158888,-0.6794763198993650776813524316821713000536} +#define T_4096_489 {0.7316093812238925186974825010111089795828,-0.6817240741716498186875128340034279972315} +#define T_4096_491 {0.7295144381469970129572288897179532796144,-0.6839654117973154034970661996339913457632} +#define T_4096_493 
{0.7274126286023757659293664801225531846285,-0.6862003116800385882356749789323657751083} +#define T_4096_495 {0.7253039723730607679641479990095831453800,-0.6884287527840904363785057284985668957233} +#define T_4096_497 {0.7231884893065274599877056971308775246143,-0.6906507141345346045824271641322411596775} +#define T_4096_499 {0.7210661993145081050116118603909853845835,-0.6928661748174246293174860511498991400003} +#define T_4096_501 {0.7189371223728044935086245459388010203838,-0.6950751139800008804314757071551866829395} +#define T_4096_503 {0.7168012785210995385654086931026540696621,-0.6972775108308865155137823421682696789503} +#define T_4096_505 {0.7146586878627690930798621593567077070475,-0.6994733446402837673261387863021809607744} +#define T_4096_507 {0.7125093705646923236685097435838542878628,-0.7016625947401684548765388171887025237083} +#define T_4096_509 {0.7103533468570623066185021343699190765619,-0.7038452405244849385823613374668639153242} +#define T_4096_511 {0.7081906370331952915719853081100154668093,-0.7060212614493397431658650020835921168327} +#define T_4096_513 {0.7060212614493397431658650020835921168327,-0.7081906370331952915719853081100154668093} +#define T_4096_515 {0.7038452405244849385823613374668639153242,-0.7103533468570623066185021343699190765619} +#define T_4096_517 {0.7016625947401684548765388171887025237083,-0.7125093705646923236685097435838542878628} +#define T_4096_519 {0.6994733446402837673261387863021809607744,-0.7146586878627690930798621593567077070475} +#define T_4096_521 {0.6972775108308865155137823421682696789503,-0.7168012785210995385654086931026540696621} +#define T_4096_523 {0.6950751139800008804314757071551866829395,-0.7189371223728044935086245459388010203838} +#define T_4096_525 {0.6928661748174246293174860511498991400003,-0.7210661993145081050116118603909853845835} +#define T_4096_527 {0.6906507141345346045824271641322411596775,-0.7231884893065274599877056971308775246143} +#define T_4096_529 
{0.6884287527840904363785057284985668957233,-0.7253039723730607679641479990095831453800} +#define T_4096_531 {0.6862003116800385882356749789323657751083,-0.7274126286023757659293664801225531846285} +#define T_4096_533 {0.6839654117973154034970661996339913457632,-0.7295144381469970129572288897179532796144} +#define T_4096_535 {0.6817240741716498186875128340034279972315,-0.7316093812238925186974825010111089795828} +#define T_4096_537 {0.6794763198993650776813524316821713000536,-0.7336974381146602608438911374832969158888} +#define T_4096_539 {0.6772221701371804458702285955951083451509,-0.7357785891657134813570451115083415061235} +#define T_4096_541 {0.6749616461020120361524732288671657443047,-0.7378528147884659826871711629792116582394} +#define T_4096_543 {0.6726947690707729687886740066460333764553,-0.7399200954595162027516153102624230086803} +#define T_4096_545 {0.6704215603801730871680319978622719645500,-0.7419804117208310678677207761211320757866} +#define T_4096_547 {0.6681420414265185625524168244737666100264,-0.7440337441799292905741936010599602013826} +#define T_4096_549 {0.6658562336655097224635824204597156494856,-0.7460800735100637792740485565445851534605} +#define T_4096_551 {0.6635641586120397672488024909398518502712,-0.7481193804504036037883452081587165594101} +#define T_4096_553 {0.6612658378399922654011788836214691400528,-0.7501516458062150727315042786358390003443} +#define T_4096_555 {0.6589612929820373166123204100586008280516,-0.7521768504490426998643215483753010630608} +#define T_4096_557 {0.6566505457294290470926512170990463346243,-0.7541949753168891712462595933175180107355} +#define T_4096_559 {0.6543336178318005513787625204713549464941,-0.7562060014143945352316222852095961570740} +#define T_4096_561 {0.6520105310969595002745791134657338261604,-0.7582099098130152814434268293553031980991} +#define T_4096_563 {0.6496813073906831936810135630366858094931,-0.7602066816512024205465536397241521626711} +#define T_4096_565 
{0.6473459686365120591133859306864906102419,-0.7621962981345788978870814389665611088276} +#define T_4096_567 {0.6450045368155439273749607309582643210888,-0.7641787405361166740647149708820506930351} +#define T_4096_569 {0.6426570339662268649405518772255163639784,-0.7661539901963129173267930127622094005346} +#define T_4096_571 {0.6403034821841516732732202399347443133593,-0.7681220285233654188061791501240804791451} +#define T_4096_573 {0.6379439036218440550740638173010665923357,-0.7700828369933480077591525514435488730669} +#define T_4096_575 {0.6355783204885561143981931309099309146404,-0.7720363971503845235133667301852256059647} +#define T_4096_577 {0.6332067550500573016591943087405525147915,-0.7739826906068227874158083068323321640491} +#define T_4096_579 {0.6308292296284245814774749305797740817070,-0.7759216990434075755800336082756984978914} +#define T_4096_581 {0.6284457666018327115509123359515797346830,-0.7778534042094530365218929546244908124208} +#define T_4096_583 {0.6260563884043435223247797694057226181030,-0.7797777879230144426614401709230151027441} +#define T_4096_585 {0.6236611175256945305278577507124282419682,-0.7816948320710593867133297862892504781485} +#define T_4096_587 {0.6212599765110876637308479075727518647909,-0.7836045186096382009210969954438041895628} +#define T_4096_589 {0.6188529879609763195702498705941252410412,-0.7855068295640539322022277701762504875660} +#define T_4096_591 {0.6164401745308536462175652559380978345871,-0.7874017470290314291148092706862371414900} +#define T_4096_593 {0.6140215589310383803578474726236891001463,-0.7892892531688856516680630193150136619806} +#define T_4096_595 {0.6115971639264619064135786175029352307320,-0.7911693302176902031774829993082676082850} +#define T_4096_597 {0.6091670123364532063447995824390091001987,-0.7930419604794436416739245032658800482750} +#define T_4096_599 {0.6067311270345244755830549365782644599676,-0.7949071263282370125580200692638754844666} +#define T_4096_601 
{0.6042895309481560728315230335283558815718,-0.7967648102084187167193363166006747633219} +#define T_4096_603 {0.6018422470585800265752141058328561484814,-0.7986149946347608219454627942468505352736} +#define T_4096_605 {0.5993892984005645407918905220867600291967,-0.8004576621926228208181441914348397403955} +#define T_4096_607 {0.5969307080621965022615427187702152878046,-0.8022927955381157216763199357956182211637} +#define T_4096_609 {0.5944664991846644319650749821448698639870,-0.8041203773982656954899539414327591657639} +#define T_4096_611 {0.5919966949620409923937813800876028835773,-0.8059403905711762794439323442929890006781} +#define T_4096_613 {0.5895213186410639405465872187050990760326,-0.8077528179261903584773563125054351985455} +#define T_4096_615 {0.5870403935209179691057101990736555308104,-0.8095576424040512586444151565956417471170} +#define T_4096_617 {0.5845539429530153263669944863067939877510,-0.8113548470170637294529569771839305758476} +#define T_4096_619 {0.5820619903407755479918250784976407885551,-0.8131444148492534829131272999802604317665} +#define T_4096_621 {0.5795645591394057438705544882395770400763,-0.8149263290565266215637052482634317129850} +#define T_4096_623 {0.5770616728556795527182998739590402692556,-0.8167005728668278452531126276880968362093} +#define T_4096_625 {0.5745533550477157636038327837013639509678,-0.8184671295802986579204230110917706042528} +#define T_4096_627 {0.5720396293247570485007713614322710782290,-0.8202259825694346861979511231766082346439} +#define T_4096_629 {0.5695205193469471405265380781202111393213,-0.8219771152792415547239102124876808375120} +#define T_4096_631 {0.5669960488251086783151322379126213490963,-0.8237205112273914275888841984851751476526} +#define T_4096_633 {0.5644662415205194960776680090930312871933,-0.8254561540043775513808554933348204940557} +#define T_4096_635 {0.5619311212446894687744247676164377480745,-0.8271840272736691312971402112452778965235} +#define T_4096_637 
{0.5593907118591360250192678904568310827017,-0.8289041147718648749886938276176806539297} +#define T_4096_639 {0.5568450372751601040732793990173377096653,-0.8306164003088463143598119131638668477535} +#define T_4096_641 {0.5542941214536201144369442772585898637772,-0.8323208677679296840778988553211092948914} +#define T_4096_643 {0.5517379884047074467545712650462519377470,-0.8340175011060181331501439672138076275587} +#define T_4096_645 {0.5491766621877197662726644011854659765959,-0.8357062843537526042325680464273318648338} +#define T_4096_647 {0.5466101669108348604098068790335673838854,-0.8373872016156619357829526961722876876593} +#define T_4096_649 {0.5440385267308839312150325895345304161310,-0.8390602370703127421691647214174736291170} +#define T_4096_651 {0.5414617658531234445362656515499111264944,-0.8407253749704580725321534373506437987089} +#define T_4096_653 {0.5388799085310084224786919548932928591967,-0.8423825996431858476043430528079625219107} +#define T_4096_655 {0.5362929790659631823501740655046887695789,-0.8440318954900664083496053535782266408205} +#define T_4096_657 {0.5337010018071529637850858307501766830683,-0.8456732469872990654025102230662014335394} +#define T_4096_659 {0.5311040011512550007566346721432637423277,-0.8473066386858584264629712379246484488249} +#define T_4096_661 {0.5285020015422284833661592529097106307745,-0.8489320552116396134678666385298129171133} +#define T_4096_663 {0.5258950274710846306547296080680098384619,-0.8505494812656034797626602994569111615419} +#define T_4096_665 {0.5232831034756564303478398869629018008709,-0.8521589016239198288715783746738452464342} +#define T_4096_667 {0.5206662541403671573547740081266965717077,-0.8537603011381114104239031803444959223270} +#define T_4096_669 {0.5180445040959993363571811642032116651535,-0.8553536647351960287011252148658968508244} +#define T_4096_671 {0.5154178780194630382638365517777856439352,-0.8569389774178287622063976414210628718138} +#define T_4096_673 
{0.5127864006335630664423774760507512837648,-0.8585162242644427399440587578283157199621} +#define T_4096_675 {0.5101500967067668090848542306048329919577,-0.8600853904293901397437593914219178259373} +#define T_4096_677 {0.5075089910529708703279538895003497600555,-0.8616464611430812992054484311665873974562} +#define T_4096_679 {0.5048631085312675903509216368547640740871,-0.8631994217121241597112657473189756274223} +#define T_4096_681 {0.5022124740457107883173648588126525282860,-0.8647442575194623781698055609012953937054} +#define T_4096_683 {0.4995571125450818938951158543204655870795,-0.8662809540245129946711699631123337894678} +#define T_4096_685 {0.4968970490226545244638600706821307539940,-0.8678094967633032119636027346132323145866} +#define T_4096_687 {0.4942323085159597284565791142085799947381,-0.8693298713486067308409133147506508976221} +#define T_4096_689 {0.4915629161065499519445154419372556731105,-0.8708420634700788642845736831077374517918} +#define T_4096_691 {0.4888888969197631717555907471250975504518,-0.8723460588943915405835127785394433885813} +#define T_4096_693 {0.4862102761244864179701608009054325520992,-0.8738418434653668631639789055043365806341} +#define T_4096_695 {0.4835270789329187413052579813665943220258,-0.8753294031041108924640070654277224093676} +#define T_4096_697 {0.4808393306003339584542288775992346927524,-0.8768087238091456514510468878143001347780} +#define T_4096_699 {0.4781470564248430643594645061966730281711,-0.8782797916565414642064979489077813923359} +#define T_4096_701 {0.4754502817471558673290132901456672698259,-0.8797425928000474071311032275843899697065} +#define T_4096_703 {0.4727490319503427906866477314906660467386,-0.8811971134712219821949474862776696681976} +#define T_4096_705 {0.4700433324595956197100576900993473827839,-0.8826433399795627909867334892624057829380} +#define T_4096_707 {0.4673332087419884151024973562016384676099,-0.8840812587126349875177311332663521170616} +#define T_4096_709 
{0.4646186863062378158417686790926381945610,-0.8855108561361999530703315031132660806179} +#define T_4096_711 {0.4618997907024627314065412519994424656034,-0.8869321187943421946897615271154791116714} +#define T_4096_713 {0.4591765475219441450249746594636235386133,-0.8883450333095963546981010949821211397648} +#define T_4096_715 {0.4564489823968839177226186620828229933977,-0.8897495863830727769183681630238424986601} +#define T_4096_717 {0.4537171210001638699260695375414798036218,-0.8911457647945831839209063218731898814440} +#define T_4096_719 {0.4509809890451038638659042590006720274687,-0.8925335554027645779129329639545176178217} +#define T_4096_721 {0.4482406122852199414907659047457855194807,-0.8939129451452032526503899134695529937744} +#define T_4096_723 {0.4454960165139817407364830614824313670397,-0.8952839210385574730821645061951130628586} +#define T_4096_725 {0.4427472275645700228174916901480173692107,-0.8966464701786801549943106692808214575052} +#define T_4096_727 {0.4399942713096332558286860603402601554990,-0.8980005797407398793197330633120145648718} +#define T_4096_729 {0.4372371736610440873249672222300432622433,-0.8993462369793415733809638368256855756044} +#define T_4096_731 {0.4344759605696557058784890159586211666465,-0.9006834292286468590660319932794664055109} +#define T_4096_733 {0.4317106580250572589463331496517639607191,-0.9020121439024931797590056703484151512384} +#define T_4096_735 {0.4289412920553294927827892024652101099491,-0.9033323684945118170475097940652631223202} +#define T_4096_737 {0.4261678887267996707066686212783679366112,-0.9046440905782462404971511205076240003109} +#define T_4096_739 {0.4233904741437960472794088673254009336233,-0.9059472978072684590244989522034302353859} +#define T_4096_741 {0.4206090744484025090166312565997941419482,-0.9072419779152958163592757045989856123924} +#define T_4096_743 {0.4178237158202123269212791001336881890893,-0.9085281187163061211720105347922071814537} +#define T_4096_745 
{0.4150344244760816314609996879880782216787,-0.9098057081046522220901806576875969767570} +#define T_4096_747 {0.4122412266698828875455262732430128380656,-0.9110747340551763606697477371199056506157} +#define T_4096_749 {0.4094441486922575923479428183782147243619,-0.9123351846233227480098548767273314297199} +#define T_4096_751 {0.4066432168703690286370999729115283116698,-0.9135870479452508075013383859186433255672} +#define T_4096_753 {0.4038384575676541299316113509121350944042,-0.9148303122379461971291902955272234976292} +#define T_4096_755 {0.4010298971835756232096059648029040545225,-0.9160649657993317207527184109494555741549} +#define T_4096_757 {0.3982175621533736165069683465844718739390,-0.9172909970083779063187989777361508458853} +#define T_4096_759 {0.3954014789478163538483102001919178292155,-0.9185083943252122518074997969961259514093} +#define T_4096_761 {0.3925816740729514697783031351718818768859,-0.9197171462912273609546787156432401388884} +#define T_4096_763 {0.3897581740698564667368941627501044422388,-0.9209172415291894120414895041903946548700} +#define T_4096_765 {0.3869310055143885818118576480628689751029,-0.9221086687433450723716532593243755400181} +#define T_4096_767 {0.3841001950169350420694058811932336539030,-0.9232914167195276355926125688711181282997} +#define T_4096_769 {0.3812657692221623761952287168242037296295,-0.9244654743252626039051733641827013343573} +#define T_4096_771 {0.3784277548087655596020795201184228062630,-0.9256308305098727151616344599460717290640} +#define T_4096_773 {0.3755861784892172150485123438556911423802,-0.9267874743045817487185900063195731490850} +#define T_4096_775 {0.3727410670095158140568969429295975714922,-0.9279353948226178872005220910068601369858} +#define T_4096_777 {0.3698924471489341003760387138754595071077,-0.9290745812593157459957637911429628729820} +#define T_4096_779 {0.3670403457197671803768912468513008207083,-0.9302050228922190688862770002742763608694} +#define T_4096_781 
{0.3641847895670798918033028712670784443617,-0.9313267090811804260752637674158904701471} +#define T_4096_783 {0.3613258055684542835450656639295630156994,-0.9324396292684623555047096488124225288630} +#define T_4096_785 {0.3584634206337365402994521446089493110776,-0.9335437729788361727045753468701150268316} +#define T_4096_787 {0.3555976617047839072327519716054666787386,-0.9346391298196807806419883490889333188534} +#define T_4096_789 {0.3527285557552107264633889371907571330667,-0.9357256894810803693474099418381229043007} +#define T_4096_791 {0.3498561297901349176342478131118696182966,-0.9368034417359215604292899115534964948893} +#define T_4096_793 {0.3469804108459236813288839584856759756804,-0.9378723764399898854549064708407968282700} +#define T_4096_795 {0.3441014259899388694208255401463247835636,-0.9389324835320644879743667843285948038101} +#define T_4096_797 {0.3412192023202824109340269842505222186446,-0.9399837530340139357676321196777280420065} +#define T_4096_799 {0.3383337669655411827918101153045427054167,-0.9410261750508892575339814356993883848190} +#define T_4096_801 {0.3354451470845316585212003701599314808846,-0.9420597397710173126483823580201715230942} +#define T_4096_803 {0.3325533698660442238903556244622450321913,-0.9430844374660934947840473796532023698092} +#define T_4096_805 {0.3296584625285874925459950190997915342450,-0.9441002584912726591781506613187957555056} +#define T_4096_807 {0.3267604523201317889835593177849659696221,-0.9451071932852606050090571443433873355389} +#define T_4096_809 {0.3238593665178529090731274209247203543782,-0.9461052323704034483498048757610376924276} +#define T_4096_811 {0.3209552324278752144515181043971097096801,-0.9470943663527772171661922584462445229292} +#define T_4096_813 {0.3180480773850149489589966833591461181641,-0.9480745859222762250695382135745603591204} +#define T_4096_815 {0.3151379287525223893418058196402853354812,-0.9490458818527005568910226429579779505730} +#define T_4096_817 
{0.3122248139218249396442672605189727619290,-0.9500082450018429991445145788020454347134} +#define T_4096_819 {0.3093087603122687267998003335378598421812,-0.9509616663115750823109806333377491682768} +#define T_4096_821 {0.3063897953708609733780576789285987615585,-0.9519061368079323459667762108438182622194} +#define T_4096_823 {0.3034679465720113156201875881379237398505,-0.9528416476011987157335170195437967777252} +#define T_4096_825 {0.3005432414172734545410037299006944522262,-0.9537681898859903251164382709248457103968} +#define T_4096_827 {0.2976157074350861964084913324768422171474,-0.9546857549413383381420317164156585931778} +#define T_4096_829 {0.2946853721805143266898596721148351207376,-0.9555943341307711058618679089704528450966} +#define T_4096_831 {0.2917522632349892619529896364838350564241,-0.9564939189023951016110913769807666540146} +#define T_4096_833 {0.2888164082060494797232763630745466798544,-0.9573845007889758562669157981872558593750} +#define T_4096_835 {0.2858778347270806152735644900531042367220,-0.9582660714080176722617920859192963689566} +#define T_4096_837 {0.2829365704570553363694784820836503058672,-0.9591386224618418943066444626310840249062} +#define T_4096_839 {0.2799926430802732180147529561509145423770,-0.9600021457376658462479213085316587239504} +#define T_4096_841 {0.2770460803060998955515970010310411453247,-0.9608566331076796585008992224175017327070} +#define T_4096_843 {0.2740969098687063842945121905358973890543,-0.9617020765291225403714747699268627911806} +#define T_4096_845 {0.2711451595268080105860519779525930061936,-0.9625384680443591634002586943097412586212} +#define T_4096_847 {0.2681908570634031763191273967095185071230,-0.9633657997809540463052258019160944968462} +#define T_4096_849 {0.2652340302855117903924053734954213723540,-0.9641840639517458289020623851683922111988} +#define T_4096_851 {0.2622747070239136446545558101206552237272,-0.9649932528549203247791865578619763255119} +#define T_4096_853 
{0.2593129151328862347369863528001587837934,-0.9657933588740836849950710529810748994350} +#define T_4096_855 {0.2563486824899429139534845489833969622850,-0.9665843744783331192849118451704271137714} +#define T_4096_857 {0.2533820369955701590214403040590696036816,-0.9673662922223285054457164733321405947208} +#define T_4096_859 {0.2504130065729652798722781881224364042282,-0.9681391047463624444091578880033921450377} +#define T_4096_861 {0.2474416191677732967946212738752365112305,-0.9689028047764288720244962860306259244680} +#define T_4096_863 {0.2444679027478241783999379777014837600291,-0.9696573851242924479976181828533299267292} +#define T_4096_865 {0.2414918853028693301876472787625971250236,-0.9704028386875555023394213094434235244989} +#define T_4096_867 {0.2385135948443184439327069412684068083763,-0.9711391584497250928365019717602990567684} +#define T_4096_869 {0.2355330594049755144059332678807550109923,-0.9718663374802793963880276351119391620159} +#define T_4096_871 {0.2325503070387752446723794719218858517706,-0.9725843689347322129634676457499153912067} +#define T_4096_873 {0.2295653658205188962782017370045650750399,-0.9732932460546982467164411900739651173353} +#define T_4096_875 {0.2265782638456100006596471985176322050393,-0.9739929621679558335856086159765254706144} +#define T_4096_877 {0.2235890292297899872853150782248121686280,-0.9746835106885106680962849168281536549330} +#define T_4096_879 {0.2205976901088735342426616625743918120861,-0.9753648851166569766490965776029042899609} +#define T_4096_881 {0.2176042746384836412687491247197613120079,-0.9760370790390390238755458085506688803434} +#define T_4096_883 {0.2146088109937867582921455777977826073766,-0.9767000861287118418374575412599369883537} +#define T_4096_885 {0.2116113273692275809079177406601957045496,-0.9773539001451999608249820994387846440077} +#define T_4096_887 {0.2086118519782634850301406004291493445635,-0.9779985149345571393553200323367491364479} +#define T_4096_889 
{0.2056104130530992668557388469707802869380,-0.9786339244294232075915829227596987038851} +#define T_4096_891 {0.2026070388444211334277866853881278075278,-0.9792601226490820209846788202412426471710} +#define T_4096_893 {0.1996017576211309707545638048031833022833,-0.9798771036995176375583582739636767655611} +#define T_4096_895 {0.1965945976700802233505527283341507427394,-0.9804848617734693849712357405223883688450} +#define T_4096_897 {0.1935855872958036349995580849281395785511,-0.9810833911504867055342060666589532047510} +#define T_4096_899 {0.1905747548202527674732209561625495553017,-0.9816726861969831130494412718690000474453} +#define T_4096_901 {0.1875621285825296025162600699331960640848,-0.9822527413662893724932700934004969894886} +#define T_4096_903 {0.1845477369386196164757762971930787898600,-0.9828235511987052364091255185485351830721} +#define T_4096_905 {0.1815316082611249937084352268357179127634,-0.9833851103215511813004923169501125812531} +#define T_4096_907 {0.1785137709389975346763179686604416929185,-0.9839374134492189227785274852067232131958} +#define T_4096_909 {0.1754942533772714252648938781931065022945,-0.9844804553832209315089585288660600781441} +#define T_4096_911 {0.1724730839967959783454176658779033459723,-0.9850142310122398381366792818880639970303} +#define T_4096_913 {0.1694502912339679590036922718354617245495,-0.9855387353121760618535063258605077862740} +#define T_4096_915 {0.1664259035404641318134366656522615812719,-0.9860539633461954389659354092145804315805} +#define T_4096_917 {0.1633999493829732252425657179628615267575,-0.9865599102647754081729658537369687110186} +#define T_4096_919 {0.1603724572429282568819530752080027014017,-0.9870565713057509737993200360506307333708} +#define T_4096_921 {0.1573434556162382480515304905566154047847,-0.9875439417943592257387308563920669257641} +#define T_4096_923 {0.1543129730130201049398408486013067886233,-0.9880220171432835263303218198416288942099} +#define T_4096_925 
{0.1512810379573302221878350337647134438157,-0.9884907928526965870119624923972878605127} +#define T_4096_927 {0.1482476789868960309615175674480269663036,-0.9889502645103029898621116444701328873634} +#define T_4096_929 {0.1452129246528474637578653982927789911628,-0.9894004277913803768740308441920205950737} +#define T_4096_931 {0.1421768035194480583882636892667505890131,-0.9898412784588205282076955882075708359480} +#define T_4096_933 {0.1391393441638262007398196828944492153823,-0.9902728123631691081740768822783138602972} +#define T_4096_935 {0.1361005751757062010032228727141045965254,-0.9906950254426646340633055842772591859102} +#define T_4096_937 {0.1330605251571390645892734028166159987450,-0.9911079137232768898613244346051942557096} +#define T_4096_939 {0.1300192227222333463121373142712400294840,-0.9915114733187438966766080739034805446863} +#define T_4096_941 {0.1269766964968858657947237134067108854651,-0.9919057004306093272560929108294658362865} +#define T_4096_943 {0.1239329751185121730738813994321390055120,-0.9922905913482573669881503519718535244465} +#define T_4096_945 {0.1208880872357770835945345311301934998482,-0.9926661424489480189947698818286880850792} +#define T_4096_947 {0.1178420615083249772814255607045197393745,-0.9930323501978514100230199801444541662931} +#define T_4096_949 {0.1147949266065100837330703598127001896501,-0.9933892111480806530465770265436731278896} +#define T_4096_951 {0.1117467112111266008822596518257341813296,-0.9937367219407245988449517426488455384970} +#define T_4096_953 {0.1086974440131387165120457893863203935325,-0.9940748793048793663373885465261992067099} +#define T_4096_955 {0.1056471537134106158939417241526825819165,-0.9944036800576790957606476695218589156866} +#define T_4096_957 {0.1025958690224362951370906671400007326156,-0.9947231211043257026460651104571297764778} +#define T_4096_959 {0.0995436186600693329040723256184719502926,-0.9950331994381186317966125898237805813551} +#define T_4096_961 
{0.0964904313552525927377701009390875697136,-0.9953339121404822797956057911505922675133} +#define T_4096_963 {0.0934363358457477866103602082148427143693,-0.9956252563809943056938323024951387196779} +#define T_4096_965 {0.0903813608778649829611495647441188339144,-0.9959072294174117212506303076224867254496} +#define T_4096_967 {0.0873255352061920731010502549906959757209,-0.9961798285956969811749672771838959306479} +#define T_4096_969 {0.0842688875933240710836003017902839928865,-0.9964430513500426300765866471920162439346} +#define T_4096_971 {0.0812114468095924413315600531859672628343,-0.9966968952028960604394569600117392838001} +#define T_4096_973 {0.0781532416327942458522670676757115870714,-0.9969413577649821611714742175536230206490} +#define T_4096_975 {0.0750943008479213192085666150887846015394,-0.9971764367353261881987691594986245036125} +#define T_4096_977 {0.0720346532468893185896519071320653893054,-0.9974021299012753027923849913349840790033} +#define T_4096_979 {0.0689743276282667461263287123074405826628,-0.9976184351385195547834427998168393969536} +#define T_4096_981 {0.0659133527970038185506140848701761569828,-0.9978253504111116445329798807506449520588} +#define T_4096_983 {0.0628517575641614201220619406740297563374,-0.9980228737714862408125782167189754545689} +#define T_4096_985 {0.0597895707466398751428471314284252002835,-0.9982110033604781884619683296477887779474} +#define T_4096_987 {0.0567268211669077482284251345845405012369,-0.9983897374073401609351208207954186946154} +#define T_4096_989 {0.0536635376527305266169953767985134618357,-0.9985590742297593136456157481006812304258} +#define T_4096_991 {0.0505997490368992816622828456729621393606,-0.9987190122338729381112898408900946378708} +#define T_4096_993 {0.0475354841569593025707440858695917995647,-0.9988695499142835609873714020068291574717} +#define T_4096_995 {0.0444707718549386676887280600567464716733,-0.9990106858540733769658004348457325249910} +#define T_4096_997 
{0.0414056409770767394618040668774483492598,-0.9991424187248169053177093701378908008337} +#define T_4096_999 {0.0383401203735526940885591784535790793598,-0.9992647472865944235920210303447674959898} +#define T_4096_1001 {0.0352742388982139540298810231888637645170,-0.9993776703880028478010899561922997236252} +#define T_4096_1003 {0.0322080254083045858237710490357130765915,-0.9994811869661669456732511207519564777613} +#define T_4096_1005 {0.0291415087641937256557334023909788811579,-0.9995752960467492176377390933339484035969} +#define T_4096_1007 {0.0260747178291039008457552483832841971889,-0.9996599967439592226980948908021673560143} +#define T_4096_1009 {0.0230076814688393721519688028820382896811,-0.9997352882605616830602457412169314920902} +#define T_4096_1011 {0.0199404285515144379103968930166956852190,-0.9998011698878842556936774599307682365179} +#define T_4096_1013 {0.0168729879472817138885698540207158657722,-0.9998576410058238606026748129806946963072} +#define T_4096_1015 {0.0138053885280603905877372383770307351369,-0.9999047010828528980752594179648440331221} +#define T_4096_1017 {0.0107376591672644922798030009403191797901,-0.9999423496760239116198931696999352425337} +#define T_4096_1019 {0.0076698287395310978803442125695255526807,-0.9999705864309741398798792033630888909101} +#define T_4096_1021 {0.0046019261204485704952471536444136290811,-0.9999894110819284032132259198988322168589} +#define T_4096_1023 {0.0015339801862847657169808268662336558918,-0.9999988234517018792502085489104501903057} +#define T_4096_1025 {-0.0015339801862847657169808268662336558918,-0.9999988234517018792502085489104501903057} +#define T_4096_1029 {-0.0076698287395310978803442125695255526807,-0.9999705864309741398798792033630888909101} +#define T_4096_1035 {-0.0168729879472817138885698540207158657722,-0.9998576410058238606026748129806946963072} +#define T_4096_1037 {-0.0199404285515144379103968930166956852190,-0.9998011698878842556936774599307682365179} +#define T_4096_1041 
{-0.0260747178291039008457552483832841971889,-0.9996599967439592226980948908021673560143} +#define T_4096_1045 {-0.0322080254083045858237710490357130765915,-0.9994811869661669456732511207519564777613} +#define T_4096_1047 {-0.0352742388982139540298810231888637645170,-0.9993776703880028478010899561922997236252} +#define T_4096_1053 {-0.0444707718549386676887280600567464716733,-0.9990106858540733769658004348457325249910} +#define T_4096_1055 {-0.0475354841569593025707440858695917995647,-0.9988695499142835609873714020068291574717} +#define T_4096_1059 {-0.0536635376527305266169953767985134618357,-0.9985590742297593136456157481006812304258} +#define T_4096_1065 {-0.0628517575641614201220619406740297563374,-0.9980228737714862408125782167189754545689} +#define T_4096_1071 {-0.0720346532468893185896519071320653893054,-0.9974021299012753027923849913349840790033} +#define T_4096_1075 {-0.0781532416327942458522670676757115870714,-0.9969413577649821611714742175536230206490} +#define T_4096_1077 {-0.0812114468095924413315600531859672628343,-0.9966968952028960604394569600117392838001} +#define T_4096_1083 {-0.0903813608778649829611495647441188339144,-0.9959072294174117212506303076224867254496} +#define T_4096_1085 {-0.0934363358457477866103602082148427143693,-0.9956252563809943056938323024951387196779} +#define T_4096_1089 {-0.0995436186600693329040723256184719502926,-0.9950331994381186317966125898237805813551} +#define T_4096_1095 {-0.1086974440131387165120457893863203935325,-0.9940748793048793663373885465261992067099} +#define T_4096_1101 {-0.1178420615083249772814255607045197393745,-0.9930323501978514100230199801444541662931} +#define T_4096_1105 {-0.1239329751185121730738813994321390055120,-0.9922905913482573669881503519718535244465} +#define T_4096_1107 {-0.1269766964968858657947237134067108854651,-0.9919057004306093272560929108294658362865} +#define T_4096_1113 {-0.1361005751757062010032228727141045965254,-0.9906950254426646340633055842772591859102} +#define T_4096_1115 
{-0.1391393441638262007398196828944492153823,-0.9902728123631691081740768822783138602972} +#define T_4096_1119 {-0.1452129246528474637578653982927789911628,-0.9894004277913803768740308441920205950737} +#define T_4096_1125 {-0.1543129730130201049398408486013067886233,-0.9880220171432835263303218198416288942099} +#define T_4096_1131 {-0.1633999493829732252425657179628615267575,-0.9865599102647754081729658537369687110186} +#define T_4096_1135 {-0.1694502912339679590036922718354617245495,-0.9855387353121760618535063258605077862740} +#define T_4096_1137 {-0.1724730839967959783454176658779033459723,-0.9850142310122398381366792818880639970303} +#define T_4096_1139 {-0.1754942533772714252648938781931065022945,-0.9844804553832209315089585288660600781441} +#define T_4096_1143 {-0.1815316082611249937084352268357179127634,-0.9833851103215511813004923169501125812531} +#define T_4096_1145 {-0.1845477369386196164757762971930787898600,-0.9828235511987052364091255185485351830721} +#define T_4096_1149 {-0.1905747548202527674732209561625495553017,-0.9816726861969831130494412718690000474453} +#define T_4096_1155 {-0.1996017576211309707545638048031833022833,-0.9798771036995176375583582739636767655611} +#define T_4096_1161 {-0.2086118519782634850301406004291493445635,-0.9779985149345571393553200323367491364479} +#define T_4096_1165 {-0.2146088109937867582921455777977826073766,-0.9767000861287118418374575412599369883537} +#define T_4096_1167 {-0.2176042746384836412687491247197613120079,-0.9760370790390390238755458085506688803434} +#define T_4096_1173 {-0.2265782638456100006596471985176322050393,-0.9739929621679558335856086159765254706144} +#define T_4096_1175 {-0.2295653658205188962782017370045650750399,-0.9732932460546982467164411900739651173353} +#define T_4096_1179 {-0.2355330594049755144059332678807550109923,-0.9718663374802793963880276351119391620159} +#define T_4096_1185 {-0.2444679027478241783999379777014837600291,-0.9696573851242924479976181828533299267292} +#define T_4096_1191 
{-0.2533820369955701590214403040590696036816,-0.9673662922223285054457164733321405947208} +#define T_4096_1195 {-0.2593129151328862347369863528001587837934,-0.9657933588740836849950710529810748994350} +#define T_4096_1197 {-0.2622747070239136446545558101206552237272,-0.9649932528549203247791865578619763255119} +#define T_4096_1203 {-0.2711451595268080105860519779525930061936,-0.9625384680443591634002586943097412586212} +#define T_4096_1205 {-0.2740969098687063842945121905358973890543,-0.9617020765291225403714747699268627911806} +#define T_4096_1207 {-0.2770460803060998955515970010310411453247,-0.9608566331076796585008992224175017327070} +#define T_4096_1209 {-0.2799926430802732180147529561509145423770,-0.9600021457376658462479213085316587239504} +#define T_4096_1215 {-0.2888164082060494797232763630745466798544,-0.9573845007889758562669157981872558593750} +#define T_4096_1221 {-0.2976157074350861964084913324768422171474,-0.9546857549413383381420317164156585931778} +#define T_4096_1225 {-0.3034679465720113156201875881379237398505,-0.9528416476011987157335170195437967777252} +#define T_4096_1227 {-0.3063897953708609733780576789285987615585,-0.9519061368079323459667762108438182622194} +#define T_4096_1233 {-0.3151379287525223893418058196402853354812,-0.9490458818527005568910226429579779505730} +#define T_4096_1235 {-0.3180480773850149489589966833591461181641,-0.9480745859222762250695382135745603591204} +#define T_4096_1239 {-0.3238593665178529090731274209247203543782,-0.9461052323704034483498048757610376924276} +#define T_4096_1241 {-0.3267604523201317889835593177849659696221,-0.9451071932852606050090571443433873355389} +#define T_4096_1245 {-0.3325533698660442238903556244622450321913,-0.9430844374660934947840473796532023698092} +#define T_4096_1251 {-0.3412192023202824109340269842505222186446,-0.9399837530340139357676321196777280420065} +#define T_4096_1255 {-0.3469804108459236813288839584856759756804,-0.9378723764399898854549064708407968282700} +#define T_4096_1257 
{-0.3498561297901349176342478131118696182966,-0.9368034417359215604292899115534964948893} +#define T_4096_1263 {-0.3584634206337365402994521446089493110776,-0.9335437729788361727045753468701150268316} +#define T_4096_1265 {-0.3613258055684542835450656639295630156994,-0.9324396292684623555047096488124225288630} +#define T_4096_1269 {-0.3670403457197671803768912468513008207083,-0.9302050228922190688862770002742763608694} +#define T_4096_1275 {-0.3755861784892172150485123438556911423802,-0.9267874743045817487185900063195731490850} +#define T_4096_1281 {-0.3841001950169350420694058811932336539030,-0.9232914167195276355926125688711181282997} +#define T_4096_1285 {-0.3897581740698564667368941627501044422388,-0.9209172415291894120414895041903946548700} +#define T_4096_1287 {-0.3925816740729514697783031351718818768859,-0.9197171462912273609546787156432401388884} +#define T_4096_1293 {-0.4010298971835756232096059648029040545225,-0.9160649657993317207527184109494555741549} +#define T_4096_1295 {-0.4038384575676541299316113509121350944042,-0.9148303122379461971291902955272234976292} +#define T_4096_1299 {-0.4094441486922575923479428183782147243619,-0.9123351846233227480098548767273314297199} +#define T_4096_1305 {-0.4178237158202123269212791001336881890893,-0.9085281187163061211720105347922071814537} +#define T_4096_1309 {-0.4233904741437960472794088673254009336233,-0.9059472978072684590244989522034302353859} +#define T_4096_1311 {-0.4261678887267996707066686212783679366112,-0.9046440905782462404971511205076240003109} +#define T_4096_1315 {-0.4317106580250572589463331496517639607191,-0.9020121439024931797590056703484151512384} +#define T_4096_1317 {-0.4344759605696557058784890159586211666465,-0.9006834292286468590660319932794664055109} +#define T_4096_1323 {-0.4427472275645700228174916901480173692107,-0.8966464701786801549943106692808214575052} +#define T_4096_1325 {-0.4454960165139817407364830614824313670397,-0.8952839210385574730821645061951130628586} +#define T_4096_1329 
{-0.4509809890451038638659042590006720274687,-0.8925335554027645779129329639545176178217} +#define T_4096_1335 {-0.4591765475219441450249746594636235386133,-0.8883450333095963546981010949821211397648} +#define T_4096_1341 {-0.4673332087419884151024973562016384676099,-0.8840812587126349875177311332663521170616} +#define T_4096_1343 {-0.4700433324595956197100576900993473827839,-0.8826433399795627909867334892624057829380} +#define T_4096_1345 {-0.4727490319503427906866477314906660467386,-0.8811971134712219821949474862776696681976} +#define T_4096_1347 {-0.4754502817471558673290132901456672698259,-0.8797425928000474071311032275843899697065} +#define T_4096_1353 {-0.4835270789329187413052579813665943220258,-0.8753294031041108924640070654277224093676} +#define T_4096_1355 {-0.4862102761244864179701608009054325520992,-0.8738418434653668631639789055043365806341} +#define T_4096_1359 {-0.4915629161065499519445154419372556731105,-0.8708420634700788642845736831077374517918} +#define T_4096_1365 {-0.4995571125450818938951158543204655870795,-0.8662809540245129946711699631123337894678} +#define T_4096_1371 {-0.5075089910529708703279538895003497600555,-0.8616464611430812992054484311665873974562} +#define T_4096_1375 {-0.5127864006335630664423774760507512837648,-0.8585162242644427399440587578283157199621} +#define T_4096_1377 {-0.5154178780194630382638365517777856439352,-0.8569389774178287622063976414210628718138} +#define T_4096_1383 {-0.5232831034756564303478398869629018008709,-0.8521589016239198288715783746738452464342} +#define T_4096_1385 {-0.5258950274710846306547296080680098384619,-0.8505494812656034797626602994569111615419} +#define T_4096_1389 {-0.5311040011512550007566346721432637423277,-0.8473066386858584264629712379246484488249} +#define T_4096_1395 {-0.5388799085310084224786919548932928591967,-0.8423825996431858476043430528079625219107} +#define T_4096_1401 {-0.5466101669108348604098068790335673838854,-0.8373872016156619357829526961722876876593} +#define T_4096_1405 
{-0.5517379884047074467545712650462519377470,-0.8340175011060181331501439672138076275587} +#define T_4096_1407 {-0.5542941214536201144369442772585898637772,-0.8323208677679296840778988553211092948914} +#define T_4096_1411 {-0.5593907118591360250192678904568310827017,-0.8289041147718648749886938276176806539297} +#define T_4096_1413 {-0.5619311212446894687744247676164377480745,-0.8271840272736691312971402112452778965235} +#define T_4096_1415 {-0.5644662415205194960776680090930312871933,-0.8254561540043775513808554933348204940557} +#define T_4096_1419 {-0.5695205193469471405265380781202111393213,-0.8219771152792415547239102124876808375120} +#define T_4096_1425 {-0.5770616728556795527182998739590402692556,-0.8167005728668278452531126276880968362093} +#define T_4096_1431 {-0.5845539429530153263669944863067939877510,-0.8113548470170637294529569771839305758476} +#define T_4096_1435 {-0.5895213186410639405465872187050990760326,-0.8077528179261903584773563125054351985455} +#define T_4096_1437 {-0.5919966949620409923937813800876028835773,-0.8059403905711762794439323442929890006781} +#define T_4096_1443 {-0.5993892984005645407918905220867600291967,-0.8004576621926228208181441914348397403955} +#define T_4096_1445 {-0.6018422470585800265752141058328561484814,-0.7986149946347608219454627942468505352736} +#define T_4096_1449 {-0.6067311270345244755830549365782644599676,-0.7949071263282370125580200692638754844666} +#define T_4096_1455 {-0.6140215589310383803578474726236891001463,-0.7892892531688856516680630193150136619806} +#define T_4096_1461 {-0.6212599765110876637308479075727518647909,-0.7836045186096382009210969954438041895628} +#define T_4096_1465 {-0.6260563884043435223247797694057226181030,-0.7797777879230144426614401709230151027441} +#define T_4096_1467 {-0.6284457666018327115509123359515797346830,-0.7778534042094530365218929546244908124208} +#define T_4096_1473 {-0.6355783204885561143981931309099309146404,-0.7720363971503845235133667301852256059647} +#define T_4096_1475 
{-0.6379439036218440550740638173010665923357,-0.7700828369933480077591525514435488730669} +#define T_4096_1479 {-0.6426570339662268649405518772255163639784,-0.7661539901963129173267930127622094005346} +#define T_4096_1485 {-0.6496813073906831936810135630366858094931,-0.7602066816512024205465536397241521626711} +#define T_4096_1491 {-0.6566505457294290470926512170990463346243,-0.7541949753168891712462595933175180107355} +#define T_4096_1495 {-0.6612658378399922654011788836214691400528,-0.7501516458062150727315042786358390003443} +#define T_4096_1497 {-0.6635641586120397672488024909398518502712,-0.7481193804504036037883452081587165594101} +#define T_4096_1503 {-0.6704215603801730871680319978622719645500,-0.7419804117208310678677207761211320757866} +#define T_4096_1505 {-0.6726947690707729687886740066460333764553,-0.7399200954595162027516153102624230086803} +#define T_4096_1509 {-0.6772221701371804458702285955951083451509,-0.7357785891657134813570451115083415061235} +#define T_4096_1513 {-0.6817240741716498186875128340034279972315,-0.7316093812238925186974825010111089795828} +#define T_4096_1515 {-0.6839654117973154034970661996339913457632,-0.7295144381469970129572288897179532796144} +#define T_4096_1521 {-0.6906507141345346045824271641322411596775,-0.7231884893065274599877056971308775246143} +#define T_4096_1525 {-0.6950751139800008804314757071551866829395,-0.7189371223728044935086245459388010203838} +#define T_4096_1527 {-0.6972775108308865155137823421682696789503,-0.7168012785210995385654086931026540696621} +#define T_4096_1533 {-0.7038452405244849385823613374668639153242,-0.7103533468570623066185021343699190765619} +#define T_4096_1535 {-0.7060212614493397431658650020835921168327,-0.7081906370331952915719853081100154668093} +#define T_4096_1539 {-0.7103533468570623066185021343699190765619,-0.7038452405244849385823613374668639153242} +#define T_4096_1545 {-0.7168012785210995385654086931026540696621,-0.6972775108308865155137823421682696789503} +#define T_4096_1547 
{-0.7189371223728044935086245459388010203838,-0.6950751139800008804314757071551866829395} +#define T_4096_1551 {-0.7231884893065274599877056971308775246143,-0.6906507141345346045824271641322411596775} +#define T_4096_1555 {-0.7274126286023757659293664801225531846285,-0.6862003116800385882356749789323657751083} +#define T_4096_1557 {-0.7295144381469970129572288897179532796144,-0.6839654117973154034970661996339913457632} +#define T_4096_1563 {-0.7357785891657134813570451115083415061235,-0.6772221701371804458702285955951083451509} +#define T_4096_1565 {-0.7378528147884659826871711629792116582394,-0.6749616461020120361524732288671657443047} +#define T_4096_1569 {-0.7419804117208310678677207761211320757866,-0.6704215603801730871680319978622719645500} +#define T_4096_1575 {-0.7481193804504036037883452081587165594101,-0.6635641586120397672488024909398518502712} +#define T_4096_1581 {-0.7541949753168891712462595933175180107355,-0.6566505457294290470926512170990463346243} +#define T_4096_1585 {-0.7582099098130152814434268293553031980991,-0.6520105310969595002745791134657338261604} +#define T_4096_1587 {-0.7602066816512024205465536397241521626711,-0.6496813073906831936810135630366858094931} +#define T_4096_1593 {-0.7661539901963129173267930127622094005346,-0.6426570339662268649405518772255163639784} +#define T_4096_1595 {-0.7681220285233654188061791501240804791451,-0.6403034821841516732732202399347443133593} +#define T_4096_1599 {-0.7720363971503845235133667301852256059647,-0.6355783204885561143981931309099309146404} +#define T_4096_1605 {-0.7778534042094530365218929546244908124208,-0.6284457666018327115509123359515797346830} +#define T_4096_1611 {-0.7836045186096382009210969954438041895628,-0.6212599765110876637308479075727518647909} +#define T_4096_1615 {-0.7874017470290314291148092706862371414900,-0.6164401745308536462175652559380978345871} +#define T_4096_1617 {-0.7892892531688856516680630193150136619806,-0.6140215589310383803578474726236891001463} +#define T_4096_1623 
{-0.7949071263282370125580200692638754844666,-0.6067311270345244755830549365782644599676} +#define T_4096_1625 {-0.7967648102084187167193363166006747633219,-0.6042895309481560728315230335283558815718} +#define T_4096_1629 {-0.8004576621926228208181441914348397403955,-0.5993892984005645407918905220867600291967} +#define T_4096_1635 {-0.8059403905711762794439323442929890006781,-0.5919966949620409923937813800876028835773} +#define T_4096_1641 {-0.8113548470170637294529569771839305758476,-0.5845539429530153263669944863067939877510} +#define T_4096_1645 {-0.8149263290565266215637052482634317129850,-0.5795645591394057438705544882395770400763} +#define T_4096_1647 {-0.8167005728668278452531126276880968362093,-0.5770616728556795527182998739590402692556} +#define T_4096_1649 {-0.8184671295802986579204230110917706042528,-0.5745533550477157636038327837013639509678} +#define T_4096_1653 {-0.8219771152792415547239102124876808375120,-0.5695205193469471405265380781202111393213} +#define T_4096_1655 {-0.8237205112273914275888841984851751476526,-0.5669960488251086783151322379126213490963} +#define T_4096_1659 {-0.8271840272736691312971402112452778965235,-0.5619311212446894687744247676164377480745} +#define T_4096_1665 {-0.8323208677679296840778988553211092948914,-0.5542941214536201144369442772585898637772} +#define T_4096_1671 {-0.8373872016156619357829526961722876876593,-0.5466101669108348604098068790335673838854} +#define T_4096_1675 {-0.8407253749704580725321534373506437987089,-0.5414617658531234445362656515499111264944} +#define T_4096_1677 {-0.8423825996431858476043430528079625219107,-0.5388799085310084224786919548932928591967} +#define T_4096_1683 {-0.8473066386858584264629712379246484488249,-0.5311040011512550007566346721432637423277} +#define T_4096_1685 {-0.8489320552116396134678666385298129171133,-0.5285020015422284833661592529097106307745} +#define T_4096_1689 {-0.8521589016239198288715783746738452464342,-0.5232831034756564303478398869629018008709} +#define T_4096_1695 
{-0.8569389774178287622063976414210628718138,-0.5154178780194630382638365517777856439352} +#define T_4096_1701 {-0.8616464611430812992054484311665873974562,-0.5075089910529708703279538895003497600555} +#define T_4096_1705 {-0.8647442575194623781698055609012953937054,-0.5022124740457107883173648588126525282860} +#define T_4096_1707 {-0.8662809540245129946711699631123337894678,-0.4995571125450818938951158543204655870795} +#define T_4096_1713 {-0.8708420634700788642845736831077374517918,-0.4915629161065499519445154419372556731105} +#define T_4096_1715 {-0.8723460588943915405835127785394433885813,-0.4888888969197631717555907471250975504518} +#define T_4096_1717 {-0.8738418434653668631639789055043365806341,-0.4862102761244864179701608009054325520992} +#define T_4096_1719 {-0.8753294031041108924640070654277224093676,-0.4835270789329187413052579813665943220258} +#define T_4096_1725 {-0.8797425928000474071311032275843899697065,-0.4754502817471558673290132901456672698259} +#define T_4096_1731 {-0.8840812587126349875177311332663521170616,-0.4673332087419884151024973562016384676099} +#define T_4096_1735 {-0.8869321187943421946897615271154791116714,-0.4618997907024627314065412519994424656034} +#define T_4096_1737 {-0.8883450333095963546981010949821211397648,-0.4591765475219441450249746594636235386133} +#define T_4096_1743 {-0.8925335554027645779129329639545176178217,-0.4509809890451038638659042590006720274687} +#define T_4096_1745 {-0.8939129451452032526503899134695529937744,-0.4482406122852199414907659047457855194807} +#define T_4096_1749 {-0.8966464701786801549943106692808214575052,-0.4427472275645700228174916901480173692107} +#define T_4096_1751 {-0.8980005797407398793197330633120145648718,-0.4399942713096332558286860603402601554990} +#define T_4096_1755 {-0.9006834292286468590660319932794664055109,-0.4344759605696557058784890159586211666465} +#define T_4096_1761 {-0.9046440905782462404971511205076240003109,-0.4261678887267996707066686212783679366112} +#define T_4096_1765 
{-0.9072419779152958163592757045989856123924,-0.4206090744484025090166312565997941419482} +#define T_4096_1767 {-0.9085281187163061211720105347922071814537,-0.4178237158202123269212791001336881890893} +#define T_4096_1773 {-0.9123351846233227480098548767273314297199,-0.4094441486922575923479428183782147243619} +#define T_4096_1775 {-0.9135870479452508075013383859186433255672,-0.4066432168703690286370999729115283116698} +#define T_4096_1779 {-0.9160649657993317207527184109494555741549,-0.4010298971835756232096059648029040545225} +#define T_4096_1785 {-0.9197171462912273609546787156432401388884,-0.3925816740729514697783031351718818768859} +#define T_4096_1791 {-0.9232914167195276355926125688711181282997,-0.3841001950169350420694058811932336539030} +#define T_4096_1795 {-0.9256308305098727151616344599460717290640,-0.3784277548087655596020795201184228062630} +#define T_4096_1797 {-0.9267874743045817487185900063195731490850,-0.3755861784892172150485123438556911423802} +#define T_4096_1803 {-0.9302050228922190688862770002742763608694,-0.3670403457197671803768912468513008207083} +#define T_4096_1805 {-0.9313267090811804260752637674158904701471,-0.3641847895670798918033028712670784443617} +#define T_4096_1809 {-0.9335437729788361727045753468701150268316,-0.3584634206337365402994521446089493110776} +#define T_4096_1815 {-0.9368034417359215604292899115534964948893,-0.3498561297901349176342478131118696182966} +#define T_4096_1819 {-0.9389324835320644879743667843285948038101,-0.3441014259899388694208255401463247835636} +#define T_4096_1821 {-0.9399837530340139357676321196777280420065,-0.3412192023202824109340269842505222186446} +#define T_4096_1825 {-0.9420597397710173126483823580201715230942,-0.3354451470845316585212003701599314808846} +#define T_4096_1827 {-0.9430844374660934947840473796532023698092,-0.3325533698660442238903556244622450321913} +#define T_4096_1833 {-0.9461052323704034483498048757610376924276,-0.3238593665178529090731274209247203543782} +#define T_4096_1835 
{-0.9470943663527772171661922584462445229292,-0.3209552324278752144515181043971097096801} +#define T_4096_1839 {-0.9490458818527005568910226429579779505730,-0.3151379287525223893418058196402853354812} +#define T_4096_1845 {-0.9519061368079323459667762108438182622194,-0.3063897953708609733780576789285987615585} +#define T_4096_1851 {-0.9546857549413383381420317164156585931778,-0.2976157074350861964084913324768422171474} +#define T_4096_1853 {-0.9555943341307711058618679089704528450966,-0.2946853721805143266898596721148351207376} +#define T_4096_1855 {-0.9564939189023951016110913769807666540146,-0.2917522632349892619529896364838350564241} +#define T_4096_1857 {-0.9573845007889758562669157981872558593750,-0.2888164082060494797232763630745466798544} +#define T_4096_1863 {-0.9600021457376658462479213085316587239504,-0.2799926430802732180147529561509145423770} +#define T_4096_1865 {-0.9608566331076796585008992224175017327070,-0.2770460803060998955515970010310411453247} +#define T_4096_1869 {-0.9625384680443591634002586943097412586212,-0.2711451595268080105860519779525930061936} +#define T_4096_1875 {-0.9649932528549203247791865578619763255119,-0.2622747070239136446545558101206552237272} +#define T_4096_1881 {-0.9673662922223285054457164733321405947208,-0.2533820369955701590214403040590696036816} +#define T_4096_1885 {-0.9689028047764288720244962860306259244680,-0.2474416191677732967946212738752365112305} +#define T_4096_1887 {-0.9696573851242924479976181828533299267292,-0.2444679027478241783999379777014837600291} +#define T_4096_1893 {-0.9718663374802793963880276351119391620159,-0.2355330594049755144059332678807550109923} +#define T_4096_1895 {-0.9725843689347322129634676457499153912067,-0.2325503070387752446723794719218858517706} +#define T_4096_1899 {-0.9739929621679558335856086159765254706144,-0.2265782638456100006596471985176322050393} +#define T_4096_1905 {-0.9760370790390390238755458085506688803434,-0.2176042746384836412687491247197613120079} +#define T_4096_1911 
{-0.9779985149345571393553200323367491364479,-0.2086118519782634850301406004291493445635} +#define T_4096_1915 {-0.9792601226490820209846788202412426471710,-0.2026070388444211334277866853881278075278} +#define T_4096_1917 {-0.9798771036995176375583582739636767655611,-0.1996017576211309707545638048031833022833} +#define T_4096_1921 {-0.9810833911504867055342060666589532047510,-0.1935855872958036349995580849281395785511} +#define T_4096_1923 {-0.9816726861969831130494412718690000474453,-0.1905747548202527674732209561625495553017} +#define T_4096_1925 {-0.9822527413662893724932700934004969894886,-0.1875621285825296025162600699331960640848} +#define T_4096_1929 {-0.9833851103215511813004923169501125812531,-0.1815316082611249937084352268357179127634} +#define T_4096_1935 {-0.9850142310122398381366792818880639970303,-0.1724730839967959783454176658779033459723} +#define T_4096_1941 {-0.9865599102647754081729658537369687110186,-0.1633999493829732252425657179628615267575} +#define T_4096_1945 {-0.9875439417943592257387308563920669257641,-0.1573434556162382480515304905566154047847} +#define T_4096_1947 {-0.9880220171432835263303218198416288942099,-0.1543129730130201049398408486013067886233} +#define T_4096_1953 {-0.9894004277913803768740308441920205950737,-0.1452129246528474637578653982927789911628} +#define T_4096_1955 {-0.9898412784588205282076955882075708359480,-0.1421768035194480583882636892667505890131} +#define T_4096_1959 {-0.9906950254426646340633055842772591859102,-0.1361005751757062010032228727141045965254} +#define T_4096_1965 {-0.9919057004306093272560929108294658362865,-0.1269766964968858657947237134067108854651} +#define T_4096_1971 {-0.9930323501978514100230199801444541662931,-0.1178420615083249772814255607045197393745} +#define T_4096_1975 {-0.9937367219407245988449517426488455384970,-0.1117467112111266008822596518257341813296} +#define T_4096_1977 {-0.9940748793048793663373885465261992067099,-0.1086974440131387165120457893863203935325} +#define T_4096_1983 
{-0.9950331994381186317966125898237805813551,-0.0995436186600693329040723256184719502926} +#define T_4096_1985 {-0.9953339121404822797956057911505922675133,-0.0964904313552525927377701009390875697136} +#define T_4096_1989 {-0.9959072294174117212506303076224867254496,-0.0903813608778649829611495647441188339144} +#define T_4096_1995 {-0.9966968952028960604394569600117392838001,-0.0812114468095924413315600531859672628343} +#define T_4096_2001 {-0.9974021299012753027923849913349840790033,-0.0720346532468893185896519071320653893054} +#define T_4096_2005 {-0.9978253504111116445329798807506449520588,-0.0659133527970038185506140848701761569828} +#define T_4096_2007 {-0.9980228737714862408125782167189754545689,-0.0628517575641614201220619406740297563374} +#define T_4096_2013 {-0.9985590742297593136456157481006812304258,-0.0536635376527305266169953767985134618357} +#define T_4096_2015 {-0.9987190122338729381112898408900946378708,-0.0505997490368992816622828456729621393606} +#define T_4096_2019 {-0.9990106858540733769658004348457325249910,-0.0444707718549386676887280600567464716733} +#define T_4096_2023 {-0.9992647472865944235920210303447674959898,-0.0383401203735526940885591784535790793598} +#define T_4096_2025 {-0.9993776703880028478010899561922997236252,-0.0352742388982139540298810231888637645170} +#define T_4096_2031 {-0.9996599967439592226980948908021673560143,-0.0260747178291039008457552483832841971889} +#define T_4096_2035 {-0.9998011698878842556936774599307682365179,-0.0199404285515144379103968930166956852190} +#define T_4096_2037 {-0.9998576410058238606026748129806946963072,-0.0168729879472817138885698540207158657722} +#define T_4096_2043 {-0.9999705864309741398798792033630888909101,-0.0076698287395310978803442125695255526807} +#define T_4096_2045 {-0.9999894110819284032132259198988322168589,-0.0046019261204485704952471536444136290811} +#define T_4096_2049 {-0.9999988234517018792502085489104501903057,0.0015339801862847657169808268662336558918} +#define T_4096_2055 
{-0.9999423496760239116198931696999352425337,0.0107376591672644922798030009403191797901} +#define T_4096_2057 {-0.9999047010828528980752594179648440331221,0.0138053885280603905877372383770307351369} +#define T_4096_2061 {-0.9998011698878842556936774599307682365179,0.0199404285515144379103968930166956852190} +#define T_4096_2065 {-0.9996599967439592226980948908021673560143,0.0260747178291039008457552483832841971889} +#define T_4096_2067 {-0.9995752960467492176377390933339484035969,0.0291415087641937256557334023909788811579} +#define T_4096_2073 {-0.9992647472865944235920210303447674959898,0.0383401203735526940885591784535790793598} +#define T_4096_2075 {-0.9991424187248169053177093701378908008337,0.0414056409770767394618040668774483492598} +#define T_4096_2079 {-0.9988695499142835609873714020068291574717,0.0475354841569593025707440858695917995647} +#define T_4096_2085 {-0.9983897374073401609351208207954186946154,0.0567268211669077482284251345845405012369} +#define T_4096_2091 {-0.9978253504111116445329798807506449520588,0.0659133527970038185506140848701761569828} +#define T_4096_2095 {-0.9974021299012753027923849913349840790033,0.0720346532468893185896519071320653893054} +#define T_4096_2097 {-0.9971764367353261881987691594986245036125,0.0750943008479213192085666150887846015394} +#define T_4096_2103 {-0.9964430513500426300765866471920162439346,0.0842688875933240710836003017902839928865} +#define T_4096_2105 {-0.9961798285956969811749672771838959306479,0.0873255352061920731010502549906959757209} +#define T_4096_2109 {-0.9956252563809943056938323024951387196779,0.0934363358457477866103602082148427143693} +#define T_4096_2115 {-0.9947231211043257026460651104571297764778,0.1025958690224362951370906671400007326156} +#define T_4096_2121 {-0.9937367219407245988449517426488455384970,0.1117467112111266008822596518257341813296} +#define T_4096_2125 {-0.9930323501978514100230199801444541662931,0.1178420615083249772814255607045197393745} +#define T_4096_2127 
{-0.9926661424489480189947698818286880850792,0.1208880872357770835945345311301934998482} +#define T_4096_2133 {-0.9915114733187438966766080739034805446863,0.1300192227222333463121373142712400294840} +#define T_4096_2135 {-0.9911079137232768898613244346051942557096,0.1330605251571390645892734028166159987450} +#define T_4096_2139 {-0.9902728123631691081740768822783138602972,0.1391393441638262007398196828944492153823} +#define T_4096_2145 {-0.9889502645103029898621116444701328873634,0.1482476789868960309615175674480269663036} +#define T_4096_2151 {-0.9875439417943592257387308563920669257641,0.1573434556162382480515304905566154047847} +#define T_4096_2155 {-0.9865599102647754081729658537369687110186,0.1633999493829732252425657179628615267575} +#define T_4096_2157 {-0.9860539633461954389659354092145804315805,0.1664259035404641318134366656522615812719} +#define T_4096_2159 {-0.9855387353121760618535063258605077862740,0.1694502912339679590036922718354617245495} +#define T_4096_2163 {-0.9844804553832209315089585288660600781441,0.1754942533772714252648938781931065022945} +#define T_4096_2165 {-0.9839374134492189227785274852067232131958,0.1785137709389975346763179686604416929185} +#define T_4096_2169 {-0.9828235511987052364091255185485351830721,0.1845477369386196164757762971930787898600} +#define T_4096_2175 {-0.9810833911504867055342060666589532047510,0.1935855872958036349995580849281395785511} +#define T_4096_2181 {-0.9792601226490820209846788202412426471710,0.2026070388444211334277866853881278075278} +#define T_4096_2185 {-0.9779985149345571393553200323367491364479,0.2086118519782634850301406004291493445635} +#define T_4096_2187 {-0.9773539001451999608249820994387846440077,0.2116113273692275809079177406601957045496} +#define T_4096_2193 {-0.9753648851166569766490965776029042899609,0.2205976901088735342426616625743918120861} +#define T_4096_2195 {-0.9746835106885106680962849168281536549330,0.2235890292297899872853150782248121686280} +#define T_4096_2199 
{-0.9732932460546982467164411900739651173353,0.2295653658205188962782017370045650750399} +#define T_4096_2205 {-0.9711391584497250928365019717602990567684,0.2385135948443184439327069412684068083763} +#define T_4096_2211 {-0.9689028047764288720244962860306259244680,0.2474416191677732967946212738752365112305} +#define T_4096_2215 {-0.9673662922223285054457164733321405947208,0.2533820369955701590214403040590696036816} +#define T_4096_2217 {-0.9665843744783331192849118451704271137714,0.2563486824899429139534845489833969622850} +#define T_4096_2223 {-0.9641840639517458289020623851683922111988,0.2652340302855117903924053734954213723540} +#define T_4096_2225 {-0.9633657997809540463052258019160944968462,0.2681908570634031763191273967095185071230} +#define T_4096_2229 {-0.9617020765291225403714747699268627911806,0.2740969098687063842945121905358973890543} +#define T_4096_2235 {-0.9591386224618418943066444626310840249062,0.2829365704570553363694784820836503058672} +#define T_4096_2241 {-0.9564939189023951016110913769807666540146,0.2917522632349892619529896364838350564241} +#define T_4096_2245 {-0.9546857549413383381420317164156585931778,0.2976157074350861964084913324768422171474} +#define T_4096_2247 {-0.9537681898859903251164382709248457103968,0.3005432414172734545410037299006944522262} +#define T_4096_2253 {-0.9509616663115750823109806333377491682768,0.3093087603122687267998003335378598421812} +#define T_4096_2255 {-0.9500082450018429991445145788020454347134,0.3122248139218249396442672605189727619290} +#define T_4096_2259 {-0.9480745859222762250695382135745603591204,0.3180480773850149489589966833591461181641} +#define T_4096_2265 {-0.9451071932852606050090571443433873355389,0.3267604523201317889835593177849659696221} +#define T_4096_2271 {-0.9420597397710173126483823580201715230942,0.3354451470845316585212003701599314808846} +#define T_4096_2275 {-0.9399837530340139357676321196777280420065,0.3412192023202824109340269842505222186446} +#define T_4096_2277 
{-0.9389324835320644879743667843285948038101,0.3441014259899388694208255401463247835636} +#define T_4096_2283 {-0.9357256894810803693474099418381229043007,0.3527285557552107264633889371907571330667} +#define T_4096_2285 {-0.9346391298196807806419883490889333188534,0.3555976617047839072327519716054666787386} +#define T_4096_2289 {-0.9324396292684623555047096488124225288630,0.3613258055684542835450656639295630156994} +#define T_4096_2295 {-0.9290745812593157459957637911429628729820,0.3698924471489341003760387138754595071077} +#define T_4096_2301 {-0.9256308305098727151616344599460717290640,0.3784277548087655596020795201184228062630} +#define T_4096_2305 {-0.9232914167195276355926125688711181282997,0.3841001950169350420694058811932336539030} +#define T_4096_2307 {-0.9221086687433450723716532593243755400181,0.3869310055143885818118576480628689751029} +#define T_4096_2313 {-0.9185083943252122518074997969961259514093,0.3954014789478163538483102001919178292155} +#define T_4096_2315 {-0.9172909970083779063187989777361508458853,0.3982175621533736165069683465844718739390} +#define T_4096_2319 {-0.9148303122379461971291902955272234976292,0.4038384575676541299316113509121350944042} +#define T_4096_2325 {-0.9110747340551763606697477371199056506157,0.4122412266698828875455262732430128380656} +#define T_4096_2331 {-0.9072419779152958163592757045989856123924,0.4206090744484025090166312565997941419482} +#define T_4096_2335 {-0.9046440905782462404971511205076240003109,0.4261678887267996707066686212783679366112} +#define T_4096_2337 {-0.9033323684945118170475097940652631223202,0.4289412920553294927827892024652101099491} +#define T_4096_2343 {-0.8993462369793415733809638368256855756044,0.4372371736610440873249672222300432622433} +#define T_4096_2345 {-0.8980005797407398793197330633120145648718,0.4399942713096332558286860603402601554990} +#define T_4096_2349 {-0.8952839210385574730821645061951130628586,0.4454960165139817407364830614824313670397} +#define T_4096_2355 
{-0.8911457647945831839209063218731898814440,0.4537171210001638699260695375414798036218} +#define T_4096_2361 {-0.8869321187943421946897615271154791116714,0.4618997907024627314065412519994424656034} +#define T_4096_2365 {-0.8840812587126349875177311332663521170616,0.4673332087419884151024973562016384676099} +#define T_4096_2367 {-0.8826433399795627909867334892624057829380,0.4700433324595956197100576900993473827839} +#define T_4096_2373 {-0.8782797916565414642064979489077813923359,0.4781470564248430643594645061966730281711} +#define T_4096_2375 {-0.8768087238091456514510468878143001347780,0.4808393306003339584542288775992346927524} +#define T_4096_2379 {-0.8738418434653668631639789055043365806341,0.4862102761244864179701608009054325520992} +#define T_4096_2385 {-0.8693298713486067308409133147506508976221,0.4942323085159597284565791142085799947381} +#define T_4096_2391 {-0.8647442575194623781698055609012953937054,0.5022124740457107883173648588126525282860} +#define T_4096_2395 {-0.8616464611430812992054484311665873974562,0.5075089910529708703279538895003497600555} +#define T_4096_2397 {-0.8600853904293901397437593914219178259373,0.5101500967067668090848542306048329919577} +#define T_4096_2403 {-0.8553536647351960287011252148658968508244,0.5180445040959993363571811642032116651535} +#define T_4096_2405 {-0.8537603011381114104239031803444959223270,0.5206662541403671573547740081266965717077} +#define T_4096_2409 {-0.8505494812656034797626602994569111615419,0.5258950274710846306547296080680098384619} +#define T_4096_2415 {-0.8456732469872990654025102230662014335394,0.5337010018071529637850858307501766830683} +#define T_4096_2421 {-0.8407253749704580725321534373506437987089,0.5414617658531234445362656515499111264944} +#define T_4096_2425 {-0.8373872016156619357829526961722876876593,0.5466101669108348604098068790335673838854} +#define T_4096_2427 {-0.8357062843537526042325680464273318648338,0.5491766621877197662726644011854659765959} +#define T_4096_2433 
{-0.8306164003088463143598119131638668477535,0.5568450372751601040732793990173377096653} +#define T_4096_2435 {-0.8289041147718648749886938276176806539297,0.5593907118591360250192678904568310827017} +#define T_4096_2439 {-0.8254561540043775513808554933348204940557,0.5644662415205194960776680090930312871933} +#define T_4096_2445 {-0.8202259825694346861979511231766082346439,0.5720396293247570485007713614322710782290} +#define T_4096_2451 {-0.8149263290565266215637052482634317129850,0.5795645591394057438705544882395770400763} +#define T_4096_2455 {-0.8113548470170637294529569771839305758476,0.5845539429530153263669944863067939877510} +#define T_4096_2457 {-0.8095576424040512586444151565956417471170,0.5870403935209179691057101990736555308104} +#define T_4096_2463 {-0.8041203773982656954899539414327591657639,0.5944664991846644319650749821448698639870} +#define T_4096_2465 {-0.8022927955381157216763199357956182211637,0.5969307080621965022615427187702152878046} +#define T_4096_2469 {-0.7986149946347608219454627942468505352736,0.6018422470585800265752141058328561484814} +#define T_4096_2475 {-0.7930419604794436416739245032658800482750,0.6091670123364532063447995824390091001987} +#define T_4096_2481 {-0.7874017470290314291148092706862371414900,0.6164401745308536462175652559380978345871} +#define T_4096_2485 {-0.7836045186096382009210969954438041895628,0.6212599765110876637308479075727518647909} +#define T_4096_2487 {-0.7816948320710593867133297862892504781485,0.6236611175256945305278577507124282419682} +#define T_4096_2493 {-0.7759216990434075755800336082756984978914,0.6308292296284245814774749305797740817070} +#define T_4096_2495 {-0.7739826906068227874158083068323321640491,0.6332067550500573016591943087405525147915} +#define T_4096_2499 {-0.7700828369933480077591525514435488730669,0.6379439036218440550740638173010665923357} +#define T_4096_2505 {-0.7641787405361166740647149708820506930351,0.6450045368155439273749607309582643210888} +#define T_4096_2511 
{-0.7582099098130152814434268293553031980991,0.6520105310969595002745791134657338261604} +#define T_4096_2515 {-0.7541949753168891712462595933175180107355,0.6566505457294290470926512170990463346243} +#define T_4096_2517 {-0.7521768504490426998643215483753010630608,0.6589612929820373166123204100586008280516} +#define T_4096_2523 {-0.7460800735100637792740485565445851534605,0.6658562336655097224635824204597156494856} +#define T_4096_2525 {-0.7440337441799292905741936010599602013826,0.6681420414265185625524168244737666100264} +#define T_4096_2529 {-0.7399200954595162027516153102624230086803,0.6726947690707729687886740066460333764553} +#define T_4096_2535 {-0.7336974381146602608438911374832969158888,0.6794763198993650776813524316821713000536} +#define T_4096_2541 {-0.7274126286023757659293664801225531846285,0.6862003116800385882356749789323657751083} +#define T_4096_2545 {-0.7231884893065274599877056971308775246143,0.6906507141345346045824271641322411596775} +#define T_4096_2547 {-0.7210661993145081050116118603909853845835,0.6928661748174246293174860511498991400003} +#define T_4096_2553 {-0.7146586878627690930798621593567077070475,0.6994733446402837673261387863021809607744} +#define T_4096_2555 {-0.7125093705646923236685097435838542878628,0.7016625947401684548765388171887025237083} +#define T_4096_2559 {-0.7081906370331952915719853081100154668093,0.7060212614493397431658650020835921168327} +#define T_4096_2565 {-0.7016625947401684548765388171887025237083,0.7125093705646923236685097435838542878628} +#define T_4096_2571 {-0.6950751139800008804314757071551866829395,0.7189371223728044935086245459388010203838} +#define T_4096_2577 {-0.6884287527840904363785057284985668957233,0.7253039723730607679641479990095831453800} +#define T_4096_2583 {-0.6817240741716498186875128340034279972315,0.7316093812238925186974825010111089795828} +#define T_4096_2589 {-0.6749616461020120361524732288671657443047,0.7378528147884659826871711629792116582394} +#define T_4096_2595 
{-0.6681420414265185625524168244737666100264,0.7440337441799292905741936010599602013826} +#define T_4096_2601 {-0.6612658378399922654011788836214691400528,0.7501516458062150727315042786358390003443} +#define T_4096_2607 {-0.6543336178318005513787625204713549464941,0.7562060014143945352316222852095961570740} +#define T_4096_2613 {-0.6473459686365120591133859306864906102419,0.7621962981345788978870814389665611088276} +#define T_4096_2619 {-0.6403034821841516732732202399347443133593,0.7681220285233654188061791501240804791451} +#define T_4096_2625 {-0.6332067550500573016591943087405525147915,0.7739826906068227874158083068323321640491} +#define T_4096_2631 {-0.6260563884043435223247797694057226181030,0.7797777879230144426614401709230151027441} +#define T_4096_2637 {-0.6188529879609763195702498705941252410412,0.7855068295640539322022277701762504875660} +#define T_4096_2643 {-0.6115971639264619064135786175029352307320,0.7911693302176902031774829993082676082850} +#define T_4096_2649 {-0.6042895309481560728315230335283558815718,0.7967648102084187167193363166006747633219} +#define T_4096_2655 {-0.5969307080621965022615427187702152878046,0.8022927955381157216763199357956182211637} +#define T_4096_2661 {-0.5895213186410639405465872187050990760326,0.8077528179261903584773563125054351985455} +#define T_4096_2667 {-0.5820619903407755479918250784976407885551,0.8131444148492534829131272999802604317665} +#define T_4096_2673 {-0.5745533550477157636038327837013639509678,0.8184671295802986579204230110917706042528} +#define T_4096_2679 {-0.5669960488251086783151322379126213490963,0.8237205112273914275888841984851751476526} +#define T_4096_2685 {-0.5593907118591360250192678904568310827017,0.8289041147718648749886938276176806539297} +#define T_4096_2691 {-0.5517379884047074467545712650462519377470,0.8340175011060181331501439672138076275587} +#define T_4096_2697 {-0.5440385267308839312150325895345304161310,0.8390602370703127421691647214174736291170} +#define T_4096_2703 
{-0.5362929790659631823501740655046887695789,0.8440318954900664083496053535782266408205} +#define T_4096_2709 {-0.5285020015422284833661592529097106307745,0.8489320552116396134678666385298129171133} +#define T_4096_2715 {-0.5206662541403671573547740081266965717077,0.8537603011381114104239031803444959223270} +#define T_4096_2721 {-0.5127864006335630664423774760507512837648,0.8585162242644427399440587578283157199621} +#define T_4096_2727 {-0.5048631085312675903509216368547640740871,0.8631994217121241597112657473189756274223} +#define T_4096_2733 {-0.4968970490226545244638600706821307539940,0.8678094967633032119636027346132323145866} +#define T_4096_2739 {-0.4888888969197631717555907471250975504518,0.8723460588943915405835127785394433885813} +#define T_4096_2745 {-0.4808393306003339584542288775992346927524,0.8768087238091456514510468878143001347780} +#define T_4096_2751 {-0.4727490319503427906866477314906660467386,0.8811971134712219821949474862776696681976} +#define T_4096_2757 {-0.4646186863062378158417686790926381945610,0.8855108561361999530703315031132660806179} +#define T_4096_2763 {-0.4564489823968839177226186620828229933977,0.8897495863830727769183681630238424986601} +#define T_4096_2769 {-0.4482406122852199414907659047457855194807,0.8939129451452032526503899134695529937744} +#define T_4096_2775 {-0.4399942713096332558286860603402601554990,0.8980005797407398793197330633120145648718} +#define T_4096_2781 {-0.4317106580250572589463331496517639607191,0.9020121439024931797590056703484151512384} +#define T_4096_2787 {-0.4233904741437960472794088673254009336233,0.9059472978072684590244989522034302353859} +#define T_4096_2793 {-0.4150344244760816314609996879880782216787,0.9098057081046522220901806576875969767570} +#define T_4096_2799 {-0.4066432168703690286370999729115283116698,0.9135870479452508075013383859186433255672} +#define T_4096_2805 {-0.3982175621533736165069683465844718739390,0.9172909970083779063187989777361508458853} +#define T_4096_2811 
{-0.3897581740698564667368941627501044422388,0.9209172415291894120414895041903946548700} +#define T_4096_2817 {-0.3812657692221623761952287168242037296295,0.9244654743252626039051733641827013343573} +#define T_4096_2823 {-0.3727410670095158140568969429295975714922,0.9279353948226178872005220910068601369858} +#define T_4096_2829 {-0.3641847895670798918033028712670784443617,0.9313267090811804260752637674158904701471} +#define T_4096_2835 {-0.3555976617047839072327519716054666787386,0.9346391298196807806419883490889333188534} +#define T_4096_2841 {-0.3469804108459236813288839584856759756804,0.9378723764399898854549064708407968282700} +#define T_4096_2847 {-0.3383337669655411827918101153045427054167,0.9410261750508892575339814356993883848190} +#define T_4096_2853 {-0.3296584625285874925459950190997915342450,0.9441002584912726591781506613187957555056} +#define T_4096_2859 {-0.3209552324278752144515181043971097096801,0.9470943663527772171661922584462445229292} +#define T_4096_2865 {-0.3122248139218249396442672605189727619290,0.9500082450018429991445145788020454347134} +#define T_4096_2871 {-0.3034679465720113156201875881379237398505,0.9528416476011987157335170195437967777252} +#define T_4096_2877 {-0.2946853721805143266898596721148351207376,0.9555943341307711058618679089704528450966} +#define T_4096_2883 {-0.2858778347270806152735644900531042367220,0.9582660714080176722617920859192963689566} +#define T_4096_2889 {-0.2770460803060998955515970010310411453247,0.9608566331076796585008992224175017327070} +#define T_4096_2895 {-0.2681908570634031763191273967095185071230,0.9633657997809540463052258019160944968462} +#define T_4096_2901 {-0.2593129151328862347369863528001587837934,0.9657933588740836849950710529810748994350} +#define T_4096_2907 {-0.2504130065729652798722781881224364042282,0.9681391047463624444091578880033921450377} +#define T_4096_2913 {-0.2414918853028693301876472787625971250236,0.9704028386875555023394213094434235244989} +#define T_4096_2919 
{-0.2325503070387752446723794719218858517706,0.9725843689347322129634676457499153912067} +#define T_4096_2925 {-0.2235890292297899872853150782248121686280,0.9746835106885106680962849168281536549330} +#define T_4096_2931 {-0.2146088109937867582921455777977826073766,0.9767000861287118418374575412599369883537} +#define T_4096_2937 {-0.2056104130530992668557388469707802869380,0.9786339244294232075915829227596987038851} +#define T_4096_2943 {-0.1965945976700802233505527283341507427394,0.9804848617734693849712357405223883688450} +#define T_4096_2949 {-0.1875621285825296025162600699331960640848,0.9822527413662893724932700934004969894886} +#define T_4096_2955 {-0.1785137709389975346763179686604416929185,0.9839374134492189227785274852067232131958} +#define T_4096_2961 {-0.1694502912339679590036922718354617245495,0.9855387353121760618535063258605077862740} +#define T_4096_2967 {-0.1603724572429282568819530752080027014017,0.9870565713057509737993200360506307333708} +#define T_4096_2973 {-0.1512810379573302221878350337647134438157,0.9884907928526965870119624923972878605127} +#define T_4096_2979 {-0.1421768035194480583882636892667505890131,0.9898412784588205282076955882075708359480} +#define T_4096_2985 {-0.1330605251571390645892734028166159987450,0.9911079137232768898613244346051942557096} +#define T_4096_2991 {-0.1239329751185121730738813994321390055120,0.9922905913482573669881503519718535244465} +#define T_4096_2997 {-0.1147949266065100837330703598127001896501,0.9933892111480806530465770265436731278896} +#define T_4096_3003 {-0.1056471537134106158939417241526825819165,0.9944036800576790957606476695218589156866} +#define T_4096_3009 {-0.0964904313552525927377701009390875697136,0.9953339121404822797956057911505922675133} +#define T_4096_3015 {-0.0873255352061920731010502549906959757209,0.9961798285956969811749672771838959306479} +#define T_4096_3021 {-0.0781532416327942458522670676757115870714,0.9969413577649821611714742175536230206490} +#define T_4096_3027 
{-0.0689743276282667461263287123074405826628,0.9976184351385195547834427998168393969536} +#define T_4096_3033 {-0.0597895707466398751428471314284252002835,0.9982110033604781884619683296477887779474} +#define T_4096_3039 {-0.0505997490368992816622828456729621393606,0.9987190122338729381112898408900946378708} +#define T_4096_3045 {-0.0414056409770767394618040668774483492598,0.9991424187248169053177093701378908008337} +#define T_4096_3051 {-0.0322080254083045858237710490357130765915,0.9994811869661669456732511207519564777613} +#define T_4096_3057 {-0.0230076814688393721519688028820382896811,0.9997352882605616830602457412169314920902} +#define T_4096_3063 {-0.0138053885280603905877372383770307351369,0.9999047010828528980752594179648440331221} +#define T_4096_3069 {-0.0046019261204485704952471536444136290811,0.9999894110819284032132259198988322168589} +// Pre-computed twiddles for N=5000 +#define T_5000_1 {0.9999992104317517727096742419234942644835,-0.0012566367307023254638181919418116194720} +#define T_5000_3 {0.9999928938932473032608072571747470647097,-0.0037699022545064136621717487685145897558} +#define T_5000_7 {0.9999613114002182667761076118040364235640,-0.0087963459888593647367782679680203727912} +#define T_5000_9 {0.9999360456451857892545831418829038739204,-0.0113094924496577719369438952412565413397} +#define T_5000_11 {0.9999044637506320221120859059737995266914,-0.0138225674735552497146562345164966245648} +#define T_5000_13 {0.9998665659160452801046403692453168332577,-0.0163355551866041122377470884430294972844} +#define T_5000_17 {0.9997718234241997903311016671068500727415,-0.0213612051872230264415453149240420316346} +#define T_5000_19 {0.9997149793653861138409411069005727767944,-0.0238738357300562718810166273897266364656} +#define T_5000_21 {0.9996518205634257192571112682344391942024,-0.0263863154727678149435998733451924636029} +#define T_5000_23 {0.9995823474172642608692740395781584084034,-0.0288986285451700818105447154948706156574} +#define T_5000_27 
{0.9994244598875400287241177466057706624269,-0.0339226912036606764733193131178268231452} +#define T_5000_29 {0.9993360465012808280249601011746563017368,-0.0364344090550387106342000720360374543816} +#define T_5000_31 {0.9992413207654209372776676900684833526611,-0.0389458967668873529133222177733841817826} +#define T_5000_33 {0.9991402832782997345262288035883102566004,-0.0414571384752852392652222590641031274572} +#define T_5000_37 {0.9989192756429681185892377470736391842365,-0.0464788204339129581987855033275991445407} +#define T_5000_39 {0.9987993068907620175522765748610254377127,-0.0489892289644702980111468093582516303286} +#define T_5000_41 {0.9986730291792944580464563841815106570721,-0.0514993280524322499802636343702033627778} +#define T_5000_43 {0.9985404433062039508683938038302585482597,-0.0540091018426487601278296324380789883435} +#define T_5000_47 {0.9982563504649325514250790547521319240332,-0.0590276101196191491093934189393621636555} +#define T_5000_49 {0.9981048452912363311284593692107591778040,-0.0615363129067472286459761221522057894617} +#define T_5000_51 {0.9979470355448755158889184713189024478197,-0.0640446269970785103664567827763676177710} +#define T_5000_53 {0.9977829222226621830316162231611087918282,-0.0665525365467379420580940063700836617500} +#define T_5000_57 {0.9974357890370038060012802816345356404781,-0.0715670786614176768214434787296340800822} +#define T_5000_59 {0.9972527713662407977679436044127214699984,-0.0740736795518646129465878402697853744030} +#define T_5000_61 {0.9970634545049750219192219446995295584202,-0.0765798125526930689721538669800793286413} +#define T_5000_63 {0.9968678396490348037062290131871122866869,-0.0790854618338049236703213296095782425255} +#define T_5000_67 {0.9964577209353436826333449971571099013090,-0.0840952459318636824647796856879722326994} +#define T_5000_69 {0.9962432196681255680559274878760334104300,-0.0865993491042914287358911451519816182554} +#define T_5000_71 
{0.9960224255872819343338164799206424504519,-0.0891029052681641953181923554438981227577} +#define T_5000_73 {0.9957953400874682836629858684318605810404,-0.0916058986096605909477119666917133145034} +#define T_5000_77 {0.9953223006082435597718927056121174246073,-0.0966101335881140305117398270340345334262} +#define T_5000_79 {0.9950763496168069099923059184220619499683,-0.0991113436156031701873558859006152488291} +#define T_5000_81 {0.9948241131823303273762348908348940312862,-0.1016119276019798456367482231144094839692} +#define T_5000_83 {0.9945655928980761117941256088670343160629,-0.1041118697521965369423568859019724186510} +#define T_5000_87 {0.9940297073517352366422983322991058230400,-0.1091097653843306064525009446697367820889} +#define T_5000_89 {0.9937523454745929019082950617303140461445,-0.1116076872968228689275704823558044154197} +#define T_5000_91 {0.9934687065175402675976101818378083407879,-0.1141049042345042979773594993275764863938} +#define T_5000_93 {0.9931787922721951877846890965884085744619,-0.1166014004235955137112057400372577831149} +#define T_5000_97 {0.9925801452812780256707014814310241490602,-0.1215921674837524696366486409715435001999} +#define T_5000_99 {0.9922714163170857082363340850861277431250,-0.1240864068304210476867410761769860982895} +#define T_5000_101 {0.9919564196273364942157968471292406320572,-0.1265798623799041233706219600207987241447} +#define T_5000_103 {0.9916351572017207960385576370754279196262,-0.1290725183821812838136366963226464577019} +#define T_5000_107 {0.9909738432995269796776938164839521050453,-0.1340553687703875396231012473435839638114} +#define T_5000_109 {0.9906337960001668774268068773380946367979,-0.1365455316819254305560349393999786116183} +#define T_5000_111 {0.9902874913193500105990096926689147949219,-0.1390348320976738383247806041254079900682} +#define T_5000_113 {0.9899349314445247127736138281761668622494,-0.1415232542938584248126687725743977352977} +#define T_5000_117 
{0.9892110550601855845087584384600631892681,-0.1464974011602746262195040571896242909133} +#define T_5000_119 {0.9888397431230684375691453169565647840500,-0.1489830944110912269717772460353444330394} +#define T_5000_121 {0.9884621851367073608329860689991619437933,-0.1514678466037124304754968306951923295856} +#define T_5000_123 {0.9880783834859637737579873828508425503969,-0.1539516420430929455331892086178413592279} +#define T_5000_127 {0.9872920589279519099079607258317992091179,-0.1589162999122667452311929991992656141520} +#define T_5000_129 {0.9868895409875367841223692266794387251139,-0.1613971309825826816464200419432017952204} +#define T_5000_131 {0.9864807893164143637321217283897567540407,-0.1638769425809012969530442660470725968480} +#define T_5000_133 {0.9860658064964824554010647261748090386391,-0.1663557190433848087529611348145408555865} +#define T_5000_137 {0.9852171579345608831701497365429531782866,-0.1713101039382865231619490486991708166897} +#define T_5000_139 {0.9847834975530967982493280032940674573183,-0.1737856810761170822665633295400766655803} +#define T_5000_141 {0.9843436167438402994989132821501698344946,-0.1762601604891348705539400043562636710703} +#define T_5000_143 {0.9838975182853179068942495177907403558493,-0.1787335265471830525463303729338804259896} +#define T_5000_147 {0.9829866797309323445119844109285622835159,-0.1836768561130047439089452154803439043462} +#define T_5000_149 {0.9825219453884206366467424231814220547676,-0.1861467883960220559558251807175111025572} +#define T_5000_151 {0.9820510049033092103698550090484786778688,-0.1886155448747545981813544813121552579105} +#define T_5000_153 {0.9815738612503140947396218507492449134588,-0.1910831099551947298564869015535805374384} +#define T_5000_157 {0.9806009765354312124330249389458913356066,-0.1960146035828930488609245230691158212721} +#define T_5000_159 {0.9801052416188119620343854876409750431776,-0.1984785009801572630561139476412790827453} +#define T_5000_161 
{0.9796033158248074190765919411205686628819,-0.2009411446793378519526385161952930502594} +#define T_5000_163 {0.9790952023238538304283906654745806008577,-0.2034025191250387887276218634724500589073} +#define T_5000_167 {0.9780604250782479747172715178749058395624,-0.2083213980746022320200694366576499305665} +#define T_5000_169 {0.9775337678698112764408278962946496903896,-0.2107788715081519159699752208325662650168} +#define T_5000_171 {0.9770009360268150366835016029654070734978,-0.2132350135477925900939766279407194815576} +#define T_5000_173 {0.9764619329149148985180772797320969402790,-0.2156898086791962099173503020210773684084} +#define T_5000_177 {0.9753654265419087199262548892875202000141,-0.2205952962026171693121767702905344776809} +#define T_5000_179 {0.9748079302069329044044820875569712370634,-0.2230459576089092454953544120144215412438} +#define T_5000_181 {0.9742442764552697864743890932004433125257,-0.2254952101357094174094441996203386224806} +#define T_5000_183 {0.9736744688472628306286082988663110882044,-0.2279430383122075798318917350115953013301} +#define T_5000_187 {0.9725164064979238132124805815692525357008,-0.2328343597761399486056888008533860556781} +#define T_5000_189 {0.9719281590715429519633516974863596260548,-0.2352778221673293701510232267537503503263} +#define T_5000_191 {0.9713337724186744681631466846738476306200,-0.2377197984159224453115655251167481765151} +#define T_5000_193 {0.9707332502937873197623730447958223521709,-0.2401602730970699428603154501615790650249} +#define T_5000_197 {0.9695138148395850974381460218864958733320,-0.2450366561051525171599507757491664960980} +#define T_5000_199 {0.9688949092128869011730785132385790348053,-0.2474725336302022182799476013315143063664} +#define T_5000_201 {0.9682698835193558339895503195293713361025,-0.2499068479842302403781673092453274875879} +#define T_5000_203 {0.9676387417069935992230966803617775440216,-0.2523395837907840832947670151042984798551} +#define T_5000_207 
{0.9663581257109223887624693816178478300571,-0.2572002583056110514014847012731479480863} +#define T_5000_209 {0.9657086596162800518072799604851752519608,-0.2596281663112222637401771407894557341933} +#define T_5000_211 {0.9650530935808888655103032760962378233671,-0.2620544343642292606233468177379108965397} +#define T_5000_213 {0.9643914317456601104083802056265994906425,-0.2644790471390043862243146577384322881699} +#define T_5000_217 {0.9630498374318312571062961069401353597641,-0.2693232456037231714596202891698339954019} +#define T_5000_219 {0.9623699134274699140689790510805323719978,-0.2717428006950765584370799388125305995345} +#define T_5000_221 {0.9616839105716952218472215463407337665558,-0.2741606393112107631360174764267867431045} +#define T_5000_223 {0.9609918331976738947730609652353450655937,-0.2765767461797430382119955538655631244183} +#define T_5000_227 {0.9595894724193817504342973734310362488031,-0.2814037036392603163825754108984256163239} +#define T_5000_229 {0.9588791978731835419225149053090717643499,-0.2838145237405585996626200540049467235804} +#define T_5000_231 {0.9581628665248286269573441131797153502703,-0.2862235511150740818742121973627945408225} +#define T_5000_233 {0.9574404828990550342382448434364050626755,-0.2886307705460806061203982153529068455100} +#define T_5000_237 {0.9559775771053230508655929043015930801630,-0.2934397247678576237106540247623343020678} +#define T_5000_239 {0.9552370641778726856330195005284622311592,-0.2958414291826597097845308326213853433728} +#define T_5000_241 {0.9544905174539615178019857921754010021687,-0.2982412649022077033578170812688767910004} +#define T_5000_243 {0.9537379416491842398784228862496092915535,-0.3006392167678348381620878626563353464007} +#define T_5000_247 {0.9522147218497959419991616414336021989584,-0.3054294083622529076826879190775798633695} +#define T_5000_249 {0.9514440874766688649444290604151319712400,-0.3078216178335902442064764272799948230386} +#define T_5000_251 
{0.9506674432655832784888616515672765672207,-0.3102118829362908170033108490315498784184} +#define T_5000_253 {0.9498847941222460233845481525349896401167,-0.3126001885721411133189917563868220895529} +#define T_5000_257 {0.9483015008512654109296136084594763815403,-0.3173708611124177458862050116294994950294} +#define T_5000_259 {0.9475008667245626847730477493314538151026,-0.3197531978826834153650793268752750009298} +#define T_5000_261 {0.9466942476674268247549548505048733204603,-0.3221335149179680445996609705616720020771} +#define T_5000_263 {0.9458816487749021062469978460285346955061,-0.3245117971828955227842072872590506449342} +#define T_5000_267 {0.9442385320526907044325071183266118168831,-0.3292621973245331568058702487178379669785} +#define T_5000_269 {0.9434080246018221593828911863965913653374,-0.3316342851951342707295111722487490624189} +#define T_5000_271 {0.9425715580731357512433987722033634781837,-0.3340042782833495538241663780354429036379} +#define T_5000_273 {0.9417291377502087224016236177703831344843,-0.3363721616190146201930133429414127022028} +#define T_5000_277 {0.9400264570439446076832723520055878907442,-0.3411015392187625727693500721215968951583} +#define T_5000_279 {0.9391662074156642558975249812647234648466,-0.3434630036095261962358904384018387645483} +#define T_5000_281 {0.9383000255031889169643477544013876467943,-0.3458222985012897043866075819096295163035} +#define T_5000_283 {0.9374279167777940235950495662109460681677,-0.3481794089914642675864797638496384024620} +#define T_5000_287 {0.9356659409604988208997156107216142117977,-0.3528870172257748683009026535728480666876} +#define T_5000_289 {0.9347760849981954223153479688335210084915,-0.3552374852340987598786625767388613894582} +#define T_5000_291 {0.9338803244820960136607368440309073776007,-0.3575857093693972088566113143315305933356} +#define T_5000_293 {0.9329786650703109085469577621552161872387,-0.3599316747990104481402795499889180064201} +#define T_5000_297 
{0.9311576723783917541865662315103691071272,-0.3646167702819711475825670277117751538754} +#define T_5000_299 {0.9302383506006371982621772076527122408152,-0.3669558707417092668912061981245642527938} +#define T_5000_301 {0.9293131529318833239372565913072321563959,-0.3692926533087302898294979058846365660429} +#define T_5000_303 {0.9283820852161814851655208258307538926601,-0.3716271032226454718028207935276441276073} +#define T_5000_307 {0.9265023632054938396152010682271793484688,-0.3762889461233681620555557856278028339148} +#define T_5000_309 {0.9255537207838540059157139694434590637684,-0.3786163096634422853270507403067313134670} +#define T_5000_311 {0.9245992320618827697842334600863978266716,-0.3809412816571296489520648265170166268945} +#define T_5000_313 {0.9236389030686498191613509334274567663670,-0.3832638474186436106094788556220009922981} +#define T_5000_317 {0.9217007485690897894414774782489985227585,-0.3879017015780925059509343100216938182712} +#define T_5000_319 {0.9207229353051999964918650221079587936401,-0.3902169606808199020520078192930668592453} +#define T_5000_321 {0.9197393062548454567561861949798185378313,-0.3925297549611438951977504530077567324042} +#define T_5000_323 {0.9187498676311617318290814182546455413103,-0.3948400698101989236832309870806057006121} +#define T_5000_327 {0.9167535866997925664279023294511716812849,-0.3994532028574384607821912140934728085995} +#define T_5000_329 {0.9157467570017023827944058211869560182095,-0.4017559919165675230701140208111610263586} +#define T_5000_331 {0.9147341429493940356820758097455836832523,-0.4040562432665005254683876501076156273484} +#define T_5000_333 {0.9137157509390884113642528063792269676924,-0.4063539423775998637999862239666981622577} +#define T_5000_337 {0.9116616588118107156191172180115245282650,-0.4109416258454448023229588216054253280163} +#define T_5000_339 {0.9106259716696010197978239375515840947628,-0.4132315812238883112605947189877042546868} +#define T_5000_341 
{0.9095845325188364194346490876341704279184,-0.4155189264070768273562350714200874790549} +#define T_5000_343 {0.9085373479378128225292243769217748194933,-0.4178036469468950353522984642040682956576} +#define T_5000_347 {0.9064257689795853778491618868429213762283,-0.4223651563869436631470932752563385292888} +#define T_5000_349 {0.9053613879402618858094342613185290247202,-0.4246419164742013530045028346648905426264} +#define T_5000_351 {0.9042912881463557317474055707862135022879,-0.4269159942923252404334277798625407740474} +#define T_5000_353 {0.9032154763571986766379495747969485819340,-0.4291873754770037519001846249011578038335} +#define T_5000_357 {0.9010467440108171910395640225033275783062,-0.4337219905740368752766755733318859711289} +#define T_5000_359 {0.8999538371524847812921166223532054573298,-0.4359851958432979235347204394201980903745} +#define T_5000_361 {0.8988552456965994696957977794227190315723,-0.4382456471931076569248375562892761081457} +#define T_5000_363 {0.8977509765824618348517560662003234028816,-0.4405033303452267623434579491004114970565} +#define T_5000_367 {0.8955254333159045065926306961046066135168,-0.4450103350309535521489578968612477183342} +#define T_5000_369 {0.8944041732212255624290264677256345748901,-0.4472596280958700543095574175822548568249} +#define T_5000_371 {0.8932772635836871977232931385515257716179,-0.4495060960258935867095431149209616705775} +#define T_5000_373 {0.8921447115214631295998515270184725522995,-0.4517497246311116820827180617925478145480} +#define T_5000_377 {0.8898627087738110175152428382716607302427,-0.4562284071972453891596899211435811594129} +#define T_5000_379 {0.8887132725027525603778144613897893577814,-0.4584634328683677706450794175907503813505} +#define T_5000_381 {0.8875582226356564241598334774607792496681,-0.4606955626352770560849592129670782014728} +#define T_5000_383 {0.8863975664684456967634673674183432012796,-0.4629247823986284426922566126449964940548} +#define T_5000_387 
{0.8840594645943864460235772639862261712551,-0.4673744356092733309715470113587798550725} +#define T_5000_389 {0.8828820336562601678309647468267939984798,-0.4695948409501388698394919174461392685771} +#define T_5000_391 {0.8816990259553703079831166178337298333645,-0.4718122800747679979060933419532375410199} +#define T_5000_393 {0.8805104489642362475265713328553829342127,-0.4740267389766100736281373428937513381243} +#define T_5000_397 {0.8781166171771599415762921125860884785652,-0.4784466601799422513785486898996168747544} +#define T_5000_399 {0.8769113775019603540528123630792833864689,-0.4806520945628078300693175606284057721496} +#define T_5000_401 {0.8757005987779062916587236031773500144482,-0.4828544928858136686677937632339308038354} +#define T_5000_403 {0.8744842886529343717327833473973441869020,-0.4850538412374150176731291139731183648109} +#define T_5000_407 {0.8720351049666303921981125313322991132736,-0.4894433324766390702720286753901746124029} +#define T_5000_409 {0.8708022468756737577422200047294609248638,-0.4916334476378494966475329874810995534062} +#define T_5000_411 {0.8695638883244519723447751857747789472342,-0.4938204573750058501424575752025702968240} +#define T_5000_413 {0.8683200371351107449413575523067265748978,-0.4960043478737660715971458103012992069125} +#define T_5000_417 {0.8658158883040750763981918680656235665083,-0.5003627159973307314544399559963494539261} +#define T_5000_419 {0.8645556064799456708058755793899763375521,-0.5025371660923133587672850808303337544203} +#define T_5000_421 {0.8632898636527268632079312737914733588696,-0.5047084418894302038083310435467865318060} +#define T_5000_423 {0.8620186678175382199995624432631302624941,-0.5068765296737230974244425851793494075537} +#define T_5000_427 {0.8594599492759001924113704262708779424429,-0.5112030864447778544246148157981224358082} +#define T_5000_429 {0.8581724427317077852350735156505834311247,-0.5133615281026556731447385573119390755892} +#define T_5000_431 
{0.8568795155039573563371391173859592527151,-0.5155167270901142018146856571547687053680} +#define T_5000_433 {0.8555811757594800059933959346381016075611,-0.5176686697937453729423395998310297727585} +#define T_5000_437 {0.8529682915585590219365030861808918416500,-0.5219627319988210389922755894076544791460} +#define T_5000_439 {0.8516537636065120153716634376905858516693,-0.5241048243766349701289186668873298913240} +#define T_5000_441 {0.8503338561464273137602276619873009622097,-0.5262436062235313904977829224662855267525} +#define T_5000_443 {0.8490085775155576941486401665315497666597,-0.5283790640298017260434448871819768100977} +#define T_5000_447 {0.8463419402600584895424162823474034667015,-0.5326399535866978007803140826581511646509} +#define T_5000_449 {0.8450005984793591551351710222661495208740,-0.5347653584232292534039743259199894964695} +#define T_5000_451 {0.8436539192156289335144947472144849598408,-0.5368873853911163562813158023345749825239} +#define T_5000_453 {0.8423019109752259980794519833580125123262,-0.5390060210864833223354253277648240327835} +#define T_5000_457 {0.8395819417580838717185542918741703033447,-0.5432330651513449515377374154923018068075} +#define T_5000_459 {0.8382139979621486824967746542824897915125,-0.5453414468205318721416574589966330677271} +#define T_5000_461 {0.8368407595510399810123658426164183765650,-0.5474463838167520357558260002406314015388} +#define T_5000_463 {0.8354622351988777451126111373014282435179,-0.5495478628440793045228929258882999420166} +#define T_5000_467 {0.8326893635347630828746900988335255533457,-0.5537403939176471778793597877665888518095} +#define T_5000_469 {0.8312950337377751353074017970357090234756,-0.5558314194815828779283606309036258608103} +#define T_5000_471 {0.8298954530295520326532709987077396363020,-0.5579189341121831846592726833478081971407} +#define T_5000_473 {0.8284906302506062081292270704580005258322,-0.5600029246235713742407824611291289329529} +#define T_5000_477 
{0.8256652940080988489768287763581611216068,-0.5641602806565876315758600867411587387323} +#define T_5000_479 {0.8242447983908965936095114557247143238783,-0.5662336199180955098597678443184122443199} +#define T_5000_481 {0.8228190963955773762350531796982977539301,-0.5683033825403166483525296825973782688379} +#define T_5000_483 {0.8213881970276499799865632667206227779388,-0.5703695554495055164423433780029881745577} +#define T_5000_487 {0.8185108423600949700826845401024911552668,-0.5744910799472588980307818928849883377552} +#define T_5000_489 {0.8170644052354033570750857506936881691217,-0.5765464055020347311497630471421871334314} +#define T_5000_491 {0.8156128070878603253390792815480381250381,-0.5785980892763651350918507887399755418301} +#define T_5000_493 {0.8141560570865490342740145024436060339212,-0.5806461183107004542236495581164490431547} +#define T_5000_497 {0.8112271383616037612185323268931824713945,-0.5847311604366945969601943033921997994184} +#define T_5000_499 {0.8097549881386122416770945164898876100779,-0.5867681477250071830908950687444303184748} +#define T_5000_501 {0.8082777230630205922423670017451513558626,-0.5888014286667950791809289512457326054573} +#define T_5000_503 {0.8067953524660377295774082995194476097822,-0.5908309904187508676542961438826750963926} +#define T_5000_507 {0.8038153321939218720615372149040922522545,-0.5948789050974787606662630423670634627342} +#define T_5000_509 {0.8023177013422165826384002684790175408125,-0.5968972324554218111813952418742701411247} +#define T_5000_511 {0.8008150026158564749323431897209957242012,-0.5989117894860358504161013115663081407547} +#define T_5000_513 {0.7993072455067035120279683724220376461744,-0.6009225634642839652954648954619187861681} +#define T_5000_517 {0.7962765942671635732708068644569721072912,-0.6049327114830929108535428895265795290470} +#define T_5000_519 {0.7947537192800170346984600655559916049242,-0.6069320601933794190330218043527565896511} +#define T_5000_521 
{0.7932258241964373901566887070657685399055,-0.6089275751909111189519308027229271829128} +#define T_5000_523 {0.7916929186674402929568827858020085841417,-0.6109192438709307726441011254792101681232} +#define T_5000_527 {0.7886121150354394870873875333927571773529,-0.6148909919809614210706172343634534627199} +#define T_5000_529 {0.7870642363924658102547482485533691942692,-0.6168710463232528473653815126453991979361} +#define T_5000_531 {0.7855113862240121758162558762705884873867,-0.6188472041727511863129507219127845019102} +#define T_5000_533 {0.7839535743387240351864875265164300799370,-0.6208194530469697269836615305393934249878} +#define T_5000_537 {0.7808231048088696280018439210834912955761,-0.6247521740631536424359637749148532748222} +#define T_5000_539 {0.7792504669380501125530713579792063683271,-0.6267126213639157006340951738820876926184} +#define T_5000_541 {0.7776729068977652481464701850200071930885,-0.6286691100071480908795251707488205283880} +#define T_5000_543 {0.7760904346527415231804525319603271782398,-0.6306216276346051641255030517640989273787} +#define T_5000_547 {0.7729107935624618397341123454680200666189,-0.6345147005347043789313943307206500321627} +#define T_5000_549 {0.7713136448015469603944893606239929795265,-0.6364552312165822067768772285489831119776} +#define T_5000_551 {0.7697116240044489243743441875267308205366,-0.6383917417013114103951920696999877691269} +#define T_5000_553 {0.7681047412904017379631227413483429700136,-0.6403242197568393967443967085273470729589} +#define T_5000_557 {0.7648764307418843833019650446658488363028,-0.6441770297795128552564847268513403832912} +#define T_5000_559 {0.7632550232991781591351809765910729765892,-0.6460973374102163546695010154508054256439} +#define T_5000_561 {0.7616287947229219978950709446507971733809,-0.6480135639389881241356761165661737322807} +#define T_5000_563 {0.7599977552852593687404691991105210036039,-0.6499256972619000816138168374891392886639} +#define T_5000_567 
{0.7567212850661638734450775700679514557123,-0.6537376360037822076165525686519686132669} +#define T_5000_569 {0.7550758749806975034246647737745661288500,-0.6556374173444755593109789515438023954630} +#define T_5000_571 {0.7534256954256273175474234449211508035660,-0.6575330573229074548891048834775574505329} +#define T_5000_573 {0.7517707568243841009802963526453822851181,-0.6594245439651853502027734066359698772430} +#define T_5000_577 {0.7484466443273399827162961628346238285303,-0.6631950094769594183929939390509389340878} +#define T_5000_579 {0.7467774914284401877040409090113826096058,-0.6650739645301460001292070955969393253326} +#define T_5000_581 {0.7451036214770371257642977980140130966902,-0.6669487186147103896516341592359822243452} +#define T_5000_583 {0.7434250450462031034604137857968453317881,-0.6688192598886868456276033612084574997425} +#define T_5000_587 {0.7400538151871081105070970806991681456566,-0.6725476567701397234699811633618082851171} +#define T_5000_589 {0.7383611830533670650211774955096188932657,-0.6744054888270351044710082533129025250673} +#define T_5000_591 {0.7366638870291005725121635805408004671335,-0.6762590609723292400801142321142833679914} +#define T_5000_593 {0.7349619378353529075553751681582070887089,-0.6781083614978530293981862087093759328127} +#define T_5000_597 {0.7315441229704821024526495421014260500669,-0.6817941009918963013447523735521826893091} +#define T_5000_599 {0.7298282788881353955190434135147370398045,-0.6836305166792820964261068183986935764551} +#define T_5000_601 {0.7281078248137233810410862133721821010113,-0.6854626141847768305837007574154995381832} +#define T_5000_603 {0.7263827716145693225158197492419276386499,-0.6872903819358571197994933754671365022659} +#define T_5000_607 {0.7229189114565106599741284298943355679512,-0.6909328820214982735947728542669210582972} +#define T_5000_609 {0.7211801263772293912879263189097400754690,-0.6927475913480489744245005567790940403938} +#define T_5000_611 
{0.7194367859323145264482946004136465489864,-0.6945579249043100089622271298139821738005} +#define T_5000_613 {0.7176889011336515800110191776184365153313,-0.6963638712552308041026094542758073657751} +#define T_5000_617 {0.7141795426660783041583613339753355830908,-0.6999625567394810454757703155337367206812} +#define T_5000_619 {0.7124180911641833091607622918672859668732,-0.7017552731415571676265585665532853454351} +#define T_5000_621 {0.7106521396424310132999835332157090306282,-0.7035435568759298430308035676716826856136} +#define T_5000_623 {0.7088816992555312390678068368288222700357,-0.7053273966468270872454127129458356648684} +#define T_5000_627 {0.7053273966468270872454127129458356648684,-0.7088816992555312390678068368288222700357} +#define T_5000_633 {0.6999625567394810454757703155337367206812,-0.7141795426660783041583613339753355830908} +#define T_5000_639 {0.6945579249043100089622271298139821738005,-0.7194367859323145264482946004136465489864} +#define T_5000_651 {0.6836305166792820964261068183986935764551,-0.7298282788881353955190434135147370398045} +#define T_5000_657 {0.6781083614978530293981862087093759328127,-0.7349619378353529075553751681582070887089} +#define T_5000_663 {0.6725476567701397234699811633618082851171,-0.7400538151871081105070970806991681456566} +#define T_5000_669 {0.6669487186147103896516341592359822243452,-0.7451036214770371257642977980140130966902} +#define T_5000_681 {0.6556374173444755593109789515438023954630,-0.7550758749806975034246647737745661288500} +#define T_5000_687 {0.6499256972619000816138168374891392886639,-0.7599977552852593687404691991105210036039} +#define T_5000_693 {0.6441770297795128552564847268513403832912,-0.7648764307418843833019650446658488363028} +#define T_5000_699 {0.6383917417013114103951920696999877691269,-0.7697116240044489243743441875267308205366} +#define T_5000_711 {0.6267126213639157006340951738820876926184,-0.7792504669380501125530713579792063683271} +#define T_5000_717 
{0.6208194530469697269836615305393934249878,-0.7839535743387240351864875265164300799370} +#define T_5000_723 {0.6148909919809614210706172343634534627199,-0.7886121150354394870873875333927571773529} +#define T_5000_729 {0.6089275751909111189519308027229271829128,-0.7932258241964373901566887070657685399055} +#define T_5000_741 {0.5968972324554218111813952418742701411247,-0.8023177013422165826384002684790175408125} +#define T_5000_747 {0.5908309904187508676542961438826750963926,-0.8067953524660377295774082995194476097822} +#define T_5000_753 {0.5847311604366945969601943033921997994184,-0.8112271383616037612185323268931824713945} +#define T_5000_759 {0.5785980892763651350918507887399755418301,-0.8156128070878603253390792815480381250381} +#define T_5000_771 {0.5662336199180955098597678443184122443199,-0.8242447983908965936095114557247143238783} +#define T_5000_777 {0.5600029246235713742407824611291289329529,-0.8284906302506062081292270704580005258322} +#define T_5000_783 {0.5537403939176471778793597877665888518095,-0.8326893635347630828746900988335255533457} +#define T_5000_789 {0.5474463838167520357558260002406314015388,-0.8368407595510399810123658426164183765650} +#define T_5000_801 {0.5347653584232292534039743259199894964695,-0.8450005984793591551351710222661495208740} +#define T_5000_807 {0.5283790640298017260434448871819768100977,-0.8490085775155576941486401665315497666597} +#define T_5000_813 {0.5219627319988210389922755894076544791460,-0.8529682915585590219365030861808918416500} +#define T_5000_819 {0.5155167270901142018146856571547687053680,-0.8568795155039573563371391173859592527151} +#define T_5000_831 {0.5025371660923133587672850808303337544203,-0.8645556064799456708058755793899763375521} +#define T_5000_837 {0.4960043478737660715971458103012992069125,-0.8683200371351107449413575523067265748978} +#define T_5000_843 {0.4894433324766390702720286753901746124029,-0.8720351049666303921981125313322991132736} +#define T_5000_849 
{0.4828544928858136686677937632339308038354,-0.8757005987779062916587236031773500144482} +#define T_5000_861 {0.4695948409501388698394919174461392685771,-0.8828820336562601678309647468267939984798} +#define T_5000_867 {0.4629247823986284426922566126449964940548,-0.8863975664684456967634673674183432012796} +#define T_5000_873 {0.4562284071972453891596899211435811594129,-0.8898627087738110175152428382716607302427} +#define T_5000_879 {0.4495060960258935867095431149209616705775,-0.8932772635836871977232931385515257716179} +#define T_5000_891 {0.4359851958432979235347204394201980903745,-0.8999538371524847812921166223532054573298} +#define T_5000_897 {0.4291873754770037519001846249011578038335,-0.9032154763571986766379495747969485819340} +#define T_5000_903 {0.4223651563869436631470932752563385292888,-0.9064257689795853778491618868429213762283} +#define T_5000_909 {0.4155189264070768273562350714200874790549,-0.9095845325188364194346490876341704279184} +#define T_5000_921 {0.4017559919165675230701140208111610263586,-0.9157467570017023827944058211869560182095} +#define T_5000_927 {0.3948400698101989236832309870806057006121,-0.9187498676311617318290814182546455413103} +#define T_5000_933 {0.3879017015780925059509343100216938182712,-0.9217007485690897894414774782489985227585} +#define T_5000_939 {0.3809412816571296489520648265170166268945,-0.9245992320618827697842334600863978266716} +#define T_5000_951 {0.3669558707417092668912061981245642527938,-0.9302383506006371982621772076527122408152} +#define T_5000_957 {0.3599316747990104481402795499889180064201,-0.9329786650703109085469577621552161872387} +#define T_5000_963 {0.3528870172257748683009026535728480666876,-0.9356659409604988208997156107216142117977} +#define T_5000_969 {0.3458222985012897043866075819096295163035,-0.9383000255031889169643477544013876467943} +#define T_5000_981 {0.3316342851951342707295111722487490624189,-0.9434080246018221593828911863965913653374} +#define T_5000_987 
{0.3245117971828955227842072872590506449342,-0.9458816487749021062469978460285346955061} +#define T_5000_993 {0.3173708611124177458862050116294994950294,-0.9483015008512654109296136084594763815403} +#define T_5000_999 {0.3102118829362908170033108490315498784184,-0.9506674432655832784888616515672765672207} +#define T_5000_1011 {0.2958414291826597097845308326213853433728,-0.9552370641778726856330195005284622311592} +#define T_5000_1017 {0.2886307705460806061203982153529068455100,-0.9574404828990550342382448434364050626755} +#define T_5000_1023 {0.2814037036392603163825754108984256163239,-0.9595894724193817504342973734310362488031} +#define T_5000_1029 {0.2741606393112107631360174764267867431045,-0.9616839105716952218472215463407337665558} +#define T_5000_1041 {0.2596281663112222637401771407894557341933,-0.9657086596162800518072799604851752519608} +#define T_5000_1047 {0.2523395837907840832947670151042984798551,-0.9676387417069935992230966803617775440216} +#define T_5000_1053 {0.2450366561051525171599507757491664960980,-0.9695138148395850974381460218864958733320} +#define T_5000_1059 {0.2377197984159224453115655251167481765151,-0.9713337724186744681631466846738476306200} +#define T_5000_1071 {0.2230459576089092454953544120144215412438,-0.9748079302069329044044820875569712370634} +#define T_5000_1077 {0.2156898086791962099173503020210773684084,-0.9764619329149148985180772797320969402790} +#define T_5000_1083 {0.2083213980746022320200694366576499305665,-0.9780604250782479747172715178749058395624} +#define T_5000_1089 {0.2009411446793378519526385161952930502594,-0.9796033158248074190765919411205686628819} +#define T_5000_1101 {0.1861467883960220559558251807175111025572,-0.9825219453884206366467424231814220547676} +#define T_5000_1107 {0.1787335265471830525463303729338804259896,-0.9838975182853179068942495177907403558493} +#define T_5000_1113 {0.1713101039382865231619490486991708166897,-0.9852171579345608831701497365429531782866} +#define T_5000_1119 
{0.1638769425809012969530442660470725968480,-0.9864807893164143637321217283897567540407} +#define T_5000_1131 {0.1489830944110912269717772460353444330394,-0.9888397431230684375691453169565647840500} +#define T_5000_1137 {0.1415232542938584248126687725743977352977,-0.9899349314445247127736138281761668622494} +#define T_5000_1143 {0.1340553687703875396231012473435839638114,-0.9909738432995269796776938164839521050453} +#define T_5000_1149 {0.1265798623799041233706219600207987241447,-0.9919564196273364942157968471292406320572} +#define T_5000_1161 {0.1116076872968228689275704823558044154197,-0.9937523454745929019082950617303140461445} +#define T_5000_1167 {0.1041118697521965369423568859019724186510,-0.9945655928980761117941256088670343160629} +#define T_5000_1173 {0.0966101335881140305117398270340345334262,-0.9953223006082435597718927056121174246073} +#define T_5000_1179 {0.0891029052681641953181923554438981227577,-0.9960224255872819343338164799206424504519} +#define T_5000_1191 {0.0740736795518646129465878402697853744030,-0.9972527713662407977679436044127214699984} +#define T_5000_1197 {0.0665525365467379420580940063700836617500,-0.9977829222226621830316162231611087918282} +#define T_5000_1203 {0.0590276101196191491093934189393621636555,-0.9982563504649325514250790547521319240332} +#define T_5000_1209 {0.0514993280524322499802636343702033627778,-0.9986730291792944580464563841815106570721} +#define T_5000_1221 {0.0364344090550387106342000720360374543816,-0.9993360465012808280249601011746563017368} +#define T_5000_1227 {0.0288986285451700818105447154948706156574,-0.9995823474172642608692740395781584084034} +#define T_5000_1233 {0.0213612051872230264415453149240420316346,-0.9997718234241997903311016671068500727415} +#define T_5000_1239 {0.0138225674735552497146562345164966245648,-0.9999044637506320221120859059737995266914} +#define T_5000_1251 {-0.0012566367307023254638181919418116194720,-0.9999992104317517727096742419234942644835} +#define T_5000_1257 
{-0.0087963459888593647367782679680203727912,-0.9999613114002182667761076118040364235640} +#define T_5000_1263 {-0.0163355551866041122377470884430294972844,-0.9998665659160452801046403692453168332577} +#define T_5000_1269 {-0.0238738357300562718810166273897266364656,-0.9997149793653861138409411069005727767944} +#define T_5000_1281 {-0.0389458967668873529133222177733841817826,-0.9992413207654209372776676900684833526611} +#define T_5000_1287 {-0.0464788204339129581987855033275991445407,-0.9989192756429681185892377470736391842365} +#define T_5000_1293 {-0.0540091018426487601278296324380789883435,-0.9985404433062039508683938038302585482597} +#define T_5000_1299 {-0.0615363129067472286459761221522057894617,-0.9981048452912363311284593692107591778040} +#define T_5000_1311 {-0.0765798125526930689721538669800793286413,-0.9970634545049750219192219446995295584202} +#define T_5000_1317 {-0.0840952459318636824647796856879722326994,-0.9964577209353436826333449971571099013090} +#define T_5000_1323 {-0.0916058986096605909477119666917133145034,-0.9957953400874682836629858684318605810404} +#define T_5000_1329 {-0.0991113436156031701873558859006152488291,-0.9950763496168069099923059184220619499683} +#define T_5000_1341 {-0.1141049042345042979773594993275764863938,-0.9934687065175402675976101818378083407879} +#define T_5000_1347 {-0.1215921674837524696366486409715435001999,-0.9925801452812780256707014814310241490602} +#define T_5000_1353 {-0.1290725183821812838136366963226464577019,-0.9916351572017207960385576370754279196262} +#define T_5000_1359 {-0.1365455316819254305560349393999786116183,-0.9906337960001668774268068773380946367979} +#define T_5000_1371 {-0.1514678466037124304754968306951923295856,-0.9884621851367073608329860689991619437933} +#define T_5000_1377 {-0.1589162999122667452311929991992656141520,-0.9872920589279519099079607258317992091179} +#define T_5000_1383 {-0.1663557190433848087529611348145408555865,-0.9860658064964824554010647261748090386391} +#define T_5000_1389 
{-0.1737856810761170822665633295400766655803,-0.9847834975530967982493280032940674573183} +#define T_5000_1401 {-0.1886155448747545981813544813121552579105,-0.9820510049033092103698550090484786778688} +#define T_5000_1407 {-0.1960146035828930488609245230691158212721,-0.9806009765354312124330249389458913356066} +#define T_5000_1413 {-0.2034025191250387887276218634724500589073,-0.9790952023238538304283906654745806008577} +#define T_5000_1419 {-0.2107788715081519159699752208325662650168,-0.9775337678698112764408278962946496903896} +#define T_5000_1431 {-0.2254952101357094174094441996203386224806,-0.9742442764552697864743890932004433125257} +#define T_5000_1437 {-0.2328343597761399486056888008533860556781,-0.9725164064979238132124805815692525357008} +#define T_5000_1443 {-0.2401602730970699428603154501615790650249,-0.9707332502937873197623730447958223521709} +#define T_5000_1449 {-0.2474725336302022182799476013315143063664,-0.9688949092128869011730785132385790348053} +#define T_5000_1461 {-0.2620544343642292606233468177379108965397,-0.9650530935808888655103032760962378233671} +#define T_5000_1467 {-0.2693232456037231714596202891698339954019,-0.9630498374318312571062961069401353597641} +#define T_5000_1473 {-0.2765767461797430382119955538655631244183,-0.9609918331976738947730609652353450655937} +#define T_5000_1479 {-0.2838145237405585996626200540049467235804,-0.9588791978731835419225149053090717643499} +#define T_5000_1491 {-0.2982412649022077033578170812688767910004,-0.9544905174539615178019857921754010021687} +#define T_5000_1497 {-0.3054294083622529076826879190775798633695,-0.9522147218497959419991616414336021989584} +#define T_5000_1503 {-0.3126001885721411133189917563868220895529,-0.9498847941222460233845481525349896401167} +#define T_5000_1509 {-0.3197531978826834153650793268752750009298,-0.9475008667245626847730477493314538151026} +#define T_5000_1521 {-0.3340042782833495538241663780354429036379,-0.9425715580731357512433987722033634781837} +#define T_5000_1527 
{-0.3411015392187625727693500721215968951583,-0.9400264570439446076832723520055878907442} +#define T_5000_1533 {-0.3481794089914642675864797638496384024620,-0.9374279167777940235950495662109460681677} +#define T_5000_1539 {-0.3552374852340987598786625767388613894582,-0.9347760849981954223153479688335210084915} +#define T_5000_1551 {-0.3692926533087302898294979058846365660429,-0.9293131529318833239372565913072321563959} +#define T_5000_1557 {-0.3762889461233681620555557856278028339148,-0.9265023632054938396152010682271793484688} +#define T_5000_1563 {-0.3832638474186436106094788556220009922981,-0.9236389030686498191613509334274567663670} +#define T_5000_1569 {-0.3902169606808199020520078192930668592453,-0.9207229353051999964918650221079587936401} +#define T_5000_1581 {-0.4040562432665005254683876501076156273484,-0.9147341429493940356820758097455836832523} +#define T_5000_1587 {-0.4109416258454448023229588216054253280163,-0.9116616588118107156191172180115245282650} +#define T_5000_1593 {-0.4178036469468950353522984642040682956576,-0.9085373479378128225292243769217748194933} +#define T_5000_1599 {-0.4246419164742013530045028346648905426264,-0.9053613879402618858094342613185290247202} +#define T_5000_1611 {-0.4382456471931076569248375562892761081457,-0.8988552456965994696957977794227190315723} +#define T_5000_1617 {-0.4450103350309535521489578968612477183342,-0.8955254333159045065926306961046066135168} +#define T_5000_1623 {-0.4517497246311116820827180617925478145480,-0.8921447115214631295998515270184725522995} +#define T_5000_1629 {-0.4584634328683677706450794175907503813505,-0.8887132725027525603778144613897893577814} +#define T_5000_1641 {-0.4718122800747679979060933419532375410199,-0.8816990259553703079831166178337298333645} +#define T_5000_1647 {-0.4784466601799422513785486898996168747544,-0.8781166171771599415762921125860884785652} +#define T_5000_1653 {-0.4850538412374150176731291139731183648109,-0.8744842886529343717327833473973441869020} +#define T_5000_1659 
{-0.4916334476378494966475329874810995534062,-0.8708022468756737577422200047294609248638} +#define T_5000_1671 {-0.5047084418894302038083310435467865318060,-0.8632898636527268632079312737914733588696} +#define T_5000_1677 {-0.5112030864447778544246148157981224358082,-0.8594599492759001924113704262708779424429} +#define T_5000_1683 {-0.5176686697937453729423395998310297727585,-0.8555811757594800059933959346381016075611} +#define T_5000_1689 {-0.5241048243766349701289186668873298913240,-0.8516537636065120153716634376905858516693} +#define T_5000_1701 {-0.5368873853911163562813158023345749825239,-0.8436539192156289335144947472144849598408} +#define T_5000_1707 {-0.5432330651513449515377374154923018068075,-0.8395819417580838717185542918741703033447} +#define T_5000_1713 {-0.5495478628440793045228929258882999420166,-0.8354622351988777451126111373014282435179} +#define T_5000_1719 {-0.5558314194815828779283606309036258608103,-0.8312950337377751353074017970357090234756} +#define T_5000_1731 {-0.5683033825403166483525296825973782688379,-0.8228190963955773762350531796982977539301} +#define T_5000_1737 {-0.5744910799472588980307818928849883377552,-0.8185108423600949700826845401024911552668} +#define T_5000_1743 {-0.5806461183107004542236495581164490431547,-0.8141560570865490342740145024436060339212} +#define T_5000_1749 {-0.5867681477250071830908950687444303184748,-0.8097549881386122416770945164898876100779} +#define T_5000_1761 {-0.5989117894860358504161013115663081407547,-0.8008150026158564749323431897209957242012} +#define T_5000_1767 {-0.6049327114830929108535428895265795290470,-0.7962765942671635732708068644569721072912} +#define T_5000_1773 {-0.6109192438709307726441011254792101681232,-0.7916929186674402929568827858020085841417} +#define T_5000_1779 {-0.6168710463232528473653815126453991979361,-0.7870642363924658102547482485533691942692} +#define T_5000_1791 {-0.6286691100071480908795251707488205283880,-0.7776729068977652481464701850200071930885} +#define T_5000_1797 
{-0.6345147005347043789313943307206500321627,-0.7729107935624618397341123454680200666189} +#define T_5000_1803 {-0.6403242197568393967443967085273470729589,-0.7681047412904017379631227413483429700136} +#define T_5000_1809 {-0.6460973374102163546695010154508054256439,-0.7632550232991781591351809765910729765892} +#define T_5000_1821 {-0.6575330573229074548891048834775574505329,-0.7534256954256273175474234449211508035660} +#define T_5000_1827 {-0.6631950094769594183929939390509389340878,-0.7484466443273399827162961628346238285303} +#define T_5000_1833 {-0.6688192598886868456276033612084574997425,-0.7434250450462031034604137857968453317881} +#define T_5000_1839 {-0.6744054888270351044710082533129025250673,-0.7383611830533670650211774955096188932657} +#define T_5000_1851 {-0.6854626141847768305837007574154995381832,-0.7281078248137233810410862133721821010113} +#define T_5000_1857 {-0.6909328820214982735947728542669210582972,-0.7229189114565106599741284298943355679512} +#define T_5000_1863 {-0.6963638712552308041026094542758073657751,-0.7176889011336515800110191776184365153313} +#define T_5000_1869 {-0.7017552731415571676265585665532853454351,-0.7124180911641833091607622918672859668732} +#define T_5000_1881 {-0.7124180911641833091607622918672859668732,-0.7017552731415571676265585665532853454351} +#define T_5000_1887 {-0.7176889011336515800110191776184365153313,-0.6963638712552308041026094542758073657751} +#define T_5000_1893 {-0.7229189114565106599741284298943355679512,-0.6909328820214982735947728542669210582972} +#define T_5000_1899 {-0.7281078248137233810410862133721821010113,-0.6854626141847768305837007574154995381832} +#define T_5000_1911 {-0.7383611830533670650211774955096188932657,-0.6744054888270351044710082533129025250673} +#define T_5000_1917 {-0.7434250450462031034604137857968453317881,-0.6688192598886868456276033612084574997425} +#define T_5000_1923 {-0.7484466443273399827162961628346238285303,-0.6631950094769594183929939390509389340878} +#define T_5000_1929 
{-0.7534256954256273175474234449211508035660,-0.6575330573229074548891048834775574505329} +#define T_5000_1941 {-0.7632550232991781591351809765910729765892,-0.6460973374102163546695010154508054256439} +#define T_5000_1947 {-0.7681047412904017379631227413483429700136,-0.6403242197568393967443967085273470729589} +#define T_5000_1953 {-0.7729107935624618397341123454680200666189,-0.6345147005347043789313943307206500321627} +#define T_5000_1959 {-0.7776729068977652481464701850200071930885,-0.6286691100071480908795251707488205283880} +#define T_5000_1971 {-0.7870642363924658102547482485533691942692,-0.6168710463232528473653815126453991979361} +#define T_5000_1977 {-0.7916929186674402929568827858020085841417,-0.6109192438709307726441011254792101681232} +#define T_5000_1983 {-0.7962765942671635732708068644569721072912,-0.6049327114830929108535428895265795290470} +#define T_5000_1989 {-0.8008150026158564749323431897209957242012,-0.5989117894860358504161013115663081407547} +#define T_5000_2001 {-0.8097549881386122416770945164898876100779,-0.5867681477250071830908950687444303184748} +#define T_5000_2007 {-0.8141560570865490342740145024436060339212,-0.5806461183107004542236495581164490431547} +#define T_5000_2013 {-0.8185108423600949700826845401024911552668,-0.5744910799472588980307818928849883377552} +#define T_5000_2019 {-0.8228190963955773762350531796982977539301,-0.5683033825403166483525296825973782688379} +#define T_5000_2031 {-0.8312950337377751353074017970357090234756,-0.5558314194815828779283606309036258608103} +#define T_5000_2037 {-0.8354622351988777451126111373014282435179,-0.5495478628440793045228929258882999420166} +#define T_5000_2043 {-0.8395819417580838717185542918741703033447,-0.5432330651513449515377374154923018068075} +#define T_5000_2049 {-0.8436539192156289335144947472144849598408,-0.5368873853911163562813158023345749825239} +#define T_5000_2061 {-0.8516537636065120153716634376905858516693,-0.5241048243766349701289186668873298913240} +#define T_5000_2067 
{-0.8555811757594800059933959346381016075611,-0.5176686697937453729423395998310297727585} +#define T_5000_2073 {-0.8594599492759001924113704262708779424429,-0.5112030864447778544246148157981224358082} +#define T_5000_2079 {-0.8632898636527268632079312737914733588696,-0.5047084418894302038083310435467865318060} +#define T_5000_2091 {-0.8708022468756737577422200047294609248638,-0.4916334476378494966475329874810995534062} +#define T_5000_2097 {-0.8744842886529343717327833473973441869020,-0.4850538412374150176731291139731183648109} +#define T_5000_2103 {-0.8781166171771599415762921125860884785652,-0.4784466601799422513785486898996168747544} +#define T_5000_2109 {-0.8816990259553703079831166178337298333645,-0.4718122800747679979060933419532375410199} +#define T_5000_2121 {-0.8887132725027525603778144613897893577814,-0.4584634328683677706450794175907503813505} +#define T_5000_2127 {-0.8921447115214631295998515270184725522995,-0.4517497246311116820827180617925478145480} +#define T_5000_2133 {-0.8955254333159045065926306961046066135168,-0.4450103350309535521489578968612477183342} +#define T_5000_2139 {-0.8988552456965994696957977794227190315723,-0.4382456471931076569248375562892761081457} +#define T_5000_2151 {-0.9053613879402618858094342613185290247202,-0.4246419164742013530045028346648905426264} +#define T_5000_2157 {-0.9085373479378128225292243769217748194933,-0.4178036469468950353522984642040682956576} +#define T_5000_2163 {-0.9116616588118107156191172180115245282650,-0.4109416258454448023229588216054253280163} +#define T_5000_2169 {-0.9147341429493940356820758097455836832523,-0.4040562432665005254683876501076156273484} +#define T_5000_2181 {-0.9207229353051999964918650221079587936401,-0.3902169606808199020520078192930668592453} +#define T_5000_2187 {-0.9236389030686498191613509334274567663670,-0.3832638474186436106094788556220009922981} +#define T_5000_2193 {-0.9265023632054938396152010682271793484688,-0.3762889461233681620555557856278028339148} +#define T_5000_2199 
{-0.9293131529318833239372565913072321563959,-0.3692926533087302898294979058846365660429} +#define T_5000_2211 {-0.9347760849981954223153479688335210084915,-0.3552374852340987598786625767388613894582} +#define T_5000_2217 {-0.9374279167777940235950495662109460681677,-0.3481794089914642675864797638496384024620} +#define T_5000_2223 {-0.9400264570439446076832723520055878907442,-0.3411015392187625727693500721215968951583} +#define T_5000_2229 {-0.9425715580731357512433987722033634781837,-0.3340042782833495538241663780354429036379} +#define T_5000_2241 {-0.9475008667245626847730477493314538151026,-0.3197531978826834153650793268752750009298} +#define T_5000_2247 {-0.9498847941222460233845481525349896401167,-0.3126001885721411133189917563868220895529} +#define T_5000_2253 {-0.9522147218497959419991616414336021989584,-0.3054294083622529076826879190775798633695} +#define T_5000_2259 {-0.9544905174539615178019857921754010021687,-0.2982412649022077033578170812688767910004} +#define T_5000_2271 {-0.9588791978731835419225149053090717643499,-0.2838145237405585996626200540049467235804} +#define T_5000_2277 {-0.9609918331976738947730609652353450655937,-0.2765767461797430382119955538655631244183} +#define T_5000_2283 {-0.9630498374318312571062961069401353597641,-0.2693232456037231714596202891698339954019} +#define T_5000_2289 {-0.9650530935808888655103032760962378233671,-0.2620544343642292606233468177379108965397} +#define T_5000_2301 {-0.9688949092128869011730785132385790348053,-0.2474725336302022182799476013315143063664} +#define T_5000_2307 {-0.9707332502937873197623730447958223521709,-0.2401602730970699428603154501615790650249} +#define T_5000_2313 {-0.9725164064979238132124805815692525357008,-0.2328343597761399486056888008533860556781} +#define T_5000_2319 {-0.9742442764552697864743890932004433125257,-0.2254952101357094174094441996203386224806} +#define T_5000_2331 {-0.9775337678698112764408278962946496903896,-0.2107788715081519159699752208325662650168} +#define T_5000_2337 
{-0.9790952023238538304283906654745806008577,-0.2034025191250387887276218634724500589073} +#define T_5000_2343 {-0.9806009765354312124330249389458913356066,-0.1960146035828930488609245230691158212721} +#define T_5000_2349 {-0.9820510049033092103698550090484786778688,-0.1886155448747545981813544813121552579105} +#define T_5000_2361 {-0.9847834975530967982493280032940674573183,-0.1737856810761170822665633295400766655803} +#define T_5000_2367 {-0.9860658064964824554010647261748090386391,-0.1663557190433848087529611348145408555865} +#define T_5000_2373 {-0.9872920589279519099079607258317992091179,-0.1589162999122667452311929991992656141520} +#define T_5000_2379 {-0.9884621851367073608329860689991619437933,-0.1514678466037124304754968306951923295856} +#define T_5000_2391 {-0.9906337960001668774268068773380946367979,-0.1365455316819254305560349393999786116183} +#define T_5000_2397 {-0.9916351572017207960385576370754279196262,-0.1290725183821812838136366963226464577019} +#define T_5000_2403 {-0.9925801452812780256707014814310241490602,-0.1215921674837524696366486409715435001999} +#define T_5000_2409 {-0.9934687065175402675976101818378083407879,-0.1141049042345042979773594993275764863938} +#define T_5000_2421 {-0.9950763496168069099923059184220619499683,-0.0991113436156031701873558859006152488291} +#define T_5000_2427 {-0.9957953400874682836629858684318605810404,-0.0916058986096605909477119666917133145034} +#define T_5000_2433 {-0.9964577209353436826333449971571099013090,-0.0840952459318636824647796856879722326994} +#define T_5000_2439 {-0.9970634545049750219192219446995295584202,-0.0765798125526930689721538669800793286413} +#define T_5000_2451 {-0.9981048452912363311284593692107591778040,-0.0615363129067472286459761221522057894617} +#define T_5000_2457 {-0.9985404433062039508683938038302585482597,-0.0540091018426487601278296324380789883435} +#define T_5000_2463 {-0.9989192756429681185892377470736391842365,-0.0464788204339129581987855033275991445407} +#define T_5000_2469 
{-0.9992413207654209372776676900684833526611,-0.0389458967668873529133222177733841817826} +#define T_5000_2481 {-0.9997149793653861138409411069005727767944,-0.0238738357300562718810166273897266364656} +#define T_5000_2487 {-0.9998665659160452801046403692453168332577,-0.0163355551866041122377470884430294972844} +#define T_5000_2493 {-0.9999613114002182667761076118040364235640,-0.0087963459888593647367782679680203727912} +#define T_5000_2499 {-0.9999992104317517727096742419234942644835,-0.0012566367307023254638181919418116194720} +#define T_5000_2511 {-0.9999044637506320221120859059737995266914,0.0138225674735552497146562345164966245648} +#define T_5000_2517 {-0.9997718234241997903311016671068500727415,0.0213612051872230264415453149240420316346} +#define T_5000_2523 {-0.9995823474172642608692740395781584084034,0.0288986285451700818105447154948706156574} +#define T_5000_2529 {-0.9993360465012808280249601011746563017368,0.0364344090550387106342000720360374543816} +#define T_5000_2541 {-0.9986730291792944580464563841815106570721,0.0514993280524322499802636343702033627778} +#define T_5000_2547 {-0.9982563504649325514250790547521319240332,0.0590276101196191491093934189393621636555} +#define T_5000_2553 {-0.9977829222226621830316162231611087918282,0.0665525365467379420580940063700836617500} +#define T_5000_2559 {-0.9972527713662407977679436044127214699984,0.0740736795518646129465878402697853744030} +#define T_5000_2571 {-0.9960224255872819343338164799206424504519,0.0891029052681641953181923554438981227577} +#define T_5000_2577 {-0.9953223006082435597718927056121174246073,0.0966101335881140305117398270340345334262} +#define T_5000_2583 {-0.9945655928980761117941256088670343160629,0.1041118697521965369423568859019724186510} +#define T_5000_2589 {-0.9937523454745929019082950617303140461445,0.1116076872968228689275704823558044154197} +#define T_5000_2601 {-0.9919564196273364942157968471292406320572,0.1265798623799041233706219600207987241447} +#define T_5000_2607 
{-0.9909738432995269796776938164839521050453,0.1340553687703875396231012473435839638114} +#define T_5000_2613 {-0.9899349314445247127736138281761668622494,0.1415232542938584248126687725743977352977} +#define T_5000_2619 {-0.9888397431230684375691453169565647840500,0.1489830944110912269717772460353444330394} +#define T_5000_2631 {-0.9864807893164143637321217283897567540407,0.1638769425809012969530442660470725968480} +#define T_5000_2637 {-0.9852171579345608831701497365429531782866,0.1713101039382865231619490486991708166897} +#define T_5000_2643 {-0.9838975182853179068942495177907403558493,0.1787335265471830525463303729338804259896} +#define T_5000_2649 {-0.9825219453884206366467424231814220547676,0.1861467883960220559558251807175111025572} +#define T_5000_2661 {-0.9796033158248074190765919411205686628819,0.2009411446793378519526385161952930502594} +#define T_5000_2667 {-0.9780604250782479747172715178749058395624,0.2083213980746022320200694366576499305665} +#define T_5000_2673 {-0.9764619329149148985180772797320969402790,0.2156898086791962099173503020210773684084} +#define T_5000_2679 {-0.9748079302069329044044820875569712370634,0.2230459576089092454953544120144215412438} +#define T_5000_2691 {-0.9713337724186744681631466846738476306200,0.2377197984159224453115655251167481765151} +#define T_5000_2697 {-0.9695138148395850974381460218864958733320,0.2450366561051525171599507757491664960980} +#define T_5000_2703 {-0.9676387417069935992230966803617775440216,0.2523395837907840832947670151042984798551} +#define T_5000_2709 {-0.9657086596162800518072799604851752519608,0.2596281663112222637401771407894557341933} +#define T_5000_2721 {-0.9616839105716952218472215463407337665558,0.2741606393112107631360174764267867431045} +#define T_5000_2727 {-0.9595894724193817504342973734310362488031,0.2814037036392603163825754108984256163239} +#define T_5000_2733 {-0.9574404828990550342382448434364050626755,0.2886307705460806061203982153529068455100} +#define T_5000_2739 
{-0.9552370641778726856330195005284622311592,0.2958414291826597097845308326213853433728} +#define T_5000_2751 {-0.9506674432655832784888616515672765672207,0.3102118829362908170033108490315498784184} +#define T_5000_2757 {-0.9483015008512654109296136084594763815403,0.3173708611124177458862050116294994950294} +#define T_5000_2763 {-0.9458816487749021062469978460285346955061,0.3245117971828955227842072872590506449342} +#define T_5000_2769 {-0.9434080246018221593828911863965913653374,0.3316342851951342707295111722487490624189} +#define T_5000_2781 {-0.9383000255031889169643477544013876467943,0.3458222985012897043866075819096295163035} +#define T_5000_2787 {-0.9356659409604988208997156107216142117977,0.3528870172257748683009026535728480666876} +#define T_5000_2793 {-0.9329786650703109085469577621552161872387,0.3599316747990104481402795499889180064201} +#define T_5000_2799 {-0.9302383506006371982621772076527122408152,0.3669558707417092668912061981245642527938} +#define T_5000_2811 {-0.9245992320618827697842334600863978266716,0.3809412816571296489520648265170166268945} +#define T_5000_2817 {-0.9217007485690897894414774782489985227585,0.3879017015780925059509343100216938182712} +#define T_5000_2823 {-0.9187498676311617318290814182546455413103,0.3948400698101989236832309870806057006121} +#define T_5000_2829 {-0.9157467570017023827944058211869560182095,0.4017559919165675230701140208111610263586} +#define T_5000_2841 {-0.9095845325188364194346490876341704279184,0.4155189264070768273562350714200874790549} +#define T_5000_2847 {-0.9064257689795853778491618868429213762283,0.4223651563869436631470932752563385292888} +#define T_5000_2853 {-0.9032154763571986766379495747969485819340,0.4291873754770037519001846249011578038335} +#define T_5000_2859 {-0.8999538371524847812921166223532054573298,0.4359851958432979235347204394201980903745} +#define T_5000_2871 {-0.8932772635836871977232931385515257716179,0.4495060960258935867095431149209616705775} +#define T_5000_2877 
{-0.8898627087738110175152428382716607302427,0.4562284071972453891596899211435811594129} +#define T_5000_2883 {-0.8863975664684456967634673674183432012796,0.4629247823986284426922566126449964940548} +#define T_5000_2889 {-0.8828820336562601678309647468267939984798,0.4695948409501388698394919174461392685771} +#define T_5000_2901 {-0.8757005987779062916587236031773500144482,0.4828544928858136686677937632339308038354} +#define T_5000_2907 {-0.8720351049666303921981125313322991132736,0.4894433324766390702720286753901746124029} +#define T_5000_2913 {-0.8683200371351107449413575523067265748978,0.4960043478737660715971458103012992069125} +#define T_5000_2919 {-0.8645556064799456708058755793899763375521,0.5025371660923133587672850808303337544203} +#define T_5000_2931 {-0.8568795155039573563371391173859592527151,0.5155167270901142018146856571547687053680} +#define T_5000_2937 {-0.8529682915585590219365030861808918416500,0.5219627319988210389922755894076544791460} +#define T_5000_2943 {-0.8490085775155576941486401665315497666597,0.5283790640298017260434448871819768100977} +#define T_5000_2949 {-0.8450005984793591551351710222661495208740,0.5347653584232292534039743259199894964695} +#define T_5000_2961 {-0.8368407595510399810123658426164183765650,0.5474463838167520357558260002406314015388} +#define T_5000_2967 {-0.8326893635347630828746900988335255533457,0.5537403939176471778793597877665888518095} +#define T_5000_2973 {-0.8284906302506062081292270704580005258322,0.5600029246235713742407824611291289329529} +#define T_5000_2979 {-0.8242447983908965936095114557247143238783,0.5662336199180955098597678443184122443199} +#define T_5000_2991 {-0.8156128070878603253390792815480381250381,0.5785980892763651350918507887399755418301} +#define T_5000_2997 {-0.8112271383616037612185323268931824713945,0.5847311604366945969601943033921997994184} +// Pre-computed twiddles for N=6561 +#define T_6561_1 {0.9999995414469015075908941980742383748293,-0.0009576565076647147170194052101521720033} 
+#define T_6561_2 {0.9999981657880266938676072641101200133562,-0.0019153121370567118605249268270540596859} +#define T_6561_4 {0.9999926631588352821111698176537174731493,-0.0038306172479365147715002493100655556191} +#define T_6561_5 {0.9999885361935653138587554167315829545259,-0.0047882649728861340424068693266690388555} +#define T_6561_7 {0.9999775309806009371271784402779303491116,-0.0067035463704822996836019122213201626437} +#define T_6561_8 {0.9999706527429996771871856253710575401783,-0.0076611782866124076771252582318538770778} +#define T_6561_10 {0.9999541450370988959761575642914976924658,-0.0095764201622866532842559550431360548828} +#define T_6561_11 {0.9999445155839388199581208027666434645653,-0.0105340283653505995681287643606083292980} +#define T_6561_13 {0.9999225055213553092769984687038231641054,-0.0124492149107908560423663146821127156727} +#define T_6561_14 {0.9999101249321176165807401048368774354458,-0.0134067914967377181667540497755908290856} +#define T_6561_16 {0.9998826126945214998187339006108231842518,-0.0153219069040886092525344253090224810876} +#define T_6561_17 {0.9998674810713947813667346053989604115486,-0.0162794439691284278937111196228215703741} +#define T_6561_19 {0.9998344668858706318914641997253056615591,-0.0181944724311217456702571837467985460535} +#define T_6561_20 {0.9998165843537506480842580458556767553091,-0.0191519620717907727847073573457237216644} +#define T_6561_22 {0.9997780684927959349295178981265053153038,-0.0210668877818759522901892466961726313457} +#define T_6561_23 {0.9997574351992842833780628097883891314268,-0.0220243220951018697784107303050404880196} +#define T_6561_25 {0.9997134179808070397754704572435002774000,-0.0239391292475764558278505944599601207301} +#define T_6561_26 {0.9996900340962099340558211224561091512442,-0.0248965003307436184842860171784195699729} +#define T_6561_28 {0.9996405158835259818772556172916665673256,-0.0268111731208837272832923304122232366353} +#define T_6561_29 
{0.9996143816008526972183290126849897205830,-0.0277684730718983918684727996151195839047} +#define T_6561_31 {0.9995593628026830934629742841934785246849,-0.0296829956960891518102929609312923275866} +#define T_6561_32 {0.9995304783376448565235250498517416417599,-0.0306402166134446887757469824009604053572} +#define T_6561_34 {0.9994699594081120075372837163740769028664,-0.0325545732693107020550016272864013444632} +#define T_6561_35 {0.9994383249991196649375524430070072412491,-0.0335117072521528280848457370666437782347} +#define T_6561_37 {0.9993723064377437736993670114316046237946,-0.0354258821386885872084526738490239949897} +#define T_6561_38 {0.9993379223459063487311482276709284633398,-0.0363829212868805665359595025165617698804} +#define T_6561_40 {0.9992664046976014180501124428701587021351,-0.0382968986045808673646106967680680099875} +#define T_6561_41 {0.9992292712067232240968905898625962436199,-0.0392538350187687304360117934720619814470} +#define T_6561_43 {0.9991522550617923936755460090353153645992,-0.0411675989697591060423320641348254866898} +#define T_6561_44 {0.9991123724783717019448658902547322213650,-0.0421244247514368438944920569610985694453} +#define T_6561_46 {0.9990298584725023633978935322375036776066,-0.0440379595396039463794934931684110779315} +#define T_6561_47 {0.9989872271257279612299839754996355623007,-0.0449946667911787015481372975500562461093} +#define T_6561_49 {0.9988992159399867620805935075622983276844,-0.0469079566223006699798858676331292372197} +#define T_6561_50 {0.9988538361817356214800156521960161626339,-0.0478645374471579274078258947611175244674} +#define T_6561_52 {0.9987603285425625809779148767120204865932,-0.0497775665290347835156836708847549743950} +#define T_6561_53 {0.9987122007473970830560006106679793447256,-0.0507340130316035475832592283040867187083} +#define T_6561_55 {0.9986131974265994859507600267534144222736,-0.0526467655741875012465058603083889465779} +#define T_6561_56 
{0.9985623219917639792342356486187782138586,-0.0536030698600054728020225525142450351268} +#define T_6561_58 {0.9984578238065104915932579388027079403400,-0.0555155300755312830496279730141395702958} +#define T_6561_59 {0.9984042011519284054443801323941443115473,-0.0564716842513100086842214864191191736609} +#define T_6561_61 {0.9982942089647418582032400991010945290327,-0.0583838363544252961223612885532929794863} +#define T_6561_62 {0.9982378395330119280615122079325374215841,-0.0593398325281153243837550803618796635419} +#define T_6561_64 {0.9981223542517623226189016349962912499905,-0.0612516607360108489288563760055694729090} +#define T_6561_65 {0.9980632385081550372873948617780115455389,-0.0622074910168668587795437474596838001162} +#define T_6561_67 {0.9979422610860524400777649134397506713867,-0.0641189795494068043302249293446948286146} +#define T_6561_68 {0.9978803995185061559425321320304647088051,-0.0650746360480527613612267145981604699045} +#define T_6561_70 {0.9977539309540920386965012767177540808916,-0.0669857691279049510812981793606013525277} +#define T_6561_71 {0.9976893240732092049682933065923862159252,-0.0679412439563991943369813952813274227083} +#define T_6561_73 {0.9975573654103488951960798658546991646290,-0.0698520058091653822662792094888573046774} +#define T_6561_74 {0.9974900137493915019959445089625660330057,-0.0708072910810657596414330328116193413734} +#define T_6561_76 {0.9973525660772653012031696562189608812332,-0.0727176659354117349565171934955287724733} +#define T_6561_77 {0.9972824701921506607149581213889177888632,-0.0736727537658407038989594184386078268290} +#define T_6561_79 {0.9971395346452448515961464181600604206324,-0.0755827258536265617072658073993807192892} +#define T_6561_80 {0.9970666951145409351298098954430315643549,-0.0765376083593362205315102642089186701924} +#define T_6561_82 {0.9969182728726385667172849025519099086523,-0.0784471619157464661320489085483131930232} +#define T_6561_83 
{0.9968426902975592307498686750477645546198,-0.0794018312151836408441241133004950825125} +#define T_6561_85 {0.9966887825857301264065313262108247727156,-0.0813109504788573078659652537680813111365} +#define T_6561_86 {0.9966104575901302276008664193795993924141,-0.0822653986922286667438086738002311903983} +#define T_6561_88 {0.9964510656787211040352758573135361075401,-0.0841740679053894214067810253254720009863} +#define T_6561_89 {0.9963699989090911701694608382240403443575,-0.0851282871547264369249674587081244681031} +#define T_6561_91 {0.9962051241137150903171004756586626172066,-0.0870364905633126267892052396746294107288} +#define T_6561_92 {0.9961213162391764353031931022997014224529,-0.0879904729725366346881898493847984354943} +#define T_6561_94 {0.9959509599207015950739219078968744724989,-0.0898981948263313790370432343479478731751} +#define T_6561_95 {0.9958644116330002127313036908162757754326,-0.0908519325213185680034655433701118454337} +#define T_6561_97 {0.9956885751975395049129247126984409987926,-0.0927591570740796678151696141867432743311} +#define T_6561_98 {0.9955992872110407398977827142516616731882,-0.0937126421827261107955209240572003182024} +#define T_6561_100 {0.9954179721099392086358648157329298555851,-0.0956193536923160836149548913454054854810} +#define T_6561_101 {0.9953259451616217612368586742377374321222,-0.0965725783446026581069432381809747312218} +#define T_6561_103 {0.9951391528914453887821878197428304702044,-0.0984787610731186618950872002642427105457} +#define T_6561_104 {0.9950443877408949866492093860870227217674,-0.0994317174011760812613047733066196087748} +#define T_6561_106 {0.9948521198434181478376103768823668360710,-0.1013373556150797966113330517146096099168} +#define T_6561_107 {0.9947546172728217728220556637097615748644,-0.1022900357532534748594699181012401822954} +#define T_6561_109 {0.9945568753350140234203990985406562685966,-0.1041951137235009872128443930705543607473} +#define T_6561_110 
{0.9944566361491531392147180667961947619915,-0.1051475098084160009204168773067067377269} +#define T_6561_112 {0.9942534218031667814230445401335600763559,-0.1070520118105876133940412842093792278320} +#define T_6561_113 {0.9941504468294104501779884230927564203739,-0.1080041159812136081219691163823881652206} +#define T_6561_115 {0.9939417617525667658640031731920316815376,-0.1099080262956436265797677265254606027156} +#define T_6561_116 {0.9938360518408655419619890381000004708767,-0.1108598306933596955303755748900584876537} +#define T_6561_118 {0.9936218977556406928286492075130809098482,-0.1127631336052662136548718763151555322111} +#define T_6561_119 {0.9935134537785189623448900420044083148241,-0.1137146303739257624521030720643466338515} +#define T_6561_121 {0.9932938324525303341872017881541978567839,-0.1156173101735403080381203722026839386672} +#define T_6561_122 {0.9931826553050793204846513617667369544506,-0.1165684914595358778743872107952483929694} +#define T_6561_124 {0.9929575685510704241565349548181984573603,-0.1184705324422331701450517016382946167141} +#define T_6561_125 {0.9928436591509409714362277554755564779043,-0.1194213903945613025614491675696626771241} +#define T_6561_127 {0.9926131088267667879065925262693781405687,-0.1213227768609887596840124501795799005777} +#define T_6561_128 {0.9924964681141612565795639966381713747978,-0.1222733036313147642060172870515089016408} +#define T_6561_130 {0.9922604561227728048322660470148548483849,-0.1241740198875222050967082054739876184613} +#define T_6561_131 {0.9921410850604380771144974460185039788485,-0.1251242076302449546254536016931524500251} +#define T_6561_133 {0.9918996133498667600036924341111443936825,-0.1270242379878140370763617283955682069063} +#define T_6561_134 {0.9917775129230854691542162981932051479816,-0.1279740788601307632799120028721517883241} +#define T_6561_136 {0.9915305834864269751705023736576549708843,-0.1298734076363045331081735866973758675158} +#define T_6561_137 
{0.9914057547030101780194399907486513257027,-0.1308228937982754136459817573268082924187} +#define T_6561_139 {0.9911533695784077169221859548997599631548,-0.1327215053160877844540266323747346177697} +#define T_6561_140 {0.9910258134686860120865503631648607552052,-0.1336706289307008355127237564374809153378} +#define T_6561_142 {0.9907679747393141056477361416909843683243,-0.1355685075191058464039173259152448736131} +#define T_6561_143 {0.9906376923561293068587474408559501171112,-0.1365172607523415793107091076308279298246} +#define T_6561_145 {0.9903744021501761363168725438299588859081,-0.1384143907463428746495992527343332767487} +#define T_6561_146 {0.9902413945688726126803658189601264894009,-0.1393627657672387998299967648563324473798} +#define T_6561_148 {0.9899726550595224772166602633660659193993,-0.1412591315080189147135314442493836395442} +#define T_6561_149 {0.9898369233779383824511910461296793073416,-0.1422071204887343509604136215784819796681} +#define T_6561_151 {0.9895627367833538245989188908424694091082,-0.1441027063237838856668560083562624640763} +#define T_6561_152 {0.9894242821218117711623563081957399845123,-0.1450503014396644918537759849641588516533} +#define T_6561_154 {0.9891446507051155911938167264452204108238,-0.1469450917229113695583464505034498870373} +#define T_6561_155 {0.9890034742064133244099366493173874914646,-0.1478922851525537596195647438435116782784} +#define T_6561_157 {0.9887184002756695955227428385114762932062,-0.1497862642444923730877803791372571140528} +#define T_6561_158 {0.9885745031050711117970308805524837225676,-0.1507330481698086199759956116395187564194} +#define T_6561_160 {0.9882839890132657512111791220377199351788,-0.1526262004376288394791316704868222586811} +#define T_6561_161 {0.9881373723584911950013065506936982274055,-0.1535725670439111179010893692975514568388} +#define T_6561_163 {0.9878414205035133122123625071253627538681,-0.1554648768616274101539431740093277767301} +#define T_6561_164 
{0.9876920855747293170878720047767274081707,-0.1564108183376123895058640300703700631857} +#define T_6561_166 {0.9873906983993510078079225422698073089123,-0.1583022700861927700710651834015152417123} +#define T_6561_167 {0.9872386464291607044430065798223949968815,-0.1592477786241260628852245417874655686319} +#define T_6561_169 {0.9869318264210167335193091275868937373161,-0.1611383566916209930663939076112001203001} +#define T_6561_170 {0.9867770586644496466632858755474444478750,-0.1620834244873217699911549516400555148721} +#define T_6561_172 {0.9864648083560173530415227105550002306700,-0.1639731132689930259704880199933541007340} +#define T_6561_173 {0.9863073260905188544001021000440232455730,-0.1649177325219182421722763365323771722615} +#define T_6561_175 {0.9859896480590968348423075440223328769207,-0.1668065164203677563925509730324847623706} +#define T_6561_176 {0.9858294525845178180034622528182808309793,-0.1677506793336766277580096584642888046801} +#define T_6561_178 {0.9855063494522048328505547942768316715956,-0.1696385427589751637711401599517557770014} +#define T_6561_179 {0.9853434420907908330988789202820044010878,-0.1705822415395935598425580792536493390799} +#define T_6561_181 {0.9850149165244643789662859489908441901207,-0.1724691689094094149137248450642800889909} +#define T_6561_182 {0.9848492986208448041196561462129466235638,-0.1734123957680940852910111971141304820776} +#define T_6561_184 {0.9845153533321385763699140625249128788710,-0.1752983715078217374916391690931050106883} +#define T_6561_185 {0.9843470262533153825046383644803427159786,-0.1762411186592246770121761301197693683207} +#define T_6561_187 {0.9840076639985974038538074637472163885832,-0.1781261272021132935350351544911973178387} +#define T_6561_188 {0.9838366291339338820520765693800058215857,-0.1790683868648459686756524433803861029446} +#define T_6561_190 {0.9834918527142835209531313012121245265007,-0.1809524126521279141499576326168607920408} +#define T_6561_191 
{0.9833181114754931950727723233285360038280,-0.1818941770488254894289070762170013040304} +#define T_6561_193 {0.9829679237366779620543866258230991661549,-0.1837772045298447509686923240224132314324} +#define T_6561_194 {0.9827914775578124872978946768853347748518,-0.1847184658872302875920468068215996026993} +#define T_6561_196 {0.9824358813902647202809248483390547335148,-0.1866004795195708443333870718561229296029} +#define T_6561_197 {0.9822567317277020038090995512902736663818,-0.1875412300685194710858638700301526114345} +#define T_6561_199 {0.9818957300664948872892523468181025236845,-0.1894222143181335249462193814906640909612} +#define T_6561_200 {0.9817138783989275419017417334544006735086,-0.1903624462937366090820034969510743394494} +#define T_6561_202 {0.9813474742237511261322424616082571446896,-0.1922423856350728155195639601515722461045} +#define T_6561_203 {0.9811629220521738137250622457941062748432,-0.1931820912767019393641021451912820339203} +#define T_6561_205 {0.9807911183873100346986007025407161563635,-0.1950609701928336381371309471433050930500} +#define T_6561_206 {0.9806038672350072538108634034870192408562,-0.1960001417442047144668038072268245741725} +#define T_6561_208 {0.9802266671493052863084471937327180057764,-0.1978779447269579105928016815596492961049} +#define T_6561_209 {0.9800367185618388266021838717279024422169,-0.1988165744361952702590201624843757599592} +#define T_6561_211 {0.9796541251686896600858744932338595390320,-0.2006932859862766704850400856230407953262} +#define T_6561_212 {0.9794614807138857237589490978280082345009,-0.2016313661059769002381614200203330256045} +#define T_6561_214 {0.9790734971711962941753881750628352165222,-0.2035069707331018384888210448480094783008} +#define T_6561_215 {0.9788781584391328394190168182831257581711,-0.2044444935203978963578208549733972176909} +#define T_6561_217 {0.9784847879493001610029523362754844129086,-0.2063189757434181759165880976070184260607} +#define T_6561_218 
{0.9782867565522933572808028657163958996534,-0.2072559334600431735218251105834497138858} +#define T_6561_220 {0.9778880023621780992471030913293361663818,-0.2091292778070749092123037371493410319090} +#define T_6561_221 {0.9776872799347695597305119008524343371391,-0.2100656627194259773450113470971700735390} +#define T_6561_223 {0.9772831453356690678546669914794620126486,-0.2119378537279772711787728667331975884736} +#define T_6561_224 {0.9770797335346116385679238192096818238497,-0.2128736581071795086472775437869131565094} +#define T_6561_226 {0.9766702218622330677888498939864803105593,-0.2147446803242779589382394078711513429880} +#define T_6561_227 {0.9764641223664777269775072454649489372969,-0.2156798964462481316139985665358835831285} +#define T_6561_229 {0.9760492370009103968442332188715226948261,-0.2175497344285685918929829085755045525730} +#define T_6561_230 {0.9758404515115922661649960900831501930952,-0.2184843545740789150233496229702723212540} +#define T_6561_232 {0.9754201958772792391272332679363898932934,-0.2203529928880708088634321484278189018369} +#define T_6561_233 {0.9752087261177034838155464058218058198690,-0.2212870093428126461176930206420365720987} +#define T_6561_235 {0.9747831036834142537372827064245939254761,-0.2231544325648274207374299749062629416585} +#define T_6561_236 {0.9745689513990417607303129443607758730650,-0.2240878376194749554972673877273336984217} +#define T_6561_238 {0.9741379656778428319796603318536654114723,-0.2259540303358933133193175990527379326522} +#define T_6561_239 {0.9739211326362761100838838501658756285906,-0.2268868162861672477248475843225605785847} +#define T_6561_241 {0.9734847871855024648013454680040013045073,-0.2287517630935265167124725849134847521782} +#define T_6561_242 {0.9732652751764707677040178168681450188160,-0.2296839222402575486636777668536524288356} +#define T_6561_244 {0.9728235735976962228477304961415939033031,-0.2315476077453786363236076795146800577641} +#define T_6561_245 
{0.9726013844330406721283566184865776449442,-0.2324791323945711307707995274540735408664} +#define T_6561_247 {0.9721543303720483475416358487564139068127,-0.2343415412146857002007038772717351093888} +#define T_6561_248 {0.9719294658857076107949524157447740435600,-0.2352724236775811106348044177138945087790} +#define T_6561_250 {0.9714770630324595090954176157538313418627,-0.2371335404404585345261580187070649117231} +#define T_6561_251 {0.9712495250804538127198384245275519788265,-0.2380637730335988522245571630264748819172} +#define T_6561_253 {0.9707917771690610653223529880051501095295,-0.2399235823776731113543547735389438457787} +#define T_6561_254 {0.9705615676294774285537414471036754548550,-0.2408531574229643423823432613062323071063} +#define T_6561_256 {0.9700984784381686543142109258042182773352,-0.2427116439974607298157849299968802370131} +#define T_6561_257 {0.9698655992111453461035353029728867113590,-0.2436405538222362887612604254172765649855} +#define T_6561_259 {0.9693971725622357871188228273240383714437,-0.2454977022872981418100124528791639022529} +#define T_6561_260 {0.9691616255699464499429041097755543887615,-0.2464259392243821067403075630863895639777} +#define T_6561_262 {0.9686878653298066632615359594637993723154,-0.2482817342511975666763390790947596542537} +#define T_6561_263 {0.9684496525164446589784006391710136085749,-0.2492092906389679618506249880738323554397} +#define T_6561_265 {0.9679705625954683201328521136019844561815,-0.2510637169098963172864102944004116579890} +#define T_6561_266 {0.9677296859272304097032701974967494606972,-0.2519905850923483958681003969104494899511} +#define T_6561_268 {0.9672452702798021162422514862555544823408,-0.2538436273010465926702750039112288504839} +#define T_6561_269 {0.9670017317448723614958794314588885754347,-0.2547697996278561194394285394082544371486} +#define T_6561_271 {0.9665119943693349924274116347078233957291,-0.2566214424794049930866890463221352547407} +#define T_6561_272 
{0.9662657959778685468066328212444204837084,-0.2575469113059912218410829609638312831521} +#define T_6561_274 {0.9657707409164902889742165825737174600363,-0.2593971395170219795822674768714932724833} +#define T_6561_275 {0.9655218847005965221441670109925325959921,-0.2603218972046107970719219792954390868545} +#define T_6561_277 {0.9650215160395374525137413002084940671921,-0.2621706955034309172170026158710243180394} +#define T_6561_278 {0.9647700040532634080392426767502911388874,-0.2630947344191179038119798860861919820309} +#define T_6561_280 {0.9642643259225417429192361851164605468512,-0.2649420875458373680899626378959510475397} +#define T_6561_281 {0.9640101602418548187856117692717816680670,-0.2658654000626508584481655361742014065385} +#define T_6561_283 {0.9634991768153130520246918422344606369734,-0.2677112927693079957869315421703504398465} +#define T_6561_284 {0.9632423595380842362700946068798657506704,-0.2686338712662719729884486241644481197000} +#define T_6561_286 {0.9627260750333543892764964766683988273144,-0.2704782883169593032945954291790258139372} +#define T_6561_287 {0.9624666082793409405127249556244350969791,-0.2714001251791562929760459610406542196870} +#define T_6561_289 {0.9619450269578097012512785113358404487371,-0.2732430513501463709147287772793788462877} +#define T_6561_290 {0.9616829128686373850953827968623954802752,-0.2741641389687803909147589820349821820855} +#define T_6561_292 {0.9611560390354114691291442795773036777973,-0.2760055590486513721337757942819735035300} +#define T_6561_293 {0.9608912797745571277019394074159208685160,-0.2769258898211106045827989419194636866450} +#define T_6561_295 {0.9603591177784269738992861675797030329704,-0.2787657886108719229589780752576189115644} +#define T_6561_296 {0.9600917155312004291900507269019726663828,-0.2796853549407914418800658040709095075727} +#define T_6561_298 {0.9595542697646052276994055318937171250582,-0.2815237172540091536987461040553171187639} +#define T_6561_299 
{0.9592842267381314069751852002809755504131,-0.2824425115513338746531246670201653614640} +#define T_6561_301 {0.9587415016371217957313888291537296026945,-0.2842793222142560027876356798515189439058} +#define T_6561_302 {0.9584688200603226349016949825454503297806,-0.2851973368953029663863674159074435010552} +#define T_6561_304 {0.9579208201045249504446132959856186062098,-0.2870325807469848444775095686054555699229} +#define T_6561_305 {0.9576455022281006312923068435338791459799,-0.2879498082345062215381403802894055843353} +#define T_6561_307 {0.9570922319406794942509009160858113318682,-0.2897834701269354495956065420614322647452} +#define T_6561_308 {0.9568142800370903477968909101036842912436,-0.2906999028501807136315449042740510776639} +#define T_6561_310 {0.9562557439847114704178920874255709350109,-0.2925319676484024467022493354306789115071} +#define T_6561_311 {0.9559751603481586590405072456633206456900,-0.2934475980431809905013551542651839554310} +#define T_6561_313 {0.9554113631409510976055798892048187553883,-0.2952780506254226722262501425575464963913} +#define T_6561_314 {0.9551281500873581853383598172513302415609,-0.2961928711341664244294236141286091879010} +#define T_6561_316 {0.9545590963788762595143566613842267543077,-0.2980216963919626316226185736013576388359} +#define T_6561_317 {0.9542732562458698941654233749432023614645,-0.2989356994637881181908767302957130596042} +#define T_6561_319 {0.9536989507330549953323384215764235705137,-0.3007628823021052388853036063665058463812} +#define T_6561_320 {0.9534104858799454795814654062269255518913,-0.3016760603928761441672179444140056148171} +#define T_6561_322 {0.9528309333030868799596646567806601524353,-0.3035015857302371111714478502108249813318} +#define T_6561_323 {0.9525398461108489644999508527689613401890,-0.3044139313026265059036745697085279971361} +#define T_6561_325 {0.9519550512535450703666128902113996446133,-0.3062377840712351417806758036022074520588} +#define T_6561_326 
{0.9516613441247980809123419021489098668098,-0.3071492895947876000661835860228165984154} +#define T_6561_328 {0.9510713118139167976394787729077506810427,-0.3089714547406529621120796491595683619380} +#define T_6561_329 {0.9507749871729048729562805419845972210169,-0.3098821126918469004429823598911752924323} +#define T_6561_331 {0.9501797222785439700487586378585547208786,-0.3117025751749076811769612049829447641969} +#define T_6561_332 {0.9498807825711161889614686515415087342262,-0.3126123780372170868346870520326774567366} +#define T_6561_334 {0.9492802900065626658943074289709329605103,-0.3144311228314656814220029446005355566740} +#define T_6561_335 {0.9489787377001525081610111556074116379023,-0.3153400630954226735447321061656111851335} +#define T_6561_337 {0.9483730224218425153281941675231792032719,-0.3171570751890291361974050232674926519394} +#define T_6561_338 {0.9480688600054478776257838035235181450844,-0.3180651453522855276467851126653840765357} +#define T_6561_340 {0.9474579270129253050214401810080744326115,-0.3198804097477215835354513728816527873278} +#define T_6561_341 {0.9471511569970878507973566229338757693768,-0.3207876023151109978748252160585252568126} +#define T_6561_343 {0.9465350113329636938530597944918554276228,-0.3226011040292737774848319531884044408798} +#define T_6561_344 {0.9462256362497479811324296861130278557539,-0.3235074115128734328905579786805901676416} +#define T_6561_346 {0.9456042829996580412199591592070646584034,-0.3253191355772091508669063841807655990124} +#define T_6561_347 {0.9452923054026307614350344010745175182819,-0.3262245504964017550619814755918923765421} +#define T_6561_349 {0.9446657496951942345475572437862865626812,-0.3280344819570292225208163472416345030069} +#define T_6561_350 {0.9443511721594031183002471152576617896557,-0.3289389968385647011750450019462732598186} +#define T_6561_352 {0.9437194191661799624881723502767272293568,-0.3307471207563988380151442925125593319535} +#define T_6561_353 
{0.9434022442881324632679707065108232200146,-0.3316507281344558411007028553285636007786} +#define T_6561_355 {0.9427652992235807660748037051234859973192,-0.3334570295853309662703622961998917162418} +#define T_6561_356 {0.9424455296212227439767161740746814757586,-0.3343597220015787629954218118655262514949} +#define T_6561_358 {0.9418033977426555347634007375745568424463,-0.3361641860763716072035833803965942934155} +#define T_6561_359 {0.9414810360553494961166620669246185570955,-0.3370659560800314813455713647272204980254} +#define T_6561_361 {0.9408337226628917804305274330545216798782,-0.3388685678847845883510103703883942216635} +#define T_6561_362 {0.9405087715513952284496212996600661426783,-0.3397694080326913446121750439488096162677} +#define T_6561_364 {0.9398562819879398011480020613817032426596,-0.3415701526887356398454187456081854179502} +#define T_6561_365 {0.9395287441343831424944710306590422987938,-0.3424700555453991102083932673849631100893} +#define T_6561_367 {0.9388710837855468449575369049853179603815,-0.3442689181894769134828493406530469655991} +#define T_6561_368 {0.9385409618934114073240948528109584003687,-0.3451678763271432415216111166955670341849} +#define T_6561_370 {0.9378781361874901634223533619660884141922,-0.3469648421115310021889399649808183312416} +#define T_6561_371 {0.9375454329815858800500905090302694588900,-0.3478628481102439828909211883001262322068} +#define T_6561_373 {0.9368774473895105092680068992194719612598,-0.3496579022028746264183496350597124546766} +#define T_6561_374 {0.9365421656159531593743849953170865774155,-0.3505549486505370460065478255273774266243} +#define T_6561_376 {0.9358690256512437466440701427927706390619,-0.3523480762351225425987877315492369234562} +#define T_6561_377 {0.9355311680774325289178250386612489819527,-0.3532441557275571852869688882492482662201} +#define T_6561_379 {0.9348528792961532385419332058518193662167,-0.3550353420037107854412283813871908932924} +#define T_6561_380 
{0.9345124487107480115710700374620500952005,-0.3559304471447217177448862912569893524051} +#define T_6561_382 {0.9338290167114610129672769289754796773195,-0.3577196773280801322947297649079700931907} +#define T_6561_383 {0.9334860159243590915778554517601151019335,-0.3586138007295135987639866925746900960803} +#define T_6561_385 {0.9327974463480783740010338078718632459641,-0.3604010600518591234120435728982556611300} +#define T_6561_386 {0.9324518781903912145736512684379704296589,-0.3612941943336646088980046442884486168623} +#define T_6561_388 {0.9317581767205362908157439960632473230362,-0.3630794680430468601706195386213948950171} +#define T_6561_389 {0.9314100440445665096689253914519213140011,-0.3639716058333380410694246620550984516740} +#define T_6561_391 {0.9307112164069150095357940699614118784666,-0.3657548791941957477824587385839549824595} +#define T_6561_392 {0.9303605220861324021086602442665025591850,-0.3666460131293112212347296008374541997910} +#define T_6561_394 {0.9296565740487733320307484063960146158934,-0.3684272714225939604482107370131416246295} +#define T_6561_395 {0.9293033209777917802441038475080858916044,-0.3693173941471581400719514931552112102509} +#define T_6561_397 {0.9285942583510773395971682475646957755089,-0.3710966226704477399778170365607365965843} +#define T_6561_398 {0.9282384494456301649023544086958281695843,-0.3719857268374315850678613060154020786285} +#define T_6561_400 {0.9275242780821284505066159908892586827278,-0.3737629109050634168553983727179002016783} +#define T_6561_401 {0.9271659162790449881796916997700463980436,-0.3746509891758448840271000790380639955401} +#define T_6561_403 {0.9264466420734912555090545538405422121286,-0.3764261141190292647706883144564926624298} +#define T_6561_404 {0.9260857303306718746327419466979335993528,-0.3773131591634540926705199126445222645998} +#define T_6561_406 {0.9253613592199200210686171885754447430372,-0.3790862103303972441281644023547414690256} +#define T_6561_407 
{0.9249979005163126988264821193297393620014,-0.3799722148268392940551052561204414814711} +#define T_6561_409 {0.9242684384792860807777969966991804540157,-0.3817431775828641904446669741446385160089} +#define T_6561_410 {0.9239024358148609783469851208792533725500,-0.3826281342182861200384991207101847976446} +#define T_6561_412 {0.9231678888725033393924945812614168971777,-0.3843969939459534468362278403219534084201} +#define T_6561_413 {0.9227993452682283770371896025608293712139,-0.3852808954159666621208657488750759512186} +#define T_6561_415 {0.9220597194834539989116706237837206572294,-0.3870476375151954417930255658575333654881} +#define T_6561_416 {0.9216886379812699869873426905542146414518,-0.3879304765241208263759631336142774671316} +#define T_6561_418 {0.9209439394589136185231836861930787563324,-0.3896950864123087665547018332290463149548} +#define T_6561_419 {0.9205703231217089443916279378754552453756,-0.3905768556732366336703421438869554549456} +#define T_6561_421 {0.9198205580084756194381157001771498471498,-0.3923393187853806418630142616166267544031} +#define T_6561_422 {0.9194444099200611564270957387634553015232,-0.3932200110202309639717555000970605760813} +#define T_6561_424 {0.9186895844044750125689802189299371093512,-0.3949803128090474402256404573563486337662} +#define T_6561_425 {0.9183109076695584738203592678473796695471,-0.3958599207486299120795081307733198627830} +#define T_6561_427 {0.9175510279819122372302331314131151884794,-0.3976180466846745975573185205576010048389} +#define T_6561_428 {0.9171698257260728626150125819549430161715,-0.3984965630687485882432952166709583252668} +#define T_6561_430 {0.9164048981383760006380612139764707535505,-0.4002524986405366358432900142361177131534} +#define T_6561_431 {0.9160211735080383554929994716076180338860,-0.4011299162178712518489476224203826859593} +#define T_6561_433 {0.9152512043339653402540534443687647581100,-0.4028836469319969082469867771578719839454} +#define T_6561_434 
{0.9148649604963736692297970876097679138184,-0.4037599584604306679480600905662868171930} +#define T_6561_436 {0.9140899560912123522626870908425189554691,-0.4055114698416871221731128116516629233956} +#define T_6561_437 {0.9137011962344040449934823300282005220652,-0.4063866680881878523656780544115463271737} +#define T_6561_439 {0.9129211629950025885804620884300675243139,-0.4081359456796864182415163213590858504176} +#define T_6561_440 {0.9125298903277825335322859245934523642063,-0.4090100234204109841407159819937078282237} +#define T_6561_442 {0.9117448346924966751103625028918031603098,-0.4107570527837006157945154427579836919904} +#define T_6561_443 {0.9113510524444102811614243364601861685514,-0.4116300028040543179663757200614782050252} +#define T_6561_445 {0.9105609808930507087509909069922287017107,-0.4133747695192407367592579703341471031308} +#define T_6561_446 {0.9101646923143570377945366089988965541124,-0.4142465846139371521417160693090409040451} +#define T_6561_448 {0.9093696113681353221380732065881602466106,-0.4159890742798019180881397005578037351370} +#define T_6561_449 {0.9089708197297808878190039649780374020338,-0.4168597472529221859005588157742749899626} +#define T_6561_451 {0.9081707359512559696312905543891247361898,-0.4185999454870414360208030757348751649261} +#define T_6561_452 {0.9077694445448473148374546326522249728441,-0.4194694691520937657180922997213201597333} +#define T_6561_454 {0.9069643645378713259219694009516388177872,-0.4212073615909569523907407528895419090986} +#define T_6561_455 {0.9065605766756479333423612843034788966179,-0.4220757287709359095728700594918336719275} +#define T_6561_457 {0.9057505070853115736184690831578336656094,-0.4238113010700645388872942476155003532767} +#define T_6561_458 {0.9053442261001185542568236996885389089584,-0.4246785045975101091642045503249391913414} +#define T_6561_460 {0.9045291736126963577646620251471176743507,-0.4264117424315759241615353403176413848996} +#define T_6561_461 
{0.9041204028579568063861415794235654175282,-0.4272777751486330211072584006615215912461} +#define T_6561_463 {0.9033003742008518521799942391226068139076,-0.4290086642115763515548110262898262590170} +#define T_6561_464 {0.9028891170505396468470848958531860262156,-0.4298735189700534919943208933545975014567} +#define T_6561_466 {0.9020641189922279368218482886732090264559,-0.4316020449752013266042638406361220404506} +#define T_6561_467 {0.9016503788408392061626273061847314238548,-0.4324657146366300275452942969423020258546} +#define T_6561_469 {0.9008204181908141539025791644235141575336,-0.4341918633168138086375620332546532154083} +#define T_6561_470 {0.9004041984533397435797041907790116965771,-0.4350543407525072625574580342799890786409} +#define T_6561_472 {0.8995692820620556640065501596836838871241,-0.4367780978601809027672686625010101124644} +#define T_6561_473 {0.8991505861739528260301312911906279623508,-0.4376393759512927639221402387192938476801} +#define T_6561_475 {0.8983107209327684250510515084897633641958,-0.4393607272586499967736983762733871117234} +#define T_6561_476 {0.8978895523499323960692208856926299631596,-0.4402207988962334450633306914824061095715} +#define T_6561_478 {0.8970447451910539271580091735813766717911,-0.4419397301953254530992865056759910658002} +#define T_6561_479 {0.8966211073897895067474905772542115300894,-0.4427985882803914807759326777159003540874} +#define T_6561_481 {0.8957713652862137054810887093481142073870,-0.4445150853832439685753286084946012124419} +#define T_6561_482 {0.8953452617632062793262548439088277518749,-0.4453727228268202775751660738023929297924} +#define T_6561_484 {0.8944905917286626317874720371037255972624,-0.4470867715655507668159884815395344048738} +#define T_6561_485 {0.8940620260009496389486116640910040587187,-0.4479431812887401664902142783830640837550} +#define T_6561_487 {0.8932024350898425391065416079072747379541,-0.4496547675156747914115840103477239608765} +#define T_6561_488 
{0.8927714106947844951989168293948750942945,-0.4505099424497137072798125245753908529878} +#define T_6561_490 {0.8919069060021349582001448652590624988079,-0.4522190520375041211664779439161065965891} +#define T_6561_491 {0.8914734264973859234615360946918372064829,-0.4530729851238206595809288046439178287983} +#define T_6561_493 {0.8906040151587729658544390076713170856237,-0.4547796039655604971585489693097770214081} +#define T_6561_494 {0.8901680841222515683242022532795090228319,-0.4556322881558331205908984884445089846849} +#define T_6561_496 {0.8892937733137535882832480638171546161175,-0.4573364021651746269547800238797208294272} +#define T_6561_497 {0.8888553943436129367583475868741516023874,-0.4581878304213899966157441667746752500534} +#define T_6561_499 {0.8879761912817482061299756423977669328451,-0.4598894255326600455369145947770448401570} +#define T_6561_500 {0.8875353679963468023217387781187426298857,-0.4607395908271713080850418009504210203886} +#define T_6561_502 {0.8866512799380142917371472321974579244852,-0.4624386529954876978720790248189587146044} +#define T_6561_503 {0.8862080159758857211826921229658182710409,-0.4632875483110723835444844098674366250634} +#define T_6561_505 {0.8853190502183044818806934017629828304052,-0.4649840635124597443272875807451782748103} +#define T_6561_506 {0.8848733492381277709881715054507367312908,-0.4658316818423777760926896007731556892395} +#define T_6561_508 {0.8839795131187775378833748618490062654018,-0.4675256360738831995504938277008477598429} +#define T_6561_509 {0.8835313787993465117764912974962498992682,-0.4683719704219346247064947874605422839522} +#define T_6561_511 {0.8826326796959064191483435024565551429987,-0.4700633497017433493070370786881539970636} +#define T_6561_512 {0.8821821157360998366669946335605345666409,-0.4709083930823261820997061022353591397405} +#define T_6561_514 {0.8812785610663880220272403676062822341919,-0.4725971834498770007826351502444595098495} +#define T_6561_515 
{0.8808255711851383784605218352226074784994,-0.4734409288880447874703349953051656484604} +#define T_6561_517 {0.8799171684070505872199419172829948365688,-0.4751271164041451777748648055421654134989} +#define T_6561_518 {0.8794617563433142493067862233147025108337,-0.4759695569356646727143811403948348015547} +#define T_6561_520 {0.8785485129547623284196333770523779094219,-0.4776531276826059824180958912620553746819} +#define T_6561_521 {0.8780906824674878929926080672885291278362,-0.4784942563540145465950104153307620435953} +#define T_6561_523 {0.8771726060063376184672279123333282768726,-0.4801751964356867907746106993727153167129} +#define T_6561_524 {0.8767123608744353813193583846441470086575,-0.4810150063043498458448254950781119987369} +#define T_6561_526 {0.8757894589184448408403227404050994664431,-0.4826933018463563374034208663942990824580} +#define T_6561_527 {0.8753268029407554884357978153275325894356,-0.4835317859805245976900778259732760488987} +#define T_6561_529 {0.8743990831075116876291986045544035732746,-0.4852074231302967444179330414044670760632} +#define T_6561_530 {0.8739340201027748777917736333620268851519,-0.4860445746091634489083332937298109754920} +#define T_6561_532 {0.8730014900496317897804487984103616327047,-0.4877175395360747178763460851769195869565} +#define T_6561_533 {0.8725340238564546213595463086676318198442,-0.4885533514498328067077181913191452622414} +#define T_6561_535 {0.8715966912804693489391638649976812303066,-0.4902236303453131327501068881247192621231} +#define T_6561_536 {0.8711268257572947204536717435985337942839,-0.4910580957952122571619213431404205039144} +#define T_6561_538 {0.8701846983951641023580236833367962390184,-0.4927256748728620072697026444075163453817} +#define T_6561_539 {0.8697124374202389596177908970275893807411,-0.4935587869712654840448351478698896244168} +#define T_6561_541 {0.8687655230482357326948772424657363444567,-0.4952236524669690331812432759761577472091} +#define T_6561_542 
{0.8682908705195788723329997083055786788464,-0.4960554043374106325536843087320448830724} +#define T_6561_544 {0.8673391769534878337211125653993804007769,-0.4977175425094502170253463191329501569271} +#define T_6561_545 {0.8668621367888577067262190212204586714506,-0.4985479272866910060990619513177080079913} +#define T_6561_547 {0.8659056718839105437623970829008612781763,-0.5002073244158601328379631922871340066195} +#define T_6561_548 {0.8654262480207732810555398828000761568546,-0.5010363352459450414499997350503690540791} +#define T_6561_550 {0.8644650196715841783401401698938570916653,-0.5026929776356614532062394573586061596870} +#define T_6561_551 {0.8639832160670807281732663795992266386747,-0.5035206076759760618344330396212171763182} +#define T_6561_553 {0.8630172322075810864561162816244177520275,-0.5051744816523952019693410875333938747644} +#define T_6561_554 {0.8625330528384945738551436988927889615297,-0.5060007240717216969727587638772092759609} +#define T_6561_556 {0.8615623214418676178993905523384455591440,-0.5076518159838492305624413347686640918255} +#define T_6561_557 {0.8610757703045909261518886523845139890909,-0.5084766639624236361783005122561007738113} +#define T_6561_559 {0.8601002993832056464640345438965596258640,-0.5101249601822281931617908412590622901917} +#define T_6561_560 {0.8596113804937076663392758746340405195951,-0.5109484069117958826566905372601468116045} +#define T_6561_562 {0.8586311780990532049884222942637279629707,-0.5125938938343217454729483506525866687298} +#define T_6561_563 {0.8581398954928464162250634217343758791685,-0.5134159325181939514948226133128628134727} +#define T_6561_565 {0.8571549697154648983499214409675914794207,-0.5150585965616731876082212693290784955025} +#define T_6561_566 {0.8566613274475717298983568070980254560709,-0.5158792204147830684490827479748986661434} +#define T_6561_568 {0.8556716864169920944149794195254798978567,-0.5175190480207475518525939151004422456026} +#define T_6561_569 
{0.8551756885619113957019976623996626585722,-0.5183382502697062577112774306442588567734} +#define T_6561_571 {0.8541813404465818937438825741992332041264,-0.5199752279030998014519582284265197813511} +#define T_6561_572 {0.8536829910982557390042302358779124915600,-0.5207930017862524296745618812565226107836} +#define T_6561_574 {0.8526839441054765433847251188126392662525,-0.5224271159355422522452272460213862359524} +#define T_6561_575 {0.8521832473772558147473432654805947095156,-0.5232434547030233584763436738285236060619} +#define T_6561_577 {0.8511795097531118514666559349279850721359,-0.5248746918803117722518436494283378124237} +#define T_6561_578 {0.8506764697777225991970340146508533507586,-0.5256895887941014366973035976116079837084} +#define T_6561_580 {0.8496680498070146025924032073817215859890,-0.5273179355352373143261957011418417096138} +#define T_6561_581 {0.8491626707365239612457230578002054244280,-0.5281313838692165418819968181196600198746} +#define T_6561_583 {0.8481495767427007503869162974297069013119,-0.5297568267339058944997987055103294551373} +#define T_6561_584 {0.8476418627484828549611961534537840634584,-0.5305688197739122369256392630632035434246} +#define T_6561_586 {0.8466241030935722777783780657046008855104,-0.5321913453458299025911060198268387466669} +#define T_6561_587 {0.8461140583662732916891968670825008302927,-0.5330018763897128586393137084087356925011} +#define T_6561_589 {0.8450916414508133911454024200793355703354,-0.5346214712766126364584806651691906154156} +#define T_6561_590 {0.8445792702003173113567413565760944038630,-0.5354305336342888299583364641875959932804} +#define T_6561_592 {0.8435522044632870475311392510775476694107,-0.5370471844681143913646792498184368014336} +#define T_6561_593 {0.8430375109186810655970134575909469276667,-0.5378547714616230823736486854613758623600} +#define T_6561_595 {0.8420058048374302606120522796118166297674,-0.5394684648986179942298235800990369170904} +#define T_6561_596 
{0.8414887932469694575843277561943978071213,-0.5402745698621759240509732080681715160608} +#define T_6561_598 {0.8404524553371492956443944422062486410141,-0.5418852925829941158397673461877275258303} +#define T_6561_599 {0.8399331299682218920921172866655979305506,-0.5426899088630505740837861594627611339092} +#define T_6561_601 {0.8388921687837144203214734261564444750547,-0.5442976475728660279429504953441210091114} +#define T_6561_602 {0.8383705339228064712386867540772072970867,-0.5451007685281579195901713319472037255764} +#define T_6561_604 {0.8373249580556541005194048921111971139908,-0.5467055099567746934141609926882665604353} +#define T_6561_605 {0.8368010180083138571660583693301305174828,-0.5475071289583812728096745559014379978180} +#define T_6561_607 {0.8357508360886483078644459965289570391178,-0.5491088598603425241506670317903626710176} +#define T_6561_608 {0.8352245951794512457411201467039063572884,-0.5499089702917400179771334478573407977819} +#define T_6561_610 {0.8341698158754221603672363016812596470118,-0.5515076774464379161244664828700479120016} +#define T_6561_611 {0.8336412784479348969668421887035947293043,-0.5523062727035541463749268586980178952217} +#define T_6561_613 {0.8325819104656387859009214480465743690729,-0.5539019429153386742115117158391512930393} +#define T_6561_614 {0.8320510808823829984604003584536258131266,-0.5546990164066077921845021592162083834410} +#define T_6561_616 {0.8309871329657909644339497390319593250751,-0.5562916365048955480432368858600966632366} +#define T_6561_617 {0.8304540156082079738197876395133789628744,-0.5570871816513122132263902130944188684225} +#define T_6561_619 {0.8293854965390937694635908883356023579836,-0.5586767384906953237688753688416909426451} +#define T_6561_620 {0.8288500958075080138343082580831833183765,-0.5594707487258692157894301999476738274097} +#define T_6561_622 {0.8277770144053753220703129045432433485985,-0.5610572291862239158177771969349123537540} +#define T_6561_623 
{0.8272393347189580525835594926320482045412,-0.5618496979564336912815747382410336285830} +#define T_6561_625 {0.8261616998409678780390663632715586572886,-0.5634330889430283484387018688721582293510} +#define T_6561_626 {0.8256217456377006325141110210097394883633,-0.5642240097072757087914851581444963812828} +#define T_6561_628 {0.8245395661785981378244514417019672691822,-0.5658042981508792923506234728847630321980} +#define T_6561_629 {0.8239973419152364364492768800118938088417,-0.5665936643809432737839415494818240404129} +#define T_6561_631 {0.8229106268072776675381874156300909817219,-0.5681708372379327132151161094952840358019} +#define T_6561_632 {0.8223661369593137093758627997885923832655,-0.5689586424184233104384134094289038330317} +#define T_6561_634 {0.8212748951721915435797427562647499144077,-0.5705326866708916311310417768254410475492} +#define T_6561_635 {0.8207281442338177912532160007685888558626,-0.5713189242993037542106549153686501085758} +#define T_6561_637 {0.8196323847745882185122923146991524845362,-0.5728898269551672139954234808101318776608} +#define T_6561_638 {0.8190833772586599836884602154896128922701,-0.5736744905419340900820657225267495959997} +#define T_6561_640 {0.8179831091716674995595326436159666627645,-0.5752422386350396488197134203801397234201} +#define T_6561_641 {0.8174318496096657504779159353347495198250,-0.5760253217035868900097739242482930421829} +#define T_6561_643 {0.8163270819764689711917071690550073981285,-0.5775899022938189020237587101291865110397} +#define T_6561_644 {0.8157735749184630291708231197844725102186,-0.5783713983806173519752746869926340878010} +#define T_6561_646 {0.8146643168577595295332116620556917041540,-0.5799327985540048135959523278870619833469} +#define T_6561_647 {0.8141085668723694324100392805121373385191,-0.5807127012086241713006984355160966515541} +#define T_6561_649 {0.8129948275399205837032923227525316178799,-0.5822709080774470802310815997770987451077} +#define T_6561_650 
{0.8124368392142791162058301779325120151043,-0.5830492108626094127643568754137959331274} +#define T_6561_652 {0.8113186278028345910229290893767029047012,-0.5846042115655049054012692977266851812601} +#define T_6561_653 {0.8107584057425500922988703678129240870476,-0.5853809080571372724932643905049189925194} +#define T_6561_655 {0.8096357314817715922217189472576137632132,-0.5869326897592062053377048869151622056961} +#define T_6561_656 {0.8090732803108892090548920350556727498770,-0.5877077735464940611009865278901997953653} +#define T_6561_658 {0.8079461524672750805109444627305492758751,-0.5892563234394068150123757732217200100422} +#define T_6561_659 {0.8073814768282391307607781527622137218714,-0.5900297881248468545578589328215457499027} +#define T_6561_661 {0.8062499047050467604336176918877754360437,-0.5915750934269486949190763880324084311724} +#define T_6561_662 {0.8056830092586620972738842283433768898249,-0.5923469326264014789273915084777399897575} +#define T_6561_664 {0.8045470021958320838706413269392214715481,-0.5938889805828188039882320481410715728998} +#define T_6561_665 {0.8039778916212259041174092999426648020744,-0.5946591879255616053256972008966840803623} +#define T_6561_667 {0.8028374589953046758239452174166217446327,-0.5961979658083064181894883404311258345842} +#define T_6561_668 {0.8022661379898874400851127575151622295380,-0.5969665349370857354571739961102139204741} +#define T_6561_670 {0.8011212892139496499765982662211172282696,-0.5985020300451613373127202066825702786446} +#define T_6561_671 {0.8005477624933770020021484015160240232944,-0.5992689546162455194178164674667641520500} +#define T_6561_673 {0.7993985070169480344759449508273974061012,-0.6008011542757508705037139407068025320768} +#define T_6561_674 {0.7988227793150810551736640263698063790798,-0.6015664279589821861193854601879138499498} +#define T_6561_676 {0.7976691266240589772706925941747613251209,-0.6030953195232168217998491854814346879721} +#define T_6561_677 
{0.7970912026929258820118207040650304406881,-0.6038589360020641949589048635971266776323} +#define T_6561_679 {0.7959331623095030616710232607147190719843,-0.6053845068516324756657809302851092070341} +#define T_6561_680 {0.7953530469192594543059726674982812255621,-0.6061464598232428890867140580667182803154} +#define T_6561_682 {0.7941906284018444006633785647863987833261,-0.6076686973661581392391894951288122683764} +#define T_6561_683 {0.7936083263407340826489644314278848469257,-0.6084289805414094809421499121526721864939} +#define T_6561_685 {0.7924415392838717320245223163510672748089,-0.6099478722131981278664625278906896710396} +#define T_6561_686 {0.7918570553581874005288909756927751004696,-0.6107064793167504834769943045102991163731} +#define T_6561_688 {0.7906859093924807346809302543988451361656,-0.6122220125805554191700252886221278458834} +#define T_6561_689 {0.7900992484265236814877653159783221781254,-0.6129789373509031413789216458098962903023} +#define T_6561_691 {0.7889237532185537915552231424953788518906,-0.6144910996975875283609980215260293334723} +#define T_6561_692 {0.7883349200545948232132786870351992547512,-0.6152463358871107512726439381367526948452} +#define T_6561_694 {0.7871550853068411957025318770320154726505,-0.6167551148353611623065262392628937959671} +#define T_6561_695 {0.7865640848050803324298385632573626935482,-0.6175086562103777598764509093598462641239} +#define T_6561_697 {0.7853799202558408021346281202568206936121,-0.6190140393068069846194134697725530713797} +#define T_6561_698 {0.7847867572943668657003968291974160820246,-0.6197658796476234188688181347970385104418} +#define T_6561_700 {0.7835982727176774575994500082742888480425,-0.6212678544668736035916367654863279312849} +#define T_6561_701 {0.7830029521924284363620927251758985221386,-0.6220179875678368830449471715837717056274} +#define T_6561_703 {0.7818101573979820972937204714980907738209,-0.6235165417126817821724671375704929232597} +#define T_6561_704 
{0.7812126842227042899935440800618380308151,-0.6242649613822301990495589052443392574787} +#define T_6561_706 {0.7800155890557705085086581675568595528603,-0.6257600824836778707904727525601629167795} +#define T_6561_707 {0.7794159681619780011274656317254994064569,-0.6265067825443924043327115214196965098381} +#define T_6561_709 {0.7782145825033213171195711765903979539871,-0.6279984582617869071086147414462175220251} +#define T_6561_710 {0.7776128188402554597402627223345916718245,-0.6287434325504418497487790773448068648577} +#define T_6561_712 {0.7764071526060539740754506965458858758211,-0.6302316505715653827124356212152633816004} +#define T_6561_713 {0.7758032511406423026301126810722053050995,-0.6309748929391798544230596235138364136219} +#define T_6561_715 {0.7745933142824056316655401133175473660231,-0.6324596409803541208205501789052505046129} +#define T_6561_716 {0.7739872799992212337727437443390954285860,-0.6332011452922421401723340750322677195072} +#define T_6561_718 {0.7727730825037085748974163834645878523588,-0.6346824110984299327498092679888941347599} +#define T_6561_719 {0.7721649204049284564987942758307326585054,-0.6354221712342515981930546331568621098995} +#define T_6561_721 {0.7709464722940660985628369417099747806787,-0.6368999425791579405142783798510208725929} +#define T_6561_722 {0.7703361873994303277157769116456620395184,-0.6376379524329695014372987316164653748274} +#define T_6561_724 {0.7691134987302289394150989210174884647131,-0.6391122171191425671565866650780662894249} +#define T_6561_725 {0.7685010960769989019070180802373215556145,-0.6398484705994469390333279079641215503216} +#define T_6561_727 {0.7672741769414705981233737475122325122356,-0.6413192164583786381015784172632265836000} +#define T_6561_728 {0.7666596615843875861528999848815146833658,-0.6420537074881759176392392873822245746851} +#define T_6561_730 {0.7654285221094625502047392728854902088642,-0.6435209223804024825099645568116102367640} +#define T_6561_740 
{0.7592307966770407423595656837278511375189,-0.6508214788842989761974422435741871595383} +#define T_6561_742 {0.7579828778092586594183899251220282167196,-0.6522744490994526467275704817438963800669} +#define T_6561_745 {0.7561057873112047023056447869748808443546,-0.6544494162229066436253788197063840925694} +#define T_6561_755 {0.7498038335236644202907996259455103427172,-0.6616601931756337373613519048376474529505} +#define T_6561_760 {0.7466270335907169153344398182525765150785,-0.6652428674638506400640380888944491744041} +#define T_6561_770 {0.7402221518269924693811390170594677329063,-0.6723623769550293172159172172541730105877} +#define T_6561_775 {0.7369942168448649910672543228429276496172,-0.6758990489246334831818785460200160741806} +#define T_6561_784 {0.7311414021127619067286218523804564028978,-0.6822259523915699119811506534460932016373} +#define T_6561_785 {0.7304877287226010373899498517857864499092,-0.6829258218765019705642771441489458084106} +#define T_6561_790 {0.7272093247607052113323788944398984313011,-0.6864157617516363218967967441130895167589} +#define T_6561_800 {0.7206025728635472304972608981188386678696,-0.6933483482221877158835354748589452356100} +#define T_6561_805 {0.7172743764053348680675981086096726357937,-0.6967908358699459459018044071854092180729} +#define T_6561_812 {0.7125872902746757109326836143736727535725,-0.7015834617057297162290296910214237868786} +#define T_6561_815 {0.7105687240058933262787377316271886229515,-0.7036278053521169972839288675459101796150} +#define T_6561_820 {0.7071914218092664183856754789303522557020,-0.7070221304311401500086731175542809069157} +#define T_6561_826 {0.7031172632549419487801856121222954243422,-0.7110739160684216297880766433081589639187} +#define T_6561_830 {0.7003882525878117881745765771484002470970,-0.7137620721479893681760131585178896784782} +#define T_6561_835 {0.6969625415434140824899600374919828027487,-0.7171075342550409237318831401353236287832} +#define T_6561_845 
{0.6900632593023605654281027454999275505543,-0.7237490574508564922240339001291431486607} +#define T_6561_850 {0.6865898462897769283230786641070153564215,-0.7270449662653476474716285338217858225107} +#define T_6561_854 {0.6837997779013278432103106752038002014160,-0.7296696949593662884225864218024071305990} +#define T_6561_860 {0.6795958746640144987338771898066624999046,-0.7335867004926228673866717144846916198730} +#define T_6561_865 {0.6760754764059095922590358895831741392612,-0.7368323759190569255395075742853805422783} +#define T_6561_868 {0.6739557918880625386748306482331827282906,-0.7387716768938387135179368669923860579729} +#define T_6561_875 {0.6689882585690437588610279817658010870218,-0.7432729713212758992924023004889022558928} +#define T_6561_880 {0.6654216014832697867475985731289256364107,-0.7464677436295827916978851135354489088058} +#define T_6561_890 {0.6582425998498291352944988830131478607655,-0.7528058712197571722413158568087965250015} +#define T_6561_895 {0.6546304198995328560428674791182857006788,-0.7559490811834890244824691762914881110191} +#define T_6561_896 {0.6539061801596668388469879573676735162735,-0.7565756456224276282895857548282947391272} +#define T_6561_905 {0.6473611158232067674944687496463302522898,-0.7621834331183883204374751585419289767742} +#define T_6561_910 {0.6437041583649666387501042663643602281809,-0.7652744321507480851352056561154313385487} +#define T_6561_920 {0.6363460518329346893295905829290859401226,-0.7714037220007665673904284631134942173958} +#define T_6561_925 {0.6326450714629606730809996406605932861567,-0.7744418722884406358630826616717968136072} +#define T_6561_935 {0.6251996807863771099533778397017158567905,-0.7804648353030468888036352836934383958578} +#define T_6561_938 {0.6229548516589464846049395418958738446236,-0.7822577917762020272007816856785211712122} +#define T_6561_940 {0.6214554411848026704845437961921561509371,-0.7834495099378149252089542642352171242237} +#define T_6561_950 
{0.6139243026855002449693188282253686338663,-0.7893649033065266440445384432678110897541} +#define T_6561_952 {0.6124112964383240376164962981420103460550,-0.7905393121121373489756933850003406405449} +#define T_6561_955 {0.6101375764588004013688760096556507050991,-0.7922954864146213305176047470013145357370} +#define T_6561_965 {0.6025222441522776195554911282670218497515,-0.7981020895234538503615340232499875128269} +#define T_6561_970 {0.5986938126738456933395582382217980921268,-0.8009779763926434537069098951178602874279} +#define T_6561_980 {0.5909958579486023211302381241694092750549,-0.8066745910759774984555292576260399073362} +#define T_6561_985 {0.5871265111975166872682052598975133150816,-0.8094951882803456122417173901339992880821} +#define T_6561_994 {0.5801278241074792063614040671382099390030,-0.8145254493852980282042608450865373015404} +#define T_6561_995 {0.5793475224908055665196116024162620306015,-0.8150806390681633040173892368329688906670} +#define T_6561_1000 {0.5754380588888216019327614958456251770258,-0.8178453645905594537879323979723267257214} +#define T_6561_1010 {0.5675796413588816147210991402971558272839,-0.8233184989509972906418511229276191443205} +#define T_6561_1015 {0.5636308676056804856102644407656043767929,-0.8260267823031332001804116771381814032793} +#define T_6561_1022 {0.5580808945343530558247380213288124650717,-0.8297865479481674189088380444445647299290} +#define T_6561_1025 {0.5556946428005200555588771749171428382397,-0.8313864708803014869076264403702225536108} +#define T_6561_1030 {0.5517073737072506478540390162379480898380,-0.8340377532204691357620163216779474169016} +#define T_6561_1036 {0.5469059617409340878779744343773927539587,-0.8371940450171775349730296511552296578884} +#define T_6561_1040 {0.5436949792300481698603675795311573892832,-0.8392828900674893510824858822161331772804} +#define T_6561_1045 {0.5396700375511940261930021733860485255718,-0.8418766243158749551511732533981557935476} +#define T_6561_1055 
{0.5315831267223868339399928117927629500628,-0.8470061271230868715775841337745077908039} +#define T_6561_1060 {0.5275213429859947344979786976182367652655,-0.8495417780746585822981842284207232296467} +#define T_6561_1064 {0.5242632032577547729346179039566777646542,-0.8515562774766668452386397802911233156919} +#define T_6561_1070 {0.5193615845021235521983271610224619507790,-0.8545545883929496211450782539031933993101} +#define T_6561_1075 {0.5152637968384280453548740297264885157347,-0.8570316328278945183782866479305084794760} +#define T_6561_1078 {0.5127994476069059892608947848202660679817,-0.8585084312539115192208782900706864893436} +#define T_6561_1085 {0.5070328744278093102693105720391031354666,-0.8619267162871060428130931541090831160545} +#define T_6561_1090 {0.5028999283962884980780927435262128710747,-0.8643446430787941059392665010818745940924} +#define T_6561_1100 {0.4945995404715846643917132041678996756673,-0.8691209896011586888420197283267043530941} +#define T_6561_1105 {0.4904322888864826035515420699084643274546,-0.8714792998216112085430040679057128727436} +#define T_6561_1106 {0.4895974861744676887020943922834703698754,-0.8719485658750990575427408657560590654612} +#define T_6561_1115 {0.4820641481942430917051467531564412638545,-0.8761359238301776874990878241078462451696} +#define T_6561_1120 {0.4778634509485957804209022015129448845983,-0.8784341308530190239878265856532379984856} +#define T_6561_1130 {0.4694292842158391265705574824096402153373,-0.8829700714750217116488784085959196090698} +#define T_6561_1135 {0.4651960081040394934248638492135796695948,-0.8852077010758924169309125318250153213739} +#define T_6561_1145 {0.4566975556819515280615462415880756452680,-0.8896220223410225003135565202683210372925} +#define T_6561_1148 {0.4541398170705430170634997466549975797534,-0.8909304274471344653463233953516464680433} +#define T_6561_1150 {0.4524325742208903378127615724224597215652,-0.8917986127954329322875537400250323116779} +#define T_6561_1160 
{0.4438715897257107245721385879733134061098,-0.8960904038289720929810755478683859109879} +#define T_6561_1162 {0.4421544827447725367619568714872002601624,-0.8969389128523205245713256772432941943407} +#define T_6561_1165 {0.4395757829745305933499821549048647284508,-0.8982055060075775365646677528275176882744} +#define T_6561_1175 {0.4309540329257021684661310700903413817286,-0.9023738812183521584842083029798232018948} +#define T_6561_1180 {0.4266282873042013257247617730172351002693,-0.9044270586796283595276690903119742870331} +#define T_6561_1190 {0.4179475507598572336931397330772597342730,-0.9084711579427475758308219155878759920597} +#define T_6561_1195 {0.4135927588655808895268251035304274410009,-0.9104619870230483691386780265020206570625} +#define T_6561_1204 {0.4057303043000231035364322451641783118248,-0.9139928447053679017031413422955665737391} +#define T_6561_1205 {0.4048548270554438444079892178706359118223,-0.9143809758573866464104185070027597248554} +#define T_6561_1210 {0.4004718874795009098122022805910091847181,-0.9163090457583652481687863655679393559694} +#define T_6561_1220 {0.3916785634352715761430374641349771991372,-0.9201021154987537586933399325062055140734} +#define T_6561_1225 {0.3872683805769145948261211742646992206573,-0.9219670283721289605338711226067971438169} +#define T_6561_1232 {0.3810792303094104327421121070074150338769,-0.9245423842241021761267916190263349562883} +#define T_6561_1235 {0.3784214787602240837038891640986548736691,-0.9256333963362197714275225735036656260490} +#define T_6561_1240 {0.3739849626402305116101842941134236752987,-0.9274347673658700497156814890331588685513} +#define T_6561_1246 {0.3686498338125285267885544726595981046557,-0.9295683406990553621085382474120706319809} +#define T_6561_1250 {0.3650863085682348740945712961547542363405,-0.9309736770156391560959718844969756901264} +#define T_6561_1255 {0.3606243746411285089337184217583853751421,-0.9327111344970075990090663253795355558395} +#define T_6561_1265 
{0.3516758045098217211332780607335735112429,-0.9361218555948630504204288627079222351313} +#define T_6561_1270 {0.3471893734749729731881018324202159419656,-0.9377950410116571156038389744935557246208} +#define T_6561_1274 {0.3435944923425571229103070436394773423672,-0.9391181101607296577427064221410546451807} +#define T_6561_1280 {0.3381927337802961286428171661100350320339,-0.9410768697711201502542621710745152086020} +#define T_6561_1285 {0.3336827313919400461728059781307820230722,-0.9426854378692897107328008132753893733025} +#define T_6561_1288 {0.3309730510704985140257861075951950624585,-0.9436402065750935097554474850767292082310} +#define T_6561_1295 {0.3246398785487658589232751182862557470798,-0.9458376971002173672076196453417651355267} +#define T_6561_1300 {0.3201072354249769325029717492725467309356,-0.9473813159591962795857966739276889711618} +#define T_6561_1310 {0.3110200353840468223687310000968864187598,-0.9504033552075172863737861916888505220413} +#define T_6561_1315 {0.3064656868147102586341645746870199218392,-0.9518817063087134933141442161286249756813} +#define T_6561_1316 {0.3055539705733464339587612812465522438288,-0.9521747586797617568521445718943141400814} +#define T_6561_1325 {0.2973360146776042878258294877014122903347,-0.9547729019906459058120162808336317539215} +#define T_6561_1330 {0.2927609004314233320798166460008360445499,-0.9561856802831666390929399312881287187338} +#define T_6561_1340 {0.2835906400636415969351844523771433159709,-0.9589454358138913558917693080729804933071} +#define T_6561_1345 {0.2789957041942205395734788453410146757960,-0.9602923497774888961231454231892712414265} +#define T_6561_1355 {0.2697867478364563975645751270349137485027,-0.9629200956942524092241342259512748569250} +#define T_6561_1358 {0.2670191977280116368831386353122070431709,-0.9636912099031976763896523152652662247419} +#define T_6561_1360 {0.2651729384874995654186591309553477913141,-0.9642008673994775236337773094419389963150} +#define T_6561_1370 
{0.2559271863651841893982918918482027947903,-0.9666960614790981454191864941094536334276} +#define T_6561_1372 {0.2540751922410788288253513655945425853133,-0.9671844687998556455710286172688938677311} +#define T_6561_1375 {0.2512954555748510010459995100973173975945,-0.9679104266446498794351782635203562676907} +#define T_6561_1385 {0.2420148155060510841707355211838148534298,-0.9702725540154024663763721036957576870918} +#define T_6561_1390 {0.2373661190105070539768661319612874649465,-0.9714202620626614104182294795464258641005} +#define T_6561_1400 {0.2280525060122551850305683274200418964028,-0.9736488353105190451941552964854054152966} +#define T_6561_1405 {0.2233878030484605925476415677621844224632,-0.9747296494152530854648830427322536706924} +#define T_6561_1414 {0.2149785028517469387399074776112684048712,-0.9766187809537667519066417298745363950729} +#define T_6561_1415 {0.2140431389416004026582385222354787401855,-0.9768242086844627358743764489190652966499} +#define T_6561_1420 {0.2093633920493758737713108075695345178246,-0.9778379058256942979454606756917200982571} +#define T_6561_1430 {0.1999896050620032506017764717398677021265,-0.9797980189136656914428158415830694139004} +#define T_6561_1435 {0.1952957798854143278699524444164126180112,-0.9807443899196914838256589064258150756359} +#define T_6561_1442 {0.1887169262853305162774830705529893748462,-0.9820315278713902529261758900247514247894} +#define T_6561_1445 {0.1858948042549964652092597816590568982065,-0.9825696523661803238880452227022033184767} +#define T_6561_1450 {0.1811878693430973852329657347581814974546,-0.9834485019577327014062007037864532321692} +#define T_6561_1456 {0.1755340734013755565889880472241202369332,-0.9844733562037727425675370795943308621645} +#define T_6561_1460 {0.1717616449173518800019877517115673981607,-0.9851385371282992409192047489341348409653} +#define T_6561_1465 {0.1670425715243299413792499308328842744231,-0.9859496839588413052979376516304910182953} +#define T_6561_1475 
{0.1575930433609461778221572103575454093516,-0.9875041431225668464577438498963601887226} +#define T_6561_1480 {0.1528628052457078889680985867016715928912,-0.9882474198157122913244165829382836818695} +#define T_6561_1484 {0.1490760861040109908071116251448984257877,-0.9888257280997039266878800845006480813026} +#define T_6561_1490 {0.1433919232109934494001635130189242772758,-0.9896659822171581799565842629817780107260} +#define T_6561_1495 {0.1386514964362340895931424711307045072317,-0.9903412354012091078914181707659736275673} +#define T_6561_1498 {0.1358057075174101679948535092989914119244,-0.9907354893238132786947858221537899225950} +#define T_6561_1505 {0.1291612148027687922979822587876697070897,-0.9916236083266034562200275104260072112083} +#define T_6561_1510 {0.1244115775335661866485281734640011563897,-0.9922306986661969530061355726502370089293} +#define T_6561_1520 {0.1149038545769471703072994728245248552412,-0.9933766175038346579739823027921374887228} +#define T_6561_1525 {0.1101459868789219920248712014654302038252,-0.9939154197286952374668089760234579443932} +#define T_6561_1526 {0.1091941068013669302594337295886361971498,-0.9940204459868275943534854377503506839275} +#define T_6561_1535 {0.1006227844736824611482362001879664603621,-0.9949246480235389711310745042283087968826} +#define T_6561_1540 {0.0958576681107661532355024291973677463830,-0.9953950509543264546508112289302516728640} +#define T_6561_1550 {0.0863209513255517868479671506065642461181,-0.9962673804567987456692890191334299743176} +#define T_6561_1555 {0.0815495695574042223086053127190098166466,-0.9966692870280502436486358419642783701420} +#define T_6561_1565 {0.0720013062494902766896842649657628498971,-0.9974045377370042153586382482899352908134} +#define T_6561_1568 {0.0691354997668307824643108006057445891201,-0.9976072787785735807020159882085863500834} +#define T_6561_1570 {0.0672246436286085963240921614669787231833,-0.9977378650171629947962514961545821279287} +#define T_6561_1580 
{0.0576668040378417665059451735487527912483,-0.9983358852170250985480493000068236142397} +#define T_6561_1582 {0.0557545734268839118796812215350655606017,-0.9984445039870699645234708441421389579773} +#define T_6561_1585 {0.0528858462064026174642705768746964167804,-0.9986005644255528945762989678769372403622} +#define T_6561_1595 {0.0433204025486511323794758254734915681183,-0.9990612307176286455145941545197274535894} +#define T_6561_1600 {0.0385361360351279061409179860220319824293,-0.9992572072391983084571620565839111804962} +#define T_6561_1610 {0.0289650620953239498756026648607075912878,-0.9995804245671350285462608553643804043531} +#define T_6561_1615 {0.0241784741109212532317318533614525222220,-0.9997076579629003978411105890700127929449} +#define T_6561_1624 {0.0155612925220097678830066811883625632618,-0.9998789157568253038732564164092764258385} +#define T_6561_1625 {0.0146037448357796036113587234694932703860,-0.9998933596323017480855810390494298189878} +#define T_6561_1630 {0.0098158230707267955100769896148449333850,-0.9999518236482416444488308115978725254536} +#define T_6561_1640 {0.0002394141612237129233039384379466696373,-0.9999999713404292833729414269328117370605} +#define T_6561_1645 {-0.0045488534190304043963726243759992939886,-0.9999896539127652861367323566810227930546} +#define T_6561_1652 {-0.0112522282257614576750359702828063745983,-0.9999366916760057399926608923124149441719} +#define T_6561_1655 {-0.0141249659153338917583209166650703991763,-0.9999002376926863222550423415668774396181} +#define T_6561_1660 {-0.0189125912739831515685384744074326590635,-0.9998211409503717783664455964753869920969} +#define T_6561_1666 {-0.0246571596662237581265575414590784930624,-0.9996959660202667796724540494324173778296} +#define T_6561_1670 {-0.0284864313706895064259860816946456907317,-0.9995941792686484639318678091512992978096} +#define T_6561_1675 {-0.0332724266034471888309198561728408094496,-0.9994463195329292837243428948568180203438} +#define T_6561_1685 
{-0.0428420187830579890508886364841600880027,-0.9990818592220519489544017233129125088453} +#define T_6561_1690 {-0.0476253963220062970229662369092693552375,-0.9988652670030987446025960707629565149546} +#define T_6561_1694 {-0.0514513174221848598732798052424186607823,-0.9986755038226989844574177368485834449530} +#define T_6561_1700 {-0.0571887659435611137026533867810940137133,-0.9983633832677621944284851451811846345663} +#define T_6561_1705 {-0.0619685387609312174350684188084414927289,-0.9980781032583747602870971604716032743454} +#define T_6561_1708 {-0.0648357274841893094219358317786827683449,-0.9978959507090886527080897394625935703516} +#define T_6561_1715 {-0.0715237124674656249423421172650705557317,-0.9974388996599597989600738401350099593401} +#define T_6561_1720 {-0.0762988942793062785430180383627885021269,-0.9970849907263448219651991166756488382816} +#define T_6561_1730 {-0.0858439004050448734028933017725648824126,-0.9963085991615493464124142519722227007151} +#define T_6561_1735 {-0.0906135058747375154997172330695320852101,-0.9958861343311739000583315828407648950815} +#define T_6561_1736 {-0.0915671811610689473370783275640860665590,-0.9957989010509179506058785591449122875929} +#define T_6561_1745 {-0.1001463748519380581036841704190010204911,-0.9949727150047960044787487277062609791756} +#define T_6561_1750 {-0.1049094197935165062585127770944382064044,-0.9944817814513183806823803934094030410051} +#define T_6561_1760 {-0.1144281845588810408997915146755985915661,-0.9934315228431995770108642318518832325935} +#define T_6561_1765 {-0.1191836861401136121729749106634699273854,-0.9928722218684813416444967515417374670506} +#define T_6561_1775 {-0.1286863825406829031638977767215692438185,-0.9916853406946142257538667763583362102509} +#define T_6561_1778 {-0.1315349297314883925125172936532180756330,-0.9913115364306683829909161431714892387390} +#define T_6561_1780 {-0.1334333594858753413436858181739808060229,-0.9910577877078173836267183105519507080317} +#define T_6561_1790 
{-0.1429180266843227131712268374030827544630,-0.9897345288756269621188721430371515452862} +#define T_6561_1792 {-0.1448134150979865364394783000534516759217,-0.9894589808615910087041811493691056966782} +#define T_6561_1795 {-0.1476554994767999007265757427376229315996,-0.9890388533693995620765804233087692409754} +#define T_6561_1805 {-0.1571201803560408338800158389858552254736,-0.9875794899272084537145133253943640738726} +#define T_6561_1810 {-0.1618471714402654471065545749297598376870,-0.9868158354509646290608770868857391178608} +#define T_6561_1820 {-0.1712899130072997599949502500749076716602,-0.9852206685316501344473749668395612388849} +#define T_6561_1825 {-0.1760054469905863727685613184803514741361,-0.9843891926619490195804473842144943773746} +#define T_6561_1834 {-0.1844831663960210255126526135427411645651,-0.9828356736080035016556166738155297935009} +#define T_6561_1835 {-0.1854243007794892095230210316003649495542,-0.9826585514208063809604709604172967374325} +#define T_6561_1840 {-0.1901274046332720457641585198871325701475,-0.9817594257288371206726651507779024541378} +#define T_6561_1850 {-0.1995204271072507085094116519030649214983,-0.9798936672756591859112518250185530632734} +#define T_6561_1855 {-0.2042101303678635626503989897173596546054,-0.9789270772918380325577913936285767704248} +#define T_6561_1862 {-0.2107678250224293836634359422532725147903,-0.9775361496821048934435793853481300175190} +#define T_6561_1865 {-0.2135753833202974072413837802741909399629,-0.9769265866172278656165417487500235438347} +#define T_6561_1870 {-0.2182507182892242936222970683957100845873,-0.9758927317929146916242189035983756184578} +#define T_6561_1876 {-0.2238545055891598412056708866657572798431,-0.9746225732699981270101829977647867053747} +#define T_6561_1880 {-0.2275862692436046441635255632718326523900,-0.9737579216888443411903608648572117090225} +#define T_6561_1885 {-0.2322462711871599305712976502036326564848,-0.9726570153551867825569843262201175093651} +#define T_6561_1895 
{-0.2415501937958476896817217038915259763598,-0.9703883263298195283041991388017777353525} +#define T_6561_1900 {-0.2461939011442444424915265699382871389389,-0.9692205956537335298506263825402129441500} +#define T_6561_1904 {-0.2499048079894625185026768576790345832705,-0.9682704100321096740699999827484134584665} +#define T_6561_1910 {-0.2554642755859628522330240230076014995575,-0.9668184958405270368331230201874859631062} +#define T_6561_1915 {-0.2600907301317282316333034941635560244322,-0.9655841817778212377021418433287180960178} +#define T_6561_1918 {-0.2628637472776415573427755134616745635867,-0.9648329650085325503994226892245933413506} +#define T_6561_1925 {-0.2693256435077092669772014232876244932413,-0.9630491668389306036957009382604155689478} +#define T_6561_1930 {-0.2739338906034062270933304716891143471003,-0.9617485240845867755510312235855963081121} +#define T_6561_1940 {-0.2831314373321083266432651726063340902328,-0.9590811171085865671770420703978743404150} +#define T_6561_1945 {-0.2877205260873227365969739821593975648284,-0.9577144140442046538197473637410439550877} +#define T_6561_1946 {-0.2886375555932776948431239816272864118218,-0.9574384374470964464975963892356958240271} +#define T_6561_1955 {-0.2968788082976385744871095084818080067635,-0.9549151654381523579573354254534933716059} +#define T_6561_1960 {-0.3014477917751909319399317155330209061503,-0.9534826840765704414337733396678231656551} +#define T_6561_1970 {-0.3105649196980645454502223401505034416914,-0.9505521714524325371797885964042507112026} +#define T_6561_1975 {-0.3151128551094062313353560966788791120052,-0.9490542073795354971466053939366247504950} +#define T_6561_1985 {-0.3241869474677778750759671311243437230587,-0.9459930354349994630069886625278741121292} +#define T_6561_1988 {-0.3269034153926433927672690060717286542058,-0.9450577532641192268769714246445801109076} +#define T_6561_1990 {-0.3287128963675310933112427846936043351889,-0.9444298977487258772711697929480578750372} +#define T_6561_2000 
{-0.3377420807645302724964153640030417591333,-0.9412386981424242238247757086355704814196} +#define T_6561_2002 {-0.3395442271763814412466331305040512233973,-0.9405900902046512301879488404665607959032} +#define T_6561_2005 {-0.3422451092441318176717857113544596359134,-0.9396107093889853878465601155767217278481} +#define T_6561_2015 {-0.3512275225494375097134991392522351816297,-0.9362901406101552526095588291354943066835} +#define T_6561_2020 {-0.3557067014298466722799219041917240247130,-0.9345976367174795296222100660088472068310} +#define T_6561_2030 {-0.3646404901641356333108490161976078525186,-0.9311483839500871440009177604224532842636} +#define T_6561_2035 {-0.3690948951875658856636164273368194699287,-0.9293917141585025243344375667220447212458} +#define T_6561_2044 {-0.3770914303114725751875369041954400017858,-0.9261760379029722045629569038283079862595} +#define T_6561_2045 {-0.3779782159049697165542625043599400669336,-0.9258144891398579767738397094944957643747} +#define T_6561_2050 {-0.3824069279256048781157062421698356047273,-0.9239940159300281674603638748521916568279} +#define T_6561_2060 {-0.3912379475940962469948658508656080812216,-0.9202895568039219931222305604023858904839} +#define T_6561_2065 {-0.3956400527677522149794242523057619109750,-0.9184056558220502486378222783969249576330} +#define T_6561_2072 {-0.4017877380244478935367169469827786087990,-0.9157328287078047024394322761509101837873} +#define T_6561_2075 {-0.4044169491473827426908371762692695483565,-0.9145747269864410444739633021526969969273} +#define T_6561_2080 {-0.4087915391200748760347494226152775809169,-0.9126277869667569486722413785173557698727} +#define T_6561_2086 {-0.4140286664254220205805268051335588097572,-0.9102638427280228539117956643167417496443} +#define T_6561_2090 {-0.4175125011389857476729048357810825109482,-0.9086711789160413221821954721235670149326} +#define T_6561_2095 {-0.4218586732343634349007288619759492576122,-0.9066616016005876188543766147631686180830} +#define T_6561_2105 
{-0.4305219013624917434590599896182538941503,-0.9025801307624852221067612845217809081078} +#define T_6561_2110 {-0.4348387587681015742369083909579785540700,-0.9005083308182200152458563024993054568768} +#define T_6561_2114 {-0.4382850711691390621638220181921496987343,-0.8988360230822208807310857991978991776705} +#define T_6561_2120 {-0.4434424653885050138235612848802702501416,-0.8963028393853076369879318008315749466419} +#define T_6561_2125 {-0.4477291173408439184377982655860250815749,-0.8941692443185399463700946398603264242411} +#define T_6561_2128 {-0.4502961874198361247323418865562416613102,-0.8928792435683337025409400666831061244011} +#define T_6561_2135 {-0.4562715271185671661591243264410877600312,-0.8898406000744687460723980620969086885452} +#define T_6561_2140 {-0.4605270890868871647150228909595170989633,-0.8876456501426446266123093664646148681641} +#define T_6561_2150 {-0.4690064393352941785053644707659259438515,-0.8831947462830771478081715031294152140617} +#define T_6561_2155 {-0.4732300332041213253297939900221535935998,-0.8809388944039343583014556315902154892683} +#define T_6561_2156 {-0.4740734530681042224742327562125865370035,-0.8804852986257544200654479027434717863798} +#define T_6561_2165 {-0.4816445742486173964280737891385797411203,-0.8763666493522377365366082813125103712082} +#define T_6561_2170 {-0.4858353284989467835153220676147611811757,-0.8740503610103484977145171797019429504871} +#define T_6561_2180 {-0.4941833240380158476234839781682239845395,-0.8693577182280821658011404906574171036482} +#define T_6561_2185 {-0.4983403739271455301640401103213662281632,-0.8669814713788022153551082737976685166359} +#define T_6561_2195 {-0.5066201013906278527443305392807815223932,-0.8621693991710387416915750691259745508432} +#define T_6561_2198 {-0.5090949939605489138472194099449552595615,-0.8607103386879402240339231866528280079365} +#define T_6561_2200 {-0.5107425891305945597764548438135534524918,-0.8597336841418838915274136525113135576248} +#define T_6561_2210 
{-0.5189523400351312432121630990877747535706,-0.8548031754574040297356418705021496862173} +#define T_6561_2212 {-0.5205886030632838057385924912523478269577,-0.8538076518517614399428339311270974576473} +#define T_6561_2215 {-0.5230394149697108474583728821016848087311,-0.8523084948468733212934012044570408761501} +#define T_6561_2225 {-0.5311774952712813302468930487520992755890,-0.8472605670732752392027009591402020305395} +#define T_6561_2230 {-0.5352283140515177173313077219063416123390,-0.8447074356471416800573592809087131172419} +#define T_6561_2240 {-0.5432930444949979342794676995254121720791,-0.8395431304009081108219447742158081382513} +#define T_6561_2245 {-0.5473067712532243556111666293872985988855,-0.8369320749859995300923287686600815504789} +#define T_6561_2254 {-0.5544997956975473707430523973016534000635,-0.8321838598359008187799190636724233627319} +#define T_6561_2255 {-0.5552964877188933945362236954679246991873,-0.8316524578975649228951283475907985121012} +#define T_6561_2260 {-0.5592722942412098330322578476625494658947,-0.8289840172730554845870187818945851176977} +#define T_6561_2270 {-0.5671853480881328124496576492674648761749,-0.8235901777669180079399779970117378979921} +#define T_6561_2275 {-0.5711224139853041670278344099642708897591,-0.8208649025531539189515228827076498419046} +#define T_6561_2282 {-0.5766122873628716227756285661598667502403,-0.8170179129383622340654369509138632565737} +#define T_6561_2285 {-0.5789571723915223344647529302164912223816,-0.8153579536230778357364101793791633099318} +#define T_6561_2290 {-0.5828546852682615631024987123964820057154,-0.8125764061679588978748256522521842271090} +#define T_6561_2296 {-0.5875140531022674528216498401889111846685,-0.8092139626868446455887351476121693849564} +#define T_6561_2300 {-0.5906095315677171164736591890687122941017,-0.8069574841473134974023651011520996689796} +#define T_6561_2305 {-0.5944666871903195870530112188134808093309,-0.8041202384102558164968854725884739309549} +#define T_6561_2315 
{-0.6021400212064472734496689554362092167139,-0.7983905027375383101073680336412508040667} +#define T_6561_2320 {-0.6059560236687406842293057707138359546661,-0.7954981441710399225897276664909441024065} +#define T_6561_2324 {-0.6089988267774052443925825173209886997938,-0.7931711221317527815699577331542968750000} +#define T_6561_2330 {-0.6135462620446567871823617679183371365070,-0.7896587771506305974966721805685665458441} +#define T_6561_2335 {-0.6173203239322324620275139750447124242783,-0.7867119025794662157125003432156518101692} +#define T_6561_2338 {-0.6195779728221622040251759244711138308048,-0.7849351155309463035081307680229656398296} +#define T_6561_2345 {-0.6248259004574533426179527850763406604528,-0.7807641091376656961031699211162049323320} +#define T_6561_2350 {-0.6285572430101455942974553181556984782219,-0.7777633266357348862385379106854088604450} +#define T_6561_2360 {-0.6359766089437677294426976004615426063538,-0.7717083340721324624666976887965574860573} +#define T_6561_2365 {-0.6396644622163476512355373415630310773849,-0.7686542628369862351078722895181272178888} +#define T_6561_2366 {-0.6404002756531765871983452598215080797672,-0.7680413315332290657977409864543005824089} +#define T_6561_2375 {-0.6469960866066232219040443851554300636053,-0.7624933205712132178177853347733616828918} +#define T_6561_2380 {-0.6506396896276731567354545404668897390366,-0.7593865907962854544521746902319137006998} +#define T_6561_2390 {-0.6578820596279149057750146312173455953598,-0.7531209701102025144692220237629953771830} +#define T_6561_2395 {-0.6614806605568511743697968086053151637316,-0.7499622228547727642578024642716627568007} +#define T_6561_2405 {-0.6686322817376013638579479447798803448677,-0.7435932166301471024638658491312526166439} +#define T_6561_2408 {-0.6707658403452213535800296995148528367281,-0.7416691900207053045335214847000315785408} +#define T_6561_2410 {-0.6721851380198133885102151907631196081638,-0.7403831036870603954369585153472144156694} +#define T_6561_2420 
{-0.6792445346772119085798635751416441053152,-0.7339120261387858112911430907843168824911} +#define T_6561_2422 {-0.6806489594099490725298551296873483806849,-0.7326097146872634358061304737930186092854} +#define T_6561_2425 {-0.6827509131972847589153730041289236396551,-0.7306512098999589088776929202140308916569} +#define T_6561_2435 {-0.6897166286575744376108332289732061326504,-0.7240793963048731685105963151727337390184} +#define T_6561_2440 {-0.6931758058905639341418236654135398566723,-0.7207685496246123424413099201046861708164} +#define T_6561_2450 {-0.7000464028106681002583400186267681419849,-0.7140973560459693558755134290549904108047} +#define T_6561_2455 {-0.7034576649713965013077654475637245923281,-0.7107371621021307817045453703030943870544} +#define T_6561_2464 {-0.7095572404535746002807172772008925676346,-0.7046478003370961751272716355742886662483} +#define T_6561_2465 {-0.7102317256355079599927648814627900719643,-0.7039679651097829893302559867152012884617} +#define T_6561_2470 {-0.7135943688258505890331662158132530748844,-0.7005591172628017337942196718358900398016} +#define T_6561_2480 {-0.7202704954379708368605861323885619640350,-0.6936933136491515439203681125945877283812} +#define T_6561_2485 {-0.7235838257921020089113994799845386296511,-0.6902365152989697882901509728981181979179} +#define T_6561_2492 {-0.7281946000599902601990720540925394743681,-0.6853704286321893324185339224641211330891} +#define T_6561_2495 {-0.7301606407644699592296433365845587104559,-0.6832755217907477973682262017973698675632} +#define T_6561_2500 {-0.7334239745920391184696995878766756504774,-0.6797714862316717177037617148016579449177} +#define T_6561_2506 {-0.7373177719555797926531681696360465139151,-0.6755460777463367927353488084918353706598} +#define T_6561_2510 {-0.7399001208293899400914028774423059076071,-0.6727167391976017762900141860882285982370} +#define T_6561_2515 {-0.7431127847565978106203488096070941537619,-0.6691661894711166125304657725791912525892} +#define T_6561_2525 
{-0.7494869259361931490559527446748688817024,-0.6620191446255276890298091529984958469868} +#define T_6561_2530 {-0.7526482570447393660728607756027486175299,-0.6584228133711012009854357529547996819019} +#define T_6561_2534 {-0.7551649007693587156708758811873849481344,-0.6555348752324353656106836751860100775957} +#define T_6561_2540 {-0.7589190778921114377553180929680820554495,-0.6511849454735476605549138184869661927223} +#define T_6561_2545 {-0.7620284238559846823335419685463421046734,-0.6475435747774537231791214253462385386229} +#define T_6561_2548 {-0.7638856496637209669842150105978362262249,-0.6453516206207551286766488374269101768732} +#define T_6561_2555 {-0.7681946304163371763706891215406358242035,-0.6402163773284054171242019037890713661909} +#define T_6561_2560 {-0.7712513496364195031418375947396270930767,-0.6365307185705976200651434737665113061666} +#define T_6561_2570 {-0.7773116695416287802444799126533325761557,-0.6291157035032631794635449296038132160902} +#define T_6561_2575 {-0.7803151312780869375274050980806350708008,-0.6253865172023314045191000332124531269073} +#define T_6561_2576 {-0.7809136789301702741283861541887745261192,-0.6246389565659083986659538823005277663469} +#define T_6561_2585 {-0.7862683140092496802964205926400609314442,-0.6178852145706773546507406535965856164694} +#define T_6561_2590 {-0.7892178985116851119840930550708435475826,-0.6141132702269178622600520611740648746490} +#define T_6561_2600 {-0.7950627156571568043830211536260321736336,-0.6065272278899497271353880023525562137365} +#define T_6561_2605 {-0.7979578142924882433462130393309053033590,-0.6027133038265829467761136584158521145582} +#define T_6561_2615 {-0.8036930598013604098994733249128330498934,-0.5950440871289511823860607364622410386801} +#define T_6561_2618 {-0.8053992844166191211741079314379021525383,-0.5927326485534586408832069537311326712370} +#define T_6561_2620 {-0.8065330751794125285769609945418778806925,-0.5911889703315176269882158521795645356178} +#define T_6561_2630 
{-0.8121575656103758866777297953376546502113,-0.5834381617805163289247616376087535172701} +#define T_6561_2632 {-0.8132735421337250469875357339333277195692,-0.5818815563886383657177248096559196710587} +#define T_6561_2635 {-0.8149419117071479146119372671819292008877,-0.5795426477344864935048462939448654651642} +#define T_6561_2645 {-0.8204544864726895925244321006175596266985,-0.5717118466735102710885030319332145154476} +#define T_6561_2650 {-0.8231825887512789208244612382259219884872,-0.5677767392001390467370924852730240672827} +#define T_6561_2660 {-0.8285821103571644474783397527062334120274,-0.5598675614786657872556929760321509093046} +#define T_6561_2665 {-0.8312534058863195740585183557413984090090,-0.5558936725691286939721180715423543006182} +#define T_6561_2674 {-0.8360136691462815017317211641056928783655,-0.5487086157520874030169011348334606736898} +#define T_6561_2675 {-0.8365387601663097916215861005184706300497,-0.5479077502092968288138763455208390951157} +#define T_6561_2680 {-0.8391526977365881823089921454084105789661,-0.5438958998571384872988687675388064235449} +#define T_6561_2690 {-0.8443227940823420096805307366594206541777,-0.5358348807169864835842076900007668882608} +#define T_6561_2695 {-0.8468788343198500045971854888193774968386,-0.5317858967489191845956497672887053340673} +#define T_6561_2702 {-0.8504246572009178661843975532974582165480,-0.5260968565052458467690144061634782701731} +#define T_6561_2705 {-0.8519326059059665334771693778748158365488,-0.5236514441823579835144641947408672422171} +#define T_6561_2710 {-0.8544302213836566517457526970247272402048,-0.5195661620874434394323770902701653540134} +#define T_6561_2716 {-0.8574014960893781012174486022558994591236,-0.5146481074517773590670799421786796301603} +#define T_6561_2720 {-0.8593666253878095018237104341096710413694,-0.5113599546010307861365618009585887193680} +#define T_6561_2725 {-0.8618053007343123850247934569779317826033,-0.5072392173582809249410274787805974483490} +#define T_6561_2735 
{-0.8666233185524315763004210566577967256308,-0.4989629482648693659108118936273967847228} +#define T_6561_2740 {-0.8690025505584005882653286789718549698591,-0.4948076061693014171538607115508057177067} +#define T_6561_2744 {-0.8708915933753176297216214152285829186440,-0.4914751596858180637816815305995987728238} +#define T_6561_2750 {-0.8737011880148561893122405308531597256660,-0.4864629832386313523784338030964136123657} +#define T_6561_2755 {-0.8760204857368022457464462604548316448927,-0.4822738937258133073981980487587861716747} +#define T_6561_2758 {-0.8774024269259784736263441118353512138128,-0.4797551263138342125813551319879479706287} +#define T_6561_2765 {-0.8805987732895484976225475293176714330912,-0.4738626388321222626665019106440013274550} +#define T_6561_2770 {-0.8828576581511441423444352949445601552725,-0.4696406663012452353811454486276488751173} +#define T_6561_2780 {-0.8873146510917795382056283415295183658600,-0.4611645150679672955362775610410608351231} +#define T_6561_2785 {-0.8895126569826107276739435292256530374289,-0.4569105307034805329990945210738573223352} +#define T_6561_2786 {-0.8899498124369744633810341838398016989231,-0.4560584736011314421766371651756344363093} +#define T_6561_2795 {-0.8938474356313146351737941586179658770561,-0.4483712321451085447421291974023915827274} +#define T_6561_2800 {-0.8959841090030605803562480105028953403234,-0.4440861137369548350406489589659031480551} +#define T_6561_2810 {-0.9001957788983643293789782546809874475002,-0.4354854298981391536038643153005978092551} +#define T_6561_2815 {-0.9022706788583852999252599147439468652010,-0.4311700616606267133512631062330910935998} +#define T_6561_2825 {-0.9063583709417406542030448690638877451420,-0.4225097672525856551395406768278917297721} +#define T_6561_2828 {-0.9075684866232428538168619525094982236624,-0.4199040867726779313606755295040784403682} +#define T_6561_2830 {-0.9083710693440535388631928981340024620295,-0.4181650396419346327192556600493844598532} +#define T_6561_2840 
{-0.9123339401391586944711775686300825327635,-0.4094469216762510188623025442211655899882} +#define T_6561_2842 {-0.9131164853838890094195335223048459738493,-0.4076987663952075235940242237120401114225} +#define T_6561_2845 {-0.9142840216727835533916390886588487774134,-0.4050737312068520834529294916137587279081} +#define T_6561_2855 {-0.9181212534596305818723749325727112591267,-0.3962995886267315914786024677596287801862} +#define T_6561_2860 {-0.9200083157342879847107042223797179758549,-0.3918988376861543554419142765254946425557} +#define T_6561_2870 {-0.9237191167178956385797050643304828554392,-0.3830704809952220624147400940273655578494} +#define T_6561_2875 {-0.9255427703470375799810199168859981000423,-0.3786430776580114732077220196515554562211} +#define T_6561_2884 {-0.9287718434668342792903672489046584814787,-0.3706519429103513063594732557248789817095} +#define T_6561_2885 {-0.9291263748208343775658590857347007840872,-0.3697623285467226406098006918909959495068} +#define T_6561_2890 {-0.9308862435019931158564077122719027101994,-0.3653091863870220890042617156723281368613} +#define T_6561_2900 {-0.9343419120058172877563151814683806151152,-0.3563778773567650715037302688870113343000} +#define T_6561_2905 {-0.9360376325982531220404325722483918070793,-0.3518999152598045188078401679376838728786} +#define T_6561_2912 {-0.9383755781502409742955705951317213475704,-0.3456172367391431854422023661754792556167} +#define T_6561_2915 {-0.9393646520709372227386779741209466010332,-0.3429198892447724023035959817207185551524} +#define T_6561_2920 {-0.9409958746705709975444165138469543308020,-0.3384180312172608284804198319761781021953} +#define T_6561_2921 {-0.9413195314039033423370028685894794762135,-0.3375167252115601201900574324099579825997} +#define T_6561_2926 {-0.9429248598120958391177737212274223566055,-0.3330055686446389273669410613365471363068} +#define T_6561_2930 {-0.9441935585970782085496466606855392456055,-0.3293911412041705122355494950170395895839} +#define T_6561_2935 
{-0.9457599466086924477892239337961655110121,-0.3248663161836313206798365627037128433585} +#define T_6561_2936 {-0.9460706232693528727750731377454940229654,-0.3239604540476169747265089426946360617876} +#define T_6561_2941 {-0.9476109861935706479840746396803297102451,-0.3194267034004020788806599284725962206721} +#define T_6561_2945 {-0.9488276351617761505963244417216628789902,-0.3157944248293682587558350860490463674068} +#define T_6561_2950 {-0.9503288653684698328305557879502885043621,-0.3112475664924573282021924569562543183565} +#define T_6561_2951 {-0.9506264978497703799220630571653600782156,-0.3103373351465795737524899777781683951616} +#define T_6561_2954 {-0.9515141632531146642648423039645422250032,-0.3076049367756066854617813532968284562230} +#define T_6561_2956 {-0.9521015774432685097039552601927425712347,-0.3057819259374885412228195491479709744453} +#define T_6561_2960 {-0.9532659255448255875009522242180537432432,-0.3021325457397251446067798497097101062536} +#define T_6561_2965 {-0.9547016881747087957421626924769952893257,-0.2975645923095709965977562205807771533728} +#define T_6561_2966 {-0.9549862150615673339615341319586150348186,-0.2966501795758461623719881572469603270292} +#define T_6561_2968 {-0.9555526411038189893432104327075649052858,-0.2948205387681056555138070507382508367300} +#define T_6561_2971 {-0.9563957069485135020769916991412173956633,-0.2920740517924402812610651380964554846287} +#define T_6561_2975 {-0.9575075139255918577063653174263890832663,-0.2884083230006245224252836578671121969819} +#define T_6561_2980 {-0.9588775127157050937043436533713247627020,-0.2838202170532302948124936392559902742505} +#define T_6561_2981 {-0.9591488752973184039873899564554449170828,-0.2829018116164848106741658284590812399983} +#define T_6561_2986 {-0.9604924886355992175168694302556104958057,-0.2783059095215071332418688143661711364985} +#define T_6561_2990 {-0.9615515250719859352557250531390309333801,-0.2746245885417727961019807025877526029944} +#define T_6561_2995 
{-0.9628554773294333291033808563952334225178,-0.2700172768115201016314586013322696089745} +#define T_6561_2996 {-0.9631136196113913028327147003437858074903,-0.2690950681804560318255425954703241586685} +#define T_6561_3001 {-0.9643910771526262859509870395413599908352,-0.2644803401169870671338912870851345360279} +#define T_6561_3005 {-0.9653971245210648532975028501823544502258,-0.2607841865728435237770099774934351444244} +#define T_6561_3010 {-0.9666347611813460583007895365881267935038,-0.2561586197571382172988307956984499469399} +#define T_6561_3011 {-0.9668796298971860103677045117365196347237,-0.2552327982252291516118702929816208779812} +#define T_6561_3016 {-0.9680906680439368416912770953786093741655,-0.2506001964210007315614348044618964195251} +#define T_6561_3020 {-0.9690435187512188575098548426467459648848,-0.2468899729965882416937716925531276501715} +#define T_6561_3025 {-0.9702145844337495272924343225895427167416,-0.2422471055596883415805820050081820227206} +#define T_6561_3026 {-0.9704461290559477371076013696438167244196,-0.2413178621659133049615064692261512391269} +#define T_6561_3031 {-0.9715904979161097410056413536949548870325,-0.2366683425368206750150079642480704933405} +#define T_6561_3035 {-0.9724899553459115386644384670944418758154,-0.2329448148195340795307828329896437935531} +#define T_6561_3038 {-0.9731551844389849659222591071738861501217,-0.2301499228753404457936682092622504569590} +#define T_6561_3040 {-0.9735942084067195079200018881238065660000,-0.2282856047956005884902452862661448307335} +#define T_6561_3041 {-0.9738123811571173238377241432317532598972,-0.2273531312850237962042143635699176229537} +#define T_6561_3046 {-0.9748898445954841118776812436408363282681,-0.2226876532378769535380769184484961442649} +#define T_6561_3050 {-0.9757357231489366400367657661263365298510,-0.2189515895603902084776137826338526792824} +#define T_6561_3052 {-0.9761532940797033042201746866339817643166,-0.2170823494836560674237091461691306903958} +#define T_6561_3055 
{-0.9767729357305248161225108560756780207157,-0.2142769983558011348545591090442030690610} +#define T_6561_3056 {-0.9769776915901869918101851908431854099035,-0.2133414871400065027096104586235014721751} +#define T_6561_3061 {-0.9779880272771754867733307037269696593285,-0.2086610133745609718669555832093465141952} +#define T_6561_3065 {-0.9787801524111627848867556167533621191978,-0.2049131846562852465876147789458627812564} +#define T_6561_3070 {-0.9797501104895256496263300505233928561211,-0.2002241768512538766522368405276210978627} +#define T_6561_3071 {-0.9799414072080302462453005318820942193270,-0.1992858209686416970374267521037836559117} +#define T_6561_3076 {-0.9808844066655576510171954396355431526899,-0.1945913172789499601122997773927636444569} +#define T_6561_3080 {-0.9816226149287323732295362788136117160320,-0.1908324968669579957403215075828484259546} +#define T_6561_3085 {-0.9825251183575194335872993178782053291798,-0.1861300400164963286808728071264340542257} +#define T_6561_3086 {-0.9827029164616757350714237873035017400980,-0.1851890330924506333776236033372697420418} +#define T_6561_3091 {-0.9835783851061777882662795491341967135668,-0.1804814681675749321598800634092185646296} +#define T_6561_3094 {-0.9840928430258779613382102979812771081924,-0.1776549360565159441094351677747908979654} +#define T_6561_3095 {-0.9842625241726888907223269598034676164389,-0.1767124316770245506802439194871112704277} +#define T_6561_3100 {-0.9850973867245034210071708002942614257336,-0.1719974961112926137296597062231739982963} +#define T_6561_3101 {-0.9852616495264975293721931848267558962107,-0.1710540323182276056801498498316504992545} +#define T_6561_3106 {-0.9860694067090800540853479105862788856030,-0.1663343775423554993242447608281509019434} +#define T_6561_3110 {-0.9866993354100042079579679921153001487255,-0.1625559026964449038654692003547097556293} +#define T_6561_3115 {-0.9874663848148310663077609206084161996841,-0.1578294613205278307255241543316515162587} +#define T_6561_3116 
{-0.9876170784197955132555080126621760427952,-0.1568837353378212395860202832409413531423} +#define T_6561_3121 {-0.9883569574635104881821234812377952039242,-0.1521529645898249127888846032874425873160} +#define T_6561_3122 {-0.9885022145260651260656459271558560431004,-0.1512063883474009384766389985088608227670} +#define T_6561_3125 {-0.9889325458159820003700701818161178380251,-0.1483658310593132245713832162437029182911} +#define T_6561_3130 {-0.9896316237967341944425925248651765286922,-0.1436288591524666480303551452379906550050} +#define T_6561_3131 {-0.9897687171097425684607173934637103229761,-0.1426810661262885027156244177604094147682} +#define T_6561_3136 {-0.9904405653439810608418270021502394229174,-0.1379401555787695132782033624607720412314} +#define T_6561_3140 {-0.9909616945780131969101489630702417343855,-0.1341451448210951580275462902136496268213} +#define T_6561_3145 {-0.9915926568831918697100036297342739999294,-0.1293986198355009664684445169768878258765} +#define T_6561_3146 {-0.9917161216156749059535968626732937991619,-0.1284489553385461668710831872886046767235} +#define T_6561_3151 {-0.9923198004076699829667518315545748919249,-0.1236988832564066137509328768828709144145} +#define T_6561_3155 {-0.9927863629906622522369730177160818129778,-0.1198967783544373638671132198396662715822} +#define T_6561_3160 {-0.9933490794241223165172982589865569025278,-0.1151416797135109554295340217322518583387} +#define T_6561_3161 {-0.9934588900997051164054596483765635639429,-0.1141903397046437063089285857131471857429} +#define T_6561_3164 {-0.9937828550839724606191794009646400809288,-0.1113356948204315688411014662051456980407} +#define T_6561_3166 {-0.9939942748831379626395232662616763263941,-0.1094320862432258539831408938880485948175} +#define T_6561_3170 {-0.9944061745420654796490111948514822870493,-0.1056236717436732780450725499576947186142} +#define T_6561_3173 {-0.9947055238710431623871954798232764005661,-0.1027663406978841370564836665835173334926} +#define T_6561_3175 
{-0.9949005289898809056836626041331328451633,-0.1008609806399640151930441334116039797664} +#define T_6561_3176 {-0.9949966629496399539434037251339759677649,-0.0999081614237826015623156195033516269177} +#define T_6561_3178 {-0.9951861932290133605505388914025388658047,-0.0980022489861583079306939225716632790864} +#define T_6561_3181 {-0.9954636432503436438423705112654715776443,-0.0951427084266184414085287812667957041413} +#define T_6561_3185 {-0.9958207949916222379016517152194865047932,-0.0913287701781502342379681635975430253893} +#define T_6561_3187 {-0.9959938915471814313562504139554221183062,-0.0894212950068455497687480715285346377641} +#define T_6561_3190 {-0.9962466854460456655573352691135369241238,-0.0865594693708769957751769652531947940588} +#define T_6561_3191 {-0.9963291228531842014248809391574468463659,-0.0856053675572072958477320980819058604538} +#define T_6561_3196 {-0.9967276023119397976302025199402123689651,-0.0808336983534195807621358653705101460218} +#define T_6561_3200 {-0.9970299324389642059429661458125337958336,-0.0770150233445034831936126806795073207468} +#define T_6561_3205 {-0.9973872710194756630031065469665918499231,-0.0722400969567667805559807447934872470796} +#define T_6561_3206 {-0.9974559948634170725156877779227215796709,-0.0712849094200927319020522077153145801276} +#define T_6561_3211 {-0.9977858912558363879696798903751187026501,-0.0665080086214896309959598852401541080326} +#define T_6561_3215 {-0.9980333373841873134679758550191763788462,-0.0626853848180018852209727242552617099136} +#define T_6561_3220 {-0.9983220503556280434054315264802426099777,-0.0579058181337146898881584888840734492987} +#define T_6561_3221 {-0.9983770464555259405159404195728711783886,-0.0569497419725541118018519171073421603069} +#define T_6561_3226 {-0.9986382917090186328579193286714144051075,-0.0521685952704591254791566257154045160860} +#define T_6561_3229 {-0.9987840489543886590695365157444030046463,-0.0492993261036830388022167426242958754301} +#define T_6561_3230 
{-0.9988308027793350030165697717166040092707,-0.0483428114530914179725051837976934621111} +#define T_6561_3235 {-0.9990508305671226274569107772549614310265,-0.0435595907136703819340084464784013107419} +#define T_6561_3236 {-0.9990920875747868468153001231257803738117,-0.0426028232099046472169945332097995560616} +#define T_6561_3241 {-0.9992846277826065159999302522919606417418,-0.0378184171717637629406283394928323104978} +#define T_6561_3245 {-0.9994221640711219434294321217748802155256,-0.0339902627732619963540905416721216170117} +#define T_6561_3248 {-0.9995156923500888579425804891798179596663,-0.0311188165893638395143039332424450549297} +#define T_6561_3250 {-0.9995734612735428514795898990996647626162,-0.0292043749741210875714436667749396292493} +#define T_6561_3251 {-0.9996009706757813528810174830141477286816,-0.0282471135522869010170410319915390573442} +#define T_6561_3256 {-0.9997247661081490877066357825242448598146,-0.0234604354180949770114494867812027223408} +#define T_6561_3257 {-0.9997467747199108023536950895504560321569,-0.0225030317321025692678393426149341394193} +#define T_6561_3260 {-0.9998072992348884246993634405953343957663,-0.0196307003603634830368740438188979169354} +#define T_6561_3262 {-0.9998430642950282276615325827151536941528,-0.0177157212985553647044412173272576183081} +#define T_6561_3265 {-0.9998898346324667230078375723678618669510,-0.0148431330472520734559127575380443886388} +#define T_6561_3266 {-0.9999035907528417421730182468309067189693,-0.0138855752338038467702530098790703050327} +#define T_6561_3271 {-0.9999586158651443401623737372574396431446,-0.0090976127123922265194089220585738075897} +#define T_6561_3275 {-0.9999861287997795500359643483534455299377,-0.0052670872434970325906555821404708694899} +#define T_6561_3280 {-0.9999998853617187988263026454660575836897,-0.0004788283087244116936233295778180263369} +#define T_6561_3290 {-0.9999586158651443401623737372574396431446,0.0090976127123922265194089220585738075897} +#define T_6561_3295 
{-0.9999035907528417421730182468309067189693,0.0138855752338038467702530098790703050327} +#define T_6561_3304 {-0.9997467747199108023536950895504560321569,0.0225030317321025692678393426149341394193} +#define T_6561_3305 {-0.9997247661081490877066357825242448598146,0.0234604354180949770114494867812027223408} +#define T_6561_3310 {-0.9996009706757813528810174830141477286816,0.0282471135522869010170410319915390573442} +#define T_6561_3320 {-0.9992846277826065159999302522919606417418,0.0378184171717637629406283394928323104978} +#define T_6561_3325 {-0.9990920875747868468153001231257803738117,0.0426028232099046472169945332097995560616} +#define T_6561_3332 {-0.9987840489543886590695365157444030046463,0.0492993261036830388022167426242958754301} +#define T_6561_3335 {-0.9986382917090186328579193286714144051075,0.0521685952704591254791566257154045160860} +#define T_6561_3340 {-0.9983770464555259405159404195728711783886,0.0569497419725541118018519171073421603069} +#define T_6561_3346 {-0.9980333373841873134679758550191763788462,0.0626853848180018852209727242552617099136} +#define T_6561_3350 {-0.9977858912558363879696798903751187026501,0.0665080086214896309959598852401541080326} +#define T_6561_3355 {-0.9974559948634170725156877779227215796709,0.0712849094200927319020522077153145801276} +#define T_6561_3365 {-0.9967276023119397976302025199402123689651,0.0808336983534195807621358653705101460218} +#define T_6561_3370 {-0.9963291228531842014248809391574468463659,0.0856053675572072958477320980819058604538} +#define T_6561_3374 {-0.9959938915471814313562504139554221183062,0.0894212950068455497687480715285346377641} +#define T_6561_3380 {-0.9954636432503436438423705112654715776443,0.0951427084266184414085287812667957041413} +#define T_6561_3385 {-0.9949966629496399539434037251339759677649,0.0999081614237826015623156195033516269177} +#define T_6561_3388 {-0.9947055238710431623871954798232764005661,0.1027663406978841370564836665835173334926} +#define T_6561_3395 
{-0.9939942748831379626395232662616763263941,0.1094320862432258539831408938880485948175} +#define T_6561_3400 {-0.9934588900997051164054596483765635639429,0.1141903397046437063089285857131471857429} +#define T_6561_3410 {-0.9923198004076699829667518315545748919249,0.1236988832564066137509328768828709144145} +#define T_6561_3415 {-0.9917161216156749059535968626732937991619,0.1284489553385461668710831872886046767235} +#define T_6561_3425 {-0.9904405653439810608418270021502394229174,0.1379401555787695132782033624607720412314} +#define T_6561_3430 {-0.9897687171097425684607173934637103229761,0.1426810661262885027156244177604094147682} +#define T_6561_3440 {-0.9883569574635104881821234812377952039242,0.1521529645898249127888846032874425873160} +#define T_6561_3445 {-0.9876170784197955132555080126621760427952,0.1568837353378212395860202832409413531423} +#define T_6561_3455 {-0.9860694067090800540853479105862788856030,0.1663343775423554993242447608281509019434} +#define T_6561_3460 {-0.9852616495264975293721931848267558962107,0.1710540323182276056801498498316504992545} +#define T_6561_3470 {-0.9835783851061777882662795491341967135668,0.1804814681675749321598800634092185646296} +#define T_6561_3475 {-0.9827029164616757350714237873035017400980,0.1851890330924506333776236033372697420418} +#define T_6561_3485 {-0.9808844066655576510171954396355431526899,0.1945913172789499601122997773927636444569} +#define T_6561_3490 {-0.9799414072080302462453005318820942193270,0.1992858209686416970374267521037836559117} +#define T_6561_3500 {-0.9779880272771754867733307037269696593285,0.2086610133745609718669555832093465141952} +#define T_6561_3505 {-0.9769776915901869918101851908431854099035,0.2133414871400065027096104586235014721751} +#define T_6561_3515 {-0.9748898445954841118776812436408363282681,0.2226876532378769535380769184484961442649} +#define T_6561_3520 {-0.9738123811571173238377241432317532598972,0.2273531312850237962042143635699176229537} +#define T_6561_3530 
{-0.9715904979161097410056413536949548870325,0.2366683425368206750150079642480704933405} +#define T_6561_3535 {-0.9704461290559477371076013696438167244196,0.2413178621659133049615064692261512391269} +#define T_6561_3545 {-0.9680906680439368416912770953786093741655,0.2506001964210007315614348044618964195251} +#define T_6561_3550 {-0.9668796298971860103677045117365196347237,0.2552327982252291516118702929816208779812} +#define T_6561_3560 {-0.9643910771526262859509870395413599908352,0.2644803401169870671338912870851345360279} +#define T_6561_3565 {-0.9631136196113913028327147003437858074903,0.2690950681804560318255425954703241586685} +#define T_6561_3575 {-0.9604924886355992175168694302556104958057,0.2783059095215071332418688143661711364985} +#define T_6561_3580 {-0.9591488752973184039873899564554449170828,0.2829018116164848106741658284590812399983} +#define T_6561_3590 {-0.9563957069485135020769916991412173956633,0.2920740517924402812610651380964554846287} +#define T_6561_3595 {-0.9549862150615673339615341319586150348186,0.2966501795758461623719881572469603270292} +#define T_6561_3605 {-0.9521015774432685097039552601927425712347,0.3057819259374885412228195491479709744453} +#define T_6561_3610 {-0.9506264978497703799220630571653600782156,0.3103373351465795737524899777781683951616} +#define T_6561_3620 {-0.9476109861935706479840746396803297102451,0.3194267034004020788806599284725962206721} +#define T_6561_3625 {-0.9460706232693528727750731377454940229654,0.3239604540476169747265089426946360617876} +#define T_6561_3635 {-0.9429248598120958391177737212274223566055,0.3330055686446389273669410613365471363068} +#define T_6561_3640 {-0.9413195314039033423370028685894794762135,0.3375167252115601201900574324099579825997} +// Pre-computed twiddles for N=8192 +#define T_8192_1 {0.9999997058628822266257429873803630471230,-0.0007669903187427044854995727973800967447} +#define T_8192_3 {0.9999973527669782091820138703042175620794,-0.0023009691514258054194363989353178112651} +#define 
T_8192_5 {0.9999926465807071895852686793659813702106,-0.0038349425697062279778937199381516620633} +#define T_8192_7 {0.9999855873151431984169335009937640279531,-0.0053689069639963433663853997757087199716} +#define T_8192_9 {0.9999761749868976146160548523766919970512,-0.0069028587247297558057712585366516577778} +#define T_8192_11 {0.9999644096181182773008799813396763056517,-0.0084367942423698005088850138122325006407} +#define T_8192_13 {0.9999502912364904849695790289842989295721,-0.0099707099074180290804170212481949420180} +#define T_8192_15 {0.9999338198752359962995228670479264110327,-0.0115046021104227152997978578241600189358} +#define T_8192_17 {0.9999149955731134742364929479663260281086,-0.0130384672419873327148254205098965030629} +#define T_8192_19 {0.9998938183744184859946813048736657947302,-0.0145723016927790660623998064693296328187} +#define T_8192_21 {0.9998702883289829479451782390242442488670,-0.0161061018535372871274269357400044100359} +#define T_8192_23 {0.9998444054921752366382747823081444948912,-0.0176398641150820566225743135646553128026} +#define T_8192_25 {0.9998161699249004108480676222825422883034,-0.0191735848683226225985798407691618194804} +#define T_8192_27 {0.9997855816935992123717369395308196544647,-0.0207072605042658945684319604652046109550} +#define T_8192_29 {0.9997526408702488431856636452721431851387,-0.0222408874140249609996367041730991331860} +#define T_8192_31 {0.9997173475323621882893121437518857419491,-0.0237744619888275582342274105940305162221} +#define T_8192_33 {0.9996797017629879267275327947572804987431,-0.0253079806200245706337970119648161926307} +#define T_8192_35 {0.9996397036507101985236545260704588145018,-0.0268414396990985307245303204126685159281} +#define T_8192_37 {0.9995973532896483826348799084371421486139,-0.0283748356176720985255546025882722460665} +#define T_8192_39 {0.9995526507794569859299826930509880185127,-0.0299081647675165582245249140669329790398} +#define T_8192_41 
{0.9995055962253253101224004240066278725863,-0.0314414235405603009754216259352688211948} +#define T_8192_43 {0.9994561897379773407479319757840130478144,-0.0329746083288973354519235670068155741319} +#define T_8192_45 {0.9994044314336713030755277031857986003160,-0.0345077155247957567674177425942616537213} +#define T_8192_47 {0.9993503214341994400626845163060352206230,-0.0360407415207062223339029571889113867655} +#define T_8192_49 {0.9992938598668877903108409554988611489534,-0.0375736827092705005792794281660462729633} +#define T_8192_51 {0.9992350468645958549984698038315400481224,-0.0391065354833298878256542252529470715672} +#define T_8192_53 {0.9991738825657163758364731620531529188156,-0.0406392962359337361899491725125699304044} +#define T_8192_55 {0.9991103671141748909789725985319819301367,-0.0421719613603479467900392307910806266591} +#define T_8192_57 {0.9990445006594292909340992991928942501545,-0.0437045272500634213175274567220185417682} +#define T_8192_59 {0.9989762833564698185639940675173420459032,-0.0452369902988045899383529047099727904424} +#define T_8192_61 {0.9989057153658182919286900869337841868401,-0.0467693469005378628655655859347461955622} +#define T_8192_63 {0.9988327968535279932638104583020322024822,-0.0483015934494801443821465625205746619031} +#define T_8192_65 {0.9987575279911833359136608123662881553173,-0.0498337263401072844137829065402911510319} +#define T_8192_67 {0.9986799089558990871751120721455663442612,-0.0513657419671625925516877941845450550318} +#define T_8192_69 {0.9985999399303203682976004529336933046579,-0.0528976367256653243198449843021080596372} +#define T_8192_71 {0.9985176211026222103939176122366916388273,-0.0544294070109191327477837774040381191298} +#define T_8192_73 {0.9984329526665084442171860246162395924330,-0.0559610492185205685156113020184420747682} +#define T_8192_75 {0.9983459348212123662946737567835953086615,-0.0574925597443675731601508971380098955706} +#define T_8192_77 
{0.9982565677714951846155599923804402351379,-0.0590239349846679306477170712241786532104} +#define T_8192_79 {0.9981648517276462406755399570101872086525,-0.0605551713359477883358295002835802733898} +#define T_8192_81 {0.9980707869054823433430101431440562009811,-0.0620862651950600946682001790577487554401} +#define T_8192_83 {0.9979743735263469917029510725114960223436,-0.0636172129591930923808718034706544131041} +#define T_8192_85 {0.9978756118171101530123223710688762366772,-0.0651480110258788325250378647979232482612} +#define T_8192_87 {0.9977745020101678186108529189368709921837,-0.0666786557930015705286663774131739046425} +#define T_8192_89 {0.9976710443434410047203186877595726400614,-0.0682091436588063287915773003078356850892} +#define T_8192_91 {0.9975652390603757524445427407044917345047,-0.0697394710219073066248540726519422605634} +#define T_8192_93 {0.9974570864099419065240681447903625667095,-0.0712696342812964012125576118705794215202} +#define T_8192_95 {0.9973465866466332263584604334027972072363,-0.0727996298363516730622890804625058081001} +#define T_8192_97 {0.9972337400304662757832829811377450823784,-0.0743294540868457559446014215609466191381} +#define T_8192_99 {0.9971185468269799789808871537388768047094,-0.0758591034329544472436523960823251400143} +#define T_8192_101 {0.9970010073072352874135049205506220459938,-0.0773885742752650485076770792147726751864} +#define T_8192_103 {0.9968811217478138475556193043303210288286,-0.0789178630147849557996408975668600760400} +#define T_8192_105 {0.9967588904308180008939643812482245266438,-0.0804469660529500141254999334705644287169} +#define T_8192_107 {0.9966343136438698957491055807622615247965,-0.0819758797916330800292783465010870713741} +#define T_8192_109 {0.9965073916801108211416249105241149663925,-0.0835046006331524315324799090376473031938} +#define T_8192_111 {0.9963781248382002075913987937383353710175,-0.0850331249802802752180141965254733804613} +#define T_8192_113 
{0.9962465134223155160952956066466867923737,-0.0865614492362511700473959308510529808700} +#define T_8192_115 {0.9961125577421511279041510533716063946486,-0.0880895698047705066890955549752106890082} +#define T_8192_117 {0.9959762581129177894112558533379342406988,-0.0896174830900229729691019997517287265509} +#define T_8192_119 {0.9958376148553416129516335786320269107819,-0.0911451854966810193214854507459676824510} +#define T_8192_121 {0.9956966282956635216905283414234872907400,-0.0926726734299133103611723072390304878354} +#define T_8192_123 {0.9955532987656384724672875563555862754583,-0.0941999432953932042122957568608399014920} +#define T_8192_125 {0.9954076266025349006838496279669925570488,-0.0957269914993071763253951189653889741749} +#define T_8192_127 {0.9952596121491333880371144005039241164923,-0.0972538144483632710501908036349050235003} +#define T_8192_129 {0.9951092557537261074074308453418780118227,-0.0987804085497996364750861175707541406155} +#define T_8192_131 {0.9949565577701163787693872109230142086744,-0.1003067702113928649776397605819511227310} +#define T_8192_133 {0.9948015185576171148795765475369989871979,-0.1018328958414665419418554392905207350850} +#define T_8192_135 {0.9946441384810507102542942448053508996964,-0.1033587818488996279420177870633779093623} +#define T_8192_137 {0.9944844179107475978796060189779382199049,-0.1048844246431349658266185542743187397718} +#define T_8192_139 {0.9943223572225458051221380628703627735376,-0.1064098206341876767799803360503574367613} +#define T_8192_141 {0.9941579567977897324837499581917654722929,-0.1079349662326536535283949547192605677992} +#define T_8192_143 {0.9939912170233293764454174379352480173111,-0.1094598578497179841573228031847975216806} +#define T_8192_145 {0.9938221382915196633334176112839486449957,-0.1109844918971633898063799961164477281272} +#define T_8192_147 {0.9936507210002191170516994134231936186552,-0.1125088647873786901199011367680213879794} +#define T_8192_149 
{0.9934769655527891929480688304465729743242,-0.1140329729333672131863508525384531822056} +#define T_8192_151 {0.9933008723580932786134667367150541394949,-0.1155568127487552748666743696048797573894} +#define T_8192_153 {0.9931224418304955836589442697004415094852,-0.1170803806478005887337090484834334347397} +#define T_8192_155 {0.9929416743898604735818480548914521932602,-0.1186036730454007176449593430334061849862} +#define T_8192_157 {0.9927585704615511374981906556058675050735,-0.1201266863571015114375839516469568479806} +#define T_8192_159 {0.9925731304764288109865333353809546679258,-0.1216494169991055307455951606243615970016} +#define T_8192_161 {0.9923853548708516658649614328169263899326,-0.1231718613882804846948459953637211583555} +#define T_8192_163 {0.9921952440866739220126646614517085254192,-0.1246940159421676547202295637362112756819} +#define T_8192_165 {0.9920027985712445151023075595730915665627,-0.1262158770789903461384540150902466848493} +#define T_8192_167 {0.9918080187774064304662147151248063892126,-0.1277374412176623119652418836267315782607} +#define T_8192_169 {0.9916109051634953708287412155186757445335,-0.1292587047777961350991660083309398032725} +#define T_8192_171 {0.9914114581933385350609455599624197930098,-0.1307796641797117076500001076055923476815} +#define T_8192_173 {0.9912096783362540630690773468813858926296,-0.1323003158444446825114937382750213146210} +#define T_8192_175 {0.9910055660670493704600403361837379634380,-0.1338206561937547445229057530013960786164} +#define T_8192_177 {0.9907991218660203713852752116508781909943,-0.1353406816501342146974451452479115687311} +#define T_8192_179 {0.9905903462189501462731300307495985180140,-0.1368603886368163768949557379528414458036} +#define T_8192_181 {0.9903792396171081646727429870225023478270,-0.1383797735777838877613277190903318114579} +#define T_8192_183 {0.9901658025572483978749005473218858242035,-0.1398988328977772144234847928601084277034} +#define T_8192_185 
{0.9899500355416089858451300642627757042646,-0.1414175630223030444287957152482704259455} +#define T_8192_187 {0.9897319390779105718891628384881187230349,-0.1429359603776426679289102139591705054045} +#define T_8192_189 {0.9895115136793551924299094935122411698103,-0.1444540213908604708858973708629491738975} +#define T_8192_191 {0.9892887598646251667844353505643084645271,-0.1459717424898122339893546950406744144857} +#define T_8192_193 {0.9890636781578815428517259533691685646772,-0.1474891201031535981069708896029624156654} +#define T_8192_195 {0.9888362690887635420011747555690817534924,-0.1490061506603484742239373872507712803781} +#define T_8192_197 {0.9886065331923864496488363329262938350439,-0.1505228305916774256267842702072812244296} +#define T_8192_199 {0.9883744710093412821905189957760740071535,-0.1520391563282460500872161901497747749090} +#define T_8192_201 {0.9881400830856925665557355387136340141296,-0.1535551243019934453126751350282575003803} +#define T_8192_203 {0.9879033699729777850961909280158579349518,-0.1550707309457005356190251177395111881196} +#define T_8192_205 {0.9876643322282057102512453639064915478230,-0.1565859726929984541143880960589740425348} +#define T_8192_207 {0.9874229704138554053471921179152559489012,-0.1581008459783770081497067394593614153564} +#define T_8192_209 {0.9871792850978743372181156701117288321257,-0.1596153472371930337470047334136324934661} +#define T_8192_211 {0.9869332768536777100720769340114202350378,-0.1611294729056788055387983149557840079069} +#define T_8192_213 {0.9866849462601466891342738563253078609705,-0.1626432194209503356852053457259899005294} +#define T_8192_215 {0.9864342939016271794017143292876426130533,-0.1641565832210158393245080787892220541835} +#define T_8192_217 {0.9861813203679282713309817154367920011282,-0.1656695607447841167569890785671304911375} +#define T_8192_219 {0.9859260262543211306152102224586997181177,-0.1671821484320729356287671407699235714972} +#define T_8192_221 
{0.9856684121615375548941528904833830893040,-0.1686943427236173298489063654415076598525} +#define T_8192_223 {0.9854084786957684194419471168657764792442,-0.1702061400610780650399789237781078554690} +#define T_8192_225 {0.9851462264686622338771826434822287410498,-0.1717175368870499652107497468023211695254} +#define T_8192_227 {0.9848816560973236988729695440270006656647,-0.1732285296450703226955880609239102341235} +#define T_8192_229 {0.9846147682043125959339135988557245582342,-0.1747391147796272248271520766138564795256} +#define T_8192_231 {0.9843455634176419000169744322192855179310,-0.1762492887361678806090736770784133113921} +#define T_8192_233 {0.9840740423707764472638359620759729295969,-0.1777590479611071694332480319644673727453} +#define T_8192_235 {0.9838002057026316027332768499036319553852,-0.1792683889018357457079133610022836364806} +#define T_8192_237 {0.9835240540575712619997261754178907722235,-0.1807773080067285875749405477108666673303} +#define T_8192_239 {0.9832455880854070739971461989625822752714,-0.1822858017251532958269422124431002885103} +#define T_8192_241 {0.9829648084413964426175880362279713153839,-0.1837938665074784483355330166887142695487} +#define T_8192_243 {0.9826817157862408613766547205159440636635,-0.1853014988050818989684387361194239929318} +#define T_8192_245 {0.9823963107860846921681741150678135454655,-0.1868086950703592707956346430364646948874} +#define T_8192_247 {0.9821085941125136109519644378451630473137,-0.1883154517567321162285765012711635790765} +#define T_8192_249 {0.9818185664425524983300874737324193120003,-0.1898217653186564379819145642613875679672} +#define T_8192_251 {0.9815262284586647734130337994429282844067,-0.1913276322116309047238758012099424377084} +#define T_8192_253 {0.9812315808487497292844636831432580947876,-0.1928330488922052332600998170164530165493} +#define T_8192_255 {0.9809346243061416448227873843279667198658,-0.1943380118179886262286260034670704044402} +#define T_8192_257 
{0.9806353595296081193666282160847913473845,-0.1958425174476578767279733028772170655429} +#define T_8192_259 {0.9803337872233479632910757572972215712070,-0.1973465622409659170344298217969480901957} +#define T_8192_261 {0.9800299080969900877846612274879589676857,-0.1988501426587501175191619040560908615589} +#define T_8192_263 {0.9797237228655911733810057739901822060347,-0.2003532551629404467874451256648171693087} +#define T_8192_265 {0.9794152322496347817804007718223147094250,-0.2018558962165680481515295241479179821908} +#define T_8192_267 {0.9791044369750292464260610358905978500843,-0.2033580622837733165031437465586350299418} +#define T_8192_269 {0.9787913377731056741026804957073181867599,-0.2048597498298144192752090475551085546613} +#define T_8192_271 {0.9784759353806168347134075702342670410872,-0.2063609553210755120922215155587764456868} +#define T_8192_273 {0.9781582305397350518560983800853136926889,-0.2078616752250750654429367614284274168313} +#define T_8192_275 {0.9778382239980504264664773472759407013655,-0.2093619060104741635974789915053406730294} +#define T_8192_277 {0.9775159165085692825059027200040873140097,-0.2108616441470848590356013119162525981665} +#define T_8192_279 {0.9771913088297122795822247098840307444334,-0.2123608861058784436082191859895829111338} +#define T_8192_281 {0.9768644017253126365929460916959214955568,-0.2138596283589937752100951229294878430665} +#define T_8192_283 {0.9765351959646144663906852656509727239609,-0.2153578673797455489413721352320862933993} +#define T_8192_285 {0.9762036923222705553371270070783793926239,-0.2168555996426326237802584273595130071044} +#define T_8192_287 {0.9758698915783410310353929162374697625637,-0.2183528216233463215001364687850582413375} +#define T_8192_289 {0.9755337945182913639285970930359326303005,-0.2198495297987786978310964514093939214945} +#define T_8192_291 {0.9751954019329903688984018117480445653200,-0.2213457206470308413770453626057133078575} +#define T_8192_293 
{0.9748547146187084289081781207642052322626,-0.2228413906474211447772404426359571516514} +#define T_8192_295 {0.9745117333771157186461664423404727131128,-0.2243365362804936036233982576959533616900} +#define T_8192_297 {0.9741664590152803171463347098324447870255,-0.2258311540280261708879550042183836922050} +#define T_8192_299 {0.9738188923456660983646315798978321254253,-0.2273252403730388615521462725155288353562} +#define T_8192_301 {0.9734690341861310658444494947616476565599,-0.2288187917998022180565698135978891514242} +#define T_8192_303 {0.9731168853599251322705754319031257182360,-0.2303118047938454426848409184458432719111} +#define T_8192_305 {0.9727624466956885651569564288365654647350,-0.2318042758419647519918527223126147873700} +#define T_8192_307 {0.9724057190274497664006503327982500195503,-0.2332962014322316202097340465115848928690} +#define T_8192_309 {0.9720467031946234959249864004959817975760,-0.2347875780540009671426560089457780122757} +#define T_8192_311 {0.9716854000420085402112135852803476154804,-0.2362784021979195681062435596686555072665} +#define T_8192_313 {0.9713218104197861579862660619255620986223,-0.2377686703559342140668064757846877910197} +#define T_8192_315 {0.9709559351835179707990164388320408761501,-0.2392583790212999828028728188655804842710} +#define T_8192_317 {0.9705877751941436315519240451976656913757,-0.2407475246885884267999955454797600395977} +#define T_8192_319 {0.9702173313179791591664979932829737663269,-0.2422361038536960387013152740109944716096} +#define T_8192_321 {0.9698446044267148291595503906137309968472,-0.2437241130138521616466107388987438753247} +#define T_8192_323 {0.9694695953974130642194495521835051476955,-0.2452115486676275657451640199724351987243} +#define T_8192_325 {0.9690923051125062137600707501405850052834,-0.2466984073149424416815378435785532929003} +#define T_8192_327 {0.9687127344597947775639568135375156998634,-0.2481846854570747828994115025125211104751} +#define T_8192_329 
{0.9683308843324451853362688780180178582668,-0.2496703795966685734963874665481853298843} +#define T_8192_331 {0.9679467556289877983033420605352148413658,-0.2511554862377419206076467617094749584794} +#define T_8192_333 {0.9675603492533143556997288214915897697210,-0.2526400018856955198565117370890220627189} +#define T_8192_335 {0.9671716661146766425005694145511370152235,-0.2541239230473206212046477503463393077254} +#define T_8192_337 {0.9667807071276832697748204736853949725628,-0.2556072462308074388914747032686136662960} +#define T_8192_339 {0.9663874732122988975291377755638677626848,-0.2570899679457531172843687272688839584589} +#define T_8192_341 {0.9659919652938405709718949765374418348074,-0.2585720847031703351071030283492291346192} +#define T_8192_343 {0.9655941843029768323347639125131536275148,-0.2600535930154951880233227257122052833438} +#define T_8192_345 {0.9651941311757247232705481110315304249525,-0.2615344893965954597980783091770717874169} +#define T_8192_347 {0.9647918068534478974740409285004716366529,-0.2630147703617790044816615591116715222597} +#define T_8192_349 {0.9643872122828542892136738373665139079094,-0.2644944324278016289930803850438678637147} +#define T_8192_351 {0.9639803484159941149300721008330583572388,-0.2659734721128755863261972081090789288282} +#define T_8192_353 {0.9635712162102573197230981350003276020288,-0.2674518859366776801778087246930226683617} +#define T_8192_355 {0.9631598166283713569058022585522849112749,-0.2689296704203573140645744388166349381208} +#define T_8192_357 {0.9627461506383994116475832925061695277691,-0.2704068220865448179957013508101226761937} +#define T_8192_359 {0.9623302192137374033720220722898375242949,-0.2718833374593597751456286459870170801878} +#define T_8192_361 {0.9619120233331122094000420474912971258163,-0.2733592130644187379040488394821295514703} +#define T_8192_363 {0.9614915639805790004146501814830116927624,-0.2748344454288439431266510837303940206766} +#define T_8192_365 
{0.9610688421455193530817950886557810008526,-0.2763090310812710836962935445626499131322} +#define T_8192_367 {0.9606438588226385855151079340430442243814,-0.2777829665518576351956880898796953260899} +#define T_8192_369 {0.9602166150119634258075507204921450465918,-0.2792562483722911825800849783263402059674} +#define T_8192_371 {0.9597871117188399026076695008669048547745,-0.2807288730757971917384452353871893137693} +#define T_8192_373 {0.9593553499539307916066377401875797659159,-0.2822008371971475582107302670920034870505} +#define T_8192_375 {0.9589213307332131730476021402864716947079,-0.2836721372726684342602254673693096265197} +#define T_8192_377 {0.9584850550779761002573309269791934639215,-0.2851427698402487220796786004939349368215} +#define T_8192_379 {0.9580465240148186012447695247828960418701,-0.2866127314393477898413209459249628707767} +#define T_8192_381 {0.9576057385756463480319666814466472715139,-0.2880820186110041314364593745267484337091} +#define T_8192_383 {0.9571626997976701023418399927322752773762,-0.2895506278978430825254974934068741276860} +#define T_8192_385 {0.9567174087234030510629168020386714488268,-0.2910185558440850361883178720745490863919} +#define T_8192_387 {0.9562698664006580306917726375104393810034,-0.2924857989955538806192691936303162947297} +#define T_8192_389 {0.9558200738825454179092844242404680699110,-0.2939523538996846596660361683461815118790} +#define T_8192_391 {0.9553680322274703540230689213785808533430,-0.2954182171055320105246266848553204908967} +#define T_8192_393 {0.9549137424991305245214334718184545636177,-0.2968833851637782683674515737948240712285} +#define T_8192_395 {0.9544572057665134945381169018219225108624,-0.2983478546267414044379506776749622076750} +#define T_8192_397 {0.9539984231038944884062402707058936357498,-0.2998116220483833527232775395532371476293} +#define T_8192_399 {0.9535373955908332810338379204040393233299,-0.3012746839843179480489254729036474600434} +#define T_8192_401 
{0.9530741243121721995024131501850206404924,-0.3027370369918191972402610190329141914845} +#define T_8192_403 {0.9526086103580333475093766537611372768879,-0.3041986776298291061948475544340908527374} +#define T_8192_405 {0.9521408548238158298104849563969764858484,-0.3056596024589661730885836732340976595879} +#define T_8192_407 {0.9516708588101938648406985521432943642139,-0.3071198080415330489145731007738504558802} +#define T_8192_409 {0.9511986234231132320005031033360864967108,-0.3085792909415250306892630760557949542999} +#define T_8192_411 {0.9507241497737896063213725028617773205042,-0.3100380477246378885247679590975167229772} +#define T_8192_413 {0.9502474389787052277966949986875988543034,-0.3114960749582759147457977633166592568159} +#define T_8192_415 {0.9497684921596066809357239435485098510981,-0.3129533692115601950511916129471501335502} +#define T_8192_417 {0.9492873104435021192060162320558447390795,-0.3144099270553367131419975066819461062551} +#define T_8192_419 {0.9488038949626584894758707378059625625610,-0.3158657450621840112603422312531620264053} +#define T_8192_421 {0.9483182468545990895236741380358580499887,-0.3173208198064217389067209751374321058393} +#define T_8192_423 {0.9478303672621010145249442757631186395884,-0.3187751478641185354234721671673469245434} +#define T_8192_425 {0.9473402573331920484278612093476112931967,-0.3202287258130999125782523151428904384375} +#define T_8192_427 {0.9468479182211479994180081121157854795456,-0.3216815502329565812367206945054931566119} +#define T_8192_429 {0.9463533510844905904946244845632463693619,-0.3231336177050523339460141869494691491127} +#define T_8192_431 {0.9458565570869839067569273538538254797459,-0.3245849248125321495628270440647611394525} +#define T_8192_433 {0.9453575373976322859803644860221538692713,-0.3260354681403302423703394197218585759401} +#define T_8192_435 {0.9448562931906772099921454355353489518166,-0.3274852442751780001728434399410616606474} +#define T_8192_437 
{0.9443528256455947511582849074329715222120,-0.3289342498056121999461254290508804842830} +#define T_8192_439 {0.9438471359470926858037387319200206547976,-0.3303824813219827793986382857838179916143} +#define T_8192_441 {0.9433392252851077186548423014755826443434,-0.3318299354164611081330349406925961375237} +#define T_8192_443 {0.9428290948548027072817490079614799469709,-0.3332766086830479257407944260194199159741} +#define T_8192_445 {0.9423167458565637755185662172152660787106,-0.3347224977175812798968479455652413889766} +#define T_8192_447 {0.9418021794959976489280961686745285987854,-0.3361675991177445199653561758168507367373} +#define T_8192_449 {0.9412853969839286571996694874542299658060,-0.3376119094830745681612427233631024137139} +#define T_8192_451 {0.9407663995363960696138860839710105210543,-0.3390554254149696356002152697328710928559} +#define T_8192_453 {0.9402451883746508753958437409892212599516,-0.3404981435166971603933916412643156945705} +#define T_8192_455 {0.9397217647251533412244839382765349000692,-0.3419400603934021898311357290367595851421} +#define T_8192_457 {0.9391961298195699026081229021656326949596,-0.3433811726521150409219274024508194997907} +#define T_8192_459 {0.9386682848947701662822851176315452903509,-0.3448214769017593495092910416133236140013} +#define T_8192_461 {0.9381382311928243566967466904316097497940,-0.3462609697531600083664216072065755724907} +#define T_8192_463 {0.9376059699609999853464614716358482837677,-0.3476996478190513828465668666467536240816} +#define T_8192_465 {0.9370715024517591862363019572512712329626,-0.3491375077140849714218973076640395447612} +#define T_8192_467 {0.9365348299227554962342878752679098397493,-0.3505745460548375658227371332031907513738} +#define T_8192_469 {0.9359959536368313015586295477987732738256,-0.3520107594598191336210391000349773094058} +#define T_8192_471 {0.9354548748620146181309564781258814036846,-0.3534461445494808118361618198832729831338} +#define T_8192_473 
{0.9349115948715160939741508627776056528091,-0.3548806979462227895183445980364922434092} +#define T_8192_475 {0.9343661149437257895655761785747017711401,-0.3563144162744024123767871969903353601694} +#define T_8192_477 {0.9338184363622109573910279323172289878130,-0.3577472961603418988296709812857443466783} +#define T_8192_479 {0.9332685604157120451418450102210044860840,-0.3591793342323365001433899124094750732183} +#define T_8192_481 {0.9327164883981402532242555025732144713402,-0.3606105271206623275048741561477072536945} +#define T_8192_483 {0.9321622216085744261349077532941009849310,-0.3620408714575841790939136899396544322371} +#define T_8192_485 {0.9316057613512578328140989469829946756363,-0.3634703638773638112446917602937901392579} +#define T_8192_487 {0.9310471089355952800659110835113096982241,-0.3648990010162673214288986400788417086005} +#define T_8192_489 {0.9304862656761496708668346400372684001923,-0.3663267795125736414618700109713245183229} +#define T_8192_491 {0.9299232328926396728974168581771664321423,-0.3677536960065819759968519520043628290296} +#define T_8192_493 {0.9293580119099354996947681684105191379786,-0.3691797471406199626642319344682618975639} +#define T_8192_495 {0.9287906040580570232734203273139428347349,-0.3706049295590516656773161230375990271568} +#define T_8192_497 {0.9282210106721694442555303794506471604109,-0.3720292399082850143265943643200444057584} +#define T_8192_499 {0.9276492330925811824471338695730082690716,-0.3734526748367802961858785693038953468204} +#define T_8192_501 {0.9270752726647401020798611170903313905001,-0.3748752309950575956065677019068971276283} +#define T_8192_503 {0.9264991307392305142087707281461916863918,-0.3762969050357047873234250801033340394497} +#define T_8192_505 {0.9259208086717699570655781826644670218229,-0.3777176936133856410826581395667744800448} +#define T_8192_507 {0.9253403078232063094787918089423328638077,-0.3791375933848473156473346534767188131809} +#define T_8192_509 
{0.9247576295595139050931265956023707985878,-0.3805566010089285189366137274191714823246} +#define T_8192_511 {0.9241727752517910898788500162481795996428,-0.3819747131465672795869181754824239760637} +#define T_8192_513 {0.9235857462762566694181032289634458720684,-0.3833919264608086630019556650950107723475} +#define T_8192_515 {0.9229965440142462451689198132953606545925,-0.3848082376168128759807984806684544309974} +#define T_8192_517 {0.9224051698522098829968740574258845299482,-0.3862236432818629827679046684352215379477} +#define T_8192_519 {0.9218116251817081163721923076082020998001,-0.3876381401253727321254416438023326918483} +#define T_8192_521 {0.9212159113994087267229815552127547562122,-0.3890517248188943844056097987049724906683} +#define T_8192_523 {0.9206180299070838568553654113202355802059,-0.3904643940361266496452685714757535606623} +#define T_8192_525 {0.9200179821116065692621077687363140285015,-0.3918761444529223481048063604248454794288} +#define T_8192_527 {0.9194157694249469603420266139437444508076,-0.3932869727472964038739178249670658260584} +#define T_8192_529 {0.9188113932641699399539447767892852425575,-0.3946968755994336164327762617176631465554} +#define T_8192_531 {0.9182048550514309015468938923731911927462,-0.3961058496916963211909035180724458768964} +#define T_8192_533 {0.9175961562139728355802503756422083824873,-0.3975138917086323275817960620770463719964} +#define T_8192_535 {0.9169852981841229988546615459199529141188,-0.3989209983369829126687022835540119558573} +#define T_8192_537 {0.9163722823992891397537619013746734708548,-0.4003271662656900931054337888781446963549} +#define T_8192_539 {0.9157571103019567226866115561279002577066,-0.4017323921859050073202013209083816036582} +#define T_8192_541 {0.9151397833396852643517149772378616034985,-0.4031366727909952984987285162787884473801} +#define T_8192_543 {0.9145203029651044479564347966515924781561,-0.4045400047765529971677267440099967643619} +#define T_8192_545 
{0.9138986706359116807263376358605455607176,-0.4059423848404025148006724066362949088216} +#define T_8192_547 {0.9132748878148677640353980677900835871696,-0.4073438096826079712897694662387948483229} +#define T_8192_549 {0.9126489559697938958038321288768202066422,-0.4087442760054814105963316706038312986493} +#define T_8192_551 {0.9120208765735683398290234435989987105131,-0.4101437805135902392450475417717825621367} +#define T_8192_553 {0.9113906511041223179603321113972924649715,-0.4115423199137652199297576771641615778208} +#define T_8192_555 {0.9107582810444375676084405313304159790277,-0.4129398909151080210300222006480908021331} +#define T_8192_557 {0.9101237678825416788086499764176551252604,-0.4143364902289990991945956011477392166853} +#define T_8192_559 {0.9094871131115054296856214932631701231003,-0.4157321145691053598802966462244512513280} +#define T_8192_561 {0.9088483182294391227173946390394121408463,-0.4171267606513878734020295269147027283907} +#define T_8192_563 {0.9082073847394886989548012934392318129539,-0.4185204251941097575162586963415378704667} +#define T_8192_565 {0.9075643141498326293969967082375660538673,-0.4199131049178436159152738582633901387453} +#define T_8192_567 {0.9069191079736781402331757817592006176710,-0.4213047965454796428552697307168273255229} +#define T_8192_569 {0.9062717677292575491065917958621867001057,-0.4226954968022330061394598033075453713536} +#define T_8192_571 {0.9056222949398252675123899280151817947626,-0.4240852024156515631680974820483243092895} +#define T_8192_573 {0.9049706911336532488832062881556339561939,-0.4254739101156238545442533904861193150282} +#define T_8192_575 {0.9043169578440283240539088183140847831964,-0.4268616166343864870569291269930545240641} +#define T_8192_577 {0.9036610966092479824141037170193158090115,-0.4282483187065319607533808721200330182910} +#define T_8192_579 {0.9030031089726171522613640263443812727928,-0.4296340130690163849891405334346927702427} +#define T_8192_581 
{0.9023429964824442039983409813430625945330,-0.4310186964611670279445831965858815237880} +#define T_8192_583 {0.9016807606920377304859925970959011465311,-0.4324023656246901436972507326572667807341} +#define T_8192_585 {0.9010164031597023281960900931153446435928,-0.4337850173036785772495704804896377027035} +#define T_8192_587 {0.9003499254487355996090514054230879992247,-0.4351666482446192585342714664875529706478} +#define T_8192_589 {0.8996813291274239343664476109552197158337,-0.4365472551964011960201617057464318349957} +#define T_8192_591 {0.8990106157690390675796265895769465714693,-0.4379268349103228596952419593435479328036} +#define T_8192_593 {0.8983377869518343050714292985503561794758,-0.4393053841400999526278781104338122531772} +#define T_8192_595 {0.8976628442590408596402085095178335905075,-0.4406828996418729049722173840564209967852} +#define T_8192_597 {0.8969857892788639652792426204541698098183,-0.4420593781742147565516631857462925836444} +#define T_8192_599 {0.8963066236044795465076617801969405263662,-0.4434348164981384843308376275672344490886} +#define T_8192_601 {0.8956253488340301105452567753673065453768,-0.4448092113771048849990563667233800515532} +#define T_8192_603 {0.8949419665706207505095903798064682632685,-0.4461825595770300689757448253658367320895} +#define T_8192_605 {0.8942564784223160367915284041373524814844,-0.4475548578662930099270056416571605950594} +#define T_8192_607 {0.8935688860021359092300485826854128390551,-0.4489261030157433163267910458671394735575} +#define T_8192_609 {0.8928791909280516803093519229150842875242,-0.4502962917987086699511678489216137677431} +#define T_8192_611 {0.8921873948229824824451839049288537353277,-0.4516654209910024864171873559826053678989} +#define T_8192_613 {0.8914934993147913822042482934193685650826,-0.4530334873709316312329065112862735986710} +#define T_8192_615 {0.8907975060362814945236209496215451508760,-0.4544004877193036362470479616604279726744} +#define T_8192_617 
{0.8900994166251923189747685682959854602814,-0.4557664188194346932547773576516192406416} +#define T_8192_619 {0.8893992327241955209160551021341234445572,-0.4571312774571569814696658795583061873913} +#define T_8192_621 {0.8886969559808916008236678862886037677526,-0.4584950604208262725514089197531575337052} +#define T_8192_623 {0.8879925880478055644218216002627741545439,-0.4598577645013295356335447650053538382053} +#define T_8192_625 {0.8872861305823831479244745423784479498863,-0.4612193864920923758177195850294083356857} +#define T_8192_627 {0.8865775852469870432770449042436666786671,-0.4625799231890868057348598085809499025345} +#define T_8192_629 {0.8858669537088927903312196576735004782677,-0.4639393713908385175059834182320628315210} +#define T_8192_631 {0.8851542376402851131089732916734647005796,-0.4652977278984345987922210952092427760363} +#define T_8192_633 {0.8844394387182537009550742368446663022041,-0.4666549895155309157779299766843905672431} +#define T_8192_635 {0.8837225586247896558234060648828744888306,-0.4680111530483598292207148006127681583166} +#define T_8192_637 {0.8830035990467808293402640629210509359837,-0.4693662153057375219233904317661654204130} +#define T_8192_639 {0.8822825616760087141798862830910366028547,-0.4707201730990716592728517753130290657282} +#define T_8192_641 {0.8815594482091437811277501168660819530487,-0.4720730232423686612008850715938024222851} +#define T_8192_643 {0.8808342603477420373891959570755716413260,-0.4734247625522415292564915034745354205370} +#define T_8192_645 {0.8801069997982403636527237722475547343493,-0.4747753878479171185666984911222243681550} +#define T_8192_647 {0.8793776682719532944432216936547774821520,-0.4761248959512436318419759118114598095417} +#define T_8192_649 {0.8786462674850681331406576646259054541588,-0.4774732836866980578705010884732473641634} +#define T_8192_651 {0.8779127991586418433556104901072103530169,-0.4788205478813939430793311657907906919718} +#define T_8192_653 
{0.8771772650185960529256590234581381082535,-0.4801666853650883859394582486856961622834} +#define T_8192_655 {0.8764396667957136122240058284660335630178,-0.4815116929701899195492842409294098615646} +#define T_8192_657 {0.8757000062256345973565885287825949490070,-0.4828555675317656725731296774029033258557} +#define T_8192_659 {0.8749582850488516472253763822664041072130,-0.4841983058875490297801036376768024638295} +#define T_8192_661 {0.8742145050107062997923890179663430899382,-0.4855399048779469595160662720445543527603} +#define T_8192_663 {0.8734686678613848842545053230423945933580,-0.4868803613460473966867425588134210556746} +#define T_8192_665 {0.8727207753559143021959698671707883477211,-0.4882196721376267922742897553689545020461} +#define T_8192_667 {0.8719708292541578087408993269491475075483,-0.4895578341011574963204111554659903049469} +#define T_8192_669 {0.8712188313208110157503938353329431265593,-0.4908948440878151409094698465196415781975} +#define T_8192_671 {0.8704647833253976729750434060406405478716,-0.4922306989514860786627536981541197746992} +#define T_8192_673 {0.8697086870422655602297368204744998365641,-0.4935653955487747657215891194937285035849} +#define T_8192_675 {0.8689505442505823795684705146413762122393,-0.4948989307390112002416060477116843685508} +#define T_8192_677 {0.8681903567343313143922500785265583544970,-0.4962313013842583053758517053211107850075} +#define T_8192_679 {0.8674281262823069216238991430145688354969,-0.4975625043493191457244506636925507336855} +#define T_8192_681 {0.8666638546881111349051707293256185948849,-0.4988925365017446433846259878919227048755} +#define T_8192_683 {0.8658975437501488237046487483894452452660,-0.5002213947118405723557543751667253673077} +#define T_8192_685 {0.8651291952716236854925568877661135047674,-0.5015490758526753856116897622996475547552} +#define T_8192_687 {0.8643588110605340268932650360511615872383,-0.5028755768000869874612135390634648501873} +#define T_8192_689 
{0.8635863929296679897262833947024773806334,-0.5042008944326904495980556930589955300093} +#define T_8192_691 {0.8628119426966003313594910650863312184811,-0.5055250256318853940840085670060943812132} +#define T_8192_693 {0.8620354621836872066609203102416358888149,-0.5068479672818633208208893847768194973469} +#define T_8192_695 {0.8612569532180621711958679043164011090994,-0.5081697162696146019555953898816369473934} +#define T_8192_697 {0.8604764176316320734017040194885339587927,-0.5094902694849362534412762215652037411928} +#define T_8192_699 {0.8596938572610726136957737253396771848202,-0.5108096238204390404646915158082265406847} +#define T_8192_701 {0.8589092739478239035832984882290475070477,-0.5121277761715546938958709688449744135141} +#define T_8192_703 {0.8581226695380861357875801331829279661179,-0.5134447234365434598046817882277537137270} +#define T_8192_705 {0.8573340458828155874471121933311223983765,-0.5147604625165012048881862938287667930126} +#define T_8192_707 {0.8565434048377199571788764842494856566191,-0.5160749903153666329203019813576247543097} +#define T_8192_709 {0.8557507482632539241862446033337619155645,-0.5173883037399290563129738984571304172277} +#define T_8192_711 {0.8549560780246149294114843542047310620546,-0.5187003996998350574543223956425208598375} +#define T_8192_713 {0.8541593959917388456659637085977010428905,-0.5200112751075960382252105773659422993660} +#define T_8192_715 {0.8533607040392954257157498432206921279430,-0.5213209268785956584935092905652709305286} +#define T_8192_717 {0.8525600040466840834341155641595833003521,-0.5226293519310966084745473381190095096827} +#define T_8192_719 {0.8517572978980291198425334187049884349108,-0.5239365471862486023368887799733784049749} +#define T_8192_721 {0.8509525874821757263077870447887107729912,-0.5252425095680947064735732965345960110426} +#define T_8192_723 {0.8501458746926852105829652828106191009283,-0.5265472360035793331078934897959697991610} +#define T_8192_725 
{0.8493371614278307779599686000437941402197,-0.5278507234225553457207524843397550284863} +#define T_8192_727 {0.8485264495905926462882007399457506835461,-0.5291529687577906093665092157607432454824} +#define T_8192_729 {0.8477137410886542712162849966262001544237,-0.5304539689449763173456631193403154611588} +#define T_8192_731 {0.8468990378343972391661509391269646584988,-0.5317537209227333194760944934387225657701} +#define T_8192_733 {0.8460823417448969374632383733114693313837,-0.5330522216326195605873294880439061671495} +#define T_8192_735 {0.8452636547419182244667013037542346864939,-0.5343494680191375190148050933203194290400} +#define T_8192_737 {0.8444429787519106556104020455677527934313,-0.5356454570297410899826218155794776976109} +#define T_8192_739 {0.8436203157060040425108127237763255834579,-0.5369401856148429130755062033131252974272} +#define T_8192_741 {0.8427956675400042341195216977212112396955,-0.5382336507278216997107733732264023274183} +#define T_8192_743 {0.8419690361943876766304128977935761213303,-0.5395258493250288944764747611770872026682} +#define T_8192_745 {0.8411404236142980828105919499648734927177,-0.5408167783657966687371754233026877045631} +#define T_8192_747 {0.8403098317495407698629605874884873628616,-0.5421064348124439158382870118657592684031} +#define T_8192_749 {0.8394772625545785516010255378205329179764,-0.5433948156302847998233573889592662453651} +#define T_8192_751 {0.8386427179885272975568000219936948269606,-0.5446819177876345285937986773205921053886} +#define T_8192_753 {0.8378062000151509369771929414127953350544,-0.5459677382558175695592694864899385720491} +#define T_8192_755 {0.8369677106028570179319103772286325693130,-0.5472522740091740889312177387182600796223} +#define T_8192_757 {0.8361272517246921553990546271961648017168,-0.5485355220250673902171456575160846114159} +#define T_8192_759 {0.8352848253583373683284207800170406699181,-0.5498174792838909086256649061397183686495} +#define T_8192_761 
{0.8344404334861031946601883646508213132620,-0.5510981427690754275161566511087585240602} +#define T_8192_763 {0.8335940780949251394105203871731646358967,-0.5523775094670960728038267006922978907824} +#define T_8192_765 {0.8327457611763594558240697551809716969728,-0.5536555763674793073647606433951295912266} +#define T_8192_767 {0.8318954847265775942588561520096845924854,-0.5549323404628103695301888365065678954124} +#define T_8192_769 {0.8310432507463623164056798486853949725628,-0.5562077987487399344246341570396907627583} +#define T_8192_771 {0.8301890612411023662176035031734500080347,-0.5574819482239915524601769902801606804132} +#define T_8192_773 {0.8293329182207882510624585847835987806320,-0.5587547858903683106746029807254672050476} +#define T_8192_775 {0.8284748237000071346969320984499063342810,-0.5600263087527603822479704831494018435478} +#define T_8192_777 {0.8276147796979383963744680841045919805765,-0.5612965138191514657961533885099925100803} +#define T_8192_779 {0.8267527882383485238193543409579433500767,-0.5625653981006265569320135000452864915133} +#define T_8192_781 {0.8258888513495868943792288519034627825022,-0.5638329586113781655143384341499768197536} +#define T_8192_783 {0.8250229710645802239099566577351652085781,-0.5650991923687139761867115339555311948061} +#define T_8192_785 {0.8241551494208285699727412065840326249599,-0.5663640963930638427825670078163966536522} +#define T_8192_787 {0.8232853884604001137859086156822741031647,-0.5676276677079862276187327552179340273142} +#define T_8192_789 {0.8224136902299263862659017831902019679546,-0.5688899033401758620343002803565468639135} +#define T_8192_791 {0.8215400567805976050905769625387620180845,-0.5701508003194702967064699805632699280977} +#define T_8192_793 {0.8206644901681574566509880241937935352325,-0.5714103556788572291225136723369359970093} +#define T_8192_795 {0.8197869924528989882261953425768297165632,-0.5726685664544812759402248047990724444389} +#define T_8192_797 
{0.8189075656996589458458402077667415142059,-0.5739254296856507453483686731487978249788} +#define T_8192_799 {0.8180262119778134444203487873892299830914,-0.5751809424148451865832498697272967547178} +#define T_8192_801 {0.8171429333612729717373213134123943746090,-0.5764351016877218292222551099257543683052} +#define T_8192_803 {0.8162577319284773924579212689423002302647,-0.5776879045531227996335132957028690725565} +#define T_8192_805 {0.8153706097623912851801719625655096024275,-0.5789393480630818933363457290397491306067} +#define T_8192_807 {0.8144815689504986133684383275976870208979,-0.5801894292728316804286237129417713731527} +#define T_8192_809 {0.8135906115847985065059333464887458831072,-0.5814381452408102779472187648934777826071} +#define T_8192_811 {0.8126977397617994869349900000088382512331,-0.5826854930286684552953602178604342043400} +#define T_8192_813 {0.8118029555825153620318701541691552847624,-0.5839314697012762955807829712284728884697} +#define T_8192_815 {0.8109062611524596730916414344392251223326,-0.5851760723267304120653875543212052434683} +#define T_8192_817 {0.8100076585816411434137762626050971448421,-0.5864192979763604984810854148236103355885} +#define T_8192_819 {0.8091071499845582382093311935022938996553,-0.5876611437247366565017614448152016848326} +#define T_8192_821 {0.8082047374801947237088484143896494060755,-0.5889016066496758350368168066779617220163} +#define T_8192_823 {0.8073004231920144491141400067135691642761,-0.5901406838322489356585265340982005000114} +#define T_8192_825 {0.8063942092479563505946771329035982489586,-0.5913783723567875849624897455214522778988} +#define T_8192_827 {0.8054860977804291222170718356210272759199,-0.5926146693108911289726847826386801898479} +#define T_8192_829 {0.8045760909263071081198859246796928346157,-0.5938495717854336275465243488724809139967} +#define T_8192_831 {0.8036641908269240852646930761693511158228,-0.5950830768745699606014909477380570024252} +#define T_8192_833 
{0.8027503996280691556108877193764783442020,-0.5963151816757437106986117214546538889408} +#define T_8192_835 {0.8018347194799813060228643735172227025032,-0.5975458832896932692690938893065322190523} +#define T_8192_837 {0.8009171525373443012441043720173183828592,-0.5987751788204587199970774236135184764862} +#define T_8192_839 {0.7999977009592819099381699743389617651701,-0.6000030653753890552692951132485177367926} +#define T_8192_841 {0.7990763669093523535735812401981092989445,-0.6012295400651485044463129270297940820456} +#define T_8192_843 {0.7981531525555438655317175289383158087730,-0.6024546000037237503121900772384833544493} +#define T_8192_845 {0.7972280600702686959024845236854162067175,-0.6036782423084303683680218455265276134014} +#define T_8192_847 {0.7963010916303591146814255807839799672365,-0.6049004640999198212369947214028798043728} +#define T_8192_849 {0.7953722494170613055430862914363387972116,-0.6061212625021862310248366156884003430605} +#define T_8192_851 {0.7944415356160305918820085935294628143311,-0.6073406346425728186133596864237915724516} +#define T_8192_853 {0.7935089524173266628537248834618367254734,-0.6085585776517794531770277899340726435184} +#define T_8192_855 {0.7925745020154076891927275028137955814600,-0.6097750886638684253426845316425897181034} +#define T_8192_857 {0.7916381866091257712980677752057090401649,-0.6109901648162717746615157921041827648878} +#define T_8192_859 {0.7907000084017216101628378055465873330832,-0.6122038032497979509471974779444281011820} +#define T_8192_861 {0.7897599696008190672813498167670331895351,-0.6134160011086385866363457353145349770784} +#define T_8192_863 {0.7888180724184202796678277991304639726877,-0.6146267555403750471043622383149340748787} +#define T_8192_865 {0.7878743190709002197635868469660636037588,-0.6158360636959849809812794774188660085201} +#define T_8192_867 {0.7869287117790018104557248079800046980381,-0.6170439227298497586460257480212021619081} +#define T_8192_869 
{0.7859812527678301519173942324414383620024,-0.6182503297997602453861532012524548918009} +#define T_8192_871 {0.7850319442668480807157038725563324987888,-0.6194552820669240178474979074962902814150} +#define T_8192_873 {0.7840807885098699525627807815908454358578,-0.6206587766959721363946300698444247245789} +#define T_8192_875 {0.7831277877350573124459742757608182728291,-0.6218608108549653623597919249732512980700} +#define T_8192_877 {0.7821729441849130104458254209021106362343,-0.6230613817154012634702553441456984728575} +#define T_8192_879 {0.7812162601062760947101537567505147308111,-0.6242604864522207641641671216348186135292} +#define T_8192_881 {0.7802577377503165934058415587060153484344,-0.6254581222438143628394868756004143506289} +#define T_8192_883 {0.7792973793725302966706180995970498770475,-0.6266542862720294593259495741222053766251} +#define T_8192_885 {0.7783351872327332054979365238978061825037,-0.6278489757221765721340034360764548182487} +#define T_8192_887 {0.7773711635950563136887581094924826174974,-0.6290421877830358887706552195595577359200} +#define T_8192_889 {0.7764053107279403898033365294395480304956,-0.6302339196468644821891302854055538773537} +#define T_8192_891 {0.7754376309041305370683971887046936899424,-0.6314241685094017508816932604531757533550} +#define T_8192_893 {0.7744681264006708643066190234094392508268,-0.6326129315698775235077278011885937303305} +#define T_8192_895 {0.7734967994988990458438138375640846788883,-0.6338002060310172769419523319811560213566} +#define T_8192_897 {0.7725236524844413255053154898632783442736,-0.6349859890990494637463825711165554821491} +#define T_8192_899 {0.7715486876472062993670419928093906491995,-0.6361702779837121735084792817360721528530} +#define T_8192_901 {0.7705719072813806969080019371176604181528,-0.6373530698982591280454812476818915456533} +#define T_8192_903 {0.7695933136854229417167516658082604408264,-0.6385343620594667868317628744989633560181} +#define T_8192_905 
{0.7686129091620582665100869235175196081400,-0.6397141516876404532254696277959737926722} +#define T_8192_907 {0.7676306960182733840625246557465288788080,-0.6408924360066213798958756342472042888403} +#define T_8192_909 {0.7666466765653104920019700330158229917288,-0.6420692122437925419831117324065417051315} +#define T_8192_911 {0.7656608531186624988507105626922566443682,-0.6432444776300858535478255362249910831451} +#define T_8192_913 {0.7646732279980671398433855756593402475119,-0.6444182293999883848201193359273020178080} +#define T_8192_915 {0.7636838035275018699010729505971539765596,-0.6455904647915486904707904614042490720749} +#define T_8192_917 {0.7626925820351779794492586006526835262775,-0.6467611810463839150386888832144904881716} +#define T_8192_919 {0.7616995658535352653473182726884260773659,-0.6479303754096854550681428008829243481159} +#define T_8192_921 {0.7607047573192369238626042715623043477535,-0.6490980451302259535140137813868932425976} +#define T_8192_923 {0.7597081587731634444438100217666942626238,-0.6502641874603659610798445100954268127680} +#define T_8192_925 {0.7587097725604073916727543291926849633455,-0.6514287996560598203998893040989059954882} +#define T_8192_927 {0.7577096010302680761938631803786847740412,-0.6525918789768625494218667881796136498451} +#define T_8192_929 {0.7567076465362456705321392291807569563389,-0.6537534226859361696782002582040149718523} +#define T_8192_931 {0.7557039114360358800226435960212256759405,-0.6549134280500560345572580445150379091501} +#define T_8192_933 {0.7546983980915243916953727421059738844633,-0.6560718923396177126861061879026237875223} +#define T_8192_935 {0.7536911088687813231601353436417412012815,-0.6572288128286425390456315653864294290543} +#define T_8192_937 {0.7526820461380552274022193159908056259155,-0.6583841867947850534648068787646479904652} +#define T_8192_939 {0.7516712122737684298456883880135137587786,-0.6595380115193386627581162429123651236296} +#define T_8192_941 
{0.7506586096545105890598392761603463441133,-0.6606902842872423020637029367208015173674} +#define T_8192_943 {0.7496442406630334787109859462361782789230,-0.6618410023870868741369122290052473545074} +#define T_8192_945 {0.7486281076862453254250340251019224524498,-0.6629901631111214665992292793816886842251} +#define T_8192_947 {0.7476102131152051466500552123761735856533,-0.6641377637552600132764268892060499638319} +#define T_8192_949 {0.7465905593451171995411641546525061130524,-0.6652838016190871783805960149038583040237} +#define T_8192_951 {0.7455691487753254298453953197167720645666,-0.6664282740058653509152009064564481377602} +#define T_8192_953 {0.7445459838093073656750675581861287355423,-0.6675711782225403068125046956993173807859} +#define T_8192_955 {0.7435210668546691215041732903046067804098,-0.6687125115797480923163220722926780581474} +#define T_8192_957 {0.7424944003231391809194406050664838403463,-0.6698522713918210191863522595667745918036} +#define T_8192_959 {0.7414659866305632895944199844961985945702,-0.6709904549767942150140243029454723000526} +#define T_8192_961 {0.7404358281968980159959414777404163032770,-0.6721270596564117294491325083072297275066} +#define T_8192_963 {0.7394039274462057553805038878635969012976,-0.6732620827561329734933792678930331021547} +#define T_8192_965 {0.7383702868066486235676393334870226681232,-0.6743955216051390477716154236986767500639} +#define T_8192_967 {0.7373349087104827948024876604904420673847,-0.6755273735363386267138707808044273406267} +#define T_8192_969 {0.7362977955940531726852782412606757134199,-0.6766576358863749529604092458612285554409} +#define T_8192_971 {0.7352589498977868398554846862680278718472,-0.6777863059956314994991544153890572488308} +#define T_8192_973 {0.7342183740661882840328189558931626379490,-0.6789133812082384089592324016848579049110} +#define T_8192_975 {0.7331760705478327366790836094878613948822,-0.6800388588720789329045146587304770946503} +#define T_8192_977 
{0.7321320417953612880168634546862449496984,-0.6811627363387954270379509580379817634821} +#define T_8192_979 {0.7310862902654743367136802589811850339174,-0.6822850109637955684505072895262856036425} +#define T_8192_981 {0.7300388184189261497891720864572562277317,-0.6834056801062587949147086874290835112333} +#define T_8192_983 {0.7289896287205194225222726345236878842115,-0.6845247411291423000889722061401698738337} +#define T_8192_985 {0.7279387236390986171130634829751215875149,-0.6856421913991874728111497461213730275631} +#define T_8192_987 {0.7268861056475449666791632807871792465448,-0.6867580282869258923028610297478735446930} +#define T_8192_989 {0.7258317772227702580067898452398367226124,-0.6878722491666855454184315021848306059837} +#define T_8192_991 {0.7247757408457112804356370361347217112780,-0.6889848514165970438938302322640083730221} +#define T_8192_993 {0.7237179990013234975876343924028333276510,-0.6900958324185999526179102758760564029217} +#define T_8192_995 {0.7226585541785756072741264688374940305948,-0.6912051895584484517698342642688658088446} +#define T_8192_997 {0.7215974088704437683361447852803394198418,-0.6923129202257182202018270800181198865175} +#define T_8192_999 {0.7205345655739052723731674632290378212929,-0.6934190218138118755319965202943421900272} +#define T_8192_1001 {0.7194700267899329926279961000545881688595,-0.6945234917199655244601785852864850312471} +#define T_8192_1003 {0.7184037950234897218493301807029638439417,-0.6956263273452548689945729165629018098116} +#define T_8192_1005 {0.7173358727835217329982242517871782183647,-0.6967275260946012016560757729166653007269} +#define T_8192_1007 {0.7162662625829531171106623332889284938574,-0.6978270853767772896603105436952318996191} +#define T_8192_1009 {0.7151949669386801211601323302602395415306,-0.6989250026044141472780779622553382068872} +#define T_8192_1011 {0.7141219883715647087640832069155294448137,-0.7000212751940063649058743067143950611353} +#define T_8192_1013 
{0.7130473294064292311134067858802154660225,-0.7011159005659186593817366883740760385990} +#define T_8192_1015 {0.7119709925720500987011973847984336316586,-0.7022088761443918691895760275656357407570} +#define T_8192_1017 {0.7108929804011516750961163779720664024353,-0.7033001993575487276189051044639199972153} +#define T_8192_1019 {0.7098132954304008368495715330936945974827,-0.7043898676374004130806838475109543651342} +#define T_8192_1021 {0.7087319402004006452244766478543169796467,-0.7054778784198522112447449217143002897501} +#define T_8192_1023 {0.7076489172556843509909185740980319678783,-0.7065642291447095102441267044923733919859} +#define T_8192_1025 {0.7065642291447095102441267044923733919859,-0.7076489172556843509909185740980319678783} +#define T_8192_1035 {0.7011159005659186593817366883740760385990,-0.7130473294064292311134067858802154660225} +#define T_8192_1037 {0.7000212751940063649058743067143950611353,-0.7141219883715647087640832069155294448137} +#define T_8192_1045 {0.6956263273452548689945729165629018098116,-0.7184037950234897218493301807029638439417} +#define T_8192_1053 {0.6912051895584484517698342642688658088446,-0.7226585541785756072741264688374940305948} +#define T_8192_1055 {0.6900958324185999526179102758760564029217,-0.7237179990013234975876343924028333276510} +#define T_8192_1065 {0.6845247411291423000889722061401698738337,-0.7289896287205194225222726345236878842115} +#define T_8192_1071 {0.6811627363387954270379509580379817634821,-0.7321320417953612880168634546862449496984} +#define T_8192_1075 {0.6789133812082384089592324016848579049110,-0.7342183740661882840328189558931626379490} +#define T_8192_1085 {0.6732620827561329734933792678930331021547,-0.7394039274462057553805038878635969012976} +#define T_8192_1089 {0.6709904549767942150140243029454723000526,-0.7414659866305632895944199844961985945702} +#define T_8192_1095 {0.6675711782225403068125046956993173807859,-0.7445459838093073656750675581861287355423} +#define T_8192_1105 
{0.6618410023870868741369122290052473545074,-0.7496442406630334787109859462361782789230} +#define T_8192_1107 {0.6606902842872423020637029367208015173674,-0.7506586096545105890598392761603463441133} +#define T_8192_1115 {0.6560718923396177126861061879026237875223,-0.7546983980915243916953727421059738844633} +#define T_8192_1125 {0.6502641874603659610798445100954268127680,-0.7597081587731634444438100217666942626238} +#define T_8192_1135 {0.6444182293999883848201193359273020178080,-0.7646732279980671398433855756593402475119} +#define T_8192_1139 {0.6420692122437925419831117324065417051315,-0.7666466765653104920019700330158229917288} +#define T_8192_1143 {0.6397141516876404532254696277959737926722,-0.7686129091620582665100869235175196081400} +#define T_8192_1145 {0.6385343620594667868317628744989633560181,-0.7695933136854229417167516658082604408264} +#define T_8192_1155 {0.6326129315698775235077278011885937303305,-0.7744681264006708643066190234094392508268} +#define T_8192_1161 {0.6290421877830358887706552195595577359200,-0.7773711635950563136887581094924826174974} +#define T_8192_1165 {0.6266542862720294593259495741222053766251,-0.7792973793725302966706180995970498770475} +#define T_8192_1173 {0.6218608108549653623597919249732512980700,-0.7831277877350573124459742757608182728291} +#define T_8192_1175 {0.6206587766959721363946300698444247245789,-0.7840807885098699525627807815908454358578} +#define T_8192_1179 {0.6182503297997602453861532012524548918009,-0.7859812527678301519173942324414383620024} +#define T_8192_1185 {0.6146267555403750471043622383149340748787,-0.7888180724184202796678277991304639726877} +#define T_8192_1195 {0.6085585776517794531770277899340726435184,-0.7935089524173266628537248834618367254734} +#define T_8192_1197 {0.6073406346425728186133596864237915724516,-0.7944415356160305918820085935294628143311} +#define T_8192_1205 {0.6024546000037237503121900772384833544493,-0.7981531525555438655317175289383158087730} +#define T_8192_1207 
{0.6012295400651485044463129270297940820456,-0.7990763669093523535735812401981092989445} +#define T_8192_1215 {0.5963151816757437106986117214546538889408,-0.8027503996280691556108877193764783442020} +#define T_8192_1225 {0.5901406838322489356585265340982005000114,-0.8073004231920144491141400067135691642761} +#define T_8192_1233 {0.5851760723267304120653875543212052434683,-0.8109062611524596730916414344392251223326} +#define T_8192_1235 {0.5839314697012762955807829712284728884697,-0.8118029555825153620318701541691552847624} +#define T_8192_1241 {0.5801894292728316804286237129417713731527,-0.8144815689504986133684383275976870208979} +#define T_8192_1245 {0.5776879045531227996335132957028690725565,-0.8162577319284773924579212689423002302647} +#define T_8192_1251 {0.5739254296856507453483686731487978249788,-0.8189075656996589458458402077667415142059} +#define T_8192_1255 {0.5714103556788572291225136723369359970093,-0.8206644901681574566509880241937935352325} +#define T_8192_1265 {0.5650991923687139761867115339555311948061,-0.8250229710645802239099566577351652085781} +#define T_8192_1269 {0.5625653981006265569320135000452864915133,-0.8267527882383485238193543409579433500767} +#define T_8192_1275 {0.5587547858903683106746029807254672050476,-0.8293329182207882510624585847835987806320} +#define T_8192_1285 {0.5523775094670960728038267006922978907824,-0.8335940780949251394105203871731646358967} +#define T_8192_1287 {0.5510981427690754275161566511087585240602,-0.8344404334861031946601883646508213132620} +#define T_8192_1295 {0.5459677382558175695592694864899385720491,-0.8378062000151509369771929414127953350544} +#define T_8192_1305 {0.5395258493250288944764747611770872026682,-0.8419690361943876766304128977935761213303} +#define T_8192_1309 {0.5369401856148429130755062033131252974272,-0.8436203157060040425108127237763255834579} +#define T_8192_1315 {0.5330522216326195605873294880439061671495,-0.8460823417448969374632383733114693313837} +#define T_8192_1323 
{0.5278507234225553457207524843397550284863,-0.8493371614278307779599686000437941402197} +#define T_8192_1325 {0.5265472360035793331078934897959697991610,-0.8501458746926852105829652828106191009283} +#define T_8192_1335 {0.5200112751075960382252105773659422993660,-0.8541593959917388456659637085977010428905} +#define T_8192_1341 {0.5160749903153666329203019813576247543097,-0.8565434048377199571788764842494856566191} +#define T_8192_1343 {0.5147604625165012048881862938287667930126,-0.8573340458828155874471121933311223983765} +#define T_8192_1345 {0.5134447234365434598046817882277537137270,-0.8581226695380861357875801331829279661179} +#define T_8192_1355 {0.5068479672818633208208893847768194973469,-0.8620354621836872066609203102416358888149} +#define T_8192_1359 {0.5042008944326904495980556930589955300093,-0.8635863929296679897262833947024773806334} +#define T_8192_1365 {0.5002213947118405723557543751667253673077,-0.8658975437501488237046487483894452452660} +#define T_8192_1375 {0.4935653955487747657215891194937285035849,-0.8697086870422655602297368204744998365641} +#define T_8192_1377 {0.4922306989514860786627536981541197746992,-0.8704647833253976729750434060406405478716} +#define T_8192_1385 {0.4868803613460473966867425588134210556746,-0.8734686678613848842545053230423945933580} +#define T_8192_1395 {0.4801666853650883859394582486856961622834,-0.8771772650185960529256590234581381082535} +#define T_8192_1405 {0.4734247625522415292564915034745354205370,-0.8808342603477420373891959570755716413260} +#define T_8192_1411 {0.4693662153057375219233904317661654204130,-0.8830035990467808293402640629210509359837} +#define T_8192_1413 {0.4680111530483598292207148006127681583166,-0.8837225586247896558234060648828744888306} +#define T_8192_1415 {0.4666549895155309157779299766843905672431,-0.8844394387182537009550742368446663022041} +#define T_8192_1425 {0.4598577645013295356335447650053538382053,-0.8879925880478055644218216002627741545439} +#define T_8192_1431 
{0.4557664188194346932547773576516192406416,-0.8900994166251923189747685682959854602814} +#define T_8192_1435 {0.4530334873709316312329065112862735986710,-0.8914934993147913822042482934193685650826} +#define T_8192_1445 {0.4461825595770300689757448253658367320895,-0.8949419665706207505095903798064682632685} +#define T_8192_1449 {0.4434348164981384843308376275672344490886,-0.8963066236044795465076617801969405263662} +#define T_8192_1455 {0.4393053841400999526278781104338122531772,-0.8983377869518343050714292985503561794758} +#define T_8192_1465 {0.4324023656246901436972507326572667807341,-0.9016807606920377304859925970959011465311} +#define T_8192_1467 {0.4310186964611670279445831965858815237880,-0.9023429964824442039983409813430625945330} +#define T_8192_1475 {0.4254739101156238545442533904861193150282,-0.9049706911336532488832062881556339561939} +#define T_8192_1479 {0.4226954968022330061394598033075453713536,-0.9062717677292575491065917958621867001057} +#define T_8192_1485 {0.4185204251941097575162586963415378704667,-0.9082073847394886989548012934392318129539} +#define T_8192_1495 {0.4115423199137652199297576771641615778208,-0.9113906511041223179603321113972924649715} +#define T_8192_1503 {0.4059423848404025148006724066362949088216,-0.9138986706359116807263376358605455607176} +#define T_8192_1505 {0.4045400047765529971677267440099967643619,-0.9145203029651044479564347966515924781561} +#define T_8192_1513 {0.3989209983369829126687022835540119558573,-0.9169852981841229988546615459199529141188} +#define T_8192_1515 {0.3975138917086323275817960620770463719964,-0.9175961562139728355802503756422083824873} +#define T_8192_1521 {0.3932869727472964038739178249670658260584,-0.9194157694249469603420266139437444508076} +#define T_8192_1525 {0.3904643940361266496452685714757535606623,-0.9206180299070838568553654113202355802059} +#define T_8192_1535 {0.3833919264608086630019556650950107723475,-0.9235857462762566694181032289634458720684} +#define T_8192_1539 
{0.3805566010089285189366137274191714823246,-0.9247576295595139050931265956023707985878} +#define T_8192_1545 {0.3762969050357047873234250801033340394497,-0.9264991307392305142087707281461916863918} +#define T_8192_1547 {0.3748752309950575956065677019068971276283,-0.9270752726647401020798611170903313905001} +#define T_8192_1555 {0.3691797471406199626642319344682618975639,-0.9293580119099354996947681684105191379786} +#define T_8192_1557 {0.3677536960065819759968519520043628290296,-0.9299232328926396728974168581771664321423} +#define T_8192_1565 {0.3620408714575841790939136899396544322371,-0.9321622216085744261349077532941009849310} +#define T_8192_1575 {0.3548806979462227895183445980364922434092,-0.9349115948715160939741508627776056528091} +#define T_8192_1581 {0.3505745460548375658227371332031907513738,-0.9365348299227554962342878752679098397493} +#define T_8192_1585 {0.3476996478190513828465668666467536240816,-0.9376059699609999853464614716358482837677} +#define T_8192_1593 {0.3419400603934021898311357290367595851421,-0.9397217647251533412244839382765349000692} +#define T_8192_1595 {0.3404981435166971603933916412643156945705,-0.9402451883746508753958437409892212599516} +#define T_8192_1605 {0.3332766086830479257407944260194199159741,-0.9428290948548027072817490079614799469709} +#define T_8192_1611 {0.3289342498056121999461254290508804842830,-0.9443528256455947511582849074329715222120} +#define T_8192_1615 {0.3260354681403302423703394197218585759401,-0.9453575373976322859803644860221538692713} +#define T_8192_1625 {0.3187751478641185354234721671673469245434,-0.9478303672621010145249442757631186395884} +#define T_8192_1629 {0.3158657450621840112603422312531620264053,-0.9488038949626584894758707378059625625610} +#define T_8192_1635 {0.3114960749582759147457977633166592568159,-0.9502474389787052277966949986875988543034} +#define T_8192_1645 {0.3041986776298291061948475544340908527374,-0.9526086103580333475093766537611372768879} +#define T_8192_1647 
{0.3027370369918191972402610190329141914845,-0.9530741243121721995024131501850206404924} +#define T_8192_1649 {0.3012746839843179480489254729036474600434,-0.9535373955908332810338379204040393233299} +#define T_8192_1655 {0.2968833851637782683674515737948240712285,-0.9549137424991305245214334718184545636177} +#define T_8192_1665 {0.2895506278978430825254974934068741276860,-0.9571626997976701023418399927322752773762} +#define T_8192_1675 {0.2822008371971475582107302670920034870505,-0.9593553499539307916066377401875797659159} +#define T_8192_1683 {0.2763090310812710836962935445626499131322,-0.9610688421455193530817950886557810008526} +#define T_8192_1685 {0.2748344454288439431266510837303940206766,-0.9614915639805790004146501814830116927624} +#define T_8192_1695 {0.2674518859366776801778087246930226683617,-0.9635712162102573197230981350003276020288} +#define T_8192_1701 {0.2630147703617790044816615591116715222597,-0.9647918068534478974740409285004716366529} +#define T_8192_1705 {0.2600535930154951880233227257122052833438,-0.9655941843029768323347639125131536275148} +#define T_8192_1715 {0.2526400018856955198565117370890220627189,-0.9675603492533143556997288214915897697210} +#define T_8192_1717 {0.2511554862377419206076467617094749584794,-0.9679467556289877983033420605352148413658} +#define T_8192_1719 {0.2496703795966685734963874665481853298843,-0.9683308843324451853362688780180178582668} +#define T_8192_1725 {0.2452115486676275657451640199724351987243,-0.9694695953974130642194495521835051476955} +#define T_8192_1735 {0.2377686703559342140668064757846877910197,-0.9713218104197861579862660619255620986223} +#define T_8192_1737 {0.2362784021979195681062435596686555072665,-0.9716854000420085402112135852803476154804} +#define T_8192_1745 {0.2303118047938454426848409184458432719111,-0.9731168853599251322705754319031257182360} +#define T_8192_1751 {0.2258311540280261708879550042183836922050,-0.9741664590152803171463347098324447870255} +#define T_8192_1755 
{0.2228413906474211447772404426359571516514,-0.9748547146187084289081781207642052322626} +#define T_8192_1765 {0.2153578673797455489413721352320862933993,-0.9765351959646144663906852656509727239609} +#define T_8192_1773 {0.2093619060104741635974789915053406730294,-0.9778382239980504264664773472759407013655} +#define T_8192_1775 {0.2078616752250750654429367614284274168313,-0.9781582305397350518560983800853136926889} +#define T_8192_1785 {0.2003532551629404467874451256648171693087,-0.9797237228655911733810057739901822060347} +#define T_8192_1791 {0.1958425174476578767279733028772170655429,-0.9806353595296081193666282160847913473845} +#define T_8192_1795 {0.1928330488922052332600998170164530165493,-0.9812315808487497292844636831432580947876} +#define T_8192_1805 {0.1853014988050818989684387361194239929318,-0.9826817157862408613766547205159440636635} +#define T_8192_1809 {0.1822858017251532958269422124431002885103,-0.9832455880854070739971461989625822752714} +#define T_8192_1815 {0.1777590479611071694332480319644673727453,-0.9840740423707764472638359620759729295969} +#define T_8192_1819 {0.1747391147796272248271520766138564795256,-0.9846147682043125959339135988557245582342} +#define T_8192_1825 {0.1702061400610780650399789237781078554690,-0.9854084786957684194419471168657764792442} +#define T_8192_1827 {0.1686943427236173298489063654415076598525,-0.9856684121615375548941528904833830893040} +#define T_8192_1835 {0.1626432194209503356852053457259899005294,-0.9866849462601466891342738563253078609705} +#define T_8192_1845 {0.1550707309457005356190251177395111881196,-0.9879033699729777850961909280158579349518} +#define T_8192_1853 {0.1490061506603484742239373872507712803781,-0.9888362690887635420011747555690817534924} +#define T_8192_1855 {0.1474891201031535981069708896029624156654,-0.9890636781578815428517259533691685646772} +#define T_8192_1863 {0.1414175630223030444287957152482704259455,-0.9899500355416089858451300642627757042646} +#define T_8192_1865 
{0.1398988328977772144234847928601084277034,-0.9901658025572483978749005473218858242035} +#define T_8192_1875 {0.1323003158444446825114937382750213146210,-0.9912096783362540630690773468813858926296} +#define T_8192_1881 {0.1277374412176623119652418836267315782607,-0.9918080187774064304662147151248063892126} +#define T_8192_1885 {0.1246940159421676547202295637362112756819,-0.9921952440866739220126646614517085254192} +#define T_8192_1887 {0.1231718613882804846948459953637211583555,-0.9923853548708516658649614328169263899326} +#define T_8192_1895 {0.1170803806478005887337090484834334347397,-0.9931224418304955836589442697004415094852} +#define T_8192_1899 {0.1140329729333672131863508525384531822056,-0.9934769655527891929480688304465729743242} +#define T_8192_1905 {0.1094598578497179841573228031847975216806,-0.9939912170233293764454174379352480173111} +#define T_8192_1915 {0.1018328958414665419418554392905207350850,-0.9948015185576171148795765475369989871979} +#define T_8192_1917 {0.1003067702113928649776397605819511227310,-0.9949565577701163787693872109230142086744} +#define T_8192_1921 {0.0972538144483632710501908036349050235003,-0.9952596121491333880371144005039241164923} +#define T_8192_1925 {0.0941999432953932042122957568608399014920,-0.9955532987656384724672875563555862754583} +#define T_8192_1935 {0.0865614492362511700473959308510529808700,-0.9962465134223155160952956066466867923737} +#define T_8192_1945 {0.0789178630147849557996408975668600760400,-0.9968811217478138475556193043303210288286} +#define T_8192_1953 {0.0727996298363516730622890804625058081001,-0.9973465866466332263584604334027972072363} +#define T_8192_1955 {0.0712696342812964012125576118705794215202,-0.9974570864099419065240681447903625667095} +#define T_8192_1965 {0.0636172129591930923808718034706544131041,-0.9979743735263469917029510725114960223436} +#define T_8192_1971 {0.0590239349846679306477170712241786532104,-0.9982565677714951846155599923804402351379} +#define T_8192_1975 
{0.0559610492185205685156113020184420747682,-0.9984329526665084442171860246162395924330} +#define T_8192_1985 {0.0483015934494801443821465625205746619031,-0.9988327968535279932638104583020322024822} +#define T_8192_1989 {0.0452369902988045899383529047099727904424,-0.9989762833564698185639940675173420459032} +#define T_8192_1995 {0.0406392962359337361899491725125699304044,-0.9991738825657163758364731620531529188156} +#define T_8192_2005 {0.0329746083288973354519235670068155741319,-0.9994561897379773407479319757840130478144} +#define T_8192_2007 {0.0314414235405603009754216259352688211948,-0.9995055962253253101224004240066278725863} +#define T_8192_2015 {0.0253079806200245706337970119648161926307,-0.9996797017629879267275327947572804987431} +#define T_8192_2023 {0.0191735848683226225985798407691618194804,-0.9998161699249004108480676222825422883034} +#define T_8192_2025 {0.0176398641150820566225743135646553128026,-0.9998444054921752366382747823081444948912} +#define T_8192_2035 {0.0099707099074180290804170212481949420180,-0.9999502912364904849695790289842989295721} +#define T_8192_2043 {0.0038349425697062279778937199381516620633,-0.9999926465807071895852686793659813702106} +#define T_8192_2045 {0.0023009691514258054194363989353178112651,-0.9999973527669782091820138703042175620794} +#define T_8192_2055 {-0.0053689069639963433663853997757087199716,-0.9999855873151431984169335009937640279531} +#define T_8192_2057 {-0.0069028587247297558057712585366516577778,-0.9999761749868976146160548523766919970512} +#define T_8192_2061 {-0.0099707099074180290804170212481949420180,-0.9999502912364904849695790289842989295721} +#define T_8192_2065 {-0.0130384672419873327148254205098965030629,-0.9999149955731134742364929479663260281086} +#define T_8192_2075 {-0.0207072605042658945684319604652046109550,-0.9997855816935992123717369395308196544647} +#define T_8192_2079 {-0.0237744619888275582342274105940305162221,-0.9997173475323621882893121437518857419491} +#define T_8192_2085 
{-0.0283748356176720985255546025882722460665,-0.9995973532896483826348799084371421486139} +#define T_8192_2091 {-0.0329746083288973354519235670068155741319,-0.9994561897379773407479319757840130478144} +#define T_8192_2095 {-0.0360407415207062223339029571889113867655,-0.9993503214341994400626845163060352206230} +#define T_8192_2097 {-0.0375736827092705005792794281660462729633,-0.9992938598668877903108409554988611489534} +#define T_8192_2105 {-0.0437045272500634213175274567220185417682,-0.9990445006594292909340992991928942501545} +#define T_8192_2115 {-0.0513657419671625925516877941845450550318,-0.9986799089558990871751120721455663442612} +#define T_8192_2125 {-0.0590239349846679306477170712241786532104,-0.9982565677714951846155599923804402351379} +#define T_8192_2133 {-0.0651480110258788325250378647979232482612,-0.9978756118171101530123223710688762366772} +#define T_8192_2135 {-0.0666786557930015705286663774131739046425,-0.9977745020101678186108529189368709921837} +#define T_8192_2145 {-0.0743294540868457559446014215609466191381,-0.9972337400304662757832829811377450823784} +#define T_8192_2151 {-0.0789178630147849557996408975668600760400,-0.9968811217478138475556193043303210288286} +#define T_8192_2155 {-0.0819758797916330800292783465010870713741,-0.9966343136438698957491055807622615247965} +#define T_8192_2159 {-0.0850331249802802752180141965254733804613,-0.9963781248382002075913987937383353710175} +#define T_8192_2165 {-0.0896174830900229729691019997517287265509,-0.9959762581129177894112558533379342406988} +#define T_8192_2169 {-0.0926726734299133103611723072390304878354,-0.9956966282956635216905283414234872907400} +#define T_8192_2175 {-0.0972538144483632710501908036349050235003,-0.9952596121491333880371144005039241164923} +#define T_8192_2185 {-0.1048844246431349658266185542743187397718,-0.9944844179107475978796060189779382199049} +#define T_8192_2187 {-0.1064098206341876767799803360503574367613,-0.9943223572225458051221380628703627735376} +#define T_8192_2193 
{-0.1109844918971633898063799961164477281272,-0.9938221382915196633334176112839486449957} +#define T_8192_2195 {-0.1125088647873786901199011367680213879794,-0.9936507210002191170516994134231936186552} +#define T_8192_2205 {-0.1201266863571015114375839516469568479806,-0.9927585704615511374981906556058675050735} +#define T_8192_2215 {-0.1277374412176623119652418836267315782607,-0.9918080187774064304662147151248063892126} +#define T_8192_2223 {-0.1338206561937547445229057530013960786164,-0.9910055660670493704600403361837379634380} +#define T_8192_2225 {-0.1353406816501342146974451452479115687311,-0.9907991218660203713852752116508781909943} +#define T_8192_2227 {-0.1368603886368163768949557379528414458036,-0.9905903462189501462731300307495985180140} +#define T_8192_2235 {-0.1429359603776426679289102139591705054045,-0.9897319390779105718891628384881187230349} +#define T_8192_2241 {-0.1474891201031535981069708896029624156654,-0.9890636781578815428517259533691685646772} +#define T_8192_2245 {-0.1505228305916774256267842702072812244296,-0.9886065331923864496488363329262938350439} +#define T_8192_2255 {-0.1581008459783770081497067394593614153564,-0.9874229704138554053471921179152559489012} +#define T_8192_2259 {-0.1611294729056788055387983149557840079069,-0.9869332768536777100720769340114202350378} +#define T_8192_2261 {-0.1626432194209503356852053457259899005294,-0.9866849462601466891342738563253078609705} +#define T_8192_2265 {-0.1656695607447841167569890785671304911375,-0.9861813203679282713309817154367920011282} +#define T_8192_2275 {-0.1732285296450703226955880609239102341235,-0.9848816560973236988729695440270006656647} +#define T_8192_2277 {-0.1747391147796272248271520766138564795256,-0.9846147682043125959339135988557245582342} +#define T_8192_2285 {-0.1807773080067285875749405477108666673303,-0.9835240540575712619997261754178907722235} +#define T_8192_2295 {-0.1883154517567321162285765012711635790765,-0.9821085941125136109519644378451630473137} +#define T_8192_2305 
{-0.1958425174476578767279733028772170655429,-0.9806353595296081193666282160847913473845} +#define T_8192_2313 {-0.2018558962165680481515295241479179821908,-0.9794152322496347817804007718223147094250} +#define T_8192_2315 {-0.2033580622837733165031437465586350299418,-0.9791044369750292464260610358905978500843} +#define T_8192_2325 {-0.2108616441470848590356013119162525981665,-0.9775159165085692825059027200040873140097} +#define T_8192_2329 {-0.2138596283589937752100951229294878430665,-0.9768644017253126365929460916959214955568} +#define T_8192_2331 {-0.2153578673797455489413721352320862933993,-0.9765351959646144663906852656509727239609} +#define T_8192_2335 {-0.2183528216233463215001364687850582413375,-0.9758698915783410310353929162374697625637} +#define T_8192_2345 {-0.2258311540280261708879550042183836922050,-0.9741664590152803171463347098324447870255} +#define T_8192_2349 {-0.2288187917998022180565698135978891514242,-0.9734690341861310658444494947616476565599} +#define T_8192_2355 {-0.2332962014322316202097340465115848928690,-0.9724057190274497664006503327982500195503} +#define T_8192_2363 {-0.2392583790212999828028728188655804842710,-0.9709559351835179707990164388320408761501} +#define T_8192_2365 {-0.2407475246885884267999955454797600395977,-0.9705877751941436315519240451976656913757} +#define T_8192_2367 {-0.2422361038536960387013152740109944716096,-0.9702173313179791591664979932829737663269} +#define T_8192_2375 {-0.2481846854570747828994115025125211104751,-0.9687127344597947775639568135375156998634} +#define T_8192_2385 {-0.2556072462308074388914747032686136662960,-0.9667807071276832697748204736853949725628} +#define T_8192_2395 {-0.2630147703617790044816615591116715222597,-0.9647918068534478974740409285004716366529} +#define T_8192_2397 {-0.2644944324278016289930803850438678637147,-0.9643872122828542892136738373665139079094} +#define T_8192_2403 {-0.2689296704203573140645744388166349381208,-0.9631598166283713569058022585522849112749} +#define T_8192_2405 
{-0.2704068220865448179957013508101226761937,-0.9627461506383994116475832925061695277691} +#define T_8192_2415 {-0.2777829665518576351956880898796953260899,-0.9606438588226385855151079340430442243814} +#define T_8192_2421 {-0.2822008371971475582107302670920034870505,-0.9593553499539307916066377401875797659159} +#define T_8192_2425 {-0.2851427698402487220796786004939349368215,-0.9584850550779761002573309269791934639215} +#define T_8192_2431 {-0.2895506278978430825254974934068741276860,-0.9571626997976701023418399927322752773762} +#define T_8192_2435 {-0.2924857989955538806192691936303162947297,-0.9562698664006580306917726375104393810034} +#define T_8192_2439 {-0.2954182171055320105246266848553204908967,-0.9553680322274703540230689213785808533430} +#define T_8192_2445 {-0.2998116220483833527232775395532371476293,-0.9539984231038944884062402707058936357498} +#define T_8192_2455 {-0.3071198080415330489145731007738504558802,-0.9516708588101938648406985521432943642139} +#define T_8192_2457 {-0.3085792909415250306892630760557949542999,-0.9511986234231132320005031033360864967108} +#define T_8192_2465 {-0.3144099270553367131419975066819461062551,-0.9492873104435021192060162320558447390795} +#define T_8192_2475 {-0.3216815502329565812367206945054931566119,-0.9468479182211479994180081121157854795456} +#define T_8192_2485 {-0.3289342498056121999461254290508804842830,-0.9443528256455947511582849074329715222120} +#define T_8192_2493 {-0.3347224977175812798968479455652413889766,-0.9423167458565637755185662172152660787106} +#define T_8192_2495 {-0.3361675991177445199653561758168507367373,-0.9418021794959976489280961686745285987854} +#define T_8192_2499 {-0.3390554254149696356002152697328710928559,-0.9407663995363960696138860839710105210543} +#define T_8192_2505 {-0.3433811726521150409219274024508194997907,-0.9391961298195699026081229021656326949596} +#define T_8192_2511 {-0.3476996478190513828465668666467536240816,-0.9376059699609999853464614716358482837677} +#define T_8192_2515 
{-0.3505745460548375658227371332031907513738,-0.9365348299227554962342878752679098397493} +#define T_8192_2525 {-0.3577472961603418988296709812857443466783,-0.9338184363622109573910279323172289878130} +#define T_8192_2529 {-0.3606105271206623275048741561477072536945,-0.9327164883981402532242555025732144713402} +#define T_8192_2533 {-0.3634703638773638112446917602937901392579,-0.9316057613512578328140989469829946756363} +#define T_8192_2535 {-0.3648990010162673214288986400788417086005,-0.9310471089355952800659110835113096982241} +#define T_8192_2545 {-0.3720292399082850143265943643200444057584,-0.9282210106721694442555303794506471604109} +#define T_8192_2547 {-0.3734526748367802961858785693038953468204,-0.9276492330925811824471338695730082690716} +#define T_8192_2555 {-0.3791375933848473156473346534767188131809,-0.9253403078232063094787918089423328638077} +#define T_8192_2565 {-0.3862236432818629827679046684352215379477,-0.9224051698522098829968740574258845299482} +#define T_8192_2567 {-0.3876381401253727321254416438023326918483,-0.9218116251817081163721923076082020998001} +#define T_8192_2575 {-0.3932869727472964038739178249670658260584,-0.9194157694249469603420266139437444508076} +#define T_8192_2583 {-0.3989209983369829126687022835540119558573,-0.9169852981841229988546615459199529141188} +#define T_8192_2585 {-0.4003271662656900931054337888781446963549,-0.9163722823992891397537619013746734708548} +#define T_8192_2595 {-0.4073438096826079712897694662387948483229,-0.9132748878148677640353980677900835871696} +#define T_8192_2601 {-0.4115423199137652199297576771641615778208,-0.9113906511041223179603321113972924649715} +#define T_8192_2605 {-0.4143364902289990991945956011477392166853,-0.9101237678825416788086499764176551252604} +#define T_8192_2615 {-0.4213047965454796428552697307168273255229,-0.9069191079736781402331757817592006176710} +#define T_8192_2619 {-0.4240852024156515631680974820483243092895,-0.9056222949398252675123899280151817947626} +#define T_8192_2625 
{-0.4282483187065319607533808721200330182910,-0.9036610966092479824141037170193158090115} +#define T_8192_2635 {-0.4351666482446192585342714664875529706478,-0.9003499254487355996090514054230879992247} +#define T_8192_2637 {-0.4365472551964011960201617057464318349957,-0.8996813291274239343664476109552197158337} +#define T_8192_2645 {-0.4420593781742147565516631857462925836444,-0.8969857892788639652792426204541698098183} +#define T_8192_2655 {-0.4489261030157433163267910458671394735575,-0.8935688860021359092300485826854128390551} +#define T_8192_2665 {-0.4557664188194346932547773576516192406416,-0.8900994166251923189747685682959854602814} +#define T_8192_2669 {-0.4584950604208262725514089197531575337052,-0.8886969559808916008236678862886037677526} +#define T_8192_2673 {-0.4612193864920923758177195850294083356857,-0.8872861305823831479244745423784479498863} +#define T_8192_2675 {-0.4625799231890868057348598085809499025345,-0.8865775852469870432770449042436666786671} +#define T_8192_2685 {-0.4693662153057375219233904317661654204130,-0.8830035990467808293402640629210509359837} +#define T_8192_2691 {-0.4734247625522415292564915034745354205370,-0.8808342603477420373891959570755716413260} +#define T_8192_2695 {-0.4761248959512436318419759118114598095417,-0.8793776682719532944432216936547774821520} +#define T_8192_2703 {-0.4815116929701899195492842409294098615646,-0.8764396667957136122240058284660335630178} +#define T_8192_2705 {-0.4828555675317656725731296774029033258557,-0.8757000062256345973565885287825949490070} +#define T_8192_2709 {-0.4855399048779469595160662720445543527603,-0.8742145050107062997923890179663430899382} +#define T_8192_2715 {-0.4895578341011574963204111554659903049469,-0.8719708292541578087408993269491475075483} +#define T_8192_2725 {-0.4962313013842583053758517053211107850075,-0.8681903567343313143922500785265583544970} +#define T_8192_2727 {-0.4975625043493191457244506636925507336855,-0.8674281262823069216238991430145688354969} +#define T_8192_2735 
{-0.5028755768000869874612135390634648501873,-0.8643588110605340268932650360511615872383} +#define T_8192_2737 {-0.5042008944326904495980556930589955300093,-0.8635863929296679897262833947024773806334} +#define T_8192_2745 {-0.5094902694849362534412762215652037411928,-0.8604764176316320734017040194885339587927} +#define T_8192_2755 {-0.5160749903153666329203019813576247543097,-0.8565434048377199571788764842494856566191} +#define T_8192_2763 {-0.5213209268785956584935092905652709305286,-0.8533607040392954257157498432206921279430} +#define T_8192_2765 {-0.5226293519310966084745473381190095096827,-0.8525600040466840834341155641595833003521} +#define T_8192_2771 {-0.5265472360035793331078934897959697991610,-0.8501458746926852105829652828106191009283} +#define T_8192_2775 {-0.5291529687577906093665092157607432454824,-0.8485264495905926462882007399457506835461} +#define T_8192_2781 {-0.5330522216326195605873294880439061671495,-0.8460823417448969374632383733114693313837} +#define T_8192_2785 {-0.5356454570297410899826218155794776976109,-0.8444429787519106556104020455677527934313} +#define T_8192_2795 {-0.5421064348124439158382870118657592684031,-0.8403098317495407698629605874884873628616} +#define T_8192_2799 {-0.5446819177876345285937986773205921053886,-0.8386427179885272975568000219936948269606} +#define T_8192_2805 {-0.5485355220250673902171456575160846114159,-0.8361272517246921553990546271961648017168} +#define T_8192_2815 {-0.5549323404628103695301888365065678954124,-0.8318954847265775942588561520096845924854} +#define T_8192_2817 {-0.5562077987487399344246341570396907627583,-0.8310432507463623164056798486853949725628} +#define T_8192_2825 {-0.5612965138191514657961533885099925100803,-0.8276147796979383963744680841045919805765} +#define T_8192_2835 {-0.5676276677079862276187327552179340273142,-0.8232853884604001137859086156822741031647} +#define T_8192_2839 {-0.5701508003194702967064699805632699280977,-0.8215400567805976050905769625387620180845} +#define T_8192_2845 
{-0.5739254296856507453483686731487978249788,-0.8189075656996589458458402077667415142059} +#define T_8192_2853 {-0.5789393480630818933363457290397491306067,-0.8153706097623912851801719625655096024275} +#define T_8192_2855 {-0.5801894292728316804286237129417713731527,-0.8144815689504986133684383275976870208979} +#define T_8192_2865 {-0.5864192979763604984810854148236103355885,-0.8100076585816411434137762626050971448421} +#define T_8192_2871 {-0.5901406838322489356585265340982005000114,-0.8073004231920144491141400067135691642761} +#define T_8192_2873 {-0.5913783723567875849624897455214522778988,-0.8063942092479563505946771329035982489586} +#define T_8192_2875 {-0.5926146693108911289726847826386801898479,-0.8054860977804291222170718356210272759199} +#define T_8192_2885 {-0.5987751788204587199970774236135184764862,-0.8009171525373443012441043720173183828592} +#define T_8192_2889 {-0.6012295400651485044463129270297940820456,-0.7990763669093523535735812401981092989445} +#define T_8192_2895 {-0.6049004640999198212369947214028798043728,-0.7963010916303591146814255807839799672365} +#define T_8192_2905 {-0.6109901648162717746615157921041827648878,-0.7916381866091257712980677752057090401649} +#define T_8192_2907 {-0.6122038032497979509471974779444281011820,-0.7907000084017216101628378055465873330832} +#define T_8192_2915 {-0.6170439227298497586460257480212021619081,-0.7869287117790018104557248079800046980381} +#define T_8192_2925 {-0.6230613817154012634702553441456984728575,-0.7821729441849130104458254209021106362343} +#define T_8192_2935 {-0.6290421877830358887706552195595577359200,-0.7773711635950563136887581094924826174974} +#define T_8192_2941 {-0.6326129315698775235077278011885937303305,-0.7744681264006708643066190234094392508268} +#define T_8192_2943 {-0.6338002060310172769419523319811560213566,-0.7734967994988990458438138375640846788883} +#define T_8192_2945 {-0.6349859890990494637463825711165554821491,-0.7725236524844413255053154898632783442736} +#define T_8192_2955 
{-0.6408924360066213798958756342472042888403,-0.7676306960182733840625246557465288788080} +#define T_8192_2961 {-0.6444182293999883848201193359273020178080,-0.7646732279980671398433855756593402475119} +#define T_8192_2965 {-0.6467611810463839150386888832144904881716,-0.7626925820351779794492586006526835262775} +#define T_8192_2975 {-0.6525918789768625494218667881796136498451,-0.7577096010302680761938631803786847740412} +#define T_8192_2979 {-0.6549134280500560345572580445150379091501,-0.7557039114360358800226435960212256759405} +#define T_8192_2985 {-0.6583841867947850534648068787646479904652,-0.7526820461380552274022193159908056259155} +#define T_8192_2995 {-0.6641377637552600132764268892060499638319,-0.7476102131152051466500552123761735856533} +#define T_8192_2997 {-0.6652838016190871783805960149038583040237,-0.7465905593451171995411641546525061130524} +#define T_8192_3005 {-0.6698522713918210191863522595667745918036,-0.7424944003231391809194406050664838403463} +#define T_8192_3009 {-0.6721270596564117294491325083072297275066,-0.7404358281968980159959414777404163032770} +#define T_8192_3015 {-0.6755273735363386267138707808044273406267,-0.7373349087104827948024876604904420673847} +#define T_8192_3025 {-0.6811627363387954270379509580379817634821,-0.7321320417953612880168634546862449496984} +#define T_8192_3033 {-0.6856421913991874728111497461213730275631,-0.7279387236390986171130634829751215875149} +#define T_8192_3035 {-0.6867580282869258923028610297478735446930,-0.7268861056475449666791632807871792465448} +#define T_8192_3043 {-0.6912051895584484517698342642688658088446,-0.7226585541785756072741264688374940305948} +#define T_8192_3045 {-0.6923129202257182202018270800181198865175,-0.7215974088704437683361447852803394198418} +#define T_8192_3051 {-0.6956263273452548689945729165629018098116,-0.7184037950234897218493301807029638439417} +#define T_8192_3055 {-0.6978270853767772896603105436952318996191,-0.7162662625829531171106623332889284938574} +#define T_8192_3065 
{-0.7033001993575487276189051044639199972153,-0.7108929804011516750961163779720664024353} +#define T_8192_3069 {-0.7054778784198522112447449217143002897501,-0.7087319402004006452244766478543169796467} +#define T_8192_3075 {-0.7087319402004006452244766478543169796467,-0.7054778784198522112447449217143002897501} +#define T_8192_3077 {-0.7098132954304008368495715330936945974827,-0.7043898676374004130806838475109543651342} +#define T_8192_3085 {-0.7141219883715647087640832069155294448137,-0.7000212751940063649058743067143950611353} +#define T_8192_3087 {-0.7151949669386801211601323302602395415306,-0.6989250026044141472780779622553382068872} +#define T_8192_3095 {-0.7194700267899329926279961000545881688595,-0.6945234917199655244601785852864850312471} +#define T_8192_3105 {-0.7247757408457112804356370361347217112780,-0.6889848514165970438938302322640083730221} +#define T_8192_3111 {-0.7279387236390986171130634829751215875149,-0.6856421913991874728111497461213730275631} +#define T_8192_3115 {-0.7300388184189261497891720864572562277317,-0.6834056801062587949147086874290835112333} +#define T_8192_3123 {-0.7342183740661882840328189558931626379490,-0.6789133812082384089592324016848579049110} +#define T_8192_3125 {-0.7352589498977868398554846862680278718472,-0.6777863059956314994991544153890572488308} +#define T_8192_3135 {-0.7404358281968980159959414777404163032770,-0.6721270596564117294491325083072297275066} +#define T_8192_3141 {-0.7435210668546691215041732903046067804098,-0.6687125115797480923163220722926780581474} +#define T_8192_3145 {-0.7455691487753254298453953197167720645666,-0.6664282740058653509152009064564481377602} +#define T_8192_3155 {-0.7506586096545105890598392761603463441133,-0.6606902842872423020637029367208015173674} +#define T_8192_3159 {-0.7526820461380552274022193159908056259155,-0.6583841867947850534648068787646479904652} +#define T_8192_3165 {-0.7557039114360358800226435960212256759405,-0.6549134280500560345572580445150379091501} +#define T_8192_3175 
{-0.7607047573192369238626042715623043477535,-0.6490980451302259535140137813868932425976} +#define T_8192_3177 {-0.7616995658535352653473182726884260773659,-0.6479303754096854550681428008829243481159} +#define T_8192_3179 {-0.7626925820351779794492586006526835262775,-0.6467611810463839150386888832144904881716} +#define T_8192_3185 {-0.7656608531186624988507105626922566443682,-0.6432444776300858535478255362249910831451} +#define T_8192_3195 {-0.7705719072813806969080019371176604181528,-0.6373530698982591280454812476818915456533} +#define T_8192_3205 {-0.7754376309041305370683971887046936899424,-0.6314241685094017508816932604531757533550} +#define T_8192_3213 {-0.7792973793725302966706180995970498770475,-0.6266542862720294593259495741222053766251} +#define T_8192_3215 {-0.7802577377503165934058415587060153484344,-0.6254581222438143628394868756004143506289} +#define T_8192_3225 {-0.7850319442668480807157038725563324987888,-0.6194552820669240178474979074962902814150} +#define T_8192_3231 {-0.7878743190709002197635868469660636037588,-0.6158360636959849809812794774188660085201} +#define T_8192_3235 {-0.7897599696008190672813498167670331895351,-0.6134160011086385866363457353145349770784} +#define T_8192_3245 {-0.7944415356160305918820085935294628143311,-0.6073406346425728186133596864237915724516} +#define T_8192_3247 {-0.7953722494170613055430862914363387972116,-0.6061212625021862310248366156884003430605} +#define T_8192_3249 {-0.7963010916303591146814255807839799672365,-0.6049004640999198212369947214028798043728} +#define T_8192_3255 {-0.7990763669093523535735812401981092989445,-0.6012295400651485044463129270297940820456} +#define T_8192_3265 {-0.8036641908269240852646930761693511158228,-0.5950830768745699606014909477380570024252} +#define T_8192_3267 {-0.8045760909263071081198859246796928346157,-0.5938495717854336275465243488724809139967} +#define T_8192_3275 {-0.8082047374801947237088484143896494060755,-0.5889016066496758350368168066779617220163} +#define T_8192_3281 
{-0.8109062611524596730916414344392251223326,-0.5851760723267304120653875543212052434683} +#define T_8192_3285 {-0.8126977397617994869349900000088382512331,-0.5826854930286684552953602178604342043400} +#define T_8192_3295 {-0.8171429333612729717373213134123943746090,-0.5764351016877218292222551099257543683052} +#define T_8192_3303 {-0.8206644901681574566509880241937935352325,-0.5714103556788572291225136723369359970093} +#define T_8192_3305 {-0.8215400567805976050905769625387620180845,-0.5701508003194702967064699805632699280977} +#define T_8192_3315 {-0.8258888513495868943792288519034627825022,-0.5638329586113781655143384341499768197536} +#define T_8192_3321 {-0.8284748237000071346969320984499063342810,-0.5600263087527603822479704831494018435478} +#define T_8192_3325 {-0.8301890612411023662176035031734500080347,-0.5574819482239915524601769902801606804132} +#define T_8192_3335 {-0.8344404334861031946601883646508213132620,-0.5510981427690754275161566511087585240602} +#define T_8192_3339 {-0.8361272517246921553990546271961648017168,-0.5485355220250673902171456575160846114159} +#define T_8192_3345 {-0.8386427179885272975568000219936948269606,-0.5446819177876345285937986773205921053886} +#define T_8192_3349 {-0.8403098317495407698629605874884873628616,-0.5421064348124439158382870118657592684031} +#define T_8192_3355 {-0.8427956675400042341195216977212112396955,-0.5382336507278216997107733732264023274183} +#define T_8192_3357 {-0.8436203157060040425108127237763255834579,-0.5369401856148429130755062033131252974272} +#define T_8192_3365 {-0.8468990378343972391661509391269646584988,-0.5317537209227333194760944934387225657701} +#define T_8192_3375 {-0.8509525874821757263077870447887107729912,-0.5252425095680947064735732965345960110426} +#define T_8192_3383 {-0.8541593959917388456659637085977010428905,-0.5200112751075960382252105773659422993660} +#define T_8192_3385 {-0.8549560780246149294114843542047310620546,-0.5187003996998350574543223956425208598375} +#define T_8192_3393 
{-0.8581226695380861357875801331829279661179,-0.5134447234365434598046817882277537137270} +#define T_8192_3395 {-0.8589092739478239035832984882290475070477,-0.5121277761715546938958709688449744135141} +#define T_8192_3405 {-0.8628119426966003313594910650863312184811,-0.5055250256318853940840085670060943812132} +#define T_8192_3411 {-0.8651291952716236854925568877661135047674,-0.5015490758526753856116897622996475547552} +#define T_8192_3415 {-0.8666638546881111349051707293256185948849,-0.4988925365017446433846259878919227048755} +#define T_8192_3417 {-0.8674281262823069216238991430145688354969,-0.4975625043493191457244506636925507336855} +#define T_8192_3425 {-0.8704647833253976729750434060406405478716,-0.4922306989514860786627536981541197746992} +#define T_8192_3429 {-0.8719708292541578087408993269491475075483,-0.4895578341011574963204111554659903049469} +#define T_8192_3435 {-0.8742145050107062997923890179663430899382,-0.4855399048779469595160662720445543527603} +#define T_8192_3445 {-0.8779127991586418433556104901072103530169,-0.4788205478813939430793311657907906919718} +#define T_8192_3447 {-0.8786462674850681331406576646259054541588,-0.4774732836866980578705010884732473641634} +#define T_8192_3451 {-0.8801069997982403636527237722475547343493,-0.4747753878479171185666984911222243681550} +#define T_8192_3455 {-0.8815594482091437811277501168660819530487,-0.4720730232423686612008850715938024222851} +#define T_8192_3465 {-0.8851542376402851131089732916734647005796,-0.4652977278984345987922210952092427760363} +#define T_8192_3475 {-0.8886969559808916008236678862886037677526,-0.4584950604208262725514089197531575337052} +#define T_8192_3483 {-0.8914934993147913822042482934193685650826,-0.4530334873709316312329065112862735986710} +#define T_8192_3485 {-0.8921873948229824824451839049288537353277,-0.4516654209910024864171873559826053678989} +#define T_8192_3495 {-0.8956253488340301105452567753673065453768,-0.4448092113771048849990563667233800515532} +#define T_8192_3501 
{-0.8976628442590408596402085095178335905075,-0.4406828996418729049722173840564209967852} +#define T_8192_3505 {-0.8990106157690390675796265895769465714693,-0.4379268349103228596952419593435479328036} +#define T_8192_3515 {-0.9023429964824442039983409813430625945330,-0.4310186964611670279445831965858815237880} +#define T_8192_3519 {-0.9036610966092479824141037170193158090115,-0.4282483187065319607533808721200330182910} +#define T_8192_3525 {-0.9056222949398252675123899280151817947626,-0.4240852024156515631680974820483243092895} +#define T_8192_3535 {-0.9088483182294391227173946390394121408463,-0.4171267606513878734020295269147027283907} +#define T_8192_3537 {-0.9094871131115054296856214932631701231003,-0.4157321145691053598802966462244512513280} +#define T_8192_3545 {-0.9120208765735683398290234435989987105131,-0.4101437805135902392450475417717825621367} +#define T_8192_3553 {-0.9145203029651044479564347966515924781561,-0.4045400047765529971677267440099967643619} +#define T_8192_3555 {-0.9151397833396852643517149772378616034985,-0.4031366727909952984987285162787884473801} +#define T_8192_3565 {-0.9182048550514309015468938923731911927462,-0.3961058496916963211909035180724458768964} +#define T_8192_3573 {-0.9206180299070838568553654113202355802059,-0.3904643940361266496452685714757535606623} +#define T_8192_3575 {-0.9212159113994087267229815552127547562122,-0.3890517248188943844056097987049724906683} +#define T_8192_3585 {-0.9241727752517910898788500162481795996428,-0.3819747131465672795869181754824239760637} +#define T_8192_3587 {-0.9247576295595139050931265956023707985878,-0.3805566010089285189366137274191714823246} +#define T_8192_3591 {-0.9259208086717699570655781826644670218229,-0.3777176936133856410826581395667744800448} +#define T_8192_3595 {-0.9270752726647401020798611170903313905001,-0.3748752309950575956065677019068971276283} +#define T_8192_3605 {-0.9299232328926396728974168581771664321423,-0.3677536960065819759968519520043628290296} +#define T_8192_3609 
{-0.9310471089355952800659110835113096982241,-0.3648990010162673214288986400788417086005} +#define T_8192_3615 {-0.9327164883981402532242555025732144713402,-0.3606105271206623275048741561477072536945} +#define T_8192_3621 {-0.9343661149437257895655761785747017711401,-0.3563144162744024123767871969903353601694} +#define T_8192_3625 {-0.9354548748620146181309564781258814036846,-0.3534461445494808118361618198832729831338} +#define T_8192_3627 {-0.9359959536368313015586295477987732738256,-0.3520107594598191336210391000349773094058} +#define T_8192_3635 {-0.9381382311928243566967466904316097497940,-0.3462609697531600083664216072065755724907} +#define T_8192_3645 {-0.9407663995363960696138860839710105210543,-0.3390554254149696356002152697328710928559} +#define T_8192_3655 {-0.9433392252851077186548423014755826443434,-0.3318299354164611081330349406925961375237} +#define T_8192_3663 {-0.9453575373976322859803644860221538692713,-0.3260354681403302423703394197218585759401} +#define T_8192_3665 {-0.9458565570869839067569273538538254797459,-0.3245849248125321495628270440647611394525} +#define T_8192_3675 {-0.9483182468545990895236741380358580499887,-0.3173208198064217389067209751374321058393} +#define T_8192_3681 {-0.9497684921596066809357239435485098510981,-0.3129533692115601950511916129471501335502} +#define T_8192_3685 {-0.9507241497737896063213725028617773205042,-0.3100380477246378885247679590975167229772} +#define T_8192_3689 {-0.9516708588101938648406985521432943642139,-0.3071198080415330489145731007738504558802} +#define T_8192_3695 {-0.9530741243121721995024131501850206404924,-0.3027370369918191972402610190329141914845} +#define T_8192_3699 {-0.9539984231038944884062402707058936357498,-0.2998116220483833527232775395532371476293} +#define T_8192_3705 {-0.9553680322274703540230689213785808533430,-0.2954182171055320105246266848553204908967} +#define T_8192_3715 {-0.9576057385756463480319666814466472715139,-0.2880820186110041314364593745267484337091} +#define T_8192_3717 
{-0.9580465240148186012447695247828960418701,-0.2866127314393477898413209459249628707767} +#define T_8192_3723 {-0.9593553499539307916066377401875797659159,-0.2822008371971475582107302670920034870505} +#define T_8192_3725 {-0.9597871117188399026076695008669048547745,-0.2807288730757971917384452353871893137693} +#define T_8192_3735 {-0.9619120233331122094000420474912971258163,-0.2733592130644187379040488394821295514703} +#define T_8192_3745 {-0.9639803484159941149300721008330583572388,-0.2659734721128755863261972081090789288282} +#define T_8192_3753 {-0.9655941843029768323347639125131536275148,-0.2600535930154951880233227257122052833438} +#define T_8192_3755 {-0.9659919652938405709718949765374418348074,-0.2585720847031703351071030283492291346192} +#define T_8192_3757 {-0.9663874732122988975291377755638677626848,-0.2570899679457531172843687272688839584589} +#define T_8192_3765 {-0.9679467556289877983033420605352148413658,-0.2511554862377419206076467617094749584794} +#define T_8192_3771 {-0.9690923051125062137600707501405850052834,-0.2466984073149424416815378435785532929003} +#define T_8192_3775 {-0.9698446044267148291595503906137309968472,-0.2437241130138521616466107388987438753247} +#define T_8192_3785 {-0.9716854000420085402112135852803476154804,-0.2362784021979195681062435596686555072665} +#define T_8192_3789 {-0.9724057190274497664006503327982500195503,-0.2332962014322316202097340465115848928690} +#define T_8192_3791 {-0.9727624466956885651569564288365654647350,-0.2318042758419647519918527223126147873700} +#define T_8192_3795 {-0.9734690341861310658444494947616476565599,-0.2288187917998022180565698135978891514242} +#define T_8192_3805 {-0.9751954019329903688984018117480445653200,-0.2213457206470308413770453626057133078575} +#define T_8192_3807 {-0.9755337945182913639285970930359326303005,-0.2198495297987786978310964514093939214945} +#define T_8192_3815 {-0.9768644017253126365929460916959214955568,-0.2138596283589937752100951229294878430665} +#define T_8192_3825 
{-0.9784759353806168347134075702342670410872,-0.2063609553210755120922215155587764456868} +#define T_8192_3835 {-0.9800299080969900877846612274879589676857,-0.1988501426587501175191619040560908615589} +#define T_8192_3843 {-0.9812315808487497292844636831432580947876,-0.1928330488922052332600998170164530165493} +#define T_8192_3845 {-0.9815262284586647734130337994429282844067,-0.1913276322116309047238758012099424377084} +#define T_8192_3855 {-0.9829648084413964426175880362279713153839,-0.1837938665074784483355330166887142695487} +#define T_8192_3859 {-0.9835240540575712619997261754178907722235,-0.1807773080067285875749405477108666673303} +#define T_8192_3861 {-0.9838002057026316027332768499036319553852,-0.1792683889018357457079133610022836364806} +#define T_8192_3865 {-0.9843455634176419000169744322192855179310,-0.1762492887361678806090736770784133113921} +#define T_8192_3875 {-0.9856684121615375548941528904833830893040,-0.1686943427236173298489063654415076598525} +#define T_8192_3879 {-0.9861813203679282713309817154367920011282,-0.1656695607447841167569890785671304911375} +#define T_8192_3885 {-0.9869332768536777100720769340114202350378,-0.1611294729056788055387983149557840079069} +#define T_8192_3893 {-0.9879033699729777850961909280158579349518,-0.1550707309457005356190251177395111881196} +#define T_8192_3895 {-0.9881400830856925665557355387136340141296,-0.1535551243019934453126751350282575003803} +#define T_8192_3897 {-0.9883744710093412821905189957760740071535,-0.1520391563282460500872161901497747749090} +#define T_8192_3905 {-0.9892887598646251667844353505643084645271,-0.1459717424898122339893546950406744144857} +#define T_8192_3915 {-0.9903792396171081646727429870225023478270,-0.1383797735777838877613277190903318114579} +#define T_8192_3925 {-0.9914114581933385350609455599624197930098,-0.1307796641797117076500001076055923476815} +#define T_8192_3927 {-0.9916109051634953708287412155186757445335,-0.1292587047777961350991660083309398032725} +#define T_8192_3933 
{-0.9921952440866739220126646614517085254192,-0.1246940159421676547202295637362112756819} +#define T_8192_3935 {-0.9923853548708516658649614328169263899326,-0.1231718613882804846948459953637211583555} +#define T_8192_3945 {-0.9933008723580932786134667367150541394949,-0.1155568127487552748666743696048797573894} +#define T_8192_3951 {-0.9938221382915196633334176112839486449957,-0.1109844918971633898063799961164477281272} +#define T_8192_3955 {-0.9941579567977897324837499581917654722929,-0.1079349662326536535283949547192605677992} +#define T_8192_3961 {-0.9946441384810507102542942448053508996964,-0.1033587818488996279420177870633779093623} +#define T_8192_3965 {-0.9949565577701163787693872109230142086744,-0.1003067702113928649776397605819511227310} +#define T_8192_3969 {-0.9952596121491333880371144005039241164923,-0.0972538144483632710501908036349050235003} +#define T_8192_3975 {-0.9956966282956635216905283414234872907400,-0.0926726734299133103611723072390304878354} +#define T_8192_3985 {-0.9963781248382002075913987937383353710175,-0.0850331249802802752180141965254733804613} +#define T_8192_3987 {-0.9965073916801108211416249105241149663925,-0.0835046006331524315324799090376473031938} +#define T_8192_3995 {-0.9970010073072352874135049205506220459938,-0.0773885742752650485076770792147726751864} +#define T_8192_4005 {-0.9975652390603757524445427407044917345047,-0.0697394710219073066248540726519422605634} +#define T_8192_4015 {-0.9980707869054823433430101431440562009811,-0.0620862651950600946682001790577487554401} +#define T_8192_4023 {-0.9984329526665084442171860246162395924330,-0.0559610492185205685156113020184420747682} +#define T_8192_4025 {-0.9985176211026222103939176122366916388273,-0.0544294070109191327477837774040381191298} +#define T_8192_4029 {-0.9986799089558990871751120721455663442612,-0.0513657419671625925516877941845450550318} +#define T_8192_4035 {-0.9989057153658182919286900869337841868401,-0.0467693469005378628655655859347461955622} +#define T_8192_4041 
{-0.9991103671141748909789725985319819301367,-0.0421719613603479467900392307910806266591} +#define T_8192_4045 {-0.9992350468645958549984698038315400481224,-0.0391065354833298878256542252529470715672} +#define T_8192_4055 {-0.9995055962253253101224004240066278725863,-0.0314414235405603009754216259352688211948} +#define T_8192_4059 {-0.9995973532896483826348799084371421486139,-0.0283748356176720985255546025882722460665} +#define T_8192_4063 {-0.9996797017629879267275327947572804987431,-0.0253079806200245706337970119648161926307} +#define T_8192_4065 {-0.9997173475323621882893121437518857419491,-0.0237744619888275582342274105940305162221} +#define T_8192_4075 {-0.9998702883289829479451782390242442488670,-0.0161061018535372871274269357400044100359} +#define T_8192_4077 {-0.9998938183744184859946813048736657947302,-0.0145723016927790660623998064693296328187} +#define T_8192_4085 {-0.9999644096181182773008799813396763056517,-0.0084367942423698005088850138122325006407} +#define T_8192_4095 {-0.9999997058628822266257429873803630471230,-0.0007669903187427044854995727973800967447} +#define T_8192_4097 {-0.9999997058628822266257429873803630471230,0.0007669903187427044854995727973800967447} +#define T_8192_4105 {-0.9999761749868976146160548523766919970512,0.0069028587247297558057712585366516577778} +#define T_8192_4113 {-0.9999149955731134742364929479663260281086,0.0130384672419873327148254205098965030629} +#define T_8192_4115 {-0.9998938183744184859946813048736657947302,0.0145723016927790660623998064693296328187} +#define T_8192_4125 {-0.9997526408702488431856636452721431851387,0.0222408874140249609996367041730991331860} +#define T_8192_4131 {-0.9996397036507101985236545260704588145018,0.0268414396990985307245303204126685159281} +#define T_8192_4135 {-0.9995526507794569859299826930509880185127,0.0299081647675165582245249140669329790398} +#define T_8192_4145 {-0.9992938598668877903108409554988611489534,0.0375736827092705005792794281660462729633} +#define T_8192_4149 
{-0.9991738825657163758364731620531529188156,0.0406392962359337361899491725125699304044} +#define T_8192_4155 {-0.9989762833564698185639940675173420459032,0.0452369902988045899383529047099727904424} +#define T_8192_4165 {-0.9985999399303203682976004529336933046579,0.0528976367256653243198449843021080596372} +#define T_8192_4167 {-0.9985176211026222103939176122366916388273,0.0544294070109191327477837774040381191298} +#define T_8192_4175 {-0.9981648517276462406755399570101872086525,0.0605551713359477883358295002835802733898} +#define T_8192_4185 {-0.9976710443434410047203186877595726400614,0.0682091436588063287915773003078356850892} +#define T_8192_4195 {-0.9971185468269799789808871537388768047094,0.0758591034329544472436523960823251400143} +#define T_8192_4199 {-0.9968811217478138475556193043303210288286,0.0789178630147849557996408975668600760400} +#define T_8192_4203 {-0.9966343136438698957491055807622615247965,0.0819758797916330800292783465010870713741} +#define T_8192_4205 {-0.9965073916801108211416249105241149663925,0.0835046006331524315324799090376473031938} +#define T_8192_4215 {-0.9958376148553416129516335786320269107819,0.0911451854966810193214854507459676824510} +#define T_8192_4221 {-0.9954076266025349006838496279669925570488,0.0957269914993071763253951189653889741749} +#define T_8192_4225 {-0.9951092557537261074074308453418780118227,0.0987804085497996364750861175707541406155} +#define T_8192_4233 {-0.9944844179107475978796060189779382199049,0.1048844246431349658266185542743187397718} +#define T_8192_4235 {-0.9943223572225458051221380628703627735376,0.1064098206341876767799803360503574367613} +#define T_8192_4239 {-0.9939912170233293764454174379352480173111,0.1094598578497179841573228031847975216806} +#define T_8192_4245 {-0.9934769655527891929480688304465729743242,0.1140329729333672131863508525384531822056} +#define T_8192_4255 {-0.9925731304764288109865333353809546679258,0.1216494169991055307455951606243615970016} +#define T_8192_4257 
{-0.9923853548708516658649614328169263899326,0.1231718613882804846948459953637211583555} +#define T_8192_4265 {-0.9916109051634953708287412155186757445335,0.1292587047777961350991660083309398032725} +#define T_8192_4267 {-0.9914114581933385350609455599624197930098,0.1307796641797117076500001076055923476815} +#define T_8192_4275 {-0.9905903462189501462731300307495985180140,0.1368603886368163768949557379528414458036} +#define T_8192_4285 {-0.9895115136793551924299094935122411698103,0.1444540213908604708858973708629491738975} +#define T_8192_4293 {-0.9886065331923864496488363329262938350439,0.1505228305916774256267842702072812244296} +#define T_8192_4295 {-0.9883744710093412821905189957760740071535,0.1520391563282460500872161901497747749090} +#define T_8192_4301 {-0.9876643322282057102512453639064915478230,0.1565859726929984541143880960589740425348} +#define T_8192_4305 {-0.9871792850978743372181156701117288321257,0.1596153472371930337470047334136324934661} +#define T_8192_4311 {-0.9864342939016271794017143292876426130533,0.1641565832210158393245080787892220541835} +#define T_8192_4315 {-0.9859260262543211306152102224586997181177,0.1671821484320729356287671407699235714972} +#define T_8192_4325 {-0.9846147682043125959339135988557245582342,0.1747391147796272248271520766138564795256} +#define T_8192_4329 {-0.9840740423707764472638359620759729295969,0.1777590479611071694332480319644673727453} +#define T_8192_4335 {-0.9832455880854070739971461989625822752714,0.1822858017251532958269422124431002885103} +#define T_8192_4345 {-0.9818185664425524983300874737324193120003,0.1898217653186564379819145642613875679672} +#define T_8192_4347 {-0.9815262284586647734130337994429282844067,0.1913276322116309047238758012099424377084} +#define T_8192_4355 {-0.9803337872233479632910757572972215712070,0.1973465622409659170344298217969480901957} +#define T_8192_4365 {-0.9787913377731056741026804957073181867599,0.2048597498298144192752090475551085546613} +#define T_8192_4375 
{-0.9771913088297122795822247098840307444334,0.2123608861058784436082191859895829111338} +#define T_8192_4383 {-0.9758698915783410310353929162374697625637,0.2183528216233463215001364687850582413375} +#define T_8192_4385 {-0.9755337945182913639285970930359326303005,0.2198495297987786978310964514093939214945} +#define T_8192_4395 {-0.9738188923456660983646315798978321254253,0.2273252403730388615521462725155288353562} +#define T_8192_4401 {-0.9727624466956885651569564288365654647350,0.2318042758419647519918527223126147873700} +#define T_8192_4405 {-0.9720467031946234959249864004959817975760,0.2347875780540009671426560089457780122757} +#define T_8192_4415 {-0.9702173313179791591664979932829737663269,0.2422361038536960387013152740109944716096} +#define T_8192_4419 {-0.9694695953974130642194495521835051476955,0.2452115486676275657451640199724351987243} +#define T_8192_4425 {-0.9683308843324451853362688780180178582668,0.2496703795966685734963874665481853298843} +#define T_8192_4435 {-0.9663874732122988975291377755638677626848,0.2570899679457531172843687272688839584589} +#define T_8192_4437 {-0.9659919652938405709718949765374418348074,0.2585720847031703351071030283492291346192} +#define T_8192_4445 {-0.9643872122828542892136738373665139079094,0.2644944324278016289930803850438678637147} +#define T_8192_4455 {-0.9623302192137374033720220722898375242949,0.2718833374593597751456286459870170801878} +#define T_8192_4465 {-0.9602166150119634258075507204921450465918,0.2792562483722911825800849783263402059674} +#define T_8192_4473 {-0.9584850550779761002573309269791934639215,0.2851427698402487220796786004939349368215} +#define T_8192_4475 {-0.9580465240148186012447695247828960418701,0.2866127314393477898413209459249628707767} +#define T_8192_4485 {-0.9558200738825454179092844242404680699110,0.2939523538996846596660361683461815118790} +#define T_8192_4491 {-0.9544572057665134945381169018219225108624,0.2983478546267414044379506776749622076750} +#define T_8192_4495 
{-0.9535373955908332810338379204040393233299,0.3012746839843179480489254729036474600434} +#define T_8192_4505 {-0.9511986234231132320005031033360864967108,0.3085792909415250306892630760557949542999} +#define T_8192_4509 {-0.9502474389787052277966949986875988543034,0.3114960749582759147457977633166592568159} +#define T_8192_4515 {-0.9488038949626584894758707378059625625610,0.3158657450621840112603422312531620264053} +#define T_8192_4525 {-0.9463533510844905904946244845632463693619,0.3231336177050523339460141869494691491127} +#define T_8192_4527 {-0.9458565570869839067569273538538254797459,0.3245849248125321495628270440647611394525} +#define T_8192_4535 {-0.9438471359470926858037387319200206547976,0.3303824813219827793986382857838179916143} +#define T_8192_4545 {-0.9412853969839286571996694874542299658060,0.3376119094830745681612427233631024137139} +#define T_8192_4555 {-0.9386682848947701662822851176315452903509,0.3448214769017593495092910416133236140013} +#define T_8192_4563 {-0.9365348299227554962342878752679098397493,0.3505745460548375658227371332031907513738} +#define T_8192_4565 {-0.9359959536368313015586295477987732738256,0.3520107594598191336210391000349773094058} +#define T_8192_4575 {-0.9332685604157120451418450102210044860840,0.3591793342323365001433899124094750732183} +#define T_8192_4581 {-0.9316057613512578328140989469829946756363,0.3634703638773638112446917602937901392579} +#define T_8192_4585 {-0.9304862656761496708668346400372684001923,0.3663267795125736414618700109713245183229} +#define T_8192_4595 {-0.9276492330925811824471338695730082690716,0.3734526748367802961858785693038953468204} +#define T_8192_4599 {-0.9264991307392305142087707281461916863918,0.3762969050357047873234250801033340394497} +#define T_8192_4605 {-0.9247576295595139050931265956023707985878,0.3805566010089285189366137274191714823246} +#define T_8192_4615 {-0.9218116251817081163721923076082020998001,0.3876381401253727321254416438023326918483} +#define T_8192_4625 
{-0.9188113932641699399539447767892852425575,0.3946968755994336164327762617176631465554} +#define T_8192_4635 {-0.9157571103019567226866115561279002577066,0.4017323921859050073202013209083816036582} +#define T_8192_4645 {-0.9126489559697938958038321288768202066422,0.4087442760054814105963316706038312986493} +#define T_8192_4655 {-0.9094871131115054296856214932631701231003,0.4157321145691053598802966462244512513280} +#define T_8192_4665 {-0.9062717677292575491065917958621867001057,0.4226954968022330061394598033075453713536} +#define T_8192_4675 {-0.9030031089726171522613640263443812727928,0.4296340130690163849891405334346927702427} +#define T_8192_4685 {-0.8996813291274239343664476109552197158337,0.4365472551964011960201617057464318349957} +#define T_8192_4695 {-0.8963066236044795465076617801969405263662,0.4434348164981384843308376275672344490886} +#define T_8192_4705 {-0.8928791909280516803093519229150842875242,0.4502962917987086699511678489216137677431} +#define T_8192_4715 {-0.8893992327241955209160551021341234445572,0.4571312774571569814696658795583061873913} +#define T_8192_4725 {-0.8858669537088927903312196576735004782677,0.4639393713908385175059834182320628315210} +#define T_8192_4735 {-0.8822825616760087141798862830910366028547,0.4707201730990716592728517753130290657282} +#define T_8192_4745 {-0.8786462674850681331406576646259054541588,0.4774732836866980578705010884732473641634} +#define T_8192_4755 {-0.8749582850488516472253763822664041072130,0.4841983058875490297801036376768024638295} +#define T_8192_4765 {-0.8712188313208110157503938353329431265593,0.4908948440878151409094698465196415781975} +#define T_8192_4775 {-0.8674281262823069216238991430145688354969,0.4975625043493191457244506636925507336855} +#define T_8192_4785 {-0.8635863929296679897262833947024773806334,0.5042008944326904495980556930589955300093} +#define T_8192_4795 {-0.8596938572610726136957737253396771848202,0.5108096238204390404646915158082265406847} +#define T_8192_4805 
{-0.8557507482632539241862446033337619155645,0.5173883037399290563129738984571304172277} +#define T_8192_4815 {-0.8517572978980291198425334187049884349108,0.5239365471862486023368887799733784049749} +#define T_8192_4825 {-0.8477137410886542712162849966262001544237,0.5304539689449763173456631193403154611588} +#define T_8192_4835 {-0.8436203157060040425108127237763255834579,0.5369401856148429130755062033131252974272} +#define T_8192_4845 {-0.8394772625545785516010255378205329179764,0.5433948156302847998233573889592662453651} +#define T_8192_4855 {-0.8352848253583373683284207800170406699181,0.5498174792838909086256649061397183686495} +#define T_8192_4865 {-0.8310432507463623164056798486853949725628,0.5562077987487399344246341570396907627583} +#define T_8192_4875 {-0.8267527882383485238193543409579433500767,0.5625653981006265569320135000452864915133} +#define T_8192_4885 {-0.8224136902299263862659017831902019679546,0.5688899033401758620343002803565468639135} +#define T_8192_4895 {-0.8180262119778134444203487873892299830914,0.5751809424148451865832498697272967547178} +#define T_8192_4905 {-0.8135906115847985065059333464887458831072,0.5814381452408102779472187648934777826071} +#define T_8192_4915 {-0.8091071499845582382093311935022938996553,0.5876611437247366565017614448152016848326} +#define T_8192_4925 {-0.8045760909263071081198859246796928346157,0.5938495717854336275465243488724809139967} +#define T_8192_4935 {-0.7999977009592819099381699743389617651701,0.6000030653753890552692951132485177367926} +#define T_8192_4945 {-0.7953722494170613055430862914363387972116,0.6061212625021862310248366156884003430605} +#define T_8192_4955 {-0.7907000084017216101628378055465873330832,0.6122038032497979509471974779444281011820} +#define T_8192_4965 {-0.7859812527678301519173942324414383620024,0.6182503297997602453861532012524548918009} +#define T_8192_4975 {-0.7812162601062760947101537567505147308111,0.6242604864522207641641671216348186135292} +#define T_8192_4985 
{-0.7764053107279403898033365294395480304956,0.6302339196468644821891302854055538773537} +#define T_8192_4995 {-0.7715486876472062993670419928093906491995,0.6361702779837121735084792817360721528530} +#define T_8192_5005 {-0.7666466765653104920019700330158229917288,0.6420692122437925419831117324065417051315} +#define T_8192_5015 {-0.7616995658535352653473182726884260773659,0.6479303754096854550681428008829243481159} +#define T_8192_5025 {-0.7567076465362456705321392291807569563389,0.6537534226859361696782002582040149718523} +#define T_8192_5035 {-0.7516712122737684298456883880135137587786,0.6595380115193386627581162429123651236296} +#define T_8192_5045 {-0.7465905593451171995411641546525061130524,0.6652838016190871783805960149038583040237} +#define T_8192_5055 {-0.7414659866305632895944199844961985945702,0.6709904549767942150140243029454723000526} +#define T_8192_5065 {-0.7362977955940531726852782412606757134199,0.6766576358863749529604092458612285554409} +#define T_8192_5075 {-0.7310862902654743367136802589811850339174,0.6822850109637955684505072895262856036425} +#define T_8192_5085 {-0.7258317772227702580067898452398367226124,0.6878722491666855454184315021848306059837} +#define T_8192_5095 {-0.7205345655739052723731674632290378212929,0.6934190218138118755319965202943421900272} +#define T_8192_5105 {-0.7151949669386801211601323302602395415306,0.6989250026044141472780779622553382068872} +#define T_8192_5115 {-0.7098132954304008368495715330936945974827,0.7043898676374004130806838475109543651342} +// Pre-computed twiddles for N=10000 +#define T_10000_1 {0.9999998026079184310077607733546756207943,-0.0006283184893762571799138982875376768789} +#define T_10000_3 {0.9999982234717338380747264636738691478968,-0.0018849544759281138591594517706084843667} +#define T_10000_7 {0.9999903278032789000207003482501022517681,-0.0043982155348355573762364478795916511444} +#define T_10000_9 {0.9999840112834769145777613630343694239855,-0.0056548366384088812053310135752326459624} +#define 
T_10000_11 {0.9999761156524269445355912466766312718391,-0.0069114488122232892175489382680098060519} +#define T_10000_13 {0.9999666409225973495722428197041153907776,-0.0081680500719166337225685126099961053114} +#define T_10000_17 {0.9999429542289399730492505113943479955196,-0.0106812119115808640962272946239863813389} +#define T_10000_19 {0.9999287423025168264345552415761630982161,-0.0119377685229261672889000678310367220547} +#define T_10000_21 {0.9999129513521228806993690341187175363302,-0.0131943062829055109130838019382281345315} +#define T_10000_23 {0.9998955814026943000882852174981962889433,-0.0144508232072742583140811944986126036383} +#define T_10000_27 {0.9998561046189446965826164159807376563549,-0.0169637866123690667086165717591939028352} +#define T_10000_29 {0.9998339978469628075430364333442412316799,-0.0182202291247829002707181444975503836758} +#define T_10000_31 {0.9998103122006246623598713085812050849199,-0.0194766428649679537721262079230655217543} +#define T_10000_33 {0.9997850477173330085989277904445771127939,-0.0207330258488754304302403852489078417420} +#define T_10000_37 {0.9997297824019668599504484518547542393208,-0.0232456916119084672223138454683066811413} +#define T_10000_39 {0.9996997816571637773819247740902937948704,-0.0245019704231918197812056803286395734176} +#define T_10000_41 {0.9996682022499501218959494508453644812107,-0.0257582105425194426984170092964632203802} +#define T_10000_43 {0.9996350442301940031342155634774826467037,-0.0270144099861167157061725418998321401887} +#define T_10000_47 {0.9995639925649913992700135167979169636965,-0.0295266789113460305371550163044958026148} +#define T_10000_49 {0.9995260990317452742814907651336397975683,-0.0307827444257625232326702757745806593448} +#define T_10000_51 {0.9994866271103569621558904145786073058844,-0.0320387613300238335845016024450160330161} +#define T_10000_53 {0.9994455768631581582539524788444396108389,-0.0332947276407078301962982891382125671953} +#define T_10000_57 
{0.9993587416531172218370215887262020260096,-0.0358065005480579442997957073657744331285} +#define T_10000_59 {0.9993129568273997342942038812907412648201,-0.0370623031782917958243395162298838840798} +#define T_10000_61 {0.9992655939501207118524916950264014303684,-0.0383180472820900605568361640962393721566} +#define T_10000_63 {0.9992166530960727710564128756232094019651,-0.0395737308764613909817242642930068541318} +#define T_10000_67 {0.9991140377692988572277954517630860209465,-0.0420849086054387727662273732676112558693} +#define T_10000_69 {0.9990603634586164849551437328045722097158,-0.0433403987745524174601818856444879202172} +#define T_10000_71 {0.9990051114952520627809917641570791602135,-0.0445958205032605942141898935915378388017} +#define T_10000_73 {0.9989482819664560198091862730507273226976,-0.0458511718090810344028440681540814694017} +#define T_10000_77 {0.9988298905740264910235737261245958507061,-0.0483616552226887597143445418623741716146} +#define T_10000_79 {0.9987683288973492334861248309607617557049,-0.0496167833660800736139684374848002335057} +#define T_10000_81 {0.9987051900291522565922264220716897398233,-0.0508718331577979396285194013671571156010} +#define T_10000_83 {0.9986404740691406933805751577892806380987,-0.0521268026159474318070863319007912650704} +#define T_10000_87 {0.9985063112849450472907619769102893769741,-0.0546364926045989768299548927643627393991} +#define T_10000_89 {0.9984368646726223817111645075783599168062,-0.0558912091719579728898992243557586334646} +#define T_10000_91 {0.9983658413922074092994307648041285574436,-0.0571458374794687473663401533485739491880} +#define T_10000_93 {0.9982932415558555261370088373951148241758,-0.0584003755459019321949476477584539679810} +#define T_10000_97 {0.9981433126764107655048974265810102224350,-0.0609091730313338214353180433136003557593} +#define T_10000_99 {0.9980659838700760566609915258595719933510,-0.0621634284885988486202279545977944508195} +#define T_10000_101 
{0.9979870789813204412510572183236945420504,-0.0634175857813252114025814876185904722661} +#define T_10000_103 {0.9979065981347454705741029101773165166378,-0.0646716429290273425722901379231188911945} +#define T_10000_107 {0.9977409090789870482751666713738813996315,-0.0671794488682106033516916454573220107704} +#define T_10000_109 {0.9976557011314491907327806075045373290777,-0.0684331936995238310217359867237973958254} +#define T_10000_111 {0.9975689177493828330156588890531565994024,-0.0696868304654833237155031611109734512866} +#define T_10000_113 {0.9974805590698309076813643514469731599092,-0.0709403571864255094725493222540535498410} +#define T_10000_117 {0.9972991163788790247579640890762675553560,-0.0734470725754756970093950485534151084721} +#define T_10000_119 {0.9972060326540018726504399637633468955755,-0.0747002572851379470497690249430888798088} +#define T_10000_121 {0.9971113742046840977550914431049022823572,-0.0759533240328976178012965192465344443917} +#define T_10000_123 {0.9970151411804044627729126659687608480453,-0.0772062708399913061318997620219306554645} +#define T_10000_127 {0.9968179520173059415810712380334734916687,-0.0797117967180770575774673147861903999001} +#define T_10000_129 {0.9967169961898755259355198177217971533537,-0.0809643718325017691661571461736457422376} +#define T_10000_131 {0.9966144664102599692867556768760550767183,-0.0822168190931320264080994775213184766471} +#define T_10000_133 {0.9965103628403676472302663569280412048101,-0.0834691365221826436782137648151547182351} +#define T_10000_137 {0.9962974349898129355906917226093355566263,-0.0859733739754324349791048121005587745458} +#define T_10000_139 {0.9961886110453925802232788555556908249855,-0.0872252900450988660763229631811555009335} +#define T_10000_141 {0.9960782139831791948125783164869062602520,-0.0884770683741263747057459454481431748718} +#define T_10000_143 {0.9959662439775049946888429985847324132919,-0.0897287069857861385724362435212242417037} +#define T_10000_147 
{0.9957375858455209671760144374275114387274,-0.0922315571511932580373027690257003996521} +#define T_10000_149 {0.9956208980802935260001618189562577754259,-0.0934827647525985905296863620606018230319} +#define T_10000_151 {0.9955026380937694874262433586409315466881,-0.0947338247319583354189020951707789208740} +#define T_10000_153 {0.9953828060726973570027098503487650305033,-0.0959847351136779852742719754132849629968} +#define T_10000_157 {0.9951384266863156913274224280030466616154,-0.0984860991830034809835225928509316872805} +#define T_10000_159 {0.9950138797069144613871571891650091856718,-0.0997365489206140237721243124724423978478} +#define T_10000_161 {0.9948877614647814171533468652341980487108,-0.1009868411606001520341990840279322583228} +#define T_10000_163 {0.9947600721590743599875850122771225869656,-0.1022369739285797496375352011455106548965} +#define T_10000_167 {0.9944999811659747113168350551859475672245,-0.1047367531522531702847800261224620044231} +#define T_10000_169 {0.9943675798893011341661463120544794946909,-0.1059863956604543966388831677249982021749} +#define T_10000_171 {0.9942336083704913507474998368707019835711,-0.1072358708016701162035744232525757979602} +#define T_10000_173 {0.9940980668211046866744595718046184629202,-0.1084851766028085418014370588934980332851} +#define T_10000_177 {0.9938222744892339921563007010263390839100,-0.1109832722938264570888833304707077331841} +#define T_10000_179 {0.9936820241422638089190400023653637617826,-0.1122320582388718623878531843729433603585} +#define T_10000_181 {0.9935402046357434402423791652836371213198,-0.1134806669541780638210681786404165904969} +#define T_10000_183 {0.9933968161936252982968653668649494647980,-0.1147290964680214297999327754951082170010} +#define T_10000_187 {0.9931053334107927677010252409672830253839,-0.1172254100058433695652482242621772456914} +#define T_10000_189 {0.9929572395303694065660238265991210937500,-0.1184732900878021355506319878259091638029} +#define T_10000_191 
{0.9928075776349297720813069645373616367579,-0.1197209830842646677284690781561948824674} +#define T_10000_193 {0.9926563479608104811902080655272584408522,-0.1209684870249534416331016473122872412205} +#define T_10000_197 {0.9923491862342572744637436699122190475464,-0.1234629198593953230966846490446187090129} +#define T_10000_199 {0.9921932546668735808381711649417411535978,-0.1247098448140984972010159026467590592802} +#define T_10000_201 {0.9920357562909100090919878311979118734598,-0.1259565728349342772496299858175916597247} +#define T_10000_203 {0.9918766913550780550679064617725089192390,-0.1272031019531489337559548857825575396419} +#define T_10000_207 {0.9915538628110230900958299571357201784849,-0.1296955556082735883105527818770497106016} +#define T_10000_209 {0.9913900997125904002160723393899388611317,-0.1309414762092590556186166850238805636764} +#define T_10000_211 {0.9912247710738691930387744832842145115137,-0.1321871900357805551529111198760801926255} +#define T_10000_213 {0.9910578771559358512988069378479849547148,-0.1334326951206859235288249010409344919026} +#define T_10000_217 {0.9907193945390972977804722177097573876381,-0.1359230711986908202160151404314092360437} +#define T_10000_219 {0.9905478063747024108920413709711283445358,-0.1371679382591466100915766901380266062915} +#define T_10000_221 {0.9903746540001150400200913281878456473351,-0.1384125887127049536751144387380918487906} +#define T_10000_223 {0.9901999376887664627844287679181434214115,-0.1396570205938928999067627501062816008925} +#define T_10000_227 {0.9898458143618584781364688751637004315853,-0.1421452207789946908800260416683158837259} +#define T_10000_229 {0.9896664079055081941760363406501710414886,-0.1433889851537007986692628946912009268999} +#define T_10000_231 {0.9894854386308139515548987219517584890127,-0.1446325230976272802063675726458313874900} +#define T_10000_233 {0.9893029068235509315698550381057430058718,-0.1458758326470579735900656714875367470086} +#define T_10000_237 
{0.9889331567667561939671827531128656119108,-0.1483617587093738199310877234893268905580} +#define T_10000_239 {0.9887459391011101939028549168142490088940,-0.1496043712966423910870616964530199766159} +#define T_10000_241 {0.9885571600706661854474077699705958366394,-0.1508467476381882554914426464165444485843} +#define T_10000_243 {0.9883668199735319292997814955015201121569,-0.1520888857721295339686662373424042016268} +#define T_10000_247 {0.9879814577839495237654432457929942756891,-0.1545724395715551147123534292404656298459} +#define T_10000_249 {0.9877864363000407044168582615384366363287,-0.1558138513151689219515816375860595144331} +#define T_10000_251 {0.9875898549665198533631382815656252205372,-0.1570550170074434814804220650330535136163} +#define T_10000_253 {0.9873917140938154313190011635015252977610,-0.1582959346884087459272905107354745268822} +#define T_10000_257 {0.9869907549848846439743965675006620585918,-0.1607770181784925200840774550670175813138} +#define T_10000_259 {0.9867879373818274624241553283354733139277,-0.1620171800696415531639615892345318570733} +#define T_10000_261 {0.9865835615059240781832272659812588244677,-0.1632570861135484996395916823530569672585} +#define T_10000_263 {0.9863776276799119946403493486286606639624,-0.1644967343522324709770288109211833216250} +#define T_10000_267 {0.9859610874808120151158163935178890824318,-0.1669752495840466088861120397268678061664} +#define T_10000_269 {0.9857504817654978435115253887488506734371,-0.1682141126632628880255282410871586762369} +#define T_10000_271 {0.9855383194156216442038953573501203209162,-0.1694527101094346899223808122769696637988} +#define T_10000_273 {0.9853246007662167516016893387131858617067,-0.1706910399666475286473144024057546630502} +#define T_10000_277 {0.9848924959212412844067330297548323869705,-0.1731668890926545412156656311708502471447} +#define T_10000_279 {0.9846741104080233286666157255240250378847,-0.1744044044517449876252612739335745573044} +#define T_10000_281 
{0.9844541699599805850695588560483884066343,-0.1756416444024752609998074603936402127147} +#define T_10000_283 {0.9842326749244291184126609550730790942907,-0.1768786069910745495370463231665780767798} +#define T_10000_287 {0.9837850224923376796226648366427980363369,-0.1793516922689902814980200673744548112154} +#define T_10000_289 {0.9835588658027020203533652420446742326021,-0.1805878110529675062423393683275207877159} +#define T_10000_291 {0.9833311559393651535287972365040332078934,-0.1818236446641414705283068542485125362873} +#define T_10000_293 {0.9831018932619117745019821086316369473934,-0.1830591911509622526921248208964243531227} +#define T_10000_297 {0.9826387109152558974045632567140273749828,-0.1855294149476145459942699744715355336666} +#define T_10000_299 {0.9824047919774814285887032383470796048641,-0.1867640883566255016745571992942132055759} +#define T_10000_301 {0.9821693216884457289239662713953293859959,-0.1879984668396482982988260346246534027159} +#define T_10000_303 {0.9819323004199885795983959724253509193659,-0.1892325484474308705529210783424787223339} +#define T_10000_307 {0.9814536064444141505447305462439544498920,-0.1916998132426139256345720696117496117949} +#define T_10000_309 {0.9812119344932199727082888784934766590595,-0.1929329925338665319145547982770949602127} +#define T_10000_311 {0.9809687130744494387712961724901106208563,-0.1941658671575892392002771202896838076413} +#define T_10000_313 {0.9807239425721822057013810081116389483213,-0.1953984351669048047384080746269319206476} +#define T_10000_317 {0.9802297558657077081178954358620103448629,-0.1978626435572291286657531372839002870023} +#define T_10000_319 {0.9799803404418886421112233620078768581152,-0.1990942800469165108179225853746174834669} +#define T_10000_321 {0.9797293774953482747847033351717982441187,-0.2003256021395600217083199368062196299434} +#define T_10000_323 {0.9794768674223913729193213839607778936625,-0.2015566078907340685777427324865129776299} +#define T_10000_327 
{0.9789672074946618174351442576153203845024,-0.2040176625934718157573399821558268740773} +#define T_10000_329 {0.9787100584447118167830126367334742099047,-0.2052477076586942394964552249803091399372} +#define T_10000_331 {0.9784513638779891442354141872783657163382,-0.2064774286097706967879616968275513499975} +#define T_10000_333 {0.9781911242030078001974402468476910144091,-0.2077068235048039612777870388526935130358} +#define T_10000_337 {0.9776660111745244519099173885479103773832,-0.2101646273617295002100746614814852364361} +#define T_10000_339 {0.9774011386502475806636880406585987657309,-0.2113930324424139750583151453611208125949} +#define T_10000_341 {0.9771347226761610826883952540811151266098,-0.2126211037046458474097931912183412350714} +#define T_10000_343 {0.9768667636729720937793786106340121477842,-0.2138488392091329737265681387725635431707} +#define T_10000_347 {0.9763262182742982187022562357014976441860,-0.2163032951903584300001881501884781755507} +#define T_10000_349 {0.9760536327324083050172021103207953274250,-0.2175300117911757313837739502559998072684} +#define T_10000_351 {0.9757795058686044020745953275763895362616,-0.2187563848824123913239247940509812906384} +#define T_10000_353 {0.9755038381157704607815617237065453082323,-0.2199824125274578434652283931427518837154} +#define T_10000_357 {0.9749478816867128694312327752413693815470,-0.2224334237352637855256176635521114803851} +#define T_10000_359 {0.9746675938884201695344700056011788547039,-0.2236584034275430343807755662055569700897} +#define T_10000_361 {0.9743857669569582213853209395892918109894,-0.2248830299326746995358661251884768716991} +#define T_10000_363 {0.9741024013373701384566061278746929019690,-0.2261073013168063694511289440924883820117} +#define T_10000_367 {0.9735310558261365265764197829412296414375,-0.2285547709894671375430164061981486156583} +#define T_10000_369 {0.9732430768367236195715008761908393353224,-0.2297779654131075210976575817767297849059} +#define T_10000_371 
{0.9729535609636482673678870014555286616087,-0.2310007969859766696618663672779803164303} +#define T_10000_373 {0.9726625086640954265249092713929712772369,-0.2322232637770565988333260065701324492693} +#define T_10000_377 {0.9720757966264285121482657814340200275183,-0.2346670952926602493615604316801181994379} +#define T_10000_379 {0.9717801378148126500988723819318693131208,-0.2358884561580404104486063943113549612463} +#define T_10000_381 {0.9714829444297147231779376852500718086958,-0.2371094445233503655856566183501854538918} +#define T_10000_383 {0.9711842169404436564406069010146893560886,-0.2383300584604828575852053518246975727379} +#define T_10000_387 {0.9705821615387298928467885161808226257563,-0.2407701553407456396271868470648769289255} +#define T_10000_389 {0.9702788345770148037416902297991327941418,-0.2419896344306299018622752328155911527574} +#define T_10000_391 {0.9699739754125804580553449341095983982086,-0.2432087313858506683050819674463127739727} +#define T_10000_393 {0.9696675845268408755472933080454822629690,-0.2444274442812874870156747419969178736210} +#define T_10000_397 {0.9690502095291966266898953108466230332851,-0.2468637101953627122075118904831469990313} +#define T_10000_399 {0.9687392263922114388208228774601593613625,-0.2480812593668046961425943663925863802433} +#define T_10000_401 {0.9684267134837581947337525889452081173658,-0.2492984167840759379686232932726852595806} +#define T_10000_403 {0.9681126712973378012350167409749701619148,-0.2505151805251187302481241658824728801847} +#define T_10000_407 {0.9674800010766709812415342639724258333445,-0.2529475192933996474664581910474225878716} +#define T_10000_409 {0.9671613740414972681946892407722771167755,-0.2541630904796428058745050293509848415852} +#define T_10000_411 {0.9668412197264998342305375444993842393160,-0.2553782603076737989233890857576625421643} +#define T_10000_413 {0.9665195386372462671431549097178503870964,-0.2565930268585735474218267881951760500669} +#define T_10000_417 
{0.9658715981702937769526329248037654906511,-0.2590213424564903887947764360433211550117} +#define T_10000_419 {0.9655453398157813893476486555300652980804,-0.2602348876688657663969195255049271509051} +#define T_10000_421 {0.9652175567333840522366017466993071138859,-0.2614480219348323530859090624289819970727} +#define T_10000_423 {0.9648882494407158283422631939174607396126,-0.2626607433386856094159611529903486371040} +#define T_10000_427 {0.9642250643070580107263367608538828790188,-0.2650849399004965856185833672498119994998} +#define T_10000_429 {0.9638911875133281403904561557283159345388,-0.2662964112303169983420048083644360303879} +#define T_10000_431 {0.9635557886038454977750689067761413753033,-0.2675074620417556281459781075682258233428} +#define T_10000_433 {0.9632188681082508585618029428587760776281,-0.2687180904223979149847423286701086908579} +#define T_10000_437 {0.9625404644893010841499858543102163821459,-0.2711380722449736047074964062630897387862} +#define T_10000_439 {0.9621989824372378929950855308561585843563,-0.2723474218654253764526629311149008572102} +#define T_10000_441 {0.9618559809416447059504662320250645279884,-0.2735563414121236491816091529472032561898} +#define T_10000_443 {0.9615114605441678063968424794438760727644,-0.2747648289760193929254228351055644452572} +#define T_10000_447 {0.9608178652221394111521135528164450079203,-0.2771805005226211093827259901445358991623} +#define T_10000_449 {0.9604687913928695586207595624728128314018,-0.2783876806906520062767640411038883030415} +#define T_10000_451 {0.9601182008522776145653665480494964867830,-0.2795944212465362177510996843921020627022} +#define T_10000_453 {0.9597660941539938317390578959020785987377,-0.2808007202846656280570414310204796493053} +#define T_10000_457 {0.9590573345108424074823005867074243724346,-0.2832119861887170131353741453494876623154} +#define T_10000_459 {0.9587006826852030316743480398145038634539,-0.2844169492469210136498247720737708732486} +#define T_10000_461 
{0.9583425169403264609613302127399947494268,-0.2856214631719402596843337960308417677879} +#define T_10000_463 {0.9579828378418053791421016285312362015247,-0.2868255260616828783426512927690055221319} +#define T_10000_467 {0.9572589418581481934822363655257504433393,-0.2892322911305347798993636843079002574086} +#define T_10000_469 {0.9568947261161425688058557170734275132418,-0.2904349895090335698100147965305950492620} +#define T_10000_471 {0.9565289993067518370395418969565071165562,-0.2916372292510405750398660984501475468278} +#define T_10000_473 {0.9561617620075083534558757492050062865019,-0.2928390084580551611104226594761712476611} +#define T_10000_477 {0.9554227582615192337911480535694863647223,-0.2952411776767437379120906371099408715963} +#define T_10000_479 {0.9550509929817613263836051373800728470087,-0.2964415638950646525451304569287458434701} +#define T_10000_481 {0.9546777195461244591712102192104794085026,-0.2976414819916929910270653181214584037662} +#define T_10000_483 {0.9543029385440584544397779609425924718380,-0.2988409300717942418401662507676519453526} +#define T_10000_487 {0.9535488562103399123870417497528251260519,-0.3012384086067919075624388369760708883405} +#define T_10000_489 {0.9531695560694862789929970858793240040541,-0.3024364352757424323492330131557537242770} +#define T_10000_491 {0.9527887507437995395775942597538232803345,-0.3036339843562801954490737443848047405481} +#define T_10000_493 {0.9524064408346231136448523102444596588612,-0.3048310539573117483591602194792358204722} +#define T_10000_497 {0.9516373096830544886515212965605314821005,-0.3072237471602710101592492719646543264389} +#define T_10000_499 {0.9512504896552254063379905346664600074291,-0.3084193669838096729662879624811466783285} +#define T_10000_501 {0.9508621674730308725997929286677390336990,-0.3096144997710696955550702114123851060867} +#define T_10000_503 {0.9504723437496847004979372286470606923103,-0.3108091436347733549361294080881634727120} +#define T_10000_507 
{0.9496881941442469887704191933153197169304,-0.3131969570462636021090929716592654585838} +#define T_10000_509 {0.9492938695004347060546479042386636137962,-0.3143901228233669242762005069380393251777} +#define T_10000_511 {0.9488980457920273003935562883270904421806,-0.3155827921355531806568706088000908493996} +#define T_10000_513 {0.9485007236440844424762985909183043986559,-0.3167749630994347587176207525772042572498} +#define T_10000_517 {0.9477015865416612561134002135077025741339,-0.3191578024526710577468691099056741222739} +#define T_10000_519 {0.9472997728491274749273998168064281344414,-0.3203484670791971744563397805904969573021} +#define T_10000_521 {0.9468964632409494486253720424429047852755,-0.3215386258317673995321683833026327192783} +#define T_10000_523 {0.9464916583540079475511674900189973413944,-0.3227282768309586158395063648640643805265} +#define T_10000_527 {0.9456775653031638251277968265640083700418,-0.3251060480555233445087992549815680831671} +#define T_10000_529 {0.9452682784248251923742145663709379732609,-0.3262941645260716283516444491397123783827} +#define T_10000_531 {0.9448574988388492856827838295430410653353,-0.3274817657335961085252051816496532410383} +#define T_10000_533 {0.9444452271939129950339975039241835474968,-0.3286688498027124927247655250539537519217} +#define T_10000_537 {0.9436162103336476203452320987707935273647,-0.3310414590282691471578857544955099001527} +#define T_10000_539 {0.9431994664274493400668575304734986275434,-0.3322269804380356683637387504859361797571} +#define T_10000_541 {0.9427812330805501739305896080622915178537,-0.3334119772160523131887543968332465738058} +#define T_10000_543 {0.9423615109533977074462995915382634848356,-0.3345964474910473884783357334526954218745} +#define T_10000_547 {0.9415176030118772576571473109652288258076,-0.3369638010510462855506830237573012709618} +#define T_10000_549 {0.9410934185301550458646602237422484904528,-0.3381466805976757195040249825979117304087} +#define T_10000_551 
{0.9406677479334690783474570707767270505428,-0.3393290261645406924451151553512318059802} +#define T_10000_553 {0.9402405918940113238591038680169731378555,-0.3405108358845561622452180472464533522725} +#define T_10000_557 {0.9393818261872762809261416805384214967489,-0.3428728403199323704342305063619278371334} +#define T_10000_559 {0.9389502178761071071733113058144226670265,-0.3440530313053656308319716572441393509507} +#define T_10000_561 {0.9385171268343807104983511635509785264730,-0.3452326789841004828218729016953147947788} +#define T_10000_563 {0.9380825537460067975459310218866448849440,-0.3464117814933122541098953206528676673770} +#define T_10000_567 {0.9372089641766567780223340378142893314362,-0.3487683435561749756281813006353331729770} +#define T_10000_569 {0.9367699490751978386171572310558985918760,-0.3499457993884927953232022446172777563334} +#define T_10000_571 {0.9363294546861236078427737083984538912773,-0.3511227026086270464055871798336738720536} +#define T_10000_573 {0.9358874817050348848823659864137880504131,-0.3522990513580868676690727170353056862950} +#define T_10000_577 {0.9349991027608903770840242941631004214287,-0.3546500780154009935252190643950598314404} +#define T_10000_579 {0.9345526982007060778911977649840991944075,-0.3558247522106632310112672712421044707298} +#define T_10000_581 {0.9341048178542483704234200558857992291451,-0.3569988645100729551806750805553747341037} +#define T_10000_583 {0.9336554624287813908267708029597997665405,-0.3581724130595465571680335870041744783521} +#define T_10000_587 {0.9327523291815218442479817895218729972839,-0.3605178114968051739097631980257574468851} +#define T_10000_589 {0.9322985527859000143280354677699506282806,-0.3616896576808859564700071587139973416924} +#define T_10000_591 {0.9318433041636079439129503043659497052431,-0.3628609327076279811308268108405172824860} +#define T_10000_593 {0.9313865840335453505360874260077252984047,-0.3640316347274279995538392995513277128339} +#define T_10000_597 
{0.9304687321373250608047555942903272807598,-0.3663713123523168468942401432286715134978} +#define T_10000_599 {0.9300076018205806205330077318649273365736,-0.3675402842627353838800274843379156664014} +#define T_10000_601 {0.9295450028948903931080849361023865640163,-0.3687086757768771039245336851308820769191} +#define T_10000_603 {0.9290809360907609226742920327524188905954,-0.3698764850496923939360272015619557350874} +#define T_10000_607 {0.9281484017808010467120993780554272234440,-0.3722103494957450520175257224764209240675} +#define T_10000_609 {0.9276799357475695728680875618010759353638,-0.3733764009834918407015891261835349723697} +#define T_10000_611 {0.9272100047810944989379322578315623104572,-0.3745418628589369802739383885636925697327} +#define T_10000_613 {0.9267386096234611070343589744879864156246,-0.3757067332816570459641525303595699369907} +#define T_10000_617 {0.9257914297146194737564428578480146825314,-0.3780346924119012963494412815634859725833} +#define T_10000_619 {0.9253156464591375529593619830848183482885,-0.3791977774432602754650645238143624737859} +#define T_10000_621 {0.9248384020039476238750353331852238625288,-0.3803602636695695426993779619806446135044} +#define T_10000_623 {0.9243596971026839526075491448864340782166,-0.3815221492551046922336865918623516336083} +#define T_10000_627 {0.9233979089880015589386630381341092288494,-0.3838441111657002191570597915415419265628} +#define T_10000_629 {0.9229148272933775931647915058420039713383,-0.3850041838240658709224817357608117163181} +#define T_10000_631 {0.9224302881902667339275581070978660136461,-0.3861636485082735603668879775796085596085} +#define T_10000_633 {0.9219442924438224773453498528397176414728,-0.3873225033873702871645150480617303401232} +#define T_10000_637 {0.9209679340930472246640192679478786885738,-0.3896383764112367198428898973361356183887} +#define T_10000_639 {0.9204775730305192471547570676193572580814,-0.3907953908989268421159124500263715162873} +#define T_10000_641 
{0.9199857584082616890341910220740828663111,-0.3919517882673524966641309674741933122277} +#define T_10000_643 {0.9194924910029168563241341871616896241903,-0.3931075666904045307070703074714401736856} +#define T_10000_647 {0.9185016009610046383571102524001616984606,-0.3954172594008402707999039193964563310146} +#define T_10000_649 {0.9180039798891878044528880309371743351221,-0.3965711700409039752734940975642530247569} +#define T_10000_651 {0.9175049091637825782186155265662819147110,-0.3977244544409596627154712678020587190986} +#define T_10000_653 {0.9170043895728897664909595732751768082380,-0.3988771107798138038091906310000922530890} +#define T_10000_657 {0.9159990069584826866133653311408124864101,-0.4011805319941054714938388769951416179538} +#define T_10000_659 {0.9154941455226049962234924350923392921686,-0.4023312932321265189550274499197257682681} +#define T_10000_661 {0.9149878383965097672358979252749122679234,-0.4034814191341187794570544156158575788140} +#define T_10000_663 {0.9144800863797253320797153719468042254448,-0.4046309078838764561325547219894360750914} +#define T_10000_667 {0.9134602508836076051323971114470623433590,-0.4069279666668986772393168394046369940042} +#define T_10000_669 {0.9129481690147339678631510651030112057924,-0.4080755330727937613133349259442184120417} +#define T_10000_671 {0.9124346454760872138223248839494772255421,-0.4092224550717213360506718800024827942252} +#define T_10000_673 {0.9119196810785913376307121325226034969091,-0.4103687308525350019294819503556936979294} +#define T_10000_677 {0.9108854329621220990986785182030871510506,-0.4126593365203400920471210611140122637153} +#define T_10000_679 {0.9103661508763678522626605627010576426983,-0.4138036627901525710271357638703193515539} +#define T_10000_681 {0.9098454311981996056601929012686014175415,-0.4149473356074988017461180334066739305854} +#define T_10000_683 {0.9093232747499049306583174256957136094570,-0.4160903531663632803905272794509073719382} +#define T_10000_687 
{0.9082746548434289524109885860525537282228,-0.4183744152897616586095352886331966146827} +#define T_10000_689 {0.9077481930411616106724181918252725154161,-0.4195154562474497361712622023333096876740} +#define T_10000_691 {0.9072202977805932100352492852834984660149,-0.4206558347329700797700979819637723267078} +#define T_10000_693 {0.9066909698953422580558481058687902987003,-0.4217955489455093576189881332538789138198} +#define T_10000_697 {0.9056280195965779045152999060519505292177,-0.4240729773536393576449654574389569461346} +#define T_10000_699 {0.9050943988616080293496679587406106293201,-0.4252106879528597671580314454331528395414} +#define T_10000_701 {0.9045593488590403241644821719091851264238,-0.4263477270863642587528374860994517803192} +#define T_10000_703 {0.9040228704337917031708116155641619116068,-0.4274840929586126714312399599293712526560} +#define T_10000_707 {0.9029456317061966830195274269499350339174,-0.4297547977425003606910536291252356022596} +#define T_10000_709 {0.9024048731049573168760957742051687091589,-0.4308891330683867759709926303912652656436} +#define T_10000_711 {0.9018626894832482143726792855886742472649,-0.4320227879615172739136141899507492780685} +#define T_10000_713 {0.9013190816972516117289160320069640874863,-0.4331557606316960296055640355916693806648} +#define T_10000_717 {0.9002275970683663031124410736083518713713,-0.4354196521478045922570743186952313408256} +#define T_10000_719 {0.8996797219490809682795884327788371592760,-0.4365505674187405982245024915755493566394} +#define T_10000_721 {0.8991304261127116559393357420049142092466,-0.4376807933167430886278737034444930031896} +#define T_10000_723 {0.8985797104266713963838242307247128337622,-0.4388103280570311293118379580846522003412} +#define T_10000_727 {0.8974740229864401896975323325023055076599,-0.4410673169307998686683447431278182193637} +#define T_10000_729 {0.8969190529782804510716687218518927693367,-0.4421947675001870470268272583780344575644} +#define T_10000_731 
{0.8963626666125099617943305929657071828842,-0.4433215197836783416640571431344142183661} +#define T_10000_733 {0.8958048647677385734056088040233589708805,-0.4444475720019781528336011433566454797983} +#define T_10000_737 {0.8946850181668082324648594294558279216290,-0.4466975691313508911584051475074375048280} +#define T_10000_739 {0.8941229751790398250221869602683000266552,-0.4478215104893712439348973930464126169682} +#define T_10000_741 {0.8935595202490491661961868885555304586887,-0.4489447446761005711124425943125970661640} +#define T_10000_743 {0.8929946542666082898165313963545486330986,-0.4500672699177986624263780868204776197672} +#define T_10000_747 {0.8918606927146052187893587870348710566759,-0.4523101864767414270751544336235383525491} +#define T_10000_749 {0.8912915989357229618050837416376452893019,-0.4534305742521146376233787123055662959814} +#define T_10000_751 {0.8907210976857479201029832438507582992315,-0.4545502459987194598944881818169960752130} +#define T_10000_753 {0.8901491898655793377770351071376353502274,-0.4556691999484414612453520021517761051655} +#define T_10000_757 {0.8890011581293638665002276866289321333170,-0.4579049473904489020448238534299889579415} +#define T_10000_759 {0.8884250360262156975821312698826659470797,-0.4590217373521840449335229550342774018645} +#define T_10000_761 {0.8878475109786687635704538479330949485302,-0.4601378024559410473415255182771943509579} +#define T_10000_763 {0.8872685838987141027089933231764007359743,-0.4612531409393008652308765249472344294190} +#define T_10000_767 {0.8861065273006134557221002978621982038021,-0.4634816310008922912722084674896905198693} +#define T_10000_769 {0.8855233996175133626138631370849907398224,-0.4645947790600338245248224211536580696702} +#define T_10000_771 {0.8849388735720946641549744526855647563934,-0.4657071934606037966197789046418620273471} +#define T_10000_773 {0.8843529500874036664015420683426782488823,-0.4668188724459479410633377938211197033525} +#define T_10000_777 
{0.8831769145034225054757826001150533556938,-0.4690400171501513670868632743804482743144} +#define T_10000_779 {0.8825868042612531327861802310508210211992,-0.4701494793615200729064440565707627683878} +#define T_10000_781 {0.8819953002940500441653171037614811211824,-0.4712581991426871286243738268240122124553} +#define T_10000_783 {0.8814024035358785091887057205894961953163,-0.4723661747428327117859225836582481861115} +#define T_10000_787 {0.8802124353938881595738052965316455811262,-0.4745798864026585794917423299921210855246} +#define T_10000_789 {0.8796153658891913140749352351122070103884,-0.4756856189665859546700232840521493926644} +#define T_10000_791 {0.8790169073517675180085007013985887169838,-0.4767906003579916451862175108544761314988} +#define T_10000_793 {0.8784170607266644781319087087467778474092,-0.4778948288319591619632831225317204371095} +#define T_10000_797 {0.8772132070045695062532331576221622526646,-0.4801010200538615158016853001754498109221} +#define T_10000_799 {0.8766092018086270032384277328674215823412,-0.4812029793179192771468422051839297637343} +#define T_10000_801 {0.8760038123271008947412497036566492170095,-0.4823041786967902644356342989340191707015} +#define T_10000_803 {0.8753970395159837991627682640682905912399,-0.4834046164515302668185370293940650299191} +#define T_10000_807 {0.8741793477398679401701997448981273919344,-0.4856032001388573271505322281882399693131} +#define T_10000_809 {0.8735684306977705704255754426412750035524,-0.4867013425995805886437040044256718829274} +#define T_10000_811 {0.8729561341718823364033141842810437083244,-0.4877987164924509344565706214780220761895} +#define T_10000_813 {0.8723424591291029095430076267803087830544,-0.4888953200845652102835003915970446541905} +#define T_10000_817 {0.8711109773713523463101182642276398837566,-0.4910862094409975120434808104619150981307} +#define T_10000_819 {0.8704931726010590731235083694627974182367,-0.4921804917456022998578646365785971283913} +#define T_10000_821 
{0.8698739932032270028017251206620130687952,-0.4932739968300297039860424774815328419209} +#define T_10000_823 {0.8692534401556251122400453823502175509930,-0.4943667229674860164045924193487735465169} +#define T_10000_827 {0.8680082170330309931927104116766713559628,-0.4965498315004632789992911057197488844395} +#define T_10000_829 {0.8673835489244161589539316992159001529217,-0.4976402104485578292702996350271860137582} +#define T_10000_831 {0.8667575110987828512776331990608014166355,-0.4987298035548339836253717294312082231045} +#define T_10000_833 {0.8661301045447302726643101777881383895874,-0.4998186090986755369947047711320919916034} +#define T_10000_837 {0.8648711892165694692380384367424994707108,-0.5019938506228108776596741336106788367033} +#define T_10000_839 {0.8642396824304603430988436230109073221684,-0.5030802831681013831399695845902897417545} +#define T_10000_841 {0.8636068108919271457679656123218592256308,-0.5041659212809561640966649065376259386539} +#define T_10000_843 {0.8629725756003602255006512677937280386686,-0.5052507632470043530403813747398089617491} +#define T_10000_847 {0.8617000177664542182043305729166604578495,-0.5074180518874870093881668253743555396795} +#define T_10000_849 {0.8610616972336574415081145161821041256189,-0.5085004951394769268091522462782450020313} +#define T_10000_851 {0.8604220169669090090280860749771818518639,-0.5095821353997764813215098911314271390438} +#define T_10000_853 {0.8597809779763513349237769034516531974077,-0.5106629709603280886653919878881424665451} +#define T_10000_857 {0.8584948278751043382328589359531179070473,-0.5128222211563135957135273201856762170792} +#define T_10000_859 {0.8578497187954216007810259725374635308981,-0.5139006323819965693644462589873000979424} +#define T_10000_861 {0.8572032550539395723276925309619400650263,-0.5149782320884355168999491070280782878399} +#define T_10000_863 {0.8565554376715126538854860882565844804049,-0.5160550185739533057827088669000659137964} +#define T_10000_867 
{0.8552557460779285358754009394033346325159,-0.5182061450819410719503821383113972842693} +#define T_10000_869 {0.8546038739191617050749982809065841138363,-0.5192804817074886392092025744204875081778} +#define T_10000_871 {0.8539506522242277242185082286596298217773,-0.5203539983182757788782168972829822450876} +#define T_10000_873 {0.8532960820246526800758601893903687596321,-0.5214266932190729786356087060994468629360} +#define T_10000_877 {0.8519829002483300106618457903095986694098,-0.5235696111162718713316621688136365264654} +#define T_10000_879 {0.8513242907452757135544629818468820303679,-0.5246398307287135631682417624688241630793} +#define T_10000_881 {0.8506643368849636788198154135898221284151,-0.5257092218632510860487627724069170653820} +#define T_10000_883 {0.8500030397095511558447356037504505366087,-0.5267777828311699384045141414389945566654} +#define T_10000_887 {0.8486764195926580489626189773844089359045,-0.5289124075188504914279974400415085256100} +#define T_10000_889 {0.8480110987460918181568558793514966964722,-0.5299784678677484928144281184358987957239} +#define T_10000_891 {0.8473444387742501415772267137072049081326,-0.5310436913083056431617023918079212307930} +#define T_10000_893 {0.8466764407298802419532535168400499969721,-0.5321080761583887319687846684246324002743} +#define T_10000_897 {0.8453364346451073263466469143168069422245,-0.5342343233652234735231445483805146068335} +#define T_10000_899 {0.8446644287207568657649403576215263456106,-0.5352961823643406225059493408480193465948} +#define T_10000_901 {0.8439910889559798512493671296397224068642,-0.5363571960577198094455297905369661748409} +#define T_10000_903 {0.8433164164140716234641104165348224341869,-0.5374173627698757282900032805628143250942} +#define T_10000_907 {0.8419630772625646963902568131743464618921,-0.5395351485552655201871630197274498641491} +#define T_10000_909 {0.8412844127900732660307880905747879296541,-0.5405927642842267566436476045055314898491} +#define T_10000_911 
{0.8406044198146618295908183426945470273495,-0.5416495263434242257716277890722267329693} +#define T_10000_913 {0.8399230994101319902611635370703879743814,-0.5427054330640864732160366656898986548185} +#define T_10000_917 {0.8385564806194034659370117879007011651993,-0.5448146738214748596718095541291404515505} +#define T_10000_919 {0.8378711843912823864499728188093286007643,-0.5458680045274221237505685166979674249887} +#define T_10000_921 {0.8371845650501947844190908654127269983292,-0.5469204732332811902395519609854090958834} +#define T_10000_923 {0.8364966236804062260645764581568073481321,-0.5479720782770604037281714226992335170507} +#define T_10000_927 {0.8351167792022259339645984255184885114431,-0.5500726907372338603252615030214656144381} +#define T_10000_929 {0.8344248782727968771766313693660777062178,-0.5511216948364744405708393060194794088602} +#define T_10000_931 {0.8337316596725891093555560473760124295950,-0.5521698286393327270715758459118660539389} +#define T_10000_933 {0.8330371244962893051422270218608900904655,-0.5532170904906623665198139860876835882664} +#define T_10000_937 {0.8316441088045545271256742125842720270157,-0.5553089917250378926283360669913236051798} +#define T_10000_939 {0.8309456304888814814901820682280231267214,-0.5563536278046861127677402691915631294250} +#define T_10000_941 {0.8302458399966368141065231611719354987144,-0.5573973853260157929057072578871157020330} +#define T_10000_943 {0.8295447384328853424051430920371785759926,-0.5584402626407912739736616458685602992773} +#define T_10000_947 {0.8281386065214704217396501917392015457153,-0.5605233700646898853392485762014985084534} +#define T_10000_949 {0.8274335783942812216906759203993715345860,-0.5615635968843020187080128380330279469490} +#define T_10000_951 {0.8267272436365301624405788061267230659723,-0.5626029369183432882195461388619150966406} +#define T_10000_953 {0.8260196033636160173330154066206887364388,-0.5636413885255538991359003375691827386618} +#define T_10000_957 
{0.8246004107442010955253408610587939620018,-0.5657156199014614639253295536036603152752} +#define T_10000_959 {0.8238888606387991808688298078777734190226,-0.5667513963946638755331264292181003838778} +#define T_10000_961 {0.8231760095004283517283738547121174633503,-0.5677862779100518730857061200367752462626} +#define T_10000_963 {0.8224618584547781496851825977500993758440,-0.5688202628134062521780833776574581861496} +#define T_10000_967 {0.8210296611546578082752034788427408784628,-0.5708855362542195610586759357829578220844} +#define T_10000_969 {0.8203116171618227703277170803630724549294,-0.5719168215303298552498745266348123550415} +#define T_10000_971 {0.8195922777849746765355121169704943895340,-0.5729472036717143668127505407028365880251} +#define T_10000_973 {0.8188716441600486506402489794709254056215,-0.5739766810512589856330123438965529203415} +#define T_10000_977 {0.8174264987199200138690002859220840036869,-0.5760329150235190542872487640124745666981} +#define T_10000_979 {0.8167019891867992598122327763121575117111,-0.5770596683691602279964172339532524347305} +#define T_10000_981 {0.8159761899697607345771643849730025976896,-0.5780855104588186188507847873552236706018} +#define T_10000_983 {0.8152491022149405131003163660352583974600,-0.5791104396725493819175767384876962751150} +#define T_10000_987 {0.8137910656866710334966796835942659527063,-0.5811575529996596545245779452670831233263} +#define T_10000_989 {0.8130601192156604595240310118242632597685,-0.5821797338803679222607456722471397370100} +#define T_10000_991 {0.8123278888117418938819014329055789858103,-0.5832009954198108525602606277971062809229} +#define T_10000_993 {0.8115943756312070656022683579067233949900,-0.5842213360052772586072933336254209280014} +#define T_10000_997 {0.8101235055755823255552172668103594332933,-0.5862592478707090437595184084784705191851} +#define T_10000_999 {0.8093861510231969980111443874193355441093,-0.5872768159325333359177534475747961550951} +// Pre-computed twiddles for N=15625 
+#define T_15625_1 {0.9999999191482018545684695709496736526489,-0.0004021238488220144112819665060953866487} +#define T_15625_2 {0.9999996765928204078832663981302175670862,-0.0008042476326191562918319077901685432153} +#define T_15625_3 {0.9999992723338949618394622120831627398729,-0.0012063712863665636276788717395902494900} +#define T_15625_4 {0.9999987063714907975509049720130860805511,-0.0016084947450393956552144869576181918092} +#define T_15625_6 {0.9999970893366388891010387851565610617399,-0.0024127408170621371559516621374541500700} +#define T_15625_7 {0.9999960382644524914397266002197284251451,-0.0028148633003625645951362166385933960555} +#define T_15625_8 {0.9999948254893104016005622725060675293207,-0.0032169853284894729325560369659342541127} +#define T_15625_9 {0.9999934510114086849696946046606171876192,-0.0036191068364182848657306657003118743887} +#define T_15625_11 {0.9999902169482416391943502276262734085321,-0.0044233480315837361840358710196596803144} +#define T_15625_12 {0.9999883573634993361167744296835735440254,-0.0048254675887716815332884046085837326245} +#define T_15625_13 {0.9999863360770433473589946515858173370361,-0.0052275863656641649579448483109445078298} +#define T_15625_14 {0.9999841530892005225794605394185055047274,-0.0056297042972371317409718827207143476699} +#define T_15625_16 {0.9999793020107925478612287406576797366142,-0.0064339373643290051485443825640686554834} +#define T_15625_17 {0.9999766339210118815117311896756291389465,-0.0068360523698005309228831194445774599444} +#define T_15625_18 {0.9999738041314132352965771133312955498695,-0.0072381662698578043760311295784504181938} +#define T_15625_19 {0.9999708126424541321242145386349875479937,-0.0076402789994775609574984009952913766028} +#define T_15625_21 {0.9999643445684160969477716207620687782764,-0.0084445006873124263402363709474229835905} +#define T_15625_22 {0.9999608679843829950328881750465370714664,-0.0088466095154819957002700903103686869144} +#define T_15625_23 
{0.9999572297030813183837949509324971586466,-0.0092487169131229909901925623216811800376} +#define T_15625_24 {0.9999534297250993741812408188707195222378,-0.0096508228152132006366636929328706173692} +#define T_15625_26 {0.9999453446815790291779535436944570392370,-0.0104550298726536310123247375258870306425} +#define T_15625_27 {0.9999410596173479159887165224063210189342,-0.0108571308979606775957371667118422919884} +#define T_15625_28 {0.9999366128590513325136157618544530123472,-0.0112592301676306127955484726044232957065} +#define T_15625_29 {0.9999320044074083702057009759300854057074,-0.0116613276166425382240898755981106660329} +#define T_15625_31 {0.9999223024270901261090216394222807139158,-0.0124655167926102494052598146367927256506} +#define T_15625_32 {0.9999172088999838114986573600617703050375,-0.0128676083895257527961453192233420850243} +#define T_15625_33 {0.9999119536826687193453722102276515215635,-0.0132696979057027042436800456925993785262} +#define T_15625_34 {0.9999065367759947253745167472516186535358,-0.0136717852761217822238348773566940508317} +#define T_15625_36 {0.9998952178980997640422856420627795159817,-0.0144759533196107802871521741394644777756} +#define T_15625_37 {0.9998893159287092213816094954381696879864,-0.0148780338626438336790958061328637995757} +#define T_15625_38 {0.9998832522736202932378546393010765314102,-0.0152801119998453062420740167226540506817} +#define T_15625_39 {0.9998770269338135285863700119080021977425,-0.0156821876661977187283891765900989412330} +#define T_15625_41 {0.9998640912040995809562105023360345512629,-0.0164863313262874344733432963039376772940} +#define T_15625_42 {0.9998573808162839471336269525636453181505,-0.0168883991899918192713947462380019715056} +#define T_15625_43 {0.9998505087479341302270086089265532791615,-0.0172904643227813209493959334395185578614} +#define T_15625_44 {0.9998434750001611304170978655747603625059,-0.0176925266596405575403316845495282905176} +#define T_15625_46 
{0.9998289224709218103726016124710440635681,-0.0184966426855090008973814263981694239192} +#define T_15625_47 {0.9998214036918086078387091220065485686064,-0.0188986962444897543811350715259322896600} +#define T_15625_48 {0.9998137232379786798475151954335160553455,-0.0193007467474833570297843010621363646351} +#define T_15625_49 {0.9998058811106741439189704578893724828959,-0.0197027941294768034474760298735418473370} +#define T_15625_51 {0.9997897118407393923789072687213774770498,-0.0205068792704136936000391955303712165914} +#define T_15625_52 {0.9997813847007238630126835232658777385950,-0.0209089168993336800561610999693584744819} +#define T_15625_53 {0.9997728958924628761550934541446622461081,-0.0213109511472066096016142466851306380704} +#define T_15625_54 {0.9997642454173291115537836049043107777834,-0.0217129819490220962729942755231604678556} +#define T_15625_56 {0.9997464594720646413605891211773268878460,-0.0225170329544420073919486213753771153279} +#define T_15625_57 {0.9997373240048099685139959547086618840694,-0.0229190530280284909814270122296875342727} +#define T_15625_58 {0.9997280268764346944365684066724497824907,-0.0233210693955216777362160485154163325205} +#define T_15625_59 {0.9997185680884420611036489390244241803885,-0.0237230819919140717422223474386555608362} +#define T_15625_61 {0.9996991655397491349788197112502530217171,-0.0245270956113695787204065368314331863075} +#define T_15625_62 {0.9996892217821863324545006435073446482420,-0.0249290965044207946110788043370121158659} +#define T_15625_63 {0.9996791163712811512098710409190971404314,-0.0253310933663474513455327041810960508883} +#define T_15625_64 {0.9996688493086676174925742088817059993744,-0.0257330861321452136758480833123030606657} +#define T_15625_66 {0.9996478302349828259920627715473528951406,-0.0265370591153400170458809270712663419545} +#define T_15625_67 {0.9996370782273102939541331579675897955894,-0.0269390392027317356060667208339509670623} +#define T_15625_68 
{0.9996261645747271940010136859200429171324,-0.0273410149339839356197323638753005070612} +#define T_15625_69 {0.9996150892789983366526485042413696646690,-0.0277429862440956923053114735466806450859} +#define T_15625_71 {0.9995924537652930430553510632307734340429,-0.0285469153408977778518806900365234469064} +#define T_15625_72 {0.9995808935509769010963054824969731271267,-0.0289488729975898827218028230845447978936} +#define T_15625_73 {0.9995691717008353682771826242969837039709,-0.0293508259731451136287372349897850654088} +#define T_15625_74 {0.9995572882167640393902274809079244732857,-0.0297527742025662303437805178418784635141} +#define T_15625_76 {0.9995330363545443796979839135019574314356,-0.0305566561630209940214619734888401580974} +#define T_15625_77 {0.9995206679803176896825789299327880144119,-0.0309585897640640428374325665572541765869} +#define T_15625_78 {0.9995081379800042631345036170387174934149,-0.0313605183589917846376238230732269585133} +#define T_15625_79 {0.9994954463556304791183038105373270809650,-0.0317624418828109308932106102929537883028} +#define T_15625_81 {0.9994695782429366959220828903198707848787,-0.0325662734571543280304162237825948977843} +#define T_15625_82 {0.9994564017587997950542444414168130606413,-0.0329681813776961235573637054585560690612} +#define T_15625_83 {0.9994430636589681782311345159541815519333,-0.0333700839671644267081518364648218266666} +#define T_15625_84 {0.9994295639455990087895997930900193750858,-0.0337719811605701469847673479307559318841} +#define T_15625_86 {0.9994020796870052292248942649166565388441,-0.0345757590992418137321173787768202601001} +#define T_15625_87 {0.9993880951462250639139028862700797617435,-0.0349776397145339784655604375984694343060} +#define T_15625_88 {0.9993739490007957915196357134846039116383,-0.0353795146738160176669296674845099914819} +#define T_15625_89 {0.9993596412530049155620304190961178392172,-0.0357813839121032886692042040976957650855} +#define T_15625_91 
{0.9993305409596188182419496115471702069044,-0.0365851049657596946729221087935002287850} +#define T_15625_92 {0.9993157484187291661470453618676401674747,-0.0369869566511642419714966933952382532880} +#define T_15625_93 {0.9993007942848892710685504425782710313797,-0.0373888023556448803597085372985020512715} +#define T_15625_94 {0.9992856785605169767094935195927973836660,-0.0377906420142217133739670487102557672188} +#define T_15625_96 {0.9992549623499790145686461073637474328279,-0.0385943029337492901631456732047809055075} +#define T_15625_97 {0.9992393618687801515321211809350643306971,-0.0389961240647451676255563768336287466809} +#define T_15625_98 {0.9992235998069829427947752265026792883873,-0.0393979388899275417834644485992612317204} +#define T_15625_99 {0.9992076761671360163319377534207887947559,-0.0397997473443215121768901099130744114518} +#define T_15625_101 {0.9991753441636191945818268322909716516733,-0.0406033448808497815507578820870548952371} +#define T_15625_102 {0.9991589358051773395175132463918998837471,-0.0410051338330394768427744622840691590682} +#define T_15625_103 {0.9991423658791422912628377162036485970020,-0.0414069161545515540545636667957296594977} +#define T_15625_104 {0.9991256343881934620654305945208761841059,-0.0418086917804163793466187826197710819542} +#define T_15625_106 {0.9990916867224027830829413687752094119787,-0.0426122226853311095173815203906997339800} +#define T_15625_107 {0.9990744705530502089985134261951316148043,-0.0430139778344471729321085717856476549059} +#define T_15625_108 {0.9990570928297628494618720651487819850445,-0.0434157260280483292547160090180113911629} +#define T_15625_109 {0.9990395535553506789483435568399727344513,-0.0438174672011704471885629175176291028038} +#define T_15625_111 {0.9990039903645224761419285641750320792198,-0.0446209282261267534130944056869338965043} +#define T_15625_112 {0.9989859664538571770719954656669870018959,-0.0450226479480384036979678796797088580206} +#define T_15625_113 
{0.9989677810035684224487795290770009160042,-0.0454243603896259603991403253075986867771} +#define T_15625_114 {0.9989494340165968599976054065336938947439,-0.0458260654859310861963805905361368786544} +#define T_15625_116 {0.9989122554444985757626795930264052003622,-0.0466294533828665869190643888941849581897} +#define T_15625_117 {0.9988934238653837116572731247288174927235,-0.0470311360535862546528029781711666146293} +#define T_15625_118 {0.9988744307616098661029013783263508230448,-0.0474328111192020840380934032509685494006} +#define T_15625_119 {0.9988552761362482490525849243567790836096,-0.0478344785147617745924009113878128118813} +#define T_15625_121 {0.9988164823331775465931059443391859531403,-0.0486377900359097198323965471900010015815} +#define T_15625_122 {0.9987968431617414433176804777758661657572,-0.0490394340315996185575286858693289104849} +#define T_15625_123 {0.9987770424812638658806918101618066430092,-0.0494410700974366734694598335408954881132} +#define T_15625_124 {0.9987570802949465864628564304439350962639,-0.0498426981684748846013199852222896879539} +#define T_15625_126 {0.9987166714177307946798123339249286800623,-0.0506459300663772843398291456651350017637} +#define T_15625_127 {0.9986962247333666109483374384581111371517,-0.0510475337633559747563261055347538786009} +#define T_15625_128 {0.9986756165562311604588785485248081386089,-0.0514491292057648735847052989811345469207} +#define T_15625_129 {0.9986548468896569996644529965124092996120,-0.0518507163286645519906237211671395925805} +#define T_15625_131 {0.9986128231016526690666523791151121258736,-0.0526538653561852837414747341426846105605} +#define T_15625_132 {0.9985915689870179523524029718828387558460,-0.0530554271309342379003837208983895834535} +#define T_15625_133 {0.9985701533965350984445308313297573477030,-0.0534569803264298287759359595838759560138} +#define T_15625_134 {0.9985485763336673370460516707680653780699,-0.0538585248777394484664959861675015417859} +#define T_15625_136 
{0.9985049378047593515717039736045990139246,-0.0546615877880773995411978205538616748527} +#define T_15625_137 {0.9984828763457755940180504694581031799316,-0.0550631060172475719771689739445719169453} +#define T_15625_138 {0.9984606534285198620182200102135539054871,-0.0554646153425154830873822220382862724364} +#define T_15625_139 {0.9984382690565856144360168400453403592110,-0.0558661156989556165197718939907645108178} +#define T_15625_141 {0.9983930159631865253189175746229011565447,-0.0566690892456578262925681599426752654836} +#define T_15625_142 {0.9983701472490392747616283486422616988420,-0.0570705623060761974008059382867941167206} +#define T_15625_143 {0.9983471170948486905771801502851303666830,-0.0574720261379794011835642209007346536964} +#define T_15625_144 {0.9983239255043387938570731421350501477718,-0.0578734806764492834552093825095653301105} +#define T_15625_146 {0.9982770580293882645150915777776390314102,-0.0586763616134240509336628122127876849845} +#define T_15625_147 {0.9982533821525262363039132651465479284525,-0.0590777878821002050413468964507046621293} +#define T_15625_148 {0.9982295448545022154007710923906415700912,-0.0594792045976855893174572997850191313773} +#define T_15625_149 {0.9982055461391708961471636030182708054781,-0.0598806116952696823596546948920149588957} +#define T_15625_151 {0.9981570644721348140038230667414609342813,-0.0606833967767996806941255272249691188335} +#define T_15625_152 {0.9981325815282697799801781002315692603588,-0.0610847746309323563762561093426484148949} +#define T_15625_153 {0.9981079371827767943159415153786540031433,-0.0614861426074372885430818769236793741584} +#define T_15625_154 {0.9980831314396407805134003865532577037811,-0.0618875006414118314368444373485544929281} +#define T_15625_156 {0.9980330357765107018863659504859242588282,-0.0626901866221672432466505142656387761235} +#define T_15625_157 {0.9980077458646172683387476354255340993404,-0.0630915144391508975196813935326645150781} +#define T_15625_158 
{0.9979822945712820692065747607557568699121,-0.0634928320540097679280933107293094508350} +#define T_15625_159 {0.9979566819006209232867377068032510578632,-0.0638941394018493619144649642294098157436} +#define T_15625_161 {0.9979049724439128521424891005153767764568,-0.0646967230369009799195723076081776525825} +#define T_15625_162 {0.9978788756662276826503443771798629313707,-0.0650979991943323593739378907230275217444} +#define T_15625_163 {0.9978526175279396337458592824987135827541,-0.0654992648251831593997351887992408592254} +#define T_15625_164 {0.9978261980332946423644102651451248675585,-0.0659005198645672835011666279569908510894} +#define T_15625_166 {0.9977728749920488082736369506164919584990,-0.0667029979093996666650312477031548041850} +#define T_15625_167 {0.9977459714540704016627614691969938576221,-0.0671042207850843508420979333095601759851} +#define T_15625_168 {0.9977189065769801468164246216474566608667,-0.0675054328097752093240657700334850233048} +#define T_15625_169 {0.9976916803651546539200012375658843666315,-0.0679066339185948053547292602161178365350} +#define T_15625_171 {0.9976367439549340687676703964825719594955,-0.0687090031291193586993060193890414666384} +#define T_15625_172 {0.9976090337654224260433011295390315353870,-0.0691101711010783242850052943140326533467} +#define T_15625_173 {0.9975811622589422222873167811485473066568,-0.0695113278976740855963356580105028115213} +#define T_15625_174 {0.9975531294400005188904856368026230484247,-0.0699124734540381292946520375153340864927} +#define T_15625_176 {0.9974965798828904217643298579787369817495,-0.0707147305866062253487669408968940842897} +#define T_15625_177 {0.9974680631538663799773303253459744155407,-0.0711158420330824103672284763888455927372} +#define T_15625_178 {0.9974393851306693203895292754168622195721,-0.0715169419798711708535776665485172998160} +#define T_15625_179 {0.9974105458179365335524835245450958609581,-0.0719180303621132221980616350265336222947} +#define T_15625_181 
{0.9973523833425437246091860288288444280624,-0.0727201721735293155068902137827535625547} +#define T_15625_182 {0.9973230601892887348114413725852500647306,-0.0731212254729941557584993461205158382654} +#define T_15625_183 {0.9972935757653081934037686551164370030165,-0.0735222669484938556871966852668265346438} +#define T_15625_184 {0.9972639300753698421431181486696004867554,-0.0739232965351785814567264765173604246229} +#define T_15625_186 {0.9972041549168210172737758512084838002920,-0.0747253197827133369690599806745012756437} +#define T_15625_187 {0.9971740254578764783843780605820938944817,-0.0751263133138733163463029995909892022610} +#define T_15625_188 {0.9971437347523058880582880192378070205450,-0.0755272946968382624666205060748325195163} +#define T_15625_189 {0.9971132828050073282355469928006641566753,-0.0759282638667680281896465999125211965293} +#define T_15625_191 {0.9970518952049490790656705030414741486311,-0.0767301653081714080117947673898015636951} +#define T_15625_192 {0.9970209595621160048040110268630087375641,-0.0771310974499746643262554357534099835902} +#define T_15625_193 {0.9969898626974081912166525398788508027792,-0.0775320171194020801852531121767242439091} +#define T_15625_194 {0.9969586046158540604267273010918870568275,-0.0779329242516234865778557150406413711607} +#define T_15625_196 {0.9968956048224510979594015225302428007126,-0.0787347006451378506053373484974144957960} +#define T_15625_197 {0.9968638631207895617336589566548354923725,-0.0791355697767806842701787672922364436090} +#define T_15625_198 {0.9968319602226562992086655867751687765121,-0.0795364261119172882841255045605066698045} +#define T_15625_199 {0.9967998961332101837129471277876291424036,-0.0799372695857277354436476457522076088935} +#define T_15625_201 {0.9967352844011448942396214079053606837988,-0.0807389176901009975040324206929653882980} +#define T_15625_202 {0.9967027367689735850575516451499424874783,-0.0811397221910344079720545096279238350689} +#define T_15625_203 
{0.9966700279663852635181342520809266716242,-0.0815405135713829498644855675593134947121} +#define T_15625_204 {0.9966371579986690321106834744568914175034,-0.0819412917663371875853783876664238050580} +#define T_15625_206 {0.9965709345891399228989371295028831809759,-0.0827428083408358466810383902156900148839} +#define T_15625_207 {0.9965375811580354792340585845522582530975,-0.0831435465907721382716744074059533886611} +#define T_15625_208 {0.9965040665832203270468880873522721230984,-0.0835442713960979355114488953404361382127} +#define T_15625_209 {0.9964703908701137979875284145236946642399,-0.0839449826920145997233291268457833211869} +#define T_15625_211 {0.9964025560508348311472559544199611991644,-0.0847463644964368822964928540386608801782} +#define T_15625_212 {0.9963683969556315078719421762798447161913,-0.0851470348753561867560790688003180548549} +#define T_15625_213 {0.9963340767440751477579397032968699932098,-0.0855476914856937303843764652810932602733} +#define T_15625_214 {0.9962995954217154226384423054696526378393,-0.0859483342626618934678717209862952586263} +#define T_15625_216 {0.9962301494669149048988288086547981947660,-0.0867495780573508540323146576156432274729} +#define T_15625_217 {0.9961951848457037961281912430422380566597,-0.0871501789455076530943600232603785116225} +#define T_15625_218 {0.9961600591361486900510158193355891853571,-0.0875507657411670886515864253851759713143} +#define T_15625_219 {0.9961247723439294876612848383956588804722,-0.0879513383795528375097561024631431791931} +#define T_15625_221 {0.9960537155343489601477813266683369874954,-0.0887524409254094592824912979267537593842} +#define T_15625_222 {0.9960179455284776661727619284647516906261,-0.0891529707033391488613105479998921509832} +#define T_15625_223 {0.9959820144629226712496006257424596697092,-0.0895534860649128683363073832879308611155} +#define T_15625_224 {0.9959459223434939945107657877088058739901,-0.0899539869453658130771245282630843576044} +#define T_15625_226 
{0.9958732549663867894551572135242167860270,-0.0907549450038620669767297499674896243960} +#define T_15625_227 {0.9958366797204587506087136716814711689949,-0.0911554020523875629944399179294123314321} +#define T_15625_228 {0.9957999434441583419541643706907052546740,-0.0915558443607567135069658093016187194735} +#define T_15625_229 {0.9957630461434258117847662106214556843042,-0.0919562718642165793925258299168490339071} +#define T_15625_231 {0.9956887684925561643467517569661140441895,-0.0927570821974085385486219479389546904713} +#define T_15625_232 {0.9956513881544301058923451819282490760088,-0.0931574648976467301064730008874903433025} +#define T_15625_233 {0.9956138468158939325292067223927006125450,-0.0935578325339878197342002863479137886316} +#define T_15625_234 {0.9955761444830181217113818092911969870329,-0.0939581850416909281076272009158856235445} +#define T_15625_236 {0.9955002568586601707778527270420454442501,-0.0947588444122318823703565726646047551185} +#define T_15625_237 {0.9954620715794494367756328756513539701700,-0.0951591511456002236979401232019881717861} +#define T_15625_238 {0.9954237253304415444787878186616580933332,-0.0955594424913916024033611051891057286412} +#define T_15625_239 {0.9953852181178375335690589054138399660587,-0.0959597183848774765158040622736734803766} +#define T_15625_241 {0.9953077208267736564195615756034385412931,-0.0967602235560309775763698780792765319347} +#define T_15625_242 {0.9952687307608455435925520760065410286188,-0.0971604527042540383474644727357372175902} +#define T_15625_243 {0.9952295797563842638666642415046226233244,-0.0975606661412824760493123221749556250870} +#define T_15625_244 {0.9951902678197207530175205647537950426340,-0.0979608638024003497424274655713816173375} +#define T_15625_246 {0.9951111611752406771458367984450887888670,-0.0987612115380533117647843255326733924448} +#define T_15625_247 {0.9950713664802159907907253000303171575069,-0.0991613614831692996576251175611105281860} +#define T_15625_248 
{0.9950314108785726086026102166215423494577,-0.0995614953935365226689313544738979544491} +#define T_15625_249 {0.9949912943767716955179025717370677739382,-0.0999616132044518768129393038179841823876} +#define T_15625_251 {0.9949105786986710553421175973198842257261,-0.1007618002691276770654837946494808420539} +#define T_15625_252 {0.9948699795354232211508360705920495092869,-0.1011618693934949880786078324490517843515} +#define T_15625_253 {0.9948292194981218639071585130295716226101,-0.1015619221596242016669719987476128153503} +#define T_15625_254 {0.9947882985933580446413770914659835398197,-0.1019619585028253561098665613826597109437} +#define T_15625_256 {0.9947059742079370492362500044691842049360,-0.1027619816616968800859766020039387512952} +#define T_15625_257 {0.9946645707405920022736722785339225083590,-0.1031619683480006210984925019147340208292} +#define T_15625_258 {0.9946230064324087516425265675934497267008,-0.1035619383526430875575030654545116703957} +#define T_15625_259 {0.9945812812901083654892886443121824413538,-0.1039618916109476681741696779681660700589} +#define T_15625_261 {0.9944973485301705773409253197314683347940,-0.1047617476298504102238950963510433211923} +#define T_15625_262 {0.9944551409261054297772375321073923259974,-0.1051616502611089770669039467065886128694} +#define T_15625_263 {0.9944127725150676555543327594932634383440,-0.1055615358873505105208323584520258009434} +#define T_15625_264 {0.9943702433039082189125679178687278181314,-0.1059614044439120411622567985432397108525} +#define T_15625_266 {0.9942847025087594436953963850100990384817,-0.1067610900893571912462221007444895803928} +#define T_15625_267 {0.9942416909386021517391895940818358212709,-0.1071609070489287496297237112230504862964} +#define T_15625_268 {0.9941985185959878457140348473330959677696,-0.1075607066801962663493696936711785383523} +#define T_15625_269 {0.9941551854878976079987751290900632739067,-0.1079604889185106775251199451304273679852} +#define T_15625_271 
{0.9940680370033438961741012462880462408066,-0.1087600009576981524572758530666760634631} +#define T_15625_272 {0.9940242216409725939385566562123131006956,-0.1091597306292872437971652743726735934615} +#define T_15625_273 {0.9939802455413098147829487061244435608387,-0.1095594426493552986379143021622439846396} +#define T_15625_274 {0.9939361087114667592246064486971590667963,-0.1099591369532674778319858432951150462031} +#define T_15625_276 {0.9938473528898136288844966657052282243967,-0.1107584721540990219112998715900175739080} +#define T_15625_277 {0.9938027339123557402089659262856002897024,-0.1111581129217630165006980291764193680137} +#define T_15625_278 {0.9937579542334218363208719893009401857853,-0.1115577357147603809739422331404057331383} +#define T_15625_279 {0.9937130138602527917868201257078908383846,-0.1119573404684706813272399017478164751083} +#define T_15625_281 {0.9936226510603035633195645459636580199003,-0.1127564955995629253360235111358633730561} +#define T_15625_282 {0.9935772286481353576803599025879520922899,-0.1131560458477186004166981092566857114434} +#define T_15625_283 {0.9935316455709559679831954781548120081425,-0.1135555777981347158611669101446750573814} +#define T_15625_284 {0.9934859018361364979554650744830723851919,-0.1139550913862055203651379997609183192253} +#define T_15625_286 {0.9933939324231908507556454424047842621803,-0.1147540632169030405673737504912423901260} +#define T_15625_287 {0.9933477067599363330430151108885183930397,-0.1151535213303331306455845606251386925578} +#define T_15625_288 {0.9933013204687852759633415189455263316631,-0.1155529608230247001765533809702901635319} +#define T_15625_289 {0.9932547735572384572932946866785641759634,-0.1159523816303869303556695058432524092495} +#define T_15625_291 {0.9931611979030907644272474499302916228771,-0.1167511669307753074953382110834354534745} +#define T_15625_292 {0.9931141691756213418784682289697229862213,-0.1171505312946349991465311290994577575475} +#define T_15625_293 
{0.9930669798580192830783630597579758614302,-0.1175498767148324547049398347553506027907} +#define T_15625_294 {0.9930196299579151508751806431973818689585,-0.1179492031267921209325422182701004203409} +#define T_15625_296 {0.9929244484408533688579723275324795395136,-0.1187477986677109992319500975099799688905} +#define T_15625_297 {0.9928766168392868518566274360637180507183,-0.1191470676675344675388146242767106741667} +#define T_15625_298 {0.9928286246860006380998697750328574329615,-0.1195463174008485895960518519132165238261} +#define T_15625_299 {0.9927804719887554085744341136887669563293,-0.1199455478030932720212575759433093480766} +#define T_15625_301 {0.9926836849935590789684169976681005209684,-0.1207439503561494320571512162132421508431} +#define T_15625_302 {0.9926350507112590149105812997731845825911,-0.1211431223778563631698901303934690076858} +#define T_15625_303 {0.9925862559163013409957443400344345718622,-0.1215422748102847200213361134046863298863} +#define T_15625_304 {0.9925373006165764122599171059846412390471,-0.1219414075888901466404590223646664526314} +#define T_15625_306 {0.9924389085345157734963095208513550460339,-0.1227396139264705365867769160104217007756} +#define T_15625_307 {0.9923894717680904475898273631173651665449,-0.1231386873563726641611637546702695544809} +#define T_15625_308 {0.9923398745287185240826488552556838840246,-0.1235377408743062038753990350414824206382} +#define T_15625_309 {0.9922901168244201430823636655986774712801,-0.1239367744157427453366437930526444688439} +#define T_15625_311 {0.9921901200532537989928982824494596570730,-0.1247347813110275122072678755102970171720} +#define T_15625_312 {0.9921398810025556791458711813902482390404,-0.1251337545358351399649166069139027968049} +#define T_15625_313 {0.9920894815192707882900435834017116576433,-0.1255327075260646296772648611295153386891} +#define T_15625_314 {0.9920389216115488295599789125844836235046,-0.1259316402172038773521478560724062845111} +#define T_15625_316 
{0.9919373205555226391538781172130256891251,-0.1267294444441794398770184670866001397371} +#define T_15625_317 {0.9918862794236475988185475216596387326717,-0.1271283158510079502878653556763310916722} +#define T_15625_318 {0.9918350779001940953349958363105542957783,-0.1275271667007306097829655300301965326071} +#define T_15625_319 {0.9917837159934415058870627035503275692463,-0.1279259969288517873042820838236366398633} +#define T_15625_321 {0.9916805110632866959718967336812056601048,-0.1287235952623238810499373130369349382818} +#define T_15625_322 {0.9916286680565730149439218621409963816404,-0.1291223632387002717702984000425203703344} +#define T_15625_323 {0.9915766646999375488746863993583247065544,-0.1295211103355261850200719209169619716704} +#define T_15625_324 {0.9915245010017894600196086685173213481903,-0.1299198364883227541088928091994603164494} +#define T_15625_326 {0.9914196926147210708890611385868396610022,-0.1307172257039294349656444182983250357211} +#define T_15625_327 {0.9913670479427487691737042041495442390442,-0.1311158886377988552762730023459880612791} +#define T_15625_328 {0.9913142429631595087258233434113208204508,-0.1315145303697575485113446802643011324108} +#define T_15625_329 {0.9912612776844921258501130978402215987444,-0.1319131508353437176594979973742738366127} +#define T_15625_331 {0.9911548662642074569717465237772557884455,-0.1327103277095684485953341891217860393226} +#define T_15625_332 {0.9911014201397972955831505714741069823503,-0.1331088839893006525727514599566347897053} +#define T_15625_333 {0.9910478137507230389502410616842098534107,-0.1335074187448476168427191623777616769075} +#define T_15625_334 {0.9909940471056533084492912166751921176910,-0.1339059319117648083619087628903798758984} +#define T_15625_336 {0.9908860330823298090408002281037624925375,-0.1347028932219494490318822954577626660466} +#define T_15625_337 {0.9908317857215424018235694347822573035955,-0.1351013412363453458642226223673787899315} +#define T_15625_338 
{0.9907773781396919110520116191764827817678,-0.1354997674043685040867757152227568440139} +#define T_15625_339 {0.9907228103455761880624663717753719538450,-0.1358981716615919876911533492602757178247} +#define T_15625_341 {0.9906131941558702358463506243424490094185,-0.1366949141859498118023452661873307079077} +#define T_15625_342 {0.9905581457780053833417355235724244266748,-0.1370932523242480161052014864253578707576} +#define T_15625_343 {0.9905029372233259410407413270149845629930,-0.1374915682940743011375417381714214570820} +#define T_15625_344 {0.9904475685007593233066813809273298829794,-0.1378898620310195222149474147954606451094} +#define T_15625_346 {0.9903363505878041150864987685054074972868,-0.1386863825486482904025820062088314443827} +#define T_15625_347 {0.9902805014153996943093716254225000739098,-0.1390846092005315892681238665318232960999} +#define T_15625_348 {0.9902244921110768816063796293747145682573,-0.1394828133619334109560838896868517622352} +#define T_15625_349 {0.9901683226838924323232049573562107980251,-0.1398809949684626519061936278376379050314} +#define T_15625_351 {0.9900555034972960966044297492771875113249,-0.1406772902593576013430265447823330760002} +#define T_15625_352 {0.9899988537561273949094697854889091104269,-0.1410754038149594769002703742444282397628} +#define T_15625_353 {0.9899420439285836481602132153057027608156,-0.1414734945581611891274320669253938831389} +#define T_15625_354 {0.9898850740238512857516184340056497603655,-0.1418715624245900641664519525875221006572} +#define T_15625_356 {0.9897706540196954394517092623573262244463,-0.1426676292696569259277339369873516261578} +#define T_15625_357 {0.9897132039387741553326804933021776378155,-0.1430656281195680223916610884771216660738} +#define T_15625_358 {0.9896555938176683220319773681694641709328,-0.1434636038352525844175033853389322757721} +#define T_15625_359 {0.9895978236656939319715320380055345594883,-0.1438615563523564233605611661914736032486} +#define T_15625_361 
{0.9894818033065315709961851098341867327690,-0.1446573915334244397890017808094853535295} +#define T_15625_362 {0.9894235531181044818183067945938091725111,-0.1450552740686991970431307663602638058364} +#define T_15625_363 {0.9893651429363305993547328398562967777252,-0.1454531331480144462631187707302160561085} +#define T_15625_364 {0.9893065727706551459874617648893035948277,-0.1458509687070349836179872227148734964430} +#define T_15625_366 {0.9891889525255088688737714619492180645466,-0.1466465690068699256887185811137896962464} +#define T_15625_367 {0.9891299024650577198514156407327391207218,-0.1470443336190329075563454352959524840117} +#define T_15625_368 {0.9890706924587441273288845877686981111765,-0.1474420744535983041068050169997150078416} +#define T_15625_369 {0.9890113225161425436482431905460543930531,-0.1478397914462500351007179233420174568892} +#define T_15625_371 {0.9888921028605025531632577440177556127310,-0.1486351536485671920306828042157576419413} +#define T_15625_372 {0.9888322531667425030477147629426326602697,-0.1490327987296197198574532194470521062613} +#define T_15625_373 {0.9887722435752508420847561865230090916157,-0.1494304197115327026423869938298594206572} +#define T_15625_374 {0.9887120740957314746211181955004576593637,-0.1498280165300094335378844334627501666546} +#define T_15625_376 {0.9885912555115539124273027482558973133564,-0.1506231374194866579063756262257811613381} +#define T_15625_377 {0.9885306064264324232837566341913770884275,-0.1510206613619132498538277786792605184019} +#define T_15625_378 {0.9884697974923568608573987148702144622803,-0.1514181608837557591051847794005880132318} +#define T_15625_379 {0.9884088287191602484327290767396334558725,-0.1518156359207371297603827997590997256339} +#define T_15625_381 {0.9882864116948648636196139705134555697441,-0.1526105122830277438517043719912180677056} +#define T_15625_382 {0.9882249634635613677602350435336120426655,-0.1530079134798026652131142100188299082220} +#define T_15625_383 
{0.9881633554327273749606774799758568406105,-0.1534052899346476928066351774759823456407} +#define T_15625_384 {0.9881015876123250274432052719930652529001,-0.1538026415833056159687686204051715321839} +#define T_15625_386 {0.9879775726427935111928491096477955579758,-0.1545972702050515401595021103275939822197} +#define T_15625_387 {0.9879153255137179678868619703280273824930,-0.1549945470496452148978505647392012178898} +#define T_15625_388 {0.9878529186351814317390562791842967271805,-0.1553917988310631881354595407174201682210} +#define T_15625_389 {0.9877903520172752749317623965907841920853,-0.1557890254850684830234541777826962061226} +#define T_15625_391 {0.9876647396038491510950052543194033205509,-0.1565834031539130866139686304450151510537} +#define T_15625_392 {0.9876016938286410473679666210955474525690,-0.1569805540402986754333625185608980245888} +#define T_15625_393 {0.9875384883546873826531964368768967688084,-0.1573776795423641183813145971726044081151} +#define T_15625_394 {0.9874751231922085370484865052276290953159,-0.1577747795958927834458762617941829375923} +#define T_15625_396 {0.9873479138426870527212031447561457753181,-0.1585689031004939575364431902926298789680} +#define T_15625_397 {0.9872840696762145151765821537992451339960,-0.1589659264231538249312336574803339317441} +#define T_15625_398 {0.9872200658623573321648336786893196403980,-0.1593629240404516866913553485574084334075} +#define T_15625_399 {0.9871559024114652247661183537275064736605,-0.1597598958881915887086933025784674100578} +#define T_15625_401 {0.9870270966401034629100763595488388091326,-0.1605537620182347080532991867585224099457} +#define T_15625_402 {0.9869624543404621475062299396086018532515,-0.1609506561721668893572001479697064496577} +#define T_15625_403 {0.9868976524454426968446796308853663504124,-0.1613475242997991221383102811159915290773} +#define T_15625_404 {0.9868326909655235068541401233233045786619,-0.1617443663369564355036800407106056809425} +#define T_15625_406 
{0.9867022892930302768732531148998532444239,-0.1625379718831673203638388258696068078279} +#define T_15625_407 {0.9866368491215424807450062871794216334820,-0.1629347352638920165546210228058043867350} +#define T_15625_408 {0.9865712494073279881234839194803498685360,-0.1633314722974840704772248045628657564521} +#define T_15625_409 {0.9865054901609944248974670699681155383587,-0.1637281829197896609873197348861140199006} +#define T_15625_411 {0.9863734931145299311694429889030288904905,-0.1645215246739475667414609461047803051770} +#define T_15625_412 {0.9863072553357433713827617793867830187082,-0.1649181556775136947567972356409882195294} +#define T_15625_413 {0.9862408580675265934445405946462415158749,-0.1653147600132209571466290753960493020713} +#define T_15625_414 {0.9861743013206162311590219360368791967630,-0.1657113376169369878265769102654303424060} +#define T_15625_416 {0.9860407094337902966785236458235885947943,-0.1665044123718876223350093823682982474566} +#define T_15625_417 {0.9859736743154770000074904601206071674824,-0.1669009093948791710992907155741704627872} +#define T_15625_418 {0.9859064797616747810593551548663526773453,-0.1672973794293934612920082827258738689125} +#define T_15625_419 {0.9858391257832490595092167495749890804291,-0.1676938224113198316889139505292405374348} +#define T_15625_421 {0.9857039395961186833972078602528199553490,-0.1684866269609882061253358642716193571687} +#define T_15625_422 {0.9856361074092740981456017834716476500034,-0.1688829884005308701322434217217960394919} +#define T_15625_423 {0.9855681158415263709571263461839407682419,-0.1692793225310869342781217028459650464356} +#define T_15625_424 {0.9854999649038697073777370860625524073839,-0.1696756292885677752657613837072858586907} +#define T_15625_426 {0.9853631849629371775023400914506055414677,-0.1704681604279712769933752269935212098062} +#define T_15625_427 {0.9852945559817788412360073380114044994116,-0.1708643846817387845771207821599091403186} +#define T_15625_428 
{0.9852257676749475168165304239664692431688,-0.1712605813061208182013217538042226806283} +#define T_15625_429 {0.9851568200535663066830238676629960536957,-0.1716567502370509312736857054915162734687} +#define T_15625_431 {0.9850184469117764241019585824687965214252,-0.1724490047623121746767083095619454979897} +#define T_15625_432 {0.9849490214137431864926952584937680512667,-0.1728450902285328938390307484951335936785} +#define T_15625_433 {0.9848794366459110438682955646072514355183,-0.1732411477450809256239949718292336910963} +#define T_15625_434 {0.9848096926195319955610329998307861387730,-0.1736371772479123054555572025492438115180} +#define T_15625_436 {0.9846697268362705202093820844311267137527,-0.1744291519562722048153347031984594650567} +#define T_15625_437 {0.9845995051020210997450021750410087406635,-0.1748250970337355547634672348067397251725} +#define T_15625_438 {0.9845291241544907867577762772270943969488,-0.1752210138413519691091835284169064834714} +#define T_15625_439 {0.9844585840050603664508344081696122884750,-0.1756169023151002983595958539808634668589} +#define T_15625_441 {0.9843170261461514636280867307505104690790,-0.1764085940049308909305381121157552115619} +#define T_15625_442 {0.9842460084595634484116999374236911535263,-0.1768043970929937813707510940730571746826} +#define T_15625_443 {0.9841748316168561494521327404072508215904,-0.1772001715911498131994505911279702559114} +#define T_15625_444 {0.9841034956295391378233716750401072204113,-0.1775959174354008462959342296016984619200} +#define T_15625_446 {0.9839603462672429357027681362524162977934,-0.1783873229062183651816297924597165547311} +#define T_15625_447 {0.9838885329154114511851503266370855271816,-0.1787829824048117743906516352581093087792} +#define T_15625_448 {0.9838165604652658879913929013127926737070,-0.1791786129935539484403506094167823903263} +#define T_15625_449 {0.9837444289284443810217339887458365410566,-0.1795742146084700341379658539153751917183} +#define T_15625_451 
{0.9835996886414548612265207339078187942505,-0.1803653306609477591226919912514858879149} +#define T_15625_452 {0.9835270799146919040722991667280439287424,-0.1807608449705830622189495215934584848583} +#define T_15625_453 {0.9834543121480631144848416624881792813540,-0.1811563300505396878481434441710007376969} +#define T_15625_454 {0.9833813853533351911906379427819047123194,-0.1815517858368662362789081043956684879959} +#define T_15625_456 {0.9832350547267777463034121865348424762487,-0.1823426092728474001702920759271364659071} +#define T_15625_457 {0.9831616509186105190565285738557577133179,-0.1827379767946230304520582876648404635489} +#define T_15625_458 {0.9830880881296686357728731309180147945881,-0.1831333147670106131776890379114774987102} +#define T_15625_459 {0.9830143663718472479828847099270205944777,-0.1835286231260825628996968816863954998553} +#define T_15625_461 {0.9828664459972765721218479484377894550562,-0.1843191507485933688936796670532203279436} +#define T_15625_462 {0.9827922474044463729825338305090554058552,-0.1847143698842010350880116220650961622596} +#define T_15625_463 {0.9827178898905754023118674922443460673094,-0.1851095591508307991457371599608450196683} +#define T_15625_464 {0.9826433734676874864888418414921034127474,-0.1855047184845791397034275860278285108507} +#define T_15625_466 {0.9824938639430845777056333645305130630732,-0.1862949470978416122157739209797000512481} +#define T_15625_467 {0.9824188708655459123519904096610844135284,-0.1866900162495729598166605001097195781767} +#define T_15625_468 {0.9823437189273427350144629599526524543762,-0.1870850552128572874543976922723231837153} +#define T_15625_469 {0.9822684081406273248759930538653861731291,-0.1874800639238153598942204780541942454875} +#define T_15625_471 {0.9821173100703979308434554695850238204002,-0.1882699903332603341699069687820156104863} +#define T_15625_472 {0.9820415228113168470969185364083386957645,-0.1886649079040133014650848508608760312200} +#define T_15625_473 
{0.9819655767525898859560129494639113545418,-0.1890597949669721722365522964537376537919} +#define T_15625_474 {0.9818894719064976683853274153079837560654,-0.1894546514582822749339641177357407286763} +#define T_15625_476 {0.9817367859014688447061303122609388083220,-0.1902442724705622478786892770585836842656} +#define T_15625_477 {0.9816602047672221553753502121253404766321,-0.1906390368638475885099126116983825340867} +#define T_15625_478 {0.9815834648949900431702531022892799228430,-0.1910337704301150740793246995963272638619} +#define T_15625_479 {0.9815065662971814708370743574050720781088,-0.1914284731055348465211807251762365922332} +#define T_15625_481 {0.9813522929746001377537822918384335935116,-0.1922177855285369107996018556150374934077} +#define T_15625_482 {0.9812749182747738663223913135880138725042,-0.1926123951484845775450338578593800775707} +#define T_15625_483 {0.9811973848992642244226658476691227406263,-0.1930069736223152532250679769276757724583} +#define T_15625_484 {0.9811196928606088496493953243771102279425,-0.1934015208862241430587403101526433601975} +#define T_15625_486 {0.9809638328441384613753939447633456438780,-0.1941905215290829211927103870038990862668} +#define T_15625_487 {0.9808856648915266207566787670657504349947,-0.1945849747804485607716173944936599582434} +#define T_15625_488 {0.9808073383261751709838449642120394855738,-0.1949793965667239725725323751248652115464} +#define T_15625_489 {0.9807288531607497583664212470466736704111,-0.1953737868241297581661797266860958188772} +#define T_15625_491 {0.9805714070804683046844729688018560409546,-0.1961624724972402256106818185799056664109} +#define T_15625_492 {0.9804924461910717869983500349917449057102,-0.1965567677854116179769761174611630849540} +#define T_15625_493 {0.9804133267525205752690453664399683475494,-0.1969510312896467496557306731119751930237} +#define T_15625_494 {0.9803340487776084355431294170557521283627,-0.1973452629461918406139631088080932386220} +#define T_15625_496 
{0.9801750172700056662478118596482090651989,-0.1981336304612222876109228764107683673501} +#define T_15625_497 {0.9800952637630307995308953650237526744604,-0.1985277661922258130022811428716522641480} +#define T_15625_498 {0.9800153517711270856338501289428677409887,-0.1989218698205755253560766959708416834474} +#define T_15625_499 {0.9799352813072165213625908108951989561319,-0.1993159412825435405913765407603932544589} +#define T_15625_501 {0.9797746650151912817250376974698156118393,-0.2001039874524484230011722729614120908082} +#define T_15625_502 {0.9796941192130488307299174266518093645573,-0.2004979620329554457214271678822115063667} +#define T_15625_503 {0.9796134149908439781029301229864358901978,-0.2008919041922209158812506757385563105345} +#define T_15625_504 {0.9795325523616269514093346515437588095665,-0.2012858138665430396407884927612030878663} +#define T_15625_506 {0.9793703519344848507088840960932429879904,-0.2020735355055759130404879897469072602689} +#define T_15625_507 {0.9792890141627882405472860227746423333883,-0.2024673473429092762643222158658318221569} +#define T_15625_508 {0.9792075180365362729872913405415602028370,-0.2028611264405444369884889965760521590710} +#define T_15625_509 {0.9791258635689069622642932699818629771471,-0.2032548727348058859032420286894193850458} +#define T_15625_511 {0.9789620796623577092532286769710481166840,-0.2040422666585322286625370225010556168854} +#define T_15625_512 {0.9788799502499221372175952637917362153530,-0.2044359141606726937290261503221699967980} +#define T_15625_513 {0.9787976625490782955907320683763828128576,-0.2048295286047906449411470930499490350485} +#define T_15625_514 {0.9787152165731322073227715918619651347399,-0.2052231099272371628305933199953869916499} +#define T_15625_516 {0.9785498498492867236464576308208052068949,-0.2060101729525472824544607419738895259798} +#define T_15625_517 {0.9784669291281276048977133541484363377094,-0.2064036545281399404050404200461343862116} +#define T_15625_518 
{0.9783838501853471258939975996327120810747,-0.2067971027275193884875648109300527721643} +#define T_15625_519 {0.9783006130343795403447870739910285919905,-0.2071905174870636023864989283538307063282} +#define T_15625_521 {0.9781336641617474070287130416545551270247,-0.2079772464321854863467109453267767094076} +#define T_15625_522 {0.9780499524670790423286348413967061787844,-0.2083705604905462249742953417808166705072} +#define T_15625_523 {0.9779660826182160482389349454024340957403,-0.2087638408546379775732049211001140065491} +#define T_15625_524 {0.9778820546287203541169219533912837505341,-0.2091570874608658925364323977191816084087} +#define T_15625_526 {0.9777135242822072580537451358395628631115,-0.2099434791453780591030664481877465732396} +#define T_15625_527 {0.9776290219524417235419377902871929109097,-0.2103366240964999467344398453860776498914} +#define T_15625_528 {0.9775443615365474459011352337256539613008,-0.2107297350354332965416404022107599303126} +#define T_15625_529 {0.9774595430482142521810828839079476892948,-0.2111228118986106239152178432050277478993} +#define T_15625_531 {0.9772894319091189885284620686434209346771,-0.2119088631434550007437422891598544083536} +#define T_15625_532 {0.9772041392858643593655187942204065620899,-0.2123018373980147532886775252336519770324} +#define T_15625_533 {0.9771186886451860287294834961357992142439,-0.2126947773226038618243904920745990239084} +#define T_15625_534 {0.9770330800009017213625384101760573685169,-0.2130876828536825973170465431394404731691} +#define T_15625_536 {0.9768613887569135290078747857478447258472,-0.2138733904811774000354063218765077181160} +#define T_15625_537 {0.9767753061849727691523526118544396013021,-0.2142661924505417370134807697468204423785} +#define T_15625_538 {0.9766890656649522828303133792360313236713,-0.2146589597722922126976641266082879155874} +#define T_15625_539 {0.9766026672107972483871662916499190032482,-0.2150516923829168813853129904600791633129} +#define T_15625_541 
{0.9764293965559933674569492723094299435616,-0.2158370532167694366698640351387439295650} +#define T_15625_542 {0.9763425243833629973977394911344163119793,-0.2162296813130016037707292753111687488854} +#define T_15625_543 {0.9762554943326351430954446186660788953304,-0.2166222744441166914164398349385010078549} +#define T_15625_544 {0.9761683064178828805879106766951736062765,-0.2170148325466309813247534066249500028789} +#define T_15625_546 {0.9759934570527252217786440269264858216047,-0.2177998434119504667094702199392486363649} +#define T_15625_547 {0.9759057956305936532004352557123638689518,-0.2181922960478166473308192507829517126083} +#define T_15625_548 {0.9758179764009851098194303631316870450974,-0.2185847134012038617267847939729108475149} +#define T_15625_549 {0.9757299993781004543436097264930140227079,-0.2189770954086567855689793304918566718698} +#define T_15625_551 {0.9755535720094331564311573856684844940901,-0.2197617531319673023215699458887684158981} +#define T_15625_552 {0.9754651216921793599468060165236238390207,-0.2201540287209428903292263157709385268390} +#define T_15625_553 {0.9753765136387072542945020359184127300978,-0.2205462687102202889999347235061577521265} +#define T_15625_554 {0.9752877478633451557854527891322504729033,-0.2209384730363728732704942103737266734242} +#define T_15625_556 {0.9751097432043913659782674585585482418537,-0.2217227744456260196681540719509939663112} +#define T_15625_557 {0.9750205043495836498834705707849934697151,-0.2221148714019021974674927832893445156515} +#define T_15625_558 {0.9749311078304538913386068088584579527378,-0.2225069324414049032334617095330031588674} +#define T_15625_559 {0.9748415536614577492358080235135275870562,-0.2228989575007364054570757616602350026369} +#define T_15625_561 {0.9746619724318171806842769910872448235750,-0.2236828994253242108847246072400594130158} +#define T_15625_562 {0.9745719454002115256230354134459048509598,-0.2240748161638143332030637111529358662665} +#define T_15625_563 
{0.9744817607768173628457475388131570070982,-0.2244666966686008069409297149832127615809} +#define T_15625_564 {0.9743914185762180268923771109257359057665,-0.2248585408763151827216120182129088789225} +#define T_15625_566 {0.9742102615018635169974459131481125950813,-0.2256421201470830140145551467867335304618} +#define T_15625_567 {0.9741194466574022436944346736709121614695,-0.2260338550834288806612448752275668084621} +#define T_15625_568 {0.9740284742943230433809276291867718100548,-0.2264255534692874993751843248901423066854} +#define T_15625_569 {0.9739373444273368152224179539189208298922,-0.2268172152413198139342398462758865207434} +#define T_15625_571 {0.9737546122406119941672386630671098828316,-0.2276004286905790319206488447889569215477} +#define T_15625_572 {0.9736630099504219870709675888065248727798,-0.2279919802411574936140681302276789210737} +#define T_15625_573 {0.9735712502154217906635835788620170205832,-0.2283834949246128231958863352701882831752} +#define T_15625_574 {0.9734793330504489805576895378180779516697,-0.2287749726776356629098785333553678356111} +#define T_15625_576 {0.9732950264900650516608493489911779761314,-0.2295578171391765009978769285226007923484} +#define T_15625_577 {0.9732026371244568707652433658950030803680,-0.2299491837211057043699469204511842690408} +#define T_15625_578 {0.9731100903884821740064126061042770743370,-0.2303405131194249388926209576311521232128} +#define T_15625_579 {0.9730173862971063236670943297212943434715,-0.2307318052708548228313389927279786206782} +#define T_15625_581 {0.9728315061081387327135416853707283735275,-0.2315142775799591545737854403341771103442} +#define T_15625_582 {0.9727383300406045041341940304846502840519,-0.2319054576111050369746635624323971569538} +#define T_15625_583 {0.9726449966777839239995273601380176842213,-0.2322966001423044102303094859962584450841} +#define T_15625_584 {0.9725515060347694751285985148570034652948,-0.2326877051103081184280085835780482739210} +#define T_15625_586 
{0.9723640529686553568566864669264759868383,-0.2334698021037623638651581359226838685572} +#define T_15625_587 {0.9722700905758675515855316007218789309263,-0.2338607940027449538789028338214848190546} +#define T_15625_588 {0.9721759709635094948154687699570786207914,-0.2342517480855961509167428857836057431996} +#define T_15625_589 {0.9720816941468005678572694705508183687925,-0.2346426642890972191768383936505415476859} +#define T_15625_591 {0.9718926689613356373342867300380021333694,-0.2354243828052049458676719950744882225990} +#define T_15625_592 {0.9717979206231455169273658611928112804890,-0.2358151849914048858725124091506586410105} +#define T_15625_593 {0.9717030151417368477595459808071609586477,-0.2362059490454413435145397670567035675049} +#define T_15625_594 {0.9716079525324559096333132401923649013042,-0.2365966749041263916808475187281146645546} +#define T_15625_596 {0.9714173559917911315864103016792796552181,-0.2373780117827212210457332730584312230349} +#define T_15625_597 {0.9713218220912273048739393743744585663080,-0.2377686226762860677297339861979708075523} +#define T_15625_598 {0.9712261311244317907309664406056981533766,-0.2381591951218094971309824359195772558451} +#define T_15625_599 {0.9711302831068778784739947695925366133451,-0.2385497290561345851145347296551335602999} +#define T_15625_601 {0.9709381159815166917326223483541980385780,-0.2393306811385930155111623207631055265665} +#define T_15625_602 {0.9708417969047834494844551045389380306005,-0.2397210991604435681434637217535055242479} +#define T_15625_603 {0.9707453208394402066616635238460730761290,-0.2401114784185302830632480208805645816028} +#define T_15625_604 {0.9706486878010874841393729184346739202738,-0.2405018188497274056469876768460380844772} +#define T_15625_606 {0.9704549508678824709662080749694723635912,-0.2412823829789813856461222485449980013072} +#define T_15625_607 {0.9703578470043582315796015791420359164476,-0.2416726065508182363927858204988297075033} +#define T_15625_608 
{0.9702605862304802819551241555018350481987,-0.2420627910433254814659420617317664436996} +#define T_15625_609 {0.9701631685619762635042206966318190097809,-0.2424529363934088410648115541334846056998} +#define T_15625_611 {0.9699678626041262630153028112545143812895,-0.2432331094139587868152574401392485015094} +#define T_15625_612 {0.9698699743463619071803805127274245023727,-0.2436231369582685379882036613707896322012} +#define T_15625_613 {0.9697719292571347349252164349309168756008,-0.2440131251078409280186320984284975565970} +#define T_15625_614 {0.9696737273522990641083652008092030882835,-0.2444030737996134017286919970501912757754} +#define T_15625_616 {0.9694768531593455085371147106343414634466,-0.2451828525575407147218953696210519410670} +#define T_15625_617 {0.9693781809030630469692368933465331792831,-0.2455726824976024735835977708120481111109} +#define T_15625_618 {0.9692793518948426001458074097172357141972,-0.2459624727276783495710787974530830979347} +#define T_15625_619 {0.9691803661506650513501881505362689495087,-0.2463522231847378174407481310481671243906} +#define T_15625_621 {0.9689819245184895235567523741337936371565,-0.2471316045277177630978826528007630258799} +#define T_15625_622 {0.9688824686625803206396767564001493155956,-0.2475212352876094978881837960216216742992} +#define T_15625_623 {0.9687828561348914568895906995749101042747,-0.2479108260224273108107695406943093985319} +#define T_15625_624 {0.9686830869515307140815707498404663056135,-0.2483003766691729841120661603781627491117} +#define T_15625_637 {0.9673718392767961526956810303090605884790,-0.2533608584099533711508911437704227864742} +#define T_15625_663 {0.9646700719176517502262413472635671496391,-0.2634609123691643572939824480272363871336} +#define T_15625_676 {0.9632796260667240151320811492041684687138,-0.2685002085748766642581131236511282622814} +#define T_15625_689 {0.9618628558586037602395890644402243196964,-0.2735321672482613486465652385959401726723} +#define T_15625_702 
{0.9604198000105689247973828059912193566561,-0.2785566508767270965662987691757734864950} +#define T_15625_728 {0.9574549898544389980870050749217625707388,-0.2885826439736736692687202321394579485059} +#define T_15625_741 {0.9559333165682191157941360870609059929848,-0.2935838794533600082381497031747130677104} +#define T_15625_754 {0.9543855196836206200572405577986501157284,-0.2985770919180260052350206478877225890756} +#define T_15625_767 {0.9528116414985984272689734098094049841166,-0.3035621449139307981290869520307751372457} +#define T_15625_793 {0.9495858139816552023759754774800967425108,-0.3135072278031199588532729194412240758538} +#define T_15625_806 {0.9479339528046514251613530177564825862646,-0.3184669859187115315002358784113312140107} +#define T_15625_819 {0.9462561866346494987567439238773658871651,-0.3234180410175834241570669291832018643618} +#define T_15625_832 {0.9445525613213847648808041412848979234695,-0.3283602577980648806743602108326740562916} +#define T_15625_858 {0.9410679201961094086925641022389754652977,-0.3382176364085247022295277474768226966262} +#define T_15625_871 {0.9392869996118345943258987063018139451742,-0.3431325288575793330814178716536844149232} +#define T_15625_884 {0.9374804103371658925425435882061719894409,-0.3480380442337575597200327592872781679034} +#define T_15625_897 {0.9356482017422969033049184872652404010296,-0.3529340484798909605146377543860580772161} +#define T_15625_923 {0.9319071275719668090431468954193405807018,-0.3626969886566000500494055813760496675968} +#define T_15625_936 {0.9299983642320043264462015031313057988882,-0.3675636577870509125176567977177910506725} +#define T_15625_949 {0.9280641860400443698964068062196020036936,-0.3724202821944984642676956809737021103501} +#define T_15625_962 {0.9261046458530107772233463947486598044634,-0.3772667291578592485556953306513605639338} +#define T_15625_988 {0.9221096943853968053161906937020830810070,-0.3869285612622440084784614100499311462045} +#define T_15625_1001 
{0.9200743922782346206901138430112041532993,-0.3917436823662090450426376264658756554127} +#define T_15625_1014 {0.9180139465198514647781280473282095044851,-0.3965480979591848864629355375654995441437} +#define T_15625_1027 {0.9159284134177916314456524560227990150452,-0.4013416767468425128129183576675131917000} +#define T_15625_1053 {0.9116823138391763059473760222317650914192,-0.4108958002131995468353409250994445756078} +#define T_15625_1066 {0.9095218633993759071998397303104866296053,-0.4156560837982850586946881321637192741036} +#define T_15625_1079 {0.9073365576862377768208034467534162104130,-0.4204050083979596341698936612374382093549} +#define T_15625_1092 {0.9051264564194596573543094564229249954224,-0.4251424442343435528002260070934426039457} +#define T_15625_1118 {0.9006321094901698076640172985207755118608,-0.4345823320790742627650615759193897247314} +#define T_15625_1131 {0.8983479866484783693536542159563396126032,-0.4392845261156205771690963501896476373076} +#define T_15625_1144 {0.8960393138914313215792617484112270176411,-0.4439747154523250860513883253588574007154} +#define T_15625_1157 {0.8937061543100819349660923762712627649307,-0.4486527719164163929832511712447740137577} +#define T_15625_1183 {0.8889666303828084314986313074768986552954,-0.4579719751969931751389708551869262009859} +#define T_15625_1196 {0.8865603955578637007661768620891962200403,-0.4626128673397273405321072914375690743327} +#define T_15625_1209 {0.8841299329470364565253248656517826020718,-0.4672411172693053948989927448565140366554} +#define T_15625_1222 {0.8816753089696343081982377043459564447403,-0.4718565985056263345498450689774472266436} +#define T_15625_1248 {0.8766938458918935861419186039711348712444,-0.4810487507262446449374238000018522143364} +#define T_15625_1261 {0.8741671429242104940371405064070131629705,-0.4856251705088224790429762833809945732355} +#define T_15625_1274 {0.8716165508515431037395160274172667413950,-0.4901883192015690227094637521076947450638} +#define T_15625_1287 
{0.8690421393760785484516873111715540289879,-0.4947380721034599915242324641440063714981} +#define T_15625_1313 {0.8638221402782489954930156272894237190485,-0.5037968935643659662559912248980253934860} +#define T_15625_1326 {0.8611766953072149677339552908961195498705,-0.5083057145653039343713430753268767148256} +#define T_15625_1339 {0.8585077162321474997241921300883404910564,-0.5128006446659973960322531638666987419128} +#define T_15625_1352 {0.8558152759904954498537676954583730548620,-0.5172815610296891408736996709194499999285} +#define T_15625_1378 {0.8503603069609383391380674765969160944223,-0.5262008631172118144903038228221703320742} +#define T_15625_1391 {0.8475979272455864688140536600258201360703,-0.5306390050957293658129287905467208474874} +#define T_15625_1404 {0.8448123845046823232607380305125843733549,-0.5350626458532802276479856118385214358568} +#define T_15625_1417 {0.8420037548611087885319648194126784801483,-0.5394716645012912481504940842569340020418} +#define T_15625_1443 {0.8363175425099640403203693495015613734722,-0.5482453539156482147376436842023395001888} +#define T_15625_1456 {0.8334401151943278129863301728619262576103,-0.5526097849159619102010765345767140388489} +#define T_15625_1469 {0.8305399117556455657052083552116528153419,-0.5569591142812231332470673805801197886467} +#define T_15625_1482 {0.8276170114502299224312764636124484241009,-0.5612932231536292837503765440487768501043} +#define T_15625_1508 {0.8217034403635493644557641346182208508253,-0.5699153060716187768619533926539588719606} +#define T_15625_1521 {0.8187129311874437043172747507924214005470,-0.5742030444942486377968293709272984415293} +#define T_15625_1534 {0.8157000483505073562540133025322575122118,-0.5784750911845556453627636983583215624094} +#define T_15625_1547 {0.8126648741883382154327364332857541739941,-0.5827313293967061325062672949570696800947} +#define T_15625_1573 {0.8065279842742961724511019383498933166265,-0.5911959155664394671703121275641024112701} +#define T_15625_1586 
{0.8034264362304052164986956086067948490381,-0.5954040322051157296812107233563438057899} +#define T_15625_1599 {0.8003029322726623595585238035710062831640,-0.5995958777341439116526089492253959178925} +#define T_15625_1612 {0.7971575577597024286902183121128473430872,-0.6037713375994150366565804688434582203627} +#define T_15625_1638 {0.7908015414886974125963092774327378720045,-0.6120726443642943159773039951687678694725} +#define T_15625_1651 {0.7875910734268857238404848430946003645658,-0.6161982644070704839833751975675113499165} +#define T_15625_1664 {0.7843590821975767735096951582818292081356,-0.6203070450786247747032575716730207204819} +#define T_15625_1677 {0.7811056561241288553887329726421739906073,-0.6243988740948323012958098843228071928024} +#define T_15625_1703 {0.7745348556646601823061359937128145247698,-0.6325312303440233652196411640034057199955} +#define T_15625_1716 {0.7712176608444619718696344534691888839006,-0.6365715353372285445487932520336471498013} +#define T_15625_1729 {0.7678793903068625859020812640665099024773,-0.6405944442016032747133635893987957388163} +#define T_15625_1742 {0.7645201352796049398818922782083973288536,-0.6445998469997138435871875117300078272820} +#define T_15625_1768 {0.7577390395318820415226923614682164043188,-0.6525576970424155076599959102168213576078} +#define T_15625_1781 {0.7543173841241564980819589436578098684549,-0.6565099268161066481397369898331817239523} +#define T_15625_1794 {0.7508751148471879410806195664918050169945,-0.6604442155876772968525756368762813508511} +#define T_15625_1807 {0.7474123257707806500249603232077788561583,-0.6643604558414899496909811205114237964153} +#define T_15625_1833 {0.7404255673000913473558171062904875725508,-0.6721383632023527621157654721173457801342} +#define T_15625_1846 {0.7369017888388682546718655430595390498638,-0.6759998177559488619081662363896612077951} +#define T_15625_1859 {0.7333578724391064396215256238065194338560,-0.6798427986906879771922262989392038434744} +#define T_15625_1872 
{0.7297939149484073961815511211170814931393,-0.6836672009861791687868048938980791717768} +#define T_15625_1898 {0.7226062668203402372668620046169962733984,-0.6912598521192817591085599815414752811193} +#define T_15625_1911 {0.7189827726059125900803792319493368268013,-0.6950278934660928387145872875407803803682} +#define T_15625_1924 {0.7153396301410609448012678512895945459604,-0.6987769411977259315094102021248545497656} +#define T_15625_1937 {0.7116769389850211569026328106701839715242,-0.7025068928607821616694195654417853802443} +#define T_15625_1963 {0.7042933115047035341760306437208782881498,-0.7099091007796269892082818842027336359024} +#define T_15625_1976 {0.7005725769590597229452555438911076635122,-0.7135811547490181760977634439768735319376} +#define T_15625_1989 {0.6968326972739817826152375346282497048378,-0.7172337080825658128802047031058464199305} +#define T_15625_2002 {0.6930737746523254472208463994320482015610,-0.7208666609638550992755767765629570931196} +#define T_15625_2028 {0.6854992120099053787285470207280013710260,-0.7280733687849039181827492939191870391369} +#define T_15625_2041 {0.6816837789856224905093995403149165213108,-0.7316469267808626275595429433451499789953} +#define T_15625_2054 {0.6778497170120659776060279000375885516405,-0.7352004904423432574134267269982956349850} +#define T_15625_2067 {0.6739971308658905257971127866767346858978,-0.7387339626581057583365463869995437562466} +#define T_15625_2093 {0.6662368076905539338028461315843742340803,-0.7457402470554340689901096084213349968195} +#define T_15625_2106 {0.6623292827343087996183612631284631788731,-0.7492128677703393524467401221045292913914} +#define T_15625_2119 {0.6584036577454811522969180259678978472948,-0.7526650141114380465623412419517990201712} +#define T_15625_2132 {0.6544600400029459974504675301432143896818,-0.7560965917390067847492218788829632103443} +#define T_15625_2158 {0.6465192578278229351695927107357420027256,-0.7628976663077171593840830610133707523346} +#define T_15625_2171 
{0.6425223103997067530812614677415695041418,-0.7662669773901410463778915982402395457029} +#define T_15625_2184 {0.6385078042208961468517713910841848701239,-0.7696153480466783047830858777160756289959} +#define T_15625_2197 {0.6344758489991974004240660178766120225191,-0.7729426867735715855900480164564214646816} +#define T_15625_2223 {0.6263600326395730721884547165245749056339,-0.7795339052996688167240790789946913719177} +#define T_15625_2236 {0.6222763932894265215267637358920183032751,-0.7827976049750682285832681372994557023048} +#define T_15625_2249 {0.6181757484658886303563463116006460040808,-0.7860399124781377588888631180452648550272} +#define T_15625_2262 {0.6140582102307493572723728902928996831179,-0.7892607392035972946331412458675913512707} +#define T_15625_2288 {0.6057729040780538420918333031295333057642,-0.7956375988380897101848177044303156435490} +#define T_15625_2301 {0.6016053625800675819235152630426455289125,-0.7987934574812849408687043251120485365391} +#define T_15625_2314 {0.5974213805034320756703891674987971782684,-0.8019274868199577355198925943113863468170} +#define T_15625_2327 {0.5932210721873648617830099283310119062662,-0.8050396012078370189257725542120169848204} +#define T_15625_2353 {0.5847719364214734039109089280827902257442,-0.8111977455428980299245722562773153185844} +#define T_15625_2366 {0.5805233398683293799535931611899286508560,-0.8142436072012602110703483049292117357254} +#define T_15625_2379 {0.5762588788628043579009840868820901960135,-0.8172672173357889979783408307412173599005} +#define T_15625_2392 {0.5719786699434316057377714059839490801096,-0.8202684933177324388964279933134093880653} +#define T_15625_2418 {0.5633714766658624029105340014211833477020,-0.8262037153628187802212323731509968638420} +#define T_15625_2431 {0.5590447275237174773465653743187431246042,-0.8291374992291281609269049113208893686533} +#define T_15625_2444 {0.5547027008934002356710379899595864117146,-0.8320486245536176106440962030319496989250} +#define T_15625_2457 
{0.5503455154331452492399989750992972403765,-0.8349370117815029246344238345045596361160} +#define T_15625_2483 {0.5415861447237979575675126397982239723206,-0.8406452568373971034887404130131471902132} +#define T_15625_2496 {0.5371841988494386876951125486812088638544,-0.8434649586713646174374048314348328858614} +#define T_15625_2509 {0.5327675728880661987219014008587691932917,-0.8462616104249082171762097459577489644289} +#define T_15625_2522 {0.5283363875365542083883951818279456347227,-0.8490351356715598596380800699989777058363} +#define T_15625_2548 {0.5194308234366813437077325943391770124435,-0.8545125041003737154099439976562280207872} +#define T_15625_2561 {0.5149566880582039374303349177353084087372,-0.8572161975978555004118675242352765053511} +#define T_15625_2574 {0.5104684800227032859254450158914551138878,-0.8598964652231749772326452330162283033133} +#define T_15625_2587 {0.5059663219832369884798595194297377020121,-0.8625532337303914998116738388489466160536} +#define T_15625_2613 {0.4969206484073934171341591081727528944612,-0.8677959836196383891504524399351794272661} +#define T_15625_2626 {0.4923773800697898717082523489807499572635,-0.8703818217286075631378139405569527298212} +#define T_15625_2639 {0.4878206561190126522298271538602421060205,-0.8729438741772669319018973510537762194872} +#define T_15625_2652 {0.4832506010805116547501825152721721678972,-0.8754820709502417752645442305947653949261} +#define T_15625_2678 {0.4740709976602749953222826206911122426391,-0.8804866206691567276365617544797714799643} +#define T_15625_2691 {0.4694617001373271869546499601710820570588,-0.8829528368515332514832039123575668781996} +#define T_15625_2704 {0.4648395732373758471034363992657745257020,-0.8853949238348355343930506933247670531273} +#define T_15625_2717 {0.4602047432731927179005992911697831004858,-0.8878128148820869647295239701634272933006} +#define T_15625_2743 {0.4508974811354933276419387766509316861629,-0.8925757455284495378577958035748451948166} +#define T_15625_2756 
{0.4462253033094027432348127604200271889567,-0.8949206549669258548007633180532138794661} +#define T_15625_2769 {0.4415409311069806719274311035405844449997,-0.8972411081516387110212917832541279494762} +#define T_15625_2782 {0.4368444925420288948991753841255558654666,-0.8995370416696009119306154389050789177418} +#define T_15625_2808 {0.4274159300249744108946003962046233937144,-0.9040550994053880851097915183345321565866} +#define T_15625_2821 {0.4226840637351753637851459188823355361819,-0.9062771001544275284445006946043577045202} +#define T_15625_2834 {0.4179406464004125631994668310653651133180,-0.9084743343025191153472519545175600796938} +#define T_15625_2847 {0.4131858076480640074024108798766974359751,-0.9106467418039868588763852130796294659376} +#define T_15625_2873 {0.4036423859571846017857410515716765075922,-0.9149168400782615728772384500189218670130} +#define T_15625_2886 {0.3988540638198111998313777348812436684966,-0.9170144141584809505118869310535956174135} +#define T_15625_2899 {0.3940548418600398061784062520018778741360,-0.9190869282100899706478003281517885625362} +#define T_15625_2912 {0.3892448512302687513297883015184197574854,-0.9211343255957439302150646653899457305670} +#define T_15625_2938 {0.3795930900381508932284191359940450638533,-0.9251535472532590453909051575465127825737} +#define T_15625_2951 {0.3747515832376467348474591290141688659787,-0.9271252616884501085436909306736197322607} +#define T_15625_2964 {0.3698998352836201375382074729714076966047,-0.9290716397873474141633209910651203244925} +#define T_15625_2977 {0.3650379787638916018188695034041302278638,-0.9309926283596302587497461900056805461645} +#define T_15625_3003 {0.3552844717562062548665835493011400103569,-0.9347582276337093043494519406522158533335} +#define T_15625_3016 {0.3503930878105860902138601886690594255924,-0.9366027354297887796974464436061680316925} +#define T_15625_3029 {0.3454921283766532180692365727736614644527,-0.9384216478906324265807370466063730418682} +#define T_15625_3042 
{0.3405817273870717643013961151154944673181,-0.9402149153092809186205158766824752092361} +#define T_15625_3068 {0.3307331377580400300075780251063406467438,-0.9437243196976123904562427924247458577156} +#define T_15625_3081 {0.3257952182593295553658663266105577349663,-0.9454403607628329986312110122526064515114} +#define T_15625_3094 {0.3208483954790952163982353795290691778064,-0.9471305649795545011926378720090724527836} +#define T_15625_3107 {0.3158928046033477321508087243273621425033,-0.9487948861581364523587467374454718083143} +#define T_15625_3133 {0.3059558605037209222388128182501532137394,-0.9520456981802016827032275614328682422638} +#define T_15625_3146 {0.3009747788351188613198416987870587036014,-0.9536321001859948687240375875262543559074} +#define T_15625_3159 {0.2959854721741369543153155063919257372618,-0.9551924414806961527446560467069502919912} +#define T_15625_3172 {0.2909880768677789442122616492270026355982,-0.9567266794235391724754435927025042474270} +#define T_15625_3198 {0.2809695668084432051969656640721950680017,-0.9597166782584721778093239663576241582632} +#define T_15625_3211 {0.2759487258397647080343517700384836643934,-0.9611723574403345793371045147068798542023} +#define T_15625_3224 {0.2709203437868264452426103616744512692094,-0.9626017698521168730607655561470892280340} +#define T_15625_3237 {0.2658845580644784711843442437384510412812,-0.9640048764310567008806174271740019321442} +#define T_15625_3263 {0.2557913262788237274136804444424342364073,-0.9667320194348174577925192352267913520336} +#define T_15625_3276 {0.2507341560418000492305168336315546184778,-0.9680559813326945484135421793325804173946} +#define T_15625_3289 {0.2456701337803948725913016915001207962632,-0.9693534883458783113496792793739587068558} +#define T_15625_3302 {0.2405993978834282398526767110524815507233,-0.9706245050162971566010128299240022897720} +#define T_15625_3328 {0.2304383396516500037698449432355118915439,-0.9730869291171220236691397076356224715710} +#define T_15625_3341 
{0.2253482949966720549905829784620436839759,-0.9742782692547816703054763820546213537455} +#define T_15625_3354 {0.2202520920582116403796391068681259639561,-0.9754429844660225112917828482750337570906} +#define T_15625_3367 {0.2151498701045173878032557013284531421959,-0.9765810429216866817014874868618790060282} +#define T_15625_3393 {0.2049279270430451327378307269100332632661,-0.9787770658928622857786194799700751900673} +#define T_15625_3406 {0.1998084852789520360349939664956764318049,-0.9798349703957961454037217663426417857409} +#define T_15625_3419 {0.1946835831793611026885315595791325904429,-0.9808660981196387629310606826038565486670} +#define T_15625_3432 {0.1895533607968071798666187532944604754448,-0.9818704208858903825785091612488031387329} +#define T_15625_3458 {0.1792775161160780894586963540859869681299,-0.9837985424949812207628951910010073333979} +#define T_15625_3471 {0.1741321746346029819552114759062533266842,-0.9847222886464103153159044268249999731779} +#define T_15625_3484 {0.1689820744958909215505116208078106865287,-0.9856191244588677902171980349521618336439} +#define T_15625_3497 {0.1638273564410849258976554665423464030027,-0.9864890254237630440314887891872785985470} +#define T_15625_3523 {0.1535046301749021790961080569104524329305,-0.9881479284575090948905540244595613330603} +#define T_15625_3536 {0.1483369040613982114962254854617640376091,-0.9889368851921135350480085435265209525824} +#define T_15625_3549 {0.1431651242198367923741386675828834995627,-0.9896988164119014941277896468818653374910} +#define T_15625_3562 {0.1379894319838205762884797422884730622172,-0.9904337012949340213197046978166326880455} +#define T_15625_3588 {0.1276268761935553386344111004291335120797,-0.9918222524591162603257998853223398327827} +#define T_15625_3601 {0.1224402958256342321963217045777128078043,-0.9924758807941537197550019300251733511686} +#define T_15625_3614 {0.1172503694281752245620609187426452990621,-0.9931023869012481908313816347799729555845} +#define T_15625_3627 
{0.1120572388306872169128070026999921537936,-0.9937017536593373279174556955695152282715} +#define T_15625_3653 {0.1016619327876008838096311137633165344596,-0.9948190043529975268299381241376977413893} +#define T_15625_3666 {0.0964600414233243302897236048920603934675,-0.9953368577565138153318002878222614526749} +#define T_15625_3679 {0.0912555140138989612230702164197282399982,-0.9958275107477494447749677419778890907764} +#define T_15625_3692 {0.0860484927878478883522817000084614846855,-0.9962909499182152073259999269794207066298} +#define T_15625_3718 {0.0756275381368255172898074079057550989091,-0.9971361368817013293153195263585075736046} +#define T_15625_3731 {0.0704138894940972009939272879819327499717,-0.9975178615775824875200328278879169374704} +#define T_15625_3744 {0.0651983165914466339385668902650650124997,-0.9978723262590467601995669610914774239063} +#define T_15625_3757 {0.0599809619592464524084896027034119470045,-0.9981995212393379723891939647728577256203} +#define T_15625_3783 {0.0495414778672455466401736146053735865280,-0.9987720670757314067600418638903647661209} +#define T_15625_3796 {0.0443196336960575004715323643722513224930,-0.9990174022853892665807507000863552093506} +#define T_15625_3809 {0.0390965783647488768215438881270529236645,-0.9992354365014128880773114360636100172997} +#define T_15625_3822 {0.0338724546081709682043836551201820839196,-0.9994261637653966134919869546138215810061} +#define T_15625_3848 {0.0234215729006998184258403483681831858121,-0.9997256773349663117045338367461226880550} +#define T_15625_3861 {0.0181951005498915373281842988717471598648,-0.9998344554554915442068363518046680837870} +#define T_15625_3874 {0.0129681309661790530179947822375652322080,-0.9999159102540793320201828464632853865623} +#define T_15625_3887 {0.0077408069913817425730662158400718908524,-0.9999700395047455003805225715041160583496} +#define T_15625_3913 {-0.0027143327196699506588528638673096793354,-0.9999963161921582299740407506760675460100} +#define T_15625_3926 
{-0.0079418627394773084332557644415828690398,-0.9999684629108196443070255554630421102047} +#define T_15625_3939 {-0.0131691757252834565350418927209830144420,-0.9999132826454084632672447696677409112453} +#define T_15625_3952 {-0.0183961288258845806486974083782115485519,-0.9998307769038825565388606264605186879635} +#define T_15625_3978 {-0.0288483840197344687528158146960777230561,-0.9995837987579880845601110195275396108627} +#define T_15625_3991 {-0.0340734004753638491935774368357670027763,-0.9994193331030001292702991122496314346790} +#define T_15625_4004 {-0.0392974857783552969370255425474169896916,-0.9992275554704743711909031844697892665863} +#define T_15625_4017 {-0.0445204971657106179927509970184473786503,-0.9990084711012804374874463064770679920912} +#define T_15625_4043 {-0.0549627272921596882193639999059087131172,-0.9984884068473742768645706746610812842846} +#define T_15625_4056 {-0.0601816606675975446227866427761910017580,-0.9981874411748978825897893329965882003307} +#define T_15625_4069 {-0.0653989494078858757397654244414297863841,-0.9978591971898363999216030606476124376059} +#define T_15625_4082 {-0.0706144509357619076794776447059120982885,-0.9975036838623909662970845602103509008884} +#define T_15625_4108 {-0.0810395222933242287410848803119733929634,-0.9967108887869490230215774317912291735411} +#define T_15625_4121 {-0.0862488072282668327694921117654303088784,-0.9962736287043340022151483026391360908747} +#define T_15625_4134 {-0.0914557351690953035383557789828046225011,-0.9958091426095063702916831971378996968269} +#define T_15625_4147 {-0.0966601638216851855034406071354169398546,-0.9953174431958705659084785111190285533667} +#define T_15625_4173 {-0.1070609544310389543708694759516220074147,-0.9942524589038314042355182209576014429331} +#define T_15625_4186 {-0.1122570321565996842583246007052366621792,-0.9936792031291548088489662404754199087620} +#define T_15625_4199 {-0.1174500421392829507816202294634422287345,-0.9930787922423278679673330771038308739662} 
+#define T_15625_4212 {-0.1226398424653121349559015129671024624258,-0.9924512426512868135120015722350217401981} +#define T_15625_4238 {-0.1330092469347425310211008309124736115336,-0.9911147966960500710342785168904811143875} +#define T_15625_4251 {-0.1381885677046561788916534396776114590466,-0.9904059368540435448480252489389386028051} +#define T_15625_4264 {-0.1433641120786846023804628202924504876137,-0.9896700113512030272389097262930590659380} +#define T_15625_4277 {-0.1485357386203486618203584157527075149119,-0.9889070402987873986688782679266296327114} +#define T_15625_4303 {-0.1588666729998587934247211705951485782862,-0.9873000456851787065559733491681981831789} +#define T_15625_4316 {-0.1640256985155219837668738591673900373280,-0.9864560660396868296828642996842972934246} +#define T_15625_4329 {-0.1691802415621703470360870369404437951744,-0.9855851286748221751210508045915048569441} +#define T_15625_4342 {-0.1743301612772457098721190504875266924500,-0.9846872573914264847871891106478869915009} +#define T_15625_4368 {-0.1846155678980194725369301522732712328434,-0.9828108119519706198374819905438926070929} +#define T_15625_4381 {-0.1897507737257105586703431754358462058008,-0.9818322890751220999305814984836615622044} +#define T_15625_4394 {-0.1948807940734953447758925904054194688797,-0.9808269348367650852793531157658435404301} +#define T_15625_4407 {-0.2000054887489682575729688096544123254716,-0.9797947767110652961264349869452416896820} +#define T_15625_4433 {-0.2102383410448789291535831580404192209244,-0.9776501623559917808492514268436934798956} +#define T_15625_4446 {-0.2153462190235058970610992901129066012800,-0.9765377647343087552656015759566798806190} +#define T_15625_4459 {-0.2204482120538413802979249567215447314084,-0.9753986804390626375038664264138787984848} +#define T_15625_4472 {-0.2255441807094060169269766902289120480418,-0.9742329405989733626114457365474663674831} +#define T_15625_4498 
{-0.2357174880172767938990574521085363812745,-0.9718216224404687686444503924576565623283} +#define T_15625_4511 {-0.2407945486550084335242161159840179607272,-0.9705761100181844458489877069951035082340} +#define T_15625_4524 {-0.2458650288964176477968237577442778274417,-0.9693040738415183987797263398533686995506} +#define T_15625_4537 {-0.2509287901762017325424380942422430962324,-0.9680055486724793434305524897354189306498} +#define T_15625_4563 {-0.2610356025115389000390564433473628014326,-0.9653291740237823770343084106571041047573} +#define T_15625_4576 {-0.2660783773696818177434408880799310281873,-0.9639513976836784348023456914233975112438} +#define T_15625_4589 {-0.2711138808789271270605070185411022976041,-0.9625472786283107984317553018627222627401} +#define T_15625_4602 {-0.2761419754298108375500930833368329331279,-0.9611168552292285172100605450395960360765} +#define T_15625_4628 {-0.2861753882347468080915575683320639654994,-0.9581772524792538936466712584660854190588} +#define T_15625_4641 {-0.2911804322972399994284842250635847449303,-0.9566681534613726833882196842751000076532} +#define T_15625_4654 {-0.2961775190257464052123737019428517669439,-0.9551329107636034043693484818504657596350} +#define T_15625_4667 {-0.3011665118606498547926264564011944457889,-0.9535715663408221853813984125736169517040} +#define T_15625_4693 {-0.3111196707208528366983557589264819398522,-0.9503707437050806783318535053695086389780} +#define T_15625_4706 {-0.3160835647477605592214899843384046107531,-0.9487313529637081410683663307281676679850} +#define T_15625_4719 {-0.3210388208917152041976805776357650756836,-0.9470660354381088552599976537749171257019} +#define T_15625_4732 {-0.3259853037362403505028396466514095664024,-0.9453748366378232903528555652883369475603} +#define T_15625_4758 {-0.3358514090635675852780650529894046485424,-0.9419149807864912737898066552588716149330} +#define T_15625_4771 {-0.3407707619269635390679695774451829493046,-0.9401464182858523255248428540653549134731} 
+#define T_15625_4784 {-0.3456808022594857754405950345244491472840,-0.9383521636087585049423864802520256489515} +#define T_15625_4797 {-0.3505813958803087326820957514428300783038,-0.9365322657883252688293396204244345426559} +#define T_15625_4823 {-0.3603537075580053872236874212831025943160,-0.9328157403524017787788125133374705910683} +#define T_15625_4836 {-0.3652251585586512860892582921223947778344,-0.9309192143015461162747214984847232699394} +#define T_15625_4849 {-0.3700866287424405265404914189275586977601,-0.9289972482338443304783481835329439491034} +#define T_15625_4862 {-0.3749379852558648584803790981823112815619,-0.9270498946724889943737935027456842362881} +#define T_15625_4888 {-0.3846098272431338616073048797261435538530,-0.9230792386290609385923744412139058113098} +#define T_15625_4901 {-0.3894300484063709655302432111056987196207,-0.9210560446564647341105569466890301555395} +#define T_15625_4914 {-0.3942396272852512484341502840834436938167,-0.9190076802062027150341805281641427427530} +#define T_15625_4927 {-0.3990384324443441710350555240438552573323,-0.9169342012556629395447771457838825881481} +#define T_15625_4953 {-0.4086031973371425096708264845801750198007,-0.9127121271933797963527013052953407168388} +#define T_15625_4966 {-0.4133688956864267072255358925758628174663,-0.9105636474618257203417215350782498717308} +#define T_15625_4979 {-0.4181232975542242269462178683170350268483,-0.9083902839872196599557696572446729987860} +#define T_15625_4992 {-0.4228662730129734703687915953196352347732,-0.9061920961629037485352000658167526125908} +#define T_15625_5018 {-0.4323174265579182162788640653161564841866,-0.9017214884288490273078764403180684894323} +#define T_15625_5031 {-0.4370253463644459546166842756065307185054,-0.8994491906911896705878461943939328193665} +#define T_15625_5044 {-0.4417213232096480757249423731991555541754,-0.8971523129446401512154807278420776128769} +#define T_15625_5057 
{-0.4464052287625929205105990149604622274637,-0.8948309179579219119204935850575566291809} +#define T_15625_5083 {-0.4557363143208942313222564735042396932840,-0.8901148306871462523304217029362916946411} +#define T_15625_5096 {-0.4603832393277824119870444974367273971438,-0.8877202672835952101948464587621856480837} +#define T_15625_5109 {-0.4650175830524451714431677373795537278056,-0.8853014443973657998654402945248875766993} +#define T_15625_5122 {-0.4696392188482513274649932100146543234587,-0.8828584281296770752689440087124239653349} +#define T_15625_5148 {-0.4788438618066220286983991627494106069207,-0.8779000831587389086507755564525723457336} +#define T_15625_5161 {-0.4834266174261220849750486650009406730533,-0.8753848899563767549381054777768440544605} +#define T_15625_5174 {-0.4879961620375107833602612572576617822051,-0.8728457743706270477446196309756487607956} +#define T_15625_5187 {-0.4925523707649770432936975339544005692005,-0.8702828057900492408904824515047948807478} +#define T_15625_5213 {-0.5016242828905056461152867086639162153006,-0.8650855904559883180482415809819940477610} +#define T_15625_5226 {-0.5061397383727513910045558986894320696592,-0.8624514857312050031268313432519789785147} +#define T_15625_5239 {-0.5106413621462161378516952936479356139898,-0.8597938120651118509840671322308480739594} +#define T_15625_5252 {-0.5151290311912193331522757944185286760330,-0.8571126420862055228511167115357238799334} +#define T_15625_5278 {-0.5240620149272320338340591661108192056417,-0.8516801069124543266397608931583818048239} +#define T_15625_5291 {-0.5285070854990385758043203168199397623539,-0.8489288901770937201263222959823906421661} +#define T_15625_5304 {-0.5329377131106484144495993859891314059496,-0.8461544740438309286645335305365733802319} +#define T_15625_5317 {-0.5373537766825530193059989869652781635523,-0.8433569343314827326807403551356401294470} +#define T_15625_5343 {-0.5461417293825290286690687935333698987961,-0.8376927906022949565922885994950775057077} 
+#define T_15625_5356 {-0.5505133783547830894633534626336768269539,-0.8348263413743025607161030166025739163160} +#define T_15625_5369 {-0.5548699829822602014317567409307230263948,-0.8319370781406886239395248594519216567278} +#define T_15625_5382 {-0.5592114242083405795114003922208212316036,-0.8290250798587938119155182903341483324766} +#define T_15625_5408 {-0.5678483423049885958633353766344953328371,-0.8231331970838478007124194846255704760551} +#define T_15625_5421 {-0.5721435831471892718980143399676308035851,-0.8201534736032611849765316947014071047306} +#define T_15625_5434 {-0.5764231885377095743194786336971446871758,-0.8171513370949228249884299657423980534077} +#define T_15625_5447 {-0.5806870415241526250937909026106353849173,-0.8141268696007564198424688584054820239544} +#define T_15625_5473 {-0.5891670246308010616687056426599156111479,-0.8080112728716654091343229993071872740984} +#define T_15625_5486 {-0.5933829230113347730224404585896991193295,-0.8049203107628258857531022840703371912241} +#define T_15625_5499 {-0.5975826055147809290346572197449859231710,-0.8018073519157614681418522195599507540464} +#define T_15625_5512 {-0.6017659573728630917344162298832088708878,-0.7986724814009317219287709121999796479940} +#define T_15625_5538 {-0.6100832123143611340765346540138125419617,-0.7923373486414673205757708274177275598049} +#define T_15625_5551 {-0.6142168881051155437944544246420264244080,-0.7891372595223646868589639780111610889435} +#define T_15625_5564 {-0.6183337786713944961292099833372049033642,-0.7859156049818294409448071746737696230412} +#define T_15625_5577 {-0.6224337715074469423726100103522185236216,-0.7826724730607402813475914626906160265207} +#define T_15625_5603 {-0.6305826162778229715399902488570660352707,-0.7761221321726470279145360109396278858185} +#define T_15625_5616 {-0.6346312455217723025313603102404158562422,-0.7728151022123492808191258518490940332413} +#define T_15625_5629 
{-0.6386625316608323110756373353069648146629,-0.7694869528800188929196224307816009968519} +#define T_15625_5642 {-0.6426763645286364123876410303637385368347,-0.7661377751268079183688541888841427862644} +#define T_15625_5668 {-0.6506512321728106229556942707858979701996,-0.7593767010331586853055796382250264286995} +#define T_15625_5681 {-0.6546120490132265601346261973958462476730,-0.7559649894583114759782915825780946761370} +#define T_15625_5694 {-0.6585549767164341483649536712619010359049,-0.7525326189887165950764824629004579037428} +#define T_15625_5707 {-0.6624799075307125084677295490109827369452,-0.7490796834236652657068589178379625082016} +#define T_15625_5733 {-0.6702753499476112875044009342673234641552,-0.7421124950117786722714185998484026640654} +#define T_15625_5746 {-0.6741456485175824075994910344888921827078,-0.7385984325631947955415057549544144421816} +#define T_15625_5759 {-0.6779975241391478224528555074357427656651,-0.7350641858104539627660756195837166160345} +#define T_15625_5772 {-0.6818308715488439197827119642170146107674,-0.7315098513369070465373056322277989238501} +#define T_15625_5798 {-0.6894415632133179538953982046223245561123,-0.7243413083029135446366808537277393043041} +#define T_15625_5811 {-0.6932186994842868266175628377823159098625,-0.7207272956433063981762643379624933004379} +#define T_15625_5824 {-0.6969768915814519294471551802416797727346,-0.7170935870592185512606420161318965256214} +#define T_15625_5837 {-0.7007160368015187268397880870907101780176,-0.7134402818520780842348472106095869094133} +#define T_15625_5863 {-0.7081367784025226486122051028360147029161,-0.7060752814492917694977336395822931081057} +#define T_15625_5876 {-0.7118181719905766202671770770393777638674,-0.7023637875232421334814603142149280756712} +#define T_15625_5889 {-0.7154801131213081033521916651807259768248,-0.6986330995078319228142049723828677088022} +#define T_15625_5902 {-0.7191225017217542925251905217010062187910,-0.6948833193547284681201858802523929625750} 
+#define T_15625_5928 {-0.7263482237143014108582406151981558650732,-0.6873268930480453953180131065892055630684} +#define T_15625_5941 {-0.7299313596429851669356025922752451151609,-0.6835204533953195271678282551874872297049} +#define T_15625_5954 {-0.7334945481199452999021559662651270627975,-0.6796953346009646512726476430543698370457} +#define T_15625_5967 {-0.7370376917709161590508415429212618619204,-0.6758516411972378401529226721322629600763} +#define T_15625_5993 {-0.7440634578393833242770938340981956571341,-0.6681089512258462370652978279395028948784} +#define T_15625_6006 {-0.7475458882578269736995935090817511081696,-0.6642101662492200242482454086712095886469} +#define T_15625_6019 {-0.7510078898574232830753771850140765309334,-0.6602932298395164378845834107778500765562} +#define T_15625_6032 {-0.7544493680291257664549675610032863914967,-0.6563582490381701939341496654378715902567} +#define T_15625_6058 {-0.7612703784595403755375286891649011522532,-0.6484345848889216945565294736297801136971} +#define T_15625_6071 {-0.7646497243147303279897641914431005716324,-0.6444461180776922537560835735348518937826} +#define T_15625_6084 {-0.7680081739400766993597358123224694281816,-0.6404400399422485179456998594105243682861} +#define T_15625_6097 {-0.7713456355563853561463361074856948107481,-0.6364164599600766747400371059484314173460} +#define T_15625_6123 {-0.7779572305153925615428534001694060862064,-0.6283172347539262014848304715997073799372} +#define T_15625_6136 {-0.7812311831774426673646871677192393690348,-0.6242418108643260632462101966666523367167} +#define T_15625_6149 {-0.7844837864740962229959109208721201866865,-0.6201493277907060841513953164394479244947} +#define T_15625_6162 {-0.7877149515187116257664001750526949763298,-0.6160398973718129722598746411676984280348} +#define T_15625_6188 {-0.7941126142369802076714790928235743194818,-0.6077706441657980684922790715063456445932} +#define T_15625_6201 
{-0.7972789370762923821800427504058461636305,-0.6036110473595536207724876476277131587267} +#define T_15625_6214 {-0.8004234719996623503845967206871137022972,-0.5994349551636155570832897865329869091511} +#define T_15625_6227 {-0.8035461330737241336663601032341830432415,-0.5952424817015876667980478487152140587568} +#define T_15625_6253 {-0.8097254929316175520170872914604842662811,-0.5868088497088714827754074576660059392452} +#define T_15625_6266 {-0.8127820228468553009903985184791963547468,-0.5825679216511787661403332094778306782246} +#define T_15625_6279 {-0.8158163411802116238291660010872874408960,-0.5783110732670891929174672441149596124887} +#define T_15625_6292 {-0.8188283650103026500843839130538981407881,-0.5740384208870993321127684794191736727953} +#define T_15625_6318 {-0.8247852005237064076936803758144378662109,-0.5654461716176610863726637035142630338669} +#define T_15625_6331 {-0.8277298494195362010827921039890497922897,-0.5611268095358766805347272565995808690786} +#define T_15625_6344 {-0.8306518782415831525511862309940624982119,-0.5567921130671034113390760467154905200005} +#define T_15625_6357 {-0.8335512071370939635173158421821426600218,-0.5524422006692586428755475935759022831917} +#define T_15625_6383 {-0.8392814488413593565496739756781607866287,-0.5436972039938670064884718158282339572906} +#define T_15625_6396 {-0.8421122050549515458683913493587169796228,-0.5393023586982419459445736720226705074310} +#define T_15625_6409 {-0.8449199481559622748605420383682940155268,-0.5348927754308577808828317756706383079290} +#define T_15625_6422 {-0.8477046014148207353500197314133401960135,-0.5304685746961264047882878003292717039585} +#define T_15625_6448 {-0.8532043346448555665162416516977827996016,-0.5215768048362862119304850239132065325975} +#define T_15625_6461 {-0.8559192643201695149812735508021432906389,-0.5171094787040939610989198627066798508167} +#define T_15625_6474 {-0.8586108035657139536667159518401604145765,-0.5126280210837473871876568409788887947798} 
+#define T_15625_6487 {-0.8612788788275190787757651378342416137457,-0.5081325544438297647786839661421254277229} +#define T_15625_6513 {-0.8665443463921256306292661975021474063396,-0.4991000858904392489812096300738630816340} +#define T_15625_6526 {-0.8691415948010406955503981407673563808203,-0.4945633308148751350330485365702770650387} +#define T_15625_6539 {-0.8717150914423762442240217751532327383757,-0.4900130603888122049482944930787198245525} +#define T_15625_6552 {-0.8742647659880125043940779505646787583828,-0.4854493989613394289150960503320675343275} +#define T_15625_6578 {-0.8792923707366457897194322868017479777336,-0.4762824023248487526061012431455310434103} +#define T_15625_6591 {-0.8817701635460348796868856879882514476776,-0.4716793176301023460261774289392633363605} +#define T_15625_6604 {-0.8842238594762635850443643903417978435755,-0.4670633429556433346974131382012274116278} +#define T_15625_6617 {-0.8866533914731077326010222350305411964655,-0.4624346044461162263949915995908668264747} +#define T_15625_6643 {-0.8914396987533006466719598392955958843231,-0.4531393422410203708494691454689018428326} +#define T_15625_6656 {-0.8937963432371816008270570819149725139141,-0.4484730725649448412895026194746606051922} +#define T_15625_6669 {-0.8961285621923167354907491244375705718994,-0.4437945470858460361718300646316492930055} +#define T_15625_6682 {-0.8984362918841857226226466082152910530567,-0.4391038936577472684419376491860020905733} +#define T_15625_6708 {-0.9029780318879621070493612933205440640450,-0.4296867160242942018655298852536361664534} +#define T_15625_6721 {-0.9052119180838984480175213320762850344181,-0.4249604491701192143793264222040306776762} +#define T_15625_6734 {-0.9074210667880185932077097277215216308832,-0.4202225690622700127185851215472212061286} +#define T_15625_6747 {-0.9096054176290471193766506985411979258060,-0.4154732051768042500050626131269382312894} +#define T_15625_6773 
{-0.9138994876267197975394651621172670274973,-0.4059405455428406472861979636945761740208} +#define T_15625_6786 {-0.9160090894356807122989039271487854421139,-0.4011575103013964582743255959940142929554} +#define T_15625_6799 {-0.9180936586893905237971580390876624733210,-0.3963635122893238160735052133532008156180} +#define T_15625_6812 {-0.9201531384210616115737479958625044673681,-0.3915586825162616912976432104187551885843} +#define T_15625_6838 {-0.9241966048808896161403936275746673345566,-0.3819170532021801123256921073334524407983} +#define T_15625_6851 {-0.9261804811098180989503703131049405783415,-0.3770805171461208971095402375794947147369} +#define T_15625_6864 {-0.9281390468212709077278077529626898467541,-0.3722336762917921593185610618093051016331} +#define T_15625_6877 {-0.9300722484918660937225354246038477867842,-0.3673766630929140575290148262865841388702} +#define T_15625_6903 {-0.9338623490841224672465159528655931353569,-0.3576326508627036826482026299345307052135} +#define T_15625_6916 {-0.9357191444304987504310133772378321737051,-0.3527459181142362787397814827272668480873} +#define T_15625_6929 {-0.9375503685882778936999670804652851074934,-0.3478495455796720703389723894360940903425} +#define T_15625_6942 {-0.9393559715140482602180327376117929816246,-0.3429436670663252861857017705915495753288} +#define T_15625_6968 {-0.9428901169981303009848261353909038007259,-0.3331039286277663880042609889642335474491} +#define T_15625_6981 {-0.9446185629758587776194644902716390788555,-0.3281703376014101225521812921215314418077} +#define T_15625_6994 {-0.9463211945630514154714774122112430632114,-0.3232277783866346343444320154958404600620} +#define T_15625_7007 {-0.9479979652304545423291415318090002983809,-0.3182763860529364552398590149095980450511} +#define T_15625_7033 {-0.9512737412237461942510208245948888361454,-0.3083476435100115931042807915218872949481} +#define T_15625_7046 {-0.9528726570297330056291684741154313087463,-0.3033705646319312387859667978773359209299} 
+#define T_15625_7059 {-0.9544455328785735881425011939427349716425,-0.2983851952898059489704962743417127057910} +#define T_15625_7072 {-0.9559923257869591406432618896360509097576,-0.2933916717230410364791737265477422624826} +#define T_15625_7098 {-0.9590074944142373825073377702210564166307,-0.2833807079836706543929381041380111128092} +#define T_15625_7111 {-0.9604757877350665884463865040743257850409,-0.2783635413891397680785644297429826110601} +#define T_15625_7124 {-0.9619178333215651077736652041494380682707,-0.2733387677186419062458355710987234488130} +#define T_15625_7137 {-0.9633335917657334102059962788189295679331,-0.2683065242884177248328114728792570531368} +#define T_15625_7163 {-0.9660860931879931001731165451928973197937,-0.2582201784306568659310698876652168110013} +#define T_15625_7176 {-0.9674227609461497623399850454006809741259,-0.2531663516412256398346869445958873257041} +#define T_15625_7189 {-0.9687329911241623348061580145440530031919,-0.2481056063607463002540498564485460519791} +#define T_15625_7202 {-0.9700167479162623562771727847575675696135,-0.2430380808884861354446371706217178143561} +#define T_15625_7228 {-0.9725047017379128133640620035293977707624,-0.2328832434883479074461121172134880907834} +#define T_15625_7241 {-0.9737088307770447315192541282158344984055,-0.2277962090703015973769396396164665929973} +#define T_15625_7254 {-0.9748863504512846889227262181520927697420,-0.2227029494725541736155349781256518326700} +#define T_15625_7267 {-0.9760372285815565041033892157429363578558,-0.2176036038829191332499846112114028073847} +#define T_15625_7293 {-0.9782589351350307049770549383538309484720,-0.2073872123070171213932155751535901799798} +#define T_15625_7306 {-0.9793297028437786266863440687302500009537,-0.2022704455127249389967403203627327457070} +#define T_15625_7319 {-0.9803737075813089285603041389549616724253,-0.1971481511028649880046259568189270794392} +#define T_15625_7332 
{-0.9813909208172211240395199638442136347294,-0.1920204690587095919251936493310495279729} +#define T_15625_7358 {-0.9833448623241182184884223715926054865122,-0.1817495027249347305797755325329490005970} +#define T_15625_7371 {-0.9842815371980906391513599373865872621536,-0.1766064991187006327777453407179564237595} +#define T_15625_7384 {-0.9851913137778793849719249919871799647808,-0.1714586692372713216947488490404793992639} +#define T_15625_7397 {-0.9860741672012500735888806957518681883812,-0.1663061537597486094774978937493870034814} +#define T_15625_7423 {-0.9877590088092192965874005494697485119104,-0.1559876293692183946149043549667112529278} +#define T_15625_7436 {-0.9885609509507260472815914909006096422672,-0.1508219024392554730162885334721067920327} +#define T_15625_7449 {-0.9893358778508892781289318918425124138594,-0.1456520538715823653852510233264183625579} +#define T_15625_7462 {-0.9900837683326261240424059906217735260725,-0.1404782249470240829136713500702171586454} +#define T_15625_7488 {-0.9914983590272830049627827975200489163399,-0.1301191916905613255028839603255619294941} +#define T_15625_7501 {-0.9921650205824864565684606532158795744181,-0.1249342704487214628095159696385962888598} +#define T_15625_7513 {-0.9927563354385803284785083633323665708303,-0.1201451557348903742550305651093367487192} +#define T_15625_7514 {-0.9928045684048868446680558008665684610605,-0.1197459350223894908316069063403119798750} +#define T_15625_7526 {-0.9933708396705184240360608782793860882521,-0.1149537945971733643446555106493178755045} +#define T_15625_7527 {-0.9934169850170199156380590466142166405916,-0.1145543271975964921827184639369079377502} +#define T_15625_7539 {-0.9939581972172153445654885217663832008839,-0.1097592920198704469214590062620118260384} +#define T_15625_7552 {-0.9945183920274545119610820620437152683735,-0.1045617899575476683615704587282380089164} +#define T_15625_7553 {-0.9945603584082716164260773439309559762478,-0.1041618619400125622664887714563519693911} 
+#define T_15625_7566 {-0.9950912839414584043495892728969920426607,-0.0989612885109108497694307970959926024079} +#define T_15625_7578 {-0.9955572329456070512776477698935195803642,-0.0941583556020723771995406536916561890393} +#define T_15625_7579 {-0.9955950157733680638827422626491170376539,-0.0937580106829653397992885288658726494759} +#define T_15625_7591 {-0.9960358506642359444072098995093256235123,-0.0889527076123709098398606442970049101859} +#define T_15625_7592 {-0.9960715401380946243392600081278942525387,-0.0885521706505507377737984597843023948371} +#define T_15625_7604 {-0.9964872488686137286251209843612741678953,-0.0837446287367822478353573956155742052943} +#define T_15625_7617 {-0.9969114152229999747945043964136857539415,-0.0785342613008833956289578281939611770213} +#define T_15625_7618 {-0.9969429151203380579104873504547867923975,-0.0781333730960230704587488048673549201339} +#define T_15625_7631 {-0.9973377419250538666872785142913926392794,-0.0729207002972041828936511365100159309804} +#define T_15625_7643 {-0.9976780067600909962521882334840483963490,-0.0681072303592791666870809308420575689524} +#define T_15625_7644 {-0.9977053136376348163594229845330119132996,-0.0677060347327236855541343629738548770547} +#define T_15625_7656 {-0.9980204109935003620179827521496918052435,-0.0628908518018686940598271917224337812513} +#define T_15625_7657 {-0.9980456202131378073971745834569446742535,-0.0624895189081586047152860885489644715562} +#define T_15625_7669 {-0.9983355414788999127040369785390794277191,-0.0576727545729508322969181222106271889061} +#define T_15625_7682 {-0.9986233896044520719215142889879643917084,-0.0524530812718826053298037948025012155995} +#define T_15625_7683 {-0.9986444014988790085496361825789790600538,-0.0520515067500052358573370270278246607631} +#define T_15625_7696 {-0.9989028598457143637290300830500200390816,-0.0468302956648048371302017756079294485971} +#define T_15625_7708 
{-0.9991172080567208846346716200059745460749,-0.0420095770621783590192954704889416461810} +#define T_15625_7709 {-0.9991340203291136923269277758663520216942,-0.0416078048085004384426532908491935813800} +#define T_15625_7721 {-0.9993231648884429718648902962740976363420,-0.0367860315520161795799225501468754373491} +#define T_15625_7722 {-0.9993378766319587214184139156714081764221,-0.0363841769005174425766213630595302674919} +#define T_15625_7734 {-0.9995018123706973511843898450024425983429,-0.0315614807588519841186069925242918543518} +#define T_15625_7747 {-0.9996531456214332589382820515311323106289,-0.0263360674584048187751861291872046422213} +#define T_15625_7748 {-0.9996636551584880958642997939023189246655,-0.0259340809586820833509968053931515896693} +#define T_15625_7761 {-0.9997855684793470754101463171537034213543,-0.0207078985034413985688583892397218733095} +#define T_15625_7773 {-0.9998738536324546233302612563420552760363,-0.0158832245524876715381257241688217618503} +#define T_15625_7774 {-0.9998801598142443847194726913585327565670,-0.0154811501459412217268640787892763910349} +#define T_15625_7786 {-0.9999432223612655379341163097706157714128,-0.0106560806006979758020491999559453688562} +#define T_15625_7787 {-0.9999474265782024584581222370616160333157,-0.0102539787219552980257475383041310124099} +#define T_15625_7799 {-0.9999852647957738760453594295540824532509,-0.0054286454411795860910960875855835183756} +#define T_15625_7812 {-0.9999999797870502415975124677061103284359,-0.0002010619284750618811132699681465396679} +#define T_15625_7813 {-0.9999999797870502415975124677061103284359,0.0002010619284750618811132699681465396679} +#define T_15625_7826 {-0.9999852647957738760453594295540824532509,0.0054286454411795860910960875855835183756} +#define T_15625_7839 {-0.9999432223612655379341163097706157714128,0.0106560806006979758020491999559453688562} +#define T_15625_7852 {-0.9998738536324546233302612563420552760363,0.0158832245524876715381257241688217618503} +#define 
T_15625_7878 {-0.9996531456214332589382820515311323106289,0.0263360674584048187751861291872046422213} +#define T_15625_7891 {-0.9995018123706973511843898450024425983429,0.0315614807588519841186069925242918543518} +#define T_15625_7904 {-0.9993231648884429718648902962740976363420,0.0367860315520161795799225501468754373491} +#define T_15625_7917 {-0.9991172080567208846346716200059745460749,0.0420095770621783590192954704889416461810} +#define T_15625_7943 {-0.9986233896044520719215142889879643917084,0.0524530812718826053298037948025012155995} +#define T_15625_7956 {-0.9983355414788999127040369785390794277191,0.0576727545729508322969181222106271889061} +#define T_15625_7969 {-0.9980204109935003620179827521496918052435,0.0628908518018686940598271917224337812513} +#define T_15625_7982 {-0.9976780067600909962521882334840483963490,0.0681072303592791666870809308420575689524} +#define T_15625_8008 {-0.9969114152229999747945043964136857539415,0.0785342613008833956289578281939611770213} +#define T_15625_8021 {-0.9964872488686137286251209843612741678953,0.0837446287367822478353573956155742052943} +#define T_15625_8034 {-0.9960358506642359444072098995093256235123,0.0889527076123709098398606442970049101859} +#define T_15625_8047 {-0.9955572329456070512776477698935195803642,0.0941583556020723771995406536916561890393} +#define T_15625_8073 {-0.9945183920274545119610820620437152683735,0.1045617899575476683615704587282380089164} +#define T_15625_8086 {-0.9939581972172153445654885217663832008839,0.1097592920198704469214590062620118260384} +#define T_15625_8099 {-0.9933708396705184240360608782793860882521,0.1149537945971733643446555106493178755045} +#define T_15625_8112 {-0.9927563354385803284785083633323665708303,0.1201451557348903742550305651093367487192} +// Pre-computed twiddles for N=16384 +#define T_16384_1 {0.9999999264657178921211766464693937450647,-0.0003834951875713955632071772150482047437} +#define T_16384_3 
{0.9999993381915255330483205398195423185825,-0.0011504853371138484743191332526635051181} +#define T_16384_5 {0.9999981616434869824416864503291435539722,-0.0019174748098554190121889373443764270633} +#define T_16384_7 {0.9999963968222943533348257005854975432158,-0.0026844631545959616800156588567460858030} +#define T_16384_9 {0.9999940437289858152780652744695544242859,-0.0034514499201359944043077110364947657217} +#define T_16384_11 {0.9999911023649455943385078171559143811464,-0.0042184346552769638391544582134429219877} +#define T_16384_13 {0.9999875727319040841223340976284816861153,-0.0049854169088215105620776057548937387764} +#define T_16384_15 {0.9999834548319377347525005461648106575012,-0.0057523962295737366551273694881274423096} +#define T_16384_17 {0.9999787486674688308241343293047975748777,-0.0065193721663394680818082171924743306590} +#define T_16384_19 {0.9999734542412659354937431999132968485355,-0.0072863442679265222681750380218090867857} +#define T_16384_21 {0.9999675715564437794569130346644669771194,-0.0080533120831449717808014909792291291524} +#define T_16384_23 {0.9999611006164628168590979839791543781757,-0.0088202751608074114741953053453471511602} +#define T_16384_25 {0.9999540414251297804071327846031635999680,-0.0095872330497292247708518431181801133789} +#define T_16384_27 {0.9999463939865974593246278345759492367506,-0.0103541852987288438697754955342134053353} +#define T_16384_29 {0.9999381583053645883296667307149618864059,-0.0111211314566280212307036734387111209799} +#define T_16384_31 {0.9999293343862760696794111936469562351704,-0.0118880710722520932520751557603944092989} +#define T_16384_33 {0.9999199222345227511254961427766829729080,-0.0126550036944302422142749620093127305154} +#define T_16384_35 {0.9999099218556415369363321588025428354740,-0.0134219288719957668964966046587505843490} +#define T_16384_37 {0.9998993332555153878971054837165866047144,-0.0141888461537863445199869616430987662170} +#define T_16384_39 
{0.9998881564403733213097780208045151084661,-0.0149557550886442978954615767861469066702} +#define T_16384_41 {0.9998763914167904109930873346456792205572,-0.0157226552254168573663495322989547275938} +#define T_16384_43 {0.9998640381916876762602441885974258184433,-0.0164895461129564366298261290921800537035} +#define T_16384_45 {0.9998510967723321929412350073107518255711,-0.0172564273001208808022699514594933134504} +#define T_16384_47 {0.9998375671663370933828218767303042113781,-0.0180232983357737422402955473899055505171} +#define T_16384_49 {0.9998234493816615664485425440943799912930,-0.0187901587687845580965095848569035297260} +#define T_16384_51 {0.9998087434266105244518030303879640996456,-0.0195570081480290862419035846642145770602} +#define T_16384_53 {0.9997934493098352692896924054366536438465,-0.0203238460223895966993978845493984408677} +#define T_16384_55 {0.9997775670403329373314704753283876925707,-0.0210906719407551214440221798440688871779} +#define T_16384_57 {0.9997610966274466104408702449291013181210,-0.0218574854520217354281186317166429944336} +#define T_16384_59 {0.9997440380808654269984003803983796387911,-0.0226242861050928063815224078325627488084} +#define T_16384_61 {0.9997263914106244708790427466738037765026,-0.0233910734488792584895300308289733948186} +#define T_16384_63 {0.9997081566271048824745548699866048991680,-0.0241578470322998638264433424183152965270} +#define T_16384_65 {0.9996893337410336366488650128303561359644,-0.0249246064042814678696213803732462110929} +#define T_16384_67 {0.9996699227634837647826770989922806620598,-0.0256913511137592982802591023983040940948} +#define T_16384_69 {0.9996499237058742437511682510375976562500,-0.0264580807096771869479923111612151842564} +#define T_16384_71 {0.9996293365799701069462912528251763433218,-0.0272247947409878753022294262109426199459} +#define T_16384_73 {0.9996081613978821112098671619605738669634,-0.0279914927566532467650972648698370903730} +#define T_16384_75 
{0.9995863981720670699004926973429974168539,-0.0287581743056446147155380543836145079695} +#define T_16384_77 {0.9995640469153277418712377766496501863003,-0.0295248389369429792283838764888059813529} +#define T_16384_79 {0.9995411076408129424919479788513854146004,-0.0302914861995392838134311119802077882923} +#define T_16384_81 {0.9995175803620169885377322316344361752272,-0.0310581156424347033795374528608590480871} +#define T_16384_83 {0.9994934650927805863673825115256477147341,-0.0318247268146408870959085390950349392369} +#define T_16384_85 {0.9994687618472900547672566062828991562128,-0.0325913192651802255395132590365392388776} +#define T_16384_87 {0.9994434706400777690404879649577196687460,-0.0333578925430861455980746654859103728086} +#define T_16384_89 {0.9994175914860217169177758478326722979546,-0.0341244461974033255757809968145011225715} +#define T_16384_91 {0.9993911244003460536688976389996241778135,-0.0348909797771880039740644008361414307728} +#define T_16384_93 {0.9993640693986205469911965337814763188362,-0.0356574928315082292917814754673599964008} +#define T_16384_95 {0.9993364264967612431433963138260878622532,-0.0364239849094441098253938093876058701426} +#define T_16384_97 {0.9993081957110294677448791844653896987438,-0.0371904555600881189802997539572970708832} +#define T_16384_99 {0.9992793770580327139541054748406168073416,-0.0379569043325453103765454443419002927840} +#define T_16384_101 {0.9992499705547244204240087128710001707077,-0.0387233307759336231601565714299795217812} +#define T_16384_103 {0.9992199762184035272127857751911506056786,-0.0394897344393841248644250185861892532557} +#define T_16384_105 {0.9991893940667149198731067372136749327183,-0.0402561148720412820267711140331812202930} +#define T_16384_107 {0.9991582241176494294521148731291759759188,-0.0410224716230632446833936910479678772390} +#define T_16384_109 {0.9991264663895433884022168058436363935471,-0.0417888042416220684138750129932304844260} +#define T_16384_111 
{0.9990941209010790746702923570410348474979,-0.0425551122769040196525125452353677246720} +#define T_16384_113 {0.9990611876712846006753920846676919609308,-0.0433213952781098254884994958047172985971} +#define T_16384_115 {0.9990276667195336912641323579009622335434,-0.0440876527944549442827870677774626528844} +#define T_16384_117 {0.9989935580655456837106953571492340415716,-0.0448538843751698154682649999358545755967} +#define T_16384_119 {0.9989588617293860828283413866301998496056,-0.0456200895695001509833055308718030573800} +#define T_16384_121 {0.9989235777314657838132916367612779140472,-0.0463862679267071573163683240181853761896} +#define T_16384_123 {0.9988877060925412942893331091909203678370,-0.0471524189960678685729078551958082243800} +#define T_16384_125 {0.9988512468337151783970284668612293899059,-0.0479185423268753338255088181085739051923} +#define T_16384_127 {0.9988141999764353906599012589140329509974,-0.0486846374684389432418996079832140821964} +#define T_16384_129 {0.9987765655424956090513433082378469407558,-0.0494507039700846640073450544150546193123} +#define T_16384_131 {0.9987383435540352349946147114678751677275,-0.0502167413811553109415086737499223090708} +#define T_16384_133 {0.9986995340335392823405413764703553169966,-0.0509827492510108032375271136515948455781} +#define T_16384_135 {0.9986601370038384883898174848582129925489,-0.0517487271290284628344480211126210633665} +#define T_16384_137 {0.9986201524881088698037956419284455478191,-0.0525146745646032225840471596711722668260} +#define T_16384_139 {0.9985795805098724997606041142717003822327,-0.0532805911071479454399479891435476019979} +#define T_16384_141 {0.9985384210929967307990295921626966446638,-0.0540464763060936673189083023771672742441} +#define T_16384_143 {0.9984966742616946389077270396228414028883,-0.0548123297108898538398946698180225212127} +#define T_16384_145 {0.9984543400405248014806147693889215588570,-0.0555781508710046848187324997070390963927} +#define T_16384_147 
{0.9984114184543912973168744429131038486958,-0.0563439393359252901904987709258421091363} +#define T_16384_149 {0.9983679095285438176432535328785888850689,-0.0571096946551580622597477088220330188051} +#define T_16384_151 {0.9983238132885775550917628606839571148157,-0.0578754163782288638673279024260409642011} +#define T_16384_153 {0.9982791297604332036996765964431688189507,-0.0586411040546833406406079802763997577131} +#define T_16384_155 {0.9982338589703968478872297964699100703001,-0.0594067572340871499769754393582843476906} +#define T_16384_157 {0.9981880009451002955245257908245548605919,-0.0601723754660262663551684170215594349429} +#define T_16384_159 {0.9981415557115205228200238707358948886395,-0.0609379583001072033798806160120875574648} +#define T_16384_161 {0.9980945232969800073874466761481016874313,-0.0617035052859573052153052685753209516406} +#define T_16384_163 {0.9980469037291468392680826582363806664944,-0.0624690159732249963853156771165231475607} +#define T_16384_165 {0.9979986970360343878638786918600089848042,-0.0632344899115800801459030822115892078727} +#define T_16384_167 {0.9979499032460011909151376130466815084219,-0.0639999266507139397130998759166686795652} +#define T_16384_169 {0.9979005223877516206343329940864350646734,-0.0647653257403398852076747971295844763517} +#define T_16384_171 {0.9978505544903351065499919059220701456070,-0.0655306867301933271274805292705423198640} +#define T_16384_173 {0.9977999995831464685736023056961130350828,-0.0662960091700321302310427995507779996842} +#define T_16384_175 {0.9977488576959256949550081117195077240467,-0.0670612926096368217043774961894087027758} +#define T_16384_177 {0.9976971288587584973939215160498861223459,-0.0678265365988108687167468247025681193918} +#define T_16384_179 {0.9976448131020754228615032843663357198238,-0.0685917406873809420986276563780847936869} +#define T_16384_181 {0.9975919104566526307564799935789778828621,-0.0693569044251972077752554923790739849210} +#define T_16384_183 
{0.9975384209536113377936317192506976425648,-0.0701220273621335349334415809607889968902} +#define T_16384_185 {0.9974843446244179290260944981127977371216,-0.0708871090480878152106924972031265497208} +#define T_16384_187 {0.9974296815008841798899652530963066965342,-0.0716521490329822124953906836708483751863} +#define T_16384_189 {0.9973744316151671451819993308163248002529,-0.0724171468667634127269749910738028120250} +#define T_16384_191 {0.9973185949997686039480981889937538653612,-0.0731821020994028875739090267416031565517} +#define T_16384_193 {0.9972621716875361697063340216118376702070,-0.0739470142808971997450129265416762791574} +#define T_16384_195 {0.9972051617116618471570177462126594036818,-0.0747118829612682250340682799105707090348} +#define T_16384_197 {0.9971475651056834754726310166006442159414,-0.0754767076905634021199986705141782294959} +#define T_16384_199 {0.9970893819034833960301966726547107100487,-0.0762414880188560656337770637946960050613} +#define T_16384_201 {0.9970306121392894516120009029691573232412,-0.0770062234962456404474551163730211555958} +#define T_16384_203 {0.9969712558476743202717784697597380727530,-0.0777709136728579608632827557812561281025} +#define T_16384_205 {0.9969113130635557373793176338949706405401,-0.0785355580988454787805252976795600261539} +#define T_16384_207 {0.9968507838221966066427626174117904156446,-0.0793001563243876106401586412175674922764} +#define T_16384_209 {0.9967896681592045560194037534529343247414,-0.0800647078996908900805351549934130162001} +#define T_16384_211 {0.9967279661105324928271897988452110439539,-0.0808292123749893287598666802296065725386} +#define T_16384_213 {0.9966656777124781596555180840368848294020,-0.0815936693005446522786172636187984608114} +#define T_16384_215 {0.9966028030016841343652345130976755172014,-0.0823580782266465499796836979840009007603} +#define T_16384_217 {0.9965393420151379411109360262344125658274,-0.0831224387036129247485760629388096276671} +#define T_16384_219 
{0.9964752947901721613632730623066890984774,-0.0838867502817902122025373046199092641473} +#define T_16384_221 {0.9964106613644641008420421712798997759819,-0.0846510125115536166129359685328381601721} +#define T_16384_223 {0.9963454417760359005384884767408948391676,-0.0854152249433073329498711245832964777946} +#define T_16384_225 {0.9962796360632546477376081384136341512203,-0.0861793871274848938268675624385650735348} +#define T_16384_227 {0.9962132442648320429512409646122250705957,-0.0869434986145493776676929087443568278104} +#define T_16384_229 {0.9961462664198246219626753372722305357456,-0.0877075589549936723843259755994949955493} +#define T_16384_231 {0.9960787025676339778712531369819771498442,-0.0884715676993407668105007246595050673932} +#define T_16384_233 {0.9960105527480058729139500428573228418827,-0.0892355243981440143796746156112931203097} +#define T_16384_235 {0.9959418170010313486884001576981972903013,-0.0899994286019873551696335312044539023191} +#define T_16384_237 {0.9958724953671457269521738453477155417204,-0.0907632798614856350916113569837762042880} +#define T_16384_239 {0.9958025878871291647342900432704482227564,-0.0915270777272848279348949063205509446561} +#define T_16384_241 {0.9957320946021064322906113375211134552956,-0.0922908217500623545559435001450765412301} +#define T_16384_243 {0.9956610155535469131038439627445768564939,-0.0930545114805272494118426607201399747282} +#define T_16384_245 {0.9955893507832646038835378021758515387774,-0.0938181464694205491383627304458059370518} +#define T_16384_247 {0.9955171003334181145660863876400981098413,-0.0945817262675154452056247578184411395341} +#define T_16384_249 {0.9954442642465103352478195120056625455618,-0.0953452504256176308627956927921331953257} +#define T_16384_251 {0.9953708425653889912965155417623464018106,-0.0961087184945655093049055039955419488251} +#define T_16384_253 {0.9952968353332460882398891044431366026402,-0.0968721300252304712286033350210345815867} +#define T_16384_255 
{0.9952222425936183558548009386868216097355,-0.0976354845685172140212770841571909841150} +#define T_16384_257 {0.9951470643903864710111406566284131258726,-0.0983987816753638944167192903478280641139} +#define T_16384_259 {0.9950713007677761678948513690556865185499,-0.0991620208967425031953979441823321394622} +#define T_16384_261 {0.9949949517703570167626025977369863539934,-0.0999252017836590733512736051125102676451} +#define T_16384_263 {0.9949180174430432010979075130308046936989,-0.1006883238871539576475555577417253516614} +#define T_16384_265 {0.9948404978310931845442155463388189673424,-0.1014513867583020784168823524851177353412} +#define T_16384_267 {0.9947623929801099329495173151372000575066,-0.1022143899482132051170779618587403092533} +#define T_16384_269 {0.9946837029360402482325298478826880455017,-0.1029773330080322180091201289542368613183} +#define T_16384_271 {0.9946044277451756565611162841378245502710,-0.1037402154889393718351087159135204274207} +#define T_16384_273 {0.9945245674541517422184710994770284742117,-0.1045030369421505733740218602179083973169} +#define T_16384_275 {0.9944441221099480365808176429709419608116,-0.1052657969189176034863208997194305993617} +#define T_16384_277 {0.9943630917598885732289204497647006064653,-0.1060284949705284085474943367444211617112} +#define T_16384_279 {0.9942814764516415548811778535309713333845,-0.1067911306483073918816018021971103735268} +#define T_16384_281 {0.9941992762332189093044121364073362201452,-0.1075537035036156358058789805909327697009} +#define T_16384_283 {0.9941164911529770664699867666058707982302,-0.1083162130878511653087059585232054814696} +#define T_16384_285 {0.9940331212596164034422940858348738402128,-0.1090786589524492394831511887787200976163} +#define T_16384_287 {0.9939491666021811333564528467832133173943,-0.1098410406488826013271520309899642597884} +#define T_16384_289 {0.9938646272300597495075180631829425692558,-0.1106033577286617414214831001118000131100} +#define T_16384_291 
{0.9937795031929845812612711597466841340065,-0.1113656097433351616077246148961421567947} +#define T_16384_293 {0.9936937945410317940542199721676297485828,-0.1121277962444896525440185541810933500528} +#define T_16384_295 {0.9936075013246216114382036721508484333754,-0.1128899167837505157496735819222521968186} +#define T_16384_297 {0.9935206235945180930357878423819784075022,-0.1136519709127818689164968191107618622482} +#define T_16384_299 {0.9934331614018293565848694015585351735353,-0.1144139581832869234645500000624451786280} +#define T_16384_301 {0.9933451147980069118048618292959872633219,-0.1151758781470081927089665896346559748054} +#define T_16384_303 {0.9932564838348464375528124037373345345259,-0.1159377303557277971712835551443276926875} +#define T_16384_305 {0.9931672685644872267118898889748379588127,-0.1166995143612676866240462913992814719677} +#define T_16384_307 {0.9930774690394122972136869975656736642122,-0.1174612297154899870355038160596450325102} +#define T_16384_309 {0.9929870853124483920382203905319329351187,-0.1182228759702971671030624634113337378949} +#define T_16384_311 {0.9928961174367659792139306773606222122908,-0.1189844526776323574424054640985559672117} +#define T_16384_313 {0.9928045654658791407953799534880090504885,-0.1197459593894796003876734857840347103775} +#define T_16384_315 {0.9927124294536454618409493377839680761099,-0.1205073956578641275472207894381426740438} +#define T_16384_317 {0.9926197094542661414351414350676350295544,-0.1212687610348525957260079621846671216190} +#define T_16384_319 {0.9925264055222861037108827986230608075857,-0.1220300550725533644813580735899449791759} +#define T_16384_321 {0.9924325177125936647826165426522493362427,-0.1227912773231167736787128319519979413599} +#define T_16384_323 {0.9923380460804204217239998797595035284758,-0.1235524273387353794140253171462973114103} +#define T_16384_325 {0.9922429906813416966571139710140414535999,-0.1243135046716442454473039447293558623642} +#define T_16384_327 
{0.9921473515712760926632540758873801678419,-0.1250745088741211652472173909700359217823} +#define T_16384_329 {0.9920511288064857158275344772846437990665,-0.1258354394984869950580019803965114988387} +#define T_16384_331 {0.9919543224435759531942835565132554620504,-0.1265962960971058759440666108275763690472} +#define T_16384_333 {0.9918569325394954727670437932829372584820,-0.1273570782223854003234464471461251378059} +#define T_16384_335 {0.9917589591515361124862693031900562345982,-0.1281177854267771254459518104340531863272} +#define T_16384_337 {0.9916604023373332132962332252645865082741,-0.1288784172627765456375925623433431610465} +#define T_16384_339 {0.9915612621548652860781203344231471419334,-0.1296389732829235919009391864165081642568} +#define T_16384_341 {0.9914615386624537896054221164376940578222,-0.1303994530398027151818496349733322858810} +#define T_16384_343 {0.9913612319187634636108441554824821650982,-0.1311598560860432749475279479156597517431} +#define T_16384_345 {0.9912603419828024398086085966497194021940,-0.1319201819743197612311291777587030082941} +#define T_16384_347 {0.9911588689139213537160344458243343979120,-0.1326804302573520721875155459201778285205} +#define T_16384_349 {0.9910568127718143438542597323248628526926,-0.1334406004879056806267101364937843754888} +#define T_16384_351 {0.9909541736165184966367291963251773267984,-0.1342006922187920225919555150539963506162} +#define T_16384_353 {0.9908509515084136243245893638231791555882,-0.1349607050028687749154698849451960995793} +#define T_16384_355 {0.9907471465082227091158983967034146189690,-0.1357206383930399384851739341684151440859} +#define T_16384_357 {0.9906427586770115700787187051901128143072,-0.1364804919422562823339006854439503513277} +#define T_16384_359 {0.9905377880761887521288144853315316140652,-0.1372402652035155934395760368715855292976} +#define T_16384_361 {0.9904322347675059701188615690625738352537,-0.1379999577298627877475212244462454691529} +#define T_16384_363 
{0.9903260988130573316823301865952089428902,-0.1387595690743903542596626721206121146679} +#define T_16384_365 {0.9902193802752800033672997415123973041773,-0.1395190987902384938124100699496921151876} +#define T_16384_367 {0.9901120792169537665472489607054740190506,-0.1402785464305954243879881460088654421270} +#define T_16384_369 {0.9900041957012009063987534318584948778152,-0.1410379115486977141813440539408475160599} +#define T_16384_371 {0.9898957297914866559906954535108525305986,-0.1417971936978303926224498354713432490826} +#define T_16384_373 {0.9897866815516186411727517224790062755346,-0.1425563924313273389543610392138361930847} +#define T_16384_375 {0.9896770510457472136423007214034441858530,-0.1433155073025715042778216457008966244757} +#define T_16384_377 {0.9895668383383651178775153312017209827900,-0.1440745378649951891070202236733166500926} +#define T_16384_379 {0.9894560434943077131819677560997661203146,-0.1448334836720802099030436238535912707448} +#define T_16384_381 {0.9893446665787526406177221360849216580391,-0.1455923442773583709186624446374480612576} +#define T_16384_383 {0.9892327076572200450499394719372503459454,-0.1463511192344114919539066477227606810629} +#define T_16384_385 {0.9891201667955726861691800877451896667480,-0.1471098080968717969341241769143380224705} +#define T_16384_387 {0.9890070440600152723575888558116275817156,-0.1478684104184222192213127300419728271663} +#define T_16384_389 {0.9888933395170951268227099717478267848492,-0.1486269257527965403919978371050092391670} +#define T_16384_391 {0.9887790532337015214636721793795004487038,-0.1493853536537797233041402478193049319088} +#define T_16384_393 {0.9886641852770662319827010833250824362040,-0.1501436936752081896528920879063662141562} +#define T_16384_395 {0.9885487357147632048182117614487651735544,-0.1509019453709700420152017841246561147273} +#define T_16384_397 {0.9884327046147083351002038398291915655136,-0.1516601082950053414055702205587294884026} +#define T_16384_399 
{0.9883160920451596886948664177907630801201,-0.1524181820013063293206556636505411006510} +#define T_16384_401 {0.9881988980747176132268805304192937910557,-0.1531761660439178440729079966331482864916} +#define T_16384_403 {0.9880811227723240719456043734680861234665,-0.1539340599769373763017199507885379716754} +#define T_16384_405 {0.9879627662072634208811905409675091505051,-0.1546918633545154297959101086235023103654} +#define T_16384_407 {0.9878438284491617427107712501310743391514,-0.1554495757308558268050546757876873016357} +#define T_16384_409 {0.9877243095679869577807608038710895925760,-0.1562071966602159023285167904759873636067} +#define T_16384_411 {0.9876042096340491571737629783456213772297,-0.1569647256969067816712026797176804393530} +#define T_16384_413 {0.9874835287179997145301513228332623839378,-0.1577221623952936302437422000366495922208} +#define T_16384_415 {0.9873622668908323962710937848896719515324,-0.1584795063097959588738206093694316223264} +#define T_16384_417 {0.9872404242238822513755280851910356432199,-0.1592367569948878736063591077254386618733} +#define T_16384_419 {0.9871180007888262775139764926279895007610,-0.1599939140050982699925441465893527492881} +#define T_16384_421 {0.9869949966576829769593359742430038750172,-0.1607509768950112216678860477259149774909} +#define T_16384_423 {0.9868714119028124676091806577460374683142,-0.1615079452192661468856726969534065574408} +#define T_16384_425 {0.9867472465969164829857618315145373344421,-0.1622648185325580305615744691749569028616} +#define T_16384_427 {0.9866225008130384832583104071090929210186,-0.1630215963896378406072784628122462891042} +#define T_16384_429 {0.9864971746245628780869196816638577729464,-0.1637782783453126667083665779500734061003} +#define T_16384_431 {0.9863712681052160258232675005274359136820,-0.1645348639544459978800716726254904642701} +#define T_16384_433 {0.9862447813290654563544990196533035486937,-0.1652913527719580000230337191169383004308} +#define T_16384_435 
{0.9861177143705200931478316306311171501875,-0.1660477443528257934790559602333814837039} +#define T_16384_437 {0.9859900673043301422282524981710594147444,-0.1668040382520837305868610656034434214234} +#define T_16384_439 {0.9858618402055869811562160975881852209568,-0.1675602340248235899711204410778009332716} +#define T_16384_441 {0.9857330331497234920945516023493837565184,-0.1683163312261948540982103850183193571866} +#define T_16384_443 {0.9856036462125133956746481089794542640448,-0.1690723294114050145875438602161011658609} +#define T_16384_445 {0.9854736794700718061079669496393762528896,-0.1698282281357198497673266501806210726500} +#define T_16384_447 {0.9853431329988547870968318420636933296919,-0.1705840269544636189635866685421206057072} +#define T_16384_449 {0.9852120068756593518344288895605131983757,-0.1713397254230193123003544997118297033012} +#define T_16384_451 {0.9850803011776237960717139685584697872400,-0.1720953230968290115221464020578423514962} +#define T_16384_453 {0.9849480159822270319835979535127989947796,-0.1728508195313940842829936173075111582875} +#define T_16384_455 {0.9848151513672891432804590294836089015007,-0.1736062142822754339466229112076689489186} +#define T_16384_457 {0.9846817074109709411189328420732636004686,-0.1743615069050937771422127298137638717890} +#define T_16384_459 {0.9845476841917739641019124974263831973076,-0.1751166969555299213201493557789945043623} +#define T_16384_461 {0.9844130817885407003231534872611518949270,-0.1758717839893250423077830646434449590743} +#define T_16384_463 {0.9842779002804543653226687638380099087954,-0.1766267675622808785984574342364794574678} +#define T_16384_465 {0.9841421397470385690198213524126913398504,-0.1773816472302600644184167322237044572830} +#define T_16384_467 {0.9840058002681578708248366638144943863153,-0.1781364225491862962602596098804497160017} +#define T_16384_469 {0.9838688819240172245272901818680111318827,-0.1788910930750447492165733365254709497094} +#define T_16384_471 
{0.9837313847951620893184099259087815880775,-0.1796456583638821602466606464076903648674} +#define T_16384_473 {0.9835933089624786518356813758146017789841,-0.1804001179718072445101739731398993171751} +#define T_16384_475 {0.9834546545071932710513351594272535294294,-0.1811544714549908063894179122144123539329} +#define T_16384_477 {0.9833154215108728113392544400994665920734,-0.1819087183696661835785590710656833834946} +#define T_16384_479 {0.9831756100554244204303699916636105626822,-0.1826628582721293025947773003281326964498} +#define T_16384_481 {0.9830352202230956404349626609473489224911,-0.1834168907187390951118999282698496244848} +#define T_16384_483 {0.9828942520964740747757559802266769111156,-0.1841708152659177200050066858239006251097} +#define T_16384_485 {0.9827527057584878322771260172885376960039,-0.1849246314701507853950346316196373663843} +#define T_16384_487 {0.9826105812924047500089841378212440758944,-0.1856783388879876262045343082718318328261} +#define T_16384_489 {0.9824678787818331704428942430240567773581,-0.1864319370760416094690015142987249419093} +#define T_16384_491 {0.9823245983107212753182579945132602006197,-0.1871854255909903286259066135244211181998} +#define T_16384_493 {0.9821807399633570856423148143221624195576,-0.1879388039895759088260263069969369098544} +#define T_16384_495 {0.9820363038243690168016541974793653935194,-0.1886920718286052289780485580195090733469} +#define T_16384_497 {0.9818912899787251014060984743991866707802,-0.1894452286649502270599043640686431899667} +#define T_16384_499 {0.9817456985117329892887028108816593885422,-0.1901982740555481499189482974543352611363} +#define T_16384_501 {0.9815995295090407246618724457221105694771,-0.1909512075574018030721390459802933037281} +#define T_16384_503 {0.9814527830566355248720356030389666557312,-0.1917040287275798005062199536041589453816} +#define T_16384_505 {0.9813054592408446685780631923989858478308,-0.1924567371232168422334751767266425304115} +#define T_16384_507 
{0.9811575581483348296174540337233338505030,-0.1932093323015139918474858404806582257152} +#define T_16384_509 {0.9810090798661126321178471698658540844917,-0.1939618138197388708121593481337185949087} +#define T_16384_511 {0.9808600244815238733409046290034893900156,-0.1947141812352259915286367686348967254162} +#define T_16384_513 {0.9807103920822539677715212746988981962204,-0.1954664341053769793798977616461343131959} +#define T_16384_515 {0.9805601827563278360955223433848004788160,-0.1962185719876608780420923494602902792394} +#define T_16384_517 {0.9804093965921099051996634443639777600765,-0.1969705944396143437735702264035353437066} +#define T_16384_519 {0.9802580336783035530601182472310028970242,-0.1977225010188419229706369151244871318340} +#define T_16384_521 {0.9801060941039517748762932569661643356085,-0.1984742912830163852344611541411723010242} +#define T_16384_523 {0.9799535779584367389816179638728499412537,-0.1992259647898788621489529759855940937996} +#define T_16384_525 {0.9798004853314797868435448435775469988585,-0.1999775210972391803476710947506944648921} +#define T_16384_527 {0.9796468163131412110189444319985341280699,-0.2007289597629761390695790623794891871512} +#define T_16384_529 {0.9794925709938208102656176379241514950991,-0.2014802803450377599592258093252894468606} +#define T_16384_531 {0.9793377494642567793192711178562603890896,-0.2022314824014414813557749539540964178741} +#define T_16384_533 {0.9791823518155269301388443636824376881123,-0.2029825654902744636043365744626498781145} +#define T_16384_535 {0.9790263781390475816834850775194354355335,-0.2037335291696939221228745964253903366625} +#define T_16384_537 {0.9788698285265741150240614842914510518312,-0.2044843729979272384245092553101130761206} +#define T_16384_539 {0.9787127030702004182316500191518571227789,-0.2052350965332723486955757152827573008835} +#define T_16384_541 {0.9785550018623595525113501025771256536245,-0.2059856993340979380846533786098007112741} +#define T_16384_543 
{0.9783967249958230860684693652729038149118,-0.2067361809588436905027464263184810988605} +#define T_16384_545 {0.9782378725637010941085236481740139424801,-0.2074865409660206494457668213726719841361} +#define T_16384_547 {0.9780784446594423808818419274757616221905,-0.2082367789142113290168367711885366588831} +#define T_16384_549 {0.9779184413768343686612638521182816475630,-0.2089868943620700747487717308104038238525} +#define T_16384_551 {0.9777578628100027646752323562395758926868,-0.2097368868683233134042609435709891840816} +#define T_16384_553 {0.9775967090534118941747010467224754393101,-0.2104867559917697472648967504937900230289} +#define T_16384_555 {0.9774349802018642563439243531320244073868,-0.2112365012912806871980819778400473296642} +#define T_16384_557 {0.9772726763505008573673649152624420821667,-0.2119861223258003302127860933978809043765} +#define T_16384_559 {0.9771097975948008773627861955901607871056,-0.2127356186543459259929989002557704225183} +#define T_16384_561 {0.9769463440305816703812524792738258838654,-0.2134849898360080544534866930916905403137} +#define T_16384_563 {0.9767823157539986533848264116386417299509,-0.2142342354299509865622752613489865325391} +#define T_16384_565 {0.9766177128615456393134763857233338057995,-0.2149833549954128508741035830098553560674} +#define T_16384_567 {0.9764525354500540599289593046705704182386,-0.2157323480917058833306043652555672451854} +#define T_16384_569 {0.9762867836166936319486353568208869546652,-0.2164812142782167603272114320134278386831} +#define T_16384_571 {0.9761204574589719129562581656500697135925,-0.2172299531144067930021890333591727539897} +#define T_16384_573 {0.9759535570747343014019747897691559046507,-0.2179785641598122325479636174350162036717} +#define T_16384_575 {0.9757860825621639255800232604087796062231,-0.2187270469740444367445775242231320589781} +#define T_16384_577 {0.9756180340197817546510350439348258078098,-0.2194754011167903140488988356082700192928} +#define T_16384_579 
{0.9754494115464463765974301168171223253012,-0.2202236261478123791057726066355826333165} +#define T_16384_581 {0.9752802152413542202680218906607478857040,-0.2209717216269491135705038686865009367466} +#define T_16384_583 {0.9751104452040388892442024371121078729630,-0.2217196871141152159090381701389560475945} +#define T_16384_585 {0.9749401015343718279737572629528585821390,-0.2224675221693018789537177326565142720938} +#define T_16384_587 {0.9747691843325617666593529975216370075941,-0.2232152263525770119478863762196851894259} +#define T_16384_589 {0.9745976936991550543254447802610229700804,-0.2239627992240854625904944441572297364473} +#define T_16384_591 {0.9744256297350349926844614856236148625612,-0.2247102403440494333697330375798628665507} +#define T_16384_593 {0.9742529925414225022706204981659539043903,-0.2254575492727685370741852466380805708468} +#define T_16384_595 {0.9740797822198756783507178624859079718590,-0.2262047255706201853708847693269490264356} +#define T_16384_597 {0.9739059988722895688795233581913635134697,-0.2269517687980598386054964521463261917233} +#define T_16384_599 {0.9737316426008963965443854249315336346626,-0.2276986785156211723357699838743428699672} +#define T_16384_601 {0.9735567135082655587652311623969580978155,-0.2284454542839164659095985143721918575466} +#define T_16384_603 {0.9733812116973032946276589427725411951542,-0.2291920956636367967540479639865225180984} +#define T_16384_605 {0.9732051372712527959052408732532057911158,-0.2299386022155522346643863329518353566527} +#define T_16384_607 {0.9730284903336942070595227960438933223486,-0.2306849735005122303821423201952711679041} +#define T_16384_609 {0.9728512709885441811508144382969476282597,-0.2314312090794457821285590171100920997560} +#define T_16384_611 {0.9726734793400564349497017246903851628304,-0.2321773085133617131603500638448167592287} +#define T_16384_613 {0.9724951154928211938255344648496247828007,-0.2329232713633489770810314212212688289583} +#define T_16384_615 
{0.9723161795517653027687288158631417900324,-0.2336690971905768521299506801369716413319} +#define T_16384_617 {0.9721366716221522263907672822824679315090,-0.2344147855562951632268919865964562632143} +#define T_16384_619 {0.9719565918095817158572913285752292722464,-0.2351603360218347260612858917738776654005} +#define T_16384_621 {0.9717759402199901419550087666721083223820,-0.2359057481486073748477849676419282332063} +#define T_16384_623 {0.9715947169596501620247863684198819100857,-0.2366510214981063786598980414055404253304} +#define T_16384_625 {0.9714129221351709420062547906127292662859,-0.2373961556319066079634438892753678373992} +#define T_16384_627 {0.9712305558534973792816913373826537281275,-0.2381411501116648399278830083858338184655} +#define T_16384_629 {0.9710476182219111018767421228403691202402,-0.2388860044991200637376493887131800875068} +#define T_16384_631 {0.9708641093480294692596999084344133734703,-0.2396307183560935916144529755911207757890} +#define T_16384_633 {0.9706800293398061274530164155294187366962,-0.2403752912444894473953382885156315751374} +#define T_16384_635 {0.9704953783055305649440924753434956073761,-0.2411197227262946163328649618051713332534} +#define T_16384_637 {0.9703101563538281126852780289482325315475,-0.2418640123635792116285614383741631172597} +#define T_16384_639 {0.9701243635936602771607795148156583309174,-0.2426081597184968352554079729088698513806} +#define T_16384_641 {0.9699380001343239632305426312086638063192,-0.2433521643532847444912903256408753804862} +#define T_16384_643 {0.9697510660854521402640671112749259918928,-0.2440960258302642127414827655229601077735} +#define T_16384_645 {0.9695635615570131760065919479529839009047,-0.2448397437118406683165261483736685477197} +#define T_16384_647 {0.9693754866593112806683052440348546952009,-0.2455833175605040552547109200531849637628} +#define T_16384_649 {0.9691868415029859518128318995877634733915,-0.2463267469388290276111064258657279424369} +#define T_16384_651 
{0.9689976261990124184464434620167594403028,-0.2470700314094752825244682981065125204623} +#define T_16384_653 {0.9688078408587009748842433509707916527987,-0.2478131705351876434839653029484907165170} +#define T_16384_655 {0.9686174855936975358616791709209792315960,-0.2485561638787965599295404217627947218716} +#define T_16384_657 {0.9684265605159831924453328610979951918125,-0.2492990110032181905186376980054774321616} +#define T_16384_659 {0.9682350657378743230552231580077204853296,-0.2500417114714546529263827778777340427041} +#define T_16384_661 {0.9680430013720222603978982078842818737030,-0.2507842648465944956903683760174317285419} +#define T_16384_663 {0.9678503675314136245333429542370140552521,-0.2515266706918126149439274286123691126704} +#define T_16384_665 {0.9676571643293698787857692877878434956074,-0.2522689285703708095276454059785464778543} +#define T_16384_667 {0.9674633918795474407659185089869424700737,-0.2530110380456179197672383907047333195806} +#define T_16384_669 {0.9672690502959377933933637905283831059933,-0.2537529986809899940070067714259494096041} +#define T_16384_671 {0.9670741396928670408073003272875212132931,-0.2544948100400107882101963241439079865813} +#define T_16384_673 {0.9668786601849959083665453363209962844849,-0.2552364716862917104478469809691887348890} +#define T_16384_675 {0.9666826118873201867387479069293476641178,-0.2559779831835323760103051426995079964399} +#define T_16384_677 {0.9664859949151698437219693005317822098732,-0.2567193440955207184295261413353728130460} +#define T_16384_679 {0.9662888093842096903784977257600985467434,-0.2574605539861331005013767025957349687815} +#define T_16384_681 {0.9660910554104388259233360258804168552160,-0.2582016124193349249082984897540882229805} +#define T_16384_683 {0.9658927331101908597688066038244869560003,-0.2589425189591805231970056411228142678738} +#define T_16384_685 {0.9656938426001336894799464971583802253008,-0.2596832731698137664011483138892799615860} +#define T_16384_687 
{0.9654943839972695007745073780824895948172,-0.2604238746154680095301614528580103069544} +#define T_16384_689 {0.9652943574189346565006530909158755093813,-0.2611643228604664801473234092554775997996} +#define T_16384_691 {0.9650937629827995856146571895806118845940,-0.2619046174692226114366633282770635560155} +#define T_16384_693 {0.9648926008068688942032054001174401491880,-0.2626447580062400422029611490870593115687} +#define T_16384_695 {0.9646908710094810324164882331388071179390,-0.2633847440361132830055623799125896766782} +#define T_16384_697 {0.9644885737093084054905034463445190340281,-0.2641245751235275496249244042701320722699} +#define T_16384_699 {0.9642857090253574847693585070373956114054,-0.2648642508332592626629775622859597206116} +#define T_16384_701 {0.9640822770769681415714558170293457806110,-0.2656037707301763250988813069852767512202} +#define T_16384_703 {0.9638782779838142023010050252196379005909,-0.2663431343792381778001754355500452220440} +#define T_16384_705 {0.9636737118659032264034181025635916739702,-0.2670823413454962436119899393816012889147} +#define T_16384_707 {0.9634685788435759512537970294943079352379,-0.2678213911940941494016499291319632902741} +#define T_16384_709 {0.9632628790375070693130510335322469472885,-0.2685602834902678925921293284773128107190} +#define T_16384_711 {0.9630566125687043399494768891599960625172,-0.2692990177993461742289582616649568080902} +#define T_16384_713 {0.9628497795585090335279687678848858922720,-0.2700375936867505655136767472868086770177} +#define T_16384_715 {0.9626423801285957093654133132076822221279,-0.2707760107179960074041957795998314395547} +#define T_16384_717 {0.9624344144009721047083871781069319695234,-0.2715142684586906995924948660103837028146} +#define T_16384_719 {0.9622258824979790237108545625233091413975,-0.2722523664745367111272855709103168919683} +#define T_16384_721 {0.9620167845422905594787721383909229189157,-0.2729903043313299249028602844191482290626} +#define T_16384_723 
{0.9618071206569135389585767370590474456549,-0.2737280815949605372594533037045039236546} +#define T_16384_725 {0.9615968909651878560040927368390839546919,-0.2744656978314132245166945267556002363563} +#define T_16384_727 {0.9613860955907862493319271379732526838779,-0.2752031526067673095070631461567245423794} +#define T_16384_729 {0.9611747346577140804768646376032847911119,-0.2759404454871972056650974991498515009880} +#define T_16384_731 {0.9609628082903097778810774798330385237932,-0.2766775760389724170273950676346430554986} +#define T_16384_733 {0.9607503166132439487157057556032668799162,-0.2774145438284581488552760220045456662774} +#define T_16384_735 {0.9605372597515200450146721777855418622494,-0.2781513484221151411013295273733092471957} +#define T_16384_737 {0.9603236378304739195854722311196383088827,-0.2788879893865002235209260561532573774457} +#define T_16384_739 {0.9601094509757739370314766347291879355907,-0.2796244662882665932279735443444224074483} +#define T_16384_741 {0.9598946993134205296627214920590631663799,-0.2803607786941638146949173915345454588532} +#define T_16384_743 {0.9596793829697467526074206034536473453045,-0.2810969261710382638419503109616925939918} +#define T_16384_745 {0.9594635020714175066558482285472564399242,-0.2818329082858333500816172545455629006028} +#define T_16384_747 {0.9592470567454300933718513988424092531204,-0.2825687246055897938745715691766235977411} +#define T_16384_749 {0.9590300471191136599813376051315572112799,-0.2833043746974457377518774592317640781403} +#define T_16384_751 {0.9588124733201293103945772600127384066582,-0.2840398581286371904042198366369120776653} +#define T_16384_753 {0.9585943354764702162285061604052316397429,-0.2847751744664983042376604771561687812209} +#define T_16384_755 {0.9583756337164611727175156374869402498007,-0.2855103232784613198624867891339818015695} +#define T_16384_757 {0.9581563681687588207580574817257001996040,-0.2862453041320571767158753573312424123287} +#define T_16384_759 
{0.9579365389623514248640390178479719907045,-0.2869801165949155130618919429252855479717} +#define T_16384_761 {0.9577161462265588731668231048388406634331,-0.2877147602347651655918525648303329944611} +#define T_16384_763 {0.9574951900910325663929256734263617545366,-0.2884492346194342249354747309553204104304} +#define T_16384_765 {0.9572736706857551958194108010502532124519,-0.2891835393168502577054823632352054119110} +#define T_16384_767 {0.9570515881410409653184956368932034820318,-0.2899176738950407505868156476935837417841} +#define T_16384_769 {0.9568289425875353693129454768495634198189,-0.2906516379221332768700847282161703333259} +#define T_16384_771 {0.9566057341562150817537713010096922516823,-0.2913854309663556629850234003242803737521} +#define T_16384_773 {0.9563819629783877340756248486286494880915,-0.2921190525960364325896989612374454736710} +#define T_16384_775 {0.9561576291856921372414035431575030088425,-0.2928525023796048065705122098734136670828} +#define T_16384_777 {0.9559327329100981707199480297276750206947,-0.2935857798855912026425585281685926020145} +#define T_16384_779 {0.9557072742839065604414372501196339726448,-0.2943188846826274018830815748515306040645} +#define T_16384_781 {0.9554812534397487677750859802472405135632,-0.2950518163394467152649269792163977399468} +#define T_16384_783 {0.9552546705105869895291448301577474921942,-0.2957845744248842612122984974121209233999} +#define T_16384_785 {0.9550275256297141579509002440317999571562,-0.2965171585078774096899678625050000846386} +#define T_16384_787 {0.9547998189307537186820695751521270722151,-0.2972495681574658377144260157365351915359} +#define T_16384_789 {0.9545715505476596307588010859035421162844,-0.2979818029427918069096392628125613555312} +#define T_16384_791 {0.9543427206147164776339764102885965257883,-0.2987138624331003855516541989345569163561} +#define T_16384_793 {0.9541133292665388010433957788336556404829,-0.2994457461977398926578075588622596114874} +#define T_16384_795 
{0.9538833766380717671395927936828229576349,-0.3001774538061620090090286794293206185102} +#define T_16384_797 {0.9536528628645905003580196535040158778429,-0.3009089848279218881721419620589585974813} +#define T_16384_799 {0.9534217880817003054616520785202737897635,-0.3016403388326788226336816478578839451075} +#define T_16384_801 {0.9531901524253366675409893105097580701113,-0.3023715153901959662441356613271636888385} +#define T_16384_803 {0.9529579560317646969025418002274818718433,-0.3031025140703410558629116167139727622271} +#define T_16384_805 {0.9527251990375795731580410574679262936115,-0.3038333344430863558471855867537669837475} +#define T_16384_807 {0.9524918815797063231798347260337322950363,-0.3045639760785091021411119527328992262483} +#define T_16384_809 {0.9522580037953995990562816587043926119804,-0.3052944385467916688092770982621004804969} +#define T_16384_811 {0.9520235658222435670694494547205977141857,-0.3060247214182217900813043343077879399061} +#define T_16384_813 {0.9517885677981521297397193848155438899994,-0.3067548242631927823964588242233730852604} +#define T_16384_815 {0.9515530098613685927588790036679711192846,-0.3074847466522040995151598963275318965316} +#define T_16384_817 {0.9513168921504655539678196873865090310574,-0.3082144881558611104743761188728967681527} +#define T_16384_819 {0.9510802148043450143788390960253309458494,-0.3089440483448757102102888438821537420154} +#define T_16384_821 {0.9508429779622381561310362485528457909822,-0.3096734267900664305805946696636965498328} +#define T_16384_823 {0.9506051817637053424903115228516981005669,-0.3104026230623587734314128283585887402296} +#define T_16384_825 {0.9503668263486358958047617306874599307775,-0.3111316367327852661084364171983906999230} +#define T_16384_827 {0.9501279118572480975046801177086308598518,-0.3118604673724860165684447110834298655391} +#define T_16384_829 {0.9498884384300892991248588259622920304537,-0.3125891145527087133793031625828007236123} +#define T_16384_831 
{0.9496484062080354782153790438314899802208,-0.3133175778448090142980220207391539588571} +#define T_16384_833 {0.9494078153322915714085183935821987688541,-0.3140458568202507128042100248421775177121} +#define T_16384_835 {0.9491666659443906972626336937537416815758,-0.3147739510506060711669817919755587354302} +#define T_16384_837 {0.9489249581861951554628831217996776103973,-0.3155018601075559869784115107904653996229} +#define T_16384_839 {0.9486826921998950945535966638999525457621,-0.3162295835628903262204403290525078773499} +#define T_16384_841 {0.9484398681280096221613007401174400001764,-0.3169571209885081453094812786730471998453} +#define T_16384_843 {0.9481964861133855837493911167257465422153,-0.3176844719564179131410242007405031472445} +#define T_16384_845 {0.9479525462991986728411575313657522201538,-0.3184116360387377886453919018094893544912} +#define T_16384_847 {0.9477080488289522097744566053734160959721,-0.3191386128076958983434963101899484172463} +#define T_16384_849 {0.9474629938464776968132241563580464571714,-0.3198654018356305028802921697206329554319} +#define T_16384_851 {0.9472173814959348181474751982023008167744,-0.3205920026949903300916844273160677403212} +#define T_16384_853 {0.9469712119218108847817916284839157015085,-0.3213184149583349080714356205135118216276} +#define T_16384_855 {0.9467244852689211676022296160226687788963,-0.3220446381983345096600146462151315063238} +#define T_16384_857 {0.9464772016824086753317146758490707725286,-0.3227706719877707630672603045240975916386} +#define T_16384_859 {0.9462293613077438214631342816574033349752,-0.3234965158995367073835325300024123862386} +#define T_16384_861 {0.9459809642907247573262452533526811748743,-0.3242221695066370146243173167022177949548} +#define T_16384_863 {0.9457320107774771500430688320193439722061,-0.3249476323821884338194365682284114882350} +#define T_16384_865 {0.9454825009144537384386808298586402088404,-0.3256729040994197910130480977386469021440} +#define T_16384_867 
{0.9452324348484349991750264052825514227152,-0.3263979842316724888640067092637764289975} +#define T_16384_869 {0.9449818127265281475501979002729058265686,-0.3271228723524005066458641977078514173627} +#define T_16384_871 {0.9447306346961678036322496154753025621176,-0.3278475680351708443360791989107383415103} +#define T_16384_873 {0.9444789009051155481699879601364955306053,-0.3285720708536637446606221146794268861413} +#define T_16384_875 {0.9442266115014598115706689895887393504381,-0.3292963803816728041162775753036839887500} +#define T_16384_877 {0.9439737666336159849223008677654433995485,-0.3300204961931054170598542896186700090766} +#define T_16384_879 {0.9437203664503261979490389421698637306690,-0.3307444178619829422416387387784197926521} +#define T_16384_881 {0.9434664111006593190111857438751030713320,-0.3314681449624408693388488700293237343431} +#define T_16384_883 {0.9432119007340106220382835999771486967802,-0.3321916770687292075336927155149169266224} +#define T_16384_885 {0.9429568355001021195960220211418345570564,-0.3329150137552126520468220860493602231145} +#define T_16384_887 {0.9427012155489820077747253890265710651875,-0.3336381545963708616930887274065753445029} +#define T_16384_889 {0.9424450410310248882339578813116531819105,-0.3343610991667987364373004766093799844384} +#define T_16384_891 {0.9421883120969317682025234717002604156733,-0.3350838470412065839276749557029688730836} +#define T_16384_893 {0.9419310288977295053669536173401866108179,-0.3358063977944205080738981905597029253840} +#define T_16384_895 {0.9416731915847713629830195714021101593971,-0.3365287510013824090471246108791092410684} +#define T_16384_897 {0.9414148003097362327196151454700157046318,-0.3372509062371505939026405940239783376455} +#define T_16384_899 {0.9411558552246291897702690221194643527269,-0.3379728630768997765798644650203641504049} +#define T_16384_901 {0.9408963564817808267193299798236694186926,-0.3386946210959212444358001903310650959611} +#define T_16384_903 
{0.9406363042338475866088742805004585534334,-0.3394161798696233578453984591760672628880} +#define T_16384_905 {0.9403756986338115408941007444809656590223,-0.3401375389735317722461616085638524964452} +#define T_16384_907 {0.9401145398349802784210282879939768463373,-0.3408586979832894381381436232913983985782} +#define T_16384_909 {0.9398528279909866833818909981346223503351,-0.3415796564746571561954624485224485397339} +#define T_16384_911 {0.9395905632557891573597430578956846147776,-0.3423004140235135217551487585296854376793} +#define T_16384_913 {0.9393277457836713972838538211362902075052,-0.3430209702058555354398095005308277904987} +#define T_16384_915 {0.9390643757292419513404979625192936509848,-0.3437413245977984921353254321729764342308} +#define T_16384_917 {0.9388004532474347740844677900895476341248,-0.3444614767755765361023634341108845546842} +#define T_16384_919 {0.9385359784935085603052584701799787580967,-0.3451814263155426054652252787491306662560} +#define T_16384_921 {0.9382709516230471891162778774742037057877,-0.3459011727941689873233599428203888237476} +#define T_16384_923 {0.9380053727919588357764268948812969028950,-0.3466207157880473732625148386432556435466} +#define T_16384_925 {0.9377392421564769708908215761766768991947,-0.3473400548738891369104919704113854095340} +#define T_16384_927 {0.9374725598731591391654660583299119025469,-0.3480591896285256114929040904826251789927} +#define T_16384_929 {0.9372053260988879586079747241456061601639,-0.3487781196289084229000820869259769096971} +#define T_16384_931 {0.9369375409908698992822451145912054926157,-0.3494968444521095451982262147794244810939} +#define T_16384_933 {0.9366692047066361714868776289222296327353,-0.3502153636753216336963134835968958213925} +#define T_16384_935 {0.9364003174040420596213607495883479714394,-0.3509336768758583580130050449952250346541} +#define T_16384_937 {0.9361308792412670332083735047490336000919,-0.3516517836311546241212511176854604855180} +#define T_16384_939 
{0.9358608903768146358714830057579092681408,-0.3523696835187666298594422187306918203831} +#define T_16384_941 {0.9355903509695123743128419846470933407545,-0.3530873761163724755540727073821472004056} +#define T_16384_943 {0.9353192611785116072908863316115457564592,-0.3538048610017720529974383225635392591357} +#define T_16384_945 {0.9350476211632874345980326324934139847755,-0.3545221377528874895368460329336812719703} +#define T_16384_947 {0.9347754310836386970606781687820330262184,-0.3552392059477633146080677306599682196975} +#define T_16384_949 {0.9345026910996878655168984550982713699341,-0.3559560651645668483133988502231659367681} +#define T_16384_951 {0.9342294013718808187718423141632229089737,-0.3566727149815882569328095996752381324768} +#define T_16384_953 {0.9339555620609867325754294142825528979301,-0.3573891549772409415020035794441355392337} +#define T_16384_955 {0.9336811733280984126892576568934600800276,-0.3581053847300616488347202448494499549270} +#define T_16384_957 {0.9334062353346315177304859389550983905792,-0.3588214038187108601007935249072033911943} +#define T_16384_959 {0.9331307482423252253056489280425012111664,-0.3595372118219731238930592098768102005124} +#define T_16384_961 {0.9328547122132411217876324371900409460068,-0.3602528083187568896939012574875960126519} +#define T_16384_963 {0.9325781274097644235610005125636234879494,-0.3609681928880952295202177992905490100384} +#define T_16384_965 {0.9323009939946027557766683457884937524796,-0.3616833651091458379234211406583199277520} +#define T_16384_967 {0.9320233121307864854188096614961978048086,-0.3623983245611913650563451483321841806173} +#define T_16384_969 {0.9317450819816687213048567173245828598738,-0.3631130708236395276955477129376959055662} +#define T_16384_971 {0.9314663037109250920408953788864891976118,-0.3638276034760234978193693677894771099091} +#define T_16384_973 {0.9311869774825537460216651197697501629591,-0.3645419220980021801636894451803527772427} +#define T_16384_975 
{0.9309071034608751293859540965058840811253,-0.3652560262693602677330773076391778886318} +#define T_16384_977 {0.9306266818105317639719942235387861728668,-0.3659699155700087969123046605091076344252} +#define T_16384_979 {0.9303457126964884693620660982560366392136,-0.3666835895799849254217406269162893295288} +#define T_16384_981 {0.9300641962840323628824990009889006614685,-0.3673970478794527649846202166372677311301} +#define T_16384_983 {0.9297821327387721934698561199184041470289,-0.3681102900487030482601369385520229116082} +#define T_16384_985 {0.9294995222266385637155394761066418141127,-0.3688233156681539059995600382535485550761} +#define T_16384_987 {0.9292163649138840408880923860124312341213,-0.3695361243183507005127808042743708938360} +#define T_16384_989 {0.9289326609670828238662920739443507045507,-0.3702487155799664142463711868913378566504} +#define T_16384_991 {0.9286484105531305210945447470294311642647,-0.3709610890338019828504911856725811958313} +#define T_16384_993 {0.9283636138392443726274905202444642782211,-0.3716732442607865172234937745088245719671} +#define T_16384_995 {0.9280782709929630280853984913846943527460,-0.3723851808419773590230761328712105751038} +#define T_16384_997 {0.9277923821821463246095618160325102508068,-0.3730968983585606357777919583895709365606} +#define T_16384_999 {0.9275059475749751758399952450417913496494,-0.3738083963918512608870514668524265289307} +#define T_16384_1001 {0.9272189673399517939600400495692156255245,-0.3745196745232932111768775484961224719882} +#define T_16384_1003 {0.9269314416458991345848517084959894418716,-0.3752307323344599154779643868096172809601} +#define T_16384_1005 {0.9266433706619612298283072959748096764088,-0.3759415694070544766702823835657909512520} +#define T_16384_1007 {0.9263547545576028552360980938829015940428,-0.3766521853229096161719269275636179372668} +#define T_16384_1009 {0.9260655935026093077411246667907107621431,-0.3773625796639883400729331697220914065838} +#define T_16384_1011 
{0.9257758876670867387304042495088651776314,-0.3780727520123840501575784855958772823215} +#define T_16384_1013 {0.9254856372214614879112559719942510128021,-0.3787827019503205439043824753753142431378} +#define T_16384_1015 {0.9251948423364805274005107094126287847757,-0.3794924290601526251087705077225109562278} +#define T_16384_1017 {0.9249035031832109066129987695603631436825,-0.3802019329243660483719224885135190561414} +#define T_16384_1019 {0.9246116199330399743061548178957309573889,-0.3809112131255780742122851734166033565998} +#define T_16384_1021 {0.9243191927576751565354129525076132267714,-0.3816202692465373580432697053765878081322} +#define T_16384_1023 {0.9240262218291438456319042415998410433531,-0.3823291008701245052847639271931257098913} +#define T_16384_1025 {0.9237327073197931781578517984598875045776,-0.3830377075793520713631323815206997096539} +#define T_16384_1027 {0.9234386494022903679734781690058298408985,-0.3837460889573650058004261609312379732728} +#define T_16384_1035 {0.9222569871152830334182226579287089407444,-0.3865773528248139245810932607128052040935} +#define T_16384_1037 {0.9219602147582092177557910872565116733313,-0.3872846012985757813140708094579167664051} +#define T_16384_1043 {0.9210666441942736426540250249672681093216,-0.3894049780789909376288449038838734850287} +#define T_16384_1045 {0.9207677034261287918326388535206206142902,-0.3901113127395469115654691449890378862619} +#define T_16384_1053 {0.9195665254777515285056210814218502491713,-0.3929343523042694852343004185968311503530} +#define T_16384_1055 {0.9192648781549852543548695393837988376617,-0.3936395353501729310607970546698197722435} +#define T_16384_1061 {0.9183566922190217196586559111892711371183,-0.3957536934212200763205657949583837762475} +#define T_16384_1063 {0.9180528828447703793003142891393508762121,-0.3964579477074539060055258232750929892063} +#define T_16384_1069 {0.9171382150373507124996308448316995054483,-0.3985693095536863017613882220757659524679} +#define T_16384_1071 
{0.9168322464711838870599081019463483244181,-0.3992726284515409895803372819500509649515} +#define T_16384_1073 {0.9165257385562282088642405142309144139290,-0.3999757124675953345871448618709109723568} +#define T_16384_1079 {0.9156029805233202312209073170379269868135,-0.4020835510895869879810504698980366811156} +#define T_16384_1081 {0.9152943170194870470268710960226599127054,-0.4027856914437635271752924381871707737446} +#define T_16384_1087 {0.9143650965714985590082619637541938573122,-0.4048906891641176342133690013724844902754} +#define T_16384_1089 {0.9140542803840464580744651357235852628946,-0.4055918792476038703398444340564310550690} +#define T_16384_1091 {0.9137429264820113861134132093866355717182,-0.4062928307318374177015130044310353696346} +#define T_16384_1097 {0.9128056403216034953018720443651545792818,-0.4083942494662080036071927224838873371482} +#define T_16384_1099 {0.9124921373960126480540111515438184142113,-0.4090942424313209757613662986841518431902} +#define T_16384_1103 {0.9118635213427285224341289904259610921144,-0.4104935059710924094567019437818089500070} +#define T_16384_1105 {0.9115484085848339912416804509120993316174,-0.4111927757226001567403272929368540644646} +#define T_16384_1107 {0.9112327595864961882199395404313690960407,-0.4118918035799921661954670071281725540757} +#define T_16384_1109 {0.9109165745334033559288400283548980951309,-0.4125905891320482687945059296907857060432} +#define T_16384_1115 {0.9099648049072056599229085804836358875036,-0.4146854878461400661926461452821968123317} +#define T_16384_1117 {0.9096464774982795375990463071502745151520,-0.4153833000675062891993150060443440452218} +#define T_16384_1121 {0.9090082175032474509634994319640100002289,-0.4167781910219976460041380050824955105782} +#define T_16384_1123 {0.9086882852926133624649196462996769696474,-0.4174752689345443412705094488046597689390} +#define T_16384_1125 {0.9083678185240728897298367883195169270039,-0.4181721012571463225171441990823950618505} +#define T_16384_1127 
{0.9080468173861483416331452644953969866037,-0.4188686875798751096944272376276785507798} +#define T_16384_1133 {0.9070806096460084511079458025051280856133,-0.4209569664517094356170900937286205589771} +#define T_16384_1135 {0.9067574729220565510345863913244102150202,-0.4216525646785583281150877610343741253018} +#define T_16384_1137 {0.9064338027760454608028339862357825040817,-0.4223479148580670528012603881506947800517} +#define T_16384_1139 {0.9061095993983819818495817344228271394968,-0.4230430165811790432783823234785813838243} +#define T_16384_1143 {0.9054595937112932535484333129716105759144,-0.4244324730227174158336822529236087575555} +#define T_16384_1145 {0.9051337917842496860743040087982080876827,-0.4251268269237623576017881532607134431601} +#define T_16384_1151 {0.9041531919699917763111329804814886301756,-0.4272083864467963176814180314977420493960} +#define T_16384_1153 {0.9038252613284873948629183360026217997074,-0.4279017375338541318008367397851543501019} +#define T_16384_1155 {0.9034967989898684548322194132197182625532,-0.4285948368973444000396000319597078487277} +#define T_16384_1157 {0.9031678051473607249377550942881498485804,-0.4292876841295346079796502181125106289983} +#define T_16384_1161 {0.9025082237251458305848927921033464372158,-0.4306726205698268028498887360910885035992} +#define T_16384_1163 {0.9021776365334536196272097186010796576738,-0.4313647089632063891251334553089691326022} +#define T_16384_1169 {0.9011826913706845187945759789727162569761,-0.4334394499510740850212187069701030850410} +#define T_16384_1171 {0.9008499824375314490865207517344970256090,-0.4341305208601433651693923820857889950275} +#define T_16384_1173 {0.9005167435575435197492311090172734111547,-0.4348213363814122933703743001387920230627} +#define T_16384_1179 {0.8995138491984879802743080290383659303188,-0.4368922465552803613419996509037446230650} +#define T_16384_1181 {0.8991784924946353285690747725311666727066,-0.4375820364629644521059503858850803226233} +#define T_16384_1187 
{0.8981692493925180764691162949020508676767,-0.4396498600542034784766087796015199273825} +#define T_16384_1189 {0.8978317780213056487070844013942405581474,-0.4403386178557372465647290482593234628439} +#define T_16384_1191 {0.8974937784787903050087720657757017761469,-0.4410271166174072332566424847755115479231} +#define T_16384_1197 {0.8964766128133441203118536577676422894001,-0.4430910546137368788421895260398741811514} +#define T_16384_1199 {0.8961365025770867687171516990929376333952,-0.4437785131672182803441728538018651306629} +#define T_16384_1205 {0.8951130096260817570197332315728999674320,-0.4458393208299802901173336522333556786180} +#define T_16384_1207 {0.8947707918973295537767853602417744696140,-0.4465257327046513458057575007842388004065} +#define T_16384_1215 {0.8933966592941076090284013844211585819721,-0.4492687493718299229783497139578685164452} +#define T_16384_1217 {0.8930518117317074455741021665744483470917,-0.4499538438136905238451390687259845435619} +#define T_16384_1223 {0.8920141177012804734047790589102078229189,-0.4520075373504364724119852780859218910336} +#define T_16384_1225 {0.8916671699216722757341813121456652879715,-0.4526915705907009201958146604738431051373} +#define T_16384_1233 {0.8902741354006445950730608274170663207769,-0.4554250364622423608018664253904717043042} +#define T_16384_1235 {0.8899245669440967176910817215684801340103,-0.4561077341477141056103050686942879110575} +#define T_16384_1239 {0.8892238596778682113708214274083729833364,-0.4574723241679160556927286052086856216192} +#define T_16384_1241 {0.8888727212803956279074668600514996796846,-0.4581542156998931192291024672158528119326} +#define T_16384_1243 {0.8885210599820022592254531446087639778852,-0.4588358377115491770759092560183489695191} +#define T_16384_1251 {0.8871091899213001674695533438352867960930,-0.4615596225377330830141886508499737828970} +#define T_16384_1253 {0.8867549172275508428953116890625096857548,-0.4622398909362533392375382845784770324826} +#define T_16384_1257 
{0.8860448070835555967050822800956666469574,-0.4635996115618140067660135628102580085397} +#define T_16384_1259 {0.8856889700510489626950061392562929540873,-0.4642790629889658182705147737578954547644} +#define T_16384_1261 {0.8853326119905405855448066176904831081629,-0.4649582412927066865648839666391722857952} +#define T_16384_1269 {0.8839019736658094705816779423912521451712,-0.4676722152851147673580101127299712970853} +#define T_16384_1271 {0.8835430136159618808022742086905054748058,-0.4683500219818765297752349852089537307620} +#define T_16384_1273 {0.8831835338005233904112856180290691554546,-0.4690275531603871894681390131154330447316} +#define T_16384_1275 {0.8828235344309666166040528878511395305395,-0.4697048084220724617487974228424718603492} +#define T_16384_1277 {0.8824630157190701540415034287434536963701,-0.4703817873685207096023930262163048610091} +#define T_16384_1279 {0.8821019778769175756494291817944031208754,-0.4710584896014824995980063704337226226926} +#define T_16384_1287 {0.8806526394581110084303077201184350997210,-0.4737625234391828477065189417771762236953} +#define T_16384_1289 {0.8802890091566208941031845824909396469593,-0.4744378361366792806741443655482726171613} +#define T_16384_1291 {0.8799248610037868623123813449637964367867,-0.4751128697346203022533472903887741267681} +#define T_16384_1293 {0.8795601952138278889492539747152477502823,-0.4757876238359011189515967998886480927467} +#define T_16384_1295 {0.8791950120012674840808131193625740706921,-0.4764620980435812502840065008058445528150} +#define T_16384_1297 {0.8788293115809333588828167194151319563389,-0.4771362919608848063290906793554313480854} +#define T_16384_1305 {0.8773613421290651359640833106823265552521,-0.4798302567965941900496318339719437062740} +#define T_16384_1307 {0.8769930589029258927169507842336315661669,-0.4805030433161575076361771152733126655221} +#define T_16384_1309 {0.8766242597643653100547567191824782639742,-0.4811755471681603557598805309680756181479} +#define T_16384_1313 
{0.8758851146181036995130853028967976570129,-0.4825197052871843528265571876545436680317} +#define T_16384_1315 {0.8755147690452227404733775983913801610470,-0.4831913587634719120522674984385957941413} +#define T_16384_1323 {0.8740282385090756278600565565284341573715,-0.4858751262296953088437589940440375357866} +#define T_16384_1325 {0.8736553199069926334274782675493042916059,-0.4865453545130302703825009302818216383457} +#define T_16384_1327 {0.8732818873559942129958244549925439059734,-0.4872152965742687635852803396119270473719} +#define T_16384_1331 {0.8725334812862760580642884633562061935663,-0.4885543204541862305489985374151729047298} +#define T_16384_1333 {0.8721585082078244788306164991809055209160,-0.4892234014851519763311671340488828718662} +#define T_16384_1341 {0.8706534874206174334787533553026150912046,-0.4918968437002992910578313967562280595303} +#define T_16384_1343 {0.8702759512121719387423013358784373849630,-0.4925644818110106459840835668728686869144} +#define T_16384_1349 {0.8691402717112005626987070172617677599192,-0.4945656559950159514293943630036665126681} +#define T_16384_1351 {0.8687606889946553057058054037042893469334,-0.4952321326989312377797602948703570291400} +#define T_16384_1359 {0.8672372496706683975276064302306622266769,-0.4978951222734108728040780533774523064494} +#define T_16384_1361 {0.8668551138454704263480721238011028617620,-0.4985601383985251433550445199216483160853} +#define T_16384_1367 {0.8657056475794022709635555656859651207924,-0.5005534254693775331190863653318956494331} +#define T_16384_1369 {0.8653214733118897994401663709140848368406,-0.5012172660886099473387389480194542557001} +#define T_16384_1375 {0.8641658971368793018541509809438139200211,-0.5032070172658690276890069981163833290339} +#define T_16384_1377 {0.8637796880430467227895974247076082974672,-0.5038696761308989469085872769937850534916} +#define T_16384_1379 {0.8633929708098784239211909152800217270851,-0.5045320385823802711300345436029601842165} +#define T_16384_1385 
{0.8622297725508112353765000079874880611897,-0.5065173435598985252070747264951933175325} +#define T_16384_1387 {0.8618410250382453341444488614797592163086,-0.5071785164624251773801688614184968173504} +#define T_16384_1393 {0.8606717414235783847331617835152428597212,-0.5091602434547546351950586540624499320984} +#define T_16384_1395 {0.8602809672906545079840157086437102407217,-0.5098202205851154467097785527585074305534} +#define T_16384_1397 {0.8598896870766022937360162359254900366068,-0.5104798978013757038141307020850945264101} +#define T_16384_1403 {0.8587128122509635197445732046617195010185,-0.5124571260857258003795777767663821578026} +#define T_16384_1405 {0.8583195100171734370775311617762781679630,-0.5131155997666405621515650636865757405758} +#define T_16384_1409 {0.8575313909994991545815423705789726227522,-0.5144316411832229318790155048191081732512} +#define T_16384_1411 {0.8571365746792448669921782311575952917337,-0.5150892081446972747826862359943334013224} +#define T_16384_1413 {0.8567412541276274717816363590827677398920,-0.5157464720924613832409022506908513605595} +#define T_16384_1415 {0.8563454295772036095613088946265634149313,-0.5164034326398639906940957189362961798906} +#define T_16384_1421 {0.8551549342631096228828369021357502788305,-0.5183724900160661119841165600519161671400} +#define T_16384_1423 {0.8547570960489572211571385196293704211712,-0.5190282330990808601356434337503742426634} +#define T_16384_1427 {0.8539599113602541757828134905139449983835,-0.5203388028867219627926488101365976035595} +#define T_16384_1429 {0.8535605653546668447617662422999273985624,-0.5209936288203739218616306061448995023966} +#define T_16384_1431 {0.8531607172213904188851074650301598012447,-0.5216481482668970937055519243585877120495} +#define T_16384_1433 {0.8527603671956452968672124370641540735960,-0.5223023608412546980872548374463804066181} +#define T_16384_1439 {0.8515563081202289774651603693200740963221,-0.5242631534836733608884173918340820819139} +#define T_16384_1441 
{0.8511539528827153366918878418800886720419,-0.5249161347226130036602853579097427427769} +#define T_16384_1443 {0.8507510969332607864146211795741692185402,-0.5255688071669146754771873020217753946781} +#define T_16384_1445 {0.8503477405088549767597783102246467024088,-0.5262211704326280603538634750293567776680} +#define T_16384_1449 {0.8495395271846208906652009318349882960320,-0.5275249678933982000472724394057877361774} +#define T_16384_1451 {0.8491346707602436261197453859494999051094,-0.5281764013214643727067709733091760426760} +#define T_16384_1457 {0.8479171052969514121855354460421949625015,-0.5301288357982788523869999153248500078917} +#define T_16384_1459 {0.8475102522083143297138008165347855538130,-0.5307790240785701385561878851149231195450} +#define T_16384_1461 {0.8471029005512314968839859830040950328112,-0.5314289001152368019376126540009863674641} +#define T_16384_1463 {0.8466950505653374481340733836987055838108,-0.5320784635259735440016015672881621867418} +#define T_16384_1467 {0.8458778565671190019514824598445557057858,-0.5333766509413554457452733004174660891294} +#define T_16384_1469 {0.8454685130355288347203668308793567121029,-0.5340252741823103832530250656418502330780} +#define T_16384_1475 {0.8442374992013870205198600160656496882439,-0.5359692574599667080903486748866271227598} +#define T_16384_1477 {0.8438261676481867423760263591248076409101,-0.5366166218001211518640047870576381683350} +#define T_16384_1479 {0.8434143396937927583678629162022843956947,-0.5372636704625425307924047046981286257505} +#define T_16384_1485 {0.8421758798475855734011474851286038756371,-0.5392029185779182443027934823476243764162} +#define T_16384_1487 {0.8417620687140124902114735050417948514223,-0.5398487007248475855192282324424013495445} +#define T_16384_1493 {0.8405176651668625487090480419283267110586,-0.5417841401724915462168041813129093497992} +#define T_16384_1495 {0.8401018747490583971071487212611827999353,-0.5424286497255813577211824849655386060476} +#define T_16384_1497 
{0.8396855901209661077189139177789911627769,-0.5430728401818718520388529213960282504559} +#define T_16384_1503 {0.8384337734253083374014181572420056909323,-0.5450034931812811622720005289011169224977} +#define T_16384_1505 {0.8380155144078637041715751365700270980597,-0.5456464034626485881673829680948983877897} +#define T_16384_1511 {0.8367577804435671895433301870070863515139,-0.5475732068565398735771054816723335534334} +#define T_16384_1513 {0.8363375509735835322544517111964523792267,-0.5482148309116677831198671810852829366922} +#define T_16384_1521 {0.8346517156117563285278038165415637195110,-0.5507780983539122265923992927127983421087} +#define T_16384_1523 {0.8342290286404934196440308369346894323826,-0.5514181061350260648623589077033102512360} +#define T_16384_1529 {0.8329580241901066717247203996521420776844,-0.5533361816629323026717202083091251552105} +#define T_16384_1531 {0.8325333756918886773945587265188805758953,-0.5539748896466955008222043943533208221197} +#define T_16384_1539 {0.8308298866220835687812495962134562432766,-0.5565264589357237223765650924178771674633} +#define T_16384_1541 {0.8304027918380475492554637639841530472040,-0.5571635337201963356079659206443466246128} +#define T_16384_1545 {0.8295471370078089101696150464704260230064,-0.5584366996197041022043094926630146801472} +#define T_16384_1547 {0.8291185774649659778745558469381649047136,-0.5590727899857684812801039697660598903894} +#define T_16384_1549 {0.8286895301730257124006584490416571497917,-0.5597085514637147873884259752230718731880} +#define T_16384_1557 {0.8269684685665414924926608364330604672432,-0.5622483010171831496748495737847406417131} +#define T_16384_1559 {0.8265369863208099632245762222737539559603,-0.5628824124483844393296294583706185221672} +#define T_16384_1563 {0.8256725633922213924975608279055450111628,-0.5641496415502876837777535001805517822504} +#define T_16384_1565 {0.8252396232178821344760422107356134802103,-0.5647827584755114038728152081603184342384} +#define T_16384_1567 
{0.8248061975763343323109211269184015691280,-0.5654155431535896614292369122267700731754} +#define T_16384_1575 {0.8230676454418016652070377858763094991446,-0.5679433519523655604643863625824451446533} +#define T_16384_1577 {0.8226317962945149941944578131369780749083,-0.5685744698148691433914336812449619174004} +#define T_16384_1579 {0.8221954632141371677178653953887987881899,-0.5692052531996611985931622257339768111706} +#define T_16384_1581 {0.8217586464573517490705967247777152806520,-0.5698357017356679987685197374958079308271} +#define T_16384_1583 {0.8213213462811267406848969585553277283907,-0.5704658150520129922256273857783526182175} +#define T_16384_1585 {0.8208835629427145841319202190788928419352,-0.5710955927780166918594773051154334098101} +#define T_16384_1593 {0.8191276031221882414357082780043128877878,-0.5736113403719446113271374088071752339602} +#define T_16384_1595 {0.8186874078415696809329915595299098640680,-0.5742394345929678900475323644059244543314} +#define T_16384_1597 {0.8182467309482420736443941677862312644720,-0.5748671910037267407389549589424859732389} +#define T_16384_1599 {0.8178055727014442721767295552126597613096,-0.5754946092349282338318516849540174007416} +#define T_16384_1601 {0.8173639333606984580526955141976941376925,-0.5761216889174783917226818630297202616930} +#define T_16384_1603 {0.8169218131858094755770594019850250333548,-0.5767484296824824108185225668421480804682} +#define T_16384_1611 {0.8151485293508209384327756197308190166950,-0.5792519961961235530623071099398657679558} +#define T_16384_1613 {0.8147040089121870831689875558367930352688,-0.5798770368469603475602980324765667319298} +#define T_16384_1615 {0.8142590092041752658502673511975444853306,-0.5805017363710766042927957641950342804193} +#define T_16384_1619 {0.8133675730274265713859449533629231154919,-0.5817501105693696494824962428538128733635} +#define T_16384_1621 {0.8129211370830987659630295638635288923979,-0.5823737845091601084135390919982455670834} +#define T_16384_1629 
{0.8111306137306691921295964675664436072111,-0.5848650506475044918985872755001764744520} +#define T_16384_1631 {0.8106817893154306675640441426367033272982,-0.5854870079449514541636290232418105006218} +#define T_16384_1633 {0.8102324879969823312464427544909995049238,-0.5861086208154764332078912048018537461758} +#define T_16384_1637 {0.8093324557079858427854901492537464946508,-0.5873508118132476640838035564229357987642} +#define T_16384_1639 {0.8088817252669036061973883988684974610806,-0.5879713892097450100848732290614861994982} +#define T_16384_1647 {0.8070740477155176062495911537553183734417,-0.5904502362638958112839304703811649233103} +#define T_16384_1649 {0.8066209407101696537978341439156793057919,-0.5910690805716714013584578424342907965183} +#define T_16384_1655 {0.8052587736758222147770425181079190224409,-0.5929235257755512966681976649852003902197} +#define T_16384_1657 {0.8048037702153028050133798387832939624786,-0.5935409770582263933036415437527466565371} +#define T_16384_1665 {0.8029790246008431386925963124667759984732,-0.5960072869110565330075246492924634367228} +#define T_16384_1667 {0.8025216565959463199675383293651975691319,-0.5966229887412133336965780472382903099060} +#define T_16384_1673 {0.8011467210419912499119732274266425520182,-0.5984679869163143095533996529411524534225} +#define T_16384_1675 {0.8006874662429616096304130223870743066072,-0.5990822826635971987130346860794816166162} +#define T_16384_1681 {0.7993068767850861622292768515762872993946,-0.6009230539129540860088241061021108180285} +#define T_16384_1683 {0.7988457395146045803002721186203416436911,-0.6015359377953776176539690823119599372149} +#define T_16384_1685 {0.7983841323037563775244507269235327839851,-0.6021484678097072107405551832925993949175} +#define T_16384_1691 {0.7969964937459087517268585543206427246332,-0.6039839310418180229333984243567101657391} +#define T_16384_1693 {0.7965330094918720016394786398450378328562,-0.6045950419825003585572176234563812613487} +#define T_16384_1699 
{0.7951397463426795875207631070225033909082,-0.6064262393614735513480695772159378975630} +#define T_16384_1701 {0.7946743894079445480471690643753390759230,-0.6070359254764996492781392589677125215530} +#define T_16384_1703 {0.7942085649867406393909163853095378726721,-0.6076452544879308304004439378331881016493} +#define T_16384_1709 {0.7928082895460141221732897065521683543921,-0.6094710953171802447769778154906816780567} +#define T_16384_1711 {0.7923405979220070616264592899824492633343,-0.6100789923318096219517769895901437848806} +#define T_16384_1715 {0.7914038166087195014242183788155671209097,-0.6112937093224108942024486168520525097847} +#define T_16384_1717 {0.7909347274705232910818608615954872220755,-0.6119005285837960661510237514448817819357} +#define T_16384_1719 {0.7904651730458048808358739734103437513113,-0.6125069878798654610108087581465952098370} +#define T_16384_1721 {0.7899951536107910898820705369871575385332,-0.6131130868538549050228425585373770445585} +#define T_16384_1727 {0.7885823080103472326740643438824918121099,-0.6149292182788795857462105232116300612688} +#define T_16384_1729 {0.7881104313018880702657043002545833587646,-0.6155338724011474305086721869884058833122} +#define T_16384_1733 {0.7871652872876508943988937971880659461021,-0.6167420939820388303687082043325062841177} +#define T_16384_1735 {0.7866920205378767905202153087884653359652,-0.6173456607298968279451401031110435724258} +#define T_16384_1737 {0.7862182909974556599408401780237909406424,-0.6179488643092082567065403964079450815916} +#define T_16384_1739 {0.7857440989450703572316569989197887480259,-0.6185517043651238555668214758043177425861} +#define T_16384_1745 {0.7843187505070389198280622622405644506216,-0.6203580398472138268672892991162370890379} +#define T_16384_1747 {0.7838427111990652340622887095378246158361,-0.6209594222653351813789868174353614449501} +#define T_16384_1749 {0.7833662107766197202352032036287710070610,-0.6215604393890271639477873577561695128679} +#define T_16384_1751 
{0.7828892495200154844070539184031076729298,-0.6221610908647268178839340180275030434132} +#define T_16384_1755 {0.7819339456269375210339944715087767690420,-0.6233612954589733412547047919360920786858} +#define T_16384_1757 {0.7814556035524445887219258111144881695509,-0.6239608478714706585321891907369717955589} +#define T_16384_1763 {0.7800178201947159939422249408380594104528,-0.6257573013386928950652077219274360686541} +#define T_16384_1765 {0.7795376409705132569882835014141164720058,-0.6263553833967799855386715535132680088282} +#define T_16384_1767 {0.7790570031644006299131888226838782429695,-0.6269530969861326630265807580144610255957} +#define T_16384_1769 {0.7785759070591249386694698841893114149570,-0.6275504417551315272660872324195224791765} +#define T_16384_1773 {0.7776123410834199223273799361777491867542,-0.6287440234266747918212558943196199834347} +#define T_16384_1775 {0.7771298717798316157967519757221452891827,-0.6293402596270657456400954288255888968706} +#define T_16384_1781 {0.7756797220128206493683364897151477634907,-0.6311267454783653363392659230157732963562} +#define T_16384_1783 {0.7751954257529413139238272378861438483000,-0.6317214986777923702021553253871388733387} +#define T_16384_1785 {0.7747106734655656579136007167107891291380,-0.6323158802517375720597669896960724145174} +#define T_16384_1791 {0.7732536832914725932752730841457378119230,-0.6340967917251837393521896046877373009920} +#define T_16384_1793 {0.7727671097484638540464629841153509914875,-0.6346896833027977358199223090196028351784} +#define T_16384_1799 {0.7713046626718447207338158477796241641045,-0.6364661164120771763563766398874577134848} +#define T_16384_1801 {0.7708162724530185361260237186797894537449,-0.6370575124128385890998060858692042529583} +#define T_16384_1803 {0.7703274287828388855459138540027197450399,-0.6376485336490788080610059296304825693369} +#define T_16384_1809 {0.7688581799412532724602442613104358315468,-0.6394193452949507028293396615481469780207} +#define T_16384_1811 
{0.7683675253440662711312825194909237325191,-0.6400088639984884419931177035323344171047} +#define T_16384_1817 {0.7668928506434806724456620941055007278919,-0.6417751597186635015290789851860608905554} +#define T_16384_1819 {0.7664003897375141205472459660086315125227,-0.6423631703407243209724697408091742545366} +#define T_16384_1827 {0.7644260404786120677300687020760960876942,-0.6447114305161584235648319918254856020212} +#define T_16384_1829 {0.7639313282069510879779272727319039404392,-0.6452975482550383778956870628462638705969} +#define T_16384_1835 {0.7624444961506871010215036221779882907867,-0.6470536224220716503552353060513269156218} +#define T_16384_1837 {0.7619479880233553936719204102701041847467,-0.6476382196467103113945995573885738849640} +#define T_16384_1845 {0.7599574760951103291617414470238145440817,-0.6499727952208075310025492399290669709444} +#define T_16384_1847 {0.7594587297220282140486347088881302624941,-0.6505554840665039861846707935910671949387} +#define T_16384_1851 {0.7584598969595154338563247620186302810907,-0.6517197133002509135124569183972198516130} +#define T_16384_1853 {0.7579598111576723029614299775857944041491,-0.6523012530034154599078988212568219751120} +#define T_16384_1855 {0.7574592794676007212473223262350074946880,-0.6528824089745589587252538876782637089491} +#define T_16384_1863 {0.7554526997179581426777872366074007004499,-0.6552031887047318203798340618959628045559} +#define T_16384_1865 {0.7549499430087326379279488719475921243429,-0.6557824208921061437393973392318002879620} +#define T_16384_1869 {0.7539430975334996398729003885819111019373,-0.6569397275866271090905001983628608286381} +#define T_16384_1871 {0.7534390093597935766922546463320031762123,-0.6575178014129601233861421860638074576855} +#define T_16384_1873 {0.7529344779573302615105490076530259102583,-0.6580954884385111780531474323652219027281} +#define T_16384_1881 {0.7509119259998680018242112055304460227489,-0.6604023617395450296285730473755393177271} +#define T_16384_1883 
{0.7504051829108693283387765404768288135529,-0.6609781096681680567783701008011121302843} +#define T_16384_1885 {0.7498979983778352176315706856257747858763,-0.6615534687603989993931463686749339103699} +#define T_16384_1887 {0.7493903726991295588533148475107736885548,-0.6621284386777687158698313396598678082228} +#define T_16384_1889 {0.7488823061733751451640728191705420613289,-0.6627030190820374366822420597600284963846} +#define T_16384_1891 {0.7483737990994545619116706802742555737495,-0.6632772096351940982472683572268579155207} +#define T_16384_1899 {0.7463353713088263230446273155394010245800,-0.6655700665845154517796800064388662576675} +#define T_16384_1901 {0.7458246659863759786546211216773372143507,-0.6661423028199835449925103603163734078407} +#define T_16384_1903 {0.7453135219144905176946735991805326193571,-0.6667141471810976716128038788156118243933} +#define T_16384_1905 {0.7448019393938626331319596829416695982218,-0.6672855993314564804208544046559836715460} +#define T_16384_1907 {0.7442899187254431447868796567490790039301,-0.6678566589348894355637753506016451865435} +#define T_16384_1909 {0.7437774602104407772884542282554320991039,-0.6684273256554568165554996994615066796541} +#define T_16384_1917 {0.7417232537177841367181940768205095082521,-0.6707060569983721576292623467452358454466} +#define T_16384_1919 {0.7412086104970042610418090589519124478102,-0.6712747542736134942487069565686397254467} +#define T_16384_1921 {0.7406935312422956441125165838457178324461,-0.6718430566552119342915716515562962740660} +#define T_16384_1925 {0.7396620658433800121756007683870848268270,-0.6729784754004420888051640758931171149015} +#define T_16384_1927 {0.7391456803059573976710794340760912746191,-0.6735455910961360981303869266412220895290} +#define T_16384_1935 {0.7370757929942656216937280078127514570951,-0.6758100882510369444844400277361273765564} +#define T_16384_1937 {0.7365572363979191461424989029183052480221,-0.6763752194676116991089998009556438773870} +#define T_16384_1939 
{0.7360382465039274624274412417435087263584,-0.6769399527900711310834935829916503280401} +#define T_16384_1943 {0.7349989680444966033689979667542502284050,-0.6780682244240066047780146618606522679329} +#define T_16384_1945 {0.7344786800904383738952674320898950099945,-0.6786317620717494669690950104268267750740} +#define T_16384_1953 {0.7323932105898960376322293086559511721134,-0.6808819171352872290370328300923574715853} +#define T_16384_1955 {0.7318707653272182866999173711519688367844,-0.6814434553646778747193479830457363277674} +#define T_16384_1961 {0.7303008475255254872138266364345327019691,-0.6831256634789086845671590708661824464798} +#define T_16384_1963 {0.7297766819465660859123090631328523159027,-0.6836855962261166874682771776861045509577} +#define T_16384_1971 {0.7276757296298496102693320608523208647966,-0.6859213019783435560938755770621355623007} +#define T_16384_1973 {0.7271494205953710210366125465952791273594,-0.6864792204632389482910070910293143242598} +#define T_16384_1979 {0.7255679281520323042897757659375201910734,-0.6881505515780448334339780558366328477859} +#define T_16384_1981 {0.7250399099246752632197399179858621209860,-0.6887068527439077492147134762490168213844} +#define T_16384_1987 {0.7234532973525443777518262322701048105955,-0.6903733240426740414008577317872550338507} +#define T_16384_1989 {0.7229235749022177026645863406884018331766,-0.6909280026533862750781622708018403500319} +#define T_16384_1991 {0.7223934271745775514972365272114984691143,-0.6914822748089558546169541841663885861635} +#define T_16384_1997 {0.7208004354477492992003817562363110482693,-0.6931426492853653975600991543615236878395} +#define T_16384_1999 {0.7202685897320770802565448320819996297359,-0.6936952923621182431901388554251752793789} +#define T_16384_2005 {0.7186705115450672254340247491199988871813,-0.6953507717947476907127679623954463750124} +#define T_16384_2007 {0.7181369728472974944821771714487113058567,-0.6959017805909968323874181805877014994621} +#define T_16384_2009 
{0.7176030116880490750119747644930612295866,-0.6964523800061578340248047425120603293180} +#define T_16384_2015 {0.7159985965838286947615642930031754076481,-0.6981017187272838819822595723962876945734} +#define T_16384_2017 {0.7154629487223036488074967564898543059826,-0.6986506773814695758773041234235279262066} +#define T_16384_2021 {0.7143903906493513922271176852518692612648,-0.6997473613725649910222159633121918886900} +#define T_16384_2023 {0.7138534810688824672197938525641802698374,-0.7002950860643237795954973989864811301231} +#define T_16384_2025 {0.7133161515468026125930123271245975047350,-0.7008423987905262331210565207584295421839} +#define T_16384_2027 {0.7127784023992089768029245533398352563381,-0.7013892992292022343647772686381358653307} +#define T_16384_2033 {0.7111626403680183505784384578873869031668,-0.7030275236040113284730068698991090059280} +#define T_16384_2035 {0.7106232158842750168403767929703462868929,-0.7035727716777355755795042568934150040150} +#define T_16384_2039 {0.7095431131103767663503845142258796840906,-0.7046620258234689293530550457944627851248} +#define T_16384_2041 {0.7090024354556182517583806657057721167803,-0.7052060312546978293468669107824098318815} +#define T_16384_2043 {0.7084613407129940476636420498834922909737,-0.7057496218313877855976556929817888885736} +#define T_16384_2045 {0.7079198292008163084787497609795536845922,-0.7062927972337584847650759911630302667618} +#define T_16384_2057 {0.7046620258234689293530550457944627851248,-0.7095431131103767663503845142258796840906} +#define T_16384_2061 {0.7035727716777355755795042568934150040150,-0.7106232158842750168403767929703462868929} +#define T_16384_2079 {0.6986506773814695758773041234235279262066,-0.7154629487223036488074967564898543059826} +#define T_16384_2091 {0.6953507717947476907127679623954463750124,-0.7186705115450672254340247491199988871813} +#define T_16384_2097 {0.6936952923621182431901388554251752793789,-0.7202685897320770802565448320819996297359} +#define T_16384_2115 
{0.6887068527439077492147134762490168213844,-0.7250399099246752632197399179858621209860} +#define T_16384_2125 {0.6859213019783435560938755770621355623007,-0.7276757296298496102693320608523208647966} +#define T_16384_2133 {0.6836855962261166874682771776861045509577,-0.7297766819465660859123090631328523159027} +#define T_16384_2151 {0.6786317620717494669690950104268267750740,-0.7344786800904383738952674320898950099945} +#define T_16384_2159 {0.6763752194676116991089998009556438773870,-0.7365572363979191461424989029183052480221} +#define T_16384_2169 {0.6735455910961360981303869266412220895290,-0.7391456803059573976710794340760912746191} +#define T_16384_2187 {0.6684273256554568165554996994615066796541,-0.7437774602104407772884542282554320991039} +#define T_16384_2193 {0.6667141471810976716128038788156118243933,-0.7453135219144905176946735991805326193571} +#define T_16384_2205 {0.6632772096351940982472683572268579155207,-0.7483737990994545619116706802742555737495} +#define T_16384_2223 {0.6580954884385111780531474323652219027281,-0.7529344779573302615105490076530259102583} +#define T_16384_2227 {0.6569397275866271090905001983628608286381,-0.7539430975334996398729003885819111019373} +#define T_16384_2241 {0.6528824089745589587252538876782637089491,-0.7574592794676007212473223262350074946880} +#define T_16384_2259 {0.6476382196467103113945995573885738849640,-0.7619479880233553936719204102701041847467} +#define T_16384_2261 {0.6470536224220716503552353060513269156218,-0.7624444961506871010215036221779882907867} +#define T_16384_2277 {0.6423631703407243209724697408091742545366,-0.7664003897375141205472459660086315125227} +#define T_16384_2295 {0.6370575124128385890998060858692042529583,-0.7708162724530185361260237186797894537449} +#define T_16384_2313 {0.6317214986777923702021553253871388733387,-0.7751954257529413139238272378861438483000} +#define T_16384_2329 {0.6269530969861326630265807580144610255957,-0.7790570031644006299131888226838782429695} +#define T_16384_2331 
{0.6263553833967799855386715535132680088282,-0.7795376409705132569882835014141164720058} +#define T_16384_2349 {0.6209594222653351813789868174353614449501,-0.7838427111990652340622887095378246158361} +#define T_16384_2363 {0.6167420939820388303687082043325062841177,-0.7871652872876508943988937971880659461021} +#define T_16384_2367 {0.6155338724011474305086721869884058833122,-0.7881104313018880702657043002545833587646} +#define T_16384_2385 {0.6100789923318096219517769895901437848806,-0.7923405979220070616264592899824492633343} +#define T_16384_2397 {0.6064262393614735513480695772159378975630,-0.7951397463426795875207631070225033909082} +#define T_16384_2403 {0.6045950419825003585572176234563812613487,-0.7965330094918720016394786398450378328562} +#define T_16384_2421 {0.5990822826635971987130346860794816166162,-0.8006874662429616096304130223870743066072} +#define T_16384_2431 {0.5960072869110565330075246492924634367228,-0.8029790246008431386925963124667759984732} +#define T_16384_2439 {0.5935409770582263933036415437527466565371,-0.8048037702153028050133798387832939624786} +#define T_16384_2457 {0.5879713892097450100848732290614861994982,-0.8088817252669036061973883988684974610806} +#define T_16384_2465 {0.5854870079449514541636290232418105006218,-0.8106817893154306675640441426367033272982} +#define T_16384_2475 {0.5823737845091601084135390919982455670834,-0.8129211370830987659630295638635288923979} +#define T_16384_2493 {0.5767484296824824108185225668421480804682,-0.8169218131858094755770594019850250333548} +#define T_16384_2499 {0.5748671910037267407389549589424859732389,-0.8182467309482420736443941677862312644720} +#define T_16384_2511 {0.5710955927780166918594773051154334098101,-0.8208835629427145841319202190788928419352} +#define T_16384_2529 {0.5654155431535896614292369122267700731754,-0.8248061975763343323109211269184015691280} +#define T_16384_2533 {0.5641496415502876837777535001805517822504,-0.8256725633922213924975608279055450111628} +#define T_16384_2547 
{0.5597085514637147873884259752230718731880,-0.8286895301730257124006584490416571497917} +#define T_16384_2565 {0.5539748896466955008222043943533208221197,-0.8325333756918886773945587265188805758953} +#define T_16384_2567 {0.5533361816629323026717202083091251552105,-0.8329580241901066717247203996521420776844} +#define T_16384_2583 {0.5482148309116677831198671810852829366922,-0.8363375509735835322544517111964523792267} +#define T_16384_2601 {0.5424286497255813577211824849655386060476,-0.8401018747490583971071487212611827999353} +#define T_16384_2619 {0.5366166218001211518640047870576381683350,-0.8438261676481867423760263591248076409101} +#define T_16384_2635 {0.5314289001152368019376126540009863674641,-0.8471029005512314968839859830040950328112} +#define T_16384_2637 {0.5307790240785701385561878851149231195450,-0.8475102522083143297138008165347855538130} +#define T_16384_2655 {0.5249161347226130036602853579097427427769,-0.8511539528827153366918878418800886720419} +#define T_16384_2669 {0.5203388028867219627926488101365976035595,-0.8539599113602541757828134905139449983835} +#define T_16384_2673 {0.5190282330990808601356434337503742426634,-0.8547570960489572211571385196293704211712} +#define T_16384_2691 {0.5131155997666405621515650636865757405758,-0.8583195100171734370775311617762781679630} +#define T_16384_2703 {0.5091602434547546351950586540624499320984,-0.8606717414235783847331617835152428597212} +#define T_16384_2709 {0.5071785164624251773801688614184968173504,-0.8618410250382453341444488614797592163086} +#define T_16384_2727 {0.5012172660886099473387389480194542557001,-0.8653214733118897994401663709140848368406} +#define T_16384_2737 {0.4978951222734108728040780533774523064494,-0.8672372496706683975276064302306622266769} +#define T_16384_2745 {0.4952321326989312377797602948703570291400,-0.8687606889946553057058054037042893469334} +#define T_16384_2763 {0.4892234014851519763311671340488828718662,-0.8721585082078244788306164991809055209160} +#define T_16384_2771 
{0.4865453545130302703825009302818216383457,-0.8736553199069926334274782675493042916059} +#define T_16384_2781 {0.4831913587634719120522674984385957941413,-0.8755147690452227404733775983913801610470} +#define T_16384_2799 {0.4771362919608848063290906793554313480854,-0.8788293115809333588828167194151319563389} +#define T_16384_2805 {0.4751128697346203022533472903887741267681,-0.8799248610037868623123813449637964367867} +#define T_16384_2817 {0.4710584896014824995980063704337226226926,-0.8821019778769175756494291817944031208754} +#define T_16384_2835 {0.4649582412927066865648839666391722857952,-0.8853326119905405855448066176904831081629} +#define T_16384_2839 {0.4635996115618140067660135628102580085397,-0.8860448070835555967050822800956666469574} +#define T_16384_2853 {0.4588358377115491770759092560183489695191,-0.8885210599820022592254531446087639778852} +#define T_16384_2871 {0.4526915705907009201958146604738431051373,-0.8916671699216722757341813121456652879715} +#define T_16384_2873 {0.4520075373504364724119852780859218910336,-0.8920141177012804734047790589102078229189} +#define T_16384_2889 {0.4465257327046513458057575007842388004065,-0.8947707918973295537767853602417744696140} +#define T_16384_2907 {0.4403386178557372465647290482593234628439,-0.8978317780213056487070844013942405581474} +#define T_16384_2925 {0.4341305208601433651693923820857889950275,-0.9008499824375314490865207517344970256090} +#define T_16384_2941 {0.4285948368973444000396000319597078487277,-0.9034967989898684548322194132197182625532} +#define T_16384_2943 {0.4279017375338541318008367397851543501019,-0.9038252613284873948629183360026217997074} +#define T_16384_2961 {0.4216525646785583281150877610343741253018,-0.9067574729220565510345863913244102150202} +#define T_16384_2975 {0.4167781910219976460041380050824955105782,-0.9090082175032474509634994319640100002289} +#define T_16384_2979 {0.4153833000675062891993150060443440452218,-0.9096464774982795375990463071502745151520} +#define T_16384_2997 
{0.4090942424313209757613662986841518431902,-0.9124921373960126480540111515438184142113} +#define T_16384_3009 {0.4048906891641176342133690013724844902754,-0.9143650965714985590082619637541938573122} +#define T_16384_3015 {0.4027856914437635271752924381871707737446,-0.9152943170194870470268710960226599127054} +#define T_16384_3033 {0.3964579477074539060055258232750929892063,-0.9180528828447703793003142891393508762121} +#define T_16384_3043 {0.3929343523042694852343004185968311503530,-0.9195665254777515285056210814218502491713} +#define T_16384_3051 {0.3901113127395469115654691449890378862619,-0.9207677034261287918326388535206206142902} +#define T_16384_3069 {0.3837460889573650058004261609312379732728,-0.9234386494022903679734781690058298408985} +#define T_16384_3077 {0.3809112131255780742122851734166033565998,-0.9246116199330399743061548178957309573889} +#define T_16384_3087 {0.3773625796639883400729331697220914065838,-0.9260655935026093077411246667907107621431} +#define T_16384_3105 {0.3709610890338019828504911856725811958313,-0.9286484105531305210945447470294311642647} +#define T_16384_3111 {0.3688233156681539059995600382535485550761,-0.9294995222266385637155394761066418141127} +#define T_16384_3123 {0.3645419220980021801636894451803527772427,-0.9311869774825537460216651197697501629591} +#define T_16384_3141 {0.3581053847300616488347202448494499549270,-0.9336811733280984126892576568934600800276} +#define T_16384_3145 {0.3566727149815882569328095996752381324768,-0.9342294013718808187718423141632229089737} +#define T_16384_3159 {0.3516517836311546241212511176854604855180,-0.9361308792412670332083735047490336000919} +#define T_16384_3177 {0.3451814263155426054652252787491306662560,-0.9385359784935085603052584701799787580967} +#define T_16384_3179 {0.3444614767755765361023634341108845546842,-0.9388004532474347740844677900895476341248} +#define T_16384_3195 {0.3386946210959212444358001903310650959611,-0.9408963564817808267193299798236694186926} +#define T_16384_3213 
{0.3321916770687292075336927155149169266224,-0.9432119007340106220382835999771486967802} +#define T_16384_3231 {0.3256729040994197910130480977386469021440,-0.9454825009144537384386808298586402088404} +#define T_16384_3247 {0.3198654018356305028802921697206329554319,-0.9474629938464776968132241563580464571714} +#define T_16384_3249 {0.3191386128076958983434963101899484172463,-0.9477080488289522097744566053734160959721} +#define T_16384_3267 {0.3125891145527087133793031625828007236123,-0.9498884384300892991248588259622920304537} +#define T_16384_3281 {0.3074847466522040995151598963275318965316,-0.9515530098613685927588790036679711192846} +#define T_16384_3285 {0.3060247214182217900813043343077879399061,-0.9520235658222435670694494547205977141857} +#define T_16384_3303 {0.2994457461977398926578075588622596114874,-0.9541133292665388010433957788336556404829} +#define T_16384_3315 {0.2950518163394467152649269792163977399468,-0.9554812534397487677750859802472405135632} +#define T_16384_3321 {0.2928525023796048065705122098734136670828,-0.9561576291856921372414035431575030088425} +#define T_16384_3339 {0.2862453041320571767158753573312424123287,-0.9581563681687588207580574817257001996040} +#define T_16384_3349 {0.2825687246055897938745715691766235977411,-0.9592470567454300933718513988424092531204} +#define T_16384_3357 {0.2796244662882665932279735443444224074483,-0.9601094509757739370314766347291879355907} +#define T_16384_3375 {0.2729903043313299249028602844191482290626,-0.9620167845422905594787721383909229189157} +#define T_16384_3383 {0.2700375936867505655136767472868086770177,-0.9628497795585090335279687678848858922720} +#define T_16384_3393 {0.2663431343792381778001754355500452220440,-0.9638782779838142023010050252196379005909} +#define T_16384_3411 {0.2596832731698137664011483138892799615860,-0.9656938426001336894799464971583802253008} +#define T_16384_3417 {0.2574605539861331005013767025957349687815,-0.9662888093842096903784977257600985467434} +#define T_16384_3429 
{0.2530110380456179197672383907047333195806,-0.9674633918795474407659185089869424700737} +#define T_16384_3447 {0.2463267469388290276111064258657279424369,-0.9691868415029859518128318995877634733915} +#define T_16384_3451 {0.2448397437118406683165261483736685477197,-0.9695635615570131760065919479529839009047} +#define T_16384_3465 {0.2396307183560935916144529755911207757890,-0.9708641093480294692596999084344133734703} +#define T_16384_3483 {0.2329232713633489770810314212212688289583,-0.9724951154928211938255344648496247828007} +#define T_16384_3485 {0.2321773085133617131603500638448167592287,-0.9726734793400564349497017246903851628304} +#define T_16384_3501 {0.2262047255706201853708847693269490264356,-0.9740797822198756783507178624859079718590} +#define T_16384_3519 {0.2194754011167903140488988356082700192928,-0.9756180340197817546510350439348258078098} +#define T_16384_3537 {0.2127356186543459259929989002557704225183,-0.9771097975948008773627861955901607871056} +#define T_16384_3553 {0.2067361809588436905027464263184810988605,-0.9783967249958230860684693652729038149118} +#define T_16384_3555 {0.2059856993340979380846533786098007112741,-0.9785550018623595525113501025771256536245} +#define T_16384_3573 {0.1992259647898788621489529759855940937996,-0.9799535779584367389816179638728499412537} +#define T_16384_3587 {0.1939618138197388708121593481337185949087,-0.9810090798661126321178471698658540844917} +#define T_16384_3591 {0.1924567371232168422334751767266425304115,-0.9813054592408446685780631923989858478308} +#define T_16384_3609 {0.1856783388879876262045343082718318328261,-0.9826105812924047500089841378212440758944} +#define T_16384_3621 {0.1811544714549908063894179122144123539329,-0.9834546545071932710513351594272535294294} +#define T_16384_3627 {0.1788910930750447492165733365254709497094,-0.9838688819240172245272901818680111318827} +#define T_16384_3645 {0.1720953230968290115221464020578423514962,-0.9850803011776237960717139685584697872400} +#define T_16384_3655 
{0.1683163312261948540982103850183193571866,-0.9857330331497234920945516023493837565184} +#define T_16384_3663 {0.1652913527719580000230337191169383004308,-0.9862447813290654563544990196533035486937} +#define T_16384_3681 {0.1584795063097959588738206093694316223264,-0.9873622668908323962710937848896719515324} +#define T_16384_3689 {0.1554495757308558268050546757876873016357,-0.9878438284491617427107712501310743391514} +#define T_16384_3699 {0.1516601082950053414055702205587294884026,-0.9884327046147083351002038398291915655136} +#define T_16384_3717 {0.1448334836720802099030436238535912707448,-0.9894560434943077131819677560997661203146} +#define T_16384_3723 {0.1425563924313273389543610392138361930847,-0.9897866815516186411727517224790062755346} +#define T_16384_3735 {0.1379999577298627877475212244462454691529,-0.9904322347675059701188615690625738352537} +#define T_16384_3753 {0.1311598560860432749475279479156597517431,-0.9913612319187634636108441554824821650982} +#define T_16384_3757 {0.1296389732829235919009391864165081642568,-0.9915612621548652860781203344231471419334} +#define T_16384_3771 {0.1243135046716442454473039447293558623642,-0.9922429906813416966571139710140414535999} +#define T_16384_3789 {0.1174612297154899870355038160596450325102,-0.9930774690394122972136869975656736642122} +#define T_16384_3791 {0.1166995143612676866240462913992814719677,-0.9931672685644872267118898889748379588127} +#define T_16384_3807 {0.1106033577286617414214831001118000131100,-0.9938646272300597495075180631829425692558} +#define T_16384_3825 {0.1037402154889393718351087159135204274207,-0.9946044277451756565611162841378245502710} +#define T_16384_3843 {0.0968721300252304712286033350210345815867,-0.9952968353332460882398891044431366026402} +#define T_16384_3859 {0.0907632798614856350916113569837762042880,-0.9958724953671457269521738453477155417204} +#define T_16384_3861 {0.0899994286019873551696335312044539023191,-0.9959418170010313486884001576981972903013} +#define T_16384_3879 
{0.0831224387036129247485760629388096276671,-0.9965393420151379411109360262344125658274} +#define T_16384_3893 {0.0777709136728579608632827557812561281025,-0.9969712558476743202717784697597380727530} +#define T_16384_3897 {0.0762414880188560656337770637946960050613,-0.9970893819034833960301966726547107100487} +#define T_16384_3915 {0.0693569044251972077752554923790739849210,-0.9975919104566526307564799935789778828621} +#define T_16384_3927 {0.0647653257403398852076747971295844763517,-0.9979005223877516206343329940864350646734} +#define T_16384_3933 {0.0624690159732249963853156771165231475607,-0.9980469037291468392680826582363806664944} +#define T_16384_3951 {0.0555781508710046848187324997070390963927,-0.9984543400405248014806147693889215588570} +#define T_16384_3961 {0.0517487271290284628344480211126210633665,-0.9986601370038384883898174848582129925489} +#define T_16384_3969 {0.0486846374684389432418996079832140821964,-0.9988141999764353906599012589140329509974} +#define T_16384_3987 {0.0417888042416220684138750129932304844260,-0.9991264663895433884022168058436363935471} +#define T_16384_3995 {0.0387233307759336231601565714299795217812,-0.9992499705547244204240087128710001707077} +#define T_16384_4005 {0.0348909797771880039740644008361414307728,-0.9993911244003460536688976389996241778135} +#define T_16384_4023 {0.0279914927566532467650972648698370903730,-0.9996081613978821112098671619605738669634} +#define T_16384_4029 {0.0256913511137592982802591023983040940948,-0.9996699227634837647826770989922806620598} +#define T_16384_4041 {0.0210906719407551214440221798440688871779,-0.9997775670403329373314704753283876925707} +#define T_16384_4059 {0.0141888461537863445199869616430987662170,-0.9998993332555153878971054837165866047144} +#define T_16384_4063 {0.0126550036944302422142749620093127305154,-0.9999199222345227511254961427766829729080} +#define T_16384_4077 {0.0072863442679265222681750380218090867857,-0.9999734542412659354937431999132968485355} +#define T_16384_4095 
{0.0003834951875713955632071772150482047437,-0.9999999264657178921211766464693937450647} +#define T_16384_4097 {-0.0003834951875713955632071772150482047437,-0.9999999264657178921211766464693937450647} +#define T_16384_4113 {-0.0065193721663394680818082171924743306590,-0.9999787486674688308241343293047975748777} +#define T_16384_4131 {-0.0134219288719957668964966046587505843490,-0.9999099218556415369363321588025428354740} +#define T_16384_4149 {-0.0203238460223895966993978845493984408677,-0.9997934493098352692896924054366536438465} +#define T_16384_4165 {-0.0264580807096771869479923111612151842564,-0.9996499237058742437511682510375976562500} +#define T_16384_4167 {-0.0272247947409878753022294262109426199459,-0.9996293365799701069462912528251763433218} +#define T_16384_4185 {-0.0341244461974033255757809968145011225715,-0.9994175914860217169177758478326722979546} +#define T_16384_4199 {-0.0394897344393841248644250185861892532557,-0.9992199762184035272127857751911506056786} +#define T_16384_4203 {-0.0410224716230632446833936910479678772390,-0.9991582241176494294521148731291759759188} +#define T_16384_4221 {-0.0479185423268753338255088181085739051923,-0.9988512468337151783970284668612293899059} +#define T_16384_4233 {-0.0525146745646032225840471596711722668260,-0.9986201524881088698037956419284455478191} +#define T_16384_4239 {-0.0548123297108898538398946698180225212127,-0.9984966742616946389077270396228414028883} +#define T_16384_4257 {-0.0617035052859573052153052685753209516406,-0.9980945232969800073874466761481016874313} +#define T_16384_4267 {-0.0655306867301933271274805292705423198640,-0.9978505544903351065499919059220701456070} +#define T_16384_4275 {-0.0685917406873809420986276563780847936869,-0.9976448131020754228615032843663357198238} +#define T_16384_4293 {-0.0754767076905634021199986705141782294959,-0.9971475651056834754726310166006442159414} +#define T_16384_4301 {-0.0785355580988454787805252976795600261539,-0.9969113130635557373793176338949706405401} 
+#define T_16384_4311 {-0.0823580782266465499796836979840009007603,-0.9966028030016841343652345130976755172014} +#define T_16384_4329 {-0.0892355243981440143796746156112931203097,-0.9960105527480058729139500428573228418827} +#define T_16384_4335 {-0.0915270777272848279348949063205509446561,-0.9958025878871291647342900432704482227564} +#define T_16384_4347 {-0.0961087184945655093049055039955419488251,-0.9953708425653889912965155417623464018106} +#define T_16384_4365 {-0.1029773330080322180091201289542368613183,-0.9946837029360402482325298478826880455017} +#define T_16384_4369 {-0.1045030369421505733740218602179083973169,-0.9945245674541517422184710994770284742117} +#define T_16384_4383 {-0.1098410406488826013271520309899642597884,-0.9939491666021811333564528467832133173943} +#define T_16384_4401 {-0.1166995143612676866240462913992814719677,-0.9931672685644872267118898889748379588127} +#define T_16384_4403 {-0.1174612297154899870355038160596450325102,-0.9930774690394122972136869975656736642122} +#define T_16384_4419 {-0.1235524273387353794140253171462973114103,-0.9923380460804204217239998797595035284758} +#define T_16384_4437 {-0.1303994530398027151818496349733322858810,-0.9914615386624537896054221164376940578222} +#define T_16384_4455 {-0.1372402652035155934395760368715855292976,-0.9905377880761887521288144853315316140652} +#define T_16384_4471 {-0.1433155073025715042778216457008966244757,-0.9896770510457472136423007214034441858530} +#define T_16384_4473 {-0.1440745378649951891070202236733166500926,-0.9895668383383651178775153312017209827900} +#define T_16384_4491 {-0.1509019453709700420152017841246561147273,-0.9885487357147632048182117614487651735544} +#define T_16384_4505 {-0.1562071966602159023285167904759873636067,-0.9877243095679869577807608038710895925760} +#define T_16384_4509 {-0.1577221623952936302437422000366495922208,-0.9874835287179997145301513228332623839378} +#define T_16384_4527 
{-0.1645348639544459978800716726254904642701,-0.9863712681052160258232675005274359136820} +#define T_16384_4539 {-0.1690723294114050145875438602161011658609,-0.9856036462125133956746481089794542640448} +#define T_16384_4545 {-0.1713397254230193123003544997118297033012,-0.9852120068756593518344288895605131983757} +#define T_16384_4563 {-0.1781364225491862962602596098804497160017,-0.9840058002681578708248366638144943863153} +#define T_16384_4573 {-0.1819087183696661835785590710656833834946,-0.9833154215108728113392544400994665920734} +#define T_16384_4581 {-0.1849246314701507853950346316196373663843,-0.9827527057584878322771260172885376960039} +#define T_16384_4599 {-0.1917040287275798005062199536041589453816,-0.9814527830566355248720356030389666557312} +#define T_16384_4607 {-0.1947141812352259915286367686348967254162,-0.9808600244815238733409046290034893900156} +#define T_16384_4617 {-0.1984742912830163852344611541411723010242,-0.9801060941039517748762932569661643356085} +#define T_16384_4635 {-0.2052350965332723486955757152827573008835,-0.9787127030702004182316500191518571227789} +#define T_16384_4641 {-0.2074865409660206494457668213726719841361,-0.9782378725637010941085236481740139424801} +#define T_16384_4653 {-0.2119861223258003302127860933978809043765,-0.9772726763505008573673649152624420821667} +#define T_16384_4671 {-0.2187270469740444367445775242231320589781,-0.9757860825621639255800232604087796062231} +#define T_16384_4675 {-0.2202236261478123791057726066355826333165,-0.9754494115464463765974301168171223253012} +#define T_16384_4689 {-0.2254575492727685370741852466380805708468,-0.9742529925414225022706204981659539043903} +#define T_16384_4707 {-0.2321773085133617131603500638448167592287,-0.9726734793400564349497017246903851628304} +#define T_16384_4709 {-0.2329232713633489770810314212212688289583,-0.9724951154928211938255344648496247828007} +#define T_16384_4725 {-0.2388860044991200637376493887131800875068,-0.9710476182219111018767421228403691202402} 
+#define T_16384_4743 {-0.2455833175605040552547109200531849637628,-0.9693754866593112806683052440348546952009} +#define T_16384_4761 {-0.2522689285703708095276454059785464778543,-0.9676571643293698787857692877878434956074} +#define T_16384_4777 {-0.2582016124193349249082984897540882229805,-0.9660910554104388259233360258804168552160} +#define T_16384_4779 {-0.2589425189591805231970056411228142678738,-0.9658927331101908597688066038244869560003} +#define T_16384_4797 {-0.2656037707301763250988813069852767512202,-0.9640822770769681415714558170293457806110} +#define T_16384_4811 {-0.2707760107179960074041957795998314395547,-0.9626423801285957093654133132076822221279} +#define T_16384_4815 {-0.2722523664745367111272855709103168919683,-0.9622258824979790237108545625233091413975} +#define T_16384_4833 {-0.2788879893865002235209260561532573774457,-0.9603236378304739195854722311196383088827} +#define T_16384_4845 {-0.2833043746974457377518774592317640781403,-0.9590300471191136599813376051315572112799} +#define T_16384_4851 {-0.2855103232784613198624867891339818015695,-0.9583756337164611727175156374869402498007} +#define T_16384_4869 {-0.2921190525960364325896989612374454736710,-0.9563819629783877340756248486286494880915} +#define T_16384_4879 {-0.2957845744248842612122984974121209233999,-0.9552546705105869895291448301577474921942} +#define T_16384_4887 {-0.2987138624331003855516541989345569163561,-0.9543427206147164776339764102885965257883} +#define T_16384_4905 {-0.3052944385467916688092770982621004804969,-0.9522580037953995990562816587043926119804} +#define T_16384_4913 {-0.3082144881558611104743761188728967681527,-0.9513168921504655539678196873865090310574} +#define T_16384_4923 {-0.3118604673724860165684447110834298655391,-0.9501279118572480975046801177086308598518} +#define T_16384_4941 {-0.3184116360387377886453919018094893544912,-0.9479525462991986728411575313657522201538} +#define T_16384_4947 
{-0.3205920026949903300916844273160677403212,-0.9472173814959348181474751982023008167744} +#define T_16384_4959 {-0.3249476323821884338194365682284114882350,-0.9457320107774771500430688320193439722061} +#define T_16384_4977 {-0.3314681449624408693388488700293237343431,-0.9434664111006593190111857438751030713320} +#define T_16384_4981 {-0.3329150137552126520468220860493602231145,-0.9429568355001021195960220211418345570564} +#define T_16384_4995 {-0.3379728630768997765798644650203641504049,-0.9411558552246291897702690221194643527269} +#define T_16384_5013 {-0.3444614767755765361023634341108845546842,-0.9388004532474347740844677900895476341248} +#define T_16384_5015 {-0.3451814263155426054652252787491306662560,-0.9385359784935085603052584701799787580967} +#define T_16384_5031 {-0.3509336768758583580130050449952250346541,-0.9364003174040420596213607495883479714394} +#define T_16384_5049 {-0.3573891549772409415020035794441355392337,-0.9339555620609867325754294142825528979301} +#define T_16384_5067 {-0.3638276034760234978193693677894771099091,-0.9314663037109250920408953788864891976118} +#define T_16384_5083 {-0.3695361243183507005127808042743708938360,-0.9292163649138840408880923860124312341213} +#define T_16384_5085 {-0.3702487155799664142463711868913378566504,-0.9289326609670828238662920739443507045507} +#define T_16384_5103 {-0.3766521853229096161719269275636179372668,-0.9263547545576028552360980938829015940428} +#define T_16384_5117 {-0.3816202692465373580432697053765878081322,-0.9243191927576751565354129525076132267714} +#define T_16384_5121 {-0.3830377075793520713631323815206997096539,-0.9237327073197931781578517984598875045776} +#define T_16384_5139 {-0.3894049780789909376288449038838734850287,-0.9210666441942736426540250249672681093216} +#define T_16384_5151 {-0.3936395353501729310607970546698197722435,-0.9192648781549852543548695393837988376617} +#define T_16384_5157 {-0.3957536934212200763205657949583837762475,-0.9183566922190217196586559111892711371183} 
+#define T_16384_5175 {-0.4020835510895869879810504698980366811156,-0.9156029805233202312209073170379269868135} +#define T_16384_5185 {-0.4055918792476038703398444340564310550690,-0.9140542803840464580744651357235852628946} +#define T_16384_5193 {-0.4083942494662080036071927224838873371482,-0.9128056403216034953018720443651545792818} +#define T_16384_5211 {-0.4146854878461400661926461452821968123317,-0.9099648049072056599229085804836358875036} +#define T_16384_5219 {-0.4174752689345443412705094488046597689390,-0.9086882852926133624649196462996769696474} +#define T_16384_5229 {-0.4209569664517094356170900937286205589771,-0.9070806096460084511079458025051280856133} +#define T_16384_5247 {-0.4272083864467963176814180314977420493960,-0.9041531919699917763111329804814886301756} +#define T_16384_5253 {-0.4292876841295346079796502181125106289983,-0.9031678051473607249377550942881498485804} +#define T_16384_5265 {-0.4334394499510740850212187069701030850410,-0.9011826913706845187945759789727162569761} +#define T_16384_5283 {-0.4396498600542034784766087796015199273825,-0.8981692493925180764691162949020508676767} +#define T_16384_5287 {-0.4410271166174072332566424847755115479231,-0.8974937784787903050087720657757017761469} +#define T_16384_5301 {-0.4458393208299802901173336522333556786180,-0.8951130096260817570197332315728999674320} +#define T_16384_5319 {-0.4520075373504364724119852780859218910336,-0.8920141177012804734047790589102078229189} +#define T_16384_5321 {-0.4526915705907009201958146604738431051373,-0.8916671699216722757341813121456652879715} +#define T_16384_5337 {-0.4581542156998931192291024672158528119326,-0.8888727212803956279074668600514996796846} +#define T_16384_5355 {-0.4642790629889658182705147737578954547644,-0.8856889700510489626950061392562929540873} +#define T_16384_5373 {-0.4703817873685207096023930262163048610091,-0.8824630157190701540415034287434536963701} +#define T_16384_5389 
{-0.4757876238359011189515967998886480927467,-0.8795601952138278889492539747152477502823} +#define T_16384_5391 {-0.4764620980435812502840065008058445528150,-0.8791950120012674840808131193625740706921} +#define T_16384_5409 {-0.4825197052871843528265571876545436680317,-0.8758851146181036995130853028967976570129} +#define T_16384_5423 {-0.4872152965742687635852803396119270473719,-0.8732818873559942129958244549925439059734} +#define T_16384_5427 {-0.4885543204541862305489985374151729047298,-0.8725334812862760580642884633562061935663} +#define T_16384_5445 {-0.4945656559950159514293943630036665126681,-0.8691402717112005626987070172617677599192} +#define T_16384_5457 {-0.4985601383985251433550445199216483160853,-0.8668551138454704263480721238011028617620} +#define T_16384_5463 {-0.5005534254693775331190863653318956494331,-0.8657056475794022709635555656859651207924} +#define T_16384_5481 {-0.5065173435598985252070747264951933175325,-0.8622297725508112353765000079874880611897} +#define T_16384_5491 {-0.5098202205851154467097785527585074305534,-0.8602809672906545079840157086437102407217} +#define T_16384_5499 {-0.5124571260857258003795777767663821578026,-0.8587128122509635197445732046617195010185} +#define T_16384_5517 {-0.5183724900160661119841165600519161671400,-0.8551549342631096228828369021357502788305} +#define T_16384_5525 {-0.5209936288203739218616306061448995023966,-0.8535605653546668447617662422999273985624} +#define T_16384_5535 {-0.5242631534836733608884173918340820819139,-0.8515563081202289774651603693200740963221} +#define T_16384_5553 {-0.5301288357982788523869999153248500078917,-0.8479171052969514121855354460421949625015} +#define T_16384_5559 {-0.5320784635259735440016015672881621867418,-0.8466950505653374481340733836987055838108} +#define T_16384_5571 {-0.5359692574599667080903486748866271227598,-0.8442374992013870205198600160656496882439} +#define T_16384_5589 {-0.5417841401724915462168041813129093497992,-0.8405176651668625487090480419283267110586} 
+#define T_16384_5593 {-0.5430728401818718520388529213960282504559,-0.8396855901209661077189139177789911627769} +#define T_16384_5607 {-0.5475732068565398735771054816723335534334,-0.8367577804435671895433301870070863515139} +#define T_16384_5625 {-0.5533361816629323026717202083091251552105,-0.8329580241901066717247203996521420776844} +#define T_16384_5627 {-0.5539748896466955008222043943533208221197,-0.8325333756918886773945587265188805758953} +#define T_16384_5643 {-0.5590727899857684812801039697660598903894,-0.8291185774649659778745558469381649047136} +#define T_16384_5661 {-0.5647827584755114038728152081603184342384,-0.8252396232178821344760422107356134802103} +#define T_16384_5679 {-0.5704658150520129922256273857783526182175,-0.8213213462811267406848969585553277283907} +#define T_16384_5695 {-0.5754946092349282338318516849540174007416,-0.8178055727014442721767295552126597613096} +#define T_16384_5697 {-0.5761216889174783917226818630297202616930,-0.8173639333606984580526955141976941376925} +#define T_16384_5715 {-0.5817501105693696494824962428538128733635,-0.8133675730274265713859449533629231154919} +#define T_16384_5729 {-0.5861086208154764332078912048018537461758,-0.8102324879969823312464427544909995049238} +#define T_16384_5733 {-0.5873508118132476640838035564229357987642,-0.8093324557079858427854901492537464946508} +#define T_16384_5751 {-0.5929235257755512966681976649852003902197,-0.8052587736758222147770425181079190224409} +#define T_16384_5763 {-0.5966229887412133336965780472382903099060,-0.8025216565959463199675383293651975691319} +#define T_16384_5769 {-0.5984679869163143095533996529411524534225,-0.8011467210419912499119732274266425520182} +#define T_16384_5787 {-0.6039839310418180229333984243567101657391,-0.7969964937459087517268585543206427246332} +#define T_16384_5797 {-0.6070359254764996492781392589677125215530,-0.7946743894079445480471690643753390759230} +#define T_16384_5805 
{-0.6094710953171802447769778154906816780567,-0.7928082895460141221732897065521683543921} +#define T_16384_5823 {-0.6149292182788795857462105232116300612688,-0.7885823080103472326740643438824918121099} +#define T_16384_5831 {-0.6173456607298968279451401031110435724258,-0.7866920205378767905202153087884653359652} +#define T_16384_5841 {-0.6203580398472138268672892991162370890379,-0.7843187505070389198280622622405644506216} +#define T_16384_5859 {-0.6257573013386928950652077219274360686541,-0.7800178201947159939422249408380594104528} +#define T_16384_5865 {-0.6275504417551315272660872324195224791765,-0.7785759070591249386694698841893114149570} +#define T_16384_5877 {-0.6311267454783653363392659230157732963562,-0.7756797220128206493683364897151477634907} +#define T_16384_5895 {-0.6364661164120771763563766398874577134848,-0.7713046626718447207338158477796241641045} +#define T_16384_5899 {-0.6376485336490788080610059296304825693369,-0.7703274287828388855459138540027197450399} +#define T_16384_5913 {-0.6417751597186635015290789851860608905554,-0.7668928506434806724456620941055007278919} +#define T_16384_5931 {-0.6470536224220716503552353060513269156218,-0.7624444961506871010215036221779882907867} +#define T_16384_5933 {-0.6476382196467103113945995573885738849640,-0.7619479880233553936719204102701041847467} +#define T_16384_5949 {-0.6523012530034154599078988212568219751120,-0.7579598111576723029614299775857944041491} +#define T_16384_5967 {-0.6575178014129601233861421860638074576855,-0.7534390093597935766922546463320031762123} +#define T_16384_5985 {-0.6627030190820374366822420597600284963846,-0.7488823061733751451640728191705420613289} +#define T_16384_6001 {-0.6672855993314564804208544046559836715460,-0.7448019393938626331319596829416695982218} +#define T_16384_6003 {-0.6678566589348894355637753506016451865435,-0.7442899187254431447868796567490790039301} +#define T_16384_6021 {-0.6729784754004420888051640758931171149015,-0.7396620658433800121756007683870848268270} 
+#define T_16384_6035 {-0.6769399527900711310834935829916503280401,-0.7360382465039274624274412417435087263584} +#define T_16384_6039 {-0.6780682244240066047780146618606522679329,-0.7349989680444966033689979667542502284050} +#define T_16384_6057 {-0.6831256634789086845671590708661824464798,-0.7303008475255254872138266364345327019691} +#define T_16384_6069 {-0.6864792204632389482910070910293143242598,-0.7271494205953710210366125465952791273594} +#define T_16384_6075 {-0.6881505515780448334339780558366328477859,-0.7255679281520323042897757659375201910734} +#define T_16384_6093 {-0.6931426492853653975600991543615236878395,-0.7208004354477492992003817562363110482693} +#define T_16384_6103 {-0.6959017805909968323874181805877014994621,-0.7181369728472974944821771714487113058567} +#define T_16384_6111 {-0.6981017187272838819822595723962876945734,-0.7159985965838286947615642930031754076481} +#define T_16384_6129 {-0.7030275236040113284730068698991090059280,-0.7111626403680183505784384578873869031668} +#define T_16384_6137 {-0.7052060312546978293468669107824098318815,-0.7090024354556182517583806657057721167803} +#define T_16384_6147 {-0.7079198292008163084787497609795536845922,-0.7062927972337584847650759911630302667618} +#define T_16384_6165 {-0.7127784023992089768029245533398352563381,-0.7013892992292022343647772686381358653307} +#define T_16384_6171 {-0.7143903906493513922271176852518692612648,-0.6997473613725649910222159633121918886900} +#define T_16384_6183 {-0.7176030116880490750119747644930612295866,-0.6964523800061578340248047425120603293180} +#define T_16384_6201 {-0.7223934271745775514972365272114984691143,-0.6914822748089558546169541841663885861635} +#define T_16384_6205 {-0.7234532973525443777518262322701048105955,-0.6903733240426740414008577317872550338507} +#define T_16384_6219 {-0.7271494205953710210366125465952791273594,-0.6864792204632389482910070910293143242598} +#define T_16384_6237 
{-0.7318707653272182866999173711519688367844,-0.6814434553646778747193479830457363277674} +#define T_16384_6239 {-0.7323932105898960376322293086559511721134,-0.6808819171352872290370328300923574715853} +#define T_16384_6255 {-0.7365572363979191461424989029183052480221,-0.6763752194676116991089998009556438773870} +#define T_16384_6273 {-0.7412086104970042610418090589519124478102,-0.6712747542736134942487069565686397254467} +#define T_16384_6291 {-0.7458246659863759786546211216773372143507,-0.6661423028199835449925103603163734078407} +#define T_16384_6307 {-0.7498979983778352176315706856257747858763,-0.6615534687603989993931463686749339103699} +#define T_16384_6309 {-0.7504051829108693283387765404768288135529,-0.6609781096681680567783701008011121302843} +#define T_16384_6327 {-0.7549499430087326379279488719475921243429,-0.6557824208921061437393973392318002879620} +#define T_16384_6341 {-0.7584598969595154338563247620186302810907,-0.6517197133002509135124569183972198516130} +#define T_16384_6345 {-0.7594587297220282140486347088881302624941,-0.6505554840665039861846707935910671949387} +#define T_16384_6363 {-0.7639313282069510879779272727319039404392,-0.6452975482550383778956870628462638705969} +#define T_16384_6375 {-0.7668928506434806724456620941055007278919,-0.6417751597186635015290789851860608905554} +#define T_16384_6381 {-0.7683675253440662711312825194909237325191,-0.6400088639984884419931177035323344171047} +#define T_16384_6399 {-0.7727671097484638540464629841153509914875,-0.6346896833027977358199223090196028351784} +#define T_16384_6409 {-0.7751954257529413139238272378861438483000,-0.6317214986777923702021553253871388733387} +#define T_16384_6417 {-0.7771298717798316157967519757221452891827,-0.6293402596270657456400954288255888968706} +#define T_16384_6435 {-0.7814556035524445887219258111144881695509,-0.6239608478714706585321891907369717955589} +#define T_16384_6443 {-0.7833662107766197202352032036287710070610,-0.6215604393890271639477873577561695128679} 
+#define T_16384_6453 {-0.7857440989450703572316569989197887480259,-0.6185517043651238555668214758043177425861} +#define T_16384_6471 {-0.7899951536107910898820705369871575385332,-0.6131130868538549050228425585373770445585} +#define T_16384_6477 {-0.7914038166087195014242183788155671209097,-0.6112937093224108942024486168520525097847} +#define T_16384_6489 {-0.7942085649867406393909163853095378726721,-0.6076452544879308304004439378331881016493} +#define T_16384_6507 {-0.7983841323037563775244507269235327839851,-0.6021484678097072107405551832925993949175} +#define T_16384_6511 {-0.7993068767850861622292768515762872993946,-0.6009230539129540860088241061021108180285} +#define T_16384_6525 {-0.8025216565959463199675383293651975691319,-0.5966229887412133336965780472382903099060} +#define T_16384_6543 {-0.8066209407101696537978341439156793057919,-0.5910690805716714013584578424342907965183} +#define T_16384_6545 {-0.8070740477155176062495911537553183734417,-0.5904502362638958112839304703811649233103} +#define T_16384_6561 {-0.8106817893154306675640441426367033272982,-0.5854870079449514541636290232418105006218} +#define T_16384_6579 {-0.8147040089121870831689875558367930352688,-0.5798770368469603475602980324765667319298} +#define T_16384_6597 {-0.8186874078415696809329915595299098640680,-0.5742394345929678900475323644059244543314} +#define T_16384_6613 {-0.8221954632141371677178653953887987881899,-0.5692052531996611985931622257339768111706} +#define T_16384_6615 {-0.8226317962945149941944578131369780749083,-0.5685744698148691433914336812449619174004} +#define T_16384_6633 {-0.8265369863208099632245762222737539559603,-0.5628824124483844393296294583706185221672} +#define T_16384_6647 {-0.8295471370078089101696150464704260230064,-0.5584366996197041022043094926630146801472} +#define T_16384_6651 {-0.8304027918380475492554637639841530472040,-0.5571635337201963356079659206443466246128} +#define T_16384_6669 
{-0.8342290286404934196440308369346894323826,-0.5514181061350260648623589077033102512360} +#define T_16384_6681 {-0.8367577804435671895433301870070863515139,-0.5475732068565398735771054816723335534334} +#define T_16384_6687 {-0.8380155144078637041715751365700270980597,-0.5456464034626485881673829680948983877897} +#define T_16384_6705 {-0.8417620687140124902114735050417948514223,-0.5398487007248475855192282324424013495445} +#define T_16384_6715 {-0.8438261676481867423760263591248076409101,-0.5366166218001211518640047870576381683350} +#define T_16384_6723 {-0.8454685130355288347203668308793567121029,-0.5340252741823103832530250656418502330780} +#define T_16384_6741 {-0.8491346707602436261197453859494999051094,-0.5281764013214643727067709733091760426760} +#define T_16384_6749 {-0.8507510969332607864146211795741692185402,-0.5255688071669146754771873020217753946781} +#define T_16384_6759 {-0.8527603671956452968672124370641540735960,-0.5223023608412546980872548374463804066181} +#define T_16384_6777 {-0.8563454295772036095613088946265634149313,-0.5164034326398639906940957189362961798906} +#define T_16384_6783 {-0.8575313909994991545815423705789726227522,-0.5144316411832229318790155048191081732512} +#define T_16384_6795 {-0.8598896870766022937360162359254900366068,-0.5104798978013757038141307020850945264101} +#define T_16384_6813 {-0.8633929708098784239211909152800217270851,-0.5045320385823802711300345436029601842165} +#define T_16384_6817 {-0.8641658971368793018541509809438139200211,-0.5032070172658690276890069981163833290339} +#define T_16384_6831 {-0.8668551138454704263480721238011028617620,-0.4985601383985251433550445199216483160853} +#define T_16384_6849 {-0.8702759512121719387423013358784373849630,-0.4925644818110106459840835668728686869144} +#define T_16384_6851 {-0.8706534874206174334787533553026150912046,-0.4918968437002992910578313967562280595303} +#define T_16384_6867 {-0.8736553199069926334274782675493042916059,-0.4865453545130302703825009302818216383457} 
+#define T_16384_6885 {-0.8769930589029258927169507842336315661669,-0.4805030433161575076361771152733126655221} +#define T_16384_6903 {-0.8802890091566208941031845824909396469593,-0.4744378361366792806741443655482726171613} +#define T_16384_6919 {-0.8831835338005233904112856180290691554546,-0.4690275531603871894681390131154330447316} +#define T_16384_6921 {-0.8835430136159618808022742086905054748058,-0.4683500219818765297752349852089537307620} +#define T_16384_6939 {-0.8867549172275508428953116890625096857548,-0.4622398909362533392375382845784770324826} +#define T_16384_6953 {-0.8892238596778682113708214274083729833364,-0.4574723241679160556927286052086856216192} +#define T_16384_6957 {-0.8899245669440967176910817215684801340103,-0.4561077341477141056103050686942879110575} +#define T_16384_6975 {-0.8930518117317074455741021665744483470917,-0.4499538438136905238451390687259845435619} +#define T_16384_6987 {-0.8951130096260817570197332315728999674320,-0.4458393208299802901173336522333556786180} +#define T_16384_6993 {-0.8961365025770867687171516990929376333952,-0.4437785131672182803441728538018651306629} +#define T_16384_7011 {-0.8991784924946353285690747725311666727066,-0.4375820364629644521059503858850803226233} +#define T_16384_7021 {-0.9008499824375314490865207517344970256090,-0.4341305208601433651693923820857889950275} +#define T_16384_7029 {-0.9021776365334536196272097186010796576738,-0.4313647089632063891251334553089691326022} +#define T_16384_7047 {-0.9051337917842496860743040087982080876827,-0.4251268269237623576017881532607134431601} +#define T_16384_7055 {-0.9064338027760454608028339862357825040817,-0.4223479148580670528012603881506947800517} +#define T_16384_7065 {-0.9080468173861483416331452644953969866037,-0.4188686875798751096944272376276785507798} +#define T_16384_7083 {-0.9109165745334033559288400283548980951309,-0.4125905891320482687945059296907857060432} +#define T_16384_7089 
{-0.9118635213427285224341289904259610921144,-0.4104935059710924094567019437818089500070} +#define T_16384_7101 {-0.9137429264820113861134132093866355717182,-0.4062928307318374177015130044310353696346} +#define T_16384_7119 {-0.9165257385562282088642405142309144139290,-0.3999757124675953345871448618709109723568} +#define T_16384_7123 {-0.9171382150373507124996308448316995054483,-0.3985693095536863017613882220757659524679} +#define T_16384_7137 {-0.9192648781549852543548695393837988376617,-0.3936395353501729310607970546698197722435} +#define T_16384_7155 {-0.9219602147582092177557910872565116733313,-0.3872846012985757813140708094579167664051} +#define T_16384_7157 {-0.9222569871152830334182226579287089407444,-0.3865773528248139245810932607128052040935} +#define T_16384_7173 {-0.9246116199330399743061548178957309573889,-0.3809112131255780742122851734166033565998} +#define T_16384_7191 {-0.9272189673399517939600400495692156255245,-0.3745196745232932111768775484961224719882} +#define T_16384_7209 {-0.9297821327387721934698561199184041470289,-0.3681102900487030482601369385520229116082} +#define T_16384_7225 {-0.9320233121307864854188096614961978048086,-0.3623983245611913650563451483321841806173} +#define T_16384_7227 {-0.9323009939946027557766683457884937524796,-0.3616833651091458379234211406583199277520} +#define T_16384_7245 {-0.9347754310836386970606781687820330262184,-0.3552392059477633146080677306599682196975} +#define T_16384_7259 {-0.9366692047066361714868776289222296327353,-0.3502153636753216336963134835968958213925} +#define T_16384_7263 {-0.9372053260988879586079747241456061601639,-0.3487781196289084229000820869259769096971} +#define T_16384_7281 {-0.9395905632557891573597430578956846147776,-0.3423004140235135217551487585296854376793} +#define T_16384_7293 {-0.9411558552246291897702690221194643527269,-0.3379728630768997765798644650203641504049} +#define T_16384_7299 {-0.9419310288977295053669536173401866108179,-0.3358063977944205080738981905597029253840} 
+#define T_16384_7317 {-0.9442266115014598115706689895887393504381,-0.3292963803816728041162775753036839887500} +#define T_16384_7327 {-0.9454825009144537384386808298586402088404,-0.3256729040994197910130480977386469021440} +#define T_16384_7335 {-0.9464772016824086753317146758490707725286,-0.3227706719877707630672603045240975916386} +#define T_16384_7353 {-0.9486826921998950945535966638999525457621,-0.3162295835628903262204403290525078773499} +#define T_16384_7361 {-0.9496484062080354782153790438314899802208,-0.3133175778448090142980220207391539588571} +#define T_16384_7371 {-0.9508429779622381561310362485528457909822,-0.3096734267900664305805946696636965498328} +#define T_16384_7389 {-0.9529579560317646969025418002274818718433,-0.3031025140703410558629116167139727622271} +#define T_16384_7395 {-0.9536528628645905003580196535040158778429,-0.3009089848279218881721419620589585974813} +#define T_16384_7407 {-0.9550275256297141579509002440317999571562,-0.2965171585078774096899678625050000846386} +#define T_16384_7425 {-0.9570515881410409653184956368932034820318,-0.2899176738950407505868156476935837417841} +#define T_16384_7429 {-0.9574951900910325663929256734263617545366,-0.2884492346194342249354747309553204104304} +#define T_16384_7443 {-0.9590300471191136599813376051315572112799,-0.2833043746974457377518774592317640781403} +#define T_16384_7461 {-0.9609628082903097778810774798330385237932,-0.2766775760389724170273950676346430554986} +#define T_16384_7463 {-0.9611747346577140804768646376032847911119,-0.2759404454871972056650974991498515009880} +#define T_16384_7479 {-0.9628497795585090335279687678848858922720,-0.2700375936867505655136767472868086770177} +#define T_16384_7497 {-0.9646908710094810324164882331388071179390,-0.2633847440361132830055623799125896766782} +#define T_16384_7515 {-0.9664859949151698437219693005317822098732,-0.2567193440955207184295261413353728130460} +#define T_16384_7531 
{-0.9680430013720222603978982078842818737030,-0.2507842648465944956903683760174317285419} +#define T_16384_7533 {-0.9682350657378743230552231580077204853296,-0.2500417114714546529263827778777340427041} +#define T_16384_7551 {-0.9699380001343239632305426312086638063192,-0.2433521643532847444912903256408753804862} +#define T_16384_7565 {-0.9712305558534973792816913373826537281275,-0.2381411501116648399278830083858338184655} +#define T_16384_7569 {-0.9715947169596501620247863684198819100857,-0.2366510214981063786598980414055404253304} +#define T_16384_7587 {-0.9732051372712527959052408732532057911158,-0.2299386022155522346643863329518353566527} +#define T_16384_7599 {-0.9742529925414225022706204981659539043903,-0.2254575492727685370741852466380805708468} +#define T_16384_7605 {-0.9747691843325617666593529975216370075941,-0.2232152263525770119478863762196851894259} +#define T_16384_7623 {-0.9762867836166936319486353568208869546652,-0.2164812142782167603272114320134278386831} +#define T_16384_7633 {-0.9771097975948008773627861955901607871056,-0.2127356186543459259929989002557704225183} +#define T_16384_7641 {-0.9777578628100027646752323562395758926868,-0.2097368868683233134042609435709891840816} +#define T_16384_7659 {-0.9791823518155269301388443636824376881123,-0.2029825654902744636043365744626498781145} +#define T_16384_7667 {-0.9798004853314797868435448435775469988585,-0.1999775210972391803476710947506944648921} +#define T_16384_7677 {-0.9805601827563278360955223433848004788160,-0.1962185719876608780420923494602902792394} +#define T_16384_7695 {-0.9818912899787251014060984743991866707802,-0.1894452286649502270599043640686431899667} +#define T_16384_7701 {-0.9823245983107212753182579945132602006197,-0.1871854255909903286259066135244211181998} +#define T_16384_7713 {-0.9831756100554244204303699916636105626822,-0.1826628582721293025947773003281326964498} +#define T_16384_7731 {-0.9844130817885407003231534872611518949270,-0.1758717839893250423077830646434449590743} 
+#define T_16384_7735 {-0.9846817074109709411189328420732636004686,-0.1743615069050937771422127298137638717890} +#define T_16384_7749 {-0.9856036462125133956746481089794542640448,-0.1690723294114050145875438602161011658609} +#define T_16384_7767 {-0.9867472465969164829857618315145373344421,-0.1622648185325580305615744691749569028616} +#define T_16384_7769 {-0.9868714119028124676091806577460374683142,-0.1615079452192661468856726969534065574408} +#define T_16384_7785 {-0.9878438284491617427107712501310743391514,-0.1554495757308558268050546757876873016357} +#define T_16384_7803 {-0.9888933395170951268227099717478267848492,-0.1486269257527965403919978371050092391670} +#define T_16384_7821 {-0.9898957297914866559906954535108525305986,-0.1417971936978303926224498354713432490826} +#define T_16384_7837 {-0.9907471465082227091158983967034146189690,-0.1357206383930399384851739341684151440859} +#define T_16384_7839 {-0.9908509515084136243245893638231791555882,-0.1349607050028687749154698849451960995793} +#define T_16384_7857 {-0.9917589591515361124862693031900562345982,-0.1281177854267771254459518104340531863272} +#define T_16384_7871 {-0.9924325177125936647826165426522493362427,-0.1227912773231167736787128319519979413599} +#define T_16384_7875 {-0.9926197094542661414351414350676350295544,-0.1212687610348525957260079621846671216190} +#define T_16384_7893 {-0.9934331614018293565848694015585351735353,-0.1144139581832869234645500000624451786280} +#define T_16384_7905 {-0.9939491666021811333564528467832133173943,-0.1098410406488826013271520309899642597884} +#define T_16384_7911 {-0.9941992762332189093044121364073362201452,-0.1075537035036156358058789805909327697009} +#define T_16384_7929 {-0.9949180174430432010979075130308046936989,-0.1006883238871539576475555577417253516614} +#define T_16384_7939 {-0.9952968353332460882398891044431366026402,-0.0968721300252304712286033350210345815867} +#define T_16384_7947 
{-0.9955893507832646038835378021758515387774,-0.0938181464694205491383627304458059370518} +#define T_16384_7965 {-0.9962132442648320429512409646122250705957,-0.0869434986145493776676929087443568278104} +#define T_16384_7973 {-0.9964752947901721613632730623066890984774,-0.0838867502817902122025373046199092641473} +#define T_16384_7983 {-0.9967896681592045560194037534529343247414,-0.0800647078996908900805351549934130162001} +#define T_16384_8001 {-0.9973185949997686039480981889937538653612,-0.0731821020994028875739090267416031565517} +#define T_16384_8007 {-0.9974843446244179290260944981127977371216,-0.0708871090480878152106924972031265497208} +#define T_16384_8019 {-0.9977999995831464685736023056961130350828,-0.0662960091700321302310427995507779996842} +#define T_16384_8037 {-0.9982338589703968478872297964699100703001,-0.0594067572340871499769754393582843476906} +#define T_16384_8041 {-0.9983238132885775550917628606839571148157,-0.0578754163782288638673279024260409642011} +#define T_16384_8055 {-0.9986201524881088698037956419284455478191,-0.0525146745646032225840471596711722668260} +#define T_16384_8073 {-0.9989588617293860828283413866301998496056,-0.0456200895695001509833055308718030573800} +#define T_16384_8075 {-0.9989935580655456837106953571492340415716,-0.0448538843751698154682649999358545755967} +#define T_16384_8091 {-0.9992499705547244204240087128710001707077,-0.0387233307759336231601565714299795217812} +#define T_16384_8109 {-0.9994934650927805863673825115256477147341,-0.0318247268146408870959085390950349392369} +#define T_16384_8127 {-0.9996893337410336366488650128303561359644,-0.0249246064042814678696213803732462110929} +#define T_16384_8143 {-0.9998234493816615664485425440943799912930,-0.0187901587687845580965095848569035297260} +#define T_16384_8145 {-0.9998375671663370933828218767303042113781,-0.0180232983357737422402955473899055505171} +#define T_16384_8163 {-0.9999381583053645883296667307149618864059,-0.0111211314566280212307036734387111209799} 
+#define T_16384_8177 {-0.9999834548319377347525005461648106575012,-0.0057523962295737366551273694881274423096} +#define T_16384_8181 {-0.9999911023649455943385078171559143811464,-0.0042184346552769638391544582134429219877} +#define T_16384_8199 {-0.9999963968222943533348257005854975432158,0.0026844631545959616800156588567460858030} +#define T_16384_8211 {-0.9999734542412659354937431999132968485355,0.0072863442679265222681750380218090867857} +#define T_16384_8217 {-0.9999540414251297804071327846031635999680,0.0095872330497292247708518431181801133789} +#define T_16384_8235 {-0.9998640381916876762602441885974258184433,0.0164895461129564366298261290921800537035} +#define T_16384_8245 {-0.9997934493098352692896924054366536438465,0.0203238460223895966993978845493984408677} +#define T_16384_8253 {-0.9997263914106244708790427466738037765026,0.0233910734488792584895300308289733948186} +#define T_16384_8271 {-0.9995411076408129424919479788513854146004,0.0302914861995392838134311119802077882923} +#define T_16384_8279 {-0.9994434706400777690404879649577196687460,0.0333578925430861455980746654859103728086} +#define T_16384_8289 {-0.9993081957110294677448791844653896987438,0.0371904555600881189802997539572970708832} +#define T_16384_8307 {-0.9990276667195336912641323579009622335434,0.0440876527944549442827870677774626528844} +#define T_16384_8313 {-0.9989235777314657838132916367612779140472,0.0463862679267071573163683240181853761896} +#define T_16384_8325 {-0.9986995340335392823405413764703553169966,0.0509827492510108032375271136515948455781} +#define T_16384_8343 {-0.9983238132885775550917628606839571148157,0.0578754163782288638673279024260409642011} +#define T_16384_8347 {-0.9982338589703968478872297964699100703001,0.0594067572340871499769754393582843476906} +#define T_16384_8361 {-0.9979005223877516206343329940864350646734,0.0647653257403398852076747971295844763517} +#define T_16384_8379 {-0.9974296815008841798899652530963066965342,0.0716521490329822124953906836708483751863} 
+#define T_16384_8381 {-0.9973744316151671451819993308163248002529,0.0724171468667634127269749910738028120250} +#define T_16384_8397 {-0.9969113130635557373793176338949706405401,0.0785355580988454787805252976795600261539} +#define T_16384_8415 {-0.9963454417760359005384884767408948391676,0.0854152249433073329498711245832964777946} +#define T_16384_8433 {-0.9957320946021064322906113375211134552956,0.0922908217500623545559435001450765412301} +#define T_16384_8449 {-0.9951470643903864710111406566284131258726,0.0983987816753638944167192903478280641139} +#define T_16384_8451 {-0.9950713007677761678948513690556865185499,0.0991620208967425031953979441823321394622} +#define T_16384_8469 {-0.9943630917598885732289204497647006064653,0.1060284949705284085474943367444211617112} +#define T_16384_8483 {-0.9937795031929845812612711597466841340065,0.1113656097433351616077246148961421567947} +#define T_16384_8487 {-0.9936075013246216114382036721508484333754,0.1128899167837505157496735819222521968186} +#define T_16384_8505 {-0.9928045654658791407953799534880090504885,0.1197459593894796003876734857840347103775} +#define T_16384_8517 {-0.9922429906813416966571139710140414535999,0.1243135046716442454473039447293558623642} +#define T_16384_8523 {-0.9919543224435759531942835565132554620504,0.1265962960971058759440666108275763690472} +#define T_16384_8541 {-0.9910568127718143438542597323248628526926,0.1334406004879056806267101364937843754888} +#define T_16384_8551 {-0.9905377880761887521288144853315316140652,0.1372402652035155934395760368715855292976} +#define T_16384_8559 {-0.9901120792169537665472489607054740190506,0.1402785464305954243879881460088654421270} +#define T_16384_8577 {-0.9891201667955726861691800877451896667480,0.1471098080968717969341241769143380224705} +#define T_16384_8585 {-0.9886641852770662319827010833250824362040,0.1501436936752081896528920879063662141562} +#define T_16384_8595 {-0.9880811227723240719456043734680861234665,0.1539340599769373763017199507885379716754} 
+#define T_16384_8613 {-0.9869949966576829769593359742430038750172,0.1607509768950112216678860477259149774909} +#define T_16384_8619 {-0.9866225008130384832583104071090929210186,0.1630215963896378406072784628122462891042} +#define T_16384_8631 {-0.9858618402055869811562160975881852209568,0.1675602340248235899711204410778009332716} +#define T_16384_8649 {-0.9846817074109709411189328420732636004686,0.1743615069050937771422127298137638717890} +#define T_16384_8653 {-0.9844130817885407003231534872611518949270,0.1758717839893250423077830646434449590743} +#define T_16384_8667 {-0.9834546545071932710513351594272535294294,0.1811544714549908063894179122144123539329} +#define T_16384_8685 {-0.9821807399633570856423148143221624195576,0.1879388039895759088260263069969369098544} +#define T_16384_8687 {-0.9820363038243690168016541974793653935194,0.1886920718286052289780485580195090733469} +#define T_16384_8703 {-0.9808600244815238733409046290034893900156,0.1947141812352259915286367686348967254162} +#define T_16384_8721 {-0.9794925709938208102656176379241514950991,0.2014802803450377599592258093252894468606} +#define T_16384_8739 {-0.9780784446594423808818419274757616221905,0.2082367789142113290168367711885366588831} +#define T_16384_8757 {-0.9766177128615456393134763857233338057995,0.2149833549954128508741035830098553560674} +#define T_16384_8775 {-0.9751104452040388892442024371121078729630,0.2217196871141152159090381701389560475945} +#define T_16384_8793 {-0.9735567135082655587652311623969580978155,0.2284454542839164659095985143721918575466} +#define T_16384_8811 {-0.9719565918095817158572913285752292722464,0.2351603360218347260612858917738776654005} +#define T_16384_8829 {-0.9703101563538281126852780289482325315475,0.2418640123635792116285614383741631172597} +#define T_16384_8847 {-0.9686174855936975358616791709209792315960,0.2485561638787965599295404217627947218716} +#define T_16384_8865 {-0.9668786601849959083665453363209962844849,0.2552364716862917104478469809691887348890} 
+#define T_16384_8883 {-0.9650937629827995856146571895806118845940,0.2619046174692226114366633282770635560155} +#define T_16384_8901 {-0.9632628790375070693130510335322469472885,0.2685602834902678925921293284773128107190} +#define T_16384_8919 {-0.9613860955907862493319271379732526838779,0.2752031526067673095070631461567245423794} +#define T_16384_8937 {-0.9594635020714175066558482285472564399242,0.2818329082858333500816172545455629006028} +#define T_16384_8955 {-0.9574951900910325663929256734263617545366,0.2884492346194342249354747309553204104304} +#define T_16384_8973 {-0.9554812534397487677750859802472405135632,0.2950518163394467152649269792163977399468} +#define T_16384_8991 {-0.9534217880817003054616520785202737897635,0.3016403388326788226336816478578839451075} +#define T_16384_9009 {-0.9513168921504655539678196873865090310574,0.3082144881558611104743761188728967681527} +#define T_16384_9027 {-0.9491666659443906972626336937537416815758,0.3147739510506060711669817919755587354302} +#define T_16384_9045 {-0.9469712119218108847817916284839157015085,0.3213184149583349080714356205135118216276} +#define T_16384_9063 {-0.9447306346961678036322496154753025621176,0.3278475680351708443360791989107383415103} +#define T_16384_9081 {-0.9424450410310248882339578813116531819105,0.3343610991667987364373004766093799844384} +#define T_16384_9099 {-0.9401145398349802784210282879939768463373,0.3408586979832894381381436232913983985782} +#define T_16384_9117 {-0.9377392421564769708908215761766768991947,0.3473400548738891369104919704113854095340} +#define T_16384_9135 {-0.9353192611785116072908863316115457564592,0.3538048610017720529974383225635392591357} +#define T_16384_9153 {-0.9328547122132411217876324371900409460068,0.3602528083187568896939012574875960126519} +#define T_16384_9171 {-0.9303457126964884693620660982560366392136,0.3666835895799849254217406269162893295288} +#define T_16384_9189 {-0.9277923821821463246095618160325102508068,0.3730968983585606357777919583895709365606} 
+#define T_16384_9207 {-0.9251948423364805274005107094126287847757,0.3794924290601526251087705077225109562278} +// Pre-computed twiddles for N=19683 +#define T_19683_1 {0.9999999490496522813742785729118622839451,-0.0003192188792597382249356663397321653974} +#define T_19683_2 {0.9999997961986142325230275673675350844860,-0.0006384377259908506066152233948685079667} +#define T_19683_4 {0.9999991847945400857966546936950180679560,-0.0012768751917527146333419807433529058471} +#define T_19683_5 {0.9999987262415661604109118343330919742584,-0.0015960937457262446784872489402573592088} +#define T_19683_7 {0.9999975034339785295856017910409718751907,-0.0022345303332155399926584049552502619918} +#define T_19683_8 {0.9999967391794893911693975496746134012938,-0.0025537483016741732710919166038365801796} +#define T_19683_10 {0.9999949049695097125223242073843721300364,-0.0031921834253767497960319055039235536242} +#define T_19683_11 {0.9999938350142061338488019828218966722488,-0.0035114005155637093708032026029286498670} +#define T_19683_13 {0.9999913894035166173068773787235841155052,-0.0041498335899667580795036059271296835504} +#define T_19683_14 {0.9999900137483799245075033468310721218586,-0.0044690495091260730095572561992867122171} +#define T_19683_16 {0.9999869567392234426250752221676521003246,-0.0051074799487186650984171798484112514416} +#define T_19683_17 {0.9999852753855151821227309483219869434834,-0.0054266944040954369354867736774394870736} +#define T_19683_19 {0.9999816069806952700815827483893372118473,-0.0060651216233690600707073592445794929517} +#define T_19683_20 {0.9999796199299575416574725750251673161983,-0.0063843343222097339098586310512928321259} +#define T_19683_22 {0.9999753401328386193114283742033876478672,-0.0070227577356588291243588706436185020721} +#define T_19683_23 {0.9999730473868936320158695707505103200674,-0.0073419683852114607514049993142180028372} +#define T_19683_25 {0.9999681562014006708238866849569603800774,-0.0079803874073339584743758123863699438516} 
+#define T_19683_26 {0.9999655577623511870655192979029379785061,-0.0082995957148484841153779356659470067825} +#define T_19683_28 {0.9999600551929699321362932096235454082489,-0.0089380097601463436712831978070425975602} +#define T_19683_29 {0.9999571510631989346151726749667432159185,-0.0092572154328748445378804987626608635765} +#define T_19683_31 {0.9999510371149759047071370332560036331415,-0.0098956239158545893086493805412828805856} +#define T_19683_32 {0.9999478272971468184593391015368979424238,-0.0102148266610515639496448159206920536235} +#define T_19683_34 {0.9999411019756890839360607969865668565035,-0.0108532289962248191389493356950879388023} +#define T_19683_35 {0.9999375864727457763336815332877449691296,-0.0111724285211474531898101503202269668691} +#define T_19683_37 {0.9999302497842210701861631605424918234348,-0.0118108241230314783831722991180868120864} +#define T_19683_38 {0.9999264285993875178704115569416899234056,-0.0121300201349399117134453263133764266968} +#define T_19683_40 {0.9999184805505244577616963397304061800241,-0.0127684084180581438466850485724535246845} +#define T_19683_41 {0.9999143536873050797098017028474714607000,-0.0130876006242157342379650586394745914731} +#define T_19683_43 {0.9999057942853929459303685689519625157118,-0.0137259810030983253614778050177847035229} +#define T_19683_44 {0.9999013617475724924332780574331991374493,-0.0140451691107719208589932335939920449164} +#define T_19683_46 {0.9998921910004611168787391761725302785635,-0.0146835409999562689631336098727842909284} +#define T_19683_47 {0.9998874527921046695411178006906993687153,-0.0150027247164164802273322862902205088176} +#define T_19683_49 {0.9998776707082046577568235079525038599968,-0.0156410875304477670066916061841766349971} +#define T_19683_50 {0.9998726268336577405193565937224775552750,-0.0159602665629692309912091019441504613496} +#define T_19683_52 {0.9998622334219401386334880044159945100546,-0.0165986197164009648130633678420053911395} +#define T_19683_53 
{0.9998568838858284957282762661634478718042,-0.0169177937722626119121382970433842274360} +#define T_19683_55 {0.9998458791558253455633575867977924644947,-0.0175561366796571638460022768413182348013} +#define T_19683_56 {0.9998402239630550525362195912748575210571,-0.0178753054661424833071681206320135970600} +#define T_19683_58 {0.9998286079248587254753033448650967329741,-0.0185136375420716162154555206598161021248} +#define T_19683_59 {0.9998226470806161891857755108503624796867,-0.0188328007664689406341906874331471044570} +#define T_19683_61 {0.9998104197448798302616523869801312685013,-0.0194711214255143504059386572180301300250} +#define T_19683_62 {0.9998041532546318999052914477942977100611,-0.0197902787951171124647409271801734576002} +#define T_19683_64 {0.9997913146325693167781878401001449674368,-0.0204285874518709623104406603033567080274} +#define T_19683_65 {0.9997847425020629508196634560590609908104,-0.0207477386739779584567955339480249676853} +#define T_19683_67 {0.9997712926054485027549389997147955000401,-0.0213860347430434305504576286693918518722} +#define T_19683_68 {0.9997644148407109909726386831607669591904,-0.0217051795249590950831475311133544892073} +#define T_19683_70 {0.9997503536818796998630887173931114375591,-0.0223434624209509075098978314599662553519} +#define T_19683_71 {0.9997431702892187743714202952105551958084,-0.0226626004699855831958643648249562829733} +#define T_19683_73 {0.9997284978810661026926709382678382098675,-0.0233008696075305346551154173084796639159} +#define T_19683_74 {0.9997210088670696048751551643363200128078,-0.0236200006310007398768746611494862008840} +#define T_19683_76 {0.9997057252230518997748731635510921478271,-0.0242582554247382509160502195300068706274} +#define T_19683_77 {0.9996979305945881133510511062922887504101,-0.0245773791299669502885549832171818707138} +#define T_19683_79 {0.9996820357287219405151290629873983561993,-0.0252156189945495906590267054525611456484} +#define T_19683_80 
{0.9996739354929393694959571803337894380093,-0.0255347350888664517687409727386693703011} +#define T_19683_82 {0.9996574294198020682600258624006528407335,-0.0261729594389604920678937816092002321966} +#define T_19683_83 {0.9996490235841294369478760017955210059881,-0.0264920676297021699674427708259827340953} +#define T_19683_85 {0.9996319063188587872303969561471603810787,-0.0271302758799880985862706950229039648548} +#define T_19683_86 {0.9996231948910051512413588170602452009916,-0.0274493758744984925335153036485280608758} +#define T_19683_88 {0.9996054664492995955882292946625966578722,-0.0280875674396715707681337903522944543511} +#define T_19683_89 {0.9995964494372543418521104285900946706533,-0.0284066589453020983124797993468746426515} +#define T_19683_91 {0.9995781098353726523697559969150461256504,-0.0290448332400728877200624111765137058683} +#define T_19683_92 {0.9995687872474051660631744198326487094164,-0.0293639159641827518498757854104042053223} +#define T_19683_94 {0.9995498365021668885077588129206560552120,-0.0300020724032776520129317532337154261768} +#define T_19683_95 {0.9995402083468271081656553178618196398020,-0.0303211460532341048335069899621885269880} +#define T_19683_97 {0.9995206464756117847869631987123284488916,-0.0309592840513958911241587657059426419437} +#define T_19683_98 {0.9995107127617297582133915057056583464146,-0.0312783483345745183523689547655521892011} +#define T_19683_100 {0.9994905397824779269555506289179902523756,-0.0319164673065628762271828122720762621611} +#define T_19683_101 {0.9994803005191638112236773849872406572104,-0.0322355219303478435222132247872650623322} +#define T_19683_103 {0.9994595164503762285690413591510150581598,-0.0328736212909399097559237645782559411600} +#define T_19683_104 {0.9994489716470206230880535258620511740446,-0.0331926659627242437444749612041050568223} +#define T_19683_106 {0.9994275765077583750795042760728392750025,-0.0338307451267151407248157113372144522145} +#define T_19683_107 
{0.9994167261740319885277017419866751879454,-0.0341497795539009926790718907341215526685} +#define T_19683_109 {0.9993947199839167128132544348773080855608,-0.0347878379361043696404998115667694946751} +#define T_19683_110 {0.9993835641297703631380500155501067638397,-0.0351068618261032930338849666895839618519} +#define T_19683_112 {0.9993609469089841379485505967750214040279,-0.0357448988413518464746232439210871234536} +#define T_19683_113 {0.9993494855446490854333774223050568252802,-0.0360639119015850606597695104937884025276} +#define T_19683_115 {0.9993262573139342075378976915089879184961,-0.0367019269647310894533198677436303114519} +#define T_19683_116 {0.9993144904499215996906968939583748579025,-0.0370209289026297294622480649195495061576} +#define T_19683_118 {0.9992906512305810284857443548389710485935,-0.0376589214285456691522213645839656237513} +#define T_19683_119 {0.9992785788776822331058724557806272059679,-0.0379779119515510910076727668638341128826} +#define T_19683_121 {0.9992541286915789244815755409945268183947,-0.0386158813551300411637257070651685353369} +#define T_19683_122 {0.9992417508608657517044093765434809029102,-0.0389348601706940508626608732356544351205} +#define T_19683_124 {0.9992166897304231021337272977689281105995,-0.0395728058668503232531143964934017276391} +#define T_19683_125 {0.9992040064332472493191517060040496289730,-0.0398917726824354751391510376379301305860} +#define T_19683_127 {0.9991783343814489848355719914252404123545,-0.0405296940861051072091392200036352733150} +#define T_19683_128 {0.9991653456294424806571896624518558382988,-0.0408486486091849607116266440698382211849} +#define T_19683_130 {0.9991390626798326568547281567589379847050,-0.0414865451353262915112907194270519539714} +#define T_19683_131 {0.9991257684849075282329522451618686318398,-0.0418054870733856748232781797014467883855} +#define T_19683_133 {0.9990988746615903082215481845196336507797,-0.0424433581369798376692337171789404237643} +#define T_19683_134 
{0.9990852750359386913459047718788497149944,-0.0427622871975151183643326646688365144655} +#define T_19683_136 {0.9990577703635785677960257089580409228802,-0.0434001322135666237067574968477856600657} +#define T_19683_137 {0.9990438653196728191474562663643155246973,-0.0437190481040859862948977365704195108265} +#define T_19683_139 {0.9990157498234946142900980703416280448437,-0.0443568664876232005012113290831621270627} +#define T_19683_140 {0.9990015393740870885963545333652291446924,-0.0446757689156469101066093685403757262975} +#define T_19683_142 {0.9989728130798756211561340023763477802277,-0.0453135600817226522063485560920526040718} +#define T_19683_143 {0.9989582972379988934363836960983462631702,-0.0456324487547833113065820498377433978021} +#define T_19683_145 {0.9989289601720992006761434822692535817623,-0.0462702121184753387139743097122845938429} +#define T_19683_146 {0.9989141389510657331740617337345611304045,-0.0465890867441181785735260234559973469004} +#define T_19683_148 {0.9988841911403830708948703431815374642611,-0.0472268217205297630156834998160775285214} +#define T_19683_149 {0.9988690645537855461455478689458686858416,-0.0475456820063128796083340432687691645697} +#define T_19683_151 {0.9988385060257851666420947367441840469837,-0.0481833880105733275422963401979359332472} +#define T_19683_152 {0.9988230740874962654274327178427483886480,-0.0485022336640679729846681311755673959851} +#define T_19683_154 {0.9987919048702037505549355955736245959997,-0.0491399101113331529533390096275979885831} +#define T_19683_155 {0.9987761675943761519036456775211263448000,-0.0494587408401240130606524303402693476528} +#define T_19683_157 {0.9987443877163768579663383206934668123722,-0.0500963871455769038654182168102124705911} +#define T_19683_158 {0.9987283451174434611985475385154131799936,-0.0504152026572623340738843467079277615994} +#define T_19683_160 {0.9986959546078828520165870941127650439739,-0.0510528182361135521305506301814602920786} +#define T_19683_161 
{0.9986796067005562216323255597671959549189,-0.0513716182383058758698091139649477554485} +#define T_19683_163 {0.9986466055891399795640950287634041160345,-0.0520092025057942303201130584966449532658} +#define T_19683_164 {0.9986299523884130113771107062348164618015,-0.0523279867061199818745187428703502519056} +#define T_19683_166 {0.9985963407054065932300090935314074158669,-0.0529655390775129880642779767185857053846} +#define T_19683_167 {0.9985793822265520702785579487681388854980,-0.0532843071836132109453387784014921635389} +#define T_19683_169 {0.9985451600027808183313027257099747657776,-0.0539218270742076316581758987922512460500} +#define T_19683_170 {0.9985278962613515219004511891398578882217,-0.0542405787937381214658394412708730669692} +#define T_19683_172 {0.9984930635282009969699856810620985925198,-0.0548780656188605220346943269760231487453} +#define T_19683_173 {0.9984754945400293735247032600454986095428,-0.0551968006594920901353162889790837652981} +#define T_19683_175 {0.9984400513294449108769867962109856307507,-0.0558342538344993519205949894512741593644} +#define T_19683_176 {0.9984221771106438492182633126503787934780,-0.0561529719039181238193769729605264728889} +#define T_19683_178 {0.9983861234551305585682712262496352195740,-0.0567903908441979923815701170042302692309} +#define T_19683_179 {0.9983679440220923906323946539487224072218,-0.0571090916501056505838462840074498672038} +#define T_19683_181 {0.9983312799547153781887232071312610059977,-0.0577464757710772561005718728210922563449} +#define T_19683_182 {0.9983127953241127672256993719202000647783,-0.0580651590211913246064590055084408959374} +#define T_19683_184 {0.9982755208784968026236583682475611567497,-0.0587025077383057439228686291698977584019} +#define T_19683_185 {0.9982567310672819660410937103733886033297,-0.0590211731403598310885527666869165841490} +#define T_19683_187 {0.9982188462776118154096138823661021888256,-0.0596584858691006081343743971956428140402} +#define T_19683_188 
{0.9981997513030169688619253065553493797779,-0.0599771331308446981056547997468442190439} +#define T_19683_190 {0.9981612562040368397120460031146649271250,-0.0606144092867283851289172957876871805638} +#define T_19683_191 {0.9981418560835743081227633410890121012926,-0.0609330381159290945802808892040047794580} +#define T_19683_193 {0.9981027507105884044591448400751687586308,-0.0615702771145057725643567891893326304853} +#define T_19683_194 {0.9980830454620499558870960754575207829475,-0.0618888872189466421325221290317131206393} +#define T_19683_196 {0.9980433298509220341188097336271312087774,-0.0625260884758004759076399636796850245446} +#define T_19683_197 {0.9980233194923795458919357770355418324471,-0.0628446795632821991750560641776246484369} +#define T_19683_199 {0.9979829936795330258547664925572462379932,-0.0634818424940319647742370534615474753082} +#define T_19683_200 {0.9979626782293381515032137940579559653997,-0.0638004142723726797026273516166838817298} +#define T_19683_202 {0.9979217422517561164596600065124221146107,-0.0644375382926723194731977173432824201882} +#define T_19683_203 {0.9979011217285402857157805556198582053185,-0.0647560904697078582037406135896162595600} +#define T_19683_205 {0.9978595756237652603104493209684733301401,-0.0653931749952469804076926607194764073938} +#define T_19683_206 {0.9978386500464396791088006466452497988939,-0.0657117072788311884501410986558767035604} +#define T_19683_208 {0.9977964938525739624353150247770827263594,-0.0663487517253356084978577200672589242458} +#define T_19683_209 {0.9977752632403296129126601954340003430843,-0.0666672638233405528973563036743144039065} +#define T_19683_211 {0.9977324969960349454467518626188393682241,-0.0673042676065728900924867161847942043096} +#define T_19683_212 {0.9977109613683424749197570235992316156626,-0.0676227592268891369853278661139484029263} +#define T_19683_214 {0.9976675851128399274969638099719304591417,-0.0682597217626492447362096527285757474601} +#define T_19683_215 
{0.9976457444894498705068031085829716175795,-0.0685781926131861785389531860346323810518} +#define T_19683_217 {0.9976017582625201773893763856904115527868,-0.0692151133173117827368514554109424352646} +#define T_19683_218 {0.9975796126634626226348245836561545729637,-0.0695335631059978004353538949544599745423} +#define T_19683_220 {0.9975350165054457374225194143946282565594,-0.0701704413943649851770345549084595404565} +#define T_19683_221 {0.9975125659510307718491617379186209291220,-0.0704888698291478016377809012737998273224} +#define T_19683_223 {0.9974673599028260895238418015651404857635,-0.0711257051176715227036595479148672893643} +#define T_19683_224 {0.9974446044136430211679567037208471447229,-0.0714441119065184621073072435137873981148} +#define T_19683_226 {0.9973987885167093780935942959331441670656,-0.0720809036111531437063248972663132008165} +#define T_19683_227 {0.9973757281136275132382706942735239863396,-0.0723992884620513615923087513692735228688} +#define T_19683_229 {0.9973293024099829651163418020587414503098,-0.0730360359987913820845051304786466062069} +#define T_19683_230 {0.9973059371141510531799667660379782319069,-0.0733543986197481567845812833184027113020} +#define T_19683_232 {0.9972589016463728750494510677526704967022,-0.0739911014046284037926071164292807225138} +#define T_19683_233 {0.9972352314792194416526172062731347978115,-0.0743094415036714139866091954900184646249} +#define T_19683_235 {0.9971875862904441278899980716232676059008,-0.0749460989527678117516629185956844594330} +#define T_19683_236 {0.9971636112736772528108986080042086541653,-0.0752644162379453723898947714587848167866} +#define T_19683_238 {0.9971153564076005171301630980451591312885,-0.0759010277673754368832348404794174712151} +#define T_19683_239 {0.9970910765632077232822894075070507824421,-0.0762193219467567767422266911125916521996} +#define T_19683_241 {0.9970422120640843877126258121279533952475,-0.0768558869726801291433204710301652085036} +#define T_19683_242 
{0.9970176274143330852339772718551103025675,-0.0771741577543557100149484995199600234628} +#define T_19683_244 {0.9969681533269769690974726472632028162479,-0.0778106756929746179451967691420577466488} +#define T_19683_245 {0.9969432638944137892167418613098561763763,-0.0781289227850563289257124210962501820177} +#define T_19683_247 {0.9968931802641980421952894175774417817593,-0.0787653930526162338043860700054210610688} +#define T_19683_248 {0.9968679860716491702987696044147014617920,-0.0790836161632376827279600206566101405770} +#define T_19683_250 {0.9968172929445058283448588554165326058865,-0.0797200381760277826392879774175526108593} +#define T_19683_251 {0.9967917940150770039764438479323871433735,-0.0800382370133445875115540957267512567341} +#define T_19683_253 {0.9967404914374969893131606113456655293703,-0.0806746101876982951717209857633861247450} +#define T_19683_254 {0.9967146877945735061743448568449821323156,-0.0809927844598883339699568750802427530289} +#define T_19683_256 {0.9966627758136066272953712541493587195873,-0.0816291082121838734719787566973536740988} +#define T_19683_257 {0.9966366674808532222229473518382292240858,-0.0819472576274475478230741032348305452615} +#define T_19683_259 {0.9965841461441083959371667333471123129129,-0.0825835313741084542371595489385072141886} +#define T_19683_260 {0.9965577331454690268586205093015450984240,-0.0829016556406689669733722780620155390352} +#define T_19683_262 {0.9965046025011139452232100666151382029057,-0.0835378787981646553362224949523806571960} +#define T_19683_263 {0.9964778848608122352459304238436743617058,-0.0838559776242682602953593118400021921843} +#define T_19683_265 {0.9964241449575732545440587273333221673965,-0.0844921496091145252105292229316546581686} +#define T_19683_266 {0.9963971227001121588884302582300733774900,-0.0848102227030308186694895766777335666120} +#define T_19683_268 {0.9963427735872745216738621820695698261261,-0.0854463429317903894189001334780186880380} +#define T_19683_269 
{0.9963154467374361056286602433829102665186,-0.0857643900018125737716445655678398907185} +#define T_19683_271 {0.9962604884648437186811520405171904712915,-0.0864004578910956555493072528406628407538} +#define T_19683_272 {0.9962328570476898237373575284436810761690,-0.0867184786455407891070379378106736112386} +#define T_19683_274 {0.9961772896657452580626568305888213217258,-0.0873544936120055626194158548969426192343} +#define T_19683_275 {0.9961493537066168357796414056792855262756,-0.0876724877592148371663327566238876897842} +#define T_19683_277 {0.9960931772662811045648822982911951839924,-0.0883084492195680414994285456486977636814} +#define T_19683_278 {0.9960649367907983275927108479663729667664,-0.0886264164679070598484855736387544311583} +#define T_19683_280 {0.9960081513435914413179261828190647065639,-0.0892623238389045059459903086462873034179} +#define T_19683_281 {0.9959796063776538144196592838852666318417,-0.0895802638967635317390758586952870246023} +#define T_19683_283 {0.9959222119756538926793609789456240832806,-0.0902161165952106297583057425981678534299} +#define T_19683_284 {0.9958933625454402527310548975947313010693,-0.0905340291710048650219988530807313509285} +#define T_19683_286 {0.9958353592412841903680487121164333075285,-0.0911698266137571516898319146093854214996} +#define T_19683_287 {0.9958062053732523732918480163789354264736,-0.0914877114159270143911584227680577896535} +#define T_19683_289 {0.9957475932201352852857212383241858333349,-0.0921234530198907219933346368634374812245} +#define T_19683_290 {0.9957181349410225701390686481317970901728,-0.0924413097569020680843721038399962708354} +#define T_19683_292 {0.9956589139926980136507950192026328295469,-0.0930769949390346379436422807884810026735} +#define T_19683_293 {0.9956291513295209005818264813569840043783,-0.0933948233193790944284273791708983480930} +#define T_19683_295 {0.9955693216403007639314637344796210527420,-0.0940304514966897042604898615536512807012} +#define T_19683_296 
{0.9955392546203546411121010351052973419428,-0.0943482512288848912396233004074019845575} +#define T_19683_298 {0.9954788162451091437787908944301307201385,-0.0949838218184349963868484678641834761947} +#define T_19683_299 {0.9954484448959686204716490465216338634491,-0.0953015926110248184910389568358368705958} +#define T_19683_301 {0.9953873978901262020713147649075835943222,-0.0959371050299286931561937308288179337978} +#define T_19683_302 {0.9953567222396451086297020083293318748474,-0.0962548465914835893464385208062594756484} +#define T_19683_304 {0.9952950666591923178927459048281889408827,-0.0968903002569088539486230615693784784526} +#define T_19683_305 {0.9952640867355033726937563187675550580025,-0.0972080122960260473163884853420313447714} +#define T_19683_307 {0.9952018226369849784873622411396354436874,-0.0978434066251942097247606966448074672371} +#define T_19683_308 {0.9951705384685002320210855941695626825094,-0.0981610888504980544366773642650514375418} +#define T_19683_310 {0.9951076659090188902823115313367452472448,-0.0987964232606850373263895903619413729757} +#define T_19683_311 {0.9950760775244291700403209688374772667885,-0.0991140753808271712799182751041371375322} +#define T_19683_313 {0.9950125965616457568430064384301658719778,-0.0997493492893638533658418054983485490084} +#define T_19683_314 {0.9949807039899210003852658701362088322639,-0.1000669710130235451339686392202565912157} +#define T_19683_316 {0.9949166146820543898954269934620242565870,-0.1007021838372962885266304056131048128009} +#define T_19683_317 {0.9948844179524433117833837059151846915483,-0.1010197748731807010358352272305637598038} +#define T_19683_319 {0.9948197203582701542146082829276565462351,-0.1016549260306318785973545004708284977823} +#define T_19683_320 {0.9947872195003006901004027895396575331688,-0.1019724860874763050500035888035199604928} +#define T_19683_322 {0.9947219136791555227361527613538783043623,-0.1026075749956048693833920992801722604781} +#define T_19683_323 
{0.9946891087226344962957114148593973368406,-0.1029251037821729969357065215262991841882} +#define T_19683_325 {0.9946231947344096324670204012363683432341,-0.1035601298585349661074417326744878664613} +#define T_19683_326 {0.9945900857094225333554504686617292463779,-0.1038776270836191950586169241432799026370} +#define T_19683_328 {0.9945235636145682844855286930396687239408,-0.1045125897458282215879421528370585292578} +#define T_19683_329 {0.9944901505514797124263282057654578238726,-0.1048300551182498735469650341656233649701} +#define T_19683_331 {0.9944230204110037218967477201658766716719,-0.1054649537839777578840383398528501857072} +#define T_19683_332 {0.9943893033404569425925956238643266260624,-0.1057823870125873672032312811097654048353} +#define T_19683_334 {0.9943215652159247408548026214702986180782,-0.1064172210995646128406377783903735689819} +#define T_19683_335 {0.9942875441688417970098612386209424585104,-0.1067346218932422180492025631792785134166} +#define T_19683_337 {0.9942191981223764685182686662301421165466,-0.1073693908192585450001033109401760157198} +#define T_19683_338 {0.9941848731299585129050910836667753756046,-0.1076867588869138830931504458021663594991} +#define T_19683_340 {0.9941159192242405850947761791758239269257,-0.1083214620698187968805825676099630072713} +#define T_19683_341 {0.9940812903179671033981890104769263416529,-0.1086387971203916641416142851994663942605} +#define T_19683_343 {0.9940117286162346577071957653970457613468,-0.1092734339780949137654886271775467321277} +#define T_19683_344 {0.9939767958278640236358114634640514850616,-0.1095907357205553600554281956647173501551} +#define T_19683_346 {0.9939066263939125844828481604054104536772,-0.1102253056710275763707684859582514036447} +#define T_19683_347 {0.9938713897554819487467625549470540136099,-0.1105425738143762104392919809470186010003} +#define T_19683_349 {0.9938006126536642614865968425874598324299,-0.1111770762756493363676568719711212906986} +#define T_19683_350 
{0.9937650721974894407750866776041220873594,-0.1114943105289175895311615249738679267466} +#define T_19683_352 {0.9936936874927155827208480332046747207642,-0.1121287449190854906833081372496963012964} +#define T_19683_353 {0.9936578432513907266354635794414207339287,-0.1124459449913358527473050685330235864967} +#define T_19683_355 {0.9935858510091282180809457713621668517590,-0.1130803107285548309013378798226767685264} +#define T_19683_356 {0.9935497030155264752693256014026701450348,-0.1133974763288811415939960625109961256385} +#define T_19683_358 {0.9934771033017996133551719140086788684130,-0.1140317728313704481735157969524152576923} +#define T_19683_359 {0.9934406515890725763995305896969512104988,-0.1143489036688981747014182133170834276825} +#define T_19683_361 {0.9933674444704632122693510609678924083710,-0.1149831303549405520092463461878651287407} +#define T_19683_362 {0.9933306890720408066641766708926297724247,-0.1153002261388270388575705283074057660997} +#define T_19683_364 {0.9932568746156875683084308548131957650185,-0.1159343824267692613094737907886155880988} +#define T_19683_365 {0.9932198155652783855273924018547404557467,-0.1162514428662040077977479768378543667495} +#define T_19683_367 {0.9931453938388770108502967559616081416607,-0.1168855281744574092783750529633834958076} +#define T_19683_368 {0.9931080311704683083462441572919487953186,-0.1172025529786623193606587278736697044224} +#define T_19683_370 {0.9930330022422713120988646551268175244331,-0.1178365667257033344572647592940484173596} +#define T_19683_371 {0.9929953359901285692146188921469729393721,-0.1181535556039329942779048110423900652677} +#define T_19683_373 {0.9929196999289455760617784108035266399384,-0.1187874972083036856362880939741444308311} +#define T_19683_374 {0.9928817301276126050524339916591998189688,-0.1191044498698456272078871620578865986317} +#define T_19683_376 {0.9928054870028099054835024617204908281565,-0.1197383187501542267661136520473519340158} +#define T_19683_377 
{0.9927672136871091845833348088490311056376,-0.1200552349043292055252862837733118794858} +#define T_19683_379 {0.9926903635686097349122292143874801695347,-0.1206890304792506418696262926459894515574} +#define T_19683_380 {0.9926517867736419642454848144552670419216,-0.1210059098354128864771794837906782049686} +#define T_19683_382 {0.9925743297319254976329716555483173578978,-0.1216396315236892983202565687861351761967} +#define T_19683_383 {0.9925354494930695992138680594507604837418,-0.1219564737912267743391581120704358909279} +#define T_19683_385 {0.9924573855991725146452608896652236580849,-0.1225901210116680795092491962350322864950} +#define T_19683_386 {0.9924182019520859654448941000737249851227,-0.1229069259000028085937472610567056108266} +#define T_19683_388 {0.9923395312776009946631461389188189059496,-0.1235404980714872036351437145640375092626} +#define T_19683_389 {0.9923000442582192714979782977025024592876,-0.1238572652900754439420083485856594052166} +#define T_19683_391 {0.9922207668752959230928922806924674659967,-0.1244907618315499731043161091292859055102} +#define T_19683_392 {0.9921809765198327246693565939494874328375,-0.1248074910898825384819588180107530206442} +#define T_19683_394 {0.9921010925011767289660724600253161042929,-0.1254409114203636210760350877535529434681} +#define T_19683_395 {0.9920609988461241979251781231141649186611,-0.1257576024279661308646893758123042061925} +#define T_19683_397 {0.9919805082649977290287779396749101579189,-0.1263909459665400192296402792635490186512} +#define T_19683_398 {0.9919401113471257858122953621204942464828,-0.1267075984329732452060568448359845206141} +#define T_19683_400 {0.9918590142773472395631983999919611960649,-0.1273408645987966492096887805018923245370} +#define T_19683_401 {0.9918183141337043595697764430951792746782,-0.1276574782336566127316501706445706076920} +#define T_19683_403 {0.9917366106496483535437391765299253165722,-0.1282906664459572410041943157921195961535} +#define T_19683_404 
{0.9916956073175607899727879157580900937319,-0.1286072409588756015885735450865468010306} +#define T_19683_406 {0.9916132974941580524586015599197708070278,-0.1292403506369526333674713214350049383938} +#define T_19683_407 {0.9915719910112302803995021349692251533270,-0.1295568857375969662459880282767699100077} +#define T_19683_409 {0.9914890749239676503989926459325943142176,-0.1301899163008215509762521833181381225586} +#define T_19683_410 {0.9914474653280820337641898731817491352558,-0.1305064116988955413845019393193069845438} +#define T_19683_412 {0.9913639430530024609922179479326587170362,-0.1311393625667114370969557057833299040794} +#define T_19683_413 {0.9913220303823193635395227829576469957829,-0.1314558179719552133413174033194081857800} +#define T_19683_415 {0.9912379019960215753570764718460850417614,-0.1320886885638792584973799648651038296521} +#define T_19683_416 {0.9911956862889795827342709344520699232817,-0.1324051036860695862440451264774310402572} +#define T_19683_418 {0.9911109518686181951707681037078145891428,-0.1330378934216922548472439302713610231876} +#define T_19683_419 {0.9910684331639334487817905028350651264191,-0.1333542679706428701891240962140727788210} +#define T_19683_421 {0.9909830927872189665350788345676846802235,-0.1339869762696287713854559342507855035365} +#define T_19683_422 {0.9909402711238856076292336183541920036077,-0.1343033099551906306423632031510351225734} +#define T_19683_424 {0.9908543248690843130432881480373907834291,-0.1349359362372790360762309092024224810302} +#define T_19683_425 {0.9908112002863743716929434413032140582800,-0.1352522287693405378394828630916890688241} +#define T_19683_427 {0.9907246482323081027132616327435243874788,-0.1358847724543459922763588565430836752057} +#define T_19683_428 {0.9906812207697713867915467744751367717981,-0.1362010235428332827201103327752207405865} +#define T_19683_430 {0.9905940629958174259428460572962649166584,-0.1368334840506460481357464686880121007562} +#define T_19683_431 
{0.9905503326932816321459540631622076034546,-0.1371496934055233263283213318572961725295} +#define T_19683_433 {0.9904625692793730395990792203519959002733,-0.1377820701561099370202612135472008958459} +#define T_19683_434 {0.9904185361769434203793593951559159904718,-0.1380982374873796214576060492618125863373} +#define T_19683_436 {0.9903301672035684788397702504880726337433,-0.1387305299007834391566973408771445974708} +#define T_19683_437 {0.9902858313416280644503331132000312209129,-0.1390466549184865563404400745639577507973} +#define T_19683_439 {0.9901968568898305012027094562654383480549,-0.1396788624148282975667711980349849909544} +#define T_19683_440 {0.9901522183090399886751242775062564760447,-0.1399949448290445652709479418263072147965} +#define T_19683_442 {0.9900626384604188645610634011973161250353,-0.1406270668285228286897847738146083429456} +#define T_19683_443 {0.9900176972017166177053582032385747879744,-0.1409431063493711555612009078686241991818} +#define T_19683_445 {0.9899275120384262161010724412335548549891,-0.1415751422722629215833478610875317826867} +#define T_19683_446 {0.9898822681430280434611290729662869125605,-0.1418911386099014904083048804750433191657} +#define T_19683_448 {0.9897914777477777592551433372136671096087,-0.1425230878765627040571928318968275561929} +#define T_19683_449 {0.9897459312571771361533023991796653717756,-0.1428390407411893048283957341482164338231} +#define T_19683_451 {0.9896545357132313647241517173824831843376,-0.1434709027720554030960187219534418545663} +#define T_19683_452 {0.9896086866691993222389100992586463689804,-0.1437868118739077105683321633478044532239} +#define T_19683_454 {0.9895166860603773484328371523588430136442,-0.1444185860894940665044572369879460893571} +#define T_19683_455 {0.9894705345049623623765455704415217041969,-0.1447344511388498899950860732133151032031} +#define T_19683_457 {0.9893779289156385825521056176512502133846,-0.1453661369597524788410680685046827420592} +#define T_19683_458 
{0.9893314748911664624486661523405928164721,-0.1456819576669300397853135109471622854471} +#define T_19683_460 {0.9892382644062701624321221061109099537134,-0.1463135545138258553077292845046031288803} +#define T_19683_461 {0.9891915079553441625392906644265167415142,-0.1466293305891840093035938252796768210828} +#define T_19683_463 {0.9890976926603591845577057029004208743572,-0.1472608378828317021724814139815862290561} +#define T_19683_464 {0.9890506338258600038670920184813439846039,-0.1475765690367702442920005978521658107638} +#define T_19683_466 {0.9889562138068248575706320480094291269779,-0.1482079861980105661700690689031034708023} +#define T_19683_467 {0.9889088526319104177630947560828644782305,-0.1485236721409705085150676495686639100313} +#define T_19683_469 {0.9888138279754183912473308737389743328094,-0.1491549985907268394136337974487105384469} +#define T_19683_470 {0.9887661645035238366929775111202616244555,-0.1494706390331906331603306625765981152654} +#define T_19683_472 {0.9886705352967226634319786171545274555683,-0.1501018741924695920619825528774526901543} +#define T_19683_473 {0.9886225695715605832347705472784582525492,-0.1504174688449614605278981116498471237719} +#define T_19683_475 {0.9885263359021522200364984200859908014536,-0.1510486121348533494757049311374430544674} +#define T_19683_476 {0.9884780679677122039450409829441923648119,-0.1513641607079394546531148080248385667801} +#define T_19683_478 {0.9883812299239532750405601291276980191469,-0.1519952115496188971288660241043544374406} +#define T_19683_479 {0.9883326598245022465150100288155954331160,-0.1523107137539077005072840620414353907108} +#define T_19683_481 {0.9882352174952034884469753706071060150862,-0.1529416715686340300095480415620841085911} +#define T_19683_482 {0.9881863452752852605698308252613060176373,-0.1532571271147764868647556113501195795834} +#define T_19683_484 {0.9880882987498116332147901630378328263760,-0.1538879913238943852871187800701591186225} +#define T_19683_485 
{0.9880391244542472417577982923830859363079,-0.1542033999225842777480721679239650256932} +#define T_19683_487 {0.9879404738225179283261923046666197478771,-0.1548341699475242472239244762022281065583} +#define T_19683_488 {0.9878909974964054097057442049845121800900,-0.1551495313094984063173598087814752943814} +#define T_19683_490 {0.9877917428488933726526965983794070780277,-0.1557802065717773520869826597845531068742} +#define T_19683_491 {0.9877419645376079859744322675396688282490,-0.1560955204078159075375964448539889417589} +#define T_19683_493 {0.9876421059653403000666571642796043306589,-0.1567261003290376653040993915055878460407} +#define T_19683_494 {0.9875920257145337499693482641305308789015,-0.1570413663499643508458802898530848324299} +#define T_19683_496 {0.9874915633090913802405452770472038537264,-0.1576718503518201586199865005255560390651} +#define T_19683_497 {0.9874411811646927050745148335408885031939,-0.1579870682685025062852446353645063936710} +#define T_19683_499 {0.9873401150182103958030666035483591258526,-0.1586174557727715872523788220860296860337} +#define T_19683_500 {0.9872894310264253014963742316467687487602,-0.1589326252961213437053800134890479966998} +#define T_19683_502 {0.9871877612315913541607415027101524174213,-0.1595629157246713780704538976351614110172} +#define T_19683_503 {0.9871367754389025472860907939320895820856,-0.1598780365656446156297221250497386790812} +#define T_19683_505 {0.9870345020889589315871148755832109600306,-0.1605082293404323789953735968083492480218} +#define T_19683_506 {0.9869832145421258973172484729730058461428,-0.1608233012100298009450227709749015048146} +#define T_19683_508 {0.9868803377308681401558487777947448194027,-0.1614533957531015806452501237799879163504} +#define T_19683_509 {0.9868287484769268091966409883752930909395,-0.1617684183623688265463158586499048396945} +#define T_19683_511 {0.9867252682987038836515125694859307259321,-0.1623984140958610045135657173887011595070} +#define T_19683_512 
{0.9866733773849671873534816768369637429714,-0.1627133871558889000041858707845676690340} +#define T_19683_514 {0.9865692939346815126810952278901822865009,-0.1633432835020285078808655043758335523307} +#define T_19683_515 {0.9865171014087387169055887170543428510427,-0.1636582067239533144764607186516514047980} +#define T_19683_517 {0.9864124147818458254732831846922636032104,-0.1642880031050584777041478901082882657647} +#define T_19683_518 {0.9863599206915633077485949797846842557192,-0.1646028762000621425976021328096976503730} +#define T_19683_520 {0.9862546309840718450345775636378675699234,-0.1652325720385426910397086430748458951712} +#define T_19683_521 {0.9862018353775920953552258652052842080593,-0.1655473947178531524127009788571740500629} +#define T_19683_523 {0.9860959426860640419931769429240375757217,-0.1661769894362111754659849793824832886457} +#define T_19683_524 {0.9860428456118063289537190030387137085199,-0.1664917614111026122891701106709660962224} +#define T_19683_526 {0.9859363500333564456212798177148215472698,-0.1671212544319328752173703378502978011966} +#define T_19683_527 {0.9858829515400163723271020899119321256876,-0.1674359754137259292949835298713878728449} +#define T_19683_529 {0.9857758531723124217904796751099638640881,-0.1680653661597164560959072332479991018772} +#define T_19683_530 {0.9857221533088620368801002769032493233681,-0.1683800358597785928882473172052414156497} +#define T_19683_532 {0.9856144522501246729717649941449053585529,-0.1690093237537112769164338033078820444643} +#define T_19683_533 {0.9855604510658124706168337070266716182232,-0.1693239418834569243177412545264814980328} +#define T_19683_535 {0.9854521474148149051686118582438211888075,-0.1699531263482079168625205056741833686829} +#define T_19683_536 {0.9853978449591657140516076651692856103182,-0.1702676926190988815346116780347074382007} +#define T_19683_538 {0.9852889388152339389392864177352748811245,-0.1708967730776392024427678961728815920651} +#define T_19683_539 
{0.9852343351380490332758199656382203102112,-0.1712112872011848085929131002558278851211} +#define T_19683_541 {0.9851248266010614873522399648209102451801,-0.1718402630765808458690457882767077535391} +#define T_19683_542 {0.9850699217524180317795412520354148000479,-0.1721547247643382405613010632805526256561} +#define T_19683_544 {0.9849598109228058229192015460284892469645,-0.1727835954797523609904885688592912629247} +#define T_19683_545 {0.9849046049530574276076322348671965301037,-0.1730980044433267361903006076317979022861} +#define T_19683_547 {0.9847938919318036665728754996962379664183,-0.1737267694220177016717343576601706445217} +#define T_19683_548 {0.9847383848915801651813239914190489798784,-0.1740411253730627105795747411320917308331} +#define T_19683_550 {0.9846270697802206317561513060354627668858,-0.1746697840383862609936471699256799183786} +#define T_19683_551 {0.9845712617204277483651253533025737851858,-0.1749840866886040735561635983685846440494} +#define T_19683_553 {0.9844593446210502252213814244896639138460,-0.1756126384640134541204048446161323226988} +#define T_19683_554 {0.9844032355928699073999155189085286110640,-0.1759268875251551733640553720761090517044} +#define T_19683_556 {0.9842907166081141800972886812814977020025,-0.1765553318342016342334943601599661633372} +#define T_19683_557 {0.9842343066630044878806415908911731094122,-0.1768695270180674905535767038600170053542} +#define T_19683_559 {0.9841211858960623448666638068971224129200,-0.1774978632844008974434046876922366209328} +#define T_19683_560 {0.9840644750857571176894111886213067919016,-0.1778120043028405539153879999503260478377} +#define T_19683_562 {0.9839507526403724613217605110548902302980,-0.1784402319502098044345927974063670262694} +#define T_19683_563 {0.9838937410168814290400973732175771147013,-0.1787543185151225788587225906667299568653} +#define T_19683_565 {0.9837794169973497204750856326427310705185,-0.1793824369673761853771765117926406674087} +#define T_19683_566 
{0.9837221046129587254114312599995173513889,-0.1796964687907113555898064305438310839236} +#define T_19683_568 {0.9836071791241273176709114522964227944613,-0.1803244774717980281053542057634331285954} +#define T_19683_569 {0.9835495660313978705246995559718925505877,-0.1806384542655550262679753359407186508179} +#define T_19683_571 {0.9834340391786654533845535297587048262358,-0.1812663525995240887400683504893095232546} +#define T_19683_572 {0.9833761254304349552768371722777374088764,-0.1815802740757528899173678382794605568051} +#define T_19683_574 {0.9832599973197521103784879414888564497232,-0.1822080614867549186453032916688243858516} +#define T_19683_575 {0.9832017829691334087627296867140103131533,-0.1825219273575561518274668060257681645453} +#define T_19683_577 {0.9830850537070019434793266555061563849449,-0.1831496032698433917840219464778783731163} +#define T_19683_578 {0.9830265388073839982752133437315933406353,-0.1834634132473687839759435291853151284158} +#define T_19683_580 {0.9829092085008569457116323064838070422411,-0.1840909770852957594300391974684316664934} +#define T_19683_581 {0.9828503931059040521489578168257139623165,-0.1844047308817481911624724943976616486907} +#define T_19683_583 {0.9827324618625857821641034206550102680922,-0.1850321820697722330351098207756876945496} +#define T_19683_584 {0.9826733460262377928273735960829071700573,-0.1853458793974061546983023163193138316274} +#define T_19683_586 {0.9825548139542839010118768783286213874817,-0.1859732173600879001629238018722389824688} +#define T_19683_587 {0.9824953977307564478849144506966695189476,-0.1862868579312095540512217439754749648273} +#define T_19683_589 {0.9823762649388733114719229888578411191702,-0.1869140820932134738896479575487319380045} +#define T_19683_590 {0.9823165483826573618486577288422267884016,-0.1872276656201811440016768983696238137782} +#define T_19683_592 {0.9821968149801024727807430281245615333319,-0.1878547754062760422044675578945316374302} +#define T_19683_593 
{0.9821367981459644402875142077391501516104,-0.1881683016015003595544641257220064289868} +#define T_19683_595 {0.9820164642425459611274618509924039244652,-0.1887952964365599839435816420518676750362} +#define T_19683_596 {0.9819561471855275947007157810730859637260,-0.1891087650125040930948472350792144425213} +#define T_19683_598 {0.9818352128916044696538278913067188113928,-0.1897356443215075794128665620519313961267} +#define T_19683_599 {0.9817745956670230755847228465427178889513,-0.1900490549906875270558259671815903857350} +#define T_19683_601 {0.9816530610935050304988180869258940219879,-0.1906758181987200095885981454557622782886} +#define T_19683_602 {0.9815921437569528062994095307658426463604,-0.1909891706737049110742532320728059858084} +#define T_19683_604 {0.9814700090152999045756132545648142695427,-0.1916158172059579389845396235614316537976} +#define T_19683_605 {0.9814087916226447161349710768263321369886,-0.1919291111993703391469523467094404622912} +#define T_19683_607 {0.9812860568248674697500177899200934916735,-0.1925556404811424870970881784160155802965} +#define T_19683_608 {0.9812245394322520741781090691802091896534,-0.1928688757056584990312586569416453130543} +#define T_19683_610 {0.9811012046909113326620399675448425114155,-0.1934952871623559222946653335384326055646} +#define T_19683_611 {0.9810393873547537113566363586869556456804,-0.1938084633307055604234392376383766531944} +#define T_19683_613 {0.9809154527829604397481944033643230795860,-0.1944347563878423834626829602711950428784} +#define T_19683_614 {0.9808533355599537983948721375782042741776,-0.1947478732128098688480832834102329798043} +#define T_19683_616 {0.9807288012713692992861069797072559595108,-0.1953740472960088792042654404212953522801} +#define T_19683_617 {0.9806663842184815127467345519107766449451,-0.1956871044904328060809461931057740002871} +#define T_19683_619 {0.9805412503273168711714902201492805033922,-0.1963131590254258707073375944673898629844} +#define T_19683_620 
{0.9804785335017911496180431640823371708393,-0.1966261563021995673050668074210989288986} +#define T_19683_622 {0.9803528001228074550965629896381869912148,-0.1972520907148281876786199973139446228743} +#define T_19683_623 {0.9802897835821616778773091027687769383192,-0.1975650277868999105113090308805112726986} +#define T_19683_625 {0.9801634508306698023716307943686842918396,-0.1981908415031157222330193690140731632710} +#define T_19683_626 {0.9801001346326969621003399879555217921734,-0.1985037180834889614100546850750106386840} +#define T_19683_628 {0.9799732026245574489919931693293619900942,-0.1991294105293543725831995061525958590209} +#define T_19683_629 {0.9799095868273250964364251558436080813408,-0.1994422263310880460984719775296980515122} +#define T_19683_631 {0.9797820556789480495041289032087661325932,-0.2000677969327765981510935944243101403117} +#define T_19683_632 {0.9797181403407987376752430463966447860003,-0.2003805516689854127054815080555272288620} +#define T_19683_634 {0.9795900101691438210949058884580153971910,-0.2010059998527824742797776025327038951218} +#define T_19683_635 {0.9795257953486946611576513532781973481178,-0.2013186932366371195701759688745369203389} +#define T_19683_637 {0.9793970662712707664354638836812227964401,-0.2019440184289402473449825947682256810367} +#define T_19683_638 {0.9793325520274135387310820988204795867205,-0.2022566501736676736200593040848616510630} +#define T_19683_640 {0.9792032241622788957258194386668037623167,-0.2028818518009873062002412780202575959265} +#define T_19683_641 {0.9791384105541800497718440965400077402592,-0.2031944216198710018161932566727045923471} +#define T_19683_643 {0.9790084840199421156725634318718221038580,-0.2038194991088308205551271612421260215342} +#define T_19683_644 {0.9789433711070426591405180261062923818827,-0.2041320067152110895314365279773483052850} +#define T_19683_646 {0.9788128460228577853996512203593738377094,-0.2047569594925486291536742555763339623809} +#define T_19683_647 
{0.9787474338648731730927465832792222499847,-0.2050694045998227854621376309296465478837} +#define T_19683_649 {0.9786163103504468274707051023142412304878,-0.2056942320923900446860699275930528528988} +#define T_19683_650 {0.9785505990073667392792344799090642482042,-0.2060066144140127453177058214350836351514} +#define T_19683_652 {0.9784188771829530617551995419489685446024,-0.2066313160487765199224696743840468116105} +#define T_19683_653 {0.9783528667150419577680509064521174877882,-0.2069436352982600146876990265809581615031} +#define T_19683_655 {0.9782205467014438715622759445977862924337,-0.2075682105023025358914168236879049800336} +#define T_19683_656 {0.9781542371692404369554196819080971181393,-0.2078804663932169172202435447616153396666} +#define T_19683_658 {0.9780213190878094264846254191070329397917,-0.2085049145937364623026866183863603509963} +#define T_19683_659 {0.9779547105521263494765094037575181573629,-0.2088171068397098872893025145458523184061} +#define T_19683_661 {0.9778211945247623493315813902881927788258,-0.2094414274640210849032229134536464698613} +#define T_19683_662 {0.9777542870466868762946432980243116617203,-0.2097535557787401361284906897708424367011} +#define T_19683_664 {0.9776201731958383822629343740118201822042,-0.2103777482542746879445871854841243475676} +#define T_19683_665 {0.9775529668367316515897869066975545138121,-0.2106898123514845677650697552962810732424} +#define T_19683_667 {0.9774182552853952765659073520509991794825,-0.2113138761057916925611976921572932042181} +#define T_19683_668 {0.9773507501068927627585480877314694225788,-0.2116258756992964173981874864693963900208} +#define T_19683_670 {0.9772154409786132367443656221439596265554,-0.2122498101600434061708710942184552550316} +#define T_19683_671 {0.9771476370426243063249671649828087538481,-0.2125617449637062783551755273947492241859} +#define T_19683_673 {0.9770117304614945874519094104471150785685,-0.2131855495586789106532421556039480492473} +#define T_19683_674 
{0.9769436278302026099851218532421626150608,-0.2134974192864225739363348566257627680898} +#define T_19683_676 {0.9768071239208633294026640214724466204643,-0.2141210934435258395058809810507227666676} +#define T_19683_677 {0.9767387226567257885179174081713426858187,-0.2144328978093326121268091810634359717369} +#define T_19683_679 {0.9766016215443655834604896881501190364361,-0.2150564409565910439781077911902684718370} +#define T_19683_680 {0.9765329217101136327627841637877281755209,-0.2153681796745032239748240954213542863727} +#define T_19683_682 {0.9763952235204688134828643342189025133848,-0.2159915912400615645161394695605849847198} +#define T_19683_683 {0.9763262251791073875750726074329577386379,-0.2163032640241816517701067823509220033884} +#define T_19683_685 {0.9761879300384618263208835742261726409197,-0.2169265434363052968969043377001071348786} +#define T_19683_686 {0.9761186332532698628483558422885835170746,-0.2172381500007962429332764031642000190914} +#define T_19683_688 {0.9759797412884547718192607135279104113579,-0.2178612966878717416285837771283695474267} +#define T_19683_689 {0.9759101461229847673806148122821468859911,-0.2181728367469572271719613354434841312468} +#define T_19683_691 {0.9757706574613789207717218232573941349983,-0.2187958501374929198846075451001524925232} +#define T_19683_692 {0.9757007639794570419411456896341405808926,-0.2191073234054576324147944887954508885741} +#define T_19683_694 {0.9755606787489863318540983527782373130322,-0.2197302029280840673930441653283196501434} +#define T_19683_695 {0.9754904870147123041590475622797384858131,-0.2200416091192739509452280799450818449259} +#define T_19683_697 {0.9753498053438498516243271296843886375427,-0.2206643542027444671038693968512234278023} +#define T_19683_698 {0.9752793154215968485232224338687956333160,-0.2209756930315669443132264859741553664207} +#define T_19683_700 {0.9751380374393625594109380472218617796898,-0.2215983031047581708339322403844562359154} +#define T_19683_701 
{0.9750672493937775353600727612501941621304,-0.2219095742856825037581103288175654597580} +#define T_19683_703 {0.9749253752297383224245663768670056015253,-0.2225320487775948596897990228171693161130} +#define T_19683_704 {0.9748542891257413467442916044092271476984,-0.2228432520251523163423712503572460263968} +#define T_19683_706 {0.9747118189100109075795330682012718170881,-0.2234655903649106212238706348216510377824} +#define T_19683_707 {0.9746404348127953864988626264675986021757,-0.2237767253936947531300916125474032014608} +#define T_19683_709 {0.9744973686760342035384496739425230771303,-0.2243989270105486433237729215761646628380} +#define T_19683_710 {0.9744256866510669912173625561990775167942,-0.2247099935352155908319105037662666290998} +#define T_19683_712 {0.9742820247244816656007060373667627573013,-0.2253320578585401023907763828901806846261} +#define T_19683_713 {0.9742100448375028420855414879042655229568,-0.2256430555938088722278678233124082908034} +#define T_19683_715 {0.9740657872528467597916801423707511276007,-0.2262649820531049404959134108139551244676} +#define T_19683_716 {0.9739935095698694089705327314732130616903,-0.2265759107137575445456434408697532489896} +#define T_19683_718 {0.9738486564594420746843184133467730134726,-0.2271976987386525870249442959902808070183} +#define T_19683_719 {0.9737760810467526173539454248384572565556,-0.2275085580395344309057037435195525176823} +#define T_19683_721 {0.9736306325433996544660431027295999228954,-0.2281302070597827913456256965218926779926} +#define T_19683_722 {0.9735577594675575152649571464280597865582,-0.2284409967158028686995407952053938060999} +#define T_19683_724 {0.9734117157046706658718449034495279192924,-0.2290625061612863999638278755810461007059} +#define T_19683_725 {0.9733385450325080512357089901342988014221,-0.2293732258874175977680920368584338575602} +#define T_19683_727 {0.9731919061440250651173755613854154944420,-0.2299945951881461059240763233901816420257} +#define T_19683_728 
{0.9731184379426471853236080278293229639530,-0.2303052446994254542911306771202362142503} +#define T_19683_742 {0.9720794745962966221242140818503685295582,-0.2346518592907115563495068499832996167243} +#define T_19683_770 {0.9699433244606892046135726559441536664963,-0.2433309419990525857002694465336389839649} +#define T_19683_784 {0.9688461803356978041534830481396056711674,-0.2476632367730999817467818502336740493774} +#define T_19683_812 {0.9665938635344750728961571439867839217186,-0.2563129005288196093914621087606064975262} +#define T_19683_826 {0.9654387358426483878304225072497501969337,-0.2606300967550544411821533685724716633558} +#define T_19683_854 {0.9630706572300612444692546887381467968225,-0.2692487867799183143269203810632461681962} +#define T_19683_868 {0.9618577536057586474527170139481313526630,-0.2735501084417328310216532827325863763690} +#define T_19683_896 {0.9593743388442591557563332571589853614569,-0.2821362755250387821526203424582490697503} +#define T_19683_910 {0.9581038773070716940694069307937752455473,-0.2864209494592803162404948125185910612345} +#define T_19683_938 {0.9555055727908005058779394857992883771658,-0.2949730502363294704082363750785589218140} +#define T_19683_952 {0.9541777817063621380810900518554262816906,-0.2992403062722766016179321013623848557472} +#define T_19683_980 {0.9514650544809087895359311914944555610418,-0.3077568035017931857844075693719787523150} +#define T_19683_994 {0.9500801725198469993571848135616164654493,-0.3120058745995941684725494269514456391335} +#define T_19683_1022 {0.9472535101982985095148137588694225996733,-0.3204852374400449233071697108243824914098} +#define T_19683_1036 {0.9458117862933340669329140837362501770258,-0.3247153598285930087286033085547387599945} +#define T_19683_1064 {0.9428716969686263826844196955789811909199,-0.3331560641133561873061808000784367322922} +#define T_19683_1078 {0.9413733902698268041220330815121997147799,-0.3373664774275776934153725505893817171454} +#define T_19683_1106 
{0.9383204024234154116967943082272540777922,-0.3457670059389122418913586898270295932889} +#define T_19683_1120 {0.9367657822516132215540096694894600659609,-0.3499569533564423351457151056820293888450} +#define T_19683_1148 {0.9336004446584779126183661901450250297785,-0.3583157960982077394618272592197172343731} +#define T_19683_1162 {0.9319897904568600344177298211434390395880,-0.3624845244754292816757867967680795118213} +#define T_19683_1190 {0.9287126720868629226046664371096994727850,-0.3708001789445078966167557155131362378597} +#define T_19683_1204 {0.9270462733707408586880660550377797335386,-0.3749469389519291540047163380222627893090} +#define T_19683_1232 {0.9236579632863537447917678946396335959435,-0.3832179104083011655923485250241355970502} +#define T_19683_1246 {0.9219361195911229822641530518012586981058,-0.3873419566652477885604355378745822235942} +#define T_19683_1274 {0.9184372268415437190469674533233046531677,-0.3955667584006710146837804131791926920414} +#define T_19683_1288 {0.9166602476688420209072205580014269798994,-0.3996673496092684185754251302569173276424} +#define T_19683_1316 {0.9130514011805173080205122460029087960720,-0.4078445032145143200885684109380235895514} +#define T_19683_1330 {0.9112196059425924365982041308598127216101,-0.4119209022929359864662046675221063196659} +#define T_19683_1358 {0.9075014544061685839437814138364046812057,-0.4200489379235338716078729248692980036139} +#define T_19683_1372 {0.9056151723684631171806813654256984591484,-0.4241004121384920888537806149543030187488} +#define T_19683_1400 {0.9017883841221834284596070574480108916759,-0.4321778687789337713986981270863907411695} +#define T_19683_1414 {0.8998479543441498806899403462011832743883,-0.4362036898773883342173007804376538842916} +#define T_19683_1442 {0.8959132172537213056884297657234128564596,-0.4442291156037461163919033424463123083115} +#define T_19683_1456 {0.8939189885278755465236599775380454957485,-0.4482285599438080581080612319055944681168} +#define T_19683_1484 
{0.8898770098628242530836018886475358158350,-0.4562005121847180211247518855088856071234} +#define T_19683_1498 {0.8878293406520505470780335599556565284729,-0.4601728608657241759161138361378107219934} +#define T_19683_1526 {0.8836808469585902825471634969289880245924,-0.4680899066616887593816898061049869284034} +#define T_19683_1540 {0.8815801053317073865400743670761585235596,-0.4720344456534243948730988904571859166026} +#define T_19683_1568 {0.8773258423021428331622928453725762665272,-0.4798951619143868585481982336204964667559} +#define T_19683_1582 {0.8751724058677433637498666030296590179205,-0.4838111821854327310177268373081460595131} +#define T_19683_1610 {0.8708131382064315806346144199778791517019,-0.4916141559465779797832851727434899657965} +#define T_19683_1624 {0.8686073940450068642249448203074280172586,-0.4955009535917586638298359957843786105514} +#define T_19683_1652 {0.8641439053309019069359919740236364305019,-0.5032447822674938064935190595861058682203} +#define T_19683_1666 {0.8618862499252645248404292033228557556868,-0.5071016586344049281720458566269371658564} +#define T_19683_1694 {0.8573193424710686683098970206629019230604,-0.5147849502704741064817994811164680868387} +#define T_19683_1708 {0.8550101816350841321678899475955404341221,-0.5186112120850651097114791809872258454561} +#define T_19683_1736 {0.8503406763430324533103998874139506369829,-0.5262325856087536335436993795156013220549} +#define T_19683_1750 {0.8479804251486746657917592528974637389183,-0.5300275450999437643062606184685137122869} +#define T_19683_1778 {0.8432091613629771886806452130258549004793,-0.5375856305683260893957253756525460630655} +#define T_19683_1792 {0.8407982440657187916954740103392396122217,-0.5413486055916316708191970974439755082130} +#define T_19683_1820 {0.8359260794216887280327910048072226345539,-0.5488420444378185880651699335430748760700} +#define T_19683_1834 {0.8334649293842413264599144895328208804131,-0.5525723585979683827318353905866388231516} +#define T_19683_1862 
{0.8284927396541349464698100746318232268095,-0.5599998038753100093600778563995845615864} +#define T_19683_1876 {0.8259817992685517529238836687000002712011,-0.5636967866478271860231075152114499360323} +#define T_19683_1904 {0.8209104782041481973564600593817885965109,-0.5710569032720266280378496048797387629747} +#define T_19683_1918 {0.8183501988123043080491925138630904257298,-0.5747198901237557944199352277792058885098} +#define T_19683_1946 {0.8131806579842534299373824069334659725428,-0.5820113551128506257370531784545164555311} +#define T_19683_1960 {0.8105714997967172763537746504880487918854,-0.5856396876214079449951555034203920513391} +#define T_19683_1988 {0.8053046684306847113887783962127286940813,-0.5928611903335763155808990632067434489727} +#define T_19683_2002 {0.8026471004439946765884883461694698780775,-0.5964542163057009460658264288213104009628} +#define T_19683_2030 {0.7972839252536334520016225724248215556145,-0.6036044586748499085615549120120704174042} +#define T_19683_2044 {0.7945784251659960828462203608069103211164,-0.6071615322636354505903000244870781898499} +#define T_19683_2072 {0.7891198701827747408188429290021304041147,-0.6142392290327285397921741605387069284916} +#define T_19683_2086 {0.7863669243081979898235545078932773321867,-0.6177597108537143943962632874900009483099} +#define T_19683_2114 {0.7808139707081159786028479175001848489046,-0.6247635898057963821372595702996477484703} +#define T_19683_2128 {0.7780140738889945728473662711621727794409,-0.6282468470518973724381339707178995013237} +#define T_19683_2156 {0.7723677198162154367011567046574782580137,-0.6351756492387756747319826899911276996136} +#define T_19683_2170 {0.7695213753343826956765383329184260219336,-0.6386210557940288357059444024343974888325} +#define T_19683_2198 {0.7637826357218174821994693957094568759203,-0.6454735357625698277672654512571170926094} +#define T_19683_2212 {0.7608903552080804599810903710022103041410,-0.6488804723146791575416614250571001321077} +#define T_19683_2240 
{0.7550602615949536522421681183914188295603,-0.6556553983306780963857818278484046459198} +#define T_19683_2254 {0.7521225649371274801779918561805970966816,-0.6590232524823357307397486692934762686491} +#define T_19683_2282 {0.7462021652835576501772152369085233658552,-0.6657194067519214275563399496604688465595} +#define T_19683_2296 {0.7432195805330152893475315067917108535767,-0.6690475731308864748569931180099956691265} +#define T_19683_2324 {0.7372099390316437794723469778546132147312,-0.6756637520194197499279198382282629609108} +#define T_19683_2338 {0.7341830023083991685339810828736517578363,-0.6789516323873338032868218760995659977198} +#define T_19683_2366 {0.7280851991931018840631395505624823272228,-0.6854866466357613097315493178030010312796} +#define T_19683_2380 {0.7250144545894418035558715018851216882467,-0.6887336499956816515677360257541295140982} +#define T_19683_2408 {0.7188295859411566457453091061324812471867,-0.6951863249343050998874105061986483633518} +#define T_19683_2422 {0.7157155854238406167411312708281911909580,-0.6983918676369370581724638213927391916513} +#define T_19683_2450 {0.7094447629735467497624767929664812982082,-0.7047610433965600940098283899715170264244} +#define T_19683_2464 {0.7062880662845905099800347670679911971092,-0.7079245492451676780021330159797798842192} +#define T_19683_2492 {0.6999324172134746557816242784610949456692,-0.7142090809655828875790462006989400833845} +#define T_19683_2506 {0.6967335917695366420687719255511183291674,-0.7173299813195602725457433734845835715532} +#define T_19683_2534 {0.6902942585063822633628660696558654308319,-0.7235287393553374579724390969204250723124} +#define T_19683_2548 {0.6870538792967703090042164149053860455751,-0.7266064732324224451076588593423366546631} +#define T_19683_2576 {0.6805320193126056516064181778347119688988,-0.7327183433559633085607742941647302359343} +#define T_19683_2590 {0.6772506687959226612250063226383645087481,-0.7357523575330734422195178012771066278219} +#define T_19683_2618 
{0.6706474543959652923064140850328840315342,-0.7417762411348934881161198973131831735373} +#define T_19683_2632 {0.6673257223954119909947735322930384427309,-0.7447659902475686211076322251756209880114} +#define T_19683_2660 {0.6606423405083469146958918827294837683439,-0.7507008045337725254952943032549228519201} +#define T_19683_2674 {0.6572808241057015443686850630911067128181,-0.7536457511742037373991820459195878356695} +#define T_19683_2702 {0.6505184760703301982687207782873883843422,-0.7594904293611178802692052158818114548922} +#define T_19683_2716 {0.6471177794986223696938054672500584274530,-0.7623900441747467615627442683035042136908} +#define T_19683_2744 {0.6402776808479223591419327021867502480745,-0.7681435356806735059720381286751944571733} +#define T_19683_2758 {0.6368384153828224869542395936150569468737,-0.7709972974613436003110678029770497232676} +#define T_19683_2786 {0.6299217956254548056449493742547929286957,-0.7766585680954036785550442800740711390972} +#define T_19683_2800 {0.6264445794753973340007746628543827682734,-0.7794659638790474298630783778207842260599} +#define T_19683_2828 {0.6194526818747023710898247372824698686600,-0.7850339960270756867188879368768539279699} +#define T_19683_2842 {0.6159381400697623298867711127968505024910,-0.7877945211839200156944684749760199338198} +#define T_19683_2870 {0.6088722214202831883866906537150498479605,-0.7932683139913818681776547236950136721134} +#define T_19683_2884 {0.6053209856998270632644221223017666488886,-0.7959814723166549477184616989688947796822} +#define T_19683_2912 {0.5981823161014001577484577865106984972954,-0.8013600418685502546622956288047134876251} +#define T_19683_2926 {0.5945950248005309468624091095989570021629,-0.8040253456716747182397853066504467278719} +#define T_19683_2954 {0.5873848874299858469072432853863574564457,-0.8093077251693961970957502671808470040560} +#define T_19683_2968 {0.5837621853648018444005174387712031602859,-0.8119246953616515716234403043927159160376} +#define T_19683_2996 
{0.5764818762453097766851328742632176727057,-0.8171099352967673423719929814978968352079} +#define T_19683_3010 {0.5728244145969990652744741055357735604048,-0.8196781014774063844896545560914091765881} +#define T_19683_3038 {0.5654752423651125958770080615067854523659,-0.8247652698023342221489428993663750588894} +#define T_19683_3052 {0.5617836785629030105226888736069668084383,-0.8272841703431371707111452451499644666910} +#define T_19683_3080 {0.5543669642333286518010027066338807344437,-0.8322723526386799353105061527458019554615} +#define T_19683_3094 {0.5506419618363148638096049580781254917383,-0.8347415347669326912694032216677442193031} +#define T_19683_3122 {0.5431590385644603502512950399250257760286,-0.8396298344066456253997898784291464835405} +#define T_19683_3136 {0.5394012671423288329819456521363463252783,-0.8420488542865253167590822158672381192446} +#define T_19683_3164 {0.5318534799846680316548486189276445657015,-0.8468363925978844575226389679301064461470} +#define T_19683_3178 {0.5280636149973422233117048563144635409117,-0.8492048154102393997533226865925826132298} +#define T_19683_3206 {0.5204523206696415327243698811798822134733,-0.8538907318325832385141893610125407576561} +#define T_19683_3220 {0.5166310433458674022944023818126879632473,-0.8562081318530913032205376111960504204035} +#define T_19683_3248 {0.5089576099793156060968613019213080406189,-0.8607915840923068273582430265378206968307} +#define T_19683_3262 {0.5051056071942108260941495245788246393204,-0.8630575447679995626515392359578981995583} +#define T_19683_3290 {0.4973714140894973101403309101442573592067,-0.8675377089479244796521584248694125562906} +#define T_19683_3304 {0.4934893782410855189723974945081863552332,-0.8697518229720629934220710310910362750292} +#define T_19683_3332 {0.4856958156204709275982622784795239567757,-0.8741278937825777139991600961366202682257} +#define T_19683_3346 {0.4817844445052228419257289715460501611233,-0.8762897631678655541165312570228707045317} +#define T_19683_3374 
{0.4739329132626463048083564899570774286985,-0.8805609540096477338977365434402599930763} +#define T_19683_3388 {0.4699929099500509965814387669524876400828,-0.8826701901597693300516311865067109465599} +#define T_19683_3416 {0.4620848213993192787896191475738305598497,-0.8868357332856853236791039307718165218830} +#define T_19683_3430 {0.4581168941055070997769860241533024236560,-0.8888919570651562240826137895055580884218} +#define T_19683_3458 {0.4501536697266109166015723985765362158418,-0.8929511037182636945530589400732424110174} +#define T_19683_3472 {0.4461585316870516071396934876247541978955,-0.8949539455205804960868931630102451890707} +#define T_19683_3500 {0.4381416028706541232473625768761849030852,-0.8989059660687173103355007697246037423611} +#define T_19683_3514 {0.4341199722119527537600447431032080203295,-0.9008550658827941814976725254382472485304} +#define T_19683_3542 {0.4260507800020971735932562296511605381966,-0.9046992499497302775424145693250466138124} +#define T_19683_3556 {0.4220033796129105119199209639191394671798,-0.9065942574246109719737773957604076713324} +#define T_19683_3584 {0.4138833744479920584424803564616013318300,-0.9103299140177385506689233807264827191830} +#define T_19683_3598 {0.4098109318490896768594211607705801725388,-0.9121704885255720318681937897054012864828} +#define T_19683_3626 {0.4016415733011391431261927209561690688133,-0.9157969461601112026727378179202787578106} +#define T_19683_3640 {0.3975448205146310809432463884149910882115,-0.9175827568573799997153628282831050455570} +#define T_19683_3668 {0.3893275770269565838610503760719439014792,-0.9210993636770786752165918187529314309359} +#define T_19683_3682 {0.3852072504447123235671313068451127037406,-0.9228300895640674239572831538680475205183} +#define T_19683_3710 {0.3769435990679463888142208816134370863438,-0.9262362134583722594882715384301263839006} +#define T_19683_3724 {0.3728004393192285714775380256469361484051,-0.9279115434368677695076144118502270430326} +#define T_19683_3752 
{0.3644918654458280671271097617136547341943,-0.9312065721545463858888069808017462491989} +#define T_19683_3766 {0.3603266172641639841778271602379390969872,-0.9328262050837576868644873684388585388660} +#define T_19683_3794 {0.3519746143614106981267752871644916012883,-0.9360095463429506379426925377629231661558} +#define T_19683_3808 {0.3477880264507265395401702789968112483621,-0.9375731910936386803712139226263388991356} +#define T_19683_3836 {0.3393940957922760848219922991120256483555,-0.9406442726883225136091937201854307204485} +#define T_19683_3850 {0.3351869206923171473633260575297754257917,-0.9421516481951311972053986210084985941648} +#define T_19683_3878 {0.3267525710883450451582632467761868610978,-0.9451099180979724012630072138563264161348} +#define T_19683_3892 {0.3225255650394067141739640192099614068866,-0.9465607534099496067625523210153914988041} +#define T_19683_3920 {0.3140523125653993385952844619168899953365,-0.9494056798715313494341216937755234539509} +#define T_19683_3934 {0.3098062353723928241677754158445168286562,-0.9507997142008328683715490114991553127766} +#define T_19683_3962 {0.3012956030966327247710978554096072912216,-0.9535307858452375384672450309153646230698} +#define T_19683_3976 {0.2970312179925099216326600526372203603387,-0.9548677686140034648332175493123941123486} +#define T_19683_4004 {0.2884847357023042069279483712307410314679,-0.9574844945307321442129477873095311224461} +#define T_19683_4018 {0.2842028092108664916182192428095731884241,-0.9587641854161281784740822331514209508896} +#define T_19683_4046 {0.2756220131375670678863798457314260303974,-0.9612660952483421672454255713091697543859} +#define T_19683_4060 {0.2713233149356826801046338459855178371072,-0.9624882642257578391209449364396277815104} +#define T_19683_4088 {0.2627097474785486941328827015240676701069,-0.9648749082548251365665237244684249162674} +#define T_19683_4102 {0.2583950502578030161693334321171278133988,-0.9660393356392209529559522707131691277027} +#define T_19683_4130 
{0.2497502597067543239628406581687158904970,-0.9683102848655531502686244493816047906876} +#define T_19683_4144 {0.2454203390345585655829552251816494390368,-0.9694167613509487857470503513468429446220} +#define T_19683_4172 {0.2367458792918703536223290484485914930701,-0.9715716075711141597182063378568273037672} +#define T_19683_4186 {0.2324015134720532060885744840561528690159,-0.9726199342682110282609642126772087067366} +#define T_19683_4214 {0.2236989437730419194583220132699352689087,-0.9746582901483090699557010339049156755209} +#define T_19683_4228 {0.2193409137059492697297002905543195083737,-0.9756482786202409496212339945486746728420} +#define T_19683_4256 {0.2106117983386998071537732357683125883341,-0.9775697777655254494533210163353942334652} +#define T_19683_4270 {0.2062408873808279363704087927544605918229,-0.9785012500617301656191671099804807454348} +#define T_19683_4298 {0.1974867954050126550580301909576519392431,-0.9803055470824688644171374107827432453632} +#define T_19683_4312 {0.1931037892281996237731078736032941378653,-0.9811783357706748143201025413873139768839} +#define T_19683_4340 {0.1843262941930396969780758809065446257591,-0.9828651063442332969088965910486876964569} +#define T_19683_4354 {0.1799319806432408408447543024522019550204,-0.9836790545405551533519883378176018595695} +#define T_19683_4382 {0.1711326603046604832858434974696137942374,-0.9852479954696937713976012673811055719852} +#define T_19683_4396 {0.1667278292603329992171978801707155071199,-0.9860029568668327026870201734709553420544} +#define T_19683_4424 {0.1579082652973577416410932983126258477569,-0.9874537861342065347969310096232220530510} +#define T_19683_4438 {0.1534937085274798163059273292674333788455,-0.9881496250277490567270888277562335133553} +#define T_19683_4466 {0.1446554862579293998514629038254497572780,-0.9894820818466001366431328278849832713604} +#define T_19683_4480 {0.1402319972796798042136146023040055297315,-0.9901186731594107115483893721830099821091} +#define T_19683_4508 
{0.1313767053752070701477805414469912648201,-0.9913325180204450859378084714990109205246} +#define T_19683_4522 {0.1269450793113302000669762037432519719005,-0.9919097473251485830303408874897286295891} +#define T_19683_4550 {0.1180743095118572116852817543986020609736,-0.9930047620395876517562783192261122167110} +#define T_19683_4564 {0.1136353429477396786984044751989131327718,-0.9935225255791383380810088965517934411764} +#define T_19683_4592 {0.1047506897753422566710312935356341768056,-0.9944985133179385927704174719110596925020} +#define T_19683_4606 {0.1003051806158266473500972892907157074660,-0.9949567180242698816172719489259179681540} +#define T_19683_4634 {0.0914082410881187357176713703665882349014,-0.9958135033535026048312488455849234014750} +#define T_19683_4648 {0.0869569884140799914673536363807215820998,-0.9962120668642563403594181181688327342272} +#define T_19683_4676 {0.0780493617571495906792833352483285125345,-0.9969494957766424914069602891686372458935} +#define T_19683_4690 {0.0735931656818602092373637901800975669175,-0.9972883464499734396113694856467191129923} +#define T_19683_4718 {0.0646764530428082379254206557561701629311,-0.9979062863925657334007723875402007251978} +#define T_19683_4732 {0.0602161145681177414901164013372181216255,-0.9981853633200195030639179094578139483929} +#define T_19683_4760 {0.0512919187272516605147210100312804570422,-0.9986837032180293505234658368863165378571} +#define T_19683_4774 {0.0468282395996064276788573010890104342252,-0.9989029562354903024612440276541747152805} +#define T_19683_4802 {0.0378981646823402171242456404343101894483,-0.9992816065122535063025566159922163933516} +#define T_19683_4816 {0.0334319472486694566071285805719526251778,-0.9994409962089618737479668197920545935631} +#define T_19683_4844 {0.0244975984371818868157166804166990914382,-0.9996998888020397489029278403904754668474} +#define T_19683_4858 {0.0200296455006756593530958809878939064220,-0.9997993865276759706262055260594934225082} +#define T_19683_4886 
{0.0110926287453785594322441809822521463502,-0.9999384749010896689114247237739618867636} +#define T_19683_4900 {0.0066237434211837906139486165102425729856,-0.9999780627709241587197652734175790101290} +#define T_19683_4928 {-0.0023143348479477379893431976398687766050,-0.9999973219235197552379190710780676454306} +#define T_19683_4942 {-0.0067833492770873063740233455121142469579,-0.9999769928216274417209774583170656114817} +#define T_19683_4970 {-0.0157208824392709098360931818660901626572,-0.9998764192915695625529792778252158313990} +#define T_19683_4984 {-0.0201892226674049053702919565012052771635,-0.9997961768721091990741456356772687286139} +#define T_19683_5012 {-0.0291246041998412842632948382970425882377,-0.9995757887375036343513556857942603528500} +#define T_19683_5026 {-0.0335914670422070571476069744676351547241,-0.9994356474240612131509919890959281474352} +#define T_19683_5054 {-0.0425230908088526371457760433258954435587,-0.9990954842997049611952320447016973048449} +#define T_19683_5068 {-0.0469876733462478296532438548638310749084,-0.9988954692827004544497526694613043218851} +#define T_19683_5096 {-0.0559139338865179552628958958848670590669,-0.9984355923129614174271750925981905311346} +#define T_19683_5110 {-0.0603754336096253355070651025471306638792,-0.9981757395451212877546254276239778846502} +#define T_19683_5138 {-0.0692947264269760387911389898363268002868,-0.9975962313929471747542265802621841430664} +#define T_19683_5152 {-0.0737523413806146499460680843185400590301,-0.9972765875828416559656375284248497337103} +#define T_19683_5180 {-0.0826630632309510726685175541206263005733,-0.9965775524149015351937919149349909275770} +#define T_19683_5194 {-0.0871159921582278368168417159722594078630,-0.9961981750185490147586619968933518975973} +#define T_19683_5222 {-0.0960165413380876103799366205748810898513,-0.9953797384865091801842140739609021693468} +#define T_19683_5236 {-0.1004639838244232863395666299766162410378,-0.9949406956970481274993289844132959842682} 
+#define T_19683_5264 {-0.1093527604588829665877014463148952927440,-0.9940030049149862767521312889584805816412} +#define T_19683_5278 {-0.1137939170758868190347357085556723177433,-0.9935043756504176037935849308269098401070} +#define T_19683_5306 {-0.1226693234061395632039648262434639036655,-0.9924475991683792130970687139779329299927} +#define T_19683_5320 {-0.1271033958553067499952504704197053797543,-0.9918894730573810658569300358067266643047} +#define T_19683_5348 {-0.1359638365258597492157832675729878246784,-0.9907138008310822918645044410368427634239} +#define T_19683_5362 {-0.1403900277820655240201830338264699093997,-0.9900962781968988268843645528249908238649} +#define T_19683_5390 {-0.1492339101275052815065436107033747248352,-0.9888019215535818196016748515830840915442} +#define T_19683_5404 {-0.1536514245822704627375543395828572101891,-0.9881251133959904064951729196764063090086} +#define T_19683_5432 {-0.1624771589135446947516072668804554268718,-0.9867123049964374725817606304190121591091} +#define T_19683_5446 {-0.1668852025180462550491000683905440382659,-0.9859763329717964319698353392595890909433} +#define T_19683_5474 {-0.1756912024082107059985702335325186140835,-0.9844453267685093766914405932766385376453} +#define T_19683_5488 {-0.1800889828160122940747811526307486928999,-0.9836503231678898062639859745104331523180} +#define T_19683_5516 {-0.1888736653853911595657422139993286691606,-0.9820013943594414485005472670309245586395} +#define T_19683_5530 {-0.1932603920948673947854956622904865071177,-0.9811475020848486883195960217562969774008} +#define T_19683_5558 {-0.2020221782955761991029675073150428943336,-0.9793809470664162075692615871957968920469} +#define T_19683_5572 {-0.2063970627920054812243222386314300820231,-0.9784683196051024989259303765720687806606} +#define T_19683_5600 {-0.2151343776917850891816641478726523928344,-0.9765844559151903858662535640178248286247} +#define T_19683_5614 
{-0.2194966335890852493495373209952958859503,-0.9756132573120657180965054067200981080532} +#define T_19683_5642 {-0.2282079066543962198032602373132249340415,-0.9736124235754278766208358320000115782022} +#define T_19683_5656 {-0.2325567498364774776664631872336030937731,-0.9725828284035730186829482590837869793177} +#define T_19683_5684 {-0.2412404152148035774150258703230065293610,-0.9704653842703454547091723725316114723682} +#define T_19683_5698 {-0.2455750639765135467929013657339964993298,-0.9693775775996323895711270779429469257593} +#define T_19683_5726 {-0.2542295607778241595120505280647194013000,-0.9671439036806852573846526865963824093342} +#define T_19683_5740 {-0.2585492359654593674811451364803360775113,-0.9659980810445107923811747241416014730930} +#define T_19683_5768 {-0.2671730085427794776364862627815455198288,-0.9636485788430344534560845204396173357964} +#define T_19683_5782 {-0.2714769336941385557970818354078801348805,-0.9624449462031728907973615605442319065332} +#define T_19683_5810 {-0.2800684319231764862756506317964522168040,-0.9599800380425069779022351212915964424610} +#define T_19683_5824 {-0.2843558334071291660016811420064186677337,-0.9587188117520887287170694435189943760633} +#define T_19683_5852 {-0.2929135129649119151373781733127543702722,-0.9561389406998097584278184513095766305923} +#define T_19683_5866 {-0.2971836201204593463920389240229269489646,-0.9548203474644318955455446484847925603390} +#define T_19683_5894 {-0.3057059427629245651480971446289913728833,-0.9521259772527118636631371373368892818689} +#define T_19683_5908 {-0.3099579880377257845580629691539797931910,-0.9507502540896873854947557447303552180529} +#define T_19683_5936 {-0.3184434218762214330311621779401320964098,-0.9479418690319374452002421094221062958241} +#define T_19683_5950 {-0.3226766409645610012013605683023342862725,-0.9465092632276916884137563101830892264843} +#define T_19683_5978 {-0.3311236607412026411445538087718887254596,-0.9435873681315074534836639941204339265823} 
+#define T_19683_5992 {-0.3353372927213747467511950617335969582200,-0.9420981371971282047894646893837489187717} +#define T_19683_6020 {-0.3437443800832106766129925290442770346999,-0.9390632572735497785032521278480999171734} +#define T_19683_6034 {-0.3479376675542948382791053063556319102645,-0.9375176688984997452891434477351140230894} +#define T_19683_6062 {-0.3563033113262303319679347168857930228114,-0.9343703496676054598424343566875904798508} +#define T_19683_6076 {-0.3604755005442343285260164975625229999423,-0.9327686816716048712194719882973004132509} +#define T_19683_6104 {-0.3687981970006656839977665640617487952113,-0.9295094888644538366762049008684698492289} +#define T_19683_6118 {-0.3729485380140110661884023102174978703260,-0.9278520291475422787641491595422849059105} +#define T_19683_6146 {-0.3812267911491202809770584281068295240402,-0.9244815486044840602275485252903308719397} +#define T_19683_6160 {-0.3853545379334465392773267922166269272566,-0.9227685950952708715533390204655006527901} +#define T_19683_6188 {-0.3935868597301087068451863615337060764432,-0.9192874326606405022133117199700791388750} +#define T_19683_6202 {-0.3976912703223708378530432128172833472490,-0.9175192932627515007837359917175490409136} +#define T_19683_6230 {-0.4058761810196253594362758576608030125499,-0.9139280746759693707659266692644450813532} +#define T_19683_6244 {-0.4099565176514622377723640056501608341932,-0.9121050672127012370893339721078518778086} +#define T_19683_6272 {-0.4180925460104999991095553468767320737243,-0.9084044379957959547411405765160452574492} +#define T_19683_6286 {-0.4221480752408481307291765460831811651587,-0.9065268901529878187162125868781004101038} +#define T_19683_6314 {-0.4302337588094670151050991080410312861204,-0.9027175154945634716341373859904706478119} +#define T_19683_6328 {-0.4342637516563965793814361404656665399671,-0.9007857647616949181568202220660168677568} +#define T_19683_6356 
{-0.4422976370318774108625348162604495882988,-0.8968683293973633841034143188153393566608} +#define T_19683_6370 {-0.4463013691036270547130015984294004738331,-0.8948827230068909788229802870773710310459} +#define T_19683_6398 {-0.4542820121939828981183495670848060399294,-0.8908579310961910469046642901957966387272} +#define T_19683_6412 {-0.4582587638191691348232836844545090571046,-0.8888188259611330410692175973963458091021} +#define T_19683_6440 {-0.4661847301027209899970671358460094779730,-0.8846874009609571043455389371956698596478} +#define T_19683_6454 {-0.4701337864596992766053062950959429144859,-0.8825951636107383091456313195521943271160} +#define T_19683_6482 {-0.4780036512429315931349549373408081009984,-0.8783578481452908315318950371874962002039} +#define T_19683_6496 {-0.4819243024882857162616289770085131749511,-0.8762128546598590972394049458671361207962} +#define T_19683_6524 {-0.4897366511619352102968605322530493140221,-0.8718704103871703914308000094024464488029} +#define T_19683_6538 {-0.4936281925580714435852769383927807211876,-0.8696730463293959045856240663852076977491} +#define T_19683_6566 {-0.5013816208514034755694410705473273992538,-0.8652262538044135364856401793076656758785} +#define T_19683_6580 {-0.5052433528932275264011764193128328770399,-0.8629769141507839247395850179600529372692} +#define T_19683_6608 {-0.5129364671264536879036199934489559382200,-0.8584265726850680566784035363525617867708} +#define T_19683_6622 {-0.5167676956671067856063928047660738229752,-0.8561256617546915137495489034336060285568} +#define T_19683_6650 {-0.5243991130018992308237102406565099954605,-0.8514725892727382783320422277029138058424} +#define T_19683_6664 {-0.5281991493775310964053915085969492793083,-0.8491205206546669215228462235245388001204} +#define T_19683_6692 {-0.5357674980655876550983407469175290316343,-0.8443655535468866935033815934730228036642} +#define T_19683_6706 {-0.5395356592191438149797022560960613191128,-0.8419627500257739205480334021558519452810} 
+#define T_19683_6734 {-0.5470395788487605326366747249267064034939,-0.8371067429981500218616474739974364638329} +#define T_19683_6748 {-0.5507751874527619939669875748222693800926,-0.8346536364782549677343581606692168861628} +#define T_19683_6776 {-0.5582133291933665253381491311301942914724,-0.8296974623987096730814982947777025401592} +#define T_19683_6790 {-0.5619157137716596084331399652000982314348,-0.8271944938262623114866300966241396963596} +#define T_19683_6818 {-0.5692867406162642751610292179975658655167,-0.8221390435677594643593124601466115564108} +#define T_19683_6832 {-0.5729552356647183986027016544539947062731,-0.8195866628517005647580617733183316886425} +#define T_19683_6860 {-0.5802578226702471697606711131811607629061,-0.8144328451321103390370126362540759146214} +#define T_19683_6874 {-0.5838917687763792718769195744243916124105,-0.8118315110632201569984545130864717066288} +#define T_19683_6902 {-0.5911246033018264789404838666087016463280,-0.8065802522819773834328316297614946961403} +#define T_19683_6916 {-0.5947233472633305373378220792801585048437,-0.8039304324504079612978557634050957858562} +#define T_19683_6944 {-0.6018851292057083579578602439141832292080,-0.7985826765219915523985605432244483381510} +#define T_19683_6958 {-0.6054480241478688018474940690794028341770,-0.7958848472332172852006237917521502822638} +#define T_19683_6986 {-0.6125374661759006578165553946746513247490,-0.7904415554174815117249863760662265121937} +#define T_19683_7000 {-0.6160638716678680237848197975836228579283,-0.7876962016066832994454216532176360487938} +#define T_19683_7028 {-0.6230796994533868149446220741083379834890,-0.7821583523360708944949237775290384888649} +#define T_19683_7042 {-0.6265689816232949960195242056215647608042,-0.7793659674809690907082426747365389019251} +#define T_19683_7070 {-0.6335099340703038706124061718583106994629,-0.7737345561846383779069924457871820777655} +#define T_19683_7084 
{-0.6369614657192091966564362337521743029356,-0.7708956422167897448716189501283224672079} +#define T_19683_7112 {-0.6438262951905634468019457017362583428621,-0.7651716811416856556249399545777123421431} +#define T_19683_7126 {-0.6472394559051833917706630927568767219782,-0.7622867483562613122316520275489892810583} +#define T_19683_7154 {-0.6540269284468536170606967061758041381836,-0.7564712663851643759116427645494695752859} +#define T_19683_7168 {-0.6574011047110863703579752836958505213261,-0.7535408333492241705897640713374130427837} +#define T_19683_7196 {-0.6641100002739614982516513919108547270298,-0.7476348758158081198033073633268941193819} +#define T_19683_7210 {-0.6674445855791655279887208962463773787022,-0.7446594692750881927523209924402181059122} +#define T_19683_7238 {-0.6740736982383573883126359760353807359934,-0.7386640977760214887837264541303738951683} +#define T_19683_7252 {-0.6773680931923713455233837521518580615520,-0.7356442525602514548310750797099899500608} +#define T_19683_7280 {-0.6839162313639794987807363213505595922470,-0.7295605447643750407493712373252492398024} +#define T_19683_7294 {-0.6871698437988630336903383977187331765890,-0.7264968036911423343582328016054816544056} +#define T_19683_7322 {-0.6936358304541621055960831654374487698078,-0.7203258531457584767920820922881830483675} +#define T_19683_7336 {-0.6968480755326383890846386748307850211859,-0.7172187669229370676760026981355622410774} +#define T_19683_7364 {-0.7032307484096492755654139727994333952665,-0.7109616828572440372369101169169880449772} +#define T_19683_7378 {-0.7064010487302293528344421247311402112246,-0.7078118099840042809489659703103825449944} +#define T_19683_7406 {-0.7126992605426358817766185893560759723186,-0.7014697171097125094618718321726191788912} +#define T_19683_7420 {-0.7158270462434065395385118790727574378252,-0.6982776237761310067497788622858934104443} +#define T_19683_7448 {-0.7220396648867802857907349789456930011511,-0.6918516620852955822940089092298876494169} 
+#define T_19683_7462 {-0.7251243737478374473681697054416872560978,-0.6886179220705823667003642185591161251068} +#define T_19683_7490 {-0.7312502825031338415939785591035615652800,-0.6821092466306896140437743270013015717268} +#define T_19683_7504 {-0.7342913600476407287587221617286559194326,-0.6788344412000514305205456366820726543665} +#define T_19683_7532 {-0.7403294577819293786902221654599998146296,-0.6722442219463954371505565177358221262693} +#define T_19683_7546 {-0.7433263573757845632528074020228814333677,-0.6689289397465529862785160730709321796894} +#define T_19683_7574 {-0.7492755587401778161193988125887699425220,-0.6622583612719391554790604459412861615419} +#define T_19683_7588 {-0.7522277416902724000991042885289061814547,-0.6589031982253180652620017099252436310053} +#define T_19683_7616 {-0.7580869773150175072728984559944365173578,-0.6521534595671329981314556789584457874298} +#define T_19683_7630 {-0.7609939129660654444364809023682028055191,-0.6487590187647462869335868163034319877625} +#define T_19683_7658 {-0.7667621296527632468453816727560479193926,-0.6419313331894300755919857692788355052471} +#define T_19683_7672 {-0.7696232954826865979569561204698402434587,-0.6384982247824728673890604113694280385971} +#define T_19683_7700 {-0.7752994563936046468199947412358596920967,-0.6315938195674346555819056447944603860378} +#define T_19683_7714 {-0.7781143381074570042343907516624312847853,-0.6281226606576091331390898631070740520954} +#define T_19683_7742 {-0.7836974229519008128264090373704675585032,-0.6211427768706236918205831898376345634460} +#define T_19683_7756 {-0.7864655145743120190360286869690753519535,-0.6176341913992152710122240932832937687635} +#define T_19683_7784 {-0.7919545197920229151478110907191876322031,-0.6105800836753410010260267881676554679871} +#define T_19683_7798 {-0.7946753237581479778484094822488259524107,-0.6070347023110644890664389095036312937737} +#define T_19683_7826 
{-0.8000692626996928069615933054592460393906,-0.5999076386271223748636316486226860433817} +#define T_19683_7840 {-0.8027422899446494675146368535934016108513,-0.5963260986527592066863689979072660207748} +#define T_19683_7868 {-0.8080401930487710604467110897530801594257,-0.5891273600994141323994313097500707954168} +#define T_19683_7882 {-0.8106649630955491403483392787165939807892,-0.5855103052972611132886981977208051830530} +#define T_19683_7910 {-0.8158658780634454599223204240843188017607,-0.5782411858487451761234865443839225918055} +#define T_19683_7924 {-0.8184419191092726642011712101520970463753,-0.5745892663848940484783156534831505268812} +#define T_19683_7952 {-0.8235449110757721014053345243155490607023,-0.5672510726664151681220005229988601058722} +#define T_19683_7966 {-0.8260717600769207358268886309815570712090,-0.5635649449738853178359931916929781436920} +#define T_19683_7994 {-0.8310759117785243565990072056592907756567,-0.5561589960267606658206318570591975003481} +#define T_19683_8008 {-0.8335531145335434155541065592842642217875,-0.5524393226875051743363087553007062524557} +#define T_19683_8036 {-0.8384575264733036270570210035657510161400,-0.5449669497320634992121313189272768795490} +#define T_19683_8050 {-0.8408846377046614861683337949216365814209,-0.5412143993578701905988737053121440112591} +#define T_19683_8078 {-0.8456884283138662583567679575935471802950,-0.5336769455541645612584034097380936145782} +#define T_19683_8092 {-0.8480650117479896499261826647853013128042,-0.5298921926664726944622429982700850814581} +#define T_19683_8120 {-0.8527673175446246478514922273461706936359,-0.5222910128728478484916308843821752816439} +#define T_19683_8134 {-0.8550929459903185980707007729506585747004,-0.5184747377815027702396832864906173199415} +#define T_19683_8162 {-0.8596929217342786921918218467908445745707,-0.5108111983110583675937732550664804875851} +#define T_19683_8176 {-0.8619671771595143194844013123656623065472,-0.5069640869920258863245976499456446617842} 
+#define T_19683_8204 {-0.8664639960045348310302415484329685568810,-0.4992395653670223532039074143540346994996} +#define T_19683_8218 {-0.8686864696115907946705192443914711475372,-0.4953623093390843723504701756610302254558} +#define T_19683_8246 {-0.8730793232538739401249472393828909844160,-0.4875781940433324135319992365111829712987} +#define T_19683_8260 {-0.8752496155528175503235388532630167901516,-0.4836714902437863616846414061001269146800} +#define T_19683_8288 {-0.8795377143763246641228192856942769140005,-0.4758291804730672147627501544775441288948} +#define T_19683_8302 {-0.8816554352568205521478716946148779243231,-0.4718937311324514216615000350429909303784} +#define T_19683_8330 {-0.8858380084752058847286093623552005738020,-0.4639946365430111518968203654367243871093} +#define T_19683_8344 {-0.8879027772766382442526378326874691992998,-0.4600311490588788743139048165176063776016} +#define T_19683_8372 {-0.8919790730717981341868494382651988416910,-0.4520766895140422847454431121150264516473} +#define T_19683_8386 {-0.8939905186516946544728057233442086726427,-0.4480858763238069752965486713947029784322} +#define T_19683_8414 {-0.8979598043089063175159481033915653824806,-0.4400774816387571508613518744823522865772} +#define T_19683_8428 {-0.8999175651096508188331313249364029616117,-0.4360600600916315072730355950625380501151} +#define T_19683_8456 {-0.9037791271492786604468960831582080572844,-0.4279991697764004565662787626934004947543} +#define T_19683_8470 {-0.9056828512631014405087626073509454727173,-0.4239558620044530101722557446919381618500} +#define T_19683_8498 {-0.9094359955688444685506510722916573286057,-0.4158439250051692015475168773264158517122} +#define T_19683_8512 {-0.9112853408010777034320426537306047976017,-0.4117754577935208715189219219610095024109} +#define T_19683_8540 {-0.9149293927447372798411606709123589098454,-0.4036139322319612920964004842971917241812} +#define T_19683_8554 
{-0.9167240266753243771447046128741931170225,-0.3995210368881457752010533113207202404737} +#define T_19683_8582 {-0.9202583312380691049625625055341515690088,-0.3913113897996380874388933079899288713932} +#define T_19683_8596 {-0.9219979312813173510932074350421316921711,-0.3871948020221490649461770772177260369062} +#define T_19683_8624 {-0.9254218531714221152029153927287552505732,-0.3789385090918720999653146463970188051462} +#define T_19683_8638 {-0.9271061066339876255426588613772764801979,-0.3747989688379214645586046117387013509870} +#define T_19683_8666 {-0.9304190304010278023127966662286780774593,-0.3664975141356503485212670057080686092377} +#define T_19683_8680 {-0.9320476345381220051322657127457205206156,-0.3623357654881607659014264299912611022592} +#define T_19683_8708 {-0.9352489646836008585495392253505997359753,-0.3539906412015051961894585019763326272368} +#define T_19683_8722 {-0.9368216267534088537161096610361710190773,-0.3498074322353610932090361984592163935304} +#define T_19683_8750 {-0.9399107878377980229700483505439478904009,-0.3414201384015446150144157400063704699278} +#define T_19683_8764 {-0.9414272251540993785567934537539258599281,-0.3372162210491255751598771439603297039866} +#define T_19683_8792 {-0.9444036619002744714634900446981191635132,-0.3287882652853535425663267233176156878471} +#define T_19683_8806 {-0.9458636018832556890956198003550525754690,-0.3245643952013743671614065533503890037537} +#define T_19683_8834 {-0.9487267792763076634798835584660992026329,-0.3160972924338394940413365929998690262437} +#define T_19683_8848 {-0.9501299595015585408575020665011834353209,-0.3118542288595212985669036243052687495947} +#define T_19683_8876 {-0.9528793628849621111243095583631657063961,-0.3033495010510955935956189932767301797867} +#define T_19683_8890 {-0.9542255311306467868703862222901079803705,-0.2990880066776918644322336149343755096197} +#define T_19683_8918 {-0.9568606662987705346878897216811310499907,-0.2905471825543535779878823177568847313523} 
+#define T_19683_8932 {-0.9581495805909640006703398285026196390390,-0.2862680233860565026660083276510704308748} +#define T_19683_8960 {-0.9606699738779037600622245918202679604292,-0.2776926381621013240064144156349357217550} +#define T_19683_8974 {-0.9619014025340869578073466072964947670698,-0.2733965833783523757816169563739094883204} +#define T_19683_9002 {-0.9643066008988071535767971909081097692251,-0.2647881784804385074671984057204099372029} +#define T_19683_9016 {-0.9654803225695121060567771564819850027561,-0.2604760002976683197495333388360450044274} +#define T_19683_9044 {-0.9677698936772801685535227989021223038435,-0.2518361230877449452592031775566283613443} +#define T_19683_9058 {-0.9688856973858770427199260666384361684322,-0.2475085966205673171369738838620833121240} +#define T_19683_9086 {-0.9710592296859755778726253083732444792986,-0.2388388001177358666016203869730816222727} +#define T_19683_9100 {-0.9721169148665962378430549506447277963161,-0.2344967032396207962108292122138664126396} +#define T_19683_9128 {-0.9741740176662991856915141397621482610703,-0.2257985458409797197010249192317132838070} +#define T_19683_9142 {-0.9751733941998890209390538075240328907967,-0.2214426590444305564808757935679750517011} +#define T_19683_9170 {-0.9771136977346887020345889141026418656111,-0.2127177042449531485512892459155409596860} +#define T_19683_9184 {-0.9780545859831799582195799303008243441582,-0.2083488105012129276705223901444696821272} +#define T_19683_9212 {-0.9798777414832527954402507930353749543428,-0.1995986266127089681088335737513261847198} +#define T_19683_9226 {-0.9807599723218551890369099055533297359943,-0.1952175112310212956590049770966288633645} +#define T_19683_9254 {-0.9824656520747523380521215585758909583092,-0.1864436711002326885200375272688688710332} +#define T_19683_9268 {-0.9832890669223527391196171265619341284037,-0.1820511215866821852493728783883852884173} +#define T_19683_9296 
{-0.9848769643319070787867985927732661366463,-0.1732552023125636386780001885199453681707} +#define T_19683_9310 {-0.9856414151795749312157113308785483241081,-0.1688520082285217549511457946209702640772} +#define T_19683_9338 {-0.9871112448210116463442886924894992262125,-0.1600355908787569336748646264823037199676} +#define T_19683_9352 {-0.9878165942586032421957042970461770892143,-0.1556225436989580879210137709378614090383} +#define T_19683_9380 {-0.9891680919298445617826587294985074549913,-0.1467872130257625862270742800319567322731} +#define T_19683_9394 {-0.9898142131707027280285160486528184264898,-0.1423651059960364395617915533875930123031} +#define T_19683_9422 {-0.9910471359398588253597495167923625558615,-0.1335124501512983119511090990272350609303} +#define T_19683_9436 {-0.9916339128436028049762285263568628579378,-0.1290820781459838528792971601433237083256} +#define T_19683_9464 {-0.9927480390926386455419105914188548922539,-0.1202136883957930502120348137395922094584} +#define T_19683_9478 {-0.9932753661860402871752739883959293365479,-0.1157758477748597192302781877515371888876} +#define T_19683_9491 {-0.9937472639946681729483657363743986934423,-0.1116529233970666445951991363472188822925} +#define T_19683_9505 {-0.9942363226269432807669090834679082036018,-0.1072107026805287355308848873391980305314} +#define T_19683_9506 {-0.9942704956506112079495096622849814593792,-0.1068933182134778475669278918758209329098} +#define T_19683_9520 {-0.9947382781465540224630217380763497203588,-0.1024488066793797641507168805219407659024} +#define T_19683_9533 {-0.9951548583558443850094477056700270622969,-0.0983199262141669777514962902387196663767} +#define T_19683_9547 {-0.9955843171070126729205185256432741880417,-0.0938715480354049142119876592005311977118} +#define T_19683_9548 {-0.9956142319520037142766000215488020330667,-0.0935537339427014308368413253447215538472} +#define T_19683_9562 {-0.9960223857665203572864243142248596996069,-0.0891033503969902440644546004477888345718} 
+#define T_19683_9575 {-0.9963835734961241641371998412068933248520,-0.0849692560111819233936714113042398821563} +#define T_19683_9589 {-0.9967533551709166772525350097566843032837,-0.0805155199667772769611673311374033801258} +#define T_19683_9590 {-0.9967790064600333677091725803620647639036,-0.0801973333755474915296446170032140798867} +#define T_19683_9604 {-0.9971274582274204378862236808345187455416,-0.0757418777752697641725276866964122746140} +#define T_19683_9617 {-0.9974331885537941477792855948791839182377,-0.0716033125728930130993177272102911956608} +#define T_19683_9631 {-0.9977432266839045826856136045535095036030,-0.0671450192225044456550975269237824250013} +#define T_19683_9632 {-0.9977646098063244206599620156339369714260,-0.0668265173268313278365226892674400005490} +#define T_19683_9646 {-0.9980532968923292447271933269803412258625,-0.0623667905407360861902432702663645613939} +#define T_19683_9659 {-0.9983035148604051389753522016690112650394,-0.0582244984294479281383694058149558259174} +#define T_19683_9673 {-0.9985537537164377308585017090081237256527,-0.0537624491519104255599081909622327657416} +#define T_19683_9674 {-0.9985708648285412936118632387660909444094,-0.0534436892025529838856634512467280728742} +#define T_19683_9688 {-0.9987997353416211421262005387688986957073,-0.0489804928671355097047346305316750658676} +#define T_19683_9701 {-0.9989943959746855339787430239084642380476,-0.0448352184245053020372751007016631774604} +#define T_19683_9715 {-0.9991847905761719328410208618151955306530,-0.0403702152737823033445963005760859232396} +#define T_19683_9716 {-0.9991976266022345454231867734051775187254,-0.0400512545678846990093724400594510370865} +#define T_19683_9730 {-0.9993666394028835053831016921321861445904,-0.0355853909432922305144941788057622034103} +#define T_19683_9743 {-0.9995057077106612730688084411667659878731,-0.0314378792829638278871762224753183545545} +#define T_19683_9757 
{-0.9996362238341457429058323214121628552675,-0.0269707248439778682347345295511331642047} +#define T_19683_9758 {-0.9996447824668901471767412658664397895336,-0.0266516207147702854918236425874056294560} +#define T_19683_9772 {-0.9997539071750344286115819159022066742182,-0.0221838925405956483016822744502860587090} +#define T_19683_9785 {-0.9998373581599782067286241726833395659924,-0.0180348891783530841659288768141777836718} +#define T_19683_9799 {-0.9999079723451700374425854533910751342773,-0.0135663864227210488599961735189936007373} +#define T_19683_9800 {-0.9999122520461806162828111155249644070864,-0.0132471962292140411215912720876985986251} +#define T_19683_9814 {-0.9999614690466391841994209244148805737495,-0.0087784065802031050612663776178123953287} +#define T_19683_9827 {-0.9999892877084231024298333068145439028740,-0.0046286572999640288733624871042593440507} +#define T_19683_9841 {-0.9999999872624130148324184119701385498047,-0.0001596094416629082683289214683597379008} +#define T_19683_9842 {-0.9999999872624130148324184119701385498047,0.0001596094416629082683289214683597379008} +#define T_19683_9856 {-0.9999892877084231024298333068145439028740,0.0046286572999640288733624871042593440507} +#define T_19683_9884 {-0.9999079723451700374425854533910751342773,0.0135663864227210488599961735189936007373} +#define T_19683_9898 {-0.9998373581599782067286241726833395659924,0.0180348891783530841659288768141777836718} +#define T_19683_9926 {-0.9996362238341457429058323214121628552675,0.0269707248439778682347345295511331642047} +#define T_19683_9940 {-0.9995057077106612730688084411667659878731,0.0314378792829638278871762224753183545545} +#define T_19683_9968 {-0.9991847905761719328410208618151955306530,0.0403702152737823033445963005760859232396} +#define T_19683_9982 {-0.9989943959746855339787430239084642380476,0.0448352184245053020372751007016631774604} +#define T_19683_10010 {-0.9985537537164377308585017090081237256527,0.0537624491519104255599081909622327657416} +#define 
T_19683_10024 {-0.9983035148604051389753522016690112650394,0.0582244984294479281383694058149558259174} +#define T_19683_10052 {-0.9977432266839045826856136045535095036030,0.0671450192225044456550975269237824250013} +#define T_19683_10066 {-0.9974331885537941477792855948791839182377,0.0716033125728930130993177272102911956608} +#define T_19683_10094 {-0.9967533551709166772525350097566843032837,0.0805155199667772769611673311374033801258} +#define T_19683_10108 {-0.9963835734961241641371998412068933248520,0.0849692560111819233936714113042398821563} +#define T_19683_10136 {-0.9955843171070126729205185256432741880417,0.0938715480354049142119876592005311977118} +#define T_19683_10150 {-0.9951548583558443850094477056700270622969,0.0983199262141669777514962902387196663767} +#define T_19683_10178 {-0.9942363226269432807669090834679082036018,0.1072107026805287355308848873391980305314} +#define T_19683_10192 {-0.9937472639946681729483657363743986934423,0.1116529233970666445951991363472188822925} +// Pre-computed twiddles for N=32768 +#define T_32768_1 {0.9999999816164293342524160834727808833122,-0.0001917475973107033186335068641881207441} +#define T_32768_3 {0.9999998345478676720077260142716113477945,-0.0005752427637320660933420435334539888572} +#define T_32768_5 {0.9999995404107661078896285289374645799398,-0.0009587378455533015087636217188560294744} +#define T_32768_7 {0.9999990992051678295737815460597630590200,-0.0013422327863743383196543090818408927589} +#define T_32768_9 {0.9999985109311377851071256372961215674877,-0.0017257275297951264227125367511916920193} +#define T_32768_11 {0.9999977755887623498409766398253850638866,-0.0021092220194156444459943067215590417618} +#define T_32768_13 {0.9999968931781498815425379689258988946676,-0.0024927161988359080972699199918451995472} +#define T_32768_15 {0.9999958636994299432387833803659304976463,-0.0028762100116559792713222254434413116542} +#define T_32768_17 
{0.9999946871527540803725742080132476985455,-0.0032597034014759732056809582445566775277} +#define T_32768_19 {0.9999933635382951546688445887411944568157,-0.0036431963118960680216018577226577690453} +#define T_32768_21 {0.9999918928562480102684162375226151198149,-0.0040266886865165116629605712716966081643} +#define T_32768_23 {0.9999902751068289186164861348515842109919,-0.0044101804689376314372317722245497861877} +#define T_32768_25 {0.9999885102902756894849289892590604722500,-0.0047936716027598413880639327544486150146} +#define T_32768_27 {0.9999865984068480040392046248598489910364,-0.0051771620315836514025775727532163728029} +#define T_32768_29 {0.9999845394568269707491481312899850308895,-0.0055606516990096737165782947442949080141} +#define T_32768_31 {0.9999823334405153474335747887380421161652,-0.0059441405486386341902593777319907530909} +#define T_32768_33 {0.9999799803582376522825825304607860743999,-0.0063276285240713783797339431202999548987} +#define T_32768_35 {0.9999774802103399418129470177518669515848,-0.0067111155689088793432905966085399995791} +#define T_32768_37 {0.9999748329971898108681216399418190121651,-0.0070946016267522497844577600289994734339} +#define T_32768_39 {0.9999720387191767256851449019450228661299,-0.0074780866412027446540888853121487045428} +#define T_32768_41 {0.9999690973767115798054305741970892995596,-0.0078615705558617732934267863242894236464} +#define T_32768_43 {0.9999660089702269161193726176861673593521,-0.0082450533143309072403592807631866890006} +#define T_32768_45 {0.9999627735001769268663451839529443532228,-0.0086285348602118863009513560768937168177} +#define T_32768_47 {0.9999593909670374536347026150906458497047,-0.0090120151371066332945947152666121837683} +#define T_32768_49 {0.9999558613713060983840819062606897205114,-0.0093954940886172514519225629214815853629} +#define T_32768_51 {0.9999521847135017793561928556300699710846,-0.0097789716583460434967678409634572744835} +#define T_32768_53 
{0.9999483609941653972086328394652809947729,-0.0101624477898955151156101806009246502072} +#define T_32768_55 {0.9999443902138590578587695745227392762899,-0.0105459224268683784270228542823133466300} +#define T_32768_57 {0.9999402723731669606621608181740157306194,-0.0109293955128675710636310114409752713982} +#define T_32768_59 {0.9999360074726946212564371307962574064732,-0.0113128669914962579068351544719917001203} +#define T_32768_61 {0.9999315955130692046282092633191496133804,-0.0116963368063578380257050426394016540144} +#define T_32768_63 {0.9999270364949396361353706197405699640512,-0.0120798049010559568200440239138515607920} +#define T_32768_65 {0.9999223304189764904847947946109343320131,-0.0124632712191945112245594629030165378936} +#define T_32768_67 {0.9999174772858717696877306480018887668848,-0.0128467357043776618519270726892500533722} +#define T_32768_69 {0.9999124770963392361267096930532716214657,-0.0132301983002098364622378667831981147174} +#define T_32768_71 {0.9999073298511143015332436334574595093727,-0.0136136589502957403713390149846418353263} +#define T_32768_73 {0.9999020355509539159655219009437132626772,-0.0139971175982403685938981752201470953878} +#define T_32768_75 {0.9998965941966366788307141177938319742680,-0.0143805741876490075781269695198716362938} +#define T_32768_77 {0.9998910057889629499072725593578070402145,-0.0147640286621272456141218398784076271113} +#define T_32768_79 {0.9998852703287545162780247665068600326777,-0.0151474809652809867116518560692384198774} +#define T_32768_81 {0.9998793878168549253970809331804048269987,-0.0155309310407164488654352396679314551875} +#define T_32768_83 {0.9998733582541292630452289813547395169735,-0.0159143788320401796676506478434021119028} +#define T_32768_85 {0.9998671816414643753745394860743544995785,-0.0162978242828590649815545532419491792098} +#define T_32768_87 {0.9998608579797685358414582879049703478813,-0.0166812673367803324109281959408690454438} +#define T_32768_89 
{0.9998543872699718892960163429961539804935,-0.0170647079374115599736949633324911701493} +#define T_32768_91 {0.9998477695130258968703174105030484497547,-0.0174481460283606934491551498922490281984} +#define T_32768_93 {0.9998410047099040021123528276802971959114,-0.0178315815532360394390920532714517321438} +#define T_32768_95 {0.9998340928616009648521867347881197929382,-0.0182150144556462896539006379725833539851} +#define T_32768_97 {0.9998270339691334163134683876705821603537,-0.0185984446792005105042466794884603586979} +#define T_32768_99 {0.9998198280335394150242223076929803937674,-0.0189818721675081777955362838383734924719} +#define T_32768_101 {0.9998124750558787798837556692888028919697,-0.0193652968641791559112341758464026497677} +#define T_32768_103 {0.9998049750372328681180533749284222722054,-0.0197487187128237290378862667239445727319} +#define T_32768_105 {0.9997973279787046863020805176347494125366,-0.0201321376570525942262257501624844735488} +#define T_32768_107 {0.9997895338814187793374799184675794094801,-0.0205155536404768752689609101480527897365} +#define T_32768_109 {0.9997815927465216745417819765862077474594,-0.0208989666067081400480098807292961282656} +#define T_32768_111 {0.9997735045751809934699849691241979598999,-0.0212823764993583866567128382030205102637} +#define T_32768_113 {0.9997652693685864511152772138302680104971,-0.0216657832620400815637484726039474480785} +#define T_32768_115 {0.9997568871279490787529198314587119966745,-0.0220491868383661353270053240294146235101} +#define T_32768_117 {0.9997483578545017790517590583476703613997,-0.0224325871719499338186043502219035872258} +#define T_32768_119 {0.9997396815494986599404114713252056390047,-0.0228159842064053486332397824298823252320} +#define T_32768_121 {0.9997308582142160338079861503501888364553,-0.0231993778853467197409443656397343147546} +#define T_32768_123 {0.9997218878499513072810600533557590097189,-0.0235827681523888936510058300655146013014} +#define T_32768_125 
{0.9997127704580238694020977163745556026697,-0.0239661549511472130036260352881072321907} +#define T_32768_127 {0.9997035060397746475402414034761022776365,-0.0243495382252375339171557300232962006703} +#define T_32768_129 {0.9996940945965659963690086442511528730392,-0.0247329179182762225186476001681512570940} +#define T_32768_131 {0.9996845361297821419555020838743075728416,-0.0251162939738801861688788363835556083359} +#define T_32768_133 {0.9996748306408287376711996330413967370987,-0.0254996663356668526456694223725207848474} +#define T_32768_135 {0.9996649781311333082811643180320970714092,-0.0258830349472542013689047024627143400721} +#define T_32768_137 {0.9996549786021446948325319681316614151001,-0.0262663997522607599310884296528456616215} +#define T_32768_139 {0.9996448320553336097660235282091889530420,-0.0266497606943056179751305734271227265708} +#define T_32768_141 {0.9996345384921923038490376711706630885601,-0.0270331177170084341332412236624804791063} +#define T_32768_143 {0.9996240979142345661756507979589514434338,-0.0274164707639894360269305906285808305256} +#define T_32768_145 {0.9996135103229959462112219625851139426231,-0.0277998197788694480225846206167261698283} +#define T_32768_147 {0.9996027757200335317477879470970947295427,-0.0281831647052698738842302361717884195969} +#define T_32768_149 {0.9995918941069259489040632615797221660614,-0.0285665054868127314680048556283509242348} +#define T_32768_151 {0.9995808654852736951923475317016709595919,-0.0289498420671206388443685852962516946718} +#define T_32768_153 {0.9995696898566985844070131861371919512749,-0.0293331743898168351147859311822685413063} +#define T_32768_155 {0.9995583672228443017360177691443823277950,-0.0297165023985251908200666548509616404772} +#define T_32768_157 {0.9995468975853759596716940905025694519281,-0.0300998260368702010014718695174451568164} +#define T_32768_159 {0.9995352809459805420999600755749270319939,-0.0304831452484770129562896556763007538393} +#define T_32768_161 
{0.9995235173063663491888064527302049100399,-0.0308664599769714158294942052407350274734} +#define T_32768_163 {0.9995116066682634414775066034053452312946,-0.0312497701659798648998744852178788278252} +#define T_32768_165 {0.9994995490334236398766165621054824441671,-0.0316330757591294781105872857551730703562} +#define T_32768_167 {0.9994873444036200815787651663413271307945,-0.0320163767000480603552858838156680576503} +#define T_32768_169 {0.9994749927806477751701663692074362188578,-0.0323996729323640861308852834099525352940} +#define T_32768_171 {0.9994624941663231565414093893195968121290,-0.0327829643997067238236908792714530136436} +#define T_32768_173 {0.9994498485624845329766685608774423599243,-0.0331662510457058634649740724853472784162} +#define T_32768_175 {0.9994370559709915280421910210861824452877,-0.0335495328139920750976088470451941248029} +#define T_32768_177 {0.9994241163937256366978090227348729968071,-0.0339328096481966573483290972035320010036} +#define T_32768_179 {0.9994110298325897812077300841338001191616,-0.0343160814919516513055164352863357635215} +#define T_32768_181 {0.9993977962895086442074443766614422202110,-0.0346993482888897988858367682496464112774} +#define T_32768_183 {0.9993844157664285576814222622488159686327,-0.0350826099826446191620732406590832397342} +#define T_32768_185 {0.9993708882653171698962069058325141668320,-0.0354658665168503528519750034320168197155} +#define T_32768_187 {0.9993572137881640005119265879329759627581,-0.0358491178351420178294084450953960185871} +#define T_32768_189 {0.9993433923369802185376897796231787651777,-0.0362323638811553952465693839712912449613} +#define T_32768_191 {0.9993294239137984202869802174973301589489,-0.0366156045985270295339830681768944486976} +#define T_32768_193 {0.9993153085206730734668667537334840744734,-0.0369988399308942630949736951606610091403} +#define T_32768_195 {0.9993010461596800730887935060309246182442,-0.0373820698218952363056644117023097351193} +#define T_32768_197 
{0.9992866368329167414685798576101660728455,-0.0377652942151688597594016982839093543589} +#define T_32768_199 {0.9992720805425026053825376948225311934948,-0.0381485130543548905945883120693906676024} +#define T_32768_201 {0.9992573772905780637998418569623026996851,-0.0385317262830938769835320556467195274308} +#define T_32768_203 {0.9992425270793058311724621489702258259058,-0.0389149338450271928269152965640387265012} +#define T_32768_205 {0.9992275299108696051675337912456598132849,-0.0392981356837970585704766790513531304896} +#define T_32768_207 {0.9992123857874752879126845073187723755836,-0.0396813317430465342661172201133013004437} +#define T_32768_209 {0.9991970947113498757730098986939992755651,-0.0400645219664195195719003095291554927826} +#define T_32768_211 {0.9991816566847423475294931449752766638994,-0.0404477062975607815076273254817351698875} +#define T_32768_213 {0.9991660717099229982451902287721168249846,-0.0408308846801159475159437306501786224544} +#define T_32768_215 {0.9991503397891841053990447107935324311256,-0.0412140570577315262790207839316281024367} +#define T_32768_217 {0.9991344609248391517297704922384582459927,-0.0415972233740549007796616365340014453977} +#define T_32768_219 {0.9991184351192234913696665898896753787994,-0.0419803835727343560568769476049055811018} +#define T_32768_221 {0.9991022623746941278000122110825031995773,-0.0423635375974190722669909803244081558660} +#define T_32768_223 {0.9990859426936292697618569036421831697226,-0.0427466853917591316225355058122659102082} +#define T_32768_225 {0.9990694760784293304567427185247652232647,-0.0431298268994055461478254187568381894380} +#define T_32768_227 {0.9990528625315159283459820471762213855982,-0.0435129620640102438011709296006301883608} +#define T_32768_229 {0.9990361020553323312398674715950619429350,-0.0438960908292260754137714684475213289261} +#define T_32768_231 {0.9990191946523434562976717643323354423046,-0.0442792131387068493841852045989071484655} +#define T_32768_233 
{0.9990021403250359810499503510072827339172,-0.0446623289361073247394351426464709220454} +#define T_32768_235 {0.9989849390759180103316339227603748440742,-0.0450454381650832041961152185649552848190} +#define T_32768_237 {0.9989675909075192983266333612846210598946,-0.0454285407692911619159659153410757426172} +#define T_32768_239 {0.9989500958223912485678397388255689293146,-0.0458116366923888504447681668807490495965} +#define T_32768_241 {0.9989324538231066918925193931499961763620,-0.0461947258780349007123433580090932082385} +#define T_32768_243 {0.9989146649122604415538262401241809129715,-0.0465778082698889428492350361921126022935} +#define T_32768_245 {0.9988967290924684050423820735886693000793,-0.0469608838116115923089211037222412414849} +#define T_32768_247 {0.9988786463663686943093011905148159712553,-0.0473439524468644845622833372544846497476} +#define T_32768_249 {0.9988604167366205155431657658482436090708,-0.0477270141193102612198195799919631099328} +#define T_32768_251 {0.9988420402059048353038406276027671992779,-0.0481100687726125908483254534075967967510} +#define T_32768_253 {0.9988235167769244915447757193760480731726,-0.0484931163504361759097882611513341544196} +#define T_32768_255 {0.9988048464524034164568888627400156110525,-0.0488761567964467666391747968646086519584} +#define T_32768_257 {0.9987860292350876356692879198817536234856,-0.0492591900543111402277496324586536502466} +#define T_32768_259 {0.9987670651277443800708510934782680124044,-0.0496422160676971632731202532795578008518} +#define T_32768_261 {0.9987479541331628629663441643060650676489,-0.0500252347802737362680858268504380248487} +#define T_32768_263 {0.9987286962541537249649081786628812551498,-0.0504082461357108560506823380364949116483} +#define T_32768_265 {0.9987092914935490339800594483676832169294,-0.0507912500776795811097130695088708307594} +#define T_32768_267 {0.9986897398542026182965969383076298981905,-0.0511742465498520801570059290952485753223} +#define T_32768_269 
{0.9986700413389900665706022664380725473166,-0.0515572354959016113107317380581662291661} +#define T_32768_271 {0.9986501959508082837402298537199385464191,-0.0519402168595025429120859428167022997513} +#define T_32768_273 {0.9986302036925760461372192366980016231537,-0.0523231905843303465863947110392473405227} +#define T_32768_275 {0.9986100645672333353530802924069575965405,-0.0527061566140616319375844511796458391473} +#define T_32768_277 {0.9985897785777422264175129384966567158699,-0.0530891148923741326703940046627394622192} +#define T_32768_279 {0.9985693457270861106422898956225253641605,-0.0534720653629467274070563576060521882027} +#define T_32768_281 {0.9985487660182699176658616124768741428852,-0.0538550079694594396872986408197903074324} +#define T_32768_283 {0.9985280394543202264756587283045519143343,-0.0542379426555934518461299376212991774082} +#define T_32768_285 {0.9985071660382854874526969979342538863420,-0.0546208693650311050138412838350632227957} +#define T_32768_287 {0.9984861457732353562377625166845973581076,-0.0550037880414559268715812834216194460168} +#define T_32768_289 {0.9984649786622612488429240329423919320107,-0.0553866986285526038957804928486439166591} +#define T_32768_291 {0.9984436647084763416515329481626395136118,-0.0557696010700070299304087484415504150093} +#define T_32768_293 {0.9984222039150150163067110042902640998363,-0.0561524953095062992480812624762620544061} +#define T_32768_295 {0.9984005962850336368674675213696900755167,-0.0565353812907386996111647192719829035923} +#define T_32768_297 {0.9983788418217099946971870849665720015764,-0.0569182589573937400273528908201114973053} +#define T_32768_299 {0.9983569405282434194859320086834486573935,-0.0573011282531621576885605406914692139253} +#define T_32768_301 {0.9983348924078550012950472591910511255264,-0.0576839891217359110320295201290718978271} +#define T_32768_303 {0.9983126974637872574902530686813406646252,-0.0580668415068082005570104797698149923235} +#define T_32768_305 
{0.9982903556993043547862498598988167941570,-0.0584496853520734688247628696444735396653} +#define T_32768_307 {0.9982678671176921092467182461405172944069,-0.0588325206012274351530244587138440692797} +#define T_32768_309 {0.9982452317222578752620165687403641641140,-0.0592153471979670678604357192398310871795} +#define T_32768_311 {0.9982224495163305455491808970691636204720,-0.0595981650859905981443276345999038312584} +#define T_32768_313 {0.9981995205032606621742274910502601414919,-0.0599809742089975478362973149160097818822} +#define T_32768_315 {0.9981764446864205275744552636751905083656,-0.0603637745106887432799958048690314171836} +#define T_32768_317 {0.9981532220692037604692359309410676360130,-0.0607465659347662875755524680698727024719} +#define T_32768_319 {0.9981298526550256289269213993975427001715,-0.0611293484249335883351506026883726008236} +#define T_32768_321 {0.9981063364473230503648437661468051373959,-0.0615121219248953854386030570822185836732} +#define T_32768_323 {0.9980826734495545915493153188435826450586,-0.0618948863783577163388827102608047425747} +#define T_32768_325 {0.9980588636652002465510236106638330966234,-0.0622776417290279785121676070502871880308} +#define T_32768_327 {0.9980349070977617698119388478517066687346,-0.0626603879206148739466897268357570283115} +#define T_32768_329 {0.9980108037507624541007089646882377564907,-0.0630431248968284924094618304479809012264} +#define T_32768_331 {0.9979865536277470194903571609756909310818,-0.0634258526013802281795506132766604423523} +#define T_32768_333 {0.9979621567322819464251892895845230668783,-0.0638085709779828980092730716933147050440} +#define T_32768_335 {0.9979376130679552536761889314220752567053,-0.0641912799703506370407879444428544957191} +#define T_32768_337 {0.9979129226383766093633198579482268542051,-0.0645739795221989959506103673447796609253} +#define T_32768_339 {0.9978880854471771089109211061440873891115,-0.0649566695772448854384606420353520661592} +#define T_32768_341 
{0.9978631014980094970923119035433046519756,-0.0653393500792066456162032750398793723434} +#define T_32768_343 {0.9978379707945482790520941307477187365294,-0.0657220209718039904966957465148880146444} +#define T_32768_345 {0.9978126933404892762169424713647458702326,-0.0661046821987580773827275493204069789499} +#define T_32768_347 {0.9977872691395499593625117995543405413628,-0.0664873337037914513558689577621407806873} +#define T_32768_349 {0.9977616981954695596357396425446495413780,-0.0668699754306281146654100666637532413006} +#define T_32768_351 {0.9977359805120086244656363305693957954645,-0.0672526073229934989727851757379539776593} +#define T_32768_353 {0.9977101160929495726747973094461485743523,-0.0676352293246144792293605974009551573545} +#define T_32768_355 {0.9976841049420960283455883654823992401361,-0.0680178413792193875542224645869282539934} +#define T_32768_357 {0.9976579470632737089985653256007935851812,-0.0684004434305380271119645385624608024955} +#define T_32768_359 {0.9976316424603293153694494321825914084911,-0.0687830354223016443571125932976428885013} +#define T_32768_361 {0.9976051911371316416321519682242069393396,-0.0691656172982429845452756467238941695541} +#define T_32768_363 {0.9975785930975707982426570197276305407286,-0.0695481890020963056109337685484206303954} +#define T_32768_365 {0.9975518483455584339836264007317367941141,-0.0699307504775973087784990411819308064878} +#define T_32768_367 {0.9975249568850279580090045783435925841331,-0.0703133016684832495846180222542898263782} +#define T_32768_369 {0.9974979187199342067771112851914949715137,-0.0706958425184928546114448977277788799256} +#define T_32768_371 {0.9974707338542536660952464444562792778015,-0.0710783729713664047533683287838357500732} +#define T_32768_373 {0.9974434022919843600973877073556650429964,-0.0714608929708456797058602205652277916670} +#define T_32768_375 {0.9974159240371459622664929156599100679159,-0.0718434024606740412322025690627924632281} +#define T_32768_377 
{0.9973882990937794623675927141448482871056,-0.0722259013845963360189728064142400398850} +#define T_32768_379 {0.9973605274659479436039077882014680653811,-0.0726083896863590066983462634198076557368} +#define T_32768_381 {0.9973326091577354723938242386793717741966,-0.0729908673097100502147327460988890379667} +#define T_32768_383 {0.9973045441732479865493132820120081305504,-0.0733733341983990317025643435044912621379} +#define T_32768_385 {0.9972763325166131842536287877010181546211,-0.0737557902961770983640832355376915074885} +#define T_32768_387 {0.9972479741919798579274925032223109155893,-0.0741382355467969794693416929476370569319} +#define T_32768_389 {0.9972194692035186713852112916356418281794,-0.0745206698940130002339898851460020523518} +#define T_32768_391 {0.9971908175554219377900722065533045679331,-0.0749030932815810818192758802069874946028} +#define T_32768_393 {0.9971620192519032865874351045931689441204,-0.0752855056532587690876212604962347541004} +#define T_32768_395 {0.9971330742971981075939424954412970691919,-0.0756679069528052444804089304852823261172} +#define T_32768_397 {0.9971039826955633289529146168206352740526,-0.0760502971239812725068318854937388096005} +#define T_32768_399 {0.9970747444512773061120469719753600656986,-0.0764326761105492830106200585760234389454} +#define T_32768_401 {0.9970453595686400438680152547021862119436,-0.0768150438562733572922525127069093286991} +#define T_32768_403 {0.9970158280519733073887778118660207837820,-0.0771974003049192142311696329670667182654} +#define T_32768_405 {0.9969861499056201781243657933373469859362,-0.0775797454002542380413487421719764824957} +#define T_32768_407 {0.9969563251339452758514880770235322415829,-0.0779620790860474921490919086863868869841} +#define T_32768_409 {0.9969263537413350917404386564157903194427,-0.0783444013060697053152381386098568327725} +#define T_32768_411 {0.9968962357321972111989794029796030372381,-0.0787267120040932855129511835912126116455} +#define T_32768_413 
{0.9968659711109613130730622287956066429615,-0.0791090111238923754388707720863749273121} +#define T_32768_415 {0.9968355598820781704461069239187054336071,-0.0794912986092427692463857624716183636338} +#define T_32768_417 {0.9968050020500204277951183939876500517130,-0.0798735744039220096901487977447686716914} +#define T_32768_419 {0.9967742976192820458791743476467672735453,-0.0802558384517093326149250742673757486045} +#define T_32768_421 {0.9967434465943788568509376091242302209139,-0.0806380906963857085889557652080839034170} +#define T_32768_423 {0.9967124489798480091451438056537881493568,-0.0810203310817338706595336361715453676879} +#define T_32768_425 {0.9966813047802483005455087550217285752296,-0.0814025595515382588418518139405932743102} +#define T_32768_427 {0.9966500140001600671624260030512232333422,-0.0817847760495850756301550177340686786920} +#define T_32768_429 {0.9966185766441850724106643610866740345955,-0.0821669805196622998755273670212773140520} +#define T_32768_431 {0.9965869927169469510985777560563292354345,-0.0825491729055596729081045737075328361243} +#define T_32768_433 {0.9965552622230905432942904553783591836691,-0.0829313531510686985370739421341568231583} +#define T_32768_435 {0.9965233851672824494372093795391265302896,-0.0833135211999826846840377925218490418047} +#define T_32768_437 {0.9964913615542109193157216395775321871042,-0.0836956769960967156274378453417739365250} +#define T_32768_439 {0.9964591913885854079779846870223991572857,-0.0840778204832077075137064525733876507729} +#define T_32768_441 {0.9964268746751372418657410889863967895508,-0.0844599516051143389683275586321542505175} +#define T_32768_443 {0.9963944114186192857474111406190786510706,-0.0848420703056171482403513550707430113107} +#define T_32768_445 {0.9963618016238057206734879400755744427443,-0.0852241765285184776912430493212013971061} +#define T_32768_447 {0.9963290452954923770434447760635521262884,-0.0856062702176225293060340959527820814401} +#define T_32768_449 
{0.9962961424384968456280375903588719666004,-0.0859883513167353369377465810430294368416} +#define T_32768_451 {0.9962630930576581445023975902586244046688,-0.0863704197696647524296054143633227795362} +#define T_32768_453 {0.9962298971578364970014263235498219728470,-0.0867524755202205427595529840800736565143} +#define T_32768_455 {0.9961965547439142198982153786346316337585,-0.0871345185122143067735223098679853137583} +#define T_32768_457 {0.9961630658207949462479291469207964837551,-0.0875165486894595306965882741678797174245} +#define T_32768_459 {0.9961294303934037364101072853372897952795,-0.0878985659957715881329676221866975538433} +#define T_32768_461 {0.9960956484666873000932696413656231015921,-0.0882805703749677400660189618974982295185} +#define T_32768_463 {0.9960617200456139963549162530398461967707,-0.0886625617708671626138183796683733817190} +#define T_32768_465 {0.9960276451351736115569224239152390509844,-0.0890445401272909053957960168190766125917} +#define T_32768_467 {0.9959934237403773593655387230683118104935,-0.0894265053880619747994629165077640209347} +#define T_32768_469 {0.9959590558662583248406008351594209671021,-0.0898084574970052923470476002876239363104} +#define T_32768_471 {0.9959245415178707983017147853388451039791,-0.0901903963979476946954960681068769190460} +#define T_32768_473 {0.9958898807002907194174667893094010651112,-0.0905723220347179891476230295666027814150} +#define T_32768_475 {0.9958550734186157882277257158420979976654,-0.0909542343511469258965362882918270770460} +#define T_32768_477 {0.9958201196779649100321307741978671401739,-0.0913361332910671841478489341170643456280} +#define T_32768_479 {0.9957850194834787505016038267058320343494,-0.0917180187983134692641939977875153999776} +#define T_32768_481 {0.9957497728403195136337444637320004403591,-0.0920998908167223878651341806289565283805} +#define T_32768_483 {0.9957143797536706086859226161323022097349,-0.0924817492901326143606155483212205581367} +#define T_32768_485 
{0.9956788402287375383536982553778216242790,-0.0928635941623847382953016449391725473106} +#define T_32768_487 {0.9956431542707468995700992309139110147953,-0.0932454253773213892486637632828205823898} +#define T_32768_489 {0.9956073218849470496394360452541150152683,-0.0936272428787872090794053292484022676945} +#define T_32768_491 {0.9955713430766077731703944664332084357738,-0.0940090466106288380476740940139279700816} +#define T_32768_493 {0.9955352178510203930983379905228503048420,-0.0943908365166949425706377496680943295360} +#define T_32768_495 {0.9954989462134977706853078416315838694572,-0.0947726125408362429780595448391977697611} +#define T_32768_497 {0.9954625281693744165423254344204906374216,-0.0951543746269054857567226690662209875882} +#define T_32768_499 {0.9954259637240061575624849865562282502651,-0.0955361227187574713060058684277464635670} +#define T_32768_501 {0.9953892528827706920324658312893006950617,-0.0959178567602490400600956377274997066706} +#define T_32768_503 {0.9953523956510668124764151798444800078869,-0.0962995766952391279991374517521762754768} +#define T_32768_505 {0.9953153920343150717897628965147305279970,-0.0966812824675887250158723418280715122819} +#define T_32768_507 {0.9952782420379576722169190361455548554659,-0.0970629740211609165490003192644508089870} +#define T_32768_509 {0.9952409456674581322843664565880317240953,-0.0974446512998208835831803753535496070981} +#define T_32768_511 {0.9952035029283015088452657437301240861416,-0.0978263142474358748934548657416598871350} +#define T_32768_513 {0.9951659138259946191240601365279871970415,-0.0982079628078752764341885495014139451087} +#define T_32768_515 {0.9951281783660654856049632144276984035969,-0.0985895969250105835834929735028708819300} +#define T_32768_517 {0.9950902965540640021657736724591813981533,-0.0989712165427154288988020880424301140010} +#define T_32768_519 {0.9950522683955610458994556211109738796949,-0.0993528216048655404835088233994611073285} +#define T_32768_521 
{0.9950140938961496983594656740024220198393,-0.0997344120553388252536919367230439092964} +#define T_32768_523 {0.9949757730614441353367283227271400392056,-0.1001159878380153134269647807741421274841} +#define T_32768_525 {0.9949373058970800709488457869156263768673,-0.1004975488967772140336265351834299508482} +#define T_32768_527 {0.9948986924087148686624004767509177327156,-0.1008790951755088732832987830079218838364} +#define T_32768_529 {0.9948599326020273192483500679372809827328,-0.1012606266180968439538645498032565228641} +#define T_32768_531 {0.9948210264827178628266324267315212637186,-0.1016421431684298298803170723658695351332} +#define T_32768_533 {0.9947819740565082557992582223960198462009,-0.1020236447703987553436988378052774351090} +#define T_32768_535 {0.9947427753291420149395207772613503038883,-0.1024051313678967234377381601007073186338} +#define T_32768_537 {0.9947034303063838622804837541480083018541,-0.1027866029048190438244247957300103735179} +#define T_32768_539 {0.9946639389940203912487959314603358507156,-0.1031680593250632327340099436696618795395} +#define T_32768_541 {0.9946243013978594005308764280925970524549,-0.1035495005725290684761574766525882296264} +#define T_32768_543 {0.9945845175237303381621245534915942698717,-0.1039309265911185220510049020958831533790} +#define T_32768_545 {0.9945445873774843015269198076566681265831,-0.1043123373247357987825267855441779829562} +#define T_32768_547 {0.9945045109649937042917144935927353799343,-0.1046937327172874077074737897419254295528} +#define T_32768_549 {0.9944642882921523874273361798259429633617,-0.1050751127126820505530702121177455410361} +#define T_32768_551 {0.9944239193648759522758950879506301134825,-0.1054564772548307188815286394856229890138} +#define T_32768_553 {0.9943834041891014274838767050823662430048,-0.1058378262876466802122621402304503135383} +#define T_32768_555 {0.9943427427707872690021417838579509407282,-0.1062191597550454780218842643080279231071} +#define T_32768_557 
{0.9943019351159135821305312674667220562696,-0.1066004776009449594997846588739776052535} +#define T_32768_559 {0.9942609812304817884509589021035935729742,-0.1069817797692652339147656448403722606599} +#define T_32768_561 {0.9942198811205149588943186245160177350044,-0.1073630662039287558817690637624764349312} +#define T_32768_563 {0.9941786347920575916958796369726769626141,-0.1077443368488602837285128543953760527074} +#define T_32768_565 {0.9941372422511757234175888697791378945112,-0.1081255916479868656177032448795216623694} +#define T_32768_567 {0.9940957035039569289480709812778513878584,-0.1085068305452379228137615996274689678103} +#define T_32768_569 {0.9940540185565102104803258953324984759092,-0.1088880534845452080494609958805085625499} +#define T_32768_571 {0.9940121874149662195563337263592984527349,-0.1092692604098427916481384158942091744393} +#define T_32768_573 {0.9939702100854769240001473917800467461348,-0.1096504512650671170348459781962446868420} +#define T_32768_575 {0.9939280865742158299624975370534230023623,-0.1100316259941570007363509375863941386342} +#define T_32768_577 {0.9938858168873780929430949981906451284885,-0.1104127845410536323811356851365417242050} +#define T_32768_579 {0.9938434010311801847237234142085071653128,-0.1107939268497005608216099403762200381607} +#define T_32768_581 {0.9938008390118601154128441521606873720884,-0.1111750528640437218896863669215235859156} +#define T_32768_583 {0.9937581308356774334455963071377482265234,-0.1115561625280314800301439959184790495783} +#define T_32768_585 {0.9937152765089132255837967022671364247799,-0.1119372557856145727894769947852182667702} +#define T_32768_587 {0.9936722760378700058936374261975288391113,-0.1123183325807461802048337062842620071024} +#define T_32768_589 {0.9936291294288717157456858330988325178623,-0.1126993928573818554150776094502361956984} +#define T_32768_591 {0.9935858366882639458594894676934927701950,-0.1130804365594796356830897821055259555578} +#define T_32768_593 
{0.9935423978224136032366686777095310389996,-0.1134614636309999452512542461590783204883} +#define T_32768_595 {0.9934988128377093552501264639431610703468,-0.1138424740159057063637604301220562774688} +#define T_32768_597 {0.9934550817405609635102337051648646593094,-0.1142234676581622698776641300355549901724} +#define T_32768_599 {0.9934112045374000610209463957289699465036,-0.1146044445017374152628875094706017989665} +#define T_32768_601 {0.9933671812346795970682933329953812062740,-0.1149854044906014616245215620438102632761} +#define T_32768_603 {0.9933230118388739482426785798452328890562,-0.1153663475687271428027358410872693639249} +#define T_32768_605 {0.9932786963564790294611839271965436637402,-0.1157472736800897183950809221641975454986} +#define T_32768_607 {0.9932342347940122939675688940042164176702,-0.1161281827686669460009127874400292057544} +#define T_32768_609 {0.9931896271580126223099682647443842142820,-0.1165090747784390395880294022390444297343} +#define T_32768_611 {0.9931448734550404333631945519300643354654,-0.1168899496533887943927609853744797874242} +#define T_32768_613 {0.9930999736916775733064355335955042392015,-0.1172708073375014620198797388184175360948} +#define T_32768_615 {0.9930549278745273156232542532961815595627,-0.1176516477747648614649023102174396626651} +#define T_32768_617 {0.9930097360102145831461939451401121914387,-0.1180324709091693513585141772637143731117} +#define T_32768_619 {0.9929643981053856149898706462408881634474,-0.1184132766847077883332062242516258265823} +#define T_32768_621 {0.9929189141667082996178805842646397650242,-0.1187940650453756380455772045934281777591} +#define T_32768_623 {0.9928732842008717307535903273674193769693,-0.1191748359351708780318190861180482897907} +#define T_32768_625 {0.9928275082145867624916490967734716832638,-0.1195555892980941087300195135867397766560} +#define T_32768_627 {0.9927815862145855652087789167126175016165,-0.1199363250781484702134349618063424713910} +#define T_32768_629 
{0.9927355182076218476083795394515618681908,-0.1203170432193396977016419668871094472706} +#define T_32768_631 {0.9926893042004707456982259827782399952412,-0.1206977436656761076827493184282502625138} +#define T_32768_633 {0.9926429441999288227904685300018172711134,-0.1210784263611686534245492907757579814643} +#define T_32768_635 {0.9925964382128142915462376549839973449707,-0.1214590912498308555855786039501253981143} +#define T_32768_637 {0.9925497862459666809087366345920599997044,-0.1218397382756788993596330783475423231721} +#define T_32768_639 {0.9925029883062469471255440112145151942968,-0.1222203673827315512090407878531550522894} +#define T_32768_641 {0.9924560444005376957932185177924111485481,-0.1226009785150102421313889067278068978339} +#define T_32768_643 {0.9924089545357428487903916902723722159863,-0.1229815716165390537817359017935814335942} +#define T_32768_645 {0.9923617187187878663223727926379069685936,-0.1233621466313446768392481089904322288930} +#define T_32768_647 {0.9923143369566196358988463543937541544437,-0.1237427035034565220295021958918368909508} +#define T_32768_649 {0.9922668092562065833561746330815367400646,-0.1241232421769066091021826991891430225223} +#define T_32768_651 {0.9922191356245384508127926892484538257122,-0.1245037625957296778533844872072222642601} +#define T_32768_653 {0.9921713160686265187138133114785887300968,-0.1248842647039631326144615286466432735324} +#define T_32768_655 {0.9921233505955037168533294789085630327463,-0.1252647484456470561298147003981284797192} +#define T_32768_657 {0.9920752392122240692629020486492663621902,-0.1256452137648242928236186344292946159840} +#define T_32768_659 {0.9920269819258633603453745308797806501389,-0.1260256606055403238997314474545419216156} +#define T_32768_661 {0.9919785787435185797633607762691099196672,-0.1264060889118433783639972034507081843913} +#define T_32768_663 {0.9919300296723084775507572885544504970312,-0.1267864986277844330242459136570687405765} +#define T_32768_665 
{0.9918813347193730090012309119629207998514,-0.1271668896974171569791423053175094537437} +#define T_32768_667 {0.9918324938918737787574286812741775065660,-0.1275472620647979671293370529383537359536} +#define T_32768_669 {0.9917835071969934856994655092421453446150,-0.1279276156739860836886180095461895689368} +#define T_32768_671 {0.9917343746419368111233438867202494293451,-0.1283079504690434191616077441722154617310} +#define T_32768_673 {0.9916850962339294195402317200205288827419,-0.1286882663940347171216416199968080036342} +#define T_32768_675 {0.9916356719802187358325795685232151299715,-0.1290685633930274134328897162049543112516} +#define T_32768_677 {0.9915861018880735011649107946141157299280,-0.1294488414100917750282349061308195814490} +#define T_32768_679 {0.9915363859647838840061240262002684175968,-0.1298291003893009276648484728866606019437} +#define T_32768_681 {0.9914865242176614801294931567099411040545,-0.1302093402747306338795851843315176665783} +#define T_32768_683 {0.9914365166540394236349698076082859188318,-0.1305895610104596538114662962470902130008} +#define T_32768_685 {0.9913863632812722759268808658816851675510,-0.1309697625405693843791965491618611849844} +#define T_32768_687 {0.9913360641067361367362309465534053742886,-0.1313499448091441923480715558980591595173} +#define T_32768_689 {0.9912856191378282000314925426209811121225,-0.1317301077602711922853728765403502620757} +#define T_32768_691 {0.9912350283819674201524208001501392573118,-0.1321102513380403575826704809514922089875} +#define T_32768_693 {0.9911842918465941787431461307278368622065,-0.1324903754865445482113983644012478180230} +#define T_32768_695 {0.9911334095391701737298717489466071128845,-0.1328704801498794274561277006796444766223} +#define T_32768_697 {0.9910823814671786413654785974358674138784,-0.1332505652721435451812936889837146736681} +#define T_32768_699 {0.9910312076381241341849204218306113034487,-0.1336306307974383655867711695464095100760} +#define T_32768_701 
{0.9909798880595327430498286958027165383101,-0.1340106766698681284299965454920311458409} +#define T_32768_703 {0.9909284227389519861262101585452910512686,-0.1343907028335400988261483234964543953538} +#define T_32768_705 {0.9908768116839506978621443522570189088583,-0.1347707092325643452035421887558186426759} +#define T_32768_707 {0.9908250549021194730769934722047764807940,-0.1351506958110538503259334675021818839014} +#define T_32768_709 {0.9907731524010697787829826665984001010656,-0.1355306625131245945592439738902612589300} +#define T_32768_711 {0.9907211041884351754305271242628805339336,-0.1359106092828953615825327005950384773314} +#define T_32768_713 {0.9906689102718700956629049869661685079336,-0.1362905360644879604326007438430679030716} +#define T_32768_715 {0.9906165706590506214723745870287530124187,-0.1366704428020270867261132252679090015590} +#define T_32768_717 {0.9905640853576743731778719848080072551966,-0.1370503294396404059263261387968668714166} +#define T_32768_719 {0.9905114543754602873804060436668805778027,-0.1374301959214585533430863506509922444820} +#define T_32768_721 {0.9904586777201486169630584299738984555006,-0.1378100421916150786216803680872544646263} +#define T_32768_723 {0.9904057553995012641578910006501246243715,-0.1381898681942465567651368019141955301166} +#define T_32768_725 {0.9903526874213014474790384156221989542246,-0.1385696738734925048674995196051895618439} +#define T_32768_727 {0.9902994737933535907004056753066834062338,-0.1389494591734954931361301078140968456864} +#define T_32768_729 {0.9902461145234839889894828957039862871170,-0.1393292240384009783582541786017827689648} +#define T_32768_731 {0.9901926096195400317512280707887839525938,-0.1397089684123575537011419100963394157588} +#define T_32768_733 {0.9901389590893906467172769225726369768381,-0.1400886922395166989119275058328639715910} +#define T_32768_735 {0.9900851629409259668790355135570280253887,-0.1404683954640330023622141197847668081522} +#define T_32768_737 
{0.9900312211820579966214950218272861093283,-0.1408480780300640777813470094770309515297} +#define T_32768_739 {0.9899771338207196125225095784117002040148,-0.1412277398817705087452623047283850610256} +#define T_32768_741 {0.9899229008648654515312159674067515879869,-0.1416073809633160152099407014247844927013} +#define T_32768_743 {0.9898685223224715779011262384301517158747,-0.1419870012188672869779537677459302358329} +#define T_32768_745 {0.9898139982015352611455227815895341336727,-0.1423666005925941779874932535676634870470} +#define T_32768_747 {0.9897593285100751980820632525137625634670,-0.1427461790286694842677661654306575655937} +#define T_32768_749 {0.9897045132561318458996879598998930305243,-0.1431257364712691937391753072006395086646} +#define T_32768_751 {0.9896495524477665339802001653879415243864,-0.1435052728645723196798655862949090078473} +#define T_32768_753 {0.9895944460930624630989882462017703801394,-0.1438847881527609839924508605690789408982} +#define T_32768_755 {0.9895391942001239282689084575395099818707,-0.1442642822800204449595895539459888823330} +#define T_32768_757 {0.9894837967770767628294947826361749321222,-0.1446437551905390417328334251578780822456} +#define T_32768_759 {0.9894282538320682274246564702480100095272,-0.1450232068285082220882031833752989768982} +#define T_32768_761 {0.9893725653732670100026780346524901688099,-0.1454026371381225701817641038360306993127} +#define T_32768_763 {0.9893167314088630037716143306170124560595,-0.1457820460635798343052016434739925898612} +#define T_32768_765 {0.9892607519470676402661979409458581358194,-0.1461614335490808991302458252903306856751} +#define T_32768_767 {0.9892046269961137783255367139645386487246,-0.1465407995388297579530956227245042100549} +#define T_32768_769 {0.9891483565642555930708113010041415691376,-0.1469201439770336237167214221699396148324} +#define T_32768_771 {0.9890919406597687979498800814326386898756,-0.1472994668079028734997137917162035591900} +#define T_32768_773 
{0.9890353792909503116703717751079238951206,-0.1476787679756509930051322498911758884788} +#define T_32768_775 {0.9889786724661184802442903674091212451458,-0.1480580474244947153383833438056171871722} +#define T_32768_777 {0.9889218201936131880103175717522390186787,-0.1484373050986539932516450335242552682757} +#define T_32768_779 {0.9888648224817956355892079045588616281748,-0.1488165409423519158771398451790446415544} +#define T_32768_781 {0.9888076793390484509060911477718036621809,-0.1491957548998148197494373334848205558956} +#define T_32768_783 {0.9887503907737753561235649613081477582455,-0.1495749469152722332943028504814719781280} +#define T_32768_785 {0.9886929567944019447978121206688228994608,-0.1499541169329569600954243924206821247935} +#define T_32768_787 {0.9886353774093747937001808168133720755577,-0.1503332648971050233832613685081014409661} +#define T_32768_789 {0.9885776526271620179286969687382224947214,-0.1507123907519556105238933696455205790699} +#define T_32768_791 {0.9885197824562532709080642234766855835915,-0.1510914944417512950636250934621784836054} +#define T_32768_793 {0.9884617669051593003004541060363408178091,-0.1514705759107378146843814192834543064237} +#define T_32768_795 {0.9884036059824123920947158694616518914700,-0.1518496351031641822260098706465214490891} +#define T_32768_797 {0.9883452996965661485617715698026586323977,-0.1522286719632827411974318465581745840609} +#define T_32768_799 {0.9882868480561957102992209911462850868702,-0.1526076864353490825099157746080891229212} +#define T_32768_801 {0.9882282510698974231644342580693773925304,-0.1529866784636220722326527265977347269654} +#define T_32768_803 {0.9881695087462890603191567606700118631124,-0.1533656479923638793483320341692888177931} +#define T_32768_805 {0.9881106210940098222295091545674949884415,-0.1537445949658400312642925200634635984898} +#define T_32768_807 {0.9880515881217201146213824358710553497076,-0.1541235193283193583013712668616790324450} +#define T_32768_809 
{0.9879924098381018815473453287268057465553,-0.1545024210240739659383280013571493327618} +#define T_32768_811 {0.9879330862518583833420393602864351123571,-0.1548812999973793180785719414416234940290} +#define T_32768_813 {0.9878736173717141966221788607072085142136,-0.1552601561925142370501617961053852923214} +#define T_32768_815 {0.9878140032064155473534583506989292800426,-0.1556389895537609036058057654372532851994} +#define T_32768_817 {0.9877542437647295336944353039143607020378,-0.1560178000254048291672859249956673011184} +#define T_32768_819 {0.9876943390554451251972523095901124179363,-0.1563965875517349113366094570665154606104} +#define T_32768_821 {0.9876342890873721636069149099057540297508,-0.1567753520770433783848574194053071551025} +#define T_32768_823 {0.9875740938693423620620137626247014850378,-0.1571540935456259002744872077528270892799} +#define T_32768_825 {0.9875137534102084169163049409689847379923,-0.1575328119017815331481813245773082599044} +#define T_32768_827 {0.9874532677188445628502222461975179612637,-0.1579115070898126638176961478166049346328} +#define T_32768_829 {0.9873926368041462398039698200591374188662,-0.1582901790540251762973156246516737155616} +#define T_32768_831 {0.9873318606750304260444295323395635932684,-0.1586688277387283130259731933620059862733} +#define T_32768_833 {0.9872709393404354161205560558300931006670,-0.1590474530882347581339786302123684436083} +#define T_32768_835 {0.9872098728093208208633768663275986909866,-0.1594260550468606096874424338238895870745} +#define T_32768_837 {0.9871486610906675673859922426345292478800,-0.1598046335589254351994270564318867400289} +#define T_32768_839 {0.9870873041934778990835752665589097887278,-0.1601831885687522438743712882569525390863} +#define T_32768_841 {0.9870258021267755976779767479456495493650,-0.1605617200206674866080902575049549341202} +#define T_32768_843 {0.9869641548996056501508178371295798569918,-0.1609402278590010837433510459959506988525} +#define T_32768_845 
{0.9869023625210344707880949499667622148991,-0.1613187120280864250698726891641854308546} +#define T_32768_847 {0.9868404250001496791355748428031802177429,-0.1616971724722603975799017916870070621371} +#define T_32768_849 {0.9867783423460604330657020000217016786337,-0.1620756091358633577126369118559523485601} +#define T_32768_851 {0.9867161145678970957106912464951165020466,-0.1624540219632391868653797928345738910139} +#define T_32768_853 {0.9866537416748113464848302101017907261848,-0.1628324108987352358823841314006131142378} +#define T_32768_855 {0.9865912236759764031290842467569746077061,-0.1632107758867023805660068092038272880018} +#define T_32768_857 {0.9865285605805866886441890528658404946327,-0.1635891168714950494322835083949030376971} +#define T_32768_859 {0.9864657523978579423129531278391368687153,-0.1639674337974711681997774803676293231547} +#define T_32768_861 {0.9864027991370272197002577740931883454323,-0.1643457266089921875451551613878109492362} +#define T_32768_863 {0.9863397008073530036753595595655497163534,-0.1647239952504231663699130194800090976059} +#define T_32768_865 {0.9862764574181150933895878551993519067764,-0.1651022396661326607780750919118872843683} +#define T_32768_867 {0.9862130689786144932540423724276479333639,-0.1654804598004927795873442164520383812487} +#define T_32768_869 {0.9861495354981738570288030132360290735960,-0.1658586555978792953514044938856386579573} +#define T_32768_871 {0.9860858569861368216891150950687006115913,-0.1662368270026714223153163629831396974623} +#define T_32768_873 {0.9860220334518685625369016634067520499229,-0.1666149739592520939712727567894035018981} +#define T_32768_875 {0.9859580649047554601338561042211949825287,-0.1669930964120077410139941775923944078386} +#define T_32768_877 {0.9858939513542052113237446064886171370745,-0.1673711943053284301186067750677466392517} +#define T_32768_879 {0.9858296928096470512770110872224904596806,-0.1677492675836078916962179619076778180897} +#define T_32768_881 
{0.9857652892805313094015673414105549454689,-0.1681273161912434088716139513053349219263} +#define T_32768_883 {0.9857007407763298534320028920774348080158,-0.1685053400726359285055622194704483263195} +#define T_32768_885 {0.9856360473065354232957702151907142251730,-0.1688833391721899779280846587425912730396} +#define T_32768_887 {0.9855712088806627413362093648174777626991,-0.1692613134343138037163356557357474230230} +#define T_32768_889 {0.9855062255082472910672208854521159082651,-0.1696392628034192884278752444515703245997} +#define T_32768_891 {0.9854410971988462053516855121415574103594,-0.1700171872239219506006691062793834134936} +#define T_32768_893 {0.9853758239620377112899518579069990664721,-0.1703950866402409725086641856250935234129} +#define T_32768_895 {0.9853104058074215743090462638065218925476,-0.1707729609967992279173643055401043966413} +#define T_32768_897 {0.9852448427446185430511604863568209111691,-0.1711508102380232820838301677213166840374} +#define T_32768_899 {0.9851791347832711265297689351427834481001,-0.1715286343083434195122549681400414556265} +#define T_32768_901 {0.9851132819330427059512089726922567933798,-0.1719064331521935606872375501552596688271} +#define T_32768_903 {0.9850472842036182008484956895699724555016,-0.1722842067140114008516604826581897214055} +#define T_32768_905 {0.9849811416047039580590194418618921190500,-0.1726619549382382712288119819277198985219} +#define T_32768_907 {0.9849148541460271966130335385969374328852,-0.1730396777693193610669908366617164574564} +#define T_32768_909 {0.9848484218373370069343764043878763914108,-0.1734173751517034400837502516878885217011} +#define T_32768_911 {0.9847818446884033516397494167904369533062,-0.1737950470298431637772296198818366974592} +#define T_32768_913 {0.9847151227090176206502292188815772533417,-0.1741726933481948236259739815068314783275} +#define T_32768_915 {0.9846482559089926311912677192594856023788,-0.1745503140512185413779633336162078194320} +#define T_32768_917 
{0.9845812442981621837034822419809643179178,-0.1749279090833781580283101675377110950649} +#define T_32768_919 {0.9845140878863818389987727641710080206394,-0.1753054783891413448415619313891511410475} +#define T_32768_921 {0.9844467866835279190595997533819172531366,-0.1756830219129794923293985675627482123673} +#define T_32768_923 {0.9843793406994985062397063302341848611832,-0.1760605395993678490285105908697005361319} +#define T_32768_925 {0.9843117499442127771303034933225717395544,-0.1764380313927854104782966260245302692056} +#define T_32768_927 {0.9842440144276111135823725817317608743906,-0.1768154972377150024875902545318240299821} +#define T_32768_929 {0.9841761341596553247512702000676654279232,-0.1771929370786432811346600146862328983843} +#define T_32768_931 {0.9841081091503285360744257559417746961117,-0.1775703508600607050116337859435589052737} +#define T_32768_933 {0.9840399394096349672267365349398460239172,-0.1779477385264615629800744045496685430408} +#define T_32768_935 {0.9839716249476002651874750881688669323921,-0.1783251000223440019265552791694062761962} +#define T_32768_937 {0.9839031657742715042402892322570551186800,-0.1787024352922099990070847752576810307801} +#define T_32768_939 {0.9838345618997166308616897367755882441998,-0.1790797442805653894026818306883797049522} +#define T_32768_941 {0.9837658133340252408771675618481822311878,-0.1794570269319198940749515713832806795835} +#define T_32768_943 {0.9836969200873081353719840080884750932455,-0.1798342831907870920105096956831403076649} +#define T_32768_945 {0.9836278821696972096688682540843728929758,-0.1802115130016844757321337056055199354887} +#define T_32768_947 {0.9835586995913458974172272064606659114361,-0.1805887163091333402764604443291318602860} +#define T_32768_949 {0.9834893723624287265039356498164124786854,-0.1809658930576589774830154055962339043617} +#define T_32768_951 {0.9834199004931415410979411717562470585108,-0.1813430431917905649719102711969753727317} +#define T_32768_953 
{0.9833502839937015016502641628903802484274,-0.1817201666560611106326916797115700319409} +#define T_32768_955 {0.9832805228743469738716953543189447373152,-0.1820972633950076469133705359126906841993} +#define T_32768_957 {0.9832106171453376397550982801476493477821,-0.1824743333531711197981195482498151250184} +#define T_32768_959 {0.9831405668169544975754092774877790361643,-0.1828513764750963332961219975913991220295} +#define T_32768_961 {0.9830703718994996398450325614248868077993,-0.1832283927053321437306010466272709891200} +#define T_32768_963 {0.9830000324032965863807476125657558441162,-0.1836053819884312932053660460951505228877} +#define T_32768_965 {0.9829295483386901732814067145227454602718,-0.1839823442689505206271149972963030450046} +#define T_32768_967 {0.9828589197160461088387251038511749356985,-0.1843592794914505061942833208377123810351} +#define T_32768_969 {0.9827881465457519727380031326902098953724,-0.1847361876004959824193463191477349027991} +#define T_32768_971 {0.9827172288382159948127991810906678438187,-0.1851130685406555398397898670737049542367} +#define T_32768_973 {0.9826461666038680542456518196559045463800,-0.1854899222565018768182909525421564467251} +#define T_32768_975 {0.9825749598531592354788699594791978597641,-0.1858667486926116607648395984142553061247} +#define T_32768_977 {0.9825036085965618282145328521437477320433,-0.1862435477935655558923144781147129833698} +#define T_32768_979 {0.9824321128445691053698851646913681179285,-0.1866203195039482787276341468896134756505} +#define T_32768_981 {0.9823604726076962112557566797477193176746,-0.1869970637683485426006058105485863052309} +#define T_32768_983 {0.9822886878964788293089327453344594687223,-0.1873737805313591131550765567226335406303} +#define T_32768_985 {0.9822167587214745143597838250570930540562,-0.1877504697375767805933577392352162860334} +#define T_32768_987 {0.9821446850932615824092408729484304785728,-0.1881271313316024429429518249889952130616} +#define T_32768_989 
{0.9820724670224398877849125710781663656235,-0.1885037652580409672786743158212630078197} +#define T_32768_991 {0.9820001045196304900741779420059174299240,-0.1888803714615013840116830579063389450312} +#define T_32768_993 {0.9819275975954755431018838862655684351921,-0.1892569498865967481116001636110013350844} +#define T_32768_995 {0.9818549462606386279972525699122343212366,-0.1896335004779441946176632427523145452142} +#define T_32768_997 {0.9817821505258043091046715744596440345049,-0.1900100231801649941498766338554560206831} +#define T_32768_999 {0.9817092104016788001175086719740647822618,-0.1903865179378844696422845572669757530093} +#define T_32768_1001 {0.9816361258989890758996921249490696936846,-0.1907629846957321073652735776704503223300} +#define T_32768_1003 {0.9815628970284836496418279239151161164045,-0.1911394233983414459032701415708288550377} +#define T_32768_1005 {0.9814895238009321287719899373769294470549,-0.1915158339903502149326186554390005767345} +#define T_32768_1007 {0.9814160062271255480226272993604652583599,-0.1918922164164002519548546388250542804599} +#define T_32768_1009 {0.9813423443178760363636570218659471720457,-0.1922685706211375022967047243582783266902} +#define T_32768_1011 {0.9812685380840167059801615323522128164768,-0.1926448965492121301323891202628146857023} +#define T_32768_1013 {0.9811945875364023184062034488306380808353,-0.1930211941452783797057435322130913846195} +#define T_32768_1015 {0.9811204926859087294133132672868669033051,-0.1933974633539947418636728571073035709560} +#define T_32768_1017 {0.9810462535434327779881868991651572287083,-0.1937737041200238430338487205517594702542} +#define T_32768_1019 {0.9809718701198928414441979839466512203217,-0.1941499163880324452247094768608803860843} +#define T_32768_1021 {0.9808973424262283913321880390867590904236,-0.1945261001026916125589139028306817635894} +#define T_32768_1023 {0.9808226704734001044627689225308131426573,-0.1949022552086765169843118883363786153495} +#define T_32768_1033 
{0.9804471472069090642875721641757991164923,-0.1967825996724141268678920368984108790755} +#define T_32768_1035 {0.9803716099304597975105934892781078815460,-0.1971585819647688753164516128890682011843} +#define T_32768_1037 {0.9802959284721652899463606445351615548134,-0.1975345352612940252878814817449892871082} +#define T_32768_1039 {0.9802201028431560825282531368429772555828,-0.1979104595066986937990094475026126019657} +#define T_32768_1067 {0.9791434123954302348735723171557765454054,-0.2031703176220197859525029571159393526614} +#define T_32768_1069 {0.9790654255557569296541942094336263835430,-0.2035457994686322158450764163717394694686} +#define T_32768_1071 {0.9789872947263370539516813551017548888922,-0.2039212513800561232812924572499468922615} +#define T_32768_1073 {0.9789090199186613050486016618378926068544,-0.2042966733010743995979652254391112364829} +#define T_32768_1101 {0.9777980613794463593535510881338268518448,-0.2095494002916649667334070272772805765271} +#define T_32768_1103 {0.9777176282911974602995996974641457200050,-0.2099243657335558843612943746848031878471} +#define T_32768_1105 {0.9776370514114207654898791588493622839451,-0.2102993003021717266509682531250291503966} +#define T_32768_1107 {0.9775563307519664624223310056549962610006,-0.2106742039423714352164296315095270983875} +#define T_32768_1135 {0.9764111513399610364771774584369268268347,-0.2159195765533354904874840940465219318867} +#define T_32768_1137 {0.9763282754217572634303223821916617453098,-0.2162940096534743672496148292339057661593} +#define T_32768_1139 {0.9762452559163558030164153933583293110132,-0.2166684109435637328733292861215886659920} +#define T_32768_1141 {0.9761620928359661109041667259589303284883,-0.2170427803685410217404694321885472163558} +#define T_32768_1169 {0.9749827412243471380648429658322129398584,-0.2222805756575633728822083412524079903960} +#define T_32768_1171 {0.9748974259986358248397664283402264118195,-0.2226544605015455258190115728211821988225} +#define T_32768_1173 
{0.9748119673961598286027196991199161857367,-0.2230283126000558169277354636506061069667} +#define T_32768_1175 {0.9747263654294873180816694002714939415455,-0.2234021318981123938041122301001450978220} +#define T_32768_1203 {0.9735128917438413687079901137622073292732,-0.2286321272449342301147368061720044352114} +#define T_32768_1205 {0.9734251408367470270732724202389363199472,-0.2290054479416573640637722064639092423022} +#define T_32768_1207 {0.9733372467694147989192288150661624968052,-0.2293787349588780333231596841869759373367} +#define T_32768_1209 {0.9732492095547712329661749208753462880850,-0.2297519882416974856820246486677206121385} +#define T_32768_1237 {0.9720016653709638854152785825135651975870,-0.2349739613575782815058090591264772228897} +#define T_32768_1239 {0.9719114825121339951152776848175562918186,-0.2353467020399178655232219625759171321988} +#define T_32768_1241 {0.9718211567156777030973557884863112121820,-0.2357194081101558480462188072124263271689} +#define T_32768_1243 {0.9717306879948791609180602790729608386755,-0.2360920795134789085256699081583064980805} +#define T_32768_1271 {0.9704491263368630882268917048349976539612,-0.2413058084506443656813701181818032637239} +#define T_32768_1273 {0.9703565153593094461115242665982805192471,-0.2416779532761280102626244570274138823152} +#define T_32768_1275 {0.9702637616728161384926920618454460054636,-0.2420500625583820453190497801188030280173} +#define T_32768_1277 {0.9701708652910244756739643889886792749166,-0.2424221362426809134316130212027928791940} +#define T_32768_1305 {0.9688553406285855817969832060043700039387,-0.2476273994037562764525262082315748557448} +#define T_32768_1307 {0.9687603054685214321040120921679772436619,-0.2479989325552371393079909012158168479800} +#define T_32768_1309 {0.9686651278342700566170719866931904107332,-0.2483704292338710373222454563801875337958} +#define T_32768_1311 {0.9685698077398289251860319382103625684977,-0.2487418893850224799191295232958509586751} +#define T_32768_1339 
{0.9672203759862714189665666708606295287609,-0.2539384655324510853269259769149357452989} +#define T_32768_1341 {0.9671229206829443647563948616152629256248,-0.2543093712187800004009829990536672994494} +#define T_32768_1343 {0.9670253231462379028826603644120041280985,-0.2546802395041948829934597142710117623210} +#define T_32768_1345 {0.9669275833905056627415319780993741005659,-0.2550510703341525298171177382755558937788} +#define T_32768_1373 {0.9655443019002751814383600503788329660892,-0.2602387385995988400289036235335515812039} +#define T_32768_1375 {0.9654444305957954330921211294480599462986,-0.2606090010562957526296656851627631112933} +#define T_32768_1377 {0.9653444173047893706041122641181573271751,-0.2609792251856010714128331073879962787032} +#define T_32768_1379 {0.9652442620419657837160798408149275928736,-0.2613494109330663506263192630285630002618} +#define T_32768_1407 {0.9638271896082123424420728952100034803152,-0.2665279508268036945395351722254417836666} +#define T_32768_1409 {0.9637249065473765252676230375072918832302,-0.2668975543167274033784508446842664852738} +#define T_32768_1411 {0.9636224817529022157458484798553399741650,-0.2672671185544109317966388061904581263661} +#define T_32768_1413 {0.9635199152398531419194682712259236723185,-0.2676366434855030895789695932762697339058} +#define T_32768_1441 {0.9620691120919315775239510912797413766384,-0.2728058349057848053220709516608621925116} +#define T_32768_1443 {0.9619644216220423205143674749706406146288,-0.2731747637198019273974125553650083020329} +#define T_32768_1445 {0.9618595896774265741058229650661814957857,-0.2735436523583987256103000618168152868748} +#define T_32768_1447 {0.9617546162735020054412871104432269930840,-0.2739125007673233191951567278010770678520} +#define T_32768_1475 {0.9602701440744128014159741724142804741859,-0.2790721240097378541555883657565573230386} +#define T_32768_1477 {0.9601630506450939961382573528680950403214,-0.2794403624673905683906127705995459109545} +#define T_32768_1479 
{0.9600558160059738943559182189346756786108,-0.2798085598281503938444814139074878767133} +#define T_32768_1481 {0.9599484401728232141337571192707400768995,-0.2801767160378669796472195230307988822460} +#define T_32768_1509 {0.9584303620165909309847052099939901381731,-0.2853265518046758653092354052205337211490} +#define T_32768_1511 {0.9583208701795988826432903806562535464764,-0.2856940842548483794338665120449149981141} +#define T_32768_1513 {0.9582112374037322632958080248499754816294,-0.2860615746884020427209804893209366127849} +#define T_32768_1515 {0.9581014637051147309065868284960743039846,-0.2864290230512906987314636353403329849243} +#define T_32768_1543 {0.9565498441141067065274228298221714794636,-0.2915688524607490950124599748960463330150} +#define T_32768_1545 {0.9564379585231361780017778073670342564583,-0.2919356632823327801773416467767674475908} +#define T_32768_1547 {0.9563259322702082343425900035072118043900,-0.2923024311693575572057568479067413136363} +#define T_32768_1549 {0.9562137653717984742129942787869367748499,-0.2926691560678834624908972728007938712835} +#define T_32768_1577 {0.9546286702939826840363934934430290013552,-0.2977987606635435491320151868421817198396} +#define T_32768_1579 {0.9545143957044694982272403649403713643551,-0.2981648342661009087883883239555871114135} +#define T_32768_1581 {0.9543999807358944931223732055514119565487,-0.2985308640179841210837707876635249704123} +#define T_32768_1583 {0.9542854254050846529722207378654275089502,-0.2988968498653618022231626127904746681452} +#define T_32768_1611 {0.9526669222112261747881234441592823714018,-0.3040160116253575739442283065727679058909} +#define T_32768_1613 {0.9525502634801449319823518635530490428209,-0.3043813324497848804561783708777511492372} +#define T_32768_1615 {0.9524334646588640262976355188584420830011,-0.3047466085092865872852030406647827476263} +#define T_32768_1617 {0.9523165257645609393932772945845499634743,-0.3051118397501421108941599413810763508081} +#define T_32768_1645 
{0.9506646832453589102129853927181102335453,-0.3102203410964559093798698086175136268139} +#define T_32768_1647 {0.9505456453310165976233747642254456877708,-0.3105848936156444528577935670909937471151} +#define T_32768_1649 {0.9504264676212909046526533529686275869608,-0.3109494004575586445504598032130161300302} +#define T_32768_1651 {0.9503071501337092552574858927982859313488,-0.3113138615685909771357842146244365721941} +#define T_32768_1679 {0.9486220384968729879560100926028098911047,-0.3164114853763010382081688476318959146738} +#define T_32768_1681 {0.9485006264586982593911557160026859492064,-0.3167752540957973206303677216055803000927} +#define T_32768_1683 {0.9483790749258981245262134507356677204370,-0.3171389762276117796346852628630585968494} +#define T_32768_1685 {0.9482573839163490614367901798686943948269,-0.3175026517182523155824469540675636380911} +#define T_32768_1713 {0.9465390747836140983295649675710592418909,-0.3225891813247613315596140637353528290987} +#define T_32768_1715 {0.9464152937819421129717056828667409718037,-0.3229521507834252624924431529507273808122} +#define T_32768_1717 {0.9462913735923316194131871270656120032072,-0.3233150727459800366503372970328200608492} +#define T_32768_1719 {0.9461673142330073726924410948413424193859,-0.3236779471590512380352322452381486073136} +#define T_32768_1747 {0.9444158806370912540018025538302026689053,-0.3287531663732950470979687906947219744325} +#define T_32768_1749 {0.9442897359329444073594572728325147181749,-0.3291153211439572512375661972328089177608} +#define T_32768_1751 {0.9441634523534617740025964849337469786406,-0.3294774275121017392820021996158175170422} +#define T_32768_1753 {0.9440370299172158308209645838360302150249,-0.3298394854244739438087208327488042414188} +#define T_32768_1781 {0.9422525462987140221216009194904472678900,-0.3349031785361101798415006669529248028994} +#define T_32768_1783 {0.9421240432535785691570140443218406289816,-0.3352645032262278057544335752027109265327} +#define T_32768_1785 
{0.9419954016516125516744750711950473487377,-0.3356257786094762884943065728293731808662} +#define T_32768_1787 {0.9418666215117352802366212927154265344143,-0.3359870046327233517047261557308956980705} +#define T_32768_1815 {0.9400491637159573699022985238116234540939,-0.3410389564212996660330645681824535131454} +#define T_32768_1817 {0.9399183077915550521552745522058103233576,-0.3413994356736104185401359245588537305593} +#define T_32768_1819 {0.9397873136347165701209860344533808529377,-0.3417598647167963132886825405876152217388} +#define T_32768_1821 {0.9396561812647070688342409994220361113548,-0.3421202434978495854345226234727306291461} +#define T_32768_1849 {0.9378058265384531244635013536026235669851,-0.3471602392419512184140728550119092687964} +#define T_32768_1851 {0.9376726232965094665061656087345909327269,-0.3475198577351261142176497287437086924911} +#define T_32768_1853 {0.9375392821523992337162667354277800768614,-0.3478794251190545083751715083053568378091} +#define T_32768_1855 {0.9374058031257329615115736487496178597212,-0.3482389413408553124895661312621086835861} +#define T_32768_1883 {0.9355226301140099343101041995396371930838,-0.3532667668272312377908406233473215252161} +#define T_32768_1885 {0.9353870852160177662426576716825366020203,-0.3536255092765259733234017858194420114160} +#define T_32768_1887 {0.9352514027519898132112530220183543860912,-0.3539841997186247701812078503280645236373} +#define T_32768_1889 {0.9351155827418808907935954266577027738094,-0.3543428381007755478826481976284412667155} +#define T_32768_1917 {0.9331996714845607332478039097622968256474,-0.3593582796334431339602133448352105915546} +#define T_32768_1919 {0.9330617906915393833244820598338264971972,-0.3597161307913475658537549861648585647345} +#define T_32768_1921 {0.9329237726744601388872979441657662391663,-0.3600739290463170205747189811518182978034} +#define T_32768_1923 {0.9327856174536210964731708372710272669792,-0.3604316743457307570253078665700741112232} +#define T_32768_1951 
{0.9308370493820381508243144708103500306606,-0.3654345187550583906599399597325827926397} +#define T_32768_1953 {0.9306968385542888633210623083868995308876,-0.3657914634119445729254493926418945193291} +#define T_32768_1955 {0.9305564908502917953470046086295042186975,-0.3661483542724053852346344228863017633557} +#define T_32768_1957 {0.9304160062906875472421575068437959998846,-0.3665051912839533687638038372824667021632} +#define T_32768_1985 {0.9284348642241779803185863784165121614933,-0.3714952259357207631218500409886473789811} +#define T_32768_1987 {0.9282923293210345594417276515741832554340,-0.3718512489204894855276961607160046696663} +#define T_32768_1989 {0.9281496578952711518084583985910285264254,-0.3722072172176288940015354000934166833758} +#define T_32768_1991 {0.9280068499678699733834719154401682317257,-0.3725631307747872544844369713246123865247} +#define T_32768_2019 {0.9259932181102514814341475357650779187679,-0.3775401435792229420940202544443309307098} +#define T_32768_2021 {0.9258483651898272714930726579041220247746,-0.3778952297599485476986558296630391851068} +#define T_32768_2023 {0.9257033761062132271035807207226753234863,-0.3782502603641652028088060433219652622938} +#define T_32768_2025 {0.9255582508807326247435298682830762118101,-0.3786052353396591740875010145828127861023} +#define T_32768_2053 {0.9235122148167256295181459790910594165325,-0.3835690147604549626869641087978379800916} +#define T_32768_2055 {0.9233650500356557211389940675871912389994,-0.3839231490450283890147886722843395546079} +#define T_32768_2057 {0.9232177494566135012021845795970875769854,-0.3842772268665755053262955698301084339619} +#define T_32768_2059 {0.9230703131012624185203208071470726281404,-0.3846312481730226329190713840944226831198} +#define T_32768_2087 {0.9209919597928523105068165932607371360064,-0.3895815832363243025326937640784308314323} +#define T_32768_2089 {0.9208424894060320831101762450998649001122,-0.3899347505730947349711357219348428770900} +#define T_32768_2091 
{0.9206928835922291165516639921406749635935,-0.3902878605627212449391549853316973894835} +#define T_32768_2093 {0.9205431423734454776663937991543207317591,-0.3906409131532724288149438507389277219772} +#define T_32768_2121 {0.9184325601561869056865816673962399363518,-0.3955775934566468365893854297610232606530} +#define T_32768_2123 {0.9182807905165061290730932341830339282751,-0.3959297788350612523622373828402487561107} +#define T_32768_2125 {0.9181288858265879149200827669119462370872,-0.3962819059846515168388236816099379211664} +#define T_32768_2127 {0.9179768461087727260405699780676513910294,-0.3966339748536308329462940491794142872095} +#define T_32768_2155 {0.9158341246880347119585508153249975293875,-0.4015567905750085930804971212637610733509} +#define T_32768_2157 {0.9156800622461076510916200277279131114483,-0.4019079790262497509090167113754432648420} +#define T_32768_2159 {0.9155258651364285293894340611586812883615,-0.4022591083694614932930733175453497096896} +#define T_32768_2161 {0.9153715333816747623529863631119951605797,-0.4026101785530036836213696460617939010262} +#define T_32768_2189 {0.9131967638288281952085867487767245620489,-0.4075189204595969227895579933829139918089} +#define T_32768_2191 {0.9130404151327191630826973778312094509602,-0.4078690970572197982413342742802342399955} +#define T_32768_2193 {0.9128839321570672016292746775434352457523,-0.4082192136701201512494208145653828978539} +#define T_32768_2195 {0.9127273149248859018811685928085353225470,-0.4085692702468067816212737852765712887049} +#define T_32768_2223 {0.9105205896734327453145851904992014169693,-0.4134637297040024694894100321107544004917} +#define T_32768_2225 {0.9103619613683779920876304458943195641041,-0.4138128795645683033477268963906681165099} +#define T_32768_2227 {0.9102031991776965424989498387731146067381,-0.4141619685662681349391789353830972686410} +#define T_32768_2229 {0.9100443031247373859571325738215819001198,-0.4145109966577618099137225726735778152943} +#define T_32768_2257 
{0.9078057159663819319916910899337381124496,-0.4193909656379888883925843856559367850423} +#define T_32768_2259 {0.9076448147945070932252065176726318895817,-0.4197390739216982358250618290185229852796} +#define T_32768_2261 {0.9074837801366125722424271771160420030355,-0.4200871204749845322368173583527095615864} +#define T_32768_2263 {0.9073226120163814245600519825529772788286,-0.4204351052466612226332642876514000818133} +#define T_32768_2291 {0.9050522580970435937430806916381698101759,-0.4253003763382326440911640474951127544045} +#define T_32768_2293 {0.9048890908970774749064958086819387972355,-0.4256474282495556482608378701115725561976} +#define T_32768_2295 {0.9047257906163719276548818015726283192635,-0.4259944175615223982411805536685278639197} +#define T_32768_2297 {0.9045623572789432964569300565926823765039,-0.4263413442231018257722041653323685750365} +#define T_32768_2325 {0.9022603330947155386709823687851894646883,-0.4311917106390300014062688660487765446305} +#define T_32768_2327 {0.9020949068016989036422614844923373311758,-0.4315376914273355013484945175150642171502} +#define T_32768_2329 {0.9019293478388794582656373677309602499008,-0.4318836087500122489224452237976947799325} +#define T_32768_2331 {0.9017636562306057257387692516203969717026,-0.4322294625561867165153273617761442437768} +#define T_32768_2359 {0.8994300596236508571479362217360176146030,-0.4370647181429723748813387373957084491849} +#define T_32768_2361 {0.8992623812686419970674478463479317724705,-0.4374096131031547929346459113730816170573} +#define T_32768_2363 {0.8990945706604057674482533002446871250868,-0.4377544437341334138125148456310853362083} +#define T_32768_2365 {0.8989266278236218710162575007416307926178,-0.4380992099851944709953954770753625780344} +#define T_32768_2393 {0.8965615579780149557720392294868361204863,-0.4429191492315889822073415871273027732968} +#define T_32768_2395 {0.8963916346877908170043269819871056824923,-0.4432629437046933795762981844745809212327} +#define T_32768_2397 
{0.8962215795665359197030852556054014712572,-0.4436066729877530256942463893210515379906} +#define T_32768_2399 {0.8960513926392601469217424892121925950050,-0.4439503370302161355809289489116054028273} +#define T_32768_2427 {0.8936549500767725362493365537375211715698,-0.4487547550759560244237889037322020158172} +#define T_32768_2429 {0.8934827890735258515420014191477093845606,-0.4490974344498010473181182078405981883407} +#define T_32768_2431 {0.8933104966670482038892942000529728829861,-0.4494400477755311484884259698446840047836} +#define T_32768_2433 {0.8931380728826783244045373066910542547703,-0.4497825950027586894286457663838518783450} +#define T_32768_2461 {0.8907103594585056294263836207392159849405,-0.4545712876472729480070711360895074903965} +#define T_32768_2463 {0.8905359680595378302214726318197790533304,-0.4549128373570719419127783567091682925820} +#define T_32768_2465 {0.8903614456907238405847238027490675449371,-0.4552543201634930980503668251913040876389} +#define T_32768_2467 {0.8901867923777302404886313524912111461163,-0.4555957360163149783005565041094087064266} +#define T_32768_2495 {0.8877279112761630175398863684677053242922,-0.4603684997274040124892735548201017081738} +#define T_32768_2497 {0.8875512968935733715269975618866737931967,-0.4607089052563840825271768153470475226641} +#define T_32768_2499 {0.8873745519800888548189732318860478699207,-0.4610492430295669019635340646345866844058} +#define T_32768_2501 {0.8871976765617028970467572435154579579830,-0.4613895129968994535119009015033952891827} +#define T_32768_2529 {0.8847077322917410446834196591225918382406,-0.4661461449193859407635898151056608185172} +#define T_32768_2531 {0.8845289024321114634830109935137443244457,-0.4664853917994049026241043520712992176414} +#define T_32768_2533 {0.8843499424860861157782210284494794905186,-0.4668245700740870085532208122458541765809} +#define T_32768_2535 {0.8841708524799843926800235749396961182356,-0.4671636796935497715210772184946108609438} +#define T_32768_2563 
{0.8816499508708952603797115443740040063858,-0.4719039776579002087864012082718545570970} +#define T_32768_2565 {0.8814689131349714434904285553784575313330,-0.4722420514700614857517280142928939312696} +#define T_32768_2567 {0.8812877457626799859724542329786345362663,-0.4725800558302623022299826516245957463980} +#define T_32768_2569 {0.8811064487806651301937677089881617575884,-0.4729179906887927553604811237164540216327} +#define T_32768_2597 {0.8785546969774854497714500212168786674738,-0.4776417532197104742763826834561768919230} +#define T_32768_2599 {0.8783714590598534766030525133828632533550,-0.4779786395949761623747065186762483790517} +#define T_32768_2601 {0.8781880919613922475974732151371426880360,-0.4783154556746095353858549970027524977922} +#define T_32768_2603 {0.8780045957090690800228571788466069847345,-0.4786522014090755505755225840402999892831} +#define T_32768_2631 {0.8754221021680509418061433279945049434900,-0.4833592277340638676541573204303858801723} +#define T_32768_2633 {0.8752366718568108661102655787544790655375,-0.4836949123538650807674343923281412571669} +#define T_32768_2635 {0.8750511128257699722254869811877142637968,-0.4840305258373500119084553716675145551562} +#define T_32768_2637 {0.8748654251022183192532111206674017012119,-0.4843660681351604768707375114900059998035} +#define T_32768_2665 {0.8722522995862198591510150436079129576683,-0.4890561581930560342001967910618986934423} +#define T_32768_2667 {0.8720646847626538633591053439886309206486,-0.4893906267899019169043128840712597593665} +#define T_32768_2669 {0.8718769416857868881365334345900919288397,-0.4897250234127709100917513751483056694269} +#define T_32768_2671 {0.8716890703832297360165171085100155323744,-0.4900593480124838530187503238266799598932} +#define T_32768_2699 {0.8690454239570495342803724270197562873363,-0.4947323024619598719198165781563147902489} +#define T_32768_2701 {0.8688556325952877523732809095236007124186,-0.4950655408200436147225786953640636056662} +#define T_32768_2703 
{0.8686657134521756917777679518621880561113,-0.4953987063695490244974450888548744842410} +#define T_32768_2705 {0.8684756665556441213027483172481879591942,-0.4957317990614779623648189499363070353866} +#define T_32768_2733 {0.8658016115813007562707070974283851683140,-0.5003874192895165773364851702353917062283} +#define T_32768_2735 {0.8656096517479819896223602881946135312319,-0.5007194132453198776033786998596042394638} +#define T_32768_2737 {0.8654175646106944119750892241427209228277,-0.5010513335610380369899985453230328857899} +#define T_32768_2739 {0.8652253501976882033019933260220568627119,-0.5013831801878557703489036612154450267553} +#define T_32768_2767 {0.8625210003296445160358985049242619425058,-0.5060212683181898318807157011178787797689} +#define T_32768_2769 {0.8623268801835730634763876878423616290092,-0.5063520037610848012477049451263155788183} +#define T_32768_2771 {0.8621326332163254946294728142675012350082,-0.5066826647355174850773096295597497373819} +#define T_32768_2773 {0.8619382594564691801863887121726293116808,-0.5070132511928582275118060351815074682236} +#define T_32768_2801 {0.8592037296368019161363349667226430028677,-0.5116336100943813525177006340527441352606} +#define T_32768_2803 {0.8590074574286015174351405221386812627316,-0.5119630729672302038579800864681601524353} +#define T_32768_2805 {0.8588110588874074968757099668437149375677,-0.5122924605464048708469704251911025494337} +#define T_32768_2807 {0.8586145340421041938228086110029835253954,-0.5126217727834629922512021948932670056820} +#define T_32768_2835 {0.8558499404956182443626744316134136170149,-0.5172242060786083062140505717252381145954} +#define T_32768_2837 {0.8556515245673806946413719742849934846163,-0.5175523823783608756698981778754387050867} +#define T_32768_2839 {0.8554529827997018331942058466665912419558,-0.5178804825624276908513365924591198563576} +#define T_32768_2841 {0.8552543152217809696580275158339645713568,-0.5182085065825554615059900243068113923073} +#define T_32768_2869 
{0.8524597754510701008712203474715352058411,-0.5227928186556421996655785733310040086508} +#define T_32768_2871 {0.8522592242360010894586253016314003616571,-0.5231196944339311372118572762701660394669} +#define T_32768_2873 {0.8520585476803916868959731800714507699013,-0.5234464932777578294320619534119032323360} +#define T_32768_2875 {0.8518577458137548408245720565901137888432,-0.5237732151390601664786572655430063605309} +#define T_32768_2903 {0.8490333785942068001162397195002995431423,-0.5283392111446076899738955034990794956684} +#define T_32768_2905 {0.8488307006162675305560583183250855654478,-0.5286647725083414295355055401159916073084} +#define T_32768_2907 {0.8486278978020158625383828621124848723412,-0.5289902561221060395268978027161210775375} +#define T_32768_2909 {0.8484249701812776045528607937740162014961,-0.5293156619380332550406365044182166457176} +#define T_32768_2937 {0.8455708955560262696238282842386979609728,-0.5338631478090426485394459632516372948885} +#define T_32768_2939 {0.8453660994295709718571174562384840101004,-0.5341873809209954915289131349709350615740} +#define T_32768_2941 {0.8451611789763371351824616795056499540806,-0.5345115354707771215814204879279714077711} +#define T_32768_2943 {0.8449561342264620966702182158769574016333,-0.5348356114107146730418662627926096320152} +#define T_32768_2971 {0.8420724735012854456073227993329055607319,-0.5393643938669170356803306276560761034489} +#define T_32768_2973 {0.8418655679306953354412712542398367077112,-0.5396872849463176802231600959203206002712} +#define T_32768_2975 {0.8416585385481447634248297617887146770954,-0.5400100966546840242088478589721489697695} +#define T_32768_2977 {0.8414513853840812629414358525536954402924,-0.5403328289445408216806754353456199169159} +#define T_32768_3005 {0.8385382611222451654242604490718804299831,-0.5448427155006124733560568529355805367231} +#define T_32768_3007 {0.8383292549015582961757786506495904177427,-0.5451642508237293238693155217333696782589} +#define T_32768_3009 
{0.8381201253889914992711851482454221695662,-0.5454857059703225319324815245636273175478} +#define T_32768_3011 {0.8379108726153010611170657284674234688282,-0.5458070808931161366217565955594182014465} +#define T_32768_3039 {0.8349684086323504450533050658123102039099,-0.5502978798668591853271436775685288012028} +#define T_32768_3041 {0.8347573106448882285235413291957229375839,-0.5506180457675843298304130257747601717710} +#define T_32768_3043 {0.8345460898908667601858724083285778760910,-0.5509381306897038754755158151965588331223} +#define T_32768_3045 {0.8343347464013500802693101832119282335043,-0.5512581345861435888622281709103845059872} +#define T_32768_3073 {0.8313630677598459195465352422615978866816,-0.5557296551066334133750501678150612860918} +#define T_32768_3075 {0.8311498869778354281478982557018753141165,-0.5560484379770627150207928934833034873009} +#define T_32768_3077 {0.8309365839598044090053008403629064559937,-0.5563671390702464858435405403724871575832} +#define T_32768_3079 {0.8307231587371228798133415693882852792740,-0.5566857583393138853011805622372776269913} +#define T_32768_3107 {0.8277223917413272236132115722284652292728,-0.5611378103550114238018409196229185909033} +#define T_32768_3109 {0.8275071372255198287604116558213718235493,-0.5614551966460232801381380340899340808392} +#define T_32768_3111 {0.8272917610094258078490270236216019839048,-0.5617725003646254489098055273643694818020} +#define T_32768_3113 {0.8270762631247202678608232417900580912828,-0.5620897214641524808342865071608684957027} +#define T_32768_3141 {0.8240465353152276462012082447472494095564,-0.5665221157509821026110330421943217515945} +#define T_32768_3143 {0.8238292162145139929663173461449332535267,-0.5668380919728133182644569387775845825672} +#define T_32768_3145 {0.8236117759544202554522485115739982575178,-0.5671539848305801045924567915790248662233} +#define T_32768_3147 {0.8233942145669250756156998249934986233711,-0.5674697942778246240180806125863455235958} +#define T_32768_3175 
{0.8203356547152418354329483918263576924801,-0.5718823424472165850573901479947380721569} +#define T_32768_3177 {0.8201162802662628203975714313855860382318,-0.5721968951700355798806185703142546117306} +#define T_32768_3179 {0.8198967852039598103175421783817000687122,-0.5725113637406786804007197133614681661129} +#define T_32768_3181 {0.8196771695606137608791641468997113406658,-0.5728257481128975481254883561632595956326} +#define T_32768_3209 {0.8165899076636848885613062520860694348812,-0.5772182626197949195656633492035325616598} +#define T_32768_3211 {0.8163684871904391959773761300311889499426,-0.5775313784742727163390441091905813664198} +#define T_32768_3213 {0.8161469466550521634218284816597588360310,-0.5778444093920398483277267587254755198956} +#define T_32768_3215 {0.8159252860901055059983377759635914117098,-0.5781573553270593635033947066403925418854} +#define T_32768_3243 {0.8128094533647891584138278631144203245640,-0.5825296494778893219290694105438888072968} +#define T_32768_3245 {0.8125859962782371326284192036837339401245,-0.5828413151557676474823210810427553951740} +#define T_32768_3247 {0.8123624196858291179879074661585036665201,-0.5831528951160105389561749689164571464062} +#define T_32768_3249 {0.8121387236204464787903134492808021605015,-0.5834643893127943181653449755685869604349} +#define T_32768_3277 {0.8089944524979376661022456573846284300089,-0.5878162772734029095644814333354588598013} +#define T_32768_3279 {0.8087689682956008541125925148662645369768,-0.5881264795280598534077398653607815504074} +#define T_32768_3281 {0.8085433651487730077889182211947627365589,-0.5884365952877999017545107562909834086895} +#define T_32768_3283 {0.8083176430906332532444480420963373035192,-0.5887466245070145376416803628671914339066} +#define T_32768_3311 {0.8051450672108342310195894242497161030769,-0.5930779213105654701365665459888987243176} +#define T_32768_3313 {0.8049175654763922604928438886418007314205,-0.5933866469575784785561722856073174625635} +#define T_32768_3315 
{0.8046899453638794996734873166133183985949,-0.5936952853360691895190370814816560596228} +#define T_32768_3317 {0.8044622069067718372892272782337386161089,-0.5940038364006466897748737210349645465612} +#define T_32768_3345 {0.8012614611126125385709428883274085819721,-0.5983143579554826008148893379257060587406} +#define T_32768_3347 {0.8010319515154953329272302653407678008080,-0.5986215938731889174562184052774682641029} +#define T_32768_3349 {0.8008023241117591117799179301073309034109,-0.5989287417524769008281282367534004151821} +#define T_32768_3351 {0.8005725789351747501143563567893579602242,-0.5992358015481745736607877006463240832090} +#define T_32768_3379 {0.7973437992668817031471917289309203624725,-0.6035253646456415488330549123929813504219} +#define T_32768_3381 {0.7971122915618589210851041571004316210747,-0.6038310977756958797257880178221967071295} +#define T_32768_3383 {0.7968806666266758931271851906785741448402,-0.6041367421011775151740152978163678199053} +#define T_32768_3385 {0.7966489244953972592711011202482040971518,-0.6044422975771359674013183393981307744980} +#define T_32768_3413 {0.7933922481847111018993246034369803965092,-0.6087107198993704226808176827034913003445} +#define T_32768_3415 {0.7931587522114771360293161706067621707916,-0.6090149372472998257066478799970354884863} +#define T_32768_3417 {0.7929251395895242637834599008783698081970,-0.6093190650282768183743087320181075483561} +#define T_32768_3419 {0.7926914103532094468818058885517530143261,-0.6096231031975737346684240947070065885782} +#define T_32768_3447 {0.7894069758175529250010526993719395250082,-0.6138702033252514400629706869949586689472} +#define T_32768_3449 {0.7891715015003089028766680712578818202019,-0.6141728919610079939772617763082962483168} +#define T_32768_3451 {0.7889359111207451258351852629857603460550,-0.6144754902712391553265547372575383633375} +#define T_32768_3453 {0.7887002047135096560737110849004238843918,-0.6147779982114420782579600199824199080467} +#define T_32768_3481 
{0.7853881515501035526227724403725005686283,-0.6190035956314886567142252715711947530508} +#define T_32768_3483 {0.7851507088971355630135917635925579816103,-0.6193047426899986884762938643689267337322} +#define T_32768_3485 {0.7849131507731800239113795214507263153791,-0.6196057986682493856633868745120707899332} +#define T_32768_3487 {0.7846754772131743216334598400862887501717,-0.6199067635219647209865456716215703636408} +#define T_32768_3515 {0.7813359461931048688398959711776115000248,-0.6241106786352285107355442050902638584375} +#define T_32768_3517 {0.7810965452963585242684985132655128836632,-0.6244102713169393803482876137422863394022} +#define T_32768_3519 {0.7808570295248645765084916092746425420046,-0.6247097721675280990183409812743775546551} +#define T_32768_3521 {0.7806173989138482927074846884352155029774,-0.6250091811429474564221209220704622566700} +#define T_32768_3549 {0.7772505319760840691856174089480191469193,-0.6291912352718324052958109859901014715433} +#define T_32768_3551 {0.7770091830107352937062614728347398340702,-0.6294892608432567371323784755077213048935} +#define T_32768_3553 {0.7767677197717615111827171858749352395535,-0.6297871938365992017239136657735798507929} +#define T_32768_3555 {0.7765261422946744263029472676862496882677,-0.6300850342080432930913502787007018923759} +#define T_32768_3583 {0.7731320825400330720711394860700238496065,-0.6342450496041033281002796684333588927984} +#define T_32768_3585 {0.7728887957640562245131832241895608603954,-0.6345414953983600225839722952514421194792} +#define T_32768_3587 {0.7726453953204338587212873790122102946043,-0.6348378478715100969864693070121575146914} +#define T_32768_3589 {0.7724018812449624515892310228082351386547,-0.6351341069799691929631535458611324429512} +#define T_32768_3617 {0.7689807729300288663409901346312835812569,-0.6392719068314635100236387188488151878119} +#define T_32768_3619 {0.7687355586837603116379113998846150934696,-0.6395667602488163128171549942635465413332} +#define T_32768_3621 
{0.7684902313806567519804957555606961250305,-0.6398615196060040144487857105559669435024} +#define T_32768_3623 {0.7682447910567982152230115389102138578892,-0.6401561848596765136321096179017331451178} +#define T_32768_3651 {0.7647967795877934626957994623808190226555,-0.6442715932990837890415036781632807105780} +#define T_32768_3653 {0.7645496482934921456120491711772046983242,-0.6445648418074767516117162813316099345684} +#define T_32768_3655 {0.7643024045579717151710497091698925942183,-0.6448579955206437119841211824677884578705} +#define T_32768_3657 {0.7640550484175939738307192783395294100046,-0.6451510543954711573988447526062373071909} +#define T_32768_3685 {0.7605802803441944481832592828141059726477,-0.6492438965069649015049435547553002834320} +#define T_32768_3687 {0.7603312425055990253497384401271119713783,-0.6495355276425547330632070952560752630234} +#define T_32768_3689 {0.7600820928461793357655551517382264137268,-0.6498270632518871048333153339626733213663} +#define T_32768_3691 {0.7598328314025775132023454716545529663563,-0.6501185032920862028049668879248201847076} +#define T_32768_3719 {0.7563314544116869209133824369928333908319,-0.6541886051189690354235040103958453983068} +#define T_32768_3721 {0.7560805206135691181046354358841199427843,-0.6544786064866553454422160029935184866190} +#define T_32768_3723 {0.7558294756197747599202330093248747289181,-0.6547685116011126016388743664720095694065} +#define T_32768_3725 {0.7555783194672245350886896630981937050819,-0.6550583204197049091987992142094299197197} +#define T_32768_3753 {0.7520504823766963609088520570367109030485,-0.6591055089718022008682396517542656511068} +#define T_32768_3755 {0.7517976632844114437403959527728147804737,-0.6593938682457538558168153031147085130215} +#define T_32768_3757 {0.7515447336263236799425158096710219979286,-0.6596821305435961457774851623980794101954} +#define T_32768_3759 {0.7512916934396308699106725725869182497263,-0.6599702958229345384211228520143777132034} +#define T_32768_3787 
{0.7477375461919433252688804714125581085682,-0.6639943990839466403386381898599211126566} +#define T_32768_3789 {0.7474828525509765730916456050181295722723,-0.6642811040081262330403433225001208484173} +#define T_32768_3791 {0.7472280489787798130052465239714365452528,-0.6645677112374375195003040062147192656994} +#define T_32768_3793 {0.7469731355128267358267635245283599942923,-0.6648542207297296613432990852743387222290} +#define T_32768_3821 {0.7433928291687099676465777520206756889820,-0.6688550676645436121603438550664577633142} +#define T_32768_3823 {0.7431362718042198167722744983620941638947,-0.6691401060532276012793317931937053799629} +#define T_32768_3825 {0.7428796051477450923528067505685612559319,-0.6694250460324369100817420985549688339233} +#define T_32768_3827 {0.7426228292370333772254298310144804418087,-0.6697098875602658374361908499849960207939} +#define T_32768_3855 {0.7390165159690487151067372906254604458809,-0.6736873081222243264676308172056451439857} +#define T_32768_3857 {0.7387581057854069044310563185717910528183,-0.6739706678605216216837447973375674337149} +#define T_32768_3859 {0.7384995869536711277092422278656158596277,-0.6742539284789204057091183130978606641293} +#define T_32768_3861 {0.7382409595118613054864908917807042598724,-0.6745370899357620020353465406515169888735} +#define T_32768_3889 {0.7346087925979335464532482546928804367781,-0.6784909150738911431943733987282030284405} +#define T_32768_3891 {0.7343485405782615993430795242602471262217,-0.6787725841182576935750603297492489218712} +#define T_32768_3893 {0.7340881805590040443121324642561376094818,-0.6790541533365148652023890463169664144516} +#define T_32768_3895 {0.7338277125784518073459139486658386886120,-0.6793356226872525605031682971457485109568} +#define T_32768_3923 {0.7301698463953548712268570852756965905428,-0.6832656843534467006051613680028822273016} +#define T_32768_3925 {0.7299077636010571357161325067863799631596,-0.6835456507321975294644289533607661724091} +#define T_32768_3927 
{0.7296455734602724785275995600386522710323,-0.6838255165828707182740231473871972411871} +#define T_32768_3929 {0.7293832760115609437079342569631990045309,-0.6841052818643070798643179841747041791677} +#define T_32768_3957 {0.7256998660283561219941361741803120821714,-0.6880114130204716405003750878677237778902} +#define T_32768_3959 {0.7254359635986498133775057794991880655289,-0.6882896648342893275440701472689397633076} +#define T_32768_3961 {0.7251719544801179528903389837068971246481,-0.6885678154223342506412564034690149128437} +#define T_32768_3963 {0.7249078387115878152613390739134047180414,-0.6888458647436991322265953385795000940561} +#define T_32768_3991 {0.7211990414830157236636409834318328648806,-0.6927278993688498198721958942769560962915} +#define T_32768_3993 {0.7209333306344575253987727592175360769033,-0.6930044247912908739195358975848648697138} +#define T_32768_3995 {0.7206675137592694069255117028660606592894,-0.6932808482945661543439541674160864204168} +#define T_32768_3997 {0.7204015908965447634315637515101116150618,-0.6935571698380222915858439591829665005207} +#define T_32768_4025 {0.7166675640563718863162989691772963851690,-0.6974149429353417861676689426531083881855} +#define T_32768_4027 {0.7164000560823809982124998896324541419744,-0.6976897302133387990252799681911710649729} +#define T_32768_4029 {0.7161324427484623278417075198376551270485,-0.6979644148831087857587363032507710158825} +#define T_32768_4031 {0.7158647240939735034714885841822251677513,-0.6982389969042542832156073018268216401339} +#define T_32768_4059 {0.7121056263482918868845672477618791162968,-0.7020723445081046287796766591782215982676} +#define T_32768_4061 {0.7118363326186700756892378194606862962246,-0.7023453819624658756026747141731902956963} +#define T_32768_4063 {0.7115669342003007047381402117025572806597,-0.7026183161239001329079201241256669163704} +#define T_32768_4065 {0.7112974311328039700441649983986280858517,-0.7028911469522673982623928168322890996933} +#define T_32768_4093 
{0.7075134222532862837695688540406990796328,-0.7066999061351594280111498846963513642550} +#define T_32768_4095 {0.7072423542137346030855837852868717163801,-0.7069711821610653590397532752831466495991} +#define T_32768_4097 {0.7069711821610653590397532752831466495991,-0.7072423542137346030855837852868717163801} +#define T_32768_4131 {0.7023453819624658756026747141731902956963,-0.7118363326186700756892378194606862962246} +#define T_32768_4165 {0.6976897302133387990252799681911710649729,-0.7164000560823809982124998896324541419744} +#define T_32768_4199 {0.6930044247912908739195358975848648697138,-0.7209333306344575253987727592175360769033} +#define T_32768_4233 {0.6882896648342893275440701472689397633076,-0.7254359635986498133775057794991880655289} +#define T_32768_4267 {0.6835456507321975294644289533607661724091,-0.7299077636010571357161325067863799631596} +#define T_32768_4301 {0.6787725841182576935750603297492489218712,-0.7343485405782615993430795242602471262217} +#define T_32768_4335 {0.6739706678605216216837447973375674337149,-0.7387581057854069044310563185717910528183} +#define T_32768_4369 {0.6691401060532276012793317931937053799629,-0.7431362718042198167722744983620941638947} +#define T_32768_4403 {0.6642811040081262330403433225001208484173,-0.7474828525509765730916456050181295722723} +#define T_32768_4437 {0.6593938682457538558168153031147085130215,-0.7517976632844114437403959527728147804737} +#define T_32768_4471 {0.6544786064866553454422160029935184866190,-0.7560805206135691181046354358841199427843} +#define T_32768_4505 {0.6495355276425547330632070952560752630234,-0.7603312425055990253497384401271119713783} +#define T_32768_4539 {0.6445648418074767516117162813316099345684,-0.7645496482934921456120491711772046983242} +#define T_32768_4573 {0.6395667602488163128171549942635465413332,-0.7687355586837603116379113998846150934696} +#define T_32768_4607 {0.6345414953983600225839722952514421194792,-0.7728887957640562245131832241895608603954} +#define T_32768_4641 
{0.6294892608432567371323784755077213048935,-0.7770091830107352937062614728347398340702} +#define T_32768_4675 {0.6244102713169393803482876137422863394022,-0.7810965452963585242684985132655128836632} +#define T_32768_4709 {0.6193047426899986884762938643689267337322,-0.7851507088971355630135917635925579816103} +#define T_32768_4743 {0.6141728919610079939772617763082962483168,-0.7891715015003089028766680712578818202019} +#define T_32768_4777 {0.6090149372472998257066478799970354884863,-0.7931587522114771360293161706067621707916} +#define T_32768_4811 {0.6038310977756958797257880178221967071295,-0.7971122915618589210851041571004316210747} +#define T_32768_4845 {0.5986215938731889174562184052774682641029,-0.8010319515154953329272302653407678008080} +#define T_32768_4879 {0.5933866469575784785561722856073174625635,-0.8049175654763922604928438886418007314205} +#define T_32768_4913 {0.5881264795280598534077398653607815504074,-0.8087689682956008541125925148662645369768} +#define T_32768_4947 {0.5828413151557676474823210810427553951740,-0.8125859962782371326284192036837339401245} +#define T_32768_4981 {0.5775313784742727163390441091905813664198,-0.8163684871904391959773761300311889499426} +#define T_32768_5015 {0.5721968951700355798806185703142546117306,-0.8201162802662628203975714313855860382318} +#define T_32768_5049 {0.5668380919728133182644569387775845825672,-0.8238292162145139929663173461449332535267} +#define T_32768_5083 {0.5614551966460232801381380340899340808392,-0.8275071372255198287604116558213718235493} +#define T_32768_5117 {0.5560484379770627150207928934833034873009,-0.8311498869778354281478982557018753141165} +#define T_32768_5151 {0.5506180457675843298304130257747601717710,-0.8347573106448882285235413291957229375839} +#define T_32768_5185 {0.5451642508237293238693155217333696782589,-0.8383292549015582961757786506495904177427} +#define T_32768_5219 {0.5396872849463176802231600959203206002712,-0.8418655679306953354412712542398367077112} +#define T_32768_5253 
{0.5341873809209954915289131349709350615740,-0.8453660994295709718571174562384840101004} +#define T_32768_5287 {0.5286647725083414295355055401159916073084,-0.8488307006162675305560583183250855654478} +#define T_32768_5321 {0.5231196944339311372118572762701660394669,-0.8522592242360010894586253016314003616571} +#define T_32768_5355 {0.5175523823783608756698981778754387050867,-0.8556515245673806946413719742849934846163} +#define T_32768_5389 {0.5119630729672302038579800864681601524353,-0.8590074574286015174351405221386812627316} +#define T_32768_5423 {0.5063520037610848012477049451263155788183,-0.8623268801835730634763876878423616290092} +#define T_32768_5457 {0.5007194132453198776033786998596042394638,-0.8656096517479819896223602881946135312319} +#define T_32768_5491 {0.4950655408200436147225786953640636056662,-0.8688556325952877523732809095236007124186} +#define T_32768_5525 {0.4893906267899019169043128840712597593665,-0.8720646847626538633591053439886309206486} +#define T_32768_5559 {0.4836949123538650807674343923281412571669,-0.8752366718568108661102655787544790655375} +#define T_32768_5593 {0.4779786395949761623747065186762483790517,-0.8783714590598534766030525133828632533550} +#define T_32768_5627 {0.4722420514700614857517280142928939312696,-0.8814689131349714434904285553784575313330} +#define T_32768_5661 {0.4664853917994049026241043520712992176414,-0.8845289024321114634830109935137443244457} +#define T_32768_5695 {0.4607089052563840825271768153470475226641,-0.8875512968935733715269975618866737931967} +#define T_32768_5729 {0.4549128373570719419127783567091682925820,-0.8905359680595378302214726318197790533304} +#define T_32768_5763 {0.4490974344498010473181182078405981883407,-0.8934827890735258515420014191477093845606} +#define T_32768_5797 {0.4432629437046933795762981844745809212327,-0.8963916346877908170043269819871056824923} +#define T_32768_5831 {0.4374096131031547929346459113730816170573,-0.8992623812686419970674478463479317724705} +#define T_32768_5865 
{0.4315376914273355013484945175150642171502,-0.9020949068016989036422614844923373311758} +#define T_32768_5899 {0.4256474282495556482608378701115725561976,-0.9048890908970774749064958086819387972355} +#define T_32768_5933 {0.4197390739216982358250618290185229852796,-0.9076448147945070932252065176726318895817} +#define T_32768_5967 {0.4138128795645683033477268963906681165099,-0.9103619613683779920876304458943195641041} +#define T_32768_6001 {0.4078690970572197982413342742802342399955,-0.9130404151327191630826973778312094509602} +#define T_32768_6035 {0.4019079790262497509090167113754432648420,-0.9156800622461076510916200277279131114483} +#define T_32768_6069 {0.3959297788350612523622373828402487561107,-0.9182807905165061290730932341830339282751} +#define T_32768_6103 {0.3899347505730947349711357219348428770900,-0.9208424894060320831101762450998649001122} +#define T_32768_6137 {0.3839231490450283890147886722843395546079,-0.9233650500356557211389940675871912389994} +#define T_32768_6171 {0.3778952297599485476986558296630391851068,-0.9258483651898272714930726579041220247746} +#define T_32768_6205 {0.3718512489204894855276961607160046696663,-0.9282923293210345594417276515741832554340} +#define T_32768_6239 {0.3657914634119445729254493926418945193291,-0.9306968385542888633210623083868995308876} +#define T_32768_6273 {0.3597161307913475658537549861648585647345,-0.9330617906915393833244820598338264971972} +#define T_32768_6307 {0.3536255092765259733234017858194420114160,-0.9353870852160177662426576716825366020203} +#define T_32768_6341 {0.3475198577351261142176497287437086924911,-0.9376726232965094665061656087345909327269} +#define T_32768_6375 {0.3413994356736104185401359245588537305593,-0.9399183077915550521552745522058103233576} +#define T_32768_6409 {0.3352645032262278057544335752027109265327,-0.9421240432535785691570140443218406289816} +#define T_32768_6443 {0.3291153211439572512375661972328089177608,-0.9442897359329444073594572728325147181749} +#define T_32768_6477 
{0.3229521507834252624924431529507273808122,-0.9464152937819421129717056828667409718037} +#define T_32768_6511 {0.3167752540957973206303677216055803000927,-0.9485006264586982593911557160026859492064} +#define T_32768_6545 {0.3105848936156444528577935670909937471151,-0.9505456453310165976233747642254456877708} +#define T_32768_6579 {0.3043813324497848804561783708777511492372,-0.9525502634801449319823518635530490428209} +#define T_32768_6613 {0.2981648342661009087883883239555871114135,-0.9545143957044694982272403649403713643551} +#define T_32768_6647 {0.2919356632823327801773416467767674475908,-0.9564379585231361780017778073670342564583} +#define T_32768_6681 {0.2856940842548483794338665120449149981141,-0.9583208701795988826432903806562535464764} +#define T_32768_6715 {0.2794403624673905683906127705995459109545,-0.9601630506450939961382573528680950403214} +#define T_32768_6749 {0.2731747637198019273974125553650083020329,-0.9619644216220423205143674749706406146288} +#define T_32768_6783 {0.2668975543167274033784508446842664852738,-0.9637249065473765252676230375072918832302} +#define T_32768_6817 {0.2606090010562957526296656851627631112933,-0.9654444305957954330921211294480599462986} +#define T_32768_6851 {0.2543093712187800004009829990536672994494,-0.9671229206829443647563948616152629256248} +#define T_32768_6885 {0.2479989325552371393079909012158168479800,-0.9687603054685214321040120921679772436619} +#define T_32768_6919 {0.2416779532761280102626244570274138823152,-0.9703565153593094461115242665982805192471} +#define T_32768_6953 {0.2353467020399178655232219625759171321988,-0.9719114825121339951152776848175562918186} +#define T_32768_6987 {0.2290054479416573640637722064639092423022,-0.9734251408367470270732724202389363199472} +#define T_32768_7021 {0.2226544605015455258190115728211821988225,-0.9748974259986358248397664283402264118195} +#define T_32768_7055 {0.2162940096534743672496148292339057661593,-0.9763282754217572634303223821916617453098} +#define T_32768_7089 
{0.2099243657335558843612943746848031878471,-0.9777176282911974602995996974641457200050} +#define T_32768_7123 {0.2035457994686322158450764163717394694686,-0.9790654255557569296541942094336263835430} +#define T_32768_7157 {0.1971585819647688753164516128890682011843,-0.9803716099304597975105934892781078815460} +#define T_32768_7191 {0.1907629846957321073652735776704503223300,-0.9816361258989890758996921249490696936846} +#define T_32768_7225 {0.1843592794914505061942833208377123810351,-0.9828589197160461088387251038511749356985} +#define T_32768_7259 {0.1779477385264615629800744045496685430408,-0.9840399394096349672267365349398460239172} +#define T_32768_7293 {0.1715286343083434195122549681400414556265,-0.9851791347832711265297689351427834481001} +#define T_32768_7327 {0.1651022396661326607780750919118872843683,-0.9862764574181150933895878551993519067764} +#define T_32768_7361 {0.1586688277387283130259731933620059862733,-0.9873318606750304260444295323395635932684} +#define T_32768_7395 {0.1522286719632827411974318465581745840609,-0.9883452996965661485617715698026586323977} +#define T_32768_7429 {0.1457820460635798343052016434739925898612,-0.9893167314088630037716143306170124560595} +#define T_32768_7463 {0.1393292240384009783582541786017827689648,-0.9902461145234839889894828957039862871170} +#define T_32768_7497 {0.1328704801498794274561277006796444766223,-0.9911334095391701737298717489466071128845} +#define T_32768_7531 {0.1264060889118433783639972034507081843913,-0.9919785787435185797633607762691099196672} +#define T_32768_7565 {0.1199363250781484702134349618063424713910,-0.9927815862145855652087789167126175016165} +#define T_32768_7599 {0.1134614636309999452512542461590783204883,-0.9935423978224136032366686777095310389996} +#define T_32768_7633 {0.1069817797692652339147656448403722606599,-0.9942609812304817884509589021035935729742} +#define T_32768_7667 {0.1004975488967772140336265351834299508482,-0.9949373058970800709488457869156263768673} +#define T_32768_7701 
{0.0940090466106288380476740940139279700816,-0.9955713430766077731703944664332084357738} +#define T_32768_7735 {0.0875165486894595306965882741678797174245,-0.9961630658207949462479291469207964837551} +#define T_32768_7769 {0.0810203310817338706595336361715453676879,-0.9967124489798480091451438056537881493568} +#define T_32768_7803 {0.0745206698940130002339898851460020523518,-0.9972194692035186713852112916356418281794} +#define T_32768_7837 {0.0680178413792193875542224645869282539934,-0.9976841049420960283455883654823992401361} +#define T_32768_7871 {0.0615121219248953854386030570822185836732,-0.9981063364473230503648437661468051373959} +#define T_32768_7905 {0.0550037880414559268715812834216194460168,-0.9984861457732353562377625166845973581076} +#define T_32768_7939 {0.0484931163504361759097882611513341544196,-0.9988235167769244915447757193760480731726} +#define T_32768_7973 {0.0419803835727343560568769476049055811018,-0.9991184351192234913696665898896753787994} +#define T_32768_8007 {0.0354658665168503528519750034320168197155,-0.9993708882653171698962069058325141668320} +#define T_32768_8041 {0.0289498420671206388443685852962516946718,-0.9995808654852736951923475317016709595919} +#define T_32768_8075 {0.0224325871719499338186043502219035872258,-0.9997483578545017790517590583476703613997} +#define T_32768_8109 {0.0159143788320401796676506478434021119028,-0.9998733582541292630452289813547395169735} +#define T_32768_8143 {0.0093954940886172514519225629214815853629,-0.9999558613713060983840819062606897205114} +#define T_32768_8177 {0.0028762100116559792713222254434413116542,-0.9999958636994299432387833803659304976463} +#define T_32768_8211 {-0.0036431963118960680216018577226577690453,-0.9999933635382951546688445887411944568157} +#define T_32768_8245 {-0.0101624477898955151156101806009246502072,-0.9999483609941653972086328394652809947729} +#define T_32768_8279 {-0.0166812673367803324109281959408690454438,-0.9998608579797685358414582879049703478813} +#define 
T_32768_8313 {-0.0231993778853467197409443656397343147546,-0.9997308582142160338079861503501888364553} +#define T_32768_8347 {-0.0297165023985251908200666548509616404772,-0.9995583672228443017360177691443823277950} +#define T_32768_8381 {-0.0362323638811553952465693839712912449613,-0.9993433923369802185376897796231787651777} +#define T_32768_8415 {-0.0427466853917591316225355058122659102082,-0.9990859426936292697618569036421831697226} +#define T_32768_8449 {-0.0492591900543111402277496324586536502466,-0.9987860292350876356692879198817536234856} +#define T_32768_8483 {-0.0557696010700070299304087484415504150093,-0.9984436647084763416515329481626395136118} +#define T_32768_8517 {-0.0622776417290279785121676070502871880308,-0.9980588636652002465510236106638330966234} +#define T_32768_8551 {-0.0687830354223016443571125932976428885013,-0.9976316424603293153694494321825914084911} +#define T_32768_8585 {-0.0752855056532587690876212604962347541004,-0.9971620192519032865874351045931689441204} +#define T_32768_8619 {-0.0817847760495850756301550177340686786920,-0.9966500140001600671624260030512232333422} +#define T_32768_8653 {-0.0882805703749677400660189618974982295185,-0.9960956484666873000932696413656231015921} +#define T_32768_8687 {-0.0947726125408362429780595448391977697611,-0.9954989462134977706853078416315838694572} +#define T_32768_8721 {-0.1012606266180968439538645498032565228641,-0.9948599326020273192483500679372809827328} +#define T_32768_8755 {-0.1077443368488602837285128543953760527074,-0.9941786347920575916958796369726769626141} +#define T_32768_8789 {-0.1142234676581622698776641300355549901724,-0.9934550817405609635102337051648646593094} +#define T_32768_8823 {-0.1206977436656761076827493184282502625138,-0.9926893042004707456982259827782399952412} +#define T_32768_8857 {-0.1271668896974171569791423053175094537437,-0.9918813347193730090012309119629207998514} +#define T_32768_8891 
{-0.1336306307974383655867711695464095100760,-0.9910312076381241341849204218306113034487} +#define T_32768_8925 {-0.1400886922395166989119275058328639715910,-0.9901389590893906467172769225726369768381} +#define T_32768_8959 {-0.1465407995388297579530956227245042100549,-0.9892046269961137783255367139645386487246} +#define T_32768_8993 {-0.1529866784636220722326527265977347269654,-0.9882282510698974231644342580693773925304} +#define T_32768_9027 {-0.1594260550468606096874424338238895870745,-0.9872098728093208208633768663275986909866} +#define T_32768_9061 {-0.1658586555978792953514044938856386579573,-0.9861495354981738570288030132360290735960} +#define T_32768_9095 {-0.1722842067140114008516604826581897214055,-0.9850472842036182008484956895699724555016} +#define T_32768_9129 {-0.1787024352922099990070847752576810307801,-0.9839031657742715042402892322570551186800} +#define T_32768_9163 {-0.1851130685406555398397898670737049542367,-0.9827172288382159948127991810906678438187} +#define T_32768_9197 {-0.1915158339903502149326186554390005767345,-0.9814895238009321287719899373769294470549} +#define T_32768_9231 {-0.1979104595066986937990094475026126019657,-0.9802201028431560825282531368429772555828} +#define T_32768_9265 {-0.2042966733010743995979652254391112364829,-0.9789090199186613050486016618378926068544} +#define T_32768_9299 {-0.2106742039423714352164296315095270983875,-0.9775563307519664624223310056549962610006} +#define T_32768_9333 {-0.2170427803685410217404694321885472163558,-0.9761620928359661109041667259589303284883} +#define T_32768_9367 {-0.2234021318981123938041122301001450978220,-0.9747263654294873180816694002714939415455} +#define T_32768_9401 {-0.2297519882416974856820246486677206121385,-0.9732492095547712329661749208753462880850} +#define T_32768_9435 {-0.2360920795134789085256699081583064980805,-0.9717306879948791609180602790729608386755} +#define T_32768_9469 {-0.2424221362426809134316130212027928791940,-0.9701708652910244756739643889886792749166} 
+#define T_32768_9503 {-0.2487418893850224799191295232958509586751,-0.9685698077398289251860319382103625684977} +#define T_32768_9537 {-0.2550510703341525298171177382755558937788,-0.9669275833905056627415319780993741005659} +#define T_32768_9571 {-0.2613494109330663506263192630285630002618,-0.9652442620419657837160798408149275928736} +#define T_32768_9605 {-0.2676366434855030895789695932762697339058,-0.9635199152398531419194682712259236723185} +#define T_32768_9639 {-0.2739125007673233191951567278010770678520,-0.9617546162735020054412871104432269930840} +#define T_32768_9673 {-0.2801767160378669796472195230307988822460,-0.9599484401728232141337571192707400768995} +#define T_32768_9707 {-0.2864290230512906987314636353403329849243,-0.9581014637051147309065868284960743039846} +#define T_32768_9741 {-0.2926691560678834624908972728007938712835,-0.9562137653717984742129942787869367748499} +#define T_32768_9775 {-0.2988968498653618022231626127904746681452,-0.9542854254050846529722207378654275089502} +#define T_32768_9809 {-0.3051118397501421108941599413810763508081,-0.9523165257645609393932772945845499634743} +#define T_32768_9843 {-0.3113138615685909771357842146244365721941,-0.9503071501337092552574858927982859313488} +#define T_32768_9877 {-0.3175026517182523155824469540675636380911,-0.9482573839163490614367901798686943948269} +#define T_32768_9911 {-0.3236779471590512380352322452381486073136,-0.9461673142330073726924410948413424193859} +#define T_32768_9945 {-0.3298394854244739438087208327488042414188,-0.9440370299172158308209645838360302150249} +#define T_32768_9979 {-0.3359870046327233517047261557308956980705,-0.9418666215117352802366212927154265344143} +#define T_32768_10013 {-0.3421202434978495854345226234727306291461,-0.9396561812647070688342409994220361113548} +#define T_32768_10047 {-0.3482389413408553124895661312621086835861,-0.9374058031257329615115736487496178597212} +#define T_32768_10081 
{-0.3543428381007755478826481976284412667155,-0.9351155827418808907935954266577027738094} +#define T_32768_10115 {-0.3604316743457307570253078665700741112232,-0.9327856174536210964731708372710272669792} +#define T_32768_10149 {-0.3665051912839533687638038372824667021632,-0.9304160062906875472421575068437959998846} +#define T_32768_10183 {-0.3725631307747872544844369713246123865247,-0.9280068499678699733834719154401682317257} +#define T_32768_10217 {-0.3786052353396591740875010145828127861023,-0.9255582508807326247435298682830762118101} +#define T_32768_10251 {-0.3846312481730226329190713840944226831198,-0.9230703131012624185203208071470726281404} +#define T_32768_10285 {-0.3906409131532724288149438507389277219772,-0.9205431423734454776663937991543207317591} +#define T_32768_10319 {-0.3966339748536308329462940491794142872095,-0.9179768461087727260405699780676513910294} +#define T_32768_10353 {-0.4026101785530036836213696460617939010262,-0.9153715333816747623529863631119951605797} +#define T_32768_10387 {-0.4085692702468067816212737852765712887049,-0.9127273149248859018811685928085353225470} +#define T_32768_10421 {-0.4145109966577618099137225726735778152943,-0.9100443031247373859571325738215819001198} +#define T_32768_10455 {-0.4204351052466612226332642876514000818133,-0.9073226120163814245600519825529772788286} +#define T_32768_10489 {-0.4263413442231018257722041653323685750365,-0.9045623572789432964569300565926823765039} +#define T_32768_10523 {-0.4322294625561867165153273617761442437768,-0.9017636562306057257387692516203969717026} +#define T_32768_10557 {-0.4380992099851944709953954770753625780344,-0.8989266278236218710162575007416307926178} +#define T_32768_10591 {-0.4439503370302161355809289489116054028273,-0.8960513926392601469217424892121925950050} +#define T_32768_10625 {-0.4497825950027586894286457663838518783450,-0.8931380728826783244045373066910542547703} +#define T_32768_10659 
{-0.4555957360163149783005565041094087064266,-0.8901867923777302404886313524912111461163} +#define T_32768_10693 {-0.4613895129968994535119009015033952891827,-0.8871976765617028970467572435154579579830} +#define T_32768_10727 {-0.4671636796935497715210772184946108609438,-0.8841708524799843926800235749396961182356} +#define T_32768_10761 {-0.4729179906887927553604811237164540216327,-0.8811064487806651301937677089881617575884} +#define T_32768_10795 {-0.4786522014090755505755225840402999892831,-0.8780045957090690800228571788466069847345} +#define T_32768_10829 {-0.4843660681351604768707375114900059998035,-0.8748654251022183192532111206674017012119} +#define T_32768_10863 {-0.4900593480124838530187503238266799598932,-0.8716890703832297360165171085100155323744} +#define T_32768_10897 {-0.4957317990614779623648189499363070353866,-0.8684756665556441213027483172481879591942} +#define T_32768_10931 {-0.5013831801878557703489036612154450267553,-0.8652253501976882033019933260220568627119} +#define T_32768_10965 {-0.5070132511928582275118060351815074682236,-0.8619382594564691801863887121726293116808} +#define T_32768_10999 {-0.5126217727834629922512021948932670056820,-0.8586145340421041938228086110029835253954} +#define T_32768_11033 {-0.5182085065825554615059900243068113923073,-0.8552543152217809696580275158339645713568} +#define T_32768_11067 {-0.5237732151390601664786572655430063605309,-0.8518577458137548408245720565901137888432} +#define T_32768_11101 {-0.5293156619380332550406365044182166457176,-0.8484249701812776045528607937740162014961} +#define T_32768_11135 {-0.5348356114107146730418662627926096320152,-0.8449561342264620966702182158769574016333} +#define T_32768_11169 {-0.5403328289445408216806754353456199169159,-0.8414513853840812629414358525536954402924} +#define T_32768_11203 {-0.5458070808931161366217565955594182014465,-0.8379108726153010611170657284674234688282} +#define T_32768_11237 
{-0.5512581345861435888622281709103845059872,-0.8343347464013500802693101832119282335043} +#define T_32768_11271 {-0.5566857583393138853011805622372776269913,-0.8307231587371228798133415693882852792740} +#define T_32768_11305 {-0.5620897214641524808342865071608684957027,-0.8270762631247202678608232417900580912828} +#define T_32768_11339 {-0.5674697942778246240180806125863455235958,-0.8233942145669250756156998249934986233711} +#define T_32768_11373 {-0.5728257481128975481254883561632595956326,-0.8196771695606137608791641468997113406658} +#define T_32768_11407 {-0.5781573553270593635033947066403925418854,-0.8159252860901055059983377759635914117098} +#define T_32768_11441 {-0.5834643893127943181653449755685869604349,-0.8121387236204464787903134492808021605015} +#define T_32768_11475 {-0.5887466245070145376416803628671914339066,-0.8083176430906332532444480420963373035192} +#define T_32768_11509 {-0.5940038364006466897748737210349645465612,-0.8044622069067718372892272782337386161089} +#define T_32768_11543 {-0.5992358015481745736607877006463240832090,-0.8005725789351747501143563567893579602242} +#define T_32768_11577 {-0.6044422975771359674013183393981307744980,-0.7966489244953972592711011202482040971518} +#define T_32768_11611 {-0.6096231031975737346684240947070065885782,-0.7926914103532094468818058885517530143261} +#define T_32768_11645 {-0.6147779982114420782579600199824199080467,-0.7887002047135096560737110849004238843918} +#define T_32768_11679 {-0.6199067635219647209865456716215703636408,-0.7846754772131743216334598400862887501717} +#define T_32768_11713 {-0.6250091811429474564221209220704622566700,-0.7806173989138482927074846884352155029774} +#define T_32768_11747 {-0.6300850342080432930913502787007018923759,-0.7765261422946744263029472676862496882677} +#define T_32768_11781 {-0.6351341069799691929631535458611324429512,-0.7724018812449624515892310228082351386547} +#define T_32768_11815 
{-0.6401561848596765136321096179017331451178,-0.7682447910567982152230115389102138578892} +#define T_32768_11849 {-0.6451510543954711573988447526062373071909,-0.7640550484175939738307192783395294100046} +#define T_32768_11883 {-0.6501185032920862028049668879248201847076,-0.7598328314025775132023454716545529663563} +#define T_32768_11917 {-0.6550583204197049091987992142094299197197,-0.7555783194672245350886896630981937050819} +#define T_32768_11951 {-0.6599702958229345384211228520143777132034,-0.7512916934396308699106725725869182497263} +#define T_32768_11985 {-0.6648542207297296613432990852743387222290,-0.7469731355128267358267635245283599942923} +#define T_32768_12019 {-0.6697098875602658374361908499849960207939,-0.7426228292370333772254298310144804418087} +#define T_32768_12053 {-0.6745370899357620020353465406515169888735,-0.7382409595118613054864908917807042598724} +#define T_32768_12087 {-0.6793356226872525605031682971457485109568,-0.7338277125784518073459139486658386886120} +#define T_32768_12121 {-0.6841052818643070798643179841747041791677,-0.7293832760115609437079342569631990045309} +#define T_32768_12155 {-0.6888458647436991322265953385795000940561,-0.7249078387115878152613390739134047180414} +#define T_32768_12189 {-0.6935571698380222915858439591829665005207,-0.7204015908965447634315637515101116150618} +#define T_32768_12223 {-0.6982389969042542832156073018268216401339,-0.7158647240939735034714885841822251677513} +#define T_32768_12257 {-0.7028911469522673982623928168322890996933,-0.7112974311328039700441649983986280858517} +#define T_32768_12291 {-0.7075134222532862837695688540406990796328,-0.7066999061351594280111498846963513642550} +#define T_32768_12325 {-0.7121056263482918868845672477618791162968,-0.7020723445081046287796766591782215982676} +#define T_32768_12359 {-0.7166675640563718863162989691772963851690,-0.6974149429353417861676689426531083881855} +#define T_32768_12393 
{-0.7211990414830157236636409834318328648806,-0.6927278993688498198721958942769560962915} +#define T_32768_12427 {-0.7256998660283561219941361741803120821714,-0.6880114130204716405003750878677237778902} +#define T_32768_12461 {-0.7301698463953548712268570852756965905428,-0.6832656843534467006051613680028822273016} +#define T_32768_12495 {-0.7346087925979335464532482546928804367781,-0.6784909150738911431943733987282030284405} +#define T_32768_12529 {-0.7390165159690487151067372906254604458809,-0.6736873081222243264676308172056451439857} +#define T_32768_12563 {-0.7433928291687099676465777520206756889820,-0.6688550676645436121603438550664577633142} +#define T_32768_12597 {-0.7477375461919433252688804714125581085682,-0.6639943990839466403386381898599211126566} +#define T_32768_12631 {-0.7520504823766963609088520570367109030485,-0.6591055089718022008682396517542656511068} +#define T_32768_12665 {-0.7563314544116869209133824369928333908319,-0.6541886051189690354235040103958453983068} +#define T_32768_12699 {-0.7605802803441944481832592828141059726477,-0.6492438965069649015049435547553002834320} +#define T_32768_12733 {-0.7647967795877934626957994623808190226555,-0.6442715932990837890415036781632807105780} +#define T_32768_12767 {-0.7689807729300288663409901346312835812569,-0.6392719068314635100236387188488151878119} +#define T_32768_12801 {-0.7731320825400330720711394860700238496065,-0.6342450496041033281002796684333588927984} +#define T_32768_12835 {-0.7772505319760840691856174089480191469193,-0.6291912352718324052958109859901014715433} +#define T_32768_12869 {-0.7813359461931048688398959711776115000248,-0.6241106786352285107355442050902638584375} +#define T_32768_12903 {-0.7853881515501035526227724403725005686283,-0.6190035956314886567142252715711947530508} +#define T_32768_12937 {-0.7894069758175529250010526993719395250082,-0.6138702033252514400629706869949586689472} +#define T_32768_12971 
{-0.7933922481847111018993246034369803965092,-0.6087107198993704226808176827034913003445} +#define T_32768_13005 {-0.7973437992668817031471917289309203624725,-0.6035253646456415488330549123929813504219} +#define T_32768_13039 {-0.8012614611126125385709428883274085819721,-0.5983143579554826008148893379257060587406} +#define T_32768_13073 {-0.8051450672108342310195894242497161030769,-0.5930779213105654701365665459888987243176} +#define T_32768_13107 {-0.8089944524979376661022456573846284300089,-0.5878162772734029095644814333354588598013} +#define T_32768_13141 {-0.8128094533647891584138278631144203245640,-0.5825296494778893219290694105438888072968} +#define T_32768_13175 {-0.8165899076636848885613062520860694348812,-0.5772182626197949195656633492035325616598} +#define T_32768_13209 {-0.8203356547152418354329483918263576924801,-0.5718823424472165850573901479947380721569} +#define T_32768_13243 {-0.8240465353152276462012082447472494095564,-0.5665221157509821026110330421943217515945} +#define T_32768_13277 {-0.8277223917413272236132115722284652292728,-0.5611378103550114238018409196229185909033} +#define T_32768_13311 {-0.8313630677598459195465352422615978866816,-0.5557296551066334133750501678150612860918} +#define T_32768_13345 {-0.8349684086323504450533050658123102039099,-0.5502978798668591853271436775685288012028} +#define T_32768_13379 {-0.8385382611222451654242604490718804299831,-0.5448427155006124733560568529355805367231} +#define T_32768_13413 {-0.8420724735012854456073227993329055607319,-0.5393643938669170356803306276560761034489} +#define T_32768_13447 {-0.8455708955560262696238282842386979609728,-0.5338631478090426485394459632516372948885} +#define T_32768_13481 {-0.8490333785942068001162397195002995431423,-0.5283392111446076899738955034990794956684} +#define T_32768_13515 {-0.8524597754510701008712203474715352058411,-0.5227928186556421996655785733310040086508} +#define T_32768_13549 
{-0.8558499404956182443626744316134136170149,-0.5172242060786083062140505717252381145954} +#define T_32768_13583 {-0.8592037296368019161363349667226430028677,-0.5116336100943813525177006340527441352606} +#define T_32768_13617 {-0.8625210003296445160358985049242619425058,-0.5060212683181898318807157011178787797689} +#define T_32768_13651 {-0.8658016115813007562707070974283851683140,-0.5003874192895165773364851702353917062283} +#define T_32768_13685 {-0.8690454239570495342803724270197562873363,-0.4947323024619598719198165781563147902489} +#define T_32768_13719 {-0.8722522995862198591510150436079129576683,-0.4890561581930560342001967910618986934423} +#define T_32768_13753 {-0.8754221021680509418061433279945049434900,-0.4833592277340638676541573204303858801723} +#define T_32768_13787 {-0.8785546969774854497714500212168786674738,-0.4776417532197104742763826834561768919230} +#define T_32768_13821 {-0.8816499508708952603797115443740040063858,-0.4719039776579002087864012082718545570970} +#define T_32768_13855 {-0.8847077322917410446834196591225918382406,-0.4661461449193859407635898151056608185172} +#define T_32768_13889 {-0.8877279112761630175398863684677053242922,-0.4603684997274040124892735548201017081738} +#define T_32768_13923 {-0.8907103594585056294263836207392159849405,-0.4545712876472729480070711360895074903965} +#define T_32768_13957 {-0.8936549500767725362493365537375211715698,-0.4487547550759560244237889037322020158172} +#define T_32768_13991 {-0.8965615579780149557720392294868361204863,-0.4429191492315889822073415871273027732968} +#define T_32768_14025 {-0.8994300596236508571479362217360176146030,-0.4370647181429723748813387373957084491849} +#define T_32768_14059 {-0.9022603330947155386709823687851894646883,-0.4311917106390300014062688660487765446305} +#define T_32768_14093 {-0.9050522580970435937430806916381698101759,-0.4253003763382326440911640474951127544045} +#define T_32768_14127 
{-0.9078057159663819319916910899337381124496,-0.4193909656379888883925843856559367850423} +#define T_32768_14161 {-0.9105205896734327453145851904992014169693,-0.4134637297040024694894100321107544004917} +#define T_32768_14195 {-0.9131967638288281952085867487767245620489,-0.4075189204595969227895579933829139918089} +#define T_32768_14229 {-0.9158341246880347119585508153249975293875,-0.4015567905750085930804971212637610733509} +#define T_32768_14263 {-0.9184325601561869056865816673962399363518,-0.3955775934566468365893854297610232606530} +#define T_32768_14297 {-0.9209919597928523105068165932607371360064,-0.3895815832363243025326937640784308314323} +#define T_32768_14331 {-0.9235122148167256295181459790910594165325,-0.3835690147604549626869641087978379800916} +#define T_32768_14365 {-0.9259932181102514814341475357650779187679,-0.3775401435792229420940202544443309307098} +#define T_32768_14399 {-0.9284348642241779803185863784165121614933,-0.3714952259357207631218500409886473789811} +#define T_32768_14433 {-0.9308370493820381508243144708103500306606,-0.3654345187550583906599399597325827926397} +#define T_32768_14467 {-0.9331996714845607332478039097622968256474,-0.3593582796334431339602133448352105915546} +#define T_32768_14501 {-0.9355226301140099343101041995396371930838,-0.3532667668272312377908406233473215252161} +#define T_32768_14535 {-0.9378058265384531244635013536026235669851,-0.3471602392419512184140728550119092687964} +#define T_32768_14569 {-0.9400491637159573699022985238116234540939,-0.3410389564212996660330645681824535131454} +#define T_32768_14603 {-0.9422525462987140221216009194904472678900,-0.3349031785361101798415006669529248028994} +#define T_32768_14637 {-0.9444158806370912540018025538302026689053,-0.3287531663732950470979687906947219744325} +#define T_32768_14671 {-0.9465390747836140983295649675710592418909,-0.3225891813247613315596140637353528290987} +#define T_32768_14705 
{-0.9486220384968729879560100926028098911047,-0.3164114853763010382081688476318959146738} +#define T_32768_14739 {-0.9506646832453589102129853927181102335453,-0.3102203410964559093798698086175136268139} +#define T_32768_14773 {-0.9526669222112261747881234441592823714018,-0.3040160116253575739442283065727679058909} +#define T_32768_14807 {-0.9546286702939826840363934934430290013552,-0.2977987606635435491320151868421817198396} +#define T_32768_14841 {-0.9565498441141067065274228298221714794636,-0.2915688524607490950124599748960463330150} +#define T_32768_14875 {-0.9584303620165909309847052099939901381731,-0.2853265518046758653092354052205337211490} +#define T_32768_14909 {-0.9602701440744128014159741724142804741859,-0.2790721240097378541555883657565573230386} +#define T_32768_14943 {-0.9620691120919315775239510912797413766384,-0.2728058349057848053220709516608621925116} +#define T_32768_14977 {-0.9638271896082123424420728952100034803152,-0.2665279508268036945395351722254417836666} +#define T_32768_15011 {-0.9655443019002751814383600503788329660892,-0.2602387385995988400289036235335515812039} +#define T_32768_15045 {-0.9672203759862714189665666708606295287609,-0.2539384655324510853269259769149357452989} +#define T_32768_15079 {-0.9688553406285855817969832060043700039387,-0.2476273994037562764525262082315748557448} +#define T_32768_15113 {-0.9704491263368630882268917048349976539612,-0.2413058084506443656813701181818032637239} +#define T_32768_15147 {-0.9720016653709638854152785825135651975870,-0.2349739613575782815058090591264772228897} +#define T_32768_15181 {-0.9735128917438413687079901137622073292732,-0.2286321272449342301147368061720044352114} +#define T_32768_15215 {-0.9749827412243471380648429658322129398584,-0.2222805756575633728822083412524079903960} +#define T_32768_15249 {-0.9764111513399610364771774584369268268347,-0.2159195765533354904874840940465219318867} +#define T_32768_15283 
{-0.9777980613794463593535510881338268518448,-0.2095494002916649667334070272772805765271} +#define T_32768_15317 {-0.9791434123954302348735723171557765454054,-0.2031703176220197859525029571159393526614} +#define T_32768_15351 {-0.9804471472069090642875721641757991164923,-0.1967825996724141268678920368984108790755} +#define T_32768_15385 {-0.9817092104016788001175086719740647822618,-0.1903865179378844696422845572669757530093} +#define T_32768_15419 {-0.9829295483386901732814067145227454602718,-0.1839823442689505206271149972963030450046} +#define T_32768_15453 {-0.9841081091503285360744257559417746961117,-0.1775703508600607050116337859435589052737} +#define T_32768_15487 {-0.9852448427446185430511604863568209111691,-0.1711508102380232820838301677213166840374} +#define T_32768_15521 {-0.9863397008073530036753595595655497163534,-0.1647239952504231663699130194800090976059} +#define T_32768_15555 {-0.9873926368041462398039698200591374188662,-0.1582901790540251762973156246516737155616} +#define T_32768_15589 {-0.9884036059824123920947158694616518914700,-0.1518496351031641822260098706465214490891} +#define T_32768_15623 {-0.9893725653732670100026780346524901688099,-0.1454026371381225701817641038360306993127} +#define T_32768_15657 {-0.9902994737933535907004056753066834062338,-0.1389494591734954931361301078140968456864} +#define T_32768_15691 {-0.9911842918465941787431461307278368622065,-0.1324903754865445482113983644012478180230} +#define T_32768_15725 {-0.9920269819258633603453745308797806501389,-0.1260256606055403238997314474545419216156} +#define T_32768_15759 {-0.9928275082145867624916490967734716832638,-0.1195555892980941087300195135867397766560} +#define T_32768_15793 {-0.9935858366882639458594894676934927701950,-0.1130804365594796356830897821055259555578} +#define T_32768_15827 {-0.9943019351159135821305312674667220562696,-0.1066004776009449594997846588739776052535} +#define T_32768_15861 
{-0.9949757730614441353367283227271400392056,-0.1001159878380153134269647807741421274841} +#define T_32768_15895 {-0.9956073218849470496394360452541150152683,-0.0936272428787872090794053292484022676945} +#define T_32768_15929 {-0.9961965547439142198982153786346316337585,-0.0871345185122143067735223098679853137583} +#define T_32768_15963 {-0.9967434465943788568509376091242302209139,-0.0806380906963857085889557652080839034170} +#define T_32768_15997 {-0.9972479741919798579274925032223109155893,-0.0741382355467969794693416929476370569319} +#define T_32768_16031 {-0.9977101160929495726747973094461485743523,-0.0676352293246144792293605974009551573545} +#define T_32768_16065 {-0.9981298526550256289269213993975427001715,-0.0611293484249335883351506026883726008236} +#define T_32768_16099 {-0.9985071660382854874526969979342538863420,-0.0546208693650311050138412838350632227957} +#define T_32768_16133 {-0.9988420402059048353038406276027671992779,-0.0481100687726125908483254534075967967510} +#define T_32768_16167 {-0.9991344609248391517297704922384582459927,-0.0415972233740549007796616365340014453977} +#define T_32768_16201 {-0.9993844157664285576814222622488159686327,-0.0350826099826446191620732406590832397342} +#define T_32768_16235 {-0.9995918941069259489040632615797221660614,-0.0285665054868127314680048556283509242348} +#define T_32768_16269 {-0.9997568871279490787529198314587119966745,-0.0220491868383661353270053240294146235101} +#define T_32768_16303 {-0.9998793878168549253970809331804048269987,-0.0155309310407164488654352396679314551875} +#define T_32768_16337 {-0.9999593909670374536347026150906458497047,-0.0090120151371066332945947152666121837683} +#define T_32768_16371 {-0.9999968931781498815425379689258988946676,-0.0024927161988359080972699199918451995472} +#define T_32768_16405 {-0.9999918928562480102684162375226151198149,0.0040266886865165116629605712716966081643} +#define T_32768_16439 
{-0.9999443902138590578587695745227392762899,0.0105459224268683784270228542823133466300} +#define T_32768_16473 {-0.9998543872699718892960163429961539804935,0.0170647079374115599736949633324911701493} +#define T_32768_16507 {-0.9997218878499513072810600533557590097189,0.0235827681523888936510058300655146013014} +#define T_32768_16541 {-0.9995468975853759596716940905025694519281,0.0300998260368702010014718695174451568164} +#define T_32768_16575 {-0.9993294239137984202869802174973301589489,0.0366156045985270295339830681768944486976} +#define T_32768_16609 {-0.9990694760784293304567427185247652232647,0.0431298268994055461478254187568381894380} +#define T_32768_16643 {-0.9987670651277443800708510934782680124044,0.0496422160676971632731202532795578008518} +#define T_32768_16677 {-0.9984222039150150163067110042902640998363,0.0561524953095062992480812624762620544061} +#define T_32768_16711 {-0.9980349070977617698119388478517066687346,0.0626603879206148739466897268357570283115} +#define T_32768_16745 {-0.9976051911371316416321519682242069393396,0.0691656172982429845452756467238941695541} +#define T_32768_16779 {-0.9971330742971981075939424954412970691919,0.0756679069528052444804089304852823261172} +#define T_32768_16813 {-0.9966185766441850724106643610866740345955,0.0821669805196622998755273670212773140520} +#define T_32768_16847 {-0.9960617200456139963549162530398461967707,0.0886625617708671626138183796683733817190} +#define T_32768_16881 {-0.9954625281693744165423254344204906374216,0.0951543746269054857567226690662209875882} +#define T_32768_16915 {-0.9948210264827178628266324267315212637186,0.1016421431684298298803170723658695351332} +#define T_32768_16949 {-0.9941372422511757234175888697791378945112,0.1081255916479868656177032448795216623694} +#define T_32768_16983 {-0.9934112045374000610209463957289699465036,0.1146044445017374152628875094706017989665} +#define T_32768_17017 {-0.9926429441999288227904685300018172711134,0.1210784263611686534245492907757579814643} 
+#define T_32768_17051 {-0.9918324938918737787574286812741775065660,0.1275472620647979671293370529383537359536} +#define T_32768_17085 {-0.9909798880595327430498286958027165383101,0.1340106766698681284299965454920311458409} +#define T_32768_17119 {-0.9900851629409259668790355135570280253887,0.1404683954640330023622141197847668081522} +#define T_32768_17153 {-0.9891483565642555930708113010041415691376,0.1469201439770336237167214221699396148324} +#define T_32768_17187 {-0.9881695087462890603191567606700118631124,0.1533656479923638793483320341692888177931} +#define T_32768_17221 {-0.9871486610906675673859922426345292478800,0.1598046335589254351994270564318867400289} +#define T_32768_17255 {-0.9860858569861368216891150950687006115913,0.1662368270026714223153163629831396974623} +#define T_32768_17289 {-0.9849811416047039580590194418618921190500,0.1726619549382382712288119819277198985219} +#define T_32768_17323 {-0.9838345618997166308616897367755882441998,0.1790797442805653894026818306883797049522} +#define T_32768_17357 {-0.9826461666038680542456518196559045463800,0.1854899222565018768182909525421564467251} +#define T_32768_17391 {-0.9814160062271255480226272993604652583599,0.1918922164164002519548546388250542804599} diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32.hpp.inc new file mode 100644 index 0000000000000..487ca279a895b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32.hpp.inc @@ -0,0 +1,29 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. 
Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +#ifndef CUFFTDX_DATABASE_LUT_FP32_INC_HPP + +#define CUFFTDX_DATABASE_LUT_FP32_INC_HPP + +#include "lut_defines.hpp.inc" + +#ifdef _MSC_VER +// truncation of twiddle values is intended +#pragma warning(disable : 4305) +#pragma warning(disable : 4838) +#endif + + + +#include "lut_fp32_0.hpp.inc" + + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32_0.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32_0.hpp.inc new file mode 100644 index 0000000000000..5abda8553d00a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp32_0.hpp.inc @@ -0,0 +1,370 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +static const __device__ float2 lut_sp_2_2[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_3_3[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_2_4[2*2] = { + T_2_0,T_4_1,T_2_0,T_2_1 +}; +static const __device__ float2 lut_sp_4_4[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_5_5[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_6_6[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_7_7[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_2_8[4*2] = { + T_2_0,T_8_1,T_4_1,T_8_3,T_2_0,T_4_1,T_2_1,T_4_3 +}; +static const __device__ float2 lut_sp_4_8[2*2] = { + T_2_0,T_8_1,T_2_0,T_8_3 +}; +static const __device__ float2 lut_sp_8_8[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_3_9[3*2] = { + T_2_0,T_9_1,T_9_2,T_2_0,T_9_2,T_9_4 +}; +static const __device__ float2 lut_sp_9_9[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_10_10[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_11_11[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_12_12[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_13_13[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_14_14[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_15_15[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_2_16[8*2] = { + T_2_0,T_16_1,T_8_1,T_16_3,T_4_1,T_16_5,T_8_3,T_16_7,T_2_0,T_8_1,T_4_1,T_8_3,T_2_1,T_8_5,T_4_3,T_8_7 +}; +static const __device__ float2 lut_sp_4_16[4*2] = { + T_2_0,T_16_1,T_8_1,T_16_3,T_2_0,T_16_3,T_8_3,T_16_9 +}; +static const __device__ float2 lut_sp_8_16[2*2] = { + T_2_0,T_16_1,T_2_0,T_16_5 +}; +static const __device__ float2 lut_sp_16_16[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_17_17[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_18_18[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_19_19[1*2] = { + 
T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_20_20[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_21_21[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_22_22[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_23_23[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_24_24[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_5_25[5*2] = { + T_2_0,T_25_1,T_25_2,T_25_3,T_25_4,T_2_0,T_25_3,T_25_6,T_25_9,T_25_12 +}; +static const __device__ float2 lut_sp_25_25[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_26_26[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_3_27[9*2] = { + T_2_0,T_27_1,T_27_2,T_9_1,T_27_4,T_27_5,T_9_2,T_27_7,T_27_8,T_2_0,T_27_2,T_27_4,T_9_2,T_27_8,T_27_10,T_9_4,T_27_14,T_27_16 +}; +static const __device__ float2 lut_sp_9_27[3*2] = { + T_2_0,T_27_1,T_27_2,T_2_0,T_27_5,T_27_10 +}; +static const __device__ float2 lut_sp_27_27[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_28_28[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_29_29[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_30_30[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_31_31[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_2_32[16*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_4_1,T_32_9,T_16_5,T_32_11,T_8_3,T_32_13,T_16_7,T_32_15,T_2_0,T_16_1,T_8_1,T_16_3,T_4_1,T_16_5,T_8_3,T_16_7,T_2_1,T_16_9,T_8_5,T_16_11,T_4_3,T_16_13,T_8_7,T_16_15 +}; +static const __device__ float2 lut_sp_4_32[8*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_2_0,T_32_3,T_16_3,T_32_9,T_8_3,T_32_15,T_16_9,T_32_21 +}; +static const __device__ float2 lut_sp_8_32[4*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_2_0,T_32_5,T_16_5,T_32_15 +}; +static const __device__ float2 lut_sp_16_32[2*2] = { + T_2_0,T_32_1,T_2_0,T_32_9 +}; +static const __device__ float2 lut_sp_32_32[1*2] = { + 
T_2_0,T_2_0 +}; +static const __device__ float2 lut_sp_6_36[6*2] = { + T_2_0,T_36_1,T_18_1,T_12_1,T_9_1,T_36_5,T_2_0,T_9_1,T_9_2,T_3_1,T_9_4,T_9_5 +}; +static const __device__ float2 lut_sp_7_49[7*2] = { + T_2_0,T_49_1,T_49_2,T_49_3,T_49_4,T_49_5,T_49_6,T_2_0,T_49_4,T_49_8,T_49_12,T_49_16,T_49_20,T_49_24 +}; +static const __device__ float2 lut_sp_2_64[32*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_4_1,T_64_17,T_32_9,T_64_19,T_16_5,T_64_21,T_32_11,T_64_23,T_8_3,T_64_25,T_32_13,T_64_27,T_16_7,T_64_29,T_32_15,T_64_31,T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_4_1,T_32_9,T_16_5,T_32_11,T_8_3,T_32_13,T_16_7,T_32_15,T_2_1,T_32_17,T_16_9,T_32_19,T_8_5,T_32_21,T_16_11,T_32_23,T_4_3,T_32_25,T_16_13,T_32_27,T_8_7,T_32_29,T_16_15,T_32_31 +}; +static const __device__ float2 lut_sp_4_64[16*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_2_0,T_64_3,T_32_3,T_64_9,T_16_3,T_64_15,T_32_9,T_64_21,T_8_3,T_64_27,T_32_15,T_64_33,T_16_9,T_64_39,T_32_21,T_64_45 +}; +static const __device__ float2 lut_sp_8_64[8*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_2_0,T_64_5,T_32_5,T_64_15,T_16_5,T_64_25,T_32_15,T_64_35 +}; +static const __device__ float2 lut_sp_16_64[4*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_2_0,T_64_9,T_32_9,T_64_27 +}; +static const __device__ float2 lut_sp_32_64[2*2] = { + T_2_0,T_64_1,T_2_0,T_64_17 +}; +static const __device__ float2 lut_sp_3_81[27*2] = { + T_2_0,T_81_1,T_81_2,T_27_1,T_81_4,T_81_5,T_27_2,T_81_7,T_81_8,T_9_1,T_81_10,T_81_11,T_27_4,T_81_13,T_81_14,T_27_5,T_81_16,T_81_17,T_9_2,T_81_19,T_81_20,T_27_7,T_81_22,T_81_23,T_27_8,T_81_25,T_81_26,T_2_0,T_81_2,T_81_4,T_27_2,T_81_8,T_81_10,T_27_4,T_81_14,T_81_16,T_9_2,T_81_20,T_81_22,T_27_8,T_81_26,T_81_28,T_27_10,T_81_32,T_81_34,T_9_4,T_81_38,T_81_40,T_27_14,T_81_44,T_81_46,T_27_16,T_81_50,T_81_52 +}; +static const __device__ float2 
lut_sp_9_81[9*2] = { + T_2_0,T_81_1,T_81_2,T_27_1,T_81_4,T_81_5,T_27_2,T_81_7,T_81_8,T_2_0,T_81_5,T_81_10,T_27_5,T_81_20,T_81_25,T_27_10,T_81_35,T_81_40 +}; +static const __device__ float2 lut_sp_27_81[3*2] = { + T_2_0,T_81_1,T_81_2,T_2_0,T_81_14,T_81_28 +}; +static const __device__ float2 lut_sp_10_100[10*2] = { + T_2_0,T_100_1,T_50_1,T_100_3,T_25_1,T_20_1,T_50_3,T_100_7,T_25_2,T_100_9,T_2_0,T_50_3,T_25_3,T_50_9,T_25_6,T_10_3,T_25_9,T_50_21,T_25_12,T_50_27 +}; +static const __device__ float2 lut_sp_11_121[11*2] = { + T_2_0,T_121_1,T_121_2,T_121_3,T_121_4,T_121_5,T_121_6,T_121_7,T_121_8,T_121_9,T_121_10,T_2_0,T_121_6,T_121_12,T_121_18,T_121_24,T_121_30,T_121_36,T_121_42,T_121_48,T_121_54,T_121_60 +}; +static const __device__ float2 lut_sp_5_125[25*2] = { + T_2_0,T_125_1,T_125_2,T_125_3,T_125_4,T_25_1,T_125_6,T_125_7,T_125_8,T_125_9,T_25_2,T_125_11,T_125_12,T_125_13,T_125_14,T_25_3,T_125_16,T_125_17,T_125_18,T_125_19,T_25_4,T_125_21,T_125_22,T_125_23,T_125_24,T_2_0,T_125_3,T_125_6,T_125_9,T_125_12,T_25_3,T_125_18,T_125_21,T_125_24,T_125_27,T_25_6,T_125_33,T_125_36,T_125_39,T_125_42,T_25_9,T_125_48,T_125_51,T_125_54,T_125_57,T_25_12,T_125_63,T_125_66,T_125_69,T_125_72 +}; +static const __device__ float2 lut_sp_25_125[5*2] = { + T_2_0,T_125_1,T_125_2,T_125_3,T_125_4,T_2_0,T_125_13,T_125_26,T_125_39,T_125_52 +}; +static const __device__ float2 lut_sp_2_128[64*2] = { + 
T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_4_1,T_128_33,T_64_17,T_128_35,T_32_9,T_128_37,T_64_19,T_128_39,T_16_5,T_128_41,T_64_21,T_128_43,T_32_11,T_128_45,T_64_23,T_128_47,T_8_3,T_128_49,T_64_25,T_128_51,T_32_13,T_128_53,T_64_27,T_128_55,T_16_7,T_128_57,T_64_29,T_128_59,T_32_15,T_128_61,T_64_31,T_128_63,T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_4_1,T_64_17,T_32_9,T_64_19,T_16_5,T_64_21,T_32_11,T_64_23,T_8_3,T_64_25,T_32_13,T_64_27,T_16_7,T_64_29,T_32_15,T_64_31,T_2_1,T_64_33,T_32_17,T_64_35,T_16_9,T_64_37,T_32_19,T_64_39,T_8_5,T_64_41,T_32_21,T_64_43,T_16_11,T_64_45,T_32_23,T_64_47,T_4_3,T_64_49,T_32_25,T_64_51,T_16_13,T_64_53,T_32_27,T_64_55,T_8_7,T_64_57,T_32_29,T_64_59,T_16_15,T_64_61,T_32_31,T_64_63 +}; +static const __device__ float2 lut_sp_4_128[32*2] = { + T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_2_0,T_128_3,T_64_3,T_128_9,T_32_3,T_128_15,T_64_9,T_128_21,T_16_3,T_128_27,T_64_15,T_128_33,T_32_9,T_128_39,T_64_21,T_128_45,T_8_3,T_128_51,T_64_27,T_128_57,T_32_15,T_128_63,T_64_33,T_128_69,T_16_9,T_128_75,T_64_39,T_128_81,T_32_21,T_128_87,T_64_45,T_128_93 +}; +static const __device__ float2 lut_sp_8_128[16*2] = { + T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_2_0,T_128_5,T_64_5,T_128_15,T_32_5,T_128_25,T_64_15,T_128_35,T_16_5,T_128_45,T_64_25,T_128_55,T_32_15,T_128_65,T_64_35,T_128_75 +}; +static const __device__ float2 lut_sp_16_128[8*2] = { + 
T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_2_0,T_128_9,T_64_9,T_128_27,T_32_9,T_128_45,T_64_27,T_128_63 +}; +static const __device__ float2 lut_sp_32_128[4*2] = { + T_2_0,T_128_1,T_64_1,T_128_3,T_2_0,T_128_17,T_64_17,T_128_51 +}; +static const __device__ float2 lut_sp_12_144[12*2] = { + T_2_0,T_144_1,T_72_1,T_48_1,T_36_1,T_144_5,T_24_1,T_144_7,T_18_1,T_16_1,T_72_5,T_144_11,T_2_0,T_144_7,T_72_7,T_48_7,T_36_7,T_144_35,T_24_7,T_144_49,T_18_7,T_16_7,T_72_35,T_144_77 +}; +static const __device__ float2 lut_sp_6_216[36*2] = { + T_2_0,T_216_1,T_108_1,T_72_1,T_54_1,T_216_5,T_36_1,T_216_7,T_27_1,T_24_1,T_108_5,T_216_11,T_18_1,T_216_13,T_108_7,T_72_5,T_27_2,T_216_17,T_12_1,T_216_19,T_54_5,T_72_7,T_108_11,T_216_23,T_9_1,T_216_25,T_108_13,T_8_1,T_54_7,T_216_29,T_36_5,T_216_31,T_27_4,T_72_11,T_108_17,T_216_35,T_2_0,T_54_1,T_27_1,T_18_1,T_27_2,T_54_5,T_9_1,T_54_7,T_27_4,T_6_1,T_27_5,T_54_11,T_9_2,T_54_13,T_27_7,T_18_5,T_27_8,T_54_17,T_3_1,T_54_19,T_27_10,T_18_7,T_27_11,T_54_23,T_9_4,T_54_25,T_27_13,T_2_1,T_27_14,T_54_29,T_9_5,T_54_31,T_27_16,T_18_11,T_27_17,T_54_35 +}; +static const __device__ float2 lut_sp_3_243[81*2] = { + 
T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_27_1,T_243_10,T_243_11,T_81_4,T_243_13,T_243_14,T_81_5,T_243_16,T_243_17,T_27_2,T_243_19,T_243_20,T_81_7,T_243_22,T_243_23,T_81_8,T_243_25,T_243_26,T_9_1,T_243_28,T_243_29,T_81_10,T_243_31,T_243_32,T_81_11,T_243_34,T_243_35,T_27_4,T_243_37,T_243_38,T_81_13,T_243_40,T_243_41,T_81_14,T_243_43,T_243_44,T_27_5,T_243_46,T_243_47,T_81_16,T_243_49,T_243_50,T_81_17,T_243_52,T_243_53,T_9_2,T_243_55,T_243_56,T_81_19,T_243_58,T_243_59,T_81_20,T_243_61,T_243_62,T_27_7,T_243_64,T_243_65,T_81_22,T_243_67,T_243_68,T_81_23,T_243_70,T_243_71,T_27_8,T_243_73,T_243_74,T_81_25,T_243_76,T_243_77,T_81_26,T_243_79,T_243_80,T_2_0,T_243_2,T_243_4,T_81_2,T_243_8,T_243_10,T_81_4,T_243_14,T_243_16,T_27_2,T_243_20,T_243_22,T_81_8,T_243_26,T_243_28,T_81_10,T_243_32,T_243_34,T_27_4,T_243_38,T_243_40,T_81_14,T_243_44,T_243_46,T_81_16,T_243_50,T_243_52,T_9_2,T_243_56,T_243_58,T_81_20,T_243_62,T_243_64,T_81_22,T_243_68,T_243_70,T_27_8,T_243_74,T_243_76,T_81_26,T_243_80,T_243_82,T_81_28,T_243_86,T_243_88,T_27_10,T_243_92,T_243_94,T_81_32,T_243_98,T_243_100,T_81_34,T_243_104,T_243_106,T_9_4,T_243_110,T_243_112,T_81_38,T_243_116,T_243_118,T_81_40,T_243_122,T_243_124,T_27_14,T_243_128,T_243_130,T_81_44,T_243_134,T_243_136,T_81_46,T_243_140,T_243_142,T_27_16,T_243_146,T_243_148,T_81_50,T_243_152,T_243_154,T_81_52,T_243_158,T_243_160 +}; +static const __device__ float2 lut_sp_9_243[27*2] = { + T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_27_1,T_243_10,T_243_11,T_81_4,T_243_13,T_243_14,T_81_5,T_243_16,T_243_17,T_27_2,T_243_19,T_243_20,T_81_7,T_243_22,T_243_23,T_81_8,T_243_25,T_243_26,T_2_0,T_243_5,T_243_10,T_81_5,T_243_20,T_243_25,T_81_10,T_243_35,T_243_40,T_27_5,T_243_50,T_243_55,T_81_20,T_243_65,T_243_70,T_81_25,T_243_80,T_243_85,T_27_10,T_243_95,T_243_100,T_81_35,T_243_110,T_243_115,T_81_40,T_243_125,T_243_130 +}; +static const __device__ float2 lut_sp_27_243[9*2] = { + 
T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_2_0,T_243_14,T_243_28,T_81_14,T_243_56,T_243_70,T_81_28,T_243_98,T_243_112 +}; +static const __device__ float2 lut_sp_2_256[128*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_4_1,T_256_65,T_128_33,T_256_67,T_64_17,T_256_69,T_128_35,T_256_71,T_32_9,T_256_73,T_128_37,T_256_75,T_64_19,T_256_77,T_128_39,T_256_79,T_16_5,T_256_81,T_128_41,T_256_83,T_64_21,T_256_85,T_128_43,T_256_87,T_32_11,T_256_89,T_128_45,T_256_91,T_64_23,T_256_93,T_128_47,T_256_95,T_8_3,T_256_97,T_128_49,T_256_99,T_64_25,T_256_101,T_128_51,T_256_103,T_32_13,T_256_105,T_128_53,T_256_107,T_64_27,T_256_109,T_128_55,T_256_111,T_16_7,T_256_113,T_128_57,T_256_115,T_64_29,T_256_117,T_128_59,T_256_119,T_32_15,T_256_121,T_128_61,T_256_123,T_64_31,T_256_125,T_128_63,T_256_127,T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_4_1,T_128_33,T_64_17,T_128_35,T_32_9,T_128_37,T_64_19,T_128_39,T_16_5,T_128_41,T_64_21,T_128_43,T_32_11,T_128_45,T_64_23,T_128_47,T_8_3,T_128_49,T_64_25,T_128_51,T_32_13,T_128_53,T_64_27,T_128_55,T_16_7,T_128_57,T_64_29,T_128_59,T_32_15,T_128_61,T_64_31,T_128_63,T_2_1,T_128_65,T_64_33,T_128_67,T_32_17,T_128_69,T_64_35,T_128_71,T_16_9,T_128_73,T_64_37,T_128_75,T_32_19,T_128_77,T_64_39,T_128_79,T_8_5,T_128_81,T_64_41,T_128_83,T_32_21,T
_128_85,T_64_43,T_128_87,T_16_11,T_128_89,T_64_45,T_128_91,T_32_23,T_128_93,T_64_47,T_128_95,T_4_3,T_128_97,T_64_49,T_128_99,T_32_25,T_128_101,T_64_51,T_128_103,T_16_13,T_128_105,T_64_53,T_128_107,T_32_27,T_128_109,T_64_55,T_128_111,T_8_7,T_128_113,T_64_57,T_128_115,T_32_29,T_128_117,T_64_59,T_128_119,T_16_15,T_128_121,T_64_61,T_128_123,T_32_31,T_128_125,T_64_63,T_128_127 +}; +static const __device__ float2 lut_sp_4_256[64*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_2_0,T_256_3,T_128_3,T_256_9,T_64_3,T_256_15,T_128_9,T_256_21,T_32_3,T_256_27,T_128_15,T_256_33,T_64_9,T_256_39,T_128_21,T_256_45,T_16_3,T_256_51,T_128_27,T_256_57,T_64_15,T_256_63,T_128_33,T_256_69,T_32_9,T_256_75,T_128_39,T_256_81,T_64_21,T_256_87,T_128_45,T_256_93,T_8_3,T_256_99,T_128_51,T_256_105,T_64_27,T_256_111,T_128_57,T_256_117,T_32_15,T_256_123,T_128_63,T_256_129,T_64_33,T_256_135,T_128_69,T_256_141,T_16_9,T_256_147,T_128_75,T_256_153,T_64_39,T_256_159,T_128_81,T_256_165,T_32_21,T_256_171,T_128_87,T_256_177,T_64_45,T_256_183,T_128_93,T_256_189 +}; +static const __device__ float2 lut_sp_8_256[32*2] = { + 
T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_2_0,T_256_5,T_128_5,T_256_15,T_64_5,T_256_25,T_128_15,T_256_35,T_32_5,T_256_45,T_128_25,T_256_55,T_64_15,T_256_65,T_128_35,T_256_75,T_16_5,T_256_85,T_128_45,T_256_95,T_64_25,T_256_105,T_128_55,T_256_115,T_32_15,T_256_125,T_128_65,T_256_135,T_64_35,T_256_145,T_128_75,T_256_155 +}; +static const __device__ float2 lut_sp_16_256[16*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_2_0,T_256_9,T_128_9,T_256_27,T_64_9,T_256_45,T_128_27,T_256_63,T_32_9,T_256_81,T_128_45,T_256_99,T_64_27,T_256_117,T_128_63,T_256_135 +}; +static const __device__ float2 lut_sp_32_256[8*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_2_0,T_256_17,T_128_17,T_256_51,T_64_17,T_256_85,T_128_51,T_256_119 +}; +static const __device__ float2 lut_sp_7_343[49*2] = { + T_2_0,T_343_1,T_343_2,T_343_3,T_343_4,T_343_5,T_343_6,T_49_1,T_343_8,T_343_9,T_343_10,T_343_11,T_343_12,T_343_13,T_49_2,T_343_15,T_343_16,T_343_17,T_343_18,T_343_19,T_343_20,T_49_3,T_343_22,T_343_23,T_343_24,T_343_25,T_343_26,T_343_27,T_49_4,T_343_29,T_343_30,T_343_31,T_343_32,T_343_33,T_343_34,T_49_5,T_343_36,T_343_37,T_343_38,T_343_39,T_343_40,T_343_41,T_49_6,T_343_43,T_343_44,T_343_45,T_343_46,T_343_47,T_343_48,T_2_0,T_343_4,T_343_8,T_343_12,T_343_16,T_343_20,T_343_24,T_49_4,T_343_32,T_343_36,T_343_40,T_343_44,T_343_48,T_343_52,T_49_8,T_343_60,T_343_64,T_343_68,T_343_72,T_343_76,T_343_80,T_49_12,T_343_88,T_343_92,T_343_96,T_343_100,T_343_104,T_343_108,T_49_16,T_343_116,T_343_120,T_343_124,T_343_128,T_343_132,T_343_136,T_49_20,T_343_144,T_343_148,T_343_152,T_343_156,T_343_160,T_343_164,T_49_24,T_343_172,T_343_176,T_343_180,T_343_184,T_343_188,T_343_192 +}; +static 
const __device__ float2 lut_sp_2_512[256*2] = { + T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_105,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_4_1,T_512_129,T_256_65,T_512_131,T_128_33,T_512_133,T_256_67,T_512_135,T_64_17,T_512_137,T_256_69,T_512_139,T_128_35,T_512_141,T_256_71,T_512_143,T_32_9,T_512_145,T_256_73,T_512_147,T_128_37,T_512_149,T_256_75,T_512_151,T_64_19,T_512_153,T_256_77,T_512_155,T_128_39,T_512_157,T_256_79,T_512_159,T_16_5,T_512_161,T_256_81,T_512_163,T_128_41,T_512_165,T_256_83,T_512_167,T_64_21,T_512_169,T_256_85,T_512_171,T_128_43,T_512_173,T_256_87,T_512_175,T_32_11,T_512_177,T_256_89,T_512_179,T_128_45,T_512_181,T_256_91,T_512_183,T_64_23,T_512_185,T_256_93,T_512_187,T_128_47,T_512_189,T_256_95,T_512_191,T_8_3,T_512_193,T_256_97,T_512_195,T_128_49,T_512_197,T_256_99,T_512_199,T_64_25,T_512_201,T_256_101,T_512_203,T_128_51,T_512_205,T_256_103,T_512_207,T_32_13,T_512_209,T_256_105,T_512_211,T_128_53,T_512_213,T_256_107,T_512_215,T_64_27
,T_512_217,T_256_109,T_512_219,T_128_55,T_512_221,T_256_111,T_512_223,T_16_7,T_512_225,T_256_113,T_512_227,T_128_57,T_512_229,T_256_115,T_512_231,T_64_29,T_512_233,T_256_117,T_512_235,T_128_59,T_512_237,T_256_119,T_512_239,T_32_15,T_512_241,T_256_121,T_512_243,T_128_61,T_512_245,T_256_123,T_512_247,T_64_31,T_512_249,T_256_125,T_512_251,T_128_63,T_512_253,T_256_127,T_512_255,T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_4_1,T_256_65,T_128_33,T_256_67,T_64_17,T_256_69,T_128_35,T_256_71,T_32_9,T_256_73,T_128_37,T_256_75,T_64_19,T_256_77,T_128_39,T_256_79,T_16_5,T_256_81,T_128_41,T_256_83,T_64_21,T_256_85,T_128_43,T_256_87,T_32_11,T_256_89,T_128_45,T_256_91,T_64_23,T_256_93,T_128_47,T_256_95,T_8_3,T_256_97,T_128_49,T_256_99,T_64_25,T_256_101,T_128_51,T_256_103,T_32_13,T_256_105,T_128_53,T_256_107,T_64_27,T_256_109,T_128_55,T_256_111,T_16_7,T_256_113,T_128_57,T_256_115,T_64_29,T_256_117,T_128_59,T_256_119,T_32_15,T_256_121,T_128_61,T_256_123,T_64_31,T_256_125,T_128_63,T_256_127,T_2_1,T_256_129,T_128_65,T_256_131,T_64_33,T_256_133,T_128_67,T_256_135,T_32_17,T_256_137,T_128_69,T_256_139,T_64_35,T_256_141,T_128_71,T_256_143,T_16_9,T_256_145,T_128_73,T_256_147,T_64_37,T_256_149,T_128_75,T_256_151,T_32_19,T_256_153,T_128_77,T_256_155,T_64_39,T_256_157,T_128_79,T_256_159,T_8_5,T_256_161,T_128_81,T_256_163,T_64_41,T_256_165,T_128_83,T_256_167,T_32_21,T_256_169,T_128_85,T_256_171,T_64_43,T_256_173,T_128_87,T_256_175,T_16_11,T_256_177,T_128_89,T_256_179,T_64_45,T_256_181,T_128_91,T_256_183,T_32_23,
T_256_185,T_128_93,T_256_187,T_64_47,T_256_189,T_128_95,T_256_191,T_4_3,T_256_193,T_128_97,T_256_195,T_64_49,T_256_197,T_128_99,T_256_199,T_32_25,T_256_201,T_128_101,T_256_203,T_64_51,T_256_205,T_128_103,T_256_207,T_16_13,T_256_209,T_128_105,T_256_211,T_64_53,T_256_213,T_128_107,T_256_215,T_32_27,T_256_217,T_128_109,T_256_219,T_64_55,T_256_221,T_128_111,T_256_223,T_8_7,T_256_225,T_128_113,T_256_227,T_64_57,T_256_229,T_128_115,T_256_231,T_32_29,T_256_233,T_128_117,T_256_235,T_64_59,T_256_237,T_128_119,T_256_239,T_16_15,T_256_241,T_128_121,T_256_243,T_64_61,T_256_245,T_128_123,T_256_247,T_32_31,T_256_249,T_128_125,T_256_251,T_64_63,T_256_253,T_128_127,T_256_255 +}; +static const __device__ float2 lut_sp_4_512[128*2] = { + T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_105,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_2_0,T_512_3,T_256_3,T_512_9,T_128_3,T_512_15,T_256_9,T_512_21,T_64_3,T_512_27,T_256_15,T_512_33,T_128_9,T_512_39,T_256_21,T_512_45,T_32_3,T_512_51,T
_256_27,T_512_57,T_128_15,T_512_63,T_256_33,T_512_69,T_64_9,T_512_75,T_256_39,T_512_81,T_128_21,T_512_87,T_256_45,T_512_93,T_16_3,T_512_99,T_256_51,T_512_105,T_128_27,T_512_111,T_256_57,T_512_117,T_64_15,T_512_123,T_256_63,T_512_129,T_128_33,T_512_135,T_256_69,T_512_141,T_32_9,T_512_147,T_256_75,T_512_153,T_128_39,T_512_159,T_256_81,T_512_165,T_64_21,T_512_171,T_256_87,T_512_177,T_128_45,T_512_183,T_256_93,T_512_189,T_8_3,T_512_195,T_256_99,T_512_201,T_128_51,T_512_207,T_256_105,T_512_213,T_64_27,T_512_219,T_256_111,T_512_225,T_128_57,T_512_231,T_256_117,T_512_237,T_32_15,T_512_243,T_256_123,T_512_249,T_128_63,T_512_255,T_256_129,T_512_261,T_64_33,T_512_267,T_256_135,T_512_273,T_128_69,T_512_279,T_256_141,T_512_285,T_16_9,T_512_291,T_256_147,T_512_297,T_128_75,T_512_303,T_256_153,T_512_309,T_64_39,T_512_315,T_256_159,T_512_321,T_128_81,T_512_327,T_256_165,T_512_333,T_32_21,T_512_339,T_256_171,T_512_345,T_128_87,T_512_351,T_256_177,T_512_357,T_64_45,T_512_363,T_256_183,T_512_369,T_128_93,T_512_375,T_256_189,T_512_381 +}; +static const __device__ float2 lut_sp_8_512[64*2] = { + 
T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_2_0,T_512_5,T_256_5,T_512_15,T_128_5,T_512_25,T_256_15,T_512_35,T_64_5,T_512_45,T_256_25,T_512_55,T_128_15,T_512_65,T_256_35,T_512_75,T_32_5,T_512_85,T_256_45,T_512_95,T_128_25,T_512_105,T_256_55,T_512_115,T_64_15,T_512_125,T_256_65,T_512_135,T_128_35,T_512_145,T_256_75,T_512_155,T_16_5,T_512_165,T_256_85,T_512_175,T_128_45,T_512_185,T_256_95,T_512_195,T_64_25,T_512_205,T_256_105,T_512_215,T_128_55,T_512_225,T_256_115,T_512_235,T_32_15,T_512_245,T_256_125,T_512_255,T_128_65,T_512_265,T_256_135,T_512_275,T_64_35,T_512_285,T_256_145,T_512_295,T_128_75,T_512_305,T_256_155,T_512_315 +}; +static const __device__ float2 lut_sp_16_512[32*2] = { + T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_2_0,T_512_9,T_256_9,T_512_27,T_128_9,T_512_45,T_256_27,T_512_63,T_64_9,T_512_81,T_256_45,T_512_99,T_128_27,T_512_117,T_256_63,T_512_135,T_32_9,T_512_153,T_256_81,T_512_171,T_128_45,T_512_189,T_256_99,T_512_207,T_64_27,T_512_225,T_256_117,T_512_243,T_128_63,T_512_261,T_256_135,T_512_279 +}; +static const __device__ float2 lut_sp_32_512[16*2] = { + 
T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_2_0,T_512_17,T_256_17,T_512_51,T_128_17,T_512_85,T_256_51,T_512_119,T_64_17,T_512_153,T_256_85,T_512_187,T_128_51,T_512_221,T_256_119,T_512_255 +}; +static const __device__ float2 lut_sp_5_625[125*2] = { + T_2_0,T_625_1,T_625_2,T_625_3,T_625_4,T_125_1,T_625_6,T_625_7,T_625_8,T_625_9,T_125_2,T_625_11,T_625_12,T_625_13,T_625_14,T_125_3,T_625_16,T_625_17,T_625_18,T_625_19,T_125_4,T_625_21,T_625_22,T_625_23,T_625_24,T_25_1,T_625_26,T_625_27,T_625_28,T_625_29,T_125_6,T_625_31,T_625_32,T_625_33,T_625_34,T_125_7,T_625_36,T_625_37,T_625_38,T_625_39,T_125_8,T_625_41,T_625_42,T_625_43,T_625_44,T_125_9,T_625_46,T_625_47,T_625_48,T_625_49,T_25_2,T_625_51,T_625_52,T_625_53,T_625_54,T_125_11,T_625_56,T_625_57,T_625_58,T_625_59,T_125_12,T_625_61,T_625_62,T_625_63,T_625_64,T_125_13,T_625_66,T_625_67,T_625_68,T_625_69,T_125_14,T_625_71,T_625_72,T_625_73,T_625_74,T_25_3,T_625_76,T_625_77,T_625_78,T_625_79,T_125_16,T_625_81,T_625_82,T_625_83,T_625_84,T_125_17,T_625_86,T_625_87,T_625_88,T_625_89,T_125_18,T_625_91,T_625_92,T_625_93,T_625_94,T_125_19,T_625_96,T_625_97,T_625_98,T_625_99,T_25_4,T_625_101,T_625_102,T_625_103,T_625_104,T_125_21,T_625_106,T_625_107,T_625_108,T_625_109,T_125_22,T_625_111,T_625_112,T_625_113,T_625_114,T_125_23,T_625_116,T_625_117,T_625_118,T_625_119,T_125_24,T_625_121,T_625_122,T_625_123,T_625_124,T_2_0,T_625_3,T_625_6,T_625_9,T_625_12,T_125_3,T_625_18,T_625_21,T_625_24,T_625_27,T_125_6,T_625_33,T_625_36,T_625_39,T_625_42,T_125_9,T_625_48,T_625_51,T_625_54,T_625_57,T_125_12,T_625_63,T_625_66,T_625_69,T_625_72,T_25_3,T_625_78,T_625_81,T_625_84,T_625_87,T_125_18,T_625_93,T_625_96,T_625_99,T_625_102,T_125_21,T_625_108,T_625_111,T_625_114,T_625_117,T_125_24,T_625_123,T_625_126,T_625_129,T_625_132,T_125_27,T_625_138,T_625_141,T_625_144,T_625_147,T_25_6,T_625_153,T_625_156,T_625_159,T_625_162,T_125_33,T_625_168,T_625_171,T_625_174,T_625_17
7,T_125_36,T_625_183,T_625_186,T_625_189,T_625_192,T_125_39,T_625_198,T_625_201,T_625_204,T_625_207,T_125_42,T_625_213,T_625_216,T_625_219,T_625_222,T_25_9,T_625_228,T_625_231,T_625_234,T_625_237,T_125_48,T_625_243,T_625_246,T_625_249,T_625_252,T_125_51,T_625_258,T_625_261,T_625_264,T_625_267,T_125_54,T_625_273,T_625_276,T_625_279,T_625_282,T_125_57,T_625_288,T_625_291,T_625_294,T_625_297,T_25_12,T_625_303,T_625_306,T_625_309,T_625_312,T_125_63,T_625_318,T_625_321,T_625_324,T_625_327,T_125_66,T_625_333,T_625_336,T_625_339,T_625_342,T_125_69,T_625_348,T_625_351,T_625_354,T_625_357,T_125_72,T_625_363,T_625_366,T_625_369,T_625_372 +}; +static const __device__ float2 lut_sp_25_625[25*2] = { + T_2_0,T_625_1,T_625_2,T_625_3,T_625_4,T_125_1,T_625_6,T_625_7,T_625_8,T_625_9,T_125_2,T_625_11,T_625_12,T_625_13,T_625_14,T_125_3,T_625_16,T_625_17,T_625_18,T_625_19,T_125_4,T_625_21,T_625_22,T_625_23,T_625_24,T_2_0,T_625_13,T_625_26,T_625_39,T_625_52,T_125_13,T_625_78,T_625_91,T_625_104,T_625_117,T_125_26,T_625_143,T_625_156,T_625_169,T_625_182,T_125_39,T_625_208,T_625_221,T_625_234,T_625_247,T_125_52,T_625_273,T_625_286,T_625_299,T_625_312 +}; +static const __device__ float2 lut_sp_3_729[243*2] = { + 
T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_27_1,T_729_28,T_729_29,T_243_10,T_729_31,T_729_32,T_243_11,T_729_34,T_729_35,T_81_4,T_729_37,T_729_38,T_243_13,T_729_40,T_729_41,T_243_14,T_729_43,T_729_44,T_81_5,T_729_46,T_729_47,T_243_16,T_729_49,T_729_50,T_243_17,T_729_52,T_729_53,T_27_2,T_729_55,T_729_56,T_243_19,T_729_58,T_729_59,T_243_20,T_729_61,T_729_62,T_81_7,T_729_64,T_729_65,T_243_22,T_729_67,T_729_68,T_243_23,T_729_70,T_729_71,T_81_8,T_729_73,T_729_74,T_243_25,T_729_76,T_729_77,T_243_26,T_729_79,T_729_80,T_9_1,T_729_82,T_729_83,T_243_28,T_729_85,T_729_86,T_243_29,T_729_88,T_729_89,T_81_10,T_729_91,T_729_92,T_243_31,T_729_94,T_729_95,T_243_32,T_729_97,T_729_98,T_81_11,T_729_100,T_729_101,T_243_34,T_729_103,T_729_104,T_243_35,T_729_106,T_729_107,T_27_4,T_729_109,T_729_110,T_243_37,T_729_112,T_729_113,T_243_38,T_729_115,T_729_116,T_81_13,T_729_118,T_729_119,T_243_40,T_729_121,T_729_122,T_243_41,T_729_124,T_729_125,T_81_14,T_729_127,T_729_128,T_243_43,T_729_130,T_729_131,T_243_44,T_729_133,T_729_134,T_27_5,T_729_136,T_729_137,T_243_46,T_729_139,T_729_140,T_243_47,T_729_142,T_729_143,T_81_16,T_729_145,T_729_146,T_243_49,T_729_148,T_729_149,T_243_50,T_729_151,T_729_152,T_81_17,T_729_154,T_729_155,T_243_52,T_729_157,T_729_158,T_243_53,T_729_160,T_729_161,T_9_2,T_729_163,T_729_164,T_243_55,T_729_166,T_729_167,T_243_56,T_729_169,T_729_170,T_81_19,T_729_172,T_729_173,T_243_58,T_729_175,T_729_176,T_243_59,T_729_178,T_729_179,T_81_20,T_729_181,T_729_182,T_243_61,T_729_184,T_729_185,T_243_62,T_729_187,T_729_188,T_27_7,T_729_190,T_729_191,T_243_64,T_729_193,T_729_194,T_243_65,T_729_196,T_729_197,T_81_22,T_729_199,T_729_200,T_243_67,T_729_202,T_729_203,T_243_68,T_729_205,T_729_206,T_81_23,T_729_208,T_729_209,T_243_70,T_729_211,T_729_212,T_243_71,T_729_214,T_729_215,T_27_8,T_729_217,T_729_218,T_24
3_73,T_729_220,T_729_221,T_243_74,T_729_223,T_729_224,T_81_25,T_729_226,T_729_227,T_243_76,T_729_229,T_729_230,T_243_77,T_729_232,T_729_233,T_81_26,T_729_235,T_729_236,T_243_79,T_729_238,T_729_239,T_243_80,T_729_241,T_729_242,T_2_0,T_729_2,T_729_4,T_243_2,T_729_8,T_729_10,T_243_4,T_729_14,T_729_16,T_81_2,T_729_20,T_729_22,T_243_8,T_729_26,T_729_28,T_243_10,T_729_32,T_729_34,T_81_4,T_729_38,T_729_40,T_243_14,T_729_44,T_729_46,T_243_16,T_729_50,T_729_52,T_27_2,T_729_56,T_729_58,T_243_20,T_729_62,T_729_64,T_243_22,T_729_68,T_729_70,T_81_8,T_729_74,T_729_76,T_243_26,T_729_80,T_729_82,T_243_28,T_729_86,T_729_88,T_81_10,T_729_92,T_729_94,T_243_32,T_729_98,T_729_100,T_243_34,T_729_104,T_729_106,T_27_4,T_729_110,T_729_112,T_243_38,T_729_116,T_729_118,T_243_40,T_729_122,T_729_124,T_81_14,T_729_128,T_729_130,T_243_44,T_729_134,T_729_136,T_243_46,T_729_140,T_729_142,T_81_16,T_729_146,T_729_148,T_243_50,T_729_152,T_729_154,T_243_52,T_729_158,T_729_160,T_9_2,T_729_164,T_729_166,T_243_56,T_729_170,T_729_172,T_243_58,T_729_176,T_729_178,T_81_20,T_729_182,T_729_184,T_243_62,T_729_188,T_729_190,T_243_64,T_729_194,T_729_196,T_81_22,T_729_200,T_729_202,T_243_68,T_729_206,T_729_208,T_243_70,T_729_212,T_729_214,T_27_8,T_729_218,T_729_220,T_243_74,T_729_224,T_729_226,T_243_76,T_729_230,T_729_232,T_81_26,T_729_236,T_729_238,T_243_80,T_729_242,T_729_244,T_243_82,T_729_248,T_729_250,T_81_28,T_729_254,T_729_256,T_243_86,T_729_260,T_729_262,T_243_88,T_729_266,T_729_268,T_27_10,T_729_272,T_729_274,T_243_92,T_729_278,T_729_280,T_243_94,T_729_284,T_729_286,T_81_32,T_729_290,T_729_292,T_243_98,T_729_296,T_729_298,T_243_100,T_729_302,T_729_304,T_81_34,T_729_308,T_729_310,T_243_104,T_729_314,T_729_316,T_243_106,T_729_320,T_729_322,T_9_4,T_729_326,T_729_328,T_243_110,T_729_332,T_729_334,T_243_112,T_729_338,T_729_340,T_81_38,T_729_344,T_729_346,T_243_116,T_729_350,T_729_352,T_243_118,T_729_356,T_729_358,T_81_40,T_729_362,T_729_364,T_243_122,T_729_368,T_729_370,T_243_124,T_729_374,T_729_376,T_27_14,T_
729_380,T_729_382,T_243_128,T_729_386,T_729_388,T_243_130,T_729_392,T_729_394,T_81_44,T_729_398,T_729_400,T_243_134,T_729_404,T_729_406,T_243_136,T_729_410,T_729_412,T_81_46,T_729_416,T_729_418,T_243_140,T_729_422,T_729_424,T_243_142,T_729_428,T_729_430,T_27_16,T_729_434,T_729_436,T_243_146,T_729_440,T_729_442,T_243_148,T_729_446,T_729_448,T_81_50,T_729_452,T_729_454,T_243_152,T_729_458,T_729_460,T_243_154,T_729_464,T_729_466,T_81_52,T_729_470,T_729_472,T_243_158,T_729_476,T_729_478,T_243_160,T_729_482,T_729_484 +}; +static const __device__ float2 lut_sp_9_729[81*2] = { + T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_27_1,T_729_28,T_729_29,T_243_10,T_729_31,T_729_32,T_243_11,T_729_34,T_729_35,T_81_4,T_729_37,T_729_38,T_243_13,T_729_40,T_729_41,T_243_14,T_729_43,T_729_44,T_81_5,T_729_46,T_729_47,T_243_16,T_729_49,T_729_50,T_243_17,T_729_52,T_729_53,T_27_2,T_729_55,T_729_56,T_243_19,T_729_58,T_729_59,T_243_20,T_729_61,T_729_62,T_81_7,T_729_64,T_729_65,T_243_22,T_729_67,T_729_68,T_243_23,T_729_70,T_729_71,T_81_8,T_729_73,T_729_74,T_243_25,T_729_76,T_729_77,T_243_26,T_729_79,T_729_80,T_2_0,T_729_5,T_729_10,T_243_5,T_729_20,T_729_25,T_243_10,T_729_35,T_729_40,T_81_5,T_729_50,T_729_55,T_243_20,T_729_65,T_729_70,T_243_25,T_729_80,T_729_85,T_81_10,T_729_95,T_729_100,T_243_35,T_729_110,T_729_115,T_243_40,T_729_125,T_729_130,T_27_5,T_729_140,T_729_145,T_243_50,T_729_155,T_729_160,T_243_55,T_729_170,T_729_175,T_81_20,T_729_185,T_729_190,T_243_65,T_729_200,T_729_205,T_243_70,T_729_215,T_729_220,T_81_25,T_729_230,T_729_235,T_243_80,T_729_245,T_729_250,T_243_85,T_729_260,T_729_265,T_27_10,T_729_275,T_729_280,T_243_95,T_729_290,T_729_295,T_243_100,T_729_305,T_729_310,T_81_35,T_729_320,T_729_325,T_243_110,T_729_335,T_729_340,T_243_115,T_729_350,T_729_355,T_81_40,T_729_365,T_729_370,T_243_125,T_729_380,T_7
29_385,T_243_130,T_729_395,T_729_400 +}; +static const __device__ float2 lut_sp_27_729[27*2] = { + T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_2_0,T_729_14,T_729_28,T_243_14,T_729_56,T_729_70,T_243_28,T_729_98,T_729_112,T_81_14,T_729_140,T_729_154,T_243_56,T_729_182,T_729_196,T_243_70,T_729_224,T_729_238,T_81_28,T_729_266,T_729_280,T_243_98,T_729_308,T_729_322,T_243_112,T_729_350,T_729_364 +}; +static const __device__ float2 lut_sp_10_1000[100*2] = { + T_2_0,T_1000_1,T_500_1,T_1000_3,T_250_1,T_200_1,T_500_3,T_1000_7,T_125_1,T_1000_9,T_100_1,T_1000_11,T_250_3,T_1000_13,T_500_7,T_200_3,T_125_2,T_1000_17,T_500_9,T_1000_19,T_50_1,T_1000_21,T_500_11,T_1000_23,T_125_3,T_40_1,T_500_13,T_1000_27,T_250_7,T_1000_29,T_100_3,T_1000_31,T_125_4,T_1000_33,T_500_17,T_200_7,T_250_9,T_1000_37,T_500_19,T_1000_39,T_25_1,T_1000_41,T_500_21,T_1000_43,T_250_11,T_200_9,T_500_23,T_1000_47,T_125_6,T_1000_49,T_20_1,T_1000_51,T_250_13,T_1000_53,T_500_27,T_200_11,T_125_7,T_1000_57,T_500_29,T_1000_59,T_50_3,T_1000_61,T_500_31,T_1000_63,T_125_8,T_200_13,T_500_33,T_1000_67,T_250_17,T_1000_69,T_100_7,T_1000_71,T_125_9,T_1000_73,T_500_37,T_40_3,T_250_19,T_1000_77,T_500_39,T_1000_79,T_25_2,T_1000_81,T_500_41,T_1000_83,T_250_21,T_200_17,T_500_43,T_1000_87,T_125_11,T_1000_89,T_100_9,T_1000_91,T_250_23,T_1000_93,T_500_47,T_200_19,T_125_12,T_1000_97,T_500_49,T_1000_99,T_2_0,T_500_3,T_250_3,T_500_9,T_125_3,T_100_3,T_250_9,T_500_21,T_125_6,T_500_27,T_50_3,T_500_33,T_125_9,T_500_39,T_250_21,T_100_9,T_125_12,T_500_51,T_250_27,T_500_57,T_25_3,T_500_63,T_250_33,T_500_69,T_125_18,T_20_3,T_250_39,T_500_81,T_125_21,T_500_87,T_50_9,T_500_93,T_125_24,T_500_99,T_250_51,T_100_21,T_125_27,T_500_111,T_250_57,T_500_117,T_25_6,T_500_123,T_250_63,T_500_129,T_125_33,T_100_27,T_250_69,T_500_141,T_125_36,T_500_147,T_10_3,T_500_153,T_125_39,T_500_
159,T_250_81,T_100_33,T_125_42,T_500_171,T_250_87,T_500_177,T_25_9,T_500_183,T_250_93,T_500_189,T_125_48,T_100_39,T_250_99,T_500_201,T_125_51,T_500_207,T_50_21,T_500_213,T_125_54,T_500_219,T_250_111,T_20_9,T_125_57,T_500_231,T_250_117,T_500_237,T_25_12,T_500_243,T_250_123,T_500_249,T_125_63,T_100_51,T_250_129,T_500_261,T_125_66,T_500_267,T_50_27,T_500_273,T_125_69,T_500_279,T_250_141,T_100_57,T_125_72,T_500_291,T_250_147,T_500_297 +}; +static const __device__ float2 lut_sp_2_1024[512*2] = { + T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_1024_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_102
4_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_4_1,T_1024_257,T_512_129,T_1024_259,T_256_65,T_1024_261,T_512_131,T_1024_263,T_128_33,T_1024_265,T_512_133,T_1024_267,T_256_67,T_1024_269,T_512_135,T_1024_271,T_64_17,T_1024_273,T_512_137,T_1024_275,T_256_69,T_1024_277,T_512_139,T_1024_279,T_128_35,T_1024_281,T_512_141,T_1024_283,T_256_71,T_1024_285,T_512_143,T_1024_287,T_32_9,T_1024_289,T_512_145,T_1024_291,T_256_73,T_1024_293,T_512_147,T_1024_295,T_128_37,T_1024_297,T_512_149,T_1024_299,T_256_75,T_1024_301,T_512_151,T_1024_303,T_64_19,T_1024_305,T_512_153,T_1024_307,T_256_77,T_1024_309,T_512_155,T_1024_311,T_128_39,T_1024_313,T_512_157,T_1024_315,T_256_79,T_1024_317,T_512_159,T_1024_319,T_16_5,T_1024_321,T_512_161,T_1024_323,T_256_81,T_1024_325,T_512_163,T_1024_327,T_128_41,T_1024_329,T_512_165,T_1024_331,T_256_83,T_1024_333,T_512_167,T_1024_335,T_64_21,T_1024_337,T_512_169,T_1024_339,T_256_85,T_1024_341,T_512_171,T_1024_343,T_128_43,T_1024_345,T_512_173,T_1024_347,T_256_87,T_1024_349,T_512_175,T_1024_351,T_32_11,T_1024_353,T_512_177,T_1024_355,T_256_89,T_1024
_357,T_512_179,T_1024_359,T_128_45,T_1024_361,T_512_181,T_1024_363,T_256_91,T_1024_365,T_512_183,T_1024_367,T_64_23,T_1024_369,T_512_185,T_1024_371,T_256_93,T_1024_373,T_512_187,T_1024_375,T_128_47,T_1024_377,T_512_189,T_1024_379,T_256_95,T_1024_381,T_512_191,T_1024_383,T_8_3,T_1024_385,T_512_193,T_1024_387,T_256_97,T_1024_389,T_512_195,T_1024_391,T_128_49,T_1024_393,T_512_197,T_1024_395,T_256_99,T_1024_397,T_512_199,T_1024_399,T_64_25,T_1024_401,T_512_201,T_1024_403,T_256_101,T_1024_405,T_512_203,T_1024_407,T_128_51,T_1024_409,T_512_205,T_1024_411,T_256_103,T_1024_413,T_512_207,T_1024_415,T_32_13,T_1024_417,T_512_209,T_1024_419,T_256_105,T_1024_421,T_512_211,T_1024_423,T_128_53,T_1024_425,T_512_213,T_1024_427,T_256_107,T_1024_429,T_512_215,T_1024_431,T_64_27,T_1024_433,T_512_217,T_1024_435,T_256_109,T_1024_437,T_512_219,T_1024_439,T_128_55,T_1024_441,T_512_221,T_1024_443,T_256_111,T_1024_445,T_512_223,T_1024_447,T_16_7,T_1024_449,T_512_225,T_1024_451,T_256_113,T_1024_453,T_512_227,T_1024_455,T_128_57,T_1024_457,T_512_229,T_1024_459,T_256_115,T_1024_461,T_512_231,T_1024_463,T_64_29,T_1024_465,T_512_233,T_1024_467,T_256_117,T_1024_469,T_512_235,T_1024_471,T_128_59,T_1024_473,T_512_237,T_1024_475,T_256_119,T_1024_477,T_512_239,T_1024_479,T_32_15,T_1024_481,T_512_241,T_1024_483,T_256_121,T_1024_485,T_512_243,T_1024_487,T_128_61,T_1024_489,T_512_245,T_1024_491,T_256_123,T_1024_493,T_512_247,T_1024_495,T_64_31,T_1024_497,T_512_249,T_1024_499,T_256_125,T_1024_501,T_512_251,T_1024_503,T_128_63,T_1024_505,T_512_253,T_1024_507,T_256_127,T_1024_509,T_512_255,T_1024_511,T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_5
12_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_105,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_4_1,T_512_129,T_256_65,T_512_131,T_128_33,T_512_133,T_256_67,T_512_135,T_64_17,T_512_137,T_256_69,T_512_139,T_128_35,T_512_141,T_256_71,T_512_143,T_32_9,T_512_145,T_256_73,T_512_147,T_128_37,T_512_149,T_256_75,T_512_151,T_64_19,T_512_153,T_256_77,T_512_155,T_128_39,T_512_157,T_256_79,T_512_159,T_16_5,T_512_161,T_256_81,T_512_163,T_128_41,T_512_165,T_256_83,T_512_167,T_64_21,T_512_169,T_256_85,T_512_171,T_128_43,T_512_173,T_256_87,T_512_175,T_32_11,T_512_177,T_256_89,T_512_179,T_128_45,T_512_181,T_256_91,T_512_183,T_64_23,T_512_185,T_256_93,T_512_187,T_128_47,T_512_189,T_256_95,T_512_191,T_8_3,T_512_193,T_256_97,T_512_195,T_128_49,T_512_197,T_256_99,T_512_199,T_64_25,T_512_201,T_256_101,T_512_203,T_128_51,T_512_205,T_256_103,T_512_207,T_32_13,T_512_209,T_256_105,T_512_211,T_128_53,T_512_213,T_256_107,T_512_215,T_64_27,T_512_217,T_256_109,T_512_219,T_128_55,T_512_221,T_256_111,T_512_223,T_16_7,T_512_225,T_256_113,T_512_227,T_128_57,T_512_229,T_256_115,T_512_231,T_64_29,T_512_233,T_256_117,T_512_235,T_128_59,T_512_237,T_256_119,T_512_239,T_32_15,T_512_241,T_256_121,T_512_243,T_128_61,T_512_245,T_256_123,T_512_247,T_64_31,T_512_249,T_256_125,T_512_251,T_128_63,T_512_253,T_256_127,T_512_255,T_2_1,T_512_257,T_256_129,T_512_259,T_128_65,T_512_261,T_256_131,T_512_263,T_64_33,T_51
2_265,T_256_133,T_512_267,T_128_67,T_512_269,T_256_135,T_512_271,T_32_17,T_512_273,T_256_137,T_512_275,T_128_69,T_512_277,T_256_139,T_512_279,T_64_35,T_512_281,T_256_141,T_512_283,T_128_71,T_512_285,T_256_143,T_512_287,T_16_9,T_512_289,T_256_145,T_512_291,T_128_73,T_512_293,T_256_147,T_512_295,T_64_37,T_512_297,T_256_149,T_512_299,T_128_75,T_512_301,T_256_151,T_512_303,T_32_19,T_512_305,T_256_153,T_512_307,T_128_77,T_512_309,T_256_155,T_512_311,T_64_39,T_512_313,T_256_157,T_512_315,T_128_79,T_512_317,T_256_159,T_512_319,T_8_5,T_512_321,T_256_161,T_512_323,T_128_81,T_512_325,T_256_163,T_512_327,T_64_41,T_512_329,T_256_165,T_512_331,T_128_83,T_512_333,T_256_167,T_512_335,T_32_21,T_512_337,T_256_169,T_512_339,T_128_85,T_512_341,T_256_171,T_512_343,T_64_43,T_512_345,T_256_173,T_512_347,T_128_87,T_512_349,T_256_175,T_512_351,T_16_11,T_512_353,T_256_177,T_512_355,T_128_89,T_512_357,T_256_179,T_512_359,T_64_45,T_512_361,T_256_181,T_512_363,T_128_91,T_512_365,T_256_183,T_512_367,T_32_23,T_512_369,T_256_185,T_512_371,T_128_93,T_512_373,T_256_187,T_512_375,T_64_47,T_512_377,T_256_189,T_512_379,T_128_95,T_512_381,T_256_191,T_512_383,T_4_3,T_512_385,T_256_193,T_512_387,T_128_97,T_512_389,T_256_195,T_512_391,T_64_49,T_512_393,T_256_197,T_512_395,T_128_99,T_512_397,T_256_199,T_512_399,T_32_25,T_512_401,T_256_201,T_512_403,T_128_101,T_512_405,T_256_203,T_512_407,T_64_51,T_512_409,T_256_205,T_512_411,T_128_103,T_512_413,T_256_207,T_512_415,T_16_13,T_512_417,T_256_209,T_512_419,T_128_105,T_512_421,T_256_211,T_512_423,T_64_53,T_512_425,T_256_213,T_512_427,T_128_107,T_512_429,T_256_215,T_512_431,T_32_27,T_512_433,T_256_217,T_512_435,T_128_109,T_512_437,T_256_219,T_512_439,T_64_55,T_512_441,T_256_221,T_512_443,T_128_111,T_512_445,T_256_223,T_512_447,T_8_7,T_512_449,T_256_225,T_512_451,T_128_113,T_512_453,T_256_227,T_512_455,T_64_57,T_512_457,T_256_229,T_512_459,T_128_115,T_512_461,T_256_231,T_512_463,T_32_29,T_512_465,T_256_233,T_512_467,T_128_117,T_512_469,T_256_235,T_512_471,T_64_59,
T_512_473,T_256_237,T_512_475,T_128_119,T_512_477,T_256_239,T_512_479,T_16_15,T_512_481,T_256_241,T_512_483,T_128_121,T_512_485,T_256_243,T_512_487,T_64_61,T_512_489,T_256_245,T_512_491,T_128_123,T_512_493,T_256_247,T_512_495,T_32_31,T_512_497,T_256_249,T_512_499,T_128_125,T_512_501,T_256_251,T_512_503,T_64_63,T_512_505,T_256_253,T_512_507,T_128_127,T_512_509,T_256_255,T_512_511 +}; +static const __device__ float2 lut_sp_4_1024[256*2] = { + T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_1024_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_1024_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,
T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_2_0,T_1024_3,T_512_3,T_1024_9,T_256_3,T_1024_15,T_512_9,T_1024_21,T_128_3,T_1024_27,T_512_15,T_1024_33,T_256_9,T_1024_39,T_512_21,T_1024_45,T_64_3,T_1024_51,T_512_27,T_1024_57,T_256_15,T_1024_63,T_512_33,T_1024_69,T_128_9,T_1024_75,T_512_39,T_1024_81,T_256_21,T_1024_87,T_512_45,T_1024_93,T_32_3,T_1024_99,T_512_51,T_1024_105,T_256_27,T_1024_111,T_512_57,T_1024_117,T_128_15,T_1024_123,T_512_63,T_1024_129,T_256_33,T_1024_135,T_512_69,T_1024_141,T_64_9,T_1024_147,T_512_75,T_1024_153,T_256_39,T_1024_159,T_512_81,T_1024_165,T_128_21,T_1024_171,T_512_87,T_1024_177,T_256_45,T_1024_183,T_512_93,T_1024_189,T_16_3,T_1024_195,T_512_99,T_1024_201,T_256_51,T_1024_207,T_512_105,T_1024_213,T_128_27,T_1024_219,T_512_111,T_1024_225,T_256_57,T_1024_231,T_512_117,T_1024_237,T_64_15,T_1024_243,T_512_123,T_1024_249,T_256_63,T_1024_255,T_512_129,T_1024_261,T_128_33,T_1024_267,T_512_135,T_1024_273,T_256_69,T_1024_279,T_512_141,T_1024_285,T_32_9,T_1024_291,T_512_147,T_1024_297,T_256_75,T_1024_303,T_512_153,T_1024_309,T_128_39,T_1024_315,T_512_159,T_1024_321,T_256_81,T_1024_327,T_512_165,T
_1024_333,T_64_21,T_1024_339,T_512_171,T_1024_345,T_256_87,T_1024_351,T_512_177,T_1024_357,T_128_45,T_1024_363,T_512_183,T_1024_369,T_256_93,T_1024_375,T_512_189,T_1024_381,T_8_3,T_1024_387,T_512_195,T_1024_393,T_256_99,T_1024_399,T_512_201,T_1024_405,T_128_51,T_1024_411,T_512_207,T_1024_417,T_256_105,T_1024_423,T_512_213,T_1024_429,T_64_27,T_1024_435,T_512_219,T_1024_441,T_256_111,T_1024_447,T_512_225,T_1024_453,T_128_57,T_1024_459,T_512_231,T_1024_465,T_256_117,T_1024_471,T_512_237,T_1024_477,T_32_15,T_1024_483,T_512_243,T_1024_489,T_256_123,T_1024_495,T_512_249,T_1024_501,T_128_63,T_1024_507,T_512_255,T_1024_513,T_256_129,T_1024_519,T_512_261,T_1024_525,T_64_33,T_1024_531,T_512_267,T_1024_537,T_256_135,T_1024_543,T_512_273,T_1024_549,T_128_69,T_1024_555,T_512_279,T_1024_561,T_256_141,T_1024_567,T_512_285,T_1024_573,T_16_9,T_1024_579,T_512_291,T_1024_585,T_256_147,T_1024_591,T_512_297,T_1024_597,T_128_75,T_1024_603,T_512_303,T_1024_609,T_256_153,T_1024_615,T_512_309,T_1024_621,T_64_39,T_1024_627,T_512_315,T_1024_633,T_256_159,T_1024_639,T_512_321,T_1024_645,T_128_81,T_1024_651,T_512_327,T_1024_657,T_256_165,T_1024_663,T_512_333,T_1024_669,T_32_21,T_1024_675,T_512_339,T_1024_681,T_256_171,T_1024_687,T_512_345,T_1024_693,T_128_87,T_1024_699,T_512_351,T_1024_705,T_256_177,T_1024_711,T_512_357,T_1024_717,T_64_45,T_1024_723,T_512_363,T_1024_729,T_256_183,T_1024_735,T_512_369,T_1024_741,T_128_93,T_1024_747,T_512_375,T_1024_753,T_256_189,T_1024_759,T_512_381,T_1024_765 +}; +static const __device__ float2 lut_sp_8_1024[128*2] = { + 
T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_2_0,T_1024_5,T_512_5,T_1024_15,T_256_5,T_1024_25,T_512_15,T_1024_35,T_128_5,T_1024_45,T_512_25,T_1024_55,T_256_15,T_1024_65,T_512_35,T_1024_75,T_64_5,T_1024_85,T_512_45,T_1024_95,T_256_25,T_1024_105,T_512_55,T_1024_115,T_128_15,T_1024_125,T_512_65,T_1024_135,T_256_35,T_1024_145,T_512_75,T_1024_155,T_32_5,T_1024_165,T_512_85,T_1024_175,T_256_45,T_1024_185,T_512_95,T_1024_195,T_128_25,T_1024_205,T_512_105,T_1024_215,T_256_55,T_1024_225,T_512_115,T_1024_235,T_64_15,T_1024_245,T_512_125,T_1024_255,T_256_65,T_1024_265,T_512_135,T_1024_275,T_128_35,T_1024_285,T_512_145,T_1024_295,T_256_75,T_1024_305,T_512_155,T_1024_315,T_16_5,T_1024_325,T_512_165,T_1024_335,T_256_85,T_1024_345,T_512_175,T_1024_355,T_128_45,T_1024_365,T_512_185,T_1024_375,T_256_95,T_1024_385,T_512_195,T_1024_395,T_64_25,T_1024_405,T_
512_205,T_1024_415,T_256_105,T_1024_425,T_512_215,T_1024_435,T_128_55,T_1024_445,T_512_225,T_1024_455,T_256_115,T_1024_465,T_512_235,T_1024_475,T_32_15,T_1024_485,T_512_245,T_1024_495,T_256_125,T_1024_505,T_512_255,T_1024_515,T_128_65,T_1024_525,T_512_265,T_1024_535,T_256_135,T_1024_545,T_512_275,T_1024_555,T_64_35,T_1024_565,T_512_285,T_1024_575,T_256_145,T_1024_585,T_512_295,T_1024_595,T_128_75,T_1024_605,T_512_305,T_1024_615,T_256_155,T_1024_625,T_512_315,T_1024_635 +}; +static const __device__ float2 lut_sp_16_1024[64*2] = { + T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_2_0,T_1024_9,T_512_9,T_1024_27,T_256_9,T_1024_45,T_512_27,T_1024_63,T_128_9,T_1024_81,T_512_45,T_1024_99,T_256_27,T_1024_117,T_512_63,T_1024_135,T_64_9,T_1024_153,T_512_81,T_1024_171,T_256_45,T_1024_189,T_512_99,T_1024_207,T_128_27,T_1024_225,T_512_117,T_1024_243,T_256_63,T_1024_261,T_512_135,T_1024_279,T_32_9,T_1024_297,T_512_153,T_1024_315,T_256_81,T_1024_333,T_512_171,T_1024_351,T_128_45,T_1024_369,T_512_189,T_1024_387,T_256_99,T_1024_405,T_512_207,T_1024_423,T_64_27,T_1024_441,T_512_225,T_1024_459,T_256_117,T_1024_477,T_512_243,T_1024_495,T_128_63,T_1024_513,T_512_261,T_1024_531,T_256_135,T_1024_549,T_512_279,T_1024_567 +}; +static const __device__ float2 lut_sp_32_1024[32*2] = { + 
T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_2_0,T_1024_17,T_512_17,T_1024_51,T_256_17,T_1024_85,T_512_51,T_1024_119,T_128_17,T_1024_153,T_512_85,T_1024_187,T_256_51,T_1024_221,T_512_119,T_1024_255,T_64_17,T_1024_289,T_512_153,T_1024_323,T_256_85,T_1024_357,T_512_187,T_1024_391,T_128_51,T_1024_425,T_512_221,T_1024_459,T_256_119,T_1024_493,T_512_255,T_1024_527 +}; +static const __device__ float2 lut_sp_6_1296[216*2] = { + T_2_0,T_1296_1,T_648_1,T_432_1,T_324_1,T_1296_5,T_216_1,T_1296_7,T_162_1,T_144_1,T_648_5,T_1296_11,T_108_1,T_1296_13,T_648_7,T_432_5,T_81_1,T_1296_17,T_72_1,T_1296_19,T_324_5,T_432_7,T_648_11,T_1296_23,T_54_1,T_1296_25,T_648_13,T_48_1,T_324_7,T_1296_29,T_216_5,T_1296_31,T_81_2,T_432_11,T_648_17,T_1296_35,T_36_1,T_1296_37,T_648_19,T_432_13,T_162_5,T_1296_41,T_216_7,T_1296_43,T_324_11,T_144_5,T_648_23,T_1296_47,T_27_1,T_1296_49,T_648_25,T_432_17,T_324_13,T_1296_53,T_24_1,T_1296_55,T_162_7,T_432_19,T_648_29,T_1296_59,T_108_5,T_1296_61,T_648_31,T_144_7,T_81_4,T_1296_65,T_216_11,T_1296_67,T_324_17,T_432_23,T_648_35,T_1296_71,T_18_1,T_1296_73,T_648_37,T_432_25,T_324_19,T_1296_77,T_216_13,T_1296_79,T_81_5,T_16_1,T_648_41,T_1296_83,T_108_7,T_1296_85,T_648_43,T_432_29,T_162_11,T_1296_89,T_72_5,T_1296_91,T_324_23,T_432_31,T_648_47,T_1296_95,T_27_2,T_1296_97,T_648_49,T_144_11,T_324_25,T_1296_101,T_216_17,T_1296_103,T_162_13,T_432_35,T_648_53,T_1296_107,T_12_1,T_1296_109,T_648_55,T_432_37,T_81_7,T_1296_113,T_216_19,T_1296_115,T_324_29,T_144_13,T_648_59,T_1296_119,T_54_5,T_1296_121,T_648_61,T_432_41,T_324_31,T_1296_125,T_72_7,T_1296_127,T_81_8,T_432_43,T_648_65,T_1296_131,T_108_11,T_1296_133,T_648_67,T_48_5,T_162_17,T_1296_137,T_216_23,T_1296_139,T_324_35,T_432_47,T_648_71,T_1296_143,T_9_1,T_1296_145,T_648_73,T_432_49,T_324_37
,T_1296_149,T_216_25,T_1296_151,T_162_19,T_144_17,T_648_77,T_1296_155,T_108_13,T_1296_157,T_648_79,T_432_53,T_81_10,T_1296_161,T_8_1,T_1296_163,T_324_41,T_432_55,T_648_83,T_1296_167,T_54_7,T_1296_169,T_648_85,T_144_19,T_324_43,T_1296_173,T_216_29,T_1296_175,T_81_11,T_432_59,T_648_89,T_1296_179,T_36_5,T_1296_181,T_648_91,T_432_61,T_162_23,T_1296_185,T_216_31,T_1296_187,T_324_47,T_48_7,T_648_95,T_1296_191,T_27_4,T_1296_193,T_648_97,T_432_65,T_324_49,T_1296_197,T_72_11,T_1296_199,T_162_25,T_432_67,T_648_101,T_1296_203,T_108_17,T_1296_205,T_648_103,T_144_23,T_81_13,T_1296_209,T_216_35,T_1296_211,T_324_53,T_432_71,T_648_107,T_1296_215,T_2_0,T_324_1,T_162_1,T_108_1,T_81_1,T_324_5,T_54_1,T_324_7,T_81_2,T_36_1,T_162_5,T_324_11,T_27_1,T_324_13,T_162_7,T_108_5,T_81_4,T_324_17,T_18_1,T_324_19,T_81_5,T_108_7,T_162_11,T_324_23,T_27_2,T_324_25,T_162_13,T_12_1,T_81_7,T_324_29,T_54_5,T_324_31,T_81_8,T_108_11,T_162_17,T_324_35,T_9_1,T_324_37,T_162_19,T_108_13,T_81_10,T_324_41,T_54_7,T_324_43,T_81_11,T_36_5,T_162_23,T_324_47,T_27_4,T_324_49,T_162_25,T_108_17,T_81_13,T_324_53,T_6_1,T_324_55,T_81_14,T_108_19,T_162_29,T_324_59,T_27_5,T_324_61,T_162_31,T_36_7,T_81_16,T_324_65,T_54_11,T_324_67,T_81_17,T_108_23,T_162_35,T_324_71,T_9_2,T_324_73,T_162_37,T_108_25,T_81_19,T_324_77,T_54_13,T_324_79,T_81_20,T_4_1,T_162_41,T_324_83,T_27_7,T_324_85,T_162_43,T_108_29,T_81_22,T_324_89,T_18_5,T_324_91,T_81_23,T_108_31,T_162_47,T_324_95,T_27_8,T_324_97,T_162_49,T_36_11,T_81_25,T_324_101,T_54_17,T_324_103,T_81_26,T_108_35,T_162_53,T_324_107,T_3_1,T_324_109,T_162_55,T_108_37,T_81_28,T_324_113,T_54_19,T_324_115,T_81_29,T_36_13,T_162_59,T_324_119,T_27_10,T_324_121,T_162_61,T_108_41,T_81_31,T_324_125,T_18_7,T_324_127,T_81_32,T_108_43,T_162_65,T_324_131,T_27_11,T_324_133,T_162_67,T_12_5,T_81_34,T_324_137,T_54_23,T_324_139,T_81_35,T_108_47,T_162_71,T_324_143,T_9_4,T_324_145,T_162_73,T_108_49,T_81_37,T_324_149,T_54_25,T_324_151,T_81_38,T_36_17,T_162_77,T_324_155,T_27_13,T_324_157,T_162_79,T_108_53,T_81_40,T_
324_161,T_2_1,T_324_163,T_81_41,T_108_55,T_162_83,T_324_167,T_27_14,T_324_169,T_162_85,T_36_19,T_81_43,T_324_173,T_54_29,T_324_175,T_81_44,T_108_59,T_162_89,T_324_179,T_9_5,T_324_181,T_162_91,T_108_61,T_81_46,T_324_185,T_54_31,T_324_187,T_81_47,T_12_7,T_162_95,T_324_191,T_27_16,T_324_193,T_162_97,T_108_65,T_81_49,T_324_197,T_18_11,T_324_199,T_81_50,T_108_67,T_162_101,T_324_203,T_27_17,T_324_205,T_162_103,T_36_23,T_81_52,T_324_209,T_54_35,T_324_211,T_81_53,T_108_71,T_162_107,T_324_215 +}; +static const __device__ float2 lut_sp_11_1331[121*2] = { + T_2_0,T_1331_1,T_1331_2,T_1331_3,T_1331_4,T_1331_5,T_1331_6,T_1331_7,T_1331_8,T_1331_9,T_1331_10,T_121_1,T_1331_12,T_1331_13,T_1331_14,T_1331_15,T_1331_16,T_1331_17,T_1331_18,T_1331_19,T_1331_20,T_1331_21,T_121_2,T_1331_23,T_1331_24,T_1331_25,T_1331_26,T_1331_27,T_1331_28,T_1331_29,T_1331_30,T_1331_31,T_1331_32,T_121_3,T_1331_34,T_1331_35,T_1331_36,T_1331_37,T_1331_38,T_1331_39,T_1331_40,T_1331_41,T_1331_42,T_1331_43,T_121_4,T_1331_45,T_1331_46,T_1331_47,T_1331_48,T_1331_49,T_1331_50,T_1331_51,T_1331_52,T_1331_53,T_1331_54,T_121_5,T_1331_56,T_1331_57,T_1331_58,T_1331_59,T_1331_60,T_1331_61,T_1331_62,T_1331_63,T_1331_64,T_1331_65,T_121_6,T_1331_67,T_1331_68,T_1331_69,T_1331_70,T_1331_71,T_1331_72,T_1331_73,T_1331_74,T_1331_75,T_1331_76,T_121_7,T_1331_78,T_1331_79,T_1331_80,T_1331_81,T_1331_82,T_1331_83,T_1331_84,T_1331_85,T_1331_86,T_1331_87,T_121_8,T_1331_89,T_1331_90,T_1331_91,T_1331_92,T_1331_93,T_1331_94,T_1331_95,T_1331_96,T_1331_97,T_1331_98,T_121_9,T_1331_100,T_1331_101,T_1331_102,T_1331_103,T_1331_104,T_1331_105,T_1331_106,T_1331_107,T_1331_108,T_1331_109,T_121_10,T_1331_111,T_1331_112,T_1331_113,T_1331_114,T_1331_115,T_1331_116,T_1331_117,T_1331_118,T_1331_119,T_1331_120,T_2_0,T_1331_6,T_1331_12,T_1331_18,T_1331_24,T_1331_30,T_1331_36,T_1331_42,T_1331_48,T_1331_54,T_1331_60,T_121_6,T_1331_72,T_1331_78,T_1331_84,T_1331_90,T_1331_96,T_1331_102,T_1331_108,T_1331_114,T_1331_120,T_1331_126,T_121_12,T_1331_138,T_1331_144,
T_1331_150,T_1331_156,T_1331_162,T_1331_168,T_1331_174,T_1331_180,T_1331_186,T_1331_192,T_121_18,T_1331_204,T_1331_210,T_1331_216,T_1331_222,T_1331_228,T_1331_234,T_1331_240,T_1331_246,T_1331_252,T_1331_258,T_121_24,T_1331_270,T_1331_276,T_1331_282,T_1331_288,T_1331_294,T_1331_300,T_1331_306,T_1331_312,T_1331_318,T_1331_324,T_121_30,T_1331_336,T_1331_342,T_1331_348,T_1331_354,T_1331_360,T_1331_366,T_1331_372,T_1331_378,T_1331_384,T_1331_390,T_121_36,T_1331_402,T_1331_408,T_1331_414,T_1331_420,T_1331_426,T_1331_432,T_1331_438,T_1331_444,T_1331_450,T_1331_456,T_121_42,T_1331_468,T_1331_474,T_1331_480,T_1331_486,T_1331_492,T_1331_498,T_1331_504,T_1331_510,T_1331_516,T_1331_522,T_121_48,T_1331_534,T_1331_540,T_1331_546,T_1331_552,T_1331_558,T_1331_564,T_1331_570,T_1331_576,T_1331_582,T_1331_588,T_121_54,T_1331_600,T_1331_606,T_1331_612,T_1331_618,T_1331_624,T_1331_630,T_1331_636,T_1331_642,T_1331_648,T_1331_654,T_121_60,T_1331_666,T_1331_672,T_1331_678,T_1331_684,T_1331_690,T_1331_696,T_1331_702,T_1331_708,T_1331_714,T_1331_720 +}; +static const __device__ float2 lut_sp_12_1728[144*2] = { + 
T_2_0,T_1728_1,T_864_1,T_576_1,T_432_1,T_1728_5,T_288_1,T_1728_7,T_216_1,T_192_1,T_864_5,T_1728_11,T_144_1,T_1728_13,T_864_7,T_576_5,T_108_1,T_1728_17,T_96_1,T_1728_19,T_432_5,T_576_7,T_864_11,T_1728_23,T_72_1,T_1728_25,T_864_13,T_64_1,T_432_7,T_1728_29,T_288_5,T_1728_31,T_54_1,T_576_11,T_864_17,T_1728_35,T_48_1,T_1728_37,T_864_19,T_576_13,T_216_5,T_1728_41,T_288_7,T_1728_43,T_432_11,T_192_5,T_864_23,T_1728_47,T_36_1,T_1728_49,T_864_25,T_576_17,T_432_13,T_1728_53,T_32_1,T_1728_55,T_216_7,T_576_19,T_864_29,T_1728_59,T_144_5,T_1728_61,T_864_31,T_192_7,T_27_1,T_1728_65,T_288_11,T_1728_67,T_432_17,T_576_23,T_864_35,T_1728_71,T_24_1,T_1728_73,T_864_37,T_576_25,T_432_19,T_1728_77,T_288_13,T_1728_79,T_108_5,T_64_3,T_864_41,T_1728_83,T_144_7,T_1728_85,T_864_43,T_576_29,T_216_11,T_1728_89,T_96_5,T_1728_91,T_432_23,T_576_31,T_864_47,T_1728_95,T_18_1,T_1728_97,T_864_49,T_192_11,T_432_25,T_1728_101,T_288_17,T_1728_103,T_216_13,T_576_35,T_864_53,T_1728_107,T_16_1,T_1728_109,T_864_55,T_576_37,T_108_7,T_1728_113,T_288_19,T_1728_115,T_432_29,T_192_13,T_864_59,T_1728_119,T_72_5,T_1728_121,T_864_61,T_576_41,T_432_31,T_1728_125,T_96_7,T_1728_127,T_27_2,T_576_43,T_864_65,T_1728_131,T_144_11,T_1728_133,T_864_67,T_64_5,T_216_17,T_1728_137,T_288_23,T_1728_139,T_432_35,T_576_47,T_864_71,T_1728_143,T_2_0,T_1728_7,T_864_7,T_576_7,T_432_7,T_1728_35,T_288_7,T_1728_49,T_216_7,T_192_7,T_864_35,T_1728_77,T_144_7,T_1728_91,T_864_49,T_576_35,T_108_7,T_1728_119,T_96_7,T_1728_133,T_432_35,T_576_49,T_864_77,T_1728_161,T_72_7,T_1728_175,T_864_91,T_64_7,T_432_49,T_1728_203,T_288_35,T_1728_217,T_54_7,T_576_77,T_864_119,T_1728_245,T_48_7,T_1728_259,T_864_133,T_576_91,T_216_35,T_1728_287,T_288_49,T_1728_301,T_432_77,T_192_35,T_864_161,T_1728_329,T_36_7,T_1728_343,T_864_175,T_576_119,T_432_91,T_1728_371,T_32_7,T_1728_385,T_216_49,T_576_133,T_864_203,T_1728_413,T_144_35,T_1728_427,T_864_217,T_192_49,T_27_7,T_1728_455,T_288_77,T_1728_469,T_432_119,T_576_161,T_864_245,T_1728_497,T_24_7,T_1728_511,T_864_259,T_5
76_175,T_432_133,T_1728_539,T_288_91,T_1728_553,T_108_35,T_64_21,T_864_287,T_1728_581,T_144_49,T_1728_595,T_864_301,T_576_203,T_216_77,T_1728_623,T_96_35,T_1728_637,T_432_161,T_576_217,T_864_329,T_1728_665,T_18_7,T_1728_679,T_864_343,T_192_77,T_432_175,T_1728_707,T_288_119,T_1728_721,T_216_91,T_576_245,T_864_371,T_1728_749,T_16_7,T_1728_763,T_864_385,T_576_259,T_108_49,T_1728_791,T_288_133,T_1728_805,T_432_203,T_192_91,T_864_413,T_1728_833,T_72_35,T_1728_847,T_864_427,T_576_287,T_432_217,T_1728_875,T_96_49,T_1728_889,T_27_14,T_576_301,T_864_455,T_1728_917,T_144_77,T_1728_931,T_864_469,T_64_35,T_216_119,T_1728_959,T_288_161,T_1728_973,T_432_245,T_576_329,T_864_497,T_1728_1001 +}; +static const __device__ float2 lut_sp_2_2048[1024*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_129,T_102
4_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_203,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_8_1,T_2048_257,T_1024_129,T_2048_259,T_512_65,T_2048_261,T_1024_131,T_2048_263,T_256_33,T_2048_265,T_1024_133,T_2048_267,T_512_67,T_2048_269,T_1024_135,T_2048_271,T_128_17,T_2048_273,T_1024_137,T_2048_275,T_512_69,T_2048_277,T_1024_139,T_2048_279,T_256_35,T_2048_281,T_1024_141,T_2048_283,T_512_71,T_2048_285,T_1024_143,T_2048_287,T_64_9,T_2048_289,T_1024_145,T_2048_291,T_512_73,T_2048_293,T_1024_147,T_2048_295,T_256_37,T_2048_297,T_1024_149,T_2048_299,T_512_75,T_2048_301,T_1024_151,T_2048_303,T_128_19,T_2048_305,T_1024_153,T_2048_307,T_512_77,T_2048_309,T_1024_155,T_2048_311,T_256_39,T_2048_313,T_1024_157,T_2048_315,T_512_79,T_2048_317,T_1024_159,T_2048_319,T_32_5,T_2048_321,T_1024_161,T_2048_323
,T_512_81,T_2048_325,T_1024_163,T_2048_327,T_256_41,T_2048_329,T_1024_165,T_2048_331,T_512_83,T_2048_333,T_1024_167,T_2048_335,T_128_21,T_2048_337,T_1024_169,T_2048_339,T_512_85,T_2048_341,T_1024_171,T_2048_343,T_256_43,T_2048_345,T_1024_173,T_2048_347,T_512_87,T_2048_349,T_1024_175,T_2048_351,T_64_11,T_2048_353,T_1024_177,T_2048_355,T_512_89,T_2048_357,T_1024_179,T_2048_359,T_256_45,T_2048_361,T_1024_181,T_2048_363,T_512_91,T_2048_365,T_1024_183,T_2048_367,T_128_23,T_2048_369,T_1024_185,T_2048_371,T_512_93,T_2048_373,T_1024_187,T_2048_375,T_256_47,T_2048_377,T_1024_189,T_2048_379,T_512_95,T_2048_381,T_1024_191,T_2048_383,T_16_3,T_2048_385,T_1024_193,T_2048_387,T_512_97,T_2048_389,T_1024_195,T_2048_391,T_256_49,T_2048_393,T_1024_197,T_2048_395,T_512_99,T_2048_397,T_1024_199,T_2048_399,T_128_25,T_2048_401,T_1024_201,T_2048_403,T_512_101,T_2048_405,T_1024_203,T_2048_407,T_256_51,T_2048_409,T_1024_205,T_2048_411,T_512_103,T_2048_413,T_1024_207,T_2048_415,T_64_13,T_2048_417,T_1024_209,T_2048_419,T_512_105,T_2048_421,T_1024_211,T_2048_423,T_256_53,T_2048_425,T_1024_213,T_2048_427,T_512_107,T_2048_429,T_1024_215,T_2048_431,T_128_27,T_2048_433,T_1024_217,T_2048_435,T_512_109,T_2048_437,T_1024_219,T_2048_439,T_256_55,T_2048_441,T_1024_221,T_2048_443,T_512_111,T_2048_445,T_1024_223,T_2048_447,T_32_7,T_2048_449,T_1024_225,T_2048_451,T_512_113,T_2048_453,T_1024_227,T_2048_455,T_256_57,T_2048_457,T_1024_229,T_2048_459,T_512_115,T_2048_461,T_1024_231,T_2048_463,T_128_29,T_2048_465,T_1024_233,T_2048_467,T_512_117,T_2048_469,T_1024_235,T_2048_471,T_256_59,T_2048_473,T_1024_237,T_2048_475,T_512_119,T_2048_477,T_1024_239,T_2048_479,T_64_15,T_2048_481,T_1024_241,T_2048_483,T_512_121,T_2048_485,T_1024_243,T_2048_487,T_256_61,T_2048_489,T_1024_245,T_2048_491,T_512_123,T_2048_493,T_1024_247,T_2048_495,T_128_31,T_2048_497,T_1024_249,T_2048_499,T_512_125,T_2048_501,T_1024_251,T_2048_503,T_256_63,T_2048_505,T_1024_253,T_2048_507,T_512_127,T_2048_509,T_1024_255,T_2048_511,T_4_1,T_2048_513,T
_1024_257,T_2048_515,T_512_129,T_2048_517,T_1024_259,T_2048_519,T_256_65,T_2048_521,T_1024_261,T_2048_523,T_512_131,T_2048_525,T_1024_263,T_2048_527,T_128_33,T_2048_529,T_1024_265,T_2048_531,T_512_133,T_2048_533,T_1024_267,T_2048_535,T_256_67,T_2048_537,T_1024_269,T_2048_539,T_512_135,T_2048_541,T_1024_271,T_2048_543,T_64_17,T_2048_545,T_1024_273,T_2048_547,T_512_137,T_2048_549,T_1024_275,T_2048_551,T_256_69,T_2048_553,T_1024_277,T_2048_555,T_512_139,T_2048_557,T_1024_279,T_2048_559,T_128_35,T_2048_561,T_1024_281,T_2048_563,T_512_141,T_2048_565,T_1024_283,T_2048_567,T_256_71,T_2048_569,T_1024_285,T_2048_571,T_512_143,T_2048_573,T_1024_287,T_2048_575,T_32_9,T_2048_577,T_1024_289,T_2048_579,T_512_145,T_2048_581,T_1024_291,T_2048_583,T_256_73,T_2048_585,T_1024_293,T_2048_587,T_512_147,T_2048_589,T_1024_295,T_2048_591,T_128_37,T_2048_593,T_1024_297,T_2048_595,T_512_149,T_2048_597,T_1024_299,T_2048_599,T_256_75,T_2048_601,T_1024_301,T_2048_603,T_512_151,T_2048_605,T_1024_303,T_2048_607,T_64_19,T_2048_609,T_1024_305,T_2048_611,T_512_153,T_2048_613,T_1024_307,T_2048_615,T_256_77,T_2048_617,T_1024_309,T_2048_619,T_512_155,T_2048_621,T_1024_311,T_2048_623,T_128_39,T_2048_625,T_1024_313,T_2048_627,T_512_157,T_2048_629,T_1024_315,T_2048_631,T_256_79,T_2048_633,T_1024_317,T_2048_635,T_512_159,T_2048_637,T_1024_319,T_2048_639,T_16_5,T_2048_641,T_1024_321,T_2048_643,T_512_161,T_2048_645,T_1024_323,T_2048_647,T_256_81,T_2048_649,T_1024_325,T_2048_651,T_512_163,T_2048_653,T_1024_327,T_2048_655,T_128_41,T_2048_657,T_1024_329,T_2048_659,T_512_165,T_2048_661,T_1024_331,T_2048_663,T_256_83,T_2048_665,T_1024_333,T_2048_667,T_512_167,T_2048_669,T_1024_335,T_2048_671,T_64_21,T_2048_673,T_1024_337,T_2048_675,T_512_169,T_2048_677,T_1024_339,T_2048_679,T_256_85,T_2048_681,T_1024_341,T_2048_683,T_512_171,T_2048_685,T_1024_343,T_2048_687,T_128_43,T_2048_689,T_1024_345,T_2048_691,T_512_173,T_2048_693,T_1024_347,T_2048_695,T_256_87,T_2048_697,T_1024_349,T_2048_699,T_512_175,T_2048_701,T_1024_351
,T_2048_703,T_32_11,T_2048_705,T_1024_353,T_2048_707,T_512_177,T_2048_709,T_1024_355,T_2048_711,T_256_89,T_2048_713,T_1024_357,T_2048_715,T_512_179,T_2048_717,T_1024_359,T_2048_719,T_128_45,T_2048_721,T_1024_361,T_2048_723,T_512_181,T_2048_725,T_1024_363,T_2048_727,T_256_91,T_2048_729,T_1024_365,T_2048_731,T_512_183,T_2048_733,T_1024_367,T_2048_735,T_64_23,T_2048_737,T_1024_369,T_2048_739,T_512_185,T_2048_741,T_1024_371,T_2048_743,T_256_93,T_2048_745,T_1024_373,T_2048_747,T_512_187,T_2048_749,T_1024_375,T_2048_751,T_128_47,T_2048_753,T_1024_377,T_2048_755,T_512_189,T_2048_757,T_1024_379,T_2048_759,T_256_95,T_2048_761,T_1024_381,T_2048_763,T_512_191,T_2048_765,T_1024_383,T_2048_767,T_8_3,T_2048_769,T_1024_385,T_2048_771,T_512_193,T_2048_773,T_1024_387,T_2048_775,T_256_97,T_2048_777,T_1024_389,T_2048_779,T_512_195,T_2048_781,T_1024_391,T_2048_783,T_128_49,T_2048_785,T_1024_393,T_2048_787,T_512_197,T_2048_789,T_1024_395,T_2048_791,T_256_99,T_2048_793,T_1024_397,T_2048_795,T_512_199,T_2048_797,T_1024_399,T_2048_799,T_64_25,T_2048_801,T_1024_401,T_2048_803,T_512_201,T_2048_805,T_1024_403,T_2048_807,T_256_101,T_2048_809,T_1024_405,T_2048_811,T_512_203,T_2048_813,T_1024_407,T_2048_815,T_128_51,T_2048_817,T_1024_409,T_2048_819,T_512_205,T_2048_821,T_1024_411,T_2048_823,T_256_103,T_2048_825,T_1024_413,T_2048_827,T_512_207,T_2048_829,T_1024_415,T_2048_831,T_32_13,T_2048_833,T_1024_417,T_2048_835,T_512_209,T_2048_837,T_1024_419,T_2048_839,T_256_105,T_2048_841,T_1024_421,T_2048_843,T_512_211,T_2048_845,T_1024_423,T_2048_847,T_128_53,T_2048_849,T_1024_425,T_2048_851,T_512_213,T_2048_853,T_1024_427,T_2048_855,T_256_107,T_2048_857,T_1024_429,T_2048_859,T_512_215,T_2048_861,T_1024_431,T_2048_863,T_64_27,T_2048_865,T_1024_433,T_2048_867,T_512_217,T_2048_869,T_1024_435,T_2048_871,T_256_109,T_2048_873,T_1024_437,T_2048_875,T_512_219,T_2048_877,T_1024_439,T_2048_879,T_128_55,T_2048_881,T_1024_441,T_2048_883,T_512_221,T_2048_885,T_1024_443,T_2048_887,T_256_111,T_2048_889,T_1024_445,T_20
48_891,T_512_223,T_2048_893,T_1024_447,T_2048_895,T_16_7,T_2048_897,T_1024_449,T_2048_899,T_512_225,T_2048_901,T_1024_451,T_2048_903,T_256_113,T_2048_905,T_1024_453,T_2048_907,T_512_227,T_2048_909,T_1024_455,T_2048_911,T_128_57,T_2048_913,T_1024_457,T_2048_915,T_512_229,T_2048_917,T_1024_459,T_2048_919,T_256_115,T_2048_921,T_1024_461,T_2048_923,T_512_231,T_2048_925,T_1024_463,T_2048_927,T_64_29,T_2048_929,T_1024_465,T_2048_931,T_512_233,T_2048_933,T_1024_467,T_2048_935,T_256_117,T_2048_937,T_1024_469,T_2048_939,T_512_235,T_2048_941,T_1024_471,T_2048_943,T_128_59,T_2048_945,T_1024_473,T_2048_947,T_512_237,T_2048_949,T_1024_475,T_2048_951,T_256_119,T_2048_953,T_1024_477,T_2048_955,T_512_239,T_2048_957,T_1024_479,T_2048_959,T_32_15,T_2048_961,T_1024_481,T_2048_963,T_512_241,T_2048_965,T_1024_483,T_2048_967,T_256_121,T_2048_969,T_1024_485,T_2048_971,T_512_243,T_2048_973,T_1024_487,T_2048_975,T_128_61,T_2048_977,T_1024_489,T_2048_979,T_512_245,T_2048_981,T_1024_491,T_2048_983,T_256_123,T_2048_985,T_1024_493,T_2048_987,T_512_247,T_2048_989,T_1024_495,T_2048_991,T_64_31,T_2048_993,T_1024_497,T_2048_995,T_512_249,T_2048_997,T_1024_499,T_2048_999,T_256_125,T_2048_1001,T_1024_501,T_2048_1003,T_512_251,T_2048_1005,T_1024_503,T_2048_1007,T_128_63,T_2048_1009,T_1024_505,T_2048_1011,T_512_253,T_2048_1013,T_1024_507,T_2048_1015,T_256_127,T_2048_1017,T_1024_509,T_2048_1019,T_512_255,T_2048_1021,T_1024_511,T_2048_1023,T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_102
4_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_1024_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_1024_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_4_1,T_1024_257,T_512_129,T_1024_259,T_256_65,T_1024_261,T_512_131,T_1024_263,T_128_33,T_1024_265,T_5
12_133,T_1024_267,T_256_67,T_1024_269,T_512_135,T_1024_271,T_64_17,T_1024_273,T_512_137,T_1024_275,T_256_69,T_1024_277,T_512_139,T_1024_279,T_128_35,T_1024_281,T_512_141,T_1024_283,T_256_71,T_1024_285,T_512_143,T_1024_287,T_32_9,T_1024_289,T_512_145,T_1024_291,T_256_73,T_1024_293,T_512_147,T_1024_295,T_128_37,T_1024_297,T_512_149,T_1024_299,T_256_75,T_1024_301,T_512_151,T_1024_303,T_64_19,T_1024_305,T_512_153,T_1024_307,T_256_77,T_1024_309,T_512_155,T_1024_311,T_128_39,T_1024_313,T_512_157,T_1024_315,T_256_79,T_1024_317,T_512_159,T_1024_319,T_16_5,T_1024_321,T_512_161,T_1024_323,T_256_81,T_1024_325,T_512_163,T_1024_327,T_128_41,T_1024_329,T_512_165,T_1024_331,T_256_83,T_1024_333,T_512_167,T_1024_335,T_64_21,T_1024_337,T_512_169,T_1024_339,T_256_85,T_1024_341,T_512_171,T_1024_343,T_128_43,T_1024_345,T_512_173,T_1024_347,T_256_87,T_1024_349,T_512_175,T_1024_351,T_32_11,T_1024_353,T_512_177,T_1024_355,T_256_89,T_1024_357,T_512_179,T_1024_359,T_128_45,T_1024_361,T_512_181,T_1024_363,T_256_91,T_1024_365,T_512_183,T_1024_367,T_64_23,T_1024_369,T_512_185,T_1024_371,T_256_93,T_1024_373,T_512_187,T_1024_375,T_128_47,T_1024_377,T_512_189,T_1024_379,T_256_95,T_1024_381,T_512_191,T_1024_383,T_8_3,T_1024_385,T_512_193,T_1024_387,T_256_97,T_1024_389,T_512_195,T_1024_391,T_128_49,T_1024_393,T_512_197,T_1024_395,T_256_99,T_1024_397,T_512_199,T_1024_399,T_64_25,T_1024_401,T_512_201,T_1024_403,T_256_101,T_1024_405,T_512_203,T_1024_407,T_128_51,T_1024_409,T_512_205,T_1024_411,T_256_103,T_1024_413,T_512_207,T_1024_415,T_32_13,T_1024_417,T_512_209,T_1024_419,T_256_105,T_1024_421,T_512_211,T_1024_423,T_128_53,T_1024_425,T_512_213,T_1024_427,T_256_107,T_1024_429,T_512_215,T_1024_431,T_64_27,T_1024_433,T_512_217,T_1024_435,T_256_109,T_1024_437,T_512_219,T_1024_439,T_128_55,T_1024_441,T_512_221,T_1024_443,T_256_111,T_1024_445,T_512_223,T_1024_447,T_16_7,T_1024_449,T_512_225,T_1024_451,T_256_113,T_1024_453,T_512_227,T_1024_455,T_128_57,T_1024_457,T_512_229,T_1024_459,T_256_115,T_1024_461,T_5
12_231,T_1024_463,T_64_29,T_1024_465,T_512_233,T_1024_467,T_256_117,T_1024_469,T_512_235,T_1024_471,T_128_59,T_1024_473,T_512_237,T_1024_475,T_256_119,T_1024_477,T_512_239,T_1024_479,T_32_15,T_1024_481,T_512_241,T_1024_483,T_256_121,T_1024_485,T_512_243,T_1024_487,T_128_61,T_1024_489,T_512_245,T_1024_491,T_256_123,T_1024_493,T_512_247,T_1024_495,T_64_31,T_1024_497,T_512_249,T_1024_499,T_256_125,T_1024_501,T_512_251,T_1024_503,T_128_63,T_1024_505,T_512_253,T_1024_507,T_256_127,T_1024_509,T_512_255,T_1024_511,T_2_1,T_1024_513,T_512_257,T_1024_515,T_256_129,T_1024_517,T_512_259,T_1024_519,T_128_65,T_1024_521,T_512_261,T_1024_523,T_256_131,T_1024_525,T_512_263,T_1024_527,T_64_33,T_1024_529,T_512_265,T_1024_531,T_256_133,T_1024_533,T_512_267,T_1024_535,T_128_67,T_1024_537,T_512_269,T_1024_539,T_256_135,T_1024_541,T_512_271,T_1024_543,T_32_17,T_1024_545,T_512_273,T_1024_547,T_256_137,T_1024_549,T_512_275,T_1024_551,T_128_69,T_1024_553,T_512_277,T_1024_555,T_256_139,T_1024_557,T_512_279,T_1024_559,T_64_35,T_1024_561,T_512_281,T_1024_563,T_256_141,T_1024_565,T_512_283,T_1024_567,T_128_71,T_1024_569,T_512_285,T_1024_571,T_256_143,T_1024_573,T_512_287,T_1024_575,T_16_9,T_1024_577,T_512_289,T_1024_579,T_256_145,T_1024_581,T_512_291,T_1024_583,T_128_73,T_1024_585,T_512_293,T_1024_587,T_256_147,T_1024_589,T_512_295,T_1024_591,T_64_37,T_1024_593,T_512_297,T_1024_595,T_256_149,T_1024_597,T_512_299,T_1024_599,T_128_75,T_1024_601,T_512_301,T_1024_603,T_256_151,T_1024_605,T_512_303,T_1024_607,T_32_19,T_1024_609,T_512_305,T_1024_611,T_256_153,T_1024_613,T_512_307,T_1024_615,T_128_77,T_1024_617,T_512_309,T_1024_619,T_256_155,T_1024_621,T_512_311,T_1024_623,T_64_39,T_1024_625,T_512_313,T_1024_627,T_256_157,T_1024_629,T_512_315,T_1024_631,T_128_79,T_1024_633,T_512_317,T_1024_635,T_256_159,T_1024_637,T_512_319,T_1024_639,T_8_5,T_1024_641,T_512_321,T_1024_643,T_256_161,T_1024_645,T_512_323,T_1024_647,T_128_81,T_1024_649,T_512_325,T_1024_651,T_256_163,T_1024_653,T_512_327,T_1024_655,T_64_41
,T_1024_657,T_512_329,T_1024_659,T_256_165,T_1024_661,T_512_331,T_1024_663,T_128_83,T_1024_665,T_512_333,T_1024_667,T_256_167,T_1024_669,T_512_335,T_1024_671,T_32_21,T_1024_673,T_512_337,T_1024_675,T_256_169,T_1024_677,T_512_339,T_1024_679,T_128_85,T_1024_681,T_512_341,T_1024_683,T_256_171,T_1024_685,T_512_343,T_1024_687,T_64_43,T_1024_689,T_512_345,T_1024_691,T_256_173,T_1024_693,T_512_347,T_1024_695,T_128_87,T_1024_697,T_512_349,T_1024_699,T_256_175,T_1024_701,T_512_351,T_1024_703,T_16_11,T_1024_705,T_512_353,T_1024_707,T_256_177,T_1024_709,T_512_355,T_1024_711,T_128_89,T_1024_713,T_512_357,T_1024_715,T_256_179,T_1024_717,T_512_359,T_1024_719,T_64_45,T_1024_721,T_512_361,T_1024_723,T_256_181,T_1024_725,T_512_363,T_1024_727,T_128_91,T_1024_729,T_512_365,T_1024_731,T_256_183,T_1024_733,T_512_367,T_1024_735,T_32_23,T_1024_737,T_512_369,T_1024_739,T_256_185,T_1024_741,T_512_371,T_1024_743,T_128_93,T_1024_745,T_512_373,T_1024_747,T_256_187,T_1024_749,T_512_375,T_1024_751,T_64_47,T_1024_753,T_512_377,T_1024_755,T_256_189,T_1024_757,T_512_379,T_1024_759,T_128_95,T_1024_761,T_512_381,T_1024_763,T_256_191,T_1024_765,T_512_383,T_1024_767,T_4_3,T_1024_769,T_512_385,T_1024_771,T_256_193,T_1024_773,T_512_387,T_1024_775,T_128_97,T_1024_777,T_512_389,T_1024_779,T_256_195,T_1024_781,T_512_391,T_1024_783,T_64_49,T_1024_785,T_512_393,T_1024_787,T_256_197,T_1024_789,T_512_395,T_1024_791,T_128_99,T_1024_793,T_512_397,T_1024_795,T_256_199,T_1024_797,T_512_399,T_1024_799,T_32_25,T_1024_801,T_512_401,T_1024_803,T_256_201,T_1024_805,T_512_403,T_1024_807,T_128_101,T_1024_809,T_512_405,T_1024_811,T_256_203,T_1024_813,T_512_407,T_1024_815,T_64_51,T_1024_817,T_512_409,T_1024_819,T_256_205,T_1024_821,T_512_411,T_1024_823,T_128_103,T_1024_825,T_512_413,T_1024_827,T_256_207,T_1024_829,T_512_415,T_1024_831,T_16_13,T_1024_833,T_512_417,T_1024_835,T_256_209,T_1024_837,T_512_419,T_1024_839,T_128_105,T_1024_841,T_512_421,T_1024_843,T_256_211,T_1024_845,T_512_423,T_1024_847,T_64_53,T_1024_849,T_512_4
25,T_1024_851,T_256_213,T_1024_853,T_512_427,T_1024_855,T_128_107,T_1024_857,T_512_429,T_1024_859,T_256_215,T_1024_861,T_512_431,T_1024_863,T_32_27,T_1024_865,T_512_433,T_1024_867,T_256_217,T_1024_869,T_512_435,T_1024_871,T_128_109,T_1024_873,T_512_437,T_1024_875,T_256_219,T_1024_877,T_512_439,T_1024_879,T_64_55,T_1024_881,T_512_441,T_1024_883,T_256_221,T_1024_885,T_512_443,T_1024_887,T_128_111,T_1024_889,T_512_445,T_1024_891,T_256_223,T_1024_893,T_512_447,T_1024_895,T_8_7,T_1024_897,T_512_449,T_1024_899,T_256_225,T_1024_901,T_512_451,T_1024_903,T_128_113,T_1024_905,T_512_453,T_1024_907,T_256_227,T_1024_909,T_512_455,T_1024_911,T_64_57,T_1024_913,T_512_457,T_1024_915,T_256_229,T_1024_917,T_512_459,T_1024_919,T_128_115,T_1024_921,T_512_461,T_1024_923,T_256_231,T_1024_925,T_512_463,T_1024_927,T_32_29,T_1024_929,T_512_465,T_1024_931,T_256_233,T_1024_933,T_512_467,T_1024_935,T_128_117,T_1024_937,T_512_469,T_1024_939,T_256_235,T_1024_941,T_512_471,T_1024_943,T_64_59,T_1024_945,T_512_473,T_1024_947,T_256_237,T_1024_949,T_512_475,T_1024_951,T_128_119,T_1024_953,T_512_477,T_1024_955,T_256_239,T_1024_957,T_512_479,T_1024_959,T_16_15,T_1024_961,T_512_481,T_1024_963,T_256_241,T_1024_965,T_512_483,T_1024_967,T_128_121,T_1024_969,T_512_485,T_1024_971,T_256_243,T_1024_973,T_512_487,T_1024_975,T_64_61,T_1024_977,T_512_489,T_1024_979,T_256_245,T_1024_981,T_512_491,T_1024_983,T_128_123,T_1024_985,T_512_493,T_1024_987,T_256_247,T_1024_989,T_512_495,T_1024_991,T_32_31,T_1024_993,T_512_497,T_1024_995,T_256_249,T_1024_997,T_512_499,T_1024_999,T_128_125,T_1024_1001,T_512_501,T_1024_1003,T_256_251,T_1024_1005,T_512_503,T_1024_1007,T_64_63,T_1024_1009,T_512_505,T_1024_1011,T_256_253,T_1024_1013,T_512_507,T_1024_1015,T_128_127,T_1024_1017,T_512_509,T_1024_1019,T_256_255,T_1024_1021,T_512_511,T_1024_1023 +}; +static const __device__ float2 lut_sp_4_2048[512*2] = { + 
T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_129,T_1024_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_20
3,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_8_1,T_2048_257,T_1024_129,T_2048_259,T_512_65,T_2048_261,T_1024_131,T_2048_263,T_256_33,T_2048_265,T_1024_133,T_2048_267,T_512_67,T_2048_269,T_1024_135,T_2048_271,T_128_17,T_2048_273,T_1024_137,T_2048_275,T_512_69,T_2048_277,T_1024_139,T_2048_279,T_256_35,T_2048_281,T_1024_141,T_2048_283,T_512_71,T_2048_285,T_1024_143,T_2048_287,T_64_9,T_2048_289,T_1024_145,T_2048_291,T_512_73,T_2048_293,T_1024_147,T_2048_295,T_256_37,T_2048_297,T_1024_149,T_2048_299,T_512_75,T_2048_301,T_1024_151,T_2048_303,T_128_19,T_2048_305,T_1024_153,T_2048_307,T_512_77,T_2048_309,T_1024_155,T_2048_311,T_256_39,T_2048_313,T_1024_157,T_2048_315,T_512_79,T_2048_317,T_1024_159,T_2048_319,T_32_5,T_2048_321,T_1024_161,T_2048_323,T_512_81,T_2048_325,T_1024_163,T_2048_327,T_256_41,T_2048_329,T_1024_165,T_2048_331,T_512_83,T_2048_333,T_1024_167,T_2048_335,T_128_21,T_2048_337,T_1024_169,T_2048_339,T_512_85,T_2048_341,T_1024_171,T_2048_343,T_256_43,T_2048_345,T_1024_173,T_2048_347,T_512_87,T_2048_349,T_1024_175,T_2048_351,T_64_11,T_2048_353,T_1024_177,T_2048_355,T_512_89,T_2048_357,T_1024_179,T_2048_359,T_256_45,T_2048_361,T_1024_181,T_2048_363,T_512_91,T_2048_365,T_1024_183,T_2048_367,T_128_23,T_2048_369,T_1024_185,T_2048_371,T_512_93,T_2048_373,T_1024_187,T_2048_375,T_256_47,T_2048_377,T_1024_189,T_2048_379,T_512_95,T_2048_381,T_1024_191,T_2048_383,T_16_3,T_2048_385,T_1024_193,T_2048_387,T_512_97,T_2048_389,T_1024_195,T_2048_391,T_256_49,T_2048_393,T_1024_197,T_204
8_395,T_512_99,T_2048_397,T_1024_199,T_2048_399,T_128_25,T_2048_401,T_1024_201,T_2048_403,T_512_101,T_2048_405,T_1024_203,T_2048_407,T_256_51,T_2048_409,T_1024_205,T_2048_411,T_512_103,T_2048_413,T_1024_207,T_2048_415,T_64_13,T_2048_417,T_1024_209,T_2048_419,T_512_105,T_2048_421,T_1024_211,T_2048_423,T_256_53,T_2048_425,T_1024_213,T_2048_427,T_512_107,T_2048_429,T_1024_215,T_2048_431,T_128_27,T_2048_433,T_1024_217,T_2048_435,T_512_109,T_2048_437,T_1024_219,T_2048_439,T_256_55,T_2048_441,T_1024_221,T_2048_443,T_512_111,T_2048_445,T_1024_223,T_2048_447,T_32_7,T_2048_449,T_1024_225,T_2048_451,T_512_113,T_2048_453,T_1024_227,T_2048_455,T_256_57,T_2048_457,T_1024_229,T_2048_459,T_512_115,T_2048_461,T_1024_231,T_2048_463,T_128_29,T_2048_465,T_1024_233,T_2048_467,T_512_117,T_2048_469,T_1024_235,T_2048_471,T_256_59,T_2048_473,T_1024_237,T_2048_475,T_512_119,T_2048_477,T_1024_239,T_2048_479,T_64_15,T_2048_481,T_1024_241,T_2048_483,T_512_121,T_2048_485,T_1024_243,T_2048_487,T_256_61,T_2048_489,T_1024_245,T_2048_491,T_512_123,T_2048_493,T_1024_247,T_2048_495,T_128_31,T_2048_497,T_1024_249,T_2048_499,T_512_125,T_2048_501,T_1024_251,T_2048_503,T_256_63,T_2048_505,T_1024_253,T_2048_507,T_512_127,T_2048_509,T_1024_255,T_2048_511,T_2_0,T_2048_3,T_1024_3,T_2048_9,T_512_3,T_2048_15,T_1024_9,T_2048_21,T_256_3,T_2048_27,T_1024_15,T_2048_33,T_512_9,T_2048_39,T_1024_21,T_2048_45,T_128_3,T_2048_51,T_1024_27,T_2048_57,T_512_15,T_2048_63,T_1024_33,T_2048_69,T_256_9,T_2048_75,T_1024_39,T_2048_81,T_512_21,T_2048_87,T_1024_45,T_2048_93,T_64_3,T_2048_99,T_1024_51,T_2048_105,T_512_27,T_2048_111,T_1024_57,T_2048_117,T_256_15,T_2048_123,T_1024_63,T_2048_129,T_512_33,T_2048_135,T_1024_69,T_2048_141,T_128_9,T_2048_147,T_1024_75,T_2048_153,T_512_39,T_2048_159,T_1024_81,T_2048_165,T_256_21,T_2048_171,T_1024_87,T_2048_177,T_512_45,T_2048_183,T_1024_93,T_2048_189,T_32_3,T_2048_195,T_1024_99,T_2048_201,T_512_51,T_2048_207,T_1024_105,T_2048_213,T_256_27,T_2048_219,T_1024_111,T_2048_225,T_512_57,T_2048_231
,T_1024_117,T_2048_237,T_128_15,T_2048_243,T_1024_123,T_2048_249,T_512_63,T_2048_255,T_1024_129,T_2048_261,T_256_33,T_2048_267,T_1024_135,T_2048_273,T_512_69,T_2048_279,T_1024_141,T_2048_285,T_64_9,T_2048_291,T_1024_147,T_2048_297,T_512_75,T_2048_303,T_1024_153,T_2048_309,T_256_39,T_2048_315,T_1024_159,T_2048_321,T_512_81,T_2048_327,T_1024_165,T_2048_333,T_128_21,T_2048_339,T_1024_171,T_2048_345,T_512_87,T_2048_351,T_1024_177,T_2048_357,T_256_45,T_2048_363,T_1024_183,T_2048_369,T_512_93,T_2048_375,T_1024_189,T_2048_381,T_16_3,T_2048_387,T_1024_195,T_2048_393,T_512_99,T_2048_399,T_1024_201,T_2048_405,T_256_51,T_2048_411,T_1024_207,T_2048_417,T_512_105,T_2048_423,T_1024_213,T_2048_429,T_128_27,T_2048_435,T_1024_219,T_2048_441,T_512_111,T_2048_447,T_1024_225,T_2048_453,T_256_57,T_2048_459,T_1024_231,T_2048_465,T_512_117,T_2048_471,T_1024_237,T_2048_477,T_64_15,T_2048_483,T_1024_243,T_2048_489,T_512_123,T_2048_495,T_1024_249,T_2048_501,T_256_63,T_2048_507,T_1024_255,T_2048_513,T_512_129,T_2048_519,T_1024_261,T_2048_525,T_128_33,T_2048_531,T_1024_267,T_2048_537,T_512_135,T_2048_543,T_1024_273,T_2048_549,T_256_69,T_2048_555,T_1024_279,T_2048_561,T_512_141,T_2048_567,T_1024_285,T_2048_573,T_32_9,T_2048_579,T_1024_291,T_2048_585,T_512_147,T_2048_591,T_1024_297,T_2048_597,T_256_75,T_2048_603,T_1024_303,T_2048_609,T_512_153,T_2048_615,T_1024_309,T_2048_621,T_128_39,T_2048_627,T_1024_315,T_2048_633,T_512_159,T_2048_639,T_1024_321,T_2048_645,T_256_81,T_2048_651,T_1024_327,T_2048_657,T_512_165,T_2048_663,T_1024_333,T_2048_669,T_64_21,T_2048_675,T_1024_339,T_2048_681,T_512_171,T_2048_687,T_1024_345,T_2048_693,T_256_87,T_2048_699,T_1024_351,T_2048_705,T_512_177,T_2048_711,T_1024_357,T_2048_717,T_128_45,T_2048_723,T_1024_363,T_2048_729,T_512_183,T_2048_735,T_1024_369,T_2048_741,T_256_93,T_2048_747,T_1024_375,T_2048_753,T_512_189,T_2048_759,T_1024_381,T_2048_765,T_8_3,T_2048_771,T_1024_387,T_2048_777,T_512_195,T_2048_783,T_1024_393,T_2048_789,T_256_99,T_2048_795,T_1024_399,T_2048_80
1,T_512_201,T_2048_807,T_1024_405,T_2048_813,T_128_51,T_2048_819,T_1024_411,T_2048_825,T_512_207,T_2048_831,T_1024_417,T_2048_837,T_256_105,T_2048_843,T_1024_423,T_2048_849,T_512_213,T_2048_855,T_1024_429,T_2048_861,T_64_27,T_2048_867,T_1024_435,T_2048_873,T_512_219,T_2048_879,T_1024_441,T_2048_885,T_256_111,T_2048_891,T_1024_447,T_2048_897,T_512_225,T_2048_903,T_1024_453,T_2048_909,T_128_57,T_2048_915,T_1024_459,T_2048_921,T_512_231,T_2048_927,T_1024_465,T_2048_933,T_256_117,T_2048_939,T_1024_471,T_2048_945,T_512_237,T_2048_951,T_1024_477,T_2048_957,T_32_15,T_2048_963,T_1024_483,T_2048_969,T_512_243,T_2048_975,T_1024_489,T_2048_981,T_256_123,T_2048_987,T_1024_495,T_2048_993,T_512_249,T_2048_999,T_1024_501,T_2048_1005,T_128_63,T_2048_1011,T_1024_507,T_2048_1017,T_512_255,T_2048_1023,T_1024_513,T_2048_1029,T_256_129,T_2048_1035,T_1024_519,T_2048_1041,T_512_261,T_2048_1047,T_1024_525,T_2048_1053,T_64_33,T_2048_1059,T_1024_531,T_2048_1065,T_512_267,T_2048_1071,T_1024_537,T_2048_1077,T_256_135,T_2048_1083,T_1024_543,T_2048_1089,T_512_273,T_2048_1095,T_1024_549,T_2048_1101,T_128_69,T_2048_1107,T_1024_555,T_2048_1113,T_512_279,T_2048_1119,T_1024_561,T_2048_1125,T_256_141,T_2048_1131,T_1024_567,T_2048_1137,T_512_285,T_2048_1143,T_1024_573,T_2048_1149,T_16_9,T_2048_1155,T_1024_579,T_2048_1161,T_512_291,T_2048_1167,T_1024_585,T_2048_1173,T_256_147,T_2048_1179,T_1024_591,T_2048_1185,T_512_297,T_2048_1191,T_1024_597,T_2048_1197,T_128_75,T_2048_1203,T_1024_603,T_2048_1209,T_512_303,T_2048_1215,T_1024_609,T_2048_1221,T_256_153,T_2048_1227,T_1024_615,T_2048_1233,T_512_309,T_2048_1239,T_1024_621,T_2048_1245,T_64_39,T_2048_1251,T_1024_627,T_2048_1257,T_512_315,T_2048_1263,T_1024_633,T_2048_1269,T_256_159,T_2048_1275,T_1024_639,T_2048_1281,T_512_321,T_2048_1287,T_1024_645,T_2048_1293,T_128_81,T_2048_1299,T_1024_651,T_2048_1305,T_512_327,T_2048_1311,T_1024_657,T_2048_1317,T_256_165,T_2048_1323,T_1024_663,T_2048_1329,T_512_333,T_2048_1335,T_1024_669,T_2048_1341,T_32_21,T_2048_1347,T_1
024_675,T_2048_1353,T_512_339,T_2048_1359,T_1024_681,T_2048_1365,T_256_171,T_2048_1371,T_1024_687,T_2048_1377,T_512_345,T_2048_1383,T_1024_693,T_2048_1389,T_128_87,T_2048_1395,T_1024_699,T_2048_1401,T_512_351,T_2048_1407,T_1024_705,T_2048_1413,T_256_177,T_2048_1419,T_1024_711,T_2048_1425,T_512_357,T_2048_1431,T_1024_717,T_2048_1437,T_64_45,T_2048_1443,T_1024_723,T_2048_1449,T_512_363,T_2048_1455,T_1024_729,T_2048_1461,T_256_183,T_2048_1467,T_1024_735,T_2048_1473,T_512_369,T_2048_1479,T_1024_741,T_2048_1485,T_128_93,T_2048_1491,T_1024_747,T_2048_1497,T_512_375,T_2048_1503,T_1024_753,T_2048_1509,T_256_189,T_2048_1515,T_1024_759,T_2048_1521,T_512_381,T_2048_1527,T_1024_765,T_2048_1533 +}; +static const __device__ float2 lut_sp_8_2048[256*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_129
,T_1024_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_203,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_2_0,T_2048_5,T_1024_5,T_2048_15,T_512_5,T_2048_25,T_1024_15,T_2048_35,T_256_5,T_2048_45,T_1024_25,T_2048_55,T_512_15,T_2048_65,T_1024_35,T_2048_75,T_128_5,T_2048_85,T_1024_45,T_2048_95,T_512_25,T_2048_105,T_1024_55,T_2048_115,T_256_15,T_2048_125,T_1024_65,T_2048_135,T_512_35,T_2048_145,T_1024_75,T_2048_155,T_64_5,T_2048_165,T_1024_85,T_2048_175,T_512_45,T_2048_185,T_1024_95,T_2048_195,T_256_25,T_2048_205,T_1024_105,T_2048_215,T_512_55,T_2048_225,T_1024_115,T_2048_235,T_128_15,T_2048_245,T_1024_125,T_2048_255,T_512_65,T_2048_265,T_1024_135,T_2048_275,T_256_35,T_2048_285,T_1024_145,T_2048_295,T_512_75,T_2048_305,T_1024_155,T_2048_315,T_32_5,T_2048_325,T_1024_165,T_2048_335,T_512_85,T_2048_34
5,T_1024_175,T_2048_355,T_256_45,T_2048_365,T_1024_185,T_2048_375,T_512_95,T_2048_385,T_1024_195,T_2048_395,T_128_25,T_2048_405,T_1024_205,T_2048_415,T_512_105,T_2048_425,T_1024_215,T_2048_435,T_256_55,T_2048_445,T_1024_225,T_2048_455,T_512_115,T_2048_465,T_1024_235,T_2048_475,T_64_15,T_2048_485,T_1024_245,T_2048_495,T_512_125,T_2048_505,T_1024_255,T_2048_515,T_256_65,T_2048_525,T_1024_265,T_2048_535,T_512_135,T_2048_545,T_1024_275,T_2048_555,T_128_35,T_2048_565,T_1024_285,T_2048_575,T_512_145,T_2048_585,T_1024_295,T_2048_595,T_256_75,T_2048_605,T_1024_305,T_2048_615,T_512_155,T_2048_625,T_1024_315,T_2048_635,T_16_5,T_2048_645,T_1024_325,T_2048_655,T_512_165,T_2048_665,T_1024_335,T_2048_675,T_256_85,T_2048_685,T_1024_345,T_2048_695,T_512_175,T_2048_705,T_1024_355,T_2048_715,T_128_45,T_2048_725,T_1024_365,T_2048_735,T_512_185,T_2048_745,T_1024_375,T_2048_755,T_256_95,T_2048_765,T_1024_385,T_2048_775,T_512_195,T_2048_785,T_1024_395,T_2048_795,T_64_25,T_2048_805,T_1024_405,T_2048_815,T_512_205,T_2048_825,T_1024_415,T_2048_835,T_256_105,T_2048_845,T_1024_425,T_2048_855,T_512_215,T_2048_865,T_1024_435,T_2048_875,T_128_55,T_2048_885,T_1024_445,T_2048_895,T_512_225,T_2048_905,T_1024_455,T_2048_915,T_256_115,T_2048_925,T_1024_465,T_2048_935,T_512_235,T_2048_945,T_1024_475,T_2048_955,T_32_15,T_2048_965,T_1024_485,T_2048_975,T_512_245,T_2048_985,T_1024_495,T_2048_995,T_256_125,T_2048_1005,T_1024_505,T_2048_1015,T_512_255,T_2048_1025,T_1024_515,T_2048_1035,T_128_65,T_2048_1045,T_1024_525,T_2048_1055,T_512_265,T_2048_1065,T_1024_535,T_2048_1075,T_256_135,T_2048_1085,T_1024_545,T_2048_1095,T_512_275,T_2048_1105,T_1024_555,T_2048_1115,T_64_35,T_2048_1125,T_1024_565,T_2048_1135,T_512_285,T_2048_1145,T_1024_575,T_2048_1155,T_256_145,T_2048_1165,T_1024_585,T_2048_1175,T_512_295,T_2048_1185,T_1024_595,T_2048_1195,T_128_75,T_2048_1205,T_1024_605,T_2048_1215,T_512_305,T_2048_1225,T_1024_615,T_2048_1235,T_256_155,T_2048_1245,T_1024_625,T_2048_1255,T_512_315,T_2048_1265,T_1024_635,T_2048
_1275 +}; +static const __device__ float2 lut_sp_16_2048[128*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_2_0,T_2048_9,T_1024_9,T_2048_27,T_512_9,T_2048_45,T_1024_27,T_2048_63,T_256_9,T_2048_81,T_1024_45,T_2048_99,T_512_27,T_2048_117,T_1024_63,T_2048_135,T_128_9,T_2048_153,T_1024_81,T_2048_171,T_512_45,T_2048_189,T_1024_99,T_2048_207,T_256_27,T_2048_225,T_1024_117,T_2048_243,T_512_63,T_2048_261,T_1024_135,T_2048_279,T_64_9,T_2048_297,T_1024_153,T_2048_315,T_512_81,T_2048_333,T_1024_171,T_2048_351,T_256_45,T_2048_369,T_1024_189,T_2048_387,T_512_99,T_2048_405,T_1024_207,T_2048_423,T_128_27,T_2048_441,T_1024_225,T_2048_459,T_512_117,T_2048_477,T_1024_243,T_2048_495,T_256_63,T_2048_513,T_1024_261,T_2048_531,T_512_135,T_2048_549,T_1024_279,T_2048_567,T_32_9,T_2048_585,T_1024_297,T_2048_603,T_512_153
,T_2048_621,T_1024_315,T_2048_639,T_256_81,T_2048_657,T_1024_333,T_2048_675,T_512_171,T_2048_693,T_1024_351,T_2048_711,T_128_45,T_2048_729,T_1024_369,T_2048_747,T_512_189,T_2048_765,T_1024_387,T_2048_783,T_256_99,T_2048_801,T_1024_405,T_2048_819,T_512_207,T_2048_837,T_1024_423,T_2048_855,T_64_27,T_2048_873,T_1024_441,T_2048_891,T_512_225,T_2048_909,T_1024_459,T_2048_927,T_256_117,T_2048_945,T_1024_477,T_2048_963,T_512_243,T_2048_981,T_1024_495,T_2048_999,T_128_63,T_2048_1017,T_1024_513,T_2048_1035,T_512_261,T_2048_1053,T_1024_531,T_2048_1071,T_256_135,T_2048_1089,T_1024_549,T_2048_1107,T_512_279,T_2048_1125,T_1024_567,T_2048_1143 +}; +static const __device__ float2 lut_sp_32_2048[64*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_2_0,T_2048_17,T_1024_17,T_2048_51,T_512_17,T_2048_85,T_1024_51,T_2048_119,T_256_17,T_2048_153,T_1024_85,T_2048_187,T_512_51,T_2048_221,T_1024_119,T_2048_255,T_128_17,T_2048_289,T_1024_153,T_2048_323,T_512_85,T_2048_357,T_1024_187,T_2048_391,T_256_51,T_2048_425,T_1024_221,T_2048_459,T_512_119,T_2048_493,T_1024_255,T_2048_527,T_64_17,T_2048_561,T_1024_289,T_2048_595,T_512_153,T_2048_629,T_1024_323,T_2048_663,T_256_85,T_2048_697,T_1024_357,T_2048_731,T_512_187,T_2048_765,T_1024_391,T_2048_799,T_128_51,T_2048_833,T_1024_425,T_2048_867,T_512_221,T_2048_901,T_1024_459,T_2048_935,T_256_119,T_2048_969,T_1024_493,T_2048_1003,T_512_255,T_2048_1037,T_1024_527,T_2048_1071 +}; +static const __device__ 
float2 lut_sp_3_2187[729*2] = { + T_2_0,T_2187_1,T_2187_2,T_729_1,T_2187_4,T_2187_5,T_729_2,T_2187_7,T_2187_8,T_243_1,T_2187_10,T_2187_11,T_729_4,T_2187_13,T_2187_14,T_729_5,T_2187_16,T_2187_17,T_243_2,T_2187_19,T_2187_20,T_729_7,T_2187_22,T_2187_23,T_729_8,T_2187_25,T_2187_26,T_81_1,T_2187_28,T_2187_29,T_729_10,T_2187_31,T_2187_32,T_729_11,T_2187_34,T_2187_35,T_243_4,T_2187_37,T_2187_38,T_729_13,T_2187_40,T_2187_41,T_729_14,T_2187_43,T_2187_44,T_243_5,T_2187_46,T_2187_47,T_729_16,T_2187_49,T_2187_50,T_729_17,T_2187_52,T_2187_53,T_81_2,T_2187_55,T_2187_56,T_729_19,T_2187_58,T_2187_59,T_729_20,T_2187_61,T_2187_62,T_243_7,T_2187_64,T_2187_65,T_729_22,T_2187_67,T_2187_68,T_729_23,T_2187_70,T_2187_71,T_243_8,T_2187_73,T_2187_74,T_729_25,T_2187_76,T_2187_77,T_729_26,T_2187_79,T_2187_80,T_27_1,T_2187_82,T_2187_83,T_729_28,T_2187_85,T_2187_86,T_729_29,T_2187_88,T_2187_89,T_243_10,T_2187_91,T_2187_92,T_729_31,T_2187_94,T_2187_95,T_729_32,T_2187_97,T_2187_98,T_243_11,T_2187_100,T_2187_101,T_729_34,T_2187_103,T_2187_104,T_729_35,T_2187_106,T_2187_107,T_81_4,T_2187_109,T_2187_110,T_729_37,T_2187_112,T_2187_113,T_729_38,T_2187_115,T_2187_116,T_243_13,T_2187_118,T_2187_119,T_729_40,T_2187_121,T_2187_122,T_729_41,T_2187_124,T_2187_125,T_243_14,T_2187_127,T_2187_128,T_729_43,T_2187_130,T_2187_131,T_729_44,T_2187_133,T_2187_134,T_81_5,T_2187_136,T_2187_137,T_729_46,T_2187_139,T_2187_140,T_729_47,T_2187_142,T_2187_143,T_243_16,T_2187_145,T_2187_146,T_729_49,T_2187_148,T_2187_149,T_729_50,T_2187_151,T_2187_152,T_243_17,T_2187_154,T_2187_155,T_729_52,T_2187_157,T_2187_158,T_729_53,T_2187_160,T_2187_161,T_27_2,T_2187_163,T_2187_164,T_729_55,T_2187_166,T_2187_167,T_729_56,T_2187_169,T_2187_170,T_243_19,T_2187_172,T_2187_173,T_729_58,T_2187_175,T_2187_176,T_729_59,T_2187_178,T_2187_179,T_243_20,T_2187_181,T_2187_182,T_729_61,T_2187_184,T_2187_185,T_729_62,T_2187_187,T_2187_188,T_81_7,T_2187_190,T_2187_191,T_729_64,T_2187_193,T_2187_194,T_729_65,T_2187_196,T_2187_197,T_243_22,T_2187_199,T
_2187_200,T_729_67,T_2187_202,T_2187_203,T_729_68,T_2187_205,T_2187_206,T_243_23,T_2187_208,T_2187_209,T_729_70,T_2187_211,T_2187_212,T_729_71,T_2187_214,T_2187_215,T_81_8,T_2187_217,T_2187_218,T_729_73,T_2187_220,T_2187_221,T_729_74,T_2187_223,T_2187_224,T_243_25,T_2187_226,T_2187_227,T_729_76,T_2187_229,T_2187_230,T_729_77,T_2187_232,T_2187_233,T_243_26,T_2187_235,T_2187_236,T_729_79,T_2187_238,T_2187_239,T_729_80,T_2187_241,T_2187_242,T_9_1,T_2187_244,T_2187_245,T_729_82,T_2187_247,T_2187_248,T_729_83,T_2187_250,T_2187_251,T_243_28,T_2187_253,T_2187_254,T_729_85,T_2187_256,T_2187_257,T_729_86,T_2187_259,T_2187_260,T_243_29,T_2187_262,T_2187_263,T_729_88,T_2187_265,T_2187_266,T_729_89,T_2187_268,T_2187_269,T_81_10,T_2187_271,T_2187_272,T_729_91,T_2187_274,T_2187_275,T_729_92,T_2187_277,T_2187_278,T_243_31,T_2187_280,T_2187_281,T_729_94,T_2187_283,T_2187_284,T_729_95,T_2187_286,T_2187_287,T_243_32,T_2187_289,T_2187_290,T_729_97,T_2187_292,T_2187_293,T_729_98,T_2187_295,T_2187_296,T_81_11,T_2187_298,T_2187_299,T_729_100,T_2187_301,T_2187_302,T_729_101,T_2187_304,T_2187_305,T_243_34,T_2187_307,T_2187_308,T_729_103,T_2187_310,T_2187_311,T_729_104,T_2187_313,T_2187_314,T_243_35,T_2187_316,T_2187_317,T_729_106,T_2187_319,T_2187_320,T_729_107,T_2187_322,T_2187_323,T_27_4,T_2187_325,T_2187_326,T_729_109,T_2187_328,T_2187_329,T_729_110,T_2187_331,T_2187_332,T_243_37,T_2187_334,T_2187_335,T_729_112,T_2187_337,T_2187_338,T_729_113,T_2187_340,T_2187_341,T_243_38,T_2187_343,T_2187_344,T_729_115,T_2187_346,T_2187_347,T_729_116,T_2187_349,T_2187_350,T_81_13,T_2187_352,T_2187_353,T_729_118,T_2187_355,T_2187_356,T_729_119,T_2187_358,T_2187_359,T_243_40,T_2187_361,T_2187_362,T_729_121,T_2187_364,T_2187_365,T_729_122,T_2187_367,T_2187_368,T_243_41,T_2187_370,T_2187_371,T_729_124,T_2187_373,T_2187_374,T_729_125,T_2187_376,T_2187_377,T_81_14,T_2187_379,T_2187_380,T_729_127,T_2187_382,T_2187_383,T_729_128,T_2187_385,T_2187_386,T_243_43,T_2187_388,T_2187_389,T_729_130,T_2187_391,T_2187_
392,T_729_131,T_2187_394,T_2187_395,T_243_44,T_2187_397,T_2187_398,T_729_133,T_2187_400,T_2187_401,T_729_134,T_2187_403,T_2187_404,T_27_5,T_2187_406,T_2187_407,T_729_136,T_2187_409,T_2187_410,T_729_137,T_2187_412,T_2187_413,T_243_46,T_2187_415,T_2187_416,T_729_139,T_2187_418,T_2187_419,T_729_140,T_2187_421,T_2187_422,T_243_47,T_2187_424,T_2187_425,T_729_142,T_2187_427,T_2187_428,T_729_143,T_2187_430,T_2187_431,T_81_16,T_2187_433,T_2187_434,T_729_145,T_2187_436,T_2187_437,T_729_146,T_2187_439,T_2187_440,T_243_49,T_2187_442,T_2187_443,T_729_148,T_2187_445,T_2187_446,T_729_149,T_2187_448,T_2187_449,T_243_50,T_2187_451,T_2187_452,T_729_151,T_2187_454,T_2187_455,T_729_152,T_2187_457,T_2187_458,T_81_17,T_2187_460,T_2187_461,T_729_154,T_2187_463,T_2187_464,T_729_155,T_2187_466,T_2187_467,T_243_52,T_2187_469,T_2187_470,T_729_157,T_2187_472,T_2187_473,T_729_158,T_2187_475,T_2187_476,T_243_53,T_2187_478,T_2187_479,T_729_160,T_2187_481,T_2187_482,T_729_161,T_2187_484,T_2187_485,T_9_2,T_2187_487,T_2187_488,T_729_163,T_2187_490,T_2187_491,T_729_164,T_2187_493,T_2187_494,T_243_55,T_2187_496,T_2187_497,T_729_166,T_2187_499,T_2187_500,T_729_167,T_2187_502,T_2187_503,T_243_56,T_2187_505,T_2187_506,T_729_169,T_2187_508,T_2187_509,T_729_170,T_2187_511,T_2187_512,T_81_19,T_2187_514,T_2187_515,T_729_172,T_2187_517,T_2187_518,T_729_173,T_2187_520,T_2187_521,T_243_58,T_2187_523,T_2187_524,T_729_175,T_2187_526,T_2187_527,T_729_176,T_2187_529,T_2187_530,T_243_59,T_2187_532,T_2187_533,T_729_178,T_2187_535,T_2187_536,T_729_179,T_2187_538,T_2187_539,T_81_20,T_2187_541,T_2187_542,T_729_181,T_2187_544,T_2187_545,T_729_182,T_2187_547,T_2187_548,T_243_61,T_2187_550,T_2187_551,T_729_184,T_2187_553,T_2187_554,T_729_185,T_2187_556,T_2187_557,T_243_62,T_2187_559,T_2187_560,T_729_187,T_2187_562,T_2187_563,T_729_188,T_2187_565,T_2187_566,T_27_7,T_2187_568,T_2187_569,T_729_190,T_2187_571,T_2187_572,T_729_191,T_2187_574,T_2187_575,T_243_64,T_2187_577,T_2187_578,T_729_193,T_2187_580,T_2187_581,T_729_194,T_
2187_583,T_2187_584,T_243_65,T_2187_586,T_2187_587,T_729_196,T_2187_589,T_2187_590,T_729_197,T_2187_592,T_2187_593,T_81_22,T_2187_595,T_2187_596,T_729_199,T_2187_598,T_2187_599,T_729_200,T_2187_601,T_2187_602,T_243_67,T_2187_604,T_2187_605,T_729_202,T_2187_607,T_2187_608,T_729_203,T_2187_610,T_2187_611,T_243_68,T_2187_613,T_2187_614,T_729_205,T_2187_616,T_2187_617,T_729_206,T_2187_619,T_2187_620,T_81_23,T_2187_622,T_2187_623,T_729_208,T_2187_625,T_2187_626,T_729_209,T_2187_628,T_2187_629,T_243_70,T_2187_631,T_2187_632,T_729_211,T_2187_634,T_2187_635,T_729_212,T_2187_637,T_2187_638,T_243_71,T_2187_640,T_2187_641,T_729_214,T_2187_643,T_2187_644,T_729_215,T_2187_646,T_2187_647,T_27_8,T_2187_649,T_2187_650,T_729_217,T_2187_652,T_2187_653,T_729_218,T_2187_655,T_2187_656,T_243_73,T_2187_658,T_2187_659,T_729_220,T_2187_661,T_2187_662,T_729_221,T_2187_664,T_2187_665,T_243_74,T_2187_667,T_2187_668,T_729_223,T_2187_670,T_2187_671,T_729_224,T_2187_673,T_2187_674,T_81_25,T_2187_676,T_2187_677,T_729_226,T_2187_679,T_2187_680,T_729_227,T_2187_682,T_2187_683,T_243_76,T_2187_685,T_2187_686,T_729_229,T_2187_688,T_2187_689,T_729_230,T_2187_691,T_2187_692,T_243_77,T_2187_694,T_2187_695,T_729_232,T_2187_697,T_2187_698,T_729_233,T_2187_700,T_2187_701,T_81_26,T_2187_703,T_2187_704,T_729_235,T_2187_706,T_2187_707,T_729_236,T_2187_709,T_2187_710,T_243_79,T_2187_712,T_2187_713,T_729_238,T_2187_715,T_2187_716,T_729_239,T_2187_718,T_2187_719,T_243_80,T_2187_721,T_2187_722,T_729_241,T_2187_724,T_2187_725,T_729_242,T_2187_727,T_2187_728,T_2_0,T_2187_2,T_2187_4,T_729_2,T_2187_8,T_2187_10,T_729_4,T_2187_14,T_2187_16,T_243_2,T_2187_20,T_2187_22,T_729_8,T_2187_26,T_2187_28,T_729_10,T_2187_32,T_2187_34,T_243_4,T_2187_38,T_2187_40,T_729_14,T_2187_44,T_2187_46,T_729_16,T_2187_50,T_2187_52,T_81_2,T_2187_56,T_2187_58,T_729_20,T_2187_62,T_2187_64,T_729_22,T_2187_68,T_2187_70,T_243_8,T_2187_74,T_2187_76,T_729_26,T_2187_80,T_2187_82,T_729_28,T_2187_86,T_2187_88,T_243_10,T_2187_92,T_2187_94,T_729_32,T_2187_
98,T_2187_100,T_729_34,T_2187_104,T_2187_106,T_81_4,T_2187_110,T_2187_112,T_729_38,T_2187_116,T_2187_118,T_729_40,T_2187_122,T_2187_124,T_243_14,T_2187_128,T_2187_130,T_729_44,T_2187_134,T_2187_136,T_729_46,T_2187_140,T_2187_142,T_243_16,T_2187_146,T_2187_148,T_729_50,T_2187_152,T_2187_154,T_729_52,T_2187_158,T_2187_160,T_27_2,T_2187_164,T_2187_166,T_729_56,T_2187_170,T_2187_172,T_729_58,T_2187_176,T_2187_178,T_243_20,T_2187_182,T_2187_184,T_729_62,T_2187_188,T_2187_190,T_729_64,T_2187_194,T_2187_196,T_243_22,T_2187_200,T_2187_202,T_729_68,T_2187_206,T_2187_208,T_729_70,T_2187_212,T_2187_214,T_81_8,T_2187_218,T_2187_220,T_729_74,T_2187_224,T_2187_226,T_729_76,T_2187_230,T_2187_232,T_243_26,T_2187_236,T_2187_238,T_729_80,T_2187_242,T_2187_244,T_729_82,T_2187_248,T_2187_250,T_243_28,T_2187_254,T_2187_256,T_729_86,T_2187_260,T_2187_262,T_729_88,T_2187_266,T_2187_268,T_81_10,T_2187_272,T_2187_274,T_729_92,T_2187_278,T_2187_280,T_729_94,T_2187_284,T_2187_286,T_243_32,T_2187_290,T_2187_292,T_729_98,T_2187_296,T_2187_298,T_729_100,T_2187_302,T_2187_304,T_243_34,T_2187_308,T_2187_310,T_729_104,T_2187_314,T_2187_316,T_729_106,T_2187_320,T_2187_322,T_27_4,T_2187_326,T_2187_328,T_729_110,T_2187_332,T_2187_334,T_729_112,T_2187_338,T_2187_340,T_243_38,T_2187_344,T_2187_346,T_729_116,T_2187_350,T_2187_352,T_729_118,T_2187_356,T_2187_358,T_243_40,T_2187_362,T_2187_364,T_729_122,T_2187_368,T_2187_370,T_729_124,T_2187_374,T_2187_376,T_81_14,T_2187_380,T_2187_382,T_729_128,T_2187_386,T_2187_388,T_729_130,T_2187_392,T_2187_394,T_243_44,T_2187_398,T_2187_400,T_729_134,T_2187_404,T_2187_406,T_729_136,T_2187_410,T_2187_412,T_243_46,T_2187_416,T_2187_418,T_729_140,T_2187_422,T_2187_424,T_729_142,T_2187_428,T_2187_430,T_81_16,T_2187_434,T_2187_436,T_729_146,T_2187_440,T_2187_442,T_729_148,T_2187_446,T_2187_448,T_243_50,T_2187_452,T_2187_454,T_729_152,T_2187_458,T_2187_460,T_729_154,T_2187_464,T_2187_466,T_243_52,T_2187_470,T_2187_472,T_729_158,T_2187_476,T_2187_478,T_729_160,T_2187_482,T_2
187_484,T_9_2,T_2187_488,T_2187_490,T_729_164,T_2187_494,T_2187_496,T_729_166,T_2187_500,T_2187_502,T_243_56,T_2187_506,T_2187_508,T_729_170,T_2187_512,T_2187_514,T_729_172,T_2187_518,T_2187_520,T_243_58,T_2187_524,T_2187_526,T_729_176,T_2187_530,T_2187_532,T_729_178,T_2187_536,T_2187_538,T_81_20,T_2187_542,T_2187_544,T_729_182,T_2187_548,T_2187_550,T_729_184,T_2187_554,T_2187_556,T_243_62,T_2187_560,T_2187_562,T_729_188,T_2187_566,T_2187_568,T_729_190,T_2187_572,T_2187_574,T_243_64,T_2187_578,T_2187_580,T_729_194,T_2187_584,T_2187_586,T_729_196,T_2187_590,T_2187_592,T_81_22,T_2187_596,T_2187_598,T_729_200,T_2187_602,T_2187_604,T_729_202,T_2187_608,T_2187_610,T_243_68,T_2187_614,T_2187_616,T_729_206,T_2187_620,T_2187_622,T_729_208,T_2187_626,T_2187_628,T_243_70,T_2187_632,T_2187_634,T_729_212,T_2187_638,T_2187_640,T_729_214,T_2187_644,T_2187_646,T_27_8,T_2187_650,T_2187_652,T_729_218,T_2187_656,T_2187_658,T_729_220,T_2187_662,T_2187_664,T_243_74,T_2187_668,T_2187_670,T_729_224,T_2187_674,T_2187_676,T_729_226,T_2187_680,T_2187_682,T_243_76,T_2187_686,T_2187_688,T_729_230,T_2187_692,T_2187_694,T_729_232,T_2187_698,T_2187_700,T_81_26,T_2187_704,T_2187_706,T_729_236,T_2187_710,T_2187_712,T_729_238,T_2187_716,T_2187_718,T_243_80,T_2187_722,T_2187_724,T_729_242,T_2187_728,T_2187_730,T_729_244,T_2187_734,T_2187_736,T_243_82,T_2187_740,T_2187_742,T_729_248,T_2187_746,T_2187_748,T_729_250,T_2187_752,T_2187_754,T_81_28,T_2187_758,T_2187_760,T_729_254,T_2187_764,T_2187_766,T_729_256,T_2187_770,T_2187_772,T_243_86,T_2187_776,T_2187_778,T_729_260,T_2187_782,T_2187_784,T_729_262,T_2187_788,T_2187_790,T_243_88,T_2187_794,T_2187_796,T_729_266,T_2187_800,T_2187_802,T_729_268,T_2187_806,T_2187_808,T_27_10,T_2187_812,T_2187_814,T_729_272,T_2187_818,T_2187_820,T_729_274,T_2187_824,T_2187_826,T_243_92,T_2187_830,T_2187_832,T_729_278,T_2187_836,T_2187_838,T_729_280,T_2187_842,T_2187_844,T_243_94,T_2187_848,T_2187_850,T_729_284,T_2187_854,T_2187_856,T_729_286,T_2187_860,T_2187_862,T_81_32
,T_2187_866,T_2187_868,T_729_290,T_2187_872,T_2187_874,T_729_292,T_2187_878,T_2187_880,T_243_98,T_2187_884,T_2187_886,T_729_296,T_2187_890,T_2187_892,T_729_298,T_2187_896,T_2187_898,T_243_100,T_2187_902,T_2187_904,T_729_302,T_2187_908,T_2187_910,T_729_304,T_2187_914,T_2187_916,T_81_34,T_2187_920,T_2187_922,T_729_308,T_2187_926,T_2187_928,T_729_310,T_2187_932,T_2187_934,T_243_104,T_2187_938,T_2187_940,T_729_314,T_2187_944,T_2187_946,T_729_316,T_2187_950,T_2187_952,T_243_106,T_2187_956,T_2187_958,T_729_320,T_2187_962,T_2187_964,T_729_322,T_2187_968,T_2187_970,T_9_4,T_2187_974,T_2187_976,T_729_326,T_2187_980,T_2187_982,T_729_328,T_2187_986,T_2187_988,T_243_110,T_2187_992,T_2187_994,T_729_332,T_2187_998,T_2187_1000,T_729_334,T_2187_1004,T_2187_1006,T_243_112,T_2187_1010,T_2187_1012,T_729_338,T_2187_1016,T_2187_1018,T_729_340,T_2187_1022,T_2187_1024,T_81_38,T_2187_1028,T_2187_1030,T_729_344,T_2187_1034,T_2187_1036,T_729_346,T_2187_1040,T_2187_1042,T_243_116,T_2187_1046,T_2187_1048,T_729_350,T_2187_1052,T_2187_1054,T_729_352,T_2187_1058,T_2187_1060,T_243_118,T_2187_1064,T_2187_1066,T_729_356,T_2187_1070,T_2187_1072,T_729_358,T_2187_1076,T_2187_1078,T_81_40,T_2187_1082,T_2187_1084,T_729_362,T_2187_1088,T_2187_1090,T_729_364,T_2187_1094,T_2187_1096,T_243_122,T_2187_1100,T_2187_1102,T_729_368,T_2187_1106,T_2187_1108,T_729_370,T_2187_1112,T_2187_1114,T_243_124,T_2187_1118,T_2187_1120,T_729_374,T_2187_1124,T_2187_1126,T_729_376,T_2187_1130,T_2187_1132,T_27_14,T_2187_1136,T_2187_1138,T_729_380,T_2187_1142,T_2187_1144,T_729_382,T_2187_1148,T_2187_1150,T_243_128,T_2187_1154,T_2187_1156,T_729_386,T_2187_1160,T_2187_1162,T_729_388,T_2187_1166,T_2187_1168,T_243_130,T_2187_1172,T_2187_1174,T_729_392,T_2187_1178,T_2187_1180,T_729_394,T_2187_1184,T_2187_1186,T_81_44,T_2187_1190,T_2187_1192,T_729_398,T_2187_1196,T_2187_1198,T_729_400,T_2187_1202,T_2187_1204,T_243_134,T_2187_1208,T_2187_1210,T_729_404,T_2187_1214,T_2187_1216,T_729_406,T_2187_1220,T_2187_1222,T_243_136,T_2187_1226,T_2187_
1228,T_729_410,T_2187_1232,T_2187_1234,T_729_412,T_2187_1238,T_2187_1240,T_81_46,T_2187_1244,T_2187_1246,T_729_416,T_2187_1250,T_2187_1252,T_729_418,T_2187_1256,T_2187_1258,T_243_140,T_2187_1262,T_2187_1264,T_729_422,T_2187_1268,T_2187_1270,T_729_424,T_2187_1274,T_2187_1276,T_243_142,T_2187_1280,T_2187_1282,T_729_428,T_2187_1286,T_2187_1288,T_729_430,T_2187_1292,T_2187_1294,T_27_16,T_2187_1298,T_2187_1300,T_729_434,T_2187_1304,T_2187_1306,T_729_436,T_2187_1310,T_2187_1312,T_243_146,T_2187_1316,T_2187_1318,T_729_440,T_2187_1322,T_2187_1324,T_729_442,T_2187_1328,T_2187_1330,T_243_148,T_2187_1334,T_2187_1336,T_729_446,T_2187_1340,T_2187_1342,T_729_448,T_2187_1346,T_2187_1348,T_81_50,T_2187_1352,T_2187_1354,T_729_452,T_2187_1358,T_2187_1360,T_729_454,T_2187_1364,T_2187_1366,T_243_152,T_2187_1370,T_2187_1372,T_729_458,T_2187_1376,T_2187_1378,T_729_460,T_2187_1382,T_2187_1384,T_243_154,T_2187_1388,T_2187_1390,T_729_464,T_2187_1394,T_2187_1396,T_729_466,T_2187_1400,T_2187_1402,T_81_52,T_2187_1406,T_2187_1408,T_729_470,T_2187_1412,T_2187_1414,T_729_472,T_2187_1418,T_2187_1420,T_243_158,T_2187_1424,T_2187_1426,T_729_476,T_2187_1430,T_2187_1432,T_729_478,T_2187_1436,T_2187_1438,T_243_160,T_2187_1442,T_2187_1444,T_729_482,T_2187_1448,T_2187_1450,T_729_484,T_2187_1454,T_2187_1456 +}; +static const __device__ float2 lut_sp_9_2187[243*2] = { + 
T_2_0,T_2187_1,T_2187_2,T_729_1,T_2187_4,T_2187_5,T_729_2,T_2187_7,T_2187_8,T_243_1,T_2187_10,T_2187_11,T_729_4,T_2187_13,T_2187_14,T_729_5,T_2187_16,T_2187_17,T_243_2,T_2187_19,T_2187_20,T_729_7,T_2187_22,T_2187_23,T_729_8,T_2187_25,T_2187_26,T_81_1,T_2187_28,T_2187_29,T_729_10,T_2187_31,T_2187_32,T_729_11,T_2187_34,T_2187_35,T_243_4,T_2187_37,T_2187_38,T_729_13,T_2187_40,T_2187_41,T_729_14,T_2187_43,T_2187_44,T_243_5,T_2187_46,T_2187_47,T_729_16,T_2187_49,T_2187_50,T_729_17,T_2187_52,T_2187_53,T_81_2,T_2187_55,T_2187_56,T_729_19,T_2187_58,T_2187_59,T_729_20,T_2187_61,T_2187_62,T_243_7,T_2187_64,T_2187_65,T_729_22,T_2187_67,T_2187_68,T_729_23,T_2187_70,T_2187_71,T_243_8,T_2187_73,T_2187_74,T_729_25,T_2187_76,T_2187_77,T_729_26,T_2187_79,T_2187_80,T_27_1,T_2187_82,T_2187_83,T_729_28,T_2187_85,T_2187_86,T_729_29,T_2187_88,T_2187_89,T_243_10,T_2187_91,T_2187_92,T_729_31,T_2187_94,T_2187_95,T_729_32,T_2187_97,T_2187_98,T_243_11,T_2187_100,T_2187_101,T_729_34,T_2187_103,T_2187_104,T_729_35,T_2187_106,T_2187_107,T_81_4,T_2187_109,T_2187_110,T_729_37,T_2187_112,T_2187_113,T_729_38,T_2187_115,T_2187_116,T_243_13,T_2187_118,T_2187_119,T_729_40,T_2187_121,T_2187_122,T_729_41,T_2187_124,T_2187_125,T_243_14,T_2187_127,T_2187_128,T_729_43,T_2187_130,T_2187_131,T_729_44,T_2187_133,T_2187_134,T_81_5,T_2187_136,T_2187_137,T_729_46,T_2187_139,T_2187_140,T_729_47,T_2187_142,T_2187_143,T_243_16,T_2187_145,T_2187_146,T_729_49,T_2187_148,T_2187_149,T_729_50,T_2187_151,T_2187_152,T_243_17,T_2187_154,T_2187_155,T_729_52,T_2187_157,T_2187_158,T_729_53,T_2187_160,T_2187_161,T_27_2,T_2187_163,T_2187_164,T_729_55,T_2187_166,T_2187_167,T_729_56,T_2187_169,T_2187_170,T_243_19,T_2187_172,T_2187_173,T_729_58,T_2187_175,T_2187_176,T_729_59,T_2187_178,T_2187_179,T_243_20,T_2187_181,T_2187_182,T_729_61,T_2187_184,T_2187_185,T_729_62,T_2187_187,T_2187_188,T_81_7,T_2187_190,T_2187_191,T_729_64,T_2187_193,T_2187_194,T_729_65,T_2187_196,T_2187_197,T_243_22,T_2187_199,T_2187_200,T_729_67,T_2187_202,T_21
87_203,T_729_68,T_2187_205,T_2187_206,T_243_23,T_2187_208,T_2187_209,T_729_70,T_2187_211,T_2187_212,T_729_71,T_2187_214,T_2187_215,T_81_8,T_2187_217,T_2187_218,T_729_73,T_2187_220,T_2187_221,T_729_74,T_2187_223,T_2187_224,T_243_25,T_2187_226,T_2187_227,T_729_76,T_2187_229,T_2187_230,T_729_77,T_2187_232,T_2187_233,T_243_26,T_2187_235,T_2187_236,T_729_79,T_2187_238,T_2187_239,T_729_80,T_2187_241,T_2187_242,T_2_0,T_2187_5,T_2187_10,T_729_5,T_2187_20,T_2187_25,T_729_10,T_2187_35,T_2187_40,T_243_5,T_2187_50,T_2187_55,T_729_20,T_2187_65,T_2187_70,T_729_25,T_2187_80,T_2187_85,T_243_10,T_2187_95,T_2187_100,T_729_35,T_2187_110,T_2187_115,T_729_40,T_2187_125,T_2187_130,T_81_5,T_2187_140,T_2187_145,T_729_50,T_2187_155,T_2187_160,T_729_55,T_2187_170,T_2187_175,T_243_20,T_2187_185,T_2187_190,T_729_65,T_2187_200,T_2187_205,T_729_70,T_2187_215,T_2187_220,T_243_25,T_2187_230,T_2187_235,T_729_80,T_2187_245,T_2187_250,T_729_85,T_2187_260,T_2187_265,T_81_10,T_2187_275,T_2187_280,T_729_95,T_2187_290,T_2187_295,T_729_100,T_2187_305,T_2187_310,T_243_35,T_2187_320,T_2187_325,T_729_110,T_2187_335,T_2187_340,T_729_115,T_2187_350,T_2187_355,T_243_40,T_2187_365,T_2187_370,T_729_125,T_2187_380,T_2187_385,T_729_130,T_2187_395,T_2187_400,T_27_5,T_2187_410,T_2187_415,T_729_140,T_2187_425,T_2187_430,T_729_145,T_2187_440,T_2187_445,T_243_50,T_2187_455,T_2187_460,T_729_155,T_2187_470,T_2187_475,T_729_160,T_2187_485,T_2187_490,T_243_55,T_2187_500,T_2187_505,T_729_170,T_2187_515,T_2187_520,T_729_175,T_2187_530,T_2187_535,T_81_20,T_2187_545,T_2187_550,T_729_185,T_2187_560,T_2187_565,T_729_190,T_2187_575,T_2187_580,T_243_65,T_2187_590,T_2187_595,T_729_200,T_2187_605,T_2187_610,T_729_205,T_2187_620,T_2187_625,T_243_70,T_2187_635,T_2187_640,T_729_215,T_2187_650,T_2187_655,T_729_220,T_2187_665,T_2187_670,T_81_25,T_2187_680,T_2187_685,T_729_230,T_2187_695,T_2187_700,T_729_235,T_2187_710,T_2187_715,T_243_80,T_2187_725,T_2187_730,T_729_245,T_2187_740,T_2187_745,T_729_250,T_2187_755,T_2187_760,T_243_85,T_2187_
770,T_2187_775,T_729_260,T_2187_785,T_2187_790,T_729_265,T_2187_800,T_2187_805,T_27_10,T_2187_815,T_2187_820,T_729_275,T_2187_830,T_2187_835,T_729_280,T_2187_845,T_2187_850,T_243_95,T_2187_860,T_2187_865,T_729_290,T_2187_875,T_2187_880,T_729_295,T_2187_890,T_2187_895,T_243_100,T_2187_905,T_2187_910,T_729_305,T_2187_920,T_2187_925,T_729_310,T_2187_935,T_2187_940,T_81_35,T_2187_950,T_2187_955,T_729_320,T_2187_965,T_2187_970,T_729_325,T_2187_980,T_2187_985,T_243_110,T_2187_995,T_2187_1000,T_729_335,T_2187_1010,T_2187_1015,T_729_340,T_2187_1025,T_2187_1030,T_243_115,T_2187_1040,T_2187_1045,T_729_350,T_2187_1055,T_2187_1060,T_729_355,T_2187_1070,T_2187_1075,T_81_40,T_2187_1085,T_2187_1090,T_729_365,T_2187_1100,T_2187_1105,T_729_370,T_2187_1115,T_2187_1120,T_243_125,T_2187_1130,T_2187_1135,T_729_380,T_2187_1145,T_2187_1150,T_729_385,T_2187_1160,T_2187_1165,T_243_130,T_2187_1175,T_2187_1180,T_729_395,T_2187_1190,T_2187_1195,T_729_400,T_2187_1205,T_2187_1210 +}; +static const __device__ float2 lut_sp_27_2187[81*2] = { + 
T_2_0,T_2187_1,T_2187_2,T_729_1,T_2187_4,T_2187_5,T_729_2,T_2187_7,T_2187_8,T_243_1,T_2187_10,T_2187_11,T_729_4,T_2187_13,T_2187_14,T_729_5,T_2187_16,T_2187_17,T_243_2,T_2187_19,T_2187_20,T_729_7,T_2187_22,T_2187_23,T_729_8,T_2187_25,T_2187_26,T_81_1,T_2187_28,T_2187_29,T_729_10,T_2187_31,T_2187_32,T_729_11,T_2187_34,T_2187_35,T_243_4,T_2187_37,T_2187_38,T_729_13,T_2187_40,T_2187_41,T_729_14,T_2187_43,T_2187_44,T_243_5,T_2187_46,T_2187_47,T_729_16,T_2187_49,T_2187_50,T_729_17,T_2187_52,T_2187_53,T_81_2,T_2187_55,T_2187_56,T_729_19,T_2187_58,T_2187_59,T_729_20,T_2187_61,T_2187_62,T_243_7,T_2187_64,T_2187_65,T_729_22,T_2187_67,T_2187_68,T_729_23,T_2187_70,T_2187_71,T_243_8,T_2187_73,T_2187_74,T_729_25,T_2187_76,T_2187_77,T_729_26,T_2187_79,T_2187_80,T_2_0,T_2187_14,T_2187_28,T_729_14,T_2187_56,T_2187_70,T_729_28,T_2187_98,T_2187_112,T_243_14,T_2187_140,T_2187_154,T_729_56,T_2187_182,T_2187_196,T_729_70,T_2187_224,T_2187_238,T_243_28,T_2187_266,T_2187_280,T_729_98,T_2187_308,T_2187_322,T_729_112,T_2187_350,T_2187_364,T_81_14,T_2187_392,T_2187_406,T_729_140,T_2187_434,T_2187_448,T_729_154,T_2187_476,T_2187_490,T_243_56,T_2187_518,T_2187_532,T_729_182,T_2187_560,T_2187_574,T_729_196,T_2187_602,T_2187_616,T_243_70,T_2187_644,T_2187_658,T_729_224,T_2187_686,T_2187_700,T_729_238,T_2187_728,T_2187_742,T_81_28,T_2187_770,T_2187_784,T_729_266,T_2187_812,T_2187_826,T_729_280,T_2187_854,T_2187_868,T_243_98,T_2187_896,T_2187_910,T_729_308,T_2187_938,T_2187_952,T_729_322,T_2187_980,T_2187_994,T_243_112,T_2187_1022,T_2187_1036,T_729_350,T_2187_1064,T_2187_1078,T_729_364,T_2187_1106,T_2187_1120 +}; +static const __device__ float2 lut_sp_7_2401[343*2] = { + 
T_2_0,T_2401_1,T_2401_2,T_2401_3,T_2401_4,T_2401_5,T_2401_6,T_343_1,T_2401_8,T_2401_9,T_2401_10,T_2401_11,T_2401_12,T_2401_13,T_343_2,T_2401_15,T_2401_16,T_2401_17,T_2401_18,T_2401_19,T_2401_20,T_343_3,T_2401_22,T_2401_23,T_2401_24,T_2401_25,T_2401_26,T_2401_27,T_343_4,T_2401_29,T_2401_30,T_2401_31,T_2401_32,T_2401_33,T_2401_34,T_343_5,T_2401_36,T_2401_37,T_2401_38,T_2401_39,T_2401_40,T_2401_41,T_343_6,T_2401_43,T_2401_44,T_2401_45,T_2401_46,T_2401_47,T_2401_48,T_49_1,T_2401_50,T_2401_51,T_2401_52,T_2401_53,T_2401_54,T_2401_55,T_343_8,T_2401_57,T_2401_58,T_2401_59,T_2401_60,T_2401_61,T_2401_62,T_343_9,T_2401_64,T_2401_65,T_2401_66,T_2401_67,T_2401_68,T_2401_69,T_343_10,T_2401_71,T_2401_72,T_2401_73,T_2401_74,T_2401_75,T_2401_76,T_343_11,T_2401_78,T_2401_79,T_2401_80,T_2401_81,T_2401_82,T_2401_83,T_343_12,T_2401_85,T_2401_86,T_2401_87,T_2401_88,T_2401_89,T_2401_90,T_343_13,T_2401_92,T_2401_93,T_2401_94,T_2401_95,T_2401_96,T_2401_97,T_49_2,T_2401_99,T_2401_100,T_2401_101,T_2401_102,T_2401_103,T_2401_104,T_343_15,T_2401_106,T_2401_107,T_2401_108,T_2401_109,T_2401_110,T_2401_111,T_343_16,T_2401_113,T_2401_114,T_2401_115,T_2401_116,T_2401_117,T_2401_118,T_343_17,T_2401_120,T_2401_121,T_2401_122,T_2401_123,T_2401_124,T_2401_125,T_343_18,T_2401_127,T_2401_128,T_2401_129,T_2401_130,T_2401_131,T_2401_132,T_343_19,T_2401_134,T_2401_135,T_2401_136,T_2401_137,T_2401_138,T_2401_139,T_343_20,T_2401_141,T_2401_142,T_2401_143,T_2401_144,T_2401_145,T_2401_146,T_49_3,T_2401_148,T_2401_149,T_2401_150,T_2401_151,T_2401_152,T_2401_153,T_343_22,T_2401_155,T_2401_156,T_2401_157,T_2401_158,T_2401_159,T_2401_160,T_343_23,T_2401_162,T_2401_163,T_2401_164,T_2401_165,T_2401_166,T_2401_167,T_343_24,T_2401_169,T_2401_170,T_2401_171,T_2401_172,T_2401_173,T_2401_174,T_343_25,T_2401_176,T_2401_177,T_2401_178,T_2401_179,T_2401_180,T_2401_181,T_343_26,T_2401_183,T_2401_184,T_2401_185,T_2401_186,T_2401_187,T_2401_188,T_343_27,T_2401_190,T_2401_191,T_2401_192,T_2401_193,T_2401_194,T_2401_195,T_49_4,T_2
401_197,T_2401_198,T_2401_199,T_2401_200,T_2401_201,T_2401_202,T_343_29,T_2401_204,T_2401_205,T_2401_206,T_2401_207,T_2401_208,T_2401_209,T_343_30,T_2401_211,T_2401_212,T_2401_213,T_2401_214,T_2401_215,T_2401_216,T_343_31,T_2401_218,T_2401_219,T_2401_220,T_2401_221,T_2401_222,T_2401_223,T_343_32,T_2401_225,T_2401_226,T_2401_227,T_2401_228,T_2401_229,T_2401_230,T_343_33,T_2401_232,T_2401_233,T_2401_234,T_2401_235,T_2401_236,T_2401_237,T_343_34,T_2401_239,T_2401_240,T_2401_241,T_2401_242,T_2401_243,T_2401_244,T_49_5,T_2401_246,T_2401_247,T_2401_248,T_2401_249,T_2401_250,T_2401_251,T_343_36,T_2401_253,T_2401_254,T_2401_255,T_2401_256,T_2401_257,T_2401_258,T_343_37,T_2401_260,T_2401_261,T_2401_262,T_2401_263,T_2401_264,T_2401_265,T_343_38,T_2401_267,T_2401_268,T_2401_269,T_2401_270,T_2401_271,T_2401_272,T_343_39,T_2401_274,T_2401_275,T_2401_276,T_2401_277,T_2401_278,T_2401_279,T_343_40,T_2401_281,T_2401_282,T_2401_283,T_2401_284,T_2401_285,T_2401_286,T_343_41,T_2401_288,T_2401_289,T_2401_290,T_2401_291,T_2401_292,T_2401_293,T_49_6,T_2401_295,T_2401_296,T_2401_297,T_2401_298,T_2401_299,T_2401_300,T_343_43,T_2401_302,T_2401_303,T_2401_304,T_2401_305,T_2401_306,T_2401_307,T_343_44,T_2401_309,T_2401_310,T_2401_311,T_2401_312,T_2401_313,T_2401_314,T_343_45,T_2401_316,T_2401_317,T_2401_318,T_2401_319,T_2401_320,T_2401_321,T_343_46,T_2401_323,T_2401_324,T_2401_325,T_2401_326,T_2401_327,T_2401_328,T_343_47,T_2401_330,T_2401_331,T_2401_332,T_2401_333,T_2401_334,T_2401_335,T_343_48,T_2401_337,T_2401_338,T_2401_339,T_2401_340,T_2401_341,T_2401_342,T_2_0,T_2401_4,T_2401_8,T_2401_12,T_2401_16,T_2401_20,T_2401_24,T_343_4,T_2401_32,T_2401_36,T_2401_40,T_2401_44,T_2401_48,T_2401_52,T_343_8,T_2401_60,T_2401_64,T_2401_68,T_2401_72,T_2401_76,T_2401_80,T_343_12,T_2401_88,T_2401_92,T_2401_96,T_2401_100,T_2401_104,T_2401_108,T_343_16,T_2401_116,T_2401_120,T_2401_124,T_2401_128,T_2401_132,T_2401_136,T_343_20,T_2401_144,T_2401_148,T_2401_152,T_2401_156,T_2401_160,T_2401_164,T_343_24,T_2401_172
,T_2401_176,T_2401_180,T_2401_184,T_2401_188,T_2401_192,T_49_4,T_2401_200,T_2401_204,T_2401_208,T_2401_212,T_2401_216,T_2401_220,T_343_32,T_2401_228,T_2401_232,T_2401_236,T_2401_240,T_2401_244,T_2401_248,T_343_36,T_2401_256,T_2401_260,T_2401_264,T_2401_268,T_2401_272,T_2401_276,T_343_40,T_2401_284,T_2401_288,T_2401_292,T_2401_296,T_2401_300,T_2401_304,T_343_44,T_2401_312,T_2401_316,T_2401_320,T_2401_324,T_2401_328,T_2401_332,T_343_48,T_2401_340,T_2401_344,T_2401_348,T_2401_352,T_2401_356,T_2401_360,T_343_52,T_2401_368,T_2401_372,T_2401_376,T_2401_380,T_2401_384,T_2401_388,T_49_8,T_2401_396,T_2401_400,T_2401_404,T_2401_408,T_2401_412,T_2401_416,T_343_60,T_2401_424,T_2401_428,T_2401_432,T_2401_436,T_2401_440,T_2401_444,T_343_64,T_2401_452,T_2401_456,T_2401_460,T_2401_464,T_2401_468,T_2401_472,T_343_68,T_2401_480,T_2401_484,T_2401_488,T_2401_492,T_2401_496,T_2401_500,T_343_72,T_2401_508,T_2401_512,T_2401_516,T_2401_520,T_2401_524,T_2401_528,T_343_76,T_2401_536,T_2401_540,T_2401_544,T_2401_548,T_2401_552,T_2401_556,T_343_80,T_2401_564,T_2401_568,T_2401_572,T_2401_576,T_2401_580,T_2401_584,T_49_12,T_2401_592,T_2401_596,T_2401_600,T_2401_604,T_2401_608,T_2401_612,T_343_88,T_2401_620,T_2401_624,T_2401_628,T_2401_632,T_2401_636,T_2401_640,T_343_92,T_2401_648,T_2401_652,T_2401_656,T_2401_660,T_2401_664,T_2401_668,T_343_96,T_2401_676,T_2401_680,T_2401_684,T_2401_688,T_2401_692,T_2401_696,T_343_100,T_2401_704,T_2401_708,T_2401_712,T_2401_716,T_2401_720,T_2401_724,T_343_104,T_2401_732,T_2401_736,T_2401_740,T_2401_744,T_2401_748,T_2401_752,T_343_108,T_2401_760,T_2401_764,T_2401_768,T_2401_772,T_2401_776,T_2401_780,T_49_16,T_2401_788,T_2401_792,T_2401_796,T_2401_800,T_2401_804,T_2401_808,T_343_116,T_2401_816,T_2401_820,T_2401_824,T_2401_828,T_2401_832,T_2401_836,T_343_120,T_2401_844,T_2401_848,T_2401_852,T_2401_856,T_2401_860,T_2401_864,T_343_124,T_2401_872,T_2401_876,T_2401_880,T_2401_884,T_2401_888,T_2401_892,T_343_128,T_2401_900,T_2401_904,T_2401_908,T_2401_912,T_2401_916,T_24
01_920,T_343_132,T_2401_928,T_2401_932,T_2401_936,T_2401_940,T_2401_944,T_2401_948,T_343_136,T_2401_956,T_2401_960,T_2401_964,T_2401_968,T_2401_972,T_2401_976,T_49_20,T_2401_984,T_2401_988,T_2401_992,T_2401_996,T_2401_1000,T_2401_1004,T_343_144,T_2401_1012,T_2401_1016,T_2401_1020,T_2401_1024,T_2401_1028,T_2401_1032,T_343_148,T_2401_1040,T_2401_1044,T_2401_1048,T_2401_1052,T_2401_1056,T_2401_1060,T_343_152,T_2401_1068,T_2401_1072,T_2401_1076,T_2401_1080,T_2401_1084,T_2401_1088,T_343_156,T_2401_1096,T_2401_1100,T_2401_1104,T_2401_1108,T_2401_1112,T_2401_1116,T_343_160,T_2401_1124,T_2401_1128,T_2401_1132,T_2401_1136,T_2401_1140,T_2401_1144,T_343_164,T_2401_1152,T_2401_1156,T_2401_1160,T_2401_1164,T_2401_1168,T_2401_1172,T_49_24,T_2401_1180,T_2401_1184,T_2401_1188,T_2401_1192,T_2401_1196,T_2401_1200,T_343_172,T_2401_1208,T_2401_1212,T_2401_1216,T_2401_1220,T_2401_1224,T_2401_1228,T_343_176,T_2401_1236,T_2401_1240,T_2401_1244,T_2401_1248,T_2401_1252,T_2401_1256,T_343_180,T_2401_1264,T_2401_1268,T_2401_1272,T_2401_1276,T_2401_1280,T_2401_1284,T_343_184,T_2401_1292,T_2401_1296,T_2401_1300,T_2401_1304,T_2401_1308,T_2401_1312,T_343_188,T_2401_1320,T_2401_1324,T_2401_1328,T_2401_1332,T_2401_1336,T_2401_1340,T_343_192,T_2401_1348,T_2401_1352,T_2401_1356,T_2401_1360,T_2401_1364,T_2401_1368 +}; +static const __device__ float2 lut_sp_5_3125[625*2] = { + 
T_2_0,T_3125_1,T_3125_2,T_3125_3,T_3125_4,T_625_1,T_3125_6,T_3125_7,T_3125_8,T_3125_9,T_625_2,T_3125_11,T_3125_12,T_3125_13,T_3125_14,T_625_3,T_3125_16,T_3125_17,T_3125_18,T_3125_19,T_625_4,T_3125_21,T_3125_22,T_3125_23,T_3125_24,T_125_1,T_3125_26,T_3125_27,T_3125_28,T_3125_29,T_625_6,T_3125_31,T_3125_32,T_3125_33,T_3125_34,T_625_7,T_3125_36,T_3125_37,T_3125_38,T_3125_39,T_625_8,T_3125_41,T_3125_42,T_3125_43,T_3125_44,T_625_9,T_3125_46,T_3125_47,T_3125_48,T_3125_49,T_125_2,T_3125_51,T_3125_52,T_3125_53,T_3125_54,T_625_11,T_3125_56,T_3125_57,T_3125_58,T_3125_59,T_625_12,T_3125_61,T_3125_62,T_3125_63,T_3125_64,T_625_13,T_3125_66,T_3125_67,T_3125_68,T_3125_69,T_625_14,T_3125_71,T_3125_72,T_3125_73,T_3125_74,T_125_3,T_3125_76,T_3125_77,T_3125_78,T_3125_79,T_625_16,T_3125_81,T_3125_82,T_3125_83,T_3125_84,T_625_17,T_3125_86,T_3125_87,T_3125_88,T_3125_89,T_625_18,T_3125_91,T_3125_92,T_3125_93,T_3125_94,T_625_19,T_3125_96,T_3125_97,T_3125_98,T_3125_99,T_125_4,T_3125_101,T_3125_102,T_3125_103,T_3125_104,T_625_21,T_3125_106,T_3125_107,T_3125_108,T_3125_109,T_625_22,T_3125_111,T_3125_112,T_3125_113,T_3125_114,T_625_23,T_3125_116,T_3125_117,T_3125_118,T_3125_119,T_625_24,T_3125_121,T_3125_122,T_3125_123,T_3125_124,T_25_1,T_3125_126,T_3125_127,T_3125_128,T_3125_129,T_625_26,T_3125_131,T_3125_132,T_3125_133,T_3125_134,T_625_27,T_3125_136,T_3125_137,T_3125_138,T_3125_139,T_625_28,T_3125_141,T_3125_142,T_3125_143,T_3125_144,T_625_29,T_3125_146,T_3125_147,T_3125_148,T_3125_149,T_125_6,T_3125_151,T_3125_152,T_3125_153,T_3125_154,T_625_31,T_3125_156,T_3125_157,T_3125_158,T_3125_159,T_625_32,T_3125_161,T_3125_162,T_3125_163,T_3125_164,T_625_33,T_3125_166,T_3125_167,T_3125_168,T_3125_169,T_625_34,T_3125_171,T_3125_172,T_3125_173,T_3125_174,T_125_7,T_3125_176,T_3125_177,T_3125_178,T_3125_179,T_625_36,T_3125_181,T_3125_182,T_3125_183,T_3125_184,T_625_37,T_3125_186,T_3125_187,T_3125_188,T_3125_189,T_625_38,T_3125_191,T_3125_192,T_3125_193,T_3125_194,T_625_39,T_3125_196,T_3125_197,T_3125_19
8,T_3125_199,T_125_8,T_3125_201,T_3125_202,T_3125_203,T_3125_204,T_625_41,T_3125_206,T_3125_207,T_3125_208,T_3125_209,T_625_42,T_3125_211,T_3125_212,T_3125_213,T_3125_214,T_625_43,T_3125_216,T_3125_217,T_3125_218,T_3125_219,T_625_44,T_3125_221,T_3125_222,T_3125_223,T_3125_224,T_125_9,T_3125_226,T_3125_227,T_3125_228,T_3125_229,T_625_46,T_3125_231,T_3125_232,T_3125_233,T_3125_234,T_625_47,T_3125_236,T_3125_237,T_3125_238,T_3125_239,T_625_48,T_3125_241,T_3125_242,T_3125_243,T_3125_244,T_625_49,T_3125_246,T_3125_247,T_3125_248,T_3125_249,T_25_2,T_3125_251,T_3125_252,T_3125_253,T_3125_254,T_625_51,T_3125_256,T_3125_257,T_3125_258,T_3125_259,T_625_52,T_3125_261,T_3125_262,T_3125_263,T_3125_264,T_625_53,T_3125_266,T_3125_267,T_3125_268,T_3125_269,T_625_54,T_3125_271,T_3125_272,T_3125_273,T_3125_274,T_125_11,T_3125_276,T_3125_277,T_3125_278,T_3125_279,T_625_56,T_3125_281,T_3125_282,T_3125_283,T_3125_284,T_625_57,T_3125_286,T_3125_287,T_3125_288,T_3125_289,T_625_58,T_3125_291,T_3125_292,T_3125_293,T_3125_294,T_625_59,T_3125_296,T_3125_297,T_3125_298,T_3125_299,T_125_12,T_3125_301,T_3125_302,T_3125_303,T_3125_304,T_625_61,T_3125_306,T_3125_307,T_3125_308,T_3125_309,T_625_62,T_3125_311,T_3125_312,T_3125_313,T_3125_314,T_625_63,T_3125_316,T_3125_317,T_3125_318,T_3125_319,T_625_64,T_3125_321,T_3125_322,T_3125_323,T_3125_324,T_125_13,T_3125_326,T_3125_327,T_3125_328,T_3125_329,T_625_66,T_3125_331,T_3125_332,T_3125_333,T_3125_334,T_625_67,T_3125_336,T_3125_337,T_3125_338,T_3125_339,T_625_68,T_3125_341,T_3125_342,T_3125_343,T_3125_344,T_625_69,T_3125_346,T_3125_347,T_3125_348,T_3125_349,T_125_14,T_3125_351,T_3125_352,T_3125_353,T_3125_354,T_625_71,T_3125_356,T_3125_357,T_3125_358,T_3125_359,T_625_72,T_3125_361,T_3125_362,T_3125_363,T_3125_364,T_625_73,T_3125_366,T_3125_367,T_3125_368,T_3125_369,T_625_74,T_3125_371,T_3125_372,T_3125_373,T_3125_374,T_25_3,T_3125_376,T_3125_377,T_3125_378,T_3125_379,T_625_76,T_3125_381,T_3125_382,T_3125_383,T_3125_384,T_625_77,T_3125_386,T_3125_387,T
_3125_388,T_3125_389,T_625_78,T_3125_391,T_3125_392,T_3125_393,T_3125_394,T_625_79,T_3125_396,T_3125_397,T_3125_398,T_3125_399,T_125_16,T_3125_401,T_3125_402,T_3125_403,T_3125_404,T_625_81,T_3125_406,T_3125_407,T_3125_408,T_3125_409,T_625_82,T_3125_411,T_3125_412,T_3125_413,T_3125_414,T_625_83,T_3125_416,T_3125_417,T_3125_418,T_3125_419,T_625_84,T_3125_421,T_3125_422,T_3125_423,T_3125_424,T_125_17,T_3125_426,T_3125_427,T_3125_428,T_3125_429,T_625_86,T_3125_431,T_3125_432,T_3125_433,T_3125_434,T_625_87,T_3125_436,T_3125_437,T_3125_438,T_3125_439,T_625_88,T_3125_441,T_3125_442,T_3125_443,T_3125_444,T_625_89,T_3125_446,T_3125_447,T_3125_448,T_3125_449,T_125_18,T_3125_451,T_3125_452,T_3125_453,T_3125_454,T_625_91,T_3125_456,T_3125_457,T_3125_458,T_3125_459,T_625_92,T_3125_461,T_3125_462,T_3125_463,T_3125_464,T_625_93,T_3125_466,T_3125_467,T_3125_468,T_3125_469,T_625_94,T_3125_471,T_3125_472,T_3125_473,T_3125_474,T_125_19,T_3125_476,T_3125_477,T_3125_478,T_3125_479,T_625_96,T_3125_481,T_3125_482,T_3125_483,T_3125_484,T_625_97,T_3125_486,T_3125_487,T_3125_488,T_3125_489,T_625_98,T_3125_491,T_3125_492,T_3125_493,T_3125_494,T_625_99,T_3125_496,T_3125_497,T_3125_498,T_3125_499,T_25_4,T_3125_501,T_3125_502,T_3125_503,T_3125_504,T_625_101,T_3125_506,T_3125_507,T_3125_508,T_3125_509,T_625_102,T_3125_511,T_3125_512,T_3125_513,T_3125_514,T_625_103,T_3125_516,T_3125_517,T_3125_518,T_3125_519,T_625_104,T_3125_521,T_3125_522,T_3125_523,T_3125_524,T_125_21,T_3125_526,T_3125_527,T_3125_528,T_3125_529,T_625_106,T_3125_531,T_3125_532,T_3125_533,T_3125_534,T_625_107,T_3125_536,T_3125_537,T_3125_538,T_3125_539,T_625_108,T_3125_541,T_3125_542,T_3125_543,T_3125_544,T_625_109,T_3125_546,T_3125_547,T_3125_548,T_3125_549,T_125_22,T_3125_551,T_3125_552,T_3125_553,T_3125_554,T_625_111,T_3125_556,T_3125_557,T_3125_558,T_3125_559,T_625_112,T_3125_561,T_3125_562,T_3125_563,T_3125_564,T_625_113,T_3125_566,T_3125_567,T_3125_568,T_3125_569,T_625_114,T_3125_571,T_3125_572,T_3125_573,T_3125_574,T_125_23
,T_3125_576,T_3125_577,T_3125_578,T_3125_579,T_625_116,T_3125_581,T_3125_582,T_3125_583,T_3125_584,T_625_117,T_3125_586,T_3125_587,T_3125_588,T_3125_589,T_625_118,T_3125_591,T_3125_592,T_3125_593,T_3125_594,T_625_119,T_3125_596,T_3125_597,T_3125_598,T_3125_599,T_125_24,T_3125_601,T_3125_602,T_3125_603,T_3125_604,T_625_121,T_3125_606,T_3125_607,T_3125_608,T_3125_609,T_625_122,T_3125_611,T_3125_612,T_3125_613,T_3125_614,T_625_123,T_3125_616,T_3125_617,T_3125_618,T_3125_619,T_625_124,T_3125_621,T_3125_622,T_3125_623,T_3125_624,T_2_0,T_3125_3,T_3125_6,T_3125_9,T_3125_12,T_625_3,T_3125_18,T_3125_21,T_3125_24,T_3125_27,T_625_6,T_3125_33,T_3125_36,T_3125_39,T_3125_42,T_625_9,T_3125_48,T_3125_51,T_3125_54,T_3125_57,T_625_12,T_3125_63,T_3125_66,T_3125_69,T_3125_72,T_125_3,T_3125_78,T_3125_81,T_3125_84,T_3125_87,T_625_18,T_3125_93,T_3125_96,T_3125_99,T_3125_102,T_625_21,T_3125_108,T_3125_111,T_3125_114,T_3125_117,T_625_24,T_3125_123,T_3125_126,T_3125_129,T_3125_132,T_625_27,T_3125_138,T_3125_141,T_3125_144,T_3125_147,T_125_6,T_3125_153,T_3125_156,T_3125_159,T_3125_162,T_625_33,T_3125_168,T_3125_171,T_3125_174,T_3125_177,T_625_36,T_3125_183,T_3125_186,T_3125_189,T_3125_192,T_625_39,T_3125_198,T_3125_201,T_3125_204,T_3125_207,T_625_42,T_3125_213,T_3125_216,T_3125_219,T_3125_222,T_125_9,T_3125_228,T_3125_231,T_3125_234,T_3125_237,T_625_48,T_3125_243,T_3125_246,T_3125_249,T_3125_252,T_625_51,T_3125_258,T_3125_261,T_3125_264,T_3125_267,T_625_54,T_3125_273,T_3125_276,T_3125_279,T_3125_282,T_625_57,T_3125_288,T_3125_291,T_3125_294,T_3125_297,T_125_12,T_3125_303,T_3125_306,T_3125_309,T_3125_312,T_625_63,T_3125_318,T_3125_321,T_3125_324,T_3125_327,T_625_66,T_3125_333,T_3125_336,T_3125_339,T_3125_342,T_625_69,T_3125_348,T_3125_351,T_3125_354,T_3125_357,T_625_72,T_3125_363,T_3125_366,T_3125_369,T_3125_372,T_25_3,T_3125_378,T_3125_381,T_3125_384,T_3125_387,T_625_78,T_3125_393,T_3125_396,T_3125_399,T_3125_402,T_625_81,T_3125_408,T_3125_411,T_3125_414,T_3125_417,T_625_84,T_3125_423,T_3125_
426,T_3125_429,T_3125_432,T_625_87,T_3125_438,T_3125_441,T_3125_444,T_3125_447,T_125_18,T_3125_453,T_3125_456,T_3125_459,T_3125_462,T_625_93,T_3125_468,T_3125_471,T_3125_474,T_3125_477,T_625_96,T_3125_483,T_3125_486,T_3125_489,T_3125_492,T_625_99,T_3125_498,T_3125_501,T_3125_504,T_3125_507,T_625_102,T_3125_513,T_3125_516,T_3125_519,T_3125_522,T_125_21,T_3125_528,T_3125_531,T_3125_534,T_3125_537,T_625_108,T_3125_543,T_3125_546,T_3125_549,T_3125_552,T_625_111,T_3125_558,T_3125_561,T_3125_564,T_3125_567,T_625_114,T_3125_573,T_3125_576,T_3125_579,T_3125_582,T_625_117,T_3125_588,T_3125_591,T_3125_594,T_3125_597,T_125_24,T_3125_603,T_3125_606,T_3125_609,T_3125_612,T_625_123,T_3125_618,T_3125_621,T_3125_624,T_3125_627,T_625_126,T_3125_633,T_3125_636,T_3125_639,T_3125_642,T_625_129,T_3125_648,T_3125_651,T_3125_654,T_3125_657,T_625_132,T_3125_663,T_3125_666,T_3125_669,T_3125_672,T_125_27,T_3125_678,T_3125_681,T_3125_684,T_3125_687,T_625_138,T_3125_693,T_3125_696,T_3125_699,T_3125_702,T_625_141,T_3125_708,T_3125_711,T_3125_714,T_3125_717,T_625_144,T_3125_723,T_3125_726,T_3125_729,T_3125_732,T_625_147,T_3125_738,T_3125_741,T_3125_744,T_3125_747,T_25_6,T_3125_753,T_3125_756,T_3125_759,T_3125_762,T_625_153,T_3125_768,T_3125_771,T_3125_774,T_3125_777,T_625_156,T_3125_783,T_3125_786,T_3125_789,T_3125_792,T_625_159,T_3125_798,T_3125_801,T_3125_804,T_3125_807,T_625_162,T_3125_813,T_3125_816,T_3125_819,T_3125_822,T_125_33,T_3125_828,T_3125_831,T_3125_834,T_3125_837,T_625_168,T_3125_843,T_3125_846,T_3125_849,T_3125_852,T_625_171,T_3125_858,T_3125_861,T_3125_864,T_3125_867,T_625_174,T_3125_873,T_3125_876,T_3125_879,T_3125_882,T_625_177,T_3125_888,T_3125_891,T_3125_894,T_3125_897,T_125_36,T_3125_903,T_3125_906,T_3125_909,T_3125_912,T_625_183,T_3125_918,T_3125_921,T_3125_924,T_3125_927,T_625_186,T_3125_933,T_3125_936,T_3125_939,T_3125_942,T_625_189,T_3125_948,T_3125_951,T_3125_954,T_3125_957,T_625_192,T_3125_963,T_3125_966,T_3125_969,T_3125_972,T_125_39,T_3125_978,T_3125_981,T_3125_984,T
_3125_987,T_625_198,T_3125_993,T_3125_996,T_3125_999,T_3125_1002,T_625_201,T_3125_1008,T_3125_1011,T_3125_1014,T_3125_1017,T_625_204,T_3125_1023,T_3125_1026,T_3125_1029,T_3125_1032,T_625_207,T_3125_1038,T_3125_1041,T_3125_1044,T_3125_1047,T_125_42,T_3125_1053,T_3125_1056,T_3125_1059,T_3125_1062,T_625_213,T_3125_1068,T_3125_1071,T_3125_1074,T_3125_1077,T_625_216,T_3125_1083,T_3125_1086,T_3125_1089,T_3125_1092,T_625_219,T_3125_1098,T_3125_1101,T_3125_1104,T_3125_1107,T_625_222,T_3125_1113,T_3125_1116,T_3125_1119,T_3125_1122,T_25_9,T_3125_1128,T_3125_1131,T_3125_1134,T_3125_1137,T_625_228,T_3125_1143,T_3125_1146,T_3125_1149,T_3125_1152,T_625_231,T_3125_1158,T_3125_1161,T_3125_1164,T_3125_1167,T_625_234,T_3125_1173,T_3125_1176,T_3125_1179,T_3125_1182,T_625_237,T_3125_1188,T_3125_1191,T_3125_1194,T_3125_1197,T_125_48,T_3125_1203,T_3125_1206,T_3125_1209,T_3125_1212,T_625_243,T_3125_1218,T_3125_1221,T_3125_1224,T_3125_1227,T_625_246,T_3125_1233,T_3125_1236,T_3125_1239,T_3125_1242,T_625_249,T_3125_1248,T_3125_1251,T_3125_1254,T_3125_1257,T_625_252,T_3125_1263,T_3125_1266,T_3125_1269,T_3125_1272,T_125_51,T_3125_1278,T_3125_1281,T_3125_1284,T_3125_1287,T_625_258,T_3125_1293,T_3125_1296,T_3125_1299,T_3125_1302,T_625_261,T_3125_1308,T_3125_1311,T_3125_1314,T_3125_1317,T_625_264,T_3125_1323,T_3125_1326,T_3125_1329,T_3125_1332,T_625_267,T_3125_1338,T_3125_1341,T_3125_1344,T_3125_1347,T_125_54,T_3125_1353,T_3125_1356,T_3125_1359,T_3125_1362,T_625_273,T_3125_1368,T_3125_1371,T_3125_1374,T_3125_1377,T_625_276,T_3125_1383,T_3125_1386,T_3125_1389,T_3125_1392,T_625_279,T_3125_1398,T_3125_1401,T_3125_1404,T_3125_1407,T_625_282,T_3125_1413,T_3125_1416,T_3125_1419,T_3125_1422,T_125_57,T_3125_1428,T_3125_1431,T_3125_1434,T_3125_1437,T_625_288,T_3125_1443,T_3125_1446,T_3125_1449,T_3125_1452,T_625_291,T_3125_1458,T_3125_1461,T_3125_1464,T_3125_1467,T_625_294,T_3125_1473,T_3125_1476,T_3125_1479,T_3125_1482,T_625_297,T_3125_1488,T_3125_1491,T_3125_1494,T_3125_1497,T_25_12,T_3125_1503,T_3125_15
06,T_3125_1509,T_3125_1512,T_625_303,T_3125_1518,T_3125_1521,T_3125_1524,T_3125_1527,T_625_306,T_3125_1533,T_3125_1536,T_3125_1539,T_3125_1542,T_625_309,T_3125_1548,T_3125_1551,T_3125_1554,T_3125_1557,T_625_312,T_3125_1563,T_3125_1566,T_3125_1569,T_3125_1572,T_125_63,T_3125_1578,T_3125_1581,T_3125_1584,T_3125_1587,T_625_318,T_3125_1593,T_3125_1596,T_3125_1599,T_3125_1602,T_625_321,T_3125_1608,T_3125_1611,T_3125_1614,T_3125_1617,T_625_324,T_3125_1623,T_3125_1626,T_3125_1629,T_3125_1632,T_625_327,T_3125_1638,T_3125_1641,T_3125_1644,T_3125_1647,T_125_66,T_3125_1653,T_3125_1656,T_3125_1659,T_3125_1662,T_625_333,T_3125_1668,T_3125_1671,T_3125_1674,T_3125_1677,T_625_336,T_3125_1683,T_3125_1686,T_3125_1689,T_3125_1692,T_625_339,T_3125_1698,T_3125_1701,T_3125_1704,T_3125_1707,T_625_342,T_3125_1713,T_3125_1716,T_3125_1719,T_3125_1722,T_125_69,T_3125_1728,T_3125_1731,T_3125_1734,T_3125_1737,T_625_348,T_3125_1743,T_3125_1746,T_3125_1749,T_3125_1752,T_625_351,T_3125_1758,T_3125_1761,T_3125_1764,T_3125_1767,T_625_354,T_3125_1773,T_3125_1776,T_3125_1779,T_3125_1782,T_625_357,T_3125_1788,T_3125_1791,T_3125_1794,T_3125_1797,T_125_72,T_3125_1803,T_3125_1806,T_3125_1809,T_3125_1812,T_625_363,T_3125_1818,T_3125_1821,T_3125_1824,T_3125_1827,T_625_366,T_3125_1833,T_3125_1836,T_3125_1839,T_3125_1842,T_625_369,T_3125_1848,T_3125_1851,T_3125_1854,T_3125_1857,T_625_372,T_3125_1863,T_3125_1866,T_3125_1869,T_3125_1872 +}; +static const __device__ float2 lut_sp_25_3125[125*2] = { + 
T_2_0,T_3125_1,T_3125_2,T_3125_3,T_3125_4,T_625_1,T_3125_6,T_3125_7,T_3125_8,T_3125_9,T_625_2,T_3125_11,T_3125_12,T_3125_13,T_3125_14,T_625_3,T_3125_16,T_3125_17,T_3125_18,T_3125_19,T_625_4,T_3125_21,T_3125_22,T_3125_23,T_3125_24,T_125_1,T_3125_26,T_3125_27,T_3125_28,T_3125_29,T_625_6,T_3125_31,T_3125_32,T_3125_33,T_3125_34,T_625_7,T_3125_36,T_3125_37,T_3125_38,T_3125_39,T_625_8,T_3125_41,T_3125_42,T_3125_43,T_3125_44,T_625_9,T_3125_46,T_3125_47,T_3125_48,T_3125_49,T_125_2,T_3125_51,T_3125_52,T_3125_53,T_3125_54,T_625_11,T_3125_56,T_3125_57,T_3125_58,T_3125_59,T_625_12,T_3125_61,T_3125_62,T_3125_63,T_3125_64,T_625_13,T_3125_66,T_3125_67,T_3125_68,T_3125_69,T_625_14,T_3125_71,T_3125_72,T_3125_73,T_3125_74,T_125_3,T_3125_76,T_3125_77,T_3125_78,T_3125_79,T_625_16,T_3125_81,T_3125_82,T_3125_83,T_3125_84,T_625_17,T_3125_86,T_3125_87,T_3125_88,T_3125_89,T_625_18,T_3125_91,T_3125_92,T_3125_93,T_3125_94,T_625_19,T_3125_96,T_3125_97,T_3125_98,T_3125_99,T_125_4,T_3125_101,T_3125_102,T_3125_103,T_3125_104,T_625_21,T_3125_106,T_3125_107,T_3125_108,T_3125_109,T_625_22,T_3125_111,T_3125_112,T_3125_113,T_3125_114,T_625_23,T_3125_116,T_3125_117,T_3125_118,T_3125_119,T_625_24,T_3125_121,T_3125_122,T_3125_123,T_3125_124,T_2_0,T_3125_13,T_3125_26,T_3125_39,T_3125_52,T_625_13,T_3125_78,T_3125_91,T_3125_104,T_3125_117,T_625_26,T_3125_143,T_3125_156,T_3125_169,T_3125_182,T_625_39,T_3125_208,T_3125_221,T_3125_234,T_3125_247,T_625_52,T_3125_273,T_3125_286,T_3125_299,T_3125_312,T_125_13,T_3125_338,T_3125_351,T_3125_364,T_3125_377,T_625_78,T_3125_403,T_3125_416,T_3125_429,T_3125_442,T_625_91,T_3125_468,T_3125_481,T_3125_494,T_3125_507,T_625_104,T_3125_533,T_3125_546,T_3125_559,T_3125_572,T_625_117,T_3125_598,T_3125_611,T_3125_624,T_3125_637,T_125_26,T_3125_663,T_3125_676,T_3125_689,T_3125_702,T_625_143,T_3125_728,T_3125_741,T_3125_754,T_3125_767,T_625_156,T_3125_793,T_3125_806,T_3125_819,T_3125_832,T_625_169,T_3125_858,T_3125_871,T_3125_884,T_3125_897,T_625_182,T_3125_923,T_3125_936,T_3125_9
49,T_3125_962,T_125_39,T_3125_988,T_3125_1001,T_3125_1014,T_3125_1027,T_625_208,T_3125_1053,T_3125_1066,T_3125_1079,T_3125_1092,T_625_221,T_3125_1118,T_3125_1131,T_3125_1144,T_3125_1157,T_625_234,T_3125_1183,T_3125_1196,T_3125_1209,T_3125_1222,T_625_247,T_3125_1248,T_3125_1261,T_3125_1274,T_3125_1287,T_125_52,T_3125_1313,T_3125_1326,T_3125_1339,T_3125_1352,T_625_273,T_3125_1378,T_3125_1391,T_3125_1404,T_3125_1417,T_625_286,T_3125_1443,T_3125_1456,T_3125_1469,T_3125_1482,T_625_299,T_3125_1508,T_3125_1521,T_3125_1534,T_3125_1547,T_625_312,T_3125_1573,T_3125_1586,T_3125_1599,T_3125_1612 +}; +static const __device__ float2 lut_sp_4_4096[1024*2] = { + T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_1
37,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_16_1,T_4096_257,T_2048_129,T_4096_259,T_1024_65,T_4096_261,T_2048_131,T_4096_263,T_512_33,T_4096_265,T_2048_133,T_4096_267,T_1024_67,T_4096_269,T_2048_135,T_4096_271,T_256_17,T_4096_273,T_2048_137,T_4096_275,T_1024_69,T_4096_277,T_2048_139,T_4096_279,T_512_35,T_4096_281,T_2048_141,T_4096_283,T_1024_71,T_4096_285,T_2048_143,T_4096_287,T_128_9,T_4096_289,T_2048_145,T_4096_291,T_1024_73,T_4096_293,T_2048_147,T_4096_295,T_512_37,T_4096_297,T_2048_149,T_4096_299,T_1024_75,T_4096_301,T_2048_151,T_4096_303,T_256_19,T_4096_305,T_2048_153,T_4096_307,T_1024_77,T_4096_309,T_2048_155,T_4096_311,T_512_39,T_4096_313,T_2048_157,T_4096_315,T_1024_79,T_4096_317,T_2048_159,T_4096_319,T_64_5,T_4096_321,T_2048_161,T_4096_323,T_1024_81,T_4096_325,T_2048_163,T_4096_327,T_5
12_41,T_4096_329,T_2048_165,T_4096_331,T_1024_83,T_4096_333,T_2048_167,T_4096_335,T_256_21,T_4096_337,T_2048_169,T_4096_339,T_1024_85,T_4096_341,T_2048_171,T_4096_343,T_512_43,T_4096_345,T_2048_173,T_4096_347,T_1024_87,T_4096_349,T_2048_175,T_4096_351,T_128_11,T_4096_353,T_2048_177,T_4096_355,T_1024_89,T_4096_357,T_2048_179,T_4096_359,T_512_45,T_4096_361,T_2048_181,T_4096_363,T_1024_91,T_4096_365,T_2048_183,T_4096_367,T_256_23,T_4096_369,T_2048_185,T_4096_371,T_1024_93,T_4096_373,T_2048_187,T_4096_375,T_512_47,T_4096_377,T_2048_189,T_4096_379,T_1024_95,T_4096_381,T_2048_191,T_4096_383,T_32_3,T_4096_385,T_2048_193,T_4096_387,T_1024_97,T_4096_389,T_2048_195,T_4096_391,T_512_49,T_4096_393,T_2048_197,T_4096_395,T_1024_99,T_4096_397,T_2048_199,T_4096_399,T_256_25,T_4096_401,T_2048_201,T_4096_403,T_1024_101,T_4096_405,T_2048_203,T_4096_407,T_512_51,T_4096_409,T_2048_205,T_4096_411,T_1024_103,T_4096_413,T_2048_207,T_4096_415,T_128_13,T_4096_417,T_2048_209,T_4096_419,T_1024_105,T_4096_421,T_2048_211,T_4096_423,T_512_53,T_4096_425,T_2048_213,T_4096_427,T_1024_107,T_4096_429,T_2048_215,T_4096_431,T_256_27,T_4096_433,T_2048_217,T_4096_435,T_1024_109,T_4096_437,T_2048_219,T_4096_439,T_512_55,T_4096_441,T_2048_221,T_4096_443,T_1024_111,T_4096_445,T_2048_223,T_4096_447,T_64_7,T_4096_449,T_2048_225,T_4096_451,T_1024_113,T_4096_453,T_2048_227,T_4096_455,T_512_57,T_4096_457,T_2048_229,T_4096_459,T_1024_115,T_4096_461,T_2048_231,T_4096_463,T_256_29,T_4096_465,T_2048_233,T_4096_467,T_1024_117,T_4096_469,T_2048_235,T_4096_471,T_512_59,T_4096_473,T_2048_237,T_4096_475,T_1024_119,T_4096_477,T_2048_239,T_4096_479,T_128_15,T_4096_481,T_2048_241,T_4096_483,T_1024_121,T_4096_485,T_2048_243,T_4096_487,T_512_61,T_4096_489,T_2048_245,T_4096_491,T_1024_123,T_4096_493,T_2048_247,T_4096_495,T_256_31,T_4096_497,T_2048_249,T_4096_499,T_1024_125,T_4096_501,T_2048_251,T_4096_503,T_512_63,T_4096_505,T_2048_253,T_4096_507,T_1024_127,T_4096_509,T_2048_255,T_4096_511,T_8_1,T_4096_513,T_2048_257,T_4096_515
,T_1024_129,T_4096_517,T_2048_259,T_4096_519,T_512_65,T_4096_521,T_2048_261,T_4096_523,T_1024_131,T_4096_525,T_2048_263,T_4096_527,T_256_33,T_4096_529,T_2048_265,T_4096_531,T_1024_133,T_4096_533,T_2048_267,T_4096_535,T_512_67,T_4096_537,T_2048_269,T_4096_539,T_1024_135,T_4096_541,T_2048_271,T_4096_543,T_128_17,T_4096_545,T_2048_273,T_4096_547,T_1024_137,T_4096_549,T_2048_275,T_4096_551,T_512_69,T_4096_553,T_2048_277,T_4096_555,T_1024_139,T_4096_557,T_2048_279,T_4096_559,T_256_35,T_4096_561,T_2048_281,T_4096_563,T_1024_141,T_4096_565,T_2048_283,T_4096_567,T_512_71,T_4096_569,T_2048_285,T_4096_571,T_1024_143,T_4096_573,T_2048_287,T_4096_575,T_64_9,T_4096_577,T_2048_289,T_4096_579,T_1024_145,T_4096_581,T_2048_291,T_4096_583,T_512_73,T_4096_585,T_2048_293,T_4096_587,T_1024_147,T_4096_589,T_2048_295,T_4096_591,T_256_37,T_4096_593,T_2048_297,T_4096_595,T_1024_149,T_4096_597,T_2048_299,T_4096_599,T_512_75,T_4096_601,T_2048_301,T_4096_603,T_1024_151,T_4096_605,T_2048_303,T_4096_607,T_128_19,T_4096_609,T_2048_305,T_4096_611,T_1024_153,T_4096_613,T_2048_307,T_4096_615,T_512_77,T_4096_617,T_2048_309,T_4096_619,T_1024_155,T_4096_621,T_2048_311,T_4096_623,T_256_39,T_4096_625,T_2048_313,T_4096_627,T_1024_157,T_4096_629,T_2048_315,T_4096_631,T_512_79,T_4096_633,T_2048_317,T_4096_635,T_1024_159,T_4096_637,T_2048_319,T_4096_639,T_32_5,T_4096_641,T_2048_321,T_4096_643,T_1024_161,T_4096_645,T_2048_323,T_4096_647,T_512_81,T_4096_649,T_2048_325,T_4096_651,T_1024_163,T_4096_653,T_2048_327,T_4096_655,T_256_41,T_4096_657,T_2048_329,T_4096_659,T_1024_165,T_4096_661,T_2048_331,T_4096_663,T_512_83,T_4096_665,T_2048_333,T_4096_667,T_1024_167,T_4096_669,T_2048_335,T_4096_671,T_128_21,T_4096_673,T_2048_337,T_4096_675,T_1024_169,T_4096_677,T_2048_339,T_4096_679,T_512_85,T_4096_681,T_2048_341,T_4096_683,T_1024_171,T_4096_685,T_2048_343,T_4096_687,T_256_43,T_4096_689,T_2048_345,T_4096_691,T_1024_173,T_4096_693,T_2048_347,T_4096_695,T_512_87,T_4096_697,T_2048_349,T_4096_699,T_1024_175,T_4096_701,T_2
048_351,T_4096_703,T_64_11,T_4096_705,T_2048_353,T_4096_707,T_1024_177,T_4096_709,T_2048_355,T_4096_711,T_512_89,T_4096_713,T_2048_357,T_4096_715,T_1024_179,T_4096_717,T_2048_359,T_4096_719,T_256_45,T_4096_721,T_2048_361,T_4096_723,T_1024_181,T_4096_725,T_2048_363,T_4096_727,T_512_91,T_4096_729,T_2048_365,T_4096_731,T_1024_183,T_4096_733,T_2048_367,T_4096_735,T_128_23,T_4096_737,T_2048_369,T_4096_739,T_1024_185,T_4096_741,T_2048_371,T_4096_743,T_512_93,T_4096_745,T_2048_373,T_4096_747,T_1024_187,T_4096_749,T_2048_375,T_4096_751,T_256_47,T_4096_753,T_2048_377,T_4096_755,T_1024_189,T_4096_757,T_2048_379,T_4096_759,T_512_95,T_4096_761,T_2048_381,T_4096_763,T_1024_191,T_4096_765,T_2048_383,T_4096_767,T_16_3,T_4096_769,T_2048_385,T_4096_771,T_1024_193,T_4096_773,T_2048_387,T_4096_775,T_512_97,T_4096_777,T_2048_389,T_4096_779,T_1024_195,T_4096_781,T_2048_391,T_4096_783,T_256_49,T_4096_785,T_2048_393,T_4096_787,T_1024_197,T_4096_789,T_2048_395,T_4096_791,T_512_99,T_4096_793,T_2048_397,T_4096_795,T_1024_199,T_4096_797,T_2048_399,T_4096_799,T_128_25,T_4096_801,T_2048_401,T_4096_803,T_1024_201,T_4096_805,T_2048_403,T_4096_807,T_512_101,T_4096_809,T_2048_405,T_4096_811,T_1024_203,T_4096_813,T_2048_407,T_4096_815,T_256_51,T_4096_817,T_2048_409,T_4096_819,T_1024_205,T_4096_821,T_2048_411,T_4096_823,T_512_103,T_4096_825,T_2048_413,T_4096_827,T_1024_207,T_4096_829,T_2048_415,T_4096_831,T_64_13,T_4096_833,T_2048_417,T_4096_835,T_1024_209,T_4096_837,T_2048_419,T_4096_839,T_512_105,T_4096_841,T_2048_421,T_4096_843,T_1024_211,T_4096_845,T_2048_423,T_4096_847,T_256_53,T_4096_849,T_2048_425,T_4096_851,T_1024_213,T_4096_853,T_2048_427,T_4096_855,T_512_107,T_4096_857,T_2048_429,T_4096_859,T_1024_215,T_4096_861,T_2048_431,T_4096_863,T_128_27,T_4096_865,T_2048_433,T_4096_867,T_1024_217,T_4096_869,T_2048_435,T_4096_871,T_512_109,T_4096_873,T_2048_437,T_4096_875,T_1024_219,T_4096_877,T_2048_439,T_4096_879,T_256_55,T_4096_881,T_2048_441,T_4096_883,T_1024_221,T_4096_885,T_2048_443,T_4096_887,T_
512_111,T_4096_889,T_2048_445,T_4096_891,T_1024_223,T_4096_893,T_2048_447,T_4096_895,T_32_7,T_4096_897,T_2048_449,T_4096_899,T_1024_225,T_4096_901,T_2048_451,T_4096_903,T_512_113,T_4096_905,T_2048_453,T_4096_907,T_1024_227,T_4096_909,T_2048_455,T_4096_911,T_256_57,T_4096_913,T_2048_457,T_4096_915,T_1024_229,T_4096_917,T_2048_459,T_4096_919,T_512_115,T_4096_921,T_2048_461,T_4096_923,T_1024_231,T_4096_925,T_2048_463,T_4096_927,T_128_29,T_4096_929,T_2048_465,T_4096_931,T_1024_233,T_4096_933,T_2048_467,T_4096_935,T_512_117,T_4096_937,T_2048_469,T_4096_939,T_1024_235,T_4096_941,T_2048_471,T_4096_943,T_256_59,T_4096_945,T_2048_473,T_4096_947,T_1024_237,T_4096_949,T_2048_475,T_4096_951,T_512_119,T_4096_953,T_2048_477,T_4096_955,T_1024_239,T_4096_957,T_2048_479,T_4096_959,T_64_15,T_4096_961,T_2048_481,T_4096_963,T_1024_241,T_4096_965,T_2048_483,T_4096_967,T_512_121,T_4096_969,T_2048_485,T_4096_971,T_1024_243,T_4096_973,T_2048_487,T_4096_975,T_256_61,T_4096_977,T_2048_489,T_4096_979,T_1024_245,T_4096_981,T_2048_491,T_4096_983,T_512_123,T_4096_985,T_2048_493,T_4096_987,T_1024_247,T_4096_989,T_2048_495,T_4096_991,T_128_31,T_4096_993,T_2048_497,T_4096_995,T_1024_249,T_4096_997,T_2048_499,T_4096_999,T_512_125,T_4096_1001,T_2048_501,T_4096_1003,T_1024_251,T_4096_1005,T_2048_503,T_4096_1007,T_256_63,T_4096_1009,T_2048_505,T_4096_1011,T_1024_253,T_4096_1013,T_2048_507,T_4096_1015,T_512_127,T_4096_1017,T_2048_509,T_4096_1019,T_1024_255,T_4096_1021,T_2048_511,T_4096_1023,T_2_0,T_4096_3,T_2048_3,T_4096_9,T_1024_3,T_4096_15,T_2048_9,T_4096_21,T_512_3,T_4096_27,T_2048_15,T_4096_33,T_1024_9,T_4096_39,T_2048_21,T_4096_45,T_256_3,T_4096_51,T_2048_27,T_4096_57,T_1024_15,T_4096_63,T_2048_33,T_4096_69,T_512_9,T_4096_75,T_2048_39,T_4096_81,T_1024_21,T_4096_87,T_2048_45,T_4096_93,T_128_3,T_4096_99,T_2048_51,T_4096_105,T_1024_27,T_4096_111,T_2048_57,T_4096_117,T_512_15,T_4096_123,T_2048_63,T_4096_129,T_1024_33,T_4096_135,T_2048_69,T_4096_141,T_256_9,T_4096_147,T_2048_75,T_4096_153,T_1024_39,T_40
96_159,T_2048_81,T_4096_165,T_512_21,T_4096_171,T_2048_87,T_4096_177,T_1024_45,T_4096_183,T_2048_93,T_4096_189,T_64_3,T_4096_195,T_2048_99,T_4096_201,T_1024_51,T_4096_207,T_2048_105,T_4096_213,T_512_27,T_4096_219,T_2048_111,T_4096_225,T_1024_57,T_4096_231,T_2048_117,T_4096_237,T_256_15,T_4096_243,T_2048_123,T_4096_249,T_1024_63,T_4096_255,T_2048_129,T_4096_261,T_512_33,T_4096_267,T_2048_135,T_4096_273,T_1024_69,T_4096_279,T_2048_141,T_4096_285,T_128_9,T_4096_291,T_2048_147,T_4096_297,T_1024_75,T_4096_303,T_2048_153,T_4096_309,T_512_39,T_4096_315,T_2048_159,T_4096_321,T_1024_81,T_4096_327,T_2048_165,T_4096_333,T_256_21,T_4096_339,T_2048_171,T_4096_345,T_1024_87,T_4096_351,T_2048_177,T_4096_357,T_512_45,T_4096_363,T_2048_183,T_4096_369,T_1024_93,T_4096_375,T_2048_189,T_4096_381,T_32_3,T_4096_387,T_2048_195,T_4096_393,T_1024_99,T_4096_399,T_2048_201,T_4096_405,T_512_51,T_4096_411,T_2048_207,T_4096_417,T_1024_105,T_4096_423,T_2048_213,T_4096_429,T_256_27,T_4096_435,T_2048_219,T_4096_441,T_1024_111,T_4096_447,T_2048_225,T_4096_453,T_512_57,T_4096_459,T_2048_231,T_4096_465,T_1024_117,T_4096_471,T_2048_237,T_4096_477,T_128_15,T_4096_483,T_2048_243,T_4096_489,T_1024_123,T_4096_495,T_2048_249,T_4096_501,T_512_63,T_4096_507,T_2048_255,T_4096_513,T_1024_129,T_4096_519,T_2048_261,T_4096_525,T_256_33,T_4096_531,T_2048_267,T_4096_537,T_1024_135,T_4096_543,T_2048_273,T_4096_549,T_512_69,T_4096_555,T_2048_279,T_4096_561,T_1024_141,T_4096_567,T_2048_285,T_4096_573,T_64_9,T_4096_579,T_2048_291,T_4096_585,T_1024_147,T_4096_591,T_2048_297,T_4096_597,T_512_75,T_4096_603,T_2048_303,T_4096_609,T_1024_153,T_4096_615,T_2048_309,T_4096_621,T_256_39,T_4096_627,T_2048_315,T_4096_633,T_1024_159,T_4096_639,T_2048_321,T_4096_645,T_512_81,T_4096_651,T_2048_327,T_4096_657,T_1024_165,T_4096_663,T_2048_333,T_4096_669,T_128_21,T_4096_675,T_2048_339,T_4096_681,T_1024_171,T_4096_687,T_2048_345,T_4096_693,T_512_87,T_4096_699,T_2048_351,T_4096_705,T_1024_177,T_4096_711,T_2048_357,T_4096_717,T_256_45,T_409
6_723,T_2048_363,T_4096_729,T_1024_183,T_4096_735,T_2048_369,T_4096_741,T_512_93,T_4096_747,T_2048_375,T_4096_753,T_1024_189,T_4096_759,T_2048_381,T_4096_765,T_16_3,T_4096_771,T_2048_387,T_4096_777,T_1024_195,T_4096_783,T_2048_393,T_4096_789,T_512_99,T_4096_795,T_2048_399,T_4096_801,T_1024_201,T_4096_807,T_2048_405,T_4096_813,T_256_51,T_4096_819,T_2048_411,T_4096_825,T_1024_207,T_4096_831,T_2048_417,T_4096_837,T_512_105,T_4096_843,T_2048_423,T_4096_849,T_1024_213,T_4096_855,T_2048_429,T_4096_861,T_128_27,T_4096_867,T_2048_435,T_4096_873,T_1024_219,T_4096_879,T_2048_441,T_4096_885,T_512_111,T_4096_891,T_2048_447,T_4096_897,T_1024_225,T_4096_903,T_2048_453,T_4096_909,T_256_57,T_4096_915,T_2048_459,T_4096_921,T_1024_231,T_4096_927,T_2048_465,T_4096_933,T_512_117,T_4096_939,T_2048_471,T_4096_945,T_1024_237,T_4096_951,T_2048_477,T_4096_957,T_64_15,T_4096_963,T_2048_483,T_4096_969,T_1024_243,T_4096_975,T_2048_489,T_4096_981,T_512_123,T_4096_987,T_2048_495,T_4096_993,T_1024_249,T_4096_999,T_2048_501,T_4096_1005,T_256_63,T_4096_1011,T_2048_507,T_4096_1017,T_1024_255,T_4096_1023,T_2048_513,T_4096_1029,T_512_129,T_4096_1035,T_2048_519,T_4096_1041,T_1024_261,T_4096_1047,T_2048_525,T_4096_1053,T_128_33,T_4096_1059,T_2048_531,T_4096_1065,T_1024_267,T_4096_1071,T_2048_537,T_4096_1077,T_512_135,T_4096_1083,T_2048_543,T_4096_1089,T_1024_273,T_4096_1095,T_2048_549,T_4096_1101,T_256_69,T_4096_1107,T_2048_555,T_4096_1113,T_1024_279,T_4096_1119,T_2048_561,T_4096_1125,T_512_141,T_4096_1131,T_2048_567,T_4096_1137,T_1024_285,T_4096_1143,T_2048_573,T_4096_1149,T_32_9,T_4096_1155,T_2048_579,T_4096_1161,T_1024_291,T_4096_1167,T_2048_585,T_4096_1173,T_512_147,T_4096_1179,T_2048_591,T_4096_1185,T_1024_297,T_4096_1191,T_2048_597,T_4096_1197,T_256_75,T_4096_1203,T_2048_603,T_4096_1209,T_1024_303,T_4096_1215,T_2048_609,T_4096_1221,T_512_153,T_4096_1227,T_2048_615,T_4096_1233,T_1024_309,T_4096_1239,T_2048_621,T_4096_1245,T_128_39,T_4096_1251,T_2048_627,T_4096_1257,T_1024_315,T_4096_1263,T_2048_633
,T_4096_1269,T_512_159,T_4096_1275,T_2048_639,T_4096_1281,T_1024_321,T_4096_1287,T_2048_645,T_4096_1293,T_256_81,T_4096_1299,T_2048_651,T_4096_1305,T_1024_327,T_4096_1311,T_2048_657,T_4096_1317,T_512_165,T_4096_1323,T_2048_663,T_4096_1329,T_1024_333,T_4096_1335,T_2048_669,T_4096_1341,T_64_21,T_4096_1347,T_2048_675,T_4096_1353,T_1024_339,T_4096_1359,T_2048_681,T_4096_1365,T_512_171,T_4096_1371,T_2048_687,T_4096_1377,T_1024_345,T_4096_1383,T_2048_693,T_4096_1389,T_256_87,T_4096_1395,T_2048_699,T_4096_1401,T_1024_351,T_4096_1407,T_2048_705,T_4096_1413,T_512_177,T_4096_1419,T_2048_711,T_4096_1425,T_1024_357,T_4096_1431,T_2048_717,T_4096_1437,T_128_45,T_4096_1443,T_2048_723,T_4096_1449,T_1024_363,T_4096_1455,T_2048_729,T_4096_1461,T_512_183,T_4096_1467,T_2048_735,T_4096_1473,T_1024_369,T_4096_1479,T_2048_741,T_4096_1485,T_256_93,T_4096_1491,T_2048_747,T_4096_1497,T_1024_375,T_4096_1503,T_2048_753,T_4096_1509,T_512_189,T_4096_1515,T_2048_759,T_4096_1521,T_1024_381,T_4096_1527,T_2048_765,T_4096_1533,T_8_3,T_4096_1539,T_2048_771,T_4096_1545,T_1024_387,T_4096_1551,T_2048_777,T_4096_1557,T_512_195,T_4096_1563,T_2048_783,T_4096_1569,T_1024_393,T_4096_1575,T_2048_789,T_4096_1581,T_256_99,T_4096_1587,T_2048_795,T_4096_1593,T_1024_399,T_4096_1599,T_2048_801,T_4096_1605,T_512_201,T_4096_1611,T_2048_807,T_4096_1617,T_1024_405,T_4096_1623,T_2048_813,T_4096_1629,T_128_51,T_4096_1635,T_2048_819,T_4096_1641,T_1024_411,T_4096_1647,T_2048_825,T_4096_1653,T_512_207,T_4096_1659,T_2048_831,T_4096_1665,T_1024_417,T_4096_1671,T_2048_837,T_4096_1677,T_256_105,T_4096_1683,T_2048_843,T_4096_1689,T_1024_423,T_4096_1695,T_2048_849,T_4096_1701,T_512_213,T_4096_1707,T_2048_855,T_4096_1713,T_1024_429,T_4096_1719,T_2048_861,T_4096_1725,T_64_27,T_4096_1731,T_2048_867,T_4096_1737,T_1024_435,T_4096_1743,T_2048_873,T_4096_1749,T_512_219,T_4096_1755,T_2048_879,T_4096_1761,T_1024_441,T_4096_1767,T_2048_885,T_4096_1773,T_256_111,T_4096_1779,T_2048_891,T_4096_1785,T_1024_447,T_4096_1791,T_2048_897,T_4096_1797
,T_512_225,T_4096_1803,T_2048_903,T_4096_1809,T_1024_453,T_4096_1815,T_2048_909,T_4096_1821,T_128_57,T_4096_1827,T_2048_915,T_4096_1833,T_1024_459,T_4096_1839,T_2048_921,T_4096_1845,T_512_231,T_4096_1851,T_2048_927,T_4096_1857,T_1024_465,T_4096_1863,T_2048_933,T_4096_1869,T_256_117,T_4096_1875,T_2048_939,T_4096_1881,T_1024_471,T_4096_1887,T_2048_945,T_4096_1893,T_512_237,T_4096_1899,T_2048_951,T_4096_1905,T_1024_477,T_4096_1911,T_2048_957,T_4096_1917,T_32_15,T_4096_1923,T_2048_963,T_4096_1929,T_1024_483,T_4096_1935,T_2048_969,T_4096_1941,T_512_243,T_4096_1947,T_2048_975,T_4096_1953,T_1024_489,T_4096_1959,T_2048_981,T_4096_1965,T_256_123,T_4096_1971,T_2048_987,T_4096_1977,T_1024_495,T_4096_1983,T_2048_993,T_4096_1989,T_512_249,T_4096_1995,T_2048_999,T_4096_2001,T_1024_501,T_4096_2007,T_2048_1005,T_4096_2013,T_128_63,T_4096_2019,T_2048_1011,T_4096_2025,T_1024_507,T_4096_2031,T_2048_1017,T_4096_2037,T_512_255,T_4096_2043,T_2048_1023,T_4096_2049,T_1024_513,T_4096_2055,T_2048_1029,T_4096_2061,T_256_129,T_4096_2067,T_2048_1035,T_4096_2073,T_1024_519,T_4096_2079,T_2048_1041,T_4096_2085,T_512_261,T_4096_2091,T_2048_1047,T_4096_2097,T_1024_525,T_4096_2103,T_2048_1053,T_4096_2109,T_64_33,T_4096_2115,T_2048_1059,T_4096_2121,T_1024_531,T_4096_2127,T_2048_1065,T_4096_2133,T_512_267,T_4096_2139,T_2048_1071,T_4096_2145,T_1024_537,T_4096_2151,T_2048_1077,T_4096_2157,T_256_135,T_4096_2163,T_2048_1083,T_4096_2169,T_1024_543,T_4096_2175,T_2048_1089,T_4096_2181,T_512_273,T_4096_2187,T_2048_1095,T_4096_2193,T_1024_549,T_4096_2199,T_2048_1101,T_4096_2205,T_128_69,T_4096_2211,T_2048_1107,T_4096_2217,T_1024_555,T_4096_2223,T_2048_1113,T_4096_2229,T_512_279,T_4096_2235,T_2048_1119,T_4096_2241,T_1024_561,T_4096_2247,T_2048_1125,T_4096_2253,T_256_141,T_4096_2259,T_2048_1131,T_4096_2265,T_1024_567,T_4096_2271,T_2048_1137,T_4096_2277,T_512_285,T_4096_2283,T_2048_1143,T_4096_2289,T_1024_573,T_4096_2295,T_2048_1149,T_4096_2301,T_16_9,T_4096_2307,T_2048_1155,T_4096_2313,T_1024_579,T_4096_2319,T_20
48_1161,T_4096_2325,T_512_291,T_4096_2331,T_2048_1167,T_4096_2337,T_1024_585,T_4096_2343,T_2048_1173,T_4096_2349,T_256_147,T_4096_2355,T_2048_1179,T_4096_2361,T_1024_591,T_4096_2367,T_2048_1185,T_4096_2373,T_512_297,T_4096_2379,T_2048_1191,T_4096_2385,T_1024_597,T_4096_2391,T_2048_1197,T_4096_2397,T_128_75,T_4096_2403,T_2048_1203,T_4096_2409,T_1024_603,T_4096_2415,T_2048_1209,T_4096_2421,T_512_303,T_4096_2427,T_2048_1215,T_4096_2433,T_1024_609,T_4096_2439,T_2048_1221,T_4096_2445,T_256_153,T_4096_2451,T_2048_1227,T_4096_2457,T_1024_615,T_4096_2463,T_2048_1233,T_4096_2469,T_512_309,T_4096_2475,T_2048_1239,T_4096_2481,T_1024_621,T_4096_2487,T_2048_1245,T_4096_2493,T_64_39,T_4096_2499,T_2048_1251,T_4096_2505,T_1024_627,T_4096_2511,T_2048_1257,T_4096_2517,T_512_315,T_4096_2523,T_2048_1263,T_4096_2529,T_1024_633,T_4096_2535,T_2048_1269,T_4096_2541,T_256_159,T_4096_2547,T_2048_1275,T_4096_2553,T_1024_639,T_4096_2559,T_2048_1281,T_4096_2565,T_512_321,T_4096_2571,T_2048_1287,T_4096_2577,T_1024_645,T_4096_2583,T_2048_1293,T_4096_2589,T_128_81,T_4096_2595,T_2048_1299,T_4096_2601,T_1024_651,T_4096_2607,T_2048_1305,T_4096_2613,T_512_327,T_4096_2619,T_2048_1311,T_4096_2625,T_1024_657,T_4096_2631,T_2048_1317,T_4096_2637,T_256_165,T_4096_2643,T_2048_1323,T_4096_2649,T_1024_663,T_4096_2655,T_2048_1329,T_4096_2661,T_512_333,T_4096_2667,T_2048_1335,T_4096_2673,T_1024_669,T_4096_2679,T_2048_1341,T_4096_2685,T_32_21,T_4096_2691,T_2048_1347,T_4096_2697,T_1024_675,T_4096_2703,T_2048_1353,T_4096_2709,T_512_339,T_4096_2715,T_2048_1359,T_4096_2721,T_1024_681,T_4096_2727,T_2048_1365,T_4096_2733,T_256_171,T_4096_2739,T_2048_1371,T_4096_2745,T_1024_687,T_4096_2751,T_2048_1377,T_4096_2757,T_512_345,T_4096_2763,T_2048_1383,T_4096_2769,T_1024_693,T_4096_2775,T_2048_1389,T_4096_2781,T_128_87,T_4096_2787,T_2048_1395,T_4096_2793,T_1024_699,T_4096_2799,T_2048_1401,T_4096_2805,T_512_351,T_4096_2811,T_2048_1407,T_4096_2817,T_1024_705,T_4096_2823,T_2048_1413,T_4096_2829,T_256_177,T_4096_2835,T_2048_1419,
T_4096_2841,T_1024_711,T_4096_2847,T_2048_1425,T_4096_2853,T_512_357,T_4096_2859,T_2048_1431,T_4096_2865,T_1024_717,T_4096_2871,T_2048_1437,T_4096_2877,T_64_45,T_4096_2883,T_2048_1443,T_4096_2889,T_1024_723,T_4096_2895,T_2048_1449,T_4096_2901,T_512_363,T_4096_2907,T_2048_1455,T_4096_2913,T_1024_729,T_4096_2919,T_2048_1461,T_4096_2925,T_256_183,T_4096_2931,T_2048_1467,T_4096_2937,T_1024_735,T_4096_2943,T_2048_1473,T_4096_2949,T_512_369,T_4096_2955,T_2048_1479,T_4096_2961,T_1024_741,T_4096_2967,T_2048_1485,T_4096_2973,T_128_93,T_4096_2979,T_2048_1491,T_4096_2985,T_1024_747,T_4096_2991,T_2048_1497,T_4096_2997,T_512_375,T_4096_3003,T_2048_1503,T_4096_3009,T_1024_753,T_4096_3015,T_2048_1509,T_4096_3021,T_256_189,T_4096_3027,T_2048_1515,T_4096_3033,T_1024_759,T_4096_3039,T_2048_1521,T_4096_3045,T_512_381,T_4096_3051,T_2048_1527,T_4096_3057,T_1024_765,T_4096_3063,T_2048_1533,T_4096_3069 +}; +static const __device__ float2 lut_sp_8_4096[512*2] = { + T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_10
24_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_137,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_16_1,T_4096_257,T_2048_129,T_4096_259,T_1024_65,T_4096_261,T_2048_131,T_4096_263,T_512_33,T_4096_265,T_2048_133,T_4096_267,T_1024_67,T_4096_269,T_2048_135,T_4096_271,T_256_17,T_4096_273,T_2048_137,T_4096_275,T_1024_69,T_4096_277,T_2048_139,T_4096_279,T_512_35,T_4096_281,T_2048_141,T_4096_283,T_1024_71,T_4096_285,T_2048_143,T_4096_287,T_128_9,T_4096_289,T_2048_145,T_4096_291,T_1024_73,T_4096_293,T_2048_147,T_4096_295,T_512_37,T_4096_297,T_2048_149,T_4096_29
9,T_1024_75,T_4096_301,T_2048_151,T_4096_303,T_256_19,T_4096_305,T_2048_153,T_4096_307,T_1024_77,T_4096_309,T_2048_155,T_4096_311,T_512_39,T_4096_313,T_2048_157,T_4096_315,T_1024_79,T_4096_317,T_2048_159,T_4096_319,T_64_5,T_4096_321,T_2048_161,T_4096_323,T_1024_81,T_4096_325,T_2048_163,T_4096_327,T_512_41,T_4096_329,T_2048_165,T_4096_331,T_1024_83,T_4096_333,T_2048_167,T_4096_335,T_256_21,T_4096_337,T_2048_169,T_4096_339,T_1024_85,T_4096_341,T_2048_171,T_4096_343,T_512_43,T_4096_345,T_2048_173,T_4096_347,T_1024_87,T_4096_349,T_2048_175,T_4096_351,T_128_11,T_4096_353,T_2048_177,T_4096_355,T_1024_89,T_4096_357,T_2048_179,T_4096_359,T_512_45,T_4096_361,T_2048_181,T_4096_363,T_1024_91,T_4096_365,T_2048_183,T_4096_367,T_256_23,T_4096_369,T_2048_185,T_4096_371,T_1024_93,T_4096_373,T_2048_187,T_4096_375,T_512_47,T_4096_377,T_2048_189,T_4096_379,T_1024_95,T_4096_381,T_2048_191,T_4096_383,T_32_3,T_4096_385,T_2048_193,T_4096_387,T_1024_97,T_4096_389,T_2048_195,T_4096_391,T_512_49,T_4096_393,T_2048_197,T_4096_395,T_1024_99,T_4096_397,T_2048_199,T_4096_399,T_256_25,T_4096_401,T_2048_201,T_4096_403,T_1024_101,T_4096_405,T_2048_203,T_4096_407,T_512_51,T_4096_409,T_2048_205,T_4096_411,T_1024_103,T_4096_413,T_2048_207,T_4096_415,T_128_13,T_4096_417,T_2048_209,T_4096_419,T_1024_105,T_4096_421,T_2048_211,T_4096_423,T_512_53,T_4096_425,T_2048_213,T_4096_427,T_1024_107,T_4096_429,T_2048_215,T_4096_431,T_256_27,T_4096_433,T_2048_217,T_4096_435,T_1024_109,T_4096_437,T_2048_219,T_4096_439,T_512_55,T_4096_441,T_2048_221,T_4096_443,T_1024_111,T_4096_445,T_2048_223,T_4096_447,T_64_7,T_4096_449,T_2048_225,T_4096_451,T_1024_113,T_4096_453,T_2048_227,T_4096_455,T_512_57,T_4096_457,T_2048_229,T_4096_459,T_1024_115,T_4096_461,T_2048_231,T_4096_463,T_256_29,T_4096_465,T_2048_233,T_4096_467,T_1024_117,T_4096_469,T_2048_235,T_4096_471,T_512_59,T_4096_473,T_2048_237,T_4096_475,T_1024_119,T_4096_477,T_2048_239,T_4096_479,T_128_15,T_4096_481,T_2048_241,T_4096_483,T_1024_121,T_4096_485,T_2048_243,T_4096
_487,T_512_61,T_4096_489,T_2048_245,T_4096_491,T_1024_123,T_4096_493,T_2048_247,T_4096_495,T_256_31,T_4096_497,T_2048_249,T_4096_499,T_1024_125,T_4096_501,T_2048_251,T_4096_503,T_512_63,T_4096_505,T_2048_253,T_4096_507,T_1024_127,T_4096_509,T_2048_255,T_4096_511,T_2_0,T_4096_5,T_2048_5,T_4096_15,T_1024_5,T_4096_25,T_2048_15,T_4096_35,T_512_5,T_4096_45,T_2048_25,T_4096_55,T_1024_15,T_4096_65,T_2048_35,T_4096_75,T_256_5,T_4096_85,T_2048_45,T_4096_95,T_1024_25,T_4096_105,T_2048_55,T_4096_115,T_512_15,T_4096_125,T_2048_65,T_4096_135,T_1024_35,T_4096_145,T_2048_75,T_4096_155,T_128_5,T_4096_165,T_2048_85,T_4096_175,T_1024_45,T_4096_185,T_2048_95,T_4096_195,T_512_25,T_4096_205,T_2048_105,T_4096_215,T_1024_55,T_4096_225,T_2048_115,T_4096_235,T_256_15,T_4096_245,T_2048_125,T_4096_255,T_1024_65,T_4096_265,T_2048_135,T_4096_275,T_512_35,T_4096_285,T_2048_145,T_4096_295,T_1024_75,T_4096_305,T_2048_155,T_4096_315,T_64_5,T_4096_325,T_2048_165,T_4096_335,T_1024_85,T_4096_345,T_2048_175,T_4096_355,T_512_45,T_4096_365,T_2048_185,T_4096_375,T_1024_95,T_4096_385,T_2048_195,T_4096_395,T_256_25,T_4096_405,T_2048_205,T_4096_415,T_1024_105,T_4096_425,T_2048_215,T_4096_435,T_512_55,T_4096_445,T_2048_225,T_4096_455,T_1024_115,T_4096_465,T_2048_235,T_4096_475,T_128_15,T_4096_485,T_2048_245,T_4096_495,T_1024_125,T_4096_505,T_2048_255,T_4096_515,T_512_65,T_4096_525,T_2048_265,T_4096_535,T_1024_135,T_4096_545,T_2048_275,T_4096_555,T_256_35,T_4096_565,T_2048_285,T_4096_575,T_1024_145,T_4096_585,T_2048_295,T_4096_595,T_512_75,T_4096_605,T_2048_305,T_4096_615,T_1024_155,T_4096_625,T_2048_315,T_4096_635,T_32_5,T_4096_645,T_2048_325,T_4096_655,T_1024_165,T_4096_665,T_2048_335,T_4096_675,T_512_85,T_4096_685,T_2048_345,T_4096_695,T_1024_175,T_4096_705,T_2048_355,T_4096_715,T_256_45,T_4096_725,T_2048_365,T_4096_735,T_1024_185,T_4096_745,T_2048_375,T_4096_755,T_512_95,T_4096_765,T_2048_385,T_4096_775,T_1024_195,T_4096_785,T_2048_395,T_4096_795,T_128_25,T_4096_805,T_2048_405,T_4096_815,T_1024_205,T_4096_
825,T_2048_415,T_4096_835,T_512_105,T_4096_845,T_2048_425,T_4096_855,T_1024_215,T_4096_865,T_2048_435,T_4096_875,T_256_55,T_4096_885,T_2048_445,T_4096_895,T_1024_225,T_4096_905,T_2048_455,T_4096_915,T_512_115,T_4096_925,T_2048_465,T_4096_935,T_1024_235,T_4096_945,T_2048_475,T_4096_955,T_64_15,T_4096_965,T_2048_485,T_4096_975,T_1024_245,T_4096_985,T_2048_495,T_4096_995,T_512_125,T_4096_1005,T_2048_505,T_4096_1015,T_1024_255,T_4096_1025,T_2048_515,T_4096_1035,T_256_65,T_4096_1045,T_2048_525,T_4096_1055,T_1024_265,T_4096_1065,T_2048_535,T_4096_1075,T_512_135,T_4096_1085,T_2048_545,T_4096_1095,T_1024_275,T_4096_1105,T_2048_555,T_4096_1115,T_128_35,T_4096_1125,T_2048_565,T_4096_1135,T_1024_285,T_4096_1145,T_2048_575,T_4096_1155,T_512_145,T_4096_1165,T_2048_585,T_4096_1175,T_1024_295,T_4096_1185,T_2048_595,T_4096_1195,T_256_75,T_4096_1205,T_2048_605,T_4096_1215,T_1024_305,T_4096_1225,T_2048_615,T_4096_1235,T_512_155,T_4096_1245,T_2048_625,T_4096_1255,T_1024_315,T_4096_1265,T_2048_635,T_4096_1275,T_16_5,T_4096_1285,T_2048_645,T_4096_1295,T_1024_325,T_4096_1305,T_2048_655,T_4096_1315,T_512_165,T_4096_1325,T_2048_665,T_4096_1335,T_1024_335,T_4096_1345,T_2048_675,T_4096_1355,T_256_85,T_4096_1365,T_2048_685,T_4096_1375,T_1024_345,T_4096_1385,T_2048_695,T_4096_1395,T_512_175,T_4096_1405,T_2048_705,T_4096_1415,T_1024_355,T_4096_1425,T_2048_715,T_4096_1435,T_128_45,T_4096_1445,T_2048_725,T_4096_1455,T_1024_365,T_4096_1465,T_2048_735,T_4096_1475,T_512_185,T_4096_1485,T_2048_745,T_4096_1495,T_1024_375,T_4096_1505,T_2048_755,T_4096_1515,T_256_95,T_4096_1525,T_2048_765,T_4096_1535,T_1024_385,T_4096_1545,T_2048_775,T_4096_1555,T_512_195,T_4096_1565,T_2048_785,T_4096_1575,T_1024_395,T_4096_1585,T_2048_795,T_4096_1595,T_64_25,T_4096_1605,T_2048_805,T_4096_1615,T_1024_405,T_4096_1625,T_2048_815,T_4096_1635,T_512_205,T_4096_1645,T_2048_825,T_4096_1655,T_1024_415,T_4096_1665,T_2048_835,T_4096_1675,T_256_105,T_4096_1685,T_2048_845,T_4096_1695,T_1024_425,T_4096_1705,T_2048_855,T_4096_1715,T_
512_215,T_4096_1725,T_2048_865,T_4096_1735,T_1024_435,T_4096_1745,T_2048_875,T_4096_1755,T_128_55,T_4096_1765,T_2048_885,T_4096_1775,T_1024_445,T_4096_1785,T_2048_895,T_4096_1795,T_512_225,T_4096_1805,T_2048_905,T_4096_1815,T_1024_455,T_4096_1825,T_2048_915,T_4096_1835,T_256_115,T_4096_1845,T_2048_925,T_4096_1855,T_1024_465,T_4096_1865,T_2048_935,T_4096_1875,T_512_235,T_4096_1885,T_2048_945,T_4096_1895,T_1024_475,T_4096_1905,T_2048_955,T_4096_1915,T_32_15,T_4096_1925,T_2048_965,T_4096_1935,T_1024_485,T_4096_1945,T_2048_975,T_4096_1955,T_512_245,T_4096_1965,T_2048_985,T_4096_1975,T_1024_495,T_4096_1985,T_2048_995,T_4096_1995,T_256_125,T_4096_2005,T_2048_1005,T_4096_2015,T_1024_505,T_4096_2025,T_2048_1015,T_4096_2035,T_512_255,T_4096_2045,T_2048_1025,T_4096_2055,T_1024_515,T_4096_2065,T_2048_1035,T_4096_2075,T_128_65,T_4096_2085,T_2048_1045,T_4096_2095,T_1024_525,T_4096_2105,T_2048_1055,T_4096_2115,T_512_265,T_4096_2125,T_2048_1065,T_4096_2135,T_1024_535,T_4096_2145,T_2048_1075,T_4096_2155,T_256_135,T_4096_2165,T_2048_1085,T_4096_2175,T_1024_545,T_4096_2185,T_2048_1095,T_4096_2195,T_512_275,T_4096_2205,T_2048_1105,T_4096_2215,T_1024_555,T_4096_2225,T_2048_1115,T_4096_2235,T_64_35,T_4096_2245,T_2048_1125,T_4096_2255,T_1024_565,T_4096_2265,T_2048_1135,T_4096_2275,T_512_285,T_4096_2285,T_2048_1145,T_4096_2295,T_1024_575,T_4096_2305,T_2048_1155,T_4096_2315,T_256_145,T_4096_2325,T_2048_1165,T_4096_2335,T_1024_585,T_4096_2345,T_2048_1175,T_4096_2355,T_512_295,T_4096_2365,T_2048_1185,T_4096_2375,T_1024_595,T_4096_2385,T_2048_1195,T_4096_2395,T_128_75,T_4096_2405,T_2048_1205,T_4096_2415,T_1024_605,T_4096_2425,T_2048_1215,T_4096_2435,T_512_305,T_4096_2445,T_2048_1225,T_4096_2455,T_1024_615,T_4096_2465,T_2048_1235,T_4096_2475,T_256_155,T_4096_2485,T_2048_1245,T_4096_2495,T_1024_625,T_4096_2505,T_2048_1255,T_4096_2515,T_512_315,T_4096_2525,T_2048_1265,T_4096_2535,T_1024_635,T_4096_2545,T_2048_1275,T_4096_2555 +}; +static const __device__ float2 lut_sp_16_4096[256*2] = { + 
T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_137,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4
096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_2_0,T_4096_9,T_2048_9,T_4096_27,T_1024_9,T_4096_45,T_2048_27,T_4096_63,T_512_9,T_4096_81,T_2048_45,T_4096_99,T_1024_27,T_4096_117,T_2048_63,T_4096_135,T_256_9,T_4096_153,T_2048_81,T_4096_171,T_1024_45,T_4096_189,T_2048_99,T_4096_207,T_512_27,T_4096_225,T_2048_117,T_4096_243,T_1024_63,T_4096_261,T_2048_135,T_4096_279,T_128_9,T_4096_297,T_2048_153,T_4096_315,T_1024_81,T_4096_333,T_2048_171,T_4096_351,T_512_45,T_4096_369,T_2048_189,T_4096_387,T_1024_99,T_4096_405,T_2048_207,T_4096_423,T_256_27,T_4096_441,T_2048_225,T_4096_459,T_1024_117,T_4096_477,T_2048_243,T_4096_495,T_512_63,T_4096_513,T_2048_261,T_4096_531,T_1024_135,T_4096_549,T_2048_279,T_4096_567,T_64_9,T_4096_585,T_2048_297,T_4096_603,T_1024_153,T_4096_621,T_2048_315,T_4096_639,T_512_81,T_4096_657,T_2048_333,T_4096_675,T_1024_171,T_4096_693,T_2048_351,T_4096_711,T_256_45,T_4096_729,T_2048_369,T_4096_747,T_1024_189,T_4096_765,T_2048_387,T_4096_783,T_512_99,T_4096_801,T_2048_405,T_4096_819,T_1024_207,T_4096_837,T_2048_423,T_4096_855,T_128_27,T_4096_873,T_2048_441,T_4096_891,T_1024_225,T_4096_909,T_2048_459,T_4096_927,T_512_117,T_4096_945,T_2048_477,T_4096_963,T_1024_243,T_4096_981,T_2048_495,T_4096_999,T_256_63,T_4096_1017,T_2048_513,T_4096_1035,T_1024_261,T_4096_1053,T_2048_531,T_4096_1071,T_512_135,T_4096_1089,T_2048_549,T_4096_1107,T_1024_279,T_4096_1125,T_2048_567,T_4096_1143,T_32_9,T_4096_1161,T_2048_585,T_4096_1179,T_1024_297,T_4096_1
197,T_2048_603,T_4096_1215,T_512_153,T_4096_1233,T_2048_621,T_4096_1251,T_1024_315,T_4096_1269,T_2048_639,T_4096_1287,T_256_81,T_4096_1305,T_2048_657,T_4096_1323,T_1024_333,T_4096_1341,T_2048_675,T_4096_1359,T_512_171,T_4096_1377,T_2048_693,T_4096_1395,T_1024_351,T_4096_1413,T_2048_711,T_4096_1431,T_128_45,T_4096_1449,T_2048_729,T_4096_1467,T_1024_369,T_4096_1485,T_2048_747,T_4096_1503,T_512_189,T_4096_1521,T_2048_765,T_4096_1539,T_1024_387,T_4096_1557,T_2048_783,T_4096_1575,T_256_99,T_4096_1593,T_2048_801,T_4096_1611,T_1024_405,T_4096_1629,T_2048_819,T_4096_1647,T_512_207,T_4096_1665,T_2048_837,T_4096_1683,T_1024_423,T_4096_1701,T_2048_855,T_4096_1719,T_64_27,T_4096_1737,T_2048_873,T_4096_1755,T_1024_441,T_4096_1773,T_2048_891,T_4096_1791,T_512_225,T_4096_1809,T_2048_909,T_4096_1827,T_1024_459,T_4096_1845,T_2048_927,T_4096_1863,T_256_117,T_4096_1881,T_2048_945,T_4096_1899,T_1024_477,T_4096_1917,T_2048_963,T_4096_1935,T_512_243,T_4096_1953,T_2048_981,T_4096_1971,T_1024_495,T_4096_1989,T_2048_999,T_4096_2007,T_128_63,T_4096_2025,T_2048_1017,T_4096_2043,T_1024_513,T_4096_2061,T_2048_1035,T_4096_2079,T_512_261,T_4096_2097,T_2048_1053,T_4096_2115,T_1024_531,T_4096_2133,T_2048_1071,T_4096_2151,T_256_135,T_4096_2169,T_2048_1089,T_4096_2187,T_1024_549,T_4096_2205,T_2048_1107,T_4096_2223,T_512_279,T_4096_2241,T_2048_1125,T_4096_2259,T_1024_567,T_4096_2277,T_2048_1143,T_4096_2295 +}; +static const __device__ float2 lut_sp_32_4096[128*2] = { + 
T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_2_0,T_4096_17,T_2048_17,T_4096_51,T_1024_17,T_4096_85,T_2048_51,T_4096_119,T_512_17,T_4096_153,T_2048_85,T_4096_187,T_1024_51,T_4096_221,T_2048_119,T_4096_255,T_256_17,T_4096_289,T_2048_153,T_4096_323,T_1024_85,T_4096_357,T_2048_187,T_4096_391,T_512_51,T_4096_425,T_2048_221,T_4096_459,T_1024_119,T_4096_493,T_2048_255,T_4096_527,T_128_17,T_4096_561,T_2048_289,T_4096_595,T_1024_153,T_4096_629,T_2048_323,T_4096_663,T_512_85,T_4096_697,T_2048_357,T_4096_731,T_1024_187,T_4096_765,T_2048_391,T_4096_799,T_256_51,T_4096_833,T_2048_425,T_4096_867,T_1024_221,T_4096_901,T_2048_459,T_4096_935,T_512_119,T_4096_969,T_2048_493,T_4096_1003,T_1024_255,T_4096_1037,T_2048_527,T_4096_1071,T_64_17,T_4096_1105,T_2048_561,T_4096_1139,T_1024_289,T_4096_1173,T_2048_
595,T_4096_1207,T_512_153,T_4096_1241,T_2048_629,T_4096_1275,T_1024_323,T_4096_1309,T_2048_663,T_4096_1343,T_256_85,T_4096_1377,T_2048_697,T_4096_1411,T_1024_357,T_4096_1445,T_2048_731,T_4096_1479,T_512_187,T_4096_1513,T_2048_765,T_4096_1547,T_1024_391,T_4096_1581,T_2048_799,T_4096_1615,T_128_51,T_4096_1649,T_2048_833,T_4096_1683,T_1024_425,T_4096_1717,T_2048_867,T_4096_1751,T_512_221,T_4096_1785,T_2048_901,T_4096_1819,T_1024_459,T_4096_1853,T_2048_935,T_4096_1887,T_256_119,T_4096_1921,T_2048_969,T_4096_1955,T_1024_493,T_4096_1989,T_2048_1003,T_4096_2023,T_512_255,T_4096_2057,T_2048_1037,T_4096_2091,T_1024_527,T_4096_2125,T_2048_1071,T_4096_2159 +}; +static const __device__ float2 lut_sp_9_6561[729*2] = { + T_2_0,T_6561_1,T_6561_2,T_2187_1,T_6561_4,T_6561_5,T_2187_2,T_6561_7,T_6561_8,T_729_1,T_6561_10,T_6561_11,T_2187_4,T_6561_13,T_6561_14,T_2187_5,T_6561_16,T_6561_17,T_729_2,T_6561_19,T_6561_20,T_2187_7,T_6561_22,T_6561_23,T_2187_8,T_6561_25,T_6561_26,T_243_1,T_6561_28,T_6561_29,T_2187_10,T_6561_31,T_6561_32,T_2187_11,T_6561_34,T_6561_35,T_729_4,T_6561_37,T_6561_38,T_2187_13,T_6561_40,T_6561_41,T_2187_14,T_6561_43,T_6561_44,T_729_5,T_6561_46,T_6561_47,T_2187_16,T_6561_49,T_6561_50,T_2187_17,T_6561_52,T_6561_53,T_243_2,T_6561_55,T_6561_56,T_2187_19,T_6561_58,T_6561_59,T_2187_20,T_6561_61,T_6561_62,T_729_7,T_6561_64,T_6561_65,T_2187_22,T_6561_67,T_6561_68,T_2187_23,T_6561_70,T_6561_71,T_729_8,T_6561_73,T_6561_74,T_2187_25,T_6561_76,T_6561_77,T_2187_26,T_6561_79,T_6561_80,T_81_1,T_6561_82,T_6561_83,T_2187_28,T_6561_85,T_6561_86,T_2187_29,T_6561_88,T_6561_89,T_729_10,T_6561_91,T_6561_92,T_2187_31,T_6561_94,T_6561_95,T_2187_32,T_6561_97,T_6561_98,T_729_11,T_6561_100,T_6561_101,T_2187_34,T_6561_103,T_6561_104,T_2187_35,T_6561_106,T_6561_107,T_243_4,T_6561_109,T_6561_110,T_2187_37,T_6561_112,T_6561_113,T_2187_38,T_6561_115,T_6561_116,T_729_13,T_6561_118,T_6561_119,T_2187_40,T_6561_121,T_6561_122,T_2187_41,T_6561_124,T_6561_125,T_729_14,T_6561_127,T_6561_128,T_2187_43,T_65
61_130,T_6561_131,T_2187_44,T_6561_133,T_6561_134,T_243_5,T_6561_136,T_6561_137,T_2187_46,T_6561_139,T_6561_140,T_2187_47,T_6561_142,T_6561_143,T_729_16,T_6561_145,T_6561_146,T_2187_49,T_6561_148,T_6561_149,T_2187_50,T_6561_151,T_6561_152,T_729_17,T_6561_154,T_6561_155,T_2187_52,T_6561_157,T_6561_158,T_2187_53,T_6561_160,T_6561_161,T_81_2,T_6561_163,T_6561_164,T_2187_55,T_6561_166,T_6561_167,T_2187_56,T_6561_169,T_6561_170,T_729_19,T_6561_172,T_6561_173,T_2187_58,T_6561_175,T_6561_176,T_2187_59,T_6561_178,T_6561_179,T_729_20,T_6561_181,T_6561_182,T_2187_61,T_6561_184,T_6561_185,T_2187_62,T_6561_187,T_6561_188,T_243_7,T_6561_190,T_6561_191,T_2187_64,T_6561_193,T_6561_194,T_2187_65,T_6561_196,T_6561_197,T_729_22,T_6561_199,T_6561_200,T_2187_67,T_6561_202,T_6561_203,T_2187_68,T_6561_205,T_6561_206,T_729_23,T_6561_208,T_6561_209,T_2187_70,T_6561_211,T_6561_212,T_2187_71,T_6561_214,T_6561_215,T_243_8,T_6561_217,T_6561_218,T_2187_73,T_6561_220,T_6561_221,T_2187_74,T_6561_223,T_6561_224,T_729_25,T_6561_226,T_6561_227,T_2187_76,T_6561_229,T_6561_230,T_2187_77,T_6561_232,T_6561_233,T_729_26,T_6561_235,T_6561_236,T_2187_79,T_6561_238,T_6561_239,T_2187_80,T_6561_241,T_6561_242,T_27_1,T_6561_244,T_6561_245,T_2187_82,T_6561_247,T_6561_248,T_2187_83,T_6561_250,T_6561_251,T_729_28,T_6561_253,T_6561_254,T_2187_85,T_6561_256,T_6561_257,T_2187_86,T_6561_259,T_6561_260,T_729_29,T_6561_262,T_6561_263,T_2187_88,T_6561_265,T_6561_266,T_2187_89,T_6561_268,T_6561_269,T_243_10,T_6561_271,T_6561_272,T_2187_91,T_6561_274,T_6561_275,T_2187_92,T_6561_277,T_6561_278,T_729_31,T_6561_280,T_6561_281,T_2187_94,T_6561_283,T_6561_284,T_2187_95,T_6561_286,T_6561_287,T_729_32,T_6561_289,T_6561_290,T_2187_97,T_6561_292,T_6561_293,T_2187_98,T_6561_295,T_6561_296,T_243_11,T_6561_298,T_6561_299,T_2187_100,T_6561_301,T_6561_302,T_2187_101,T_6561_304,T_6561_305,T_729_34,T_6561_307,T_6561_308,T_2187_103,T_6561_310,T_6561_311,T_2187_104,T_6561_313,T_6561_314,T_729_35,T_6561_316,T_6561_317,T_2187_106,T_6561_319,
T_6561_320,T_2187_107,T_6561_322,T_6561_323,T_81_4,T_6561_325,T_6561_326,T_2187_109,T_6561_328,T_6561_329,T_2187_110,T_6561_331,T_6561_332,T_729_37,T_6561_334,T_6561_335,T_2187_112,T_6561_337,T_6561_338,T_2187_113,T_6561_340,T_6561_341,T_729_38,T_6561_343,T_6561_344,T_2187_115,T_6561_346,T_6561_347,T_2187_116,T_6561_349,T_6561_350,T_243_13,T_6561_352,T_6561_353,T_2187_118,T_6561_355,T_6561_356,T_2187_119,T_6561_358,T_6561_359,T_729_40,T_6561_361,T_6561_362,T_2187_121,T_6561_364,T_6561_365,T_2187_122,T_6561_367,T_6561_368,T_729_41,T_6561_370,T_6561_371,T_2187_124,T_6561_373,T_6561_374,T_2187_125,T_6561_376,T_6561_377,T_243_14,T_6561_379,T_6561_380,T_2187_127,T_6561_382,T_6561_383,T_2187_128,T_6561_385,T_6561_386,T_729_43,T_6561_388,T_6561_389,T_2187_130,T_6561_391,T_6561_392,T_2187_131,T_6561_394,T_6561_395,T_729_44,T_6561_397,T_6561_398,T_2187_133,T_6561_400,T_6561_401,T_2187_134,T_6561_403,T_6561_404,T_81_5,T_6561_406,T_6561_407,T_2187_136,T_6561_409,T_6561_410,T_2187_137,T_6561_412,T_6561_413,T_729_46,T_6561_415,T_6561_416,T_2187_139,T_6561_418,T_6561_419,T_2187_140,T_6561_421,T_6561_422,T_729_47,T_6561_424,T_6561_425,T_2187_142,T_6561_427,T_6561_428,T_2187_143,T_6561_430,T_6561_431,T_243_16,T_6561_433,T_6561_434,T_2187_145,T_6561_436,T_6561_437,T_2187_146,T_6561_439,T_6561_440,T_729_49,T_6561_442,T_6561_443,T_2187_148,T_6561_445,T_6561_446,T_2187_149,T_6561_448,T_6561_449,T_729_50,T_6561_451,T_6561_452,T_2187_151,T_6561_454,T_6561_455,T_2187_152,T_6561_457,T_6561_458,T_243_17,T_6561_460,T_6561_461,T_2187_154,T_6561_463,T_6561_464,T_2187_155,T_6561_466,T_6561_467,T_729_52,T_6561_469,T_6561_470,T_2187_157,T_6561_472,T_6561_473,T_2187_158,T_6561_475,T_6561_476,T_729_53,T_6561_478,T_6561_479,T_2187_160,T_6561_481,T_6561_482,T_2187_161,T_6561_484,T_6561_485,T_27_2,T_6561_487,T_6561_488,T_2187_163,T_6561_490,T_6561_491,T_2187_164,T_6561_493,T_6561_494,T_729_55,T_6561_496,T_6561_497,T_2187_166,T_6561_499,T_6561_500,T_2187_167,T_6561_502,T_6561_503,T_729_56,T_6561_505,T_
6561_506,T_2187_169,T_6561_508,T_6561_509,T_2187_170,T_6561_511,T_6561_512,T_243_19,T_6561_514,T_6561_515,T_2187_172,T_6561_517,T_6561_518,T_2187_173,T_6561_520,T_6561_521,T_729_58,T_6561_523,T_6561_524,T_2187_175,T_6561_526,T_6561_527,T_2187_176,T_6561_529,T_6561_530,T_729_59,T_6561_532,T_6561_533,T_2187_178,T_6561_535,T_6561_536,T_2187_179,T_6561_538,T_6561_539,T_243_20,T_6561_541,T_6561_542,T_2187_181,T_6561_544,T_6561_545,T_2187_182,T_6561_547,T_6561_548,T_729_61,T_6561_550,T_6561_551,T_2187_184,T_6561_553,T_6561_554,T_2187_185,T_6561_556,T_6561_557,T_729_62,T_6561_559,T_6561_560,T_2187_187,T_6561_562,T_6561_563,T_2187_188,T_6561_565,T_6561_566,T_81_7,T_6561_568,T_6561_569,T_2187_190,T_6561_571,T_6561_572,T_2187_191,T_6561_574,T_6561_575,T_729_64,T_6561_577,T_6561_578,T_2187_193,T_6561_580,T_6561_581,T_2187_194,T_6561_583,T_6561_584,T_729_65,T_6561_586,T_6561_587,T_2187_196,T_6561_589,T_6561_590,T_2187_197,T_6561_592,T_6561_593,T_243_22,T_6561_595,T_6561_596,T_2187_199,T_6561_598,T_6561_599,T_2187_200,T_6561_601,T_6561_602,T_729_67,T_6561_604,T_6561_605,T_2187_202,T_6561_607,T_6561_608,T_2187_203,T_6561_610,T_6561_611,T_729_68,T_6561_613,T_6561_614,T_2187_205,T_6561_616,T_6561_617,T_2187_206,T_6561_619,T_6561_620,T_243_23,T_6561_622,T_6561_623,T_2187_208,T_6561_625,T_6561_626,T_2187_209,T_6561_628,T_6561_629,T_729_70,T_6561_631,T_6561_632,T_2187_211,T_6561_634,T_6561_635,T_2187_212,T_6561_637,T_6561_638,T_729_71,T_6561_640,T_6561_641,T_2187_214,T_6561_643,T_6561_644,T_2187_215,T_6561_646,T_6561_647,T_81_8,T_6561_649,T_6561_650,T_2187_217,T_6561_652,T_6561_653,T_2187_218,T_6561_655,T_6561_656,T_729_73,T_6561_658,T_6561_659,T_2187_220,T_6561_661,T_6561_662,T_2187_221,T_6561_664,T_6561_665,T_729_74,T_6561_667,T_6561_668,T_2187_223,T_6561_670,T_6561_671,T_2187_224,T_6561_673,T_6561_674,T_243_25,T_6561_676,T_6561_677,T_2187_226,T_6561_679,T_6561_680,T_2187_227,T_6561_682,T_6561_683,T_729_76,T_6561_685,T_6561_686,T_2187_229,T_6561_688,T_6561_689,T_2187_230,T_6561_691,
T_6561_692,T_729_77,T_6561_694,T_6561_695,T_2187_232,T_6561_697,T_6561_698,T_2187_233,T_6561_700,T_6561_701,T_243_26,T_6561_703,T_6561_704,T_2187_235,T_6561_706,T_6561_707,T_2187_236,T_6561_709,T_6561_710,T_729_79,T_6561_712,T_6561_713,T_2187_238,T_6561_715,T_6561_716,T_2187_239,T_6561_718,T_6561_719,T_729_80,T_6561_721,T_6561_722,T_2187_241,T_6561_724,T_6561_725,T_2187_242,T_6561_727,T_6561_728,T_2_0,T_6561_5,T_6561_10,T_2187_5,T_6561_20,T_6561_25,T_2187_10,T_6561_35,T_6561_40,T_729_5,T_6561_50,T_6561_55,T_2187_20,T_6561_65,T_6561_70,T_2187_25,T_6561_80,T_6561_85,T_729_10,T_6561_95,T_6561_100,T_2187_35,T_6561_110,T_6561_115,T_2187_40,T_6561_125,T_6561_130,T_243_5,T_6561_140,T_6561_145,T_2187_50,T_6561_155,T_6561_160,T_2187_55,T_6561_170,T_6561_175,T_729_20,T_6561_185,T_6561_190,T_2187_65,T_6561_200,T_6561_205,T_2187_70,T_6561_215,T_6561_220,T_729_25,T_6561_230,T_6561_235,T_2187_80,T_6561_245,T_6561_250,T_2187_85,T_6561_260,T_6561_265,T_243_10,T_6561_275,T_6561_280,T_2187_95,T_6561_290,T_6561_295,T_2187_100,T_6561_305,T_6561_310,T_729_35,T_6561_320,T_6561_325,T_2187_110,T_6561_335,T_6561_340,T_2187_115,T_6561_350,T_6561_355,T_729_40,T_6561_365,T_6561_370,T_2187_125,T_6561_380,T_6561_385,T_2187_130,T_6561_395,T_6561_400,T_81_5,T_6561_410,T_6561_415,T_2187_140,T_6561_425,T_6561_430,T_2187_145,T_6561_440,T_6561_445,T_729_50,T_6561_455,T_6561_460,T_2187_155,T_6561_470,T_6561_475,T_2187_160,T_6561_485,T_6561_490,T_729_55,T_6561_500,T_6561_505,T_2187_170,T_6561_515,T_6561_520,T_2187_175,T_6561_530,T_6561_535,T_243_20,T_6561_545,T_6561_550,T_2187_185,T_6561_560,T_6561_565,T_2187_190,T_6561_575,T_6561_580,T_729_65,T_6561_590,T_6561_595,T_2187_200,T_6561_605,T_6561_610,T_2187_205,T_6561_620,T_6561_625,T_729_70,T_6561_635,T_6561_640,T_2187_215,T_6561_650,T_6561_655,T_2187_220,T_6561_665,T_6561_670,T_243_25,T_6561_680,T_6561_685,T_2187_230,T_6561_695,T_6561_700,T_2187_235,T_6561_710,T_6561_715,T_729_80,T_6561_725,T_6561_730,T_2187_245,T_6561_740,T_6561_745,T_2187_250,T_6561_75
5,T_6561_760,T_729_85,T_6561_770,T_6561_775,T_2187_260,T_6561_785,T_6561_790,T_2187_265,T_6561_800,T_6561_805,T_81_10,T_6561_815,T_6561_820,T_2187_275,T_6561_830,T_6561_835,T_2187_280,T_6561_845,T_6561_850,T_729_95,T_6561_860,T_6561_865,T_2187_290,T_6561_875,T_6561_880,T_2187_295,T_6561_890,T_6561_895,T_729_100,T_6561_905,T_6561_910,T_2187_305,T_6561_920,T_6561_925,T_2187_310,T_6561_935,T_6561_940,T_243_35,T_6561_950,T_6561_955,T_2187_320,T_6561_965,T_6561_970,T_2187_325,T_6561_980,T_6561_985,T_729_110,T_6561_995,T_6561_1000,T_2187_335,T_6561_1010,T_6561_1015,T_2187_340,T_6561_1025,T_6561_1030,T_729_115,T_6561_1040,T_6561_1045,T_2187_350,T_6561_1055,T_6561_1060,T_2187_355,T_6561_1070,T_6561_1075,T_243_40,T_6561_1085,T_6561_1090,T_2187_365,T_6561_1100,T_6561_1105,T_2187_370,T_6561_1115,T_6561_1120,T_729_125,T_6561_1130,T_6561_1135,T_2187_380,T_6561_1145,T_6561_1150,T_2187_385,T_6561_1160,T_6561_1165,T_729_130,T_6561_1175,T_6561_1180,T_2187_395,T_6561_1190,T_6561_1195,T_2187_400,T_6561_1205,T_6561_1210,T_27_5,T_6561_1220,T_6561_1225,T_2187_410,T_6561_1235,T_6561_1240,T_2187_415,T_6561_1250,T_6561_1255,T_729_140,T_6561_1265,T_6561_1270,T_2187_425,T_6561_1280,T_6561_1285,T_2187_430,T_6561_1295,T_6561_1300,T_729_145,T_6561_1310,T_6561_1315,T_2187_440,T_6561_1325,T_6561_1330,T_2187_445,T_6561_1340,T_6561_1345,T_243_50,T_6561_1355,T_6561_1360,T_2187_455,T_6561_1370,T_6561_1375,T_2187_460,T_6561_1385,T_6561_1390,T_729_155,T_6561_1400,T_6561_1405,T_2187_470,T_6561_1415,T_6561_1420,T_2187_475,T_6561_1430,T_6561_1435,T_729_160,T_6561_1445,T_6561_1450,T_2187_485,T_6561_1460,T_6561_1465,T_2187_490,T_6561_1475,T_6561_1480,T_243_55,T_6561_1490,T_6561_1495,T_2187_500,T_6561_1505,T_6561_1510,T_2187_505,T_6561_1520,T_6561_1525,T_729_170,T_6561_1535,T_6561_1540,T_2187_515,T_6561_1550,T_6561_1555,T_2187_520,T_6561_1565,T_6561_1570,T_729_175,T_6561_1580,T_6561_1585,T_2187_530,T_6561_1595,T_6561_1600,T_2187_535,T_6561_1610,T_6561_1615,T_81_20,T_6561_1625,T_6561_1630,T_2187_545,T_6561_164
0,T_6561_1645,T_2187_550,T_6561_1655,T_6561_1660,T_729_185,T_6561_1670,T_6561_1675,T_2187_560,T_6561_1685,T_6561_1690,T_2187_565,T_6561_1700,T_6561_1705,T_729_190,T_6561_1715,T_6561_1720,T_2187_575,T_6561_1730,T_6561_1735,T_2187_580,T_6561_1745,T_6561_1750,T_243_65,T_6561_1760,T_6561_1765,T_2187_590,T_6561_1775,T_6561_1780,T_2187_595,T_6561_1790,T_6561_1795,T_729_200,T_6561_1805,T_6561_1810,T_2187_605,T_6561_1820,T_6561_1825,T_2187_610,T_6561_1835,T_6561_1840,T_729_205,T_6561_1850,T_6561_1855,T_2187_620,T_6561_1865,T_6561_1870,T_2187_625,T_6561_1880,T_6561_1885,T_243_70,T_6561_1895,T_6561_1900,T_2187_635,T_6561_1910,T_6561_1915,T_2187_640,T_6561_1925,T_6561_1930,T_729_215,T_6561_1940,T_6561_1945,T_2187_650,T_6561_1955,T_6561_1960,T_2187_655,T_6561_1970,T_6561_1975,T_729_220,T_6561_1985,T_6561_1990,T_2187_665,T_6561_2000,T_6561_2005,T_2187_670,T_6561_2015,T_6561_2020,T_81_25,T_6561_2030,T_6561_2035,T_2187_680,T_6561_2045,T_6561_2050,T_2187_685,T_6561_2060,T_6561_2065,T_729_230,T_6561_2075,T_6561_2080,T_2187_695,T_6561_2090,T_6561_2095,T_2187_700,T_6561_2105,T_6561_2110,T_729_235,T_6561_2120,T_6561_2125,T_2187_710,T_6561_2135,T_6561_2140,T_2187_715,T_6561_2150,T_6561_2155,T_243_80,T_6561_2165,T_6561_2170,T_2187_725,T_6561_2180,T_6561_2185,T_2187_730,T_6561_2195,T_6561_2200,T_729_245,T_6561_2210,T_6561_2215,T_2187_740,T_6561_2225,T_6561_2230,T_2187_745,T_6561_2240,T_6561_2245,T_729_250,T_6561_2255,T_6561_2260,T_2187_755,T_6561_2270,T_6561_2275,T_2187_760,T_6561_2285,T_6561_2290,T_243_85,T_6561_2300,T_6561_2305,T_2187_770,T_6561_2315,T_6561_2320,T_2187_775,T_6561_2330,T_6561_2335,T_729_260,T_6561_2345,T_6561_2350,T_2187_785,T_6561_2360,T_6561_2365,T_2187_790,T_6561_2375,T_6561_2380,T_729_265,T_6561_2390,T_6561_2395,T_2187_800,T_6561_2405,T_6561_2410,T_2187_805,T_6561_2420,T_6561_2425,T_27_10,T_6561_2435,T_6561_2440,T_2187_815,T_6561_2450,T_6561_2455,T_2187_820,T_6561_2465,T_6561_2470,T_729_275,T_6561_2480,T_6561_2485,T_2187_830,T_6561_2495,T_6561_2500,T_2187_835,T_6561_
2510,T_6561_2515,T_729_280,T_6561_2525,T_6561_2530,T_2187_845,T_6561_2540,T_6561_2545,T_2187_850,T_6561_2555,T_6561_2560,T_243_95,T_6561_2570,T_6561_2575,T_2187_860,T_6561_2585,T_6561_2590,T_2187_865,T_6561_2600,T_6561_2605,T_729_290,T_6561_2615,T_6561_2620,T_2187_875,T_6561_2630,T_6561_2635,T_2187_880,T_6561_2645,T_6561_2650,T_729_295,T_6561_2660,T_6561_2665,T_2187_890,T_6561_2675,T_6561_2680,T_2187_895,T_6561_2690,T_6561_2695,T_243_100,T_6561_2705,T_6561_2710,T_2187_905,T_6561_2720,T_6561_2725,T_2187_910,T_6561_2735,T_6561_2740,T_729_305,T_6561_2750,T_6561_2755,T_2187_920,T_6561_2765,T_6561_2770,T_2187_925,T_6561_2780,T_6561_2785,T_729_310,T_6561_2795,T_6561_2800,T_2187_935,T_6561_2810,T_6561_2815,T_2187_940,T_6561_2825,T_6561_2830,T_81_35,T_6561_2840,T_6561_2845,T_2187_950,T_6561_2855,T_6561_2860,T_2187_955,T_6561_2870,T_6561_2875,T_729_320,T_6561_2885,T_6561_2890,T_2187_965,T_6561_2900,T_6561_2905,T_2187_970,T_6561_2915,T_6561_2920,T_729_325,T_6561_2930,T_6561_2935,T_2187_980,T_6561_2945,T_6561_2950,T_2187_985,T_6561_2960,T_6561_2965,T_243_110,T_6561_2975,T_6561_2980,T_2187_995,T_6561_2990,T_6561_2995,T_2187_1000,T_6561_3005,T_6561_3010,T_729_335,T_6561_3020,T_6561_3025,T_2187_1010,T_6561_3035,T_6561_3040,T_2187_1015,T_6561_3050,T_6561_3055,T_729_340,T_6561_3065,T_6561_3070,T_2187_1025,T_6561_3080,T_6561_3085,T_2187_1030,T_6561_3095,T_6561_3100,T_243_115,T_6561_3110,T_6561_3115,T_2187_1040,T_6561_3125,T_6561_3130,T_2187_1045,T_6561_3140,T_6561_3145,T_729_350,T_6561_3155,T_6561_3160,T_2187_1055,T_6561_3170,T_6561_3175,T_2187_1060,T_6561_3185,T_6561_3190,T_729_355,T_6561_3200,T_6561_3205,T_2187_1070,T_6561_3215,T_6561_3220,T_2187_1075,T_6561_3230,T_6561_3235,T_81_40,T_6561_3245,T_6561_3250,T_2187_1085,T_6561_3260,T_6561_3265,T_2187_1090,T_6561_3275,T_6561_3280,T_729_365,T_6561_3290,T_6561_3295,T_2187_1100,T_6561_3305,T_6561_3310,T_2187_1105,T_6561_3320,T_6561_3325,T_729_370,T_6561_3335,T_6561_3340,T_2187_1115,T_6561_3350,T_6561_3355,T_2187_1120,T_6561_3365,T_6561_
3370,T_243_125,T_6561_3380,T_6561_3385,T_2187_1130,T_6561_3395,T_6561_3400,T_2187_1135,T_6561_3410,T_6561_3415,T_729_380,T_6561_3425,T_6561_3430,T_2187_1145,T_6561_3440,T_6561_3445,T_2187_1150,T_6561_3455,T_6561_3460,T_729_385,T_6561_3470,T_6561_3475,T_2187_1160,T_6561_3485,T_6561_3490,T_2187_1165,T_6561_3500,T_6561_3505,T_243_130,T_6561_3515,T_6561_3520,T_2187_1175,T_6561_3530,T_6561_3535,T_2187_1180,T_6561_3545,T_6561_3550,T_729_395,T_6561_3560,T_6561_3565,T_2187_1190,T_6561_3575,T_6561_3580,T_2187_1195,T_6561_3590,T_6561_3595,T_729_400,T_6561_3605,T_6561_3610,T_2187_1205,T_6561_3620,T_6561_3625,T_2187_1210,T_6561_3635,T_6561_3640 +}; +static const __device__ float2 lut_sp_27_6561[243*2] = { + T_2_0,T_6561_1,T_6561_2,T_2187_1,T_6561_4,T_6561_5,T_2187_2,T_6561_7,T_6561_8,T_729_1,T_6561_10,T_6561_11,T_2187_4,T_6561_13,T_6561_14,T_2187_5,T_6561_16,T_6561_17,T_729_2,T_6561_19,T_6561_20,T_2187_7,T_6561_22,T_6561_23,T_2187_8,T_6561_25,T_6561_26,T_243_1,T_6561_28,T_6561_29,T_2187_10,T_6561_31,T_6561_32,T_2187_11,T_6561_34,T_6561_35,T_729_4,T_6561_37,T_6561_38,T_2187_13,T_6561_40,T_6561_41,T_2187_14,T_6561_43,T_6561_44,T_729_5,T_6561_46,T_6561_47,T_2187_16,T_6561_49,T_6561_50,T_2187_17,T_6561_52,T_6561_53,T_243_2,T_6561_55,T_6561_56,T_2187_19,T_6561_58,T_6561_59,T_2187_20,T_6561_61,T_6561_62,T_729_7,T_6561_64,T_6561_65,T_2187_22,T_6561_67,T_6561_68,T_2187_23,T_6561_70,T_6561_71,T_729_8,T_6561_73,T_6561_74,T_2187_25,T_6561_76,T_6561_77,T_2187_26,T_6561_79,T_6561_80,T_81_1,T_6561_82,T_6561_83,T_2187_28,T_6561_85,T_6561_86,T_2187_29,T_6561_88,T_6561_89,T_729_10,T_6561_91,T_6561_92,T_2187_31,T_6561_94,T_6561_95,T_2187_32,T_6561_97,T_6561_98,T_729_11,T_6561_100,T_6561_101,T_2187_34,T_6561_103,T_6561_104,T_2187_35,T_6561_106,T_6561_107,T_243_4,T_6561_109,T_6561_110,T_2187_37,T_6561_112,T_6561_113,T_2187_38,T_6561_115,T_6561_116,T_729_13,T_6561_118,T_6561_119,T_2187_40,T_6561_121,T_6561_122,T_2187_41,T_6561_124,T_6561_125,T_729_14,T_6561_127,T_6561_128,T_2187_43,T_6561_130,T_656
1_131,T_2187_44,T_6561_133,T_6561_134,T_243_5,T_6561_136,T_6561_137,T_2187_46,T_6561_139,T_6561_140,T_2187_47,T_6561_142,T_6561_143,T_729_16,T_6561_145,T_6561_146,T_2187_49,T_6561_148,T_6561_149,T_2187_50,T_6561_151,T_6561_152,T_729_17,T_6561_154,T_6561_155,T_2187_52,T_6561_157,T_6561_158,T_2187_53,T_6561_160,T_6561_161,T_81_2,T_6561_163,T_6561_164,T_2187_55,T_6561_166,T_6561_167,T_2187_56,T_6561_169,T_6561_170,T_729_19,T_6561_172,T_6561_173,T_2187_58,T_6561_175,T_6561_176,T_2187_59,T_6561_178,T_6561_179,T_729_20,T_6561_181,T_6561_182,T_2187_61,T_6561_184,T_6561_185,T_2187_62,T_6561_187,T_6561_188,T_243_7,T_6561_190,T_6561_191,T_2187_64,T_6561_193,T_6561_194,T_2187_65,T_6561_196,T_6561_197,T_729_22,T_6561_199,T_6561_200,T_2187_67,T_6561_202,T_6561_203,T_2187_68,T_6561_205,T_6561_206,T_729_23,T_6561_208,T_6561_209,T_2187_70,T_6561_211,T_6561_212,T_2187_71,T_6561_214,T_6561_215,T_243_8,T_6561_217,T_6561_218,T_2187_73,T_6561_220,T_6561_221,T_2187_74,T_6561_223,T_6561_224,T_729_25,T_6561_226,T_6561_227,T_2187_76,T_6561_229,T_6561_230,T_2187_77,T_6561_232,T_6561_233,T_729_26,T_6561_235,T_6561_236,T_2187_79,T_6561_238,T_6561_239,T_2187_80,T_6561_241,T_6561_242,T_2_0,T_6561_14,T_6561_28,T_2187_14,T_6561_56,T_6561_70,T_2187_28,T_6561_98,T_6561_112,T_729_14,T_6561_140,T_6561_154,T_2187_56,T_6561_182,T_6561_196,T_2187_70,T_6561_224,T_6561_238,T_729_28,T_6561_266,T_6561_280,T_2187_98,T_6561_308,T_6561_322,T_2187_112,T_6561_350,T_6561_364,T_243_14,T_6561_392,T_6561_406,T_2187_140,T_6561_434,T_6561_448,T_2187_154,T_6561_476,T_6561_490,T_729_56,T_6561_518,T_6561_532,T_2187_182,T_6561_560,T_6561_574,T_2187_196,T_6561_602,T_6561_616,T_729_70,T_6561_644,T_6561_658,T_2187_224,T_6561_686,T_6561_700,T_2187_238,T_6561_728,T_6561_742,T_243_28,T_6561_770,T_6561_784,T_2187_266,T_6561_812,T_6561_826,T_2187_280,T_6561_854,T_6561_868,T_729_98,T_6561_896,T_6561_910,T_2187_308,T_6561_938,T_6561_952,T_2187_322,T_6561_980,T_6561_994,T_729_112,T_6561_1022,T_6561_1036,T_2187_350,T_6561_1064,T_6561_
1078,T_2187_364,T_6561_1106,T_6561_1120,T_81_14,T_6561_1148,T_6561_1162,T_2187_392,T_6561_1190,T_6561_1204,T_2187_406,T_6561_1232,T_6561_1246,T_729_140,T_6561_1274,T_6561_1288,T_2187_434,T_6561_1316,T_6561_1330,T_2187_448,T_6561_1358,T_6561_1372,T_729_154,T_6561_1400,T_6561_1414,T_2187_476,T_6561_1442,T_6561_1456,T_2187_490,T_6561_1484,T_6561_1498,T_243_56,T_6561_1526,T_6561_1540,T_2187_518,T_6561_1568,T_6561_1582,T_2187_532,T_6561_1610,T_6561_1624,T_729_182,T_6561_1652,T_6561_1666,T_2187_560,T_6561_1694,T_6561_1708,T_2187_574,T_6561_1736,T_6561_1750,T_729_196,T_6561_1778,T_6561_1792,T_2187_602,T_6561_1820,T_6561_1834,T_2187_616,T_6561_1862,T_6561_1876,T_243_70,T_6561_1904,T_6561_1918,T_2187_644,T_6561_1946,T_6561_1960,T_2187_658,T_6561_1988,T_6561_2002,T_729_224,T_6561_2030,T_6561_2044,T_2187_686,T_6561_2072,T_6561_2086,T_2187_700,T_6561_2114,T_6561_2128,T_729_238,T_6561_2156,T_6561_2170,T_2187_728,T_6561_2198,T_6561_2212,T_2187_742,T_6561_2240,T_6561_2254,T_81_28,T_6561_2282,T_6561_2296,T_2187_770,T_6561_2324,T_6561_2338,T_2187_784,T_6561_2366,T_6561_2380,T_729_266,T_6561_2408,T_6561_2422,T_2187_812,T_6561_2450,T_6561_2464,T_2187_826,T_6561_2492,T_6561_2506,T_729_280,T_6561_2534,T_6561_2548,T_2187_854,T_6561_2576,T_6561_2590,T_2187_868,T_6561_2618,T_6561_2632,T_243_98,T_6561_2660,T_6561_2674,T_2187_896,T_6561_2702,T_6561_2716,T_2187_910,T_6561_2744,T_6561_2758,T_729_308,T_6561_2786,T_6561_2800,T_2187_938,T_6561_2828,T_6561_2842,T_2187_952,T_6561_2870,T_6561_2884,T_729_322,T_6561_2912,T_6561_2926,T_2187_980,T_6561_2954,T_6561_2968,T_2187_994,T_6561_2996,T_6561_3010,T_243_112,T_6561_3038,T_6561_3052,T_2187_1022,T_6561_3080,T_6561_3094,T_2187_1036,T_6561_3122,T_6561_3136,T_729_350,T_6561_3164,T_6561_3178,T_2187_1064,T_6561_3206,T_6561_3220,T_2187_1078,T_6561_3248,T_6561_3262,T_729_364,T_6561_3290,T_6561_3304,T_2187_1106,T_6561_3332,T_6561_3346,T_2187_1120,T_6561_3374,T_6561_3388 +}; +static const __device__ float2 lut_sp_8_8192[1024*2] = { + 
T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_1024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_19
9,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_32_1,T_8192_257,T_4096_129,T_8192_259,T_2048_65,T_8192_261,T_4096_131,T_8192_263,T_1024_33,T_8192_265,T_4096_133,T_8192_267,T_2048_67,T_8192_269,T_4096_135,T_8192_271,T_512_17,T_8192_273,T_4096_137,T_8192_275,T_2048_69,T_8192_277,T_4096_139,T_8192_279,T_1024_35,T_8192_281,T_4096_141,T_8192_283,T_2048_71,T_8192_285,T_4096_143,T_8192_287,T_256_9,T_8192_289,T_4096_145,T_8192_291,T_2048_73,T_8192_293,T_4096_147,T_8192_295,T_1024_37,T_8192_297,T_4096_149,T_8192_299,T_2048_75,T_8192_301,T_4096_151,T_8192_303,T_512_19,T_8192_305,T_4096_153,T_8192_307,T_2048_77,T_8192_309,T_4096_155,T_8192_311,T_1024_39,T_8192_313,T_4096_157,T_8192_315,T_2048_79,T_8192_317,T_4096_159,T_8192_319,T_128_5,T_8192_321,T_4096_161,T_8192_323,T_2048_81,T_8192_325,T_4096_163,T_8192_327,T_1024_41,T_8192_329,T_4096_165,T_8192_331,T_2048_83,T_8192_333,T_4096_167,T_8192_335,T_512_21,T_8192_337,T_4096_169,T_8192_339,T_2048_85,T_8192_341,T_4096_171,T_8192_343,T_1024_43,T_8192_345,T_4096_173,T_8192_347,T_2048_87,T_8192_349,T_4096_175,T_8192_351,T_256_11,T_8192_353,T_4096_177,T_8192_355,T_2048_89,T_8192_357,T_4096_179,T_8192_359,T_1024_45,T_8192_361,T_4096_181,T_8192_363,T_2048_91,T_8192_365,T_4096_183,T_8192_367,T_512_23,T_8192_369,T_4096_185,T_8192_371,T_2048_93,T_8192_373,T_4096_187,T_8192_375,T_1024_47,T_8192_377,T_4096_189,T_8192_379,T_2048_95,T_8192_381,T_4096_191,T_8192_383,T_64_3,T_8192_385,T_4096_193,T_8192_
387,T_2048_97,T_8192_389,T_4096_195,T_8192_391,T_1024_49,T_8192_393,T_4096_197,T_8192_395,T_2048_99,T_8192_397,T_4096_199,T_8192_399,T_512_25,T_8192_401,T_4096_201,T_8192_403,T_2048_101,T_8192_405,T_4096_203,T_8192_407,T_1024_51,T_8192_409,T_4096_205,T_8192_411,T_2048_103,T_8192_413,T_4096_207,T_8192_415,T_256_13,T_8192_417,T_4096_209,T_8192_419,T_2048_105,T_8192_421,T_4096_211,T_8192_423,T_1024_53,T_8192_425,T_4096_213,T_8192_427,T_2048_107,T_8192_429,T_4096_215,T_8192_431,T_512_27,T_8192_433,T_4096_217,T_8192_435,T_2048_109,T_8192_437,T_4096_219,T_8192_439,T_1024_55,T_8192_441,T_4096_221,T_8192_443,T_2048_111,T_8192_445,T_4096_223,T_8192_447,T_128_7,T_8192_449,T_4096_225,T_8192_451,T_2048_113,T_8192_453,T_4096_227,T_8192_455,T_1024_57,T_8192_457,T_4096_229,T_8192_459,T_2048_115,T_8192_461,T_4096_231,T_8192_463,T_512_29,T_8192_465,T_4096_233,T_8192_467,T_2048_117,T_8192_469,T_4096_235,T_8192_471,T_1024_59,T_8192_473,T_4096_237,T_8192_475,T_2048_119,T_8192_477,T_4096_239,T_8192_479,T_256_15,T_8192_481,T_4096_241,T_8192_483,T_2048_121,T_8192_485,T_4096_243,T_8192_487,T_1024_61,T_8192_489,T_4096_245,T_8192_491,T_2048_123,T_8192_493,T_4096_247,T_8192_495,T_512_31,T_8192_497,T_4096_249,T_8192_499,T_2048_125,T_8192_501,T_4096_251,T_8192_503,T_1024_63,T_8192_505,T_4096_253,T_8192_507,T_2048_127,T_8192_509,T_4096_255,T_8192_511,T_16_1,T_8192_513,T_4096_257,T_8192_515,T_2048_129,T_8192_517,T_4096_259,T_8192_519,T_1024_65,T_8192_521,T_4096_261,T_8192_523,T_2048_131,T_8192_525,T_4096_263,T_8192_527,T_512_33,T_8192_529,T_4096_265,T_8192_531,T_2048_133,T_8192_533,T_4096_267,T_8192_535,T_1024_67,T_8192_537,T_4096_269,T_8192_539,T_2048_135,T_8192_541,T_4096_271,T_8192_543,T_256_17,T_8192_545,T_4096_273,T_8192_547,T_2048_137,T_8192_549,T_4096_275,T_8192_551,T_1024_69,T_8192_553,T_4096_277,T_8192_555,T_2048_139,T_8192_557,T_4096_279,T_8192_559,T_512_35,T_8192_561,T_4096_281,T_8192_563,T_2048_141,T_8192_565,T_4096_283,T_8192_567,T_1024_71,T_8192_569,T_4096_285,T_8192_571,T_2048_143,
T_8192_573,T_4096_287,T_8192_575,T_128_9,T_8192_577,T_4096_289,T_8192_579,T_2048_145,T_8192_581,T_4096_291,T_8192_583,T_1024_73,T_8192_585,T_4096_293,T_8192_587,T_2048_147,T_8192_589,T_4096_295,T_8192_591,T_512_37,T_8192_593,T_4096_297,T_8192_595,T_2048_149,T_8192_597,T_4096_299,T_8192_599,T_1024_75,T_8192_601,T_4096_301,T_8192_603,T_2048_151,T_8192_605,T_4096_303,T_8192_607,T_256_19,T_8192_609,T_4096_305,T_8192_611,T_2048_153,T_8192_613,T_4096_307,T_8192_615,T_1024_77,T_8192_617,T_4096_309,T_8192_619,T_2048_155,T_8192_621,T_4096_311,T_8192_623,T_512_39,T_8192_625,T_4096_313,T_8192_627,T_2048_157,T_8192_629,T_4096_315,T_8192_631,T_1024_79,T_8192_633,T_4096_317,T_8192_635,T_2048_159,T_8192_637,T_4096_319,T_8192_639,T_64_5,T_8192_641,T_4096_321,T_8192_643,T_2048_161,T_8192_645,T_4096_323,T_8192_647,T_1024_81,T_8192_649,T_4096_325,T_8192_651,T_2048_163,T_8192_653,T_4096_327,T_8192_655,T_512_41,T_8192_657,T_4096_329,T_8192_659,T_2048_165,T_8192_661,T_4096_331,T_8192_663,T_1024_83,T_8192_665,T_4096_333,T_8192_667,T_2048_167,T_8192_669,T_4096_335,T_8192_671,T_256_21,T_8192_673,T_4096_337,T_8192_675,T_2048_169,T_8192_677,T_4096_339,T_8192_679,T_1024_85,T_8192_681,T_4096_341,T_8192_683,T_2048_171,T_8192_685,T_4096_343,T_8192_687,T_512_43,T_8192_689,T_4096_345,T_8192_691,T_2048_173,T_8192_693,T_4096_347,T_8192_695,T_1024_87,T_8192_697,T_4096_349,T_8192_699,T_2048_175,T_8192_701,T_4096_351,T_8192_703,T_128_11,T_8192_705,T_4096_353,T_8192_707,T_2048_177,T_8192_709,T_4096_355,T_8192_711,T_1024_89,T_8192_713,T_4096_357,T_8192_715,T_2048_179,T_8192_717,T_4096_359,T_8192_719,T_512_45,T_8192_721,T_4096_361,T_8192_723,T_2048_181,T_8192_725,T_4096_363,T_8192_727,T_1024_91,T_8192_729,T_4096_365,T_8192_731,T_2048_183,T_8192_733,T_4096_367,T_8192_735,T_256_23,T_8192_737,T_4096_369,T_8192_739,T_2048_185,T_8192_741,T_4096_371,T_8192_743,T_1024_93,T_8192_745,T_4096_373,T_8192_747,T_2048_187,T_8192_749,T_4096_375,T_8192_751,T_512_47,T_8192_753,T_4096_377,T_8192_755,T_2048_189,T_8192_757,T_4
096_379,T_8192_759,T_1024_95,T_8192_761,T_4096_381,T_8192_763,T_2048_191,T_8192_765,T_4096_383,T_8192_767,T_32_3,T_8192_769,T_4096_385,T_8192_771,T_2048_193,T_8192_773,T_4096_387,T_8192_775,T_1024_97,T_8192_777,T_4096_389,T_8192_779,T_2048_195,T_8192_781,T_4096_391,T_8192_783,T_512_49,T_8192_785,T_4096_393,T_8192_787,T_2048_197,T_8192_789,T_4096_395,T_8192_791,T_1024_99,T_8192_793,T_4096_397,T_8192_795,T_2048_199,T_8192_797,T_4096_399,T_8192_799,T_256_25,T_8192_801,T_4096_401,T_8192_803,T_2048_201,T_8192_805,T_4096_403,T_8192_807,T_1024_101,T_8192_809,T_4096_405,T_8192_811,T_2048_203,T_8192_813,T_4096_407,T_8192_815,T_512_51,T_8192_817,T_4096_409,T_8192_819,T_2048_205,T_8192_821,T_4096_411,T_8192_823,T_1024_103,T_8192_825,T_4096_413,T_8192_827,T_2048_207,T_8192_829,T_4096_415,T_8192_831,T_128_13,T_8192_833,T_4096_417,T_8192_835,T_2048_209,T_8192_837,T_4096_419,T_8192_839,T_1024_105,T_8192_841,T_4096_421,T_8192_843,T_2048_211,T_8192_845,T_4096_423,T_8192_847,T_512_53,T_8192_849,T_4096_425,T_8192_851,T_2048_213,T_8192_853,T_4096_427,T_8192_855,T_1024_107,T_8192_857,T_4096_429,T_8192_859,T_2048_215,T_8192_861,T_4096_431,T_8192_863,T_256_27,T_8192_865,T_4096_433,T_8192_867,T_2048_217,T_8192_869,T_4096_435,T_8192_871,T_1024_109,T_8192_873,T_4096_437,T_8192_875,T_2048_219,T_8192_877,T_4096_439,T_8192_879,T_512_55,T_8192_881,T_4096_441,T_8192_883,T_2048_221,T_8192_885,T_4096_443,T_8192_887,T_1024_111,T_8192_889,T_4096_445,T_8192_891,T_2048_223,T_8192_893,T_4096_447,T_8192_895,T_64_7,T_8192_897,T_4096_449,T_8192_899,T_2048_225,T_8192_901,T_4096_451,T_8192_903,T_1024_113,T_8192_905,T_4096_453,T_8192_907,T_2048_227,T_8192_909,T_4096_455,T_8192_911,T_512_57,T_8192_913,T_4096_457,T_8192_915,T_2048_229,T_8192_917,T_4096_459,T_8192_919,T_1024_115,T_8192_921,T_4096_461,T_8192_923,T_2048_231,T_8192_925,T_4096_463,T_8192_927,T_256_29,T_8192_929,T_4096_465,T_8192_931,T_2048_233,T_8192_933,T_4096_467,T_8192_935,T_1024_117,T_8192_937,T_4096_469,T_8192_939,T_2048_235,T_8192_941,T_4096_4
71,T_8192_943,T_512_59,T_8192_945,T_4096_473,T_8192_947,T_2048_237,T_8192_949,T_4096_475,T_8192_951,T_1024_119,T_8192_953,T_4096_477,T_8192_955,T_2048_239,T_8192_957,T_4096_479,T_8192_959,T_128_15,T_8192_961,T_4096_481,T_8192_963,T_2048_241,T_8192_965,T_4096_483,T_8192_967,T_1024_121,T_8192_969,T_4096_485,T_8192_971,T_2048_243,T_8192_973,T_4096_487,T_8192_975,T_512_61,T_8192_977,T_4096_489,T_8192_979,T_2048_245,T_8192_981,T_4096_491,T_8192_983,T_1024_123,T_8192_985,T_4096_493,T_8192_987,T_2048_247,T_8192_989,T_4096_495,T_8192_991,T_256_31,T_8192_993,T_4096_497,T_8192_995,T_2048_249,T_8192_997,T_4096_499,T_8192_999,T_1024_125,T_8192_1001,T_4096_501,T_8192_1003,T_2048_251,T_8192_1005,T_4096_503,T_8192_1007,T_512_63,T_8192_1009,T_4096_505,T_8192_1011,T_2048_253,T_8192_1013,T_4096_507,T_8192_1015,T_1024_127,T_8192_1017,T_4096_509,T_8192_1019,T_2048_255,T_8192_1021,T_4096_511,T_8192_1023,T_2_0,T_8192_5,T_4096_5,T_8192_15,T_2048_5,T_8192_25,T_4096_15,T_8192_35,T_1024_5,T_8192_45,T_4096_25,T_8192_55,T_2048_15,T_8192_65,T_4096_35,T_8192_75,T_512_5,T_8192_85,T_4096_45,T_8192_95,T_2048_25,T_8192_105,T_4096_55,T_8192_115,T_1024_15,T_8192_125,T_4096_65,T_8192_135,T_2048_35,T_8192_145,T_4096_75,T_8192_155,T_256_5,T_8192_165,T_4096_85,T_8192_175,T_2048_45,T_8192_185,T_4096_95,T_8192_195,T_1024_25,T_8192_205,T_4096_105,T_8192_215,T_2048_55,T_8192_225,T_4096_115,T_8192_235,T_512_15,T_8192_245,T_4096_125,T_8192_255,T_2048_65,T_8192_265,T_4096_135,T_8192_275,T_1024_35,T_8192_285,T_4096_145,T_8192_295,T_2048_75,T_8192_305,T_4096_155,T_8192_315,T_128_5,T_8192_325,T_4096_165,T_8192_335,T_2048_85,T_8192_345,T_4096_175,T_8192_355,T_1024_45,T_8192_365,T_4096_185,T_8192_375,T_2048_95,T_8192_385,T_4096_195,T_8192_395,T_512_25,T_8192_405,T_4096_205,T_8192_415,T_2048_105,T_8192_425,T_4096_215,T_8192_435,T_1024_55,T_8192_445,T_4096_225,T_8192_455,T_2048_115,T_8192_465,T_4096_235,T_8192_475,T_256_15,T_8192_485,T_4096_245,T_8192_495,T_2048_125,T_8192_505,T_4096_255,T_8192_515,T_1024_65,T_8192_525
,T_4096_265,T_8192_535,T_2048_135,T_8192_545,T_4096_275,T_8192_555,T_512_35,T_8192_565,T_4096_285,T_8192_575,T_2048_145,T_8192_585,T_4096_295,T_8192_595,T_1024_75,T_8192_605,T_4096_305,T_8192_615,T_2048_155,T_8192_625,T_4096_315,T_8192_635,T_64_5,T_8192_645,T_4096_325,T_8192_655,T_2048_165,T_8192_665,T_4096_335,T_8192_675,T_1024_85,T_8192_685,T_4096_345,T_8192_695,T_2048_175,T_8192_705,T_4096_355,T_8192_715,T_512_45,T_8192_725,T_4096_365,T_8192_735,T_2048_185,T_8192_745,T_4096_375,T_8192_755,T_1024_95,T_8192_765,T_4096_385,T_8192_775,T_2048_195,T_8192_785,T_4096_395,T_8192_795,T_256_25,T_8192_805,T_4096_405,T_8192_815,T_2048_205,T_8192_825,T_4096_415,T_8192_835,T_1024_105,T_8192_845,T_4096_425,T_8192_855,T_2048_215,T_8192_865,T_4096_435,T_8192_875,T_512_55,T_8192_885,T_4096_445,T_8192_895,T_2048_225,T_8192_905,T_4096_455,T_8192_915,T_1024_115,T_8192_925,T_4096_465,T_8192_935,T_2048_235,T_8192_945,T_4096_475,T_8192_955,T_128_15,T_8192_965,T_4096_485,T_8192_975,T_2048_245,T_8192_985,T_4096_495,T_8192_995,T_1024_125,T_8192_1005,T_4096_505,T_8192_1015,T_2048_255,T_8192_1025,T_4096_515,T_8192_1035,T_512_65,T_8192_1045,T_4096_525,T_8192_1055,T_2048_265,T_8192_1065,T_4096_535,T_8192_1075,T_1024_135,T_8192_1085,T_4096_545,T_8192_1095,T_2048_275,T_8192_1105,T_4096_555,T_8192_1115,T_256_35,T_8192_1125,T_4096_565,T_8192_1135,T_2048_285,T_8192_1145,T_4096_575,T_8192_1155,T_1024_145,T_8192_1165,T_4096_585,T_8192_1175,T_2048_295,T_8192_1185,T_4096_595,T_8192_1195,T_512_75,T_8192_1205,T_4096_605,T_8192_1215,T_2048_305,T_8192_1225,T_4096_615,T_8192_1235,T_1024_155,T_8192_1245,T_4096_625,T_8192_1255,T_2048_315,T_8192_1265,T_4096_635,T_8192_1275,T_32_5,T_8192_1285,T_4096_645,T_8192_1295,T_2048_325,T_8192_1305,T_4096_655,T_8192_1315,T_1024_165,T_8192_1325,T_4096_665,T_8192_1335,T_2048_335,T_8192_1345,T_4096_675,T_8192_1355,T_512_85,T_8192_1365,T_4096_685,T_8192_1375,T_2048_345,T_8192_1385,T_4096_695,T_8192_1395,T_1024_175,T_8192_1405,T_4096_705,T_8192_1415,T_2048_355,T_8192_1425,T_409
6_715,T_8192_1435,T_256_45,T_8192_1445,T_4096_725,T_8192_1455,T_2048_365,T_8192_1465,T_4096_735,T_8192_1475,T_1024_185,T_8192_1485,T_4096_745,T_8192_1495,T_2048_375,T_8192_1505,T_4096_755,T_8192_1515,T_512_95,T_8192_1525,T_4096_765,T_8192_1535,T_2048_385,T_8192_1545,T_4096_775,T_8192_1555,T_1024_195,T_8192_1565,T_4096_785,T_8192_1575,T_2048_395,T_8192_1585,T_4096_795,T_8192_1595,T_128_25,T_8192_1605,T_4096_805,T_8192_1615,T_2048_405,T_8192_1625,T_4096_815,T_8192_1635,T_1024_205,T_8192_1645,T_4096_825,T_8192_1655,T_2048_415,T_8192_1665,T_4096_835,T_8192_1675,T_512_105,T_8192_1685,T_4096_845,T_8192_1695,T_2048_425,T_8192_1705,T_4096_855,T_8192_1715,T_1024_215,T_8192_1725,T_4096_865,T_8192_1735,T_2048_435,T_8192_1745,T_4096_875,T_8192_1755,T_256_55,T_8192_1765,T_4096_885,T_8192_1775,T_2048_445,T_8192_1785,T_4096_895,T_8192_1795,T_1024_225,T_8192_1805,T_4096_905,T_8192_1815,T_2048_455,T_8192_1825,T_4096_915,T_8192_1835,T_512_115,T_8192_1845,T_4096_925,T_8192_1855,T_2048_465,T_8192_1865,T_4096_935,T_8192_1875,T_1024_235,T_8192_1885,T_4096_945,T_8192_1895,T_2048_475,T_8192_1905,T_4096_955,T_8192_1915,T_64_15,T_8192_1925,T_4096_965,T_8192_1935,T_2048_485,T_8192_1945,T_4096_975,T_8192_1955,T_1024_245,T_8192_1965,T_4096_985,T_8192_1975,T_2048_495,T_8192_1985,T_4096_995,T_8192_1995,T_512_125,T_8192_2005,T_4096_1005,T_8192_2015,T_2048_505,T_8192_2025,T_4096_1015,T_8192_2035,T_1024_255,T_8192_2045,T_4096_1025,T_8192_2055,T_2048_515,T_8192_2065,T_4096_1035,T_8192_2075,T_256_65,T_8192_2085,T_4096_1045,T_8192_2095,T_2048_525,T_8192_2105,T_4096_1055,T_8192_2115,T_1024_265,T_8192_2125,T_4096_1065,T_8192_2135,T_2048_535,T_8192_2145,T_4096_1075,T_8192_2155,T_512_135,T_8192_2165,T_4096_1085,T_8192_2175,T_2048_545,T_8192_2185,T_4096_1095,T_8192_2195,T_1024_275,T_8192_2205,T_4096_1105,T_8192_2215,T_2048_555,T_8192_2225,T_4096_1115,T_8192_2235,T_128_35,T_8192_2245,T_4096_1125,T_8192_2255,T_2048_565,T_8192_2265,T_4096_1135,T_8192_2275,T_1024_285,T_8192_2285,T_4096_1145,T_8192_2295,T_2048_5
75,T_8192_2305,T_4096_1155,T_8192_2315,T_512_145,T_8192_2325,T_4096_1165,T_8192_2335,T_2048_585,T_8192_2345,T_4096_1175,T_8192_2355,T_1024_295,T_8192_2365,T_4096_1185,T_8192_2375,T_2048_595,T_8192_2385,T_4096_1195,T_8192_2395,T_256_75,T_8192_2405,T_4096_1205,T_8192_2415,T_2048_605,T_8192_2425,T_4096_1215,T_8192_2435,T_1024_305,T_8192_2445,T_4096_1225,T_8192_2455,T_2048_615,T_8192_2465,T_4096_1235,T_8192_2475,T_512_155,T_8192_2485,T_4096_1245,T_8192_2495,T_2048_625,T_8192_2505,T_4096_1255,T_8192_2515,T_1024_315,T_8192_2525,T_4096_1265,T_8192_2535,T_2048_635,T_8192_2545,T_4096_1275,T_8192_2555,T_16_5,T_8192_2565,T_4096_1285,T_8192_2575,T_2048_645,T_8192_2585,T_4096_1295,T_8192_2595,T_1024_325,T_8192_2605,T_4096_1305,T_8192_2615,T_2048_655,T_8192_2625,T_4096_1315,T_8192_2635,T_512_165,T_8192_2645,T_4096_1325,T_8192_2655,T_2048_665,T_8192_2665,T_4096_1335,T_8192_2675,T_1024_335,T_8192_2685,T_4096_1345,T_8192_2695,T_2048_675,T_8192_2705,T_4096_1355,T_8192_2715,T_256_85,T_8192_2725,T_4096_1365,T_8192_2735,T_2048_685,T_8192_2745,T_4096_1375,T_8192_2755,T_1024_345,T_8192_2765,T_4096_1385,T_8192_2775,T_2048_695,T_8192_2785,T_4096_1395,T_8192_2795,T_512_175,T_8192_2805,T_4096_1405,T_8192_2815,T_2048_705,T_8192_2825,T_4096_1415,T_8192_2835,T_1024_355,T_8192_2845,T_4096_1425,T_8192_2855,T_2048_715,T_8192_2865,T_4096_1435,T_8192_2875,T_128_45,T_8192_2885,T_4096_1445,T_8192_2895,T_2048_725,T_8192_2905,T_4096_1455,T_8192_2915,T_1024_365,T_8192_2925,T_4096_1465,T_8192_2935,T_2048_735,T_8192_2945,T_4096_1475,T_8192_2955,T_512_185,T_8192_2965,T_4096_1485,T_8192_2975,T_2048_745,T_8192_2985,T_4096_1495,T_8192_2995,T_1024_375,T_8192_3005,T_4096_1505,T_8192_3015,T_2048_755,T_8192_3025,T_4096_1515,T_8192_3035,T_256_95,T_8192_3045,T_4096_1525,T_8192_3055,T_2048_765,T_8192_3065,T_4096_1535,T_8192_3075,T_1024_385,T_8192_3085,T_4096_1545,T_8192_3095,T_2048_775,T_8192_3105,T_4096_1555,T_8192_3115,T_512_195,T_8192_3125,T_4096_1565,T_8192_3135,T_2048_785,T_8192_3145,T_4096_1575,T_8192_3155,T_102
4_395,T_8192_3165,T_4096_1585,T_8192_3175,T_2048_795,T_8192_3185,T_4096_1595,T_8192_3195,T_64_25,T_8192_3205,T_4096_1605,T_8192_3215,T_2048_805,T_8192_3225,T_4096_1615,T_8192_3235,T_1024_405,T_8192_3245,T_4096_1625,T_8192_3255,T_2048_815,T_8192_3265,T_4096_1635,T_8192_3275,T_512_205,T_8192_3285,T_4096_1645,T_8192_3295,T_2048_825,T_8192_3305,T_4096_1655,T_8192_3315,T_1024_415,T_8192_3325,T_4096_1665,T_8192_3335,T_2048_835,T_8192_3345,T_4096_1675,T_8192_3355,T_256_105,T_8192_3365,T_4096_1685,T_8192_3375,T_2048_845,T_8192_3385,T_4096_1695,T_8192_3395,T_1024_425,T_8192_3405,T_4096_1705,T_8192_3415,T_2048_855,T_8192_3425,T_4096_1715,T_8192_3435,T_512_215,T_8192_3445,T_4096_1725,T_8192_3455,T_2048_865,T_8192_3465,T_4096_1735,T_8192_3475,T_1024_435,T_8192_3485,T_4096_1745,T_8192_3495,T_2048_875,T_8192_3505,T_4096_1755,T_8192_3515,T_128_55,T_8192_3525,T_4096_1765,T_8192_3535,T_2048_885,T_8192_3545,T_4096_1775,T_8192_3555,T_1024_445,T_8192_3565,T_4096_1785,T_8192_3575,T_2048_895,T_8192_3585,T_4096_1795,T_8192_3595,T_512_225,T_8192_3605,T_4096_1805,T_8192_3615,T_2048_905,T_8192_3625,T_4096_1815,T_8192_3635,T_1024_455,T_8192_3645,T_4096_1825,T_8192_3655,T_2048_915,T_8192_3665,T_4096_1835,T_8192_3675,T_256_115,T_8192_3685,T_4096_1845,T_8192_3695,T_2048_925,T_8192_3705,T_4096_1855,T_8192_3715,T_1024_465,T_8192_3725,T_4096_1865,T_8192_3735,T_2048_935,T_8192_3745,T_4096_1875,T_8192_3755,T_512_235,T_8192_3765,T_4096_1885,T_8192_3775,T_2048_945,T_8192_3785,T_4096_1895,T_8192_3795,T_1024_475,T_8192_3805,T_4096_1905,T_8192_3815,T_2048_955,T_8192_3825,T_4096_1915,T_8192_3835,T_32_15,T_8192_3845,T_4096_1925,T_8192_3855,T_2048_965,T_8192_3865,T_4096_1935,T_8192_3875,T_1024_485,T_8192_3885,T_4096_1945,T_8192_3895,T_2048_975,T_8192_3905,T_4096_1955,T_8192_3915,T_512_245,T_8192_3925,T_4096_1965,T_8192_3935,T_2048_985,T_8192_3945,T_4096_1975,T_8192_3955,T_1024_495,T_8192_3965,T_4096_1985,T_8192_3975,T_2048_995,T_8192_3985,T_4096_1995,T_8192_3995,T_256_125,T_8192_4005,T_4096_2005,T_8192_4015,
T_2048_1005,T_8192_4025,T_4096_2015,T_8192_4035,T_1024_505,T_8192_4045,T_4096_2025,T_8192_4055,T_2048_1015,T_8192_4065,T_4096_2035,T_8192_4075,T_512_255,T_8192_4085,T_4096_2045,T_8192_4095,T_2048_1025,T_8192_4105,T_4096_2055,T_8192_4115,T_1024_515,T_8192_4125,T_4096_2065,T_8192_4135,T_2048_1035,T_8192_4145,T_4096_2075,T_8192_4155,T_128_65,T_8192_4165,T_4096_2085,T_8192_4175,T_2048_1045,T_8192_4185,T_4096_2095,T_8192_4195,T_1024_525,T_8192_4205,T_4096_2105,T_8192_4215,T_2048_1055,T_8192_4225,T_4096_2115,T_8192_4235,T_512_265,T_8192_4245,T_4096_2125,T_8192_4255,T_2048_1065,T_8192_4265,T_4096_2135,T_8192_4275,T_1024_535,T_8192_4285,T_4096_2145,T_8192_4295,T_2048_1075,T_8192_4305,T_4096_2155,T_8192_4315,T_256_135,T_8192_4325,T_4096_2165,T_8192_4335,T_2048_1085,T_8192_4345,T_4096_2175,T_8192_4355,T_1024_545,T_8192_4365,T_4096_2185,T_8192_4375,T_2048_1095,T_8192_4385,T_4096_2195,T_8192_4395,T_512_275,T_8192_4405,T_4096_2205,T_8192_4415,T_2048_1105,T_8192_4425,T_4096_2215,T_8192_4435,T_1024_555,T_8192_4445,T_4096_2225,T_8192_4455,T_2048_1115,T_8192_4465,T_4096_2235,T_8192_4475,T_64_35,T_8192_4485,T_4096_2245,T_8192_4495,T_2048_1125,T_8192_4505,T_4096_2255,T_8192_4515,T_1024_565,T_8192_4525,T_4096_2265,T_8192_4535,T_2048_1135,T_8192_4545,T_4096_2275,T_8192_4555,T_512_285,T_8192_4565,T_4096_2285,T_8192_4575,T_2048_1145,T_8192_4585,T_4096_2295,T_8192_4595,T_1024_575,T_8192_4605,T_4096_2305,T_8192_4615,T_2048_1155,T_8192_4625,T_4096_2315,T_8192_4635,T_256_145,T_8192_4645,T_4096_2325,T_8192_4655,T_2048_1165,T_8192_4665,T_4096_2335,T_8192_4675,T_1024_585,T_8192_4685,T_4096_2345,T_8192_4695,T_2048_1175,T_8192_4705,T_4096_2355,T_8192_4715,T_512_295,T_8192_4725,T_4096_2365,T_8192_4735,T_2048_1185,T_8192_4745,T_4096_2375,T_8192_4755,T_1024_595,T_8192_4765,T_4096_2385,T_8192_4775,T_2048_1195,T_8192_4785,T_4096_2395,T_8192_4795,T_128_75,T_8192_4805,T_4096_2405,T_8192_4815,T_2048_1205,T_8192_4825,T_4096_2415,T_8192_4835,T_1024_605,T_8192_4845,T_4096_2425,T_8192_4855,T_2048_1215,T_8192_
4865,T_4096_2435,T_8192_4875,T_512_305,T_8192_4885,T_4096_2445,T_8192_4895,T_2048_1225,T_8192_4905,T_4096_2455,T_8192_4915,T_1024_615,T_8192_4925,T_4096_2465,T_8192_4935,T_2048_1235,T_8192_4945,T_4096_2475,T_8192_4955,T_256_155,T_8192_4965,T_4096_2485,T_8192_4975,T_2048_1245,T_8192_4985,T_4096_2495,T_8192_4995,T_1024_625,T_8192_5005,T_4096_2505,T_8192_5015,T_2048_1255,T_8192_5025,T_4096_2515,T_8192_5035,T_512_315,T_8192_5045,T_4096_2525,T_8192_5055,T_2048_1265,T_8192_5065,T_4096_2535,T_8192_5075,T_1024_635,T_8192_5085,T_4096_2545,T_8192_5095,T_2048_1275,T_8192_5105,T_4096_2555,T_8192_5115 +}; +static const __device__ float2 lut_sp_16_8192[512*2] = { + T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_1
024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_199,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_32_1,T_8192_257,T_4096_129,T_8192_259,T_2048_65,T_8192_261,T_4096_131,T_8192_263,T_1024_33,T_8192_265,T_4096_133,T_8192_267,T_2048_67,T_8192_269,T_4096_135,T_8192_271,T_512_17,T_8192_273,T_4096_137,T_8192_275,T_2048_69,T_8192_277,T_4096_139,T_8192_279,T_1024_35,T_8192_281,T_4096_141,T_8192_283,T_2048_71,T_8192_285,T_4096_143,T_8192_287,T_256_9,T_8192_289,T_4096_145,T_8192_291,T_2048_73,T_8192_293,T_4096_147,T_8192_295,T_1024_37,T_8192_297,T_4096_149,T_8192_299,T_2048_75,T_8192_301,T_4096_151,T_8192_303,T_512_19,T_8192_305,T_4096_153,T_8192_307,T_2048_77,T_8192_309,T_4096_155,T_8192_311,T_1024_39,T_8192_313,T_4096_157,T_8192_315,T_2048_79,T_8192_317,T_4096_159,T_8192_319,T_128_5,T_8192_321,T_4096_161,T_8192_323,T_2048_81,T_8192_3
25,T_4096_163,T_8192_327,T_1024_41,T_8192_329,T_4096_165,T_8192_331,T_2048_83,T_8192_333,T_4096_167,T_8192_335,T_512_21,T_8192_337,T_4096_169,T_8192_339,T_2048_85,T_8192_341,T_4096_171,T_8192_343,T_1024_43,T_8192_345,T_4096_173,T_8192_347,T_2048_87,T_8192_349,T_4096_175,T_8192_351,T_256_11,T_8192_353,T_4096_177,T_8192_355,T_2048_89,T_8192_357,T_4096_179,T_8192_359,T_1024_45,T_8192_361,T_4096_181,T_8192_363,T_2048_91,T_8192_365,T_4096_183,T_8192_367,T_512_23,T_8192_369,T_4096_185,T_8192_371,T_2048_93,T_8192_373,T_4096_187,T_8192_375,T_1024_47,T_8192_377,T_4096_189,T_8192_379,T_2048_95,T_8192_381,T_4096_191,T_8192_383,T_64_3,T_8192_385,T_4096_193,T_8192_387,T_2048_97,T_8192_389,T_4096_195,T_8192_391,T_1024_49,T_8192_393,T_4096_197,T_8192_395,T_2048_99,T_8192_397,T_4096_199,T_8192_399,T_512_25,T_8192_401,T_4096_201,T_8192_403,T_2048_101,T_8192_405,T_4096_203,T_8192_407,T_1024_51,T_8192_409,T_4096_205,T_8192_411,T_2048_103,T_8192_413,T_4096_207,T_8192_415,T_256_13,T_8192_417,T_4096_209,T_8192_419,T_2048_105,T_8192_421,T_4096_211,T_8192_423,T_1024_53,T_8192_425,T_4096_213,T_8192_427,T_2048_107,T_8192_429,T_4096_215,T_8192_431,T_512_27,T_8192_433,T_4096_217,T_8192_435,T_2048_109,T_8192_437,T_4096_219,T_8192_439,T_1024_55,T_8192_441,T_4096_221,T_8192_443,T_2048_111,T_8192_445,T_4096_223,T_8192_447,T_128_7,T_8192_449,T_4096_225,T_8192_451,T_2048_113,T_8192_453,T_4096_227,T_8192_455,T_1024_57,T_8192_457,T_4096_229,T_8192_459,T_2048_115,T_8192_461,T_4096_231,T_8192_463,T_512_29,T_8192_465,T_4096_233,T_8192_467,T_2048_117,T_8192_469,T_4096_235,T_8192_471,T_1024_59,T_8192_473,T_4096_237,T_8192_475,T_2048_119,T_8192_477,T_4096_239,T_8192_479,T_256_15,T_8192_481,T_4096_241,T_8192_483,T_2048_121,T_8192_485,T_4096_243,T_8192_487,T_1024_61,T_8192_489,T_4096_245,T_8192_491,T_2048_123,T_8192_493,T_4096_247,T_8192_495,T_512_31,T_8192_497,T_4096_249,T_8192_499,T_2048_125,T_8192_501,T_4096_251,T_8192_503,T_1024_63,T_8192_505,T_4096_253,T_8192_507,T_2048_127,T_8192_509,T_4096_255,T_8192_5
11,T_2_0,T_8192_9,T_4096_9,T_8192_27,T_2048_9,T_8192_45,T_4096_27,T_8192_63,T_1024_9,T_8192_81,T_4096_45,T_8192_99,T_2048_27,T_8192_117,T_4096_63,T_8192_135,T_512_9,T_8192_153,T_4096_81,T_8192_171,T_2048_45,T_8192_189,T_4096_99,T_8192_207,T_1024_27,T_8192_225,T_4096_117,T_8192_243,T_2048_63,T_8192_261,T_4096_135,T_8192_279,T_256_9,T_8192_297,T_4096_153,T_8192_315,T_2048_81,T_8192_333,T_4096_171,T_8192_351,T_1024_45,T_8192_369,T_4096_189,T_8192_387,T_2048_99,T_8192_405,T_4096_207,T_8192_423,T_512_27,T_8192_441,T_4096_225,T_8192_459,T_2048_117,T_8192_477,T_4096_243,T_8192_495,T_1024_63,T_8192_513,T_4096_261,T_8192_531,T_2048_135,T_8192_549,T_4096_279,T_8192_567,T_128_9,T_8192_585,T_4096_297,T_8192_603,T_2048_153,T_8192_621,T_4096_315,T_8192_639,T_1024_81,T_8192_657,T_4096_333,T_8192_675,T_2048_171,T_8192_693,T_4096_351,T_8192_711,T_512_45,T_8192_729,T_4096_369,T_8192_747,T_2048_189,T_8192_765,T_4096_387,T_8192_783,T_1024_99,T_8192_801,T_4096_405,T_8192_819,T_2048_207,T_8192_837,T_4096_423,T_8192_855,T_256_27,T_8192_873,T_4096_441,T_8192_891,T_2048_225,T_8192_909,T_4096_459,T_8192_927,T_1024_117,T_8192_945,T_4096_477,T_8192_963,T_2048_243,T_8192_981,T_4096_495,T_8192_999,T_512_63,T_8192_1017,T_4096_513,T_8192_1035,T_2048_261,T_8192_1053,T_4096_531,T_8192_1071,T_1024_135,T_8192_1089,T_4096_549,T_8192_1107,T_2048_279,T_8192_1125,T_4096_567,T_8192_1143,T_64_9,T_8192_1161,T_4096_585,T_8192_1179,T_2048_297,T_8192_1197,T_4096_603,T_8192_1215,T_1024_153,T_8192_1233,T_4096_621,T_8192_1251,T_2048_315,T_8192_1269,T_4096_639,T_8192_1287,T_512_81,T_8192_1305,T_4096_657,T_8192_1323,T_2048_333,T_8192_1341,T_4096_675,T_8192_1359,T_1024_171,T_8192_1377,T_4096_693,T_8192_1395,T_2048_351,T_8192_1413,T_4096_711,T_8192_1431,T_256_45,T_8192_1449,T_4096_729,T_8192_1467,T_2048_369,T_8192_1485,T_4096_747,T_8192_1503,T_1024_189,T_8192_1521,T_4096_765,T_8192_1539,T_2048_387,T_8192_1557,T_4096_783,T_8192_1575,T_512_99,T_8192_1593,T_4096_801,T_8192_1611,T_2048_405,T_8192_1629,T_4096_819,T_8192_16
47,T_1024_207,T_8192_1665,T_4096_837,T_8192_1683,T_2048_423,T_8192_1701,T_4096_855,T_8192_1719,T_128_27,T_8192_1737,T_4096_873,T_8192_1755,T_2048_441,T_8192_1773,T_4096_891,T_8192_1791,T_1024_225,T_8192_1809,T_4096_909,T_8192_1827,T_2048_459,T_8192_1845,T_4096_927,T_8192_1863,T_512_117,T_8192_1881,T_4096_945,T_8192_1899,T_2048_477,T_8192_1917,T_4096_963,T_8192_1935,T_1024_243,T_8192_1953,T_4096_981,T_8192_1971,T_2048_495,T_8192_1989,T_4096_999,T_8192_2007,T_256_63,T_8192_2025,T_4096_1017,T_8192_2043,T_2048_513,T_8192_2061,T_4096_1035,T_8192_2079,T_1024_261,T_8192_2097,T_4096_1053,T_8192_2115,T_2048_531,T_8192_2133,T_4096_1071,T_8192_2151,T_512_135,T_8192_2169,T_4096_1089,T_8192_2187,T_2048_549,T_8192_2205,T_4096_1107,T_8192_2223,T_1024_279,T_8192_2241,T_4096_1125,T_8192_2259,T_2048_567,T_8192_2277,T_4096_1143,T_8192_2295,T_32_9,T_8192_2313,T_4096_1161,T_8192_2331,T_2048_585,T_8192_2349,T_4096_1179,T_8192_2367,T_1024_297,T_8192_2385,T_4096_1197,T_8192_2403,T_2048_603,T_8192_2421,T_4096_1215,T_8192_2439,T_512_153,T_8192_2457,T_4096_1233,T_8192_2475,T_2048_621,T_8192_2493,T_4096_1251,T_8192_2511,T_1024_315,T_8192_2529,T_4096_1269,T_8192_2547,T_2048_639,T_8192_2565,T_4096_1287,T_8192_2583,T_256_81,T_8192_2601,T_4096_1305,T_8192_2619,T_2048_657,T_8192_2637,T_4096_1323,T_8192_2655,T_1024_333,T_8192_2673,T_4096_1341,T_8192_2691,T_2048_675,T_8192_2709,T_4096_1359,T_8192_2727,T_512_171,T_8192_2745,T_4096_1377,T_8192_2763,T_2048_693,T_8192_2781,T_4096_1395,T_8192_2799,T_1024_351,T_8192_2817,T_4096_1413,T_8192_2835,T_2048_711,T_8192_2853,T_4096_1431,T_8192_2871,T_128_45,T_8192_2889,T_4096_1449,T_8192_2907,T_2048_729,T_8192_2925,T_4096_1467,T_8192_2943,T_1024_369,T_8192_2961,T_4096_1485,T_8192_2979,T_2048_747,T_8192_2997,T_4096_1503,T_8192_3015,T_512_189,T_8192_3033,T_4096_1521,T_8192_3051,T_2048_765,T_8192_3069,T_4096_1539,T_8192_3087,T_1024_387,T_8192_3105,T_4096_1557,T_8192_3123,T_2048_783,T_8192_3141,T_4096_1575,T_8192_3159,T_256_99,T_8192_3177,T_4096_1593,T_8192_3195,T_204
8_801,T_8192_3213,T_4096_1611,T_8192_3231,T_1024_405,T_8192_3249,T_4096_1629,T_8192_3267,T_2048_819,T_8192_3285,T_4096_1647,T_8192_3303,T_512_207,T_8192_3321,T_4096_1665,T_8192_3339,T_2048_837,T_8192_3357,T_4096_1683,T_8192_3375,T_1024_423,T_8192_3393,T_4096_1701,T_8192_3411,T_2048_855,T_8192_3429,T_4096_1719,T_8192_3447,T_64_27,T_8192_3465,T_4096_1737,T_8192_3483,T_2048_873,T_8192_3501,T_4096_1755,T_8192_3519,T_1024_441,T_8192_3537,T_4096_1773,T_8192_3555,T_2048_891,T_8192_3573,T_4096_1791,T_8192_3591,T_512_225,T_8192_3609,T_4096_1809,T_8192_3627,T_2048_909,T_8192_3645,T_4096_1827,T_8192_3663,T_1024_459,T_8192_3681,T_4096_1845,T_8192_3699,T_2048_927,T_8192_3717,T_4096_1863,T_8192_3735,T_256_117,T_8192_3753,T_4096_1881,T_8192_3771,T_2048_945,T_8192_3789,T_4096_1899,T_8192_3807,T_1024_477,T_8192_3825,T_4096_1917,T_8192_3843,T_2048_963,T_8192_3861,T_4096_1935,T_8192_3879,T_512_243,T_8192_3897,T_4096_1953,T_8192_3915,T_2048_981,T_8192_3933,T_4096_1971,T_8192_3951,T_1024_495,T_8192_3969,T_4096_1989,T_8192_3987,T_2048_999,T_8192_4005,T_4096_2007,T_8192_4023,T_128_63,T_8192_4041,T_4096_2025,T_8192_4059,T_2048_1017,T_8192_4077,T_4096_2043,T_8192_4095,T_1024_513,T_8192_4113,T_4096_2061,T_8192_4131,T_2048_1035,T_8192_4149,T_4096_2079,T_8192_4167,T_512_261,T_8192_4185,T_4096_2097,T_8192_4203,T_2048_1053,T_8192_4221,T_4096_2115,T_8192_4239,T_1024_531,T_8192_4257,T_4096_2133,T_8192_4275,T_2048_1071,T_8192_4293,T_4096_2151,T_8192_4311,T_256_135,T_8192_4329,T_4096_2169,T_8192_4347,T_2048_1089,T_8192_4365,T_4096_2187,T_8192_4383,T_1024_549,T_8192_4401,T_4096_2205,T_8192_4419,T_2048_1107,T_8192_4437,T_4096_2223,T_8192_4455,T_512_279,T_8192_4473,T_4096_2241,T_8192_4491,T_2048_1125,T_8192_4509,T_4096_2259,T_8192_4527,T_1024_567,T_8192_4545,T_4096_2277,T_8192_4563,T_2048_1143,T_8192_4581,T_4096_2295,T_8192_4599 +}; +static const __device__ float2 lut_sp_32_8192[256*2] = { + 
T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_1024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_19
9,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_2_0,T_8192_17,T_4096_17,T_8192_51,T_2048_17,T_8192_85,T_4096_51,T_8192_119,T_1024_17,T_8192_153,T_4096_85,T_8192_187,T_2048_51,T_8192_221,T_4096_119,T_8192_255,T_512_17,T_8192_289,T_4096_153,T_8192_323,T_2048_85,T_8192_357,T_4096_187,T_8192_391,T_1024_51,T_8192_425,T_4096_221,T_8192_459,T_2048_119,T_8192_493,T_4096_255,T_8192_527,T_256_17,T_8192_561,T_4096_289,T_8192_595,T_2048_153,T_8192_629,T_4096_323,T_8192_663,T_1024_85,T_8192_697,T_4096_357,T_8192_731,T_2048_187,T_8192_765,T_4096_391,T_8192_799,T_512_51,T_8192_833,T_4096_425,T_8192_867,T_2048_221,T_8192_901,T_4096_459,T_8192_935,T_1024_119,T_8192_969,T_4096_493,T_8192_1003,T_2048_255,T_8192_1037,T_4096_527,T_8192_1071,T_128_17,T_8192_1105,T_4096_561,T_8192_1139,T_2048_289,T_8192_1173,T_4096_595,T_8192_1207,T_1024_153,T_8192_1241,T_4096_629,T_8192_1275,T_2048_323,T_8192_1309,T_4096_663,T_8192_1343,T_512_85,T_8192_1377,T_4096_697,T_8192_1411,T_2048_357,T_8192_1445,T_4096_731,T_8192_1479,T_1024_187,T_8192_1513,T_4096_765,T_8192_1547,T_2048_391,T_8192_1581,T_4096_799,T_8192_1615,T_256_51,T_8192_1649,T_4096_833,T_8192_1683,T_2048_425,T_8192_1717,T_4096_867,T_8192_1751,T_1024_221,T_8192_1785,T_4096_901,T_8192_1819,T_2048_459,T_8192_1853,T_4096_935,T_8192_1887,T_512_119,T_8192_1921,T_4096_969,T_8192_1955,T_2048_493,T_8192_1989,T_4096_1003,T_8192_2023,T_1024_255,T_8192_2057,T_4096_1037,T_8192_2091,T_2048_527,T_8192_2125,T_4096_1
071,T_8192_2159,T_64_17,T_8192_2193,T_4096_1105,T_8192_2227,T_2048_561,T_8192_2261,T_4096_1139,T_8192_2295,T_1024_289,T_8192_2329,T_4096_1173,T_8192_2363,T_2048_595,T_8192_2397,T_4096_1207,T_8192_2431,T_512_153,T_8192_2465,T_4096_1241,T_8192_2499,T_2048_629,T_8192_2533,T_4096_1275,T_8192_2567,T_1024_323,T_8192_2601,T_4096_1309,T_8192_2635,T_2048_663,T_8192_2669,T_4096_1343,T_8192_2703,T_256_85,T_8192_2737,T_4096_1377,T_8192_2771,T_2048_697,T_8192_2805,T_4096_1411,T_8192_2839,T_1024_357,T_8192_2873,T_4096_1445,T_8192_2907,T_2048_731,T_8192_2941,T_4096_1479,T_8192_2975,T_512_187,T_8192_3009,T_4096_1513,T_8192_3043,T_2048_765,T_8192_3077,T_4096_1547,T_8192_3111,T_1024_391,T_8192_3145,T_4096_1581,T_8192_3179,T_2048_799,T_8192_3213,T_4096_1615,T_8192_3247,T_128_51,T_8192_3281,T_4096_1649,T_8192_3315,T_2048_833,T_8192_3349,T_4096_1683,T_8192_3383,T_1024_425,T_8192_3417,T_4096_1717,T_8192_3451,T_2048_867,T_8192_3485,T_4096_1751,T_8192_3519,T_512_221,T_8192_3553,T_4096_1785,T_8192_3587,T_2048_901,T_8192_3621,T_4096_1819,T_8192_3655,T_1024_459,T_8192_3689,T_4096_1853,T_8192_3723,T_2048_935,T_8192_3757,T_4096_1887,T_8192_3791,T_256_119,T_8192_3825,T_4096_1921,T_8192_3859,T_2048_969,T_8192_3893,T_4096_1955,T_8192_3927,T_1024_493,T_8192_3961,T_4096_1989,T_8192_3995,T_2048_1003,T_8192_4029,T_4096_2023,T_8192_4063,T_512_255,T_8192_4097,T_4096_2057,T_8192_4131,T_2048_1037,T_8192_4165,T_4096_2091,T_8192_4199,T_1024_527,T_8192_4233,T_4096_2125,T_8192_4267,T_2048_1071,T_8192_4301,T_4096_2159,T_8192_4335 +}; +static const __device__ float2 lut_sp_10_10000[1000*2] = { + 
T_2_0,T_10000_1,T_5000_1,T_10000_3,T_2500_1,T_2000_1,T_5000_3,T_10000_7,T_1250_1,T_10000_9,T_1000_1,T_10000_11,T_2500_3,T_10000_13,T_5000_7,T_2000_3,T_625_1,T_10000_17,T_5000_9,T_10000_19,T_500_1,T_10000_21,T_5000_11,T_10000_23,T_1250_3,T_400_1,T_5000_13,T_10000_27,T_2500_7,T_10000_29,T_1000_3,T_10000_31,T_625_2,T_10000_33,T_5000_17,T_2000_7,T_2500_9,T_10000_37,T_5000_19,T_10000_39,T_250_1,T_10000_41,T_5000_21,T_10000_43,T_2500_11,T_2000_9,T_5000_23,T_10000_47,T_625_3,T_10000_49,T_200_1,T_10000_51,T_2500_13,T_10000_53,T_5000_27,T_2000_11,T_1250_7,T_10000_57,T_5000_29,T_10000_59,T_500_3,T_10000_61,T_5000_31,T_10000_63,T_625_4,T_2000_13,T_5000_33,T_10000_67,T_2500_17,T_10000_69,T_1000_7,T_10000_71,T_1250_9,T_10000_73,T_5000_37,T_400_3,T_2500_19,T_10000_77,T_5000_39,T_10000_79,T_125_1,T_10000_81,T_5000_41,T_10000_83,T_2500_21,T_2000_17,T_5000_43,T_10000_87,T_1250_11,T_10000_89,T_1000_9,T_10000_91,T_2500_23,T_10000_93,T_5000_47,T_2000_19,T_625_6,T_10000_97,T_5000_49,T_10000_99,T_100_1,T_10000_101,T_5000_51,T_10000_103,T_1250_13,T_2000_21,T_5000_53,T_10000_107,T_2500_27,T_10000_109,T_1000_11,T_10000_111,T_625_7,T_10000_113,T_5000_57,T_2000_23,T_2500_29,T_10000_117,T_5000_59,T_10000_119,T_250_3,T_10000_121,T_5000_61,T_10000_123,T_2500_31,T_80_1,T_5000_63,T_10000_127,T_625_8,T_10000_129,T_1000_13,T_10000_131,T_2500_33,T_10000_133,T_5000_67,T_2000_27,T_1250_17,T_10000_137,T_5000_69,T_10000_139,T_500_7,T_10000_141,T_5000_71,T_10000_143,T_625_9,T_2000_29,T_5000_73,T_10000_147,T_2500_37,T_10000_149,T_200_3,T_10000_151,T_1250_19,T_10000_153,T_5000_77,T_2000_31,T_2500_39,T_10000_157,T_5000_79,T_10000_159,T_125_2,T_10000_161,T_5000_81,T_10000_163,T_2500_41,T_2000_33,T_5000_83,T_10000_167,T_1250_21,T_10000_169,T_1000_17,T_10000_171,T_2500_43,T_10000_173,T_5000_87,T_400_7,T_625_11,T_10000_177,T_5000_89,T_10000_179,T_500_9,T_10000_181,T_5000_91,T_10000_183,T_1250_23,T_2000_37,T_5000_93,T_10000_187,T_2500_47,T_10000_189,T_1000_19,T_10000_191,T_625_12,T_10000_193,T_5000_97,T_2000_39,T
_2500_49,T_10000_197,T_5000_99,T_10000_199,T_50_1,T_10000_201,T_5000_101,T_10000_203,T_2500_51,T_2000_41,T_5000_103,T_10000_207,T_625_13,T_10000_209,T_1000_21,T_10000_211,T_2500_53,T_10000_213,T_5000_107,T_2000_43,T_1250_27,T_10000_217,T_5000_109,T_10000_219,T_500_11,T_10000_221,T_5000_111,T_10000_223,T_625_14,T_400_9,T_5000_113,T_10000_227,T_2500_57,T_10000_229,T_1000_23,T_10000_231,T_1250_29,T_10000_233,T_5000_117,T_2000_47,T_2500_59,T_10000_237,T_5000_119,T_10000_239,T_125_3,T_10000_241,T_5000_121,T_10000_243,T_2500_61,T_2000_49,T_5000_123,T_10000_247,T_1250_31,T_10000_249,T_40_1,T_10000_251,T_2500_63,T_10000_253,T_5000_127,T_2000_51,T_625_16,T_10000_257,T_5000_129,T_10000_259,T_500_13,T_10000_261,T_5000_131,T_10000_263,T_1250_33,T_2000_53,T_5000_133,T_10000_267,T_2500_67,T_10000_269,T_1000_27,T_10000_271,T_625_17,T_10000_273,T_5000_137,T_400_11,T_2500_69,T_10000_277,T_5000_139,T_10000_279,T_250_7,T_10000_281,T_5000_141,T_10000_283,T_2500_71,T_2000_57,T_5000_143,T_10000_287,T_625_18,T_10000_289,T_1000_29,T_10000_291,T_2500_73,T_10000_293,T_5000_147,T_2000_59,T_1250_37,T_10000_297,T_5000_149,T_10000_299,T_100_3,T_10000_301,T_5000_151,T_10000_303,T_625_19,T_2000_61,T_5000_153,T_10000_307,T_2500_77,T_10000_309,T_1000_31,T_10000_311,T_1250_39,T_10000_313,T_5000_157,T_2000_63,T_2500_79,T_10000_317,T_5000_159,T_10000_319,T_125_4,T_10000_321,T_5000_161,T_10000_323,T_2500_81,T_400_13,T_5000_163,T_10000_327,T_1250_41,T_10000_329,T_1000_33,T_10000_331,T_2500_83,T_10000_333,T_5000_167,T_2000_67,T_625_21,T_10000_337,T_5000_169,T_10000_339,T_500_17,T_10000_341,T_5000_171,T_10000_343,T_1250_43,T_2000_69,T_5000_173,T_10000_347,T_2500_87,T_10000_349,T_200_7,T_10000_351,T_625_22,T_10000_353,T_5000_177,T_2000_71,T_2500_89,T_10000_357,T_5000_179,T_10000_359,T_250_9,T_10000_361,T_5000_181,T_10000_363,T_2500_91,T_2000_73,T_5000_183,T_10000_367,T_625_23,T_10000_369,T_1000_37,T_10000_371,T_2500_93,T_10000_373,T_5000_187,T_80_3,T_1250_47,T_10000_377,T_5000_189,T_10000_379,T_500_19,T_100
00_381,T_5000_191,T_10000_383,T_625_24,T_2000_77,T_5000_193,T_10000_387,T_2500_97,T_10000_389,T_1000_39,T_10000_391,T_1250_49,T_10000_393,T_5000_197,T_2000_79,T_2500_99,T_10000_397,T_5000_199,T_10000_399,T_25_1,T_10000_401,T_5000_201,T_10000_403,T_2500_101,T_2000_81,T_5000_203,T_10000_407,T_1250_51,T_10000_409,T_1000_41,T_10000_411,T_2500_103,T_10000_413,T_5000_207,T_2000_83,T_625_26,T_10000_417,T_5000_209,T_10000_419,T_500_21,T_10000_421,T_5000_211,T_10000_423,T_1250_53,T_400_17,T_5000_213,T_10000_427,T_2500_107,T_10000_429,T_1000_43,T_10000_431,T_625_27,T_10000_433,T_5000_217,T_2000_87,T_2500_109,T_10000_437,T_5000_219,T_10000_439,T_250_11,T_10000_441,T_5000_221,T_10000_443,T_2500_111,T_2000_89,T_5000_223,T_10000_447,T_625_28,T_10000_449,T_200_9,T_10000_451,T_2500_113,T_10000_453,T_5000_227,T_2000_91,T_1250_57,T_10000_457,T_5000_229,T_10000_459,T_500_23,T_10000_461,T_5000_231,T_10000_463,T_625_29,T_2000_93,T_5000_233,T_10000_467,T_2500_117,T_10000_469,T_1000_47,T_10000_471,T_1250_59,T_10000_473,T_5000_237,T_400_19,T_2500_119,T_10000_477,T_5000_239,T_10000_479,T_125_6,T_10000_481,T_5000_241,T_10000_483,T_2500_121,T_2000_97,T_5000_243,T_10000_487,T_1250_61,T_10000_489,T_1000_49,T_10000_491,T_2500_123,T_10000_493,T_5000_247,T_2000_99,T_625_31,T_10000_497,T_5000_249,T_10000_499,T_20_1,T_10000_501,T_5000_251,T_10000_503,T_1250_63,T_2000_101,T_5000_253,T_10000_507,T_2500_127,T_10000_509,T_1000_51,T_10000_511,T_625_32,T_10000_513,T_5000_257,T_2000_103,T_2500_129,T_10000_517,T_5000_259,T_10000_519,T_250_13,T_10000_521,T_5000_261,T_10000_523,T_2500_131,T_400_21,T_5000_263,T_10000_527,T_625_33,T_10000_529,T_1000_53,T_10000_531,T_2500_133,T_10000_533,T_5000_267,T_2000_107,T_1250_67,T_10000_537,T_5000_269,T_10000_539,T_500_27,T_10000_541,T_5000_271,T_10000_543,T_625_34,T_2000_109,T_5000_273,T_10000_547,T_2500_137,T_10000_549,T_200_11,T_10000_551,T_1250_69,T_10000_553,T_5000_277,T_2000_111,T_2500_139,T_10000_557,T_5000_279,T_10000_559,T_125_7,T_10000_561,T_5000_281,T_10000_563
,T_2500_141,T_2000_113,T_5000_283,T_10000_567,T_1250_71,T_10000_569,T_1000_57,T_10000_571,T_2500_143,T_10000_573,T_5000_287,T_400_23,T_625_36,T_10000_577,T_5000_289,T_10000_579,T_500_29,T_10000_581,T_5000_291,T_10000_583,T_1250_73,T_2000_117,T_5000_293,T_10000_587,T_2500_147,T_10000_589,T_1000_59,T_10000_591,T_625_37,T_10000_593,T_5000_297,T_2000_119,T_2500_149,T_10000_597,T_5000_299,T_10000_599,T_50_3,T_10000_601,T_5000_301,T_10000_603,T_2500_151,T_2000_121,T_5000_303,T_10000_607,T_625_38,T_10000_609,T_1000_61,T_10000_611,T_2500_153,T_10000_613,T_5000_307,T_2000_123,T_1250_77,T_10000_617,T_5000_309,T_10000_619,T_500_31,T_10000_621,T_5000_311,T_10000_623,T_625_39,T_16_1,T_5000_313,T_10000_627,T_2500_157,T_10000_629,T_1000_63,T_10000_631,T_1250_79,T_10000_633,T_5000_317,T_2000_127,T_2500_159,T_10000_637,T_5000_319,T_10000_639,T_125_8,T_10000_641,T_5000_321,T_10000_643,T_2500_161,T_2000_129,T_5000_323,T_10000_647,T_1250_81,T_10000_649,T_200_13,T_10000_651,T_2500_163,T_10000_653,T_5000_327,T_2000_131,T_625_41,T_10000_657,T_5000_329,T_10000_659,T_500_33,T_10000_661,T_5000_331,T_10000_663,T_1250_83,T_2000_133,T_5000_333,T_10000_667,T_2500_167,T_10000_669,T_1000_67,T_10000_671,T_625_42,T_10000_673,T_5000_337,T_400_27,T_2500_169,T_10000_677,T_5000_339,T_10000_679,T_250_17,T_10000_681,T_5000_341,T_10000_683,T_2500_171,T_2000_137,T_5000_343,T_10000_687,T_625_43,T_10000_689,T_1000_69,T_10000_691,T_2500_173,T_10000_693,T_5000_347,T_2000_139,T_1250_87,T_10000_697,T_5000_349,T_10000_699,T_100_7,T_10000_701,T_5000_351,T_10000_703,T_625_44,T_2000_141,T_5000_353,T_10000_707,T_2500_177,T_10000_709,T_1000_71,T_10000_711,T_1250_89,T_10000_713,T_5000_357,T_2000_143,T_2500_179,T_10000_717,T_5000_359,T_10000_719,T_125_9,T_10000_721,T_5000_361,T_10000_723,T_2500_181,T_400_29,T_5000_363,T_10000_727,T_1250_91,T_10000_729,T_1000_73,T_10000_731,T_2500_183,T_10000_733,T_5000_367,T_2000_147,T_625_46,T_10000_737,T_5000_369,T_10000_739,T_500_37,T_10000_741,T_5000_371,T_10000_743,T_1250_93,T_2000_
149,T_5000_373,T_10000_747,T_2500_187,T_10000_749,T_40_3,T_10000_751,T_625_47,T_10000_753,T_5000_377,T_2000_151,T_2500_189,T_10000_757,T_5000_379,T_10000_759,T_250_19,T_10000_761,T_5000_381,T_10000_763,T_2500_191,T_2000_153,T_5000_383,T_10000_767,T_625_48,T_10000_769,T_1000_77,T_10000_771,T_2500_193,T_10000_773,T_5000_387,T_400_31,T_1250_97,T_10000_777,T_5000_389,T_10000_779,T_500_39,T_10000_781,T_5000_391,T_10000_783,T_625_49,T_2000_157,T_5000_393,T_10000_787,T_2500_197,T_10000_789,T_1000_79,T_10000_791,T_1250_99,T_10000_793,T_5000_397,T_2000_159,T_2500_199,T_10000_797,T_5000_399,T_10000_799,T_25_2,T_10000_801,T_5000_401,T_10000_803,T_2500_201,T_2000_161,T_5000_403,T_10000_807,T_1250_101,T_10000_809,T_1000_81,T_10000_811,T_2500_203,T_10000_813,T_5000_407,T_2000_163,T_625_51,T_10000_817,T_5000_409,T_10000_819,T_500_41,T_10000_821,T_5000_411,T_10000_823,T_1250_103,T_400_33,T_5000_413,T_10000_827,T_2500_207,T_10000_829,T_1000_83,T_10000_831,T_625_52,T_10000_833,T_5000_417,T_2000_167,T_2500_209,T_10000_837,T_5000_419,T_10000_839,T_250_21,T_10000_841,T_5000_421,T_10000_843,T_2500_211,T_2000_169,T_5000_423,T_10000_847,T_625_53,T_10000_849,T_200_17,T_10000_851,T_2500_213,T_10000_853,T_5000_427,T_2000_171,T_1250_107,T_10000_857,T_5000_429,T_10000_859,T_500_43,T_10000_861,T_5000_431,T_10000_863,T_625_54,T_2000_173,T_5000_433,T_10000_867,T_2500_217,T_10000_869,T_1000_87,T_10000_871,T_1250_109,T_10000_873,T_5000_437,T_80_7,T_2500_219,T_10000_877,T_5000_439,T_10000_879,T_125_11,T_10000_881,T_5000_441,T_10000_883,T_2500_221,T_2000_177,T_5000_443,T_10000_887,T_1250_111,T_10000_889,T_1000_89,T_10000_891,T_2500_223,T_10000_893,T_5000_447,T_2000_179,T_625_56,T_10000_897,T_5000_449,T_10000_899,T_100_9,T_10000_901,T_5000_451,T_10000_903,T_1250_113,T_2000_181,T_5000_453,T_10000_907,T_2500_227,T_10000_909,T_1000_91,T_10000_911,T_625_57,T_10000_913,T_5000_457,T_2000_183,T_2500_229,T_10000_917,T_5000_459,T_10000_919,T_250_23,T_10000_921,T_5000_461,T_10000_923,T_2500_231,T_400_37,T_5000_4
63,T_10000_927,T_625_58,T_10000_929,T_1000_93,T_10000_931,T_2500_233,T_10000_933,T_5000_467,T_2000_187,T_1250_117,T_10000_937,T_5000_469,T_10000_939,T_500_47,T_10000_941,T_5000_471,T_10000_943,T_625_59,T_2000_189,T_5000_473,T_10000_947,T_2500_237,T_10000_949,T_200_19,T_10000_951,T_1250_119,T_10000_953,T_5000_477,T_2000_191,T_2500_239,T_10000_957,T_5000_479,T_10000_959,T_125_12,T_10000_961,T_5000_481,T_10000_963,T_2500_241,T_2000_193,T_5000_483,T_10000_967,T_1250_121,T_10000_969,T_1000_97,T_10000_971,T_2500_243,T_10000_973,T_5000_487,T_400_39,T_625_61,T_10000_977,T_5000_489,T_10000_979,T_500_49,T_10000_981,T_5000_491,T_10000_983,T_1250_123,T_2000_197,T_5000_493,T_10000_987,T_2500_247,T_10000_989,T_1000_99,T_10000_991,T_625_62,T_10000_993,T_5000_497,T_2000_199,T_2500_249,T_10000_997,T_5000_499,T_10000_999,T_2_0,T_5000_3,T_2500_3,T_5000_9,T_1250_3,T_1000_3,T_2500_9,T_5000_21,T_625_3,T_5000_27,T_500_3,T_5000_33,T_1250_9,T_5000_39,T_2500_21,T_1000_9,T_625_6,T_5000_51,T_2500_27,T_5000_57,T_250_3,T_5000_63,T_2500_33,T_5000_69,T_625_9,T_200_3,T_2500_39,T_5000_81,T_1250_21,T_5000_87,T_500_9,T_5000_93,T_625_12,T_5000_99,T_2500_51,T_1000_21,T_1250_27,T_5000_111,T_2500_57,T_5000_117,T_125_3,T_5000_123,T_2500_63,T_5000_129,T_1250_33,T_1000_27,T_2500_69,T_5000_141,T_625_18,T_5000_147,T_100_3,T_5000_153,T_1250_39,T_5000_159,T_2500_81,T_1000_33,T_625_21,T_5000_171,T_2500_87,T_5000_177,T_250_9,T_5000_183,T_2500_93,T_5000_189,T_625_24,T_1000_39,T_2500_99,T_5000_201,T_1250_51,T_5000_207,T_500_21,T_5000_213,T_625_27,T_5000_219,T_2500_111,T_200_9,T_1250_57,T_5000_231,T_2500_117,T_5000_237,T_125_6,T_5000_243,T_2500_123,T_5000_249,T_1250_63,T_1000_51,T_2500_129,T_5000_261,T_625_33,T_5000_267,T_500_27,T_5000_273,T_1250_69,T_5000_279,T_2500_141,T_1000_57,T_625_36,T_5000_291,T_2500_147,T_5000_297,T_50_3,T_5000_303,T_2500_153,T_5000_309,T_625_39,T_1000_63,T_2500_159,T_5000_321,T_1250_81,T_5000_327,T_500_33,T_5000_333,T_625_42,T_5000_339,T_2500_171,T_1000_69,T_1250_87,T_5000_351,T_2500_177,T_5
000_357,T_125_9,T_5000_363,T_2500_183,T_5000_369,T_1250_93,T_40_3,T_2500_189,T_5000_381,T_625_48,T_5000_387,T_500_39,T_5000_393,T_1250_99,T_5000_399,T_2500_201,T_1000_81,T_625_51,T_5000_411,T_2500_207,T_5000_417,T_250_21,T_5000_423,T_2500_213,T_5000_429,T_625_54,T_1000_87,T_2500_219,T_5000_441,T_1250_111,T_5000_447,T_100_9,T_5000_453,T_625_57,T_5000_459,T_2500_231,T_1000_93,T_1250_117,T_5000_471,T_2500_237,T_5000_477,T_125_12,T_5000_483,T_2500_243,T_5000_489,T_1250_123,T_1000_99,T_2500_249,T_5000_501,T_625_63,T_5000_507,T_500_51,T_5000_513,T_1250_129,T_5000_519,T_2500_261,T_200_21,T_625_66,T_5000_531,T_2500_267,T_5000_537,T_250_27,T_5000_543,T_2500_273,T_5000_549,T_625_69,T_1000_111,T_2500_279,T_5000_561,T_1250_141,T_5000_567,T_500_57,T_5000_573,T_625_72,T_5000_579,T_2500_291,T_1000_117,T_1250_147,T_5000_591,T_2500_297,T_5000_597,T_25_3,T_5000_603,T_2500_303,T_5000_609,T_1250_153,T_1000_123,T_2500_309,T_5000_621,T_625_78,T_5000_627,T_500_63,T_5000_633,T_1250_159,T_5000_639,T_2500_321,T_1000_129,T_625_81,T_5000_651,T_2500_327,T_5000_657,T_250_33,T_5000_663,T_2500_333,T_5000_669,T_625_84,T_200_27,T_2500_339,T_5000_681,T_1250_171,T_5000_687,T_500_69,T_5000_693,T_625_87,T_5000_699,T_2500_351,T_1000_141,T_1250_177,T_5000_711,T_2500_357,T_5000_717,T_125_18,T_5000_723,T_2500_363,T_5000_729,T_1250_183,T_1000_147,T_2500_369,T_5000_741,T_625_93,T_5000_747,T_20_3,T_5000_753,T_1250_189,T_5000_759,T_2500_381,T_1000_153,T_625_96,T_5000_771,T_2500_387,T_5000_777,T_250_39,T_5000_783,T_2500_393,T_5000_789,T_625_99,T_1000_159,T_2500_399,T_5000_801,T_1250_201,T_5000_807,T_500_81,T_5000_813,T_625_102,T_5000_819,T_2500_411,T_200_33,T_1250_207,T_5000_831,T_2500_417,T_5000_837,T_125_21,T_5000_843,T_2500_423,T_5000_849,T_1250_213,T_1000_171,T_2500_429,T_5000_861,T_625_108,T_5000_867,T_500_87,T_5000_873,T_1250_219,T_5000_879,T_2500_441,T_1000_177,T_625_111,T_5000_891,T_2500_447,T_5000_897,T_50_9,T_5000_903,T_2500_453,T_5000_909,T_625_114,T_1000_183,T_2500_459,T_5000_921,T_1250_231,T_5000_92
7,T_500_93,T_5000_933,T_625_117,T_5000_939,T_2500_471,T_1000_189,T_1250_237,T_5000_951,T_2500_477,T_5000_957,T_125_24,T_5000_963,T_2500_483,T_5000_969,T_1250_243,T_200_39,T_2500_489,T_5000_981,T_625_123,T_5000_987,T_500_99,T_5000_993,T_1250_249,T_5000_999,T_2500_501,T_1000_201,T_625_126,T_5000_1011,T_2500_507,T_5000_1017,T_250_51,T_5000_1023,T_2500_513,T_5000_1029,T_625_129,T_1000_207,T_2500_519,T_5000_1041,T_1250_261,T_5000_1047,T_100_21,T_5000_1053,T_625_132,T_5000_1059,T_2500_531,T_1000_213,T_1250_267,T_5000_1071,T_2500_537,T_5000_1077,T_125_27,T_5000_1083,T_2500_543,T_5000_1089,T_1250_273,T_1000_219,T_2500_549,T_5000_1101,T_625_138,T_5000_1107,T_500_111,T_5000_1113,T_1250_279,T_5000_1119,T_2500_561,T_40_9,T_625_141,T_5000_1131,T_2500_567,T_5000_1137,T_250_57,T_5000_1143,T_2500_573,T_5000_1149,T_625_144,T_1000_231,T_2500_579,T_5000_1161,T_1250_291,T_5000_1167,T_500_117,T_5000_1173,T_625_147,T_5000_1179,T_2500_591,T_1000_237,T_1250_297,T_5000_1191,T_2500_597,T_5000_1197,T_25_6,T_5000_1203,T_2500_603,T_5000_1209,T_1250_303,T_1000_243,T_2500_609,T_5000_1221,T_625_153,T_5000_1227,T_500_123,T_5000_1233,T_1250_309,T_5000_1239,T_2500_621,T_1000_249,T_625_156,T_5000_1251,T_2500_627,T_5000_1257,T_250_63,T_5000_1263,T_2500_633,T_5000_1269,T_625_159,T_200_51,T_2500_639,T_5000_1281,T_1250_321,T_5000_1287,T_500_129,T_5000_1293,T_625_162,T_5000_1299,T_2500_651,T_1000_261,T_1250_327,T_5000_1311,T_2500_657,T_5000_1317,T_125_33,T_5000_1323,T_2500_663,T_5000_1329,T_1250_333,T_1000_267,T_2500_669,T_5000_1341,T_625_168,T_5000_1347,T_100_27,T_5000_1353,T_1250_339,T_5000_1359,T_2500_681,T_1000_273,T_625_171,T_5000_1371,T_2500_687,T_5000_1377,T_250_69,T_5000_1383,T_2500_693,T_5000_1389,T_625_174,T_1000_279,T_2500_699,T_5000_1401,T_1250_351,T_5000_1407,T_500_141,T_5000_1413,T_625_177,T_5000_1419,T_2500_711,T_200_57,T_1250_357,T_5000_1431,T_2500_717,T_5000_1437,T_125_36,T_5000_1443,T_2500_723,T_5000_1449,T_1250_363,T_1000_291,T_2500_729,T_5000_1461,T_625_183,T_5000_1467,T_500_147,T_5000_
1473,T_1250_369,T_5000_1479,T_2500_741,T_1000_297,T_625_186,T_5000_1491,T_2500_747,T_5000_1497,T_10_3,T_5000_1503,T_2500_753,T_5000_1509,T_625_189,T_1000_303,T_2500_759,T_5000_1521,T_1250_381,T_5000_1527,T_500_153,T_5000_1533,T_625_192,T_5000_1539,T_2500_771,T_1000_309,T_1250_387,T_5000_1551,T_2500_777,T_5000_1557,T_125_39,T_5000_1563,T_2500_783,T_5000_1569,T_1250_393,T_200_63,T_2500_789,T_5000_1581,T_625_198,T_5000_1587,T_500_159,T_5000_1593,T_1250_399,T_5000_1599,T_2500_801,T_1000_321,T_625_201,T_5000_1611,T_2500_807,T_5000_1617,T_250_81,T_5000_1623,T_2500_813,T_5000_1629,T_625_204,T_1000_327,T_2500_819,T_5000_1641,T_1250_411,T_5000_1647,T_100_33,T_5000_1653,T_625_207,T_5000_1659,T_2500_831,T_1000_333,T_1250_417,T_5000_1671,T_2500_837,T_5000_1677,T_125_42,T_5000_1683,T_2500_843,T_5000_1689,T_1250_423,T_1000_339,T_2500_849,T_5000_1701,T_625_213,T_5000_1707,T_500_171,T_5000_1713,T_1250_429,T_5000_1719,T_2500_861,T_200_69,T_625_216,T_5000_1731,T_2500_867,T_5000_1737,T_250_87,T_5000_1743,T_2500_873,T_5000_1749,T_625_219,T_1000_351,T_2500_879,T_5000_1761,T_1250_441,T_5000_1767,T_500_177,T_5000_1773,T_625_222,T_5000_1779,T_2500_891,T_1000_357,T_1250_447,T_5000_1791,T_2500_897,T_5000_1797,T_25_9,T_5000_1803,T_2500_903,T_5000_1809,T_1250_453,T_1000_363,T_2500_909,T_5000_1821,T_625_228,T_5000_1827,T_500_183,T_5000_1833,T_1250_459,T_5000_1839,T_2500_921,T_1000_369,T_625_231,T_5000_1851,T_2500_927,T_5000_1857,T_250_93,T_5000_1863,T_2500_933,T_5000_1869,T_625_234,T_8_3,T_2500_939,T_5000_1881,T_1250_471,T_5000_1887,T_500_189,T_5000_1893,T_625_237,T_5000_1899,T_2500_951,T_1000_381,T_1250_477,T_5000_1911,T_2500_957,T_5000_1917,T_125_48,T_5000_1923,T_2500_963,T_5000_1929,T_1250_483,T_1000_387,T_2500_969,T_5000_1941,T_625_243,T_5000_1947,T_100_39,T_5000_1953,T_1250_489,T_5000_1959,T_2500_981,T_1000_393,T_625_246,T_5000_1971,T_2500_987,T_5000_1977,T_250_99,T_5000_1983,T_2500_993,T_5000_1989,T_625_249,T_1000_399,T_2500_999,T_5000_2001,T_1250_501,T_5000_2007,T_500_201,T_5000_2013,T_6
25_252,T_5000_2019,T_2500_1011,T_200_81,T_1250_507,T_5000_2031,T_2500_1017,T_5000_2037,T_125_51,T_5000_2043,T_2500_1023,T_5000_2049,T_1250_513,T_1000_411,T_2500_1029,T_5000_2061,T_625_258,T_5000_2067,T_500_207,T_5000_2073,T_1250_519,T_5000_2079,T_2500_1041,T_1000_417,T_625_261,T_5000_2091,T_2500_1047,T_5000_2097,T_50_21,T_5000_2103,T_2500_1053,T_5000_2109,T_625_264,T_1000_423,T_2500_1059,T_5000_2121,T_1250_531,T_5000_2127,T_500_213,T_5000_2133,T_625_267,T_5000_2139,T_2500_1071,T_1000_429,T_1250_537,T_5000_2151,T_2500_1077,T_5000_2157,T_125_54,T_5000_2163,T_2500_1083,T_5000_2169,T_1250_543,T_200_87,T_2500_1089,T_5000_2181,T_625_273,T_5000_2187,T_500_219,T_5000_2193,T_1250_549,T_5000_2199,T_2500_1101,T_1000_441,T_625_276,T_5000_2211,T_2500_1107,T_5000_2217,T_250_111,T_5000_2223,T_2500_1113,T_5000_2229,T_625_279,T_1000_447,T_2500_1119,T_5000_2241,T_1250_561,T_5000_2247,T_20_9,T_5000_2253,T_625_282,T_5000_2259,T_2500_1131,T_1000_453,T_1250_567,T_5000_2271,T_2500_1137,T_5000_2277,T_125_57,T_5000_2283,T_2500_1143,T_5000_2289,T_1250_573,T_1000_459,T_2500_1149,T_5000_2301,T_625_288,T_5000_2307,T_500_231,T_5000_2313,T_1250_579,T_5000_2319,T_2500_1161,T_200_93,T_625_291,T_5000_2331,T_2500_1167,T_5000_2337,T_250_117,T_5000_2343,T_2500_1173,T_5000_2349,T_625_294,T_1000_471,T_2500_1179,T_5000_2361,T_1250_591,T_5000_2367,T_500_237,T_5000_2373,T_625_297,T_5000_2379,T_2500_1191,T_1000_477,T_1250_597,T_5000_2391,T_2500_1197,T_5000_2397,T_25_12,T_5000_2403,T_2500_1203,T_5000_2409,T_1250_603,T_1000_483,T_2500_1209,T_5000_2421,T_625_303,T_5000_2427,T_500_243,T_5000_2433,T_1250_609,T_5000_2439,T_2500_1221,T_1000_489,T_625_306,T_5000_2451,T_2500_1227,T_5000_2457,T_250_123,T_5000_2463,T_2500_1233,T_5000_2469,T_625_309,T_200_99,T_2500_1239,T_5000_2481,T_1250_621,T_5000_2487,T_500_249,T_5000_2493,T_625_312,T_5000_2499,T_2500_1251,T_1000_501,T_1250_627,T_5000_2511,T_2500_1257,T_5000_2517,T_125_63,T_5000_2523,T_2500_1263,T_5000_2529,T_1250_633,T_1000_507,T_2500_1269,T_5000_2541,T_625_318,T_50
00_2547,T_100_51,T_5000_2553,T_1250_639,T_5000_2559,T_2500_1281,T_1000_513,T_625_321,T_5000_2571,T_2500_1287,T_5000_2577,T_250_129,T_5000_2583,T_2500_1293,T_5000_2589,T_625_324,T_1000_519,T_2500_1299,T_5000_2601,T_1250_651,T_5000_2607,T_500_261,T_5000_2613,T_625_327,T_5000_2619,T_2500_1311,T_40_21,T_1250_657,T_5000_2631,T_2500_1317,T_5000_2637,T_125_66,T_5000_2643,T_2500_1323,T_5000_2649,T_1250_663,T_1000_531,T_2500_1329,T_5000_2661,T_625_333,T_5000_2667,T_500_267,T_5000_2673,T_1250_669,T_5000_2679,T_2500_1341,T_1000_537,T_625_336,T_5000_2691,T_2500_1347,T_5000_2697,T_50_27,T_5000_2703,T_2500_1353,T_5000_2709,T_625_339,T_1000_543,T_2500_1359,T_5000_2721,T_1250_681,T_5000_2727,T_500_273,T_5000_2733,T_625_342,T_5000_2739,T_2500_1371,T_1000_549,T_1250_687,T_5000_2751,T_2500_1377,T_5000_2757,T_125_69,T_5000_2763,T_2500_1383,T_5000_2769,T_1250_693,T_200_111,T_2500_1389,T_5000_2781,T_625_348,T_5000_2787,T_500_279,T_5000_2793,T_1250_699,T_5000_2799,T_2500_1401,T_1000_561,T_625_351,T_5000_2811,T_2500_1407,T_5000_2817,T_250_141,T_5000_2823,T_2500_1413,T_5000_2829,T_625_354,T_1000_567,T_2500_1419,T_5000_2841,T_1250_711,T_5000_2847,T_100_57,T_5000_2853,T_625_357,T_5000_2859,T_2500_1431,T_1000_573,T_1250_717,T_5000_2871,T_2500_1437,T_5000_2877,T_125_72,T_5000_2883,T_2500_1443,T_5000_2889,T_1250_723,T_1000_579,T_2500_1449,T_5000_2901,T_625_363,T_5000_2907,T_500_291,T_5000_2913,T_1250_729,T_5000_2919,T_2500_1461,T_200_117,T_625_366,T_5000_2931,T_2500_1467,T_5000_2937,T_250_147,T_5000_2943,T_2500_1473,T_5000_2949,T_625_369,T_1000_591,T_2500_1479,T_5000_2961,T_1250_741,T_5000_2967,T_500_297,T_5000_2973,T_625_372,T_5000_2979,T_2500_1491,T_1000_597,T_1250_747,T_5000_2991,T_2500_1497,T_5000_2997 +}; +static const __device__ float2 lut_sp_25_15625[625*2] = { + 
T_2_0,T_15625_1,T_15625_2,T_15625_3,T_15625_4,T_3125_1,T_15625_6,T_15625_7,T_15625_8,T_15625_9,T_3125_2,T_15625_11,T_15625_12,T_15625_13,T_15625_14,T_3125_3,T_15625_16,T_15625_17,T_15625_18,T_15625_19,T_3125_4,T_15625_21,T_15625_22,T_15625_23,T_15625_24,T_625_1,T_15625_26,T_15625_27,T_15625_28,T_15625_29,T_3125_6,T_15625_31,T_15625_32,T_15625_33,T_15625_34,T_3125_7,T_15625_36,T_15625_37,T_15625_38,T_15625_39,T_3125_8,T_15625_41,T_15625_42,T_15625_43,T_15625_44,T_3125_9,T_15625_46,T_15625_47,T_15625_48,T_15625_49,T_625_2,T_15625_51,T_15625_52,T_15625_53,T_15625_54,T_3125_11,T_15625_56,T_15625_57,T_15625_58,T_15625_59,T_3125_12,T_15625_61,T_15625_62,T_15625_63,T_15625_64,T_3125_13,T_15625_66,T_15625_67,T_15625_68,T_15625_69,T_3125_14,T_15625_71,T_15625_72,T_15625_73,T_15625_74,T_625_3,T_15625_76,T_15625_77,T_15625_78,T_15625_79,T_3125_16,T_15625_81,T_15625_82,T_15625_83,T_15625_84,T_3125_17,T_15625_86,T_15625_87,T_15625_88,T_15625_89,T_3125_18,T_15625_91,T_15625_92,T_15625_93,T_15625_94,T_3125_19,T_15625_96,T_15625_97,T_15625_98,T_15625_99,T_625_4,T_15625_101,T_15625_102,T_15625_103,T_15625_104,T_3125_21,T_15625_106,T_15625_107,T_15625_108,T_15625_109,T_3125_22,T_15625_111,T_15625_112,T_15625_113,T_15625_114,T_3125_23,T_15625_116,T_15625_117,T_15625_118,T_15625_119,T_3125_24,T_15625_121,T_15625_122,T_15625_123,T_15625_124,T_125_1,T_15625_126,T_15625_127,T_15625_128,T_15625_129,T_3125_26,T_15625_131,T_15625_132,T_15625_133,T_15625_134,T_3125_27,T_15625_136,T_15625_137,T_15625_138,T_15625_139,T_3125_28,T_15625_141,T_15625_142,T_15625_143,T_15625_144,T_3125_29,T_15625_146,T_15625_147,T_15625_148,T_15625_149,T_625_6,T_15625_151,T_15625_152,T_15625_153,T_15625_154,T_3125_31,T_15625_156,T_15625_157,T_15625_158,T_15625_159,T_3125_32,T_15625_161,T_15625_162,T_15625_163,T_15625_164,T_3125_33,T_15625_166,T_15625_167,T_15625_168,T_15625_169,T_3125_34,T_15625_171,T_15625_172,T_15625_173,T_15625_174,T_625_7,T_15625_176,T_15625_177,T_15625_178,T_15625_179,T_3125_36,T_15625_181,T_15
625_182,T_15625_183,T_15625_184,T_3125_37,T_15625_186,T_15625_187,T_15625_188,T_15625_189,T_3125_38,T_15625_191,T_15625_192,T_15625_193,T_15625_194,T_3125_39,T_15625_196,T_15625_197,T_15625_198,T_15625_199,T_625_8,T_15625_201,T_15625_202,T_15625_203,T_15625_204,T_3125_41,T_15625_206,T_15625_207,T_15625_208,T_15625_209,T_3125_42,T_15625_211,T_15625_212,T_15625_213,T_15625_214,T_3125_43,T_15625_216,T_15625_217,T_15625_218,T_15625_219,T_3125_44,T_15625_221,T_15625_222,T_15625_223,T_15625_224,T_625_9,T_15625_226,T_15625_227,T_15625_228,T_15625_229,T_3125_46,T_15625_231,T_15625_232,T_15625_233,T_15625_234,T_3125_47,T_15625_236,T_15625_237,T_15625_238,T_15625_239,T_3125_48,T_15625_241,T_15625_242,T_15625_243,T_15625_244,T_3125_49,T_15625_246,T_15625_247,T_15625_248,T_15625_249,T_125_2,T_15625_251,T_15625_252,T_15625_253,T_15625_254,T_3125_51,T_15625_256,T_15625_257,T_15625_258,T_15625_259,T_3125_52,T_15625_261,T_15625_262,T_15625_263,T_15625_264,T_3125_53,T_15625_266,T_15625_267,T_15625_268,T_15625_269,T_3125_54,T_15625_271,T_15625_272,T_15625_273,T_15625_274,T_625_11,T_15625_276,T_15625_277,T_15625_278,T_15625_279,T_3125_56,T_15625_281,T_15625_282,T_15625_283,T_15625_284,T_3125_57,T_15625_286,T_15625_287,T_15625_288,T_15625_289,T_3125_58,T_15625_291,T_15625_292,T_15625_293,T_15625_294,T_3125_59,T_15625_296,T_15625_297,T_15625_298,T_15625_299,T_625_12,T_15625_301,T_15625_302,T_15625_303,T_15625_304,T_3125_61,T_15625_306,T_15625_307,T_15625_308,T_15625_309,T_3125_62,T_15625_311,T_15625_312,T_15625_313,T_15625_314,T_3125_63,T_15625_316,T_15625_317,T_15625_318,T_15625_319,T_3125_64,T_15625_321,T_15625_322,T_15625_323,T_15625_324,T_625_13,T_15625_326,T_15625_327,T_15625_328,T_15625_329,T_3125_66,T_15625_331,T_15625_332,T_15625_333,T_15625_334,T_3125_67,T_15625_336,T_15625_337,T_15625_338,T_15625_339,T_3125_68,T_15625_341,T_15625_342,T_15625_343,T_15625_344,T_3125_69,T_15625_346,T_15625_347,T_15625_348,T_15625_349,T_625_14,T_15625_351,T_15625_352,T_15625_353,T_15625_354,T_3125
_71,T_15625_356,T_15625_357,T_15625_358,T_15625_359,T_3125_72,T_15625_361,T_15625_362,T_15625_363,T_15625_364,T_3125_73,T_15625_366,T_15625_367,T_15625_368,T_15625_369,T_3125_74,T_15625_371,T_15625_372,T_15625_373,T_15625_374,T_125_3,T_15625_376,T_15625_377,T_15625_378,T_15625_379,T_3125_76,T_15625_381,T_15625_382,T_15625_383,T_15625_384,T_3125_77,T_15625_386,T_15625_387,T_15625_388,T_15625_389,T_3125_78,T_15625_391,T_15625_392,T_15625_393,T_15625_394,T_3125_79,T_15625_396,T_15625_397,T_15625_398,T_15625_399,T_625_16,T_15625_401,T_15625_402,T_15625_403,T_15625_404,T_3125_81,T_15625_406,T_15625_407,T_15625_408,T_15625_409,T_3125_82,T_15625_411,T_15625_412,T_15625_413,T_15625_414,T_3125_83,T_15625_416,T_15625_417,T_15625_418,T_15625_419,T_3125_84,T_15625_421,T_15625_422,T_15625_423,T_15625_424,T_625_17,T_15625_426,T_15625_427,T_15625_428,T_15625_429,T_3125_86,T_15625_431,T_15625_432,T_15625_433,T_15625_434,T_3125_87,T_15625_436,T_15625_437,T_15625_438,T_15625_439,T_3125_88,T_15625_441,T_15625_442,T_15625_443,T_15625_444,T_3125_89,T_15625_446,T_15625_447,T_15625_448,T_15625_449,T_625_18,T_15625_451,T_15625_452,T_15625_453,T_15625_454,T_3125_91,T_15625_456,T_15625_457,T_15625_458,T_15625_459,T_3125_92,T_15625_461,T_15625_462,T_15625_463,T_15625_464,T_3125_93,T_15625_466,T_15625_467,T_15625_468,T_15625_469,T_3125_94,T_15625_471,T_15625_472,T_15625_473,T_15625_474,T_625_19,T_15625_476,T_15625_477,T_15625_478,T_15625_479,T_3125_96,T_15625_481,T_15625_482,T_15625_483,T_15625_484,T_3125_97,T_15625_486,T_15625_487,T_15625_488,T_15625_489,T_3125_98,T_15625_491,T_15625_492,T_15625_493,T_15625_494,T_3125_99,T_15625_496,T_15625_497,T_15625_498,T_15625_499,T_125_4,T_15625_501,T_15625_502,T_15625_503,T_15625_504,T_3125_101,T_15625_506,T_15625_507,T_15625_508,T_15625_509,T_3125_102,T_15625_511,T_15625_512,T_15625_513,T_15625_514,T_3125_103,T_15625_516,T_15625_517,T_15625_518,T_15625_519,T_3125_104,T_15625_521,T_15625_522,T_15625_523,T_15625_524,T_625_21,T_15625_526,T_15625_527,T_156
25_528,T_15625_529,T_3125_106,T_15625_531,T_15625_532,T_15625_533,T_15625_534,T_3125_107,T_15625_536,T_15625_537,T_15625_538,T_15625_539,T_3125_108,T_15625_541,T_15625_542,T_15625_543,T_15625_544,T_3125_109,T_15625_546,T_15625_547,T_15625_548,T_15625_549,T_625_22,T_15625_551,T_15625_552,T_15625_553,T_15625_554,T_3125_111,T_15625_556,T_15625_557,T_15625_558,T_15625_559,T_3125_112,T_15625_561,T_15625_562,T_15625_563,T_15625_564,T_3125_113,T_15625_566,T_15625_567,T_15625_568,T_15625_569,T_3125_114,T_15625_571,T_15625_572,T_15625_573,T_15625_574,T_625_23,T_15625_576,T_15625_577,T_15625_578,T_15625_579,T_3125_116,T_15625_581,T_15625_582,T_15625_583,T_15625_584,T_3125_117,T_15625_586,T_15625_587,T_15625_588,T_15625_589,T_3125_118,T_15625_591,T_15625_592,T_15625_593,T_15625_594,T_3125_119,T_15625_596,T_15625_597,T_15625_598,T_15625_599,T_625_24,T_15625_601,T_15625_602,T_15625_603,T_15625_604,T_3125_121,T_15625_606,T_15625_607,T_15625_608,T_15625_609,T_3125_122,T_15625_611,T_15625_612,T_15625_613,T_15625_614,T_3125_123,T_15625_616,T_15625_617,T_15625_618,T_15625_619,T_3125_124,T_15625_621,T_15625_622,T_15625_623,T_15625_624,T_2_0,T_15625_13,T_15625_26,T_15625_39,T_15625_52,T_3125_13,T_15625_78,T_15625_91,T_15625_104,T_15625_117,T_3125_26,T_15625_143,T_15625_156,T_15625_169,T_15625_182,T_3125_39,T_15625_208,T_15625_221,T_15625_234,T_15625_247,T_3125_52,T_15625_273,T_15625_286,T_15625_299,T_15625_312,T_625_13,T_15625_338,T_15625_351,T_15625_364,T_15625_377,T_3125_78,T_15625_403,T_15625_416,T_15625_429,T_15625_442,T_3125_91,T_15625_468,T_15625_481,T_15625_494,T_15625_507,T_3125_104,T_15625_533,T_15625_546,T_15625_559,T_15625_572,T_3125_117,T_15625_598,T_15625_611,T_15625_624,T_15625_637,T_625_26,T_15625_663,T_15625_676,T_15625_689,T_15625_702,T_3125_143,T_15625_728,T_15625_741,T_15625_754,T_15625_767,T_3125_156,T_15625_793,T_15625_806,T_15625_819,T_15625_832,T_3125_169,T_15625_858,T_15625_871,T_15625_884,T_15625_897,T_3125_182,T_15625_923,T_15625_936,T_15625_949,T_15625_962,T_
625_39,T_15625_988,T_15625_1001,T_15625_1014,T_15625_1027,T_3125_208,T_15625_1053,T_15625_1066,T_15625_1079,T_15625_1092,T_3125_221,T_15625_1118,T_15625_1131,T_15625_1144,T_15625_1157,T_3125_234,T_15625_1183,T_15625_1196,T_15625_1209,T_15625_1222,T_3125_247,T_15625_1248,T_15625_1261,T_15625_1274,T_15625_1287,T_625_52,T_15625_1313,T_15625_1326,T_15625_1339,T_15625_1352,T_3125_273,T_15625_1378,T_15625_1391,T_15625_1404,T_15625_1417,T_3125_286,T_15625_1443,T_15625_1456,T_15625_1469,T_15625_1482,T_3125_299,T_15625_1508,T_15625_1521,T_15625_1534,T_15625_1547,T_3125_312,T_15625_1573,T_15625_1586,T_15625_1599,T_15625_1612,T_125_13,T_15625_1638,T_15625_1651,T_15625_1664,T_15625_1677,T_3125_338,T_15625_1703,T_15625_1716,T_15625_1729,T_15625_1742,T_3125_351,T_15625_1768,T_15625_1781,T_15625_1794,T_15625_1807,T_3125_364,T_15625_1833,T_15625_1846,T_15625_1859,T_15625_1872,T_3125_377,T_15625_1898,T_15625_1911,T_15625_1924,T_15625_1937,T_625_78,T_15625_1963,T_15625_1976,T_15625_1989,T_15625_2002,T_3125_403,T_15625_2028,T_15625_2041,T_15625_2054,T_15625_2067,T_3125_416,T_15625_2093,T_15625_2106,T_15625_2119,T_15625_2132,T_3125_429,T_15625_2158,T_15625_2171,T_15625_2184,T_15625_2197,T_3125_442,T_15625_2223,T_15625_2236,T_15625_2249,T_15625_2262,T_625_91,T_15625_2288,T_15625_2301,T_15625_2314,T_15625_2327,T_3125_468,T_15625_2353,T_15625_2366,T_15625_2379,T_15625_2392,T_3125_481,T_15625_2418,T_15625_2431,T_15625_2444,T_15625_2457,T_3125_494,T_15625_2483,T_15625_2496,T_15625_2509,T_15625_2522,T_3125_507,T_15625_2548,T_15625_2561,T_15625_2574,T_15625_2587,T_625_104,T_15625_2613,T_15625_2626,T_15625_2639,T_15625_2652,T_3125_533,T_15625_2678,T_15625_2691,T_15625_2704,T_15625_2717,T_3125_546,T_15625_2743,T_15625_2756,T_15625_2769,T_15625_2782,T_3125_559,T_15625_2808,T_15625_2821,T_15625_2834,T_15625_2847,T_3125_572,T_15625_2873,T_15625_2886,T_15625_2899,T_15625_2912,T_625_117,T_15625_2938,T_15625_2951,T_15625_2964,T_15625_2977,T_3125_598,T_15625_3003,T_15625_3016,T_15625_3029,T_15625_3042
,T_3125_611,T_15625_3068,T_15625_3081,T_15625_3094,T_15625_3107,T_3125_624,T_15625_3133,T_15625_3146,T_15625_3159,T_15625_3172,T_3125_637,T_15625_3198,T_15625_3211,T_15625_3224,T_15625_3237,T_125_26,T_15625_3263,T_15625_3276,T_15625_3289,T_15625_3302,T_3125_663,T_15625_3328,T_15625_3341,T_15625_3354,T_15625_3367,T_3125_676,T_15625_3393,T_15625_3406,T_15625_3419,T_15625_3432,T_3125_689,T_15625_3458,T_15625_3471,T_15625_3484,T_15625_3497,T_3125_702,T_15625_3523,T_15625_3536,T_15625_3549,T_15625_3562,T_625_143,T_15625_3588,T_15625_3601,T_15625_3614,T_15625_3627,T_3125_728,T_15625_3653,T_15625_3666,T_15625_3679,T_15625_3692,T_3125_741,T_15625_3718,T_15625_3731,T_15625_3744,T_15625_3757,T_3125_754,T_15625_3783,T_15625_3796,T_15625_3809,T_15625_3822,T_3125_767,T_15625_3848,T_15625_3861,T_15625_3874,T_15625_3887,T_625_156,T_15625_3913,T_15625_3926,T_15625_3939,T_15625_3952,T_3125_793,T_15625_3978,T_15625_3991,T_15625_4004,T_15625_4017,T_3125_806,T_15625_4043,T_15625_4056,T_15625_4069,T_15625_4082,T_3125_819,T_15625_4108,T_15625_4121,T_15625_4134,T_15625_4147,T_3125_832,T_15625_4173,T_15625_4186,T_15625_4199,T_15625_4212,T_625_169,T_15625_4238,T_15625_4251,T_15625_4264,T_15625_4277,T_3125_858,T_15625_4303,T_15625_4316,T_15625_4329,T_15625_4342,T_3125_871,T_15625_4368,T_15625_4381,T_15625_4394,T_15625_4407,T_3125_884,T_15625_4433,T_15625_4446,T_15625_4459,T_15625_4472,T_3125_897,T_15625_4498,T_15625_4511,T_15625_4524,T_15625_4537,T_625_182,T_15625_4563,T_15625_4576,T_15625_4589,T_15625_4602,T_3125_923,T_15625_4628,T_15625_4641,T_15625_4654,T_15625_4667,T_3125_936,T_15625_4693,T_15625_4706,T_15625_4719,T_15625_4732,T_3125_949,T_15625_4758,T_15625_4771,T_15625_4784,T_15625_4797,T_3125_962,T_15625_4823,T_15625_4836,T_15625_4849,T_15625_4862,T_125_39,T_15625_4888,T_15625_4901,T_15625_4914,T_15625_4927,T_3125_988,T_15625_4953,T_15625_4966,T_15625_4979,T_15625_4992,T_3125_1001,T_15625_5018,T_15625_5031,T_15625_5044,T_15625_5057,T_3125_1014,T_15625_5083,T_15625_5096,T_15625_5109,T_
15625_5122,T_3125_1027,T_15625_5148,T_15625_5161,T_15625_5174,T_15625_5187,T_625_208,T_15625_5213,T_15625_5226,T_15625_5239,T_15625_5252,T_3125_1053,T_15625_5278,T_15625_5291,T_15625_5304,T_15625_5317,T_3125_1066,T_15625_5343,T_15625_5356,T_15625_5369,T_15625_5382,T_3125_1079,T_15625_5408,T_15625_5421,T_15625_5434,T_15625_5447,T_3125_1092,T_15625_5473,T_15625_5486,T_15625_5499,T_15625_5512,T_625_221,T_15625_5538,T_15625_5551,T_15625_5564,T_15625_5577,T_3125_1118,T_15625_5603,T_15625_5616,T_15625_5629,T_15625_5642,T_3125_1131,T_15625_5668,T_15625_5681,T_15625_5694,T_15625_5707,T_3125_1144,T_15625_5733,T_15625_5746,T_15625_5759,T_15625_5772,T_3125_1157,T_15625_5798,T_15625_5811,T_15625_5824,T_15625_5837,T_625_234,T_15625_5863,T_15625_5876,T_15625_5889,T_15625_5902,T_3125_1183,T_15625_5928,T_15625_5941,T_15625_5954,T_15625_5967,T_3125_1196,T_15625_5993,T_15625_6006,T_15625_6019,T_15625_6032,T_3125_1209,T_15625_6058,T_15625_6071,T_15625_6084,T_15625_6097,T_3125_1222,T_15625_6123,T_15625_6136,T_15625_6149,T_15625_6162,T_625_247,T_15625_6188,T_15625_6201,T_15625_6214,T_15625_6227,T_3125_1248,T_15625_6253,T_15625_6266,T_15625_6279,T_15625_6292,T_3125_1261,T_15625_6318,T_15625_6331,T_15625_6344,T_15625_6357,T_3125_1274,T_15625_6383,T_15625_6396,T_15625_6409,T_15625_6422,T_3125_1287,T_15625_6448,T_15625_6461,T_15625_6474,T_15625_6487,T_125_52,T_15625_6513,T_15625_6526,T_15625_6539,T_15625_6552,T_3125_1313,T_15625_6578,T_15625_6591,T_15625_6604,T_15625_6617,T_3125_1326,T_15625_6643,T_15625_6656,T_15625_6669,T_15625_6682,T_3125_1339,T_15625_6708,T_15625_6721,T_15625_6734,T_15625_6747,T_3125_1352,T_15625_6773,T_15625_6786,T_15625_6799,T_15625_6812,T_625_273,T_15625_6838,T_15625_6851,T_15625_6864,T_15625_6877,T_3125_1378,T_15625_6903,T_15625_6916,T_15625_6929,T_15625_6942,T_3125_1391,T_15625_6968,T_15625_6981,T_15625_6994,T_15625_7007,T_3125_1404,T_15625_7033,T_15625_7046,T_15625_7059,T_15625_7072,T_3125_1417,T_15625_7098,T_15625_7111,T_15625_7124,T_15625_7137,T_625_286,T_15625_
7163,T_15625_7176,T_15625_7189,T_15625_7202,T_3125_1443,T_15625_7228,T_15625_7241,T_15625_7254,T_15625_7267,T_3125_1456,T_15625_7293,T_15625_7306,T_15625_7319,T_15625_7332,T_3125_1469,T_15625_7358,T_15625_7371,T_15625_7384,T_15625_7397,T_3125_1482,T_15625_7423,T_15625_7436,T_15625_7449,T_15625_7462,T_625_299,T_15625_7488,T_15625_7501,T_15625_7514,T_15625_7527,T_3125_1508,T_15625_7553,T_15625_7566,T_15625_7579,T_15625_7592,T_3125_1521,T_15625_7618,T_15625_7631,T_15625_7644,T_15625_7657,T_3125_1534,T_15625_7683,T_15625_7696,T_15625_7709,T_15625_7722,T_3125_1547,T_15625_7748,T_15625_7761,T_15625_7774,T_15625_7787,T_625_312,T_15625_7813,T_15625_7826,T_15625_7839,T_15625_7852,T_3125_1573,T_15625_7878,T_15625_7891,T_15625_7904,T_15625_7917,T_3125_1586,T_15625_7943,T_15625_7956,T_15625_7969,T_15625_7982,T_3125_1599,T_15625_8008,T_15625_8021,T_15625_8034,T_15625_8047,T_3125_1612,T_15625_8073,T_15625_8086,T_15625_8099,T_15625_8112 +}; +static const __device__ float2 lut_sp_16_16384[1024*2] = { + 
T_2_0,T_16384_1,T_8192_1,T_16384_3,T_4096_1,T_16384_5,T_8192_3,T_16384_7,T_2048_1,T_16384_9,T_8192_5,T_16384_11,T_4096_3,T_16384_13,T_8192_7,T_16384_15,T_1024_1,T_16384_17,T_8192_9,T_16384_19,T_4096_5,T_16384_21,T_8192_11,T_16384_23,T_2048_3,T_16384_25,T_8192_13,T_16384_27,T_4096_7,T_16384_29,T_8192_15,T_16384_31,T_512_1,T_16384_33,T_8192_17,T_16384_35,T_4096_9,T_16384_37,T_8192_19,T_16384_39,T_2048_5,T_16384_41,T_8192_21,T_16384_43,T_4096_11,T_16384_45,T_8192_23,T_16384_47,T_1024_3,T_16384_49,T_8192_25,T_16384_51,T_4096_13,T_16384_53,T_8192_27,T_16384_55,T_2048_7,T_16384_57,T_8192_29,T_16384_59,T_4096_15,T_16384_61,T_8192_31,T_16384_63,T_256_1,T_16384_65,T_8192_33,T_16384_67,T_4096_17,T_16384_69,T_8192_35,T_16384_71,T_2048_9,T_16384_73,T_8192_37,T_16384_75,T_4096_19,T_16384_77,T_8192_39,T_16384_79,T_1024_5,T_16384_81,T_8192_41,T_16384_83,T_4096_21,T_16384_85,T_8192_43,T_16384_87,T_2048_11,T_16384_89,T_8192_45,T_16384_91,T_4096_23,T_16384_93,T_8192_47,T_16384_95,T_512_3,T_16384_97,T_8192_49,T_16384_99,T_4096_25,T_16384_101,T_8192_51,T_16384_103,T_2048_13,T_16384_105,T_8192_53,T_16384_107,T_4096_27,T_16384_109,T_8192_55,T_16384_111,T_1024_7,T_16384_113,T_8192_57,T_16384_115,T_4096_29,T_16384_117,T_8192_59,T_16384_119,T_2048_15,T_16384_121,T_8192_61,T_16384_123,T_4096_31,T_16384_125,T_8192_63,T_16384_127,T_128_1,T_16384_129,T_8192_65,T_16384_131,T_4096_33,T_16384_133,T_8192_67,T_16384_135,T_2048_17,T_16384_137,T_8192_69,T_16384_139,T_4096_35,T_16384_141,T_8192_71,T_16384_143,T_1024_9,T_16384_145,T_8192_73,T_16384_147,T_4096_37,T_16384_149,T_8192_75,T_16384_151,T_2048_19,T_16384_153,T_8192_77,T_16384_155,T_4096_39,T_16384_157,T_8192_79,T_16384_159,T_512_5,T_16384_161,T_8192_81,T_16384_163,T_4096_41,T_16384_165,T_8192_83,T_16384_167,T_2048_21,T_16384_169,T_8192_85,T_16384_171,T_4096_43,T_16384_173,T_8192_87,T_16384_175,T_1024_11,T_16384_177,T_8192_89,T_16384_179,T_4096_45,T_16384_181,T_8192_91,T_16384_183,T_2048_23,T_16384_185,T_8192_93,T_16384_187,T_4096_47,T_16384_189
,T_8192_95,T_16384_191,T_256_3,T_16384_193,T_8192_97,T_16384_195,T_4096_49,T_16384_197,T_8192_99,T_16384_199,T_2048_25,T_16384_201,T_8192_101,T_16384_203,T_4096_51,T_16384_205,T_8192_103,T_16384_207,T_1024_13,T_16384_209,T_8192_105,T_16384_211,T_4096_53,T_16384_213,T_8192_107,T_16384_215,T_2048_27,T_16384_217,T_8192_109,T_16384_219,T_4096_55,T_16384_221,T_8192_111,T_16384_223,T_512_7,T_16384_225,T_8192_113,T_16384_227,T_4096_57,T_16384_229,T_8192_115,T_16384_231,T_2048_29,T_16384_233,T_8192_117,T_16384_235,T_4096_59,T_16384_237,T_8192_119,T_16384_239,T_1024_15,T_16384_241,T_8192_121,T_16384_243,T_4096_61,T_16384_245,T_8192_123,T_16384_247,T_2048_31,T_16384_249,T_8192_125,T_16384_251,T_4096_63,T_16384_253,T_8192_127,T_16384_255,T_64_1,T_16384_257,T_8192_129,T_16384_259,T_4096_65,T_16384_261,T_8192_131,T_16384_263,T_2048_33,T_16384_265,T_8192_133,T_16384_267,T_4096_67,T_16384_269,T_8192_135,T_16384_271,T_1024_17,T_16384_273,T_8192_137,T_16384_275,T_4096_69,T_16384_277,T_8192_139,T_16384_279,T_2048_35,T_16384_281,T_8192_141,T_16384_283,T_4096_71,T_16384_285,T_8192_143,T_16384_287,T_512_9,T_16384_289,T_8192_145,T_16384_291,T_4096_73,T_16384_293,T_8192_147,T_16384_295,T_2048_37,T_16384_297,T_8192_149,T_16384_299,T_4096_75,T_16384_301,T_8192_151,T_16384_303,T_1024_19,T_16384_305,T_8192_153,T_16384_307,T_4096_77,T_16384_309,T_8192_155,T_16384_311,T_2048_39,T_16384_313,T_8192_157,T_16384_315,T_4096_79,T_16384_317,T_8192_159,T_16384_319,T_256_5,T_16384_321,T_8192_161,T_16384_323,T_4096_81,T_16384_325,T_8192_163,T_16384_327,T_2048_41,T_16384_329,T_8192_165,T_16384_331,T_4096_83,T_16384_333,T_8192_167,T_16384_335,T_1024_21,T_16384_337,T_8192_169,T_16384_339,T_4096_85,T_16384_341,T_8192_171,T_16384_343,T_2048_43,T_16384_345,T_8192_173,T_16384_347,T_4096_87,T_16384_349,T_8192_175,T_16384_351,T_512_11,T_16384_353,T_8192_177,T_16384_355,T_4096_89,T_16384_357,T_8192_179,T_16384_359,T_2048_45,T_16384_361,T_8192_181,T_16384_363,T_4096_91,T_16384_365,T_8192_183,T_16384_367,T_1024_23,T
_16384_369,T_8192_185,T_16384_371,T_4096_93,T_16384_373,T_8192_187,T_16384_375,T_2048_47,T_16384_377,T_8192_189,T_16384_379,T_4096_95,T_16384_381,T_8192_191,T_16384_383,T_128_3,T_16384_385,T_8192_193,T_16384_387,T_4096_97,T_16384_389,T_8192_195,T_16384_391,T_2048_49,T_16384_393,T_8192_197,T_16384_395,T_4096_99,T_16384_397,T_8192_199,T_16384_399,T_1024_25,T_16384_401,T_8192_201,T_16384_403,T_4096_101,T_16384_405,T_8192_203,T_16384_407,T_2048_51,T_16384_409,T_8192_205,T_16384_411,T_4096_103,T_16384_413,T_8192_207,T_16384_415,T_512_13,T_16384_417,T_8192_209,T_16384_419,T_4096_105,T_16384_421,T_8192_211,T_16384_423,T_2048_53,T_16384_425,T_8192_213,T_16384_427,T_4096_107,T_16384_429,T_8192_215,T_16384_431,T_1024_27,T_16384_433,T_8192_217,T_16384_435,T_4096_109,T_16384_437,T_8192_219,T_16384_439,T_2048_55,T_16384_441,T_8192_221,T_16384_443,T_4096_111,T_16384_445,T_8192_223,T_16384_447,T_256_7,T_16384_449,T_8192_225,T_16384_451,T_4096_113,T_16384_453,T_8192_227,T_16384_455,T_2048_57,T_16384_457,T_8192_229,T_16384_459,T_4096_115,T_16384_461,T_8192_231,T_16384_463,T_1024_29,T_16384_465,T_8192_233,T_16384_467,T_4096_117,T_16384_469,T_8192_235,T_16384_471,T_2048_59,T_16384_473,T_8192_237,T_16384_475,T_4096_119,T_16384_477,T_8192_239,T_16384_479,T_512_15,T_16384_481,T_8192_241,T_16384_483,T_4096_121,T_16384_485,T_8192_243,T_16384_487,T_2048_61,T_16384_489,T_8192_245,T_16384_491,T_4096_123,T_16384_493,T_8192_247,T_16384_495,T_1024_31,T_16384_497,T_8192_249,T_16384_499,T_4096_125,T_16384_501,T_8192_251,T_16384_503,T_2048_63,T_16384_505,T_8192_253,T_16384_507,T_4096_127,T_16384_509,T_8192_255,T_16384_511,T_32_1,T_16384_513,T_8192_257,T_16384_515,T_4096_129,T_16384_517,T_8192_259,T_16384_519,T_2048_65,T_16384_521,T_8192_261,T_16384_523,T_4096_131,T_16384_525,T_8192_263,T_16384_527,T_1024_33,T_16384_529,T_8192_265,T_16384_531,T_4096_133,T_16384_533,T_8192_267,T_16384_535,T_2048_67,T_16384_537,T_8192_269,T_16384_539,T_4096_135,T_16384_541,T_8192_271,T_16384_543,T_512_17,T_16384_545,T
_8192_273,T_16384_547,T_4096_137,T_16384_549,T_8192_275,T_16384_551,T_2048_69,T_16384_553,T_8192_277,T_16384_555,T_4096_139,T_16384_557,T_8192_279,T_16384_559,T_1024_35,T_16384_561,T_8192_281,T_16384_563,T_4096_141,T_16384_565,T_8192_283,T_16384_567,T_2048_71,T_16384_569,T_8192_285,T_16384_571,T_4096_143,T_16384_573,T_8192_287,T_16384_575,T_256_9,T_16384_577,T_8192_289,T_16384_579,T_4096_145,T_16384_581,T_8192_291,T_16384_583,T_2048_73,T_16384_585,T_8192_293,T_16384_587,T_4096_147,T_16384_589,T_8192_295,T_16384_591,T_1024_37,T_16384_593,T_8192_297,T_16384_595,T_4096_149,T_16384_597,T_8192_299,T_16384_599,T_2048_75,T_16384_601,T_8192_301,T_16384_603,T_4096_151,T_16384_605,T_8192_303,T_16384_607,T_512_19,T_16384_609,T_8192_305,T_16384_611,T_4096_153,T_16384_613,T_8192_307,T_16384_615,T_2048_77,T_16384_617,T_8192_309,T_16384_619,T_4096_155,T_16384_621,T_8192_311,T_16384_623,T_1024_39,T_16384_625,T_8192_313,T_16384_627,T_4096_157,T_16384_629,T_8192_315,T_16384_631,T_2048_79,T_16384_633,T_8192_317,T_16384_635,T_4096_159,T_16384_637,T_8192_319,T_16384_639,T_128_5,T_16384_641,T_8192_321,T_16384_643,T_4096_161,T_16384_645,T_8192_323,T_16384_647,T_2048_81,T_16384_649,T_8192_325,T_16384_651,T_4096_163,T_16384_653,T_8192_327,T_16384_655,T_1024_41,T_16384_657,T_8192_329,T_16384_659,T_4096_165,T_16384_661,T_8192_331,T_16384_663,T_2048_83,T_16384_665,T_8192_333,T_16384_667,T_4096_167,T_16384_669,T_8192_335,T_16384_671,T_512_21,T_16384_673,T_8192_337,T_16384_675,T_4096_169,T_16384_677,T_8192_339,T_16384_679,T_2048_85,T_16384_681,T_8192_341,T_16384_683,T_4096_171,T_16384_685,T_8192_343,T_16384_687,T_1024_43,T_16384_689,T_8192_345,T_16384_691,T_4096_173,T_16384_693,T_8192_347,T_16384_695,T_2048_87,T_16384_697,T_8192_349,T_16384_699,T_4096_175,T_16384_701,T_8192_351,T_16384_703,T_256_11,T_16384_705,T_8192_353,T_16384_707,T_4096_177,T_16384_709,T_8192_355,T_16384_711,T_2048_89,T_16384_713,T_8192_357,T_16384_715,T_4096_179,T_16384_717,T_8192_359,T_16384_719,T_1024_45,T_16384_721,T_8192
_361,T_16384_723,T_4096_181,T_16384_725,T_8192_363,T_16384_727,T_2048_91,T_16384_729,T_8192_365,T_16384_731,T_4096_183,T_16384_733,T_8192_367,T_16384_735,T_512_23,T_16384_737,T_8192_369,T_16384_739,T_4096_185,T_16384_741,T_8192_371,T_16384_743,T_2048_93,T_16384_745,T_8192_373,T_16384_747,T_4096_187,T_16384_749,T_8192_375,T_16384_751,T_1024_47,T_16384_753,T_8192_377,T_16384_755,T_4096_189,T_16384_757,T_8192_379,T_16384_759,T_2048_95,T_16384_761,T_8192_381,T_16384_763,T_4096_191,T_16384_765,T_8192_383,T_16384_767,T_64_3,T_16384_769,T_8192_385,T_16384_771,T_4096_193,T_16384_773,T_8192_387,T_16384_775,T_2048_97,T_16384_777,T_8192_389,T_16384_779,T_4096_195,T_16384_781,T_8192_391,T_16384_783,T_1024_49,T_16384_785,T_8192_393,T_16384_787,T_4096_197,T_16384_789,T_8192_395,T_16384_791,T_2048_99,T_16384_793,T_8192_397,T_16384_795,T_4096_199,T_16384_797,T_8192_399,T_16384_799,T_512_25,T_16384_801,T_8192_401,T_16384_803,T_4096_201,T_16384_805,T_8192_403,T_16384_807,T_2048_101,T_16384_809,T_8192_405,T_16384_811,T_4096_203,T_16384_813,T_8192_407,T_16384_815,T_1024_51,T_16384_817,T_8192_409,T_16384_819,T_4096_205,T_16384_821,T_8192_411,T_16384_823,T_2048_103,T_16384_825,T_8192_413,T_16384_827,T_4096_207,T_16384_829,T_8192_415,T_16384_831,T_256_13,T_16384_833,T_8192_417,T_16384_835,T_4096_209,T_16384_837,T_8192_419,T_16384_839,T_2048_105,T_16384_841,T_8192_421,T_16384_843,T_4096_211,T_16384_845,T_8192_423,T_16384_847,T_1024_53,T_16384_849,T_8192_425,T_16384_851,T_4096_213,T_16384_853,T_8192_427,T_16384_855,T_2048_107,T_16384_857,T_8192_429,T_16384_859,T_4096_215,T_16384_861,T_8192_431,T_16384_863,T_512_27,T_16384_865,T_8192_433,T_16384_867,T_4096_217,T_16384_869,T_8192_435,T_16384_871,T_2048_109,T_16384_873,T_8192_437,T_16384_875,T_4096_219,T_16384_877,T_8192_439,T_16384_879,T_1024_55,T_16384_881,T_8192_441,T_16384_883,T_4096_221,T_16384_885,T_8192_443,T_16384_887,T_2048_111,T_16384_889,T_8192_445,T_16384_891,T_4096_223,T_16384_893,T_8192_447,T_16384_895,T_128_7,T_16384_897,T_8192_
449,T_16384_899,T_4096_225,T_16384_901,T_8192_451,T_16384_903,T_2048_113,T_16384_905,T_8192_453,T_16384_907,T_4096_227,T_16384_909,T_8192_455,T_16384_911,T_1024_57,T_16384_913,T_8192_457,T_16384_915,T_4096_229,T_16384_917,T_8192_459,T_16384_919,T_2048_115,T_16384_921,T_8192_461,T_16384_923,T_4096_231,T_16384_925,T_8192_463,T_16384_927,T_512_29,T_16384_929,T_8192_465,T_16384_931,T_4096_233,T_16384_933,T_8192_467,T_16384_935,T_2048_117,T_16384_937,T_8192_469,T_16384_939,T_4096_235,T_16384_941,T_8192_471,T_16384_943,T_1024_59,T_16384_945,T_8192_473,T_16384_947,T_4096_237,T_16384_949,T_8192_475,T_16384_951,T_2048_119,T_16384_953,T_8192_477,T_16384_955,T_4096_239,T_16384_957,T_8192_479,T_16384_959,T_256_15,T_16384_961,T_8192_481,T_16384_963,T_4096_241,T_16384_965,T_8192_483,T_16384_967,T_2048_121,T_16384_969,T_8192_485,T_16384_971,T_4096_243,T_16384_973,T_8192_487,T_16384_975,T_1024_61,T_16384_977,T_8192_489,T_16384_979,T_4096_245,T_16384_981,T_8192_491,T_16384_983,T_2048_123,T_16384_985,T_8192_493,T_16384_987,T_4096_247,T_16384_989,T_8192_495,T_16384_991,T_512_31,T_16384_993,T_8192_497,T_16384_995,T_4096_249,T_16384_997,T_8192_499,T_16384_999,T_2048_125,T_16384_1001,T_8192_501,T_16384_1003,T_4096_251,T_16384_1005,T_8192_503,T_16384_1007,T_1024_63,T_16384_1009,T_8192_505,T_16384_1011,T_4096_253,T_16384_1013,T_8192_507,T_16384_1015,T_2048_127,T_16384_1017,T_8192_509,T_16384_1019,T_4096_255,T_16384_1021,T_8192_511,T_16384_1023,T_2_0,T_16384_9,T_8192_9,T_16384_27,T_4096_9,T_16384_45,T_8192_27,T_16384_63,T_2048_9,T_16384_81,T_8192_45,T_16384_99,T_4096_27,T_16384_117,T_8192_63,T_16384_135,T_1024_9,T_16384_153,T_8192_81,T_16384_171,T_4096_45,T_16384_189,T_8192_99,T_16384_207,T_2048_27,T_16384_225,T_8192_117,T_16384_243,T_4096_63,T_16384_261,T_8192_135,T_16384_279,T_512_9,T_16384_297,T_8192_153,T_16384_315,T_4096_81,T_16384_333,T_8192_171,T_16384_351,T_2048_45,T_16384_369,T_8192_189,T_16384_387,T_4096_99,T_16384_405,T_8192_207,T_16384_423,T_1024_27,T_16384_441,T_8192_225,T_1638
4_459,T_4096_117,T_16384_477,T_8192_243,T_16384_495,T_2048_63,T_16384_513,T_8192_261,T_16384_531,T_4096_135,T_16384_549,T_8192_279,T_16384_567,T_256_9,T_16384_585,T_8192_297,T_16384_603,T_4096_153,T_16384_621,T_8192_315,T_16384_639,T_2048_81,T_16384_657,T_8192_333,T_16384_675,T_4096_171,T_16384_693,T_8192_351,T_16384_711,T_1024_45,T_16384_729,T_8192_369,T_16384_747,T_4096_189,T_16384_765,T_8192_387,T_16384_783,T_2048_99,T_16384_801,T_8192_405,T_16384_819,T_4096_207,T_16384_837,T_8192_423,T_16384_855,T_512_27,T_16384_873,T_8192_441,T_16384_891,T_4096_225,T_16384_909,T_8192_459,T_16384_927,T_2048_117,T_16384_945,T_8192_477,T_16384_963,T_4096_243,T_16384_981,T_8192_495,T_16384_999,T_1024_63,T_16384_1017,T_8192_513,T_16384_1035,T_4096_261,T_16384_1053,T_8192_531,T_16384_1071,T_2048_135,T_16384_1089,T_8192_549,T_16384_1107,T_4096_279,T_16384_1125,T_8192_567,T_16384_1143,T_128_9,T_16384_1161,T_8192_585,T_16384_1179,T_4096_297,T_16384_1197,T_8192_603,T_16384_1215,T_2048_153,T_16384_1233,T_8192_621,T_16384_1251,T_4096_315,T_16384_1269,T_8192_639,T_16384_1287,T_1024_81,T_16384_1305,T_8192_657,T_16384_1323,T_4096_333,T_16384_1341,T_8192_675,T_16384_1359,T_2048_171,T_16384_1377,T_8192_693,T_16384_1395,T_4096_351,T_16384_1413,T_8192_711,T_16384_1431,T_512_45,T_16384_1449,T_8192_729,T_16384_1467,T_4096_369,T_16384_1485,T_8192_747,T_16384_1503,T_2048_189,T_16384_1521,T_8192_765,T_16384_1539,T_4096_387,T_16384_1557,T_8192_783,T_16384_1575,T_1024_99,T_16384_1593,T_8192_801,T_16384_1611,T_4096_405,T_16384_1629,T_8192_819,T_16384_1647,T_2048_207,T_16384_1665,T_8192_837,T_16384_1683,T_4096_423,T_16384_1701,T_8192_855,T_16384_1719,T_256_27,T_16384_1737,T_8192_873,T_16384_1755,T_4096_441,T_16384_1773,T_8192_891,T_16384_1791,T_2048_225,T_16384_1809,T_8192_909,T_16384_1827,T_4096_459,T_16384_1845,T_8192_927,T_16384_1863,T_1024_117,T_16384_1881,T_8192_945,T_16384_1899,T_4096_477,T_16384_1917,T_8192_963,T_16384_1935,T_2048_243,T_16384_1953,T_8192_981,T_16384_1971,T_4096_495,T_16384_1989,T_8
192_999,T_16384_2007,T_512_63,T_16384_2025,T_8192_1017,T_16384_2043,T_4096_513,T_16384_2061,T_8192_1035,T_16384_2079,T_2048_261,T_16384_2097,T_8192_1053,T_16384_2115,T_4096_531,T_16384_2133,T_8192_1071,T_16384_2151,T_1024_135,T_16384_2169,T_8192_1089,T_16384_2187,T_4096_549,T_16384_2205,T_8192_1107,T_16384_2223,T_2048_279,T_16384_2241,T_8192_1125,T_16384_2259,T_4096_567,T_16384_2277,T_8192_1143,T_16384_2295,T_64_9,T_16384_2313,T_8192_1161,T_16384_2331,T_4096_585,T_16384_2349,T_8192_1179,T_16384_2367,T_2048_297,T_16384_2385,T_8192_1197,T_16384_2403,T_4096_603,T_16384_2421,T_8192_1215,T_16384_2439,T_1024_153,T_16384_2457,T_8192_1233,T_16384_2475,T_4096_621,T_16384_2493,T_8192_1251,T_16384_2511,T_2048_315,T_16384_2529,T_8192_1269,T_16384_2547,T_4096_639,T_16384_2565,T_8192_1287,T_16384_2583,T_512_81,T_16384_2601,T_8192_1305,T_16384_2619,T_4096_657,T_16384_2637,T_8192_1323,T_16384_2655,T_2048_333,T_16384_2673,T_8192_1341,T_16384_2691,T_4096_675,T_16384_2709,T_8192_1359,T_16384_2727,T_1024_171,T_16384_2745,T_8192_1377,T_16384_2763,T_4096_693,T_16384_2781,T_8192_1395,T_16384_2799,T_2048_351,T_16384_2817,T_8192_1413,T_16384_2835,T_4096_711,T_16384_2853,T_8192_1431,T_16384_2871,T_256_45,T_16384_2889,T_8192_1449,T_16384_2907,T_4096_729,T_16384_2925,T_8192_1467,T_16384_2943,T_2048_369,T_16384_2961,T_8192_1485,T_16384_2979,T_4096_747,T_16384_2997,T_8192_1503,T_16384_3015,T_1024_189,T_16384_3033,T_8192_1521,T_16384_3051,T_4096_765,T_16384_3069,T_8192_1539,T_16384_3087,T_2048_387,T_16384_3105,T_8192_1557,T_16384_3123,T_4096_783,T_16384_3141,T_8192_1575,T_16384_3159,T_512_99,T_16384_3177,T_8192_1593,T_16384_3195,T_4096_801,T_16384_3213,T_8192_1611,T_16384_3231,T_2048_405,T_16384_3249,T_8192_1629,T_16384_3267,T_4096_819,T_16384_3285,T_8192_1647,T_16384_3303,T_1024_207,T_16384_3321,T_8192_1665,T_16384_3339,T_4096_837,T_16384_3357,T_8192_1683,T_16384_3375,T_2048_423,T_16384_3393,T_8192_1701,T_16384_3411,T_4096_855,T_16384_3429,T_8192_1719,T_16384_3447,T_128_27,T_16384_3465,T_8192_17
37,T_16384_3483,T_4096_873,T_16384_3501,T_8192_1755,T_16384_3519,T_2048_441,T_16384_3537,T_8192_1773,T_16384_3555,T_4096_891,T_16384_3573,T_8192_1791,T_16384_3591,T_1024_225,T_16384_3609,T_8192_1809,T_16384_3627,T_4096_909,T_16384_3645,T_8192_1827,T_16384_3663,T_2048_459,T_16384_3681,T_8192_1845,T_16384_3699,T_4096_927,T_16384_3717,T_8192_1863,T_16384_3735,T_512_117,T_16384_3753,T_8192_1881,T_16384_3771,T_4096_945,T_16384_3789,T_8192_1899,T_16384_3807,T_2048_477,T_16384_3825,T_8192_1917,T_16384_3843,T_4096_963,T_16384_3861,T_8192_1935,T_16384_3879,T_1024_243,T_16384_3897,T_8192_1953,T_16384_3915,T_4096_981,T_16384_3933,T_8192_1971,T_16384_3951,T_2048_495,T_16384_3969,T_8192_1989,T_16384_3987,T_4096_999,T_16384_4005,T_8192_2007,T_16384_4023,T_256_63,T_16384_4041,T_8192_2025,T_16384_4059,T_4096_1017,T_16384_4077,T_8192_2043,T_16384_4095,T_2048_513,T_16384_4113,T_8192_2061,T_16384_4131,T_4096_1035,T_16384_4149,T_8192_2079,T_16384_4167,T_1024_261,T_16384_4185,T_8192_2097,T_16384_4203,T_4096_1053,T_16384_4221,T_8192_2115,T_16384_4239,T_2048_531,T_16384_4257,T_8192_2133,T_16384_4275,T_4096_1071,T_16384_4293,T_8192_2151,T_16384_4311,T_512_135,T_16384_4329,T_8192_2169,T_16384_4347,T_4096_1089,T_16384_4365,T_8192_2187,T_16384_4383,T_2048_549,T_16384_4401,T_8192_2205,T_16384_4419,T_4096_1107,T_16384_4437,T_8192_2223,T_16384_4455,T_1024_279,T_16384_4473,T_8192_2241,T_16384_4491,T_4096_1125,T_16384_4509,T_8192_2259,T_16384_4527,T_2048_567,T_16384_4545,T_8192_2277,T_16384_4563,T_4096_1143,T_16384_4581,T_8192_2295,T_16384_4599,T_32_9,T_16384_4617,T_8192_2313,T_16384_4635,T_4096_1161,T_16384_4653,T_8192_2331,T_16384_4671,T_2048_585,T_16384_4689,T_8192_2349,T_16384_4707,T_4096_1179,T_16384_4725,T_8192_2367,T_16384_4743,T_1024_297,T_16384_4761,T_8192_2385,T_16384_4779,T_4096_1197,T_16384_4797,T_8192_2403,T_16384_4815,T_2048_603,T_16384_4833,T_8192_2421,T_16384_4851,T_4096_1215,T_16384_4869,T_8192_2439,T_16384_4887,T_512_153,T_16384_4905,T_8192_2457,T_16384_4923,T_4096_1233,T_16384_4
941,T_8192_2475,T_16384_4959,T_2048_621,T_16384_4977,T_8192_2493,T_16384_4995,T_4096_1251,T_16384_5013,T_8192_2511,T_16384_5031,T_1024_315,T_16384_5049,T_8192_2529,T_16384_5067,T_4096_1269,T_16384_5085,T_8192_2547,T_16384_5103,T_2048_639,T_16384_5121,T_8192_2565,T_16384_5139,T_4096_1287,T_16384_5157,T_8192_2583,T_16384_5175,T_256_81,T_16384_5193,T_8192_2601,T_16384_5211,T_4096_1305,T_16384_5229,T_8192_2619,T_16384_5247,T_2048_657,T_16384_5265,T_8192_2637,T_16384_5283,T_4096_1323,T_16384_5301,T_8192_2655,T_16384_5319,T_1024_333,T_16384_5337,T_8192_2673,T_16384_5355,T_4096_1341,T_16384_5373,T_8192_2691,T_16384_5391,T_2048_675,T_16384_5409,T_8192_2709,T_16384_5427,T_4096_1359,T_16384_5445,T_8192_2727,T_16384_5463,T_512_171,T_16384_5481,T_8192_2745,T_16384_5499,T_4096_1377,T_16384_5517,T_8192_2763,T_16384_5535,T_2048_693,T_16384_5553,T_8192_2781,T_16384_5571,T_4096_1395,T_16384_5589,T_8192_2799,T_16384_5607,T_1024_351,T_16384_5625,T_8192_2817,T_16384_5643,T_4096_1413,T_16384_5661,T_8192_2835,T_16384_5679,T_2048_711,T_16384_5697,T_8192_2853,T_16384_5715,T_4096_1431,T_16384_5733,T_8192_2871,T_16384_5751,T_128_45,T_16384_5769,T_8192_2889,T_16384_5787,T_4096_1449,T_16384_5805,T_8192_2907,T_16384_5823,T_2048_729,T_16384_5841,T_8192_2925,T_16384_5859,T_4096_1467,T_16384_5877,T_8192_2943,T_16384_5895,T_1024_369,T_16384_5913,T_8192_2961,T_16384_5931,T_4096_1485,T_16384_5949,T_8192_2979,T_16384_5967,T_2048_747,T_16384_5985,T_8192_2997,T_16384_6003,T_4096_1503,T_16384_6021,T_8192_3015,T_16384_6039,T_512_189,T_16384_6057,T_8192_3033,T_16384_6075,T_4096_1521,T_16384_6093,T_8192_3051,T_16384_6111,T_2048_765,T_16384_6129,T_8192_3069,T_16384_6147,T_4096_1539,T_16384_6165,T_8192_3087,T_16384_6183,T_1024_387,T_16384_6201,T_8192_3105,T_16384_6219,T_4096_1557,T_16384_6237,T_8192_3123,T_16384_6255,T_2048_783,T_16384_6273,T_8192_3141,T_16384_6291,T_4096_1575,T_16384_6309,T_8192_3159,T_16384_6327,T_256_99,T_16384_6345,T_8192_3177,T_16384_6363,T_4096_1593,T_16384_6381,T_8192_3195,T_16384_6399
,T_2048_801,T_16384_6417,T_8192_3213,T_16384_6435,T_4096_1611,T_16384_6453,T_8192_3231,T_16384_6471,T_1024_405,T_16384_6489,T_8192_3249,T_16384_6507,T_4096_1629,T_16384_6525,T_8192_3267,T_16384_6543,T_2048_819,T_16384_6561,T_8192_3285,T_16384_6579,T_4096_1647,T_16384_6597,T_8192_3303,T_16384_6615,T_512_207,T_16384_6633,T_8192_3321,T_16384_6651,T_4096_1665,T_16384_6669,T_8192_3339,T_16384_6687,T_2048_837,T_16384_6705,T_8192_3357,T_16384_6723,T_4096_1683,T_16384_6741,T_8192_3375,T_16384_6759,T_1024_423,T_16384_6777,T_8192_3393,T_16384_6795,T_4096_1701,T_16384_6813,T_8192_3411,T_16384_6831,T_2048_855,T_16384_6849,T_8192_3429,T_16384_6867,T_4096_1719,T_16384_6885,T_8192_3447,T_16384_6903,T_64_27,T_16384_6921,T_8192_3465,T_16384_6939,T_4096_1737,T_16384_6957,T_8192_3483,T_16384_6975,T_2048_873,T_16384_6993,T_8192_3501,T_16384_7011,T_4096_1755,T_16384_7029,T_8192_3519,T_16384_7047,T_1024_441,T_16384_7065,T_8192_3537,T_16384_7083,T_4096_1773,T_16384_7101,T_8192_3555,T_16384_7119,T_2048_891,T_16384_7137,T_8192_3573,T_16384_7155,T_4096_1791,T_16384_7173,T_8192_3591,T_16384_7191,T_512_225,T_16384_7209,T_8192_3609,T_16384_7227,T_4096_1809,T_16384_7245,T_8192_3627,T_16384_7263,T_2048_909,T_16384_7281,T_8192_3645,T_16384_7299,T_4096_1827,T_16384_7317,T_8192_3663,T_16384_7335,T_1024_459,T_16384_7353,T_8192_3681,T_16384_7371,T_4096_1845,T_16384_7389,T_8192_3699,T_16384_7407,T_2048_927,T_16384_7425,T_8192_3717,T_16384_7443,T_4096_1863,T_16384_7461,T_8192_3735,T_16384_7479,T_256_117,T_16384_7497,T_8192_3753,T_16384_7515,T_4096_1881,T_16384_7533,T_8192_3771,T_16384_7551,T_2048_945,T_16384_7569,T_8192_3789,T_16384_7587,T_4096_1899,T_16384_7605,T_8192_3807,T_16384_7623,T_1024_477,T_16384_7641,T_8192_3825,T_16384_7659,T_4096_1917,T_16384_7677,T_8192_3843,T_16384_7695,T_2048_963,T_16384_7713,T_8192_3861,T_16384_7731,T_4096_1935,T_16384_7749,T_8192_3879,T_16384_7767,T_512_243,T_16384_7785,T_8192_3897,T_16384_7803,T_4096_1953,T_16384_7821,T_8192_3915,T_16384_7839,T_2048_981,T_16384_7857,T_
8192_3933,T_16384_7875,T_4096_1971,T_16384_7893,T_8192_3951,T_16384_7911,T_1024_495,T_16384_7929,T_8192_3969,T_16384_7947,T_4096_1989,T_16384_7965,T_8192_3987,T_16384_7983,T_2048_999,T_16384_8001,T_8192_4005,T_16384_8019,T_4096_2007,T_16384_8037,T_8192_4023,T_16384_8055,T_128_63,T_16384_8073,T_8192_4041,T_16384_8091,T_4096_2025,T_16384_8109,T_8192_4059,T_16384_8127,T_2048_1017,T_16384_8145,T_8192_4077,T_16384_8163,T_4096_2043,T_16384_8181,T_8192_4095,T_16384_8199,T_1024_513,T_16384_8217,T_8192_4113,T_16384_8235,T_4096_2061,T_16384_8253,T_8192_4131,T_16384_8271,T_2048_1035,T_16384_8289,T_8192_4149,T_16384_8307,T_4096_2079,T_16384_8325,T_8192_4167,T_16384_8343,T_512_261,T_16384_8361,T_8192_4185,T_16384_8379,T_4096_2097,T_16384_8397,T_8192_4203,T_16384_8415,T_2048_1053,T_16384_8433,T_8192_4221,T_16384_8451,T_4096_2115,T_16384_8469,T_8192_4239,T_16384_8487,T_1024_531,T_16384_8505,T_8192_4257,T_16384_8523,T_4096_2133,T_16384_8541,T_8192_4275,T_16384_8559,T_2048_1071,T_16384_8577,T_8192_4293,T_16384_8595,T_4096_2151,T_16384_8613,T_8192_4311,T_16384_8631,T_256_135,T_16384_8649,T_8192_4329,T_16384_8667,T_4096_2169,T_16384_8685,T_8192_4347,T_16384_8703,T_2048_1089,T_16384_8721,T_8192_4365,T_16384_8739,T_4096_2187,T_16384_8757,T_8192_4383,T_16384_8775,T_1024_549,T_16384_8793,T_8192_4401,T_16384_8811,T_4096_2205,T_16384_8829,T_8192_4419,T_16384_8847,T_2048_1107,T_16384_8865,T_8192_4437,T_16384_8883,T_4096_2223,T_16384_8901,T_8192_4455,T_16384_8919,T_512_279,T_16384_8937,T_8192_4473,T_16384_8955,T_4096_2241,T_16384_8973,T_8192_4491,T_16384_8991,T_2048_1125,T_16384_9009,T_8192_4509,T_16384_9027,T_4096_2259,T_16384_9045,T_8192_4527,T_16384_9063,T_1024_567,T_16384_9081,T_8192_4545,T_16384_9099,T_4096_2277,T_16384_9117,T_8192_4563,T_16384_9135,T_2048_1143,T_16384_9153,T_8192_4581,T_16384_9171,T_4096_2295,T_16384_9189,T_8192_4599,T_16384_9207 +}; +static const __device__ float2 lut_sp_32_16384[512*2] = { + 
T_2_0,T_16384_1,T_8192_1,T_16384_3,T_4096_1,T_16384_5,T_8192_3,T_16384_7,T_2048_1,T_16384_9,T_8192_5,T_16384_11,T_4096_3,T_16384_13,T_8192_7,T_16384_15,T_1024_1,T_16384_17,T_8192_9,T_16384_19,T_4096_5,T_16384_21,T_8192_11,T_16384_23,T_2048_3,T_16384_25,T_8192_13,T_16384_27,T_4096_7,T_16384_29,T_8192_15,T_16384_31,T_512_1,T_16384_33,T_8192_17,T_16384_35,T_4096_9,T_16384_37,T_8192_19,T_16384_39,T_2048_5,T_16384_41,T_8192_21,T_16384_43,T_4096_11,T_16384_45,T_8192_23,T_16384_47,T_1024_3,T_16384_49,T_8192_25,T_16384_51,T_4096_13,T_16384_53,T_8192_27,T_16384_55,T_2048_7,T_16384_57,T_8192_29,T_16384_59,T_4096_15,T_16384_61,T_8192_31,T_16384_63,T_256_1,T_16384_65,T_8192_33,T_16384_67,T_4096_17,T_16384_69,T_8192_35,T_16384_71,T_2048_9,T_16384_73,T_8192_37,T_16384_75,T_4096_19,T_16384_77,T_8192_39,T_16384_79,T_1024_5,T_16384_81,T_8192_41,T_16384_83,T_4096_21,T_16384_85,T_8192_43,T_16384_87,T_2048_11,T_16384_89,T_8192_45,T_16384_91,T_4096_23,T_16384_93,T_8192_47,T_16384_95,T_512_3,T_16384_97,T_8192_49,T_16384_99,T_4096_25,T_16384_101,T_8192_51,T_16384_103,T_2048_13,T_16384_105,T_8192_53,T_16384_107,T_4096_27,T_16384_109,T_8192_55,T_16384_111,T_1024_7,T_16384_113,T_8192_57,T_16384_115,T_4096_29,T_16384_117,T_8192_59,T_16384_119,T_2048_15,T_16384_121,T_8192_61,T_16384_123,T_4096_31,T_16384_125,T_8192_63,T_16384_127,T_128_1,T_16384_129,T_8192_65,T_16384_131,T_4096_33,T_16384_133,T_8192_67,T_16384_135,T_2048_17,T_16384_137,T_8192_69,T_16384_139,T_4096_35,T_16384_141,T_8192_71,T_16384_143,T_1024_9,T_16384_145,T_8192_73,T_16384_147,T_4096_37,T_16384_149,T_8192_75,T_16384_151,T_2048_19,T_16384_153,T_8192_77,T_16384_155,T_4096_39,T_16384_157,T_8192_79,T_16384_159,T_512_5,T_16384_161,T_8192_81,T_16384_163,T_4096_41,T_16384_165,T_8192_83,T_16384_167,T_2048_21,T_16384_169,T_8192_85,T_16384_171,T_4096_43,T_16384_173,T_8192_87,T_16384_175,T_1024_11,T_16384_177,T_8192_89,T_16384_179,T_4096_45,T_16384_181,T_8192_91,T_16384_183,T_2048_23,T_16384_185,T_8192_93,T_16384_187,T_4096_47,T_16384_189
,T_8192_95,T_16384_191,T_256_3,T_16384_193,T_8192_97,T_16384_195,T_4096_49,T_16384_197,T_8192_99,T_16384_199,T_2048_25,T_16384_201,T_8192_101,T_16384_203,T_4096_51,T_16384_205,T_8192_103,T_16384_207,T_1024_13,T_16384_209,T_8192_105,T_16384_211,T_4096_53,T_16384_213,T_8192_107,T_16384_215,T_2048_27,T_16384_217,T_8192_109,T_16384_219,T_4096_55,T_16384_221,T_8192_111,T_16384_223,T_512_7,T_16384_225,T_8192_113,T_16384_227,T_4096_57,T_16384_229,T_8192_115,T_16384_231,T_2048_29,T_16384_233,T_8192_117,T_16384_235,T_4096_59,T_16384_237,T_8192_119,T_16384_239,T_1024_15,T_16384_241,T_8192_121,T_16384_243,T_4096_61,T_16384_245,T_8192_123,T_16384_247,T_2048_31,T_16384_249,T_8192_125,T_16384_251,T_4096_63,T_16384_253,T_8192_127,T_16384_255,T_64_1,T_16384_257,T_8192_129,T_16384_259,T_4096_65,T_16384_261,T_8192_131,T_16384_263,T_2048_33,T_16384_265,T_8192_133,T_16384_267,T_4096_67,T_16384_269,T_8192_135,T_16384_271,T_1024_17,T_16384_273,T_8192_137,T_16384_275,T_4096_69,T_16384_277,T_8192_139,T_16384_279,T_2048_35,T_16384_281,T_8192_141,T_16384_283,T_4096_71,T_16384_285,T_8192_143,T_16384_287,T_512_9,T_16384_289,T_8192_145,T_16384_291,T_4096_73,T_16384_293,T_8192_147,T_16384_295,T_2048_37,T_16384_297,T_8192_149,T_16384_299,T_4096_75,T_16384_301,T_8192_151,T_16384_303,T_1024_19,T_16384_305,T_8192_153,T_16384_307,T_4096_77,T_16384_309,T_8192_155,T_16384_311,T_2048_39,T_16384_313,T_8192_157,T_16384_315,T_4096_79,T_16384_317,T_8192_159,T_16384_319,T_256_5,T_16384_321,T_8192_161,T_16384_323,T_4096_81,T_16384_325,T_8192_163,T_16384_327,T_2048_41,T_16384_329,T_8192_165,T_16384_331,T_4096_83,T_16384_333,T_8192_167,T_16384_335,T_1024_21,T_16384_337,T_8192_169,T_16384_339,T_4096_85,T_16384_341,T_8192_171,T_16384_343,T_2048_43,T_16384_345,T_8192_173,T_16384_347,T_4096_87,T_16384_349,T_8192_175,T_16384_351,T_512_11,T_16384_353,T_8192_177,T_16384_355,T_4096_89,T_16384_357,T_8192_179,T_16384_359,T_2048_45,T_16384_361,T_8192_181,T_16384_363,T_4096_91,T_16384_365,T_8192_183,T_16384_367,T_1024_23,T
_16384_369,T_8192_185,T_16384_371,T_4096_93,T_16384_373,T_8192_187,T_16384_375,T_2048_47,T_16384_377,T_8192_189,T_16384_379,T_4096_95,T_16384_381,T_8192_191,T_16384_383,T_128_3,T_16384_385,T_8192_193,T_16384_387,T_4096_97,T_16384_389,T_8192_195,T_16384_391,T_2048_49,T_16384_393,T_8192_197,T_16384_395,T_4096_99,T_16384_397,T_8192_199,T_16384_399,T_1024_25,T_16384_401,T_8192_201,T_16384_403,T_4096_101,T_16384_405,T_8192_203,T_16384_407,T_2048_51,T_16384_409,T_8192_205,T_16384_411,T_4096_103,T_16384_413,T_8192_207,T_16384_415,T_512_13,T_16384_417,T_8192_209,T_16384_419,T_4096_105,T_16384_421,T_8192_211,T_16384_423,T_2048_53,T_16384_425,T_8192_213,T_16384_427,T_4096_107,T_16384_429,T_8192_215,T_16384_431,T_1024_27,T_16384_433,T_8192_217,T_16384_435,T_4096_109,T_16384_437,T_8192_219,T_16384_439,T_2048_55,T_16384_441,T_8192_221,T_16384_443,T_4096_111,T_16384_445,T_8192_223,T_16384_447,T_256_7,T_16384_449,T_8192_225,T_16384_451,T_4096_113,T_16384_453,T_8192_227,T_16384_455,T_2048_57,T_16384_457,T_8192_229,T_16384_459,T_4096_115,T_16384_461,T_8192_231,T_16384_463,T_1024_29,T_16384_465,T_8192_233,T_16384_467,T_4096_117,T_16384_469,T_8192_235,T_16384_471,T_2048_59,T_16384_473,T_8192_237,T_16384_475,T_4096_119,T_16384_477,T_8192_239,T_16384_479,T_512_15,T_16384_481,T_8192_241,T_16384_483,T_4096_121,T_16384_485,T_8192_243,T_16384_487,T_2048_61,T_16384_489,T_8192_245,T_16384_491,T_4096_123,T_16384_493,T_8192_247,T_16384_495,T_1024_31,T_16384_497,T_8192_249,T_16384_499,T_4096_125,T_16384_501,T_8192_251,T_16384_503,T_2048_63,T_16384_505,T_8192_253,T_16384_507,T_4096_127,T_16384_509,T_8192_255,T_16384_511,T_2_0,T_16384_17,T_8192_17,T_16384_51,T_4096_17,T_16384_85,T_8192_51,T_16384_119,T_2048_17,T_16384_153,T_8192_85,T_16384_187,T_4096_51,T_16384_221,T_8192_119,T_16384_255,T_1024_17,T_16384_289,T_8192_153,T_16384_323,T_4096_85,T_16384_357,T_8192_187,T_16384_391,T_2048_51,T_16384_425,T_8192_221,T_16384_459,T_4096_119,T_16384_493,T_8192_255,T_16384_527,T_512_17,T_16384_561,T_8192_289,
T_16384_595,T_4096_153,T_16384_629,T_8192_323,T_16384_663,T_2048_85,T_16384_697,T_8192_357,T_16384_731,T_4096_187,T_16384_765,T_8192_391,T_16384_799,T_1024_51,T_16384_833,T_8192_425,T_16384_867,T_4096_221,T_16384_901,T_8192_459,T_16384_935,T_2048_119,T_16384_969,T_8192_493,T_16384_1003,T_4096_255,T_16384_1037,T_8192_527,T_16384_1071,T_256_17,T_16384_1105,T_8192_561,T_16384_1139,T_4096_289,T_16384_1173,T_8192_595,T_16384_1207,T_2048_153,T_16384_1241,T_8192_629,T_16384_1275,T_4096_323,T_16384_1309,T_8192_663,T_16384_1343,T_1024_85,T_16384_1377,T_8192_697,T_16384_1411,T_4096_357,T_16384_1445,T_8192_731,T_16384_1479,T_2048_187,T_16384_1513,T_8192_765,T_16384_1547,T_4096_391,T_16384_1581,T_8192_799,T_16384_1615,T_512_51,T_16384_1649,T_8192_833,T_16384_1683,T_4096_425,T_16384_1717,T_8192_867,T_16384_1751,T_2048_221,T_16384_1785,T_8192_901,T_16384_1819,T_4096_459,T_16384_1853,T_8192_935,T_16384_1887,T_1024_119,T_16384_1921,T_8192_969,T_16384_1955,T_4096_493,T_16384_1989,T_8192_1003,T_16384_2023,T_2048_255,T_16384_2057,T_8192_1037,T_16384_2091,T_4096_527,T_16384_2125,T_8192_1071,T_16384_2159,T_128_17,T_16384_2193,T_8192_1105,T_16384_2227,T_4096_561,T_16384_2261,T_8192_1139,T_16384_2295,T_2048_289,T_16384_2329,T_8192_1173,T_16384_2363,T_4096_595,T_16384_2397,T_8192_1207,T_16384_2431,T_1024_153,T_16384_2465,T_8192_1241,T_16384_2499,T_4096_629,T_16384_2533,T_8192_1275,T_16384_2567,T_2048_323,T_16384_2601,T_8192_1309,T_16384_2635,T_4096_663,T_16384_2669,T_8192_1343,T_16384_2703,T_512_85,T_16384_2737,T_8192_1377,T_16384_2771,T_4096_697,T_16384_2805,T_8192_1411,T_16384_2839,T_2048_357,T_16384_2873,T_8192_1445,T_16384_2907,T_4096_731,T_16384_2941,T_8192_1479,T_16384_2975,T_1024_187,T_16384_3009,T_8192_1513,T_16384_3043,T_4096_765,T_16384_3077,T_8192_1547,T_16384_3111,T_2048_391,T_16384_3145,T_8192_1581,T_16384_3179,T_4096_799,T_16384_3213,T_8192_1615,T_16384_3247,T_256_51,T_16384_3281,T_8192_1649,T_16384_3315,T_4096_833,T_16384_3349,T_8192_1683,T_16384_3383,T_2048_425,T_16384_3417
,T_8192_1717,T_16384_3451,T_4096_867,T_16384_3485,T_8192_1751,T_16384_3519,T_1024_221,T_16384_3553,T_8192_1785,T_16384_3587,T_4096_901,T_16384_3621,T_8192_1819,T_16384_3655,T_2048_459,T_16384_3689,T_8192_1853,T_16384_3723,T_4096_935,T_16384_3757,T_8192_1887,T_16384_3791,T_512_119,T_16384_3825,T_8192_1921,T_16384_3859,T_4096_969,T_16384_3893,T_8192_1955,T_16384_3927,T_2048_493,T_16384_3961,T_8192_1989,T_16384_3995,T_4096_1003,T_16384_4029,T_8192_2023,T_16384_4063,T_1024_255,T_16384_4097,T_8192_2057,T_16384_4131,T_4096_1037,T_16384_4165,T_8192_2091,T_16384_4199,T_2048_527,T_16384_4233,T_8192_2125,T_16384_4267,T_4096_1071,T_16384_4301,T_8192_2159,T_16384_4335,T_64_17,T_16384_4369,T_8192_2193,T_16384_4403,T_4096_1105,T_16384_4437,T_8192_2227,T_16384_4471,T_2048_561,T_16384_4505,T_8192_2261,T_16384_4539,T_4096_1139,T_16384_4573,T_8192_2295,T_16384_4607,T_1024_289,T_16384_4641,T_8192_2329,T_16384_4675,T_4096_1173,T_16384_4709,T_8192_2363,T_16384_4743,T_2048_595,T_16384_4777,T_8192_2397,T_16384_4811,T_4096_1207,T_16384_4845,T_8192_2431,T_16384_4879,T_512_153,T_16384_4913,T_8192_2465,T_16384_4947,T_4096_1241,T_16384_4981,T_8192_2499,T_16384_5015,T_2048_629,T_16384_5049,T_8192_2533,T_16384_5083,T_4096_1275,T_16384_5117,T_8192_2567,T_16384_5151,T_1024_323,T_16384_5185,T_8192_2601,T_16384_5219,T_4096_1309,T_16384_5253,T_8192_2635,T_16384_5287,T_2048_663,T_16384_5321,T_8192_2669,T_16384_5355,T_4096_1343,T_16384_5389,T_8192_2703,T_16384_5423,T_256_85,T_16384_5457,T_8192_2737,T_16384_5491,T_4096_1377,T_16384_5525,T_8192_2771,T_16384_5559,T_2048_697,T_16384_5593,T_8192_2805,T_16384_5627,T_4096_1411,T_16384_5661,T_8192_2839,T_16384_5695,T_1024_357,T_16384_5729,T_8192_2873,T_16384_5763,T_4096_1445,T_16384_5797,T_8192_2907,T_16384_5831,T_2048_731,T_16384_5865,T_8192_2941,T_16384_5899,T_4096_1479,T_16384_5933,T_8192_2975,T_16384_5967,T_512_187,T_16384_6001,T_8192_3009,T_16384_6035,T_4096_1513,T_16384_6069,T_8192_3043,T_16384_6103,T_2048_765,T_16384_6137,T_8192_3077,T_16384_6171,T_4096
_1547,T_16384_6205,T_8192_3111,T_16384_6239,T_1024_391,T_16384_6273,T_8192_3145,T_16384_6307,T_4096_1581,T_16384_6341,T_8192_3179,T_16384_6375,T_2048_799,T_16384_6409,T_8192_3213,T_16384_6443,T_4096_1615,T_16384_6477,T_8192_3247,T_16384_6511,T_128_51,T_16384_6545,T_8192_3281,T_16384_6579,T_4096_1649,T_16384_6613,T_8192_3315,T_16384_6647,T_2048_833,T_16384_6681,T_8192_3349,T_16384_6715,T_4096_1683,T_16384_6749,T_8192_3383,T_16384_6783,T_1024_425,T_16384_6817,T_8192_3417,T_16384_6851,T_4096_1717,T_16384_6885,T_8192_3451,T_16384_6919,T_2048_867,T_16384_6953,T_8192_3485,T_16384_6987,T_4096_1751,T_16384_7021,T_8192_3519,T_16384_7055,T_512_221,T_16384_7089,T_8192_3553,T_16384_7123,T_4096_1785,T_16384_7157,T_8192_3587,T_16384_7191,T_2048_901,T_16384_7225,T_8192_3621,T_16384_7259,T_4096_1819,T_16384_7293,T_8192_3655,T_16384_7327,T_1024_459,T_16384_7361,T_8192_3689,T_16384_7395,T_4096_1853,T_16384_7429,T_8192_3723,T_16384_7463,T_2048_935,T_16384_7497,T_8192_3757,T_16384_7531,T_4096_1887,T_16384_7565,T_8192_3791,T_16384_7599,T_256_119,T_16384_7633,T_8192_3825,T_16384_7667,T_4096_1921,T_16384_7701,T_8192_3859,T_16384_7735,T_2048_969,T_16384_7769,T_8192_3893,T_16384_7803,T_4096_1955,T_16384_7837,T_8192_3927,T_16384_7871,T_1024_493,T_16384_7905,T_8192_3961,T_16384_7939,T_4096_1989,T_16384_7973,T_8192_3995,T_16384_8007,T_2048_1003,T_16384_8041,T_8192_4029,T_16384_8075,T_4096_2023,T_16384_8109,T_8192_4063,T_16384_8143,T_512_255,T_16384_8177,T_8192_4097,T_16384_8211,T_4096_2057,T_16384_8245,T_8192_4131,T_16384_8279,T_2048_1037,T_16384_8313,T_8192_4165,T_16384_8347,T_4096_2091,T_16384_8381,T_8192_4199,T_16384_8415,T_1024_527,T_16384_8449,T_8192_4233,T_16384_8483,T_4096_2125,T_16384_8517,T_8192_4267,T_16384_8551,T_2048_1071,T_16384_8585,T_8192_4301,T_16384_8619,T_4096_2159,T_16384_8653,T_8192_4335,T_16384_8687 +}; +static const __device__ float2 lut_sp_27_19683[729*2] = { + 
T_2_0,T_19683_1,T_19683_2,T_6561_1,T_19683_4,T_19683_5,T_6561_2,T_19683_7,T_19683_8,T_2187_1,T_19683_10,T_19683_11,T_6561_4,T_19683_13,T_19683_14,T_6561_5,T_19683_16,T_19683_17,T_2187_2,T_19683_19,T_19683_20,T_6561_7,T_19683_22,T_19683_23,T_6561_8,T_19683_25,T_19683_26,T_729_1,T_19683_28,T_19683_29,T_6561_10,T_19683_31,T_19683_32,T_6561_11,T_19683_34,T_19683_35,T_2187_4,T_19683_37,T_19683_38,T_6561_13,T_19683_40,T_19683_41,T_6561_14,T_19683_43,T_19683_44,T_2187_5,T_19683_46,T_19683_47,T_6561_16,T_19683_49,T_19683_50,T_6561_17,T_19683_52,T_19683_53,T_729_2,T_19683_55,T_19683_56,T_6561_19,T_19683_58,T_19683_59,T_6561_20,T_19683_61,T_19683_62,T_2187_7,T_19683_64,T_19683_65,T_6561_22,T_19683_67,T_19683_68,T_6561_23,T_19683_70,T_19683_71,T_2187_8,T_19683_73,T_19683_74,T_6561_25,T_19683_76,T_19683_77,T_6561_26,T_19683_79,T_19683_80,T_243_1,T_19683_82,T_19683_83,T_6561_28,T_19683_85,T_19683_86,T_6561_29,T_19683_88,T_19683_89,T_2187_10,T_19683_91,T_19683_92,T_6561_31,T_19683_94,T_19683_95,T_6561_32,T_19683_97,T_19683_98,T_2187_11,T_19683_100,T_19683_101,T_6561_34,T_19683_103,T_19683_104,T_6561_35,T_19683_106,T_19683_107,T_729_4,T_19683_109,T_19683_110,T_6561_37,T_19683_112,T_19683_113,T_6561_38,T_19683_115,T_19683_116,T_2187_13,T_19683_118,T_19683_119,T_6561_40,T_19683_121,T_19683_122,T_6561_41,T_19683_124,T_19683_125,T_2187_14,T_19683_127,T_19683_128,T_6561_43,T_19683_130,T_19683_131,T_6561_44,T_19683_133,T_19683_134,T_729_5,T_19683_136,T_19683_137,T_6561_46,T_19683_139,T_19683_140,T_6561_47,T_19683_142,T_19683_143,T_2187_16,T_19683_145,T_19683_146,T_6561_49,T_19683_148,T_19683_149,T_6561_50,T_19683_151,T_19683_152,T_2187_17,T_19683_154,T_19683_155,T_6561_52,T_19683_157,T_19683_158,T_6561_53,T_19683_160,T_19683_161,T_243_2,T_19683_163,T_19683_164,T_6561_55,T_19683_166,T_19683_167,T_6561_56,T_19683_169,T_19683_170,T_2187_19,T_19683_172,T_19683_173,T_6561_58,T_19683_175,T_19683_176,T_6561_59,T_19683_178,T_19683_179,T_2187_20,T_19683_181,T_19683_182,T_6561_61,T_19683_184,T_19
683_185,T_6561_62,T_19683_187,T_19683_188,T_729_7,T_19683_190,T_19683_191,T_6561_64,T_19683_193,T_19683_194,T_6561_65,T_19683_196,T_19683_197,T_2187_22,T_19683_199,T_19683_200,T_6561_67,T_19683_202,T_19683_203,T_6561_68,T_19683_205,T_19683_206,T_2187_23,T_19683_208,T_19683_209,T_6561_70,T_19683_211,T_19683_212,T_6561_71,T_19683_214,T_19683_215,T_729_8,T_19683_217,T_19683_218,T_6561_73,T_19683_220,T_19683_221,T_6561_74,T_19683_223,T_19683_224,T_2187_25,T_19683_226,T_19683_227,T_6561_76,T_19683_229,T_19683_230,T_6561_77,T_19683_232,T_19683_233,T_2187_26,T_19683_235,T_19683_236,T_6561_79,T_19683_238,T_19683_239,T_6561_80,T_19683_241,T_19683_242,T_81_1,T_19683_244,T_19683_245,T_6561_82,T_19683_247,T_19683_248,T_6561_83,T_19683_250,T_19683_251,T_2187_28,T_19683_253,T_19683_254,T_6561_85,T_19683_256,T_19683_257,T_6561_86,T_19683_259,T_19683_260,T_2187_29,T_19683_262,T_19683_263,T_6561_88,T_19683_265,T_19683_266,T_6561_89,T_19683_268,T_19683_269,T_729_10,T_19683_271,T_19683_272,T_6561_91,T_19683_274,T_19683_275,T_6561_92,T_19683_277,T_19683_278,T_2187_31,T_19683_280,T_19683_281,T_6561_94,T_19683_283,T_19683_284,T_6561_95,T_19683_286,T_19683_287,T_2187_32,T_19683_289,T_19683_290,T_6561_97,T_19683_292,T_19683_293,T_6561_98,T_19683_295,T_19683_296,T_729_11,T_19683_298,T_19683_299,T_6561_100,T_19683_301,T_19683_302,T_6561_101,T_19683_304,T_19683_305,T_2187_34,T_19683_307,T_19683_308,T_6561_103,T_19683_310,T_19683_311,T_6561_104,T_19683_313,T_19683_314,T_2187_35,T_19683_316,T_19683_317,T_6561_106,T_19683_319,T_19683_320,T_6561_107,T_19683_322,T_19683_323,T_243_4,T_19683_325,T_19683_326,T_6561_109,T_19683_328,T_19683_329,T_6561_110,T_19683_331,T_19683_332,T_2187_37,T_19683_334,T_19683_335,T_6561_112,T_19683_337,T_19683_338,T_6561_113,T_19683_340,T_19683_341,T_2187_38,T_19683_343,T_19683_344,T_6561_115,T_19683_346,T_19683_347,T_6561_116,T_19683_349,T_19683_350,T_729_13,T_19683_352,T_19683_353,T_6561_118,T_19683_355,T_19683_356,T_6561_119,T_19683_358,T_19683_359,T_2187_40,T_19683_
361,T_19683_362,T_6561_121,T_19683_364,T_19683_365,T_6561_122,T_19683_367,T_19683_368,T_2187_41,T_19683_370,T_19683_371,T_6561_124,T_19683_373,T_19683_374,T_6561_125,T_19683_376,T_19683_377,T_729_14,T_19683_379,T_19683_380,T_6561_127,T_19683_382,T_19683_383,T_6561_128,T_19683_385,T_19683_386,T_2187_43,T_19683_388,T_19683_389,T_6561_130,T_19683_391,T_19683_392,T_6561_131,T_19683_394,T_19683_395,T_2187_44,T_19683_397,T_19683_398,T_6561_133,T_19683_400,T_19683_401,T_6561_134,T_19683_403,T_19683_404,T_243_5,T_19683_406,T_19683_407,T_6561_136,T_19683_409,T_19683_410,T_6561_137,T_19683_412,T_19683_413,T_2187_46,T_19683_415,T_19683_416,T_6561_139,T_19683_418,T_19683_419,T_6561_140,T_19683_421,T_19683_422,T_2187_47,T_19683_424,T_19683_425,T_6561_142,T_19683_427,T_19683_428,T_6561_143,T_19683_430,T_19683_431,T_729_16,T_19683_433,T_19683_434,T_6561_145,T_19683_436,T_19683_437,T_6561_146,T_19683_439,T_19683_440,T_2187_49,T_19683_442,T_19683_443,T_6561_148,T_19683_445,T_19683_446,T_6561_149,T_19683_448,T_19683_449,T_2187_50,T_19683_451,T_19683_452,T_6561_151,T_19683_454,T_19683_455,T_6561_152,T_19683_457,T_19683_458,T_729_17,T_19683_460,T_19683_461,T_6561_154,T_19683_463,T_19683_464,T_6561_155,T_19683_466,T_19683_467,T_2187_52,T_19683_469,T_19683_470,T_6561_157,T_19683_472,T_19683_473,T_6561_158,T_19683_475,T_19683_476,T_2187_53,T_19683_478,T_19683_479,T_6561_160,T_19683_481,T_19683_482,T_6561_161,T_19683_484,T_19683_485,T_81_2,T_19683_487,T_19683_488,T_6561_163,T_19683_490,T_19683_491,T_6561_164,T_19683_493,T_19683_494,T_2187_55,T_19683_496,T_19683_497,T_6561_166,T_19683_499,T_19683_500,T_6561_167,T_19683_502,T_19683_503,T_2187_56,T_19683_505,T_19683_506,T_6561_169,T_19683_508,T_19683_509,T_6561_170,T_19683_511,T_19683_512,T_729_19,T_19683_514,T_19683_515,T_6561_172,T_19683_517,T_19683_518,T_6561_173,T_19683_520,T_19683_521,T_2187_58,T_19683_523,T_19683_524,T_6561_175,T_19683_526,T_19683_527,T_6561_176,T_19683_529,T_19683_530,T_2187_59,T_19683_532,T_19683_533,T_6561_178,T_1968
3_535,T_19683_536,T_6561_179,T_19683_538,T_19683_539,T_729_20,T_19683_541,T_19683_542,T_6561_181,T_19683_544,T_19683_545,T_6561_182,T_19683_547,T_19683_548,T_2187_61,T_19683_550,T_19683_551,T_6561_184,T_19683_553,T_19683_554,T_6561_185,T_19683_556,T_19683_557,T_2187_62,T_19683_559,T_19683_560,T_6561_187,T_19683_562,T_19683_563,T_6561_188,T_19683_565,T_19683_566,T_243_7,T_19683_568,T_19683_569,T_6561_190,T_19683_571,T_19683_572,T_6561_191,T_19683_574,T_19683_575,T_2187_64,T_19683_577,T_19683_578,T_6561_193,T_19683_580,T_19683_581,T_6561_194,T_19683_583,T_19683_584,T_2187_65,T_19683_586,T_19683_587,T_6561_196,T_19683_589,T_19683_590,T_6561_197,T_19683_592,T_19683_593,T_729_22,T_19683_595,T_19683_596,T_6561_199,T_19683_598,T_19683_599,T_6561_200,T_19683_601,T_19683_602,T_2187_67,T_19683_604,T_19683_605,T_6561_202,T_19683_607,T_19683_608,T_6561_203,T_19683_610,T_19683_611,T_2187_68,T_19683_613,T_19683_614,T_6561_205,T_19683_616,T_19683_617,T_6561_206,T_19683_619,T_19683_620,T_729_23,T_19683_622,T_19683_623,T_6561_208,T_19683_625,T_19683_626,T_6561_209,T_19683_628,T_19683_629,T_2187_70,T_19683_631,T_19683_632,T_6561_211,T_19683_634,T_19683_635,T_6561_212,T_19683_637,T_19683_638,T_2187_71,T_19683_640,T_19683_641,T_6561_214,T_19683_643,T_19683_644,T_6561_215,T_19683_646,T_19683_647,T_243_8,T_19683_649,T_19683_650,T_6561_217,T_19683_652,T_19683_653,T_6561_218,T_19683_655,T_19683_656,T_2187_73,T_19683_658,T_19683_659,T_6561_220,T_19683_661,T_19683_662,T_6561_221,T_19683_664,T_19683_665,T_2187_74,T_19683_667,T_19683_668,T_6561_223,T_19683_670,T_19683_671,T_6561_224,T_19683_673,T_19683_674,T_729_25,T_19683_676,T_19683_677,T_6561_226,T_19683_679,T_19683_680,T_6561_227,T_19683_682,T_19683_683,T_2187_76,T_19683_685,T_19683_686,T_6561_229,T_19683_688,T_19683_689,T_6561_230,T_19683_691,T_19683_692,T_2187_77,T_19683_694,T_19683_695,T_6561_232,T_19683_697,T_19683_698,T_6561_233,T_19683_700,T_19683_701,T_729_26,T_19683_703,T_19683_704,T_6561_235,T_19683_706,T_19683_707,T_6561_236,T_19
683_709,T_19683_710,T_2187_79,T_19683_712,T_19683_713,T_6561_238,T_19683_715,T_19683_716,T_6561_239,T_19683_718,T_19683_719,T_2187_80,T_19683_721,T_19683_722,T_6561_241,T_19683_724,T_19683_725,T_6561_242,T_19683_727,T_19683_728,T_2_0,T_19683_14,T_19683_28,T_6561_14,T_19683_56,T_19683_70,T_6561_28,T_19683_98,T_19683_112,T_2187_14,T_19683_140,T_19683_154,T_6561_56,T_19683_182,T_19683_196,T_6561_70,T_19683_224,T_19683_238,T_2187_28,T_19683_266,T_19683_280,T_6561_98,T_19683_308,T_19683_322,T_6561_112,T_19683_350,T_19683_364,T_729_14,T_19683_392,T_19683_406,T_6561_140,T_19683_434,T_19683_448,T_6561_154,T_19683_476,T_19683_490,T_2187_56,T_19683_518,T_19683_532,T_6561_182,T_19683_560,T_19683_574,T_6561_196,T_19683_602,T_19683_616,T_2187_70,T_19683_644,T_19683_658,T_6561_224,T_19683_686,T_19683_700,T_6561_238,T_19683_728,T_19683_742,T_729_28,T_19683_770,T_19683_784,T_6561_266,T_19683_812,T_19683_826,T_6561_280,T_19683_854,T_19683_868,T_2187_98,T_19683_896,T_19683_910,T_6561_308,T_19683_938,T_19683_952,T_6561_322,T_19683_980,T_19683_994,T_2187_112,T_19683_1022,T_19683_1036,T_6561_350,T_19683_1064,T_19683_1078,T_6561_364,T_19683_1106,T_19683_1120,T_243_14,T_19683_1148,T_19683_1162,T_6561_392,T_19683_1190,T_19683_1204,T_6561_406,T_19683_1232,T_19683_1246,T_2187_140,T_19683_1274,T_19683_1288,T_6561_434,T_19683_1316,T_19683_1330,T_6561_448,T_19683_1358,T_19683_1372,T_2187_154,T_19683_1400,T_19683_1414,T_6561_476,T_19683_1442,T_19683_1456,T_6561_490,T_19683_1484,T_19683_1498,T_729_56,T_19683_1526,T_19683_1540,T_6561_518,T_19683_1568,T_19683_1582,T_6561_532,T_19683_1610,T_19683_1624,T_2187_182,T_19683_1652,T_19683_1666,T_6561_560,T_19683_1694,T_19683_1708,T_6561_574,T_19683_1736,T_19683_1750,T_2187_196,T_19683_1778,T_19683_1792,T_6561_602,T_19683_1820,T_19683_1834,T_6561_616,T_19683_1862,T_19683_1876,T_729_70,T_19683_1904,T_19683_1918,T_6561_644,T_19683_1946,T_19683_1960,T_6561_658,T_19683_1988,T_19683_2002,T_2187_224,T_19683_2030,T_19683_2044,T_6561_686,T_19683_2072,T_19683_2086,
T_6561_700,T_19683_2114,T_19683_2128,T_2187_238,T_19683_2156,T_19683_2170,T_6561_728,T_19683_2198,T_19683_2212,T_6561_742,T_19683_2240,T_19683_2254,T_243_28,T_19683_2282,T_19683_2296,T_6561_770,T_19683_2324,T_19683_2338,T_6561_784,T_19683_2366,T_19683_2380,T_2187_266,T_19683_2408,T_19683_2422,T_6561_812,T_19683_2450,T_19683_2464,T_6561_826,T_19683_2492,T_19683_2506,T_2187_280,T_19683_2534,T_19683_2548,T_6561_854,T_19683_2576,T_19683_2590,T_6561_868,T_19683_2618,T_19683_2632,T_729_98,T_19683_2660,T_19683_2674,T_6561_896,T_19683_2702,T_19683_2716,T_6561_910,T_19683_2744,T_19683_2758,T_2187_308,T_19683_2786,T_19683_2800,T_6561_938,T_19683_2828,T_19683_2842,T_6561_952,T_19683_2870,T_19683_2884,T_2187_322,T_19683_2912,T_19683_2926,T_6561_980,T_19683_2954,T_19683_2968,T_6561_994,T_19683_2996,T_19683_3010,T_729_112,T_19683_3038,T_19683_3052,T_6561_1022,T_19683_3080,T_19683_3094,T_6561_1036,T_19683_3122,T_19683_3136,T_2187_350,T_19683_3164,T_19683_3178,T_6561_1064,T_19683_3206,T_19683_3220,T_6561_1078,T_19683_3248,T_19683_3262,T_2187_364,T_19683_3290,T_19683_3304,T_6561_1106,T_19683_3332,T_19683_3346,T_6561_1120,T_19683_3374,T_19683_3388,T_81_14,T_19683_3416,T_19683_3430,T_6561_1148,T_19683_3458,T_19683_3472,T_6561_1162,T_19683_3500,T_19683_3514,T_2187_392,T_19683_3542,T_19683_3556,T_6561_1190,T_19683_3584,T_19683_3598,T_6561_1204,T_19683_3626,T_19683_3640,T_2187_406,T_19683_3668,T_19683_3682,T_6561_1232,T_19683_3710,T_19683_3724,T_6561_1246,T_19683_3752,T_19683_3766,T_729_140,T_19683_3794,T_19683_3808,T_6561_1274,T_19683_3836,T_19683_3850,T_6561_1288,T_19683_3878,T_19683_3892,T_2187_434,T_19683_3920,T_19683_3934,T_6561_1316,T_19683_3962,T_19683_3976,T_6561_1330,T_19683_4004,T_19683_4018,T_2187_448,T_19683_4046,T_19683_4060,T_6561_1358,T_19683_4088,T_19683_4102,T_6561_1372,T_19683_4130,T_19683_4144,T_729_154,T_19683_4172,T_19683_4186,T_6561_1400,T_19683_4214,T_19683_4228,T_6561_1414,T_19683_4256,T_19683_4270,T_2187_476,T_19683_4298,T_19683_4312,T_6561_1442,T_19683_4340,T_19
683_4354,T_6561_1456,T_19683_4382,T_19683_4396,T_2187_490,T_19683_4424,T_19683_4438,T_6561_1484,T_19683_4466,T_19683_4480,T_6561_1498,T_19683_4508,T_19683_4522,T_243_56,T_19683_4550,T_19683_4564,T_6561_1526,T_19683_4592,T_19683_4606,T_6561_1540,T_19683_4634,T_19683_4648,T_2187_518,T_19683_4676,T_19683_4690,T_6561_1568,T_19683_4718,T_19683_4732,T_6561_1582,T_19683_4760,T_19683_4774,T_2187_532,T_19683_4802,T_19683_4816,T_6561_1610,T_19683_4844,T_19683_4858,T_6561_1624,T_19683_4886,T_19683_4900,T_729_182,T_19683_4928,T_19683_4942,T_6561_1652,T_19683_4970,T_19683_4984,T_6561_1666,T_19683_5012,T_19683_5026,T_2187_560,T_19683_5054,T_19683_5068,T_6561_1694,T_19683_5096,T_19683_5110,T_6561_1708,T_19683_5138,T_19683_5152,T_2187_574,T_19683_5180,T_19683_5194,T_6561_1736,T_19683_5222,T_19683_5236,T_6561_1750,T_19683_5264,T_19683_5278,T_729_196,T_19683_5306,T_19683_5320,T_6561_1778,T_19683_5348,T_19683_5362,T_6561_1792,T_19683_5390,T_19683_5404,T_2187_602,T_19683_5432,T_19683_5446,T_6561_1820,T_19683_5474,T_19683_5488,T_6561_1834,T_19683_5516,T_19683_5530,T_2187_616,T_19683_5558,T_19683_5572,T_6561_1862,T_19683_5600,T_19683_5614,T_6561_1876,T_19683_5642,T_19683_5656,T_243_70,T_19683_5684,T_19683_5698,T_6561_1904,T_19683_5726,T_19683_5740,T_6561_1918,T_19683_5768,T_19683_5782,T_2187_644,T_19683_5810,T_19683_5824,T_6561_1946,T_19683_5852,T_19683_5866,T_6561_1960,T_19683_5894,T_19683_5908,T_2187_658,T_19683_5936,T_19683_5950,T_6561_1988,T_19683_5978,T_19683_5992,T_6561_2002,T_19683_6020,T_19683_6034,T_729_224,T_19683_6062,T_19683_6076,T_6561_2030,T_19683_6104,T_19683_6118,T_6561_2044,T_19683_6146,T_19683_6160,T_2187_686,T_19683_6188,T_19683_6202,T_6561_2072,T_19683_6230,T_19683_6244,T_6561_2086,T_19683_6272,T_19683_6286,T_2187_700,T_19683_6314,T_19683_6328,T_6561_2114,T_19683_6356,T_19683_6370,T_6561_2128,T_19683_6398,T_19683_6412,T_729_238,T_19683_6440,T_19683_6454,T_6561_2156,T_19683_6482,T_19683_6496,T_6561_2170,T_19683_6524,T_19683_6538,T_2187_728,T_19683_6566,T_19683_6580,T_6
561_2198,T_19683_6608,T_19683_6622,T_6561_2212,T_19683_6650,T_19683_6664,T_2187_742,T_19683_6692,T_19683_6706,T_6561_2240,T_19683_6734,T_19683_6748,T_6561_2254,T_19683_6776,T_19683_6790,T_81_28,T_19683_6818,T_19683_6832,T_6561_2282,T_19683_6860,T_19683_6874,T_6561_2296,T_19683_6902,T_19683_6916,T_2187_770,T_19683_6944,T_19683_6958,T_6561_2324,T_19683_6986,T_19683_7000,T_6561_2338,T_19683_7028,T_19683_7042,T_2187_784,T_19683_7070,T_19683_7084,T_6561_2366,T_19683_7112,T_19683_7126,T_6561_2380,T_19683_7154,T_19683_7168,T_729_266,T_19683_7196,T_19683_7210,T_6561_2408,T_19683_7238,T_19683_7252,T_6561_2422,T_19683_7280,T_19683_7294,T_2187_812,T_19683_7322,T_19683_7336,T_6561_2450,T_19683_7364,T_19683_7378,T_6561_2464,T_19683_7406,T_19683_7420,T_2187_826,T_19683_7448,T_19683_7462,T_6561_2492,T_19683_7490,T_19683_7504,T_6561_2506,T_19683_7532,T_19683_7546,T_729_280,T_19683_7574,T_19683_7588,T_6561_2534,T_19683_7616,T_19683_7630,T_6561_2548,T_19683_7658,T_19683_7672,T_2187_854,T_19683_7700,T_19683_7714,T_6561_2576,T_19683_7742,T_19683_7756,T_6561_2590,T_19683_7784,T_19683_7798,T_2187_868,T_19683_7826,T_19683_7840,T_6561_2618,T_19683_7868,T_19683_7882,T_6561_2632,T_19683_7910,T_19683_7924,T_243_98,T_19683_7952,T_19683_7966,T_6561_2660,T_19683_7994,T_19683_8008,T_6561_2674,T_19683_8036,T_19683_8050,T_2187_896,T_19683_8078,T_19683_8092,T_6561_2702,T_19683_8120,T_19683_8134,T_6561_2716,T_19683_8162,T_19683_8176,T_2187_910,T_19683_8204,T_19683_8218,T_6561_2744,T_19683_8246,T_19683_8260,T_6561_2758,T_19683_8288,T_19683_8302,T_729_308,T_19683_8330,T_19683_8344,T_6561_2786,T_19683_8372,T_19683_8386,T_6561_2800,T_19683_8414,T_19683_8428,T_2187_938,T_19683_8456,T_19683_8470,T_6561_2828,T_19683_8498,T_19683_8512,T_6561_2842,T_19683_8540,T_19683_8554,T_2187_952,T_19683_8582,T_19683_8596,T_6561_2870,T_19683_8624,T_19683_8638,T_6561_2884,T_19683_8666,T_19683_8680,T_729_322,T_19683_8708,T_19683_8722,T_6561_2912,T_19683_8750,T_19683_8764,T_6561_2926,T_19683_8792,T_19683_8806,T_2187_980,T_19
683_8834,T_19683_8848,T_6561_2954,T_19683_8876,T_19683_8890,T_6561_2968,T_19683_8918,T_19683_8932,T_2187_994,T_19683_8960,T_19683_8974,T_6561_2996,T_19683_9002,T_19683_9016,T_6561_3010,T_19683_9044,T_19683_9058,T_243_112,T_19683_9086,T_19683_9100,T_6561_3038,T_19683_9128,T_19683_9142,T_6561_3052,T_19683_9170,T_19683_9184,T_2187_1022,T_19683_9212,T_19683_9226,T_6561_3080,T_19683_9254,T_19683_9268,T_6561_3094,T_19683_9296,T_19683_9310,T_2187_1036,T_19683_9338,T_19683_9352,T_6561_3122,T_19683_9380,T_19683_9394,T_6561_3136,T_19683_9422,T_19683_9436,T_729_350,T_19683_9464,T_19683_9478,T_6561_3164,T_19683_9506,T_19683_9520,T_6561_3178,T_19683_9548,T_19683_9562,T_2187_1064,T_19683_9590,T_19683_9604,T_6561_3206,T_19683_9632,T_19683_9646,T_6561_3220,T_19683_9674,T_19683_9688,T_2187_1078,T_19683_9716,T_19683_9730,T_6561_3248,T_19683_9758,T_19683_9772,T_6561_3262,T_19683_9800,T_19683_9814,T_729_364,T_19683_9842,T_19683_9856,T_6561_3290,T_19683_9884,T_19683_9898,T_6561_3304,T_19683_9926,T_19683_9940,T_2187_1106,T_19683_9968,T_19683_9982,T_6561_3332,T_19683_10010,T_19683_10024,T_6561_3346,T_19683_10052,T_19683_10066,T_2187_1120,T_19683_10094,T_19683_10108,T_6561_3374,T_19683_10136,T_19683_10150,T_6561_3388,T_19683_10178,T_19683_10192 +}; +static const __device__ float2 lut_sp_32_32768[1024*2] = { + 
T_2_0,T_32768_1,T_16384_1,T_32768_3,T_8192_1,T_32768_5,T_16384_3,T_32768_7,T_4096_1,T_32768_9,T_16384_5,T_32768_11,T_8192_3,T_32768_13,T_16384_7,T_32768_15,T_2048_1,T_32768_17,T_16384_9,T_32768_19,T_8192_5,T_32768_21,T_16384_11,T_32768_23,T_4096_3,T_32768_25,T_16384_13,T_32768_27,T_8192_7,T_32768_29,T_16384_15,T_32768_31,T_1024_1,T_32768_33,T_16384_17,T_32768_35,T_8192_9,T_32768_37,T_16384_19,T_32768_39,T_4096_5,T_32768_41,T_16384_21,T_32768_43,T_8192_11,T_32768_45,T_16384_23,T_32768_47,T_2048_3,T_32768_49,T_16384_25,T_32768_51,T_8192_13,T_32768_53,T_16384_27,T_32768_55,T_4096_7,T_32768_57,T_16384_29,T_32768_59,T_8192_15,T_32768_61,T_16384_31,T_32768_63,T_512_1,T_32768_65,T_16384_33,T_32768_67,T_8192_17,T_32768_69,T_16384_35,T_32768_71,T_4096_9,T_32768_73,T_16384_37,T_32768_75,T_8192_19,T_32768_77,T_16384_39,T_32768_79,T_2048_5,T_32768_81,T_16384_41,T_32768_83,T_8192_21,T_32768_85,T_16384_43,T_32768_87,T_4096_11,T_32768_89,T_16384_45,T_32768_91,T_8192_23,T_32768_93,T_16384_47,T_32768_95,T_1024_3,T_32768_97,T_16384_49,T_32768_99,T_8192_25,T_32768_101,T_16384_51,T_32768_103,T_4096_13,T_32768_105,T_16384_53,T_32768_107,T_8192_27,T_32768_109,T_16384_55,T_32768_111,T_2048_7,T_32768_113,T_16384_57,T_32768_115,T_8192_29,T_32768_117,T_16384_59,T_32768_119,T_4096_15,T_32768_121,T_16384_61,T_32768_123,T_8192_31,T_32768_125,T_16384_63,T_32768_127,T_256_1,T_32768_129,T_16384_65,T_32768_131,T_8192_33,T_32768_133,T_16384_67,T_32768_135,T_4096_17,T_32768_137,T_16384_69,T_32768_139,T_8192_35,T_32768_141,T_16384_71,T_32768_143,T_2048_9,T_32768_145,T_16384_73,T_32768_147,T_8192_37,T_32768_149,T_16384_75,T_32768_151,T_4096_19,T_32768_153,T_16384_77,T_32768_155,T_8192_39,T_32768_157,T_16384_79,T_32768_159,T_1024_5,T_32768_161,T_16384_81,T_32768_163,T_8192_41,T_32768_165,T_16384_83,T_32768_167,T_4096_21,T_32768_169,T_16384_85,T_32768_171,T_8192_43,T_32768_173,T_16384_87,T_32768_175,T_2048_11,T_32768_177,T_16384_89,T_32768_179,T_8192_45,T_32768_181,T_16384_91,T_32768_183,T_4096_23,T_3276
8_185,T_16384_93,T_32768_187,T_8192_47,T_32768_189,T_16384_95,T_32768_191,T_512_3,T_32768_193,T_16384_97,T_32768_195,T_8192_49,T_32768_197,T_16384_99,T_32768_199,T_4096_25,T_32768_201,T_16384_101,T_32768_203,T_8192_51,T_32768_205,T_16384_103,T_32768_207,T_2048_13,T_32768_209,T_16384_105,T_32768_211,T_8192_53,T_32768_213,T_16384_107,T_32768_215,T_4096_27,T_32768_217,T_16384_109,T_32768_219,T_8192_55,T_32768_221,T_16384_111,T_32768_223,T_1024_7,T_32768_225,T_16384_113,T_32768_227,T_8192_57,T_32768_229,T_16384_115,T_32768_231,T_4096_29,T_32768_233,T_16384_117,T_32768_235,T_8192_59,T_32768_237,T_16384_119,T_32768_239,T_2048_15,T_32768_241,T_16384_121,T_32768_243,T_8192_61,T_32768_245,T_16384_123,T_32768_247,T_4096_31,T_32768_249,T_16384_125,T_32768_251,T_8192_63,T_32768_253,T_16384_127,T_32768_255,T_128_1,T_32768_257,T_16384_129,T_32768_259,T_8192_65,T_32768_261,T_16384_131,T_32768_263,T_4096_33,T_32768_265,T_16384_133,T_32768_267,T_8192_67,T_32768_269,T_16384_135,T_32768_271,T_2048_17,T_32768_273,T_16384_137,T_32768_275,T_8192_69,T_32768_277,T_16384_139,T_32768_279,T_4096_35,T_32768_281,T_16384_141,T_32768_283,T_8192_71,T_32768_285,T_16384_143,T_32768_287,T_1024_9,T_32768_289,T_16384_145,T_32768_291,T_8192_73,T_32768_293,T_16384_147,T_32768_295,T_4096_37,T_32768_297,T_16384_149,T_32768_299,T_8192_75,T_32768_301,T_16384_151,T_32768_303,T_2048_19,T_32768_305,T_16384_153,T_32768_307,T_8192_77,T_32768_309,T_16384_155,T_32768_311,T_4096_39,T_32768_313,T_16384_157,T_32768_315,T_8192_79,T_32768_317,T_16384_159,T_32768_319,T_512_5,T_32768_321,T_16384_161,T_32768_323,T_8192_81,T_32768_325,T_16384_163,T_32768_327,T_4096_41,T_32768_329,T_16384_165,T_32768_331,T_8192_83,T_32768_333,T_16384_167,T_32768_335,T_2048_21,T_32768_337,T_16384_169,T_32768_339,T_8192_85,T_32768_341,T_16384_171,T_32768_343,T_4096_43,T_32768_345,T_16384_173,T_32768_347,T_8192_87,T_32768_349,T_16384_175,T_32768_351,T_1024_11,T_32768_353,T_16384_177,T_32768_355,T_8192_89,T_32768_357,T_16384_179,T_32768_359,T_40
96_45,T_32768_361,T_16384_181,T_32768_363,T_8192_91,T_32768_365,T_16384_183,T_32768_367,T_2048_23,T_32768_369,T_16384_185,T_32768_371,T_8192_93,T_32768_373,T_16384_187,T_32768_375,T_4096_47,T_32768_377,T_16384_189,T_32768_379,T_8192_95,T_32768_381,T_16384_191,T_32768_383,T_256_3,T_32768_385,T_16384_193,T_32768_387,T_8192_97,T_32768_389,T_16384_195,T_32768_391,T_4096_49,T_32768_393,T_16384_197,T_32768_395,T_8192_99,T_32768_397,T_16384_199,T_32768_399,T_2048_25,T_32768_401,T_16384_201,T_32768_403,T_8192_101,T_32768_405,T_16384_203,T_32768_407,T_4096_51,T_32768_409,T_16384_205,T_32768_411,T_8192_103,T_32768_413,T_16384_207,T_32768_415,T_1024_13,T_32768_417,T_16384_209,T_32768_419,T_8192_105,T_32768_421,T_16384_211,T_32768_423,T_4096_53,T_32768_425,T_16384_213,T_32768_427,T_8192_107,T_32768_429,T_16384_215,T_32768_431,T_2048_27,T_32768_433,T_16384_217,T_32768_435,T_8192_109,T_32768_437,T_16384_219,T_32768_439,T_4096_55,T_32768_441,T_16384_221,T_32768_443,T_8192_111,T_32768_445,T_16384_223,T_32768_447,T_512_7,T_32768_449,T_16384_225,T_32768_451,T_8192_113,T_32768_453,T_16384_227,T_32768_455,T_4096_57,T_32768_457,T_16384_229,T_32768_459,T_8192_115,T_32768_461,T_16384_231,T_32768_463,T_2048_29,T_32768_465,T_16384_233,T_32768_467,T_8192_117,T_32768_469,T_16384_235,T_32768_471,T_4096_59,T_32768_473,T_16384_237,T_32768_475,T_8192_119,T_32768_477,T_16384_239,T_32768_479,T_1024_15,T_32768_481,T_16384_241,T_32768_483,T_8192_121,T_32768_485,T_16384_243,T_32768_487,T_4096_61,T_32768_489,T_16384_245,T_32768_491,T_8192_123,T_32768_493,T_16384_247,T_32768_495,T_2048_31,T_32768_497,T_16384_249,T_32768_499,T_8192_125,T_32768_501,T_16384_251,T_32768_503,T_4096_63,T_32768_505,T_16384_253,T_32768_507,T_8192_127,T_32768_509,T_16384_255,T_32768_511,T_64_1,T_32768_513,T_16384_257,T_32768_515,T_8192_129,T_32768_517,T_16384_259,T_32768_519,T_4096_65,T_32768_521,T_16384_261,T_32768_523,T_8192_131,T_32768_525,T_16384_263,T_32768_527,T_2048_33,T_32768_529,T_16384_265,T_32768_531,T_8192_133,T_3276
8_533,T_16384_267,T_32768_535,T_4096_67,T_32768_537,T_16384_269,T_32768_539,T_8192_135,T_32768_541,T_16384_271,T_32768_543,T_1024_17,T_32768_545,T_16384_273,T_32768_547,T_8192_137,T_32768_549,T_16384_275,T_32768_551,T_4096_69,T_32768_553,T_16384_277,T_32768_555,T_8192_139,T_32768_557,T_16384_279,T_32768_559,T_2048_35,T_32768_561,T_16384_281,T_32768_563,T_8192_141,T_32768_565,T_16384_283,T_32768_567,T_4096_71,T_32768_569,T_16384_285,T_32768_571,T_8192_143,T_32768_573,T_16384_287,T_32768_575,T_512_9,T_32768_577,T_16384_289,T_32768_579,T_8192_145,T_32768_581,T_16384_291,T_32768_583,T_4096_73,T_32768_585,T_16384_293,T_32768_587,T_8192_147,T_32768_589,T_16384_295,T_32768_591,T_2048_37,T_32768_593,T_16384_297,T_32768_595,T_8192_149,T_32768_597,T_16384_299,T_32768_599,T_4096_75,T_32768_601,T_16384_301,T_32768_603,T_8192_151,T_32768_605,T_16384_303,T_32768_607,T_1024_19,T_32768_609,T_16384_305,T_32768_611,T_8192_153,T_32768_613,T_16384_307,T_32768_615,T_4096_77,T_32768_617,T_16384_309,T_32768_619,T_8192_155,T_32768_621,T_16384_311,T_32768_623,T_2048_39,T_32768_625,T_16384_313,T_32768_627,T_8192_157,T_32768_629,T_16384_315,T_32768_631,T_4096_79,T_32768_633,T_16384_317,T_32768_635,T_8192_159,T_32768_637,T_16384_319,T_32768_639,T_256_5,T_32768_641,T_16384_321,T_32768_643,T_8192_161,T_32768_645,T_16384_323,T_32768_647,T_4096_81,T_32768_649,T_16384_325,T_32768_651,T_8192_163,T_32768_653,T_16384_327,T_32768_655,T_2048_41,T_32768_657,T_16384_329,T_32768_659,T_8192_165,T_32768_661,T_16384_331,T_32768_663,T_4096_83,T_32768_665,T_16384_333,T_32768_667,T_8192_167,T_32768_669,T_16384_335,T_32768_671,T_1024_21,T_32768_673,T_16384_337,T_32768_675,T_8192_169,T_32768_677,T_16384_339,T_32768_679,T_4096_85,T_32768_681,T_16384_341,T_32768_683,T_8192_171,T_32768_685,T_16384_343,T_32768_687,T_2048_43,T_32768_689,T_16384_345,T_32768_691,T_8192_173,T_32768_693,T_16384_347,T_32768_695,T_4096_87,T_32768_697,T_16384_349,T_32768_699,T_8192_175,T_32768_701,T_16384_351,T_32768_703,T_512_11,T_32768_705,
T_16384_353,T_32768_707,T_8192_177,T_32768_709,T_16384_355,T_32768_711,T_4096_89,T_32768_713,T_16384_357,T_32768_715,T_8192_179,T_32768_717,T_16384_359,T_32768_719,T_2048_45,T_32768_721,T_16384_361,T_32768_723,T_8192_181,T_32768_725,T_16384_363,T_32768_727,T_4096_91,T_32768_729,T_16384_365,T_32768_731,T_8192_183,T_32768_733,T_16384_367,T_32768_735,T_1024_23,T_32768_737,T_16384_369,T_32768_739,T_8192_185,T_32768_741,T_16384_371,T_32768_743,T_4096_93,T_32768_745,T_16384_373,T_32768_747,T_8192_187,T_32768_749,T_16384_375,T_32768_751,T_2048_47,T_32768_753,T_16384_377,T_32768_755,T_8192_189,T_32768_757,T_16384_379,T_32768_759,T_4096_95,T_32768_761,T_16384_381,T_32768_763,T_8192_191,T_32768_765,T_16384_383,T_32768_767,T_128_3,T_32768_769,T_16384_385,T_32768_771,T_8192_193,T_32768_773,T_16384_387,T_32768_775,T_4096_97,T_32768_777,T_16384_389,T_32768_779,T_8192_195,T_32768_781,T_16384_391,T_32768_783,T_2048_49,T_32768_785,T_16384_393,T_32768_787,T_8192_197,T_32768_789,T_16384_395,T_32768_791,T_4096_99,T_32768_793,T_16384_397,T_32768_795,T_8192_199,T_32768_797,T_16384_399,T_32768_799,T_1024_25,T_32768_801,T_16384_401,T_32768_803,T_8192_201,T_32768_805,T_16384_403,T_32768_807,T_4096_101,T_32768_809,T_16384_405,T_32768_811,T_8192_203,T_32768_813,T_16384_407,T_32768_815,T_2048_51,T_32768_817,T_16384_409,T_32768_819,T_8192_205,T_32768_821,T_16384_411,T_32768_823,T_4096_103,T_32768_825,T_16384_413,T_32768_827,T_8192_207,T_32768_829,T_16384_415,T_32768_831,T_512_13,T_32768_833,T_16384_417,T_32768_835,T_8192_209,T_32768_837,T_16384_419,T_32768_839,T_4096_105,T_32768_841,T_16384_421,T_32768_843,T_8192_211,T_32768_845,T_16384_423,T_32768_847,T_2048_53,T_32768_849,T_16384_425,T_32768_851,T_8192_213,T_32768_853,T_16384_427,T_32768_855,T_4096_107,T_32768_857,T_16384_429,T_32768_859,T_8192_215,T_32768_861,T_16384_431,T_32768_863,T_1024_27,T_32768_865,T_16384_433,T_32768_867,T_8192_217,T_32768_869,T_16384_435,T_32768_871,T_4096_109,T_32768_873,T_16384_437,T_32768_875,T_8192_219,T_32768_87
7,T_16384_439,T_32768_879,T_2048_55,T_32768_881,T_16384_441,T_32768_883,T_8192_221,T_32768_885,T_16384_443,T_32768_887,T_4096_111,T_32768_889,T_16384_445,T_32768_891,T_8192_223,T_32768_893,T_16384_447,T_32768_895,T_256_7,T_32768_897,T_16384_449,T_32768_899,T_8192_225,T_32768_901,T_16384_451,T_32768_903,T_4096_113,T_32768_905,T_16384_453,T_32768_907,T_8192_227,T_32768_909,T_16384_455,T_32768_911,T_2048_57,T_32768_913,T_16384_457,T_32768_915,T_8192_229,T_32768_917,T_16384_459,T_32768_919,T_4096_115,T_32768_921,T_16384_461,T_32768_923,T_8192_231,T_32768_925,T_16384_463,T_32768_927,T_1024_29,T_32768_929,T_16384_465,T_32768_931,T_8192_233,T_32768_933,T_16384_467,T_32768_935,T_4096_117,T_32768_937,T_16384_469,T_32768_939,T_8192_235,T_32768_941,T_16384_471,T_32768_943,T_2048_59,T_32768_945,T_16384_473,T_32768_947,T_8192_237,T_32768_949,T_16384_475,T_32768_951,T_4096_119,T_32768_953,T_16384_477,T_32768_955,T_8192_239,T_32768_957,T_16384_479,T_32768_959,T_512_15,T_32768_961,T_16384_481,T_32768_963,T_8192_241,T_32768_965,T_16384_483,T_32768_967,T_4096_121,T_32768_969,T_16384_485,T_32768_971,T_8192_243,T_32768_973,T_16384_487,T_32768_975,T_2048_61,T_32768_977,T_16384_489,T_32768_979,T_8192_245,T_32768_981,T_16384_491,T_32768_983,T_4096_123,T_32768_985,T_16384_493,T_32768_987,T_8192_247,T_32768_989,T_16384_495,T_32768_991,T_1024_31,T_32768_993,T_16384_497,T_32768_995,T_8192_249,T_32768_997,T_16384_499,T_32768_999,T_4096_125,T_32768_1001,T_16384_501,T_32768_1003,T_8192_251,T_32768_1005,T_16384_503,T_32768_1007,T_2048_63,T_32768_1009,T_16384_505,T_32768_1011,T_8192_253,T_32768_1013,T_16384_507,T_32768_1015,T_4096_127,T_32768_1017,T_16384_509,T_32768_1019,T_8192_255,T_32768_1021,T_16384_511,T_32768_1023,T_2_0,T_32768_17,T_16384_17,T_32768_51,T_8192_17,T_32768_85,T_16384_51,T_32768_119,T_4096_17,T_32768_153,T_16384_85,T_32768_187,T_8192_51,T_32768_221,T_16384_119,T_32768_255,T_2048_17,T_32768_289,T_16384_153,T_32768_323,T_8192_85,T_32768_357,T_16384_187,T_32768_391,T_4096_51,T_3276
8_425,T_16384_221,T_32768_459,T_8192_119,T_32768_493,T_16384_255,T_32768_527,T_1024_17,T_32768_561,T_16384_289,T_32768_595,T_8192_153,T_32768_629,T_16384_323,T_32768_663,T_4096_85,T_32768_697,T_16384_357,T_32768_731,T_8192_187,T_32768_765,T_16384_391,T_32768_799,T_2048_51,T_32768_833,T_16384_425,T_32768_867,T_8192_221,T_32768_901,T_16384_459,T_32768_935,T_4096_119,T_32768_969,T_16384_493,T_32768_1003,T_8192_255,T_32768_1037,T_16384_527,T_32768_1071,T_512_17,T_32768_1105,T_16384_561,T_32768_1139,T_8192_289,T_32768_1173,T_16384_595,T_32768_1207,T_4096_153,T_32768_1241,T_16384_629,T_32768_1275,T_8192_323,T_32768_1309,T_16384_663,T_32768_1343,T_2048_85,T_32768_1377,T_16384_697,T_32768_1411,T_8192_357,T_32768_1445,T_16384_731,T_32768_1479,T_4096_187,T_32768_1513,T_16384_765,T_32768_1547,T_8192_391,T_32768_1581,T_16384_799,T_32768_1615,T_1024_51,T_32768_1649,T_16384_833,T_32768_1683,T_8192_425,T_32768_1717,T_16384_867,T_32768_1751,T_4096_221,T_32768_1785,T_16384_901,T_32768_1819,T_8192_459,T_32768_1853,T_16384_935,T_32768_1887,T_2048_119,T_32768_1921,T_16384_969,T_32768_1955,T_8192_493,T_32768_1989,T_16384_1003,T_32768_2023,T_4096_255,T_32768_2057,T_16384_1037,T_32768_2091,T_8192_527,T_32768_2125,T_16384_1071,T_32768_2159,T_256_17,T_32768_2193,T_16384_1105,T_32768_2227,T_8192_561,T_32768_2261,T_16384_1139,T_32768_2295,T_4096_289,T_32768_2329,T_16384_1173,T_32768_2363,T_8192_595,T_32768_2397,T_16384_1207,T_32768_2431,T_2048_153,T_32768_2465,T_16384_1241,T_32768_2499,T_8192_629,T_32768_2533,T_16384_1275,T_32768_2567,T_4096_323,T_32768_2601,T_16384_1309,T_32768_2635,T_8192_663,T_32768_2669,T_16384_1343,T_32768_2703,T_1024_85,T_32768_2737,T_16384_1377,T_32768_2771,T_8192_697,T_32768_2805,T_16384_1411,T_32768_2839,T_4096_357,T_32768_2873,T_16384_1445,T_32768_2907,T_8192_731,T_32768_2941,T_16384_1479,T_32768_2975,T_2048_187,T_32768_3009,T_16384_1513,T_32768_3043,T_8192_765,T_32768_3077,T_16384_1547,T_32768_3111,T_4096_391,T_32768_3145,T_16384_1581,T_32768_3179,T_8192_799,T_3276
8_3213,T_16384_1615,T_32768_3247,T_512_51,T_32768_3281,T_16384_1649,T_32768_3315,T_8192_833,T_32768_3349,T_16384_1683,T_32768_3383,T_4096_425,T_32768_3417,T_16384_1717,T_32768_3451,T_8192_867,T_32768_3485,T_16384_1751,T_32768_3519,T_2048_221,T_32768_3553,T_16384_1785,T_32768_3587,T_8192_901,T_32768_3621,T_16384_1819,T_32768_3655,T_4096_459,T_32768_3689,T_16384_1853,T_32768_3723,T_8192_935,T_32768_3757,T_16384_1887,T_32768_3791,T_1024_119,T_32768_3825,T_16384_1921,T_32768_3859,T_8192_969,T_32768_3893,T_16384_1955,T_32768_3927,T_4096_493,T_32768_3961,T_16384_1989,T_32768_3995,T_8192_1003,T_32768_4029,T_16384_2023,T_32768_4063,T_2048_255,T_32768_4097,T_16384_2057,T_32768_4131,T_8192_1037,T_32768_4165,T_16384_2091,T_32768_4199,T_4096_527,T_32768_4233,T_16384_2125,T_32768_4267,T_8192_1071,T_32768_4301,T_16384_2159,T_32768_4335,T_128_17,T_32768_4369,T_16384_2193,T_32768_4403,T_8192_1105,T_32768_4437,T_16384_2227,T_32768_4471,T_4096_561,T_32768_4505,T_16384_2261,T_32768_4539,T_8192_1139,T_32768_4573,T_16384_2295,T_32768_4607,T_2048_289,T_32768_4641,T_16384_2329,T_32768_4675,T_8192_1173,T_32768_4709,T_16384_2363,T_32768_4743,T_4096_595,T_32768_4777,T_16384_2397,T_32768_4811,T_8192_1207,T_32768_4845,T_16384_2431,T_32768_4879,T_1024_153,T_32768_4913,T_16384_2465,T_32768_4947,T_8192_1241,T_32768_4981,T_16384_2499,T_32768_5015,T_4096_629,T_32768_5049,T_16384_2533,T_32768_5083,T_8192_1275,T_32768_5117,T_16384_2567,T_32768_5151,T_2048_323,T_32768_5185,T_16384_2601,T_32768_5219,T_8192_1309,T_32768_5253,T_16384_2635,T_32768_5287,T_4096_663,T_32768_5321,T_16384_2669,T_32768_5355,T_8192_1343,T_32768_5389,T_16384_2703,T_32768_5423,T_512_85,T_32768_5457,T_16384_2737,T_32768_5491,T_8192_1377,T_32768_5525,T_16384_2771,T_32768_5559,T_4096_697,T_32768_5593,T_16384_2805,T_32768_5627,T_8192_1411,T_32768_5661,T_16384_2839,T_32768_5695,T_2048_357,T_32768_5729,T_16384_2873,T_32768_5763,T_8192_1445,T_32768_5797,T_16384_2907,T_32768_5831,T_4096_731,T_32768_5865,T_16384_2941,T_32768_5899,T_8192_14
79,T_32768_5933,T_16384_2975,T_32768_5967,T_1024_187,T_32768_6001,T_16384_3009,T_32768_6035,T_8192_1513,T_32768_6069,T_16384_3043,T_32768_6103,T_4096_765,T_32768_6137,T_16384_3077,T_32768_6171,T_8192_1547,T_32768_6205,T_16384_3111,T_32768_6239,T_2048_391,T_32768_6273,T_16384_3145,T_32768_6307,T_8192_1581,T_32768_6341,T_16384_3179,T_32768_6375,T_4096_799,T_32768_6409,T_16384_3213,T_32768_6443,T_8192_1615,T_32768_6477,T_16384_3247,T_32768_6511,T_256_51,T_32768_6545,T_16384_3281,T_32768_6579,T_8192_1649,T_32768_6613,T_16384_3315,T_32768_6647,T_4096_833,T_32768_6681,T_16384_3349,T_32768_6715,T_8192_1683,T_32768_6749,T_16384_3383,T_32768_6783,T_2048_425,T_32768_6817,T_16384_3417,T_32768_6851,T_8192_1717,T_32768_6885,T_16384_3451,T_32768_6919,T_4096_867,T_32768_6953,T_16384_3485,T_32768_6987,T_8192_1751,T_32768_7021,T_16384_3519,T_32768_7055,T_1024_221,T_32768_7089,T_16384_3553,T_32768_7123,T_8192_1785,T_32768_7157,T_16384_3587,T_32768_7191,T_4096_901,T_32768_7225,T_16384_3621,T_32768_7259,T_8192_1819,T_32768_7293,T_16384_3655,T_32768_7327,T_2048_459,T_32768_7361,T_16384_3689,T_32768_7395,T_8192_1853,T_32768_7429,T_16384_3723,T_32768_7463,T_4096_935,T_32768_7497,T_16384_3757,T_32768_7531,T_8192_1887,T_32768_7565,T_16384_3791,T_32768_7599,T_512_119,T_32768_7633,T_16384_3825,T_32768_7667,T_8192_1921,T_32768_7701,T_16384_3859,T_32768_7735,T_4096_969,T_32768_7769,T_16384_3893,T_32768_7803,T_8192_1955,T_32768_7837,T_16384_3927,T_32768_7871,T_2048_493,T_32768_7905,T_16384_3961,T_32768_7939,T_8192_1989,T_32768_7973,T_16384_3995,T_32768_8007,T_4096_1003,T_32768_8041,T_16384_4029,T_32768_8075,T_8192_2023,T_32768_8109,T_16384_4063,T_32768_8143,T_1024_255,T_32768_8177,T_16384_4097,T_32768_8211,T_8192_2057,T_32768_8245,T_16384_4131,T_32768_8279,T_4096_1037,T_32768_8313,T_16384_4165,T_32768_8347,T_8192_2091,T_32768_8381,T_16384_4199,T_32768_8415,T_2048_527,T_32768_8449,T_16384_4233,T_32768_8483,T_8192_2125,T_32768_8517,T_16384_4267,T_32768_8551,T_4096_1071,T_32768_8585,T_16384_4301,T_
32768_8619,T_8192_2159,T_32768_8653,T_16384_4335,T_32768_8687,T_64_17,T_32768_8721,T_16384_4369,T_32768_8755,T_8192_2193,T_32768_8789,T_16384_4403,T_32768_8823,T_4096_1105,T_32768_8857,T_16384_4437,T_32768_8891,T_8192_2227,T_32768_8925,T_16384_4471,T_32768_8959,T_2048_561,T_32768_8993,T_16384_4505,T_32768_9027,T_8192_2261,T_32768_9061,T_16384_4539,T_32768_9095,T_4096_1139,T_32768_9129,T_16384_4573,T_32768_9163,T_8192_2295,T_32768_9197,T_16384_4607,T_32768_9231,T_1024_289,T_32768_9265,T_16384_4641,T_32768_9299,T_8192_2329,T_32768_9333,T_16384_4675,T_32768_9367,T_4096_1173,T_32768_9401,T_16384_4709,T_32768_9435,T_8192_2363,T_32768_9469,T_16384_4743,T_32768_9503,T_2048_595,T_32768_9537,T_16384_4777,T_32768_9571,T_8192_2397,T_32768_9605,T_16384_4811,T_32768_9639,T_4096_1207,T_32768_9673,T_16384_4845,T_32768_9707,T_8192_2431,T_32768_9741,T_16384_4879,T_32768_9775,T_512_153,T_32768_9809,T_16384_4913,T_32768_9843,T_8192_2465,T_32768_9877,T_16384_4947,T_32768_9911,T_4096_1241,T_32768_9945,T_16384_4981,T_32768_9979,T_8192_2499,T_32768_10013,T_16384_5015,T_32768_10047,T_2048_629,T_32768_10081,T_16384_5049,T_32768_10115,T_8192_2533,T_32768_10149,T_16384_5083,T_32768_10183,T_4096_1275,T_32768_10217,T_16384_5117,T_32768_10251,T_8192_2567,T_32768_10285,T_16384_5151,T_32768_10319,T_1024_323,T_32768_10353,T_16384_5185,T_32768_10387,T_8192_2601,T_32768_10421,T_16384_5219,T_32768_10455,T_4096_1309,T_32768_10489,T_16384_5253,T_32768_10523,T_8192_2635,T_32768_10557,T_16384_5287,T_32768_10591,T_2048_663,T_32768_10625,T_16384_5321,T_32768_10659,T_8192_2669,T_32768_10693,T_16384_5355,T_32768_10727,T_4096_1343,T_32768_10761,T_16384_5389,T_32768_10795,T_8192_2703,T_32768_10829,T_16384_5423,T_32768_10863,T_256_85,T_32768_10897,T_16384_5457,T_32768_10931,T_8192_2737,T_32768_10965,T_16384_5491,T_32768_10999,T_4096_1377,T_32768_11033,T_16384_5525,T_32768_11067,T_8192_2771,T_32768_11101,T_16384_5559,T_32768_11135,T_2048_697,T_32768_11169,T_16384_5593,T_32768_11203,T_8192_2805,T_32768_11237,T_163
84_5627,T_32768_11271,T_4096_1411,T_32768_11305,T_16384_5661,T_32768_11339,T_8192_2839,T_32768_11373,T_16384_5695,T_32768_11407,T_1024_357,T_32768_11441,T_16384_5729,T_32768_11475,T_8192_2873,T_32768_11509,T_16384_5763,T_32768_11543,T_4096_1445,T_32768_11577,T_16384_5797,T_32768_11611,T_8192_2907,T_32768_11645,T_16384_5831,T_32768_11679,T_2048_731,T_32768_11713,T_16384_5865,T_32768_11747,T_8192_2941,T_32768_11781,T_16384_5899,T_32768_11815,T_4096_1479,T_32768_11849,T_16384_5933,T_32768_11883,T_8192_2975,T_32768_11917,T_16384_5967,T_32768_11951,T_512_187,T_32768_11985,T_16384_6001,T_32768_12019,T_8192_3009,T_32768_12053,T_16384_6035,T_32768_12087,T_4096_1513,T_32768_12121,T_16384_6069,T_32768_12155,T_8192_3043,T_32768_12189,T_16384_6103,T_32768_12223,T_2048_765,T_32768_12257,T_16384_6137,T_32768_12291,T_8192_3077,T_32768_12325,T_16384_6171,T_32768_12359,T_4096_1547,T_32768_12393,T_16384_6205,T_32768_12427,T_8192_3111,T_32768_12461,T_16384_6239,T_32768_12495,T_1024_391,T_32768_12529,T_16384_6273,T_32768_12563,T_8192_3145,T_32768_12597,T_16384_6307,T_32768_12631,T_4096_1581,T_32768_12665,T_16384_6341,T_32768_12699,T_8192_3179,T_32768_12733,T_16384_6375,T_32768_12767,T_2048_799,T_32768_12801,T_16384_6409,T_32768_12835,T_8192_3213,T_32768_12869,T_16384_6443,T_32768_12903,T_4096_1615,T_32768_12937,T_16384_6477,T_32768_12971,T_8192_3247,T_32768_13005,T_16384_6511,T_32768_13039,T_128_51,T_32768_13073,T_16384_6545,T_32768_13107,T_8192_3281,T_32768_13141,T_16384_6579,T_32768_13175,T_4096_1649,T_32768_13209,T_16384_6613,T_32768_13243,T_8192_3315,T_32768_13277,T_16384_6647,T_32768_13311,T_2048_833,T_32768_13345,T_16384_6681,T_32768_13379,T_8192_3349,T_32768_13413,T_16384_6715,T_32768_13447,T_4096_1683,T_32768_13481,T_16384_6749,T_32768_13515,T_8192_3383,T_32768_13549,T_16384_6783,T_32768_13583,T_1024_425,T_32768_13617,T_16384_6817,T_32768_13651,T_8192_3417,T_32768_13685,T_16384_6851,T_32768_13719,T_4096_1717,T_32768_13753,T_16384_6885,T_32768_13787,T_8192_3451,T_32768_13821,T_1
6384_6919,T_32768_13855,T_2048_867,T_32768_13889,T_16384_6953,T_32768_13923,T_8192_3485,T_32768_13957,T_16384_6987,T_32768_13991,T_4096_1751,T_32768_14025,T_16384_7021,T_32768_14059,T_8192_3519,T_32768_14093,T_16384_7055,T_32768_14127,T_512_221,T_32768_14161,T_16384_7089,T_32768_14195,T_8192_3553,T_32768_14229,T_16384_7123,T_32768_14263,T_4096_1785,T_32768_14297,T_16384_7157,T_32768_14331,T_8192_3587,T_32768_14365,T_16384_7191,T_32768_14399,T_2048_901,T_32768_14433,T_16384_7225,T_32768_14467,T_8192_3621,T_32768_14501,T_16384_7259,T_32768_14535,T_4096_1819,T_32768_14569,T_16384_7293,T_32768_14603,T_8192_3655,T_32768_14637,T_16384_7327,T_32768_14671,T_1024_459,T_32768_14705,T_16384_7361,T_32768_14739,T_8192_3689,T_32768_14773,T_16384_7395,T_32768_14807,T_4096_1853,T_32768_14841,T_16384_7429,T_32768_14875,T_8192_3723,T_32768_14909,T_16384_7463,T_32768_14943,T_2048_935,T_32768_14977,T_16384_7497,T_32768_15011,T_8192_3757,T_32768_15045,T_16384_7531,T_32768_15079,T_4096_1887,T_32768_15113,T_16384_7565,T_32768_15147,T_8192_3791,T_32768_15181,T_16384_7599,T_32768_15215,T_256_119,T_32768_15249,T_16384_7633,T_32768_15283,T_8192_3825,T_32768_15317,T_16384_7667,T_32768_15351,T_4096_1921,T_32768_15385,T_16384_7701,T_32768_15419,T_8192_3859,T_32768_15453,T_16384_7735,T_32768_15487,T_2048_969,T_32768_15521,T_16384_7769,T_32768_15555,T_8192_3893,T_32768_15589,T_16384_7803,T_32768_15623,T_4096_1955,T_32768_15657,T_16384_7837,T_32768_15691,T_8192_3927,T_32768_15725,T_16384_7871,T_32768_15759,T_1024_493,T_32768_15793,T_16384_7905,T_32768_15827,T_8192_3961,T_32768_15861,T_16384_7939,T_32768_15895,T_4096_1989,T_32768_15929,T_16384_7973,T_32768_15963,T_8192_3995,T_32768_15997,T_16384_8007,T_32768_16031,T_2048_1003,T_32768_16065,T_16384_8041,T_32768_16099,T_8192_4029,T_32768_16133,T_16384_8075,T_32768_16167,T_4096_2023,T_32768_16201,T_16384_8109,T_32768_16235,T_8192_4063,T_32768_16269,T_16384_8143,T_32768_16303,T_512_255,T_32768_16337,T_16384_8177,T_32768_16371,T_8192_4097,T_32768_16405,T
_16384_8211,T_32768_16439,T_4096_2057,T_32768_16473,T_16384_8245,T_32768_16507,T_8192_4131,T_32768_16541,T_16384_8279,T_32768_16575,T_2048_1037,T_32768_16609,T_16384_8313,T_32768_16643,T_8192_4165,T_32768_16677,T_16384_8347,T_32768_16711,T_4096_2091,T_32768_16745,T_16384_8381,T_32768_16779,T_8192_4199,T_32768_16813,T_16384_8415,T_32768_16847,T_1024_527,T_32768_16881,T_16384_8449,T_32768_16915,T_8192_4233,T_32768_16949,T_16384_8483,T_32768_16983,T_4096_2125,T_32768_17017,T_16384_8517,T_32768_17051,T_8192_4267,T_32768_17085,T_16384_8551,T_32768_17119,T_2048_1071,T_32768_17153,T_16384_8585,T_32768_17187,T_8192_4301,T_32768_17221,T_16384_8619,T_32768_17255,T_4096_2159,T_32768_17289,T_16384_8653,T_32768_17323,T_8192_4335,T_32768_17357,T_16384_8687,T_32768_17391 +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64.hpp.inc new file mode 100644 index 0000000000000..c29fd2227e9b5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64.hpp.inc @@ -0,0 +1,29 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#ifndef CUFFTDX_DATABASE_LUT_FP64_INC_HPP + +#define CUFFTDX_DATABASE_LUT_FP64_INC_HPP + +#include "lut_defines.hpp.inc" + +#ifdef _MSC_VER +// truncation of twiddle values is intended +#pragma warning(disable : 4305) +#pragma warning(disable : 4838) +#endif + + + +#include "lut_fp64_0.hpp.inc" + + + +#endif diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64_0.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64_0.hpp.inc new file mode 100644 index 0000000000000..70f61dd4966dc --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/lut_fp64_0.hpp.inc @@ -0,0 +1,322 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +static const __device__ double2 lut_dp_2_2[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_3_3[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_2_4[2*2] = { + T_2_0,T_4_1,T_2_0,T_2_1 +}; +static const __device__ double2 lut_dp_4_4[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_5_5[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_6_6[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_7_7[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_2_8[4*2] = { + T_2_0,T_8_1,T_4_1,T_8_3,T_2_0,T_4_1,T_2_1,T_4_3 +}; +static const __device__ double2 lut_dp_4_8[2*2] = { + T_2_0,T_8_1,T_2_0,T_8_3 +}; +static const __device__ double2 lut_dp_8_8[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_3_9[3*2] = { + T_2_0,T_9_1,T_9_2,T_2_0,T_9_2,T_9_4 +}; +static const __device__ double2 lut_dp_9_9[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_10_10[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_11_11[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_12_12[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_13_13[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_14_14[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_15_15[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_2_16[8*2] = { + T_2_0,T_16_1,T_8_1,T_16_3,T_4_1,T_16_5,T_8_3,T_16_7,T_2_0,T_8_1,T_4_1,T_8_3,T_2_1,T_8_5,T_4_3,T_8_7 +}; +static const __device__ double2 lut_dp_4_16[4*2] = { + T_2_0,T_16_1,T_8_1,T_16_3,T_2_0,T_16_3,T_8_3,T_16_9 +}; +static const __device__ double2 lut_dp_8_16[2*2] = { + T_2_0,T_16_1,T_2_0,T_16_5 +}; +static const __device__ double2 lut_dp_16_16[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_17_17[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_18_18[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 
lut_dp_19_19[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_20_20[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_21_21[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_22_22[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_23_23[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_24_24[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_5_25[5*2] = { + T_2_0,T_25_1,T_25_2,T_25_3,T_25_4,T_2_0,T_25_3,T_25_6,T_25_9,T_25_12 +}; +static const __device__ double2 lut_dp_25_25[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_26_26[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_3_27[9*2] = { + T_2_0,T_27_1,T_27_2,T_9_1,T_27_4,T_27_5,T_9_2,T_27_7,T_27_8,T_2_0,T_27_2,T_27_4,T_9_2,T_27_8,T_27_10,T_9_4,T_27_14,T_27_16 +}; +static const __device__ double2 lut_dp_9_27[3*2] = { + T_2_0,T_27_1,T_27_2,T_2_0,T_27_5,T_27_10 +}; +static const __device__ double2 lut_dp_27_27[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_28_28[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_29_29[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_30_30[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_31_31[1*2] = { + T_2_0,T_2_0 +}; +static const __device__ double2 lut_dp_2_32[16*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_4_1,T_32_9,T_16_5,T_32_11,T_8_3,T_32_13,T_16_7,T_32_15,T_2_0,T_16_1,T_8_1,T_16_3,T_4_1,T_16_5,T_8_3,T_16_7,T_2_1,T_16_9,T_8_5,T_16_11,T_4_3,T_16_13,T_8_7,T_16_15 +}; +static const __device__ double2 lut_dp_4_32[8*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_2_0,T_32_3,T_16_3,T_32_9,T_8_3,T_32_15,T_16_9,T_32_21 +}; +static const __device__ double2 lut_dp_8_32[4*2] = { + T_2_0,T_32_1,T_16_1,T_32_3,T_2_0,T_32_5,T_16_5,T_32_15 +}; +static const __device__ double2 lut_dp_16_32[2*2] = { + T_2_0,T_32_1,T_2_0,T_32_9 +}; +static const 
__device__ double2 lut_dp_6_36[6*2] = { + T_2_0,T_36_1,T_18_1,T_12_1,T_9_1,T_36_5,T_2_0,T_9_1,T_9_2,T_3_1,T_9_4,T_9_5 +}; +static const __device__ double2 lut_dp_7_49[7*2] = { + T_2_0,T_49_1,T_49_2,T_49_3,T_49_4,T_49_5,T_49_6,T_2_0,T_49_4,T_49_8,T_49_12,T_49_16,T_49_20,T_49_24 +}; +static const __device__ double2 lut_dp_2_64[32*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_4_1,T_64_17,T_32_9,T_64_19,T_16_5,T_64_21,T_32_11,T_64_23,T_8_3,T_64_25,T_32_13,T_64_27,T_16_7,T_64_29,T_32_15,T_64_31,T_2_0,T_32_1,T_16_1,T_32_3,T_8_1,T_32_5,T_16_3,T_32_7,T_4_1,T_32_9,T_16_5,T_32_11,T_8_3,T_32_13,T_16_7,T_32_15,T_2_1,T_32_17,T_16_9,T_32_19,T_8_5,T_32_21,T_16_11,T_32_23,T_4_3,T_32_25,T_16_13,T_32_27,T_8_7,T_32_29,T_16_15,T_32_31 +}; +static const __device__ double2 lut_dp_4_64[16*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_2_0,T_64_3,T_32_3,T_64_9,T_16_3,T_64_15,T_32_9,T_64_21,T_8_3,T_64_27,T_32_15,T_64_33,T_16_9,T_64_39,T_32_21,T_64_45 +}; +static const __device__ double2 lut_dp_8_64[8*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_2_0,T_64_5,T_32_5,T_64_15,T_16_5,T_64_25,T_32_15,T_64_35 +}; +static const __device__ double2 lut_dp_16_64[4*2] = { + T_2_0,T_64_1,T_32_1,T_64_3,T_2_0,T_64_9,T_32_9,T_64_27 +}; +static const __device__ double2 lut_dp_3_81[27*2] = { + T_2_0,T_81_1,T_81_2,T_27_1,T_81_4,T_81_5,T_27_2,T_81_7,T_81_8,T_9_1,T_81_10,T_81_11,T_27_4,T_81_13,T_81_14,T_27_5,T_81_16,T_81_17,T_9_2,T_81_19,T_81_20,T_27_7,T_81_22,T_81_23,T_27_8,T_81_25,T_81_26,T_2_0,T_81_2,T_81_4,T_27_2,T_81_8,T_81_10,T_27_4,T_81_14,T_81_16,T_9_2,T_81_20,T_81_22,T_27_8,T_81_26,T_81_28,T_27_10,T_81_32,T_81_34,T_9_4,T_81_38,T_81_40,T_27_14,T_81_44,T_81_46,T_27_16,T_81_50,T_81_52 +}; +static const __device__ double2 lut_dp_9_81[9*2] = { + 
T_2_0,T_81_1,T_81_2,T_27_1,T_81_4,T_81_5,T_27_2,T_81_7,T_81_8,T_2_0,T_81_5,T_81_10,T_27_5,T_81_20,T_81_25,T_27_10,T_81_35,T_81_40 +}; +static const __device__ double2 lut_dp_27_81[3*2] = { + T_2_0,T_81_1,T_81_2,T_2_0,T_81_14,T_81_28 +}; +static const __device__ double2 lut_dp_10_100[10*2] = { + T_2_0,T_100_1,T_50_1,T_100_3,T_25_1,T_20_1,T_50_3,T_100_7,T_25_2,T_100_9,T_2_0,T_50_3,T_25_3,T_50_9,T_25_6,T_10_3,T_25_9,T_50_21,T_25_12,T_50_27 +}; +static const __device__ double2 lut_dp_11_121[11*2] = { + T_2_0,T_121_1,T_121_2,T_121_3,T_121_4,T_121_5,T_121_6,T_121_7,T_121_8,T_121_9,T_121_10,T_2_0,T_121_6,T_121_12,T_121_18,T_121_24,T_121_30,T_121_36,T_121_42,T_121_48,T_121_54,T_121_60 +}; +static const __device__ double2 lut_dp_5_125[25*2] = { + T_2_0,T_125_1,T_125_2,T_125_3,T_125_4,T_25_1,T_125_6,T_125_7,T_125_8,T_125_9,T_25_2,T_125_11,T_125_12,T_125_13,T_125_14,T_25_3,T_125_16,T_125_17,T_125_18,T_125_19,T_25_4,T_125_21,T_125_22,T_125_23,T_125_24,T_2_0,T_125_3,T_125_6,T_125_9,T_125_12,T_25_3,T_125_18,T_125_21,T_125_24,T_125_27,T_25_6,T_125_33,T_125_36,T_125_39,T_125_42,T_25_9,T_125_48,T_125_51,T_125_54,T_125_57,T_25_12,T_125_63,T_125_66,T_125_69,T_125_72 +}; +static const __device__ double2 lut_dp_25_125[5*2] = { + T_2_0,T_125_1,T_125_2,T_125_3,T_125_4,T_2_0,T_125_13,T_125_26,T_125_39,T_125_52 +}; +static const __device__ double2 lut_dp_2_128[64*2] = { + 
T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_4_1,T_128_33,T_64_17,T_128_35,T_32_9,T_128_37,T_64_19,T_128_39,T_16_5,T_128_41,T_64_21,T_128_43,T_32_11,T_128_45,T_64_23,T_128_47,T_8_3,T_128_49,T_64_25,T_128_51,T_32_13,T_128_53,T_64_27,T_128_55,T_16_7,T_128_57,T_64_29,T_128_59,T_32_15,T_128_61,T_64_31,T_128_63,T_2_0,T_64_1,T_32_1,T_64_3,T_16_1,T_64_5,T_32_3,T_64_7,T_8_1,T_64_9,T_32_5,T_64_11,T_16_3,T_64_13,T_32_7,T_64_15,T_4_1,T_64_17,T_32_9,T_64_19,T_16_5,T_64_21,T_32_11,T_64_23,T_8_3,T_64_25,T_32_13,T_64_27,T_16_7,T_64_29,T_32_15,T_64_31,T_2_1,T_64_33,T_32_17,T_64_35,T_16_9,T_64_37,T_32_19,T_64_39,T_8_5,T_64_41,T_32_21,T_64_43,T_16_11,T_64_45,T_32_23,T_64_47,T_4_3,T_64_49,T_32_25,T_64_51,T_16_13,T_64_53,T_32_27,T_64_55,T_8_7,T_64_57,T_32_29,T_64_59,T_16_15,T_64_61,T_32_31,T_64_63 +}; +static const __device__ double2 lut_dp_4_128[32*2] = { + T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_2_0,T_128_3,T_64_3,T_128_9,T_32_3,T_128_15,T_64_9,T_128_21,T_16_3,T_128_27,T_64_15,T_128_33,T_32_9,T_128_39,T_64_21,T_128_45,T_8_3,T_128_51,T_64_27,T_128_57,T_32_15,T_128_63,T_64_33,T_128_69,T_16_9,T_128_75,T_64_39,T_128_81,T_32_21,T_128_87,T_64_45,T_128_93 +}; +static const __device__ double2 lut_dp_8_128[16*2] = { + T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_2_0,T_128_5,T_64_5,T_128_15,T_32_5,T_128_25,T_64_15,T_128_35,T_16_5,T_128_45,T_64_25,T_128_55,T_32_15,T_128_65,T_64_35,T_128_75 +}; +static const __device__ double2 lut_dp_16_128[8*2] = { + 
T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_2_0,T_128_9,T_64_9,T_128_27,T_32_9,T_128_45,T_64_27,T_128_63 +}; +static const __device__ double2 lut_dp_12_144[12*2] = { + T_2_0,T_144_1,T_72_1,T_48_1,T_36_1,T_144_5,T_24_1,T_144_7,T_18_1,T_16_1,T_72_5,T_144_11,T_2_0,T_144_7,T_72_7,T_48_7,T_36_7,T_144_35,T_24_7,T_144_49,T_18_7,T_16_7,T_72_35,T_144_77 +}; +static const __device__ double2 lut_dp_6_216[36*2] = { + T_2_0,T_216_1,T_108_1,T_72_1,T_54_1,T_216_5,T_36_1,T_216_7,T_27_1,T_24_1,T_108_5,T_216_11,T_18_1,T_216_13,T_108_7,T_72_5,T_27_2,T_216_17,T_12_1,T_216_19,T_54_5,T_72_7,T_108_11,T_216_23,T_9_1,T_216_25,T_108_13,T_8_1,T_54_7,T_216_29,T_36_5,T_216_31,T_27_4,T_72_11,T_108_17,T_216_35,T_2_0,T_54_1,T_27_1,T_18_1,T_27_2,T_54_5,T_9_1,T_54_7,T_27_4,T_6_1,T_27_5,T_54_11,T_9_2,T_54_13,T_27_7,T_18_5,T_27_8,T_54_17,T_3_1,T_54_19,T_27_10,T_18_7,T_27_11,T_54_23,T_9_4,T_54_25,T_27_13,T_2_1,T_27_14,T_54_29,T_9_5,T_54_31,T_27_16,T_18_11,T_27_17,T_54_35 +}; +static const __device__ double2 lut_dp_3_243[81*2] = { + 
T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_27_1,T_243_10,T_243_11,T_81_4,T_243_13,T_243_14,T_81_5,T_243_16,T_243_17,T_27_2,T_243_19,T_243_20,T_81_7,T_243_22,T_243_23,T_81_8,T_243_25,T_243_26,T_9_1,T_243_28,T_243_29,T_81_10,T_243_31,T_243_32,T_81_11,T_243_34,T_243_35,T_27_4,T_243_37,T_243_38,T_81_13,T_243_40,T_243_41,T_81_14,T_243_43,T_243_44,T_27_5,T_243_46,T_243_47,T_81_16,T_243_49,T_243_50,T_81_17,T_243_52,T_243_53,T_9_2,T_243_55,T_243_56,T_81_19,T_243_58,T_243_59,T_81_20,T_243_61,T_243_62,T_27_7,T_243_64,T_243_65,T_81_22,T_243_67,T_243_68,T_81_23,T_243_70,T_243_71,T_27_8,T_243_73,T_243_74,T_81_25,T_243_76,T_243_77,T_81_26,T_243_79,T_243_80,T_2_0,T_243_2,T_243_4,T_81_2,T_243_8,T_243_10,T_81_4,T_243_14,T_243_16,T_27_2,T_243_20,T_243_22,T_81_8,T_243_26,T_243_28,T_81_10,T_243_32,T_243_34,T_27_4,T_243_38,T_243_40,T_81_14,T_243_44,T_243_46,T_81_16,T_243_50,T_243_52,T_9_2,T_243_56,T_243_58,T_81_20,T_243_62,T_243_64,T_81_22,T_243_68,T_243_70,T_27_8,T_243_74,T_243_76,T_81_26,T_243_80,T_243_82,T_81_28,T_243_86,T_243_88,T_27_10,T_243_92,T_243_94,T_81_32,T_243_98,T_243_100,T_81_34,T_243_104,T_243_106,T_9_4,T_243_110,T_243_112,T_81_38,T_243_116,T_243_118,T_81_40,T_243_122,T_243_124,T_27_14,T_243_128,T_243_130,T_81_44,T_243_134,T_243_136,T_81_46,T_243_140,T_243_142,T_27_16,T_243_146,T_243_148,T_81_50,T_243_152,T_243_154,T_81_52,T_243_158,T_243_160 +}; +static const __device__ double2 lut_dp_9_243[27*2] = { + T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_27_1,T_243_10,T_243_11,T_81_4,T_243_13,T_243_14,T_81_5,T_243_16,T_243_17,T_27_2,T_243_19,T_243_20,T_81_7,T_243_22,T_243_23,T_81_8,T_243_25,T_243_26,T_2_0,T_243_5,T_243_10,T_81_5,T_243_20,T_243_25,T_81_10,T_243_35,T_243_40,T_27_5,T_243_50,T_243_55,T_81_20,T_243_65,T_243_70,T_81_25,T_243_80,T_243_85,T_27_10,T_243_95,T_243_100,T_81_35,T_243_110,T_243_115,T_81_40,T_243_125,T_243_130 +}; +static const __device__ double2 lut_dp_27_243[9*2] = { + 
T_2_0,T_243_1,T_243_2,T_81_1,T_243_4,T_243_5,T_81_2,T_243_7,T_243_8,T_2_0,T_243_14,T_243_28,T_81_14,T_243_56,T_243_70,T_81_28,T_243_98,T_243_112 +}; +static const __device__ double2 lut_dp_2_256[128*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_4_1,T_256_65,T_128_33,T_256_67,T_64_17,T_256_69,T_128_35,T_256_71,T_32_9,T_256_73,T_128_37,T_256_75,T_64_19,T_256_77,T_128_39,T_256_79,T_16_5,T_256_81,T_128_41,T_256_83,T_64_21,T_256_85,T_128_43,T_256_87,T_32_11,T_256_89,T_128_45,T_256_91,T_64_23,T_256_93,T_128_47,T_256_95,T_8_3,T_256_97,T_128_49,T_256_99,T_64_25,T_256_101,T_128_51,T_256_103,T_32_13,T_256_105,T_128_53,T_256_107,T_64_27,T_256_109,T_128_55,T_256_111,T_16_7,T_256_113,T_128_57,T_256_115,T_64_29,T_256_117,T_128_59,T_256_119,T_32_15,T_256_121,T_128_61,T_256_123,T_64_31,T_256_125,T_128_63,T_256_127,T_2_0,T_128_1,T_64_1,T_128_3,T_32_1,T_128_5,T_64_3,T_128_7,T_16_1,T_128_9,T_64_5,T_128_11,T_32_3,T_128_13,T_64_7,T_128_15,T_8_1,T_128_17,T_64_9,T_128_19,T_32_5,T_128_21,T_64_11,T_128_23,T_16_3,T_128_25,T_64_13,T_128_27,T_32_7,T_128_29,T_64_15,T_128_31,T_4_1,T_128_33,T_64_17,T_128_35,T_32_9,T_128_37,T_64_19,T_128_39,T_16_5,T_128_41,T_64_21,T_128_43,T_32_11,T_128_45,T_64_23,T_128_47,T_8_3,T_128_49,T_64_25,T_128_51,T_32_13,T_128_53,T_64_27,T_128_55,T_16_7,T_128_57,T_64_29,T_128_59,T_32_15,T_128_61,T_64_31,T_128_63,T_2_1,T_128_65,T_64_33,T_128_67,T_32_17,T_128_69,T_64_35,T_128_71,T_16_9,T_128_73,T_64_37,T_128_75,T_32_19,T_128_77,T_64_39,T_128_79,T_8_5,T_128_81,T_64_41,T_128_83,T_32_21,
T_128_85,T_64_43,T_128_87,T_16_11,T_128_89,T_64_45,T_128_91,T_32_23,T_128_93,T_64_47,T_128_95,T_4_3,T_128_97,T_64_49,T_128_99,T_32_25,T_128_101,T_64_51,T_128_103,T_16_13,T_128_105,T_64_53,T_128_107,T_32_27,T_128_109,T_64_55,T_128_111,T_8_7,T_128_113,T_64_57,T_128_115,T_32_29,T_128_117,T_64_59,T_128_119,T_16_15,T_128_121,T_64_61,T_128_123,T_32_31,T_128_125,T_64_63,T_128_127 +}; +static const __device__ double2 lut_dp_4_256[64*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_2_0,T_256_3,T_128_3,T_256_9,T_64_3,T_256_15,T_128_9,T_256_21,T_32_3,T_256_27,T_128_15,T_256_33,T_64_9,T_256_39,T_128_21,T_256_45,T_16_3,T_256_51,T_128_27,T_256_57,T_64_15,T_256_63,T_128_33,T_256_69,T_32_9,T_256_75,T_128_39,T_256_81,T_64_21,T_256_87,T_128_45,T_256_93,T_8_3,T_256_99,T_128_51,T_256_105,T_64_27,T_256_111,T_128_57,T_256_117,T_32_15,T_256_123,T_128_63,T_256_129,T_64_33,T_256_135,T_128_69,T_256_141,T_16_9,T_256_147,T_128_75,T_256_153,T_64_39,T_256_159,T_128_81,T_256_165,T_32_21,T_256_171,T_128_87,T_256_177,T_64_45,T_256_183,T_128_93,T_256_189 +}; +static const __device__ double2 lut_dp_8_256[32*2] = { + 
T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_2_0,T_256_5,T_128_5,T_256_15,T_64_5,T_256_25,T_128_15,T_256_35,T_32_5,T_256_45,T_128_25,T_256_55,T_64_15,T_256_65,T_128_35,T_256_75,T_16_5,T_256_85,T_128_45,T_256_95,T_64_25,T_256_105,T_128_55,T_256_115,T_32_15,T_256_125,T_128_65,T_256_135,T_64_35,T_256_145,T_128_75,T_256_155 +}; +static const __device__ double2 lut_dp_16_256[16*2] = { + T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_2_0,T_256_9,T_128_9,T_256_27,T_64_9,T_256_45,T_128_27,T_256_63,T_32_9,T_256_81,T_128_45,T_256_99,T_64_27,T_256_117,T_128_63,T_256_135 +}; +static const __device__ double2 lut_dp_7_343[49*2] = { + T_2_0,T_343_1,T_343_2,T_343_3,T_343_4,T_343_5,T_343_6,T_49_1,T_343_8,T_343_9,T_343_10,T_343_11,T_343_12,T_343_13,T_49_2,T_343_15,T_343_16,T_343_17,T_343_18,T_343_19,T_343_20,T_49_3,T_343_22,T_343_23,T_343_24,T_343_25,T_343_26,T_343_27,T_49_4,T_343_29,T_343_30,T_343_31,T_343_32,T_343_33,T_343_34,T_49_5,T_343_36,T_343_37,T_343_38,T_343_39,T_343_40,T_343_41,T_49_6,T_343_43,T_343_44,T_343_45,T_343_46,T_343_47,T_343_48,T_2_0,T_343_4,T_343_8,T_343_12,T_343_16,T_343_20,T_343_24,T_49_4,T_343_32,T_343_36,T_343_40,T_343_44,T_343_48,T_343_52,T_49_8,T_343_60,T_343_64,T_343_68,T_343_72,T_343_76,T_343_80,T_49_12,T_343_88,T_343_92,T_343_96,T_343_100,T_343_104,T_343_108,T_49_16,T_343_116,T_343_120,T_343_124,T_343_128,T_343_132,T_343_136,T_49_20,T_343_144,T_343_148,T_343_152,T_343_156,T_343_160,T_343_164,T_49_24,T_343_172,T_343_176,T_343_180,T_343_184,T_343_188,T_343_192 +}; +static const __device__ double2 lut_dp_2_512[256*2] = { + 
T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_105,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_4_1,T_512_129,T_256_65,T_512_131,T_128_33,T_512_133,T_256_67,T_512_135,T_64_17,T_512_137,T_256_69,T_512_139,T_128_35,T_512_141,T_256_71,T_512_143,T_32_9,T_512_145,T_256_73,T_512_147,T_128_37,T_512_149,T_256_75,T_512_151,T_64_19,T_512_153,T_256_77,T_512_155,T_128_39,T_512_157,T_256_79,T_512_159,T_16_5,T_512_161,T_256_81,T_512_163,T_128_41,T_512_165,T_256_83,T_512_167,T_64_21,T_512_169,T_256_85,T_512_171,T_128_43,T_512_173,T_256_87,T_512_175,T_32_11,T_512_177,T_256_89,T_512_179,T_128_45,T_512_181,T_256_91,T_512_183,T_64_23,T_512_185,T_256_93,T_512_187,T_128_47,T_512_189,T_256_95,T_512_191,T_8_3,T_512_193,T_256_97,T_512_195,T_128_49,T_512_197,T_256_99,T_512_199,T_64_25,T_512_201,T_256_101,T_512_203,T_128_51,T_512_205,T_256_103,T_512_207,T_32_13,T_512_209,T_256_105,T_512_211,T_128_53,T_512_213,T_256_107,T_512_215,T_64_27,T_512_217,T_256_109,T_512_219,T_128_55,T_512_221,
T_256_111,T_512_223,T_16_7,T_512_225,T_256_113,T_512_227,T_128_57,T_512_229,T_256_115,T_512_231,T_64_29,T_512_233,T_256_117,T_512_235,T_128_59,T_512_237,T_256_119,T_512_239,T_32_15,T_512_241,T_256_121,T_512_243,T_128_61,T_512_245,T_256_123,T_512_247,T_64_31,T_512_249,T_256_125,T_512_251,T_128_63,T_512_253,T_256_127,T_512_255,T_2_0,T_256_1,T_128_1,T_256_3,T_64_1,T_256_5,T_128_3,T_256_7,T_32_1,T_256_9,T_128_5,T_256_11,T_64_3,T_256_13,T_128_7,T_256_15,T_16_1,T_256_17,T_128_9,T_256_19,T_64_5,T_256_21,T_128_11,T_256_23,T_32_3,T_256_25,T_128_13,T_256_27,T_64_7,T_256_29,T_128_15,T_256_31,T_8_1,T_256_33,T_128_17,T_256_35,T_64_9,T_256_37,T_128_19,T_256_39,T_32_5,T_256_41,T_128_21,T_256_43,T_64_11,T_256_45,T_128_23,T_256_47,T_16_3,T_256_49,T_128_25,T_256_51,T_64_13,T_256_53,T_128_27,T_256_55,T_32_7,T_256_57,T_128_29,T_256_59,T_64_15,T_256_61,T_128_31,T_256_63,T_4_1,T_256_65,T_128_33,T_256_67,T_64_17,T_256_69,T_128_35,T_256_71,T_32_9,T_256_73,T_128_37,T_256_75,T_64_19,T_256_77,T_128_39,T_256_79,T_16_5,T_256_81,T_128_41,T_256_83,T_64_21,T_256_85,T_128_43,T_256_87,T_32_11,T_256_89,T_128_45,T_256_91,T_64_23,T_256_93,T_128_47,T_256_95,T_8_3,T_256_97,T_128_49,T_256_99,T_64_25,T_256_101,T_128_51,T_256_103,T_32_13,T_256_105,T_128_53,T_256_107,T_64_27,T_256_109,T_128_55,T_256_111,T_16_7,T_256_113,T_128_57,T_256_115,T_64_29,T_256_117,T_128_59,T_256_119,T_32_15,T_256_121,T_128_61,T_256_123,T_64_31,T_256_125,T_128_63,T_256_127,T_2_1,T_256_129,T_128_65,T_256_131,T_64_33,T_256_133,T_128_67,T_256_135,T_32_17,T_256_137,T_128_69,T_256_139,T_64_35,T_256_141,T_128_71,T_256_143,T_16_9,T_256_145,T_128_73,T_256_147,T_64_37,T_256_149,T_128_75,T_256_151,T_32_19,T_256_153,T_128_77,T_256_155,T_64_39,T_256_157,T_128_79,T_256_159,T_8_5,T_256_161,T_128_81,T_256_163,T_64_41,T_256_165,T_128_83,T_256_167,T_32_21,T_256_169,T_128_85,T_256_171,T_64_43,T_256_173,T_128_87,T_256_175,T_16_11,T_256_177,T_128_89,T_256_179,T_64_45,T_256_181,T_128_91,T_256_183,T_32_23,T_256_185,T_128_93,T_256_187,T_64_47,T_256_189,T_1
28_95,T_256_191,T_4_3,T_256_193,T_128_97,T_256_195,T_64_49,T_256_197,T_128_99,T_256_199,T_32_25,T_256_201,T_128_101,T_256_203,T_64_51,T_256_205,T_128_103,T_256_207,T_16_13,T_256_209,T_128_105,T_256_211,T_64_53,T_256_213,T_128_107,T_256_215,T_32_27,T_256_217,T_128_109,T_256_219,T_64_55,T_256_221,T_128_111,T_256_223,T_8_7,T_256_225,T_128_113,T_256_227,T_64_57,T_256_229,T_128_115,T_256_231,T_32_29,T_256_233,T_128_117,T_256_235,T_64_59,T_256_237,T_128_119,T_256_239,T_16_15,T_256_241,T_128_121,T_256_243,T_64_61,T_256_245,T_128_123,T_256_247,T_32_31,T_256_249,T_128_125,T_256_251,T_64_63,T_256_253,T_128_127,T_256_255 +}; +static const __device__ double2 lut_dp_4_512[128*2] = { + T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_105,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_2_0,T_512_3,T_256_3,T_512_9,T_128_3,T_512_15,T_256_9,T_512_21,T_64_3,T_512_27,T_256_15,T_512_33,T_128_9,T_512_39,T_256_21,T_512_45,T_32_3,T_512_51,T_256_27,T_512_57,T_128_15,T_512_63,T_256_33,T_512
_69,T_64_9,T_512_75,T_256_39,T_512_81,T_128_21,T_512_87,T_256_45,T_512_93,T_16_3,T_512_99,T_256_51,T_512_105,T_128_27,T_512_111,T_256_57,T_512_117,T_64_15,T_512_123,T_256_63,T_512_129,T_128_33,T_512_135,T_256_69,T_512_141,T_32_9,T_512_147,T_256_75,T_512_153,T_128_39,T_512_159,T_256_81,T_512_165,T_64_21,T_512_171,T_256_87,T_512_177,T_128_45,T_512_183,T_256_93,T_512_189,T_8_3,T_512_195,T_256_99,T_512_201,T_128_51,T_512_207,T_256_105,T_512_213,T_64_27,T_512_219,T_256_111,T_512_225,T_128_57,T_512_231,T_256_117,T_512_237,T_32_15,T_512_243,T_256_123,T_512_249,T_128_63,T_512_255,T_256_129,T_512_261,T_64_33,T_512_267,T_256_135,T_512_273,T_128_69,T_512_279,T_256_141,T_512_285,T_16_9,T_512_291,T_256_147,T_512_297,T_128_75,T_512_303,T_256_153,T_512_309,T_64_39,T_512_315,T_256_159,T_512_321,T_128_81,T_512_327,T_256_165,T_512_333,T_32_21,T_512_339,T_256_171,T_512_345,T_128_87,T_512_351,T_256_177,T_512_357,T_64_45,T_512_363,T_256_183,T_512_369,T_128_93,T_512_375,T_256_189,T_512_381 +}; +static const __device__ double2 lut_dp_8_512[64*2] = { + 
T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_2_0,T_512_5,T_256_5,T_512_15,T_128_5,T_512_25,T_256_15,T_512_35,T_64_5,T_512_45,T_256_25,T_512_55,T_128_15,T_512_65,T_256_35,T_512_75,T_32_5,T_512_85,T_256_45,T_512_95,T_128_25,T_512_105,T_256_55,T_512_115,T_64_15,T_512_125,T_256_65,T_512_135,T_128_35,T_512_145,T_256_75,T_512_155,T_16_5,T_512_165,T_256_85,T_512_175,T_128_45,T_512_185,T_256_95,T_512_195,T_64_25,T_512_205,T_256_105,T_512_215,T_128_55,T_512_225,T_256_115,T_512_235,T_32_15,T_512_245,T_256_125,T_512_255,T_128_65,T_512_265,T_256_135,T_512_275,T_64_35,T_512_285,T_256_145,T_512_295,T_128_75,T_512_305,T_256_155,T_512_315 +}; +static const __device__ double2 lut_dp_16_512[32*2] = { + T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_2_0,T_512_9,T_256_9,T_512_27,T_128_9,T_512_45,T_256_27,T_512_63,T_64_9,T_512_81,T_256_45,T_512_99,T_128_27,T_512_117,T_256_63,T_512_135,T_32_9,T_512_153,T_256_81,T_512_171,T_128_45,T_512_189,T_256_99,T_512_207,T_64_27,T_512_225,T_256_117,T_512_243,T_128_63,T_512_261,T_256_135,T_512_279 +}; +static const __device__ double2 lut_dp_5_625[125*2] = { + 
T_2_0,T_625_1,T_625_2,T_625_3,T_625_4,T_125_1,T_625_6,T_625_7,T_625_8,T_625_9,T_125_2,T_625_11,T_625_12,T_625_13,T_625_14,T_125_3,T_625_16,T_625_17,T_625_18,T_625_19,T_125_4,T_625_21,T_625_22,T_625_23,T_625_24,T_25_1,T_625_26,T_625_27,T_625_28,T_625_29,T_125_6,T_625_31,T_625_32,T_625_33,T_625_34,T_125_7,T_625_36,T_625_37,T_625_38,T_625_39,T_125_8,T_625_41,T_625_42,T_625_43,T_625_44,T_125_9,T_625_46,T_625_47,T_625_48,T_625_49,T_25_2,T_625_51,T_625_52,T_625_53,T_625_54,T_125_11,T_625_56,T_625_57,T_625_58,T_625_59,T_125_12,T_625_61,T_625_62,T_625_63,T_625_64,T_125_13,T_625_66,T_625_67,T_625_68,T_625_69,T_125_14,T_625_71,T_625_72,T_625_73,T_625_74,T_25_3,T_625_76,T_625_77,T_625_78,T_625_79,T_125_16,T_625_81,T_625_82,T_625_83,T_625_84,T_125_17,T_625_86,T_625_87,T_625_88,T_625_89,T_125_18,T_625_91,T_625_92,T_625_93,T_625_94,T_125_19,T_625_96,T_625_97,T_625_98,T_625_99,T_25_4,T_625_101,T_625_102,T_625_103,T_625_104,T_125_21,T_625_106,T_625_107,T_625_108,T_625_109,T_125_22,T_625_111,T_625_112,T_625_113,T_625_114,T_125_23,T_625_116,T_625_117,T_625_118,T_625_119,T_125_24,T_625_121,T_625_122,T_625_123,T_625_124,T_2_0,T_625_3,T_625_6,T_625_9,T_625_12,T_125_3,T_625_18,T_625_21,T_625_24,T_625_27,T_125_6,T_625_33,T_625_36,T_625_39,T_625_42,T_125_9,T_625_48,T_625_51,T_625_54,T_625_57,T_125_12,T_625_63,T_625_66,T_625_69,T_625_72,T_25_3,T_625_78,T_625_81,T_625_84,T_625_87,T_125_18,T_625_93,T_625_96,T_625_99,T_625_102,T_125_21,T_625_108,T_625_111,T_625_114,T_625_117,T_125_24,T_625_123,T_625_126,T_625_129,T_625_132,T_125_27,T_625_138,T_625_141,T_625_144,T_625_147,T_25_6,T_625_153,T_625_156,T_625_159,T_625_162,T_125_33,T_625_168,T_625_171,T_625_174,T_625_177,T_125_36,T_625_183,T_625_186,T_625_189,T_625_192,T_125_39,T_625_198,T_625_201,T_625_204,T_625_207,T_125_42,T_625_213,T_625_216,T_625_219,T_625_222,T_25_9,T_625_228,T_625_231,T_625_234,T_625_237,T_125_48,T_625_243,T_625_246,T_625_249,T_625_252,T_125_51,T_625_258,T_625_261,T_625_264,T_625_267,T_125_54,T_625_273,T_625_276,T_625_279,T_6
25_282,T_125_57,T_625_288,T_625_291,T_625_294,T_625_297,T_25_12,T_625_303,T_625_306,T_625_309,T_625_312,T_125_63,T_625_318,T_625_321,T_625_324,T_625_327,T_125_66,T_625_333,T_625_336,T_625_339,T_625_342,T_125_69,T_625_348,T_625_351,T_625_354,T_625_357,T_125_72,T_625_363,T_625_366,T_625_369,T_625_372 +}; +static const __device__ double2 lut_dp_25_625[25*2] = { + T_2_0,T_625_1,T_625_2,T_625_3,T_625_4,T_125_1,T_625_6,T_625_7,T_625_8,T_625_9,T_125_2,T_625_11,T_625_12,T_625_13,T_625_14,T_125_3,T_625_16,T_625_17,T_625_18,T_625_19,T_125_4,T_625_21,T_625_22,T_625_23,T_625_24,T_2_0,T_625_13,T_625_26,T_625_39,T_625_52,T_125_13,T_625_78,T_625_91,T_625_104,T_625_117,T_125_26,T_625_143,T_625_156,T_625_169,T_625_182,T_125_39,T_625_208,T_625_221,T_625_234,T_625_247,T_125_52,T_625_273,T_625_286,T_625_299,T_625_312 +}; +static const __device__ double2 lut_dp_3_729[243*2] = { + T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_27_1,T_729_28,T_729_29,T_243_10,T_729_31,T_729_32,T_243_11,T_729_34,T_729_35,T_81_4,T_729_37,T_729_38,T_243_13,T_729_40,T_729_41,T_243_14,T_729_43,T_729_44,T_81_5,T_729_46,T_729_47,T_243_16,T_729_49,T_729_50,T_243_17,T_729_52,T_729_53,T_27_2,T_729_55,T_729_56,T_243_19,T_729_58,T_729_59,T_243_20,T_729_61,T_729_62,T_81_7,T_729_64,T_729_65,T_243_22,T_729_67,T_729_68,T_243_23,T_729_70,T_729_71,T_81_8,T_729_73,T_729_74,T_243_25,T_729_76,T_729_77,T_243_26,T_729_79,T_729_80,T_9_1,T_729_82,T_729_83,T_243_28,T_729_85,T_729_86,T_243_29,T_729_88,T_729_89,T_81_10,T_729_91,T_729_92,T_243_31,T_729_94,T_729_95,T_243_32,T_729_97,T_729_98,T_81_11,T_729_100,T_729_101,T_243_34,T_729_103,T_729_104,T_243_35,T_729_106,T_729_107,T_27_4,T_729_109,T_729_110,T_243_37,T_729_112,T_729_113,T_243_38,T_729_115,T_729_116,T_81_13,T_729_118,T_729_119,T_243_40,T_729_121,T_729_122,T_243_41,T_729_124,T_729_125,T_81_14,T_729_1
27,T_729_128,T_243_43,T_729_130,T_729_131,T_243_44,T_729_133,T_729_134,T_27_5,T_729_136,T_729_137,T_243_46,T_729_139,T_729_140,T_243_47,T_729_142,T_729_143,T_81_16,T_729_145,T_729_146,T_243_49,T_729_148,T_729_149,T_243_50,T_729_151,T_729_152,T_81_17,T_729_154,T_729_155,T_243_52,T_729_157,T_729_158,T_243_53,T_729_160,T_729_161,T_9_2,T_729_163,T_729_164,T_243_55,T_729_166,T_729_167,T_243_56,T_729_169,T_729_170,T_81_19,T_729_172,T_729_173,T_243_58,T_729_175,T_729_176,T_243_59,T_729_178,T_729_179,T_81_20,T_729_181,T_729_182,T_243_61,T_729_184,T_729_185,T_243_62,T_729_187,T_729_188,T_27_7,T_729_190,T_729_191,T_243_64,T_729_193,T_729_194,T_243_65,T_729_196,T_729_197,T_81_22,T_729_199,T_729_200,T_243_67,T_729_202,T_729_203,T_243_68,T_729_205,T_729_206,T_81_23,T_729_208,T_729_209,T_243_70,T_729_211,T_729_212,T_243_71,T_729_214,T_729_215,T_27_8,T_729_217,T_729_218,T_243_73,T_729_220,T_729_221,T_243_74,T_729_223,T_729_224,T_81_25,T_729_226,T_729_227,T_243_76,T_729_229,T_729_230,T_243_77,T_729_232,T_729_233,T_81_26,T_729_235,T_729_236,T_243_79,T_729_238,T_729_239,T_243_80,T_729_241,T_729_242,T_2_0,T_729_2,T_729_4,T_243_2,T_729_8,T_729_10,T_243_4,T_729_14,T_729_16,T_81_2,T_729_20,T_729_22,T_243_8,T_729_26,T_729_28,T_243_10,T_729_32,T_729_34,T_81_4,T_729_38,T_729_40,T_243_14,T_729_44,T_729_46,T_243_16,T_729_50,T_729_52,T_27_2,T_729_56,T_729_58,T_243_20,T_729_62,T_729_64,T_243_22,T_729_68,T_729_70,T_81_8,T_729_74,T_729_76,T_243_26,T_729_80,T_729_82,T_243_28,T_729_86,T_729_88,T_81_10,T_729_92,T_729_94,T_243_32,T_729_98,T_729_100,T_243_34,T_729_104,T_729_106,T_27_4,T_729_110,T_729_112,T_243_38,T_729_116,T_729_118,T_243_40,T_729_122,T_729_124,T_81_14,T_729_128,T_729_130,T_243_44,T_729_134,T_729_136,T_243_46,T_729_140,T_729_142,T_81_16,T_729_146,T_729_148,T_243_50,T_729_152,T_729_154,T_243_52,T_729_158,T_729_160,T_9_2,T_729_164,T_729_166,T_243_56,T_729_170,T_729_172,T_243_58,T_729_176,T_729_178,T_81_20,T_729_182,T_729_184,T_243_62,T_729_188,T_729_190,T_243_64,T_729_194,T_729_196,T_81
_22,T_729_200,T_729_202,T_243_68,T_729_206,T_729_208,T_243_70,T_729_212,T_729_214,T_27_8,T_729_218,T_729_220,T_243_74,T_729_224,T_729_226,T_243_76,T_729_230,T_729_232,T_81_26,T_729_236,T_729_238,T_243_80,T_729_242,T_729_244,T_243_82,T_729_248,T_729_250,T_81_28,T_729_254,T_729_256,T_243_86,T_729_260,T_729_262,T_243_88,T_729_266,T_729_268,T_27_10,T_729_272,T_729_274,T_243_92,T_729_278,T_729_280,T_243_94,T_729_284,T_729_286,T_81_32,T_729_290,T_729_292,T_243_98,T_729_296,T_729_298,T_243_100,T_729_302,T_729_304,T_81_34,T_729_308,T_729_310,T_243_104,T_729_314,T_729_316,T_243_106,T_729_320,T_729_322,T_9_4,T_729_326,T_729_328,T_243_110,T_729_332,T_729_334,T_243_112,T_729_338,T_729_340,T_81_38,T_729_344,T_729_346,T_243_116,T_729_350,T_729_352,T_243_118,T_729_356,T_729_358,T_81_40,T_729_362,T_729_364,T_243_122,T_729_368,T_729_370,T_243_124,T_729_374,T_729_376,T_27_14,T_729_380,T_729_382,T_243_128,T_729_386,T_729_388,T_243_130,T_729_392,T_729_394,T_81_44,T_729_398,T_729_400,T_243_134,T_729_404,T_729_406,T_243_136,T_729_410,T_729_412,T_81_46,T_729_416,T_729_418,T_243_140,T_729_422,T_729_424,T_243_142,T_729_428,T_729_430,T_27_16,T_729_434,T_729_436,T_243_146,T_729_440,T_729_442,T_243_148,T_729_446,T_729_448,T_81_50,T_729_452,T_729_454,T_243_152,T_729_458,T_729_460,T_243_154,T_729_464,T_729_466,T_81_52,T_729_470,T_729_472,T_243_158,T_729_476,T_729_478,T_243_160,T_729_482,T_729_484 +}; +static const __device__ double2 lut_dp_9_729[81*2] = { + 
T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_27_1,T_729_28,T_729_29,T_243_10,T_729_31,T_729_32,T_243_11,T_729_34,T_729_35,T_81_4,T_729_37,T_729_38,T_243_13,T_729_40,T_729_41,T_243_14,T_729_43,T_729_44,T_81_5,T_729_46,T_729_47,T_243_16,T_729_49,T_729_50,T_243_17,T_729_52,T_729_53,T_27_2,T_729_55,T_729_56,T_243_19,T_729_58,T_729_59,T_243_20,T_729_61,T_729_62,T_81_7,T_729_64,T_729_65,T_243_22,T_729_67,T_729_68,T_243_23,T_729_70,T_729_71,T_81_8,T_729_73,T_729_74,T_243_25,T_729_76,T_729_77,T_243_26,T_729_79,T_729_80,T_2_0,T_729_5,T_729_10,T_243_5,T_729_20,T_729_25,T_243_10,T_729_35,T_729_40,T_81_5,T_729_50,T_729_55,T_243_20,T_729_65,T_729_70,T_243_25,T_729_80,T_729_85,T_81_10,T_729_95,T_729_100,T_243_35,T_729_110,T_729_115,T_243_40,T_729_125,T_729_130,T_27_5,T_729_140,T_729_145,T_243_50,T_729_155,T_729_160,T_243_55,T_729_170,T_729_175,T_81_20,T_729_185,T_729_190,T_243_65,T_729_200,T_729_205,T_243_70,T_729_215,T_729_220,T_81_25,T_729_230,T_729_235,T_243_80,T_729_245,T_729_250,T_243_85,T_729_260,T_729_265,T_27_10,T_729_275,T_729_280,T_243_95,T_729_290,T_729_295,T_243_100,T_729_305,T_729_310,T_81_35,T_729_320,T_729_325,T_243_110,T_729_335,T_729_340,T_243_115,T_729_350,T_729_355,T_81_40,T_729_365,T_729_370,T_243_125,T_729_380,T_729_385,T_243_130,T_729_395,T_729_400 +}; +static const __device__ double2 lut_dp_27_729[27*2] = { + T_2_0,T_729_1,T_729_2,T_243_1,T_729_4,T_729_5,T_243_2,T_729_7,T_729_8,T_81_1,T_729_10,T_729_11,T_243_4,T_729_13,T_729_14,T_243_5,T_729_16,T_729_17,T_81_2,T_729_19,T_729_20,T_243_7,T_729_22,T_729_23,T_243_8,T_729_25,T_729_26,T_2_0,T_729_14,T_729_28,T_243_14,T_729_56,T_729_70,T_243_28,T_729_98,T_729_112,T_81_14,T_729_140,T_729_154,T_243_56,T_729_182,T_729_196,T_243_70,T_729_224,T_729_238,T_81_28,T_729_266,T_729_280,T_243_98,T_729_308,T_729_322,T_243_112,T_729_350,T_729_364 
+}; +static const __device__ double2 lut_dp_10_1000[100*2] = { + T_2_0,T_1000_1,T_500_1,T_1000_3,T_250_1,T_200_1,T_500_3,T_1000_7,T_125_1,T_1000_9,T_100_1,T_1000_11,T_250_3,T_1000_13,T_500_7,T_200_3,T_125_2,T_1000_17,T_500_9,T_1000_19,T_50_1,T_1000_21,T_500_11,T_1000_23,T_125_3,T_40_1,T_500_13,T_1000_27,T_250_7,T_1000_29,T_100_3,T_1000_31,T_125_4,T_1000_33,T_500_17,T_200_7,T_250_9,T_1000_37,T_500_19,T_1000_39,T_25_1,T_1000_41,T_500_21,T_1000_43,T_250_11,T_200_9,T_500_23,T_1000_47,T_125_6,T_1000_49,T_20_1,T_1000_51,T_250_13,T_1000_53,T_500_27,T_200_11,T_125_7,T_1000_57,T_500_29,T_1000_59,T_50_3,T_1000_61,T_500_31,T_1000_63,T_125_8,T_200_13,T_500_33,T_1000_67,T_250_17,T_1000_69,T_100_7,T_1000_71,T_125_9,T_1000_73,T_500_37,T_40_3,T_250_19,T_1000_77,T_500_39,T_1000_79,T_25_2,T_1000_81,T_500_41,T_1000_83,T_250_21,T_200_17,T_500_43,T_1000_87,T_125_11,T_1000_89,T_100_9,T_1000_91,T_250_23,T_1000_93,T_500_47,T_200_19,T_125_12,T_1000_97,T_500_49,T_1000_99,T_2_0,T_500_3,T_250_3,T_500_9,T_125_3,T_100_3,T_250_9,T_500_21,T_125_6,T_500_27,T_50_3,T_500_33,T_125_9,T_500_39,T_250_21,T_100_9,T_125_12,T_500_51,T_250_27,T_500_57,T_25_3,T_500_63,T_250_33,T_500_69,T_125_18,T_20_3,T_250_39,T_500_81,T_125_21,T_500_87,T_50_9,T_500_93,T_125_24,T_500_99,T_250_51,T_100_21,T_125_27,T_500_111,T_250_57,T_500_117,T_25_6,T_500_123,T_250_63,T_500_129,T_125_33,T_100_27,T_250_69,T_500_141,T_125_36,T_500_147,T_10_3,T_500_153,T_125_39,T_500_159,T_250_81,T_100_33,T_125_42,T_500_171,T_250_87,T_500_177,T_25_9,T_500_183,T_250_93,T_500_189,T_125_48,T_100_39,T_250_99,T_500_201,T_125_51,T_500_207,T_50_21,T_500_213,T_125_54,T_500_219,T_250_111,T_20_9,T_125_57,T_500_231,T_250_117,T_500_237,T_25_12,T_500_243,T_250_123,T_500_249,T_125_63,T_100_51,T_250_129,T_500_261,T_125_66,T_500_267,T_50_27,T_500_273,T_125_69,T_500_279,T_250_141,T_100_57,T_125_72,T_500_291,T_250_147,T_500_297 +}; +static const __device__ double2 lut_dp_2_1024[512*2] = { + 
T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_1024_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_1024_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_
209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_4_1,T_1024_257,T_512_129,T_1024_259,T_256_65,T_1024_261,T_512_131,T_1024_263,T_128_33,T_1024_265,T_512_133,T_1024_267,T_256_67,T_1024_269,T_512_135,T_1024_271,T_64_17,T_1024_273,T_512_137,T_1024_275,T_256_69,T_1024_277,T_512_139,T_1024_279,T_128_35,T_1024_281,T_512_141,T_1024_283,T_256_71,T_1024_285,T_512_143,T_1024_287,T_32_9,T_1024_289,T_512_145,T_1024_291,T_256_73,T_1024_293,T_512_147,T_1024_295,T_128_37,T_1024_297,T_512_149,T_1024_299,T_256_75,T_1024_301,T_512_151,T_1024_303,T_64_19,T_1024_305,T_512_153,T_1024_307,T_256_77,T_1024_309,T_512_155,T_1024_311,T_128_39,T_1024_313,T_512_157,T_1024_315,T_256_79,T_1024_317,T_512_159,T_1024_319,T_16_5,T_1024_321,T_512_161,T_1024_323,T_256_81,T_1024_325,T_512_163,T_1024_327,T_128_41,T_1024_329,T_512_165,T_1024_331,T_256_83,T_1024_333,T_512_167,T_1024_335,T_64_21,T_1024_337,T_512_169,T_1024_339,T_256_85,T_1024_341,T_512_171,T_1024_343,T_128_43,T_1024_345,T_512_173,T_1024_347,T_256_87,T_1024_349,T_512_175,T_1024_351,T_32_11,T_1024_353,T_512_177,T_1024_355,T_256_89,T_1024_357,T_512_179,T_1024_359,T_128_45,T_1024_361,T_512_181,T_1024_363,T_256_91,T_1024_365,T_512_183,T_1024_367,T_64_23,T_1024_369,T_512_185,T_1024_371,T_256_93,T_1024_373,T_512_187,T_1024_375,T_128_47,T_1024_377,T_512_189,T_1024_379,T_256_95,T_1024_381,T_512_191,T_1024_383,T_8_3,T_1024_385,T_512_193,T_1024_387,T_256_97,T_1024_389,T_512_195,T_1024_391,T_128_49,T_1024_393,T_512_197,T_1024_395,T_256_99,T_1024_397,T_512_199,T_1024_399,T_64_25,T_1024_401,T_512_201,T_1024_403,T_256_101,T_1024_405,T_512
_203,T_1024_407,T_128_51,T_1024_409,T_512_205,T_1024_411,T_256_103,T_1024_413,T_512_207,T_1024_415,T_32_13,T_1024_417,T_512_209,T_1024_419,T_256_105,T_1024_421,T_512_211,T_1024_423,T_128_53,T_1024_425,T_512_213,T_1024_427,T_256_107,T_1024_429,T_512_215,T_1024_431,T_64_27,T_1024_433,T_512_217,T_1024_435,T_256_109,T_1024_437,T_512_219,T_1024_439,T_128_55,T_1024_441,T_512_221,T_1024_443,T_256_111,T_1024_445,T_512_223,T_1024_447,T_16_7,T_1024_449,T_512_225,T_1024_451,T_256_113,T_1024_453,T_512_227,T_1024_455,T_128_57,T_1024_457,T_512_229,T_1024_459,T_256_115,T_1024_461,T_512_231,T_1024_463,T_64_29,T_1024_465,T_512_233,T_1024_467,T_256_117,T_1024_469,T_512_235,T_1024_471,T_128_59,T_1024_473,T_512_237,T_1024_475,T_256_119,T_1024_477,T_512_239,T_1024_479,T_32_15,T_1024_481,T_512_241,T_1024_483,T_256_121,T_1024_485,T_512_243,T_1024_487,T_128_61,T_1024_489,T_512_245,T_1024_491,T_256_123,T_1024_493,T_512_247,T_1024_495,T_64_31,T_1024_497,T_512_249,T_1024_499,T_256_125,T_1024_501,T_512_251,T_1024_503,T_128_63,T_1024_505,T_512_253,T_1024_507,T_256_127,T_1024_509,T_512_255,T_1024_511,T_2_0,T_512_1,T_256_1,T_512_3,T_128_1,T_512_5,T_256_3,T_512_7,T_64_1,T_512_9,T_256_5,T_512_11,T_128_3,T_512_13,T_256_7,T_512_15,T_32_1,T_512_17,T_256_9,T_512_19,T_128_5,T_512_21,T_256_11,T_512_23,T_64_3,T_512_25,T_256_13,T_512_27,T_128_7,T_512_29,T_256_15,T_512_31,T_16_1,T_512_33,T_256_17,T_512_35,T_128_9,T_512_37,T_256_19,T_512_39,T_64_5,T_512_41,T_256_21,T_512_43,T_128_11,T_512_45,T_256_23,T_512_47,T_32_3,T_512_49,T_256_25,T_512_51,T_128_13,T_512_53,T_256_27,T_512_55,T_64_7,T_512_57,T_256_29,T_512_59,T_128_15,T_512_61,T_256_31,T_512_63,T_8_1,T_512_65,T_256_33,T_512_67,T_128_17,T_512_69,T_256_35,T_512_71,T_64_9,T_512_73,T_256_37,T_512_75,T_128_19,T_512_77,T_256_39,T_512_79,T_32_5,T_512_81,T_256_41,T_512_83,T_128_21,T_512_85,T_256_43,T_512_87,T_64_11,T_512_89,T_256_45,T_512_91,T_128_23,T_512_93,T_256_47,T_512_95,T_16_3,T_512_97,T_256_49,T_512_99,T_128_25,T_512_101,T_256_51,T_512_103,T_64_13,T_512_10
5,T_256_53,T_512_107,T_128_27,T_512_109,T_256_55,T_512_111,T_32_7,T_512_113,T_256_57,T_512_115,T_128_29,T_512_117,T_256_59,T_512_119,T_64_15,T_512_121,T_256_61,T_512_123,T_128_31,T_512_125,T_256_63,T_512_127,T_4_1,T_512_129,T_256_65,T_512_131,T_128_33,T_512_133,T_256_67,T_512_135,T_64_17,T_512_137,T_256_69,T_512_139,T_128_35,T_512_141,T_256_71,T_512_143,T_32_9,T_512_145,T_256_73,T_512_147,T_128_37,T_512_149,T_256_75,T_512_151,T_64_19,T_512_153,T_256_77,T_512_155,T_128_39,T_512_157,T_256_79,T_512_159,T_16_5,T_512_161,T_256_81,T_512_163,T_128_41,T_512_165,T_256_83,T_512_167,T_64_21,T_512_169,T_256_85,T_512_171,T_128_43,T_512_173,T_256_87,T_512_175,T_32_11,T_512_177,T_256_89,T_512_179,T_128_45,T_512_181,T_256_91,T_512_183,T_64_23,T_512_185,T_256_93,T_512_187,T_128_47,T_512_189,T_256_95,T_512_191,T_8_3,T_512_193,T_256_97,T_512_195,T_128_49,T_512_197,T_256_99,T_512_199,T_64_25,T_512_201,T_256_101,T_512_203,T_128_51,T_512_205,T_256_103,T_512_207,T_32_13,T_512_209,T_256_105,T_512_211,T_128_53,T_512_213,T_256_107,T_512_215,T_64_27,T_512_217,T_256_109,T_512_219,T_128_55,T_512_221,T_256_111,T_512_223,T_16_7,T_512_225,T_256_113,T_512_227,T_128_57,T_512_229,T_256_115,T_512_231,T_64_29,T_512_233,T_256_117,T_512_235,T_128_59,T_512_237,T_256_119,T_512_239,T_32_15,T_512_241,T_256_121,T_512_243,T_128_61,T_512_245,T_256_123,T_512_247,T_64_31,T_512_249,T_256_125,T_512_251,T_128_63,T_512_253,T_256_127,T_512_255,T_2_1,T_512_257,T_256_129,T_512_259,T_128_65,T_512_261,T_256_131,T_512_263,T_64_33,T_512_265,T_256_133,T_512_267,T_128_67,T_512_269,T_256_135,T_512_271,T_32_17,T_512_273,T_256_137,T_512_275,T_128_69,T_512_277,T_256_139,T_512_279,T_64_35,T_512_281,T_256_141,T_512_283,T_128_71,T_512_285,T_256_143,T_512_287,T_16_9,T_512_289,T_256_145,T_512_291,T_128_73,T_512_293,T_256_147,T_512_295,T_64_37,T_512_297,T_256_149,T_512_299,T_128_75,T_512_301,T_256_151,T_512_303,T_32_19,T_512_305,T_256_153,T_512_307,T_128_77,T_512_309,T_256_155,T_512_311,T_64_39,T_512_313,T_256_157,T_512_315,T_128_79,T_
512_317,T_256_159,T_512_319,T_8_5,T_512_321,T_256_161,T_512_323,T_128_81,T_512_325,T_256_163,T_512_327,T_64_41,T_512_329,T_256_165,T_512_331,T_128_83,T_512_333,T_256_167,T_512_335,T_32_21,T_512_337,T_256_169,T_512_339,T_128_85,T_512_341,T_256_171,T_512_343,T_64_43,T_512_345,T_256_173,T_512_347,T_128_87,T_512_349,T_256_175,T_512_351,T_16_11,T_512_353,T_256_177,T_512_355,T_128_89,T_512_357,T_256_179,T_512_359,T_64_45,T_512_361,T_256_181,T_512_363,T_128_91,T_512_365,T_256_183,T_512_367,T_32_23,T_512_369,T_256_185,T_512_371,T_128_93,T_512_373,T_256_187,T_512_375,T_64_47,T_512_377,T_256_189,T_512_379,T_128_95,T_512_381,T_256_191,T_512_383,T_4_3,T_512_385,T_256_193,T_512_387,T_128_97,T_512_389,T_256_195,T_512_391,T_64_49,T_512_393,T_256_197,T_512_395,T_128_99,T_512_397,T_256_199,T_512_399,T_32_25,T_512_401,T_256_201,T_512_403,T_128_101,T_512_405,T_256_203,T_512_407,T_64_51,T_512_409,T_256_205,T_512_411,T_128_103,T_512_413,T_256_207,T_512_415,T_16_13,T_512_417,T_256_209,T_512_419,T_128_105,T_512_421,T_256_211,T_512_423,T_64_53,T_512_425,T_256_213,T_512_427,T_128_107,T_512_429,T_256_215,T_512_431,T_32_27,T_512_433,T_256_217,T_512_435,T_128_109,T_512_437,T_256_219,T_512_439,T_64_55,T_512_441,T_256_221,T_512_443,T_128_111,T_512_445,T_256_223,T_512_447,T_8_7,T_512_449,T_256_225,T_512_451,T_128_113,T_512_453,T_256_227,T_512_455,T_64_57,T_512_457,T_256_229,T_512_459,T_128_115,T_512_461,T_256_231,T_512_463,T_32_29,T_512_465,T_256_233,T_512_467,T_128_117,T_512_469,T_256_235,T_512_471,T_64_59,T_512_473,T_256_237,T_512_475,T_128_119,T_512_477,T_256_239,T_512_479,T_16_15,T_512_481,T_256_241,T_512_483,T_128_121,T_512_485,T_256_243,T_512_487,T_64_61,T_512_489,T_256_245,T_512_491,T_128_123,T_512_493,T_256_247,T_512_495,T_32_31,T_512_497,T_256_249,T_512_499,T_128_125,T_512_501,T_256_251,T_512_503,T_64_63,T_512_505,T_256_253,T_512_507,T_128_127,T_512_509,T_256_255,T_512_511 +}; +static const __device__ double2 lut_dp_4_1024[256*2] = { + 
T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_1024_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_1024_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_
209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_2_0,T_1024_3,T_512_3,T_1024_9,T_256_3,T_1024_15,T_512_9,T_1024_21,T_128_3,T_1024_27,T_512_15,T_1024_33,T_256_9,T_1024_39,T_512_21,T_1024_45,T_64_3,T_1024_51,T_512_27,T_1024_57,T_256_15,T_1024_63,T_512_33,T_1024_69,T_128_9,T_1024_75,T_512_39,T_1024_81,T_256_21,T_1024_87,T_512_45,T_1024_93,T_32_3,T_1024_99,T_512_51,T_1024_105,T_256_27,T_1024_111,T_512_57,T_1024_117,T_128_15,T_1024_123,T_512_63,T_1024_129,T_256_33,T_1024_135,T_512_69,T_1024_141,T_64_9,T_1024_147,T_512_75,T_1024_153,T_256_39,T_1024_159,T_512_81,T_1024_165,T_128_21,T_1024_171,T_512_87,T_1024_177,T_256_45,T_1024_183,T_512_93,T_1024_189,T_16_3,T_1024_195,T_512_99,T_1024_201,T_256_51,T_1024_207,T_512_105,T_1024_213,T_128_27,T_1024_219,T_512_111,T_1024_225,T_256_57,T_1024_231,T_512_117,T_1024_237,T_64_15,T_1024_243,T_512_123,T_1024_249,T_256_63,T_1024_255,T_512_129,T_1024_261,T_128_33,T_1024_267,T_512_135,T_1024_273,T_256_69,T_1024_279,T_512_141,T_1024_285,T_32_9,T_1024_291,T_512_147,T_1024_297,T_256_75,T_1024_303,T_512_153,T_1024_309,T_128_39,T_1024_315,T_512_159,T_1024_321,T_256_81,T_1024_327,T_512_165,T_1024_333,T_64_21,T_1024_339,T_512_171,T_1024_345,T_256_87,T_1024_351,T_512_177,T_1024_357,T_128_45,T_1024_363,T_512_183,T_1024_369,T_256_93,T_1024_375,T_512_189,T_1024_381,T_8_3,T_1024_387,T_512_195,T_1024_393,T_256_99,T_1024_399,T_512_201,T_1024_405,T_128_51,T_1024_411,T_512_207,T_1024_417,T_256_105,T_1024_423,T_512_213,T_1024_429,T_64_27,T_1024_435,T_512_219,T_1024_441,T_256_111,T_1024_447,T_512_225,T_1024_453,T_128_57,T_1024_459,T_512_23
1,T_1024_465,T_256_117,T_1024_471,T_512_237,T_1024_477,T_32_15,T_1024_483,T_512_243,T_1024_489,T_256_123,T_1024_495,T_512_249,T_1024_501,T_128_63,T_1024_507,T_512_255,T_1024_513,T_256_129,T_1024_519,T_512_261,T_1024_525,T_64_33,T_1024_531,T_512_267,T_1024_537,T_256_135,T_1024_543,T_512_273,T_1024_549,T_128_69,T_1024_555,T_512_279,T_1024_561,T_256_141,T_1024_567,T_512_285,T_1024_573,T_16_9,T_1024_579,T_512_291,T_1024_585,T_256_147,T_1024_591,T_512_297,T_1024_597,T_128_75,T_1024_603,T_512_303,T_1024_609,T_256_153,T_1024_615,T_512_309,T_1024_621,T_64_39,T_1024_627,T_512_315,T_1024_633,T_256_159,T_1024_639,T_512_321,T_1024_645,T_128_81,T_1024_651,T_512_327,T_1024_657,T_256_165,T_1024_663,T_512_333,T_1024_669,T_32_21,T_1024_675,T_512_339,T_1024_681,T_256_171,T_1024_687,T_512_345,T_1024_693,T_128_87,T_1024_699,T_512_351,T_1024_705,T_256_177,T_1024_711,T_512_357,T_1024_717,T_64_45,T_1024_723,T_512_363,T_1024_729,T_256_183,T_1024_735,T_512_369,T_1024_741,T_128_93,T_1024_747,T_512_375,T_1024_753,T_256_189,T_1024_759,T_512_381,T_1024_765 +}; +static const __device__ double2 lut_dp_8_1024[128*2] = { + 
T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_2_0,T_1024_5,T_512_5,T_1024_15,T_256_5,T_1024_25,T_512_15,T_1024_35,T_128_5,T_1024_45,T_512_25,T_1024_55,T_256_15,T_1024_65,T_512_35,T_1024_75,T_64_5,T_1024_85,T_512_45,T_1024_95,T_256_25,T_1024_105,T_512_55,T_1024_115,T_128_15,T_1024_125,T_512_65,T_1024_135,T_256_35,T_1024_145,T_512_75,T_1024_155,T_32_5,T_1024_165,T_512_85,T_1024_175,T_256_45,T_1024_185,T_512_95,T_1024_195,T_128_25,T_1024_205,T_512_105,T_1024_215,T_256_55,T_1024_225,T_512_115,T_1024_235,T_64_15,T_1024_245,T_512_125,T_1024_255,T_256_65,T_1024_265,T_512_135,T_1024_275,T_128_35,T_1024_285,T_512_145,T_1024_295,T_256_75,T_1024_305,T_512_155,T_1024_315,T_16_5,T_1024_325,T_512_165,T_1024_335,T_256_85,T_1024_345,T_512_175,T_1024_355,T_128_45,T_1024_365,T_512_185,T_1024_375,T_256_95,T_1024_385,T_512_195,T_1024_395,T_64_25,T_1024_405,T_
512_205,T_1024_415,T_256_105,T_1024_425,T_512_215,T_1024_435,T_128_55,T_1024_445,T_512_225,T_1024_455,T_256_115,T_1024_465,T_512_235,T_1024_475,T_32_15,T_1024_485,T_512_245,T_1024_495,T_256_125,T_1024_505,T_512_255,T_1024_515,T_128_65,T_1024_525,T_512_265,T_1024_535,T_256_135,T_1024_545,T_512_275,T_1024_555,T_64_35,T_1024_565,T_512_285,T_1024_575,T_256_145,T_1024_585,T_512_295,T_1024_595,T_128_75,T_1024_605,T_512_305,T_1024_615,T_256_155,T_1024_625,T_512_315,T_1024_635 +}; +static const __device__ double2 lut_dp_16_1024[64*2] = { + T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_2_0,T_1024_9,T_512_9,T_1024_27,T_256_9,T_1024_45,T_512_27,T_1024_63,T_128_9,T_1024_81,T_512_45,T_1024_99,T_256_27,T_1024_117,T_512_63,T_1024_135,T_64_9,T_1024_153,T_512_81,T_1024_171,T_256_45,T_1024_189,T_512_99,T_1024_207,T_128_27,T_1024_225,T_512_117,T_1024_243,T_256_63,T_1024_261,T_512_135,T_1024_279,T_32_9,T_1024_297,T_512_153,T_1024_315,T_256_81,T_1024_333,T_512_171,T_1024_351,T_128_45,T_1024_369,T_512_189,T_1024_387,T_256_99,T_1024_405,T_512_207,T_1024_423,T_64_27,T_1024_441,T_512_225,T_1024_459,T_256_117,T_1024_477,T_512_243,T_1024_495,T_128_63,T_1024_513,T_512_261,T_1024_531,T_256_135,T_1024_549,T_512_279,T_1024_567 +}; +static const __device__ double2 lut_dp_6_1296[216*2] = { + 
T_2_0,T_1296_1,T_648_1,T_432_1,T_324_1,T_1296_5,T_216_1,T_1296_7,T_162_1,T_144_1,T_648_5,T_1296_11,T_108_1,T_1296_13,T_648_7,T_432_5,T_81_1,T_1296_17,T_72_1,T_1296_19,T_324_5,T_432_7,T_648_11,T_1296_23,T_54_1,T_1296_25,T_648_13,T_48_1,T_324_7,T_1296_29,T_216_5,T_1296_31,T_81_2,T_432_11,T_648_17,T_1296_35,T_36_1,T_1296_37,T_648_19,T_432_13,T_162_5,T_1296_41,T_216_7,T_1296_43,T_324_11,T_144_5,T_648_23,T_1296_47,T_27_1,T_1296_49,T_648_25,T_432_17,T_324_13,T_1296_53,T_24_1,T_1296_55,T_162_7,T_432_19,T_648_29,T_1296_59,T_108_5,T_1296_61,T_648_31,T_144_7,T_81_4,T_1296_65,T_216_11,T_1296_67,T_324_17,T_432_23,T_648_35,T_1296_71,T_18_1,T_1296_73,T_648_37,T_432_25,T_324_19,T_1296_77,T_216_13,T_1296_79,T_81_5,T_16_1,T_648_41,T_1296_83,T_108_7,T_1296_85,T_648_43,T_432_29,T_162_11,T_1296_89,T_72_5,T_1296_91,T_324_23,T_432_31,T_648_47,T_1296_95,T_27_2,T_1296_97,T_648_49,T_144_11,T_324_25,T_1296_101,T_216_17,T_1296_103,T_162_13,T_432_35,T_648_53,T_1296_107,T_12_1,T_1296_109,T_648_55,T_432_37,T_81_7,T_1296_113,T_216_19,T_1296_115,T_324_29,T_144_13,T_648_59,T_1296_119,T_54_5,T_1296_121,T_648_61,T_432_41,T_324_31,T_1296_125,T_72_7,T_1296_127,T_81_8,T_432_43,T_648_65,T_1296_131,T_108_11,T_1296_133,T_648_67,T_48_5,T_162_17,T_1296_137,T_216_23,T_1296_139,T_324_35,T_432_47,T_648_71,T_1296_143,T_9_1,T_1296_145,T_648_73,T_432_49,T_324_37,T_1296_149,T_216_25,T_1296_151,T_162_19,T_144_17,T_648_77,T_1296_155,T_108_13,T_1296_157,T_648_79,T_432_53,T_81_10,T_1296_161,T_8_1,T_1296_163,T_324_41,T_432_55,T_648_83,T_1296_167,T_54_7,T_1296_169,T_648_85,T_144_19,T_324_43,T_1296_173,T_216_29,T_1296_175,T_81_11,T_432_59,T_648_89,T_1296_179,T_36_5,T_1296_181,T_648_91,T_432_61,T_162_23,T_1296_185,T_216_31,T_1296_187,T_324_47,T_48_7,T_648_95,T_1296_191,T_27_4,T_1296_193,T_648_97,T_432_65,T_324_49,T_1296_197,T_72_11,T_1296_199,T_162_25,T_432_67,T_648_101,T_1296_203,T_108_17,T_1296_205,T_648_103,T_144_23,T_81_13,T_1296_209,T_216_35,T_1296_211,T_324_53,T_432_71,T_648_107,T_1296_215,T_2_0,T_324_1,T_162_1,T_108
_1,T_81_1,T_324_5,T_54_1,T_324_7,T_81_2,T_36_1,T_162_5,T_324_11,T_27_1,T_324_13,T_162_7,T_108_5,T_81_4,T_324_17,T_18_1,T_324_19,T_81_5,T_108_7,T_162_11,T_324_23,T_27_2,T_324_25,T_162_13,T_12_1,T_81_7,T_324_29,T_54_5,T_324_31,T_81_8,T_108_11,T_162_17,T_324_35,T_9_1,T_324_37,T_162_19,T_108_13,T_81_10,T_324_41,T_54_7,T_324_43,T_81_11,T_36_5,T_162_23,T_324_47,T_27_4,T_324_49,T_162_25,T_108_17,T_81_13,T_324_53,T_6_1,T_324_55,T_81_14,T_108_19,T_162_29,T_324_59,T_27_5,T_324_61,T_162_31,T_36_7,T_81_16,T_324_65,T_54_11,T_324_67,T_81_17,T_108_23,T_162_35,T_324_71,T_9_2,T_324_73,T_162_37,T_108_25,T_81_19,T_324_77,T_54_13,T_324_79,T_81_20,T_4_1,T_162_41,T_324_83,T_27_7,T_324_85,T_162_43,T_108_29,T_81_22,T_324_89,T_18_5,T_324_91,T_81_23,T_108_31,T_162_47,T_324_95,T_27_8,T_324_97,T_162_49,T_36_11,T_81_25,T_324_101,T_54_17,T_324_103,T_81_26,T_108_35,T_162_53,T_324_107,T_3_1,T_324_109,T_162_55,T_108_37,T_81_28,T_324_113,T_54_19,T_324_115,T_81_29,T_36_13,T_162_59,T_324_119,T_27_10,T_324_121,T_162_61,T_108_41,T_81_31,T_324_125,T_18_7,T_324_127,T_81_32,T_108_43,T_162_65,T_324_131,T_27_11,T_324_133,T_162_67,T_12_5,T_81_34,T_324_137,T_54_23,T_324_139,T_81_35,T_108_47,T_162_71,T_324_143,T_9_4,T_324_145,T_162_73,T_108_49,T_81_37,T_324_149,T_54_25,T_324_151,T_81_38,T_36_17,T_162_77,T_324_155,T_27_13,T_324_157,T_162_79,T_108_53,T_81_40,T_324_161,T_2_1,T_324_163,T_81_41,T_108_55,T_162_83,T_324_167,T_27_14,T_324_169,T_162_85,T_36_19,T_81_43,T_324_173,T_54_29,T_324_175,T_81_44,T_108_59,T_162_89,T_324_179,T_9_5,T_324_181,T_162_91,T_108_61,T_81_46,T_324_185,T_54_31,T_324_187,T_81_47,T_12_7,T_162_95,T_324_191,T_27_16,T_324_193,T_162_97,T_108_65,T_81_49,T_324_197,T_18_11,T_324_199,T_81_50,T_108_67,T_162_101,T_324_203,T_27_17,T_324_205,T_162_103,T_36_23,T_81_52,T_324_209,T_54_35,T_324_211,T_81_53,T_108_71,T_162_107,T_324_215 +}; +static const __device__ double2 lut_dp_11_1331[121*2] = { + 
T_2_0,T_1331_1,T_1331_2,T_1331_3,T_1331_4,T_1331_5,T_1331_6,T_1331_7,T_1331_8,T_1331_9,T_1331_10,T_121_1,T_1331_12,T_1331_13,T_1331_14,T_1331_15,T_1331_16,T_1331_17,T_1331_18,T_1331_19,T_1331_20,T_1331_21,T_121_2,T_1331_23,T_1331_24,T_1331_25,T_1331_26,T_1331_27,T_1331_28,T_1331_29,T_1331_30,T_1331_31,T_1331_32,T_121_3,T_1331_34,T_1331_35,T_1331_36,T_1331_37,T_1331_38,T_1331_39,T_1331_40,T_1331_41,T_1331_42,T_1331_43,T_121_4,T_1331_45,T_1331_46,T_1331_47,T_1331_48,T_1331_49,T_1331_50,T_1331_51,T_1331_52,T_1331_53,T_1331_54,T_121_5,T_1331_56,T_1331_57,T_1331_58,T_1331_59,T_1331_60,T_1331_61,T_1331_62,T_1331_63,T_1331_64,T_1331_65,T_121_6,T_1331_67,T_1331_68,T_1331_69,T_1331_70,T_1331_71,T_1331_72,T_1331_73,T_1331_74,T_1331_75,T_1331_76,T_121_7,T_1331_78,T_1331_79,T_1331_80,T_1331_81,T_1331_82,T_1331_83,T_1331_84,T_1331_85,T_1331_86,T_1331_87,T_121_8,T_1331_89,T_1331_90,T_1331_91,T_1331_92,T_1331_93,T_1331_94,T_1331_95,T_1331_96,T_1331_97,T_1331_98,T_121_9,T_1331_100,T_1331_101,T_1331_102,T_1331_103,T_1331_104,T_1331_105,T_1331_106,T_1331_107,T_1331_108,T_1331_109,T_121_10,T_1331_111,T_1331_112,T_1331_113,T_1331_114,T_1331_115,T_1331_116,T_1331_117,T_1331_118,T_1331_119,T_1331_120,T_2_0,T_1331_6,T_1331_12,T_1331_18,T_1331_24,T_1331_30,T_1331_36,T_1331_42,T_1331_48,T_1331_54,T_1331_60,T_121_6,T_1331_72,T_1331_78,T_1331_84,T_1331_90,T_1331_96,T_1331_102,T_1331_108,T_1331_114,T_1331_120,T_1331_126,T_121_12,T_1331_138,T_1331_144,T_1331_150,T_1331_156,T_1331_162,T_1331_168,T_1331_174,T_1331_180,T_1331_186,T_1331_192,T_121_18,T_1331_204,T_1331_210,T_1331_216,T_1331_222,T_1331_228,T_1331_234,T_1331_240,T_1331_246,T_1331_252,T_1331_258,T_121_24,T_1331_270,T_1331_276,T_1331_282,T_1331_288,T_1331_294,T_1331_300,T_1331_306,T_1331_312,T_1331_318,T_1331_324,T_121_30,T_1331_336,T_1331_342,T_1331_348,T_1331_354,T_1331_360,T_1331_366,T_1331_372,T_1331_378,T_1331_384,T_1331_390,T_121_36,T_1331_402,T_1331_408,T_1331_414,T_1331_420,T_1331_426,T_1331_432,T_1331_438,T_1331_444,T_1331_450,
T_1331_456,T_121_42,T_1331_468,T_1331_474,T_1331_480,T_1331_486,T_1331_492,T_1331_498,T_1331_504,T_1331_510,T_1331_516,T_1331_522,T_121_48,T_1331_534,T_1331_540,T_1331_546,T_1331_552,T_1331_558,T_1331_564,T_1331_570,T_1331_576,T_1331_582,T_1331_588,T_121_54,T_1331_600,T_1331_606,T_1331_612,T_1331_618,T_1331_624,T_1331_630,T_1331_636,T_1331_642,T_1331_648,T_1331_654,T_121_60,T_1331_666,T_1331_672,T_1331_678,T_1331_684,T_1331_690,T_1331_696,T_1331_702,T_1331_708,T_1331_714,T_1331_720 +}; +static const __device__ double2 lut_dp_12_1728[144*2] = { + T_2_0,T_1728_1,T_864_1,T_576_1,T_432_1,T_1728_5,T_288_1,T_1728_7,T_216_1,T_192_1,T_864_5,T_1728_11,T_144_1,T_1728_13,T_864_7,T_576_5,T_108_1,T_1728_17,T_96_1,T_1728_19,T_432_5,T_576_7,T_864_11,T_1728_23,T_72_1,T_1728_25,T_864_13,T_64_1,T_432_7,T_1728_29,T_288_5,T_1728_31,T_54_1,T_576_11,T_864_17,T_1728_35,T_48_1,T_1728_37,T_864_19,T_576_13,T_216_5,T_1728_41,T_288_7,T_1728_43,T_432_11,T_192_5,T_864_23,T_1728_47,T_36_1,T_1728_49,T_864_25,T_576_17,T_432_13,T_1728_53,T_32_1,T_1728_55,T_216_7,T_576_19,T_864_29,T_1728_59,T_144_5,T_1728_61,T_864_31,T_192_7,T_27_1,T_1728_65,T_288_11,T_1728_67,T_432_17,T_576_23,T_864_35,T_1728_71,T_24_1,T_1728_73,T_864_37,T_576_25,T_432_19,T_1728_77,T_288_13,T_1728_79,T_108_5,T_64_3,T_864_41,T_1728_83,T_144_7,T_1728_85,T_864_43,T_576_29,T_216_11,T_1728_89,T_96_5,T_1728_91,T_432_23,T_576_31,T_864_47,T_1728_95,T_18_1,T_1728_97,T_864_49,T_192_11,T_432_25,T_1728_101,T_288_17,T_1728_103,T_216_13,T_576_35,T_864_53,T_1728_107,T_16_1,T_1728_109,T_864_55,T_576_37,T_108_7,T_1728_113,T_288_19,T_1728_115,T_432_29,T_192_13,T_864_59,T_1728_119,T_72_5,T_1728_121,T_864_61,T_576_41,T_432_31,T_1728_125,T_96_7,T_1728_127,T_27_2,T_576_43,T_864_65,T_1728_131,T_144_11,T_1728_133,T_864_67,T_64_5,T_216_17,T_1728_137,T_288_23,T_1728_139,T_432_35,T_576_47,T_864_71,T_1728_143,T_2_0,T_1728_7,T_864_7,T_576_7,T_432_7,T_1728_35,T_288_7,T_1728_49,T_216_7,T_192_7,T_864_35,T_1728_77,T_144_7,T_1728_91,T_864_49,T_576_35,T_108_7,T_1728_
119,T_96_7,T_1728_133,T_432_35,T_576_49,T_864_77,T_1728_161,T_72_7,T_1728_175,T_864_91,T_64_7,T_432_49,T_1728_203,T_288_35,T_1728_217,T_54_7,T_576_77,T_864_119,T_1728_245,T_48_7,T_1728_259,T_864_133,T_576_91,T_216_35,T_1728_287,T_288_49,T_1728_301,T_432_77,T_192_35,T_864_161,T_1728_329,T_36_7,T_1728_343,T_864_175,T_576_119,T_432_91,T_1728_371,T_32_7,T_1728_385,T_216_49,T_576_133,T_864_203,T_1728_413,T_144_35,T_1728_427,T_864_217,T_192_49,T_27_7,T_1728_455,T_288_77,T_1728_469,T_432_119,T_576_161,T_864_245,T_1728_497,T_24_7,T_1728_511,T_864_259,T_576_175,T_432_133,T_1728_539,T_288_91,T_1728_553,T_108_35,T_64_21,T_864_287,T_1728_581,T_144_49,T_1728_595,T_864_301,T_576_203,T_216_77,T_1728_623,T_96_35,T_1728_637,T_432_161,T_576_217,T_864_329,T_1728_665,T_18_7,T_1728_679,T_864_343,T_192_77,T_432_175,T_1728_707,T_288_119,T_1728_721,T_216_91,T_576_245,T_864_371,T_1728_749,T_16_7,T_1728_763,T_864_385,T_576_259,T_108_49,T_1728_791,T_288_133,T_1728_805,T_432_203,T_192_91,T_864_413,T_1728_833,T_72_35,T_1728_847,T_864_427,T_576_287,T_432_217,T_1728_875,T_96_49,T_1728_889,T_27_14,T_576_301,T_864_455,T_1728_917,T_144_77,T_1728_931,T_864_469,T_64_35,T_216_119,T_1728_959,T_288_161,T_1728_973,T_432_245,T_576_329,T_864_497,T_1728_1001 +}; +static const __device__ double2 lut_dp_2_2048[1024*2] = { + 
T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_129,T_1024_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_20
3,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_8_1,T_2048_257,T_1024_129,T_2048_259,T_512_65,T_2048_261,T_1024_131,T_2048_263,T_256_33,T_2048_265,T_1024_133,T_2048_267,T_512_67,T_2048_269,T_1024_135,T_2048_271,T_128_17,T_2048_273,T_1024_137,T_2048_275,T_512_69,T_2048_277,T_1024_139,T_2048_279,T_256_35,T_2048_281,T_1024_141,T_2048_283,T_512_71,T_2048_285,T_1024_143,T_2048_287,T_64_9,T_2048_289,T_1024_145,T_2048_291,T_512_73,T_2048_293,T_1024_147,T_2048_295,T_256_37,T_2048_297,T_1024_149,T_2048_299,T_512_75,T_2048_301,T_1024_151,T_2048_303,T_128_19,T_2048_305,T_1024_153,T_2048_307,T_512_77,T_2048_309,T_1024_155,T_2048_311,T_256_39,T_2048_313,T_1024_157,T_2048_315,T_512_79,T_2048_317,T_1024_159,T_2048_319,T_32_5,T_2048_321,T_1024_161,T_2048_323,T_512_81,T_2048_325,T_1024_163,T_2048_327,T_256_41,T_2048_329,T_1024_165,T_2048_331,T_512_83,T_2048_333,T_1024_167,T_2048_335,T_128_21,T_2048_337,T_1024_169,T_2048_339,T_512_85,T_2048_341,T_1024_171,T_2048_343,T_256_43,T_2048_345,T_1024_173,T_2048_347,T_512_87,T_2048_349,T_1024_175,T_2048_351,T_64_11,T_2048_353,T_1024_177,T_2048_355,T_512_89,T_2048_357,T_1024_179,T_2048_359,T_256_45,T_2048_361,T_1024_181,T_2048_363,T_512_91,T_2048_365,T_1024_183,T_2048_367,T_128_23,T_2048_369,T_1024_185,T_2048_371,T_512_93,T_2048_373,T_1024_187,T_2048_375,T_256_47,T_2048_377,T_1024_189,T_2048_379,T_512_95,T_2048_381,T_1024_191,T_2048_383,T_16_3,T_2048_385,T_1024_193,T_2048_387,T_512_97,T_2048_389,T_1024_195,T_2048_391,T_256_49,T_2048_393,T_1024_197,T_204
8_395,T_512_99,T_2048_397,T_1024_199,T_2048_399,T_128_25,T_2048_401,T_1024_201,T_2048_403,T_512_101,T_2048_405,T_1024_203,T_2048_407,T_256_51,T_2048_409,T_1024_205,T_2048_411,T_512_103,T_2048_413,T_1024_207,T_2048_415,T_64_13,T_2048_417,T_1024_209,T_2048_419,T_512_105,T_2048_421,T_1024_211,T_2048_423,T_256_53,T_2048_425,T_1024_213,T_2048_427,T_512_107,T_2048_429,T_1024_215,T_2048_431,T_128_27,T_2048_433,T_1024_217,T_2048_435,T_512_109,T_2048_437,T_1024_219,T_2048_439,T_256_55,T_2048_441,T_1024_221,T_2048_443,T_512_111,T_2048_445,T_1024_223,T_2048_447,T_32_7,T_2048_449,T_1024_225,T_2048_451,T_512_113,T_2048_453,T_1024_227,T_2048_455,T_256_57,T_2048_457,T_1024_229,T_2048_459,T_512_115,T_2048_461,T_1024_231,T_2048_463,T_128_29,T_2048_465,T_1024_233,T_2048_467,T_512_117,T_2048_469,T_1024_235,T_2048_471,T_256_59,T_2048_473,T_1024_237,T_2048_475,T_512_119,T_2048_477,T_1024_239,T_2048_479,T_64_15,T_2048_481,T_1024_241,T_2048_483,T_512_121,T_2048_485,T_1024_243,T_2048_487,T_256_61,T_2048_489,T_1024_245,T_2048_491,T_512_123,T_2048_493,T_1024_247,T_2048_495,T_128_31,T_2048_497,T_1024_249,T_2048_499,T_512_125,T_2048_501,T_1024_251,T_2048_503,T_256_63,T_2048_505,T_1024_253,T_2048_507,T_512_127,T_2048_509,T_1024_255,T_2048_511,T_4_1,T_2048_513,T_1024_257,T_2048_515,T_512_129,T_2048_517,T_1024_259,T_2048_519,T_256_65,T_2048_521,T_1024_261,T_2048_523,T_512_131,T_2048_525,T_1024_263,T_2048_527,T_128_33,T_2048_529,T_1024_265,T_2048_531,T_512_133,T_2048_533,T_1024_267,T_2048_535,T_256_67,T_2048_537,T_1024_269,T_2048_539,T_512_135,T_2048_541,T_1024_271,T_2048_543,T_64_17,T_2048_545,T_1024_273,T_2048_547,T_512_137,T_2048_549,T_1024_275,T_2048_551,T_256_69,T_2048_553,T_1024_277,T_2048_555,T_512_139,T_2048_557,T_1024_279,T_2048_559,T_128_35,T_2048_561,T_1024_281,T_2048_563,T_512_141,T_2048_565,T_1024_283,T_2048_567,T_256_71,T_2048_569,T_1024_285,T_2048_571,T_512_143,T_2048_573,T_1024_287,T_2048_575,T_32_9,T_2048_577,T_1024_289,T_2048_579,T_512_145,T_2048_581,T_1024_291,T_2048_583,T_256_7
3,T_2048_585,T_1024_293,T_2048_587,T_512_147,T_2048_589,T_1024_295,T_2048_591,T_128_37,T_2048_593,T_1024_297,T_2048_595,T_512_149,T_2048_597,T_1024_299,T_2048_599,T_256_75,T_2048_601,T_1024_301,T_2048_603,T_512_151,T_2048_605,T_1024_303,T_2048_607,T_64_19,T_2048_609,T_1024_305,T_2048_611,T_512_153,T_2048_613,T_1024_307,T_2048_615,T_256_77,T_2048_617,T_1024_309,T_2048_619,T_512_155,T_2048_621,T_1024_311,T_2048_623,T_128_39,T_2048_625,T_1024_313,T_2048_627,T_512_157,T_2048_629,T_1024_315,T_2048_631,T_256_79,T_2048_633,T_1024_317,T_2048_635,T_512_159,T_2048_637,T_1024_319,T_2048_639,T_16_5,T_2048_641,T_1024_321,T_2048_643,T_512_161,T_2048_645,T_1024_323,T_2048_647,T_256_81,T_2048_649,T_1024_325,T_2048_651,T_512_163,T_2048_653,T_1024_327,T_2048_655,T_128_41,T_2048_657,T_1024_329,T_2048_659,T_512_165,T_2048_661,T_1024_331,T_2048_663,T_256_83,T_2048_665,T_1024_333,T_2048_667,T_512_167,T_2048_669,T_1024_335,T_2048_671,T_64_21,T_2048_673,T_1024_337,T_2048_675,T_512_169,T_2048_677,T_1024_339,T_2048_679,T_256_85,T_2048_681,T_1024_341,T_2048_683,T_512_171,T_2048_685,T_1024_343,T_2048_687,T_128_43,T_2048_689,T_1024_345,T_2048_691,T_512_173,T_2048_693,T_1024_347,T_2048_695,T_256_87,T_2048_697,T_1024_349,T_2048_699,T_512_175,T_2048_701,T_1024_351,T_2048_703,T_32_11,T_2048_705,T_1024_353,T_2048_707,T_512_177,T_2048_709,T_1024_355,T_2048_711,T_256_89,T_2048_713,T_1024_357,T_2048_715,T_512_179,T_2048_717,T_1024_359,T_2048_719,T_128_45,T_2048_721,T_1024_361,T_2048_723,T_512_181,T_2048_725,T_1024_363,T_2048_727,T_256_91,T_2048_729,T_1024_365,T_2048_731,T_512_183,T_2048_733,T_1024_367,T_2048_735,T_64_23,T_2048_737,T_1024_369,T_2048_739,T_512_185,T_2048_741,T_1024_371,T_2048_743,T_256_93,T_2048_745,T_1024_373,T_2048_747,T_512_187,T_2048_749,T_1024_375,T_2048_751,T_128_47,T_2048_753,T_1024_377,T_2048_755,T_512_189,T_2048_757,T_1024_379,T_2048_759,T_256_95,T_2048_761,T_1024_381,T_2048_763,T_512_191,T_2048_765,T_1024_383,T_2048_767,T_8_3,T_2048_769,T_1024_385,T_2048_771,T_512_193,T_2048_77
3,T_1024_387,T_2048_775,T_256_97,T_2048_777,T_1024_389,T_2048_779,T_512_195,T_2048_781,T_1024_391,T_2048_783,T_128_49,T_2048_785,T_1024_393,T_2048_787,T_512_197,T_2048_789,T_1024_395,T_2048_791,T_256_99,T_2048_793,T_1024_397,T_2048_795,T_512_199,T_2048_797,T_1024_399,T_2048_799,T_64_25,T_2048_801,T_1024_401,T_2048_803,T_512_201,T_2048_805,T_1024_403,T_2048_807,T_256_101,T_2048_809,T_1024_405,T_2048_811,T_512_203,T_2048_813,T_1024_407,T_2048_815,T_128_51,T_2048_817,T_1024_409,T_2048_819,T_512_205,T_2048_821,T_1024_411,T_2048_823,T_256_103,T_2048_825,T_1024_413,T_2048_827,T_512_207,T_2048_829,T_1024_415,T_2048_831,T_32_13,T_2048_833,T_1024_417,T_2048_835,T_512_209,T_2048_837,T_1024_419,T_2048_839,T_256_105,T_2048_841,T_1024_421,T_2048_843,T_512_211,T_2048_845,T_1024_423,T_2048_847,T_128_53,T_2048_849,T_1024_425,T_2048_851,T_512_213,T_2048_853,T_1024_427,T_2048_855,T_256_107,T_2048_857,T_1024_429,T_2048_859,T_512_215,T_2048_861,T_1024_431,T_2048_863,T_64_27,T_2048_865,T_1024_433,T_2048_867,T_512_217,T_2048_869,T_1024_435,T_2048_871,T_256_109,T_2048_873,T_1024_437,T_2048_875,T_512_219,T_2048_877,T_1024_439,T_2048_879,T_128_55,T_2048_881,T_1024_441,T_2048_883,T_512_221,T_2048_885,T_1024_443,T_2048_887,T_256_111,T_2048_889,T_1024_445,T_2048_891,T_512_223,T_2048_893,T_1024_447,T_2048_895,T_16_7,T_2048_897,T_1024_449,T_2048_899,T_512_225,T_2048_901,T_1024_451,T_2048_903,T_256_113,T_2048_905,T_1024_453,T_2048_907,T_512_227,T_2048_909,T_1024_455,T_2048_911,T_128_57,T_2048_913,T_1024_457,T_2048_915,T_512_229,T_2048_917,T_1024_459,T_2048_919,T_256_115,T_2048_921,T_1024_461,T_2048_923,T_512_231,T_2048_925,T_1024_463,T_2048_927,T_64_29,T_2048_929,T_1024_465,T_2048_931,T_512_233,T_2048_933,T_1024_467,T_2048_935,T_256_117,T_2048_937,T_1024_469,T_2048_939,T_512_235,T_2048_941,T_1024_471,T_2048_943,T_128_59,T_2048_945,T_1024_473,T_2048_947,T_512_237,T_2048_949,T_1024_475,T_2048_951,T_256_119,T_2048_953,T_1024_477,T_2048_955,T_512_239,T_2048_957,T_1024_479,T_2048_959,T_32_15,T_2048_96
1,T_1024_481,T_2048_963,T_512_241,T_2048_965,T_1024_483,T_2048_967,T_256_121,T_2048_969,T_1024_485,T_2048_971,T_512_243,T_2048_973,T_1024_487,T_2048_975,T_128_61,T_2048_977,T_1024_489,T_2048_979,T_512_245,T_2048_981,T_1024_491,T_2048_983,T_256_123,T_2048_985,T_1024_493,T_2048_987,T_512_247,T_2048_989,T_1024_495,T_2048_991,T_64_31,T_2048_993,T_1024_497,T_2048_995,T_512_249,T_2048_997,T_1024_499,T_2048_999,T_256_125,T_2048_1001,T_1024_501,T_2048_1003,T_512_251,T_2048_1005,T_1024_503,T_2048_1007,T_128_63,T_2048_1009,T_1024_505,T_2048_1011,T_512_253,T_2048_1013,T_1024_507,T_2048_1015,T_256_127,T_2048_1017,T_1024_509,T_2048_1019,T_512_255,T_2048_1021,T_1024_511,T_2048_1023,T_2_0,T_1024_1,T_512_1,T_1024_3,T_256_1,T_1024_5,T_512_3,T_1024_7,T_128_1,T_1024_9,T_512_5,T_1024_11,T_256_3,T_1024_13,T_512_7,T_1024_15,T_64_1,T_1024_17,T_512_9,T_1024_19,T_256_5,T_1024_21,T_512_11,T_1024_23,T_128_3,T_1024_25,T_512_13,T_1024_27,T_256_7,T_1024_29,T_512_15,T_1024_31,T_32_1,T_1024_33,T_512_17,T_1024_35,T_256_9,T_1024_37,T_512_19,T_1024_39,T_128_5,T_1024_41,T_512_21,T_1024_43,T_256_11,T_1024_45,T_512_23,T_1024_47,T_64_3,T_1024_49,T_512_25,T_1024_51,T_256_13,T_1024_53,T_512_27,T_1024_55,T_128_7,T_1024_57,T_512_29,T_1024_59,T_256_15,T_1024_61,T_512_31,T_1024_63,T_16_1,T_1024_65,T_512_33,T_1024_67,T_256_17,T_1024_69,T_512_35,T_1024_71,T_128_9,T_1024_73,T_512_37,T_1024_75,T_256_19,T_1024_77,T_512_39,T_1024_79,T_64_5,T_1024_81,T_512_41,T_1024_83,T_256_21,T_1024_85,T_512_43,T_1024_87,T_128_11,T_1024_89,T_512_45,T_1024_91,T_256_23,T_1024_93,T_512_47,T_1024_95,T_32_3,T_1024_97,T_512_49,T_1024_99,T_256_25,T_1024_101,T_512_51,T_1024_103,T_128_13,T_1024_105,T_512_53,T_1024_107,T_256_27,T_1024_109,T_512_55,T_1024_111,T_64_7,T_1024_113,T_512_57,T_1024_115,T_256_29,T_1024_117,T_512_59,T_1024_119,T_128_15,T_1024_121,T_512_61,T_1024_123,T_256_31,T_1024_125,T_512_63,T_1024_127,T_8_1,T_1024_129,T_512_65,T_1024_131,T_256_33,T_1024_133,T_512_67,T_1024_135,T_128_17,T_1024_137,T_512_69,T_1024_139,T_256_35,T_10
24_141,T_512_71,T_1024_143,T_64_9,T_1024_145,T_512_73,T_1024_147,T_256_37,T_1024_149,T_512_75,T_1024_151,T_128_19,T_1024_153,T_512_77,T_1024_155,T_256_39,T_1024_157,T_512_79,T_1024_159,T_32_5,T_1024_161,T_512_81,T_1024_163,T_256_41,T_1024_165,T_512_83,T_1024_167,T_128_21,T_1024_169,T_512_85,T_1024_171,T_256_43,T_1024_173,T_512_87,T_1024_175,T_64_11,T_1024_177,T_512_89,T_1024_179,T_256_45,T_1024_181,T_512_91,T_1024_183,T_128_23,T_1024_185,T_512_93,T_1024_187,T_256_47,T_1024_189,T_512_95,T_1024_191,T_16_3,T_1024_193,T_512_97,T_1024_195,T_256_49,T_1024_197,T_512_99,T_1024_199,T_128_25,T_1024_201,T_512_101,T_1024_203,T_256_51,T_1024_205,T_512_103,T_1024_207,T_64_13,T_1024_209,T_512_105,T_1024_211,T_256_53,T_1024_213,T_512_107,T_1024_215,T_128_27,T_1024_217,T_512_109,T_1024_219,T_256_55,T_1024_221,T_512_111,T_1024_223,T_32_7,T_1024_225,T_512_113,T_1024_227,T_256_57,T_1024_229,T_512_115,T_1024_231,T_128_29,T_1024_233,T_512_117,T_1024_235,T_256_59,T_1024_237,T_512_119,T_1024_239,T_64_15,T_1024_241,T_512_121,T_1024_243,T_256_61,T_1024_245,T_512_123,T_1024_247,T_128_31,T_1024_249,T_512_125,T_1024_251,T_256_63,T_1024_253,T_512_127,T_1024_255,T_4_1,T_1024_257,T_512_129,T_1024_259,T_256_65,T_1024_261,T_512_131,T_1024_263,T_128_33,T_1024_265,T_512_133,T_1024_267,T_256_67,T_1024_269,T_512_135,T_1024_271,T_64_17,T_1024_273,T_512_137,T_1024_275,T_256_69,T_1024_277,T_512_139,T_1024_279,T_128_35,T_1024_281,T_512_141,T_1024_283,T_256_71,T_1024_285,T_512_143,T_1024_287,T_32_9,T_1024_289,T_512_145,T_1024_291,T_256_73,T_1024_293,T_512_147,T_1024_295,T_128_37,T_1024_297,T_512_149,T_1024_299,T_256_75,T_1024_301,T_512_151,T_1024_303,T_64_19,T_1024_305,T_512_153,T_1024_307,T_256_77,T_1024_309,T_512_155,T_1024_311,T_128_39,T_1024_313,T_512_157,T_1024_315,T_256_79,T_1024_317,T_512_159,T_1024_319,T_16_5,T_1024_321,T_512_161,T_1024_323,T_256_81,T_1024_325,T_512_163,T_1024_327,T_128_41,T_1024_329,T_512_165,T_1024_331,T_256_83,T_1024_333,T_512_167,T_1024_335,T_64_21,T_1024_337,T_512_169,T_1024_339
,T_256_85,T_1024_341,T_512_171,T_1024_343,T_128_43,T_1024_345,T_512_173,T_1024_347,T_256_87,T_1024_349,T_512_175,T_1024_351,T_32_11,T_1024_353,T_512_177,T_1024_355,T_256_89,T_1024_357,T_512_179,T_1024_359,T_128_45,T_1024_361,T_512_181,T_1024_363,T_256_91,T_1024_365,T_512_183,T_1024_367,T_64_23,T_1024_369,T_512_185,T_1024_371,T_256_93,T_1024_373,T_512_187,T_1024_375,T_128_47,T_1024_377,T_512_189,T_1024_379,T_256_95,T_1024_381,T_512_191,T_1024_383,T_8_3,T_1024_385,T_512_193,T_1024_387,T_256_97,T_1024_389,T_512_195,T_1024_391,T_128_49,T_1024_393,T_512_197,T_1024_395,T_256_99,T_1024_397,T_512_199,T_1024_399,T_64_25,T_1024_401,T_512_201,T_1024_403,T_256_101,T_1024_405,T_512_203,T_1024_407,T_128_51,T_1024_409,T_512_205,T_1024_411,T_256_103,T_1024_413,T_512_207,T_1024_415,T_32_13,T_1024_417,T_512_209,T_1024_419,T_256_105,T_1024_421,T_512_211,T_1024_423,T_128_53,T_1024_425,T_512_213,T_1024_427,T_256_107,T_1024_429,T_512_215,T_1024_431,T_64_27,T_1024_433,T_512_217,T_1024_435,T_256_109,T_1024_437,T_512_219,T_1024_439,T_128_55,T_1024_441,T_512_221,T_1024_443,T_256_111,T_1024_445,T_512_223,T_1024_447,T_16_7,T_1024_449,T_512_225,T_1024_451,T_256_113,T_1024_453,T_512_227,T_1024_455,T_128_57,T_1024_457,T_512_229,T_1024_459,T_256_115,T_1024_461,T_512_231,T_1024_463,T_64_29,T_1024_465,T_512_233,T_1024_467,T_256_117,T_1024_469,T_512_235,T_1024_471,T_128_59,T_1024_473,T_512_237,T_1024_475,T_256_119,T_1024_477,T_512_239,T_1024_479,T_32_15,T_1024_481,T_512_241,T_1024_483,T_256_121,T_1024_485,T_512_243,T_1024_487,T_128_61,T_1024_489,T_512_245,T_1024_491,T_256_123,T_1024_493,T_512_247,T_1024_495,T_64_31,T_1024_497,T_512_249,T_1024_499,T_256_125,T_1024_501,T_512_251,T_1024_503,T_128_63,T_1024_505,T_512_253,T_1024_507,T_256_127,T_1024_509,T_512_255,T_1024_511,T_2_1,T_1024_513,T_512_257,T_1024_515,T_256_129,T_1024_517,T_512_259,T_1024_519,T_128_65,T_1024_521,T_512_261,T_1024_523,T_256_131,T_1024_525,T_512_263,T_1024_527,T_64_33,T_1024_529,T_512_265,T_1024_531,T_256_133,T_1024_533,T_512_267,T
_1024_535,T_128_67,T_1024_537,T_512_269,T_1024_539,T_256_135,T_1024_541,T_512_271,T_1024_543,T_32_17,T_1024_545,T_512_273,T_1024_547,T_256_137,T_1024_549,T_512_275,T_1024_551,T_128_69,T_1024_553,T_512_277,T_1024_555,T_256_139,T_1024_557,T_512_279,T_1024_559,T_64_35,T_1024_561,T_512_281,T_1024_563,T_256_141,T_1024_565,T_512_283,T_1024_567,T_128_71,T_1024_569,T_512_285,T_1024_571,T_256_143,T_1024_573,T_512_287,T_1024_575,T_16_9,T_1024_577,T_512_289,T_1024_579,T_256_145,T_1024_581,T_512_291,T_1024_583,T_128_73,T_1024_585,T_512_293,T_1024_587,T_256_147,T_1024_589,T_512_295,T_1024_591,T_64_37,T_1024_593,T_512_297,T_1024_595,T_256_149,T_1024_597,T_512_299,T_1024_599,T_128_75,T_1024_601,T_512_301,T_1024_603,T_256_151,T_1024_605,T_512_303,T_1024_607,T_32_19,T_1024_609,T_512_305,T_1024_611,T_256_153,T_1024_613,T_512_307,T_1024_615,T_128_77,T_1024_617,T_512_309,T_1024_619,T_256_155,T_1024_621,T_512_311,T_1024_623,T_64_39,T_1024_625,T_512_313,T_1024_627,T_256_157,T_1024_629,T_512_315,T_1024_631,T_128_79,T_1024_633,T_512_317,T_1024_635,T_256_159,T_1024_637,T_512_319,T_1024_639,T_8_5,T_1024_641,T_512_321,T_1024_643,T_256_161,T_1024_645,T_512_323,T_1024_647,T_128_81,T_1024_649,T_512_325,T_1024_651,T_256_163,T_1024_653,T_512_327,T_1024_655,T_64_41,T_1024_657,T_512_329,T_1024_659,T_256_165,T_1024_661,T_512_331,T_1024_663,T_128_83,T_1024_665,T_512_333,T_1024_667,T_256_167,T_1024_669,T_512_335,T_1024_671,T_32_21,T_1024_673,T_512_337,T_1024_675,T_256_169,T_1024_677,T_512_339,T_1024_679,T_128_85,T_1024_681,T_512_341,T_1024_683,T_256_171,T_1024_685,T_512_343,T_1024_687,T_64_43,T_1024_689,T_512_345,T_1024_691,T_256_173,T_1024_693,T_512_347,T_1024_695,T_128_87,T_1024_697,T_512_349,T_1024_699,T_256_175,T_1024_701,T_512_351,T_1024_703,T_16_11,T_1024_705,T_512_353,T_1024_707,T_256_177,T_1024_709,T_512_355,T_1024_711,T_128_89,T_1024_713,T_512_357,T_1024_715,T_256_179,T_1024_717,T_512_359,T_1024_719,T_64_45,T_1024_721,T_512_361,T_1024_723,T_256_181,T_1024_725,T_512_363,T_1024_727,T_128_91,T_10
24_729,T_512_365,T_1024_731,T_256_183,T_1024_733,T_512_367,T_1024_735,T_32_23,T_1024_737,T_512_369,T_1024_739,T_256_185,T_1024_741,T_512_371,T_1024_743,T_128_93,T_1024_745,T_512_373,T_1024_747,T_256_187,T_1024_749,T_512_375,T_1024_751,T_64_47,T_1024_753,T_512_377,T_1024_755,T_256_189,T_1024_757,T_512_379,T_1024_759,T_128_95,T_1024_761,T_512_381,T_1024_763,T_256_191,T_1024_765,T_512_383,T_1024_767,T_4_3,T_1024_769,T_512_385,T_1024_771,T_256_193,T_1024_773,T_512_387,T_1024_775,T_128_97,T_1024_777,T_512_389,T_1024_779,T_256_195,T_1024_781,T_512_391,T_1024_783,T_64_49,T_1024_785,T_512_393,T_1024_787,T_256_197,T_1024_789,T_512_395,T_1024_791,T_128_99,T_1024_793,T_512_397,T_1024_795,T_256_199,T_1024_797,T_512_399,T_1024_799,T_32_25,T_1024_801,T_512_401,T_1024_803,T_256_201,T_1024_805,T_512_403,T_1024_807,T_128_101,T_1024_809,T_512_405,T_1024_811,T_256_203,T_1024_813,T_512_407,T_1024_815,T_64_51,T_1024_817,T_512_409,T_1024_819,T_256_205,T_1024_821,T_512_411,T_1024_823,T_128_103,T_1024_825,T_512_413,T_1024_827,T_256_207,T_1024_829,T_512_415,T_1024_831,T_16_13,T_1024_833,T_512_417,T_1024_835,T_256_209,T_1024_837,T_512_419,T_1024_839,T_128_105,T_1024_841,T_512_421,T_1024_843,T_256_211,T_1024_845,T_512_423,T_1024_847,T_64_53,T_1024_849,T_512_425,T_1024_851,T_256_213,T_1024_853,T_512_427,T_1024_855,T_128_107,T_1024_857,T_512_429,T_1024_859,T_256_215,T_1024_861,T_512_431,T_1024_863,T_32_27,T_1024_865,T_512_433,T_1024_867,T_256_217,T_1024_869,T_512_435,T_1024_871,T_128_109,T_1024_873,T_512_437,T_1024_875,T_256_219,T_1024_877,T_512_439,T_1024_879,T_64_55,T_1024_881,T_512_441,T_1024_883,T_256_221,T_1024_885,T_512_443,T_1024_887,T_128_111,T_1024_889,T_512_445,T_1024_891,T_256_223,T_1024_893,T_512_447,T_1024_895,T_8_7,T_1024_897,T_512_449,T_1024_899,T_256_225,T_1024_901,T_512_451,T_1024_903,T_128_113,T_1024_905,T_512_453,T_1024_907,T_256_227,T_1024_909,T_512_455,T_1024_911,T_64_57,T_1024_913,T_512_457,T_1024_915,T_256_229,T_1024_917,T_512_459,T_1024_919,T_128_115,T_1024_921,T_512_461
,T_1024_923,T_256_231,T_1024_925,T_512_463,T_1024_927,T_32_29,T_1024_929,T_512_465,T_1024_931,T_256_233,T_1024_933,T_512_467,T_1024_935,T_128_117,T_1024_937,T_512_469,T_1024_939,T_256_235,T_1024_941,T_512_471,T_1024_943,T_64_59,T_1024_945,T_512_473,T_1024_947,T_256_237,T_1024_949,T_512_475,T_1024_951,T_128_119,T_1024_953,T_512_477,T_1024_955,T_256_239,T_1024_957,T_512_479,T_1024_959,T_16_15,T_1024_961,T_512_481,T_1024_963,T_256_241,T_1024_965,T_512_483,T_1024_967,T_128_121,T_1024_969,T_512_485,T_1024_971,T_256_243,T_1024_973,T_512_487,T_1024_975,T_64_61,T_1024_977,T_512_489,T_1024_979,T_256_245,T_1024_981,T_512_491,T_1024_983,T_128_123,T_1024_985,T_512_493,T_1024_987,T_256_247,T_1024_989,T_512_495,T_1024_991,T_32_31,T_1024_993,T_512_497,T_1024_995,T_256_249,T_1024_997,T_512_499,T_1024_999,T_128_125,T_1024_1001,T_512_501,T_1024_1003,T_256_251,T_1024_1005,T_512_503,T_1024_1007,T_64_63,T_1024_1009,T_512_505,T_1024_1011,T_256_253,T_1024_1013,T_512_507,T_1024_1015,T_128_127,T_1024_1017,T_512_509,T_1024_1019,T_256_255,T_1024_1021,T_512_511,T_1024_1023 +}; +static const __device__ double2 lut_dp_4_2048[512*2] = { + 
T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_129,T_1024_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_20
3,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_8_1,T_2048_257,T_1024_129,T_2048_259,T_512_65,T_2048_261,T_1024_131,T_2048_263,T_256_33,T_2048_265,T_1024_133,T_2048_267,T_512_67,T_2048_269,T_1024_135,T_2048_271,T_128_17,T_2048_273,T_1024_137,T_2048_275,T_512_69,T_2048_277,T_1024_139,T_2048_279,T_256_35,T_2048_281,T_1024_141,T_2048_283,T_512_71,T_2048_285,T_1024_143,T_2048_287,T_64_9,T_2048_289,T_1024_145,T_2048_291,T_512_73,T_2048_293,T_1024_147,T_2048_295,T_256_37,T_2048_297,T_1024_149,T_2048_299,T_512_75,T_2048_301,T_1024_151,T_2048_303,T_128_19,T_2048_305,T_1024_153,T_2048_307,T_512_77,T_2048_309,T_1024_155,T_2048_311,T_256_39,T_2048_313,T_1024_157,T_2048_315,T_512_79,T_2048_317,T_1024_159,T_2048_319,T_32_5,T_2048_321,T_1024_161,T_2048_323,T_512_81,T_2048_325,T_1024_163,T_2048_327,T_256_41,T_2048_329,T_1024_165,T_2048_331,T_512_83,T_2048_333,T_1024_167,T_2048_335,T_128_21,T_2048_337,T_1024_169,T_2048_339,T_512_85,T_2048_341,T_1024_171,T_2048_343,T_256_43,T_2048_345,T_1024_173,T_2048_347,T_512_87,T_2048_349,T_1024_175,T_2048_351,T_64_11,T_2048_353,T_1024_177,T_2048_355,T_512_89,T_2048_357,T_1024_179,T_2048_359,T_256_45,T_2048_361,T_1024_181,T_2048_363,T_512_91,T_2048_365,T_1024_183,T_2048_367,T_128_23,T_2048_369,T_1024_185,T_2048_371,T_512_93,T_2048_373,T_1024_187,T_2048_375,T_256_47,T_2048_377,T_1024_189,T_2048_379,T_512_95,T_2048_381,T_1024_191,T_2048_383,T_16_3,T_2048_385,T_1024_193,T_2048_387,T_512_97,T_2048_389,T_1024_195,T_2048_391,T_256_49,T_2048_393,T_1024_197,T_204
8_395,T_512_99,T_2048_397,T_1024_199,T_2048_399,T_128_25,T_2048_401,T_1024_201,T_2048_403,T_512_101,T_2048_405,T_1024_203,T_2048_407,T_256_51,T_2048_409,T_1024_205,T_2048_411,T_512_103,T_2048_413,T_1024_207,T_2048_415,T_64_13,T_2048_417,T_1024_209,T_2048_419,T_512_105,T_2048_421,T_1024_211,T_2048_423,T_256_53,T_2048_425,T_1024_213,T_2048_427,T_512_107,T_2048_429,T_1024_215,T_2048_431,T_128_27,T_2048_433,T_1024_217,T_2048_435,T_512_109,T_2048_437,T_1024_219,T_2048_439,T_256_55,T_2048_441,T_1024_221,T_2048_443,T_512_111,T_2048_445,T_1024_223,T_2048_447,T_32_7,T_2048_449,T_1024_225,T_2048_451,T_512_113,T_2048_453,T_1024_227,T_2048_455,T_256_57,T_2048_457,T_1024_229,T_2048_459,T_512_115,T_2048_461,T_1024_231,T_2048_463,T_128_29,T_2048_465,T_1024_233,T_2048_467,T_512_117,T_2048_469,T_1024_235,T_2048_471,T_256_59,T_2048_473,T_1024_237,T_2048_475,T_512_119,T_2048_477,T_1024_239,T_2048_479,T_64_15,T_2048_481,T_1024_241,T_2048_483,T_512_121,T_2048_485,T_1024_243,T_2048_487,T_256_61,T_2048_489,T_1024_245,T_2048_491,T_512_123,T_2048_493,T_1024_247,T_2048_495,T_128_31,T_2048_497,T_1024_249,T_2048_499,T_512_125,T_2048_501,T_1024_251,T_2048_503,T_256_63,T_2048_505,T_1024_253,T_2048_507,T_512_127,T_2048_509,T_1024_255,T_2048_511,T_2_0,T_2048_3,T_1024_3,T_2048_9,T_512_3,T_2048_15,T_1024_9,T_2048_21,T_256_3,T_2048_27,T_1024_15,T_2048_33,T_512_9,T_2048_39,T_1024_21,T_2048_45,T_128_3,T_2048_51,T_1024_27,T_2048_57,T_512_15,T_2048_63,T_1024_33,T_2048_69,T_256_9,T_2048_75,T_1024_39,T_2048_81,T_512_21,T_2048_87,T_1024_45,T_2048_93,T_64_3,T_2048_99,T_1024_51,T_2048_105,T_512_27,T_2048_111,T_1024_57,T_2048_117,T_256_15,T_2048_123,T_1024_63,T_2048_129,T_512_33,T_2048_135,T_1024_69,T_2048_141,T_128_9,T_2048_147,T_1024_75,T_2048_153,T_512_39,T_2048_159,T_1024_81,T_2048_165,T_256_21,T_2048_171,T_1024_87,T_2048_177,T_512_45,T_2048_183,T_1024_93,T_2048_189,T_32_3,T_2048_195,T_1024_99,T_2048_201,T_512_51,T_2048_207,T_1024_105,T_2048_213,T_256_27,T_2048_219,T_1024_111,T_2048_225,T_512_57,T_2048_231
,T_1024_117,T_2048_237,T_128_15,T_2048_243,T_1024_123,T_2048_249,T_512_63,T_2048_255,T_1024_129,T_2048_261,T_256_33,T_2048_267,T_1024_135,T_2048_273,T_512_69,T_2048_279,T_1024_141,T_2048_285,T_64_9,T_2048_291,T_1024_147,T_2048_297,T_512_75,T_2048_303,T_1024_153,T_2048_309,T_256_39,T_2048_315,T_1024_159,T_2048_321,T_512_81,T_2048_327,T_1024_165,T_2048_333,T_128_21,T_2048_339,T_1024_171,T_2048_345,T_512_87,T_2048_351,T_1024_177,T_2048_357,T_256_45,T_2048_363,T_1024_183,T_2048_369,T_512_93,T_2048_375,T_1024_189,T_2048_381,T_16_3,T_2048_387,T_1024_195,T_2048_393,T_512_99,T_2048_399,T_1024_201,T_2048_405,T_256_51,T_2048_411,T_1024_207,T_2048_417,T_512_105,T_2048_423,T_1024_213,T_2048_429,T_128_27,T_2048_435,T_1024_219,T_2048_441,T_512_111,T_2048_447,T_1024_225,T_2048_453,T_256_57,T_2048_459,T_1024_231,T_2048_465,T_512_117,T_2048_471,T_1024_237,T_2048_477,T_64_15,T_2048_483,T_1024_243,T_2048_489,T_512_123,T_2048_495,T_1024_249,T_2048_501,T_256_63,T_2048_507,T_1024_255,T_2048_513,T_512_129,T_2048_519,T_1024_261,T_2048_525,T_128_33,T_2048_531,T_1024_267,T_2048_537,T_512_135,T_2048_543,T_1024_273,T_2048_549,T_256_69,T_2048_555,T_1024_279,T_2048_561,T_512_141,T_2048_567,T_1024_285,T_2048_573,T_32_9,T_2048_579,T_1024_291,T_2048_585,T_512_147,T_2048_591,T_1024_297,T_2048_597,T_256_75,T_2048_603,T_1024_303,T_2048_609,T_512_153,T_2048_615,T_1024_309,T_2048_621,T_128_39,T_2048_627,T_1024_315,T_2048_633,T_512_159,T_2048_639,T_1024_321,T_2048_645,T_256_81,T_2048_651,T_1024_327,T_2048_657,T_512_165,T_2048_663,T_1024_333,T_2048_669,T_64_21,T_2048_675,T_1024_339,T_2048_681,T_512_171,T_2048_687,T_1024_345,T_2048_693,T_256_87,T_2048_699,T_1024_351,T_2048_705,T_512_177,T_2048_711,T_1024_357,T_2048_717,T_128_45,T_2048_723,T_1024_363,T_2048_729,T_512_183,T_2048_735,T_1024_369,T_2048_741,T_256_93,T_2048_747,T_1024_375,T_2048_753,T_512_189,T_2048_759,T_1024_381,T_2048_765,T_8_3,T_2048_771,T_1024_387,T_2048_777,T_512_195,T_2048_783,T_1024_393,T_2048_789,T_256_99,T_2048_795,T_1024_399,T_2048_80
1,T_512_201,T_2048_807,T_1024_405,T_2048_813,T_128_51,T_2048_819,T_1024_411,T_2048_825,T_512_207,T_2048_831,T_1024_417,T_2048_837,T_256_105,T_2048_843,T_1024_423,T_2048_849,T_512_213,T_2048_855,T_1024_429,T_2048_861,T_64_27,T_2048_867,T_1024_435,T_2048_873,T_512_219,T_2048_879,T_1024_441,T_2048_885,T_256_111,T_2048_891,T_1024_447,T_2048_897,T_512_225,T_2048_903,T_1024_453,T_2048_909,T_128_57,T_2048_915,T_1024_459,T_2048_921,T_512_231,T_2048_927,T_1024_465,T_2048_933,T_256_117,T_2048_939,T_1024_471,T_2048_945,T_512_237,T_2048_951,T_1024_477,T_2048_957,T_32_15,T_2048_963,T_1024_483,T_2048_969,T_512_243,T_2048_975,T_1024_489,T_2048_981,T_256_123,T_2048_987,T_1024_495,T_2048_993,T_512_249,T_2048_999,T_1024_501,T_2048_1005,T_128_63,T_2048_1011,T_1024_507,T_2048_1017,T_512_255,T_2048_1023,T_1024_513,T_2048_1029,T_256_129,T_2048_1035,T_1024_519,T_2048_1041,T_512_261,T_2048_1047,T_1024_525,T_2048_1053,T_64_33,T_2048_1059,T_1024_531,T_2048_1065,T_512_267,T_2048_1071,T_1024_537,T_2048_1077,T_256_135,T_2048_1083,T_1024_543,T_2048_1089,T_512_273,T_2048_1095,T_1024_549,T_2048_1101,T_128_69,T_2048_1107,T_1024_555,T_2048_1113,T_512_279,T_2048_1119,T_1024_561,T_2048_1125,T_256_141,T_2048_1131,T_1024_567,T_2048_1137,T_512_285,T_2048_1143,T_1024_573,T_2048_1149,T_16_9,T_2048_1155,T_1024_579,T_2048_1161,T_512_291,T_2048_1167,T_1024_585,T_2048_1173,T_256_147,T_2048_1179,T_1024_591,T_2048_1185,T_512_297,T_2048_1191,T_1024_597,T_2048_1197,T_128_75,T_2048_1203,T_1024_603,T_2048_1209,T_512_303,T_2048_1215,T_1024_609,T_2048_1221,T_256_153,T_2048_1227,T_1024_615,T_2048_1233,T_512_309,T_2048_1239,T_1024_621,T_2048_1245,T_64_39,T_2048_1251,T_1024_627,T_2048_1257,T_512_315,T_2048_1263,T_1024_633,T_2048_1269,T_256_159,T_2048_1275,T_1024_639,T_2048_1281,T_512_321,T_2048_1287,T_1024_645,T_2048_1293,T_128_81,T_2048_1299,T_1024_651,T_2048_1305,T_512_327,T_2048_1311,T_1024_657,T_2048_1317,T_256_165,T_2048_1323,T_1024_663,T_2048_1329,T_512_333,T_2048_1335,T_1024_669,T_2048_1341,T_32_21,T_2048_1347,T_1
024_675,T_2048_1353,T_512_339,T_2048_1359,T_1024_681,T_2048_1365,T_256_171,T_2048_1371,T_1024_687,T_2048_1377,T_512_345,T_2048_1383,T_1024_693,T_2048_1389,T_128_87,T_2048_1395,T_1024_699,T_2048_1401,T_512_351,T_2048_1407,T_1024_705,T_2048_1413,T_256_177,T_2048_1419,T_1024_711,T_2048_1425,T_512_357,T_2048_1431,T_1024_717,T_2048_1437,T_64_45,T_2048_1443,T_1024_723,T_2048_1449,T_512_363,T_2048_1455,T_1024_729,T_2048_1461,T_256_183,T_2048_1467,T_1024_735,T_2048_1473,T_512_369,T_2048_1479,T_1024_741,T_2048_1485,T_128_93,T_2048_1491,T_1024_747,T_2048_1497,T_512_375,T_2048_1503,T_1024_753,T_2048_1509,T_256_189,T_2048_1515,T_1024_759,T_2048_1521,T_512_381,T_2048_1527,T_1024_765,T_2048_1533 +}; +static const __device__ double2 lut_dp_8_2048[256*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_16_1,T_2048_12
9,T_1024_65,T_2048_131,T_512_33,T_2048_133,T_1024_67,T_2048_135,T_256_17,T_2048_137,T_1024_69,T_2048_139,T_512_35,T_2048_141,T_1024_71,T_2048_143,T_128_9,T_2048_145,T_1024_73,T_2048_147,T_512_37,T_2048_149,T_1024_75,T_2048_151,T_256_19,T_2048_153,T_1024_77,T_2048_155,T_512_39,T_2048_157,T_1024_79,T_2048_159,T_64_5,T_2048_161,T_1024_81,T_2048_163,T_512_41,T_2048_165,T_1024_83,T_2048_167,T_256_21,T_2048_169,T_1024_85,T_2048_171,T_512_43,T_2048_173,T_1024_87,T_2048_175,T_128_11,T_2048_177,T_1024_89,T_2048_179,T_512_45,T_2048_181,T_1024_91,T_2048_183,T_256_23,T_2048_185,T_1024_93,T_2048_187,T_512_47,T_2048_189,T_1024_95,T_2048_191,T_32_3,T_2048_193,T_1024_97,T_2048_195,T_512_49,T_2048_197,T_1024_99,T_2048_199,T_256_25,T_2048_201,T_1024_101,T_2048_203,T_512_51,T_2048_205,T_1024_103,T_2048_207,T_128_13,T_2048_209,T_1024_105,T_2048_211,T_512_53,T_2048_213,T_1024_107,T_2048_215,T_256_27,T_2048_217,T_1024_109,T_2048_219,T_512_55,T_2048_221,T_1024_111,T_2048_223,T_64_7,T_2048_225,T_1024_113,T_2048_227,T_512_57,T_2048_229,T_1024_115,T_2048_231,T_256_29,T_2048_233,T_1024_117,T_2048_235,T_512_59,T_2048_237,T_1024_119,T_2048_239,T_128_15,T_2048_241,T_1024_121,T_2048_243,T_512_61,T_2048_245,T_1024_123,T_2048_247,T_256_31,T_2048_249,T_1024_125,T_2048_251,T_512_63,T_2048_253,T_1024_127,T_2048_255,T_2_0,T_2048_5,T_1024_5,T_2048_15,T_512_5,T_2048_25,T_1024_15,T_2048_35,T_256_5,T_2048_45,T_1024_25,T_2048_55,T_512_15,T_2048_65,T_1024_35,T_2048_75,T_128_5,T_2048_85,T_1024_45,T_2048_95,T_512_25,T_2048_105,T_1024_55,T_2048_115,T_256_15,T_2048_125,T_1024_65,T_2048_135,T_512_35,T_2048_145,T_1024_75,T_2048_155,T_64_5,T_2048_165,T_1024_85,T_2048_175,T_512_45,T_2048_185,T_1024_95,T_2048_195,T_256_25,T_2048_205,T_1024_105,T_2048_215,T_512_55,T_2048_225,T_1024_115,T_2048_235,T_128_15,T_2048_245,T_1024_125,T_2048_255,T_512_65,T_2048_265,T_1024_135,T_2048_275,T_256_35,T_2048_285,T_1024_145,T_2048_295,T_512_75,T_2048_305,T_1024_155,T_2048_315,T_32_5,T_2048_325,T_1024_165,T_2048_335,T_512_85,T_2048_3
45,T_1024_175,T_2048_355,T_256_45,T_2048_365,T_1024_185,T_2048_375,T_512_95,T_2048_385,T_1024_195,T_2048_395,T_128_25,T_2048_405,T_1024_205,T_2048_415,T_512_105,T_2048_425,T_1024_215,T_2048_435,T_256_55,T_2048_445,T_1024_225,T_2048_455,T_512_115,T_2048_465,T_1024_235,T_2048_475,T_64_15,T_2048_485,T_1024_245,T_2048_495,T_512_125,T_2048_505,T_1024_255,T_2048_515,T_256_65,T_2048_525,T_1024_265,T_2048_535,T_512_135,T_2048_545,T_1024_275,T_2048_555,T_128_35,T_2048_565,T_1024_285,T_2048_575,T_512_145,T_2048_585,T_1024_295,T_2048_595,T_256_75,T_2048_605,T_1024_305,T_2048_615,T_512_155,T_2048_625,T_1024_315,T_2048_635,T_16_5,T_2048_645,T_1024_325,T_2048_655,T_512_165,T_2048_665,T_1024_335,T_2048_675,T_256_85,T_2048_685,T_1024_345,T_2048_695,T_512_175,T_2048_705,T_1024_355,T_2048_715,T_128_45,T_2048_725,T_1024_365,T_2048_735,T_512_185,T_2048_745,T_1024_375,T_2048_755,T_256_95,T_2048_765,T_1024_385,T_2048_775,T_512_195,T_2048_785,T_1024_395,T_2048_795,T_64_25,T_2048_805,T_1024_405,T_2048_815,T_512_205,T_2048_825,T_1024_415,T_2048_835,T_256_105,T_2048_845,T_1024_425,T_2048_855,T_512_215,T_2048_865,T_1024_435,T_2048_875,T_128_55,T_2048_885,T_1024_445,T_2048_895,T_512_225,T_2048_905,T_1024_455,T_2048_915,T_256_115,T_2048_925,T_1024_465,T_2048_935,T_512_235,T_2048_945,T_1024_475,T_2048_955,T_32_15,T_2048_965,T_1024_485,T_2048_975,T_512_245,T_2048_985,T_1024_495,T_2048_995,T_256_125,T_2048_1005,T_1024_505,T_2048_1015,T_512_255,T_2048_1025,T_1024_515,T_2048_1035,T_128_65,T_2048_1045,T_1024_525,T_2048_1055,T_512_265,T_2048_1065,T_1024_535,T_2048_1075,T_256_135,T_2048_1085,T_1024_545,T_2048_1095,T_512_275,T_2048_1105,T_1024_555,T_2048_1115,T_64_35,T_2048_1125,T_1024_565,T_2048_1135,T_512_285,T_2048_1145,T_1024_575,T_2048_1155,T_256_145,T_2048_1165,T_1024_585,T_2048_1175,T_512_295,T_2048_1185,T_1024_595,T_2048_1195,T_128_75,T_2048_1205,T_1024_605,T_2048_1215,T_512_305,T_2048_1225,T_1024_615,T_2048_1235,T_256_155,T_2048_1245,T_1024_625,T_2048_1255,T_512_315,T_2048_1265,T_1024_635,T_204
8_1275 +}; +static const __device__ double2 lut_dp_16_2048[128*2] = { + T_2_0,T_2048_1,T_1024_1,T_2048_3,T_512_1,T_2048_5,T_1024_3,T_2048_7,T_256_1,T_2048_9,T_1024_5,T_2048_11,T_512_3,T_2048_13,T_1024_7,T_2048_15,T_128_1,T_2048_17,T_1024_9,T_2048_19,T_512_5,T_2048_21,T_1024_11,T_2048_23,T_256_3,T_2048_25,T_1024_13,T_2048_27,T_512_7,T_2048_29,T_1024_15,T_2048_31,T_64_1,T_2048_33,T_1024_17,T_2048_35,T_512_9,T_2048_37,T_1024_19,T_2048_39,T_256_5,T_2048_41,T_1024_21,T_2048_43,T_512_11,T_2048_45,T_1024_23,T_2048_47,T_128_3,T_2048_49,T_1024_25,T_2048_51,T_512_13,T_2048_53,T_1024_27,T_2048_55,T_256_7,T_2048_57,T_1024_29,T_2048_59,T_512_15,T_2048_61,T_1024_31,T_2048_63,T_32_1,T_2048_65,T_1024_33,T_2048_67,T_512_17,T_2048_69,T_1024_35,T_2048_71,T_256_9,T_2048_73,T_1024_37,T_2048_75,T_512_19,T_2048_77,T_1024_39,T_2048_79,T_128_5,T_2048_81,T_1024_41,T_2048_83,T_512_21,T_2048_85,T_1024_43,T_2048_87,T_256_11,T_2048_89,T_1024_45,T_2048_91,T_512_23,T_2048_93,T_1024_47,T_2048_95,T_64_3,T_2048_97,T_1024_49,T_2048_99,T_512_25,T_2048_101,T_1024_51,T_2048_103,T_256_13,T_2048_105,T_1024_53,T_2048_107,T_512_27,T_2048_109,T_1024_55,T_2048_111,T_128_7,T_2048_113,T_1024_57,T_2048_115,T_512_29,T_2048_117,T_1024_59,T_2048_119,T_256_15,T_2048_121,T_1024_61,T_2048_123,T_512_31,T_2048_125,T_1024_63,T_2048_127,T_2_0,T_2048_9,T_1024_9,T_2048_27,T_512_9,T_2048_45,T_1024_27,T_2048_63,T_256_9,T_2048_81,T_1024_45,T_2048_99,T_512_27,T_2048_117,T_1024_63,T_2048_135,T_128_9,T_2048_153,T_1024_81,T_2048_171,T_512_45,T_2048_189,T_1024_99,T_2048_207,T_256_27,T_2048_225,T_1024_117,T_2048_243,T_512_63,T_2048_261,T_1024_135,T_2048_279,T_64_9,T_2048_297,T_1024_153,T_2048_315,T_512_81,T_2048_333,T_1024_171,T_2048_351,T_256_45,T_2048_369,T_1024_189,T_2048_387,T_512_99,T_2048_405,T_1024_207,T_2048_423,T_128_27,T_2048_441,T_1024_225,T_2048_459,T_512_117,T_2048_477,T_1024_243,T_2048_495,T_256_63,T_2048_513,T_1024_261,T_2048_531,T_512_135,T_2048_549,T_1024_279,T_2048_567,T_32_9,T_2048_585,T_1024_297,T_2048_603,T_512_1
53,T_2048_621,T_1024_315,T_2048_639,T_256_81,T_2048_657,T_1024_333,T_2048_675,T_512_171,T_2048_693,T_1024_351,T_2048_711,T_128_45,T_2048_729,T_1024_369,T_2048_747,T_512_189,T_2048_765,T_1024_387,T_2048_783,T_256_99,T_2048_801,T_1024_405,T_2048_819,T_512_207,T_2048_837,T_1024_423,T_2048_855,T_64_27,T_2048_873,T_1024_441,T_2048_891,T_512_225,T_2048_909,T_1024_459,T_2048_927,T_256_117,T_2048_945,T_1024_477,T_2048_963,T_512_243,T_2048_981,T_1024_495,T_2048_999,T_128_63,T_2048_1017,T_1024_513,T_2048_1035,T_512_261,T_2048_1053,T_1024_531,T_2048_1071,T_256_135,T_2048_1089,T_1024_549,T_2048_1107,T_512_279,T_2048_1125,T_1024_567,T_2048_1143 +}; +static const __device__ double2 lut_dp_3_2187[729*2] = { + T_2_0,T_2187_1,T_2187_2,T_729_1,T_2187_4,T_2187_5,T_729_2,T_2187_7,T_2187_8,T_243_1,T_2187_10,T_2187_11,T_729_4,T_2187_13,T_2187_14,T_729_5,T_2187_16,T_2187_17,T_243_2,T_2187_19,T_2187_20,T_729_7,T_2187_22,T_2187_23,T_729_8,T_2187_25,T_2187_26,T_81_1,T_2187_28,T_2187_29,T_729_10,T_2187_31,T_2187_32,T_729_11,T_2187_34,T_2187_35,T_243_4,T_2187_37,T_2187_38,T_729_13,T_2187_40,T_2187_41,T_729_14,T_2187_43,T_2187_44,T_243_5,T_2187_46,T_2187_47,T_729_16,T_2187_49,T_2187_50,T_729_17,T_2187_52,T_2187_53,T_81_2,T_2187_55,T_2187_56,T_729_19,T_2187_58,T_2187_59,T_729_20,T_2187_61,T_2187_62,T_243_7,T_2187_64,T_2187_65,T_729_22,T_2187_67,T_2187_68,T_729_23,T_2187_70,T_2187_71,T_243_8,T_2187_73,T_2187_74,T_729_25,T_2187_76,T_2187_77,T_729_26,T_2187_79,T_2187_80,T_27_1,T_2187_82,T_2187_83,T_729_28,T_2187_85,T_2187_86,T_729_29,T_2187_88,T_2187_89,T_243_10,T_2187_91,T_2187_92,T_729_31,T_2187_94,T_2187_95,T_729_32,T_2187_97,T_2187_98,T_243_11,T_2187_100,T_2187_101,T_729_34,T_2187_103,T_2187_104,T_729_35,T_2187_106,T_2187_107,T_81_4,T_2187_109,T_2187_110,T_729_37,T_2187_112,T_2187_113,T_729_38,T_2187_115,T_2187_116,T_243_13,T_2187_118,T_2187_119,T_729_40,T_2187_121,T_2187_122,T_729_41,T_2187_124,T_2187_125,T_243_14,T_2187_127,T_2187_128,T_729_43,T_2187_130,T_2187_131,T_729_44,T_2187_133,T_2187_
134,T_81_5,T_2187_136,T_2187_137,T_729_46,T_2187_139,T_2187_140,T_729_47,T_2187_142,T_2187_143,T_243_16,T_2187_145,T_2187_146,T_729_49,T_2187_148,T_2187_149,T_729_50,T_2187_151,T_2187_152,T_243_17,T_2187_154,T_2187_155,T_729_52,T_2187_157,T_2187_158,T_729_53,T_2187_160,T_2187_161,T_27_2,T_2187_163,T_2187_164,T_729_55,T_2187_166,T_2187_167,T_729_56,T_2187_169,T_2187_170,T_243_19,T_2187_172,T_2187_173,T_729_58,T_2187_175,T_2187_176,T_729_59,T_2187_178,T_2187_179,T_243_20,T_2187_181,T_2187_182,T_729_61,T_2187_184,T_2187_185,T_729_62,T_2187_187,T_2187_188,T_81_7,T_2187_190,T_2187_191,T_729_64,T_2187_193,T_2187_194,T_729_65,T_2187_196,T_2187_197,T_243_22,T_2187_199,T_2187_200,T_729_67,T_2187_202,T_2187_203,T_729_68,T_2187_205,T_2187_206,T_243_23,T_2187_208,T_2187_209,T_729_70,T_2187_211,T_2187_212,T_729_71,T_2187_214,T_2187_215,T_81_8,T_2187_217,T_2187_218,T_729_73,T_2187_220,T_2187_221,T_729_74,T_2187_223,T_2187_224,T_243_25,T_2187_226,T_2187_227,T_729_76,T_2187_229,T_2187_230,T_729_77,T_2187_232,T_2187_233,T_243_26,T_2187_235,T_2187_236,T_729_79,T_2187_238,T_2187_239,T_729_80,T_2187_241,T_2187_242,T_9_1,T_2187_244,T_2187_245,T_729_82,T_2187_247,T_2187_248,T_729_83,T_2187_250,T_2187_251,T_243_28,T_2187_253,T_2187_254,T_729_85,T_2187_256,T_2187_257,T_729_86,T_2187_259,T_2187_260,T_243_29,T_2187_262,T_2187_263,T_729_88,T_2187_265,T_2187_266,T_729_89,T_2187_268,T_2187_269,T_81_10,T_2187_271,T_2187_272,T_729_91,T_2187_274,T_2187_275,T_729_92,T_2187_277,T_2187_278,T_243_31,T_2187_280,T_2187_281,T_729_94,T_2187_283,T_2187_284,T_729_95,T_2187_286,T_2187_287,T_243_32,T_2187_289,T_2187_290,T_729_97,T_2187_292,T_2187_293,T_729_98,T_2187_295,T_2187_296,T_81_11,T_2187_298,T_2187_299,T_729_100,T_2187_301,T_2187_302,T_729_101,T_2187_304,T_2187_305,T_243_34,T_2187_307,T_2187_308,T_729_103,T_2187_310,T_2187_311,T_729_104,T_2187_313,T_2187_314,T_243_35,T_2187_316,T_2187_317,T_729_106,T_2187_319,T_2187_320,T_729_107,T_2187_322,T_2187_323,T_27_4,T_2187_325,T_2187_326,T_729_109,T_2187_328,
T_2187_329,T_729_110,T_2187_331,T_2187_332,T_243_37,T_2187_334,T_2187_335,T_729_112,T_2187_337,T_2187_338,T_729_113,T_2187_340,T_2187_341,T_243_38,T_2187_343,T_2187_344,T_729_115,T_2187_346,T_2187_347,T_729_116,T_2187_349,T_2187_350,T_81_13,T_2187_352,T_2187_353,T_729_118,T_2187_355,T_2187_356,T_729_119,T_2187_358,T_2187_359,T_243_40,T_2187_361,T_2187_362,T_729_121,T_2187_364,T_2187_365,T_729_122,T_2187_367,T_2187_368,T_243_41,T_2187_370,T_2187_371,T_729_124,T_2187_373,T_2187_374,T_729_125,T_2187_376,T_2187_377,T_81_14,T_2187_379,T_2187_380,T_729_127,T_2187_382,T_2187_383,T_729_128,T_2187_385,T_2187_386,T_243_43,T_2187_388,T_2187_389,T_729_130,T_2187_391,T_2187_392,T_729_131,T_2187_394,T_2187_395,T_243_44,T_2187_397,T_2187_398,T_729_133,T_2187_400,T_2187_401,T_729_134,T_2187_403,T_2187_404,T_27_5,T_2187_406,T_2187_407,T_729_136,T_2187_409,T_2187_410,T_729_137,T_2187_412,T_2187_413,T_243_46,T_2187_415,T_2187_416,T_729_139,T_2187_418,T_2187_419,T_729_140,T_2187_421,T_2187_422,T_243_47,T_2187_424,T_2187_425,T_729_142,T_2187_427,T_2187_428,T_729_143,T_2187_430,T_2187_431,T_81_16,T_2187_433,T_2187_434,T_729_145,T_2187_436,T_2187_437,T_729_146,T_2187_439,T_2187_440,T_243_49,T_2187_442,T_2187_443,T_729_148,T_2187_445,T_2187_446,T_729_149,T_2187_448,T_2187_449,T_243_50,T_2187_451,T_2187_452,T_729_151,T_2187_454,T_2187_455,T_729_152,T_2187_457,T_2187_458,T_81_17,T_2187_460,T_2187_461,T_729_154,T_2187_463,T_2187_464,T_729_155,T_2187_466,T_2187_467,T_243_52,T_2187_469,T_2187_470,T_729_157,T_2187_472,T_2187_473,T_729_158,T_2187_475,T_2187_476,T_243_53,T_2187_478,T_2187_479,T_729_160,T_2187_481,T_2187_482,T_729_161,T_2187_484,T_2187_485,T_9_2,T_2187_487,T_2187_488,T_729_163,T_2187_490,T_2187_491,T_729_164,T_2187_493,T_2187_494,T_243_55,T_2187_496,T_2187_497,T_729_166,T_2187_499,T_2187_500,T_729_167,T_2187_502,T_2187_503,T_243_56,T_2187_505,T_2187_506,T_729_169,T_2187_508,T_2187_509,T_729_170,T_2187_511,T_2187_512,T_81_19,T_2187_514,T_2187_515,T_729_172,T_2187_517,T_2187_518,T_72
9_173,T_2187_520,T_2187_521,T_243_58,T_2187_523,T_2187_524,T_729_175,T_2187_526,T_2187_527,T_729_176,T_2187_529,T_2187_530,T_243_59,T_2187_532,T_2187_533,T_729_178,T_2187_535,T_2187_536,T_729_179,T_2187_538,T_2187_539,T_81_20,T_2187_541,T_2187_542,T_729_181,T_2187_544,T_2187_545,T_729_182,T_2187_547,T_2187_548,T_243_61,T_2187_550,T_2187_551,T_729_184,T_2187_553,T_2187_554,T_729_185,T_2187_556,T_2187_557,T_243_62,T_2187_559,T_2187_560,T_729_187,T_2187_562,T_2187_563,T_729_188,T_2187_565,T_2187_566,T_27_7,T_2187_568,T_2187_569,T_729_190,T_2187_571,T_2187_572,T_729_191,T_2187_574,T_2187_575,T_243_64,T_2187_577,T_2187_578,T_729_193,T_2187_580,T_2187_581,T_729_194,T_2187_583,T_2187_584,T_243_65,T_2187_586,T_2187_587,T_729_196,T_2187_589,T_2187_590,T_729_197,T_2187_592,T_2187_593,T_81_22,T_2187_595,T_2187_596,T_729_199,T_2187_598,T_2187_599,T_729_200,T_2187_601,T_2187_602,T_243_67,T_2187_604,T_2187_605,T_729_202,T_2187_607,T_2187_608,T_729_203,T_2187_610,T_2187_611,T_243_68,T_2187_613,T_2187_614,T_729_205,T_2187_616,T_2187_617,T_729_206,T_2187_619,T_2187_620,T_81_23,T_2187_622,T_2187_623,T_729_208,T_2187_625,T_2187_626,T_729_209,T_2187_628,T_2187_629,T_243_70,T_2187_631,T_2187_632,T_729_211,T_2187_634,T_2187_635,T_729_212,T_2187_637,T_2187_638,T_243_71,T_2187_640,T_2187_641,T_729_214,T_2187_643,T_2187_644,T_729_215,T_2187_646,T_2187_647,T_27_8,T_2187_649,T_2187_650,T_729_217,T_2187_652,T_2187_653,T_729_218,T_2187_655,T_2187_656,T_243_73,T_2187_658,T_2187_659,T_729_220,T_2187_661,T_2187_662,T_729_221,T_2187_664,T_2187_665,T_243_74,T_2187_667,T_2187_668,T_729_223,T_2187_670,T_2187_671,T_729_224,T_2187_673,T_2187_674,T_81_25,T_2187_676,T_2187_677,T_729_226,T_2187_679,T_2187_680,T_729_227,T_2187_682,T_2187_683,T_243_76,T_2187_685,T_2187_686,T_729_229,T_2187_688,T_2187_689,T_729_230,T_2187_691,T_2187_692,T_243_77,T_2187_694,T_2187_695,T_729_232,T_2187_697,T_2187_698,T_729_233,T_2187_700,T_2187_701,T_81_26,T_2187_703,T_2187_704,T_729_235,T_2187_706,T_2187_707,T_729_236,T_2187_7
09,T_2187_710,T_243_79,T_2187_712,T_2187_713,T_729_238,T_2187_715,T_2187_716,T_729_239,T_2187_718,T_2187_719,T_243_80,T_2187_721,T_2187_722,T_729_241,T_2187_724,T_2187_725,T_729_242,T_2187_727,T_2187_728,T_2_0,T_2187_2,T_2187_4,T_729_2,T_2187_8,T_2187_10,T_729_4,T_2187_14,T_2187_16,T_243_2,T_2187_20,T_2187_22,T_729_8,T_2187_26,T_2187_28,T_729_10,T_2187_32,T_2187_34,T_243_4,T_2187_38,T_2187_40,T_729_14,T_2187_44,T_2187_46,T_729_16,T_2187_50,T_2187_52,T_81_2,T_2187_56,T_2187_58,T_729_20,T_2187_62,T_2187_64,T_729_22,T_2187_68,T_2187_70,T_243_8,T_2187_74,T_2187_76,T_729_26,T_2187_80,T_2187_82,T_729_28,T_2187_86,T_2187_88,T_243_10,T_2187_92,T_2187_94,T_729_32,T_2187_98,T_2187_100,T_729_34,T_2187_104,T_2187_106,T_81_4,T_2187_110,T_2187_112,T_729_38,T_2187_116,T_2187_118,T_729_40,T_2187_122,T_2187_124,T_243_14,T_2187_128,T_2187_130,T_729_44,T_2187_134,T_2187_136,T_729_46,T_2187_140,T_2187_142,T_243_16,T_2187_146,T_2187_148,T_729_50,T_2187_152,T_2187_154,T_729_52,T_2187_158,T_2187_160,T_27_2,T_2187_164,T_2187_166,T_729_56,T_2187_170,T_2187_172,T_729_58,T_2187_176,T_2187_178,T_243_20,T_2187_182,T_2187_184,T_729_62,T_2187_188,T_2187_190,T_729_64,T_2187_194,T_2187_196,T_243_22,T_2187_200,T_2187_202,T_729_68,T_2187_206,T_2187_208,T_729_70,T_2187_212,T_2187_214,T_81_8,T_2187_218,T_2187_220,T_729_74,T_2187_224,T_2187_226,T_729_76,T_2187_230,T_2187_232,T_243_26,T_2187_236,T_2187_238,T_729_80,T_2187_242,T_2187_244,T_729_82,T_2187_248,T_2187_250,T_243_28,T_2187_254,T_2187_256,T_729_86,T_2187_260,T_2187_262,T_729_88,T_2187_266,T_2187_268,T_81_10,T_2187_272,T_2187_274,T_729_92,T_2187_278,T_2187_280,T_729_94,T_2187_284,T_2187_286,T_243_32,T_2187_290,T_2187_292,T_729_98,T_2187_296,T_2187_298,T_729_100,T_2187_302,T_2187_304,T_243_34,T_2187_308,T_2187_310,T_729_104,T_2187_314,T_2187_316,T_729_106,T_2187_320,T_2187_322,T_27_4,T_2187_326,T_2187_328,T_729_110,T_2187_332,T_2187_334,T_729_112,T_2187_338,T_2187_340,T_243_38,T_2187_344,T_2187_346,T_729_116,T_2187_350,T_2187_352,T_729_118,T_2187_
356,T_2187_358,T_243_40,T_2187_362,T_2187_364,T_729_122,T_2187_368,T_2187_370,T_729_124,T_2187_374,T_2187_376,T_81_14,T_2187_380,T_2187_382,T_729_128,T_2187_386,T_2187_388,T_729_130,T_2187_392,T_2187_394,T_243_44,T_2187_398,T_2187_400,T_729_134,T_2187_404,T_2187_406,T_729_136,T_2187_410,T_2187_412,T_243_46,T_2187_416,T_2187_418,T_729_140,T_2187_422,T_2187_424,T_729_142,T_2187_428,T_2187_430,T_81_16,T_2187_434,T_2187_436,T_729_146,T_2187_440,T_2187_442,T_729_148,T_2187_446,T_2187_448,T_243_50,T_2187_452,T_2187_454,T_729_152,T_2187_458,T_2187_460,T_729_154,T_2187_464,T_2187_466,T_243_52,T_2187_470,T_2187_472,T_729_158,T_2187_476,T_2187_478,T_729_160,T_2187_482,T_2187_484,T_9_2,T_2187_488,T_2187_490,T_729_164,T_2187_494,T_2187_496,T_729_166,T_2187_500,T_2187_502,T_243_56,T_2187_506,T_2187_508,T_729_170,T_2187_512,T_2187_514,T_729_172,T_2187_518,T_2187_520,T_243_58,T_2187_524,T_2187_526,T_729_176,T_2187_530,T_2187_532,T_729_178,T_2187_536,T_2187_538,T_81_20,T_2187_542,T_2187_544,T_729_182,T_2187_548,T_2187_550,T_729_184,T_2187_554,T_2187_556,T_243_62,T_2187_560,T_2187_562,T_729_188,T_2187_566,T_2187_568,T_729_190,T_2187_572,T_2187_574,T_243_64,T_2187_578,T_2187_580,T_729_194,T_2187_584,T_2187_586,T_729_196,T_2187_590,T_2187_592,T_81_22,T_2187_596,T_2187_598,T_729_200,T_2187_602,T_2187_604,T_729_202,T_2187_608,T_2187_610,T_243_68,T_2187_614,T_2187_616,T_729_206,T_2187_620,T_2187_622,T_729_208,T_2187_626,T_2187_628,T_243_70,T_2187_632,T_2187_634,T_729_212,T_2187_638,T_2187_640,T_729_214,T_2187_644,T_2187_646,T_27_8,T_2187_650,T_2187_652,T_729_218,T_2187_656,T_2187_658,T_729_220,T_2187_662,T_2187_664,T_243_74,T_2187_668,T_2187_670,T_729_224,T_2187_674,T_2187_676,T_729_226,T_2187_680,T_2187_682,T_243_76,T_2187_686,T_2187_688,T_729_230,T_2187_692,T_2187_694,T_729_232,T_2187_698,T_2187_700,T_81_26,T_2187_704,T_2187_706,T_729_236,T_2187_710,T_2187_712,T_729_238,T_2187_716,T_2187_718,T_243_80,T_2187_722,T_2187_724,T_729_242,T_2187_728,T_2187_730,T_729_244,T_2187_734,T_2187_736,
T_243_82,T_2187_740,T_2187_742,T_729_248,T_2187_746,T_2187_748,T_729_250,T_2187_752,T_2187_754,T_81_28,T_2187_758,T_2187_760,T_729_254,T_2187_764,T_2187_766,T_729_256,T_2187_770,T_2187_772,T_243_86,T_2187_776,T_2187_778,T_729_260,T_2187_782,T_2187_784,T_729_262,T_2187_788,T_2187_790,T_243_88,T_2187_794,T_2187_796,T_729_266,T_2187_800,T_2187_802,T_729_268,T_2187_806,T_2187_808,T_27_10,T_2187_812,T_2187_814,T_729_272,T_2187_818,T_2187_820,T_729_274,T_2187_824,T_2187_826,T_243_92,T_2187_830,T_2187_832,T_729_278,T_2187_836,T_2187_838,T_729_280,T_2187_842,T_2187_844,T_243_94,T_2187_848,T_2187_850,T_729_284,T_2187_854,T_2187_856,T_729_286,T_2187_860,T_2187_862,T_81_32,T_2187_866,T_2187_868,T_729_290,T_2187_872,T_2187_874,T_729_292,T_2187_878,T_2187_880,T_243_98,T_2187_884,T_2187_886,T_729_296,T_2187_890,T_2187_892,T_729_298,T_2187_896,T_2187_898,T_243_100,T_2187_902,T_2187_904,T_729_302,T_2187_908,T_2187_910,T_729_304,T_2187_914,T_2187_916,T_81_34,T_2187_920,T_2187_922,T_729_308,T_2187_926,T_2187_928,T_729_310,T_2187_932,T_2187_934,T_243_104,T_2187_938,T_2187_940,T_729_314,T_2187_944,T_2187_946,T_729_316,T_2187_950,T_2187_952,T_243_106,T_2187_956,T_2187_958,T_729_320,T_2187_962,T_2187_964,T_729_322,T_2187_968,T_2187_970,T_9_4,T_2187_974,T_2187_976,T_729_326,T_2187_980,T_2187_982,T_729_328,T_2187_986,T_2187_988,T_243_110,T_2187_992,T_2187_994,T_729_332,T_2187_998,T_2187_1000,T_729_334,T_2187_1004,T_2187_1006,T_243_112,T_2187_1010,T_2187_1012,T_729_338,T_2187_1016,T_2187_1018,T_729_340,T_2187_1022,T_2187_1024,T_81_38,T_2187_1028,T_2187_1030,T_729_344,T_2187_1034,T_2187_1036,T_729_346,T_2187_1040,T_2187_1042,T_243_116,T_2187_1046,T_2187_1048,T_729_350,T_2187_1052,T_2187_1054,T_729_352,T_2187_1058,T_2187_1060,T_243_118,T_2187_1064,T_2187_1066,T_729_356,T_2187_1070,T_2187_1072,T_729_358,T_2187_1076,T_2187_1078,T_81_40,T_2187_1082,T_2187_1084,T_729_362,T_2187_1088,T_2187_1090,T_729_364,T_2187_1094,T_2187_1096,T_243_122,T_2187_1100,T_2187_1102,T_729_368,T_2187_1106,T_2187_1108,T
_729_370,T_2187_1112,T_2187_1114,T_243_124,T_2187_1118,T_2187_1120,T_729_374,T_2187_1124,T_2187_1126,T_729_376,T_2187_1130,T_2187_1132,T_27_14,T_2187_1136,T_2187_1138,T_729_380,T_2187_1142,T_2187_1144,T_729_382,T_2187_1148,T_2187_1150,T_243_128,T_2187_1154,T_2187_1156,T_729_386,T_2187_1160,T_2187_1162,T_729_388,T_2187_1166,T_2187_1168,T_243_130,T_2187_1172,T_2187_1174,T_729_392,T_2187_1178,T_2187_1180,T_729_394,T_2187_1184,T_2187_1186,T_81_44,T_2187_1190,T_2187_1192,T_729_398,T_2187_1196,T_2187_1198,T_729_400,T_2187_1202,T_2187_1204,T_243_134,T_2187_1208,T_2187_1210,T_729_404,T_2187_1214,T_2187_1216,T_729_406,T_2187_1220,T_2187_1222,T_243_136,T_2187_1226,T_2187_1228,T_729_410,T_2187_1232,T_2187_1234,T_729_412,T_2187_1238,T_2187_1240,T_81_46,T_2187_1244,T_2187_1246,T_729_416,T_2187_1250,T_2187_1252,T_729_418,T_2187_1256,T_2187_1258,T_243_140,T_2187_1262,T_2187_1264,T_729_422,T_2187_1268,T_2187_1270,T_729_424,T_2187_1274,T_2187_1276,T_243_142,T_2187_1280,T_2187_1282,T_729_428,T_2187_1286,T_2187_1288,T_729_430,T_2187_1292,T_2187_1294,T_27_16,T_2187_1298,T_2187_1300,T_729_434,T_2187_1304,T_2187_1306,T_729_436,T_2187_1310,T_2187_1312,T_243_146,T_2187_1316,T_2187_1318,T_729_440,T_2187_1322,T_2187_1324,T_729_442,T_2187_1328,T_2187_1330,T_243_148,T_2187_1334,T_2187_1336,T_729_446,T_2187_1340,T_2187_1342,T_729_448,T_2187_1346,T_2187_1348,T_81_50,T_2187_1352,T_2187_1354,T_729_452,T_2187_1358,T_2187_1360,T_729_454,T_2187_1364,T_2187_1366,T_243_152,T_2187_1370,T_2187_1372,T_729_458,T_2187_1376,T_2187_1378,T_729_460,T_2187_1382,T_2187_1384,T_243_154,T_2187_1388,T_2187_1390,T_729_464,T_2187_1394,T_2187_1396,T_729_466,T_2187_1400,T_2187_1402,T_81_52,T_2187_1406,T_2187_1408,T_729_470,T_2187_1412,T_2187_1414,T_729_472,T_2187_1418,T_2187_1420,T_243_158,T_2187_1424,T_2187_1426,T_729_476,T_2187_1430,T_2187_1432,T_729_478,T_2187_1436,T_2187_1438,T_243_160,T_2187_1442,T_2187_1444,T_729_482,T_2187_1448,T_2187_1450,T_729_484,T_2187_1454,T_2187_1456 +}; +static const __device__ double2 
lut_dp_9_2187[243*2] = { + T_2_0,T_2187_1,T_2187_2,T_729_1,T_2187_4,T_2187_5,T_729_2,T_2187_7,T_2187_8,T_243_1,T_2187_10,T_2187_11,T_729_4,T_2187_13,T_2187_14,T_729_5,T_2187_16,T_2187_17,T_243_2,T_2187_19,T_2187_20,T_729_7,T_2187_22,T_2187_23,T_729_8,T_2187_25,T_2187_26,T_81_1,T_2187_28,T_2187_29,T_729_10,T_2187_31,T_2187_32,T_729_11,T_2187_34,T_2187_35,T_243_4,T_2187_37,T_2187_38,T_729_13,T_2187_40,T_2187_41,T_729_14,T_2187_43,T_2187_44,T_243_5,T_2187_46,T_2187_47,T_729_16,T_2187_49,T_2187_50,T_729_17,T_2187_52,T_2187_53,T_81_2,T_2187_55,T_2187_56,T_729_19,T_2187_58,T_2187_59,T_729_20,T_2187_61,T_2187_62,T_243_7,T_2187_64,T_2187_65,T_729_22,T_2187_67,T_2187_68,T_729_23,T_2187_70,T_2187_71,T_243_8,T_2187_73,T_2187_74,T_729_25,T_2187_76,T_2187_77,T_729_26,T_2187_79,T_2187_80,T_27_1,T_2187_82,T_2187_83,T_729_28,T_2187_85,T_2187_86,T_729_29,T_2187_88,T_2187_89,T_243_10,T_2187_91,T_2187_92,T_729_31,T_2187_94,T_2187_95,T_729_32,T_2187_97,T_2187_98,T_243_11,T_2187_100,T_2187_101,T_729_34,T_2187_103,T_2187_104,T_729_35,T_2187_106,T_2187_107,T_81_4,T_2187_109,T_2187_110,T_729_37,T_2187_112,T_2187_113,T_729_38,T_2187_115,T_2187_116,T_243_13,T_2187_118,T_2187_119,T_729_40,T_2187_121,T_2187_122,T_729_41,T_2187_124,T_2187_125,T_243_14,T_2187_127,T_2187_128,T_729_43,T_2187_130,T_2187_131,T_729_44,T_2187_133,T_2187_134,T_81_5,T_2187_136,T_2187_137,T_729_46,T_2187_139,T_2187_140,T_729_47,T_2187_142,T_2187_143,T_243_16,T_2187_145,T_2187_146,T_729_49,T_2187_148,T_2187_149,T_729_50,T_2187_151,T_2187_152,T_243_17,T_2187_154,T_2187_155,T_729_52,T_2187_157,T_2187_158,T_729_53,T_2187_160,T_2187_161,T_27_2,T_2187_163,T_2187_164,T_729_55,T_2187_166,T_2187_167,T_729_56,T_2187_169,T_2187_170,T_243_19,T_2187_172,T_2187_173,T_729_58,T_2187_175,T_2187_176,T_729_59,T_2187_178,T_2187_179,T_243_20,T_2187_181,T_2187_182,T_729_61,T_2187_184,T_2187_185,T_729_62,T_2187_187,T_2187_188,T_81_7,T_2187_190,T_2187_191,T_729_64,T_2187_193,T_2187_194,T_729_65,T_2187_196,T_2187_197,T_243_22,T_2187_199,T_2187_2
00,T_729_67,T_2187_202,T_2187_203,T_729_68,T_2187_205,T_2187_206,T_243_23,T_2187_208,T_2187_209,T_729_70,T_2187_211,T_2187_212,T_729_71,T_2187_214,T_2187_215,T_81_8,T_2187_217,T_2187_218,T_729_73,T_2187_220,T_2187_221,T_729_74,T_2187_223,T_2187_224,T_243_25,T_2187_226,T_2187_227,T_729_76,T_2187_229,T_2187_230,T_729_77,T_2187_232,T_2187_233,T_243_26,T_2187_235,T_2187_236,T_729_79,T_2187_238,T_2187_239,T_729_80,T_2187_241,T_2187_242,T_2_0,T_2187_5,T_2187_10,T_729_5,T_2187_20,T_2187_25,T_729_10,T_2187_35,T_2187_40,T_243_5,T_2187_50,T_2187_55,T_729_20,T_2187_65,T_2187_70,T_729_25,T_2187_80,T_2187_85,T_243_10,T_2187_95,T_2187_100,T_729_35,T_2187_110,T_2187_115,T_729_40,T_2187_125,T_2187_130,T_81_5,T_2187_140,T_2187_145,T_729_50,T_2187_155,T_2187_160,T_729_55,T_2187_170,T_2187_175,T_243_20,T_2187_185,T_2187_190,T_729_65,T_2187_200,T_2187_205,T_729_70,T_2187_215,T_2187_220,T_243_25,T_2187_230,T_2187_235,T_729_80,T_2187_245,T_2187_250,T_729_85,T_2187_260,T_2187_265,T_81_10,T_2187_275,T_2187_280,T_729_95,T_2187_290,T_2187_295,T_729_100,T_2187_305,T_2187_310,T_243_35,T_2187_320,T_2187_325,T_729_110,T_2187_335,T_2187_340,T_729_115,T_2187_350,T_2187_355,T_243_40,T_2187_365,T_2187_370,T_729_125,T_2187_380,T_2187_385,T_729_130,T_2187_395,T_2187_400,T_27_5,T_2187_410,T_2187_415,T_729_140,T_2187_425,T_2187_430,T_729_145,T_2187_440,T_2187_445,T_243_50,T_2187_455,T_2187_460,T_729_155,T_2187_470,T_2187_475,T_729_160,T_2187_485,T_2187_490,T_243_55,T_2187_500,T_2187_505,T_729_170,T_2187_515,T_2187_520,T_729_175,T_2187_530,T_2187_535,T_81_20,T_2187_545,T_2187_550,T_729_185,T_2187_560,T_2187_565,T_729_190,T_2187_575,T_2187_580,T_243_65,T_2187_590,T_2187_595,T_729_200,T_2187_605,T_2187_610,T_729_205,T_2187_620,T_2187_625,T_243_70,T_2187_635,T_2187_640,T_729_215,T_2187_650,T_2187_655,T_729_220,T_2187_665,T_2187_670,T_81_25,T_2187_680,T_2187_685,T_729_230,T_2187_695,T_2187_700,T_729_235,T_2187_710,T_2187_715,T_243_80,T_2187_725,T_2187_730,T_729_245,T_2187_740,T_2187_745,T_729_250,T_2187_755,
T_2187_760,T_243_85,T_2187_770,T_2187_775,T_729_260,T_2187_785,T_2187_790,T_729_265,T_2187_800,T_2187_805,T_27_10,T_2187_815,T_2187_820,T_729_275,T_2187_830,T_2187_835,T_729_280,T_2187_845,T_2187_850,T_243_95,T_2187_860,T_2187_865,T_729_290,T_2187_875,T_2187_880,T_729_295,T_2187_890,T_2187_895,T_243_100,T_2187_905,T_2187_910,T_729_305,T_2187_920,T_2187_925,T_729_310,T_2187_935,T_2187_940,T_81_35,T_2187_950,T_2187_955,T_729_320,T_2187_965,T_2187_970,T_729_325,T_2187_980,T_2187_985,T_243_110,T_2187_995,T_2187_1000,T_729_335,T_2187_1010,T_2187_1015,T_729_340,T_2187_1025,T_2187_1030,T_243_115,T_2187_1040,T_2187_1045,T_729_350,T_2187_1055,T_2187_1060,T_729_355,T_2187_1070,T_2187_1075,T_81_40,T_2187_1085,T_2187_1090,T_729_365,T_2187_1100,T_2187_1105,T_729_370,T_2187_1115,T_2187_1120,T_243_125,T_2187_1130,T_2187_1135,T_729_380,T_2187_1145,T_2187_1150,T_729_385,T_2187_1160,T_2187_1165,T_243_130,T_2187_1175,T_2187_1180,T_729_395,T_2187_1190,T_2187_1195,T_729_400,T_2187_1205,T_2187_1210 +}; +static const __device__ double2 lut_dp_7_2401[343*2] = { + 
T_2_0,T_2401_1,T_2401_2,T_2401_3,T_2401_4,T_2401_5,T_2401_6,T_343_1,T_2401_8,T_2401_9,T_2401_10,T_2401_11,T_2401_12,T_2401_13,T_343_2,T_2401_15,T_2401_16,T_2401_17,T_2401_18,T_2401_19,T_2401_20,T_343_3,T_2401_22,T_2401_23,T_2401_24,T_2401_25,T_2401_26,T_2401_27,T_343_4,T_2401_29,T_2401_30,T_2401_31,T_2401_32,T_2401_33,T_2401_34,T_343_5,T_2401_36,T_2401_37,T_2401_38,T_2401_39,T_2401_40,T_2401_41,T_343_6,T_2401_43,T_2401_44,T_2401_45,T_2401_46,T_2401_47,T_2401_48,T_49_1,T_2401_50,T_2401_51,T_2401_52,T_2401_53,T_2401_54,T_2401_55,T_343_8,T_2401_57,T_2401_58,T_2401_59,T_2401_60,T_2401_61,T_2401_62,T_343_9,T_2401_64,T_2401_65,T_2401_66,T_2401_67,T_2401_68,T_2401_69,T_343_10,T_2401_71,T_2401_72,T_2401_73,T_2401_74,T_2401_75,T_2401_76,T_343_11,T_2401_78,T_2401_79,T_2401_80,T_2401_81,T_2401_82,T_2401_83,T_343_12,T_2401_85,T_2401_86,T_2401_87,T_2401_88,T_2401_89,T_2401_90,T_343_13,T_2401_92,T_2401_93,T_2401_94,T_2401_95,T_2401_96,T_2401_97,T_49_2,T_2401_99,T_2401_100,T_2401_101,T_2401_102,T_2401_103,T_2401_104,T_343_15,T_2401_106,T_2401_107,T_2401_108,T_2401_109,T_2401_110,T_2401_111,T_343_16,T_2401_113,T_2401_114,T_2401_115,T_2401_116,T_2401_117,T_2401_118,T_343_17,T_2401_120,T_2401_121,T_2401_122,T_2401_123,T_2401_124,T_2401_125,T_343_18,T_2401_127,T_2401_128,T_2401_129,T_2401_130,T_2401_131,T_2401_132,T_343_19,T_2401_134,T_2401_135,T_2401_136,T_2401_137,T_2401_138,T_2401_139,T_343_20,T_2401_141,T_2401_142,T_2401_143,T_2401_144,T_2401_145,T_2401_146,T_49_3,T_2401_148,T_2401_149,T_2401_150,T_2401_151,T_2401_152,T_2401_153,T_343_22,T_2401_155,T_2401_156,T_2401_157,T_2401_158,T_2401_159,T_2401_160,T_343_23,T_2401_162,T_2401_163,T_2401_164,T_2401_165,T_2401_166,T_2401_167,T_343_24,T_2401_169,T_2401_170,T_2401_171,T_2401_172,T_2401_173,T_2401_174,T_343_25,T_2401_176,T_2401_177,T_2401_178,T_2401_179,T_2401_180,T_2401_181,T_343_26,T_2401_183,T_2401_184,T_2401_185,T_2401_186,T_2401_187,T_2401_188,T_343_27,T_2401_190,T_2401_191,T_2401_192,T_2401_193,T_2401_194,T_2401_195,T_49_4,T_2
401_197,T_2401_198,T_2401_199,T_2401_200,T_2401_201,T_2401_202,T_343_29,T_2401_204,T_2401_205,T_2401_206,T_2401_207,T_2401_208,T_2401_209,T_343_30,T_2401_211,T_2401_212,T_2401_213,T_2401_214,T_2401_215,T_2401_216,T_343_31,T_2401_218,T_2401_219,T_2401_220,T_2401_221,T_2401_222,T_2401_223,T_343_32,T_2401_225,T_2401_226,T_2401_227,T_2401_228,T_2401_229,T_2401_230,T_343_33,T_2401_232,T_2401_233,T_2401_234,T_2401_235,T_2401_236,T_2401_237,T_343_34,T_2401_239,T_2401_240,T_2401_241,T_2401_242,T_2401_243,T_2401_244,T_49_5,T_2401_246,T_2401_247,T_2401_248,T_2401_249,T_2401_250,T_2401_251,T_343_36,T_2401_253,T_2401_254,T_2401_255,T_2401_256,T_2401_257,T_2401_258,T_343_37,T_2401_260,T_2401_261,T_2401_262,T_2401_263,T_2401_264,T_2401_265,T_343_38,T_2401_267,T_2401_268,T_2401_269,T_2401_270,T_2401_271,T_2401_272,T_343_39,T_2401_274,T_2401_275,T_2401_276,T_2401_277,T_2401_278,T_2401_279,T_343_40,T_2401_281,T_2401_282,T_2401_283,T_2401_284,T_2401_285,T_2401_286,T_343_41,T_2401_288,T_2401_289,T_2401_290,T_2401_291,T_2401_292,T_2401_293,T_49_6,T_2401_295,T_2401_296,T_2401_297,T_2401_298,T_2401_299,T_2401_300,T_343_43,T_2401_302,T_2401_303,T_2401_304,T_2401_305,T_2401_306,T_2401_307,T_343_44,T_2401_309,T_2401_310,T_2401_311,T_2401_312,T_2401_313,T_2401_314,T_343_45,T_2401_316,T_2401_317,T_2401_318,T_2401_319,T_2401_320,T_2401_321,T_343_46,T_2401_323,T_2401_324,T_2401_325,T_2401_326,T_2401_327,T_2401_328,T_343_47,T_2401_330,T_2401_331,T_2401_332,T_2401_333,T_2401_334,T_2401_335,T_343_48,T_2401_337,T_2401_338,T_2401_339,T_2401_340,T_2401_341,T_2401_342,T_2_0,T_2401_4,T_2401_8,T_2401_12,T_2401_16,T_2401_20,T_2401_24,T_343_4,T_2401_32,T_2401_36,T_2401_40,T_2401_44,T_2401_48,T_2401_52,T_343_8,T_2401_60,T_2401_64,T_2401_68,T_2401_72,T_2401_76,T_2401_80,T_343_12,T_2401_88,T_2401_92,T_2401_96,T_2401_100,T_2401_104,T_2401_108,T_343_16,T_2401_116,T_2401_120,T_2401_124,T_2401_128,T_2401_132,T_2401_136,T_343_20,T_2401_144,T_2401_148,T_2401_152,T_2401_156,T_2401_160,T_2401_164,T_343_24,T_2401_172
,T_2401_176,T_2401_180,T_2401_184,T_2401_188,T_2401_192,T_49_4,T_2401_200,T_2401_204,T_2401_208,T_2401_212,T_2401_216,T_2401_220,T_343_32,T_2401_228,T_2401_232,T_2401_236,T_2401_240,T_2401_244,T_2401_248,T_343_36,T_2401_256,T_2401_260,T_2401_264,T_2401_268,T_2401_272,T_2401_276,T_343_40,T_2401_284,T_2401_288,T_2401_292,T_2401_296,T_2401_300,T_2401_304,T_343_44,T_2401_312,T_2401_316,T_2401_320,T_2401_324,T_2401_328,T_2401_332,T_343_48,T_2401_340,T_2401_344,T_2401_348,T_2401_352,T_2401_356,T_2401_360,T_343_52,T_2401_368,T_2401_372,T_2401_376,T_2401_380,T_2401_384,T_2401_388,T_49_8,T_2401_396,T_2401_400,T_2401_404,T_2401_408,T_2401_412,T_2401_416,T_343_60,T_2401_424,T_2401_428,T_2401_432,T_2401_436,T_2401_440,T_2401_444,T_343_64,T_2401_452,T_2401_456,T_2401_460,T_2401_464,T_2401_468,T_2401_472,T_343_68,T_2401_480,T_2401_484,T_2401_488,T_2401_492,T_2401_496,T_2401_500,T_343_72,T_2401_508,T_2401_512,T_2401_516,T_2401_520,T_2401_524,T_2401_528,T_343_76,T_2401_536,T_2401_540,T_2401_544,T_2401_548,T_2401_552,T_2401_556,T_343_80,T_2401_564,T_2401_568,T_2401_572,T_2401_576,T_2401_580,T_2401_584,T_49_12,T_2401_592,T_2401_596,T_2401_600,T_2401_604,T_2401_608,T_2401_612,T_343_88,T_2401_620,T_2401_624,T_2401_628,T_2401_632,T_2401_636,T_2401_640,T_343_92,T_2401_648,T_2401_652,T_2401_656,T_2401_660,T_2401_664,T_2401_668,T_343_96,T_2401_676,T_2401_680,T_2401_684,T_2401_688,T_2401_692,T_2401_696,T_343_100,T_2401_704,T_2401_708,T_2401_712,T_2401_716,T_2401_720,T_2401_724,T_343_104,T_2401_732,T_2401_736,T_2401_740,T_2401_744,T_2401_748,T_2401_752,T_343_108,T_2401_760,T_2401_764,T_2401_768,T_2401_772,T_2401_776,T_2401_780,T_49_16,T_2401_788,T_2401_792,T_2401_796,T_2401_800,T_2401_804,T_2401_808,T_343_116,T_2401_816,T_2401_820,T_2401_824,T_2401_828,T_2401_832,T_2401_836,T_343_120,T_2401_844,T_2401_848,T_2401_852,T_2401_856,T_2401_860,T_2401_864,T_343_124,T_2401_872,T_2401_876,T_2401_880,T_2401_884,T_2401_888,T_2401_892,T_343_128,T_2401_900,T_2401_904,T_2401_908,T_2401_912,T_2401_916,T_24
01_920,T_343_132,T_2401_928,T_2401_932,T_2401_936,T_2401_940,T_2401_944,T_2401_948,T_343_136,T_2401_956,T_2401_960,T_2401_964,T_2401_968,T_2401_972,T_2401_976,T_49_20,T_2401_984,T_2401_988,T_2401_992,T_2401_996,T_2401_1000,T_2401_1004,T_343_144,T_2401_1012,T_2401_1016,T_2401_1020,T_2401_1024,T_2401_1028,T_2401_1032,T_343_148,T_2401_1040,T_2401_1044,T_2401_1048,T_2401_1052,T_2401_1056,T_2401_1060,T_343_152,T_2401_1068,T_2401_1072,T_2401_1076,T_2401_1080,T_2401_1084,T_2401_1088,T_343_156,T_2401_1096,T_2401_1100,T_2401_1104,T_2401_1108,T_2401_1112,T_2401_1116,T_343_160,T_2401_1124,T_2401_1128,T_2401_1132,T_2401_1136,T_2401_1140,T_2401_1144,T_343_164,T_2401_1152,T_2401_1156,T_2401_1160,T_2401_1164,T_2401_1168,T_2401_1172,T_49_24,T_2401_1180,T_2401_1184,T_2401_1188,T_2401_1192,T_2401_1196,T_2401_1200,T_343_172,T_2401_1208,T_2401_1212,T_2401_1216,T_2401_1220,T_2401_1224,T_2401_1228,T_343_176,T_2401_1236,T_2401_1240,T_2401_1244,T_2401_1248,T_2401_1252,T_2401_1256,T_343_180,T_2401_1264,T_2401_1268,T_2401_1272,T_2401_1276,T_2401_1280,T_2401_1284,T_343_184,T_2401_1292,T_2401_1296,T_2401_1300,T_2401_1304,T_2401_1308,T_2401_1312,T_343_188,T_2401_1320,T_2401_1324,T_2401_1328,T_2401_1332,T_2401_1336,T_2401_1340,T_343_192,T_2401_1348,T_2401_1352,T_2401_1356,T_2401_1360,T_2401_1364,T_2401_1368 +}; +static const __device__ double2 lut_dp_5_3125[625*2] = { + 
T_2_0,T_3125_1,T_3125_2,T_3125_3,T_3125_4,T_625_1,T_3125_6,T_3125_7,T_3125_8,T_3125_9,T_625_2,T_3125_11,T_3125_12,T_3125_13,T_3125_14,T_625_3,T_3125_16,T_3125_17,T_3125_18,T_3125_19,T_625_4,T_3125_21,T_3125_22,T_3125_23,T_3125_24,T_125_1,T_3125_26,T_3125_27,T_3125_28,T_3125_29,T_625_6,T_3125_31,T_3125_32,T_3125_33,T_3125_34,T_625_7,T_3125_36,T_3125_37,T_3125_38,T_3125_39,T_625_8,T_3125_41,T_3125_42,T_3125_43,T_3125_44,T_625_9,T_3125_46,T_3125_47,T_3125_48,T_3125_49,T_125_2,T_3125_51,T_3125_52,T_3125_53,T_3125_54,T_625_11,T_3125_56,T_3125_57,T_3125_58,T_3125_59,T_625_12,T_3125_61,T_3125_62,T_3125_63,T_3125_64,T_625_13,T_3125_66,T_3125_67,T_3125_68,T_3125_69,T_625_14,T_3125_71,T_3125_72,T_3125_73,T_3125_74,T_125_3,T_3125_76,T_3125_77,T_3125_78,T_3125_79,T_625_16,T_3125_81,T_3125_82,T_3125_83,T_3125_84,T_625_17,T_3125_86,T_3125_87,T_3125_88,T_3125_89,T_625_18,T_3125_91,T_3125_92,T_3125_93,T_3125_94,T_625_19,T_3125_96,T_3125_97,T_3125_98,T_3125_99,T_125_4,T_3125_101,T_3125_102,T_3125_103,T_3125_104,T_625_21,T_3125_106,T_3125_107,T_3125_108,T_3125_109,T_625_22,T_3125_111,T_3125_112,T_3125_113,T_3125_114,T_625_23,T_3125_116,T_3125_117,T_3125_118,T_3125_119,T_625_24,T_3125_121,T_3125_122,T_3125_123,T_3125_124,T_25_1,T_3125_126,T_3125_127,T_3125_128,T_3125_129,T_625_26,T_3125_131,T_3125_132,T_3125_133,T_3125_134,T_625_27,T_3125_136,T_3125_137,T_3125_138,T_3125_139,T_625_28,T_3125_141,T_3125_142,T_3125_143,T_3125_144,T_625_29,T_3125_146,T_3125_147,T_3125_148,T_3125_149,T_125_6,T_3125_151,T_3125_152,T_3125_153,T_3125_154,T_625_31,T_3125_156,T_3125_157,T_3125_158,T_3125_159,T_625_32,T_3125_161,T_3125_162,T_3125_163,T_3125_164,T_625_33,T_3125_166,T_3125_167,T_3125_168,T_3125_169,T_625_34,T_3125_171,T_3125_172,T_3125_173,T_3125_174,T_125_7,T_3125_176,T_3125_177,T_3125_178,T_3125_179,T_625_36,T_3125_181,T_3125_182,T_3125_183,T_3125_184,T_625_37,T_3125_186,T_3125_187,T_3125_188,T_3125_189,T_625_38,T_3125_191,T_3125_192,T_3125_193,T_3125_194,T_625_39,T_3125_196,T_3125_197,T_3125_19
8,T_3125_199,T_125_8,T_3125_201,T_3125_202,T_3125_203,T_3125_204,T_625_41,T_3125_206,T_3125_207,T_3125_208,T_3125_209,T_625_42,T_3125_211,T_3125_212,T_3125_213,T_3125_214,T_625_43,T_3125_216,T_3125_217,T_3125_218,T_3125_219,T_625_44,T_3125_221,T_3125_222,T_3125_223,T_3125_224,T_125_9,T_3125_226,T_3125_227,T_3125_228,T_3125_229,T_625_46,T_3125_231,T_3125_232,T_3125_233,T_3125_234,T_625_47,T_3125_236,T_3125_237,T_3125_238,T_3125_239,T_625_48,T_3125_241,T_3125_242,T_3125_243,T_3125_244,T_625_49,T_3125_246,T_3125_247,T_3125_248,T_3125_249,T_25_2,T_3125_251,T_3125_252,T_3125_253,T_3125_254,T_625_51,T_3125_256,T_3125_257,T_3125_258,T_3125_259,T_625_52,T_3125_261,T_3125_262,T_3125_263,T_3125_264,T_625_53,T_3125_266,T_3125_267,T_3125_268,T_3125_269,T_625_54,T_3125_271,T_3125_272,T_3125_273,T_3125_274,T_125_11,T_3125_276,T_3125_277,T_3125_278,T_3125_279,T_625_56,T_3125_281,T_3125_282,T_3125_283,T_3125_284,T_625_57,T_3125_286,T_3125_287,T_3125_288,T_3125_289,T_625_58,T_3125_291,T_3125_292,T_3125_293,T_3125_294,T_625_59,T_3125_296,T_3125_297,T_3125_298,T_3125_299,T_125_12,T_3125_301,T_3125_302,T_3125_303,T_3125_304,T_625_61,T_3125_306,T_3125_307,T_3125_308,T_3125_309,T_625_62,T_3125_311,T_3125_312,T_3125_313,T_3125_314,T_625_63,T_3125_316,T_3125_317,T_3125_318,T_3125_319,T_625_64,T_3125_321,T_3125_322,T_3125_323,T_3125_324,T_125_13,T_3125_326,T_3125_327,T_3125_328,T_3125_329,T_625_66,T_3125_331,T_3125_332,T_3125_333,T_3125_334,T_625_67,T_3125_336,T_3125_337,T_3125_338,T_3125_339,T_625_68,T_3125_341,T_3125_342,T_3125_343,T_3125_344,T_625_69,T_3125_346,T_3125_347,T_3125_348,T_3125_349,T_125_14,T_3125_351,T_3125_352,T_3125_353,T_3125_354,T_625_71,T_3125_356,T_3125_357,T_3125_358,T_3125_359,T_625_72,T_3125_361,T_3125_362,T_3125_363,T_3125_364,T_625_73,T_3125_366,T_3125_367,T_3125_368,T_3125_369,T_625_74,T_3125_371,T_3125_372,T_3125_373,T_3125_374,T_25_3,T_3125_376,T_3125_377,T_3125_378,T_3125_379,T_625_76,T_3125_381,T_3125_382,T_3125_383,T_3125_384,T_625_77,T_3125_386,T_3125_387,T
_3125_388,T_3125_389,T_625_78,T_3125_391,T_3125_392,T_3125_393,T_3125_394,T_625_79,T_3125_396,T_3125_397,T_3125_398,T_3125_399,T_125_16,T_3125_401,T_3125_402,T_3125_403,T_3125_404,T_625_81,T_3125_406,T_3125_407,T_3125_408,T_3125_409,T_625_82,T_3125_411,T_3125_412,T_3125_413,T_3125_414,T_625_83,T_3125_416,T_3125_417,T_3125_418,T_3125_419,T_625_84,T_3125_421,T_3125_422,T_3125_423,T_3125_424,T_125_17,T_3125_426,T_3125_427,T_3125_428,T_3125_429,T_625_86,T_3125_431,T_3125_432,T_3125_433,T_3125_434,T_625_87,T_3125_436,T_3125_437,T_3125_438,T_3125_439,T_625_88,T_3125_441,T_3125_442,T_3125_443,T_3125_444,T_625_89,T_3125_446,T_3125_447,T_3125_448,T_3125_449,T_125_18,T_3125_451,T_3125_452,T_3125_453,T_3125_454,T_625_91,T_3125_456,T_3125_457,T_3125_458,T_3125_459,T_625_92,T_3125_461,T_3125_462,T_3125_463,T_3125_464,T_625_93,T_3125_466,T_3125_467,T_3125_468,T_3125_469,T_625_94,T_3125_471,T_3125_472,T_3125_473,T_3125_474,T_125_19,T_3125_476,T_3125_477,T_3125_478,T_3125_479,T_625_96,T_3125_481,T_3125_482,T_3125_483,T_3125_484,T_625_97,T_3125_486,T_3125_487,T_3125_488,T_3125_489,T_625_98,T_3125_491,T_3125_492,T_3125_493,T_3125_494,T_625_99,T_3125_496,T_3125_497,T_3125_498,T_3125_499,T_25_4,T_3125_501,T_3125_502,T_3125_503,T_3125_504,T_625_101,T_3125_506,T_3125_507,T_3125_508,T_3125_509,T_625_102,T_3125_511,T_3125_512,T_3125_513,T_3125_514,T_625_103,T_3125_516,T_3125_517,T_3125_518,T_3125_519,T_625_104,T_3125_521,T_3125_522,T_3125_523,T_3125_524,T_125_21,T_3125_526,T_3125_527,T_3125_528,T_3125_529,T_625_106,T_3125_531,T_3125_532,T_3125_533,T_3125_534,T_625_107,T_3125_536,T_3125_537,T_3125_538,T_3125_539,T_625_108,T_3125_541,T_3125_542,T_3125_543,T_3125_544,T_625_109,T_3125_546,T_3125_547,T_3125_548,T_3125_549,T_125_22,T_3125_551,T_3125_552,T_3125_553,T_3125_554,T_625_111,T_3125_556,T_3125_557,T_3125_558,T_3125_559,T_625_112,T_3125_561,T_3125_562,T_3125_563,T_3125_564,T_625_113,T_3125_566,T_3125_567,T_3125_568,T_3125_569,T_625_114,T_3125_571,T_3125_572,T_3125_573,T_3125_574,T_125_23
,T_3125_576,T_3125_577,T_3125_578,T_3125_579,T_625_116,T_3125_581,T_3125_582,T_3125_583,T_3125_584,T_625_117,T_3125_586,T_3125_587,T_3125_588,T_3125_589,T_625_118,T_3125_591,T_3125_592,T_3125_593,T_3125_594,T_625_119,T_3125_596,T_3125_597,T_3125_598,T_3125_599,T_125_24,T_3125_601,T_3125_602,T_3125_603,T_3125_604,T_625_121,T_3125_606,T_3125_607,T_3125_608,T_3125_609,T_625_122,T_3125_611,T_3125_612,T_3125_613,T_3125_614,T_625_123,T_3125_616,T_3125_617,T_3125_618,T_3125_619,T_625_124,T_3125_621,T_3125_622,T_3125_623,T_3125_624,T_2_0,T_3125_3,T_3125_6,T_3125_9,T_3125_12,T_625_3,T_3125_18,T_3125_21,T_3125_24,T_3125_27,T_625_6,T_3125_33,T_3125_36,T_3125_39,T_3125_42,T_625_9,T_3125_48,T_3125_51,T_3125_54,T_3125_57,T_625_12,T_3125_63,T_3125_66,T_3125_69,T_3125_72,T_125_3,T_3125_78,T_3125_81,T_3125_84,T_3125_87,T_625_18,T_3125_93,T_3125_96,T_3125_99,T_3125_102,T_625_21,T_3125_108,T_3125_111,T_3125_114,T_3125_117,T_625_24,T_3125_123,T_3125_126,T_3125_129,T_3125_132,T_625_27,T_3125_138,T_3125_141,T_3125_144,T_3125_147,T_125_6,T_3125_153,T_3125_156,T_3125_159,T_3125_162,T_625_33,T_3125_168,T_3125_171,T_3125_174,T_3125_177,T_625_36,T_3125_183,T_3125_186,T_3125_189,T_3125_192,T_625_39,T_3125_198,T_3125_201,T_3125_204,T_3125_207,T_625_42,T_3125_213,T_3125_216,T_3125_219,T_3125_222,T_125_9,T_3125_228,T_3125_231,T_3125_234,T_3125_237,T_625_48,T_3125_243,T_3125_246,T_3125_249,T_3125_252,T_625_51,T_3125_258,T_3125_261,T_3125_264,T_3125_267,T_625_54,T_3125_273,T_3125_276,T_3125_279,T_3125_282,T_625_57,T_3125_288,T_3125_291,T_3125_294,T_3125_297,T_125_12,T_3125_303,T_3125_306,T_3125_309,T_3125_312,T_625_63,T_3125_318,T_3125_321,T_3125_324,T_3125_327,T_625_66,T_3125_333,T_3125_336,T_3125_339,T_3125_342,T_625_69,T_3125_348,T_3125_351,T_3125_354,T_3125_357,T_625_72,T_3125_363,T_3125_366,T_3125_369,T_3125_372,T_25_3,T_3125_378,T_3125_381,T_3125_384,T_3125_387,T_625_78,T_3125_393,T_3125_396,T_3125_399,T_3125_402,T_625_81,T_3125_408,T_3125_411,T_3125_414,T_3125_417,T_625_84,T_3125_423,T_3125_
426,T_3125_429,T_3125_432,T_625_87,T_3125_438,T_3125_441,T_3125_444,T_3125_447,T_125_18,T_3125_453,T_3125_456,T_3125_459,T_3125_462,T_625_93,T_3125_468,T_3125_471,T_3125_474,T_3125_477,T_625_96,T_3125_483,T_3125_486,T_3125_489,T_3125_492,T_625_99,T_3125_498,T_3125_501,T_3125_504,T_3125_507,T_625_102,T_3125_513,T_3125_516,T_3125_519,T_3125_522,T_125_21,T_3125_528,T_3125_531,T_3125_534,T_3125_537,T_625_108,T_3125_543,T_3125_546,T_3125_549,T_3125_552,T_625_111,T_3125_558,T_3125_561,T_3125_564,T_3125_567,T_625_114,T_3125_573,T_3125_576,T_3125_579,T_3125_582,T_625_117,T_3125_588,T_3125_591,T_3125_594,T_3125_597,T_125_24,T_3125_603,T_3125_606,T_3125_609,T_3125_612,T_625_123,T_3125_618,T_3125_621,T_3125_624,T_3125_627,T_625_126,T_3125_633,T_3125_636,T_3125_639,T_3125_642,T_625_129,T_3125_648,T_3125_651,T_3125_654,T_3125_657,T_625_132,T_3125_663,T_3125_666,T_3125_669,T_3125_672,T_125_27,T_3125_678,T_3125_681,T_3125_684,T_3125_687,T_625_138,T_3125_693,T_3125_696,T_3125_699,T_3125_702,T_625_141,T_3125_708,T_3125_711,T_3125_714,T_3125_717,T_625_144,T_3125_723,T_3125_726,T_3125_729,T_3125_732,T_625_147,T_3125_738,T_3125_741,T_3125_744,T_3125_747,T_25_6,T_3125_753,T_3125_756,T_3125_759,T_3125_762,T_625_153,T_3125_768,T_3125_771,T_3125_774,T_3125_777,T_625_156,T_3125_783,T_3125_786,T_3125_789,T_3125_792,T_625_159,T_3125_798,T_3125_801,T_3125_804,T_3125_807,T_625_162,T_3125_813,T_3125_816,T_3125_819,T_3125_822,T_125_33,T_3125_828,T_3125_831,T_3125_834,T_3125_837,T_625_168,T_3125_843,T_3125_846,T_3125_849,T_3125_852,T_625_171,T_3125_858,T_3125_861,T_3125_864,T_3125_867,T_625_174,T_3125_873,T_3125_876,T_3125_879,T_3125_882,T_625_177,T_3125_888,T_3125_891,T_3125_894,T_3125_897,T_125_36,T_3125_903,T_3125_906,T_3125_909,T_3125_912,T_625_183,T_3125_918,T_3125_921,T_3125_924,T_3125_927,T_625_186,T_3125_933,T_3125_936,T_3125_939,T_3125_942,T_625_189,T_3125_948,T_3125_951,T_3125_954,T_3125_957,T_625_192,T_3125_963,T_3125_966,T_3125_969,T_3125_972,T_125_39,T_3125_978,T_3125_981,T_3125_984,T
_3125_987,T_625_198,T_3125_993,T_3125_996,T_3125_999,T_3125_1002,T_625_201,T_3125_1008,T_3125_1011,T_3125_1014,T_3125_1017,T_625_204,T_3125_1023,T_3125_1026,T_3125_1029,T_3125_1032,T_625_207,T_3125_1038,T_3125_1041,T_3125_1044,T_3125_1047,T_125_42,T_3125_1053,T_3125_1056,T_3125_1059,T_3125_1062,T_625_213,T_3125_1068,T_3125_1071,T_3125_1074,T_3125_1077,T_625_216,T_3125_1083,T_3125_1086,T_3125_1089,T_3125_1092,T_625_219,T_3125_1098,T_3125_1101,T_3125_1104,T_3125_1107,T_625_222,T_3125_1113,T_3125_1116,T_3125_1119,T_3125_1122,T_25_9,T_3125_1128,T_3125_1131,T_3125_1134,T_3125_1137,T_625_228,T_3125_1143,T_3125_1146,T_3125_1149,T_3125_1152,T_625_231,T_3125_1158,T_3125_1161,T_3125_1164,T_3125_1167,T_625_234,T_3125_1173,T_3125_1176,T_3125_1179,T_3125_1182,T_625_237,T_3125_1188,T_3125_1191,T_3125_1194,T_3125_1197,T_125_48,T_3125_1203,T_3125_1206,T_3125_1209,T_3125_1212,T_625_243,T_3125_1218,T_3125_1221,T_3125_1224,T_3125_1227,T_625_246,T_3125_1233,T_3125_1236,T_3125_1239,T_3125_1242,T_625_249,T_3125_1248,T_3125_1251,T_3125_1254,T_3125_1257,T_625_252,T_3125_1263,T_3125_1266,T_3125_1269,T_3125_1272,T_125_51,T_3125_1278,T_3125_1281,T_3125_1284,T_3125_1287,T_625_258,T_3125_1293,T_3125_1296,T_3125_1299,T_3125_1302,T_625_261,T_3125_1308,T_3125_1311,T_3125_1314,T_3125_1317,T_625_264,T_3125_1323,T_3125_1326,T_3125_1329,T_3125_1332,T_625_267,T_3125_1338,T_3125_1341,T_3125_1344,T_3125_1347,T_125_54,T_3125_1353,T_3125_1356,T_3125_1359,T_3125_1362,T_625_273,T_3125_1368,T_3125_1371,T_3125_1374,T_3125_1377,T_625_276,T_3125_1383,T_3125_1386,T_3125_1389,T_3125_1392,T_625_279,T_3125_1398,T_3125_1401,T_3125_1404,T_3125_1407,T_625_282,T_3125_1413,T_3125_1416,T_3125_1419,T_3125_1422,T_125_57,T_3125_1428,T_3125_1431,T_3125_1434,T_3125_1437,T_625_288,T_3125_1443,T_3125_1446,T_3125_1449,T_3125_1452,T_625_291,T_3125_1458,T_3125_1461,T_3125_1464,T_3125_1467,T_625_294,T_3125_1473,T_3125_1476,T_3125_1479,T_3125_1482,T_625_297,T_3125_1488,T_3125_1491,T_3125_1494,T_3125_1497,T_25_12,T_3125_1503,T_3125_15
06,T_3125_1509,T_3125_1512,T_625_303,T_3125_1518,T_3125_1521,T_3125_1524,T_3125_1527,T_625_306,T_3125_1533,T_3125_1536,T_3125_1539,T_3125_1542,T_625_309,T_3125_1548,T_3125_1551,T_3125_1554,T_3125_1557,T_625_312,T_3125_1563,T_3125_1566,T_3125_1569,T_3125_1572,T_125_63,T_3125_1578,T_3125_1581,T_3125_1584,T_3125_1587,T_625_318,T_3125_1593,T_3125_1596,T_3125_1599,T_3125_1602,T_625_321,T_3125_1608,T_3125_1611,T_3125_1614,T_3125_1617,T_625_324,T_3125_1623,T_3125_1626,T_3125_1629,T_3125_1632,T_625_327,T_3125_1638,T_3125_1641,T_3125_1644,T_3125_1647,T_125_66,T_3125_1653,T_3125_1656,T_3125_1659,T_3125_1662,T_625_333,T_3125_1668,T_3125_1671,T_3125_1674,T_3125_1677,T_625_336,T_3125_1683,T_3125_1686,T_3125_1689,T_3125_1692,T_625_339,T_3125_1698,T_3125_1701,T_3125_1704,T_3125_1707,T_625_342,T_3125_1713,T_3125_1716,T_3125_1719,T_3125_1722,T_125_69,T_3125_1728,T_3125_1731,T_3125_1734,T_3125_1737,T_625_348,T_3125_1743,T_3125_1746,T_3125_1749,T_3125_1752,T_625_351,T_3125_1758,T_3125_1761,T_3125_1764,T_3125_1767,T_625_354,T_3125_1773,T_3125_1776,T_3125_1779,T_3125_1782,T_625_357,T_3125_1788,T_3125_1791,T_3125_1794,T_3125_1797,T_125_72,T_3125_1803,T_3125_1806,T_3125_1809,T_3125_1812,T_625_363,T_3125_1818,T_3125_1821,T_3125_1824,T_3125_1827,T_625_366,T_3125_1833,T_3125_1836,T_3125_1839,T_3125_1842,T_625_369,T_3125_1848,T_3125_1851,T_3125_1854,T_3125_1857,T_625_372,T_3125_1863,T_3125_1866,T_3125_1869,T_3125_1872 +}; +static const __device__ double2 lut_dp_4_4096[1024*2] = { + 
T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_137,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4
096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_16_1,T_4096_257,T_2048_129,T_4096_259,T_1024_65,T_4096_261,T_2048_131,T_4096_263,T_512_33,T_4096_265,T_2048_133,T_4096_267,T_1024_67,T_4096_269,T_2048_135,T_4096_271,T_256_17,T_4096_273,T_2048_137,T_4096_275,T_1024_69,T_4096_277,T_2048_139,T_4096_279,T_512_35,T_4096_281,T_2048_141,T_4096_283,T_1024_71,T_4096_285,T_2048_143,T_4096_287,T_128_9,T_4096_289,T_2048_145,T_4096_291,T_1024_73,T_4096_293,T_2048_147,T_4096_295,T_512_37,T_4096_297,T_2048_149,T_4096_299,T_1024_75,T_4096_301,T_2048_151,T_4096_303,T_256_19,T_4096_305,T_2048_153,T_4096_307,T_1024_77,T_4096_309,T_2048_155,T_4096_311,T_512_39,T_4096_313,T_2048_157,T_4096_315,T_1024_79,T_4096_317,T_2048_159,T_4096_319,T_64_5,T_4096_321,T_2048_161,T_4096_323,T_1024_81,T_4096_325,T_2048_163,T_4096_327,T_512_41,T_4096_329,T_2048_165,T_4096_331,T_1024_83,T_4096_333,T_2048_167,T_4096_335,T_256_21,T_4096_337,T_2048_169,T_4096_339,T_1024_85,T_4096_341,T_2048_171,T_4096_343,T_512_43,T_4096_345,T_2048_173,T_4096_347,T_1024_87,T_4096_349,T_2048_175,T_4096_351,T_128_11,T_4096_353,T_2048_177,T_4096_355,T_1024_89,T_4096_357,T_2048_179,T_4096_359,T_512_45,T_4096_361,T_2048_181,T_4096_363,T_1024_91,T_4096_365,T_2048_183,T_4096_367,T_256_23,T_4096_369,T_2048_185,T_4096_371,T_1024_93,T_4096_373,T_2048_187,T_4096_375,T_512_47,T_4096_377,T_2048_189,T_4096_379,T_1024_95,T_4096_381,T_2048_191,T_4096_383,T_32_3,T_4096_385,T_2048_193,T_4096_387,T_1024_97,T_4096_389,T_
2048_195,T_4096_391,T_512_49,T_4096_393,T_2048_197,T_4096_395,T_1024_99,T_4096_397,T_2048_199,T_4096_399,T_256_25,T_4096_401,T_2048_201,T_4096_403,T_1024_101,T_4096_405,T_2048_203,T_4096_407,T_512_51,T_4096_409,T_2048_205,T_4096_411,T_1024_103,T_4096_413,T_2048_207,T_4096_415,T_128_13,T_4096_417,T_2048_209,T_4096_419,T_1024_105,T_4096_421,T_2048_211,T_4096_423,T_512_53,T_4096_425,T_2048_213,T_4096_427,T_1024_107,T_4096_429,T_2048_215,T_4096_431,T_256_27,T_4096_433,T_2048_217,T_4096_435,T_1024_109,T_4096_437,T_2048_219,T_4096_439,T_512_55,T_4096_441,T_2048_221,T_4096_443,T_1024_111,T_4096_445,T_2048_223,T_4096_447,T_64_7,T_4096_449,T_2048_225,T_4096_451,T_1024_113,T_4096_453,T_2048_227,T_4096_455,T_512_57,T_4096_457,T_2048_229,T_4096_459,T_1024_115,T_4096_461,T_2048_231,T_4096_463,T_256_29,T_4096_465,T_2048_233,T_4096_467,T_1024_117,T_4096_469,T_2048_235,T_4096_471,T_512_59,T_4096_473,T_2048_237,T_4096_475,T_1024_119,T_4096_477,T_2048_239,T_4096_479,T_128_15,T_4096_481,T_2048_241,T_4096_483,T_1024_121,T_4096_485,T_2048_243,T_4096_487,T_512_61,T_4096_489,T_2048_245,T_4096_491,T_1024_123,T_4096_493,T_2048_247,T_4096_495,T_256_31,T_4096_497,T_2048_249,T_4096_499,T_1024_125,T_4096_501,T_2048_251,T_4096_503,T_512_63,T_4096_505,T_2048_253,T_4096_507,T_1024_127,T_4096_509,T_2048_255,T_4096_511,T_8_1,T_4096_513,T_2048_257,T_4096_515,T_1024_129,T_4096_517,T_2048_259,T_4096_519,T_512_65,T_4096_521,T_2048_261,T_4096_523,T_1024_131,T_4096_525,T_2048_263,T_4096_527,T_256_33,T_4096_529,T_2048_265,T_4096_531,T_1024_133,T_4096_533,T_2048_267,T_4096_535,T_512_67,T_4096_537,T_2048_269,T_4096_539,T_1024_135,T_4096_541,T_2048_271,T_4096_543,T_128_17,T_4096_545,T_2048_273,T_4096_547,T_1024_137,T_4096_549,T_2048_275,T_4096_551,T_512_69,T_4096_553,T_2048_277,T_4096_555,T_1024_139,T_4096_557,T_2048_279,T_4096_559,T_256_35,T_4096_561,T_2048_281,T_4096_563,T_1024_141,T_4096_565,T_2048_283,T_4096_567,T_512_71,T_4096_569,T_2048_285,T_4096_571,T_1024_143,T_4096_573,T_2048_287,T_4096_575,T_64_9,T
_4096_577,T_2048_289,T_4096_579,T_1024_145,T_4096_581,T_2048_291,T_4096_583,T_512_73,T_4096_585,T_2048_293,T_4096_587,T_1024_147,T_4096_589,T_2048_295,T_4096_591,T_256_37,T_4096_593,T_2048_297,T_4096_595,T_1024_149,T_4096_597,T_2048_299,T_4096_599,T_512_75,T_4096_601,T_2048_301,T_4096_603,T_1024_151,T_4096_605,T_2048_303,T_4096_607,T_128_19,T_4096_609,T_2048_305,T_4096_611,T_1024_153,T_4096_613,T_2048_307,T_4096_615,T_512_77,T_4096_617,T_2048_309,T_4096_619,T_1024_155,T_4096_621,T_2048_311,T_4096_623,T_256_39,T_4096_625,T_2048_313,T_4096_627,T_1024_157,T_4096_629,T_2048_315,T_4096_631,T_512_79,T_4096_633,T_2048_317,T_4096_635,T_1024_159,T_4096_637,T_2048_319,T_4096_639,T_32_5,T_4096_641,T_2048_321,T_4096_643,T_1024_161,T_4096_645,T_2048_323,T_4096_647,T_512_81,T_4096_649,T_2048_325,T_4096_651,T_1024_163,T_4096_653,T_2048_327,T_4096_655,T_256_41,T_4096_657,T_2048_329,T_4096_659,T_1024_165,T_4096_661,T_2048_331,T_4096_663,T_512_83,T_4096_665,T_2048_333,T_4096_667,T_1024_167,T_4096_669,T_2048_335,T_4096_671,T_128_21,T_4096_673,T_2048_337,T_4096_675,T_1024_169,T_4096_677,T_2048_339,T_4096_679,T_512_85,T_4096_681,T_2048_341,T_4096_683,T_1024_171,T_4096_685,T_2048_343,T_4096_687,T_256_43,T_4096_689,T_2048_345,T_4096_691,T_1024_173,T_4096_693,T_2048_347,T_4096_695,T_512_87,T_4096_697,T_2048_349,T_4096_699,T_1024_175,T_4096_701,T_2048_351,T_4096_703,T_64_11,T_4096_705,T_2048_353,T_4096_707,T_1024_177,T_4096_709,T_2048_355,T_4096_711,T_512_89,T_4096_713,T_2048_357,T_4096_715,T_1024_179,T_4096_717,T_2048_359,T_4096_719,T_256_45,T_4096_721,T_2048_361,T_4096_723,T_1024_181,T_4096_725,T_2048_363,T_4096_727,T_512_91,T_4096_729,T_2048_365,T_4096_731,T_1024_183,T_4096_733,T_2048_367,T_4096_735,T_128_23,T_4096_737,T_2048_369,T_4096_739,T_1024_185,T_4096_741,T_2048_371,T_4096_743,T_512_93,T_4096_745,T_2048_373,T_4096_747,T_1024_187,T_4096_749,T_2048_375,T_4096_751,T_256_47,T_4096_753,T_2048_377,T_4096_755,T_1024_189,T_4096_757,T_2048_379,T_4096_759,T_512_95,T_4096_761,T_2048_381,T_40
96_763,T_1024_191,T_4096_765,T_2048_383,T_4096_767,T_16_3,T_4096_769,T_2048_385,T_4096_771,T_1024_193,T_4096_773,T_2048_387,T_4096_775,T_512_97,T_4096_777,T_2048_389,T_4096_779,T_1024_195,T_4096_781,T_2048_391,T_4096_783,T_256_49,T_4096_785,T_2048_393,T_4096_787,T_1024_197,T_4096_789,T_2048_395,T_4096_791,T_512_99,T_4096_793,T_2048_397,T_4096_795,T_1024_199,T_4096_797,T_2048_399,T_4096_799,T_128_25,T_4096_801,T_2048_401,T_4096_803,T_1024_201,T_4096_805,T_2048_403,T_4096_807,T_512_101,T_4096_809,T_2048_405,T_4096_811,T_1024_203,T_4096_813,T_2048_407,T_4096_815,T_256_51,T_4096_817,T_2048_409,T_4096_819,T_1024_205,T_4096_821,T_2048_411,T_4096_823,T_512_103,T_4096_825,T_2048_413,T_4096_827,T_1024_207,T_4096_829,T_2048_415,T_4096_831,T_64_13,T_4096_833,T_2048_417,T_4096_835,T_1024_209,T_4096_837,T_2048_419,T_4096_839,T_512_105,T_4096_841,T_2048_421,T_4096_843,T_1024_211,T_4096_845,T_2048_423,T_4096_847,T_256_53,T_4096_849,T_2048_425,T_4096_851,T_1024_213,T_4096_853,T_2048_427,T_4096_855,T_512_107,T_4096_857,T_2048_429,T_4096_859,T_1024_215,T_4096_861,T_2048_431,T_4096_863,T_128_27,T_4096_865,T_2048_433,T_4096_867,T_1024_217,T_4096_869,T_2048_435,T_4096_871,T_512_109,T_4096_873,T_2048_437,T_4096_875,T_1024_219,T_4096_877,T_2048_439,T_4096_879,T_256_55,T_4096_881,T_2048_441,T_4096_883,T_1024_221,T_4096_885,T_2048_443,T_4096_887,T_512_111,T_4096_889,T_2048_445,T_4096_891,T_1024_223,T_4096_893,T_2048_447,T_4096_895,T_32_7,T_4096_897,T_2048_449,T_4096_899,T_1024_225,T_4096_901,T_2048_451,T_4096_903,T_512_113,T_4096_905,T_2048_453,T_4096_907,T_1024_227,T_4096_909,T_2048_455,T_4096_911,T_256_57,T_4096_913,T_2048_457,T_4096_915,T_1024_229,T_4096_917,T_2048_459,T_4096_919,T_512_115,T_4096_921,T_2048_461,T_4096_923,T_1024_231,T_4096_925,T_2048_463,T_4096_927,T_128_29,T_4096_929,T_2048_465,T_4096_931,T_1024_233,T_4096_933,T_2048_467,T_4096_935,T_512_117,T_4096_937,T_2048_469,T_4096_939,T_1024_235,T_4096_941,T_2048_471,T_4096_943,T_256_59,T_4096_945,T_2048_473,T_4096_947,T_1024_237,
T_4096_949,T_2048_475,T_4096_951,T_512_119,T_4096_953,T_2048_477,T_4096_955,T_1024_239,T_4096_957,T_2048_479,T_4096_959,T_64_15,T_4096_961,T_2048_481,T_4096_963,T_1024_241,T_4096_965,T_2048_483,T_4096_967,T_512_121,T_4096_969,T_2048_485,T_4096_971,T_1024_243,T_4096_973,T_2048_487,T_4096_975,T_256_61,T_4096_977,T_2048_489,T_4096_979,T_1024_245,T_4096_981,T_2048_491,T_4096_983,T_512_123,T_4096_985,T_2048_493,T_4096_987,T_1024_247,T_4096_989,T_2048_495,T_4096_991,T_128_31,T_4096_993,T_2048_497,T_4096_995,T_1024_249,T_4096_997,T_2048_499,T_4096_999,T_512_125,T_4096_1001,T_2048_501,T_4096_1003,T_1024_251,T_4096_1005,T_2048_503,T_4096_1007,T_256_63,T_4096_1009,T_2048_505,T_4096_1011,T_1024_253,T_4096_1013,T_2048_507,T_4096_1015,T_512_127,T_4096_1017,T_2048_509,T_4096_1019,T_1024_255,T_4096_1021,T_2048_511,T_4096_1023,T_2_0,T_4096_3,T_2048_3,T_4096_9,T_1024_3,T_4096_15,T_2048_9,T_4096_21,T_512_3,T_4096_27,T_2048_15,T_4096_33,T_1024_9,T_4096_39,T_2048_21,T_4096_45,T_256_3,T_4096_51,T_2048_27,T_4096_57,T_1024_15,T_4096_63,T_2048_33,T_4096_69,T_512_9,T_4096_75,T_2048_39,T_4096_81,T_1024_21,T_4096_87,T_2048_45,T_4096_93,T_128_3,T_4096_99,T_2048_51,T_4096_105,T_1024_27,T_4096_111,T_2048_57,T_4096_117,T_512_15,T_4096_123,T_2048_63,T_4096_129,T_1024_33,T_4096_135,T_2048_69,T_4096_141,T_256_9,T_4096_147,T_2048_75,T_4096_153,T_1024_39,T_4096_159,T_2048_81,T_4096_165,T_512_21,T_4096_171,T_2048_87,T_4096_177,T_1024_45,T_4096_183,T_2048_93,T_4096_189,T_64_3,T_4096_195,T_2048_99,T_4096_201,T_1024_51,T_4096_207,T_2048_105,T_4096_213,T_512_27,T_4096_219,T_2048_111,T_4096_225,T_1024_57,T_4096_231,T_2048_117,T_4096_237,T_256_15,T_4096_243,T_2048_123,T_4096_249,T_1024_63,T_4096_255,T_2048_129,T_4096_261,T_512_33,T_4096_267,T_2048_135,T_4096_273,T_1024_69,T_4096_279,T_2048_141,T_4096_285,T_128_9,T_4096_291,T_2048_147,T_4096_297,T_1024_75,T_4096_303,T_2048_153,T_4096_309,T_512_39,T_4096_315,T_2048_159,T_4096_321,T_1024_81,T_4096_327,T_2048_165,T_4096_333,T_256_21,T_4096_339,T_2048_171,T_4096_
345,T_1024_87,T_4096_351,T_2048_177,T_4096_357,T_512_45,T_4096_363,T_2048_183,T_4096_369,T_1024_93,T_4096_375,T_2048_189,T_4096_381,T_32_3,T_4096_387,T_2048_195,T_4096_393,T_1024_99,T_4096_399,T_2048_201,T_4096_405,T_512_51,T_4096_411,T_2048_207,T_4096_417,T_1024_105,T_4096_423,T_2048_213,T_4096_429,T_256_27,T_4096_435,T_2048_219,T_4096_441,T_1024_111,T_4096_447,T_2048_225,T_4096_453,T_512_57,T_4096_459,T_2048_231,T_4096_465,T_1024_117,T_4096_471,T_2048_237,T_4096_477,T_128_15,T_4096_483,T_2048_243,T_4096_489,T_1024_123,T_4096_495,T_2048_249,T_4096_501,T_512_63,T_4096_507,T_2048_255,T_4096_513,T_1024_129,T_4096_519,T_2048_261,T_4096_525,T_256_33,T_4096_531,T_2048_267,T_4096_537,T_1024_135,T_4096_543,T_2048_273,T_4096_549,T_512_69,T_4096_555,T_2048_279,T_4096_561,T_1024_141,T_4096_567,T_2048_285,T_4096_573,T_64_9,T_4096_579,T_2048_291,T_4096_585,T_1024_147,T_4096_591,T_2048_297,T_4096_597,T_512_75,T_4096_603,T_2048_303,T_4096_609,T_1024_153,T_4096_615,T_2048_309,T_4096_621,T_256_39,T_4096_627,T_2048_315,T_4096_633,T_1024_159,T_4096_639,T_2048_321,T_4096_645,T_512_81,T_4096_651,T_2048_327,T_4096_657,T_1024_165,T_4096_663,T_2048_333,T_4096_669,T_128_21,T_4096_675,T_2048_339,T_4096_681,T_1024_171,T_4096_687,T_2048_345,T_4096_693,T_512_87,T_4096_699,T_2048_351,T_4096_705,T_1024_177,T_4096_711,T_2048_357,T_4096_717,T_256_45,T_4096_723,T_2048_363,T_4096_729,T_1024_183,T_4096_735,T_2048_369,T_4096_741,T_512_93,T_4096_747,T_2048_375,T_4096_753,T_1024_189,T_4096_759,T_2048_381,T_4096_765,T_16_3,T_4096_771,T_2048_387,T_4096_777,T_1024_195,T_4096_783,T_2048_393,T_4096_789,T_512_99,T_4096_795,T_2048_399,T_4096_801,T_1024_201,T_4096_807,T_2048_405,T_4096_813,T_256_51,T_4096_819,T_2048_411,T_4096_825,T_1024_207,T_4096_831,T_2048_417,T_4096_837,T_512_105,T_4096_843,T_2048_423,T_4096_849,T_1024_213,T_4096_855,T_2048_429,T_4096_861,T_128_27,T_4096_867,T_2048_435,T_4096_873,T_1024_219,T_4096_879,T_2048_441,T_4096_885,T_512_111,T_4096_891,T_2048_447,T_4096_897,T_1024_225,T_4096_903,T_2
048_453,T_4096_909,T_256_57,T_4096_915,T_2048_459,T_4096_921,T_1024_231,T_4096_927,T_2048_465,T_4096_933,T_512_117,T_4096_939,T_2048_471,T_4096_945,T_1024_237,T_4096_951,T_2048_477,T_4096_957,T_64_15,T_4096_963,T_2048_483,T_4096_969,T_1024_243,T_4096_975,T_2048_489,T_4096_981,T_512_123,T_4096_987,T_2048_495,T_4096_993,T_1024_249,T_4096_999,T_2048_501,T_4096_1005,T_256_63,T_4096_1011,T_2048_507,T_4096_1017,T_1024_255,T_4096_1023,T_2048_513,T_4096_1029,T_512_129,T_4096_1035,T_2048_519,T_4096_1041,T_1024_261,T_4096_1047,T_2048_525,T_4096_1053,T_128_33,T_4096_1059,T_2048_531,T_4096_1065,T_1024_267,T_4096_1071,T_2048_537,T_4096_1077,T_512_135,T_4096_1083,T_2048_543,T_4096_1089,T_1024_273,T_4096_1095,T_2048_549,T_4096_1101,T_256_69,T_4096_1107,T_2048_555,T_4096_1113,T_1024_279,T_4096_1119,T_2048_561,T_4096_1125,T_512_141,T_4096_1131,T_2048_567,T_4096_1137,T_1024_285,T_4096_1143,T_2048_573,T_4096_1149,T_32_9,T_4096_1155,T_2048_579,T_4096_1161,T_1024_291,T_4096_1167,T_2048_585,T_4096_1173,T_512_147,T_4096_1179,T_2048_591,T_4096_1185,T_1024_297,T_4096_1191,T_2048_597,T_4096_1197,T_256_75,T_4096_1203,T_2048_603,T_4096_1209,T_1024_303,T_4096_1215,T_2048_609,T_4096_1221,T_512_153,T_4096_1227,T_2048_615,T_4096_1233,T_1024_309,T_4096_1239,T_2048_621,T_4096_1245,T_128_39,T_4096_1251,T_2048_627,T_4096_1257,T_1024_315,T_4096_1263,T_2048_633,T_4096_1269,T_512_159,T_4096_1275,T_2048_639,T_4096_1281,T_1024_321,T_4096_1287,T_2048_645,T_4096_1293,T_256_81,T_4096_1299,T_2048_651,T_4096_1305,T_1024_327,T_4096_1311,T_2048_657,T_4096_1317,T_512_165,T_4096_1323,T_2048_663,T_4096_1329,T_1024_333,T_4096_1335,T_2048_669,T_4096_1341,T_64_21,T_4096_1347,T_2048_675,T_4096_1353,T_1024_339,T_4096_1359,T_2048_681,T_4096_1365,T_512_171,T_4096_1371,T_2048_687,T_4096_1377,T_1024_345,T_4096_1383,T_2048_693,T_4096_1389,T_256_87,T_4096_1395,T_2048_699,T_4096_1401,T_1024_351,T_4096_1407,T_2048_705,T_4096_1413,T_512_177,T_4096_1419,T_2048_711,T_4096_1425,T_1024_357,T_4096_1431,T_2048_717,T_4096_1437,T_128_45,
T_4096_1443,T_2048_723,T_4096_1449,T_1024_363,T_4096_1455,T_2048_729,T_4096_1461,T_512_183,T_4096_1467,T_2048_735,T_4096_1473,T_1024_369,T_4096_1479,T_2048_741,T_4096_1485,T_256_93,T_4096_1491,T_2048_747,T_4096_1497,T_1024_375,T_4096_1503,T_2048_753,T_4096_1509,T_512_189,T_4096_1515,T_2048_759,T_4096_1521,T_1024_381,T_4096_1527,T_2048_765,T_4096_1533,T_8_3,T_4096_1539,T_2048_771,T_4096_1545,T_1024_387,T_4096_1551,T_2048_777,T_4096_1557,T_512_195,T_4096_1563,T_2048_783,T_4096_1569,T_1024_393,T_4096_1575,T_2048_789,T_4096_1581,T_256_99,T_4096_1587,T_2048_795,T_4096_1593,T_1024_399,T_4096_1599,T_2048_801,T_4096_1605,T_512_201,T_4096_1611,T_2048_807,T_4096_1617,T_1024_405,T_4096_1623,T_2048_813,T_4096_1629,T_128_51,T_4096_1635,T_2048_819,T_4096_1641,T_1024_411,T_4096_1647,T_2048_825,T_4096_1653,T_512_207,T_4096_1659,T_2048_831,T_4096_1665,T_1024_417,T_4096_1671,T_2048_837,T_4096_1677,T_256_105,T_4096_1683,T_2048_843,T_4096_1689,T_1024_423,T_4096_1695,T_2048_849,T_4096_1701,T_512_213,T_4096_1707,T_2048_855,T_4096_1713,T_1024_429,T_4096_1719,T_2048_861,T_4096_1725,T_64_27,T_4096_1731,T_2048_867,T_4096_1737,T_1024_435,T_4096_1743,T_2048_873,T_4096_1749,T_512_219,T_4096_1755,T_2048_879,T_4096_1761,T_1024_441,T_4096_1767,T_2048_885,T_4096_1773,T_256_111,T_4096_1779,T_2048_891,T_4096_1785,T_1024_447,T_4096_1791,T_2048_897,T_4096_1797,T_512_225,T_4096_1803,T_2048_903,T_4096_1809,T_1024_453,T_4096_1815,T_2048_909,T_4096_1821,T_128_57,T_4096_1827,T_2048_915,T_4096_1833,T_1024_459,T_4096_1839,T_2048_921,T_4096_1845,T_512_231,T_4096_1851,T_2048_927,T_4096_1857,T_1024_465,T_4096_1863,T_2048_933,T_4096_1869,T_256_117,T_4096_1875,T_2048_939,T_4096_1881,T_1024_471,T_4096_1887,T_2048_945,T_4096_1893,T_512_237,T_4096_1899,T_2048_951,T_4096_1905,T_1024_477,T_4096_1911,T_2048_957,T_4096_1917,T_32_15,T_4096_1923,T_2048_963,T_4096_1929,T_1024_483,T_4096_1935,T_2048_969,T_4096_1941,T_512_243,T_4096_1947,T_2048_975,T_4096_1953,T_1024_489,T_4096_1959,T_2048_981,T_4096_1965,T_256_123,T_4096_197
1,T_2048_987,T_4096_1977,T_1024_495,T_4096_1983,T_2048_993,T_4096_1989,T_512_249,T_4096_1995,T_2048_999,T_4096_2001,T_1024_501,T_4096_2007,T_2048_1005,T_4096_2013,T_128_63,T_4096_2019,T_2048_1011,T_4096_2025,T_1024_507,T_4096_2031,T_2048_1017,T_4096_2037,T_512_255,T_4096_2043,T_2048_1023,T_4096_2049,T_1024_513,T_4096_2055,T_2048_1029,T_4096_2061,T_256_129,T_4096_2067,T_2048_1035,T_4096_2073,T_1024_519,T_4096_2079,T_2048_1041,T_4096_2085,T_512_261,T_4096_2091,T_2048_1047,T_4096_2097,T_1024_525,T_4096_2103,T_2048_1053,T_4096_2109,T_64_33,T_4096_2115,T_2048_1059,T_4096_2121,T_1024_531,T_4096_2127,T_2048_1065,T_4096_2133,T_512_267,T_4096_2139,T_2048_1071,T_4096_2145,T_1024_537,T_4096_2151,T_2048_1077,T_4096_2157,T_256_135,T_4096_2163,T_2048_1083,T_4096_2169,T_1024_543,T_4096_2175,T_2048_1089,T_4096_2181,T_512_273,T_4096_2187,T_2048_1095,T_4096_2193,T_1024_549,T_4096_2199,T_2048_1101,T_4096_2205,T_128_69,T_4096_2211,T_2048_1107,T_4096_2217,T_1024_555,T_4096_2223,T_2048_1113,T_4096_2229,T_512_279,T_4096_2235,T_2048_1119,T_4096_2241,T_1024_561,T_4096_2247,T_2048_1125,T_4096_2253,T_256_141,T_4096_2259,T_2048_1131,T_4096_2265,T_1024_567,T_4096_2271,T_2048_1137,T_4096_2277,T_512_285,T_4096_2283,T_2048_1143,T_4096_2289,T_1024_573,T_4096_2295,T_2048_1149,T_4096_2301,T_16_9,T_4096_2307,T_2048_1155,T_4096_2313,T_1024_579,T_4096_2319,T_2048_1161,T_4096_2325,T_512_291,T_4096_2331,T_2048_1167,T_4096_2337,T_1024_585,T_4096_2343,T_2048_1173,T_4096_2349,T_256_147,T_4096_2355,T_2048_1179,T_4096_2361,T_1024_591,T_4096_2367,T_2048_1185,T_4096_2373,T_512_297,T_4096_2379,T_2048_1191,T_4096_2385,T_1024_597,T_4096_2391,T_2048_1197,T_4096_2397,T_128_75,T_4096_2403,T_2048_1203,T_4096_2409,T_1024_603,T_4096_2415,T_2048_1209,T_4096_2421,T_512_303,T_4096_2427,T_2048_1215,T_4096_2433,T_1024_609,T_4096_2439,T_2048_1221,T_4096_2445,T_256_153,T_4096_2451,T_2048_1227,T_4096_2457,T_1024_615,T_4096_2463,T_2048_1233,T_4096_2469,T_512_309,T_4096_2475,T_2048_1239,T_4096_2481,T_1024_621,T_4096_2487,T_2048_12
45,T_4096_2493,T_64_39,T_4096_2499,T_2048_1251,T_4096_2505,T_1024_627,T_4096_2511,T_2048_1257,T_4096_2517,T_512_315,T_4096_2523,T_2048_1263,T_4096_2529,T_1024_633,T_4096_2535,T_2048_1269,T_4096_2541,T_256_159,T_4096_2547,T_2048_1275,T_4096_2553,T_1024_639,T_4096_2559,T_2048_1281,T_4096_2565,T_512_321,T_4096_2571,T_2048_1287,T_4096_2577,T_1024_645,T_4096_2583,T_2048_1293,T_4096_2589,T_128_81,T_4096_2595,T_2048_1299,T_4096_2601,T_1024_651,T_4096_2607,T_2048_1305,T_4096_2613,T_512_327,T_4096_2619,T_2048_1311,T_4096_2625,T_1024_657,T_4096_2631,T_2048_1317,T_4096_2637,T_256_165,T_4096_2643,T_2048_1323,T_4096_2649,T_1024_663,T_4096_2655,T_2048_1329,T_4096_2661,T_512_333,T_4096_2667,T_2048_1335,T_4096_2673,T_1024_669,T_4096_2679,T_2048_1341,T_4096_2685,T_32_21,T_4096_2691,T_2048_1347,T_4096_2697,T_1024_675,T_4096_2703,T_2048_1353,T_4096_2709,T_512_339,T_4096_2715,T_2048_1359,T_4096_2721,T_1024_681,T_4096_2727,T_2048_1365,T_4096_2733,T_256_171,T_4096_2739,T_2048_1371,T_4096_2745,T_1024_687,T_4096_2751,T_2048_1377,T_4096_2757,T_512_345,T_4096_2763,T_2048_1383,T_4096_2769,T_1024_693,T_4096_2775,T_2048_1389,T_4096_2781,T_128_87,T_4096_2787,T_2048_1395,T_4096_2793,T_1024_699,T_4096_2799,T_2048_1401,T_4096_2805,T_512_351,T_4096_2811,T_2048_1407,T_4096_2817,T_1024_705,T_4096_2823,T_2048_1413,T_4096_2829,T_256_177,T_4096_2835,T_2048_1419,T_4096_2841,T_1024_711,T_4096_2847,T_2048_1425,T_4096_2853,T_512_357,T_4096_2859,T_2048_1431,T_4096_2865,T_1024_717,T_4096_2871,T_2048_1437,T_4096_2877,T_64_45,T_4096_2883,T_2048_1443,T_4096_2889,T_1024_723,T_4096_2895,T_2048_1449,T_4096_2901,T_512_363,T_4096_2907,T_2048_1455,T_4096_2913,T_1024_729,T_4096_2919,T_2048_1461,T_4096_2925,T_256_183,T_4096_2931,T_2048_1467,T_4096_2937,T_1024_735,T_4096_2943,T_2048_1473,T_4096_2949,T_512_369,T_4096_2955,T_2048_1479,T_4096_2961,T_1024_741,T_4096_2967,T_2048_1485,T_4096_2973,T_128_93,T_4096_2979,T_2048_1491,T_4096_2985,T_1024_747,T_4096_2991,T_2048_1497,T_4096_2997,T_512_375,T_4096_3003,T_2048_1503,T_4096_
3009,T_1024_753,T_4096_3015,T_2048_1509,T_4096_3021,T_256_189,T_4096_3027,T_2048_1515,T_4096_3033,T_1024_759,T_4096_3039,T_2048_1521,T_4096_3045,T_512_381,T_4096_3051,T_2048_1527,T_4096_3057,T_1024_765,T_4096_3063,T_2048_1533,T_4096_3069 +}; +static const __device__ double2 lut_dp_8_4096[512*2] = { + T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_137,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171
,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_16_1,T_4096_257,T_2048_129,T_4096_259,T_1024_65,T_4096_261,T_2048_131,T_4096_263,T_512_33,T_4096_265,T_2048_133,T_4096_267,T_1024_67,T_4096_269,T_2048_135,T_4096_271,T_256_17,T_4096_273,T_2048_137,T_4096_275,T_1024_69,T_4096_277,T_2048_139,T_4096_279,T_512_35,T_4096_281,T_2048_141,T_4096_283,T_1024_71,T_4096_285,T_2048_143,T_4096_287,T_128_9,T_4096_289,T_2048_145,T_4096_291,T_1024_73,T_4096_293,T_2048_147,T_4096_295,T_512_37,T_4096_297,T_2048_149,T_4096_299,T_1024_75,T_4096_301,T_2048_151,T_4096_303,T_256_19,T_4096_305,T_2048_153,T_4096_307,T_1024_77,T_4096_309,T_2048_155,T_4096_311,T_512_39,T_4096_313,T_2048_157,T_4096_315,T_1024_79,T_4096_317,T_2048_159,T_4096_319,T_64_5,T_4096_321,T_2048_161,T_4096_323,T_1024_81,T_4096_325,T_2048_163,T_4096_327,T_512_41,T_4096_329,T_2048_165,T_4096_331,T_1024_83,T_4096_333,T_2048_167,T_4096_335,T_256_21,T_4096_337,T_2048_169,T_4096_339,T_1024_85,T_4096_341,T_2048_171,T_4096_343,T_512_43,T_4096_345,T_2048_173,T_4096_347,T_1024_87,T_4096_349,T_2048_175,T_4096_351,T_128_11,T_4096_353,T_2048_177,T_4096_355,T_1024_89,T_4096_357,T_2048_179,T_4096_359,T_512_45,T_4096_
361,T_2048_181,T_4096_363,T_1024_91,T_4096_365,T_2048_183,T_4096_367,T_256_23,T_4096_369,T_2048_185,T_4096_371,T_1024_93,T_4096_373,T_2048_187,T_4096_375,T_512_47,T_4096_377,T_2048_189,T_4096_379,T_1024_95,T_4096_381,T_2048_191,T_4096_383,T_32_3,T_4096_385,T_2048_193,T_4096_387,T_1024_97,T_4096_389,T_2048_195,T_4096_391,T_512_49,T_4096_393,T_2048_197,T_4096_395,T_1024_99,T_4096_397,T_2048_199,T_4096_399,T_256_25,T_4096_401,T_2048_201,T_4096_403,T_1024_101,T_4096_405,T_2048_203,T_4096_407,T_512_51,T_4096_409,T_2048_205,T_4096_411,T_1024_103,T_4096_413,T_2048_207,T_4096_415,T_128_13,T_4096_417,T_2048_209,T_4096_419,T_1024_105,T_4096_421,T_2048_211,T_4096_423,T_512_53,T_4096_425,T_2048_213,T_4096_427,T_1024_107,T_4096_429,T_2048_215,T_4096_431,T_256_27,T_4096_433,T_2048_217,T_4096_435,T_1024_109,T_4096_437,T_2048_219,T_4096_439,T_512_55,T_4096_441,T_2048_221,T_4096_443,T_1024_111,T_4096_445,T_2048_223,T_4096_447,T_64_7,T_4096_449,T_2048_225,T_4096_451,T_1024_113,T_4096_453,T_2048_227,T_4096_455,T_512_57,T_4096_457,T_2048_229,T_4096_459,T_1024_115,T_4096_461,T_2048_231,T_4096_463,T_256_29,T_4096_465,T_2048_233,T_4096_467,T_1024_117,T_4096_469,T_2048_235,T_4096_471,T_512_59,T_4096_473,T_2048_237,T_4096_475,T_1024_119,T_4096_477,T_2048_239,T_4096_479,T_128_15,T_4096_481,T_2048_241,T_4096_483,T_1024_121,T_4096_485,T_2048_243,T_4096_487,T_512_61,T_4096_489,T_2048_245,T_4096_491,T_1024_123,T_4096_493,T_2048_247,T_4096_495,T_256_31,T_4096_497,T_2048_249,T_4096_499,T_1024_125,T_4096_501,T_2048_251,T_4096_503,T_512_63,T_4096_505,T_2048_253,T_4096_507,T_1024_127,T_4096_509,T_2048_255,T_4096_511,T_2_0,T_4096_5,T_2048_5,T_4096_15,T_1024_5,T_4096_25,T_2048_15,T_4096_35,T_512_5,T_4096_45,T_2048_25,T_4096_55,T_1024_15,T_4096_65,T_2048_35,T_4096_75,T_256_5,T_4096_85,T_2048_45,T_4096_95,T_1024_25,T_4096_105,T_2048_55,T_4096_115,T_512_15,T_4096_125,T_2048_65,T_4096_135,T_1024_35,T_4096_145,T_2048_75,T_4096_155,T_128_5,T_4096_165,T_2048_85,T_4096_175,T_1024_45,T_4096_185,T_2048_95,T_4096
_195,T_512_25,T_4096_205,T_2048_105,T_4096_215,T_1024_55,T_4096_225,T_2048_115,T_4096_235,T_256_15,T_4096_245,T_2048_125,T_4096_255,T_1024_65,T_4096_265,T_2048_135,T_4096_275,T_512_35,T_4096_285,T_2048_145,T_4096_295,T_1024_75,T_4096_305,T_2048_155,T_4096_315,T_64_5,T_4096_325,T_2048_165,T_4096_335,T_1024_85,T_4096_345,T_2048_175,T_4096_355,T_512_45,T_4096_365,T_2048_185,T_4096_375,T_1024_95,T_4096_385,T_2048_195,T_4096_395,T_256_25,T_4096_405,T_2048_205,T_4096_415,T_1024_105,T_4096_425,T_2048_215,T_4096_435,T_512_55,T_4096_445,T_2048_225,T_4096_455,T_1024_115,T_4096_465,T_2048_235,T_4096_475,T_128_15,T_4096_485,T_2048_245,T_4096_495,T_1024_125,T_4096_505,T_2048_255,T_4096_515,T_512_65,T_4096_525,T_2048_265,T_4096_535,T_1024_135,T_4096_545,T_2048_275,T_4096_555,T_256_35,T_4096_565,T_2048_285,T_4096_575,T_1024_145,T_4096_585,T_2048_295,T_4096_595,T_512_75,T_4096_605,T_2048_305,T_4096_615,T_1024_155,T_4096_625,T_2048_315,T_4096_635,T_32_5,T_4096_645,T_2048_325,T_4096_655,T_1024_165,T_4096_665,T_2048_335,T_4096_675,T_512_85,T_4096_685,T_2048_345,T_4096_695,T_1024_175,T_4096_705,T_2048_355,T_4096_715,T_256_45,T_4096_725,T_2048_365,T_4096_735,T_1024_185,T_4096_745,T_2048_375,T_4096_755,T_512_95,T_4096_765,T_2048_385,T_4096_775,T_1024_195,T_4096_785,T_2048_395,T_4096_795,T_128_25,T_4096_805,T_2048_405,T_4096_815,T_1024_205,T_4096_825,T_2048_415,T_4096_835,T_512_105,T_4096_845,T_2048_425,T_4096_855,T_1024_215,T_4096_865,T_2048_435,T_4096_875,T_256_55,T_4096_885,T_2048_445,T_4096_895,T_1024_225,T_4096_905,T_2048_455,T_4096_915,T_512_115,T_4096_925,T_2048_465,T_4096_935,T_1024_235,T_4096_945,T_2048_475,T_4096_955,T_64_15,T_4096_965,T_2048_485,T_4096_975,T_1024_245,T_4096_985,T_2048_495,T_4096_995,T_512_125,T_4096_1005,T_2048_505,T_4096_1015,T_1024_255,T_4096_1025,T_2048_515,T_4096_1035,T_256_65,T_4096_1045,T_2048_525,T_4096_1055,T_1024_265,T_4096_1065,T_2048_535,T_4096_1075,T_512_135,T_4096_1085,T_2048_545,T_4096_1095,T_1024_275,T_4096_1105,T_2048_555,T_4096_1115,T_128_35,T_
4096_1125,T_2048_565,T_4096_1135,T_1024_285,T_4096_1145,T_2048_575,T_4096_1155,T_512_145,T_4096_1165,T_2048_585,T_4096_1175,T_1024_295,T_4096_1185,T_2048_595,T_4096_1195,T_256_75,T_4096_1205,T_2048_605,T_4096_1215,T_1024_305,T_4096_1225,T_2048_615,T_4096_1235,T_512_155,T_4096_1245,T_2048_625,T_4096_1255,T_1024_315,T_4096_1265,T_2048_635,T_4096_1275,T_16_5,T_4096_1285,T_2048_645,T_4096_1295,T_1024_325,T_4096_1305,T_2048_655,T_4096_1315,T_512_165,T_4096_1325,T_2048_665,T_4096_1335,T_1024_335,T_4096_1345,T_2048_675,T_4096_1355,T_256_85,T_4096_1365,T_2048_685,T_4096_1375,T_1024_345,T_4096_1385,T_2048_695,T_4096_1395,T_512_175,T_4096_1405,T_2048_705,T_4096_1415,T_1024_355,T_4096_1425,T_2048_715,T_4096_1435,T_128_45,T_4096_1445,T_2048_725,T_4096_1455,T_1024_365,T_4096_1465,T_2048_735,T_4096_1475,T_512_185,T_4096_1485,T_2048_745,T_4096_1495,T_1024_375,T_4096_1505,T_2048_755,T_4096_1515,T_256_95,T_4096_1525,T_2048_765,T_4096_1535,T_1024_385,T_4096_1545,T_2048_775,T_4096_1555,T_512_195,T_4096_1565,T_2048_785,T_4096_1575,T_1024_395,T_4096_1585,T_2048_795,T_4096_1595,T_64_25,T_4096_1605,T_2048_805,T_4096_1615,T_1024_405,T_4096_1625,T_2048_815,T_4096_1635,T_512_205,T_4096_1645,T_2048_825,T_4096_1655,T_1024_415,T_4096_1665,T_2048_835,T_4096_1675,T_256_105,T_4096_1685,T_2048_845,T_4096_1695,T_1024_425,T_4096_1705,T_2048_855,T_4096_1715,T_512_215,T_4096_1725,T_2048_865,T_4096_1735,T_1024_435,T_4096_1745,T_2048_875,T_4096_1755,T_128_55,T_4096_1765,T_2048_885,T_4096_1775,T_1024_445,T_4096_1785,T_2048_895,T_4096_1795,T_512_225,T_4096_1805,T_2048_905,T_4096_1815,T_1024_455,T_4096_1825,T_2048_915,T_4096_1835,T_256_115,T_4096_1845,T_2048_925,T_4096_1855,T_1024_465,T_4096_1865,T_2048_935,T_4096_1875,T_512_235,T_4096_1885,T_2048_945,T_4096_1895,T_1024_475,T_4096_1905,T_2048_955,T_4096_1915,T_32_15,T_4096_1925,T_2048_965,T_4096_1935,T_1024_485,T_4096_1945,T_2048_975,T_4096_1955,T_512_245,T_4096_1965,T_2048_985,T_4096_1975,T_1024_495,T_4096_1985,T_2048_995,T_4096_1995,T_256_125,T_4096_2005,
T_2048_1005,T_4096_2015,T_1024_505,T_4096_2025,T_2048_1015,T_4096_2035,T_512_255,T_4096_2045,T_2048_1025,T_4096_2055,T_1024_515,T_4096_2065,T_2048_1035,T_4096_2075,T_128_65,T_4096_2085,T_2048_1045,T_4096_2095,T_1024_525,T_4096_2105,T_2048_1055,T_4096_2115,T_512_265,T_4096_2125,T_2048_1065,T_4096_2135,T_1024_535,T_4096_2145,T_2048_1075,T_4096_2155,T_256_135,T_4096_2165,T_2048_1085,T_4096_2175,T_1024_545,T_4096_2185,T_2048_1095,T_4096_2195,T_512_275,T_4096_2205,T_2048_1105,T_4096_2215,T_1024_555,T_4096_2225,T_2048_1115,T_4096_2235,T_64_35,T_4096_2245,T_2048_1125,T_4096_2255,T_1024_565,T_4096_2265,T_2048_1135,T_4096_2275,T_512_285,T_4096_2285,T_2048_1145,T_4096_2295,T_1024_575,T_4096_2305,T_2048_1155,T_4096_2315,T_256_145,T_4096_2325,T_2048_1165,T_4096_2335,T_1024_585,T_4096_2345,T_2048_1175,T_4096_2355,T_512_295,T_4096_2365,T_2048_1185,T_4096_2375,T_1024_595,T_4096_2385,T_2048_1195,T_4096_2395,T_128_75,T_4096_2405,T_2048_1205,T_4096_2415,T_1024_605,T_4096_2425,T_2048_1215,T_4096_2435,T_512_305,T_4096_2445,T_2048_1225,T_4096_2455,T_1024_615,T_4096_2465,T_2048_1235,T_4096_2475,T_256_155,T_4096_2485,T_2048_1245,T_4096_2495,T_1024_625,T_4096_2505,T_2048_1255,T_4096_2515,T_512_315,T_4096_2525,T_2048_1265,T_4096_2535,T_1024_635,T_4096_2545,T_2048_1275,T_4096_2555 +}; +static const __device__ double2 lut_dp_16_4096[256*2] = { + 
T_2_0,T_4096_1,T_2048_1,T_4096_3,T_1024_1,T_4096_5,T_2048_3,T_4096_7,T_512_1,T_4096_9,T_2048_5,T_4096_11,T_1024_3,T_4096_13,T_2048_7,T_4096_15,T_256_1,T_4096_17,T_2048_9,T_4096_19,T_1024_5,T_4096_21,T_2048_11,T_4096_23,T_512_3,T_4096_25,T_2048_13,T_4096_27,T_1024_7,T_4096_29,T_2048_15,T_4096_31,T_128_1,T_4096_33,T_2048_17,T_4096_35,T_1024_9,T_4096_37,T_2048_19,T_4096_39,T_512_5,T_4096_41,T_2048_21,T_4096_43,T_1024_11,T_4096_45,T_2048_23,T_4096_47,T_256_3,T_4096_49,T_2048_25,T_4096_51,T_1024_13,T_4096_53,T_2048_27,T_4096_55,T_512_7,T_4096_57,T_2048_29,T_4096_59,T_1024_15,T_4096_61,T_2048_31,T_4096_63,T_64_1,T_4096_65,T_2048_33,T_4096_67,T_1024_17,T_4096_69,T_2048_35,T_4096_71,T_512_9,T_4096_73,T_2048_37,T_4096_75,T_1024_19,T_4096_77,T_2048_39,T_4096_79,T_256_5,T_4096_81,T_2048_41,T_4096_83,T_1024_21,T_4096_85,T_2048_43,T_4096_87,T_512_11,T_4096_89,T_2048_45,T_4096_91,T_1024_23,T_4096_93,T_2048_47,T_4096_95,T_128_3,T_4096_97,T_2048_49,T_4096_99,T_1024_25,T_4096_101,T_2048_51,T_4096_103,T_512_13,T_4096_105,T_2048_53,T_4096_107,T_1024_27,T_4096_109,T_2048_55,T_4096_111,T_256_7,T_4096_113,T_2048_57,T_4096_115,T_1024_29,T_4096_117,T_2048_59,T_4096_119,T_512_15,T_4096_121,T_2048_61,T_4096_123,T_1024_31,T_4096_125,T_2048_63,T_4096_127,T_32_1,T_4096_129,T_2048_65,T_4096_131,T_1024_33,T_4096_133,T_2048_67,T_4096_135,T_512_17,T_4096_137,T_2048_69,T_4096_139,T_1024_35,T_4096_141,T_2048_71,T_4096_143,T_256_9,T_4096_145,T_2048_73,T_4096_147,T_1024_37,T_4096_149,T_2048_75,T_4096_151,T_512_19,T_4096_153,T_2048_77,T_4096_155,T_1024_39,T_4096_157,T_2048_79,T_4096_159,T_128_5,T_4096_161,T_2048_81,T_4096_163,T_1024_41,T_4096_165,T_2048_83,T_4096_167,T_512_21,T_4096_169,T_2048_85,T_4096_171,T_1024_43,T_4096_173,T_2048_87,T_4096_175,T_256_11,T_4096_177,T_2048_89,T_4096_179,T_1024_45,T_4096_181,T_2048_91,T_4096_183,T_512_23,T_4096_185,T_2048_93,T_4096_187,T_1024_47,T_4096_189,T_2048_95,T_4096_191,T_64_3,T_4096_193,T_2048_97,T_4096_195,T_1024_49,T_4096_197,T_2048_99,T_4096_199,T_512_25,T_4
096_201,T_2048_101,T_4096_203,T_1024_51,T_4096_205,T_2048_103,T_4096_207,T_256_13,T_4096_209,T_2048_105,T_4096_211,T_1024_53,T_4096_213,T_2048_107,T_4096_215,T_512_27,T_4096_217,T_2048_109,T_4096_219,T_1024_55,T_4096_221,T_2048_111,T_4096_223,T_128_7,T_4096_225,T_2048_113,T_4096_227,T_1024_57,T_4096_229,T_2048_115,T_4096_231,T_512_29,T_4096_233,T_2048_117,T_4096_235,T_1024_59,T_4096_237,T_2048_119,T_4096_239,T_256_15,T_4096_241,T_2048_121,T_4096_243,T_1024_61,T_4096_245,T_2048_123,T_4096_247,T_512_31,T_4096_249,T_2048_125,T_4096_251,T_1024_63,T_4096_253,T_2048_127,T_4096_255,T_2_0,T_4096_9,T_2048_9,T_4096_27,T_1024_9,T_4096_45,T_2048_27,T_4096_63,T_512_9,T_4096_81,T_2048_45,T_4096_99,T_1024_27,T_4096_117,T_2048_63,T_4096_135,T_256_9,T_4096_153,T_2048_81,T_4096_171,T_1024_45,T_4096_189,T_2048_99,T_4096_207,T_512_27,T_4096_225,T_2048_117,T_4096_243,T_1024_63,T_4096_261,T_2048_135,T_4096_279,T_128_9,T_4096_297,T_2048_153,T_4096_315,T_1024_81,T_4096_333,T_2048_171,T_4096_351,T_512_45,T_4096_369,T_2048_189,T_4096_387,T_1024_99,T_4096_405,T_2048_207,T_4096_423,T_256_27,T_4096_441,T_2048_225,T_4096_459,T_1024_117,T_4096_477,T_2048_243,T_4096_495,T_512_63,T_4096_513,T_2048_261,T_4096_531,T_1024_135,T_4096_549,T_2048_279,T_4096_567,T_64_9,T_4096_585,T_2048_297,T_4096_603,T_1024_153,T_4096_621,T_2048_315,T_4096_639,T_512_81,T_4096_657,T_2048_333,T_4096_675,T_1024_171,T_4096_693,T_2048_351,T_4096_711,T_256_45,T_4096_729,T_2048_369,T_4096_747,T_1024_189,T_4096_765,T_2048_387,T_4096_783,T_512_99,T_4096_801,T_2048_405,T_4096_819,T_1024_207,T_4096_837,T_2048_423,T_4096_855,T_128_27,T_4096_873,T_2048_441,T_4096_891,T_1024_225,T_4096_909,T_2048_459,T_4096_927,T_512_117,T_4096_945,T_2048_477,T_4096_963,T_1024_243,T_4096_981,T_2048_495,T_4096_999,T_256_63,T_4096_1017,T_2048_513,T_4096_1035,T_1024_261,T_4096_1053,T_2048_531,T_4096_1071,T_512_135,T_4096_1089,T_2048_549,T_4096_1107,T_1024_279,T_4096_1125,T_2048_567,T_4096_1143,T_32_9,T_4096_1161,T_2048_585,T_4096_1179,T_1024_297,T_4096_1
197,T_2048_603,T_4096_1215,T_512_153,T_4096_1233,T_2048_621,T_4096_1251,T_1024_315,T_4096_1269,T_2048_639,T_4096_1287,T_256_81,T_4096_1305,T_2048_657,T_4096_1323,T_1024_333,T_4096_1341,T_2048_675,T_4096_1359,T_512_171,T_4096_1377,T_2048_693,T_4096_1395,T_1024_351,T_4096_1413,T_2048_711,T_4096_1431,T_128_45,T_4096_1449,T_2048_729,T_4096_1467,T_1024_369,T_4096_1485,T_2048_747,T_4096_1503,T_512_189,T_4096_1521,T_2048_765,T_4096_1539,T_1024_387,T_4096_1557,T_2048_783,T_4096_1575,T_256_99,T_4096_1593,T_2048_801,T_4096_1611,T_1024_405,T_4096_1629,T_2048_819,T_4096_1647,T_512_207,T_4096_1665,T_2048_837,T_4096_1683,T_1024_423,T_4096_1701,T_2048_855,T_4096_1719,T_64_27,T_4096_1737,T_2048_873,T_4096_1755,T_1024_441,T_4096_1773,T_2048_891,T_4096_1791,T_512_225,T_4096_1809,T_2048_909,T_4096_1827,T_1024_459,T_4096_1845,T_2048_927,T_4096_1863,T_256_117,T_4096_1881,T_2048_945,T_4096_1899,T_1024_477,T_4096_1917,T_2048_963,T_4096_1935,T_512_243,T_4096_1953,T_2048_981,T_4096_1971,T_1024_495,T_4096_1989,T_2048_999,T_4096_2007,T_128_63,T_4096_2025,T_2048_1017,T_4096_2043,T_1024_513,T_4096_2061,T_2048_1035,T_4096_2079,T_512_261,T_4096_2097,T_2048_1053,T_4096_2115,T_1024_531,T_4096_2133,T_2048_1071,T_4096_2151,T_256_135,T_4096_2169,T_2048_1089,T_4096_2187,T_1024_549,T_4096_2205,T_2048_1107,T_4096_2223,T_512_279,T_4096_2241,T_2048_1125,T_4096_2259,T_1024_567,T_4096_2277,T_2048_1143,T_4096_2295 +}; +static const __device__ double2 lut_dp_9_6561[729*2] = { + 
T_2_0,T_6561_1,T_6561_2,T_2187_1,T_6561_4,T_6561_5,T_2187_2,T_6561_7,T_6561_8,T_729_1,T_6561_10,T_6561_11,T_2187_4,T_6561_13,T_6561_14,T_2187_5,T_6561_16,T_6561_17,T_729_2,T_6561_19,T_6561_20,T_2187_7,T_6561_22,T_6561_23,T_2187_8,T_6561_25,T_6561_26,T_243_1,T_6561_28,T_6561_29,T_2187_10,T_6561_31,T_6561_32,T_2187_11,T_6561_34,T_6561_35,T_729_4,T_6561_37,T_6561_38,T_2187_13,T_6561_40,T_6561_41,T_2187_14,T_6561_43,T_6561_44,T_729_5,T_6561_46,T_6561_47,T_2187_16,T_6561_49,T_6561_50,T_2187_17,T_6561_52,T_6561_53,T_243_2,T_6561_55,T_6561_56,T_2187_19,T_6561_58,T_6561_59,T_2187_20,T_6561_61,T_6561_62,T_729_7,T_6561_64,T_6561_65,T_2187_22,T_6561_67,T_6561_68,T_2187_23,T_6561_70,T_6561_71,T_729_8,T_6561_73,T_6561_74,T_2187_25,T_6561_76,T_6561_77,T_2187_26,T_6561_79,T_6561_80,T_81_1,T_6561_82,T_6561_83,T_2187_28,T_6561_85,T_6561_86,T_2187_29,T_6561_88,T_6561_89,T_729_10,T_6561_91,T_6561_92,T_2187_31,T_6561_94,T_6561_95,T_2187_32,T_6561_97,T_6561_98,T_729_11,T_6561_100,T_6561_101,T_2187_34,T_6561_103,T_6561_104,T_2187_35,T_6561_106,T_6561_107,T_243_4,T_6561_109,T_6561_110,T_2187_37,T_6561_112,T_6561_113,T_2187_38,T_6561_115,T_6561_116,T_729_13,T_6561_118,T_6561_119,T_2187_40,T_6561_121,T_6561_122,T_2187_41,T_6561_124,T_6561_125,T_729_14,T_6561_127,T_6561_128,T_2187_43,T_6561_130,T_6561_131,T_2187_44,T_6561_133,T_6561_134,T_243_5,T_6561_136,T_6561_137,T_2187_46,T_6561_139,T_6561_140,T_2187_47,T_6561_142,T_6561_143,T_729_16,T_6561_145,T_6561_146,T_2187_49,T_6561_148,T_6561_149,T_2187_50,T_6561_151,T_6561_152,T_729_17,T_6561_154,T_6561_155,T_2187_52,T_6561_157,T_6561_158,T_2187_53,T_6561_160,T_6561_161,T_81_2,T_6561_163,T_6561_164,T_2187_55,T_6561_166,T_6561_167,T_2187_56,T_6561_169,T_6561_170,T_729_19,T_6561_172,T_6561_173,T_2187_58,T_6561_175,T_6561_176,T_2187_59,T_6561_178,T_6561_179,T_729_20,T_6561_181,T_6561_182,T_2187_61,T_6561_184,T_6561_185,T_2187_62,T_6561_187,T_6561_188,T_243_7,T_6561_190,T_6561_191,T_2187_64,T_6561_193,T_6561_194,T_2187_65,T_6561_196,T_6561_197,T_729_
22,T_6561_199,T_6561_200,T_2187_67,T_6561_202,T_6561_203,T_2187_68,T_6561_205,T_6561_206,T_729_23,T_6561_208,T_6561_209,T_2187_70,T_6561_211,T_6561_212,T_2187_71,T_6561_214,T_6561_215,T_243_8,T_6561_217,T_6561_218,T_2187_73,T_6561_220,T_6561_221,T_2187_74,T_6561_223,T_6561_224,T_729_25,T_6561_226,T_6561_227,T_2187_76,T_6561_229,T_6561_230,T_2187_77,T_6561_232,T_6561_233,T_729_26,T_6561_235,T_6561_236,T_2187_79,T_6561_238,T_6561_239,T_2187_80,T_6561_241,T_6561_242,T_27_1,T_6561_244,T_6561_245,T_2187_82,T_6561_247,T_6561_248,T_2187_83,T_6561_250,T_6561_251,T_729_28,T_6561_253,T_6561_254,T_2187_85,T_6561_256,T_6561_257,T_2187_86,T_6561_259,T_6561_260,T_729_29,T_6561_262,T_6561_263,T_2187_88,T_6561_265,T_6561_266,T_2187_89,T_6561_268,T_6561_269,T_243_10,T_6561_271,T_6561_272,T_2187_91,T_6561_274,T_6561_275,T_2187_92,T_6561_277,T_6561_278,T_729_31,T_6561_280,T_6561_281,T_2187_94,T_6561_283,T_6561_284,T_2187_95,T_6561_286,T_6561_287,T_729_32,T_6561_289,T_6561_290,T_2187_97,T_6561_292,T_6561_293,T_2187_98,T_6561_295,T_6561_296,T_243_11,T_6561_298,T_6561_299,T_2187_100,T_6561_301,T_6561_302,T_2187_101,T_6561_304,T_6561_305,T_729_34,T_6561_307,T_6561_308,T_2187_103,T_6561_310,T_6561_311,T_2187_104,T_6561_313,T_6561_314,T_729_35,T_6561_316,T_6561_317,T_2187_106,T_6561_319,T_6561_320,T_2187_107,T_6561_322,T_6561_323,T_81_4,T_6561_325,T_6561_326,T_2187_109,T_6561_328,T_6561_329,T_2187_110,T_6561_331,T_6561_332,T_729_37,T_6561_334,T_6561_335,T_2187_112,T_6561_337,T_6561_338,T_2187_113,T_6561_340,T_6561_341,T_729_38,T_6561_343,T_6561_344,T_2187_115,T_6561_346,T_6561_347,T_2187_116,T_6561_349,T_6561_350,T_243_13,T_6561_352,T_6561_353,T_2187_118,T_6561_355,T_6561_356,T_2187_119,T_6561_358,T_6561_359,T_729_40,T_6561_361,T_6561_362,T_2187_121,T_6561_364,T_6561_365,T_2187_122,T_6561_367,T_6561_368,T_729_41,T_6561_370,T_6561_371,T_2187_124,T_6561_373,T_6561_374,T_2187_125,T_6561_376,T_6561_377,T_243_14,T_6561_379,T_6561_380,T_2187_127,T_6561_382,T_6561_383,T_2187_128,T_6561_385,T_6561_
386,T_729_43,T_6561_388,T_6561_389,T_2187_130,T_6561_391,T_6561_392,T_2187_131,T_6561_394,T_6561_395,T_729_44,T_6561_397,T_6561_398,T_2187_133,T_6561_400,T_6561_401,T_2187_134,T_6561_403,T_6561_404,T_81_5,T_6561_406,T_6561_407,T_2187_136,T_6561_409,T_6561_410,T_2187_137,T_6561_412,T_6561_413,T_729_46,T_6561_415,T_6561_416,T_2187_139,T_6561_418,T_6561_419,T_2187_140,T_6561_421,T_6561_422,T_729_47,T_6561_424,T_6561_425,T_2187_142,T_6561_427,T_6561_428,T_2187_143,T_6561_430,T_6561_431,T_243_16,T_6561_433,T_6561_434,T_2187_145,T_6561_436,T_6561_437,T_2187_146,T_6561_439,T_6561_440,T_729_49,T_6561_442,T_6561_443,T_2187_148,T_6561_445,T_6561_446,T_2187_149,T_6561_448,T_6561_449,T_729_50,T_6561_451,T_6561_452,T_2187_151,T_6561_454,T_6561_455,T_2187_152,T_6561_457,T_6561_458,T_243_17,T_6561_460,T_6561_461,T_2187_154,T_6561_463,T_6561_464,T_2187_155,T_6561_466,T_6561_467,T_729_52,T_6561_469,T_6561_470,T_2187_157,T_6561_472,T_6561_473,T_2187_158,T_6561_475,T_6561_476,T_729_53,T_6561_478,T_6561_479,T_2187_160,T_6561_481,T_6561_482,T_2187_161,T_6561_484,T_6561_485,T_27_2,T_6561_487,T_6561_488,T_2187_163,T_6561_490,T_6561_491,T_2187_164,T_6561_493,T_6561_494,T_729_55,T_6561_496,T_6561_497,T_2187_166,T_6561_499,T_6561_500,T_2187_167,T_6561_502,T_6561_503,T_729_56,T_6561_505,T_6561_506,T_2187_169,T_6561_508,T_6561_509,T_2187_170,T_6561_511,T_6561_512,T_243_19,T_6561_514,T_6561_515,T_2187_172,T_6561_517,T_6561_518,T_2187_173,T_6561_520,T_6561_521,T_729_58,T_6561_523,T_6561_524,T_2187_175,T_6561_526,T_6561_527,T_2187_176,T_6561_529,T_6561_530,T_729_59,T_6561_532,T_6561_533,T_2187_178,T_6561_535,T_6561_536,T_2187_179,T_6561_538,T_6561_539,T_243_20,T_6561_541,T_6561_542,T_2187_181,T_6561_544,T_6561_545,T_2187_182,T_6561_547,T_6561_548,T_729_61,T_6561_550,T_6561_551,T_2187_184,T_6561_553,T_6561_554,T_2187_185,T_6561_556,T_6561_557,T_729_62,T_6561_559,T_6561_560,T_2187_187,T_6561_562,T_6561_563,T_2187_188,T_6561_565,T_6561_566,T_81_7,T_6561_568,T_6561_569,T_2187_190,T_6561_571,T_6561_57
2,T_2187_191,T_6561_574,T_6561_575,T_729_64,T_6561_577,T_6561_578,T_2187_193,T_6561_580,T_6561_581,T_2187_194,T_6561_583,T_6561_584,T_729_65,T_6561_586,T_6561_587,T_2187_196,T_6561_589,T_6561_590,T_2187_197,T_6561_592,T_6561_593,T_243_22,T_6561_595,T_6561_596,T_2187_199,T_6561_598,T_6561_599,T_2187_200,T_6561_601,T_6561_602,T_729_67,T_6561_604,T_6561_605,T_2187_202,T_6561_607,T_6561_608,T_2187_203,T_6561_610,T_6561_611,T_729_68,T_6561_613,T_6561_614,T_2187_205,T_6561_616,T_6561_617,T_2187_206,T_6561_619,T_6561_620,T_243_23,T_6561_622,T_6561_623,T_2187_208,T_6561_625,T_6561_626,T_2187_209,T_6561_628,T_6561_629,T_729_70,T_6561_631,T_6561_632,T_2187_211,T_6561_634,T_6561_635,T_2187_212,T_6561_637,T_6561_638,T_729_71,T_6561_640,T_6561_641,T_2187_214,T_6561_643,T_6561_644,T_2187_215,T_6561_646,T_6561_647,T_81_8,T_6561_649,T_6561_650,T_2187_217,T_6561_652,T_6561_653,T_2187_218,T_6561_655,T_6561_656,T_729_73,T_6561_658,T_6561_659,T_2187_220,T_6561_661,T_6561_662,T_2187_221,T_6561_664,T_6561_665,T_729_74,T_6561_667,T_6561_668,T_2187_223,T_6561_670,T_6561_671,T_2187_224,T_6561_673,T_6561_674,T_243_25,T_6561_676,T_6561_677,T_2187_226,T_6561_679,T_6561_680,T_2187_227,T_6561_682,T_6561_683,T_729_76,T_6561_685,T_6561_686,T_2187_229,T_6561_688,T_6561_689,T_2187_230,T_6561_691,T_6561_692,T_729_77,T_6561_694,T_6561_695,T_2187_232,T_6561_697,T_6561_698,T_2187_233,T_6561_700,T_6561_701,T_243_26,T_6561_703,T_6561_704,T_2187_235,T_6561_706,T_6561_707,T_2187_236,T_6561_709,T_6561_710,T_729_79,T_6561_712,T_6561_713,T_2187_238,T_6561_715,T_6561_716,T_2187_239,T_6561_718,T_6561_719,T_729_80,T_6561_721,T_6561_722,T_2187_241,T_6561_724,T_6561_725,T_2187_242,T_6561_727,T_6561_728,T_2_0,T_6561_5,T_6561_10,T_2187_5,T_6561_20,T_6561_25,T_2187_10,T_6561_35,T_6561_40,T_729_5,T_6561_50,T_6561_55,T_2187_20,T_6561_65,T_6561_70,T_2187_25,T_6561_80,T_6561_85,T_729_10,T_6561_95,T_6561_100,T_2187_35,T_6561_110,T_6561_115,T_2187_40,T_6561_125,T_6561_130,T_243_5,T_6561_140,T_6561_145,T_2187_50,T_6561_155,T
_6561_160,T_2187_55,T_6561_170,T_6561_175,T_729_20,T_6561_185,T_6561_190,T_2187_65,T_6561_200,T_6561_205,T_2187_70,T_6561_215,T_6561_220,T_729_25,T_6561_230,T_6561_235,T_2187_80,T_6561_245,T_6561_250,T_2187_85,T_6561_260,T_6561_265,T_243_10,T_6561_275,T_6561_280,T_2187_95,T_6561_290,T_6561_295,T_2187_100,T_6561_305,T_6561_310,T_729_35,T_6561_320,T_6561_325,T_2187_110,T_6561_335,T_6561_340,T_2187_115,T_6561_350,T_6561_355,T_729_40,T_6561_365,T_6561_370,T_2187_125,T_6561_380,T_6561_385,T_2187_130,T_6561_395,T_6561_400,T_81_5,T_6561_410,T_6561_415,T_2187_140,T_6561_425,T_6561_430,T_2187_145,T_6561_440,T_6561_445,T_729_50,T_6561_455,T_6561_460,T_2187_155,T_6561_470,T_6561_475,T_2187_160,T_6561_485,T_6561_490,T_729_55,T_6561_500,T_6561_505,T_2187_170,T_6561_515,T_6561_520,T_2187_175,T_6561_530,T_6561_535,T_243_20,T_6561_545,T_6561_550,T_2187_185,T_6561_560,T_6561_565,T_2187_190,T_6561_575,T_6561_580,T_729_65,T_6561_590,T_6561_595,T_2187_200,T_6561_605,T_6561_610,T_2187_205,T_6561_620,T_6561_625,T_729_70,T_6561_635,T_6561_640,T_2187_215,T_6561_650,T_6561_655,T_2187_220,T_6561_665,T_6561_670,T_243_25,T_6561_680,T_6561_685,T_2187_230,T_6561_695,T_6561_700,T_2187_235,T_6561_710,T_6561_715,T_729_80,T_6561_725,T_6561_730,T_2187_245,T_6561_740,T_6561_745,T_2187_250,T_6561_755,T_6561_760,T_729_85,T_6561_770,T_6561_775,T_2187_260,T_6561_785,T_6561_790,T_2187_265,T_6561_800,T_6561_805,T_81_10,T_6561_815,T_6561_820,T_2187_275,T_6561_830,T_6561_835,T_2187_280,T_6561_845,T_6561_850,T_729_95,T_6561_860,T_6561_865,T_2187_290,T_6561_875,T_6561_880,T_2187_295,T_6561_890,T_6561_895,T_729_100,T_6561_905,T_6561_910,T_2187_305,T_6561_920,T_6561_925,T_2187_310,T_6561_935,T_6561_940,T_243_35,T_6561_950,T_6561_955,T_2187_320,T_6561_965,T_6561_970,T_2187_325,T_6561_980,T_6561_985,T_729_110,T_6561_995,T_6561_1000,T_2187_335,T_6561_1010,T_6561_1015,T_2187_340,T_6561_1025,T_6561_1030,T_729_115,T_6561_1040,T_6561_1045,T_2187_350,T_6561_1055,T_6561_1060,T_2187_355,T_6561_1070,T_6561_1075,T_243_40,T_6
561_1085,T_6561_1090,T_2187_365,T_6561_1100,T_6561_1105,T_2187_370,T_6561_1115,T_6561_1120,T_729_125,T_6561_1130,T_6561_1135,T_2187_380,T_6561_1145,T_6561_1150,T_2187_385,T_6561_1160,T_6561_1165,T_729_130,T_6561_1175,T_6561_1180,T_2187_395,T_6561_1190,T_6561_1195,T_2187_400,T_6561_1205,T_6561_1210,T_27_5,T_6561_1220,T_6561_1225,T_2187_410,T_6561_1235,T_6561_1240,T_2187_415,T_6561_1250,T_6561_1255,T_729_140,T_6561_1265,T_6561_1270,T_2187_425,T_6561_1280,T_6561_1285,T_2187_430,T_6561_1295,T_6561_1300,T_729_145,T_6561_1310,T_6561_1315,T_2187_440,T_6561_1325,T_6561_1330,T_2187_445,T_6561_1340,T_6561_1345,T_243_50,T_6561_1355,T_6561_1360,T_2187_455,T_6561_1370,T_6561_1375,T_2187_460,T_6561_1385,T_6561_1390,T_729_155,T_6561_1400,T_6561_1405,T_2187_470,T_6561_1415,T_6561_1420,T_2187_475,T_6561_1430,T_6561_1435,T_729_160,T_6561_1445,T_6561_1450,T_2187_485,T_6561_1460,T_6561_1465,T_2187_490,T_6561_1475,T_6561_1480,T_243_55,T_6561_1490,T_6561_1495,T_2187_500,T_6561_1505,T_6561_1510,T_2187_505,T_6561_1520,T_6561_1525,T_729_170,T_6561_1535,T_6561_1540,T_2187_515,T_6561_1550,T_6561_1555,T_2187_520,T_6561_1565,T_6561_1570,T_729_175,T_6561_1580,T_6561_1585,T_2187_530,T_6561_1595,T_6561_1600,T_2187_535,T_6561_1610,T_6561_1615,T_81_20,T_6561_1625,T_6561_1630,T_2187_545,T_6561_1640,T_6561_1645,T_2187_550,T_6561_1655,T_6561_1660,T_729_185,T_6561_1670,T_6561_1675,T_2187_560,T_6561_1685,T_6561_1690,T_2187_565,T_6561_1700,T_6561_1705,T_729_190,T_6561_1715,T_6561_1720,T_2187_575,T_6561_1730,T_6561_1735,T_2187_580,T_6561_1745,T_6561_1750,T_243_65,T_6561_1760,T_6561_1765,T_2187_590,T_6561_1775,T_6561_1780,T_2187_595,T_6561_1790,T_6561_1795,T_729_200,T_6561_1805,T_6561_1810,T_2187_605,T_6561_1820,T_6561_1825,T_2187_610,T_6561_1835,T_6561_1840,T_729_205,T_6561_1850,T_6561_1855,T_2187_620,T_6561_1865,T_6561_1870,T_2187_625,T_6561_1880,T_6561_1885,T_243_70,T_6561_1895,T_6561_1900,T_2187_635,T_6561_1910,T_6561_1915,T_2187_640,T_6561_1925,T_6561_1930,T_729_215,T_6561_1940,T_6561_1945,T_2187_650,T
_6561_1955,T_6561_1960,T_2187_655,T_6561_1970,T_6561_1975,T_729_220,T_6561_1985,T_6561_1990,T_2187_665,T_6561_2000,T_6561_2005,T_2187_670,T_6561_2015,T_6561_2020,T_81_25,T_6561_2030,T_6561_2035,T_2187_680,T_6561_2045,T_6561_2050,T_2187_685,T_6561_2060,T_6561_2065,T_729_230,T_6561_2075,T_6561_2080,T_2187_695,T_6561_2090,T_6561_2095,T_2187_700,T_6561_2105,T_6561_2110,T_729_235,T_6561_2120,T_6561_2125,T_2187_710,T_6561_2135,T_6561_2140,T_2187_715,T_6561_2150,T_6561_2155,T_243_80,T_6561_2165,T_6561_2170,T_2187_725,T_6561_2180,T_6561_2185,T_2187_730,T_6561_2195,T_6561_2200,T_729_245,T_6561_2210,T_6561_2215,T_2187_740,T_6561_2225,T_6561_2230,T_2187_745,T_6561_2240,T_6561_2245,T_729_250,T_6561_2255,T_6561_2260,T_2187_755,T_6561_2270,T_6561_2275,T_2187_760,T_6561_2285,T_6561_2290,T_243_85,T_6561_2300,T_6561_2305,T_2187_770,T_6561_2315,T_6561_2320,T_2187_775,T_6561_2330,T_6561_2335,T_729_260,T_6561_2345,T_6561_2350,T_2187_785,T_6561_2360,T_6561_2365,T_2187_790,T_6561_2375,T_6561_2380,T_729_265,T_6561_2390,T_6561_2395,T_2187_800,T_6561_2405,T_6561_2410,T_2187_805,T_6561_2420,T_6561_2425,T_27_10,T_6561_2435,T_6561_2440,T_2187_815,T_6561_2450,T_6561_2455,T_2187_820,T_6561_2465,T_6561_2470,T_729_275,T_6561_2480,T_6561_2485,T_2187_830,T_6561_2495,T_6561_2500,T_2187_835,T_6561_2510,T_6561_2515,T_729_280,T_6561_2525,T_6561_2530,T_2187_845,T_6561_2540,T_6561_2545,T_2187_850,T_6561_2555,T_6561_2560,T_243_95,T_6561_2570,T_6561_2575,T_2187_860,T_6561_2585,T_6561_2590,T_2187_865,T_6561_2600,T_6561_2605,T_729_290,T_6561_2615,T_6561_2620,T_2187_875,T_6561_2630,T_6561_2635,T_2187_880,T_6561_2645,T_6561_2650,T_729_295,T_6561_2660,T_6561_2665,T_2187_890,T_6561_2675,T_6561_2680,T_2187_895,T_6561_2690,T_6561_2695,T_243_100,T_6561_2705,T_6561_2710,T_2187_905,T_6561_2720,T_6561_2725,T_2187_910,T_6561_2735,T_6561_2740,T_729_305,T_6561_2750,T_6561_2755,T_2187_920,T_6561_2765,T_6561_2770,T_2187_925,T_6561_2780,T_6561_2785,T_729_310,T_6561_2795,T_6561_2800,T_2187_935,T_6561_2810,T_6561_2815,T_2187_9
40,T_6561_2825,T_6561_2830,T_81_35,T_6561_2840,T_6561_2845,T_2187_950,T_6561_2855,T_6561_2860,T_2187_955,T_6561_2870,T_6561_2875,T_729_320,T_6561_2885,T_6561_2890,T_2187_965,T_6561_2900,T_6561_2905,T_2187_970,T_6561_2915,T_6561_2920,T_729_325,T_6561_2930,T_6561_2935,T_2187_980,T_6561_2945,T_6561_2950,T_2187_985,T_6561_2960,T_6561_2965,T_243_110,T_6561_2975,T_6561_2980,T_2187_995,T_6561_2990,T_6561_2995,T_2187_1000,T_6561_3005,T_6561_3010,T_729_335,T_6561_3020,T_6561_3025,T_2187_1010,T_6561_3035,T_6561_3040,T_2187_1015,T_6561_3050,T_6561_3055,T_729_340,T_6561_3065,T_6561_3070,T_2187_1025,T_6561_3080,T_6561_3085,T_2187_1030,T_6561_3095,T_6561_3100,T_243_115,T_6561_3110,T_6561_3115,T_2187_1040,T_6561_3125,T_6561_3130,T_2187_1045,T_6561_3140,T_6561_3145,T_729_350,T_6561_3155,T_6561_3160,T_2187_1055,T_6561_3170,T_6561_3175,T_2187_1060,T_6561_3185,T_6561_3190,T_729_355,T_6561_3200,T_6561_3205,T_2187_1070,T_6561_3215,T_6561_3220,T_2187_1075,T_6561_3230,T_6561_3235,T_81_40,T_6561_3245,T_6561_3250,T_2187_1085,T_6561_3260,T_6561_3265,T_2187_1090,T_6561_3275,T_6561_3280,T_729_365,T_6561_3290,T_6561_3295,T_2187_1100,T_6561_3305,T_6561_3310,T_2187_1105,T_6561_3320,T_6561_3325,T_729_370,T_6561_3335,T_6561_3340,T_2187_1115,T_6561_3350,T_6561_3355,T_2187_1120,T_6561_3365,T_6561_3370,T_243_125,T_6561_3380,T_6561_3385,T_2187_1130,T_6561_3395,T_6561_3400,T_2187_1135,T_6561_3410,T_6561_3415,T_729_380,T_6561_3425,T_6561_3430,T_2187_1145,T_6561_3440,T_6561_3445,T_2187_1150,T_6561_3455,T_6561_3460,T_729_385,T_6561_3470,T_6561_3475,T_2187_1160,T_6561_3485,T_6561_3490,T_2187_1165,T_6561_3500,T_6561_3505,T_243_130,T_6561_3515,T_6561_3520,T_2187_1175,T_6561_3530,T_6561_3535,T_2187_1180,T_6561_3545,T_6561_3550,T_729_395,T_6561_3560,T_6561_3565,T_2187_1190,T_6561_3575,T_6561_3580,T_2187_1195,T_6561_3590,T_6561_3595,T_729_400,T_6561_3605,T_6561_3610,T_2187_1205,T_6561_3620,T_6561_3625,T_2187_1210,T_6561_3635,T_6561_3640 +}; +static const __device__ double2 lut_dp_8_8192[1024*2] = { + 
T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_1024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_19
9,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_32_1,T_8192_257,T_4096_129,T_8192_259,T_2048_65,T_8192_261,T_4096_131,T_8192_263,T_1024_33,T_8192_265,T_4096_133,T_8192_267,T_2048_67,T_8192_269,T_4096_135,T_8192_271,T_512_17,T_8192_273,T_4096_137,T_8192_275,T_2048_69,T_8192_277,T_4096_139,T_8192_279,T_1024_35,T_8192_281,T_4096_141,T_8192_283,T_2048_71,T_8192_285,T_4096_143,T_8192_287,T_256_9,T_8192_289,T_4096_145,T_8192_291,T_2048_73,T_8192_293,T_4096_147,T_8192_295,T_1024_37,T_8192_297,T_4096_149,T_8192_299,T_2048_75,T_8192_301,T_4096_151,T_8192_303,T_512_19,T_8192_305,T_4096_153,T_8192_307,T_2048_77,T_8192_309,T_4096_155,T_8192_311,T_1024_39,T_8192_313,T_4096_157,T_8192_315,T_2048_79,T_8192_317,T_4096_159,T_8192_319,T_128_5,T_8192_321,T_4096_161,T_8192_323,T_2048_81,T_8192_325,T_4096_163,T_8192_327,T_1024_41,T_8192_329,T_4096_165,T_8192_331,T_2048_83,T_8192_333,T_4096_167,T_8192_335,T_512_21,T_8192_337,T_4096_169,T_8192_339,T_2048_85,T_8192_341,T_4096_171,T_8192_343,T_1024_43,T_8192_345,T_4096_173,T_8192_347,T_2048_87,T_8192_349,T_4096_175,T_8192_351,T_256_11,T_8192_353,T_4096_177,T_8192_355,T_2048_89,T_8192_357,T_4096_179,T_8192_359,T_1024_45,T_8192_361,T_4096_181,T_8192_363,T_2048_91,T_8192_365,T_4096_183,T_8192_367,T_512_23,T_8192_369,T_4096_185,T_8192_371,T_2048_93,T_8192_373,T_4096_187,T_8192_375,T_1024_47,T_8192_377,T_4096_189,T_8192_379,T_2048_95,T_8192_381,T_4096_191,T_8192_383,T_64_3,T_8192_385,T_4096_193,T_8192_
387,T_2048_97,T_8192_389,T_4096_195,T_8192_391,T_1024_49,T_8192_393,T_4096_197,T_8192_395,T_2048_99,T_8192_397,T_4096_199,T_8192_399,T_512_25,T_8192_401,T_4096_201,T_8192_403,T_2048_101,T_8192_405,T_4096_203,T_8192_407,T_1024_51,T_8192_409,T_4096_205,T_8192_411,T_2048_103,T_8192_413,T_4096_207,T_8192_415,T_256_13,T_8192_417,T_4096_209,T_8192_419,T_2048_105,T_8192_421,T_4096_211,T_8192_423,T_1024_53,T_8192_425,T_4096_213,T_8192_427,T_2048_107,T_8192_429,T_4096_215,T_8192_431,T_512_27,T_8192_433,T_4096_217,T_8192_435,T_2048_109,T_8192_437,T_4096_219,T_8192_439,T_1024_55,T_8192_441,T_4096_221,T_8192_443,T_2048_111,T_8192_445,T_4096_223,T_8192_447,T_128_7,T_8192_449,T_4096_225,T_8192_451,T_2048_113,T_8192_453,T_4096_227,T_8192_455,T_1024_57,T_8192_457,T_4096_229,T_8192_459,T_2048_115,T_8192_461,T_4096_231,T_8192_463,T_512_29,T_8192_465,T_4096_233,T_8192_467,T_2048_117,T_8192_469,T_4096_235,T_8192_471,T_1024_59,T_8192_473,T_4096_237,T_8192_475,T_2048_119,T_8192_477,T_4096_239,T_8192_479,T_256_15,T_8192_481,T_4096_241,T_8192_483,T_2048_121,T_8192_485,T_4096_243,T_8192_487,T_1024_61,T_8192_489,T_4096_245,T_8192_491,T_2048_123,T_8192_493,T_4096_247,T_8192_495,T_512_31,T_8192_497,T_4096_249,T_8192_499,T_2048_125,T_8192_501,T_4096_251,T_8192_503,T_1024_63,T_8192_505,T_4096_253,T_8192_507,T_2048_127,T_8192_509,T_4096_255,T_8192_511,T_16_1,T_8192_513,T_4096_257,T_8192_515,T_2048_129,T_8192_517,T_4096_259,T_8192_519,T_1024_65,T_8192_521,T_4096_261,T_8192_523,T_2048_131,T_8192_525,T_4096_263,T_8192_527,T_512_33,T_8192_529,T_4096_265,T_8192_531,T_2048_133,T_8192_533,T_4096_267,T_8192_535,T_1024_67,T_8192_537,T_4096_269,T_8192_539,T_2048_135,T_8192_541,T_4096_271,T_8192_543,T_256_17,T_8192_545,T_4096_273,T_8192_547,T_2048_137,T_8192_549,T_4096_275,T_8192_551,T_1024_69,T_8192_553,T_4096_277,T_8192_555,T_2048_139,T_8192_557,T_4096_279,T_8192_559,T_512_35,T_8192_561,T_4096_281,T_8192_563,T_2048_141,T_8192_565,T_4096_283,T_8192_567,T_1024_71,T_8192_569,T_4096_285,T_8192_571,T_2048_143,
T_8192_573,T_4096_287,T_8192_575,T_128_9,T_8192_577,T_4096_289,T_8192_579,T_2048_145,T_8192_581,T_4096_291,T_8192_583,T_1024_73,T_8192_585,T_4096_293,T_8192_587,T_2048_147,T_8192_589,T_4096_295,T_8192_591,T_512_37,T_8192_593,T_4096_297,T_8192_595,T_2048_149,T_8192_597,T_4096_299,T_8192_599,T_1024_75,T_8192_601,T_4096_301,T_8192_603,T_2048_151,T_8192_605,T_4096_303,T_8192_607,T_256_19,T_8192_609,T_4096_305,T_8192_611,T_2048_153,T_8192_613,T_4096_307,T_8192_615,T_1024_77,T_8192_617,T_4096_309,T_8192_619,T_2048_155,T_8192_621,T_4096_311,T_8192_623,T_512_39,T_8192_625,T_4096_313,T_8192_627,T_2048_157,T_8192_629,T_4096_315,T_8192_631,T_1024_79,T_8192_633,T_4096_317,T_8192_635,T_2048_159,T_8192_637,T_4096_319,T_8192_639,T_64_5,T_8192_641,T_4096_321,T_8192_643,T_2048_161,T_8192_645,T_4096_323,T_8192_647,T_1024_81,T_8192_649,T_4096_325,T_8192_651,T_2048_163,T_8192_653,T_4096_327,T_8192_655,T_512_41,T_8192_657,T_4096_329,T_8192_659,T_2048_165,T_8192_661,T_4096_331,T_8192_663,T_1024_83,T_8192_665,T_4096_333,T_8192_667,T_2048_167,T_8192_669,T_4096_335,T_8192_671,T_256_21,T_8192_673,T_4096_337,T_8192_675,T_2048_169,T_8192_677,T_4096_339,T_8192_679,T_1024_85,T_8192_681,T_4096_341,T_8192_683,T_2048_171,T_8192_685,T_4096_343,T_8192_687,T_512_43,T_8192_689,T_4096_345,T_8192_691,T_2048_173,T_8192_693,T_4096_347,T_8192_695,T_1024_87,T_8192_697,T_4096_349,T_8192_699,T_2048_175,T_8192_701,T_4096_351,T_8192_703,T_128_11,T_8192_705,T_4096_353,T_8192_707,T_2048_177,T_8192_709,T_4096_355,T_8192_711,T_1024_89,T_8192_713,T_4096_357,T_8192_715,T_2048_179,T_8192_717,T_4096_359,T_8192_719,T_512_45,T_8192_721,T_4096_361,T_8192_723,T_2048_181,T_8192_725,T_4096_363,T_8192_727,T_1024_91,T_8192_729,T_4096_365,T_8192_731,T_2048_183,T_8192_733,T_4096_367,T_8192_735,T_256_23,T_8192_737,T_4096_369,T_8192_739,T_2048_185,T_8192_741,T_4096_371,T_8192_743,T_1024_93,T_8192_745,T_4096_373,T_8192_747,T_2048_187,T_8192_749,T_4096_375,T_8192_751,T_512_47,T_8192_753,T_4096_377,T_8192_755,T_2048_189,T_8192_757,T_4
096_379,T_8192_759,T_1024_95,T_8192_761,T_4096_381,T_8192_763,T_2048_191,T_8192_765,T_4096_383,T_8192_767,T_32_3,T_8192_769,T_4096_385,T_8192_771,T_2048_193,T_8192_773,T_4096_387,T_8192_775,T_1024_97,T_8192_777,T_4096_389,T_8192_779,T_2048_195,T_8192_781,T_4096_391,T_8192_783,T_512_49,T_8192_785,T_4096_393,T_8192_787,T_2048_197,T_8192_789,T_4096_395,T_8192_791,T_1024_99,T_8192_793,T_4096_397,T_8192_795,T_2048_199,T_8192_797,T_4096_399,T_8192_799,T_256_25,T_8192_801,T_4096_401,T_8192_803,T_2048_201,T_8192_805,T_4096_403,T_8192_807,T_1024_101,T_8192_809,T_4096_405,T_8192_811,T_2048_203,T_8192_813,T_4096_407,T_8192_815,T_512_51,T_8192_817,T_4096_409,T_8192_819,T_2048_205,T_8192_821,T_4096_411,T_8192_823,T_1024_103,T_8192_825,T_4096_413,T_8192_827,T_2048_207,T_8192_829,T_4096_415,T_8192_831,T_128_13,T_8192_833,T_4096_417,T_8192_835,T_2048_209,T_8192_837,T_4096_419,T_8192_839,T_1024_105,T_8192_841,T_4096_421,T_8192_843,T_2048_211,T_8192_845,T_4096_423,T_8192_847,T_512_53,T_8192_849,T_4096_425,T_8192_851,T_2048_213,T_8192_853,T_4096_427,T_8192_855,T_1024_107,T_8192_857,T_4096_429,T_8192_859,T_2048_215,T_8192_861,T_4096_431,T_8192_863,T_256_27,T_8192_865,T_4096_433,T_8192_867,T_2048_217,T_8192_869,T_4096_435,T_8192_871,T_1024_109,T_8192_873,T_4096_437,T_8192_875,T_2048_219,T_8192_877,T_4096_439,T_8192_879,T_512_55,T_8192_881,T_4096_441,T_8192_883,T_2048_221,T_8192_885,T_4096_443,T_8192_887,T_1024_111,T_8192_889,T_4096_445,T_8192_891,T_2048_223,T_8192_893,T_4096_447,T_8192_895,T_64_7,T_8192_897,T_4096_449,T_8192_899,T_2048_225,T_8192_901,T_4096_451,T_8192_903,T_1024_113,T_8192_905,T_4096_453,T_8192_907,T_2048_227,T_8192_909,T_4096_455,T_8192_911,T_512_57,T_8192_913,T_4096_457,T_8192_915,T_2048_229,T_8192_917,T_4096_459,T_8192_919,T_1024_115,T_8192_921,T_4096_461,T_8192_923,T_2048_231,T_8192_925,T_4096_463,T_8192_927,T_256_29,T_8192_929,T_4096_465,T_8192_931,T_2048_233,T_8192_933,T_4096_467,T_8192_935,T_1024_117,T_8192_937,T_4096_469,T_8192_939,T_2048_235,T_8192_941,T_4096_4
71,T_8192_943,T_512_59,T_8192_945,T_4096_473,T_8192_947,T_2048_237,T_8192_949,T_4096_475,T_8192_951,T_1024_119,T_8192_953,T_4096_477,T_8192_955,T_2048_239,T_8192_957,T_4096_479,T_8192_959,T_128_15,T_8192_961,T_4096_481,T_8192_963,T_2048_241,T_8192_965,T_4096_483,T_8192_967,T_1024_121,T_8192_969,T_4096_485,T_8192_971,T_2048_243,T_8192_973,T_4096_487,T_8192_975,T_512_61,T_8192_977,T_4096_489,T_8192_979,T_2048_245,T_8192_981,T_4096_491,T_8192_983,T_1024_123,T_8192_985,T_4096_493,T_8192_987,T_2048_247,T_8192_989,T_4096_495,T_8192_991,T_256_31,T_8192_993,T_4096_497,T_8192_995,T_2048_249,T_8192_997,T_4096_499,T_8192_999,T_1024_125,T_8192_1001,T_4096_501,T_8192_1003,T_2048_251,T_8192_1005,T_4096_503,T_8192_1007,T_512_63,T_8192_1009,T_4096_505,T_8192_1011,T_2048_253,T_8192_1013,T_4096_507,T_8192_1015,T_1024_127,T_8192_1017,T_4096_509,T_8192_1019,T_2048_255,T_8192_1021,T_4096_511,T_8192_1023,T_2_0,T_8192_5,T_4096_5,T_8192_15,T_2048_5,T_8192_25,T_4096_15,T_8192_35,T_1024_5,T_8192_45,T_4096_25,T_8192_55,T_2048_15,T_8192_65,T_4096_35,T_8192_75,T_512_5,T_8192_85,T_4096_45,T_8192_95,T_2048_25,T_8192_105,T_4096_55,T_8192_115,T_1024_15,T_8192_125,T_4096_65,T_8192_135,T_2048_35,T_8192_145,T_4096_75,T_8192_155,T_256_5,T_8192_165,T_4096_85,T_8192_175,T_2048_45,T_8192_185,T_4096_95,T_8192_195,T_1024_25,T_8192_205,T_4096_105,T_8192_215,T_2048_55,T_8192_225,T_4096_115,T_8192_235,T_512_15,T_8192_245,T_4096_125,T_8192_255,T_2048_65,T_8192_265,T_4096_135,T_8192_275,T_1024_35,T_8192_285,T_4096_145,T_8192_295,T_2048_75,T_8192_305,T_4096_155,T_8192_315,T_128_5,T_8192_325,T_4096_165,T_8192_335,T_2048_85,T_8192_345,T_4096_175,T_8192_355,T_1024_45,T_8192_365,T_4096_185,T_8192_375,T_2048_95,T_8192_385,T_4096_195,T_8192_395,T_512_25,T_8192_405,T_4096_205,T_8192_415,T_2048_105,T_8192_425,T_4096_215,T_8192_435,T_1024_55,T_8192_445,T_4096_225,T_8192_455,T_2048_115,T_8192_465,T_4096_235,T_8192_475,T_256_15,T_8192_485,T_4096_245,T_8192_495,T_2048_125,T_8192_505,T_4096_255,T_8192_515,T_1024_65,T_8192_525
,T_4096_265,T_8192_535,T_2048_135,T_8192_545,T_4096_275,T_8192_555,T_512_35,T_8192_565,T_4096_285,T_8192_575,T_2048_145,T_8192_585,T_4096_295,T_8192_595,T_1024_75,T_8192_605,T_4096_305,T_8192_615,T_2048_155,T_8192_625,T_4096_315,T_8192_635,T_64_5,T_8192_645,T_4096_325,T_8192_655,T_2048_165,T_8192_665,T_4096_335,T_8192_675,T_1024_85,T_8192_685,T_4096_345,T_8192_695,T_2048_175,T_8192_705,T_4096_355,T_8192_715,T_512_45,T_8192_725,T_4096_365,T_8192_735,T_2048_185,T_8192_745,T_4096_375,T_8192_755,T_1024_95,T_8192_765,T_4096_385,T_8192_775,T_2048_195,T_8192_785,T_4096_395,T_8192_795,T_256_25,T_8192_805,T_4096_405,T_8192_815,T_2048_205,T_8192_825,T_4096_415,T_8192_835,T_1024_105,T_8192_845,T_4096_425,T_8192_855,T_2048_215,T_8192_865,T_4096_435,T_8192_875,T_512_55,T_8192_885,T_4096_445,T_8192_895,T_2048_225,T_8192_905,T_4096_455,T_8192_915,T_1024_115,T_8192_925,T_4096_465,T_8192_935,T_2048_235,T_8192_945,T_4096_475,T_8192_955,T_128_15,T_8192_965,T_4096_485,T_8192_975,T_2048_245,T_8192_985,T_4096_495,T_8192_995,T_1024_125,T_8192_1005,T_4096_505,T_8192_1015,T_2048_255,T_8192_1025,T_4096_515,T_8192_1035,T_512_65,T_8192_1045,T_4096_525,T_8192_1055,T_2048_265,T_8192_1065,T_4096_535,T_8192_1075,T_1024_135,T_8192_1085,T_4096_545,T_8192_1095,T_2048_275,T_8192_1105,T_4096_555,T_8192_1115,T_256_35,T_8192_1125,T_4096_565,T_8192_1135,T_2048_285,T_8192_1145,T_4096_575,T_8192_1155,T_1024_145,T_8192_1165,T_4096_585,T_8192_1175,T_2048_295,T_8192_1185,T_4096_595,T_8192_1195,T_512_75,T_8192_1205,T_4096_605,T_8192_1215,T_2048_305,T_8192_1225,T_4096_615,T_8192_1235,T_1024_155,T_8192_1245,T_4096_625,T_8192_1255,T_2048_315,T_8192_1265,T_4096_635,T_8192_1275,T_32_5,T_8192_1285,T_4096_645,T_8192_1295,T_2048_325,T_8192_1305,T_4096_655,T_8192_1315,T_1024_165,T_8192_1325,T_4096_665,T_8192_1335,T_2048_335,T_8192_1345,T_4096_675,T_8192_1355,T_512_85,T_8192_1365,T_4096_685,T_8192_1375,T_2048_345,T_8192_1385,T_4096_695,T_8192_1395,T_1024_175,T_8192_1405,T_4096_705,T_8192_1415,T_2048_355,T_8192_1425,T_409
6_715,T_8192_1435,T_256_45,T_8192_1445,T_4096_725,T_8192_1455,T_2048_365,T_8192_1465,T_4096_735,T_8192_1475,T_1024_185,T_8192_1485,T_4096_745,T_8192_1495,T_2048_375,T_8192_1505,T_4096_755,T_8192_1515,T_512_95,T_8192_1525,T_4096_765,T_8192_1535,T_2048_385,T_8192_1545,T_4096_775,T_8192_1555,T_1024_195,T_8192_1565,T_4096_785,T_8192_1575,T_2048_395,T_8192_1585,T_4096_795,T_8192_1595,T_128_25,T_8192_1605,T_4096_805,T_8192_1615,T_2048_405,T_8192_1625,T_4096_815,T_8192_1635,T_1024_205,T_8192_1645,T_4096_825,T_8192_1655,T_2048_415,T_8192_1665,T_4096_835,T_8192_1675,T_512_105,T_8192_1685,T_4096_845,T_8192_1695,T_2048_425,T_8192_1705,T_4096_855,T_8192_1715,T_1024_215,T_8192_1725,T_4096_865,T_8192_1735,T_2048_435,T_8192_1745,T_4096_875,T_8192_1755,T_256_55,T_8192_1765,T_4096_885,T_8192_1775,T_2048_445,T_8192_1785,T_4096_895,T_8192_1795,T_1024_225,T_8192_1805,T_4096_905,T_8192_1815,T_2048_455,T_8192_1825,T_4096_915,T_8192_1835,T_512_115,T_8192_1845,T_4096_925,T_8192_1855,T_2048_465,T_8192_1865,T_4096_935,T_8192_1875,T_1024_235,T_8192_1885,T_4096_945,T_8192_1895,T_2048_475,T_8192_1905,T_4096_955,T_8192_1915,T_64_15,T_8192_1925,T_4096_965,T_8192_1935,T_2048_485,T_8192_1945,T_4096_975,T_8192_1955,T_1024_245,T_8192_1965,T_4096_985,T_8192_1975,T_2048_495,T_8192_1985,T_4096_995,T_8192_1995,T_512_125,T_8192_2005,T_4096_1005,T_8192_2015,T_2048_505,T_8192_2025,T_4096_1015,T_8192_2035,T_1024_255,T_8192_2045,T_4096_1025,T_8192_2055,T_2048_515,T_8192_2065,T_4096_1035,T_8192_2075,T_256_65,T_8192_2085,T_4096_1045,T_8192_2095,T_2048_525,T_8192_2105,T_4096_1055,T_8192_2115,T_1024_265,T_8192_2125,T_4096_1065,T_8192_2135,T_2048_535,T_8192_2145,T_4096_1075,T_8192_2155,T_512_135,T_8192_2165,T_4096_1085,T_8192_2175,T_2048_545,T_8192_2185,T_4096_1095,T_8192_2195,T_1024_275,T_8192_2205,T_4096_1105,T_8192_2215,T_2048_555,T_8192_2225,T_4096_1115,T_8192_2235,T_128_35,T_8192_2245,T_4096_1125,T_8192_2255,T_2048_565,T_8192_2265,T_4096_1135,T_8192_2275,T_1024_285,T_8192_2285,T_4096_1145,T_8192_2295,T_2048_5
75,T_8192_2305,T_4096_1155,T_8192_2315,T_512_145,T_8192_2325,T_4096_1165,T_8192_2335,T_2048_585,T_8192_2345,T_4096_1175,T_8192_2355,T_1024_295,T_8192_2365,T_4096_1185,T_8192_2375,T_2048_595,T_8192_2385,T_4096_1195,T_8192_2395,T_256_75,T_8192_2405,T_4096_1205,T_8192_2415,T_2048_605,T_8192_2425,T_4096_1215,T_8192_2435,T_1024_305,T_8192_2445,T_4096_1225,T_8192_2455,T_2048_615,T_8192_2465,T_4096_1235,T_8192_2475,T_512_155,T_8192_2485,T_4096_1245,T_8192_2495,T_2048_625,T_8192_2505,T_4096_1255,T_8192_2515,T_1024_315,T_8192_2525,T_4096_1265,T_8192_2535,T_2048_635,T_8192_2545,T_4096_1275,T_8192_2555,T_16_5,T_8192_2565,T_4096_1285,T_8192_2575,T_2048_645,T_8192_2585,T_4096_1295,T_8192_2595,T_1024_325,T_8192_2605,T_4096_1305,T_8192_2615,T_2048_655,T_8192_2625,T_4096_1315,T_8192_2635,T_512_165,T_8192_2645,T_4096_1325,T_8192_2655,T_2048_665,T_8192_2665,T_4096_1335,T_8192_2675,T_1024_335,T_8192_2685,T_4096_1345,T_8192_2695,T_2048_675,T_8192_2705,T_4096_1355,T_8192_2715,T_256_85,T_8192_2725,T_4096_1365,T_8192_2735,T_2048_685,T_8192_2745,T_4096_1375,T_8192_2755,T_1024_345,T_8192_2765,T_4096_1385,T_8192_2775,T_2048_695,T_8192_2785,T_4096_1395,T_8192_2795,T_512_175,T_8192_2805,T_4096_1405,T_8192_2815,T_2048_705,T_8192_2825,T_4096_1415,T_8192_2835,T_1024_355,T_8192_2845,T_4096_1425,T_8192_2855,T_2048_715,T_8192_2865,T_4096_1435,T_8192_2875,T_128_45,T_8192_2885,T_4096_1445,T_8192_2895,T_2048_725,T_8192_2905,T_4096_1455,T_8192_2915,T_1024_365,T_8192_2925,T_4096_1465,T_8192_2935,T_2048_735,T_8192_2945,T_4096_1475,T_8192_2955,T_512_185,T_8192_2965,T_4096_1485,T_8192_2975,T_2048_745,T_8192_2985,T_4096_1495,T_8192_2995,T_1024_375,T_8192_3005,T_4096_1505,T_8192_3015,T_2048_755,T_8192_3025,T_4096_1515,T_8192_3035,T_256_95,T_8192_3045,T_4096_1525,T_8192_3055,T_2048_765,T_8192_3065,T_4096_1535,T_8192_3075,T_1024_385,T_8192_3085,T_4096_1545,T_8192_3095,T_2048_775,T_8192_3105,T_4096_1555,T_8192_3115,T_512_195,T_8192_3125,T_4096_1565,T_8192_3135,T_2048_785,T_8192_3145,T_4096_1575,T_8192_3155,T_102
4_395,T_8192_3165,T_4096_1585,T_8192_3175,T_2048_795,T_8192_3185,T_4096_1595,T_8192_3195,T_64_25,T_8192_3205,T_4096_1605,T_8192_3215,T_2048_805,T_8192_3225,T_4096_1615,T_8192_3235,T_1024_405,T_8192_3245,T_4096_1625,T_8192_3255,T_2048_815,T_8192_3265,T_4096_1635,T_8192_3275,T_512_205,T_8192_3285,T_4096_1645,T_8192_3295,T_2048_825,T_8192_3305,T_4096_1655,T_8192_3315,T_1024_415,T_8192_3325,T_4096_1665,T_8192_3335,T_2048_835,T_8192_3345,T_4096_1675,T_8192_3355,T_256_105,T_8192_3365,T_4096_1685,T_8192_3375,T_2048_845,T_8192_3385,T_4096_1695,T_8192_3395,T_1024_425,T_8192_3405,T_4096_1705,T_8192_3415,T_2048_855,T_8192_3425,T_4096_1715,T_8192_3435,T_512_215,T_8192_3445,T_4096_1725,T_8192_3455,T_2048_865,T_8192_3465,T_4096_1735,T_8192_3475,T_1024_435,T_8192_3485,T_4096_1745,T_8192_3495,T_2048_875,T_8192_3505,T_4096_1755,T_8192_3515,T_128_55,T_8192_3525,T_4096_1765,T_8192_3535,T_2048_885,T_8192_3545,T_4096_1775,T_8192_3555,T_1024_445,T_8192_3565,T_4096_1785,T_8192_3575,T_2048_895,T_8192_3585,T_4096_1795,T_8192_3595,T_512_225,T_8192_3605,T_4096_1805,T_8192_3615,T_2048_905,T_8192_3625,T_4096_1815,T_8192_3635,T_1024_455,T_8192_3645,T_4096_1825,T_8192_3655,T_2048_915,T_8192_3665,T_4096_1835,T_8192_3675,T_256_115,T_8192_3685,T_4096_1845,T_8192_3695,T_2048_925,T_8192_3705,T_4096_1855,T_8192_3715,T_1024_465,T_8192_3725,T_4096_1865,T_8192_3735,T_2048_935,T_8192_3745,T_4096_1875,T_8192_3755,T_512_235,T_8192_3765,T_4096_1885,T_8192_3775,T_2048_945,T_8192_3785,T_4096_1895,T_8192_3795,T_1024_475,T_8192_3805,T_4096_1905,T_8192_3815,T_2048_955,T_8192_3825,T_4096_1915,T_8192_3835,T_32_15,T_8192_3845,T_4096_1925,T_8192_3855,T_2048_965,T_8192_3865,T_4096_1935,T_8192_3875,T_1024_485,T_8192_3885,T_4096_1945,T_8192_3895,T_2048_975,T_8192_3905,T_4096_1955,T_8192_3915,T_512_245,T_8192_3925,T_4096_1965,T_8192_3935,T_2048_985,T_8192_3945,T_4096_1975,T_8192_3955,T_1024_495,T_8192_3965,T_4096_1985,T_8192_3975,T_2048_995,T_8192_3985,T_4096_1995,T_8192_3995,T_256_125,T_8192_4005,T_4096_2005,T_8192_4015,
T_2048_1005,T_8192_4025,T_4096_2015,T_8192_4035,T_1024_505,T_8192_4045,T_4096_2025,T_8192_4055,T_2048_1015,T_8192_4065,T_4096_2035,T_8192_4075,T_512_255,T_8192_4085,T_4096_2045,T_8192_4095,T_2048_1025,T_8192_4105,T_4096_2055,T_8192_4115,T_1024_515,T_8192_4125,T_4096_2065,T_8192_4135,T_2048_1035,T_8192_4145,T_4096_2075,T_8192_4155,T_128_65,T_8192_4165,T_4096_2085,T_8192_4175,T_2048_1045,T_8192_4185,T_4096_2095,T_8192_4195,T_1024_525,T_8192_4205,T_4096_2105,T_8192_4215,T_2048_1055,T_8192_4225,T_4096_2115,T_8192_4235,T_512_265,T_8192_4245,T_4096_2125,T_8192_4255,T_2048_1065,T_8192_4265,T_4096_2135,T_8192_4275,T_1024_535,T_8192_4285,T_4096_2145,T_8192_4295,T_2048_1075,T_8192_4305,T_4096_2155,T_8192_4315,T_256_135,T_8192_4325,T_4096_2165,T_8192_4335,T_2048_1085,T_8192_4345,T_4096_2175,T_8192_4355,T_1024_545,T_8192_4365,T_4096_2185,T_8192_4375,T_2048_1095,T_8192_4385,T_4096_2195,T_8192_4395,T_512_275,T_8192_4405,T_4096_2205,T_8192_4415,T_2048_1105,T_8192_4425,T_4096_2215,T_8192_4435,T_1024_555,T_8192_4445,T_4096_2225,T_8192_4455,T_2048_1115,T_8192_4465,T_4096_2235,T_8192_4475,T_64_35,T_8192_4485,T_4096_2245,T_8192_4495,T_2048_1125,T_8192_4505,T_4096_2255,T_8192_4515,T_1024_565,T_8192_4525,T_4096_2265,T_8192_4535,T_2048_1135,T_8192_4545,T_4096_2275,T_8192_4555,T_512_285,T_8192_4565,T_4096_2285,T_8192_4575,T_2048_1145,T_8192_4585,T_4096_2295,T_8192_4595,T_1024_575,T_8192_4605,T_4096_2305,T_8192_4615,T_2048_1155,T_8192_4625,T_4096_2315,T_8192_4635,T_256_145,T_8192_4645,T_4096_2325,T_8192_4655,T_2048_1165,T_8192_4665,T_4096_2335,T_8192_4675,T_1024_585,T_8192_4685,T_4096_2345,T_8192_4695,T_2048_1175,T_8192_4705,T_4096_2355,T_8192_4715,T_512_295,T_8192_4725,T_4096_2365,T_8192_4735,T_2048_1185,T_8192_4745,T_4096_2375,T_8192_4755,T_1024_595,T_8192_4765,T_4096_2385,T_8192_4775,T_2048_1195,T_8192_4785,T_4096_2395,T_8192_4795,T_128_75,T_8192_4805,T_4096_2405,T_8192_4815,T_2048_1205,T_8192_4825,T_4096_2415,T_8192_4835,T_1024_605,T_8192_4845,T_4096_2425,T_8192_4855,T_2048_1215,T_8192_
4865,T_4096_2435,T_8192_4875,T_512_305,T_8192_4885,T_4096_2445,T_8192_4895,T_2048_1225,T_8192_4905,T_4096_2455,T_8192_4915,T_1024_615,T_8192_4925,T_4096_2465,T_8192_4935,T_2048_1235,T_8192_4945,T_4096_2475,T_8192_4955,T_256_155,T_8192_4965,T_4096_2485,T_8192_4975,T_2048_1245,T_8192_4985,T_4096_2495,T_8192_4995,T_1024_625,T_8192_5005,T_4096_2505,T_8192_5015,T_2048_1255,T_8192_5025,T_4096_2515,T_8192_5035,T_512_315,T_8192_5045,T_4096_2525,T_8192_5055,T_2048_1265,T_8192_5065,T_4096_2535,T_8192_5075,T_1024_635,T_8192_5085,T_4096_2545,T_8192_5095,T_2048_1275,T_8192_5105,T_4096_2555,T_8192_5115 +}; +static const __device__ double2 lut_dp_16_8192[512*2] = { + T_2_0,T_8192_1,T_4096_1,T_8192_3,T_2048_1,T_8192_5,T_4096_3,T_8192_7,T_1024_1,T_8192_9,T_4096_5,T_8192_11,T_2048_3,T_8192_13,T_4096_7,T_8192_15,T_512_1,T_8192_17,T_4096_9,T_8192_19,T_2048_5,T_8192_21,T_4096_11,T_8192_23,T_1024_3,T_8192_25,T_4096_13,T_8192_27,T_2048_7,T_8192_29,T_4096_15,T_8192_31,T_256_1,T_8192_33,T_4096_17,T_8192_35,T_2048_9,T_8192_37,T_4096_19,T_8192_39,T_1024_5,T_8192_41,T_4096_21,T_8192_43,T_2048_11,T_8192_45,T_4096_23,T_8192_47,T_512_3,T_8192_49,T_4096_25,T_8192_51,T_2048_13,T_8192_53,T_4096_27,T_8192_55,T_1024_7,T_8192_57,T_4096_29,T_8192_59,T_2048_15,T_8192_61,T_4096_31,T_8192_63,T_128_1,T_8192_65,T_4096_33,T_8192_67,T_2048_17,T_8192_69,T_4096_35,T_8192_71,T_1024_9,T_8192_73,T_4096_37,T_8192_75,T_2048_19,T_8192_77,T_4096_39,T_8192_79,T_512_5,T_8192_81,T_4096_41,T_8192_83,T_2048_21,T_8192_85,T_4096_43,T_8192_87,T_1024_11,T_8192_89,T_4096_45,T_8192_91,T_2048_23,T_8192_93,T_4096_47,T_8192_95,T_256_3,T_8192_97,T_4096_49,T_8192_99,T_2048_25,T_8192_101,T_4096_51,T_8192_103,T_1024_13,T_8192_105,T_4096_53,T_8192_107,T_2048_27,T_8192_109,T_4096_55,T_8192_111,T_512_7,T_8192_113,T_4096_57,T_8192_115,T_2048_29,T_8192_117,T_4096_59,T_8192_119,T_1024_15,T_8192_121,T_4096_61,T_8192_123,T_2048_31,T_8192_125,T_4096_63,T_8192_127,T_64_1,T_8192_129,T_4096_65,T_8192_131,T_2048_33,T_8192_133,T_4096_67,T_8192_135,T_
1024_17,T_8192_137,T_4096_69,T_8192_139,T_2048_35,T_8192_141,T_4096_71,T_8192_143,T_512_9,T_8192_145,T_4096_73,T_8192_147,T_2048_37,T_8192_149,T_4096_75,T_8192_151,T_1024_19,T_8192_153,T_4096_77,T_8192_155,T_2048_39,T_8192_157,T_4096_79,T_8192_159,T_256_5,T_8192_161,T_4096_81,T_8192_163,T_2048_41,T_8192_165,T_4096_83,T_8192_167,T_1024_21,T_8192_169,T_4096_85,T_8192_171,T_2048_43,T_8192_173,T_4096_87,T_8192_175,T_512_11,T_8192_177,T_4096_89,T_8192_179,T_2048_45,T_8192_181,T_4096_91,T_8192_183,T_1024_23,T_8192_185,T_4096_93,T_8192_187,T_2048_47,T_8192_189,T_4096_95,T_8192_191,T_128_3,T_8192_193,T_4096_97,T_8192_195,T_2048_49,T_8192_197,T_4096_99,T_8192_199,T_1024_25,T_8192_201,T_4096_101,T_8192_203,T_2048_51,T_8192_205,T_4096_103,T_8192_207,T_512_13,T_8192_209,T_4096_105,T_8192_211,T_2048_53,T_8192_213,T_4096_107,T_8192_215,T_1024_27,T_8192_217,T_4096_109,T_8192_219,T_2048_55,T_8192_221,T_4096_111,T_8192_223,T_256_7,T_8192_225,T_4096_113,T_8192_227,T_2048_57,T_8192_229,T_4096_115,T_8192_231,T_1024_29,T_8192_233,T_4096_117,T_8192_235,T_2048_59,T_8192_237,T_4096_119,T_8192_239,T_512_15,T_8192_241,T_4096_121,T_8192_243,T_2048_61,T_8192_245,T_4096_123,T_8192_247,T_1024_31,T_8192_249,T_4096_125,T_8192_251,T_2048_63,T_8192_253,T_4096_127,T_8192_255,T_32_1,T_8192_257,T_4096_129,T_8192_259,T_2048_65,T_8192_261,T_4096_131,T_8192_263,T_1024_33,T_8192_265,T_4096_133,T_8192_267,T_2048_67,T_8192_269,T_4096_135,T_8192_271,T_512_17,T_8192_273,T_4096_137,T_8192_275,T_2048_69,T_8192_277,T_4096_139,T_8192_279,T_1024_35,T_8192_281,T_4096_141,T_8192_283,T_2048_71,T_8192_285,T_4096_143,T_8192_287,T_256_9,T_8192_289,T_4096_145,T_8192_291,T_2048_73,T_8192_293,T_4096_147,T_8192_295,T_1024_37,T_8192_297,T_4096_149,T_8192_299,T_2048_75,T_8192_301,T_4096_151,T_8192_303,T_512_19,T_8192_305,T_4096_153,T_8192_307,T_2048_77,T_8192_309,T_4096_155,T_8192_311,T_1024_39,T_8192_313,T_4096_157,T_8192_315,T_2048_79,T_8192_317,T_4096_159,T_8192_319,T_128_5,T_8192_321,T_4096_161,T_8192_323,T_2048_81,T_8192_
325,T_4096_163,T_8192_327,T_1024_41,T_8192_329,T_4096_165,T_8192_331,T_2048_83,T_8192_333,T_4096_167,T_8192_335,T_512_21,T_8192_337,T_4096_169,T_8192_339,T_2048_85,T_8192_341,T_4096_171,T_8192_343,T_1024_43,T_8192_345,T_4096_173,T_8192_347,T_2048_87,T_8192_349,T_4096_175,T_8192_351,T_256_11,T_8192_353,T_4096_177,T_8192_355,T_2048_89,T_8192_357,T_4096_179,T_8192_359,T_1024_45,T_8192_361,T_4096_181,T_8192_363,T_2048_91,T_8192_365,T_4096_183,T_8192_367,T_512_23,T_8192_369,T_4096_185,T_8192_371,T_2048_93,T_8192_373,T_4096_187,T_8192_375,T_1024_47,T_8192_377,T_4096_189,T_8192_379,T_2048_95,T_8192_381,T_4096_191,T_8192_383,T_64_3,T_8192_385,T_4096_193,T_8192_387,T_2048_97,T_8192_389,T_4096_195,T_8192_391,T_1024_49,T_8192_393,T_4096_197,T_8192_395,T_2048_99,T_8192_397,T_4096_199,T_8192_399,T_512_25,T_8192_401,T_4096_201,T_8192_403,T_2048_101,T_8192_405,T_4096_203,T_8192_407,T_1024_51,T_8192_409,T_4096_205,T_8192_411,T_2048_103,T_8192_413,T_4096_207,T_8192_415,T_256_13,T_8192_417,T_4096_209,T_8192_419,T_2048_105,T_8192_421,T_4096_211,T_8192_423,T_1024_53,T_8192_425,T_4096_213,T_8192_427,T_2048_107,T_8192_429,T_4096_215,T_8192_431,T_512_27,T_8192_433,T_4096_217,T_8192_435,T_2048_109,T_8192_437,T_4096_219,T_8192_439,T_1024_55,T_8192_441,T_4096_221,T_8192_443,T_2048_111,T_8192_445,T_4096_223,T_8192_447,T_128_7,T_8192_449,T_4096_225,T_8192_451,T_2048_113,T_8192_453,T_4096_227,T_8192_455,T_1024_57,T_8192_457,T_4096_229,T_8192_459,T_2048_115,T_8192_461,T_4096_231,T_8192_463,T_512_29,T_8192_465,T_4096_233,T_8192_467,T_2048_117,T_8192_469,T_4096_235,T_8192_471,T_1024_59,T_8192_473,T_4096_237,T_8192_475,T_2048_119,T_8192_477,T_4096_239,T_8192_479,T_256_15,T_8192_481,T_4096_241,T_8192_483,T_2048_121,T_8192_485,T_4096_243,T_8192_487,T_1024_61,T_8192_489,T_4096_245,T_8192_491,T_2048_123,T_8192_493,T_4096_247,T_8192_495,T_512_31,T_8192_497,T_4096_249,T_8192_499,T_2048_125,T_8192_501,T_4096_251,T_8192_503,T_1024_63,T_8192_505,T_4096_253,T_8192_507,T_2048_127,T_8192_509,T_4096_255,T_8192_
511,T_2_0,T_8192_9,T_4096_9,T_8192_27,T_2048_9,T_8192_45,T_4096_27,T_8192_63,T_1024_9,T_8192_81,T_4096_45,T_8192_99,T_2048_27,T_8192_117,T_4096_63,T_8192_135,T_512_9,T_8192_153,T_4096_81,T_8192_171,T_2048_45,T_8192_189,T_4096_99,T_8192_207,T_1024_27,T_8192_225,T_4096_117,T_8192_243,T_2048_63,T_8192_261,T_4096_135,T_8192_279,T_256_9,T_8192_297,T_4096_153,T_8192_315,T_2048_81,T_8192_333,T_4096_171,T_8192_351,T_1024_45,T_8192_369,T_4096_189,T_8192_387,T_2048_99,T_8192_405,T_4096_207,T_8192_423,T_512_27,T_8192_441,T_4096_225,T_8192_459,T_2048_117,T_8192_477,T_4096_243,T_8192_495,T_1024_63,T_8192_513,T_4096_261,T_8192_531,T_2048_135,T_8192_549,T_4096_279,T_8192_567,T_128_9,T_8192_585,T_4096_297,T_8192_603,T_2048_153,T_8192_621,T_4096_315,T_8192_639,T_1024_81,T_8192_657,T_4096_333,T_8192_675,T_2048_171,T_8192_693,T_4096_351,T_8192_711,T_512_45,T_8192_729,T_4096_369,T_8192_747,T_2048_189,T_8192_765,T_4096_387,T_8192_783,T_1024_99,T_8192_801,T_4096_405,T_8192_819,T_2048_207,T_8192_837,T_4096_423,T_8192_855,T_256_27,T_8192_873,T_4096_441,T_8192_891,T_2048_225,T_8192_909,T_4096_459,T_8192_927,T_1024_117,T_8192_945,T_4096_477,T_8192_963,T_2048_243,T_8192_981,T_4096_495,T_8192_999,T_512_63,T_8192_1017,T_4096_513,T_8192_1035,T_2048_261,T_8192_1053,T_4096_531,T_8192_1071,T_1024_135,T_8192_1089,T_4096_549,T_8192_1107,T_2048_279,T_8192_1125,T_4096_567,T_8192_1143,T_64_9,T_8192_1161,T_4096_585,T_8192_1179,T_2048_297,T_8192_1197,T_4096_603,T_8192_1215,T_1024_153,T_8192_1233,T_4096_621,T_8192_1251,T_2048_315,T_8192_1269,T_4096_639,T_8192_1287,T_512_81,T_8192_1305,T_4096_657,T_8192_1323,T_2048_333,T_8192_1341,T_4096_675,T_8192_1359,T_1024_171,T_8192_1377,T_4096_693,T_8192_1395,T_2048_351,T_8192_1413,T_4096_711,T_8192_1431,T_256_45,T_8192_1449,T_4096_729,T_8192_1467,T_2048_369,T_8192_1485,T_4096_747,T_8192_1503,T_1024_189,T_8192_1521,T_4096_765,T_8192_1539,T_2048_387,T_8192_1557,T_4096_783,T_8192_1575,T_512_99,T_8192_1593,T_4096_801,T_8192_1611,T_2048_405,T_8192_1629,T_4096_819,T_8192_1
647,T_1024_207,T_8192_1665,T_4096_837,T_8192_1683,T_2048_423,T_8192_1701,T_4096_855,T_8192_1719,T_128_27,T_8192_1737,T_4096_873,T_8192_1755,T_2048_441,T_8192_1773,T_4096_891,T_8192_1791,T_1024_225,T_8192_1809,T_4096_909,T_8192_1827,T_2048_459,T_8192_1845,T_4096_927,T_8192_1863,T_512_117,T_8192_1881,T_4096_945,T_8192_1899,T_2048_477,T_8192_1917,T_4096_963,T_8192_1935,T_1024_243,T_8192_1953,T_4096_981,T_8192_1971,T_2048_495,T_8192_1989,T_4096_999,T_8192_2007,T_256_63,T_8192_2025,T_4096_1017,T_8192_2043,T_2048_513,T_8192_2061,T_4096_1035,T_8192_2079,T_1024_261,T_8192_2097,T_4096_1053,T_8192_2115,T_2048_531,T_8192_2133,T_4096_1071,T_8192_2151,T_512_135,T_8192_2169,T_4096_1089,T_8192_2187,T_2048_549,T_8192_2205,T_4096_1107,T_8192_2223,T_1024_279,T_8192_2241,T_4096_1125,T_8192_2259,T_2048_567,T_8192_2277,T_4096_1143,T_8192_2295,T_32_9,T_8192_2313,T_4096_1161,T_8192_2331,T_2048_585,T_8192_2349,T_4096_1179,T_8192_2367,T_1024_297,T_8192_2385,T_4096_1197,T_8192_2403,T_2048_603,T_8192_2421,T_4096_1215,T_8192_2439,T_512_153,T_8192_2457,T_4096_1233,T_8192_2475,T_2048_621,T_8192_2493,T_4096_1251,T_8192_2511,T_1024_315,T_8192_2529,T_4096_1269,T_8192_2547,T_2048_639,T_8192_2565,T_4096_1287,T_8192_2583,T_256_81,T_8192_2601,T_4096_1305,T_8192_2619,T_2048_657,T_8192_2637,T_4096_1323,T_8192_2655,T_1024_333,T_8192_2673,T_4096_1341,T_8192_2691,T_2048_675,T_8192_2709,T_4096_1359,T_8192_2727,T_512_171,T_8192_2745,T_4096_1377,T_8192_2763,T_2048_693,T_8192_2781,T_4096_1395,T_8192_2799,T_1024_351,T_8192_2817,T_4096_1413,T_8192_2835,T_2048_711,T_8192_2853,T_4096_1431,T_8192_2871,T_128_45,T_8192_2889,T_4096_1449,T_8192_2907,T_2048_729,T_8192_2925,T_4096_1467,T_8192_2943,T_1024_369,T_8192_2961,T_4096_1485,T_8192_2979,T_2048_747,T_8192_2997,T_4096_1503,T_8192_3015,T_512_189,T_8192_3033,T_4096_1521,T_8192_3051,T_2048_765,T_8192_3069,T_4096_1539,T_8192_3087,T_1024_387,T_8192_3105,T_4096_1557,T_8192_3123,T_2048_783,T_8192_3141,T_4096_1575,T_8192_3159,T_256_99,T_8192_3177,T_4096_1593,T_8192_3195,T_20
48_801,T_8192_3213,T_4096_1611,T_8192_3231,T_1024_405,T_8192_3249,T_4096_1629,T_8192_3267,T_2048_819,T_8192_3285,T_4096_1647,T_8192_3303,T_512_207,T_8192_3321,T_4096_1665,T_8192_3339,T_2048_837,T_8192_3357,T_4096_1683,T_8192_3375,T_1024_423,T_8192_3393,T_4096_1701,T_8192_3411,T_2048_855,T_8192_3429,T_4096_1719,T_8192_3447,T_64_27,T_8192_3465,T_4096_1737,T_8192_3483,T_2048_873,T_8192_3501,T_4096_1755,T_8192_3519,T_1024_441,T_8192_3537,T_4096_1773,T_8192_3555,T_2048_891,T_8192_3573,T_4096_1791,T_8192_3591,T_512_225,T_8192_3609,T_4096_1809,T_8192_3627,T_2048_909,T_8192_3645,T_4096_1827,T_8192_3663,T_1024_459,T_8192_3681,T_4096_1845,T_8192_3699,T_2048_927,T_8192_3717,T_4096_1863,T_8192_3735,T_256_117,T_8192_3753,T_4096_1881,T_8192_3771,T_2048_945,T_8192_3789,T_4096_1899,T_8192_3807,T_1024_477,T_8192_3825,T_4096_1917,T_8192_3843,T_2048_963,T_8192_3861,T_4096_1935,T_8192_3879,T_512_243,T_8192_3897,T_4096_1953,T_8192_3915,T_2048_981,T_8192_3933,T_4096_1971,T_8192_3951,T_1024_495,T_8192_3969,T_4096_1989,T_8192_3987,T_2048_999,T_8192_4005,T_4096_2007,T_8192_4023,T_128_63,T_8192_4041,T_4096_2025,T_8192_4059,T_2048_1017,T_8192_4077,T_4096_2043,T_8192_4095,T_1024_513,T_8192_4113,T_4096_2061,T_8192_4131,T_2048_1035,T_8192_4149,T_4096_2079,T_8192_4167,T_512_261,T_8192_4185,T_4096_2097,T_8192_4203,T_2048_1053,T_8192_4221,T_4096_2115,T_8192_4239,T_1024_531,T_8192_4257,T_4096_2133,T_8192_4275,T_2048_1071,T_8192_4293,T_4096_2151,T_8192_4311,T_256_135,T_8192_4329,T_4096_2169,T_8192_4347,T_2048_1089,T_8192_4365,T_4096_2187,T_8192_4383,T_1024_549,T_8192_4401,T_4096_2205,T_8192_4419,T_2048_1107,T_8192_4437,T_4096_2223,T_8192_4455,T_512_279,T_8192_4473,T_4096_2241,T_8192_4491,T_2048_1125,T_8192_4509,T_4096_2259,T_8192_4527,T_1024_567,T_8192_4545,T_4096_2277,T_8192_4563,T_2048_1143,T_8192_4581,T_4096_2295,T_8192_4599 +}; +static const __device__ double2 lut_dp_10_10000[1000*2] = { + 
T_2_0,T_10000_1,T_5000_1,T_10000_3,T_2500_1,T_2000_1,T_5000_3,T_10000_7,T_1250_1,T_10000_9,T_1000_1,T_10000_11,T_2500_3,T_10000_13,T_5000_7,T_2000_3,T_625_1,T_10000_17,T_5000_9,T_10000_19,T_500_1,T_10000_21,T_5000_11,T_10000_23,T_1250_3,T_400_1,T_5000_13,T_10000_27,T_2500_7,T_10000_29,T_1000_3,T_10000_31,T_625_2,T_10000_33,T_5000_17,T_2000_7,T_2500_9,T_10000_37,T_5000_19,T_10000_39,T_250_1,T_10000_41,T_5000_21,T_10000_43,T_2500_11,T_2000_9,T_5000_23,T_10000_47,T_625_3,T_10000_49,T_200_1,T_10000_51,T_2500_13,T_10000_53,T_5000_27,T_2000_11,T_1250_7,T_10000_57,T_5000_29,T_10000_59,T_500_3,T_10000_61,T_5000_31,T_10000_63,T_625_4,T_2000_13,T_5000_33,T_10000_67,T_2500_17,T_10000_69,T_1000_7,T_10000_71,T_1250_9,T_10000_73,T_5000_37,T_400_3,T_2500_19,T_10000_77,T_5000_39,T_10000_79,T_125_1,T_10000_81,T_5000_41,T_10000_83,T_2500_21,T_2000_17,T_5000_43,T_10000_87,T_1250_11,T_10000_89,T_1000_9,T_10000_91,T_2500_23,T_10000_93,T_5000_47,T_2000_19,T_625_6,T_10000_97,T_5000_49,T_10000_99,T_100_1,T_10000_101,T_5000_51,T_10000_103,T_1250_13,T_2000_21,T_5000_53,T_10000_107,T_2500_27,T_10000_109,T_1000_11,T_10000_111,T_625_7,T_10000_113,T_5000_57,T_2000_23,T_2500_29,T_10000_117,T_5000_59,T_10000_119,T_250_3,T_10000_121,T_5000_61,T_10000_123,T_2500_31,T_80_1,T_5000_63,T_10000_127,T_625_8,T_10000_129,T_1000_13,T_10000_131,T_2500_33,T_10000_133,T_5000_67,T_2000_27,T_1250_17,T_10000_137,T_5000_69,T_10000_139,T_500_7,T_10000_141,T_5000_71,T_10000_143,T_625_9,T_2000_29,T_5000_73,T_10000_147,T_2500_37,T_10000_149,T_200_3,T_10000_151,T_1250_19,T_10000_153,T_5000_77,T_2000_31,T_2500_39,T_10000_157,T_5000_79,T_10000_159,T_125_2,T_10000_161,T_5000_81,T_10000_163,T_2500_41,T_2000_33,T_5000_83,T_10000_167,T_1250_21,T_10000_169,T_1000_17,T_10000_171,T_2500_43,T_10000_173,T_5000_87,T_400_7,T_625_11,T_10000_177,T_5000_89,T_10000_179,T_500_9,T_10000_181,T_5000_91,T_10000_183,T_1250_23,T_2000_37,T_5000_93,T_10000_187,T_2500_47,T_10000_189,T_1000_19,T_10000_191,T_625_12,T_10000_193,T_5000_97,T_2000_39,T
_2500_49,T_10000_197,T_5000_99,T_10000_199,T_50_1,T_10000_201,T_5000_101,T_10000_203,T_2500_51,T_2000_41,T_5000_103,T_10000_207,T_625_13,T_10000_209,T_1000_21,T_10000_211,T_2500_53,T_10000_213,T_5000_107,T_2000_43,T_1250_27,T_10000_217,T_5000_109,T_10000_219,T_500_11,T_10000_221,T_5000_111,T_10000_223,T_625_14,T_400_9,T_5000_113,T_10000_227,T_2500_57,T_10000_229,T_1000_23,T_10000_231,T_1250_29,T_10000_233,T_5000_117,T_2000_47,T_2500_59,T_10000_237,T_5000_119,T_10000_239,T_125_3,T_10000_241,T_5000_121,T_10000_243,T_2500_61,T_2000_49,T_5000_123,T_10000_247,T_1250_31,T_10000_249,T_40_1,T_10000_251,T_2500_63,T_10000_253,T_5000_127,T_2000_51,T_625_16,T_10000_257,T_5000_129,T_10000_259,T_500_13,T_10000_261,T_5000_131,T_10000_263,T_1250_33,T_2000_53,T_5000_133,T_10000_267,T_2500_67,T_10000_269,T_1000_27,T_10000_271,T_625_17,T_10000_273,T_5000_137,T_400_11,T_2500_69,T_10000_277,T_5000_139,T_10000_279,T_250_7,T_10000_281,T_5000_141,T_10000_283,T_2500_71,T_2000_57,T_5000_143,T_10000_287,T_625_18,T_10000_289,T_1000_29,T_10000_291,T_2500_73,T_10000_293,T_5000_147,T_2000_59,T_1250_37,T_10000_297,T_5000_149,T_10000_299,T_100_3,T_10000_301,T_5000_151,T_10000_303,T_625_19,T_2000_61,T_5000_153,T_10000_307,T_2500_77,T_10000_309,T_1000_31,T_10000_311,T_1250_39,T_10000_313,T_5000_157,T_2000_63,T_2500_79,T_10000_317,T_5000_159,T_10000_319,T_125_4,T_10000_321,T_5000_161,T_10000_323,T_2500_81,T_400_13,T_5000_163,T_10000_327,T_1250_41,T_10000_329,T_1000_33,T_10000_331,T_2500_83,T_10000_333,T_5000_167,T_2000_67,T_625_21,T_10000_337,T_5000_169,T_10000_339,T_500_17,T_10000_341,T_5000_171,T_10000_343,T_1250_43,T_2000_69,T_5000_173,T_10000_347,T_2500_87,T_10000_349,T_200_7,T_10000_351,T_625_22,T_10000_353,T_5000_177,T_2000_71,T_2500_89,T_10000_357,T_5000_179,T_10000_359,T_250_9,T_10000_361,T_5000_181,T_10000_363,T_2500_91,T_2000_73,T_5000_183,T_10000_367,T_625_23,T_10000_369,T_1000_37,T_10000_371,T_2500_93,T_10000_373,T_5000_187,T_80_3,T_1250_47,T_10000_377,T_5000_189,T_10000_379,T_500_19,T_100
00_381,T_5000_191,T_10000_383,T_625_24,T_2000_77,T_5000_193,T_10000_387,T_2500_97,T_10000_389,T_1000_39,T_10000_391,T_1250_49,T_10000_393,T_5000_197,T_2000_79,T_2500_99,T_10000_397,T_5000_199,T_10000_399,T_25_1,T_10000_401,T_5000_201,T_10000_403,T_2500_101,T_2000_81,T_5000_203,T_10000_407,T_1250_51,T_10000_409,T_1000_41,T_10000_411,T_2500_103,T_10000_413,T_5000_207,T_2000_83,T_625_26,T_10000_417,T_5000_209,T_10000_419,T_500_21,T_10000_421,T_5000_211,T_10000_423,T_1250_53,T_400_17,T_5000_213,T_10000_427,T_2500_107,T_10000_429,T_1000_43,T_10000_431,T_625_27,T_10000_433,T_5000_217,T_2000_87,T_2500_109,T_10000_437,T_5000_219,T_10000_439,T_250_11,T_10000_441,T_5000_221,T_10000_443,T_2500_111,T_2000_89,T_5000_223,T_10000_447,T_625_28,T_10000_449,T_200_9,T_10000_451,T_2500_113,T_10000_453,T_5000_227,T_2000_91,T_1250_57,T_10000_457,T_5000_229,T_10000_459,T_500_23,T_10000_461,T_5000_231,T_10000_463,T_625_29,T_2000_93,T_5000_233,T_10000_467,T_2500_117,T_10000_469,T_1000_47,T_10000_471,T_1250_59,T_10000_473,T_5000_237,T_400_19,T_2500_119,T_10000_477,T_5000_239,T_10000_479,T_125_6,T_10000_481,T_5000_241,T_10000_483,T_2500_121,T_2000_97,T_5000_243,T_10000_487,T_1250_61,T_10000_489,T_1000_49,T_10000_491,T_2500_123,T_10000_493,T_5000_247,T_2000_99,T_625_31,T_10000_497,T_5000_249,T_10000_499,T_20_1,T_10000_501,T_5000_251,T_10000_503,T_1250_63,T_2000_101,T_5000_253,T_10000_507,T_2500_127,T_10000_509,T_1000_51,T_10000_511,T_625_32,T_10000_513,T_5000_257,T_2000_103,T_2500_129,T_10000_517,T_5000_259,T_10000_519,T_250_13,T_10000_521,T_5000_261,T_10000_523,T_2500_131,T_400_21,T_5000_263,T_10000_527,T_625_33,T_10000_529,T_1000_53,T_10000_531,T_2500_133,T_10000_533,T_5000_267,T_2000_107,T_1250_67,T_10000_537,T_5000_269,T_10000_539,T_500_27,T_10000_541,T_5000_271,T_10000_543,T_625_34,T_2000_109,T_5000_273,T_10000_547,T_2500_137,T_10000_549,T_200_11,T_10000_551,T_1250_69,T_10000_553,T_5000_277,T_2000_111,T_2500_139,T_10000_557,T_5000_279,T_10000_559,T_125_7,T_10000_561,T_5000_281,T_10000_563
,T_2500_141,T_2000_113,T_5000_283,T_10000_567,T_1250_71,T_10000_569,T_1000_57,T_10000_571,T_2500_143,T_10000_573,T_5000_287,T_400_23,T_625_36,T_10000_577,T_5000_289,T_10000_579,T_500_29,T_10000_581,T_5000_291,T_10000_583,T_1250_73,T_2000_117,T_5000_293,T_10000_587,T_2500_147,T_10000_589,T_1000_59,T_10000_591,T_625_37,T_10000_593,T_5000_297,T_2000_119,T_2500_149,T_10000_597,T_5000_299,T_10000_599,T_50_3,T_10000_601,T_5000_301,T_10000_603,T_2500_151,T_2000_121,T_5000_303,T_10000_607,T_625_38,T_10000_609,T_1000_61,T_10000_611,T_2500_153,T_10000_613,T_5000_307,T_2000_123,T_1250_77,T_10000_617,T_5000_309,T_10000_619,T_500_31,T_10000_621,T_5000_311,T_10000_623,T_625_39,T_16_1,T_5000_313,T_10000_627,T_2500_157,T_10000_629,T_1000_63,T_10000_631,T_1250_79,T_10000_633,T_5000_317,T_2000_127,T_2500_159,T_10000_637,T_5000_319,T_10000_639,T_125_8,T_10000_641,T_5000_321,T_10000_643,T_2500_161,T_2000_129,T_5000_323,T_10000_647,T_1250_81,T_10000_649,T_200_13,T_10000_651,T_2500_163,T_10000_653,T_5000_327,T_2000_131,T_625_41,T_10000_657,T_5000_329,T_10000_659,T_500_33,T_10000_661,T_5000_331,T_10000_663,T_1250_83,T_2000_133,T_5000_333,T_10000_667,T_2500_167,T_10000_669,T_1000_67,T_10000_671,T_625_42,T_10000_673,T_5000_337,T_400_27,T_2500_169,T_10000_677,T_5000_339,T_10000_679,T_250_17,T_10000_681,T_5000_341,T_10000_683,T_2500_171,T_2000_137,T_5000_343,T_10000_687,T_625_43,T_10000_689,T_1000_69,T_10000_691,T_2500_173,T_10000_693,T_5000_347,T_2000_139,T_1250_87,T_10000_697,T_5000_349,T_10000_699,T_100_7,T_10000_701,T_5000_351,T_10000_703,T_625_44,T_2000_141,T_5000_353,T_10000_707,T_2500_177,T_10000_709,T_1000_71,T_10000_711,T_1250_89,T_10000_713,T_5000_357,T_2000_143,T_2500_179,T_10000_717,T_5000_359,T_10000_719,T_125_9,T_10000_721,T_5000_361,T_10000_723,T_2500_181,T_400_29,T_5000_363,T_10000_727,T_1250_91,T_10000_729,T_1000_73,T_10000_731,T_2500_183,T_10000_733,T_5000_367,T_2000_147,T_625_46,T_10000_737,T_5000_369,T_10000_739,T_500_37,T_10000_741,T_5000_371,T_10000_743,T_1250_93,T_2000_
149,T_5000_373,T_10000_747,T_2500_187,T_10000_749,T_40_3,T_10000_751,T_625_47,T_10000_753,T_5000_377,T_2000_151,T_2500_189,T_10000_757,T_5000_379,T_10000_759,T_250_19,T_10000_761,T_5000_381,T_10000_763,T_2500_191,T_2000_153,T_5000_383,T_10000_767,T_625_48,T_10000_769,T_1000_77,T_10000_771,T_2500_193,T_10000_773,T_5000_387,T_400_31,T_1250_97,T_10000_777,T_5000_389,T_10000_779,T_500_39,T_10000_781,T_5000_391,T_10000_783,T_625_49,T_2000_157,T_5000_393,T_10000_787,T_2500_197,T_10000_789,T_1000_79,T_10000_791,T_1250_99,T_10000_793,T_5000_397,T_2000_159,T_2500_199,T_10000_797,T_5000_399,T_10000_799,T_25_2,T_10000_801,T_5000_401,T_10000_803,T_2500_201,T_2000_161,T_5000_403,T_10000_807,T_1250_101,T_10000_809,T_1000_81,T_10000_811,T_2500_203,T_10000_813,T_5000_407,T_2000_163,T_625_51,T_10000_817,T_5000_409,T_10000_819,T_500_41,T_10000_821,T_5000_411,T_10000_823,T_1250_103,T_400_33,T_5000_413,T_10000_827,T_2500_207,T_10000_829,T_1000_83,T_10000_831,T_625_52,T_10000_833,T_5000_417,T_2000_167,T_2500_209,T_10000_837,T_5000_419,T_10000_839,T_250_21,T_10000_841,T_5000_421,T_10000_843,T_2500_211,T_2000_169,T_5000_423,T_10000_847,T_625_53,T_10000_849,T_200_17,T_10000_851,T_2500_213,T_10000_853,T_5000_427,T_2000_171,T_1250_107,T_10000_857,T_5000_429,T_10000_859,T_500_43,T_10000_861,T_5000_431,T_10000_863,T_625_54,T_2000_173,T_5000_433,T_10000_867,T_2500_217,T_10000_869,T_1000_87,T_10000_871,T_1250_109,T_10000_873,T_5000_437,T_80_7,T_2500_219,T_10000_877,T_5000_439,T_10000_879,T_125_11,T_10000_881,T_5000_441,T_10000_883,T_2500_221,T_2000_177,T_5000_443,T_10000_887,T_1250_111,T_10000_889,T_1000_89,T_10000_891,T_2500_223,T_10000_893,T_5000_447,T_2000_179,T_625_56,T_10000_897,T_5000_449,T_10000_899,T_100_9,T_10000_901,T_5000_451,T_10000_903,T_1250_113,T_2000_181,T_5000_453,T_10000_907,T_2500_227,T_10000_909,T_1000_91,T_10000_911,T_625_57,T_10000_913,T_5000_457,T_2000_183,T_2500_229,T_10000_917,T_5000_459,T_10000_919,T_250_23,T_10000_921,T_5000_461,T_10000_923,T_2500_231,T_400_37,T_5000_4
63,T_10000_927,T_625_58,T_10000_929,T_1000_93,T_10000_931,T_2500_233,T_10000_933,T_5000_467,T_2000_187,T_1250_117,T_10000_937,T_5000_469,T_10000_939,T_500_47,T_10000_941,T_5000_471,T_10000_943,T_625_59,T_2000_189,T_5000_473,T_10000_947,T_2500_237,T_10000_949,T_200_19,T_10000_951,T_1250_119,T_10000_953,T_5000_477,T_2000_191,T_2500_239,T_10000_957,T_5000_479,T_10000_959,T_125_12,T_10000_961,T_5000_481,T_10000_963,T_2500_241,T_2000_193,T_5000_483,T_10000_967,T_1250_121,T_10000_969,T_1000_97,T_10000_971,T_2500_243,T_10000_973,T_5000_487,T_400_39,T_625_61,T_10000_977,T_5000_489,T_10000_979,T_500_49,T_10000_981,T_5000_491,T_10000_983,T_1250_123,T_2000_197,T_5000_493,T_10000_987,T_2500_247,T_10000_989,T_1000_99,T_10000_991,T_625_62,T_10000_993,T_5000_497,T_2000_199,T_2500_249,T_10000_997,T_5000_499,T_10000_999,T_2_0,T_5000_3,T_2500_3,T_5000_9,T_1250_3,T_1000_3,T_2500_9,T_5000_21,T_625_3,T_5000_27,T_500_3,T_5000_33,T_1250_9,T_5000_39,T_2500_21,T_1000_9,T_625_6,T_5000_51,T_2500_27,T_5000_57,T_250_3,T_5000_63,T_2500_33,T_5000_69,T_625_9,T_200_3,T_2500_39,T_5000_81,T_1250_21,T_5000_87,T_500_9,T_5000_93,T_625_12,T_5000_99,T_2500_51,T_1000_21,T_1250_27,T_5000_111,T_2500_57,T_5000_117,T_125_3,T_5000_123,T_2500_63,T_5000_129,T_1250_33,T_1000_27,T_2500_69,T_5000_141,T_625_18,T_5000_147,T_100_3,T_5000_153,T_1250_39,T_5000_159,T_2500_81,T_1000_33,T_625_21,T_5000_171,T_2500_87,T_5000_177,T_250_9,T_5000_183,T_2500_93,T_5000_189,T_625_24,T_1000_39,T_2500_99,T_5000_201,T_1250_51,T_5000_207,T_500_21,T_5000_213,T_625_27,T_5000_219,T_2500_111,T_200_9,T_1250_57,T_5000_231,T_2500_117,T_5000_237,T_125_6,T_5000_243,T_2500_123,T_5000_249,T_1250_63,T_1000_51,T_2500_129,T_5000_261,T_625_33,T_5000_267,T_500_27,T_5000_273,T_1250_69,T_5000_279,T_2500_141,T_1000_57,T_625_36,T_5000_291,T_2500_147,T_5000_297,T_50_3,T_5000_303,T_2500_153,T_5000_309,T_625_39,T_1000_63,T_2500_159,T_5000_321,T_1250_81,T_5000_327,T_500_33,T_5000_333,T_625_42,T_5000_339,T_2500_171,T_1000_69,T_1250_87,T_5000_351,T_2500_177,T_5
000_357,T_125_9,T_5000_363,T_2500_183,T_5000_369,T_1250_93,T_40_3,T_2500_189,T_5000_381,T_625_48,T_5000_387,T_500_39,T_5000_393,T_1250_99,T_5000_399,T_2500_201,T_1000_81,T_625_51,T_5000_411,T_2500_207,T_5000_417,T_250_21,T_5000_423,T_2500_213,T_5000_429,T_625_54,T_1000_87,T_2500_219,T_5000_441,T_1250_111,T_5000_447,T_100_9,T_5000_453,T_625_57,T_5000_459,T_2500_231,T_1000_93,T_1250_117,T_5000_471,T_2500_237,T_5000_477,T_125_12,T_5000_483,T_2500_243,T_5000_489,T_1250_123,T_1000_99,T_2500_249,T_5000_501,T_625_63,T_5000_507,T_500_51,T_5000_513,T_1250_129,T_5000_519,T_2500_261,T_200_21,T_625_66,T_5000_531,T_2500_267,T_5000_537,T_250_27,T_5000_543,T_2500_273,T_5000_549,T_625_69,T_1000_111,T_2500_279,T_5000_561,T_1250_141,T_5000_567,T_500_57,T_5000_573,T_625_72,T_5000_579,T_2500_291,T_1000_117,T_1250_147,T_5000_591,T_2500_297,T_5000_597,T_25_3,T_5000_603,T_2500_303,T_5000_609,T_1250_153,T_1000_123,T_2500_309,T_5000_621,T_625_78,T_5000_627,T_500_63,T_5000_633,T_1250_159,T_5000_639,T_2500_321,T_1000_129,T_625_81,T_5000_651,T_2500_327,T_5000_657,T_250_33,T_5000_663,T_2500_333,T_5000_669,T_625_84,T_200_27,T_2500_339,T_5000_681,T_1250_171,T_5000_687,T_500_69,T_5000_693,T_625_87,T_5000_699,T_2500_351,T_1000_141,T_1250_177,T_5000_711,T_2500_357,T_5000_717,T_125_18,T_5000_723,T_2500_363,T_5000_729,T_1250_183,T_1000_147,T_2500_369,T_5000_741,T_625_93,T_5000_747,T_20_3,T_5000_753,T_1250_189,T_5000_759,T_2500_381,T_1000_153,T_625_96,T_5000_771,T_2500_387,T_5000_777,T_250_39,T_5000_783,T_2500_393,T_5000_789,T_625_99,T_1000_159,T_2500_399,T_5000_801,T_1250_201,T_5000_807,T_500_81,T_5000_813,T_625_102,T_5000_819,T_2500_411,T_200_33,T_1250_207,T_5000_831,T_2500_417,T_5000_837,T_125_21,T_5000_843,T_2500_423,T_5000_849,T_1250_213,T_1000_171,T_2500_429,T_5000_861,T_625_108,T_5000_867,T_500_87,T_5000_873,T_1250_219,T_5000_879,T_2500_441,T_1000_177,T_625_111,T_5000_891,T_2500_447,T_5000_897,T_50_9,T_5000_903,T_2500_453,T_5000_909,T_625_114,T_1000_183,T_2500_459,T_5000_921,T_1250_231,T_5000_92
7,T_500_93,T_5000_933,T_625_117,T_5000_939,T_2500_471,T_1000_189,T_1250_237,T_5000_951,T_2500_477,T_5000_957,T_125_24,T_5000_963,T_2500_483,T_5000_969,T_1250_243,T_200_39,T_2500_489,T_5000_981,T_625_123,T_5000_987,T_500_99,T_5000_993,T_1250_249,T_5000_999,T_2500_501,T_1000_201,T_625_126,T_5000_1011,T_2500_507,T_5000_1017,T_250_51,T_5000_1023,T_2500_513,T_5000_1029,T_625_129,T_1000_207,T_2500_519,T_5000_1041,T_1250_261,T_5000_1047,T_100_21,T_5000_1053,T_625_132,T_5000_1059,T_2500_531,T_1000_213,T_1250_267,T_5000_1071,T_2500_537,T_5000_1077,T_125_27,T_5000_1083,T_2500_543,T_5000_1089,T_1250_273,T_1000_219,T_2500_549,T_5000_1101,T_625_138,T_5000_1107,T_500_111,T_5000_1113,T_1250_279,T_5000_1119,T_2500_561,T_40_9,T_625_141,T_5000_1131,T_2500_567,T_5000_1137,T_250_57,T_5000_1143,T_2500_573,T_5000_1149,T_625_144,T_1000_231,T_2500_579,T_5000_1161,T_1250_291,T_5000_1167,T_500_117,T_5000_1173,T_625_147,T_5000_1179,T_2500_591,T_1000_237,T_1250_297,T_5000_1191,T_2500_597,T_5000_1197,T_25_6,T_5000_1203,T_2500_603,T_5000_1209,T_1250_303,T_1000_243,T_2500_609,T_5000_1221,T_625_153,T_5000_1227,T_500_123,T_5000_1233,T_1250_309,T_5000_1239,T_2500_621,T_1000_249,T_625_156,T_5000_1251,T_2500_627,T_5000_1257,T_250_63,T_5000_1263,T_2500_633,T_5000_1269,T_625_159,T_200_51,T_2500_639,T_5000_1281,T_1250_321,T_5000_1287,T_500_129,T_5000_1293,T_625_162,T_5000_1299,T_2500_651,T_1000_261,T_1250_327,T_5000_1311,T_2500_657,T_5000_1317,T_125_33,T_5000_1323,T_2500_663,T_5000_1329,T_1250_333,T_1000_267,T_2500_669,T_5000_1341,T_625_168,T_5000_1347,T_100_27,T_5000_1353,T_1250_339,T_5000_1359,T_2500_681,T_1000_273,T_625_171,T_5000_1371,T_2500_687,T_5000_1377,T_250_69,T_5000_1383,T_2500_693,T_5000_1389,T_625_174,T_1000_279,T_2500_699,T_5000_1401,T_1250_351,T_5000_1407,T_500_141,T_5000_1413,T_625_177,T_5000_1419,T_2500_711,T_200_57,T_1250_357,T_5000_1431,T_2500_717,T_5000_1437,T_125_36,T_5000_1443,T_2500_723,T_5000_1449,T_1250_363,T_1000_291,T_2500_729,T_5000_1461,T_625_183,T_5000_1467,T_500_147,T_5000_
1473,T_1250_369,T_5000_1479,T_2500_741,T_1000_297,T_625_186,T_5000_1491,T_2500_747,T_5000_1497,T_10_3,T_5000_1503,T_2500_753,T_5000_1509,T_625_189,T_1000_303,T_2500_759,T_5000_1521,T_1250_381,T_5000_1527,T_500_153,T_5000_1533,T_625_192,T_5000_1539,T_2500_771,T_1000_309,T_1250_387,T_5000_1551,T_2500_777,T_5000_1557,T_125_39,T_5000_1563,T_2500_783,T_5000_1569,T_1250_393,T_200_63,T_2500_789,T_5000_1581,T_625_198,T_5000_1587,T_500_159,T_5000_1593,T_1250_399,T_5000_1599,T_2500_801,T_1000_321,T_625_201,T_5000_1611,T_2500_807,T_5000_1617,T_250_81,T_5000_1623,T_2500_813,T_5000_1629,T_625_204,T_1000_327,T_2500_819,T_5000_1641,T_1250_411,T_5000_1647,T_100_33,T_5000_1653,T_625_207,T_5000_1659,T_2500_831,T_1000_333,T_1250_417,T_5000_1671,T_2500_837,T_5000_1677,T_125_42,T_5000_1683,T_2500_843,T_5000_1689,T_1250_423,T_1000_339,T_2500_849,T_5000_1701,T_625_213,T_5000_1707,T_500_171,T_5000_1713,T_1250_429,T_5000_1719,T_2500_861,T_200_69,T_625_216,T_5000_1731,T_2500_867,T_5000_1737,T_250_87,T_5000_1743,T_2500_873,T_5000_1749,T_625_219,T_1000_351,T_2500_879,T_5000_1761,T_1250_441,T_5000_1767,T_500_177,T_5000_1773,T_625_222,T_5000_1779,T_2500_891,T_1000_357,T_1250_447,T_5000_1791,T_2500_897,T_5000_1797,T_25_9,T_5000_1803,T_2500_903,T_5000_1809,T_1250_453,T_1000_363,T_2500_909,T_5000_1821,T_625_228,T_5000_1827,T_500_183,T_5000_1833,T_1250_459,T_5000_1839,T_2500_921,T_1000_369,T_625_231,T_5000_1851,T_2500_927,T_5000_1857,T_250_93,T_5000_1863,T_2500_933,T_5000_1869,T_625_234,T_8_3,T_2500_939,T_5000_1881,T_1250_471,T_5000_1887,T_500_189,T_5000_1893,T_625_237,T_5000_1899,T_2500_951,T_1000_381,T_1250_477,T_5000_1911,T_2500_957,T_5000_1917,T_125_48,T_5000_1923,T_2500_963,T_5000_1929,T_1250_483,T_1000_387,T_2500_969,T_5000_1941,T_625_243,T_5000_1947,T_100_39,T_5000_1953,T_1250_489,T_5000_1959,T_2500_981,T_1000_393,T_625_246,T_5000_1971,T_2500_987,T_5000_1977,T_250_99,T_5000_1983,T_2500_993,T_5000_1989,T_625_249,T_1000_399,T_2500_999,T_5000_2001,T_1250_501,T_5000_2007,T_500_201,T_5000_2013,T_6
25_252,T_5000_2019,T_2500_1011,T_200_81,T_1250_507,T_5000_2031,T_2500_1017,T_5000_2037,T_125_51,T_5000_2043,T_2500_1023,T_5000_2049,T_1250_513,T_1000_411,T_2500_1029,T_5000_2061,T_625_258,T_5000_2067,T_500_207,T_5000_2073,T_1250_519,T_5000_2079,T_2500_1041,T_1000_417,T_625_261,T_5000_2091,T_2500_1047,T_5000_2097,T_50_21,T_5000_2103,T_2500_1053,T_5000_2109,T_625_264,T_1000_423,T_2500_1059,T_5000_2121,T_1250_531,T_5000_2127,T_500_213,T_5000_2133,T_625_267,T_5000_2139,T_2500_1071,T_1000_429,T_1250_537,T_5000_2151,T_2500_1077,T_5000_2157,T_125_54,T_5000_2163,T_2500_1083,T_5000_2169,T_1250_543,T_200_87,T_2500_1089,T_5000_2181,T_625_273,T_5000_2187,T_500_219,T_5000_2193,T_1250_549,T_5000_2199,T_2500_1101,T_1000_441,T_625_276,T_5000_2211,T_2500_1107,T_5000_2217,T_250_111,T_5000_2223,T_2500_1113,T_5000_2229,T_625_279,T_1000_447,T_2500_1119,T_5000_2241,T_1250_561,T_5000_2247,T_20_9,T_5000_2253,T_625_282,T_5000_2259,T_2500_1131,T_1000_453,T_1250_567,T_5000_2271,T_2500_1137,T_5000_2277,T_125_57,T_5000_2283,T_2500_1143,T_5000_2289,T_1250_573,T_1000_459,T_2500_1149,T_5000_2301,T_625_288,T_5000_2307,T_500_231,T_5000_2313,T_1250_579,T_5000_2319,T_2500_1161,T_200_93,T_625_291,T_5000_2331,T_2500_1167,T_5000_2337,T_250_117,T_5000_2343,T_2500_1173,T_5000_2349,T_625_294,T_1000_471,T_2500_1179,T_5000_2361,T_1250_591,T_5000_2367,T_500_237,T_5000_2373,T_625_297,T_5000_2379,T_2500_1191,T_1000_477,T_1250_597,T_5000_2391,T_2500_1197,T_5000_2397,T_25_12,T_5000_2403,T_2500_1203,T_5000_2409,T_1250_603,T_1000_483,T_2500_1209,T_5000_2421,T_625_303,T_5000_2427,T_500_243,T_5000_2433,T_1250_609,T_5000_2439,T_2500_1221,T_1000_489,T_625_306,T_5000_2451,T_2500_1227,T_5000_2457,T_250_123,T_5000_2463,T_2500_1233,T_5000_2469,T_625_309,T_200_99,T_2500_1239,T_5000_2481,T_1250_621,T_5000_2487,T_500_249,T_5000_2493,T_625_312,T_5000_2499,T_2500_1251,T_1000_501,T_1250_627,T_5000_2511,T_2500_1257,T_5000_2517,T_125_63,T_5000_2523,T_2500_1263,T_5000_2529,T_1250_633,T_1000_507,T_2500_1269,T_5000_2541,T_625_318,T_50
00_2547,T_100_51,T_5000_2553,T_1250_639,T_5000_2559,T_2500_1281,T_1000_513,T_625_321,T_5000_2571,T_2500_1287,T_5000_2577,T_250_129,T_5000_2583,T_2500_1293,T_5000_2589,T_625_324,T_1000_519,T_2500_1299,T_5000_2601,T_1250_651,T_5000_2607,T_500_261,T_5000_2613,T_625_327,T_5000_2619,T_2500_1311,T_40_21,T_1250_657,T_5000_2631,T_2500_1317,T_5000_2637,T_125_66,T_5000_2643,T_2500_1323,T_5000_2649,T_1250_663,T_1000_531,T_2500_1329,T_5000_2661,T_625_333,T_5000_2667,T_500_267,T_5000_2673,T_1250_669,T_5000_2679,T_2500_1341,T_1000_537,T_625_336,T_5000_2691,T_2500_1347,T_5000_2697,T_50_27,T_5000_2703,T_2500_1353,T_5000_2709,T_625_339,T_1000_543,T_2500_1359,T_5000_2721,T_1250_681,T_5000_2727,T_500_273,T_5000_2733,T_625_342,T_5000_2739,T_2500_1371,T_1000_549,T_1250_687,T_5000_2751,T_2500_1377,T_5000_2757,T_125_69,T_5000_2763,T_2500_1383,T_5000_2769,T_1250_693,T_200_111,T_2500_1389,T_5000_2781,T_625_348,T_5000_2787,T_500_279,T_5000_2793,T_1250_699,T_5000_2799,T_2500_1401,T_1000_561,T_625_351,T_5000_2811,T_2500_1407,T_5000_2817,T_250_141,T_5000_2823,T_2500_1413,T_5000_2829,T_625_354,T_1000_567,T_2500_1419,T_5000_2841,T_1250_711,T_5000_2847,T_100_57,T_5000_2853,T_625_357,T_5000_2859,T_2500_1431,T_1000_573,T_1250_717,T_5000_2871,T_2500_1437,T_5000_2877,T_125_72,T_5000_2883,T_2500_1443,T_5000_2889,T_1250_723,T_1000_579,T_2500_1449,T_5000_2901,T_625_363,T_5000_2907,T_500_291,T_5000_2913,T_1250_729,T_5000_2919,T_2500_1461,T_200_117,T_625_366,T_5000_2931,T_2500_1467,T_5000_2937,T_250_147,T_5000_2943,T_2500_1473,T_5000_2949,T_625_369,T_1000_591,T_2500_1479,T_5000_2961,T_1250_741,T_5000_2967,T_500_297,T_5000_2973,T_625_372,T_5000_2979,T_2500_1491,T_1000_597,T_1250_747,T_5000_2991,T_2500_1497,T_5000_2997 +}; +static const __device__ double2 lut_dp_16_16384[1024*2] = { + 
T_2_0,T_16384_1,T_8192_1,T_16384_3,T_4096_1,T_16384_5,T_8192_3,T_16384_7,T_2048_1,T_16384_9,T_8192_5,T_16384_11,T_4096_3,T_16384_13,T_8192_7,T_16384_15,T_1024_1,T_16384_17,T_8192_9,T_16384_19,T_4096_5,T_16384_21,T_8192_11,T_16384_23,T_2048_3,T_16384_25,T_8192_13,T_16384_27,T_4096_7,T_16384_29,T_8192_15,T_16384_31,T_512_1,T_16384_33,T_8192_17,T_16384_35,T_4096_9,T_16384_37,T_8192_19,T_16384_39,T_2048_5,T_16384_41,T_8192_21,T_16384_43,T_4096_11,T_16384_45,T_8192_23,T_16384_47,T_1024_3,T_16384_49,T_8192_25,T_16384_51,T_4096_13,T_16384_53,T_8192_27,T_16384_55,T_2048_7,T_16384_57,T_8192_29,T_16384_59,T_4096_15,T_16384_61,T_8192_31,T_16384_63,T_256_1,T_16384_65,T_8192_33,T_16384_67,T_4096_17,T_16384_69,T_8192_35,T_16384_71,T_2048_9,T_16384_73,T_8192_37,T_16384_75,T_4096_19,T_16384_77,T_8192_39,T_16384_79,T_1024_5,T_16384_81,T_8192_41,T_16384_83,T_4096_21,T_16384_85,T_8192_43,T_16384_87,T_2048_11,T_16384_89,T_8192_45,T_16384_91,T_4096_23,T_16384_93,T_8192_47,T_16384_95,T_512_3,T_16384_97,T_8192_49,T_16384_99,T_4096_25,T_16384_101,T_8192_51,T_16384_103,T_2048_13,T_16384_105,T_8192_53,T_16384_107,T_4096_27,T_16384_109,T_8192_55,T_16384_111,T_1024_7,T_16384_113,T_8192_57,T_16384_115,T_4096_29,T_16384_117,T_8192_59,T_16384_119,T_2048_15,T_16384_121,T_8192_61,T_16384_123,T_4096_31,T_16384_125,T_8192_63,T_16384_127,T_128_1,T_16384_129,T_8192_65,T_16384_131,T_4096_33,T_16384_133,T_8192_67,T_16384_135,T_2048_17,T_16384_137,T_8192_69,T_16384_139,T_4096_35,T_16384_141,T_8192_71,T_16384_143,T_1024_9,T_16384_145,T_8192_73,T_16384_147,T_4096_37,T_16384_149,T_8192_75,T_16384_151,T_2048_19,T_16384_153,T_8192_77,T_16384_155,T_4096_39,T_16384_157,T_8192_79,T_16384_159,T_512_5,T_16384_161,T_8192_81,T_16384_163,T_4096_41,T_16384_165,T_8192_83,T_16384_167,T_2048_21,T_16384_169,T_8192_85,T_16384_171,T_4096_43,T_16384_173,T_8192_87,T_16384_175,T_1024_11,T_16384_177,T_8192_89,T_16384_179,T_4096_45,T_16384_181,T_8192_91,T_16384_183,T_2048_23,T_16384_185,T_8192_93,T_16384_187,T_4096_47,T_16384_189
,T_8192_95,T_16384_191,T_256_3,T_16384_193,T_8192_97,T_16384_195,T_4096_49,T_16384_197,T_8192_99,T_16384_199,T_2048_25,T_16384_201,T_8192_101,T_16384_203,T_4096_51,T_16384_205,T_8192_103,T_16384_207,T_1024_13,T_16384_209,T_8192_105,T_16384_211,T_4096_53,T_16384_213,T_8192_107,T_16384_215,T_2048_27,T_16384_217,T_8192_109,T_16384_219,T_4096_55,T_16384_221,T_8192_111,T_16384_223,T_512_7,T_16384_225,T_8192_113,T_16384_227,T_4096_57,T_16384_229,T_8192_115,T_16384_231,T_2048_29,T_16384_233,T_8192_117,T_16384_235,T_4096_59,T_16384_237,T_8192_119,T_16384_239,T_1024_15,T_16384_241,T_8192_121,T_16384_243,T_4096_61,T_16384_245,T_8192_123,T_16384_247,T_2048_31,T_16384_249,T_8192_125,T_16384_251,T_4096_63,T_16384_253,T_8192_127,T_16384_255,T_64_1,T_16384_257,T_8192_129,T_16384_259,T_4096_65,T_16384_261,T_8192_131,T_16384_263,T_2048_33,T_16384_265,T_8192_133,T_16384_267,T_4096_67,T_16384_269,T_8192_135,T_16384_271,T_1024_17,T_16384_273,T_8192_137,T_16384_275,T_4096_69,T_16384_277,T_8192_139,T_16384_279,T_2048_35,T_16384_281,T_8192_141,T_16384_283,T_4096_71,T_16384_285,T_8192_143,T_16384_287,T_512_9,T_16384_289,T_8192_145,T_16384_291,T_4096_73,T_16384_293,T_8192_147,T_16384_295,T_2048_37,T_16384_297,T_8192_149,T_16384_299,T_4096_75,T_16384_301,T_8192_151,T_16384_303,T_1024_19,T_16384_305,T_8192_153,T_16384_307,T_4096_77,T_16384_309,T_8192_155,T_16384_311,T_2048_39,T_16384_313,T_8192_157,T_16384_315,T_4096_79,T_16384_317,T_8192_159,T_16384_319,T_256_5,T_16384_321,T_8192_161,T_16384_323,T_4096_81,T_16384_325,T_8192_163,T_16384_327,T_2048_41,T_16384_329,T_8192_165,T_16384_331,T_4096_83,T_16384_333,T_8192_167,T_16384_335,T_1024_21,T_16384_337,T_8192_169,T_16384_339,T_4096_85,T_16384_341,T_8192_171,T_16384_343,T_2048_43,T_16384_345,T_8192_173,T_16384_347,T_4096_87,T_16384_349,T_8192_175,T_16384_351,T_512_11,T_16384_353,T_8192_177,T_16384_355,T_4096_89,T_16384_357,T_8192_179,T_16384_359,T_2048_45,T_16384_361,T_8192_181,T_16384_363,T_4096_91,T_16384_365,T_8192_183,T_16384_367,T_1024_23,T
_16384_369,T_8192_185,T_16384_371,T_4096_93,T_16384_373,T_8192_187,T_16384_375,T_2048_47,T_16384_377,T_8192_189,T_16384_379,T_4096_95,T_16384_381,T_8192_191,T_16384_383,T_128_3,T_16384_385,T_8192_193,T_16384_387,T_4096_97,T_16384_389,T_8192_195,T_16384_391,T_2048_49,T_16384_393,T_8192_197,T_16384_395,T_4096_99,T_16384_397,T_8192_199,T_16384_399,T_1024_25,T_16384_401,T_8192_201,T_16384_403,T_4096_101,T_16384_405,T_8192_203,T_16384_407,T_2048_51,T_16384_409,T_8192_205,T_16384_411,T_4096_103,T_16384_413,T_8192_207,T_16384_415,T_512_13,T_16384_417,T_8192_209,T_16384_419,T_4096_105,T_16384_421,T_8192_211,T_16384_423,T_2048_53,T_16384_425,T_8192_213,T_16384_427,T_4096_107,T_16384_429,T_8192_215,T_16384_431,T_1024_27,T_16384_433,T_8192_217,T_16384_435,T_4096_109,T_16384_437,T_8192_219,T_16384_439,T_2048_55,T_16384_441,T_8192_221,T_16384_443,T_4096_111,T_16384_445,T_8192_223,T_16384_447,T_256_7,T_16384_449,T_8192_225,T_16384_451,T_4096_113,T_16384_453,T_8192_227,T_16384_455,T_2048_57,T_16384_457,T_8192_229,T_16384_459,T_4096_115,T_16384_461,T_8192_231,T_16384_463,T_1024_29,T_16384_465,T_8192_233,T_16384_467,T_4096_117,T_16384_469,T_8192_235,T_16384_471,T_2048_59,T_16384_473,T_8192_237,T_16384_475,T_4096_119,T_16384_477,T_8192_239,T_16384_479,T_512_15,T_16384_481,T_8192_241,T_16384_483,T_4096_121,T_16384_485,T_8192_243,T_16384_487,T_2048_61,T_16384_489,T_8192_245,T_16384_491,T_4096_123,T_16384_493,T_8192_247,T_16384_495,T_1024_31,T_16384_497,T_8192_249,T_16384_499,T_4096_125,T_16384_501,T_8192_251,T_16384_503,T_2048_63,T_16384_505,T_8192_253,T_16384_507,T_4096_127,T_16384_509,T_8192_255,T_16384_511,T_32_1,T_16384_513,T_8192_257,T_16384_515,T_4096_129,T_16384_517,T_8192_259,T_16384_519,T_2048_65,T_16384_521,T_8192_261,T_16384_523,T_4096_131,T_16384_525,T_8192_263,T_16384_527,T_1024_33,T_16384_529,T_8192_265,T_16384_531,T_4096_133,T_16384_533,T_8192_267,T_16384_535,T_2048_67,T_16384_537,T_8192_269,T_16384_539,T_4096_135,T_16384_541,T_8192_271,T_16384_543,T_512_17,T_16384_545,T
_8192_273,T_16384_547,T_4096_137,T_16384_549,T_8192_275,T_16384_551,T_2048_69,T_16384_553,T_8192_277,T_16384_555,T_4096_139,T_16384_557,T_8192_279,T_16384_559,T_1024_35,T_16384_561,T_8192_281,T_16384_563,T_4096_141,T_16384_565,T_8192_283,T_16384_567,T_2048_71,T_16384_569,T_8192_285,T_16384_571,T_4096_143,T_16384_573,T_8192_287,T_16384_575,T_256_9,T_16384_577,T_8192_289,T_16384_579,T_4096_145,T_16384_581,T_8192_291,T_16384_583,T_2048_73,T_16384_585,T_8192_293,T_16384_587,T_4096_147,T_16384_589,T_8192_295,T_16384_591,T_1024_37,T_16384_593,T_8192_297,T_16384_595,T_4096_149,T_16384_597,T_8192_299,T_16384_599,T_2048_75,T_16384_601,T_8192_301,T_16384_603,T_4096_151,T_16384_605,T_8192_303,T_16384_607,T_512_19,T_16384_609,T_8192_305,T_16384_611,T_4096_153,T_16384_613,T_8192_307,T_16384_615,T_2048_77,T_16384_617,T_8192_309,T_16384_619,T_4096_155,T_16384_621,T_8192_311,T_16384_623,T_1024_39,T_16384_625,T_8192_313,T_16384_627,T_4096_157,T_16384_629,T_8192_315,T_16384_631,T_2048_79,T_16384_633,T_8192_317,T_16384_635,T_4096_159,T_16384_637,T_8192_319,T_16384_639,T_128_5,T_16384_641,T_8192_321,T_16384_643,T_4096_161,T_16384_645,T_8192_323,T_16384_647,T_2048_81,T_16384_649,T_8192_325,T_16384_651,T_4096_163,T_16384_653,T_8192_327,T_16384_655,T_1024_41,T_16384_657,T_8192_329,T_16384_659,T_4096_165,T_16384_661,T_8192_331,T_16384_663,T_2048_83,T_16384_665,T_8192_333,T_16384_667,T_4096_167,T_16384_669,T_8192_335,T_16384_671,T_512_21,T_16384_673,T_8192_337,T_16384_675,T_4096_169,T_16384_677,T_8192_339,T_16384_679,T_2048_85,T_16384_681,T_8192_341,T_16384_683,T_4096_171,T_16384_685,T_8192_343,T_16384_687,T_1024_43,T_16384_689,T_8192_345,T_16384_691,T_4096_173,T_16384_693,T_8192_347,T_16384_695,T_2048_87,T_16384_697,T_8192_349,T_16384_699,T_4096_175,T_16384_701,T_8192_351,T_16384_703,T_256_11,T_16384_705,T_8192_353,T_16384_707,T_4096_177,T_16384_709,T_8192_355,T_16384_711,T_2048_89,T_16384_713,T_8192_357,T_16384_715,T_4096_179,T_16384_717,T_8192_359,T_16384_719,T_1024_45,T_16384_721,T_8192
_361,T_16384_723,T_4096_181,T_16384_725,T_8192_363,T_16384_727,T_2048_91,T_16384_729,T_8192_365,T_16384_731,T_4096_183,T_16384_733,T_8192_367,T_16384_735,T_512_23,T_16384_737,T_8192_369,T_16384_739,T_4096_185,T_16384_741,T_8192_371,T_16384_743,T_2048_93,T_16384_745,T_8192_373,T_16384_747,T_4096_187,T_16384_749,T_8192_375,T_16384_751,T_1024_47,T_16384_753,T_8192_377,T_16384_755,T_4096_189,T_16384_757,T_8192_379,T_16384_759,T_2048_95,T_16384_761,T_8192_381,T_16384_763,T_4096_191,T_16384_765,T_8192_383,T_16384_767,T_64_3,T_16384_769,T_8192_385,T_16384_771,T_4096_193,T_16384_773,T_8192_387,T_16384_775,T_2048_97,T_16384_777,T_8192_389,T_16384_779,T_4096_195,T_16384_781,T_8192_391,T_16384_783,T_1024_49,T_16384_785,T_8192_393,T_16384_787,T_4096_197,T_16384_789,T_8192_395,T_16384_791,T_2048_99,T_16384_793,T_8192_397,T_16384_795,T_4096_199,T_16384_797,T_8192_399,T_16384_799,T_512_25,T_16384_801,T_8192_401,T_16384_803,T_4096_201,T_16384_805,T_8192_403,T_16384_807,T_2048_101,T_16384_809,T_8192_405,T_16384_811,T_4096_203,T_16384_813,T_8192_407,T_16384_815,T_1024_51,T_16384_817,T_8192_409,T_16384_819,T_4096_205,T_16384_821,T_8192_411,T_16384_823,T_2048_103,T_16384_825,T_8192_413,T_16384_827,T_4096_207,T_16384_829,T_8192_415,T_16384_831,T_256_13,T_16384_833,T_8192_417,T_16384_835,T_4096_209,T_16384_837,T_8192_419,T_16384_839,T_2048_105,T_16384_841,T_8192_421,T_16384_843,T_4096_211,T_16384_845,T_8192_423,T_16384_847,T_1024_53,T_16384_849,T_8192_425,T_16384_851,T_4096_213,T_16384_853,T_8192_427,T_16384_855,T_2048_107,T_16384_857,T_8192_429,T_16384_859,T_4096_215,T_16384_861,T_8192_431,T_16384_863,T_512_27,T_16384_865,T_8192_433,T_16384_867,T_4096_217,T_16384_869,T_8192_435,T_16384_871,T_2048_109,T_16384_873,T_8192_437,T_16384_875,T_4096_219,T_16384_877,T_8192_439,T_16384_879,T_1024_55,T_16384_881,T_8192_441,T_16384_883,T_4096_221,T_16384_885,T_8192_443,T_16384_887,T_2048_111,T_16384_889,T_8192_445,T_16384_891,T_4096_223,T_16384_893,T_8192_447,T_16384_895,T_128_7,T_16384_897,T_8192_
449,T_16384_899,T_4096_225,T_16384_901,T_8192_451,T_16384_903,T_2048_113,T_16384_905,T_8192_453,T_16384_907,T_4096_227,T_16384_909,T_8192_455,T_16384_911,T_1024_57,T_16384_913,T_8192_457,T_16384_915,T_4096_229,T_16384_917,T_8192_459,T_16384_919,T_2048_115,T_16384_921,T_8192_461,T_16384_923,T_4096_231,T_16384_925,T_8192_463,T_16384_927,T_512_29,T_16384_929,T_8192_465,T_16384_931,T_4096_233,T_16384_933,T_8192_467,T_16384_935,T_2048_117,T_16384_937,T_8192_469,T_16384_939,T_4096_235,T_16384_941,T_8192_471,T_16384_943,T_1024_59,T_16384_945,T_8192_473,T_16384_947,T_4096_237,T_16384_949,T_8192_475,T_16384_951,T_2048_119,T_16384_953,T_8192_477,T_16384_955,T_4096_239,T_16384_957,T_8192_479,T_16384_959,T_256_15,T_16384_961,T_8192_481,T_16384_963,T_4096_241,T_16384_965,T_8192_483,T_16384_967,T_2048_121,T_16384_969,T_8192_485,T_16384_971,T_4096_243,T_16384_973,T_8192_487,T_16384_975,T_1024_61,T_16384_977,T_8192_489,T_16384_979,T_4096_245,T_16384_981,T_8192_491,T_16384_983,T_2048_123,T_16384_985,T_8192_493,T_16384_987,T_4096_247,T_16384_989,T_8192_495,T_16384_991,T_512_31,T_16384_993,T_8192_497,T_16384_995,T_4096_249,T_16384_997,T_8192_499,T_16384_999,T_2048_125,T_16384_1001,T_8192_501,T_16384_1003,T_4096_251,T_16384_1005,T_8192_503,T_16384_1007,T_1024_63,T_16384_1009,T_8192_505,T_16384_1011,T_4096_253,T_16384_1013,T_8192_507,T_16384_1015,T_2048_127,T_16384_1017,T_8192_509,T_16384_1019,T_4096_255,T_16384_1021,T_8192_511,T_16384_1023,T_2_0,T_16384_9,T_8192_9,T_16384_27,T_4096_9,T_16384_45,T_8192_27,T_16384_63,T_2048_9,T_16384_81,T_8192_45,T_16384_99,T_4096_27,T_16384_117,T_8192_63,T_16384_135,T_1024_9,T_16384_153,T_8192_81,T_16384_171,T_4096_45,T_16384_189,T_8192_99,T_16384_207,T_2048_27,T_16384_225,T_8192_117,T_16384_243,T_4096_63,T_16384_261,T_8192_135,T_16384_279,T_512_9,T_16384_297,T_8192_153,T_16384_315,T_4096_81,T_16384_333,T_8192_171,T_16384_351,T_2048_45,T_16384_369,T_8192_189,T_16384_387,T_4096_99,T_16384_405,T_8192_207,T_16384_423,T_1024_27,T_16384_441,T_8192_225,T_1638
4_459,T_4096_117,T_16384_477,T_8192_243,T_16384_495,T_2048_63,T_16384_513,T_8192_261,T_16384_531,T_4096_135,T_16384_549,T_8192_279,T_16384_567,T_256_9,T_16384_585,T_8192_297,T_16384_603,T_4096_153,T_16384_621,T_8192_315,T_16384_639,T_2048_81,T_16384_657,T_8192_333,T_16384_675,T_4096_171,T_16384_693,T_8192_351,T_16384_711,T_1024_45,T_16384_729,T_8192_369,T_16384_747,T_4096_189,T_16384_765,T_8192_387,T_16384_783,T_2048_99,T_16384_801,T_8192_405,T_16384_819,T_4096_207,T_16384_837,T_8192_423,T_16384_855,T_512_27,T_16384_873,T_8192_441,T_16384_891,T_4096_225,T_16384_909,T_8192_459,T_16384_927,T_2048_117,T_16384_945,T_8192_477,T_16384_963,T_4096_243,T_16384_981,T_8192_495,T_16384_999,T_1024_63,T_16384_1017,T_8192_513,T_16384_1035,T_4096_261,T_16384_1053,T_8192_531,T_16384_1071,T_2048_135,T_16384_1089,T_8192_549,T_16384_1107,T_4096_279,T_16384_1125,T_8192_567,T_16384_1143,T_128_9,T_16384_1161,T_8192_585,T_16384_1179,T_4096_297,T_16384_1197,T_8192_603,T_16384_1215,T_2048_153,T_16384_1233,T_8192_621,T_16384_1251,T_4096_315,T_16384_1269,T_8192_639,T_16384_1287,T_1024_81,T_16384_1305,T_8192_657,T_16384_1323,T_4096_333,T_16384_1341,T_8192_675,T_16384_1359,T_2048_171,T_16384_1377,T_8192_693,T_16384_1395,T_4096_351,T_16384_1413,T_8192_711,T_16384_1431,T_512_45,T_16384_1449,T_8192_729,T_16384_1467,T_4096_369,T_16384_1485,T_8192_747,T_16384_1503,T_2048_189,T_16384_1521,T_8192_765,T_16384_1539,T_4096_387,T_16384_1557,T_8192_783,T_16384_1575,T_1024_99,T_16384_1593,T_8192_801,T_16384_1611,T_4096_405,T_16384_1629,T_8192_819,T_16384_1647,T_2048_207,T_16384_1665,T_8192_837,T_16384_1683,T_4096_423,T_16384_1701,T_8192_855,T_16384_1719,T_256_27,T_16384_1737,T_8192_873,T_16384_1755,T_4096_441,T_16384_1773,T_8192_891,T_16384_1791,T_2048_225,T_16384_1809,T_8192_909,T_16384_1827,T_4096_459,T_16384_1845,T_8192_927,T_16384_1863,T_1024_117,T_16384_1881,T_8192_945,T_16384_1899,T_4096_477,T_16384_1917,T_8192_963,T_16384_1935,T_2048_243,T_16384_1953,T_8192_981,T_16384_1971,T_4096_495,T_16384_1989,T_8
192_999,T_16384_2007,T_512_63,T_16384_2025,T_8192_1017,T_16384_2043,T_4096_513,T_16384_2061,T_8192_1035,T_16384_2079,T_2048_261,T_16384_2097,T_8192_1053,T_16384_2115,T_4096_531,T_16384_2133,T_8192_1071,T_16384_2151,T_1024_135,T_16384_2169,T_8192_1089,T_16384_2187,T_4096_549,T_16384_2205,T_8192_1107,T_16384_2223,T_2048_279,T_16384_2241,T_8192_1125,T_16384_2259,T_4096_567,T_16384_2277,T_8192_1143,T_16384_2295,T_64_9,T_16384_2313,T_8192_1161,T_16384_2331,T_4096_585,T_16384_2349,T_8192_1179,T_16384_2367,T_2048_297,T_16384_2385,T_8192_1197,T_16384_2403,T_4096_603,T_16384_2421,T_8192_1215,T_16384_2439,T_1024_153,T_16384_2457,T_8192_1233,T_16384_2475,T_4096_621,T_16384_2493,T_8192_1251,T_16384_2511,T_2048_315,T_16384_2529,T_8192_1269,T_16384_2547,T_4096_639,T_16384_2565,T_8192_1287,T_16384_2583,T_512_81,T_16384_2601,T_8192_1305,T_16384_2619,T_4096_657,T_16384_2637,T_8192_1323,T_16384_2655,T_2048_333,T_16384_2673,T_8192_1341,T_16384_2691,T_4096_675,T_16384_2709,T_8192_1359,T_16384_2727,T_1024_171,T_16384_2745,T_8192_1377,T_16384_2763,T_4096_693,T_16384_2781,T_8192_1395,T_16384_2799,T_2048_351,T_16384_2817,T_8192_1413,T_16384_2835,T_4096_711,T_16384_2853,T_8192_1431,T_16384_2871,T_256_45,T_16384_2889,T_8192_1449,T_16384_2907,T_4096_729,T_16384_2925,T_8192_1467,T_16384_2943,T_2048_369,T_16384_2961,T_8192_1485,T_16384_2979,T_4096_747,T_16384_2997,T_8192_1503,T_16384_3015,T_1024_189,T_16384_3033,T_8192_1521,T_16384_3051,T_4096_765,T_16384_3069,T_8192_1539,T_16384_3087,T_2048_387,T_16384_3105,T_8192_1557,T_16384_3123,T_4096_783,T_16384_3141,T_8192_1575,T_16384_3159,T_512_99,T_16384_3177,T_8192_1593,T_16384_3195,T_4096_801,T_16384_3213,T_8192_1611,T_16384_3231,T_2048_405,T_16384_3249,T_8192_1629,T_16384_3267,T_4096_819,T_16384_3285,T_8192_1647,T_16384_3303,T_1024_207,T_16384_3321,T_8192_1665,T_16384_3339,T_4096_837,T_16384_3357,T_8192_1683,T_16384_3375,T_2048_423,T_16384_3393,T_8192_1701,T_16384_3411,T_4096_855,T_16384_3429,T_8192_1719,T_16384_3447,T_128_27,T_16384_3465,T_8192_17
37,T_16384_3483,T_4096_873,T_16384_3501,T_8192_1755,T_16384_3519,T_2048_441,T_16384_3537,T_8192_1773,T_16384_3555,T_4096_891,T_16384_3573,T_8192_1791,T_16384_3591,T_1024_225,T_16384_3609,T_8192_1809,T_16384_3627,T_4096_909,T_16384_3645,T_8192_1827,T_16384_3663,T_2048_459,T_16384_3681,T_8192_1845,T_16384_3699,T_4096_927,T_16384_3717,T_8192_1863,T_16384_3735,T_512_117,T_16384_3753,T_8192_1881,T_16384_3771,T_4096_945,T_16384_3789,T_8192_1899,T_16384_3807,T_2048_477,T_16384_3825,T_8192_1917,T_16384_3843,T_4096_963,T_16384_3861,T_8192_1935,T_16384_3879,T_1024_243,T_16384_3897,T_8192_1953,T_16384_3915,T_4096_981,T_16384_3933,T_8192_1971,T_16384_3951,T_2048_495,T_16384_3969,T_8192_1989,T_16384_3987,T_4096_999,T_16384_4005,T_8192_2007,T_16384_4023,T_256_63,T_16384_4041,T_8192_2025,T_16384_4059,T_4096_1017,T_16384_4077,T_8192_2043,T_16384_4095,T_2048_513,T_16384_4113,T_8192_2061,T_16384_4131,T_4096_1035,T_16384_4149,T_8192_2079,T_16384_4167,T_1024_261,T_16384_4185,T_8192_2097,T_16384_4203,T_4096_1053,T_16384_4221,T_8192_2115,T_16384_4239,T_2048_531,T_16384_4257,T_8192_2133,T_16384_4275,T_4096_1071,T_16384_4293,T_8192_2151,T_16384_4311,T_512_135,T_16384_4329,T_8192_2169,T_16384_4347,T_4096_1089,T_16384_4365,T_8192_2187,T_16384_4383,T_2048_549,T_16384_4401,T_8192_2205,T_16384_4419,T_4096_1107,T_16384_4437,T_8192_2223,T_16384_4455,T_1024_279,T_16384_4473,T_8192_2241,T_16384_4491,T_4096_1125,T_16384_4509,T_8192_2259,T_16384_4527,T_2048_567,T_16384_4545,T_8192_2277,T_16384_4563,T_4096_1143,T_16384_4581,T_8192_2295,T_16384_4599,T_32_9,T_16384_4617,T_8192_2313,T_16384_4635,T_4096_1161,T_16384_4653,T_8192_2331,T_16384_4671,T_2048_585,T_16384_4689,T_8192_2349,T_16384_4707,T_4096_1179,T_16384_4725,T_8192_2367,T_16384_4743,T_1024_297,T_16384_4761,T_8192_2385,T_16384_4779,T_4096_1197,T_16384_4797,T_8192_2403,T_16384_4815,T_2048_603,T_16384_4833,T_8192_2421,T_16384_4851,T_4096_1215,T_16384_4869,T_8192_2439,T_16384_4887,T_512_153,T_16384_4905,T_8192_2457,T_16384_4923,T_4096_1233,T_16384_4
941,T_8192_2475,T_16384_4959,T_2048_621,T_16384_4977,T_8192_2493,T_16384_4995,T_4096_1251,T_16384_5013,T_8192_2511,T_16384_5031,T_1024_315,T_16384_5049,T_8192_2529,T_16384_5067,T_4096_1269,T_16384_5085,T_8192_2547,T_16384_5103,T_2048_639,T_16384_5121,T_8192_2565,T_16384_5139,T_4096_1287,T_16384_5157,T_8192_2583,T_16384_5175,T_256_81,T_16384_5193,T_8192_2601,T_16384_5211,T_4096_1305,T_16384_5229,T_8192_2619,T_16384_5247,T_2048_657,T_16384_5265,T_8192_2637,T_16384_5283,T_4096_1323,T_16384_5301,T_8192_2655,T_16384_5319,T_1024_333,T_16384_5337,T_8192_2673,T_16384_5355,T_4096_1341,T_16384_5373,T_8192_2691,T_16384_5391,T_2048_675,T_16384_5409,T_8192_2709,T_16384_5427,T_4096_1359,T_16384_5445,T_8192_2727,T_16384_5463,T_512_171,T_16384_5481,T_8192_2745,T_16384_5499,T_4096_1377,T_16384_5517,T_8192_2763,T_16384_5535,T_2048_693,T_16384_5553,T_8192_2781,T_16384_5571,T_4096_1395,T_16384_5589,T_8192_2799,T_16384_5607,T_1024_351,T_16384_5625,T_8192_2817,T_16384_5643,T_4096_1413,T_16384_5661,T_8192_2835,T_16384_5679,T_2048_711,T_16384_5697,T_8192_2853,T_16384_5715,T_4096_1431,T_16384_5733,T_8192_2871,T_16384_5751,T_128_45,T_16384_5769,T_8192_2889,T_16384_5787,T_4096_1449,T_16384_5805,T_8192_2907,T_16384_5823,T_2048_729,T_16384_5841,T_8192_2925,T_16384_5859,T_4096_1467,T_16384_5877,T_8192_2943,T_16384_5895,T_1024_369,T_16384_5913,T_8192_2961,T_16384_5931,T_4096_1485,T_16384_5949,T_8192_2979,T_16384_5967,T_2048_747,T_16384_5985,T_8192_2997,T_16384_6003,T_4096_1503,T_16384_6021,T_8192_3015,T_16384_6039,T_512_189,T_16384_6057,T_8192_3033,T_16384_6075,T_4096_1521,T_16384_6093,T_8192_3051,T_16384_6111,T_2048_765,T_16384_6129,T_8192_3069,T_16384_6147,T_4096_1539,T_16384_6165,T_8192_3087,T_16384_6183,T_1024_387,T_16384_6201,T_8192_3105,T_16384_6219,T_4096_1557,T_16384_6237,T_8192_3123,T_16384_6255,T_2048_783,T_16384_6273,T_8192_3141,T_16384_6291,T_4096_1575,T_16384_6309,T_8192_3159,T_16384_6327,T_256_99,T_16384_6345,T_8192_3177,T_16384_6363,T_4096_1593,T_16384_6381,T_8192_3195,T_16384_6399
,T_2048_801,T_16384_6417,T_8192_3213,T_16384_6435,T_4096_1611,T_16384_6453,T_8192_3231,T_16384_6471,T_1024_405,T_16384_6489,T_8192_3249,T_16384_6507,T_4096_1629,T_16384_6525,T_8192_3267,T_16384_6543,T_2048_819,T_16384_6561,T_8192_3285,T_16384_6579,T_4096_1647,T_16384_6597,T_8192_3303,T_16384_6615,T_512_207,T_16384_6633,T_8192_3321,T_16384_6651,T_4096_1665,T_16384_6669,T_8192_3339,T_16384_6687,T_2048_837,T_16384_6705,T_8192_3357,T_16384_6723,T_4096_1683,T_16384_6741,T_8192_3375,T_16384_6759,T_1024_423,T_16384_6777,T_8192_3393,T_16384_6795,T_4096_1701,T_16384_6813,T_8192_3411,T_16384_6831,T_2048_855,T_16384_6849,T_8192_3429,T_16384_6867,T_4096_1719,T_16384_6885,T_8192_3447,T_16384_6903,T_64_27,T_16384_6921,T_8192_3465,T_16384_6939,T_4096_1737,T_16384_6957,T_8192_3483,T_16384_6975,T_2048_873,T_16384_6993,T_8192_3501,T_16384_7011,T_4096_1755,T_16384_7029,T_8192_3519,T_16384_7047,T_1024_441,T_16384_7065,T_8192_3537,T_16384_7083,T_4096_1773,T_16384_7101,T_8192_3555,T_16384_7119,T_2048_891,T_16384_7137,T_8192_3573,T_16384_7155,T_4096_1791,T_16384_7173,T_8192_3591,T_16384_7191,T_512_225,T_16384_7209,T_8192_3609,T_16384_7227,T_4096_1809,T_16384_7245,T_8192_3627,T_16384_7263,T_2048_909,T_16384_7281,T_8192_3645,T_16384_7299,T_4096_1827,T_16384_7317,T_8192_3663,T_16384_7335,T_1024_459,T_16384_7353,T_8192_3681,T_16384_7371,T_4096_1845,T_16384_7389,T_8192_3699,T_16384_7407,T_2048_927,T_16384_7425,T_8192_3717,T_16384_7443,T_4096_1863,T_16384_7461,T_8192_3735,T_16384_7479,T_256_117,T_16384_7497,T_8192_3753,T_16384_7515,T_4096_1881,T_16384_7533,T_8192_3771,T_16384_7551,T_2048_945,T_16384_7569,T_8192_3789,T_16384_7587,T_4096_1899,T_16384_7605,T_8192_3807,T_16384_7623,T_1024_477,T_16384_7641,T_8192_3825,T_16384_7659,T_4096_1917,T_16384_7677,T_8192_3843,T_16384_7695,T_2048_963,T_16384_7713,T_8192_3861,T_16384_7731,T_4096_1935,T_16384_7749,T_8192_3879,T_16384_7767,T_512_243,T_16384_7785,T_8192_3897,T_16384_7803,T_4096_1953,T_16384_7821,T_8192_3915,T_16384_7839,T_2048_981,T_16384_7857,T_
8192_3933,T_16384_7875,T_4096_1971,T_16384_7893,T_8192_3951,T_16384_7911,T_1024_495,T_16384_7929,T_8192_3969,T_16384_7947,T_4096_1989,T_16384_7965,T_8192_3987,T_16384_7983,T_2048_999,T_16384_8001,T_8192_4005,T_16384_8019,T_4096_2007,T_16384_8037,T_8192_4023,T_16384_8055,T_128_63,T_16384_8073,T_8192_4041,T_16384_8091,T_4096_2025,T_16384_8109,T_8192_4059,T_16384_8127,T_2048_1017,T_16384_8145,T_8192_4077,T_16384_8163,T_4096_2043,T_16384_8181,T_8192_4095,T_16384_8199,T_1024_513,T_16384_8217,T_8192_4113,T_16384_8235,T_4096_2061,T_16384_8253,T_8192_4131,T_16384_8271,T_2048_1035,T_16384_8289,T_8192_4149,T_16384_8307,T_4096_2079,T_16384_8325,T_8192_4167,T_16384_8343,T_512_261,T_16384_8361,T_8192_4185,T_16384_8379,T_4096_2097,T_16384_8397,T_8192_4203,T_16384_8415,T_2048_1053,T_16384_8433,T_8192_4221,T_16384_8451,T_4096_2115,T_16384_8469,T_8192_4239,T_16384_8487,T_1024_531,T_16384_8505,T_8192_4257,T_16384_8523,T_4096_2133,T_16384_8541,T_8192_4275,T_16384_8559,T_2048_1071,T_16384_8577,T_8192_4293,T_16384_8595,T_4096_2151,T_16384_8613,T_8192_4311,T_16384_8631,T_256_135,T_16384_8649,T_8192_4329,T_16384_8667,T_4096_2169,T_16384_8685,T_8192_4347,T_16384_8703,T_2048_1089,T_16384_8721,T_8192_4365,T_16384_8739,T_4096_2187,T_16384_8757,T_8192_4383,T_16384_8775,T_1024_549,T_16384_8793,T_8192_4401,T_16384_8811,T_4096_2205,T_16384_8829,T_8192_4419,T_16384_8847,T_2048_1107,T_16384_8865,T_8192_4437,T_16384_8883,T_4096_2223,T_16384_8901,T_8192_4455,T_16384_8919,T_512_279,T_16384_8937,T_8192_4473,T_16384_8955,T_4096_2241,T_16384_8973,T_8192_4491,T_16384_8991,T_2048_1125,T_16384_9009,T_8192_4509,T_16384_9027,T_4096_2259,T_16384_9045,T_8192_4527,T_16384_9063,T_1024_567,T_16384_9081,T_8192_4545,T_16384_9099,T_4096_2277,T_16384_9117,T_8192_4563,T_16384_9135,T_2048_1143,T_16384_9153,T_8192_4581,T_16384_9171,T_4096_2295,T_16384_9189,T_8192_4599,T_16384_9207 +}; diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..d808f2d7210ab --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_fwd.hpp.inc @@ -0,0 +1,583 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 64, 0, 746> + >; +}; + +template<> struct block_fft_record<14, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 64, 0, 747> + >; +}; + +template<> struct block_fft_record<15, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 64, 0, 748> + >; +}; + +template<> struct block_fft_record<17, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 64, 0, 749> + >; +}; + +template<> struct block_fft_record<18, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< 
+ block_fft_implementation<18, 18, 1, 64, 0, 750> + >; +}; + +template<> struct block_fft_record<19, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 64, 0, 751> + >; +}; + +template<> struct block_fft_record<20, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 64, 0, 752> + >; +}; + +template<> struct block_fft_record<21, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 64, 0, 753> + >; +}; + +template<> struct block_fft_record<22, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 64, 0, 754> + >; +}; + +template<> struct block_fft_record<23, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 64, 0, 755> + >; +}; + +template<> struct block_fft_record<24, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 64, 0, 756> + >; +}; + +template<> struct block_fft_record<26, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 64, 0, 757> + >; +}; + +template<> struct block_fft_record<28, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 64, 0, 758> + >; +}; + +template<> struct block_fft_record<29, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<29, 29, 1, 64, 0, 759> + >; +}; + +template<> struct block_fft_record<30, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 64, 0, 760> + >; +}; + +template<> struct block_fft_record<31, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 64, 0, 761> + >; +}; + +template<> struct block_fft_record<2, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 128, 0, 762> + >; +}; + +template<> struct block_fft_record<4, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 128, 0, 763>, + block_fft_implementation< 2, 2, 2, 256, 8, 764>, + block_fft_implementation< 2, 2, 2, 256, 16, 765> + >; +}; + +template<> struct block_fft_record<8, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 64, 0, 766>, + block_fft_implementation< 4, 4, 2, 256, 16, 767>, + block_fft_implementation< 4, 4, 2, 192, 32, 768>, + block_fft_implementation< 2, 2, 4, 160, 16, 769>, + block_fft_implementation< 2, 2, 4, 256, 32, 770> + >; +}; + +template<> struct block_fft_record<16, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 224, 0, 771>, + block_fft_implementation< 4, 4, 4, 128, 32, 772>, + block_fft_implementation< 4, 4, 4, 192, 64, 773>, + block_fft_implementation< 8, 8, 2, 256, 32, 774>, + block_fft_implementation< 8, 8, 2, 224, 64, 775>, + block_fft_implementation< 2, 2, 8, 128, 32, 776>, + block_fft_implementation< 2, 2, 8, 128, 64, 777> + >; +}; + 
+template<> struct block_fft_record<32, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 64, 0, 778>, + block_fft_implementation< 8, 8, 4, 96, 128, 779>, + block_fft_implementation< 8, 8, 4, 192, 64, 780>, + block_fft_implementation< 4, 4, 8, 96, 128, 781>, + block_fft_implementation< 4, 4, 8, 128, 64, 782>, + block_fft_implementation< 2, 2, 16, 40, 128, 783>, + block_fft_implementation<16, 16, 2, 128, 128, 784>, + block_fft_implementation< 2, 2, 16, 52, 64, 785>, + block_fft_implementation<16, 16, 2, 256, 64, 786> + >; +}; + +template<> struct block_fft_record<64, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 48, 256, 787>, + block_fft_implementation< 8, 8, 8, 26, 128, 788>, + block_fft_implementation< 4, 4, 16, 18, 256, 789>, + block_fft_implementation< 4, 4, 16, 30, 128, 790>, + block_fft_implementation<16, 16, 4, 50, 256, 791>, + block_fft_implementation<16, 16, 4, 58, 128, 792>, + block_fft_implementation<32, 32, 2, 46, 256, 793>, + block_fft_implementation< 2, 2, 32, 28, 256, 794>, + block_fft_implementation< 2, 2, 32, 16, 128, 795>, + block_fft_implementation<32, 32, 2, 128, 128, 796> + >; +}; + +template<> struct block_fft_record<128, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 32, 256, 797>, + block_fft_implementation< 8, 8, 16, 16, 512, 798>, + block_fft_implementation<16, 16, 8, 26, 512, 799>, + block_fft_implementation< 8, 8, 16, 32, 256, 800>, + block_fft_implementation< 4, 4, 32, 8, 512, 801>, + block_fft_implementation< 4, 4, 32, 14, 256, 802>, + block_fft_implementation<32, 32, 4, 8, 512, 803>, + block_fft_implementation<32, 32, 4, 62, 256, 804>, + block_fft_implementation< 2, 2, 64, 12, 512, 805>, + 
block_fft_implementation< 2, 2, 64, 12, 256, 806> + >; +}; + +template<> struct block_fft_record<256, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 4, 512, 807>, + block_fft_implementation< 8, 8, 32, 8, 1024, 808>, + block_fft_implementation< 8, 8, 32, 8, 512, 809>, + block_fft_implementation<16, 16, 16, 6, 1024, 810>, + block_fft_implementation<32, 32, 8, 8, 1024, 811>, + block_fft_implementation< 4, 4, 64, 4, 1024, 812>, + block_fft_implementation< 4, 4, 64, 8, 512, 813>, + block_fft_implementation<32, 32, 8, 32, 512, 814>, + block_fft_implementation< 2, 2, 128, 6, 1024, 815>, + block_fft_implementation< 2, 2, 128, 8, 512, 816> + >; +}; + +template<> struct block_fft_record<512, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 4, 2048, 817>, + block_fft_implementation< 8, 8, 64, 4, 1024, 818>, + block_fft_implementation<16, 16, 32, 2, 1024, 819>, + block_fft_implementation<32, 32, 16, 4, 2048, 820>, + block_fft_implementation<32, 32, 16, 4, 1024, 821>, + block_fft_implementation<16, 16, 32, 8, 2048, 822>, + block_fft_implementation< 4, 4, 128, 4, 2048, 823>, + block_fft_implementation< 4, 4, 128, 4, 1024, 824>, + block_fft_implementation< 2, 2, 256, 2, 2048, 825>, + block_fft_implementation< 2, 2, 256, 4, 1024, 826> + >; +}; + +template<> struct block_fft_record<1024, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 64, 2, 2048, 827>, + block_fft_implementation< 8, 8, 128, 2, 4096, 828>, + block_fft_implementation<32, 32, 32, 8, 4096, 829>, + block_fft_implementation< 8, 8, 128, 2, 2048, 830>, + block_fft_implementation<32, 32, 32, 8, 2048, 831>, + block_fft_implementation<16, 16, 64, 4, 4096, 832>, + block_fft_implementation< 4, 4, 256, 2, 
4096, 833>, + block_fft_implementation< 4, 4, 256, 2, 2048, 834>, + block_fft_implementation< 2, 2, 512, 2, 4096, 835>, + block_fft_implementation< 2, 2, 512, 2, 2048, 836> + >; +}; + +template<> struct block_fft_record<2048, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 2, 4096, 837>, + block_fft_implementation< 8, 8, 256, 2, 8192, 838>, + block_fft_implementation< 8, 8, 256, 2, 4096, 839>, + block_fft_implementation<16, 16, 128, 2, 8192, 840>, + block_fft_implementation<32, 32, 64, 4, 8192, 841>, + block_fft_implementation<32, 32, 64, 4, 4096, 842>, + block_fft_implementation< 4, 4, 512, 2, 8192, 843>, + block_fft_implementation< 4, 4, 512, 2, 4096, 844>, + block_fft_implementation< 2, 2, 1024, 2, 8192, 845>, + block_fft_implementation< 2, 2, 1024, 2, 4096, 846> + >; +}; + +template<> struct block_fft_record<4096, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 2, 8192, 847>, + block_fft_implementation< 8, 8, 512, 2, 16384, 848>, + block_fft_implementation< 8, 8, 512, 2, 8192, 849>, + block_fft_implementation<16, 16, 256, 2, 16384, 850>, + block_fft_implementation<32, 32, 128, 2, 16384, 851>, + block_fft_implementation<32, 32, 128, 2, 8192, 852>, + block_fft_implementation< 4, 4, 1024, 2, 16384, 853>, + block_fft_implementation< 4, 4, 1024, 2, 8192, 854> + >; +}; + +template<> struct block_fft_record<8192, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 2, 16384, 855>, + block_fft_implementation<32, 32, 256, 2, 32768, 856>, + block_fft_implementation<32, 32, 256, 2, 16384, 857>, + block_fft_implementation< 8, 8, 1024, 2, 32768, 858>, + block_fft_implementation<16, 16, 512, 2, 32768, 859>, + block_fft_implementation< 8, 8, 1024, 2, 
16384, 860> + >; +}; + +template<> struct block_fft_record<16384, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 2, 32768, 861>, + block_fft_implementation<32, 32, 512, 2, 32768, 862> + >; +}; + +template<> struct block_fft_record<3, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 128, 0, 863> + >; +}; + +template<> struct block_fft_record<9, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 128, 0, 864>, + block_fft_implementation< 3, 3, 3, 128, 36, 865>, + block_fft_implementation< 3, 3, 3, 256, 18, 866> + >; +}; + +template<> struct block_fft_record<27, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 64, 0, 867>, + block_fft_implementation< 9, 9, 3, 42, 54, 868>, + block_fft_implementation< 9, 9, 3, 20, 108, 869>, + block_fft_implementation< 3, 3, 9, 52, 108, 870>, + block_fft_implementation< 3, 3, 9, 48, 54, 871> + >; +}; + +template<> struct block_fft_record<81, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 28, 324, 872>, + block_fft_implementation< 9, 9, 9, 14, 162, 873>, + block_fft_implementation<27, 27, 3, 20, 162, 874>, + block_fft_implementation<27, 27, 3, 18, 324, 875>, + block_fft_implementation< 3, 3, 27, 14, 324, 876>, + block_fft_implementation< 3, 3, 27, 18, 162, 877> + >; +}; + +template<> struct block_fft_record<243, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 28, 972, 878>, + 
block_fft_implementation<27, 27, 9, 28, 486, 879>, + block_fft_implementation< 9, 9, 27, 18, 972, 880>, + block_fft_implementation< 9, 9, 27, 18, 486, 881>, + block_fft_implementation< 3, 3, 81, 6, 972, 882>, + block_fft_implementation< 3, 3, 81, 6, 486, 883> + >; +}; + +template<> struct block_fft_record<729, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 28, 2916, 884>, + block_fft_implementation<27, 27, 27, 18, 1458, 885>, + block_fft_implementation< 9, 9, 81, 6, 2916, 886>, + block_fft_implementation< 9, 9, 81, 6, 1458, 887>, + block_fft_implementation< 3, 3, 243, 2, 2916, 888>, + block_fft_implementation< 3, 3, 243, 2, 1458, 889> + >; +}; + +template<> struct block_fft_record<2187, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 2, 8748, 890>, + block_fft_implementation<27, 27, 81, 6, 8748, 891>, + block_fft_implementation< 9, 9, 243, 2, 4374, 892>, + block_fft_implementation<27, 27, 81, 6, 4374, 893>, + block_fft_implementation< 3, 3, 729, 2, 8748, 894>, + block_fft_implementation< 3, 3, 729, 2, 4374, 895> + >; +}; + +template<> struct block_fft_record<6561, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 2, 13122, 896>, + block_fft_implementation<27, 27, 243, 2, 26244, 897>, + block_fft_implementation< 9, 9, 729, 2, 13122, 898>, + block_fft_implementation< 9, 9, 729, 2, 26244, 899> + >; +}; + +template<> struct block_fft_record<5, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 900> + >; +}; + +template<> struct block_fft_record<25, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = 
true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 64, 0, 901>, + block_fft_implementation< 5, 5, 5, 256, 100, 902>, + block_fft_implementation< 5, 5, 5, 50, 50, 903> + >; +}; + +template<> struct block_fft_record<125, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 50, 250, 904>, + block_fft_implementation<25, 25, 5, 50, 500, 905>, + block_fft_implementation< 5, 5, 25, 10, 500, 906>, + block_fft_implementation< 5, 5, 25, 10, 250, 907> + >; +}; + +template<> struct block_fft_record<625, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 10, 1250, 908>, + block_fft_implementation<25, 25, 25, 10, 2500, 909>, + block_fft_implementation< 5, 5, 125, 2, 2500, 910>, + block_fft_implementation< 5, 5, 125, 2, 1250, 911> + >; +}; + +template<> struct block_fft_record<3125, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 2, 6250, 912>, + block_fft_implementation<25, 25, 125, 6, 12500, 913>, + block_fft_implementation< 5, 5, 625, 2, 12500, 914>, + block_fft_implementation< 5, 5, 625, 2, 6250, 915> + >; +}; + +template<> struct block_fft_record<15625, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 2, 31250, 916> + >; +}; + +template<> struct block_fft_record<7, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 128, 0, 917> + >; +}; + +template<> struct block_fft_record<49, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 
7, 18, 196, 918>, + block_fft_implementation< 7, 7, 7, 18, 98, 919> + >; +}; + +template<> struct block_fft_record<343, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 10, 1372, 920>, + block_fft_implementation< 7, 7, 49, 10, 686, 921> + >; +}; + +template<> struct block_fft_record<2401, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 2, 9604, 922>, + block_fft_implementation< 7, 7, 343, 2, 4802, 923> + >; +}; + +template<> struct block_fft_record<11, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 192, 0, 924> + >; +}; + +template<> struct block_fft_record<121, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 46, 484, 925>, + block_fft_implementation<11, 11, 11, 46, 242, 926> + >; +}; + +template<> struct block_fft_record<1331, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 2, 2662, 927>, + block_fft_implementation<11, 11, 121, 2, 5324, 928> + >; +}; + +template<> struct block_fft_record<6, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 64, 0, 929> + >; +}; + +template<> struct block_fft_record<36, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 128, 72, 930>, + block_fft_implementation< 6, 6, 6, 128, 144, 931> + >; +}; + +template<> struct block_fft_record<216, half, fft_type::c2c, fft_direction::forward, 700> { 
+ static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 14, 864, 932>, + block_fft_implementation< 6, 6, 36, 14, 432, 933> + >; +}; + +template<> struct block_fft_record<1296, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 4, 5184, 934>, + block_fft_implementation< 6, 6, 216, 2, 2592, 935> + >; +}; + +template<> struct block_fft_record<10, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 64, 0, 936> + >; +}; + +template<> struct block_fft_record<100, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 50, 200, 937>, + block_fft_implementation<10, 10, 10, 50, 400, 938> + >; +}; + +template<> struct block_fft_record<1000, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 10, 4000, 939>, + block_fft_implementation<10, 10, 100, 10, 2000, 940> + >; +}; + +template<> struct block_fft_record<10000, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 2, 40000, 941>, + block_fft_implementation<10, 10, 1000, 2, 20000, 942> + >; +}; + +template<> struct block_fft_record<12, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 128, 0, 943> + >; +}; + +template<> struct block_fft_record<144, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 42, 576, 944>, + block_fft_implementation<12, 12, 
12, 42, 288, 945> + >; +}; + +template<> struct block_fft_record<1728, half, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 2, 6912, 946>, + block_fft_implementation<12, 12, 144, 2, 3456, 947> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..e31609364b463 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp16_inv.hpp.inc @@ -0,0 +1,583 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +template<> struct block_fft_record<13, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 64, 0, 948> + >; +}; + +template<> struct block_fft_record<14, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 64, 0, 949> + >; +}; + +template<> struct block_fft_record<15, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 64, 0, 950> + >; +}; + +template<> struct block_fft_record<17, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 64, 0, 951> + >; +}; + +template<> struct block_fft_record<18, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 64, 0, 952> + >; +}; + +template<> struct block_fft_record<19, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 64, 0, 953> + >; +}; + +template<> struct block_fft_record<20, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 64, 0, 954> + >; +}; + +template<> struct block_fft_record<21, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 64, 0, 955> + >; +}; + +template<> struct block_fft_record<22, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 64, 0, 956> + >; +}; + +template<> 
struct block_fft_record<23, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 64, 0, 957> + >; +}; + +template<> struct block_fft_record<24, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 64, 0, 958> + >; +}; + +template<> struct block_fft_record<26, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 64, 0, 959> + >; +}; + +template<> struct block_fft_record<28, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 64, 0, 960> + >; +}; + +template<> struct block_fft_record<29, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 64, 0, 961> + >; +}; + +template<> struct block_fft_record<30, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 64, 0, 962> + >; +}; + +template<> struct block_fft_record<31, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 64, 0, 963> + >; +}; + +template<> struct block_fft_record<2, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 128, 0, 964> + >; +}; + +template<> struct block_fft_record<4, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 128, 0, 965>, + block_fft_implementation< 2, 2, 2, 256, 
8, 966>, + block_fft_implementation< 2, 2, 2, 256, 16, 967> + >; +}; + +template<> struct block_fft_record<8, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 64, 0, 968>, + block_fft_implementation< 4, 4, 2, 256, 16, 969>, + block_fft_implementation< 4, 4, 2, 192, 32, 970>, + block_fft_implementation< 2, 2, 4, 160, 16, 971>, + block_fft_implementation< 2, 2, 4, 256, 32, 972> + >; +}; + +template<> struct block_fft_record<16, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 224, 0, 973>, + block_fft_implementation< 4, 4, 4, 128, 32, 974>, + block_fft_implementation< 4, 4, 4, 192, 64, 975>, + block_fft_implementation< 8, 8, 2, 256, 32, 976>, + block_fft_implementation< 8, 8, 2, 224, 64, 977>, + block_fft_implementation< 2, 2, 8, 128, 32, 978>, + block_fft_implementation< 2, 2, 8, 128, 64, 979> + >; +}; + +template<> struct block_fft_record<32, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 64, 0, 980>, + block_fft_implementation< 8, 8, 4, 96, 128, 981>, + block_fft_implementation< 8, 8, 4, 192, 64, 982>, + block_fft_implementation< 4, 4, 8, 96, 128, 983>, + block_fft_implementation< 4, 4, 8, 128, 64, 984>, + block_fft_implementation< 2, 2, 16, 40, 128, 985>, + block_fft_implementation<16, 16, 2, 128, 128, 986>, + block_fft_implementation< 2, 2, 16, 52, 64, 987>, + block_fft_implementation<16, 16, 2, 256, 64, 988> + >; +}; + +template<> struct block_fft_record<64, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 48, 256, 989>, + block_fft_implementation< 8, 8, 8, 26, 128, 990>, + block_fft_implementation< 4, 4, 16, 18, 256, 991>, + 
block_fft_implementation< 4, 4, 16, 30, 128, 992>, + block_fft_implementation<16, 16, 4, 50, 256, 993>, + block_fft_implementation<16, 16, 4, 58, 128, 994>, + block_fft_implementation<32, 32, 2, 46, 256, 995>, + block_fft_implementation< 2, 2, 32, 28, 256, 996>, + block_fft_implementation< 2, 2, 32, 16, 128, 997>, + block_fft_implementation<32, 32, 2, 128, 128, 998> + >; +}; + +template<> struct block_fft_record<128, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 32, 256, 999>, + block_fft_implementation< 8, 8, 16, 16, 512, 1000>, + block_fft_implementation<16, 16, 8, 26, 512, 1001>, + block_fft_implementation< 8, 8, 16, 32, 256, 1002>, + block_fft_implementation< 4, 4, 32, 8, 512, 1003>, + block_fft_implementation< 4, 4, 32, 14, 256, 1004>, + block_fft_implementation<32, 32, 4, 8, 512, 1005>, + block_fft_implementation<32, 32, 4, 62, 256, 1006>, + block_fft_implementation< 2, 2, 64, 12, 512, 1007>, + block_fft_implementation< 2, 2, 64, 12, 256, 1008> + >; +}; + +template<> struct block_fft_record<256, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 4, 512, 1009>, + block_fft_implementation< 8, 8, 32, 8, 1024, 1010>, + block_fft_implementation< 8, 8, 32, 8, 512, 1011>, + block_fft_implementation<16, 16, 16, 6, 1024, 1012>, + block_fft_implementation<32, 32, 8, 8, 1024, 1013>, + block_fft_implementation< 4, 4, 64, 4, 1024, 1014>, + block_fft_implementation< 4, 4, 64, 8, 512, 1015>, + block_fft_implementation<32, 32, 8, 32, 512, 1016>, + block_fft_implementation< 2, 2, 128, 6, 1024, 1017>, + block_fft_implementation< 2, 2, 128, 8, 512, 1018> + >; +}; + +template<> struct block_fft_record<512, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 4, 
2048, 1019>, + block_fft_implementation< 8, 8, 64, 4, 1024, 1020>, + block_fft_implementation<16, 16, 32, 2, 1024, 1021>, + block_fft_implementation<32, 32, 16, 4, 2048, 1022>, + block_fft_implementation<32, 32, 16, 4, 1024, 1023>, + block_fft_implementation<16, 16, 32, 8, 2048, 1024>, + block_fft_implementation< 4, 4, 128, 4, 2048, 1025>, + block_fft_implementation< 4, 4, 128, 4, 1024, 1026>, + block_fft_implementation< 2, 2, 256, 2, 2048, 1027>, + block_fft_implementation< 2, 2, 256, 4, 1024, 1028> + >; +}; + +template<> struct block_fft_record<1024, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 64, 2, 2048, 1029>, + block_fft_implementation< 8, 8, 128, 2, 4096, 1030>, + block_fft_implementation<32, 32, 32, 8, 4096, 1031>, + block_fft_implementation< 8, 8, 128, 2, 2048, 1032>, + block_fft_implementation<32, 32, 32, 8, 2048, 1033>, + block_fft_implementation<16, 16, 64, 4, 4096, 1034>, + block_fft_implementation< 4, 4, 256, 2, 4096, 1035>, + block_fft_implementation< 4, 4, 256, 2, 2048, 1036>, + block_fft_implementation< 2, 2, 512, 2, 4096, 1037>, + block_fft_implementation< 2, 2, 512, 2, 2048, 1038> + >; +}; + +template<> struct block_fft_record<2048, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 2, 4096, 1039>, + block_fft_implementation< 8, 8, 256, 2, 8192, 1040>, + block_fft_implementation< 8, 8, 256, 2, 4096, 1041>, + block_fft_implementation<16, 16, 128, 2, 8192, 1042>, + block_fft_implementation<32, 32, 64, 4, 8192, 1043>, + block_fft_implementation<32, 32, 64, 4, 4096, 1044>, + block_fft_implementation< 4, 4, 512, 2, 8192, 1045>, + block_fft_implementation< 4, 4, 512, 2, 4096, 1046>, + block_fft_implementation< 2, 2, 1024, 2, 8192, 1047>, + block_fft_implementation< 2, 2, 1024, 2, 4096, 1048> + >; +}; + +template<> struct 
block_fft_record<4096, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 2, 8192, 1049>, + block_fft_implementation< 8, 8, 512, 2, 16384, 1050>, + block_fft_implementation< 8, 8, 512, 2, 8192, 1051>, + block_fft_implementation<16, 16, 256, 2, 16384, 1052>, + block_fft_implementation<32, 32, 128, 2, 16384, 1053>, + block_fft_implementation<32, 32, 128, 2, 8192, 1054>, + block_fft_implementation< 4, 4, 1024, 2, 16384, 1055>, + block_fft_implementation< 4, 4, 1024, 2, 8192, 1056> + >; +}; + +template<> struct block_fft_record<8192, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 2, 16384, 1057>, + block_fft_implementation<32, 32, 256, 2, 32768, 1058>, + block_fft_implementation<32, 32, 256, 2, 16384, 1059>, + block_fft_implementation< 8, 8, 1024, 2, 32768, 1060>, + block_fft_implementation<16, 16, 512, 2, 32768, 1061>, + block_fft_implementation< 8, 8, 1024, 2, 16384, 1062> + >; +}; + +template<> struct block_fft_record<16384, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 2, 32768, 1063>, + block_fft_implementation<32, 32, 512, 2, 32768, 1064> + >; +}; + +template<> struct block_fft_record<3, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 128, 0, 1065> + >; +}; + +template<> struct block_fft_record<9, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 128, 0, 1066>, + block_fft_implementation< 3, 3, 3, 128, 36, 1067>, + block_fft_implementation< 3, 3, 3, 256, 18, 1068> + >; +}; + +template<> struct block_fft_record<27, half, 
fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 64, 0, 1069>, + block_fft_implementation< 9, 9, 3, 42, 54, 1070>, + block_fft_implementation< 9, 9, 3, 20, 108, 1071>, + block_fft_implementation< 3, 3, 9, 52, 108, 1072>, + block_fft_implementation< 3, 3, 9, 48, 54, 1073> + >; +}; + +template<> struct block_fft_record<81, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 28, 324, 1074>, + block_fft_implementation< 9, 9, 9, 14, 162, 1075>, + block_fft_implementation<27, 27, 3, 20, 162, 1076>, + block_fft_implementation<27, 27, 3, 18, 324, 1077>, + block_fft_implementation< 3, 3, 27, 14, 324, 1078>, + block_fft_implementation< 3, 3, 27, 18, 162, 1079> + >; +}; + +template<> struct block_fft_record<243, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 28, 972, 1080>, + block_fft_implementation<27, 27, 9, 28, 486, 1081>, + block_fft_implementation< 9, 9, 27, 18, 972, 1082>, + block_fft_implementation< 9, 9, 27, 18, 486, 1083>, + block_fft_implementation< 3, 3, 81, 6, 972, 1084>, + block_fft_implementation< 3, 3, 81, 6, 486, 1085> + >; +}; + +template<> struct block_fft_record<729, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 28, 2916, 1086>, + block_fft_implementation<27, 27, 27, 18, 1458, 1087>, + block_fft_implementation< 9, 9, 81, 6, 2916, 1088>, + block_fft_implementation< 9, 9, 81, 6, 1458, 1089>, + block_fft_implementation< 3, 3, 243, 2, 2916, 1090>, + block_fft_implementation< 3, 3, 243, 2, 1458, 1091> + >; +}; + +template<> struct block_fft_record<2187, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = 
true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 2, 8748, 1092>, + block_fft_implementation<27, 27, 81, 6, 8748, 1093>, + block_fft_implementation< 9, 9, 243, 2, 4374, 1094>, + block_fft_implementation<27, 27, 81, 6, 4374, 1095>, + block_fft_implementation< 3, 3, 729, 2, 8748, 1096>, + block_fft_implementation< 3, 3, 729, 2, 4374, 1097> + >; +}; + +template<> struct block_fft_record<6561, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 2, 13122, 1098>, + block_fft_implementation<27, 27, 243, 2, 26244, 1099>, + block_fft_implementation< 9, 9, 729, 2, 13122, 1100>, + block_fft_implementation< 9, 9, 729, 2, 26244, 1101> + >; +}; + +template<> struct block_fft_record<5, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 1102> + >; +}; + +template<> struct block_fft_record<25, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 64, 0, 1103>, + block_fft_implementation< 5, 5, 5, 256, 100, 1104>, + block_fft_implementation< 5, 5, 5, 50, 50, 1105> + >; +}; + +template<> struct block_fft_record<125, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 50, 250, 1106>, + block_fft_implementation<25, 25, 5, 50, 500, 1107>, + block_fft_implementation< 5, 5, 25, 10, 500, 1108>, + block_fft_implementation< 5, 5, 25, 10, 250, 1109> + >; +}; + +template<> struct block_fft_record<625, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 10, 1250, 1110>, + block_fft_implementation<25, 25, 25, 10, 2500, 1111>, + 
block_fft_implementation< 5, 5, 125, 2, 2500, 1112>, + block_fft_implementation< 5, 5, 125, 2, 1250, 1113> + >; +}; + +template<> struct block_fft_record<3125, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 2, 6250, 1114>, + block_fft_implementation<25, 25, 125, 6, 12500, 1115>, + block_fft_implementation< 5, 5, 625, 2, 12500, 1116>, + block_fft_implementation< 5, 5, 625, 2, 6250, 1117> + >; +}; + +template<> struct block_fft_record<15625, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 2, 31250, 1118> + >; +}; + +template<> struct block_fft_record<7, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 128, 0, 1119> + >; +}; + +template<> struct block_fft_record<49, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 18, 196, 1120>, + block_fft_implementation< 7, 7, 7, 18, 98, 1121> + >; +}; + +template<> struct block_fft_record<343, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 10, 1372, 1122>, + block_fft_implementation< 7, 7, 49, 10, 686, 1123> + >; +}; + +template<> struct block_fft_record<2401, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 2, 9604, 1124>, + block_fft_implementation< 7, 7, 343, 2, 4802, 1125> + >; +}; + +template<> struct block_fft_record<11, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 192, 
0, 1126> + >; +}; + +template<> struct block_fft_record<121, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 46, 484, 1127>, + block_fft_implementation<11, 11, 11, 46, 242, 1128> + >; +}; + +template<> struct block_fft_record<1331, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 2, 2662, 1129>, + block_fft_implementation<11, 11, 121, 2, 5324, 1130> + >; +}; + +template<> struct block_fft_record<6, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 64, 0, 1131> + >; +}; + +template<> struct block_fft_record<36, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 128, 72, 1132>, + block_fft_implementation< 6, 6, 6, 128, 144, 1133> + >; +}; + +template<> struct block_fft_record<216, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 14, 864, 1134>, + block_fft_implementation< 6, 6, 36, 14, 432, 1135> + >; +}; + +template<> struct block_fft_record<1296, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 4, 5184, 1136>, + block_fft_implementation< 6, 6, 216, 2, 2592, 1137> + >; +}; + +template<> struct block_fft_record<10, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 64, 0, 1138> + >; +}; + +template<> struct block_fft_record<100, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using 
blobs = type_list< + block_fft_implementation<10, 10, 10, 50, 200, 1139>, + block_fft_implementation<10, 10, 10, 50, 400, 1140> + >; +}; + +template<> struct block_fft_record<1000, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 10, 4000, 1141>, + block_fft_implementation<10, 10, 100, 10, 2000, 1142> + >; +}; + +template<> struct block_fft_record<10000, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 2, 40000, 1143>, + block_fft_implementation<10, 10, 1000, 2, 20000, 1144> + >; +}; + +template<> struct block_fft_record<12, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 128, 0, 1145> + >; +}; + +template<> struct block_fft_record<144, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 42, 576, 1146>, + block_fft_implementation<12, 12, 12, 42, 288, 1147> + >; +}; + +template<> struct block_fft_record<1728, half, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 2, 6912, 1148>, + block_fft_implementation<12, 12, 144, 2, 3456, 1149> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..eb53617cb4fdc --- /dev/null +++ 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_fwd.hpp.inc @@ -0,0 +1,583 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 0> + >; +}; + +template<> struct block_fft_record<14, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 1> + >; +}; + +template<> struct block_fft_record<15, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 2> + >; +}; + +template<> struct block_fft_record<17, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 3> + >; +}; + +template<> struct block_fft_record<18, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 4> + >; +}; + +template<> struct block_fft_record<19, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 5> + >; +}; + +template<> struct block_fft_record<20, float, fft_type::c2c, 
fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 6> + >; +}; + +template<> struct block_fft_record<21, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 7> + >; +}; + +template<> struct block_fft_record<22, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 8> + >; +}; + +template<> struct block_fft_record<23, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 9> + >; +}; + +template<> struct block_fft_record<24, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 10> + >; +}; + +template<> struct block_fft_record<26, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 11> + >; +}; + +template<> struct block_fft_record<28, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 12> + >; +}; + +template<> struct block_fft_record<29, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 13> + >; +}; + +template<> struct block_fft_record<30, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 14> + >; +}; + +template<> struct block_fft_record<31, float, fft_type::c2c, fft_direction::forward, 
700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 15> + >; +}; + +template<> struct block_fft_record<2, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 64, 0, 16> + >; +}; + +template<> struct block_fft_record<4, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 64, 0, 17>, + block_fft_implementation< 2, 2, 2, 128, 16, 18>, + block_fft_implementation< 2, 2, 2, 128, 32, 19> + >; +}; + +template<> struct block_fft_record<8, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 20>, + block_fft_implementation< 4, 4, 2, 128, 32, 21>, + block_fft_implementation< 4, 4, 2, 96, 64, 22>, + block_fft_implementation< 2, 2, 4, 80, 32, 23>, + block_fft_implementation< 2, 2, 4, 128, 64, 24> + >; +}; + +template<> struct block_fft_record<16, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 112, 0, 25>, + block_fft_implementation< 4, 4, 4, 64, 64, 26>, + block_fft_implementation< 4, 4, 4, 96, 128, 27>, + block_fft_implementation< 8, 8, 2, 128, 64, 28>, + block_fft_implementation< 8, 8, 2, 112, 128, 29>, + block_fft_implementation< 2, 2, 8, 64, 64, 30>, + block_fft_implementation< 2, 2, 8, 64, 128, 31> + >; +}; + +template<> struct block_fft_record<32, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 32, 0, 32>, + block_fft_implementation< 8, 8, 4, 48, 256, 33>, + block_fft_implementation< 8, 8, 4, 96, 128, 34>, + block_fft_implementation< 4, 4, 8, 48, 256, 35>, + 
block_fft_implementation< 4, 4, 8, 64, 128, 36>, + block_fft_implementation< 2, 2, 16, 20, 256, 37>, + block_fft_implementation<16, 16, 2, 64, 256, 38>, + block_fft_implementation< 2, 2, 16, 26, 128, 39>, + block_fft_implementation<16, 16, 2, 128, 128, 40> + >; +}; + +template<> struct block_fft_record<64, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 24, 512, 41>, + block_fft_implementation< 8, 8, 8, 13, 256, 42>, + block_fft_implementation< 4, 4, 16, 9, 512, 43>, + block_fft_implementation< 4, 4, 16, 15, 256, 44>, + block_fft_implementation<16, 16, 4, 25, 512, 45>, + block_fft_implementation<16, 16, 4, 29, 256, 46>, + block_fft_implementation<32, 32, 2, 23, 512, 47>, + block_fft_implementation< 2, 2, 32, 14, 512, 48>, + block_fft_implementation< 2, 2, 32, 8, 256, 49>, + block_fft_implementation<32, 32, 2, 64, 256, 50> + >; +}; + +template<> struct block_fft_record<128, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 16, 512, 51>, + block_fft_implementation< 8, 8, 16, 8, 1024, 52>, + block_fft_implementation<16, 16, 8, 13, 1024, 53>, + block_fft_implementation< 8, 8, 16, 16, 512, 54>, + block_fft_implementation< 4, 4, 32, 4, 1024, 55>, + block_fft_implementation< 4, 4, 32, 7, 512, 56>, + block_fft_implementation<32, 32, 4, 4, 1024, 57>, + block_fft_implementation<32, 32, 4, 31, 512, 58>, + block_fft_implementation< 2, 2, 64, 6, 1024, 59>, + block_fft_implementation< 2, 2, 64, 6, 512, 60> + >; +}; + +template<> struct block_fft_record<256, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 2, 1024, 61>, + block_fft_implementation< 8, 8, 32, 4, 2048, 62>, + block_fft_implementation< 8, 8, 32, 4, 1024, 63>, + block_fft_implementation<16, 16, 
16, 3, 2048, 64>, + block_fft_implementation<32, 32, 8, 4, 2048, 65>, + block_fft_implementation< 4, 4, 64, 2, 2048, 66>, + block_fft_implementation< 4, 4, 64, 4, 1024, 67>, + block_fft_implementation<32, 32, 8, 16, 1024, 68>, + block_fft_implementation< 2, 2, 128, 3, 2048, 69>, + block_fft_implementation< 2, 2, 128, 4, 1024, 70> + >; +}; + +template<> struct block_fft_record<512, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 4096, 71>, + block_fft_implementation< 8, 8, 64, 2, 2048, 72>, + block_fft_implementation<16, 16, 32, 1, 2048, 73>, + block_fft_implementation<32, 32, 16, 2, 4096, 74>, + block_fft_implementation<32, 32, 16, 2, 2048, 75>, + block_fft_implementation<16, 16, 32, 4, 4096, 76>, + block_fft_implementation< 4, 4, 128, 2, 4096, 77>, + block_fft_implementation< 4, 4, 128, 2, 2048, 78>, + block_fft_implementation< 2, 2, 256, 1, 4096, 79>, + block_fft_implementation< 2, 2, 256, 2, 2048, 80> + >; +}; + +template<> struct block_fft_record<1024, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 64, 1, 4096, 81>, + block_fft_implementation< 8, 8, 128, 1, 8192, 82>, + block_fft_implementation<32, 32, 32, 4, 8192, 83>, + block_fft_implementation< 8, 8, 128, 1, 4096, 84>, + block_fft_implementation<32, 32, 32, 4, 4096, 85>, + block_fft_implementation<16, 16, 64, 2, 8192, 86>, + block_fft_implementation< 4, 4, 256, 1, 8192, 87>, + block_fft_implementation< 4, 4, 256, 1, 4096, 88>, + block_fft_implementation< 2, 2, 512, 1, 8192, 89>, + block_fft_implementation< 2, 2, 512, 1, 4096, 90> + >; +}; + +template<> struct block_fft_record<2048, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 1, 8192, 91>, + block_fft_implementation< 8, 8, 256, 
1, 16384, 92>, + block_fft_implementation< 8, 8, 256, 1, 8192, 93>, + block_fft_implementation<16, 16, 128, 1, 16384, 94>, + block_fft_implementation<32, 32, 64, 2, 16384, 95>, + block_fft_implementation<32, 32, 64, 2, 8192, 96>, + block_fft_implementation< 4, 4, 512, 1, 16384, 97>, + block_fft_implementation< 4, 4, 512, 1, 8192, 98>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 99>, + block_fft_implementation< 2, 2, 1024, 1, 8192, 100> + >; +}; + +template<> struct block_fft_record<4096, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 1, 16384, 101>, + block_fft_implementation< 8, 8, 512, 1, 32768, 102>, + block_fft_implementation< 8, 8, 512, 1, 16384, 103>, + block_fft_implementation<16, 16, 256, 1, 32768, 104>, + block_fft_implementation<32, 32, 128, 1, 32768, 105>, + block_fft_implementation<32, 32, 128, 1, 16384, 106>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 107>, + block_fft_implementation< 4, 4, 1024, 1, 16384, 108> + >; +}; + +template<> struct block_fft_record<8192, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 1, 32768, 109>, + block_fft_implementation<32, 32, 256, 1, 65536, 110>, + block_fft_implementation<32, 32, 256, 1, 32768, 111>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 112>, + block_fft_implementation<16, 16, 512, 1, 65536, 113>, + block_fft_implementation< 8, 8, 1024, 1, 32768, 114> + >; +}; + +template<> struct block_fft_record<16384, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 1, 65536, 115>, + block_fft_implementation<32, 32, 512, 1, 65536, 116> + >; +}; + +template<> struct block_fft_record<3, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = 
true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 117> + >; +}; + +template<> struct block_fft_record<9, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 64, 0, 118>, + block_fft_implementation< 3, 3, 3, 64, 72, 119>, + block_fft_implementation< 3, 3, 3, 128, 36, 120> + >; +}; + +template<> struct block_fft_record<27, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 121>, + block_fft_implementation< 9, 9, 3, 21, 108, 122>, + block_fft_implementation< 9, 9, 3, 10, 216, 123>, + block_fft_implementation< 3, 3, 9, 26, 216, 124>, + block_fft_implementation< 3, 3, 9, 24, 108, 125> + >; +}; + +template<> struct block_fft_record<81, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 648, 126>, + block_fft_implementation< 9, 9, 9, 7, 324, 127>, + block_fft_implementation<27, 27, 3, 10, 324, 128>, + block_fft_implementation<27, 27, 3, 9, 648, 129>, + block_fft_implementation< 3, 3, 27, 7, 648, 130>, + block_fft_implementation< 3, 3, 27, 9, 324, 131> + >; +}; + +template<> struct block_fft_record<243, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 1944, 132>, + block_fft_implementation<27, 27, 9, 14, 972, 133>, + block_fft_implementation< 9, 9, 27, 9, 1944, 134>, + block_fft_implementation< 9, 9, 27, 9, 972, 135>, + block_fft_implementation< 3, 3, 81, 3, 1944, 136>, + block_fft_implementation< 3, 3, 81, 3, 972, 137> + >; +}; + +template<> struct block_fft_record<729, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<27, 27, 27, 14, 5832, 138>, + block_fft_implementation<27, 27, 27, 9, 2916, 139>, + block_fft_implementation< 9, 9, 81, 3, 5832, 140>, + block_fft_implementation< 9, 9, 81, 3, 2916, 141>, + block_fft_implementation< 3, 3, 243, 1, 5832, 142>, + block_fft_implementation< 3, 3, 243, 1, 2916, 143> + >; +}; + +template<> struct block_fft_record<2187, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 17496, 144>, + block_fft_implementation<27, 27, 81, 3, 17496, 145>, + block_fft_implementation< 9, 9, 243, 1, 8748, 146>, + block_fft_implementation<27, 27, 81, 3, 8748, 147>, + block_fft_implementation< 3, 3, 729, 1, 17496, 148>, + block_fft_implementation< 3, 3, 729, 1, 8748, 149> + >; +}; + +template<> struct block_fft_record<6561, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 1, 26244, 150>, + block_fft_implementation<27, 27, 243, 1, 52488, 151>, + block_fft_implementation< 9, 9, 729, 1, 26244, 152>, + block_fft_implementation< 9, 9, 729, 1, 52488, 153> + >; +}; + +template<> struct block_fft_record<5, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 32, 0, 154> + >; +}; + +template<> struct block_fft_record<25, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 32, 0, 155>, + block_fft_implementation< 5, 5, 5, 128, 200, 156>, + block_fft_implementation< 5, 5, 5, 25, 100, 157> + >; +}; + +template<> struct block_fft_record<125, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 25, 500, 158>, + 
block_fft_implementation<25, 25, 5, 25, 1000, 159>, + block_fft_implementation< 5, 5, 25, 5, 1000, 160>, + block_fft_implementation< 5, 5, 25, 5, 500, 161> + >; +}; + +template<> struct block_fft_record<625, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 2500, 162>, + block_fft_implementation<25, 25, 25, 5, 5000, 163>, + block_fft_implementation< 5, 5, 125, 1, 5000, 164>, + block_fft_implementation< 5, 5, 125, 1, 2500, 165> + >; +}; + +template<> struct block_fft_record<3125, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 1, 12500, 166>, + block_fft_implementation<25, 25, 125, 3, 25000, 167>, + block_fft_implementation< 5, 5, 625, 1, 25000, 168>, + block_fft_implementation< 5, 5, 625, 1, 12500, 169> + >; +}; + +template<> struct block_fft_record<15625, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 1, 62500, 170> + >; +}; + +template<> struct block_fft_record<7, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 64, 0, 171> + >; +}; + +template<> struct block_fft_record<49, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 9, 392, 172>, + block_fft_implementation< 7, 7, 7, 9, 196, 173> + >; +}; + +template<> struct block_fft_record<343, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 2744, 174>, + block_fft_implementation< 7, 7, 49, 5, 1372, 175> + >; +}; + +template<> struct block_fft_record<2401, float, 
fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 19208, 176>, + block_fft_implementation< 7, 7, 343, 1, 9604, 177> + >; +}; + +template<> struct block_fft_record<11, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 96, 0, 178> + >; +}; + +template<> struct block_fft_record<121, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 179>, + block_fft_implementation<11, 11, 11, 23, 484, 180> + >; +}; + +template<> struct block_fft_record<1331, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 5324, 181>, + block_fft_implementation<11, 11, 121, 1, 10648, 182> + >; +}; + +template<> struct block_fft_record<6, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 183> + >; +}; + +template<> struct block_fft_record<36, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 64, 144, 184>, + block_fft_implementation< 6, 6, 6, 64, 288, 185> + >; +}; + +template<> struct block_fft_record<216, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 186>, + block_fft_implementation< 6, 6, 36, 7, 864, 187> + >; +}; + +template<> struct block_fft_record<1296, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 2, 10368, 
188>, + block_fft_implementation< 6, 6, 216, 1, 5184, 189> + >; +}; + +template<> struct block_fft_record<10, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 190> + >; +}; + +template<> struct block_fft_record<100, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 25, 400, 191>, + block_fft_implementation<10, 10, 10, 25, 800, 192> + >; +}; + +template<> struct block_fft_record<1000, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 8000, 193>, + block_fft_implementation<10, 10, 100, 5, 4000, 194> + >; +}; + +template<> struct block_fft_record<10000, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 80000, 195>, + block_fft_implementation<10, 10, 1000, 1, 40000, 196> + >; +}; + +template<> struct block_fft_record<12, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 64, 0, 197> + >; +}; + +template<> struct block_fft_record<144, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 21, 1152, 198>, + block_fft_implementation<12, 12, 12, 21, 576, 199> + >; +}; + +template<> struct block_fft_record<1728, float, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 1, 13824, 200>, + block_fft_implementation<12, 12, 144, 1, 6912, 201> + >; +}; diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..096f1d334becd --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp32_inv.hpp.inc @@ -0,0 +1,583 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 202> + >; +}; + +template<> struct block_fft_record<14, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 203> + >; +}; + +template<> struct block_fft_record<15, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 204> + >; +}; + +template<> struct block_fft_record<17, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 205> + >; +}; + +template<> struct block_fft_record<18, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation<18, 18, 1, 32, 0, 206> + >; +}; + +template<> struct block_fft_record<19, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 207> + >; +}; + +template<> struct block_fft_record<20, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 208> + >; +}; + +template<> struct block_fft_record<21, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 209> + >; +}; + +template<> struct block_fft_record<22, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 210> + >; +}; + +template<> struct block_fft_record<23, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 211> + >; +}; + +template<> struct block_fft_record<24, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 212> + >; +}; + +template<> struct block_fft_record<26, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 213> + >; +}; + +template<> struct block_fft_record<28, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 214> + >; +}; + +template<> struct block_fft_record<29, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< 
+ block_fft_implementation<29, 29, 1, 32, 0, 215> + >; +}; + +template<> struct block_fft_record<30, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 216> + >; +}; + +template<> struct block_fft_record<31, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 217> + >; +}; + +template<> struct block_fft_record<2, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 64, 0, 218> + >; +}; + +template<> struct block_fft_record<4, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 64, 0, 219>, + block_fft_implementation< 2, 2, 2, 128, 16, 220>, + block_fft_implementation< 2, 2, 2, 128, 32, 221> + >; +}; + +template<> struct block_fft_record<8, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 222>, + block_fft_implementation< 4, 4, 2, 128, 32, 223>, + block_fft_implementation< 4, 4, 2, 96, 64, 224>, + block_fft_implementation< 2, 2, 4, 80, 32, 225>, + block_fft_implementation< 2, 2, 4, 128, 64, 226> + >; +}; + +template<> struct block_fft_record<16, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 112, 0, 227>, + block_fft_implementation< 4, 4, 4, 64, 64, 228>, + block_fft_implementation< 4, 4, 4, 96, 128, 229>, + block_fft_implementation< 8, 8, 2, 128, 64, 230>, + block_fft_implementation< 8, 8, 2, 112, 128, 231>, + block_fft_implementation< 2, 2, 8, 64, 64, 232>, + block_fft_implementation< 2, 2, 8, 64, 128, 233> + >; 
+}; + +template<> struct block_fft_record<32, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 32, 0, 234>, + block_fft_implementation< 8, 8, 4, 48, 256, 235>, + block_fft_implementation< 8, 8, 4, 96, 128, 236>, + block_fft_implementation< 4, 4, 8, 48, 256, 237>, + block_fft_implementation< 4, 4, 8, 64, 128, 238>, + block_fft_implementation< 2, 2, 16, 20, 256, 239>, + block_fft_implementation<16, 16, 2, 64, 256, 240>, + block_fft_implementation< 2, 2, 16, 26, 128, 241>, + block_fft_implementation<16, 16, 2, 128, 128, 242> + >; +}; + +template<> struct block_fft_record<64, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 24, 512, 243>, + block_fft_implementation< 8, 8, 8, 13, 256, 244>, + block_fft_implementation< 4, 4, 16, 9, 512, 245>, + block_fft_implementation< 4, 4, 16, 15, 256, 246>, + block_fft_implementation<16, 16, 4, 25, 512, 247>, + block_fft_implementation<16, 16, 4, 29, 256, 248>, + block_fft_implementation<32, 32, 2, 23, 512, 249>, + block_fft_implementation< 2, 2, 32, 14, 512, 250>, + block_fft_implementation< 2, 2, 32, 8, 256, 251>, + block_fft_implementation<32, 32, 2, 64, 256, 252> + >; +}; + +template<> struct block_fft_record<128, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 16, 512, 253>, + block_fft_implementation< 8, 8, 16, 8, 1024, 254>, + block_fft_implementation<16, 16, 8, 13, 1024, 255>, + block_fft_implementation< 8, 8, 16, 16, 512, 256>, + block_fft_implementation< 4, 4, 32, 4, 1024, 257>, + block_fft_implementation< 4, 4, 32, 7, 512, 258>, + block_fft_implementation<32, 32, 4, 4, 1024, 259>, + block_fft_implementation<32, 32, 4, 31, 512, 260>, + block_fft_implementation< 2, 2, 64, 6, 1024, 261>, + 
block_fft_implementation< 2, 2, 64, 6, 512, 262> + >; +}; + +template<> struct block_fft_record<256, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 2, 1024, 263>, + block_fft_implementation< 8, 8, 32, 4, 2048, 264>, + block_fft_implementation< 8, 8, 32, 4, 1024, 265>, + block_fft_implementation<16, 16, 16, 3, 2048, 266>, + block_fft_implementation<32, 32, 8, 4, 2048, 267>, + block_fft_implementation< 4, 4, 64, 2, 2048, 268>, + block_fft_implementation< 4, 4, 64, 4, 1024, 269>, + block_fft_implementation<32, 32, 8, 16, 1024, 270>, + block_fft_implementation< 2, 2, 128, 3, 2048, 271>, + block_fft_implementation< 2, 2, 128, 4, 1024, 272> + >; +}; + +template<> struct block_fft_record<512, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 4096, 273>, + block_fft_implementation< 8, 8, 64, 2, 2048, 274>, + block_fft_implementation<16, 16, 32, 1, 2048, 275>, + block_fft_implementation<32, 32, 16, 2, 4096, 276>, + block_fft_implementation<32, 32, 16, 2, 2048, 277>, + block_fft_implementation<16, 16, 32, 4, 4096, 278>, + block_fft_implementation< 4, 4, 128, 2, 4096, 279>, + block_fft_implementation< 4, 4, 128, 2, 2048, 280>, + block_fft_implementation< 2, 2, 256, 1, 4096, 281>, + block_fft_implementation< 2, 2, 256, 2, 2048, 282> + >; +}; + +template<> struct block_fft_record<1024, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 64, 1, 4096, 283>, + block_fft_implementation< 8, 8, 128, 1, 8192, 284>, + block_fft_implementation<32, 32, 32, 4, 8192, 285>, + block_fft_implementation< 8, 8, 128, 1, 4096, 286>, + block_fft_implementation<32, 32, 32, 4, 4096, 287>, + block_fft_implementation<16, 16, 64, 2, 8192, 288>, + block_fft_implementation< 4, 4, 256, 
1, 8192, 289>, + block_fft_implementation< 4, 4, 256, 1, 4096, 290>, + block_fft_implementation< 2, 2, 512, 1, 8192, 291>, + block_fft_implementation< 2, 2, 512, 1, 4096, 292> + >; +}; + +template<> struct block_fft_record<2048, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 1, 8192, 293>, + block_fft_implementation< 8, 8, 256, 1, 16384, 294>, + block_fft_implementation< 8, 8, 256, 1, 8192, 295>, + block_fft_implementation<16, 16, 128, 1, 16384, 296>, + block_fft_implementation<32, 32, 64, 2, 16384, 297>, + block_fft_implementation<32, 32, 64, 2, 8192, 298>, + block_fft_implementation< 4, 4, 512, 1, 16384, 299>, + block_fft_implementation< 4, 4, 512, 1, 8192, 300>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 301>, + block_fft_implementation< 2, 2, 1024, 1, 8192, 302> + >; +}; + +template<> struct block_fft_record<4096, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 1, 16384, 303>, + block_fft_implementation< 8, 8, 512, 1, 32768, 304>, + block_fft_implementation< 8, 8, 512, 1, 16384, 305>, + block_fft_implementation<16, 16, 256, 1, 32768, 306>, + block_fft_implementation<32, 32, 128, 1, 32768, 307>, + block_fft_implementation<32, 32, 128, 1, 16384, 308>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 309>, + block_fft_implementation< 4, 4, 1024, 1, 16384, 310> + >; +}; + +template<> struct block_fft_record<8192, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 1, 32768, 311>, + block_fft_implementation<32, 32, 256, 1, 65536, 312>, + block_fft_implementation<32, 32, 256, 1, 32768, 313>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 314>, + block_fft_implementation<16, 16, 512, 1, 65536, 315>, + block_fft_implementation< 8, 
8, 1024, 1, 32768, 316> + >; +}; + +template<> struct block_fft_record<16384, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 1, 65536, 317>, + block_fft_implementation<32, 32, 512, 1, 65536, 318> + >; +}; + +template<> struct block_fft_record<3, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 319> + >; +}; + +template<> struct block_fft_record<9, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 64, 0, 320>, + block_fft_implementation< 3, 3, 3, 64, 72, 321>, + block_fft_implementation< 3, 3, 3, 128, 36, 322> + >; +}; + +template<> struct block_fft_record<27, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 323>, + block_fft_implementation< 9, 9, 3, 21, 108, 324>, + block_fft_implementation< 9, 9, 3, 10, 216, 325>, + block_fft_implementation< 3, 3, 9, 26, 216, 326>, + block_fft_implementation< 3, 3, 9, 24, 108, 327> + >; +}; + +template<> struct block_fft_record<81, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 648, 328>, + block_fft_implementation< 9, 9, 9, 7, 324, 329>, + block_fft_implementation<27, 27, 3, 10, 324, 330>, + block_fft_implementation<27, 27, 3, 9, 648, 331>, + block_fft_implementation< 3, 3, 27, 7, 648, 332>, + block_fft_implementation< 3, 3, 27, 9, 324, 333> + >; +}; + +template<> struct block_fft_record<243, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 1944, 334>, + 
block_fft_implementation<27, 27, 9, 14, 972, 335>, + block_fft_implementation< 9, 9, 27, 9, 1944, 336>, + block_fft_implementation< 9, 9, 27, 9, 972, 337>, + block_fft_implementation< 3, 3, 81, 3, 1944, 338>, + block_fft_implementation< 3, 3, 81, 3, 972, 339> + >; +}; + +template<> struct block_fft_record<729, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 14, 5832, 340>, + block_fft_implementation<27, 27, 27, 9, 2916, 341>, + block_fft_implementation< 9, 9, 81, 3, 5832, 342>, + block_fft_implementation< 9, 9, 81, 3, 2916, 343>, + block_fft_implementation< 3, 3, 243, 1, 5832, 344>, + block_fft_implementation< 3, 3, 243, 1, 2916, 345> + >; +}; + +template<> struct block_fft_record<2187, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 17496, 346>, + block_fft_implementation<27, 27, 81, 3, 17496, 347>, + block_fft_implementation< 9, 9, 243, 1, 8748, 348>, + block_fft_implementation<27, 27, 81, 3, 8748, 349>, + block_fft_implementation< 3, 3, 729, 1, 17496, 350>, + block_fft_implementation< 3, 3, 729, 1, 8748, 351> + >; +}; + +template<> struct block_fft_record<6561, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 1, 26244, 352>, + block_fft_implementation<27, 27, 243, 1, 52488, 353>, + block_fft_implementation< 9, 9, 729, 1, 26244, 354>, + block_fft_implementation< 9, 9, 729, 1, 52488, 355> + >; +}; + +template<> struct block_fft_record<5, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 32, 0, 356> + >; +}; + +template<> struct block_fft_record<25, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool 
defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 32, 0, 357>, + block_fft_implementation< 5, 5, 5, 128, 200, 358>, + block_fft_implementation< 5, 5, 5, 25, 100, 359> + >; +}; + +template<> struct block_fft_record<125, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 25, 500, 360>, + block_fft_implementation<25, 25, 5, 25, 1000, 361>, + block_fft_implementation< 5, 5, 25, 5, 1000, 362>, + block_fft_implementation< 5, 5, 25, 5, 500, 363> + >; +}; + +template<> struct block_fft_record<625, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 2500, 364>, + block_fft_implementation<25, 25, 25, 5, 5000, 365>, + block_fft_implementation< 5, 5, 125, 1, 5000, 366>, + block_fft_implementation< 5, 5, 125, 1, 2500, 367> + >; +}; + +template<> struct block_fft_record<3125, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 1, 12500, 368>, + block_fft_implementation<25, 25, 125, 3, 25000, 369>, + block_fft_implementation< 5, 5, 625, 1, 25000, 370>, + block_fft_implementation< 5, 5, 625, 1, 12500, 371> + >; +}; + +template<> struct block_fft_record<15625, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 1, 62500, 372> + >; +}; + +template<> struct block_fft_record<7, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 64, 0, 373> + >; +}; + +template<> struct block_fft_record<49, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation< 7, 7, 7, 9, 392, 374>, + block_fft_implementation< 7, 7, 7, 9, 196, 375> + >; +}; + +template<> struct block_fft_record<343, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 2744, 376>, + block_fft_implementation< 7, 7, 49, 5, 1372, 377> + >; +}; + +template<> struct block_fft_record<2401, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 19208, 378>, + block_fft_implementation< 7, 7, 343, 1, 9604, 379> + >; +}; + +template<> struct block_fft_record<11, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 96, 0, 380> + >; +}; + +template<> struct block_fft_record<121, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 381>, + block_fft_implementation<11, 11, 11, 23, 484, 382> + >; +}; + +template<> struct block_fft_record<1331, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 5324, 383>, + block_fft_implementation<11, 11, 121, 1, 10648, 384> + >; +}; + +template<> struct block_fft_record<6, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 385> + >; +}; + +template<> struct block_fft_record<36, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 64, 144, 386>, + block_fft_implementation< 6, 6, 6, 64, 288, 387> + >; +}; + +template<> struct block_fft_record<216, float, 
fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 388>, + block_fft_implementation< 6, 6, 36, 7, 864, 389> + >; +}; + +template<> struct block_fft_record<1296, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 2, 10368, 390>, + block_fft_implementation< 6, 6, 216, 1, 5184, 391> + >; +}; + +template<> struct block_fft_record<10, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 392> + >; +}; + +template<> struct block_fft_record<100, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 25, 400, 393>, + block_fft_implementation<10, 10, 10, 25, 800, 394> + >; +}; + +template<> struct block_fft_record<1000, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 8000, 395>, + block_fft_implementation<10, 10, 100, 5, 4000, 396> + >; +}; + +template<> struct block_fft_record<10000, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 80000, 397>, + block_fft_implementation<10, 10, 1000, 1, 40000, 398> + >; +}; + +template<> struct block_fft_record<12, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 64, 0, 399> + >; +}; + +template<> struct block_fft_record<144, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 
21, 1152, 400>, + block_fft_implementation<12, 12, 12, 21, 576, 401> + >; +}; + +template<> struct block_fft_record<1728, float, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 1, 13824, 402>, + block_fft_implementation<12, 12, 144, 1, 6912, 403> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..ef54a51a2e436 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_fwd.hpp.inc @@ -0,0 +1,534 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +template<> struct block_fft_record<13, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 404> + >; +}; + +template<> struct block_fft_record<14, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 405> + >; +}; + +template<> struct block_fft_record<15, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 406> + >; +}; + +template<> struct block_fft_record<17, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 407> + >; +}; + +template<> struct block_fft_record<18, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 408> + >; +}; + +template<> struct block_fft_record<19, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 409> + >; +}; + +template<> struct block_fft_record<20, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 410> + >; +}; + +template<> struct block_fft_record<21, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 411> + >; +}; + +template<> struct block_fft_record<22, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 412> + >; 
+}; + +template<> struct block_fft_record<23, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 413> + >; +}; + +template<> struct block_fft_record<24, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 414> + >; +}; + +template<> struct block_fft_record<26, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 415> + >; +}; + +template<> struct block_fft_record<28, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 416> + >; +}; + +template<> struct block_fft_record<29, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 417> + >; +}; + +template<> struct block_fft_record<30, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 418> + >; +}; + +template<> struct block_fft_record<31, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 419> + >; +}; + +template<> struct block_fft_record<2, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 32, 0, 420> + >; +}; + +template<> struct block_fft_record<4, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 96, 0, 421>, + 
block_fft_implementation< 2, 2, 2, 128, 32, 422>, + block_fft_implementation< 2, 2, 2, 128, 64, 423> + >; +}; + +template<> struct block_fft_record<8, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 424>, + block_fft_implementation< 4, 4, 2, 128, 64, 425>, + block_fft_implementation< 2, 2, 4, 128, 64, 426>, + block_fft_implementation< 4, 4, 2, 128, 128, 427>, + block_fft_implementation< 2, 2, 4, 112, 128, 428> + >; +}; + +template<> struct block_fft_record<16, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 429>, + block_fft_implementation< 4, 4, 4, 64, 256, 430>, + block_fft_implementation< 4, 4, 4, 64, 128, 431>, + block_fft_implementation< 2, 2, 8, 48, 256, 432>, + block_fft_implementation< 8, 8, 2, 128, 128, 433>, + block_fft_implementation< 8, 8, 2, 64, 256, 434>, + block_fft_implementation< 2, 2, 8, 64, 128, 435> + >; +}; + +template<> struct block_fft_record<32, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 4, 64, 256, 436>, + block_fft_implementation< 4, 4, 8, 24, 512, 437>, + block_fft_implementation< 8, 8, 4, 48, 512, 438>, + block_fft_implementation< 4, 4, 8, 27, 256, 439>, + block_fft_implementation<16, 16, 2, 11, 512, 440>, + block_fft_implementation< 2, 2, 16, 18, 512, 441>, + block_fft_implementation< 2, 2, 16, 30, 256, 442>, + block_fft_implementation<16, 16, 2, 64, 256, 443> + >; +}; + +template<> struct block_fft_record<64, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 16, 512, 444>, + block_fft_implementation< 4, 4, 16, 20, 512, 445>, + block_fft_implementation< 4, 4, 16, 12, 1024, 446>, + 
block_fft_implementation< 8, 8, 8, 13, 1024, 447>, + block_fft_implementation<16, 16, 4, 6, 1024, 448>, + block_fft_implementation<16, 16, 4, 26, 512, 449>, + block_fft_implementation< 2, 2, 32, 7, 1024, 450>, + block_fft_implementation< 2, 2, 32, 12, 512, 451> + >; +}; + +template<> struct block_fft_record<128, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 16, 8, 1024, 452>, + block_fft_implementation<16, 16, 8, 4, 2048, 453>, + block_fft_implementation<16, 16, 8, 16, 1024, 454>, + block_fft_implementation< 4, 4, 32, 5, 2048, 455>, + block_fft_implementation< 4, 4, 32, 8, 1024, 456>, + block_fft_implementation< 8, 8, 16, 8, 2048, 457>, + block_fft_implementation< 2, 2, 64, 6, 2048, 458>, + block_fft_implementation< 2, 2, 64, 6, 1024, 459> + >; +}; + +template<> struct block_fft_record<256, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 32, 4, 2048, 460>, + block_fft_implementation<16, 16, 16, 2, 4096, 461>, + block_fft_implementation<16, 16, 16, 2, 2048, 462>, + block_fft_implementation< 4, 4, 64, 2, 4096, 463>, + block_fft_implementation< 4, 4, 64, 4, 2048, 464>, + block_fft_implementation< 8, 8, 32, 4, 4096, 465>, + block_fft_implementation< 2, 2, 128, 3, 2048, 466>, + block_fft_implementation< 2, 2, 128, 2, 4096, 467> + >; +}; + +template<> struct block_fft_record<512, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 4096, 468>, + block_fft_implementation< 8, 8, 64, 2, 8192, 469>, + block_fft_implementation<16, 16, 32, 1, 8192, 470>, + block_fft_implementation<16, 16, 32, 4, 4096, 471>, + block_fft_implementation< 4, 4, 128, 2, 4096, 472>, + block_fft_implementation< 4, 4, 128, 1, 8192, 473>, + block_fft_implementation< 2, 2, 256, 2, 
4096, 474>, + block_fft_implementation< 2, 2, 256, 1, 8192, 475> + >; +}; + +template<> struct block_fft_record<1024, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 128, 1, 8192, 476>, + block_fft_implementation<16, 16, 64, 2, 8192, 477>, + block_fft_implementation< 4, 4, 256, 1, 8192, 478>, + block_fft_implementation< 4, 4, 256, 1, 16384, 479>, + block_fft_implementation<16, 16, 64, 2, 16384, 480>, + block_fft_implementation< 8, 8, 128, 1, 16384, 481>, + block_fft_implementation< 2, 2, 512, 1, 16384, 482>, + block_fft_implementation< 2, 2, 512, 1, 8192, 483> + >; +}; + +template<> struct block_fft_record<2048, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 256, 1, 16384, 484>, + block_fft_implementation<16, 16, 128, 1, 16384, 485>, + block_fft_implementation<16, 16, 128, 1, 32768, 486>, + block_fft_implementation< 8, 8, 256, 1, 32768, 487>, + block_fft_implementation< 4, 4, 512, 1, 16384, 488>, + block_fft_implementation< 4, 4, 512, 1, 32768, 489>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 490>, + block_fft_implementation< 2, 2, 1024, 1, 32768, 491> + >; +}; + +template<> struct block_fft_record<4096, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 512, 1, 32768, 492>, + block_fft_implementation<16, 16, 256, 1, 32768, 493>, + block_fft_implementation<16, 16, 256, 1, 65536, 494>, + block_fft_implementation< 8, 8, 512, 1, 65536, 495>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 496>, + block_fft_implementation< 4, 4, 1024, 1, 65536, 497> + >; +}; + +template<> struct block_fft_record<8192, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<16, 16, 512, 1, 65536, 498>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 499> + >; +}; + +template<> struct block_fft_record<3, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 500> + >; +}; + +template<> struct block_fft_record<9, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 32, 0, 501>, + block_fft_implementation< 3, 3, 3, 80, 72, 502>, + block_fft_implementation< 3, 3, 3, 64, 144, 503> + >; +}; + +template<> struct block_fft_record<27, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 128, 0, 504>, + block_fft_implementation< 9, 9, 3, 21, 216, 505>, + block_fft_implementation< 9, 9, 3, 80, 432, 506>, + block_fft_implementation< 3, 3, 9, 30, 216, 507>, + block_fft_implementation< 3, 3, 9, 23, 432, 508> + >; +}; + +template<> struct block_fft_record<81, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 1296, 509>, + block_fft_implementation< 9, 9, 9, 14, 648, 510>, + block_fft_implementation<27, 27, 3, 10, 648, 511>, + block_fft_implementation<27, 27, 3, 9, 1296, 512>, + block_fft_implementation< 3, 3, 27, 8, 1296, 513>, + block_fft_implementation< 3, 3, 27, 13, 648, 514> + >; +}; + +template<> struct block_fft_record<243, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 1944, 515>, + block_fft_implementation<27, 27, 9, 3, 3888, 516>, + block_fft_implementation< 9, 9, 27, 9, 3888, 517>, + block_fft_implementation< 9, 9, 27, 9, 1944, 518>, + block_fft_implementation< 3, 3, 81, 3, 
3888, 519>, + block_fft_implementation< 3, 3, 81, 3, 1944, 520> + >; +}; + +template<> struct block_fft_record<729, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 81, 3, 11664, 521>, + block_fft_implementation< 9, 9, 81, 3, 5832, 522>, + block_fft_implementation<27, 27, 27, 1, 5832, 523>, + block_fft_implementation<27, 27, 27, 1, 11664, 524>, + block_fft_implementation< 3, 3, 243, 1, 5832, 525>, + block_fft_implementation< 3, 3, 243, 1, 11664, 526> + >; +}; + +template<> struct block_fft_record<2187, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 17496, 527>, + block_fft_implementation< 9, 9, 243, 1, 34992, 528>, + block_fft_implementation< 3, 3, 729, 1, 17496, 529>, + block_fft_implementation< 3, 3, 729, 1, 34992, 530> + >; +}; + +template<> struct block_fft_record<6561, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 729, 1, 52488, 531> + >; +}; + +template<> struct block_fft_record<5, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 532> + >; +}; + +template<> struct block_fft_record<25, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 128, 0, 533>, + block_fft_implementation< 5, 5, 5, 25, 400, 534>, + block_fft_implementation< 5, 5, 5, 25, 200, 535> + >; +}; + +template<> struct block_fft_record<125, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 25, 5, 2000, 536>, + block_fft_implementation< 5, 5, 25, 5, 
1000, 537>, + block_fft_implementation<25, 25, 5, 25, 1000, 538>, + block_fft_implementation<25, 25, 5, 6, 2000, 539> + >; +}; + +template<> struct block_fft_record<625, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 540>, + block_fft_implementation< 5, 5, 125, 1, 5000, 541>, + block_fft_implementation< 5, 5, 125, 1, 10000, 542>, + block_fft_implementation<25, 25, 25, 5, 10000, 543> + >; +}; + +template<> struct block_fft_record<3125, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 625, 1, 50000, 544>, + block_fft_implementation< 5, 5, 625, 1, 25000, 545> + >; +}; + +template<> struct block_fft_record<7, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 546> + >; +}; + +template<> struct block_fft_record<49, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 18, 392, 547>, + block_fft_implementation< 7, 7, 7, 18, 784, 548> + >; +}; + +template<> struct block_fft_record<343, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 5488, 549>, + block_fft_implementation< 7, 7, 49, 5, 2744, 550> + >; +}; + +template<> struct block_fft_record<2401, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 38416, 551>, + block_fft_implementation< 7, 7, 343, 1, 19208, 552> + >; +}; + +template<> struct block_fft_record<11, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using 
blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 553> + >; +}; + +template<> struct block_fft_record<121, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 1936, 554>, + block_fft_implementation<11, 11, 11, 23, 968, 555> + >; +}; + +template<> struct block_fft_record<1331, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 10648, 556>, + block_fft_implementation<11, 11, 121, 1, 21296, 557> + >; +}; + +template<> struct block_fft_record<6, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 558> + >; +}; + +template<> struct block_fft_record<36, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 21, 576, 559>, + block_fft_implementation< 6, 6, 6, 21, 288, 560> + >; +}; + +template<> struct block_fft_record<216, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 561>, + block_fft_implementation< 6, 6, 36, 7, 3456, 562> + >; +}; + +template<> struct block_fft_record<1296, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 1, 10368, 563>, + block_fft_implementation< 6, 6, 216, 1, 20736, 564> + >; +}; + +template<> struct block_fft_record<10, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 565> + >; +}; + +template<> struct block_fft_record<100, double, fft_type::c2c, 
fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 1600, 566>, + block_fft_implementation<10, 10, 10, 3, 800, 567> + >; +}; + +template<> struct block_fft_record<1000, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 16000, 568>, + block_fft_implementation<10, 10, 100, 1, 8000, 569> + >; +}; + +template<> struct block_fft_record<12, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 570> + >; +}; + +template<> struct block_fft_record<144, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 10, 1152, 571>, + block_fft_implementation<12, 12, 12, 5, 2304, 572> + >; +}; + +template<> struct block_fft_record<1728, double, fft_type::c2c, fft_direction::forward, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 2, 13824, 573>, + block_fft_implementation<12, 12, 144, 3, 27648, 574> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..8ec9adbddd1a0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/700/database_fp64_inv.hpp.inc @@ -0,0 +1,534 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
+// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 575> + >; +}; + +template<> struct block_fft_record<14, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 576> + >; +}; + +template<> struct block_fft_record<15, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 577> + >; +}; + +template<> struct block_fft_record<17, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 578> + >; +}; + +template<> struct block_fft_record<18, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 579> + >; +}; + +template<> struct block_fft_record<19, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 580> + >; +}; + +template<> struct block_fft_record<20, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 581> + >; +}; + +template<> struct block_fft_record<21, double, fft_type::c2c, 
fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 582> + >; +}; + +template<> struct block_fft_record<22, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 583> + >; +}; + +template<> struct block_fft_record<23, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 584> + >; +}; + +template<> struct block_fft_record<24, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 585> + >; +}; + +template<> struct block_fft_record<26, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 586> + >; +}; + +template<> struct block_fft_record<28, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 587> + >; +}; + +template<> struct block_fft_record<29, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 588> + >; +}; + +template<> struct block_fft_record<30, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 589> + >; +}; + +template<> struct block_fft_record<31, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 590> + >; +}; + +template<> struct block_fft_record<2, double, fft_type::c2c, 
fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 32, 0, 591> + >; +}; + +template<> struct block_fft_record<4, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 96, 0, 592>, + block_fft_implementation< 2, 2, 2, 128, 32, 593>, + block_fft_implementation< 2, 2, 2, 128, 64, 594> + >; +}; + +template<> struct block_fft_record<8, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 595>, + block_fft_implementation< 4, 4, 2, 128, 64, 596>, + block_fft_implementation< 2, 2, 4, 128, 64, 597>, + block_fft_implementation< 4, 4, 2, 128, 128, 598>, + block_fft_implementation< 2, 2, 4, 112, 128, 599> + >; +}; + +template<> struct block_fft_record<16, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 600>, + block_fft_implementation< 4, 4, 4, 64, 256, 601>, + block_fft_implementation< 4, 4, 4, 64, 128, 602>, + block_fft_implementation< 2, 2, 8, 48, 256, 603>, + block_fft_implementation< 8, 8, 2, 128, 128, 604>, + block_fft_implementation< 8, 8, 2, 64, 256, 605>, + block_fft_implementation< 2, 2, 8, 64, 128, 606> + >; +}; + +template<> struct block_fft_record<32, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 4, 64, 256, 607>, + block_fft_implementation< 4, 4, 8, 24, 512, 608>, + block_fft_implementation< 8, 8, 4, 48, 512, 609>, + block_fft_implementation< 4, 4, 8, 27, 256, 610>, + block_fft_implementation<16, 16, 2, 11, 512, 611>, + block_fft_implementation< 2, 2, 16, 18, 512, 612>, + block_fft_implementation< 2, 2, 16, 30, 256, 613>, + 
block_fft_implementation<16, 16, 2, 64, 256, 614> + >; +}; + +template<> struct block_fft_record<64, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 16, 512, 615>, + block_fft_implementation< 4, 4, 16, 20, 512, 616>, + block_fft_implementation< 4, 4, 16, 12, 1024, 617>, + block_fft_implementation< 8, 8, 8, 13, 1024, 618>, + block_fft_implementation<16, 16, 4, 6, 1024, 619>, + block_fft_implementation<16, 16, 4, 26, 512, 620>, + block_fft_implementation< 2, 2, 32, 7, 1024, 621>, + block_fft_implementation< 2, 2, 32, 12, 512, 622> + >; +}; + +template<> struct block_fft_record<128, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 16, 8, 1024, 623>, + block_fft_implementation<16, 16, 8, 4, 2048, 624>, + block_fft_implementation<16, 16, 8, 16, 1024, 625>, + block_fft_implementation< 4, 4, 32, 5, 2048, 626>, + block_fft_implementation< 4, 4, 32, 8, 1024, 627>, + block_fft_implementation< 8, 8, 16, 8, 2048, 628>, + block_fft_implementation< 2, 2, 64, 6, 2048, 629>, + block_fft_implementation< 2, 2, 64, 6, 1024, 630> + >; +}; + +template<> struct block_fft_record<256, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 32, 4, 2048, 631>, + block_fft_implementation<16, 16, 16, 2, 4096, 632>, + block_fft_implementation<16, 16, 16, 2, 2048, 633>, + block_fft_implementation< 4, 4, 64, 2, 4096, 634>, + block_fft_implementation< 4, 4, 64, 4, 2048, 635>, + block_fft_implementation< 8, 8, 32, 4, 4096, 636>, + block_fft_implementation< 2, 2, 128, 3, 2048, 637>, + block_fft_implementation< 2, 2, 128, 2, 4096, 638> + >; +}; + +template<> struct block_fft_record<512, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using 
blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 4096, 639>, + block_fft_implementation< 8, 8, 64, 2, 8192, 640>, + block_fft_implementation<16, 16, 32, 1, 8192, 641>, + block_fft_implementation<16, 16, 32, 4, 4096, 642>, + block_fft_implementation< 4, 4, 128, 2, 4096, 643>, + block_fft_implementation< 4, 4, 128, 1, 8192, 644>, + block_fft_implementation< 2, 2, 256, 2, 4096, 645>, + block_fft_implementation< 2, 2, 256, 1, 8192, 646> + >; +}; + +template<> struct block_fft_record<1024, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 128, 1, 8192, 647>, + block_fft_implementation<16, 16, 64, 2, 8192, 648>, + block_fft_implementation< 4, 4, 256, 1, 8192, 649>, + block_fft_implementation< 4, 4, 256, 1, 16384, 650>, + block_fft_implementation<16, 16, 64, 2, 16384, 651>, + block_fft_implementation< 8, 8, 128, 1, 16384, 652>, + block_fft_implementation< 2, 2, 512, 1, 16384, 653>, + block_fft_implementation< 2, 2, 512, 1, 8192, 654> + >; +}; + +template<> struct block_fft_record<2048, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 256, 1, 16384, 655>, + block_fft_implementation<16, 16, 128, 1, 16384, 656>, + block_fft_implementation<16, 16, 128, 1, 32768, 657>, + block_fft_implementation< 8, 8, 256, 1, 32768, 658>, + block_fft_implementation< 4, 4, 512, 1, 16384, 659>, + block_fft_implementation< 4, 4, 512, 1, 32768, 660>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 661>, + block_fft_implementation< 2, 2, 1024, 1, 32768, 662> + >; +}; + +template<> struct block_fft_record<4096, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 512, 1, 32768, 663>, + block_fft_implementation<16, 16, 256, 1, 32768, 664>, + block_fft_implementation<16, 16, 256, 
1, 65536, 665>, + block_fft_implementation< 8, 8, 512, 1, 65536, 666>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 667>, + block_fft_implementation< 4, 4, 1024, 1, 65536, 668> + >; +}; + +template<> struct block_fft_record<8192, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 1, 65536, 669>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 670> + >; +}; + +template<> struct block_fft_record<3, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 671> + >; +}; + +template<> struct block_fft_record<9, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 32, 0, 672>, + block_fft_implementation< 3, 3, 3, 80, 72, 673>, + block_fft_implementation< 3, 3, 3, 64, 144, 674> + >; +}; + +template<> struct block_fft_record<27, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 128, 0, 675>, + block_fft_implementation< 9, 9, 3, 21, 216, 676>, + block_fft_implementation< 9, 9, 3, 80, 432, 677>, + block_fft_implementation< 3, 3, 9, 30, 216, 678>, + block_fft_implementation< 3, 3, 9, 23, 432, 679> + >; +}; + +template<> struct block_fft_record<81, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 1296, 680>, + block_fft_implementation< 9, 9, 9, 14, 648, 681>, + block_fft_implementation<27, 27, 3, 10, 648, 682>, + block_fft_implementation<27, 27, 3, 9, 1296, 683>, + block_fft_implementation< 3, 3, 27, 8, 1296, 684>, + block_fft_implementation< 3, 3, 27, 13, 648, 685> + >; +}; + +template<> struct block_fft_record<243, double, 
fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 1944, 686>, + block_fft_implementation<27, 27, 9, 3, 3888, 687>, + block_fft_implementation< 9, 9, 27, 9, 3888, 688>, + block_fft_implementation< 9, 9, 27, 9, 1944, 689>, + block_fft_implementation< 3, 3, 81, 3, 3888, 690>, + block_fft_implementation< 3, 3, 81, 3, 1944, 691> + >; +}; + +template<> struct block_fft_record<729, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 81, 3, 11664, 692>, + block_fft_implementation< 9, 9, 81, 3, 5832, 693>, + block_fft_implementation<27, 27, 27, 1, 5832, 694>, + block_fft_implementation<27, 27, 27, 1, 11664, 695>, + block_fft_implementation< 3, 3, 243, 1, 5832, 696>, + block_fft_implementation< 3, 3, 243, 1, 11664, 697> + >; +}; + +template<> struct block_fft_record<2187, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 17496, 698>, + block_fft_implementation< 9, 9, 243, 1, 34992, 699>, + block_fft_implementation< 3, 3, 729, 1, 17496, 700>, + block_fft_implementation< 3, 3, 729, 1, 34992, 701> + >; +}; + +template<> struct block_fft_record<6561, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 729, 1, 52488, 702> + >; +}; + +template<> struct block_fft_record<5, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 703> + >; +}; + +template<> struct block_fft_record<25, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 128, 0, 704>, + 
block_fft_implementation< 5, 5, 5, 25, 400, 705>, + block_fft_implementation< 5, 5, 5, 25, 200, 706> + >; +}; + +template<> struct block_fft_record<125, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 25, 5, 2000, 707>, + block_fft_implementation< 5, 5, 25, 5, 1000, 708>, + block_fft_implementation<25, 25, 5, 25, 1000, 709>, + block_fft_implementation<25, 25, 5, 6, 2000, 710> + >; +}; + +template<> struct block_fft_record<625, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 711>, + block_fft_implementation< 5, 5, 125, 1, 5000, 712>, + block_fft_implementation< 5, 5, 125, 1, 10000, 713>, + block_fft_implementation<25, 25, 25, 5, 10000, 714> + >; +}; + +template<> struct block_fft_record<3125, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 625, 1, 50000, 715>, + block_fft_implementation< 5, 5, 625, 1, 25000, 716> + >; +}; + +template<> struct block_fft_record<7, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 717> + >; +}; + +template<> struct block_fft_record<49, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 18, 392, 718>, + block_fft_implementation< 7, 7, 7, 18, 784, 719> + >; +}; + +template<> struct block_fft_record<343, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 5488, 720>, + block_fft_implementation< 7, 7, 49, 5, 2744, 721> + >; +}; + +template<> struct block_fft_record<2401, double, 
fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 38416, 722>, + block_fft_implementation< 7, 7, 343, 1, 19208, 723> + >; +}; + +template<> struct block_fft_record<11, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 724> + >; +}; + +template<> struct block_fft_record<121, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 1936, 725>, + block_fft_implementation<11, 11, 11, 23, 968, 726> + >; +}; + +template<> struct block_fft_record<1331, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 10648, 727>, + block_fft_implementation<11, 11, 121, 1, 21296, 728> + >; +}; + +template<> struct block_fft_record<6, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 729> + >; +}; + +template<> struct block_fft_record<36, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 21, 576, 730>, + block_fft_implementation< 6, 6, 6, 21, 288, 731> + >; +}; + +template<> struct block_fft_record<216, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 732>, + block_fft_implementation< 6, 6, 36, 7, 3456, 733> + >; +}; + +template<> struct block_fft_record<1296, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 1, 
10368, 734>, + block_fft_implementation< 6, 6, 216, 1, 20736, 735> + >; +}; + +template<> struct block_fft_record<10, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 736> + >; +}; + +template<> struct block_fft_record<100, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 1600, 737>, + block_fft_implementation<10, 10, 10, 3, 800, 738> + >; +}; + +template<> struct block_fft_record<1000, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 16000, 739>, + block_fft_implementation<10, 10, 100, 1, 8000, 740> + >; +}; + +template<> struct block_fft_record<12, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 741> + >; +}; + +template<> struct block_fft_record<144, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 10, 1152, 742>, + block_fft_implementation<12, 12, 12, 5, 2304, 743> + >; +}; + +template<> struct block_fft_record<1728, double, fft_type::c2c, fft_direction::inverse, 700> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 2, 13824, 744>, + block_fft_implementation<12, 12, 144, 3, 27648, 745> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..2a52007508302 
--- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_fwd.hpp.inc @@ -0,0 +1,601 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 64, 0, 746> + >; +}; + +template<> struct block_fft_record<14, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 64, 0, 747> + >; +}; + +template<> struct block_fft_record<15, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 64, 0, 748> + >; +}; + +template<> struct block_fft_record<17, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 64, 0, 749> + >; +}; + +template<> struct block_fft_record<18, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 64, 0, 750> + >; +}; + +template<> struct block_fft_record<19, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 64, 0, 751> + >; +}; + +template<> struct block_fft_record<20, half, 
fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 64, 0, 752> + >; +}; + +template<> struct block_fft_record<21, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 64, 0, 753> + >; +}; + +template<> struct block_fft_record<22, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 64, 0, 754> + >; +}; + +template<> struct block_fft_record<23, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 64, 0, 755> + >; +}; + +template<> struct block_fft_record<24, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 64, 0, 756> + >; +}; + +template<> struct block_fft_record<26, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 64, 0, 757> + >; +}; + +template<> struct block_fft_record<28, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 64, 0, 758> + >; +}; + +template<> struct block_fft_record<29, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 64, 0, 759> + >; +}; + +template<> struct block_fft_record<30, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 64, 0, 760> + >; +}; + +template<> struct block_fft_record<31, half, fft_type::c2c, 
fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 64, 0, 761> + >; +}; + +template<> struct block_fft_record<2, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 256, 0, 762> + >; +}; + +template<> struct block_fft_record<4, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 128, 0, 763>, + block_fft_implementation< 2, 2, 2, 256, 8, 764>, + block_fft_implementation< 2, 2, 2, 256, 16, 765> + >; +}; + +template<> struct block_fft_record<8, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 64, 0, 766>, + block_fft_implementation< 4, 4, 2, 224, 16, 767>, + block_fft_implementation< 4, 4, 2, 256, 32, 768>, + block_fft_implementation< 2, 2, 4, 192, 16, 769>, + block_fft_implementation< 2, 2, 4, 256, 32, 770> + >; +}; + +template<> struct block_fft_record<16, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 64, 0, 771>, + block_fft_implementation< 4, 4, 4, 128, 32, 772>, + block_fft_implementation< 4, 4, 4, 160, 64, 773>, + block_fft_implementation< 8, 8, 2, 192, 32, 774>, + block_fft_implementation< 8, 8, 2, 192, 64, 775>, + block_fft_implementation< 2, 2, 8, 128, 32, 776>, + block_fft_implementation< 2, 2, 8, 160, 64, 777> + >; +}; + +template<> struct block_fft_record<32, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 64, 0, 778>, + block_fft_implementation< 8, 8, 4, 60, 128, 779>, + block_fft_implementation< 8, 8, 4, 96, 64, 780>, + block_fft_implementation< 
4, 4, 8, 64, 128, 781>, + block_fft_implementation< 4, 4, 8, 128, 64, 782>, + block_fft_implementation< 2, 2, 16, 60, 128, 783>, + block_fft_implementation<16, 16, 2, 192, 128, 784>, + block_fft_implementation< 2, 2, 16, 48, 64, 785>, + block_fft_implementation<16, 16, 2, 54, 64, 786> + >; +}; + +template<> struct block_fft_record<64, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 32, 256, 787>, + block_fft_implementation< 8, 8, 8, 32, 128, 788>, + block_fft_implementation< 4, 4, 16, 24, 256, 789>, + block_fft_implementation< 4, 4, 16, 32, 128, 790>, + block_fft_implementation<16, 16, 4, 62, 256, 791>, + block_fft_implementation<16, 16, 4, 28, 128, 792>, + block_fft_implementation<32, 32, 2, 56, 256, 793>, + block_fft_implementation< 2, 2, 32, 24, 256, 794>, + block_fft_implementation< 2, 2, 32, 24, 128, 795>, + block_fft_implementation<32, 32, 2, 60, 128, 796> + >; +}; + +template<> struct block_fft_record<128, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 16, 256, 797>, + block_fft_implementation< 8, 8, 16, 16, 512, 798>, + block_fft_implementation<16, 16, 8, 32, 512, 799>, + block_fft_implementation< 8, 8, 16, 16, 256, 800>, + block_fft_implementation< 4, 4, 32, 8, 512, 801>, + block_fft_implementation< 4, 4, 32, 16, 256, 802>, + block_fft_implementation<32, 32, 4, 30, 512, 803>, + block_fft_implementation<32, 32, 4, 32, 256, 804>, + block_fft_implementation< 2, 2, 64, 10, 512, 805>, + block_fft_implementation< 2, 2, 64, 10, 256, 806> + >; +}; + +template<> struct block_fft_record<256, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 8, 512, 807>, + block_fft_implementation< 8, 8, 32, 4, 1024, 808>, + block_fft_implementation< 8, 8, 32, 8, 
512, 809>, + block_fft_implementation<16, 16, 16, 8, 1024, 810>, + block_fft_implementation<32, 32, 8, 16, 1024, 811>, + block_fft_implementation< 4, 4, 64, 4, 1024, 812>, + block_fft_implementation< 4, 4, 64, 8, 512, 813>, + block_fft_implementation<32, 32, 8, 16, 512, 814>, + block_fft_implementation< 2, 2, 128, 6, 1024, 815>, + block_fft_implementation< 2, 2, 128, 6, 512, 816> + >; +}; + +template<> struct block_fft_record<512, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 2048, 817>, + block_fft_implementation< 8, 8, 64, 2, 1024, 818>, + block_fft_implementation<32, 32, 16, 4, 2048, 820>, + block_fft_implementation<16, 16, 32, 4, 1024, 819>, + block_fft_implementation<16, 16, 32, 4, 2048, 822>, + block_fft_implementation<32, 32, 16, 8, 1024, 821>, + block_fft_implementation< 4, 4, 128, 2, 2048, 823>, + block_fft_implementation< 4, 4, 128, 4, 1024, 824>, + block_fft_implementation< 2, 2, 256, 4, 2048, 825>, + block_fft_implementation< 2, 2, 256, 2, 1024, 826> + >; +}; + +template<> struct block_fft_record<1024, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 32, 2, 4096, 829>, + block_fft_implementation<16, 16, 64, 2, 2048, 827>, + block_fft_implementation< 8, 8, 128, 2, 2048, 830>, + block_fft_implementation<32, 32, 32, 4, 2048, 831>, + block_fft_implementation< 8, 8, 128, 2, 4096, 828>, + block_fft_implementation<16, 16, 64, 2, 4096, 832>, + block_fft_implementation< 4, 4, 256, 2, 4096, 833>, + block_fft_implementation< 4, 4, 256, 2, 2048, 834>, + block_fft_implementation< 2, 2, 512, 2, 4096, 835>, + block_fft_implementation< 2, 2, 512, 2, 2048, 836> + >; +}; + +template<> struct block_fft_record<2048, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<16, 16, 128, 2, 4096, 837>, + block_fft_implementation< 8, 8, 256, 2, 4096, 839>, + block_fft_implementation< 8, 8, 256, 2, 8192, 838>, + block_fft_implementation<16, 16, 128, 2, 8192, 840>, + block_fft_implementation<32, 32, 64, 2, 8192, 841>, + block_fft_implementation<32, 32, 64, 2, 4096, 842>, + block_fft_implementation< 4, 4, 512, 2, 8192, 843>, + block_fft_implementation< 4, 4, 512, 2, 4096, 844>, + block_fft_implementation< 2, 2, 1024, 2, 8192, 845>, + block_fft_implementation< 2, 2, 1024, 2, 4096, 846> + >; +}; + +template<> struct block_fft_record<4096, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 2, 8192, 847>, + block_fft_implementation< 8, 8, 512, 2, 8192, 849>, + block_fft_implementation<16, 16, 256, 2, 16384, 850>, + block_fft_implementation<32, 32, 128, 2, 16384, 851>, + block_fft_implementation< 8, 8, 512, 2, 16384, 848>, + block_fft_implementation<32, 32, 128, 2, 8192, 852>, + block_fft_implementation< 4, 4, 1024, 2, 16384, 853>, + block_fft_implementation< 4, 4, 1024, 2, 8192, 854> + >; +}; + +template<> struct block_fft_record<8192, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 256, 2, 32768, 856>, + block_fft_implementation<32, 32, 256, 2, 16384, 857>, + block_fft_implementation<16, 16, 512, 2, 16384, 855>, + block_fft_implementation<16, 16, 512, 2, 32768, 859>, + block_fft_implementation< 8, 8, 1024, 2, 16384, 860>, + block_fft_implementation< 8, 8, 1024, 2, 32768, 858> + >; +}; + +template<> struct block_fft_record<16384, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 512, 2, 65536, 1174>, + block_fft_implementation<16, 16, 1024, 2, 32768, 861>, + block_fft_implementation<16, 16, 1024, 2, 65536, 
1175>, + block_fft_implementation<32, 32, 512, 2, 32768, 862> + >; +}; + +template<> struct block_fft_record<32768, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1024, 2, 65536, 1176> + >; +}; + +template<> struct block_fft_record<3, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 863> + >; +}; + +template<> struct block_fft_record<9, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 64, 0, 864>, + block_fft_implementation< 3, 3, 3, 42, 36, 865>, + block_fft_implementation< 3, 3, 3, 128, 18, 866> + >; +}; + +template<> struct block_fft_record<27, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 64, 0, 867>, + block_fft_implementation< 9, 9, 3, 42, 108, 869>, + block_fft_implementation< 9, 9, 3, 42, 54, 868>, + block_fft_implementation< 3, 3, 9, 56, 108, 870>, + block_fft_implementation< 3, 3, 9, 56, 54, 871> + >; +}; + +template<> struct block_fft_record<81, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 324, 872>, + block_fft_implementation< 9, 9, 9, 14, 162, 873>, + block_fft_implementation<27, 27, 3, 42, 324, 875>, + block_fft_implementation<27, 27, 3, 42, 162, 874>, + block_fft_implementation< 3, 3, 27, 14, 324, 876>, + block_fft_implementation< 3, 3, 27, 28, 162, 877> + >; +}; + +template<> struct block_fft_record<243, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 972, 878>, + block_fft_implementation<27, 
27, 9, 14, 486, 879>, + block_fft_implementation< 9, 9, 27, 28, 972, 880>, + block_fft_implementation< 9, 9, 27, 18, 486, 881>, + block_fft_implementation< 3, 3, 81, 6, 972, 882>, + block_fft_implementation< 3, 3, 81, 6, 486, 883> + >; +}; + +template<> struct block_fft_record<729, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 28, 2916, 884>, + block_fft_implementation<27, 27, 27, 18, 1458, 885>, + block_fft_implementation< 9, 9, 81, 6, 2916, 886>, + block_fft_implementation< 9, 9, 81, 6, 1458, 887>, + block_fft_implementation< 3, 3, 243, 2, 2916, 888>, + block_fft_implementation< 3, 3, 243, 2, 1458, 889> + >; +}; + +template<> struct block_fft_record<2187, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 81, 6, 8748, 891>, + block_fft_implementation< 9, 9, 243, 2, 8748, 890>, + block_fft_implementation< 9, 9, 243, 2, 4374, 892>, + block_fft_implementation<27, 27, 81, 6, 4374, 893>, + block_fft_implementation< 3, 3, 729, 2, 8748, 894>, + block_fft_implementation< 3, 3, 729, 2, 4374, 895> + >; +}; + +template<> struct block_fft_record<6561, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 2, 26244, 897>, + block_fft_implementation<27, 27, 243, 2, 13122, 896>, + block_fft_implementation< 9, 9, 729, 2, 26244, 899>, + block_fft_implementation< 9, 9, 729, 2, 13122, 898> + >; +}; + +template<> struct block_fft_record<19683, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 729, 2, 78732, 1177>, + block_fft_implementation<27, 27, 729, 2, 39366, 1178> + >; +}; + +template<> struct block_fft_record<5, half, fft_type::c2c, fft_direction::forward, 800> 
{ + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 128, 0, 900> + >; +}; + +template<> struct block_fft_record<25, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 64, 0, 901>, + block_fft_implementation< 5, 5, 5, 50, 100, 902>, + block_fft_implementation< 5, 5, 5, 50, 50, 903> + >; +}; + +template<> struct block_fft_record<125, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 12, 500, 905>, + block_fft_implementation<25, 25, 5, 50, 250, 904>, + block_fft_implementation< 5, 5, 25, 10, 500, 906>, + block_fft_implementation< 5, 5, 25, 10, 250, 907> + >; +}; + +template<> struct block_fft_record<625, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 10, 2500, 909>, + block_fft_implementation<25, 25, 25, 10, 1250, 908>, + block_fft_implementation< 5, 5, 125, 2, 2500, 910>, + block_fft_implementation< 5, 5, 125, 2, 1250, 911> + >; +}; + +template<> struct block_fft_record<3125, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 2, 12500, 913>, + block_fft_implementation<25, 25, 125, 2, 6250, 912>, + block_fft_implementation< 5, 5, 625, 2, 12500, 914>, + block_fft_implementation< 5, 5, 625, 2, 6250, 915> + >; +}; + +template<> struct block_fft_record<15625, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 2, 62500, 1179>, + block_fft_implementation<25, 25, 625, 2, 31250, 916> + >; +}; + +template<> struct block_fft_record<7, half, fft_type::c2c, fft_direction::forward, 800> { + static 
constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 64, 0, 917> + >; +}; + +template<> struct block_fft_record<49, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 18, 196, 918>, + block_fft_implementation< 7, 7, 7, 18, 98, 919> + >; +}; + +template<> struct block_fft_record<343, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 10, 1372, 920>, + block_fft_implementation< 7, 7, 49, 10, 686, 921> + >; +}; + +template<> struct block_fft_record<2401, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 2, 9604, 922>, + block_fft_implementation< 7, 7, 343, 2, 4802, 923> + >; +}; + +template<> struct block_fft_record<11, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 64, 0, 924> + >; +}; + +template<> struct block_fft_record<121, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 46, 484, 925>, + block_fft_implementation<11, 11, 11, 46, 242, 926> + >; +}; + +template<> struct block_fft_record<1331, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 2, 5324, 928>, + block_fft_implementation<11, 11, 121, 2, 2662, 927> + >; +}; + +template<> struct block_fft_record<6, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 128, 0, 929> + >; +}; + +template<> struct block_fft_record<36, half, 
fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 42, 144, 931>, + block_fft_implementation< 6, 6, 6, 42, 72, 930> + >; +}; + +template<> struct block_fft_record<216, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 14, 864, 932>, + block_fft_implementation< 6, 6, 36, 14, 432, 933> + >; +}; + +template<> struct block_fft_record<1296, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 8, 5184, 934>, + block_fft_implementation< 6, 6, 216, 2, 2592, 935> + >; +}; + +template<> struct block_fft_record<10, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 64, 0, 936> + >; +}; + +template<> struct block_fft_record<100, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 6, 400, 938>, + block_fft_implementation<10, 10, 10, 6, 200, 937> + >; +}; + +template<> struct block_fft_record<1000, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 10, 4000, 939>, + block_fft_implementation<10, 10, 100, 10, 2000, 940> + >; +}; + +template<> struct block_fft_record<10000, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 2, 40000, 941>, + block_fft_implementation<10, 10, 1000, 2, 20000, 942> + >; +}; + +template<> struct block_fft_record<12, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation<12, 12, 1, 64, 0, 943> + >; +}; + +template<> struct block_fft_record<144, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 42, 576, 944>, + block_fft_implementation<12, 12, 12, 10, 288, 945> + >; +}; + +template<> struct block_fft_record<1728, half, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 14, 6912, 946>, + block_fft_implementation<12, 12, 144, 8, 3456, 947> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..eef3bbf42d27a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp16_inv.hpp.inc @@ -0,0 +1,601 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +template<> struct block_fft_record<13, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 64, 0, 948> + >; +}; + +template<> struct block_fft_record<14, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 64, 0, 949> + >; +}; + +template<> struct block_fft_record<15, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 64, 0, 950> + >; +}; + +template<> struct block_fft_record<17, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 64, 0, 951> + >; +}; + +template<> struct block_fft_record<18, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 64, 0, 952> + >; +}; + +template<> struct block_fft_record<19, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 64, 0, 953> + >; +}; + +template<> struct block_fft_record<20, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 64, 0, 954> + >; +}; + +template<> struct block_fft_record<21, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 64, 0, 955> + >; +}; + +template<> struct block_fft_record<22, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 64, 0, 956> + >; +}; + +template<> 
struct block_fft_record<23, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 64, 0, 957> + >; +}; + +template<> struct block_fft_record<24, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 64, 0, 958> + >; +}; + +template<> struct block_fft_record<26, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 64, 0, 959> + >; +}; + +template<> struct block_fft_record<28, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 64, 0, 960> + >; +}; + +template<> struct block_fft_record<29, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 64, 0, 961> + >; +}; + +template<> struct block_fft_record<30, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 64, 0, 962> + >; +}; + +template<> struct block_fft_record<31, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 64, 0, 963> + >; +}; + +template<> struct block_fft_record<2, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 256, 0, 964> + >; +}; + +template<> struct block_fft_record<4, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 128, 0, 965>, + block_fft_implementation< 2, 2, 2, 256, 
8, 966>, + block_fft_implementation< 2, 2, 2, 256, 16, 967> + >; +}; + +template<> struct block_fft_record<8, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 64, 0, 968>, + block_fft_implementation< 4, 4, 2, 224, 16, 969>, + block_fft_implementation< 4, 4, 2, 256, 32, 970>, + block_fft_implementation< 2, 2, 4, 192, 16, 971>, + block_fft_implementation< 2, 2, 4, 256, 32, 972> + >; +}; + +template<> struct block_fft_record<16, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 64, 0, 973>, + block_fft_implementation< 4, 4, 4, 128, 32, 974>, + block_fft_implementation< 4, 4, 4, 160, 64, 975>, + block_fft_implementation< 8, 8, 2, 192, 32, 976>, + block_fft_implementation< 8, 8, 2, 192, 64, 977>, + block_fft_implementation< 2, 2, 8, 128, 32, 978>, + block_fft_implementation< 2, 2, 8, 160, 64, 979> + >; +}; + +template<> struct block_fft_record<32, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 64, 0, 980>, + block_fft_implementation< 8, 8, 4, 60, 128, 981>, + block_fft_implementation< 8, 8, 4, 96, 64, 982>, + block_fft_implementation< 4, 4, 8, 64, 128, 983>, + block_fft_implementation< 4, 4, 8, 128, 64, 984>, + block_fft_implementation< 2, 2, 16, 60, 128, 985>, + block_fft_implementation<16, 16, 2, 192, 128, 986>, + block_fft_implementation< 2, 2, 16, 48, 64, 987>, + block_fft_implementation<16, 16, 2, 54, 64, 988> + >; +}; + +template<> struct block_fft_record<64, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 32, 256, 989>, + block_fft_implementation< 8, 8, 8, 32, 128, 990>, + block_fft_implementation< 4, 4, 16, 24, 256, 991>, + 
block_fft_implementation< 4, 4, 16, 32, 128, 992>, + block_fft_implementation<16, 16, 4, 62, 256, 993>, + block_fft_implementation<16, 16, 4, 28, 128, 994>, + block_fft_implementation<32, 32, 2, 56, 256, 995>, + block_fft_implementation< 2, 2, 32, 24, 256, 996>, + block_fft_implementation< 2, 2, 32, 24, 128, 997>, + block_fft_implementation<32, 32, 2, 60, 128, 998> + >; +}; + +template<> struct block_fft_record<128, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 16, 256, 999>, + block_fft_implementation< 8, 8, 16, 16, 512, 1000>, + block_fft_implementation<16, 16, 8, 32, 512, 1001>, + block_fft_implementation< 8, 8, 16, 16, 256, 1002>, + block_fft_implementation< 4, 4, 32, 8, 512, 1003>, + block_fft_implementation< 4, 4, 32, 16, 256, 1004>, + block_fft_implementation<32, 32, 4, 30, 512, 1005>, + block_fft_implementation<32, 32, 4, 32, 256, 1006>, + block_fft_implementation< 2, 2, 64, 10, 512, 1007>, + block_fft_implementation< 2, 2, 64, 10, 256, 1008> + >; +}; + +template<> struct block_fft_record<256, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 8, 512, 1009>, + block_fft_implementation< 8, 8, 32, 4, 1024, 1010>, + block_fft_implementation< 8, 8, 32, 8, 512, 1011>, + block_fft_implementation<16, 16, 16, 8, 1024, 1012>, + block_fft_implementation<32, 32, 8, 16, 1024, 1013>, + block_fft_implementation< 4, 4, 64, 4, 1024, 1014>, + block_fft_implementation< 4, 4, 64, 8, 512, 1015>, + block_fft_implementation<32, 32, 8, 16, 512, 1016>, + block_fft_implementation< 2, 2, 128, 6, 1024, 1017>, + block_fft_implementation< 2, 2, 128, 6, 512, 1018> + >; +}; + +template<> struct block_fft_record<512, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 2, 
2048, 1019>, + block_fft_implementation< 8, 8, 64, 2, 1024, 1020>, + block_fft_implementation<32, 32, 16, 4, 2048, 1022>, + block_fft_implementation<16, 16, 32, 4, 1024, 1021>, + block_fft_implementation<16, 16, 32, 4, 2048, 1024>, + block_fft_implementation<32, 32, 16, 8, 1024, 1023>, + block_fft_implementation< 4, 4, 128, 2, 2048, 1025>, + block_fft_implementation< 4, 4, 128, 4, 1024, 1026>, + block_fft_implementation< 2, 2, 256, 4, 2048, 1027>, + block_fft_implementation< 2, 2, 256, 2, 1024, 1028> + >; +}; + +template<> struct block_fft_record<1024, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 32, 2, 4096, 1031>, + block_fft_implementation<16, 16, 64, 2, 2048, 1029>, + block_fft_implementation< 8, 8, 128, 2, 2048, 1032>, + block_fft_implementation<32, 32, 32, 4, 2048, 1033>, + block_fft_implementation< 8, 8, 128, 2, 4096, 1030>, + block_fft_implementation<16, 16, 64, 2, 4096, 1034>, + block_fft_implementation< 4, 4, 256, 2, 4096, 1035>, + block_fft_implementation< 4, 4, 256, 2, 2048, 1036>, + block_fft_implementation< 2, 2, 512, 2, 4096, 1037>, + block_fft_implementation< 2, 2, 512, 2, 2048, 1038> + >; +}; + +template<> struct block_fft_record<2048, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 2, 4096, 1039>, + block_fft_implementation< 8, 8, 256, 2, 4096, 1041>, + block_fft_implementation< 8, 8, 256, 2, 8192, 1040>, + block_fft_implementation<16, 16, 128, 2, 8192, 1042>, + block_fft_implementation<32, 32, 64, 2, 8192, 1043>, + block_fft_implementation<32, 32, 64, 2, 4096, 1044>, + block_fft_implementation< 4, 4, 512, 2, 8192, 1045>, + block_fft_implementation< 4, 4, 512, 2, 4096, 1046>, + block_fft_implementation< 2, 2, 1024, 2, 8192, 1047>, + block_fft_implementation< 2, 2, 1024, 2, 4096, 1048> + >; +}; + +template<> struct 
block_fft_record<4096, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 2, 8192, 1049>, + block_fft_implementation< 8, 8, 512, 2, 8192, 1051>, + block_fft_implementation<16, 16, 256, 2, 16384, 1052>, + block_fft_implementation<32, 32, 128, 2, 16384, 1053>, + block_fft_implementation< 8, 8, 512, 2, 16384, 1050>, + block_fft_implementation<32, 32, 128, 2, 8192, 1054>, + block_fft_implementation< 4, 4, 1024, 2, 16384, 1055>, + block_fft_implementation< 4, 4, 1024, 2, 8192, 1056> + >; +}; + +template<> struct block_fft_record<8192, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 256, 2, 32768, 1058>, + block_fft_implementation<32, 32, 256, 2, 16384, 1059>, + block_fft_implementation<16, 16, 512, 2, 16384, 1057>, + block_fft_implementation<16, 16, 512, 2, 32768, 1061>, + block_fft_implementation< 8, 8, 1024, 2, 16384, 1062>, + block_fft_implementation< 8, 8, 1024, 2, 32768, 1060> + >; +}; + +template<> struct block_fft_record<16384, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 512, 2, 65536, 1180>, + block_fft_implementation<16, 16, 1024, 2, 32768, 1063>, + block_fft_implementation<16, 16, 1024, 2, 65536, 1181>, + block_fft_implementation<32, 32, 512, 2, 32768, 1064> + >; +}; + +template<> struct block_fft_record<32768, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1024, 2, 65536, 1182> + >; +}; + +template<> struct block_fft_record<3, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 1065> + >; +}; + +template<> struct 
block_fft_record<9, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 64, 0, 1066>, + block_fft_implementation< 3, 3, 3, 42, 36, 1067>, + block_fft_implementation< 3, 3, 3, 128, 18, 1068> + >; +}; + +template<> struct block_fft_record<27, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 64, 0, 1069>, + block_fft_implementation< 9, 9, 3, 42, 108, 1071>, + block_fft_implementation< 9, 9, 3, 42, 54, 1070>, + block_fft_implementation< 3, 3, 9, 56, 108, 1072>, + block_fft_implementation< 3, 3, 9, 56, 54, 1073> + >; +}; + +template<> struct block_fft_record<81, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 14, 324, 1074>, + block_fft_implementation< 9, 9, 9, 14, 162, 1075>, + block_fft_implementation<27, 27, 3, 42, 324, 1077>, + block_fft_implementation<27, 27, 3, 42, 162, 1076>, + block_fft_implementation< 3, 3, 27, 14, 324, 1078>, + block_fft_implementation< 3, 3, 27, 28, 162, 1079> + >; +}; + +template<> struct block_fft_record<243, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 14, 972, 1080>, + block_fft_implementation<27, 27, 9, 14, 486, 1081>, + block_fft_implementation< 9, 9, 27, 28, 972, 1082>, + block_fft_implementation< 9, 9, 27, 18, 486, 1083>, + block_fft_implementation< 3, 3, 81, 6, 972, 1084>, + block_fft_implementation< 3, 3, 81, 6, 486, 1085> + >; +}; + +template<> struct block_fft_record<729, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 28, 2916, 1086>, + block_fft_implementation<27, 27, 27, 18, 1458, 1087>, + 
block_fft_implementation< 9, 9, 81, 6, 2916, 1088>, + block_fft_implementation< 9, 9, 81, 6, 1458, 1089>, + block_fft_implementation< 3, 3, 243, 2, 2916, 1090>, + block_fft_implementation< 3, 3, 243, 2, 1458, 1091> + >; +}; + +template<> struct block_fft_record<2187, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 81, 6, 8748, 1093>, + block_fft_implementation< 9, 9, 243, 2, 8748, 1092>, + block_fft_implementation< 9, 9, 243, 2, 4374, 1094>, + block_fft_implementation<27, 27, 81, 6, 4374, 1095>, + block_fft_implementation< 3, 3, 729, 2, 8748, 1096>, + block_fft_implementation< 3, 3, 729, 2, 4374, 1097> + >; +}; + +template<> struct block_fft_record<6561, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 2, 26244, 1099>, + block_fft_implementation<27, 27, 243, 2, 13122, 1098>, + block_fft_implementation< 9, 9, 729, 2, 26244, 1101>, + block_fft_implementation< 9, 9, 729, 2, 13122, 1100> + >; +}; + +template<> struct block_fft_record<19683, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 729, 2, 78732, 1183>, + block_fft_implementation<27, 27, 729, 2, 39366, 1184> + >; +}; + +template<> struct block_fft_record<5, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 128, 0, 1102> + >; +}; + +template<> struct block_fft_record<25, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 64, 0, 1103>, + block_fft_implementation< 5, 5, 5, 50, 100, 1104>, + block_fft_implementation< 5, 5, 5, 50, 50, 1105> + >; +}; + +template<> struct block_fft_record<125, 
half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 12, 500, 1107>, + block_fft_implementation<25, 25, 5, 50, 250, 1106>, + block_fft_implementation< 5, 5, 25, 10, 500, 1108>, + block_fft_implementation< 5, 5, 25, 10, 250, 1109> + >; +}; + +template<> struct block_fft_record<625, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 10, 2500, 1111>, + block_fft_implementation<25, 25, 25, 10, 1250, 1110>, + block_fft_implementation< 5, 5, 125, 2, 2500, 1112>, + block_fft_implementation< 5, 5, 125, 2, 1250, 1113> + >; +}; + +template<> struct block_fft_record<3125, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 2, 12500, 1115>, + block_fft_implementation<25, 25, 125, 2, 6250, 1114>, + block_fft_implementation< 5, 5, 625, 2, 12500, 1116>, + block_fft_implementation< 5, 5, 625, 2, 6250, 1117> + >; +}; + +template<> struct block_fft_record<15625, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 2, 62500, 1185>, + block_fft_implementation<25, 25, 625, 2, 31250, 1118> + >; +}; + +template<> struct block_fft_record<7, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 64, 0, 1119> + >; +}; + +template<> struct block_fft_record<49, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 18, 196, 1120>, + block_fft_implementation< 7, 7, 7, 18, 98, 1121> + >; +}; + +template<> struct block_fft_record<343, half, fft_type::c2c, fft_direction::inverse, 
800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 10, 1372, 1122>, + block_fft_implementation< 7, 7, 49, 10, 686, 1123> + >; +}; + +template<> struct block_fft_record<2401, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 2, 9604, 1124>, + block_fft_implementation< 7, 7, 343, 2, 4802, 1125> + >; +}; + +template<> struct block_fft_record<11, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 64, 0, 1126> + >; +}; + +template<> struct block_fft_record<121, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 46, 484, 1127>, + block_fft_implementation<11, 11, 11, 46, 242, 1128> + >; +}; + +template<> struct block_fft_record<1331, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 2, 5324, 1130>, + block_fft_implementation<11, 11, 121, 2, 2662, 1129> + >; +}; + +template<> struct block_fft_record<6, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 128, 0, 1131> + >; +}; + +template<> struct block_fft_record<36, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 42, 144, 1133>, + block_fft_implementation< 6, 6, 6, 42, 72, 1132> + >; +}; + +template<> struct block_fft_record<216, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 14, 864, 1134>, + block_fft_implementation< 6, 6, 
36, 14, 432, 1135> + >; +}; + +template<> struct block_fft_record<1296, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 8, 5184, 1136>, + block_fft_implementation< 6, 6, 216, 2, 2592, 1137> + >; +}; + +template<> struct block_fft_record<10, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 64, 0, 1138> + >; +}; + +template<> struct block_fft_record<100, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 6, 400, 1140>, + block_fft_implementation<10, 10, 10, 6, 200, 1139> + >; +}; + +template<> struct block_fft_record<1000, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 10, 4000, 1141>, + block_fft_implementation<10, 10, 100, 10, 2000, 1142> + >; +}; + +template<> struct block_fft_record<10000, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 2, 40000, 1143>, + block_fft_implementation<10, 10, 1000, 2, 20000, 1144> + >; +}; + +template<> struct block_fft_record<12, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 64, 0, 1145> + >; +}; + +template<> struct block_fft_record<144, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 42, 576, 1146>, + block_fft_implementation<12, 12, 12, 10, 288, 1147> + >; +}; + +template<> struct block_fft_record<1728, half, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr 
bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 14, 6912, 1148>, + block_fft_implementation<12, 12, 144, 8, 3456, 1149> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..b13a14ed2dd1a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_fwd.hpp.inc @@ -0,0 +1,601 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +template<> struct block_fft_record<13, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 0> + >; +}; + +template<> struct block_fft_record<14, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 1> + >; +}; + +template<> struct block_fft_record<15, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 2> + >; +}; + +template<> struct block_fft_record<17, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 3> + >; +}; + +template<> struct block_fft_record<18, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 4> + >; +}; + +template<> struct block_fft_record<19, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 5> + >; +}; + +template<> struct block_fft_record<20, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 6> + >; +}; + +template<> struct block_fft_record<21, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 7> + >; +}; + +template<> struct block_fft_record<22, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 8> + >; +}; + +template<> struct 
block_fft_record<23, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 9> + >; +}; + +template<> struct block_fft_record<24, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 10> + >; +}; + +template<> struct block_fft_record<26, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 11> + >; +}; + +template<> struct block_fft_record<28, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 12> + >; +}; + +template<> struct block_fft_record<29, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 13> + >; +}; + +template<> struct block_fft_record<30, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 14> + >; +}; + +template<> struct block_fft_record<31, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 15> + >; +}; + +template<> struct block_fft_record<2, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 128, 0, 16> + >; +}; + +template<> struct block_fft_record<4, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 64, 0, 17>, + block_fft_implementation< 2, 2, 2, 128, 16, 18>, + 
block_fft_implementation< 2, 2, 2, 128, 32, 19> + >; +}; + +template<> struct block_fft_record<8, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 20>, + block_fft_implementation< 4, 4, 2, 112, 32, 21>, + block_fft_implementation< 4, 4, 2, 128, 64, 22>, + block_fft_implementation< 2, 2, 4, 96, 32, 23>, + block_fft_implementation< 2, 2, 4, 128, 64, 24> + >; +}; + +template<> struct block_fft_record<16, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 25>, + block_fft_implementation< 4, 4, 4, 64, 64, 26>, + block_fft_implementation< 4, 4, 4, 80, 128, 27>, + block_fft_implementation< 8, 8, 2, 96, 64, 28>, + block_fft_implementation< 8, 8, 2, 96, 128, 29>, + block_fft_implementation< 2, 2, 8, 64, 64, 30>, + block_fft_implementation< 2, 2, 8, 80, 128, 31> + >; +}; + +template<> struct block_fft_record<32, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 32, 0, 32>, + block_fft_implementation< 8, 8, 4, 30, 256, 33>, + block_fft_implementation< 8, 8, 4, 48, 128, 34>, + block_fft_implementation< 4, 4, 8, 32, 256, 35>, + block_fft_implementation< 4, 4, 8, 64, 128, 36>, + block_fft_implementation< 2, 2, 16, 30, 256, 37>, + block_fft_implementation<16, 16, 2, 96, 256, 38>, + block_fft_implementation< 2, 2, 16, 24, 128, 39>, + block_fft_implementation<16, 16, 2, 27, 128, 40> + >; +}; + +template<> struct block_fft_record<64, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 16, 512, 41>, + block_fft_implementation< 8, 8, 8, 16, 256, 42>, + block_fft_implementation< 4, 4, 16, 12, 512, 43>, + block_fft_implementation< 4, 4, 16, 16, 256, 
44>, + block_fft_implementation<16, 16, 4, 31, 512, 45>, + block_fft_implementation<16, 16, 4, 14, 256, 46>, + block_fft_implementation<32, 32, 2, 28, 512, 47>, + block_fft_implementation< 2, 2, 32, 12, 512, 48>, + block_fft_implementation< 2, 2, 32, 12, 256, 49>, + block_fft_implementation<32, 32, 2, 30, 256, 50> + >; +}; + +template<> struct block_fft_record<128, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 8, 512, 51>, + block_fft_implementation< 8, 8, 16, 8, 1024, 52>, + block_fft_implementation<16, 16, 8, 16, 1024, 53>, + block_fft_implementation< 8, 8, 16, 8, 512, 54>, + block_fft_implementation< 4, 4, 32, 4, 1024, 55>, + block_fft_implementation< 4, 4, 32, 8, 512, 56>, + block_fft_implementation<32, 32, 4, 15, 1024, 57>, + block_fft_implementation<32, 32, 4, 16, 512, 58>, + block_fft_implementation< 2, 2, 64, 5, 1024, 59>, + block_fft_implementation< 2, 2, 64, 5, 512, 60> + >; +}; + +template<> struct block_fft_record<256, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 4, 1024, 61>, + block_fft_implementation< 8, 8, 32, 2, 2048, 62>, + block_fft_implementation< 8, 8, 32, 4, 1024, 63>, + block_fft_implementation<16, 16, 16, 4, 2048, 64>, + block_fft_implementation<32, 32, 8, 8, 2048, 65>, + block_fft_implementation< 4, 4, 64, 2, 2048, 66>, + block_fft_implementation< 4, 4, 64, 4, 1024, 67>, + block_fft_implementation<32, 32, 8, 8, 1024, 68>, + block_fft_implementation< 2, 2, 128, 3, 2048, 69>, + block_fft_implementation< 2, 2, 128, 3, 1024, 70> + >; +}; + +template<> struct block_fft_record<512, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 1, 4096, 71>, + block_fft_implementation< 8, 8, 64, 1, 2048, 72>, + 
block_fft_implementation<32, 32, 16, 2, 4096, 74>, + block_fft_implementation<16, 16, 32, 2, 2048, 73>, + block_fft_implementation<16, 16, 32, 2, 4096, 76>, + block_fft_implementation<32, 32, 16, 4, 2048, 75>, + block_fft_implementation< 4, 4, 128, 1, 4096, 77>, + block_fft_implementation< 4, 4, 128, 2, 2048, 78>, + block_fft_implementation< 2, 2, 256, 2, 4096, 79>, + block_fft_implementation< 2, 2, 256, 1, 2048, 80> + >; +}; + +template<> struct block_fft_record<1024, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 32, 1, 8192, 83>, + block_fft_implementation<16, 16, 64, 1, 4096, 81>, + block_fft_implementation< 8, 8, 128, 1, 4096, 84>, + block_fft_implementation<32, 32, 32, 2, 4096, 85>, + block_fft_implementation< 8, 8, 128, 1, 8192, 82>, + block_fft_implementation<16, 16, 64, 1, 8192, 86>, + block_fft_implementation< 4, 4, 256, 1, 8192, 87>, + block_fft_implementation< 4, 4, 256, 1, 4096, 88>, + block_fft_implementation< 2, 2, 512, 1, 8192, 89>, + block_fft_implementation< 2, 2, 512, 1, 4096, 90> + >; +}; + +template<> struct block_fft_record<2048, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 1, 8192, 91>, + block_fft_implementation< 8, 8, 256, 1, 8192, 93>, + block_fft_implementation< 8, 8, 256, 1, 16384, 92>, + block_fft_implementation<16, 16, 128, 1, 16384, 94>, + block_fft_implementation<32, 32, 64, 1, 16384, 95>, + block_fft_implementation<32, 32, 64, 1, 8192, 96>, + block_fft_implementation< 4, 4, 512, 1, 16384, 97>, + block_fft_implementation< 4, 4, 512, 1, 8192, 98>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 99>, + block_fft_implementation< 2, 2, 1024, 1, 8192, 100> + >; +}; + +template<> struct block_fft_record<4096, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation<16, 16, 256, 1, 16384, 101>, + block_fft_implementation< 8, 8, 512, 1, 16384, 103>, + block_fft_implementation<16, 16, 256, 1, 32768, 104>, + block_fft_implementation<32, 32, 128, 1, 32768, 105>, + block_fft_implementation< 8, 8, 512, 1, 32768, 102>, + block_fft_implementation<32, 32, 128, 1, 16384, 106>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 107>, + block_fft_implementation< 4, 4, 1024, 1, 16384, 108> + >; +}; + +template<> struct block_fft_record<8192, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 256, 1, 65536, 110>, + block_fft_implementation<32, 32, 256, 1, 32768, 111>, + block_fft_implementation<16, 16, 512, 1, 32768, 109>, + block_fft_implementation<16, 16, 512, 1, 65536, 113>, + block_fft_implementation< 8, 8, 1024, 1, 32768, 114>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 112> + >; +}; + +template<> struct block_fft_record<16384, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 512, 1, 131072, 1150>, + block_fft_implementation<16, 16, 1024, 1, 65536, 115>, + block_fft_implementation<16, 16, 1024, 1, 131072, 1151>, + block_fft_implementation<32, 32, 512, 1, 65536, 116> + >; +}; + +template<> struct block_fft_record<32768, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1024, 1, 131072, 1152> + >; +}; + +template<> struct block_fft_record<3, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 32, 0, 117> + >; +}; + +template<> struct block_fft_record<9, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation< 9, 9, 1, 32, 0, 118>, + block_fft_implementation< 3, 3, 3, 21, 72, 119>, + block_fft_implementation< 3, 3, 3, 64, 36, 120> + >; +}; + +template<> struct block_fft_record<27, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 121>, + block_fft_implementation< 9, 9, 3, 21, 216, 123>, + block_fft_implementation< 9, 9, 3, 21, 108, 122>, + block_fft_implementation< 3, 3, 9, 28, 216, 124>, + block_fft_implementation< 3, 3, 9, 28, 108, 125> + >; +}; + +template<> struct block_fft_record<81, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 7, 648, 126>, + block_fft_implementation< 9, 9, 9, 7, 324, 127>, + block_fft_implementation<27, 27, 3, 21, 648, 129>, + block_fft_implementation<27, 27, 3, 21, 324, 128>, + block_fft_implementation< 3, 3, 27, 7, 648, 130>, + block_fft_implementation< 3, 3, 27, 14, 324, 131> + >; +}; + +template<> struct block_fft_record<243, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 7, 1944, 132>, + block_fft_implementation<27, 27, 9, 7, 972, 133>, + block_fft_implementation< 9, 9, 27, 14, 1944, 134>, + block_fft_implementation< 9, 9, 27, 9, 972, 135>, + block_fft_implementation< 3, 3, 81, 3, 1944, 136>, + block_fft_implementation< 3, 3, 81, 3, 972, 137> + >; +}; + +template<> struct block_fft_record<729, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 14, 5832, 138>, + block_fft_implementation<27, 27, 27, 9, 2916, 139>, + block_fft_implementation< 9, 9, 81, 3, 5832, 140>, + block_fft_implementation< 9, 9, 81, 3, 2916, 141>, + block_fft_implementation< 3, 3, 243, 1, 5832, 142>, + 
block_fft_implementation< 3, 3, 243, 1, 2916, 143> + >; +}; + +template<> struct block_fft_record<2187, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 81, 3, 17496, 145>, + block_fft_implementation< 9, 9, 243, 1, 17496, 144>, + block_fft_implementation< 9, 9, 243, 1, 8748, 146>, + block_fft_implementation<27, 27, 81, 3, 8748, 147>, + block_fft_implementation< 3, 3, 729, 1, 17496, 148>, + block_fft_implementation< 3, 3, 729, 1, 8748, 149> + >; +}; + +template<> struct block_fft_record<6561, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 1, 52488, 151>, + block_fft_implementation<27, 27, 243, 1, 26244, 150>, + block_fft_implementation< 9, 9, 729, 1, 52488, 153>, + block_fft_implementation< 9, 9, 729, 1, 26244, 152> + >; +}; + +template<> struct block_fft_record<19683, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 729, 1, 157464, 1153>, + block_fft_implementation<27, 27, 729, 1, 78732, 1154> + >; +}; + +template<> struct block_fft_record<5, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 154> + >; +}; + +template<> struct block_fft_record<25, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 32, 0, 155>, + block_fft_implementation< 5, 5, 5, 25, 200, 156>, + block_fft_implementation< 5, 5, 5, 25, 100, 157> + >; +}; + +template<> struct block_fft_record<125, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 6, 1000, 
159>, + block_fft_implementation<25, 25, 5, 25, 500, 158>, + block_fft_implementation< 5, 5, 25, 5, 1000, 160>, + block_fft_implementation< 5, 5, 25, 5, 500, 161> + >; +}; + +template<> struct block_fft_record<625, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 163>, + block_fft_implementation<25, 25, 25, 5, 2500, 162>, + block_fft_implementation< 5, 5, 125, 1, 5000, 164>, + block_fft_implementation< 5, 5, 125, 1, 2500, 165> + >; +}; + +template<> struct block_fft_record<3125, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 1, 25000, 167>, + block_fft_implementation<25, 25, 125, 1, 12500, 166>, + block_fft_implementation< 5, 5, 625, 1, 25000, 168>, + block_fft_implementation< 5, 5, 625, 1, 12500, 169> + >; +}; + +template<> struct block_fft_record<15625, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 1, 125000, 1155>, + block_fft_implementation<25, 25, 625, 1, 62500, 170> + >; +}; + +template<> struct block_fft_record<7, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 171> + >; +}; + +template<> struct block_fft_record<49, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 9, 392, 172>, + block_fft_implementation< 7, 7, 7, 9, 196, 173> + >; +}; + +template<> struct block_fft_record<343, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 2744, 174>, + block_fft_implementation< 7, 7, 49, 5, 1372, 175> 
+ >; +}; + +template<> struct block_fft_record<2401, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 19208, 176>, + block_fft_implementation< 7, 7, 343, 1, 9604, 177> + >; +}; + +template<> struct block_fft_record<11, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 178> + >; +}; + +template<> struct block_fft_record<121, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 179>, + block_fft_implementation<11, 11, 11, 23, 484, 180> + >; +}; + +template<> struct block_fft_record<1331, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 10648, 182>, + block_fft_implementation<11, 11, 121, 1, 5324, 181> + >; +}; + +template<> struct block_fft_record<6, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 64, 0, 183> + >; +}; + +template<> struct block_fft_record<36, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 21, 288, 185>, + block_fft_implementation< 6, 6, 6, 21, 144, 184> + >; +}; + +template<> struct block_fft_record<216, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 186>, + block_fft_implementation< 6, 6, 36, 7, 864, 187> + >; +}; + +template<> struct block_fft_record<1296, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation< 6, 6, 216, 4, 10368, 188>, + block_fft_implementation< 6, 6, 216, 1, 5184, 189> + >; +}; + +template<> struct block_fft_record<10, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 190> + >; +}; + +template<> struct block_fft_record<100, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 800, 192>, + block_fft_implementation<10, 10, 10, 3, 400, 191> + >; +}; + +template<> struct block_fft_record<1000, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 8000, 193>, + block_fft_implementation<10, 10, 100, 5, 4000, 194> + >; +}; + +template<> struct block_fft_record<10000, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 80000, 195>, + block_fft_implementation<10, 10, 1000, 1, 40000, 196> + >; +}; + +template<> struct block_fft_record<12, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 197> + >; +}; + +template<> struct block_fft_record<144, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 21, 1152, 198>, + block_fft_implementation<12, 12, 12, 5, 576, 199> + >; +}; + +template<> struct block_fft_record<1728, float, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 7, 13824, 200>, + block_fft_implementation<12, 12, 144, 4, 6912, 201> + >; +}; diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..b46cd50f5f3f2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp32_inv.hpp.inc @@ -0,0 +1,601 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 202> + >; +}; + +template<> struct block_fft_record<14, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 203> + >; +}; + +template<> struct block_fft_record<15, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 204> + >; +}; + +template<> struct block_fft_record<17, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 205> + >; +}; + +template<> struct block_fft_record<18, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation<18, 18, 1, 32, 0, 206> + >; +}; + +template<> struct block_fft_record<19, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 207> + >; +}; + +template<> struct block_fft_record<20, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 208> + >; +}; + +template<> struct block_fft_record<21, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 209> + >; +}; + +template<> struct block_fft_record<22, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 210> + >; +}; + +template<> struct block_fft_record<23, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 211> + >; +}; + +template<> struct block_fft_record<24, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 212> + >; +}; + +template<> struct block_fft_record<26, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 213> + >; +}; + +template<> struct block_fft_record<28, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 214> + >; +}; + +template<> struct block_fft_record<29, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< 
+ block_fft_implementation<29, 29, 1, 32, 0, 215> + >; +}; + +template<> struct block_fft_record<30, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 216> + >; +}; + +template<> struct block_fft_record<31, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 217> + >; +}; + +template<> struct block_fft_record<2, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 128, 0, 218> + >; +}; + +template<> struct block_fft_record<4, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 64, 0, 219>, + block_fft_implementation< 2, 2, 2, 128, 16, 220>, + block_fft_implementation< 2, 2, 2, 128, 32, 221> + >; +}; + +template<> struct block_fft_record<8, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 222>, + block_fft_implementation< 4, 4, 2, 112, 32, 223>, + block_fft_implementation< 4, 4, 2, 128, 64, 224>, + block_fft_implementation< 2, 2, 4, 96, 32, 225>, + block_fft_implementation< 2, 2, 4, 128, 64, 226> + >; +}; + +template<> struct block_fft_record<16, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 227>, + block_fft_implementation< 4, 4, 4, 64, 64, 228>, + block_fft_implementation< 4, 4, 4, 80, 128, 229>, + block_fft_implementation< 8, 8, 2, 96, 64, 230>, + block_fft_implementation< 8, 8, 2, 96, 128, 231>, + block_fft_implementation< 2, 2, 8, 64, 64, 232>, + block_fft_implementation< 2, 2, 8, 80, 128, 233> + >; +}; 
+ +template<> struct block_fft_record<32, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1, 32, 0, 234>, + block_fft_implementation< 8, 8, 4, 30, 256, 235>, + block_fft_implementation< 8, 8, 4, 48, 128, 236>, + block_fft_implementation< 4, 4, 8, 32, 256, 237>, + block_fft_implementation< 4, 4, 8, 64, 128, 238>, + block_fft_implementation< 2, 2, 16, 30, 256, 239>, + block_fft_implementation<16, 16, 2, 96, 256, 240>, + block_fft_implementation< 2, 2, 16, 24, 128, 241>, + block_fft_implementation<16, 16, 2, 27, 128, 242> + >; +}; + +template<> struct block_fft_record<64, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 16, 512, 243>, + block_fft_implementation< 8, 8, 8, 16, 256, 244>, + block_fft_implementation< 4, 4, 16, 12, 512, 245>, + block_fft_implementation< 4, 4, 16, 16, 256, 246>, + block_fft_implementation<16, 16, 4, 31, 512, 247>, + block_fft_implementation<16, 16, 4, 14, 256, 248>, + block_fft_implementation<32, 32, 2, 28, 512, 249>, + block_fft_implementation< 2, 2, 32, 12, 512, 250>, + block_fft_implementation< 2, 2, 32, 12, 256, 251>, + block_fft_implementation<32, 32, 2, 30, 256, 252> + >; +}; + +template<> struct block_fft_record<128, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 8, 8, 512, 253>, + block_fft_implementation< 8, 8, 16, 8, 1024, 254>, + block_fft_implementation<16, 16, 8, 16, 1024, 255>, + block_fft_implementation< 8, 8, 16, 8, 512, 256>, + block_fft_implementation< 4, 4, 32, 4, 1024, 257>, + block_fft_implementation< 4, 4, 32, 8, 512, 258>, + block_fft_implementation<32, 32, 4, 15, 1024, 259>, + block_fft_implementation<32, 32, 4, 16, 512, 260>, + block_fft_implementation< 2, 2, 64, 5, 1024, 261>, + 
block_fft_implementation< 2, 2, 64, 5, 512, 262> + >; +}; + +template<> struct block_fft_record<256, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 16, 4, 1024, 263>, + block_fft_implementation< 8, 8, 32, 2, 2048, 264>, + block_fft_implementation< 8, 8, 32, 4, 1024, 265>, + block_fft_implementation<16, 16, 16, 4, 2048, 266>, + block_fft_implementation<32, 32, 8, 8, 2048, 267>, + block_fft_implementation< 4, 4, 64, 2, 2048, 268>, + block_fft_implementation< 4, 4, 64, 4, 1024, 269>, + block_fft_implementation<32, 32, 8, 8, 1024, 270>, + block_fft_implementation< 2, 2, 128, 3, 2048, 271>, + block_fft_implementation< 2, 2, 128, 3, 1024, 272> + >; +}; + +template<> struct block_fft_record<512, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 1, 4096, 273>, + block_fft_implementation< 8, 8, 64, 1, 2048, 274>, + block_fft_implementation<32, 32, 16, 2, 4096, 276>, + block_fft_implementation<16, 16, 32, 2, 2048, 275>, + block_fft_implementation<16, 16, 32, 2, 4096, 278>, + block_fft_implementation<32, 32, 16, 4, 2048, 277>, + block_fft_implementation< 4, 4, 128, 1, 4096, 279>, + block_fft_implementation< 4, 4, 128, 2, 2048, 280>, + block_fft_implementation< 2, 2, 256, 2, 4096, 281>, + block_fft_implementation< 2, 2, 256, 1, 2048, 282> + >; +}; + +template<> struct block_fft_record<1024, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 32, 1, 8192, 285>, + block_fft_implementation<16, 16, 64, 1, 4096, 283>, + block_fft_implementation< 8, 8, 128, 1, 4096, 286>, + block_fft_implementation<32, 32, 32, 2, 4096, 287>, + block_fft_implementation< 8, 8, 128, 1, 8192, 284>, + block_fft_implementation<16, 16, 64, 1, 8192, 288>, + block_fft_implementation< 4, 4, 256, 
1, 8192, 289>, + block_fft_implementation< 4, 4, 256, 1, 4096, 290>, + block_fft_implementation< 2, 2, 512, 1, 8192, 291>, + block_fft_implementation< 2, 2, 512, 1, 4096, 292> + >; +}; + +template<> struct block_fft_record<2048, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 128, 1, 8192, 293>, + block_fft_implementation< 8, 8, 256, 1, 8192, 295>, + block_fft_implementation< 8, 8, 256, 1, 16384, 294>, + block_fft_implementation<16, 16, 128, 1, 16384, 296>, + block_fft_implementation<32, 32, 64, 1, 16384, 297>, + block_fft_implementation<32, 32, 64, 1, 8192, 298>, + block_fft_implementation< 4, 4, 512, 1, 16384, 299>, + block_fft_implementation< 4, 4, 512, 1, 8192, 300>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 301>, + block_fft_implementation< 2, 2, 1024, 1, 8192, 302> + >; +}; + +template<> struct block_fft_record<4096, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 256, 1, 16384, 303>, + block_fft_implementation< 8, 8, 512, 1, 16384, 305>, + block_fft_implementation<16, 16, 256, 1, 32768, 306>, + block_fft_implementation<32, 32, 128, 1, 32768, 307>, + block_fft_implementation< 8, 8, 512, 1, 32768, 304>, + block_fft_implementation<32, 32, 128, 1, 16384, 308>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 309>, + block_fft_implementation< 4, 4, 1024, 1, 16384, 310> + >; +}; + +template<> struct block_fft_record<8192, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 256, 1, 65536, 312>, + block_fft_implementation<32, 32, 256, 1, 32768, 313>, + block_fft_implementation<16, 16, 512, 1, 32768, 311>, + block_fft_implementation<16, 16, 512, 1, 65536, 315>, + block_fft_implementation< 8, 8, 1024, 1, 32768, 316>, + block_fft_implementation< 8, 
8, 1024, 1, 65536, 314> + >; +}; + +template<> struct block_fft_record<16384, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 512, 1, 131072, 1156>, + block_fft_implementation<16, 16, 1024, 1, 65536, 317>, + block_fft_implementation<16, 16, 1024, 1, 131072, 1157>, + block_fft_implementation<32, 32, 512, 1, 65536, 318> + >; +}; + +template<> struct block_fft_record<32768, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<32, 32, 1024, 1, 131072, 1158> + >; +}; + +template<> struct block_fft_record<3, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 32, 0, 319> + >; +}; + +template<> struct block_fft_record<9, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 32, 0, 320>, + block_fft_implementation< 3, 3, 3, 21, 72, 321>, + block_fft_implementation< 3, 3, 3, 64, 36, 322> + >; +}; + +template<> struct block_fft_record<27, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 323>, + block_fft_implementation< 9, 9, 3, 21, 216, 325>, + block_fft_implementation< 9, 9, 3, 21, 108, 324>, + block_fft_implementation< 3, 3, 9, 28, 216, 326>, + block_fft_implementation< 3, 3, 9, 28, 108, 327> + >; +}; + +template<> struct block_fft_record<81, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 7, 648, 328>, + block_fft_implementation< 9, 9, 9, 7, 324, 329>, + block_fft_implementation<27, 27, 3, 21, 648, 331>, + block_fft_implementation<27, 27, 3, 21, 
324, 330>, + block_fft_implementation< 3, 3, 27, 7, 648, 332>, + block_fft_implementation< 3, 3, 27, 14, 324, 333> + >; +}; + +template<> struct block_fft_record<243, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 7, 1944, 334>, + block_fft_implementation<27, 27, 9, 7, 972, 335>, + block_fft_implementation< 9, 9, 27, 14, 1944, 336>, + block_fft_implementation< 9, 9, 27, 9, 972, 337>, + block_fft_implementation< 3, 3, 81, 3, 1944, 338>, + block_fft_implementation< 3, 3, 81, 3, 972, 339> + >; +}; + +template<> struct block_fft_record<729, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 14, 5832, 340>, + block_fft_implementation<27, 27, 27, 9, 2916, 341>, + block_fft_implementation< 9, 9, 81, 3, 5832, 342>, + block_fft_implementation< 9, 9, 81, 3, 2916, 343>, + block_fft_implementation< 3, 3, 243, 1, 5832, 344>, + block_fft_implementation< 3, 3, 243, 1, 2916, 345> + >; +}; + +template<> struct block_fft_record<2187, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 81, 3, 17496, 347>, + block_fft_implementation< 9, 9, 243, 1, 17496, 346>, + block_fft_implementation< 9, 9, 243, 1, 8748, 348>, + block_fft_implementation<27, 27, 81, 3, 8748, 349>, + block_fft_implementation< 3, 3, 729, 1, 17496, 350>, + block_fft_implementation< 3, 3, 729, 1, 8748, 351> + >; +}; + +template<> struct block_fft_record<6561, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 243, 1, 52488, 353>, + block_fft_implementation<27, 27, 243, 1, 26244, 352>, + block_fft_implementation< 9, 9, 729, 1, 52488, 355>, + block_fft_implementation< 9, 9, 729, 1, 26244, 354> + >; +}; + 
+template<> struct block_fft_record<19683, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 729, 1, 157464, 1159>, + block_fft_implementation<27, 27, 729, 1, 78732, 1160> + >; +}; + +template<> struct block_fft_record<5, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 64, 0, 356> + >; +}; + +template<> struct block_fft_record<25, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 32, 0, 357>, + block_fft_implementation< 5, 5, 5, 25, 200, 358>, + block_fft_implementation< 5, 5, 5, 25, 100, 359> + >; +}; + +template<> struct block_fft_record<125, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 6, 1000, 361>, + block_fft_implementation<25, 25, 5, 25, 500, 360>, + block_fft_implementation< 5, 5, 25, 5, 1000, 362>, + block_fft_implementation< 5, 5, 25, 5, 500, 363> + >; +}; + +template<> struct block_fft_record<625, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 365>, + block_fft_implementation<25, 25, 25, 5, 2500, 364>, + block_fft_implementation< 5, 5, 125, 1, 5000, 366>, + block_fft_implementation< 5, 5, 125, 1, 2500, 367> + >; +}; + +template<> struct block_fft_record<3125, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 125, 1, 25000, 369>, + block_fft_implementation<25, 25, 125, 1, 12500, 368>, + block_fft_implementation< 5, 5, 625, 1, 25000, 370>, + block_fft_implementation< 5, 5, 625, 1, 12500, 371> + >; +}; + 
+template<> struct block_fft_record<15625, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 625, 1, 125000, 1161>, + block_fft_implementation<25, 25, 625, 1, 62500, 372> + >; +}; + +template<> struct block_fft_record<7, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 373> + >; +}; + +template<> struct block_fft_record<49, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 9, 392, 374>, + block_fft_implementation< 7, 7, 7, 9, 196, 375> + >; +}; + +template<> struct block_fft_record<343, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 2744, 376>, + block_fft_implementation< 7, 7, 49, 5, 1372, 377> + >; +}; + +template<> struct block_fft_record<2401, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 19208, 378>, + block_fft_implementation< 7, 7, 343, 1, 9604, 379> + >; +}; + +template<> struct block_fft_record<11, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 380> + >; +}; + +template<> struct block_fft_record<121, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 381>, + block_fft_implementation<11, 11, 11, 23, 484, 382> + >; +}; + +template<> struct block_fft_record<1331, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<11, 11, 121, 1, 10648, 384>, + block_fft_implementation<11, 11, 121, 1, 5324, 383> + >; +}; + +template<> struct block_fft_record<6, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 64, 0, 385> + >; +}; + +template<> struct block_fft_record<36, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 21, 288, 387>, + block_fft_implementation< 6, 6, 6, 21, 144, 386> + >; +}; + +template<> struct block_fft_record<216, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 388>, + block_fft_implementation< 6, 6, 36, 7, 864, 389> + >; +}; + +template<> struct block_fft_record<1296, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 4, 10368, 390>, + block_fft_implementation< 6, 6, 216, 1, 5184, 391> + >; +}; + +template<> struct block_fft_record<10, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 392> + >; +}; + +template<> struct block_fft_record<100, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 800, 394>, + block_fft_implementation<10, 10, 10, 3, 400, 393> + >; +}; + +template<> struct block_fft_record<1000, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 8000, 395>, + block_fft_implementation<10, 10, 100, 5, 4000, 396> + >; +}; + +template<> struct block_fft_record<10000, float, 
fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 80000, 397>, + block_fft_implementation<10, 10, 1000, 1, 40000, 398> + >; +}; + +template<> struct block_fft_record<12, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 399> + >; +}; + +template<> struct block_fft_record<144, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 21, 1152, 400>, + block_fft_implementation<12, 12, 12, 5, 576, 401> + >; +}; + +template<> struct block_fft_record<1728, float, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 7, 13824, 402>, + block_fft_implementation<12, 12, 144, 4, 6912, 403> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..795b319a7fa96 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_fwd.hpp.inc @@ -0,0 +1,552 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +template<> struct block_fft_record<13, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 404> + >; +}; + +template<> struct block_fft_record<14, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 405> + >; +}; + +template<> struct block_fft_record<15, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 406> + >; +}; + +template<> struct block_fft_record<17, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 407> + >; +}; + +template<> struct block_fft_record<18, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 408> + >; +}; + +template<> struct block_fft_record<19, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 409> + >; +}; + +template<> struct block_fft_record<20, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 410> + >; +}; + +template<> struct block_fft_record<21, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 411> + >; +}; + +template<> struct block_fft_record<22, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 412> + >; 
+}; + +template<> struct block_fft_record<23, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 413> + >; +}; + +template<> struct block_fft_record<24, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 414> + >; +}; + +template<> struct block_fft_record<26, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 415> + >; +}; + +template<> struct block_fft_record<28, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 416> + >; +}; + +template<> struct block_fft_record<29, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 417> + >; +}; + +template<> struct block_fft_record<30, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 418> + >; +}; + +template<> struct block_fft_record<31, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 419> + >; +}; + +template<> struct block_fft_record<2, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 64, 0, 420> + >; +}; + +template<> struct block_fft_record<4, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 32, 0, 421>, + 
block_fft_implementation< 2, 2, 2, 128, 32, 422>, + block_fft_implementation< 2, 2, 2, 128, 64, 423> + >; +}; + +template<> struct block_fft_record<8, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 424>, + block_fft_implementation< 4, 4, 2, 128, 64, 425>, + block_fft_implementation< 2, 2, 4, 112, 64, 426>, + block_fft_implementation< 4, 4, 2, 96, 128, 427>, + block_fft_implementation< 2, 2, 4, 80, 128, 428> + >; +}; + +template<> struct block_fft_record<16, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 429>, + block_fft_implementation< 4, 4, 4, 48, 256, 430>, + block_fft_implementation< 4, 4, 4, 64, 128, 431>, + block_fft_implementation< 2, 2, 8, 48, 256, 432>, + block_fft_implementation< 8, 8, 2, 64, 256, 434>, + block_fft_implementation< 8, 8, 2, 64, 128, 433>, + block_fft_implementation< 2, 2, 8, 64, 128, 435> + >; +}; + +template<> struct block_fft_record<32, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 4, 32, 256, 436>, + block_fft_implementation< 4, 4, 8, 16, 512, 437>, + block_fft_implementation< 8, 8, 4, 30, 512, 438>, + block_fft_implementation< 4, 4, 8, 28, 256, 439>, + block_fft_implementation<16, 16, 2, 30, 512, 440>, + block_fft_implementation< 2, 2, 16, 16, 512, 441>, + block_fft_implementation< 2, 2, 16, 30, 256, 442>, + block_fft_implementation<16, 16, 2, 31, 256, 443> + >; +}; + +template<> struct block_fft_record<64, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 15, 512, 444>, + block_fft_implementation< 4, 4, 16, 8, 1024, 446>, + block_fft_implementation< 8, 8, 8, 8, 1024, 447>, + 
block_fft_implementation< 4, 4, 16, 8, 512, 445>, + block_fft_implementation<16, 16, 4, 15, 1024, 448>, + block_fft_implementation<16, 16, 4, 15, 512, 449>, + block_fft_implementation< 2, 2, 32, 19, 512, 451>, + block_fft_implementation< 2, 2, 32, 7, 1024, 450> + >; +}; + +template<> struct block_fft_record<128, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 16, 8, 1024, 452>, + block_fft_implementation<16, 16, 8, 4, 2048, 453>, + block_fft_implementation<16, 16, 8, 8, 1024, 454>, + block_fft_implementation< 4, 4, 32, 4, 2048, 455>, + block_fft_implementation< 8, 8, 16, 4, 2048, 457>, + block_fft_implementation< 4, 4, 32, 4, 1024, 456>, + block_fft_implementation< 2, 2, 64, 4, 2048, 458>, + block_fft_implementation< 2, 2, 64, 8, 1024, 459> + >; +}; + +template<> struct block_fft_record<256, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 32, 2, 2048, 460>, + block_fft_implementation<16, 16, 16, 2, 4096, 461>, + block_fft_implementation<16, 16, 16, 4, 2048, 462>, + block_fft_implementation< 4, 4, 64, 2, 4096, 463>, + block_fft_implementation< 8, 8, 32, 2, 4096, 465>, + block_fft_implementation< 4, 4, 64, 2, 2048, 464>, + block_fft_implementation< 2, 2, 128, 2, 4096, 467>, + block_fft_implementation< 2, 2, 128, 4, 2048, 466> + >; +}; + +template<> struct block_fft_record<512, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 1, 4096, 468>, + block_fft_implementation< 8, 8, 64, 1, 8192, 469>, + block_fft_implementation<16, 16, 32, 2, 8192, 470>, + block_fft_implementation<16, 16, 32, 2, 4096, 471>, + block_fft_implementation< 4, 4, 128, 1, 8192, 473>, + block_fft_implementation< 4, 4, 128, 2, 4096, 472>, + block_fft_implementation< 2, 2, 256, 1, 
8192, 475>, + block_fft_implementation< 2, 2, 256, 2, 4096, 474> + >; +}; + +template<> struct block_fft_record<1024, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 128, 1, 8192, 476>, + block_fft_implementation<16, 16, 64, 1, 8192, 477>, + block_fft_implementation<16, 16, 64, 1, 16384, 480>, + block_fft_implementation< 4, 4, 256, 1, 8192, 478>, + block_fft_implementation< 4, 4, 256, 1, 16384, 479>, + block_fft_implementation< 8, 8, 128, 1, 16384, 481>, + block_fft_implementation< 2, 2, 512, 1, 8192, 483>, + block_fft_implementation< 2, 2, 512, 1, 16384, 482> + >; +}; + +template<> struct block_fft_record<2048, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 256, 1, 16384, 484>, + block_fft_implementation<16, 16, 128, 1, 16384, 485>, + block_fft_implementation<16, 16, 128, 1, 32768, 486>, + block_fft_implementation< 8, 8, 256, 1, 32768, 487>, + block_fft_implementation< 4, 4, 512, 1, 16384, 488>, + block_fft_implementation< 4, 4, 512, 1, 32768, 489>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 490>, + block_fft_implementation< 2, 2, 1024, 1, 32768, 491> + >; +}; + +template<> struct block_fft_record<4096, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 512, 1, 32768, 492>, + block_fft_implementation<16, 16, 256, 1, 65536, 494>, + block_fft_implementation< 8, 8, 512, 1, 65536, 495>, + block_fft_implementation<16, 16, 256, 1, 32768, 493>, + block_fft_implementation< 4, 4, 1024, 1, 65536, 497>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 496> + >; +}; + +template<> struct block_fft_record<8192, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<16, 16, 512, 1, 131072, 1162>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 499>, + block_fft_implementation<16, 16, 512, 1, 65536, 498>, + block_fft_implementation< 8, 8, 1024, 1, 131072, 1163> + >; +}; + +template<> struct block_fft_record<16384, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 1, 131072, 1164> + >; +}; + +template<> struct block_fft_record<3, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 500> + >; +}; + +template<> struct block_fft_record<9, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 32, 0, 501>, + block_fft_implementation< 3, 3, 3, 64, 72, 502>, + block_fft_implementation< 3, 3, 3, 64, 144, 503> + >; +}; + +template<> struct block_fft_record<27, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 504>, + block_fft_implementation< 9, 9, 3, 21, 432, 506>, + block_fft_implementation< 9, 9, 3, 21, 216, 505>, + block_fft_implementation< 3, 3, 9, 28, 216, 507>, + block_fft_implementation< 3, 3, 9, 27, 432, 508> + >; +}; + +template<> struct block_fft_record<81, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 7, 1296, 509>, + block_fft_implementation< 9, 9, 9, 7, 648, 510>, + block_fft_implementation<27, 27, 3, 21, 1296, 512>, + block_fft_implementation<27, 27, 3, 21, 648, 511>, + block_fft_implementation< 3, 3, 27, 7, 1296, 513>, + block_fft_implementation< 3, 3, 27, 13, 648, 514> + >; +}; + +template<> struct block_fft_record<243, double, fft_type::c2c, 
fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 7, 3888, 516>, + block_fft_implementation<27, 27, 9, 7, 1944, 515>, + block_fft_implementation< 9, 9, 27, 14, 3888, 517>, + block_fft_implementation< 9, 9, 27, 1, 1944, 518>, + block_fft_implementation< 3, 3, 81, 3, 1944, 520>, + block_fft_implementation< 3, 3, 81, 3, 3888, 519> + >; +}; + +template<> struct block_fft_record<729, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 14, 5832, 523>, + block_fft_implementation<27, 27, 27, 9, 11664, 524>, + block_fft_implementation< 9, 9, 81, 3, 11664, 521>, + block_fft_implementation< 9, 9, 81, 3, 5832, 522>, + block_fft_implementation< 3, 3, 243, 1, 5832, 525>, + block_fft_implementation< 3, 3, 243, 1, 11664, 526> + >; +}; + +template<> struct block_fft_record<2187, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 34992, 528>, + block_fft_implementation< 9, 9, 243, 1, 17496, 527>, + block_fft_implementation< 3, 3, 729, 1, 17496, 529>, + block_fft_implementation< 3, 3, 729, 1, 34992, 530> + >; +}; + +template<> struct block_fft_record<6561, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 729, 1, 104976, 1165>, + block_fft_implementation< 9, 9, 729, 1, 52488, 531> + >; +}; + +template<> struct block_fft_record<5, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 32, 0, 532> + >; +}; + +template<> struct block_fft_record<25, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation<25, 25, 1, 32, 0, 533>, + block_fft_implementation< 5, 5, 5, 25, 400, 534>, + block_fft_implementation< 5, 5, 5, 12, 200, 535> + >; +}; + +template<> struct block_fft_record<125, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 6, 2000, 539>, + block_fft_implementation< 5, 5, 25, 5, 1000, 537>, + block_fft_implementation< 5, 5, 25, 5, 2000, 536>, + block_fft_implementation<25, 25, 5, 6, 1000, 538> + >; +}; + +template<> struct block_fft_record<625, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 540>, + block_fft_implementation<25, 25, 25, 5, 10000, 543>, + block_fft_implementation< 5, 5, 125, 1, 5000, 541>, + block_fft_implementation< 5, 5, 125, 1, 10000, 542> + >; +}; + +template<> struct block_fft_record<3125, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 625, 1, 25000, 545>, + block_fft_implementation< 5, 5, 625, 1, 50000, 544> + >; +}; + +template<> struct block_fft_record<7, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 546> + >; +}; + +template<> struct block_fft_record<49, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 9, 784, 548>, + block_fft_implementation< 7, 7, 7, 9, 392, 547> + >; +}; + +template<> struct block_fft_record<343, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 5488, 549>, + block_fft_implementation< 7, 7, 49, 5, 2744, 550> + >; +}; + 
+template<> struct block_fft_record<2401, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 38416, 551>, + block_fft_implementation< 7, 7, 343, 1, 19208, 552> + >; +}; + +template<> struct block_fft_record<11, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 553> + >; +}; + +template<> struct block_fft_record<121, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 555>, + block_fft_implementation<11, 11, 11, 11, 1936, 554> + >; +}; + +template<> struct block_fft_record<1331, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 10648, 556>, + block_fft_implementation<11, 11, 121, 1, 21296, 557> + >; +}; + +template<> struct block_fft_record<6, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 558> + >; +}; + +template<> struct block_fft_record<36, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 5, 576, 559>, + block_fft_implementation< 6, 6, 6, 5, 288, 560> + >; +}; + +template<> struct block_fft_record<216, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 561>, + block_fft_implementation< 6, 6, 36, 7, 3456, 562> + >; +}; + +template<> struct block_fft_record<1296, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = 
type_list< + block_fft_implementation< 6, 6, 216, 1, 20736, 564>, + block_fft_implementation< 6, 6, 216, 1, 10368, 563> + >; +}; + +template<> struct block_fft_record<10, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 565> + >; +}; + +template<> struct block_fft_record<100, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 800, 567>, + block_fft_implementation<10, 10, 10, 3, 1600, 566> + >; +}; + +template<> struct block_fft_record<1000, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 16000, 568>, + block_fft_implementation<10, 10, 100, 1, 8000, 569> + >; +}; + +template<> struct block_fft_record<10000, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 160000, 1166>, + block_fft_implementation<10, 10, 1000, 1, 80000, 1167> + >; +}; + +template<> struct block_fft_record<12, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 570> + >; +}; + +template<> struct block_fft_record<144, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 5, 1152, 571>, + block_fft_implementation<12, 12, 12, 5, 2304, 572> + >; +}; + +template<> struct block_fft_record<1728, double, fft_type::c2c, fft_direction::forward, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 5, 13824, 573>, + block_fft_implementation<12, 12, 144, 4, 27648, 574> + >; +}; diff 
--git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..1f38dbdffd977 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/800/database_fp64_inv.hpp.inc @@ -0,0 +1,552 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. +// + + +template<> struct block_fft_record<13, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<13, 13, 1, 32, 0, 575> + >; +}; + +template<> struct block_fft_record<14, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<14, 14, 1, 32, 0, 576> + >; +}; + +template<> struct block_fft_record<15, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<15, 15, 1, 32, 0, 577> + >; +}; + +template<> struct block_fft_record<17, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<17, 17, 1, 32, 0, 578> + >; +}; + +template<> struct block_fft_record<18, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using 
blobs = type_list< + block_fft_implementation<18, 18, 1, 32, 0, 579> + >; +}; + +template<> struct block_fft_record<19, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<19, 19, 1, 32, 0, 580> + >; +}; + +template<> struct block_fft_record<20, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<20, 20, 1, 32, 0, 581> + >; +}; + +template<> struct block_fft_record<21, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<21, 21, 1, 32, 0, 582> + >; +}; + +template<> struct block_fft_record<22, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<22, 22, 1, 32, 0, 583> + >; +}; + +template<> struct block_fft_record<23, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<23, 23, 1, 32, 0, 584> + >; +}; + +template<> struct block_fft_record<24, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<24, 24, 1, 32, 0, 585> + >; +}; + +template<> struct block_fft_record<26, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<26, 26, 1, 32, 0, 586> + >; +}; + +template<> struct block_fft_record<28, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<28, 28, 1, 32, 0, 587> + >; +}; + +template<> struct block_fft_record<29, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using 
blobs = type_list< + block_fft_implementation<29, 29, 1, 32, 0, 588> + >; +}; + +template<> struct block_fft_record<30, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<30, 30, 1, 32, 0, 589> + >; +}; + +template<> struct block_fft_record<31, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<31, 31, 1, 32, 0, 590> + >; +}; + +template<> struct block_fft_record<2, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 2, 2, 1, 64, 0, 591> + >; +}; + +template<> struct block_fft_record<4, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 4, 4, 1, 32, 0, 592>, + block_fft_implementation< 2, 2, 2, 128, 32, 593>, + block_fft_implementation< 2, 2, 2, 128, 64, 594> + >; +}; + +template<> struct block_fft_record<8, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 1, 32, 0, 595>, + block_fft_implementation< 4, 4, 2, 128, 64, 596>, + block_fft_implementation< 2, 2, 4, 112, 64, 597>, + block_fft_implementation< 4, 4, 2, 96, 128, 598>, + block_fft_implementation< 2, 2, 4, 80, 128, 599> + >; +}; + +template<> struct block_fft_record<16, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1, 32, 0, 600>, + block_fft_implementation< 4, 4, 4, 48, 256, 601>, + block_fft_implementation< 4, 4, 4, 64, 128, 602>, + block_fft_implementation< 2, 2, 8, 48, 256, 603>, + block_fft_implementation< 8, 8, 2, 64, 256, 605>, + block_fft_implementation< 8, 8, 2, 64, 128, 604>, + block_fft_implementation< 2, 
2, 8, 64, 128, 606> + >; +}; + +template<> struct block_fft_record<32, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 4, 32, 256, 607>, + block_fft_implementation< 4, 4, 8, 16, 512, 608>, + block_fft_implementation< 8, 8, 4, 30, 512, 609>, + block_fft_implementation< 4, 4, 8, 28, 256, 610>, + block_fft_implementation<16, 16, 2, 30, 512, 611>, + block_fft_implementation< 2, 2, 16, 16, 512, 612>, + block_fft_implementation< 2, 2, 16, 30, 256, 613>, + block_fft_implementation<16, 16, 2, 31, 256, 614> + >; +}; + +template<> struct block_fft_record<64, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 8, 15, 512, 615>, + block_fft_implementation< 4, 4, 16, 8, 1024, 617>, + block_fft_implementation< 8, 8, 8, 8, 1024, 618>, + block_fft_implementation< 4, 4, 16, 8, 512, 616>, + block_fft_implementation<16, 16, 4, 15, 1024, 619>, + block_fft_implementation<16, 16, 4, 15, 512, 620>, + block_fft_implementation< 2, 2, 32, 19, 512, 622>, + block_fft_implementation< 2, 2, 32, 7, 1024, 621> + >; +}; + +template<> struct block_fft_record<128, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 16, 8, 1024, 623>, + block_fft_implementation<16, 16, 8, 4, 2048, 624>, + block_fft_implementation<16, 16, 8, 8, 1024, 625>, + block_fft_implementation< 4, 4, 32, 4, 2048, 626>, + block_fft_implementation< 8, 8, 16, 4, 2048, 628>, + block_fft_implementation< 4, 4, 32, 4, 1024, 627>, + block_fft_implementation< 2, 2, 64, 4, 2048, 629>, + block_fft_implementation< 2, 2, 64, 8, 1024, 630> + >; +}; + +template<> struct block_fft_record<256, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + 
block_fft_implementation< 8, 8, 32, 2, 2048, 631>, + block_fft_implementation<16, 16, 16, 2, 4096, 632>, + block_fft_implementation<16, 16, 16, 4, 2048, 633>, + block_fft_implementation< 4, 4, 64, 2, 4096, 634>, + block_fft_implementation< 8, 8, 32, 2, 4096, 636>, + block_fft_implementation< 4, 4, 64, 2, 2048, 635>, + block_fft_implementation< 2, 2, 128, 2, 4096, 638>, + block_fft_implementation< 2, 2, 128, 4, 2048, 637> + >; +}; + +template<> struct block_fft_record<512, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 64, 1, 4096, 639>, + block_fft_implementation< 8, 8, 64, 1, 8192, 640>, + block_fft_implementation<16, 16, 32, 2, 8192, 641>, + block_fft_implementation<16, 16, 32, 2, 4096, 642>, + block_fft_implementation< 4, 4, 128, 1, 8192, 644>, + block_fft_implementation< 4, 4, 128, 2, 4096, 643>, + block_fft_implementation< 2, 2, 256, 1, 8192, 646>, + block_fft_implementation< 2, 2, 256, 2, 4096, 645> + >; +}; + +template<> struct block_fft_record<1024, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 128, 1, 8192, 647>, + block_fft_implementation<16, 16, 64, 1, 8192, 648>, + block_fft_implementation<16, 16, 64, 1, 16384, 651>, + block_fft_implementation< 4, 4, 256, 1, 8192, 649>, + block_fft_implementation< 4, 4, 256, 1, 16384, 650>, + block_fft_implementation< 8, 8, 128, 1, 16384, 652>, + block_fft_implementation< 2, 2, 512, 1, 8192, 654>, + block_fft_implementation< 2, 2, 512, 1, 16384, 653> + >; +}; + +template<> struct block_fft_record<2048, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 256, 1, 16384, 655>, + block_fft_implementation<16, 16, 128, 1, 16384, 656>, + block_fft_implementation<16, 16, 128, 1, 32768, 657>, + 
block_fft_implementation< 8, 8, 256, 1, 32768, 658>, + block_fft_implementation< 4, 4, 512, 1, 16384, 659>, + block_fft_implementation< 4, 4, 512, 1, 32768, 660>, + block_fft_implementation< 2, 2, 1024, 1, 16384, 661>, + block_fft_implementation< 2, 2, 1024, 1, 32768, 662> + >; +}; + +template<> struct block_fft_record<4096, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 8, 8, 512, 1, 32768, 663>, + block_fft_implementation<16, 16, 256, 1, 65536, 665>, + block_fft_implementation< 8, 8, 512, 1, 65536, 666>, + block_fft_implementation<16, 16, 256, 1, 32768, 664>, + block_fft_implementation< 4, 4, 1024, 1, 65536, 668>, + block_fft_implementation< 4, 4, 1024, 1, 32768, 667> + >; +}; + +template<> struct block_fft_record<8192, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 512, 1, 131072, 1168>, + block_fft_implementation< 8, 8, 1024, 1, 65536, 670>, + block_fft_implementation<16, 16, 512, 1, 65536, 669>, + block_fft_implementation< 8, 8, 1024, 1, 131072, 1169> + >; +}; + +template<> struct block_fft_record<16384, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<16, 16, 1024, 1, 131072, 1170> + >; +}; + +template<> struct block_fft_record<3, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 3, 3, 1, 64, 0, 671> + >; +}; + +template<> struct block_fft_record<9, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 1, 32, 0, 672>, + block_fft_implementation< 3, 3, 3, 64, 72, 673>, + block_fft_implementation< 3, 3, 3, 64, 144, 674> + >; +}; + +template<> struct 
block_fft_record<27, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 1, 32, 0, 675>, + block_fft_implementation< 9, 9, 3, 21, 432, 677>, + block_fft_implementation< 9, 9, 3, 21, 216, 676>, + block_fft_implementation< 3, 3, 9, 28, 216, 678>, + block_fft_implementation< 3, 3, 9, 27, 432, 679> + >; +}; + +template<> struct block_fft_record<81, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 9, 7, 1296, 680>, + block_fft_implementation< 9, 9, 9, 7, 648, 681>, + block_fft_implementation<27, 27, 3, 21, 1296, 683>, + block_fft_implementation<27, 27, 3, 21, 648, 682>, + block_fft_implementation< 3, 3, 27, 7, 1296, 684>, + block_fft_implementation< 3, 3, 27, 13, 648, 685> + >; +}; + +template<> struct block_fft_record<243, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 9, 7, 3888, 687>, + block_fft_implementation<27, 27, 9, 7, 1944, 686>, + block_fft_implementation< 9, 9, 27, 14, 3888, 688>, + block_fft_implementation< 9, 9, 27, 1, 1944, 689>, + block_fft_implementation< 3, 3, 81, 3, 1944, 691>, + block_fft_implementation< 3, 3, 81, 3, 3888, 690> + >; +}; + +template<> struct block_fft_record<729, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<27, 27, 27, 14, 5832, 694>, + block_fft_implementation<27, 27, 27, 9, 11664, 695>, + block_fft_implementation< 9, 9, 81, 3, 11664, 692>, + block_fft_implementation< 9, 9, 81, 3, 5832, 693>, + block_fft_implementation< 3, 3, 243, 1, 5832, 696>, + block_fft_implementation< 3, 3, 243, 1, 11664, 697> + >; +}; + +template<> struct block_fft_record<2187, double, fft_type::c2c, fft_direction::inverse, 800> { + static 
constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 243, 1, 34992, 699>, + block_fft_implementation< 9, 9, 243, 1, 17496, 698>, + block_fft_implementation< 3, 3, 729, 1, 17496, 700>, + block_fft_implementation< 3, 3, 729, 1, 34992, 701> + >; +}; + +template<> struct block_fft_record<6561, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 9, 9, 729, 1, 104976, 1171>, + block_fft_implementation< 9, 9, 729, 1, 52488, 702> + >; +}; + +template<> struct block_fft_record<5, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 1, 32, 0, 703> + >; +}; + +template<> struct block_fft_record<25, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 1, 32, 0, 704>, + block_fft_implementation< 5, 5, 5, 25, 400, 705>, + block_fft_implementation< 5, 5, 5, 12, 200, 706> + >; +}; + +template<> struct block_fft_record<125, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 5, 6, 2000, 710>, + block_fft_implementation< 5, 5, 25, 5, 1000, 708>, + block_fft_implementation< 5, 5, 25, 5, 2000, 707>, + block_fft_implementation<25, 25, 5, 6, 1000, 709> + >; +}; + +template<> struct block_fft_record<625, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<25, 25, 25, 5, 5000, 711>, + block_fft_implementation<25, 25, 25, 5, 10000, 714>, + block_fft_implementation< 5, 5, 125, 1, 5000, 712>, + block_fft_implementation< 5, 5, 125, 1, 10000, 713> + >; +}; + +template<> struct block_fft_record<3125, double, fft_type::c2c, fft_direction::inverse, 800> { + static 
constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 5, 5, 625, 1, 25000, 716>, + block_fft_implementation< 5, 5, 625, 1, 50000, 715> + >; +}; + +template<> struct block_fft_record<7, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 1, 32, 0, 717> + >; +}; + +template<> struct block_fft_record<49, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 7, 9, 784, 719>, + block_fft_implementation< 7, 7, 7, 9, 392, 718> + >; +}; + +template<> struct block_fft_record<343, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 49, 5, 5488, 720>, + block_fft_implementation< 7, 7, 49, 5, 2744, 721> + >; +}; + +template<> struct block_fft_record<2401, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 7, 7, 343, 1, 38416, 722>, + block_fft_implementation< 7, 7, 343, 1, 19208, 723> + >; +}; + +template<> struct block_fft_record<11, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 1, 32, 0, 724> + >; +}; + +template<> struct block_fft_record<121, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 11, 23, 968, 726>, + block_fft_implementation<11, 11, 11, 11, 1936, 725> + >; +}; + +template<> struct block_fft_record<1331, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<11, 11, 121, 1, 10648, 727>, + block_fft_implementation<11, 11, 121, 1, 
21296, 728> + >; +}; + +template<> struct block_fft_record<6, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 1, 32, 0, 729> + >; +}; + +template<> struct block_fft_record<36, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 6, 5, 576, 730>, + block_fft_implementation< 6, 6, 6, 5, 288, 731> + >; +}; + +template<> struct block_fft_record<216, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 36, 7, 1728, 732>, + block_fft_implementation< 6, 6, 36, 7, 3456, 733> + >; +}; + +template<> struct block_fft_record<1296, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation< 6, 6, 216, 1, 20736, 735>, + block_fft_implementation< 6, 6, 216, 1, 10368, 734> + >; +}; + +template<> struct block_fft_record<10, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 1, 32, 0, 736> + >; +}; + +template<> struct block_fft_record<100, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 10, 3, 800, 738>, + block_fft_implementation<10, 10, 10, 3, 1600, 737> + >; +}; + +template<> struct block_fft_record<1000, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<10, 10, 100, 5, 16000, 739>, + block_fft_implementation<10, 10, 100, 1, 8000, 740> + >; +}; + +template<> struct block_fft_record<10000, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; 
+ using blobs = type_list< + block_fft_implementation<10, 10, 1000, 1, 160000, 1172>, + block_fft_implementation<10, 10, 1000, 1, 80000, 1173> + >; +}; + +template<> struct block_fft_record<12, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 1, 32, 0, 741> + >; +}; + +template<> struct block_fft_record<144, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 12, 5, 1152, 742>, + block_fft_implementation<12, 12, 12, 5, 2304, 743> + >; +}; + +template<> struct block_fft_record<1728, double, fft_type::c2c, fft_direction::inverse, 800> { + static constexpr bool defined = true; + using blobs = type_list< + block_fft_implementation<12, 12, 144, 5, 13824, 744>, + block_fft_implementation<12, 12, 144, 4, 27648, 745> + >; +}; diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_fwd.hpp.inc new file mode 100644 index 0000000000000..52c083b5f452b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_fwd.hpp.inc @@ -0,0 +1,74 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp16_fwd.hpp.inc" +#include "../fft_14_fp16_fwd.hpp.inc" +#include "../fft_15_fp16_fwd.hpp.inc" +#include "../fft_17_fp16_fwd.hpp.inc" +#include "../fft_18_fp16_fwd.hpp.inc" +#include "../fft_19_fp16_fwd.hpp.inc" +#include "../fft_20_fp16_fwd.hpp.inc" +#include "../fft_21_fp16_fwd.hpp.inc" +#include "../fft_22_fp16_fwd.hpp.inc" +#include "../fft_23_fp16_fwd.hpp.inc" +#include "../fft_24_fp16_fwd.hpp.inc" +#include "../fft_26_fp16_fwd.hpp.inc" +#include "../fft_28_fp16_fwd.hpp.inc" +#include "../fft_29_fp16_fwd.hpp.inc" +#include "../fft_30_fp16_fwd.hpp.inc" +#include "../fft_31_fp16_fwd.hpp.inc" +#include "../fft_2_fp16_fwd.hpp.inc" +#include "../fft_4_fp16_fwd.hpp.inc" +#include "../fft_8_fp16_fwd.hpp.inc" +#include "../fft_16_fp16_fwd.hpp.inc" +#include "../fft_32_fp16_fwd.hpp.inc" +#include "../fft_64_fp16_fwd.hpp.inc" +#include "../fft_128_fp16_fwd.hpp.inc" +#include "../fft_256_fp16_fwd.hpp.inc" +#include "../fft_512_fp16_fwd.hpp.inc" +#include "../fft_1024_fp16_fwd.hpp.inc" +#include "../fft_2048_fp16_fwd.hpp.inc" +#include "../fft_4096_fp16_fwd.hpp.inc" +#include "../fft_8192_fp16_fwd.hpp.inc" +#include "../fft_16384_fp16_fwd.hpp.inc" +#include "../fft_32768_fp16_fwd.hpp.inc" +#include "../fft_3_fp16_fwd.hpp.inc" +#include "../fft_9_fp16_fwd.hpp.inc" +#include "../fft_27_fp16_fwd.hpp.inc" +#include "../fft_81_fp16_fwd.hpp.inc" +#include "../fft_243_fp16_fwd.hpp.inc" +#include "../fft_729_fp16_fwd.hpp.inc" +#include "../fft_2187_fp16_fwd.hpp.inc" +#include "../fft_6561_fp16_fwd.hpp.inc" +#include "../fft_19683_fp16_fwd.hpp.inc" +#include "../fft_5_fp16_fwd.hpp.inc" +#include "../fft_25_fp16_fwd.hpp.inc" +#include "../fft_125_fp16_fwd.hpp.inc" +#include "../fft_625_fp16_fwd.hpp.inc" +#include "../fft_3125_fp16_fwd.hpp.inc" +#include "../fft_15625_fp16_fwd.hpp.inc" +#include "../fft_7_fp16_fwd.hpp.inc" +#include "../fft_49_fp16_fwd.hpp.inc" +#include "../fft_343_fp16_fwd.hpp.inc" +#include "../fft_2401_fp16_fwd.hpp.inc" +#include 
"../fft_11_fp16_fwd.hpp.inc" +#include "../fft_121_fp16_fwd.hpp.inc" +#include "../fft_1331_fp16_fwd.hpp.inc" +#include "../fft_6_fp16_fwd.hpp.inc" +#include "../fft_36_fp16_fwd.hpp.inc" +#include "../fft_216_fp16_fwd.hpp.inc" +#include "../fft_1296_fp16_fwd.hpp.inc" +#include "../fft_10_fp16_fwd.hpp.inc" +#include "../fft_100_fp16_fwd.hpp.inc" +#include "../fft_1000_fp16_fwd.hpp.inc" +#include "../fft_10000_fp16_fwd.hpp.inc" +#include "../fft_12_fp16_fwd.hpp.inc" +#include "../fft_144_fp16_fwd.hpp.inc" +#include "../fft_1728_fp16_fwd.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_inv.hpp.inc new file mode 100644 index 0000000000000..e91ac31c049af --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp16_inv.hpp.inc @@ -0,0 +1,74 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp16_inv.hpp.inc" +#include "../fft_14_fp16_inv.hpp.inc" +#include "../fft_15_fp16_inv.hpp.inc" +#include "../fft_17_fp16_inv.hpp.inc" +#include "../fft_18_fp16_inv.hpp.inc" +#include "../fft_19_fp16_inv.hpp.inc" +#include "../fft_20_fp16_inv.hpp.inc" +#include "../fft_21_fp16_inv.hpp.inc" +#include "../fft_22_fp16_inv.hpp.inc" +#include "../fft_23_fp16_inv.hpp.inc" +#include "../fft_24_fp16_inv.hpp.inc" +#include "../fft_26_fp16_inv.hpp.inc" +#include "../fft_28_fp16_inv.hpp.inc" +#include "../fft_29_fp16_inv.hpp.inc" +#include "../fft_30_fp16_inv.hpp.inc" +#include "../fft_31_fp16_inv.hpp.inc" +#include "../fft_2_fp16_inv.hpp.inc" +#include "../fft_4_fp16_inv.hpp.inc" +#include "../fft_8_fp16_inv.hpp.inc" +#include "../fft_16_fp16_inv.hpp.inc" +#include "../fft_32_fp16_inv.hpp.inc" +#include "../fft_64_fp16_inv.hpp.inc" +#include "../fft_128_fp16_inv.hpp.inc" +#include "../fft_256_fp16_inv.hpp.inc" +#include "../fft_512_fp16_inv.hpp.inc" +#include "../fft_1024_fp16_inv.hpp.inc" +#include "../fft_2048_fp16_inv.hpp.inc" +#include "../fft_4096_fp16_inv.hpp.inc" +#include "../fft_8192_fp16_inv.hpp.inc" +#include "../fft_16384_fp16_inv.hpp.inc" +#include "../fft_32768_fp16_inv.hpp.inc" +#include "../fft_3_fp16_inv.hpp.inc" +#include "../fft_9_fp16_inv.hpp.inc" +#include "../fft_27_fp16_inv.hpp.inc" +#include "../fft_81_fp16_inv.hpp.inc" +#include "../fft_243_fp16_inv.hpp.inc" +#include "../fft_729_fp16_inv.hpp.inc" +#include "../fft_2187_fp16_inv.hpp.inc" +#include "../fft_6561_fp16_inv.hpp.inc" +#include "../fft_19683_fp16_inv.hpp.inc" +#include "../fft_5_fp16_inv.hpp.inc" +#include "../fft_25_fp16_inv.hpp.inc" +#include "../fft_125_fp16_inv.hpp.inc" +#include "../fft_625_fp16_inv.hpp.inc" +#include "../fft_3125_fp16_inv.hpp.inc" +#include "../fft_15625_fp16_inv.hpp.inc" +#include "../fft_7_fp16_inv.hpp.inc" +#include "../fft_49_fp16_inv.hpp.inc" +#include "../fft_343_fp16_inv.hpp.inc" +#include "../fft_2401_fp16_inv.hpp.inc" +#include 
"../fft_11_fp16_inv.hpp.inc" +#include "../fft_121_fp16_inv.hpp.inc" +#include "../fft_1331_fp16_inv.hpp.inc" +#include "../fft_6_fp16_inv.hpp.inc" +#include "../fft_36_fp16_inv.hpp.inc" +#include "../fft_216_fp16_inv.hpp.inc" +#include "../fft_1296_fp16_inv.hpp.inc" +#include "../fft_10_fp16_inv.hpp.inc" +#include "../fft_100_fp16_inv.hpp.inc" +#include "../fft_1000_fp16_inv.hpp.inc" +#include "../fft_10000_fp16_inv.hpp.inc" +#include "../fft_12_fp16_inv.hpp.inc" +#include "../fft_144_fp16_inv.hpp.inc" +#include "../fft_1728_fp16_inv.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_fwd.hpp.inc new file mode 100644 index 0000000000000..4516ed1091f6f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_fwd.hpp.inc @@ -0,0 +1,74 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp32_fwd.hpp.inc" +#include "../fft_14_fp32_fwd.hpp.inc" +#include "../fft_15_fp32_fwd.hpp.inc" +#include "../fft_17_fp32_fwd.hpp.inc" +#include "../fft_18_fp32_fwd.hpp.inc" +#include "../fft_19_fp32_fwd.hpp.inc" +#include "../fft_20_fp32_fwd.hpp.inc" +#include "../fft_21_fp32_fwd.hpp.inc" +#include "../fft_22_fp32_fwd.hpp.inc" +#include "../fft_23_fp32_fwd.hpp.inc" +#include "../fft_24_fp32_fwd.hpp.inc" +#include "../fft_26_fp32_fwd.hpp.inc" +#include "../fft_28_fp32_fwd.hpp.inc" +#include "../fft_29_fp32_fwd.hpp.inc" +#include "../fft_30_fp32_fwd.hpp.inc" +#include "../fft_31_fp32_fwd.hpp.inc" +#include "../fft_2_fp32_fwd.hpp.inc" +#include "../fft_4_fp32_fwd.hpp.inc" +#include "../fft_8_fp32_fwd.hpp.inc" +#include "../fft_16_fp32_fwd.hpp.inc" +#include "../fft_32_fp32_fwd.hpp.inc" +#include "../fft_64_fp32_fwd.hpp.inc" +#include "../fft_128_fp32_fwd.hpp.inc" +#include "../fft_256_fp32_fwd.hpp.inc" +#include "../fft_512_fp32_fwd.hpp.inc" +#include "../fft_1024_fp32_fwd.hpp.inc" +#include "../fft_2048_fp32_fwd.hpp.inc" +#include "../fft_4096_fp32_fwd.hpp.inc" +#include "../fft_8192_fp32_fwd.hpp.inc" +#include "../fft_16384_fp32_fwd.hpp.inc" +#include "../fft_32768_fp32_fwd.hpp.inc" +#include "../fft_3_fp32_fwd.hpp.inc" +#include "../fft_9_fp32_fwd.hpp.inc" +#include "../fft_27_fp32_fwd.hpp.inc" +#include "../fft_81_fp32_fwd.hpp.inc" +#include "../fft_243_fp32_fwd.hpp.inc" +#include "../fft_729_fp32_fwd.hpp.inc" +#include "../fft_2187_fp32_fwd.hpp.inc" +#include "../fft_6561_fp32_fwd.hpp.inc" +#include "../fft_19683_fp32_fwd.hpp.inc" +#include "../fft_5_fp32_fwd.hpp.inc" +#include "../fft_25_fp32_fwd.hpp.inc" +#include "../fft_125_fp32_fwd.hpp.inc" +#include "../fft_625_fp32_fwd.hpp.inc" +#include "../fft_3125_fp32_fwd.hpp.inc" +#include "../fft_15625_fp32_fwd.hpp.inc" +#include "../fft_7_fp32_fwd.hpp.inc" +#include "../fft_49_fp32_fwd.hpp.inc" +#include "../fft_343_fp32_fwd.hpp.inc" +#include "../fft_2401_fp32_fwd.hpp.inc" +#include 
"../fft_11_fp32_fwd.hpp.inc" +#include "../fft_121_fp32_fwd.hpp.inc" +#include "../fft_1331_fp32_fwd.hpp.inc" +#include "../fft_6_fp32_fwd.hpp.inc" +#include "../fft_36_fp32_fwd.hpp.inc" +#include "../fft_216_fp32_fwd.hpp.inc" +#include "../fft_1296_fp32_fwd.hpp.inc" +#include "../fft_10_fp32_fwd.hpp.inc" +#include "../fft_100_fp32_fwd.hpp.inc" +#include "../fft_1000_fp32_fwd.hpp.inc" +#include "../fft_10000_fp32_fwd.hpp.inc" +#include "../fft_12_fp32_fwd.hpp.inc" +#include "../fft_144_fp32_fwd.hpp.inc" +#include "../fft_1728_fp32_fwd.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_inv.hpp.inc new file mode 100644 index 0000000000000..75fcecbc53d3f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp32_inv.hpp.inc @@ -0,0 +1,74 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp32_inv.hpp.inc" +#include "../fft_14_fp32_inv.hpp.inc" +#include "../fft_15_fp32_inv.hpp.inc" +#include "../fft_17_fp32_inv.hpp.inc" +#include "../fft_18_fp32_inv.hpp.inc" +#include "../fft_19_fp32_inv.hpp.inc" +#include "../fft_20_fp32_inv.hpp.inc" +#include "../fft_21_fp32_inv.hpp.inc" +#include "../fft_22_fp32_inv.hpp.inc" +#include "../fft_23_fp32_inv.hpp.inc" +#include "../fft_24_fp32_inv.hpp.inc" +#include "../fft_26_fp32_inv.hpp.inc" +#include "../fft_28_fp32_inv.hpp.inc" +#include "../fft_29_fp32_inv.hpp.inc" +#include "../fft_30_fp32_inv.hpp.inc" +#include "../fft_31_fp32_inv.hpp.inc" +#include "../fft_2_fp32_inv.hpp.inc" +#include "../fft_4_fp32_inv.hpp.inc" +#include "../fft_8_fp32_inv.hpp.inc" +#include "../fft_16_fp32_inv.hpp.inc" +#include "../fft_32_fp32_inv.hpp.inc" +#include "../fft_64_fp32_inv.hpp.inc" +#include "../fft_128_fp32_inv.hpp.inc" +#include "../fft_256_fp32_inv.hpp.inc" +#include "../fft_512_fp32_inv.hpp.inc" +#include "../fft_1024_fp32_inv.hpp.inc" +#include "../fft_2048_fp32_inv.hpp.inc" +#include "../fft_4096_fp32_inv.hpp.inc" +#include "../fft_8192_fp32_inv.hpp.inc" +#include "../fft_16384_fp32_inv.hpp.inc" +#include "../fft_32768_fp32_inv.hpp.inc" +#include "../fft_3_fp32_inv.hpp.inc" +#include "../fft_9_fp32_inv.hpp.inc" +#include "../fft_27_fp32_inv.hpp.inc" +#include "../fft_81_fp32_inv.hpp.inc" +#include "../fft_243_fp32_inv.hpp.inc" +#include "../fft_729_fp32_inv.hpp.inc" +#include "../fft_2187_fp32_inv.hpp.inc" +#include "../fft_6561_fp32_inv.hpp.inc" +#include "../fft_19683_fp32_inv.hpp.inc" +#include "../fft_5_fp32_inv.hpp.inc" +#include "../fft_25_fp32_inv.hpp.inc" +#include "../fft_125_fp32_inv.hpp.inc" +#include "../fft_625_fp32_inv.hpp.inc" +#include "../fft_3125_fp32_inv.hpp.inc" +#include "../fft_15625_fp32_inv.hpp.inc" +#include "../fft_7_fp32_inv.hpp.inc" +#include "../fft_49_fp32_inv.hpp.inc" +#include "../fft_343_fp32_inv.hpp.inc" +#include "../fft_2401_fp32_inv.hpp.inc" +#include 
"../fft_11_fp32_inv.hpp.inc" +#include "../fft_121_fp32_inv.hpp.inc" +#include "../fft_1331_fp32_inv.hpp.inc" +#include "../fft_6_fp32_inv.hpp.inc" +#include "../fft_36_fp32_inv.hpp.inc" +#include "../fft_216_fp32_inv.hpp.inc" +#include "../fft_1296_fp32_inv.hpp.inc" +#include "../fft_10_fp32_inv.hpp.inc" +#include "../fft_100_fp32_inv.hpp.inc" +#include "../fft_1000_fp32_inv.hpp.inc" +#include "../fft_10000_fp32_inv.hpp.inc" +#include "../fft_12_fp32_inv.hpp.inc" +#include "../fft_144_fp32_inv.hpp.inc" +#include "../fft_1728_fp32_inv.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_fwd.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_fwd.hpp.inc new file mode 100644 index 0000000000000..94488dca28d7f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_fwd.hpp.inc @@ -0,0 +1,71 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp64_fwd.hpp.inc" +#include "../fft_14_fp64_fwd.hpp.inc" +#include "../fft_15_fp64_fwd.hpp.inc" +#include "../fft_17_fp64_fwd.hpp.inc" +#include "../fft_18_fp64_fwd.hpp.inc" +#include "../fft_19_fp64_fwd.hpp.inc" +#include "../fft_20_fp64_fwd.hpp.inc" +#include "../fft_21_fp64_fwd.hpp.inc" +#include "../fft_22_fp64_fwd.hpp.inc" +#include "../fft_23_fp64_fwd.hpp.inc" +#include "../fft_24_fp64_fwd.hpp.inc" +#include "../fft_26_fp64_fwd.hpp.inc" +#include "../fft_28_fp64_fwd.hpp.inc" +#include "../fft_29_fp64_fwd.hpp.inc" +#include "../fft_30_fp64_fwd.hpp.inc" +#include "../fft_31_fp64_fwd.hpp.inc" +#include "../fft_2_fp64_fwd.hpp.inc" +#include "../fft_4_fp64_fwd.hpp.inc" +#include "../fft_8_fp64_fwd.hpp.inc" +#include "../fft_16_fp64_fwd.hpp.inc" +#include "../fft_32_fp64_fwd.hpp.inc" +#include "../fft_64_fp64_fwd.hpp.inc" +#include "../fft_128_fp64_fwd.hpp.inc" +#include "../fft_256_fp64_fwd.hpp.inc" +#include "../fft_512_fp64_fwd.hpp.inc" +#include "../fft_1024_fp64_fwd.hpp.inc" +#include "../fft_2048_fp64_fwd.hpp.inc" +#include "../fft_4096_fp64_fwd.hpp.inc" +#include "../fft_8192_fp64_fwd.hpp.inc" +#include "../fft_16384_fp64_fwd.hpp.inc" +#include "../fft_3_fp64_fwd.hpp.inc" +#include "../fft_9_fp64_fwd.hpp.inc" +#include "../fft_27_fp64_fwd.hpp.inc" +#include "../fft_81_fp64_fwd.hpp.inc" +#include "../fft_243_fp64_fwd.hpp.inc" +#include "../fft_729_fp64_fwd.hpp.inc" +#include "../fft_2187_fp64_fwd.hpp.inc" +#include "../fft_6561_fp64_fwd.hpp.inc" +#include "../fft_5_fp64_fwd.hpp.inc" +#include "../fft_25_fp64_fwd.hpp.inc" +#include "../fft_125_fp64_fwd.hpp.inc" +#include "../fft_625_fp64_fwd.hpp.inc" +#include "../fft_3125_fp64_fwd.hpp.inc" +#include "../fft_7_fp64_fwd.hpp.inc" +#include "../fft_49_fp64_fwd.hpp.inc" +#include "../fft_343_fp64_fwd.hpp.inc" +#include "../fft_2401_fp64_fwd.hpp.inc" +#include "../fft_11_fp64_fwd.hpp.inc" +#include "../fft_121_fp64_fwd.hpp.inc" +#include "../fft_1331_fp64_fwd.hpp.inc" +#include 
"../fft_6_fp64_fwd.hpp.inc" +#include "../fft_36_fp64_fwd.hpp.inc" +#include "../fft_216_fp64_fwd.hpp.inc" +#include "../fft_1296_fp64_fwd.hpp.inc" +#include "../fft_10_fp64_fwd.hpp.inc" +#include "../fft_100_fp64_fwd.hpp.inc" +#include "../fft_1000_fp64_fwd.hpp.inc" +#include "../fft_10000_fp64_fwd.hpp.inc" +#include "../fft_12_fp64_fwd.hpp.inc" +#include "../fft_144_fp64_fwd.hpp.inc" +#include "../fft_1728_fp64_fwd.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_inv.hpp.inc b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_inv.hpp.inc new file mode 100644 index 0000000000000..20dbb0d19798a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/database/records/definitions_fp64_inv.hpp.inc @@ -0,0 +1,71 @@ +//Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +// +//NVIDIA CORPORATION and its licensors retain all intellectual property +//and proprietary rights in and to this software, related documentation +//and any modifications thereto. Any use, reproduction, disclosure or +//distribution of this software and related documentation without an express +//license agreement from NVIDIA CORPORATION is strictly prohibited. 
+// + + +#include "../fft_13_fp64_inv.hpp.inc" +#include "../fft_14_fp64_inv.hpp.inc" +#include "../fft_15_fp64_inv.hpp.inc" +#include "../fft_17_fp64_inv.hpp.inc" +#include "../fft_18_fp64_inv.hpp.inc" +#include "../fft_19_fp64_inv.hpp.inc" +#include "../fft_20_fp64_inv.hpp.inc" +#include "../fft_21_fp64_inv.hpp.inc" +#include "../fft_22_fp64_inv.hpp.inc" +#include "../fft_23_fp64_inv.hpp.inc" +#include "../fft_24_fp64_inv.hpp.inc" +#include "../fft_26_fp64_inv.hpp.inc" +#include "../fft_28_fp64_inv.hpp.inc" +#include "../fft_29_fp64_inv.hpp.inc" +#include "../fft_30_fp64_inv.hpp.inc" +#include "../fft_31_fp64_inv.hpp.inc" +#include "../fft_2_fp64_inv.hpp.inc" +#include "../fft_4_fp64_inv.hpp.inc" +#include "../fft_8_fp64_inv.hpp.inc" +#include "../fft_16_fp64_inv.hpp.inc" +#include "../fft_32_fp64_inv.hpp.inc" +#include "../fft_64_fp64_inv.hpp.inc" +#include "../fft_128_fp64_inv.hpp.inc" +#include "../fft_256_fp64_inv.hpp.inc" +#include "../fft_512_fp64_inv.hpp.inc" +#include "../fft_1024_fp64_inv.hpp.inc" +#include "../fft_2048_fp64_inv.hpp.inc" +#include "../fft_4096_fp64_inv.hpp.inc" +#include "../fft_8192_fp64_inv.hpp.inc" +#include "../fft_16384_fp64_inv.hpp.inc" +#include "../fft_3_fp64_inv.hpp.inc" +#include "../fft_9_fp64_inv.hpp.inc" +#include "../fft_27_fp64_inv.hpp.inc" +#include "../fft_81_fp64_inv.hpp.inc" +#include "../fft_243_fp64_inv.hpp.inc" +#include "../fft_729_fp64_inv.hpp.inc" +#include "../fft_2187_fp64_inv.hpp.inc" +#include "../fft_6561_fp64_inv.hpp.inc" +#include "../fft_5_fp64_inv.hpp.inc" +#include "../fft_25_fp64_inv.hpp.inc" +#include "../fft_125_fp64_inv.hpp.inc" +#include "../fft_625_fp64_inv.hpp.inc" +#include "../fft_3125_fp64_inv.hpp.inc" +#include "../fft_7_fp64_inv.hpp.inc" +#include "../fft_49_fp64_inv.hpp.inc" +#include "../fft_343_fp64_inv.hpp.inc" +#include "../fft_2401_fp64_inv.hpp.inc" +#include "../fft_11_fp64_inv.hpp.inc" +#include "../fft_121_fp64_inv.hpp.inc" +#include "../fft_1331_fp64_inv.hpp.inc" +#include 
"../fft_6_fp64_inv.hpp.inc" +#include "../fft_36_fp64_inv.hpp.inc" +#include "../fft_216_fp64_inv.hpp.inc" +#include "../fft_1296_fp64_inv.hpp.inc" +#include "../fft_10_fp64_inv.hpp.inc" +#include "../fft_100_fp64_inv.hpp.inc" +#include "../fft_1000_fp64_inv.hpp.inc" +#include "../fft_10000_fp64_inv.hpp.inc" +#include "../fft_12_fp64_inv.hpp.inc" +#include "../fft_144_fp64_inv.hpp.inc" +#include "../fft_1728_fp64_inv.hpp.inc" diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/config.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/config.hpp new file mode 100644 index 0000000000000..c5bca41b052b8 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/config.hpp @@ -0,0 +1,22 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_DETAIL_CONFIG_HPP +#define CUFFTDX_DETAIL_CONFIG_HPP + +#ifdef __CUDACC_RTC__ +# define CUFFTDX_DETAIL_USE_CUDA_STL +#endif + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# define CUFFTDX_STD ::cuda::std +#else +# define CUFFTDX_STD ::std +#endif + +#endif // CUFFTDX_DETAIL_CONFIG_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/expressions.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/expressions.hpp new file mode 100644 index 0000000000000..c0f28eebf597c --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/expressions.hpp @@ -0,0 +1,39 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_DETAIL_EXPRESSIONS_HPP +#define CUFFTDX_DETAIL_EXPRESSIONS_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +namespace cufftdx { + namespace detail { + struct expression {}; + struct operator_expression: expression {}; + struct block_operator_expression: operator_expression {}; + + struct description_expression: expression {}; + struct execution_description_expression: description_expression {}; + + template + struct constant_operator_expression: + public operator_expression, + public CUFFTDX_STD::integral_constant {}; + + template + struct constant_block_operator_expression: + public block_operator_expression, + public CUFFTDX_STD::integral_constant {}; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_DETAIL_EXPRESSIONS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_checks.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_checks.hpp new file mode 100644 index 0000000000000..7b0bfc869c48f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_checks.hpp @@ -0,0 +1,161 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_DETAIL_FFT_CHECKS_HPP +#define CUFFTDX_DETAIL_FFT_CHECKS_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include + +#include "../operators.hpp" +#include "../traits/detail/bluestein_helpers.hpp" + +namespace cufftdx { + namespace detail { + /* Upper bounds on FFT size used by the is_supported specializations below: one FP16/FP32/FP64 triple per SM macro suffix. The lower bound (2) is hard-coded in each specialization. */ +// SM70 +#define CUFFTDX_DETAIL_SM700_FP16_MAX 16384 +#define CUFFTDX_DETAIL_SM700_FP32_MAX 16384 +#define CUFFTDX_DETAIL_SM700_FP64_MAX 8192 +// SM72 +#define CUFFTDX_DETAIL_SM720_FP16_MAX 16384 +#define CUFFTDX_DETAIL_SM720_FP32_MAX 16384 +#define CUFFTDX_DETAIL_SM720_FP64_MAX 8192 +// SM75 +#define CUFFTDX_DETAIL_SM750_FP16_MAX 4096 +#define CUFFTDX_DETAIL_SM750_FP32_MAX 4096 +#define CUFFTDX_DETAIL_SM750_FP64_MAX 2048 +// SM80 +#define CUFFTDX_DETAIL_SM800_FP16_MAX 32768 +#define CUFFTDX_DETAIL_SM800_FP32_MAX 32768 +#define CUFFTDX_DETAIL_SM800_FP64_MAX 16384 +// SM86 +#define CUFFTDX_DETAIL_SM860_FP16_MAX 16384 +#define CUFFTDX_DETAIL_SM860_FP32_MAX 16384 +#define CUFFTDX_DETAIL_SM860_FP64_MAX 8192 + + /* Primary template: anything not matched by a specialization below is reported as unsupported (false_type). */ template + class is_supported: public CUFFTDX_STD::false_type + {}; + + // Max supported sizes, ignores SM (compares against the SM800 limits, the largest ones defined above) + template + class is_supported + { + static constexpr auto blue_size = detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM800_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM800_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM800_FP64_MAX) && (Size >= 2)); + /* Bluestein check: the Bluestein size is compared against the FP64 limit and does not depend on Precision. */ static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM800_FP64_MAX) && (blue_size >= 2)); + + /* True if any of the four checks above passes. */ static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + + // SM70 + template + class is_supported + { + static constexpr auto blue_size = 
detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM700_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM700_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM700_FP64_MAX) && (Size >= 2)); + static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM700_FP64_MAX) && (blue_size >= 2)); + + static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + + // SM72 + template + class is_supported + { + static constexpr auto blue_size = detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM720_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM720_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM720_FP64_MAX) && (Size >= 2)); + static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM720_FP64_MAX) && (blue_size >= 2)); + + static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + + // SM75 + template + class is_supported + { + static constexpr auto blue_size = detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM750_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM750_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= 
CUFFTDX_DETAIL_SM750_FP64_MAX) && (Size >= 2)); + static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM750_FP64_MAX) && (blue_size >= 2)); + + static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + + // SM80 + template + class is_supported + { + static constexpr auto blue_size = detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM800_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM800_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM800_FP64_MAX) && (Size >= 2)); + static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM800_FP64_MAX) && (blue_size >= 2)); + + static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + + // SM86 + template + class is_supported + { + static constexpr auto blue_size = detail::get_bluestein_size(Size); + + public: + static constexpr bool fp16_block_value = + CUFFTDX_STD::is_same<__half, Precision>::value && ((Size <= CUFFTDX_DETAIL_SM860_FP16_MAX) && (Size >= 2)); + static constexpr bool fp32_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM860_FP32_MAX) && (Size >= 2)); + static constexpr bool fp64_block_value = + CUFFTDX_STD::is_same::value && ((Size <= CUFFTDX_DETAIL_SM860_FP64_MAX) && (Size >= 2)); + static constexpr bool blue_block_value = ((blue_size <= CUFFTDX_DETAIL_SM860_FP64_MAX) && (blue_size >= 2)); + + static constexpr bool value = fp16_block_value || fp32_block_value || fp64_block_value || blue_block_value; + }; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_DETAIL_FFT_CHECKS_HPP diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description.hpp new file mode 100644 index 0000000000000..1f32907ad6393 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description.hpp @@ -0,0 +1,147 @@ +// Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DETAIL_FFT_DESCRIPTION_HPP +#define CUFFTDX_DETAIL_FFT_DESCRIPTION_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include + +#include "../operators.hpp" +#include "../traits/detail/check_and_get_trait.hpp" +#include "../traits/detail/description_traits.hpp" +#include "../traits/detail/get.hpp" +#include "../traits/detail/make_complex_type.hpp" +#include "../database/database.hpp" + +#include "expressions.hpp" + +namespace cufftdx { + namespace detail { + /* Empty wrapper type; fft_description uses it as its description_type alias. */ template + class fft_operator_wrapper: public description_expression { }; + + /* Gathers the traits of an FFT description (size, type, direction, precision, SM) and static_asserts that each operator appears at most once and that type and direction do not contradict each other. */ template + class fft_description: public description_expression + { + using description_type = fft_operator_wrapper; + + protected: + /// ---- Traits + + // Size + // * Default value: NONE + // * If there is no size, then dummy size is 2. This is required so this_fft_size_v does not break. + // * Values of has_size or is_complete should be checked before using this property. 
+ static constexpr bool has_size = has_operator::value; + using dummy_default_fft_size = Size<2>; + using this_fft_size = get_or_default_t; + static constexpr auto this_fft_size_v = this_fft_size::value; + + // Type (C2C, C2R, R2C) + // * Default value: C2C + using default_fft_type = Type; + using this_fft_type = get_or_default_t; + static constexpr auto this_fft_type_v = this_fft_type::value; + + // Direction + // * Default value: NONE + // * Direction can be deduced from FFT Type + // * If there is no direction and we can't deduce it, dummy direction is FORWARD. This is required so + // this_fft_direction_v does not break. + // * Values of has_direction or is_complete should be checked before using this property. + static constexpr bool has_direction = has_operator::value; + using deduced_fft_direction = deduce_direction_type_t; + using dummy_default_fft_direction = Direction; + using this_fft_direction = + get_or_default_t::value, + deduced_fft_direction, + dummy_default_fft_direction>::type>; + static constexpr auto this_fft_direction_v = this_fft_direction::value; + + // Precision + // * Default: float + using default_fft_precision = Precision; + using this_fft_precision = + get_or_default_t; + using this_fft_precision_t = typename this_fft_precision::type; + + // True if description is complete FFT description + static constexpr bool is_complete = is_complete_description::value; + + // SM + static constexpr bool has_sm = has_operator::value; + using dummy_default_fft_sm = SM<700>; + using this_fft_sm = get_or_default_t; + static constexpr auto this_fft_sm_v = this_fft_sm::value; + + /// ---- Constraints + + // Not-implemented-yet / disabled features + + static constexpr bool has_block_dim = has_operator::value; +#ifndef CUFFTDX_DETAIL_TEST_ENABLE_BLOCKDIM + static_assert(!has_block_dim, "BlockDim<> feature is not implemented yet"); +#endif + + // We can only have one of each option + + // Main operators + static constexpr bool has_one_direction = + 
has_at_most_one_of::value; + static constexpr bool has_one_precision = + has_at_most_one_of::value; + static constexpr bool has_one_size = has_at_most_one_of::value; + static constexpr bool has_one_sm = has_at_most_one_of::value; + static constexpr bool has_one_type = has_at_most_one_of::value; + + static_assert(has_one_direction, "Can't create FFT with two Direction<> expressions"); + static_assert(has_one_precision, "Can't create FFT with two Precision<> expressions"); + static_assert(has_one_size, "Can't create FFT with two Size<> expressions"); + static_assert(has_one_sm, "Can't create FFT with two SM<> expressions"); + static_assert(has_one_type, "Can't create FFT with two Type<> expressions"); + + // Block-only operators + static constexpr bool has_one_ept = + has_at_most_one_of::value; + static constexpr bool has_one_fpb = + has_at_most_one_of::value; + static constexpr bool has_one_block_dim = + has_at_most_one_of::value; + + static_assert(has_one_ept, "Can't create FFT with two ElementsPerThread<> expressions"); + static_assert(has_one_fpb, "Can't create FFT with two FFTsPerBlock<> expressions"); + static_assert(has_one_block_dim, "Can't create FFT with two BlockDim<> expressions"); + + // Mutually exclusive options (both flags are trivially true when no Direction<> was given) + static constexpr bool c2r_type_forward_dir = + !has_direction || !(CUFFTDX_STD::is_same>::value && + CUFFTDX_STD::is_same>::value); + static constexpr bool r2c_type_inverse_dir = + !has_direction || !(CUFFTDX_STD::is_same>::value && + CUFFTDX_STD::is_same>::value); + + static_assert(c2r_type_forward_dir, "Can't create Complex-to-Real FFT with forward direction"); + static_assert(r2c_type_inverse_dir, "Can't create Real-to-Complex FFT with inverse direction"); + + /// ---- End of Constraints + }; + + /* Specialization for an empty description: no traits and no constraints, only the description_expression base. */ template<> + class fft_description<>: public description_expression {}; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_DETAIL_FFT_DESCRIPTION_HPP diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description_fd.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description_fd.hpp new file mode 100644 index 0000000000000..40365f4846315 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_description_fd.hpp @@ -0,0 +1,20 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DETAIL_FFT_DESCRIPTION_FD_HPP +#define CUFFTDX_DETAIL_FFT_DESCRIPTION_FD_HPP + +// Forward declaration only: this header declares cufftdx::detail::fft_description without defining it +namespace cufftdx { + namespace detail { + template + class fft_description; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_DETAIL_FFT_DESCRIPTION_FD_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_execution.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_execution.hpp new file mode 100644 index 0000000000000..dcbeb7117ad0b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/fft_execution.hpp @@ -0,0 +1,1034 @@ +// Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DETAIL_FFT_EXECUTION_HPP +#define CUFFTDX_DETAIL_FFT_EXECUTION_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#ifndef __CUDACC_RTC__ +#include +#endif + +#ifndef __CUDACC_RTC__ +#include // cudaError_t +#endif + +#include + +#include "fft_checks.hpp" +#include "fft_description.hpp" +#include "workspace.hpp" + +#include "../traits/detail/ldg_type.hpp" + +#define STRINGIFY(s) XSTRINGIFY(s) +#define XSTRINGIFY(s) #s + +namespace cufftdx { + namespace detail { + /* Functor that divides `value` by Size in place. */ template + struct normalize_helper { + inline __device__ void operator()(T& value) { value /= Size; } + }; + + /* Specialization for complex<__half2>: divides .x and .y by a __half2 holding Size in both halves. */ template + struct normalize_helper, Size> { + inline __device__ void operator()(complex<__half2>& value) { + value.x /= __half2 {Size, Size}; + value.y /= __half2 {Size, Size}; + } + }; + + /* Convenience wrapper that forwards to normalize_helper. */ template + inline __device__ void normalize(T& value) { + return normalize_helper()(value); + }; + + template + inline __device__ constexpr T get_zero() { + return 0.; + } + + template<> + inline __device__ constexpr __half2 get_zero<__half2>() { + // This returns a __half2 with zeros everywhere + return __half2 {}; + } + + // C2C OR (C2R AND ept == 2) + template + inline __device__ auto preprocess(ComplexType * /* input */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2c) || + ((type_of::value == fft_type::c2r) && // + (FFT::elements_per_thread == 2))>::type { + // NOP, C2C and C2R with ept == 2 don't require any preprocess + } + + // R2C (the enable_if below checks only the FFT type, not ept) + template + inline __device__ auto preprocess(ComplexType* input) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::r2c)>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + + // Input has packed real values (this means .y 
has real values), this + // unpacks input so every complex value is {real, 0} + for (unsigned int i = ept; i > 1; i--) { + (reinterpret_cast(input))[2 * i - 1] = get_zero(); + (reinterpret_cast(input))[2 * i - 2] = (reinterpret_cast(input))[i - 1]; + } + // input[0].x is in the right position from the start, just need to set .y to zero + input[0].y = get_zero(); + } + + // C2R AND ept > 2 + template + inline __device__ auto preprocess(ComplexType* input) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2r) && (FFT::elements_per_thread > 2)>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + + // If ept is even we need to fill one value less + static constexpr unsigned number_of_values_to_fill = (ept % 2 == 0) ? (ept / 2 - 1) : (ept / 2); + for (unsigned int i = 0; i < number_of_values_to_fill; i++) { + input[ept - i - 1] = input[i + 1]; + // conjugate + input[ept - i - 1].y = -input[ept - i - 1].y; + } + } + + // C2C or R2C + template + inline __device__ auto postprocess(ComplexType* input) // + -> typename CUFFTDX_STD::enable_if<(type_of::value != fft_type::c2r)>::type { + // NOP, C2C and R2C don't require postprocess + } + + // C2R + template + inline __device__ auto postprocess(ComplexType* input) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2r)>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + + // Pack real values (scalar at index 2*i, i.e. the first scalar of complex element i, moves to scalar index i) + for (unsigned int i = 1; i < ept; i++) { + (reinterpret_cast(input))[i] = (reinterpret_cast(input))[2 * i]; + } + } + + /* Common base of the thread and block execution classes; its only constraint is the Thread/Block check below. */ template + class fft_execution: public fft_description, public execution_description_expression + { + using base_type = fft_description; + using execution_type = fft_execution; + + protected: + // Precision type + using typename base_type::this_fft_precision_t; + + /// ---- Constraints + + // We need Block or Thread to be specified exactly once 
+ static constexpr bool is_thread_execution = has_n_of<1, fft_operator::thread, execution_type>::value; + static constexpr bool is_block_execution = has_n_of<1, fft_operator::block, execution_type>::value; + /* NOTE(review): as written, the condition is also false when both flags are false (neither Thread nor Block present), although the message only mentions two operators — confirm whether that case can reach this class. */ static_assert((is_thread_execution != is_block_execution), "Can't create FFT with two execution operators"); + }; + + /* Per-thread execution: rejects block-only operators, restricts Size to [2; 32] for fp16/fp32 and [2; 16] for fp64, and exposes execute() overloads that run on the caller's `input` array. */ template + class fft_thread_execution: public fft_execution + { + using this_type = fft_thread_execution; + using base_type = fft_execution; + using typename base_type::this_fft_precision_t; + using host_workspace_type = cufftdx::detail::empty_workspace; + + template + friend typename FFT::host_workspace_type cufftdx::make_workspace(cudaError_t&) noexcept; + + protected: + // Thread can't have block-only operators + static constexpr bool has_block_only_operators = has_any_block_operator::value; + static_assert(!has_block_only_operators, "FFT for thread execution can't contain block-only operators"); + + // Thread, Size and Precision constraints + static constexpr bool valid_size_for_thread_fp16 = + !base_type::has_size || // Size<> was not defined + !CUFFTDX_STD::is_same<__half, this_fft_precision_t>::value || + ((base_type::this_fft_size_v <= 32) && (base_type::this_fft_size_v >= 2)); + static constexpr bool valid_size_for_thread_fp32 = + !base_type::has_size || // Size<> was not defined + !CUFFTDX_STD::is_same::value || + ((base_type::this_fft_size_v <= 32) && (base_type::this_fft_size_v >= 2)); + static constexpr bool valid_size_for_thread_fp64 = + !base_type::has_size || // Size<> was not defined + !CUFFTDX_STD::is_same::value || + ((base_type::this_fft_size_v <= 16) && (base_type::this_fft_size_v >= 2)); + static_assert(valid_size_for_thread_fp16, + "Thread execution in fp16 precision supports sizes in range [2; 32]"); + static_assert(valid_size_for_thread_fp32, + "Thread execution in fp32 precision supports sizes in range [2; 32]"); + static_assert(valid_size_for_thread_fp64, + "Thread execution in fp64 precision supports sizes in range [2; 
16]"); + + public: + using value_type = typename make_complex_type::cufftdx_type; + using input_type = value_type; + using output_type = value_type; + using workspace_type = typename host_workspace_type::device_handle; + static_assert(CUFFTDX_STD::is_same::value, + "Internal cuFFTDx error, thread FFT should never require non-empty workspace"); + + /* Main entry point: preprocess, run the selected implementation, then postprocess, all on `input`. */ inline __device__ void execute(value_type* input) { + static_assert(base_type::is_complete, "Can't execute, FFT description is not complete"); + + using fft_implementation_t = check_and_get_fft_implementation_t; + static constexpr auto function_id = fft_implementation_t::function_id; + + preprocess(input); + using scalar_type = typename value_type::value_type; + database::detail::cufftdx_private_function_wrapper(input, nullptr); + postprocess(input); + } + + // T - can be any type if its alignment and size are the same as those of ::value_type + template, 2>::type */> + inline __device__ auto execute(T* input) // + -> typename CUFFTDX_STD::enable_if::value && (sizeof(T) == sizeof(value_type)) && + (alignof(T) == alignof(value_type))>::type { + return execute(reinterpret_cast(input)); + } + + template + inline __device__ auto execute(T* input) // + -> typename CUFFTDX_STD::enable_if::value || (sizeof(T) != sizeof(value_type)) || + (alignof(T) != alignof(value_type))>::type { + static constexpr bool condition = + CUFFTDX_STD::is_void::value || (sizeof(T) != sizeof(value_type)) || (alignof(T) != alignof(value_type)); + /* NOTE(review): as far as visible here, `condition` repeats the enable_if predicate of this overload, so this static_assert looks unable to fire and a mismatched T would compile to an empty function — confirm whether the negated condition was intended. */ static_assert(condition, "Incorrect value type is used, try using ::value_type"); + } + + /* Overloads taking a workspace: the workspace argument is unused for thread execution. */ template + inline __device__ auto execute(T* input, workspace_type & /* workspace */) // + -> typename CUFFTDX_STD::enable_if::value && (sizeof(T) == sizeof(value_type)) && + (alignof(T) == alignof(value_type))>::type { + return execute(reinterpret_cast(input)); + } + + template + inline __device__ auto execute(T* /* input */, workspace_type & /* workspace */) // + -> typename CUFFTDX_STD::enable_if::value || (sizeof(T) 
!= sizeof(value_type)) || + (alignof(T) != alignof(value_type))>::type { + static constexpr bool condition = + CUFFTDX_STD::is_void::value || (sizeof(T) != sizeof(value_type)) || (alignof(T) != alignof(value_type)); + /* NOTE(review): same concern as for the one-argument fallback overload of execute — `condition` mirrors this overload's enable_if predicate as far as visible, so the assert appears unable to fire; confirm. */ static_assert(condition, "Incorrect value type is used, try using ::value_type"); + } + + static constexpr unsigned int elements_per_thread = check_and_get_trait::value; + static constexpr unsigned int stride = 1; + static constexpr unsigned int storage_size = elements_per_thread; + + static constexpr unsigned int implicit_type_batching = + CUFFTDX_STD::is_same::value ? 2 : 1; + }; + + template + constexpr unsigned int fft_thread_execution::elements_per_thread; + template + constexpr unsigned int fft_thread_execution::stride; + template + constexpr unsigned int fft_thread_execution::storage_size; + template + constexpr unsigned int fft_thread_execution::implicit_type_batching; + + // Registers API + + // C2C + template + inline __device__ auto block_preprocess(ComplexType* /* input */, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2c)>::type { + // NOP, C2C doesn't require any preprocess + } + + // R2C (the enable_if below checks only the FFT type, not ept) + template + inline __device__ auto block_preprocess(ComplexType* input, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::r2c)>::type { + // Same implementation as the thread-level preprocess() + preprocess(input); + } + + // C2R, EPT == SIZE + template + inline __device__ auto block_preprocess(ComplexType* input, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2r) && + (FFT::elements_per_thread == size_of::value)>::type { + // Same implementation as the thread-level preprocess() + preprocess(input); + } + + // C2R, EPT < SIZE, CT + template + inline __device__ auto block_preprocess(ComplexType* input, ComplexType* smem) // + -> typename CUFFTDX_STD::enable_if::value == fft_type::c2r) && + 
(FFT::elements_per_thread < size_of::value)>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + static constexpr auto fft_size = size_of::value; + static constexpr bool fft_size_is_even = (fft_size % 2) == 0; + + // Move to the part of shared memory for that FFT batch + ComplexType* smem_fft_batch = smem + (threadIdx.y * (fft_size / 2)); + + for (unsigned int i = 0; i < (ept / 2); i++) { + if (!(threadIdx.x == 0 && i == 0)) { + smem_fft_batch[threadIdx.x + (i * (fft_size / ept)) - 1] = input[i]; + } + } + if (!fft_size_is_even) { + constexpr unsigned int i = ept / 2; + unsigned int index = threadIdx.x + (i * (fft_size / ept)) - 1; + if (index < (fft_size / 2)) { + smem_fft_batch[index] = input[i]; + } + } + __syncthreads(); + + const unsigned int reversed_thread_id = (fft_size / ept) - threadIdx.x; + for (unsigned int i = 0; i < (ept / 2); i++) { + if (i < ((ept / 2) - ((threadIdx.x == 0) && fft_size_is_even))) { + input[ept - 1 - i] = smem_fft_batch[reversed_thread_id + (i * (fft_size / ept)) - 1]; + // conjugate + input[ept - 1 - i].y = -input[ept - 1 - i].y; + } + } + if (!fft_size_is_even) { + constexpr unsigned int i = ept / 2; + unsigned int index = reversed_thread_id + (i * (fft_size / ept)) - 1; + if (index < (fft_size / 2)) { + input[i] = smem_fft_batch[index]; + // conjugate + input[i].y = -input[i].y; + } + } + } + + // C2R, EPT < SIZE, Bluestein + template + inline __device__ auto block_preprocess(ComplexType* input, ComplexType* smem) // + -> typename CUFFTDX_STD::enable_if::value == fft_type::c2r) && + (FFT::elements_per_thread < get_bluestein_size(size_of::value))>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + static constexpr auto fft_size = size_of::value; + static constexpr auto fft_blue_size = get_bluestein_size(fft_size); + static constexpr unsigned int stride = fft_blue_size / ept; + + // Move to the 
part of shared memory for that FFT batch + ComplexType* smem_fft_batch = smem + (threadIdx.y * (fft_blue_size / 2)); + + // max_meaningful_ept limits number of loops + static constexpr unsigned int max_meaningful_ept = ((fft_size / 2 + 1) + (stride - 1)) / stride; + for (unsigned i = 0; i < max_meaningful_ept /*ept/2*/; i++) { + unsigned index = (i * stride) + threadIdx.x; + if (index < (fft_size / 2 + 1)) { + if (!(threadIdx.x == 0 && i == 0)) { + smem_fft_batch[index - 1] = input[i]; + } + } + } + __syncthreads(); + + // max_meaningful_ept_2 limits number of loops + static constexpr unsigned int max_meaningful_ept_2 = + ept > (2 * max_meaningful_ept) ? ept : (2 * max_meaningful_ept); + for (unsigned i = (max_meaningful_ept - 1); i < max_meaningful_ept_2; i++) { + unsigned int index = (i * stride) + threadIdx.x; + if ((index >= (fft_size / 2 + 1)) && (index < fft_size)) { + input[i] = smem_fft_batch[(fft_size - index) - 1]; + // conjugate + input[i].y = -input[i].y; + } + } + } + + // Shared memory API + + // C2C + template + inline __device__ auto block_preprocess_shared_api(ComplexType* /* input */, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2c)>::type { + // NOP, C2C doesn't require any preprocess + } + + // R2C (the enable_if below checks only the FFT type, not ept) + template + inline __device__ auto block_preprocess_shared_api(ComplexType* input, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::r2c)>::type { + // Same implementation as the thread-level preprocess() + preprocess(input); + } + + // C2R, EPT == SIZE + template + inline __device__ auto block_preprocess_shared_api(ComplexType* input, ComplexType * /* smem */) // + -> typename CUFFTDX_STD::enable_if<(type_of::value == fft_type::c2r) && + (FFT::elements_per_thread == size_of::value)>::type { + // Same implementation as the thread-level preprocess() + preprocess(input); + } + + // C2R, EPT < SIZE, CT + template + inline __device__ auto 
block_preprocess_shared_api(ComplexType* input, ComplexType* smem) // + -> typename CUFFTDX_STD::enable_if::value == fft_type::c2r) && + (FFT::elements_per_thread < size_of::value)>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + static constexpr auto fft_size = size_of::value; + static constexpr bool fft_size_is_even = (fft_size % 2) == 0; + + // Move to the part of shared memory for that FFT batch + ComplexType* smem_fft_batch = smem + (threadIdx.y * (fft_size / 2 + 1)); + + const unsigned int reversed_thread_id = (fft_size / ept) - threadIdx.x; + for (unsigned int i = 0; i < (ept / 2); i++) { + if (i < ((ept / 2) - ((threadIdx.x == 0) && fft_size_is_even))) { + input[ept - 1 - i] = smem_fft_batch[reversed_thread_id + (i * (fft_size / ept))]; + // conjugate + input[ept - 1 - i].y = -input[ept - 1 - i].y; + } + } + if (!fft_size_is_even) { + constexpr unsigned int i = ept / 2; + unsigned int index = reversed_thread_id + (i * (fft_size / ept)); + if (index < (fft_size / 2) + 1) { + input[i] = smem_fft_batch[index]; + // conjugate + input[i].y = -input[i].y; + } + } + } + + // C2R, EPT < SIZE, Bluestein + template + inline __device__ auto block_preprocess_shared_api(ComplexType* input, ComplexType* smem) // + -> typename CUFFTDX_STD::enable_if::value == fft_type::c2r) && + (FFT::elements_per_thread < get_bluestein_size(size_of::value))>::type { + using scalar_type = typename ComplexType::value_type; + static constexpr auto ept = FFT::elements_per_thread; + static constexpr auto fft_size = size_of::value; + static constexpr auto fft_blue_size = get_bluestein_size(fft_size); + static constexpr unsigned int stride = fft_blue_size / ept; + + // Move to the part of shared memory for that FFT batch + ComplexType* smem_fft_batch = smem + (threadIdx.y * (fft_size / 2 + 1)); + + static constexpr unsigned int first_missing_index = ((fft_size / 2 + 1) + (stride - 1)) / stride; + static constexpr unsigned 
int last_missing_index = + ept > (2 * first_missing_index) ? ept : (2 * first_missing_index); + for (unsigned i = (first_missing_index - 1); i < last_missing_index; i++) { + unsigned int index = (i * stride) + threadIdx.x; + if ((index >= (fft_size / 2 + 1)) && (index < fft_size)) { + input[i] = smem_fft_batch[(fft_size - index)]; + // conjugate + input[i].y = -input[i].y; + } + } + } + +#ifdef __CUDACC_RTC__ + /* Only compiled under __CUDACC_RTC__: exposes just the value/input/output type aliases. */ template + class fft_block_execution_partial: public fft_execution + { + using base_type = fft_execution; + using typename base_type::this_fft_precision_t; + + public: + using value_type = typename make_complex_type::cufftdx_type; + using input_type = value_type; + using output_type = value_type; + }; +#endif + + template + class fft_block_execution: public fft_execution + { + using this_type = fft_block_execution; + using base_type = fft_execution; + using typename base_type::this_fft_precision_t; + + public: + using value_type = typename make_complex_type::cufftdx_type; + + private: + template + friend typename FFT::host_workspace_type cufftdx::make_workspace(cudaError_t&) noexcept; + using host_workspace_type = typename workspace_selector::type; + + // Return false if fft's precision matches 'Precision', sm matches 'SM' + // and test can not be executed + template + static constexpr bool is_valid_size_for_block() { + using is_size_supported = is_supported; + return !base_type::has_size || // Size<> was not defined + !(base_type::this_fft_sm_v == SM && base_type::has_sm) || + !(CUFFTDX_STD::is_same::value) || + ((CUFFTDX_STD::is_same::value) && is_size_supported::fp16_block_value) || + ((CUFFTDX_STD::is_same::value) && is_size_supported::fp32_block_value) || + ((CUFFTDX_STD::is_same::value) && is_size_supported::fp64_block_value); + } + + // Check requirements for Bluestein size + // If we need Bluestein, we need to generate chirp using FP64 FFT of size next-power_of_2(2*N - 1) + template + static constexpr bool is_valid_size_for_bluestein() { + return 
!CUFFTDX_STD::is_same>::value || + !base_type::has_size || // Size<> was not defined + !(base_type::this_fft_sm_v == SM && base_type::has_sm) || + is_supported::blue_block_value; + } + + protected: + + // Block, Size and Precision constrains + + // SM70 + static constexpr bool valid_size_for_block_fp16_sm70 = is_valid_size_for_block<__half, 700>(); + static constexpr bool valid_size_for_block_fp32_sm70 = is_valid_size_for_block(); + static constexpr bool valid_size_for_block_fp64_sm70 = is_valid_size_for_block(); + static_assert(valid_size_for_block_fp16_sm70, + "Block execution in fp16 precision on SM70 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM700_FP16_MAX) "]"); + static_assert(valid_size_for_block_fp32_sm70, + "Block execution in fp32 precision on SM70 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM700_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_sm70, + "Block execution in fp64 precision on SM70 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM700_FP64_MAX) "]"); + + static constexpr bool valid_size_for_bluestein_sm70 = is_valid_size_for_bluestein<700>(); + static_assert(valid_size_for_bluestein_sm70, + "Block execution for this size is not supported"); + + // SM72 + static constexpr bool valid_size_for_block_fp16_sm72 = is_valid_size_for_block<__half, 720>(); + static constexpr bool valid_size_for_block_fp32_sm72 = is_valid_size_for_block(); + static constexpr bool valid_size_for_block_fp64_sm72 = is_valid_size_for_block(); + static_assert(valid_size_for_block_fp16_sm72, + "Block execution in fp16 precision on SM72 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM720_FP16_MAX) "]"); + static_assert(valid_size_for_block_fp32_sm72, + "Block execution in fp32 precision on SM72 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM720_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_sm72, + "Block execution in fp64 precision on SM72 supports sizes in range [2; " 
STRINGIFY(CUFFTDX_DETAIL_SM720_FP64_MAX) "]"); + + static constexpr bool valid_size_for_bluestein_sm72 = is_valid_size_for_bluestein<720>(); + static_assert(valid_size_for_bluestein_sm72, + "Block execution for this size is not supported"); + + // SM75 + static constexpr bool valid_size_for_block_fp16_sm75 = is_valid_size_for_block<__half, 750>(); + static constexpr bool valid_size_for_block_fp32_sm75 = is_valid_size_for_block(); + static constexpr bool valid_size_for_block_fp64_sm75 = is_valid_size_for_block(); + static_assert(valid_size_for_block_fp16_sm75, + "Block execution in fp16 precision on SM75 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM750_FP16_MAX) "]"); + static_assert(valid_size_for_block_fp32_sm75, + "Block execution in fp32 precision on SM75 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM750_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_sm75, + "Block execution in fp64 precision on SM75 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM750_FP64_MAX) "]"); + + static constexpr bool valid_size_for_bluestein_sm75 = is_valid_size_for_bluestein<750>(); + static_assert(valid_size_for_bluestein_sm75, + "Block execution for this size is not supported"); + + // SM80 + static constexpr bool valid_size_for_block_fp16_sm80 = is_valid_size_for_block<__half, 800>(); + static constexpr bool valid_size_for_block_fp32_sm80 = is_valid_size_for_block(); + static constexpr bool valid_size_for_block_fp64_sm80 = is_valid_size_for_block(); + static_assert(valid_size_for_block_fp16_sm80, + "Block execution in fp16 precision on SM80 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM800_FP16_MAX) "]"); + static_assert(valid_size_for_block_fp32_sm80, + "Block execution in fp32 precision on SM80 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM800_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_sm80, + "Block execution in fp64 precision on SM80 supports sizes in range [2; " 
STRINGIFY(CUFFTDX_DETAIL_SM800_FP64_MAX) "]"); + + static constexpr bool valid_size_for_bluestein_sm80 = is_valid_size_for_bluestein<800>(); + static_assert(valid_size_for_bluestein_sm80, + "Block execution for this size is not supported"); + + // SM86 + static constexpr bool valid_size_for_block_fp16_sm86 = is_valid_size_for_block<__half, 860>(); + static constexpr bool valid_size_for_block_fp32_sm86 = is_valid_size_for_block(); + static constexpr bool valid_size_for_block_fp64_sm86 = is_valid_size_for_block(); + static_assert(valid_size_for_block_fp16_sm86, + "Block execution in fp16 precision on SM86 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM860_FP16_MAX) "]"); + static_assert(valid_size_for_block_fp32_sm86, + "Block execution in fp32 precision on SM86 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM860_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_sm86, + "Block execution in fp64 precision on SM86 supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM860_FP64_MAX) "]"); + + static constexpr bool valid_size_for_bluestein_sm86 = is_valid_size_for_bluestein<860>(); + static_assert(valid_size_for_bluestein_sm86, + "Block execution for this size is not supported"); + + // MAX (No SM must be defined) + static constexpr bool valid_size_for_block_fp16_max = + !base_type::has_size || // Size<> was not defined + !(CUFFTDX_STD::is_same::value) || + is_supported<__half, base_type::this_fft_size_v, unsigned(-1)>::fp16_block_value; + static constexpr bool valid_size_for_block_fp32_max = + !base_type::has_size || // Size<> was not defined + !(CUFFTDX_STD::is_same::value) || + is_supported::fp32_block_value; + static constexpr bool valid_size_for_block_fp64_max = + !base_type::has_size || // Size<> was not defined + !(CUFFTDX_STD::is_same::value) || + is_supported::fp64_block_value; + static_assert(valid_size_for_block_fp16_max, + "Block execution in fp16 precision supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM800_FP16_MAX) 
"]"); + static_assert(valid_size_for_block_fp32_max, + "Block execution in fp32 precision supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM800_FP32_MAX) "]"); + static_assert(valid_size_for_block_fp64_max, + "Block execution in fp64 precision supports sizes in range [2; " STRINGIFY(CUFFTDX_DETAIL_SM800_FP64_MAX) "]"); + + // MAX (No SM must be defined) + // Check requirements for Bluestein size + // If we need Bluestein, we need to generate chirp using FP64 FFT of size next-power_of_2(2*N - 1) + static constexpr bool valid_size_for_bluestein_max = + !CUFFTDX_STD::is_same>::value || + !base_type::has_size || // Size<> was not defined + is_supported::blue_block_value; + static_assert(valid_size_for_bluestein_max, + "Block execution for this size is not supported"); + + public: + using input_type = value_type; + using output_type = value_type; + using workspace_type = typename host_workspace_type::device_handle; + + template + inline __device__ void execute(T* shared_memory_input) { +#if !defined(NDEBUG) && !defined(CUFFTDX_DISABLE_RUNTIME_ASSERTS) && !defined(__CUDACC_RTC__) + const bool block_dimension_x_is_correct = (blockDim.x == block_dim.x); + assert(block_dimension_x_is_correct); + const bool block_dimension_y_is_correct = (blockDim.y == block_dim.y); + assert(block_dimension_y_is_correct); +#endif + static_assert(base_type::is_complete, "Can't execute, FFT description is not complete"); + static_assert(!requires_workspace, "This FFT configuration requires workspace"); + + static constexpr bool use_bluestein = check_and_get_fft_implementation::use_bluestein; + + value_type thread_data[storage_size]; + shared_to_registers(shared_memory_input, thread_data); + + block_preprocess_shared_api(thread_data, reinterpret_cast(shared_memory_input)); + workspace_type dummy_workspace; + internal_execute(thread_data, shared_memory_input, dummy_workspace); + postprocess(thread_data); + + registers_to_shared(thread_data, shared_memory_input); + } + + template + inline 
__device__ void execute(T* shared_memory_input, workspace_type& workspace) { +#if !defined(NDEBUG) && !defined(CUFFTDX_DISABLE_RUNTIME_ASSERTS) && !defined(__CUDACC_RTC__) + const bool block_dimension_x_is_correct = (blockDim.x == block_dim.x); + assert(block_dimension_x_is_correct); + const bool block_dimension_y_is_correct = (blockDim.y == block_dim.y); + assert(block_dimension_y_is_correct); +#endif + static_assert(base_type::is_complete, "Can't execute, FFT description is not complete"); + + value_type thread_data[storage_size]; + shared_to_registers(shared_memory_input, thread_data); + + static constexpr bool use_bluestein = check_and_get_fft_implementation::use_bluestein; + block_preprocess_shared_api(thread_data, reinterpret_cast(shared_memory_input)); + internal_execute(thread_data, shared_memory_input, workspace); + postprocess(thread_data); + + registers_to_shared(thread_data, shared_memory_input); + } + + inline __device__ void execute(value_type* input, void* shared_memory) { +#if !defined(NDEBUG) && !defined(CUFFTDX_DISABLE_RUNTIME_ASSERTS) && !defined(__CUDACC_RTC__) + const bool block_dimension_x_is_correct = (blockDim.x == block_dim.x); + assert(block_dimension_x_is_correct); + const bool block_dimension_y_is_correct = (blockDim.y == block_dim.y); + assert(block_dimension_y_is_correct); +#endif + + static_assert(base_type::is_complete, "Can't execute, FFT description is not complete"); + static_assert(!requires_workspace, "This FFT configuration requires workspace"); + + static constexpr bool use_bluestein = check_and_get_fft_implementation::use_bluestein; + block_preprocess(input, reinterpret_cast(shared_memory)); + workspace_type dummy_workspace; + internal_execute(input, shared_memory, dummy_workspace); + postprocess(input); + } + + // T - can be any type if its alignment and size are the same as those of ::value_type + template, 2>::type */> + inline __device__ auto execute(T* input, void* shared_memory) // + -> typename 
CUFFTDX_STD::enable_if::value && (sizeof(T) == sizeof(value_type)) && + (alignof(T) == alignof(value_type))>::type { + return execute(reinterpret_cast(input), shared_memory); + } + + template + inline __device__ auto execute(T* /* input */, void * /* shared_memory */) // + -> typename CUFFTDX_STD::enable_if::value || (sizeof(T) != sizeof(value_type)) || + (alignof(T) != alignof(value_type))>::type { + static constexpr bool condition = + CUFFTDX_STD::is_void::value || (sizeof(T) != sizeof(value_type)) || (alignof(T) != alignof(value_type)); + static_assert(condition, "Incorrect value type is used, try using ::value_type"); + } + + inline __device__ void execute(value_type* input, void* shared_memory, workspace_type& workspace) { +#if !defined(NDEBUG) && !defined(CUFFTDX_DISABLE_RUNTIME_ASSERTS) && !defined(__CUDACC_RTC__) + const bool block_dimension_x_is_correct = (blockDim.x == block_dim.x); + assert(block_dimension_x_is_correct); + const bool block_dimension_y_is_correct = (blockDim.y == block_dim.y); + assert(block_dimension_y_is_correct); +#endif + static_assert(base_type::is_complete, "Can't execute, FFT description is not complete"); + + static constexpr bool use_bluestein = check_and_get_fft_implementation::use_bluestein; + block_preprocess(input, reinterpret_cast(shared_memory)); + internal_execute(input, shared_memory, workspace); + postprocess(input); + } + + // T - can be any type if its alignment and size are the same as those of ::value_type + template + inline __device__ auto execute(T* input, void* shared_memory, workspace_type& workspace) // + -> typename CUFFTDX_STD::enable_if::value && (sizeof(T) == sizeof(value_type)) && + (alignof(T) == alignof(value_type))>::type { + return execute(reinterpret_cast(input), shared_memory, workspace); + } + + template + inline __device__ auto execute(T* /* input */, void* /* shared_memory */, workspace_type & + /* workspace */) // + -> typename CUFFTDX_STD::enable_if::value || (sizeof(T) != sizeof(value_type)) || 
+ (alignof(T) != alignof(value_type))>::type { + static constexpr bool condition = + CUFFTDX_STD::is_void::value || (sizeof(T) != sizeof(value_type)) || (alignof(T) != alignof(value_type)); + static_assert(condition, "Incorrect value type is used, try using ::value_type"); + } + + private: + + template + inline __device__ void shared_to_registers_impl(T* shared_memory, T* thread_data) { + const unsigned int batch_offset = threadIdx.y * N; + unsigned int index = threadIdx.x; + for(unsigned int i = 0; i < elements_per_thread; i++) { + if (index < N) { + thread_data[i] = shared_memory[batch_offset + index]; + } + index += stride; + } + } + + template + inline __device__ void registers_to_shared_impl(T* thread_data, T* shared_memory) { + const unsigned int batch_offset = threadIdx.y * N; + unsigned int index = threadIdx.x; + for(unsigned int i = 0; i < elements_per_thread; i++) { + if (index < N) { + shared_memory[batch_offset + index] = thread_data[i]; + } + index += stride; + } + } + + template + inline __device__ void shared_to_registers(void* shared_memory, V* thread_data) { + if (base_type::this_fft_type_v == fft_type::c2c) { + shared_to_registers_impl( + reinterpret_cast(shared_memory), reinterpret_cast(thread_data)); + } else if (base_type::this_fft_type_v == fft_type::c2r) { + shared_to_registers_impl( + reinterpret_cast(shared_memory), reinterpret_cast(thread_data)); + } else if (base_type::this_fft_type_v == fft_type::r2c) { + shared_to_registers_impl( + reinterpret_cast(shared_memory), + reinterpret_cast(thread_data)); + } + __syncthreads(); + } + + template + inline __device__ void registers_to_shared(void* shared_memory, V* thread_data) { + __syncthreads(); + if (base_type::this_fft_type_v == fft_type::c2c) { + registers_to_shared_impl( + reinterpret_cast(shared_memory), reinterpret_cast(thread_data)); + } else if (base_type::this_fft_type_v == fft_type::c2r) { + registers_to_shared_impl( + reinterpret_cast(shared_memory), + reinterpret_cast(thread_data)); 
+ } else if (base_type::this_fft_type_v == fft_type::r2c) { + registers_to_shared_impl( + reinterpret_cast(shared_memory), reinterpret_cast(thread_data)); + } + } + + // Cooley-Tukey execution + template + inline __device__ auto internal_execute(value_type* input, + void* shared_memory, + workspace_type& /* workspace */, + const unsigned int /* fft_id */ = threadIdx.x) // + -> typename CUFFTDX_STD::enable_if::type { + using fft_implementation_t = check_and_get_fft_implementation_t; + static constexpr auto function_id = fft_implementation_t::function_id; + using scalar_type = typename value_type::value_type; + database::detail::cufftdx_private_function_wrapper(input, + shared_memory); + } + + // Bluestein execution + // Assumptions: + // * fft_id is threadIdx.x -> user must use our block dimension + template + inline __device__ auto internal_execute(value_type* input, + void* shared_memory, + workspace_type& workspace, + const unsigned int fft_id = threadIdx.x) // + -> typename CUFFTDX_STD::enable_if::type { +#if !defined(NDEBUG) + const bool workspace_valid = workspace.valid(); + assert(workspace_valid && "Workspace is invalid, check if workspace was created successfully before passing it to kernel"); +#endif + + using scalar_type = typename value_type::value_type; + using ldg_type = typename ldg_type::type; + + using fft_implementation_t = check_and_get_fft_implementation_t; + static constexpr auto function_id = fft_implementation_t::function_id; + static constexpr auto fft_blue_size = get_bluestein_size(base_type::this_fft_size_v); + + unsigned int index = fft_id; + static constexpr unsigned int stride = fft_blue_size / elements_per_thread; + + // Only first fft_size values are meaningful, others should be zero. + static constexpr unsigned int max_meaningful_ept = (base_type::this_fft_size_v + (stride - 1)) / stride; + // In this case user is expected to zero-padded input. 
+ // for (unsigned int i = 0; i < max_meaningful_ept; ++i) { + // auto v = __ldg((ldg_type*)workspace.w_time + index); + // input[i] *= *(reinterpret_cast(&v)); + // index += stride; + // } + // This zeroes the padding. + for (unsigned int i = 0; i < elements_per_thread; ++i) { + // Make swap real<->imag for inverse FFT + if (base_type::this_fft_direction_v == fft_direction::inverse) { + const auto tmp = input[i].x; + input[i].x = input[i].y; + input[i].y = tmp; + } + + if ((i * stride + fft_id) < base_type::this_fft_size_v) { + auto v = __ldg((ldg_type*)workspace.w_time + index); + // For half precision we're loading float2 in ldg, so we need + // to reinterpret in to complex<__half2> in order to have correct + // multiplication performed. + input[i] *= *(reinterpret_cast(&v)); + } else { + input[i] = value_type(0., 0.); + } + index += stride; + } + + database::detail::cufftdx_private_function_wrapper(input, + shared_memory); + index = fft_id; + for (unsigned int i = 0; i < elements_per_thread; ++i) { + auto v = __ldg((ldg_type*)workspace.w_freq + index); + input[i] *= *(reinterpret_cast(&v)); + input[i].y = -input[i].y; // conjugate + index += stride; + } + + database::detail::cufftdx_private_function_wrapper(input, + shared_memory); + + // We can limit the last loop to just max_meaningful_ept, other values are not needed. 
+ index = fft_id; + for (unsigned int i = 0; i < max_meaningful_ept; ++i) { + input[i].y = -input[i].y; // conjugate + // normalize; input[i] /= fft_blue_size; // divide by xsize, for ifft + normalize( input[i]); + auto v = __ldg((ldg_type*)workspace.w_time + index); + input[i] *= *(reinterpret_cast(&v)); + index += stride; + + // Make swap real<->imag for inverse FFT + if (base_type::this_fft_direction_v == fft_direction::inverse) { + const auto tmp = input[i].x; + input[i].x = input[i].y; + input[i].y = tmp; + } + } + } + + inline static constexpr unsigned int get_shared_memory_size() { + static_assert(base_type::is_complete, "Can't calculate shared memory, FFT description is not complete"); + using fft_implementation_t = check_and_get_fft_implementation_t; + return fft_implementation_t::shared_memory_size * ffts_per_block; + } + + inline static constexpr unsigned int get_storage_size() { + static_assert(base_type::is_complete, "Can't calculate storage_size, FFT description is not complete"); + using fft_implementation_t = check_and_get_fft_implementation_t; + return fft_implementation_t::storage_size; + } + + public: + static constexpr dim3 block_dim = check_and_get_trait::value; + static constexpr unsigned int ffts_per_block = + check_and_get_trait::value; + static constexpr unsigned int elements_per_thread = check_and_get_trait::value; + static constexpr unsigned int stride = block_dim.x; + + static constexpr unsigned int suggested_ffts_per_block = + check_and_get_trait::suggested; + + static constexpr unsigned int storage_size = get_storage_size(); + static constexpr unsigned int shared_memory_size = get_shared_memory_size(); + + static constexpr unsigned int max_threads_per_block = block_dim.x * block_dim.y * block_dim.z; + + static constexpr unsigned int implicit_type_batching = + CUFFTDX_STD::is_same::value ? 
2 : 1; + + static constexpr bool requires_workspace = check_and_get_fft_implementation::requires_workspace; + static constexpr unsigned int workspace_size = check_and_get_fft_implementation::workspace_size; + }; + + template + constexpr dim3 fft_block_execution::block_dim; + template + constexpr unsigned int fft_block_execution::ffts_per_block; + template + constexpr unsigned int fft_block_execution::elements_per_thread; + template + constexpr unsigned int fft_block_execution::stride; + template + constexpr unsigned int fft_block_execution::suggested_ffts_per_block; + template + constexpr unsigned int fft_block_execution::storage_size; + template + constexpr unsigned int fft_block_execution::shared_memory_size; + template + constexpr unsigned int fft_block_execution::max_threads_per_block; + template + constexpr unsigned int fft_block_execution::implicit_type_batching; + template + constexpr bool fft_block_execution::requires_workspace; + template + constexpr unsigned int fft_block_execution::workspace_size; + + + // [NOTE] Idea for testing static assert. + // + // Switch (macro) which changes behaviour from going to static_asserts + // to returning description_error type in operator+(). That would required more indirection + // in creating fft_description and fft_execution types. 
+ + template + struct make_description { + private: + static constexpr bool has_block_operator = + has_operator>::value; + static constexpr bool has_thread_operator = + has_operator>::value; + static constexpr bool has_execution_operator = has_block_operator || has_thread_operator; + + // Workaround (NVRTC) + // + // For NVRTC we need to utilize a in-between class called fft_block_execution_partial, otherwise + // we run into a complation error if Block() is added to description before FFT description is + // complete, example: + // + // Fails on NVRTC: + // Size<...>() + Direction<...>() + Type<...>() + Precision<...>() + Block() + SM<700>() + // Works on NVRTC: + // Size<...>() + Direction<...>() + Type<...>() + Precision<...>() + SM<700>() + Block() + // + // This workaround disables some useful diagnostics based on static_asserts. +#ifdef __CUDACC_RTC__ + using operator_wrapper_type = fft_operator_wrapper; + using fft_block_execution_type = + typename CUFFTDX_STD::conditional::value, + fft_block_execution, + fft_block_execution_partial>::type; +#else + using fft_block_execution_type = fft_block_execution; +#endif + + using description_type = fft_description; + using execution_type = typename CUFFTDX_STD::conditional>::type; + + public: + using type = typename CUFFTDX_STD::conditional::type; + }; + + template + using make_description_t = typename make_description::type; + } // namespace detail + + template + __host__ __device__ __forceinline__ auto operator+(const Operator1&, const Operator2&) // + -> typename CUFFTDX_STD::enable_if::value, + detail::make_description_t>::type { + return detail::make_description_t(); + } + + template + __host__ __device__ __forceinline__ auto operator+(const detail::fft_description&, + const Operator2&) // + -> typename CUFFTDX_STD::enable_if::value, + detail::make_description_t>::type { + return detail::make_description_t(); + } + + template + __host__ __device__ __forceinline__ auto operator+(const Operator1&, + const 
detail::fft_description&) // + -> typename CUFFTDX_STD::enable_if::value, + detail::make_description_t>::type { + return detail::make_description_t(); + } + + template + __host__ __device__ __forceinline__ auto operator+(const detail::fft_description&, + const detail::fft_description&) // + -> detail::make_description_t { + return detail::make_description_t(); + } +} // namespace cufftdx + +#undef STRINGIFY +#undef XSTRINGIFY + +#endif // CUFFTDX_DETAIL_FFT_EXECUTION_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/system_checks.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/system_checks.hpp new file mode 100644 index 0000000000000..ca90a872d5bc0 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/system_checks.hpp @@ -0,0 +1,72 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_DETAIL_SYSTEM_CHECKS_HPP +#define CUFFTDX_DETAIL_SYSTEM_CHECKS_HPP + +// We require target architecture to be Volta+ (only checking on device) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +# error "cuFFTDx requires GPU architecture sm_70 or higher"); +#endif + +#ifdef __CUDACC_RTC__ + +// NVRTC version check +# ifndef CUFFTDX_IGNORE_DEPRECATED_COMPILER +# if (__CUDACC_VER_MAJOR__ < 11) +# error cuFFTDx requires NVRTC from CUDA Toolkit 11.0 or newer +# endif +# endif // CUFFTDX_IGNORE_DEPRECATED_COMPILER + +// NVRTC compilation checks +# ifndef CUFFTDX_IGNORE_DEPRECATED_COMPILER +static_assert(__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0, + "cuFFTDx requires CUDA Runtime 11.0 or newer to work with NVRTC"); +# endif // CUFFTDX_IGNORE_DEPRECATED_COMPILER + +#else +# include + +// NVCC compilation + +static_assert(CUDART_VERSION >= 11000, "cuFFTDx requires CUDA Runtime 11.0 or newer"); +static_assert(CUDA_VERSION >= 11000, "cuFFTDx requires CUDA Toolkit 11.0 or newer"); + +# ifndef CUFFTDX_IGNORE_DEPRECATED_COMPILER + +// Test for GCC 7+ +# if defined(__GNUC__) && !defined(__clang__) +# if (__GNUC__ < 7) +# error cuFFTDx requires GCC in version 7 or newer +# endif +# endif // __GNUC__ + +// Test for clang 9+ +# ifdef __clang__ +# if (__clang_major__ < 9) +# error cuFFTDx requires clang in version 9 or newer (experimental support for clang as host compiler) +# endif +# endif // __clang__ + +// MSVC (Visual Studio) is not supported +# ifdef _MSC_VER +# error cuFFTDx does not support compilation with MSVC +# endif // _MSC_VER + +# endif // CUFFTDX_IGNORE_DEPRECATED_COMPILER + +#endif // __CUDACC_RTC__ + +// C++ Version +#ifndef CUFFTDX_IGNORE_DEPRECATED_DIALECT +# if (__cplusplus < 201703L) +# error cuFFTDx requires C++17 (or newer) enabled +# endif +#endif // CUFFTDX_IGNORE_DEPRECATED_DIALECT + +#endif // CUFFTDX_DETAIL_SYSTEM_CHECKS_HPP diff --git 
a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/workspace.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/workspace.hpp new file mode 100644 index 0000000000000..79108c85a539b --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/detail/workspace.hpp @@ -0,0 +1,379 @@ +// Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_DETAIL_WORKSPACE_HPP +#define CUFFTDX_DETAIL_WORKSPACE_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#ifndef __CUDACC_RTC__ +# include +# include +#endif // __CUDACC_RTC__ + +#ifndef __CUDACC_RTC__ +# include +#endif // __CUDACC_RTC__ + +#include "../traits.hpp" +#include "../traits/detail/bluestein_helpers.hpp" + +#ifdef CUFFTDX_DETAIL_DEBUG +# define CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error) \ + { \ + auto status = static_cast(error); \ + if (status != cudaSuccess) { \ + std::cout << cudaGetErrorString(status) << " " << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(status); \ + } \ + } +#else +# define CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error) (void)error; +#endif // CUDA_CHECK_AND_EXIT + +namespace cufftdx { + namespace detail { + struct empty_workspace { + // CUDA device handle type + struct device_handle { + __forceinline__ __device__ bool valid() const { return true; } + }; + + // __host__ functions should not be visible for NVRTC + #ifndef __CUDACC_RTC__ + static __forceinline__ __host__ empty_workspace create(cudaError_t& 
error_code) noexcept { + error_code = cudaSuccess; + return empty_workspace {}; + } + + __forceinline__ __host__ device_handle get_device_handle() const { return device_handle {}; } + __forceinline__ __host__ operator device_handle() const { return get_device_handle(); } + __forceinline__ __host__ bool valid() const { return true; } + __forceinline__ __host__ void release() { } + #endif // __CUDACC_RTC__ + }; + + struct unknown_workspace { + // CUDA device handle type + struct device_handle { + __forceinline__ __device__ bool valid() const { return false; } + }; + + // __host__ functions should not be visible for NVRTC + #ifndef __CUDACC_RTC__ + static __forceinline__ __host__ unknown_workspace create(cudaError_t& error_code) noexcept { + error_code = cudaSuccess; + return unknown_workspace {}; + } + + __forceinline__ __host__ device_handle get_device_handle() const { return device_handle {}; } + __forceinline__ __host__ operator device_handle() const { return get_device_handle(); } + __forceinline__ __host__ bool valid() const { return false; } + __forceinline__ __host__ void release() { } + #endif // __CUDACC_RTC__ + }; + + namespace __bluestein_workspace { + template + __global__ __launch_bounds__(BluesteinFFT::max_threads_per_block) void kernel(FFTComplexType* w_time, + FFTComplexType* w_freq, + // const unsigned int fft_size, + // const unsigned int fft_blue_size, + const double theta0) { + static_assert(BluesteinFFT::storage_size == BluesteinFFT::elements_per_thread, + "Workspace generation incorrectly configured"); + + using blue_complex_type = typename BluesteinFFT::value_type; + + static constexpr auto blue_fft_size = size_of::value; + + blue_complex_type thread_w_time[BluesteinFFT::storage_size]; + // Generate w_time signal and store + const unsigned int stride = blue_fft_size / BluesteinFFT::elements_per_thread; + unsigned int index = threadIdx.x; + unsigned int compute_index = index; + for (unsigned int i = 0; i < BluesteinFFT::elements_per_thread; i++) 
{ + if (index >= FFTSize) { + compute_index = blue_fft_size - index; + } + thread_w_time[i] = 0; + blue_complex_type b_n = {0, 0}; + if (compute_index < FFTSize) { + const double theta = theta0 * ((compute_index * compute_index) % (2 * FFTSize)); + b_n.x = cos(theta); + b_n.y = sin(theta); + thread_w_time[i] = b_n; + } + // Store conjugated value in w_time + b_n.y = -b_n.y; + w_time[index] = FFTComplexType(b_n); + index += stride; + compute_index = index; + } + __syncthreads(); + + // Calculate w_freq + extern __shared__ unsigned char shared_mem[]; + BluesteinFFT().execute(thread_w_time, shared_mem); + + // Store w_freq + index = threadIdx.x; + for (unsigned int i = 0; i < BluesteinFFT::elements_per_thread; i++) { + w_freq[index] = FFTComplexType(thread_w_time[i]); + index += stride; + } + }; + } // namespace __bluestein_workspace + + template + class bluestein_workspace: empty_workspace + { + using value_type = FFTValueType; + + // Replace size with next power-of-two, type with C2C, direction to forward, precision to double + using bluestein_fft_type = + typename CUFFTDX_STD::decay() + + Direction() + Precision() + + Type() + Block() + SM())>::type; + + // std::unique_ptr and std::shared_ptr undefined for NVRTC + #ifndef __CUDACC_RTC__ + template + using d_value_type_uptr = std::unique_ptr; + using d_value_type_sptr = std::shared_ptr; + #endif // __CUDACC_RTC__ + + // Types with access to private/protected members + template + friend class fft_block_execution; + + bluestein_workspace(): + w_time(nullptr), w_freq(nullptr) {} + + public: + class device_handle + { + template + friend class fft_block_execution; + + friend class bluestein_workspace; + + device_handle(value_type* w_time, value_type* w_freq): + w_time(w_time), w_freq(w_freq) {} + + __forceinline__ __device__ bool valid() const { + return (w_time != nullptr) && (w_freq != nullptr); + } + + value_type* w_time; + value_type* w_freq; + }; + + // __host__ functions should not be visible for NVRTC + #ifndef 
__CUDACC_RTC__ + /* Builds a host workspace: allocates two device buffers of fft_blue_size value_type elements each, zero-fills them, launches __bluestein_workspace::kernel on a single block with the two buffers and theta0 = M_PI / fft_size as arguments, synchronizes the device, and hands both buffers to the returned workspace. Every CUDA call stores its status in error_code; on a non-success status ws is returned early, before w_time and w_freq are assigned. */ static __forceinline__ __host__ bluestein_workspace create(cudaError_t& error_code) noexcept { + error_code = cudaSuccess; + bluestein_workspace ws; + + static constexpr auto fft_size = FFTSize; + static constexpr auto fft_blue_size = detail::get_bluestein_size(FFTSize); + static constexpr auto ws_buffers_size_bytes = fft_blue_size * sizeof(value_type); + + auto deleter = [&error_code](value_type* ptr) { error_code = cudaFree(ptr); }; /* NOTE(review): this deleter writes the cudaFree status into error_code. On the early-return paths below the uptr_* objects still own their buffers, so (assuming d_value_type_uptr runs its deleter on destruction, like std::unique_ptr) the failure code set just before the return can be replaced by the cudaFree result. Confirm whether CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT always terminates before that can be observed. */ +#ifdef CUFFTDX_DETAIL_DEBUG + auto cuda_malloc = [](size_t size_bytes, cudaError_t& error_code) { + void* ptr = nullptr; + error_code = cudaMallocManaged((void**)&ptr, size_bytes); + return (value_type*)ptr; + }; +#else + auto cuda_malloc = [](size_t size_bytes, cudaError_t& error_code) { + void* ptr = nullptr; + error_code = cudaMalloc((void**)&ptr, size_bytes); + return (value_type*)ptr; + }; +#endif + + d_value_type_uptr uptr_w_time(cuda_malloc(ws_buffers_size_bytes, error_code), + deleter); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + d_value_type_uptr uptr_w_freq(cuda_malloc(ws_buffers_size_bytes, error_code), + deleter); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + + error_code = cudaMemset((void*)uptr_w_time.get(), 0, ws_buffers_size_bytes); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + error_code = cudaMemset((void*)uptr_w_freq.get(), 0, ws_buffers_size_bytes); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + // Increase max shared memory if needed + error_code = cudaFuncSetAttribute( + (void*)__bluestein_workspace::kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + bluestein_fft_type::shared_memory_size); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + + // Needs l-value for args + auto tmp_w_time = uptr_w_time.get(); +
auto tmp_w_freq = uptr_w_freq.get(); + + double theta0 = M_PI / fft_size; + void* args[] = {&tmp_w_time, &tmp_w_freq, &theta0}; + dim3 blocks(1, 1, 1); + error_code = + cudaLaunchKernel((void*)__bluestein_workspace::kernel, + blocks, + bluestein_fft_type::block_dim, + args, + bluestein_fft_type::shared_memory_size, + 0); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + error_code = cudaGetLastError(); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + error_code = cudaDeviceSynchronize(); + if (error_code != cudaSuccess) { + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(error_code); + return ws; + } + + auto shared_deleter = [](value_type* ptr) { + // User should handle and clear all CUDA RT errors which occurred before, + // thus we expect that cudaGetLastError() returns cudaSuccess. + assert(cudaGetLastError() == cudaSuccess); + // We don't expect cudaFree to fail here + CUFFTDX_DETAIL_CUDA_CHECK_AND_EXIT(cudaFree(ptr)); + }; + ws.w_time = d_value_type_sptr(uptr_w_time.release(), shared_deleter); + ws.w_freq = d_value_type_sptr(uptr_w_freq.release(), shared_deleter); + return ws; + } + + /* Packs the raw w_time and w_freq pointers into a device_handle. */ __forceinline__ __host__ device_handle get_device_handle() const { + return device_handle {w_time.get(), w_freq.get()}; + } + __forceinline__ __host__ operator device_handle() const { return get_device_handle(); } + + /* True only when both buffer pointers are non-null. */ __forceinline__ __host__ bool valid() const { + return (w_time.get() != nullptr) && (w_freq.get() != nullptr); + } + + /* Resets both buffer handles held by this workspace. */ __forceinline__ __host__ void release() { + w_time.reset(); + w_freq.reset(); + } + #endif // __CUDACC_RTC__ + + protected: + #ifndef __CUDACC_RTC__ + d_value_type_sptr w_time; + d_value_type_sptr w_freq; + #endif // __CUDACC_RTC__ + }; + + template + struct workspace_selector; + + + template + struct workspace_selector { + using type = unknown_workspace; + }; + + namespace __workspace_selector { + + template + struct helper; + + template + struct helper
{ + using type = empty_workspace; + }; + + template + struct helper { + using type = bluestein_workspace; + }; + } // namespace __workspace_selector + + /* Picks the workspace type through __workspace_selector::helper, keyed on is_bluestein_required::value. */ template + struct workspace_selector { + using type = typename __workspace_selector::helper< + FFTSize, + FFTValueType, + FFTSM, + is_bluestein_required::value>::type; + }; + } // namespace detail + + /* Forwards FFT::requires_workspace. */ template + constexpr bool is_workspace_required() noexcept { + return FFT::requires_workspace; + } + + /* Non-throwing overload: returns FFT::host_workspace_type::create(error_code), so failures are reported through error_code. */ template + auto make_workspace(cudaError_t& error_code) noexcept // + -> typename FFT::host_workspace_type { + static_assert(!CUFFTDX_STD::is_same::value, + "Workspace type unknown. FFT description isn't complete"); + using workspace_type = typename FFT::host_workspace_type; + return workspace_type::create(error_code); + } + + /* Throwing overload: calls the overload above and throws std::runtime_error with the cudaGetErrorString text when error_code is not cudaSuccess. */ template + auto make_workspace() -> typename FFT::host_workspace_type { + cudaError_t error_code = cudaSuccess; + auto ws = make_workspace(error_code); + if (error_code != cudaSuccess) { + std::string error_what = cudaGetErrorString(error_code); + throw std::runtime_error(error_what); + } + return ws; + } +} // namespace cufftdx + +#endif // CUFFTDX_DETAIL_WORKSPACE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators.hpp new file mode 100644 index 0000000000000..fe9957b71a243 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators.hpp @@ -0,0 +1,38 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto.
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_OPERATORS_HPP__ +#define CUFFTDX_OPERATORS_HPP__ + +#include "operators/block_operators.hpp" +#include "operators/direction.hpp" +#include "operators/execution_operators.hpp" +#include "operators/precision.hpp" +#include "operators/size.hpp" +#include "operators/sm.hpp" +#include "operators/type.hpp" + +namespace cufftdx { + /* One enumerator per operator kind. The enumerator names appear to mirror the operator types provided by the headers included above; the inline comments group them into execution operators and block-only operators. */ enum class fft_operator + { + direction, + precision, + size, + sm, + type, + // execution + thread, + block, + // block-only + elements_per_thread, + ffts_per_block, + block_dim, + }; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_HPP__ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/block_operators.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/block_operators.hpp new file mode 100644 index 0000000000000..abed8c5434ffa --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/block_operators.hpp @@ -0,0 +1,33 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_BLOCK_OPERATORS_HPP +#define CUFFTDX_OPERATORS_BLOCK_OPERATORS_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* Block operators carrying a single compile-time constant through detail::constant_block_operator_expression. */ template + struct FFTsPerBlock: detail::constant_block_operator_expression {}; + + template + struct ElementsPerThread: detail::constant_block_operator_expression {}; + + /* Block dimensions operator: value packs (x, y, z) into a dim3, flat_size is the product of the three components, and rank counts the components that differ from 1. */ template + struct BlockDim: detail::block_operator_expression { + static constexpr unsigned int x = X; + static constexpr unsigned int y = Y; + static constexpr unsigned int z = Z; + static constexpr dim3 value = dim3(x, y, z); + + static constexpr unsigned int flat_size = x * y * z; + static constexpr unsigned int rank = (x != 1) + (y != 1) + (z != 1); + }; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_BLOCK_OPERATORS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/direction.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/direction.hpp new file mode 100644 index 0000000000000..bb051a4558412 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/direction.hpp @@ -0,0 +1,25 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_DIRECTION_HPP +#define CUFFTDX_OPERATORS_DIRECTION_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* Transform direction carried by the Direction operator below. */ enum class fft_direction + { + forward, + inverse + }; + + /* Direction operator; derives from detail::constant_operator_expression and adds no members of its own. */ template + struct Direction: public detail::constant_operator_expression {}; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_DIRECTION_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/execution_operators.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/execution_operators.hpp new file mode 100644 index 0000000000000..7d1daa2de1f14 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/execution_operators.hpp @@ -0,0 +1,19 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_EXECUTION_OPERATORS_HPP +#define CUFFTDX_OPERATORS_EXECUTION_OPERATORS_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* Execution operators: empty tag types deriving from detail::operator_expression, with no members of their own. */ struct Thread: detail::operator_expression {}; + struct Block: detail::operator_expression {}; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_EXECUTION_OPERATORS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/precision.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/precision.hpp new file mode 100644 index 0000000000000..ce5549f515a98 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/precision.hpp @@ -0,0 +1,39 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_PRECISION_HPP +#define CUFFTDX_OPERATORS_PRECISION_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include + +#include "../detail/expressions.hpp" + +namespace cufftdx { + namespace detail { + /* integral_constant that is true when the cv-stripped type matches one of the supported floating-point types; the static_assert in Precision names them as double, float and __half. */ template + struct is_supported_fp_type: + CUFFTDX_STD::integral_constant::type>::value || + CUFFTDX_STD::is_same::type>::value || + CUFFTDX_STD::is_same<__half, typename CUFFTDX_STD::remove_cv::type>::value> {}; + } // namespace detail + + /* Precision operator: exposes the cv-stripped type as ::type and rejects unsupported types at compile time. */ template + struct Precision: detail::operator_expression { + using type = typename CUFFTDX_STD::remove_cv::type; + static_assert(detail::is_supported_fp_type::value, "Precision must be double, float, or __half."); + }; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_PRECISION_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/size.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/size.hpp new file mode 100644 index 0000000000000..2edd2b5ca2cf3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/size.hpp @@ -0,0 +1,21 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_SIZE_HPP +#define CUFFTDX_OPERATORS_SIZE_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* FFT size operator; the static_assert rejects, at compile time, any value that is not greater than 1. */ template + struct Size: public detail::constant_operator_expression { + static_assert(Value > 1, "FFT size must be greater than 1"); + }; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_SIZE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/sm.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/sm.hpp new file mode 100644 index 0000000000000..69d427eaacd67 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/sm.hpp @@ -0,0 +1,34 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_SM_HPP +#define CUFFTDX_OPERATORS_SM_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* Architecture operator. The primary template is only declared here; this header defines it for the values 700, 720, 750, 800 and 860 through the explicit specializations below. */ template + struct SM; + + template<> + struct SM<700>: public detail::constant_operator_expression {}; + + template<> + struct SM<720>: public detail::constant_operator_expression {}; + + template<> + struct SM<750>: public detail::constant_operator_expression {}; + + template<> + struct SM<800>: public detail::constant_operator_expression {}; + + template<> + struct SM<860>: public detail::constant_operator_expression {}; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_SM_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/type.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/type.hpp new file mode 100644 index 0000000000000..c33dd4d9f2625 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/operators/type.hpp @@ -0,0 +1,26 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_OPERATORS_TYPE_HPP +#define CUFFTDX_OPERATORS_TYPE_HPP + +#include "../detail/expressions.hpp" + +namespace cufftdx { + /* Transform kind carried by the Type operator below (the names presumably stand for complex-to-complex, complex-to-real and real-to-complex - TODO confirm). */ enum class fft_type + { + c2c, + c2r, + r2c + }; + + template + struct Type: public detail::constant_operator_expression {}; +} // namespace cufftdx + +#endif // CUFFTDX_OPERATORS_TYPE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits.hpp new file mode 100644 index 0000000000000..2c2d076255c3f --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits.hpp @@ -0,0 +1,16 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_HPP__ +#define CUFFTDX_TRAITS_HPP__ + +#include "traits/fft_traits.hpp" +#include "traits/replace.hpp" +#include "traits/type_traits.hpp" + +#endif // CUFFTDX_TRAITS_HPP__ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/bluestein_helpers.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/bluestein_helpers.hpp new file mode 100644 index 0000000000000..8683ffa850f9e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/bluestein_helpers.hpp @@ -0,0 +1,72 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_DETAIL_BLUESTEIN_HELPERS_HPP +#define CUFFTDX_TRAITS_DETAIL_BLUESTEIN_HELPERS_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include "../../database/database.hpp" + +namespace cufftdx { + namespace detail { + // Return Bluestein size required to calculate FFT: twice the smallest listed bound (16, 32, ..., 8192) that is not below fft_size, or 0 when fft_size exceeds 8192 + __device__ __host__ __forceinline__ + constexpr unsigned int get_bluestein_size(const unsigned int fft_size) { + return + fft_size <= 16 ? 32 : + (fft_size <= 32 ? 64 : + (fft_size <= 64 ? 128 : + (fft_size <= 128 ? 256 : + (fft_size <= 256 ? 512 : + (fft_size <= 512 ? 1024 : + (fft_size <= 1024 ? 2048 : + (fft_size <= 2048 ? 4096 : + (fft_size <= 4096 ? 8192 : + (fft_size <= 8192 ?
16384 : 0))))))))); + } + + /* Overload that reads the size from size_of and forwards to the function above. */ template + __device__ __host__ __forceinline__ + constexpr unsigned int get_bluestein_size() { + return get_bluestein_size(size_of::value); + } + + /* Supported exactly when get_bluestein_size returns a non-zero size for FFTSize. */ template + __device__ __host__ __forceinline__ constexpr bool is_bluestein_supported() { + return get_bluestein_size(FFTSize) > 0; + } + + /* value is true exactly when the database has no block_fft_record defined for the requested configuration. */ template + struct is_bluestein_required { + private: + // Search for record in database + using block_fft_record_t = + database::detail::block_fft_record; + public: + static constexpr bool value = !block_fft_record_t::defined; + }; + + template + constexpr bool is_bluestein_required::value; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_BLUESTEIN_HELPERS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/check_and_get_trait.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/check_and_get_trait.hpp new file mode 100644 index 0000000000000..2e93000ab3945 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/check_and_get_trait.hpp @@ -0,0 +1,341 @@ +// Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited.
+ +#ifndef CUFFTDX_TRAITS_DETAIL_CHECK_AND_GET_TRAIT_HPP +#define CUFFTDX_TRAITS_DETAIL_CHECK_AND_GET_TRAIT_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +# include +#else +# include +# include +#endif + +#include + +#include "../../operators.hpp" +#include "../../detail/expressions.hpp" +#include "../../database/database.hpp" + +#include "../fft_traits.hpp" +#include "../replace.hpp" + +#include "get.hpp" +#include "description_traits.hpp" +#include "bluestein_helpers.hpp" + +namespace cufftdx { + namespace detail { + template + class check_and_get_trait; + + namespace __get_block_config { + template + class helper_block_ct + { + // Using SIZE, PRECISION, ARCHITECTURE, we search for optimal EPT and FFTs per block (FPB, BPB) + static constexpr unsigned int this_fft_size_v = size_of::value; + using this_fft_precision_t = precision_of_t; + static constexpr auto this_fft_direction_v = direction_of::value; + static constexpr auto this_fft_type_v = type_of::value; + static constexpr auto this_fft_sm_v = sm_of::value; + + // Select block_fft implementation + // * database::detail::block_fft_record has all possible implementations in type_list named "blobs" + // * first implementation from blobs is considered default/suggested/optimal + using block_fft_record_t = database::detail::block_fft_record; + // Checks if record for requested (size, precision, type, direction, arch) exists + static_assert(block_fft_record_t::defined, "This FFT configuration is not supported"); + + // Get default (optimal) implementation + using suggested_block_config_t = + typename database::detail::type_list_element<0, typename block_fft_record_t::blobs>::type; + + // Get suggested EPT and FPB + using suggested_ept = ElementsPerThread; + using suggested_fpb = FFTsPerBlock; + + // Get selected EPT (suggested or provided by user) + using this_fft_elements_per_thread = + get_or_default_t; + static constexpr bool has_ept = has_operator::value; + static constexpr auto this_fft_ept_v = 
this_fft_elements_per_thread::value; + + #ifdef CUFFTDX_DETAIL_BLOCK_FFT_ENFORCE_X_TRANSPOSITION + static constexpr unsigned int this_fft_trp_option_v = 1; + #elif defined(CUFFTDX_DETAIL_BLOCK_FFT_ENFORCE_XY_TRANSPOSITION) + static constexpr unsigned int this_fft_trp_option_v = 2; + #else + static constexpr unsigned int this_fft_trp_option_v = 0; + #endif + + // Search for implementation + using this_fft_block_fft_implementation = + typename database::detail::search_by_ept::type; + // Checks if implementation for requested EPT exists within selected record + static_assert(!CUFFTDX_STD::is_void::value, + "This FFT configuration is not supported"); + + // suggested_fpb is not used as default fpb + // For fp16 FPB must be even, each thread processes two half complex numbers + static constexpr auto default_fpb_v = CUFFTDX_STD::is_same::value ? 2 : 1; + using default_fpb = FFTsPerBlock; + using this_ffts_per_block = get_or_default_t; + static constexpr auto this_ffts_per_block_v = this_ffts_per_block::value; + static_assert(!CUFFTDX_STD::is_same::value || (this_ffts_per_block_v % 2 == 0), + "FP16 block FFT can only process even number of FFTs per block"); + + static constexpr unsigned int default_block_dim_v = + this_fft_size_v / this_fft_elements_per_thread::value; + // Default block dimension (X = SIZE/EPT, Y = FFTs Per Block, Z = 1) + using default_block_dim = + BlockDim::value ? 
this_ffts_per_block::value / 2 + : this_ffts_per_block::value)>; + using this_block_dim = get_or_default_t; + + public: + // Searches database for optimal default + using elements_per_thread = this_fft_elements_per_thread; + // Default calculated based on size and ept + using block_dim = this_block_dim; + // Defaults to 1 + using ffts_per_block = this_ffts_per_block; + // FFT implementation type + using block_fft_implementation = this_fft_block_fft_implementation; + + // Suggested values of EPT and FPB + using suggested_elements_per_thread = suggested_ept; + // If user set EPT, suggested FPB is for that EPT; otherwise it's for default EPT + using suggested_ffts_per_block = + typename CUFFTDX_STD::conditional, + suggested_fpb>::type; + + // + static constexpr bool use_bluestein = false; + static constexpr unsigned int workspace_size = 0; + + private: + // Checks + + // Must specify EPT if user specified BlockDim + static constexpr bool has_block_dim = has_operator::value; + static_assert(!has_block_dim || (has_block_dim && has_ept), + "If BlockDim<> was specifided, user must also specify ElementsPerThread<>"); + + // SIZE % EPT == 0 + static constexpr bool ept_is_factor_of_size = (this_fft_size_v % elements_per_thread::value) == 0; + static_assert(ept_is_factor_of_size, "Elements per thread must be a factor of FFT size"); + + // SIZE * FFTS_PER_BLOCK <= EPT * FLAT_BLOCK_SIZE + static constexpr auto max_elements_processed_per_block = + block_dim::flat_size * elements_per_thread::value * + (CUFFTDX_STD::is_same::value ? 
2 : 1); + static constexpr auto elements_to_process_per_block = this_fft_size_v * ffts_per_block::value; + static_assert(elements_to_process_per_block <= max_elements_processed_per_block, + "Not enough threads in block to calculate FFT, you need to increase BlockDim<> or " + "ElementsPerThread<>"); + + // FLAT_BLOCK_SIZE % FFTS_PER_BLOCK == 0 + static constexpr bool fpb_is_factor_of_flat_block_size = + (block_dim::flat_size % ffts_per_block::value) == 0; + static_assert(ept_is_factor_of_size || has_block_dim, + "Elements per thread must be a factor of FFT size"); + }; + + template + class helper_block_bluestein + { + static constexpr unsigned int this_fft_size_v = size_of::value; + static constexpr unsigned int this_fft_blue_size_v = get_bluestein_size(this_fft_size_v); + + // Create description with size changed to this_fft_blue_size_v + using bluestein_description = typename detail::replace_force, + Size, + Direction>::type; + using bluestein_block_helper_t = helper_block_ct; + + public: + using elements_per_thread = typename bluestein_block_helper_t::elements_per_thread; + using block_dim = typename bluestein_block_helper_t::block_dim; + using ffts_per_block = typename bluestein_block_helper_t::ffts_per_block; + using block_fft_implementation = typename bluestein_block_helper_t::block_fft_implementation; + + using suggested_elements_per_thread = typename bluestein_block_helper_t::suggested_elements_per_thread; + using suggested_ffts_per_block = typename bluestein_block_helper_t::suggested_ffts_per_block; + + static constexpr bool use_bluestein = true; + + using complex_type = typename make_complex_type>::cufftdx_type; + static constexpr unsigned int workspace_size = 2 * this_fft_blue_size_v * sizeof(complex_type); + + private: + // Checks + static_assert(this_fft_blue_size_v >= (2 * this_fft_size_v - 1), + "cuFFTDx internal error, selected Bluestein size is too small"); + static_assert(this_fft_blue_size_v <= (4 * this_fft_size_v - 3), + "cuFFTDx internal error, 
selected Bluestein size is too big"); + }; + + template + class helper + { + // To suggest EPT and FPB (BPB) we need to know Size, Type, Direction, Precision + Architecture + static constexpr bool is_complete = is_complete_description::value; + static_assert(is_complete, "FFT description must be complete to calculate queried information"); + + // Right now we go to Bluestein only if there's no CT implementation. User can't force Bluestein if + // there is CT implementation. +#ifdef CUFFTDX_DETAIL_DISABLE_BLUESTEIN + static constexpr bool is_bluestein_required_v = false; + static constexpr bool is_bluestein_supported_v = false; +#else + + static constexpr bool is_bluestein_required_v = is_bluestein_required::value, + precision_of_t, + direction_of::value, + type_of::value, + sm_of::value>::value; + static constexpr bool is_bluestein_supported_v = is_bluestein_supported::value>(); + + // Check if we have implementation or bluestein which can do requested size + static_assert(!is_bluestein_required_v || (is_bluestein_required_v && is_bluestein_supported_v), + "This FFT configuration is not supported"); +#endif + using selected_block_helper_t = typename CUFFTDX_STD::conditional, + helper_block_ct>::type; + + public: + using elements_per_thread = typename selected_block_helper_t::elements_per_thread; + using block_dim = typename selected_block_helper_t::block_dim; + using ffts_per_block = typename selected_block_helper_t::ffts_per_block; + using fft_implementation = typename selected_block_helper_t::block_fft_implementation; + + using suggested_elements_per_thread = typename selected_block_helper_t::suggested_elements_per_thread; + using suggested_ffts_per_block = typename selected_block_helper_t::suggested_ffts_per_block; + + static constexpr bool use_bluestein = is_bluestein_required_v; + static constexpr bool requires_workspace = is_bluestein_required_v; + static constexpr unsigned int workspace_size = selected_block_helper_t::workspace_size; + }; + + template + 
class helper + { + // To suggest EPT we need to know Size, Type, Direction, Precision + Architecture + static constexpr bool is_complete = is_complete_description::value; + static_assert(is_complete, "FFT description must be complete to calculate queried information"); + + // We don't need SM for thread FFT for description to be complete, so we select dummy SM. Every + // thread FFT implementation will look the same regardless of the CUDA architecture. + static constexpr unsigned int dummy_thread_fft_sm_v = 800; + using block_fft_record_t = database::detail::block_fft_record::value, + precision_of_t, + type_of::value, + direction_of::value, + dummy_thread_fft_sm_v>; + static_assert(block_fft_record_t::defined, "This FFT configuration is not supported"); + + using thread_fft_implementation = + typename database::detail::search_by_ept::value, + precision_of_t, + 0 /* trp_option */, + typename block_fft_record_t::blobs>::type; + static_assert(!CUFFTDX_STD::is_void::value, + "This FFT configuration is not supported"); + + public: + using elements_per_thread = + ElementsPerThread>::value>; + using fft_implementation = thread_fft_implementation; + + static constexpr bool use_bluestein = false; + static constexpr bool requires_workspace = false; + static constexpr unsigned int workspace_size = 0; + }; + } // namespace __get_block_config + + template + class check_and_get_trait + { + // FAIL if it's not a block execution + static constexpr bool is_block_execution = has_operator::value; + static_assert(is_block_execution, "Must be block execution to get ::block_dim trait"); + + public: + using type = typename __get_block_config::helper::block_dim; + static constexpr dim3 value = type::value; + }; + + template + class check_and_get_trait + { + // FAIL if it's neither a block nor a thread execution + static constexpr bool is_block_execution = has_operator::value; + static constexpr bool is_thread_execution = has_operator::value; + static_assert(is_block_execution || is_thread_execution, + "FFT must 
be define as either thread of block execution to get ::elements_per_thread trait"); + + public: + using type = typename __get_block_config::helper::elements_per_thread; + static constexpr unsigned int value = type::value; + }; + + template + class check_and_get_trait + { + // FAIL if it's not a block execution + static constexpr bool is_block_execution = has_operator::value; + static_assert(is_block_execution, "Must be block execution to get ::ffts_per_block trait"); + + public: + using type = typename __get_block_config::helper::ffts_per_block; + static constexpr unsigned int value = type::value; + + using suggested_type = typename __get_block_config::helper::suggested_ffts_per_block; + static constexpr unsigned int suggested = suggested_type::value; + }; + + template + class check_and_get_fft_implementation + { + // FAIL if it's neither a block nor a thread execution + static constexpr bool is_block_execution = has_operator::value; + static constexpr bool is_thread_execution = has_operator::value; + static_assert(is_block_execution || is_thread_execution, + "FFT must be define as either thread of block execution to get ::elements_per_thread trait"); + + using block_config_t = __get_block_config::helper; + + public: + using type = typename block_config_t::fft_implementation; + + static constexpr bool use_bluestein = block_config_t::use_bluestein; + static constexpr bool requires_workspace = block_config_t::requires_workspace; + static constexpr unsigned int workspace_size = block_config_t::workspace_size; + }; + + /// Alias template for check_and_get_fft_implementation_t::type + template + using check_and_get_fft_implementation_t = typename check_and_get_fft_implementation::type; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_CHECK_AND_GET_TRAIT_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/description_traits.hpp 
b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/description_traits.hpp new file mode 100644 index 0000000000000..1e9f887c461f4 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/description_traits.hpp @@ -0,0 +1,264 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_DETAIL_DESCRIPTION_TRAITS_HPP +#define CUFFTDX_TRAITS_DETAIL_DESCRIPTION_TRAITS_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include "../../operators.hpp" +#include "../../detail/expressions.hpp" + +#include "get.hpp" + +namespace cufftdx { + namespace detail { + /// is_expression + + template + struct is_expression: public CUFFTDX_STD::is_base_of {}; + + template + struct are_expressions: + public CUFFTDX_STD::integral_constant::value && is_expression::value> {}; + + /// is_operator_expression + + template + struct is_operator_expression: public CUFFTDX_STD::is_base_of {}; + + template + struct are_operator_expressions: + public CUFFTDX_STD::integral_constant::value && is_operator_expression::value> { + }; + + /// is_description_expression + + template + struct is_description_expression: public CUFFTDX_STD::is_base_of {}; + + /// is_operator + + template + struct is_operator: CUFFTDX_STD::false_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: 
CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template<> + struct is_operator: CUFFTDX_STD::true_type {}; + + template<> + struct is_operator: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + template + struct is_operator>: CUFFTDX_STD::true_type {}; + + // get_operator_type, TODO: Consider moving that info inside operator class + + template + struct get_operator_type; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::direction; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::precision; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::size; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::sm; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::type; + }; + + template<> + struct get_operator_type { + static constexpr fft_operator value = fft_operator::thread; + }; + + template<> + struct get_operator_type { + static constexpr fft_operator value = fft_operator::block; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::elements_per_thread; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::ffts_per_block; + }; + + template + struct get_operator_type> { + static constexpr fft_operator value = fft_operator::block_dim; + }; + + /// has_n_of + + namespace __has_n_of { + template + struct counter_helper { + static constexpr unsigned int value = is_operator::value + ? counter_helper<(Counter + 1), OperatorType, Types...>::value + : counter_helper::value; + }; + + template + struct counter_helper { + static constexpr unsigned int value = is_operator::value ? 
Counter + 1 : Counter; + }; + + template + struct counter: CUFFTDX_STD::integral_constant::value> {}; + + template class Description, class... Types> + struct counter>: + CUFFTDX_STD::integral_constant::value> {}; + } // namespace __has_n_of + + template + struct has_n_of: CUFFTDX_STD::integral_constant::value == N> {}; + + template + struct has_at_most_one_of: + CUFFTDX_STD::integral_constant::value <= 1)> {}; + + /// has_block_operator + namespace __has_block_operator { + template + struct counter_helper { + static constexpr unsigned int value = CUFFTDX_STD::is_base_of::value + ? counter_helper<(Counter + 1), Types...>::value + : counter_helper::value; + }; + + template + struct counter_helper { + static constexpr unsigned int value = + CUFFTDX_STD::is_base_of::value ? Counter + 1 : Counter; + }; + + template + struct counter: + CUFFTDX_STD::integral_constant::value> {}; + + template class Description, class... Types> + struct counter>: + CUFFTDX_STD::integral_constant::value> {}; + } // namespace __has_block_operator + + template + struct has_any_block_operator: + CUFFTDX_STD::integral_constant::value > 0)> {}; + + /// has_operator + + template + struct has_operator: + CUFFTDX_STD::integral_constant::value > 0)> {}; + + /// deduce_direction_type + + template + struct deduce_direction_type { + using type = void; + }; + + template<> + struct deduce_direction_type> { + using type = Direction; + }; + + template<> + struct deduce_direction_type> { + using type = Direction; + }; + + template + using deduce_direction_type_t = typename deduce_direction_type::type; + + // is_complete_description + + namespace __is_complete_description { + template + struct helper: CUFFTDX_STD::false_type {}; + + template class Description, class... 
Types> + struct helper, + typename CUFFTDX_STD::enable_if>::value>::type> { + using description_type = Description; + + // Extract and/or deduce description types + + // Size + using this_fft_size = get_t; + // Type (C2C, C2R, R2C) + using default_fft_type = Type; + using this_fft_type = get_or_default_t; + // Direction + using deduced_fft_direction = deduce_direction_type_t; + using this_fft_direction = + get_or_default_t; + // SM + using this_fft_sm = get_t; + // Thread FFT + static constexpr bool is_thread_execution = has_operator::value; + + static constexpr bool value = + !(CUFFTDX_STD::is_void::value || CUFFTDX_STD::is_void::value || + CUFFTDX_STD::is_void::value || + // If we know that FFT is a thread FFT, then we don't require SM for completeness + (CUFFTDX_STD::is_void::value && !is_thread_execution)); + }; + } // namespace __is_complete_description + + template + struct is_complete_description: + CUFFTDX_STD::integral_constant::value> {}; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_DESCRIPTION_TRAITS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/get.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/get.hpp new file mode 100644 index 0000000000000..353fbde3e5815 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/get.hpp @@ -0,0 +1,70 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_TRAITS_DETAIL_GET_HPP +#define CUFFTDX_TRAITS_DETAIL_GET_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +namespace cufftdx { + namespace detail { + // Forward declaration + template + struct is_operator; + + namespace __get { + template + struct helper { + using type = typename CUFFTDX_STD::conditional::value, T, void>::type; + }; + + // clang-format off + template class DescriptionType, + class TypeHead, + class... TailTypes> + struct helper> { + using type = typename CUFFTDX_STD::conditional< + is_operator::value, + TypeHead, + typename helper>::type>::type; + }; + // clang-format on + } // namespace __get + + /// get + + template + struct get { + using type = typename __get::helper::type; + }; + + template + using get_t = typename get::type; + + /// get_or_default + + template + struct get_or_default { + private: + using get_type = get_t; + + public: + using type = typename CUFFTDX_STD::conditional::value, Default, get_type>::type; + }; + + template + using get_or_default_t = typename get_or_default::type; + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_GET_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/ldg_type.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/ldg_type.hpp new file mode 100644 index 0000000000000..c4ea875784ea3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/ldg_type.hpp @@ -0,0 +1,47 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_DETAIL_LDG_TYPE_HPP +#define CUFFTDX_TRAITS_DETAIL_LDG_TYPE_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include + +#include "../../types.hpp" + +namespace cufftdx { + namespace detail { + template + struct ldg_type { + using type = void; + }; + +#define CUFFTDX_DETAIL_DEFINE_LDG_TYPE(mytype, ldgtype) \ + template<> \ + struct ldg_type { \ + using type = ldgtype; \ + }; + + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(__half, __half) + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(float, float) + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(double, double) + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(::cufftdx::detail::complex<__half2>, float2) + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(::cufftdx::detail::complex, float2) + CUFFTDX_DETAIL_DEFINE_LDG_TYPE(::cufftdx::detail::complex, double2) + + +#undef CUFFTDX_DETAIL_DEFINE_LDG_TYPE + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_LDG_TYPE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/make_complex_type.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/make_complex_type.hpp new file mode 100644 index 0000000000000..ea8346fb5fe01 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/detail/make_complex_type.hpp @@ -0,0 +1,54 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_DETAIL_MAKE_COMPLEX_TYPE_HPP +#define CUFFTDX_TRAITS_DETAIL_MAKE_COMPLEX_TYPE_HPP + +#ifdef CUFFTDX_DETAIL_USE_CUDA_STL +# include +#else +# include +#endif + +#include + +#include "../../operators/precision.hpp" // is_supported_fp_type +#include "../../types.hpp" + +namespace cufftdx { + namespace detail { + template + struct make_complex_type { + static_assert(detail::is_supported_fp_type::value, + "Only double, float, and __half floating-point types are supported"); + }; + +#define CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(scalar_type) \ + template<> \ + struct make_complex_type { \ + using cufftdx_type = ::cufftdx::detail::complex; \ + }; + + CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(float) + CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(double) + + template<> + struct make_complex_type<__half> { + using cufftdx_type = ::cufftdx::detail::complex<__half2>; + }; + + template<> + struct make_complex_type<__half2> { + using cufftdx_type = ::cufftdx::detail::complex<__half2>; + }; + +#undef CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE + } // namespace detail +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_DETAIL_MAKE_COMPLEX_TYPE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/fft_traits.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/fft_traits.hpp new file mode 100644 index 0000000000000..4ca0da588a538 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/fft_traits.hpp @@ -0,0 +1,201 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#ifndef CUFFTDX_TRAITS_FFT_TRAITS_HPP +#define CUFFTDX_TRAITS_FFT_TRAITS_HPP + +#include "../detail/fft_description_fd.hpp" + +#include "../operators.hpp" + +#include "detail/get.hpp" +#include "detail/description_traits.hpp" +#include "detail/make_complex_type.hpp" + +namespace cufftdx { + template + struct size_of { + private: + static constexpr bool has_size = detail::has_operator::value; + static_assert(has_size, "Description does not have size defined"); + + public: + using value_type = unsigned int; + static constexpr value_type value = detail::get_t::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr unsigned int size_of::value; + + template + struct sm_of { + private: + static constexpr bool has_sm = detail::has_operator::value; + static_assert(has_sm, "Description does not have CUDA architecture defined"); + + public: + using value_type = unsigned int; + static constexpr value_type value = detail::get_t::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr unsigned int sm_of::value; + + template + struct type_of { + using value_type = fft_type; + static constexpr value_type value = + detail::get_or_default_t>::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr fft_type type_of::value; + + template + struct direction_of { + private: + using deduced_fft_direction = detail::deduce_direction_type_t::value>>; + using this_fft_direction = + detail::get_or_default_t; + + static_assert(!CUFFTDX_STD::is_void::value, + "Description has neither 
direction defined, nor it can be deduced from its type"); + + public: + using value_type = fft_direction; + static constexpr value_type value = this_fft_direction::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr fft_direction direction_of::value; + + template + struct precision_of { + using type = typename detail::get_or_default_t>::type; + }; + + template + using precision_of_t = typename precision_of::type; + + template + struct is_fft { + using value_type = bool; + static constexpr value_type value = detail::is_expression::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr bool is_fft::value; + + template + struct is_fft_execution { + static constexpr auto block = detail::has_operator::value; + static constexpr auto thread = detail::has_operator::value; + + public: + using value_type = bool; + static constexpr value_type value = is_fft::value && (thread || block); + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr bool is_fft_execution::value; + + template + struct is_complete_fft { + using value_type = bool; + static constexpr value_type value = + is_fft::value && detail::is_complete_description::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr bool is_complete_fft::value; + + template + struct is_complete_fft_execution { + using value_type = bool; + static constexpr value_type value = is_fft_execution::value && is_complete_fft::value; + constexpr operator value_type() const noexcept { return value; } + }; + + template + constexpr bool is_complete_fft_execution::value; + + namespace detail { + // Concatenates OperatorType to the description (faster than using decltype and adding operators) + template + struct concatenate_description; + + template class Description, class... 
Operators> + struct concatenate_description> { + using type = Description; + }; + + template + using concatenate_description_t = typename concatenate_description::type; + + // Removes the given OperatorType from an FFT description + template + struct filter { + using type = void; + }; + + template + using filter_t = typename filter::type; + + template class Description, fft_operator OperatorType> + struct filter, OperatorType> { + using type = Description<>; + }; + + template class Description, fft_operator OperatorType, class Head, class... Tail> + struct filter, OperatorType> { + using type = typename CUFFTDX_STD::conditional< + is_operator::value, + filter_t, OperatorType>, + concatenate_description_t, OperatorType>::type> // + >::type; + }; + + template + struct convert_to_fft_description { + using type = void; + }; + + template class Description, class... Types> + struct convert_to_fft_description> { + using type = typename detail::fft_description; + }; + } // namespace detail + + // This extracts an FFT description from FFT execution description. 
+ template + struct extract_fft_description { + private: + // Converts execution description to simple description, filter_t will remove Thread and Block operators + using fft_description_type = typename detail::convert_to_fft_description::type; + + public: + static_assert(is_fft::value, "Description is not a cuFFDx FFT description"); + using type = typename CUFFTDX_STD::conditional< + detail::is_operator_expression::value, + Description, // For single operator or if Description just return Description + detail::filter_t, fft_operator::thread> // + >::type; + }; + + template + using extract_fft_description_t = typename extract_fft_description::type; +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_FFT_TRAITS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/replace.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/replace.hpp new file mode 100644 index 0000000000000..e53c47b96c9ee --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/replace.hpp @@ -0,0 +1,133 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_TRAITS_REPLACE_HPP +#define CUFFTDX_TRAITS_REPLACE_HPP + +#include "fft_traits.hpp" + +namespace cufftdx { + // Implementation of replace + namespace detail { + template + struct check_operators; + + template + struct check_operators { + static constexpr bool value = detail::is_operator_expression::value; + }; + + template + struct check_operators { + static constexpr bool value = + detail::is_operator_expression::value && check_operators::value; + }; + + template + struct remove_operators; + + template + using remove_operators_t = typename remove_operators::type; + + template + struct remove_operators { + static constexpr fft_operator operator_type = get_operator_type::value; + static constexpr bool can_be_replaced = !(is_operator::value || + is_operator::value); + using type = + typename CUFFTDX_STD::conditional, Description>::type; + }; + + template + struct remove_operators { + static constexpr fft_operator operator_type = get_operator_type::value; + static constexpr bool can_be_replaced = !(is_operator::value || + is_operator::value); + using filtered_type = filter_t; + using type = typename CUFFTDX_STD::conditional, + remove_operators_t>::type; + }; + + template + struct add_operators; + + template + using add_operators_t = typename add_operators::type; + + template + struct add_operators { + static constexpr fft_operator operator_type = get_operator_type::value; + static constexpr bool had_operator = has_operator::value; + static constexpr bool can_be_added = + (had_operator || Force) && !(is_operator::value || + is_operator::value); + using type = typename CUFFTDX_STD:: + conditional, Description>::type; + }; + + template + struct add_operators { + static constexpr fft_operator operator_type = get_operator_type::value; + static constexpr bool had_operator = has_operator::value; + // We only add operator if: + // * it's not block or thread operator + // * there is the same type of operator in the original FFT description + static constexpr 
bool can_be_added = + (had_operator || Force) && !(is_operator::value || + is_operator::value); + using concatenated_type = concatenate_description_t; + using type = + typename CUFFTDX_STD::conditional, + add_operators_t>::type; + }; + + template + struct replace_force { + static_assert(is_fft::value, "Description is not a cuFFTDx FFT description"); + static_assert(detail::check_operators::value, + "One of operators is not a cuFFTDx FFT operator"); + + // First remove all operators of the same types as NewOperators..., + // and then add NewOperators... to the description. + using filtered_description = detail::remove_operators_t; + using replaced_description = + detail::add_operators_t; + using new_fft_description = typename CUFFTDX_STD::conditional< + is_fft_execution::value, + replaced_description, + typename detail::convert_to_fft_description::type>::type; + + public: + /// cuFFTDx FFT description with replaced operators + using type = replaced_description; + }; + } // namespace detail + + /// \class replace + /// \brief Replaces operators of the same type as \p NewOperators in \p Description with \p NewOperators. + /// + /// \par Overview + /// * Replaces operators of the same type as \p NewOperators in \p Description with \p NewOperators. + /// * cufftdx::Thread and cufftdx::Block operators in \p NewOperators are ignored. 
+ /// + /// \tparam Description - cuFFTDx FFT description type to process + /// \tparam NewOperators - the list of operators to use as replacement + template + struct replace { + /// cuFFTDx FFT description with replaced operators + using type = typename detail::replace_force::type; + }; + + /// Alias template for replace::type + template + using replace_t = typename replace::type; +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_REPLACE_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/type_traits.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/type_traits.hpp new file mode 100644 index 0000000000000..ba20395b8278e --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/traits/type_traits.hpp @@ -0,0 +1,38 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_TRAITS_TYPE_TRAITS_HPP +#define CUFFTDX_TRAITS_TYPE_TRAITS_HPP + +#include + +#include "detail/make_complex_type.hpp" + +namespace cufftdx { + // Creates cuFFTDx complex type from scalar floating point type + template + struct make_complex_type; + + template + using make_complex_type_t = typename make_complex_type::type; + +#define CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(scalar_type) \ + template<> \ + struct make_complex_type { \ + using type = typename detail::make_complex_type::cufftdx_type; \ + }; + + CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(float) + CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(double) + CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE(__half2) + +#undef CUFFTDX_DETAIL_DEFINE_MAKE_COMPLEX_TYPE + +} // namespace cufftdx + +#endif // CUFFTDX_TRAITS_TYPE_TRAITS_HPP diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/types.hpp b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/types.hpp new file mode 100644 index 0000000000000..cd1684f8ce48a --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/mathdx/22.02/include/cufftdx/include/types.hpp @@ -0,0 +1,235 @@ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef CUFFTDX_TYPES_HPP__ +#define CUFFTDX_TYPES_HPP__ + +#include + +namespace cufftdx { + namespace detail { + template + struct complex_base { + using value_type = T; + + complex_base() = default; + complex_base(const complex_base&) = default; + + __device__ __forceinline__ __host__ constexpr complex_base(value_type re, value_type im): x(re), y(im) {} + + __device__ __forceinline__ __host__ constexpr value_type real() const { return x; } + __device__ __forceinline__ __host__ constexpr value_type imag() const { return y; } + __device__ __forceinline__ __host__ void real(value_type re) { x = re; } + __device__ __forceinline__ __host__ void imag(value_type im) { y = im; } + + __device__ __forceinline__ __host__ complex_base& operator=(value_type re) { + x = re; + y = value_type(); + return *this; + } + __device__ __forceinline__ __host__ complex_base& operator+=(value_type re) { + x += re; + return *this; + } + __device__ __forceinline__ __host__ complex_base& operator-=(value_type re) { + x -= re; + return *this; + } + __device__ __forceinline__ __host__ complex_base& operator*=(value_type re) { + x *= re; + y *= re; + return *this; + } + __device__ __forceinline__ __host__ complex_base& operator/=(value_type re) { + x /= re; + y /= re; + return *this; + } + + template + __device__ __forceinline__ __host__ complex_base& operator=(const complex_base& other) { + x = other.real(); + y = other.imag(); + return *this; + } + + template + __device__ __forceinline__ __host__ complex_base& operator+=(const OtherType& other) { + x = x + other.x; + y = y + other.y; + return *this; + } + + template + __device__ __forceinline__ __host__ complex_base& operator-=(const OtherType& other) { + x = x - other.x; + y = y - other.y; + return *this; + } + + template + __device__ __forceinline__ __host__ complex_base& operator*=(const OtherType& other) { + auto saved_x = x; + x = x * other.x - y * other.y; + y = saved_x * other.y + y * other.x; + return *this; + } + + /// \internal 
+ value_type x, y; + }; + + template + struct complex; + + template<> + struct alignas(2 * sizeof(float)) complex: complex_base { + private: + using base_type = complex_base; + + public: + using value_type = float; + complex() = default; + complex(const complex&) = default; + __device__ __forceinline__ __host__ constexpr complex(float re, float im): base_type(re, im) {} + __device__ __forceinline__ __host__ explicit constexpr complex(const complex& other); + using base_type::operator+=; + using base_type::operator-=; + using base_type::operator*=; + using base_type::operator/=; + using base_type::operator=; + }; + + template<> + struct alignas(2 * sizeof(double)) complex: complex_base { + private: + using base_type = complex_base; + + public: + using value_type = double; + complex() = default; + complex(const complex&) = default; + __device__ __forceinline__ __host__ constexpr complex(double re, double im): base_type(re, im) {} + __device__ __forceinline__ __host__ explicit constexpr complex(const complex& other); + using base_type::operator+=; + using base_type::operator-=; + using base_type::operator*=; + using base_type::operator/=; + using base_type::operator=; + }; + + // For FFT computations, complex should be in RRII layout. 
+ template<> + struct alignas(2 * sizeof(__half2)) complex<__half2> { + using value_type = __half2; + complex() = default; + complex(const complex&) = default; + + __device__ __forceinline__ __host__ complex(value_type re, value_type im): x(re), y(im) {} +# if CUDA_VERSION < 11000 + __device__ __forceinline__ __host__ complex(double re, double im): + x(__float2half2_rn(re)), y(__float2half2_rn(im)) {} +# else + __device__ __forceinline__ __host__ complex(double re, double im) + + { + __half hre = __double2half(re); + x = __half2(hre, hre); + __half him = __double2half(im); + y = __half2(him, him); + } + +# endif + __device__ __forceinline__ __host__ complex(float re, float im): + x(__float2half2_rn(re)), y(__float2half2_rn(im)) {} + +# if CUDA_VERSION < 11000 + __device__ __forceinline__ __host__ explicit complex(const complex& other): + x(__float2half2_rn(other.real())), y(__float2half2_rn(other.imag())) {} +# else + __device__ __forceinline__ __host__ explicit complex(const complex& other) { + + __half hre = __double2half(other.real()); + x = __half2(hre, hre); + __half him = __double2half(other.imag()); + y = __half2(him, him); + } +# endif + __device__ __forceinline__ __host__ explicit complex(const complex& other): + x(__float2half2_rn(other.real())), y(__float2half2_rn(other.imag())) {} + + __device__ __forceinline__ __host__ value_type real() const { return x; } + __device__ __forceinline__ __host__ value_type imag() const { return y; } + __device__ __forceinline__ __host__ void real(value_type re) { x = re; } + __device__ __forceinline__ __host__ void imag(value_type im) { y = im; } + + __device__ __forceinline__ __host__ complex& operator=(value_type re) { + x = re; + y = value_type(); + return *this; + } + __device__ __forceinline__ complex& operator+=(value_type re) { + x += re; + return *this; + } + __device__ __forceinline__ complex& operator-=(value_type re) { + x -= re; + return *this; + } + __device__ __forceinline__ complex& operator*=(value_type 
re) { + x *= re; + y *= re; + return *this; + } + __device__ __forceinline__ complex& operator/=(value_type re) { + x /= re; + y /= re; + return *this; + } + + __device__ __forceinline__ __host__ complex& operator=(const complex& other) { + x = other.real(); + y = other.imag(); + return *this; + } + + __device__ __forceinline__ complex& operator+=(const complex& other) { + x = x + other.x; + y = y + other.y; + return *this; + } + + __device__ __forceinline__ complex& operator-=(const complex& other) { + x = x - other.x; + y = y - other.y; + return *this; + } + + __device__ __forceinline__ complex& operator*=(const complex& other) { + auto saved_x = x; + x = __hfma2(x, other.x, - y * other.y); + y = __hfma2(saved_x, other.y, y * other.x); + return *this; + } + + /// \internal + value_type x, y; + }; + + __forceinline__ constexpr complex::complex(const complex& other): + complex_base(other.real(), other.imag()) {}; + + __forceinline__ constexpr complex::complex(const complex& other): + complex_base(other.real(), other.imag()) {}; + } // namespace detail + + template + using complex = typename detail::complex; +} // namespace cufftdx + +#endif // CUFFTDX_TYPES_HPP__ diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/setup.py b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/setup.py new file mode 100644 index 0000000000000..e78b4ad30d7f5 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/setup.py @@ -0,0 +1,122 @@ +# Adapted from https://github.com/NVIDIA/apex/blob/master/setup.py +import torch +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME +from setuptools import setup, find_packages +import subprocess + +import sys +import warnings +import os + +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) + + +def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", 
"-V"], universal_newlines=True) + output = raw_output.split() + release_idx = output.index("release") + 1 + release = output[release_idx].split(".") + bare_metal_major = release[0] + bare_metal_minor = release[1][0] + + return raw_output, bare_metal_major, bare_metal_minor + + +def check_cuda_torch_binary_vs_bare_metal(cuda_dir): + raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(cuda_dir) + torch_binary_major = torch.version.cuda.split(".")[0] + torch_binary_minor = torch.version.cuda.split(".")[1] + + print("\nCompiling cuda extensions with") + print(raw_output + "from " + cuda_dir + "/bin\n") + + if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor): + raise RuntimeError( + "Cuda extensions are being compiled with a version of Cuda that does " + "not match the version used to compile Pytorch binaries. " + "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) + + "In some cases, a minor-version mismatch will not cause later errors: " + "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. " + "You can try commenting out this check (at your own risk)." + ) + + +def raise_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + raise RuntimeError( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." 
+ ) + + +def append_nvcc_threads(nvcc_extra_args): + _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) + if int(bare_metal_major) >= 11 and int(bare_metal_minor) >= 2: + return nvcc_extra_args + ["--threads", "4"] + return nvcc_extra_args + + +if not torch.cuda.is_available(): + # https://github.com/NVIDIA/apex/issues/486 + # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(), + # which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command). + print( + "\nWarning: Torch did not find available GPUs on this system.\n", + "If your intention is to cross-compile, this is not an error.\n" + "By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n" + "Volta (compute capability 7.0), Turing (compute capability 7.5),\n" + "and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n" + "If you wish to cross-compile for a single specific architecture,\n" + 'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n', + ) + if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: + _, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(CUDA_HOME) + if int(bare_metal_major) == 11: + os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0" + if int(bare_metal_minor) > 0: + os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0;8.6" + else: + os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5" + +print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) +TORCH_MAJOR = int(torch.__version__.split(".")[0]) +TORCH_MINOR = int(torch.__version__.split(".")[1]) + +cmdclass = {} +ext_modules = [] + +raise_if_cuda_home_none("fftconv") +# Check, if CUDA11 is installed for compute capability 8.0 +cc_flag = [] +# cc_flag.append("-gencode") +# cc_flag.append("arch=compute_70,code=sm_70") +cc_flag.append("-gencode") 
+cc_flag.append("arch=compute_80,code=sm_80") + +ext_modules.append( + CUDAExtension( + 'fftconv', [ + 'fftconv.cpp', + 'fftconv_cuda.cu', + ], + extra_compile_args={'cxx': ['-g', '-march=native', '-funroll-loops'], + 'nvcc': ['-O3', '--threads', '4', '-lineinfo', '--use_fast_math', '-std=c++17', '-arch=compute_70'] + # extra_compile_args={'cxx': ['-O3'], + # 'nvcc': append_nvcc_threads(['-O3', '-lineinfo', '--use_fast_math', '-std=c++17'] + cc_flag) + }, + include_dirs=[os.path.join(this_dir, 'mathdx/22.02/include')] + ) +) + +torch.utils.cpp_extension.COMMON_NVCC_FLAGS.remove('-D__CUDA_NO_HALF2_OPERATORS__') + +setup( + name="fftconv", + version="0.1", + description="FFTConv for state-space models", + ext_modules=ext_modules, + cmdclass={"build_ext": BuildExtension} if ext_modules else {}, +) diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/static_switch.h b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/static_switch.h new file mode 100644 index 0000000000000..7920ac045d0a2 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/static_switch.h @@ -0,0 +1,25 @@ +// Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +#define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ + [&] { \ + if (COND) { \ + constexpr bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/twiddle.cuh b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/twiddle.cuh new file mode 100644 index 0000000000000..8417e6a9cf722 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/csrc/fftconv/twiddle.cuh @@ -0,0 +1,43 @@ +#pragma once + +#include +#include "lut.h" + +// index must be less than or equal to FFT_SIZE / 8 +template inline __device__ float2 twiddle_lut(int index); +template<> inline __device__ float2 twiddle_lut<8>(int index) { return cufftdx::database::detail::lut_sp_4_8[index]; }; +template<> inline __device__ float2 twiddle_lut<16>(int index) { return cufftdx::database::detail::lut_sp_4_16[index]; }; +template<> inline __device__ float2 twiddle_lut<32>(int index) { return cufftdx::database::detail::lut_sp_4_32[index]; }; +template<> inline __device__ float2 twiddle_lut<64>(int index) { return cufftdx::database::detail::lut_sp_4_64[index]; }; +template<> inline __device__ float2 twiddle_lut<128>(int index) { return cufftdx::database::detail::lut_sp_4_128[index]; }; +template<> inline __device__ float2 twiddle_lut<256>(int index) { return cufftdx::database::detail::lut_sp_4_256[index]; }; +template<> inline __device__ float2 twiddle_lut<512>(int index) { return cufftdx::database::detail::lut_sp_4_512[index]; }; +template<> inline __device__ float2 twiddle_lut<1024>(int index) { return cufftdx::database::detail::lut_sp_4_1024[index]; }; +template<> inline __device__ float2 twiddle_lut<2048>(int index) { return cufftdx::database::detail::lut_sp_4_2048[index]; }; +template<> inline __device__ float2 twiddle_lut<4096>(int index) { return cufftdx::database::detail::lut_sp_4_4096[index]; }; +// Doesn't work with 8192 because of the edge case where the index is equal to FFT_SIZE / 8, and the 
+// lookup table doesn't have that value. So we have to use our own lookup table. +template<> inline __device__ float2 twiddle_lut<8192>(int index) { return cufftdx::database::detail::lut_mine_sp_8_8192[index]; }; +template<> inline __device__ float2 twiddle_lut<16384>(int index) { return cufftdx::database::detail::lut_mine_sp_8_16384[index]; }; + +// The quadrant argument is not strictly necessary but we can compute it from the loop index, +// which will be unrolled and so it avoids branching. +template +inline __device__ c10::complex twiddle_from_lut(int quadrant, int index) { + using cfloat_t = c10::complex; + if (quadrant == 0) { + float2 twiddle = twiddle_lut(index); + return cfloat_t(twiddle.x, twiddle.y); + } else if (quadrant == 1) { + float2 twiddle = twiddle_lut(FFT_SIZE / 4 - index); + return cfloat_t(-twiddle.y, -twiddle.x); + } else if (quadrant == 2) { + float2 twiddle = twiddle_lut(index - FFT_SIZE / 4); + return cfloat_t(twiddle.y, -twiddle.x); + } else if (quadrant == 3) { + float2 twiddle = twiddle_lut(FFT_SIZE / 2 - index); + return cfloat_t(-twiddle.x, twiddle.y); + } else { + assert(false); + } +} diff --git a/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py new file mode 100644 index 0000000000000..b5d2749b2b1c3 --- /dev/null +++ b/nemo/collections/nlp/modules/common/hyena/fftconv_wrapper.py @@ -0,0 +1,103 @@ +import math + +import torch +import torch.nn.functional as F + +from einops import rearrange + +from fftconv import fftconv_fwd, fftconv_bwd + +@torch.jit.script +def _mul_sum(y, q): + return (y * q).sum(dim=1) + +# reference convolution with residual connection +def fftconv_ref(u, k, D, dropout_mask, gelu=True, k_rev=None): + seqlen = u.shape[-1] + fft_size = 2 * seqlen + k_f = torch.fft.rfft(k, n=fft_size) / fft_size + if k_rev is not None: + k_rev_f = torch.fft.rfft(k_rev, n=fft_size) / fft_size + k_f = k_f + k_rev_f.conj() + u_f = 
torch.fft.rfft(u.to(dtype=k.dtype), n=fft_size) + y = torch.fft.irfft(u_f * k_f, n=fft_size, norm='forward')[..., :seqlen] + out = y + u * D.unsqueeze(-1) + if gelu: + out = F.gelu(out) + if dropout_mask is not None: + return (out * rearrange(dropout_mask, 'b H -> b H 1')).to(dtype=u.dtype) + else: + return out.to(dtype=u.dtype) + +# reference H3 forward pass +def fftconv_h3_ref(k, ssm_kernel, D, q, v, head_dim=1, ssm_kernel_rev=None): + seqlen = k.shape[-1] + fft_size = 2 * seqlen + kv = (rearrange(k, 'b (h d1) l -> b d1 1 h l', d1=head_dim) + * rearrange(v, 'b (h d2) l -> b 1 d2 h l', d2=head_dim)) # b d1 d2 h l + kv_f = torch.fft.rfft(kv.to(dtype=ssm_kernel.dtype), n=fft_size) / fft_size + ssm_kernel_f = torch.fft.rfft(ssm_kernel, n=fft_size) # h L+1 + if ssm_kernel_rev is not None: + ssm_kernel_rev_f = torch.fft.rfft(ssm_kernel_rev, n=fft_size) # h L+1 + ssm_kernel_f = ssm_kernel_f + ssm_kernel_rev_f.conj() + y = torch.fft.irfft(kv_f * ssm_kernel_f, n=fft_size, norm='forward')[..., :seqlen] # b d1 d2 h l + out = y + kv * D.unsqueeze(-1) # b d1 d2 h l + q = rearrange(q, 'b (h d1) l -> b d1 1 h l', d1=head_dim) + if head_dim > 1: + out = _mul_sum(out, q) + return rearrange(out, 'b d2 h l -> b (h d2) l').to(dtype=k.dtype) + else: + return rearrange(out * q, 'b 1 1 h l -> b h l').to(dtype=k.dtype) + + +class FFTConvFunc(torch.autograd.Function): + + @staticmethod + def forward(ctx, u, k, D, dropout_mask=None, gelu=True, force_fp16_output=False, + output_hbl_layout=False, v=None, head_dim=1, q=None, fftfp16=False, k_rev=None): + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + k_f = torch.fft.rfft(k, n=fft_size) + if k_rev is not None: + k_f = k_f + torch.fft.rfft(k_rev, n=fft_size).conj() + if u.stride(-1) != 1: + u = u.contiguous() + k_f = k_f.contiguous() + D = D.contiguous() + if v is not None and v.stride(-1) != 1: + v = v.contiguous() + if q is not None and q.stride(-1) != 1: + q = q.contiguous() + if dropout_mask is not 
None: + dropout_mask = dropout_mask.contiguous() + ctx.save_for_backward(u, k_f, D, dropout_mask, v, q) + ctx.output_hbl_layout = output_hbl_layout + ctx.head_dim = head_dim + ctx.gelu = gelu + ctx.fftfp16 = fftfp16 + ctx.has_k_rev = k_rev is not None + out = fftconv_fwd(u, k_f, D, v, head_dim, q, dropout_mask, gelu, False, False, fft_size, force_fp16_output, output_hbl_layout, fftfp16) + return out + + @staticmethod + def backward(ctx, dout): + if ctx.output_hbl_layout: + dout = rearrange(rearrange(dout, 'b h l -> h b l').contiguous(), 'h b l -> b h l') + else: + dout = dout.contiguous() + u, k_f, D, dropout_mask, v, q = ctx.saved_tensors + seqlen = u.shape[-1] + fft_size = max(2 * 2 ** int(math.ceil(math.log2(seqlen))), 16) + du, dk_f, dD, dv, dq = fftconv_bwd(dout, u, k_f, D, v, ctx.head_dim, q, dropout_mask, ctx.gelu, False, False, fft_size, + ctx.output_hbl_layout, ctx.fftfp16) + dk = torch.fft.irfft(dk_f, n=fft_size, norm='forward')[..., :seqlen] + dk_rev = (None if not ctx.has_k_rev + else torch.fft.irfft(dk_f.conj(), n=fft_size, norm='forward')[..., :seqlen]) + if v is not None: + dv = dv.to(dtype=v.dtype) # We do atomicAdd in fp32 so might need to convert to fp16 + return du, dk, dD, None, None, None, None, dv if v is not None else None, None, dq if q is not None else None, None, dk_rev + +def fftconv_func(u, k, D, dropout_mask=None, gelu=True, force_fp16_output=False, + output_hbl_layout=False, v=None, head_dim=1, q=None, fftfp16=False, k_rev=None): + return FFTConvFunc.apply(u, k, D, dropout_mask, gelu, force_fp16_output, + output_hbl_layout, v, head_dim, q, fftfp16, k_rev) diff --git a/nemo/collections/nlp/modules/common/hyena/hyena.py b/nemo/collections/nlp/modules/common/hyena/hyena.py index 1830a5d30306b..f90ae680db311 100644 --- a/nemo/collections/nlp/modules/common/hyena/hyena.py +++ b/nemo/collections/nlp/modules/common/hyena/hyena.py @@ -19,10 +19,10 @@ from einops import rearrange, repeat -# try: -# from src.ops.fftconv import fftconv_ref, 
fftconv_func -# except ImportError: -# fftconv_func = None +try: + from .fftconv_wrapper import fftconv_ref, fftconv_func +except ImportError: + fftconv_func = None try: from flash_attn.ops.fused_dense import FusedDense @@ -225,15 +225,15 @@ def forward(self, x, L, k=None, bias=None, *args, **kwargs): if bias is None: bias = self.bias bias = bias if self.use_bias else 0 * bias - # if self.fused_fft_conv: - # bias = bias.to(dtype=torch.float32) - # y = fftconv_func( - # x, k, bias, dropout_mask=None, gelu=False, - # force_fp16_output=torch.is_autocast_enabled() - # ) - # else: - # y = fftconv_ref(x, k, bias, dropout_mask=None, gelu=False) - y = fftconv_ref(x, k, bias, dropout_mask=None, gelu=False) + if self.fused_fft_conv: + bias = bias.to(dtype=torch.float32) + + y = fftconv_func( + x, k, bias, dropout_mask=None, gelu=False, + force_fp16_output=torch.is_autocast_enabled() + ) + else: + y = fftconv_ref(x, k, bias, dropout_mask=None, gelu=False) return y